diff -Nru mesa-19.2.8/Android.common.mk mesa-20.0.8/Android.common.mk --- mesa-19.2.8/Android.common.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/Android.common.mk 2020-06-12 01:21:16.000000000 +0000 @@ -39,7 +39,7 @@ -Wno-initializer-overrides \ -Wno-mismatched-tags \ -DPACKAGE_VERSION=\"$(MESA_VERSION)\" \ - -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/issues\" + -DPACKAGE_BUGREPORT=\"https://gitlab.freedesktop.org/mesa/mesa/-/issues\" # XXX: The following __STDC_*_MACROS defines should not be needed. # It's likely due to a bug elsewhere, but let's temporarily add them @@ -103,12 +103,9 @@ LOCAL_CFLAGS += -DHAVE_SYS_SHM_H endif -ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_CFLAGS += \ -DUSE_X86_ASM - -endif endif ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_CFLAGS_arm += -DUSE_ARM_ASM diff -Nru mesa-19.2.8/Android.mk mesa-20.0.8/Android.mk --- mesa-19.2.8/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -24,7 +24,7 @@ # BOARD_GPU_DRIVERS should be defined. The valid values are # # classic drivers: i915 i965 -# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima +# gallium drivers: swrast freedreno i915g nouveau kmsro r300g r600g radeonsi vc4 virgl vmwgfx etnaviv iris lima panfrost # # The main target is libGLES_mesa. For each classic driver enabled, a DRI # module will also be built. DRI modules will be loaded by libGLES_mesa. @@ -43,6 +43,7 @@ MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk MESA_PYTHON2 := python +MESA_PYTHON3 := python3 # Lists to convert driver names to boolean variables # in form of . @@ -61,7 +62,8 @@ virgl.HAVE_GALLIUM_VIRGL \ etnaviv.HAVE_GALLIUM_ETNAVIV \ iris.HAVE_GALLIUM_IRIS \ - lima.HAVE_GALLIUM_LIMA + lima.HAVE_GALLIUM_LIMA \ + panfrost.HAVE_GALLIUM_PANFROST ifeq ($(BOARD_GPU_DRIVERS),all) MESA_BUILD_CLASSIC := $(filter HAVE_%, $(subst ., , $(classic_drivers))) @@ -83,33 +85,20 @@ $(foreach d, $(MESA_BUILD_CLASSIC) $(MESA_BUILD_GALLIUM), $(eval $(d) := true)) -# host and target must be the same arch to generate matypes.h -ifeq ($(TARGET_ARCH),$(HOST_ARCH)) -MESA_ENABLE_ASM := true -else -MESA_ENABLE_ASM := false -endif - ifneq ($(filter true, $(HAVE_GALLIUM_RADEONSI)),) MESA_ENABLE_LLVM := true endif define mesa-build-with-llvm - $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5), \ + $(if $(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7), \ $(warning Unsupported LLVM version in Android $(MESA_ANDROID_MAJOR_VERSION)),) \ - $(if $(filter 6,$(MESA_ANDROID_MAJOR_VERSION)), \ - $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0307 -DMESA_LLVM_VERSION_STRING=\"3.7\")) \ - $(if $(filter 7,$(MESA_ANDROID_MAJOR_VERSION)), \ - $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0308 -DMESA_LLVM_VERSION_STRING=\"3.8\")) \ - $(if $(filter 8,$(MESA_ANDROID_MAJOR_VERSION)), \ - $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ - $(if $(filter P,$(MESA_ANDROID_MAJOR_VERSION)), \ - $(eval LOCAL_CFLAGS += -DHAVE_LLVM=0x0309 -DMESA_LLVM_VERSION_STRING=\"3.9\")) \ + $(eval LOCAL_CFLAGS += -DLLVM_AVAILABLE -DMESA_LLVM_VERSION_STRING=\"3.9\") \ $(eval LOCAL_SHARED_LIBRARIES += libLLVM) endef # add subdirectories SUBDIRS := \ + src/etnaviv \ src/freedreno \ src/gbm \ src/loader \ diff -Nru mesa-19.2.8/.appveyor/appveyor_msvc.bat mesa-20.0.8/.appveyor/appveyor_msvc.bat --- mesa-19.2.8/.appveyor/appveyor_msvc.bat 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/.appveyor/appveyor_msvc.bat 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,66 @@ +goto %1 + +:install +rem Check pip +if "%buildsystem%" == "scons" ( + python --version + python -m pip --version + rem Install Mako + python -m pip install Mako==1.0.7 + rem Install pywin32 extensions, needed by SCons + python -m pip install pypiwin32 + rem Install python wheels, necessary to install SCons via pip + python -m pip install wheel + rem Install SCons + python -m pip install scons==3.0.1 + call scons --version +) else ( + python --version + python -m pip install Mako meson + meson --version + + rem Install pkg-config, which meson requires even on windows + cinst -y pkgconfiglite +) + +rem Install flex/bison +set WINFLEXBISON_ARCHIVE=win_flex_bison-%WINFLEXBISON_VERSION%.zip +if not exist "%WINFLEXBISON_ARCHIVE%" appveyor DownloadFile "https://github.com/lexxmark/winflexbison/releases/download/v%WINFLEXBISON_VERSION%/%WINFLEXBISON_ARCHIVE%" +7z x -y -owinflexbison\ "%WINFLEXBISON_ARCHIVE%" > nul +set Path=%CD%\winflexbison;%Path% +win_flex --version +win_bison --version +rem Download and extract LLVM +if not exist "%LLVM_ARCHIVE%" appveyor DownloadFile "https://people.freedesktop.org/~jrfonseca/llvm/%LLVM_ARCHIVE%" +7z x -y "%LLVM_ARCHIVE%" > nul +if "%buildsystem%" == "scons" ( + mkdir llvm\bin + set LLVM=%CD%\llvm +) else ( + move llvm subprojects\ + copy .appveyor\llvm-wrap.meson subprojects\llvm\meson.build +) +goto :eof + +:build_script +if "%buildsystem%" == "scons" ( + call scons -j%NUMBER_OF_PROCESSORS% MSVC_VERSION=14.1 llvm=1 +) else ( + call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=x86 + rem We use default-library as static to affect any wraps (such as expat and zlib) + rem it would be better if we could set subprojects buildtype independently, + rem but I haven't written that patch yet :) + call meson builddir --backend=vs2017 --default-library=static -Dbuild-tests=true -Db_vscrt=mtd --buildtype=release -Dllvm=true -Dgallium-drivers=swrast -Dosmesa=gallium + pushd builddir + call msbuild mesa.sln /m + popd +) +goto :eof + +:test_script +if "%buildsystem%" == "scons" ( + call scons -j%NUMBER_OF_PROCESSORS% MSVC_VERSION=14.1 llvm=1 check +) else ( + call meson test -C builddir +) +goto :eof diff -Nru mesa-19.2.8/.appveyor/llvm-wrap.meson mesa-20.0.8/.appveyor/llvm-wrap.meson --- mesa-19.2.8/.appveyor/llvm-wrap.meson 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/.appveyor/llvm-wrap.meson 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,36 @@ +# A meson.build file for binary wrapping the LLVM used in the appvyeor CI +project('llvm', ['cpp']) + +cpp = meson.get_compiler('cpp') + +_deps = [] +_search = join_paths(meson.current_source_dir(), 'lib') +foreach d : ['LLVMAnalysis', 'LLVMAsmParser', 'LLVMAsmPrinter', + 'LLVMBinaryFormat', 'LLVMBitReader', 'LLVMBitWriter', + 'LLVMCodeGen', 'LLVMCore', 'LLVMCoroutines', 'LLVMCoverage', + 'LLVMDebugInfoCodeView', 'LLVMDebugInfoDWARF', + 'LLVMDebugInfoMSF', 'LLVMDebugInfoPDB', 'LLVMDemangle', + 'LLVMDlltoolDriver', 'LLVMExecutionEngine', 'LLVMGlobalISel', + 'LLVMInstCombine', 'LLVMInstrumentation', 'LLVMInterpreter', + 'LLVMipo', 'LLVMIRReader', 'LLVMLibDriver', 'LLVMLineEditor', + 'LLVMLinker', 'LLVMLTO', 'LLVMMCDisassembler', 'LLVMMCJIT', + 'LLVMMC', 'LLVMMCParser', 'LLVMMIRParser', 'LLVMObjCARCOpts', + 'LLVMObject', 'LLVMObjectYAML', 'LLVMOption', 'LLVMOrcJIT', + 'LLVMPasses', 'LLVMProfileData', 'LLVMRuntimeDyld', + 'LLVMScalarOpts', 'LLVMSelectionDAG', 'LLVMSupport', + 'LLVMSymbolize', 'LLVMTableGen', 'LLVMTarget', + 'LLVMTransformUtils', 'LLVMVectorize', 'LLVMX86AsmParser', + 'LLVMX86AsmPrinter', 'LLVMX86CodeGen', 'LLVMX86Desc', + 'LLVMX86Disassembler', 'LLVMX86Info', 'LLVMX86Utils', + 'LLVMXRay'] + _deps += cpp.find_library(d, dirs : _search) +endforeach + +dep_llvm = declare_dependency( + include_directories : include_directories('include'), + dependencies : _deps, + version : '5.0.1', +) + +has_rtti = false +irbuilder_h = files('include/llvm/IR/IRBuilder.h') diff -Nru mesa-19.2.8/appveyor.yml mesa-20.0.8/appveyor.yml --- mesa-19.2.8/appveyor.yml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/appveyor.yml 2020-06-12 01:21:16.000000000 +0000 @@ -38,6 +38,7 @@ - '%LOCALAPPDATA%\pip\Cache -> appveyor.yml' - win_flex_bison-2.5.15.zip - llvm-5.0.1-msvc2017-mtd.7z +- subprojects\packagecache -> subprojects\*.wrap os: Visual Studio 2017 @@ -49,41 +50,21 @@ environment: WINFLEXBISON_VERSION: 2.5.15 LLVM_ARCHIVE: llvm-5.0.1-msvc2017-mtd.7z + matrix: + - compiler: msvc + buildsystem: scons + - compiler: msvc + buildsystem: meson + path: C:\Python38-x64;C:\Python38-x64\Scripts;%path% install: -# Check git config -- git config core.autocrlf -# Check pip -- python --version -- python -m pip --version -# Install Mako -- python -m pip install Mako==1.0.7 -# Install pywin32 extensions, needed by SCons -- python -m pip install pypiwin32 -# Install python wheels, necessary to install SCons via pip -- python -m pip install wheel -# Install SCons -- python -m pip install scons==3.0.1 -- scons --version -# Install flex/bison -- set WINFLEXBISON_ARCHIVE=win_flex_bison-%WINFLEXBISON_VERSION%.zip -- if not exist "%WINFLEXBISON_ARCHIVE%" appveyor DownloadFile "https://github.com/lexxmark/winflexbison/releases/download/v%WINFLEXBISON_VERSION%/%WINFLEXBISON_ARCHIVE%" -- 7z x -y -owinflexbison\ "%WINFLEXBISON_ARCHIVE%" > nul -- set Path=%CD%\winflexbison;%Path% -- win_flex --version -- win_bison --version -# Download and extract LLVM -- if not exist "%LLVM_ARCHIVE%" appveyor DownloadFile "https://people.freedesktop.org/~jrfonseca/llvm/%LLVM_ARCHIVE%" -- 7z x -y "%LLVM_ARCHIVE%" > nul -- mkdir llvm\bin -- set LLVM=%CD%\llvm +- cmd: .appveyor\appveyor_msvc.bat install build_script: -- scons -j%NUMBER_OF_PROCESSORS% MSVC_VERSION=14.1 llvm=1 - -after_build: -- scons -j%NUMBER_OF_PROCESSORS% MSVC_VERSION=14.1 llvm=1 check +- cmd: .appveyor\appveyor_msvc.bat build_script +test_script: +- cmd: .appveyor\appveyor_msvc.bat test_script # It's possible to setup notification here, as described in # http://www.appveyor.com/docs/notifications#appveyor-yml-configuration , but diff -Nru mesa-19.2.8/bin/.cherry-ignore mesa-20.0.8/bin/.cherry-ignore --- mesa-19.2.8/bin/.cherry-ignore 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/.cherry-ignore 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -# warnings that are not useful -da5ebe30105f70e3520ce3ae145793b755552569 -6b8cb087568699ca9a6e9e8b7bf49179e622b59f - -# Jason doesn't want this applied to 19.2 (it's a revert) -d15fe8ca8262d502435c4f83985ac414f950bc5f - -# This doesn't apply to 19.2 -f833b4cada07b746a10ffa4d93fcd821920c3cb1 -d2db43fcad6a2ea2070ff5f7884411f4b7d3925c -66f2aa6ccd0b226eebe2c1a46281160b0a54d522 - -# The author requested that this not be applied to 19.2 -dcc0e23438f3e5929c2ef74d57e8207be25ecb41 - -# This doesn't apply cleanly, and no one really cares about this file on stable -# branches anyway. -bcd9224728dcb8d8fe4bcddc4bd9b2c36fcfe9dd - -# De-nominated by its author due to alternate fix not being backported -43041627445540afda1a05d11861935963660344 - -# This is immediately reverted, so just don't apply -19546108d3dd5541a189e36df4ea83b3f519e48f - -# The authors requested these not be applied to 19.2 -869e32593a9096b845dd6106f8f86e1c41fac968 -a2c3c65a31de90fdb55f76f2894860dfbafe2043 -bb0c5c487e63e88acbb792f092dd8f392bad8540 -937b9055698be0dfdb7d2e0673a989e2ecc05912 -21376cffb37018160ad3eef38b5a640ba1675a4f - -# This is reverted shortly after it was landed -4432a2d14d80081d062f7939a950d65ea3a16eed - -# These aren't relevant for 19.2 -1a05811936dd8d0c3a367c6f00629624ef39d537 -911a8261419f48dcd756f78832fa5a5f4c5b8d93 - -# This was manually backported -2afeed301010917c4eae55dcd2544f9d329df934 -4b392ced2d744fccffe95490ff57e6b41033c266 - -# This is not being backported to 19.2 due to causing build regressions for -# downstream projects -eaf43966027cf9654e91ca57aecc8f5a65b58f49 - -# Invalid sha warnings -023282a4f667695ea1dbbe9fbe1cd3a9d550a426 -2fca325ea65f068043d4c18c9cd0fe7f25bde8f7 -7564c5fc6d79a2ddec49a19f67183fb3be799fe5 diff -Nru mesa-19.2.8/bin/gen_release_notes.py mesa-20.0.8/bin/gen_release_notes.py --- mesa-19.2.8/bin/gen_release_notes.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/gen_release_notes.py 2020-06-12 01:21:16.000000000 +0000 @@ -35,8 +35,8 @@ from mako import exceptions -CURRENT_GL_VERSION = '4.5' -CURRENT_VK_VERSION = '1.1' +CURRENT_GL_VERSION = '4.6' +CURRENT_VK_VERSION = '1.2' TEMPLATE = Template(textwrap.dedent("""\ <%! @@ -64,7 +64,7 @@ %if not bugfix: Mesa ${next_version} is a new development release. People who are concerned with stability and reliability should stick with a previous release or - wait for Mesa ${version[:-1]}1. + wait for Mesa ${next_version[:-1]}1. %else: Mesa ${next_version} is a bug fix release which fixes bugs found since the ${version} release. %endif @@ -125,7 +125,7 @@ async def gather_commits(version: str) -> str: p = await asyncio.create_subprocess_exec( - 'git', 'log', f'mesa-{version}..', '--grep', r'Closes: \(https\|#\).*', + 'git', 'log', '--oneline', f'mesa-{version}..', '--grep', r'Closes: \(https\|#\).*', stdout=asyncio.subprocess.PIPE) out, _ = await p.communicate() assert p.returncode == 0, f"git log didn't work: {version}" diff -Nru mesa-19.2.8/bin/get-pick-list.sh mesa-20.0.8/bin/get-pick-list.sh --- mesa-19.2.8/bin/get-pick-list.sh 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/get-pick-list.sh 2020-06-12 01:21:16.000000000 +0000 @@ -92,7 +92,7 @@ } # Use the last branchpoint as our limit for the search -latest_branchpoint=`git merge-base upstream/master HEAD` +latest_branchpoint=`git merge-base origin/master HEAD` # List all the commits between day 1 and the branch point... git log --reverse --pretty=%H $latest_branchpoint > already_landed @@ -103,7 +103,7 @@ sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked # Grep for potential candidates -git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\\|\\|This reverts commit' $latest_branchpoint..upstream/master |\ +git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\\|\\|This reverts commit' $latest_branchpoint..origin/master |\ while read sha do # Check to see whether the patch is on the ignore list. diff -Nru mesa-19.2.8/bin/install_megadrivers.py mesa-20.0.8/bin/install_megadrivers.py --- mesa-19.2.8/bin/install_megadrivers.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/install_megadrivers.py 2020-06-12 01:21:16.000000000 +0000 @@ -1,5 +1,6 @@ +#!/usr/bin/env python3 # encoding=utf-8 -# Copyright © 2017-2018 Intel Corporation +# Copyright 2017-2018 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff -Nru mesa-19.2.8/bin/meson.build mesa-20.0.8/bin/meson.build --- mesa-19.2.8/bin/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -20,3 +20,4 @@ git_sha1_gen_py = files('git_sha1_gen.py') symbols_check = find_program('symbols-check.py') +install_megadrivers_py = find_program('install_megadrivers.py') diff -Nru mesa-19.2.8/bin/pick/core.py mesa-20.0.8/bin/pick/core.py --- mesa-19.2.8/bin/pick/core.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/bin/pick/core.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,367 @@ +# Copyright © 2019-2020 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""Core data structures and routines for pick.""" + +import asyncio +import enum +import json +import pathlib +import re +import typing + +import attr + +if typing.TYPE_CHECKING: + from .ui import UI + + import typing_extensions + + class CommitDict(typing_extensions.TypedDict): + + sha: str + description: str + nomintated: bool + nomination_type: typing.Optional[int] + resolution: typing.Optional[int] + master_sha: typing.Optional[str] + +IS_FIX = re.compile(r'^\s*fixes:\s*([a-f0-9]{6,40})', flags=re.MULTILINE | re.IGNORECASE) +# FIXME: I dislike the duplication in this regex, but I couldn't get it to work otherwise +IS_CC = re.compile(r'^\s*cc:\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*\ None: + """Commit the .pick_status.json file.""" + f = pathlib.Path(__file__).parent.parent.parent / '.pick_status.json' + async with COMMIT_LOCK: + p = await asyncio.create_subprocess_exec( + 'git', 'add', f.as_posix(), + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + v = await p.wait() + if v != 0: + return False + + if amend: + cmd = ['--amend', '--no-edit'] + else: + cmd = ['--message', f'.pick_status.json: {message}'] + p = await asyncio.create_subprocess_exec( + 'git', 'commit', *cmd, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + v = await p.wait() + if v != 0: + return False + return True + + +@attr.s(slots=True) +class Commit: + + sha: str = attr.ib() + description: str = attr.ib() + nominated: bool = attr.ib(False) + nomination_type: typing.Optional[NominationType] = attr.ib(None) + resolution: Resolution = attr.ib(Resolution.UNRESOLVED) + master_sha: typing.Optional[str] = attr.ib(None) + because_sha: typing.Optional[str] = attr.ib(None) + + def to_json(self) -> 'CommitDict': + d: typing.Dict[str, typing.Any] = attr.asdict(self) + if self.nomination_type is not None: + d['nomination_type'] = self.nomination_type.value + if self.resolution is not None: + d['resolution'] = self.resolution.value + return typing.cast('CommitDict', d) + + @classmethod + def from_json(cls, data: 'CommitDict') -> 'Commit': + c = cls(data['sha'], data['description'], data['nominated'], master_sha=data['master_sha'], because_sha=data['because_sha']) + if data['nomination_type'] is not None: + c.nomination_type = NominationType(data['nomination_type']) + if data['resolution'] is not None: + c.resolution = Resolution(data['resolution']) + return c + + async def apply(self, ui: 'UI') -> typing.Tuple[bool, str]: + # FIXME: This isn't really enough if we fail to cherry-pick because the + # git tree will still be dirty + async with COMMIT_LOCK: + p = await asyncio.create_subprocess_exec( + 'git', 'cherry-pick', '-x', self.sha, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.PIPE, + ) + _, err = await p.communicate() + + if p.returncode != 0: + return (False, err) + + self.resolution = Resolution.MERGED + await ui.feedback(f'{self.sha} ({self.description}) applied successfully') + + # Append the changes to the .pickstatus.json file + ui.save() + v = await commit_state(amend=True) + return (v, '') + + async def abort_cherry(self, ui: 'UI', err: str) -> None: + await ui.feedback(f'{self.sha} ({self.description}) failed to apply\n{err}') + async with COMMIT_LOCK: + p = await asyncio.create_subprocess_exec( + 'git', 'cherry-pick', '--abort', + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + r = await p.wait() + await ui.feedback(f'{"Successfully" if r == 0 else "Failed to"} abort cherry-pick.') + + async def denominate(self, ui: 'UI') -> bool: + self.resolution = Resolution.DENOMINATED + ui.save() + v = await commit_state(message=f'Mark {self.sha} as denominated') + assert v + await ui.feedback(f'{self.sha} ({self.description}) denominated successfully') + return True + + async def backport(self, ui: 'UI') -> bool: + self.resolution = Resolution.BACKPORTED + ui.save() + v = await commit_state(message=f'Mark {self.sha} as backported') + assert v + await ui.feedback(f'{self.sha} ({self.description}) backported successfully') + return True + + async def resolve(self, ui: 'UI') -> None: + self.resolution = Resolution.MERGED + ui.save() + v = await commit_state(amend=True) + assert v + await ui.feedback(f'{self.sha} ({self.description}) committed successfully') + + +async def get_new_commits(sha: str) -> typing.List[typing.Tuple[str, str]]: + # TODO: config file that points to the upstream branch + p = await asyncio.create_subprocess_exec( + 'git', 'log', '--pretty=oneline', f'{sha}..master', + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL) + out, _ = await p.communicate() + assert p.returncode == 0, f"git log didn't work: {sha}" + return list(split_commit_list(out.decode().strip())) + + +def split_commit_list(commits: str) -> typing.Generator[typing.Tuple[str, str], None, None]: + if not commits: + return + for line in commits.split('\n'): + v = tuple(line.split(' ', 1)) + assert len(v) == 2, 'this is really just for mypy' + yield typing.cast(typing.Tuple[str, str], v) + + +async def is_commit_in_branch(sha: str) -> bool: + async with SEM: + p = await asyncio.create_subprocess_exec( + 'git', 'merge-base', '--is-ancestor', sha, 'HEAD', + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await p.wait() + return p.returncode == 0 + + +async def full_sha(sha: str) -> str: + async with SEM: + p = await asyncio.create_subprocess_exec( + 'git', 'rev-parse', sha, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + out, _ = await p.communicate() + if p.returncode: + raise PickUIException(f'Invalid Sha {sha}') + return out.decode().strip() + + +async def resolve_nomination(commit: 'Commit', version: str) -> 'Commit': + async with SEM: + p = await asyncio.create_subprocess_exec( + 'git', 'log', '--pretty=medium', '-1', commit.sha, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + _out, _ = await p.communicate() + assert p.returncode == 0, f'git log for {commit.sha} failed' + out = _out.decode() + + # We give presedence to fixes and cc tags over revert tags. + # XXX: not having the wallrus operator available makes me sad := + m = IS_FIX.search(out) + if m: + # We set the nomination_type and because_sha here so that we can later + # check to see if this fixes another staged commit. + try: + commit.because_sha = fixed = await full_sha(m.group(1)) + except PickUIException: + pass + else: + commit.nomination_type = NominationType.FIXES + if await is_commit_in_branch(fixed): + commit.nominated = True + return commit + + m = IS_CC.search(out) + if m: + if m.groups() == (None, None) or version in m.groups(): + commit.nominated = True + commit.nomination_type = NominationType.CC + return commit + + m = IS_REVERT.search(out) + if m: + # See comment for IS_FIX path + try: + commit.because_sha = reverted = await full_sha(m.group(1)) + except PickUIException: + pass + else: + commit.nomination_type = NominationType.REVERT + if await is_commit_in_branch(reverted): + commit.nominated = True + return commit + + return commit + + +async def resolve_fixes(commits: typing.List['Commit'], previous: typing.List['Commit']) -> None: + """Determine if any of the undecided commits fix/revert a staged commit. + + The are still needed if they apply to a commit that is staged for + inclusion, but not yet included. + + This must be done in order, because a commit 3 might fix commit 2 which + fixes commit 1. + """ + shas: typing.Set[str] = set(c.sha for c in previous if c.nominated) + assert None not in shas, 'None in shas' + + for commit in reversed(commits): + if not commit.nominated and commit.nomination_type is NominationType.FIXES: + commit.nominated = commit.because_sha in shas + + if commit.nominated: + shas.add(commit.sha) + + for commit in commits: + if (commit.nomination_type is NominationType.REVERT and + commit.because_sha in shas): + for oldc in reversed(commits): + if oldc.sha == commit.because_sha: + # In this case a commit that hasn't yet been applied is + # reverted, we don't want to apply that commit at all + oldc.nominated = False + oldc.resolution = Resolution.DENOMINATED + commit.nominated = False + commit.resolution = Resolution.DENOMINATED + shas.remove(commit.because_sha) + break + + +async def gather_commits(version: str, previous: typing.List['Commit'], + new: typing.List[typing.Tuple[str, str]], cb) -> typing.List['Commit']: + # We create an array of the final size up front, then we pass that array + # to the "inner" co-routine, which is turned into a list of tasks and + # collected by asyncio.gather. We do this to allow the tasks to be + # asyncrounously gathered, but to also ensure that the commits list remains + # in order. + commits = [None] * len(new) + tasks = [] + + async def inner(commit: 'Commit', version: str, commits: typing.List['Commit'], + index: int, cb) -> None: + commits[index] = await resolve_nomination(commit, version) + cb() + + for i, (sha, desc) in enumerate(new): + tasks.append(asyncio.ensure_future( + inner(Commit(sha, desc), version, commits, i, cb))) + + await asyncio.gather(*tasks) + assert None not in commits + + await resolve_fixes(commits, previous) + + for commit in commits: + if commit.resolution is Resolution.UNRESOLVED and not commit.nominated: + commit.resolution = Resolution.NOTNEEDED + + return commits + + +def load() -> typing.List['Commit']: + p = pathlib.Path(__file__).parent.parent.parent / '.pick_status.json' + if not p.exists(): + return [] + with p.open('r') as f: + raw = json.load(f) + return [Commit.from_json(c) for c in raw] + + +def save(commits: typing.Iterable['Commit']) -> None: + p = pathlib.Path(__file__).parent.parent.parent / '.pick_status.json' + commits = list(commits) + with p.open('wt') as f: + json.dump([c.to_json() for c in commits], f, indent=4) + + asyncio.ensure_future(commit_state(message=f'Update to {commits[0].sha}')) diff -Nru mesa-19.2.8/bin/pick/core_test.py mesa-20.0.8/bin/pick/core_test.py --- mesa-19.2.8/bin/pick/core_test.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/bin/pick/core_test.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,470 @@ +# Copyright © 2019-2020 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""Tests for pick's core data structures and routines.""" + +from unittest import mock +import textwrap +import typing + +import attr +import pytest + +from . import core + + +class TestCommit: + + @pytest.fixture + def unnominated_commit(self) -> 'core.Commit': + return core.Commit('abc123', 'sub: A commit', master_sha='45678') + + @pytest.fixture + def nominated_commit(self) -> 'core.Commit': + return core.Commit('abc123', 'sub: A commit', True, + core.NominationType.CC, core.Resolution.UNRESOLVED) + + class TestToJson: + + def test_not_nominated(self, unnominated_commit: 'core.Commit'): + c = unnominated_commit + v = c.to_json() + assert v == {'sha': 'abc123', 'description': 'sub: A commit', 'nominated': False, + 'nomination_type': None, 'resolution': core.Resolution.UNRESOLVED.value, + 'master_sha': '45678', 'because_sha': None} + + def test_nominated(self, nominated_commit: 'core.Commit'): + c = nominated_commit + v = c.to_json() + assert v == {'sha': 'abc123', + 'description': 'sub: A commit', + 'nominated': True, + 'nomination_type': core.NominationType.CC.value, + 'resolution': core.Resolution.UNRESOLVED.value, + 'master_sha': None, + 'because_sha': None} + + class TestFromJson: + + def test_not_nominated(self, unnominated_commit: 'core.Commit'): + c = unnominated_commit + v = c.to_json() + c2 = core.Commit.from_json(v) + assert c == c2 + + def test_nominated(self, nominated_commit: 'core.Commit'): + c = nominated_commit + v = c.to_json() + c2 = core.Commit.from_json(v) + assert c == c2 + + +class TestRE: + + """Tests for the regular expressions used to identify commits.""" + + class TestFixes: + + def test_simple(self): + message = textwrap.dedent("""\ + etnaviv: fix vertex buffer state emission for single stream GPUs + + GPUs with a single supported vertex stream must use the single state + address to program the stream. + + Fixes: 3d09bb390a39 (etnaviv: GC7000: State changes for HALTI3..5) + Signed-off-by: Lucas Stach + Reviewed-by: Jonathan Marek + """) + + m = core.IS_FIX.search(message) + assert m is not None + assert m.group(1) == '3d09bb390a39' + + class TestCC: + + def test_single_branch(self): + """Tests commit meant for a single branch, ie, 19.1""" + message = textwrap.dedent("""\ + radv: fix DCC fast clear code for intensity formats + + This fixes a rendering issue with DiRT 4 on GFX10. Only GFX10 was + affected because intensity formats are different. + + Cc: 19.2 + Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1923 + Signed-off-by: Samuel Pitoiset + Reviewed-by: Bas Nieuwenhuizen + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '19.2' + + def test_multiple_branches(self): + """Tests commit with more than one branch specified""" + message = textwrap.dedent("""\ + radeonsi: enable zerovram for Rocket League + + Fixes corruption on game startup. + Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1888 + + Cc: 19.1 19.2 + Reviewed-by: Pierre-Eric Pelloux-Prayer + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '19.1' + assert m.group(2) == '19.2' + + def test_no_branch(self): + """Tests commit with no branch specification""" + message = textwrap.dedent("""\ + anv/android: fix images created with external format support + + This fixes a case where user first creates image and then later binds it + with memory created from AHW buffer. + + Cc: + Signed-off-by: Tapani Pälli + Reviewed-by: Lionel Landwerlin + """) + + m = core.IS_CC.search(message) + assert m is not None + + def test_quotes(self): + """Tests commit with quotes around the versions""" + message = textwrap.dedent("""\ + anv: Always fill out the AUX table even if CCS is disabled + + Cc: "20.0" mesa-stable@lists.freedesktop.org + Reviewed-by: Kenneth Graunke + Tested-by: Marge Bot + Part-of: + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '20.0' + + def test_multiple_quotes(self): + """Tests commit with quotes around the versions""" + message = textwrap.dedent("""\ + anv: Always fill out the AUX table even if CCS is disabled + + Cc: "20.0" "20.1" mesa-stable@lists.freedesktop.org + Reviewed-by: Kenneth Graunke + Tested-by: Marge Bot + Part-of: + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '20.0' + assert m.group(2) == '20.1' + + def test_single_quotes(self): + """Tests commit with quotes around the versions""" + message = textwrap.dedent("""\ + anv: Always fill out the AUX table even if CCS is disabled + + Cc: '20.0' mesa-stable@lists.freedesktop.org + Reviewed-by: Kenneth Graunke + Tested-by: Marge Bot + Part-of: + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '20.0' + + def test_multiple_single_quotes(self): + """Tests commit with quotes around the versions""" + message = textwrap.dedent("""\ + anv: Always fill out the AUX table even if CCS is disabled + + Cc: '20.0' '20.1' mesa-stable@lists.freedesktop.org + Reviewed-by: Kenneth Graunke + Tested-by: Marge Bot + Part-of: + """) + + m = core.IS_CC.search(message) + assert m is not None + assert m.group(1) == '20.0' + assert m.group(2) == '20.1' + + class TestRevert: + + def test_simple(self): + message = textwrap.dedent("""\ + Revert "radv: do not emit PKT3_CONTEXT_CONTROL with AMDGPU 3.6.0+" + + This reverts commit 2ca8629fa9b303e24783b76a7b3b0c2513e32fbd. + + This was initially ported from RadeonSI, but in the meantime it has + been reverted because it might hang. Be conservative and re-introduce + this packet emission. + + Unfortunately this doesn't fix anything known. + + Cc: 19.2 + Signed-off-by: Samuel Pitoiset + Reviewed-by: Bas Nieuwenhuizen + """) + + m = core.IS_REVERT.search(message) + assert m is not None + assert m.group(1) == '2ca8629fa9b303e24783b76a7b3b0c2513e32fbd' + + +class TestResolveNomination: + + @attr.s(slots=True) + class FakeSubprocess: + + """A fake asyncio.subprocess like classe for use with mock.""" + + out: typing.Optional[bytes] = attr.ib(None) + returncode: int = attr.ib(0) + + async def mock(self, *_, **__): + """A dirtly little helper for mocking.""" + return self + + async def communicate(self) -> typing.Tuple[bytes, bytes]: + assert self.out is not None + return self.out, b'' + + async def wait(self) -> int: + return self.returncode + + @staticmethod + async def return_true(*_, **__) -> bool: + return True + + @staticmethod + async def return_false(*_, **__) -> bool: + return False + + @pytest.mark.asyncio + async def test_fix_is_nominated(self): + s = self.FakeSubprocess(b'Fixes: 3d09bb390a39 (etnaviv: GC7000: State changes for HALTI3..5)') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_true): + await core.resolve_nomination(c, '') + + assert c.nominated + assert c.nomination_type is core.NominationType.FIXES + + @pytest.mark.asyncio + async def test_fix_is_not_nominated(self): + s = self.FakeSubprocess(b'Fixes: 3d09bb390a39 (etnaviv: GC7000: State changes for HALTI3..5)') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_false): + await core.resolve_nomination(c, '') + + assert not c.nominated + assert c.nomination_type is core.NominationType.FIXES + + @pytest.mark.asyncio + async def test_cc_is_nominated(self): + s = self.FakeSubprocess(b'Cc: 16.2 ') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + await core.resolve_nomination(c, '16.2') + + assert c.nominated + assert c.nomination_type is core.NominationType.CC + + @pytest.mark.asyncio + async def test_cc_is_nominated2(self): + s = self.FakeSubprocess(b'Cc: mesa-stable@lists.freedesktop.org') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + await core.resolve_nomination(c, '16.2') + + assert c.nominated + assert c.nomination_type is core.NominationType.CC + + @pytest.mark.asyncio + async def test_cc_is_not_nominated(self): + s = self.FakeSubprocess(b'Cc: 16.2 ') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + await core.resolve_nomination(c, '16.1') + + assert not c.nominated + assert c.nomination_type is None + + @pytest.mark.asyncio + async def test_revert_is_nominated(self): + s = self.FakeSubprocess(b'This reverts commit 1234567890123456789012345678901234567890.') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_true): + await core.resolve_nomination(c, '') + + assert c.nominated + assert c.nomination_type is core.NominationType.REVERT + + @pytest.mark.asyncio + async def test_revert_is_not_nominated(self): + s = self.FakeSubprocess(b'This reverts commit 1234567890123456789012345678901234567890.') + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_false): + await core.resolve_nomination(c, '') + + assert not c.nominated + assert c.nomination_type is core.NominationType.REVERT + + @pytest.mark.asyncio + async def test_is_fix_and_cc(self): + s = self.FakeSubprocess( + b'Fixes: 3d09bb390a39 (etnaviv: GC7000: State changes for HALTI3..5)\n' + b'Cc: 16.1 ' + ) + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_true): + await core.resolve_nomination(c, '16.1') + + assert c.nominated + assert c.nomination_type is core.NominationType.FIXES + + @pytest.mark.asyncio + async def test_is_fix_and_revert(self): + s = self.FakeSubprocess( + b'Fixes: 3d09bb390a39 (etnaviv: GC7000: State changes for HALTI3..5)\n' + b'This reverts commit 1234567890123456789012345678901234567890.' + ) + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_true): + await core.resolve_nomination(c, '16.1') + + assert c.nominated + assert c.nomination_type is core.NominationType.FIXES + + @pytest.mark.asyncio + async def test_is_cc_and_revert(self): + s = self.FakeSubprocess( + b'This reverts commit 1234567890123456789012345678901234567890.\n' + b'Cc: 16.1 ' + ) + c = core.Commit('abcdef1234567890', 'a commit') + + with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock): + with mock.patch('bin.pick.core.is_commit_in_branch', self.return_true): + await core.resolve_nomination(c, '16.1') + + assert c.nominated + assert c.nomination_type is core.NominationType.CC + + +class TestResolveFixes: + + @pytest.mark.asyncio + async def test_in_new(self): + """Because commit abcd is nominated, so f123 should be as well.""" + c = [ + core.Commit('f123', 'desc', nomination_type=core.NominationType.FIXES, because_sha='abcd'), + core.Commit('abcd', 'desc', True), + ] + await core.resolve_fixes(c, []) + assert c[1].nominated + + @pytest.mark.asyncio + async def test_not_in_new(self): + """Because commit abcd is not nominated, commit f123 shouldn't be either.""" + c = [ + core.Commit('f123', 'desc', nomination_type=core.NominationType.FIXES, because_sha='abcd'), + core.Commit('abcd', 'desc'), + ] + await core.resolve_fixes(c, []) + assert not c[0].nominated + + @pytest.mark.asyncio + async def test_in_previous(self): + """Because commit abcd is nominated, so f123 should be as well.""" + p = [ + core.Commit('abcd', 'desc', True), + ] + c = [ + core.Commit('f123', 'desc', nomination_type=core.NominationType.FIXES, because_sha='abcd'), + ] + await core.resolve_fixes(c, p) + assert c[0].nominated + + @pytest.mark.asyncio + async def test_not_in_previous(self): + """Because commit abcd is not nominated, commit f123 shouldn't be either.""" + p = [ + core.Commit('abcd', 'desc'), + ] + c = [ + core.Commit('f123', 'desc', nomination_type=core.NominationType.FIXES, because_sha='abcd'), + ] + await core.resolve_fixes(c, p) + assert not c[0].nominated + + +class TestIsCommitInBranch: + + @pytest.mark.asyncio + async def test_no(self): + # Hopefully this is never true? + value = await core.is_commit_in_branch('ffffffffffffffffffffffffffffff') + assert not value + + @pytest.mark.asyncio + async def test_yes(self): + # This commit is from 2000, it better always be in the branch + value = await core.is_commit_in_branch('88f3b89a2cb77766d2009b9868c44e03abe2dbb2') + assert value + + +class TestFullSha: + + @pytest.mark.asyncio + async def test_basic(self): + # This commit is from 2000, it better always be in the branch + value = await core.full_sha('88f3b89a2cb777') + assert value + + @pytest.mark.asyncio + async def test_invalid(self): + # This commit is from 2000, it better always be in the branch + with pytest.raises(core.PickUIException): + await core.full_sha('fffffffffffffffffffffffffffffffffff') diff -Nru mesa-19.2.8/bin/pick/ui.py mesa-20.0.8/bin/pick/ui.py --- mesa-19.2.8/bin/pick/ui.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/bin/pick/ui.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,259 @@ +# Copyright © 2020-2020 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""Urwid UI for pick script.""" + +import asyncio +import functools +import itertools +import textwrap +import typing + +import attr +import urwid + +from . import core + +if typing.TYPE_CHECKING: + WidgetType = typing.TypeVar('WidgetType', bound=urwid.Widget) + +PALETTE = [ + ('a', 'black', 'light gray'), + ('b', 'black', 'dark red'), + ('bg', 'black', 'dark blue'), + ('reversed', 'standout', ''), +] + + +class RootWidget(urwid.Frame): + + def __init__(self, *args, ui: 'UI' = None, **kwargs): + super().__init__(*args, **kwargs) + assert ui is not None + self.ui = ui + + def keypress(self, size: int, key: str) -> typing.Optional[str]: + if key == 'q': + raise urwid.ExitMainLoop() + elif key == 'u': + asyncio.ensure_future(self.ui.update()) + elif key == 'a': + self.ui.add() + else: + return super().keypress(size, key) + return None + + +class CommitWidget(urwid.Text): + + # urwid.Text is normally not interactable, this is required to tell urwid + # to use our keypress method + _selectable = True + + def __init__(self, ui: 'UI', commit: 'core.Commit'): + super().__init__(commit.description) + self.ui = ui + self.commit = commit + + async def apply(self) -> None: + result, err = await self.commit.apply(self.ui) + if not result: + self.ui.chp_failed(self, err) + else: + self.ui.remove_commit(self) + + async def denominate(self) -> None: + await self.commit.denominate(self.ui) + self.ui.remove_commit(self) + + async def backport(self) -> None: + await self.commit.backport(self.ui) + self.ui.remove_commit(self) + + def keypress(self, size: int, key: str) -> typing.Optional[str]: + if key == 'c': + asyncio.ensure_future(self.apply()) + elif key == 'd': + asyncio.ensure_future(self.denominate()) + elif key == 'b': + asyncio.ensure_future(self.backport()) + else: + return key + return None + + +@attr.s(slots=True) +class UI: + + """Main management object. + + :previous_commits: A list of commits to master since this branch was created + :new_commits: Commits added to master since the last time this script was run + """ + + commit_list: typing.List['urwid.Button'] = attr.ib(factory=lambda: urwid.SimpleFocusListWalker([]), init=False) + feedback_box: typing.List['urwid.Text'] = attr.ib(factory=lambda: urwid.SimpleFocusListWalker([]), init=False) + header: 'urwid.Text' = attr.ib(factory=lambda: urwid.Text('Mesa Stable Picker', align='center'), init=False) + body: 'urwid.Columns' = attr.ib(attr.Factory(lambda s: s._make_body(), True), init=False) + footer: 'urwid.Columns' = attr.ib(attr.Factory(lambda s: s._make_footer(), True), init=False) + root: RootWidget = attr.ib(attr.Factory(lambda s: s._make_root(), True), init=False) + mainloop: urwid.MainLoop = attr.ib(None, init=False) + + previous_commits: typing.List['core.Commit'] = attr.ib(factory=list, init=False) + new_commits: typing.List['core.Commit'] = attr.ib(factory=list, init=False) + + def _make_body(self) -> 'urwid.Columns': + commits = urwid.ListBox(self.commit_list) + feedback = urwid.ListBox(self.feedback_box) + return urwid.Columns([commits, feedback]) + + def _make_footer(self) -> 'urwid.Columns': + body = [ + urwid.Text('[U]pdate'), + urwid.Text('[Q]uit'), + urwid.Text('[C]herry Pick'), + urwid.Text('[D]enominate'), + urwid.Text('[B]ackport'), + urwid.Text('[A]pply additional patch') + ] + return urwid.Columns(body) + + def _make_root(self) -> 'RootWidget': + return RootWidget(self.body, self.header, self.footer, 'body', ui=self) + + def render(self) -> 'WidgetType': + asyncio.ensure_future(self.update()) + return self.root + + def load(self) -> None: + self.previous_commits = core.load() + + async def update(self) -> None: + self.load() + with open('VERSION', 'r') as f: + version = f.read().strip()[:4] + if self.previous_commits: + sha = self.previous_commits[0].sha + else: + sha = f'{version}-branchpoint' + + new_commits = await core.get_new_commits(sha) + + if new_commits: + pb = urwid.ProgressBar('a', 'b', done=len(new_commits)) + o = self.mainloop.widget + self.mainloop.widget = urwid.Overlay( + urwid.Filler(urwid.LineBox(pb)), o, 'center', ('relative', 50), 'middle', ('relative', 50)) + self.new_commits = await core.gather_commits( + version, self.previous_commits, new_commits, + lambda: pb.set_completion(pb.current + 1)) + self.mainloop.widget = o + + for commit in reversed(list(itertools.chain(self.new_commits, self.previous_commits))): + if commit.nominated and commit.resolution is core.Resolution.UNRESOLVED: + b = urwid.AttrMap(CommitWidget(self, commit), None, focus_map='reversed') + self.commit_list.append(b) + self.save() + + async def feedback(self, text: str) -> None: + self.feedback_box.append(urwid.AttrMap(urwid.Text(text), None)) + + def remove_commit(self, commit: CommitWidget) -> None: + for i, c in enumerate(self.commit_list): + if c.base_widget is commit: + del self.commit_list[i] + break + + def save(self): + core.save(itertools.chain(self.new_commits, self.previous_commits)) + + def add(self) -> None: + """Add an additional commit which isn't nominated.""" + o = self.mainloop.widget + + def reset_cb(_) -> None: + self.mainloop.widget = o + + async def apply_cb(edit: urwid.Edit) -> None: + text: str = edit.get_edit_text() + + # In case the text is empty + if not text: + return + + sha = await core.full_sha(text) + for c in reversed(list(itertools.chain(self.new_commits, self.previous_commits))): + if c.sha == sha: + commit = c + break + else: + raise RuntimeError(f"Couldn't find {sha}") + + await commit.apply(self) + + q = urwid.Edit("Comit sha\n") + ok_btn = urwid.Button('Ok') + urwid.connect_signal(ok_btn, 'click', lambda _: asyncio.ensure_future(apply_cb(q))) + urwid.connect_signal(ok_btn, 'click', reset_cb) + + can_btn = urwid.Button('Cancel') + urwid.connect_signal(can_btn, 'click', reset_cb) + + cols = urwid.Columns([ok_btn, can_btn]) + pile = urwid.Pile([q, cols]) + box = urwid.LineBox(pile) + + self.mainloop.widget = urwid.Overlay( + urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50) + ) + + def chp_failed(self, commit: 'CommitWidget', err: str) -> None: + o = self.mainloop.widget + + def reset_cb(_) -> None: + self.mainloop.widget = o + + t = urwid.Text(textwrap.dedent(f""" + Failed to apply {commit.commit.sha} {commit.commit.description} with the following error: + + {err} + + You can either cancel, or resolve the conflicts, commit the + changes and select ok.""")) + + can_btn = urwid.Button('Cancel') + urwid.connect_signal(can_btn, 'click', reset_cb) + urwid.connect_signal( + can_btn, 'click', lambda _: asyncio.ensure_future(commit.commit.abort_cherry(self, err))) + + ok_btn = urwid.Button('Ok') + urwid.connect_signal(ok_btn, 'click', reset_cb) + urwid.connect_signal( + ok_btn, 'click', lambda _: asyncio.ensure_future(commit.commit.resolve(self))) + urwid.connect_signal( + ok_btn, 'click', lambda _: self.remove_commit(commit)) + + cols = urwid.Columns([ok_btn, can_btn]) + pile = urwid.Pile([t, cols]) + box = urwid.LineBox(pile) + + self.mainloop.widget = urwid.Overlay( + urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50) + ) diff -Nru mesa-19.2.8/bin/pick-ui.py mesa-20.0.8/bin/pick-ui.py --- mesa-19.2.8/bin/pick-ui.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/bin/pick-ui.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# Copyright © 2019-2020 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import asyncio + +import urwid + +from pick.ui import UI, PALETTE + +if __name__ == "__main__": + u = UI() + evl = urwid.AsyncioEventLoop(loop=asyncio.get_event_loop()) + loop = urwid.MainLoop(u.render(), PALETTE, event_loop=evl) + u.mainloop = loop + loop.run() diff -Nru mesa-19.2.8/bin/symbols-check.py mesa-20.0.8/bin/symbols-check.py --- mesa-19.2.8/bin/symbols-check.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/bin/symbols-check.py 2020-06-12 01:21:16.000000000 +0000 @@ -19,9 +19,10 @@ ] -def get_symbols(nm, lib): +def get_symbols_nm(nm, lib): ''' List all the (non platform-specific) symbols exported by the library + using `nm` ''' symbols = [] platform_name = platform.system() @@ -39,7 +40,35 @@ assert symbol_name[0] == '_' symbol_name = symbol_name[1:] symbols.append(symbol_name) + return symbols + +def get_symbols_dumpbin(dumpbin, lib): + ''' + List all the (non platform-specific) symbols exported by the library + using `dumpbin` + ''' + symbols = [] + output = subprocess.check_output([dumpbin, '/exports', lib], + stderr=open(os.devnull, 'w')).decode("ascii") + for line in output.splitlines(): + fields = line.split() + # The lines with the symbols are made of at least 4 columns; see details below + if len(fields) < 4: + continue + try: + # Making sure the first 3 columns are a dec counter, a hex counter + # and a hex address + _ = int(fields[0], 10) + _ = int(fields[1], 16) + _ = int(fields[2], 16) + except ValueError: + continue + symbol_name = fields[3] + # De-mangle symbols + if symbol_name[0] == '_': + symbol_name = symbol_name[1:].split('@')[0] + symbols.append(symbol_name) return symbols @@ -55,12 +84,21 @@ help='path to library') parser.add_argument('--nm', action='store', - required=True, + help='path to binary (or name in $PATH)') + parser.add_argument('--dumpbin', + action='store', help='path to binary (or name in $PATH)') args = parser.parse_args() try: - lib_symbols = get_symbols(args.nm, args.lib) + if platform.system() == 'Windows': + if not args.dumpbin: + parser.error('--dumpbin is mandatory') + lib_symbols = get_symbols_dumpbin(args.dumpbin, args.lib) + else: + if not args.nm: + parser.error('--nm is mandatory') + lib_symbols = get_symbols_nm(args.nm, args.lib) except: # We can't run this test, but we haven't technically failed it either # Return the GNU "skip" error code @@ -109,6 +147,10 @@ continue if symbol in optional_symbols: continue + if symbol[:2] == '_Z': + # Ignore random C++ symbols + #TODO: figure out if there's any way to avoid exporting them in the first place + continue unknown_symbols.append(symbol) missing_symbols = [ diff -Nru mesa-19.2.8/common.py mesa-20.0.8/common.py --- mesa-19.2.8/common.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/common.py 2020-06-12 01:21:16.000000000 +0000 @@ -112,6 +112,7 @@ opts.Add(BoolOption('asan', 'enable Address Sanitizer', 'no')) opts.Add('toolchain', 'compiler toolchain', default_toolchain) opts.Add(BoolOption('llvm', 'use LLVM', default_llvm)) + opts.Add(BoolOption('force_scons', 'Force enable scons on deprecated platforms', 'false')) opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no')) opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes')) diff -Nru mesa-19.2.8/debian/changelog mesa-20.0.8/debian/changelog --- mesa-19.2.8/debian/changelog 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/changelog 2020-07-13 13:21:27.000000000 +0000 @@ -1,3 +1,343 @@ +mesa (20.0.8-0ubuntu1~18.04.1) bionic; urgency=medium + + * Backport to bionic. + * control: Relax libclc-dev build-dep for bionic. + * Revert-meson-Use-dependency.partial_dependency.patch: Don't use a special + meson feature not in bionic. + + -- Timo Aaltonen Fri, 12 Jun 2020 14:51:05 +0300 + +mesa (20.0.8-0ubuntu1~20.04.1) focal; urgency=medium + + * Backport to focal. + - migrate to LLVM 10 (LP: #1882901) + - fix zwp_linux_dmabuf_v1 on wayland (LP: #1868520) + * control: Relax libclc-dev build-dep for the backport. + + -- Timo Aaltonen Fri, 12 Jun 2020 09:05:27 +0300 + +mesa (20.0.8-0ubuntu1) groovy; urgency=medium + + * New upstream release. + * fix-build-with-llvm-10.diff: Dropped, upstream. + + -- Timo Aaltonen Fri, 12 Jun 2020 09:04:34 +0300 + +mesa (20.0.7-1ubuntu1) groovy; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Mon, 25 May 2020 13:02:32 +0300 + +mesa (20.0.7-1) unstable; urgency=medium + + * New upstream release. + * src_glx_dri_common.h.diff: A new attempt to fix hurd ftbfs. (Closes: + #960197) + + -- Timo Aaltonen Fri, 22 May 2020 08:36:18 +0300 + +mesa (20.0.6-3ubuntu1) groovy; urgency=medium + + * Merge from Debian. + * control: Build with llvm-10. + + -- Timo Aaltonen Fri, 08 May 2020 12:50:14 +0300 + +mesa (20.0.6-3) unstable; urgency=medium + + * src_glx_dri_common.h.diff: Dropped, breaks the build. + + -- Timo Aaltonen Fri, 08 May 2020 12:44:12 +0300 + +mesa (20.0.6-2) unstable; urgency=medium + + [ Timo Aaltonen ] + * control: Bump libdrm-dev build-dep. + + [ Svante Signell ] + * src_glx_dri_common.h.diff: Fix build on Hurd. (Closes: #959975) + + [ Frédéric Bonnard ] + * Fix ppc64el FTBFS. (Closes: #959943) + + -- Timo Aaltonen Fri, 08 May 2020 11:00:10 +0300 + +mesa (20.0.6-1) unstable; urgency=medium + + * New upstream release. + * patches: Drop upstreamed patches, refresh llvm-10 build-fix. + + -- Timo Aaltonen Tue, 05 May 2020 10:37:18 +0300 + +mesa (20.0.4-2ubuntu1~18.04.2) bionic; urgency=medium + + * rules: Disable zink, it adds libvulkan1 dependency to libgl1-mesa- + dri, and it's in universe. + + -- Timo Aaltonen Tue, 26 May 2020 12:11:32 +0300 + +mesa (20.0.4-2ubuntu1~18.04.1) bionic; urgency=medium + + * Backport for 18.04.5 HWE stack update. (LP: #1876882). + * Install EGL/GL/GLES header files again, migration to libglvnd doesn't + concern bionic. + * control: Relax debhelper requirement. + * patches: Refreshed. + * patches: Revert more things for old libglvnd/meson. + * control: Bump libdrm-dev build-dep. + * rules: Don't prefer iris over i965 on Intel to avoid needing a newer + libepoxy. + + -- Timo Aaltonen Thu, 14 May 2020 22:07:49 +0300 + +mesa (20.0.4-2ubuntu1) focal; urgency=medium + + * Merge from Debian. + * disable-intel-ccs-compression.diff: Dropped, we use iris by default. + (LP: #1863874) + * Don't build with llvm-10 until ppc64el ftbfs is fixed. + + -- Timo Aaltonen Thu, 16 Apr 2020 09:22:34 +0300 + +mesa (20.0.4-2) unstable; urgency=medium + + * fix-build-with-llvm-10.diff: Add a patch to fix build with Polly, + and build with llvm-10 again. (Closes: #956004) + * iris-drop-cache-coherent-cpu-mapping.diff: Fix corruption with iris + (Closes: #954311) (LP: #1864274) + * gallium-fix-bob-compute-shaders.diff: Fix vaapi with bob + deinterlacing. (LP: #1867188) + + -- Timo Aaltonen Wed, 15 Apr 2020 10:16:24 +0300 + +mesa (20.0.4-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Mon, 06 Apr 2020 08:55:45 +0300 + +mesa (20.0.4-1) unstable; urgency=medium + + * New upstream release. + * path_max.diff, libglx-mesa0.symbols.hurd: Fix build on Hurd. + (Closes: #955351) + * upstream/signing-key.asc: Added Eric Engestrom's key. + * source/local-options: Update extend-diff-ignore. + + -- Timo Aaltonen Sun, 05 Apr 2020 23:39:55 +0300 + +mesa (20.0.2-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Wed, 25 Mar 2020 16:37:30 +0200 + +mesa (20.0.2-1) unstable; urgency=medium + + * New upstream release. + * control: Revert to building with llvm/clang-9, until it builds with + 10. + + -- Timo Aaltonen Thu, 19 Mar 2020 16:03:53 +0200 + +mesa (20.0.1-2) experimental; urgency=medium + + * control: Fix clang build-dep to actually use libclang-10-dev. + + -- Timo Aaltonen Mon, 09 Mar 2020 23:20:30 +0200 + +mesa (20.0.1-1) experimental; urgency=medium + + * New upstream release. + + -- Timo Aaltonen Mon, 09 Mar 2020 17:40:08 +0200 + +mesa (20.0.0-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Thu, 20 Feb 2020 11:09:59 +0200 + +mesa (20.0.0-1) experimental; urgency=medium + + * New upstream release. + * control: Use debhelper-compat, bump to 12. + + -- Timo Aaltonen Thu, 20 Feb 2020 10:46:06 +0200 + +mesa (20.0.0~rc3-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + * dont-enable-10bpc-by-default.diff: Dropped, gnome is fixed long ago. + + -- Timo Aaltonen Wed, 19 Feb 2020 11:39:27 +0200 + +mesa (20.0.0~rc3-1) experimental; urgency=medium + + * New upstream release candidate. + * rules: Fix dh_auto_test accepting failure. + + -- Timo Aaltonen Fri, 14 Feb 2020 14:07:05 +0200 + +mesa (20.0.0~rc2-1) experimental; urgency=medium + + * New upstream release candidate. + * rules: Make running tests non-fatal, until they're actually expected + to pass on each arch. + + -- Timo Aaltonen Mon, 10 Feb 2020 15:02:10 +0200 + +mesa (20.0.0~rc1-1) experimental; urgency=medium + + * New upstream release candidate. + * control: Add libzstd-dev to build-depends for zstd compressed shader + cache support. + * rules, fix-python-shebang.diff: Enable tests again to see where we + are now. + + -- Timo Aaltonen Tue, 04 Feb 2020 15:46:02 +0200 + +mesa (20.0.0~git20200129-1) experimental; urgency=medium + + * New upstream snapshot. + * mesa-vulkan-drivers.install: Include overlay control script. + + -- Timo Aaltonen Wed, 29 Jan 2020 09:29:20 +0200 + +mesa (19.3.3-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Tue, 28 Jan 2020 21:57:20 +0200 + +mesa (19.3.3-1) unstable; urgency=medium + + * New upstream release. + * rules: Build zink on x86. + * control: Add libglvnd-dev to libegl1-mesa-dev/libgles2-mesa- + dev/libgl1-mesa-dev Depends to ease the transition. (Closes: #949677) + * drisw-fix-depth-for-ximage.diff: Dropped, upstream. + + -- Timo Aaltonen Tue, 28 Jan 2020 21:45:02 +0200 + +mesa (19.3.2-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Wed, 15 Jan 2020 15:24:05 +0200 + +mesa (19.3.2-1) unstable; urgency=medium + + * New upstream release. + + -- Timo Aaltonen Wed, 15 Jan 2020 14:55:29 +0200 + +mesa (19.3.1-4ubuntu1) focal; urgency=medium + + * Merge from Debian. + + -- Timo Aaltonen Wed, 08 Jan 2020 14:17:39 +0200 + +mesa (19.3.1-4) unstable; urgency=medium + + [ Julien Cristau ] + * Ensure strict dependencies on libglapi-mesa (closes: #947813). + + [ Timo Aaltonen ] + * control: Add libgl{,x}-dev to mesa-common-dev Depends. (Closes: + #947392) + * drisw-fix-depth-for-ximage.diff: Fix sw driver rgbBits/depth calculation. + (Closes: #947196) + + -- Timo Aaltonen Wed, 08 Jan 2020 11:38:56 +0200 + +mesa (19.3.1-3) unstable; urgency=medium + + * control, rules: Actually build radv on mipsel, drop mips from + everywhere since it's not an arch anymore. (Closes: #947310) + + -- Timo Aaltonen Wed, 25 Dec 2019 22:05:01 +0200 + +mesa (19.3.1-2) unstable; urgency=medium + + * rules: Build radv on mips/mipsel again. + + -- Timo Aaltonen Thu, 19 Dec 2019 23:17:58 +0200 + +mesa (19.3.1-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + * patches: Refreshed, i965-sync-pciids.diff dropped as it's applied + upstream. + + -- Timo Aaltonen Thu, 19 Dec 2019 21:00:23 +0200 + +mesa (19.3.1-1) unstable; urgency=medium + + * New upstream release. + * fix-radv-secure-compile.diff: Dropped, upstream. + + -- Timo Aaltonen Thu, 19 Dec 2019 16:45:25 +0200 + +mesa (19.3.0-1) experimental; urgency=medium + + * New upstream release. + * fix-radv-secure-compile.diff: Fix radv build where __NR__newselect + isn't defined. + * rules: Restore radv build on non-x86 archs. + + -- Timo Aaltonen Fri, 13 Dec 2019 01:39:02 +0200 + +mesa (19.3.0~rc6-1) experimental; urgency=medium + + * New upstream release candidate. + * control, rules: Don't build radv on non-x86. + * generate-pc-files-for-gles-and-gles2.diff: Dropped, not needed anymore. + * rules: Update clean target. + + -- Timo Aaltonen Mon, 09 Dec 2019 13:29:49 +0200 + +mesa (19.3.0~rc5-1) experimental; urgency=medium + + * New upstream release candidate. + * generate-pc-files-for-gles-and-gles2.diff: This is needed until we + have glvnd 1.2 in the archive. + + -- Timo Aaltonen Fri, 29 Nov 2019 21:56:04 +0200 + +mesa (19.2.6-1) unstable; urgency=medium + + * New upstream release. + * create-gles-pc-files-with-old-glvnd.diff: Dropped, upstream. + + -- Timo Aaltonen Fri, 29 Nov 2019 21:08:25 +0200 + +mesa (19.2.4-1ubuntu1) focal; urgency=medium + + * Merge from Debian. + * revert-set-full-thread-affinity.diff: Dropped, qemu is fixed now in + eoan and up. (LP: #1815889) + + -- Timo Aaltonen Wed, 20 Nov 2019 20:17:00 +0200 + +mesa (19.2.4-1) unstable; urgency=medium + + * New upstream release. (Closes: #944880) + + -- Timo Aaltonen Mon, 18 Nov 2019 15:25:12 +0200 + +mesa (19.2.3-1) unstable; urgency=medium + + * New upstream release. + * create-gles-pc-files-with-old-glvnd.diff: Drop the revert with a + proposed patch to fix creating gles pc files with old glvnd. + * rules: GLESv2 libs aren't built anymore, drop removing the libs. + * mesa-common-dev: Drop mangled GL/GLX headers, they're not generated + anymore. + + -- Timo Aaltonen Thu, 07 Nov 2019 14:01:26 +0200 + mesa (19.2.8-0ubuntu0~18.04.3) bionic; urgency=medium * i965-sync-pciids.diff: Add back three pci-id's for CML that were diff -Nru mesa-19.2.8/debian/control mesa-20.0.8/debian/control --- mesa-19.2.8/debian/control 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/control 2020-07-13 13:21:27.000000000 +0000 @@ -7,11 +7,11 @@ Standards-Version: 4.1.4 Build-Depends: debhelper (>= 11), -# glslang-tools [amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32], +# glslang-tools [amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32], meson (>= 0.45), quilt (>= 0.63-8.2~), pkg-config, - libdrm-dev (>= 2.4.99) [!hurd-any], + libdrm-dev (>= 2.4.101) [!hurd-any], libx11-dev, libxxf86vm-dev, libexpat1-dev, @@ -21,7 +21,7 @@ libxext-dev, libva-dev (>= 1.6.0) [linux-any kfreebsd-any] , libvdpau-dev (>= 1.1.1) [linux-any kfreebsd-any], - libvulkan-dev [amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32], + libvulkan-dev [amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32], x11proto-dev, linux-libc-dev (>= 2.6.31) [linux-any], libx11-xcb-dev, @@ -34,17 +34,18 @@ libxcb-sync-dev, libxrandr-dev, libxshmfence-dev (>= 1.1), + libzstd-dev, python3, python3-mako, python3-setuptools, flex, bison, - llvm-9-dev (>= 1:9~+rc3-1~exp3) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], - libelf-dev [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], + libelf-dev [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], libwayland-dev (>= 1.15.0) [linux-any], libwayland-egl-backend-dev (>= 1.15.0) [linux-any], - libclang-9-dev (>= 1:9~+rc3-1~exp3) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], - libclc-dev (>= 0.2.0+git20190827-1~) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], + llvm-10-dev (>= 1:10.0.0-4~) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], + libclang-10-dev (>= 1:10.0.0-4~) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], + libclc-dev (>= 0.2.0+git20190827-1ubuntu0.18.04.3) [amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64], wayland-protocols (>= 1.9), zlib1g-dev, libglvnd-core-dev, @@ -270,7 +271,6 @@ Depends: ${shlibs:Depends}, ${misc:Depends}, - libglapi-mesa (= ${binary:Version}), libgl1-mesa-dri, Provides: libglx-vendor Breaks: @@ -383,7 +383,6 @@ Depends: ${shlibs:Depends}, ${misc:Depends}, - libglapi-mesa (= ${binary:Version}), Pre-Depends: ${misc:Pre-Depends} Multi-Arch: same Description: Mesa Off-screen rendering extension @@ -455,7 +454,7 @@ Package: mesa-vulkan-drivers Section: libs -Architecture: amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32 +Architecture: amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32 Pre-Depends: ${misc:Pre-Depends} Depends: libvulkan1, @@ -469,7 +468,7 @@ Package: mesa-opencl-icd Section: libs -Architecture: amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 +Architecture: amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 Pre-Depends: ${misc:Pre-Depends} Depends: libclc-r600 (>= 0.2.0+git20180312-1~), diff -Nru mesa-19.2.8/debian/libglx-mesa0.symbols.hurd mesa-20.0.8/debian/libglx-mesa0.symbols.hurd --- mesa-19.2.8/debian/libglx-mesa0.symbols.hurd 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/libglx-mesa0.symbols.hurd 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,6 @@ +libGLX_mesa.so.0 libglx-mesa0 + __glx_Main@Base 17.0.0~ + glAreTexturesResidentEXT@Base 0 + glDeleteTexturesEXT@Base 0 + glGenTexturesEXT@Base 0 + glIsTextureEXT@Base 0 diff -Nru mesa-19.2.8/debian/mesa-vulkan-drivers.install mesa-20.0.8/debian/mesa-vulkan-drivers.install --- mesa-19.2.8/debian/mesa-vulkan-drivers.install 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/mesa-vulkan-drivers.install 2020-07-13 13:21:27.000000000 +0000 @@ -1,3 +1,4 @@ +#usr/bin/mesa-overlay-control.py #usr/share/vulkan/explicit_layer.d/*.json usr/share/vulkan/icd.d/*.json usr/lib/*/libvulkan_*.so diff -Nru mesa-19.2.8/debian/patches/build-glesv2-pc.diff mesa-20.0.8/debian/patches/build-glesv2-pc.diff --- mesa-19.2.8/debian/patches/build-glesv2-pc.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/build-glesv2-pc.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,11 @@ +--- a/src/mapi/meson.build ++++ b/src/mapi/meson.build +@@ -35,7 +35,7 @@ if with_shared_glapi + else + libglapi = [] + endif +-if not with_glvnd ++if not glvnd_has_headers_and_pc_files + if with_gles1 + subdir('es1api') + endif diff -Nru mesa-19.2.8/debian/patches/disable-intel-ccs-compression.diff mesa-20.0.8/debian/patches/disable-intel-ccs-compression.diff --- mesa-19.2.8/debian/patches/disable-intel-ccs-compression.diff 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/disable-intel-ccs-compression.diff 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -Description: Disable i965 CCS renderbuffer compression - It's a new optimization introduced in Mesa 18.0.0, however is presently - causing framebuffer corruption when logging into Xorg sessions. This - corruption is actually expected for now, until the kernel and Xorg are - extended to query and understand the CCS modifier properly. - . - Presently upstream would rather have CCS _and_ the corruption in Xorg - sessions than remove it. However that's not good enough for the Ubuntu - user experience so we're just disabling CCS while the corruption is still - a problem. It was just an optional optimization anyway. -Author: Daniel van Vugt -Bug-Ubuntu: https://launchpad.net/bugs/1753776 -Bug: https://bugs.freedesktop.org/show_bug.cgi?id=105518 -Forwarded: no -Last-Update: 2018-03-21 - ---- a/src/mesa/drivers/dri/i965/intel_screen.c -+++ b/src/mesa/drivers/dri/i965/intel_screen.c -@@ -333,7 +333,11 @@ static const struct { - { .modifier = DRM_FORMAT_MOD_LINEAR , .since_gen = 1 }, - { .modifier = I915_FORMAT_MOD_X_TILED , .since_gen = 1 }, - { .modifier = I915_FORMAT_MOD_Y_TILED , .since_gen = 6 }, -- { .modifier = I915_FORMAT_MOD_Y_TILED_CCS , .since_gen = 9 }, -+/* Disable CCS to work around Xorg login corruption (LP: #1753776). -+ * Longer-term, upstream or someone needs to extend the kernel (and Xorg?) -+ * to fix it properly: https://bugs.freedesktop.org/show_bug.cgi?id=105518 -+ * { .modifier = I915_FORMAT_MOD_Y_TILED_CCS , .since_gen = 9 }, -+ */ - }; - - static bool diff -Nru mesa-19.2.8/debian/patches/dont-enable-10bpc-by-default.diff mesa-20.0.8/debian/patches/dont-enable-10bpc-by-default.diff --- mesa-19.2.8/debian/patches/dont-enable-10bpc-by-default.diff 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/dont-enable-10bpc-by-default.diff 2020-07-13 13:21:27.000000000 +0000 @@ -1,9 +1,10 @@ --- a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +++ b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h -@@ -37,5 +37,5 @@ DRI_CONF_SECTION_END - DRI_CONF_SECTION_MISCELLANEOUS +@@ -39,6 +39,6 @@ DRI_CONF_SECTION_MISCELLANEOUS DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false") DRI_CONF_GLSL_ZERO_INIT("false") + DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false") - DRI_CONF_ALLOW_RGB10_CONFIGS("true") + DRI_CONF_ALLOW_RGB10_CONFIGS("false") + DRI_CONF_ALLOW_FP16_CONFIGS("false") DRI_CONF_SECTION_END diff -Nru mesa-19.2.8/debian/patches/fix-ppc64el.patch mesa-20.0.8/debian/patches/fix-ppc64el.patch --- mesa-19.2.8/debian/patches/fix-ppc64el.patch 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/fix-ppc64el.patch 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,41 @@ +Description: Fix FTBFS on ppc64el +gnu++11 used to let mesa compile on ppc64el but with the use of C++14 +compilation now fails. +Let's not force gnu++11 and use defaults like on other arches but +fix the issue that happens then, that is, the collision of altivec and +c++ symbols. +For that we undefine bool, vector, pixel as advised by altivec.h . +Author: Frédéric Bonnard +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +--- a/meson.build ++++ b/meson.build +@@ -724,14 +724,6 @@ + dep_spirv_tools = null_dep + dep_llvmspirvlib = null_dep + endif +- +- if host_machine.cpu_family().startswith('ppc') and cpp.compiles(''' +- #if !defined(__VEC__) || !defined(__ALTIVEC__) +- #error "AltiVec not enabled" +- #endif''', +- name : 'Altivec') +- clover_cpp_std += ['cpp_std=gnu++11'] +- endif + else + dep_clc = null_dep + dep_spirv_tools = null_dep +--- a/include/CL/cl_platform.h ++++ b/include/CL/cl_platform.h +@@ -356,6 +356,11 @@ + /* Define basic vector types */ + #if defined( __VEC__ ) + #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ ++ #if defined(__ALTIVEC__) && !defined(__APPLE_ALTIVEC__) ++ #undef vector ++ #undef pixel ++ #undef bool ++ #endif + typedef __vector unsigned char __cl_uchar16; + typedef __vector signed char __cl_char16; + typedef __vector unsigned short __cl_ushort8; diff -Nru mesa-19.2.8/debian/patches/fix-python-shebang.diff mesa-20.0.8/debian/patches/fix-python-shebang.diff --- mesa-19.2.8/debian/patches/fix-python-shebang.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/fix-python-shebang.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,8 @@ +--- a/bin/symbols-check.py ++++ b/bin/symbols-check.py +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/env python3 + + import argparse + import os diff -Nru mesa-19.2.8/debian/patches/i965-sync-pciids.diff mesa-20.0.8/debian/patches/i965-sync-pciids.diff --- mesa-19.2.8/debian/patches/i965-sync-pciids.diff 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/i965-sync-pciids.diff 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ ---- a/include/pci_ids/i965_pci_ids.h -+++ b/include/pci_ids/i965_pci_ids.h -@@ -205,10 +205,13 @@ CHIPSET(0x9BC0, cfl_gt2, "Intel(R) UHD G - CHIPSET(0x9BC2, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BC4, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BC5, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -+CHIPSET(0x9BC6, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BC8, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BCA, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BCB, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x9BCC, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -+CHIPSET(0x9BE6, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -+CHIPSET(0x9BF6, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") - CHIPSET(0x5A49, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)") - CHIPSET(0x5A4A, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)") - CHIPSET(0x5A41, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)") -@@ -222,17 +225,17 @@ CHIPSET(0x5A51, cnl_5x8, "Intel(R) HD Gr - CHIPSET(0x5A52, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") - CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") - CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") --CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") --CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") --CHIPSET(0x8A53, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") --CHIPSET(0x8A54, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") --CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -+CHIPSET(0x8A51, icl_8x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 8x8 GT2)") -+CHIPSET(0x8A52, icl_8x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 8x8 GT2)") -+CHIPSET(0x8A53, icl_8x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 8x8 GT2)") -+CHIPSET(0x8A54, icl_6x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 6x8 GT1.5)") -+CHIPSET(0x8A56, icl_4x8, "Intel(R) UHD Graphics (Ice Lake 4x8 GT1)") - CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") --CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -+CHIPSET(0x8A58, icl_4x8, "Intel(R) UHD Graphics (Ice Lake 4x8 GT1)") - CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") --CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -+CHIPSET(0x8A5A, icl_6x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 6x8 GT1.5)") - CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") --CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -+CHIPSET(0x8A5C, icl_6x8, "Intel(R) Iris(R) Plus Graphics (Ice Lake 6x8 GT1.5)") - CHIPSET(0x8A5D, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") - CHIPSET(0x8A71, icl_1x8, "Intel(R) HD Graphics (Ice Lake 1x8 GT0.5)") - CHIPSET(0x4500, ehl_4x8, "Intel(R) HD Graphics (Elkhart Lake 4x8)") diff -Nru mesa-19.2.8/debian/patches/path_max.diff mesa-20.0.8/debian/patches/path_max.diff --- mesa-19.2.8/debian/patches/path_max.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/path_max.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,21 @@ +--- a/src/compiler/glsl/tests/cache_test.c 2020-03-18 22:24:18.000000000 +0100 ++++ b/src/compiler/glsl/tests/cache_test.c 2020-03-30 12:09:05.000000000 +0200 +@@ -132,8 +132,8 @@ + { + bool sub_dirs_created = false; + +- char buf[PATH_MAX]; +- if (getcwd(buf, PATH_MAX)) { ++ char *buf = getcwd(NULL, 0); ++ if (buf) { + char *full_path = NULL; + if (asprintf(&full_path, "%s%s", buf, ++cache_dir) != -1 ) { + struct stat sb; +@@ -142,6 +142,7 @@ + + free(full_path); + } ++ free(buf); + } + + expect_true(sub_dirs_created, "create sub dirs"); diff -Nru mesa-19.2.8/debian/patches/Revert-meson-drop-Wno-foo-bug-workaround.diff mesa-20.0.8/debian/patches/Revert-meson-drop-Wno-foo-bug-workaround.diff --- mesa-19.2.8/debian/patches/Revert-meson-drop-Wno-foo-bug-workaround.diff 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/Revert-meson-drop-Wno-foo-bug-workaround.diff 2020-07-13 13:21:27.000000000 +0000 @@ -8,66 +8,55 @@ meson.build | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) -diff --git a/meson.build b/meson.build -index db94f85f04e..4936f17e674 100644 --- a/meson.build +++ b/meson.build -@@ -867,8 +867,6 @@ foreach a : ['-Werror=implicit-function-declaration', - '-Werror=incompatible-pointer-types', - '-Werror=format', - '-Wformat-security', -- '-Wno-missing-field-initializers', -- '-Wno-format-truncation', - '-fno-math-errno', - '-fno-trapping-math', '-Qunused-arguments'] - if cc.has_argument(a) -@@ -876,6 +874,12 @@ foreach a : ['-Werror=implicit-function-declaration', - endif - endforeach +@@ -953,8 +953,6 @@ else + '-Werror=empty-body', + '-Werror=incompatible-pointer-types', + '-Werror=int-conversion', +- '-Wno-missing-field-initializers', +- '-Wno-format-truncation', + '-fno-math-errno', + '-fno-trapping-math', + '-Qunused-arguments', +@@ -969,12 +967,15 @@ else + endif + endforeach -+foreach a : ['missing-field-initializers', 'format-truncation'] -+ if cc.has_argument('-W' + a) -+ c_args += '-Wno-' + a -+ endif -+endforeach ++ foreach a : ['missing-field-initializers', 'format-truncation'] ++ if cc.has_argument('-W' + a) ++ c_args += '-Wno-' + a ++ endif ++ endforeach + - c_vis_args = [] - if cc.has_argument('-fvisibility=hidden') - c_vis_args += '-fvisibility=hidden' -@@ -886,9 +890,6 @@ cpp_args = [] - foreach a : ['-Werror=return-type', - '-Werror=format', - '-Wformat-security', -- '-Wno-non-virtual-dtor', -- '-Wno-missing-field-initializers', -- '-Wno-format-truncation', - '-fno-math-errno', '-fno-trapping-math', - '-Qunused-arguments'] - if cpp.has_argument(a) -@@ -896,11 +897,19 @@ foreach a : ['-Werror=return-type', - endif - endforeach + _trial = [ + '-Werror=return-type', + '-Werror=empty-body', +- '-Wno-non-virtual-dtor', +- '-Wno-missing-field-initializers', +- '-Wno-format-truncation', + '-fno-math-errno', + '-fno-trapping-math', + '-Qunused-arguments', +@@ -989,9 +990,18 @@ else + endif + endforeach -+# For some reason, the test for -Wno-foo always succeeds with gcc, even if the -+# option is not supported. Hence, check for -Wfoo instead. +- foreach a : ['-Wno-override-init', '-Wno-initializer-overrides'] +- if cc.has_argument(a) +- no_override_init_args += a ++ # For some reason, the test for -Wno-foo always succeeds with gcc, even if the ++ # option is not supported. Hence, check for -Wfoo instead. + -+foreach a : ['non-virtual-dtor', 'missing-field-initializers', 'format-truncation'] -+ if cpp.has_argument('-W' + a) -+ cpp_args += '-Wno-' + a -+ endif -+endforeach ++ foreach a : ['non-virtual-dtor', 'missing-field-initializers', 'format-truncation'] ++ if cpp.has_argument('-W' + a) ++ cpp_args += '-Wno-' + a ++ endif ++ endforeach + - no_override_init_args = [] --foreach a : ['-Wno-override-init', -- '-Wno-initializer-overrides'] -- if cc.has_argument(a) -- no_override_init_args += a -+foreach a : ['override-init', 'initializer-overrides'] -+ if cc.has_argument('-W' + a) -+ no_override_init_args += '-Wno-' + a - endif - endforeach ++ foreach a : ['no-override-init', 'no-initializer-overrides'] ++ if cc.has_argument('-W' + a) ++ no_override_init_args += '-Wno-' + a + endif + endforeach --- -2.20.1 - diff -Nru mesa-19.2.8/debian/patches/Revert-meson-revert-glvnd-workaround.patch mesa-20.0.8/debian/patches/Revert-meson-revert-glvnd-workaround.patch --- mesa-19.2.8/debian/patches/Revert-meson-revert-glvnd-workaround.patch 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/Revert-meson-revert-glvnd-workaround.patch 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,103 @@ +From 6840f6b5c1a8bab4959bb60000249753f2765850 Mon Sep 17 00:00:00 2001 +From: Timo Aaltonen +Date: Thu, 7 May 2020 14:15:11 +0300 +Subject: [PATCH] Revert "meson: revert glvnd workaround" + +This reverts commit 6e21dcc5a31634232660eff1a83052c2ebb4816c. +--- + include/meson.build | 2 +- + meson.build | 4 ++++ + src/egl/meson.build | 13 +++++++++++-- + src/meson.build | 14 ++++++++++++-- + 4 files changed, 28 insertions(+), 5 deletions(-) + +diff --git a/include/meson.build b/include/meson.build +index bddbd67d328..4d73aef9ce3 100644 +--- a/include/meson.build ++++ b/include/meson.build +@@ -22,7 +22,7 @@ inc_include = include_directories('.') + inc_d3d9 = include_directories('D3D9') + inc_haikugl = include_directories('HaikuGL') + +-if not with_glvnd ++if not glvnd_has_headers_and_pc_files + if with_gles1 or with_gles2 or with_opengl or with_egl + install_headers('KHR/khrplatform.h', subdir : 'KHR') + endif +diff --git a/meson.build b/meson.build +index c48945b3c48..879426d212d 100644 +--- a/meson.build ++++ b/meson.build +@@ -1469,8 +1469,12 @@ else + endif + + dep_glvnd = null_dep ++glvnd_has_headers_and_pc_files = false + if with_glvnd + dep_glvnd = dependency('libglvnd', version : '>= 1.2.0') ++ # GLVND before 1.2 was missing its pkg-config and header files, forcing every ++ # vendor to provide them and the distro maintainers to resolve the conflict. ++ glvnd_has_headers_and_pc_files = dep_glvnd.version().version_compare('>= 1.2.0') + pre_args += '-DUSE_LIBGLVND=1' + endif + +diff --git a/src/egl/meson.build b/src/egl/meson.build +index 12d74ec37fa..693de9c8c05 100644 +--- a/src/egl/meson.build ++++ b/src/egl/meson.build +@@ -174,12 +174,21 @@ libegl = shared_library( + version : egl_lib_version, + ) + +-if not with_glvnd ++# If using glvnd the pkg-config header should not point to EGL_mesa, it should ++# point to EGL. glvnd is only available on unix like platforms so adding -l ++# should be safe here ++if not with_glvnd or not glvnd_has_headers_and_pc_files ++ if not glvnd_has_headers_and_pc_files ++ _egl = '-L${libdir} -lEGL' ++ else ++ _egl = libegl ++ endif ++ + pkg.generate( + name : 'egl', + description : 'Mesa EGL Library', + version : meson.project_version(), +- libraries : libegl, ++ libraries : _egl, + libraries_private: gl_priv_libs, + requires_private : gl_priv_reqs, + extra_cflags : gl_pkgconfig_c_flags, +diff --git a/src/meson.build b/src/meson.build +index 53b999ad22a..1463d7b261f 100644 +--- a/src/meson.build ++++ b/src/meson.build +@@ -114,12 +114,22 @@ endif + + # This must be after at least mesa, glx, and gallium, since libgl will be + # defined in one of those subdirs depending on the glx provider. +-if with_glx != 'disabled' and not with_glvnd ++if with_glx != 'disabled' and (not with_glvnd or not glvnd_has_headers_and_pc_files) ++ # If using glvnd the pkg-config header should not point to GL_mesa, it should ++ # point to GL. glvnd is only available on unix like platforms so adding -l ++ # should be safe here ++ # TODO: in the glvnd case glvnd itself should really be providing this. ++ if not glvnd_has_headers_and_pc_files ++ _gl = '-L${libdir} -lGL' ++ else ++ _gl = libgl ++ endif ++ + pkg.generate( + name : 'gl', + description : 'Mesa OpenGL Library', + version : meson.project_version(), +- libraries : libgl, ++ libraries : _gl, + libraries_private : gl_priv_libs, + requires_private : gl_priv_reqs, + variables : ['glx_tls=yes'], +-- +2.25.1 + diff -Nru mesa-19.2.8/debian/patches/Revert-meson-Test-for-Wl-build-id-sha1.diff mesa-20.0.8/debian/patches/Revert-meson-Test-for-Wl-build-id-sha1.diff --- mesa-19.2.8/debian/patches/Revert-meson-Test-for-Wl-build-id-sha1.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/Revert-meson-Test-for-Wl-build-id-sha1.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,30 @@ +From dfed42b838cdb31041e16594a7c86558509ff5f4 Mon Sep 17 00:00:00 2001 +From: Timo Aaltonen +Date: Thu, 7 May 2020 13:42:00 +0300 +Subject: [PATCH] Revert "meson: Test for -Wl,--build-id=sha1" + +This reverts commit c0330461c9a8fcc86227489b35ff143e5d7ee2ab. +--- + meson.build | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/meson.build b/meson.build +index c48945b3c48..bea95b55fb7 100644 +--- a/meson.build ++++ b/meson.build +@@ -1230,8 +1230,10 @@ if cc.links('int main() { return 0; }', + name : 'dynamic-list') + with_ld_dynamic_list = true + endif +- +-ld_args_build_id = cc.get_supported_link_arguments('-Wl,--build-id=sha1') ++ld_args_build_id = [] ++if build_machine.system() != 'darwin' ++ ld_args_build_id += '-Wl,--build-id=sha1' ++endif + + # check for dl support + dep_dl = null_dep +-- +2.25.1 + diff -Nru mesa-19.2.8/debian/patches/Revert-meson-Use-dependency.partial_dependency.patch mesa-20.0.8/debian/patches/Revert-meson-Use-dependency.partial_dependency.patch --- mesa-19.2.8/debian/patches/Revert-meson-Use-dependency.partial_dependency.patch 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/Revert-meson-Use-dependency.partial_dependency.patch 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,72 @@ +From 12eae7c2b8f12a70063a49e1d1a09c68dae7d232 Mon Sep 17 00:00:00 2001 +From: Timo Aaltonen +Date: Mon, 15 Jun 2020 13:26:50 +0300 +Subject: [PATCH] Revert "meson: Use dependency.partial_dependency()" + +This reverts commit a2776c24c7bb3a203b94b4c8e31864263c63bfc4. +--- + .pick_status.json | 2 +- + meson.build | 13 ++++++++++--- + 2 files changed, 11 insertions(+), 4 deletions(-) + +diff --git a/.pick_status.json b/.pick_status.json +index ca9b2c0492c..2999c4b318e 100644 +--- a/.pick_status.json ++++ b/.pick_status.json +@@ -18067,7 +18067,7 @@ + "description": "meson: Use dependency.partial_dependency()", + "nominated": true, + "nomination_type": 1, +- "resolution": 1, ++ "resolution": 0, + "master_sha": null, + "because_sha": "53f9131205a63fa8b282ab2a7e96c48209447da0" + }, +diff --git a/meson.build b/meson.build +index eaaca7f13aa..d98cff3100d 100644 +--- a/meson.build ++++ b/meson.build +@@ -455,6 +455,8 @@ if with_dri + endif + endif + ++prog_pkgconfig = find_program('pkg-config') ++ + _vdpau = get_option('gallium-vdpau') + if not system_has_kms_drm + if _vdpau == 'true' +@@ -481,7 +483,9 @@ with_gallium_vdpau = false + if _vdpau != 'false' + dep_vdpau = dependency('vdpau', version : '>= 1.1', required : _vdpau == 'true') + if dep_vdpau.found() +- dep_vdpau = dep_vdpau.partial_dependency(compile_args : true) ++ dep_vdpau = declare_dependency( ++ compile_args : run_command(prog_pkgconfig, ['vdpau', '--cflags']).stdout().split() ++ ) + with_gallium_vdpau = true + endif + endif +@@ -631,7 +635,9 @@ dep_va = null_dep + if _va != 'false' + dep_va = dependency('libva', version : '>= 0.38.0', required : _va == 'true') + if dep_va.found() +- dep_va_headers = dep_va.partial_dependency(compile_args : true) ++ dep_va_headers = declare_dependency( ++ compile_args : run_command(prog_pkgconfig, ['libva', '--cflags']).stdout().split() ++ ) + with_gallium_va = true + endif + endif +@@ -1563,7 +1569,8 @@ if with_platform_wayland + dep_wayland_server = dependency('wayland-server', version : '>=1.11') + if with_egl + dep_wayland_egl = dependency('wayland-egl-backend', version : '>= 3') +- dep_wayland_egl_headers = dep_wayland_egl.partial_dependency(compile_args : true) ++ dep_wayland_egl_headers = declare_dependency( ++ compile_args : run_command(prog_pkgconfig, ['wayland-egl-backend', '--cflags']).stdout().split()) + endif + wayland_dmabuf_xml = join_paths( + dep_wl_protocols.get_pkgconfig_variable('pkgdatadir'), 'unstable', +-- +2.25.1 + diff -Nru mesa-19.2.8/debian/patches/revert-requiring-new-libglvnd.diff mesa-20.0.8/debian/patches/revert-requiring-new-libglvnd.diff --- mesa-19.2.8/debian/patches/revert-requiring-new-libglvnd.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/revert-requiring-new-libglvnd.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,11 @@ +--- a/meson.build ++++ b/meson.build +@@ -1483,7 +1483,7 @@ endif + dep_glvnd = null_dep + glvnd_has_headers_and_pc_files = false + if with_glvnd +- dep_glvnd = dependency('libglvnd', version : '>= 1.2.0') ++ dep_glvnd = dependency('libglvnd', version : '>= 0.2.0') + # GLVND before 1.2 was missing its pkg-config and header files, forcing every + # vendor to provide them and the distro maintainers to resolve the conflict. + glvnd_has_headers_and_pc_files = dep_glvnd.version().version_compare('>= 1.2.0') diff -Nru mesa-19.2.8/debian/patches/revert-set-full-thread-affinity.diff mesa-20.0.8/debian/patches/revert-set-full-thread-affinity.diff --- mesa-19.2.8/debian/patches/revert-set-full-thread-affinity.diff 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/revert-set-full-thread-affinity.diff 1970-01-01 00:00:00.000000000 +0000 @@ -1,73 +0,0 @@ -commit d72507cb400c2ef8940ee194e46818d9ebf4187f -Author: Timo Aaltonen -Date: Mon Mar 4 11:32:22 2019 +0200 - - Revert "util/u_queue: add UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY" - - This reverts commit d877451b48a59ab0f9a4210fc736f51da5851c9a. - ---- a/src/gallium/drivers/radeonsi/si_pipe.c -+++ b/src/gallium/drivers/radeonsi/si_pipe.c -@@ -1019,8 +1019,7 @@ radeonsi_screen_create_impl(struct radeo - - if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", - 64, num_comp_hi_threads, -- UTIL_QUEUE_INIT_RESIZE_IF_FULL | -- UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { -+ UTIL_QUEUE_INIT_RESIZE_IF_FULL)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); - return NULL; -@@ -1030,7 +1029,6 @@ radeonsi_screen_create_impl(struct radeo - "shlo", - 64, num_comp_lo_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | -- UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); ---- a/src/util/disk_cache.c -+++ b/src/util/disk_cache.c -@@ -377,8 +377,7 @@ disk_cache_create(const char *gpu_name, - */ - util_queue_init(&cache->cache_queue, "disk$", 32, 1, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | -- UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY | -- UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY); -+ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY); - - cache->path_init_failed = false; - ---- a/src/util/u_queue.c -+++ b/src/util/u_queue.c -@@ -241,20 +241,6 @@ util_queue_thread_func(void *input) - - free(input); - --#ifdef HAVE_PTHREAD_SETAFFINITY -- if (queue->flags & UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY) { -- /* Don't inherit the thread affinity from the parent thread. -- * Set the full mask. -- */ -- cpu_set_t cpuset; -- CPU_ZERO(&cpuset); -- for (unsigned i = 0; i < CPU_SETSIZE; i++) -- CPU_SET(i, &cpuset); -- -- pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); -- } --#endif -- - if (strlen(queue->name) > 0) { - char name[16]; - snprintf(name, sizeof(name), "%s%i", queue->name, thread_index); ---- a/src/util/u_queue.h -+++ b/src/util/u_queue.h -@@ -48,7 +48,6 @@ extern "C" { - - #define UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY (1 << 0) - #define UTIL_QUEUE_INIT_RESIZE_IF_FULL (1 << 1) --#define UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY (1 << 2) - - #if defined(__GNUC__) && defined(HAVE_LINUX_FUTEX_H) - #define UTIL_QUEUE_FENCE_FUTEX diff -Nru mesa-19.2.8/debian/patches/series mesa-20.0.8/debian/patches/series --- mesa-19.2.8/debian/patches/series 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/patches/series 2020-07-13 13:21:27.000000000 +0000 @@ -1,9 +1,15 @@ 07_gallium-fix-build-failure-on-powerpcspe.diff +fix-python-shebang.diff +path_max.diff +fix-ppc64el.patch +src_glx_dri_common.h.diff # Ubuntu patches. dont-enable-10bpc-by-default.diff -disable-intel-ccs-compression.diff -revert-set-full-thread-affinity.diff -i965-sync-pciids.diff Revert-meson-drop-Wno-foo-bug-workaround.diff Revert-meson-bump-required-version-to-0.46.diff +Revert-meson-revert-glvnd-workaround.patch +Revert-meson-Test-for-Wl-build-id-sha1.diff +revert-requiring-new-libglvnd.diff +build-glesv2-pc.diff +Revert-meson-Use-dependency.partial_dependency.patch diff -Nru mesa-19.2.8/debian/patches/src_glx_dri_common.h.diff mesa-20.0.8/debian/patches/src_glx_dri_common.h.diff --- mesa-19.2.8/debian/patches/src_glx_dri_common.h.diff 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/patches/src_glx_dri_common.h.diff 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1,13 @@ +--- a/src/glx/dri_common.h 2020-04-30 00:48:24.000000000 +0200 ++++ b/src/glx/dri_common.h 2020-05-10 15:22:44.000000000 +0200 +@@ -55,6 +55,10 @@ + + extern void driDestroyConfigs(const __DRIconfig **configs); + ++#ifndef __GLXDRIdrawable ++typedef struct __GLXDRIdrawableRec __GLXDRIdrawable; ++#endif ++ + extern __GLXDRIdrawable * + driFetchDrawable(struct glx_context *gc, GLXDrawable glxDrawable); + diff -Nru mesa-19.2.8/debian/rules mesa-20.0.8/debian/rules --- mesa-19.2.8/debian/rules 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/rules 2020-07-13 13:21:27.000000000 +0000 @@ -78,6 +78,7 @@ ifneq (,$(filter $(DEB_HOST_ARCH),amd64 i386 kfreebsd-amd64 kfreebsd-i386 x32)) DRI_DRIVERS += i915, i965, GALLIUM_DRIVERS += iris, + confflags_GALLIUM += -Dprefer-iris=false endif DRI_DRIVERS += r200, r100, @@ -85,7 +86,7 @@ # LLVM is required for building r300g, radeonsi and llvmpipe drivers. # It's also required for building OpenCL support. - ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64)) + ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 kfreebsd-amd64 kfreebsd-i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64)) GALLIUM_DRIVERS += radeonsi, swrast, confflags_GALLIUM += -Dllvm=true confflags_GALLIUM += -Dgallium-opencl=icd @@ -103,13 +104,13 @@ # radv needs LLVM and the Vulkan loader, so only build on the subset of # arches where we have LLVM enabled and where the Vulkan loader is built. - ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64)) + ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64)) VULKAN_DRIVERS += amd, endif # build vulkan overlay where Vulkan loader is available # needs meson 0.46 which 18.04 lacks -# ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32)) +# ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32)) # confflags_VULKAN += -Dvulkan-overlay-layer=true # endif @@ -138,6 +139,7 @@ -Dgallium-xvmc=false \ -Dgallium-omx=disabled \ -Db_ndebug=true \ + -Dbuild-tests=true \ $(confflags_DIRECT_RENDERING) \ $(confflags_GBM) \ $(confflags_DRI3) \ @@ -154,6 +156,7 @@ rm -rf $$(find -name Makefile.in) rm -rf bin/install-sh bin/ltmain.sh for file in debian/*.in; do rm -f $${file%%.in}; done + rm -rf src/amd/compiler/__pycache__/ src/amd/registers/__pycache__ rm -rf src/amd/vulkan/*.pyc src/amd/vulkan/__pycache__/ rm -rf src/compiler/nir/*.pyc src/compiler/nir/__pycache__/ rm -rf src/egl/generate/*.pyc src/egl/generate/__pycache__/ @@ -175,7 +178,10 @@ $${file} > $${file%%.in}; \ done +# some tests are expected to fail for now, drop this when upstream has +# better cross-build testing in place and expected failures fixed override_dh_auto_test: + -dh_auto_test allpkg = $(shell dh_listpackages -a) @@ -193,6 +199,9 @@ rm debian/tmp/usr/lib/*/libEGL_mesa.so rm debian/tmp/usr/lib/*/libGLX_mesa.so + # we only need the pkgconfig file + rm debian/tmp/usr/lib/*/libGLESv2* + # Copy the hardlinked *_dri.so correctly. install -m755 -d debian/libgl1-mesa-dri/usr/lib/${DEB_HOST_MULTIARCH}/dri/ mv debian/tmp/usr/lib/${DEB_HOST_MULTIARCH}/dri/*_dri.so \ diff -Nru mesa-19.2.8/debian/shlibs.local mesa-20.0.8/debian/shlibs.local --- mesa-19.2.8/debian/shlibs.local 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/debian/shlibs.local 2020-07-13 13:21:27.000000000 +0000 @@ -0,0 +1 @@ +libglapi 0 libglapi-mesa (= ${binary:Version}) diff -Nru mesa-19.2.8/debian/upstream/signing-key.asc mesa-20.0.8/debian/upstream/signing-key.asc --- mesa-19.2.8/debian/upstream/signing-key.asc 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/debian/upstream/signing-key.asc 2020-07-13 13:21:27.000000000 +0000 @@ -818,3 +818,21 @@ tgA= =37PC -----END PGP PUBLIC KEY BLOCK----- +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQENBF5i1J4BCACsKVBDxUpz0m03D6ojiR4zZTuLJuI8qrc5HivZiJOKBWvKUm8/ +SUSCCbJ5ka7zwfEQMepm0rdQfFj6BDqAqEAmG/WU9TTbNx3TBDtT3CBBXSDaiWjO +BFlglWKXJeUfHYpfIB1sC6aIIdSnPlU57Fs7jA9FtHbJM3SSwab/492udHZY/bKV +s53Z7KcZDKPMtuZCnGbOZ1+lu8EMHuw9TfqQZODUOvDPYjpzqZAjsTJwdNtOeyWv +dafED7PTpuod6eQnvgUKFlj6aqoZhZa9/hWdGDhexLzRGM4pwZcUMEb+1pd1TBko +wqXJbW3XLs5B/cGiEbqHyDVoaI6J/JleDY1RABEBAAG0IkVyaWMgRW5nZXN0cm9t +IDxlcmljQGVuZ2VzdHJvbS5jaD6JAU4EEwEIADgWIQRXVR3hW5aPY0HCSPaNjjGv +wyQopgUCXmLUngIbAwULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRCNjjGvwyQo +puKwCACTeD0Eh+M8IIEAuGVtPgy3jCDIoOnNg+QI6VAh9qNB4Yx0uPzvOwmvAODY +fUdPbBxn9CIU1X3buJmI1G75c2ZT+ZXJbkute6oif+f2E6vLe+Ox0GeGIBNQK4iZ ++pV0JPn3seqhDJAekRBWXNJszuVNxSDFsVi+qEoTPJZze7yOcc1QMcRsnJ+S7YFg +jf/5aRrqdnLxVbph53Vrawas3rQk97Ln+vNd/Ca1HMv8vXMPtEFLk5JQI+gmdVv3 +xcpz2ss3mMU5+YnY69T1EBemZIMcE17wPo7ROvT8x1fG2fnPwHqJieQFCMGKLW29 +5peUY2OiHdLgaepbqWrB2Xxy+BY+ +=rSFv +-----END PGP PUBLIC KEY BLOCK----- diff -Nru mesa-19.2.8/docs/bugs.html mesa-20.0.8/docs/bugs.html --- mesa-19.2.8/docs/bugs.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/bugs.html 2020-06-12 01:21:16.000000000 +0000 @@ -24,7 +24,7 @@

To file a Mesa bug, go to - + GitLab on freedesktop.org

diff -Nru mesa-19.2.8/docs/codingstyle.html mesa-20.0.8/docs/codingstyle.html --- mesa-19.2.8/docs/codingstyle.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/codingstyle.html 2020-06-12 01:21:16.000000000 +0000 @@ -41,11 +41,11 @@
  • Opening braces go on the same line as the if/for/while statement. For example:
    -   if (condition) {
    -      foo;
    -   } else {
    -      bar;
    -   }
    +if (condition) {
    +   foo;
    +} else {
    +   bar;
    +}
     
  • Put a space before/after operators. For example, a = b + c; @@ -53,7 +53,7 @@
  • This GNU indent command generally does the right thing for formatting:
    -   indent -br -i3 -npcs --no-tabs infile.c -o outfile.c
    +indent -br -i3 -npcs --no-tabs infile.c -o outfile.c
     
  • @@ -63,47 +63,47 @@

    Single-line comments:
    -   /* null-out pointer to prevent dangling reference below */
    -   bufferObj = NULL;
    +/* null-out pointer to prevent dangling reference below */
    +bufferObj = NULL;
     
    Or,
    -   bufferObj = NULL;  /* prevent dangling reference below */
    +bufferObj = NULL;  /* prevent dangling reference below */
     
    Multi-line comment:
    -   /* If this is a new buffer object id, or one which was generated but
    -    * never used before, allocate a buffer object now.
    -    */
    +/* If this is a new buffer object id, or one which was generated but
    + * never used before, allocate a buffer object now.
    + */
     
    We try to quote the OpenGL specification where prudent:
    -   /* Page 38 of the PDF of the OpenGL ES 3.0 spec says:
    -    *
    -    *     "An INVALID_OPERATION error is generated for any of the following
    -    *     conditions:
    -    *
    -    *     * <length> is zero."
    -    *
    -    * Additionally, page 94 of the PDF of the OpenGL 4.5 core spec
    -    * (30.10.2014) also says this, so it's no longer allowed for desktop GL,
    -    * either.
    -    */
    +/* Page 38 of the PDF of the OpenGL ES 3.0 spec says:
    + *
    + *     "An INVALID_OPERATION error is generated for any of the following
    + *     conditions:
    + *
    + *     * <length> is zero."
    + *
    + * Additionally, page 94 of the PDF of the OpenGL 4.5 core spec
    + * (30.10.2014) also says this, so it's no longer allowed for desktop GL,
    + * either.
    + */
     
    Function comment example:
    -   /**
    -    * Create and initialize a new buffer object.  Called via the
    -    * ctx->Driver.CreateObject() driver callback function.
    -    * \param  name  integer name of the object
    -    * \param  type  one of GL_FOO, GL_BAR, etc.
    -    * \return  pointer to new object or NULL if error
    -    */
    -   struct gl_object *
    -   _mesa_create_object(GLuint name, GLenum type)
    -   {
    -      /* function body */
    -   }
    +/**
    + * Create and initialize a new buffer object.  Called via the
    + * ctx->Driver.CreateObject() driver callback function.
    + * \param  name  integer name of the object
    + * \param  type  one of GL_FOO, GL_BAR, etc.
    + * \return  pointer to new object or NULL if error
    + */
    +struct gl_object *
    +_mesa_create_object(GLuint name, GLenum type)
    +{
    +   /* function body */
    +}
     
  • Put the function return type and qualifiers on one line and the function @@ -113,11 +113,11 @@
  • Function names follow various conventions depending on the type of function:
    -   glFooBar()       - a public GL entry point (in glapi_dispatch.c)
    -   _mesa_FooBar()   - the internal immediate mode function
    -   save_FooBar()    - retained mode (display list) function in dlist.c
    -   foo_bar()        - a static (private) function
    -   _mesa_foo_bar()  - an internal non-static Mesa function
    +glFooBar()       - a public GL entry point (in glapi_dispatch.c)
    +_mesa_FooBar()   - the internal immediate mode function
    +save_FooBar()    - retained mode (display list) function in dlist.c
    +foo_bar()        - a static (private) function
    +_mesa_foo_bar()  - an internal non-static Mesa function
     
  • Constants, macros and enum names are ALL_UPPERCASE, with _ diff -Nru mesa-19.2.8/docs/devinfo.html mesa-20.0.8/docs/devinfo.html --- mesa-19.2.8/docs/devinfo.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/devinfo.html 2020-06-12 01:21:16.000000000 +0000 @@ -77,9 +77,6 @@
  • - - - diff -Nru mesa-19.2.8/docs/dispatch.html mesa-20.0.8/docs/dispatch.html --- mesa-19.2.8/docs/dispatch.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/dispatch.html 2020-06-12 01:21:16.000000000 +0000 @@ -77,17 +77,17 @@

    This can be implemented in just a few lines of C code. The file src/mesa/glapi/glapitemp.h contains code very similar to this.

    -
    - - -
    +
    +
     void glVertex3f(GLfloat x, GLfloat y, GLfloat z)
     {
         const struct _glapi_table * const dispatch = GET_DISPATCH();
     
         (*dispatch->Vertex3f)(x, y, z);
    -}
    Sample dispatch function
    -
    +} + +
    Sample dispatch function
    +

    The problem with this simple implementation is the large amount of overhead that it adds to every GL function call.

    @@ -129,15 +129,14 @@ complex, but it avoids the expensive pthread_getspecific call in the common case.

    -
    - - -
    +
    +
     #define GET_DISPATCH() \
         (_glapi_Dispatch != NULL) \
             ? _glapi_Dispatch : pthread_getspecific(&_glapi_Dispatch_key)
    -
    Improved GET_DISPATCH Implementation
    -
    + +
    Improved GET_DISPATCH Implementation
    +

    3.2. ELF TLS

    @@ -154,16 +153,15 @@ properly declared, GET_DISPACH becomes a simple variable reference.

    -
    - - -
    +
    +
     extern __thread struct _glapi_table *_glapi_tls_Dispatch
         __attribute__((tls_model("initial-exec")));
     
     #define GET_DISPATCH() _glapi_tls_Dispatch
    -
    TLS GET_DISPATCH Implementation
    -
    + +
    TLS GET_DISPATCH Implementation
    +

    Use of this path is controlled by the preprocessor define USE_ELF_TLS. Any platform capable of using ELF TLS should use this @@ -215,13 +213,12 @@ selected based on the defined preprocessor variables. The assembly code then consists of a series of invocations of the macros such as: -

    - - -
    +
    +
     GL_STUB(Color3fv, _gloffset_Color3fv)
    -
    SPARC Assembly Implementation of glColor3fv
    -
    + +
    SPARC Assembly Implementation of glColor3fv
    +

    The benefit of this technique is that changes to the calling pattern (i.e., addition of a new dispatch table pointer access method) require fewer @@ -271,8 +268,6 @@ src/mesa/glapi/glapi.c just before glprocs.h is included.

    -

    4. Automatic Generation of Dispatch Stubs

    - diff -Nru mesa-19.2.8/docs/download.html mesa-20.0.8/docs/download.html --- mesa-19.2.8/docs/download.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/download.html 2020-06-12 01:21:16.000000000 +0000 @@ -19,10 +19,10 @@

    Downloading

    -Primary Mesa download site: -ftp.freedesktop.org (FTP) -or mesa.freedesktop.org -(HTTPS). +You can download the released versions of Mesa via +HTTPS +or +FTP.

    diff -Nru mesa-19.2.8/docs/envvars.html mesa-20.0.8/docs/envvars.html --- mesa-19.2.8/docs/envvars.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/envvars.html 2020-06-12 01:21:16.000000000 +0000 @@ -307,6 +307,8 @@

    disable instruction compaction
    nodualobj
    suppress generation of dual-object geometry shader code
    +
    nofc
    +
    disable fast clears
    norbc
    disable single sampled render buffer compression
    optimizer
    @@ -542,6 +544,231 @@ +

    RADV driver environment variables

    +
    +
    RADV_DEBUG
    +
    a comma-separated list of named flags, which do various things: +
    +
    allbos
    +
    force all allocated buffers to be referenced in submissions
    +
    allentrypoints
    +
    enable all device/instance entrypoints
    +
    checkir
    +
    validate the LLVM IR before LLVM compiles the shader
    +
    errors
    +
    display more info about errors
    +
    info
    +
    show GPU-related information
    +
    metashaders
    +
    dump internal meta shaders
    +
    nobinning
    +
    disable primitive binning
    +
    nocache
    +
    disable shaders cache
    +
    nocompute
    +
    disable compute queue
    +
    nodcc
    +
    disable Delta Color Compression (DCC) on images
    +
    nodynamicbounds
    +
    do not check OOB access for dynamic descriptors
    +
    nofastclears
    +
    disable fast color/depthstencil clears
    +
    nohiz
    +
    disable HIZ for depthstencil images
    +
    noibs
    +
    disable directly recording command buffers in GPU-visible memory
    +
    noloadstoreopt
    +
    disable LLVM SILoadStoreOptimizer pass
    +
    nomemorycache
    +
    disable memory shaders cache
    +
    nongg
    +
    disable NGG for GFX10+
    +
    nooutoforder
    +
    disable out-of-order rasterization
    +
    noshaderballot
    +
    disable shader ballot
    +
    nosisched
    +
    disable LLVM sisched experimental scheduler
    +
    nothreadllvm
    +
    disable LLVM threaded compilation
    +
    preoptir
    +
    dump LLVM IR before any optimizations
    +
    shaders
    +
    dump shaders
    +
    shaderstats
    +
    dump shader statistics
    +
    spirv
    +
    dump SPIR-V
    +
    startup
    +
    display info at startup
    +
    syncshaders
    +
    synchronize shaders after all draws/dispatches
    +
    vmfaults
    +
    check for VM memory faults via dmesg
    +
    zerovram
    +
    initialize all memory allocated in VRAM as zero
    +
    +
    +
    RADV_FORCE_FAMILY
    +
    force the driver to use a specific family eg. gfx900 (developers only)
    +
    RADV_PERFTEST
    +
    a comma-separated list of named flags, which do various things: +
    +
    aco
    +
    enable ACO experimental compiler
    +
    bolist
    +
    enable the global BO list
    +
    cswave32
    +
    enable wave32 for compute shaders (GFX10+)
    +
    dccmsaa
    +
    enable DCC for MSAA images
    +
    dfsm
    +
    enable dfsm
    +
    gewave32
    +
    enable wave32 for vertex/tess/geometry shaders (GFX10+)
    +
    localbos
    +
    enable local BOs
    +
    nobatchchain
    +
    disable chained submissions
    +
    pswave32
    +
    enable wave32 for pixel shaders (GFX10+)
    +
    shader_ballot
    +
    enable shader ballot
    +
    sisched
    +
    enable LLVM sisched experimental scheduler
    +
    tccompatcmask
    +
    enable TC-compat cmask for MSAA images
    +
    +
    +
    RADV_SECURE_COMPILE_THREADS
    +
    maximum number of secure compile threads (up to 32)
    +
    RADV_TRACE_FILE
    +
    generate cmdbuffer tracefiles when a GPU hang is detected
    +
    + +

    radeonsi driver environment variables

    +
    +
    AMD_DEBUG
    +
    a comma-separated list of named flags, which do various things:
    +
    +
    +

    Disable features / workaround flags (useful to diagnose an issue):

    +
    nodma
    +
    Disable SDMA
    +
    nodmaclear
    +
    Disable SDMA clears
    +
    nodmacopyimage
    +
    Disable SDMA image copies
    +
    zerovram
    +
    Clear VRAM allocations.
    +
    nodcc
    +
    Disable DCC.
    +
    nodccclear
    +
    Disable DCC fast clear.
    +
    nodccfb
    +
    Disable separate DCC on the main framebuffer
    +
    nodccmsaa
    +
    Disable DCC for MSAA
    +
    nodpbb
    +
    Disable DPBB.
    +
    nodfsm
    +
    Disable DFSM.
    +
    notiling
    +
    Disable tiling
    +
    nofmask
    +
    Disable MSAA compression
    +
    nohyperz
    +
    Disable Hyper-Z
    +
    norbplus
    +
    Disable RB+.
    +
    no2d
    +
    Disable 2D tiling
    +

    Info flags:

    +
    info
    +
    Print driver information
    +
    tex
    +
    Print texture info
    +
    compute
    +
    Print compute info
    +
    vm
    +
    Print virtual addresses when creating resources
    +

    Print shaders flags:

    +
    vs
    +
    Print vertex shaders
    +
    ps
    +
    Print pixel shaders
    +
    gs
    +
    Print geometry shaders
    +
    tcs
    +
    Print tessellation control shaders
    +
    tes
    +
    Print tessellation evaluation shaders
    +
    cs
    +
    Print compute shaders
    +
    noir
    +
    Don't print the LLVM IR
    +
    nonir
    +
    Don't print NIR when printing shaders
    +
    noasm
    +
    Don't print disassembled shaders
    +
    preoptir
    +
    Print the LLVM IR before initial optimizations
    +

    Shader compilation tuning flags:

    +
    sisched
    +
    Enable LLVM SI Machine Instruction Scheduler.
    +
    gisel
    +
    Enable LLVM global instruction selector.
    +
    w32ge
    +
    Use Wave32 for vertex, tessellation, and geometry shaders.
    +
    w32ps
    +
    Use Wave32 for pixel shaders.
    +
    w32cs
    +
    Use Wave32 for computes shaders.
    +
    w64ge
    +
    Use Wave64 for vertex, tessellation, and geometry shaders.
    +
    w64ps
    +
    Use Wave64 for pixel shaders.
    +
    w64cs
    +
    Use Wave64 for computes shaders.
    +
    checkir
    +
    Enable additional sanity checks on shader IR
    +
    mono
    +
    Use old-style monolithic shaders compiled on demand
    +
    nooptvariant
    +
    Disable compiling optimized shader variants.
    +

    Advanced usage flags:

    +
    forcedma
    +
    Use SDMA for all operations when possible.
    +
    nowc
    +
    Disable GTT write combining
    +
    check_vm
    +
    Check VM faults and dump debug info.
    +
    reserve_vmid
    +
    Force VMID reservation per context.
    +
    nogfx
    +
    Disable graphics. Only multimedia compute paths can be used.
    +
    nongg
    +
    Disable NGG and use the legacy pipeline.
    +
    nggc
    +
    Always use NGG culling even when it can hurt.
    +
    nonggc
    +
    Disable NGG culling.
    +
    alwayspd
    +
    Always enable the primitive discard compute shader.
    +
    pd
    +
    Enable the primitive discard compute shader for large draw calls.
    +
    nopd
    +
    Disable the primitive discard compute shader.
    +
    switch_on_eop
    +
    Program WD/IA to switch on end-of-packet.
    +
    nooutoforder
    +
    Disable out-of-order rasterization
    +
    dpbb
    +
    Enable DPBB.
    +
    dfsm
    +
    Enable DFSM.
    +
    +

    Other Gallium drivers have their own environment variables. These may change frequently so the source code should be consulted for details. diff -Nru mesa-19.2.8/docs/features.txt mesa-20.0.8/docs/features.txt --- mesa-19.2.8/docs/features.txt 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/features.txt 2020-06-12 01:21:16.000000000 +0000 @@ -118,19 +118,19 @@ - 'precise' qualifier DONE (softpipe) - Dynamically uniform sampler array indices DONE (softpipe) - Dynamically uniform UBO array indices DONE (freedreno, softpipe) - - Implicit signed -> unsigned conversions DONE (softpipe) - - Fused multiply-add DONE (softpipe) - - Packing/bitfield/conversion functions DONE (freedreno, softpipe) - - Enhanced textureGather DONE (freedreno, softpipe) - - Geometry shader instancing DONE (llvmpipe, softpipe) - - Geometry shader multiple streams DONE (softpipe) + - Implicit signed -> unsigned conversions DONE (softpipe, swr) + - Fused multiply-add DONE (softpipe, swr) + - Packing/bitfield/conversion functions DONE (freedreno, softpipe, swr) + - Enhanced textureGather DONE (freedreno, softpipe, swr) + - Geometry shader instancing DONE (llvmpipe, softpipe, swr) + - Geometry shader multiple streams DONE (softpipe, swr) - Enhanced per-sample shading DONE () - Interpolation functions DONE (softpipe) - New overload resolution rules DONE (softpipe) GL_ARB_gpu_shader_fp64 DONE (i965/gen7+, llvmpipe, softpipe, swr) GL_ARB_sample_shading DONE (freedreno/a6xx, i965/gen6+, nv50) GL_ARB_shader_subroutine DONE (freedreno, i965/gen6+, nv50, llvmpipe, softpipe, swr) - GL_ARB_tessellation_shader DONE (i965/gen7+) + GL_ARB_tessellation_shader DONE (i965/gen7+, swr) GL_ARB_texture_buffer_object_rgb32 DONE (freedreno, i965/gen6+, llvmpipe, softpipe, swr) GL_ARB_texture_cube_map_array DONE (i965/gen6+, nv50, llvmpipe, softpipe, swr) GL_ARB_texture_gather DONE (freedreno, i965/gen6+, nv50, llvmpipe, softpipe, swr) @@ -151,13 +151,13 @@ GL 4.2, GLSL 4.20 -- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl - GL_ARB_texture_compression_bptc DONE (freedreno, i965) + GL_ARB_texture_compression_bptc DONE (freedreno, i965, llvmpipe, softpipe, swr) GL_ARB_compressed_texture_pixel_storage DONE (all drivers) GL_ARB_shader_atomic_counters DONE (freedreno/a5xx+, i965, llvmpipe, softpipe) GL_ARB_texture_storage DONE (all drivers) GL_ARB_transform_feedback_instanced DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr) GL_ARB_base_instance DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr) - GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965, softpipe) + GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965, llvmpipe, softpipe) GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30) GL_ARB_shading_language_420pack DONE (all drivers that support GLSL 1.30) GL_ARB_shading_language_packing DONE (all drivers) @@ -170,18 +170,18 @@ GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30) GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30) GL_ARB_clear_buffer_object DONE (all drivers) - GL_ARB_compute_shader DONE (freedreno/a5xx+, i965, softpipe) + GL_ARB_compute_shader DONE (freedreno/a5xx+, i965, llvmpipe, softpipe) GL_ARB_copy_image DONE (i965, nv50, softpipe, llvmpipe, swr) GL_KHR_debug DONE (all drivers) GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL) - GL_ARB_fragment_layer_viewport DONE (i965, nv50, llvmpipe, softpipe) - GL_ARB_framebuffer_no_attachments DONE (freedreno, i965, softpipe) + GL_ARB_fragment_layer_viewport DONE (i965, nv50, llvmpipe, softpipe, swr) + GL_ARB_framebuffer_no_attachments DONE (freedreno, i965, llvmpipe, softpipe) GL_ARB_internalformat_query2 DONE (all drivers) GL_ARB_invalidate_subdata DONE (all drivers) GL_ARB_multi_draw_indirect DONE (freedreno, i965, llvmpipe, softpipe, swr) GL_ARB_program_interface_query DONE (all drivers) GL_ARB_robust_buffer_access_behavior DONE (i965) - GL_ARB_shader_image_size DONE (freedreno/a5xx+, i965, softpipe) + GL_ARB_shader_image_size DONE (freedreno/a5xx+, i965, llvmpipe, softpipe) GL_ARB_shader_storage_buffer_object DONE (freedreno/a5xx+, i965, llvmpipe, softpipe) GL_ARB_stencil_texturing DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr) GL_ARB_texture_buffer_range DONE (freedreno, nv50, i965, softpipe, llvmpipe, swr) @@ -204,7 +204,7 @@ - specified transform/feedback layout DONE - input/output block locations DONE GL_ARB_multi_bind DONE (all drivers) - GL_ARB_query_buffer_object DONE (i965/hsw+, virgl) + GL_ARB_query_buffer_object DONE (i965/hsw+, llvmpipe, virgl) GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, llvmpipe, softpipe, swr, virgl) GL_ARB_texture_stencil8 DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr, virgl) GL_ARB_vertex_type_10f_11f_11f_rev DONE (i965, nv50, llvmpipe, softpipe, swr, virgl) @@ -215,7 +215,7 @@ GL_ARB_clip_control DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr) GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr, virgl) GL_ARB_cull_distance DONE (i965, nv50, llvmpipe, softpipe, swr, virgl) - GL_ARB_derivative_control DONE (i965, nv50, softpipe, virgl) + GL_ARB_derivative_control DONE (i965, nv50, llvmpipe, softpipe, virgl) GL_ARB_direct_state_access DONE (all drivers) GL_ARB_get_texture_sub_image DONE (all drivers) GL_ARB_shader_texture_image_samples DONE (i965, nv50, virgl) @@ -224,18 +224,18 @@ GL_KHR_robustness DONE (freedreno, i965) GL_EXT_shader_integer_mix DONE (all drivers that support GLSL) -GL 4.6, GLSL 4.60 +GL 4.6, GLSL 4.60 -- all DONE: radeonsi - GL_ARB_gl_spirv in progress (Nicolai Hähnle, Ian Romanick) - GL_ARB_indirect_parameters DONE (i965/gen7+, nvc0, radeonsi, virgl) - GL_ARB_pipeline_statistics_query DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe, swr) - GL_ARB_polygon_offset_clamp DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, swr, virgl) - GL_ARB_shader_atomic_counter_ops DONE (freedreno/a5xx+, i965/gen7+, nvc0, r600, radeonsi, llvmpipe, softpipe, virgl) - GL_ARB_shader_draw_parameters DONE (i965, nvc0, radeonsi) - GL_ARB_shader_group_vote DONE (i965, nvc0, radeonsi) - GL_ARB_spirv_extensions in progress (Nicolai Hähnle, Ian Romanick) - GL_ARB_texture_filter_anisotropic DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, softpipe (*), llvmpipe (*)) - GL_ARB_transform_feedback_overflow_query DONE (i965/gen6+, nvc0, radeonsi, llvmpipe, softpipe, virgl) + GL_ARB_gl_spirv DONE (i965/gen7+) + GL_ARB_indirect_parameters DONE (i965/gen7+, nvc0, llvmpipe, virgl) + GL_ARB_pipeline_statistics_query DONE (i965, nvc0, r600, llvmpipe, softpipe, swr) + GL_ARB_polygon_offset_clamp DONE (freedreno, i965, nv50, nvc0, r600, llvmpipe, swr, virgl) + GL_ARB_shader_atomic_counter_ops DONE (freedreno/a5xx+, i965/gen7+, nvc0, r600, llvmpipe, softpipe, virgl) + GL_ARB_shader_draw_parameters DONE (i965, llvmpipe, nvc0) + GL_ARB_shader_group_vote DONE (i965, nvc0, llvmpipe) + GL_ARB_spirv_extensions DONE (i965/gen7+) + GL_ARB_texture_filter_anisotropic DONE (freedreno, i965, nv50, nvc0, r600, softpipe (*), llvmpipe (*)) + GL_ARB_transform_feedback_overflow_query DONE (i965/gen6+, nvc0, llvmpipe, softpipe, virgl) GL_KHR_no_error DONE (all drivers) (*) softpipe and llvmpipe advertise 16x anisotropy but simply ignore the setting @@ -244,14 +244,14 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi, virgl GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30) - GL_ARB_compute_shader DONE (freedreno/a5xx+, i965/gen7+, softpipe) + GL_ARB_compute_shader DONE (freedreno/a5xx+, i965/gen7+, llvmpipe, softpipe) GL_ARB_draw_indirect DONE (freedreno, i965/gen7+, llvmpipe, softpipe, swr) GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL) - GL_ARB_framebuffer_no_attachments DONE (freedreno, i965/gen7+, softpipe) + GL_ARB_framebuffer_no_attachments DONE (freedreno, i965/gen7+, llvmpipe, softpipe) GL_ARB_program_interface_query DONE (all drivers) GL_ARB_shader_atomic_counters DONE (freedreno/a5xx+, i965/gen7+, llvmpipe, softpipe) - GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965/gen7+, softpipe) - GL_ARB_shader_image_size DONE (freedreno/a5xx+, i965/gen7+, softpipe) + GL_ARB_shader_image_load_store DONE (freedreno/a5xx+, i965/gen7+, llvmpipe, softpipe) + GL_ARB_shader_image_size DONE (freedreno/a5xx+, i965/gen7+, llvmpipe, softpipe) GL_ARB_shader_storage_buffer_object DONE (freedreno/a5xx+, i965/gen7+, llvmpipe, softpipe) GL_ARB_shading_language_packing DONE (all drivers) GL_ARB_separate_shader_objects DONE (all drivers) @@ -311,6 +311,7 @@ GL_ARB_shader_clock DONE (i965/gen7+, nv50, nvc0, r600, radeonsi, virgl) GL_ARB_shader_stencil_export DONE (i965/gen9+, r600, radeonsi, softpipe, llvmpipe, swr, virgl) GL_ARB_shader_viewport_layer_array DONE (i965/gen6+, nvc0, radeonsi) + GL_ARB_shading_language_include DONE GL_ARB_sparse_buffer DONE (radeonsi/CIK+) GL_ARB_sparse_texture not started GL_ARB_sparse_texture2 not started @@ -347,54 +348,54 @@ GLX_ARB_robustness_share_group_isolation not started GL_EXT_direct_state_access subfeatures (in the spec order): - GL 1.1: Client commands not started - GL 1.0-1.3: Matrix and transpose matrix commands not started - GL 1.1-1.2: Texture commands not started - GL 1.2: 3D texture commands not started - GL 1.2.1: Multitexture commands not started - GL 1.2.1-3.0: Indexed texture commands not started - GL 1.2.1-3.0: Indexed generic queries not started - GL 1.2.1: EnableIndexed.. Get*Indexed not started - GL_ARB_vertex_program not started - GL 1.3: Compressed texture and multitexture commands not started - GL 1.5: Buffer commands not started - GL 2.0-2.1: Uniform and uniform matrix commands not started - GL_EXT_texture_buffer_object not started - GL_EXT_texture_integer not started - GL_EXT_gpu_shader4 not started - GL_EXT_gpu_program_parameters not started + GL 1.1: Client commands DONE + GL 1.0-1.3: Matrix and transpose matrix commands DONE + GL 1.1-1.2: Texture commands DONE + GL 1.2: 3D texture commands DONE + GL 1.2.1: Multitexture commands DONE + GL 1.2.1-3.0: Indexed texture commands DONE + GL 1.2.1-3.0: Indexed generic queries DONE + GL 1.2.1: EnableIndexed.. Get*Indexed DONE + GL_ARB_vertex_program DONE + GL 1.3: Compressed texture and multitexture commands DONE + GL 1.5: Buffer commands DONE + GL 2.0-2.1: Uniform and uniform matrix commands DONE + GL_EXT_texture_buffer_object DONE + GL_EXT_texture_integer DONE + GL_EXT_gpu_shader4 DONE + GL_EXT_gpu_program_parameters DONE GL_NV_gpu_program4 n/a GL_NV_framebuffer_multisample_coverage n/a - GL 3.0: Renderbuffer/framebuffer commands, Gen*Mipmap not started - GL 3.0: CopyBuffer command not started - GL_EXT_geometry_shader4 commands (expose in GL 3.2) not started + GL 3.0: Renderbuffer/framebuffer commands, Gen*Mipmap DONE + GL 3.0: CopyBuffer command DONE + GL_EXT_geometry_shader4 commands (expose in GL 3.2) DONE GL_NV_explicit_multisample n/a - GL 3.0: Vertex array/attrib/query/map commands not started - Matrix GL tokens not started + GL 3.0: Vertex array/attrib/query/map commands DONE + Matrix GL tokens DONE GL_EXT_direct_state_access additions from other extensions (complete list): GL_AMD_framebuffer_sample_positions n/a - GL_AMD_gpu_shader_int64 not started - GL_ARB_bindless_texture not started - GL_ARB_buffer_storage not started - GL_ARB_clear_buffer_object not started - GL_ARB_framebuffer_no_attachments not started - GL_ARB_gpu_shader_fp64 not started - GL_ARB_instanced_arrays not started - GL_ARB_internalformat_query2 not started + GL_AMD_gpu_shader_int64 n/a (not enabled in compat profile) + GL_ARB_bindless_texture DONE + GL_ARB_buffer_storage DONE + GL_ARB_clear_buffer_object DONE + GL_ARB_framebuffer_no_attachments DONE + GL_ARB_gpu_shader_fp64 DONE + GL_ARB_instanced_arrays DONE + GL_ARB_internalformat_query2 DONE GL_ARB_sparse_texture n/a - GL_ARB_sparse_buffer not started - GL_ARB_texture_buffer_range not started - GL_ARB_texture_storage not started - GL_ARB_texture_storage_multisample not started - GL_ARB_vertex_attrib_64bit not started - GL_ARB_vertex_attrib_binding not started - GL_EXT_buffer_storage not started - GL_EXT_external_buffer not started + GL_ARB_sparse_buffer DONE + GL_ARB_texture_buffer_range DONE + GL_ARB_texture_storage DONE + GL_ARB_texture_storage_multisample DONE + GL_ARB_vertex_attrib_64bit DONE + GL_ARB_vertex_attrib_binding DONE + GL_EXT_buffer_storage DONE + GL_EXT_external_buffer n/a GL_EXT_separate_shader_objects n/a GL_EXT_sparse_texture n/a GL_EXT_texture_storage n/a - GL_EXT_vertex_attrib_64bit not started + GL_EXT_vertex_attrib_64bit DONE GL_EXT_EGL_image_storage n/a GL_NV_bindless_texture n/a GL_NV_gpu_shader5 n/a @@ -408,7 +409,6 @@ GL_ARB_geometry_shader4 Superseded by GL 3.2 geometry shaders GL_ARB_matrix_palette Superseded by GL_ARB_vertex_program - GL_ARB_shading_language_include Not interesting GL_ARB_shadow_ambient Superseded by GL_ARB_fragment_program GL_ARB_vertex_blend Superseded by GL_ARB_vertex_program @@ -416,7 +416,7 @@ Vulkan 1.1 -- all DONE: anv, radv - VK_KHR_16bit_storage in progress (Alejandro) + VK_KHR_16bit_storage DONE (anv/gen8+, radv) VK_KHR_bind_memory2 DONE (anv, radv) VK_KHR_dedicated_allocation DONE (anv, radv) VK_KHR_descriptor_update_template DONE (anv, radv) @@ -435,18 +435,21 @@ VK_KHR_maintenance3 DONE (anv, radv) VK_KHR_multiview DONE (anv, radv) VK_KHR_relaxed_block_layout DONE (anv, radv) - VK_KHR_sampler_ycbcr_conversion DONE (anv) + VK_KHR_sampler_ycbcr_conversion DONE (anv, radv) VK_KHR_shader_draw_parameters DONE (anv, radv) VK_KHR_storage_buffer_storage_class DONE (anv, radv) VK_KHR_variable_pointers DONE (anv, radv) Khronos extensions that are not part of any Vulkan version: - VK_KHR_8bit_storage DONE (anv, radv) + + VK_KHR_8bit_storage DONE (anv/gen8+, radv) VK_KHR_android_surface not started VK_KHR_create_renderpass2 DONE (anv, radv) + VK_KHR_depth_stencil_resolve DONE (anv, radv) VK_KHR_display DONE (anv, radv) - VK_KHR_display_swapchain DONE (anv, radv) - VK_KHR_draw_indirect_count DONE (radv) + VK_KHR_display_swapchain not started + VK_KHR_draw_indirect_count DONE (anv, radv) + VK_KHR_driver_properties DONE (anv, radv) VK_KHR_external_fence_fd DONE (anv, radv) VK_KHR_external_fence_win32 not started VK_KHR_external_memory_fd DONE (anv, radv) @@ -456,13 +459,23 @@ VK_KHR_get_display_properties2 DONE (anv, radv) VK_KHR_get_surface_capabilities2 DONE (anv, radv) VK_KHR_image_format_list DONE (anv, radv) + VK_KHR_imageless_framebuffer DONE (anv, radv) VK_KHR_incremental_present DONE (anv, radv) VK_KHR_mir_surface not started + VK_KHR_pipeline_executable_properties DONE (anv, radv) VK_KHR_push_descriptor DONE (anv, radv) VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, radv) + VK_KHR_shader_atomic_int64 DONE (anv, radv) + VK_KHR_shader_float16_int8 DONE (anv/gen8+, radv) + VK_KHR_shader_float_controls DONE (anv/gen8+, radv) + VK_KHR_shader_subgroup_extended_types DONE (radv) VK_KHR_shared_presentable_image not started VK_KHR_surface DONE (anv, radv) + VK_KHR_surface_protected_capabilities DONE (anv, radv) VK_KHR_swapchain DONE (anv, radv) + VK_KHR_swapchain_mutable_format DONE (anv, radv) + VK_KHR_uniform_buffer_standard_layout DONE (anv, radv) + VK_KHR_vulkan_memory_model not started VK_KHR_wayland_surface DONE (anv, radv) VK_KHR_win32_keyed_mutex not started VK_KHR_win32_surface not started diff -Nru mesa-19.2.8/docs/helpwanted.html mesa-20.0.8/docs/helpwanted.html --- mesa-19.2.8/docs/helpwanted.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/helpwanted.html 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,7 @@ Just applying patches, testing and reporting back is helpful.

  • Driver debugging. -There are plenty of open bugs in the bug database. +There are plenty of open bugs in the bug database.
  • Remove aliasing warnings. Enable gcc's -Wstrict-aliasing=2 -fstrict-aliasing arguments, and @@ -47,7 +47,7 @@ Common To-Do lists:

    diff -Nru mesa-19.2.8/docs/index.html mesa-20.0.8/docs/index.html --- mesa-19.2.8/docs/index.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/index.html 2020-06-12 01:21:16.000000000 +0000 @@ -16,6 +16,43 @@

    News

    +

    January 28, 2020

    Mesa 19.3.3 is released. This is a bug fix release.

    January 9, 2020

    Mesa 19.3.2 is released. This is a bug fix release.

    December 18, 2019

    Mesa 19.2.8 is released. This is a bug fix release.

    December 18, 2019

    Mesa 19.3.1 is released. This is a bug fix release.

    December 12, 2019

    Mesa 19.3.0 is released. This is a new development release. See the release notes for mor information about this release.

    December 4, 2019

    Mesa 19.2.7 is released. This is a bug fix release.

    November 21, 2019

    Mesa 19.2.6 is released. This is a bug fix release.

    November 20, 2019

    Mesa 19.2.5 is released. This is a bug fix release.

    November 13, 2019

    Mesa 19.2.4 is released. This is an emergency bugfix release, all users of 19.2.3 are recomended to upgrade immediately.

    +

    November 6, 2019

    Mesa 19.2.3 is released. This is a bug fix release.

    October 24, 2019

    Mesa 19.2.2 is released. This is a bug fix release.

    October 21, 2019

    +

    +Mesa 19.1.8 is released. +This is a bug-fix release. +

    +

    +NOTE: It is anticipated that 19.1.8 will be the final release in the +19.1 series. Users of 19.1 are encouraged to migrate to the 19.2 +series in order to obtain future fixes. +

    + +

    October 9, 2019

    Mesa 19.2.1 is released. This is a bug fix release.

    September 25, 2019

    +

    +Mesa 19.2.0 is released. +This is a new development release. See the release notes for more +information about this release +

    + +

    September 17, 2019

    +

    +Mesa 19.1.7 is released. +This is a bug-fix release. +

    + +

    September 3, 2019

    +

    +Mesa 19.1.6 is released. +This is a bug-fix release. +

    + +

    August 23, 2019

    +

    +Mesa 19.1.5 is released. +This is a bug-fix release. +

    +

    August 7, 2019

    Mesa 19.1.4 is released. @@ -1603,7 +1640,7 @@

    April 4, 2007

    -Thomas Hellström of Tungsten Graphics has written a whitepaper +Thomas Hellström of Tungsten Graphics has written a whitepaper describing the new DRI memory management system.

    @@ -2001,7 +2038,7 @@ -

    November 12, 2003

    +

    November 12, 2003

    New Mesa 5.0.2 tarballs have been uploaded to SourceForge which fix a @@ -2614,7 +2651,7 @@ quake scene, you may want to try this out, as it contains some optimizations specifically in the Q3A rendering path. -

    May 13, 1999

    +

    May 13, 1999

    For those interested in the integration of Mesa into XFree86 4.0, Precision Insight has posted their lowlevel design documents at www.precisioninsight.com.

    @@ -2655,7 +2692,7 @@

    March 18, 1999

    The new webpages are now online. Enjoy, and let me know if you find any errors. -

    February 16, 1999

    +

    February 16, 1999

    SGI releases its GLX source code.

    @@ -2665,4 +2702,4 @@ - + \ No newline at end of file diff -Nru mesa-19.2.8/docs/install.html mesa-20.0.8/docs/install.html --- mesa-19.2.8/docs/install.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/install.html 2020-06-12 01:21:16.000000000 +0000 @@ -23,7 +23,6 @@
  • For DRI and hardware acceleration
  • Building with meson -
  • Building with autoconf (Linux/Unix/X11)
  • Building with SCons (Windows/Linux)
  • Building with AOSP (Android)
  • Library Information @@ -38,17 +37,15 @@

    Build system

      -
    • meson is required when building on *nix platforms. -
    • Autoconf was removed in 19.1.0, use meson instead -
    • SCons is required for building on -Windows and optional for Linux (it's an alternative to meson.) +
    • meson is required when building on *nix platforms and is supported on windows. +
    • SCons is an alternative for building on +Windows and Linux.
    • -
    • Android Build system when building as native Android component. Autoconf +
    • Android Build system when building as native Android component. Meson is used when when building ARC.
    -

    Compiler

    The following compilers are known to work, if you know of others or you're @@ -63,12 +60,6 @@

    Third party/extra tools.

    -

    -Note: These should not be required, when building from a release tarball. If -you think you've spotted a bug let developers know by filing a -bug report. -

    -
    • Python - Python is required. @@ -83,7 +74,9 @@ On Linux systems, flex and bison versions 2.5.35 and 2.4.1, respectively, (or later) should work. On Windows with MinGW, install flex and bison with: +

      mingw-get install msys-flex msys-bison
      +

      For MSVC on Windows, install Win flex-bison.

      @@ -114,9 +107,12 @@

      2. Building with meson

      +

      Meson >= 0.46.0 is required

      + +

      Meson is the latest build system in mesa, it is currently able to build for -*nix systems like Linux and BSD, and will be able to build for windows as well. +*nix systems like Linux and BSD, macOS, Haiku, and Windows.

      @@ -127,20 +123,22 @@ ninja -C builddir/ sudo ninja -C builddir/ install -

      -Please read the detailed meson instructions -for more information -

      -

      3. Building with autoconf (Linux/Unix/X11)

      +

      On windows you can also use the visual studio backend

      +
      +  meson builddir --backend=vs
      +  cd builddir
      +  msbuild mesa.sln /m
      +

      - Autoconf support was removed in Mesa 19.1.0. Please use meson instead. +Please read the detailed meson instructions +for more information

      -

      4. Building with SCons (Windows/Linux)

      +

      3. Building with SCons (Windows/Linux)

      To build Mesa with SCons on Linux or Windows do @@ -176,7 +174,7 @@ -

      5. Building with AOSP (Android)

      +

      4. Building with AOSP (Android)

      Currently one can build Mesa for Android as part of the AOSP project, yet @@ -195,7 +193,7 @@

      -

      6. Library Information

      +

      5. Library Information

      When compilation has finished, look in the top-level lib/ @@ -232,7 +230,7 @@

      -

      7. Building OpenGL programs with pkg-config

      +

      6. Building OpenGL programs with pkg-config

      Running ninja install will install package configuration files diff -Nru mesa-19.2.8/docs/intro.html mesa-20.0.8/docs/intro.html --- mesa-19.2.8/docs/intro.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/intro.html 2020-06-12 01:21:16.000000000 +0000 @@ -357,46 +357,46 @@

      • Texture mapping: -
          -
        • glAreTexturesResident -
        • glBindTexture -
        • glCopyTexImage1D -
        • glCopyTexImage2D -
        • glCopyTexSubImage1D -
        • glCopyTexSubImage2D -
        • glDeleteTextures -
        • glGenTextures -
        • glIsTexture -
        • glPrioritizeTextures -
        • glTexSubImage1D -
        • glTexSubImage2D -
        +
          +
        • glAreTexturesResident +
        • glBindTexture +
        • glCopyTexImage1D +
        • glCopyTexImage2D +
        • glCopyTexSubImage1D +
        • glCopyTexSubImage2D +
        • glDeleteTextures +
        • glGenTextures +
        • glIsTexture +
        • glPrioritizeTextures +
        • glTexSubImage1D +
        • glTexSubImage2D +
      • Vertex Arrays: -
          -
        • glArrayElement -
        • glColorPointer -
        • glDrawElements -
        • glEdgeFlagPointer -
        • glIndexPointer -
        • glInterleavedArrays -
        • glNormalPointer -
        • glTexCoordPointer -
        • glVertexPointer -
        +
          +
        • glArrayElement +
        • glColorPointer +
        • glDrawElements +
        • glEdgeFlagPointer +
        • glIndexPointer +
        • glInterleavedArrays +
        • glNormalPointer +
        • glTexCoordPointer +
        • glVertexPointer +
      • Client state management: -
          -
        • glDisableClientState -
        • glEnableClientState -
        • glPopClientAttrib -
        • glPushClientAttrib -
        +
          +
        • glDisableClientState +
        • glEnableClientState +
        • glPopClientAttrib +
        • glPushClientAttrib +
      • Misc: -
          -
        • glGetPointer -
        • glIndexub -
        • glIndexubv -
        • glPolygonOffset -
        +
          +
        • glGetPointer +
        • glIndexub +
        • glIndexubv +
        • glPolygonOffset +
      diff -Nru mesa-19.2.8/docs/license.html mesa-20.0.8/docs/license.html --- mesa-19.2.8/docs/license.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/license.html 2020-06-12 01:21:16.000000000 +0000 @@ -20,7 +20,7 @@

      Mesa is a 3-D graphics library with an API which is very similar to -that of OpenGL.* +that of OpenGL[1]. To the extent that Mesa utilizes the OpenGL command syntax or state machine, it is being used with authorization from Silicon Graphics, @@ -38,8 +38,8 @@

      -* OpenGL is a trademark of Silicon Graphics Incorporated. +[1]: OpenGL is a trademark of Silicon Graphics Incorporated.

      diff -Nru mesa-19.2.8/docs/llvmpipe.html mesa-20.0.8/docs/llvmpipe.html --- mesa-19.2.8/docs/llvmpipe.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/llvmpipe.html 2020-06-12 01:21:16.000000000 +0000 @@ -56,7 +56,7 @@ For Linux, on a recent Debian based distribution do:

      -     aptitude install llvm-dev
      +aptitude install llvm-dev
       

      If you want development snapshot builds of LLVM for Debian and derived @@ -68,7 +68,7 @@ For a RPM-based distribution do:

      -     yum install llvm-devel
      +yum install llvm-devel
       

      @@ -120,15 +120,15 @@ To build everything on Linux invoke scons as:

      -  scons build=debug libgl-xlib
      +scons build=debug libgl-xlib
       
      Alternatively, you can build it with meson with:
      -  mkdir build
      -  cd build
      -  meson -D glx=gallium-xlib -D gallium-drivers=swrast
      -  ninja
      +mkdir build
      +cd build
      +meson -D glx=gallium-xlib -D gallium-drivers=swrast
      +ninja
       
      but the rest of these instructions assume that scons is used. @@ -136,7 +136,7 @@ For Windows the procedure is similar except the target:
      -  scons platform=windows build=debug libgl-gdi
      +scons platform=windows build=debug libgl-gdi
       
      @@ -148,11 +148,11 @@ libGL.so into

      -  build/foo/gallium/targets/libgl-xlib/libGL.so
      +build/foo/gallium/targets/libgl-xlib/libGL.so
       
      or
      -  lib/gallium/libGL.so
      +lib/gallium/libGL.so
       

      To use it set the LD_LIBRARY_PATH environment variable @@ -206,7 +206,7 @@ To profile llvmpipe you should build as

      -  scons build=profile <same-as-before>
      +scons build=profile <same-as-before>
       

      @@ -221,8 +221,8 @@

      -	perf record -g /my/application
      -	perf report
      +perf record -g /my/application
      +perf report
       

      @@ -255,7 +255,7 @@ for later analysis, e.g.:

      -  build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
      +build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
       
      diff -Nru mesa-19.2.8/docs/mesa.css mesa-20.0.8/docs/mesa.css --- mesa-19.2.8/docs/mesa.css 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/mesa.css 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,20 @@ float: left; } +figure { + margin: 0.5em; + padding: 0.5em; + border: 1px solid #ccc; +} + +figure pre { + margin: 0; +} + +figure figcaption { + padding-top: 0.5em; +} + .content { position: absolute; left: 20em; diff -Nru mesa-19.2.8/docs/meson.html mesa-20.0.8/docs/meson.html --- mesa-19.2.8/docs/meson.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/meson.html 2020-06-12 01:21:16.000000000 +0000 @@ -26,14 +26,18 @@

      1. Introduction

      For general information about Meson see the -Meson website.

      +Meson website.

      Mesa's Meson build system is generally considered stable and ready for production.

      -

      The Meson build of Mesa is tested on Linux, macOS, Cygwin and Haiku, FreeBSD, +

      Mesa requires Meson >= 0.46.0 to build. + +

      The Meson build of Mesa is tested on Linux, macOS, Windows, Cygwin, Haiku, FreeBSD, DragonflyBSD, NetBSD, and should work on OpenBSD.

      +

      Unix-like OSes

      +

      If Meson is not already installed on your system, you can typically install it with your package installer. For example:

      @@ -43,9 +47,7 @@
       
       sudo dnf install meson   # Fedora
       
      - -

      Mesa requires Meson >= 0.46.0 to build. - +

      Some older versions of meson do not check that they are too old and will error out in odd ways.

      @@ -55,14 +57,37 @@ the ninja-build package.

      +

      Windows

      + +

      +You will need to install python3 and meson as a module using pip. This is +because we use python for generating code, and rely on external modules +(mako). You also need pkg-config (a hard dependency of meson), flex, and bison. + +The easiest way to install everything you need is with chocolatey. +

      +
      +choco install python3 winflexbison pkgconfiglite
      +
      +

      You can even use chocolatey to install mingw and ninja (ninja can be used with MSVC as well)

      +
      +choco install ninja mingw
      +
      +

      Then install meson using pip

      +
      +py -3 -m pip install meson mako
      +
      + +You may need to add the python3 scripts directory to your path for meson. +

      2. Basic Usage

      The meson program is used to configure the source directory and generates either a ninja build file or Visual Studio® build files. The latter must be enabled via the --backend switch, as ninja is the default -backend on all -operating systems. +backend on all operating systems.

      @@ -70,7 +95,7 @@ directory to put built and generated sources into. We'll call that directory "build" here. It's recommended to create a - + separate build directory for each configuration you might want to use.

      @@ -101,7 +126,7 @@ For now, we have a bin/meson-options.py script that prints the options for you. If that script doesn't work for some reason, you can always look in the - + meson_options.txt file at the root of the project.

      @@ -119,7 +144,7 @@

      Note that options taking lists (such as platforms) are -a bit +a bit more complicated, but the simplest form compatible with Mesa options is to use a comma to separate values (-D platforms=drm,wayland) and brackets to represent an empty list (-D platforms=[]). @@ -153,12 +178,32 @@ ninja -C build/ xmlpool-pot xmlpool-update-po xmlpool-gmo

      -

      3. Advanced Usage

      +

      Windows specific instructions

      -
      +

      +On windows you have a couple of choices for compilers. If you installed mingw +with chocolatey and want to use ninja you should be able to open any shell +and follow the instructions above. If you want to you MSVC, clang-cl, or ICL +(the Intel Compiler), read on. +

      +

      +Both ICL and MSVC come with shell environments, the easiest way to use meson +with these it to open a shell. For clang-cl you will need to open an MSVC +shell, and then override the compilers, either using a native file, or +with the CC and CXX environment variables. +

      +

      +All of these compilers are tested and work with ninja, but if you want visual +studio integration or you just like msbuild, passing +--backend=vs to meson will generate a visual studio solution. If +you want to use ICL or clang-cl with the vsbackend you will need meson 0.52.0 +or greater. Older versions always use the microsoft compiler. +

      + +

      3. Advanced Usage

      -
      Installation Location
      -
      +

      Installation Location

      Meson default to installing libGL.so in your system's main lib/ directory and DRI drivers to a dri/ subdirectory. @@ -180,10 +225,8 @@

      Meson also honors DESTDIR for installs.

      -
      -
      Compiler Options
      -
      +

      Compiler Options

      Meson supports the common CFLAGS, CXXFLAGS, etc. environment variables but their use is discouraged because of the many caveats in using them. @@ -199,11 +242,9 @@

       meson builddir/ -Dc_args=-fmax-errors=10 -Dcpp_args=-DMAGIC=123
       
      -
      -
      Compiler Specification
      -
      +

      Compiler Specification

      Meson supports the standard CC and CXX environment variables for changing the default compiler. Note that Meson does not allow @@ -224,16 +265,28 @@

      The default compilers depends on your operating system. Meson supports most of the popular compilers, a complete list is available -here. +here.

      -
      -
      LLVM
      -

      Meson includes upstream logic to wrap llvm-config using its standard +

      LLVM

      +

      Meson includes upstream logic to wrap llvm-config using its standard dependency interface. -

      +

      +

      +As of meson 0.51.0 meson can use cmake to find llvm (the cmake finder +was added in meson 0.49.0, but LLVM cannot be found until 0.51) Due to the +way LLVM implements its cmake finder it will only find static libraries, it +will never find libllvm.so. -

      +There is also a -Dcmake_module_path option in this meson version, +which points to the root of an alternative installation (the prefix). For +example: +

      +
      +meson builddir -Dcmake_module_path=/home/user/mycmake/prefix
      +
      + +

      As of meson 0.49.0 meson also has the concept of a "native file", these files provide information about the native build environment (as opposed @@ -243,18 +296,17 @@ custom-llvm.ini

      -    [binaries]
      -    llvm-config = '/usr/local/bin/llvm/llvm-config'
      +[binaries]
      +llvm-config = '/usr/local/bin/llvm/llvm-config'
       
      Then configure meson:
      -    meson builddir/ --native-file custom-llvm.ini
      +meson builddir/ --native-file custom-llvm.ini
       
      -
      -

      +

      Meson < 0.49 doesn't support native files, so to specify a custom llvm-config you need to modify your $PATH (or %PATH% on windows), which will be searched for @@ -264,9 +316,8 @@

       PATH=/path/to/folder/with/llvm-config:$PATH meson build
       
      -
      -

      +

      For selecting llvm-config for cross compiling a "cross file" should be used. It uses the same format as the native file above: @@ -274,30 +325,96 @@

      cross-llvm.ini

      -    [binaries]
      -    ...
      -    llvm-config = '/usr/lib/llvm-config-32'
      +[binaries]
      +...
      +llvm-config = '/usr/lib/llvm-config-32'
      +cmake = '/usr/bin/cmake-for-my-arch'
       
      +

      Obviously, only cmake or llvm-config is required.

      +

      Then configure meson:

      -    meson builddir/ --cross-file cross-llvm.ini
      +meson builddir/ --cross-file cross-llvm.ini
       
      See the Cross Compilation section for more information. -
      -
      PKG_CONFIG_PATH
      -

      The +

      On windows (and in other cases), using llvm-config or cmake may be +either undesirable or impossible. Meson's solution for this is a +wrap, in +this case a "binary wrap". Follow the steps below:

      +
        +
      • Install the binaries and headers into the $mesa_src/subprojects/llvm
      • +
      • Add a meson build.build file to that directory (more on that later)
      • +
      + +

      The wrap file must define the following:

      +
        +
      • dep_llvm: a declare_dependency() object with include_directories, dependencies, and version set)
      • +
      + +

      It may also define:

      +
        +
      • irbuilder_h: a files() object pointing to llvm/IR/IRBuilder.h (this is requred for SWR)
      • +
      • has_rtti: a bool that declares whether LLVM was built with RTTI. Defaults to true
      • +
      + +

      such a meson.build file might look like:

      +
      +project('llvm', ['cpp'])
      +
      +cpp = meson.get_compiler('cpp')
      +
      +_deps = []
      +_search = join_paths(meson.current_source_dir(), 'lib')
      +foreach d : ['libLLVMCodeGen', 'libLLVMScalarOpts', 'libLLVMAnalysis',
      +             'libLLVMTransformUtils', 'libLLVMCore', 'libLLVMX86CodeGen',
      +             'libLLVMSelectionDAG', 'libLLVMipo', 'libLLVMAsmPrinter',
      +             'libLLVMInstCombine', 'libLLVMInstrumentation', 'libLLVMMC',
      +             'libLLVMGlobalISel', 'libLLVMObjectYAML', 'libLLVMDebugInfoPDB',
      +             'libLLVMVectorize', 'libLLVMPasses', 'libLLVMSupport',
      +             'libLLVMLTO', 'libLLVMObject', 'libLLVMDebugInfoCodeView',
      +             'libLLVMDebugInfoDWARF', 'libLLVMOrcJIT', 'libLLVMProfileData',
      +             'libLLVMObjCARCOpts', 'libLLVMBitReader', 'libLLVMCoroutines',
      +             'libLLVMBitWriter', 'libLLVMRuntimeDyld', 'libLLVMMIRParser',
      +             'libLLVMX86Desc', 'libLLVMAsmParser', 'libLLVMTableGen',
      +             'libLLVMFuzzMutate', 'libLLVMLinker', 'libLLVMMCParser',
      +             'libLLVMExecutionEngine', 'libLLVMCoverage', 'libLLVMInterpreter',
      +             'libLLVMTarget', 'libLLVMX86AsmParser', 'libLLVMSymbolize',
      +             'libLLVMDebugInfoMSF', 'libLLVMMCJIT', 'libLLVMXRay',
      +             'libLLVMX86AsmPrinter', 'libLLVMX86Disassembler',
      +             'libLLVMMCDisassembler', 'libLLVMOption', 'libLLVMIRReader',
      +             'libLLVMLibDriver', 'libLLVMDlltoolDriver', 'libLLVMDemangle',
      +             'libLLVMBinaryFormat', 'libLLVMLineEditor',
      +             'libLLVMWindowsManifest', 'libLLVMX86Info', 'libLLVMX86Utils']
      +  _deps += cpp.find_library(d, dirs : _search)
      +endforeach
      +
      +dep_llvm = declare_dependency(
      +  include_directories : include_directories('include'),
      +  dependencies : _deps,
      +  version : '6.0.0',
      +)
      +
      +has_rtti = false
      +irbuilder_h = files('include/llvm/IR/IRBuilder.h')
      +
      + +

      It is very important that version is defined and is accurate, if it is not, +workarounds for the wrong version of LLVM might be used resulting in build +failures.

      + +

      PKG_CONFIG_PATH

      +

      The pkg-config utility is a hard requirement for configuring and building Mesa on Unix-like systems. It is used to search for external libraries on the system. This environment variable is used to control the search path for pkg-config. For instance, setting PKG_CONFIG_PATH=/usr/X11R6/lib/pkgconfig will search for package metadata in /usr/X11R6 before the standard directories.

      -
      -
      +

      Options

      One of the oddities of meson is that some options are different when passed to the meson than to meson configure. These options are diff -Nru mesa-19.2.8/docs/release-calendar.html mesa-20.0.8/docs/release-calendar.html --- mesa-19.2.8/docs/release-calendar.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/release-calendar.html 2020-06-12 01:21:16.000000000 +0000 @@ -41,8 +41,7 @@

      The way the release schedule works is explained here. -

      +

      Take a look here if you'd like to nominate a patch in the next stable release. @@ -60,73 +59,48 @@ Notes -19.1 -2019-08-20 -19.1.5 -Juan A. Suarez - - - -2019-09-03 -19.1.6 -Juan A. Suarez - - - -2019-09-17 -19.1.7 -Juan A. Suarez -Last planned 19.1.x release - - -19.2 -2019-08-06 -19.2.0-rc1 -Emil Velikov - - - -2019-08-13 -19.2.0-rc2 -Emil Velikov - +19.3 +2020-02-05 +19.3.4 +Dylan Baker + -2019-08-20 -19.2.0-rc3 -Emil Velikov - +2020-02-12 +19.3.5 +Dylan Baker + -2019-08-27 -19.2.0-rc4 -Emil Velikov -Last planned RC/Final release +2020-02-26 +19.3.6 +Dylan Baker +Last planned 19.3 release -19.3 -2019-10-15 -19.3.0-rc1 +20.0 +2020-01-29 +20.0.0-rc1 Dylan Baker - + -2019-10-22 -19.3.0-rc2 +2020-02-05 +20.0.0-rc2 Dylan Baker - + -2019-10-29 -19.3.0-rc3 +2020-02-12 +20.0.0-rc3 Dylan Baker - + -2019-11-05 -19.3.0-rc4 +2020-02-19 +20.0.0-rc4 Dylan Baker -Last planned RC/Final release +Or 20.0.0 final diff -Nru mesa-19.2.8/docs/releasing.html mesa-20.0.8/docs/releasing.html --- mesa-19.2.8/docs/releasing.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/releasing.html 2020-06-12 01:21:16.000000000 +0000 @@ -26,8 +26,7 @@

    • Pre-release announcement
    • Making a new release
    • Announce the release -
    • Update the mesa3d.org website -
    • Update Bugzilla +
    • Update Gitlab Issues
    @@ -47,10 +46,10 @@ For example:

    -	Mesa 10.1.0 - 10.1 branch, feature
    -	Mesa 10.1.4 - 10.1 branch, bugfix
    -	Mesa 12.0.0 - 12.0 branch, feature
    -	Mesa 12.0.2 - 12.0 branch, bugfix
    +Mesa 10.1.0 - 10.1 branch, feature
    +Mesa 10.1.4 - 10.1 branch, bugfix
    +Mesa 12.0.0 - 12.0 branch, feature
    +Mesa 12.0.2 - 12.0 branch, bugfix
     
    @@ -184,27 +183,27 @@

    -    git show b10859ec41d09c57663a258f43fe57c12332698e
    +git show b10859ec41d09c57663a258f43fe57c12332698e
     
    -    commit b10859ec41d09c57663a258f43fe57c12332698e
    -    Author: Jonas Pfeil <pfeiljonas@gmx.de>
    -    Date:   Wed Mar 1 18:11:10 2017 +0100
    +commit b10859ec41d09c57663a258f43fe57c12332698e
    +Author: Jonas Pfeil <pfeiljonas@gmx.de>
    +Date:   Wed Mar 1 18:11:10 2017 +0100
     
    -        ralloc: Make sure ralloc() allocations match malloc()'s alignment.
    +    ralloc: Make sure ralloc() allocations match malloc()'s alignment.
     
    -        The header of ralloc needs to be aligned, because the compiler assumes
    -        ...
    +    The header of ralloc needs to be aligned, because the compiler assumes
    +    ...
     
    -        (cherry picked from commit cd2b55e536dc806f9358f71db438dd9c246cdb14)
    +    (cherry picked from commit cd2b55e536dc806f9358f71db438dd9c246cdb14)
     
    -        Squashed with commit:
    +    Squashed with commit:
     
    -        ralloc: don't leave out the alignment factor
    +    ralloc: don't leave out the alignment factor
     
    -        Experimentation shows that without alignment factor gcc and clang choose
    -        ...
    +    Experimentation shows that without alignment factor gcc and clang choose
    +    ...
     
    -        (cherry picked from commit ff494fe999510ea40e3ed5827e7818550b6de126)
    +    (cherry picked from commit ff494fe999510ea40e3ed5827e7818550b6de126)
     

    Regression/functionality testing

    @@ -237,8 +236,8 @@ in the main repository under staging/X.Y. For example:

    -	staging/18.1 - WIP branch for the 18.1 series
    -	staging/18.2 - WIP branch for the 18.2 series
    +staging/18.1 - WIP branch for the 18.1 series
    +staging/18.2 - WIP branch for the 18.2 series
     

    @@ -272,15 +271,15 @@ To setup the branchpoint:

    -	git checkout master # make sure we're in master first
    -	git tag -s X.Y-branchpoint -m "Mesa X.Y branchpoint"
    -	git checkout -b X.Y
    -	git checkout master
    -	$EDITOR VERSION # bump the version number
    -	git commit -as
    -	cp docs/relnotes/{X.Y,X.Y+1}.html # copy/create relnotes template
    -	git commit -as
    -	git push origin X.Y-branchpoint X.Y
    +git checkout master # make sure we're in master first
    +git tag -s X.Y-branchpoint -m "Mesa X.Y branchpoint"
    +git checkout -b X.Y
    +git checkout master
    +$EDITOR VERSION # bump the version number
    +git commit -as
    +cp docs/relnotes/{X.Y,X.Y+1}.html # copy/create relnotes template
    +git commit -as
    +git push origin X.Y-branchpoint X.Y
     

    @@ -483,69 +482,55 @@

    -    __glxgears_cmd='glxgears 2>&1 | grep -v "configuration file"'
    -    __es2info_cmd='es2_info 2>&1 | egrep "GL_VERSION|GL_RENDERER|.*dri\.so"'
    -    __es2gears_cmd='es2gears_x11 2>&1 | grep -v "configuration file"'
    -    test "x$LD_LIBRARY_PATH" != 'x' && __old_ld="$LD_LIBRARY_PATH"
    -    export LD_LIBRARY_PATH=`pwd`/test/usr/local/lib/:"${__old_ld}"
    -    export LIBGL_DRIVERS_PATH=`pwd`/test/usr/local/lib/dri/
    -    export LIBGL_DEBUG=verbose
    -    eval $__glxinfo_cmd
    -    eval $__glxgears_cmd
    -    eval $__es2info_cmd
    -    eval $__es2gears_cmd
    -    export LIBGL_ALWAYS_SOFTWARE=true
    -    eval $__glxinfo_cmd
    -    eval $__glxgears_cmd
    -    eval $__es2info_cmd
    -    eval $__es2gears_cmd
    -    export LIBGL_ALWAYS_SOFTWARE=true
    -    export GALLIUM_DRIVER=softpipe
    -    eval $__glxinfo_cmd
    -    eval $__glxgears_cmd
    -    eval $__es2info_cmd
    -    eval $__es2gears_cmd
    -    # Smoke test DOTA2
    -    unset LD_LIBRARY_PATH
    -    test "x$__old_ld" != 'x' && export LD_LIBRARY_PATH="$__old_ld" && unset __old_ld
    -    unset LIBGL_DRIVERS_PATH
    -    unset LIBGL_DEBUG
    -    unset LIBGL_ALWAYS_SOFTWARE
    -    unset GALLIUM_DRIVER
    -    export VK_ICD_FILENAMES=`pwd`/test/usr/local/share/vulkan/icd.d/intel_icd.x86_64.json
    -    steam steam://rungameid/570  -vconsole -vulkan
    -    unset VK_ICD_FILENAMES
    +__glxgears_cmd='glxgears 2>&1 | grep -v "configuration file"'
    +__es2info_cmd='es2_info 2>&1 | egrep "GL_VERSION|GL_RENDERER|.*dri\.so"'
    +__es2gears_cmd='es2gears_x11 2>&1 | grep -v "configuration file"'
    +test "x$LD_LIBRARY_PATH" != 'x' && __old_ld="$LD_LIBRARY_PATH"
    +export LD_LIBRARY_PATH=`pwd`/test/usr/local/lib/:"${__old_ld}"
    +export LIBGL_DRIVERS_PATH=`pwd`/test/usr/local/lib/dri/
    +export LIBGL_DEBUG=verbose
    +eval $__glxinfo_cmd
    +eval $__glxgears_cmd
    +eval $__es2info_cmd
    +eval $__es2gears_cmd
    +export LIBGL_ALWAYS_SOFTWARE=true
    +eval $__glxinfo_cmd
    +eval $__glxgears_cmd
    +eval $__es2info_cmd
    +eval $__es2gears_cmd
    +export LIBGL_ALWAYS_SOFTWARE=true
    +export GALLIUM_DRIVER=softpipe
    +eval $__glxinfo_cmd
    +eval $__glxgears_cmd
    +eval $__es2info_cmd
    +eval $__es2gears_cmd
    +# Smoke test DOTA2
    +unset LD_LIBRARY_PATH
    +test "x$__old_ld" != 'x' && export LD_LIBRARY_PATH="$__old_ld" && unset __old_ld
    +unset LIBGL_DRIVERS_PATH
    +unset LIBGL_DEBUG
    +unset LIBGL_ALWAYS_SOFTWARE
    +unset GALLIUM_DRIVER
    +export VK_ICD_FILENAMES=`pwd`/test/usr/local/share/vulkan/icd.d/intel_icd.x86_64.json
    +steam steam://rungameid/570  -vconsole -vulkan
    +unset VK_ICD_FILENAMES
     
    -

    Update version in file VERSION

    - -

    -Increment the version contained in the file VERSION at Mesa's top-level, then -commit this change. -

    -

    Create release notes for the new release

    -Create a new file docs/relnotes/X.Y.Z.html, (follow the style of the previous -release notes). Note that the sha256sums section of the release notes should -be empty (TBD) at this point. -

    +The release notes are completely generated by the +bin/gen_release_notes.py script. Simply run this script before +bumping the version -

    -Two scripts are available to help generate portions of the release notes: +The only thing left to do is add the sha256 sums.

    -
    -	./bin/bugzilla_mesa.sh
    -	./bin/shortlog_mesa.sh
    -
    +

    Update version in file VERSION

    -The first script identifies commits that reference bugzilla bugs and obtains -the descriptions of those bugs from bugzilla. The second script generates a -log of all commits. In both cases, HTML-formatted lists are printed to stdout -to be included in the release notes. +Increment the version contained in the file VERSION at Mesa's top-level, then +commit this change.

    @@ -553,7 +538,7 @@

    -	git push origin HEAD
    +git push origin HEAD
     
    @@ -564,9 +549,9 @@

    -	# For the dist/distcheck, you may want to specify which LLVM to use:
    -	# export LLVM_CONFIG=/usr/lib/llvm-3.9/bin/llvm-config
    -	../relative/path/to/release.sh . # append --dist if you've already done distcheck above
    +# For the dist/distcheck, you may want to specify which LLVM to use:
    +# export LLVM_CONFIG=/usr/lib/llvm-3.9/bin/llvm-config
    +../relative/path/to/release.sh . # append --dist if you've already done distcheck above
     

    @@ -587,20 +572,18 @@

    -	git cherry-pick -x X.Y~1
    -	git cherry-pick -x X.Y
    +git cherry-pick -x X.Y~1
    +git cherry-pick -x X.Y
     
    -

    -Also, edit docs/relnotes.html to add a link to the new release notes, -edit docs/index.html to add a news entry and a note in case of the -last release in a series, and remove the version from -docs/release-calendar.html. Then commit and push: +

    Then run the ./bin/post_verison.py X.Y.Z, where X.Y.Z is the +version you just made. This will updated docs/relnotes.html and +docs/index.html. Remove docs/release-calendar.html. Then commit and push:

    -	git commit -as -m "docs: update calendar, add news item and link release notes for X.Y.Z"
    -	git push origin master X.Y
    +git commit -as -m "docs: update calendar, add news item and link release notes for X.Y.Z"
    +git push origin master X.Y
     
    @@ -616,15 +599,7 @@

    -

    Update the mesa3d.org website

    - -

    -As the hosting was moved to freedesktop, git hooks are deployed to update the -website. Manually check that it is updated 5-10 minutes after the final git push -

    - - -

    Update Bugzilla

    +

    Update gitlab issues

    Parse through the bugreports as listed in the docs/relnotes/X.Y.Z.html diff -Nru mesa-19.2.8/docs/relnotes/19.1.5.html mesa-20.0.8/docs/relnotes/19.1.5.html --- mesa-19.2.8/docs/relnotes/19.1.5.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.1.5.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,119 @@ + + + + + Mesa Release Notes + + + + +

    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.1.5 Release Notes / August 23, 2019

    + +

    +Mesa 19.1.5 is a bug fix release which fixes bugs found since the 19.1.4 release. +

    +

    +Mesa 19.1.5 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + +

    SHA256 checksums

    +
    +7b54e14e35c7251b171b4cf9d84cbc1d760eafe00132117db193454999cd6eb4  mesa-19.1.5.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + +
      + +
    • Bug 109630 - vkQuake flickering geometry under Intel
    • + +
    • Bug 110395 - Shadows are flickering in SuperTuxKart
    • + +
    • Bug 111113 - ANGLE BlitFramebufferTest.MultisampleDepthClear/ES3_OpenGL fails on Intel Ubuntu19.04
    • + +
    • Bug 111267 - [CM246] Flickering with multiple draw calls within the same graphics pipeline if a compute pipeline is present
    • + +
    + + +

    Changes

    + +

    Bas Nieuwenhuizen (4):

    +
      +
    • radv: Do non-uniform lowering before bool lowering.
    • +
    • ac/nir: Use correct cast for readfirstlane and ptrs.
    • +
    • radv: Avoid binning RAVEN hangs.
    • +
    • radv: Avoid VEGA/RAVEN scissor bug in binning.
    • +
    + +

    Danylo Piliaiev (1):

    +
      +
    • i965: Emit a dummy MEDIA_VFE_STATE before switching from GPGPU to 3D
    • +
    + +

    Eric Engestrom (1):

    +
      +
    • util: fix mem leak of program path
    • +
    + +

    Erik Faye-Lund (2):

    +
      +
    • gallium/dump: add missing query-type to short-list
    • +
    • gallium/dump: add missing query-type to short-list
    • +
    + +

    Greg V (2):

    +
      +
    • anv: remove unused Linux-specific include
    • +
    • intel/perf: use MAJOR_IN_SYSMACROS/MAJOR_IN_MKDEV
    • +
    + +

    Jason Ekstrand (1):

    +
      +
    • anv: Emit a dummy MEDIA_VFE_STATE before switching from GPGPU to 3D
    • +
    + +

    Juan A. Suarez Romero (3):

    +
      +
    • docs: add sha256 checksums for 19.1.4
    • +
    • cherry-ignore: panfrost: Make ctx->job useful
    • +
    • Update version to 19.1.5
    • +
    + +

    Marek Olšák (2):

    +
      +
    • radeonsi: disable SDMA image copies on dGPUs to fix corruption in games
    • +
    • radeonsi: fix an assertion failure: assert(!res->b.is_shared)
    • +
    + +

    Matt Turner (1):

    +
      +
    • meson: Test for program_invocation_name
    • +
    + +

    Sergii Romantsov (1):

    +
      +
    • i965/clear: clear_value better precision
    • +
    + + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.1.6.html mesa-20.0.8/docs/relnotes/19.1.6.html --- mesa-19.2.8/docs/relnotes/19.1.6.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.1.6.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,132 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.1.6 Release Notes / September 3, 2019

    + +

    +Mesa 19.1.6 is a bug fix release which fixes bugs found since the 19.1.5 release. +

    +

    +Mesa 19.1.6 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + +

    SHA256 checksums

    +
    +2a369b7b48545c6486e7e44913ad022daca097c8bd937bf30dcf3f17a94d3496  mesa-19.1.6.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + +
      + +
    • Bug 104395 - [CTS] GTF-GL46.gtf32.GL3Tests.packed_pixels.packed_pixels tests fail on 32bit Mesa
    • + +
    • Bug 111213 - VA-API nouveau SIGSEGV and asserts
    • + +
    • Bug 111241 - Shadertoy shader causing hang
    • + +
    • Bug 111411 - SPIR-V shader leads to GPU hang, sometimes making machine unstable
    • + +
    + + +

    Changes

    + +

    Andres Rodriguez (1):

    +
      +
    • radv: additional query fixes
    • +
    + +

    Daniel Schürmann (1):

    +
      +
    • nir/lcssa: handle deref instructions properly
    • +
    + +

    Danylo Piliaiev (1):

    +
      +
    • nir/loop_unroll: Prepare loop for unrolling in wrapper_unroll
    • +
    + +

    Ian Romanick (2):

    +
      +
    • nir/algrbraic: Don't optimize open-coded bitfield reverse when lowering is enabled
    • +
    • intel/compiler: Request bitfield_reverse lowering on pre-Gen7 hardware
    • +
    + +

    Ilia Mirkin (1):

    +
      +
    • gallium/vl: use compute preference for all multimedia, not just blit
    • +
    + +

    Jonas Ådahl (1):

    +
      +
    • wayland/egl: Ensure correct buffer size when allocating
    • +
    + +

    Juan A. Suarez Romero (6):

    +
      +
    • docs: add sha256 checksums for 19.1.5
    • +
    • cherry-ignore: add explicit 19.2 only nominations
    • +
    • cherry-ignore: iris: Replace devinfo->gen with GEN_GEN
    • +
    • cherry-ignore: iris: Update fast clear colors on Gen9 with direct immediate writes.
    • +
    • cherry-ignore: iris: Avoid unnecessary resolves on transfer maps
    • +
    • Update version to 19.1.6
    • +
    + +

    Kenneth Graunke (6):

    +
      +
    • iris: Fix broken aux.possible/sampler_usages bitmask handling
    • +
    • iris: Drop copy format hacks from copy region based transfer path.
    • +
    • iris: Fix large timeout handling in rel2abs()
    • +
    • util: Add a _mesa_i64roundevenf() helper.
    • +
    • mesa: Fix _mesa_float_to_unorm() on 32-bit systems.
    • +
    • intel/compiler: Fix src0/desc setter ordering
    • +
    + +

    Marek Olšák (1):

    +
      +
    • radeonsi: fix scratch buffer WAVESIZE setting leading to corruption
    • +
    + +

    Paulo Zanoni (1):

    +
      +
    • intel/fs: grab fail_msg from v32 instead of v16 when v32->run_cs fails
    • +
    + +

    Pierre-Eric Pelloux-Prayer (1):

    +
      +
    • glsl: replace 'x + (-x)' with constant 0
    • +
    + +

    Tapani Pälli (1):

    +
      +
    • egl: reset blob cache set/get functions on terminate
    • +
    + + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.1.7.html mesa-20.0.8/docs/relnotes/19.1.7.html --- mesa-19.2.8/docs/relnotes/19.1.7.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.1.7.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,157 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.1.7 Release Notes / September 17, 2019

    + +

    +Mesa 19.1.7 is a bug fix release which fixes bugs found since the 19.1.6 release. +

    +

    +Mesa 19.1.7 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.1.7 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksums

    +
    +e287920fdb38712a9fed448dc90b3ca95048c7face5db52e58361f8b6e0f3cd5  mesa-19.1.7.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + +
      + +
    • Bug 110814 - KWin compositor crashes on launch
    • + +
    • Bug 111069 - Assertion fails in nir_opt_remove_phis.c during compilation of SPIR-V shader
    • + +
    • Bug 111271 - Crash in eglMakeCurrent
    • + +
    • Bug 111401 - Vulkan overlay layer - async compute not supported, making overlay disappear in Doom
    • + +
    • Bug 111405 - Some infinite 'do{}while' loops lead mesa to an infinite compilation
    • + +
    • Bug 111467 - WOLF RPG Editor + Gallium Nine Standalone: Rendering issue when using Iris driver
    • + +
    • Bug 111552 - Geekbench 5.0 Vulkan compute benchmark fails on Anvil
    • + +
    + + +

    Changes

    + +

    Caio Marcelo de Oliveira Filho (1):

    +
      +
    • glsl/nir: Avoid overflow when setting max_uniform_location
    • +
    + +

    Connor Abbott (1):

    +
      +
    • radv: Call nir_propagate_invariant()
    • +
    + +

    Danylo Piliaiev (1):

    +
      +
    • tgsi_to_nir: Translate TGSI_INTERPOLATE_COLOR as INTERP_MODE_NONE
    • +
    + +

    Eric Engestrom (10):

    +
      +
    • ttn: fix 64-bit shift on 32-bit `1`
    • +
    • egl: fix deadlock in malloc error path
    • +
    • util/os_file: fix double-close()
    • +
    • anv: fix format string in error message
    • +
    • nir: fix memleak in error path
    • +
    • anv: add support for driconf
    • +
    • wsi: add minImageCount override
    • +
    • anv: add support for vk_x11_override_min_image_count
    • +
    • amd: move adaptive sync to performance section, as it is defined in xmlpool
    • +
    • radv: add support for vk_x11_override_min_image_count
    • +
    + +

    Erik Faye-Lund (2):

    +
      +
    • gallium/auxiliary/indices: consistently apply start only to input
    • +
    • util: fix SSE-version needed for double opcodes
    • +
    + +

    Hal Gentz (1):

    +
      +
    • glx: Fix SEGV due to dereferencing a NULL ptr from XCB-GLX.
    • +
    + +

    Jason Ekstrand (7):

    +
      +
    • Revert "intel/fs: Move the scalar-region conversion to the generator."
    • +
    • anv: Bump maxComputeWorkgroupSize
    • +
    • nir: Don't infinitely recurse in lower_ssa_defs_to_regs_block
    • +
    • nir: Add a block_is_unreachable helper
    • +
    • nir/repair_ssa: Repair dominance for unreachable blocks
    • +
    • nir/repair_ssa: Insert deref casts when needed
    • +
    • nir/dead_cf: Repair SSA if the pass makes progress
    • +
    + +

    Juan A. Suarez Romero (3):

    +
      +
    • docs: add sha256 checksums for 19.1.6
    • +
    • cherry-ignore: add explicit 19.2 only nominations
    • +
    • Update version to 19.1.7
    • +
    + +

    Kenneth Graunke (1):

    +
      +
    • gallium: Fix util_format_get_depth_only
    • +
    + +

    Lionel Landwerlin (1):

    +
      +
    • vulkan/overlay: bounce image back to present layout
    • +
    + +

    Mauro Rossi (3):

    +
      +
    • android: radv: fix necessary dependecies
    • +
    • android: amd/common: fix missing include path
    • +
    • android: anv: libmesa_vulkan_common: add libmesa_util static dependency
    • +
    + +

    Samuel Pitoiset (1):

    +
      +
    • radv: fix allocating number of user sgprs if streamout is used
    • +
    + +

    Sergii Romantsov (1):

    +
      +
    • intel/dri: finish proper glthread
    • +
    + + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.1.8.html mesa-20.0.8/docs/relnotes/19.1.8.html --- mesa-19.2.8/docs/relnotes/19.1.8.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.1.8.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,267 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.1.8 Release Notes / October 21, 2019

    + +

    +Mesa 19.1.8 is a bug fix release which fixes bugs found since the 19.1.7 release. +

    +

    +Mesa 19.1.8 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.1.8 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksums

    +
    +f0fe8289b7d147943bf2fc2147833254881577e8f9ed3d94ddb39e430e711725  mesa-19.1.8.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + +
      + +
    • Bug 111236 - VA-API radeonsi SIGSEGV __memmove_avx_unaligned
    • + +
    • Bug 111664 - [Bisected] Segmentation fault on FS shader compilation (mat4x3 * mat4x3)
    • + +
    • Issue #121 - Shared Memeory leakage in XCreateDrawable
    • + +
    • Issue #795 - Xorg does not render with mesa 19.1.7
    • + +
    • Issue #939 - Meson can't find 32-bit libXvMCW in non-standard path
    • + +
    • Issue #944 - Mesa doesn't build with current Scons version (3.1.0)
    • + +
    • Issue #1838 - Mesa installs gl.pc and egl.pc even with libglvnd >= 1.2.0
    • + +
    • Issue #1844 - libXvMC-1.0.12 breaks mesa build
    • + +
    • Issue #1869 - X server does not start with Mesa 19.2.0
    • + +
    • Issue #1872 - [bisected] piglit spec.arb_texture_view.bug-layers-image causes gpu hangs on IVB
    • + +
    • Issue #1878 - meson.build:1447:6: ERROR: Problem encountered: libdrm required for gallium video statetrackers when using x11
    • + +
    + + +

    Changes

    + +

    Adam Jackson (1):

    +
      +
    • docs: Update bug report URLs for the gitlab migration
    • +
    + +

    Alan Coopersmith (5):

    +
      +
    • c99_compat.h: Don't try to use 'restrict' in C++ code
    • +
    • util: Make Solaris implemention of p_atomic_add work with gcc
    • +
    • util: Workaround lack of flock on Solaris
    • +
    • meson: recognize "sunos" as the system name for Solaris
    • +
    • intel/common: include unistd.h for ioctl() prototype on Solaris
    • +
    + +

    Andreas Gottschling (1):

    +
      +
    • drisw: Fix shared memory leak on drawable resize
    • +
    + +

    Andres Gomez (3):

    +
      +
    • docs: Add the maximum implemented Vulkan API version in 19.1 rel notes
    • +
    • docs/features: Update VK_KHR_display_swapchain status
    • +
    • egl: Remove the 565 pbuffer-only EGL config under X11.
    • +
    + +

    Andrii Simiklit (1):

    +
      +
    • glsl: disallow incompatible matrices multiplication
    • +
    + +

    Arcady Goldmints-Orlov (1):

    +
      +
    • anv: fix descriptor limits on gen8
    • +
    + +

    Bas Nieuwenhuizen (2):

    +
      +
    • tu: Set up glsl types.
    • +
    • radv: Add workaround for hang in The Surge 2.
    • +
    + +

    Danylo Piliaiev (1):

    +
      +
    • st/nine: Ignore D3DSIO_RET if it is the last instruction in a shader
    • +
    + +

    Dylan Baker (5):

    +
      +
    • meson: fix logic for generating .pc files with old glvnd
    • +
    • meson: Try finding libxvmcw via pkg-config before using find_library
    • +
    • meson: Link xvmc with libxv
    • +
    • meson: gallium media state trackers require libdrm with x11
    • +
    • meson: Only error building gallium video without libdrm when the platform is drm
    • +
    + +

    Eric Engestrom (4):

    +
      +
    • gl: drop incorrect pkg-config file for glvnd
    • +
    • meson: re-add incorrect pkg-config files with GLVND for backward compatibility
    • +
    • util/anon_file: add missing #include
    • +
    • util/anon_file: const string param
    • +
    + +

    Erik Faye-Lund (1):

    +
      +
    • glsl: correct bitcast-helpers
    • +
    + +

    Greg V (1):

    +
      +
    • util: add anon_file.h for all memfd/temp file usage
    • +
    + +

    Haihao Xiang (1):

    +
      +
    • i965: support AYUV/XYUV for external import only
    • +
    + +

    Hal Gentz (1):

    +
      +
    • gallium/osmesa: Fix the inability to set no context as current.
    • +
    + +

    Jason Ekstrand (2):

    +
      +
    • nir/repair_ssa: Replace the unreachable check with the phi builder
    • +
    • intel/fs: Fix fs_inst::flags_read for ANY/ALL predicates
    • +
    + +

    Juan A. Suarez Romero (11):

    +
      +
    • docs: add sha256 checksums for 19.1.7
    • +
    • cherry-ignore: add explicit 19.2 only nominations
    • +
    • cherry-ignore: add explicit 19.3 only nominations
    • +
    • Revert "Revert "intel/fs: Move the scalar-region conversion to the generator.""
    • +
    • cherry-ignore: Revert "gallium: remove PIPE_CAP_TEXTURE_SHADOW_MAP"
    • +
    • bin/get-pick-list.sh: sha1 commits can be smaller than 8 chars
    • +
    • cherry-ignore: nir/opt_large_constants: Handle store writemasks
    • +
    • cherry-ignore: util: added missing headers in anon-file
    • +
    • cherry-ignore: radv: Fix condition for skipping the continue CS.
    • +
    • cherry-ignore: Revert "radv: disable viewport clamping even if FS doesn't write Z"
    • +
    • Update version to 19.1.8
    • +
    + +

    Ken Mays (1):

    +
      +
    • haiku: fix Mesa build
    • +
    + +

    Kenneth Graunke (4):

    +
      +
    • iris: Initialize ice->state.prim_mode to an invalid value
    • +
    • intel: Increase Gen11 compute shader scratch IDs to 64.
    • +
    • iris: Disable CCS_E for 32-bit floating point textures.
    • +
    • iris: Fix iris_rebind_buffer() for VBOs with non-zero offsets.
    • +
    + +

    Lionel Landwerlin (5):

    +
      +
    • anv: gem-stubs: return a valid fd got anv_gem_userptr()
    • +
    • intel: use proper label for Comet Lake skus
    • +
    • mesa: don't forget to clear _Layer field on texture unit
    • +
    • intel: fix subslice computation from topology data
    • +
    • intel/isl: Set null surface format to R32_UINT
    • +
    + +

    Marek Olšák (1):

    +
      +
    • gallium/vl: don't set PIPE_HANDLE_USAGE_EXPLICIT_FLUSH
    • +
    + +

    Matt Turner (1):

    +
      +
    • util: Drop preprocessor guards for glibc-2.12
    • +
    + +

    Michel Dänzer (1):

    +
      +
    • radeonsi: fix VAAPI segfault due to various bugs
    • +
    + +

    Michel Zou (2):

    +
      +
    • scons: add py3 support
    • +
    • scons: For MinGW use -posix flag.
    • +
    + +

    Paulo Zanoni (1):

    +
      +
    • intel/fs: fix SHADER_OPCODE_CLUSTER_BROADCAST for SIMD32
    • +
    + +

    Prodea Alexandru-Liviu (1):

    +
      +
    • scons/MSYS2-MinGW-W64: Fix build options defaults Signed-off-by: Prodea Alexandru-Liviu <liviuprodea@yahoo.com> Reviewed-by: Jose Fonseca <jfonseca@vmware.com> Cc: <mesa-stable@lists.freedesktop.org>
    • +
    + +

    Rhys Perry (2):

    +
      +
    • radv: always emit a position export in gs copy shaders
    • +
    • nir/opt_remove_phis: handle phis with no sources
    • +
    + +

    Samuel Iglesias Gonsálvez (1):

    +
      +
    • intel/nir: do not apply the fsin and fcos trig workarounds for consts
    • +
    + +

    Stephen Barber (1):

    +
      +
    • nouveau: add idep_nir_headers as dep for libnouveau
    • +
    + +

    Tapani Pälli (3):

    +
      +
    • iris: close screen fd on iris_destroy_screen
    • +
    • egl: check for NULL value like eglGetSyncAttribKHR does
    • +
    • util: fix os_create_anonymous_file on android
    • +
    + +

    pal1000 (2):

    +
      +
    • scons/windows: Support build with LLVM 9.
    • +
    • scons: Fix MSYS2 Mingw-w64 build.
    • +
    + + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.2.0.html mesa-20.0.8/docs/relnotes/19.2.0.html --- mesa-19.2.8/docs/relnotes/19.2.0.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.2.0.html 2020-06-12 01:21:16.000000000 +0000 @@ -14,7 +14,7 @@
    -

    Mesa 19.2.0 Release Notes / TBD

    +

    Mesa 19.2.0 Release Notes / 2019.09.25

    Mesa 19.2.0 is a new development release. People who are concerned @@ -37,7 +37,7 @@

    SHA256 checksums

    -TBD.
    +    b060caa2a00f856431160ff7377d0e8f58f2aa48c16ee5a9e265ebdccb10852a  mesa-19.2.0.tar.xz
     
    @@ -66,8 +66,6 @@
      -
        -
      • Bug 103674 - u_queue.c:173:7: error: implicit declaration of function 'timespec_get' is invalid in C99
      • Bug 104395 - [CTS] GTF-GL46.gtf32.GL3Tests.packed_pixels.packed_pixels tests fail on 32bit Mesa
      • @@ -119,12 +117,9 @@
      • Bug 111734 - Geometry shader with double interpolators fails in LLVM
      -

    Changes

    -
      -

      Adam Jackson (1):

      • docs: Update bug report URLs for the gitlab migration
      • @@ -444,8 +439,6 @@
      • travis: Fail build if any command in if statement fails.
      -
    -
    diff -Nru mesa-19.2.8/docs/relnotes/19.2.2.html mesa-20.0.8/docs/relnotes/19.2.2.html --- mesa-19.2.8/docs/relnotes/19.2.2.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.2.2.html 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@

    SHA256 checksum

    -TBD.
    +    7e4f0e2678bfcf3b94f533078b514f37943378a4a8604e477c888ec8a2904394  mesa-19.2.2.tar.xz
     
    diff -Nru mesa-19.2.8/docs/relnotes/19.2.3.html mesa-20.0.8/docs/relnotes/19.2.3.html --- mesa-19.2.8/docs/relnotes/19.2.3.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.2.3.html 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@

    SHA256 checksum

    -TBD.
    +    5ee6e42504fe41dcc9a6eba26982656a675b2550a640946f463927ed7f1c5047  mesa-19.2.3.tar.xz
     
    diff -Nru mesa-19.2.8/docs/relnotes/19.2.6.html mesa-20.0.8/docs/relnotes/19.2.6.html --- mesa-19.2.8/docs/relnotes/19.2.6.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.2.6.html 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@

    SHA256 checksum

    -TBD.
    +    9d7b24fa60c82db34788196450042a55ce6cb2d70c7a8d5c31401619b6907797  mesa-19.2.6.tar.xz
     
    diff -Nru mesa-19.2.8/docs/relnotes/19.2.8.html mesa-20.0.8/docs/relnotes/19.2.8.html --- mesa-19.2.8/docs/relnotes/19.2.8.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.2.8.html 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@

    SHA256 checksum

    -TBD.
    +    cffa8fa755c7422ce014c39ca0b770a092d9e0bbae537ceb2609c106916e5a57  mesa-19.2.8.tar.xz
     
    diff -Nru mesa-19.2.8/docs/relnotes/19.3.0.html mesa-20.0.8/docs/relnotes/19.3.0.html --- mesa-19.2.8/docs/relnotes/19.3.0.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.3.0.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,3138 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.3.0 Release Notes / 2019-12-12

    + +

    + Mesa 19.3.0 is a new development release. People who are concerned + with stability and reliability should stick with a previous release or + wait for Mesa 19.3.1. +

    +

    +Mesa 19.3.0 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.3.0 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +    5fa0e4e9dca79560f6882e362f9db36d81cf96da16cf6a84e0ada7466a99a5d7  mesa-19.3.0.tar.xz
    +
    + + +

    New features

    + +
      +
    • GL_ARB_gl_spirv on i965, iris. +
    • +
    • GL_ARB_spirv_extensions on i965, iris. +
    • +
    • GL_EXT_demote_to_helper_invocation on iris, i965. +
    • +
    • OpenGL 4.6 on i965, iris. +
    • +
    • EGL_EXT_image_flush_external +
    • +
    • VK_ANDROID_external_memory_android_hardware_buffer on RADV. +
    • +
    • VK_KHR_shader_clock on Intel, RADV. +
    • +
    • VK_KHR_shader_float_controls on Intel, RADV. +
    • +
    • VK_KHR_spirv_1_4 on Intel, RADV. +
    • +
    • VK_KHR_timeline_semaphore on RADV. +
    • +
    • VK_KHR_vulkan_memory_model on Intel. +
    • +
    • VK_EXT_shader_subgroup_ballot on Intel. +
    • +
    • VK_EXT_shader_subgroup_vote on Intel. +
    • +
    • VK_EXT_texel_buffer_alignment on RADV. +
    • +
    • VK_INTEL_performance_query on Intel. +
    • +
    • Meson support for windows using MSVC and MinGW +
    • +
    • scons has been deprecated for non windows +
    • +
    • Initial Intel gen12 (Tigerlake) support on anvil and iris +
    • +
    • New compiler backend "ACO" for RADV (RADV_PERFTEST=aco) +
    • +
    • VK_EXT_shader_demote_to_helper_invocation on RADV/ACO. +
    • +
    + +

    Bug fixes

    + +
      +
    • [RADV] The Dead Rising 4 is causing a GPU hang with LLVM backend
    • +
    • radeonsi: mpv --vo=vaapi incorrect rendering on gfx9+
    • +
    • NULL resource when playing VP9 video through VDPAU on RX 570
    • +
    • gnome-shell overview mode crash in recent mesa
    • +
    • radv/aco Jedi Fallen Order hair rendering buggy
    • +
    • [RADV] VK_KHR_timeline_semaphore balloons in runtime
    • +
    • Shadow of Mordor has randomly dancing black shadows on Talion's face
    • +
    • ld.lld: error: duplicate symbol (mesa-19.3.0-rc1)
    • +
    • triangle strip clipping with GL_FIRST_VERTEX_CONVENTION causes wrong vertex's attribute to be broadcasted for flat interpolation
    • +
    • [bisected][regression][g45,g965,ilk] piglit arb_fragment_program kil failures
    • +
    • textureSize(samplerExternalOES, int) missing in desktop mesa 19.1.7 implementation
    • +
    • HSW. Tropico 6 and SuperTuxKart have shadows flickering
    • +
    • glxgears segfaults on POWER / Xvnc
    • +
    • Objects leaving trails in Firefox with antialias and preserveDrawingBuffer in three.js WebGLRednerer with mesa 19.2
    • +
    • radv regression after 84d9551b232bdcead017b212cbb3e291486e698c: vk: error: failed to submit CS
    • +
    • Rename ACO README to README.md
    • +
    • Steam crash due to commit e137b3a9b71a2711c1f68c8a8b9c0a7407fbcc4b (bisected)
    • +
    • [Anv regression] SPIR-V abort in Aztec Ruins
    • +
    • FreeBSD does not have _GNU_SOURCE in util/strtod.c
    • +
    • glLinkProgram crash when using gcc-9 -O3 -flto due to use of uninitialised value
    • +
    • KeyError: 'force_scons':
    • +
    • link_shader and deserialize_glsl_program suddenly consume huge amount of RAM
    • +
    • build errors after "meson: add -Werror=empty-body to disallow `if(x);`"
    • +
    • performance regression in Heroes of the Storm with Mesa 19.1.1 & Polaris
    • +
    • Vulkan version of "Middle-earth: Shadow of Mordor" has graphics glitches on RADV driver (part 2)
    • +
    • swr/rasterizer/core/format_types.h:1183: undefined reference to `_mm256_cvtps_ph'
    • +
    • Meson: Building osmesa gallium and tests at the same time results in osmesa gallium build failure
    • +
    • Vulkan version of "Middle-earth: Shadow of Mordor" has graphics glitches on RADV driver
    • +
    • [amdgpu][Navi][llvm] Minimap problem in Nier Automata
    • +
    • [bisected] anon_inode:sync_file file descriptor leak
    • +
    • Cache meson packagecach in appveyor
    • +
    • Piglit tests regression in gallium drivers
    • +
    • Black ground in Dirt 4
    • +
    • Superbibles examples crashing Mesa drivers (radeonsi) and causing gpu reset
    • +
    • [CTS] dEQP-VK.graphicsfuzz.write-red-in-loop-nest crashes
    • +
    • mesa and libglvnd install the same headers
    • +
    • Multiple EGL displays with multiple window systems leads to a crash
    • +
    • Regression: Doom (2016) crashes on Mesa 19.2 and above and Radeon 380 with Vulkan (worked on Mesa 19.1)
    • +
    • Rocket League displays corruption when the game starts
    • +
    • drm.h:50:9: error: unknown type name 'uint8_t'
    • +
    • Mesa build breaks when only building radeonsi due to missing llvm coroutines symbols
    • +
    • radeonsi aborting in LLVM validation test in si_compile_tgsi_shader()
    • +
    • meson.build:1447:6: ERROR: Problem encountered: libdrm required for gallium video statetrackers when using x11
    • +
    • Mesa doesn't build with current Scons version (3.1.0)
    • +
    • libXvMC-1.0.12 breaks mesa build
    • +
    • Meson can't find 32-bit libXvMCW in non-standard path
    • +
    • Mesa installs gl.pc and egl.pc even with libglvnd >= 1.2.0
    • +
    + +

    Changes

    + +
      +

      Adam Jackson (44):

      +
    • glx: Whitespace cleanups
    • +
    • glx: Sync <GL/glxext.h> with Khronos
    • +
    • glx: Make __glXGetDrawableAttribute return true sometimes
    • +
    • glx: Unset the direct_support bit for GLX_EXT_import_context
    • +
    • Revert "glx: Unset the direct_support bit for GLX_EXT_import_context"
    • +
    • egl: Enable 10bpc EGLConfigs for platform_{device,surfaceless}
    • +
    • gallium/xlib: Fix an obvious thinko
    • +
    • mesa: Remove unused gl_config::indexBits
    • +
    • mesa: Eliminate gl_config::have{Accum,Depth,Stencil}Buffer
    • +
    • mesa: Eliminate gl_config::rgbMode
    • +
    • gallium: Require LLVM >= 3.4
    • +
    • gallium: Require LLVM >= 3.5
    • +
    • gallium: Require LLVM >= 3.6
    • +
    • gallium: Require LLVM >= 3.7
    • +
    • gallium: Require LLVM >= 3.8
    • +
    • gallium: Require LLVM >= 3.9
    • +
    • egl/dri2: Refuse to add EGLConfigs with no supported surface types
    • +
    • glx: Remove unused indirection for glx_context->fillImage
    • +
    • gallium: Restore VSX for llvm >= 4
    • +
    • ci: Run tests on i386 cross builds
    • +
    • gallium/xlib: Remove drawable caching from the MakeCurrent path
    • +
    • gallium/xlib: Remove MakeCurrent_PrevContext
    • +
    • gallium/xlib: Fix glXMakeCurrent(dpy, None, None, ctx)
    • +
    • docs: Update bug report URLs for the gitlab migration
    • +
    • glx: Avoid atof() when computing the server's GLX version
    • +
    • glx: Fix drawable lookup bugs in glXUseXFont
    • +
    • egl/wayland: Reindent the format table
    • +
    • egl/wayland: Add FP16 format support
    • +
    • egl/wayland: Implement getCapability for the dri2 and image loaders
    • +
    • egl/surfaceless: Add FP16 format support
    • +
    • libgbm: Wire up getCapability for the image loader
    • +
    • glx: Move vertex array protocol state into the indirect backend
    • +
    • glx: Lift sending the MakeCurrent request to top-level code
    • +
    • glx: Implement GLX_EXT_no_config_context
    • +
    • Revert "glx: Implement GLX_EXT_no_config_context"
    • +
    • Revert "glx: Lift sending the MakeCurrent request to top-level code"
    • +
    • drisw: Simplify GC setup
    • +
    • drisw: Fix and simplify drawable setup
    • +
    • glx: Log the filename of the drm device if we fail to open it
    • +
    • egl/dri2: Don't dlclose() the driver on dri2_load_driver_common failure
    • +
    • surfaceless: Support EGL_WL_bind_wayland_display
    • +
    • egl: Make native display detection work more than once
    • +
    • gallium/xlib: Fix xmesa drawable creation
    • +
    • gallium: Fix a bunch of undefined left-shifts in u_format_*
    • +

      +

      Alan Coopersmith (6):

      +
    • c99_compat.h: Don't try to use 'restrict' in C++ code
    • +
    • util: Make Solaris implemention of p_atomic_add work with gcc
    • +
    • util: Workaround lack of flock on Solaris
    • +
    • util: Solaris has linux-style pthread_setname_np
    • +
    • meson: recognize "sunos" as the system name for Solaris
    • +
    • intel/common: include unistd.h for ioctl() prototype on Solaris
    • +

      +

      Alejandro Piñeiro (5):

      +
    • i965: enable ARB_gl_spirv extension and ARB_spirv_extensions for gen7+
    • +
    • mesa/version: uncomment SPIR-V extensions
    • +
    • i965: Enable OpenGL 4.6 for Gen8+
    • +
    • v3d: take into account prim_counts_offset
    • +
    • v3d: adds an extra MOV for any sig.ld*
    • +

      +

      Alex Smith (1):

      +
    • radv: Change memory type order for GPUs without dedicated VRAM
    • +

      +

      Alexandros Frantzis (1):

      +
    • gitlab-ci: Update required libdrm version
    • +

      +

      Alyssa Rosenzweig (220):

      +
    • pan/decode: Eliminate DYN_MEMORY_PROP
    • +
    • pan/decode: Don't print MALI_DRAW_NONE
    • +
    • panfrost: Move pan_invocation to shared panfrost/
    • +
    • panfrost: Set workgroups z to 32 for non-instanced graphics
    • +
    • pan/decode: Don't print canonical workgroup encoding
    • +
    • panfrost: Implement workgroups_x_shift_2 quirk
    • +
    • pan/decode: Silence workgroups_x_shift_2
    • +
    • pan/decode: Fix missing NULL terminator
    • +
    • pan/decode: Don't print zero exception_status
    • +
    • pan/decode: Express tiler structures as offsets
    • +
    • pan/decode: Allow updating mmaps
    • +
    • pan/decode: Bounds check polygon list and tiler heap
    • +
    • panfrost: Move pan_tiler.c outside of Gallium
    • +
    • pan/decode: Verify and omit polygon size
    • +
    • pan/decode: Print "just right" count of texture pointers
    • +
    • panfrost: Remove DRY_RUN
    • +
    • panfrost: Correct polygon size computations
    • +
    • pan/decode: Check for a number of potential issues
    • +
    • pan/decode: Don't print unreferenced attribute memory
    • +
    • pan/decode: Add static bounds checking utility
    • +
    • pan/decode: Do not print uniform/buffers explicitly
    • +
    • pan/decode: Validate AFBC fields are zero when AFBC is disabled
    • +
    • pan/decode: Check for MFBD preload chicken bit
    • +
    • pan/decode: Mark tripped zeroes with XXX
    • +
    • pan/decode: Normalize case matching XXX format
    • +
    • pan/decode: Normalize final instances of XXX
    • +
    • panfrost: Fix scoreboarding with dependency on job #0
    • +
    • panfrost: Do not expose PIPE_CAP_TEXTURE_MIRROR_CLAMP
    • +
    • panfrost: Don't crash on GL_CLAMP
    • +
    • pan/decode: Guard attribute unknowns
    • +
    • panfrost: Don't trip the prefix magic field
    • +
    • pan/decode: Handle VARYING_DISCARD
    • +
    • pan/decode: Treat RESERVED swizzles as errors
    • +
    • pan/decode: Validate swizzles against format
    • +
    • pan/decode: Don't print the default swizzle
    • +
    • pan/decode: Use GLSL style formats/swizzles
    • +
    • pan/decode: Guard texture unknowns as zero trips
    • +
    • pan/decode: Break out pandecode_texture function
    • +
    • pan/decode: Validate texture dimensionality
    • +
    • panfrost: nr_mipmap_levels -> levels
    • +
    • panfrost: Remove ancient TODO
    • +
    • pan/decode: Pretty-print sRGB format
    • +
    • panfrost: Break up usage2 field
    • +
    • pan/decode: Use concise texture printing
    • +
    • pan/decode: Include address in union mali_attr
    • +
    • pan/decode: Validate attribute/varying buffer pointer
    • +
    • pan/decode: Cleanup mali_attr printing
    • +
    • pan/midgard: Free liveness info
    • +
    • pan/midgard: Allocate `dependencies` on stack
    • +
    • pan/decode: Don't leak FBD pointer
    • +
    • pan/decode: Remove all_zero
    • +
    • pan/bifrost: Avoid buffer overflow in disassembler
    • +
    • pan/midgard: Represent unused nodes by ~0
    • +
    • pan/midgard: Reorder bits check to fix 8-bit masks
    • +
    • pan/midgard: Simplify contradictory check.
    • +
    • panfrost: Don't check reads_point_coord
    • +
    • pan/midgard: Mark fallthrough explicitly
    • +
    • panfrost: Pay attention to framebuffer dimension sign
    • +
    • panfrost: Clarify intention with PIPE_SWIZZLE_X check
    • +
    • panfrost: Prevent potential integer overflow in instancing
    • +
    • panfrost: Hoist job != NULL check
    • +
    • panfrost: Hoist bo != NULL check before dereference
    • +
    • panfrost: Fix missing ret assignment in DRM code
    • +
    • pan/bifrost: Correct file size signedness
    • +
    • panfrost: Guard against NULL rasterizer explicitly
    • +
    • panfrost: Pass stream_output_info by reference
    • +
    • pan/midgard: Breakout texture reg select printer
    • +
    • pan/midgard: Identify and disassemble indirect texture/sampler
    • +
    • panfrost: Don't bail on PIPE_BUFFER
    • +
    • panfrost: Implement depth range clipping
    • +
    • panfrost: Fix PIPE_BUFFER spacing
    • +
    • pan/midgard,bifrost: Expand nir_const_load_to_arr
    • +
    • nir: Remove nir_const_load_to_arr
    • +
    • pan/decode: Hoist shader-db stats to shared decode
    • +
    • pan/midgard: Sketch static analysis to uniform count
    • +
    • pan/midgard: Compute work_count via writes
    • +
    • pan/midgard: Analyze simple loads/store
    • +
    • pan/midgard: Explain ffma
    • +
    • pan/midgard: Disassemble integer constants in hex
    • +
    • pan/decode: Remove mali_attr(_meta) framing
    • +
    • pan/decode: Removing uniform buffer framing
    • +
    • pan/decode: Eliminate non-FBD dumped case
    • +
    • pan/decode: Validate MFBD tags
    • +
    • pan/decode: Validate and simplify FRAGMENT payloads
    • +
    • pan/decode: Validate blend shaders don't access I/O
    • +
    • pan/decode: Fix uniform printing
    • +
    • pan/decode: Promote <no shader> to an error
    • +
    • pan/decode: Disassemble before printing shader descriptor
    • +
    • pan/decode: Validate mali_shader_meta stats
    • +
    • pan/decode: Validate, but do not print, index buffer
    • +
    • pan/decode: Downgrade shader property mismatch to warning
    • +
    • pan/decode: Decode actual varying_meta address
    • +
    • pan/decode: Print stub for uniforms
    • +
    • pan/decode: Decouple attribute/meta printing
    • +
    • pan/decode: Remove size/stride divisibility check
    • +
    • pan/decode: Handle special varyings
    • +
    • panfrost: Remove vertex buffer offset from its size
    • +
    • panfrost: Implement gl_FragCoord correctly
    • +
    • pan/midgard: Fix writeout combining
    • +
    • pan/midgard: Analyze helper invocations
    • +
    • pan/decode: Validate and quiet helper invocation flag
    • +
    • pan/midgard, bifrost: Set lower_fdph = true
    • +
    • pan/midgard: Switch constants to uint32
    • +
    • pan/midgard: Add imov->fmov optimization
    • +
    • pan/midgard: Fold ssa_args into midgard_instruction
    • +
    • pan/midgard: Fix invert fusing with r26
    • +
    • freedreno/ir3: Link directly to Sethi-Ullman paper
    • +
    • pan/midgard: Count shader-db stats by bundled instructions
    • +
    • pan/midgard: Factor out mir_is_scalar
    • +
    • pan/midgard: Extract instruction sizing helper
    • +
    • pan/midgard: Expose mir_get/set_swizzle
    • +
    • pan/midgard: Add OP_IS_CSEL_V helper
    • +
    • pan/midgard: Fix corner case in RA
    • +
    • pan/midgard: Add post-schedule iteration helpers
    • +
    • pan/midgard: Include condition in branch->src[0]
    • +
    • pan/midgard: Document Midgard scheduling requirements
    • +
    • pan/midgard: Ensure fragment writeout is in the final block
    • +
    • pan/midgard: Track csel swizzle
    • +
    • pan/midgard: Add mir_insert_instruction*scheduled helpers
    • +
    • pan/midgard: csel_swizzle with mir get swizzle
    • +
    • pan/midgard: Extend mir_special_index to writeout
    • +
    • pan/midgard: Improve mir_mask_of_read_components
    • +
    • pan/midgard: Allow NULL argument in mir_has_arg
    • +
    • pan/midgard: Track shader quadword count while scheduling
    • +
    • pan/midgard: Add scheduling barriers
    • +
    • pan/midgard: Cleanup fragment writeout branch
    • +
    • pan/midgard: Remove texture_index
    • +
    • pan/midgard: Print branches in MIR
    • +
    • pan/midgard: Print MIR by the bundle
    • +
    • pan/midgard: Fix misc. RA issues
    • +
    • pan/midgard: Do not propagate swizzles into writeout
    • +
    • pan/midgard: Handle fragment writeout in RA
    • +
    • pan/midgard: Schedule before RA
    • +
    • pan/midgard: Remove mir_opt_post_move_eliminate
    • +
    • pan/midgard: Use shared psiz clamp pass
    • +
    • pan/decode: Fix uninitialized variables
    • +
    • pan/decode: Use %zu instead of %d
    • +
    • pan/decode: Use portable format specifier for 64-bit
    • +
    • pan/decode: Add missing format specifier
    • +
    • pan/midgard: Correct issues in disassemble.c
    • +
    • pan/midgard: Fix cppcheck issues
    • +
    • pan/midgard: Remove cppwrap.cpp
    • +
    • pan/midgard: Remove mir_print_bundle
    • +
    • pan/midgard: Remove mir_rewrite_index_*_tag
    • +
    • panfrost: Mark (1 << 31) as unsigned
    • +
    • panfrost: Fix misc. issues flagged by cppcheck
    • +
    • panfrost: Remove panfrost_upload
    • +
    • pan/midgard: Add missing parans in SWIZZLE definition
    • +
    • pan/midgard: Fix component count handling for ldst
    • +
    • pan/midgard: Squeeze indices before scheduling
    • +
    • pan/midgard: Add flatten_mir helper
    • +
    • pan/midgard: Calculate dependency graph
    • +
    • pan/midgard: Initialize worklist
    • +
    • pan/midgard: Add mir_choose_instruction stub
    • +
    • pan/midgard: Add mir_update_worklist helper
    • +
    • pan/midgard: Add mir_choose_bundle helper
    • +
    • pan/midgard: Add mir_schedule_texture/ldst/alu helpers
    • +
    • pan/midgard: Remove csel constant unit force
    • +
    • pan/midgard: Add constant intersection filters
    • +
    • pan/midgard: Add predicate->exclude
    • +
    • pan/midgard: Implement predicate->unit
    • +
    • pan/midgard: Add helpers for scheduling conditionals
    • +
    • pan/midgard: Extend csel_swizzle to branches
    • +
    • pan/midgard: Implement load/store pairing
    • +
    • pan/midgard: Add mir_choose_alu helper
    • +
    • pan/midgard: Add distance metric to choose_instruction
    • +
    • pan/midgard: Use new scheduler
    • +
    • pan/midgard: Don't double check SCALAR units
    • +
    • pan/midgard: Extend choose_instruction for scalar units
    • +
    • pan/midgard: Schedule to smul/sadd
    • +
    • pan/midgard: Only one conditional per bundle allowed
    • +
    • pan/midgard: Allow 6 instructions per bundle
    • +
    • pan/midgard: Allow writeout to see into the future
    • +
    • pan/midgard: Tightly pack 32-bit constants
    • +
    • pan/midgard: Add mir_flip helper
    • +
    • pan/midgard: Add csel invert optimization
    • +
    • pan/midgard: Allow scheduling conditions with constants
    • +
    • pan/midgard: Remove mir_has_multiple_writes
    • +
    • pan/midgard: Add mir_calculate_temp_count helper
    • +
    • pan/midgard: Move RA's liveness analysis into midgard_liveness.c
    • +
    • pan/midgard: Don't try to OR live_in of successors
    • +
    • pan/midgard: Begin tracking liveness metadata
    • +
    • pan/midgard: Invalidate liveness for mir_is_live_after
    • +
    • pan/midgard: Calculate temp_count for liveness
    • +
    • pan/midgard: Replace mir_is_live_after with new pass
    • +
    • pan/midgard: Report read mask for branch arguments
    • +
    • pan/midgard: Allow non-contiguous masks in UBO lowering
    • +
    • pan/midgard: Don't try to propagate swizzles to branches
    • +
    • pan/midgard: Add perspective ops to mir_get_swizzle
    • +
    • pan/midgard: Fix mir_mask_of_read_components with dot products
    • +
    • panfrost: Disable frame throttling
    • +
    • pan/midgard: Use 16-bit liveness masks
    • +
    • pan/midgard: Allow COMPUTE jobs in panfrost_bo_access_for_stage
    • +
    • pan/midgard: Fix memory corruption in register spilling
    • +
    • pan/midgard: Do not repeatedly spill same value
    • +
    • pan/midgard: Debug mir_insert_instruction_after_scheduled
    • +
    • pan/midgard: Identify 64-bit atomic opcodes
    • +
    • pan/midgard/disasm: Fix printing 8-bit/16-bit masks
    • +
    • pan/midgard: Factor out mir_get_alu_src
    • +
    • pan/midgard: Tableize load/store ops
    • +
    • pan/midgard: Implement OP_IS_STORE with table
    • +
    • pan/midgard: Add helpers for manipulating byte masks
    • +
    • pan/midgard: Report byte masks for read components
    • +
    • pan/midgard: Simplify mir_bytemask_of_read_components
    • +
    • pan/midgard: Implement per-byte liveness tracking
    • +
    • pan/midgard: Handle nontrivial masks in texture RA
    • +
    • pan/midgard: Create dependency graph bytewise
    • +
    • pan/midgard: Implement SIMD-aware dead code elimination
    • +
    • panfrost/ci: Update expectations list
    • +
    • pan/midgard: Add mir_set_bytemask helper
    • +
    • pan/midgard: Expose more typesize manipulation routines
    • +
    • pan/midgard: Express allocated registers as offsets
    • +
    • pipe-loader: Add kmsro pipe_loader target
    • +
    • pipe-loader: Default to kmsro if probe fails
    • +
    • panfrost: Expose serialized NIR support
    • +
    • pan/midgard: Disable precise occlusion queries
    • +
    • panfrost: Cleanup _shader_upper -> shader
    • +
    • panfrost: Remove unused definitions in mali-job.h
    • +
    • pipe-loader: Build kmsro loader for with all kmsro targets
    • +
    • gallium/util: Support POLYGON in u_stream_outputs_for_vertices
    • +

      +

      Andreas Baierl (5):

      +
    • lima/ppir: Rename ppir_op_dummy to ppir_op_undef
    • +
    • lima/ppir: Add undef handling
    • +
    • lima/ppir: Add various varying fetch sources to disassembler
    • +
    • lima: Fix compiler warning in standalone compiler
    • +
    • lima: Fix crash when there are no vertex shader attributes
    • +

      +

      Andreas Gottschling (1):

      +
    • drisw: Fix shared memory leak on drawable resize
    • +

      +

      Andres Gomez (12):

      +
    • nir/algebraic: mark float optimizations returning one parameter as inexact
    • +
    • docs: Update to OpenGL 4.6 in the release notes
    • +
    • nir/opcodes: Clear variable names confusion
    • +
    • docs: Add the maximum implemented Vulkan API version in 19.1 rel notes
    • +
    • docs: Add the maximum implemented Vulkan API version in 19.2 rel notes
    • +
    • docs: Add the maximum implemented Vulkan API version in 19.3 rel notes
    • +
    • docs/features: Update status list of Vulkan extensions
    • +
    • docs/features: Update VK_KHR_display_swapchain status
    • +
    • i965/fs: add a comment about how the rounding mode in fmul is set
    • +
    • i965/fs: set rounding mode when emitting the flrp instruction
    • +
    • docs/relnotes: add support for GL_ARB_gl_spirv, GL_ARB_spirv_extensions and OpenGL 4.6 on i965 and iris
    • +
    • egl: Remove the 565 pbuffer-only EGL config under X11.
    • +

      +

      Andres Rodriguez (2):

      +
    • radv: add RADV_DEBUG=allentrypoints
    • +
    • radv: additional query fixes
    • +

      +

      Andrii Simiklit (1):

      +
    • glsl: disallow incompatible matrices multiplication
    • +

      +

      Anuj Phogat (5):

      +
    • intel/gen12: Add L3 configurations
    • +
    • intel: Add few Ice Lake brand strings
    • +
    • genxml/gen11+: Add COMMON_SLICE_CHICKEN4 register
    • +
    • intel/gen11+: Enable Hardware filtering of Semi-Pipelined State in WM
    • +
    • intel/isl/icl: Use halign 8 instead of 4 hw workaround
    • +

      +

      Arcady Goldmints-Orlov (1):

      +
    • anv: fix descriptor limits on gen8
    • +

      +

      Bas Nieuwenhuizen (63):

      +
    • radv: Use correct vgpr_comp_cnt for VS if both prim_id and instance_id are needed.
    • +
    • radv: Emit VGT_GS_ONCHIP_CNTL for tess on GFX10.
    • +
    • radv: Disable NGG for geometry shaders.
    • +
    • tu: Set up glsl types.
    • +
    • radv: Only break batch on framebuffer change with dfsm.
    • +
    • radv: Disable dfsm by default even on Raven.
    • +
    • radv: Add DFSM support.
    • +
    • glx: Remove redundant null check.
    • +
    • amd: Build aco only if radv is enabled
    • +
    • radv: Add workaround for hang in The Surge 2.
    • +
    • turnip: Add image->image blitting.
    • +
    • turnip: Always use UINT formats for copies.
    • +
    • turnip: Disallow NPoT formats.
    • +
    • turnip: Add todo for d24_s8 copies
    • +
    • radv: Fix condition for skipping the continue CS.
    • +
    • radv: Fix warning in 32-bit build.
    • +
    • meson: Always add LLVM coroutines module.
    • +
    • amd/llvm: Fix warning due to asserted-only variable.
    • +
    • radv: Implement & enable VK_EXT_texel_buffer_alignment.
    • +
    • radv: Cleanup buffer_from_fd.
    • +
    • radv: Handle device memory alloc failure with normal free.
    • +
    • radv: Split out layout code from image creation.
    • +
    • radv: Delay patching for imported images until layout time.
    • +
    • radv: Handle slightly different image dimensions.
    • +
    • radv: Unset vk_info in radv_image_create_layout.
    • +
    • radv: Add VK_ANDROID_external_memory_android_hardware_buffer.
    • +
    • radv/android: Add android hardware buffer field to device memory.
    • +
    • radv/android: Add android hardware buffer queries.
    • +
    • radv: Disallow sparse shared images.
    • +
    • radv: Derive android usage from create flags.
    • +
    • radv: Deal with Android external formats.
    • +
    • radv/android: Add android hardware buffer import/export.
    • +
    • radv: Allow Android image binding.
    • +
    • radv: Expose image handle compat types for Android handles.
    • +
    • radv: Check the size of the imported buffer.
    • +
    • radv: Enable VK_ANDROID_external_memory_android_hardware_buffer.
    • +
    • nir/dead_cf: Remove dead control flow after infinite loops.
    • +
    • radv: Fix single stage constant flush with merged shaders.
    • +
    • radv: Compute hashes in secure process for secure compilation.
    • +
    • radv: Add an early exit in the secure compile if we already have the cache entries.
    • +
    • radv: Clean up unused variable.
    • +
    • radv: Split out commandbuffer submission.
    • +
    • radv: Do sparse binding in queue submission.
    • +
    • radv: Improve fence signalling in QueueSubmit.
    • +
    • radv: Always enable syncobj when supported for all fences/semaphores.
    • +
    • radv: Split semaphore into two parts as enum+union.
    • +
    • radv: Add temporary datastructure for submissions.
    • +
    • radv: Add timelines with a VK_KHR_timeline_semaphore impl.
    • +
    • radv: Add wait-before-submit support for timelines.
    • +
    • radv: Enable VK_KHR_timeline_semaphore.
    • +
    • radv: Start signalling semaphores in WSI acquire.
    • +
    • radv: Allocate space for temp. semaphore parts.
    • +
    • radv: Fix timeout handling in syncobj wait.
    • +
    • radv: Remove _mesa_locale_init/fini calls.
    • +
    • turnip: Remove _mesa_locale_init/fini calls.
    • +
    • anv: Remove _mesa_locale_init/fini calls.
    • +
    • radv: Fix disk_cache_get size argument.
    • +
    • radv: Close all unnecessary fds in secure compile.
    • +
    • radv: Do not change scratch settings while shaders are active.
    • +
    • radv: Allocate cmdbuffer space for buffer marker write.
    • +
    • radv: Unify max_descriptor_set_size.
    • +
    • radv: Fix timeline semaphore refcounting.
    • +
    • radv: Fix RGBX Android<->Vulkan format correspondence.
    • +

      +

      Ben Crocker (1):

      +
    • llvmpipe: use ppc64le/ppc64 Large code model for JIT-compiled shaders
    • +

      +

      Boris Brezillon (73):

      +
    • panfrost: Free the instruction object in mir_remove_instruction()
    • +
    • panfrost: Free all block/instruction objects before leaving midgard_compile_shader_nir()
    • +
    • panfrost: Make sure bundle.instructions[] contains valid instructions
    • +
    • Revert "panfrost: Free all block/instruction objects before leaving midgard_compile_shader_nir()"
    • +
    • panfrost: Use ralloc() to allocate instructions to avoid leaking those objs
    • +
    • panfrost: Reset the damage area on imported resources
    • +
    • panfrost: Add transient BOs to job batches
    • +
    • panfrost: s/job/batch/
    • +
    • panfrost: Pass a batch to panfrost_drm_submit_vs_fs_batch()
    • +
    • panfrost: Stop passing a ctx to functions being passed a batch
    • +
    • panfrost: Make transient allocation rely on the BO cache
    • +
    • panfrost: Convert ctx->{scratchpad, tiler_heap, tiler_dummy} to plain BOs
    • +
    • panfrost: Get rid of unused panfrost_context fields
    • +
    • panfrost: Get rid of the now unused SLAB allocator
    • +
    • panfrost: Rename pan_bo_cache.c into pan_bo.c
    • +
    • panfrost: Fix a list_assert() in schedule_block()
    • +
    • panfrost: Rework midgard_pair_load_store() to kill the nested foreach loop
    • +
    • panfrost: Use a pipe_framebuffer_state as the batch key
    • +
    • panfrost: Get rid of the unused 'flush jobs accessing res' infra
    • +
    • panfrost: Allow testing if a specific batch is targeting a scanout FB
    • +
    • panfrost: Pass a batch to panfrost_{allocate,upload}_transient()
    • +
    • panfrost: Pass a batch to functions emitting FB descs
    • +
    • panfrost: Use ctx->wallpaper_batch in panfrost_blit_wallpaper()
    • +
    • panfrost: Pass a batch to panfrost_set_value_job()
    • +
    • panfrost: Prepare things to avoid flushes on FB switch
    • +
    • panfrost: Delay payloads[].offset_start initialization
    • +
    • panfrost: Move the fence creation in panfrost_flush()
    • +
    • panfrost: Move the batch submission logic to panfrost_batch_submit()
    • +
    • panfrost: Stop exposing internal panfrost_*_batch() functions
    • +
    • panfrost: Use the correct type for the bo_handle array
    • +
    • panfrost: Add missing panfrost_batch_add_bo() calls
    • +
    • panfrost: Add polygon_list to the batch BO set at allocation time
    • +
    • panfrost: Kill a useless memset(0) in panfrost_create_context()
    • +
    • panfrost: Stop passing has_draws to panfrost_drm_submit_vs_fs_batch()
    • +
    • panfrost: Get rid of pan_drm.c
    • +
    • panfrost: Move panfrost_bo_{reference,unreference}() to pan_bo.c
    • +
    • panfrost: s/PAN_ALLOCATE_/PAN_BO_/
    • +
    • panfrost: Move the BO API to its own header
    • +
    • panfrost: Stop exposing panfrost_bo_cache_{fetch,put}()
    • +
    • panfrost: Don't check if BO is mmaped before calling panfrost_bo_mmap()
    • +
    • panfrost: Stop passing screen around for BO operations
    • +
    • panfrost: Stop using panfrost_bo_release() outside of pan_bo.c
    • +
    • panfrost: Add panfrost_bo_{alloc,free}()
    • +
    • panfrost: Don't return imported/exported BOs to the cache
    • +
    • panfrost: Add the panfrost_batch_create_bo() helper
    • +
    • panfrost: Add FBO BOs to batch->bos earlier
    • +
    • panfrost: Allocate tiler and scratchpad BOs per-batch
    • +
    • Revert "panfrost: Rework midgard_pair_load_store() to kill the nested foreach loop"
    • +
    • panfrost: Fix indexed draws
    • +
    • dEQP-GLES2.functional.buffer.write.use.index_array.* are passing now.
    • +
    • panfrost: Add the shader BO to the batch in patch_shader_state()
    • +
    • panfrost: Extend the panfrost_batch_add_bo() API to pass access flags
    • +
    • panfrost: Make panfrost_batch->bos a hash table
    • +
    • panfrost: Add a batch fence
    • +
    • panfrost: Use the per-batch fences to wait on the last submitted batch
    • +
    • panfrost: Add a panfrost_freeze_batch() helper
    • +
    • panfrost: Start tracking inter-batch dependencies
    • +
    • panfrost: Prepare panfrost_fence for batch pipelining
    • +
    • panfrost: Add a panfrost_flush_all_batches() helper
    • +
    • panfrost: Add a panfrost_flush_batches_accessing_bo() helper
    • +
    • panfrost: Add flags to reflect the BO imported/exported state
    • +
    • panfrost: Make sure the BO is 'ready' when picked from the cache
    • +
    • panfrost: Do fine-grained flushing when preparing BO for CPU accesses
    • +
    • panfrost: Kill the explicit serialization in panfrost_batch_submit()
    • +
    • panfrost: Get rid of the flush in panfrost_set_framebuffer_state()
    • +
    • Revert "st/dri2: Implement DRI2bufferDamageExtension"
    • +
    • Revert "Revert "st/dri2: Implement DRI2bufferDamageExtension""
    • +
    • panfrost: Make sure a clear does not re-use a pre-existing batch
    • +
    • panfrost: Draw the wallpaper when only depth/stencil bufs are cleared
    • +
    • panfrost: Fix support for packed 24-bit formats
    • +
    • panfrost: Fix the DISCARD_WHOLE_RES case in transfer_map()
    • +
    • gallium: Fix the ->set_damage_region() implementation
    • +
    • panfrost: Make sure we reset the damage region of RTs at flush time
    • +

      +

      Brian Paul (3):

      +
    • st/nir: fix illegal designated initializer in st_glsl_to_nir.cpp
    • +
    • REVIEWERS: add VMware reviewers
    • +
    • Call shmget() with permission 0600 instead of 0777
    • +

      +

      Caio Marcelo de Oliveira Filho (66):

      +
    • intel/compiler: Silence maybe-uninitialized warning in GCC 9.1.1
    • +
    • anv: Drop unused local variable
    • +
    • compiler/glsl: Fix warning about unused function
    • +
    • intel/decoders: Avoid uninitialized variable warnings
    • +
    • iris: Guard GEN9-only function in Iris state to avoid warning
    • +
    • tgsi: Remove unused local
    • +
    • i965: Silence brw_blorp uninitialized warning
    • +
    • nir/lower_explicit_io: Handle 1 bit loads and stores
    • +
    • glsl/nir: Avoid overflow when setting max_uniform_location
    • +
    • mesa/st: Do not rely on name to identify special uniforms
    • +
    • compiler: Add glsl_contains_opaque() helper
    • +
    • mesa: Pack gl_program_parameter struct
    • +
    • glsl/nir: Fill in the Parameters in NIR linker
    • +
    • mesa: Fill Parameter storage indices even when not using SPIR-V
    • +
    • mesa/program: Associate uniform storage without using names
    • +
    • mesa/st: Lookup parameters without using names
    • +
    • mesa/st: Extract preprocessing NIR steps
    • +
    • mesa/st: Add support for SPIR-V shaders
    • +
    • mesa/st: Don't expect prog->nir to already exist
    • +
    • mesa/spirv: Set a few more extensions
    • +
    • gallium: Add ARB_gl_spirv support
    • +
    • glsl/nir: Add and use a gl_nir_link() function
    • +
    • iris: Enable ARB_gl_spirv and ARB_spirv_extensions
    • +
    • mesa/st: Fallback to name lookup when the variable have no Parameter
    • +
    • spirv: Update JSON and headers to 1.5
    • +
    • spirv: Handle ShaderLayer and ShaderViewportIndex capabilities
    • +
    • spirv: Add missing break for capability handling
    • +
    • intel/fs: Add Fall-through comment
    • +
    • mesa: Extension boilerplate for EXT_demote_to_helper_invocation
    • +
    • glsl: Add ir_demote
    • +
    • glsl: Parse `demote` statement
    • +
    • glsl: Add helperInvocationEXT() builtin
    • +
    • gallium: Add PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION
    • +
    • iris: Enable EXT_demote_to_helper_invocation
    • +
    • i965: Enable EXT_demote_to_helper_invocation
    • +
    • docs/relnotes: Add EXT_demote_to_helper_invocation support on iris, i965
    • +
    • docs: Fix GL_EXT_demote_to_helper_invocation name
    • +
    • vulkan: Update the XML and headers to 1.1.124
    • +
    • spirv: Implement SPV_KHR_shader_clock
    • +
    • anv: Implement VK_KHR_shader_clock
    • +
    • anv: Enable VK_EXT_shader_subgroup_{ballot,vote}
    • +
    • docs: Update recently enabled VK extensions on Intel
    • +
    • intel: Add INTEL_DEBUG=nofc for disabling fast clears
    • +
    • anv: Disable fast clears when running with INTEL_DEBUG=nofc
    • +
    • iris: Disable fast clears when running with INTEL_DEBUG=nofc
    • +
    • i965: Disable fast clears when running with INTEL_DEBUG=nofc
    • +
    • vulkan: Update the XML and headers to 1.1.125
    • +
    • anv: Advertise VK_KHR_spirv_1_4
    • +
    • intel/fs/gen12: Add tests for scoreboard pass
    • +
    • nir: Add scoped_memory_barrier intrinsic
    • +
    • nir/tests: Add copy propagation tests with scoped_memory_barrier
    • +
    • intel/fs: Implement scoped_memory_barrier
    • +
    • spirv: Parse memory semantics for atomic operations
    • +
    • spirv: Emit memory barriers for atomic operations
    • +
    • spirv: Add SpvMemoryModelVulkan and related capabilities
    • +
    • spirv: Add option to emit scoped memory barriers
    • +
    • spirv: Handle MakeTexelAvailable/Visible
    • +
    • spirv: Handle MakePointerAvailable/Visible
    • +
    • anv: Implement VK_KHR_vulkan_memory_model
    • +
    • spirv: Add imageoperands_to_string helper
    • +
    • spirv: Check that only one offset is defined as Image Operand
    • +
    • spirv: Add helper to find args of Image Operands
    • +
    • anv: Fix output of INTEL_DEBUG=bat for chained batches
    • +
    • spirv: Don't fail if multiple ordering semantics bits are set
    • +
    • spirv: Don't leak GS initialization to other stages
    • +
    • anv: Initialize depth_bounds_test_enable when not explicitly set
    • +

      +

      Chris Wilson (2):

      +
    • iris: Allow packed RGB pbo uploads
    • +
    • st/mesa: Map MESA_FORMAT_RGB_UNORM8 <-> PIPE_FORMAT_R8G8B8_UNORM
    • +

      +

      Christian Gmeiner (13):

      +
    • gallium: util_set_vertex_buffers_mask(..): make use of u_bit_consecutive(..)
    • +
    • etnaviv: a bit of micro-optimization
    • +
    • Revert "gallium: remove PIPE_CAP_TEXTURE_SHADOW_MAP"
    • +
    • etnaviv: disable ARB_shadow
    • +
    • etnaviv: etna_resource_copy_region(..): drop assert
    • +
    • etnaviv: support ARB_framebuffer_object
    • +
    • etnaviv: nir: start to make use of compile_error(..)
    • +
    • etnaviv: output the same shader-db format as freedreno, v3d and intel
    • +
    • etnaviv: fix compile warnings
    • +
    • etnaviv: fix code style
    • +
    • etnaviv: store updated usage in pipe_transfer object
    • +
    • etnaviv: keep track of buffer valid ranges for PIPE_BUFFER
    • +
    • etnaviv: remove dead code
    • +

      +

      Clément Guérin (1):

      +
    • radeonsi: enable zerovram for Rocket League
    • +

      +

      Connor Abbott (40):

      +
    • st/nir: Fix num_inputs for VS inputs
    • +
    • radeonsi/nir: Don't recompute num_inputs and num_outputs
    • +
    • ac/nir: Handle const array offsets in get_deref_offset()
    • +
    • ac/nir: Assert GS input index is constant
    • +
    • radeonsi/nir: Don't add const offset to indirect
    • +
    • radeonsi/nir: Add const_index when loading GS inputs
    • +
    • radeonsi/nir: Rewrite store intrinsic gathering
    • +
    • radeonsi/nir: Rewrite output scanning
    • +
    • ac/nir: add a workaround for viewing a slice of 3D as a 2D image
    • +
    • ac/nir: Remove gfx9_stride_size_workaround_for_atomic
    • +
    • ac/nir: Rewrite gather4 integer workaround based on radeonsi
    • +
    • ac/nir: Fix gather4 integer wa with unnormalized coordinates
    • +
    • nir: Fix num_ssbos when lowering atomic counters
    • +
    • ttn: Fill out more info fields
    • +
    • radeonsi/nir: Remove uniform variable scanning
    • +
    • radv/radeonsi: Don't count read-only data when reporting code size
    • +
    • ac/nir: Support load_constant intrinsics
    • +
    • ac/nir: Enable nir_opt_large_constants
    • +
    • st/nir: Call nir_remove_unused_variables() in the opt loop
    • +
    • st/nir: Don't lower indirects when linking
    • +
    • gallium: Plumb through a way to disable GLSL const lowering
    • +
    • radeonsi/nir: Don't lower constant arrays to uniforms
    • +
    • radv: Call nir_propagate_invariant()
    • +
    • lima/gpir: Do all lowerings before rsched
    • +
    • lima/gpir: Ignore unscheduled successors in can_use_complex()
    • +
    • lima/gpir: Fix schedule_first insertion logic
    • +
    • lima/gpir: Fix fake dep handling for schedule_first nodes
    • +
    • lima/gpir: Disallow moves for schedule_first nodes
    • +
    • nir/opt_if: Fix undef handling in opt_split_alu_of_phi()
    • +
    • lima/gpir: Fix compiler warning
    • +
    • lima/gpir: Only try to place actual children
    • +
    • lima/gpir: Support branch instructions
    • +
    • lima/gpir: Use registers for values live in multiple blocks
    • +
    • lima/gpir: Fix postlog2 fixup handling
    • +
    • lima/gpir: Don't emit movs when translating from NIR
    • +
    • lima/gpir: Fix 64-bit shift in scheduler spilling
    • +
    • nir/opt_large_constants: Handle store writemasks
    • +
    • nir: Fix overlapping vars in nir_assign_io_var_locations()
    • +
    • nir/sink: Rewrite loop handling logic
    • +
    • nir/sink: Don't sink load_ubo to outside of its defining loop
    • +

      +

      Daniel Kolesa (1):

      +
    • util: add auxv based PowerPC AltiVec/VSX detection
    • +

      +

      Daniel Schürmann (44):

      +
    • nir/algebraic: some subtraction optimizations
    • +
    • aco: Initial commit of independent AMD compiler
    • +
    • radv/aco: Setup alternate path in RADV to support the experimental ACO compiler
    • +
    • radv: enable clustered reductions
    • +
    • radv/aco: enable VK_EXT_shader_demote_to_helper_invocation
    • +
    • radv: remove dead shared variables
    • +
    • aco: only emit waitcnt on loop continues if we there was some load or export
    • +
    • freedreno: Enable the nir_opt_algebraic_late() pass.
    • +
    • nir: recombine nir_op_*sub when lower_sub = false
    • +
    • nir: Remove unnecessary subtraction optimizations
    • +
    • radv/aco: Don't lower subtractions
    • +
    • aco: call nir_opt_algebraic_late() exhaustively
    • +
    • nouveau: set lower_sub = true
    • +
    • aco: re-use existing phi instruction when lowering boolean phis
    • +
    • aco: don't reorder instructions in order to lower boolean phis
    • +
    • aco: don't combine minmax3 if there is a neg or abs modifier in between
    • +
    • aco: ensure that uniform booleans are computed in WQM if their uses happen in WQM
    • +
    • aco: refactor value numbering
    • +
    • aco: restrict scheduling depending on max_waves
    • +
    • aco: only skip RAR dependencies if the variable is killed somewhere
    • +
    • aco: add can_reorder flags to load_ubo and load_constant
    • +
    • aco: don't schedule instructions through depending VMEM instructions
    • +
    • aco: Lower to CSSA
    • +
    • aco: improve live variable analysis
    • +
    • aco: remove potential critical edge on loops.
    • +
    • aco: fix live-range splits of phis
    • +
    • aco: fix transitive affinities of spilled variables
    • +
    • aco: don't insert the exec mask into set of live-out variables when spilling
    • +
    • aco: consider loop_exit blocks like merge blocks, even if they have only one predecessor
    • +
    • aco: don't add interferences between spilled phi operands
    • +
    • aco: simplify calculation of target register pressure when spilling
    • +
    • aco: ensure that spilled VGPR reloads are done after p_logical_start
    • +
    • aco: omit linear VGPRs as spill variables
    • +
    • aco: always set scratch_offset in startpgm
    • +
    • aco: implement VGPR spilling
    • +
    • docs/relnotes/new_features.txt: Add note about ACO
    • +
    • aco: fix immediate offset for spills if scratch is used
    • +
    • aco: only use single-dword loads/stores for spilling
    • +
    • aco: fix accidential reordering of instructions when scheduling
    • +
    • aco: workaround Tonga/Iceland hardware bug
    • +
    • aco: fix invalid access on Pseudo_instructions
    • +
    • aco: preserve kill flag on moved operands during RA
    • +
    • aco: don't split live-ranges of linear VGPRs
    • +
    • aco: fix a couple of value numbering issues
    • +

      +

      Daniel Stone (1):

      +
    • panfrost: Respect offset for imported resources
    • +

      +

      Danilo Spinella (1):

      +
    • egl: Include stddef.h in generated source
    • +

      +

      Danylo Piliaiev (10):

      +
    • nir/loop_unroll: Update the comments for loop_prepare_for_unroll
    • +
    • nir/loop_unroll: Prepare loop for unrolling in wrapper_unroll
    • +
    • nir/loop_analyze: Treat do{}while(false) loops as 0 iterations
    • +
    • glsl: Fix unroll of do{} while(false) like loops
    • +
    • tgsi_to_nir: Translate TGSI_INTERPOLATE_COLOR as INTERP_MODE_NONE
    • +
    • iris: Fix fence leak in iris_fence_flush
    • +
    • st/nine: Ignore D3DSIO_RET if it is the last instruction in a shader
    • +
    • intel/compiler: Fix C++ one definition rule violations
    • +
    • glsl: Initialize all fields of ir_variable in constructor
    • +
    • i965: Unify CC_STATE and BLEND_STATE atoms on Haswell as a workaround
    • +

      +

      Dave Airlie (75):

      +
    • virgl: drop unused format field
    • +
    • virgl: fix format conversion for recent gallium changes.
    • +
    • gallivm: fix atomic compare-and-swap
    • +
    • llvmpipe: refactor jit type creation
    • +
    • gallivm: make lp_build_float_to_r11g11b10 take a const src
    • +
    • gallivm: handle helper invocation (v2)
    • +
    • gallivm: move first/last level jit texture members.
    • +
    • llvmpipe: handle early test property.
    • +
    • gallivm: add a basic image limit
    • +
    • llvmpipe: move the fragment shader variant key to dynamic length.
    • +
    • draw: add jit image type for vs/gs images.
    • +
    • llvmpipe: introduce image jit type to fragment shader jit.
    • +
    • gallivm/tgsi: add image interface to tgsi builder
    • +
    • gallivm: add image load/store/atomic support
    • +
    • draw: add vs/gs images support
    • +
    • llvmpipe: add fragment shader image support
    • +
    • llvmpipe: bind vertex/geometry shader images
    • +
    • gallivm: add support for fences api on older llvm
    • +
    • gallivm: add memory barrier support
    • +
    • llvmpipe: flush on api memorybarrier.
    • +
    • llvmpipe: enable ARB_shader_image_load_store
    • +
    • docs: add shader image extensions for llvmpipe
    • +
    • gallivm: fix appveyor build after images changes
    • +
    • gallivm: disable accurate cube corner for integer textures.
    • +
    • llvmpipe: enable fb no attach
    • +
    • gallivm/flow: add counter reset for loops
    • +
    • gallivm: add coroutine support files to gallivm.
    • +
    • gallivm: add coroutine pass manager support
    • +
    • llvmpipe: reogranise jit pointer ordering
    • +
    • gallivm: add new compute related intrinsics
    • +
    • gallivm: add support for compute shared memory
    • +
    • llvmpipe: add compute threadpool + mutex
    • +
    • gallivm: add barrier support for compute shaders.
    • +
    • llvmpipe: introduce compute shader context
    • +
    • llvmpipe: add initial compute state structs
    • +
    • gallivm: add compute jit interface.
    • +
    • llvmpipe: add compute debug option
    • +
    • llvmpipe: add initial shader create/bind/destroy variants framework.
    • +
    • llvmpipe: introduce new state dirty tracking for compute.
    • +
    • llvmpipe: introduce variant building infrastrucutre.
    • +
    • llvmpipe: add compute shader generation.
    • +
    • llvmpipe: add grid launch
    • +
    • llvmpipe: add compute pipeline statistics support.
    • +
    • llvmpipe: add support for compute constant buffers.
    • +
    • llvmpipe: add compute sampler + sampler view support.
    • +
    • llvmpipe: add ssbo support to compute shaders
    • +
    • llvmpipe: add compute shader images support
    • +
    • llvmpipe: add compute shader parameter fetching support
    • +
    • llvmpipe: add local memory allocation path
    • +
    • llvmpipe: enable compute shaders if LLVM has coroutines
    • +
    • docs: add llvmpipe features for fb_no_attach and compute shaders
    • +
    • st/mesa: Prefer R8 for bitmap textures
    • +
    • st/mesa: fix R8 bitmap texture for TGSI paths.
    • +
    • llvmpipe: make texture buffer offset alignment == 16
    • +
    • llvmpipe/draw: fix image sizes for vertex/geometry shaders.
    • +
    • llvmpipe/draw: handle UBOs that are < 16 bytes.
    • +
    • gallivm/sample: add gather component selection to the key.
    • +
    • gallium: add a a new cap for changing the TGSI TG4 instruction encoding
    • +
    • st/glsl: add support for alternate TG4 encoding.
    • +
    • llvmpipe: add support for tg4 component selection.
    • +
    • gallivm: fix coroutines on aarch64 with llvm 8
    • +
    • gallivm/draw/swr: make the gs_iface not depend on tgsi.
    • +
    • nir: add a pass to lower flat shading.
    • +
    • gallium: add flatshade lowering capability
    • +
    • st/mesa: handling lower flatshading for NIR drivers.
    • +
    • llvmpipe: handle compute shader launch with 0 threads
    • +
    • zink: ask for flatshade lowering
    • +
    • zink: add dri loader
    • +
    • zink: query support (v2)
    • +
    • zink/spirv: store all values as uint.
    • +
    • zink: add support for compressed formats
    • +
    • zink: add sample mask support
    • +
    • zink: add samples to rasterizer
    • +
    • zink: attempt to get multisample resource creation right
    • +
    • llvmpipe/ppc: fix if/ifdef confusion in backport.
    • +

      +

      Dave Stevenson (1):

      +
    • broadcom/v3d: Allow importing linear BOs with arbitrary offset/stride.
    • +

      +

      Duncan Hopkins (7):

      +
    • zink: clamped limits to INT_MAX when stored as uint32_t.
    • +
    • zink: fix line-width calculation
    • +
    • zink: respect ubo buffer alignment requirement
    • +
    • zink: limited uniform buffer size so the limits is not exceeded.
    • +
    • zink: pass line width from rast_state to gfx_pipeline_state.
    • +
    • zink: Use optimal layout instead of general. Reduces valid layer warnings. Fixes RADV image noise.
    • +
    • zink: make sure src image is transfer-src-optimal
    • +

      +

      Dylan Baker (120):

      +
    • docs: Mark 19.2.0-rc2 as done and push back rc3 and rc4/final
    • +
    • glsl/tests: Handle windows \r\n new lines
    • +
    • meson: don't try to generate i18n translations on windows
    • +
    • meson: Make shared-glapi a combo
    • +
    • meson: don't build glapi_static_check_table on windows
    • +
    • add a git ignore for subprojects
    • +
    • meson: add a zlib subproject
    • +
    • meson: add a expat subproject
    • +
    • glapi: export glapi_destroy_multithread when building shared-glapi on windows
    • +
    • meson: fix dl detection on non cygwin windows
    • +
    • meson: build getopt when using msvc
    • +
    • meson: Add a platform for windows
    • +
    • meson: don't build glx or dri by default on windows
    • +
    • meson: don't allow glvnd on windows
    • +
    • meson: don't generate file into subdirs
    • +
    • Docs: mark that 19.2.0-rc3 has been released
    • +
    • scons: Make scons and meson agree about path to glapi generated headers
    • +
    • docs: Add release notes for 19.2.0
    • +
    • docs: add SHA256 sum for 19.2.0
    • +
    • docs: update calendar, add news item, and link release notes for 19.2.0
    • +
    • release: Push 19.3 back two weeks
    • +
    • bin/get-pick-list: use --oneline=pretty instead of --oneline
    • +
    • meson: fix logic for generating .pc files with old glvnd
    • +
    • meson: Try finding libxvmcw via pkg-config before using find_library
    • +
    • meson: Link xvmc with libxv
    • +
    • meson: gallium media state trackers require libdrm with x11
    • +
    • docs: update install docs for meson
    • +
    • docs: use https for mesonbuild.com
    • +
    • docs: remove stray newline
    • +
    • meson: remove -DGALLIUM_SOFTPIPE from st/osmesa
    • +
    • docs: Add use of Closes: tag for closing gitlab issues
    • +
    • docs: add a new_features.text file and remove 19.3.0 release notes
    • +
    • scripts: Add a gen_release_notes.py script
    • +
    • release: Add an update_release_calendar.py script
    • +
    • bin: delete unused releasing scripts
    • +
    • meson: Only error building gallium video without libdrm when the platform is drm
    • +
    • docs: Add relnotes for 19.2.1
    • +
    • docs: Add SHA256 sum for 19.2.1
    • +
    • docs: update calendar, add news item, and link release notes for 19.2.1
    • +
    • util: use _WIN32 instead of WIN32
    • +
    • meson: add windows compiler checks and libraries
    • +
    • meson: Add windows defines to glapi
    • +
    • meson: Add necessary defines for mesa_gallium on windows
    • +
    • meson: build gallium gdi winsys
    • +
    • meson: build wgl state tracker
    • +
    • meson: build libgl-gdi target
    • +
    • meson: build graw-gdi target
    • +
    • meson: fix gallium-osmesa to build for windows
    • +
    • meson: Don't check for posix_memalign on windows
    • +
    • util/xmlconfig: include strndup.h for windows
    • +
    • meson: fix pipe-loader compilation for windows
    • +
    • meson: don't look for rt on windows
    • +
    • meson: Add support for using win_flex and win_bison on windows
    • +
    • meson: force inclusion of inttypes.h for glcpp with msvc
    • +
    • meson: disable sse4.1 optimizations with msvc
    • +
    • meson: add switches for SWR with MSVC
    • +
    • meson: don't define USE_ELF_TLS for windows
    • +
    • meson: Add idep_getopt for tests
    • +
    • meson: Add msvc compat args to util/tests
    • +
    • meson: Set visibility and compat args for graw
    • +
    • meson: don't build gallium trivial tests on windows
    • +
    • meson: disable graw tests on mingw
    • +
    • meson: don't build or run mesa-sha1 test on windows
    • +
    • meson: maintain names of shared API libraries
    • +
    • meson: add msvc compat args to swr
    • +
    • meson: don't error on formaters with mingw
    • +
    • meson: only build timspec test if timespec is available
    • +
    • meson: glcpp tests are expected to fail on windows
    • +
    • meson/util: Don't run string_buffer tests on mingw
    • +
    • glsl/tests: Handle no-exec errors
    • +
    • docs: update meson docs for windows
    • +
    • appveyor: Add support for meson as well as scons on windows
    • +
    • gitlab-ci: Add a mingw x86_64 job
    • +
    • meson: Don't use expat on windows
    • +
    • gitlab-ci: Add a pkg-config for mingw
    • +
    • Revert "gitlab-ci: Disable meson-mingw32-x86_64 job again for now"
    • +
    • gitlab-ci: Set the meson wrapmode to disabled
    • +
    • appveyor: Cache meson's wrap downloads
    • +
    • meson/llvmpipe: Add dep_llvm to driver_swrast
    • +
    • meson: Add support for wrapping llvm
    • +
    • meson: Use cmake to find LLVM when building for windows
    • +
    • docs: update meson docs for windows
    • +
    • appveyor: Add support for building llvmpipe with meson
    • +
    • appveyor: Move appveyor script into .appveyor directory
    • +
    • docs: Add new feature for compiling for windows with meson
    • +
    • meson: Require meson >= 0.49.1 when using icc or icl
    • +
    • scons: Use print_function ins SConstruct
    • +
    • scons: Print a deprecation warning about using scons on not windows
    • +
    • scons: Also print a deprecation warning on windows
    • +
    • docs: Add release not about scons deprecation
    • +
    • docs: Add release notes for 19.2.2
    • +
    • docs: Add sha256 sum for 19.2.2
    • +
    • docs: update calendar, add news item and link release notes for 19.2.2
    • +
    • bin/gen_release_notes.py: fix conditional of bugfix
    • +
    • bin/gen_release_notes.py: strip '#' from gitlab bugs
    • +
    • bin/gen_release_notes.py: Return "None" if there are no new features
    • +
    • bin/post_version.py: Pass version as an argument
    • +
    • bin/post_version.py: white space fixes
    • +
    • bin/post_release.py: Add .html to hrefs
    • +
    • bin/gen_release_notes.py: html escape all external data
    • +
    • bin/gen_release_notes.py: Add a warning if new features are introduced in a point release
    • +
    • docs: update releasing process to use new scripts and gitlab
    • +
    • nir: Fix invalid code for MSVC
    • +
    • gitlab-ci: refactor out some common stuff for Windows and Linux
    • +
    • gitlab-ci: Add a job for meson on windows
    • +
    • VERSION: bump to rc1
    • +
    • nir: correct use of identity check in python
    • +
    • meson: Add dep_glvnd to egl deps when building with glvnd
    • +
    • Bump VERSION to 19.3.0-rc2
    • +
    • cherry-ignore: Update for 19.3-rc3 cycle
    • +
    • Bump version for -rc3
    • +
    • cherry-ignore: update for 19.3.0-rc4 cycle
    • +
    • VERSION: bump for 19.3.0-rc4
    • +
    • VERSION: Bump version for -rc5
    • +
    • VERSION: bump version for 19.3-rc6
    • +
    • cherry-ignore: update for 19.3-rc7
    • +
    • meson/broadcom: libbroadcom_cle needs expat headers
    • +
    • meson/broadcom: libbroadcom_cle also needs zlib
    • +
    • Revert "egl: avoid local modifications for eglext.h Khronos standard header file"
    • +
    • Revert "egl: move #include of local headers out of Khronos headers"
    • +

      +

      Eduardo Lima Mitev (4):

      +
    • nir: Add new texop nir_texop_tex_prefetch
    • +
    • freedreno/ir3: Add a NIR pass to select tex instructions eligible for pre-fetch
    • +
    • nir: Add a new ALU nir_op_imad24_ir3
    • +
    • freedreno/ir3: Handle newly added opcode nir_op_imad24_ir3
    • +

      +

      Emil Velikov (3):

      +
    • mesa: bump version to 19.3.0-devel
    • +
    • docs: add 19.3.0-devel release notes template
    • +
    • docs: update calendar for 19.2.x
    • +

      +

      Eric Anholt (57):

      +
    • gallium: Add a block depth field to the u_formats table.
    • +
    • gallium: Add block depth to the format utils.
    • +
    • gallium: Add the ASTC 3D formats.
    • +
    • gallium: Fix mesa format name in unit test failure path.
    • +
    • gallium: Skip generating the pack/unpack union if we don't use it.
    • +
    • gallium: Drop the useless union wrapper on pack/unpack.
    • +
    • gallium: Drop a bit of dead code from the pack/unpack python.
    • +
    • gallium: Fix big-endian addressing of non-bitmask array formats.
    • +
    • gallium: Don't emit identical endian-dependent pack/unpack code.
    • +
    • freedreno/a6xx: Fix non-mipmap filtering selection.
    • +
    • freedreno: Fix the type of single-component scaled vertex attrs.
    • +
    • gallium/osmesa: Introduce a test.
    • +
    • gallium/osmesa: Fix a race in creating the stmgr.
    • +
    • gallium/osmesa: Move 565 format selection checks where the rest are.
    • +
    • uapi: Update drm_fourcc.h
    • +
    • dri: Use DRM_FORMAT_* instead of defining our own copy.
    • +
    • gitlab-ci: Disable dEQP's watchdog timer.
    • +
    • gitlab-ci: Log the driver version that got tested.
    • +
    • freedreno: Introduce gitlab-based CI.
    • +
    • gitlab-ci/a630: Disable flappy layout_binding.ssbo.fragment_binding_array
    • +
    • egl/android: Fix build since the DRI fourcc removal.
    • +
    • gitlab-ci/a630: Drop remaining dEQP-GLES3.functional.draw.random.* xfails.
    • +
    • gitlab-ci/a630: Drop the MSAA expected failure.
    • +
    • gitlab-ci: Make the test job fail when bugs are unexpectedly fixed.
    • +
    • freedreno: Fix invalid read when a block has no instructions.
    • +
    • freedreno/a3xx: Mostly fix min-vs-mag filtering decisions on non-mipmap tex.
    • +
    • shader_enums: Move MAX_DRAW_BUFFERS to this file.
    • +
    • turnip: Add a .editorconfig and .dir-locals.el
    • +
    • turnip: Silence compiler warning about uninit pipeline.
    • +
    • turnip: Fix failure behavior of vkCreateGraphicsPipelines.
    • +
    • vc4: Enable the nir_opt_algebraic_late() pass.
    • +
    • v3d: Enable the late algebraic optimizations to get real subs.
    • +
    • nir: Make nir_search's dumping go to stderr.
    • +
    • nir: Skip emitting no-op movs from the builder.
    • +
    • nir: Keep the range analysis HT around intra-pass until we make a change.
    • +
    • nir: Factor out most of the algebraic passes C code to .c/.h.
    • +
    • nir: Fix some wonky whitespace in nir_search.h.
    • +
    • turnip: Drop unused tu_pack_clear_value() return.
    • +
    • turnip: Fill in clear color packing for r10g11b11 and rgb9e5.
    • +
    • turnip: Tell spirv_to_nir that we want fragcoord as a sysval.
    • +
    • turnip: Set up the correct tiling mode for small attachments.
    • +
    • turnip: Emit clears of gmem using linear.
    • +
    • freedreno/ci: Ban texsubimage2d_pbo.r16ui_2d, due to two flakes reported.
    • +
    • mesa: Add debug info to _mesa_format_from_format_and_type() error path.
    • +
    • mesa: Fix depth/stencil ordering in _mesa_format_from_format_and_type().
    • +
    • mesa: Add format/type matching for DEPTH/UINT_24_8.
    • +
    • mesa: Add support for array formats of depth and stencil.
    • +
    • mesa: Refactor the entirety of _mesa_format_matches_format_and_type().
    • +
    • v3d: Add Compute Shader support
    • +
    • r100/r200: factor out txformat/txfilter setup from the TFP path.
    • +
    • radeon: Fill in the TXOFFSET field containing the tile bits in our relocs.
    • +
    • radeon: Drop the unused first arg of OUT_BATCH_RELOC.
    • +
    • mesa: Replace the LA16_UNORM packed formats with one array format.
    • +
    • mesa: Replace MESA_FORMAT_L8A8/A8L8 UNORM/SNORM/SRGB with an array format.
    • +
    • gallium: Drop the unused PIPE_FORMAT_A*L* formats.
    • +
    • mesa: Redefine the RG formats as array formats.
    • +
    • ci: Disable lima until its farm can get fixed.
    • +

      +

      Eric Engestrom (104):

      +
    • scons: define MESA_LLVM_VERSION_STRING like the other build systems do
    • +
    • llvmpipe: use LLVM version string instead of re-computing it
    • +
    • swr: use LLVM version string instead of re-computing it
    • +
    • scons: add support for MAJOR_IN_{MKDEV,SYSMACROS}
    • +
    • egl: warn user if they set an invalid EGL_PLATFORM
    • +
    • ttn: fix 64-bit shift on 32-bit `1`
    • +
    • egl: fix deadlock in malloc error path
    • +
    • util/os_file: fix double-close()
    • +
    • anv: fix format string in error message
    • +
    • freedreno/drm-shim: fix mem leak
    • +
    • nir: fix memleak in error path
    • +
    • gallivm: replace `0x` version print with actual version string
    • +
    • meson/scons/android: add LLVM_AVAILABLE binary flag
    • +
    • aux/draw: replace binary HAVE_LLVM checks with LLVM_AVAILABLE
    • +
    • r600: replace binary HAVE_LLVM checks with LLVM_AVAILABLE
    • +
    • svga: replace binary HAVE_LLVM checks with LLVM_AVAILABLE
    • +
    • amd: replace major llvm version checks with LLVM_VERSION_MAJOR
    • +
    • swr: replace major llvm version checks with LLVM_VERSION_MAJOR
    • +
    • gallivm: replace major llvm version checks with LLVM_VERSION_MAJOR
    • +
    • clover: replace major llvm version checks with LLVM_VERSION_MAJOR
    • +
    • gallivm: replace more complex 3.x version check with LLVM_VERSION_MAJOR/MINOR
    • +
    • clover: replace more complex 3.x version check with LLVM_VERSION_MAJOR/MINOR
    • +
    • llvmpipe: replace more complex 3.x version check with LLVM_VERSION_MAJOR/MINOR
    • +
    • meson/scons/android: drop now-unused HAVE_LLVM
    • +
    • gallivm: drop LLVM<3.3 code paths as no build system allows that
    • +
    • anv: add support for driconf
    • +
    • wsi: add minImageCount override
    • +
    • anv: add support for vk_x11_override_min_image_count
    • +
    • amd: move adaptive sync to performance section, as it is defined in xmlpool
    • +
    • radv: add support for vk_x11_override_min_image_count
    • +
    • drirc: override minImageCount=2 for gfxbench
    • +
    • meson/iris: replace partial list of nir dep files with idep_nir_headers
    • +
    • meson/v3d: replace partial list of nir dep files with idep_nir_headers
    • +
    • gitlab-ci: rename stages to something simpler
    • +
    • gl: drop incorrect pkg-config file for glvnd
    • +
    • anv: split instance dispatch table
    • +
    • anv: implement ICD interface v4
    • +
    • meson: split compiler warnings one per line
    • +
    • radv: fix s/load/store/ copy-paste typo
    • +
    • meson: drop -Wno-foo bug workaround for Meson < 0.46
    • +
    • meson: split more compiler options to their own line
    • +
    • meson: re-add incorrect pkg-config files with GLVND for backward compatibility
    • +
    • docs/release-calendar: fix bugfix release numbers
    • +
    • docs/release-calendar: add missing <td> and </td>
    • +
    • glsl: turn runtime asserts of compile-time value into compile-time asserts
    • +
    • etnaviv: fix bitmask typo
    • +
    • docs/install: drop autotools references
    • +
    • git: delete .gitattributes
    • +
    • egl: replace MESA_EGL_NO_X11_HEADERS hack with upstream EGL_NO_X11
    • +
    • loader: replace int/1/0 with bool/true/false
    • +
    • loader: s/int/bool/ for predicate result
    • +
    • loader: use ARRAY_SIZE instead of NULL sentinel
    • +
    • meson/loader: drop unneeded *.h file
    • +
    • script: drop get_reviewer.pl
    • +
    • meson: add missing idep_nir_headers in iris_gen_libs
    • +
    • meson: use idep_nir instead of libnir in libnouveau
    • +
    • meson: use idep_nir instead of libnir in libclnir
    • +
    • meson: use idep_nir instead of libnir in gallium nine
    • +
    • meson: use idep_nir instead of libnir in haiku softpipe
    • +
    • meson: use idep_nir instead of libnir in pipe-loader
    • +
    • meson: rename libnir to _libnir to make it clear it's not meant to be used anywhere else
    • +
    • meson: drop duplicate inc_nir from libiris
    • +
    • meson: drop duplicate inc_nir from libglsl
    • +
    • meson: drop duplicate inc_nir from spirv2nir
    • +
    • meson: drop unused inc_nir
    • +
    • include: update drm-uapi
    • +
    • meson: fix sys/mkdev.h detection on Solaris
    • +
    • GL: drop symbols mangling support
    • +
    • meson: rename `glvnd_missing_pc_files` to `not glvnd_has_headers_and_pc_files`
    • +
    • meson: move a couple of include installs around
    • +
    • meson: split headers one per line
    • +
    • meson: split Mesa headers as a separate installation
    • +
    • meson: skip installation of GLVND-provided headers
    • +
    • symbols-check: ignore exported C++ symbols
    • +
    • anv: add exported symbols check
    • +
    • radv: add exported symbols check
    • +
    • gbm: turn 0/-1 bool into true/false
    • +
    • gbm: replace 1/0 bool with true/false
    • +
    • gbm: replace NULL sentinel with explicit ARRAY_SIZE()
    • +
    • gbm: use size_t for array indexes
    • +
    • gitlab-ci: set a common job parent for container stage
    • +
    • gitlab-ci: set a common job parent for build stage
    • +
    • gitlab-ci: set a common job parent for test stage
    • +
    • mesa/math: delete leftover... from 18 years ago (!)
    • +
    • mesa/math: delete duplicate extern symbol
    • +
    • util/u_atomic: fix return type of p_atomic_{inc,dec}_return() and p_atomic_{cmp,}xchg()
    • +
    • travis: don't (re)install python
    • +
    • travis: test meson install as well
    • +
    • osmesa: add missing #include <stdint.h>
    • +
    • llvmpipe: avoid compiling no-op block on release builds
    • +
    • llvmpipe: avoid generating empty-body blocks
    • +
    • meson: add -Werror=empty-body to disallow `if(x);`
    • +
    • anv: fix error message
    • +
    • anv: fix empty-body instruction
    • +
    • radv: fix empty-body instruction
    • +
    • v3d: fix empty-body instruction
    • +
    • tu: fix empty-body instruction
    • +
    • anv: add a couple printflike() annotations
    • +
    • loader: default to iris for all future PCI IDs
    • +
    • travis: fix scons build after deprecation warning
    • +
    • meson: define _GNU_SOURCE on FreeBSD
    • +
    • egl: fix _EGL_NATIVE_PLATFORM fallback
    • +
    • egl: move #include of local headers out of Khronos headers
    • +
    • vulkan: delete typo'd header
    • +

      +

      Erico Nunes (7):

      +
    • lima: fix ppir spill stack allocation
    • +
    • lima/ppir: lower selects to scalars
    • +
    • lima/ppir: enable vectorize optimization
    • +
    • lima/ppir: mark regalloc created ssa unspillable
    • +
    • lima/ppir: optimizations in regalloc spilling code
    • +
    • lima/ppir: improve regalloc spill cost calculation
    • +
    • lima: remove partial clear support from pipe->clear()
    • +

      +

      Erik Faye-Lund (210):

      +
    • gallium/auxiliary/indices: consistently apply start only to input
    • +
    • mesa/main: remove unused include
    • +
    • util: fix SSE-version needed for double opcodes
    • +
    • util: do not assume MSVC implies SSE
    • +
    • mesa/x86: improve SSE-checks for MSVC
    • +
    • util: only allow _BitScanReverse64 on 64-bit cpus
    • +
    • gallium/gdi: use GALLIUM_FOO rather than HAVE_FOO
    • +
    • st/mesa: remove always-true expression
    • +
    • .mailmap: add an alias for Michel Dänzer
    • +
    • .mailmap: add an alias for Eric Engestrom
    • +
    • .mailmap: add an alias for Bas Nieuwenhuizen
    • +
    • .mailmap: add an alias for Frank Binns
    • +
    • glsl: correct bitcast-helpers
    • +
    • loader/dri3: do not blit outside old/new buffers
    • +
    • .mailmap: specify spelling for Elie Tournier
    • +
    • .mailmap: add an alias for Alexandros Frantzis
    • +
    • .mailmap: add an alias for Gert Wollny
    • +
    • .mailmap: add an alias for Tomeu Vizoso
    • +
    • .mailmap: add a couple of aliases for Jakob Bornecrantz
    • +
    • nir: initialize uses_discard to false
    • +
    • nir: initialize needs_helper_invocations as well
    • +
    • mesa/main: prefer R8-textures instead of A8 for glBitmap in display lists
    • +
    • gallium/u_blitter: set a more sane viewport-state
    • +
    • mesa: expose alpha-ref as a state-variable
    • +
    • nir: allow passing alpha-ref state to lowering-code
    • +
    • mesa/gallium: automatically lower alpha-testing
    • +
    • st/mesa: move point_size_per_vertex-logic to helper
    • +
    • nir: add lowering-pass for point-size mov
    • +
    • mesa/gallium: automatically lower point-size
    • +
    • nir: support derefs in two-sided lighting lowering
    • +
    • mesa/gallium: automatically lower two-sided lighting
    • +
    • nir: support lowering clipdist to arrays
    • +
    • nir: support feeding state to nir_lower_clip_[vg]s
    • +
    • mesa/program: support referencing the clip-space clip-plane state
    • +
    • mesa/st: support lowering user-clip-planes automatically
    • +
    • panfrost: do not report alpha-test as supported
    • +
    • vc4: do not report alpha-test as supported
    • +
    • v3d: do not report alpha-test as supported
    • +
    • nir: drop support for using load_alpha_ref_float
    • +
    • nir: drop unused alpha_ref_float
    • +
    • mesa/st: assert that lowering is supported
    • +
    • Revert "nir: drop unused alpha_ref_float"
    • +
    • Revert "nir: drop support for using load_alpha_ref_float"
    • +
    • Revert "v3d: do not report alpha-test as supported"
    • +
    • Revert "vc4: do not report alpha-test as supported"
    • +
    • zink: introduce opengl over vulkan
    • +
    • zink: detect presence of VK_KHR_maintenance1
    • +
    • zink/spirv: implement point-sprites
    • +
    • zink: transform z-range
    • +
    • zink: remove discard_if
    • +
    • zink/spirv: implement some integer ops
    • +
    • zink/spirv: handle reading registers
    • +
    • zink/spirv: prepare for control-flow
    • +
    • zink/spirv: implement if-statements
    • +
    • zink/spirv: implement discard
    • +
    • zink/spirv: implement loops
    • +
    • zink: prepare for caching of renderpases/framebuffers
    • +
    • zink: move render-pass begin to helper
    • +
    • zink: do not leak image-views
    • +
    • zink: move cmdbuf-resetting into a helper
    • +
    • zink: prepare for multiple cmdbufs
    • +
    • zink: pass zink_render_pass to pipeline-creation
    • +
    • zink: cache programs
    • +
    • zink: move renderpass inside gfx pipeline state
    • +
    • zink: cache those pipelines
    • +
    • zink: reference renderpass and framebuffer from cmdbuf
    • +
    • zink: return old fence from zink_flush
    • +
    • zink: reference vertex and index buffers
    • +
    • zink: reference ubos and textures
    • +
    • zink: wait for idle on context-destroy
    • +
    • zink: whitespace cleanup
    • +
    • zink: reference blit/copy-region resources
    • +
    • zink: add curr_cmdbuf-helper
    • +
    • zink: delete samplers after the current cmdbuf
    • +
    • zink: texture-rects?
    • +
    • zink: store shader_info in zink_shader
    • +
    • zink: implement fmod
    • +
    • zink: track used resources
    • +
    • zink: do not destroy staging-resource, deref it
    • +
    • zink: use uvec for undefs
    • +
    • zink: emit dedicated block for variables
    • +
    • zink: ensure non-fragment shaders use lod-versions of texture
    • +
    • zink: ensure textures are transitioned properly
    • +
    • zink: assign increasing locations to varyings
    • +
    • zink: move primitive-topology stuff into program
    • +
    • zink: tweak state handling
    • +
    • zink: remove unusual alignment
    • +
    • zink: return after blitting
    • +
    • zink: implement batching
    • +
    • zink: simplify renderpass/framebuffer logic a tad
    • +
    • zink: cache render-passes
    • +
    • zink: cache framebuffers
    • +
    • zink: more batch-ism
    • +
    • zink: use helper
    • +
    • zink: fixup parameter name
    • +
    • zink: ensure sampler-views survive a batch
    • +
    • zink: remove hack-comment
    • +
    • zink: clean up render-pass management
    • +
    • zink: rename sampler-view destroy function
    • +
    • zink: pass screen instead of device to program-functions
    • +
    • zink: keep a reference to used render-passes
    • +
    • zink: prepare for shadow-samplers
    • +
    • zink: kill dead code
    • +
    • zink: clamp scissors
    • +
    • zink: do not use hash-table for regs
    • +
    • zink: squashme: forward declare hash_table
    • +
    • zink: squashme: trade cplusplus wrapper for header-guard
    • +
    • zink: fix off-by-one in assert
    • +
    • zink: reuse constants
    • +
    • zink: pool descriptors per batch
    • +
    • zink: request alpha-test lowering
    • +
    • zink/spirv: var -> regs
    • +
    • zink/spirv: rename vec_type
    • +
    • zink: do not lower io
    • +
    • zink: request ucp-lowering
    • +
    • zink: cleanup zink_end_batch
    • +
    • zink: drop unused argument
    • +
    • zink: refactor fence destruction
    • +
    • zink: only consider format-desc if checking details
    • +
    • zink: document end-of-frame hack
    • +
    • zink: use pipe_stencil_ref instead of uint32_t-array
    • +
    • zink: store sampler and image_view counts
    • +
    • zink: save original scissor and viewport
    • +
    • zink: save all supported util_blitter states
    • +
    • zink: process one aspect-mask bit at the time
    • +
    • zink: clean up opcode-emitting a bit
    • +
    • zink: add some opcodes
    • +
    • zink: add division ops
    • +
    • zink: add shift ops
    • +
    • zink: implement ineg
    • +
    • zink: more comparison-ops
    • +
    • zink: more converts
    • +
    • zink: add more compares
    • +
    • zink: crash hard on unknown queries
    • +
    • zink: abort on submit-failure
    • +
    • zink: stub resource_from_handle
    • +
    • zink: make sure imageExtent.depth is 1 for arrays
    • +
    • zink/spirv: correct opcode
    • +
    • zink: support more texturing
    • +
    • zink: wait for transfer when reading
    • +
    • zink/spirv: be a bit more strict with fragment-results
    • +
    • zink/spirv: debug-print unknown varying slots
    • +
    • zink: ensure layout is reasonable before copying
    • +
    • zink: fixup: save rasterizer
    • +
    • zink: set ExecutionModeDepthReplacing when depth is written
    • +
    • zink: avoid texelFetch until it's implemented
    • +
    • zink: remove insecure comment
    • +
    • zink: don't crash when setting rast-state to NULL
    • +
    • zink: add note about enabling PIPE_CAP_CLIP_HALFZ
    • +
    • zink/spirv: always enable Sampled1D for fragment shaders
    • +
    • zink: do not use both depth and stencil aspects for sampler-views
    • +
    • zink/spirv: support vec1 coordinates
    • +
    • zink: fixup boolean queries
    • +
    • zink: disable timestamp-queries
    • +
    • zink: move set_active_query_state-stub to zink_query.c
    • +
    • HACK: zink: suspend / resume queries on batch-boundaries
    • +
    • zink: also accept txl
    • +
    • zink: use primconvert to get rid of 8-bit indices
    • +
    • zink: initialize nr_samples for pipe_surface
    • +
    • zink: fix rendering to 3D-textures
    • +
    • zink: support shadow-samplers
    • +
    • zink: disable PIPE_CAP_QUERY_TIME_ELAPSED for now
    • +
    • zink: add missing sRGB DXT-formats
    • +
    • zink: lower point-size
    • +
    • zink/spirv: use ordered compares
    • +
    • zink/spirv: implement f2b1
    • +
    • zink/spirv: assert bit-size
    • +
    • zink/spirv: implement bcsel
    • +
    • zink/spirv: implement bitwise ops
    • +
    • zink/spirv: implement b2i32
    • +
    • zink/spirv: implement emit_select helper
    • +
    • zink/spirv: implement emit_float_const helper
    • +
    • zink/spirv: use bit_size instead of hard-coding
    • +
    • zink/spirv: add emit_bitcast-helper
    • +
    • zink/spirv: add emit_uint_const-helper
    • +
    • zink/spirv: inline get_uvec_constant into emit_load_const
    • +
    • zink/spirv: clean up get_[fu]vec_constant
    • +
    • zink/spirv: fixup b2i32 and implement b2f32
    • +
    • zink/spirv: prepare for 1-bit booleans
    • +
    • zink: do not lower bools to float
    • +
    • zink/spirv: fixup b2i32
    • +
    • zink/spirv: implement load_front_face
    • +
    • zink/spirv: alias generic varyings on non-generic ones
    • +
    • zink: lower two-sided coloring
    • +
    • zink/spirv: alias var0 on tex0 etc instead
    • +
    • zink: do not set VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT for non-3D textures
    • +
    • zink: use VK_FORMAT_B8G8R8A8_UNORM for PIPE_FORMAT_B8G8R8X8_UNORM
    • +
    • zink: implement resource_from_handle
    • +
    • zink: refactor blitting
    • +
    • zink: fixup return-value
    • +
    • zink: pass screen to zink_create_gfx_pipeline
    • +
    • zink: do not set lineWidth to invalid value
    • +
    • zink: fixup scissoring
    • +
    • zink/spirv: more complete sampler-dim handling
    • +
    • zink: simplify gl-to-vulkan lowering
    • +
    • gitlab-ci: also build Zink on CI
    • +
    • gitlab-ci: fixup debian tags
    • +
    • zink: error if VK_KHR_maintenance1 isn't supported
    • +
    • zink: emulate optional depth-formats
    • +
    • st/mesa: lower global vars to local after lowering clip
    • +
    • zink: use dynamic state for line-width
    • +
    • zink: use bitfield for dirty flagging
    • +
    • zink: drop nop descriptor-updates
    • +
    • zink: only enable KHR_external_memory_fd if supported
    • +
    • zink: emit line-width when using polygon line-mode
    • +
    • zink: use actual format for render-pass
    • +
    • zink: always allow mutating the format
    • +
    • zink: do not advertize coherent mapping
    • +
    • zink: disable fragment-shader texture-lod
    • +
    • zink: correct depth-stencil format
    • +

      +

      Francisco Jerez (56):

      +
    • intel/fs: Teach fs_inst::is_send_from_grf() about some missing send-like instructions.
    • +
    • intel/fs: Define is_payload() method of the IR instruction class.
    • +
    • intel/fs: Define is_send() convenience IR helper.
    • +
    • intel/fs: Fix constness of implied_mrf_writes() argument.
    • +
    • intel/eu: Split brw_inst ex_desc accessors for SEND(C) vs. SENDS(C).
    • +
    • intel/eu: Fix up various type conversions in brw_eu.c that are illegal C++.
    • +
    • intel/eu: Rework opcode description tables to allow efficient look-up by either HW or IR opcode.
    • +
    • intel/eu: Encode and decode native instruction opcodes from/to IR opcodes.
    • +
    • intel/ir: Drop hard-coded correspondence between IR and HW opcodes.
    • +
    • intel/ir: Represent physical and logical subsets of the CFG.
    • +
    • intel/ir: Add helper function to push block onto CFG analysis stack.
    • +
    • intel/ir: Represent logical edge of BREAK instruction.
    • +
    • intel/ir: Represent physical edge of ELSE instruction.
    • +
    • intel/ir: Represent physical edge of unconditional CONTINUE instruction.
    • +
    • intel/eu/gen12: Extend brw_inst.h macros for Gen12 support.
    • +
    • intel/eu/gen12: Add sanity-check asserts to brw_inst_bits() and brw_inst_set_bits().
    • +
    • intel/eu/gen12: Implement basic instruction binary encoding.
    • +
    • intel/eu/gen12: Implement three-source instruction binary encoding.
    • +
    • intel/eu/gen12: Implement control flow instruction binary encoding.
    • +
    • intel/eu/gen12: Implement SEND instruction binary encoding.
    • +
    • intel/eu/gen12: Implement indirect region binary encoding.
    • +
    • intel/eu/gen12: Implement compact instruction binary encoding.
    • +
    • intel/eu/gen12: Implement datatype binary encoding.
    • +
    • intel/eu/gen11+: Mark dot product opcodes as unsupported on opcode_descs table.
    • +
    • intel/eu/gen12: Add Gen12 opcode descriptions to the table.
    • +
    • intel/eu/gen12: Fix codegen of immediate source regions.
    • +
    • intel/eu/gen12: Codegen three-source instruction source and destination regions.
    • +
    • intel/eu/gen12: Codegen control flow instructions correctly.
    • +
    • intel/eu/gen12: Codegen pathological SEND source and destination regions.
    • +
    • intel/eu/gen12: Codegen SEND descriptor regions correctly.
    • +
    • intel/eu/gen12: Use SEND instruction for split sends.
    • +
    • intel/eu/gen12: Don't set DD control, it's gone.
    • +
    • intel/eu/gen12: Don't set thread control, it's gone.
    • +
    • intel/ir/gen12: Add SYNC hardware instruction.
    • +
    • intel/fs/gen12: Add codegen support for the SYNC instruction.
    • +
    • intel/eu/gen12: Add auxiliary type to represent SWSB information during codegen.
    • +
    • intel/eu/gen12: Add tracking of default SWSB state to the current brw_codegen instruction.
    • +
    • intel/eu/gen12: Set SWSB annotations in hand-crafted assembly.
    • +
    • intel/fs/gen12: Add scheduling information to the IR.
    • +
    • intel/fs/gen12: Introduce software scoreboard lowering pass.
    • +
    • intel/fs/gen12: Demodernize software scoreboard lowering pass.
    • +
    • intel/disasm/gen12: Disassemble software scoreboard information.
    • +
    • intel/disasm/gen12: Fix disassembly of some common instruction controls.
    • +
    • intel/disasm/gen12: Disassemble three-source instruction source and destination regions.
    • +
    • intel/disasm/gen12: Disassemble Gen12 SYNC instruction.
    • +
    • intel/disasm/gen12: Disassemble Gen12 SEND instructions.
    • +
    • intel/disasm: Don't disassemble saturate control on SEND instructions.
    • +
    • intel/disasm: Disassemble register file of split SEND sources.
    • +
    • intel/fs/gen12: Don't support source mods for 32x16 integer multiply.
    • +
    • intel/eu/validate/gen12: Implement integer multiply restrictions in EU validator.
    • +
    • intel/eu/validate/gen12: Fix validation of SYNC instruction.
    • +
    • intel/eu/validate/gen12: Validation fixes for SEND instruction.
    • +
    • intel/ir/gen12: Update assert in brw_stage_has_packed_dispatch().
    • +
    • intel/eu: Don't set notify descriptor field of gateway barrier message.
    • +
    • intel/fs/gen12: Fix barrier codegen.
    • +
    • intel/fs/gen11+: Fix CS_OPCODE_CS_TERMINATE codegen.
    • +

      +

      Fritz Koenig (5):

      +
    • include/GLES2: Sync GLES2 headers with Khronos
    • +
    • mesa: GetFramebufferParameteriv spelling
    • +
    • mesa: Allow MESA_framebuffer_flip_y for GLES 3
    • +
    • gallium: Enable MESA_framebuffer_flip_y
    • +
    • freedreno: reorder format check
    • +

      +

      Gert Wollny (4):

      +
    • radeonsi: Release storage for smda_uploads when the context is destroyed
    • +
    • etnaviv: enable triangle strips only when the hardware supports it
    • +
    • r600: Fix interpolateAtCentroid
    • +
    • r600: Disable eight bit three channel formats
    • +

      +

      Greg V (1):

      +
    • clover: use iterator_range in get_kernel_nodes
    • +

      +

      Gurchetan Singh (4):

      +
    • virgl: remove stride from virgl_hw_res
    • +
    • virgl: modify resource_create_from_handle(..) callback
    • +
    • virgl: modify internal structures to track winsys-supplied data
    • +
    • virgl: honor winsys supplied metadata
    • +

      +

      Haihao Xiang (1):

      +
    • i965: support AYUV/XYUV for external import only
    • +

      +

      Hal Gentz (11):

      +
    • glx: Fix SEGV due to dereferencing a NULL ptr from XCB-GLX.
    • +
    • clover: Fix build after clang r370122.
    • +
    • gallium/osmesa: Fix the inability to set no context as current.
    • +
    • egl: Add EGL_CONFIG_SELECT_GROUP_MESA ext.
    • +
    • egl: Fixes transparency with EGL and X11.
    • +
    • egl: Puts RGBA visuals in the second config selection group.
    • +
    • egl: Configs w/o double buffering support have no `EGL_WINDOW_BIT`.
    • +
    • Revert "egl: Configs w/o double buffering support have no `EGL_WINDOW_BIT`."
    • +
    • Revert "egl: Puts RGBA visuals in the second config selection group."
    • +
    • Revert "egl: Fixes transparency with EGL and X11."
    • +
    • Revert "egl: Add EGL_CONFIG_SELECT_GROUP_MESA ext."
    • +

      +

      Heinrich Fink (8):

      +
    • include: sync GL headers with registry
    • +
    • specs: Sync framebuffer_flip_y text with GL registry
    • +
    • headers: remove redundant GL token from GL wrapper
    • +
    • specs: Add GL_MESA_EGL_sync
    • +
    • registry: update gl.xml with GL_MESA_EGL_sync token
    • +
    • headers: Add GL_MESA_EGL_sync token to GL
    • +
    • egl: Add GL_MESA_EGL_sync support
    • +
    • mesa/gl: Sync with Khronos registry
    • +

      +

      Hyunjun Ko (3):

      +
    • freedreno/ir3: Add data structures to support texture pre-fetch
    • +
    • freedreno/ir3: Add support for texture sampling pre-dispatch
    • +
    • freedreno/ir3: fix printing output registers of FS.
    • +

      +

      Iago Toral (1):

      +
    • v3d: drop unused shader_rec_count member from context
    • +

      +

      Iago Toral Quiroga (13):

      +
    • prog_to_nir: VARYING_SLOT_PSIZ is a scalar
    • +
    • gallium/ttn: VARYING_SLOT_PSIZ and VARYING_SLOT_FOGC are scalar
    • +
    • nir/lower_point_size: assume scalar PSIZ
    • +
    • v3d: add missing line break for performance debug message
    • +
    • v3d: make sure we have enough space in the CL for the primitive counts packet
    • +
    • v3d: remove redundant update of queued draw calls
    • +
    • v3d: fix TF primitive counts for resume without draw
    • +
    • mesa/main: GL_GEOMETRY_SHADER_INVOCATIONS exists in GL_OES_geometry_shader
    • +
    • v3d: trivial update to obsolete comment
    • +
    • v3d: add new flag dirty TMU cache at v3d_compiler
    • +
    • broadcom: document known hardware issues for L2T flush command
    • +
    • v3d: request the kernel to flush caches when TMU is dirty
    • +
    • st/mesa: only require ESSL 3.1 for geometry shaders
    • +

      +

      Ian Romanick (22):

      +
    • nir/algrbraic: Don't optimize open-coded bitfield reverse when lowering is enabled
    • +
    • intel/compiler: Request bitfield_reverse lowering on pre-Gen7 hardware
    • +
    • nir/algebraic: Mark some value range analysis-based optimizations imprecise
    • +
    • nir/algebraic: Clean up value range analysis-based optimizations
    • +
    • nir/range-analysis: Adjust result range of exp2 to account for flush-to-zero
    • +
    • nir/range-analysis: Adjust result range of multiplication to account for flush-to-zero
    • +
    • nir/range-analysis: Fix incorrect fadd range result for (ne_zero, ne_zero)
    • +
    • nir/range-analysis: Handle constants in nir_op_mov just like nir_op_bcsel
    • +
    • nir/range-analysis: Range tracking for fpow
    • +
    • nir/range-analysis: Add a lot more assertions about the contents of tables
    • +
    • nir/algebraic: Do not apply late DPH optimization in vertex processing stages
    • +
    • nir/algebraic: Additional D3D Boolean optimization
    • +
    • nir/range-analysis: Bail if the types don't match
    • +
    • nir/range-analysis: Use types in the hash key
    • +
    • nir/range-analysis: Use types to provide better ranges from bcsel and mov
    • +
    • nir/search: Fix possible NULL dereference in is_fsign
    • +
    • intel/vec4: Don't try both sources as immediates for DPH
    • +
    • intel/compiler: Report the number of non-spill/fill SEND messages on vec4 too
    • +
    • nir/algebraic: Add the ability to mark a replacement as exact
    • +
    • nir/algebraic: Mark other comparison exact when removing a == a
    • +
    • intel/fs: Disable conditional discard optimization on Gen4 and Gen5
    • +
    • intel/compiler: Fix 'comparison is always true' warning
    • +

      +

      Icenowy Zheng (4):

      +
    • lima: reset scissor state if scissor test is disabled
    • +
    • lima: fix PLBU viewport configuration
    • +
    • lima: support rectangle texture
    • +
    • lima: do not set the PP uniforms address lowest bits
    • +

      +

      Ilia Mirkin (6):

      +
    • gallium/vl: use compute preference for all multimedia, not just blit
    • +
    • teximage: ensure that Tex*SubImage* checks format
    • +
    • gallium/tgsi: add support for DEMOTE and READ_HELPER opcodes
    • +
    • nvc0: add support for GL_EXT_demote_to_helper_invocation
    • +
    • gm107/ir: fix loading z offset for layered 3d image bindings
    • +
    • nv50/ir: mark STORE destination inputs as used
    • +

      +

      Illia Iorin (2):

      +
    • Revert "mesa/main: Fix multisample texture initialize"
    • +
    • mesa/main: Ignore filter state for MS texture completeness
    • +

      +

      Indrajit Das (1):

      +
    • radeon/vcn: exclude raven2 from vcn 2.0 encode initialization
    • +

      +

      James Xiong (5):

      +
    • gallium: simplify throttle implementation
    • +
    • gallium: rename PIPE_CAP_MAX_FRAMES_IN_FLIGHT to PIPE_CAP_THROTTLE
    • +
    • iris: finish aux import on get_param
    • +
    • gallium: do not increase ref count of the new throttle fence
    • +
    • iris: try to set the specified tiling when importing a dmabuf
    • +

      +

      Jan Beich (6):

      +
    • gallium/hud: add CPU usage support for DragonFly/NetBSD/OpenBSD
    • +
    • util: skip NEON detection if built with -mfpu=neon
    • +
    • util: detect NEON at runtime on FreeBSD
    • +
    • util: skip AltiVec detection if built with -maltivec
    • +
    • util: detect AltiVec at runtime on BSDs
    • +
    • util: simplify BSD includes
    • +

      +

      Jan Zielinski (3):

      +
    • swr/rasterizer: Enable ARB_fragment_layer_viewport
    • +
    • swr/rasterizer: Fix GS attributes processing
    • +
    • gallium/swr: Fix depth values for blit scenario
    • +

      +

      Jason Ekstrand (57):

      +
    • nir: Add explicit signs to image min/max intrinsics
    • +
    • intel/nir: Add a helper for getting BRW_AOP from an intrinsic
    • +
    • v3d: Use the correct opcodes for signed image min/max
    • +
    • intel/fs: Drop the gl_program from fs_visitor
    • +
    • intel/fs: Fix FB write inst groups
    • +
    • Revert "intel/fs: Move the scalar-region conversion to the generator."
    • +
    • anv: Bump maxComputeWorkgroupSize
    • +
    • intel/tools: Decode 3DSTATE_BINDING_TABLE_POINTERS on SNB
    • +
    • intel/tools: Decode PS kernels on SNB
    • +
    • blorp: Memset surface info to zero when initializing it
    • +
    • intel/blorp: Expose surf_retile_w_to_y internally
    • +
    • intel/blorp: Expose surf_fake_interleaved_msaa internally
    • +
    • intel/blorp: Use wide formats for nicely aligned stencil clears
    • +
    • nir: Handle complex derefs in nir_split_array_vars
    • +
    • nir: Don't infinitely recurse in lower_ssa_defs_to_regs_block
    • +
    • nir: Add a block_is_unreachable helper
    • +
    • nir/repair_ssa: Repair dominance for unreachable blocks
    • +
    • nir/repair_ssa: Insert deref casts when needed
    • +
    • nir/dead_cf: Repair SSA if the pass makes progress
    • +
    • intel/fs: Handle UNDEF in split_virtual_grfs
    • +
    • vulkan: Update the XML and headers to 1.1.123
    • +
    • Move blob from compiler/ to util/
    • +
    • util/rb_tree: Add the unit tests
    • +
    • util/rb_tree: Reverse the order of comparison functions
    • +
    • intel/fs: Allow UB, B, and HF types in brw_nir_reduction_op_identity
    • +
    • intel/fs: Allow CLUSTER_BROADCAST to do type conversion
    • +
    • intel/fs: Do 8-bit subgroup scan operations in 16 bits
    • +
    • anv: Advertise VK_KHR_shader_subgroup_extended_types
    • +
    • nir/repair_ssa: Replace the unreachable check with the phi builder
    • +
    • util/rb_tree: Replace useless ifs with asserts
    • +
    • util/rb_tree: Also test _safe iterators
    • +
    • util/rb_tree: Stop relying on &iter->field != NULL
    • +
    • intel/fs: Fix fs_inst::flags_read for ANY/ALL predicates
    • +
    • anv/pipeline: Capture serialized NIR
    • +
    • intel/eu/validate/gen12: Don't blow up on indirect src0.
    • +
    • intel/fs/gen12: Implement gl_FrontFacing on gen12+.
    • +
    • intel/genxml: Remove W-tiling on gen12
    • +
    • intel/isl: Select Y-tiling for stencil on gen12
    • +
    • intel/isl: Add isl_aux_usage_has_ccs
    • +
    • spirv/info: Add a memorymodel_to_string helper
    • +
    • Revert "mapi: Inline call x86_current_tls."
    • +
    • intel/blorp: Use surf instead of aux_surf for image dimensions
    • +
    • intel/isl: Add new aux modes available on gen12
    • +
    • intel/isl/fill_state: Separate aux_mode handling from aux_surf
    • +
    • intel/isl: Update surf_fill_state for gen12
    • +
    • intel/isl: Support HIZ_CCS in emit_depth_stencil_hiz
    • +
    • anv: Delay allocation of relocation lists
    • +
    • anv: Reduce the minimum number of relocations
    • +
    • intel/vec4: Set brw_stage_prog_data::has_ubo_pull
    • +
    • anv: Avoid emitting UBO surface states that won't be used
    • +
    • anv: Fix a potential BO handle leak
    • +
    • anv/tests: Zero-initialize instances
    • +
    • anv: Set the batch allocator for compute pipelines
    • +
    • anv: Stop bounds-checking pushed UBOs
    • +
    • anv: Set up SBE_SWIZ properly for gl_Viewport
    • +
    • anv: Re-emit all compute state on pipeline switch
    • +
    • anv: Don't leak when set_tiling fails
    • +

      +

      Jean Hertel (1):

      +
    • Fix missing dri2_load_driver on platform_drm
    • +

      +

      Jiadong Zhu (1):

      +
    • mesa: fix texStore for FORMAT_Z32_FLOAT_S8X24_UINT
    • +

      +

      Jiang, Sonny (1):

      +
    • loader: always map the "amdgpu" kernel driver name to radeonsi (v2)
    • +

      +

      John Stultz (1):

      +
    • Android.mk: Fix missing \ from recent llvm change
    • +

      +

      Jon Turney (2):

      +
    • Fix timespec_from_nsec test for 32-bit time_t
    • +
    • rbug: Fix use of alloca() without #include "c99_alloca.h"
    • +

      +

      Jonathan Gray (3):

      +
    • mapi: Adapted libglvnd x86 tsd changes
    • +
    • winsys/amdgpu: avoid double simple_mtx_unlock()
    • +
    • i965: update Makefile.sources for perf changes
    • +

      +

      Jonathan Marek (90):

      +
    • freedreno/a2xx: ir2: fix lowering of instructions after float lowering
    • +
    • freedreno/a2xx: ir2: remove pointcoord y invert
    • +
    • freedreno/a2xx: ir2: set lower_fdph
    • +
    • freedreno/a2xx: ir2: fix saturate in cp
    • +
    • freedreno/a2xx: ir2: check opcode on the right instruction in export cp
    • +
    • freedreno/a2xx: ir2: fix incorrect instruction reordering
    • +
    • freedreno/a2xx: ir2: update register state in scalar insert
    • +
    • freedreno/a2xx: fix SRC_ALPHA_SATURATE for alpha blend function
    • +
    • freedreno/a2xx: implement polygon offset
    • +
    • freedreno/a2xx: fix depth gmem restore
    • +
    • freedreno/a2xx: formats update
    • +
    • u_format: add ETC2 to util_format_srgb/util_format_linear
    • +
    • u_format: float type for R11G11B10_FLOAT/R9G9B9E5_FLOAT
    • +
    • etnaviv: fix two-sided stencil
    • +
    • turnip: fix binning shader compilation
    • +
    • turnip: use image tile_mode for gmem configuration
    • +
    • turnip: emit shader immediates
    • +
    • turnip: fix vertex_id
    • +
    • turnip: implement sampler state
    • +
    • turnip: implement image view descriptor
    • +
    • turnip: use linear tiling for scanout image
    • +
    • turnip: align layer_size
    • +
    • turnip: enable linear filtering
    • +
    • turnip: basic descriptor sets (uniform buffer and samplers)
    • +
    • turnip: lower samplers and uniform buffer indices
    • +
    • turnip: use nir_opt_copy_prop_vars
    • +
    • turnip: add some shader information in pipeline state
    • +
    • turnip: emit texture and uniform state
    • +
    • etnaviv: nir: fix gl_FrontFacing
    • +
    • etnaviv: nir: allocate contiguous components for LOAD destination
    • +
    • etnaviv: nir: set num_components for inputs/outputs
    • +
    • qetnaviv: nir: use new immediates when possible
    • +
    • etnaviv: nir: add native integers (HALTI2+)
    • +
    • etnaviv: nir: use store_deref instead of store_output
    • +
    • etnaviv: nir: remove "options" struct
    • +
    • etnaviv: remove extra allocation for shader code
    • +
    • etnaviv: nir: make lower_alu easier to follow
    • +
    • etnaviv: disable earlyZ when shader writes fragment depth
    • +
    • etnaviv: nir: fix gl_FragDepth
    • +
    • etnaviv: update headers from rnndb
    • +
    • etnaviv: implement texture comparator
    • +
    • etnaviv: set texture INT_FILTER bit
    • +
    • etnaviv: clear texture cache and flush ts when texture is modified
    • +
    • etnaviv: get addressing mode from tiling layout
    • +
    • etnaviv: rework compatible render base
    • +
    • etnaviv: rework etna_resource_create tiling choice
    • +
    • freedreno/ir3: remove input ncomp field
    • +
    • freedreno/ir3: increase size of inputs/outputs arrays
    • +
    • freedreno/ir3: implement fdd{x,y}_coarse opcodes
    • +
    • freedreno/ir3: fix GETLOD for negative LODs
    • +
    • freedreno/ir3: implement texop_texture_samples
    • +
    • freedreno/ir3: implement fquantize2f16
    • +
    • freedreno/regs: update a6xx 2d blit bits
    • +
    • turnip: fix triangle strip
    • +
    • turnip: fix 32 vertex attributes case
    • +
    • turnip: fix segmentation fault in events
    • +
    • turnip: fix segmentation fault with compute pipeline
    • +
    • turnip: fix assert failing for 0 color attachments
    • +
    • turnip: add astc format layout
    • +
    • turnip: add format_is_uint/format_is_sint
    • +
    • turnip: format table fixes
    • +
    • turnip: add more 2d_ifmt translations
    • +
    • turnip: improve view descriptor
    • +
    • turnip: improve sampler descriptor
    • +
    • turnip: add black border color
    • +
    • turnip: add VK_KHR_sampler_mirror_clamp_to_edge
    • +
    • turnip: update setup_slices
    • +
    • turnip: disable tiling as necessary
    • +
    • turnip: add anisotropy and compressed formats to device features
    • +
    • turnip: update some shader state bits from GL driver
    • +
    • turnip: fixup consts
    • +
    • turnip: add code to lower indirect samplers
    • +
    • turnip: add missing nir passes
    • +
    • turnip: use nir_assign_io_var_locations instead of nir_assign_var_locations
    • +
    • turnip: improve CmdCopyImage and implement CmdBlitImage
    • +
    • turnip: basic msaa working
    • +
    • turnip: depth/stencil
    • +
    • turnip: push constants
    • +
    • turnip: more descriptor sets
    • +
    • spirv: set correct dest_type for texture query ops
    • +
    • etnaviv: fix linear_nearest / nearest_linear filters on GC7000Lite
    • +
    • etnaviv: fix TS samplers on GC7000L
    • +
    • etnaviv: check NO_ASTC feature bit
    • +
    • freedreno/a2xx: use sysval for pointcoord
    • +
    • freedreno/a2xx: add missing vertex formats (SSCALE/USCALE/FIXED)
    • +
    • etnaviv: fix depth bias
    • +
    • etnaviv: stencil fix
    • +
    • etnaviv: fix non-pointsprite points on GC7000L
    • +
    • freedreno/ir3: disable texture prefetch for 1d array textures
    • +
    • freedreno/registers: fix a6xx_2d_blit_cntl ROTATE
    • +

      +

      Jordan Justen (42):

      +
    • intel/genxml: Handle field names with different spacing/hyphen
    • +
    • intel/genxml/gen11: Add spaces in EnableUnormPathInColorPipe
    • +
    • intel/genxml: Run sort_xml.sh to tidy gen9.xml and gen11.xml
    • +
    • intel/genxml: Add gen12.xml as a copy of gen11.xml
    • +
    • intel/genxml: Build gen12 genxml
    • +
    • intel/isl: Build gen12 using gen11 code paths
    • +
    • intel/compiler: Disable compaction on gen12 for now
    • +
    • intel/l3: Don't assert on gen12 (use gen11 config temporarily)
    • +
    • iris: Build for gen12
    • +
    • anv: Build for gen12
    • +
    • i965: Exit with error if gen12+ is detected
    • +
    • pci_id_driver_map: Support preferring iris over i965
    • +
    • anv,iris: L3ALLOC register replaces L3CNTLREG for gen12
    • +
    • iris/state: Move reg/mem load/store functions earlier in file
    • +
    • intel/ir: Lower fpow on Gen12.
    • +
    • intel/genxml,isl: Add gen12 render surface state changes
    • +
    • intel/genxml,isl: Add gen12 depth buffer changes
    • +
    • intel/genxml,isl: Add gen12 stencil buffer changes
    • +
    • intel/isl: Add gen12 depth/stencil surface alignments
    • +
    • iris: Let isl decide the supported tiling in more situations
    • +
    • intel/isl: Add R10G10B10_FLOAT_A2_UNORM format
    • +
    • iris/resource: Use isl surface alignment during bo allocation
    • +
    • intel/common: Add interface to allocate device buffers
    • +
    • anv: Implement aux-map allocator interface
    • +
    • intel/common: Add surface to aux map translation table support
    • +
    • anv/gen12: Initialize aux map context
    • +
    • genxml/gen12: Add AUX MAP register definitions
    • +
    • anv/gen12: Write GFX_AUX_TABLE base address register
    • +
    • iris/bufmgr: Initialize aux map context for gen12
    • +
    • isl/gen12: 64k surface alignment
    • +
    • iris: Map each surf to it's aux-surf in the aux-map tables
    • +
    • iris/gen12: Write GFX_AUX_TABLE base address register
    • +
    • iris: Mark aux-map BO as used by all batches
    • +
    • intel: Update alignment restrictions for HiZ surfaces.
    • +
    • iris: Set MOCS for external surfaces to uncached
    • +
    • intel/genxml: Add gen12 tile cache flush bit
    • +
    • intel/dev: Add preliminary device info for Tigerlake
    • +
    • intel/eu/validate/gen12: Add TGL to eu_validate tests.
    • +
    • docs/relnotes/new_features.txt: Add note about gen12 support
    • +
    • iris: Add IRIS_DIRTY_RENDER_BUFFER state flag
    • +
    • iris/gen11+: Move flush for render target change
    • +
    • iris: Allow max dynamic pool size of 2GB for gen12
    • +

      +

      Jose Maria Casanova Crespo (5):

      +
    • mesa: recover target_check before get_current_tex_objects
    • +
    • v3d: writes to magic registers aren't RF writes after THREND
    • +
    • v3d: flag dirty state when binding compute states
    • +
    • v3d: Explicitly expose OpenGL ES Shading Language 3.1
    • +
    • v3d: Fix predication with atomic image operations
    • +

      +

      José Fonseca (5):

      +
    • glx: Fix incompatible function pointer types.
    • +
    • util: Prevent implicit declaration of function getenv.
    • +
    • util: Prevent strcasecmp macro redefinion.
    • +
    • scons: Make GCC builds stricter.
    • +
    • scons: Fix force_scons parsing.
    • +

      +

      Juan A. Suarez Romero (14):

      +
    • docs: add release notes for 19.1.5
    • +
    • docs: add sha256 checksums for 19.1.5
    • +
    • docs: update calendar, add news item and link release notes for 19.1.5
    • +
    • docs: add release notes for 19.1.6
    • +
    • docs: add sha256 checksums for 19.1.6
    • +
    • docs: update calendar, add news item and link release notes for 19.1.6
    • +
    • docs: extend 19.1.x releases
    • +
    • docs: add release notes for 19.1.7
    • +
    • docs: add sha256 checksums for 19.1.7
    • +
    • docs: update calendar, add news item and link release notes for 19.1.7
    • +
    • bin/get-pick-list.sh: sha1 commits can be smaller than 8 chars
    • +
    • docs: add release notes for 19.1.8
    • +
    • docs: add release notes for 19.1.8
    • +
    • docs: update calendar, add news item and link release notes for 19.1.8
    • +

      +

      Karol Herbst (15):

      +
    • gallium: add blob field to pipe_llvm_program_header
    • +
    • rename pipe_llvm_program_header to pipe_binary_program_header
    • +
    • clover/functional: add id_equals helper
    • +
    • clover: add support for drivers having no proper binary format
    • +
    • clover: prepare supporting multiple IRs
    • +
    • clover: add support for passing kernels as nir to the driver
    • +
    • nvc0: expose spirv support
    • +
    • clover/nir: fix compilation with g++-5.5 and maybe earlier
    • +
    • nv50/ir: fix unnecessary parentheses warning
    • +
    • nv50/ir/nir: comparison of integer expressions of different signedness warning
    • +
    • clover/llvm: remove harmful std::move call
    • +
    • clover/codegen: remove unused get_symbol_offsets function
    • +
    • clover: eliminate "ignoring attributes on template argument" warning
    • +
    • st/mesa: fix crash for drivers supporting nir defaulting to tgsi
    • +
    • nv50/ir: remove DUMMY edge type
    • +

      +

      Ken Mays (1):

      +
    • haiku: fix Mesa build
    • +

      +

      Kenneth Graunke (86):

      +
    • gallium/ddebug: Wrap resource_get_param if available
    • +
    • gallium/trace: Wrap resource_get_param if available
    • +
    • gallium/rbug: Wrap resource_get_param if available
    • +
    • gallium/noop: Implement resource_get_param
    • +
    • iris: Replace devinfo->gen with GEN_GEN
    • +
    • iris: Fix broken aux.possible/sampler_usages bitmask handling
    • +
    • iris: Update fast clear colors on Gen9 with direct immediate writes.
    • +
    • iris: Drop copy format hacks from copy region based transfer path.
    • +
    • iris: Avoid unnecessary resolves on transfer maps
    • +
    • iris: Set MOCS in all STATE_BASE_ADDRESS commands
    • +
    • iris: Fix large timeout handling in rel2abs()
    • +
    • isl: Drop UnormPathInColorPipe for buffer surfaces.
    • +
    • isl: Don't set UnormPathInColorPipe for integer surfaces.
    • +
    • iris: Delete dead prototype
    • +
    • intel/compiler: Fix src0/desc setter ordering
    • +
    • intel/compiler: Handle bits 15:12 in brw_send_indirect_split_message()
    • +
    • intel/compiler: Refactor FB write message control setup into a helper.
    • +
    • intel/compiler: Use generic SEND for Gen7+ FB writes
    • +
    • intel/compiler: Use new Gen11 headerless RT writes for MRT cases
    • +
    • util: Add a _mesa_i64roundevenf() helper.
    • +
    • mesa: Fix _mesa_float_to_unorm() on 32-bit systems.
    • +
    • iris: Drop swizzling parameter from s8_offset.
    • +
    • iris: Don't auto-flush/dirty on transfer unmap for coherent buffers
    • +
    • iris: Actually describe bo_reuse driconf option
    • +
    • iris: Fix partial fast clear checks to account for miplevel.
    • +
    • iris: Lessen texture cache hack flush for blits/copies on Icelake.
    • +
    • iris: Report correct number of planes for planar images
    • +
    • iris: Invalidate state/texture/constant caches after STATE_BASE_ADDRESS
    • +
    • intel: Stop redirecting state cache to command streamer cache section
    • +
    • iris: Support the disable_throttling=true driconf option.
    • +
    • iris: Ignore line stipple information if it's disabled
    • +
    • iris: Add support for the always_flush_cache=true debug option.
    • +
    • iris: Optimize out redundant sampler state binds
    • +
    • iris: Avoid flushing for cache history on transfer range flushes
    • +
    • iris: Fix constant buffer sizes for non-UBOs
    • +
    • gallium: Fix util_format_get_depth_only
    • +
    • iris: Finish initializing the BO before stuffing it in the hash table
    • +
    • iris: Set bo->reusable = false in iris_bo_make_external_locked
    • +
    • st/mesa: Only pause queries if there are any active queries to pause.
    • +
    • iris: trivial whitespace fixes
    • +
    • iris: Initialize ice->state.prim_mode to an invalid value
    • +
    • st/mesa: Prefer 5551 formats for GL_UNSIGNED_SHORT_5_5_5_1.
    • +
    • st/mesa: Increase GL_POINT_SIZE_RANGE minimum to 1.0
    • +
    • intel/compiler: Set "Null Render Target" ex_desc bit on Gen11
    • +
    • iris: Skip allocating a null surface when there are 0 color regions.
    • +
    • iris: Flag IRIS_DIRTY_BINDINGS_XS on constant buffer rebinds
    • +
    • iris: Explicitly emit 3DSTATE_BTP_XS on Gen9 with DIRTY_CONSTANTS_XS
    • +
    • iris: Don't flag IRIS_DIRTY_BINDINGS for constant usage history
    • +
    • iris: Track per-stage bind history, reduce work accordingly
    • +
    • intel/compiler: Record whether any pull constant loads occur
    • +
    • iris: Avoid uploading SURFACE_STATE descriptors for UBOs if possible
    • +
    • iris: Use state_refs for draw parameters.
    • +
    • iris: Rework iris_update_draw_parameters to be more efficient
    • +
    • iris: Skip double-disabling TCS/TES/GS after BLORP operations
    • +
    • isl: Drop WaDisableSamplerL2BypassForTextureCompressedFormats on Gen11
    • +
    • st/mesa: Bail on incomplete attachments in discard_framebuffer
    • +
    • intel/genxml: Stop manually scrubbing 'α' -> "alpha"
    • +
    • broadcom/genxml: Stop manually scrubbing 'α' -> "alpha"
    • +
    • Revert "intel/gen11+: Enable Hardware filtering of Semi-Pipelined State in WM"
    • +
    • intel: Increase Gen11 compute shader scratch IDs to 64.
    • +
    • iris: Only resolve for image levels/layers which are actually in use.
    • +
    • iris: Disable CCS_E for 32-bit floating point textures.
    • +
    • iris: Fix iris_rebind_buffer() for VBOs with non-zero offsets.
    • +
    • st/dri: Perform MSAA downsampling for __DRI2_THROTTLE_COPYSUBBUFFER
    • +
    • dri: Avoid swapbuffer throttling in glXCopySubBufferMESA
    • +
    • iris: Refactor push constant allocation so we can reuse it
    • +
    • iris: Hack up a SKL/Gen9LP PS push constant fifo depth workaround
    • +
    • Revert "iris: Hack up a SKL/Gen9LP PS push constant fifo depth workaround"
    • +
    • iris: Drop bonus parameters from iris_init_*_context()
    • +
    • iris: Drop vtbl usage for some load_register calls
    • +
    • iris: Update comment about 3-component formats and buffer textures
    • +
    • iris: Properly unreference extra VBOs for draw parameters
    • +
    • st/mesa: Fix inverted polygon stipple condition
    • +
    • iris: Implement the Broadwell NP Z PMA Stall Fix
    • +
    • intel/fs/gen12: Use TCS 8_PATCH mode.
    • +
    • iris: Implement the Gen < 9 tessellation quads workaround
    • +
    • mesa: Use ctx->ReadBuffer in glReadBuffer back-to-front tests
    • +
    • mesa: Make back_to_front_if_single_buffered non-static
    • +
    • mesa: Handle pbuffers in desktop GL framebuffer attachment queries
    • +
    • intel/compiler: Report the number of non-spill/fill SEND messages
    • +
    • st/mesa: Silence chatty debug printf
    • +
    • iris: Rework edgeflag handling
    • +
    • nir: Use VARYING_SLOT_TESS_MAX to size indirect bitmasks
    • +
    • iris: Fix "Force Zero RTA Index Enable" setting again
    • +
    • driconf, glsl: Add a vs_position_always_invariant option
    • +
    • drirc: Set vs_position_always_invariant for Shadow of Mordor on Intel
    • +

      +

      Kevin Strasser (14):

      +
    • drm-uapi: Update headers for fp16 formats
    • +
    • i965: Add helper function for allowed config formats
    • +
    • gallium: Use consistent approach for config format filtering
    • +
    • dri: Add config attributes for color channel shift
    • +
    • util: move bitcount to bitscan.h
    • +
    • egl: Convert configs to use shifts and sizes instead of masks
    • +
    • glx: Add fields for color shifts
    • +
    • dri: Handle configs with floating point pixel data
    • +
    • egl: Handle dri configs with floating point pixel data
    • +
    • dri: Add fp16 formats
    • +
    • gbm: Add buffer handling and visuals for fp16 formats
    • +
    • i965: Add handling for fp16 configs
    • +
    • gallium: Add buffer and configs handling or fp16 formats
    • +
    • egl: Fix implicit declaration of ffs
    • +

      +

      Khaled Emara (2):

      +
    • freedreno/a3xx: fix texture tiling parameters
    • +
    • freedreno/a3xx: fix sysmem <-> gmem tiles transfer
    • +

      +

      Kristian Høgsberg (40):

      +
    • freedreno/a6xx: Let the GPU track streamout offsets
    • +
    • freedreno/a6xx: Implement primitive count queries on GPU
    • +
    • freedreno/a6xx: Track location of gl_Position out as we link it
    • +
    • freedreno/a6xx: Share shader state constructor and destructor
    • +
    • freedreno/a6xx: Turn on vectorize_io
    • +
    • freedreno/a6xx: Write multiple regs for SP_VS_OUT_REG and SP_VS_VPC_DST_REG
    • +
    • freedreno/regs: Fix CP_DRAW_INDX_OFFSET command
    • +
    • freedreno/regs: A couple of tess updates
    • +
    • freedreno/a6xx: Factor out const state setup
    • +
    • freedreno: Rename vp and fp to vs and fs in fd_program_stateobj
    • +
    • freedreno: Add state binding functions for HS/DS/GS
    • +
    • freedreno: Move fs functions after geometry pipeline stages
    • +
    • freedreno/a6xx: Add generic program stateobj support for HS/DS/GS
    • +
    • freedreno/ir3: Add HS/DS/GS to shader key and cache
    • +
    • freedreno/a6xx: Emit const and texture state for HS/DS/GS
    • +
    • freedreno/a6xx: Move instrlen and obj_start writes to fd6_emit_shader
    • +
    • freedreno/registers: Update with GS, HS and DS registers
    • +
    • freedreno/a6xx: Trim a few regs from fd6_emit_restore()
    • +
    • freedreno/ir3: Add support for CHSH and CHMASK instructions
    • +
    • freedreno/ir3: Use third register for offset for LDL and LDLV
    • +
    • freedreno/ir3: Extend RA with mechanism for pre-coloring registers
    • +
    • freedreno/ir3: Add new LDLW/STLW instructions
    • +
    • freedreno/ir3: Add intrinsics that map to LDLW/STLW
    • +
    • freedreno/a6xx: Add missing adjacency primitives to table
    • +
    • freedreno/ir3: Add has_gs flag to shader key
    • +
    • freedreno/ir3: Implement lowering passes for VS and GS
    • +
    • freedreno/ir3: Implement primitive layout intrinsics
    • +
    • freedreno/ir3: Setup ir3 inputs and outputs for GS
    • +
    • freedreno/ir3: Pre-color GS header and primitive ID
    • +
    • freedreno/ir3: Start GS with (ss) and (sy)
    • +
    • freedreno/ir3: End VS with CHMASK and CHSH in GS pipelines
    • +
    • freedreno/a6xx: Emit program state for GS
    • +
    • freedreno/a6xx: Support layered render targets
    • +
    • st/mesa: Also enable GS when ESSLVersion > 320
    • +
    • freedreno/blitter: Save GS state
    • +
    • freedreno/a6xx: Implement PIPE_QUERY_PRIMITIVES_GENERATED for GS
    • +
    • freedreno/ci: Add failing tests to skip list
    • +
    • freedreno/a6xx: Turn on geometry shaders
    • +
    • nir: Use BITSET for tracking varyings in lower_io_arrays
    • +
    • freedreno/a6xx: Disable geometry shaders for release
    • +

      +

      Krzysztof Raszkowski (2):

      +
    • util: Add unreachable() definition for clang compiler.
    • +
    • gallium/swr: Enable GL_ARB_gpu_shader5: multiple streams
    • +

      +

      Laurent Carlier (1):

      +
    • egl: avoid local modifications for eglext.h Khronos standard header file
    • +

      +

      Leo Liu (3):

      +
    • radeon/vcn: add RENOIR VCN decode support
    • +
    • radeon/vcn: Add VP9 8K decode support
    • +
    • radeonsi: enable 8K video decode support for HEVC and VP9
    • +

      +

      Lepton Wu (14):

      +
    • st/mesa: Allow zero as [level|layer]_override
    • +
    • virgl: Fix pipe_resource leaks under multi-sample.
    • +
    • egl/android: Only keep BGRA EGL configs as fallback
    • +
    • virgl: replace fprintf with _debug_printf
    • +
    • virgl: Remove wrong EAGAIN handling for drmIoctl
    • +
    • gbm: Add GBM_MAX_PLANES definition
    • +
    • egl/android: Remove our own reference to buffers.
    • +
    • virgl: Remove formats with unusual sample count.
    • +
    • mapi: Inline call x86_current_tls.
    • +
    • mapi: split entry_generate_or_patch for x86 tls
    • +
    • mapi: Clean up entry_patch_public for x86 tls
    • +
    • mapi: Inline call x86_current_tls.
    • +
    • mapi: Improve the x86 tsd stubs performance.
    • +
    • gallium: dri2: Use index as plane number.
    • +

      +

      Lionel Landwerlin (59):

      +
    • glsl/tests: take refs on glsl types
    • +
    • nir/tests: take reference on glsl types
    • +
    • compiler: ensure glsl types are not created without a reference
    • +
    • mesa/compiler: rework tear down of builtin/types
    • +
    • radeonsi: take reference glsl types for compile threads
    • +
    • i965: honor scanout requirement from DRI
    • +
    • util/timespec: use unsigned 64 bit integers for nsec values
    • +
    • util: fix compilation on macos
    • +
    • egl: fix platform selection
    • +
    • vulkan/overlay: bounce image back to present layout
    • +
    • intel: update product names for WHL
    • +
    • radv: store engine name
    • +
    • driconfig: add a new engine name/version parameter
    • +
    • vulkan: add vk_x11_strict_image_count option
    • +
    • util/xmlconfig: fix regexp compile failure check
    • +
    • drirc: include unreal engine version 0 to 23
    • +
    • anv: gem-stubs: return a valid fd got anv_gem_userptr()
    • +
    • intel: use proper label for Comet Lake skus
    • +
    • intel: Add new Comet Lake PCI-ids
    • +
    • mesa: don't forget to clear _Layer field on texture unit
    • +
    • intel: fix topology query
    • +
    • intel/error2aub: add support for platforms without PPGTT
    • +
    • intel: fix subslice computation from topology data
    • +
    • intel/isl: Set null surface format to R32_UINT
    • +
    • intel/isl: set surface array appropriately
    • +
    • intel/isl: set vertical surface alignment on null surfaces
    • +
    • etnaviv: remove variable from global namespace
    • +
    • anv: fix vkUpdateDescriptorSets with inline uniform blocks
    • +
    • anv: fix memory leak on device destroy
    • +
    • anv: fix unwind of vkCreateDevice fail
    • +
    • intel/perf: add mdapi maker helper
    • +
    • intel/perf: expose some utility functions
    • +
    • intel/perf: extract register configuration
    • +
    • intel/perf: move registers to their own header
    • +
    • drm-uapi: Update headers from drm-next
    • +
    • intel/perf: add support for querying kernel loaded configurations
    • +
    • intel/genxml: add generic perf counters registers
    • +
    • intel/genxml: add RPSTAT register for core frequency
    • +
    • intel/perf: add mdapi writes for register perf counters
    • +
    • anv: implement VK_INTEL_performance_query
    • +
    • docs: Add new Intel extension
    • +
    • intel/dev: store whether the device uses an aux map tables on devinfo
    • +
    • anv: Add aux-map translation for gen12+
    • +
    • intel/perf: update ICL configurations
    • +
    • intel/dump_gpu: handle context create extended ioctl
    • +
    • intel/dev: set default num_eu_per_subslice on gen12
    • +
    • mesa: check draw buffer completeness on glClearBufferfi/glClearBufferiv
    • +
    • anv: Properly handle host query reset of performance queries
    • +
    • mesa: check framebuffer completeness only after state update
    • +
    • anv: invalidate file descriptor of semaphore sync fd at vkQueueSubmit
    • +
    • anv: remove list items on batch fini
    • +
    • anv/wsi: signal the semaphore in the acquireNextImage
    • +
    • intel/perf: fix invalid hw_id in query results
    • +
    • intel/perf: set read buffer len to 0 to identify empty buffer
    • +
    • intel/perf: take into account that reports read can be fairly old
    • +
    • intel/perf: simplify the processing of OA reports
    • +
    • intel/perf: fix improper pointer access
    • +
    • anv: fix missing gen12 handling
    • +
    • anv: fix incorrect VMA alignment for CCS main surfaces
    • +

      +

      Lucas Stach (17):

      +
    • etnaviv: fix vertex buffer state emission for single stream GPUs
    • +
    • gallium/util: don't depend on implementation defined behavior in listen()
    • +
    • rbug: fix transmitted texture sizes
    • +
    • rbug: unwrap index buffer resource
    • +
    • rbug: move flush_resource initialization
    • +
    • rbug: implement missing explicit sync related fence functions
    • +
    • rbug: forward texture_barrier to pipe driver
    • +
    • rbug: forward can_create_resource to pipe driver
    • +
    • rbug: implement resource creation with modifier
    • +
    • rbug: remove superfluous NULL check
    • +
    • etnaviv: keep references to pending resources
    • +
    • etnaviv: drm: remove unused etna_cmd_stream_finish
    • +
    • etnaviv: rework the stream flush to always go through the context flush
    • +
    • etnaviv: drm: add softpin interface
    • +
    • etnaviv: check for softpin availability on Halti5 devices
    • +
    • etnaviv: add linear texture support on GC7000
    • +
    • etnaviv: GC7000: flush TX descriptor and instruction cache
    • +

      +

      Marek Olšák (161):

      +
    • radeonsi/gfx10: fix the legacy pipeline by storing as_ngg in the shader cache
    • +
    • radeonsi: move some global shader cache flags to per-binary flags
    • +
    • radeonsi/gfx10: fix tessellation for the legacy pipeline
    • +
    • radeonsi/gfx10: fix the PRIMITIVES_GENERATED query if using legacy streamout
    • +
    • radeonsi/gfx10: create the GS copy shader if using legacy streamout
    • +
    • radeonsi/gfx10: add as_ngg variant for VS as ES to select Wave32/64
    • +
    • radeonsi/gfx10: fix InstanceID for legacy VS+GS
    • +
    • radeonsi/gfx10: don't initialize VGT_INSTANCE_STEP_RATE_0
    • +
    • radeonsi/gfx10: always use the legacy pipeline for streamout
    • +
    • radeonsi/gfx10: finish up Navi14, add PCI ID
    • +
    • radeonsi/gfx10: add AMD_DEBUG=nongg
    • +
    • winsys/amdgpu+radeon: process AMD_DEBUG in addition to R600_DEBUG
    • +
    • radeonsi: add PKT3_CONTEXT_REG_RMW
    • +
    • radeonsi/gfx10: remove incorrect ngg/pos_writes_edgeflag variables
    • +
    • radeonsi/gfx10: set PA_CL_VS_OUT_CNTL with CONTEXT_REG_RMW to fix edge flags
    • +
    • radeonsi: consolidate determining VGPR_COMP_CNT for API VS
    • +
    • radeonsi: align scratch and ring buffer allocations for faster memory access
    • +
    • radeonsi: unbind blend/DSA/rasterizer state correctly in delete functions
    • +
    • radeonsi: fix scratch buffer WAVESIZE setting leading to corruption
    • +
    • ac: enable LLVM atomic optimizations
    • +
    • ac: use fma on gfx10
    • +
    • radeonsi/gfx10: use fma for TGSI_OPCODE_FMA
    • +
    • radeonsi/gfx10: don't call gfx10_destroy_query with compute-only contexts
    • +
    • radeonsi: disable DCC when importing a texture from an incompatible driver
    • +
    • radeonsi: only support at most 1024 threads per block
    • +
    • radeonsi/gfx10: fix wave occupancy computations
    • +
    • r300,r600,radeonsi: read winsys_handle::stride,offset in drivers, not winsyses
    • +
    • r300,r600,radeonsi: set winsys_handle::stride,offset in drivers, not winsyses
    • +
    • ac/surface: add RADEON_SURF_NO_FMASK
    • +
    • radeonsi: handle NO_DCC early
    • +
    • radeonsi: move HTILE allocation outside of radeonsi
    • +
    • radeonsi: move texture storage allocation outside of radeonsi
    • +
    • radeonsi: remove redundant si_texture offset and size fields
    • +
    • ac: replace HAVE_LLVM with LLVM_VERSION_MAJOR for atomic-optimizations
    • +
    • prog_to_nir, tgsi_to_nir: make sure kill doesn't discard NaNs
    • +
    • radeonsi/gfx9: honor user stride for imported buffers
    • +
    • radeonsi: add Navi12 PCI ID
    • +
    • ac: move PBB MAX_ALLOC_COUNT into radeon_info
    • +
    • ac: move num_sdp_interfaces into radeon_info
    • +
    • ac: move ac_get_max_wave64_per_simd into radeon_info
    • +
    • ac: move ac_get_num_physical_sgprs into radeon_info
    • +
    • ac: move ac_get_num_physical_vgprs into radeon_info
    • +
    • gallium: extend resource_get_param to be as capable as resource_get_handle
    • +
    • radeonsi: implement pipe_screen::resource_get_param
    • +
    • radeonsi: include drm_fourcc.h to fix the build
    • +
    • amd: add more PCI IDs for Navi14
    • +
    • ac/addrlib: fix chip identification for Vega10, Arcturus, Raven2, Renoir
    • +
    • ac: stop using PCI IDs for chip identification
    • +
    • amd: remove all PCI IDs supported by amdgpu
    • +
    • nir: don't add bindless variables to num_textures and num_images
    • +
    • nir: define 8-byte size and alignment for bindless variables
    • +
    • tgsi_to_nir: fix masked out image loads
    • +
    • tgsi_to_nir: fix 2-component system values like tess_level_inner_default
    • +
    • ac/nir: port Z compare value clamping from radeonsi
    • +
    • ac/nir: force unnormalized coordinates for RECT
    • +
    • radeonsi: initialize displayable DCC using the retile blit to prevent hangs
    • +
    • gallium/vl: don't set PIPE_HANDLE_USAGE_EXPLICIT_FLUSH
    • +
    • radeonsi/gfx10: fix L2 cache rinse programming
    • +
    • ac: fix incorrect vram_size reported by the kernel
    • +
    • ac: add radeon_info::tcc_harvested
    • +
    • radeonsi/gfx10: fix corruption for chips with harvested TCCs
    • +
    • ac: fix num_good_cu_per_sh for harvested chips
    • +
    • ac: set the number of SDPs same as the number of TCCs
    • +
    • ac: reorder and print all radeon_info fields
    • +
    • tgsi_to_nir: handle PIPE_FORMAT_NONE in image opcodes
    • +
    • ac/surface: don't allocate FMASK if there is no graphics
    • +
    • ac: add ac_build_image_get_sample_count from radeonsi
    • +
    • ac/nir: fix GLSL imageSamples()
    • +
    • winsys/radeon: initialize SIMD properties in radeon_info
    • +
    • util: use simple_mtx_t for util_range
    • +
    • gallium: add PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE to skip util_range lock
    • +
    • st/mesa: use simple_mtx_t instead of mtx_t
    • +
    • radeonsi: use simple_mtx_t instead of mtx_t
    • +
    • amd: don't use AMD_FAMILY definitions from amdgpu_drm.h
    • +
    • gallium/util: remove enum numbering from util_format_layout
    • +
    • gallium/util: add planar format layouts and helpers
    • +
    • gallium/u_tests: test NV12 allocation and export
    • +
    • vl: use u_format in vl_video_buffer_formats
    • +
    • radeonsi: allocate planar multimedia formats in 1 buffer
    • +
    • radeonsi: remove si_vid_join_surfaces and use combined planar allocations
    • +
    • radeonsi: ignore metadata for non-zero planes
    • +
    • radeonsi: don't set BO metadata for non-zero planes
    • +
    • nir: add shader_info::last_msaa_image
    • +
    • tgsi/scan: add tgsi_shader_info::msaa_images_declared
    • +
    • radeonsi: fix GLSL imageSamples()
    • +
    • radeonsi: set the sample index for shader images correctly
    • +
    • radeonsi: add FMASK slots for shader images (for MSAA images)
    • +
    • radeonsi: clean up image_fetch_rsrc
    • +
    • radeonsi: apply FMASK to MSAA image loads
    • +
    • radeonsi: expand FMASK before MSAA image stores are used
    • +
    • radeonsi: enable MSAA shader images
    • +
    • nir: add a strip parameter to nir_serialize
    • +
    • nir: move gl_nir_opt_access from glsl directory
    • +
    • nir/drawpixels: handle load_color0, load_input, load_interpolated_input
    • +
    • nir/drawpixels: fix what appears to be a copy-paste bug in get_texcoord_const
    • +
    • tgsi_to_nir: add #ifdef header guards
    • +
    • nir: add nir_shader_compiler_options::lower_to_scalar
    • +
    • st/mesa: use nir_shader_compiler_options::lower_to_scalar
    • +
    • tgsi_to_nir: use nir_shader_compiler_options::lower_to_scalar
    • +
    • gallium: remove PIPE_SHADER_CAP_SCALAR_ISA
    • +
    • ac/nir: add back nir_op_fmod
    • +
    • clover: fix the nir_serialize build failure
    • +
    • st/mesa: always allocate pack/unpack buffers as staging
    • +
    • radeonsi/nir: simplify si_lower_nir signature
    • +
    • st/mesa: use *prog at the end of st_link_nir
    • +
    • st/mesa: deduplicate code for ATI fs in st_program_string_notify
    • +
    • st/mesa: simplify the signature of st_release_basic_variants
    • +
    • st/mesa: don't store stream output info to shader cache for tess ctrl shaders
    • +
    • st/mesa: remove st_compute_program in favor of st_common_program
    • +
    • st/mesa: deduplicate cases in st_deserialise_ir_program
    • +
    • st/mesa: sink TCS/TES/GS/CS translate code into st_translate_common_program
    • +
    • st/mesa: deduplicate st_common_program code in st_program_string_notify
    • +
    • st/mesa: clean up more after the removal of st_compute_program
    • +
    • st/mesa: move vertex program preparation code into st_prepare_vertex_program
    • +
    • st/mesa: unify transform feedback info translation code
    • +
    • st/mesa: finalize NIR after shader variant passes for TCS/TES/GS/CS
    • +
    • st/mesa: don't call translate_*_program functions for NIR
    • +
    • st/mesa: call prog_to_nir sooner for ARB_fp
    • +
    • st/mesa: reorder and document code in st_translate_vertex_program
    • +
    • st/mesa: call the reset callback if glGetGraphicsResetStatus returns a failure
    • +
    • radeonsi: call the reset callback if get_device_reset_status returns a failure
    • +
    • radeonsi: recreate aux_context after a GPU reset
    • +
    • gallium/u_blitter: remove an unused variable
    • +
    • st/mesa: silence a warning in st_nir_lower_tex_src_plane
    • +
    • st/mesa: call st_nir_opts for linked shaders only once
    • +
    • st/mesa: lower doubles for NIR after linking
    • +
    • st/mesa: rename st_xxx_program::tgsi to state
    • +
    • st/mesa: rename basic -> common for st_common_program
    • +
    • st/mesa: remove num_tgsi_tokens from st_xx_program
    • +
    • st/mesa: remove st_vp_variant_key in favor of st_common_variant_key
    • +
    • st/mesa: remove unused st_xxx_program::sha1
    • +
    • st/mesa: remove redundant function st_reference_compprog
    • +
    • st/mesa: merge st_fragment_program into st_common_program
    • +
    • st/mesa: don't call variables "tgsi" when they can reference NIR
    • +
    • nir: allow nir_lower_uniforms_to_ubo to be run repeatedly
    • +
    • st/mesa: replace pipe_shader_state with tgsi_token* in st_vp_variant
    • +
    • gallium/noop: implement get_disk_shader_cache and get_compiler_options
    • +
    • util/disk_cache: finish all queue jobs in destroy instead of killing them
    • +
    • util/u_queue: skip util_queue_finish if num_threads is 0
    • +
    • st/mesa: move some NIR lowering before shader caching
    • +
    • st/mesa: don't lower_global_vars_to_local for VS if there are no dead inputs
    • +
    • st/mesa: assign driver locations for VS inputs for NIR before caching
    • +
    • st/mesa: update VS shader_info for NIR after lowering passes
    • +
    • gallium: add pipe_screen::finalize_nir
    • +
    • tgsi_to_nir: use pipe_screen::finalize_nir
    • +
    • st/mesa: use pipe_screen::finalize_nir
    • +
    • radeonsi/nir: implement pipe_screen::finalize_nir
    • +
    • glsl/serialize: restructure remap table code
    • +
    • glsl/serialize: optimize for equal offsets in uniform remap tables
    • +
    • include: add the definition of EGL_EXT_image_flush_external
    • +
    • dri_interface: add interface for EGL_EXT_image_flush_external
    • +
    • st/dri: assume external consumers of back buffers can write to the buffers
    • +
    • st/dri: add support for EGL_EXT_image_flush_external
    • +
    • egl: handle EGL_IMAGE_EXTERNAL_FLUSH_EXT
    • +
    • egl: implement new functions from EGL_EXT_image_flush_external
    • +
    • docs: document new feature EGL_EXT_image_flush_external
    • +
    • radeonsi: don't print diagnostic LLVM remarks and notes
    • +
    • radeonsi: initialize shader compilers in threads on demand
    • +
    • ac: get tcc_harvested from the kernel
    • +
    • winsys/amdgpu: use the new GPU reset query
    • +
    • st/mesa: fix Sanctuary and Tropics by disabling ARB_gpu_shader5 for them
    • +

      +

      Marek Vasut (4):

      +
    • etnaviv: Make contexts track resources
    • +
    • etnaviv: Rework resource status tracking
    • +
    • etnaviv: Command buffer realloc
    • +
    • etnaviv: Rework locking
    • +

      +

      Marijn Suijten (2):

      +
    • freedreno/a5xx: enable a510
    • +
    • freedreno/ir3: Add missing ir3_nir_lower_tex_prefetch.c to Android.mk
    • +

      +

      Matt Turner (6):

      +
    • clover: Remove unused code
    • +
    • intel/compiler: Remove unreachable() from brw_reg_type.c
    • +
    • intel/compiler: Restructure instruction compaction in preparation for Gen12
    • +
    • intel/compiler: Inline get_src_index()
    • +
    • intel/compiler: Make separate src0/src1 index tables
    • +
    • intel/compiler: Add instruction compaction support on Gen12
    • +

      +

      Mauro Rossi (8):

      +
    • android: mesa: revert "Enable asm unconditionally"
    • +
    • android: anv: libmesa_vulkan_common: add libmesa_util static dependency
    • +
    • android: aco: fix undefined template 'std::__1::array' build errors
    • +
    • android: compiler/nir: build nir_divergence_analysis.c
    • +
    • android: aco: add support for libmesa_aco
    • +
    • android: amd/common: export amd/llvm headers
    • +
    • android: aco: fix Lower to CSSA
    • +
    • android: radeonsi: fix build after vl refactoring (v2)
    • +

      +

      Maya Rashish (3):

      +
    • intel/compiler: avoid truncating int64_t to int
    • +
    • meson: Test for -Wl,--build-id=sha1
    • +
    • llvmpipe: avoid left-shifting a negative number.
    • +

      +

      Michael Schellenberger Costa (1):

      +
    • aco: Cleanup insert_before_logical_end
    • +

      +

      Michel Dänzer (48):

      +
    • gitlab-ci: Move up meson-main job definition
    • +
    • gitlab-ci: Use new needs: keyword
    • +
    • gitlab-ci: Explicitly install linux-libc-dev for foreign architectures
    • +
    • gitlab-ci: Keep g++ from stretch when installing foreign toolchains
    • +
    • gitlab-ci: Add needs stanza to arm64_a306_gles2 job definition
    • +
    • gitlab-ci: Use multiple inheritance instead of YAML references
    • +
    • gitlab-ci: Simplify some job definitions by extending more similar jobs
    • +
    • gitlab-ci: Move dependencies/needs for meson-main job to .deqp-test
    • +
    • gitlab-ci: Move up meson-arm64 job definition
    • +
    • gallivm: Limit DEBUG workaround to LLVM < 7
    • +
    • swr: Limit DEBUG workaround to LLVM < 7
    • +
    • ac: Remove DEBUG workaround
    • +
    • gitlab-ci: Reference full ci-templates commit hash
    • +
    • gitlab-ci: Pass --no-remove to apt-get where possible
    • +
    • gitlab-ci: Create separate docker images for Debian stretch & buster
    • +
    • gitlab-ci: Use newer packages from backports by default
    • +
    • gitlab-ci: Use crossbuild-essential-* packages
    • +
    • gitlab-ci: Move scons build/test commands to a separate shell script
    • +
    • gitlab-ci: Test scons with all LLVM versions
    • +
    • gitlab-ci: Merge scons-nollvm and scons-llvm jobs
    • +
    • radeonsi: fix VAAPI segfault due to various bugs
    • +
    • loader: Avoid use-after-free / use of uninitialized local variables
    • +
    • gitlab-ci: Declare needs: for stretch docker image
    • +
    • gitlab-ci: Add needs: for x86 buster docker image
    • +
    • gitlab-ci: Add test-container:arm64 to needs: for arm64 test jobs
    • +
    • gitlab-ci: Set ccache path for cross compilers in meson cross file
    • +
    • gitlab-ci: Use per-job ccache
    • +
    • dri3: Pass __DRI2_THROTTLE_COPYSUBBUFFER from loader_dri3_copy_drawable
    • +
    • loader: Simplify handling of the radeonsi driver
    • +
    • gitlab-ci/lava: Add needs: for container image to test jobs
    • +
    • gitlab-ci: Remove redundant .meson-cross template script
    • +
    • gitlab-ci: Add .use-debian-10 template
    • +
    • gitlab-ci: Disable meson-mingw32-x86_64 job again for now
    • +
    • gitlab-ci: Sort ARM docker image packages in alphabetical order
    • +
    • gitlab-ci: Bring ARM docker image install script in line with x86_64
    • +
    • gitlab-ci: Explicitly list debian-10 in needs: for .deqp-test template
    • +
    • gitlab-ci: Use native aarch64 runner for ARM build jobs
    • +
    • gitlab-ci: Update the meson cross file for LLVM_VERSION as well
    • +
    • gitlab-ci: Enable llvmpipe in ARM build jobs
    • +
    • intel/compiler: Don't left-shift by >= the number of bits of the type
    • +
    • intel/compiler: Cast to target type before shifting left
    • +
    • intel/fs: Check for NULL key in fs_visitor constructor
    • +
    • gallium/util: Cast to target type before shifting left
    • +
    • util: Use uint64_t for shifting left in sign_extend and strunc
    • +
    • util/tests: Avoid int64_t overflow issues in fast_idiv_by_const test
    • +
    • gitlab-ci: Enable UBSan for the meson-vulkan job
    • +
    • gitlab-ci: Only run the pipeline if any files affecting it have changed
    • +
    • gitlab-ci: Disable meson-windows job for the time being
    • +

      +

      Michel Zou (1):

      +
    • scons: add py3 support
    • +

      +

      Nanley Chery (47):

      +
    • anv/blorp: Use BLORP_BATCH_NO_UPDATE_CLEAR_COLOR
    • +
    • anv: Properly allocate aux-tracking space for CCS_E
    • +
    • anv/formats: Disable I915_FORMAT_MOD_Y_TILED_CCS on TGL+
    • +
    • iris: Drop support for I915_FORMAT_MOD_Y_TILED_CCS on TGL+
    • +
    • isl: Disable CCS_D on Gen12+
    • +
    • anv/image: Disable CCS_D on Gen12+
    • +
    • anv/cmd_buffer: Don't assume CCS_E includes CCS_D
    • +
    • iris: Don't assume CCS_E includes CCS_D
    • +
    • isl: Round up some pitches to 512B for Gen12's CCS
    • +
    • intel/blorp: Halve the Gen12 fast-clear/resolve rectangle
    • +
    • intel/blorp: Don't assert aux slices match main slices
    • +
    • anv/private: Modify aux slice helpers for Gen12 CCS
    • +
    • i965/miptree: Avoid -Wswitch for the Gen12 aux modes
    • +
    • isl/drm: Map HiZ and CCS tilings to Y
    • +
    • iris: Allow for non-Y-tiled aux allocation
    • +
    • isl: Add and use isl_tiling_flag_to_enum()
    • +
    • isl: Redefine the CCS layout for Gen12
    • +
    • intel: Enable CCS_E for some formats on Gen12
    • +
    • intel/blorp: Disable depth testing for slow depth clears
    • +
    • iris: Clear ::has_hiz when disabling aux
    • +
    • intel: Use RENDER_SURFACE_STATE::DepthStencilResource
    • +
    • intel: Use 3DSTATE_DEPTH_BUFFER::ControlSurfaceEnable
    • +
    • intel: Enable CCS_E for R24_UNORM_X8_TYPELESS on TGL+
    • +
    • isl: Reduce assertions during aux surf creation
    • +
    • intel: Support HIZ_CCS in isl_surf_get_ccs_surf
    • +
    • intel/blorp: Assert against HiZ in surface states
    • +
    • intel/blorp: Treat HIZ_CCS like HiZ
    • +
    • iris: Don't guess the aux_usage
    • +
    • iris: Create an unusable secondary aux surface
    • +
    • iris: Define initial HIZ_CCS state and transitions
    • +
    • iris: Enable HIZ_CCS in depth buffer instructions
    • +
    • isl: Add isl_surf_supports_hiz_ccs_wt()
    • +
    • intel: Refactor blorp_can_hiz_clear_depth()
    • +
    • intel/blorp: Satisfy HIZ_CCS fast-clear alignments
    • +
    • iris: Start using blorp_can_hiz_clear_depth()
    • +
    • intel: Fix and use HIZ_CCS write through mode
    • +
    • intel/blorp: Satisfy clear color rules for HIZ_CCS
    • +
    • iris: Enable HIZ_CCS sampling
    • +
    • iris: Don't leak the resource for unsupported modifier
    • +
    • iris: Disallow incomplete resource creation
    • +
    • iris: Drop iris_resource::aux::extra_aux::bo
    • +
    • iris: Bail resource creation upon aux creation error
    • +
    • iris: Determine aux offsets within configure_aux
    • +
    • iris: Allocate main and aux surfaces together
    • +
    • gallium/dri2: Fix creation of multi-planar modifier images
    • +
    • gallium: Store the image format in winsys_handle
    • +
    • iris: Fix import of multi-planar surfaces with modifiers
    • +

      +

      Nataraj Deshpande (1):

      +
    • egl/android: Enable HAL_PIXEL_FORMAT_RGBA_FP16 format
    • +

      +

      Neil Armstrong (1):

      +
    • Revert "ci: Disable lima until its farm can get fixed."
    • +

      +

      Neil Roberts (6):

      +
    • glsl: Store the precision for a function return type
    • +
    • nir/builder: Move nir_atan and nir_atan2 from SPIR-V translator
    • +
    • nir/builtin: Add #include u_math.h to the header
    • +
    • nir/builtin: Add extern "C" guards to nir_builtin_builder.h
    • +
    • glsl: Add opcodes for atan and atan2
    • +
    • glsl/builtin: Add alternate versions of atan using new ops
    • +

      +

      OBATA Akio (1):

      +
    • util: fix to detect NetBSD properly
    • +

      +

      Paulo Zanoni (8):

      +
    • intel/fs: grab fail_msg from v32 instead of v16 when v32->run_cs fails
    • +
    • intel/fs: make scan/reduce work with SIMD32 when it fits 2 registers
    • +
    • intel/fs: roll the loop with the <0,1,0> additions in emit_scan()
    • +
    • intel/fs: the maximum supported stride width is 16
    • +
    • intel/fs: fix SHADER_OPCODE_CLUSTER_BROADCAST for SIMD32
    • +
    • intel/fs: don't forget the stride at generate_shuffle
    • +
    • intel/compiler: remove the operand restriction for src1 on GLK
    • +
    • intel/compiler: fix nir_op_{i,u}*32 on ICL
    • +

      +

      Pierre Moreau (5):

      +
    • meson: Check for SPIRV-Tools and llvm-spirv
    • +
    • clover/spirv: Add functions for validating SPIR-V binaries
    • +
    • clover/spirv: Add functions for parsing arguments, linking programs, etc.
    • +
    • clover/llvm: Add options for dumping SPIR-V binaries
    • +
    • clover/llvm: Add functions for compiling from source to SPIR-V
    • +

      +

      Pierre-Eric Pelloux Prayer (1):

      +
    • mesa: implement glTextureStorageNDEXT functions
    • +

      +

      Pierre-Eric Pelloux-Prayer (23):

      +
    • glsl: replace 'x + (-x)' with constant 0
    • +
    • mesa: fix invalid target error handling for teximage
    • +
    • mesa: add EXT_dsa glNamedRenderbufferStorageEXT and glGetNamedRenderbufferParameterivEXT
    • +
    • mesa: add EXT_dsa glClientAttribDefaultEXT / glPushClientAttribDefaultEXT
    • +
    • mesa: add EXT_dsa NamedProgram functions
    • +
    • mesa: add EXT_dsa glProgramUniform*EXT functions
    • +
    • mesa: add EXT_dsa + EXT_texture_buffer_object functions
    • +
    • mesa: add EXT_dsa + EXT_texture_integer functions
    • +
    • mesa: add EXT_dsa + EXT_gpu_shader4 functions
    • +
    • mesa: add EXT_dsa + EXT_gpu_program_parameters functions
    • +
    • mesa: add EXT_dsa glGetFloati_vEXT/glGetDoublei_vEXT
    • +
    • mesa: refactor GenerateTextureMipmap handling
    • +
    • mesa: add EXT_dsa Generate*MipmapEXT functions
    • +
    • mesa: add EXT_dsa NamedRenderbufferStorageMultisampleEXT function
    • +
    • mesa: add EXT_dsa NamedCopyBufferSubDataEXT function
    • +
    • radeonsi: align sdma byte count to dw
    • +
    • radeonsi: sdma misc fixes
    • +
    • radeonsi: disable sdma for gfx10
    • +
    • radeonsi: tell the shader disk cache what IR is used
    • +
    • mesa: enable msaa in clear_with_quad if needed
    • +
    • radeonsi: fix shader disk cache key
    • +
    • radeonsi: fix multi plane buffers creation
    • +
    • radeonsi: use gfx9.surf_offset to compute texture offset
    • +

      +

      Plamena Manolova (8):

      +
    • genxml: Add 3DSTATE_DEPTH_BOUNDS instruction.
    • +
    • iris: Add support for depth bounds testing.
    • +
    • anv: Add support for depth bounds testing.
    • +
    • genxml: Change 3DSTATE_DEPTH_BOUNDS bias.
    • +
    • anv: Set depthBounds to true in anv_GetPhysicalDeviceFeatures.
    • +
    • genxml: Add 3DSTATE_SO_BUFFER_INDEX_* instructions
    • +
    • iris: Implement new way for setting streamout buffers.
    • +
    • anv: Implement new way for setting streamout buffers.
    • +

      +

      Prodea Alexandru-Liviu (4):

      +
    • scons/windows: Fix build with LLVM>=8
    • +
    • scons/MSYS2-MinGW-W64: Fix build options defaults Signed-off-by: Prodea Alexandru-Liviu <liviuprodea@yahoo.com> Reviewed-by: Jose Fonseca <jfonseca@vmware.com> Cc: <mesa-stable@lists.freedesktop.org>
    • +
    • Appveyor/Meson: Add build test of osmesa gallium Signed-off-by: Prodea Alexandru-Liviu <liviuprodea@yahoo.com> Acked-by: Eric Engestrom <eric@engestrom.ch> Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
    • +
    • Meson: Remove lib prefix from graw and osmesa when building with Mingw. Also remove version sufix from osmesa swrast on Windows.
    • +

      +

      Qiang Yu (4):

      +
    • lima: move format handling to unified place
    • +
    • lima: implement EGL_KHR_partial_update
    • +
    • lima: don't use damage system when full damage
    • +
    • lima: move damage bound build to resource
    • +

      +

      Rafael Antognolli (13):

      +
    • anv: Only re-emit non-dynamic state that has changed.
    • +
    • intel/tools: Fix aubinator usage of rb_tree.
    • +
    • anv/block_pool: Align anv_block_pool state to 64 bits.
    • +
    • intel/tools: Factor out GGTT allocation.
    • +
    • intel/tools: Use common code for GGTT address allocation.
    • +
    • intel/tools: Add basic aub_context code and helpers.
    • +
    • intel/tools: Support multiple contexts in intel_dump_gpu.
    • +
    • intel/blorp/gen12: Set FWCC when storing the clear color.
    • +
    • anv: Align fast clear color state buffer to a page.
    • +
    • iris: Align fast clear color state buffer to a page.
    • +
    • iris: Add Tile Cache Flush for Unified Cache.
    • +
    • blorp: Add Tile Cache Flush for Unified Cache.
    • +
    • anv: Add Tile Cache Flush for Unified Cache.
    • +

      +

      Rhys Perry (84):

      +
    • nir/lower_io_to_vector: allow FS outputs to be vectorized
    • +
    • nir/lower_io_to_vector: add flat mode
    • +
    • util: include u_endian.h in u_math.h
    • +
    • nir/lower_io_to_vector: don't merge compact varyings
    • +
    • radv: keep GS threads with excessive emissions which could write to memory
    • +
    • radv: always emit a position export in gs copy shaders
    • +
    • radv: never kill a NGG GS shader
    • +
    • nir/opt_remove_phis: handle phis with no sources
    • +
    • aco: run nir_lower_int64() before nir_lower_idiv()
    • +
    • aco: implement 64-bit ineg
    • +
    • aco: fix GFX9 opcode for v_xad_u32
    • +
    • aco: fix v_subrev_co_u32_e64 opcode
    • +
    • aco: fix opcode for s_mul_hi_i32
    • +
    • aco: check for duplicate opcode numbers
    • +
    • radv/aco: actually disable ACO when unsupported
    • +
    • aco,radv/aco: get dissassembly for release builds if requested
    • +
    • aco: store printed backend IR in binary
    • +
    • radv/aco: return a correct name and description for the backend IR
    • +
    • aco,radv: rename record_llvm_ir/llvm_ir_string to record_ir/ir_string
    • +
    • aco: don't CSE v_readlane_b32/v_readfirstlane_b32
    • +
    • aco: CSE readlane/readfirstlane/permute/reduce with the same exec mask
    • +
    • aco: set loop_info::has_discard for demotes
    • +
    • aco: don't remove the loop exec mask in transition_to_Exact()
    • +
    • radv/aco,aco: set lower_fmod
    • +
    • nir/print: always use the right FILE *
    • +
    • aco: fix load_constant with multiple arrays
    • +
    • nir/constant_folding: add back and use constant_fold_state
    • +
    • nir/constant_folding: fold load_constant intrinsics
    • +
    • aco: move s_andn2_b64 instructions out of the p_discard_if
    • +
    • aco: enable nir_opt_sink
    • +
    • aco: Allow literals on VOP3 instructions.
    • +
    • aco: Assemble opsel in VOP3 instructions.
    • +
    • aco: workaround GFX10 0x3f branch bug
    • +
    • aco: pad code with s_code_end on GFX10
    • +
    • aco: Initial work to avoid GFX10 hazards.
    • +
    • aco: Use the VOP3-only add/sub GFX10 instructions if needed.
    • +
    • aco: Have s_waitcnt_vscnt write to NULL.
    • +
    • radv/aco: disable NGG when ACO is used
    • +
    • aco/gfx10: fix inline uniform blocks
    • +
    • aco/gfx10: disable GFX9 1D texture workarounds
    • +
    • aco: rework scratch resource code
    • +
    • aco: update print_ir
    • +
    • nir/lower_non_uniform: lower image/texture instructions taking derefs
    • +
    • nir/lower_input_attachments: pass on non-uniform access flag
    • +
    • aco: don't apply sgprs/constants to read/write lane instructions
    • +
    • aco: use can_accept_constant in valu_can_accept_literal
    • +
    • aco: readfirstlane vgpr pointers in convert_pointer_to_64_bit()
    • +
    • aco: implement divergent vulkan_resource_index
    • +
    • aco: don't use p_as_uniform for vgpr sampler/image indices
    • +
    • aco: fix scheduling with s_memtime/s_memrealtime
    • +
    • aco: don't CSE s_memtime
    • +
    • aco: emit_split_vector() s_memtime results
    • +
    • nir/lower_idiv: add new llvm-based path
    • +
    • aco: use nir_lower_idiv_precise
    • +
    • aco: run opt_algebraic in a loop
    • +
    • aco: small stage corrections
    • +
    • aco: fix 64-bit p_extract_vector on 32-bit p_create_vector
    • +
    • aco: create load_lds/store_lds helpers
    • +
    • aco: fix sparse store_lds()
    • +
    • aco: properly combine additions into ds_write2_b64/ds_read2_b64
    • +
    • aco: use ds_read2_b64/ds_write2_b64
    • +
    • aco: add a few missing checks in value numbering
    • +
    • aco: keep can_reorder/barrier when combining addition into SMEM
    • +
    • aco: add missing bld.scc()
    • +
    • Revert "aco: only emit waitcnt on loop continues if we there was some load or export"
    • +
    • radv: round vgprs/sgprs before calculating max_waves
    • +
    • aco: increase accuracy of SGPR limits
    • +
    • aco: take LDS into account when calculating num_waves
    • +
    • aco: Fix reductions on GFX10.
    • +
    • aco: Remove dead code in reduction lowering.
    • +
    • aco: try to group together VMEM loads of the same resource
    • +
    • aco: a couple loop handling fixes for GFX10 hazard pass
    • +
    • aco: rename README to README.md
    • +
    • aco: fix new_demand calculation for first instructions
    • +
    • aco: fix shuffle with uniform operands
    • +
    • aco: fix read_invocation with VGPR lane index
    • +
    • aco: don't propagate vgprs into v_readlane/v_writelane
    • +
    • aco: don't combine literals into v_cndmask_b32/v_subb/v_addc
    • +
    • aco: fix 64-bit fsign with 0
    • +
    • aco: propagate p_wqm on an image_sample's coordinate p_create_vector
    • +
    • aco: fix i2i64
    • +
    • aco: add v_nop inbetween exec write and VMEM/DS/FLAT
    • +
    • radv: set writes_memory for global memory stores/atomics
    • +
    • nir/lower_io_to_vector: don't create arrays when not needed
    • +

      +

      Rob Clark (60):

      +
    • freedreno/ir3: convert block->predecessors to set
    • +
    • freedreno/ir3: maintain predecessors/successors
    • +
    • freedreno/ir3: do better job of marking convergence points
    • +
    • nir: remove unused constant_fold_state
    • +
    • freedreno/drm: fix 64b iova shifts
    • +
    • freedreno/ir3: use uniform base
    • +
    • freedreno/ir3: cleanup "partially const" ubo srcs
    • +
    • freedreno/ir3: fix addr/pred spilling
    • +
    • freedreno/ir3: fix mad copy propagation special case
    • +
    • freedreno/ir3: assert that only single address
    • +
    • freedreno/ir3: fix cp cmps.s opt
    • +
    • freedreno/ir3: allow copy propagation for relative
    • +
    • util: android logging support
    • +
    • freedreno/a6xx: don't tile things that are too small
    • +
    • freedreno/a6xx: fix 3d tex layout
    • +
    • freedreno: fix compiler warning
    • +
    • freedreno/a6xx: pre-calculate userconst stateobj size
    • +
    • gitlab-ci/a630: skip dEQP-GLES3.functional.fbo.msaa.2_samples.stencil_index8
    • +
    • freedreno/a6xx: un-open-code PC_PRIMITIVE_CNTL_1.PSIZE
    • +
    • freedreno/a6xx: fix binning pass vs. xfb
    • +
    • freedreno/a6xx: do streamout only in binning pass
    • +
    • freedreno/ir3: drop unused param
    • +
    • freedreno/ir3: handle multi component alu src when propagating shifts
    • +
    • freedreno: update registers
    • +
    • freedreno/ir3: remove unused ir3_instruction::inout
    • +
    • freedreno/ir3: track sysval slot for inputs
    • +
    • freedreno/ir3: don't DCE ij_pix if used for pre-fs-texture-fetch
    • +
    • freedreno/ir3: add meta instruction for pre-fs texture fetch
    • +
    • freedreno/ir3: fixup register footprint to account for prefetch
    • +
    • freedreno/ir3: add dummy bary.f(ei) for pre-fs-fetch
    • +
    • freedreno/ir3: add pre-dispatch tex fetch to disasm
    • +
    • freedreno/ir3: force i/j pixel to r0.x
    • +
    • freedreno/a6xx: add support for pre-fs texture fetch
    • +
    • turnip: add support for pre-fs texture fetch
    • +
    • freedreno/ir3: enable pre-fs texture fetch for a6xx
    • +
    • nir/search: fix the PoT helpers
    • +
    • freedreno/ir3: rename mul.s/mul.u
    • +
    • nir: Add a new ALU nir_op_imul24
    • +
    • nir: add amul instruction
    • +
    • nir: add address calc related opt rules
    • +
    • nir: add nir_lower_amul pass
    • +
    • freedreno/ir3: add rule to generate imad24
    • +
    • freedreno/ir3: optimize immed 2nd src to mad
    • +
    • freedreno/ir3: add imul24 opcode
    • +
    • freedreno/ir3: handle imad24_ir3 case in UBO lowering
    • +
    • freedreno/ir3: handle scalarized varying inputs
    • +
    • freedreno/ir3: fixup register footprint fixup
    • +
    • freedreno/ir3: debug cleanup
    • +
    • freedreno/ir3: make high regs easier to see in IR dumps
    • +
    • freedreno/ir3: propagate dest flags for collect/fanin
    • +
    • freedreno/ir3: treat high vs low reg as conversion
    • +
    • freedreno/ir3: allow copy-propagate out of fanout
    • +
    • freedreno/ir3: remove restrictions on const + (abs)/(neg)
    • +
    • freedreno/ir3: handle the progress case
    • +
    • freedreno/a6xx: remove some left over dead code
    • +
    • freedreno/a6xx: cleanup magic registers
    • +
    • freedreno/a6xx: add a618 support
    • +
    • freedreno/ir3: fix gpu hang with pre-fs-tex-fetch
    • +
    • Revert "freedreno/ir3: enable pre-fs texture fetch for a6xx"
    • +
    • nir/lower_clip: Fix incorrect driver loc for clipdist outputs
    • +

      +

      Robin Murphy (1):

      +
    • egl/gbm: Fix config validation
    • +

      +

      Rohan Garg (3):

      +
    • panfrost: Remove unused argument from panfrost_drm_submit_vs_fs_job()
    • +
    • panfrost: Jobs must be per context, not per screen
    • +
    • panfrost: protect access to shared bo cache and transient pool
    • +

      +

      Roland Scheidegger (4):

      +
    • gallivm: use fallback code for mul_hi with llvm >= 7.0
    • +
    • llvmpipe: fix CALLOC vs. free mismatches
    • +
    • llvmpipe: increase max texture size to 2GB
    • +
    • gallivm: Fix saturated signed psub/padd intrinsics on llvm 8
    • +

      +

      Roman Stratiienko (1):

      +
    • lima: Return fence unconditionally
    • +

      +

      Sagar Ghuge (26):

      +
    • intel/eu/gen12: Implement immediate 64 bit constant encoding.
    • +
    • nir: Add alpha_to_coverage lowering pass
    • +
    • intel/compiler: Remove emit_alpha_to_coverage workaround from backend
    • +
    • intel: Add missing entry for brw_nir_lower_alpha_to_coverage in Makefile
    • +
    • intel/compiler: Add Immediate support for 3 source instruction
    • +
    • intel/compiler: Set bits according to source file
    • +
    • intel/compiler: Don't move immediate in register
    • +
    • intel/compiler: Refactor disassembly of sources in 3src instruction
    • +
    • intel/isl: Don't reconfigure aux surfaces for MCS
    • +
    • iris: Initialize CCS to fast clear while using with MCS
    • +
    • iris: Define MCS_CCS state transitions and usages
    • +
    • intel/blorp: Use isl_aux_usage_has_mcs instead of comparing
    • +
    • iris: Get correct resource aux usage for copy
    • +
    • intel/isl: Support lossless compression with multisamples
    • +
    • iris: Create resource with aux_usage MCS_CCS
    • +
    • genxml/gen12: Add Stencil Buffer Resolve Enable bit
    • +
    • intel/blorp: Assign correct view while clearing depth stencil
    • +
    • intel/blorp: Add helper function for stencil buffer resolve
    • +
    • intel: Track stencil aux usage on Gen12+
    • +
    • intel/blorp: Set stencil resolve enable bit
    • +
    • iris: Resolve stencil buffer lossless compression with WM_HZ_OP packet
    • +
    • iris: Prepare stencil resource before clear depth stencil
    • +
    • iris: Prepare depth resource if clear_depth enable
    • +
    • iris: Prepare resources before stencil blit operation
    • +
    • iris: Resolve stencil resource prior to copy or used by CPU
    • +
    • intel/isl: Allow stencil buffer to support compression on Gen12+
    • +

      +

      Samuel Iglesias Gonsálvez (26):

      +
    • spirv: check support for SPV_KHR_float_controls capabilities
    • +
    • spirv/nir: keep track of SPV_KHR_float_controls execution modes
    • +
    • nir: add auxiliary functions to detect if a mode is enabled
    • +
    • nir: add support for flushing to zero denorm constants
    • +
    • util: add softfloat functions to operate with doubles and floats
    • +
    • util: add float to float16 conversions with RTZ and RTNE
    • +
    • util: add fp64 -> fp32 conversion support for RTNE and RTZ rounding modes
    • +
    • nir: add support for round to zero rounding mode to nir_op_f2f32
    • +
    • nir: mind rounding mode on fadd, fsub, fmul and fma opcodes
    • +
    • nir/opcodes: make sure f2f16_rtz and f2f16_rtne behavior is not overriden by the float controls execution mode
    • +
    • nir/constant_expressions: mind rounding mode converting from float to float16 destinations
    • +
    • nir/algebraic: disable inexact optimizations depending on float controls execution mode
    • +
    • nir: fix denorms in unpack_half_1x16()
    • +
    • nir: fix denorm flush-to-zero in sqrt's lowering at nir_lower_double_ops
    • +
    • nir: fix fmin/fmax support for doubles
    • +
    • intel/nir: do not apply the fsin and fcos trig workarounds for consts
    • +
    • i965/fs/nir: add nir_op_unpack_half_2x16_split_*_flush_to_zero
    • +
    • i965/fs/generator: refactor rounding mode helper in preparation for float controls
    • +
    • i965/fs/generator: add new opcode to set float controls modes in control register
    • +
    • i965/fs: add emit_shader_float_controls_execution_mode() and aux functions
    • +
    • i965/fs: set rounding mode when emitting fadd, fmul and ffma instructions
    • +
    • i965/fs: set rounding mode when emitting nir_op_f2f32 or nir_op_f2f16
    • +
    • i965/fs: add support for shader float control to remove_extra_rounding_modes()
    • +
    • anv: enable VK_KHR_shader_float_controls and SPV_KHR_float_controls
    • +
    • docs/relnotes: add support for VK_KHR_shader_float_controls on Intel
    • +
    • nir/algebraic: refactor inexact opcode restrictions
    • +

      +

      Samuel Pitoiset (136):

      +
    • radv/gfx10: tidy up gfx10_format_table.py
    • +
    • radv/gfx10: hardcode some depth+stencil formats in the format table
    • +
    • radv: allow to enable VK_AMD_shader_ballot only on GFX8+
    • +
    • radv: add a new debug option called RADV_DEBUG=noshaderballot
    • +
    • radv: force enable VK_AMD_shader_ballot for Wolfenstein Youngblood
    • +
    • radv: implement VK_AMD_shader_core_properties2
    • +
    • ac: fix exclusive scans on GFX8-GFX9
    • +
    • ac,radv,radeonsi: remove LLVM 7 support
    • +
    • gitlab-ci: bump LLVM to 8 for meson-vulkan and meson-clover
    • +
    • radv/gfx10: don't initialize VGT_INSTANCE_STEP_RATE_0
    • +
    • radv/gfx10: do not use NGG with NAVI14
    • +
    • radv: fix getting the index type size for uint8_t
    • +
    • radv: add radv_process_depth_image_layer() helper
    • +
    • radv: add mipmaps support for decompress/resummarize
    • +
    • radv: decompress mipmapped depth/stencil images during transitions
    • +
    • radv: allocate metadata space for mipmapped depth/stencil images
    • +
    • radv: add mipmap support for the TC-compat zrange bug
    • +
    • radv: add mipmap support for the clear depth/stencil values
    • +
    • ac: drop llvm8 from some load/store helpers
    • +
    • ac: add has_clear_state to ac_gpu_info
    • +
    • ac: add has_distributed_tess to ac_gpu_info
    • +
    • ac: add has_dcc_constant_encode to ac_gpu_info
    • +
    • ac: add has_rbplus to ac_gpu_info
    • +
    • ac: add has_load_ctx_reg_pkt to ac_gpu_info
    • +
    • ac: add has_out_of_order_rast to ac_gpu_info
    • +
    • ac: add cpdma_prefetch_writes_memory to ac_gpu_info
    • +
    • ac: add has_gfx9_scissor_bug to ac_gpu_info
    • +
    • ac: add has_tc_compat_zrange_bug to ac_gpu_info
    • +
    • ac: add rbplus_allowed to ac_gpu_info
    • +
    • ac: add has_msaa_sample_loc_bug to ac_gpu_info
    • +
    • ac: add has_ls_vgpr_init_bug to ac_gpu_info
    • +
    • radv: make use of has_ls_vgpr_init_bug
    • +
    • radv/gfx10: compute the LDS size for exporting PrimID for VS
    • +
    • ac: import linear/perspective PS input parameters from radv/radeonsi
    • +
    • ac: drop now useless lookup_interp_param from ABI
    • +
    • radv: gather info about PS inputs in the shader info pass
    • +
    • radv: move lowering PS inputs/outputs at the right place
    • +
    • radv: remove some unused fields from radv_shader_context
    • +
    • radv: remove unused shader_info parameter in ac_compile_llvm_module()
    • +
    • radv: remove useless ac_llvm_util.h include from the WSI code
    • +
    • radv: remove radv_init_llvm_target() helper
    • +
    • radv: replace ac_nir_build_if by ac_build_ifcc
    • +
    • radv: move setting can_discard to ac_fill_shader_info()
    • +
    • radv: keep a pointer to a NIR shader into radv_shader_context
    • +
    • nir: do not assume that the result of fexp2(a) is always an integral
    • +
    • radv/gfx10: always set ballot_mask_bits to 64
    • +
    • radv: merge radv_shader_variant_info into radv_shader_info
    • +
    • radv: move ac_fill_shader_info() to radv_nir_shader_info_pass()
    • +
    • radv: gather clip/cull distances in the shader info pass
    • +
    • radv: gather pointsize in the shader info pass
    • +
    • radv: gather viewport in the shader info pass
    • +
    • radv: gather layer in the shader info pass
    • +
    • radv: gather primitive ID in the shader info pass
    • +
    • radv: calculate the GSVS vertex size in the shader info pass
    • +
    • radv: calculate esgs_itemsize in the shader info pass
    • +
    • radv/gfx10: account for the subpass view for the NGG GS storage
    • +
    • radv/gfx10: make use the output usage mask when exporting NGG GS params
    • +
    • radv/gfx10: determine the number of vertices per primitive for TES
    • +
    • radv: do not pass all compiler options to the shader info pass
    • +
    • radv: fill shader info for all stages in the pipeline
    • +
    • radv: store GFX9 GS state as part of the shader info
    • +
    • radv: store GFX10 NGG state as part of the shader info
    • +
    • radv: store the ESGS ring size as part of gfx10_ngg_info
    • +
    • radv: calculate GFX9 GS and GFX10 NGG states before compiling shader variants
    • +
    • radv/gfx10: declare a LDS symbol for the NGG emit space
    • +
    • radv: fix allocating number of user sgprs if streamout is used
    • +
    • radv/winsys: add support for GS and OA domains
    • +
    • radv/gfx10: add an option to switch from legacy to NGG streamout
    • +
    • radv/gfx10: implement NGG streamout begin/end functions
    • +
    • radv/gfx10: allocate GDS/OA buffer objects for NGG streamout
    • +
    • radv/gfx10: adjust the GS NGG scratch size for streamout
    • +
    • radv/gfx10: unconditionally declare scratch space for NGG streamout without GS
    • +
    • radv/gfx10: adjust the LDS size for VS/TES NGG streamout
    • +
    • radv/gfx10: fix unnecessary LDS overallocation for NGG GS
    • +
    • radv/gfx10: compute the correct buffer size for NGG streamout
    • +
    • radv/gfx10: gather GS output for VS as NGG
    • +
    • radv/gfx10: enable NGG_WAVE_ID_EN for NGG streamout
    • +
    • radv/gfx10: make GDS idle when leaving the IB
    • +
    • radv/gfx10: make sure to wait for idle before clearing GDS
    • +
    • radv/gfx10: implement NGG streamout
    • +
    • radv/gfx10: disable unsupported transform feedback features for NGG
    • +
    • radv: fix writing depth/stencil clear values to image
    • +
    • radv: fix loading 64-bit GS inputs
    • +
    • radv/gfx10: fix VK_KHR_pipeline_executable_properties with NGG GS
    • +
    • radv/gfx10: add radv_device::use_ngg
    • +
    • radv/gfx10: add missing counter buffer to the BO list
    • +
    • radv/gfx10: fix storing/loading NGG stream outputs for VS and TES
    • +
    • radv/gfx10: use the component mask when storing/loading NGG stream outputs
    • +
    • radv/gfx10: fix storing/loading NGG stream outputs for GS
    • +
    • radv/gfx10: fix NGG streamout with triangle strips for VS
    • +
    • radv: rework the slow depthstencil clear to write depth from PS
    • +
    • Revert "radv: disable viewport clamping even if FS doesn't write Z"
    • +
    • radv: fix build
    • +
    • radv/gfx10: fix the ESGS ring size symbol
    • +
    • radv: enable lower_fmod for the LLVM path
    • +
    • ac/nir: remove unused code for nir_op_{fmod,frem}
    • +
    • radv: implement VK_KHR_shader_clock
    • +
    • drirc: enable vk_x11_override_min_image_count for DOOM
    • +
    • radv: bump minTexelBufferOffsetAlignment to 4
    • +
    • radv: get the device name from radeon_info::name
    • +
    • radv: sync before resetting query pools if timestamps have been written
    • +
    • radv: use a compute shader for copying timestamp query results
    • +
    • radv: fix DCC fast clear code for intensity formats
    • +
    • radv: rename VK_KHR_shader_float16_int8 structs/constants
    • +
    • Revert "radv: do not emit PKT3_CONTEXT_CONTROL with AMDGPU 3.6.0+"
    • +
    • radv: fix DCC fast clear code for intensity formats (correctly)
    • +
    • ac/llvm: add ac_build_canonicalize() helper
    • +
    • ac/llvm: add AC_FLOAT_MODE_ROUND_TO_ZERO
    • +
    • ac/llvm: force fneg/fabs to flush denorms to zero if requested
    • +
    • radv: implement VK_KHR_shader_float_controls
    • +
    • radv: enable VK_KHR_shader_float_controls on GFX6-GFX7
    • +
    • radv: do not print useless descriptors info in hang reports
    • +
    • radv: print which ring is dumped in hang reports
    • +
    • radv: dump trace files earlier if a GPU hang is detected
    • +
    • radv: do not dump descriptors twice in hang reports
    • +
    • radv: advertise VK_KHR_spirv_1_4
    • +
    • ac/llvm: fix ac_to_integer_type() for 32-bit const addr space pointers
    • +
    • radv: fix updating bound fast ds clear values with different aspects
    • +
    • radv: do not create meta pipelines with 16 samples
    • +
    • radv: add an assertion in radv_gfx10_compute_bin_size()
    • +
    • radv: do not emit rbplus if attachments are undefined
    • +
    • radv/gfx10: re-enable fast depth/stencil clears with separate aspects
    • +
    • radv/gfx10: fix 3D images
    • +
    • radv: fix vkUpdateDescriptorSets with inline uniform blocks
    • +
    • radv: fix a performance regression with graphics depth/stencil clears
    • +
    • radv: compute the number of records correctly for vertex buffers
    • +
    • radv: fix VK_KHR_shader_float_controls dependency on GFX6-7
    • +
    • radv: enable fast depth/stencil clears with separate aspects on GFX8
    • +
    • radv: fix OpQuantizeToF16 for NaN on GFX6-7
    • +
    • radv: fix dumping SPIR-V into hang reports
    • +
    • radv: move nomemorycache debug option at the right palce
    • +
    • radv: fix perftest options
    • +
    • radv: fix compute pipeline keys when optimizations are disabled
    • +
    • radv: fix enabling sample shading with SampleID/SamplePosition
    • +
    • radv/gfx10: fix implementation of exclusive scans
    • +
    • ac/nir: fix out-of-bound access when loading constants from global
    • +

      +

      Sergii Romantsov (4):

      +
    • intel/dri: finish proper glthread
    • +
    • nir/large_constants: more careful data copying
    • +
    • nir/large_constants: pass after lowering copy_deref
    • +
    • meta: leak of shader program when decompressing tex-images
    • +

      +

      Stephen Barber (1):

      +
    • nouveau: add idep_nir_headers as dep for libnouveau
    • +

      +

      Tapani Pälli (23):

      +
    • util: fix os_create_anonymous_file on android
    • +
    • iris/android: fix build and link with libmesa_intel_perf
    • +
    • egl: reset blob cache set/get functions on terminate
    • +
    • intel/genxml: generate pack files for gen12 on android builds
    • +
    • intel/isl: build android libmesa_isl for gen12
    • +
    • iris: build android libmesa_iris for gen12
    • +
    • anv: build libanv for gen12 in android build
    • +
    • i965: initialize bo_reuse when creating brw_bufmgr
    • +
    • iris: use driconf for 'bo_reuse' parameter
    • +
    • android: fix linking issues with liblog
    • +
    • iris: close screen fd on iris_destroy_screen
    • +
    • egl: check for NULL value like eglGetSyncAttribKHR does
    • +
    • iris: disable aux on first get_param if not created with aux
    • +
    • mesa/st: calculate texture size based on EGLImage miplevel
    • +
    • anv/android: fix images created with external format support
    • +
    • i965: setup sized internalformat for MESA_FORMAT_R10G10B10A2_UNORM
    • +
    • mesa: add [Program]Uniform*64ARB display list support
    • +
    • mesa: enable ARB_gpu_shader_int64 in compat profile
    • +
    • Revert "egl: implement new functions from EGL_EXT_image_flush_external"
    • +
    • Revert "egl: handle EGL_IMAGE_EXTERNAL_FLUSH_EXT"
    • +
    • Revert "st/dri: add support for EGL_EXT_image_flush_external"
    • +
    • Revert "st/dri: assume external consumers of back buffers can write to the buffers"
    • +
    • Revert "dri_interface: add interface for EGL_EXT_image_flush_external"
    • +

      +

      Thomas Hellstrom (2):

      +
    • svga: Fix banded DMA upload unmap
    • +
    • winsys/svga: Limit the maximum DMA hardware buffer size
    • +

      +

      Thong Thai (2):

      +
    • Revert "radeonsi: don't emit PKT3_CONTEXT_CONTROL on amdgpu"
    • +
    • radeonsi: add JPEG decode support for VCN 2.0 devices
    • +

      +

      Timothy Arceri (35):

      +
    • radeonsi/nir: fix number of used samplers
    • +
    • util/disk_cache: bump thread count assigned to disk cache queue
    • +
    • util/u_queue: track job size and limit the size of queue growth
    • +
    • util/disk_cache: make use of the total job size limiting feature
    • +
    • radeonsi/nir: lower load constants to scalar
    • +
    • glsl: fix crash compiling bindless samplers inside unnamed UBOs
    • +
    • nir: fix nir_variable_data packing
    • +
    • nir: improve nir_variable packing
    • +
    • glsl: remove propagate_invariance() call from the linker
    • +
    • radv: get topology from pipeline key rather than VkGraphicsPipelineCreateInfo
    • +
    • radv: add debug option to turn off in memory cache
    • +
    • radv: add radv_create_shaders() to radv_shader.h
    • +
    • radv: add radv_secure_compile_type enum
    • +
    • radv: add some new members to radv device and instance for secure compile
    • +
    • radv: add radv_device_use_secure_compile() helper
    • +
    • radv: allow the secure process to read and write from disk cache
    • +
    • radv: for secure compile exit early from radv_shader_variant_create()
    • +
    • radv: add radv_secure_compile()
    • +
    • radv: a support for a secure compile fork at device creation
    • +
    • radv: enable secure compile support
    • +
    • util: remove LIST_INITHEAD macro
    • +
    • util: remove LIST_ADDTAIL macro
    • +
    • util: remove LIST_ADD macro
    • +
    • util: remove LIST_REPLACE macro
    • +
    • util: remove LIST_DELINIT macro
    • +
    • util: remove LIST_DEL macro
    • +
    • util: rename list_empty() to list_is_empty()
    • +
    • util: remove LIST_IS_EMPTY macro
    • +
    • radv: allow select() calls in secure compile
    • +
    • radv: add radv_sc_read() helper
    • +
    • radv: make use of radv_sc_read()
    • +
    • radv: add some infrastructure for fresh forks for each secure compile
    • +
    • radv: add a secure_compile_open_fifo_fds() helper
    • +
    • radv: create a fresh fork for each pipeline compile
    • +
    • glsl/nir: iterate the system values list when adding varyings
    • +

      +

      Timur Kristóf (48):

      +
    • st/nine: Properly initialize GLSL types for NIR shaders.
    • +
    • nir: Carve out nir_lower_samplers from GLSL code.
    • +
    • tgsi_to_nir: Remove dependency on libglsl.
    • +
    • amd/common: Move ac_export_mrt_z to ac_llvm_build.
    • +
    • amd/common: Extract some helper functions to ac_shader_util.
    • +
    • amd/common: Add num_shared_vgprs to ac_shader_config for GFX10.
    • +
    • radv: Set shared VGPR count in radv_postprocess_config.
    • +
    • amd/common: Introduce ac_get_fs_input_vgpr_cnt.
    • +
    • radv: Add debug option to dump meta shaders.
    • +
    • radv: Fix L2 cache rinse programming.
    • +
    • amd: Move all amd/common code that depends on LLVM to amd/llvm.
    • +
    • aco: Set +wavefrontsize64 for LLVM disassembler in GFX10 wave64 mode.
    • +
    • aco: Add missing GFX10 specific fields and some README notes.
    • +
    • aco: Support GFX10 SMEM in aco_assembler.
    • +
    • aco: Support GFX10 VINTRP in aco_assembler.
    • +
    • aco: Support GFX10 DS in aco_assembler.
    • +
    • aco: Support GFX10 MUBUF in aco_assembler.
    • +
    • amd/common: Add extern "C" to some headers that were missing it.
    • +
    • aco: Link ACO with amd/common.
    • +
    • aco: Support GFX10 MTBUF in aco_assembler.
    • +
    • aco: Support GFX10 MIMG and GFX9 D16 in aco_assembler.
    • +
    • aco: Fix GFX9 FLAT, SCRATCH, GLOBAL instructions, add GFX10 support.
    • +
    • aco: Support GFX10 EXP in aco_assembler.
    • +
    • aco: Support GFX10 VOP3 and VOP1 as VOP3 in aco_assembler.
    • +
    • aco: Set GFX10 DLC bit properly.
    • +
    • aco: Use ac_get_sampler_dim, delete duplicate code.
    • +
    • aco: Set GFX10 dimensionality on the instructions that need it.
    • +
    • aco: Support subvector loops in aco_assembler.
    • +
    • aco: Fix VS input VGPRs on GFX10.
    • +
    • aco: Fix s_dcache_wb on GFX10.
    • +
    • aco: Add extra assertion for number of FS input VGPRs.
    • +
    • aco: Clean up usages of PhysReg::reg from aco_assembler.
    • +
    • aco/gfx10: Wait for pending SMEM stores before loads
    • +
    • aco/gfx10: Fix PS exports for SPI_SHADER_32_AR.
    • +
    • aco/gfx10: Update constant addresses in fix_branches_gfx10.
    • +
    • aco/gfx10: Add notes about some GFX10 hazards.
    • +
    • aco/gfx10: Mitigate VcmpxPermlaneHazard.
    • +
    • aco/gfx10: Mitigate VcmpxExecWARHazard.
    • +
    • aco/gfx10: Mitigate SMEMtoVectorWriteHazard.
    • +
    • aco/gfx10: Mitigate LdsBranchVmemWARHazard.
    • +
    • aco/gfx10: Fix mitigation of VMEMtoScalarWriteHazard.
    • +
    • aco: Refactor hazard mitigations, separate pass for GFX10.
    • +
    • st/nine: Fix build with -Werror=empty-body
    • +
    • st/nine: Fix unused variable warnings in release build.
    • +
    • aco: Implement subgroup shuffle in GFX10 wave64 mode.
    • +
    • aco: Introduce vgpr_limit to keep track of available VGPRs.
    • +
    • radv: Enable ACO on Navi.
    • +
    • ac: Handle invalid GFX10 format correctly in ac_get_tbuffer_format.
    • +

      +

      Tomeu Vizoso (19):

      +
    • panfrost/ci: Use Volt-based runner for dEQP tests
    • +
    • panfrost/ci: Print bootstrap log
    • +
    • panfrost/ci: Build kernel with CONFIG_DETECT_HUNG_TASK
    • +
    • panfrost/ci: Install qemu-arm-static into chroot
    • +
    • panfrost/ci: Print load stats
    • +
    • panfrost/ci: Print only regressions
    • +
    • panfrost/ci: Re-add support for armhf
    • +
    • panfrost/ci: Use special runner for LAVA jobs
    • +
    • panfrost/ci: Increase timeouts
    • +
    • panfrost/ci: Run dEQP with the surfaceless platform
    • +
    • panfrost/ci: Update kernel to 5.3-rc8
    • +
    • panfrost/ci: Use releases for Volt dEQP
    • +
    • gitlab-ci: Run dEQP on devices with Panfrost
    • +
    • gitlab-ci: Move LAVA-related files into top-level ci dir
    • +
    • gitlab-ci/lava: Fix image to use in test jobs
    • +
    • gitlab-ci/lava: Use files to list tests to skip
    • +
    • gitlab-ci/lava: Test Lima driver with dEQP
    • +
    • panfrost: Keep track of active BOs
    • +
    • gitlab-ci: Update kernel for LAVA jobs to 5.4-rc4
    • +

      +

      Urja Rannikko (1):

      +
    • panfrost: allocate bo for occlusion query results
    • +

      +

      Vasily Khoruzhick (35):

      +
    • lima/ppir: refactor const lowering
    • +
    • lima/ppir: clone ld_{uni,tex,var} into each block
    • +
    • lima/ppir: add support for unconditional branches and condition negation
    • +
    • lima/ppir: set write mask for texture loads if dest is reg
    • +
    • lima/ppir: fix ordering deps
    • +
    • lima/ppir: add write after read deps for registers
    • +
    • lima/ppir: add dummy op
    • +
    • lima/ppir: create ppir block for each corresponding NIR block
    • +
    • lima/ppir: turn store_color into ALU node
    • +
    • lima/ppir: validate shader outputs
    • +
    • lima/ppir: add better liveness analysis
    • +
    • lima/ppir: add control flow support
    • +
    • lima/ppir: print register index and components number for spilled register
    • +
    • lima: fix texture descriptor issues
    • +
    • lima/ppir: add common helper for creating movs
    • +
    • lima/ppir: don't assume that load coords gets value from register
    • +
    • lima/ppir: clone uniforms and load_coords into each successor
    • +
    • nir: allow specifying filter callback in lower_alu_to_scalar
    • +
    • lima/ppir: don't lower vector {b,f}csel to scalar if condition is scalar
    • +
    • lima/ppir: don't lower phis to scalar
    • +
    • lima/gpir: lower fceil
    • +
    • lima/gpir: fix warning in gpir disassembler
    • +
    • lima: run opt_algebraic between int_to_float and boot_to_float for vs
    • +
    • lima/ppir: drop fge/flt/feq/fne options
    • +
    • lima: set .out_sync field of req in lima_submit_start()
    • +
    • lima: add standalone disassembler with primitive MBS parser
    • +
    • lima: use 0 to poll if BO is busy in lima_bo_wait()
    • +
    • lima: implement BO cache
    • +
    • lima/ppir: don't attempt to clone tex coords if it's not varying
    • +
    • lima/ppir: add node dependency types
    • +
    • lima/ppir: add support for indirect load of uniforms and varyings
    • +
    • lima/ppir: add NIR pass to split varying loads
    • +
    • lima: set uniforms_address lower bits properly
    • +
    • lima/ppir: don't clone texture loads
    • +
    • lima: fix PP stack size
    • +

      +

      Vinson Lee (7):

      +
    • glx: Fix up glXQueryGLXPbufferSGIX on macOS.
    • +
    • swr: Fix build with llvm-9.0 again.
    • +
    • travis: Fail build if any command in if statement fails.
    • +
    • util: Define strchrnul on macOS.
    • +
    • swr: Fix make_unique build error.
    • +
    • scons: Add coroutines component to build.
    • +
    • meson: Add coroutines component to llvmpipe build.
    • +

      +

      Wladimir J. van der Laan (1):

      +
    • etnaviv: GC7000: Texture descriptors
    • +

      +

      Yevhenii Kolesnikov (2):

      +
    • glsl: Enable textureSize for samplerExternalOES
    • +
    • meson: Fix linkage of libgallium_nine with libgalliumvl
    • +

      +

      Zebediah Figura (1):

      +
    • Revert "draw: revert using correct order for prim decomposition."
    • +

      +

      Zhaowei Yuan (1):

      +
    • broadcom/vc4: Expand width of dst surface
    • +

      +

      Zhu, James (1):

      +
    • radeon: Fix mjpeg issue for ARCTURUS
    • +

      +

      nia (1):

      +
    • loader: include limits.h for PATH_MAX
    • +

      +

      pal1000 (3):

      +
    • scons/windows: Support build with LLVM 9.
    • +
    • scons: Fix MSYS2 Mingw-w64 build.
    • +
    • scons/windows: Enable compute shaders when possible.
    • +

      +

      renchenglei (1):

      +
    • egl/android: Enable HAL_PIXEL_FORMAT_RGBA_1010102 format
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.3.1.html mesa-20.0.8/docs/relnotes/19.3.1.html --- mesa-19.2.8/docs/relnotes/19.3.1.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.3.1.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,94 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.3.1 Release Notes / 2019-12-18

    + +

    + Mesa 19.3.1 is a bug fix release which fixes bugs found since the 19.3.0 release. +

    +

    +Mesa 19.3.1 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.3.1 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +    cd951db69c56a97ff0570a7ab2c0e39e6c5323f4cd8f4eb8274723e033beae59  mesa-19.3.1.tar.xz
    +
    + + +

    New features

    + +
      +
    • None
    • +
    + +

    Bug fixes

    + +
      +
    • i965/iris: assert when destroy GL context with active query
    • +
    • Visuals without alpha bits are not sRGB-capable
    • +
    • radv secure compile feature breaks compilation of RADV on armhf EABI (19.3-rc1)
    • +
    + +

    Changes

    + +
      +

      Bas Nieuwenhuizen (2):

      +
    • amd/common: Fix tcCompatible degradation on Stoney.
    • +
    • amd/common: Always use addrlib for HTILE tc-compat.
    • +

      +

      Dylan Baker (3):

      +
    • docs/19.3.0: Add SHA256 sums
    • +
    • cherry-ignore: update for the 19.3.1 cycle
    • +
    • docs: remove new_features.txt from stable branch
    • +

      +

      Gert Wollny (1):

      +
    • virgl: Increase the shader transfer buffer by doubling the size
    • +

      +

      Iván Briano (1):

      +
    • anv: Export filter_minmax support only when it's really supported
    • +

      +

      Kenneth Graunke (1):

      +
    • iris: Default to X-tiling for scanout buffers without modifiers
    • +

      +

      Lionel Landwerlin (2):

      +
    • anv: fix fence underlying primitive checks
    • +
    • mesa: avoid triggering assert in implementation
    • +

      +

      Luis Mendes (1):

      +
    • radv: fix radv secure compile feature breaks compilation on armhf EABI and aarch64
    • +

      +

      Tapani Pälli (2):

      +
    • dri: add __DRI_IMAGE_FORMAT_SXRGB8
    • +
    • i965: expose MESA_FORMAT_B8G8R8X8_SRGB visual
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.3.2.html mesa-20.0.8/docs/relnotes/19.3.2.html --- mesa-19.2.8/docs/relnotes/19.3.2.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.3.2.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,138 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.3.2 Release Notes / 2020-01-09

    + +

    + Mesa 19.3.2 is a bug fix release which fixes bugs found since the 19.3.1 release. +

    +

    +Mesa 19.3.2 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.3.2 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +    4e3aee324616352bbc7f58d47ab573e10f68cc7719fd045bd6d3abcdd97ee1c1  mesa-19.3.2.tar.xz
    +
    + + +

    New features

    + +
      +
    • None
    • +
    + +

    Bug fixes

    + +
      +
    • Rise of the Tomb Raider benchmark crash on Dell XPS 7390 2-in-1 w/ Iris Plus Graphics (Ice Lake 8x8 GT2)
    • +
    • Raven Ridge (2400G): Resident Evil 2 crashes my machine
    • +
    • Rocket League ingame artifacts
    • +
    • [radv] SteamVR direct mode no longer works
    • +
    • [RADV] [Navi] LOD artifacting in Halo - The Master Chief Collection (Halo Reach)
    • +
    • [ANV] unused create parameters not properly ignored
    • +
    • Blocky corruption in The Surge 2
    • +
    • radeonsi: Floating point exception on R9 270 gpu for a set of traces
    • +
    • [CTS] dEQP-VK.api.image_clearing.core.clear_color_image.2d.linear.single_layer.r32g32b32_* fail on GFX6-GFX8
    • +
    • Vulkan: Please consider adding another sample count to sampledImageIntegerSampleCounts
    • +
    • Navi10: Bitrate based encoding with VAAPI/RadeonSI unusable
    • +
    • [GFX10] Glitch rendering Custom Avatars in Beat Saber
    • +
    • intel/fs: Check for 16-bit immediates in fs_visitor::lower_mul_dword_inst is too strict
    • +
    + +

    Changes

    + +
      +

      Andrii Simiklit (3):

      +
    • glsl: fix an incorrect max_array_access after optimization of ssbo/ubo
    • +
    • glsl: fix a binding points assignment for ssbo/ubo arrays
    • +
    • glsl/nir: do not change an element index to have correct block name
    • +

      +

      Bas Nieuwenhuizen (7):

      +
    • radv: Limit workgroup size to 1024.
    • +
    • radv: Expose all sample counts for integer formats as well.
    • +
    • amd/common: Handle alignment of 96-bit formats.
    • +
    • nir: Add clone/hash/serialize support for non-uniform tex instructions.
    • +
    • spirv: Fix glsl type assert in spir2nir.
    • +
    • radv: Only use the gfx mipmap level offset/pitch for linear textures.
    • +
    • radv: Emit a BATCH_BREAK when changing pixel shaders or CB_TARGET_MASK.
    • +

      +

      Caio Marcelo de Oliveira Filho (4):

      +
    • intel/fs: Lower 64-bit MOVs after lower_load_payload()
    • +
    • intel/fs: Fix lowering of dword multiplication by 16-bit constant
    • +
    • intel/vec4: Fix lowering of multiplication by 16-bit constant
    • +
    • anv: Ignore some CreateInfo structs when rasterization is disabled
    • +

      +

      Christian Gmeiner (1):

      +
    • etnaviv: update resource status after flushing
    • +

      +

      Dylan Baker (2):

      +
    • dcos: add releanse notes for 19.3.1
    • +
    • cherry-ignore: update for 19.3.2
    • +

      +

      Eric Engestrom (4):

      +
    • util/format: remove left-over util_format_description_table declaration
    • +
    • amd: fix empty-body issues
    • +
    • nine: fix empty-body-issues
    • +
    • mesa: avoid returning a value in a void function
    • +

      +

      Gert Wollny (1):

      +
    • r600: Fix maximum line width
    • +

      +

      Jason Ekstrand (2):

      +
    • anv: Properly advertise sampledImageIntegerSampleCounts
    • +
    • intel/nir: Add a memory barrier before barrier()
    • +

      +

      Lionel Landwerlin (2):

      +
    • loader: fix close on uninitialized file descriptor value
    • +
    • anv: don't close invalid syncfd semaphore
    • +

      +

      Marek Olšák (2):

      +
    • winsys/radeon: initialize pte_fragment_size
    • +
    • radeonsi: disable SDMA on gfx8 to fix corruption on RX 580
    • +

      +

      Pierre-Eric Pelloux-Prayer (2):

      +
    • radeon/vcn2: enable rate control for hevc encoding
    • +
    • radeonsi: check ctx->sdma_cs before using it
    • +

      +

      Samuel Pitoiset (2):

      +
    • radv/gfx10: fix the out-of-bounds check for vertex descriptors
    • +
    • radv: return the correct pitch for linear mipmaps on GFX10
    • +

      +

      Timur Kristóf (1):

      +
    • aco: Fix uniform i2i64.
    • +

      +

      Yevhenii Kolesnikov (2):

      +
    • meta: Cleanup function for DrawTex
    • +
    • main: allow external textures for BindImageTexture
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/19.3.3.html mesa-20.0.8/docs/relnotes/19.3.3.html --- mesa-19.2.8/docs/relnotes/19.3.3.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/19.3.3.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,193 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.3.3 Release Notes / 2020-01-28

    + +

    + Mesa 19.3.3 is a bug fix release which fixes bugs found since the 19.3.2 release. +

    +

    +Mesa 19.3.3 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 19.3.3 implements the Vulkan 1.1 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +    81ce4810bb25d61300f8104856461f4d49cf7cb794aa70cb572312e370c39f09  mesa-19.3.3.tar.xz
    +
    + + +

    New features

    + +
      +
    • None
    • +
    + +

    Bug fixes

    + +
      +
    • aco: Dead Rising 4 crashes in lower_to_hw_instr() on GFX6-GFX7
    • +
    • libvulkan_radeon.so crash with `free(): double free detected in tcache 2`
    • +
    • Commit be08e6a causes crash in com.android.launcher3 (Launcher)
    • +
    • Mesa no longer compiles with GCC 10
    • +
    • [bisected] [radeonsi] GPU hangs/resets while playing interlaced content on Kodi with VAAPI
    • +
    • [radeonsi] MSAA image not copied properly after image store through texture view
    • +
    • T-Rex and Manhattan onscreen performance issue on Android
    • +
    • VkSamplerCreateInfo compareEnable not respected
    • +
    • VkSamplerCreateInfo compareEnable not respected
    • +
    • Freedreno drm softpin driver implementation leaks memory
    • +
    • [POLARIS10] VRAM leak involving glTexImage2D with non-NULL data argument
    • +
    + +

    Changes

    + +
      +

      Adam Jackson (1):

      +
    • drisw: Cache the depth of the X drawable
    • +

      +

      Andrii Simiklit (1):

      +
    • mesa/st: fix a memory leak in get_version
    • +

      +

      Bas Nieuwenhuizen (2):

      +
    • radv: Disable VK_EXT_sample_locations on GFX10.
    • +
    • radv: Remove syncobj_handle variable in header.
    • +

      +

      Caio Marcelo de Oliveira Filho (1):

      +
    • intel/fs: Only use SLM fence in compute shaders
    • +

      +

      Daniel Schürmann (2):

      +
    • aco: fix unconditional demote_to_helper
    • +
    • aco: rework lower_to_cssa()
    • +

      +

      Dylan Baker (3):

      +
    • docs: add SHA256 sums for 19.3.2
    • +
    • cherry-ignore: Update for 19.3.3
    • +
    • .pick_status.json: Update to c787b8d2a16d5e2950f209b1fcbec6e6c0388845
    • +

      +

      Eric Anholt (1):

      +
    • mesa: Fix detection of invalidating both depth and stencil.
    • +

      +

      Eric Engestrom (1):

      +
    • meson: use github URL for wraps instead of completely unreliable wrapdb
    • +

      +

      Erik Faye-Lund (8):

      +
    • docs: fix typo in html tag name
    • +
    • docs: fix paragraphs
    • +
    • docs: open paragraph before closing it
    • +
    • docs: use code-tag instead of pre-tag
    • +
    • docs: use code-tags instead of pre-tags
    • +
    • docs: use code-tags instead of pre-tags
    • +
    • docs: move paragraph closing tag
    • +
    • docs: remove double-closed definition-list
    • +

      +

      Francisco Jerez (3):

      +
    • glsl: Fix software 64-bit integer to 32-bit float conversions.
    • +
    • intel/fs/gen11+: Handle ROR/ROL in lower_simd_width().
    • +
    • intel/fs/gen8+: Fix r127 dst/src overlap RA workaround for EOT message payload.
    • +

      +

      Hyunjun Ko (1):

      +
    • turnip: fix invalid VK_ERROR_OUT_OF_POOL_MEMORY
    • +

      +

      Jan Vesely (1):

      +
    • clover: Initialize Asm Parsers
    • +

      +

      Jason Ekstrand (8):

      +
    • anv: Flag descriptors dirty when gl_NumWorkgroups is used
    • +
    • intel/vec4: Support scoped_memory_barrier
    • +
    • intel/blorp: Fill out all the dwords of MI_ATOMIC
    • +
    • anv: Don't over-advertise descriptor indexing features
    • +
    • anv: Memset array properties
    • +
    • anv/blorp: Rename buffer image stride parameters
    • +
    • anv: Canonicalize buffer formats for image/buffer copies
    • +
    • anv: Stop allocating WSI event fences off the instance
    • +

      +

      Jonathan Marek (1):

      +
    • st/mesa: don't lower YUV when driver supports it natively
    • +

      +

      Kenneth Graunke (2):

      +
    • intel/compiler: Fix illegal mutation in get_nir_image_intrinsic_image
    • +
    • intel: Fix aux map alignments on 32-bit builds.
    • +

      +

      Lasse Lopperi (1):

      +
    • freedreno/drm: Fix memory leak in softpin implementation
    • +

      +

      Lionel Landwerlin (4):

      +
    • anv: fix intel perf queries availability writes
    • +
    • anv: only use VkSamplerCreateInfo::compareOp if enabled
    • +
    • intel/perf: expose timestamp begin for mdapi
    • +
    • intel/perf: report query split for mdapi
    • +

      +

      Marek Olšák (4):

      +
    • ac/gpu_info: always use distributed tessellation on gfx10
    • +
    • radeonsi: work around an LLVM crash when using llvm.amdgcn.icmp.i64.i1
    • +
    • radeonsi: clean up how internal compute dispatches are handled
    • +
    • radeonsi: don't invoke decompression inside internal launch_grid
    • +

      +

      Nataraj Deshpande (1):

      +
    • egl/android: Restrict minimum triple buffering for android color_buffers
    • +

      +

      Pierre-Eric Pelloux-Prayer (8):

      +
    • radeonsi: release saved resources in si_retile_dcc
    • +
    • radeonsi: release saved resources in si_compute_expand_fmask
    • +
    • radeonsi: release saved resources in si_compute_clear_render_target
    • +
    • radeonsi: release saved resources in si_compute_copy_image
    • +
    • radeonsi: release saved resources in si_compute_do_clear_or_copy
    • +
    • radeonsi: fix fmask expand compute shader
    • +
    • radeonsi: make sure fmask expand is done if needed
    • +
    • util: call bind_sampler_states before setting sampler_views
    • +

      +

      Rhys Perry (8):

      +
    • aco: set vm for pos0 exports on GFX10
    • +
    • aco: fix imageSize()/textureSize() with large buffers on GFX8
    • +
    • aco: fix uninitialized data in the binary
    • +
    • aco: set exec_potentially_empty for demotes
    • +
    • aco: disable add combining for ds_swizzle_b32
    • +
    • aco: don't DCE atomics with return values
    • +
    • aco: check if multiplication/clamp is live when applying output modifier
    • +
    • aco: fix off-by-one error when initializing sgpr_live_in
    • +

      +

      Samuel Pitoiset (2):

      +
    • radv: only use VkSamplerCreateInfo::compareOp if enabled
    • +
    • radv: fix double free corruption in radv_alloc_memory()
    • +

      +

      Samuel Thibault (1):

      +
    • meson: Do not require libdrm for DRI2 on hurd
    • +

      +

      Tapani Pälli (1):

      +
    • egl/android: fix buffer_count for applications setting max count
    • +

      +

      Thong Thai (1):

      +
    • mesa: Prevent _MaxLevel from being less than zero
    • +

      +

      Timur Kristóf (1):

      +
    • aco/gfx10: Fix VcmpxExecWARHazard mitigation.
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.0.html mesa-20.0.8/docs/relnotes/20.0.0.html --- mesa-19.2.8/docs/relnotes/20.0.0.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.0.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,3231 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.0 Release Notes / 2020-02-19

    + +

    + Mesa 20.0.0 is a new development release. People who are concerned + with stability and reliability should stick with a previous release or + wait for Mesa 19.3.1. +

    +

    +Mesa 20.0.0 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.0 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  bb6db3e54b608d2536d4000b3de7dd3ae115fc114e8acbb5afff4b3bbed04b34  mesa-20.0.0.tar.xz
    +
    + + +

    New features

    + +
      +
    • OpenGL 4.6 on radeonsi. +
    • +
    • GL_ARB_gl_spirv on radeonsi. +
    • +
    • GL_ARB_spirv_extensions on radeonsi. +
    • +
    • GL_EXT_direct_state_access for compatibility profile. +
    • +
    • VK_AMD_device_coherent_memory on RADV. +
    • +
    • VK_AMD_mixed_attachment_samples on RADV. +
    • +
    • VK_AMD_shader_explicit_vertex_parameter on RADV. +
    • +
    • VK_AMD_shader_image_load_store_lod on RADV. +
    • +
    • VK_AMD_shader_fragment_mask on RADV. +
    • +
    • VK_EXT_subgroup_size_control on RADV/LLVM. +
    • +
    • VK_KHR_separate_depth_stencil_layouts on Intel, RADV. +
    • +
    • VK_KHR_shader_subgroup_extended_types on RADV. +
    • +
    • VK_KHR_swapchain_mutable_format on RADV. +
    • +
    • VK_KHR_shader_float_controls on RADV/ACO. +
    • +
    • GFX6 (Southern Islands) and GFX7 (Sea Islands) support on RADV/ACO. +
    • +
    • Wave32 support for GFX10 (Navi) on RADV/ACO. +
    • +
    • Compilation of Geometry Shaders on RADV/ACO. +
    • +
    • Vulkan 1.2 on Intel, RADV. +
    • +
    • GL_INTEL_shader_integer_functions2 and VK_INTEL_shader_integer_functions2 on Intel. +
    • +
    + +

    Bug fixes

    + +
      +
    • drisw crashes on calling NULL putImage on EGL surfaceless platform (pbuffer EGLSurface)
    • +
    • [radeonsi][vaapi][bisected] invalid VASurfaceID when playing interlaced DVB stream in Kodi
    • +
    • [RADV] GPU hangs while the cutscene plays in the game Assassin's Creed Origins
    • +
    • ACO: The Elder Scrolls Online crashes on startup (Navi)
    • +
    • Broken rendering of glxgears on S/390 architecture (64bit, BigEndian)
    • +
    • aco: sun flickering with Assassins Creeds Origins
    • +
    • !1896 broke ext_image_dma_buf_import piglit tests with radeonsi
    • +
    • aco: wrong geometry with Assassins Creed Origins on GFX6
    • +
    • valgrind errors since commit a8ec4082a41
    • +
    • OSMesa osmesa_choose_format returns a format not supported by st_new_renderbuffer_fb
    • +
    • Build error with VS on WIN
    • +
    • Using EGL_KHR_surfaceless_context causes spurious "libEGL warning: FIXME: egl/x11 doesn't support front buffer rendering."
    • +
    • !3460 broke texsubimage test with piglit on zink+anv
    • +
    • The screen is black when using ACO
    • +
    • [Regression] JavaFX unbounded VRAM+RAM usage
    • +
    • radv: implement VK_AMD_shader_explicit_vertex_parameter
    • +
    • Civilization VI crashes when loading game (AMD Vega Mobile)
    • +
    • [radeonsi] X-Server crashes when trying to start Guild Wars 2 with the commits from !3421
    • +
    • aco: implement GFX6 support
    • +
    • Add support for VK_KHR_swapchain_mutable_format
    • +
    • radv: The Surge 2 crashes in ac_get_elem_bits()
    • +
    • [Regression] JavaFX unbounded VRAM+RAM usage
    • +
    • Use the OpenCL dispatch defnitions from OpenCL_Headers
    • +
    • [regression][ilk,g965,g45] various dEQP-GLES2.functional.shaders.* failures
    • +
    • aco: Dead Rising 4 crashes in lower_to_hw_instr() on GFX6-GFX7
    • +
    • libvulkan_radeon.so crash with `free(): double free detected in tcache 2`
    • +
    • Commit be08e6a causes crash in com.android.launcher3 (Launcher)
    • +
    • anv: Regression causing issues for radv when there are no Intel devices
    • +
    • Mesa no longer compiles with GCC 10
    • +
    • [Navi/aco] Guild Wars 2 - ring gfx timeout with commit 3bca0af2
    • +
    • [radv/aco] Regression is causing a soft crash in The Witcher 3
    • +
    • [bisected] [radeonsi] GPU hangs/resets while playing interlaced content on Kodi with VAAPI
    • +
    • [radeonsi] MSAA image not copied properly after image store through texture view
    • +
    • T-Rex and Manhattan onscreen performance issue on Android
    • +
    • VkSamplerCreateInfo compareEnable not respected
    • +
    • VkSamplerCreateInfo compareEnable not respected
    • +
    • Freedreno drm softpin driver implementation leaks memory
    • +
    • [POLARIS10] VRAM leak involving glTexImage2D with non-NULL data argument
    • +
    • [regression][bisected][ivb/byt] crucible test func.push-constants.basic.q0 causes gpu hang
    • +
    • MR 3096 broke lots of piglit ext_framebuffer_object tests on Raven
    • +
    • Rise of the Tomb Raider benchmark crash on Dell XPS 7390 2-in-1 w/ Iris Plus Graphics (Ice Lake 8x8 GT2)
    • +
    • Raven Ridge (2400G): Resident Evil 2 crashes my machine
    • +
    • Common practice of glGetActiveUniform leads to O(N²) behavior in Mesa
    • +
    • Rocket League ingame artifacts
    • +
    • [radv] SteamVR direct mode no longer works
    • +
    • [ANV] unused create parameters not properly ignored
    • +
    • [Bisected] Mesa fails to start alacritty with the wayland backend (AMD Vega).
    • +
    • [iris] piglit test clip-distance-vs-gs-out fails due to VUE map mismatch between VS <-> GS stages
    • +
    • [radv] SteamVR direct mode no longer works
    • +
    • Blocky corruption in The Surge 2
    • +
    • radeonsi: Floating point exception on R9 270 gpu for a set of traces
    • +
    • [RADV] [Navi] LOD artifacting in Halo - The Master Chief Collection (Halo Reach)
    • +
    • [CTS] dEQP-VK.api.image_clearing.core.clear_color_image.2d.linear.single_layer.r32g32b32_* fail on GFX6-GFX8
    • +
    • Vulkan: Please consider adding another sample count to sampledImageIntegerSampleCounts
    • +
    • Navi10: Bitrate based encoding with VAAPI/RadeonSI unusable
    • +
    • [RADV] create parameters not properly ignored
    • +
    • [regression][bdw,gen9,hsw,icl][iris] gltcs failures on mesa=8172b1fa03f
    • +
    • Bugs in RadeonSI VAAPI implementation
    • +
    • [GFX10] Glitch rendering Custom Avatars in Beat Saber
    • +
    • intel/fs: Check for 16-bit immediates in fs_visitor::lower_mul_dword_inst is too strict
    • +
    • i965/iris: assert when destroy GL context with active query
    • +
    • Visuals without alpha bits are not sRGB-capable
    • +
    • swapchain throttling: wait for fence has 1ns timeout
    • +
    • radeonsi: OpenGL app always produces page fault in gfxhub on Navi 10
    • +
    • [regression] KHR-GLES31.core.geometry_shader.api.program_pipeline_vs_gs_capture fails for various drivers
    • +
    • [CTS] dEQP-VK.spirv_assembly.instruction.spirv1p4.entrypoint.tess_con_pc_entry_point hangs on GFX10
    • +
    • [RADV] SPIR-V warning when compiling shader using storage multisampled image array
    • +
    • [RADV] The Dead Rising 4 is causing a GPU hang with LLVM backend
    • +
    • macOS u_thread.h:156:4: error: implicit declaration of function 'pthread_getcpuclockid'
    • +
    • [Wine / Vulkan] Doom 2016 Hangs on Main Menu
    • +
    • NULL resource when playing VP9 video through VDPAU on RX 570
    • +
    • radeonsi: mpv --vo=vaapi incorrect rendering on gfx9+
    • +
    • [BSW/BDW] skia lcdblendmode & lcdoverlap test failure
    • +
    • Create a way to prefer iris vs i965 via driconf
    • +
    • [Bisected] i965: CS:GO crashes in emit_deref_copy_load_store with debug Mesa
    • +
    • radv/aco Jedi Fallen Order hair rendering buggy
    • +
    • Inaccurate information on https://www.mesa3d.org/repository.html about how to get git write access.
    • +
    • [RADV] VK_KHR_timeline_semaphore balloons in runtime
    • +
    • Shadow of Mordor has randomly dancing black shadows on Talion's face
    • +
    • gen7 crucible failures func.push-constants.basic.q0 and func.shader-subgroup-vote.basic.q0
    • +
    • GL_EXT_disjoint_timer_query failing with GL_INVALID_ENUM
    • +
    • Unreal 4 Elemental and MatineeFightScene demos misrender
    • +
    • gputest gimark has unwanted black liquorice flakes
    • +
    • triangle strip clipping with GL_FIRST_VERTEX_CONVENTION causes wrong vertex's attribute to be broadcasted for flat interpolation
    • +
    • [bisected][regression][g45,g965,ilk] piglit arb_fragment_program kil failures
    • +
    • glcts crashes since the enablement of ARB_shading_language_include
    • +
    • Android build broken
    • +
    • ld.lld: error: duplicate symbol (mesa-19.3.0-rc1)
    • +
    • Divinity: Original Sin Enhanced Edition(Native) crash on start
    • +
    • HSW. Tropico 6 and SuperTuxKart have shadows flickering
    • +
    • GL_EXT_disjoint_timer_query failing with GL_INVALID_ENUM
    • +
    • glxgears segfaults on POWER / Xvnc
    • +
    • [regression][bdw,gen9,icl][iris] piglit failures on mesa f9fd04aca15fd00889caa666ba38007268e67f5c
    • +
    • Redundant builds of libmesa_classic and libmesa_gallium
    • +
    • [IVB,BYT] [Regression] [Bisected] Core dump at launching arb_compute_shader/linker/bug-93840.shader_test
    • +
    • Vulkan drivers need access to format utils of gallium
    • +
    • Disabling lower_fragdata_array causes shader-db to crash for some drivers
    • +
    • GL_EXT_disjoint_timer_query failing with GL_INVALID_ENUM
    • +
    • Android build broken by commit 9020f51 "util/u_endian: Add error checks"
    • +
    • radv secure compile feature breaks compilation of RADV on armhf EABI (19.3-rc1)
    • +
    • radv_debug.c warnings when compiling on 32 bits : cast to pointer from integer of different size
    • +
    • Meson: Mesa3D build failure with standalone Mingw-w64 multilib
    • +
    • [regression][bisected] KHR46 VertexArrayAttribFormat has unexpectedly generated GL_INVALID_OPERATION
    • +
    • textureSize(samplerExternalOES, int) missing in desktop mesa 19.1.7 implementation
    • +
    • zink: implicly casting integers to pointers, warnings on 32-bit compile
    • +
    • Objects leaving trails in Firefox with antialias and preserveDrawingBuffer in three.js WebGLRednerer with mesa 19.2
    • +
    + +

    Changes

    + +
      +

      Aaron Watry (1):

      +
    • clover/llvm: fix build after llvm 10 commit 1dfede3122ee
    • +

      +

      Adam Jackson (1):

      +
    • drisw: Cache the depth of the X drawable
    • +

      +

      Afonso Bordado (4):

      +
    • pan/midgard: Optimize comparisions with similar operations
    • +
    • pan/midgard: Move midgard_is_branch_unit to helpers
    • +
    • pan/midgard: Optimize branches with inverted arguments
    • +
    • pan/midgard: Fix midgard_compile.h includes
    • +

      +

      Alan Coopersmith (1):

      +
    • intel/perf: adapt to platforms like Solaris without d_type in struct dirent
    • +

      +

      Alejandro Piñeiro (4):

      +
    • v3d: adds an extra MOV for any sig.ld*
    • +
    • mesa/main/util: moving gallium u_mm to util, remove main/mm
    • +
    • nir/opt_peephole_select: remove unused variables
    • +
    • turnip: remove unused descriptor state dirty
    • +

      +

      Alexander van der Grinten (1):

      +
    • egl: Fix _eglPointerIsDereferencable w/o mincore()
    • +

      +

      Alexander von Gluck IV (1):

      +
    • haiku/hgl: Fix build via header reordering
    • +

      +

      Alyssa Rosenzweig (223):

      +
    • pipe-loader: Build kmsro loader for with all kmsro targets
    • +
    • pan/midgard: Remove OP_IS_STORE_VARY
    • +
    • pan/midgard: Add a dummy source for loads
    • +
    • pan/midgard: Refactor swizzles
    • +
    • pan/midgard: Eliminate blank_alu_src
    • +
    • pan/midgard: Use fp32 blend shaders
    • +
    • pan/midgard: Validate tags when branching
    • +
    • pan/midgard: Fix quadword_count handling
    • +
    • pan/midgard: Compute bundle interference
    • +
    • pan/midgard: Add bizarre corner case
    • +
    • pan/midgard: offset_swizzle doesn't need dstsize
    • +
    • pan/midgard: Extend offset_swizzle to non-32-bit
    • +
    • pan/midgard: Extend swizzle packing for vec4/16-bit
    • +
    • pan/midgard: Extend default_phys_reg to !32-bit
    • +
    • panfrost/ci: Update T760 expectations
    • +
    • pan/midgard: Fix printing of half-registers in texture ops
    • +
    • pan/midgard: Disassemble half-steps correctly
    • +
    • pan/midgard: Pass shader stage to disassembler
    • +
    • pan/midgard: Switch base for vertex texturing on T720
    • +
    • nir: Add load_output_u8_as_fp16_pan intrinsic
    • +
    • pan/midgard: Identify ld_color_buffer_u8_as_fp16*
    • +
    • pan/midgard: Implement nir_intrinsic_load_output_u8_as_fp16_pan
    • +
    • pan/midgard: Pack load/store masks
    • +
    • panfrost: Select format-specific blending intrinsics
    • +
    • pan/midgard: Add blend shader selection bits for MRT
    • +
    • pan/midgard: Implement linearly-constrained register allocation
    • +
    • pan/midgard: Integrate LCRA
    • +
    • pan/midgard: Remove util/ra support
    • +
    • pan/midgard: Compute spill costs
    • +
    • pan/lcra: Use Chaitin's spilling heuristic
    • +
    • pan/midgard: Copypropagate vector creation
    • +
    • pan/midgard: Fix copypropagation for textures
    • +
    • pan/midgard: Generalize texture registers across GPUs
    • +
    • pan/midgard: Fix vertex texturing on early Midgard
    • +
    • pan/midgard: Use texture, not textureLod, on early Midgard
    • +
    • pan/midgard: Disassemble with old pipeline always on T720
    • +
    • pan/midgard: Prioritize texture registers
    • +
    • pan/midgard: Expand 64-bit writemasks
    • +
    • pan/midgard: Implement i2i64 and u2u64
    • +
    • pan/midgard: Fix mir_round_bytemask_down for !32b
    • +
    • pan/midgard: Pack 64-bit swizzles
    • +
    • pan/midgard: Use generic constant packing for 8/64-bit
    • +
    • pan/midgard: Implement non-aligned UBOs
    • +
    • pan/midgard: Expose more typesize helpers
    • +
    • pan/midgard: Fix masks/alignment for 64-bit loads
    • +
    • pan/midgard: Represent ld/st offset unpacked
    • +
    • pan/midgard: Use shader stage in mir_op_computes_derivative
    • +
    • panfrost: Stub out clover callbacks
    • +
    • panfrost: Pass kernel inputs as uniforms
    • +
    • panfrost: Disable tiling for GLOBAL resources
    • +
    • panfrost: Set PIPE_COMPUTE_CAP_ADDRESS_BITS to 64
    • +
    • pan/midgard: Introduce quirks checks
    • +
    • panfrost: Add the lod_bias field
    • +
    • nir: Add load_sampler_lod_paramaters_pan intrinsic
    • +
    • pan/midgard: Implement load_sampler_lod_paramaters_pan
    • +
    • pan/midgard: Add LOD bias/clamp lowering
    • +
    • pan/midgard: Describe quirk MIDGARD_BROKEN_LOD
    • +
    • pan/midgard: Enable LOD lowering only on buggy chips
    • +
    • panfrost: Add lcra.c to Android.mk
    • +
    • pan/midgard: Use lower_tex_without_implicit_lod
    • +
    • panfrost: Add information about T720 tiling
    • +
    • panfrost: Implement pan_tiler for non-hierarchy GPUs
    • +
    • panfrost: Simplify draw_flags
    • +
    • pan/midgard: Splatter on fragment out
    • +
    • gitlab-ci: Remove non-default skips from Panfrost
    • +
    • panfrost: Remove blend shader hack
    • +
    • panfrost: Update SET_VALUE with information from igt
    • +
    • panfrost: Rename SET_VALUE to WRITE_VALUE
    • +
    • gallium/util: Support POLYGON in u_stream_outputs_for_vertices
    • +
    • pan/midgard: Move spilling code out of scheduler
    • +
    • pan/midgard: Split spill node selection/spilling
    • +
    • pan/midgard: Simplify spillability test
    • +
    • pan/midgard: Remove spill cost heuristic
    • +
    • pan/midgard: Move bounds checking into LCRA
    • +
    • pan/midgard: Remove consecutive_skip code
    • +
    • pan/midgard: Remove code marked "TODO: remove me"
    • +
    • pan/midgard: Dynamically allocate r26/27 for spills
    • +
    • pan/midgard: Use no_spill bitmask
    • +
    • pan/midgard: Don't use no_spill for memory spill src
    • +
    • pan/midgard: Force alignment for csel_v
    • +
    • pan/midgard: Don't try to free NULL in LCRA
    • +
    • pan/midgard: Simplify and fix vector copyprop
    • +
    • pan/midgard: Fix shift for TLS access
    • +
    • panfrost: Describe thread local storage sizing rules
    • +
    • panfrost: Rename unknown_address_0 -> scratchpad
    • +
    • panfrost: Split stack_shift nibble from unk0
    • +
    • panfrost: Add routines to calculate stack size/shift
    • +
    • panfrost: Factor out panfrost_query_raw
    • +
    • panfrost: Query core count and thread tls alloc
    • +
    • panfrost: Route stack_size from compiler
    • +
    • panfrost: Emit SFBD/MFBD after a batch, instead of before
    • +
    • panfrost: Handle minor cppcheck issues
    • +
    • pan/midgard: Remove unused ld/st packing hepers
    • +
    • pan/midgard: Handle misc. cppcheck warnings
    • +
    • panfrost: Calculate maximum stack_size per batch
    • +
    • panfrost: Pass size to panfrost_batch_get_scratchpad
    • +
    • pandecode: Add cast
    • +
    • panfrost: Move nir_undef_to_zero to Midgard compiler
    • +
    • panfrost: Move property queries to _encoder
    • +
    • panfrost: Add panfrost_model_name helper
    • +
    • panfrost: Report GPU name in es2_info
    • +
    • ci: Remove T760/T860 from CI temporarily
    • +
    • panfrost: Pass blend RT number through
    • +
    • pan/midgard: Add schedule barrier after fragment writeout
    • +
    • pan/midgard: Writeout per render target
    • +
    • pan/midgard: Fix liveness analysis with multiple epilogues
    • +
    • pan/midgard: Set r1.w magic
    • +
    • panfrost: Fix FBD issue
    • +
    • ci: Reinstate Panfrost CI
    • +
    • panfrost: Remove fbd_type enum
    • +
    • panfrost: Pack invocation_shifts manually instead of a bit field
    • +
    • panfrost: Remove asserts in panfrost_pack_work_groups_compute
    • +
    • panfrost: Simplify sampler upload condition
    • +
    • panfrost: Don't double-create scratchpad
    • +
    • panfrost: Add PAN_MESA_DEBUG=precompile for shader-db
    • +
    • panfrost: Let precompile imply shaderdb
    • +
    • panfrost: Handle empty shaders
    • +
    • pan/midgard: Use a reg temporary for mutiple writes
    • +
    • pan/midgard: Hoist temporary coordinate for cubemaps
    • +
    • pan/midgard: Set .shadow for shadow samplers
    • +
    • pan/midgard: Set Z to shadow comparator for 2D
    • +
    • pan/midgard: Add uniform/work heuristic
    • +
    • pan/midgard: Implement textureOffset for 2D textures
    • +
    • pan/midgard: Fix crash with txs
    • +
    • pan/midgard: Lower txd with lower_tex
    • +
    • panfrost: Decode shader types in pantrace shader-db
    • +
    • pan/decode: Skip COMPUTE in blobber-db
    • +
    • pan/decode: Prefix blobberdb with MESA_SHADER_*
    • +
    • pan/decode: Append 0:0 spills:fills to blobber-db
    • +
    • pan/midgard: Fix disassembler cycle/quadword counting
    • +
    • pan/midgard: Bounds check lcra_restrict_range
    • +
    • pan/midgard: Extend IS_VEC4_ONLY to arguments
    • +
    • pan/midgard: Clamp LOD register swizzle
    • +
    • pan/midgard: Expand swizzle for texelFetch
    • +
    • pan/midgard: Fix fallthrough from offset to comparator
    • +
    • pan/midgard: Do witchcraft on texture offsets
    • +
    • pan/midgard: Generalize temp coordinate to non-2D
    • +
    • pan/midgard: Implement shadow cubemaps
    • +
    • pan/midgard: Enable lower_(un)pack_* lowering
    • +
    • pan/midgard: Support loads from R11G11B10 in a blend shader
    • +
    • pan/midgard: Add mir_upper_override helper
    • +
    • pan/midgard: Compute destination override
    • +
    • panfrost: Rename pan_instancing.c -> pan_attributes.c
    • +
    • panfrost: Factor batch/resource out of instancing routines
    • +
    • panfrost: Move instancing routines to encoder/
    • +
    • panfrost: Factor out panfrost_compute_magic_divisor
    • +
    • panfrost: Fix off-by-one in pan_invocation.c
    • +
    • pan/decode: Fix reference computation for invocations
    • +
    • panfrost: Slight cleanup of Gallium's pan_attribute.c
    • +
    • panfrost: Remove pan_shift_odd
    • +
    • pan/decode: Handle gl_VertexID/gl_InstanceID
    • +
    • panfrost: Unset vertex_id_zero_based
    • +
    • pan/midgard: Factor out emit_attr_read
    • +
    • pan/midgard: Lower gl_VertexID/gl_InstanceID to attributes
    • +
    • panfrost: Extend attribute_count for vertex builtins
    • +
    • panfrost: Route gl_VertexID through cmdstream
    • +
    • pan/midgard: Fix minor typo
    • +
    • panfrost: Remove MALI_SPECIAL_ATTRIBUTE_BASE defines
    • +
    • panfrost: Update information on fixed attributes/varyings
    • +
    • panfrost: Remove MALI_ATTR_INTERNAL
    • +
    • panfrost: Inline away MALI_NEGATIVE
    • +
    • panfrost: Implement remaining texture wrap modes
    • +
    • panfrost: Add pan_attributes.c to Android.mk
    • +
    • panfrost: Add missing #include in common header
    • +
    • panfrost: Remove mali_alt_func
    • +
    • panfrost; Update comment about work/uniform_count
    • +
    • panfrost: Remove 32-bit next_job path
    • +
    • glsl: Set .flat for gl_FrontFacing
    • +
    • pan/midgard: Promote tilebuffer reads to 32-bit
    • +
    • pan/midgard: Use type-appropriate st_vary
    • +
    • pan/midgard: Implement flat shading
    • +
    • panfrost: Identify glProvokingVertex flag
    • +
    • panfrost: Disable some CAPs we want lowered
    • +
    • panfrost: Implement integer varyings
    • +
    • panfrost: Remove MRT indirection in blend shaders
    • +
    • panfrost: Respect glPointSize()
    • +
    • pan/midgard: Convert fragment writeout to proper branches
    • +
    • pan/midgard: Remove prepacked_branch
    • +
    • panfrost: Handle RGB16F colour clear
    • +
    • panfrost: Pack MRT blend shaders into a single BO
    • +
    • pan/midgard: Fix memory corruption in constant combining
    • +
    • pan/midgard: Use better heuristic for shader termination
    • +
    • pan/midgard: Generalize IS_ALU and quadword_size
    • +
    • pan/midgard: Generate MRT writeout loops
    • +
    • pan/midgard: Remove old comment
    • +
    • pan/midgard: Identity ld_color_buffer as 32-bit
    • +
    • pan/midgard: Use upper ALU tags for MFBD writeout
    • +
    • panfrost: Texture from Z32F_S8 as R32F
    • +
    • panfrost: Support rendering to non-zero Z/S layers
    • +
    • panfrost: Implement sRGB blend shaders
    • +
    • panfrost: Cleanup tiling selection logic
    • +
    • panfrost: Report MSAA 4x supported for dEQP
    • +
    • panfrost: Handle PIPE_FORMAT_R10G10B10A2_USCALED
    • +
    • panfrost: Respect constant buffer_offset
    • +
    • panfrost: Adjust for mismatch between hardware/Gallium in arrays/cube
    • +
    • pan/midgard: Account for z/w flip in texelFetch
    • +
    • panfrost: Don't double-flip Z/W for 2D arrays
    • +
    • pan/midgard: Support indirect UBO offsets
    • +
    • panfrost: Fix linear depth textures
    • +
    • pan/midgard: Bytemasks should round up, not round down
    • +
    • panfrost: Identify un/pack colour opcodes
    • +
    • pan/midgard: Fix recursive csel scheduling
    • +
    • panfrost: Expose some functionality with dEQP flag
    • +
    • panfrost: Compile tiling routines with -O3
    • +
    • panfrost,lima: De-Galliumize tiling routines
    • +
    • panfrost: Rework linear<--->tiled conversions
    • +
    • panfrost: Add pandecode entries for ASTC/ETC formats
    • +
    • panfrost: Fix crash in compute variant allocation
    • +
    • panfrost: Drop mysterious zero=0xFFFF field
    • +
    • panfrost: Don't use implicit mali_exception_status enum
    • +
    • pan/decode: Remove last_size
    • +
    • pan/midgard: Remove pack_color define
    • +
    • pan/decode: Remove SHORT_SLIDE indirection
    • +
    • panfrost: Fix 32-bit warning for `indices`
    • +
    • pan/decode: Drop MFBD compute shader stuff
    • +
    • pan/midgard: Record TEXTURE_OP_BARRIER
    • +
    • pan/midgard: Disassemble barrier instructions
    • +
    • pan/midgard: Validate barriers use a barrier tag
    • +
    • pan/midgard: Handle tag 0x4 as texture
    • +
    • pan/midgard: Remove float_bitcast
    • +
    • pan/midgard: Fix missing prefixes
    • +
    • pan/midgard: Don't crash with constants on unknown ops
    • +
    • pan/midgard: Use fprintf instead of printf for constants
    • +

      +

      Andreas Baierl (14):

      +
    • lima: Beautify stream dumps
    • +
    • lima: Parse VS and PLBU command stream while making a dump
    • +
    • lima/streamparser: Fix typo in vs semaphore parser
    • +
    • lima/streamparser: Add findings introduced with gl_PointSize
    • +
    • lima/parser: Some fixes and cleanups
    • +
    • lima/parser: Add RSW parsing
    • +
    • lima/parser: Add texture descriptor parser
    • +
    • lima: Rotate dump files after each finished pp frame
    • +
    • lima: Fix dump file creation
    • +
    • lima/parser: Fix rsw parser
    • +
    • lima/parser: Fix VS cmd stream parser
    • +
    • lima/parser: Make rsw alpha blend parsing more readable
    • +
    • lima: Add stencil support
    • +
    • lima: Fix alpha blending
    • +

      +

      Andres Rodriguez (1):

      +
    • vulkan/wsi: disable the hardware cursor
    • +

      +

      Andrii Simiklit (5):

      +
    • main: fix several 'may be used uninitialized' warnings
    • +
    • glsl: fix an incorrect max_array_access after optimization of ssbo/ubo
    • +
    • glsl: fix a binding points assignment for ssbo/ubo arrays
    • +
    • glsl/nir: do not change an element index to have correct block name
    • +
    • mesa/st: fix a memory leak in get_version
    • +

      +

      Anthony Pesch (5):

      +
    • util: import xxhash
    • +
    • util: move fnv1a hash implementation into its own header
    • +
    • util/hash_table: replace _mesa_hash_data's fnv1a hash function with xxhash
    • +
    • util/hash_table: added hash functions for integer types
    • +
    • util/hash_table: update users to use new optimal integer hash functions
    • +

      +

      Anuj Phogat (2):

      +
    • intel: Add device info for 1x4x6 Jasper Lake
    • +
    • intel: Add pci-ids for Jasper Lake
    • +

      +

      Arno Messiaen (5):

      +
    • lima: fix stride in texture descriptor
    • +
    • lima: add layer_stride field to lima_resource struct
    • +
    • lima: introduce ppir_op_load_coords_reg to differentiate between loading texture coordinates straight from a varying vs loading them from a register
    • +
    • lima: add cubemap support
    • +
    • lima/ppir: add lod-bias support
    • +

      +

      Bas Nieuwenhuizen (33):

      +
    • radv: Fix timeout handling in syncobj wait.
    • +
    • radv: Remove _mesa_locale_init/fini calls.
    • +
    • turnip: Remove _mesa_locale_init/fini calls.
    • +
    • anv: Remove _mesa_locale_init/fini calls.
    • +
    • radv: Fix disk_cache_get size argument.
    • +
    • radv: Close all unnecessary fds in secure compile.
    • +
    • radv: Do not change scratch settings while shaders are active.
    • +
    • radv: Allocate cmdbuffer space for buffer marker write.
    • +
    • radv: Enable VK_KHR_buffer_device_address.
    • +
    • amd/llvm: Refactor ac_build_scan.
    • +
    • radv: Unify max_descriptor_set_size.
    • +
    • radv: Fix timeline semaphore refcounting.
    • +
    • radv: Fix RGBX Android<->Vulkan format correspondence.
    • +
    • amd/common: Fix tcCompatible degradation on Stoney.
    • +
    • amd/common: Always use addrlib for HTILE tc-compat.
    • +
    • radv: Limit workgroup size to 1024.
    • +
    • radv: Expose all sample counts for integer formats as well.
    • +
    • amd/common: Handle alignment of 96-bit formats.
    • +
    • nir: Add clone/hash/serialize support for non-uniform tex instructions.
    • +
    • nir: print non-uniform tex fields.
    • +
    • amd/common: Always initialize gfx9 mipmap offset/pitch.
    • +
    • turnip: Use VK_NULL_HANDLE instead of NULL.
    • +
    • meson: Enable -Werror=int-conversion.
    • +
    • Revert "amd/common: Always initialize gfx9 mipmap offset/pitch."
    • +
    • radv: Only use the gfx mipmap level offset/pitch for linear textures.
    • +
    • spirv: Fix glsl type assert in spir2nir.
    • +
    • radv: Emit a BATCH_BREAK when changing pixel shaders or CB_TARGET_MASK.
    • +
    • radv: Use new scanout gfx9 metadata flag.
    • +
    • radv: Disable VK_EXT_sample_locations on GFX10.
    • +
    • radv: Remove syncobj_handle variable in header.
    • +
    • radv: Expose VK_KHR_swapchain_mutable_format.
    • +
    • radv: Allow DCC & TC-compat HTILE with VK_IMAGE_CREATE_EXTENDED_USAGE_BIT.
    • +
    • radv: Do not set SX DISABLE bits for RB+ with unused surfaces.
    • +

      +

      Ben Crocker (1):

      +
    • llvmpipe: use ppc64le/ppc64 Large code model for JIT-compiled shaders
    • +

      +

      Bernd Kuhls (1):

      +
    • util/os_socket: Include unistd.h to fix build error
    • +

      +

      Boris Brezillon (21):

      +
    • panfrost: MALI_DEPTH_TEST is actually MALI_DEPTH_WRITEMASK
    • +
    • panfrost: Destroy the upload manager allocated in panfrost_create_context()
    • +
    • panfrost: Release the ctx->pipe_framebuffer ref
    • +
    • panfrost: Move BO cache related fields to a sub-struct
    • +
    • panfrost: Try to evict unused BOs from the cache
    • +
    • gallium: Fix the ->set_damage_region() implementation
    • +
    • panfrost: Make sure we reset the damage region of RTs at flush time
    • +
    • panfrost: Remove unneeded phi nodes
    • +
    • panfrost/midgard: Fix swizzle for store instructions
    • +
    • panfrost/midgard: Print the actual source register for store operations
    • +
    • panfrost/midgard: Use a union to manipulate embedded constants
    • +
    • panfrost/midgard: Rework mir_adjust_constants() to make it type/size agnostic
    • +
    • panfrost/midgard: Make sure promote_fmov() only promotes 32-bit imovs
    • +
    • panfrost/midgard: Factorize f2f and u2u handling
    • +
    • panfrost/midgard: Add f2f64 support
    • +
    • panfrost/midgard: Fix mir_print_instruction() for branch instructions
    • +
    • panfrost/midgard: Add 64 bits float <-> int converters
    • +
    • panfrost/midgard: Add missing lowering passes for type/size conversion ops
    • +
    • panfrost/midgard: Add a condense_writemask() helper
    • +
    • panfrost/midgard: Prettify embedded constant prints
    • +
    • panfrost: Fix the damage box clamping logic
    • +

      +

      Brian Ho (14):

      +
    • turnip: Update tu_query_pool with turnip-specific fields
    • +
    • turnip: Implement vkCreateQueryPool for occlusion queries
    • +
    • turnip: Implement vkCmdBeginQuery for occlusion queries
    • +
    • turnip: Implement vkCmdEndQuery for occlusion queries
    • +
    • turnip: Update query availability on render pass end
    • +
    • turnip: Implement vkGetQueryPoolResults for occlusion queries
    • +
    • turnip: Implement vkCmdResetQueryPool
    • +
    • turnip: Implement vkCmdCopyQueryPoolResults for occlusion queries
    • +
    • anv: Properly fetch partial results in vkGetQueryPoolResults
    • +
    • anv: Handle unavailable queries in vkCmdCopyQueryPoolResults
    • +
    • turnip: Enable occlusionQueryPrecise
    • +
    • turnip: Free event->bo on vkDestroyEvent
    • +
    • turnip: Fix vkGetQueryPoolResults with available flag
    • +
    • turnip: Fix vkCmdCopyQueryPoolResults with available flag
    • +

      +

      Brian Paul (4):

      +
    • s/APIENTRY/GLAPIENTRY/ in teximage.c
    • +
    • nir: fix a couple signed/unsigned comparison warnings in nir_builder.h
    • +
    • Call shmget() with permission 0600 instead of 0777
    • +
    • nir: no-op C99 _Pragma() with MSVC
    • +

      +

      C Stout (1):

      +
    • util/vector: Fix u_vector_foreach when head rolls over
    • +

      +

      Caio Marcelo de Oliveira Filho (24):

      +
    • spirv: Don't leak GS initialization to other stages
    • +
    • glsl: Check earlier for MaxShaderStorageBlocks and MaxUniformBlocks
    • +
    • glsl: Check earlier for MaxTextureImageUnits and MaxImageUniforms
    • +
    • anv: Initialize depth_bounds_test_enable when not explicitly set
    • +
    • spirv: Consider the sampled_image case in wa_glslang_179 workaround
    • +
    • intel/fs: Lower 64-bit MOVs after lower_load_payload()
    • +
    • intel/fs: Fix lowering of dword multiplication by 16-bit constant
    • +
    • intel/vec4: Fix lowering of multiplication by 16-bit constant
    • +
    • anv/gen12: Temporarily disable VK_KHR_buffer_device_address (and EXT)
    • +
    • spirv: Implement SPV_KHR_non_semantic_info
    • +
    • panfrost: Fix Makefile.sources
    • +
    • anv: Drop unused function parameter
    • +
    • anv: Ignore some CreateInfo structs when rasterization is disabled
    • +
    • intel/fs: Only use SLM fence in compute shaders
    • +
    • spirv: Drop EXT for PhysicalStorageBuffer symbols
    • +
    • spirv: Handle PhysicalStorageBuffer in memory barriers
    • +
    • nir: Add missing nir_var_mem_global to various passes
    • +
    • intel/fs: Add FS_OPCODE_SCHEDULING_FENCE
    • +
    • intel/fs: Add workgroup_size() helper
    • +
    • intel/fs: Don't emit fence for shared memory if only one thread is used
    • +
    • intel/fs: Don't emit control barrier if only one thread is used
    • +
    • anv: Always initialize target_stencil_layout
    • +
    • intel/compiler: Add names for SHADER_OPCODE_[IU]SUB_SAT
    • +
    • nir: Make nir_deref_path_init skip trivial casts
    • +

      +

      Chris Wilson (1):

      +
    • egl: Mention if swrast is being forced
    • +

      +

      Christian Gmeiner (24):

      +
    • drm-shim: fix EOF case
    • +
    • etnaviv: rs: upsampling is not supported
    • +
    • etnaviv: add drm-shim
    • +
    • etnaviv: drop not used config_out function param
    • +
    • etnaviv: use a more self-explanatory param name
    • +
    • etnaviv: handle 8 byte block in tiling
    • +
    • etnaviv: add support for extended pe formats
    • +
    • etnaviv: fix integer vertex formats
    • +
    • etnaviv: use NORMALIZE_SIGN_EXTEND
    • +
    • etnaviv: fix R10G10B10A2 vertex format entries
    • +
    • etnaviv: handle integer case for GENERIC_ATTRIB_SCALE
    • +
    • etnaviv: remove dead code
    • +
    • etnaviv: remove not used etna_bits_ones(..)
    • +
    • etnaviv: drop compiled_rs_state forward declaration
    • +
    • etnaviv: update resource status after flushing
    • +
    • gallium: add PIPE_CAP_MAX_VERTEX_BUFFERS
    • +
    • etnaviv: check if MSAA is supported
    • +
    • etnaviv: gc400 does not support any vertex sampler
    • +
    • etnaviv: use a better name for FE_VERTEX_STREAM_UNK14680
    • +
    • etnaviv: move state based texture structs
    • +
    • etnaviv: move descriptor based texture structs
    • +
    • etnaviv: add deqp debug option
    • +
    • etnaviv: drop default state for PE_STENCIL_CONFIG_EXT2
    • +
    • etnaviv: drm-shim: add GC400
    • +

      +

      Connor Abbott (19):

      +
    • nir: Fix non-determinism in lower_global_vars_to_local
    • +
    • radv: Rename ac_arg_regfile
    • +
    • ac: Add a shared interface between radv, radeonsi, LLVM and ACO
    • +
    • ac/nir, radv, radeonsi: Switch to using ac_shader_args
    • +
    • radv: Move argument declaration out of nir_to_llvm
    • +
    • aco: Constify radv_nir_compiler_options in isel
    • +
    • aco: Use radv_shader_args in aco_compile_shader()
    • +
    • aco: Split vector arguments at the beginning
    • +
    • aco: Make num_workgroups and local_invocation_ids one argument each
    • +
    • radv: Replace supports_spill with explict_scratch_args
    • +
    • aco: Use common argument handling
    • +
    • aco: Make unused workgroup id's 0
    • +
    • nir: Maintain the algebraic automaton's state as we work.
    • +
    • a6xx: Add more CP packets
    • +
    • freedreno: Use new macros for CP_WAIT_REG_MEM and CP_WAIT_MEM_GTE
    • +
    • freedreno: Fix CP_MEM_TO_REG flag definitions
    • +
    • freedreno: Document CP_COND_REG_EXEC more
    • +
    • freedreno: Document CP_UNK_A6XX_55
    • +
    • freedreno: Document CP_INDIRECT_BUFFER_CHAIN
    • +

      +

      Daniel Ogorchock (2):

      +
    • panfrost: Fix panfrost_bo_access memory leak
    • +
    • panfrost: Fix headers and gpu_headers memory leak
    • +

      +

      Daniel Schürmann (58):

      +
    • aco: fix immediate offset for spills if scratch is used
    • +
    • aco: only use single-dword loads/stores for spilling
    • +
    • aco: fix accidential reordering of instructions when scheduling
    • +
    • aco: workaround Tonga/Iceland hardware bug
    • +
    • aco: fix invalid access on Pseudo_instructions
    • +
    • aco: preserve kill flag on moved operands during RA
    • +
    • aco: rematerialize s_movk instructions
    • +
    • aco: check if SALU instructions are predeceeded by exec when calculating WQM needs
    • +
    • aco: value number instructions using the execution mask
    • +
    • aco: use s_and_b64 exec to reduce uniform booleans to one bit
    • +
    • amd/llvm: Add Subgroup Scan functions for SI
    • +
    • radv: Enable Subgroup Arithmetic and Clustered for SI
    • +
    • aco: don't value-number instructions from within a loop with ones after the loop.
    • +
    • aco: don't split live-ranges of linear VGPRs
    • +
    • aco: fix a couple of value numbering issues
    • +
    • aco: refactor visit_store_fs_output() to use the Builder
    • +
    • aco: Initial GFX7 Support
    • +
    • aco: SI/CI - fix sampler aniso
    • +
    • aco: fix SMEM offsets for SI/CI
    • +
    • aco: implement nir_op_fquantize2f16 for SI/CI
    • +
    • aco: only use scalar loads for readonly buffers on SI/CI
    • +
    • aco: implement nir_op_isign on SI/CI
    • +
    • aco: move buffer_store data to VGPR if needed
    • +
    • aco: implement quad swizzles for SI/CI
    • +
    • aco: recognize SI/CI SMRD hazards
    • +
    • aco: fix disassembly of writelane instructions.
    • +
    • aco: split read/writelane opcode into VOP2/VOP3 version for SI/CI
    • +
    • aco: implement 64bit VGPR shifts for SI/CI
    • +
    • aco: make 1/2*PI a literal constant on SI/CI
    • +
    • aco: implement 64bit i2b for SI /CI
    • +
    • aco: implement 64bit ine/ieq for SI/CI
    • +
    • aco: disable disassembly for SI/CI due to lack of support by LLVM
    • +
    • radv: only flush scalar cache for SSBO writes with ACO on GFX8+
    • +
    • aco: flush denorms after fmin/fmax on pre-GFX9
    • +
    • aco: don't use a scalar temporary for reductions on GFX10
    • +
    • aco: implement (clustered) reductions for SI/CI
    • +
    • aco: implement inclusive_scan for SI/CI
    • +
    • aco: implement exclusive scan for SI/CI
    • +
    • radv: disable Youngblood app profile if ACO is used
    • +
    • aco: return to loop_active mask at continue_or_break blocks
    • +
    • radv: Enable ACO on GFX7 (Sea Islands)
    • +
    • aco: use soffset for MUBUF instructions on SI/CI
    • +
    • aco: improve readfirstlane after uniform ssbo loads on GFX7
    • +
    • aco: propagate temporaries into expanded vectors
    • +
    • nir: fix printing of var_decl with more than 4 components.
    • +
    • aco: compact various Instruction classes
    • +
    • aco: compact aco::span<T> to use uint16_t offset and size instead of pointer and size_t.
    • +
    • aco: fix unconditional demote_to_helper
    • +
    • aco: rework lower_to_cssa()
    • +
    • aco: handle phi affinities transitively through parallelcopies
    • +
    • aco: ignore parallelcopies to the same register on jump threading
    • +
    • aco: fix combine_salu_not_bitwise() when SCC is used
    • +
    • aco: reorder VMEM operands in ACO IR
    • +
    • aco: fix register allocation with multiple live-range splits
    • +
    • aco: simplify adjust_sample_index_using_fmask() & get_image_coords()
    • +
    • aco: simplify gathering of MIMG address components
    • +
    • docs: add new features for RADV/ACO.
    • +
    • aco: fix image_atomic_cmp_swap
    • +

      +

      Daniel Stone (2):

      +
    • Revert "st/dri: do FLUSH_VERTICES before calling flush_resource"
    • +
    • Revert "gallium: add st_context_iface::flush_resource to call FLUSH_VERTICES"
    • +

      +

      Danylo Piliaiev (12):

      +
    • intel/blorp: Fix usage of uninitialized memory in key hashing
    • +
    • i965/program_cache: Lift restriction on shader key size
    • +
    • intel/blorp: Fix usage of uninitialized memory in key hashing
    • +
    • intel/fs: Do not lower large local arrays to scratch on gen7
    • +
    • i965: Unify CC_STATE and BLEND_STATE atoms on Haswell as a workaround
    • +
    • glsl: Add varyings to "zero-init of uninitialized vars" workaround
    • +
    • drirc: Add glsl_zero_init workaround for GpuTest
    • +
    • iris/query: Implement PIPE_QUERY_GPU_FINISHED
    • +
    • iris: Fix value of out-of-bounds accesses for vertex attributes
    • +
    • i965: Do not set front_buffer_dirty if there is no front buffer
    • +
    • st/mesa: Handle the rest renderbuffer formats from OSMesa
    • +
    • st/nir: Unify inputs_read/outputs_written before serializing NIR
    • +

      +

      Dave Airlie (74):

      +
    • nir/serialize: pack function has name and entry point into flags.
    • +
    • nir/serialize: fix serializing functions with no implementations.
    • +
    • spirv: don't store 0 to cs.ptr_size for non kernel stages.
    • +
    • spirv: get the correct type for function returns.
    • +
    • spirv/nir/opencl: handle some multiply instructions.
    • +
    • nir: add 64-bit ufind_msb lowering support. (v2)
    • +
    • nouveau: request ufind_msb64 lowering in the frontend.
    • +
    • vtn/opencl: add clz support
    • +
    • nir: fix deref offset builder
    • +
    • llvmpipe: initial query buffer object support. (v2)
    • +
    • docs: add llvmpipe to ARB_query_buffer_object.
    • +
    • gallivm: split out the flow control ir to a common file.
    • +
    • gallivm: nir->tgsi info convertor (v2)
    • +
    • gallivm: add popcount intrinsic wrapper
    • +
    • gallivm: add cttz wrapper
    • +
    • gallivm: add selection for non-32 bit types
    • +
    • gallivm: add nir->llvm translation (v2)
    • +
    • draw: add nir info gathering and building support
    • +
    • gallium: add nir lowering passes for the draw pipe stages. (v2)
    • +
    • gallivm: add swizzle support where one channel isn't defined.
    • +
    • llvmpipe: add initial nir support
    • +
    • nir/samplers: don't zero samplers_used/txf.
    • +
    • llvmpipe/images: handle undefined atomic without crashing
    • +
    • gallivm/llvmpipe: add support for front facing in sysval.
    • +
    • llvmpipe: enable texcoord semantics
    • +
    • gallium/scons: fix graw-xlib build on OSX.
    • +
    • llvmpipe: add queries disabled flag
    • +
    • llvmpipe: disable occlusion queries when requested by state tracker
    • +
    • draw: add support for collecting primitives generated outside streamout
    • +
    • llvmpipe: enable support for primitives generated outside streamout
    • +
    • aco: handle gfx7 int8/10 clamping on exports
    • +
    • gallivm: add bitfield reverse and ufind_msb
    • +
    • llvmpipe/nir: handle texcoord requirements
    • +
    • gallivm: fix transpose for when first channel isn't created
    • +
    • gallivm: fix perspective enable if usage_mask doesn't have 0 bit set
    • +
    • gallivm/nir: cleanup code and call cmp wrapper
    • +
    • gallivm/nir: copy compare ordering code from tgsi
    • +
    • gallivm: add base instance sysval support
    • +
    • gallivm/draw: add support for draw_id system value.
    • +
    • gallivm: fixup base_vertex support
    • +
    • llvmpipe: enable ARB_shader_draw_parameters.
    • +
    • vtn: convert vload/store to single value loops
    • +
    • vtn/opencl: add shuffle/shuffle support
    • +
    • gallivm/nir: wrap idiv to avoid divide by 0 (v2)
    • +
    • llvmpipe: switch to NIR by default
    • +
    • nir: sanitize work group intrinsics to always be 32-bit.
    • +
    • gallivm: add 64-bit const int creator.
    • +
    • llvmpipe/gallivm: add kernel inputs
    • +
    • gallivm: add support for 8-bit/16-bit integer builders
    • +
    • gallivm: pick integer builders for alu instructions.
    • +
    • gallivm/nir: allow 8/16-bit conversion and comparison.
    • +
    • tgsi/mesa: handle KERNEL case
    • +
    • gallivm/llvmpipe: add support for work dimension intrinsic.
    • +
    • gallivm/llvmpipe: add support for block size intrinsic
    • +
    • gallivm/llvmpipe: add support for global operations.
    • +
    • llvmpipe: handle serialized nir as a shader type.
    • +
    • llvmpipe: add support for compute shader params
    • +
    • llvmpipe/nir: use nir_max_vec_components in more places
    • +
    • gallivm: handle non-32 bit undefined
    • +
    • llvmpipe: lower hadd/add_sat
    • +
    • gallivm/nir: lower packing
    • +
    • gallivm/nir: add vec8/16 support
    • +
    • llvmpipe: add debug option to enable OpenCL support.
    • +
    • gallivm: fixup const int64 builder.
    • +
    • llvmpipe: enable ARB_shader_group_vote.
    • +
    • gallium/util: add multi_draw_indirect to util_draw_indirect.
    • +
    • llvmpipe: enable driver side multi draw indirect
    • +
    • llvmpipe: add support for ARB_indirect_parameters.
    • +
    • llvmpipe: add ARB_derivative_control support
    • +
    • gallivm: fix gather component handling.
    • +
    • llvmpipe: fix some integer instruction lowering.
    • +
    • galllivm: fix gather offset casting
    • +
    • gallivm: fix find lsb
    • +
    • gallivm/nir: add missing break for isub.
    • +

      +

      David Heidelberg (1):

      +
    • .mailmap: use correct email address
    • +

      +

      David Stevens (1):

      +
    • virgl: support emulating planar image sampling
    • +

      +

      Denis Pauk (2):

      +
    • gallium/swr: Enable support bptc format.
    • +
    • docs/features: mark GL_ARB_texture_compression_bptc as done for llvmpipe, softpipe, swr
    • +

      +

      Dongwon Kim (3):

      +
    • gallium: enable INTEL_PERFORMANCE_QUERY
    • +
    • iris: INTEL performance query implementation
    • +
    • gallium: check all planes' pipe formats in case of multi-samplers
    • +

      +

      Drew Davenport (1):

      +
    • radeonsi: Clear uninitialized variable
    • +

      +

      Drew DeVault (1):

      +
    • st_get_external_sampler_key: improve error message
    • +

      +

      Duncan Hopkins (1):

      +
    • zink: make sure src image is transfer-src-optimal
    • +

      +

      Dylan Baker (69):

      +
    • Bump VERSION to 20.0.0-devel
    • +
    • docs/new_features: Empty the feature list for the 20.0 cycle
    • +
    • nir: correct use of identity check in python
    • +
    • r200: use preprocessor for big vs little endian checks
    • +
    • r100: Use preprocessor to select big vs little endian paths
    • +
    • dri/osmesa: use preprocessor for selecting endian code paths
    • +
    • util/u_endian: Use _WIN32 instead of _MSC_VER
    • +
    • util/u_endian: set PIPE_ARCH_*_ENDIAN to 1
    • +
    • mesa/main: replace uses of _mesa_little_endian with preprocessor
    • +
    • mesa/swrast: replace instances of _mesa_little_endian with preprocessor
    • +
    • mesa/main: delete now unused _mesa_little_endian
    • +
    • gallium/osmesa: Use PIPE_ARCH_*_ENDIAN instead of little_endian function
    • +
    • util: rename PIPE_ARCH_*_ENDIAN to UTIL_ARCH_*_ENDIAN
    • +
    • util/u_endian: Add error checks
    • +
    • meson: Add dep_glvnd to egl deps when building with glvnd
    • +
    • docs: add release notes for 19.2.3
    • +
    • docs: add sha256 sum to 19.2.3 release notes
    • +
    • docs: update calendar, add news item and link release notes for 19.2.2
    • +
    • meson: gtest needs pthreads
    • +
    • gallium/osmesa: Convert osmesa test to gtest
    • +
    • osmesa/tests: Extend render test to cover other working cases
    • +
    • util: Use ZSTD for shader cache if possible
    • +
    • docs: Add release notes for 19.2.4
    • +
    • docs: Add SHA256 sum for for 19.2.4
    • +
    • docs: update calendar, add news item and link release notes for 19.2.4
    • +
    • docs: Add relnotes for 19.2.5
    • +
    • docs/relnotes/19.2.5: Add SHA256 sum
    • +
    • docs: update calendar, add news item and link release notes for 19.2.5
    • +
    • docs/release-calendar: Update for extended 19.3 rc period
    • +
    • docs: Add release notes for 19.2.6
    • +
    • docs: Add SHA256 sum for 19.2.6
    • +
    • docs: update calendar, add news item and link release notes for 19.2.6
    • +
    • gallium/auxiliary: Fix uses of gnu struct = {} extension
    • +
    • meson: Add -Werror=gnu-empty-initializer to MSVC compat args
    • +
    • docs: Add release notes for 19.2.7
    • +
    • docs: Add SHA256 sums for 19.2.7
    • +
    • docs: update calendar, add news item and link release notes for 19.2.7
    • +
    • docs: Update mesa 19.3 release calendar
    • +
    • meson/broadcom: libbroadcom_cle needs expat headers
    • +
    • meson/broadcom: libbroadcom_cle also needs zlib
    • +
    • docs: add release notes for 19.3.0
    • +
    • docs/19.3.0: Add SHA256 sums
    • +
    • docs: Update release notes, index, and calendar for 19.3.0
    • +
    • dcos: add releanse notes for 19.3.1
    • +
    • docs: Add release notes, update calendar, and add news for 19.3.1
    • +
    • docs: add relnotes for 19.2.8
    • +
    • docs/relnotes/19.2.8: Add SHA256 sum
    • +
    • docs: Add release notes, news, and update calendar for 19.2.8
    • +
    • docs: Add release notes for 19.3.2
    • +
    • docs: add SHA256 sums for 19.3.2
    • +
    • docs: Add release notes for 19.3.2, update calendar and home page
    • +
    • docs: Update release calendar for 20.0
    • +
    • docs: Add relnotes for 19.3.3 release
    • +
    • docs: Add SHA 256 sums for 19.3.3
    • +
    • docs: update news, calendar, and link release notes for 19.3.3
    • +
    • VERSION: bump to 20.0.0-rc1
    • +
    • bin/pick-ui: Add a new maintainer script for picking patches
    • +
    • .pick_status.json: Update to 0d14f41625fa00187f690f283c1eb6a22e354a71
    • +
    • .pick_status.json: Update to b550b7ef3b8d12f533b67b1a03159a127a3ff34a
    • +
    • .pick_status.json: Update to 9afdcd64f2c96f3fcc1a28912987f2e8066aa995
    • +
    • .pick_status.json: Update to 7eaf21cb6f67adbe0e79b80b4feb8c816a98a720
    • +
    • VERSION: bump to 20.0-rc2
    • +
    • .pick_status.json: Update to d8bae10bfe0f487dcaec721743cd51441bcc12f5
    • +
    • .pick_status.json: Update to 689817c9dfde9a0852f2b2489cb0fa93ffbcb215
    • +
    • .pick_status.json: Update to 23037627359e739c42b194dec54875aefbb9d00b
    • +
    • VERSION: bump for 20.0.0-rc3
    • +
    • .pick_status.json: Update to 2a98cf3b2ecea43cea148df7f77d2abadfd1c9db
    • +
    • .pick_status.json: Update to 946eacbafb47c8b94d47e7c9d2a8b02fff5a22fa
    • +
    • .pick_status.json: Update to bee5c9b0dc13dbae0ccf124124eaccebf7f2a435
    • +

      +

      Eduardo Lima Mitev (2):

      +
    • turnip: Remove failed command buffer from pool
    • +
    • turnip: Fix issues in tu_compute_pipeline_create() that may lead to crash
    • +

      +

      Elie Tournier (4):

      +
    • Docs: remove duplicate meson docs for windows
    • +
    • docs: fix ascii html representation
    • +
    • nir/algebraic: i2f(f2i()) -> trunc()
    • +
    • nir/algebraic: sqrt(x)*sqrt(x) -> fabs(x)
    • +

      +

      Emmanuel Gil Peyrot (1):

      +
    • intel/compiler: Return early if read() failed
    • +

      +

      Eric Anholt (102):

      +
    • ci: Make lava inherit the ccache setup of the .build script.
    • +
    • ci: Switch over to an autoscaling GKE cluster for builds.
    • +
    • Revert "ci: Switch over to an autoscaling GKE cluster for builds."
    • +
    • mesa/st: Add mapping of MESA_FORMAT_RGB_SNORM16 to gallium.
    • +
    • gallium: Add defines for FXT1 texture compression.
    • +
    • gallium: Add some more channel orderings of packed formats.
    • +
    • gallium: Add an equivalent of MESA_FORMAT_BGR_UNORM8.
    • +
    • gallium: Add equivalents of packed MESA_FORMAT_*UINT formats.
    • +
    • mesa: Stop defining a full separate format for RGBA_UINT8.
    • +
    • mesa/st: Test round-tripping of all compressed formats.
    • +
    • mesa: Prepare for the MESA_FORMAT_* enum to be sparse.
    • +
    • mesa: Redefine MESA_FORMAT_* in terms of PIPE_FORMAT_*.
    • +
    • mesa/st: Gut most of st_mesa_format_to_pipe_format().
    • +
    • mesa/st: Make st_pipe_format_to_mesa_format an effective no-op.
    • +
    • u_format: Fix swizzle of A1R5G5B5.
    • +
    • ci: Use several debian buster packages instead of hand-building.
    • +
    • ci: Make the skip list regexes match the full test name.
    • +
    • ci: Use cts_runner for our dEQP runs.
    • +
    • ci: Enable all of GLES3/3.1 testing for softpipe.
    • +
    • ci: Remove old commented copy of freedreno artifacts.
    • +
    • ci: Disable flappy blit tests on a630.
    • +
    • ci: Expand the freedreno blit skip regex to cover more cases.
    • +
    • util: Move gallium's PIPE_FORMAT utils to /util/format/
    • +
    • mesa: Move compile of common Mesa core files to a static lib.
    • +
    • mesa/st: Simplify st_choose_matching_format().
    • +
    • mesa: Don't put sRGB formats in the array format table.
    • +
    • mesa/st: Reuse st_choose_matching_format from st_choose_format().
    • +
    • util: Add a mapping from VkFormat to PIPE_FORMAT.
    • +
    • turnip: Drop the copy of the formats table.
    • +
    • ci: Move freedreno's parallelism to the runner instead of gitlab-ci jobs.
    • +
    • ci: Use a tag from the parallel-deqp-runner repo.
    • +
    • nir: Add a scheduler pass to reduce maximum register pressure.
    • +
    • nir: Refactor algebraic's block walk
    • +
    • nir: Make algebraic backtrack and reprocess after a replacement.
    • +
    • freedreno: Introduce a fd_resource_layer_stride() helper.
    • +
    • freedreno: Introduce a fd_resource_tile_mode() helper.
    • +
    • freedreno: Introduce a resource layout header.
    • +
    • freedreno: Convert the slice struct to the new resource header.
    • +
    • freedreno/a6xx: Log the tiling mode in resource layout debug.
    • +
    • turnip: Disable timestamp queries for now.
    • +
    • turnip: Fix unused variable warnings.
    • +
    • turnip: Drop redefinition of VALIDREG now that it's in ir3.h.
    • +
    • turnip: Reuse tu6_stage2opcode() more.
    • +
    • turnip: Add basic SSBO support.
    • +
    • turnip: Refactor the graphics pipeline create implementation.
    • +
    • turnip: Add a helper function for getting tu_buffer iovas.
    • +
    • turnip: Sanity check that we're adding valid BOs to the list.
    • +
    • turnip: Move pipeline BO list adding to BindPipeline.
    • +
    • turnip: Add support for compute shaders.
    • +
    • ci: Disable egl_ext_device_drm tests in piglit.
    • +
    • freedreno: Enable texture upload memory throttling.
    • +
    • freedreno: Stop forcing ALLOW_MAPPED_BUFFERS_DURING_EXEC off.
    • +
    • freedreno: Track the set of UBOs to be uploaded in UBO analysis.
    • +
    • freedreno: Drop the extra offset field for mipmap slices.
    • +
    • freedreno: Refactor the UBWC flags registers emission.
    • +
    • freedreno: Move UBWC layout into a slices array like the non-UBWC slices.
    • +
    • tu: Move our image layout into a freedreno_layout struct.
    • +
    • freedreno: Move a6xx's setup_slices() to a shareable helper function.
    • +
    • freedreno: Switch the 16-bit workaround to match what turnip does.
    • +
    • tu: Move UBWC layout into fdl6_layout() and use that function.
    • +
    • turnip: Lower usub_borrow.
    • +
    • turnip: Drop unused variable.
    • +
    • turnip: Add support for descriptor arrays.
    • +
    • turnip: Fix support for immutable samplers.
    • +
    • ci: Fix caselist results archiving after parallel-deqp-runner rename.
    • +
    • mesa: Fix detection of invalidating both depth and stencil.
    • +
    • mesa/st: Deduplicate the NIR uniform lowering code.
    • +
    • mesa/st: Move the vec4 type size function into core GLSL types.
    • +
    • mesa/prog: Reuse count_vec4_slots() from ir_to_mesa.
    • +
    • mesa/st: Move the dword slot counting function to glsl_types as well.
    • +
    • i965: Reuse the new core glsl_count_dword_slots().
    • +
    • nir: Fix printing of ~0 .locations.
    • +
    • turnip: Refactor linkage state setup.
    • +
    • mesa: Make atomic lowering put atomics above SSBOs.
    • +
    • gallium: Pack the atomic counters just above the SSBOs.
    • +
    • nir: Drop the ssbo_offset to atomic lowering.
    • +
    • compiler: Add a note about how num_ssbos works in the program info.
    • +
    • freedreno: Stop scattered remapping of SSBOs/images to IBOs.
    • +
    • radeonsi: Remove a bunch of default handling of pipe caps.
    • +
    • r600: Remove a bunch of default handling of pipe caps.
    • +
    • r300: Remove a bunch of default handling of pipe caps.
    • +
    • radeonsi: Drop PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS.
    • +
    • turnip: Fix some whitespace around binary operators.
    • +
    • turnip: Refactor the intrinsic lowering.
    • +
    • turnip: Add limited support for storage images.
    • +
    • turnip: Disable UBWC on images used as storage images.
    • +
    • turnip: Add support for non-zero (still constant) UBO buffer indices.
    • +
    • turnip: Add support for uniform texel buffers.
    • +
    • freedreno/ir3: Plumb the ir3_shader_variant into legalize.
    • +
    • turnip: Add support for fine derivatives.
    • +
    • turnip: Fix execution of secondary cmd bufs with nothing in primary.
    • +
    • freedreno: Add some missing a6xx address declarations.
    • +
    • freedreno: Fix OUT_REG() on address regs without a .bo supplied.
    • +
    • turnip: Port krh's packing macros from freedreno to tu.
    • +
    • turnip: Convert renderpass setup to the new register packing macros.
    • +
    • turnip: Convert the rest of tu_cmd_buffer.c over to the new pack macros.
    • +
    • vulkan/wsi: Fix compiler warning when no WSI platforms are enabled.
    • +
    • iris: Silence warning about AUX_USAGE_MC.
    • +
    • mesa/st: Fix compiler warnings from INTEL_shader_integer_functions.
    • +
    • ci: Enable -Werror on the meson-i386 build.
    • +
    • tu: Fix binning address setup after pack macros change.
    • +
    • Revert "gallium: Fix big-endian addressing of non-bitmask array formats."
    • +

      +

      Eric Engestrom (58):

      +
    • meson: split out idep_xmlconfig_headers from idep_xmlconfig
    • +
    • anv: add missing xmlconfig headers dependency
    • +
    • radv: drop unnecessary xmlpool_options_h
    • +
    • pipe-loader: drop unnecessary xmlpool_options_h
    • +
    • loader: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • targets/omx: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • targets/va: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • targets/vdpau: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • targets/xa: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • targets/xvmc: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • dri: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • i915: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • nouveau: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • r200: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • radeon: replace xmlpool_options_h with idep_xmlconfig_headers
    • +
    • meson: move idep_xmlconfig_headers to xmlpool/
    • +
    • gitlab-ci: build a recent enough version of GLVND (ie. 1.2.0)
    • +
    • meson: require glvnd 1.2.0
    • +
    • meson: revert glvnd workaround
    • +
    • meson: add variable to control the symbols checks
    • +
    • meson: move the generic symbols check arguments to a common variable
    • +
    • meson: add windows support to symbols checks
    • +
    • meson: require `nm` again on Unix systems
    • +
    • mesa/imports: let the build system detect strtok_r()
    • +
    • egl: fix _EGL_NATIVE_PLATFORM fallback
    • +
    • egl: move #include of local headers out of Khronos headers
    • +
    • gitlab-ci: build libdrm using meson instead of autotools
    • +
    • gitlab-ci: auto-cancel CI runs when a newer commit is pushed to the same branch
    • +
    • CL: sync C headers with Khronos
    • +
    • CL: sync C++ headers with Khronos
    • +
    • vulkan: delete typo'd header
    • +
    • egl: use EGL_CAST() macro in eglmesaext.h
    • +
    • anv: add missing "fall-through" annotation
    • +
    • vk_util: drop duplicate formats in vk_format_map[]
    • +
    • meson: drop duplicate `lib` prefix on libiris_gen*
    • +
    • meson: drop `intel_` prefix on imgui_core
    • +
    • docs: reword a bit and list HTTPS before FTP
    • +
    • intel: add mi_builder_test for gen12
    • +
    • intel/compiler: add ASSERTED annotation to avoid "unused variable" warning
    • +
    • intel/compiler: replace `0` pointer with `NULL`
    • +
    • util/simple_mtx: don't set the canary when it can't be checked
    • +
    • anv: drop unused #include
    • +
    • travis: autodetect python version instead of hard-coding it
    • +
    • util/format: remove left-over util_format_description_table declaration
    • +
    • util/format: add PIPE_FORMAT_ASTC_*x*x*_SRGB to util_format_{srgb,linear}()
    • +
    • util/format: add trivial srgb<->linear conversion test
    • +
    • u_format: move format tests to util/tests/
    • +
    • amd: fix empty-body issues
    • +
    • nine: fix empty-body-issues
    • +
    • meson: simplify install_megadrivers.py invocation
    • +
    • mesa: avoid returning a value in a void function
    • +
    • meson: use github URL for wraps instead of completely unreliable wrapdb
    • +
    • egl: drop confusing mincore() error message
    • +
    • llvmpipe: drop LLVM < 3.4 support
    • +
    • util/atomic: fix return type of p_atomic_add_return() fallback
    • +
    • util/os_socket: fix header unavailable on windows
    • +
    • freedreno/perfcntrs: fix fd leak
    • +
    • util/disk_cache: check for write() failure in the zstd path
    • +

      +

      Erico Nunes (17):

      +
    • lima: fix nir shader memory leak
    • +
    • lima: fix bo submit memory leak
    • +
    • lima/ppir: enable lower_fdph
    • +
    • gallium/util: add alignment parameter to util_upload_index_buffer
    • +
    • lima: allocate separate bo to store varyings
    • +
    • lima: refactor indexed draw indices upload
    • +
    • vc4: move the draw splitting routine to shared code
    • +
    • lima: split draw calls on 64k vertices
    • +
    • lima/ppir: fix lod bias src
    • +
    • lima/ppir: remove assert on ppir_emit_tex unsupported feature
    • +
    • lima: set shader caps to optimize control flow
    • +
    • lima/ppir: remove orphan load node after cloning
    • +
    • lima/ppir: implement full liveness analysis for regalloc
    • +
    • lima/ppir: handle write to dead registers in ppir
    • +
    • lima/ppir: fix ssa undef emit
    • +
    • lima/ppir: split ppir_op_undef into undef and dummy again
    • +
    • lima/ppir: fix src read mask swizzling
    • +

      +

      Erik Faye-Lund (82):

      +
    • zink: heap-allocate samplers objects
    • +
    • zink: emit line-width when using polygon line-mode
    • +
    • anv: remove incorrect polygonMode=point early-out
    • +
    • zink: use actual format for render-pass
    • +
    • zink: always allow mutating the format
    • +
    • zink: do not advertize coherent mapping
    • +
    • zink: disable fragment-shader texture-lod
    • +
    • zink: transition resources before resolving
    • +
    • zink: always allow sampling of images
    • +
    • zink: use u_blitter when format-reinterpreting
    • +
    • zink/spirv: drop temp-array for component-count
    • +
    • zink/spirv: support loading bool constants
    • +
    • zink/spirv: implement bany_fnequal[2-4]
    • +
    • zink/spirv: implement bany_inequal[2-4]
    • +
    • zink/spirv: implement ball_iequal[2-4]
    • +
    • zink/spirv: implement ball_fequal[2-4]
    • +
    • zink: do advertize integer support in shaders
    • +
    • zink/spirv: add support for nir_op_flrp
    • +
    • zink: correct depth-stencil format
    • +
    • nir: patch up deref-vars when lowering clip-planes
    • +
    • zink: always allow transfer to/from buffers
    • +
    • zink: implement buffer-to-buffer copies
    • +
    • zink: remove no-longer-needed hack
    • +
    • zink: move format-checking to separate source
    • +
    • zink: move filter-helper to separate helper-header
    • +
    • zink: move blitting to separate source
    • +
    • zink: move drawing separate source
    • +
    • st/mesa: unmap pbo after updating cache
    • +
    • zink: use true/false instead of TRUE/FALSE
    • +
    • zink: reject invalid sample-counts
    • +
    • zink: fix crash when restoring sampler-states
    • +
    • zink: delete query rather than allocating a new one
    • +
    • zink: do not try to destroy NULL-fence
    • +
    • zink: handle calloc-failure
    • +
    • zink: avoid NULL-deref
    • +
    • zink: avoid NULL-deref
    • +
    • zink: avoid NULL-deref
    • +
    • zink: error-check right variable
    • +
    • zink: silence coverity error
    • +
    • zink: enable PIPE_CAP_MIXED_COLORBUFFER_FORMATS
    • +
    • zink: implement nir_texop_txd
    • +
    • zink: implement txf
    • +
    • zink: implement some more trivial opcodes
    • +
    • zink: simplify front-face type
    • +
    • zink: factor out builtin-var creation
    • +
    • zink: implement load_vertex_id
    • +
    • zink: use nir_fmul_imm
    • +
    • zink: remove unused code-path in lower_pos_write
    • +
    • nir/zink: move clip_halfz-lowering to common code
    • +
    • etnaviv: use nir_lower_clip_halfz instead of open-coding
    • +
    • st/mesa: use uint-samplers for sampling stencil buffers
    • +
    • zink: fixup initialization of operand_mask / num_extra_operands
    • +
    • util: initialize float-array with float-literals
    • +
    • st/wgl: eliminate implicit cast warning
    • +
    • gallium: fix a warning
    • +
    • mesa/st: use float literals
    • +
    • docs: fix typo in html tag name
    • +
    • docs: fix paragraphs
    • +
    • docs: open paragraph before closing it
    • +
    • docs: use code-tag instead of pre-tag
    • +
    • docs: use code-tags instead of pre-tags
    • +
    • docs: use code-tags instead of pre-tags
    • +
    • docs: move paragraph closing tag
    • +
    • docs: remove double-closed definition-list
    • +
    • docs: do not double-close link tag
    • +
    • docs: do not use definition-list for sub-topics
    • +
    • docs: use figure/figcaption instead of tables
    • +
    • docs: remove trailing header
    • +
    • docs: remove leading spaces
    • +
    • docs: remove trailing newlines
    • +
    • docs: use [1] instead of asterisk for footnote
    • +
    • docs: remove pointless, stray newline
    • +
    • docs: fixup indentation
    • +
    • zink: implement nir_texop_txs
    • +
    • zink: support offset-variants of texturing
    • +
    • zink: avoid incorrect vector-construction
    • +
    • zink: store image-type per texture
    • +
    • zink: support sampling non-float textures
    • +
    • zink: support arrays of samplers
    • +
    • zink: set compareEnable when setting compareOp
    • +
    • st/mesa: use uint-result for sampling stencil buffers
    • +
    • Revert "nir: Add a couple trivial abs optimizations"
    • +

      +

      Florian Will (1):

      +
    • radv/winsys: set IB flags prior to submit in the sysmem path
    • +

      +

      Francisco Jerez (26):

      +
    • glsl: Fix software 64-bit integer to 32-bit float conversions.
    • +
    • intel/fs/gen11+: Handle ROR/ROL in lower_simd_width().
    • +
    • intel/fs/gen8+: Fix r127 dst/src overlap RA workaround for EOT message payload.
    • +
    • intel/fs: Fix nir_intrinsic_load_barycentric_at_sample for SIMD32.
    • +
    • intel/fs/cse: Fix non-deterministic behavior due to inaccurate liveness calculation.
    • +
    • intel/fs: Make implied_mrf_writes() an fs_inst method.
    • +
    • intel/fs: Try to vectorize header setup in lower_load_payload().
    • +
    • intel/fs: Generalize fs_reg::is_contiguous() to register files other than VGRF.
    • +
    • intel/fs: Rework fs_inst::is_copy_payload() into multiple classification helpers.
    • +
    • intel/fs: Extend copy propagation dataflow analysis to copies with FIXED_GRF source.
    • +
    • intel/fs: Add partial support for copy-propagating FIXED_GRFs.
    • +
    • intel/fs: Add support for copy-propagating a block of multiple FIXED_GRFs.
    • +
    • intel/fs: Allow limited copy propagation of a LOAD_PAYLOAD into another.
    • +
    • intel/fs/gen4-6: Allocate registers from aligned_pairs_class based on LINTERP use.
    • +
    • intel/fs/gen6: Constrain barycentric source of LINTERP during bank conflict mitigation.
    • +
    • intel/fs/gen6: Generalize aligned_pairs_class to SIMD16 aligned barycentrics.
    • +
    • intel/fs/gen6: Use SEL instead of bashing thread payload for unlit centroid workaround.
    • +
    • intel/fs: Split fetch_payload_reg() into separate helper for barycentrics.
    • +
    • intel/fs: Introduce barycentric layout lowering pass.
    • +
    • intel/fs: Switch to standard vector layout for barycentrics at optimization time.
    • +
    • intel/fs/cse: Make HALT instruction act as CSE barrier.
    • +
    • intel/fs/gen7: Fix fs_inst::flags_written() for SHADER_OPCODE_FIND_LIVE_CHANNEL.
    • +
    • intel/fs: Add virtual instruction to load mask of live channels into flag register.
    • +
    • intel/fs/gen12: Workaround unwanted SEND execution due to broken NoMask control flow.
    • +
    • intel/fs/gen12: Fixup/simplify SWSB annotations of SIMD32 scratch writes.
    • +
    • intel/fs/gen12: Workaround data coherency issues due to broken NoMask control flow.
    • +

      +

      Fritz Koenig (1):

      +
    • freedreno: reorder format check
    • +

      +

      Georg Lehmann (3):

      +
    • Correctly wait in the fragment stage until all semaphores are signaled
    • +
    • Vulkan Overlay: Don't try to change the image layout to present twice
    • +
    • Vulkan overlay: use the corresponding image index for each swapchain
    • +

      +

      Gert Wollny (12):

      +
    • r600: Disable eight bit three channel formats
    • +
    • virgl: Increase the shader transfer buffer by doubling the size
    • +
    • gallium/tgsi_from_mesa: Add 'extern "C"' to be able to include from C++
    • +
    • nir: make nir_get_texture_size/lod available outside nir_lower_tex
    • +
    • gallium: tgsi_from_mesa - handle VARYING_SLOT_FACE
    • +
    • r600: Add functions to dump the shader info
    • +
    • r600: Make it possible to include r600_asm.h in a C++ file
    • +
    • r600/sb: Correct SB disassambler for better debugging
    • +
    • r600: Fix maximum line width
    • +
    • r600: Make SID and unsigned value
    • +
    • r600: Delete vertex buffer only if there is actually a shader state
    • +
    • mesa/st: glsl_to_nir: don't lower atomics to SSBOs if driver supports HW atomics
    • +

      +

      Guido Günther (2):

      +
    • etnaviv: drm: Don't miscalculate timeout
    • +
    • freedreno/drm: Don't miscalculate timeout
    • +

      +

      Gurchetan Singh (11):

      +
    • drirc: set allow_higher_compat_version for Faster Than Light
    • +
    • virgl/drm: update UAPI
    • +
    • teximage: split out helper from EGLImageTargetTexture2DOES
    • +
    • glapi / teximage: implement EGLImageTargetTexStorageEXT
    • +
    • dri_util: add driImageFormatToSizedInternalGLFormat function
    • +
    • i965: track if image is created by a dmabuf
    • +
    • i965: refactor intel_image_target_texture_2d
    • +
    • i965: support EXT_EGL_image_storage
    • +
    • st/dri: track if image is created by a dmabuf
    • +
    • st/mesa: refactor egl image binding a bit
    • +
    • st/mesa: implement EGLImageTargetTexStorage
    • +

      +

      Hyunjun Ko (7):

      +
    • freedreno/ir3: cleanup by removing repeated code
    • +
    • freedreno: support 16b for the sampler opcode
    • +
    • freedreno/ir3: fix printing output registers of FS.
    • +
    • freedreno/ir3: fixup when changing to mad.f16
    • +
    • freedreno/ir3: enable half precision for pre-fs texture fetch
    • +
    • turnip: fix invalid VK_ERROR_OUT_OF_POOL_MEMORY
    • +
    • freedreno/ir3: put the conversion back for half const to the right place.
    • +

      +

      Iago Toral Quiroga (32):

      +
    • v3d: rename vertex shader key (num)_fs_inputs fields
    • +
    • mesa/st: make sure we remove dead IO variables before handing NIR to backends
    • +
    • glsl: add missing initialization of the location path field
    • +
    • v3d: fix indirect BO allocation for uniforms
    • +
    • v3d: actually root the first BO in a command list in the job
    • +
    • v3d: add missing plumbing for VPM load instructions
    • +
    • v3d: add debug assert
    • +
    • v3d: enable debug options for geometry shader dumps
    • +
    • v3d: remove unused variable
    • +
    • v3d: add initial compiler plumbing for geometry shaders
    • +
    • v3d: fix packet descriptions for geometry and tessellation shaders
    • +
    • v3d: emit geometry shader state commands
    • +
    • v3d: implement geometry shader instancing
    • +
    • v3d: add 1-way SIMD packing definition
    • +
    • v3d: compute appropriate VPM memory configuration for geometry shader workloads
    • +
    • v3d: we always have at least one output segment
    • +
    • v3d: add support for adjacency primitives
    • +
    • v3d: don't try to render if shaders failed to compile
    • +
    • v3d: predicate geometry shader outputs inside non-uniform control flow
    • +
    • v3d: save geometry shader state for blitting
    • +
    • v3d: support transform feedback with geometry shaders
    • +
    • v3d: remove obsolete assertion
    • +
    • v3d: do not limit new CL space allocations with branch to 4096 bytes
    • +
    • v3d: support rendering to multi-layered framebuffers
    • +
    • v3d: move layer rendering to a separate helper
    • +
    • v3d: handle writes to gl_Layer from geometry shaders
    • +
    • v3d: fix primitive queries for geometry shaders
    • +
    • v3d: disable lowering of indirect inputs
    • +
    • v3d: support precompiling geometry shaders
    • +
    • v3d: expose OES_geometry_shader
    • +
    • u_vbuf: don't try to delete NULL driver CSO
    • +
    • v3d: fix bug when checking result of syncobj fence import
    • +

      +

      Ian Romanick (39):

      +
    • intel/compiler: Report the number of non-spill/fill SEND messages on vec4 too
    • +
    • nir/algebraic: Add the ability to mark a replacement as exact
    • +
    • nir/algebraic: Mark other comparison exact when removing a == a
    • +
    • intel/fs: Disable conditional discard optimization on Gen4 and Gen5
    • +
    • nir/range-analysis: Add pragmas to help loop unrolling
    • +
    • nir/range_analysis: Make sure the table validation only occurs once
    • +
    • nir/opt_peephole_select: Don't count some unary operations
    • +
    • intel/compiler: Increase nir_opt_peephole_select threshold
    • +
    • nir/algebraic: Simplify some Inf and NaN avoidance code
    • +
    • nir/algebraic: Rearrange bcsel sequences generated by nir_opt_peephole_select
    • +
    • intel/compiler: Fix 'comparison is always true' warning
    • +
    • mesa: Silence 'left shift of negative value' warning in BPTC compression code
    • +
    • mesa: Silence unused parameter warning
    • +
    • anv: Fix error message format string
    • +
    • mesa: Extension boilerplate for INTEL_shader_integer_functions2
    • +
    • glsl: Add new expressions for INTEL_shader_integer_functions2
    • +
    • glsl_types: Add function to get an unsigned base type from a signed type
    • +
    • glsl: Add built-in functions for INTEL_shader_integer_functions2
    • +
    • nir: Add new instructions for INTEL_shader_integer_functions2
    • +
    • nir/algebraic: Add lowering for uabs_usub and uabs_isub
    • +
    • nir/algebraic: Add lowering for 64-bit hadd and rhadd
    • +
    • nir/algebraic: Add lowering for 64-bit usub_sat
    • +
    • nir/algebraic: Add lowering for 64-bit uadd_sat
    • +
    • nir/algebraic: Add lowering for 64-bit iadd_sat and isub_sat
    • +
    • compiler: Translate GLSL IR to NIR for new INTEL_shader_integer_functions2 expressions
    • +
    • intel/fs: Don't lower integer multiplies that don't need lowering
    • +
    • intel/fs: Add SHADER_OPCODE_[IU]SUB_SAT pseudo-ops
    • +
    • intel/fs: Implement support for NIR opcodes for INTEL_shader_integer_functions2
    • +
    • nir/spirv: Translate SPIR-V to NIR for new INTEL_shader_integer_functions2 opcodes
    • +
    • spirv: Silence a bunch of unused parameter warnings
    • +
    • spirv: Add support for IntegerFunctions2INTEL capability
    • +
    • i965: Enable INTEL_shader_integer_functions2 on Gen8+
    • +
    • gallium: Add a cap bit for OpenCL-style extended integer functions
    • +
    • gallium: Add a cap bit for integer multiplication between 32-bit and 16-bit
    • +
    • iris: Enable INTEL_shader_integer_functions2
    • +
    • anv: Enable SPV_INTEL_shader_integer_functions2 and VK_INTEL_shader_integer_functions2
    • +
    • nir/algebraic: Optimize some 64-bit integer comparisons involving zero
    • +
    • relnotes: Add GL_INTEL_shader_integer_functions2 and VK_INTEL_shader_integer_functions2
    • +
    • intel/fs: Don't count integer instructions as being possibly coissue
    • +

      +

      Icecream95 (16):

      +
    • gallium/auxiliary: Reduce conversions in u_vbuf_get_minmax_index_mapped
    • +
    • gallium/auxiliary: Handle count == 0 in u_vbuf_get_minmax_index_mapped
    • +
    • panfrost: Add negative lod bias support
    • +
    • panfrost: Compact the bo_access readers array
    • +
    • panfrost: Dynamically allocate shader variants
    • +
    • panfrost: Add ETC1/ETC2 texture formats
    • +
    • panfrost: Add ASTC texture formats
    • +
    • pan/midgard: Fix bundle dynarray leak
    • +
    • pan/midgard: Fix a memory leak in the disassembler
    • +
    • pan/midgard: Support disassembling to a file
    • +
    • pan/bifrost: Support disassembling to a file
    • +
    • pan/decode: Support dumping to a file
    • +
    • pan/decode: Dump to a file
    • +
    • pan/decode: Rotate trace files
    • +
    • panfrost: Don't copy uniforms when the size is zero
    • +
    • pan/midgard: Fix a liveness info leak
    • +

      +

      Icenowy Zheng (2):

      +
    • lima: support indexed draw with bias
    • +
    • lima: fix lima_set_vertex_buffers()
    • +

      +

      Ilia Mirkin (7):

      +
    • gm107/ir: fix loading z offset for layered 3d image bindings
    • +
    • nv50/ir: mark STORE destination inputs as used
    • +
    • nv50,nvc0: fix destination coordinates of blit
    • +
    • nvc0: add dummy reset status support
    • +
    • gm107/ir: avoid combining geometry shader stores at 0x60
    • +
    • nvc0: treat all draws without color0 broadcast as MRT
    • +
    • nvc0: disable xfb's which don't have a stride
    • +

      +

      Italo Nicola (1):

      +
    • intel/compiler: remove old comment
    • +

      +

      Iván Briano (4):

      +
    • intel/compiler: Don't change hstride if not needed
    • +
    • anv: Export filter_minmax support only when it's really supported
    • +
    • anv: Export VK_KHR_buffer_device_address only when really supported
    • +
    • anv: Enable Vulkan 1.2 support
    • +

      +

      James Xiong (3):

      +
    • iris: try to set the specified tiling when importing a dmabuf
    • +
    • gallium: dmabuf support for yuv formats that are not natively supported
    • +
    • gallium: let the pipe drivers decide the supported modifiers
    • +

      +

      Jan Vesely (2):

      +
    • clover: Initialize Asm Parsers
    • +
    • clover: Use explicit conversion from llvm::StringRef to std::string
    • +

      +

      Jan Zielinski (8):

      +
    • gallium/swr: Fix depth values for blit scenario
    • +
    • swr/rasterizer: Add tessellator implementation to the rasterizer
    • +
    • gallium/swr: Fix Windows build
    • +
    • gallium/gallivm/tgsi: enable tessellation shaders
    • +
    • gallium/gallivm: enable linking lp_bld_printf function with C++ code
    • +
    • gallium/swr: implementation of tessellation shaders compilation
    • +
    • gallium/swr: fix tessellation state save/restore
    • +
    • docs: Update SWR tessellation support
    • +

      +

      Jason Ekstrand (212):

      +
    • util: Add a util_sparse_array data structure
    • +
    • anv: Move refcount to anv_bo
    • +
    • anv: Use a util_sparse_array for the GEM handle -> BO map
    • +
    • anv: Fix a relocation race condition
    • +
    • anv: Stop storing the GEM handle in anv_reloc_list_add
    • +
    • anv: Declare the bo in the anv_block_pool_foreach_bo loop
    • +
    • anv: Inline anv_block_pool_get_bo
    • +
    • anv: Replace ANV_BO_EXTERNAL with anv_bo::is_external
    • +
    • anv: Handle state pool relocations using "wrapper" BOs
    • +
    • anv: Fix a potential BO handle leak
    • +
    • anv: Rework anv_block_pool_expand_range
    • +
    • anv: Use anv_block_pool_foreach_bo in get_bo_from_pool
    • +
    • anv: Rework the internal BO allocation API
    • +
    • anv: Choose BO flags internally in anv_block_pool
    • +
    • anv/tests: Zero-initialize instances
    • +
    • anv/tests: Initialize the BO cache and device mutex
    • +
    • anv: Allocate block pool BOs from the cache
    • +
    • anv: Use the query_slot helper in vkResetQueryPoolEXT
    • +
    • anv: Allocate query pool BOs from the cache
    • +
    • anv: Set more flags on descriptor pool buffers
    • +
    • anv: Allocate descriptor buffers from the BO cache
    • +
    • util: Add a free list structure for use with util_sparse_array
    • +
    • anv: Allocate batch and fence buffers from the cache
    • +
    • anv: Allocate scratch BOs from the cache
    • +
    • anv: Allocate misc BOs from the cache
    • +
    • anv: Drop anv_bo_init and anv_bo_init_new
    • +
    • anv: Add a device parameter to anv_execbuf_add_bo
    • +
    • anv: Set the batch allocator for compute pipelines
    • +
    • anv: Use a bitset for tracking residency
    • +
    • anv: Zero released anv_bo structs
    • +
    • anv: Use the new BO alloc API for Android
    • +
    • anv: Don't delete fragment shaders that write sample mask
    • +
    • anv: Don't claim the null RT as a valid color target
    • +
    • anv: Stop compacting render targets in the binding table
    • +
    • anv: Move the RT BTI flush workaround to begin_subpass
    • +
    • spirv: Remove the type from sampled_image
    • +
    • spirv: Add a vtn_decorate_pointer helper
    • +
    • spirv: Sort out the mess that is sampled image
    • +
    • nir/builder: Add a nir_extract_bits helper
    • +
    • nir: Add tests for nir_extract_bits
    • +
    • intel/nir: Use nir_extract_bits in lower_mem_access_bit_sizes
    • +
    • intel/fs: Add DWord scattered read/write opcodes
    • +
    • intel/fs: refactor surface header setup
    • +
    • intel/nir: Plumb devinfo through lower_mem_access_bit_sizes
    • +
    • intel/fs: Implement the new load/store_scratch intrinsics
    • +
    • intel/fs: Lower large local arrays to scratch
    • +
    • anv: Lock around fetching sync file FDs from semaphores
    • +
    • anv: Plumb timeline semaphore signal/wait values through from the API
    • +
    • spirv: Fix the MSVC build
    • +
    • anv/pipeline: Assume layout != NULL
    • +
    • genxml: Mark everything in genX_pack.h always_inline
    • +
    • anv: Input attachments are always single-plane
    • +
    • anv: Flatten descriptor bindings in anv_nir_apply_pipeline_layout
    • +
    • anv: Delete dead shader constant pushing code
    • +
    • anv: Stop bounds-checking pushed UBOs
    • +
    • anv: Pre-compute push ranges for graphics pipelines
    • +
    • intel/compiler: Add a flag to avoid compacting push constants
    • +
    • anv: Re-arrange push constant data a bit
    • +
    • anv: Rework push constant handling
    • +
    • anv: Use a switch statement for binding table setup
    • +
    • anv: More carefully dirty state in BindDescriptorSets
    • +
    • anv: More carefully dirty state in BindPipeline
    • +
    • anv: Use an anv_state for the next binding table
    • +
    • anv: Emit a NULL vertex for zero base_vertex/instance
    • +
    • nir: Validate that variables are in the right lists
    • +
    • iris: Re-enable param compaction
    • +
    • Revert "i965/fs: Merge CMP and SEL into CSEL on Gen8+"
    • +
    • vulkan/enum_to_str: Handle out-of-order aliases
    • +
    • anv/entrypoints: Better handle promoted extensions
    • +
    • vulkan: Update the XML and headers to 1.1.129
    • +
    • anv: Push constants are relative to dynamic state on IVB
    • +
    • anv: Set up SBE_SWIZ properly for gl_Viewport
    • +
    • anv: Respect the always_flush_cache driconf option
    • +
    • iris: Stop setting up fake params
    • +
    • anv: Drop bo_flags from anv_bo_pool
    • +
    • anv: Add a has_softpin boolean
    • +
    • blorp: Pass the VB size to the VF cache workaround
    • +
    • anv: Always invalidate the VF cache in BeginCommandBuffer
    • +
    • anv: Apply cache flushes after setting index/draw VBs
    • +
    • anv: Use PIPE_CONTROL flushes to implement the gen8 VF cache WA
    • +
    • anv: Don't leak when set_tiling fails
    • +
    • util/atomic: Add a _return variant of p_atomic_add
    • +
    • anv: Disallow allocating above heap sizes
    • +
    • anv: Stop tracking VMA allocations
    • +
    • anv: Set up VMA heaps independently from memory heaps
    • +
    • anv: Stop advertising two heaps just for the VF cache WA
    • +
    • anv: Add an explicit_address parameter to anv_device_alloc_bo
    • +
    • util/vma: Factor out the hole splitting part of util_vma_heap_alloc
    • +
    • util/vma: Add a function to allocate a particular address range
    • +
    • anv: Add allocator support for client-visible addresses
    • +
    • anv: Use a pNext loop in AllocateMemory
    • +
    • anv: Implement VK_KHR_buffer_device_address
    • +
    • util/atomic: Add p_atomic_add_return for the unlocked path
    • +
    • vulkan/wsi: Provide the implicitly synchronized BO to vkQueueSubmit
    • +
    • vulkan/wsi: Add a hooks for signaling semaphores and fences
    • +
    • anv: Always add in EXEC_OBJECT_WRITE when specified in extra_flags
    • +
    • anv: Use submit-time implicit sync instead of allocate-time
    • +
    • anv: Add a fence_reset_reset_temporary helper
    • +
    • anv: Use BO fences/semaphores for AcquireNextImage
    • +
    • anv: Return VK_ERROR_OUT_OF_DEVICE_MEMORY for too-large buffers
    • +
    • anv: Re-capture all batch and state buffers
    • +
    • anv: Re-emit all compute state on pipeline switch
    • +
    • ANV: Stop advertising smoothLines support on gen10+
    • +
    • anv: Flush the queue on DeviceWaitIdle
    • +
    • anv: Unconditionally advertise Vulkan 1.1
    • +
    • anv: Bump the advertised patch version to 129
    • +
    • i965: Enable GL_EXT_gpu_shader4 on Gen6+
    • +
    • anv: Properly advertise sampledImageIntegerSampleCounts
    • +
    • anv: Drop unneeded struct keywords
    • +
    • blorp: Stop whacking Z24 depth to BGRA8
    • +
    • blorp: Allow reading with HiZ
    • +
    • i965/blorp: Don't resolve HiZ unless we're reinterpreting
    • +
    • intel/blorp: Use the source format when using blorp_copy with HiZ
    • +
    • anv: Allow HiZ in TRANSFER_SRC_OPTIMAL on Gen8-9
    • +
    • i965: Allow HiZ for glCopyImageSubData sources
    • +
    • intel/nir: Add a memory barrier before barrier()
    • +
    • intel/disasm: Fix decoding of src0 of SENDS
    • +
    • genxml: Remove a non-existant HW bit
    • +
    • anv: Don't add dynamic state base address to push constants on Gen7
    • +
    • anv: Flag descriptors dirty when gl_NumWorkgroups is used
    • +
    • anv: Re-use flush_descriptor_sets in flush_compute_state
    • +
    • intel/vec4: Support scoped_memory_barrier
    • +
    • nir: Handle more barriers in dead_write and copy_prop
    • +
    • nir: Handle barriers with more granularity in combine_stores
    • +
    • llmvpipe: No-op implement more barriers
    • +
    • nir: Add a new memory_barrier_tcs_patch intrinsic
    • +
    • spirv: Add a workaround for OpControlBarrier on old GLSLang
    • +
    • spirv: Add output memory semantics to OpControlBarrier in TCS
    • +
    • nir/glsl: Emit memory barriers as part of barrier()
    • +
    • intel/nir: Stop adding redundant barriers
    • +
    • nir: Rename nir_intrinsic_barrier to control_barrier
    • +
    • nir/lower_atomics_to_ssbo: Also lower barriers
    • +
    • anv: Drop an unused variable
    • +
    • intel/blorp: Fill out all the dwords of MI_ATOMIC
    • +
    • anv: Don't over-advertise descriptor indexing features
    • +
    • anv: Memset array properties
    • +
    • vulkan/wsi: Add a driconf option to force WSI to advertise BGRA8_UNORM first
    • +
    • vulkan: Update the XML and headers to 1.2.131
    • +
    • turnip: Pretend to support Vulkan 1.2
    • +
    • anv: Bump the patch version to 131
    • +
    • anv,nir: Lower quad_broadcast with dynamic index in NIR
    • +
    • anv: Implement the new core version feature queries
    • +
    • anv: Implement the new core version property queries
    • +
    • relnotes: Add Vulkan 1.2
    • +
    • anv: Drop some VK_IMAGE_TILING_OPTIMAL checks
    • +
    • anv: Support modifiers in GetImageFormatProperties2
    • +
    • vulkan/wsi: Move the ImageCreateInfo higher up
    • +
    • vulkan/wsi: Use the interface from the real modifiers extension
    • +
    • vulkan/wsi: Filter modifiers with ImageFormatProperties
    • +
    • vulkan/wsi: Implement VK_KHR_swapchain_mutable_format
    • +
    • anv/blorp: Rename buffer image stride parameters
    • +
    • anv: Canonicalize buffer formats for image/buffer copies
    • +
    • anv: Add an anv_physical_device field to anv_device
    • +
    • anv: Take an anv_device in vk_errorf
    • +
    • anv: Take a device in anv_perf_warn
    • +
    • anv: Stop allocating WSI event fences off the instance
    • +
    • anv: Drop the instance pointer from anv_device
    • +
    • anv: Move the physical device dispatch table to anv_instance
    • +
    • anv: Drop separate chipset_id fields
    • +
    • anv: Re-arrange physical_device_init
    • +
    • anv: Allow enumerating multiple physical devices
    • +
    • anv/apply_pipeline_layout: Initialize the nir_builder before use
    • +
    • intel/blorp: resize src and dst surfaces separately
    • +
    • anv: Use TRANSFER_SRC_OPTIMAL for depth/stencil MSAA resolves
    • +
    • anv: Add a layout_to_aux_state helper
    • +
    • anv: Use isl_aux_state for HiZ resolves
    • +
    • anv: Add a usage parameter to anv_layout_to_aux_usage
    • +
    • anv: Allow HiZ in read-only depth layouts
    • +
    • anv: Improve BTI change cache flushing
    • +
    • intel/fs: Don't unnecessarily fall back to indirect sends on Gen12
    • +
    • intel/disasm: Properly disassemble indirect SENDs
    • +
    • intel/isl: Plumb devinfo into isl_genX(buffer_fill_state_s)
    • +
    • intel/isl: Add a hack for the Gen12 A0 texture buffer bug
    • +
    • anv: Rework the meaning of anv_image::planes[]::aux_usage
    • +
    • anv: Replace aux_surface.isl.size_B checks with aux_usage checks
    • +
    • intel/aux-map: Add some #defines
    • +
    • intel/aux-map: Factor out some useful helpers
    • +
    • anv: Delete a redundant calculation
    • +
    • isl: Add a helper for calculating subimage memory ranges
    • +
    • anv: Add another align_down helper
    • +
    • anv: Make AUX table invalidate a PIPE_* bit
    • +
    • anv: Make anv_vma_alloc/free a lot dumber
    • +
    • anv: Rework CCS memory handling on TGL-LP
    • +
    • intel/blorp: Add support for CCS_E copies with UNORM formats
    • +
    • intel/isl: Allow CCS_E on more formats
    • +
    • intel/genxml: Make SO_DECL::"Hole Flag" a Boolean
    • +
    • anv: Insert holes for non-existant XFB varyings
    • +
    • intel/blorp: Handle bit-casting UNORM and BGRA formats
    • +
    • anv: Replace one more aux_surface.isl.size_B check
    • +
    • intel/mi_builder: Force write completion on Gen12+
    • +
    • anv: Set actual state pool sizes when we have softpin
    • +
    • anv: Re-use one old BT block in reset_batch_bo_chain
    • +
    • anv/block_pool: Ensure allocations have contiguous maps
    • +
    • anv: Rename a variable
    • +
    • genxml: Add a new 3DSTATE_SF field on gen12
    • +
    • anv,iris: Set 3DSTATE_SF::DerefBlockSize to per-poly on Gen12+
    • +
    • intel/genxml: Drop SLMEnable from L3CNTLREG on Gen11
    • +
    • iris: Set SLMEnable based on the L3$ config
    • +
    • iris: Store the L3$ configs in the screen
    • +
    • iris: Use the URB size from the L3$ config
    • +
    • i965: Re-emit l3 state before BLORP executes
    • +
    • intel: Take a gen_l3_config in gen_get_urb_config
    • +
    • intel/blorp: Always emit URB config on Gen7+
    • +
    • iris: Consolodate URB emit
    • +
    • anv: Emit URB setup earlier
    • +
    • intel/common: Return the block size from get_urb_config
    • +
    • intel/blorp: Plumb deref block size through to 3DSTATE_SF
    • +
    • anv: Plumb deref block size through to 3DSTATE_SF
    • +
    • iris: Plumb deref block size through to 3DSTATE_SF
    • +
    • anv: Always fill out the AUX table even if CCS is disabled
    • +
    • intel/fs: Write the address register with NoMask for MOV_INDIRECT
    • +
    • anv/blorp: Use the correct size for vkCmdCopyBufferToImage
    • +

      +

      Jonathan Gray (4):

      +
    • winsys/amdgpu: avoid double simple_mtx_unlock()
    • +
    • i965: update Makefile.sources for perf changes
    • +
    • util/futex: use futex syscall on OpenBSD
    • +
    • util/u_thread: don't restrict u_thread_get_time_nano() to __linux__
    • +

      +

      Jonathan Marek (98):

      +
    • freedreno: add Adreno 640 ID
    • +
    • freedreno/ir3: disable texture prefetch for 1d array textures
    • +
    • freedreno/registers: fix a6xx_2d_blit_cntl ROTATE
    • +
    • etnaviv: blt: use only for tiling, and add missing formats
    • +
    • etnaviv: separate PE and RS formats, use only RS only for tiling
    • +
    • etnaviv: blt: set TS dirty after clear
    • +
    • turnip: add display wsi
    • +
    • turnip: add x11 wsi
    • +
    • turnip: implement CmdClearColorImage/CmdClearDepthStencilImage
    • +
    • turnip: fix sRGB GMEM clear
    • +
    • util: add missing R8G8B8A8_SRGB format to vk_format_map
    • +
    • freedreno/regs: update UBWC related bits
    • +
    • turnip: implement UBWC
    • +
    • etnaviv: avoid using RS for 64bpp formats
    • +
    • etnaviv: implement 64bpp clear
    • +
    • etnaviv: blt: fix partial ZS clears with TS
    • +
    • etnaviv: support 3d/array/integer formats in texture descriptors
    • +
    • turnip: fix integer render targets
    • +
    • freedreno/registers: add missing MH perfcounter enum for a2xx
    • +
    • freedreno/perfcntrs: add a2xx MH counters
    • +
    • freedreno/perfcntrs/fdperf: fix u64 print on 32-bit builds
    • +
    • freedreno/perfcntrs/fdperf: add missing a20x compatible
    • +
    • freedreno/perfcntrs/fdperf: add missing a2xx case in select_counter
    • +
    • turnip: fix display wsi fence timing out
    • +
    • turnip: don't skip unused attachments when setting up tiling config
    • +
    • turnip: implement CmdClearAttachments
    • +
    • turnip: don't set unused BLIT_DST_INFO bits for GMEM clear
    • +
    • turnip: MSAA resolve directly from GMEM
    • +
    • turnip: allow writes to draw_cs outside of render pass
    • +
    • turnip: add function to allocate aligned memory in a substream cs
    • +
    • turnip: improve emit_textures
    • +
    • turnip: implement border color
    • +
    • turnip: add hw binning
    • +
    • turnip: fix incorrectly failing assert
    • +
    • freedreno/ir3: add GLSL_SAMPLER_DIM_SUBPASS to tex_info
    • +
    • freedreno/registers: add a6xx texture format for stencil sampler
    • +
    • turnip: fix hw binning render area
    • +
    • turnip: fix tile layout logic
    • +
    • turnip: update tile_align_w/tile_align_h
    • +
    • turnip: set load_layer_id to zero
    • +
    • turnip: set FRAG_WRITES_SAMPMASK bit
    • +
    • turnip: fix VK_IMAGE_ASPECT_STENCIL_BIT image view
    • +
    • turnip: no 8x msaa on 128bpp formats
    • +
    • turnip: add dirty bit for push constants
    • +
    • turnip: subpass rework
    • +
    • turnip: CmdClearAttachments fixes
    • +
    • turnip: implement subpass input attachments
    • +
    • etnaviv: remove sRGB formats from format table
    • +
    • etnaviv: sRGB render target support
    • +
    • etnaviv: set output mode and saturate bits
    • +
    • etnaviv: update INT_FILTER choice for GLES3 formats
    • +
    • etnaviv: disable integer vertex formats on pre-HALTI2 hardware
    • +
    • etnaviv: remove swizzle from format table
    • +
    • etnaviv: add missing formats
    • +
    • etnaviv: add missing vs_needs_z_div handling to NIR backend
    • +
    • turnip: use single substream cs
    • +
    • turnip: use common blit path for buffer copy
    • +
    • turnip: don't require src image to be set for clear blits
    • +
    • turnip: implement CmdFillBuffer/CmdUpdateBuffer
    • +
    • freedreno/ir3: lower mul_2x32_64
    • +
    • turnip: fix emit_textures for compute shaders
    • +
    • turnip: remove compute emit_border_color
    • +
    • turnip: fix emit_ibo
    • +
    • turnip: change emit_ibo to be like emit_textures
    • +
    • turnip: remove duplicate A6XX_SP_CS_CONFIG_NIBO
    • +
    • nir: add option to lower half packing opcodes
    • +
    • freedreno/ir3: lower pack/unpack ops
    • +
    • turnip: don't set LRZ enable at end of renderpass
    • +
    • freedreno/ir3: update prefetch input_offset when packing inlocs
    • +
    • turnip: add cache invalidate to fix input attachment cases
    • +
    • turnip: don't set SP_FS_CTRL_REG0_VARYING if only fragcoord is used
    • +
    • freedreno/ir3: fix vertex shader sysvals with pre_assign_inputs
    • +
    • freedreno/registers: document vertex/instance id offset bits
    • +
    • freedreno/ir3: support load_base_instance
    • +
    • turnip: emit base instance vs driver param
    • +
    • turnip: emit_compute_driver_params fixes
    • +
    • turnip: compute gmem offsets at renderpass creation time
    • +
    • turnip: implement secondary command buffers
    • +
    • nir: fix assign_io_var_locations for vertex inputs
    • +
    • turnip: minor warning fixes
    • +
    • util/format: add missing vulkan formats
    • +
    • turnip: disable B8G8R8 vertex formats
    • +
    • etnaviv: fix incorrectly failing vertex size assert
    • +
    • etnaviv: update headers from rnndb
    • +
    • etnaviv: HALTI2+ instanced draw
    • +
    • etnaviv: implement gl_VertexID/gl_InstanceID
    • +
    • etnaviv: remove unnecessary vertex_elements_state_create error checking
    • +
    • st/mesa: don't lower YUV when driver supports it natively
    • +
    • st/mesa: run st_nir_lower_tex_src_plane for lowered xyuv/ayuv
    • +
    • freedreno/ir3: allow inputs with the same location
    • +
    • turnip: remove tu_sort_variables_by_location
    • +
    • turnip: fix array/matrix varyings
    • +
    • turnip: hook up GetImageDrmFormatModifierPropertiesEXT
    • +
    • turnip: set linear tiling for scanout images
    • +
    • vulkan/wsi: remove unused image_get_modifier
    • +
    • turnip: simplify tu_physical_device_get_format_properties
    • +
    • etnaviv: implement UBOs
    • +
    • turnip: hook up cmdbuffer event set/wait
    • +

      +

      Jordan Justen (7):

      +
    • iris: Add IRIS_DIRTY_RENDER_BUFFER state flag
    • +
    • iris/gen11+: Move flush for render target change
    • +
    • iris: Allow max dynamic pool size of 2GB for gen12
    • +
    • intel: Remove unused Tigerlake PCI ID
    • +
    • iris: Fix some indentation in iris_init_render_context
    • +
    • iris: Emit CS Stall before Instruction Cache flush for gen12 WA
    • +
    • anv: Emit CS Stall before Instruction Cache flush for gen12 WA
    • +

      +

      Jose Maria Casanova Crespo (1):

      +
    • v3d: Fix predication with atomic image operations
    • +

      +

      Juan A. Suarez Romero (3):

      +
    • nir/lower_double_ops: relax lower mod()
    • +
    • Revert "nir/lower_double_ops: relax lower mod()"
    • +
    • nir/spirv: skip unreachable blocks in Phi second pass
    • +

      +

      Kai Wasserbäch (4):

      +
    • nir: fix unused variable warning in nir_lower_vars_to_explicit_types
    • +
    • nir: fix unused variable warning in find_and_update_previous_uniform_storage
    • +
    • nir: fix unused function warning in src/compiler/nir/nir.c
    • +
    • intel/gen_decoder: Fix unused-but-set-variable warning
    • +

      +

      Karol Herbst (14):

      +
    • nv50/ir: fix crash in isUniform for undefined values
    • +
    • nir/validate: validate num_components on registers and intrinsics
    • +
    • nir/serialize: fix vec8 and vec16
    • +
    • nir/tests: add serializer tests
    • +
    • nir/tests: MSVC build fix
    • +
    • spirv: handle UniformConstant for OpenCL kernels
    • +
    • clover/nir: treat UniformConstant as global memory
    • +
    • clover/nir: set spirv environment to OpenCL
    • +
    • clover/spirv: allow Int64 Atomics for supported devices
    • +
    • nir: handle nir_deref_type_ptr_as_array in rematerialize_deref_in_block
    • +
    • nv50/ir: implement global atomics and handle it for nir
    • +
    • nir/serialize: cast swizzle before shifting
    • +
    • aco: use NIR_MAX_VEC_COMPONENTS instead of 4
    • +
    • nv50ir/nir: support vec8 and vec16
    • +

      +

      Kenneth Graunke (57):

      +
    • iris: Fix "Force Zero RTA Index Enable" setting again
    • +
    • nir: Handle image arrays when setting variable data
    • +
    • Revert "intel/blorp: Fix usage of uninitialized memory in key hashing"
    • +
    • iris: Properly move edgeflag_out from output list to global list
    • +
    • iris: Wrap iris_fix_edge_flags in NIR_PASS
    • +
    • mesa: Handle GL_COLOR_INDEX in _mesa_format_from_format_and_type().
    • +
    • iris: Change keybox parenting
    • +
    • iris: Stop mutating the resource in get_rt_read_isl_surf().
    • +
    • iris: Drop 'old_address' parameter from iris_rebind_buffer
    • +
    • iris: Create an "iris_surface_state" wrapper struct
    • +
    • iris: Maintain CPU-side SURFACE_STATE copies for views and surfaces.
    • +
    • iris: Update SURFACE_STATE addresses when setting sampler views
    • +
    • iris: Disable VF cache partial address workaround on Gen11+
    • +
    • driconf, glsl: Add a vs_position_always_invariant option
    • +
    • drirc: Set vs_position_always_invariant for Shadow of Mordor on Intel
    • +
    • st/mesa: Add GL_TDFX_texture_compression_FXT1 support
    • +
    • iris: Map FXT1 texture formats
    • +
    • meson: Add a "prefer_iris" build option
    • +
    • main: Change u_mmAllocMem align2 from bytes (old API) to bits (new API)
    • +
    • meson: Include iris in default gallium-drivers for x86/x86_64
    • +
    • util: Detect use-after-destroy in simple_mtx
    • +
    • intel/genxml: Add a partial TCCNTLREG definition
    • +
    • iris: Enable Gen11 Color/Z write merging optimization
    • +
    • anv: Enable Gen11 Color/Z write merging optimization
    • +
    • intel/decoder: Make get_state_size take a full 64-bit address and a base
    • +
    • iris: Create smaller program keys without legacy features
    • +
    • iris: Default to X-tiling for scanout buffers without modifiers
    • +
    • iris: Alphabetize source files after iris_perf.c was added
    • +
    • drirc: Final Fantasy VIII: Remastered needs allow_higher_compat_version
    • +
    • iris: Make helper functions to turn iris shader keys into brw keys.
    • +
    • iris: Fix shader recompile debug printing
    • +
    • iris: Avoid replacing backing storage for buffers with no contents
    • +
    • intel: Drop Gen11 WaBTPPrefetchDisable workaround
    • +
    • st/nir: Optionally unify inputs_read/outputs_written when linking.
    • +
    • iris: Set nir_shader_compiler_options::unify_interfaces.
    • +
    • st/mesa: Allow ASTC5x5 fallbacks separately from other ASTC LDR formats.
    • +
    • iris: Disable ASTC 5x5 support on Gen9 for now.
    • +
    • iris: Delete remnants of the unimplemented ASTC 5x5 workaround
    • +
    • iris: Allow HiZ for copy_region sources
    • +
    • anv: Only enable EWA LOD algorithm when doing anisotropic filtering.
    • +
    • Revert "nir: assert that nir_lower_tex runs after lowering derefs"
    • +
    • i965: Simplify brw_get_renderer_string()
    • +
    • iris: Simplify iris_get_renderer_string()
    • +
    • intel: Use similar brand strings to the Windows drivers
    • +
    • intel/compiler: Fix illegal mutation in get_nir_image_intrinsic_image
    • +
    • iris: Fix export of fences that have already completed.
    • +
    • st/mesa: Allocate full miplevels if MaxLevel is explicitly set
    • +
    • iris: Drop some workarounds which are no longer necessary
    • +
    • anv: Drop some workarounds that are no longer necessary
    • +
    • intel: Fix aux map alignments on 32-bit builds.
    • +
    • meson: Prefer 'iris' by default over 'i965'.
    • +
    • loader: Check if the kernel driver is i915 before loading iris
    • +
    • iris: Drop 'engine' from iris_batch.
    • +
    • iris: Make iris_emit_default_l3_config pull devinfo from the batch
    • +
    • iris: Support multiple chained batches.
    • +
    • i965: Use brw_batch_references in tex_busy check
    • +
    • loader: Fix leak of kernel driver name
    • +

      +

      Kristian Høgsberg (62):

      +
    • freedreno/registers: Fix typo
    • +
    • freedreno/registers: Move SP_PRIMITIVE_CNTL and SP_VS_VPC_DST
    • +
    • freedreno/registers: Add comments about primitive counters
    • +
    • freedreno/a6xx: Fix primitive counters again
    • +
    • freedreno/a6xx: Clear sysmem with CP_BLIT
    • +
    • freedreno: Add nogmem debug option to force bypass rendering
    • +
    • freedreno/a6xx: Fix layered texture type enum
    • +
    • freedreno/a6x: Rename z/s formats
    • +
    • freedreno/a6xx: Add register offset for STG/LDG
    • +
    • freedreno/ir3: Emit link map as byte or dwords offsets as needed
    • +
    • freedreno/ir3: Add load and store intrinsics for global io
    • +
    • freedreno: Don't count primitives for patches
    • +
    • freedreno/ir3: Add ir3 intrinsics for tessellation
    • +
    • freedreno/ir3: Use imul24 in offset calculations
    • +
    • freedreno/ir3: Add tessellation field to shader key
    • +
    • freedreno/ir3: Extend geometry lowering pass to handle tessellation
    • +
    • freedreno/ir3: Add new synchronization opcodes
    • +
    • freedreno/ir3: End TES with chsh when using GS
    • +
    • freedreno/ir3: Implement tess coord intrinsic
    • +
    • freedreno/ir3: Implement TCS synchronization intrinsics
    • +
    • freedreno/ir3: Setup inputs and outputs for tessellation stages
    • +
    • freedreno/ir3: Don't assume binning shader is always VS
    • +
    • freedreno/ir3: Pre-color TCS header and primitive ID inputs
    • +
    • freedreno/ir3: Allocate const space for tessellation parameters
    • +
    • freedreno/a6xx: Build the right draw command for tessellation
    • +
    • freedreno/a6xx: Allocate and program tessellation buffer
    • +
    • freedreno/a6xx: Emit constant parameters for tessellation stages
    • +
    • freedreno/a6xx: Program state for tessellation stages
    • +
    • freedreno: Use bypass rendering for tessellation
    • +
    • freedreno/a6xx: Only set emit.hs/ds when we're drawing patches
    • +
    • freedreno/blitter: Save tessellation state
    • +
    • freedreno/a6xx: Only use merged regs and four quads for VS+FS
    • +
    • freedreno/a6xx: Turn on tessellation shaders
    • +
    • freedreno/ir3: Use regid() helper when setting up precolor regs
    • +
    • freedreno/registers: Remove duplicate register definitions
    • +
    • freedreno: New struct packing macros
    • +
    • freedreno/registers: Add 64 bit address registers
    • +
    • freedreno/a6xx: Drop stale include
    • +
    • freedreno/a6xx: Include fd6_pack.h in a few files
    • +
    • freedreno/a6xx: Convert emit_mrt() to OUT_REG()
    • +
    • freedreno/a6xx: Convert emit_zs() to OUT_REG()
    • +
    • freedreno/a6xx: Convert VSC pipe setup to OUT_REG()
    • +
    • freedreno/a6xx: Convert gmem blits to OUT_REG()
    • +
    • freedreno/a6xx: Convert some tile setup to OUT_REG()
    • +
    • freedreno/a6xx: Silence warning for unused perf counters
    • +
    • freedreno/a6xx: Document the CP_SET_DRAW_STATE enable bits
    • +
    • freedreno/a6xx: Make DEBUG_BLIT_FALLBACK only dump fallbacks
    • +
    • freedreno: Add debug flag for forcing linear layouts
    • +
    • freedreno/a6xx: Program sampler swap based on resource tiling
    • +
    • freedreno/a6xx: Pick blitter swap based on resource tiling
    • +
    • freedreno/a6xx: Add fd_resource_swap() helper
    • +
    • freedreno/a6xx: Use blitter for resolve blits
    • +
    • freedreno/a6xx: RB6_R8G8B8 is actually 32 bit RGBX
    • +
    • freedreno/a6xx: Use A6XX_SP_2D_SRC_FORMAT_MASK macro
    • +
    • freedreno/a6xx: Handle srgb blits on the blitter
    • +
    • freedreno/a6xx: Move handle_rgba_blit() up
    • +
    • freedreno/a6xx: Rewrite compressed blits in a helper function
    • +
    • freedreno/a6xx: Set up multisample sysmem MRTs correctly
    • +
    • st/mesa: Lower vars to ssa and constant prop before gl_nir_lower_buffers
    • +
    • ir3: Set up full/half register conflicts correctly
    • +
    • iris: Advertise PIPE_CAP_NATIVE_FENCE_FD
    • +
    • iris: Print warning and return *out = NULL when fd to syncobj fails
    • +

      +

      Krzysztof Raszkowski (10):

      +
    • gallium/swr: Fix GS invocation issues - Fixed proper setting gl_InvocationID. - Fixed GS vertices output memory overflow.
    • +
    • gallium/swr: Enable some ARB_gpu_shader5 extensions Enable / add to features.txt: - Enhanced textureGather. - Geometry shader instancing. - Geometry shader multiple streams.
    • +
    • gallium/swr: Fix crash when use GL_TDFX_texture_compression_FXT1 format.
    • +
    • gallivm: add TGSI bit arithmetic opcodes support
    • +
    • gallium/swr: Fix glVertexPointer race condition.
    • +
    • gallium/swr: Disable showing detected arch message.
    • +
    • docs/GL4: update gallium/swr features
    • +
    • gallium/swr: add option for static link
    • +
    • gallium/swr: Fix gcc 4.8.5 compile error
    • +
    • gallium/swr: simplify environmental variabled expansion code
    • +

      +

      Lasse Lopperi (1):

      +
    • freedreno/drm: Fix memory leak in softpin implementation
    • +

      +

      Laurent Carlier (1):

      +
    • egl: avoid local modifications for eglext.h Khronos standard header file
    • +

      +

      Leo Liu (1):

      +
    • ac: add missing Arcturus to the info of pc lines
    • +

      +

      Lepton Wu (2):

      +
    • gallium: dri2: Use index as plane number.
    • +
    • android: mesa: Revert "android: mesa: revert "Enable asm unconditionally""
    • +

      +

      Lionel Landwerlin (60):

      +
    • intel/dev: set default num_eu_per_subslice on gen12
    • +
    • intel/perf: add TGL support
    • +
    • intel/perf: fix Android build
    • +
    • mesa: check draw buffer completeness on glClearBufferfi/glClearBufferiv
    • +
    • vulkan: bump headers/registry to 1.1.127
    • +
    • anv: Properly handle host query reset of performance queries
    • +
    • anv: implement VK_KHR_separate_depth_stencil_layouts
    • +
    • mesa: check framebuffer completeness only after state update
    • +
    • anv: invalidate file descriptor of semaphore sync fd at vkQueueSubmit
    • +
    • anv: remove list items on batch fini
    • +
    • anv: detach batch emission allocation from device
    • +
    • anv: expose timeout helpers outside of anv_queue.c
    • +
    • anv: move queue init/finish to anv_queue.c
    • +
    • anv: allow NULL batch parameter to anv_queue_submit_simple_batch
    • +
    • anv: prepare driver to report submission error through queues
    • +
    • anv: refcount semaphores
    • +
    • anv: prepare the driver for delayed submissions
    • +
    • anv/wsi: signal the semaphore in the acquireNextImage
    • +
    • anv: implement VK_KHR_timeline_semaphore
    • +
    • intel/dev: flag the Elkhart Lake platform
    • +
    • intel/perf: add EHL performance query support
    • +
    • intel/perf: fix invalid hw_id in query results
    • +
    • intel/perf: set read buffer len to 0 to identify empty buffer
    • +
    • intel/perf: take into account that reports read can be fairly old
    • +
    • intel/perf: simplify the processing of OA reports
    • +
    • intel/perf: fix improper pointer access
    • +
    • anv: fix missing gen12 handling
    • +
    • anv: fix incorrect VMA alignment for CCS main surfaces
    • +
    • anv: fix fence underlying primitive checks
    • +
    • anv: fix assumptions about temporary fence payload
    • +
    • intel/perf: drop batchbuffer flushing at query begin
    • +
    • i965/iris: perf-queries: don't invalidate/flush 3d pipeline
    • +
    • anv: constify pipeline layout in nir passes
    • +
    • anv: drop unused parameter from apply layout pass
    • +
    • vulkan/wsi: error out when image fence doesn't signal
    • +
    • mesa: avoid triggering assert in implementation
    • +
    • i965/iris/perf: factor out frequency register capture
    • +
    • loader: fix close on uninitialized file descriptor value
    • +
    • anv: don't close invalid syncfd semaphore
    • +
    • anv: fix intel perf queries availability writes
    • +
    • anv: set stencil layout for input attachments
    • +
    • iris: Implement Gen12 workaround for non pipelined state
    • +
    • anv: Implement Gen12 workaround for non pipelined state
    • +
    • anv: only use VkSamplerCreateInfo::compareOp if enabled
    • +
    • anv: fix pipeline switch back for non pipelined states
    • +
    • genxml: add new Gen11+ PIPE_CONTROL field
    • +
    • iris: handle new PIPE_CONTROL field
    • +
    • iris: implement another workaround for non pipelined states
    • +
    • anv: implement another workaround for non pipelined states
    • +
    • intel/perf: expose timestamp begin for mdapi
    • +
    • intel/perf: report query split for mdapi
    • +
    • anv: enable VK_KHR_swapchain_mutable_format
    • +
    • anv: don't report error with other vendor DRM devices
    • +
    • anv: ensure prog params are initialized with 0s
    • +
    • anv/iris: warn gen12 3DSTATE_HS restriction
    • +
    • intel: Implement Gen12 workaround for array textures of size 1
    • +
    • isl: drop CCS row pitch requirement for linear surfaces
    • +
    • isl: add gen12 comment about CCS for linear tiling
    • +
    • anv: implement gen9 post sync pipe control workaround
    • +
    • anv: set MOCS on push constants
    • +

      +

      Luis Mendes (1):

      +
    • radv: fix radv secure compile feature breaks compilation on armhf EABI and aarch64
    • +

      +

      Marco Felsch (1):

      +
    • etnaviv: Fix assert when try to accumulate an invalid fd
    • +

      +

      Marek Olšák (245):

      +
    • glsl: encode/decode types using a union with bitfields for readability
    • +
    • glsl: encode vector_elements and matrix_columns better
    • +
    • glsl: encode explicit_stride for basic types better
    • +
    • glsl: encode array types better
    • +
    • glsl: encode struct/interface types better
    • +
    • st/mesa: call nir_opt_access only once
    • +
    • st/mesa: call nir_lower_flrp only once per shader
    • +
    • compiler: make variable::data::binding unsigned
    • +
    • nir: pack nir_variable::data::stream
    • +
    • nir: pack nir_variable::data::xfb_*
    • +
    • radeonsi: use IR SHA1 as the cache key for the in-memory shader cache
    • +
    • radeonsi: don't keep compute shader IR after compilation
    • +
    • radeonsi: keep serialized NIR instead of nir_shader in si_shader_selector
    • +
    • nir: pack the rest of nir_variable::data
    • +
    • nir/serialize: don't expand 16-bit variable state slots to 32 bits
    • +
    • nir/serialize: store 32-bit object IDs instead of 64-bit
    • +
    • nir/serialize: pack nir_variable flags
    • +
    • mesa: expose SPIR-V extensions in the Compatibility profile too
    • +
    • util: add blob_finish_get_buffer
    • +
    • radeonsi/nir: call nir_serialize only once per shader
    • +
    • radeonsi/nir: fix compute shader crash due to nir_binary == NULL
    • +
    • glsl/linker: pass shader_info to analyze_clip_cull_usage directly
    • +
    • compiler: pack shader_info from 160 bytes to 96 bytes
    • +
    • st/mesa: fix Sanctuary and Tropics by disabling ARB_gpu_shader5 for them
    • +
    • st/mesa: rename DEBUG_TGSI -> DEBUG_PRINT_IR
    • +
    • st/mesa: remove \n being only printed in debug builds after printed TGSI
    • +
    • st/mesa: print TCS/TES/GS/CS TGSI in the right place & keep disk cache enabled
    • +
    • st/mesa: add ST_DEBUG=nir to print NIR shaders
    • +
    • st/mesa: remove unused TGSI-only debug printing functions
    • +
    • gallium/noop: call finalize_nir
    • +
    • radeonsi/nir: remove dead function temps
    • +
    • radeonsi/nir: call nir_lower_flrp only once per shader
    • +
    • radeonsi/nir: don't lower fma, instead, fuse fma
    • +
    • mesa: enable glthread for 7 Days To Die
    • +
    • st/mesa: rename delete_basic_variant -> delete_common_variant
    • +
    • st/mesa: decrease the size of st_fp_variant_key from 48 to 40 bytes
    • +
    • st/mesa: start deduplicating some program code
    • +
    • st/mesa: initialize affected_states and uniform storage earlier in deserialize
    • +
    • st/mesa: consolidate and simplify code flagging program::affected_states
    • +
    • st/mesa: trivially merge st_vertex_program into st_common_program
    • +
    • st/mesa: rename st_common_program to st_program
    • +
    • st/mesa: cleanups after unification of st_vertex/common program
    • +
    • st/mesa: rename occurences of stcp to stp to correspond to st_program
    • +
    • st/mesa: more cleanups after unification of st_vertex/common_program
    • +
    • st/mesa: subclass st_vertex_program for VP-specific members
    • +
    • st/mesa: call nir_sweep in st_finalize_nir
    • +
    • st/mesa: keep serialized NIR instead of nir_shader in st_program
    • +
    • st/mesa: call nir_serialize only once per shader
    • +
    • nir: move data.image.access to data.access
    • +
    • nir/print: only print image.format for image variables
    • +
    • glsl_to_nir: rename image_access to mem_access
    • +
    • nir: move data.descriptor_set above data.index for better packing
    • +
    • nir: don't use GLenum16 in nir.h
    • +
    • ac: add radeon_info::num_rings and move ring_type to amd_family.h
    • +
    • ac: fill num_rings for remaining IPs
    • +
    • winsys/amdgpu: detect noop dependencies on the same ring correctly
    • +
    • nir: strip as we serialize to remove the nir_shader_clone call
    • +
    • nir/serialize: do ctx = {0} instead of manual initializations
    • +
    • util/blob: add 8-bit and 16-bit reads and writes
    • +
    • nir/serialize: pack instructions better
    • +
    • nir/serialize: pack src better and limit the object count to 1M from 1G
    • +
    • nir/serialize: don't serialize var->data for temporaries
    • +
    • nir/serialize: deduplicate serialized var types by reusing the last unique one
    • +
    • nir/serialize: try to store a diff in var data locations instead of var data
    • +
    • nir/serialize: pack load_const with non-64-bit constants better
    • +
    • nir/serialize: pack 1-component constants into 20 bits if possible
    • +
    • nir/serialize: pack nir_intrinsic_instr::const_index[] better
    • +
    • nir/serialize: try to pack two alu srcs into 1 uint32
    • +
    • nir/serialize: don't store deref types if not needed
    • +
    • nir/serialize: don't serialize mode for deref non-cast instructions
    • +
    • nir/serialize: try to put deref->var index into the unused bits of the header
    • +
    • nir/serialize: cleanup - fold nir_deref_type_var cases into switches
    • +
    • nir/serialize: try to pack both deref array src into 32 bits
    • +
    • nir/serialize: remove up to 3 consecutive equal ALU instruction headers
    • +
    • nir/serialize: reuse the writemask field for 2 src X swizzles of SSA ALU
    • +
    • nir/serialize: serialize swizzles for vec8 and vec16
    • +
    • nir/serialize: serialize writemask for vec8 and vec16
    • +
    • nir/serialize: don't serialize redundant nir_intrinsic_instr::num_components
    • +
    • nir/serialize: use 3 unused bits in intrinsic for packed_const_indices
    • +
    • nir/serialize: support any num_components for remaining instructions
    • +
    • ac: set swizzled bit in cache policy as a hint not to merge loads/stores
    • +
    • radeonsi: initialize the per-context compiler on demand
    • +
    • radeonsi/nir: don't run si_nir_opts again if there is no change
    • +
    • st/mesa: don't serialize all streamout state if there are no SO outputs
    • +
    • st/mesa: don't use redundant stp->state.ir.nir
    • +
    • st/mesa: don't call ProgramStringNotify in glsl_to_nir
    • +
    • st/mesa: propagate gl_PatchVerticesIn from TCS to TES before linking for NIR
    • +
    • st/mesa: simplify looping over linked shaders when linking NIR
    • +
    • st/mesa: don't use ** in the st_nir_link_shaders signature
    • +
    • st/mesa: add st_variant base class to simplify code for shader variants
    • +
    • ac/nir: don't rely on data.patch for tess factors
    • +
    • radeonsi/nir: implement subgroup system values for SPIR-V
    • +
    • radeonsi: simplify the interface of get_dw_address_from_generic_indices
    • +
    • radeonsi: simplify get_tcs_tes_buffer_address_from_generic_indices
    • +
    • radeonsi/nir: validate is_patch because SPIR-V doesn't set it for tess factors
    • +
    • radeonsi/nir: don't rely on data.patch for tess factors
    • +
    • radeonsi/nir: fix location_frac handling for TCS outputs
    • +
    • radeonsi/nir: support interface output types to fix SPIR-V xfb piglits
    • +
    • radeonsi: enable SPIR-V and GL 4.6 for NIR
    • +
    • util/driconfig: print ATTENTION if MESA_DEBUG=silent is not set
    • +
    • radeonsi/gfx10: simplify some duplicated NGG GS code
    • +
    • radeonsi/gfx10: fix the vertex order for triangle strips emitted by a GS
    • +
    • llvmpipe: implement TEX_LZ and TXF_LZ opcodes
    • +
    • gallivm: implement LOAD with CONSTBUF but don't enable it for llvmpipe
    • +
    • st/mesa: support UBOs for Selection/Feedback/RasterPos
    • +
    • st/mesa: save currently bound vertex samplers and sampler views in st_context
    • +
    • st/mesa: support samplers for Selection/Feedback/RasterPos
    • +
    • st/mesa: support SSBOs for Selection/Feedback/RasterPos
    • +
    • st/mesa: support shader images for Selection/Feedback/RasterPos
    • +
    • st/mesa: use a separate VS variant for the draw module
    • +
    • st/mesa: remove st_vp_variant::num_inputs
    • +
    • st/mesa: remove struct st_vp_variant in favor of st_common_variant
    • +
    • st/mesa: don't generate VS TGSI if NIR is enabled
    • +
    • draw, st/mesa: generate TGSI for ffvp/ARB_vp if draw lacks LLVM
    • +
    • st/mesa: release the draw shader properly to fix driver crashes (iris)
    • +
    • st/dri: assume external consumers of back buffers can write to the buffers
    • +
    • radeonsi: enable NIR by default and document GL 4.6 support
    • +
    • radeonsi/gfx10: disable vertex grouping
    • +
    • radeonsi/gfx10: simplify the tess_turns_off_ngg condition
    • +
    • radeonsi: don't rely on CLEAR_STATE to set PA_SC_GENERIC_SCISSOR_*
    • +
    • ac: fix ac_get_i1_sgpr_mask for Wave32
    • +
    • ac: fix the return value in cull_bbox when bbox culling is disabled
    • +
    • radeonsi: deduplicate ES and GS thread enablement code
    • +
    • radeonsi: disallow compute-based culling if polygon mode is enabled
    • +
    • radeonsi: set is_monolithic for VS prologs when the shader is really monolithic
    • +
    • radeonsi: don't wrap the VS prolog in if (ES thread) .. endif
    • +
    • radeonsi/gfx10: don't insert NGG streamout atomics if they are never used
    • +
    • radeonsi: allow generating VS prologs with 0 inputs
    • +
    • radeonsi: fix determining whether the VS prolog is needed
    • +
    • radeonsi: reset more fields in si_llvm_context_set_ir to fix reusing ctx
    • +
    • radeonsi/gfx10: fix ngg_get_ordered_id
    • +
    • amd/addrlib: update to the latest version
    • +
    • ac/surface: fix an assertion failure on gfx9 in CMASK computation
    • +
    • radeonsi/gfx10: don't declare any LDS for NGG if it's not used
    • +
    • radeonsi/gfx10: enable NGG passthrough for eligible shaders
    • +
    • radeonsi/gfx10: improve performance for TES using PrimID but not exporting it
    • +
    • Revert "u_vbuf: Regard non-constant vbufs with non-instance elements as free"
    • +
    • winsys/radeon: initialize pte_fragment_size
    • +
    • radeonsi: preserve the scanout flag for shared resources on gfx9 and gfx10
    • +
    • radeonsi: ignore PIPE_BIND_SCANOUT for imported textures
    • +
    • radeonsi: remove the "display_dcc_offset == 0" assertion
    • +
    • radeonsi: rename SDMA debug flags
    • +
    • radeonsi: remove broken and unused SI SDMA image copy code
    • +
    • radeonsi: add AMD_DEBUG=nodmaclear for debugging
    • +
    • radeonsi: add AMD_DEBUG=nodmacopyimage for debugging
    • +
    • radeonsi: rename dma_cs -> sdma_cs
    • +
    • radeonsi: move SI and CIK+ SDMA code into 1 common function for cleanups
    • +
    • radeonsi: disable SDMA on gfx8 to fix corruption on RX 580
    • +
    • radeonsi: remove TGSI
    • +
    • gallium: put u_vbuf_get_caps return values into u_vbuf_caps
    • +
    • gallium/cso_context: move non-vbuf vertex buffer and element code into helpers
    • +
    • gallium: bypass u_vbuf if it's not needed (no fallbacks and no user VBOs)
    • +
    • ac/gpu_info: always use distributed tessellation on gfx10
    • +
    • radeonsi: fix monolithic pixel shaders with two-sided colors and SampleMaskIn
    • +
    • radeonsi: fix context roll tracking in si_emit_shader_vs
    • +
    • radeonsi: test polygon mode enablement accurately
    • +
    • radeonsi: determine accurately if line stippling is enabled for performance
    • +
    • radeonsi: clean up messy si_emit_rasterizer_prim_state
    • +
    • ac: unify build_sendmsg_gs_alloc_req
    • +
    • ac: unify primitive export code
    • +
    • ac/gpu_info: add pc_lines and use it in radeonsi
    • +
    • ac: add 128-bit bitcount
    • +
    • ac: add ac_build_s_endpgm
    • +
    • radeonsi/gfx9: force the micro tile mode for MSAA resolve correctly on gfx9
    • +
    • radeonsi: rename desc_list_byte_size -> vb_desc_list_alloc_size
    • +
    • radeonsi: add si_context::num_vertex_elements
    • +
    • radeonsi: don't allow draw calls with uninitialized VS inputs
    • +
    • radeonsi: simplify si_set_vertex_buffers
    • +
    • ac,radeonsi: increase the maximum number of shader args and return values
    • +
    • radeonsi: put up to 5 VBO descriptors into user SGPRs
    • +
    • radeonsi: don't enable VBOs in user SGPRs if compute-based culling can be used
    • +
    • radeonsi: fix assertion and other failures in si_emit_graphics_shader_pointers
    • +
    • radeonsi: actually enable VBOs in user SGPRs
    • +
    • radeonsi: don't adjust depth and stencil PS output locations
    • +
    • radeonsi: rename DBG_NO_TGSI -> DBG_NO_NIR
    • +
    • radeonsi: remove TGSI from comments
    • +
    • radeonsi: rename si_shader_info -> si_shader_binary_info
    • +
    • radeonsi: fork tgsi_shader_info and tgsi_tessctrl_info
    • +
    • radeonsi: merge si_tessctrl_info into si_shader_info
    • +
    • radeonsi: clean up si_shader_info
    • +
    • radeonsi: rename si_compile_tgsi_main -> si_build_main_function
    • +
    • radeonsi: rename si_shader_create -> si_create_shader_variant for clarity
    • +
    • radeonsi: fold si_create_function into si_llvm_create_func
    • +
    • radeonsi: remove always constant ballot_mask_bits from si_llvm_context_init
    • +
    • radeonsi: move PS LLVM code into si_shader_llvm_ps.c
    • +
    • radeonsi: separate code computing info for small primitive culling
    • +
    • ac/cull: don't read Position.Z if it's not needed for culling
    • +
    • radeonsi: make si_insert_input_* functions non-static
    • +
    • radeonsi: move VS_STATE.LS_OUT_PATCH_SIZE a few bits higher to make space there
    • +
    • radeonsi/gfx10: separate code for getting edgeflags from the gs_invocation_id VGPR
    • +
    • radeonsi/gfx10: separate code for determining the number of vertices for NGG
    • +
    • radeonsi: fix si_build_wrapper_function for compute-based primitive culling
    • +
    • radeonsi: work around an LLVM crash when using llvm.amdgcn.icmp.i64.i1
    • +
    • radeonsi: move si_insert_input_* functions
    • +
    • radeonsi: move tessellation shader code into si_shader_llvm_tess.c
    • +
    • radeonsi: remove llvm_type_is_64bit
    • +
    • radeonsi: move geometry shader code into si_shader_llvm_gs.c
    • +
    • radeonsi: move code for shader resources into si_shader_llvm_resources.c
    • +
    • radeonsi: remove useless #includes
    • +
    • radeonsi: merge si_compile_llvm and si_llvm_compile functions
    • +
    • gallium: add st_context_iface::flush_resource to call FLUSH_VERTICES
    • +
    • st/dri: do FLUSH_VERTICES before calling flush_resource
    • +
    • Revert "radeonsi: unbind image before compute clear"
    • +
    • radeonsi: clean up how internal compute dispatches are handled
    • +
    • radeonsi: don't invoke decompression inside internal launch_grid
    • +
    • radeonsi: fix doubles and int64
    • +
    • radeonsi: turn an assertion into return in si_nir_store_output_tcs
    • +
    • ac: add prefix bitcount functions
    • +
    • ac: add ac_build_readlane without optimization barrier
    • +
    • radeonsi/gfx10: update comments and remove invalid TODOs
    • +
    • radeonsi/gfx10: correct VS PrimitiveID implementation for NGG
    • +
    • radeonsi/gfx10: move s_sendmsg gs_alloc_req to the beginning of shaders
    • +
    • radeonsi/gfx10: export primitives at the beginning of VS/TES
    • +
    • radeonsi/gfx10: merge main and pos/param export IF blocks into one if possible
    • +
    • radeonsi/gfx10: don't initialize VGPRs not used by NGG passthrough
    • +
    • radeonsi/gfx10: move GE_PC_ALLOC setting to shader states
    • +
    • radeonsi/gfx10: implement NGG culling for 4x wave32 subgroups
    • +
    • ac: add helper ac_build_triangle_strip_indices_to_triangle
    • +
    • radeonsi/gfx10: rewrite late alloc computation
    • +
    • radeonsi/gfx10: enable GS fast launch for triangles and strips with NGG culling
    • +
    • radeonsi: use ctx->ac. for types and integer constants
    • +
    • radeonsi: move non-LLVM code out of si_shader_llvm.c
    • +
    • radeonsi: move VS shader code into si_shader_llvm_vs.c
    • +
    • radeonsi: move si_shader_llvm_build.c content into si_shader_llvm.c
    • +
    • radeonsi: minor cleanup in si_shader_internal.h
    • +
    • radeonsi: move si_nir_build_llvm into si_shader_llvm.c
    • +
    • radeonsi: fold si_shader_context_set_ir into si_build_main_function
    • +
    • radeonsi: move more LLVM functions into si_shader_llvm.c
    • +
    • radeonsi: make si_compile_llvm return bool
    • +
    • radeonsi: make si_compile_shader return bool
    • +
    • radeonsi: change prototypes of si_is_multi_part_shader & si_is_merged_shader
    • +
    • radeonsi: separate LLVM compilation from non-LLVM code
    • +
    • util/simple_mtx: add a missing include to get ASSERTED
    • +
    • gallium/util: add a cache of live shaders for shader CSO deduplication
    • +
    • radeonsi: use the live shader cache
    • +
    • radeonsi: restructure si_shader_cache_load_shader
    • +
    • radeonsi: print shader cache stats with AMD_DEBUG=cache_stats
    • +
    • radeonsi: expose shader cache stats to the HUD
    • +
    • radeonsi: make screen available to shader part compilation
    • +
    • radeonsi: fix a regression since the addition of si_shader_llvm_vs.c
    • +
    • Revert "winsys/amdgpu: Close KMS handles for other DRM file descriptions"
    • +
    • Revert "winsys/amdgpu: Re-use amdgpu_screen_winsys when possible"
    • +
    • radeonsi: don't report that multi-plane formats are supported
    • +
    • radeonsi: fix the DCC MSAA bug workaround
    • +
    • radeonsi: don't wait for shader compilation to finish when destroying a context
    • +

      +

      Marek Vasut (5):

      +
    • etnaviv: Replace bitwise OR with logical OR
    • +
    • etnaviv: tgsi: Fix gl_FrontFacing support
    • +
    • etnaviv: Report correct number of vertex buffers
    • +
    • etnaviv: Do not filter out PIPE_FORMAT_S8_UINT_Z24_UNORM on pre-HALTI2
    • +
    • etnaviv: Destroy rsc->pending_ctx set in etna_resource_destroy()
    • +

      +

      Mark Janes (3):

      +
    • Revert "st/mesa: call nir_serialize only once per shader"
    • +
    • Revert "st/mesa: keep serialized NIR instead of nir_shader in st_program"
    • +
    • iris: separating out common perf code
    • +

      +

      Markus Wick (3):

      +
    • mapi/glapi: Generate sizeof() helpers instead of fixed sizes.
    • +
    • mesa/glthread: Implement ARB_multi_bind.
    • +
    • drirc: Enable glthread for dolphin/citra/yuzu.
    • +

      +

      Martin Fuzzey (1):

      +
    • etnaviv: update Android build files
    • +

      +

      Mathias Fröhlich (1):

      +
    • egl: Implement getImage/putImage on pbuffer swrast.
    • +

      +

      Matt Turner (19):

      +
    • intel/compiler: Use ARRAY_SIZE()
    • +
    • intel/compiler: Extract GEN_* macros into separate file
    • +
    • intel/compiler: Split has_64bit_types into float/int
    • +
    • intel/compiler: Don't disassemble align1 3-src operands on Gen < 10
    • +
    • intel/compiler: Limit compaction unit tests to specific gens
    • +
    • intel/compiler: Add NF some more places
    • +
    • intel/compiler: Add a INVALID_{,HW_}REG_TYPE macros
    • +
    • intel/compiler: Split hw_type tables
    • +
    • intel/compiler: Handle invalid inputs to brw_reg_type_to_*()
    • +
    • intel/compiler: Handle invalid compacted immediates
    • +
    • intel/compiler: Factor out brw_validate_instruction()
    • +
    • intel/compiler: Validate some instruction word encodings
    • +
    • intel/compiler: Add unit tests for new EU validation checks
    • +
    • intel/compiler: Validate fuzzed instructions
    • +
    • intel/compiler: Test compaction on Gen <= 12
    • +
    • gitlab-ci: Skip ext_timer_query/time-elapsed
    • +
    • intel/compiler: Move Gen4/5 rounding to visitor
    • +
    • util: Explain BITSET_FOREACH_SET params
    • +
    • util: Remove tmp argument from BITSET_FOREACH_SET macro
    • +

      +

      Mauro Rossi (9):

      +
    • android: aco: fix Lower to CSSA
    • +
    • android: radeonsi: fix build error due to wrong u_format.csv file path
    • +
    • android: util/format: fix include path list
    • +
    • android: radeonsi: fix build after vl refactoring (v2)
    • +
    • android: nir: add a load/store vectorization pass
    • +
    • android: util: Add a mapping from VkFormat to PIPE_FORMAT.
    • +
    • android: radv: fix vk_format_table.c generated source build
    • +
    • android: radeonsi,ac: fix building error due to ac changes
    • +
    • android: radv: build radv_shader_args.c
    • +

      +

      Michel Dänzer (36):

      +
    • gitlab-ci: Set arm job CCACHE_DIR properly
    • +
    • gitlab-ci: Use separate arm64 build/test docker images
    • +
    • gitlab-ci: Don't build libdrm for ARM
    • +
    • gitlab-ci: Use ninja -j4 for building dEQP
    • +
    • gitlab-ci: Move artifact preparation to separate script
    • +
    • gitlab-ci: Share dEQP build process between x86 & ARM test image scripts
    • +
    • gitlab-ci: Sort packages in debian-install.sh
    • +
    • gitlab-ci: Run piglit tests with llvmpipe
    • +
    • gitlab-ci: Use separate docker images for x86 build/test jobs
    • +
    • gitlab-ci: Delete install/bin from artifacts as well
    • +
    • gitlab-ci: Document that ci-templates refs must be in sync
    • +
    • gitlab-ci: Use functional container job names
    • +
    • gitlab-ci: Rename container install scripts to match job names (better)
    • +
    • gitlab-ci: Organize images using new REPO_SUFFIX templates feature
    • +
    • gitlab-ci: Directly use host-mapped directory for ccache
    • +
    • gitlab-ci: Stop reporting piglit test results via JUnit
    • +
    • gitlab-ci: Stop storing piglit test results as JUnit
    • +
    • gitlab-ci: Put HTML summary in artifacts for failed piglit jobs
    • +
    • gitlab-ci: Update to current ci-templates master
    • +
    • gitlab-ci: Run piglit glslparser & quick_shader tests separately
    • +
    • glsl/tests: Use splitlines() instead of strip()
    • +
    • gitlab-ci: Use the common run policy for LAVA jobs as well again
    • +
    • gitlab-ci: Overhaul job run policy
    • +
    • gitlab-ci: Don't exclude any piglit quick_shader tests
    • +
    • gitlab-ci: Test against LLVM / clang 9 on x86
    • +
    • gitlab-ci: Stop using manual jobs for merge requests
    • +
    • gitlab-ci: Set GIT_STRATEGY to none for the dummy job
    • +
    • gitlab-ci: Use single if for manual job rules entry
    • +
    • winsys/amdgpu: Keep a list of amdgpu_screen_winsyses in amdgpu_winsys
    • +
    • winsys/amdgpu: Keep track of retrieved KMS handles using hash tables
    • +
    • winsys/amdgpu: Only re-export KMS handles for different DRM FDs
    • +
    • util: Add os_same_file_description helper
    • +
    • winsys/amdgpu: Re-use amdgpu_screen_winsys when possible
    • +
    • winsys/amdgpu: Close KMS handles for other DRM file descriptions
    • +
    • winsys/amdgpu: Re-use amdgpu_screen_winsys when possible
    • +
    • winsys/amdgpu: Close KMS handles for other DRM file descriptions
    • +

      +

      Michel Zou (3):

      +
    • Meson: Check for dladdr with MinGW
    • +
    • disk_cache_get_function_timestamp: check for dladdr
    • +
    • Meson: Add llvm>=9 modules
    • +

      +

      Miguel Casas-Sanchez (1):

      +
    • i965: Ensure that all 2101010 image imports can pass framebuffer completeness.
    • +

      +

      Nanley Chery (3):

      +
    • gallium/dri2: Fix creation of multi-planar modifier images
    • +
    • gallium: Store the image format in winsys_handle
    • +
    • iris: Fix import of multi-planar surfaces with modifiers
    • +

      +

      Nataraj Deshpande (1):

      +
    • egl/android: Restrict minimum triple buffering for android color_buffers
    • +

      +

      Nathan Kidd (1):

      +
    • llvmpipe: Check thread creation errors
    • +

      +

      Neha Bhende (3):

      +
    • st/mesa: release tgsi tokens for shader states
    • +
    • svga: fix size of format_conversion_table[]
    • +
    • svga: Use pipe_shader_state_from_tgsi to set shader state
    • +

      +

      Neil Armstrong (3):

      +
    • Add support for T820 CI Jobs
    • +
    • ci: Remove T820 from CI temporarily
    • +
    • gitlab-ci/lava: add pipeline information in the lava job name
    • +

      +

      Neil Roberts (9):

      +
    • nir/opcodes: Add a helper function to generate the comparison binops
    • +
    • nir/opcodes: Add a helper function to generate reduce opcodes
    • +
    • nir: Add a 16-bit bool type
    • +
    • nir: Add a 8-bit bool type
    • +
    • nir/lower_alu_to_scalar: Support lowering 8- and 16-bit reduce ops
    • +
    • freedreno/ir3: Support 16-bit comparison instructions
    • +
    • freedreno/ir3: Add implementation of nir_op_b16csel
    • +
    • freedreno/ir3: Implement f2b16 and i2b16
    • +
    • freedreno/ir3: Enabling lowering 16-bit flrp
    • +

      +

      Paul Cercueil (5):

      +
    • kmsro: Extend to include ingenic-drm
    • +
    • u_vbuf: Mark vbufs incompatible if more were requested than HW supports
    • +
    • u_vbuf: Only create driver CSO if no incompatible elements
    • +
    • u_vbuf: Regard non-constant vbufs with non-instance elements as free
    • +
    • u_vbuf: Return true in u_vbuf_get_caps if nb of vbufs is below minimum
    • +

      +

      Paul Gofman (1):

      +
    • state_tracker: Handle texture view min level in st_generate_mipmap()
    • +

      +

      Paulo Zanoni (2):

      +
    • intel/compiler: remove the operand restriction for src1 on GLK
    • +
    • intel/compiler: fix nir_op_{i,u}*32 on ICL
    • +

      +

      Peng Huang (1):

      +
    • radeonsi: make si_fence_server_signal flush pipe without work
    • +

      +

      Philipp Sieweck (1):

      +
    • svga: check return value of define_query_vgpu{9,10}
    • +

      +

      Pierre Moreau (4):

      +
    • compiler/spirv: Fix uses of gnu struct = {} extension
    • +
    • include/CL: Update OpenCL headers to latest
    • +
    • clover: Use the dispatch table type from the OpenCL headers
    • +
    • clover/meson: Define OpenCL header macros
    • +

      +

      Pierre-Eric Pelloux-Prayer (54):

      +
    • radeonsi: tell the shader disk cache what IR is used
    • +
    • mesa: enable msaa in clear_with_quad if needed
    • +
    • mesa: pass vao as a function paramter
    • +
    • mesa: add EXT_dsa glVertexArray* functions declarations
    • +
    • mesa: rework _mesa_lookup_vao_err to allow usage from EXT_dsa
    • +
    • mesa: add vao/vbo lookup helper for EXT_dsa
    • +
    • mesa: add EXT_dsa glVertexArray* functions implementation
    • +
    • mesa: add gl_vertex_array_object parameter to client state helpers
    • +
    • mesa: add EXT_dsa glEnableVertexArrayEXT / glDisableVertexArrayEXT
    • +
    • mesa: add EXT_dsa EnableVertexArrayAttribEXT / DisableVertexArrayAttribEXT
    • +
    • mesa: extract helper function from _mesa_GetPointerv
    • +
    • mesa: add EXT_dsa glGetVertexArray* 4 functions
    • +
    • mesa: fix call to _mesa_lookup_vao_err
    • +
    • radeonsi: fix shader disk cache key
    • +
    • radeonsi: enable mesa_glthread for GfxBench
    • +
    • mesa: update features.txt to reflect EXT_dsa status
    • +
    • mesa: add ARB_framebuffer_no_attachments named functions
    • +
    • mesa: add ARB_vertex_attrib_64bit VertexArrayVertexAttribLOffsetEXT
    • +
    • mesa: add ARB_clear_buffer_object named functions
    • +
    • mesa: add ARB_gpu_shader_fp64 selector-less functions
    • +
    • mesa: add ARB_instanced_arrays EXT_dsa function
    • +
    • mesa: add ARB_texture_buffer_range glTextureBufferRangeEXT function
    • +
    • mesa: implement ARB_texture_storage_multisample + EXT_dsa functions
    • +
    • mesa: extend vertex_array_attrib_format to support EXT_dsa
    • +
    • mesa: add ARB_vertex_attrib_binding glVertexArray* functions
    • +
    • mesa: add ARB_sparse_buffer NamedBufferPageCommitmentEXT function
    • +
    • mesa: enable EXT_direct_state_access
    • +
    • mesa: fix warning in 32 bits build
    • +
    • radeonsi: implement sdma for GFX9
    • +
    • radeonsi: display cs blit count for AMD_DEBUG=testdma
    • +
    • radeonsi: use gfx9.surf_offset to compute texture offset
    • +
    • radeonsi: fix multi plane buffers creation
    • +
    • radeonsi: dcc dirty flag
    • +
    • st/mesa: add a notify_before_flush callback param to flush
    • +
    • st/dri: use st->flush callback to flush the backbuffer
    • +
    • radeonsi: disable dcc for 2x MSAA surface and bpe < 4
    • +
    • gallium: refuse to create buffers larger than UINT32_MAX
    • +
    • radeon/vcn2: enable rate control for hevc encoding
    • +
    • radeonsi: check ctx->sdma_cs before using it
    • +
    • radeonsi: release saved resources in si_retile_dcc
    • +
    • radeonsi: release saved resources in si_compute_expand_fmask
    • +
    • radeonsi: release saved resources in si_compute_clear_render_target
    • +
    • radeonsi: release saved resources in si_compute_copy_image
    • +
    • radeonsi: release saved resources in si_compute_clear_12bytes_buffer
    • +
    • radeonsi: release saved resources in si_compute_do_clear_or_copy
    • +
    • radeonsi: fix fmask expand compute shader
    • +
    • radeonsi: make sure fmask expand is done if needed
    • +
    • radeonsi: unbind image before compute clear
    • +
    • radeonsi: drop the negation from fmask_is_not_identity
    • +
    • util: call bind_sampler_states before setting sampler_views
    • +
    • radeonsi: move AMD_DEBUG tests to AMD_TEST
    • +
    • docs: document AMD_DEBUG variable
    • +
    • radeonsi: stop using the VM_ALWAYS_VALID flag
    • +
    • radeonsi/ngg: add VGT_FLUSH when enabling fast launch
    • +

      +

      Prodea Alexandru-Liviu (2):

      +
    • Meson: Remove lib prefix from graw and osmesa when building with Mingw. Also remove version sufix from osmesa swrast on Windows.
    • +
    • Appveyor: Quickly fix meson build. As this required use of Python 3.8, mako module also had to be updated.
    • +

      +

      Qiang Yu (3):

      +
    • lima: sync lima_drm.h with kernel
    • +
    • lima: create heap buffer with new interface if available
    • +
    • lima: add noheap debug option
    • +

      +

      Rafael Antognolli (23):

      +
    • intel/isl: Add MOCS settings to isl_device.
    • +
    • anv: Use mocs settings from isl_dev.
    • +
    • iris: Use mocs from isl_dev.
    • +
    • intel: Add workaround for stencil state.
    • +
    • intel/genxml: Add 3DSTATE_CONSTANT_ALL packet.
    • +
    • intel/aubinator: Decode 3DSTATE_CONSTANT_ALL.
    • +
    • intel/blorp: Use 3DSTATE_CONSTANT_ALL to setup push constants.
    • +
    • iris: Rework push constants emitting code.
    • +
    • iris: Use 3DSTATE_CONSTANT_ALL when possible.
    • +
    • anv: Move gen8+ push constant packet workaround.
    • +
    • anv: Add get_push_range_address() helper.
    • +
    • anv: Move code for emitting push constants into its own function.
    • +
    • anv: Use 3DSTATE_CONSTANT_ALL when possible.
    • +
    • iris: Add restriction to 3DSTATE_CONSTANT_ packets.
    • +
    • util/os_socket: Add socket related functions.
    • +
    • vulkan/overlay: Add a control socket.
    • +
    • vulkan/overlay: Add support for a control socket.
    • +
    • vulkan/overlay: Add a command to start capturing data to a file.
    • +
    • vulkan/overlay: Add basic overlay control script.
    • +
    • vulkan/overlay: Update docs.
    • +
    • iris: Implement WA for push constants.
    • +
    • utils/os_socket: Define ssize_t on windows.
    • +
    • intel: Load the driver even if I915_PARAM_REVISION is not found.
    • +

      +

      Rhys Perry (131):

      +
    • radv: adjust loop unrolling heuristics for int64
    • +
    • aco: add Instruction::usesModifiers() and add more checks in the optimizer
    • +
    • radv: fix radv_nir_get_max_workgroup_size when nir=NULL
    • +
    • aco: use DPP instead of exec modification when lowering GFX10 shuffles
    • +
    • aco: fix shuffle with uniform operands
    • +
    • nir/divergence: improve DA of shuffle
    • +
    • aco: fix read_invocation with VGPR lane index
    • +
    • aco: don't propagate vgprs into v_readlane/v_writelane
    • +
    • aco: combine read_invocation and shuffle implementations
    • +
    • radv: enable FP16/FP64 denormals earlier and only for LLVM
    • +
    • aco: don't combine literals into v_cndmask_b32/v_subb/v_addc
    • +
    • aco: fix 64-bit fsign with 0
    • +
    • aco: implement VK_KHR_shader_float_controls
    • +
    • aco: refactor reduction lowering helpers
    • +
    • aco: implement 64-bit integer reductions
    • +
    • radv/aco: enable VK_KHR_shader_subgroup_extended_types
    • +
    • nir: make nir_variable::{num_members,num_state_slots} a uint16_t
    • +
    • nir: add nir_variable::index and nir_index_vars
    • +
    • nir/large_constants: use nir_index_vars and nir_variable::index
    • +
    • docs: update features.txt for RADV
    • +
    • aco: improve waitcnt insertion around loops
    • +
    • aco: fix copy+paste error
    • +
    • aco: fix waitcnts for barriers at block ends
    • +
    • nir: add nir_num_variable_modes and nir_var_mem_push_const
    • +
    • radv: set alignment for load_ssbo/store_ssbo in meta shaders
    • +
    • nir: add a load/store vectorization pass
    • +
    • nir: add load/store vectorizer tests
    • +
    • aco: enable load/store vectorizer
    • +
    • aco: allow constant offsets for global/scratch instructions on GFX10
    • +
    • aco: set dlc/glc correctly for image loads
    • +
    • aco: propagate p_wqm on an image_sample's coordinate p_create_vector
    • +
    • aco: fix i2i64
    • +
    • aco: fix incorrect cast in parse_wait_instr()
    • +
    • aco: add v_nop inbetween exec write and VMEM/DS/FLAT
    • +
    • aco: improve WAR hazard workaround with >64bit stores
    • +
    • aco: fix GFX10 opcodes for some global/flat atomics
    • +
    • aco: fix assembly of FLAT/GLOBAL atomics
    • +
    • aco: fix SADDR with FLAT on GFX10
    • +
    • aco: don't enable store_global for helper invocations
    • +
    • aco: improve FLAT/GLOBAL scheduling
    • +
    • aco: implement global atomics
    • +
    • ac/llvm: fix pointer type for global atomics
    • +
    • ac/llvm: improve sync scope for global atomics
    • +
    • radv: set writes_memory for global memory stores/atomics
    • +
    • aco: validate the CFG
    • +
    • aco: handle loop exit and IF merge phis with break/discard
    • +
    • aco: fix block_kind_discard s_andn2 definition to exec
    • +
    • nir/lower_io_to_vector: don't create arrays when not needed
    • +
    • nir/load_store_vectorize: fix combining stores with aliasing loads between
    • +
    • aco/wave32: fix comparison optimizations
    • +
    • aco: improve jump threading with wave32
    • +
    • aco: fix vgpr alloc granule with wave32
    • +
    • aco: limit register usage for large work groups
    • +
    • aco: set vm for pos0 exports on GFX10
    • +
    • aco: fix imageSize()/textureSize() with large buffers on GFX8
    • +
    • aco: fix uninitialized data in the binary
    • +
    • aco: handle VOP3 modifiers when combining a constant comparison's NaN test
    • +
    • aco: handle omod successors with the constant in the first operand
    • +
    • aco: check usesModifiers() when identifying a neg/abs
    • +
    • aco: better handle neg/abs of sgprs
    • +
    • aco: set exec_potentially_empty for demotes
    • +
    • aco: don't DCE atomics with return values
    • +
    • aco: disable add combining for ds_swizzle_b32
    • +
    • aco: check if multiplication/clamp is live when applying output modifier
    • +
    • nir/divergence: handle load_primitive_id in GS
    • +
    • nir/lower_gs_intrinsics: add option for per-stream counts
    • +
    • aco: update IR validator
    • +
    • aco: apply literals to split mads
    • +
    • aco: combine two sgprs into a VALU if they're the same
    • +
    • aco: improve can_use_VOP3()
    • +
    • aco: rewrite literal combining
    • +
    • aco: rewrite apply_sgprs()
    • +
    • aco: add check_vop3_operands()
    • +
    • aco: be more careful with literals in combine_salu_{n2,lshl_add}
    • +
    • aco: follow through temporary when merging tests into constant comparisons
    • +
    • aco: allow applying two sgprs to an instruction
    • +
    • aco: allow an extra SGPR with multiple uses to be applied to VOP3
    • +
    • aco: take advantage of GFX10's constant bus limit and VOP3 literals
    • +
    • aco: improve creation of v_madmk_f32/v_madak_f32
    • +
    • aco: fix clamp optimization
    • +
    • aco: improve clamp optimization
    • +
    • aco: add min(-max(), ) and max(-min(), ) optimization
    • +
    • aco: don't move literal to reg when making an instruction VOP3 on GFX10
    • +
    • aco: allow input modifiers on v_cndmask_b32
    • +
    • aco: replace extract_vector with copies
    • +
    • aco: improve readfirstlane after uniform LDS loads
    • +
    • aco: add integer min/max to can_swap_operands
    • +
    • nir/sink,nir/move: move/sink load_per_vertex_input
    • +
    • nir/sink,nir/move: move/sink nir_op_mov
    • +
    • nir/algebraic: a & ~(a >> 31) -> imax(a, 0)
    • +
    • aco: fix stack buffer overflow in apply_sgprs()
    • +
    • aco: fix fall-through test in try_remove_simple_block() with back-edges
    • +
    • aco: fix operand kill flags when a temporary is used more than once
    • +
    • aco: fix off-by-one error when initializing sgpr_live_in
    • +
    • radv: move gs copy shader creation before other variants
    • +
    • aco: improve support for s_sendmsg
    • +
    • radv/aco,aco: implement GS on GFX9+
    • +
    • aco: implement GS on GFX7-8
    • +
    • radv/aco: allow ACO for GS
    • +
    • aco: explicitly mark end blocks for exports
    • +
    • aco: remove needs_instance_id
    • +
    • aco: implement GS copy shaders
    • +
    • radv/aco: use ACO for GS copy shaders
    • +
    • aco: use nir_move_copies
    • +
    • aco: fix WaR check for >64-bit FLAT/GLOBAL instructions
    • +
    • aco: fix operand to scc when selecting SGPR ufind_msb/ifind_msb
    • +
    • aco: always add sgprs to sgpr_ids when choosing literals
    • +
    • aco: fix literal application with v_cndmask_b32/v_addc_co_u32/etc
    • +
    • amd/common,radv: move vertex_format_table to ac_shader_util.{h,c}
    • +
    • aco: rework vertex fetching a bit
    • +
    • aco: skip unused channels at the start when fetching vertices
    • +
    • aco: handle unaligned vertex fetch on GFX10
    • +
    • aco: value-number MUBUF instructions
    • +
    • aco: use MUBUF in some situations instead of splitting vertex fetches
    • +
    • aco: fix rebase error from GS copy shader support
    • +
    • aco: ensure predecessors' p_logical_end is in WQM when a p_phi is in WQM
    • +
    • aco: run p_wqm instructions in WQM
    • +
    • nir/algebraic: add patterns for a >> #b << #b
    • +
    • nir/algebraic: add some half packing optimizations
    • +
    • aco: fix target calculation when vgpr spilling introduces sgpr spilling
    • +
    • aco: don't consider loop header blocks branch blocks in add_coupling_code
    • +
    • aco: don't update demand in add_coupling_code() for loop headers
    • +
    • aco: only create parallelcopy to restore exec at loop exit if needed
    • +
    • aco: don't always add logical edges from continue_break blocks to headers
    • +
    • aco: error when block has no logical preds but VGPRs are live at the start
    • +
    • aco: set exec_potentially_empty after continues/breaks in nested IFs
    • +
    • aco: improve assertion at the end of spiller
    • +
    • aco: fill reg_demand with sensible information in add_coupling_code()
    • +
    • aco: parallelcopy exec mask before s_wqm
    • +
    • aco: fix exec mask consistency issues
    • +
    • aco: fix gfx10_wave64_bpermute
    • +

      +

      Ricardo Garcia (1):

      +
    • anv: Unify GetDeviceQueue and GetDeviceQueue2
    • +

      +

      Rob Clark (89):

      +
    • freedreno/ir3: split pre-coloring to it's own function
    • +
    • freedreno/ir3: use SSA flag on dest register too
    • +
    • freedreno/ir3: ir3_print tweaks
    • +
    • freedreno/ir3/ra: move regs_count==0 check
    • +
    • freedreno/ir3/ra: remove ir print after livein/out
    • +
    • freedreno/ir3: remove obsolete comment
    • +
    • freedreno/a3xx: fix SP_FS_MRT_REG.HALF_PRECISION
    • +
    • freedreno/a4xx: fix SP_FS_MRT_REG.HALF_PRECISION
    • +
    • freedreno/ir3: sync disasm changes from envytools
    • +
    • freedreno/ir3: also track # of nops for shader-db
    • +
    • freedreno: fix eglDupNativeFenceFD error
    • +
    • freedreno/ir3: fix valgrind complaint with STLW
    • +
    • freedreno/ir3: remove half-precision output
    • +
    • freedreno/ir3: rename fanin/fanout to collect/split
    • +
    • freedreno/ir3: remove impossible condition
    • +
    • freedreno/ir3: add input/output iterators
    • +
    • freedreno/ir3: show input/output wrmask's in disasm
    • +
    • freedreno/ir3: helper to print ir if debug enabled
    • +
    • freedreno/ir3: remove first-vertex sysval
    • +
    • freedreno/ir3: simplify creating sysval inputs
    • +
    • freedreno/ir3: re-work shader inputs/outputs
    • +
    • freedreno/ir3: only tex instructions have wrmask
    • +
    • freedreno/ir3: fix gpu hang with pre-fs-tex-fetch
    • +
    • freedreno/ir3: legalize cleanups
    • +
    • freedreno/ir3: remove unused parameter
    • +
    • freedreno/perfcntrs: small cleanup
    • +
    • freedreno/perfcntrs: remove gallium dependencies
    • +
    • freedreno/perfcntrs: move to shared location
    • +
    • freedreno/perfcntrs: add accessor to get per-gen tables
    • +
    • freedreno/perfctrs/a2xx: move CP to be first group
    • +
    • freedreno/perfcntrs/a6xx: remove RBBM counters
    • +
    • freedreno/perfcntrs: add fdperf
    • +
    • freedreno/perfctrs/fdperf: periodically restore counters
    • +
    • gitlab-ci: update deqp build so we can generate xml
    • +
    • gitlab-ci/deqp: preserve full list of unexpected results
    • +
    • gitlab-ci/deqp: preserve caselists for blocks with fails
    • +
    • gitlab-ci/deqp: detect and report flakes
    • +
    • gitlab-ci: bump arm test container
    • +
    • gitlab-ci/deqp: generate xml results for fails/flakes
    • +
    • gitlab-ci/deqp: generate junit results
    • +
    • gitlab-ci/freedreno/a6xx: remove most of the flakes
    • +
    • freedreno: use rsc->slice accessor everywhere
    • +
    • freedreno: switch to layout helper
    • +
    • gitlab-ci: disable junit results for deqp
    • +
    • freedreno/ir3: remove store_output lowered to store_shared_ir3
    • +
    • freedreno/ir3: fix neverball assert in case of unused VS inputs
    • +
    • nir/lower_clip: Fix incorrect driver loc for clipdist outputs
    • +
    • freedreno/fdperf: use drmOpen()
    • +
    • freedreno/a6xx: disable LRZ when blending
    • +
    • freedreno/a5xx+a6xx: split LRZ layout to per-gen
    • +
    • freedreno/a6xx: fix LRZ layout
    • +
    • freedreno/a6xx: fix LRZ logic
    • +
    • freedreno/a6xx: enable LRZ by default
    • +
    • spirv: add OpLifetime*
    • +
    • freedreno/ir3: add last-baryf shaderdb stat
    • +
    • freedreno/ir3: add scheduler traces
    • +
    • freedreno/ir3: add iterator macros
    • +
    • freedreno/a6xx: fix OUT_REG() vs growable cmdstream
    • +
    • nir+vtn: vec8+vec16 support
    • +
    • freedreno/ir3: fix flat shading again
    • +
    • nir: assert that nir_lower_tex runs after lowering derefs
    • +
    • mesa/st: lower samplers before nir_lower_tex
    • +
    • freedreno/ir3: rename instructions
    • +
    • gitlab-ci: fix missing caselist.css/xsl
    • +
    • freedreno/a6xx: limit scratch/debug markers to debug builds
    • +
    • freedreno/a6xx: cleanup rasterizer state
    • +
    • freedreno/a6xx: separate rast stateobj for prim restart
    • +
    • freedreno/a6xx: drop a few more per-draw registers
    • +
    • freedreno/a6xx: move dynamic program state to streaming stateobj
    • +
    • freedreno/a6xx: add PROG_FB_RAST stateobj
    • +
    • freedreno/drm: fix invalid-cmdstream-size with older kernels
    • +
    • freedreno: use PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND
    • +
    • mesa/st: random whitespace cleanup
    • +
    • freedreno/a6xx: remove special handling based on MRT format
    • +
    • freedreno/a6xx: convert blend state to stateobj
    • +
    • freedreno: extract vsc pipe bo from GMEM state
    • +
    • freedreno: consolidate GMEM state
    • +
    • freedreno: constify fd_tile
    • +
    • freedreno: constify fd_vsc_pipe
    • +
    • freedreno/a6xx: constify gmem state
    • +
    • freedreno/a5xx: constify gmem state
    • +
    • freedreno/a4xx: constify gmem state
    • +
    • freedreno/a3xx: constify gmem state
    • +
    • freedreno/a2xx: constify gmem state
    • +
    • freedreno: get GMEM state from batch
    • +
    • freedreno: add gmem state cache
    • +
    • freedreno: add gmem_lock
    • +
    • freedreno: remove flush-queue
    • +
    • freedreno: allow ctx->batch to be NULL
    • +

      +

      Robert Foss (5):

      +
    • nir: Build nir_lower_point_size.c in libmesa_nir
    • +
    • android: Add panfrost support to build scripts
    • +
    • android: Fix u_format_table.c being generated twice
    • +
    • panfrost: Prefix schedule_program to prevent collision
    • +
    • android: Fix whitespace issue
    • +

      +

      Rohan Garg (1):

      +
    • gitlab-ci: Use lavacli from packages
    • +

      +

      Roland Scheidegger (3):

      +
    • gallium/scons: fix graw_gdi build
    • +
    • util/atomic: Fix p_atomic_add for unlocked and msvc paths
    • +
    • winsys/svga: use new ioctl for logging
    • +

      +

      Roman Stratiienko (2):

      +
    • Android: Fix build issue without LLVM
    • +
    • panfrost: Fix Android build
    • +

      +

      Ross Zwisler (1):

      +
    • intel: limit shader geometry on BDW GT1
    • +

      +

      Sagar Ghuge (1):

      +
    • intel/compiler: Clear accumulator register before EOT
    • +

      +

      Samuel Iglesias Gonsálvez (1):

      +
    • main: fix coverity error in _mesa_program_resource_find_name()
    • +

      +

      Samuel Pitoiset (202):

      +
    • radv: declare NGG scratch for VS or TES and only on GFX10
    • +
    • radv: fix compute pipeline keys when optimizations are disabled
    • +
    • docs: document all RADV environment variables
    • +
    • radv: add a note about perftest/debug options
    • +
    • radv: fix 32-bit compiler warnings
    • +
    • nir: fix packing of nir_variable
    • +
    • radv/gfx10: enable wave32 for compute based on shader's wavesize
    • +
    • radv: hardcode the number of waves for the GFX6 LS-HS bug
    • +
    • radv: determine shaders wavesize at pipeline level
    • +
    • radv: rely on shader's wavesize when computing NGG info
    • +
    • radv: implement VK_EXT_subgroup_size_control
    • +
    • radv/gfx10: fix primitive indices orientation for NGG GS
    • +
    • ac: handle pointer types to LDS in ac_get_elem_bits()
    • +
    • gitlab-ci: build a specific libdrm version for ARM64
    • +
    • gitlab-ci: build RADV on ARM64
    • +
    • ac: fix build with recent LLVM
    • +
    • radv: remove useless RADV_DEBUG=unsafemath debug option
    • +
    • radv: make sure to not clear the ds attachment after resolves
    • +
    • ac: add radeon_info::has_l2_uncached
    • +
    • radv: implement VK_AMD_device_coherent_memory
    • +
    • spirv: fix lowering of OpGroupNonUniformAllEqual
    • +
    • ac: remove useless cast in ac_build_set_inactive()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_shuffle()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_readlane()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_set_inactive()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_dpp()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_swizzle()
    • +
    • ac: add 8-bit and 16-bit supports to get_reduction_identity()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_wwm()
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_optimization_barrier()
    • +
    • ac: add 16-bit float support to ac_build_alu_op()
    • +
    • radv: advertise VK_KHR_shader_subgroup_extended_types on GFX8-GFX9
    • +
    • radv: enable VK_KHR_shader_subgroup_extended_types on GFX6-GFX7
    • +
    • docs: add missing new features for RADV
    • +
    • pipe-loader: check that the pointer to driconf_xml isn't NULL
    • +
    • gitlab-ci: move building piglit into a separate script
    • +
    • gitlab-ci: fix ldd check for Vulkan drivers
    • +
    • gitlab-ci: add a job that only build things needed for testing
    • +
    • gitlab-ci: do not build with debugoptimized for meson-main
    • +
    • gitlab-ci: build swr in meson-main
    • +
    • gitlab-ci: build GLVND in meson-clang
    • +
    • gitlab-ci: remove now useless meson-swr-glvnd build job
    • +
    • gitlab-ci: reduce the number of scons build
    • +
    • radv: disable subgroup shuffle operations on GFX10
    • +
    • ac/llvm: fix the local invocation index for wave32
    • +
    • meson: only build imgui when needed
    • +
    • radv: set the image view aspect mask during subpass transitions
    • +
    • radv: set the image view aspect mask before resolves
    • +
    • radv: rework creation of decompress/resummarize meta pipelines
    • +
    • radv: create decompress pipelines for separate depth/stencil layouts
    • +
    • radv: select the depth decompress path based on the aspect mask
    • +
    • ac/llvm: fix warning in ac_build_canonicalize()
    • +
    • radv: fix reporting subgroup size with VK_KHR_pipeline_executable_properties
    • +
    • radv: fix enabling sample shading with SampleID/SamplePosition
    • +
    • radv/gfx10: fix implementation of exclusive scans
    • +
    • ac: add 8-bit and 16-bit supports to ac_build_permlane16()
    • +
    • radv: enable VK_KHR_shader_subgroup_extended_types on GFX10
    • +
    • ac/llvm: convert src operands to pointers if necessary
    • +
    • radv: add more constants to avoid using magic numbers
    • +
    • radv,ac/nir: lower deref operations for shared memory
    • +
    • aco: drop useless lowering of deref operations for shared memory
    • +
    • ac/llvm: fix atomic var operations if source isn't a deref
    • +
    • radv: remove dead shader input/output variables
    • +
    • radv: simplify a check in radv_fixup_vertex_input_fetches()
    • +
    • radv/gfx10: fix the vertex order for triangle strips emitted by a GS
    • +
    • gitlab-ci: rename build-deqp.sh to build-deqp-gl.sh
    • +
    • gitlab-ci: add a gl suffix to the x86 test image and all test jobs
    • +
    • gitlab-ci: add a new job that builds a base test image for VK
    • +
    • gitlab-ci: build cts_runner in the x86 test image for VK
    • +
    • gitlab-ci: build dEQP VK 1.1.6 in the x86 test image for VK
    • +
    • gitlab-ci: add a new base test job for VK
    • +
    • gitlab-ci: allow to run dEQP Vulkan with DEQP_VER
    • +
    • gitlab-ci: configure the Vulkan ICD export with VK_DRIVER
    • +
    • gitlab-ci: build RADV in meson-testing
    • +
    • gitlab-ci: add a job that runs Vulkan CTS with RADV conditionally
    • +
    • radv: do not use VK_TRUE/VK_FALSE
    • +
    • radv: move emission of two PA_SC_* registers to the pipeline CS
    • +
    • radv: fix possibly wrong PA_SC_AA_CONFIG value for conservative rast
    • +
    • radv: synchronize after performing a separate depth/stencil fast clears
    • +
    • radv: do not init HTILE as compressed state when dst layout allows it
    • +
    • radv: initialize HTILE for separate depth/stencil aspects
    • +
    • radv: implement VK_KHR_separate_depth_stencil_layouts
    • +
    • gitlab-ci: set RADV_DEBUG=checkir for RADV test jobs
    • +
    • ac/nir: fix out-of-bound access when loading constants from global
    • +
    • radv: enable SpvCapabilityImageMSArray
    • +
    • radv: handle unaligned vertex fetches on GFX6/GFX10
    • +
    • radv/gfx10: fix ngg_get_ordered_id
    • +
    • radv/gfx10: fix the out-of-bounds check for vertex descriptors
    • +
    • ac: declare an enum for the OOB select field on GFX10
    • +
    • radv: init a default multisample state for the resolve FS path
    • +
    • radv: ignore pMultisampleState if rasterization is disabled
    • +
    • radv: ignore pTessellationState if the pipeline doesn't use tess
    • +
    • radv: ignore pDepthStencilState if rasterization is disabled
    • +
    • radv: tidy up radv_pipeline_init_blend_state()
    • +
    • radv: ignore pColorBlendState if rasterization is disabled
    • +
    • radv: rely on pipeline layout when creating push descriptors with template
    • +
    • radv: return the correct pitch for linear mipmaps on GFX10
    • +
    • radv: record number of color/depth samples for each subpass
    • +
    • radv: implement VK_AMD_mixed_attachment_samples
    • +
    • ac/surface: use uint16_t for mipmap level pitches
    • +
    • radv: do not fill keys from fragment shader twice
    • +
    • spirv: add SpvCapabilityImageReadWriteLodAMD
    • +
    • spirv,nir: add new lod parameter to image_{load,store} intrinsics
    • +
    • amd/llvm: handle nir_intrinsic_image_deref_{load,store} with lod
    • +
    • aco: handle nir_intrinsic_image_deref_{load,store} with lod
    • +
    • radv: advertise VK_AMD_shader_image_load_store_lod
    • +
    • radv/gfx10: disable vertex grouping
    • +
    • radv/gfx10: determine if a pipeline is eligible for NGG passthrough
    • +
    • radv/gfx10: do not declare LDS for NGG if useless
    • +
    • radv/gfx10: add support for NGG passthrough mode
    • +
    • radv/gfx10: improve performance for TES using PrimID but not exporting it
    • +
    • radv: only use VkSamplerCreateInfo::compareOp if enabled
    • +
    • radv/gfx10: enable all CUs if NGG is never used
    • +
    • radv/gfx10: simplify some duplicated NGG GS code
    • +
    • vulkan/overlay: Fix for Vulkan 1.2
    • +
    • radv: update VK_EXT_descriptor_indexing for Vulkan 1.2
    • +
    • radv: update VK_EXT_host_query_reset for Vulkan 1.2
    • +
    • radv: update VK_EXT_sampler_filter_minmax for Vulkan 1.2
    • +
    • radv: update VK_EXT_scalar_block_layout for Vulkan 1.2
    • +
    • radv: update VK_KHR_8bit_storage for Vulkan 1.2
    • +
    • radv: update VK_KHR_buffer_device_address for Vulkan 1.2
    • +
    • radv: update VK_KHR_create_renderpass2 for Vulkan 1.2
    • +
    • radv: update VK_KHR_depth_stencil_resolve for Vulkan 1.2
    • +
    • radv: update VK_KHR_draw_indirect_count for Vulkan 1.2
    • +
    • radv: update VK_KHR_driver_properties for Vulkan 1.2
    • +
    • radv: update VK_KHR_image_format_list for Vulkan 1.2
    • +
    • radv: update VK_KHR_imageless_framebuffer for Vulkan 1.2
    • +
    • radv: update VK_KHR_shader_atomic_int64 for Vulkan 1.2
    • +
    • radv: update VK_KHR_shader_float16_int8 for Vulkan 1.2
    • +
    • radv: update VK_KHR_shader_float_controls for Vulkan 1.2
    • +
    • radv: update VK_KHR_shader_subgroup_extended_types for Vulkan 1.2
    • +
    • radv: update VK_KHR_uniform_buffer_standard_layout for Vulkan 1.2
    • +
    • radv: update VK_KHR_timeline_semaphore for Vulkan 1.2
    • +
    • radv: implement Vulkan 1.1 features and properties
    • +
    • radv: implement Vulkan 1.2 features and properties
    • +
    • radv: enable Vulkan 1.2
    • +
    • aco: fix emitting SMEM instructions with no operands on GFX6-GFX7
    • +
    • aco: do not select 96-bit/128-bit variants for ds_read/ds_write on GFX6
    • +
    • aco: do not combine additions of DS instructions on GFX6
    • +
    • aco: implement stream output with vec3 on GFX6
    • +
    • aco: fix emitting slc for MUBUF instructions on GFX6-GFX7
    • +
    • aco: print assembly with CLRXdisasm for GFX6-GFX7 if found on the system
    • +
    • aco: fix constant folding of SMRD instructions on GFX6
    • +
    • aco: do not use the vec3 variant for stores on GFX6
    • +
    • aco: do not use the vec3 variant for loads on GFX6
    • +
    • aco: add new addr64 bit to MUBUF instructions on GFX6-GFX7
    • +
    • aco: implement nir_intrinsic_load_barycentric_at_sample on GFX6
    • +
    • radv: fix double free corruption in radv_alloc_memory()
    • +
    • radv: add explicit external subpass dependencies to meta operations
    • +
    • radv: handle missing implicit subpass dependencies
    • +
    • spirv: add SpvCapabilityFragmentMaskAMD
    • +
    • nir: add two new texture ops for multisample fragment color/mask fetches
    • +
    • spirv: add support for SpvOpFragment{Mask}FetchAMD operations
    • +
    • nir/lower_input_attachments: lower nir_texop_fragment_{mask}_fetch
    • +
    • ac/nir: add support for nir_texop_fragment_{mask}_fetch
    • +
    • aco: add support for nir_texop_fragment_{mask}_fetch
    • +
    • radv: advertise VK_AMD_shader_fragment_mask
    • +
    • aco: fix printing assembly with CLRXdisasm on GFX6
    • +
    • aco: fix wrong IR in nir_intrinsic_load_barycentric_at_sample
    • +
    • aco: implement nir_intrinsic_store_global on GFX6
    • +
    • aco: implement nir_intrinsic_load_global on GFX6
    • +
    • aco: implement nir_intrinsic_global_atomic_* on GFX6
    • +
    • aco: implement 64-bit nir_op_ftrunc on GFX6
    • +
    • aco: implement 64-bit nir_op_fceil on GFX6
    • +
    • aco: implement 64-bit nir_op_fround_even on GFX6
    • +
    • aco: implement 64-bit nir_op_ffloor on GFX6
    • +
    • aco: implement nir_op_f2i64/nir_op_f2u64 on GFX6
    • +
    • ac/llvm: fix missing casts in ac_build_readlane()
    • +
    • aco: combine MRTZ (depth, stencil, sample mask) exports
    • +
    • aco: fix a hardware bug for MRTZ exports on GFX6
    • +
    • aco: fix a hazard with v_interp_* and v_{read,readfirst}lane_* on GFX6
    • +
    • aco: copy the literal offset of SMEM instructions to a temporary
    • +
    • radv: enable ACO support for GFX6
    • +
    • radv: print NIR shaders after lowering FS inputs/outputs
    • +
    • radv: do not allow sparse resources with multi-planar formats
    • +
    • radv: enable VK_AMD_shader_fragment_mask on GFX6-GFX7
    • +
    • compiler: add a new explicit interpolation mode
    • +
    • spirv: add support for SpvDecorationExplicitInterpAMD
    • +
    • compiler: add PERSP to the existing barycentric system values
    • +
    • compiler: add new SYSTEM_VALUE_BARYCENTRIC_*
    • +
    • spirv: add support for SpvBuiltInBaryCoord*
    • +
    • nir: add nir_intrinsic_load_barycentric_model
    • +
    • nir: lower SYSTEM_VALUE_BARYCENTRIC_* to nir_load_barycentric()
    • +
    • nir: add nir_intrinsic_interp_deref_at_vertex
    • +
    • nir: lower interp_deref_at_vertex to load_input_vertex
    • +
    • spirv: implement SPV_AMD_shader_explicit_vertex_parameter
    • +
    • ac/llvm: implement VK_AMD_shader_explicit_vertex_parameter
    • +
    • aco: implement VK_AMD_shader_explicit_vertex_parameter
    • +
    • radv: gather which input PS variables use an explicit interpolation mode
    • +
    • radv: implement VK_AMD_shader_explicit_vertex_parameter
    • +
    • radv: bump conformance version to 1.2.0.0
    • +
    • radv: remove the non conformant VK implementation warning on GFX10
    • +
    • aco: fix VS input loads with MUBUF on GFX6
    • +
    • radv/gfx10: add a separate flag for creating a GDS OA buffer
    • +
    • radv/gfx10: implement NGG GS queries
    • +
    • radv/gfx10: re-enable NGG GS
    • +
    • radv: refactor physical device properties
    • +
    • aco: fix MUBUF VS input loads when expanding vec3 to vec4 on GFX6
    • +
    • aco: do not use ds_{read,write}2 on GFX6
    • +
    • aco: fix waiting for scalar stores before "writing back" data on GFX8-GFX9
    • +
    • aco: fix creating v_madak if v_mad_f32 has two sgpr literals
    • +
    • nir: do not use De Morgan's Law rules for flt and fge
    • +

      +

      Samuel Thibault (3):

      +
    • loader: #define PATH_MAX when undefined (eg. Hurd)
    • +
    • util: Do not fail to build on unknown pthread_setname_np
    • +
    • meson: Do not require libdrm for DRI2 on hurd
    • +

      +

      Satyajit Sahu (1):

      +
    • radeon/vcn: Handle crop parameters for encoder
    • +

      +

      Sonny Jiang (1):

      +
    • radeonsi: use compute shader for clear 12-byte buffer
    • +

      +

      Stephan Gerhold (1):

      +
    • kmsro: Add "mcde" entry point
    • +

      +

      Tapani Pälli (33):

      +
    • nir: fix couple of compile warnings
    • +
    • util/android: fix android build errors
    • +
    • Revert "egl: implement new functions from EGL_EXT_image_flush_external"
    • +
    • Revert "egl: handle EGL_IMAGE_EXTERNAL_FLUSH_EXT"
    • +
    • Revert "st/dri: add support for EGL_EXT_image_flush_external"
    • +
    • Revert "st/dri: assume external consumers of back buffers can write to the buffers"
    • +
    • Revert "dri_interface: add interface for EGL_EXT_image_flush_external"
    • +
    • mesa: allow bit queries for EXT_disjoint_timer_query
    • +
    • Revert "mesa: allow bit queries for EXT_disjoint_timer_query"
    • +
    • mesa: allow bit queries for EXT_disjoint_timer_query
    • +
    • gitlab-ci: update Piglit commit, update skips
    • +
    • mapi: add GetInteger64vEXT with EXT_disjoint_timer_query
    • +
    • glsl: handle max uniform limits with lower_const_arrays_to_uniforms
    • +
    • gitlab-ci: bump piglit checkout commit
    • +
    • glsl: additional interface redeclaration check for SSO programs
    • +
    • intel/compiler: add newline to limit_dispatch_width message
    • +
    • intel/compiler: force simd8 when dual src blending on gen8
    • +
    • dri: add __DRI_IMAGE_FORMAT_SXRGB8
    • +
    • i965: expose MESA_FORMAT_B8G8R8X8_SRGB visual
    • +
    • mesa/st/i965: add a ProgramResourceHash for quicker resource lookup
    • +
    • mesa: create program resource hash in a single place
    • +
    • iris: set depth stall enabled when depth flush enabled on gen12
    • +
    • anv: set depth stall enabled when depth flush enabled on gen12
    • +
    • isl/gen12: add reminder comment about missing WA with 3D surfaces
    • +
    • anv: fix assert in GetImageDrmFormatModifierPropertiesEXT
    • +
    • anv: add assert for isl_mod_info in choose_isl_tiling_flags
    • +
    • anv: initialize clear_color_is_zero_one
    • +
    • egl/android: fix buffer_count for applications setting max count
    • +
    • anv/android: setup gralloc1 usage from gralloc0 usage manually
    • +
    • anv/android: make format_supported_with_usage static
    • +
    • intel/vec4: fix valgrind errors with vf_values array
    • +
    • glsl: fix a memory leak with resource_set
    • +
    • iris: fix aux buf map failure in 32bits app on Android
    • +

      +

      Thomas Hellstrom (4):

      +
    • winsys/svga: Enable transhuge pages for buffer objects
    • +
    • svga: Avoid discard DMA uploads
    • +
    • gallium/util: Increase the debug_flush map depth
    • +
    • svga: Fix banded DMA upload
    • +

      +

      Thong Thai (8):

      +
    • st/va: Convert interlaced NV12 to progressive
    • +
    • util/format: Add the P010 format used for 10-bit videos
    • +
    • gallium: Add PIPE_FORMAT_P010 support
    • +
    • st/va: Add support for P010, used for 10-bit videos
    • +
    • radeon: Use P010 for decoding of 10-bit videos
    • +
    • r600: Remove HEVC related code since HEVC is not supported
    • +
    • mesa: Prevent _MaxLevel from being less than zero
    • +
    • Revert "st/va: Convert interlaced NV12 to progressive"
    • +

      +

      Timothy Arceri (66):

      +
    • glsl: just use NIR to lower outputs when driver can't read outputs
    • +
    • glsl: disable lower_fragdata_array() for NIR drivers
    • +
    • mesa: add ARB_shading_language_include stubs
    • +
    • glsl: add infrastructure for ARB_shading_language_include
    • +
    • mesa: add ARB_shading_language_include infrastructure to gl_shared_state
    • +
    • mesa: add helper to validate tokenise shader include path
    • +
    • mesa: add _mesa_lookup_shader_include() helper
    • +
    • mesa: add copy_string() helper
    • +
    • mesa: add glNamedStringARB() support
    • +
    • mesa: implement glGetNamedStringARB()
    • +
    • mesa: make error checking optional in _mesa_lookup_shader_include()
    • +
    • mesa: implement glIsNamedStringARB()
    • +
    • mesa: implement glGetNamedStringivARB()
    • +
    • mesa: split _mesa_lookup_shader_include() in two
    • +
    • mesa: implement glDeleteNamedStringARB()
    • +
    • glsl: add ARB_shading_language_include support to #line
    • +
    • glsl: pass gl_context to glcpp_parser_create()
    • +
    • glsl: add preprocessor #include support
    • +
    • glsl: error if #include used while extension is disabled
    • +
    • glsl: add can_skip_compile() helper
    • +
    • glsl: delay compilation skip if shader contains an include
    • +
    • mesa: add support cursor support for relative path shader includes
    • +
    • mesa: add shader include lookup support for relative paths
    • +
    • mesa: implement glCompileShaderIncludeARB()
    • +
    • mesa: enable ARB_shading_language_include
    • +
    • gitlab-ci: bump piglit checkout commit
    • +
    • gitlab-ci: update for arb_shading_language_include
    • +
    • compiler: move build definition of pp_standalone_scaffolding.c
    • +
    • radv: add some infrastructure for fresh forks for each secure compile
    • +
    • radv: add a secure_compile_open_fifo_fds() helper
    • +
    • radv: create a fresh fork for each pipeline compile
    • +
    • docs: update source code repository documentation
    • +
    • glsl: move calculate_array_size_and_stride() to link_uniforms.cpp
    • +
    • glsl: don't set uniform block as used when its not
    • +
    • glsl: make use of active_shader_mask when building resource list
    • +
    • glsl/nir: iterate the system values list when adding varyings
    • +
    • docs: remove mailing list as way of submitting patches
    • +
    • glsl: move nir_remap_dual_slot_attributes() call out of glsl_to_nir()
    • +
    • glsl: copy the how_declared field when converting to nir
    • +
    • nir: add some fields to nir_variable_data
    • +
    • glsl: copy the new data fields when converting to nir
    • +
    • glsl: add support for named varyings in nir_build_program_resource_list()
    • +
    • glsl: add subroutine support to nir_build_program_resource_list()
    • +
    • st/glsl_to_nir: call gl_nir_lower_buffers() a little later
    • +
    • st/glsl_to_nir: use nir based program resource list builder
    • +
    • st/glsl_to_nir: fix SSO validation regression
    • +
    • glsl: rename gl_nir_link() to gl_nir_link_spirv()
    • +
    • glsl: add gl_nir_link_check_atomic_counter_resources()
    • +
    • glsl: add new gl_nir_link_glsl() helper
    • +
    • glsl: reorder link_and_validate_uniforms() calls
    • +
    • mesa: add new UseNIRGLSLLinker constant
    • +
    • glsl: use nir linker to link atomics
    • +
    • glsl: add check_image_resources() for the nir linker
    • +
    • glsl: use nir version of check_image_resources() for nir linker
    • +
    • glsl: move check_subroutine_resources() into the shared util code
    • +
    • glsl: call check_subroutine_resources() from the nir linker
    • +
    • glsl: move uniform resource checks into the common linker code
    • +
    • glsl: call uniform resource checks from the nir linker
    • +
    • glsl: move calculate_subroutine_compat() to shared linker code
    • +
    • glsl: call calculate_subroutine_compat() from the nir linker
    • +
    • glsl: fix potential bug in nir uniform linker
    • +
    • glsl: remove bogus assert in nir uniform linking
    • +
    • glsl: fix check for matrices in blocks when using nir uniform linker
    • +
    • glsl: count uniform components and storage better in nir linking
    • +
    • glsl_to_nir: update interface type properly
    • +
    • glsl: fix gl_nir_set_uniform_initializers() for image arrays
    • +

      +

      Timur Kristóf (39):

      +
    • ac: Handle invalid GFX10 format correctly in ac_get_tbuffer_format.
    • +
    • aco: Make sure not to mistakenly propagate 64-bit constants.
    • +
    • aco: Treat all booleans as per-lane.
    • +
    • aco: Optimize out trivial code from uniform bools.
    • +
    • aco: Fix operand of s_bcnt1_i32_b64 in emit_boolean_reduce.
    • +
    • aco: Remove superfluous argument from emit_boolean_logic.
    • +
    • aco: Remove lower_linear_bool_phi, it is not needed anymore.
    • +
    • aco: Optimize load_subgroup_id to one bit field extract instruction.
    • +
    • aco/wave32: Change uniform bool optimization to work with wave32.
    • +
    • aco/wave32: Replace hardcoded numbers in spiller with wave size.
    • +
    • aco/wave32: Introduce emit_mbcnt which takes wave size into account.
    • +
    • aco/wave32: Add wave size specific opcodes to aco_builder.
    • +
    • aco/wave32: Use lane mask regclass for exec/vcc.
    • +
    • aco/wave32: Fix load_local_invocation_index to support wave32.
    • +
    • aco/wave32: Use wave_size for barrier intrinsic.
    • +
    • aco/wave32: Allow setting the subgroup ballot size to 64-bit.
    • +
    • aco/wave32: Fix reductions.
    • +
    • aco: Fix uniform i2i64.
    • +
    • ac/llvm: Fix ac_build_reduce in wave32 mode.
    • +
    • aco/wave32: Set the definitions of v_cmp instructions to the lane mask.
    • +
    • aco: Implement 64-bit constant propagation.
    • +
    • aco: Allow optimizing vote_all and nir_op_iand.
    • +
    • aco: Don't skip combine_instruction when definitions[1] is used.
    • +
    • aco: Optimize out s_and with exec, when used on uniform bitwise values.
    • +
    • aco: Flip s_cbranch / s_cselect to optimize out an s_not if possible.
    • +
    • nouveau/nvc0: add extern keyword to nvc0_miptree_vtbl.
    • +
    • intel/compiler: Fix array bounds warning on GCC 10.
    • +
    • radeon: Move si_get_pic_param to radeon_vce.c
    • +
    • r600: Move get_pic_param to radeon_vce.c
    • +
    • gallium: Fix a couple of multiple definition warnings.
    • +
    • radeon: Fix multiple definition error with radeon_debug
    • +
    • aco: Fix -Wstringop-overflow warnings in aco_span.
    • +
    • aco: Fix maybe-uninitialized warnings.
    • +
    • aco: Fix signedness compare warning.
    • +
    • aco: Make a better guess at which instructions need the VCC hint.
    • +
    • aco: Transform uniform bitwise instructions to 32-bit if possible.
    • +
    • aco/gfx10: Fix VcmpxExecWARHazard mitigation.
    • +
    • aco: Fix the meaning of is_atomic.
    • +
    • aco/optimizer: Don't combine uniform bool s_and to s_andn2.
    • +

      +

      Tomasz Pyra (1):

      +
    • gallium/swr: Fix arb_transform_feedback2
    • +

      +

      Tomeu Vizoso (38):

      +
    • gitlab-ci: Disable lima jobs
    • +
    • gitlab-ci: Run only LAVA jobs in special-named branches
    • +
    • panfrost: Add checksum fields to SFBD descriptor
    • +
    • panfrost: Set 0x10 bit on mali_shader_meta.unknown2_4 on T720
    • +
    • panfrost: Rework format encoding on SFBD
    • +
    • panfrost: Take into account texture layers in SFBD
    • +
    • panfrost: Decode blend shaders for SFBD
    • +
    • panfrost: Generate polygon list manually for SFBD
    • +
    • panfrost: Print the right zero field
    • +
    • panfrost: Pipe the GPU ID into compiler and disassembler
    • +
    • panfrost: Set depth and stencil for SFBD based on the format
    • +
    • panfrost: Multiply offset_units by 2
    • +
    • panfrost: Make sure the shader descriptor is in sync with the GL state
    • +
    • gitlab-ci: Remove limit on kernel logging
    • +
    • panfrost: Just print tiler fields as-is for Tx20
    • +
    • panfrost: Rework buffers in SFBD
    • +
    • gitlab-ci: Fix dir name for VK-GL-CTS sources
    • +
    • panfrost: Don't print the midgard_blend_rt structs on SFBD
    • +
    • panfrost: Add quirks system to cmdstream
    • +
    • panfrost: Simplify shader patching
    • +
    • panfrost: White list the Mali T720
    • +
    • gitlab-ci: Test Panfrost on T720 GPUs
    • +
    • panfrost: Add PAN_MESA_DEBUG=sync
    • +
    • panfrost: Hold a reference to sampler views
    • +
    • pan/midgard: Remove undefined behavior
    • +
    • nir: Don't copy empty array
    • +
    • util: Don't access members of NULL pointers
    • +
    • panfrost: Don't lose bits!
    • +
    • st/mesa: Don't access members of NULL pointers
    • +
    • panfrost: Handle Z24_UNORM_S8_UINT as MALI_Z32_UNORM
    • +
    • panfrost: Increase PIPE_SHADER_CAP_MAX_OUTPUTS to 16
    • +
    • panfrost: Dynamically allocate array of texture pointers
    • +
    • panfrost: Map with size of first layer for 3D textures
    • +
    • panfrost: Store internal format
    • +
    • gitlab-ci: Update kernel for LAVA to 5.5-rc1 plus fixes
    • +
    • gitlab-ci: Switch LAVA jobs to use shared dEQP runner
    • +
    • gitlab-ci: Upgrade kernel for LAVA jobs to v5.5-rc5
    • +
    • gitlab-ci: Consolidate container and build stages for LAVA
    • +

      +

      Urja Rannikko (4):

      +
    • panfrost: free last_read/write tables in mir_create_dependency_graph
    • +
    • panfrost: free allocations in schedule_block
    • +
    • panfrost: add lcra_free() to free lcra state
    • +
    • panfrost: free spill cost table in mir_spill_register
    • +

      +

      Vasily Khoruzhick (31):

      +
    • lima: add debug prints for BO cache
    • +
    • lima: align size before trying to fetch BO from cache
    • +
    • lima: ignore flags while looking for BO in cache
    • +
    • lima: set dithering flag when necessary
    • +
    • lima: add support for gl_PointSize
    • +
    • lima: enable tiling
    • +
    • lima: handle DRM_FORMAT_MOD_INVALID in resource_from_handle()
    • +
    • lima: expose tiled format modifier in query_dmabuf_modifiers()
    • +
    • lima: use single BO for GP outputs
    • +
    • lima: drop suballocator
    • +
    • lima: fix allocation of GP outputs storage for indexed draw
    • +
    • lima: postpone PP stream generation
    • +
    • lima: don't reload and redraw tiles that were not updated
    • +
    • lima: fix PP stream terminator size
    • +
    • lima: use linear layout for shared buffers if modifier is not specified
    • +
    • lima: add debug flag to disable tiling
    • +
    • lima: drop support for R8G8B8 format
    • +
    • lima: fix PLBU_CMD_PRIMITIVE_SETUP command
    • +
    • lima: fix viewport clipping
    • +
    • lima: implement polygon offset
    • +
    • lima: fix PIPE_CAP_* to mark features that aren't supported yet
    • +
    • lima: add new findings to texture descriptor
    • +
    • lima: fix handling of reverse depth range
    • +
    • ci: lava: pass CI_NODE_INDEX and CI_NODE_TOTAL to lava jobs
    • +
    • ci: Re-enable CI for lima on mali450
    • +
    • lima: implement invalidate_resource()
    • +
    • nir: don't emit ishl in _nir_mul_imm() if backend doesn't support bitops
    • +
    • lima: use imul for calculations with intrinsic src
    • +
    • lima: ppir: don't delete root ld_tex nodes without successors in current block
    • +
    • lima: ppir: always create move and update ld_tex successors for all blocks
    • +
    • lima: disable early-z if fragment shader uses discard
    • +

      +

      Vinson Lee (9):

      +
    • swr: Fix build with llvm-10.0.
    • +
    • panfrost: Fix gnu-empty-initializer build errors.
    • +
    • scons: Bump C standard to gnu11 on macOS 10.15.
    • +
    • util/u_thread: Restrict u_thread_get_time_nano on macOS.
    • +
    • swr: Fix build with llvm-10.0.
    • +
    • swr: Fix build with llvm-10.0.
    • +
    • lima: Fix build with GCC 10.
    • +
    • swr: Fix GCC 4.9 checks.
    • +
    • panfrost: Remove unused anonymous enum variables.
    • +

      +

      Wladimir J. van der Laan (2):

      +
    • u_vbuf: add logic to use a limited number of vbufs
    • +
    • u_vbuf: use single vertex buffer if it's not possible to have multiple
    • +

      +

      X512 (1):

      +
    • util/u_thread: Fix build under Haiku
    • +

      +

      Yevhenii Kolesnikov (5):

      +
    • glsl: Enable textureSize for samplerExternalOES
    • +
    • meson: Fix linkage of libgallium_nine with libgalliumvl
    • +
    • meta: Cleanup function for DrawTex
    • +
    • main: allow external textures for BindImageTexture
    • +
    • meta: Add cleanup function for Bitmap
    • +

      +

      Zebediah Figura (1):

      +
    • Revert "draw: revert using correct order for prim decomposition."
    • +

      +

      luc (1):

      +
    • zink: confused compilation macro usage for zink in target helpers.
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.1.html mesa-20.0.8/docs/relnotes/20.0.1.html --- mesa-19.2.8/docs/relnotes/20.0.1.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.1.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,172 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.1 Release Notes / 2020-03-05

    + +

    + Mesa 20.0.1 is a bug fix release which fixes bugs found since the 20.0.0 release. +

    +

    +Mesa 20.0.1 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.1 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  6153ba3f8cb0524bbfc08e4db76b408126b2d1be8f789dffe28d1a0461eedde4  mesa-20.0.1.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • V3D/Broadcom (Raspberry Pi 4) - GLES 3.1 - GL_EXT_texture_norm16 advertised, but not usable
    • +
    • i965 assertion failure in fallback_rgbx_to_rgba
    • +
    • Compute copies do not handle SUBSAMPLED formats
    • +
    + +

    Changes

    + +
      +

      Andreas Baierl (1):

      +
    • gitlab-ci: lima: Add flaky tests to the skips list
    • +

      +

      Andrii Simiklit (1):

      +
    • Revert "glx: convert glx_config_create_list to one big calloc"
    • +

      +

      Arcady Goldmints-Orlov (1):

      +
    • spirv: Remove outdated SPIR-V decoration warnings
    • +

      +

      Bas Nieuwenhuizen (1):

      +
    • radeonsi: Fix compute copies for subsampled formats.
    • +

      +

      Caio Marcelo de Oliveira Filho (1):

      +
    • intel/gen12: Take into account opcode when decoding SWSB
    • +

      +

      Chris Wilson (1):

      +
    • iris: Fix import sync-file into syncobj
    • +

      +

      Danylo Piliaiev (1):

      +
    • i965: Do not generate D16 B5G6R5_UNORM configs on gen < 8
    • +

      +

      Dave Airlie (7):

      +
    • dri: add another get shm variant.
    • +
    • glx/drisw: add getImageShm2 path
    • +
    • glx/drisw: return false if shmid == -1
    • +
    • glx/drisw: fix shm put image fallback
    • +
    • gallivm/tgsi: fix stream id regression
    • +
    • gallivm/nir: fix integer divide SIGFPE
    • +
    • gallivm/nir: handle mod 0 better.
    • +

      +

      Dylan Baker (7):

      +
    • docs: Add release notes for 20.0.0
    • +
    • .pick_status.json: Update to 8291d728dc997e87b4d2e4e451692643a1dba881
    • +
    • .pick_status.json: Update to e4baff90812d799d586296fcad992ddcc553c359
    • +
    • .pick_status.json: Update to 01496e3d1ea0370af03e6645dbd2b864c2ace94c
    • +
    • .pick_status.json: Update to 09323634898ab3efc0150dc7d756bf36b1b89b76
    • +
    • .pick_status.json: Update to 3503cb4c28e01b34f3a25546c058150709c22348
    • +
    • .pick_status.json: Update to 0ac731b1ff96de46998948aa06081efa5140d50e
    • +

      +

      Eric Anholt (3):

      +
    • llvmpipe: Fix real uninitialized use of "atype" for SEMANTIC_FACE
    • +
    • turnip: Fix compiler warning about casting a nondispatchable handle.
    • +
    • aco: Fix signed-vs-unsigned warning.
    • +

      +

      Erik Faye-Lund (1):

      +
    • util: promote u_debug_memory.c to src/util
    • +

      +

      Ian Romanick (2):

      +
    • nir/search: Use larger type to hold linearized index
    • +
    • intel/fs: Correctly handle multiply of fsign with a source modifier
    • +

      +

      James Xiong (1):

      +
    • iris: handle the failure of converting unsupported yuv formats to isl
    • +

      +

      Jason Ekstrand (1):

      +
    • anv: Always enable the data cache
    • +

      +

      Jonathan Marek (1):

      +
    • turnip: fix srgb MRT
    • +

      +

      Jordan Justen (1):

      +
    • intel/compiler: Restrict cs_threads to 64
    • +

      +

      Jose Maria Casanova Crespo (1):

      +
    • v3d: Sync on last CS when non-compute stage uses resource written by CS
    • +

      +

      Kenneth Graunke (2):

      +
    • iris: Make mocs an inline helper in iris_resource.h
    • +
    • iris: Fix BLORP vertex buffers to respect ISL MOCS settings
    • +

      +

      Marek Olšák (5):

      +
    • mesa: fix immediate mode with tessellation and varying patch vertices
    • +
    • util: remove the dependency on kcmp.h
    • +
    • tgsi_to_nir: set num_images and num_samplers with holes correctly
    • +
    • mesa: call FLUSH_VERTICES before updating CoordReplace
    • +
    • mesa: fix incorrect prim.begin/end for glMultiDrawElements
    • +

      +

      Mathias Fröhlich (2):

      +
    • egl: Fix A2RGB10 platform_{device,surfaceless} PBuffer configs.
    • +
    • mesa: Flush vertices before changing the OpenGL state.
    • +

      +

      Michel Dänzer (1):

      +
    • st/vdpau: Only call is_video_format_supported hook if needed
    • +

      +

      Paulo Zanoni (3):

      +
    • intel: fix the gen 11 compute shader scratch IDs
    • +
    • intel: fix the gen 12 compute shader scratch IDs
    • +
    • intel/device: bdw_gt1 actually has 6 eus per subslice
    • +

      +

      Rafael Antognolli (2):

      +
    • iris: Apply the flushes when switching pipelines.
    • +
    • intel/gen12+: Disable mid thread preemption.
    • +

      +

      Rhys Perry (2):

      +
    • aco: keep track of which events are used in a barrier
    • +
    • aco: fix carry-out size for wave32 v_add_co_u32_e64
    • +

      +

      Samuel Pitoiset (3):

      +
    • ac/llvm: fix 64-bit fmed3
    • +
    • ac/llvm: fix 16-bit fmed3 on GFX8 and older gens
    • +
    • ac/llvm: flush denorms for nir_op_fmed3 on GFX8 and older gens
    • +

      +

      Tapani Pälli (4):

      +
    • mesa: introduce boolean toggle for EXT_texture_norm16
    • +
    • i965: toggle on EXT_texture_norm16
    • +
    • mesa/st: toggle EXT_texture_norm16 based on format support
    • +
    • mesa/st: fix formats required for EXT_texture_norm16
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.2.html mesa-20.0.8/docs/relnotes/20.0.2.html --- mesa-19.2.8/docs/relnotes/20.0.2.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.2.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,159 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.2 Release Notes / 2020-03-18

    + +

    + Mesa 20.0.2 is a bug fix release which fixes bugs found since the 20.0.1 release. +

    +

    +Mesa 20.0.2 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.2 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  aa54f1cb669550606aab8ceb475105d15aeb814fca5a778ce70d0fd10e98e86f  mesa-20.0.2.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • RPCS3 / Persona 5 - Performance regression [RADV / Navi]
    • +
    • [CTS] dEQP-VK.descriptor_indexing.* fails on RADV/LLVM
    • +
    • [RadeonSI][gfx10/navi] Kerbal Space Program crash: si_draw_vbo: Assertion `0' failed
    • +
    • src/compiler/glsl/glcpp/glcpp-parse.y:1297: _token_print: Assertion `!"Error: Don't know how to print token."' failed.
    • +
    • Budget Cuts hits VK_AMD_shader_fragment_mask assert
    • +
    + +

    Changes

    + +
      +

      Andreas Baierl (1):

      +
    • gitlab-ci: Add add a set of lima flakes
    • +

      +

      Bas Nieuwenhuizen (2):

      +
    • amd/llvm: Fix divergent descriptor indexing. (v3)
    • +
    • amd/llvm: Fix divergent descriptor regressions with radeonsi.
    • +

      +

      Danylo Piliaiev (2):

      +
    • glsl: do not crash if string literal is used outside of #include/#line
    • +
    • st/mesa: Fix signed integer overflow when using util_throttle_memory_usage
    • +

      +

      Dave Airlie (1):

      +
    • gallium: fix build with latest meson and gcc10
    • +

      +

      Dylan Baker (8):

      +
    • docs: Add sha256sums for 20.0.1
    • +
    • .pick_status.json: Update to 07f1ef5656e0721282d01a8421eaca056348137d
    • +
    • .pick_status.json: Update to 70341d7746c177a4cd7377ef633e9f85afd11d54
    • +
    • .pick_status.json: Update to 625d8705f02e211e2733c3fe12845505725c37d4
    • +
    • .pick_status.json: Mark b83c9aca4a5fd02d920c90c1799137fed52dc1d9 as backported
    • +
    • .pick_status.json: Update to ee9e0d1ecae307fa48200d2604d3114070253299
    • +
    • .pick_status.json: Update to 3dd0d12aa5fefa94123269a541c94cdf57599e34
    • +
    • .pick_status.json: Update to 94e37859a96cc56cf0c5418a5af00a3e9f5a1bf5
    • +

      +

      Eric Anholt (1):

      +
    • glsl/tests: Fix waiting for disk_cache_put() to finish.
    • +

      +

      Eric Engestrom (7):

      +
    • bin/gen_release_notes.py: fix commit list command
    • +
    • .pick_status.json: Update to 24db276d11976905b2e8a44965c684bb48c3d49f
    • +
    • gen_release_notes: fix vulkan version reported
    • +
    • docs/relnotes/20.0: fix vulkan version reported
    • +
    • .pick_status.json: Update to ba03e308b66b0b88f60b99d9d47851a5e1522e6e
    • +
    • vulkan/wsi: fix cleanup when dup() fails
    • +
    • gen_release_notes: fix version in "you should wait" message
    • +

      +

      Francisco Jerez (1):

      +
    • intel/fs: Fix workaround for VxH indirect addressing bug under control flow.
    • +

      +

      Jason Ekstrand (9):

      +
    • isl: Set 3DSTATE_DEPTH_BUFFER::Depth correctly for 3D surfaces
    • +
    • iris: Don't skip fast depth clears if the color changed
    • +
    • anv: Parse VkPhysicalDeviceFeatures2 in CreateDevice
    • +
    • vulkan/wsi: Don't leak the FD when GetImageDrmFormatModifierProperties fails
    • +
    • vulkan/wsi: Return an error if dup() fails
    • +
    • anv: Use the PIPE_CONTROL instead of bits for the CS stall W/A
    • +
    • anv: Use a proper end-of-pipe sync instead of just CS stall
    • +
    • anv: Do end-of-pipe sync around MCS/CCS ops instead of CS stall
    • +
    • anv: Do an end-of-pipe sync before updating AUX table entries
    • +

      +

      José Fonseca (1):

      +
    • meson: Avoid duplicate symbols.
    • +

      +

      Kristian Høgsberg (2):

      +
    • Revert "glsl: Use a simpler formula for tanh"
    • +
    • Revert "spirv: Use a simpler and more correct implementaiton of tanh()"
    • +

      +

      Marek Olšák (4):

      +
    • Revert "mesa: check for z=0 in _mesa_Vertex3dv()"
    • +
    • radeonsi: add a bug workaround for NGG - LATE_ALLOC_GS
    • +
    • ac: add a bug workaround for the 100% NGG culling case
    • +
    • gallium/cso_context: remove cso_delete_xxx_shader helpers to fix the live cache
    • +

      +

      Martin Fuzzey (3):

      +
    • freedreno: android: fix build failure on android due to python version
    • +
    • freedreno: android: add a6xx-pack.xml.h generation to android build
    • +
    • freedreno: android: fix build of perfcounters.
    • +

      +

      Michel Dänzer (1):

      +
    • llvmpipe: Use uintptr_t for pointer values
    • +

      +

      Rafael Antognolli (3):

      +
    • anv: Wait for the GPU to be idle before invalidating the aux table.
    • +
    • iris: Split aux map initialization from invalidation.
    • +
    • iris: Wait for the GPU to be idle before invalidating the aux table.
    • +

      +

      Rob Clark (1):

      +
    • freedreno: fix FD_MESA_DEBUG=inorder
    • +

      +

      Samuel Pitoiset (5):

      +
    • aco: fix image load/store with lod and 1D images
    • +
    • nir/lower_input_attachments: remove bogus assert in try_lower_input_texop()
    • +
    • ac/llvm: add missing optimization barrier for 64-bit readlanes
    • +
    • radv: only inject implicit subpass dependencies if necessary
    • +
    • radv: fix random depth range unrestricted failures due to a cache issue
    • +

      +

      Timur Kristóf (2):

      +
    • nir: Add ability to lower non-const quad broadcasts to const ones.
    • +
    • radv: Enable lowering dynamic quad broadcasts.
    • +

      +

      Vinson Lee (1):

      +
    • st/nine: Fix incompatible-pointer-types-discards-qualifiers errors.
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.3.html mesa-20.0.8/docs/relnotes/20.0.3.html --- mesa-19.2.8/docs/relnotes/20.0.3.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.3.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,177 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.3 Release Notes / 2020-04-01

    + +

    + Mesa 20.0.3 is a bug fix release which fixes bugs found since the 20.0.2 release. +

    +

    +Mesa 20.0.3 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.3 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +d63aaf2c27143eded2f4f376f18f7a766ad997f8eeb96c357e8ade84e8a237af  mesa-20.0.3.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • RADV: flickering textures in Q.U.B.E. 2 through Proton
    • +
    • src/compiler/glsl/ast_to_hir.cpp:2134: ir_rvalue* ast_expression::do_hir(exec_list*, _mesa_glsl_parse_state*, bool): Assertion `result != NULL || !needs_rvalue' failed.
    • +
    • [ACO] Reliable crash with RPCS3 that is not present with LLVM
    • +
    • [RADV] vkCmdBindTransformFeedbackBuffersEXT pSizes optional parameter not handled
    • +
    • soft-fp64: __fsat64 incorrectly returns NaN for a NaN input. It should return zero.
    • +
    • Hang when using glWaitSync with multithreaded shared GL contexts
    • +
    + +

    Changes

    + +
      +

      Caio Marcelo de Oliveira Filho (1):

      +
    • mesa/main: Fix overflow in validation of DispatchComputeGroupSizeARB
    • +

      +

      Dylan Baker (6):

      +
    • docs/relnotes: Add sha256 sums for 20.0.2
    • +
    • .pick_status.json: Update to cf62c2b2ac69637785f55b790fdd601c17e7e9d5
    • +
    • .pick_status.json: Mark 672d10619980687acec329742f055f7f3796c1b8 as backported
    • +
    • .pick_status.json: Mark c923de68dd0ab10a5a5fb3196f539707d046d897 as backported
    • +
    • .pick_status.json: Mark 56de6f698e3f164d97f132203e8159ef0b8e9bb8 as denominated
    • +
    • .pick_status.json: Update to aee004a7c8900938d1c17f0ac299d40001b383b0
    • +

      +

      Eric Engestrom (6):

      +
    • .pick_status.json: Update to 3252041a7872c49e53bb02ffe8b079b5fc43f15e
    • +
    • .pick_status.json: Update to 12711939320e4fcd3a0d86af22da1042ad92035f
    • +
    • .pick_status.json: Update to 05069e1f0794aadd40ce9269f858e50c64254388
    • +
    • .pick_status.json: Update to 8970b7839aebefa7207c9535ac34ab4e8cc0ae25
    • +
    • .pick_status.json: Update to 5f4d9b419a1c931ad468b8b22b8a95b1216891e4
    • +
    • .pick_status.json: Update to 70ac7f5b0c46370075a35067c9f7dfe78e84b16d
    • +

      +

      Erik Faye-Lund (3):

      +
    • rbug: do not return void-value
    • +
    • pipebuffer: clean up cast-warnings
    • +
    • vtn/opencl: fully enable OpenCLstd_Clz
    • +

      +

      Francisco Jerez (1):

      +
    • intel/fs/gen12: Fix interaction of SWSB dependency combination with EU fusion workaround.
    • +

      +

      Greg V (1):

      +
    • amd/addrlib: fix build on non-x86 platforms
    • +

      +

      Ian Romanick (2):

      +
    • soft-fp64/fsat: Correctly handle NaN
    • +
    • soft-fp64: Split a block that was missing a cast on a comparison
    • +

      +

      Jason Ekstrand (5):

      +
    • intel/blorp: Add support for swizzling fast-clear colors
    • +
    • anv: Swizzle fast-clear values
    • +
    • nir/lower_int64: Lower 8 and 16-bit downcasts with nir_lower_mov64
    • +
    • anv: Account for the header in anv_state_stream_alloc
    • +
    • spirv: Implement OpCopyObject and OpCopyLogical as blind copies
    • +

      +

      John Stultz (2):

      +
    • gallium: hud_context: Fix scalar initializer warning.
    • +
    • vc4_bufmgr: Remove duplicative VC definition
    • +

      +

      Jordan Justen (2):

      +
    • intel: Update TGL PCI strings
    • +
    • intel: Add TGL PCI ID
    • +

      +

      Lionel Landwerlin (5):

      +
    • isl: implement linear tiling row pitch requirement for display
    • +
    • isl: properly filter supported display modifiers on Gen9+
    • +
    • isl: only apply main surface ccs pitch constraint with CCS
    • +
    • isl: drop min row pitch alignment when set by the driver
    • +
    • intel: add new TGL pci ids
    • +

      +

      Marek Olšák (3):

      +
    • nir: fix clip/cull_distance_array_size in nir_lower_clip_cull_distance_arrays
    • +
    • ac: fix fast division
    • +
    • st/mesa: fix use of uninitialized memory due to st_nir_lower_builtin
    • +

      +

      Marek Vasut (1):

      +
    • etnaviv: Emit PE.ALPHA_COLOR_EXT* on GPUs with half-float support
    • +

      +

      Neil Armstrong (1):

      +
    • Revert "ci: Remove T820 from CI temporarily"
    • +

      +

      Pierre-Eric Pelloux-Prayer (1):

      +
    • st/mesa: disallow deferred flush if there are multiple contexts
    • +

      +

      Rhys Perry (11):

      +
    • nir/gather_info: handle emit_vertex_with_counter
    • +
    • aco: set has_divergent_branch for discards in loops
    • +
    • aco: handle missing second predecessors at merge block phis
    • +
    • aco: skip NIR in unreachable merge blocks
    • +
    • aco: improve check for unreachable loop continue blocks
    • +
    • aco: emit IR in IF's merge block instead if the other side ends in a jump
    • +
    • aco: fix boolean undef regclass
    • +
    • nir/gather_info: fix per-vertex handling in try_mask_partial_io
    • +
    • aco: implement 64-bit VGPR constant copies in handle_operands()
    • +
    • glsl: fix race in instance getters
    • +
    • util/u_queue: fix race in total_jobs_size access
    • +

      +

      Rob Clark (2):

      +
    • freedreno/ir3/ra: fix array liveranges
    • +
    • util: fix u_fifo_pop()
    • +

      +

      Samuel Pitoiset (7):

      +
    • radv/gfx10: fix required subgroup size with VK_EXT_subgroup_size_control
    • +
    • radv/gfx10: fix required ballot size with VK_EXT_subgroup_size_control
    • +
    • radv: fix optional pSizes parameter when binding streamout buffers
    • +
    • radv: enable VK_KHR_8bit_storage on GFX6-GFX7
    • +
    • ac/nir: use llvm.amdgcn.rcp for nir_op_frcp
    • +
    • ac/nir: use llvm.amdgcn.rsq for nir_op_frsq
    • +
    • ac/nir: use llvm.amdgcn.rcp in ac_build_fdiv()
    • +

      +

      Tapani Pälli (1):

      +
    • glsl: set error_emitted true if type not ok for assignment
    • +

      +

      Thomas Hellstrom (1):

      +
    • svga, winsys/svga: Fix persistent memory discard maps
    • +

      +

      Timothy Arceri (3):

      +
    • glsl: fix varying packing for 64bit integers
    • +
    • nir: fix packing of TCS varyings not read by the TES
    • +
    • nir: fix crash in varying packing on interface mismatch
    • +

      +

      Timur Kristóf (1):

      +
    • radv/llvm: fix subgroup shuffle for chips without bpermute
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.4.html mesa-20.0.8/docs/relnotes/20.0.4.html --- mesa-19.2.8/docs/relnotes/20.0.4.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.4.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,68 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.4 Release Notes / 2020-04-03

    + +

    + Mesa 20.0.4 is an emergency release which reverts a serious SPIR-V regression in the 20.0.3 release. +

    +

    +Mesa 20.0.4 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.4 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +c4ed491517a94118a7a611810eeb92645d42ffd82280dcd51be8cc2ba1aabba5  mesa-20.0.4.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    + +

    Changes

    + +
      +

      Eric Engestrom (2):

      +
    • docs/relnotes: add sha256sum for 20.0.3
    • +
    • .pick_status.json: Update to c71c1f44b055c680f073a2608a3bf560b55f8974
    • +

      +

      Jason Ekstrand (1):

      +
    • Revert "spirv: Implement OpCopyObject and OpCopyLogical as blind copies"
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.5.html mesa-20.0.8/docs/relnotes/20.0.5.html --- mesa-19.2.8/docs/relnotes/20.0.5.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.5.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,213 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.5 Release Notes / 2020-04-22

    + +

    + Mesa 20.0.5 is a bug fix release which fixes bugs found since the 20.0.4 release. +

    +

    +Mesa 20.0.5 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.5 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  2c56a82a28cc924e40ea49752abdf1d701c9952481f53cbc7a080271597f572e  mesa-20.0.5.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • nir: nir_lower_returns can't handle nested loops
    • +
    • Graphic artifacts with Mesa 20.0.4 on intel HD 510 GPU
    • +
    • Mesa 20 regression makes Lightsprint demos crash
    • +
    • Build Fails with Clang Shared Library
    • +
    • dri_common.h:58:8: error: unknown type name '__GLXDRIdrawable'
    • +
    • Graphical glitches on Intel Graphics when Xorg started on Iris driver
    • +
    • SIGSEGV src/compiler/glsl/ast_function.cpp:53
    • +
    • manywin aborts with "i965: Failed to submit batchbuffer: Invalid argument"
    • +
    • manywin aborts with "i965: Failed to submit batchbuffer: Invalid argument"
    • +
    • manywin aborts with "i965: Failed to submit batchbuffer: Invalid argument"
    • +
    • manywin aborts with "i965: Failed to submit batchbuffer: Invalid argument"
    • +
    • v3d: transform feedback issue
    • +
    • radv: dEQP-VK.binding_model.descriptorset_random.sets4.noarray.ubolimitlow.sbolimitlow.imglimitlow.noiub.comp.noia.0 segfault
    • +
    • radv: RAVEN fails dEQP-VK.pipeline.timestamp.misc_tests.reset_query_before_copy
    • +
    • https://gitlab.freedesktop.org/mesa/mesa/-/issues/2727
    • +
    • enable storageBuffer16BitAccess feature in radv for SI and CIK
    • +
    • Weston drm-backend.so seems to fail with Mesa master and LIBGL_ALWAYS_SOFTWARE=1
    • +
    • vaapi bob deinterlacer produces wrong output height on AMD
    • +
    + +

    Changes

    + +
      +

      Arcady Goldmints-Orlov (1):

      +
    • nir: Lower returns correctly inside nested loops
    • +

      +

      Bas Nieuwenhuizen (3):

      +
    • radv: Store 64-bit availability bools if requested.
    • +
    • radv: Consider maximum sample distances for entire grid.
    • +
    • radv: Use correct buffer count with variable descriptor set sizes.
    • +

      +

      D Scott Phillips (1):

      +
    • util/sparse_array: don't stomp head's counter on pop operations
    • +

      +

      Daniel Stone (1):

      +
    • EGL: Add eglSetDamageRegionKHR to GLVND dispatch list
    • +

      +

      Danylo Piliaiev (1):

      +
    • st/mesa: Update shader info of ffvp/ARB_vp after translation to NIR
    • +

      +

      Dave Airlie (2):

      +
    • draw: free the NIR IR.
    • +
    • llvmpipe/nir: free the nir shader
    • +

      +

      Dylan Baker (6):

      +
    • .pick_status.json: Update to 089e1fb287eb9b70c191091128ed5ba7edd2960a
    • +
    • .pick_status.json: Update to 65e2eaa4d3a7095ac438fafb09d1e36a4210966e
    • +
    • .pick_status.json: Update to 28d36d26c2212276e1238fad8f0b12caab97fee8
    • +
    • .pick_status.json: Update to acf7e73be54c7f1cc52fcc9be38a9df26849200e
    • +
    • .pick_status.json: Update to 13ce637f1b28381e72470763ff5e39dd3c562476
    • +
    • .pick_status.json: Update to c3c1f4d6bcc210408f8b180727d269838b38193b
    • +

      +

      Emil Velikov (4):

      +
    • glx: set the loader_logger early and for everyone
    • +
    • egl/drm: reinstate (kms_)swrast support
    • +
    • Revert "egl/dri2: Don't dlclose() the driver on dri2_load_driver_common failure"
    • +
    • glx: omit loader_loader() for macOS
    • +

      +

      Eric Anholt (1):

      +
    • ci: Remove LLVM from ARM test drivers.
    • +

      +

      Eric Engestrom (1):

      +
    • docs/relnotes: add sha256sum for 20.0.4
    • +

      +

      Hyunjun Ko (1):

      +
    • nir: fix wrong assignment to buffer in xfb_varyings_info
    • +

      +

      Ilia Mirkin (1):

      +
    • nv50: don't try to upload MSAA settings for BUFFER textures
    • +

      +

      Jason Ekstrand (5):

      +
    • anv/image: Use align_u64 for image offsets
    • +
    • nir/load_store_vectorize: Fix shared atomic info
    • +
    • spirv: Handle OOB vector extract operations
    • +
    • intel: Add _const versions of prog_data cast helpers
    • +
    • anv: Report correct SLM size
    • +

      +

      Jose Maria Casanova Crespo (1):

      +
    • v3d: Primitive Counts Feedback needs an extra 32-bit padding.
    • +

      +

      Juan A. Suarez Romero (2):

      +
    • intel/compiler: store the FS inputs in WM prog data
    • +
    • anv/pipeline: allow more than 16 FS inputs
    • +

      +

      Karol Herbst (2):

      +
    • clover: fix build with single library clang build
    • +
    • Revert "nvc0: fix line width on GM20x+"
    • +

      +

      Lionel Landwerlin (7):

      +
    • iris: properly free resources on BO allocation failure
    • +
    • iris: share buffer managers accross screens
    • +
    • iris: make resources take a ref on the screen object
    • +
    • i965: store DRM fd on intel_screen
    • +
    • i965: share buffer managers across screens
    • +
    • iris: drop cache coherent cpu mapping for external BO
    • +
    • util/sparse_free_list: manipulate node pointers using atomic primitives
    • +

      +

      Marek Olšák (1):

      +
    • st/mesa: fix a crash due to passing a draw vertex shader into the driver
    • +

      +

      Mathias Fröhlich (1):

      +
    • i965: Move down genX_upload_sbe in profiles.
    • +

      +

      Matt Turner (1):

      +
    • meson: Specify the maximum required libdrm in dri.pc
    • +

      +

      Neil Armstrong (3):

      +
    • gitlab-ci/lava: fix handling of lava tags
    • +
    • gitlab-ci: add FILES_HOST_URL and move FILES_HOST_NAME into jobs
    • +
    • gitlab-ci: re-enable mali400/450 and t820 jobs
    • +

      +

      Rhys Perry (1):

      +
    • aco: fix 1D textureGrad() on GFX9
    • +

      +

      Rob Clark (1):

      +
    • nir: fix definition of imadsh_mix16 for vectors
    • +

      +

      Rohan Garg (1):

      +
    • ci: Split out radv build-testing on arm64
    • +

      +

      Samuel Pitoiset (9):

      +
    • ac/nir: split 8-bit load/store to global memory on GFX6
    • +
    • ac/nir: split 8-bit SSBO stores on GFX6
    • +
    • radv/llvm: enable 8-bit storage features on GFX6-GFX7
    • +
    • ac/nir: split 16-bit load/store to global memory on GFX6
    • +
    • ac/nir: split 16-bit SSBO stores on GFX6
    • +
    • radv/llvm: enable 16-bit storage features on GFX6-GFX7
    • +
    • radv: do not abort with unknown/unimplemented descriptor types
    • +
    • radv/llvm: fix exporting the viewport index if the fragment shader needs it
    • +
    • aco: fix exporting the viewport index if the fragment shader needs it
    • +

      +

      Tapani Pälli (4):

      +
    • mesa/st: unbind shader state before deleting it
    • +
    • mesa/st: release variants for active programs before unref
    • +
    • glsl: stop processing function parameters if error happened
    • +
    • mesa/st: initialize all winsys_handle fields for memory objects
    • +

      +

      Thong Thai (1):

      +
    • gallium/auxiliary/vl: fix bob compute shaders for deint yuv
    • +

      +

      Timothy Arceri (1):

      +
    • radeonsi: don't lower constant arrays to uniforms in GLSL IR
    • +

      +

      Tobias Jakobi (1):

      +
    • meson: Link Gallium Nine with ld_args_build_id
    • +

      +

      Tomeu Vizoso (2):

      +
    • gitlab-ci: Place files from the Mesa repo into the build tarball
    • +
    • gitlab-ci: Serve files for LAVA via separate service
    • +

      +

      Vinson Lee (2):

      +
    • swr/rasterizer: Use private functions for min/max to avoid namespace issues.
    • +
    • swr: Remove Byte Order Mark.
    • +

      +

      pal1000 (1):

      +
    • scons/windows: Support build with LLVM 10.
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.6.html mesa-20.0.8/docs/relnotes/20.0.6.html --- mesa-19.2.8/docs/relnotes/20.0.6.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.6.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,130 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.6 Release Notes / 2020-04-29

    + +

    + Mesa 20.0.6 is a bug fix release which fixes bugs found since the 20.0.5 release. +

    +

    +Mesa 20.0.6 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.6 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  30b5d8e9201a01a0e88e18bb79850e67b1d28443b34c4c5cacad4bd10f668b96  mesa-20.0.6.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • dEQP-VK.subgroups.size_control.compute.* crashes on HSW and TGL
    • +
    • piglit spec.!opengl 1.0.gl-1.0-fpexceptions crash on Iris
    • +
    • SPIR-V: OpConvertUToPtr from spec constant fails to compile
    • +
    • radv: Reading ViewportIndex in fragment shader returns garbage
    • +
    • radeonsi: GL_LINES rendering is affected by GL_POINT_SPRITE
    • +
    • [ANV] gfxbench Aztec Ruins misrenders on gen11+
    • +
    • glxinfo cmd crashed
    • +
    + +

    Changes

    + +
      +

      Abhishek Kumar (1):

      +
    • anv/android: fix assert in anv_import_ahw_memory
    • +

      +

      Bas Nieuwenhuizen (1):

      +
    • radv: Use actual memory type count for setting app-visible bitset.
    • +

      +

      Danylo Piliaiev (3):

      +
    • st/mesa: Re-assign vs in locations after updating nir info for ffvp/ARB_vp
    • +
    • spirv: Expand workaround for OpControlBarrier on old GLSLang
    • +
    • st/mesa: Treat vertex inputs absent in inputMapping as zero in mesa_to_tgsi
    • +

      +

      Dylan Baker (9):

      +
    • docs: Add sha256 sums for 20.0.5
    • +
    • .pick_status.json: Update to c552b5fd1d106adc04f62fcbe71d650a9a17f7e0
    • +
    • meson: update llvm dependency logic for meson 0.54.0
    • +
    • .pick_status.json: Mark 0123b8f63415d3d320929e6112da2be2d837b262 as denominated
    • +
    • .pick_status.json: Update to 51c1c4d95a05b6eb6fce74e8d624615e4a1b38ab
    • +
    • .pick_status.json: Update to 51c1c4d95a05b6eb6fce74e8d624615e4a1b38ab
    • +
    • .pick_status.json: Update to efdb7fa9a83b0a216b1837a5912b71669bf3f984
    • +
    • .pick_status.json: Update to 42b1696ef627a5bfee29911a780fa0a4dbf04610
    • +
    • .pick_status.json: Update to 6b551d9f360e45ba4e74867dbe79ae212e4766c5
    • +

      +

      Eric Anholt (1):

      +
    • freedreno: Fix calculation of the const buffer cmdstream size.
    • +

      +

      Erik Faye-Lund (2):

      +
    • mesa/gallium: do not use enum for bit-allocated member
    • +
    • meson: correct windows-version define
    • +

      +

      Jason Ekstrand (12):

      +
    • anv: Move vb_emit setup closer to where it's used in flush_state
    • +
    • anv: Apply any needed PIPE_CONTROLs before emitting state
    • +
    • spirv: Allow constants and NULLs in SpvOpConvertUToPtr
    • +
    • anv: Properly handle all sizes of specialization constants
    • +
    • radv: Properly handle all sizes of specialization constants
    • +
    • turnip: Properly handle all sizes of specialization constants
    • +
    • nir/opt_deref: Remove certain sampler type casts
    • +
    • spirv: Fix passing combined image/samplers through function calls
    • +
    • anv: Drop an assert
    • +
    • nir/lower_subgroups: Mask off unused bits in ballot ops
    • +
    • intel/devinfo: Compute the correct L3$ size for Gen12
    • +
    • anv: Expose CS workgroup sizes based on a maximum of 64 threads
    • +

      +

      Joshua Ashton (1):

      +
    • radv: Use TRUNC_COORD on samplers
    • +

      +

      Lionel Landwerlin (5):

      +
    • iris: fail screen creation when kernel support is not there
    • +
    • intel/perf: move register definition to special file
    • +
    • intel/perf: break GL query stuff away
    • +
    • intel/perf: move mdapi query definitions to their own file
    • +
    • intel/perf: Enable MDAPI queries for Gen12
    • +

      +

      Pierre-Eric Pelloux-Prayer (1):

      +
    • radeonsi: skip vs output optimizations for some outputs
    • +

      +

      Quentin Glidic (1):

      +
    • meson: Use dependency.partial_dependency()
    • +

      +

      Samuel Pitoiset (1):

      +
    • radv: make sure to export the viewport index if FS needs it
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.7.html mesa-20.0.8/docs/relnotes/20.0.7.html --- mesa-19.2.8/docs/relnotes/20.0.7.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.7.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,160 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.7 Release Notes / 2020-05-14

    + +

    + Mesa 20.0.7 is a bug fix release which fixes bugs found since the 20.0.6 release. +

    +

    +Mesa 20.0.7 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.7 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +  fe6e258fe772c3cd2ac01741bf7408058c3ac02d66acff9a6e669bd72e3ea178  mesa-20.0.7.tar.xz
    +
    + + +

    New features

    + +
      +
    + +

    Bug fixes

    + +
      +
    • radv regression on android
    • +
    • heavy glitches on amd ryzen 5 since version 20.x
    • +
    • [bisected] [iris] mpv under wayland: failed to import supplied dmabufs: Unsupported buffer format 808669784
    • +
    • iris: Crash when trying to capture window in OBS Studio
    • +
    • mesa 20.0.5 causing kitty to crash
    • +
    • radeonsi: "Torchlight II" trace showing regression on mesa-20.0.6 [bisected]
    • +
    • [RADV/LLVM/ACO/Regression] After mesa commit a3dc7fffbb7be0f1b2ac478b16d3acc5662dff66 all games stucks at start
    • +
    • intel/compiler: Register coalesce doesn't move conditional modifiers
    • +
    + +

    Changes

    + +
      +

      Axel Davy (1):

      +
    • gallium/util: Fix leak in the live shader cache
    • +

      +

      Bas Nieuwenhuizen (2):

      +
    • radv: Extend tiling flags to 64-bit.
    • +
    • winsys/amdgpu: Retrieve WC flags from imported buffers.
    • +

      +

      Blaž Tomažič (1):

      +
    • radeonsi: Fix omitted flush when moving suballocated texture
    • +

      +

      Christopher James Halse Rogers (1):

      +
    • egl/wayland: Fix zwp_linux_dmabuf usage
    • +

      +

      D Scott Phillips (2):

      +
    • intel/fs: Update location of Render Target Array Index for gen12
    • +
    • anv,iris: Fix input vertex max for tcs on gen12
    • +

      +

      Danylo Piliaiev (1):

      +
    • i965: Fix out-of-bounds access to brw_stage_state::surf_offset
    • +

      +

      Dave Airlie (1):

      +
    • llvmpipo/nir: free compute shader NIR
    • +

      +

      Dylan Baker (16):

      +
    • docs: Add SHA256 sums for 20.0.6
    • +
    • .pick_status.json: Update to 2efa76f795cb2b2bf00b317c580aeeeddd1e9bc2
    • +
    • .pick_status.json: Update to 3fac55ce0d066d767d6c6c8308f79d0c3e566ec0
    • +
    • .pick_status.json: Mark 3fac55ce0d066d767d6c6c8308f79d0c3e566ec0 as denominated
    • +
    • .pick_status.json: Update to b97cc41aa203fd9fb9f5cf5f5aa7fd40f567917d
    • +
    • radeonsi: Retab si_get.c
    • +
    • .pick_status.json: Mark bdd2f284d90b7f07ac5e878490be8d216d0d23c6 as denominated
    • +
    • .pick_status.json: Update to 6292059662dccd3e151c731a3b108fd0b9e4c606
    • +
    • .pick_status.json: Mark d80fb024302aa6058945826a79ba0caf9611fcc1 as backported
    • +
    • .pick_status.json: Mark 9392ddab4399d796fdf37602f586965ec17f2b2a as backported
    • +
    • .pick_status.json: Update to 6d513eb0db25a272da65822f35907456b544f172
    • +
    • radeonsi: retab si_shader_llvm_ps.c
    • +
    • .pick_status.json: Update to d11e4738a86ecac6bb4cfaf5cad5c1d32169b18f
    • +
    • radeonsi: retab
    • +
    • .pick_status.json: Update to 0bea2a13212be10982e14617002a3ff851b84717
    • +
    • .pick_status.json: Update to d76e722ed63607ecead2c66ef9f3a37a12b62bab
    • +

      +

      Ian Romanick (1):

      +
    • nir/algebraic: Optimize ushr of pack_half, not ishr
    • +

      +

      Ivan Molodetskikh (1):

      +
    • egl: allow INVALID format for linux_dmabuf
    • +

      +

      Jason Ekstrand (3):

      +
    • nir/copy_prop_vars: Report progress when deleting self-copies
    • +
    • intel/fs: Don't delete coalesced MOVs if they have a cmod
    • +
    • vulkan: Allow destroying NULL debug report callbacks
    • +

      +

      Jose Maria Casanova Crespo (2):

      +
    • v3d: Fix swizzle in DXT3 and DXT5 formats
    • +
    • v3d: Include supported DXT formats to enable s3tc/dxt extensions
    • +

      +

      Lionel Landwerlin (3):

      +
    • iris: don't assert on unfinished aux import in copy paths
    • +
    • intel/perf: store the probed i915-perf version
    • +
    • anv: don't expose VK_INTEL_performance_query without kernel support
    • +

      +

      Marek Olšák (3):

      +
    • mesa: report GL_INVALID_OPERATION for invalid glTextureBuffer target
    • +
    • radeonsi: unify and align down the max SSBO/TBO/UBO buffer binding size
    • +
    • radeonsi: fix compilation of monolithic PS
    • +

      +

      Neil Armstrong (1):

      +
    • ci: disable t820/mali4xx tests
    • +

      +

      Pierre Moreau (1):

      +
    • clover/nir: Check the result of spirv_to_nir
    • +

      +

      Pierre-Eric Pelloux-Prayer (1):

      +
    • radeonsi: fix export count
    • +

      +

      Qiang Yu (1):

      +
    • panfrost: don't always build bifrost_compiler
    • +

      +

      Rhys Perry (2):

      +
    • nir: add missing group_memory_barrier handling
    • +
    • aco: consider blocks unreachable if they are in the logical cfg
    • +

      +

      Samuel Pitoiset (4):

      +
    • radv: report INITIALIZATION_FAILED when the amdgpu winsys init failed
    • +
    • radv: don't report error with other vendor DRM devices
    • +
    • aco: fix 64-bit trunc with negative exponents on GFX6
    • +
    • radv: limit the Vulkan version to 1.1 for Android
    • +

      +

      Tapani Pälli (1):

      +
    • st/mesa: destroy only own program variants when program is released
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/20.0.8.html mesa-20.0.8/docs/relnotes/20.0.8.html --- mesa-19.2.8/docs/relnotes/20.0.8.html 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/20.0.8.html 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,203 @@ + + + + + +Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 20.0.8 Release Notes / 2020-06-11

    + +

    + Mesa 20.0.8 is a bug fix release which fixes bugs found since the 20.0.7 release. +

    +

    +Mesa 20.0.8 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    +

    +Mesa 20.0.8 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. +

    + +

    SHA256 checksum

    +
    +TBD.
    +
    + + +

    New features

    + +
      +
    • VK_GOOGLE_user_type on ANV and RADV. +
    • +
    + +

    Bug fixes

    + +
      +
    • iris/i965: possible regression in 20.0.5 due to changes in buffer manager sharing across screens (firefox/mozilla#1634213)
    • +
    • [RADV] - Doom Eternal (782330) & Metro Exodus (412020) - Title requires 'RADV_DEBUG=zerovram' to eliminate colorful graphical aberrations.
    • +
    • [RADV] - Doom Eternal (782330) & Metro Exodus (412020) - Title requires 'RADV_DEBUG=zerovram' to eliminate colorful graphical aberrations.
    • +
    • NIR validation failed after glsl to nir, before function inline, wrong {src,dst}->type ?
    • +
    • Mesa 20.0.7 / 20.1.0-rc4 regression, extremally long shader compilation time in NIR
    • +
    • Mesa-git build fails on Fedora Rawhide
    • +
    • iris/i965: possible regression in 20.0.5 due to changes in buffer manager sharing across screens (firefox/mozilla#1634213)
    • +
    • Incorrect _NetBSD__ macro inside execmem.c
    • +
    • Possible invalid sizeof in device.c
    • +
    • mesa trunk master vulkan overlay-layer meson.build warning empty configuration_data() object
    • +
    • 20.0.7: mesa still is not ready to gcc 10 default settings
    • +
    • [Gen9/icl] [Bisected] [Regression] dEQP-GLES3.functional.shaders.loops.short_circuit.do_while_fragment fail
    • +
    • Reproduceable i915 gpu hang Intel Iris Plus Graphics (Ice Lake 8x8 GT2)
    • +
    • Double lock in fbobject.c
    • +
    • [bisected] Steam crashes when newest Iris built with LTO
    • +
    • freedreno: glamor issue with x11 desktops
    • +
    • Deadlock in anv_timelines_wait()
    • +
    + +

    Changes

    + +
      +

      Bas Nieuwenhuizen (3):

      +
    • radv/winsys: Remove extra sizeof multiply.
    • +
    • radv: Handle failing to create .cache dir.
    • +
    • radv: Provide a better error for permission issues with priorities.
    • +

      +

      D Scott Phillips (1):

      +
    • anv/gen11+: Disable object level preemption
    • +

      +

      Danylo Piliaiev (6):

      +
    • anv: Translate relative timeout to absolute when calling anv_timelines_wait
    • +
    • anv: Fix deadlock in anv_timelines_wait
    • +
    • meson: Disable GCC's dead store elimination for memory zeroing custom new
    • +
    • mesa: Fix double-lock of Shared->FrameBuffers and usage of wrong mutex
    • +
    • intel/fs: Work around dual-source blending hangs in combination with SIMD16
    • +
    • glsl: inline functions with unsupported return type before converting to nir
    • +

      +

      Dave Airlie (1):

      +
    • llvmpipe: compute shaders work better with all the threads.
    • +

      +

      Dylan Baker (10):

      +
    • docs/relnotes Add sha256 sums to 20.0.7
    • +
    • .pick_status.json: Update to ceae09da156309327d7ba6f4a59d3a2e9b8837d9
    • +
    • .pick_status.json: Update to a887ad7c84e14fdad7907037a39e9fee9d504bf3
    • +
    • .pick_status.json: Update to 4504d6374dbe2aa40af519c16765457bcbf81b84
    • +
    • .pick_status.json: Update to f0c102c075f8ac76629bb34619187262ccc3e9d8
    • +
    • tests: Make tests aware of meson test wrapper
    • +
    • .pick_status.json: Update to e58112bc08f99861ac634ede8db0f98cd497fc14
    • +
    • radonsi/si_state.c: retab
    • +
    • .pick_status.json: Update to 0795241dde1507e0c6a3f9ef07c281ad4f2acf7b
    • +
    • vulkan-overlay/meson: use install_data instead of configure_file
    • +

      +

      Eric Engestrom (3):

      +
    • tree-wide: fix deprecated GitLab URLs
    • +
    • glapi: remove deprecated .getchildren() that has been replace with an iterator
    • +
    • intel: fix gen_sort_tags.py
    • +

      +

      Erik Faye-Lund (2):

      +
    • zink: use general-layout when blitting to/from same resource
    • +
    • nir: reuse existing psiz-variable
    • +

      +

      Gert Wollny (1):

      +
    • nir: lower_tex: Don't normalize coordinates for TXF with RECT
    • +

      +

      Ian Romanick (1):

      +
    • anv/tests: Don't rely on assert or changing NDEBUG in tests
    • +

      +

      Ilia Mirkin (1):

      +
    • nouveau: allow invalidating coherent/persistent buffer backings
    • +

      +

      Jan Palus (1):

      +
    • targets/opencl: fix build against LLVM>=10 with Polly support
    • +

      +

      Jason Ekstrand (6):

      +
    • anv:gpu_memcpy: Emit 3DSTATE_VF_INDEXING on Gen8+
    • +
    • nir/lower_double_ops: Rework the if (progress) tree
    • +
    • nir/opt_deref: Report progress if we remove a deref
    • +
    • nir/copy_prop_vars: Record progress in more places
    • +
    • intel/vec4: Stomp the return type of RESINFO to UINT32
    • +
    • intel/fs: Fix unused texture coordinate zeroing on Gen4-5
    • +

      +

      Jonathan Marek (1):

      +
    • freedreno/a6xx: use nonbinning VS when GS is used
    • +

      +

      Joshua Ashton (1):

      +
    • radeonsi: Use TRUNC_COORD on samplers
    • +

      +

      Lionel Landwerlin (4):

      +
    • iris: fix BO destruction in error path
    • +
    • i965: don't forget to set screen on duped image
    • +
    • i965: fix export of GEM handles
    • +
    • iris: fix export of GEM handles
    • +

      +

      Lucas Stach (1):

      +
    • etnaviv: retarget transfer to render resource when necessary
    • +

      +

      Marek Olšák (2):

      +
    • radeonsi: don't expose 16xAA on chips with 1 RB due to an occlusion query issue
    • +
    • radeonsi: add a hack to disable TRUNC_COORD for shadow samplers
    • +

      +

      Marek Vasut (1):

      +
    • etnaviv: Disable seamless cube map on GC880
    • +

      +

      Michel Dänzer (1):

      +
    • util: Change os_same_file_description return type from bool to int
    • +

      +

      Nataraj Deshpande (1):

      +
    • dri_util: Update internal_format to GL_RGB8 for MESA_FORMAT_R8G8B8X8_UNORM
    • +

      +

      Neha Bhende (1):

      +
    • util: Initialize pipe_shader_state for passthrough and transform shaders
    • +

      +

      Pierre-Eric Pelloux-Prayer (1):

      +
    • omx: fix build with gcc 10
    • +

      +

      Rhys Perry (4):

      +
    • nir: fix lowering to scratch with boolean access
    • +
    • aco: fix interaction with 3f branch workaround and p_constaddr
    • +
    • aco: check instruction format before waiting for a previous SMEM store
    • +
    • aco: preserve more fields when combining additions into SMEM
    • +

      +

      Rob Clark (1):

      +
    • freedreno: clear last_fence after resource tracking
    • +

      +

      Samuel Pitoiset (4):

      +
    • spirv,radv,anv: implement no-op VK_GOOGLE_user_type
    • +
    • nir/lower_explicit_io: fix NON_UNIFORM access for UBO loads
    • +
    • radv: enable zero VRAM for Doom Eternal
    • +
    • radv: enable zero VRAM for all VKD3D (DX12->VK) games
    • +

      +

      Timothy Arceri (3):

      +
    • glsl: stop cascading errors if process_parameters() fails
    • +
    • radv: fix regression with builtin cache
    • +
    • glsl: fix potential slow compile times for GLSLOptimizeConservatively
    • +

      +

      Vinson Lee (4):

      +
    • zink: Check fopen result.
    • +
    • r300g: Remove extra printf format specifiers.
    • +
    • vdpau: Fix wrong calloc sizeof argument.
    • +
    • mesa: Fix NetBSD compiler macro.
    • +

      +

      Yevhenii Kolesnikov (1):

      +
    • intel/compiler: fix cmod propagation optimisations
    • +

      +

      +
    + +
    + + diff -Nru mesa-19.2.8/docs/relnotes/new_features.txt mesa-20.0.8/docs/relnotes/new_features.txt --- mesa-19.2.8/docs/relnotes/new_features.txt 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/docs/relnotes/new_features.txt 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1 @@ +VK_GOOGLE_user_type on ANV and RADV. diff -Nru mesa-19.2.8/docs/relnotes.html mesa-20.0.8/docs/relnotes.html --- mesa-19.2.8/docs/relnotes.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/relnotes.html 2020-06-12 01:21:16.000000000 +0000 @@ -21,247 +21,252 @@

    +
  • 19.3.3 release notes
  • 19.3.2 release notes
  • 19.2.8 release notes
  • 19.3.1 release notes
  • 19.3.0 release notes
  • 19.2.7 release notes
  • 19.2.6 release notes
  • 19.2.5 release notes
  • 19.2.4 release notes
  • 19.2.3 release notes
  • 19.2.2 release notes
  • 19.1.8 release notes +
  • 19.2.1 release notes
  • 19.2.0 release notes +
  • 19.1.7 release notes +
  • 19.1.6 release notes +
  • 19.1.5 release notes +
  • 19.1.4 release notes +
  • 19.1.3 release notes +
  • 19.1.2 release notes +
  • 19.0.8 release notes +
  • 19.1.1 release notes +
  • 19.0.7 release notes +
  • 19.1.0 release notes +
  • 19.0.6 release notes +
  • 19.0.5 release notes +
  • 19.0.4 release notes +
  • 19.0.3 release notes +
  • 19.0.2 release notes +
  • 18.3.6 release notes +
  • 19.0.1 release notes +
  • 18.3.5 release notes +
  • 19.0.0 release notes +
  • 18.3.4 release notes +
  • 18.3.3 release notes +
  • 18.3.2 release notes +
  • 18.2.8 release notes +
  • 18.2.7 release notes +
  • 18.3.1 release notes +
  • 18.3.0 release notes +
  • 18.2.6 release notes +
  • 18.2.5 release notes +
  • 18.2.4 release notes +
  • 18.2.3 release notes +
  • 18.2.2 release notes +
  • 18.1.9 release notes +
  • 18.2.1 release notes +
  • 18.2.0 release notes +
  • 18.1.8 release notes +
  • 18.1.7 release notes +
  • 18.1.6 release notes +
  • 18.1.5 release notes +
  • 18.1.4 release notes +
  • 18.1.3 release notes +
  • 18.1.2 release notes +
  • 18.0.5 release notes +
  • 18.1.1 release notes +
  • 18.1.0 release notes +
  • 18.0.4 release notes +
  • 18.0.3 release notes +
  • 18.0.2 release notes +
  • 18.0.1 release notes +
  • 17.3.9 release notes +
  • 17.3.8 release notes +
  • 18.0.0 release notes +
  • 17.3.7 release notes +
  • 17.3.6 release notes +
  • 17.3.5 release notes +
  • 17.3.4 release notes +
  • 17.3.3 release notes +
  • 17.3.2 release notes +
  • 17.2.8 release notes +
  • 17.3.1 release notes +
  • 17.2.7 release notes +
  • 17.3.0 release notes +
  • 17.2.6 release notes +
  • 17.2.5 release notes +
  • 17.2.4 release notes +
  • 17.2.3 release notes +
  • 17.2.2 release notes +
  • 17.1.10 release notes +
  • 17.2.1 release notes +
  • 17.1.9 release notes +
  • 17.2.0 release notes +
  • 17.1.8 release notes +
  • 17.1.7 release notes +
  • 17.1.6 release notes +
  • 17.1.5 release notes +
  • 17.1.4 release notes +
  • 17.1.3 release notes +
  • 17.1.2 release notes +
  • 17.0.7 release notes +
  • 17.1.1 release notes +
  • 17.0.6 release notes +
  • 17.1.0 release notes +
  • 17.0.5 release notes +
  • 17.0.4 release notes +
  • 17.0.3 release notes +
  • 17.0.2 release notes +
  • 13.0.6 release notes +
  • 17.0.1 release notes +
  • 13.0.5 release notes +
  • 17.0.0 release notes +
  • 13.0.4 release notes +
  • 12.0.6 release notes +
  • 13.0.3 release notes +
  • 12.0.5 release notes +
  • 13.0.2 release notes +
  • 13.0.1 release notes +
  • 12.0.4 release notes +
  • 13.0.0 release notes +
  • 12.0.3 release notes +
  • 12.0.2 release notes +
  • 12.0.1 release notes +
  • 12.0.0 release notes +
  • 11.2.2 release notes +
  • 11.1.4 release notes +
  • 11.2.1 release notes +
  • 11.1.3 release notes +
  • 11.2.0 release notes +
  • 11.1.2 release notes +
  • 11.0.9 release notes +
  • 11.1.1 release notes +
  • 11.0.8 release notes +
  • 11.1.0 release notes +
  • 11.0.7 release notes +
  • 11.0.6 release notes +
  • 11.0.5 release notes +
  • 11.0.4 release notes +
  • 11.0.3 release notes +
  • 10.6.9 release notes +
  • 11.0.2 release notes +
  • 11.0.1 release notes +
  • 10.6.8 release notes +
  • 11.0.0 release notes +
  • 10.6.7 release notes +
  • 10.6.6 release notes +
  • 10.6.5 release notes +
  • 10.6.4 release notes +
  • 10.6.3 release notes +
  • 10.6.2 release notes +
  • 10.5.9 release notes +
  • 10.6.1 release notes +
  • 10.5.8 release notes +
  • 10.6.0 release notes +
  • 10.5.7 release notes +
  • 10.5.6 release notes +
  • 10.5.5 release notes +
  • 10.5.4 release notes +
  • 10.5.3 release notes +
  • 10.5.2 release notes +
  • 10.4.7 release notes +
  • 10.5.1 release notes +
  • 10.5.0 release notes +
  • 10.4.6 release notes +
  • 10.4.5 release notes +
  • 10.4.4 release notes +
  • 10.4.3 release notes +
  • 10.4.2 release notes +
  • 10.3.7 release notes +
  • 10.4.1 release notes +
  • 10.3.6 release notes +
  • 10.4 release notes +
  • 10.3.5 release notes +
  • 10.3.4 release notes +
  • 10.3.3 release notes +
  • 10.3.2 release notes +
  • 10.3.1 release notes +
  • 10.2.9 release notes +
  • 10.3 release notes +
  • 10.2.8 release notes +
  • 10.2.7 release notes +
  • 10.2.6 release notes +
  • 10.2.5 release notes +
  • 10.2.4 release notes +
  • 10.2.3 release notes +
  • 10.2.2 release notes +
  • 10.2.1 release notes +
  • 10.2 release notes +
  • 10.1.6 release notes +
  • 10.1.5 release notes +
  • 10.1.4 release notes +
  • 10.1.3 release notes +
  • 10.1.2 release notes +
  • 10.1.1 release notes +
  • 10.1 release notes +
  • 10.0.5 release notes +
  • 10.0.4 release notes +
  • 10.0.3 release notes +
  • 10.0.2 release notes +
  • 10.0.1 release notes +
  • 10.0 release notes +
  • 9.2.5 release notes +
  • 9.2.4 release notes +
  • 9.2.3 release notes +
  • 9.2.2 release notes +
  • 9.2.1 release notes +
  • 9.2 release notes +
  • 9.1.7 release notes +
  • 9.1.6 release notes +
  • 9.1.5 release notes +
  • 9.1.4 release notes +
  • 9.1.3 release notes +
  • 9.1.2 release notes +
  • 9.1.1 release notes +
  • 9.1 release notes +
  • 9.0.3 release notes +
  • 9.0.2 release notes +
  • 9.0.1 release notes +
  • 9.0 release notes +
  • 8.0.5 release notes +
  • 8.0.4 release notes +
  • 8.0.3 release notes +
  • 8.0.2 release notes +
  • 8.0.1 release notes +
  • 8.0 release notes +
  • 7.11.2 release notes +
  • 7.11.1 release notes +
  • 7.11 release notes +
  • 7.10.3 release notes +
  • 7.10.2 release notes +
  • 7.10.1 release notes +
  • 7.10 release notes +
  • 7.9.2 release notes +
  • 7.9.1 release notes +
  • 7.9 release notes +
  • 7.8.3 release notes +
  • 7.8.2 release notes +
  • 7.8.1 release notes +
  • 7.8 release notes +
  • 7.7.1 release notes +
  • 7.7 release notes +
  • 7.6.1 release notes +
  • 7.6 release notes +
  • 7.5.2 release notes +
  • 7.5.1 release notes +
  • 7.5 release notes +
  • 7.4.4 release notes +
  • 7.4.3 release notes +
  • 7.4.2 release notes +
  • 7.4.1 release notes +
  • 7.4 release notes +
  • 7.3 release notes +
  • 7.2 release notes +
  • 7.1 release notes +
  • 7.0.4 release notes +
  • 7.0.3 release notes +
  • 7.0.2 release notes +
  • 7.0.1 release notes +
  • 7.0 release notes +
  • 6.5.3 release notes +
  • 6.5.2 release notes +
  • 6.5.1 release notes +
  • 6.5 release notes +
  • 6.4.2 release notes +
  • 6.4.1 release notes +
  • 6.4 release notes +
  • Versions of Mesa prior to 6.4 are summarized in the @@ -270,32 +275,32 @@

    +
  • 6.3.1 release notes +
  • 6.3 release notes +
  • 6.2.1 release notes +
  • 6.2 release notes +
  • 6.1 release notes +
  • 6.0.1 release notes +
  • 6.0 release notes +
  • 5.1 release notes +
  • 5.0.2 release notes +
  • 5.0.1 release notes +
  • 5.0 release notes +
  • 4.1 release notes +
  • 4.0.3 release notes +
  • 4.0.2 release notes +
  • 4.0.1 release notes +
  • 4.0 release notes +
  • 3.5 release notes +
  • 3.4.2 release notes +
  • 3.4.1 release notes +
  • 3.4 release notes +
  • 3.3 release notes +
  • 3.2.1 release notes +
  • 3.2 release notes +
  • 3.1 release notes +
  • - + \ No newline at end of file diff -Nru mesa-19.2.8/docs/repository.html mesa-20.0.8/docs/repository.html --- mesa-19.2.8/docs/repository.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/repository.html 2020-06-12 01:21:16.000000000 +0000 @@ -62,7 +62,7 @@

    Developer git Access

    -If you wish to become a Mesa developer with git-write privilege, please +If you wish to become a Mesa developer with gitlab merge privilege, please follow this procedure:

      @@ -70,41 +70,30 @@ mesa-dev mailing list.
    1. Start contributing to the project by -submitting patches to -the mesa-dev list. Specifically, +submitting patches. Specifically,
        -
      • Use git send-mail to post your patches to mesa-dev. +
      • Use gitlab to create your merge requests.
      • Wait for someone to review the code and give you a Reviewed-by statement.
      • You'll have to rely on another Mesa developer to push your initial patches after they've been reviewed.
    2. After you've demonstrated the ability to write good code and have had -a dozen or so patches accepted you can apply for an account. -
    3. Occasionally, but rarely, someone may be given a git account sooner, but -only if they're being supervised by another Mesa developer at the same -organization and planning to work in a limited area of the code or on a -separate branch. -
    4. To apply for an account, follow -these directions. -It's also appreciated if you briefly describe what you intend to do (work -on a particular driver, add a new extension, etc.) in the bugzilla record. +a dozen or so patches accepted, a maintainer may use their discretion to give +you access to merge your own code.
    -

    -Once your account is established, you can update your push url to use SSH: -

    -git remote set-url --push origin git@gitlab.freedesktop.org:mesa/mesa.git
    -
    +

    Pushing code to your gitlab account via HTTPS

    -You can also use personal access tokens -to push over HTTPS instead (useful for people behind strict proxies). +

    Useful for people behind strict proxies

    + +You can use personal access tokens +to push over HTTPS if ssh will does not suit your needs. In this case, create a token, and put it in the url as shown here:
    -git remote set-url --push origin https://USER:TOKEN@gitlab.freedesktop.org/mesa/mesa.git
    +git remote set-url --push origin https://USER:TOKEN@gitlab.freedesktop.org/your~user~name/mesa.git
     
    -

    Windows Users

    @@ -113,7 +102,7 @@ your local copy of the repository:

    -   git config --global core.autocrlf true
    +git config --global core.autocrlf true
     

    @@ -152,8 +141,8 @@ and git complains that you have not specified a branch, try:

    -    git config branch.master.remote origin
    -    git config branch.master.merge master
    +git config branch.master.remote origin
    +git config branch.master.merge master
     

    Otherwise, you have to say git pull origin master @@ -172,7 +161,7 @@

    If it has been awhile since you've done the initial clone, try

    -    git pull
    +git pull
     

    to get the latest files before you start working. @@ -180,8 +169,8 @@

    Make your changes and use

    -    git add <files to commit>
    -    git commit
    +git add <files to commit>
    +git commit
     

    to get your changes ready to push back into the fd.o repository. @@ -196,8 +185,8 @@

    To avoid this,

    -    git pull --rebase
    -    git push
    +git pull --rebase
    +git push
     

    If you are familiar with CVS or similar system, this is similar to doing a @@ -218,8 +207,8 @@

    If you want the rebase action to be the default action, then

    -    git config branch.master.rebase true
    -    git config --global branch.autosetuprebase=always
    +git config branch.master.rebase true
    +git config --global branch.autosetuprebase=always
     

    See Understanding Git Conceptually for a fairly clear explanation about all of this. diff -Nru mesa-19.2.8/docs/shading.html mesa-20.0.8/docs/shading.html --- mesa-19.2.8/docs/shading.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/shading.html 2020-06-12 01:21:16.000000000 +0000 @@ -162,11 +162,11 @@

  • Use the built-in library functions whenever possible. For example, instead of writing this:
    -        float x = 1.0 / sqrt(y);
    +float x = 1.0 / sqrt(y);
     
    Write this:
    -        float x = inversesqrt(y);
    +float x = inversesqrt(y);
     
  • diff -Nru mesa-19.2.8/docs/sourcedocs.html mesa-20.0.8/docs/sourcedocs.html --- mesa-19.2.8/docs/sourcedocs.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/sourcedocs.html 2020-06-12 01:21:16.000000000 +0000 @@ -31,7 +31,7 @@

    For an example of Doxygen usage in Mesa, see a recent source file -such as bufferobj.c. +such as bufferobj.c.

    diff -Nru mesa-19.2.8/docs/sourcetree.html mesa-20.0.8/docs/sourcetree.html --- mesa-19.2.8/docs/sourcetree.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/sourcetree.html 2020-06-12 01:21:16.000000000 +0000 @@ -49,15 +49,15 @@
  • main - The core Mesa code (mainly state management)
  • drivers - Mesa drivers (not used with Gallium)
      -
    • common - code which may be shared by all drivers +
    • common - code which may be shared by all drivers
    • dri - Direct Rendering Infrastructure drivers
        -
      • common - code shared by all DRI drivers -
      • i915 - driver for Intel i915/i945 -
      • i965 - driver for Intel i965 -
      • radeon - driver for ATI R100 -
      • r200 - driver for ATI R200 -
      • XXX more +
      • common - code shared by all DRI drivers +
      • i915 - driver for Intel i915/i945 +
      • i965 - driver for Intel i965 +
      • radeon - driver for ATI R100 +
      • r200 - driver for ATI R200 +
      • XXX more
    • x11 - Xlib-based software driver
    • osmesa - off-screen software driver diff -Nru mesa-19.2.8/docs/specs/MESA_framebuffer_flip_y.txt mesa-20.0.8/docs/specs/MESA_framebuffer_flip_y.txt --- mesa-19.2.8/docs/specs/MESA_framebuffer_flip_y.txt 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/specs/MESA_framebuffer_flip_y.txt 2020-06-12 01:21:16.000000000 +0000 @@ -23,15 +23,16 @@ Version - Version 2, June 4, 2019 + Version 3, August, 2019 Number - 302 + OpenGL Extension #540 + OpenGL ES Extension #302 Dependencies - OpenGL ES 3.1 or OpenGL 4.3 is required, for FramebufferParameteri. + Requires OpenGL ES 3.0, OpenGL 4.3, or ARB_framebuffer_no_attachments. Overview @@ -58,7 +59,10 @@ New Procedures and Functions - None + OpenGL ES must provide the following functions: + + void FramebufferParameteriMESA(enum target, enum pname, int param); + void GetFramebufferParameterivMESA(enum target, enum pname, int *params); New Types @@ -66,20 +70,37 @@ New Tokens - Accepted by the argument of FramebufferParameteri and - GetFramebufferParameteriv: + Accepted by the argument of FramebufferParameteriMESA and + GetFramebufferParameterivMESA: GL_FRAMEBUFFER_FLIP_Y_MESA 0x8BBB +Interactions with OpenGL 4.3, OpenGL ES 3.1, ARB_framebuffer_no_attachments +and any other versions and extensions that provide the entry points +FramebufferParameteri and GetFramebufferParameteriv + + Token GL_FRAMEBUFFER_FLIP_Y_MESA is accepted as the argument of + FramebufferParameteri and GetFramebufferParameteriv. + Errors - An INVALID_OPERATION error is generated by GetFramebufferParameteriv if the - default framebuffer is bound to and is FRAMEBUFFER_FLIP_Y_MESA. + An INVALID_OPERATION error is generated by GetFramebufferParameteriv or + GetFramebufferParameterivMESA if the default framebuffer is bound + to and is GL_FRAMEBUFFER_FLIP_Y_MESA. + + + + Revision History + Version 3, August, 2019 + Allow OpenGL ES 3.0 to implement by adding functions + FramebufferParameteriMESA and GetFramebufferParameterivMESA which were + previously only available in OpenGL ES 3.1. + Version 2, June, 2019 - Added OpenGL 4.3 as alternative requirement + Enable extension for OpenGL 4.3 and beyond Version 1, June, 2018 Initial draft (Fritz Koenig) diff -Nru mesa-19.2.8/docs/submittingpatches.html mesa-20.0.8/docs/submittingpatches.html --- mesa-19.2.8/docs/submittingpatches.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/submittingpatches.html 2020-06-12 01:21:16.000000000 +0000 @@ -42,10 +42,8 @@ git bisect.)
    • Patches should be properly formatted.
    • Patches should be sufficiently tested before submitting. -
    • Patches should be submitted -to mesa-dev or with -a merge request -for review. +
    • Patches should be submitted via a merge request for +review.
    @@ -58,32 +56,36 @@
  • The first line should be a short, concise summary of the change prefixed with a module name. Examples:
    -    mesa: Add support for querying GL_VERTEX_ATTRIB_ARRAY_LONG
    +mesa: Add support for querying GL_VERTEX_ATTRIB_ARRAY_LONG
     
    -    gallium: add PIPE_CAP_DEVICE_RESET_STATUS_QUERY
    +gallium: add PIPE_CAP_DEVICE_RESET_STATUS_QUERY
     
    -    i965: Fix missing type in local variable declaration.
    +i965: Fix missing type in local variable declaration.
     
  • Subsequent patch comments should describe the change in more detail, if needed. For example:
    -    i965: Remove end-of-thread SEND alignment code.
    +i965: Remove end-of-thread SEND alignment code.
     
    -    This was present in Eric's initial implementation of the compaction code
    -    for Sandybridge (commit 077d01b6). There is no documentation saying this
    -    is necessary, and removing it causes no regressions in piglit on any
    -    platform.
    +This was present in Eric's initial implementation of the compaction code
    +for Sandybridge (commit 077d01b6). There is no documentation saying this
    +is necessary, and removing it causes no regressions in piglit on any
    +platform.
     
  • A "Signed-off-by:" line is not required, but not discouraged either. -
  • If a patch addresses a bugzilla issue, that should be noted in the -patch comment. For example: +
  • If a patch addresses an issue in gitlab, use the Closes: tag +For example:
    -   Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89689
    +Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1
     
    +

    Prefer the full url to just Closes: #1, since the url makes it +easier to get to the bug page from git log

    +Do not use the Fixes: tag for this! Mesa already uses Fixes for something else. +
  • If a patch addresses a issue introduced with earlier commit, that should be noted in the patch comment. For example:
    -   Fixes: d7b3707c612 "util/disk_cache: use stat() to check if entry is a directory"
    +Fixes: d7b3707c612 "util/disk_cache: use stat() to check if entry is a directory"
     
  • You can produce those fixes lines by running
    git config --global alias.fixes "show -s --pretty='format:Fixes: %h (\"%s\")'"
    @@ -91,46 +93,33 @@
  • If there have been several revisions to a patch during the review process, they should be noted such as in this example:
    -    st/mesa: add ARB_texture_stencil8 support (v4)
    +st/mesa: add ARB_texture_stencil8 support (v4)
     
    -    if we support stencil texturing, enable texture_stencil8
    -    there is no requirement to support native S8 for this,
    -    the texture can be converted to x24s8 fine.
    -
    -    v2: fold fixes from Marek in:
    -       a) put S8 last in the list
    -       b) fix renderable to always test for d/s renderable
    -        fixup the texture case to use a stencil only format
    -        for picking the format for the texture view.
    -    v3: hit fallback for getteximage
    -    v4: put s8 back in front, it shouldn't get picked now (Ilia)
    +if we support stencil texturing, enable texture_stencil8
    +there is no requirement to support native S8 for this,
    +the texture can be converted to x24s8 fine.
    +
    +v2: fold fixes from Marek in:
    +   a) put S8 last in the list
    +   b) fix renderable to always test for d/s renderable
    +     fixup the texture case to use a stencil only format
    +     for picking the format for the texture view.
    +v3: hit fallback for getteximage
    +v4: put s8 back in front, it shouldn't get picked now (Ilia)
     
  • If someone tested your patch, document it with a line like this:
    -    Tested-by: Joe Hacker <jhacker@foo.com>
    +Tested-by: Joe Hacker <jhacker@foo.com>
     
  • If the patch was reviewed (usually the case) or acked by someone, that should be documented with:
    -    Reviewed-by: Joe Hacker <jhacker@foo.com>
    -    Acked-by: Joe Hacker <jhacker@foo.com>
    +Reviewed-by: Joe Hacker <jhacker@foo.com>
    +Acked-by: Joe Hacker <jhacker@foo.com>
     
  • If sending later revision of a patch, add all the tags - ack, r-b, Cc: mesa-stable and/or other. This provides reviewers with quick feedback if the patch has already been reviewed. -
  • In order for your patch to reach the prospective reviewer easier/faster, -use the script scripts/get_reviewer.pl to get a list of individuals and include -them in the CC list. -

    -Please use common sense and do not blindly add everyone. -

    -
    -    $ scripts/get_reviewer.pl --help # to get the help screen
    -    $ scripts/get_reviewer.pl -f src/egl/drivers/dri2/platform_android.c
    -    Rob Herring <robh@kernel.org> (reviewer:ANDROID EGL SUPPORT,added_lines:188/700=27%,removed_lines:58/283=20%)
    -    Tomasz Figa <tfiga@chromium.org> (reviewer:ANDROID EGL SUPPORT,authored:12/41=29%,added_lines:308/700=44%,removed_lines:115/283=41%)
    -    Emil Velikov <emil.l.velikov@gmail.com> (authored:13/41=32%,removed_lines:76/283=27%)
    -
    @@ -170,68 +159,14 @@ run.

    -

    Submitting Patches

    -Patches may be submitted to the Mesa project by -email or with a -GitLab merge request. To prevent -duplicate code review, only use one method to submit your changes. -

    - -

    Mailing Patches

    - -

    -Patches may be sent to the mesa-dev mailing list for review: - -mesa-dev@lists.freedesktop.org. -When submitting a patch make sure to use -git send-email -rather than attaching patches to emails. Sending patches as -attachments prevents people from being able to provide in-line review -comments. -

    - -

    -When submitting follow-up patches you can use --in-reply-to to make v2, v3, -etc patches show up as replies to the originals. This usually works well -when you're sending out updates to individual patches (as opposed to -re-sending the whole series). Using --in-reply-to makes -it harder for reviewers to accidentally review old patches. -

    - -

    -When submitting follow-up patches you should also login to -patchwork and change the -state of your old patches to Superseded. -

    - -

    -Some companies' mail server automatically append a legal disclaimer, -usually containing something along the lines of "The information in this -email is confidential" and "distribution is strictly prohibited". -

    -

    -These legal notices prevent us from being able to accept your patch, -rendering the whole process pointless. Please make sure these are -disabled before sending your patches. (Note that you may need to contact -your email administrator for this.) -

    - -

    GitLab Merge Requests

    - -

    - GitLab Merge - Requests (MR) can also be used to submit patches for Mesa. +Patches are submitted to the Mesa project via a +GitLab Merge Request.

    - If the MR may have interest for most of the Mesa community, you can - send an email to the mesa-dev email list including a link to the MR. - Don't send the patch to mesa-dev, just the MR link. -

    -

    Add labels to your MR to help reviewers find it. For example:

      @@ -279,23 +214,22 @@

      Reviewing Patches

      - To participate in code review, you should monitor the - - mesa-dev email list and the GitLab - Mesa Merge - Requests page. + To participate in code review, you can monitor the GitLab Mesa + Merge + Requests page, and/or register for notifications in your gitlab + settings.

      -When you've reviewed a patch on the mailing list, please be unambiguous -about your review. That is, state either +When you've reviewed a patch, please be unambiguous about your review. + That is, state either

      -    Reviewed-by: Joe Hacker <jhacker@foo.com>
      +Reviewed-by: Joe Hacker <jhacker@foo.com>
       
      or
      -    Acked-by: Joe Hacker <jhacker@foo.com>
      +Acked-by: Joe Hacker <jhacker@foo.com>
       

      Rather than saying just "LGTM" or "Seems OK". @@ -305,7 +239,7 @@ If small changes are suggested, it's OK to say something like:

      -   With the above fixes, Reviewed-by: Joe Hacker <jhacker@foo.com>
      +With the above fixes, Reviewed-by: Joe Hacker <jhacker@foo.com>
       

      which tells the patch author that the patch can be committed, as long @@ -322,7 +256,7 @@ enclose the tag in backticks:

      -  `Reviewed-by: Joe Hacker <jhacker@example.com>`
      +`Reviewed-by: Joe Hacker <jhacker@example.com>`

      This is the markdown format for literal, and will prevent gitlab from hiding the < and > symbols. @@ -471,28 +405,23 @@

    • git rebase -i ... is your friend. Don't be afraid to use it.
    • Apply a fixup to commit FOO.
      -    git add ...
      -    git commit --fixup=FOO
      -    git rebase -i --autosquash ...
      +git add ...
      +git commit --fixup=FOO
      +git rebase -i --autosquash ...
       
    • Test for build breakage between patches e.g last 8 commits.
      -    git rebase -i --exec="ninja -C build/" HEAD~8
      +git rebase -i --exec="ninja -C build/" HEAD~8
       
    • Sets the default mailing address for your repo.
      -    git config --local sendemail.to mesa-dev@lists.freedesktop.org
      +git config --local sendemail.to mesa-dev@lists.freedesktop.org
       
    • Add version to subject line of patch series in this case for the last 8 commits before sending.
      -    git send-email --subject-prefix="PATCH v4" HEAD~8
      -    git send-email -v4 @~8 # shorter version, inherited from git format-patch
      -
      -
    • Configure git to use the get_reviewer.pl script interactively. Thus you -can avoid adding the world to the CC list. -
      -    git config sendemail.cccmd "./scripts/get_reviewer.pl -i"
      +git send-email --subject-prefix="PATCH v4" HEAD~8
      +git send-email -v4 @~8 # shorter version, inherited from git format-patch
       
    diff -Nru mesa-19.2.8/docs/vmware-guest.html mesa-20.0.8/docs/vmware-guest.html --- mesa-19.2.8/docs/vmware-guest.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/vmware-guest.html 2020-06-12 01:21:16.000000000 +0000 @@ -111,18 +111,18 @@
  • Xserver version at least 1.7
  • Ubuntu: For ubuntu you need to install a number of build dependencies.
    -  sudo apt-get install git-core
    -  sudo apt-get install ninja-build meson libpthread-stubs0-dev
    -  sudo apt-get install xserver-xorg-dev x11proto-xinerama-dev libx11-xcb-dev
    -  sudo apt-get install libxcb-glx0-dev libxrender-dev
    -  sudo apt-get build-dep libgl1-mesa-dri libxcb-glx0-dev
    +sudo apt-get install git-core
    +sudo apt-get install ninja-build meson libpthread-stubs0-dev
    +sudo apt-get install xserver-xorg-dev x11proto-xinerama-dev libx11-xcb-dev
    +sudo apt-get install libxcb-glx0-dev libxrender-dev
    +sudo apt-get build-dep libgl1-mesa-dri libxcb-glx0-dev
       
  • Fedora: For Fedora you also need to install a number of build dependencies.
    -  sudo yum install mesa-libGL-devel xorg-x11-server-devel xorg-x11-util-macros
    -  sudo yum install libXrender-devel.i686
    -  sudo yum install ninja-build meson gcc expat-devel kernel-devel git-core
    -  sudo yum install makedepend flex bison
    +sudo yum install mesa-libGL-devel xorg-x11-server-devel xorg-x11-util-macros
    +sudo yum install libXrender-devel.i686
    +sudo yum install ninja-build meson gcc expat-devel kernel-devel git-core
    +sudo yum install makedepend flex bison
       
    @@ -137,27 +137,27 @@ Begin by saving your current directory location:
    -  export TOP=$PWD
    +export TOP=$PWD
       
    • Mesa/Gallium master branch. This code is used to build libGL, and the direct rendering svga driver for libGL, vmwgfx_dri.so, and the X acceleration library libxatracker.so.x.x.x.
      -  git clone https://gitlab.freedesktop.org/mesa/mesa.git
      +git clone https://gitlab.freedesktop.org/mesa/mesa.git
         
    • VMware Linux guest kernel module. Note that this repo contains the complete DRM and TTM code. The vmware-specific driver is really only the files prefixed with vmwgfx.
      -  git clone git://anongit.freedesktop.org/git/mesa/vmwgfx
      +git clone git://anongit.freedesktop.org/git/mesa/vmwgfx
         
    • libdrm, a user-space library that interfaces with drm. Most distros ship with this but it's safest to install a newer version. To get the latest code from git:
      -  git clone https://gitlab.freedesktop.org/mesa/drm.git
      +git clone https://gitlab.freedesktop.org/mesa/drm.git
         
    • xf86-video-vmware. The chainloading driver, vmware_drv.so, the legacy driver vmwlegacy_drv.so, and the vmwgfx driver vmwgfx_drv.so.
      -  git clone git://anongit.freedesktop.org/git/xorg/driver/xf86-video-vmware
      +git clone git://anongit.freedesktop.org/git/xorg/driver/xf86-video-vmware
         
    @@ -172,29 +172,29 @@

    For 32-bit Ubuntu systems:
    -  export LIBDIR=/usr/lib/i386-linux-gnu
    +export LIBDIR=/usr/lib/i386-linux-gnu
     
    For 64-bit Ubuntu systems:
    -  export LIBDIR=/usr/lib/x86_64-linux-gnu
    +export LIBDIR=/usr/lib/x86_64-linux-gnu
     
    For 32-bit Fedora systems:
    -  export LIBDIR=/usr/lib
    +export LIBDIR=/usr/lib
     
    For 64-bit Fedora systems:
    -  export LIBDIR=/usr/lib64
    +export LIBDIR=/usr/lib64
     
  • Build libdrm:
    -  cd $TOP/drm
    -  meson builddir --prefix=/usr --libdir=${LIBDIR}
    -  ninja -C builddir
    -  sudo ninja -C builddir install
    +cd $TOP/drm
    +meson builddir --prefix=/usr --libdir=${LIBDIR}
    +ninja -C builddir
    +sudo ninja -C builddir install
       
  • Build Mesa and the vmwgfx_dri.so driver, the vmwgfx_drv.so xorg driver, the X acceleration library libxatracker. @@ -206,10 +206,10 @@ The following configure options doesn't build the EGL system.

    -  cd $TOP/mesa
    -  meson builddir --prefix=/usr --libdir=${LIBDIR} -Dgallium-drivers=svga -Ddri-drivers=swrast -Dgallium-xa=true -Ddri3=false
    -  ninja -C builddir
    -  sudo ninja -C builddir install
    +cd $TOP/mesa
    +meson builddir --prefix=/usr --libdir=${LIBDIR} -Dgallium-drivers=svga -Ddri-drivers=swrast -Dgallium-xa=true -Ddri3=false
    +ninja -C builddir
    +sudo ninja -C builddir install
       

    @@ -221,34 +221,34 @@ building and replacing the current Xorg driver. First check if your system is 32- or 64-bit.

    -  cd $TOP/xf86-video-vmware
    -  ./autogen.sh --prefix=/usr --libdir=${LIBDIR}
    -  make
    -  sudo make install
    +cd $TOP/xf86-video-vmware
    +./autogen.sh --prefix=/usr --libdir=${LIBDIR}
    +make
    +sudo make install
       
  • vmwgfx kernel module. First make sure that any old version of this kernel module is removed from the system by issuing
    -  sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
    +sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
     
    Build and install:
    -  cd $TOP/vmwgfx
    -  make
    -  sudo make install
    -  sudo depmod -a
    +cd $TOP/vmwgfx
    +make
    +sudo make install
    +sudo depmod -a
     
    If you're using a Ubuntu OS:
    -  sudo update-initramfs -u
    +sudo update-initramfs -u
     
    If you're using a Fedora OS:
    -  sudo dracut --force
    +sudo dracut --force
     
    Add 'vmwgfx' to the /etc/modules file:
    -  echo vmwgfx | sudo tee -a /etc/modules
    +echo vmwgfx | sudo tee -a /etc/modules
     
    Note: some distros put DRM kernel drivers in different directories. @@ -259,7 +259,7 @@ After installing vmwgfx.ko you might want to run the following command to check that the new kernel module is in the expected place:
    -  find /lib/modules -name vmwgfx.ko -exec ls -l '{}' \;
    +find /lib/modules -name vmwgfx.ko -exec ls -l '{}' \;
     
    If you see the kernel module listed in more than one place, you may need to move things around. @@ -271,10 +271,10 @@ Now try to load the kernel module by issuing
    -  sudo modprobe vmwgfx
    +sudo modprobe vmwgfx Then type
    -  dmesg
    +dmesg to watch the debug output. It should contain a number of lines prefixed with "[vmwgfx]".

    @@ -301,7 +301,7 @@

    If you don't see this, try setting this environment variable:

    -  export LIBGL_DEBUG=verbose
    +export LIBGL_DEBUG=verbose

    then rerun glxinfo and examine the output for error messages.

    diff -Nru mesa-19.2.8/docs/xlibdriver.html mesa-20.0.8/docs/xlibdriver.html --- mesa-19.2.8/docs/xlibdriver.html 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/docs/xlibdriver.html 2020-06-12 01:21:16.000000000 +0000 @@ -64,12 +64,12 @@ Here are some examples:

    -   using csh:
    +using csh:
     	% setenv MESA_RGB_VISUAL "TrueColor 8"		// 8-bit TrueColor
     	% setenv MESA_CI_VISUAL "PseudoColor 12"	// 12-bit PseudoColor
     	% setenv MESA_RGB_VISUAL "PseudoColor 8"	// 8-bit PseudoColor
     
    -   using bash:
    +using bash:
     	$ export MESA_RGB_VISUAL="TrueColor 8"
     	$ export MESA_CI_VISUAL="PseudoColor 12"
     	$ export MESA_RGB_VISUAL="PseudoColor 8"
    @@ -146,8 +146,8 @@
     Examples:
     

    -	% export MESA_GAMMA="2.3 2.2 2.4"	// separate R,G,B values
    -	% export MESA_GAMMA="2.0"		// same gamma for R,G,B
    +% export MESA_GAMMA="2.3 2.2 2.4"	// separate R,G,B values
    +% export MESA_GAMMA="2.0"		// same gamma for R,G,B
     

    The demos/gamma.c program in mesa/demos repository may help @@ -183,7 +183,7 @@ SERVER_OVERLAY_VISUALS property:

    -	xprop -root | grep SERVER_OVERLAY_VISUALS
    +xprop -root | grep SERVER_OVERLAY_VISUALS
     
    @@ -207,8 +207,8 @@ This extension adds the GLX function:

    -    GLXPixmap glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visual,
    -                                      Pixmap pixmap, Colormap cmap )
    +GLXPixmap glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visual,
    +                                  Pixmap pixmap, Colormap cmap )
     

    It is an alternative to the standard glXCreateGLXPixmap() function. @@ -243,10 +243,10 @@ just before an X window is destroyed. For example:

    -         #ifdef GLX_MESA_release_buffers
    -            glXReleaseBuffersMESA( dpy, window );
    -         #endif
    -         XDestroyWindow( dpy, window );
    +#ifdef GLX_MESA_release_buffers
    +   glXReleaseBuffersMESA( dpy, window );
    +#endif
    +XDestroyWindow( dpy, window );
     

    GLX_MESA_release_buffers specification @@ -270,11 +270,11 @@

    Summary of X-related environment variables

    -   MESA_RGB_VISUAL - specifies the X visual and depth for RGB mode (X only)
    -   MESA_CI_VISUAL - specifies the X visual and depth for CI mode (X only)
    -   MESA_BACK_BUFFER - specifies how to implement the back color buffer (X only)
    -   MESA_PRIVATE_CMAP - force aux/tk libraries to use private colormaps (X only)
    -   MESA_GAMMA - gamma correction coefficients (X only)
    +MESA_RGB_VISUAL - specifies the X visual and depth for RGB mode (X only)
    +MESA_CI_VISUAL - specifies the X visual and depth for CI mode (X only)
    +MESA_BACK_BUFFER - specifies how to implement the back color buffer (X only)
    +MESA_PRIVATE_CMAP - force aux/tk libraries to use private colormaps (X only)
    +MESA_GAMMA - gamma correction coefficients (X only)
     
    diff -Nru mesa-19.2.8/.editorconfig mesa-20.0.8/.editorconfig --- mesa-19.2.8/.editorconfig 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/.editorconfig 2020-06-12 01:21:16.000000000 +0000 @@ -32,6 +32,10 @@ indent_style = space indent_size = 2 +[*.html] +indent_style = space +indent_size = 2 + [*.patch] trim_trailing_whitespace = false diff -Nru mesa-19.2.8/.gitlab-ci/debian-install.sh mesa-20.0.8/.gitlab-ci/debian-install.sh --- mesa-19.2.8/.gitlab-ci/debian-install.sh 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/debian-install.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,285 +0,0 @@ -#!/bin/bash - -set -e -set -o xtrace - -export DEBIAN_FRONTEND=noninteractive - -CROSS_ARCHITECTURES="armhf arm64 i386" -for arch in $CROSS_ARCHITECTURES; do - dpkg --add-architecture $arch -done - -apt-get install -y \ - apt-transport-https \ - ca-certificates \ - curl \ - wget \ - unzip \ - gnupg - -curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - -echo "deb [trusted=yes] https://apt.llvm.org/stretch/ llvm-toolchain-stretch-7 main" >/etc/apt/sources.list.d/llvm7.list -echo "deb [trusted=yes] https://apt.llvm.org/stretch/ llvm-toolchain-stretch-8 main" >/etc/apt/sources.list.d/llvm8.list - -sed -i -e 's/http:\/\/deb/https:\/\/deb/g' /etc/apt/sources.list -echo 'deb https://deb.debian.org/debian stretch-backports main' >/etc/apt/sources.list.d/backports.list -echo 'deb https://deb.debian.org/debian jessie main' >/etc/apt/sources.list.d/jessie.list - -apt-get update -apt-get install -y -t stretch-backports \ - llvm-3.4-dev \ - llvm-3.9-dev \ - libclang-3.9-dev \ - llvm-4.0-dev \ - libclang-4.0-dev \ - llvm-5.0-dev \ - libclang-5.0-dev \ - llvm-6.0-dev \ - libclang-6.0-dev \ - llvm-7-dev \ - libclang-7-dev \ - llvm-8-dev \ - libclang-8-dev \ - g++ \ - clang-8 - -# Install remaining packages from Debian buster to get newer versions -echo "deb https://deb.debian.org/debian/ buster main" >/etc/apt/sources.list.d/buster.list -echo "deb https://deb.debian.org/debian/ buster-updates main" >/etc/apt/sources.list.d/buster-updates.list -apt-get update -apt-get install -y \ - git \ - bzip2 \ - zlib1g-dev \ - pkg-config \ - libxrender-dev \ - libxdamage-dev \ - libxxf86vm-dev \ - gcc \ - git \ - libepoxy-dev \ - libegl1-mesa-dev \ - libgbm-dev \ - libclc-dev \ - libxvmc-dev \ - libomxil-bellagio-dev \ - xz-utils \ - libexpat1-dev \ - libx11-xcb-dev \ - libelf-dev \ - libunwind-dev \ - libglvnd-dev \ - libgtk-3-dev \ - libpng-dev \ - libgbm-dev \ - libgles2-mesa-dev \ - python-mako \ - python3-mako \ - bison \ - flex \ - gettext \ - cmake \ - meson \ - scons - -# Cross-build Mesa deps -for arch in $CROSS_ARCHITECTURES; do - apt-get install -y \ - libdrm-dev:${arch} \ - libexpat1-dev:${arch} \ - libelf-dev:${arch} -done -apt-get install -y \ - dpkg-dev \ - gcc-aarch64-linux-gnu \ - g++-aarch64-linux-gnu \ - gcc-arm-linux-gnueabihf \ - g++-arm-linux-gnueabihf \ - gcc-i686-linux-gnu \ - g++-i686-linux-gnu - -# for 64bit windows cross-builds -apt-get install -y mingw-w64 - -# for the vulkan overlay layer -wget https://github.com/KhronosGroup/glslang/releases/download/master-tot/glslang-master-linux-Release.zip -unzip glslang-master-linux-Release.zip bin/glslangValidator -install -m755 bin/glslangValidator /usr/local/bin/ -rm bin/glslangValidator glslang-master-linux-Release.zip - - -# dependencies where we want a specific version -export XORG_RELEASES=https://xorg.freedesktop.org/releases/individual -export XCB_RELEASES=https://xcb.freedesktop.org/dist -export WAYLAND_RELEASES=https://wayland.freedesktop.org/releases - -export XORGMACROS_VERSION=util-macros-1.19.0 -export GLPROTO_VERSION=glproto-1.4.17 -export DRI2PROTO_VERSION=dri2proto-2.8 -export LIBPCIACCESS_VERSION=libpciaccess-0.13.4 -export LIBDRM_VERSION=libdrm-2.4.99 -export XCBPROTO_VERSION=xcb-proto-1.13 -export RANDRPROTO_VERSION=randrproto-1.5.0 -export LIBXRANDR_VERSION=libXrandr-1.5.0 -export LIBXCB_VERSION=libxcb-1.13 -export LIBXSHMFENCE_VERSION=libxshmfence-1.3 -export LIBVDPAU_VERSION=libvdpau-1.1 -export LIBVA_VERSION=libva-1.7.0 -export LIBWAYLAND_VERSION=wayland-1.15.0 -export WAYLAND_PROTOCOLS_VERSION=wayland-protocols-1.12 - -wget $XORG_RELEASES/util/$XORGMACROS_VERSION.tar.bz2 -tar -xvf $XORGMACROS_VERSION.tar.bz2 && rm $XORGMACROS_VERSION.tar.bz2 -cd $XORGMACROS_VERSION; ./configure; make install; cd .. -rm -rf $XORGMACROS_VERSION - -wget $XORG_RELEASES/proto/$GLPROTO_VERSION.tar.bz2 -tar -xvf $GLPROTO_VERSION.tar.bz2 && rm $GLPROTO_VERSION.tar.bz2 -cd $GLPROTO_VERSION; ./configure; make install; cd .. -rm -rf $GLPROTO_VERSION - -wget $XORG_RELEASES/proto/$DRI2PROTO_VERSION.tar.bz2 -tar -xvf $DRI2PROTO_VERSION.tar.bz2 && rm $DRI2PROTO_VERSION.tar.bz2 -cd $DRI2PROTO_VERSION; ./configure; make install; cd .. -rm -rf $DRI2PROTO_VERSION - -wget $XCB_RELEASES/$XCBPROTO_VERSION.tar.bz2 -tar -xvf $XCBPROTO_VERSION.tar.bz2 && rm $XCBPROTO_VERSION.tar.bz2 -cd $XCBPROTO_VERSION; ./configure; make install; cd .. -rm -rf $XCBPROTO_VERSION - -wget $XCB_RELEASES/$LIBXCB_VERSION.tar.bz2 -tar -xvf $LIBXCB_VERSION.tar.bz2 && rm $LIBXCB_VERSION.tar.bz2 -cd $LIBXCB_VERSION; ./configure; make install; cd .. -rm -rf $LIBXCB_VERSION - -wget $XORG_RELEASES/lib/$LIBPCIACCESS_VERSION.tar.bz2 -tar -xvf $LIBPCIACCESS_VERSION.tar.bz2 && rm $LIBPCIACCESS_VERSION.tar.bz2 -cd $LIBPCIACCESS_VERSION; ./configure; make install; cd .. -rm -rf $LIBPCIACCESS_VERSION - -wget https://dri.freedesktop.org/libdrm/$LIBDRM_VERSION.tar.bz2 -tar -xvf $LIBDRM_VERSION.tar.bz2 && rm $LIBDRM_VERSION.tar.bz2 -cd $LIBDRM_VERSION; ./configure --enable-vc4 --enable-freedreno --enable-etnaviv-experimental-api; make install; cd .. -rm -rf $LIBDRM_VERSION - -wget $XORG_RELEASES/proto/$RANDRPROTO_VERSION.tar.bz2 -tar -xvf $RANDRPROTO_VERSION.tar.bz2 && rm $RANDRPROTO_VERSION.tar.bz2 -cd $RANDRPROTO_VERSION; ./configure; make install; cd .. -rm -rf $RANDRPROTO_VERSION - -wget $XORG_RELEASES/lib/$LIBXRANDR_VERSION.tar.bz2 -tar -xvf $LIBXRANDR_VERSION.tar.bz2 && rm $LIBXRANDR_VERSION.tar.bz2 -cd $LIBXRANDR_VERSION; ./configure; make install; cd .. -rm -rf $LIBXRANDR_VERSION - -wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2 -tar -xvf $LIBXSHMFENCE_VERSION.tar.bz2 && rm $LIBXSHMFENCE_VERSION.tar.bz2 -cd $LIBXSHMFENCE_VERSION; ./configure; make install; cd .. -rm -rf $LIBXSHMFENCE_VERSION - -wget https://people.freedesktop.org/~aplattner/vdpau/$LIBVDPAU_VERSION.tar.bz2 -tar -xvf $LIBVDPAU_VERSION.tar.bz2 && rm $LIBVDPAU_VERSION.tar.bz2 -cd $LIBVDPAU_VERSION; ./configure; make install; cd .. -rm -rf $LIBVDPAU_VERSION - -wget https://www.freedesktop.org/software/vaapi/releases/libva/$LIBVA_VERSION.tar.bz2 -tar -xvf $LIBVA_VERSION.tar.bz2 && rm $LIBVA_VERSION.tar.bz2 -cd $LIBVA_VERSION; ./configure --disable-wayland --disable-dummy-driver; make install; cd .. -rm -rf $LIBVA_VERSION - -wget $WAYLAND_RELEASES/$LIBWAYLAND_VERSION.tar.xz -tar -xvf $LIBWAYLAND_VERSION.tar.xz && rm $LIBWAYLAND_VERSION.tar.xz -cd $LIBWAYLAND_VERSION; ./configure --enable-libraries --without-host-scanner --disable-documentation --disable-dtd-validation; make install; cd .. -rm -rf $LIBWAYLAND_VERSION - -wget $WAYLAND_RELEASES/$WAYLAND_PROTOCOLS_VERSION.tar.xz -tar -xvf $WAYLAND_PROTOCOLS_VERSION.tar.xz && rm $WAYLAND_PROTOCOLS_VERSION.tar.xz -cd $WAYLAND_PROTOCOLS_VERSION; ./configure; make install; cd .. -rm -rf $WAYLAND_PROTOCOLS_VERSION - -pushd /usr/local -git clone https://gitlab.freedesktop.org/mesa/shader-db.git --depth 1 -rm -rf shader-db/.git -cd shader-db -make -popd - -# Use ccache to speed up builds -apt-get install -y ccache - -# We need xmllint to validate the XML files in Mesa -apt-get install -y libxml2-utils - - -# Generate cross build files for Meson -for arch in $CROSS_ARCHITECTURES; do - cross_file="/cross_file-$arch.txt" - /usr/share/meson/debcrossgen --arch "$arch" -o "$cross_file" - # Work around a bug in debcrossgen that should be fixed in the next release - if [ "$arch" = "i386" ]; then - sed -i "s|cpu_family = 'i686'|cpu_family = 'x86'|g" "$cross_file" - fi -done - - -############### Build dEQP -git config --global user.email "mesa@example.com" -git config --global user.name "Mesa CI" -# XXX: Use --depth 1 once we can drop the cherry-picks. -git clone \ - https://github.com/KhronosGroup/VK-GL-CTS.git \ - -b opengl-es-cts-3.2.5.1 \ - /VK-GL-CTS -cd /VK-GL-CTS -# Fix surfaceless build -git cherry-pick -x 22f41e5e321c6dcd8569c4dad91bce89f06b3670 -git cherry-pick -x 1daa8dff73161ea60ead965bd6c9f2a0a2165648 - -# surfaceless links against libkms and such despite not using it. -sed -i '/gbm/d' targets/surfaceless/surfaceless.cmake -sed -i '/libkms/d' targets/surfaceless/surfaceless.cmake -sed -i '/libgbm/d' targets/surfaceless/surfaceless.cmake - -python3 external/fetch_sources.py - -mkdir -p /deqp -cd /deqp -cmake -G Ninja \ - -DDEQP_TARGET=surfaceless \ - -DCMAKE_BUILD_TYPE=Release \ - /VK-GL-CTS -ninja - -# Copy out the mustpass lists we want from a bunch of other junk. -mkdir /deqp/mustpass -for gles in gles2 gles3 gles31; do - cp \ - /deqp/external/openglcts/modules/gl_cts/data/mustpass/gles/aosp_mustpass/3.2.5.x/$gles-master.txt \ - /deqp/mustpass/$gles-master.txt -done - -# Remove the rest of the build products that we don't need. -rm -rf /deqp/external -rm -rf /deqp/modules/internal -rm -rf /deqp/executor -rm -rf /deqp/execserver -rm -rf /deqp/modules/egl -rm -rf /deqp/framework -du -sh * -rm -rf /VK-GL-CTS - -############### Uninstall the build software - -apt-get purge -y \ - git \ - curl \ - unzip \ - gnupg \ - cmake \ - git \ - libgles2-mesa-dev \ - libgbm-dev - -apt-get autoremove -y --purge diff -Nru mesa-19.2.8/.gitlab-ci/deqp-default-skips.txt mesa-20.0.8/.gitlab-ci/deqp-default-skips.txt --- mesa-19.2.8/.gitlab-ci/deqp-default-skips.txt 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/deqp-default-skips.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -# Note: skips lists for CI are just a list of lines that, when -# non-zero-length and not starting with '#', will regex match to -# delete lines from the test list. Be careful. - -# Skip the perf/stress tests to keep runtime manageable -dEQP-GLES[0-9]*.performance -dEQP-GLES[0-9]*.stress - -# These are really slow on tiling architectures (including llvmpipe). -dEQP-GLES[0-9]*.functional.flush_finish diff -Nru mesa-19.2.8/.gitlab-ci/deqp-llvmpipe-fails.txt mesa-20.0.8/.gitlab-ci/deqp-llvmpipe-fails.txt --- mesa-19.2.8/.gitlab-ci/deqp-llvmpipe-fails.txt 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/deqp-llvmpipe-fails.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,124 +0,0 @@ -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner -dEQP-GLES2.functional.clipping.point.wide_point_clip -dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center -dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z -dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4 -dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.depth.rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgba4 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgba4 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_depthbuffer.no_rebind_rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_depthbuffer.rebind_rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_stencilbuffer.no_rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_stencilbuffer.rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgba4 -dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_depthbuffer.rbo_rgba4_depth_component16 -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units -dEQP-GLES2.functional.polygon_offset.fixed16_displacement_with_units -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide -dEQP-GLES2.functional.rasterization.limits.points -dEQP-GLES2.functional.shaders.texture_functions.fragment.texture2d_bias -dEQP-GLES2.functional.shaders.texture_functions.fragment.texture2dproj_vec3_bias -dEQP-GLES2.functional.shaders.texture_functions.fragment.texture2dproj_vec4_bias -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_linear_mirror_etc1 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_linear_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_linear_repeat_etc1 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_linear_repeat_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_mirror_etc1 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_repeat_etc1 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_repeat_l8 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_repeat_rgb888 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_repeat_rgba4444 -dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_linear_nearest_repeat_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_etc1 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_etc1 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_clamp_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_mirror_etc1 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_mirror_rgba8888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_repeat_etc1 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_repeat_l8 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_repeat_rgb888 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_repeat_rgba4444 -dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_nearest_repeat_rgba8888 -dEQP-GLES2.functional.texture.mipmap.2d.affine.linear_linear_repeat -dEQP-GLES2.functional.texture.mipmap.2d.affine.nearest_linear_clamp -dEQP-GLES2.functional.texture.mipmap.2d.affine.nearest_linear_mirror -dEQP-GLES2.functional.texture.mipmap.2d.affine.nearest_linear_repeat -dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat -dEQP-GLES2.functional.texture.mipmap.2d.basic.linear_linear_repeat_non_square -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat -dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square -dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_linear_repeat -dEQP-GLES2.functional.texture.mipmap.2d.projected.nearest_linear_clamp -dEQP-GLES2.functional.texture.mipmap.2d.projected.nearest_linear_mirror -dEQP-GLES2.functional.texture.mipmap.2d.projected.nearest_linear_repeat -dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_linear -dEQP-GLES2.functional.texture.mipmap.cube.basic.linear_nearest -dEQP-GLES2.functional.texture.mipmap.cube.bias.linear_linear -dEQP-GLES2.functional.texture.mipmap.cube.bias.linear_nearest -dEQP-GLES2.functional.texture.mipmap.cube.projected.linear_linear -dEQP-GLES2.functional.texture.mipmap.cube.projected.linear_nearest -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_linear_clamp -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_linear_mirror -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_linear_repeat -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_nearest_clamp -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_nearest_mirror -dEQP-GLES2.functional.texture.vertex.2d.filtering.linear_mipmap_linear_nearest_repeat -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_linear_clamp -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_linear_mirror -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_linear_repeat -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_nearest_clamp -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_nearest_mirror -dEQP-GLES2.functional.texture.vertex.2d.filtering.nearest_mipmap_linear_nearest_repeat -dEQP-GLES2.functional.texture.vertex.2d.wrap.clamp_clamp -dEQP-GLES2.functional.texture.vertex.2d.wrap.clamp_mirror -dEQP-GLES2.functional.texture.vertex.2d.wrap.clamp_repeat -dEQP-GLES2.functional.texture.vertex.2d.wrap.mirror_clamp -dEQP-GLES2.functional.texture.vertex.2d.wrap.mirror_mirror -dEQP-GLES2.functional.texture.vertex.2d.wrap.mirror_repeat -dEQP-GLES2.functional.texture.vertex.2d.wrap.repeat_clamp -dEQP-GLES2.functional.texture.vertex.2d.wrap.repeat_mirror -dEQP-GLES2.functional.texture.vertex.2d.wrap.repeat_repeat -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_linear_clamp -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_linear_mirror -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_linear_repeat -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_nearest_clamp -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_nearest_mirror -dEQP-GLES2.functional.texture.vertex.cube.filtering.linear_mipmap_linear_nearest_repeat -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_clamp -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_mirror -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_repeat -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_nearest_clamp -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_nearest_mirror -dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_nearest_repeat -dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_clamp -dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_mirror -dEQP-GLES2.functional.texture.vertex.cube.wrap.clamp_repeat -dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_clamp -dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_mirror -dEQP-GLES2.functional.texture.vertex.cube.wrap.mirror_repeat -dEQP-GLES2.functional.texture.vertex.cube.wrap.repeat_clamp -dEQP-GLES2.functional.texture.vertex.cube.wrap.repeat_mirror -dEQP-GLES2.functional.texture.vertex.cube.wrap.repeat_repeat diff -Nru mesa-19.2.8/.gitlab-ci/deqp-runner.sh mesa-20.0.8/.gitlab-ci/deqp-runner.sh --- mesa-19.2.8/.gitlab-ci/deqp-runner.sh 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/deqp-runner.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,112 +0,0 @@ -#!/bin/bash - -set -ex - -DEQP_OPTIONS=(--deqp-surface-width=256 --deqp-surface-height=256) -DEQP_OPTIONS+=(--deqp-surface-type=pbuffer) -DEQP_OPTIONS+=(--deqp-gl-config-name=rgba8888d24s8ms0) -DEQP_OPTIONS+=(--deqp-visibility=hidden) -DEQP_OPTIONS+=(--deqp-log-images=disable) -DEQP_OPTIONS+=(--deqp-watchdog=enable) -DEQP_OPTIONS+=(--deqp-crashhandler=enable) - -if [ -z "$DEQP_VER" ]; then - echo 'DEQP_VER must be set to something like "gles2" or "gles31" for the test run' - exit 1 -fi - -if [ -z "$DEQP_SKIPS" ]; then - echo 'DEQP_SKIPS must be set to something like "deqp-default-skips.txt"' - exit 1 -fi - -# Prep the expected failure list -if [ -n "$DEQP_EXPECTED_FAILS" ]; then - export DEQP_EXPECTED_FAILS=`pwd`/artifacts/$DEQP_EXPECTED_FAILS -else - export DEQP_EXPECTED_FAILS=/tmp/expect-no-failures.txt - touch $DEQP_EXPECTED_FAILS -fi -sort < $DEQP_EXPECTED_FAILS > /tmp/expected-fails.txt - -# Fix relative paths on inputs. -export DEQP_SKIPS=`pwd`/artifacts/$DEQP_SKIPS - -# Be a good citizen on the shared runners. -export LP_NUM_THREADS=4 - -# Set up the driver environment. -export LD_LIBRARY_PATH=`pwd`/install/lib/ -export EGL_PLATFORM=surfaceless - -# the runner was failing to look for libkms in /usr/local/lib for some reason -# I never figured out. -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib - -RESULTS=`pwd`/results -mkdir -p $RESULTS - -cd /deqp/modules/$DEQP_VER - -# Generate test case list file -cp /deqp/mustpass/$DEQP_VER-master.txt /tmp/case-list.txt - -# Note: not using sorted input and comm, becuase I want to run the tests in -# the same order that dEQP would. -while read -r line; do - if echo "$line" | grep -q '^[^#]'; then - sed -i "/$line/d" /tmp/case-list.txt - fi -done < $DEQP_SKIPS - -# If the job is parallel, take the corresponding fraction of the caselist. -# Note: N~M is a gnu sed extension to match every nth line (first line is #1). -if [ -n "$CI_NODE_INDEX" ]; then - sed -ni $CI_NODE_INDEX~$CI_NODE_TOTAL"p" /tmp/case-list.txt -fi - -if [ ! -s /tmp/case-list.txt ]; then - echo "Caselist generation failed" - exit 1 -fi - -# Cannot use tee because dash doesn't have pipefail -touch /tmp/result.txt -tail -f /tmp/result.txt & - -./deqp-$DEQP_VER "${DEQP_OPTIONS[@]}" --deqp-log-filename=$RESULTS/results.qpa --deqp-caselist-file=/tmp/case-list.txt >> /tmp/result.txt -DEQP_EXITCODE=$? - -sed -ne \ - '/StatusCode="Fail"/{x;p}; s/#beginTestCaseResult //; T; h' \ - $RESULTS/results.qpa \ - > /tmp/unsorted-fails.txt - -# Scrape out the renderer that the test run used, so we can validate that the -# right driver was used. -if grep -q "dEQP-.*.info.renderer" /tmp/case-list.txt; then - # This is an ugly dependency on the .qpa format: Print 3 lines after the - # match, which happens to contain the result. - RENDERER=`sed -n '/#beginTestCaseResult dEQP-.*.info.renderer/{n;n;n;p}' $RESULTS/results.qpa | sed -n -E "s|(.*)|\1|p"` - - echo "GL_RENDERER for this test run: $RENDERER" - - if [ -n "$DEQP_RENDERER_MATCH" ]; then - echo $RENDERER | grep -q $DEQP_RENDERER_MATCH > /dev/null - fi -fi - -if [ $DEQP_EXITCODE -ne 0 ]; then - exit $DEQP_EXITCODE -fi - -sort < /tmp/unsorted-fails.txt > $RESULTS/fails.txt - -comm -23 $RESULTS/fails.txt /tmp/expected-fails.txt > /tmp/new-fails.txt -if [ -s /tmp/new-fails.txt ]; then - echo "Unexpected failures:" - cat /tmp/new-fails.txt - exit 1 -else - echo "No new failures" -fi diff -Nru mesa-19.2.8/.gitlab-ci/deqp-softpipe-fails.txt mesa-20.0.8/.gitlab-ci/deqp-softpipe-fails.txt --- mesa-19.2.8/.gitlab-ci/deqp-softpipe-fails.txt 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/deqp-softpipe-fails.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,445 +0,0 @@ -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center -dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner -dEQP-GLES2.functional.clipping.point.wide_point_clip -dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center -dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units -dEQP-GLES2.functional.polygon_offset.fixed16_displacement_with_units -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide -dEQP-GLES2.functional.rasterization.limits.points -dEQP-GLES2.functional.rasterization.primitives.points -dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center -dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner -dEQP-GLES3.functional.clipping.point.wide_point_clip -dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center -dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z -dEQP-GLES3.functional.draw.random.124 -dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth24_stencil8 -dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth32f_stencil8 -dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth_component16 -dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth_component24 -dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth_component32f -dEQP-GLES3.functional.fbo.depth.depth_write_clamp.depth32f_stencil8 -dEQP-GLES3.functional.fbo.depth.depth_write_clamp.depth_component32f -dEQP-GLES3.functional.fbo.invalidate.sub.unbind_blit_msaa_color -dEQP-GLES3.functional.fbo.invalidate.sub.unbind_blit_msaa_depth -dEQP-GLES3.functional.fbo.invalidate.sub.unbind_blit_msaa_depth_stencil -dEQP-GLES3.functional.fbo.invalidate.sub.unbind_blit_msaa_stencil -dEQP-GLES3.functional.fbo.invalidate.whole.unbind_blit_msaa_color -dEQP-GLES3.functional.fbo.invalidate.whole.unbind_blit_msaa_depth -dEQP-GLES3.functional.fbo.invalidate.whole.unbind_blit_msaa_depth_stencil -dEQP-GLES3.functional.fbo.invalidate.whole.unbind_blit_msaa_stencil -dEQP-GLES3.functional.fbo.msaa.2_samples.depth24_stencil8 -dEQP-GLES3.functional.fbo.msaa.2_samples.depth32f_stencil8 -dEQP-GLES3.functional.fbo.msaa.2_samples.depth_component16 -dEQP-GLES3.functional.fbo.msaa.2_samples.depth_component24 -dEQP-GLES3.functional.fbo.msaa.2_samples.depth_component32f -dEQP-GLES3.functional.fbo.msaa.2_samples.r11f_g11f_b10f -dEQP-GLES3.functional.fbo.msaa.2_samples.r16f -dEQP-GLES3.functional.fbo.msaa.2_samples.r8 -dEQP-GLES3.functional.fbo.msaa.2_samples.rg16f -dEQP-GLES3.functional.fbo.msaa.2_samples.rg8 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgb10_a2 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgb565 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgb5_a1 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgb8 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgba4 -dEQP-GLES3.functional.fbo.msaa.2_samples.rgba8 -dEQP-GLES3.functional.fbo.msaa.2_samples.srgb8_alpha8 -dEQP-GLES3.functional.fbo.msaa.2_samples.stencil_index8 -dEQP-GLES3.functional.fbo.msaa.4_samples.depth24_stencil8 -dEQP-GLES3.functional.fbo.msaa.4_samples.depth32f_stencil8 -dEQP-GLES3.functional.fbo.msaa.4_samples.depth_component16 -dEQP-GLES3.functional.fbo.msaa.4_samples.depth_component24 -dEQP-GLES3.functional.fbo.msaa.4_samples.depth_component32f -dEQP-GLES3.functional.fbo.msaa.4_samples.r11f_g11f_b10f -dEQP-GLES3.functional.fbo.msaa.4_samples.r16f -dEQP-GLES3.functional.fbo.msaa.4_samples.r8 -dEQP-GLES3.functional.fbo.msaa.4_samples.rg16f -dEQP-GLES3.functional.fbo.msaa.4_samples.rg8 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgb10_a2 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgb565 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgb5_a1 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgb8 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgba4 -dEQP-GLES3.functional.fbo.msaa.4_samples.rgba8 -dEQP-GLES3.functional.fbo.msaa.4_samples.srgb8_alpha8 -dEQP-GLES3.functional.fbo.msaa.4_samples.stencil_index8 -dEQP-GLES3.functional.multisample.fbo_max_samples.proportionality_alpha_to_coverage -dEQP-GLES3.functional.multisample.fbo_max_samples.proportionality_sample_coverage -dEQP-GLES3.functional.multisample.fbo_max_samples.proportionality_sample_coverage_inverted -dEQP-GLES3.functional.multisample.fbo_max_samples.sample_coverage_invert -dEQP-GLES3.functional.negative_api.buffer.blit_framebuffer_multisample -dEQP-GLES3.functional.negative_api.buffer.read_pixels_fbo_format_mismatch -dEQP-GLES3.functional.polygon_offset.default_displacement_with_units -dEQP-GLES3.functional.polygon_offset.fixed16_displacement_with_units -dEQP-GLES3.functional.polygon_offset.fixed24_displacement_with_units -dEQP-GLES3.functional.polygon_offset.float32_displacement_with_units -dEQP-GLES3.functional.rasterization.fbo.rbo_multisample_max.interpolation.lines_wide -dEQP-GLES3.functional.rasterization.fbo.rbo_multisample_max.primitives.lines_wide -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.primitives.points -dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide -dEQP-GLES3.functional.rasterization.fbo.texture_2d.primitives.points -dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide -dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide -dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide -dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide -dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide -dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide -dEQP-GLES3.functional.rasterization.primitives.points -dEQP-GLES3.functional.rasterizer_discard.basic.write_depth_points -dEQP-GLES3.functional.rasterizer_discard.basic.write_stencil_points -dEQP-GLES3.functional.rasterizer_discard.fbo.write_depth_points -dEQP-GLES3.functional.rasterizer_discard.fbo.write_stencil_points -dEQP-GLES3.functional.rasterizer_discard.scissor.write_depth_points -dEQP-GLES3.functional.rasterizer_discard.scissor.write_stencil_points -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fastest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa2.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.nicest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdx.texture.msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fastest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa2.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.nicest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.dfdy.texture.msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fastest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.float_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.float_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec2_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec3_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec4_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa2.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.float_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.float_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec2_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec2_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec3_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec3_mediump -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec4_highp -dEQP-GLES3.functional.shaders.derivate.fwidth.texture.msaa4.vec4_mediump -dEQP-GLES3.functional.state_query.integers.max_samples_getfloat -dEQP-GLES3.functional.state_query.integers.max_samples_getinteger64 -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_linear_nearest_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_mipmap_nearest_nearest_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.linear_nearest_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_linear_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_clamp_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_clamp_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_clamp_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_mirror_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_mirror_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_mirror_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_clamp_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_clamp_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_mirror_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_mirror_repeat -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_repeat_mirror -dEQP-GLES3.functional.texture.filtering.3d.combinations.nearest_mipmap_nearest_linear_repeat_repeat_repeat -dEQP-GLES3.functional.texture.filtering.3d.formats.r11f_g11f_b10f_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.r11f_g11f_b10f_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.r11f_g11f_b10f_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.r11f_g11f_b10f_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.r11f_g11f_b10f_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb10_a2_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb10_a2_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb10_a2_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb10_a2_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb10_a2_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb565_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb565_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb565_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb565_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb565_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb5_a1_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb5_a1_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb5_a1_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb5_a1_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb5_a1_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb9_e5_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb9_e5_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb9_e5_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb9_e5_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgb9_e5_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba16f_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba16f_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba16f_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba16f_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba16f_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba4_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba4_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba4_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba4_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba4_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_snorm_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_snorm_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_snorm_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_snorm_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.rgba8_snorm_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb8_alpha8_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb8_alpha8_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb8_alpha8_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb8_alpha8_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb8_alpha8_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb_r8_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb_r8_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb_r8_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb_r8_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.formats.srgb_r8_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.sizes.128x32x64_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.128x32x64_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.128x32x64_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.sizes.128x32x64_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.128x32x64_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.sizes.63x63x63_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.63x63x63_linear_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.63x63x63_linear_mipmap_nearest -dEQP-GLES3.functional.texture.filtering.3d.sizes.63x63x63_nearest_mipmap_linear -dEQP-GLES3.functional.texture.filtering.3d.sizes.63x63x63_nearest_mipmap_nearest -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_linear_clamp -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_linear_mirror -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_linear_clamp -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_linear_mirror -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_nearest_clamp -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_nearest_mirror -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_linear_nearest_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_mipmap_nearest_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_nearest_clamp -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_nearest_mirror -dEQP-GLES3.functional.texture.vertex.3d.filtering.linear_nearest_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.nearest_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.nearest_mipmap_linear_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.filtering.nearest_mipmap_nearest_linear_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_clamp_clamp -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_clamp_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_clamp_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_mirror_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_mirror_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_repeat_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.clamp_repeat_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_clamp_clamp -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_clamp_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_clamp_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_mirror_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_mirror_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_repeat_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.mirror_repeat_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_clamp_clamp -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_clamp_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_clamp_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_mirror_clamp -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_mirror_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_mirror_repeat -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_repeat_clamp -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_repeat_mirror -dEQP-GLES3.functional.texture.vertex.3d.wrap.repeat_repeat_repeat -dEQP-GLES3.functional.texture.wrap.astc_8x8.repeat_repeat_linear_divisible -dEQP-GLES3.functional.texture.wrap.astc_8x8.repeat_repeat_linear_not_divisible -dEQP-GLES3.functional.texture.wrap.astc_8x8_srgb.repeat_repeat_linear_divisible -dEQP-GLES3.functional.texture.wrap.astc_8x8_srgb.repeat_repeat_linear_not_divisible -dEQP-GLES3.functional.vertex_arrays.single_attribute.normalize.int2_10_10_10.components4_quads1 -dEQP-GLES3.functional.vertex_arrays.single_attribute.normalize.int2_10_10_10.components4_quads256 diff -Nru mesa-19.2.8/.gitlab-ci/meson-build.sh mesa-20.0.8/.gitlab-ci/meson-build.sh --- mesa-19.2.8/.gitlab-ci/meson-build.sh 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/meson-build.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,62 +0,0 @@ -#!/bin/bash - -set -e -set -o xtrace - -# We need to control the version of llvm-config we're using, so we'll -# generate a native file to do so. This requires meson >=0.49 -if test -n "$LLVM_VERSION"; then - LLVM_CONFIG="llvm-config-${LLVM_VERSION}" - echo -e "[binaries]\nllvm-config = '`which $LLVM_CONFIG`'" > native.file - $LLVM_CONFIG --version -else - rm -f native.file - touch native.file -fi - -rm -rf _build -meson _build --native-file=native.file \ - ${CROSS+--cross /cross_file-$CROSS.txt} \ - -D prefix=`pwd`/install \ - -D libdir=lib \ - -D buildtype=${BUILDTYPE:-debug} \ - -D build-tests=true \ - -D libunwind=${UNWIND} \ - ${DRI_LOADERS} \ - -D dri-drivers=${DRI_DRIVERS:-[]} \ - ${GALLIUM_ST} \ - -D gallium-drivers=${GALLIUM_DRIVERS:-[]} \ - -D vulkan-drivers=${VULKAN_DRIVERS:-[]} \ - -D I-love-half-baked-turnips=true \ - ${EXTRA_OPTION} -cd _build -meson configure -ninja -j4 -LC_ALL=C.UTF-8 ninja test -ninja install -cd .. - -if test -n "$MESON_SHADERDB"; then - ./.gitlab-ci/run-shader-db.sh; -fi - -# Delete 2MB of includes from artifacts. -rm -rf install/include - -# Strip the drivers in the artifacts to cut 80% of the artifacts size. -if [ -n "$CROSS" ]; then - STRIP=`sed -n -E "s/strip\s*=\s*'(.*)'/\1/p" /cross_file-$CROSS.txt` - if [ -z "$STRIP" ]; then - echo "Failed to find strip command in cross file" - exit 1 - fi -else - STRIP="strip" -fi -find install -name \*.so -exec $STRIP {} \; - -# Test runs don't pull down the git tree, so put the dEQP helper -# script and associated bits there. -mkdir -p artifacts/ -cp -Rp .gitlab-ci/deqp* artifacts/ -# cp -Rp src/freedreno/ci/expected* artifacts/ diff -Nru mesa-19.2.8/.gitlab-ci/run-shader-db.sh mesa-20.0.8/.gitlab-ci/run-shader-db.sh --- mesa-19.2.8/.gitlab-ci/run-shader-db.sh 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci/run-shader-db.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -set -e -set -v - -ARTIFACTSDIR=`pwd`/shader-db -mkdir -p $ARTIFACTSDIR -export DRM_SHIM_DEBUG=true - -LIBDIR=`pwd`/install/lib -export LD_LIBRARY_PATH=$LIBDIR - -cd /usr/local/shader-db - -for driver in freedreno v3d; do - env LD_PRELOAD=$LIBDIR/lib${driver}_noop_drm_shim.so \ - ./run -j 4 ./shaders \ - > $ARTIFACTSDIR/${driver}-shader-db.txt -done diff -Nru mesa-19.2.8/.gitlab-ci.yml mesa-20.0.8/.gitlab-ci.yml --- mesa-19.2.8/.gitlab-ci.yml 2020-07-13 13:21:26.000000000 +0000 +++ mesa-20.0.8/.gitlab-ci.yml 1970-01-01 00:00:00.000000000 +0000 @@ -1,382 +0,0 @@ -# This is the tag of the docker image used for the build jobs. If the -# image doesn't exist yet, the containers-build stage generates it. -# -# In order to generate a new image, one should generally change the tag. -# While removing the image from the registry would also work, that's not -# recommended except for ephemeral images during development: Replacing -# an image after a significant amount of time might pull in newer -# versions of gcc/clang or other packages, which might break the build -# with older commits using the same tag. -# -# After merging a change resulting in generating a new image to the -# main repository, it's recommended to remove the image from the source -# repository's container registry, so that the image from the main -# repository's registry will be used there as well. -variables: - UPSTREAM_REPO: mesa/mesa - DEBIAN_TAG: "2019-08-09" - DEBIAN_VERSION: stretch-slim - DEBIAN_IMAGE: "$CI_REGISTRY_IMAGE/debian/$DEBIAN_VERSION:$DEBIAN_TAG" - -include: - - project: 'wayland/ci-templates' - ref: c73dae8b84697ef18e2dbbf4fed7386d9652b0cd - file: '/templates/debian.yml' - -stages: - - containers-build - - build+test - - test - - -# When to automatically run the CI -.ci-run-policy: &ci-run-policy - only: - - branches@mesa/mesa - - merge_requests - - /^ci([-/].*)?$/ - retry: - max: 2 - when: - - runner_system_failure - -.ci-deqp-artifacts: &ci-deqp-artifacts - artifacts: - when: always - untracked: false - paths: - # Watch out! Artifacts are relative to the build dir. - # https://gitlab.com/gitlab-org/gitlab-ce/commit/8788fb925706cad594adf6917a6c5f6587dd1521 - - artifacts - -# CONTAINERS - -debian: - extends: .debian@container-ifnot-exists - stage: containers-build - <<: *ci-run-policy - variables: - GIT_STRATEGY: none # no need to pull the whole tree for rebuilding the image - DEBIAN_EXEC: 'bash .gitlab-ci/debian-install.sh' - - -# BUILD - -.build: - <<: *ci-run-policy - image: $DEBIAN_IMAGE - stage: build+test - cache: - paths: - - ccache - artifacts: - when: always - paths: - - _build/meson-logs/*.txt - # scons: - - build/*/config.log - - shader-db - variables: - CCACHE_COMPILERCHECK: "content" - # Use ccache transparently, and print stats before/after - before_script: - - export PATH="/usr/lib/ccache:$PATH" - - export CCACHE_BASEDIR="$PWD" - - export CCACHE_DIR="$PWD/ccache" - - ccache --zero-stats || true - - ccache --show-stats || true - after_script: - # In case the install dir is being saved as artifacts, tar it up - # so that symlinks and hardlinks aren't each packed separately in - # the zip file. - - if [ -d install ]; then - tar -cf artifacts/install.tar install; - fi - - export CCACHE_DIR="$PWD/ccache" - - ccache --show-stats - -.meson-build: - extends: .build - script: - - .gitlab-ci/meson-build.sh - -.scons-build: - extends: .build - variables: - SCONSFLAGS: "-j4" - script: - - if test -n "$LLVM_VERSION"; then - export LLVM_CONFIG="llvm-config-${LLVM_VERSION}"; - fi - - scons $SCONS_TARGET - - eval $SCONS_CHECK_COMMAND - -# NOTE: Building SWR is 2x (yes two) times slower than all the other -# gallium drivers combined. -# Start this early so that it doesn't limit the total run time. -# -# We also stick the glvnd build here, since we want non-glvnd in -# meson-main for actual driver CI. -meson-swr-glvnd: - extends: .meson-build - variables: - UNWIND: "true" - DRI_LOADERS: > - -D glvnd=true - -D egl=true - GALLIUM_ST: > - -D dri3=true - -D gallium-vdpau=false - -D gallium-xvmc=false - -D gallium-omx=disabled - -D gallium-va=false - -D gallium-xa=false - -D gallium-nine=false - -D gallium-opencl=disabled - GALLIUM_DRIVERS: "swr,iris" - LLVM_VERSION: "6.0" - -meson-clang: - extends: .meson-build - variables: - UNWIND: "true" - DRI_DRIVERS: "auto" - GALLIUM_DRIVERS: "auto" - VULKAN_DRIVERS: intel,amd,freedreno - CC: "ccache clang-8" - CXX: "ccache clang++-8" - before_script: - - export CCACHE_BASEDIR="$PWD" CCACHE_DIR="$PWD/ccache" - - ccache --zero-stats --show-stats || true - # clang++ breaks if it picks up the GCC 8 directory without libstdc++.so - - apt-get remove -y libgcc-8-dev - -scons-swr: - extends: .scons-build - variables: - SCONS_TARGET: "swr=1" - SCONS_CHECK_COMMAND: "true" - LLVM_VERSION: "6.0" - -scons-win64: - extends: .scons-build - variables: - SCONS_TARGET: platform=windows machine=x86_64 - SCONS_CHECK_COMMAND: "true" - -meson-main: - extends: .meson-build - variables: - UNWIND: "true" - DRI_LOADERS: > - -D glx=dri - -D gbm=true - -D egl=true - -D platforms=x11,wayland,drm,surfaceless - DRI_DRIVERS: "i915,i965,r100,r200,nouveau" - GALLIUM_ST: > - -D dri3=true - -D gallium-extra-hud=true - -D gallium-vdpau=true - -D gallium-xvmc=true - -D gallium-omx=bellagio - -D gallium-va=true - -D gallium-xa=true - -D gallium-nine=true - -D gallium-opencl=disabled - GALLIUM_DRIVERS: "iris,nouveau,kmsro,r300,r600,freedreno,swrast,svga,v3d,vc4,virgl,etnaviv,panfrost,lima" - LLVM_VERSION: "7" - EXTRA_OPTION: > - -D osmesa=gallium - -D tools=all - MESON_SHADERDB: "true" - BUILDTYPE: "debugoptimized" - <<: *ci-deqp-artifacts - -meson-clover: - extends: .meson-build - variables: - UNWIND: "true" - DRI_LOADERS: > - -D glx=disabled - -D egl=false - -D gbm=false - GALLIUM_ST: > - -D dri3=false - -D gallium-vdpau=false - -D gallium-xvmc=false - -D gallium-omx=disabled - -D gallium-va=false - -D gallium-xa=false - -D gallium-nine=false - -D gallium-opencl=icd - script: - - export GALLIUM_DRIVERS="r600,radeonsi" - - .gitlab-ci/meson-build.sh - - LLVM_VERSION=7 .gitlab-ci/meson-build.sh - - export GALLIUM_DRIVERS="i915,r600" - - LLVM_VERSION=3.9 .gitlab-ci/meson-build.sh - - LLVM_VERSION=4.0 .gitlab-ci/meson-build.sh - - LLVM_VERSION=5.0 .gitlab-ci/meson-build.sh - - LLVM_VERSION=6.0 .gitlab-ci/meson-build.sh - -meson-vulkan: - extends: .meson-build - variables: - UNWIND: "false" - DRI_LOADERS: > - -D glx=disabled - -D gbm=false - -D egl=false - -D platforms=x11,wayland,drm - -D osmesa=none - GALLIUM_ST: > - -D dri3=true - -D gallium-vdpau=false - -D gallium-xvmc=false - -D gallium-omx=disabled - -D gallium-va=false - -D gallium-xa=false - -D gallium-nine=false - -D gallium-opencl=disabled - VULKAN_DRIVERS: intel,amd,freedreno - LLVM_VERSION: "7" - EXTRA_OPTION: > - -D vulkan-overlay-layer=true - -.meson-cross: - extends: .meson-build - variables: - UNWIND: "false" - DRI_LOADERS: > - -D glx=disabled - -D gbm=false - -D egl=false - -D platforms=surfaceless - -D osmesa=none - GALLIUM_ST: > - -D dri3=false - -D gallium-vdpau=false - -D gallium-xvmc=false - -D gallium-omx=disabled - -D gallium-va=false - -D gallium-xa=false - -D gallium-nine=false - -D llvm=false - <<: *ci-deqp-artifacts - script: - - .gitlab-ci/meson-build.sh - -meson-armhf: - extends: .meson-cross - variables: - CROSS: armhf - VULKAN_DRIVERS: freedreno - GALLIUM_DRIVERS: "etnaviv,freedreno,kmsro,lima,nouveau,panfrost,tegra,v3d,vc4" - # Disable the tests since we're cross compiling. - EXTRA_OPTION: > - -D build-tests=false - -D I-love-half-baked-turnips=true - -D vulkan-overlay-layer=true - -meson-arm64: - extends: .meson-cross - variables: - CROSS: arm64 - VULKAN_DRIVERS: freedreno - GALLIUM_DRIVERS: "etnaviv,freedreno,kmsro,lima,nouveau,panfrost,tegra,v3d,vc4" - # Disable the tests since we're cross compiling. - EXTRA_OPTION: > - -D build-tests=false - -D I-love-half-baked-turnips=true - -D vulkan-overlay-layer=true - -# While the main point of this build is testing the i386 cross build, -# we also use this one to test some other options that are exclusive -# with meson-main's choices (classic swrast and osmesa) -meson-i386: - extends: .meson-cross - variables: - CROSS: i386 - VULKAN_DRIVERS: intel - DRI_DRIVERS: "swrast" - GALLIUM_DRIVERS: "iris" - # Disable i386 tests, because u_format_tests gets precision - # failures in dxtn unpacking - EXTRA_OPTION: > - -D build-tests=false - -D vulkan-overlay-layer=true - -D llvm=false - -D osmesa=classic - -scons-nollvm: - extends: .scons-build - variables: - SCONS_TARGET: "llvm=0" - SCONS_CHECK_COMMAND: "scons llvm=0 check" - -scons-llvm: - extends: .scons-build - variables: - SCONS_TARGET: "llvm=1" - SCONS_CHECK_COMMAND: "scons llvm=1 check" - LLVM_VERSION: "3.4" - # LLVM 3.4 packages were built with an old libstdc++ ABI - CXX: "g++ -D_GLIBCXX_USE_CXX11_ABI=0" - -.deqp-test: - <<: *ci-run-policy - stage: test - image: $DEBIAN_IMAGE - variables: - GIT_STRATEGY: none # testing doesn't build anything from source - DEQP_SKIPS: deqp-default-skips.txt - script: - # Note: Build dir (and thus install) may be dirty due to GIT_STRATEGY - - rm -rf install - - tar -xf artifacts/install.tar - - ./artifacts/deqp-runner.sh - artifacts: - when: on_failure - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME" - paths: - - results/ - -test-llvmpipe-gles2: - parallel: 4 - variables: - DEQP_VER: gles2 - DEQP_EXPECTED_FAILS: deqp-llvmpipe-fails.txt - LIBGL_ALWAYS_SOFTWARE: "true" - DEQP_RENDERER_MATCH: "llvmpipe" - extends: .deqp-test - dependencies: - - meson-main - -test-softpipe-gles2: - parallel: 4 - variables: - DEQP_VER: gles2 - DEQP_EXPECTED_FAILS: deqp-softpipe-fails.txt - LIBGL_ALWAYS_SOFTWARE: "true" - DEQP_RENDERER_MATCH: "softpipe" - GALLIUM_DRIVER: "softpipe" - extends: .deqp-test - dependencies: - - meson-main - -# The GLES2 CTS run takes about 8 minutes of CPU time, while GLES3 is -# 25 minutes. Until we can get its runtime down, just do a partial -# (every 10 tests) run. -test-softpipe-gles3-limited: - variables: - DEQP_VER: gles3 - DEQP_EXPECTED_FAILS: deqp-softpipe-fails.txt - LIBGL_ALWAYS_SOFTWARE: "true" - DEQP_RENDERER_MATCH: "softpipe" - GALLIUM_DRIVER: "softpipe" - CI_NODE_INDEX: 1 - CI_NODE_TOTAL: 10 - extends: .deqp-test - dependencies: - - meson-main diff -Nru mesa-19.2.8/include/CL/cl2.hpp mesa-20.0.8/include/CL/cl2.hpp --- mesa-19.2.8/include/CL/cl2.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/CL/cl2.hpp 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,8 @@ /*! \file * * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), - * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29) + * OpenCL 1.2 (rev 15), OpenCL 2.0 (rev 29), OpenCL 2.1 (rev 17), + * and OpenCL 2.2 (V2.2-11). * \author Lee Howes and Bruce Merry * * Derived from the OpenCL 1.x C++ bindings written by @@ -40,9 +41,8 @@ * Bruce Merry, February 2013. * Tom Deakin and Simon McIntosh-Smith, July 2013 * James Price, 2015- - * - * \version 2.0.10 - * \date 2016-07-20 + * \version 2.2.0 + * \date 2019-09-18 * * Optional extension support * @@ -207,6 +207,14 @@ * applies to use of cl::Program construction and other program * build variants. * + * - CL_HPP_USE_CL_SUB_GROUPS_KHR + * + * Enable the cl_khr_subgroups extension. + * + * - CL_HPP_USE_IL_KHR + * + * Enable the cl_khr_il_program extension. + * * * \section example Example * @@ -439,13 +447,18 @@ /* Detect which version to target */ #if !defined(CL_HPP_TARGET_OPENCL_VERSION) -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 200 (OpenCL 2.0)") -# define CL_HPP_TARGET_OPENCL_VERSION 200 +# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 220 (OpenCL 2.2)") +# define CL_HPP_TARGET_OPENCL_VERSION 220 #endif -#if CL_HPP_TARGET_OPENCL_VERSION != 100 && CL_HPP_TARGET_OPENCL_VERSION != 110 && CL_HPP_TARGET_OPENCL_VERSION != 120 && CL_HPP_TARGET_OPENCL_VERSION != 200 -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 200") +#if CL_HPP_TARGET_OPENCL_VERSION != 100 && \ + CL_HPP_TARGET_OPENCL_VERSION != 110 && \ + CL_HPP_TARGET_OPENCL_VERSION != 120 && \ + CL_HPP_TARGET_OPENCL_VERSION != 200 && \ + CL_HPP_TARGET_OPENCL_VERSION != 210 && \ + CL_HPP_TARGET_OPENCL_VERSION != 220 +# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 220") # undef CL_HPP_TARGET_OPENCL_VERSION -# define CL_HPP_TARGET_OPENCL_VERSION 200 +# define CL_HPP_TARGET_OPENCL_VERSION 220 #endif /* Forward target OpenCL version to C headers if necessary */ @@ -462,8 +475,13 @@ #if !defined(CL_HPP_MINIMUM_OPENCL_VERSION) # define CL_HPP_MINIMUM_OPENCL_VERSION 200 #endif -#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && CL_HPP_MINIMUM_OPENCL_VERSION != 110 && CL_HPP_MINIMUM_OPENCL_VERSION != 120 && CL_HPP_MINIMUM_OPENCL_VERSION != 200 -# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 100") +#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 110 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 220 +# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 100") # undef CL_HPP_MINIMUM_OPENCL_VERSION # define CL_HPP_MINIMUM_OPENCL_VERSION 100 #endif @@ -483,6 +501,12 @@ #if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) # define CL_USE_DEPRECATED_OPENCL_2_0_APIS #endif +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) +# define CL_USE_DEPRECATED_OPENCL_2_1_APIS +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) +# define CL_USE_DEPRECATED_OPENCL_2_2_APIS +#endif #ifdef _WIN32 @@ -525,6 +549,8 @@ #if defined(_MSC_VER) # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) +#elif defined(__MINGW32__) +# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany)) #else # define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak)) #endif // !_MSC_VER @@ -803,6 +829,9 @@ #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#define __GET_KERNEL_SUB_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelSubGroupInfo) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 #define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo) #define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo) #define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo) @@ -833,7 +862,13 @@ #define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel) #define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg) #define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource) +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary) +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 +#define __CREATE_PROGRAM_WITH_IL_ERR CL_HPP_ERR_STR_(clCreateProgramWithIL) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -870,6 +905,11 @@ #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 +#define __ENQUEUE_MIGRATE_SVM_ERR CL_HPP_ERR_STR_(clEnqueueSVMMigrateMem) +#define __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clSetDefaultDeviceCommandQueue) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 + #define __ENQUEUE_ACQUIRE_GL_ERR CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects) #define __ENQUEUE_RELEASE_GL_ERR CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects) @@ -884,6 +924,16 @@ #define __FINISH_ERR CL_HPP_ERR_STR_(clFinish) #define __VECTOR_CAPACITY_ERR CL_HPP_ERR_STR_(Vector capacity error) +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 +#define __GET_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetHostTimer) +#define __GET_DEVICE_AND_HOST_TIMER_ERR CL_HPP_ERR_STR_(clGetDeviceAndHostTimer) +#endif +#if CL_HPP_TARGET_OPENCL_VERSION >= 220 +#define __SET_PROGRAM_RELEASE_CALLBACK_ERR CL_HPP_ERR_STR_(clSetProgramReleaseCallback) +#define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR CL_HPP_ERR_STR_(clSetProgramSpecializationConstant) +#endif + + /** * CL 1.2 version that uses device fission. */ @@ -924,6 +974,10 @@ #define __ENQUEUE_BARRIER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 +#define __CLONE_KERNEL_ERR CL_HPP_ERR_STR_(clCloneKernel) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 + #endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS //! \endcond @@ -1304,6 +1358,31 @@ F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \ F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint) +#define CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(F) \ + F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, size_type) + +#define CL_HPP_PARAM_NAME_INFO_IL_KHR_(F) \ + F(cl_device_info, CL_DEVICE_IL_VERSION_KHR, string) \ + F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector) + +#define CL_HPP_PARAM_NAME_INFO_2_1_(F) \ + F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, size_type) \ + F(cl_program_info, CL_PROGRAM_IL, cl::vector) \ + F(cl_kernel_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ + F(cl_kernel_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) \ + F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \ + F(cl_device_info, CL_DEVICE_IL_VERSION, string) \ + F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \ + F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) + +#define CL_HPP_PARAM_NAME_INFO_2_2_(F) \ + F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ + F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool) + #define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \ F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector) \ @@ -1329,10 +1408,24 @@ #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 #if CL_HPP_TARGET_OPENCL_VERSION >= 120 CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 #if CL_HPP_TARGET_OPENCL_VERSION >= 200 CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 +CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210 +#if CL_HPP_TARGET_OPENCL_VERSION >= 220 +CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 + +#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 +CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 + +#if defined(CL_HPP_USE_IL_KHR) +CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // #if defined(CL_HPP_USE_IL_KHR) // Flags deprecated in OpenCL 2.0 @@ -1400,6 +1493,13 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) #endif +#ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) +#endif +#ifdef CL_DEVICE_JOB_SLOTS_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) +#endif + #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) #endif @@ -1710,10 +1810,7 @@ cl_type& operator ()() { return object_; } - const cl_type get() const { return object_; } - - cl_type get() { return object_; } - + cl_type get() const { return object_; } protected: template @@ -2103,6 +2200,53 @@ return param; } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /** + * Return the current value of the host clock as seen by the device. + * The resolution of the device timer may be queried with the + * CL_DEVICE_PROFILING_TIMER_RESOLUTION query. + * @return The host timer value. + */ + cl_ulong getHostTimer(cl_int *error = nullptr) + { + cl_ulong retVal = 0; + cl_int err = + clGetHostTimer(this->get(), &retVal); + detail::errHandler( + err, + __GET_HOST_TIMER_ERR); + if (error) { + *error = err; + } + return retVal; + } + + /** + * Return a synchronized pair of host and device timestamps as seen by device. + * Use to correlate the clocks and get the host timer only using getHostTimer + * as a lower cost mechanism in between calls. + * The resolution of the host timer may be queried with the + * CL_PLATFORM_HOST_TIMER_RESOLUTION query. + * The resolution of the device timer may be queried with the + * CL_DEVICE_PROFILING_TIMER_RESOLUTION query. + * @return A pair of (device timer, host timer) timer values. + */ + std::pair getDeviceAndHostTimer(cl_int *error = nullptr) + { + std::pair retVal; + cl_int err = + clGetDeviceAndHostTimer(this->get(), &(retVal.first), &(retVal.second)); + detail::errHandler( + err, + __GET_DEVICE_AND_HOST_TIMER_ERR); + if (error) { + *error = err; + } + return retVal; + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /** * CL 1.2 version */ @@ -3612,7 +3756,7 @@ return cl::pointer>(tmp, detail::Deleter{alloc, copies}); } - catch (std::bad_alloc b) + catch (std::bad_alloc& b) { std::allocator_traits::deallocate(alloc, tmp, copies); throw; @@ -5803,17 +5947,26 @@ return param; } -#if CL_HPP_TARGET_OPENCL_VERSION >= 200 -#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) +#if (CL_HPP_TARGET_OPENCL_VERSION >= 200 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)) || CL_HPP_TARGET_OPENCL_VERSION >= 210 cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const { +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + return detail::errHandler( + clGetKernelSubGroupInfo(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr), + __GET_KERNEL_SUB_GROUP_INFO_ERR); + +#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR; static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR); return detail::errHandler( pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr), - __GET_KERNEL_ARG_INFO_ERR); + __GET_KERNEL_SUB_GROUP_INFO_ERR); + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 } template @@ -5826,7 +5979,6 @@ } return param; } -#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 200 @@ -5981,6 +6133,22 @@ pointerList.data())); } #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /** + * Make a deep copy of the kernel object including its arguments. + * @return A new kernel object with internal state entirely separate from that + * of the original but with any arguments set on the original intact. + */ + Kernel clone() + { + cl_int error; + Kernel retValue(clCloneKernel(this->get(), &error)); + + detail::errHandler(error, __CLONE_KERNEL_ERR); + return retValue; + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 }; /*! \class Program @@ -6143,6 +6311,116 @@ } } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 || (CL_HPP_TARGET_OPENCL_VERSION==200 && defined(CL_HPP_USE_IL_KHR)) + /** + * Program constructor to allow construction of program from SPIR-V or another IL. + * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. + */ + Program( + const vector& IL, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + object_ = ::clCreateProgramWithIL( + context(), static_cast(IL.data()), IL.size(), &error); + +#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; + static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); + + return detail::errHandler( + pfn_clCreateProgramWithILKHR( + context(), static_cast(IL.data()), IL.size(), &error); + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, +#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + "-cl-std=CL2.0", +#else + "", +#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + NULL, + NULL); + + detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + + if (err != NULL) { + *err = error; + } + } + + /** + * Program constructor to allow construction of program from SPIR-V or another IL + * for a specific context. + * Valid for either OpenCL >= 2.1 or when CL_HPP_USE_IL_KHR is defined. + */ + Program( + const Context& context, + const vector& IL, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + object_ = ::clCreateProgramWithIL( + context(), static_cast(IL.data()), IL.size(), &error); + +#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR; + static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); + + return detail::errHandler( + pfn_clCreateProgramWithILKHR( + context(), static_cast(IL.data()), IL.size(), &error); + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + + detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR); + + if (error == CL_SUCCESS && build) { + error = ::clBuildProgram( + object_, + 0, + NULL, +#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + "-cl-std=CL2.0", +#else + "", +#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + NULL, + NULL); + + detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + + if (err != NULL) { + *err = error; + } + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /** * Construct a program object from a list of devices and a per-device list of binaries. * \param context A valid OpenCL context in which to construct the program. @@ -6480,6 +6758,62 @@ } return CL_SUCCESS; } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 220 + /*! \brief Registers a callback function to be called when destructors for + * program scope global variables are complete and before the + * program is released. + * + * Wraps clSetProgramReleaseCallback(). + * + * Each call to this function registers the specified user callback function + * on a callback stack associated with program. The registered user callback + * functions are called in the reverse order in which they were registered. + */ + cl_int setReleaseCallback( + void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetProgramReleaseCallback( + object_, + pfn_notify, + user_data), + __SET_PROGRAM_RELEASE_CALLBACK_ERR); + } + + /*! \brief Sets a SPIR-V specialization constant. + * + * Wraps clSetProgramSpecializationConstant(). + */ + template + typename std::enable_if::value, cl_int>::type + setSpecializationConstant(cl_uint index, const T &value) + { + return detail::errHandler( + ::clSetProgramSpecializationConstant( + object_, + index, + sizeof(value), + &value), + __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); + } + + /*! \brief Sets a SPIR-V specialization constant. + * + * Wraps clSetProgramSpecializationConstant(). + */ + cl_int setSpecializationConstant(cl_uint index, size_type size, const void* value) + { + return detail::errHandler( + ::clSetProgramSpecializationConstant( + object_, + index, + size, + value), + __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 }; #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -6601,6 +6935,22 @@ return binariesVectors; } +#if CL_HPP_TARGET_OPENCL_VERSION >= 220 +// Template specialization for clSetProgramSpecializationConstant +template <> +inline cl_int cl::Program::setSpecializationConstant(cl_uint index, const bool &value) +{ + cl_uchar ucValue = value ? CL_UCHAR_MAX : 0; + return detail::errHandler( + ::clSetProgramSpecializationConstant( + object_, + index, + sizeof(ucValue), + &ucValue), + __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR); +} +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 + inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) { cl_int error; @@ -7958,8 +8308,7 @@ for( int i = 0; i < (int)memObjects.size(); ++i ) { localMemObjects[i] = memObjects[i](); } - - + cl_int err = detail::errHandler( ::clEnqueueMigrateMemObjects( object_, @@ -7978,6 +8327,128 @@ } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /** + * Enqueues a command that will allow the host associate ranges within a set of + * SVM allocations with a device. + * @param sizes - The length from each pointer to migrate. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector &svmRawPointers, + const cl::vector &sizes, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem( + object_, + svmRawPointers.size(), static_cast(svmRawPointers.data()), + sizes.data(), // array of sizes not passed + flags, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MIGRATE_SVM_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command that will allow the host associate a set of SVM allocations with + * a device. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector &svmRawPointers, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + return enqueueMigrateSVM(svmRawPointers, cl::vector(svmRawPointers.size()), flags, events, event); + } + + + /** + * Enqueues a command that will allow the host associate ranges within a set of + * SVM allocations with a device. + * @param sizes - The length from each pointer to migrate. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector> &svmPointers, + const cl::vector &sizes, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + cl::vector svmRawPointers; + svmRawPointers.reserve(svmPointers.size()); + for (auto p : svmPointers) { + svmRawPointers.push_back(static_cast(p.get())); + } + + return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); + } + + + /** + * Enqueues a command that will allow the host associate a set of SVM allocations with + * a device. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector> &svmPointers, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + return enqueueMigrateSVM(svmPointers, cl::vector(svmPointers.size()), flags, events, event); + } + + /** + * Enqueues a command that will allow the host associate ranges within a set of + * SVM allocations with a device. + * @param sizes - The length from the beginning of each container to migrate. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector> &svmContainers, + const cl::vector &sizes, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + cl::vector svmRawPointers; + svmRawPointers.reserve(svmContainers.size()); + for (auto p : svmContainers) { + svmRawPointers.push_back(static_cast(p.data())); + } + + return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); + } + + /** + * Enqueues a command that will allow the host associate a set of SVM allocations with + * a device. + */ + template + cl_int enqueueMigrateSVM( + const cl::vector> &svmContainers, + cl_mem_migration_flags flags = 0, + const vector* events = NULL, + Event* event = NULL) const + { + return enqueueMigrateSVM(svmContainers, cl::vector(svmContainers.size()), flags, events, event); + } + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 + cl_int enqueueNDRangeKernel( const Kernel& kernel, const NDRange& offset, @@ -8407,11 +8878,11 @@ } /*! - * Create a new default device command queue for the default device, - * in the default context and of the default size. - * If there is already a default queue for the specified device this - * function will return the pre-existing queue. - */ + * Create a new default device command queue for the default device, + * in the default context and of the default size. + * If there is already a default queue for the specified device this + * function will return the pre-existing queue. + */ static DeviceCommandQueue makeDefault( cl_int *err = nullptr) { @@ -8437,11 +8908,11 @@ } /*! - * Create a new default device command queue for the specified device - * and of the default size. - * If there is already a default queue for the specified device this - * function will return the pre-existing queue. - */ + * Create a new default device command queue for the specified device + * and of the default size. + * If there is already a default queue for the specified device this + * function will return the pre-existing queue. + */ static DeviceCommandQueue makeDefault( const Context &context, const Device &device, cl_int *err = nullptr) { @@ -8492,6 +8963,37 @@ return deviceQueue; } + + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 210 + /*! + * Modify the default device command queue to be used for subsequent kernels. + * This can update the default command queue for a device repeatedly to account + * for kernels that rely on the default. + * @return updated default device command queue. + */ + static DeviceCommandQueue updateDefault(const Context &context, const Device &device, const DeviceCommandQueue &default_queue, cl_int *err = nullptr) + { + cl_int error; + error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get()); + + detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + return default_queue; + } + + /*! + * Return the current default command queue for the specified command queue + */ + static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = NULL) + { + return queue.getInfo(err); + } + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 }; // DeviceCommandQueue namespace detail @@ -9604,73 +10106,101 @@ #undef CL_HPP_ERR_STR_ #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) -#undef __GET_DEVICE_INFO_ERR -#undef __GET_PLATFORM_INFO_ERR -#undef __GET_DEVICE_IDS_ERR -#undef __GET_CONTEXT_INFO_ERR -#undef __GET_EVENT_INFO_ERR -#undef __GET_EVENT_PROFILE_INFO_ERR -#undef __GET_MEM_OBJECT_INFO_ERR -#undef __GET_IMAGE_INFO_ERR -#undef __GET_SAMPLER_INFO_ERR -#undef __GET_KERNEL_INFO_ERR -#undef __GET_KERNEL_ARG_INFO_ERR -#undef __GET_KERNEL_WORK_GROUP_INFO_ERR -#undef __GET_PROGRAM_INFO_ERR -#undef __GET_PROGRAM_BUILD_INFO_ERR -#undef __GET_COMMAND_QUEUE_INFO_ERR - -#undef __CREATE_CONTEXT_ERR -#undef __CREATE_CONTEXT_FROM_TYPE_ERR -#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR - -#undef __CREATE_BUFFER_ERR -#undef __CREATE_SUBBUFFER_ERR -#undef __CREATE_IMAGE2D_ERR -#undef __CREATE_IMAGE3D_ERR -#undef __CREATE_SAMPLER_ERR -#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR - -#undef __CREATE_USER_EVENT_ERR -#undef __SET_USER_EVENT_STATUS_ERR -#undef __SET_EVENT_CALLBACK_ERR -#undef __SET_PRINTF_CALLBACK_ERR - -#undef __WAIT_FOR_EVENTS_ERR - -#undef __CREATE_KERNEL_ERR -#undef __SET_KERNEL_ARGS_ERR -#undef __CREATE_PROGRAM_WITH_SOURCE_ERR -#undef __CREATE_PROGRAM_WITH_BINARY_ERR -#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR -#undef __BUILD_PROGRAM_ERR -#undef __CREATE_KERNELS_IN_PROGRAM_ERR - -#undef __CREATE_COMMAND_QUEUE_ERR -#undef __SET_COMMAND_QUEUE_PROPERTY_ERR -#undef __ENQUEUE_READ_BUFFER_ERR -#undef __ENQUEUE_WRITE_BUFFER_ERR -#undef __ENQUEUE_READ_BUFFER_RECT_ERR -#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR -#undef __ENQEUE_COPY_BUFFER_ERR -#undef __ENQEUE_COPY_BUFFER_RECT_ERR -#undef __ENQUEUE_READ_IMAGE_ERR -#undef __ENQUEUE_WRITE_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR -#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR -#undef __ENQUEUE_MAP_BUFFER_ERR -#undef __ENQUEUE_MAP_IMAGE_ERR -#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR -#undef __ENQUEUE_NDRANGE_KERNEL_ERR -#undef __ENQUEUE_TASK_ERR -#undef __ENQUEUE_NATIVE_KERNEL - -#undef __UNLOAD_COMPILER_ERR -#undef __CREATE_SUB_DEVICES_ERR - -#undef __CREATE_PIPE_ERR -#undef __GET_PIPE_INFO_ERR +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_PLATFORM_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_SUB_GROUP_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR +#undef __CREATE_BUFFER_ERR +#undef __COPY_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_GL_BUFFER_ERR +#undef __CREATE_GL_RENDER_BUFFER_ERR +#undef __GET_GL_OBJECT_INFO_ERR +#undef __CREATE_IMAGE_ERR +#undef __CREATE_GL_TEXTURE_ERR +#undef __IMAGE_DIMENSION_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __WAIT_FOR_EVENTS_ERR +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_IL_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_IL_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __COMPILE_PROGRAM_ERR +#undef __LINK_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR +#undef __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR +#undef __CREATE_SAMPLER_WITH_PROPERTIES_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_FILL_BUFFER_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_FILL_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_NATIVE_KERNEL +#undef __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR +#undef __ENQUEUE_MIGRATE_SVM_ERR +#undef __ENQUEUE_ACQUIRE_GL_ERR +#undef __ENQUEUE_RELEASE_GL_ERR +#undef __CREATE_PIPE_ERR +#undef __GET_PIPE_INFO_ERR +#undef __RETAIN_ERR +#undef __RELEASE_ERR +#undef __FLUSH_ERR +#undef __FINISH_ERR +#undef __VECTOR_CAPACITY_ERR +#undef __CREATE_SUB_DEVICES_ERR +#undef __CREATE_SUB_DEVICES_ERR +#undef __ENQUEUE_MARKER_ERR +#undef __ENQUEUE_WAIT_FOR_EVENTS_ERR +#undef __ENQUEUE_BARRIER_ERR +#undef __UNLOAD_COMPILER_ERR +#undef __CREATE_GL_TEXTURE_2D_ERR +#undef __CREATE_GL_TEXTURE_3D_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __ENQUEUE_TASK_ERR +#undef __CREATE_SAMPLER_ERR +#undef __ENQUEUE_MARKER_WAIT_LIST_ERR +#undef __ENQUEUE_BARRIER_WAIT_LIST_ERR +#undef __CLONE_KERNEL_ERR +#undef __GET_HOST_TIMER_ERR +#undef __GET_DEVICE_AND_HOST_TIMER_ERR #endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS diff -Nru mesa-19.2.8/include/CL/cl_ext.h mesa-20.0.8/include/CL/cl_ext.h --- mesa-19.2.8/include/CL/cl_ext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/CL/cl_ext.h 2020-06-12 01:21:16.000000000 +0000 @@ -147,21 +147,29 @@ size_t length, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; -/* Extension: cl_khr_image2D_buffer +/* Extension: cl_khr_image2d_from_buffer * - * This extension allows a 2D image to be created from a cl_mem buffer without a copy. - * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. - * Both the sampler and sampler-less read_image built-in functions are supported for 2D images - * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported - * for 2D images created from a buffer. + * This extension allows a 2D image to be created from a cl_mem buffer without + * a copy. The type associated with a 2D image created from a buffer in an + * OpenCL program is image2d_t. Both the sampler and sampler-less read_image + * built-in functions are supported for 2D images and 2D images created from + * a buffer. Similarly, the write_image built-ins are also supported for 2D + * images created from a buffer. * - * When the 2D image from buffer is created, the client must specify the width, - * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * When the 2D image from buffer is created, the client must specify the + * width, height, image format (i.e. channel order and channel data type) + * and optionally the row pitch. * - * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. - * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. + * The pitch specified must be a multiple of + * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. + * The base address of the buffer must be aligned to + * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. */ +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B + + /************************************** * cl_khr_initialize_memory extension * **************************************/ @@ -570,6 +578,49 @@ #define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 +/********************************* +* cl_khr_extended_versioning +*********************************/ + +#define CL_VERSION_MAJOR_BITS_KHR (10) +#define CL_VERSION_MINOR_BITS_KHR (10) +#define CL_VERSION_PATCH_BITS_KHR (12) + +#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) +#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) +#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) + +#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) +#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) +#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) + +#define CL_MAKE_VERSION_KHR(major, minor, patch) \ + ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ + (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ + ((patch) & CL_VERSION_PATCH_MASK_KHR)) + +typedef cl_uint cl_version_khr; + +#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 + +typedef struct _cl_name_version_khr +{ + cl_version_khr version; + char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; +} cl_name_version_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 +#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 + +/* cl_device_info */ +#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E +#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F +#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 +#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 +#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 + + /********************************** * cl_arm_import_memory extension * **********************************/ @@ -589,6 +640,12 @@ /* Protected DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ #define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 +/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 + +/* Import memory size value to indicate a size for the whole buffer */ +#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX + /* This extension adds a new function that allows for direct memory import into * OpenCL via the clImportMemoryARM function. * @@ -734,6 +791,18 @@ #endif /* CL_VERSION_1_2 */ +/********************************* +* cl_arm_job_slot_selection +*********************************/ + +#define cl_arm_job_slot_selection 1 + +/* cl_device_info */ +#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 + +/* cl_command_queue_properties */ +#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/include/CL/cl.h mesa-20.0.8/include/CL/cl.h --- mesa-19.2.8/include/CL/cl.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/CL/cl.h 2020-06-12 01:21:16.000000000 +0000 @@ -137,19 +137,23 @@ size_t image_slice_pitch; cl_uint num_mip_levels; cl_uint num_samples; +#ifdef CL_VERSION_2_0 #ifdef __GNUC__ __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ #endif #ifdef _MSC_VER -#pragma warning( push ) +#pragma warning( push ) #pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */ #endif union { +#endif cl_mem buffer; +#ifdef CL_VERSION_2_0 cl_mem mem_object; }; #ifdef _MSC_VER -#pragma warning( pop ) +#pragma warning( pop ) +#endif #endif } cl_image_desc; @@ -356,10 +360,10 @@ #define CL_DEVICE_REFERENCE_COUNT 0x1047 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 -#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A -#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B #endif #ifdef CL_VERSION_2_0 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B #define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C #define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D #define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E diff -Nru mesa-19.2.8/include/CL/cl.hpp mesa-20.0.8/include/CL/cl.hpp --- mesa-19.2.8/include/CL/cl.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/CL/cl.hpp 2020-06-12 01:21:16.000000000 +0000 @@ -2610,7 +2610,7 @@ error = platforms[i].getDevices(type, &devices); #if defined(__CL_ENABLE_EXCEPTIONS) - } catch (Error) {} + } catch (Error &) {} // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type // We do error checking next anyway, and can throw there if needed #endif diff -Nru mesa-19.2.8/include/CL/cl_icd.h mesa-20.0.8/include/CL/cl_icd.h --- mesa-19.2.8/include/CL/cl_icd.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/include/CL/cl_icd.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1269 @@ +/******************************************************************************* + * Copyright (c) 2019 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef OPENCL_CL_ICD_H +#define OPENCL_CL_ICD_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This file contains pointer type definitions for each of the CL API calls as + * well as a type definition for the dispatch table used by the Khronos ICD + * loader (see cl_khr_icd extension specification for background). + */ + +/* API function pointer definitions */ + +// Platform APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)( + cl_uint num_entries, cl_platform_id *platforms, + cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)( + cl_platform_id platform, cl_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Device APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)( + cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)( + cl_device_id device, cl_device_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)( + cl_device_id in_device, + const cl_device_partition_property *partition_properties, + cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateSubDevices; +typedef void *cl_api_clRetainDevice; +typedef void *cl_api_clReleaseDevice; + +#endif + +// Context APIs +typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)( + const cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)( + const cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)( + cl_context context) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)( + cl_context context) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)( + cl_context context, cl_context_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Command Queue APIs +typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)( + cl_context context, cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY +cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)( + cl_context /* context */, cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreateCommandQueueWithProperties; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)( + cl_command_queue command_queue, cl_command_queue_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Memory Object APIs +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)( + cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + const cl_image_desc *image_desc, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateImage; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)( + cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)( + cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)( + cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, + cl_uint num_entries, cl_image_format *image_formats, + cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)( + cl_mem image, cl_image_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)( + cl_context /* context */, cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)( + cl_mem /* pipe */, cl_pipe_info /* param_name */, + size_t /* param_value_size */, void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)( + cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */, + unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)( + cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreatePipe; +typedef void *cl_api_clGetPipeInfo; +typedef void *cl_api_clSVMAlloc; +typedef void *cl_api_clSVMFree; + +#endif + +// Sampler APIs +typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)( + cl_context context, cl_bool normalized_coords, + cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)( + cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)( + cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)( + cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY +cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)( + cl_context /* context */, + const cl_sampler_properties * /* sampler_properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clCreateSamplerWithProperties; + +#endif + +// Program Object APIs +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)( + cl_context context, cl_uint count, const char **strings, + const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY +cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCreateProgramWithBuiltInKernels; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)( + cl_program program) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)( + cl_program program) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_headers, + const cl_program *input_headers, const char **header_include_names, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const char *options, cl_uint num_input_programs, + const cl_program *input_programs, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clCompileProgram; +typedef void *cl_api_clLinkProgram; + +#endif + +#ifdef CL_VERSION_2_2 + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)( + cl_program program, cl_uint spec_id, size_t spec_size, + const void *spec_value) CL_API_SUFFIX__VERSION_2_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)( + cl_program program, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_2_2; + +#else + +typedef void *cl_api_clSetProgramSpecializationConstant; +typedef void *cl_api_clSetProgramReleaseCallback; + +#endif + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)( + cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clUnloadPlatformCompiler; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)( + cl_program program, cl_program_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)( + cl_program program, cl_device_id device, cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Kernel Object APIs +typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)( + cl_program program, const char *kernel_name, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)( + cl_program program, cl_uint num_kernels, cl_kernel *kernels, + cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)( + cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)( + cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)( + cl_kernel kernel, cl_uint arg_index, size_t arg_size, + const void *arg_value) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)( + cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)( + cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clGetKernelArgInfo; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)( + cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)( + cl_kernel /* kernel */, cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)( + cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)( + cl_kernel /* in_kernel */, cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/, + const void * /*input_value*/, size_t /*param_value_size*/, + void * /*param_value*/, + size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clSetKernelArgSVMPointer; +typedef void *cl_api_clSetKernelExecInfo; +typedef void *cl_api_clGetKernelSubGroupInfoKHR; + +#endif + +// Event Object APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)( + cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)( + cl_event event, cl_event_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event) + CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event) + CL_API_SUFFIX__VERSION_1_0; + +// Profiling APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)( + cl_event event, cl_profiling_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +// Flush and Finish APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)( + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +// Enqueued Commands APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueReadBufferRect; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, + size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, + const size_t *buffer_origin, const size_t *host_origin, + const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueWriteBufferRect; + +#endif + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)( + cl_command_queue command_queue, cl_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueFillBuffer; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clEnqueueCopyBufferRect; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, + const size_t *origin, const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, + const size_t *origin, const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)( + cl_command_queue command_queue, cl_mem image, const void *fill_color, + const size_t origin[3], const size_t region[3], + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueFillImage; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)( + cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, + const size_t *src_origin, const size_t *region, size_t dst_offset, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)( + cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, + size_t src_offset, const size_t *dst_origin, const size_t *region, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, + cl_map_flags map_flags, size_t offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, + cl_map_flags map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)( + cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)( + cl_command_queue command_queue, cl_uint num_mem_objects, + const cl_mem *mem_objects, cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueMigrateMemObjects; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)( + cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)( + cl_command_queue command_queue, cl_kernel kernel, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)( + cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), + void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, + const void **args_mem_loc, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)( + cl_command_queue command_queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)( + cl_command_queue command_queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY void *( + CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)( + cl_platform_id platform, + const char *function_name)CL_API_SUFFIX__VERSION_1_2; + +#else + +typedef void *cl_api_clEnqueueMarkerWithWaitList; +typedef void *cl_api_clEnqueueBarrierWithWaitList; +typedef void *cl_api_clGetExtensionFunctionAddressForPlatform; + +#endif + +// Shared Virtual Memory APIs + +#ifdef CL_VERSION_2_0 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)( + cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, + void ** /* svm_pointers */, + void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void ** /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)( + cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, + void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)( + cl_command_queue /* command_queue */, void * /* svm_ptr */, + const void * /* pattern */, size_t /* pattern_size */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)( + cl_command_queue /* command_queue */, cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)( + cl_command_queue /* command_queue */, void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +#else + +typedef void *cl_api_clEnqueueSVMFree; +typedef void *cl_api_clEnqueueSVMMemcpy; +typedef void *cl_api_clEnqueueSVMMemFill; +typedef void *cl_api_clEnqueueSVMMap; +typedef void *cl_api_clEnqueueSVMUnmap; + +#endif + +// Deprecated APIs +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)( + cl_command_queue command_queue, cl_command_queue_properties properties, + cl_bool enable, cl_command_queue_properties *old_properties) + CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + size_t image_width, size_t image_height, size_t image_row_pitch, + void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + size_t image_width, size_t image_height, size_t image_depth, + size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void) + CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)( + cl_command_queue command_queue, + cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)( + cl_command_queue command_queue, cl_uint num_events, + const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)( + cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)( + const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +// GL and other APIs +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)( + cl_context context, cl_mem_flags flags, cl_GLuint bufobj, + int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)( + cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, + cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)( + cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)( + cl_mem memobj, cl_gl_object_type *gl_object_type, + cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)( + cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_gl_sharing */ +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)( + const cl_context_properties *properties, cl_gl_context_info param_name, + size_t param_value_size, void *param_value, size_t *param_value_size_ret); + +/* cl_khr_gl_event */ +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)( + cl_context context, cl_GLsync sync, cl_int *errcode_ret); + +#if defined(_WIN32) + +/* cl_khr_d3d10_sharing */ + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)( + cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)( + cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR( + cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags, + ID3D10Buffer *resource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR( + cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR( + cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_d3d11_sharing */ +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)( + cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D11Buffer *resource, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)( + cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)( + cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, + UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +/* cl_khr_dx9_media_sharing */ +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)( + cl_platform_id platform, cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)( + cl_context context, cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, + cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +/* cl_khr_d3d11_sharing */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR( + cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, + void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags, + ID3D11Buffer *resource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR( + cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR( + cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, + UINT subresource, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_dx9_media_sharing */ +extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR( + cl_platform_id platform, cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices); + +extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR( + cl_context context, cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, + cl_uint plane, cl_int *errcode_ret); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +#else + +/* cl_khr_d3d10_sharing */ +typedef void *cl_api_clGetDeviceIDsFromD3D10KHR; +typedef void *cl_api_clCreateFromD3D10BufferKHR; +typedef void *cl_api_clCreateFromD3D10Texture2DKHR; +typedef void *cl_api_clCreateFromD3D10Texture3DKHR; +typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR; +typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR; + +/* cl_khr_d3d11_sharing */ +typedef void *cl_api_clGetDeviceIDsFromD3D11KHR; +typedef void *cl_api_clCreateFromD3D11BufferKHR; +typedef void *cl_api_clCreateFromD3D11Texture2DKHR; +typedef void *cl_api_clCreateFromD3D11Texture3DKHR; +typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR; +typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR; + +/* cl_khr_dx9_media_sharing */ +typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR; +typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR; +typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR; +typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR; + +#endif + +/* OpenCL 1.1 */ + +#ifdef CL_VERSION_1_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)( + cl_event /* event */, cl_int /* command_exec_callback_type */, + void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)( + cl_mem /* buffer */, cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY +cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)( + cl_mem /* memobj */, + void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, + void * /*user_data*/), + void * /*user_data */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)( + cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)( + cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +#else + +typedef void *cl_api_clSetEventCallback; +typedef void *cl_api_clCreateSubBuffer; +typedef void *cl_api_clSetMemObjectDestructorCallback; +typedef void *cl_api_clCreateUserEvent; +typedef void *cl_api_clSetUserEventStatus; + +#endif + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)( + cl_device_id in_device, + const cl_device_partition_property_ext *partition_properties, + cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)( + cl_device_id device) CL_API_SUFFIX__VERSION_1_0; + +/* cl_khr_egl_image */ +typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)( + cl_context context, CLeglDisplayKHR display, CLeglImageKHR image, + cl_mem_flags flags, const cl_egl_image_properties_khr *properties, + cl_int *errcode_ret); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +/* cl_khr_egl_event */ +typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)( + cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, + cl_int *errcode_ret); + +#ifdef CL_VERSION_2_1 + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)( + cl_context context, cl_device_id device, + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)( + cl_context context, const void *il, size_t length, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)( + cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name, + size_t input_value_size, const void *input_value, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)( + cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)( + cl_command_queue command_queue, cl_uint num_svm_pointers, + const void **svm_pointers, const size_t *sizes, + cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)( + cl_device_id device, cl_ulong *device_timestamp, + cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)( + cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +#else + +typedef void *cl_api_clSetDefaultDeviceCommandQueue; +typedef void *cl_api_clCreateProgramWithIL; +typedef void *cl_api_clGetKernelSubGroupInfo; +typedef void *cl_api_clCloneKernel; +typedef void *cl_api_clEnqueueSVMMigrateMem; +typedef void *cl_api_clGetDeviceAndHostTimer; +typedef void *cl_api_clGetHostTimer; + +#endif + +/* Vendor dispatch table struture */ + +typedef struct _cl_icd_dispatch { + /* OpenCL 1.0 */ + cl_api_clGetPlatformIDs clGetPlatformIDs; + cl_api_clGetPlatformInfo clGetPlatformInfo; + cl_api_clGetDeviceIDs clGetDeviceIDs; + cl_api_clGetDeviceInfo clGetDeviceInfo; + cl_api_clCreateContext clCreateContext; + cl_api_clCreateContextFromType clCreateContextFromType; + cl_api_clRetainContext clRetainContext; + cl_api_clReleaseContext clReleaseContext; + cl_api_clGetContextInfo clGetContextInfo; + cl_api_clCreateCommandQueue clCreateCommandQueue; + cl_api_clRetainCommandQueue clRetainCommandQueue; + cl_api_clReleaseCommandQueue clReleaseCommandQueue; + cl_api_clGetCommandQueueInfo clGetCommandQueueInfo; + cl_api_clSetCommandQueueProperty clSetCommandQueueProperty; + cl_api_clCreateBuffer clCreateBuffer; + cl_api_clCreateImage2D clCreateImage2D; + cl_api_clCreateImage3D clCreateImage3D; + cl_api_clRetainMemObject clRetainMemObject; + cl_api_clReleaseMemObject clReleaseMemObject; + cl_api_clGetSupportedImageFormats clGetSupportedImageFormats; + cl_api_clGetMemObjectInfo clGetMemObjectInfo; + cl_api_clGetImageInfo clGetImageInfo; + cl_api_clCreateSampler clCreateSampler; + cl_api_clRetainSampler clRetainSampler; + cl_api_clReleaseSampler clReleaseSampler; + cl_api_clGetSamplerInfo clGetSamplerInfo; + cl_api_clCreateProgramWithSource clCreateProgramWithSource; + cl_api_clCreateProgramWithBinary clCreateProgramWithBinary; + cl_api_clRetainProgram clRetainProgram; + cl_api_clReleaseProgram clReleaseProgram; + cl_api_clBuildProgram clBuildProgram; + cl_api_clUnloadCompiler clUnloadCompiler; + cl_api_clGetProgramInfo clGetProgramInfo; + cl_api_clGetProgramBuildInfo clGetProgramBuildInfo; + cl_api_clCreateKernel clCreateKernel; + cl_api_clCreateKernelsInProgram clCreateKernelsInProgram; + cl_api_clRetainKernel clRetainKernel; + cl_api_clReleaseKernel clReleaseKernel; + cl_api_clSetKernelArg clSetKernelArg; + cl_api_clGetKernelInfo clGetKernelInfo; + cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; + cl_api_clWaitForEvents clWaitForEvents; + cl_api_clGetEventInfo clGetEventInfo; + cl_api_clRetainEvent clRetainEvent; + cl_api_clReleaseEvent clReleaseEvent; + cl_api_clGetEventProfilingInfo clGetEventProfilingInfo; + cl_api_clFlush clFlush; + cl_api_clFinish clFinish; + cl_api_clEnqueueReadBuffer clEnqueueReadBuffer; + cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer; + cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer; + cl_api_clEnqueueReadImage clEnqueueReadImage; + cl_api_clEnqueueWriteImage clEnqueueWriteImage; + cl_api_clEnqueueCopyImage clEnqueueCopyImage; + cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; + cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; + cl_api_clEnqueueMapBuffer clEnqueueMapBuffer; + cl_api_clEnqueueMapImage clEnqueueMapImage; + cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; + cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; + cl_api_clEnqueueTask clEnqueueTask; + cl_api_clEnqueueNativeKernel clEnqueueNativeKernel; + cl_api_clEnqueueMarker clEnqueueMarker; + cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents; + cl_api_clEnqueueBarrier clEnqueueBarrier; + cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; + cl_api_clCreateFromGLBuffer clCreateFromGLBuffer; + cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D; + cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D; + cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer; + cl_api_clGetGLObjectInfo clGetGLObjectInfo; + cl_api_clGetGLTextureInfo clGetGLTextureInfo; + cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; + cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; + cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR; + + /* cl_khr_d3d10_sharing */ + cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR; + cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR; + cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR; + cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR; + cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR; + cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR; + + /* OpenCL 1.1 */ + cl_api_clSetEventCallback clSetEventCallback; + cl_api_clCreateSubBuffer clCreateSubBuffer; + cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; + cl_api_clCreateUserEvent clCreateUserEvent; + cl_api_clSetUserEventStatus clSetUserEventStatus; + cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect; + cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; + cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; + + /* cl_ext_device_fission */ + cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT; + cl_api_clRetainDeviceEXT clRetainDeviceEXT; + cl_api_clReleaseDeviceEXT clReleaseDeviceEXT; + + /* cl_khr_gl_event */ + cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR; + + /* OpenCL 1.2 */ + cl_api_clCreateSubDevices clCreateSubDevices; + cl_api_clRetainDevice clRetainDevice; + cl_api_clReleaseDevice clReleaseDevice; + cl_api_clCreateImage clCreateImage; + cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; + cl_api_clCompileProgram clCompileProgram; + cl_api_clLinkProgram clLinkProgram; + cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler; + cl_api_clGetKernelArgInfo clGetKernelArgInfo; + cl_api_clEnqueueFillBuffer clEnqueueFillBuffer; + cl_api_clEnqueueFillImage clEnqueueFillImage; + cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; + cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; + cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; + cl_api_clGetExtensionFunctionAddressForPlatform + clGetExtensionFunctionAddressForPlatform; + cl_api_clCreateFromGLTexture clCreateFromGLTexture; + + /* cl_khr_d3d11_sharing */ + cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR; + cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR; + cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR; + cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR; + cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR; + cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR; + cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR; + + /* cl_khr_dx9_media_sharing */ + cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR + clGetDeviceIDsFromDX9MediaAdapterKHR; + cl_api_clEnqueueAcquireDX9MediaSurfacesKHR + clEnqueueAcquireDX9MediaSurfacesKHR; + cl_api_clEnqueueReleaseDX9MediaSurfacesKHR + clEnqueueReleaseDX9MediaSurfacesKHR; + + /* cl_khr_egl_image */ + cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; + cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; + cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; + + /* cl_khr_egl_event */ + cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; + + /* OpenCL 2.0 */ + cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; + cl_api_clCreatePipe clCreatePipe; + cl_api_clGetPipeInfo clGetPipeInfo; + cl_api_clSVMAlloc clSVMAlloc; + cl_api_clSVMFree clSVMFree; + cl_api_clEnqueueSVMFree clEnqueueSVMFree; + cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; + cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill; + cl_api_clEnqueueSVMMap clEnqueueSVMMap; + cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap; + cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties; + cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; + cl_api_clSetKernelExecInfo clSetKernelExecInfo; + + /* cl_khr_sub_groups */ + cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR; + + /* OpenCL 2.1 */ + cl_api_clCloneKernel clCloneKernel; + cl_api_clCreateProgramWithIL clCreateProgramWithIL; + cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem; + cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer; + cl_api_clGetHostTimer clGetHostTimer; + cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo; + cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue; + + /* OpenCL 2.2 */ + cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback; + cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant; +} cl_icd_dispatch; + +#ifdef __cplusplus +} +#endif + +#endif /* #ifndef OPENCL_CL_ICD_H */ diff -Nru mesa-19.2.8/include/CL/cl_platform.h mesa-20.0.8/include/CL/cl_platform.h --- mesa-19.2.8/include/CL/cl_platform.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/CL/cl_platform.h 2020-06-12 01:21:16.000000000 +0000 @@ -67,101 +67,56 @@ #define CL_API_SUFFIX__VERSION_2_2 #define CL_EXT_SUFFIX__VERSION_2_2 + #ifdef __GNUC__ - #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED - #endif + #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX_DEPRECATED #elif defined(_WIN32) - #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated) - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated) - #endif + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated) #else + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED +#endif + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED +#else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED +#endif +#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED +#else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED +#endif +#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED +#else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif +#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED +#else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED +#endif +#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED +#else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #if (defined (_WIN32) && defined(_MSC_VER)) @@ -271,16 +226,16 @@ /* scalar types */ typedef int8_t cl_char; typedef uint8_t cl_uchar; -typedef int16_t cl_short __attribute__((aligned(2))); -typedef uint16_t cl_ushort __attribute__((aligned(2))); -typedef int32_t cl_int __attribute__((aligned(4))); -typedef uint32_t cl_uint __attribute__((aligned(4))); -typedef int64_t cl_long __attribute__((aligned(8))); -typedef uint64_t cl_ulong __attribute__((aligned(8))); - -typedef uint16_t cl_half __attribute__((aligned(2))); -typedef float cl_float __attribute__((aligned(4))); -typedef double cl_double __attribute__((aligned(8))); +typedef int16_t cl_short; +typedef uint16_t cl_ushort; +typedef int32_t cl_int; +typedef uint32_t cl_uint; +typedef int64_t cl_long; +typedef uint64_t cl_ulong; + +typedef uint16_t cl_half; +typedef float cl_float; +typedef double cl_double; /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 diff -Nru mesa-19.2.8/include/drm-uapi/drm_fourcc.h mesa-20.0.8/include/drm-uapi/drm_fourcc.h --- mesa-19.2.8/include/drm-uapi/drm_fourcc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/drm_fourcc.h 2020-06-12 01:21:16.000000000 +0000 @@ -144,6 +144,17 @@ #define DRM_FORMAT_RGBA1010102 fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */ #define DRM_FORMAT_BGRA1010102 fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */ +/* + * Floating point 64bpp RGB + * IEEE 754-2008 binary16 half-precision float + * [15:0] sign:exponent:mantissa 1:5:10 + */ +#define DRM_FORMAT_XRGB16161616F fourcc_code('X', 'R', '4', 'H') /* [63:0] x:R:G:B 16:16:16:16 little endian */ +#define DRM_FORMAT_XBGR16161616F fourcc_code('X', 'B', '4', 'H') /* [63:0] x:B:G:R 16:16:16:16 little endian */ + +#define DRM_FORMAT_ARGB16161616F fourcc_code('A', 'R', '4', 'H') /* [63:0] A:R:G:B 16:16:16:16 little endian */ +#define DRM_FORMAT_ABGR16161616F fourcc_code('A', 'B', '4', 'H') /* [63:0] A:B:G:R 16:16:16:16 little endian */ + /* packed YCbCr */ #define DRM_FORMAT_YUYV fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */ #define DRM_FORMAT_YVYU fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */ @@ -151,7 +162,29 @@ #define DRM_FORMAT_VYUY fourcc_code('V', 'Y', 'U', 'Y') /* [31:0] Y1:Cb0:Y0:Cr0 8:8:8:8 little endian */ #define DRM_FORMAT_AYUV fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */ -#define DRM_FORMAT_XYUV8888 fourcc_code('X', 'Y', 'U', 'V') /* [31:0] X:Y:Cb:Cr 8:8:8:8 little endian */ +#define DRM_FORMAT_XYUV8888 fourcc_code('X', 'Y', 'U', 'V') /* [31:0] X:Y:Cb:Cr 8:8:8:8 little endian */ +#define DRM_FORMAT_VUY888 fourcc_code('V', 'U', '2', '4') /* [23:0] Cr:Cb:Y 8:8:8 little endian */ +#define DRM_FORMAT_VUY101010 fourcc_code('V', 'U', '3', '0') /* Y followed by U then V, 10:10:10. Non-linear modifier only */ + +/* + * packed Y2xx indicate for each component, xx valid data occupy msb + * 16-xx padding occupy lsb + */ +#define DRM_FORMAT_Y210 fourcc_code('Y', '2', '1', '0') /* [63:0] Cr0:0:Y1:0:Cb0:0:Y0:0 10:6:10:6:10:6:10:6 little endian per 2 Y pixels */ +#define DRM_FORMAT_Y212 fourcc_code('Y', '2', '1', '2') /* [63:0] Cr0:0:Y1:0:Cb0:0:Y0:0 12:4:12:4:12:4:12:4 little endian per 2 Y pixels */ +#define DRM_FORMAT_Y216 fourcc_code('Y', '2', '1', '6') /* [63:0] Cr0:Y1:Cb0:Y0 16:16:16:16 little endian per 2 Y pixels */ + +/* + * packed Y4xx indicate for each component, xx valid data occupy msb + * 16-xx padding occupy lsb except Y410 + */ +#define DRM_FORMAT_Y410 fourcc_code('Y', '4', '1', '0') /* [31:0] A:Cr:Y:Cb 2:10:10:10 little endian */ +#define DRM_FORMAT_Y412 fourcc_code('Y', '4', '1', '2') /* [63:0] A:0:Cr:0:Y:0:Cb:0 12:4:12:4:12:4:12:4 little endian */ +#define DRM_FORMAT_Y416 fourcc_code('Y', '4', '1', '6') /* [63:0] A:Cr:Y:Cb 16:16:16:16 little endian */ + +#define DRM_FORMAT_XVYU2101010 fourcc_code('X', 'V', '3', '0') /* [31:0] X:Cr:Y:Cb 2:10:10:10 little endian */ +#define DRM_FORMAT_XVYU12_16161616 fourcc_code('X', 'V', '3', '6') /* [63:0] X:0:Cr:0:Y:0:Cb:0 12:4:12:4:12:4:12:4 little endian */ +#define DRM_FORMAT_XVYU16161616 fourcc_code('X', 'V', '4', '8') /* [63:0] X:Cr:Y:Cb 16:16:16:16 little endian */ /* * packed YCbCr420 2x2 tiled formats @@ -168,6 +201,15 @@ #define DRM_FORMAT_X0L2 fourcc_code('X', '0', 'L', '2') /* + * 1-plane YUV 4:2:0 + * In these formats, the component ordering is specified (Y, followed by U + * then V), but the exact Linear layout is undefined. + * These formats can only be used with a non-Linear modifier. + */ +#define DRM_FORMAT_YUV420_8BIT fourcc_code('Y', 'U', '0', '8') +#define DRM_FORMAT_YUV420_10BIT fourcc_code('Y', 'U', '1', '0') + +/* * 2 plane RGB + A * index 0 = RGB plane, same format as the corresponding non _A8 format has * index 1 = A plane, [7:0] A @@ -200,6 +242,13 @@ * index 0 = Y plane, [15:0] Y:x [10:6] little endian * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [10:6:10:6] little endian */ +#define DRM_FORMAT_P210 fourcc_code('P', '2', '1', '0') /* 2x1 subsampled Cr:Cb plane, 10 bit per channel */ + +/* + * 2 plane YCbCr MSB aligned + * index 0 = Y plane, [15:0] Y:x [10:6] little endian + * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [10:6:10:6] little endian + */ #define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel */ /* @@ -599,7 +648,21 @@ * Further information on the use of AFBC modifiers can be found in * Documentation/gpu/afbc.rst */ -#define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode) fourcc_mod_code(ARM, __afbc_mode) + +/* + * The top 4 bits (out of the 56 bits alloted for specifying vendor specific + * modifiers) denote the category for modifiers. Currently we have only two + * categories of modifiers ie AFBC and MISC. We can have a maximum of sixteen + * different categories. + */ +#define DRM_FORMAT_MOD_ARM_CODE(__type, __val) \ + fourcc_mod_code(ARM, ((__u64)(__type) << 52) | ((__val) & 0x000fffffffffffffULL)) + +#define DRM_FORMAT_MOD_ARM_TYPE_AFBC 0x00 +#define DRM_FORMAT_MOD_ARM_TYPE_MISC 0x01 + +#define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode) \ + DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_AFBC, __afbc_mode) /* * AFBC superblock size @@ -694,6 +757,16 @@ #define AFBC_FORMAT_MOD_BCH (1ULL << 11) /* + * Arm 16x16 Block U-Interleaved modifier + * + * This is used by Arm Mali Utgard and Midgard GPUs. It divides the image + * into 16x16 pixel blocks. Blocks are stored linearly in order, but pixels + * in the block are reordered. + */ +#define DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED \ + DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_MISC, 1ULL) + +/* * Allwinner tiled modifier * * This tiling mode is implemented by the VPU found on all Allwinner platforms, diff -Nru mesa-19.2.8/include/drm-uapi/drm.h mesa-20.0.8/include/drm-uapi/drm.h --- mesa-19.2.8/include/drm-uapi/drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -44,6 +44,7 @@ #else /* One of the BSDs */ +#include #include #include typedef int8_t __s8; @@ -643,6 +644,7 @@ #define DRM_CAP_PAGE_FLIP_TARGET 0x11 #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 #define DRM_CAP_SYNCOBJ 0x13 +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 /** DRM_IOCTL_GET_CAP ioctl argument type */ struct drm_get_cap { @@ -729,8 +731,18 @@ __u32 pad; }; +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + #define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) #define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ struct drm_syncobj_wait { __u64 handles; /* absolute timeout */ @@ -741,12 +753,33 @@ __u32 pad; }; +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; +}; + + struct drm_syncobj_array { __u64 handles; __u32 count_handles; __u32 pad; }; +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 pad; +}; + + /* Query current scanout sequence number */ struct drm_crtc_get_sequence { __u32 crtc_id; /* requested crtc_id */ @@ -903,6 +936,11 @@ #define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) #define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + /** * Device specific ioctls should only be in their respective headers * The device specific ioctl range is from 0x40 to 0x9f. diff -Nru mesa-19.2.8/include/drm-uapi/drm_mode.h mesa-20.0.8/include/drm-uapi/drm_mode.h --- mesa-19.2.8/include/drm-uapi/drm_mode.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/drm_mode.h 2020-06-12 01:21:16.000000000 +0000 @@ -33,7 +33,15 @@ extern "C" { #endif -#define DRM_DISPLAY_INFO_LEN 32 +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definition to have a consistent + * and standardized interface with user. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to driver + */ + #define DRM_CONNECTOR_NAME_LEN 32 #define DRM_DISPLAY_MODE_LEN 32 #define DRM_PROP_NAME_LEN 32 @@ -353,6 +361,7 @@ #define DRM_MODE_CONNECTOR_DSI 16 #define DRM_MODE_CONNECTOR_DPI 17 #define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 struct drm_mode_get_connector { @@ -622,7 +631,8 @@ struct drm_color_lut { /* - * Data is U0.16 fixed point format. + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. */ __u16 red; __u16 green; @@ -630,6 +640,92 @@ __u16 reserved; }; +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X cordinate of color primary. + * @display_primaries.y: Y cordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X cordinate of whitepoint of color primary. + * @white_point.y: Y cordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. + */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + #define DRM_MODE_PAGE_FLIP_EVENT 0x01 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 @@ -803,6 +899,10 @@ }; /** + * struct drm_mode_create_blob - Create New block property + * @data: Pointer to data to copy. + * @length: Length of data to copy. + * @blob_id: new property ID. * Create a new 'blob' data property, copying length bytes from data pointer, * and returning new blob ID. */ @@ -816,6 +916,8 @@ }; /** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy * Destroy a user-created blob property. */ struct drm_mode_destroy_blob { @@ -823,6 +925,12 @@ }; /** + * struct drm_mode_create_lease - Create lease + * @object_ids: Pointer to array of object ids. + * @object_count: Number of object ids. + * @flags: flags for new FD. + * @lessee_id: unique identifier for lessee. + * @fd: file descriptor to new drm_master file. * Lease mode resources, creating another drm_master. */ struct drm_mode_create_lease { @@ -840,6 +948,10 @@ }; /** + * struct drm_mode_list_lessees - List lessees + * @count_lessees: Number of lessees. + * @pad: pad. + * @lessees_ptr: Pointer to lessess. * List lesses from a drm_master */ struct drm_mode_list_lessees { @@ -860,6 +972,10 @@ }; /** + * struct drm_mode_get_lease - Get Lease + * @count_objects: Number of leased objects. + * @pad: pad. + * @objects_ptr: Pointer to objects. * Get leased objects */ struct drm_mode_get_lease { @@ -880,6 +996,8 @@ }; /** + * struct drm_mode_revoke_lease - Revoke lease + * @lessee_id: Unique ID of lessee. * Revoke lease */ struct drm_mode_revoke_lease { diff -Nru mesa-19.2.8/include/drm-uapi/etnaviv_drm.h mesa-20.0.8/include/drm-uapi/etnaviv_drm.h --- mesa-19.2.8/include/drm-uapi/etnaviv_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/etnaviv_drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -73,6 +73,7 @@ #define ETNAVIV_PARAM_GPU_INSTRUCTION_COUNT 0x18 #define ETNAVIV_PARAM_GPU_NUM_CONSTANTS 0x19 #define ETNAVIV_PARAM_GPU_NUM_VARYINGS 0x1a +#define ETNAVIV_PARAM_SOFTPIN_START_ADDR 0x1b #define ETNA_MAX_PIPES 4 @@ -148,6 +149,11 @@ * then patching the cmdstream for this entry is skipped. This can * avoid kernel needing to map/access the cmdstream bo in the common * case. + * If the submit is a softpin submit (ETNA_SUBMIT_SOFTPIN) the 'presumed' + * field is interpreted as the fixed location to map the bo into the gpu + * virtual address space. If the kernel is unable to map the buffer at + * this location the submit will fail. This means userspace is responsible + * for the whole gpu virtual address management. */ #define ETNA_SUBMIT_BO_READ 0x0001 #define ETNA_SUBMIT_BO_WRITE 0x0002 @@ -177,9 +183,11 @@ #define ETNA_SUBMIT_NO_IMPLICIT 0x0001 #define ETNA_SUBMIT_FENCE_FD_IN 0x0002 #define ETNA_SUBMIT_FENCE_FD_OUT 0x0004 +#define ETNA_SUBMIT_SOFTPIN 0x0008 #define ETNA_SUBMIT_FLAGS (ETNA_SUBMIT_NO_IMPLICIT | \ ETNA_SUBMIT_FENCE_FD_IN | \ - ETNA_SUBMIT_FENCE_FD_OUT) + ETNA_SUBMIT_FENCE_FD_OUT| \ + ETNA_SUBMIT_SOFTPIN) #define ETNA_PIPE_3D 0x00 #define ETNA_PIPE_2D 0x01 #define ETNA_PIPE_VG 0x02 diff -Nru mesa-19.2.8/include/drm-uapi/i915_drm.h mesa-20.0.8/include/drm-uapi/i915_drm.h --- mesa-19.2.8/include/drm-uapi/i915_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/i915_drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -63,6 +63,28 @@ #define I915_RESET_UEVENT "RESET" /* + * i915_user_extension: Base class for defining a chain of extensions + * + * Many interfaces need to grow over time. In most cases we can simply + * extend the struct and have userspace pass in more data. Another option, + * as demonstrated by Vulkan's approach to providing extensions for forward + * and backward compatibility, is to use a list of optional structs to + * provide those extra details. + * + * The key advantage to using an extension chain is that it allows us to + * redefine the interface more easily than an ever growing struct of + * increasing complexity, and for large parts of that interface to be + * entirely optional. The downside is more pointer chasing; chasing across + * the boundary with pointers encapsulated inside u64. + */ +struct i915_user_extension { + __u64 next_extension; + __u32 name; + __u32 flags; /* All undefined bits must be zero. */ + __u32 rsvd[4]; /* Reserved for future use; must be zero. */ +}; + +/* * MOCS indexes used for GPU surfaces, defining the cacheability of the * surface data and the coherency for this data wrt. CPU vs. GPU accesses. */ @@ -99,9 +121,25 @@ I915_ENGINE_CLASS_VIDEO = 2, I915_ENGINE_CLASS_VIDEO_ENHANCE = 3, + /* should be kept compact */ + I915_ENGINE_CLASS_INVALID = -1 }; +/* + * There may be more than one engine fulfilling any role within the system. + * Each engine of a class is given a unique instance number and therefore + * any engine can be specified by its class:instance tuplet. APIs that allow + * access to any engine in the system will use struct i915_engine_class_instance + * for this identification. + */ +struct i915_engine_class_instance { + __u16 engine_class; /* see enum drm_i915_gem_engine_class */ + __u16 engine_instance; +#define I915_ENGINE_CLASS_INVALID_NONE -1 +#define I915_ENGINE_CLASS_INVALID_VIRTUAL -2 +}; + /** * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915 * @@ -319,6 +357,9 @@ #define DRM_I915_PERF_ADD_CONFIG 0x37 #define DRM_I915_PERF_REMOVE_CONFIG 0x38 #define DRM_I915_QUERY 0x39 +#define DRM_I915_GEM_VM_CREATE 0x3a +#define DRM_I915_GEM_VM_DESTROY 0x3b +/* Must be kept compact -- no holes */ #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) @@ -367,6 +408,7 @@ #define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey) #define DRM_IOCTL_I915_GEM_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait) #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create) +#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create_ext) #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy) #define DRM_IOCTL_I915_REG_READ DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read) #define DRM_IOCTL_I915_GET_RESET_STATS DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats) @@ -377,6 +419,8 @@ #define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config) #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64) #define DRM_IOCTL_I915_QUERY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query) +#define DRM_IOCTL_I915_GEM_VM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_control) +#define DRM_IOCTL_I915_GEM_VM_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, struct drm_i915_gem_vm_control) /* Allow drivers to submit batchbuffers directly to hardware, relying * on the security mechanisms provided by hardware. @@ -476,6 +520,8 @@ #define I915_SCHEDULER_CAP_ENABLED (1ul << 0) #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) +#define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) +#define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) #define I915_PARAM_HUC_STATUS 42 @@ -559,6 +605,21 @@ */ #define I915_PARAM_MMAP_GTT_COHERENT 52 +/* + * Query whether DRM_I915_GEM_EXECBUFFER2 supports coordination of parallel + * execution through use of explicit fence support. + * See I915_EXEC_FENCE_OUT and I915_EXEC_FENCE_SUBMIT. + */ +#define I915_PARAM_HAS_EXEC_SUBMIT_FENCE 53 + +/* + * Revision of the i915-perf uAPI. The value returned helps determine what + * i915-perf features are available. See drm_i915_perf_property_id. + */ +#define I915_PARAM_PERF_REVISION 54 + +/* Must be kept compact -- no holes and well documented */ + typedef struct drm_i915_getparam { __s32 param; /* @@ -574,6 +635,7 @@ #define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY 2 #define I915_SETPARAM_ALLOW_BATCHBUFFER 3 #define I915_SETPARAM_NUM_USED_FENCES 4 +/* Must be kept compact -- no holes */ typedef struct drm_i915_setparam { int param; @@ -972,7 +1034,7 @@ * struct drm_i915_gem_exec_fence *fences. */ __u64 cliprects_ptr; -#define I915_EXEC_RING_MASK (7<<0) +#define I915_EXEC_RING_MASK (0x3f) #define I915_EXEC_DEFAULT (0<<0) #define I915_EXEC_RENDER (1<<0) #define I915_EXEC_BSD (2<<0) @@ -1078,7 +1140,16 @@ */ #define I915_EXEC_FENCE_ARRAY (1<<19) -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1)) +/* + * Setting I915_EXEC_FENCE_SUBMIT implies that lower_32_bits(rsvd2) represent + * a sync_file fd to wait upon (in a nonblocking manner) prior to executing + * the batch. + * + * Returns -EINVAL if the sync_file fd cannot be found. + */ +#define I915_EXEC_FENCE_SUBMIT (1 << 20) + +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_SUBMIT << 1)) #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \ @@ -1120,32 +1191,34 @@ * as busy may become idle before the ioctl is completed. * * Furthermore, if the object is busy, which engine is busy is only - * provided as a guide. There are race conditions which prevent the - * report of which engines are busy from being always accurate. - * However, the converse is not true. If the object is idle, the - * result of the ioctl, that all engines are idle, is accurate. + * provided as a guide and only indirectly by reporting its class + * (there may be more than one engine in each class). There are race + * conditions which prevent the report of which engines are busy from + * being always accurate. However, the converse is not true. If the + * object is idle, the result of the ioctl, that all engines are idle, + * is accurate. * * The returned dword is split into two fields to indicate both - * the engines on which the object is being read, and the - * engine on which it is currently being written (if any). + * the engine classess on which the object is being read, and the + * engine class on which it is currently being written (if any). * * The low word (bits 0:15) indicate if the object is being written * to by any engine (there can only be one, as the GEM implicit * synchronisation rules force writes to be serialised). Only the - * engine for the last write is reported. + * engine class (offset by 1, I915_ENGINE_CLASS_RENDER is reported as + * 1 not 0 etc) for the last write is reported. * - * The high word (bits 16:31) are a bitmask of which engines are - * currently reading from the object. Multiple engines may be + * The high word (bits 16:31) are a bitmask of which engines classes + * are currently reading from the object. Multiple engines may be * reading from the object simultaneously. * - * The value of each engine is the same as specified in the - * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc. - * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to - * the I915_EXEC_RENDER engine for execution, and so it is never + * The value of each engine class is the same as specified in the + * I915_CONTEXT_SET_ENGINES parameter and via perf, i.e. + * I915_ENGINE_CLASS_RENDER, I915_ENGINE_CLASS_COPY, etc. * reported as active itself. Some hardware may have parallel * execution engines, e.g. multiple media engines, which are - * mapped to the same identifier in the EXECBUFFER2 ioctl and - * so are not separately reported for busyness. + * mapped to the same class identifier and so are not separately + * reported for busyness. * * Caveat emptor: * Only the boolean result of this query is reliable; that is whether @@ -1412,65 +1485,18 @@ }; struct drm_i915_gem_context_create { - /* output: id of new context*/ - __u32 ctx_id; - __u32 pad; -}; - -struct drm_i915_gem_context_destroy { - __u32 ctx_id; - __u32 pad; -}; - -struct drm_i915_reg_read { - /* - * Register offset. - * For 64bit wide registers where the upper 32bits don't immediately - * follow the lower 32bits, the offset of the lower 32bits must - * be specified - */ - __u64 offset; -#define I915_REG_READ_8B_WA (1ul << 0) - - __u64 val; /* Return value */ -}; -/* Known registers: - * - * Render engine timestamp - 0x2358 + 64bit - gen7+ - * - Note this register returns an invalid value if using the default - * single instruction 8byte read, in order to workaround that pass - * flag I915_REG_READ_8B_WA in offset field. - * - */ - -struct drm_i915_reset_stats { - __u32 ctx_id; - __u32 flags; - - /* All resets since boot/module reload, for all contexts */ - __u32 reset_count; - - /* Number of batches lost when active in GPU, for this context */ - __u32 batch_active; - - /* Number of batches lost pending for execution, for this context */ - __u32 batch_pending; - + __u32 ctx_id; /* output: id of new context*/ __u32 pad; }; -struct drm_i915_gem_userptr { - __u64 user_ptr; - __u64 user_size; +struct drm_i915_gem_context_create_ext { + __u32 ctx_id; /* output: id of new context*/ __u32 flags; -#define I915_USERPTR_READ_ONLY 0x1 -#define I915_USERPTR_UNSYNCHRONIZED 0x80000000 - /** - * Returned handle for the object. - * - * Object handles are nonzero. - */ - __u32 handle; +#define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS (1u << 0) +#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE (1u << 1) +#define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \ + (-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1)) + __u64 extensions; }; struct drm_i915_gem_context_param { @@ -1511,6 +1537,43 @@ * On creation, all new contexts are marked as recoverable. */ #define I915_CONTEXT_PARAM_RECOVERABLE 0x8 + + /* + * The id of the associated virtual memory address space (ppGTT) of + * this context. Can be retrieved and passed to another context + * (on the same fd) for both to use the same ppGTT and so share + * address layouts, and avoid reloading the page tables on context + * switches between themselves. + * + * See DRM_I915_GEM_VM_CREATE and DRM_I915_GEM_VM_DESTROY. + */ +#define I915_CONTEXT_PARAM_VM 0x9 + +/* + * I915_CONTEXT_PARAM_ENGINES: + * + * Bind this context to operate on this subset of available engines. Henceforth, + * the I915_EXEC_RING selector for DRM_IOCTL_I915_GEM_EXECBUFFER2 operates as + * an index into this array of engines; I915_EXEC_DEFAULT selecting engine[0] + * and upwards. Slots 0...N are filled in using the specified (class, instance). + * Use + * engine_class: I915_ENGINE_CLASS_INVALID, + * engine_instance: I915_ENGINE_CLASS_INVALID_NONE + * to specify a gap in the array that can be filled in later, e.g. by a + * virtual engine used for load balancing. + * + * Setting the number of engines bound to the context to 0, by passing a zero + * sized argument, will revert back to default settings. + * + * See struct i915_context_param_engines. + * + * Extensions: + * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) + * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) + */ +#define I915_CONTEXT_PARAM_ENGINES 0xa +/* Must be kept compact -- no holes and well documented */ + __u64 value; }; @@ -1539,13 +1602,13 @@ /* * Engine class & instance to be configured or queried. */ - __u16 engine_class; - __u16 engine_instance; + struct i915_engine_class_instance engine; /* - * Unused for now. Must be cleared to zero. + * Unknown flags must be cleared to zero. */ __u32 flags; +#define I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX (1u << 0) /* * Mask of slices to enable for the context. Valid values are a subset @@ -1573,6 +1636,199 @@ __u32 rsvd; }; +/* + * i915_context_engines_load_balance: + * + * Enable load balancing across this set of engines. + * + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when + * used will proxy the execbuffer request onto one of the set of engines + * in such a way as to distribute the load evenly across the set. + * + * The set of engines must be compatible (e.g. the same HW class) as they + * will share the same logical GPU context and ring. + * + * To intermix rendering with the virtual engine and direct rendering onto + * the backing engines (bypassing the load balancing proxy), the context must + * be defined to use a single timeline for all engines. + */ +struct i915_context_engines_load_balance { + struct i915_user_extension base; + + __u16 engine_index; + __u16 num_siblings; + __u32 flags; /* all undefined flags must be zero */ + + __u64 mbz64; /* reserved for future use; must be zero */ + + struct i915_engine_class_instance engines[0]; +} __attribute__((packed)); + +#define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ + struct i915_user_extension base; \ + __u16 engine_index; \ + __u16 num_siblings; \ + __u32 flags; \ + __u64 mbz64; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + +/* + * i915_context_engines_bond: + * + * Constructed bonded pairs for execution within a virtual engine. + * + * All engines are equal, but some are more equal than others. Given + * the distribution of resources in the HW, it may be preferable to run + * a request on a given subset of engines in parallel to a request on a + * specific engine. We enable this selection of engines within a virtual + * engine by specifying bonding pairs, for any given master engine we will + * only execute on one of the corresponding siblings within the virtual engine. + * + * To execute a request in parallel on the master engine and a sibling requires + * coordination with a I915_EXEC_FENCE_SUBMIT. + */ +struct i915_context_engines_bond { + struct i915_user_extension base; + + struct i915_engine_class_instance master; + + __u16 virtual_index; /* index of virtual engine in ctx->engines[] */ + __u16 num_bonds; + + __u64 flags; /* all undefined flags must be zero */ + __u64 mbz64[4]; /* reserved for future use; must be zero */ + + struct i915_engine_class_instance engines[0]; +} __attribute__((packed)); + +#define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \ + struct i915_user_extension base; \ + struct i915_engine_class_instance master; \ + __u16 virtual_index; \ + __u16 num_bonds; \ + __u64 flags; \ + __u64 mbz64[4]; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + +struct i915_context_param_engines { + __u64 extensions; /* linked chain of extension blocks, 0 terminates */ +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ +#define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ + struct i915_engine_class_instance engines[0]; +} __attribute__((packed)); + +#define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \ + __u64 extensions; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + +struct drm_i915_gem_context_create_ext_setparam { +#define I915_CONTEXT_CREATE_EXT_SETPARAM 0 + struct i915_user_extension base; + struct drm_i915_gem_context_param param; +}; + +struct drm_i915_gem_context_create_ext_clone { +#define I915_CONTEXT_CREATE_EXT_CLONE 1 + struct i915_user_extension base; + __u32 clone_id; + __u32 flags; +#define I915_CONTEXT_CLONE_ENGINES (1u << 0) +#define I915_CONTEXT_CLONE_FLAGS (1u << 1) +#define I915_CONTEXT_CLONE_SCHEDATTR (1u << 2) +#define I915_CONTEXT_CLONE_SSEU (1u << 3) +#define I915_CONTEXT_CLONE_TIMELINE (1u << 4) +#define I915_CONTEXT_CLONE_VM (1u << 5) +#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1) + __u64 rsvd; +}; + +struct drm_i915_gem_context_destroy { + __u32 ctx_id; + __u32 pad; +}; + +/* + * DRM_I915_GEM_VM_CREATE - + * + * Create a new virtual memory address space (ppGTT) for use within a context + * on the same file. Extensions can be provided to configure exactly how the + * address space is setup upon creation. + * + * The id of new VM (bound to the fd) for use with I915_CONTEXT_PARAM_VM is + * returned in the outparam @id. + * + * No flags are defined, with all bits reserved and must be zero. + * + * An extension chain maybe provided, starting with @extensions, and terminated + * by the @next_extension being 0. Currently, no extensions are defined. + * + * DRM_I915_GEM_VM_DESTROY - + * + * Destroys a previously created VM id, specified in @id. + * + * No extensions or flags are allowed currently, and so must be zero. + */ +struct drm_i915_gem_vm_control { + __u64 extensions; + __u32 flags; + __u32 vm_id; +}; + +struct drm_i915_reg_read { + /* + * Register offset. + * For 64bit wide registers where the upper 32bits don't immediately + * follow the lower 32bits, the offset of the lower 32bits must + * be specified + */ + __u64 offset; +#define I915_REG_READ_8B_WA (1ul << 0) + + __u64 val; /* Return value */ +}; + +/* Known registers: + * + * Render engine timestamp - 0x2358 + 64bit - gen7+ + * - Note this register returns an invalid value if using the default + * single instruction 8byte read, in order to workaround that pass + * flag I915_REG_READ_8B_WA in offset field. + * + */ + +struct drm_i915_reset_stats { + __u32 ctx_id; + __u32 flags; + + /* All resets since boot/module reload, for all contexts */ + __u32 reset_count; + + /* Number of batches lost when active in GPU, for this context */ + __u32 batch_active; + + /* Number of batches lost pending for execution, for this context */ + __u32 batch_pending; + + __u32 pad; +}; + +struct drm_i915_gem_userptr { + __u64 user_ptr; + __u64 user_size; + __u32 flags; +#define I915_USERPTR_READ_ONLY 0x1 +#define I915_USERPTR_UNSYNCHRONIZED 0x80000000 + /** + * Returned handle for the object. + * + * Object handles are nonzero. + */ + __u32 handle; +}; + enum drm_i915_oa_format { I915_OA_FORMAT_A13 = 1, /* HSW only */ I915_OA_FORMAT_A29, /* HSW only */ @@ -1595,23 +1851,31 @@ * Open the stream for a specific context handle (as used with * execbuffer2). A stream opened for a specific context this way * won't typically require root privileges. + * + * This property is available in perf revision 1. */ DRM_I915_PERF_PROP_CTX_HANDLE = 1, /** * A value of 1 requests the inclusion of raw OA unit reports as * part of stream samples. + * + * This property is available in perf revision 1. */ DRM_I915_PERF_PROP_SAMPLE_OA, /** * The value specifies which set of OA unit metrics should be * be configured, defining the contents of any OA unit reports. + * + * This property is available in perf revision 1. */ DRM_I915_PERF_PROP_OA_METRICS_SET, /** * The value specifies the size and layout of OA unit reports. + * + * This property is available in perf revision 1. */ DRM_I915_PERF_PROP_OA_FORMAT, @@ -1621,9 +1885,22 @@ * from this exponent as follows: * * 80ns * 2^(period_exponent + 1) + * + * This property is available in perf revision 1. */ DRM_I915_PERF_PROP_OA_EXPONENT, + /** + * Specifying this property is only valid when specify a context to + * filter with DRM_I915_PERF_PROP_CTX_HANDLE. Specifying this property + * will hold preemption of the particular context we want to gather + * performance data about. The execbuf2 submissions must include a + * drm_i915_gem_execbuffer_ext_perf parameter for this to apply. + * + * This property is available in perf revision 3. + */ + DRM_I915_PERF_PROP_HOLD_PREEMPTION, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; @@ -1652,6 +1929,8 @@ * to close and re-open a stream with the same configuration. * * It's undefined whether any pending data for the stream will be lost. + * + * This ioctl is available in perf revision 1. */ #define I915_PERF_IOCTL_ENABLE _IO('i', 0x0) @@ -1659,10 +1938,25 @@ * Disable data capture for a stream. * * It is an error to try and read a stream that is disabled. + * + * This ioctl is available in perf revision 1. */ #define I915_PERF_IOCTL_DISABLE _IO('i', 0x1) /** + * Change metrics_set captured by a stream. + * + * If the stream is bound to a specific context, the configuration change + * will performed __inline__ with that context such that it takes effect before + * the next execbuf submission. + * + * Returns the previously bound metrics set id, or a negative error code. + * + * This ioctl is available in perf revision 2. + */ +#define I915_PERF_IOCTL_CONFIG _IO('i', 0x2) + +/** * Common to all i915 perf records */ struct drm_i915_perf_record_header { @@ -1734,6 +2028,9 @@ struct drm_i915_query_item { __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 +#define DRM_I915_QUERY_ENGINE_INFO 2 +#define DRM_I915_QUERY_PERF_CONFIG 3 +/* Must be kept compact -- no holes and well documented */ /* * When set to zero by userspace, this is filled with the size of the @@ -1744,9 +2041,18 @@ __s32 length; /* - * Unused for now. Must be cleared to zero. + * When query_id == DRM_I915_QUERY_TOPOLOGY_INFO, must be 0. + * + * When query_id == DRM_I915_QUERY_PERF_CONFIG, must be one of the + * following : + * - DRM_I915_QUERY_PERF_CONFIG_LIST + * - DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID + * - DRM_I915_QUERY_PERF_CONFIG_FOR_UUID */ __u32 flags; +#define DRM_I915_QUERY_PERF_CONFIG_LIST 1 +#define DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID 2 +#define DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_ID 3 /* * Data will be written at the location pointed by data_ptr when the @@ -1782,8 +2088,10 @@ * (data[X / 8] >> (X % 8)) & 1 * * - the subslice mask for each slice with one bit per subslice telling - * whether a subslice is available. The availability of subslice Y in slice - * X can be queried with the following formula : + * whether a subslice is available. Gen12 has dual-subslices, which are + * similar to two gen11 subslices. For gen12, this array represents dual- + * subslices. The availability of subslice Y in slice X can be queried + * with the following formula : * * (data[subslice_offset + * X * subslice_stride + @@ -1831,6 +2139,97 @@ __u8 data[]; }; +/** + * struct drm_i915_engine_info + * + * Describes one engine and it's capabilities as known to the driver. + */ +struct drm_i915_engine_info { + /** Engine class and instance. */ + struct i915_engine_class_instance engine; + + /** Reserved field. */ + __u32 rsvd0; + + /** Engine flags. */ + __u64 flags; + + /** Capabilities of this engine. */ + __u64 capabilities; +#define I915_VIDEO_CLASS_CAPABILITY_HEVC (1 << 0) +#define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC (1 << 1) + + /** Reserved fields. */ + __u64 rsvd1[4]; +}; + +/** + * struct drm_i915_query_engine_info + * + * Engine info query enumerates all engines known to the driver by filling in + * an array of struct drm_i915_engine_info structures. + */ +struct drm_i915_query_engine_info { + /** Number of struct drm_i915_engine_info structs following. */ + __u32 num_engines; + + /** MBZ */ + __u32 rsvd[3]; + + /** Marker for drm_i915_engine_info structures. */ + struct drm_i915_engine_info engines[]; +}; + +/* + * Data written by the kernel with query DRM_I915_QUERY_PERF_CONFIG. + */ +struct drm_i915_query_perf_config { + union { + /* + * When query_item.flags == DRM_I915_QUERY_PERF_CONFIG_LIST, i915 sets + * this fields to the number of configurations available. + */ + __u64 n_configs; + + /* + * When query_id == DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_ID, + * i915 will use the value in this field as configuration + * identifier to decide what data to write into config_ptr. + */ + __u64 config; + + /* + * When query_id == DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, + * i915 will use the value in this field as configuration + * identifier to decide what data to write into config_ptr. + * + * String formatted like "%08x-%04x-%04x-%04x-%012x" + */ + char uuid[36]; + }; + + /* + * Unused for now. Must be cleared to zero. + */ + __u32 flags; + + /* + * When query_item.flags == DRM_I915_QUERY_PERF_CONFIG_LIST, i915 will + * write an array of __u64 of configuration identifiers. + * + * When query_item.flags == DRM_I915_QUERY_PERF_CONFIG_DATA, i915 will + * write a struct drm_i915_perf_oa_config. If the following fields of + * drm_i915_perf_oa_config are set not set to 0, i915 will write into + * the associated pointers the values of submitted when the + * configuration was created : + * + * - n_mux_regs + * - n_boolean_regs + * - n_flex_regs + */ + __u8 data[]; +}; + #if defined(__cplusplus) } #endif diff -Nru mesa-19.2.8/include/drm-uapi/lima_drm.h mesa-20.0.8/include/drm-uapi/lima_drm.h --- mesa-19.2.8/include/drm-uapi/lima_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/lima_drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,12 +32,19 @@ __u64 value; /* out, parameter value */ }; +/* + * heap buffer dynamically increase backup memory size when GP task fail + * due to lack of heap memory. size field of heap buffer is an up bound of + * the backup memory which can be set to a fairly large value. + */ +#define LIMA_BO_FLAG_HEAP (1 << 0) + /** * create a buffer for used by GPU */ struct drm_lima_gem_create { __u32 size; /* in, buffer size */ - __u32 flags; /* in, currently no flags, must be zero */ + __u32 flags; /* in, buffer flags */ __u32 handle; /* out, GEM buffer handle */ __u32 pad; /* pad, must be zero */ }; diff -Nru mesa-19.2.8/include/drm-uapi/README mesa-20.0.8/include/drm-uapi/README --- mesa-19.2.8/include/drm-uapi/README 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/README 2020-06-12 01:21:16.000000000 +0000 @@ -13,9 +13,9 @@ The last update was done at the following kernel commit : -commit a5f2fafece141ef3509e686cea576366d55cabb6 -Merge: 71f4e45a4ed3 860433ed2a55 +commit f1b4a9217efd61d0b84c6dc404596c8519ff6f59 +Merge: 400e91347e1d f3a36d469621 Author: Dave Airlie -Date: Wed Feb 20 12:16:30 2019 +1000 +Date: Tue Oct 22 15:04:00 2019 +1000 - Merge https://gitlab.freedesktop.org/drm/msm into drm-next + Merge tag 'du-next-20191016' of git://linuxtv.org/pinchartl/media into drm-next diff -Nru mesa-19.2.8/include/drm-uapi/v3d_drm.h mesa-20.0.8/include/drm-uapi/v3d_drm.h --- mesa-19.2.8/include/drm-uapi/v3d_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/v3d_drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -37,6 +37,7 @@ #define DRM_V3D_GET_PARAM 0x04 #define DRM_V3D_GET_BO_OFFSET 0x05 #define DRM_V3D_SUBMIT_TFU 0x06 +#define DRM_V3D_SUBMIT_CSD 0x07 #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl) #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo) @@ -45,6 +46,9 @@ #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param) #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset) #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu) +#define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd) + +#define DRM_V3D_SUBMIT_CL_FLUSH_CACHE 0x01 /** * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D @@ -59,7 +63,7 @@ * flushed by the time the render done IRQ happens, which is the * trigger for out_sync. Any dirtying of cachelines by the job (only * possible using TMU writes) must be flushed by the caller using the - * CL's cache flush commands. + * DRM_V3D_SUBMIT_CL_FLUSH_CACHE_FLAG flag. */ struct drm_v3d_submit_cl { /* Pointer to the binner command list. @@ -122,8 +126,7 @@ /* Number of BO handles passed in (size is that times 4). */ __u32 bo_handle_count; - /* Pad, must be zero-filled. */ - __u32 pad; + __u32 flags; }; /** @@ -190,6 +193,8 @@ DRM_V3D_PARAM_V3D_CORE0_IDENT1, DRM_V3D_PARAM_V3D_CORE0_IDENT2, DRM_V3D_PARAM_SUPPORTS_TFU, + DRM_V3D_PARAM_SUPPORTS_CSD, + DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH, }; struct drm_v3d_get_param { @@ -230,6 +235,31 @@ __u32 out_sync; }; +/* Submits a compute shader for dispatch. This job will block on any + * previous compute shaders submitted on this fd, and any other + * synchronization must be performed with in_sync/out_sync. + */ +struct drm_v3d_submit_csd { + __u32 cfg[7]; + __u32 coef[4]; + + /* Pointer to a u32 array of the BOs that are referenced by the job. + */ + __u64 bo_handles; + + /* Number of BO handles passed in (size is that times 4). */ + __u32 bo_handle_count; + + /* sync object to block on before running the CSD job. Each + * CSD job will execute in the order submitted to its FD. + * Synchronization against rendering/TFU jobs or CSD from + * other fds requires using sync objects. + */ + __u32 in_sync; + /* Sync object to signal when the CSD job is done. */ + __u32 out_sync; +}; + #if defined(__cplusplus) } #endif diff -Nru mesa-19.2.8/include/drm-uapi/virtgpu_drm.h mesa-20.0.8/include/drm-uapi/virtgpu_drm.h --- mesa-19.2.8/include/drm-uapi/virtgpu_drm.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/include/drm-uapi/virtgpu_drm.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,182 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? */ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 stride; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_GETPARAM \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\ + struct drm_virtgpu_getparam) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \ + struct drm_virtgpu_resource_info) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + struct drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +#define DRM_IOCTL_VIRTGPU_GET_CAPS \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \ + struct drm_virtgpu_get_caps) + +#if defined(__cplusplus) +} +#endif + +#endif diff -Nru mesa-19.2.8/include/EGL/eglextchromium.h mesa-20.0.8/include/EGL/eglextchromium.h --- mesa-19.2.8/include/EGL/eglextchromium.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/EGL/eglextchromium.h 2020-06-12 01:21:16.000000000 +0000 @@ -53,6 +53,17 @@ #endif #endif +#ifndef EGL_EXT_image_flush_external +#define EGL_EXT_image_flush_external 1 +#define EGL_IMAGE_EXTERNAL_FLUSH_EXT 0x32A2 +typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEFLUSHEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list); +typedef EGLBoolean (EGLAPIENTRYP PFNEGLIMAGEINVALIDATEEXTERNALEXTPROC) (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglImageFlushExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list); +EGLAPI EGLBoolean EGLAPIENTRY eglImageInvalidateExternalEXT (EGLDisplay dpy, EGLImageKHR image, const EGLAttrib *attrib_list); +#endif +#endif /* EGL_EXT_image_flush_external */ + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/include/EGL/eglext.h mesa-20.0.8/include/EGL/eglext.h --- mesa-19.2.8/include/EGL/eglext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/EGL/eglext.h 2020-06-12 01:21:16.000000000 +0000 @@ -1362,9 +1362,6 @@ #define EGL_NATIVE_SURFACE_TIZEN 0x32A1 #endif /* EGL_TIZEN_image_native_surface */ -#include -#include - #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/include/EGL/eglmesaext.h mesa-20.0.8/include/EGL/eglmesaext.h --- mesa-19.2.8/include/EGL/eglmesaext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/EGL/eglmesaext.h 2020-06-12 01:21:16.000000000 +0000 @@ -76,7 +76,7 @@ #ifndef EGL_MESA_configless_context #define EGL_MESA_configless_context 1 -#define EGL_NO_CONFIG_MESA ((EGLConfig)0) +#define EGL_NO_CONFIG_MESA EGL_CAST(EGLConfig,0) #endif #ifndef EGL_MESA_drm_image_formats diff -Nru mesa-19.2.8/include/EGL/eglplatform.h mesa-20.0.8/include/EGL/eglplatform.h --- mesa-19.2.8/include/EGL/eglplatform.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/EGL/eglplatform.h 2020-06-12 01:21:16.000000000 +0000 @@ -55,6 +55,12 @@ #endif #define EGLAPIENTRYP EGLAPIENTRY* +#if defined(MESA_EGL_NO_X11_HEADERS) && !defined(EGL_NO_X11) +#warning "`MESA_EGL_NO_X11_HEADERS` is deprecated, and doesn't work with the unmodified Khronos header" +#warning "Please use `EGL_NO_X11` instead, as `MESA_EGL_NO_X11_HEADERS` will be removed soon" +#define EGL_NO_X11 +#endif + /* The types NativeDisplayType, NativeWindowType, and NativePixmapType * are aliases of window-system-dependent types, such as X Display * or * Windows Device Context. They must be defined in platform-specific @@ -116,15 +122,13 @@ typedef intptr_t EGLNativePixmapType; typedef intptr_t EGLNativeWindowType; -#elif defined(__unix__) || defined(__APPLE__) +#elif defined(__unix__) && defined(EGL_NO_X11) -#if defined(MESA_EGL_NO_X11_HEADERS) - -typedef void *EGLNativeDisplayType; +typedef void *EGLNativeDisplayType; typedef khronos_uintptr_t EGLNativePixmapType; typedef khronos_uintptr_t EGLNativeWindowType; -#else +#elif defined(__unix__) || defined(USE_X11) /* X11 (tentative) */ #include @@ -134,7 +138,11 @@ typedef Pixmap EGLNativePixmapType; typedef Window EGLNativeWindowType; -#endif /* MESA_EGL_NO_X11_HEADERS */ +#elif defined(__APPLE__) + +typedef int EGLNativeDisplayType; +typedef void *EGLNativePixmapType; +typedef void *EGLNativeWindowType; #elif defined(__HAIKU__) diff -Nru mesa-19.2.8/include/GL/glcorearb.h mesa-20.0.8/include/GL/glcorearb.h --- mesa-19.2.8/include/GL/glcorearb.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GL/glcorearb.h 2020-06-12 01:21:16.000000000 +0000 @@ -3900,6 +3900,22 @@ #define GL_CONTEXT_ROBUST_ACCESS 0x90F3 #endif /* GL_KHR_robustness */ +#ifndef GL_KHR_shader_subgroup +#define GL_KHR_shader_subgroup 1 +#define GL_SUBGROUP_SIZE_KHR 0x9532 +#define GL_SUBGROUP_SUPPORTED_STAGES_KHR 0x9533 +#define GL_SUBGROUP_SUPPORTED_FEATURES_KHR 0x9534 +#define GL_SUBGROUP_QUAD_ALL_STAGES_KHR 0x9535 +#define GL_SUBGROUP_FEATURE_BASIC_BIT_KHR 0x00000001 +#define GL_SUBGROUP_FEATURE_VOTE_BIT_KHR 0x00000002 +#define GL_SUBGROUP_FEATURE_ARITHMETIC_BIT_KHR 0x00000004 +#define GL_SUBGROUP_FEATURE_BALLOT_BIT_KHR 0x00000008 +#define GL_SUBGROUP_FEATURE_SHUFFLE_BIT_KHR 0x00000010 +#define GL_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT_KHR 0x00000020 +#define GL_SUBGROUP_FEATURE_CLUSTERED_BIT_KHR 0x00000040 +#define GL_SUBGROUP_FEATURE_QUAD_BIT_KHR 0x00000080 +#endif /* GL_KHR_shader_subgroup */ + #ifndef GL_KHR_texture_compression_astc_hdr #define GL_KHR_texture_compression_astc_hdr 1 #define GL_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0 @@ -4010,6 +4026,10 @@ #endif #endif /* GL_EXT_EGL_image_storage */ +#ifndef GL_EXT_EGL_sync +#define GL_EXT_EGL_sync 1 +#endif /* GL_EXT_EGL_sync */ + #ifndef GL_EXT_debug_label #define GL_EXT_debug_label 1 #define GL_PROGRAM_PIPELINE_OBJECT_EXT 0x8A4F @@ -4744,6 +4764,17 @@ #endif #endif /* GL_INTEL_performance_query */ +#ifndef GL_MESA_framebuffer_flip_y +#define GL_MESA_framebuffer_flip_y 1 +#define GL_FRAMEBUFFER_FLIP_Y_MESA 0x8BBB +typedef void (APIENTRYP PFNGLFRAMEBUFFERPARAMETERIMESAPROC) (GLenum target, GLenum pname, GLint param); +typedef void (APIENTRYP PFNGLGETFRAMEBUFFERPARAMETERIVMESAPROC) (GLenum target, GLenum pname, GLint *params); +#ifdef GL_GLEXT_PROTOTYPES +GLAPI void APIENTRY glFramebufferParameteriMESA (GLenum target, GLenum pname, GLint param); +GLAPI void APIENTRY glGetFramebufferParameterivMESA (GLenum target, GLenum pname, GLint *params); +#endif +#endif /* GL_MESA_framebuffer_flip_y */ + #ifndef GL_NV_bindless_multi_draw_indirect #define GL_NV_bindless_multi_draw_indirect 1 typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSINDIRECTBINDLESSNVPROC) (GLenum mode, const void *indirect, GLsizei drawCount, GLsizei stride, GLint vertexBufferCount); @@ -5657,6 +5688,11 @@ #define GL_SHADER_GLOBAL_ACCESS_BARRIER_BIT_NV 0x00000010 #endif /* GL_NV_shader_buffer_store */ +#ifndef GL_NV_shader_subgroup_partitioned +#define GL_NV_shader_subgroup_partitioned 1 +#define GL_SUBGROUP_FEATURE_PARTITIONED_BIT_NV 0x00000100 +#endif /* GL_NV_shader_subgroup_partitioned */ + #ifndef GL_NV_shader_texture_footprint #define GL_NV_shader_texture_footprint 1 #endif /* GL_NV_shader_texture_footprint */ diff -Nru mesa-19.2.8/include/GL/glext.h mesa-20.0.8/include/GL/glext.h --- mesa-19.2.8/include/GL/glext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GL/glext.h 2020-06-12 01:21:16.000000000 +0000 @@ -51,7 +51,7 @@ #define GLAPI extern #endif -#define GL_GLEXT_VERSION 20190611 +#define GL_GLEXT_VERSION 20190911 #include @@ -4997,6 +4997,22 @@ #define GL_CONTEXT_ROBUST_ACCESS 0x90F3 #endif /* GL_KHR_robustness */ +#ifndef GL_KHR_shader_subgroup +#define GL_KHR_shader_subgroup 1 +#define GL_SUBGROUP_SIZE_KHR 0x9532 +#define GL_SUBGROUP_SUPPORTED_STAGES_KHR 0x9533 +#define GL_SUBGROUP_SUPPORTED_FEATURES_KHR 0x9534 +#define GL_SUBGROUP_QUAD_ALL_STAGES_KHR 0x9535 +#define GL_SUBGROUP_FEATURE_BASIC_BIT_KHR 0x00000001 +#define GL_SUBGROUP_FEATURE_VOTE_BIT_KHR 0x00000002 +#define GL_SUBGROUP_FEATURE_ARITHMETIC_BIT_KHR 0x00000004 +#define GL_SUBGROUP_FEATURE_BALLOT_BIT_KHR 0x00000008 +#define GL_SUBGROUP_FEATURE_SHUFFLE_BIT_KHR 0x00000010 +#define GL_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT_KHR 0x00000020 +#define GL_SUBGROUP_FEATURE_CLUSTERED_BIT_KHR 0x00000040 +#define GL_SUBGROUP_FEATURE_QUAD_BIT_KHR 0x00000080 +#endif /* GL_KHR_shader_subgroup */ + #ifndef GL_KHR_texture_compression_astc_hdr #define GL_KHR_texture_compression_astc_hdr 1 #define GL_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0 @@ -6470,6 +6486,10 @@ #endif #endif /* GL_EXT_EGL_image_storage */ +#ifndef GL_EXT_EGL_sync +#define GL_EXT_EGL_sync 1 +#endif /* GL_EXT_EGL_sync */ + #ifndef GL_EXT_abgr #define GL_EXT_abgr 1 #define GL_ABGR_EXT 0x8000 @@ -9264,6 +9284,17 @@ #define GL_TEXTURE_2D_STACK_BINDING_MESAX 0x875E #endif /* GL_MESAX_texture_stack */ +#ifndef GL_MESA_framebuffer_flip_y +#define GL_MESA_framebuffer_flip_y 1 +#define GL_FRAMEBUFFER_FLIP_Y_MESA 0x8BBB +typedef void (APIENTRYP PFNGLFRAMEBUFFERPARAMETERIMESAPROC) (GLenum target, GLenum pname, GLint param); +typedef void (APIENTRYP PFNGLGETFRAMEBUFFERPARAMETERIVMESAPROC) (GLenum target, GLenum pname, GLint *params); +#ifdef GL_GLEXT_PROTOTYPES +GLAPI void APIENTRY glFramebufferParameteriMESA (GLenum target, GLenum pname, GLint param); +GLAPI void APIENTRY glGetFramebufferParameterivMESA (GLenum target, GLenum pname, GLint *params); +#endif +#endif /* GL_MESA_framebuffer_flip_y */ + #ifndef GL_MESA_pack_invert #define GL_MESA_pack_invert 1 #define GL_PACK_INVERT_MESA 0x8758 @@ -9377,6 +9408,25 @@ #define GL_GPU_MEMORY_INFO_EVICTED_MEMORY_NVX 0x904B #endif /* GL_NVX_gpu_memory_info */ +#ifndef GL_NVX_gpu_multicast2 +#define GL_NVX_gpu_multicast2 1 +#define GL_UPLOAD_GPU_MASK_NVX 0x954A +typedef void (APIENTRYP PFNGLUPLOADGPUMASKNVXPROC) (GLbitfield mask); +typedef void (APIENTRYP PFNGLMULTICASTVIEWPORTARRAYVNVXPROC) (GLuint gpu, GLuint first, GLsizei count, const GLfloat *v); +typedef void (APIENTRYP PFNGLMULTICASTVIEWPORTPOSITIONWSCALENVXPROC) (GLuint gpu, GLuint index, GLfloat xcoeff, GLfloat ycoeff); +typedef void (APIENTRYP PFNGLMULTICASTSCISSORARRAYVNVXPROC) (GLuint gpu, GLuint first, GLsizei count, const GLint *v); +typedef GLuint (APIENTRYP PFNGLASYNCCOPYBUFFERSUBDATANVXPROC) (GLsizei waitSemaphoreCount, const GLuint *waitSemaphoreArray, const GLuint64 *fenceValueArray, GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size, GLsizei signalSemaphoreCount, const GLuint *signalSemaphoreArray, const GLuint64 *signalValueArray); +typedef GLuint (APIENTRYP PFNGLASYNCCOPYIMAGESUBDATANVXPROC) (GLsizei waitSemaphoreCount, const GLuint *waitSemaphoreArray, const GLuint64 *waitValueArray, GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth, GLsizei signalSemaphoreCount, const GLuint *signalSemaphoreArray, const GLuint64 *signalValueArray); +#ifdef GL_GLEXT_PROTOTYPES +GLAPI void APIENTRY glUploadGpuMaskNVX (GLbitfield mask); +GLAPI void APIENTRY glMulticastViewportArrayvNVX (GLuint gpu, GLuint first, GLsizei count, const GLfloat *v); +GLAPI void APIENTRY glMulticastViewportPositionWScaleNVX (GLuint gpu, GLuint index, GLfloat xcoeff, GLfloat ycoeff); +GLAPI void APIENTRY glMulticastScissorArrayvNVX (GLuint gpu, GLuint first, GLsizei count, const GLint *v); +GLAPI GLuint APIENTRY glAsyncCopyBufferSubDataNVX (GLsizei waitSemaphoreCount, const GLuint *waitSemaphoreArray, const GLuint64 *fenceValueArray, GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size, GLsizei signalSemaphoreCount, const GLuint *signalSemaphoreArray, const GLuint64 *signalValueArray); +GLAPI GLuint APIENTRY glAsyncCopyImageSubDataNVX (GLsizei waitSemaphoreCount, const GLuint *waitSemaphoreArray, const GLuint64 *waitValueArray, GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth, GLsizei signalSemaphoreCount, const GLuint *signalSemaphoreArray, const GLuint64 *signalValueArray); +#endif +#endif /* GL_NVX_gpu_multicast2 */ + #ifndef GL_NVX_linked_gpu_multicast #define GL_NVX_linked_gpu_multicast 1 #define GL_LGPU_SEPARATE_STORAGE_BIT_NVX 0x0800 @@ -9391,6 +9441,20 @@ #endif #endif /* GL_NVX_linked_gpu_multicast */ +#ifndef GL_NVX_progress_fence +#define GL_NVX_progress_fence 1 +typedef GLuint (APIENTRYP PFNGLCREATEPROGRESSFENCENVXPROC) (void); +typedef void (APIENTRYP PFNGLSIGNALSEMAPHOREUI64NVXPROC) (GLuint signalGpu, GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +typedef void (APIENTRYP PFNGLWAITSEMAPHOREUI64NVXPROC) (GLuint waitGpu, GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +typedef void (APIENTRYP PFNGLCLIENTWAITSEMAPHOREUI64NVXPROC) (GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +#ifdef GL_GLEXT_PROTOTYPES +GLAPI GLuint APIENTRY glCreateProgressFenceNVX (void); +GLAPI void APIENTRY glSignalSemaphoreui64NVX (GLuint signalGpu, GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +GLAPI void APIENTRY glWaitSemaphoreui64NVX (GLuint waitGpu, GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +GLAPI void APIENTRY glClientWaitSemaphoreui64NVX (GLsizei fenceObjectCount, const GLuint *semaphoreArray, const GLuint64 *fenceValueArray); +#endif +#endif /* GL_NVX_progress_fence */ + #ifndef GL_NV_alpha_to_coverage_dither_control #define GL_NV_alpha_to_coverage_dither_control 1 #define GL_ALPHA_TO_COVERAGE_DITHER_DEFAULT_NV 0x934D @@ -10938,6 +11002,11 @@ #define GL_NV_shader_storage_buffer_object 1 #endif /* GL_NV_shader_storage_buffer_object */ +#ifndef GL_NV_shader_subgroup_partitioned +#define GL_NV_shader_subgroup_partitioned 1 +#define GL_SUBGROUP_FEATURE_PARTITIONED_BIT_NV 0x00000100 +#endif /* GL_NV_shader_subgroup_partitioned */ + #ifndef GL_NV_shader_texture_footprint #define GL_NV_shader_texture_footprint 1 #endif /* GL_NV_shader_texture_footprint */ diff -Nru mesa-19.2.8/include/GL/glxext.h mesa-20.0.8/include/GL/glxext.h --- mesa-19.2.8/include/GL/glxext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GL/glxext.h 2020-06-12 01:21:16.000000000 +0000 @@ -34,7 +34,7 @@ ** https://github.com/KhronosGroup/OpenGL-Registry */ -#define GLX_GLXEXT_VERSION 20180525 +#define GLX_GLXEXT_VERSION 20190911 /* Generated C header for: * API: glx @@ -280,6 +280,14 @@ #define GLX_BACK_BUFFER_AGE_EXT 0x20F4 #endif /* GLX_EXT_buffer_age */ +#ifndef GLX_EXT_context_priority +#define GLX_EXT_context_priority 1 +#define GLX_CONTEXT_PRIORITY_LEVEL_EXT 0x3100 +#define GLX_CONTEXT_PRIORITY_HIGH_EXT 0x3101 +#define GLX_CONTEXT_PRIORITY_MEDIUM_EXT 0x3102 +#define GLX_CONTEXT_PRIORITY_LOW_EXT 0x3103 +#endif /* GLX_EXT_context_priority */ + #ifndef GLX_EXT_create_context_es2_profile #define GLX_EXT_create_context_es2_profile 1 #define GLX_CONTEXT_ES2_PROFILE_BIT_EXT 0x00000004 @@ -476,7 +484,6 @@ #define GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA 0x818B #define GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA 0x818C #define GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA 0x818D -#define GLX_RENDERER_ID_MESA 0x818E typedef Bool ( *PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC) (int attribute, unsigned int *value); typedef const char *( *PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC) (int attribute); typedef Bool ( *PFNGLXQUERYRENDERERINTEGERMESAPROC) (Display *dpy, int screen, int renderer, int attribute, unsigned int *value); @@ -501,9 +508,9 @@ #define GLX_MESA_set_3dfx_mode 1 #define GLX_3DFX_WINDOW_MODE_MESA 0x1 #define GLX_3DFX_FULLSCREEN_MODE_MESA 0x2 -typedef Bool ( *PFNGLXSET3DFXMODEMESAPROC) (int mode); +typedef GLboolean ( *PFNGLXSET3DFXMODEMESAPROC) (GLint mode); #ifdef GLX_GLXEXT_PROTOTYPES -Bool glXSet3DfxModeMESA (int mode); +GLboolean glXSet3DfxModeMESA (GLint mode); #endif #endif /* GLX_MESA_set_3dfx_mode */ @@ -548,6 +555,15 @@ #define GLX_FLOAT_COMPONENTS_NV 0x20B0 #endif /* GLX_NV_float_buffer */ +#ifndef GLX_NV_multigpu_context +#define GLX_NV_multigpu_context 1 +#define GLX_CONTEXT_MULTIGPU_ATTRIB_NV 0x20AA +#define GLX_CONTEXT_MULTIGPU_ATTRIB_SINGLE_NV 0x20AB +#define GLX_CONTEXT_MULTIGPU_ATTRIB_AFR_NV 0x20AC +#define GLX_CONTEXT_MULTIGPU_ATTRIB_MULTICAST_NV 0x20AD +#define GLX_CONTEXT_MULTIGPU_ATTRIB_MULTI_DISPLAY_MULTICAST_NV 0x20AE +#endif /* GLX_NV_multigpu_context */ + #ifndef GLX_NV_multisample_coverage #define GLX_NV_multisample_coverage 1 #define GLX_COVERAGE_SAMPLES_NV 100001 @@ -836,13 +852,13 @@ #define GLX_PBUFFER_SGIX 0x8023 typedef GLXPbufferSGIX ( *PFNGLXCREATEGLXPBUFFERSGIXPROC) (Display *dpy, GLXFBConfigSGIX config, unsigned int width, unsigned int height, int *attrib_list); typedef void ( *PFNGLXDESTROYGLXPBUFFERSGIXPROC) (Display *dpy, GLXPbufferSGIX pbuf); -typedef int ( *PFNGLXQUERYGLXPBUFFERSGIXPROC) (Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value); +typedef void ( *PFNGLXQUERYGLXPBUFFERSGIXPROC) (Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value); typedef void ( *PFNGLXSELECTEVENTSGIXPROC) (Display *dpy, GLXDrawable drawable, unsigned long mask); typedef void ( *PFNGLXGETSELECTEDEVENTSGIXPROC) (Display *dpy, GLXDrawable drawable, unsigned long *mask); #ifdef GLX_GLXEXT_PROTOTYPES GLXPbufferSGIX glXCreateGLXPbufferSGIX (Display *dpy, GLXFBConfigSGIX config, unsigned int width, unsigned int height, int *attrib_list); void glXDestroyGLXPbufferSGIX (Display *dpy, GLXPbufferSGIX pbuf); -int glXQueryGLXPbufferSGIX (Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value); +void glXQueryGLXPbufferSGIX (Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value); void glXSelectEventSGIX (Display *dpy, GLXDrawable drawable, unsigned long mask); void glXGetSelectedEventSGIX (Display *dpy, GLXDrawable drawable, unsigned long *mask); #endif @@ -940,9 +956,9 @@ #ifndef GLX_SUN_get_transparent_index #define GLX_SUN_get_transparent_index 1 -typedef Status ( *PFNGLXGETTRANSPARENTINDEXSUNPROC) (Display *dpy, Window overlay, Window underlay, long *pTransparentIndex); +typedef Status ( *PFNGLXGETTRANSPARENTINDEXSUNPROC) (Display *dpy, Window overlay, Window underlay, unsigned long *pTransparentIndex); #ifdef GLX_GLXEXT_PROTOTYPES -Status glXGetTransparentIndexSUN (Display *dpy, Window overlay, Window underlay, long *pTransparentIndex); +Status glXGetTransparentIndexSUN (Display *dpy, Window overlay, Window underlay, unsigned long *pTransparentIndex); #endif #endif /* GLX_SUN_get_transparent_index */ diff -Nru mesa-19.2.8/include/GL/internal/dri_interface.h mesa-20.0.8/include/GL/internal/dri_interface.h --- mesa-19.2.8/include/GL/internal/dri_interface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GL/internal/dri_interface.h 2020-06-12 01:21:16.000000000 +0000 @@ -634,7 +634,7 @@ * SWRast Loader extension. */ #define __DRI_SWRAST_LOADER "DRI_SWRastLoader" -#define __DRI_SWRAST_LOADER_VERSION 5 +#define __DRI_SWRAST_LOADER_VERSION 6 struct __DRIswrastLoaderExtensionRec { __DRIextension base; @@ -711,6 +711,19 @@ int width, int height, int stride, int shmid, char *shmaddr, unsigned offset, void *loaderPrivate); + + /** + * get shm image to drawable (v2) + * + * There are some cases where GLX can't use SHM, but DRI + * still tries, we need to get a return type for when to + * fallback to the non-shm path. + * + * \since 6 + */ + GLboolean (*getImageShm2)(__DRIdrawable *readable, + int x, int y, int width, int height, + int shmid, void *loaderPrivate); }; /** @@ -809,7 +822,11 @@ #define __DRI_ATTRIB_YINVERTED 47 #define __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE 48 #define __DRI_ATTRIB_MUTABLE_RENDER_BUFFER 49 /* EGL_MUTABLE_RENDER_BUFFER_BIT_KHR */ -#define __DRI_ATTRIB_MAX 50 +#define __DRI_ATTRIB_RED_SHIFT 50 +#define __DRI_ATTRIB_GREEN_SHIFT 51 +#define __DRI_ATTRIB_BLUE_SHIFT 52 +#define __DRI_ATTRIB_ALPHA_SHIFT 53 +#define __DRI_ATTRIB_MAX 54 /* __DRI_ATTRIB_RENDER_TYPE */ #define __DRI_ATTRIB_RGBA_BIT 0x01 @@ -1096,6 +1113,7 @@ * only BGRA ordering can be exposed. */ DRI_LOADER_CAP_RGBA_ORDERING, + DRI_LOADER_CAP_FP16, }; struct __DRIdri2LoaderExtensionRec { @@ -1336,6 +1354,9 @@ #define __DRI_IMAGE_FORMAT_ABGR2101010 0x1011 #define __DRI_IMAGE_FORMAT_SABGR8 0x1012 #define __DRI_IMAGE_FORMAT_UYVY 0x1013 +#define __DRI_IMAGE_FORMAT_XBGR16161616F 0x1014 +#define __DRI_IMAGE_FORMAT_ABGR16161616F 0x1015 +#define __DRI_IMAGE_FORMAT_SXRGB8 0x1016 #define __DRI_IMAGE_USE_SHARE 0x0001 #define __DRI_IMAGE_USE_SCANOUT 0x0002 @@ -1354,54 +1375,16 @@ (__DRI_IMAGE_TRANSFER_READ | __DRI_IMAGE_TRANSFER_WRITE) /** - * Four CC formats that matches with WL_DRM_FORMAT_* from wayland_drm.h, - * GBM_FORMAT_* from gbm.h, and DRM_FORMAT_* from drm_fourcc.h. Used with - * createImageFromNames. + * Extra fourcc formats used internally to Mesa with createImageFromNames. + * The externally-available fourccs are defined by drm_fourcc.h (DRM_FORMAT_*) + * and WL_DRM_FORMAT_* from wayland_drm.h. * * \since 5 */ -#define __DRI_IMAGE_FOURCC_R8 0x20203852 -#define __DRI_IMAGE_FOURCC_GR88 0x38385247 -#define __DRI_IMAGE_FOURCC_ARGB1555 0x35315241 -#define __DRI_IMAGE_FOURCC_R16 0x20363152 -#define __DRI_IMAGE_FOURCC_GR1616 0x32335247 -#define __DRI_IMAGE_FOURCC_RGB565 0x36314752 -#define __DRI_IMAGE_FOURCC_ARGB8888 0x34325241 -#define __DRI_IMAGE_FOURCC_XRGB8888 0x34325258 -#define __DRI_IMAGE_FOURCC_ABGR8888 0x34324241 -#define __DRI_IMAGE_FOURCC_XBGR8888 0x34324258 #define __DRI_IMAGE_FOURCC_SARGB8888 0x83324258 #define __DRI_IMAGE_FOURCC_SABGR8888 0x84324258 -#define __DRI_IMAGE_FOURCC_ARGB2101010 0x30335241 -#define __DRI_IMAGE_FOURCC_XRGB2101010 0x30335258 -#define __DRI_IMAGE_FOURCC_ABGR2101010 0x30334241 -#define __DRI_IMAGE_FOURCC_XBGR2101010 0x30334258 -#define __DRI_IMAGE_FOURCC_RGBA1010102 0x30334152 -#define __DRI_IMAGE_FOURCC_RGBX1010102 0x30335852 -#define __DRI_IMAGE_FOURCC_BGRA1010102 0x30334142 -#define __DRI_IMAGE_FOURCC_BGRX1010102 0x30335842 -#define __DRI_IMAGE_FOURCC_YUV410 0x39565559 -#define __DRI_IMAGE_FOURCC_YUV411 0x31315559 -#define __DRI_IMAGE_FOURCC_YUV420 0x32315559 -#define __DRI_IMAGE_FOURCC_YUV422 0x36315559 -#define __DRI_IMAGE_FOURCC_YUV444 0x34325559 -#define __DRI_IMAGE_FOURCC_NV12 0x3231564e -#define __DRI_IMAGE_FOURCC_NV16 0x3631564e -#define __DRI_IMAGE_FOURCC_YUYV 0x56595559 -#define __DRI_IMAGE_FOURCC_UYVY 0x59565955 -#define __DRI_IMAGE_FOURCC_AYUV 0x56555941 -#define __DRI_IMAGE_FOURCC_XYUV8888 0x56555958 - -#define __DRI_IMAGE_FOURCC_YVU410 0x39555659 -#define __DRI_IMAGE_FOURCC_YVU411 0x31315659 -#define __DRI_IMAGE_FOURCC_YVU420 0x32315659 -#define __DRI_IMAGE_FOURCC_YVU422 0x36315659 -#define __DRI_IMAGE_FOURCC_YVU444 0x34325659 - -#define __DRI_IMAGE_FOURCC_P010 0x30313050 -#define __DRI_IMAGE_FOURCC_P012 0x32313050 -#define __DRI_IMAGE_FOURCC_P016 0x36313050 +#define __DRI_IMAGE_FOURCC_SXRGB8888 0x85324258 /** * Queryable on images created by createImageFromNames. @@ -1547,8 +1530,8 @@ GLboolean (*validateUsage)(__DRIimage *image, unsigned int use); /** - * Unlike createImageFromName __DRI_IMAGE_FORMAT is not but instead - * __DRI_IMAGE_FOURCC and strides are in bytes not pixels. Stride is + * Unlike createImageFromName __DRI_IMAGE_FORMAT is not used but instead + * DRM_FORMAT_*, and strides are in bytes not pixels. Stride is * also per block and not per pixel (for non-RGB, see gallium blocks). * * \since 5 diff -Nru mesa-19.2.8/include/GL/wglext.h mesa-20.0.8/include/GL/wglext.h --- mesa-19.2.8/include/GL/wglext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GL/wglext.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,12 @@ -#ifndef __wglext_h_ -#define __wglext_h_ 1 +#ifndef __wgl_wglext_h_ +#define __wgl_wglext_h_ 1 #ifdef __cplusplus extern "C" { #endif /* -** Copyright (c) 2013-2017 The Khronos Group Inc. +** Copyright (c) 2013-2018 The Khronos Group Inc. ** ** Permission is hereby granted, free of charge, to any person obtaining a ** copy of this software and/or associated documentation files (the @@ -39,7 +39,7 @@ #include #endif -#define WGL_WGLEXT_VERSION 20170817 +#define WGL_WGLEXT_VERSION 20190728 /* Generated C header for: * API: wgl @@ -318,7 +318,7 @@ #define WGL_GPU_NUM_RB_AMD 0x21A7 #define WGL_GPU_NUM_SPI_AMD 0x21A8 typedef UINT (WINAPI * PFNWGLGETGPUIDSAMDPROC) (UINT maxCount, UINT *ids); -typedef INT (WINAPI * PFNWGLGETGPUINFOAMDPROC) (UINT id, int property, GLenum dataType, UINT size, void *data); +typedef INT (WINAPI * PFNWGLGETGPUINFOAMDPROC) (UINT id, INT property, GLenum dataType, UINT size, void *data); typedef UINT (WINAPI * PFNWGLGETCONTEXTGPUIDAMDPROC) (HGLRC hglrc); typedef HGLRC (WINAPI * PFNWGLCREATEASSOCIATEDCONTEXTAMDPROC) (UINT id); typedef HGLRC (WINAPI * PFNWGLCREATEASSOCIATEDCONTEXTATTRIBSAMDPROC) (UINT id, HGLRC hShareContext, const int *attribList); @@ -328,7 +328,7 @@ typedef VOID (WINAPI * PFNWGLBLITCONTEXTFRAMEBUFFERAMDPROC) (HGLRC dstCtx, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter); #ifdef WGL_WGLEXT_PROTOTYPES UINT WINAPI wglGetGPUIDsAMD (UINT maxCount, UINT *ids); -INT WINAPI wglGetGPUInfoAMD (UINT id, int property, GLenum dataType, UINT size, void *data); +INT WINAPI wglGetGPUInfoAMD (UINT id, INT property, GLenum dataType, UINT size, void *data); UINT WINAPI wglGetContextGPUIDAMD (HGLRC hglrc); HGLRC WINAPI wglCreateAssociatedContextAMD (UINT id); HGLRC WINAPI wglCreateAssociatedContextAttribsAMD (UINT id, HGLRC hShareContext, const int *attribList); @@ -344,9 +344,14 @@ #define WGL_TYPE_RGBA_FLOAT_ATI 0x21A0 #endif /* WGL_ATI_pixel_format_float */ +#ifndef WGL_ATI_render_texture_rectangle +#define WGL_ATI_render_texture_rectangle 1 +#define WGL_TEXTURE_RECTANGLE_ATI 0x21A5 +#endif /* WGL_ATI_render_texture_rectangle */ + #ifndef WGL_EXT_colorspace #define WGL_EXT_colorspace 1 -#define WGL_COLORSPACE_EXT 0x3087 +#define WGL_COLORSPACE_EXT 0x309D #define WGL_COLORSPACE_SRGB_EXT 0x3089 #define WGL_COLORSPACE_LINEAR_EXT 0x308A #endif /* WGL_EXT_colorspace */ @@ -710,6 +715,15 @@ #endif #endif /* WGL_NV_gpu_affinity */ +#ifndef WGL_NV_multigpu_context +#define WGL_NV_multigpu_context 1 +#define WGL_CONTEXT_MULTIGPU_ATTRIB_NV 0x20AA +#define WGL_CONTEXT_MULTIGPU_ATTRIB_SINGLE_NV 0x20AB +#define WGL_CONTEXT_MULTIGPU_ATTRIB_AFR_NV 0x20AC +#define WGL_CONTEXT_MULTIGPU_ATTRIB_MULTICAST_NV 0x20AD +#define WGL_CONTEXT_MULTIGPU_ATTRIB_MULTI_DISPLAY_MULTICAST_NV 0x20AE +#endif /* WGL_NV_multigpu_context */ + #ifndef WGL_NV_multisample_coverage #define WGL_NV_multisample_coverage 1 #define WGL_COVERAGE_SAMPLES_NV 0x2042 @@ -720,12 +734,12 @@ #define WGL_NV_present_video 1 DECLARE_HANDLE(HVIDEOOUTPUTDEVICENV); #define WGL_NUM_VIDEO_SLOTS_NV 0x20F0 -typedef int (WINAPI * PFNWGLENUMERATEVIDEODEVICESNVPROC) (HDC hDC, HVIDEOOUTPUTDEVICENV *phDeviceList); -typedef BOOL (WINAPI * PFNWGLBINDVIDEODEVICENVPROC) (HDC hDC, unsigned int uVideoSlot, HVIDEOOUTPUTDEVICENV hVideoDevice, const int *piAttribList); +typedef int (WINAPI * PFNWGLENUMERATEVIDEODEVICESNVPROC) (HDC hDc, HVIDEOOUTPUTDEVICENV *phDeviceList); +typedef BOOL (WINAPI * PFNWGLBINDVIDEODEVICENVPROC) (HDC hDc, unsigned int uVideoSlot, HVIDEOOUTPUTDEVICENV hVideoDevice, const int *piAttribList); typedef BOOL (WINAPI * PFNWGLQUERYCURRENTCONTEXTNVPROC) (int iAttribute, int *piValue); #ifdef WGL_WGLEXT_PROTOTYPES -int WINAPI wglEnumerateVideoDevicesNV (HDC hDC, HVIDEOOUTPUTDEVICENV *phDeviceList); -BOOL WINAPI wglBindVideoDeviceNV (HDC hDC, unsigned int uVideoSlot, HVIDEOOUTPUTDEVICENV hVideoDevice, const int *piAttribList); +int WINAPI wglEnumerateVideoDevicesNV (HDC hDc, HVIDEOOUTPUTDEVICENV *phDeviceList); +BOOL WINAPI wglBindVideoDeviceNV (HDC hDc, unsigned int uVideoSlot, HVIDEOOUTPUTDEVICENV hVideoDevice, const int *piAttribList); BOOL WINAPI wglQueryCurrentContextNV (int iAttribute, int *piValue); #endif #endif /* WGL_NV_present_video */ @@ -830,14 +844,14 @@ typedef BOOL (WINAPI * PFNWGLGETSYNCVALUESOMLPROC) (HDC hdc, INT64 *ust, INT64 *msc, INT64 *sbc); typedef BOOL (WINAPI * PFNWGLGETMSCRATEOMLPROC) (HDC hdc, INT32 *numerator, INT32 *denominator); typedef INT64 (WINAPI * PFNWGLSWAPBUFFERSMSCOMLPROC) (HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder); -typedef INT64 (WINAPI * PFNWGLSWAPLAYERBUFFERSMSCOMLPROC) (HDC hdc, int fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder); +typedef INT64 (WINAPI * PFNWGLSWAPLAYERBUFFERSMSCOMLPROC) (HDC hdc, INT fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder); typedef BOOL (WINAPI * PFNWGLWAITFORMSCOMLPROC) (HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder, INT64 *ust, INT64 *msc, INT64 *sbc); typedef BOOL (WINAPI * PFNWGLWAITFORSBCOMLPROC) (HDC hdc, INT64 target_sbc, INT64 *ust, INT64 *msc, INT64 *sbc); #ifdef WGL_WGLEXT_PROTOTYPES BOOL WINAPI wglGetSyncValuesOML (HDC hdc, INT64 *ust, INT64 *msc, INT64 *sbc); BOOL WINAPI wglGetMscRateOML (HDC hdc, INT32 *numerator, INT32 *denominator); INT64 WINAPI wglSwapBuffersMscOML (HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder); -INT64 WINAPI wglSwapLayerBuffersMscOML (HDC hdc, int fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder); +INT64 WINAPI wglSwapLayerBuffersMscOML (HDC hdc, INT fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder); BOOL WINAPI wglWaitForMscOML (HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder, INT64 *ust, INT64 *msc, INT64 *sbc); BOOL WINAPI wglWaitForSbcOML (HDC hdc, INT64 target_sbc, INT64 *ust, INT64 *msc, INT64 *sbc); #endif diff -Nru mesa-19.2.8/include/GLES2/gl2ext.h mesa-20.0.8/include/GLES2/gl2ext.h --- mesa-19.2.8/include/GLES2/gl2ext.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GLES2/gl2ext.h 2020-06-12 01:21:16.000000000 +0000 @@ -38,7 +38,7 @@ #define GL_APIENTRYP GL_APIENTRY* #endif -/* Generated on date 20190611 */ +/* Generated on date 20190911 */ /* Generated C header for: * API: gles2 @@ -197,6 +197,22 @@ #endif #endif /* GL_KHR_robustness */ +#ifndef GL_KHR_shader_subgroup +#define GL_KHR_shader_subgroup 1 +#define GL_SUBGROUP_SIZE_KHR 0x9532 +#define GL_SUBGROUP_SUPPORTED_STAGES_KHR 0x9533 +#define GL_SUBGROUP_SUPPORTED_FEATURES_KHR 0x9534 +#define GL_SUBGROUP_QUAD_ALL_STAGES_KHR 0x9535 +#define GL_SUBGROUP_FEATURE_BASIC_BIT_KHR 0x00000001 +#define GL_SUBGROUP_FEATURE_VOTE_BIT_KHR 0x00000002 +#define GL_SUBGROUP_FEATURE_ARITHMETIC_BIT_KHR 0x00000004 +#define GL_SUBGROUP_FEATURE_BALLOT_BIT_KHR 0x00000008 +#define GL_SUBGROUP_FEATURE_SHUFFLE_BIT_KHR 0x00000010 +#define GL_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT_KHR 0x00000020 +#define GL_SUBGROUP_FEATURE_CLUSTERED_BIT_KHR 0x00000040 +#define GL_SUBGROUP_FEATURE_QUAD_BIT_KHR 0x00000080 +#endif /* GL_KHR_shader_subgroup */ + #ifndef GL_KHR_texture_compression_astc_hdr #define GL_KHR_texture_compression_astc_hdr 1 #define GL_COMPRESSED_RGBA_ASTC_4x4_KHR 0x93B0 @@ -2439,6 +2455,12 @@ #ifndef GL_MESA_framebuffer_flip_y #define GL_MESA_framebuffer_flip_y 1 #define GL_FRAMEBUFFER_FLIP_Y_MESA 0x8BBB +typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERPARAMETERIMESAPROC) (GLenum target, GLenum pname, GLint param); +typedef void (GL_APIENTRYP PFNGLGETFRAMEBUFFERPARAMETERIVMESAPROC) (GLenum target, GLenum pname, GLint *params); +#ifdef GL_GLEXT_PROTOTYPES +GL_APICALL void GL_APIENTRY glFramebufferParameteriMESA (GLenum target, GLenum pname, GLint param); +GL_APICALL void GL_APIENTRY glGetFramebufferParameterivMESA (GLenum target, GLenum pname, GLint *params); +#endif #endif /* GL_MESA_framebuffer_flip_y */ #ifndef GL_MESA_program_binary_formats @@ -3492,6 +3514,11 @@ #define GL_NV_shader_noperspective_interpolation 1 #endif /* GL_NV_shader_noperspective_interpolation */ +#ifndef GL_NV_shader_subgroup_partitioned +#define GL_NV_shader_subgroup_partitioned 1 +#define GL_SUBGROUP_FEATURE_PARTITIONED_BIT_NV 0x00000100 +#endif /* GL_NV_shader_subgroup_partitioned */ + #ifndef GL_NV_shader_texture_footprint #define GL_NV_shader_texture_footprint 1 #endif /* GL_NV_shader_texture_footprint */ diff -Nru mesa-19.2.8/include/GLES2/gl2.h mesa-20.0.8/include/GLES2/gl2.h --- mesa-19.2.8/include/GLES2/gl2.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/GLES2/gl2.h 2020-06-12 01:21:16.000000000 +0000 @@ -44,7 +44,7 @@ #define GL_GLES_PROTOTYPES 1 #endif -/* Generated on date 20190611 */ +/* Generated on date 20190911 */ /* Generated C header for: * API: gles2 diff -Nru mesa-19.2.8/include/meson.build mesa-20.0.8/include/meson.build --- mesa-19.2.8/include/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -22,7 +22,7 @@ inc_d3d9 = include_directories('D3D9') inc_haikugl = include_directories('HaikuGL') -if not glvnd_has_headers_and_pc_files +if not with_glvnd if with_gles1 or with_gles2 or with_opengl or with_egl install_headers('KHR/khrplatform.h', subdir : 'KHR') endif @@ -127,6 +127,7 @@ 'CL/cl_ext_intel.h', 'CL/cl_gl.h', 'CL/cl_gl_ext.h', + 'CL/cl_icd.h', 'CL/cl_platform.h', 'CL/cl_va_api_media_sharing_intel.h', 'CL/cl_version.h', diff -Nru mesa-19.2.8/include/pci_ids/i965_pci_ids.h mesa-20.0.8/include/pci_ids/i965_pci_ids.h --- mesa-19.2.8/include/pci_ids/i965_pci_ids.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/pci_ids/i965_pci_ids.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,241 +1,267 @@ -#ifndef IRIS -CHIPSET(0x29A2, i965, "Intel(R) 965G") -CHIPSET(0x2992, i965, "Intel(R) 965Q") -CHIPSET(0x2982, i965, "Intel(R) 965G") -CHIPSET(0x2972, i965, "Intel(R) 946GZ") -CHIPSET(0x2A02, i965, "Intel(R) 965GM") -CHIPSET(0x2A12, i965, "Intel(R) 965GME/GLE") -CHIPSET(0x2A42, g4x, "Mobile Intel® GM45 Express Chipset") -CHIPSET(0x2E02, g4x, "Intel(R) Integrated Graphics Device") -CHIPSET(0x2E12, g4x, "Intel(R) Q45/Q43") -CHIPSET(0x2E22, g4x, "Intel(R) G45/G43") -CHIPSET(0x2E32, g4x, "Intel(R) G41") -CHIPSET(0x2E42, g4x, "Intel(R) B43") -CHIPSET(0x2E92, g4x, "Intel(R) B43") -CHIPSET(0x0042, ilk, "Intel(R) Ironlake Desktop") -CHIPSET(0x0046, ilk, "Intel(R) Ironlake Mobile") -CHIPSET(0x0102, snb_gt1, "Intel(R) Sandybridge Desktop") -CHIPSET(0x0112, snb_gt2, "Intel(R) Sandybridge Desktop") -CHIPSET(0x0122, snb_gt2, "Intel(R) Sandybridge Desktop") -CHIPSET(0x0106, snb_gt1, "Intel(R) Sandybridge Mobile") -CHIPSET(0x0116, snb_gt2, "Intel(R) Sandybridge Mobile") -CHIPSET(0x0126, snb_gt2, "Intel(R) Sandybridge Mobile") -CHIPSET(0x010A, snb_gt1, "Intel(R) Sandybridge Server") -CHIPSET(0x0152, ivb_gt1, "Intel(R) Ivybridge Desktop") -CHIPSET(0x0162, ivb_gt2, "Intel(R) Ivybridge Desktop") -CHIPSET(0x0156, ivb_gt1, "Intel(R) Ivybridge Mobile") -CHIPSET(0x0166, ivb_gt2, "Intel(R) Ivybridge Mobile") -CHIPSET(0x015a, ivb_gt1, "Intel(R) Ivybridge Server") -CHIPSET(0x016a, ivb_gt2, "Intel(R) Ivybridge Server") -CHIPSET(0x0402, hsw_gt1, "Intel(R) Haswell Desktop") -CHIPSET(0x0412, hsw_gt2, "Intel(R) Haswell Desktop") -CHIPSET(0x0422, hsw_gt3, "Intel(R) Haswell Desktop") -CHIPSET(0x0406, hsw_gt1, "Intel(R) Haswell Mobile") -CHIPSET(0x0416, hsw_gt2, "Intel(R) Haswell Mobile") -CHIPSET(0x0426, hsw_gt3, "Intel(R) Haswell Mobile") -CHIPSET(0x040A, hsw_gt1, "Intel(R) Haswell Server") -CHIPSET(0x041A, hsw_gt2, "Intel(R) Haswell Server") -CHIPSET(0x042A, hsw_gt3, "Intel(R) Haswell Server") -CHIPSET(0x040B, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x041B, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x042B, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x040E, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x041E, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x042E, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0C02, hsw_gt1, "Intel(R) Haswell Desktop") -CHIPSET(0x0C12, hsw_gt2, "Intel(R) Haswell Desktop") -CHIPSET(0x0C22, hsw_gt3, "Intel(R) Haswell Desktop") -CHIPSET(0x0C06, hsw_gt1, "Intel(R) Haswell Mobile") -CHIPSET(0x0C16, hsw_gt2, "Intel(R) Haswell Mobile") -CHIPSET(0x0C26, hsw_gt3, "Intel(R) Haswell Mobile") -CHIPSET(0x0C0A, hsw_gt1, "Intel(R) Haswell Server") -CHIPSET(0x0C1A, hsw_gt2, "Intel(R) Haswell Server") -CHIPSET(0x0C2A, hsw_gt3, "Intel(R) Haswell Server") -CHIPSET(0x0C0B, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0C1B, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0C2B, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0C0E, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0C1E, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0C2E, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0A02, hsw_gt1, "Intel(R) Haswell Desktop") -CHIPSET(0x0A12, hsw_gt2, "Intel(R) Haswell Desktop") -CHIPSET(0x0A22, hsw_gt3, "Intel(R) Haswell Desktop") -CHIPSET(0x0A06, hsw_gt1, "Intel(R) Haswell Mobile") -CHIPSET(0x0A16, hsw_gt2, "Intel(R) Haswell Mobile") -CHIPSET(0x0A26, hsw_gt3, "Intel(R) Haswell Mobile") -CHIPSET(0x0A0A, hsw_gt1, "Intel(R) Haswell Server") -CHIPSET(0x0A1A, hsw_gt2, "Intel(R) Haswell Server") -CHIPSET(0x0A2A, hsw_gt3, "Intel(R) Haswell Server") -CHIPSET(0x0A0B, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0A1B, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0A2B, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0A0E, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0A1E, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0A2E, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0D02, hsw_gt1, "Intel(R) Haswell Desktop") -CHIPSET(0x0D12, hsw_gt2, "Intel(R) Haswell Desktop") -CHIPSET(0x0D22, hsw_gt3, "Intel(R) Haswell Desktop") -CHIPSET(0x0D06, hsw_gt1, "Intel(R) Haswell Mobile") -CHIPSET(0x0D16, hsw_gt2, "Intel(R) Haswell Mobile") -CHIPSET(0x0D26, hsw_gt3, "Intel(R) Haswell Mobile") -CHIPSET(0x0D0A, hsw_gt1, "Intel(R) Haswell Server") -CHIPSET(0x0D1A, hsw_gt2, "Intel(R) Haswell Server") -CHIPSET(0x0D2A, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0D0B, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0D1B, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0D2B, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0D0E, hsw_gt1, "Intel(R) Haswell") -CHIPSET(0x0D1E, hsw_gt2, "Intel(R) Haswell") -CHIPSET(0x0D2E, hsw_gt3, "Intel(R) Haswell") -CHIPSET(0x0F31, byt, "Intel(R) Bay Trail") -CHIPSET(0x0F32, byt, "Intel(R) Bay Trail") -CHIPSET(0x0F33, byt, "Intel(R) Bay Trail") -CHIPSET(0x0157, byt, "Intel(R) Bay Trail") -CHIPSET(0x0155, byt, "Intel(R) Bay Trail") -CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherrytrail)") -CHIPSET(0x22B1, chv, "Intel(R) HD Graphics XXX (Braswell)") /* Overridden in brw_get_renderer_string */ -CHIPSET(0x22B2, chv, "Intel(R) HD Graphics (Cherryview)") -CHIPSET(0x22B3, chv, "Intel(R) HD Graphics (Cherryview)") +CHIPSET(0x29A2, i965, "BW", "Intel(R) 965G") +CHIPSET(0x2992, i965, "BW", "Intel(R) 965Q") +CHIPSET(0x2982, i965, "BW", "Intel(R) 965G") +CHIPSET(0x2972, i965, "BW", "Intel(R) 946GZ") +CHIPSET(0x2A02, i965, "CL", "Intel(R) 965GM") +CHIPSET(0x2A12, i965, "CL", "Intel(R) 965GME/GLE") + +CHIPSET(0x2A42, g4x, "CTG", "Mobile Intel® GM45 Express Chipset") +CHIPSET(0x2E02, g4x, "ELK", "Intel(R) Integrated Graphics Device") +CHIPSET(0x2E12, g4x, "ELK", "Intel(R) Q45/Q43") +CHIPSET(0x2E22, g4x, "ELK", "Intel(R) G45/G43") +CHIPSET(0x2E32, g4x, "ELK", "Intel(R) G41") +CHIPSET(0x2E42, g4x, "ELK", "Intel(R) B43") +CHIPSET(0x2E92, g4x, "ELK", "Intel(R) B43") + +CHIPSET(0x0042, ilk, "ILK", "Intel(R) HD Graphics") +CHIPSET(0x0046, ilk, "ILK", "Intel(R) HD Graphics") + +CHIPSET(0x0102, snb_gt1, "SNB GT1", "Intel(R) HD Graphics 2000") +CHIPSET(0x0112, snb_gt2, "SNB GT2", "Intel(R) HD Graphics 3000") +CHIPSET(0x0122, snb_gt2, "SNB GT2", "Intel(R) HD Graphics 3000") +CHIPSET(0x0106, snb_gt1, "SNB GT1", "Intel(R) HD Graphics 2000") +CHIPSET(0x0116, snb_gt2, "SNB GT2", "Intel(R) HD Graphics 3000") +CHIPSET(0x0126, snb_gt2, "SNB GT2", "Intel(R) HD Graphics 3000") +CHIPSET(0x010A, snb_gt1, "SNB GT1", "Intel(R) HD Graphics 2000") + +CHIPSET(0x0152, ivb_gt1, "IVB GT1", "Intel(R) HD Graphics 2500") +CHIPSET(0x0162, ivb_gt2, "IVB GT2", "Intel(R) HD Graphics 4000") +CHIPSET(0x0156, ivb_gt1, "IVB GT1", "Intel(R) HD Graphics 2500") +CHIPSET(0x0166, ivb_gt2, "IVB GT2", "Intel(R) HD Graphics 4000") +CHIPSET(0x015a, ivb_gt1, "IVB GT1", "Intel(R) HD Graphics") +CHIPSET(0x016a, ivb_gt2, "IVB GT2", "Intel(R) HD Graphics P4000") + +CHIPSET(0x0402, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0412, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4600") +CHIPSET(0x0422, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0406, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0416, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4600") +CHIPSET(0x0426, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x040A, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x041A, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics P4600/P4700") +CHIPSET(0x042A, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x040B, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x041B, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x042B, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x040E, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x041E, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4400") +CHIPSET(0x042E, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0C02, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0C12, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0C22, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0C06, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0C16, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0C26, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0C0A, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0C1A, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0C2A, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0C0B, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0C1B, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0C2B, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0C0E, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0C1E, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0C2E, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0A02, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0A12, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0A22, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0A06, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0A16, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4400") +CHIPSET(0x0A26, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics 5000") +CHIPSET(0x0A0A, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0A1A, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0A2A, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0A0B, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0A1B, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0A2B, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0A0E, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0A1E, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4200") +CHIPSET(0x0A2E, hsw_gt3, "HSW GT3", "Intel(R) Iris(R) Graphics 5100") +CHIPSET(0x0D02, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0D12, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics 4600") +CHIPSET(0x0D22, hsw_gt3, "HSW GT3", "Intel(R) Iris(R) Pro Graphics 5200") +CHIPSET(0x0D06, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0D16, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0D26, hsw_gt3, "HSW GT3", "Intel(R) Iris(R) Pro Graphics P5200") +CHIPSET(0x0D0A, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0D1A, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0D2A, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0D0B, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0D1B, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0D2B, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") +CHIPSET(0x0D0E, hsw_gt1, "HSW GT1", "Intel(R) HD Graphics") +CHIPSET(0x0D1E, hsw_gt2, "HSW GT2", "Intel(R) HD Graphics") +CHIPSET(0x0D2E, hsw_gt3, "HSW GT3", "Intel(R) HD Graphics") + +CHIPSET(0x0F31, byt, "BYT", "Intel(R) HD Graphics") +CHIPSET(0x0F32, byt, "BYT", "Intel(R) HD Graphics") +CHIPSET(0x0F33, byt, "BYT", "Intel(R) HD Graphics") +CHIPSET(0x0157, byt, "BYT", "Intel(R) HD Graphics") +CHIPSET(0x0155, byt, "BYT", "Intel(R) HD Graphics") + +CHIPSET(0x22B0, chv, "CHV", "Intel(R) HD Graphics") +CHIPSET(0x22B1, chv, "BSW", "Intel(R) HD Graphics XXX") /* Overridden in brw_get_renderer_string */ +CHIPSET(0x22B2, chv, "CHV", "Intel(R) HD Graphics") +CHIPSET(0x22B3, chv, "CHV", "Intel(R) HD Graphics") + +#ifndef PREFER_IRIS +CHIPSET(0x1602, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x1606, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x160A, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x160B, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x160D, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x160E, bdw_gt1, "BDW GT1", "Intel(R) HD Graphics") +CHIPSET(0x1612, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics 5600") +CHIPSET(0x1616, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics 5500") +CHIPSET(0x161A, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics P5700") +CHIPSET(0x161B, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics") +CHIPSET(0x161D, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics") +CHIPSET(0x161E, bdw_gt2, "BDW GT2", "Intel(R) HD Graphics 5300") +CHIPSET(0x1622, bdw_gt3, "BDW GT3", "Intel(R) Iris(R) Pro Graphics 6200") +CHIPSET(0x1626, bdw_gt3, "BDW GT3", "Intel(R) HD Graphics 6000") +CHIPSET(0x162A, bdw_gt3, "BDW GT3", "Intel(R) Iris(R) Pro Graphics P6300") +CHIPSET(0x162B, bdw_gt3, "BDW GT3", "Intel(R) Iris(R) Graphics 6100") +CHIPSET(0x162D, bdw_gt3, "BDW GT3", "Intel(R) HD Graphics") +CHIPSET(0x162E, bdw_gt3, "BDW GT3", "Intel(R) HD Graphics") + +CHIPSET(0x1902, skl_gt1, "SKL GT1", "Intel(R) HD Graphics 510") +CHIPSET(0x1906, skl_gt1, "SKL GT1", "Intel(R) HD Graphics 510") +CHIPSET(0x190A, skl_gt1, "SKL GT1", "Intel(R) HD Graphics") +CHIPSET(0x190B, skl_gt1, "SKL GT1", "Intel(R) HD Graphics 510") +CHIPSET(0x190E, skl_gt1, "SKL GT1", "Intel(R) HD Graphics") +CHIPSET(0x1912, skl_gt2, "SKL GT2", "Intel(R) HD Graphics 530") +CHIPSET(0x1913, skl_gt2, "SKL GT2F", "Intel(R) HD Graphics") +CHIPSET(0x1915, skl_gt2, "SKL GT2F", "Intel(R) HD Graphics") +CHIPSET(0x1916, skl_gt2, "SKL GT2", "Intel(R) HD Graphics 520") +CHIPSET(0x1917, skl_gt2, "SKL GT2F", "Intel(R) HD Graphics") +CHIPSET(0x191A, skl_gt2, "SKL GT2", "Intel(R) HD Graphics") +CHIPSET(0x191B, skl_gt2, "SKL GT2", "Intel(R) HD Graphics 530") +CHIPSET(0x191D, skl_gt2, "SKL GT2", "Intel(R) HD Graphics P530") +CHIPSET(0x191E, skl_gt2, "SKL GT2", "Intel(R) HD Graphics 515") +CHIPSET(0x1921, skl_gt2, "SKL GT2", "Intel(R) HD Graphics 520") +CHIPSET(0x1923, skl_gt3, "SKL GT3", "Intel(R) HD Graphics 535") +CHIPSET(0x1926, skl_gt3, "SKL GT3", "Intel(R) Iris(R) Graphics 540") +CHIPSET(0x1927, skl_gt3, "SKL GT3", "Intel(R) Iris(R) Graphics 550") +CHIPSET(0x192A, skl_gt4, "SKL GT4", "Intel(R) HD Graphics") +CHIPSET(0x192B, skl_gt3, "SKL GT3", "Intel(R) Iris(R) Graphics 555") +CHIPSET(0x192D, skl_gt3, "SKL GT3", "Intel(R) Iris(R) Graphics P555") +CHIPSET(0x1932, skl_gt4, "SKL GT4", "Intel(R) Iris(R) Pro Graphics 580") +CHIPSET(0x193A, skl_gt4, "SKL GT4", "Intel(R) Iris(R) Pro Graphics P580") +CHIPSET(0x193B, skl_gt4, "SKL GT4", "Intel(R) Iris(R) Pro Graphics 580") +CHIPSET(0x193D, skl_gt4, "SKL GT4", "Intel(R) Iris(R) Pro Graphics P580") + +CHIPSET(0x0A84, bxt, "BXT 3", "Intel(R) HD Graphics") +CHIPSET(0x1A84, bxt, "BXT 3", "Intel(R) HD Graphics") +CHIPSET(0x1A85, bxt_2x6, "BXT 2", "Intel(R) HD Graphics") +CHIPSET(0x5A84, bxt, "APL 3", "Intel(R) HD Graphics 505") +CHIPSET(0x5A85, bxt_2x6, "APL 2", "Intel(R) HD Graphics 500") + +CHIPSET(0x3184, glk, "GLK 3", "Intel(R) UHD Graphics 605") +CHIPSET(0x3185, glk_2x6, "GLK 2", "Intel(R) UHD Graphics 600") + +CHIPSET(0x5902, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics 610") +CHIPSET(0x5906, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics 610") +CHIPSET(0x590A, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics") +CHIPSET(0x5908, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics") +CHIPSET(0x590B, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics 610") +CHIPSET(0x590E, kbl_gt1, "KBL GT1", "Intel(R) HD Graphics") +CHIPSET(0x5913, kbl_gt1_5, "KBL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x5915, kbl_gt1_5, "KBL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x5917, kbl_gt2, "KBL GT2", "Intel(R) UHD Graphics 620") +CHIPSET(0x5912, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics 630") +CHIPSET(0x5916, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics 620") +CHIPSET(0x591A, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics P630") +CHIPSET(0x591B, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics 630") +CHIPSET(0x591D, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics P630") +CHIPSET(0x591E, kbl_gt2, "KBL GT2", "Intel(R) HD Graphics 615") +CHIPSET(0x5921, kbl_gt2, "KBL GT2F", "Intel(R) HD Graphics 620") +CHIPSET(0x5923, kbl_gt3, "KBL GT3", "Intel(R) HD Graphics 635") +CHIPSET(0x5926, kbl_gt3, "KBL GT3", "Intel(R) Iris(R) Plus Graphics 640 (Kaby Lake GT3e)") +CHIPSET(0x5927, kbl_gt3, "KBL GT3", "Intel(R) Iris(R) Plus Graphics 650 (Kaby Lake GT3e)") +CHIPSET(0x593B, kbl_gt4, "KBL GT4", "Intel(R) HD Graphics") + +CHIPSET(0x591C, kbl_gt2, "AML-KBL", "Intel(R) UHD Graphics 615") +CHIPSET(0x87C0, kbl_gt2, "AML-KBL", "Intel(R) UHD Graphics 617") + +CHIPSET(0x87CA, cfl_gt2, "AML-CFL", "Intel(R) UHD Graphics") + +CHIPSET(0x3E90, cfl_gt1, "CFL GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x3E93, cfl_gt1, "CFL GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x3E99, cfl_gt1, "CFL GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x3E9C, cfl_gt1, "CFL GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x3E91, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x3E92, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x3E96, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics P630") +CHIPSET(0x3E98, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x3E9A, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics P630") +CHIPSET(0x3E9B, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x3E94, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics P630") +CHIPSET(0x3EA9, cfl_gt2, "CFL GT2", "Intel(R) UHD Graphics 620") +CHIPSET(0x3EA5, cfl_gt3, "CFL GT3", "Intel(R) Iris(R) Plus Graphics 655") +CHIPSET(0x3EA6, cfl_gt3, "CFL GT3", "Intel(R) Iris(R) Plus Graphics 645") +CHIPSET(0x3EA7, cfl_gt3, "CFL GT3", "Intel(R) HD Graphics") +CHIPSET(0x3EA8, cfl_gt3, "CFL GT3", "Intel(R) Iris(R) Plus Graphics 655") + +CHIPSET(0x3EA1, cfl_gt1, "WHL GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x3EA4, cfl_gt1, "WHL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x3EA0, cfl_gt2, "WHL GT2", "Intel(R) UHD Graphics 620") +CHIPSET(0x3EA3, cfl_gt2, "WHL GT2", "Intel(R) UHD Graphics") +CHIPSET(0x3EA2, cfl_gt3, "WHL GT3", "Intel(R) UHD Graphics") + +CHIPSET(0x9B21, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BA0, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BA2, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BA4, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BA5, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x9BA8, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics 610") +CHIPSET(0x9BAA, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BAB, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9BAC, cfl_gt1, "CML GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9B41, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BC0, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BC2, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BC4, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BC5, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x9BC6, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics P630") +CHIPSET(0x9BC8, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics 630") +CHIPSET(0x9BCA, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BCB, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BCC, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9BE6, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics P630") +CHIPSET(0x9BF6, cfl_gt2, "CML GT2", "Intel(R) UHD Graphics P630") + +CHIPSET(0x5A49, cnl_gt0_5, "CNL GT0.5", "Intel(R) HD Graphics") +CHIPSET(0x5A4A, cnl_gt0_5, "CNL GT0.5", "Intel(R) HD Graphics") +CHIPSET(0x5A41, cnl_gt1, "CNL GT1", "Intel(R) HD Graphics") +CHIPSET(0x5A42, cnl_gt1, "CNL GT1", "Intel(R) HD Graphics") +CHIPSET(0x5A44, cnl_gt1, "CNL GT1", "Intel(R) HD Graphics") +CHIPSET(0x5A59, cnl_gt1_5, "CNL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x5A5A, cnl_gt1_5, "CNL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x5A5C, cnl_gt1_5, "CNL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x5A50, cnl_gt2, "CNL GT2", "Intel(R) HD Graphics") +CHIPSET(0x5A51, cnl_gt2, "CNL GT2", "Intel(R) HD Graphics") +CHIPSET(0x5A52, cnl_gt2, "CNL GT2", "Intel(R) HD Graphics") +CHIPSET(0x5A54, cnl_gt2, "CNL GT2", "Intel(R) HD Graphics") + +CHIPSET(0x8A50, icl_gt2, "ICL GT2", "Intel(R) HD Graphics") +CHIPSET(0x8A51, icl_gt2, "ICL GT2", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A52, icl_gt2, "ICL GT2", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A53, icl_gt2, "ICL GT2", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A54, icl_gt1_5, "ICL GT1.5", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A56, icl_gt1, "ICL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x8A57, icl_gt1_5, "ICL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x8A58, icl_gt1, "ICL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x8A59, icl_gt1_5, "ICL GT1.5", "Intel(R) HD Graphics") +CHIPSET(0x8A5A, icl_gt1_5, "ICL GT1.5", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A5B, icl_gt1, "ICL GT1", "Intel(R) HD Graphics") +CHIPSET(0x8A5C, icl_gt1_5, "ICL GT1.5", "Intel(R) Iris(R) Plus Graphics") +CHIPSET(0x8A5D, icl_gt1, "ICL GT1", "Intel(R) HD Graphics") +CHIPSET(0x8A71, icl_gt0_5, "ICL GT0.5", "Intel(R) HD Graphics") + +CHIPSET(0x4500, ehl_7, "EHL", "Intel(R) UHD Graphics") +CHIPSET(0x4571, ehl_7, "EHL", "Intel(R) UHD Graphics") +CHIPSET(0x4551, ehl_5, "EHL", "Intel(R) UHD Graphics") +CHIPSET(0x4541, ehl_4, "EHL", "Intel(R) UHD Graphics") +CHIPSET(0x4E51, ehl_5, "JSL", "Intel(R) UHD Graphics") +CHIPSET(0x4E61, ehl_6, "JSL", "Intel(R) UHD Graphics") +CHIPSET(0x4E71, ehl_7, "JSL", "Intel(R) UHD Graphics") #endif -CHIPSET(0x1602, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x1606, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x160A, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x160B, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x160D, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x160E, bdw_gt1, "Intel(R) Broadwell GT1") -CHIPSET(0x1612, bdw_gt2, "Intel(R) HD Graphics 5600 (Broadwell GT2)") -CHIPSET(0x1616, bdw_gt2, "Intel(R) HD Graphics 5500 (Broadwell GT2)") -CHIPSET(0x161A, bdw_gt2, "Intel(R) Broadwell GT2") -CHIPSET(0x161B, bdw_gt2, "Intel(R) Broadwell GT2") -CHIPSET(0x161D, bdw_gt2, "Intel(R) Broadwell GT2") -CHIPSET(0x161E, bdw_gt2, "Intel(R) HD Graphics 5300 (Broadwell GT2)") -CHIPSET(0x1622, bdw_gt3, "Intel(R) Iris Pro 6200 (Broadwell GT3e)") -CHIPSET(0x1626, bdw_gt3, "Intel(R) HD Graphics 6000 (Broadwell GT3)") -CHIPSET(0x162A, bdw_gt3, "Intel(R) Iris Pro P6300 (Broadwell GT3e)") -CHIPSET(0x162B, bdw_gt3, "Intel(R) Iris 6100 (Broadwell GT3)") -CHIPSET(0x162D, bdw_gt3, "Intel(R) Broadwell GT3") -CHIPSET(0x162E, bdw_gt3, "Intel(R) Broadwell GT3") -CHIPSET(0x1902, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)") -CHIPSET(0x1906, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)") -CHIPSET(0x190A, skl_gt1, "Intel(R) Skylake GT1") -CHIPSET(0x190B, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)") -CHIPSET(0x190E, skl_gt1, "Intel(R) Skylake GT1") -CHIPSET(0x1912, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)") -CHIPSET(0x1913, skl_gt2, "Intel(R) Skylake GT2f") -CHIPSET(0x1915, skl_gt2, "Intel(R) Skylake GT2f") -CHIPSET(0x1916, skl_gt2, "Intel(R) HD Graphics 520 (Skylake GT2)") -CHIPSET(0x1917, skl_gt2, "Intel(R) Skylake GT2f") -CHIPSET(0x191A, skl_gt2, "Intel(R) Skylake GT2") -CHIPSET(0x191B, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)") -CHIPSET(0x191D, skl_gt2, "Intel(R) HD Graphics P530 (Skylake GT2)") -CHIPSET(0x191E, skl_gt2, "Intel(R) HD Graphics 515 (Skylake GT2)") -CHIPSET(0x1921, skl_gt2, "Intel(R) HD Graphics 520 (Skylake GT2)") -CHIPSET(0x1923, skl_gt3, "Intel(R) Skylake GT3e") -CHIPSET(0x1926, skl_gt3, "Intel(R) Iris Graphics 540 (Skylake GT3e)") -CHIPSET(0x1927, skl_gt3, "Intel(R) Iris Graphics 550 (Skylake GT3e)") -CHIPSET(0x192A, skl_gt4, "Intel(R) Skylake GT4") -CHIPSET(0x192B, skl_gt3, "Intel(R) Iris Graphics 555 (Skylake GT3e)") -CHIPSET(0x192D, skl_gt3, "Intel(R) Iris Graphics P555 (Skylake GT3e)") -CHIPSET(0x1932, skl_gt4, "Intel(R) Iris Pro Graphics 580 (Skylake GT4e)") -CHIPSET(0x193A, skl_gt4, "Intel(R) Iris Pro Graphics P580 (Skylake GT4e)") -CHIPSET(0x193B, skl_gt4, "Intel(R) Iris Pro Graphics 580 (Skylake GT4e)") -CHIPSET(0x193D, skl_gt4, "Intel(R) Iris Pro Graphics P580 (Skylake GT4e)") -CHIPSET(0x0A84, bxt, "Intel(R) HD Graphics (Broxton)") -CHIPSET(0x1A84, bxt, "Intel(R) HD Graphics (Broxton)") -CHIPSET(0x1A85, bxt_2x6, "Intel(R) HD Graphics (Broxton 2x6)") -CHIPSET(0x5A84, bxt, "Intel(R) HD Graphics 505 (Broxton)") -CHIPSET(0x5A85, bxt_2x6, "Intel(R) HD Graphics 500 (Broxton 2x6)") -CHIPSET(0x5902, kbl_gt1, "Intel(R) HD Graphics 610 (Kaby Lake GT1)") -CHIPSET(0x5906, kbl_gt1, "Intel(R) HD Graphics 610 (Kaby Lake GT1)") -CHIPSET(0x590A, kbl_gt1, "Intel(R) Kabylake GT1") -CHIPSET(0x5908, kbl_gt1, "Intel(R) Kabylake GT1") -CHIPSET(0x590B, kbl_gt1, "Intel(R) Kabylake GT1") -CHIPSET(0x590E, kbl_gt1, "Intel(R) Kabylake GT1") -CHIPSET(0x5913, kbl_gt1_5, "Intel(R) Kabylake GT1.5") -CHIPSET(0x5915, kbl_gt1_5, "Intel(R) Kabylake GT1.5") -CHIPSET(0x5917, kbl_gt2, "Intel(R) UHD Graphics 620 (Kabylake GT2)") -CHIPSET(0x5912, kbl_gt2, "Intel(R) HD Graphics 630 (Kaby Lake GT2)") -CHIPSET(0x5916, kbl_gt2, "Intel(R) HD Graphics 620 (Kaby Lake GT2)") -CHIPSET(0x591A, kbl_gt2, "Intel(R) HD Graphics P630 (Kaby Lake GT2)") -CHIPSET(0x591B, kbl_gt2, "Intel(R) HD Graphics 630 (Kaby Lake GT2)") -CHIPSET(0x591D, kbl_gt2, "Intel(R) HD Graphics P630 (Kaby Lake GT2)") -CHIPSET(0x591E, kbl_gt2, "Intel(R) HD Graphics 615 (Kaby Lake GT2)") -CHIPSET(0x5921, kbl_gt2, "Intel(R) Kabylake GT2F") -CHIPSET(0x5923, kbl_gt3, "Intel(R) Kabylake GT3") -CHIPSET(0x5926, kbl_gt3, "Intel(R) Iris Plus Graphics 640 (Kaby Lake GT3e)") -CHIPSET(0x5927, kbl_gt3, "Intel(R) Iris Plus Graphics 650 (Kaby Lake GT3e)") -CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4") -CHIPSET(0x591C, kbl_gt2, "Intel(R) Amber Lake (Kabylake) GT2") -CHIPSET(0x87C0, kbl_gt2, "Intel(R) Amber Lake (Kabylake) GT2") -CHIPSET(0x87CA, cfl_gt2, "Intel(R) Amber Lake (Coffeelake) GT2") -CHIPSET(0x3184, glk, "Intel(R) UHD Graphics 605 (Geminilake)") -CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)") -CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") -CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") -CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") -CHIPSET(0x3E9C, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") -CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") -CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") -CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") -CHIPSET(0x3E98, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") -CHIPSET(0x3E9A, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") -CHIPSET(0x3E9B, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") -CHIPSET(0x3E94, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") -CHIPSET(0x3EA9, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") -CHIPSET(0x3EA5, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)") -CHIPSET(0x3EA6, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)") -CHIPSET(0x3EA7, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)") -CHIPSET(0x3EA8, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)") -CHIPSET(0x3EA1, cfl_gt1, "Intel(R) UHD Graphics (Whiskey Lake 2x6 GT1)") -CHIPSET(0x3EA4, cfl_gt1, "Intel(R) UHD Graphics (Whiskey Lake 3x8 GT1)") -CHIPSET(0x3EA0, cfl_gt2, "Intel(R) UHD Graphics (Whiskey Lake 3x8 GT2)") -CHIPSET(0x3EA3, cfl_gt2, "Intel(R) UHD Graphics (Whiskey Lake 3x8 GT2)") -CHIPSET(0x3EA2, cfl_gt3, "Intel(R) UHD Graphics (Whiskey Lake 3x8 GT3)") -CHIPSET(0x9B21, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BA0, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BA2, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BA4, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BA5, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BA8, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BAA, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BAB, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9BAC, cfl_gt1, "Intel(R) UHD Graphics (Comet Lake 2x6 GT1)") -CHIPSET(0x9B41, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BC0, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BC2, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BC4, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BC5, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BC8, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BCA, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BCB, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x9BCC, cfl_gt2, "Intel(R) UHD Graphics (Comet Lake 3x8 GT2)") -CHIPSET(0x5A49, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)") -CHIPSET(0x5A4A, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)") -CHIPSET(0x5A41, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)") -CHIPSET(0x5A42, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)") -CHIPSET(0x5A44, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)") -CHIPSET(0x5A59, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)") -CHIPSET(0x5A5A, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)") -CHIPSET(0x5A5C, cnl_4x8, "Intel(R) HD Graphics (Cannonlake 4x8 GT1.5)") -CHIPSET(0x5A50, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") -CHIPSET(0x5A51, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") -CHIPSET(0x5A52, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") -CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") -CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") -CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") -CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") -CHIPSET(0x8A53, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") -CHIPSET(0x8A54, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") -CHIPSET(0x8A5D, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") -CHIPSET(0x8A71, icl_1x8, "Intel(R) HD Graphics (Ice Lake 1x8 GT0.5)") -CHIPSET(0x4500, ehl_4x8, "Intel(R) HD Graphics (Elkhart Lake 4x8)") -CHIPSET(0x4571, ehl_4x8, "Intel(R) HD Graphics (Elkhart Lake 4x8)") -CHIPSET(0x4551, ehl_4x4, "Intel(R) HD Graphics (Elkhart Lake 4x4)") -CHIPSET(0x4541, ehl_2x4, "Intel(R) HD Graphics (Elkhart Lake 2x4)") diff -Nru mesa-19.2.8/include/pci_ids/iris_pci_ids.h mesa-20.0.8/include/pci_ids/iris_pci_ids.h --- mesa-19.2.8/include/pci_ids/iris_pci_ids.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/include/pci_ids/iris_pci_ids.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,11 @@ +CHIPSET(0x9A40, tgl_gt2, "TGL GT2", "Intel(R) Xe Graphics") +CHIPSET(0x9A49, tgl_gt2, "TGL GT2", "Intel(R) Xe Graphics") +CHIPSET(0x9A59, tgl_gt2, "TGL GT2", "Intel(R) Graphics") +CHIPSET(0x9A60, tgl_gt1, "TGL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9A68, tgl_gt1, "TGL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9A70, tgl_gt1, "TGL GT1", "Intel(R) UHD Graphics") +CHIPSET(0x9A78, tgl_gt2, "TGL GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9AC0, tgl_gt2, "TGL GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9AC9, tgl_gt2, "TGL GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9AD9, tgl_gt2, "TGL GT2", "Intel(R) UHD Graphics") +CHIPSET(0x9AF8, tgl_gt2, "TGL GT2", "Intel(R) UHD Graphics") diff -Nru mesa-19.2.8/include/pci_ids/radeonsi_pci_ids.h mesa-20.0.8/include/pci_ids/radeonsi_pci_ids.h --- mesa-19.2.8/include/pci_ids/radeonsi_pci_ids.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/pci_ids/radeonsi_pci_ids.h 2020-06-12 01:21:16.000000000 +0000 @@ -157,119 +157,3 @@ CHIPSET(0x67B9, HAWAII) CHIPSET(0x67BA, HAWAII) CHIPSET(0x67BE, HAWAII) - -CHIPSET(0x6900, ICELAND) -CHIPSET(0x6901, ICELAND) -CHIPSET(0x6902, ICELAND) -CHIPSET(0x6903, ICELAND) -CHIPSET(0x6907, ICELAND) - -CHIPSET(0x6920, TONGA) -CHIPSET(0x6921, TONGA) -CHIPSET(0x6928, TONGA) -CHIPSET(0x6929, TONGA) -CHIPSET(0x692B, TONGA) -CHIPSET(0x692F, TONGA) -CHIPSET(0x6930, TONGA) -CHIPSET(0x6938, TONGA) -CHIPSET(0x6939, TONGA) - -CHIPSET(0x9870, CARRIZO) -CHIPSET(0x9874, CARRIZO) -CHIPSET(0x9875, CARRIZO) -CHIPSET(0x9876, CARRIZO) -CHIPSET(0x9877, CARRIZO) - -CHIPSET(0x7300, FIJI) - -CHIPSET(0x67E0, POLARIS11) -CHIPSET(0x67E1, POLARIS11) -CHIPSET(0x67E3, POLARIS11) -CHIPSET(0x67E7, POLARIS11) -CHIPSET(0x67E8, POLARIS11) -CHIPSET(0x67E9, POLARIS11) -CHIPSET(0x67EB, POLARIS11) -CHIPSET(0x67EF, POLARIS11) -CHIPSET(0x67FF, POLARIS11) - -CHIPSET(0x67C0, POLARIS10) -CHIPSET(0x67C1, POLARIS10) -CHIPSET(0x67C2, POLARIS10) -CHIPSET(0x67C4, POLARIS10) -CHIPSET(0x67C7, POLARIS10) -CHIPSET(0x67C8, POLARIS10) -CHIPSET(0x67C9, POLARIS10) -CHIPSET(0x67CA, POLARIS10) -CHIPSET(0x67CC, POLARIS10) -CHIPSET(0x67CF, POLARIS10) -CHIPSET(0x67D0, POLARIS10) -CHIPSET(0x67DF, POLARIS10) -CHIPSET(0x6FDF, POLARIS10) - -CHIPSET(0x98E4, STONEY) - -CHIPSET(0x6980, POLARIS12) -CHIPSET(0x6981, POLARIS12) -CHIPSET(0x6985, POLARIS12) -CHIPSET(0x6986, POLARIS12) -CHIPSET(0x6987, POLARIS12) -CHIPSET(0x6995, POLARIS12) -CHIPSET(0x6997, POLARIS12) -CHIPSET(0x699F, POLARIS12) - -CHIPSET(0x694C, VEGAM) -CHIPSET(0x694E, VEGAM) -CHIPSET(0x694F, VEGAM) - -CHIPSET(0x6860, VEGA10) -CHIPSET(0x6861, VEGA10) -CHIPSET(0x6862, VEGA10) -CHIPSET(0x6863, VEGA10) -CHIPSET(0x6864, VEGA10) -CHIPSET(0x6867, VEGA10) -CHIPSET(0x6868, VEGA10) -CHIPSET(0x6869, VEGA10) -CHIPSET(0x686A, VEGA10) -CHIPSET(0x686B, VEGA10) -CHIPSET(0x686C, VEGA10) -CHIPSET(0x686D, VEGA10) -CHIPSET(0x686E, VEGA10) -CHIPSET(0x686F, VEGA10) -CHIPSET(0x687F, VEGA10) - -CHIPSET(0x69A0, VEGA12) -CHIPSET(0x69A1, VEGA12) -CHIPSET(0x69A2, VEGA12) -CHIPSET(0x69A3, VEGA12) -CHIPSET(0x69AF, VEGA12) - -CHIPSET(0x66A0, VEGA20) -CHIPSET(0x66A1, VEGA20) -CHIPSET(0x66A2, VEGA20) -CHIPSET(0x66A3, VEGA20) -CHIPSET(0x66A4, VEGA20) -CHIPSET(0x66A7, VEGA20) -CHIPSET(0x66AF, VEGA20) - -CHIPSET(0x15DD, RAVEN) -CHIPSET(0x15D8, RAVEN) - -CHIPSET(0x1636, RENOIR) - -CHIPSET(0x738C, ARCTURUS) -CHIPSET(0x7388, ARCTURUS) -CHIPSET(0x738E, ARCTURUS) - -CHIPSET(0x7310, NAVI10) -CHIPSET(0x7312, NAVI10) -CHIPSET(0x7318, NAVI10) -CHIPSET(0x7319, NAVI10) -CHIPSET(0x731A, NAVI10) -CHIPSET(0x731B, NAVI10) -CHIPSET(0x731F, NAVI10) - -CHIPSET(0x7360, NAVI12) - -CHIPSET(0x7340, NAVI14) -CHIPSET(0x7341, NAVI14) -CHIPSET(0x7347, NAVI14) diff -Nru mesa-19.2.8/include/vulkan/vk_icd.h mesa-20.0.8/include/vulkan/vk_icd.h --- mesa-19.2.8/include/vulkan/vk_icd.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vk_icd.h 2020-06-12 01:21:16.000000000 +0000 @@ -89,7 +89,8 @@ VK_ICD_WSI_PLATFORM_MACOS, VK_ICD_WSI_PLATFORM_IOS, VK_ICD_WSI_PLATFORM_DISPLAY, - VK_ICD_WSI_PLATFORM_HEADLESS + VK_ICD_WSI_PLATFORM_HEADLESS, + VK_ICD_WSI_PLATFORM_METAL, } VkIcdWsiPlatform; typedef struct { @@ -172,4 +173,11 @@ VkIcdSurfaceBase base; } VkIcdSurfaceHeadless; +#ifdef VK_USE_PLATFORM_METAL_EXT +typedef struct { + VkIcdSurfaceBase base; + const CAMetalLayer *pLayer; +} VkIcdSurfaceMetal; +#endif // VK_USE_PLATFORM_METAL_EXT + #endif // VKICD_H diff -Nru mesa-19.2.8/include/vulkan/vk_platform.h mesa-20.0.8/include/vulkan/vk_platform.h --- mesa-19.2.8/include/vulkan/vk_platform.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vk_platform.h 2020-06-12 01:21:16.000000000 +0000 @@ -2,7 +2,7 @@ // File: vk_platform.h // /* -** Copyright (c) 2014-2017 The Khronos Group Inc. +** Copyright (c) 2014-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. diff -Nru mesa-19.2.8/include/vulkan/vulkan_android.h mesa-20.0.8/include/vulkan/vulkan_android.h --- mesa-19.2.8/include/vulkan/vulkan_android.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_android.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_ANDROID_H_ #define VULKAN_ANDROID_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_KHR_android_surface 1 struct ANativeWindow; diff -Nru mesa-19.2.8/include/vulkan/vulkan_core.h mesa-20.0.8/include/vulkan/vulkan_core.h --- mesa-19.2.8/include/vulkan/vulkan_core.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_core.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_CORE_H_ #define VULKAN_CORE_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_VERSION_1_0 1 #include "vk_platform.h" @@ -43,7 +44,7 @@ #define VK_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3ff) #define VK_VERSION_PATCH(version) ((uint32_t)(version) & 0xfff) // Version of this file -#define VK_HEADER_VERSION 119 +#define VK_HEADER_VERSION 131 #define VK_NULL_HANDLE 0 @@ -132,8 +133,11 @@ VK_ERROR_TOO_MANY_OBJECTS = -10, VK_ERROR_FORMAT_NOT_SUPPORTED = -11, VK_ERROR_FRAGMENTED_POOL = -12, + VK_ERROR_UNKNOWN = -13, VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000, VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003, + VK_ERROR_FRAGMENTATION = -1000161000, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000, VK_ERROR_SURFACE_LOST_KHR = -1000000000, VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, VK_SUBOPTIMAL_KHR = 1000001003, @@ -142,15 +146,16 @@ VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, VK_ERROR_INVALID_SHADER_NV = -1000012000, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158000, - VK_ERROR_FRAGMENTATION_EXT = -1000161000, VK_ERROR_NOT_PERMITTED_EXT = -1000174001, - VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = -1000244000, VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000, VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, - VK_RESULT_BEGIN_RANGE = VK_ERROR_FRAGMENTED_POOL, + VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, + VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS_KHR = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_RESULT_BEGIN_RANGE = VK_ERROR_UNKNOWN, VK_RESULT_END_RANGE = VK_INCOMPLETE, - VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FRAGMENTED_POOL + 1), + VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_UNKNOWN + 1), VK_RESULT_MAX_ENUM = 0x7FFFFFFF } VkResult; @@ -269,6 +274,56 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES = 1000168000, VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT = 1000168001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES = 1000063000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES = 49, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES = 50, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES = 51, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES = 52, + VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO = 1000147000, + VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2 = 1000109000, + VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2 = 1000109001, + VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2 = 1000109002, + VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2 = 1000109003, + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2 = 1000109004, + VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO = 1000109005, + VK_STRUCTURE_TYPE_SUBPASS_END_INFO = 1000109006, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES = 1000177000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES = 1000196000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES = 1000180000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES = 1000082000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES = 1000197000, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO = 1000161000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES = 1000161001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES = 1000161002, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO = 1000161003, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT = 1000161004, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES = 1000199000, + VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE = 1000199001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES = 1000221000, + VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO = 1000246000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES = 1000130000, + VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO = 1000130001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES = 1000211000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES = 1000108000, + VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO = 1000108001, + VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO = 1000108002, + VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO = 1000108003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES = 1000253000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES = 1000175000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES = 1000241000, + VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT = 1000241001, + VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT = 1000241002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES = 1000261000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES = 1000207000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES = 1000207001, + VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO = 1000207002, + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO = 1000207003, + VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO = 1000207004, + VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO = 1000207005, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES = 1000257000, + VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO = 1000244001, + VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO = 1000257002, + VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO = 1000257003, + VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO = 1000257004, VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR = 1000001000, VK_STRUCTURE_TYPE_PRESENT_INFO_KHR = 1000001001, VK_STRUCTURE_TYPE_DEVICE_GROUP_PRESENT_CAPABILITIES_KHR = 1000060007, @@ -328,7 +383,6 @@ VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT = 1000081000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT = 1000081001, VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT = 1000081002, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR = 1000082000, VK_STRUCTURE_TYPE_PRESENT_REGIONS_KHR = 1000084000, VK_STRUCTURE_TYPE_OBJECT_TABLE_CREATE_INFO_NVX = 1000086000, VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NVX = 1000086001, @@ -352,23 +406,19 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT = 1000102000, VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT = 1000102001, VK_STRUCTURE_TYPE_HDR_METADATA_EXT = 1000105000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR = 1000108000, - VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO_KHR = 1000108001, - VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO_KHR = 1000108002, - VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR = 1000108003, - VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR = 1000109000, - VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR = 1000109001, - VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR = 1000109002, - VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR = 1000109003, - VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR = 1000109004, - VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR = 1000109005, - VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR = 1000109006, VK_STRUCTURE_TYPE_SHARED_PRESENT_SURFACE_CAPABILITIES_KHR = 1000111000, VK_STRUCTURE_TYPE_IMPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114000, VK_STRUCTURE_TYPE_EXPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114001, VK_STRUCTURE_TYPE_FENCE_GET_WIN32_HANDLE_INFO_KHR = 1000114002, VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR = 1000115000, VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR = 1000115001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR = 1000116000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR = 1000116001, + VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR = 1000116002, + VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR = 1000116003, + VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR = 1000116004, + VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR = 1000116005, + VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR = 1000116006, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SURFACE_INFO_2_KHR = 1000119000, VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR = 1000119001, VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR = 1000119002, @@ -390,8 +440,6 @@ VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID = 1000129003, VK_STRUCTURE_TYPE_MEMORY_GET_ANDROID_HARDWARE_BUFFER_INFO_ANDROID = 1000129004, VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID = 1000129005, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT = 1000130000, - VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT = 1000130001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT = 1000138000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT = 1000138001, VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT = 1000138002, @@ -401,7 +449,6 @@ VK_STRUCTURE_TYPE_PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT = 1000143002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT = 1000143003, VK_STRUCTURE_TYPE_MULTISAMPLE_PROPERTIES_EXT = 1000143004, - VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR = 1000147000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_FEATURES_EXT = 1000148000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_PROPERTIES_EXT = 1000148001, VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_ADVANCED_STATE_CREATE_INFO_EXT = 1000148002, @@ -417,11 +464,6 @@ VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT = 1000158005, VK_STRUCTURE_TYPE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160000, VK_STRUCTURE_TYPE_SHADER_MODULE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160001, - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT = 1000161000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT = 1000161001, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT = 1000161002, - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO_EXT = 1000161003, - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT_EXT = 1000161004, VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SHADING_RATE_IMAGE_STATE_CREATE_INFO_NV = 1000164000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADING_RATE_IMAGE_FEATURES_NV = 1000164001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADING_RATE_IMAGE_PROPERTIES_NV = 1000164002, @@ -442,11 +484,10 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_IMAGE_FORMAT_INFO_EXT = 1000170000, VK_STRUCTURE_TYPE_FILTER_CUBIC_IMAGE_VIEW_IMAGE_FORMAT_PROPERTIES_EXT = 1000170001, VK_STRUCTURE_TYPE_DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_EXT = 1000174000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR = 1000177000, VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT = 1000178000, VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT = 1000178001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT = 1000178002, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR = 1000180000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR = 1000181000, VK_STRUCTURE_TYPE_PIPELINE_COMPILER_CONTROL_CREATE_INFO_AMD = 1000183000, VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT = 1000184000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_AMD = 1000185000, @@ -456,10 +497,6 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT = 1000190002, VK_STRUCTURE_TYPE_PRESENT_FRAME_TOKEN_GGP = 1000191000, VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT = 1000192000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR = 1000196000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR = 1000197000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR = 1000199000, - VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR = 1000199001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV = 1000201000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_NV = 1000202000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_PROPERTIES_NV = 1000202001, @@ -476,7 +513,6 @@ VK_STRUCTURE_TYPE_PERFORMANCE_STREAM_MARKER_INFO_INTEL = 1000210003, VK_STRUCTURE_TYPE_PERFORMANCE_OVERRIDE_INFO_INTEL = 1000210004, VK_STRUCTURE_TYPE_PERFORMANCE_CONFIGURATION_ACQUIRE_INFO_INTEL = 1000210005, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR = 1000211000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT = 1000212000, VK_STRUCTURE_TYPE_DISPLAY_NATIVE_HDR_SURFACE_CAPABILITIES_AMD = 1000213000, VK_STRUCTURE_TYPE_SWAPCHAIN_DISPLAY_NATIVE_HDR_CREATE_INFO_AMD = 1000213001, @@ -485,20 +521,19 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_FEATURES_EXT = 1000218000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_DENSITY_MAP_PROPERTIES_EXT = 1000218001, VK_STRUCTURE_TYPE_RENDER_PASS_FRAGMENT_DENSITY_MAP_CREATE_INFO_EXT = 1000218002, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT = 1000221000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT = 1000225000, VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT = 1000225001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT = 1000225002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_2_AMD = 1000227000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD = 1000229000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT = 1000237000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT = 1000238000, VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT = 1000238001, VK_STRUCTURE_TYPE_SURFACE_PROTECTED_CAPABILITIES_KHR = 1000239000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEDICATED_ALLOCATION_IMAGE_ALIASING_FEATURES_NV = 1000240000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT = 1000244000, - VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT = 1000244001, VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT = 1000244002, - VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT = 1000246000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TOOL_PROPERTIES_EXT = 1000245000, VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT = 1000247000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV = 1000249000, VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV = 1000249001, @@ -508,7 +543,6 @@ VK_STRUCTURE_TYPE_FRAMEBUFFER_MIXED_SAMPLES_COMBINATION_NV = 1000250002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT = 1000251000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT = 1000252000, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR = 1000253000, VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT = 1000255000, VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_FULL_SCREEN_EXCLUSIVE_EXT = 1000255002, VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_WIN32_INFO_EXT = 1000255001, @@ -516,7 +550,6 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT = 1000259000, VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT = 1000259001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT = 1000259002, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT = 1000261000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT = 1000265000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR = 1000269000, VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR = 1000269001, @@ -562,10 +595,22 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO, VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES, VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES, VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO, VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES2_EXT = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES, + VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO, + VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO_KHR = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO, + VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR = VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO, + VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2, + VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, + VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, + VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR = VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2, + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, + VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, + VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR = VK_STRUCTURE_TYPE_SUBPASS_END_INFO, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO, VK_STRUCTURE_TYPE_EXTERNAL_FENCE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_EXTERNAL_FENCE_PROPERTIES, VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO, @@ -577,11 +622,14 @@ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES, VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS, VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES, + VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO, VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2, VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2_KHR = VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2, VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2_KHR = VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2, + VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO, VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO, VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO, VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO_KHR = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO, @@ -590,9 +638,41 @@ VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES_KHR = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES, VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO_KHR = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO_KHR = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO_EXT = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO, + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT_EXT = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES, VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT_KHR = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES, + VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES, + VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES, + VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT, + VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT, + VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, + VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES, + VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, + VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR = VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO, + VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR = VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO, + VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR = VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES, VK_STRUCTURE_TYPE_BEGIN_RANGE = VK_STRUCTURE_TYPE_APPLICATION_INFO, VK_STRUCTURE_TYPE_END_RANGE = VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO, VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1), @@ -938,6 +1018,7 @@ VK_QUERY_TYPE_PIPELINE_STATISTICS = 1, VK_QUERY_TYPE_TIMESTAMP = 2, VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT = 1000028004, + VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR = 1000116000, VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_NV = 1000165000, VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL = 1000210000, VK_QUERY_TYPE_BEGIN_RANGE = VK_QUERY_TYPE_OCCLUSION, @@ -967,12 +1048,20 @@ VK_IMAGE_LAYOUT_PREINITIALIZED = 8, VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL = 1000117000, VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL = 1000117001, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL = 1000241000, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL = 1000241001, + VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL = 1000241002, + VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL = 1000241003, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR = 1000001002, VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR = 1000111000, VK_IMAGE_LAYOUT_SHADING_RATE_OPTIMAL_NV = 1000164003, VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT = 1000218000, VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL_KHR = VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR = VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL_KHR = VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_END_RANGE = VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1), @@ -1240,6 +1329,7 @@ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE = 2, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3, VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE = 4, + VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE_KHR = VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, VK_SAMPLER_ADDRESS_MODE_BEGIN_RANGE = VK_SAMPLER_ADDRESS_MODE_REPEAT, VK_SAMPLER_ADDRESS_MODE_END_RANGE = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, VK_SAMPLER_ADDRESS_MODE_RANGE_SIZE = (VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - VK_SAMPLER_ADDRESS_MODE_REPEAT + 1), @@ -1419,11 +1509,12 @@ VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT = 0x00200000, VK_FORMAT_FEATURE_DISJOINT_BIT = 0x00400000, VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT = 0x00800000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT = 0x00010000, VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_IMG = 0x00002000, - VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT = 0x00010000, VK_FORMAT_FEATURE_FRAGMENT_DENSITY_MAP_BIT_EXT = 0x01000000, VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR = VK_FORMAT_FEATURE_TRANSFER_SRC_BIT, VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR = VK_FORMAT_FEATURE_TRANSFER_DST_BIT, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT, VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT_KHR = VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT, VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT_KHR = VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT, VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT_KHR = VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT, @@ -1506,6 +1597,8 @@ VK_MEMORY_PROPERTY_HOST_CACHED_BIT = 0x00000008, VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT = 0x00000010, VK_MEMORY_PROPERTY_PROTECTED_BIT = 0x00000020, + VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD = 0x00000040, + VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD = 0x00000080, VK_MEMORY_PROPERTY_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkMemoryPropertyFlagBits; typedef VkFlags VkMemoryPropertyFlags; @@ -1629,7 +1722,9 @@ VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT = 0x00000002, VK_BUFFER_CREATE_SPARSE_ALIASED_BIT = 0x00000004, VK_BUFFER_CREATE_PROTECTED_BIT = 0x00000008, - VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT = 0x00000010, + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT = 0x00000010, + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT = VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT, + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR = VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT, VK_BUFFER_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkBufferCreateFlagBits; typedef VkFlags VkBufferCreateFlags; @@ -1644,11 +1739,13 @@ VK_BUFFER_USAGE_INDEX_BUFFER_BIT = 0x00000040, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT = 0x00000080, VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT = 0x00000100, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT = 0x00020000, VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT = 0x00000800, VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT = 0x00001000, VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT = 0x00000200, VK_BUFFER_USAGE_RAY_TRACING_BIT_NV = 0x00000400, - VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT = 0x00020000, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_BUFFER_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkBufferUsageFlagBits; typedef VkFlags VkBufferUsageFlags; @@ -1671,10 +1768,11 @@ VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT = 0x00000002, VK_PIPELINE_CREATE_DERIVATIVE_BIT = 0x00000004, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT = 0x00000008, - VK_PIPELINE_CREATE_DISPATCH_BASE = 0x00000010, + VK_PIPELINE_CREATE_DISPATCH_BASE_BIT = 0x00000010, VK_PIPELINE_CREATE_DEFER_COMPILE_BIT_NV = 0x00000020, VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR = 0x00000040, VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR = 0x00000080, + VK_PIPELINE_CREATE_DISPATCH_BASE = VK_PIPELINE_CREATE_DISPATCH_BASE_BIT, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR = VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT, VK_PIPELINE_CREATE_DISPATCH_BASE_KHR = VK_PIPELINE_CREATE_DISPATCH_BASE, VK_PIPELINE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF @@ -1745,22 +1843,25 @@ typedef VkFlags VkSamplerCreateFlags; typedef enum VkDescriptorSetLayoutCreateFlagBits { + VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT = 0x00000002, VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR = 0x00000001, - VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT_EXT = 0x00000002, + VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT_EXT = VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkDescriptorSetLayoutCreateFlagBits; typedef VkFlags VkDescriptorSetLayoutCreateFlags; typedef enum VkDescriptorPoolCreateFlagBits { VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT = 0x00000001, - VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT_EXT = 0x00000002, + VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT = 0x00000002, + VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT_EXT = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT, VK_DESCRIPTOR_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkDescriptorPoolCreateFlagBits; typedef VkFlags VkDescriptorPoolCreateFlags; typedef VkFlags VkDescriptorPoolResetFlags; typedef enum VkFramebufferCreateFlagBits { - VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR = 0x00000001, + VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT = 0x00000001, + VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR = VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT, VK_FRAMEBUFFER_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkFramebufferCreateFlagBits; typedef VkFlags VkFramebufferCreateFlags; @@ -4031,7 +4132,11 @@ typedef enum VkMemoryAllocateFlagBits { VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT = 0x00000001, + VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT = 0x00000002, + VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT = 0x00000004, VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT_KHR = VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT, + VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, + VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT, VK_MEMORY_ALLOCATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkMemoryAllocateFlagBits; typedef VkFlags VkMemoryAllocateFlags; @@ -4798,166 +4903,920 @@ #endif -#define VK_KHR_surface 1 -VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSurfaceKHR) -#define VK_KHR_SURFACE_SPEC_VERSION 25 -#define VK_KHR_SURFACE_EXTENSION_NAME "VK_KHR_surface" +#define VK_VERSION_1_2 1 +// Vulkan 1.2 version number +#define VK_API_VERSION_1_2 VK_MAKE_VERSION(1, 2, 0)// Patch version should always be set to 0 -typedef enum VkColorSpaceKHR { - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR = 0, - VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT = 1000104001, - VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT = 1000104002, - VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT = 1000104003, - VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT = 1000104004, - VK_COLOR_SPACE_BT709_LINEAR_EXT = 1000104005, - VK_COLOR_SPACE_BT709_NONLINEAR_EXT = 1000104006, - VK_COLOR_SPACE_BT2020_LINEAR_EXT = 1000104007, - VK_COLOR_SPACE_HDR10_ST2084_EXT = 1000104008, - VK_COLOR_SPACE_DOLBYVISION_EXT = 1000104009, - VK_COLOR_SPACE_HDR10_HLG_EXT = 1000104010, - VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT = 1000104011, - VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT = 1000104012, - VK_COLOR_SPACE_PASS_THROUGH_EXT = 1000104013, - VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT = 1000104014, - VK_COLOR_SPACE_DISPLAY_NATIVE_AMD = 1000213000, - VK_COLORSPACE_SRGB_NONLINEAR_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, - VK_COLOR_SPACE_DCI_P3_LINEAR_EXT = VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT, - VK_COLOR_SPACE_BEGIN_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, - VK_COLOR_SPACE_END_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, - VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), - VK_COLOR_SPACE_MAX_ENUM_KHR = 0x7FFFFFFF -} VkColorSpaceKHR; +typedef uint64_t VkDeviceAddress; +#define VK_MAX_DRIVER_NAME_SIZE 256 +#define VK_MAX_DRIVER_INFO_SIZE 256 -typedef enum VkPresentModeKHR { - VK_PRESENT_MODE_IMMEDIATE_KHR = 0, - VK_PRESENT_MODE_MAILBOX_KHR = 1, - VK_PRESENT_MODE_FIFO_KHR = 2, - VK_PRESENT_MODE_FIFO_RELAXED_KHR = 3, - VK_PRESENT_MODE_SHARED_DEMAND_REFRESH_KHR = 1000111000, - VK_PRESENT_MODE_SHARED_CONTINUOUS_REFRESH_KHR = 1000111001, - VK_PRESENT_MODE_BEGIN_RANGE_KHR = VK_PRESENT_MODE_IMMEDIATE_KHR, - VK_PRESENT_MODE_END_RANGE_KHR = VK_PRESENT_MODE_FIFO_RELAXED_KHR, - VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), - VK_PRESENT_MODE_MAX_ENUM_KHR = 0x7FFFFFFF -} VkPresentModeKHR; +typedef enum VkDriverId { + VK_DRIVER_ID_AMD_PROPRIETARY = 1, + VK_DRIVER_ID_AMD_OPEN_SOURCE = 2, + VK_DRIVER_ID_MESA_RADV = 3, + VK_DRIVER_ID_NVIDIA_PROPRIETARY = 4, + VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS = 5, + VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA = 6, + VK_DRIVER_ID_IMAGINATION_PROPRIETARY = 7, + VK_DRIVER_ID_QUALCOMM_PROPRIETARY = 8, + VK_DRIVER_ID_ARM_PROPRIETARY = 9, + VK_DRIVER_ID_GOOGLE_SWIFTSHADER = 10, + VK_DRIVER_ID_GGP_PROPRIETARY = 11, + VK_DRIVER_ID_BROADCOM_PROPRIETARY = 12, + VK_DRIVER_ID_AMD_PROPRIETARY_KHR = VK_DRIVER_ID_AMD_PROPRIETARY, + VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR = VK_DRIVER_ID_AMD_OPEN_SOURCE, + VK_DRIVER_ID_MESA_RADV_KHR = VK_DRIVER_ID_MESA_RADV, + VK_DRIVER_ID_NVIDIA_PROPRIETARY_KHR = VK_DRIVER_ID_NVIDIA_PROPRIETARY, + VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS_KHR = VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS, + VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA, + VK_DRIVER_ID_IMAGINATION_PROPRIETARY_KHR = VK_DRIVER_ID_IMAGINATION_PROPRIETARY, + VK_DRIVER_ID_QUALCOMM_PROPRIETARY_KHR = VK_DRIVER_ID_QUALCOMM_PROPRIETARY, + VK_DRIVER_ID_ARM_PROPRIETARY_KHR = VK_DRIVER_ID_ARM_PROPRIETARY, + VK_DRIVER_ID_GOOGLE_SWIFTSHADER_KHR = VK_DRIVER_ID_GOOGLE_SWIFTSHADER, + VK_DRIVER_ID_GGP_PROPRIETARY_KHR = VK_DRIVER_ID_GGP_PROPRIETARY, + VK_DRIVER_ID_BROADCOM_PROPRIETARY_KHR = VK_DRIVER_ID_BROADCOM_PROPRIETARY, + VK_DRIVER_ID_BEGIN_RANGE = VK_DRIVER_ID_AMD_PROPRIETARY, + VK_DRIVER_ID_END_RANGE = VK_DRIVER_ID_BROADCOM_PROPRIETARY, + VK_DRIVER_ID_RANGE_SIZE = (VK_DRIVER_ID_BROADCOM_PROPRIETARY - VK_DRIVER_ID_AMD_PROPRIETARY + 1), + VK_DRIVER_ID_MAX_ENUM = 0x7FFFFFFF +} VkDriverId; + +typedef enum VkShaderFloatControlsIndependence { + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY = 0, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL = 1, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE = 2, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL_KHR = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_BEGIN_RANGE = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_END_RANGE = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE, + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_RANGE_SIZE = (VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY + 1), + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_MAX_ENUM = 0x7FFFFFFF +} VkShaderFloatControlsIndependence; + +typedef enum VkSamplerReductionMode { + VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE = 0, + VK_SAMPLER_REDUCTION_MODE_MIN = 1, + VK_SAMPLER_REDUCTION_MODE_MAX = 2, + VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE, + VK_SAMPLER_REDUCTION_MODE_MIN_EXT = VK_SAMPLER_REDUCTION_MODE_MIN, + VK_SAMPLER_REDUCTION_MODE_MAX_EXT = VK_SAMPLER_REDUCTION_MODE_MAX, + VK_SAMPLER_REDUCTION_MODE_BEGIN_RANGE = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE, + VK_SAMPLER_REDUCTION_MODE_END_RANGE = VK_SAMPLER_REDUCTION_MODE_MAX, + VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE = (VK_SAMPLER_REDUCTION_MODE_MAX - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE + 1), + VK_SAMPLER_REDUCTION_MODE_MAX_ENUM = 0x7FFFFFFF +} VkSamplerReductionMode; + +typedef enum VkSemaphoreType { + VK_SEMAPHORE_TYPE_BINARY = 0, + VK_SEMAPHORE_TYPE_TIMELINE = 1, + VK_SEMAPHORE_TYPE_BINARY_KHR = VK_SEMAPHORE_TYPE_BINARY, + VK_SEMAPHORE_TYPE_TIMELINE_KHR = VK_SEMAPHORE_TYPE_TIMELINE, + VK_SEMAPHORE_TYPE_BEGIN_RANGE = VK_SEMAPHORE_TYPE_BINARY, + VK_SEMAPHORE_TYPE_END_RANGE = VK_SEMAPHORE_TYPE_TIMELINE, + VK_SEMAPHORE_TYPE_RANGE_SIZE = (VK_SEMAPHORE_TYPE_TIMELINE - VK_SEMAPHORE_TYPE_BINARY + 1), + VK_SEMAPHORE_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkSemaphoreType; + +typedef enum VkResolveModeFlagBits { + VK_RESOLVE_MODE_NONE = 0, + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT = 0x00000001, + VK_RESOLVE_MODE_AVERAGE_BIT = 0x00000002, + VK_RESOLVE_MODE_MIN_BIT = 0x00000004, + VK_RESOLVE_MODE_MAX_BIT = 0x00000008, + VK_RESOLVE_MODE_NONE_KHR = VK_RESOLVE_MODE_NONE, + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, + VK_RESOLVE_MODE_AVERAGE_BIT_KHR = VK_RESOLVE_MODE_AVERAGE_BIT, + VK_RESOLVE_MODE_MIN_BIT_KHR = VK_RESOLVE_MODE_MIN_BIT, + VK_RESOLVE_MODE_MAX_BIT_KHR = VK_RESOLVE_MODE_MAX_BIT, + VK_RESOLVE_MODE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkResolveModeFlagBits; +typedef VkFlags VkResolveModeFlags; + +typedef enum VkDescriptorBindingFlagBits { + VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT = 0x00000001, + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT = 0x00000002, + VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT = 0x00000004, + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT = 0x00000008, + VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT_EXT = VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT, + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT_EXT = VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT, + VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT_EXT = VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT, + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT = VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT, + VK_DESCRIPTOR_BINDING_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkDescriptorBindingFlagBits; +typedef VkFlags VkDescriptorBindingFlags; + +typedef enum VkSemaphoreWaitFlagBits { + VK_SEMAPHORE_WAIT_ANY_BIT = 0x00000001, + VK_SEMAPHORE_WAIT_ANY_BIT_KHR = VK_SEMAPHORE_WAIT_ANY_BIT, + VK_SEMAPHORE_WAIT_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSemaphoreWaitFlagBits; +typedef VkFlags VkSemaphoreWaitFlags; +typedef struct VkPhysicalDeviceVulkan11Features { + VkStructureType sType; + void* pNext; + VkBool32 storageBuffer16BitAccess; + VkBool32 uniformAndStorageBuffer16BitAccess; + VkBool32 storagePushConstant16; + VkBool32 storageInputOutput16; + VkBool32 multiview; + VkBool32 multiviewGeometryShader; + VkBool32 multiviewTessellationShader; + VkBool32 variablePointersStorageBuffer; + VkBool32 variablePointers; + VkBool32 protectedMemory; + VkBool32 samplerYcbcrConversion; + VkBool32 shaderDrawParameters; +} VkPhysicalDeviceVulkan11Features; -typedef enum VkSurfaceTransformFlagBitsKHR { - VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR = 0x00000001, - VK_SURFACE_TRANSFORM_ROTATE_90_BIT_KHR = 0x00000002, - VK_SURFACE_TRANSFORM_ROTATE_180_BIT_KHR = 0x00000004, - VK_SURFACE_TRANSFORM_ROTATE_270_BIT_KHR = 0x00000008, - VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_BIT_KHR = 0x00000010, - VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_90_BIT_KHR = 0x00000020, - VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_180_BIT_KHR = 0x00000040, - VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_270_BIT_KHR = 0x00000080, - VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR = 0x00000100, - VK_SURFACE_TRANSFORM_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkSurfaceTransformFlagBitsKHR; -typedef VkFlags VkSurfaceTransformFlagsKHR; +typedef struct VkPhysicalDeviceVulkan11Properties { + VkStructureType sType; + void* pNext; + uint8_t deviceUUID[VK_UUID_SIZE]; + uint8_t driverUUID[VK_UUID_SIZE]; + uint8_t deviceLUID[VK_LUID_SIZE]; + uint32_t deviceNodeMask; + VkBool32 deviceLUIDValid; + uint32_t subgroupSize; + VkShaderStageFlags subgroupSupportedStages; + VkSubgroupFeatureFlags subgroupSupportedOperations; + VkBool32 subgroupQuadOperationsInAllStages; + VkPointClippingBehavior pointClippingBehavior; + uint32_t maxMultiviewViewCount; + uint32_t maxMultiviewInstanceIndex; + VkBool32 protectedNoFault; + uint32_t maxPerSetDescriptors; + VkDeviceSize maxMemoryAllocationSize; +} VkPhysicalDeviceVulkan11Properties; -typedef enum VkCompositeAlphaFlagBitsKHR { - VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR = 0x00000001, - VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR = 0x00000002, - VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR = 0x00000004, - VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR = 0x00000008, - VK_COMPOSITE_ALPHA_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkCompositeAlphaFlagBitsKHR; -typedef VkFlags VkCompositeAlphaFlagsKHR; -typedef struct VkSurfaceCapabilitiesKHR { - uint32_t minImageCount; - uint32_t maxImageCount; - VkExtent2D currentExtent; - VkExtent2D minImageExtent; - VkExtent2D maxImageExtent; - uint32_t maxImageArrayLayers; - VkSurfaceTransformFlagsKHR supportedTransforms; - VkSurfaceTransformFlagBitsKHR currentTransform; - VkCompositeAlphaFlagsKHR supportedCompositeAlpha; - VkImageUsageFlags supportedUsageFlags; -} VkSurfaceCapabilitiesKHR; +typedef struct VkPhysicalDeviceVulkan12Features { + VkStructureType sType; + void* pNext; + VkBool32 samplerMirrorClampToEdge; + VkBool32 drawIndirectCount; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; + VkBool32 shaderBufferInt64Atomics; + VkBool32 shaderSharedInt64Atomics; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; + VkBool32 descriptorIndexing; + VkBool32 shaderInputAttachmentArrayDynamicIndexing; + VkBool32 shaderUniformTexelBufferArrayDynamicIndexing; + VkBool32 shaderStorageTexelBufferArrayDynamicIndexing; + VkBool32 shaderUniformBufferArrayNonUniformIndexing; + VkBool32 shaderSampledImageArrayNonUniformIndexing; + VkBool32 shaderStorageBufferArrayNonUniformIndexing; + VkBool32 shaderStorageImageArrayNonUniformIndexing; + VkBool32 shaderInputAttachmentArrayNonUniformIndexing; + VkBool32 shaderUniformTexelBufferArrayNonUniformIndexing; + VkBool32 shaderStorageTexelBufferArrayNonUniformIndexing; + VkBool32 descriptorBindingUniformBufferUpdateAfterBind; + VkBool32 descriptorBindingSampledImageUpdateAfterBind; + VkBool32 descriptorBindingStorageImageUpdateAfterBind; + VkBool32 descriptorBindingStorageBufferUpdateAfterBind; + VkBool32 descriptorBindingUniformTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingStorageTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingUpdateUnusedWhilePending; + VkBool32 descriptorBindingPartiallyBound; + VkBool32 descriptorBindingVariableDescriptorCount; + VkBool32 runtimeDescriptorArray; + VkBool32 samplerFilterMinmax; + VkBool32 scalarBlockLayout; + VkBool32 imagelessFramebuffer; + VkBool32 uniformBufferStandardLayout; + VkBool32 shaderSubgroupExtendedTypes; + VkBool32 separateDepthStencilLayouts; + VkBool32 hostQueryReset; + VkBool32 timelineSemaphore; + VkBool32 bufferDeviceAddress; + VkBool32 bufferDeviceAddressCaptureReplay; + VkBool32 bufferDeviceAddressMultiDevice; + VkBool32 vulkanMemoryModel; + VkBool32 vulkanMemoryModelDeviceScope; + VkBool32 vulkanMemoryModelAvailabilityVisibilityChains; + VkBool32 shaderOutputViewportIndex; + VkBool32 shaderOutputLayer; + VkBool32 subgroupBroadcastDynamicId; +} VkPhysicalDeviceVulkan12Features; -typedef struct VkSurfaceFormatKHR { - VkFormat format; - VkColorSpaceKHR colorSpace; -} VkSurfaceFormatKHR; +typedef struct VkConformanceVersion { + uint8_t major; + uint8_t minor; + uint8_t subminor; + uint8_t patch; +} VkConformanceVersion; -typedef void (VKAPI_PTR *PFN_vkDestroySurfaceKHR)(VkInstance instance, VkSurfaceKHR surface, const VkAllocationCallbacks* pAllocator); -typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, VkSurfaceKHR surface, VkBool32* pSupported); -typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities); -typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pSurfaceFormatCount, VkSurfaceFormatKHR* pSurfaceFormats); -typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pPresentModeCount, VkPresentModeKHR* pPresentModes); +typedef struct VkPhysicalDeviceVulkan12Properties { + VkStructureType sType; + void* pNext; + VkDriverId driverID; + char driverName[VK_MAX_DRIVER_NAME_SIZE]; + char driverInfo[VK_MAX_DRIVER_INFO_SIZE]; + VkConformanceVersion conformanceVersion; + VkShaderFloatControlsIndependence denormBehaviorIndependence; + VkShaderFloatControlsIndependence roundingModeIndependence; + VkBool32 shaderSignedZeroInfNanPreserveFloat16; + VkBool32 shaderSignedZeroInfNanPreserveFloat32; + VkBool32 shaderSignedZeroInfNanPreserveFloat64; + VkBool32 shaderDenormPreserveFloat16; + VkBool32 shaderDenormPreserveFloat32; + VkBool32 shaderDenormPreserveFloat64; + VkBool32 shaderDenormFlushToZeroFloat16; + VkBool32 shaderDenormFlushToZeroFloat32; + VkBool32 shaderDenormFlushToZeroFloat64; + VkBool32 shaderRoundingModeRTEFloat16; + VkBool32 shaderRoundingModeRTEFloat32; + VkBool32 shaderRoundingModeRTEFloat64; + VkBool32 shaderRoundingModeRTZFloat16; + VkBool32 shaderRoundingModeRTZFloat32; + VkBool32 shaderRoundingModeRTZFloat64; + uint32_t maxUpdateAfterBindDescriptorsInAllPools; + VkBool32 shaderUniformBufferArrayNonUniformIndexingNative; + VkBool32 shaderSampledImageArrayNonUniformIndexingNative; + VkBool32 shaderStorageBufferArrayNonUniformIndexingNative; + VkBool32 shaderStorageImageArrayNonUniformIndexingNative; + VkBool32 shaderInputAttachmentArrayNonUniformIndexingNative; + VkBool32 robustBufferAccessUpdateAfterBind; + VkBool32 quadDivergentImplicitLod; + uint32_t maxPerStageDescriptorUpdateAfterBindSamplers; + uint32_t maxPerStageDescriptorUpdateAfterBindUniformBuffers; + uint32_t maxPerStageDescriptorUpdateAfterBindStorageBuffers; + uint32_t maxPerStageDescriptorUpdateAfterBindSampledImages; + uint32_t maxPerStageDescriptorUpdateAfterBindStorageImages; + uint32_t maxPerStageDescriptorUpdateAfterBindInputAttachments; + uint32_t maxPerStageUpdateAfterBindResources; + uint32_t maxDescriptorSetUpdateAfterBindSamplers; + uint32_t maxDescriptorSetUpdateAfterBindUniformBuffers; + uint32_t maxDescriptorSetUpdateAfterBindUniformBuffersDynamic; + uint32_t maxDescriptorSetUpdateAfterBindStorageBuffers; + uint32_t maxDescriptorSetUpdateAfterBindStorageBuffersDynamic; + uint32_t maxDescriptorSetUpdateAfterBindSampledImages; + uint32_t maxDescriptorSetUpdateAfterBindStorageImages; + uint32_t maxDescriptorSetUpdateAfterBindInputAttachments; + VkResolveModeFlags supportedDepthResolveModes; + VkResolveModeFlags supportedStencilResolveModes; + VkBool32 independentResolveNone; + VkBool32 independentResolve; + VkBool32 filterMinmaxSingleComponentFormats; + VkBool32 filterMinmaxImageComponentMapping; + uint64_t maxTimelineSemaphoreValueDifference; + VkSampleCountFlags framebufferIntegerColorSampleCounts; +} VkPhysicalDeviceVulkan12Properties; -#ifndef VK_NO_PROTOTYPES -VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR( - VkInstance instance, - VkSurfaceKHR surface, - const VkAllocationCallbacks* pAllocator); +typedef struct VkImageFormatListCreateInfo { + VkStructureType sType; + const void* pNext; + uint32_t viewFormatCount; + const VkFormat* pViewFormats; +} VkImageFormatListCreateInfo; -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - VkSurfaceKHR surface, - VkBool32* pSupported); +typedef struct VkAttachmentDescription2 { + VkStructureType sType; + const void* pNext; + VkAttachmentDescriptionFlags flags; + VkFormat format; + VkSampleCountFlagBits samples; + VkAttachmentLoadOp loadOp; + VkAttachmentStoreOp storeOp; + VkAttachmentLoadOp stencilLoadOp; + VkAttachmentStoreOp stencilStoreOp; + VkImageLayout initialLayout; + VkImageLayout finalLayout; +} VkAttachmentDescription2; -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR* pSurfaceCapabilities); +typedef struct VkAttachmentReference2 { + VkStructureType sType; + const void* pNext; + uint32_t attachment; + VkImageLayout layout; + VkImageAspectFlags aspectMask; +} VkAttachmentReference2; -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormatKHR* pSurfaceFormats); +typedef struct VkSubpassDescription2 { + VkStructureType sType; + const void* pNext; + VkSubpassDescriptionFlags flags; + VkPipelineBindPoint pipelineBindPoint; + uint32_t viewMask; + uint32_t inputAttachmentCount; + const VkAttachmentReference2* pInputAttachments; + uint32_t colorAttachmentCount; + const VkAttachmentReference2* pColorAttachments; + const VkAttachmentReference2* pResolveAttachments; + const VkAttachmentReference2* pDepthStencilAttachment; + uint32_t preserveAttachmentCount; + const uint32_t* pPreserveAttachments; +} VkSubpassDescription2; -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes); -#endif +typedef struct VkSubpassDependency2 { + VkStructureType sType; + const void* pNext; + uint32_t srcSubpass; + uint32_t dstSubpass; + VkPipelineStageFlags srcStageMask; + VkPipelineStageFlags dstStageMask; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; + VkDependencyFlags dependencyFlags; + int32_t viewOffset; +} VkSubpassDependency2; +typedef struct VkRenderPassCreateInfo2 { + VkStructureType sType; + const void* pNext; + VkRenderPassCreateFlags flags; + uint32_t attachmentCount; + const VkAttachmentDescription2* pAttachments; + uint32_t subpassCount; + const VkSubpassDescription2* pSubpasses; + uint32_t dependencyCount; + const VkSubpassDependency2* pDependencies; + uint32_t correlatedViewMaskCount; + const uint32_t* pCorrelatedViewMasks; +} VkRenderPassCreateInfo2; -#define VK_KHR_swapchain 1 -VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR) -#define VK_KHR_SWAPCHAIN_SPEC_VERSION 70 -#define VK_KHR_SWAPCHAIN_EXTENSION_NAME "VK_KHR_swapchain" +typedef struct VkSubpassBeginInfo { + VkStructureType sType; + const void* pNext; + VkSubpassContents contents; +} VkSubpassBeginInfo; -typedef enum VkSwapchainCreateFlagBitsKHR { - VK_SWAPCHAIN_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT_KHR = 0x00000001, - VK_SWAPCHAIN_CREATE_PROTECTED_BIT_KHR = 0x00000002, - VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR = 0x00000004, - VK_SWAPCHAIN_CREATE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkSwapchainCreateFlagBitsKHR; -typedef VkFlags VkSwapchainCreateFlagsKHR; +typedef struct VkSubpassEndInfo { + VkStructureType sType; + const void* pNext; +} VkSubpassEndInfo; -typedef enum VkDeviceGroupPresentModeFlagBitsKHR { - VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR = 0x00000001, - VK_DEVICE_GROUP_PRESENT_MODE_REMOTE_BIT_KHR = 0x00000002, - VK_DEVICE_GROUP_PRESENT_MODE_SUM_BIT_KHR = 0x00000004, - VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_MULTI_DEVICE_BIT_KHR = 0x00000008, - VK_DEVICE_GROUP_PRESENT_MODE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkDeviceGroupPresentModeFlagBitsKHR; -typedef VkFlags VkDeviceGroupPresentModeFlagsKHR; -typedef struct VkSwapchainCreateInfoKHR { - VkStructureType sType; - const void* pNext; - VkSwapchainCreateFlagsKHR flags; - VkSurfaceKHR surface; - uint32_t minImageCount; - VkFormat imageFormat; - VkColorSpaceKHR imageColorSpace; - VkExtent2D imageExtent; - uint32_t imageArrayLayers; - VkImageUsageFlags imageUsage; - VkSharingMode imageSharingMode; - uint32_t queueFamilyIndexCount; - const uint32_t* pQueueFamilyIndices; - VkSurfaceTransformFlagBitsKHR preTransform; - VkCompositeAlphaFlagBitsKHR compositeAlpha; - VkPresentModeKHR presentMode; - VkBool32 clipped; - VkSwapchainKHR oldSwapchain; +typedef struct VkPhysicalDevice8BitStorageFeatures { + VkStructureType sType; + void* pNext; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; +} VkPhysicalDevice8BitStorageFeatures; + +typedef struct VkPhysicalDeviceDriverProperties { + VkStructureType sType; + void* pNext; + VkDriverId driverID; + char driverName[VK_MAX_DRIVER_NAME_SIZE]; + char driverInfo[VK_MAX_DRIVER_INFO_SIZE]; + VkConformanceVersion conformanceVersion; +} VkPhysicalDeviceDriverProperties; + +typedef struct VkPhysicalDeviceShaderAtomicInt64Features { + VkStructureType sType; + void* pNext; + VkBool32 shaderBufferInt64Atomics; + VkBool32 shaderSharedInt64Atomics; +} VkPhysicalDeviceShaderAtomicInt64Features; + +typedef struct VkPhysicalDeviceShaderFloat16Int8Features { + VkStructureType sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; +} VkPhysicalDeviceShaderFloat16Int8Features; + +typedef struct VkPhysicalDeviceFloatControlsProperties { + VkStructureType sType; + void* pNext; + VkShaderFloatControlsIndependence denormBehaviorIndependence; + VkShaderFloatControlsIndependence roundingModeIndependence; + VkBool32 shaderSignedZeroInfNanPreserveFloat16; + VkBool32 shaderSignedZeroInfNanPreserveFloat32; + VkBool32 shaderSignedZeroInfNanPreserveFloat64; + VkBool32 shaderDenormPreserveFloat16; + VkBool32 shaderDenormPreserveFloat32; + VkBool32 shaderDenormPreserveFloat64; + VkBool32 shaderDenormFlushToZeroFloat16; + VkBool32 shaderDenormFlushToZeroFloat32; + VkBool32 shaderDenormFlushToZeroFloat64; + VkBool32 shaderRoundingModeRTEFloat16; + VkBool32 shaderRoundingModeRTEFloat32; + VkBool32 shaderRoundingModeRTEFloat64; + VkBool32 shaderRoundingModeRTZFloat16; + VkBool32 shaderRoundingModeRTZFloat32; + VkBool32 shaderRoundingModeRTZFloat64; +} VkPhysicalDeviceFloatControlsProperties; + +typedef struct VkDescriptorSetLayoutBindingFlagsCreateInfo { + VkStructureType sType; + const void* pNext; + uint32_t bindingCount; + const VkDescriptorBindingFlags* pBindingFlags; +} VkDescriptorSetLayoutBindingFlagsCreateInfo; + +typedef struct VkPhysicalDeviceDescriptorIndexingFeatures { + VkStructureType sType; + void* pNext; + VkBool32 shaderInputAttachmentArrayDynamicIndexing; + VkBool32 shaderUniformTexelBufferArrayDynamicIndexing; + VkBool32 shaderStorageTexelBufferArrayDynamicIndexing; + VkBool32 shaderUniformBufferArrayNonUniformIndexing; + VkBool32 shaderSampledImageArrayNonUniformIndexing; + VkBool32 shaderStorageBufferArrayNonUniformIndexing; + VkBool32 shaderStorageImageArrayNonUniformIndexing; + VkBool32 shaderInputAttachmentArrayNonUniformIndexing; + VkBool32 shaderUniformTexelBufferArrayNonUniformIndexing; + VkBool32 shaderStorageTexelBufferArrayNonUniformIndexing; + VkBool32 descriptorBindingUniformBufferUpdateAfterBind; + VkBool32 descriptorBindingSampledImageUpdateAfterBind; + VkBool32 descriptorBindingStorageImageUpdateAfterBind; + VkBool32 descriptorBindingStorageBufferUpdateAfterBind; + VkBool32 descriptorBindingUniformTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingStorageTexelBufferUpdateAfterBind; + VkBool32 descriptorBindingUpdateUnusedWhilePending; + VkBool32 descriptorBindingPartiallyBound; + VkBool32 descriptorBindingVariableDescriptorCount; + VkBool32 runtimeDescriptorArray; +} VkPhysicalDeviceDescriptorIndexingFeatures; + +typedef struct VkPhysicalDeviceDescriptorIndexingProperties { + VkStructureType sType; + void* pNext; + uint32_t maxUpdateAfterBindDescriptorsInAllPools; + VkBool32 shaderUniformBufferArrayNonUniformIndexingNative; + VkBool32 shaderSampledImageArrayNonUniformIndexingNative; + VkBool32 shaderStorageBufferArrayNonUniformIndexingNative; + VkBool32 shaderStorageImageArrayNonUniformIndexingNative; + VkBool32 shaderInputAttachmentArrayNonUniformIndexingNative; + VkBool32 robustBufferAccessUpdateAfterBind; + VkBool32 quadDivergentImplicitLod; + uint32_t maxPerStageDescriptorUpdateAfterBindSamplers; + uint32_t maxPerStageDescriptorUpdateAfterBindUniformBuffers; + uint32_t maxPerStageDescriptorUpdateAfterBindStorageBuffers; + uint32_t maxPerStageDescriptorUpdateAfterBindSampledImages; + uint32_t maxPerStageDescriptorUpdateAfterBindStorageImages; + uint32_t maxPerStageDescriptorUpdateAfterBindInputAttachments; + uint32_t maxPerStageUpdateAfterBindResources; + uint32_t maxDescriptorSetUpdateAfterBindSamplers; + uint32_t maxDescriptorSetUpdateAfterBindUniformBuffers; + uint32_t maxDescriptorSetUpdateAfterBindUniformBuffersDynamic; + uint32_t maxDescriptorSetUpdateAfterBindStorageBuffers; + uint32_t maxDescriptorSetUpdateAfterBindStorageBuffersDynamic; + uint32_t maxDescriptorSetUpdateAfterBindSampledImages; + uint32_t maxDescriptorSetUpdateAfterBindStorageImages; + uint32_t maxDescriptorSetUpdateAfterBindInputAttachments; +} VkPhysicalDeviceDescriptorIndexingProperties; + +typedef struct VkDescriptorSetVariableDescriptorCountAllocateInfo { + VkStructureType sType; + const void* pNext; + uint32_t descriptorSetCount; + const uint32_t* pDescriptorCounts; +} VkDescriptorSetVariableDescriptorCountAllocateInfo; + +typedef struct VkDescriptorSetVariableDescriptorCountLayoutSupport { + VkStructureType sType; + void* pNext; + uint32_t maxVariableDescriptorCount; +} VkDescriptorSetVariableDescriptorCountLayoutSupport; + +typedef struct VkSubpassDescriptionDepthStencilResolve { + VkStructureType sType; + const void* pNext; + VkResolveModeFlagBits depthResolveMode; + VkResolveModeFlagBits stencilResolveMode; + const VkAttachmentReference2* pDepthStencilResolveAttachment; +} VkSubpassDescriptionDepthStencilResolve; + +typedef struct VkPhysicalDeviceDepthStencilResolveProperties { + VkStructureType sType; + void* pNext; + VkResolveModeFlags supportedDepthResolveModes; + VkResolveModeFlags supportedStencilResolveModes; + VkBool32 independentResolveNone; + VkBool32 independentResolve; +} VkPhysicalDeviceDepthStencilResolveProperties; + +typedef struct VkPhysicalDeviceScalarBlockLayoutFeatures { + VkStructureType sType; + void* pNext; + VkBool32 scalarBlockLayout; +} VkPhysicalDeviceScalarBlockLayoutFeatures; + +typedef struct VkImageStencilUsageCreateInfo { + VkStructureType sType; + const void* pNext; + VkImageUsageFlags stencilUsage; +} VkImageStencilUsageCreateInfo; + +typedef struct VkSamplerReductionModeCreateInfo { + VkStructureType sType; + const void* pNext; + VkSamplerReductionMode reductionMode; +} VkSamplerReductionModeCreateInfo; + +typedef struct VkPhysicalDeviceSamplerFilterMinmaxProperties { + VkStructureType sType; + void* pNext; + VkBool32 filterMinmaxSingleComponentFormats; + VkBool32 filterMinmaxImageComponentMapping; +} VkPhysicalDeviceSamplerFilterMinmaxProperties; + +typedef struct VkPhysicalDeviceVulkanMemoryModelFeatures { + VkStructureType sType; + void* pNext; + VkBool32 vulkanMemoryModel; + VkBool32 vulkanMemoryModelDeviceScope; + VkBool32 vulkanMemoryModelAvailabilityVisibilityChains; +} VkPhysicalDeviceVulkanMemoryModelFeatures; + +typedef struct VkPhysicalDeviceImagelessFramebufferFeatures { + VkStructureType sType; + void* pNext; + VkBool32 imagelessFramebuffer; +} VkPhysicalDeviceImagelessFramebufferFeatures; + +typedef struct VkFramebufferAttachmentImageInfo { + VkStructureType sType; + const void* pNext; + VkImageCreateFlags flags; + VkImageUsageFlags usage; + uint32_t width; + uint32_t height; + uint32_t layerCount; + uint32_t viewFormatCount; + const VkFormat* pViewFormats; +} VkFramebufferAttachmentImageInfo; + +typedef struct VkFramebufferAttachmentsCreateInfo { + VkStructureType sType; + const void* pNext; + uint32_t attachmentImageInfoCount; + const VkFramebufferAttachmentImageInfo* pAttachmentImageInfos; +} VkFramebufferAttachmentsCreateInfo; + +typedef struct VkRenderPassAttachmentBeginInfo { + VkStructureType sType; + const void* pNext; + uint32_t attachmentCount; + const VkImageView* pAttachments; +} VkRenderPassAttachmentBeginInfo; + +typedef struct VkPhysicalDeviceUniformBufferStandardLayoutFeatures { + VkStructureType sType; + void* pNext; + VkBool32 uniformBufferStandardLayout; +} VkPhysicalDeviceUniformBufferStandardLayoutFeatures; + +typedef struct VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures { + VkStructureType sType; + void* pNext; + VkBool32 shaderSubgroupExtendedTypes; +} VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures; + +typedef struct VkPhysicalDeviceSeparateDepthStencilLayoutsFeatures { + VkStructureType sType; + void* pNext; + VkBool32 separateDepthStencilLayouts; +} VkPhysicalDeviceSeparateDepthStencilLayoutsFeatures; + +typedef struct VkAttachmentReferenceStencilLayout { + VkStructureType sType; + void* pNext; + VkImageLayout stencilLayout; +} VkAttachmentReferenceStencilLayout; + +typedef struct VkAttachmentDescriptionStencilLayout { + VkStructureType sType; + void* pNext; + VkImageLayout stencilInitialLayout; + VkImageLayout stencilFinalLayout; +} VkAttachmentDescriptionStencilLayout; + +typedef struct VkPhysicalDeviceHostQueryResetFeatures { + VkStructureType sType; + void* pNext; + VkBool32 hostQueryReset; +} VkPhysicalDeviceHostQueryResetFeatures; + +typedef struct VkPhysicalDeviceTimelineSemaphoreFeatures { + VkStructureType sType; + void* pNext; + VkBool32 timelineSemaphore; +} VkPhysicalDeviceTimelineSemaphoreFeatures; + +typedef struct VkPhysicalDeviceTimelineSemaphoreProperties { + VkStructureType sType; + void* pNext; + uint64_t maxTimelineSemaphoreValueDifference; +} VkPhysicalDeviceTimelineSemaphoreProperties; + +typedef struct VkSemaphoreTypeCreateInfo { + VkStructureType sType; + const void* pNext; + VkSemaphoreType semaphoreType; + uint64_t initialValue; +} VkSemaphoreTypeCreateInfo; + +typedef struct VkTimelineSemaphoreSubmitInfo { + VkStructureType sType; + const void* pNext; + uint32_t waitSemaphoreValueCount; + const uint64_t* pWaitSemaphoreValues; + uint32_t signalSemaphoreValueCount; + const uint64_t* pSignalSemaphoreValues; +} VkTimelineSemaphoreSubmitInfo; + +typedef struct VkSemaphoreWaitInfo { + VkStructureType sType; + const void* pNext; + VkSemaphoreWaitFlags flags; + uint32_t semaphoreCount; + const VkSemaphore* pSemaphores; + const uint64_t* pValues; +} VkSemaphoreWaitInfo; + +typedef struct VkSemaphoreSignalInfo { + VkStructureType sType; + const void* pNext; + VkSemaphore semaphore; + uint64_t value; +} VkSemaphoreSignalInfo; + +typedef struct VkPhysicalDeviceBufferDeviceAddressFeatures { + VkStructureType sType; + void* pNext; + VkBool32 bufferDeviceAddress; + VkBool32 bufferDeviceAddressCaptureReplay; + VkBool32 bufferDeviceAddressMultiDevice; +} VkPhysicalDeviceBufferDeviceAddressFeatures; + +typedef struct VkBufferDeviceAddressInfo { + VkStructureType sType; + const void* pNext; + VkBuffer buffer; +} VkBufferDeviceAddressInfo; + +typedef struct VkBufferOpaqueCaptureAddressCreateInfo { + VkStructureType sType; + const void* pNext; + uint64_t opaqueCaptureAddress; +} VkBufferOpaqueCaptureAddressCreateInfo; + +typedef struct VkMemoryOpaqueCaptureAddressAllocateInfo { + VkStructureType sType; + const void* pNext; + uint64_t opaqueCaptureAddress; +} VkMemoryOpaqueCaptureAddressAllocateInfo; + +typedef struct VkDeviceMemoryOpaqueCaptureAddressInfo { + VkStructureType sType; + const void* pNext; + VkDeviceMemory memory; +} VkDeviceMemoryOpaqueCaptureAddressInfo; + +typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirectCount)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); +typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCount)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); +typedef VkResult (VKAPI_PTR *PFN_vkCreateRenderPass2)(VkDevice device, const VkRenderPassCreateInfo2* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); +typedef void (VKAPI_PTR *PFN_vkCmdBeginRenderPass2)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfo* pSubpassBeginInfo); +typedef void (VKAPI_PTR *PFN_vkCmdNextSubpass2)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo* pSubpassBeginInfo, const VkSubpassEndInfo* pSubpassEndInfo); +typedef void (VKAPI_PTR *PFN_vkCmdEndRenderPass2)(VkCommandBuffer commandBuffer, const VkSubpassEndInfo* pSubpassEndInfo); +typedef void (VKAPI_PTR *PFN_vkResetQueryPool)(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount); +typedef VkResult (VKAPI_PTR *PFN_vkGetSemaphoreCounterValue)(VkDevice device, VkSemaphore semaphore, uint64_t* pValue); +typedef VkResult (VKAPI_PTR *PFN_vkWaitSemaphores)(VkDevice device, const VkSemaphoreWaitInfo* pWaitInfo, uint64_t timeout); +typedef VkResult (VKAPI_PTR *PFN_vkSignalSemaphore)(VkDevice device, const VkSemaphoreSignalInfo* pSignalInfo); +typedef VkDeviceAddress (VKAPI_PTR *PFN_vkGetBufferDeviceAddress)(VkDevice device, const VkBufferDeviceAddressInfo* pInfo); +typedef uint64_t (VKAPI_PTR *PFN_vkGetBufferOpaqueCaptureAddress)(VkDevice device, const VkBufferDeviceAddressInfo* pInfo); +typedef uint64_t (VKAPI_PTR *PFN_vkGetDeviceMemoryOpaqueCaptureAddress)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCount( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCount( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateRenderPass2( + VkDevice device, + const VkRenderPassCreateInfo2* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass); + +VKAPI_ATTR void VKAPI_CALL vkCmdBeginRenderPass2( + VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + const VkSubpassBeginInfo* pSubpassBeginInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdNextSubpass2( + VkCommandBuffer commandBuffer, + const VkSubpassBeginInfo* pSubpassBeginInfo, + const VkSubpassEndInfo* pSubpassEndInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdEndRenderPass2( + VkCommandBuffer commandBuffer, + const VkSubpassEndInfo* pSubpassEndInfo); + +VKAPI_ATTR void VKAPI_CALL vkResetQueryPool( + VkDevice device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreCounterValue( + VkDevice device, + VkSemaphore semaphore, + uint64_t* pValue); + +VKAPI_ATTR VkResult VKAPI_CALL vkWaitSemaphores( + VkDevice device, + const VkSemaphoreWaitInfo* pWaitInfo, + uint64_t timeout); + +VKAPI_ATTR VkResult VKAPI_CALL vkSignalSemaphore( + VkDevice device, + const VkSemaphoreSignalInfo* pSignalInfo); + +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetBufferDeviceAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo); + +VKAPI_ATTR uint64_t VKAPI_CALL vkGetBufferOpaqueCaptureAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo); + +VKAPI_ATTR uint64_t VKAPI_CALL vkGetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo); +#endif + + +#define VK_KHR_surface 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSurfaceKHR) +#define VK_KHR_SURFACE_SPEC_VERSION 25 +#define VK_KHR_SURFACE_EXTENSION_NAME "VK_KHR_surface" + +typedef enum VkColorSpaceKHR { + VK_COLOR_SPACE_SRGB_NONLINEAR_KHR = 0, + VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT = 1000104001, + VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT = 1000104002, + VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT = 1000104003, + VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT = 1000104004, + VK_COLOR_SPACE_BT709_LINEAR_EXT = 1000104005, + VK_COLOR_SPACE_BT709_NONLINEAR_EXT = 1000104006, + VK_COLOR_SPACE_BT2020_LINEAR_EXT = 1000104007, + VK_COLOR_SPACE_HDR10_ST2084_EXT = 1000104008, + VK_COLOR_SPACE_DOLBYVISION_EXT = 1000104009, + VK_COLOR_SPACE_HDR10_HLG_EXT = 1000104010, + VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT = 1000104011, + VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT = 1000104012, + VK_COLOR_SPACE_PASS_THROUGH_EXT = 1000104013, + VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT = 1000104014, + VK_COLOR_SPACE_DISPLAY_NATIVE_AMD = 1000213000, + VK_COLORSPACE_SRGB_NONLINEAR_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + VK_COLOR_SPACE_DCI_P3_LINEAR_EXT = VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT, + VK_COLOR_SPACE_BEGIN_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + VK_COLOR_SPACE_END_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), + VK_COLOR_SPACE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkColorSpaceKHR; + +typedef enum VkPresentModeKHR { + VK_PRESENT_MODE_IMMEDIATE_KHR = 0, + VK_PRESENT_MODE_MAILBOX_KHR = 1, + VK_PRESENT_MODE_FIFO_KHR = 2, + VK_PRESENT_MODE_FIFO_RELAXED_KHR = 3, + VK_PRESENT_MODE_SHARED_DEMAND_REFRESH_KHR = 1000111000, + VK_PRESENT_MODE_SHARED_CONTINUOUS_REFRESH_KHR = 1000111001, + VK_PRESENT_MODE_BEGIN_RANGE_KHR = VK_PRESENT_MODE_IMMEDIATE_KHR, + VK_PRESENT_MODE_END_RANGE_KHR = VK_PRESENT_MODE_FIFO_RELAXED_KHR, + VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), + VK_PRESENT_MODE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPresentModeKHR; + +typedef enum VkSurfaceTransformFlagBitsKHR { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR = 0x00000001, + VK_SURFACE_TRANSFORM_ROTATE_90_BIT_KHR = 0x00000002, + VK_SURFACE_TRANSFORM_ROTATE_180_BIT_KHR = 0x00000004, + VK_SURFACE_TRANSFORM_ROTATE_270_BIT_KHR = 0x00000008, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_BIT_KHR = 0x00000010, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_90_BIT_KHR = 0x00000020, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_180_BIT_KHR = 0x00000040, + VK_SURFACE_TRANSFORM_HORIZONTAL_MIRROR_ROTATE_270_BIT_KHR = 0x00000080, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR = 0x00000100, + VK_SURFACE_TRANSFORM_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSurfaceTransformFlagBitsKHR; +typedef VkFlags VkSurfaceTransformFlagsKHR; + +typedef enum VkCompositeAlphaFlagBitsKHR { + VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR = 0x00000001, + VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR = 0x00000002, + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR = 0x00000004, + VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR = 0x00000008, + VK_COMPOSITE_ALPHA_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkCompositeAlphaFlagBitsKHR; +typedef VkFlags VkCompositeAlphaFlagsKHR; +typedef struct VkSurfaceCapabilitiesKHR { + uint32_t minImageCount; + uint32_t maxImageCount; + VkExtent2D currentExtent; + VkExtent2D minImageExtent; + VkExtent2D maxImageExtent; + uint32_t maxImageArrayLayers; + VkSurfaceTransformFlagsKHR supportedTransforms; + VkSurfaceTransformFlagBitsKHR currentTransform; + VkCompositeAlphaFlagsKHR supportedCompositeAlpha; + VkImageUsageFlags supportedUsageFlags; +} VkSurfaceCapabilitiesKHR; + +typedef struct VkSurfaceFormatKHR { + VkFormat format; + VkColorSpaceKHR colorSpace; +} VkSurfaceFormatKHR; + +typedef void (VKAPI_PTR *PFN_vkDestroySurfaceKHR)(VkInstance instance, VkSurfaceKHR surface, const VkAllocationCallbacks* pAllocator); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceSupportKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, VkSurfaceKHR surface, VkBool32* pSupported); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pSurfaceFormatCount, VkSurfaceFormatKHR* pSurfaceFormats); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pPresentModeCount, VkPresentModeKHR* pPresentModes); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR( + VkInstance instance, + VkSurfaceKHR surface, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + VkSurfaceKHR surface, + VkBool32* pSupported); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR* pSurfaceCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t* pSurfaceFormatCount, + VkSurfaceFormatKHR* pSurfaceFormats); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t* pPresentModeCount, + VkPresentModeKHR* pPresentModes); +#endif + + +#define VK_KHR_swapchain 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR) +#define VK_KHR_SWAPCHAIN_SPEC_VERSION 70 +#define VK_KHR_SWAPCHAIN_EXTENSION_NAME "VK_KHR_swapchain" + +typedef enum VkSwapchainCreateFlagBitsKHR { + VK_SWAPCHAIN_CREATE_SPLIT_INSTANCE_BIND_REGIONS_BIT_KHR = 0x00000001, + VK_SWAPCHAIN_CREATE_PROTECTED_BIT_KHR = 0x00000002, + VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR = 0x00000004, + VK_SWAPCHAIN_CREATE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSwapchainCreateFlagBitsKHR; +typedef VkFlags VkSwapchainCreateFlagsKHR; + +typedef enum VkDeviceGroupPresentModeFlagBitsKHR { + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR = 0x00000001, + VK_DEVICE_GROUP_PRESENT_MODE_REMOTE_BIT_KHR = 0x00000002, + VK_DEVICE_GROUP_PRESENT_MODE_SUM_BIT_KHR = 0x00000004, + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_MULTI_DEVICE_BIT_KHR = 0x00000008, + VK_DEVICE_GROUP_PRESENT_MODE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkDeviceGroupPresentModeFlagBitsKHR; +typedef VkFlags VkDeviceGroupPresentModeFlagsKHR; +typedef struct VkSwapchainCreateInfoKHR { + VkStructureType sType; + const void* pNext; + VkSwapchainCreateFlagsKHR flags; + VkSurfaceKHR surface; + uint32_t minImageCount; + VkFormat imageFormat; + VkColorSpaceKHR imageColorSpace; + VkExtent2D imageExtent; + uint32_t imageArrayLayers; + VkImageUsageFlags imageUsage; + VkSharingMode imageSharingMode; + uint32_t queueFamilyIndexCount; + const uint32_t* pQueueFamilyIndices; + VkSurfaceTransformFlagBitsKHR preTransform; + VkCompositeAlphaFlagBitsKHR compositeAlpha; + VkPresentModeKHR presentMode; + VkBool32 clipped; + VkSwapchainKHR oldSwapchain; } VkSwapchainCreateInfoKHR; typedef struct VkPresentInfoKHR { @@ -5080,7 +5939,7 @@ #define VK_KHR_display 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayKHR) VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDisplayModeKHR) -#define VK_KHR_DISPLAY_SPEC_VERSION 21 +#define VK_KHR_DISPLAY_SPEC_VERSION 23 #define VK_KHR_DISPLAY_EXTENSION_NAME "VK_KHR_display" typedef enum VkDisplayPlaneAlphaFlagBitsKHR { @@ -5203,7 +6062,7 @@ #define VK_KHR_display_swapchain 1 -#define VK_KHR_DISPLAY_SWAPCHAIN_SPEC_VERSION 9 +#define VK_KHR_DISPLAY_SWAPCHAIN_SPEC_VERSION 10 #define VK_KHR_DISPLAY_SWAPCHAIN_EXTENSION_NAME "VK_KHR_display_swapchain" typedef struct VkDisplayPresentInfoKHR { VkStructureType sType; @@ -5226,7 +6085,7 @@ #define VK_KHR_sampler_mirror_clamp_to_edge 1 -#define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_SPEC_VERSION 1 +#define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_SPEC_VERSION 3 #define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME "VK_KHR_sampler_mirror_clamp_to_edge" @@ -5242,7 +6101,7 @@ #define VK_KHR_get_physical_device_properties2 1 -#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_SPEC_VERSION 1 +#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_SPEC_VERSION 2 #define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME "VK_KHR_get_physical_device_properties2" typedef VkPhysicalDeviceFeatures2 VkPhysicalDeviceFeatures2KHR; @@ -5307,7 +6166,7 @@ #define VK_KHR_device_group 1 -#define VK_KHR_DEVICE_GROUP_SPEC_VERSION 3 +#define VK_KHR_DEVICE_GROUP_SPEC_VERSION 4 #define VK_KHR_DEVICE_GROUP_EXTENSION_NAME "VK_KHR_device_group" typedef VkPeerMemoryFeatureFlags VkPeerMemoryFeatureFlagsKHR; @@ -5585,14 +6444,9 @@ #define VK_KHR_shader_float16_int8 1 #define VK_KHR_SHADER_FLOAT16_INT8_SPEC_VERSION 1 #define VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME "VK_KHR_shader_float16_int8" -typedef struct VkPhysicalDeviceShaderFloat16Int8FeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 shaderFloat16; - VkBool32 shaderInt8; -} VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; +typedef VkPhysicalDeviceShaderFloat16Int8Features VkPhysicalDeviceShaderFloat16Int8FeaturesKHR; -typedef VkPhysicalDeviceShaderFloat16Int8FeaturesKHR VkPhysicalDeviceFloat16Int8FeaturesKHR; +typedef VkPhysicalDeviceShaderFloat16Int8Features VkPhysicalDeviceFloat16Int8FeaturesKHR; @@ -5666,144 +6520,58 @@ #define VK_KHR_imageless_framebuffer 1 #define VK_KHR_IMAGELESS_FRAMEBUFFER_SPEC_VERSION 1 #define VK_KHR_IMAGELESS_FRAMEBUFFER_EXTENSION_NAME "VK_KHR_imageless_framebuffer" -typedef struct VkPhysicalDeviceImagelessFramebufferFeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 imagelessFramebuffer; -} VkPhysicalDeviceImagelessFramebufferFeaturesKHR; +typedef VkPhysicalDeviceImagelessFramebufferFeatures VkPhysicalDeviceImagelessFramebufferFeaturesKHR; -typedef struct VkFramebufferAttachmentImageInfoKHR { - VkStructureType sType; - const void* pNext; - VkImageCreateFlags flags; - VkImageUsageFlags usage; - uint32_t width; - uint32_t height; - uint32_t layerCount; - uint32_t viewFormatCount; - const VkFormat* pViewFormats; -} VkFramebufferAttachmentImageInfoKHR; +typedef VkFramebufferAttachmentsCreateInfo VkFramebufferAttachmentsCreateInfoKHR; -typedef struct VkFramebufferAttachmentsCreateInfoKHR { - VkStructureType sType; - const void* pNext; - uint32_t attachmentImageInfoCount; - const VkFramebufferAttachmentImageInfoKHR* pAttachmentImageInfos; -} VkFramebufferAttachmentsCreateInfoKHR; +typedef VkFramebufferAttachmentImageInfo VkFramebufferAttachmentImageInfoKHR; -typedef struct VkRenderPassAttachmentBeginInfoKHR { - VkStructureType sType; - const void* pNext; - uint32_t attachmentCount; - const VkImageView* pAttachments; -} VkRenderPassAttachmentBeginInfoKHR; +typedef VkRenderPassAttachmentBeginInfo VkRenderPassAttachmentBeginInfoKHR; #define VK_KHR_create_renderpass2 1 #define VK_KHR_CREATE_RENDERPASS_2_SPEC_VERSION 1 #define VK_KHR_CREATE_RENDERPASS_2_EXTENSION_NAME "VK_KHR_create_renderpass2" -typedef struct VkAttachmentDescription2KHR { - VkStructureType sType; - const void* pNext; - VkAttachmentDescriptionFlags flags; - VkFormat format; - VkSampleCountFlagBits samples; - VkAttachmentLoadOp loadOp; - VkAttachmentStoreOp storeOp; - VkAttachmentLoadOp stencilLoadOp; - VkAttachmentStoreOp stencilStoreOp; - VkImageLayout initialLayout; - VkImageLayout finalLayout; -} VkAttachmentDescription2KHR; +typedef VkRenderPassCreateInfo2 VkRenderPassCreateInfo2KHR; -typedef struct VkAttachmentReference2KHR { - VkStructureType sType; - const void* pNext; - uint32_t attachment; - VkImageLayout layout; - VkImageAspectFlags aspectMask; -} VkAttachmentReference2KHR; +typedef VkAttachmentDescription2 VkAttachmentDescription2KHR; -typedef struct VkSubpassDescription2KHR { - VkStructureType sType; - const void* pNext; - VkSubpassDescriptionFlags flags; - VkPipelineBindPoint pipelineBindPoint; - uint32_t viewMask; - uint32_t inputAttachmentCount; - const VkAttachmentReference2KHR* pInputAttachments; - uint32_t colorAttachmentCount; - const VkAttachmentReference2KHR* pColorAttachments; - const VkAttachmentReference2KHR* pResolveAttachments; - const VkAttachmentReference2KHR* pDepthStencilAttachment; - uint32_t preserveAttachmentCount; - const uint32_t* pPreserveAttachments; -} VkSubpassDescription2KHR; +typedef VkAttachmentReference2 VkAttachmentReference2KHR; -typedef struct VkSubpassDependency2KHR { - VkStructureType sType; - const void* pNext; - uint32_t srcSubpass; - uint32_t dstSubpass; - VkPipelineStageFlags srcStageMask; - VkPipelineStageFlags dstStageMask; - VkAccessFlags srcAccessMask; - VkAccessFlags dstAccessMask; - VkDependencyFlags dependencyFlags; - int32_t viewOffset; -} VkSubpassDependency2KHR; +typedef VkSubpassDescription2 VkSubpassDescription2KHR; -typedef struct VkRenderPassCreateInfo2KHR { - VkStructureType sType; - const void* pNext; - VkRenderPassCreateFlags flags; - uint32_t attachmentCount; - const VkAttachmentDescription2KHR* pAttachments; - uint32_t subpassCount; - const VkSubpassDescription2KHR* pSubpasses; - uint32_t dependencyCount; - const VkSubpassDependency2KHR* pDependencies; - uint32_t correlatedViewMaskCount; - const uint32_t* pCorrelatedViewMasks; -} VkRenderPassCreateInfo2KHR; +typedef VkSubpassDependency2 VkSubpassDependency2KHR; -typedef struct VkSubpassBeginInfoKHR { - VkStructureType sType; - const void* pNext; - VkSubpassContents contents; -} VkSubpassBeginInfoKHR; +typedef VkSubpassBeginInfo VkSubpassBeginInfoKHR; -typedef struct VkSubpassEndInfoKHR { - VkStructureType sType; - const void* pNext; -} VkSubpassEndInfoKHR; +typedef VkSubpassEndInfo VkSubpassEndInfoKHR; -typedef VkResult (VKAPI_PTR *PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); -typedef void (VKAPI_PTR *PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo); -typedef void (VKAPI_PTR *PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo); -typedef void (VKAPI_PTR *PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo); +typedef VkResult (VKAPI_PTR *PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); +typedef void (VKAPI_PTR *PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfo* pSubpassBeginInfo); +typedef void (VKAPI_PTR *PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo* pSubpassBeginInfo, const VkSubpassEndInfo* pSubpassEndInfo); +typedef void (VKAPI_PTR *PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfo* pSubpassEndInfo); #ifndef VK_NO_PROTOTYPES VKAPI_ATTR VkResult VKAPI_CALL vkCreateRenderPass2KHR( VkDevice device, - const VkRenderPassCreateInfo2KHR* pCreateInfo, + const VkRenderPassCreateInfo2* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); VKAPI_ATTR void VKAPI_CALL vkCmdBeginRenderPass2KHR( VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, - const VkSubpassBeginInfoKHR* pSubpassBeginInfo); + const VkSubpassBeginInfo* pSubpassBeginInfo); VKAPI_ATTR void VKAPI_CALL vkCmdNextSubpass2KHR( VkCommandBuffer commandBuffer, - const VkSubpassBeginInfoKHR* pSubpassBeginInfo, - const VkSubpassEndInfoKHR* pSubpassEndInfo); + const VkSubpassBeginInfo* pSubpassBeginInfo, + const VkSubpassEndInfo* pSubpassEndInfo); VKAPI_ATTR void VKAPI_CALL vkCmdEndRenderPass2KHR( VkCommandBuffer commandBuffer, - const VkSubpassEndInfoKHR* pSubpassEndInfo); + const VkSubpassEndInfo* pSubpassEndInfo); #endif @@ -5895,6 +6663,153 @@ #endif +#define VK_KHR_performance_query 1 +#define VK_KHR_PERFORMANCE_QUERY_SPEC_VERSION 1 +#define VK_KHR_PERFORMANCE_QUERY_EXTENSION_NAME "VK_KHR_performance_query" + +typedef enum VkPerformanceCounterUnitKHR { + VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR = 0, + VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR = 1, + VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR = 2, + VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR = 3, + VK_PERFORMANCE_COUNTER_UNIT_BYTES_PER_SECOND_KHR = 4, + VK_PERFORMANCE_COUNTER_UNIT_KELVIN_KHR = 5, + VK_PERFORMANCE_COUNTER_UNIT_WATTS_KHR = 6, + VK_PERFORMANCE_COUNTER_UNIT_VOLTS_KHR = 7, + VK_PERFORMANCE_COUNTER_UNIT_AMPS_KHR = 8, + VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR = 9, + VK_PERFORMANCE_COUNTER_UNIT_CYCLES_KHR = 10, + VK_PERFORMANCE_COUNTER_UNIT_BEGIN_RANGE_KHR = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + VK_PERFORMANCE_COUNTER_UNIT_END_RANGE_KHR = VK_PERFORMANCE_COUNTER_UNIT_CYCLES_KHR, + VK_PERFORMANCE_COUNTER_UNIT_RANGE_SIZE_KHR = (VK_PERFORMANCE_COUNTER_UNIT_CYCLES_KHR - VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR + 1), + VK_PERFORMANCE_COUNTER_UNIT_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPerformanceCounterUnitKHR; + +typedef enum VkPerformanceCounterScopeKHR { + VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR = 0, + VK_PERFORMANCE_COUNTER_SCOPE_RENDER_PASS_KHR = 1, + VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR = 2, + VK_QUERY_SCOPE_COMMAND_BUFFER_KHR = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR, + VK_QUERY_SCOPE_RENDER_PASS_KHR = VK_PERFORMANCE_COUNTER_SCOPE_RENDER_PASS_KHR, + VK_QUERY_SCOPE_COMMAND_KHR = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR, + VK_PERFORMANCE_COUNTER_SCOPE_BEGIN_RANGE_KHR = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR, + VK_PERFORMANCE_COUNTER_SCOPE_END_RANGE_KHR = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR, + VK_PERFORMANCE_COUNTER_SCOPE_RANGE_SIZE_KHR = (VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR - VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR + 1), + VK_PERFORMANCE_COUNTER_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPerformanceCounterScopeKHR; + +typedef enum VkPerformanceCounterStorageKHR { + VK_PERFORMANCE_COUNTER_STORAGE_INT32_KHR = 0, + VK_PERFORMANCE_COUNTER_STORAGE_INT64_KHR = 1, + VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR = 2, + VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR = 3, + VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR = 4, + VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR = 5, + VK_PERFORMANCE_COUNTER_STORAGE_BEGIN_RANGE_KHR = VK_PERFORMANCE_COUNTER_STORAGE_INT32_KHR, + VK_PERFORMANCE_COUNTER_STORAGE_END_RANGE_KHR = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR, + VK_PERFORMANCE_COUNTER_STORAGE_RANGE_SIZE_KHR = (VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR - VK_PERFORMANCE_COUNTER_STORAGE_INT32_KHR + 1), + VK_PERFORMANCE_COUNTER_STORAGE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPerformanceCounterStorageKHR; + +typedef enum VkPerformanceCounterDescriptionFlagBitsKHR { + VK_PERFORMANCE_COUNTER_DESCRIPTION_PERFORMANCE_IMPACTING_KHR = 0x00000001, + VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_KHR = 0x00000002, + VK_PERFORMANCE_COUNTER_DESCRIPTION_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkPerformanceCounterDescriptionFlagBitsKHR; +typedef VkFlags VkPerformanceCounterDescriptionFlagsKHR; + +typedef enum VkAcquireProfilingLockFlagBitsKHR { + VK_ACQUIRE_PROFILING_LOCK_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkAcquireProfilingLockFlagBitsKHR; +typedef VkFlags VkAcquireProfilingLockFlagsKHR; +typedef struct VkPhysicalDevicePerformanceQueryFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 performanceCounterQueryPools; + VkBool32 performanceCounterMultipleQueryPools; +} VkPhysicalDevicePerformanceQueryFeaturesKHR; + +typedef struct VkPhysicalDevicePerformanceQueryPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 allowCommandBufferQueryCopies; +} VkPhysicalDevicePerformanceQueryPropertiesKHR; + +typedef struct VkPerformanceCounterKHR { + VkStructureType sType; + const void* pNext; + VkPerformanceCounterUnitKHR unit; + VkPerformanceCounterScopeKHR scope; + VkPerformanceCounterStorageKHR storage; + uint8_t uuid[VK_UUID_SIZE]; +} VkPerformanceCounterKHR; + +typedef struct VkPerformanceCounterDescriptionKHR { + VkStructureType sType; + const void* pNext; + VkPerformanceCounterDescriptionFlagsKHR flags; + char name[VK_MAX_DESCRIPTION_SIZE]; + char category[VK_MAX_DESCRIPTION_SIZE]; + char description[VK_MAX_DESCRIPTION_SIZE]; +} VkPerformanceCounterDescriptionKHR; + +typedef struct VkQueryPoolPerformanceCreateInfoKHR { + VkStructureType sType; + const void* pNext; + uint32_t queueFamilyIndex; + uint32_t counterIndexCount; + const uint32_t* pCounterIndices; +} VkQueryPoolPerformanceCreateInfoKHR; + +typedef union VkPerformanceCounterResultKHR { + int32_t int32; + int64_t int64; + uint32_t uint32; + uint64_t uint64; + float float32; + double float64; +} VkPerformanceCounterResultKHR; + +typedef struct VkAcquireProfilingLockInfoKHR { + VkStructureType sType; + const void* pNext; + VkAcquireProfilingLockFlagsKHR flags; + uint64_t timeout; +} VkAcquireProfilingLockInfoKHR; + +typedef struct VkPerformanceQuerySubmitInfoKHR { + VkStructureType sType; + const void* pNext; + uint32_t counterPassIndex; +} VkPerformanceQuerySubmitInfoKHR; + +typedef VkResult (VKAPI_PTR *PFN_vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR)(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t* pCounterCount, VkPerformanceCounterKHR* pCounters, VkPerformanceCounterDescriptionKHR* pCounterDescriptions); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR)(VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, uint32_t* pNumPasses); +typedef VkResult (VKAPI_PTR *PFN_vkAcquireProfilingLockKHR)(VkDevice device, const VkAcquireProfilingLockInfoKHR* pInfo); +typedef void (VKAPI_PTR *PFN_vkReleaseProfilingLockKHR)(VkDevice device); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t* pCounterCount, + VkPerformanceCounterKHR* pCounters, + VkPerformanceCounterDescriptionKHR* pCounterDescriptions); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, + uint32_t* pNumPasses); + +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireProfilingLockKHR( + VkDevice device, + const VkAcquireProfilingLockInfoKHR* pInfo); + +VKAPI_ATTR void VKAPI_CALL vkReleaseProfilingLockKHR( + VkDevice device); +#endif + + #define VK_KHR_maintenance2 1 #define VK_KHR_MAINTENANCE2_SPEC_VERSION 1 #define VK_KHR_MAINTENANCE2_EXTENSION_NAME "VK_KHR_maintenance2" @@ -6080,19 +6995,14 @@ #define VK_KHR_image_format_list 1 #define VK_KHR_IMAGE_FORMAT_LIST_SPEC_VERSION 1 #define VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME "VK_KHR_image_format_list" -typedef struct VkImageFormatListCreateInfoKHR { - VkStructureType sType; - const void* pNext; - uint32_t viewFormatCount; - const VkFormat* pViewFormats; -} VkImageFormatListCreateInfoKHR; +typedef VkImageFormatListCreateInfo VkImageFormatListCreateInfoKHR; #define VK_KHR_sampler_ycbcr_conversion 1 typedef VkSamplerYcbcrConversion VkSamplerYcbcrConversionKHR; -#define VK_KHR_SAMPLER_YCBCR_CONVERSION_SPEC_VERSION 1 +#define VK_KHR_SAMPLER_YCBCR_CONVERSION_SPEC_VERSION 14 #define VK_KHR_SAMPLER_YCBCR_CONVERSION_EXTENSION_NAME "VK_KHR_sampler_ycbcr_conversion" typedef VkSamplerYcbcrModelConversion VkSamplerYcbcrModelConversionKHR; @@ -6196,139 +7106,71 @@ #endif +#define VK_KHR_shader_subgroup_extended_types 1 +#define VK_KHR_SHADER_SUBGROUP_EXTENDED_TYPES_SPEC_VERSION 1 +#define VK_KHR_SHADER_SUBGROUP_EXTENDED_TYPES_EXTENSION_NAME "VK_KHR_shader_subgroup_extended_types" +typedef VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR; + + + #define VK_KHR_8bit_storage 1 #define VK_KHR_8BIT_STORAGE_SPEC_VERSION 1 #define VK_KHR_8BIT_STORAGE_EXTENSION_NAME "VK_KHR_8bit_storage" -typedef struct VkPhysicalDevice8BitStorageFeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 storageBuffer8BitAccess; - VkBool32 uniformAndStorageBuffer8BitAccess; - VkBool32 storagePushConstant8; -} VkPhysicalDevice8BitStorageFeaturesKHR; +typedef VkPhysicalDevice8BitStorageFeatures VkPhysicalDevice8BitStorageFeaturesKHR; #define VK_KHR_shader_atomic_int64 1 #define VK_KHR_SHADER_ATOMIC_INT64_SPEC_VERSION 1 #define VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME "VK_KHR_shader_atomic_int64" -typedef struct VkPhysicalDeviceShaderAtomicInt64FeaturesKHR { +typedef VkPhysicalDeviceShaderAtomicInt64Features VkPhysicalDeviceShaderAtomicInt64FeaturesKHR; + + + +#define VK_KHR_shader_clock 1 +#define VK_KHR_SHADER_CLOCK_SPEC_VERSION 1 +#define VK_KHR_SHADER_CLOCK_EXTENSION_NAME "VK_KHR_shader_clock" +typedef struct VkPhysicalDeviceShaderClockFeaturesKHR { VkStructureType sType; void* pNext; - VkBool32 shaderBufferInt64Atomics; - VkBool32 shaderSharedInt64Atomics; -} VkPhysicalDeviceShaderAtomicInt64FeaturesKHR; + VkBool32 shaderSubgroupClock; + VkBool32 shaderDeviceClock; +} VkPhysicalDeviceShaderClockFeaturesKHR; #define VK_KHR_driver_properties 1 -#define VK_MAX_DRIVER_NAME_SIZE_KHR 256 -#define VK_MAX_DRIVER_INFO_SIZE_KHR 256 #define VK_KHR_DRIVER_PROPERTIES_SPEC_VERSION 1 #define VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME "VK_KHR_driver_properties" +#define VK_MAX_DRIVER_NAME_SIZE_KHR VK_MAX_DRIVER_NAME_SIZE +#define VK_MAX_DRIVER_INFO_SIZE_KHR VK_MAX_DRIVER_INFO_SIZE +typedef VkDriverId VkDriverIdKHR; -typedef enum VkDriverIdKHR { - VK_DRIVER_ID_AMD_PROPRIETARY_KHR = 1, - VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR = 2, - VK_DRIVER_ID_MESA_RADV_KHR = 3, - VK_DRIVER_ID_NVIDIA_PROPRIETARY_KHR = 4, - VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS_KHR = 5, - VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR = 6, - VK_DRIVER_ID_IMAGINATION_PROPRIETARY_KHR = 7, - VK_DRIVER_ID_QUALCOMM_PROPRIETARY_KHR = 8, - VK_DRIVER_ID_ARM_PROPRIETARY_KHR = 9, - VK_DRIVER_ID_GOOGLE_SWIFTSHADER_KHR = 10, - VK_DRIVER_ID_GGP_PROPRIETARY_KHR = 11, - VK_DRIVER_ID_BROADCOM_PROPRIETARY_KHR = 12, - VK_DRIVER_ID_BEGIN_RANGE_KHR = VK_DRIVER_ID_AMD_PROPRIETARY_KHR, - VK_DRIVER_ID_END_RANGE_KHR = VK_DRIVER_ID_BROADCOM_PROPRIETARY_KHR, - VK_DRIVER_ID_RANGE_SIZE_KHR = (VK_DRIVER_ID_BROADCOM_PROPRIETARY_KHR - VK_DRIVER_ID_AMD_PROPRIETARY_KHR + 1), - VK_DRIVER_ID_MAX_ENUM_KHR = 0x7FFFFFFF -} VkDriverIdKHR; -typedef struct VkConformanceVersionKHR { - uint8_t major; - uint8_t minor; - uint8_t subminor; - uint8_t patch; -} VkConformanceVersionKHR; +typedef VkConformanceVersion VkConformanceVersionKHR; -typedef struct VkPhysicalDeviceDriverPropertiesKHR { - VkStructureType sType; - void* pNext; - VkDriverIdKHR driverID; - char driverName[VK_MAX_DRIVER_NAME_SIZE_KHR]; - char driverInfo[VK_MAX_DRIVER_INFO_SIZE_KHR]; - VkConformanceVersionKHR conformanceVersion; -} VkPhysicalDeviceDriverPropertiesKHR; +typedef VkPhysicalDeviceDriverProperties VkPhysicalDeviceDriverPropertiesKHR; #define VK_KHR_shader_float_controls 1 #define VK_KHR_SHADER_FLOAT_CONTROLS_SPEC_VERSION 4 #define VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME "VK_KHR_shader_float_controls" +typedef VkShaderFloatControlsIndependence VkShaderFloatControlsIndependenceKHR; -typedef enum VkShaderFloatControlsIndependenceKHR { - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR = 0, - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL_KHR = 1, - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR = 2, - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_BEGIN_RANGE_KHR = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR, - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_END_RANGE_KHR = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR, - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_RANGE_SIZE_KHR = (VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR + 1), - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_MAX_ENUM_KHR = 0x7FFFFFFF -} VkShaderFloatControlsIndependenceKHR; -typedef struct VkPhysicalDeviceFloatControlsPropertiesKHR { - VkStructureType sType; - void* pNext; - VkShaderFloatControlsIndependenceKHR denormBehaviorIndependence; - VkShaderFloatControlsIndependenceKHR roundingModeIndependence; - VkBool32 shaderSignedZeroInfNanPreserveFloat16; - VkBool32 shaderSignedZeroInfNanPreserveFloat32; - VkBool32 shaderSignedZeroInfNanPreserveFloat64; - VkBool32 shaderDenormPreserveFloat16; - VkBool32 shaderDenormPreserveFloat32; - VkBool32 shaderDenormPreserveFloat64; - VkBool32 shaderDenormFlushToZeroFloat16; - VkBool32 shaderDenormFlushToZeroFloat32; - VkBool32 shaderDenormFlushToZeroFloat64; - VkBool32 shaderRoundingModeRTEFloat16; - VkBool32 shaderRoundingModeRTEFloat32; - VkBool32 shaderRoundingModeRTEFloat64; - VkBool32 shaderRoundingModeRTZFloat16; - VkBool32 shaderRoundingModeRTZFloat32; - VkBool32 shaderRoundingModeRTZFloat64; -} VkPhysicalDeviceFloatControlsPropertiesKHR; +typedef VkPhysicalDeviceFloatControlsProperties VkPhysicalDeviceFloatControlsPropertiesKHR; #define VK_KHR_depth_stencil_resolve 1 #define VK_KHR_DEPTH_STENCIL_RESOLVE_SPEC_VERSION 1 #define VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME "VK_KHR_depth_stencil_resolve" +typedef VkResolveModeFlagBits VkResolveModeFlagBitsKHR; -typedef enum VkResolveModeFlagBitsKHR { - VK_RESOLVE_MODE_NONE_KHR = 0, - VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR = 0x00000001, - VK_RESOLVE_MODE_AVERAGE_BIT_KHR = 0x00000002, - VK_RESOLVE_MODE_MIN_BIT_KHR = 0x00000004, - VK_RESOLVE_MODE_MAX_BIT_KHR = 0x00000008, - VK_RESOLVE_MODE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkResolveModeFlagBitsKHR; -typedef VkFlags VkResolveModeFlagsKHR; -typedef struct VkSubpassDescriptionDepthStencilResolveKHR { - VkStructureType sType; - const void* pNext; - VkResolveModeFlagBitsKHR depthResolveMode; - VkResolveModeFlagBitsKHR stencilResolveMode; - const VkAttachmentReference2KHR* pDepthStencilResolveAttachment; -} VkSubpassDescriptionDepthStencilResolveKHR; +typedef VkResolveModeFlags VkResolveModeFlagsKHR; -typedef struct VkPhysicalDeviceDepthStencilResolvePropertiesKHR { - VkStructureType sType; - void* pNext; - VkResolveModeFlagsKHR supportedDepthResolveModes; - VkResolveModeFlagsKHR supportedStencilResolveModes; - VkBool32 independentResolveNone; - VkBool32 independentResolve; -} VkPhysicalDeviceDepthStencilResolvePropertiesKHR; +typedef VkSubpassDescriptionDepthStencilResolve VkSubpassDescriptionDepthStencilResolveKHR; + +typedef VkPhysicalDeviceDepthStencilResolveProperties VkPhysicalDeviceDepthStencilResolvePropertiesKHR; @@ -6337,18 +7179,59 @@ #define VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME "VK_KHR_swapchain_mutable_format" +#define VK_KHR_timeline_semaphore 1 +#define VK_KHR_TIMELINE_SEMAPHORE_SPEC_VERSION 2 +#define VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME "VK_KHR_timeline_semaphore" +typedef VkSemaphoreType VkSemaphoreTypeKHR; + +typedef VkSemaphoreWaitFlagBits VkSemaphoreWaitFlagBitsKHR; + +typedef VkSemaphoreWaitFlags VkSemaphoreWaitFlagsKHR; + +typedef VkPhysicalDeviceTimelineSemaphoreFeatures VkPhysicalDeviceTimelineSemaphoreFeaturesKHR; + +typedef VkPhysicalDeviceTimelineSemaphoreProperties VkPhysicalDeviceTimelineSemaphorePropertiesKHR; + +typedef VkSemaphoreTypeCreateInfo VkSemaphoreTypeCreateInfoKHR; + +typedef VkTimelineSemaphoreSubmitInfo VkTimelineSemaphoreSubmitInfoKHR; + +typedef VkSemaphoreWaitInfo VkSemaphoreWaitInfoKHR; + +typedef VkSemaphoreSignalInfo VkSemaphoreSignalInfoKHR; + +typedef VkResult (VKAPI_PTR *PFN_vkGetSemaphoreCounterValueKHR)(VkDevice device, VkSemaphore semaphore, uint64_t* pValue); +typedef VkResult (VKAPI_PTR *PFN_vkWaitSemaphoresKHR)(VkDevice device, const VkSemaphoreWaitInfo* pWaitInfo, uint64_t timeout); +typedef VkResult (VKAPI_PTR *PFN_vkSignalSemaphoreKHR)(VkDevice device, const VkSemaphoreSignalInfo* pSignalInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreCounterValueKHR( + VkDevice device, + VkSemaphore semaphore, + uint64_t* pValue); + +VKAPI_ATTR VkResult VKAPI_CALL vkWaitSemaphoresKHR( + VkDevice device, + const VkSemaphoreWaitInfo* pWaitInfo, + uint64_t timeout); + +VKAPI_ATTR VkResult VKAPI_CALL vkSignalSemaphoreKHR( + VkDevice device, + const VkSemaphoreSignalInfo* pSignalInfo); +#endif + + #define VK_KHR_vulkan_memory_model 1 #define VK_KHR_VULKAN_MEMORY_MODEL_SPEC_VERSION 3 #define VK_KHR_VULKAN_MEMORY_MODEL_EXTENSION_NAME "VK_KHR_vulkan_memory_model" -typedef struct VkPhysicalDeviceVulkanMemoryModelFeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 vulkanMemoryModel; - VkBool32 vulkanMemoryModelDeviceScope; - VkBool32 vulkanMemoryModelAvailabilityVisibilityChains; -} VkPhysicalDeviceVulkanMemoryModelFeaturesKHR; +typedef VkPhysicalDeviceVulkanMemoryModelFeatures VkPhysicalDeviceVulkanMemoryModelFeaturesKHR; + +#define VK_KHR_spirv_1_4 1 +#define VK_KHR_SPIRV_1_4_SPEC_VERSION 1 +#define VK_KHR_SPIRV_1_4_EXTENSION_NAME "VK_KHR_spirv_1_4" + #define VK_KHR_surface_protected_capabilities 1 #define VK_KHR_SURFACE_PROTECTED_CAPABILITIES_SPEC_VERSION 1 @@ -6361,15 +7244,54 @@ +#define VK_KHR_separate_depth_stencil_layouts 1 +#define VK_KHR_SEPARATE_DEPTH_STENCIL_LAYOUTS_SPEC_VERSION 1 +#define VK_KHR_SEPARATE_DEPTH_STENCIL_LAYOUTS_EXTENSION_NAME "VK_KHR_separate_depth_stencil_layouts" +typedef VkPhysicalDeviceSeparateDepthStencilLayoutsFeatures VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR; + +typedef VkAttachmentReferenceStencilLayout VkAttachmentReferenceStencilLayoutKHR; + +typedef VkAttachmentDescriptionStencilLayout VkAttachmentDescriptionStencilLayoutKHR; + + + #define VK_KHR_uniform_buffer_standard_layout 1 #define VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_SPEC_VERSION 1 #define VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME "VK_KHR_uniform_buffer_standard_layout" -typedef struct VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR { - VkStructureType sType; - void* pNext; - VkBool32 uniformBufferStandardLayout; -} VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR; +typedef VkPhysicalDeviceUniformBufferStandardLayoutFeatures VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR; + + + +#define VK_KHR_buffer_device_address 1 +#define VK_KHR_BUFFER_DEVICE_ADDRESS_SPEC_VERSION 1 +#define VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME "VK_KHR_buffer_device_address" +typedef VkPhysicalDeviceBufferDeviceAddressFeatures VkPhysicalDeviceBufferDeviceAddressFeaturesKHR; + +typedef VkBufferDeviceAddressInfo VkBufferDeviceAddressInfoKHR; + +typedef VkBufferOpaqueCaptureAddressCreateInfo VkBufferOpaqueCaptureAddressCreateInfoKHR; + +typedef VkMemoryOpaqueCaptureAddressAllocateInfo VkMemoryOpaqueCaptureAddressAllocateInfoKHR; + +typedef VkDeviceMemoryOpaqueCaptureAddressInfo VkDeviceMemoryOpaqueCaptureAddressInfoKHR; +typedef VkDeviceAddress (VKAPI_PTR *PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfo* pInfo); +typedef uint64_t (VKAPI_PTR *PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfo* pInfo); +typedef uint64_t (VKAPI_PTR *PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetBufferDeviceAddressKHR( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo); + +VKAPI_ATTR uint64_t VKAPI_CALL vkGetBufferOpaqueCaptureAddressKHR( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo); + +VKAPI_ATTR uint64_t VKAPI_CALL vkGetDeviceMemoryOpaqueCaptureAddressKHR( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo); +#endif #define VK_KHR_pipeline_executable_properties 1 @@ -6811,7 +7733,7 @@ #define VK_AMD_draw_indirect_count 1 -#define VK_AMD_DRAW_INDIRECT_COUNT_SPEC_VERSION 1 +#define VK_AMD_DRAW_INDIRECT_COUNT_SPEC_VERSION 2 #define VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME "VK_AMD_draw_indirect_count" typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); @@ -6988,7 +7910,7 @@ #define VK_EXT_validation_flags 1 -#define VK_EXT_VALIDATION_FLAGS_SPEC_VERSION 1 +#define VK_EXT_VALIDATION_FLAGS_SPEC_VERSION 2 #define VK_EXT_VALIDATION_FLAGS_EXTENSION_NAME "VK_EXT_validation_flags" typedef enum VkValidationCheckEXT { @@ -7023,7 +7945,7 @@ #define VK_EXT_TEXTURE_COMPRESSION_ASTC_HDR_EXTENSION_NAME "VK_EXT_texture_compression_astc_hdr" typedef struct VkPhysicalDeviceTextureCompressionASTCHDRFeaturesEXT { VkStructureType sType; - const void* pNext; + void* pNext; VkBool32 textureCompressionASTC_HDR; } VkPhysicalDeviceTextureCompressionASTCHDRFeaturesEXT; @@ -7047,7 +7969,7 @@ #define VK_EXT_conditional_rendering 1 -#define VK_EXT_CONDITIONAL_RENDERING_SPEC_VERSION 1 +#define VK_EXT_CONDITIONAL_RENDERING_SPEC_VERSION 2 #define VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME "VK_EXT_conditional_rendering" typedef enum VkConditionalRenderingFlagBitsEXT { @@ -7679,7 +8601,7 @@ #define VK_EXT_hdr_metadata 1 -#define VK_EXT_HDR_METADATA_SPEC_VERSION 1 +#define VK_EXT_HDR_METADATA_SPEC_VERSION 2 #define VK_EXT_HDR_METADATA_EXTENSION_NAME "VK_EXT_hdr_metadata" typedef struct VkXYColorEXT { float x; @@ -7863,30 +8785,13 @@ #define VK_EXT_sampler_filter_minmax 1 -#define VK_EXT_SAMPLER_FILTER_MINMAX_SPEC_VERSION 1 +#define VK_EXT_SAMPLER_FILTER_MINMAX_SPEC_VERSION 2 #define VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME "VK_EXT_sampler_filter_minmax" +typedef VkSamplerReductionMode VkSamplerReductionModeEXT; -typedef enum VkSamplerReductionModeEXT { - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT = 0, - VK_SAMPLER_REDUCTION_MODE_MIN_EXT = 1, - VK_SAMPLER_REDUCTION_MODE_MAX_EXT = 2, - VK_SAMPLER_REDUCTION_MODE_BEGIN_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT, - VK_SAMPLER_REDUCTION_MODE_END_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_MAX_EXT, - VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE_EXT = (VK_SAMPLER_REDUCTION_MODE_MAX_EXT - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT + 1), - VK_SAMPLER_REDUCTION_MODE_MAX_ENUM_EXT = 0x7FFFFFFF -} VkSamplerReductionModeEXT; -typedef struct VkSamplerReductionModeCreateInfoEXT { - VkStructureType sType; - const void* pNext; - VkSamplerReductionModeEXT reductionMode; -} VkSamplerReductionModeCreateInfoEXT; +typedef VkSamplerReductionModeCreateInfo VkSamplerReductionModeCreateInfoEXT; -typedef struct VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT { - VkStructureType sType; - void* pNext; - VkBool32 filterMinmaxSingleComponentFormats; - VkBool32 filterMinmaxImageComponentMapping; -} VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT; +typedef VkPhysicalDeviceSamplerFilterMinmaxProperties VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT; @@ -8245,87 +9150,19 @@ #define VK_EXT_descriptor_indexing 1 #define VK_EXT_DESCRIPTOR_INDEXING_SPEC_VERSION 2 #define VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME "VK_EXT_descriptor_indexing" +typedef VkDescriptorBindingFlagBits VkDescriptorBindingFlagBitsEXT; -typedef enum VkDescriptorBindingFlagBitsEXT { - VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT_EXT = 0x00000001, - VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT_EXT = 0x00000002, - VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT_EXT = 0x00000004, - VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT = 0x00000008, - VK_DESCRIPTOR_BINDING_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF -} VkDescriptorBindingFlagBitsEXT; -typedef VkFlags VkDescriptorBindingFlagsEXT; -typedef struct VkDescriptorSetLayoutBindingFlagsCreateInfoEXT { - VkStructureType sType; - const void* pNext; - uint32_t bindingCount; - const VkDescriptorBindingFlagsEXT* pBindingFlags; -} VkDescriptorSetLayoutBindingFlagsCreateInfoEXT; +typedef VkDescriptorBindingFlags VkDescriptorBindingFlagsEXT; -typedef struct VkPhysicalDeviceDescriptorIndexingFeaturesEXT { - VkStructureType sType; - void* pNext; - VkBool32 shaderInputAttachmentArrayDynamicIndexing; - VkBool32 shaderUniformTexelBufferArrayDynamicIndexing; - VkBool32 shaderStorageTexelBufferArrayDynamicIndexing; - VkBool32 shaderUniformBufferArrayNonUniformIndexing; - VkBool32 shaderSampledImageArrayNonUniformIndexing; - VkBool32 shaderStorageBufferArrayNonUniformIndexing; - VkBool32 shaderStorageImageArrayNonUniformIndexing; - VkBool32 shaderInputAttachmentArrayNonUniformIndexing; - VkBool32 shaderUniformTexelBufferArrayNonUniformIndexing; - VkBool32 shaderStorageTexelBufferArrayNonUniformIndexing; - VkBool32 descriptorBindingUniformBufferUpdateAfterBind; - VkBool32 descriptorBindingSampledImageUpdateAfterBind; - VkBool32 descriptorBindingStorageImageUpdateAfterBind; - VkBool32 descriptorBindingStorageBufferUpdateAfterBind; - VkBool32 descriptorBindingUniformTexelBufferUpdateAfterBind; - VkBool32 descriptorBindingStorageTexelBufferUpdateAfterBind; - VkBool32 descriptorBindingUpdateUnusedWhilePending; - VkBool32 descriptorBindingPartiallyBound; - VkBool32 descriptorBindingVariableDescriptorCount; - VkBool32 runtimeDescriptorArray; -} VkPhysicalDeviceDescriptorIndexingFeaturesEXT; +typedef VkDescriptorSetLayoutBindingFlagsCreateInfo VkDescriptorSetLayoutBindingFlagsCreateInfoEXT; -typedef struct VkPhysicalDeviceDescriptorIndexingPropertiesEXT { - VkStructureType sType; - void* pNext; - uint32_t maxUpdateAfterBindDescriptorsInAllPools; - VkBool32 shaderUniformBufferArrayNonUniformIndexingNative; - VkBool32 shaderSampledImageArrayNonUniformIndexingNative; - VkBool32 shaderStorageBufferArrayNonUniformIndexingNative; - VkBool32 shaderStorageImageArrayNonUniformIndexingNative; - VkBool32 shaderInputAttachmentArrayNonUniformIndexingNative; - VkBool32 robustBufferAccessUpdateAfterBind; - VkBool32 quadDivergentImplicitLod; - uint32_t maxPerStageDescriptorUpdateAfterBindSamplers; - uint32_t maxPerStageDescriptorUpdateAfterBindUniformBuffers; - uint32_t maxPerStageDescriptorUpdateAfterBindStorageBuffers; - uint32_t maxPerStageDescriptorUpdateAfterBindSampledImages; - uint32_t maxPerStageDescriptorUpdateAfterBindStorageImages; - uint32_t maxPerStageDescriptorUpdateAfterBindInputAttachments; - uint32_t maxPerStageUpdateAfterBindResources; - uint32_t maxDescriptorSetUpdateAfterBindSamplers; - uint32_t maxDescriptorSetUpdateAfterBindUniformBuffers; - uint32_t maxDescriptorSetUpdateAfterBindUniformBuffersDynamic; - uint32_t maxDescriptorSetUpdateAfterBindStorageBuffers; - uint32_t maxDescriptorSetUpdateAfterBindStorageBuffersDynamic; - uint32_t maxDescriptorSetUpdateAfterBindSampledImages; - uint32_t maxDescriptorSetUpdateAfterBindStorageImages; - uint32_t maxDescriptorSetUpdateAfterBindInputAttachments; -} VkPhysicalDeviceDescriptorIndexingPropertiesEXT; +typedef VkPhysicalDeviceDescriptorIndexingFeatures VkPhysicalDeviceDescriptorIndexingFeaturesEXT; -typedef struct VkDescriptorSetVariableDescriptorCountAllocateInfoEXT { - VkStructureType sType; - const void* pNext; - uint32_t descriptorSetCount; - const uint32_t* pDescriptorCounts; -} VkDescriptorSetVariableDescriptorCountAllocateInfoEXT; +typedef VkPhysicalDeviceDescriptorIndexingProperties VkPhysicalDeviceDescriptorIndexingPropertiesEXT; -typedef struct VkDescriptorSetVariableDescriptorCountLayoutSupportEXT { - VkStructureType sType; - void* pNext; - uint32_t maxVariableDescriptorCount; -} VkDescriptorSetVariableDescriptorCountLayoutSupportEXT; +typedef VkDescriptorSetVariableDescriptorCountAllocateInfo VkDescriptorSetVariableDescriptorCountAllocateInfoEXT; + +typedef VkDescriptorSetVariableDescriptorCountLayoutSupport VkDescriptorSetVariableDescriptorCountLayoutSupportEXT; @@ -8446,6 +9283,15 @@ #define VK_NV_RAY_TRACING_EXTENSION_NAME "VK_NV_ray_tracing" #define VK_SHADER_UNUSED_NV (~0U) +typedef enum VkAccelerationStructureTypeNV { + VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV = 0, + VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV = 1, + VK_ACCELERATION_STRUCTURE_TYPE_BEGIN_RANGE_NV = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV, + VK_ACCELERATION_STRUCTURE_TYPE_END_RANGE_NV = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV, + VK_ACCELERATION_STRUCTURE_TYPE_RANGE_SIZE_NV = (VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV - VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV + 1), + VK_ACCELERATION_STRUCTURE_TYPE_MAX_ENUM_NV = 0x7FFFFFFF +} VkAccelerationStructureTypeNV; + typedef enum VkRayTracingShaderGroupTypeNV { VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_NV = 0, VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_NV = 1, @@ -8465,15 +9311,6 @@ VK_GEOMETRY_TYPE_MAX_ENUM_NV = 0x7FFFFFFF } VkGeometryTypeNV; -typedef enum VkAccelerationStructureTypeNV { - VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV = 0, - VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV = 1, - VK_ACCELERATION_STRUCTURE_TYPE_BEGIN_RANGE_NV = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV, - VK_ACCELERATION_STRUCTURE_TYPE_END_RANGE_NV = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV, - VK_ACCELERATION_STRUCTURE_TYPE_RANGE_SIZE_NV = (VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_NV - VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_NV + 1), - VK_ACCELERATION_STRUCTURE_TYPE_MAX_ENUM_NV = 0x7FFFFFFF -} VkAccelerationStructureTypeNV; - typedef enum VkCopyAccelerationStructureModeNV { VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_NV = 0, VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_NV = 1, @@ -8741,7 +9578,7 @@ #define VK_NV_representative_fragment_test 1 -#define VK_NV_REPRESENTATIVE_FRAGMENT_TEST_SPEC_VERSION 1 +#define VK_NV_REPRESENTATIVE_FRAGMENT_TEST_SPEC_VERSION 2 #define VK_NV_REPRESENTATIVE_FRAGMENT_TEST_EXTENSION_NAME "VK_NV_representative_fragment_test" typedef struct VkPhysicalDeviceRepresentativeFragmentTestFeaturesNV { VkStructureType sType; @@ -8758,7 +9595,7 @@ #define VK_EXT_filter_cubic 1 -#define VK_EXT_FILTER_CUBIC_SPEC_VERSION 2 +#define VK_EXT_FILTER_CUBIC_SPEC_VERSION 3 #define VK_EXT_FILTER_CUBIC_EXTENSION_NAME "VK_EXT_filter_cubic" typedef struct VkPhysicalDeviceImageViewImageFormatInfoEXT { VkStructureType sType; @@ -8770,7 +9607,7 @@ VkStructureType sType; void* pNext; VkBool32 filterCubic; - VkBool32 filterCubicMinmax ; + VkBool32 filterCubicMinmax; } VkFilterCubicImageViewImageFormatPropertiesEXT; @@ -8900,7 +9737,7 @@ #define VK_AMD_shader_core_properties 1 -#define VK_AMD_SHADER_CORE_PROPERTIES_SPEC_VERSION 1 +#define VK_AMD_SHADER_CORE_PROPERTIES_SPEC_VERSION 2 #define VK_AMD_SHADER_CORE_PROPERTIES_EXTENSION_NAME "VK_AMD_shader_core_properties" typedef struct VkPhysicalDeviceShaderCorePropertiesAMD { VkStructureType sType; @@ -9090,7 +9927,7 @@ #define VK_NV_shader_image_footprint 1 -#define VK_NV_SHADER_IMAGE_FOOTPRINT_SPEC_VERSION 1 +#define VK_NV_SHADER_IMAGE_FOOTPRINT_SPEC_VERSION 2 #define VK_NV_SHADER_IMAGE_FOOTPRINT_EXTENSION_NAME "VK_NV_shader_image_footprint" typedef struct VkPhysicalDeviceShaderImageFootprintFeaturesNV { VkStructureType sType; @@ -9389,11 +10226,7 @@ #define VK_EXT_scalar_block_layout 1 #define VK_EXT_SCALAR_BLOCK_LAYOUT_SPEC_VERSION 1 #define VK_EXT_SCALAR_BLOCK_LAYOUT_EXTENSION_NAME "VK_EXT_scalar_block_layout" -typedef struct VkPhysicalDeviceScalarBlockLayoutFeaturesEXT { - VkStructureType sType; - void* pNext; - VkBool32 scalarBlockLayout; -} VkPhysicalDeviceScalarBlockLayoutFeaturesEXT; +typedef VkPhysicalDeviceScalarBlockLayoutFeatures VkPhysicalDeviceScalarBlockLayoutFeaturesEXT; @@ -9451,6 +10284,17 @@ +#define VK_AMD_device_coherent_memory 1 +#define VK_AMD_DEVICE_COHERENT_MEMORY_SPEC_VERSION 1 +#define VK_AMD_DEVICE_COHERENT_MEMORY_EXTENSION_NAME "VK_AMD_device_coherent_memory" +typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD { + VkStructureType sType; + void* pNext; + VkBool32 deviceCoherentMemory; +} VkPhysicalDeviceCoherentMemoryFeaturesAMD; + + + #define VK_EXT_memory_budget 1 #define VK_EXT_MEMORY_BUDGET_SPEC_VERSION 1 #define VK_EXT_MEMORY_BUDGET_EXTENSION_NAME "VK_EXT_memory_budget" @@ -9492,7 +10336,6 @@ #define VK_EXT_buffer_device_address 1 -typedef uint64_t VkDeviceAddress; #define VK_EXT_BUFFER_DEVICE_ADDRESS_SPEC_VERSION 2 #define VK_EXT_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME "VK_EXT_buffer_device_address" typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesEXT { @@ -9505,11 +10348,7 @@ typedef VkPhysicalDeviceBufferDeviceAddressFeaturesEXT VkPhysicalDeviceBufferAddressFeaturesEXT; -typedef struct VkBufferDeviceAddressInfoEXT { - VkStructureType sType; - const void* pNext; - VkBuffer buffer; -} VkBufferDeviceAddressInfoEXT; +typedef VkBufferDeviceAddressInfo VkBufferDeviceAddressInfoEXT; typedef struct VkBufferDeviceAddressCreateInfoEXT { VkStructureType sType; @@ -9517,36 +10356,68 @@ VkDeviceAddress deviceAddress; } VkBufferDeviceAddressCreateInfoEXT; -typedef VkDeviceAddress (VKAPI_PTR *PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo); +typedef VkDeviceAddress (VKAPI_PTR *PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfo* pInfo); #ifndef VK_NO_PROTOTYPES VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetBufferDeviceAddressEXT( VkDevice device, - const VkBufferDeviceAddressInfoEXT* pInfo); + const VkBufferDeviceAddressInfo* pInfo); +#endif + + +#define VK_EXT_tooling_info 1 +#define VK_EXT_TOOLING_INFO_SPEC_VERSION 1 +#define VK_EXT_TOOLING_INFO_EXTENSION_NAME "VK_EXT_tooling_info" + +typedef enum VkToolPurposeFlagBitsEXT { + VK_TOOL_PURPOSE_VALIDATION_BIT_EXT = 0x00000001, + VK_TOOL_PURPOSE_PROFILING_BIT_EXT = 0x00000002, + VK_TOOL_PURPOSE_TRACING_BIT_EXT = 0x00000004, + VK_TOOL_PURPOSE_ADDITIONAL_FEATURES_BIT_EXT = 0x00000008, + VK_TOOL_PURPOSE_MODIFYING_FEATURES_BIT_EXT = 0x00000010, + VK_TOOL_PURPOSE_DEBUG_REPORTING_BIT_EXT = 0x00000020, + VK_TOOL_PURPOSE_DEBUG_MARKERS_BIT_EXT = 0x00000040, + VK_TOOL_PURPOSE_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkToolPurposeFlagBitsEXT; +typedef VkFlags VkToolPurposeFlagsEXT; +typedef struct VkPhysicalDeviceToolPropertiesEXT { + VkStructureType sType; + void* pNext; + char name[VK_MAX_EXTENSION_NAME_SIZE]; + char version[VK_MAX_EXTENSION_NAME_SIZE]; + VkToolPurposeFlagsEXT purposes; + char description[VK_MAX_DESCRIPTION_SIZE]; + char layer[VK_MAX_EXTENSION_NAME_SIZE]; +} VkPhysicalDeviceToolPropertiesEXT; + +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceToolPropertiesEXT)(VkPhysicalDevice physicalDevice, uint32_t* pToolCount, VkPhysicalDeviceToolPropertiesEXT* pToolProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceToolPropertiesEXT( + VkPhysicalDevice physicalDevice, + uint32_t* pToolCount, + VkPhysicalDeviceToolPropertiesEXT* pToolProperties); #endif #define VK_EXT_separate_stencil_usage 1 #define VK_EXT_SEPARATE_STENCIL_USAGE_SPEC_VERSION 1 #define VK_EXT_SEPARATE_STENCIL_USAGE_EXTENSION_NAME "VK_EXT_separate_stencil_usage" -typedef struct VkImageStencilUsageCreateInfoEXT { - VkStructureType sType; - const void* pNext; - VkImageUsageFlags stencilUsage; -} VkImageStencilUsageCreateInfoEXT; +typedef VkImageStencilUsageCreateInfo VkImageStencilUsageCreateInfoEXT; #define VK_EXT_validation_features 1 -#define VK_EXT_VALIDATION_FEATURES_SPEC_VERSION 1 +#define VK_EXT_VALIDATION_FEATURES_SPEC_VERSION 2 #define VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME "VK_EXT_validation_features" typedef enum VkValidationFeatureEnableEXT { VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0, VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT = 2, VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, - VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, - VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1), + VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, + VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1), VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF } VkValidationFeatureEnableEXT; @@ -9712,7 +10583,7 @@ #define VK_EXT_headless_surface 1 -#define VK_EXT_HEADLESS_SURFACE_SPEC_VERSION 0 +#define VK_EXT_HEADLESS_SURFACE_SPEC_VERSION 1 #define VK_EXT_HEADLESS_SURFACE_EXTENSION_NAME "VK_EXT_headless_surface" typedef VkFlags VkHeadlessSurfaceCreateFlagsEXT; typedef struct VkHeadlessSurfaceCreateInfoEXT { @@ -9785,11 +10656,7 @@ #define VK_EXT_host_query_reset 1 #define VK_EXT_HOST_QUERY_RESET_SPEC_VERSION 1 #define VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME "VK_EXT_host_query_reset" -typedef struct VkPhysicalDeviceHostQueryResetFeaturesEXT { - VkStructureType sType; - void* pNext; - VkBool32 hostQueryReset; -} VkPhysicalDeviceHostQueryResetFeaturesEXT; +typedef VkPhysicalDeviceHostQueryResetFeatures VkPhysicalDeviceHostQueryResetFeaturesEXT; typedef void (VKAPI_PTR *PFN_vkResetQueryPoolEXT)(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount); @@ -9843,6 +10710,11 @@ } VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT; + +#define VK_GOOGLE_user_type 1 +#define VK_GOOGLE_USER_TYPE_SPEC_VERSION 1 +#define VK_GOOGLE_USER_TYPE_EXTENSION_NAME "VK_GOOGLE_user_type" + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/include/vulkan/vulkan_fuchsia.h mesa-20.0.8/include/vulkan/vulkan_fuchsia.h --- mesa-19.2.8/include/vulkan/vulkan_fuchsia.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_fuchsia.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_FUCHSIA_H_ #define VULKAN_FUCHSIA_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_FUCHSIA_imagepipe_surface 1 #define VK_FUCHSIA_IMAGEPIPE_SURFACE_SPEC_VERSION 1 diff -Nru mesa-19.2.8/include/vulkan/vulkan_ggp.h mesa-20.0.8/include/vulkan/vulkan_ggp.h --- mesa-19.2.8/include/vulkan/vulkan_ggp.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_ggp.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_GGP_H_ #define VULKAN_GGP_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_GGP_stream_descriptor_surface 1 #define VK_GGP_STREAM_DESCRIPTOR_SURFACE_SPEC_VERSION 1 diff -Nru mesa-19.2.8/include/vulkan/vulkan.h mesa-20.0.8/include/vulkan/vulkan.h --- mesa-19.2.8/include/vulkan/vulkan.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan.h 2020-06-12 01:21:16.000000000 +0000 @@ -2,7 +2,7 @@ #define VULKAN_H_ 1 /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. diff -Nru mesa-19.2.8/include/vulkan/vulkan_ios.h mesa-20.0.8/include/vulkan/vulkan_ios.h --- mesa-19.2.8/include/vulkan/vulkan_ios.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_ios.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_IOS_H_ #define VULKAN_IOS_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_MVK_ios_surface 1 #define VK_MVK_IOS_SURFACE_SPEC_VERSION 2 diff -Nru mesa-19.2.8/include/vulkan/vulkan_macos.h mesa-20.0.8/include/vulkan/vulkan_macos.h --- mesa-19.2.8/include/vulkan/vulkan_macos.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_macos.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_MACOS_H_ #define VULKAN_MACOS_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_MVK_macos_surface 1 #define VK_MVK_MACOS_SURFACE_SPEC_VERSION 2 diff -Nru mesa-19.2.8/include/vulkan/vulkan_metal.h mesa-20.0.8/include/vulkan/vulkan_metal.h --- mesa-19.2.8/include/vulkan/vulkan_metal.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_metal.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_METAL_H_ #define VULKAN_METAL_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_EXT_metal_surface 1 diff -Nru mesa-19.2.8/include/vulkan/vulkan_vi.h mesa-20.0.8/include/vulkan/vulkan_vi.h --- mesa-19.2.8/include/vulkan/vulkan_vi.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_vi.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_VI_H_ #define VULKAN_VI_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_NN_vi_surface 1 #define VK_NN_VI_SURFACE_SPEC_VERSION 1 diff -Nru mesa-19.2.8/include/vulkan/vulkan_wayland.h mesa-20.0.8/include/vulkan/vulkan_wayland.h --- mesa-19.2.8/include/vulkan/vulkan_wayland.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_wayland.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_WAYLAND_H_ #define VULKAN_WAYLAND_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_KHR_wayland_surface 1 #define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 6 diff -Nru mesa-19.2.8/include/vulkan/vulkan_win32.h mesa-20.0.8/include/vulkan/vulkan_win32.h --- mesa-19.2.8/include/vulkan/vulkan_win32.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_win32.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_WIN32_H_ #define VULKAN_WIN32_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_KHR_win32_surface 1 #define VK_KHR_WIN32_SURFACE_SPEC_VERSION 6 @@ -246,7 +247,7 @@ #define VK_NV_win32_keyed_mutex 1 -#define VK_NV_WIN32_KEYED_MUTEX_SPEC_VERSION 1 +#define VK_NV_WIN32_KEYED_MUTEX_SPEC_VERSION 2 #define VK_NV_WIN32_KEYED_MUTEX_EXTENSION_NAME "VK_NV_win32_keyed_mutex" typedef struct VkWin32KeyedMutexAcquireReleaseInfoNV { VkStructureType sType; @@ -263,7 +264,7 @@ #define VK_EXT_full_screen_exclusive 1 -#define VK_EXT_FULL_SCREEN_EXCLUSIVE_SPEC_VERSION 3 +#define VK_EXT_FULL_SCREEN_EXCLUSIVE_SPEC_VERSION 4 #define VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME "VK_EXT_full_screen_exclusive" typedef enum VkFullScreenExclusiveEXT { diff -Nru mesa-19.2.8/include/vulkan/vulkan_xcb.h mesa-20.0.8/include/vulkan/vulkan_xcb.h --- mesa-19.2.8/include/vulkan/vulkan_xcb.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_xcb.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_XCB_H_ #define VULKAN_XCB_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_KHR_xcb_surface 1 #define VK_KHR_XCB_SURFACE_SPEC_VERSION 6 diff -Nru mesa-19.2.8/include/vulkan/vulkan_xlib.h mesa-20.0.8/include/vulkan/vulkan_xlib.h --- mesa-19.2.8/include/vulkan/vulkan_xlib.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_xlib.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_XLIB_H_ #define VULKAN_XLIB_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_KHR_xlib_surface 1 #define VK_KHR_XLIB_SURFACE_SPEC_VERSION 6 diff -Nru mesa-19.2.8/include/vulkan/vulkan_xlib_xrandr.h mesa-20.0.8/include/vulkan/vulkan_xlib_xrandr.h --- mesa-19.2.8/include/vulkan/vulkan_xlib_xrandr.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/include/vulkan/vulkan_xlib_xrandr.h 2020-06-12 01:21:16.000000000 +0000 @@ -1,12 +1,8 @@ #ifndef VULKAN_XLIB_XRANDR_H_ #define VULKAN_XLIB_XRANDR_H_ 1 -#ifdef __cplusplus -extern "C" { -#endif - /* -** Copyright (c) 2015-2019 The Khronos Group Inc. +** Copyright (c) 2015-2020 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -27,6 +23,11 @@ */ +#ifdef __cplusplus +extern "C" { +#endif + + #define VK_EXT_acquire_xlib_display 1 #define VK_EXT_ACQUIRE_XLIB_DISPLAY_SPEC_VERSION 1 diff -Nru mesa-19.2.8/.mailmap mesa-20.0.8/.mailmap --- mesa-19.2.8/.mailmap 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/.mailmap 2020-06-12 01:21:16.000000000 +0000 @@ -26,6 +26,8 @@ Alexander von Gluck IV Alexander von Gluck +Alexandros Frantzis + Alex Corscadden Alex Corscadden @@ -50,6 +52,8 @@ Arthur Huillet Arthur HUILLET +Bas Nieuwenhuizen + Benjamin Franzke ben Ben Skeggs @@ -129,8 +133,8 @@ David Miller Dave Miller David Miller davem69 -David Heidelberger David Heidelberg -David Heidelberger +David Heidelberg David Heidelberger +David Heidelberg David Reveman @@ -142,6 +146,8 @@ Edward O'Callaghan +Elie Tournier + Emeric Grange Emeric Emil Velikov @@ -154,6 +160,7 @@ Eric Anholt Eric Anholt Eric Engestrom +Eric Engestrom Eugeni Dodonov @@ -162,10 +169,14 @@ Feng, Haitao Haitao Feng +Frank Binns + Frank Henigman George Sapountzis George Sapountzis +Gert Wollny + Gwenole Beauchesne Hamish Marson hmarson @@ -184,6 +195,8 @@ Jakob Bornecrantz Jakob Bornecrantz Jakob Bornecrantz +Jakob Bornecrantz +Jakob Bornecrantz Jakub Bogusz @@ -328,6 +341,7 @@ Michel Dänzer Michel Dänzer Michel Daenzer Michel Dänzer Michel Daenzer +Michel Dänzer Mike Kaplinskiy Mike Kaplinksiy Mike Kaplinskiy @@ -453,6 +467,8 @@ Tom Stellard Tom Stellard Thomas Stellard +Tomeu Vizoso + Tormod Volden Török Edwin Török Edvin diff -Nru mesa-19.2.8/meson.build mesa-20.0.8/meson.build --- mesa-19.2.8/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -42,7 +42,7 @@ '-D__STDC_FORMAT_MACROS', '-D__STDC_LIMIT_MACROS', '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()), - '-DPACKAGE_BUGREPORT="https://gitlab.freedesktop.org/mesa/mesa/issues"', + '-DPACKAGE_BUGREPORT="https://gitlab.freedesktop.org/mesa/mesa/-/issues"', ] with_vulkan_icd_dir = get_option('vulkan-icd-dir') @@ -53,6 +53,7 @@ with_glx_direct = get_option('glx-direct') with_osmesa = get_option('osmesa') with_swr_arches = get_option('swr-arches') +with_vulkan_overlay_layer = get_option('vulkan-overlay-layer') with_tools = get_option('tools') if with_tools.contains('all') with_tools = [ @@ -69,6 +70,9 @@ ] endif +with_intel_tools = with_tools.contains('intel') or with_tools.contains('intel-ui') +with_imgui = with_intel_tools or with_vulkan_overlay_layer + dri_drivers_path = get_option('dri-drivers-path') if dri_drivers_path == '' dri_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'dri') @@ -89,7 +93,14 @@ endif endif with_opengl = get_option('opengl') -with_shared_glapi = get_option('shared-glapi') + +# Default shared glapi off for windows, on elsewhere. +_sg = get_option('shared-glapi') +if _sg == 'auto' + with_shared_glapi = host_machine.system() != 'windows' +else + with_shared_glapi = _sg == 'true' +endif # shared-glapi is required if at least two OpenGL APIs are being built if not with_shared_glapi @@ -115,7 +126,7 @@ with_gles2 = with_gles2 != 'false' with_any_opengl = with_opengl or with_gles1 or with_gles2 # Only build shared_glapi if at least one OpenGL API is enabled -with_shared_glapi = get_option('shared-glapi') and with_any_opengl +with_shared_glapi = with_shared_glapi and with_any_opengl system_has_kms_drm = ['openbsd', 'netbsd', 'freebsd', 'gnu/kfreebsd', 'dragonfly', 'linux', 'sunos'].contains(host_machine.system()) @@ -155,7 +166,8 @@ # TODO: PPC, Sparc if ['x86', 'x86_64'].contains(host_machine.cpu_family()) gallium_drivers = [ - 'r300', 'r600', 'radeonsi', 'nouveau', 'virgl', 'svga', 'swrast' + 'r300', 'r600', 'radeonsi', 'nouveau', 'virgl', 'svga', 'swrast', + 'iris' ] elif ['arm', 'aarch64'].contains(host_machine.cpu_family()) gallium_drivers = [ @@ -191,13 +203,10 @@ with_gallium_virgl = gallium_drivers.contains('virgl') with_gallium_swr = gallium_drivers.contains('swr') with_gallium_lima = gallium_drivers.contains('lima') +with_gallium_zink = gallium_drivers.contains('zink') -if cc.get_id() == 'intel' - if meson.version().version_compare('< 0.49.0') - error('Meson does not have sufficient support of ICC before 0.49.0 to compile mesa') - elif with_gallium_swr and meson.version().version_compare('== 0.49.0') - warning('Meson as of 0.49.0 is sufficient for compiling mesa with ICC, but there are some caveats with SWR. 0.49.1 should resolve all of these') - endif +if cc.get_id().startswith('intel') and meson.version().version_compare('< 0.49.1') + error('Meson does not have sufficient support of ICC before 0.49.1 to compile mesa') endif with_gallium = gallium_drivers.length() != 0 and gallium_drivers != [''] @@ -270,10 +279,12 @@ if _platforms.contains('auto') if system_has_kms_drm _platforms = ['x11', 'wayland', 'drm', 'surfaceless'] - elif ['darwin', 'windows', 'cygwin'].contains(host_machine.system()) + elif ['darwin', 'cygwin'].contains(host_machine.system()) _platforms = ['x11', 'surfaceless'] elif ['haiku'].contains(host_machine.system()) _platforms = ['haiku'] + elif host_machine.system() == 'windows' + _platforms = ['windows'] else error('Unknown OS @0@. Please pass -Dplatforms to set platforms. Patches gladly accepted to fix this.'.format( host_machine.system())) @@ -286,6 +297,7 @@ with_platform_drm = _platforms.contains('drm') with_platform_haiku = _platforms.contains('haiku') with_platform_surfaceless = _platforms.contains('surfaceless') +with_platform_windows = _platforms.contains('windows') with_platforms = false if _platforms.length() != 0 and _platforms != [''] @@ -310,6 +322,8 @@ with_glx = 'dri' elif with_platform_haiku with_glx = 'disabled' + elif host_machine.system() == 'windows' + with_glx = 'disabled' elif with_gallium # Even when building just gallium drivers the user probably wants dri with_glx = 'dri' @@ -378,7 +392,7 @@ endif # Android uses emutls for versions <= P/28. For USE_ELF_TLS we need ELF TLS. -if not with_platform_android or get_option('platform-sdk-version') >= 29 +if host_machine.system() != 'windows' and (not with_platform_android or get_option('platform-sdk-version') >= 29) pre_args += '-DUSE_ELF_TLS' endif @@ -406,7 +420,9 @@ with_glvnd = get_option('glvnd') if with_glvnd - if with_glx == 'xlib' or with_glx == 'gallium-xlib' + if with_platform_windows + error('glvnd cannot be used on Windows') + elif with_glx == 'xlib' or with_glx == 'gallium-xlib' error('Cannot build glvnd support for GLX that is not DRI based.') elif with_glx == 'disabled' and not with_egl error('glvnd requires DRI based GLX and/or EGL') @@ -420,6 +436,7 @@ with_vulkan_icd_dir = join_paths(get_option('datadir'), 'vulkan/icd.d') endif +# GNU/Hurd includes egl_dri2, without drm. with_dri2 = (with_dri or with_any_vk) and (with_dri_platform == 'drm' or host_machine.system() == 'gnu') _dri3 = get_option('dri3') @@ -438,8 +455,6 @@ endif endif -prog_pkgconfig = find_program('pkg-config') - _vdpau = get_option('gallium-vdpau') if not system_has_kms_drm if _vdpau == 'true' @@ -466,9 +481,7 @@ if _vdpau != 'false' dep_vdpau = dependency('vdpau', version : '>= 1.1', required : _vdpau == 'true') if dep_vdpau.found() - dep_vdpau = declare_dependency( - compile_args : run_command(prog_pkgconfig, ['vdpau', '--cflags']).stdout().split() - ) + dep_vdpau = dep_vdpau.partial_dependency(compile_args : true) with_gallium_vdpau = true endif endif @@ -481,6 +494,10 @@ vdpau_drivers_path = join_paths(get_option('libdir'), 'vdpau') endif +if with_gallium_zink + dep_vulkan = dependency('vulkan') +endif + _xvmc = get_option('gallium-xvmc') if not system_has_kms_drm if _xvmc == 'true' @@ -614,9 +631,7 @@ if _va != 'false' dep_va = dependency('libva', version : '>= 0.38.0', required : _va == 'true') if dep_va.found() - dep_va_headers = declare_dependency( - compile_args : run_command(prog_pkgconfig, ['libva', '--cflags']).stdout().split() - ) + dep_va_headers = dep_va.partial_dependency(compile_args : true) with_gallium_va = true endif endif @@ -700,6 +715,16 @@ with_gallium_opencl = true with_opencl_icd = _opencl == 'icd' + with_opencl_spirv = get_option('opencl-spirv') + if with_opencl_spirv + dep_spirv_tools = dependency('SPIRV-Tools', required : true, version : '>= 2018.0') + # LLVMSPIRVLib is available at https://github.com/KhronosGroup/SPIRV-LLVM-Translator + dep_llvmspirvlib = dependency('LLVMSPIRVLib', required : true, version : '>= 0.2.1') + else + dep_spirv_tools = null_dep + dep_llvmspirvlib = null_dep + endif + if host_machine.cpu_family().startswith('ppc') and cpp.compiles(''' #if !defined(__VEC__) || !defined(__ALTIVEC__) #error "AltiVec not enabled" @@ -709,8 +734,11 @@ endif else dep_clc = null_dep + dep_spirv_tools = null_dep + dep_llvmspirvlib = null_dep with_gallium_opencl = false with_opencl_icd = false + with_opencl_spirv = false endif gl_pkgconfig_c_flags = [] @@ -734,8 +762,8 @@ endif endif else - pre_args += '-DMESA_EGL_NO_X11_HEADERS' - gl_pkgconfig_c_flags += '-DMESA_EGL_NO_X11_HEADERS' + pre_args += '-DEGL_NO_X11' + gl_pkgconfig_c_flags += '-DEGL_NO_X11' endif if with_platform_drm if with_egl and not with_gbm @@ -856,76 +884,160 @@ endif # TODO: this is very incomplete -if ['linux', 'cygwin', 'gnu', 'gnu/kfreebsd'].contains(host_machine.system()) +if ['linux', 'cygwin', 'gnu', 'freebsd', 'gnu/kfreebsd'].contains(host_machine.system()) pre_args += '-D_GNU_SOURCE' elif host_machine.system() == 'sunos' pre_args += '-D__EXTENSIONS__' +elif host_machine.system() == 'windows' + pre_args += [ + '-D_WINDOWS', '-D_WIN32_WINNT=0x0601', '-DWINVER=0x0601', + '-DPIPE_SUBSYSTEM_WINDOWS_USER', + '-D_USE_MATH_DEFINES', # XXX: scons doesn't use this for mingw + ] + if cc.get_id() == 'msvc' + pre_args += [ + '-DVC_EXTRALEAN', + '-D_CRT_SECURE_NO_WARNINGS', + '-D_CRT_SECURE_NO_DEPRECATE', + '-D_SCL_SECURE_NO_WARNINGS', + '-D_SCL_SECURE_NO_DEPRECATE', + '-D_ALLOW_KEYWORD_MACROS', + '-D_HAS_EXCEPTIONS=0', # Tell C++ STL to not use exceptions + ] + else + pre_args += ['-D__MSVCRT_VERSION__=0x0700'] + endif endif # Check for generic C arguments c_args = [] -foreach a : ['-Werror=implicit-function-declaration', - '-Werror=missing-prototypes', '-Werror=return-type', - '-Werror=incompatible-pointer-types', - '-Werror=format', - '-Wformat-security', - '-Wno-missing-field-initializers', - '-Wno-format-truncation', - '-fno-math-errno', - '-fno-trapping-math', '-Qunused-arguments'] - if cc.has_argument(a) - c_args += a +c_vis_args = [] +c_msvc_compat_args = [] +no_override_init_args = [] +cpp_args = [] +cpp_vis_args = [] +cpp_msvc_compat_args = [] +if cc.get_id() == 'msvc' + foreach a : ['/wd4018', # signed/unsigned mismatch + '/wd4056', # overflow in floating-point constant arithmetic + '/wd4244', # conversion from 'type1' to 'type2', possible loss of data + '/wd4267', # 'var' : conversion from 'size_t' to 'type', possible loss of data + '/wd4305', # trancation from 'type1' to 'type2' + '/wd4351', # new behavior: elements of array 'array' will be default initialized + '/wd4756', # overflow in constant arithmetic + '/wd4800', # forcing value to bool 'true' or 'false' (performance warning) + '/wd4996', # disabled deprecated POSIX name warnings + '/wd4291'] # no matching operator delete found + if cc.has_argument(a) + c_args += a + endif + if cpp.has_argument(a) + cpp_args += a + endif + endforeach + if cc.has_argument('-Wmicrosoft-enum-value') # Clang + c_args += '-Wno-microsoft-enum-value' + cpp_args += '-Wno-microsoft-enum-value' endif -endforeach +else + _trial = [ + '-Werror=implicit-function-declaration', + '-Werror=missing-prototypes', + '-Werror=return-type', + '-Werror=empty-body', + '-Werror=incompatible-pointer-types', + '-Werror=int-conversion', + '-Wno-missing-field-initializers', + '-Wno-format-truncation', + '-fno-math-errno', + '-fno-trapping-math', + '-Qunused-arguments', + ] + # MinGW chokes on format specifiers and I can't get it all working + if not (cc.get_id() == 'gcc' and host_machine.system() == 'windows') + _trial += ['-Werror=format', '-Wformat-security'] + endif + foreach a : _trial + if cc.has_argument(a) + c_args += a + endif + endforeach -c_vis_args = [] -if cc.has_argument('-fvisibility=hidden') - c_vis_args += '-fvisibility=hidden' -endif + _trial = [ + '-Werror=return-type', + '-Werror=empty-body', + '-Wno-non-virtual-dtor', + '-Wno-missing-field-initializers', + '-Wno-format-truncation', + '-fno-math-errno', + '-fno-trapping-math', + '-Qunused-arguments', + # Some classes use custom new operator which zeroes memory, however + # gcc does aggressive dead-store elimination which threats all writes + # to the memory before the constructor as "dead stores". + # For now we disable this optimization. + '-flifetime-dse=1', + ] + # MinGW chokes on format specifiers and I can't get it all working + if not (cc.get_id() == 'gcc' and host_machine.system() == 'windows') + _trial += ['-Werror=format', '-Wformat-security'] + endif + foreach a : _trial + if cpp.has_argument(a) + cpp_args += a + endif + endforeach -# Check for generic C++ arguments -cpp_args = [] -foreach a : ['-Werror=return-type', - '-Werror=format', - '-Wformat-security', - '-Wno-non-virtual-dtor', - '-Wno-missing-field-initializers', - '-Wno-format-truncation', - '-fno-math-errno', '-fno-trapping-math', - '-Qunused-arguments'] - if cpp.has_argument(a) - cpp_args += a + foreach a : ['-Wno-override-init', '-Wno-initializer-overrides'] + if cc.has_argument(a) + no_override_init_args += a + endif + endforeach + + if cc.has_argument('-fvisibility=hidden') + c_vis_args += '-fvisibility=hidden' endif -endforeach -no_override_init_args = [] -foreach a : ['-Wno-override-init', - '-Wno-initializer-overrides'] - if cc.has_argument(a) - no_override_init_args += a + # Check for C and C++ arguments for MSVC compatibility. These are only used + # in parts of the mesa code base that need to compile with MSVC, mainly + # common code + foreach a : ['-Werror=pointer-arith', '-Werror=vla', '-Werror=gnu-empty-initializer'] + if cc.has_argument(a) + c_msvc_compat_args += a + endif + if cpp.has_argument(a) + cpp_msvc_compat_args += a + endif + endforeach + + if cpp.has_argument('-fvisibility=hidden') + cpp_vis_args += '-fvisibility=hidden' endif -endforeach -cpp_vis_args = [] -if cpp.has_argument('-fvisibility=hidden') - cpp_vis_args += '-fvisibility=hidden' endif -# Check for C and C++ arguments for MSVC2013 compatibility. These are only used -# in parts of the mesa code base that need to compile with old versions of -# MSVC, mainly common code -c_msvc_compat_args = [] -cpp_msvc_compat_args = [] -foreach a : ['-Werror=pointer-arith', '-Werror=vla'] - if cc.has_argument(a) - c_msvc_compat_args += a - endif - if cpp.has_argument(a) - cpp_msvc_compat_args += a +# set linker arguments +if host_machine.system() == 'windows' + if cc.get_id() == 'msvc' + add_project_link_arguments( + '/fixed:no', + '/incremental:no', + '/dynamicbase', + '/nxcompat', + language : ['c', 'cpp'], + ) + else + add_project_link_arguments( + '-Wl,--nxcompat', + '-Wl,--dynamicbase', + '-static-libgcc', + '-static-libstdc++', + language : ['c', 'cpp'], + ) endif -endforeach +endif -if host_machine.cpu_family().startswith('x86') +if host_machine.cpu_family().startswith('x86') and cc.get_id() != 'msvc' pre_args += '-DUSE_SSE41' with_sse41 = true sse41_args = ['-msse4.1'] @@ -983,6 +1095,8 @@ pre_args += '-DMISSING_64BIT_ATOMICS' endif +dep_ws2_32 = cc.find_library('ws2_32', required : with_platform_windows) + # TODO: shared/static? Is this even worth doing? with_asm_arch = '' @@ -1024,9 +1138,14 @@ endif # Check for standard headers and functions -if cc.has_header_symbol('sys/sysmacros.h', 'major') +if (cc.has_header_symbol('sys/sysmacros.h', 'major') and + cc.has_header_symbol('sys/sysmacros.h', 'minor') and + cc.has_header_symbol('sys/sysmacros.h', 'makedev')) pre_args += '-DMAJOR_IN_SYSMACROS' -elif cc.has_header_symbol('sys/mkdev.h', 'major') +endif +if (cc.has_header_symbol('sys/mkdev.h', 'major') and + cc.has_header_symbol('sys/mkdev.h', 'minor') and + cc.has_header_symbol('sys/mkdev.h', 'makedev')) pre_args += '-DMAJOR_IN_MKDEV' endif @@ -1036,7 +1155,7 @@ endif endforeach -foreach f : ['strtof', 'mkostemp', 'posix_memalign', 'timespec_get', 'memfd_create', 'random_r', 'flock'] +foreach f : ['strtof', 'mkostemp', 'timespec_get', 'memfd_create', 'random_r', 'flock', 'strtok_r'] if cc.has_function(f) pre_args += '-DHAVE_@0@'.format(f.to_upper()) endif @@ -1049,6 +1168,21 @@ error('Intel tools require the program_invocation_name variable') endif +# MinGW provides a __builtin_posix_memalign function, but not a posix_memalign. +# This means that this check will succeed, but then compilation will later +# fail. MSVC doesn't have this function at all, so only check for it on +# non-windows platforms. +if host_machine.system() != 'windows' + if cc.has_function('posix_memalign') + pre_args += '-DHAVE_POSIX_MEMALIGN' + endif +endif + +if cc.has_member('struct dirent', 'd_type', prefix: '''#include + #include ''') + pre_args += '-DHAVE_DIRENT_D_TYPE' +endif + # strtod locale support if cc.links(''' #define _GNU_SOURCE @@ -1095,16 +1229,13 @@ name : 'dynamic-list') with_ld_dynamic_list = true endif -ld_args_build_id = [] -if build_machine.system() != 'darwin' - ld_args_build_id += '-Wl,--build-id=sha1' -endif + +ld_args_build_id = cc.get_supported_link_arguments('-Wl,--build-id=sha1') # check for dl support -if cc.has_function('dlopen') - dep_dl = null_dep -else - dep_dl = cc.find_library('dl') +dep_dl = null_dep +if not cc.has_function('dlopen') + dep_dl = cc.find_library('dl', required : host_machine.system() != 'windows') endif if cc.has_function('dladdr', dependencies : dep_dl) # This is really only required for megadrivers @@ -1120,15 +1251,26 @@ endif # Determine whether or not the rt library is needed for time functions -if cc.has_function('clock_gettime') +if host_machine.system() == 'windows' or cc.has_function('clock_gettime') dep_clock = null_dep else dep_clock = cc.find_library('rt') endif # TODO: some of these may be conditional -dep_zlib = dependency('zlib', version : '>= 1.2.3') +dep_zlib = dependency('zlib', version : '>= 1.2.3', fallback : ['zlib', 'zlib_dep']) pre_args += '-DHAVE_ZLIB' + +_zstd = get_option('zstd') +if _zstd != 'false' + dep_zstd = dependency('libzstd', required : _zstd == 'true') + if dep_zstd.found() + pre_args += '-DHAVE_ZSTD' + endif +else + dep_zstd = null_dep +endif + dep_thread = dependency('threads') if dep_thread.found() and host_machine.system() != 'windows' pre_args += '-DHAVE_PTHREAD' @@ -1146,7 +1288,11 @@ pre_args += '-DPTHREAD_SETAFFINITY_IN_NP_HEADER' endif endif -dep_expat = dependency('expat') +if host_machine.system() != 'windows' + dep_expat = dependency('expat', fallback : ['expat', 'expat_dep']) +else + dep_expat = null_dep +endif # this only exists on linux so either this is linux and it will be found, or # it's not linux and wont dep_m = cc.find_library('m', required : false) @@ -1160,7 +1306,7 @@ dep_libdrm_nouveau = null_dep dep_libdrm_intel = null_dep -_drm_amdgpu_ver = '2.4.99' +_drm_amdgpu_ver = '2.4.100' _drm_radeon_ver = '2.4.71' _drm_nouveau_ver = '2.4.66' _drm_intel_ver = '2.4.75' @@ -1212,7 +1358,8 @@ with_gallium_drisw_kms = false dep_libdrm = dependency( 'libdrm', version : '>=' + _drm_ver, - required : with_dri2 or with_dri3 + # GNU/Hurd includes egl_dri2, without drm. + required : (with_dri2 and host_machine.system() != 'gnu') or with_dri3 ) if dep_libdrm.found() pre_args += '-DHAVE_LIBDRM' @@ -1221,8 +1368,8 @@ endif endif -llvm_modules = ['bitwriter', 'engine', 'mcdisassembler', 'mcjit'] -llvm_optional_modules = [] +llvm_modules = ['bitwriter', 'engine', 'mcdisassembler', 'mcjit', 'core', 'executionengine', 'scalaropts', 'transformutils', 'instcombine'] +llvm_optional_modules = ['coroutines'] if with_amd_vk or with_gallium_radeonsi or with_gallium_r600 llvm_modules += ['amdgpu', 'native', 'bitreader', 'ipo'] if with_gallium_r600 @@ -1234,22 +1381,29 @@ 'all-targets', 'linker', 'coverage', 'instrumentation', 'ipo', 'irreader', 'lto', 'option', 'objcarcopts', 'profiledata', ] - llvm_optional_modules += ['coroutines'] endif if with_amd_vk or with_gallium_radeonsi - _llvm_version = '>= 7.0.0' + _llvm_version = '>= 8.0.0' elif with_gallium_swr _llvm_version = '>= 6.0.0' -elif with_gallium_opencl or with_gallium_r600 - _llvm_version = '>= 3.9.0' else - _llvm_version = '>= 3.3.0' + _llvm_version = '>= 3.9.0' endif _shared_llvm = get_option('shared-llvm') - _llvm = get_option('llvm') + +# the cmake method can only link statically, so don't attempt to use it if we +# want to link dynamically. Before 0.54.0 meson will try cmake even when shared +# linking is requested, so we need to force the config-tool method to be used +# in that case, but in 0.54.0 meson won't try the cmake method if shared +# linking is requested. +_llvm_method = 'auto' +if meson.version().version_compare('< 0.54.0') and _shared_llvm + _llvm_method = 'config-tool' +endif + dep_llvm = null_dep with_llvm = false if _llvm != 'false' @@ -1263,27 +1417,41 @@ with_gallium_opencl or _llvm == 'true' ), static : not _shared_llvm, - method : 'config-tool', + method : _llvm_method, + fallback : ['llvm', 'dep_llvm'], ) with_llvm = dep_llvm.found() endif if with_llvm - _llvm_version = dep_llvm.version().split('.') - pre_args += [ - '-DHAVE_LLVM=0x0@0@0@1@'.format(_llvm_version[0], _llvm_version[1]), - '-DMESA_LLVM_VERSION_STRING="@0@"'.format(dep_llvm.version()), - ] + pre_args += '-DLLVM_AVAILABLE' + pre_args += '-DMESA_LLVM_VERSION_STRING="@0@"'.format(dep_llvm.version()) # LLVM can be built without rtti, turning off rtti changes the ABI of C++ # programs, so we need to build all C++ code in mesa without rtti as well to # ensure that linking works. - if dep_llvm.get_configtool_variable('has-rtti') == 'NO' + # + # In meson 0.51.0 we can use cmake to find LLVM in addittion to meson's + # builtin llvm-config based finder. A new generic variable getter method + # has also been added, so we'll use that if we can, to cover the cmake case. + if dep_llvm.type_name() == 'internal' + _rtti = subproject('llvm').get_variable('has_rtti', true) + elif meson.version().version_compare('>=0.51') + # The CMake finder will return 'ON', the llvm-config will return 'YES' + _rtti = ['ON', 'YES'].contains(dep_llvm.get_variable(cmake : 'LLVM_ENABLE_RTTI', configtool: 'has-rtti')) + else + _rtti = dep_llvm.get_configtool_variable('has-rtti') == 'YES' + endif + if not _rtti if with_gallium_nouveau error('The Nouveau driver requires rtti. You either need to turn off nouveau or use an LLVM built with LLVM_ENABLE_RTTI.') elif with_gallium_opencl error('The Clover OpenCL state tracker requires rtti, you need to turn off clover or use an LLVM built with LLVM_ENABLE_RTTI.') endif - cpp_args += '-fno-rtti' + if cc.get_id() == 'msvc' + cpp_args += '/GR-' + else + cpp_args += '-fno-rtti' + endif endif elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.') @@ -1302,12 +1470,8 @@ endif dep_glvnd = null_dep -glvnd_has_headers_and_pc_files = false if with_glvnd - dep_glvnd = dependency('libglvnd', version : '>= 0.2.0') - # GLVND before 1.2 was missing its pkg-config and header files, forcing every - # vendor to provide them and the distro maintainers to resolve the conflict. - glvnd_has_headers_and_pc_files = dep_glvnd.version().version_compare('>= 1.2.0') + dep_glvnd = dependency('libglvnd', version : '>= 1.2.0') pre_args += '-DUSE_LIBGLVND=1' endif @@ -1322,8 +1486,31 @@ # pthread stubs. Lets not and say we didn't -prog_bison = find_program('bison', required : with_any_opengl) -prog_flex = find_program('flex', required : with_any_opengl) +if host_machine.system() == 'windows' + # Prefer the winflexbison versions, they're much easier to install and have + # better windows support. + + prog_flex = find_program('win_flex', required : false) + if prog_flex.found() + # windows compatibility (uses instead of and _isatty, + # _fileno functions) + prog_flex = [prog_flex, '--wincompat'] + else + prog_flex = [find_program('lex', 'flex', required : with_any_opengl)] + endif + # Force flex to use const keyword in prototypes, as relies on __cplusplus or + # __STDC__ macro to determine whether it's safe to use const keyword, but + # MSVC never defines __STDC__ unless we disable all MSVC extensions. + prog_flex += '-DYY_USE_CONST=' + + prog_bison = find_program('win_bison', required : false) + if not prog_bison.found() + prog_bison = find_program('yacc', 'bison', required : with_any_opengl) + endif +else + prog_bison = find_program('bison', required : with_any_opengl) + prog_flex = find_program('flex', required : with_any_opengl) +endif dep_selinux = null_dep if get_option('selinux') @@ -1344,7 +1531,11 @@ if with_osmesa == 'gallium' and not with_gallium_softpipe error('OSMesa gallium requires gallium softpipe or llvmpipe.') endif - osmesa_lib_name = 'OSMesa' + if host_machine.system() == 'windows' + osmesa_lib_name = 'osmesa' + else + osmesa_lib_name = 'OSMesa' + endif osmesa_bits = get_option('osmesa-bits') if osmesa_bits != '8' if with_dri or with_glx != 'disabled' @@ -1372,8 +1563,7 @@ dep_wayland_server = dependency('wayland-server', version : '>=1.11') if with_egl dep_wayland_egl = dependency('wayland-egl-backend', version : '>= 3') - dep_wayland_egl_headers = declare_dependency( - compile_args : run_command(prog_pkgconfig, ['wayland-egl-backend', '--cflags']).stdout().split()) + dep_wayland_egl_headers = dep_wayland_egl.partial_dependency(compile_args : true) endif wayland_dmabuf_xml = join_paths( dep_wl_protocols.get_pkgconfig_variable('pkgdatadir'), 'unstable', @@ -1512,7 +1702,15 @@ pkg = import('pkgconfig') -prog_nm = find_program('nm', required : false) +if host_machine.system() == 'windows' + prog_dumpbin = find_program('dumpbin', required : false) + with_symbols_check = prog_dumpbin.found() and with_tests + symbols_check_args = ['--dumpbin', prog_dumpbin.path()] +else + prog_nm = find_program('nm') + with_symbols_check = with_tests + symbols_check_args = ['--nm', prog_nm.path()] +endif # This quirk needs to be applied to sources with functions defined in assembly # as GCC LTO drops them. See: https://bugs.freedesktop.org/show_bug.cgi?id=109391 diff -Nru mesa-19.2.8/meson_options.txt mesa-20.0.8/meson_options.txt --- mesa-19.2.8/meson_options.txt 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/meson_options.txt 2020-06-12 01:21:16.000000000 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2017-2018 Intel Corporation +# Copyright © 2017-2019 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,7 @@ value : ['auto'], choices : [ '', 'auto', 'x11', 'wayland', 'drm', 'surfaceless', 'haiku', 'android', + 'windows', ], description : 'window systems to support. If this is set to `auto`, all platforms applicable will be enabled.' ) @@ -60,7 +61,7 @@ choices : [ '', 'auto', 'kmsro', 'radeonsi', 'r300', 'r600', 'nouveau', 'freedreno', 'swrast', 'v3d', 'vc4', 'etnaviv', 'tegra', 'i915', 'svga', 'virgl', - 'swr', 'panfrost', 'iris', 'lima' + 'swr', 'panfrost', 'iris', 'lima', 'zink' ], description : 'List of gallium drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built' ) @@ -143,6 +144,12 @@ description : 'build gallium "clover" OpenCL state tracker.', ) option( + 'opencl-spirv', + type : 'boolean', + value : false, + description : 'build gallium "clover" OpenCL state tracker with SPIR-V binary support.', +) +option( 'd3d-drivers-path', type : 'string', value : '', @@ -176,9 +183,10 @@ ) option( 'shared-glapi', - type : 'boolean', - value : true, - description : 'Whether to build a shared or static glapi' + type : 'combo', + value : 'auto', + choices : ['auto', 'true', 'false'], + description : 'Whether to build a shared or static glapi. Defaults to false on Windows, true elsewhere' ) option( 'gles1', @@ -307,10 +315,17 @@ description : 'Architectures to build SWR support for.', ) option( + 'shared-swr', + type : 'boolean', + value : true, + description : 'Whether to link SWR shared or statically.', +) + +option( 'tools', type : 'array', value : [], - choices : ['drm-shim', 'etnaviv', 'freedreno', 'glsl', 'intel', 'intel-ui', 'nir', 'nouveau', 'xvmc', 'lima', 'all'], + choices : ['drm-shim', 'etnaviv', 'freedreno', 'glsl', 'intel', 'intel-ui', 'nir', 'nouveau', 'xvmc', 'lima', 'panfrost', 'all'], description : 'List of tools to build. (Note: `intel-ui` selects `intel`)', ) option( @@ -339,6 +354,12 @@ value : false, description : 'Allow work-in-progress freedreno vulkan driver to be enabled', ) +option( + 'prefer-iris', + type : 'boolean', + value : true, + description : 'Prefer new Intel iris driver over older i965 driver' +) option('egl-lib-suffix', type : 'string', value : '', @@ -358,3 +379,10 @@ value : 25, description : 'Android Platform SDK version. Default: Nougat version.' ) +option( + 'zstd', + type : 'combo', + choices : ['auto', 'true', 'false'], + value : 'auto', + description : 'Use ZSTD instead of ZLIB in some cases.' +) diff -Nru mesa-19.2.8/.pick_status.json mesa-20.0.8/.pick_status.json --- mesa-19.2.8/.pick_status.json 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/.pick_status.json 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,44300 @@ +[ + { + "sha": "0795241dde1507e0c6a3f9ef07c281ad4f2acf7b", + "description": "radeonsi: require LLVM 11 for gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9538b9a68ed9aa0f8a231d6bf681f6f0a2a9d341", + "description": "radeonsi: add support for Sienna Cichlid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "789cdab3b6188aa8c075eb311dbd8c05d6531d3e", + "description": "ac: align num_vgprs for gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cc4bfbe01bd27298833623977d050e2a80c5c94", + "description": "radeonsi: don't set any XNACK options on gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "430d384c311468c6180a1d67ed90bb74d4fe1c3b", + "description": "radeonsi: set BIG_PAGE fields on gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7edf15ad4762eac3bfbcfbf1c8ec8ea4d955c6d6", + "description": "radeonsi: move L2_CACHE_CONTROL registers into si_emit_framebuffer_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "788696c7b29a5e7de8659ae4a3faedf94fe2b9cd", + "description": "radeonsi: implement R9G9B9E5 render target and image store support on gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a54bcb9429666fcbe38c04660cc4b3f8abbde259", + "description": "radeonsi: enable larger SDMA clears and copies on gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c4b5fd9ab096a0bb5106b93191b13c81cc32243b", + "description": "radeonsi: honor a user-specified pitch on gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "abe89e13294d98de65207dc9e91d49494748399f", + "description": "ac/surface: add displayable DCC code for gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a23802bcb9a42a02d34a5a36d6e66d6532813a0d", + "description": "ac,radeonsi: start adding support for gfx10.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1602516d77e7cfb1bf97f8c1298a3a346313ff8", + "description": "ac,radeonsi: replace == GFX10 with >= GFX10 where it's needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ceaf848c564d74bcee14f0cd30b298aef86bd42b", + "description": "radeonsi: enable ARB_sparse_buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "334204823eee0f5d308fedbebab75228354ec07a", + "description": "tu: Fix context faults loading unused descriptor sets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a751051248d445c3d726a3eab8fc999b0876364e", + "description": "i965: Work around incorrect usage of glDrawRangeElements in UE4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "487aa807bd1b70602fcb6fbdabd101d4cff7c07b", + "description": "tu: Rewrite flushing to use barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29abf49886d6f3a0118e47dba97eb3abd84e7b82", + "description": "tu: Remove useless event_write helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4f6a9be9f639d106055597f21a814b87eb5997b", + "description": "tu: Don't actually track seqno's for events", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfb176a0acf2326d36d4867fc43751e1b7d0d66f", + "description": "tu: Remove useless post-binning flushes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18c067f9f099d54eee1d5713b24ecca52295987c", + "description": "panfrost: Mark PIPE_BUFFER BOs as not renderable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db57624c0ca693fae38871787cabab50e58358d7", + "description": "winsys/radeon: do not cast bo->va as void*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "839bc2daa93f88becb0494c8b1564bf40350a162", + "description": "ci: use separate docker images for baremetal builds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "408b36a11dfe0bf3cbfda114c058ce7682b90483", + "description": "ci: add arm_test-base docker image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7923c74d42d4d745e6e5b2b572ac558f2b9a1ed", + "description": "radv/llvm: expose VK_EXT_shader_demote_to_helper_invocation with LLVM 9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d76e8131ac4c115b7a750c9f127f9fa920a9d094", + "description": "glthread: sync in glFlush for multiple contexts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90c34aed1d2f814ff8baca87b338d250257ae1d0", + "description": "gallium/u_vbuf: add a faster path for uploading non-interleaved attribs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88e8f1a38d838753542461cea56d1c1b1a5cfc5d", + "description": "gallium/u_vbuf: get rid of some pointer dereferences", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6c747e8e0b982c707b1ff73edd1087a1c5e1db2", + "description": "nir: use bitfield_insert instead of bfi in nir_lower_double_ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b4f0eadc189a1fc4607947c2f432f111a2dc20d", + "description": "turnip: fix VFD_CONTROL for binning pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab72c07aefdac3dfcc6010ed357122d982771eee", + "description": "turnip: use common emit_xs_cntl to fill a6xx_sp_xs_ctrl_reg0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e16608e2338ded844a850c6242b08f65f43c1aed", + "description": "turnip: fix HW binning with geometry shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ac4d778fae0cdfa356c7ebe0db88987ccb35fa5", + "description": "turnip: correctly emit non-binning vs in transform feedback case", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6cc95abb273a130fb396f4f0dd2b233c534fd008", + "description": "freedreno/a6xx: use nonbinning VS when GS is used", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "0eebedb6190fdab8956769b2485180cc4a07119a" + }, + { + "sha": "88d5917cc1c5bd0dec26147a8779b50e94e56dd1", + "description": "turnip: clamp sampler minLod/maxLod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fecd83a0e8e8e19c5cceaf892cfad0f1b9c1dbc9", + "description": "turnip: update some properties based on blob driver", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c26c9eed8b14fb9935ae228f654004d6952d867", + "description": "turnip: move HLSQ_UPDATE_CNTL write to before xs config writes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1f6d2f3e8683f10773e7b349ab4ff6f181233a6", + "description": "nir: Fix logic that ends combine barrier sequence", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "345b5847b42bc1889d8665ebd129913550da4352" + }, + { + "sha": "fe214d60bc9e5245bf41c86146036fc61f4535e7", + "description": "intel/fs: Add Fall-through comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5bb4b1ee8c662fb2681b68c965756adbfb0df67", + "description": "spirv: Memory semantics is optional for OpControlBarrier", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "3ed2123d77d50ccb984fccdcc1cfa936a18819bf" + }, + { + "sha": "b7a3821a5cdf158659b6453c9ca97bb92270263f", + "description": "nir: Fix printing execution scope of a scoped barrier", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "345b5847b42bc1889d8665ebd129913550da4352" + }, + { + "sha": "7ec25820875ff1da0c2f5dab60cf5dda9eb51d30", + "description": "etnaviv: drop translate_blend(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f1cf0e4915262c68e5fb8bd8e87fbd0af30dbe2", + "description": "glsl: inline functions with unsupported return type before converting to nir", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7e60d5a501f311fa5e7bca5335adc6013dc1aeb9" + }, + { + "sha": "43e69475ad8bb0edb9a454af690c3c8ac69af866", + "description": "aco: use v_xor3_b32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1234faa7bf1519f575fc7d06b75fc4b249f54a49", + "description": "ac/gpu_info, radv: set max_wave64_per_simd to 20 on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86f21e4eba7ad980109f13bd5480c02593ca19fe", + "description": "nir/lower_explicit_io: fix NON_UNIFORM access for UBO loads", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b38d3cdceab07803a7dd81f8870a892fd82a077c", + "description": "nir/spirv/glsl450: increase asin(x) precision", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "008b0d1701a143df4a7e01f6c56d9ed66f68166c", + "description": "ac/nir: adjust an assertion for D16 on GFX6-GFX7", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c3e0ba52a0ac89c163ada8791151226b5a1b6efa" + }, + { + "sha": "b3beb6207f16ac55e3934b4d4d1f178adb4f4cad", + "description": "v3d_bufmgr: fix time_t printf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d512028d06c40ba56b642095379638b49ebf4a23", + "description": "pan_bo.h: add time.h include for time_t", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07ba5e47e6674b568219cb91ddbcece20fe9030d", + "description": "vc4_bufmgr: fix time_t printf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e43ab7bb05857461609ed2bd43703eb272a3ebe1", + "description": "glsl: fix potential slow compile times for GLSLOptimizeConservatively", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "0a5018c1a483abe6c4dddc6c65a7f4e939efc726" + }, + { + "sha": "dd81f4853c879c38987909f5e6e670b325f9f6af", + "description": "llvmpipe: do not enable tessellation shader without llvm coroutines support", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "eb5227173f0354aade28e46397a5bb69b2af69fb" + }, + { + "sha": "990b3782bc5f2f127345b975a68ac56aaf3e4674", + "description": "intel/compiler: fix Android build", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "689acc73989987667ad744026647acc35305839b" + }, + { + "sha": "6a841dbf4e4f0cb33bc36a8ba880a9bd6f6e7941", + "description": "intel/genxml: Migrate from deprecated xml.etree.ElementTree getchildren.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06650a771dc44b3f5628b6d172e224a103a79762", + "description": "android: svga: fix build for GL4.1 support", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ccb4ea5a43e89fcc93fff98c881639223f1538e5" + }, + { + "sha": "0570c7a7b5b00724c3cee0eb32ce042c362e2f96", + "description": "android: util: fix build for GL4.1 support", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "48a7456f4df53b94f0335f8b605ca2da9ed16d81" + }, + { + "sha": "faa339e666b4d0c4f88f2588cd84176e2b19ec0f", + "description": "Switch from cElementTree to ElementTree.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a61532e4dbdf4fbbf5822daeb5907efaf584a375", + "description": "Revert \"panfrost: Keep cached BOs mmap'd\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "794c239a990e08b5a42d42607e9c5e5b0921390c" + }, + { + "sha": "d97aaad1555ad25fd13af588aa02f9ff3e081e34", + "description": "pan/midgard: Use a signed value for checking inline constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bacb280a886905310c9b30c5af234c32ff582dc", + "description": "freedreno/ir3: Handle cases where we decide not to lower UBO 0 loads.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e349f502792e927a1acdeaf00e591878bd18c837", + "description": "freedreno/ir3: Drop the max_const on a6xx to 512.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "486b89430795cf12c0fe027fa070b75262337a18", + "description": "freedreno/ir3: Account for driver params in UBO max const upload.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a25347ab92bbe66c0f0c8def4d5f413f396b228d", + "description": "freedreno/ir3: Stop shifting UBO 1 down to be UBO 0.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e58ab09ffbd18355868000b2da90a5cd73b5c09", + "description": "freedreno/ir3: Drop unnecessary alignment of pushed UBO size.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07ec745014a9fd07a1948aa0f653b1a57b9b5e12", + "description": "freedreno/ir3: Stop pushing immediates once we've filled the constbuf.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab29f2da42f0c5da21edc4dad82e841f243ec680", + "description": "freedreno/ir3: Refactor ir3_cp's lower_immed().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4065861807f1f488f9e9dc0254009982ccbf7784", + "description": "freedreno: Upload gallium constbufs as needed when referenced as a UBO.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1f9d1e26a2aab71d6ba6edcf195b972f12cff80", + "description": "freedreno/a6xx: Add support for ALPHA_TO_ONE.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac1ab9294abd2eb24af8e810cd93b491ac22a8a1", + "description": "turnip: Add support for alphaToOne.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79f3003445e6578abc0563b421776851d6caa21f", + "description": "turnip: Use tu_cs_emit_regs() for BLEND_CONTROL.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d13c7477eb163c3d33aa7729e6bf0336d69156f", + "description": "radv: set keep_statistic_info with RADV_DEBUG=shaderstats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "981d07c74a1611d8c308a96f59899fff66674c1a", + "description": "intel: fix gen_sort_tags.py", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "903e142f0d35bc550ffde321987a5b6fca1095eb" + }, + { + "sha": "bfff330f061ff8789d7d26f164725af91cfd2c74", + "description": "radv/aco: enable VK_KHR_shader_subgroup_extended_types on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6391f9ab4cb2b6cb26b559bc33a8e8851af65745", + "description": "aco: fix nir_intrinsic_quad_* with 8-bit in GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1523b34c2aeebdf2952bfad4f0e40326fb2cc7c", + "description": "aco: fix sign-extend 8-bit subgroup operations on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee4bc13de2aacb7bab24a3e55e44e7e50434df94", + "description": "aco: use v_bfe_u32 for unsigned reductions sign-extension on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a874132cc41c2bffc8a547bbd8bc9074653912da", + "description": "intel/genxml: drop sort_xml.sh and move the loop directly in gen_sort_tags.py", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c67ef7695a2577cc3161deb158fa89f1e9dd0fbc", + "description": "radv: Use ac_surface to allocate aux surfaces.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63db31fdfc4fe85d3357d8d34cf461333c32b970", + "description": "amd/common: Add total alignment calculation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f70b57768346f113da1e0dc31759d48da64e98e8", + "description": "radv: Allocate values/predicates at the end of the image.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec671e871886d773e32385f7f62193836ea25e25", + "description": "radv: Disable HTILE in ac_surface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f84b4e2639246d23fe94b4688fd75e3d11662205", + "description": "radv: Disable DCC in ac_surface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81dee6cf8fa98028d98bf5bdd077a6add613ed6e", + "description": "radv: Use offsets in surface struct.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffae3589c986574083fbd20341e6a0abe77b0741", + "description": "radv: Rely on ac_surface for avoiding cmask for linear images.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5488a863cf14f6f02b1aa4ec41fdbdd146492e5", + "description": "radv: Enforce the contiguous memory for DCC layers in ac_surface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3db633f6dfd960f4052d2e491f4a869e359b98e", + "description": "radv: Pass no_metadata_planes info in to ac_surface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "599ea341dd17cef54b5e083279cb5ec4748c7f9a", + "description": "radv: Use ac_surface to determine fmask enable.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b7de75b4b252e1a535635e96baecedf3d98bd7a", + "description": "ci: add U-Boot specific fetch strings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06d817199454214a942f32d56034aaae87faa50b", + "description": "ci: extend expect-output.sh", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef5b8bbc5ea0c55e99dd1e6c2c7a85590724aa4f", + "description": "freedreno/computerator: fix missing dependency on generated header", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "da467817e3e25d201e94326ff876374093a3ba22" + }, + { + "sha": "7a68045b5d3ca52ea9db6f4c2606ae16546187ea", + "description": "glapi: remove deprecated .getchildren() that has been replace with an iterator", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9a9b363ce1682ef8f8d125744f5d32011332009", + "description": "radv/aco: enable 64-bit atomic features if RADV is linked with LLVM 8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba37d408da30d87b6848d76242d9d797dbef80a0", + "description": "svga: Performance fixes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ccb4ea5a43e89fcc93fff98c881639223f1538e5", + "description": "svga: Add GL4.1(compatibility profile) support in svga driver", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52ce25be8741e42ddae279f33296ed8e6835ce56", + "description": "svga/include: Headers for GL4.1 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc3505f87ed69ac843cd4ce7d269b2ab01a32ff7", + "description": "winsys/drm: Add GL4.1 support in drm winsys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48a7456f4df53b94f0335f8b605ca2da9ed16d81", + "description": "util: Add util functionality for GL4.1 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1f81abfd440b7b8ddb51203878a6b97547c19f5", + "description": "freedreno/a6xx: more early-z", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d7ee2749f50f7cd76ce7bd753a2b9762e3af375", + "description": "ci: bump virglrenderer to latest version", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec98cff6a9a1e1df7d5ea5a31a0341425eccd64f", + "description": "turnip: Simplify vertex buffer bindings.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c9728d960714b1c5eb4806a80157ce95992fcfe", + "description": "turnip: Don't bother clamping VB size.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52942f18c60253ad79f7d7fa26775bda83860e8a", + "description": "turnip: Move vertex buffer bindings to SET_DRAW_STATE.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8c7450fc73b888504174733e905f4a69a72062a", + "description": "llvmpipe: move coroutines out of noopt case", + "nominated": true, + "nomination_type": 1, + "resolution": 0, + "master_sha": null, + "because_sha": "d32690b43c91d4aab938da83704e4ebb68fccf6f" + }, + { + "sha": "2d1688345a622add9fef1dd5d6d87bee614d5666", + "description": "pan/mdg: Ensure ld_vary_16 is aligned", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5f8dd413bcc221424598e6330e91e16914b2987a" + }, + { + "sha": "de8be1de132085c434532f3dc0d1c456109a8f2a", + "description": "freedreno/a6xx: Fix VFD_CONTROL emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "202252566bf053a31a4162e99f6fef5b82efc837", + "description": "radv: Always expose non-visible local memory type on dedicated GPUs", + "nominated": true, + "nomination_type": 0, + "resolution": 0, + "master_sha": null, + "because_sha": null + }, + { + "sha": "622e3a8510ad6ccff41b2ba2f6184b80ad67dea5", + "description": "pan/mdg: Legalize inverts with constants", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "449e5ded9340243b68183d7fffcc838cf283c89c" + }, + { + "sha": "e61a98877ccdaf7ec1f9f890f0f7c1a993ee70a1", + "description": "nir: reuse existing psiz-variable", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "878c94288a8aed3479517660be3e9a88f9b44269" + }, + { + "sha": "57e4d0aa1c16d3be36ccee4065c55901cb6fad43", + "description": "i965: fix export of GEM handles", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4094558e8643a266dfc8da9bc073751a3736a2fb" + }, + { + "sha": "aba3aed96e4394a213e188f2f71ef045803a27c5", + "description": "iris: fix export of GEM handles", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7557f1605968c39d680545d5b8457d17eea3b922" + }, + { + "sha": "e41e820648b1cb662cbe938c73d755331d48c6db", + "description": "i965: don't forget to set screen on duped image", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "604a86e46f67b517e43c4646080ee1993ff95ecd", + "description": "iris: fix BO destruction in error path", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3025bde192919649999da202e7527849bf2038f", + "description": "mesa: Fix NetBSD compiler macro.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a63b90712aad81d544eb8931493a6c4a7805f7fb" + }, + { + "sha": "e9cda38031af98cf504fb9eb90dd4214e494ecb2", + "description": "freedreno/a6xx: also consider alpha-test for ztest-mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e3731e7119c36b759ec9492a7c9ebf90b222122", + "description": "freedreno/a6xx: add early-lrz-late-z mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07887c9f34c664c4e87008b9d9b76dc06a2d7c1b", + "description": "freedreno/a6xx: re-work LRZ state tracking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27e501bcfc585757ddf9ad6c37a0cee361c2275e", + "description": "freedreno/a6xx: update depth-plane control regs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6307426ed5bbc978f93fad06153c9d2e6d8d8ee", + "description": "freedreno/a6xx: sync registers from envytools", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ebcf3545db7dad66f5bce94e659720dfdd9f4805", + "description": "freedreno/ir3: split kill from no_earlyz", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "346bb81f4054b012d2f1992983e5b6cd820b33c3", + "description": "docs/features.txt: Update for freedreno", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5fb7cad95c227348d2207ab814d9a819c5f205b0", + "description": "freedreno/a6xx: Turn on robustness extensions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b353524b04fa9cd77e21e2d036c69f1cff30c35", + "description": "vdpau: Fix wrong calloc sizeof argument.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "65fe0866aec7b5608419f6d184cb1fa4fe1dc45a" + }, + { + "sha": "8252bb0ec6d429b09d944826d1ddbead69387f0f", + "description": "OPTIONAL: iris: Perform BLORP buffer barriers outside of iris_blorp_exec() hook.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b00338bdee7f91f242a1152327cd01fe58c56bd", + "description": "iris: Remove iris_flush_depth_and_render_caches().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46adb83a2930d346dd2ce2d4d19dfec02e0f7ca1", + "description": "iris: Emit single render target flush PIPE_CONTROL on format mismatch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b92818849360d629bff186feb0f580a114600062", + "description": "iris: Open-code iris_cache_flush_for_read() and iris_cache_flush_for_depth().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74c774dce9e88f7c01700a4d719c3761650a3055", + "description": "iris: Remove render cache hash table-based synchronization.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa78d05a2340852560a9bc965c87ba9fa271dd1a", + "description": "iris: Remove depth cache set tracking and synchronization.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b980725110dca19fc8a3ea365a74d21f032baa1", + "description": "iris: Perform compute predraw flushes from compute batch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e8198f34968e6911c2bfdf6b58c505a23cfbc9e", + "description": "iris: Remove batch argument of iris_resource_prepare_access() and friends.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "878c770d13df355432da053c015b7701b3c533e1", + "description": "iris: Insert buffer barrier in existing cache flush helpers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e22659089837aacf6c97544fcc4c9acdda516297", + "description": "iris: Implement buffer-local memory barrier based on cache coherency matrix.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a6349eb866952fe7fbf7834f24bcda3df807a4b", + "description": "iris: Update cache coherency matrix on PIPE_CONTROL.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc221875cf1fe546e0087aeef55ca976647ef9c2", + "description": "iris: Introduce cache coherency matrix for batch-local memory ordering.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b7fd91be656ecc7944b7523b28246366cf5a8b7", + "description": "iris: Report use of any in-flight buffers on first draw call after sync boundary.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae88e79f6959df71953db6314c78f68bd2799f3c", + "description": "iris: Drop redundant iris_address::write flag.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb5d1c27227302167d299bcaa2dabe623a242eb3", + "description": "iris: Annotate all BO uses with domain and sequence number information.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e81c07de41c7f6f585a2c6aa0c67b1082b609b8f", + "description": "iris: Bracket batch operations which access memory within sync regions.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cbe9535482f3efd27fbcbb90a329e6567e8c961", + "description": "iris: Add infrastructure to partition batch into sync boundaries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7878cbec59a394904feb512ab6a756a27242912d", + "description": "iris: Add batch-local synchronization book-keeping to iris_bo.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b73b33953161acb07bc6c266c247b9be6faf89ad", + "description": "panfrost: Mark point sprites as todo on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ef527928c6acc63ce88a8df023b64d2a5c4468c", + "description": "panfrost: Fix gl_PointSize out of GL_POINTS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f8abd867616fc4ceccb2877cd1e629232397fb4", + "description": "panfrost: Prefer sysval for gl_PointCoord on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc7397f37633eccac6709af55033533682620387", + "description": "pan/bi: Disassemble gl_PointCoord reads.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e4a0c2bca32fcf9f13363fa0c54b6a9b70c086b", + "description": "panfrost: Explicitly convert to 32-bit for logic-ops", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "19b4e586f62eb054bf1dc2f828d5b73abae6a7c7" + }, + { + "sha": "6d00eaf733395323ef06efd08851e49de35ba845", + "description": "panfrost: Readd MIDGARD_SHADERLESS quirk to t760", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e53d27de61b408049c07e64911b20b117e243910" + }, + { + "sha": "46183a999bd0a56cedc7c1c08c8b58356b424009", + "description": "iris: Extend iris_context dirty state flags to 128 bits.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45918e0d8c1ac3128b743fc4e549a60d744e3bc5", + "description": "iris: Simplify iris_batch_prepare_noop().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26a3c7b363133315d0ee2b03eb2ca986d4b23043", + "description": "nir/lower_tex: fixes for fp16 yuv lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f3255ef0aefaf7a7aca4f7ee8a334cf91bf2c99", + "description": "nir/builder: add bitsize conversion helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "866618c5c86fbbb59036845c1d6e38d8e526b525", + "description": "nir: extract out convert_to_bitsize() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "924bfb65604238439374c314bd02ff2b99441e21", + "description": "nir: get_base_type() should return enum type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dce7722ef89100e5dea337064a9d6631bb18822a", + "description": "panfrost: Handle writes_memory correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2447b3b9d3306b33c75d503c9caf9e7322c957bf", + "description": "panfrost: Document MALI_WRITES_GLOBAL bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee59d1ad773aaa720e893446fb46f4e9f3fe5940", + "description": "panfrost: Update MALI_EARLY_Z description", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e26a02e5fce1d4d0fc3120be323ca1ba899923c", + "description": "iris: remove unused iris_bo->swizzle_mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "77f08982af8f0807ab489d36a5c137661e10624d", + "description": "aco: sign-extend input/identity for 16-bit subgroup ops on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f31c9b4edf6e8d972f26461c49c5e193bf6d9a13", + "description": "aco: fix subdword copies on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a521c67d223c6e32ea9040643b1bd5a3cdfea04e", + "description": "aco: implement 16-bit nir_intrinsic_quad_* on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b08d269bfa310e911c4d6a8ea8297a38ad599ac", + "description": "aco: implement 16-bit reduce operations on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e73d879e3a35f7491c1239f894bbb2d1c9b2529", + "description": "pan/bi: Handle vectorized load_const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b09c6993dc1b1113ff508d158504af90607aa18", + "description": "pan/bi: Passthrough second argument of F32_TO_F16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a4efe2d730cc61d42eefffd01ee0ae4f853ec0c", + "description": "pan/bi: Pack second argument of F32_TO_F16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "323eecaf1392b8fc6951e9f4967fca6ee755ec28", + "description": "pan/bi: Fix SEL.16 swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ed1ae4724ce5eeedf56d65c5427d2788a54d1e4", + "description": "pan/bi: Handle SEL with vec3 16-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afc18c62d795cb29bdddebed0e5ccadc727ddbcb", + "description": "panfrost: Passthrough NATIVE loads/stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36af05bbdef4e91a1d0616752150ae00ad0dceb5", + "description": "pan/mdg: Handle regular nir_intrinsic_load_output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "293d37e19d8213213719fa975d8801fb126a0c51", + "description": "pan/mdg: Allow f2u8 and friends thru", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ae0141f5b28995e266190a24c179a8fe282d602", + "description": "pan/mdg: Handle f2u8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8b881f1611490e6a7a679a08dad0af717bcbb1c", + "description": "pan/mdg: Fold roundmode into applicable instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93513cd9ff127b9842e34dc331c80f55f151376a", + "description": "pan/mdg: Implement *_rtz conversions with roundmode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6290e83190ab9714f04f4aaa6db49c87f4866ca5", + "description": "pan/mdg: Lower roundmodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1bef784867410e9ec2728b0cadf696a4e5168d28", + "description": "pan/mdg: Add opcode roundmode property", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2eb4c85e42452527d46b03242f6f26c8cb216e39", + "description": "pan/mdg: Add roundmode enum", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "014d2e46a712408984cd2d766fff49fcd08b7399", + "description": "pan/mdg: Distinguish blend shaders in internal shader-db", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99446c9f7d34a0bbefa47fdd6acb539421b59d65", + "description": "panfrost: Only use AFBC YTR with RGB and RGBA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ac106defe351428fbe3c62547e6be918b603d32", + "description": "panfrost: Decode AFBC flag bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a34cc97ca3e19fb36045bf361b3a6bd865f71c7b", + "description": "glsl: when NIR linker enable use it to resize uniform arrays", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d1eadb7906628af800ab797a7423f79bbcba56c", + "description": "glsl: gather uniform dereference info before main linking loop", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a13d8d48ce523acfac5cfe86cdece5abeaf2d097", + "description": "glsl: add update_array_sizes() helper to the NIR uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6aea287b0a14e5634275d6116b319211885d3b8e", + "description": "glsl: add struct to gather more info about uniform array access", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6d78f9b7ff02354af3ac8a918bb5cec6c4718e8", + "description": "util: add BITSET_LAST_BIT() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f518508a817aa5af1eee988439f73ecf6279e9c5", + "description": "i965: call brw_nir_lower_uniforms() after uniform linking is complete", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "907bacea13fff7939edf67fba5b3a6eaf9f1f5dc", + "description": "gbm: document that gbm_bo_map exposes a linear view", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f3956fea080d73d98fc28bc8cd148755b597b74", + "description": "glsl: Don't replace lrp pattern with lrp if arguments are not floats", + "nominated": true, + "nomination_type": 1, + "resolution": 0, + "master_sha": null, + "because_sha": "8d37e9915a3b21b496269a39f677a80a6e02cb2c" + }, + { + "sha": "3ed2123d77d50ccb984fccdcc1cfa936a18819bf", + "description": "spirv: Use scoped barriers for SpvOpControlBarrier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "689acc73989987667ad744026647acc35305839b", + "description": "intel/compiler: Extract control barriers from scoped barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "345b5847b42bc1889d8665ebd129913550da4352", + "description": "nir: Replace the scoped_memory barrier by a scoped_barrier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94438a64bf7e5cd37c56e954156d59e404d76f55", + "description": "spirv: Split the vtn_emit_scoped_memory_barrier() logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3c937c0e4d1dd05072d9a7169532517ef7d0c7f", + "description": "radv: enable zero VRAM for all VKD3D (DX12->VK) games", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd5ffd3a83e178f14fcc69806d3a52724f05b56c", + "description": "radv: enable zero VRAM for Doom Eternal", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c183ea94afd5560b4b9718fce62d0fdab023a16e", + "description": "gitlab-ci: bump piglit checkout commit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7873276f6895eafc56514a666d54e4a4097f1365", + "description": "glsl/spirv: remove dead uniforms in spirv nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a494b6241016d3d5995902748b40c70ae8d1ecbd", + "description": "glsl: remove dead uniforms in the nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60bee4c70c0d956ecdcae542a515f3e3a4ee328c", + "description": "glsl: add can_remove_uniform() helper to the NIR linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04dbf709edf069bc720d941fab27c53269336bcf", + "description": "nir: add callback to nir_remove_dead_variables()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc79442f3fa23ecb40fcc67ea3cf4fd73fb0d3fe", + "description": "nir: add glsl_get_ifc_packing() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ac617c1172a5031818fdd907579777acf7729b0", + "description": "pan/mdg: Don't double-replicate blend on T720", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edd56bad942dfc3a00b307093216e4ad53abe5b2", + "description": "radv: Use common gfx10_format_table.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "560f095dd57b67191ccbd8a5c524d4c6d0fe28e4", + "description": "radv: Include gfx10_format_table.h only from a single source file.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b351a507638ff213add8c3c79f6a7f848d09f8e9", + "description": "radeonsi: Define gfx10_format in the common header.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c98e52f88a1b24b33b4e8b95f80cf5dbbe6d2d66", + "description": "amd/common,radeonsi: Move gfx10_format_table to common.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d936f69677240069bae0f23795b56b4e5335154a", + "description": "radeonsi: Explicitly map Z16_UNORM_S8_UINT to None for GFX10.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "273ead81f1a219b39a93abbed4db548d8eeb0e5f" + }, + { + "sha": "415c88eebcda87196b1ee09d28ea40e07b9d229f", + "description": "Revert \"CI: Disable Panfrost T720/T760\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "ae6e1aee7d1bd49ae494b8a25ca33d092a3a145a" + }, + { + "sha": "2dfc241e36865655913d0f0d961fc76f49bf2f50", + "description": "ci: bare-metal: make it possible to use a script for serial", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a21966837acd2e053ce183c5f145afcff2fd51b7", + "description": "zink: Use store_dest_raw instead of storing an uint", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "16339646f03a5cb527f119ca572c9328fd5d3923" + }, + { + "sha": "c310677a7563b1e2d97f8216be1d60cb21204eae", + "description": "radv: Explicitly cast TIMESTAMP_NOT_READY value to uin32_t where needed.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "663e8cb4e67f8b85186631c6a3719ed83da32151", + "description": "aco: Use correct reference type in for-range-loop.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b1bc460fd6ae9bf5efeca62227bb05e0c50ee15", + "description": "aco: Don't std::move temporary object.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "536339b0dda33241d21a0e045681419ca46fc812", + "description": "aco: Don't declare 'Block' as class, but define as struct.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2a778ef0f1720f9fb28afd40a791488648218d0", + "description": "radv: Don't take absolute value of unsigned type.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d2fe60f1cf7995c6a52e5160d2e40cddf8aabeb", + "description": "radv/aco: Always enable subgroup shuffle.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "045c9ffa7d7f496ba347aa7acbfc0edea37a0fc1", + "description": "aco: Implement subgroup shuffle on GFX6-7.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14a5021aff661a26d76f330fec55d400d35443a8", + "description": "aco/gfx10: Refactor of GFX10 wave64 bpermute.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe3947632ce9946562a39ef95a6796b8604f1f42", + "description": "radeonsi: add a hack to disable TRUNC_COORD for shadow samplers", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d573d1d82524b8a2e5f56938069cabc0f0176a0e" + }, + { + "sha": "85a6bcca615f9aae1ffd2a1e790ee5d980e7cc43", + "description": "radeonsi: pass at most 3 images and/or shader buffers via user SGPRs for compute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "877c56bfdcb97e47453b86f23a13033438f0daa1", + "description": "radeonsi: remove const_buffers_declared hacks", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4553fc66a5f23607c2e872d8ac8755c747bd0bd2" + }, + { + "sha": "ce4575b3b5950041589cd2b96a8334146d8cec32", + "description": "radeonsi: remove unused leftover code for INDIRECT_BUFFER inside IBs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cac24bee6202d5bf1c16caa8174494747fa2d56c", + "description": "nir: gather which images are MSAA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6503e4be13099f41a4c287ae1983362856a39f44", + "description": "nir: gather which images are buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8ef15c061fbb0e6da255ab06d7afd8128faee48", + "description": "nir: don't count samplers and images in interface blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6c8a9bd554f51c05bba5ab2c6cbc70edae9d10f", + "description": "ac/nir: support v2f16 derivatives", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c423dd721401eaff22c82e69ffaf70e4d31f50f", + "description": "ac/nir: set the second v_cvt_pkrtz argument to undef if it's unused", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bfb95725aaa13a16011f16ad9ec9501b5a373ce6", + "description": "ac/nir: select v_cvt_pkrtz for all conversions from f32 to f16 for radeonsi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d80015eaf4fa34e51eab927d91ef49135a8bd82", + "description": "ac/nir: handle nir_op_[fiu]2[fiu]mp opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70b6d540113df7ee3f769a3f3bbfe1b3df4b7bfc", + "description": "ac/nir: support 16-bit data in image opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3e0ba52a0ac89c163ada8791151226b5a1b6efa", + "description": "ac/nir: support 16-bit data in buffer_load_format opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b819ba949b4f5aeef6f6b200247f9ec801774a54", + "description": "ac/nir: remove type and num_channels args from ac_build_buffer_store_common", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b98df7bf502118c194a3e8c77454355dbb086e49", + "description": "ac/nir: support vector types in the type suffix of overloaded intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5ea87cde8ef5fa777442cca899c179c19910a40", + "description": "ac/nir: use more types from ac_llvm_context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "116ec850125389f82e540c336dfd44ee7103abda", + "description": "ac: rename has_double_rate_fp16 -> has_packed_math_16bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1af8fe4ed5225efa53a3ee0b105961aa29cdae3e", + "description": "gallium: add shader caps INT16 and FP16_DERIVATIVES", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "733bee57eb80c92736f3ef1e1267e68ee6cfade6", + "description": "glsl: lower samplers with highp coordinates correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c0803c32fd5b1c6cc037372990d2b6f84762135", + "description": "glsl: lower the precision of imageLoad", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11929895332213363628d632f7f9f6d79b5124d1", + "description": "glsl: lower mediump partial derivatives", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fe20ebaaa933ddd17b655e61ba3fe3d358b8513", + "description": "glsl: lower mediump integer types to int16 and uint16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a052a9c27777fc2cc92ed7ac3cd820e828abf2f0", + "description": "glsl: handle int16 and uint16 types and add instructions for mediump", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c14a87839d2937e07f69418285bbfc3c5b3e629", + "description": "glsl: treat lowp as mediump when lowering builtins", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "116e006693dc37245b7c0823e1a394ad9c2fb770", + "description": "nir: add options::vectorize_vec2_16bit to limit vectorization to vec2 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6916d1ce823439d7de752b2f2013e9b3e434d82", + "description": "nir: fix lower_wpos for 16-bit fddy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92333c6d1a6e71215c82a49485ba27d1def85152", + "description": "nir: lower int16 and uint16 in nir_lower_mediump_outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f2e95f24d80c797389b5c558e0590ed10e0c0e7", + "description": "nir: add int16 and uint16 type helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f798513f91884e1ae332a0726d3e1d4cf455abf0", + "description": "nir: add i2imp and u2ump opcodes for conversions to mediump", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3310cb3e16ddc3b8f7941f4df3b5b9775b8aa94", + "description": "nir: Fold f2f16(b2f32(x)) to b2f16(x)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d32144602c1dfd507f07774ce906dc25d2697da0", + "description": "meson: remove \"empty array\"/\"array of an empty string\" confusion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2903dd767f77edccb671f30e9e0686b50879cca", + "description": "turnip: fix RENDER_COMPONENTS value", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "078aa9df8daff60e52a66d8f8062dce135b94ec1" + }, + { + "sha": "d63bd09eb2f6109fbef84ba75ee59ab075686612", + "description": "CI: Disable Panfrost T720/T760", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3acd5a68a4fb324908674a73d1bf9ccec98da316", + "description": "gitlab-ci: Use separate docker images for cross builds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a85da8e3d5e430cf661f0343d00810923de2e379", + "description": "gitlab-ci: Add x86_build-base docker image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae400553fbb1a9ca2add87072f7dd61621e69111", + "description": "gitlab-ci: Move meson back to x86_test-gl/vk ephemeral packages lists", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b19c094dbafbaf60568a3fb01a05372fc6a9ae11", + "description": "gitlab-ci: Stop using packages from Debian testing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c964be0cd7ae7cded01430d3505edbbd803ddb3e", + "description": "gitlab-ci: Use Debian 10 wine-development packages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "262e3885a288d93f618cf6552d0193a036273e64", + "description": "gitlab-ci: Move LLVM/clang 6/7 packages to the x86_build_old image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b30b6fded8925364a811030bbcdc486b4100108b", + "description": "docs: add missing \"shader_\" in VK_KHR_shader_subgroup_extended_types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb62e642ae667c99aeb3015fa77ab668af5e4ee6", + "description": "vulkan-overlay/meson: use install_data instead of configure_file", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "56ccea58ae7f6fd56cf4a1697d2cceb68866b552" + }, + { + "sha": "138c003d22739b0d1e6860ed398dd511a44cde04", + "description": "meson: deprecated 'true' and 'false' in combo options for 'enabled' and 'disabled'", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a63e5cbe489f78bc07632291cd276dbd94a3066d", + "description": "meson: use 2 space not 3 space indent", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a8e2d79e0228106d11b6ceeb38f4ffb587f0a819", + "description": "meson: use gnu_symbol_visibility argument", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc7301865e1eabe06d4225af596e7334c4094fe5", + "description": "drm-shim/meson: Use portable override_options for setting C standard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23df13c98825dd5b99a7e60fc8318ff9287a053d", + "description": "drm-shim/meson: The name of the target is a string not a list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17dcd535c1973fb8bdaba9eb013672825cc10b94", + "description": "meson: Use builtins for checking gnu __attributes__", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ef314b4fa938310f282951888a1b51e31e4fa51", + "description": "meson: Use build_always_stale instead of build_always", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a16e8bfb948d1aa6e84905d0ad47960bfb9fcfed", + "description": "meson: Use the check_header function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1a290bdd57536d6afcff6a02f1512fba7328729", + "description": "meson: Bump required version to 0.52.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30a393f4581079ced1ac05d6b74c7408fbe26f83", + "description": "pan/mdg: Enable out-of-order execution after texture ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c0e82d4ab9073af45a36e6c11ed2a31580cba9e", + "description": "pan/mdg: Add quirk for missing out-of-order support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31de10c4342bc5c21366d14a1266e942b95295a0", + "description": "pan/mdg: Disassemble out-of-order bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca6759c3f9ff56a077675bfbee3dab2b7b7afc6b", + "description": "panfrost: Remove unused nir_lower_framebuffer pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7de4b98193d5bcad1d0a057a9c8d865bf93be9db", + "description": "panfrost: Don't flush explicitly when mipmapping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "975238dc2a5bcf4a0d6d8a5560d05d03321aed6e", + "description": "panfrost: Use VTX tag for vertex texturing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89a9cc764533f4cad123dc92c49e89bb181873c3", + "description": "panfrost: Permit AFBC of RGB8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3a8e5eb1b11e5dfe42ebc4c10c291e36fbd06cb3", + "description": "panfrost: Fix PRESENT flag mix-up", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c793a4867737d08e20557e0bf8432c9c5dfcb2a", + "description": "pan/mdg: Fuse f2f16 into load_interpolated_input", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f8dd413bcc221424598e6330e91e16914b2987a", + "description": "pan/mdg: Handle 16-bit ld_vary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e58112bc08f99861ac634ede8db0f98cd497fc14", + "description": "panfrost: Update fails list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e42950fe96408c4addcc31990787dc5cca537476", + "description": "panfrost: Use internal_format throughout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7765a8c7f5b225d11373a35bd5156c9a98a6514", + "description": "panfrost: Add separate_stencil BO to batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6aa7f6792d30f91eccc68fcec65b81105afc347a", + "description": "panfrost: Check for large tilebuffer requirements", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c46b11438d363f27e9f4418766063c5be9b3e0c2", + "description": "panfrost: Let Gallium pack colours", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8dc8b6640334e94f4f8b66a93c68e49c04b06623", + "description": "panfrost: Account for differing types in blend lower", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c9fe82ee92c1a6ba9a3f8230ed94bac8bbca0be", + "description": "panfrost: Conditionally allow fp16 blending", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19b4e586f62eb054bf1dc2f828d5b73abae6a7c7", + "description": "panfrost: Switch to pan_lower_framebuffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c286cc0a2146a6ab3e8be278c4c27226b6ff990", + "description": "panfrost: Un/pack sRGB via NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d14757c033b78791968390201f2ece564a4c1ce", + "description": "panfrost: Un/pack R11G11B10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e24e248b84a2fbcc70cc1ee5c598e5a942effbce", + "description": "panfrost: Un/pack RGB10_A2_UINT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "91cc67855186c8df05f22047df29c462ee985376", + "description": "panfrost: Un/pack RGB10_A2_UNORM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7de0e5500b6fbab2ed66131a7a54df9f95693a17", + "description": "panfrost: Un/pack RGB565 and RGB5A1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff590702da3ead33976f3b764de4df78f2aa630a", + "description": "panfrost: Un/pack UNORM 4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eab8701e7c23cd11e991624804487ecb393c54eb", + "description": "panfrost: Flesh out dispatch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e937dd521b240f5cd6246a18e761992c49a8e415", + "description": "panfrost: Un/pack 8-bit UNORM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f01aabb82968077e7ed690276394074cca14bf3e", + "description": "panfrost: Un/pack pure 8-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a6483bb47fb654e3c78e5d81e8500b993d51cbd", + "description": "panfrost: Un/pack pure 16-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c31bcca48e34e44288b9f5e7dbf573a492717ef3", + "description": "panfrost: Un/pack pure 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5fcc193f71c4767866e8c7f9f396e60a312ab6d", + "description": "panfrost: Stub out lowering boilerplate", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbd72a8f94ca8aad954990b527d7c510983c96f5", + "description": "panfrost: Determine classes for stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18a767df35f2a71105703a1132ab5a3c1ec27313", + "description": "panfrost: Determine load classes for formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e53d27de61b408049c07e64911b20b117e243910", + "description": "panfrost: Add quirks for blend shader types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60d647f9def4b84396fd820fd7a5e6ea7a4f1f0a", + "description": "panfrost: Determine unpacked type for formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c82f8a097f2824de650e6cae16d4ce73b1cb512", + "description": "panfrost: Add theory for new framebuffer lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a175e4a1b1e777b9a9185ad504c3516e55f4c3f", + "description": "pan/mdg: Implement raw colourbuf loads on T720", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f82aad7a27e44314b0fd2461819d31efb49fd5e", + "description": "pan/mdg: Drop the u8 from the colorbuf op names", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "49840a8a58a6614615fb73c0e558863478190913", + "description": "pan/mdg: Print 8-bit constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ff0291896a91dde9c6f7f0a1bc5b7c3962e72a0", + "description": "pan/mdg: Handle bitsize for packs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9c780b1d08092880a1ad769fffbad571f094c46", + "description": "pan/mdg: Treat packs \"specially\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c495c6c2957c7c30cedeaa218c2caf443ac04797", + "description": "pan/mdg: Add pack_unorm_4x8 via 8-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "551d990a7c85d2b3cba567b00e6f2aceef6e2e87", + "description": "pan/mdg: Handle un/pack opcodes as moves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "605b0e8acf307ff0f9cccd34c2cae8932ad9222a", + "description": "iris: Fixup copy'n'paste mistake in Makefile.sources", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "034329128b703f8c2e8ad1980ddb131df372ce48" + }, + { + "sha": "aaec065f03e65f75fd18f8cc24d003f220209714", + "description": "intel/dev: Don't consider all TGL SKUs as GT1 only", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "54996ad49273641e20dbb2d7aff382199e27cd10" + }, + { + "sha": "d2f8105b606269c0e71cd599f57997279385d300", + "description": "r300g: Remove extra printf format specifiers.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "04c1536bf7abe253e0e900c311ff9474ff4d1813" + }, + { + "sha": "6e1c47b98df384b46ff41ffbf9689a93c78c040d", + "description": "nouveau: allow invalidating coherent/persistent buffer backings", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c48f42e178a1cc484870367c0cfe5fbbf71d86cc", + "description": "intel/fs: Emit HALT for discard on Gen4-5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94aa7997e45b5314d169bbee5bf22ad368c2fd25", + "description": "intel/fs: Fix unused texture coordinate zeroing on Gen4-5", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7c8811fe4012b60a9bcdb2ea2ef6ab79e402809", + "description": "intel/vec4: Stomp the return type of RESINFO to UINT32", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e843303d6f18d56d7c412e6c879134f7b79372ac", + "description": "radv: fix regression with builtin cache", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "cd61f5234d2c275b21c249fc2effc058a74ecf0a" + }, + { + "sha": "7e4c8949c6f79090b7d8675b488c7bdc90477e26", + "description": "gallium/dri: Remove lowered_yuv tracking for plane mapping.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "13735c4f476f997966baa8a1f4c071867d78b401", + "description": "panfrost: Fix printf format specifier.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "6148d1be4bb52039ccda57f25a9d27ecb7aa7541" + }, + { + "sha": "4925fb97f65f20fd52c94a080a68108c25a4095f", + "description": "glthread: don't upload for glDraw inside a display list and always sync", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2840bc3065b9e991b2c5880a2ee02e2458a758c4" + }, + { + "sha": "cf9926714783efa4524ca2afd62a9817dcbccf06", + "description": "util/format: Add more multi-planar formats.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d491b0dfd97c27b245ad0ed0e7356377f25ebe67", + "description": "util/format: Use correct pipe format for VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "75d7ee80291d6693ca2611bf8ad2bb14a34588db" + }, + { + "sha": "273ead81f1a219b39a93abbed4db548d8eeb0e5f", + "description": "util/format: Add VK_FORMAT_D16_UNORM_S8_UINT.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f047d585ee472a314d4ad5da4dffa5e7c2a42eb5", + "description": "etnaviv: Fix memory leak on error path.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "eed5a009897a859ec118ef84c0437be174a49da3" + }, + { + "sha": "bccb3deee2995e68a7f8a63d857f2cd298ff8361", + "description": "panfrost: Probe G31/G52 if PAN_MESA_DEBUG=bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be8cbe0b41dcabb5a0beb7b2ab2bd6ce87eb7955", + "description": "panfrost: Add GPU IDs for G31/G52", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "229084f5de848ea83c83b6d0743edfc90eddb428", + "description": "panfrost: Disable QUAD_STRIP/POLYGON on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4be2cd604bc601f90eb90625bb91a040659b6767", + "description": "pan/bi: Passthrough deps of the branch target", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8230a04f513e033843da2f2e26f87ac3846c4dd7", + "description": "pan/bi: Allow two successors in header packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db2c10d0325cc9c127209b11b8c36f2e5625d185", + "description": "pan/bi: Measure backwards branches as well", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a42731536d59ec2c028138d303d15c18158e85c9", + "description": "pan/bi: Add bi_foreach_block_from_rev helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c697992ca18e6f059d167fa0a1a9af53b3f93fea", + "description": "pan/bi: Defer block naming until after emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd6ff4f7e1845d380c366d4f643725fe76a101f5", + "description": "pan/bi: Pack unconditional branch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4791d2bf85045f59451dcbc0e166b3c71ec3048", + "description": "pan/bi: Set branch conditional bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffe7a61a46139b9d872ec60b686aad1926b857f7", + "description": "pan/bi: Set back-to-back bit more accurately", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3aacfaf87eccee657ab9a5acc7bfe83b226f120b", + "description": "pan/bi: Set branch_conditional if b2b is set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e945d4f79d6f4da1d6ad61ebcef43ba47aeb7833", + "description": "pan/bi: Pack proper clause offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "682b63cdc2631de48d6d5e8ce739e272ae373c10", + "description": "pan/bi: Measure distance between blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64c49ab1fc48e9a82c06f1e9fc92c3cf093ef3ce", + "description": "pan/bi: Add bi_foreach_clause_in_block_from{_rev} helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64bedbfa67bc0f814ba6b0d4a587807fb9b88050", + "description": "pan/bi: Link clauses back to their blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c329567508836b5b40cfbacf29a840e1e6d4c41", + "description": "pan/bi: Preliminary branch packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd9a08d4f2360c227eb17f5b1f166ac46ca08ebe", + "description": "pan/bi: Assign constant port for branch offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cdff3ebc9a28ffa0001012ab5ad913c81de7fb8a", + "description": "pan/bi: Set branch_constant if there is a branch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9967ab6da8a1c383939752611ad564aee271cb7", + "description": "pan/bi: Pack branch offset constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "627872ef7f8be877cc8c64f0b424827a43ed8ef7", + "description": "pan/bi: Add branch constant field to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1298ae33658c7e0e1c2e07b70903338e0981bed", + "description": "pan/bi: Passthrough ZERO in branch packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d619ff009b57e6949e88b9a607cc8f089d6d7ad2", + "description": "pan/bi: Fix branch condition typesize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1cdd55a81ea14df39608ef38bd6acb77369f9de1", + "description": "pan/bi: Fix CONVERT component counting", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8c6a7187856edeb55ebd63c9274e9a780f22b35", + "description": "pan/bi: Only rewrite COMBINE dest if not SSA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e42a5dfd4f2b22c73f4627128ac6d3dbcb10aca1", + "description": "pan/bi: Fix emit_if successor assignment", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "9a00cf3d1efe336e09906d87a8f5a50cbbe10fd6" + }, + { + "sha": "b34eb94d9c97a7bfdd64da444dcc0860f6546f89", + "description": "pan/bi: Allow printing branches without targets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4fc16a1d4bbfa520bdf11dbcdf41dd3a3e14829", + "description": "pan/bi: Remove schedule_barrier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3ae088b96d9242d7d0fabde0516ccd76279ffd5", + "description": "pan/bi: Add helper to measure clause size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a4e4477fc3ce3e3a914dad98b1129e90cbdf0b0", + "description": "pan/bi: Add bi_layout.c for clause layout helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3de28bb49229f195e2353d8bbaee63ff3198481", + "description": "pan/bi: Remove more artefacts of 2-pass scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4096be05af306d18bf948f92ab03ee7d6f7468f4", + "description": "pan/bi: Add MUL.i32 to disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec8665615f4b4bd019ea56fb1dd2be3716802b78", + "description": "pan/bi: Disassemble pos=0xe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a658a4f7a5599141c678794676f4a5cfc16ba7f1", + "description": "pan/bi: Document constant count invariant", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac64bf9b207f6a4e7f41d57ee123b173f631cb28", + "description": "pan/bi: Move bi_flip_ports out of port assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95e3776d3e0119f679bfb467028ed09226fdf95d", + "description": "pan/bi: Add FILE* argument to bi_print_registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd96b451f66c793b2a9593a58c672e6482eaa12c", + "description": "pan/bi: Drop `struct` from bi_registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b042ddef325ee6f88ebfff76f84173825c40d33f", + "description": "pan/bi: Move bi_registers to bi_bundle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79f30d8a86e9f9fe0f542c75f8ebf2e617f13135", + "description": "pan/bi: Move bi_registers to common IR structures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59f8f20306b5890ce2f26b12f22db682cbdbebac", + "description": "pan/bi: Remove comment about old scheduler design", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "635bf652edc9150e2e939d76cacbb74537530ef5", + "description": "pan/bi: Remove FMA? parameter from get_src", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20f6c7a9134062b6e16f7a5bd16c197cc2055e9b", + "description": "panfrost: Preload gl_FragCoord on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d194f8ac471a075984a0d4f5c6399318c525998", + "description": "panfrost: Set reads_frag_coord as a sysval", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52875a34aaf7eaf913740f157bccce5e82f8679b", + "description": "panfrost: Don't generate gl_FragCoord varying on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11470fcde266aa8b864b6a114fc923b2b8e5907a", + "description": "freedreno/a6xx: fix vsc assert", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6f7bc2979926faff3abecfb52533e7043fc05a5", + "description": "freedreno/a6xx: Program VFD_DEST_CNTL from program stateobj", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7aa809e31c77a7b0fdfa5c3f818edabc288148cd", + "description": "freedreno/a6xx: Create stateobj for VFD_DECODE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8952dd6d991fd5041a48df31f849e8ddbcb74046", + "description": "freedreno/a6xx: Decouple VFD_FETCH and VFD_DECODE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c15db8928fb7d16a0cf5443fefce7efde5a50eaa", + "description": "freedreno/a6xx: Move per element offset to VFD_DECODE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "601a029e67cc62a32cf028d87653a877c18ecfbd", + "description": "ci: Rename x86_cross_arm_test to just arm_test.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c9ade468598e9c177612e5fc0860aa8b9b39b1d", + "description": "ci: Don't build an arm_test container now that the last user is gone.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f4fc4ff71524d55f6b481e4580c377d3c5f3b66", + "description": "ci: Switch cheza (freedreno a630) testing to baremetal.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c89a749f66527caf72c43b433dc27de1594a87f6", + "description": "ci: Add scripts for controlling bare-metal chezas.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3a1010e21ac426736fb9af289570d3fbbaaa1884", + "description": "ci: Build a cheza kernel.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b678568a5ef56a6b87204ec1b0499b7c04591656", + "description": "ci: Disable the firmware loader user helper option in arm64 kernels.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d645a19ebf2cc574e6ad3f84100f0e2ddd4d59b", + "description": "radv/aco: enable VK_KHR_subgroup_extended_types on GFX8+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e22567089c829765d0b78a87d96f7dc5af9e10cd", + "description": "aco: sign-extend input/indentity for 32-bit reduce ops on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83dcd1690be5dbf7129ed05cc12043130a2f875c", + "description": "aco: allow gfx10_wave64_bpermute with 8-bit/16-bit input", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ece71507db9ca8c1cd01974f81a17d1f52efd0c", + "description": "aco: allocate a temp VGPR for some 8-bit/16-bit reduction ops on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e0ea9bccae5d280e163479d9c46d0ad8b29a504", + "description": "aco: implement 8-bit/16-bit reductions on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75a730ced59701201ef4247cbe2189a9be6a9d18", + "description": "aco: fix register allocation for subdword instructions on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad609bf55a87200ab11ad7cf31420dcfd8dfc141", + "description": "frontend/dri: Implement mapping individual planes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2ee293422c09c9ecc8150ad70d29273e28c6a71", + "description": "zink: Check fopen result.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8d46e35d16e3936968958bcab86d61967a673305" + }, + { + "sha": "7503863fe2a48d155ec9c1778206f1e9a2dc5987", + "description": "radv/aco: enable VK_EXT_subgroup_size_control", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f391262003e2d58395dd17d2cf1e1a6807f7a0a", + "description": "freedreno/a6xx: document LRZ flag buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3947f9d247619043ac9a2c17f746d2fbfb0e5ac", + "description": "freedreno/a6xx: LRZ fix for alpha-test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "838666a41dcbbf566bff57e7a7b841e50bf2bdce", + "description": "util: Initialize pipe_shader_state for passthrough and transform shaders", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f01c0565bb9ad7335395920aebc7489cb76d1b72" + }, + { + "sha": "034329128b703f8c2e8ad1980ddb131df372ce48", + "description": "iris: Rename iris_seqno to iris_fine_fence", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "682e14d3eaee8991ee08ea309cbf9a509b6e6b27", + "description": "nir: lower_tex: Don't normalize coordinates for TXF with RECT", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1ce8060c25c7f2c7a54159fab6a6974c0ba182a8" + }, + { + "sha": "f0c102c075f8ac76629bb34619187262ccc3e9d8", + "description": "ci: Quick exit qpa extraction for non-matching qpas.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46d9b500f4fa57affbb75ffe092dcf23a717706f", + "description": "ci: Move baremetal DEQP_NO_SAVE_RESULTS setup to the yml.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33e0821a99fcc9e97ae0ce2065f2ce14df568c31", + "description": "ci: Add DEQP_EXPECTED_RENDERER support for VK tests.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6766d51c15fc8143466b53aa7384d0c06218f12d", + "description": "ci: Auto-detect the architecture for VK ICD filenames.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "044f50b9fdb0186ce32c5e54710f025ff677dab7", + "description": "ci: Drop old comment about enabling --deqp-watchdog.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c343d00edeb9a36a937e38664945ba6554fd4011", + "description": "ci: Drop double \".txt\" suffix on the unexpected results file.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10c4a7cf59733ae2058a76b880ea0767a59dad4f", + "description": "spirv,radv,anv: implement no-op VK_GOOGLE_user_type", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "01ce7887bf0d6ec4619e1851002d4774aaa28a90", + "description": "aco: fix 64-bit shared_atomic_exchange", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f2fd9c62ee167cae34d34664e6bd972169a3307", + "description": "aco: don't reorder barriers in the scheduler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1900ee2c70c15dea56027c21676174704f12348", + "description": "aco: preserve more fields when combining additions into SMEM", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "93c8ebfa780ebd1495095e794731881aef29e7d3" + }, + { + "sha": "95d5c1b8a1ebe4a2ce47206b9ff0af4fbfd5a31a", + "description": "aco: check instruction format before waiting for a previous SMEM store", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1749953ea3eb2f3e33a61243cc11860795c658f3" + }, + { + "sha": "5ccc7c277c86f754f40515820b27b55296107c54", + "description": "aco: consider SDWA during value numbering", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "23ac24f5b1fdde73cf8ec1ef6cbe08d73d6776f5" + }, + { + "sha": "8aa98cebc15e6f6f8bcf42162399b5826376b3dc", + "description": "aco: fix interaction with 3f branch workaround and p_constaddr", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "93c8ebfa780ebd1495095e794731881aef29e7d3" + }, + { + "sha": "1fc1b877622e3477272a17a43fd438453484bb79", + "description": "gitlab-ci: Pull in GCC 9 from Debian testing in x86_test-gl/vk images", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2366f01fd194c9aef91153e3050af30c57bb95a", + "description": "gitlab-ci: x86_test-base image as common base for x86_test-gl/vk", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43111ea74589db078aa7d385d49d480097d22439", + "description": "gitlab-ci: Also list arm/x86_build in needs: of test jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bccf2a25a89622c49dcfa488763c245e6584d568", + "description": "intel: Add helper to calculate GPGPU_WALKER::RightExecutionMask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78e400d4a515e8d8187259ed1287dd4671dee9ca", + "description": "iris, i965: Update limits for ARB_compute_variable_group_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46b428074f427ddff37d2f92a3ac0f0468d253be", + "description": "iris, i965: Drop max_variable_local_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90ec26a800ca7d24237b9df9b2549452f4aa9946", + "description": "intel/fs: Generate multiple CS SIMD variants for variable group size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b8347c98842621a621746ec5718c95d297876c9", + "description": "anv: Use new helper functions to pick SIMD variant for CS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "594374dd8d83a32fa9149b2b799d8fc1c51ceb87", + "description": "iris: Use new helper functions to pick SIMD variant for CS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9f4bda6ce52685cc835530d23348c69adfd89be", + "description": "iris: Set CS KernelStatePointer at dispatch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee0fc0f6dcf6093f4e3ff0796ace3cb1590a72ea", + "description": "i965: Use new helper functions to pick SIMD variant for CS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb26d9c3119e089a0e0c6b0bf6cfc90193c70326", + "description": "intel/fs: Add helper to get prog_offset and simd_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b5e77caa7f0225aab3701de66b7434553c66033", + "description": "intel/fs: Support INTEL_DEBUG=no8,no32 in compute shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10d0f39beb20c4cd6fe6d3f23a3b1d918653127a", + "description": "intel/fs: Remove min_dispatch_width spilling decision from RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9526e14b5cca2417368a41e2fb4a5835cfcbe205", + "description": "docs: update calendar, add news item, and link releases notes for 20.1.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e94a811a46253ebf366f991129b283ff648ae470", + "description": "docs: Add release notes for 20.1.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dff1bac6345b755bfba544b144e1e9dad71be9aa", + "description": "zink: always use logical eq ops in ntv with 1bit inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df2c68ee4fd470efdc4f0121a2fe4aa5c64771e0", + "description": "pan/bi: Initialize struct fma_op_info member extended.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "8c79c710d4e1f3e424d5abf1f9abccdfc9a59caa" + }, + { + "sha": "b3023055e075386e96fe2fbf093f0db261c0d9fa", + "description": "lima/ppir: use a ready list in node_to_instr", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "632a921bd0dd40ad3167a0b8a74fe7054a78256a" + }, + { + "sha": "9ae8b4af75ea708323352c5c016dc4c72ba9c893", + "description": "pan/bi: Suppress inf/nan for now", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f589f4e045c0e3a353e15899e67729d08a1ded0", + "description": "pan/bi: Add CSEL.16 packing tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87ca1c1eeaedaad6d430c6504f32def27ec984b1", + "description": "pan/bi: Pack compact vertex texturing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6650fa22c79dca1264b8f77f83bba3ccbb0298b9", + "description": "pan/bi: Add f16 TEXC.vtx op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "731dfc6066dac8da477ba02ad90d5f2145fa0811", + "description": "pan/bi: Allow vertex txl with lod=0 as compact", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd0324a1ce9af727442a4a7208f0c017cdd7c681", + "description": "pan/bi: Document compute_lod bit for compact tex", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d31bc0e21c4799cd34a1c18643cd15c3f1026a12", + "description": "pan/bi: Also add compact vertex texturing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f514bdd10676ac35a0d4d48f0aefd57d21feb2c8", + "description": "pan/bi: Add TEX.vtx opcode for vertex texturing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2fd3ad91c737c1a00a1b6ace95423fd2d8f9d577", + "description": "pan/decode: Decode Bifrost shader flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee6a5a5f0521bc40457258b2f0dede8a3f2f42ba", + "description": "panfrost: Set MALI_BIFROST_EARLY_Z as necessary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f78f25ce9f5a72573a55c4d919bb65b80c036f7", + "description": "panfrost: Identify MALI_BIFROST_EARLY_Z flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c2d0418c1aaa2b279b72c60fe0fa7a658bb0789", + "description": "panfrost: Add defines for bifrost unk1 flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55e3305a5b0bd47874e99b3dd090929fc3cbfd0e", + "description": "panfrost: Document Midgard Inf/NaN suppress bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e88dff374bb72a1fb28941029726e2b79ad2784", + "description": "panfrost: Ensure nonlinear strides are 16-aligned", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "bde19c0e7ba575f3c8ca8ea76c916034264a8713" + }, + { + "sha": "d45936c01cd1811fb0ca927858bca404f1292791", + "description": "panfrost: Identify Bifrost texture format swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3692fd53ee48cd4019bc1822f044d1ffd1ad08f", + "description": "panfrost: Set unk2 to accomodate blending", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d6cc14513c1032ff8b24b378354aa7fdb99c6fe", + "description": "panfrost: Share MRT blend flag calculation with Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5cf54fc1d5681edac8c4c9ce4822d5a67bc70d4", + "description": "panfrost: Force Z/S tiling on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f512489b2e016837b0b31e7b11948fe503f30137", + "description": "panfrost: Tweak Bifrost colour buffer magic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76e871d3ffc8fac11881fc3f78f86ebfec3955af", + "description": "panfrost: Tweak zsbuf magic numbers for Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aeb580189281c920b29c73e816b4ac86e2a26a0c", + "description": "panfrost: Adjust null_rt for Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83cd3f0b4e773f7db347f8d42a5cfb2584dee45d", + "description": "panfrost: Fix Bifrost blending with depth-only FBO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a91306677c613ba7511b764b3decc9db42b24de1", + "description": "ac/gpu_info: Correct Acturus cu bitmap", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "296c04d78c9840f83e7fcaf9b45a4cee96752348", + "description": "intel/fs: Work around dual-source blending hangs in combination with SIMD16", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd2bd68fa69124c86cd008b256d06f44fab8e6cd", + "description": "zink: use general-layout when blitting to/from same resource", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d2bb63c8d4cdc02b1c33afadea5becd58fb7286c" + }, + { + "sha": "d9eaac02e53944799fbadf6ab7ff6cc725b0483a", + "description": "radeonsi/drirc: enable zerovram option for 7 Days to Die", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ddfd2e626a51373f88f2a58701304e6403450705", + "description": "turnip: support VkImageDrmFormatModifierExplicitCreateInfoEXT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da409fb7b8c997de28db4016c788abff14bd8c57", + "description": "freedreno/layout: add explicit offset/pitch argument to fdl6_layout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1acf492de91e04a81950c0446c36b22b48bc94c", + "description": "glsl: fix slow linking of uniforms in the nir linker", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "95f555a93a8891ebba2a291eecd984eb2364d636" + }, + { + "sha": "f6214750eb4d53296e674dd26fc668b1029a1c8b", + "description": "glsl: stop cascading errors if process_parameters() fails", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "53e4159eaaf692071bf63365eb27a16c97c9a3e5" + }, + { + "sha": "755c0400606f821111fec76764ddb97243f2ad8f", + "description": "freedreno: Add missing va_end.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a0ca1462f31747d028abe9106309f6c95c3daabf" + }, + { + "sha": "e91108691d27fa9f2410c056fc909e70a6c4b9c0", + "description": "nir: Fix sources for image atomic fadd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "247f2fb32ae39009f2e1ba6ae0f2c97573b910d8", + "description": "pan/decode: Dump unknown2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a19d49b2e8386c963bd921c7f1f3261d66af26c", + "description": "pan/decode: Dump missing field on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2c8b1ac5791348977fed3d6056ae50af1721649", + "description": "pan/decode: Fix tiler warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4bc7d521b13ab64bdc6d2a75ac79a0964955125d", + "description": "pan/decode: Fix unused variable warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a62123572099cfa173804146771e76dae3637eab", + "description": "nouveau: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e535629805deb88ca169f9ce8b82720f2efca02", + "description": "etnaviv: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb5e10af246b320796f11d6faccbf6dc4c43e4fe", + "description": "iris: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17199107fd2f55dbed674e37922773172152eeae", + "description": "i965: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f59d02a86dcf0e34cf43a223e44f51924e9ff4a8", + "description": "intel: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ea2ad0b3999a8e5d7334447af5d3c32537366fa", + "description": "softpipe: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9983c4cd68edab08e8bc03480c3fdd518637a3f6", + "description": "panfrost: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82996a8cff79aa8acc700cd45c933635e59bdedb", + "description": "glsl: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a024b394272c9210d3ff1915fbb54836a8e095fc", + "description": "gallium/draw: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "747cb95e3c832ca33b848b56af458948ff0cce36", + "description": "mesa/swrast: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05bacdb9170edc408a86ca315f195b9aabdd3651", + "description": "mesa: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f1fde1fafb64ac6e33325b30443b53e243bfb0e", + "description": "util/format: Use SATURATE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35938c15e22e3021f7693425f0d2134845c81f6b", + "description": "util: Add SATURATE macro", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cc7711924fd0f3c76e22e527e21d8f8368e5395", + "description": "intel/fs: Remove redundant assert()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "462bc408fe953d8d4e914e78c7faef057e806872", + "description": "intel/fs: Early return when can't satisfy explicit group size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a308ee4c792bc64486e94374f74d221bbaa10f1", + "description": "intel/fs: Remove unused state from brw_nir_lower_cs_intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e0525e145a180dfbce359f83994137f8b8b7295", + "description": "intel/fs: Remove unused emission of load_simd_with_intel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5a413e19a82b1d97e9d7e0fc504e6012781d90b", + "description": "egl/android: Drop unused variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09efdccf4a83c62e632020e8a425eba67de8dc43", + "description": "egl/android: Move get_format under HAVE_DRM_GRALLOC guard where it's used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c26317ebd6c42fcd70a63c2a95d04f11f3c15bd5", + "description": "mesa/st: Use memset to zero out struct", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12653beacba00146f5bf31816a7c1dc8e51735ff", + "description": "mapi: Fix a couple of warning in generated code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8341f30f1ea87a22624031c2f5f670d1b9f8678a", + "description": "src/util: Remove out-of-range comparison", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4e64e9f530c22e779ef5747c2a927bdd5b6c47d", + "description": "freedreno/ir3: Avoid {0} initializer for struct reginfo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06ab93d6949924a353aada939935737dfdcbae84", + "description": "turnip: Use {} initializer to silence warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "697fe1c8015d14b6d2ebc5ba70cd05439ef1490e", + "description": "turnip: Use tu6_reduction_mode() to avoid warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fff17707ea959f8fd6098e3a154a8574f4912a85", + "description": "turnip: Use hw enum when emitting A6XX_RB_STENCIL_CONTROL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6aa3004d6049afdbbe85b9f807f5f9f840cb05c9", + "description": "freedreno/gmem: split out helper to calc # of bins", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fcecdcd82252013020a224f9e2887fcedbe23789", + "description": "freedreno/gmem: fix nbins_x/y mismatch", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "1bd38746d5a2d21a2f77e2bf1c6fec17ca5eb6ac" + }, + { + "sha": "9b91d88b3390d264d06ccc11142325e99780e808", + "description": "freedreno/gmem: add some asserts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1679efe92755871d48f81d6b3f45158f36c6f711", + "description": "freedreno/gmemtool: add verbose mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c6693f0e4326e915e40caf42f4919407bbadf32", + "description": "freedreno/gmemtool: add a405", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b20663c5ba9523d24edfb6f1ce42561e36607f4a", + "description": "freedreno/gmemtool: make GMEM alignment per-gen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fec8288081261ad902732f64ec5603eb96cd804a", + "description": "freedreno/gmem: make noscis debug actually do something on a6xx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3024d009006449df1e69cce4c90a7d1c7f01e5ca", + "description": "freedreno: handle PIPE_TRANSFER_MAP_DIRECTLY", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8728c42031379be979e56a457a178ce6a5b87b08", + "description": "freedreno: clear last_fence after resource tracking", + "nominated": true, + "nomination_type": 1, + "resolution": 0, + "master_sha": null, + "because_sha": "ddb7fadaf8b1aa3004e72d6b0e28e465f8f45fba" + }, + { + "sha": "4c97a716a64a0d4990a2cc2f8185713459576ca4", + "description": "freedreno: add batch debugging", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e2009c4481434f1b97713d8a0ec193fdccb65a6", + "description": "nir: fix lowering to scratch with boolean access", + "nominated": true, + "nomination_type": 1, + "resolution": 0, + "master_sha": null, + "because_sha": "18ed82b084c79bf63666f2da22e5d675fb01aa26" + }, + { + "sha": "e369b8931c675a6e86715c682723b085e45e0ee5", + "description": "freedreno: Use explicit *_NONE enum for undefined formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ec3747fbe8e1d5a5a8b29b123b843b57ce77c6e", + "description": "freedreno/ir3: Use RESINFO for a6xx image size queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ec4c53ef94901bd7d1623047c52dcdb98a9764f", + "description": "freedreno/ir3: Move handle_bindless_cat6 to compiler_nir and reuse.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2068b0143027a9f61011f3cac6b620414c85c8f6", + "description": "freedreno/ir3: Refactor out IBO source references.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00b9099dd599ecaede1c0ddbb8fa3097e299667e", + "description": "freedreno: Set the immediate flag in a4/a5xx resinfos.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae00da5ddb6e787ceb0a05872d271c01d04c9652", + "description": "freedreno: Fix resinfo asm, which doesn't have srcs besides IBO number.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1cb75678d53f4616303a688f1c4a89773c1426c", + "description": "freedreno: Add more resinfo/ldgb testcases.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d4a911d8c3d9e3f218ea136179c4cbf9a91b07d", + "description": "freedreno: Fix printing of unused src in disasm of cat6 RESINFO.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f02b480713c9c6e5ad65d1e6ab9e4454a8d9504", + "description": "freedreno/a6xx: Fix the size of buffer image views.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3987e25c03d2049b965d6513e4ef6fe7ae93c564", + "description": "tu: Add missing storage image/texel buffer bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "439a4ac0250f86b8f15fc33bb9443e52045beae8", + "description": "tu: Respect VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08d22bb908e161b15b12dd094f94de06ec0c883f", + "description": "tu: Fix IBO descriptor for cubes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7ab9c4eb17949e20dd8b82de01c977887481489", + "description": "glsl: cleanup vertex shader input checks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e89d34aacab069c2074241ea0104705c18c9d67a", + "description": "glsl_to_tgsi: add fallthrough comments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38a4b861459b02401d3ff71670218506e7acf019", + "description": "radeonsi/gfx10: implement most performance counters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a3806ffa352a37ab03fca46a596bba99fcb11ca", + "description": "amd: replace SH -> SA (shader array) in comments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cf46f2e3d89c9cd9a30835ee2ebdf24cdd8119b", + "description": "ac/gpu_info: replace num_good_cu_per_sh with min/max_good_cu_per_sa", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c3fe285c91dd1289849ff3d5f81e283bdb8b382", + "description": "radeonsi: don't hardcode most perf counter block counts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3c1833b775a3e0b0d1291ad768fbb4bb982ec22", + "description": "docs/features: mark GL_ARB_texture_multisample as done for zink", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd1639cbe330f2b171b72605c75f973de0cb513a", + "description": "zink: expose PIPE_CAP_TEXTURE_MULTISAMPLE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f90e818c8678a17203d5035c6e26427704e56db", + "description": "zink: implement nir_texop_txf_ms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "caa83e4d7902912cd22ba82bbd5213f55a657cdd", + "description": "r600/sfn: remove debug output leftover", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cead23cb8ac3fbfdcbc69df024d7ef0c4d9cd1b9", + "description": "r600/sfn: Correctly update the number of literals when forcing a new group", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12381a04108332d56b46dccd9145cb6e6fa5534e", + "description": "r600/sfn: use modern c++ in printing LDS read instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eccf939b6f2a0bc44313a5bdc0f079003f6bd2cc", + "description": "r600/sfn: Fix mapping for f32tof64 and f64tof32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "901793d558e6fcba5438fc0738226833f6147c8b", + "description": "r600: Fix duplicated subexpression in r600_asm.c", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4422ce1b04c117f61394a6834cd7933f06ce4e1f" + }, + { + "sha": "ceab349483cb43b0a62895a3657049d4045ad324", + "description": "freedreno/drm: disallow exported buffers in bo cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1241f8cb4c9c6cc65106a085be81963f3505a7d5", + "description": "r600/sfn: Use correct setter method.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5d10e3ec6066239d732d19f69cd95da447e73e32" + }, + { + "sha": "ed1fd7bcc6f76b70ab63a6dbb1f0c9e073db84df", + "description": "zink: pass batch instead of context for queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43c691b5b0f1cfba9292b316f846ad932646f0e4", + "description": "zink: do not dig into resource for nr_samples", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e38513e828df239b4ea06f50af9cecf78305eb37", + "description": "zink: use samples from state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fcbc022787d4fdfcfdf843d9f720a587e1f0579d", + "description": "nir: Add un/pack_32_4x8 opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46d5b07c5c39d1b8cf10976f6574a63062dea9c4", + "description": "util: delete fnv1a hash function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf4d652f3f44da7837d5ca7c514533bf8661e31e", + "description": "zink: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33fd35e2d30f7f31a9ce465ee79d8a874355c774", + "description": "r600: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "387176829bcef7058ff2be8f175295e9f80008e5", + "description": "util/hash_table: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "013df5849897e71f62a0df12691f19f0d56cbdf3", + "description": "i965: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edd62619a1c455226a5bc972b024ea77debecfa5", + "description": "freedreno: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e9af023234d7dbe3349a5303312c613dd28c861", + "description": "nir: replace fnv1a hash function with xxhash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d647b1c48a8a25fd93fbaae5a6119e3f7d09ea5", + "description": "panfrost: Only run batch debug when specifically asked", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c5d1e286003bedc52b92cef74013d2d3bb56f6f", + "description": "panfrost: Add debug print before query flushes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be784cc77b88fee2aad4b6ee3bb49e44d3bf1639", + "description": "radv: Implement vkGetSwapchainGrallocUsage2ANDROID.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "d555794f3032594dbef3623052103900138d2356" + }, + { + "sha": "9a74746bd1f3bd28d4c4c7cba75e3245e1d25530", + "description": "EGL: sync headers with Khronos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "045267d1e6238e45eb98f286332ee233dec53312", + "description": "st/mesa: Clear texture's views when texture is removed from Shared->TexObjects", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a51ab5f95624d1c29d9592aa2212f80dead8ecdf", + "description": "radv: Do not close fd -1 when NULL-winsys creation fails.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "cd6ec2b1abbd96f4456e92613c2232c919bb9023" + }, + { + "sha": "cd0c5b64cccd833e0e93e29d263a6cdd86965d3c", + "description": "radv: Remove dead code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd61f5234d2c275b21c249fc2effc058a74ecf0a", + "description": "radv: Handle failing to create .cache dir.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f4e499ec79147f4172f3669ae9dafd941aaeeb65" + }, + { + "sha": "906435fb0ee3c205c53c67cc641e73182cf5ae4d", + "description": "radv/winsys: Remove extra sizeof multiply.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "eeff7e11544f333d211c8f8ad3679db814050cfa" + }, + { + "sha": "6c99de98eca60daf40f10291637475c03d8183b7", + "description": "gitlab-ci: Enable -Werror in `meson-s390x` job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3c0f82841a29e191d07028391bfe687ae01b7df", + "description": "radv: advertise VK_AMD_texture_gather_bias_lod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e265b94a2b7a8b681a95d512ad991d6ae3fb69a", + "description": "radv: add support for querying which formats support texture gather LOD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94570e87bd7702816158ce49a612e4b4e278f7f4", + "description": "aco: add support for bias/lod with texture gather", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e99c818cf0666132f0cb76dc2d78e795d0168868", + "description": "ac/nir: add support for bias/lod with texture gather", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "41dc3ce449ff6fc47691202ef7c5ee8fa37668c6", + "description": "spirv: add support for bias/lod with OpImageGather", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd39bf52b0783c6a20acb306eae48958ed22df63", + "description": "spirv: add SpvCapabilityImageGatherBiasLodAMD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7943343a0fc2463987f667658a257b74cec1782", + "description": "glsl: subroutine signatures must match exactly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bc18b79a4625ae39f1d44e8139017834a53736c", + "description": "radv: advertise shaderDeviceClock on GFX8+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14292310d930263cdea9de57ca28faee628c4a78", + "description": "ac/nir: implement nir_intrinsic_shader_clock with device scope", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b034f6cf2a267d3f5cdc24271bc61e5c496f1744", + "description": "ac/nir: fix shader clock with subgroup scope", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cecd4aad4605de47c056913ed430ad38f14625e5", + "description": "aco: implement nir_intrinsic_shader_clock with device scope", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37c88c670f79f4833856e9193d3b7696c8b5ad8a", + "description": "spirv: add ReadClockKHR support with device scope", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "769bf48d1667dc7836d9c4af01c37005b2dd96f5", + "description": "radv: remove useless assignment in build_streamout_vertex()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1fa60838edf4c6b1c01311c5fbeb28db4d29f67", + "description": "radv: cleanup physical device features", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "198e5e2e9e7d4f0ae7f52a87181728a492973296", + "description": "radv: do not return from radv_GetPhysicalDeviceFeatures2()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c130a3402e61ba62a2d90f71d4b196b8c5597832", + "description": "r600: Use TRUNC_COORD on samplers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4174a13459da6086963b4b43b4e0dcdd680da6db", + "description": "panfrost: Ensure final.no_colour is initialized.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "3e4e849e6a9633702e26ee16b4a594361e42013f" + }, + { + "sha": "73c0f60d8c7c832b49da64740f5d9cbe130811b1", + "description": "r600/sfn: Initialize VertexStageExportForGS m_num_clip_dist member variable.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f7df2c57a207a386ba0d2130541ac9d0546670e1" + }, + { + "sha": "76a2aeeef3d314e3eb1f35029e1e86bf69ad5689", + "description": "llvmpipe: Fix variable name.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2e5cddacf7fb6e031540ae9f459d19cce5edefc4" + }, + { + "sha": "4e147e2c94e20d144d55996de288d6737e5a76f1", + "description": "docs: drop no-longer-relevant comment about bugzilla", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "444138d6d9f7c0a07ff043bae623efa5ae26b3c1", + "description": "tree-wide: fix deprecated GitLab URLs", + "nominated": true, + "nomination_type": 0, + "resolution": 0, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9375e72d8d1a7aedefeb1d6fbce6384d640a8d2e", + "description": "radeonsi/gfx8: enable TC-compatible HTILE from the beginning as before", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "0d83e7f4b9887346e9b7b4d44c068d340aa04f28" + }, + { + "sha": "d30e1e486dd2e78bbf98ce24cc2f3c7f4f22b56f", + "description": "radeonsi: don't enable TC-compatible HTILE for stencil if stencil doesn't use it", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "caeb44aa244082f3a304a00d5d4e32faf66fca70", + "description": "radeonsi: split si_all_descriptors_begin_new_cs and rename functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b6b35c6b5e96f16e32b55279bcb6e3c4099447d", + "description": "radeonsi: move resetting tracked registers into a new function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3509d3bd53e6b386e8e153e8e3f701b3f631fc8c", + "description": "ac: update register and packet definitions for preemption", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56af131f330f06bf6075681c528fe70f221fbbe2", + "description": "Revert \"radeonsi: don't wait for idle at the end of gfx IBs\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "266fec1307b26a544007423582afd8618791893c" + }, + { + "sha": "3f1f23239a149939bf0e520722758e98d10dc908", + "description": "radeonsi: decrease the max GS invocation count to 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cd96b510938536e264907aa3886774a853c0821", + "description": "radeonsi: don't use INDIRECT_BUFFER within IBs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8db739880af2d50e871b6bd27437a2ad37cf6c00", + "description": "ac/surface: don't compute single-sample CMASK if it's unaligned", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "cf61f635ff6a38aad344ebe30551eaaac6fec038" + }, + { + "sha": "21504eab78eb465e27520baa7389fa732bfefa36", + "description": "ac/gpu_info: compute the best safe IB alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f365affc906ed9b07857a6fafbb5d51f3f1a607", + "description": "freedreno: Use the right amount of &'s", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f33ca1fed59c4d7b4abdb2121cdc907bc4ccd2e", + "description": "freedreno: Add missing break statement.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5a6beb6a24aa084adfd6c57edd0a64f0a044611a" + }, + { + "sha": "f0e075ce6eca7bdb26d8e55cf7d4dd459199363f", + "description": "nir/copy_prop_vars: Record progress in more places", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "96c32d77763c4b561f751ca360e6539a3c5e7f4d" + }, + { + "sha": "db6d9cdf0661fbe25b1bc767920a5f6a0944935b", + "description": "nir/opt_deref: Report progress if we remove a deref", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a1c688517dee32c57af17d8e11029eb7470f52d4" + }, + { + "sha": "111b0a669979cf277f31c69f501982fee004e067", + "description": "nir/lower_double_ops: Rework the if (progress) tree", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d7d35a9522ea5b524732a8b730008d725b84abdf" + }, + { + "sha": "78786a219ea2322af09576472dcc2d6d01cb9060", + "description": "frontends/va: Fix deinterlace bottom field first flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "569ca93751d2bebbfbd3cf673c8da447b1f2f295", + "description": "pan/mdg: Allow DCE on ld_color_buffer masks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8c16200e9730e4f4f56dc1478dc72dccce26203", + "description": "pan/mdg: Ensure we don't DCE into impossible masks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "197b398c32a9b08dbd60d98d32972271b24ed07c", + "description": "pan/mdg: Lower shifts to 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a52e975e4e97061b6ccab1b027bc4524460a7fd", + "description": "pan/mdg: Add pack_colour_32 opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7cf5a30c747e83ef728142e472c341a01f19a70", + "description": "panfrost: Handle !independent_blend for blend shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9283eff6d16174f3efc1da81dcdb601af2ec170", + "description": "panfrost: Use _mesa_roundevenf when packing clear colours", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8bb51992c8ecc25170d8fd986f932353307a2438", + "description": "panfrost: Fix dated comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be71e2fd08b364f331a7e44d34f67954f0aeebe3", + "description": "Properly check mmap return value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38f32372aa94519f86f3358bea1e26e11398d59c", + "description": "ci: Improve baremetal's logging of the job env var passthrough.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae442c35982e1052267affd92a68f875159a2d08", + "description": "ci: Enable a fractional run with UBO-to-constbuf disabled on a3xx.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4bccbde3684255f7f9d4e5cbb443ed849fe9c91", + "description": "ci: Don't forget to set NIR_VALIDATE in baremetal runs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6839ad59e6850beaa95a2d3a401672334ebd08e0", + "description": "ci: Do an explicit NIR validation-enabled pass on freedreno a630.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90cf494338512bde89717600efabc4135545216e", + "description": "ci: Fix DEQP_CASELIST_FILTER (used by a630 noubo run)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09fc9c5f6ca02e5eebc5a11771c2cc1a9df5fc7e", + "description": "gallium/swr: Fix building swr with MSVC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40255831230a6dba45b09ab98d5b626a5bdb18ef", + "description": "mesa: Fix double-lock of Shared->FrameBuffers and usage of wrong mutex", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7534c536ca0f4b2b123200f421460094034f37a3" + }, + { + "sha": "0d2ec80dea8adf3089f187f8c0e9bdbcb135b4f2", + "description": "zink: hammer in an explicit wait when retrieving buffer contents for reading", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af2d99353555715afe6e6b6ba5158a2cc0d6b015", + "description": "zink: reset query on-demand when beginning a new query from resume", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3933747d87680a3432814aa51f2f5231b2f1ed60", + "description": "zink: fix vkCmdResetQueryPool usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae32a1ed20ebd49c8fb42b9dd31d26c046881f46", + "description": "zink: flush active queries on destroy and free query object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4592c1d45df2c3ec74c29db1294cf31cf8bd1649", + "description": "zink: add SpvId returns to a couple ntv functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21a7fdf97c3c2ad4119fc478fe0f3f4b90ff411f", + "description": "zink: explicitly zero some arrays in ntv", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e75effc629fbae477284ef80bb25a9646cd32cdd", + "description": "radeonsi/sdma: remove useless compare", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "004ac58509d601c4be5c7905ccd9ce0f647df05e", + "description": "amdgpu: fix unitialized variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d92ab0e76319220919fccf98550db9adf4030313", + "description": "radeonsi: fix inversed arguments in si_test_gds_memory_management", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dddd91eef326dbcdaec2a7fee6fa429d1cf6542a", + "description": "amd/addrlib: fix forgotten char -> enum conversions", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e3e704c7e7e46dfda820ea2e96fa4a32d63ef796" + }, + { + "sha": "685e79a64bbd6ead6f21b21ec47f55e06a8ce624", + "description": "glsl: Remove integer matrix support from ir_dereference_array::constant_expression_value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22979f90d9587e7f31c70d07b0b8517ff0bfcaa7", + "description": "freedreno/a5xx: Define the 2D blit UBWC pitch fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a154aea0d3375aa8469f28bb8a85e5ee79eef4a", + "description": "freedreno/a5xx: Set MIN_LAYERSZ on 3D textures like we do on a6xx.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f62566ef6f8e096b9bbff5bc51e74b9fe44f241", + "description": "freedreno/a5xx: Add the outline of a unit test for a5xx layout.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7003df71774ae08e260ec01dd8fc95b20c510d1", + "description": "freedreno/fdl: Separate the list of a6xx testcases from the the test code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1a739995bae4f2ad2b075416e9a310957e1a146", + "description": "freedreno/a5xx: Move resource layout to fdl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e85b6c4ab130670916ef83f32891f8c1023e4a4f", + "description": "pan/mdg: Eliminate remaining divisions from compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b9f6d30f8ba5d693e87bc172bf577c6dd83dcfc", + "description": "pan/mdg: Avoid division in printing helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f5b3802dc4c59cb99e3c81144629d5aba9c085a", + "description": "pan/mdg: Eliminate 64-bit swizzle packing division", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28a750c5f2a4b31c4d58165d19c91310b05a26b1", + "description": "pan/mdg: Eliminate expand_writemask division", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6c906ecdf973166d17f253d0b85894f42f62819", + "description": "pan/mdg: Cleanup comments that look like division", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55da8bcede40ebaf3bb28b381b815ebfe9c4c3b8", + "description": "panfrost: Fix transform feedback types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef57325fba6a1410861f39327c4f26da91b0c163", + "description": "panfrost: Don't set CAN_DISCARD for MFBD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1085f74239f2b81e4e17ece4b9b7a805ee8dd250", + "description": "panfrost: Avoid redundant shader executions with mask=0x0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e4e849e6a9633702e26ee16b4a594361e42013f", + "description": "panfrost: Disable tib read/write when colourmask = 0x0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f69b6e91164fe672c1ae9e54b6f17387d81cd9e6", + "description": "panfrost: Remove dated comment about leaks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6dd11a6dc34c624a90de3e064ad3552830a8d0d9", + "description": "panfrost: Limit blend shader work count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8bd356dff032ea3a67158f133fc24da39c9e0b7", + "description": "panfrost: Allow tiling on RECT textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c41cf03589bdf030f9e3ca312d86f7078a9d06bd", + "description": "panfrost: Allow bpp24 tiling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48cc608859cceb523da1a5e74c0e4aad91f3984c", + "description": "panfrost: Don't zero staging buffer for tiling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f2997dad061ee38e06f654f524dffbffc5b2a4b", + "description": "panfrost: Don't set PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a4eeb21bf69fa0ab4e1a5954c1dd0f79441341a", + "description": "panfrost: Fill in SCALED formats to format table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "98fc955c6e2f034d1357ea436416d9faaeb1f694", + "description": "panfrost: Remove deadcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "794c239a990e08b5a42d42607e9c5e5b0921390c", + "description": "panfrost: Keep cached BOs mmap'd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "485ec761082ddfd952f80bb96b5fb5a607349b08", + "description": "panfrost: Guard experimental fp16 behind debug flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6293425bfe582e15b7a1460b14e33835b90c98f", + "description": "pan/mdg: Pack 8-bit swizzles in 16-bit ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca48143ec4ba78ea472016add2c7531018549cbe", + "description": "pan/mdg: Implement condense_writemask for 8-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f768cb04ed114d7ac65d8bfbaf130002c81448d6", + "description": "pan/mdg: Implement vector constant printing for 8-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28201af08038343e428036d5b4676d5eb74b0199", + "description": "pan/mdg: Use shifts instead of division for RA sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d435b334b4e22a89104728f0c32d8b2864b4903", + "description": "pan/mdg: Pack barriers correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fde1f2b7cb060c1fd5a00e1447bf69b8bbd15e8e", + "description": "pan/mdg: Fix type checking issues with compute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e4c9f5f5ac4373dca5177cfcecc484a476cbf36", + "description": "pan/mdg: Separately pack constants to the upper half", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d475d19f097f6c6d65cf5cc5ef149ebfbddd80e9", + "description": "pan/mdg: Only combine 16-bit constants to lower half", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b4e278628baac10c0cef5a19906362cefb3ab61", + "description": "pan/mdg: Factor out mir_adjust_constant", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b833702cc1b53cb6e0f0e486a56aa62d9650e79b", + "description": "pan/mdg: Print constant vectors less wrong", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd26bd9425e80dc3236942913f6bf6d670943003", + "description": "pan/mdg: Round up bytemasks when spilling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68d2a889b7a238b187cdf48afa2ed78874d3f23d", + "description": "pan/mdg: Print mask when dest=0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "553c2cf16b7612d4a70bd96230dad63777ec867e", + "description": "pan/mdg: Set RA bounds for fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b91d71597e4fba907d27f2a82f070c5a25abde5f", + "description": "pan/mdg: Eliminate load_64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ff2cabe87601d95bf945339ee1b3ea4b4d8bc72", + "description": "pan/mdg: Use type size to determine alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51582e54541a35b4eddd7dab98d8f676bcc46c53", + "description": "pan/lcra: Allow per-variable bounds to be set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0737080ba611f6703a2cec5f4aa3389fc42338a4", + "description": "pan/lcra: Remove unused alignment parameters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21405f6fcfc428af3f2aa9d1bc1c3b10b25a71fa", + "description": "pan/mdg: Ignore dest.type when offseting load swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f5bad649be3914a6965bda97ca275de989bb7c0", + "description": "pan/mdg: Don't generate conversions for fp16 LUTs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b023b35455c3b4329053b7381063f19611f4d38", + "description": "pan/mdg: Implement b2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1108eaa90de8507d405e7751db83764770eaa931", + "description": "pan/mdg: Streamline dest_override handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e4793a95c2ead611b81365ea57789bff326d7db", + "description": "pan/mdg: Remove redundant redundancy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1cd65353c9dce4fcb3dd70733b5366b04765caaa", + "description": "pan/mdg: Defer modifier packing until emit time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edf1479beaef2d2f674918cfec758c1823f21e71", + "description": "pan/mdg: Remove promote_float pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72c1e3a66a7ead84e0b895a7bb11d5501238a013", + "description": "pan/mdg: Promote imov to fmov on a NIR level", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cfe2fc1b19120ada25e4b4cd1134418162f3d9f", + "description": "pan/mdg: Identify scalar integer mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4a42a78d89e4de356b514a569d87c6106b9145f", + "description": "pan/mdg: Use type to determine triviality of a move", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df3d932bb4e0f0a2b6e1d08d142cfaeb034fefa4", + "description": "pan/mdg: Use src_types to determine size in scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95dd478ed325fef8d947f771eae02513725f0f56", + "description": "pan/mdg: Add abs/neg/shift modifiers to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31e13956e128b9409a7c34f1b5c54081079c13cb", + "description": "pan/mdg: Explain ld/st sign/zero extension", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbcae7c66719c53a0f8b5e4e3ed43d2223650558", + "description": "pan/mdg: Respect !32-bit sizes in RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c012c8f8bb1871486d7f46fd98ff97c857fa64d", + "description": "pan/mdg: Handle dest up/lower correctly with swizzles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8084fc3b6615201165ebf1bd46ecd91606d1849b", + "description": "pan/mdg: Include more types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9a4bd90a86b6c25aec388394af3a888f1184e7c", + "description": "pan/mdg: Remove mir_get_alu_src", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9915bb2c40b0cad628536d179eac47ccf3325860", + "description": "pan/mdg: Remove mir_*size routines", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40e9bee714ebecd8ebbba39d81712ba3714319f4", + "description": "pan/mdg: Fix constant combining crash", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb28a3669be8e9d13b80d1a2859120058c9bccb5", + "description": "pan/mdg: Handle comparisons in fp16 path", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d4493ee11aa653ed4c2cc51b37f9e60b534b8bc", + "description": "aco: sign-extend the input and identity for 8-bit subgroup operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c76595aec2c47463a3ae580c56bd19191f185acf", + "description": "aco: use a temporary SGPR for 8-bit/16-bit literal reduction identities", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3c87c52ea4b8f311f0e87c76420e94b7149d8b0", + "description": "aco: implement 8-bit/16-bit nir_intrinsic_quad_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfa62d97a0fdfd320f8d08e44883342c186acaae", + "description": "aco: implement 8-bit/16-bit nir_intrinsic_{shuffle,_read_invocation}", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f03e56eaf0bdd94098960ca6d8b49564f5757992", + "description": "aco: implement 8-bit/16-bit nir_intrinsic_read_first_invocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af7e2c61335640b4b23bdf907ea9ec94c89c218b", + "description": "aco: validate 8-bit/16-bit VGPR operands for readfirstlane/readlane/writelane", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86e2b03e3f8862d52fd7ff0945eab423ba03ad26", + "description": "aco: implement 8-bit/16-bit reductions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc79945b215c3a78074905deeb27d0300034994a", + "description": "aco: declare 8-bit/16-bit reduce operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf97150d45c9642daadedad05039209e28abe4d1", + "description": "no_extern_c.h: fix typo in comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "089b0310efb6d93bc78bb4b1b0d5e7494db15e27", + "description": "docs: fix broken release-calendar", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "34718070ef899b1faf3baeb3c741114ac1c04b0a" + }, + { + "sha": "40ed7fcc0bac0cf46188a527deb44b038f0c0b59", + "description": "aco: fix typo in insert_waitcnt's kill()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51f4b22feec3720c89458094a3245efc984115ee", + "description": "aco: don't allow unaligned subdword accesses on GFX6/7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae390755fea8b48f63314ecfd2699e1d2c375e76", + "description": "aco: fix corner case in register allocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acec00eae0604dd439a665f2ae8942c16d057853", + "description": "aco: don't move create_vector subdword operands to unsupported register offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5201985332dae703566764606cbbf4d6f56fc40d", + "description": "aco: restrict copying of create_vector operands to GFX9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8635c28a9264e24e5f0b64e68af31024945cda98", + "description": "clover: Address unnecessary copy warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15a27ed73b2df959b095b9e4e8413e77ee55ef30", + "description": "clover/api: Address missing braces for subobj init", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5500a2b7fc558217bbd4c2a966ab6fcadaed8b3b", + "description": "meson: Disable GCC's dead store elimination for memory zeroing custom new", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3045cbc97d7417e3036ba8f1f6f5189c1254407", + "description": "radv/winsys: remove useless free in radv_amdgpu_create_bo_list()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57a4837f6be2b0c8b6a9174d0a385c7f0bfb00e1", + "description": "radv: fix duplicated expression in ac_setup_rings()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef042ae7c369ef7045ba73f79f633e0e76661fe9", + "description": "radv: fix missing break in radv_GetPhysicalDeviceFeatures2()", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "57e796a12a883f5845c8e0ed05ec6bdb6d055b53" + }, + { + "sha": "1ad9a8a884eccffb2a450746e5cda8d14da82004", + "description": "aco: fix missing break in label_instruction()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22554e1fbc20d0f642e952fda13d8f2631a898eb", + "description": "llvmpipe: compute shaders work better with all the threads.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1b24e3ba756ba4951086d0b1b351364acbd3af1d" + }, + { + "sha": "02a1f95386b43bf46cd1c8297d0955242f554fa2", + "description": "dri_util: Update internal_format to GL_RGB8 for MESA_FORMAT_R8G8B8X8_UNORM", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "bf576772ab4d9e76dae716640bed6de879f19567" + }, + { + "sha": "13fc03f4c0e709f6d1a8d811f9bc8a0c8c42943c", + "description": "freedreno/a6xx: Avoid stalling for occlusion queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c21577246691589f0295081d208894082444a02", + "description": "freedreno/a6xx: Emit VFD setup as array writes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f494636faf00ac5a2f6e88b0100c642fc04536a", + "description": "freedreno/a6xx: Allocate ringbuffer based on VFD count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3275b8082a5217888897665d6040bd16652950f8", + "description": "freedreno/a6xx: Map inputs to VFD entries up front", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b7a73021cfa72a0e1bdccf5573b64b8b2e1eb97", + "description": "freedreno/a6xx: Create shader dependent streamout state at compile time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bac0dd99b3fc6b11c31b6408fd86e9a5d8047de", + "description": "compiler: delete leftover autotools test wrapper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba44990726cf935f54aa414b8430452358da23dc", + "description": "git_sha1_gen.py: fix whitespace", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c909370117d702d2d9e671ff9a611c85f30539c5", + "description": "git_sha1_gen.py: fix code style", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "413c6f9905262939cbe56cb021d7806eb7d97de1", + "description": "git_sha1_gen.py: fix out-of-date comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f68db81cbb3c8f2b901fc5dcac02d01d3e698cc3", + "description": "anv: disable VK_EXT_calibrated_timestamps when the timestamp register is unreadable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a62ee262fd43f1c7f204de4e5b822d95c2ff6977", + "description": "anv: replace magic `| 1` with already #define'd name", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e27f311c859652e11d5e793a7041efc50cdbb2b3", + "description": "anv: pass the fd directly to anv_gem_reg_read()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6bf40c28c9bf2e59b34ee7d7ce9e2698a57afba2", + "description": "ci: Make a530's GLES3/31 fractional runs much more complete.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6033c10092ae69ce2a0ad8fe0a25e124f6bbf50c", + "description": "ci: Disable SMP on the a5xx boards.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1b746284985a34e61c315ba586c789c607ac3ba", + "description": "i965/vec4: Ignore swizzle of VGRF for use by var_range_end()", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "48dfb30f9231c22a6af6885dbc9ef86dc2edde1e" + }, + { + "sha": "10095387f569bfe8a07dfe2f006209635a0b3647", + "description": "r600/sfn: fix nop channel assignment.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34718070ef899b1faf3baeb3c741114ac1c04b0a", + "description": "docs: update calendar for 20.1.0-rc4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81201e461772a06ed2c20207d2901f6cf9a24114", + "description": "anv/gen11+: Disable object level preemption", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a6beb6a24aa084adfd6c57edd0a64f0a044611a", + "description": "freedreno: add adreno 650", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72d7d2145c400816dd297a20078adea2cfcc3146", + "description": "freedreno/a6xx: use RESOLVE_TS event", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e49748521ec9182e8d2eec823182cc463709123f", + "description": "freedreno: reduce extra height alignment in a6xx layout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6f8a19092027ab0248e216997a5529565ce2e12", + "description": "freedreno/a6xx: split up gmem/tile alignment requirements", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf024c96ad33dccbbd8e823407e085e75171b5a5", + "description": "freedreno/a6xx: don't use gmem_alignw for imported buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b65fcb067c6f6a34b08d03e40edf62dc0a9472f", + "description": "freedreno/a5xx: remove unused reference to gmem_alignw in layout code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa2186db0e8c8cc5ed9a9ab6995948e36067f8ba", + "description": "freedreno: move a4xx specific layout code to a4xx code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55803224865d735f060c55cc8940946da725cb0b", + "description": "tests: Make tests aware of meson test wrapper", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef0d92459c3fda92fb3a42b55b7366d9b951a9bf", + "description": "gallium/auxiliary/vl: Fix compute shader scale_y for interlaced videos", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "494b7ef0c1a440c57f5a6a8a301fba4f7e551417" + }, + { + "sha": "fc06b8b7dc27d9e0b1a84e898d9f42465bd491e4", + "description": "pan/mdg: Optimize liveness computation in DCE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c24dfc9da42abadf079b012f0d6e52fb4c829112", + "description": "pan/mdg: Precompute mir_special_index", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4cf02b5d4a649b9fe621e3ef855021389663222d", + "description": "pan/mdg: Optimize pipelining logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d39f95b75a641d1587151c77c23de85d3d81e89a", + "description": "pan/mdg: Emit fcsel when beneficial", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db9e16450dac9925c0763de1971c6e18de7944f3", + "description": "intel/aub_error_decoder: print driver identifier if found", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64473fd8f7d5d72c4cdb599fc2da8a15e465344e", + "description": "anv: add identifier BO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "507b1ca10c62833b515dcbedf9ee56e3812b88cb", + "description": "i965: add identifier BO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a4c361b069bb84facc7e6b8ae19908505c12850", + "description": "iris: add identifier BO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "805b32cab90547a576afba4a16b04603d355a4af", + "description": "intel: add identifier for debug purposes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e81de67d858ce2d6bde9f4c7b06a05ec3c1f4f2b", + "description": "i965: store workaround_bo offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07781f0afef80c22389c4ac92dbce2cf47c9ab45", + "description": "iris: store workaround address", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33b452aae72a185c877d6f9cd806d8ffa20d13e2", + "description": "anv: store the workaround address", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ff5b9e6923d2083da6c97b6569e50dad78a72eb", + "description": "blorp: rename workaround address function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f36708b143fec1a09c46ea13324df08c0cb5f667", + "description": "anv: fixup unwinding of device create failure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "faf28b83fddad6f12e536360690031b9ff2039c3", + "description": "panfrost: Enable PIPE_CAP_VERTEX_COLOR_UNCLAMPED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3725aa7b5dbea96a747ede0182a3c8a52d756948", + "description": "glsl_type: don't serialize padding bytes from glsl_struct_field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d9996e223ee6893acba95c7f5100d8345044e6a", + "description": "turnip: enable 422_UNORM formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d070a7ba0cfb11f1e01774b9dd3775ab7cd0c4ea", + "description": "turnip: implement VK_KHR_sampler_ycbcr_conversion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70502f071cde4497104cf2b0d46a7e9fda5cef3a", + "description": "freedreno/registers: document 422_UNORM and 420_UNORM formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75d7ee80291d6693ca2611bf8ad2bb14a34588db", + "description": "util/format: translate 422_UNORM and 420_UNORM vulkan formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0e11231a4fa7c7c4da2b4f9aed47a6000687f18", + "description": "intel/perf: repurpose INTEL_DEBUG=no-oaconfig", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2001a80d4a81f2e8194b29cca301dd1b27be9acb", + "description": "anv: Implement VK_KHR_performance_query", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ceb822f9e00f57ebf7fccea4dd8acb510e28cefd", + "description": "intel/perf: reuse offset specified in the query", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63c193e921e5936c0d4a906285627ac11fd267f7", + "description": "anv: use a query filled by the perf code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93924ab0917b52be1a3001509b229c87b7ee6e54", + "description": "intel/perf: report whether the platform supported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe8e8e509992884963e1a17286dd159971a54cf1", + "description": "intel/perf: add counter category to generated code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c36933e0818ade16cb9d29b91c7d4f0068f7af39", + "description": "intel/perf: add helper to compute metrics from counters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7890f559b34e5a97689f16f0f1cdb1651fc4de7", + "description": "intel/perf: emit counter units in generated code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d15369332ff400aabb008f9990f7b990b3c8643e", + "description": "intel/perf: compute number of passes for a set of counters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f0c4c2afe4aa669e3f8cac668c3fb4dc359f57f", + "description": "intel/perf: create a unique list of counters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65d242ff5e57319c065cec4192dcec6237d60b91", + "description": "intel/perf: update generated code to ralloc all data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a683e7f3dc82fabee8ae88931d608ced6c1523ab", + "description": "intel/perf: store the appropriate OA formats in queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b8eaa84a3e80d1df1c2467dc31432824cffd610", + "description": "intel/perf: make pipeline statistic query loading optional", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc13bfbd05934f4053b633627f5bd2ef1108537b", + "description": "intel/genxml: add PIPE_CONTROL command cache invalidate bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34a0ce58c7f85ea3ec3f1026469ce06602f38a5b", + "description": "anv: add a new execution mode for secondary command buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a96d92a689a3e6112b5e2b4cc1b99b1152d7961a", + "description": "anv: don't reserve a particular register for draw count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "796fccce631bf8ecb6ce2fd1a68f219788693a6e", + "description": "intel/mi-builder: add framework for self modifying batches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "570bd760d3e1c2754fc045981d2162df67e81592", + "description": "intel/genxml: fix bits generation for MI_LOAD_REGISTER_IMM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee9b17fc26178dffde63f793391a54a7c67f292d", + "description": "gitlab: Ask about reproduction rate in the issue template", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "989619c05b97e57d8d8a805954ff4553c4f3763c", + "description": "nir: Add const to nir_intrinsic_src_components", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29afa88941dc9319257f19daacec88eef9e72b29", + "description": "pan/mdg: Apply outmods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db7b0eb9112479a82a775c879186602e15c733ab", + "description": "pan/mdg: Use helpers for branch/discard inversion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5500b1f2801cf7b0056cdbdec4d168bda58e36e0", + "description": "pan/mdg: Remove invert optimizations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "449e5ded9340243b68183d7fffcc838cf283c89c", + "description": "pan/mdg: Treat inot as a modifier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b124f5315cf5b6a81ddfc18643a3a52ed9a87a83", + "description": "pan/mdg: Apply abs/neg modifiers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24e2e24dc0124e5c5f9426a5f571a487b2ac6bfe", + "description": "pan/mdg: Ingest fsat_signed/fclamp_pos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22bb5a9acb27f12f175ca76128912b6cd16fff79", + "description": "pan/mdg: Prepare for modifier helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f0455de6fc8a62d280e965eec15795c6652719e4", + "description": "pan/mdg: Drop nir_lower_to_source_mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acc5afb0af5ed64b9469c07dc02ff0d7a018a24d", + "description": "pan/mdg: Remove .pos propagation pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aeb55180ff34c50006dc3ba49097e9834bf32171", + "description": "panfrost: Add modifier detection helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2b0f3c17d055152795c022da8202043baa3e15f", + "description": "nir: Add fclamp_pos opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0aedce417ae48293f8bebd41c3d69b759b003cf1", + "description": "nir: Add fsat_signed opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "518909290b0123f3bcfec8d6854c25ce451c44e8", + "description": "tu: Support VK_FORMAT_FEATURE_BLIT_SRC_BIT for texture-only formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74f1c304e8d0ba53f9b079684a5e06ca08b0c5f4", + "description": "tu: Fix buffer compressed pitch calculation with unaligned sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da68c727156babbb25392d8c2efeec410cb69d2e", + "description": "tu: Fall back to 3d blit path for BC1_RGB_* formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d5cc5ff22164ab8effd5d1cc5ef3a16fb2e0984", + "description": "tu: Always initialize image_view fields for blit sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc4a02d0ed0755c5cf7b75757b402ea81ce54c24", + "description": "nir: Add a store_reg helper and use the builder in phis_to_regs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fdbeb70e1a1f98baa6a830901aab44ebd74c078", + "description": "nir: Add a new helper for iterating phi sources leaving a block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c8c5cc87d55546cf3b3bedaf0da5bd3ecede322", + "description": "nir/clone: Re-use clone_alu for nir_alu_instr_clone", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c62dbb145712b49e011feaebd9bf9499b2dfe58", + "description": "radv/winsys: Finish mapping for sparse residency.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fec36c0668bf8f1fd477c97549a0b50f1c55cf4d", + "description": "intel/drm-shim: Return correct values for I915_PARAM_HAS_ALIASING_PPGTT", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "0f4f1d70bfe96fc9e218ef5c2196bb677aaa251b" + }, + { + "sha": "c8635b6fd350baaf85e87f06ffb76b080094df17", + "description": "intel/drm-shim: Add noop ioctl handler for set_tiling", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "0f4f1d70bfe96fc9e218ef5c2196bb677aaa251b" + }, + { + "sha": "f8314291b33c7e33b859ffb678a330ca7ae47c9a", + "description": "radv: Expose VK_EXT_pipeline_creation_cache_control.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32e92831456977284537d7d7af990dabf8008503", + "description": "radv: Support VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e11f077bb24e8aa25572a4b8f5e5a05df48c9f56", + "description": "radv: Support VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dde998685e7b254b1a80ceafe64776d6d1f28ea9", + "description": "radv: Support VK_PIPELINE_COMPILE_REQUIRED_EXT.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46624f277e4ba1ce92d9747041719134ba023824", + "description": "panfrost: Enable AFBC for Z24X8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82792ef19fd757bcc5571db875098d2f272f1f86", + "description": "panfrost: Fix Z24 vs Z32 mixup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "861e7dcae6b1156f9eca5ad28b6e21a1e62358da", + "description": "panfrost: Switch formats to table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6be9e094737aa9bf221724f14228bf2f1286bb2f", + "description": "pan/mfbd: Add format codes for PIPE_FORMAT_B5G5R5A1_UNORM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aca15d5cba6f8a4a7898166648fd0a6f841df1de", + "description": "nir/opt_if: use nir_src_as_bool in opt_peel_loop_initial_if helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50bead32b150a869f1c17efbee8476114d1462a7", + "description": "nir/opt_if: run opt_peel_loop_initial_if after all other optimizations", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "52c8bc0130a2031904f8f4e2187baf2f3f8ff6ec" + }, + { + "sha": "d221f70299cc4b14316fe83eeb5ae28797421a63", + "description": "nir: Add documentation for each jump instruction type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d011fbde5ceda9924bae389278e8a278eb2dd2cf", + "description": "nir: Use a switch statement in nir_handle_add_jump", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c87082c940ddba90e7bd0aefdf834eb2c335a74", + "description": "nir: Validate jump instructions as an instruction type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0fb3dc8d1030cf3c373bf90dea61d03d22950b58", + "description": "radv/aco: enable storageInputOutput16 on GFX9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc1a1da8abbc88e17fc6dec8d000436898187971", + "description": "aco: fix off-by-one error with 16-bit MTBUF opcodes on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1647e098e94d6aab0b4c454ccdd5300afd1d0079", + "description": "aco: implement 16-bit interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bbbb4057e64f95e95bd9f302a19e5775556ad600", + "description": "aco: emit v_interp_*_f16 instructions as VOP3 instead of VINTRP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34f2c4dc6a6d62677bb7478a70e71b0e5719fc4c", + "description": "aco: validate v_interp_*_f16 as VOP3 instructions instead of VINTRP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fba5bb9cc49aa526ce9b108229aa7e01349275d", + "description": "aco: implement 16-bit vertex fetches with tbuffer_load_format_d16_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ffd3946055ec2761d7a475559f580c587e7ca78", + "description": "aco: implement 8-bit/16-bit mov's with p_create_vector", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "860b4d16f46c343863653faccaa596c193ed13b0", + "description": "aco: allow to load/store 16-bit values in VMEM for tess and geom", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bd3b67163a0dade4268be9fbc3ca8a220c8bb7a", + "description": "aco: convert 16-bit values before exporting MRTs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "462a5fe6f44e03e1f39238174628beda3e43fa18", + "description": "aco: store 16-bit temporary outputs as v2b", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3fb064e000a8706319dc996788159bf84a13f0f", + "description": "Expose EGL_KHR_platform_* when EXT is supported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52d6b4d6c0fa480ead93786851349dbd0a54d394", + "description": "pan/decode: Fix min/max_tile_coord mixup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "deb78eec1b7c2fa019c5f8ab8284dab2818b68f7", + "description": "pan/decode: Use a page table for tracking mmaps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c355f1ae8be2a7e1f9141c5433bdbb35fdcd7e6", + "description": "freedreno/ir3/validate: add checking for types and opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f484d63617afe67f9eb8ba98f96e5c3617aa43c8", + "description": "freedreno/ir3: add helpers to deal with src/dst types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3561d34fff44ae82283dc1d4352e87d47793c40e", + "description": "freedreno/ir3: add simple validate pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "554f3d54ca1abac21df7ab052c1b48d9d44e774c", + "description": "freedreno/ir3: fix mismatched wrmask for overlapping VS inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16cd232dbc31fd594b8ce6b04576870b36d301f6", + "description": "freedreno/ir3/cp: fix cmps folding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39de27d3b9031af5115504893b5e1bd42756d3db", + "description": "freedreno/ir3/print: print cat2 condition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b86b5ed7d355c97b5eb889637e9dd66d98a6e1d", + "description": "freedreno/ir3: fix immed type in create_addr0()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3474ba53b5e6560e758cef51b50d248b6fb806ec", + "description": "freedreno/ir3/cf: handle multiple cov's properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3db5d146e97e132e17fe7e3aca306c3523d14279", + "description": "freedreno/ir3: fix mismatched flags on split", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b24b6a83653f1b53237288215cbff94ffeb2dc56", + "description": "freedreno/ir3/group: fix for half-regs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fcfe5eff63358371b10f1cb75964e3f930d4c858", + "description": "freedreno/ir3: make input/output iterators declare cursor ptr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1d33eed417a78407d0955b603085f4e521d9edf", + "description": "freedreno/ir3: make foreach_ssa_src declar cursor ptr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65f604e3b3b25bb95c96062675817a3828562e26", + "description": "freedreno/ir3: make foreach_src declare cursor ptr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "599fd861d4898a0e1c51c64f2a5ae2665e052b53", + "description": "freedreno/ir3: be iterative", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b828929ac9d14309b56177350b37def3b001e8a6", + "description": "freedreno/ir3: move where we preserve binning pass inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0cfc06a2c3ab583ab8a29d9f365cb5ee417dfe3", + "description": "freedreno/ir3: add IR3_PASS() macro", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9e5605720f48b3094e708e5c762b340544ab3ff", + "description": "freedreno/ir3/postsched: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c953794cd6e4a5767b0739b1effc48b066e2a29c", + "description": "freedreno/ir3/legalize: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3630c9d29b7bc7dbdf7d914d39782b4d02c5520", + "description": "freedreno/ir3/group: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "721147a05d47229252dc49460f6de1eec793bfc2", + "description": "freedreno/ir3/deps: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4ecfde2dd07494855631865da45a8863372c3ae", + "description": "freedreno/ir3/cp: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "372e4663011e7388114d798ac1028da5dd72ab92", + "description": "freedreno/cf: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6d121502d8e66cd891f3386ccfb0aee0d8af310", + "description": "freedreno/ir3/dce: report progress", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9beb2baaff4a2019aefe3dfc75187d75dba04286", + "description": "freedreno/ir3: juggle around ir3_debug_print()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "947aa23eff7ac6cfb17eb7bc56df0bc9ed4bd2b9", + "description": "freedreno/ir3: remove Sethi-Ullman numbering pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ceb56a5318be6968c252a994cff8045d2e8411a", + "description": "radv: fix missing break in radv_GetPhysicalDeviceProperties2()", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "57e796a12a883f5845c8e0ed05ec6bdb6d055b53" + }, + { + "sha": "bcb0038c8399f7050eb49cfdb227d3d91c1e8804", + "description": "aco: fix disassembly with LLVM 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff98b1b51a6133282ac9a8ee5b3538418999992e", + "description": "r600/sfn: Fix printing ALU op without dest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1124c3f1b629d3fa0d31dea8601aadd72a3339e0", + "description": "r600/sfn: Don't reorder outputs by location", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f942a8e7c72c720bc7d4669a15fa4d37dd7ce7c", + "description": "r600/sfn: Fix splitting constants that come from different kcache banks.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "723ae8177e88a8a129b664371da46c0c9d004866", + "description": "r600/sfn: Fix clip vertex output as possible stream variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ae4b7938e80c71b3b06f9eab26c7cab46a5d308", + "description": "r600/sfn: SSBO: Fix query of dest components", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c247f505c253f6f2eb6652f176e21681219734c", + "description": "r600/sfn: use the per shader atomic base", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd2d7966ac44697d31684e2b5b5b492142915a1e", + "description": "r600/sfn: Add support for texture_samples", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "358b0a57bfd7b5f7e0e05fa4f4cfab5c106a82ff", + "description": "r600/sfn: support indirect sampler buffer reads.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f3ce9b1d0c5ebf37c8f33d7cfa67d6b3a23a17b", + "description": "r600/sfn: assert when alu dest is missing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd99a7737f70a3bf05492c7a72f4dd0509e09be2", + "description": "r600/sfn: remove pointless check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff92345a19a2929a6c229c23be0771acf9728c78", + "description": "r600/sfn: Don't reject VARYING_SLOT_PCNT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15d6d354207adea208377edbafe9ca8180a9e406", + "description": "r600/sfn: Add FS output sample_mask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "91a618eae9f0f126367b14dec8ebe3c80b7cf825", + "description": "r600/sfn: Handle loading sample_pos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70b84920befb3121b58e22e176267a63a8b1317a", + "description": "r600/sfn: Take FOGC, and backcolors into account im GS outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d777c040958f3881d065123b73a7abcf422809a7", + "description": "r600/sfn: Add support for viewport index output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "607d7fb587b4c479d4af3b5e2abd8c588bffce53", + "description": "r600/sfn: Make 3vec loads skip possible moves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac2c3fb010c00716444e3d7b4381b29afbc92cd4", + "description": "r600/sfn: Fix handling of output register index", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9db5536643acff1dac81bbd3cae97a66228b0947", + "description": "r600/sfn: Make allocate_reserved_registers forward to a virtual function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "041df7949656dd691b1e1484ba6c3c1a47b02285", + "description": "r600/sfn: Fix RAT instruction assembly emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8977946aa2aba6652e847fdfdb499c7da3a25b42", + "description": "r600/sfn: Fix GDS assembly emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6eb19dd633c415761a5f2310be4be354a0ba0bd", + "description": "r600/sfn: Fix RING instruction assembly emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e475eae0fe8a2e81c00c1d6871740d2f0d13400f", + "description": "r600/sfn: Fix memring print output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "13bb0a97012bb017c5d754d0c55ed0d1d513b264", + "description": "r600/sfn: skip copying LOD if the target register is is the same", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19673ce47dfdacce81ad0095efc7eb6960aed531", + "description": "r600/sfn: re-use an allocated register in lookup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfb0ba82727aeaa8c5fafe6dfce0c926e41202cf", + "description": "r600/sfn: Skip move instructions if they are only ssa and without modifiers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aed9618e20a8314185b7d305b2309a63a3870c66", + "description": "r600/sfn: rework getting a vector and uniforms from the value pool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afd47ea83be5521c4ae50ba15e84649e1339ab9f", + "description": "r600/sfn: Handle CF index loading from non-X channel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54c3d4bd24eee4fcf595a859e17e04aed7cabf1d", + "description": "r600: Add support for loading index register from other than chan X", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3baad0361674228eefdfa239b8a4ce22dc1bb66e", + "description": "r600: Lower lerp after tgsi_to_nir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b689de3444ab053e2b81022537ae31fb2c38b82d", + "description": "r600: Lower int64 ops from TGSI-to-NIR shaders too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32305c0959dec86c24ef776209d4a92bb47e5776", + "description": "r600/sfn: Fix printing vertex fetch instruction flags", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f718ac62688b555a933c7112f656944288d04edb" + }, + { + "sha": "65d8c692bd7943b5c7b5538d074f486e94b10e08", + "description": "r600/sfn: Unify semantic name and index query and use TEXCOORD semantic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "667126cc82a78b69d61ffd99ba251edf210bc7a6", + "description": "Revert \"gallium/gallivm: fix compilation issues with llvm 11\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "e2a7436dd10df70ba14d18ab7cf8ad538f80e653" + }, + { + "sha": "2a6811f0f981c8d67d0131a0b74549b641ea2247", + "description": "Revert \"ac,radeonsi: fix compilations issues with LLVM 11\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "42b1696ef627a5bfee29911a780fa0a4dbf04610" + }, + { + "sha": "c4544f47167ab5fe170e5131ad45b20b02507bce", + "description": "nir: Consider atomic counter intrinsics when setting writes_memory", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "6a6c36e9776a5f1df2e84aead670b215712f4094" + }, + { + "sha": "ee90339cfbe960ccf59e486b845476a277cc9dc7", + "description": "llvmpipe: add gl_SampleMaskIn support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "310823eccd81ad2bb89a9ec8b8c177bc8929cefa", + "description": "gallivm/nir: add sample_mask_in support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0dac24790e7386a51f1d513762ef08ca20ed994d", + "description": "llvmpipe/fs: hook up the interpolation APIs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f71a5e25f3560292952ce7d7fe250d2b6c2f4c8", + "description": "llvmpipe: add interp instruction support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06c10fa3a50cdefd95512f0c6b9060834f9dd33c", + "description": "llvmpipe/interp: refactor out centroid calculations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1f5a23a4dfc6f3346a78e560dbeb3dcb8cca31d", + "description": "llvmpipe/interp: refactor out use of pixel center offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae5f6ddc0529fcf29e26bc0c35ffa1e6ea2eb7b0", + "description": "gallivm/nir: add an interpolation interface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53fcb30c12ebb34329086ea4f8e0165d62c61302", + "description": "llvmpipe: remove non-simple interpolation paths.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b7e03175d9ef0d73ce01896cc8b385978aeba6d", + "description": "llvmpipe/interp: fix interpolating frag pos for sample shading", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9690b7471289489f346c73d3ecb8990adbc6e50", + "description": "llvmpipe: use per-sample position not sample id for interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50987644834921ecf571d8010960688425e3fda5", + "description": "llvmpipe: don't use sample mask with 0 samples", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b11aa12253ad3f0ed68e98b320e873572e1fba32", + "description": "r600/sfn: add emit if start cayman support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4746796b825d9eb607e6d0a5132339c313010146", + "description": "r600/sfn: add callstack non-evergreen support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19273fb227d1434d28821aa5cae8ec2e7ab46d70", + "description": "r600/sfn: cayman fix int trans op2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38560e0d1d0d01826cc460f5455732cda1b227bc", + "description": "r600/sfn: fix cayman float instruction emission.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff9c95421a64a349ca70a61b1a6c9cd22198cd67", + "description": "r600/sfn: plumb the chip class into the instruction emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "164aed6c8142a995c6ac1c36ee7a16896f675163", + "description": "anv:gpu_memcpy: Emit 3DSTATE_VF_INDEXING on Gen8+", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "3d9747780bc2508f4474230a0998d9dba7b02d1e" + }, + { + "sha": "6a6c36e9776a5f1df2e84aead670b215712f4094", + "description": "intel/fs: Use writes_memory from shader_info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d89c28d314b909ebbe94203170b76b82acaa249c", + "description": "nir: Use deref intrinsics to set writes_memory when gathering info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d50069ab08a917705b40641b46594e088734c2f5", + "description": "r600: enable TEXCOORD semantic for TGSI.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68b3b5bcab95f34073fcde7fe671cb25975218b0", + "description": "ci: Switch the baremetal runner to be an x86 docker image.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8094a9ab685b67ce5bd54b3f7a2a7b22795f5420", + "description": "ci: Update versions of packages to remove from rootfses.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18fc6a95b672eaa331425538416f035786a1092d", + "description": "ci: Make the create-rootfs more resilient.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "588ea3184ceb928cbc11e99ba3a7cfae2aa61773", + "description": "ci: Make cmake toolchain file for deqp cross build setup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a65521145c0da91a4fe4c9901921745a1af178f1", + "description": "ci: Autodetect whether we need cross setup in lava_arm builds.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "188916bd060e09fbbc1b3911516cd1f49b0291fd", + "description": "ci: Move cross file generation to a shared script.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34195d69ddd03a0fbcc48831cbc5d96f32c3be82", + "description": "iris: Initialise stub iris_seqno to 0", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "fd1907efb385a6f668971e9bb93af2f64d7b8cda" + }, + { + "sha": "a6184eae3123840b1ff3d78e24454610e3013013", + "description": "freedreno/drm: handle ancient kernels", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c9e8df61dc8737f167767f2bda8a56abcae8ad5e" + }, + { + "sha": "106c2a65dbd6b523a1a68f8b0e913294edc7bbd6", + "description": "freedreno/drm: don't pass thru 'DUMP' flag on older kernels", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b2c23b1e48f043edee1a6aaa3c132c13edba032e" + }, + { + "sha": "e422f61e6eadade09fd904eef408746166fa9797", + "description": "freedreno/a3xx: fix rasterizer discard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e105068343bbaf33d555159f795b26bb2bfc429", + "description": "freedreno/fdperf: add dependency on generated headers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4504d6374dbe2aa40af519c16765457bcbf81b84", + "description": "ci: Fix TypoError error when traces in traces.yml is an empty list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e85dc9a240601a201ec662f1a17bbdbea2965bfd", + "description": "ci: Split test_tracie_skips_traces_without_checksum in separate cases", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "550a4f77648f68bc48df34ce9d39dc3df1461f13", + "description": "ci: Migrate tracie tests done in shell script to pytest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37621da7b144a6021a8e3962352ad3561e82f560", + "description": "ci: ArgumentParser receives the args from the main parameters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb1f22fb014384bec96fa2c2b08e09a2cb29cddd", + "description": "ci: TRACES_DB_PATH and RESULTS_PATH defined as relative paths", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78c46c2126109dc5c1cb2ff315aecdd69b93d1f5", + "description": "etnaviv: don't expose timer queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5accb3ff997c073f9a9418c97ddd2365f81504f", + "description": "freedreno/a3xx: parameterize ubo optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "475fb28377fa4f5293c1a0853f2c4260418aea7f", + "description": "freedreno: fix off-by-one in assertions checking for const sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c05e16666c78a099fd8c732fc6156a85950ee6a", + "description": "freedreno/a3xx: fix const footprint", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9048adbd246c725ae80942b6b56a742ad02295dc", + "description": "freedreno/ir3: avoid applying (sat) on bary.f", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d86892ea3cd6239a9faae825bb657292a7ba848", + "description": "freedreno/a3xx: reinstate rgb10_a2ui texture format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff4df32fae58d944e309ffd8e3fd05669a8cc2c9", + "description": "freedreno/a3xx: there's no r8i/ui rb format, only rg8i/rg8ui", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78615dcca19699737bf9cbc784b66d9ea966bcc0", + "description": "radv: use util_float_to_half_rtz", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "8b8af6d398a94cb07015c695fdfdb5c157aa72cf" + }, + { + "sha": "632a921bd0dd40ad3167a0b8a74fe7054a78256a", + "description": "lima/ppir: optimize tex loads with single successor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4b7699d849d28adfd057136812b00a6b4838ed0", + "description": "lima/ppir: rework tex lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92611e21c19b35d1e5c9091614b2659d789558e2", + "description": "lima/ppir: improve handling for successors in other blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "96c1d5f629b3e45958e5ee41d7d8b34e52ae247d", + "description": "lima/ppir: handle failures on all ppir_emit_cf_list paths", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa3549c92b240aab23bd5838f1fdca2ae4caf055", + "description": "util/rand_xor: extend the urandom path to all non-Windows platforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d76abe98cf15226f25d93e76e383715061ada6f4", + "description": "util/rand_xor: fallback Linux to time-based instead of fixed seed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0ce684aae83bd6c8129cac09dc98823d786b798", + "description": "util/rand_xor: drop unused header", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f50f26325f8df7e076a0ffd2196eab1c36ff07ae", + "description": "util/rand_xor: make it clear that {,s_}rand_xorshift128plus take *exactly 2* uint64_t", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "576bff5c73ff217b15900982640dbf8f184569d5", + "description": "gitlab-ci: exclude scripts that don't affect the build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "494b7ef0c1a440c57f5a6a8a301fba4f7e551417", + "description": "gallium/auxiliary/vl: Fix compute shader scaling for non-square pixels", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd6a5e112aa1fd09bc01f4cf3ca211ad73cc404e", + "description": "gallium/u_threaded: execute transfer_unmap with THREAD_SAFE directly", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7f22e0fd29369f478da1d36520049f001cd698d1" + }, + { + "sha": "c9ccceff1092049b081db88e53db8335a0a64951", + "description": "radeonsi: test uncached clear/copy buffer performance with compute shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5acf99e81f7ad3c93426e62d6df98d64d73d80b3", + "description": "radeonsi: compute perf tests - don't test 1 wave/SA limit, test no limit first", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c45a2145f5eaae53d48e3aa7e0f10f2c70f267b0", + "description": "radeonsi: disable the L2 cache for CPU read mappings of buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7356144fe42939ecbc01d2066ca6ea5d0f9351a7", + "description": "radeonsi: disable the L2 cache for most CPU mappings of textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36c01248049abba6d79b5ff1ddfe38306e0ddbff", + "description": "winsys/amdgpu: add RADEON_FLAG_UNCACHED for faster blits over PCIe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cbbc18bc678a826508ed0b904ab485c44b7348c8", + "description": "radeonsi: use display_dcc_offset for setting displayable_dcc_cb_mask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5ac9d18d8f30893aff1e141f89085dab557b7df", + "description": "radeonsi: use vi_dcc_enabled instead of using tex->surface.dcc_offset directly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c4c1b049966a2cc4619fec796ea244f53051a6c", + "description": "radeonsi: rename SI_RESOURCE_FLAG_TRANSFER to FORCE_LINEAR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4907bb44c3c9bdeac0077ce8d0a533d3ef685920", + "description": "radeonsi: simplify setting resource usage for si_init_temp_resource_from_box", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f57276309b14be8a61e1913ab00dd6e4b1e5e8ce", + "description": "radeonsi: tweak clear/copy_buffer limits when to use compute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b158b117e1ef69d47724f607fb5bd28389148fac", + "description": "radeonsi: optimize access pattern for compute blits with linear textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f8089139f1be6f30628fad033d87fdb8c804f80", + "description": "radeonsi: use correct clear value size for EQAA in expand_fmask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2361e8e72278cfe256f80946516be7a48534e6d5", + "description": "ac/nir: honor ACCESS_STREAM_CACHE_POLICY for L1 and L0 caches too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d573d1d82524b8a2e5f56938069cabc0f0176a0e", + "description": "radeonsi: Use TRUNC_COORD on samplers", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65c2362e88578575899bd208713d87b1206ad360", + "description": "iris: Use modfiy disables for 3DSTATE_WM_DEPTH_STENCIL command", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "864d8acbfdb5df17c5495b87ceba7c009f65988b", + "description": "radeon: Fix whitespaces", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f80d653d701f51f00f88601707747554c9a7af1c", + "description": "radeonsi: don't expose 16xAA on chips with 1 RB due to an occlusion query issue", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "844d561c580188dad583dd4bad3b77d55e39372f", + "description": "spirv: handle OpCopyObject correctly with any types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d1821adf0bc51958becf116d6df5c65514d58b6", + "description": "etnaviv: retarget transfer to render resource when necessary", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b9627765303356328c409cd59bef43d15f4eafcf" + }, + { + "sha": "bb3545a6ee419c4802ac4153eb690a93dc2f339d", + "description": "intel: Store the aperture size in devinfo.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a887ad7c84e14fdad7907037a39e9fee9d504bf3", + "description": "st/nine: Handle full pSourceRect better", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbb08255708b9005b5bb719a94ebd93194f51861", + "description": "st/nine: Ignore pDirtyRegion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c474dde282aa7b02513097b58435a470eee23f9", + "description": "st/nine: Improve pDestRect handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffed34113b652a59e6d6a9d9e212a3eac72dd216", + "description": "st/nine: Retry allocations after freeing some space", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d771e0cc60a0aa853c5e1e0df304f970a47ab774", + "description": "st/nine: Increase available GPU memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4cf13691beb4237a8a449dd317561066df492842", + "description": "st/nine: Add missing NULL checks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "725ebc4657d27349ed586e5c178d5b1a971587cd", + "description": "st/nine: Fix a crash if the state is not initialized", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0222c550c726f0b80d083681d749e2891b11a318", + "description": "st/nine: Fix uninitialized variable in BEM()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d904d27491aab489d54746bc6650b27c5927c39", + "description": "st/nine: Improve return error code in CheckDeviceFormat", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54a7a6908597e016912609db16532cc8eba16864", + "description": "st/nine: Pass more adapter formats for CheckDepthStencilMatch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edff31c0d902cd83495a1e49f306d8f031cf118b", + "description": "st/nine: Do not return invalidcall on getrenderstate", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c61b4db7d6f9ce2638bdd35c8dcac70ea950e4e", + "description": "st/nine: Return error when setting invalid depth buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c0f21cbaa124fa32509a8d9c14120fc22f0c8c63", + "description": "st/nine: Add checks for pure device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09ac0350fdc97a746342c63deb4e9b626959d47a", + "description": "zink: implement i2b1", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7f6a491eec02d6c141e2b4849a3ba924926a482a" + }, + { + "sha": "4c212a1168de9ffc83a7b8e8751ea7cf3dca5c4a", + "description": "util/rand_xor: use getrandom() when available", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf2b285c5592e5d8fce24ab6a34eaa9c168aa129", + "description": "zink: mark depth-component cube-maps as done", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea62c237031a05e82147bca2699269bf2d80fa05", + "description": "nir: Use 8-bit types for most info fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57e796a12a883f5845c8e0ed05ec6bdb6d055b53", + "description": "radv: Implement VK_EXT_custom_border_color", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e3c6a7ba769d358dfffa690751cac6ca7f9c5df", + "description": "radv: Provide a better error for permission issues with priorities.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7458f19e1de7d40ff8aa72b6a141f24d33451c6", + "description": "freedreno/uuid: Generate meaningful device and driver UUID", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9623debf48ae7dbea120389eae40d784d22eee24", + "description": "freedreno: Centralize UUID generation into new files freedreno_uuid.c/h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cdfede7336b6ef99aa60d955f7a173ea945602d4", + "description": "aco: split operations that use a swap's definition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f293d02dc44e631a888073a884648543feecbe51", + "description": "tu: Advertise COLOR_ATTACHMENT_BLEND_BIT for blendable formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "adbdab3ee80017a4939e9cb586ea85f8c4e7efc9", + "description": "tu: Implement dual-src blending", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "078aa9df8daff60e52a66d8f8062dce135b94ec1", + "description": "tu: Move RENDER_COMPONENTS setting to pipeline state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a9d12d5133639946b624a1ad367ea3f9543a8fe", + "description": "ir3: Fixup dual-source blending slot", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e0580550e1b4846d3ad7ff738f57063b05089c9", + "description": "freedreno/a6xx: Document dual-src blending enable bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4aeaef99c003f3c75279d9b400315685ebbba30d", + "description": "Revert \"nir/validate: validate the stride for deref_ptr_as_array\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "667e14e7bd759a77e732c4de09fb978ee3816eaf" + }, + { + "sha": "2c6599d6d6dc908374a77d2f315d9a3e235e3656", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "212ee624f81bdbf7acb17c1b343a3500130be5d3", + "description": "docs/relnotes Add sha256 sums to 20.0.7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5e9a0dfd7e7ef7c47108ddd4d6e5c7aff4fb7d6", + "description": "docs: Add release notes for 20.0.7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ceae09da156309327d7ba6f4a59d3a2e9b8837d9", + "description": "intel: Silence unused parameter warning in __intel_log_use_args", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4cb2330e56568ae705e09e377ccc501051b2514e", + "description": "anv: Silence unused parameter warning in anv_image_get_clear_color_addr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b44eb50f2d125420c2c78d45e112bc3c2ee1472c", + "description": "anv/tests: Silence unused parameter warnings in main", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4638cfdad3199bd97cf8ca7070008186bff456a", + "description": "anv/tests: Don't rely on assert or changing NDEBUG in tests", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f7c56475d25138234ab0bb28a10df9000de594f9" + }, + { + "sha": "66e3c74f9c7ffdd7687d4304238b50c3ba35ef47", + "description": "aco: fix WQM coalescing", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a5fc96b533418dc2d68f17f3f19ac5f82d59b978" + }, + { + "sha": "4151bddab5d3dc082ac689e4d3a96f42fa4718ec", + "description": "anv: Fix descriptor set clean-up on BO allocation failure", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "682c81bdfb7ea28efccea1e8cbfeb7cfc67d02b8" + }, + { + "sha": "3f74c6a8815dcc5ff7f56993cc88f9e21aa81d14", + "description": "anv: Call vk_object_base_finish for image views", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "682c81bdfb7ea28efccea1e8cbfeb7cfc67d02b8" + }, + { + "sha": "ed95f69dd54c907879b90e9a2d1ddb7f56c717e4", + "description": "zink: correct PIPE_SHADER_CAP_MAX_SHADER_IMAGES", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50ebe5a991a45de6fb04b1a7edd956a530cf5d12", + "description": "zink: do not expose real value for PIPE_CAP_MAX_VIEWPORTS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "adc633627349d44c3e5495c442a16b0129c00989", + "description": "meta: Remove support for multisample blits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb28ce79882eb6ecad7488f8827a0fdf05de1828", + "description": "meta: Coalesce the GLSL and FF paths in meta_clear", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5be77851907ef4401596c88916b682a311449b1f", + "description": "meta: Use same vertex coordinates for GLSL and FF clears", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5d2fbf3528ed96f6b0afc953232983b8753b03a", + "description": "meta: Stop frobbing MatrixMode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29f10ede71ffe8352bdfda154f3994542094bcfb", + "description": "mesa: Add function to calculate an orthographic projection", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c731f2ab63d001d47995e3f5e0e8f5c74d5a2e55", + "description": "mesa: Add matrix utility functions to load matrices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5a8d0319b8fe14bb9f970fdca1decf6dbc9e603", + "description": "meta: Remove support for clearing integer buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5d2c40fb9c841ab28cc9ab6eebaaac44afa6e4e", + "description": "meta: Make _mesa_meta_setup_sampler static", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27c2082a4238eeece5fb36d948ae96301d5ae506", + "description": "meta: Make _mesa_meta_texture_object_from_renderbuffer static", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "067cb2f165c3363957aa4206461829a8e53abd32", + "description": "i965: Assert that blorp always handles color blits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "667e14e7bd759a77e732c4de09fb978ee3816eaf", + "description": "nir/validate: validate the stride for deref_ptr_as_array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7afc9632a6d03ed8d23fbab08b564da594b9cfd6", + "description": "nir/deref: copy ptr_stride when rematerializing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1b69d101a60974c105df8519890e3f90fd44911", + "description": "targets/opencl: fix build against LLVM>=10 with Polly support", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1151cd2ffc821a09130f87c2a266b2bfe7b0822", + "description": "freedreno: Avoid duplicate BO relocs in FD_RINGBUFFER_OBJECTs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6fe0799faf73970ac76e26bac2f7b38195fe1e1", + "description": "freedreno: Fix resource layout dump loop.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2eb180db94830a70e21770e3a972efc55ee8385a", + "description": "zink: disable vkCmdResolveImage when respecting render-condition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06b6c687e2aba075e9fc3812d80c128bd873bbce", + "description": "anv: Fix deadlock in anv_timelines_wait", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "34f32a6d6648073e2fda3fb78377124fb32bb288" + }, + { + "sha": "c059b2270724b2c02416d7d7d45e23a5e994cda5", + "description": "gitlab-ci: Install g++-mingw-w64-x86-64-win32 instead of mingw-w64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dcbb189bbeed39ab03b65fa821dec91cbb999f84", + "description": "gitlab-ci: Move lib{drm,pciaccess}-dev cross packages out of loop", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da3aee926387cb9113b79c5376a6a8f4295582ac", + "description": "gitlab-ci: Install WINE from Debian testing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd9b445145851dcc3d697f3df46f86f86db466c0", + "description": "gitlab-ci: Add Debian testing repository for x86_build image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2773d706707b14f3285f337cfeb5e37b8a60385", + "description": "gitlab-ci: Move down container_pre_build.sh invocation in x86_build.sh", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c79ac10690d1a2e44ea30a05542d0ea3f246a56", + "description": "gitlab-ci: Update to current templates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc472a2a7c6c99eb228ef138acadc16337a48ede", + "description": "zink. Changed sampler default name.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1f023307768570a621c3050222bd32c546d72a9", + "description": "radv: enable shaderResourceMinLod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d63a1a84d409d08fd6c6f8a0c569b2620d6a600", + "description": "ac/llvm: add support for texturing with clamped LOD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aaf5706aa32208b456df6b7f53b281baf24c6f3c", + "description": "aco: add support for texturing with clamped LOD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47a769143be8f81e01a70467d51142d36597847f", + "description": "aco: remove useless check for nir_tex_src_bias", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f46a3191ff31a89d6969a67f8b50dbedf085dca", + "description": "CI: Windows: Build LLVM and llvmpipe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69ffbcb16244fc4f1161dd4082eb93b7a80232e5", + "description": "llvmpipe: Expect increased exp precision on Windows", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf21b763832abc5739fc46eb0d30440587015840", + "description": "freedreno/ir3: use lower_wrmasks pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42d38ad02816d72e8c3ef1d15dabb28ee9eb2f88", + "description": "nir: add pass to lower disjoint wrmask's", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a506d49faecf06d5cd2fd2d049319d907b273b83", + "description": "nir: add helper to copy const_index[]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d3cfea78b799af71012dcd6b84cc38a9d172e05", + "description": "nir: fix indices for ir3 ssbo_atomic intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea6b404294a0d6d83360bd0c5c8ab4e6f44c3555", + "description": "freedreno/ir3: use const_index accessors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14969aab11effa1500f114314c9b8879821b8b24", + "description": "freedreno/ir3: Drop wrmask for ir3 local and global store intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4627bfcd69544780e30c069b77967cfb92c9d7e0", + "description": "nir: Add some docs to the metadata types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3111cee2f627d7e681e1695e1e4b1b5b126d5c7d", + "description": "freedreno: Fix attempts to push UBO contents past the constlen on pre-a6xx.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7336caa52d90dac3e7cc82259e223e933ca95ce0", + "description": "docs: update calendar for 20.1.0-rc3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0dd24b381c52e0b20fe70041e95611aad6b0d258", + "description": "panfrost: Fix background showing when using discard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15dd7933bc33bb13d146f9e0a1f79092e749f33b", + "description": "anv: Translate relative timeout to absolute when calling anv_timelines_wait", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "34f32a6d6648073e2fda3fb78377124fb32bb288" + }, + { + "sha": "0b5288492b90c9a5471152393df31691271f6a55", + "description": "anv: Set MOCS in 3DSTATE_CONSTANT_* on Gen9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3d8edf3e08988b19c6861040b9ed4afa8ca5ec2", + "description": "anv: Set 3DSTATE_VF_INSTANCING on the SVGS element", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "723208988e12f2f3055360ffe8d9bd0b8414171a", + "description": "freedreno: Drop the noubo fails list for CI, since there aren't any now.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "112c65825fddd00a2136c75e09982e1878c944a4", + "description": "freedreno/a6xx: Use LDC for UBO loads.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab93a631b4c6c2c8d682d90649df69d3016e4722", + "description": "freedreno: Trim num_ubos to just the ones we haven't lowered to constbuf.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d5176c453e5fd74f6999d09e551bcbc771845e8f", + "description": "freedreno/ir3: Move i/o offset lowering after analyze_ubo_ranges.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5387c271409dbfb12292feec2dcd61399bbb1a99", + "description": "freedreno/ir3: Leave the cursor alone during ir3_nir_try_propagate_bit_shift.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0a4d1c4e53f770d1e4f80f112e3d36b9f09a7c4", + "description": "freedreno/ir3: Clean up a silly nir_src_for_ssa(src.ssa).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2a0cde390001e70e7f9696f4ac7cbca9616cbdf", + "description": "nir: Include num_ubos in the printed shader (if nonzero).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "492d664be06ca538d4e5c1e380ab7966714a12a9", + "description": "util/ra: Add [de]serialization support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38e68db778439c34ebb876273fcf9139764abc80", + "description": "util/vma: Add a debug print helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "adbcef37d2d1f838ef24a4ab1f4332cc87b5fdad", + "description": "util/vma: Add an option to configure high/low preference", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f40f8f623a021d791886ca8e1a159e34cfcb4bfb", + "description": "util/list: Add list_foreach_entry_from_safe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aeb95fda541719d5fe9b8fdb531f3370228228d7", + "description": "util/list: Add a list pair iterator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5425968d2e46eb3311a75f4bf7bedb1d9dd59459", + "description": "anv: Implement VK_EXT_custom_border_color", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b07f142d7fae956aea55082d4b3d8e5a3d3cfb8", + "description": "anv: Add a way to reserve states from a pool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32d631dcd250bdfa0c8089921b50544988ee8f8b", + "description": "anv: Disable B5G6R5_UNORM_PACK16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ae0762f5c05284cda768fff9334eee28f3d0355", + "description": "anv: use the correct format on Android", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4cf702c3326264c616c112280fe8947bfcc5eb11", + "description": "drirc: Enable glthread for PCSX2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "445e559e35ae3151d7587f61310ec0fc25c90982", + "description": "post_version.py: stop adding release candidates to the index and relnotes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae26149e2e9ccac54d64ded9a9c0a8ae84cfd300", + "description": "post_version.py: invert `is_point` into `is_first_release` to make its purpose clearer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5fba85bcb8bcb722e0bbaa54163d8a39d6fd067d", + "description": "post_version.py: fix branch name construction for release candidates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64c7363f7efb6c5bf39cf4443245a82fc9904fb8", + "description": "glthread: stop using GLenum16 to get correct GL errors for out-of-bounds enums", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1152af2edacb93126b66e19399d971036fc02d79", + "description": "radeonsi: also enable tgsi_to_nir caching for compute shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45e69e7d118ad0297ee924125fe04e09cfd9251e", + "description": "radeonsi: Enable tgsi to nir disk cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f83f538881da04bc2a5dca2741c69b50a8880b40", + "description": "st/nine: Enable ttn cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4db880d8057bac3209c196edc94c6b1e521a782a", + "description": "ttn: Implement disk cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "522bd414f343c7a132fee17d0d6b755b9ec6766c", + "description": "ttn: Add new allow_disk_cache parameter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6670475a449ed5166f5de997c820da16a675a6de", + "description": "freedreno/a6xx: Fix UBWC mipmapping height alignment.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81f21ff4ef8b5ffb770d0ff3516338ffdfce3e99", + "description": "freedreno/a6xx: Fix UBWC mipmap sizing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5db2a257413e2c570b49b8c2171166592f3e093", + "description": "freedreno/a6xx: Fix UBWC blockheight for RG8.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9da4ce99538179108dc7694affc68a0e081404db", + "description": "freedreno: Pull the tile_alignment lookup for a layout to a helper.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc7ccdb3f526c2e43aa48a3f5d43fea948012371", + "description": "freedreno/a6xx: Add a testcase for UBWC buffer sharing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e32783c644fa7230c4de07f1062cfc900a7e0e9a", + "description": "freedreno/a6xx: Improve layout testcase logging for UBWC fails.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e4ddb6353d1fea70d4744e7ea70461dd36214b0", + "description": "freedreno/a4xx+: Increase max texture size to 16384.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f7d1541df8a9ea040f893fc6267a3d4ea6ebf26", + "description": "nir: reset ssa-defs as non-divergent during divergence analysis instead of upfront", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b881f3d8e4c367049aeb376602525559ef09ed5", + "description": "nir: simplify phi handling in divergence analysis", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "450b1d87ba6bb41056f2ae8c576f98d6a70fa2e4", + "description": "nir: rework phi handling in divergence analysis", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "febef2245965efb50e283b16d2a927fcaebd8ffc", + "description": "nir: refactor divergence analysis state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9ea0ca6ee6c36fe26a559e2a4d2fcda78a0fda3", + "description": "nir: add nir_intrinsic_elect to divergence analysis", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca2d53f451e527b7227f073aeeaeece00d3787c8", + "description": "nir: Make \"divergent\" a property of an SSA value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db94a2d03d3d888da2091dc20048bc178e243966", + "description": "gallium: remove more \"state tracker\" occurences", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74800697032569bde5d4f87b9e2dbbcee4c9d922", + "description": "gallium: rename PIPE_RESOURCE_FLAG_ST_PRIV to FRONTEND_PRIV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c9b9aac7d09e65195dca6681d59c10e4ef713d9", + "description": "gallium: change comments to remove 'state tracker'", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6287a94b697ffe12a4e576a38943cdf4e90cdb0", + "description": "gallium: rename 'state tracker' to 'frontend'", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b408734e5e2fe1e1ef08080c4425ad8a7ed33579", + "description": "tu: Implement fallback linear staging blit for CopyImage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40e842c009699a3e8b7ffff2f75b3070df41c752", + "description": "tu: Add noubwc debug flag to disable UBWC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed79f805faf1ac5919a30d3284e37cc3f394e464", + "description": "tu: Add a \"scratch bo\" allocation mechanism", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ce527a4fed1706aed9ced8e5d3432cc5abfbbd3", + "description": "aco: improve phi affinities with p_split_vector", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51e797e233212be6e78d354ed953f616044ad7df", + "description": "aco: consider affinities when creating v_mac_f32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "138eed45b5362c61b27544d695130bab580c879d", + "description": "aco: mark phi definitions as last-seen phi operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1c0cf7a66905e8d7ad506842a41b0ad0c5b10da", + "description": "aco: fix consecutively written vgprs from vmem instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c7bed72f7948d51a2109f181e7a2d3c77dbd19e", + "description": "aco: simplify consecutive ordered vmem/lds writes optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6beb051aff4f11784cb974a1dc3647e74f2389c", + "description": "gitlab-ci: correct tracie behavior with replay errors", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "efbbf8bb81e97a2b2d2e6e018750ef36cd460676" + }, + { + "sha": "8546d1dd789b58bd0aff5ca0a231efb35c09c1ac", + "description": "gitlab-ci: create always the \"results\" directory with tracie", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "efbbf8bb81e97a2b2d2e6e018750ef36cd460676" + }, + { + "sha": "1ef03dade12b4d5056c3fe5637f9dfd98a42aae6", + "description": "radv: add a LLVM version string workaround for SotTR and ACO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "91c757b7963f458f678226f9637f04a490085405", + "description": "turnip: use the common code for generating extensions and dispatch tables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ddfae50b67e20895c908c9c0721d0b4cfdb18d20", + "description": "anv: use the common code for generating extensions and dispatch tables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "857051c5c63e238f606652acb1e1f9610de68758", + "description": "radv: use the common code for generating extensions and dispatch tables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bee8a57942787b4a7a2ca5c375d911288ee16cbe", + "description": "vulkan: import common code for generating extensions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b1138e3f0e960119a46dc08794132719c93173e", + "description": "radv: implement VK_EXT_private_data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "178adfa6a8e5694a077c3e11fdcc5bc0802839ab", + "description": "radv: use the base object struct types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65458528fc169ab80c99cb115bc6f86ae7b43d12", + "description": "radv: use the common base object type for VkDevice", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b535ac61b92cdf5f1e5adca3f2a4a43e7384e1a", + "description": "etnaviv: Disable seamless cube map on GC880", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8dd26fa2f065e78f3204357d8b474656b9ea81db" + }, + { + "sha": "f079c00ffc1c9e85321955f679e656196f724848", + "description": "freedreno/a6xx: fix max-scissor opt", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6706fdc46fc895b3b3d4446af3d4a9ab1e700c0", + "description": "freedreno/ir3/sched: try to avoid syncs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d95a6e3a0ca2d4a420306dd078cea05d3f21c865", + "description": "freedreno/ir3/sched: avoid scheduling outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "488cf208d5d90b0f3b3c346e0abb92e71597202f", + "description": "freedreno/ir3/postsched: try to avoid (sy) syncs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25f4fb346e1fad34ce1f2e9e39b062a303db4ce3", + "description": "freedreno/ir3/postsched: reset sfu_delay on sync", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f351e1d137603213b5daacece5ff67ad0786d982", + "description": "freedreno/ir3: limit # of tex prefetch by shader size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d69f6fd8529b1dcefa443a8cb31bd362bb64a28c", + "description": "freedreno/ir3: fix indirect cb0 load_ubo lowering", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "fc850080ee304c2a62f7313c4b7ebe121c3ebb53" + }, + { + "sha": "c4dc877cb5df63a6a86013e32695f72c604625ad", + "description": "freedreno/ir3: don't allow negative const_offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d8ba7fb44f8130f7ded5459dbac5484cc9202a9", + "description": "panfrost: Run dEQP-GLES3.functional.shaders.derivate.* on CI", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7bd021c700961b6eb3f9ee5b7777f4c7e1bab45", + "description": "pan/mdg: Fix derivative swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bac29316b06a5bbee396c70c53ed0640af67c064", + "description": "pan/mdg: Set types for derivatives", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69e4d4fabe728e49b22ddcff49153deb3bc43e94", + "description": "pan/mdg: Remove texture_op_count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "344dd914972f9ddcef814a3cf08f832708c33146", + "description": "pan/mdg: Use analysis to set .cont/.last flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a7f0e268b7baeb92d8d7d2989beb43550262406", + "description": "pan/mdg: Use the helper invo analyze passes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d429187bf3988fca190fcbd53e416b8a46506b25", + "description": "pan/mdg: Analyze helper execution requirements", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3228b3106a672e79093f2186f3e040a7579cd7b4", + "description": "pan/mdg: Analyze helper invocation termination", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0da03c68ae3e16a339e41b967fcb689666f02296", + "description": "pan/mdg: Explain helper invocations dataflow theory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95fd950d35717dda29cb7876ac2f2b7852eff1c4", + "description": "intel/compiler: fix alignment assert in nir_emit_intrinsic", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c6439792287f11f25cb2b62d699f52daefe54a44" + }, + { + "sha": "a663c595bc19d627d2fd28081412306b91554d96", + "description": "freedreno: Skip taking the lock for resource usage if it's already flagged.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "356f99161df36223091cf9721dd49e52cb9e5e3e", + "description": "freedreno: Move the resource_read early out to an inline.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d393837332a07f53b9622ca55149e63947e0f937", + "description": "freedreno: Add an early out for preparing to read a resource.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e424bcdfcef19682f9b651f7c1a04e32f18be5c", + "description": "freedreno: Split the fd_batch_resource_used by read vs write.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fdcadf611e03f0dc8878fb032b62510c38fe069b", + "description": "freedreno: Add a nohw flag to skip submitting to the kernel.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a43e9740646a42e3c2d914ad7e0eb20f4113d153", + "description": "turnip: Execute ir3_nir_lower_gs pass again", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1bd38746d5a2d21a2f77e2bf1c6fec17ca5eb6ac", + "description": "freedreno/gmem: rework gmem layout algo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c46f46befe67f2e64f1492e5b46974e277ec6ac7", + "description": "freedreno/gmem: relax alignment on a6xx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad6e06621b15908d5f3f4c63ce4d84612e5a761c", + "description": "freedreno: add gmemtool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef5f238fd08bb470e0f6327ec76723d37704f13f", + "description": "freedreno/gmem: add helper to dump GMEM layout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a49d9c396b61ef2556afba59c495c45bfab0202", + "description": "freedreno/gmem: add div_align() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "96b5a70f45612642265d7192e04e90206a4c260f", + "description": "freedreno: initialize max_scissor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1387e778018527f1ea85f616130a2b0d59dcff19", + "description": "freedreno/gmem: don't assume scissor opt when estimating # of bins", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c87618d357a4f75a4f47b2638c8f89939fd6c61", + "description": "vulkan: Handle vkGet/SetPrivateDataEXT on Android swapchains", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51c6bc13ce3a70b1948ce5c4edbc1d29c5460986", + "description": "anv,vulkan: Implement VK_EXT_private_data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d76e722ed63607ecead2c66ef9f3a37a12b62bab", + "description": "turnip: enable tiling for compressed formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f543d87f23555912110e2c0f67a57dcb99d94e08", + "description": "turnip: update \"fetchsize\" value to match fdl6_layout changes", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a34b3fa198a4f87f8e07c718ec2f2e07927c6d7d" + }, + { + "sha": "f789c5975c83f12216cf915d9a791e654b3c9e15", + "description": "freedreno: Fix non-constbuf-upload UBO block indices and count.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4553fc66a5f23607c2e872d8ac8755c747bd0bd2", + "description": "nir: Fix count when we didn't lower load_uniforms but did shift load_ubos.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4777ee1a62f0620efa2a105215eb589fc44dfa0f" + }, + { + "sha": "0f2e44d55b01b3637fb96ce18840b8ab9250d508", + "description": "freedreno: Drop the \"write\" arg to emit_const_bo now relocs don't care.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51d7a71bd4f086b42340b0d601d2c3ff4bc7de37", + "description": "freedreno: Replace OUT_RELOCW with OUT_RELOC.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "064f395a890158ea5502f685ef6c22e504e3f0df", + "description": "freedreno: Tell the kernel that all BOs are for writing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2c23b1e48f043edee1a6aaa3c132c13edba032e", + "description": "freedreno: Mark all ringbuffer BOs as to be dumped on crash.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "554b959df0d5b1117fb42d3d7d1e715ea318f079", + "description": "freedreno: Replace OUT_RELOCD with permanently flagging shader BOs for it.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d8d936dfcdab52361b9824cdd1f3ddb41486145", + "description": "freedreno: Start moving relocs flags into the BOs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4235624b6a1f1858a8f200c03b6492ed1b9f21ec", + "description": "aco: optimize add/sub(a, cndmask(b, 0, 1, cond)) -> addc/subbrev_co(0, a, b)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5fc96b533418dc2d68f17f3f19ac5f82d59b978", + "description": "aco: coalesce parallelcopies during register allocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38cc649fcb54baf87a974ca2dc29d92b50c86cfa", + "description": "glthread: Fix use of alloca() without #include \"c99_alloca.h\"", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2840bc3065b9e991b2c5880a2ee02e2458a758c4" + }, + { + "sha": "dc6c42dc77e4bf0cb07037e038c4a50afa5a8143", + "description": "etnaviv: generalize FE stall before loading shader and sampler states", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e5fc97be63e4c4296f16fe34523b68d39776fad", + "description": "CI: Re-enable Panfrost T7x0 jobs", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "696bafac40f5f15ae140a2e844fb1b31d4918ebe" + }, + { + "sha": "8c6350d2bba852fccbda5e0534bdbb085882b3be", + "description": "radv: update the list of allowed Android extensions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "021270cb3170ef38244d21cf3fe8780a3ef5fb3e", + "description": "radv: handle different Vulkan API versions correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69430921fc123b9016d5bf1779c0ab0ed4d95931", + "description": "radv: limit the Vulkan version to 1.1 for Android", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7f5462e349a3f082e2944181cd610b1250d711cd" + }, + { + "sha": "50eabb7035fe361d870f504b73c15962ddf0b67e", + "description": "r600: Fix nir compiler options, i.e. don't lower IO to temps for TESS", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "6747a984f59ea9a2dd74b98d59cb8fdb028969ae" + }, + { + "sha": "f7fcbe9830d160c9610fe641ed0202397ed9c31e", + "description": "v3d/tex: use TMUSLOD register if possible", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3af695bb0bae8aea119a2d05983acd57366b0fb", + "description": "v3d/tex: set up default values for Configuration Parameter 1 if possible", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50c2c76ea31edf987594e8b811b7d62be71f5a33", + "description": "v3d/tex: only look up the 2nd texture gather offset for 1d non-arrays", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad8c5bba0a44774477a1b26523a14e8679521fc3", + "description": "drirc: Enable glthread for rpcs3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1290e79489fb645fc73ebb365b684b7797f97b2", + "description": "pan/midgard: Fix old style shadows", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47bfc799da61aadd60ef9cc5c4bf0651c519cc77", + "description": "gallium/util: Fix leak in the live shader cache", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4bb919b0b8b4ed6f6a7049c3f8d294b74b50e198" + }, + { + "sha": "412e29c277774391c461d0f62c47e54d2f17814a", + "description": "nir/algebraic: Eliminate useless extract before unpack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc0bbb8f0b45049e597d7d585999ae4954266347", + "description": "nir/algebraic: Add some half packing optimizations for pack_half_2x16_split", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2bf41ec6527fbedc2a75a8072d7222298bca347", + "description": "nir/algebraic: Optimize ushr of pack_half, not ishr", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1f72857739beed55276f263f49b3802c336b8c58" + }, + { + "sha": "ab16bff97d75301b56530c2c9a410960e2de8bc8", + "description": "intel: Delete hardcoded devinfo->urb.size values for Gen7+ (sans DG1).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bea2a13212be10982e14617002a3ff851b84717", + "description": "egl: Limit the EGL ver for android", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c839e6394028a84c97ebd0a725ff512aeb75ce6", + "description": "amd/common: Fix incorrect use of asprintf instead of vasprintf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39d59cf87a3974142cb69dd52386d96b5e6e7dd9", + "description": "docs/features: mark GL_NV_conditional_render as done for zink", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5743fa6e709a01c5a6820320b2e87931af46e7cf", + "description": "zink: enable conditional rendering if available", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c7dea394fc21d455cd00dff3d3881276427fc47", + "description": "zink: add a GET_PROC_ADDR macro to simplify load_device_extensions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8fd70eef2cb5936e8ccb4d237e50084d6ff2a7c", + "description": "zink: load vk_GetMemoryFdKHR while creating screen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c668bdf05cf3aa6907b678c35805962ea79b258b", + "description": "radeonsi: do not use cmask with encrypted texture", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8873ea0e253b90eb91da514e36bd845d6a5f9734", + "description": "radeonsi: determine secure flag must be set for gfx IB", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92e64f4b41ed0ab0b9291d1de0b285733fd92307", + "description": "amdgpu: use AMDGPU_IB_FLAGS_SECURE when requested", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c2ab36f5346a07b14ee85703316f323a77c2530", + "description": "radeonsi: add support for PIPE_RESOURCE_FLAG_ENCRYPTED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "413d91bbcb9f4a0db811c714d1cdcd731fee50b9", + "description": "gallium: PIPE_RESOURCE_FLAG_ENCRYPTED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c58cbe84d231b602d830f4d9e26ed8d42ddc145", + "description": "radeonsi/sdma: implement tmz support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d96c26b67da5a053d57782d42cb80447e4ade1a", + "description": "radeonsi: force using staging texture when uploading to secure texture", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe2a3b804bdf4773d853974a0a51584cd5a0f9d9", + "description": "amdgpu: add encrypted slabs support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2853ed1a248d6be1f131a684cddaf4be5a2bb9b6", + "description": "radeonsi: allocate framebuffer texture as secure when using tmz", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a67b52de46e42c745af3672767d0271d5425230", + "description": "radeon: add RADEON_CREATE_ENCRYPTED flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "856a03b4c1646cae36b0c2522a3a7b71ecec6fac", + "description": "radeonsi: add AMD_DEBUG=tmz option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "977e19d5cfe02227756aa022a7471570aa17edf7", + "description": "amdgpu/radeon: add secure api", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "506f5d9bda64fc07ee1a216cb3aeef98491c6365", + "description": "ac/surface: remove shadowing declaration", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7691de0dcefd7a518ee1ecc4d2cd3803e42cc803" + }, + { + "sha": "266978f7cabe2c09e5538ef1b79efdd96b2cba00", + "description": "aco: prevent invalid loads/stores vectorization if robustness is enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04718a9cd63cea9d815bffd91495069a79db8ac5", + "description": "nir: do not vectorize load/store if offset can overflow and robustness enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fba0a7a6f01496344ddb93e774b2d4bc9195e8a", + "description": "aco: fix 64-bit trunc with negative exponents on GFX6", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56f955e4850035d915a2a87e2ebea7fa66ab5e19", + "description": "etnaviv: drm: Normalize nano seconds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "022327f7535a7d27f965af7c318005a56b44ab85", + "description": "etnaviv: drm: Use NSEC_PER_SEC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a92a483ff7feee1a903fd1f4caab9c2a035882b8", + "description": "freedreno: android: add adreno-pm4-pack.xml.h generation to android build", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ee293160d7d7341e0ec5f0aaf1ceb6950f785ed8" + }, + { + "sha": "5dc3b22dd06c74cdb06cbc6cf3b6d073b221e7f3", + "description": "freedreno/drm: android: add libfreedreno_registers static dependency", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "6c688ae81f4a6249cdccf1d218da5bebaf23e4f4" + }, + { + "sha": "e622e010fd838eb30eab46800015516703b76f4d", + "description": "lima/ppir: rework select conditions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0c58867cddcf199cf85d270b42965678ad8af10", + "description": "lima/ppir: add fallback mov option for const scheduler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c47640731303ed2607d28ce2cf19a7e8f0f4006", + "description": "lima/ppir: rework store output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "570f1420dbf361cf889c21c119816e063ecc3cea", + "description": "lima/ppir: rework emit nir to ppir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b21b771f7b4bdd60089c784ed35e33622c8932a", + "description": "lima/ppir: remove unused clone functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c4157138f331d5053036f756aefa7006bbb6fac", + "description": "lima/ppir: duplicate consts in nir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e6c3861182cbf481ce3d25d0bbbcad4916c92dd", + "description": "lima/ppir: duplicate intrinsics in nir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09003ba0700ed0ddc57d85e82320f15a2a90c8dd", + "description": "lima/ppir: combine varying loads in node_to_instr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6a3987f320fa26fa88e6d86cf236d7e426aab84", + "description": "lima/ppir: do not assume single src for pipeline outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "741aa3439d7d38c26d0baf20be506cf79c30978d", + "description": "lima/ppir: fix lod bias register codegen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cef1c73634493ef9766baa0b6a898369eff7686f", + "description": "lima/ppir: introduce liveness internal live set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e790fea7c178141b5bb1b82ab6bcf4034bfdb18", + "description": "genxml: pack: deal with default field not being simple integers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "942d4538a46c3420570ccf5c5f5677380c75caf0", + "description": "genxml: factor out utility functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d07f69413ebd1ad56e5bba86a1891bbf571ff710", + "description": "genxml: fix invalid end value for video fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af17e392b2e50c86dd9a11ee72ce119a4b0033a5", + "description": "genxml: run sorting script", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45c33313e6ee5ec85e585b512270c3f668eb149e", + "description": "intel/dev: Add device info for RKL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54996ad49273641e20dbb2d7aff382199e27cd10", + "description": "intel/dev: Split .num_subslices out of GEN12_FEATURES macro", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07b0fbea92a66499ef7c0f9b748b1034831201b1", + "description": "panfrost: don't always build bifrost_compiler", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "ec2a59cd7aa42652645e76e29a72335370c80e50" + }, + { + "sha": "727a0a53fdd4a1cf63a28c53f51d49a4164b2c89", + "description": "radeonsi: remove emacs style config file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c998c7adfd386eede37de49080a043ef1ec0e34", + "description": "intel/dump_gpu: Fix name of LD_PRELOAD in env append logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a59590e5d686a11687151d57f2fd43d366d6720", + "description": "ac/surface: fix broken pitch override on gfx8", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "441eaef6a9e5bb6d55bb3f875d60b35a5e70042b" + }, + { + "sha": "c9e8df61dc8737f167767f2bda8a56abcae8ad5e", + "description": "freedreno: Initialize the bo's iova at creation time.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3c4e6a597a45a0281cdcb8d70ec86ce8dabbb16", + "description": "freedreno: Rename append_bo() in case it doesn't get inlined.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1c74f3facce57555284d23ee11bc009928806f1", + "description": "freedreno: Clean up tests around ORing in the reloc flags.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c688ae81f4a6249cdccf1d218da5bebaf23e4f4", + "description": "freedreno: Deduplicate ringbuffer macros with computerator/fdperf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "094c7646a3ae4980f76605a922572fe2ed78f6f1", + "description": "freedreno,tu: Don't request fragcoord components not being read.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab5590e92bc36e2b785a088751c433d31989d778", + "description": "vulkan/object: Always include the type", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "32f20783a512129fc2d7dd8a11ffa8670cef0068" + }, + { + "sha": "d11e4738a86ecac6bb4cfaf5cad5c1d32169b18f", + "description": "anv/allocator: Add a start_offset to anv_state_pool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "772b15ad3227e08bb4e18932ac9ecf4c29271160", + "description": "util: Make process_test path compatible with mingw native toolchains", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f8f1413070ae079443ab31a75679cfd10cb756ed" + }, + { + "sha": "696bafac40f5f15ae140a2e844fb1b31d4918ebe", + "description": "CI: Disable Panfrost T7x0 jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78d267e6dad9dbcc9e39d7d388075d15e67ba3c3", + "description": "Linux: Change minimum priority threads from SCHED_IDLE to nice 19 SCHED_BATCH.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f66bf5ba444e95e82401d8f6fd726e5ef17b3db9", + "description": "docs/features: add zink features", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8bcfce2fcd02e9b04b7edda5c0d8a0e4b77be39c", + "description": "anv: fix alignments for uniform buffers", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a0de2e0090535bd49b70c52917e7bdab628d354a" + }, + { + "sha": "f105b69464d908ee8b54c0bddb51909ebde4d686", + "description": "radv: report correct backend IR in hang reports when ACO is used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "290d480c55199e013fdfb91e948046f23d9c704d", + "description": "radv: do not print the LLVM version string twice in hang reports", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1ef1c1211373f9336aaf87de865ef7f0c29fc44", + "description": "radv: remove the LLVM version string when ACO is used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee2aef3ea531a03dbd50a78c943ca6e7cc99c7a7", + "description": "anv: call base finish only if pass given in DestroyRenderPass", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "682c81bdfb7ea28efccea1e8cbfeb7cfc67d02b8" + }, + { + "sha": "a885ee5258241d7ec4b9288cb15955e59d526d1a", + "description": "st/wgl: allocate and resolve msaa-textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "947bb04fcc03c9c1b8a04a6b3cd7f86825e6f244", + "description": "st/wgl: pass st_context_iface into stw_st_framebuffer_present_locked", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "808eb20186a23be1a1917668e374243151e6699e", + "description": "radeonsi: Fix omitted flush when moving suballocated texture", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "5e805cc74bc52f97de8f6308fc06bc96623e7e09" + }, + { + "sha": "37e89e30276724932328edb7b8bf4909606052d1", + "description": "aco: either copy-propagate or inline create_vector operands", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2dc550202e82c5da198ad0a416a5d24dd89addd8" + }, + { + "sha": "c9e73624022a482ffe0a41d0cae007e04cc61dc4", + "description": "ac/surface: override all offsets including metadata offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "441eaef6a9e5bb6d55bb3f875d60b35a5e70042b", + "description": "amd: unify code for overriding offset and stride for imported buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c164ea86e193c710d41de769ddfb169ab53ced51", + "description": "ac/surface,radeonsi: move the set/get_umd_metadata code into ac_surface.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7691de0dcefd7a518ee1ecc4d2cd3803e42cc803", + "description": "ac/surface,radeonsi: move the set/get_bo_metadata code to ac_surface.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56e37374ddbc3b66bcfa4d0dadcb1fd53074c822", + "description": "amd: assume HTILE is always rb/pipe_aligned, remove ac_surface.u.gfx9.htile", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf61f635ff6a38aad344ebe30551eaaac6fec038", + "description": "amd: assume CMASK is always rb/pipe_aligned, remove ac_surface.u.gfx9.cmask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "127aaf0b9ac1eb46633df35c899b5d1a175a7220", + "description": "amd: remove duplicated definitions from amdgpu_drm.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25edf9b136415fc9c079ad8613b89ce261b5c351", + "description": "amd: update amdgpu_drm.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89d4b6b5c857e1efcff4d945fb83b4e77d26378d", + "description": "llvmpipe: make sample position a global array.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b6449d45317f4441eef464b415f5c65e5103dab", + "description": "nir/algebraic: Optimize some bfe patterns", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f46eabf84e6a3227ce846f6725ad16517d146dae", + "description": "nir/algebraic: Split ibfe and ubfe with two constant sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d605a8bbfd0eb84e010bb9944df53b1f7242156", + "description": "nir/algebraic: Recognize open-coded byte or word extract from bfe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58dfb38f784c70020fd57dc38a0fd6c7f0aed80d", + "description": "gallium/swr: Fix crashes in sampling code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58b66f82e648eaa9378699aadcc5862bb941483d", + "description": "panfrost: Handle MALI_RGB8_UNORM in panfrost_format_to_bifrost_blend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c3e82296c8bfc96ac956918a69f917a954c1f12", + "description": "panfrost: Don't trample on top of Bifrost-specific unions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e53cce3ba4bdbddcad5f740a915f126f2388dd6", + "description": "pan/decode: Fix flags_hi printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4d41a151095f73b4b8a7ba06bf06adc7cedebd8", + "description": "panfrost: Add checksum BOs to batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f17e9eef6f9f8a2986264b3da5157542983a1da", + "description": "anv: don't expose VK_INTEL_performance_query without kernel support", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "2b5f30b1d91b98ab27ba21439cd8a40a0d1ece36" + }, + { + "sha": "6d513eb0db25a272da65822f35907456b544f172", + "description": "tu: Support pipelines without a fragment shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ba2333cc17e7f0a1520866bcfd60a991d34295e", + "description": "util/os_memory: never use os_memory_debug.h", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "76f79db3f5d8492370c92080b5bbea7e31827b75" + }, + { + "sha": "905edc376dd1ace6ac2af0fc351606210a0141a1", + "description": "v3d: Include supported DXT formats to enable s3tc/dxt extensions", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3ecf48dda2ddabfbabdad83e19d280d0edb8246", + "description": "v3d: Fix swizzle in DXT3 and DXT5 formats", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "469bbd8387d11bbac2aae4c49765c748c2a9fd04" + }, + { + "sha": "17ed4a01ee2bdb773109cf06ea8f827737f88684", + "description": "docs/envvars: update RADV_FORCE_FAMILY", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c6afd0f349b17aaa0b2e06ccee44fa709d6285c", + "description": "docs/envvars: document ACO_DEBUG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1aaec1f3f451c60456aa4b28699fd57c7b22d021", + "description": "docs: add src/amd/ to sourcetree.html", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38bbfd3a57d68abdc88a93b436eac9f30a397b0f", + "description": "clover/nir: Check the result of spirv_to_nir", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "deb04adf2ae605a017d7ce4e81f57db679567dfa" + }, + { + "sha": "abc4a8285776dcded21d0b7f3035c9858d061611", + "description": "nir: make fsat return 0.0 with NaN instead of passing it through", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8a27c0bb3049963934c77d104db39ecf610e3b9", + "description": "compiler/spirv: flag nclamp/nmin/nmax as exact", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a11aa4ece691ac9b6d8911cac6f3727ac3d7094", + "description": "docs/features: Add ARB_clear_texture to virgl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e6bbab9aecd8e6cfb6e2e8dd408460a96365ada", + "description": "virgl: Enable CAP_CLEAR_TEXTURE if host supports it", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e705a2a9f48d3247557d468216c797dfb1758826", + "description": "virgl: implement ARB_clear_texture", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6321c4b5a038db4682e49e098e9368df7ea18c4", + "description": "r600: Fix warning regarding mixing enums and unsigned in ?: expression", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5469fcea75457542b699404b003bfb051a5fc6b0", + "description": "r600: remove some unused variables to silence warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79f20eb819e4ca4c09a5602c0d82ac905773cc66", + "description": "r600/sb: replace memset by using member initialization/assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee3f4ab2f439c1180067072cb943d2ba4a6c33b9", + "description": "r600: remove unused static functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a244778f7304d47e40a10415e06607835f18a63", + "description": "r600: Annotate some case fallthroughs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9dbca8db5e2dc857410f1ab34f85ca6dc837413", + "description": "ci: run radv-fossils with Pitcairn (GFX6) and Bonaire (GFX7) too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a44cfac502f9a740d0c21b561f270e4221cb78d7", + "description": "ci: set ACO_DEBUG=validateir,validatera global for RADV testing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5dbf862b13214cec4765811abba91111c4b0f04d", + "description": "ci: remove unused .test-radv-fossilize rule", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0de2e0090535bd49b70c52917e7bdab628d354a", + "description": "anv: increase minUniformBufferOffsetAlignment to 64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e8cdf125112934b589d9682239e46bf196bd9de1", + "description": "freedreno/a6xx: enable tiled compressed textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "193560c44bfe35655a111870773a8ff5f08ee5dd", + "description": "freedreno/a6xx: compressed blit fixes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85f2cd84ac14be7d44eb7d5deb9b9756bd7bacc2", + "description": "freedreno/a6xx: Set tfetch correctly for compressed formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a34b3fa198a4f87f8e07c718ec2f2e07927c6d7d", + "description": "freedreno/fdl: Align after dividing by block size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6292059662dccd3e151c731a3b108fd0b9e4c606", + "description": "docs: update calendar for 20.1.0-rc2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2637961d29b3828e07953159d075f8544ce3fae5", + "description": "ci: Fix the nick used in IRC reporting.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c50176dfea360775d36cbbb02027d959db6dfaa", + "description": "ci: Improve the flakes reports on IRC.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b5e71cb181bae7646abe561a779b3d2062ba0cb", + "description": "ci: Enable IRC flake reporting on freedreno baremetal boards.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7bbc211d681696687a134eb158d55fcb674907c", + "description": "ci: Clean up setup of the job-specific env vars in baremetal testing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29da52128090a1ef8ef782188c0f67c7f5ec8d19", + "description": "radeonsi: fix compilation of monolithic PS", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8832a884345686e6a8b2c0c8aa7515ad3f775b9e" + }, + { + "sha": "d5109741f372173d6e13bdb6fff06c75def19439", + "description": "tgsi_to_nir: translate non-vec4 image stores correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "784358bd6e6d59c521133c2a31fa9b88f8e18598", + "description": "i965: Fix out-of-bounds access to brw_stage_state::surf_offset", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f6a491eec02d6c141e2b4849a3ba924926a482a", + "description": "zink: lower b2b to b2i", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c217ee8d35fcac8ab11e7b5bfd0e053e1fed7df0" + }, + { + "sha": "f457e1b6d5814e51cb9e0ae47e8fd5936139f42f", + "description": "radv/winsys: do not count visible VRAM buffers twice in the budget", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3e37f5d262606f1a0f5a3073835ac0ecc41f629", + "description": "radv: display an error message if the winsys init failed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "701f2c3dfc9bd5b18b3103670f3946e723587f42", + "description": "radv: use a linked list for physical devices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d993c9d2c23d70d48248c9a8f8bc2855e12b18f", + "description": "radv: don't report error with other vendor DRM devices", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f03abd504102fc71ec0b18704a2ea3a92542b5f8", + "description": "radv: report INITIALIZATION_FAILED when the amdgpu winsys init failed", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c62e63aca202bef07b20441618dd360b94b2f7d", + "description": "radv: fix a memleak if the physical device initialization failed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b867a677e99f429d904021b37c5640d3eeaf0e8d", + "description": "radv: rename radv_devices() to radv_enumerate_physical_devices()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5043287415179b9432a15b4a49e595c09513a28", + "description": "radv: cleanup radv_CreateInstance()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dab8803af431ff2044b4d7b17bb505079bf63d1e", + "description": "llvmpipe: enable ARB_sample_shading", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a83db420401fd3a29cae7a8b6a49b67fd2ddf58", + "description": "llvmpipe: add min samples support to the fragment shader.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d237e03a16b9e75a266799af24c2b9112f4d1126", + "description": "llvmpipe: enable GL_ARB_shader_texture_image_samples", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f036643772fca7e2f2070ab83189b695b0977a13", + "description": "gallivm/nir: hooks up texture samples queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d09d621373747235d90099adf5bed647090ec97", + "description": "gallivm/sample: add num samples query for txqs (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cc50cabf1459c6fd5db598de55ad02375aad880", + "description": "llvmpipe: enable 4x sample MSAA + texture multisample", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94c4577331490693a887916323dee843b69bd141", + "description": "drisw: add multisample support to sw dri layer.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7898978377cfee74d69180d73118dc6b8b2d3579", + "description": "llvmpipe: don't choose pixel centers for multisample", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8297513aa900a22853a1a12ad4e98e9098e9a1f7", + "description": "llvmpipe: choose correct position for multisample", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b72f504e99307b5e9a153813f36ac08b9e9b1a05", + "description": "llvmpipe: choose multisample rasterizer functions per triangle (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26cc01cefda3595acf261fff4a073464fca048f3", + "description": "llvmpipe: generate multisample triangle rasterizer functions (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8611a6b34b889d6ee9220767d6f19e4057dc2047", + "description": "llvmpipe: fixup multisample coverage masks for covered tiles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d13591ba4d9df28ef2e90f90b8eda4ff6c7fc98", + "description": "llvmpipe: build 64-bit coverage mask in rasterizer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88851c4798a5ee57441cc5ad71d439a5b6f1a609", + "description": "llvmpipe: add fixed point sample positions to scene.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78b7f2283879d1f952cd13534f4c666447b5b3ea", + "description": "llvmpipe: add new rast api to pass full 64-bit mask.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c638a59fa890887ba4ef791c0186ec64a623110a", + "description": "llvmpipe: disable opaque variant for multisample", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5021ebb15d2d4d29c9202bea01fe8d5acb62902", + "description": "llvmpipe: fix multisample occlusion queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "335938cffd10a7285b98999ad7a6bc8a4ed92ba1", + "description": "llvmpipe: move color storing earlier in frag shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acba9a93ef23796b394b88d5352ec6ebdf14d123", + "description": "llvmpipe: pass mask store into interp for centroid interpolation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "367332b0fce4a613676c0e6ea44c58b2c9c3b558", + "description": "llvmpipe: don't allow branch to end for early Z with multisample", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d9276ae965aadf967ee8f2ca85dab1dd31881919", + "description": "llvmpipe: handle gl_SampleMask writing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69009949e0418d0b1907fd31f486058642c90c92", + "description": "llvmpipe: add multisample alpha to one support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66a92e5d923a2e6b948c71f37a6b109a00938e9f", + "description": "llvmpipe: add multisample alpha to coverage support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38e81938b65ebc32654f20a65fa8c3673c0c1cf6", + "description": "llvmpipe: hook up sample position system value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "210d714f46e72c954857ba32ca9ffcffbc264c9c", + "description": "llvmpipe: handle multisample color stores.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "102558912be91fa7d0eb22666dc7784739ca208b", + "description": "llvmpipe: interpolate Z at sample points for early depth test.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0195240c44f55f35b2c3d2c07b64c9f97775759", + "description": "llvmpipe: handle multisample early depth test/late depth write", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f8c7e232e6c3cf73e809e663f558752817abaad", + "description": "llvmpipe: multisample sample mask + early/late depth pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f12dac5e106b20704aa66f12c3783f5f005b4fe1", + "description": "llvmpipe: move some fs code around", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e949b16c166c6e433307f25dd476d1f35fd6b7e", + "description": "llvmpipe: add per-sample depth/stencil test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d297f2ecf1b14f573dc425fae192b1013d175493", + "description": "llvmpipe: move getting mask value out of depth code. (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18fd62a26e1baa59b650968d798227c922c0352d", + "description": "llvmpipe: add per-sample interpolation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8154bdf25ba2f78dc905759e50adf840471e334a", + "description": "llvmpipe: add centroid interpolation support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5697b9c00c363568c16efbfe6289d58f233a3f11", + "description": "llvmpipe: pass interp location into interpolation code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "339a3a4dea48e1c174be4b544cec8fac50123313", + "description": "nir/tgsi: translate the interp location", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28cc2ed79c1ba7f994a3459de5d11102403e8187", + "description": "gallivm: add mask api to force mask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d89499063bd96bcc5d7d9239a1da43bf568c11fa", + "description": "gallivm: add sample id/pos intrinsic support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "455c8e35842b6e149d5d390ab9692e214db63662", + "description": "llvmpipe: add cbuf/zsbuf + coverage samples to the fragment shader key.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2f488684ad398f5abffefb9b1424fcb1650a627", + "description": "llvmpipe: change mask input to fragment shader to 64-bit.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67ec1760eecbcb5c52992d9098a7e220f22c7d22", + "description": "llvmpipe: add multisample bit to fragment shader key.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5463576b94fa51a55e180c71dfbc249c2a3cc6b", + "description": "llvmpipe: plumb multisample state bit into setup code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e47d39aee15e80976e62fec2937771b0b9346fef", + "description": "llvmpipe/rast: fix tile clearing for multisample color and depth tiles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "01e9779c004efa8acedff83d2560880be894e7fc", + "description": "llvmpipe: record sample info for color/depth buffers in scene", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a30db60edeb7814415b04537cee8cc306ad11fd7", + "description": "llvmpipe: pass color and depth sample strides into fragment shader.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24cf7a2b36fa784cce9eee4ae8957d3d9dac98c3", + "description": "draw: disable point/line smoothing for multisample (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c72bb4a960e7362e7cf4cd6e3159af85c967614", + "description": "llvmpipe: handle multisample render target clears", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "782271c0e1544c36fb53ddda1fc0bc055671f82b", + "description": "llvmpipe: add clear texture support for multisample textures.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8740cbf010f38878209c857e582260f7492976a", + "description": "llvmpipe: add multisample resource copy region support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "178df068210d8c5ff3dd23e925160179c65a5c28", + "description": "llvmpipe: add internal multisample texture mapping path.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cab13f91747dc2acbb257b2d9067679b93b1e057", + "description": "llvmpipe: pass incoming sample_mask into fragment shader context.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c070af85114e37e4e1e28791528f336266c93261", + "description": "llvmpipe/jit: pass fragment sample mask via jit context.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a6150251a1ea2545826036aad9c198a61ce815a", + "description": "llvmpipe: add get_sample_position support (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6383673c96de2102edae0e705f7960753fe848b", + "description": "llvmpipe: fix race between draw and setting fragment shader.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6befeb66070498427e139d3ff86bfd0bd15b5668", + "description": "gallium/util: split out zstencil clearing code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bcbe5b3d268b325c1138ddc29785b6b002c38af9", + "description": "llvmpipe: add a max samples define set to 4.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b02eb1a4c99b1268da2c17420ece6f341b42789", + "description": "llvmpipe: add multisample support to texture allocator.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "339aec7241f77fc027ee32d1537b30505c20028c", + "description": "util: add a resource wrapper to get resource samples", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19703900260d51b709111206caebbad3a9578f7b", + "description": "llvmpipe: add samples support to image jit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e5cddacf7fb6e031540ae9f459d19cce5edefc4", + "description": "llvmpipe: add num_samples/sample_stride support to jit textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc3641d6162c0e876351ee36536f44581260dac0", + "description": "draw: add support for num_samples + sample_stride to the image paths", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "026bf2659975817cb8fceb759eb80b2459df8c06", + "description": "draw: introduce sampler num samples + stride members", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "609a3bea16b14cd5bbc59c702b91367ed768d629", + "description": "gallivm/nir: add multisample image operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be8a10e2651d362b61a5566092a13311ba1ffe26", + "description": "gallivm/nir: add multisample support to image size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae95a08b9c366d5a8558e20c2c3a11558a9c0610", + "description": "gallivm/nir/tgsi: add multisample texture sampling.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb5919d9d8b09a40e5d5ce38f169b71b67661249", + "description": "gallivm/sample: add multisample image operation support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2545c9b15c98049e43904563c52079940bc4ee3", + "description": "gallivm/sample: add multisample support for texel fetch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6a20804ad5afb401f1d86dc28460e922d009588", + "description": "virgl: Properly check for encode_stride when encoding transfers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99fce3a6d71ee9970183b3545a4fbb8234b7fa35", + "description": "llvmpipe: simple texture barrier implementation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "870b6a60509e2dd547dc75fee9290224ad306779", + "description": "llvmpipo/nir: free compute shader NIR", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "18f896e55d96c63b11de7ed0cbe484988a1184c5" + }, + { + "sha": "d1ad1be35a5ba609fd533f2a808a473a067028d8", + "description": "draw/tess: free tessellation control shader i/o memory.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "0d02a7b8ca794a594c2e9cc5e6d63dc591593105" + }, + { + "sha": "a46aa3dc2e4c5462630d40e152904b7d163c9233", + "description": "nir: add missing group_memory_barrier handling", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a6bbf4c80ee9fc4294ec70073e1602ea527963c", + "description": "freedreno/ir3: Disable sin/cos range reduction for mediump.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aac964af4aa1a215196d6ae351a11c9b6b937a22", + "description": "st/nine: Set correctly blend max_rt", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d83e7f4b9887346e9b7b4d44c068d340aa04f28", + "description": "radeonsi: enable TC-compatible HTILE on demand for best Z/S performance", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39571d384e02848aff8c8fe635ff4b93d740aab3", + "description": "radeonsi: allow tc_compatible_htile to be mutable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04085bedc26535e47d81e333ee8969b7755421b2", + "description": "radeonsi/gfx9: always use IMG_DATA_FORMAT_S8_32 for 8-bit stencil", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "345b8aed5cd85eab71b7c80bbf00d8ccc5659bb3", + "description": "ac/surface: unset RADEON_SURF_TC_COMPATIBLE_HTILE if HTILE hasn't been computed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "266fec1307b26a544007423582afd8618791893c", + "description": "radeonsi: don't wait for idle at the end of gfx IBs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae4379d81e42dec4f93983dfa9f31cf30384789f", + "description": "ac/nir: export some undef as zero", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ee1a724bf78baa3fe514036d77d3e96abc998f7", + "description": "gallium: add a new cap PIPE_CAP_GLSL_ZERO_INIT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea289d1502dc5739ec9bf69328c037b72dc02789", + "description": "mesa: extend GLSLZeroInit semantics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "679421628bf89067b4cbfa85530f196ca2835717", + "description": "glsl: add a is_implicit_initializer flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa6b22d36a915f27dee576063aead9e2c577f966", + "description": "glsl: rework zero initialization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84f58a08634d0ea07f557ffa5b91c9c8777a2b04", + "description": "glsl: init gl_FragColor if zero_init=true", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "547e81655a0b9f6d7742e25f2e353e22c3a3b393", + "description": "radeonsi: don't print gs_copy_shader stats for shaderdb", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "dbc86fa3de6aba480f679a36b40227c0fe27c37b" + }, + { + "sha": "b0a7499d28dd5a7c89a70cea79cb14d943632609", + "description": "radv: enable shaderInt16 unconditionally with LLVM and only GFX8+ with ACO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64662dd5baeec19a618156b52df7a7e7adba94cf", + "description": "radeonsi: add workaround for issue 2647", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7983d97174de10670fce5a422ce6c1cb5e783b88", + "description": "zink: use nir_lower_uniforms_to_ubo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4777ee1a62f0620efa2a105215eb589fc44dfa0f", + "description": "nir: Always create UBO variable when lowering uniforms to ubo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "354474b9e5a9e9fdbba1b113f36d7d119d4187c0", + "description": "mesa/st: consider NumUniformBlocks instead of num_ubos when binding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8471f7a5fa1d5c00de9f314eaccd23dd0e62e71b", + "description": "compiler/glsl: explicitly store NumUniformBlocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8059f206da88a7a2147c66b3057d60d775cbbfce", + "description": "glsl: rename has_implicit_uint_to_int_conversion to *_int_to_uint_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "403eb507f586e62acd648778dc1e7d20b5e1fa2f", + "description": "driconf: add force_integer_tex_nearest option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12fb7d700861fb0af639fa21c1e3b65981ee81e4", + "description": "mesa: add gl_coontext::ForceIntegerTexNearest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90d9f9a37ed6a84292a96a247f70ffeb88a2ccde", + "description": "aco: remove unecessary p_split_vector with v2b reg class", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0cb38f36085ccee6e71b6e50cb4f094d7f03c58", + "description": "vulkan: Update Vulkan XML and headers to 1.2.140", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "785803a2e5472bb497a598643b2386c60c60347e", + "description": "turnip: Remove RANGE_SIZE usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24f9aea770711c8b968177b6e4ff15d8fb8fb48e", + "description": "radv: Remove RANGE_SIZE usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c4d11ea3c43447da5add84bdd2e0c91786d9af9b", + "description": "anv: Remove RANGE_SIZE usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57796946985de60204189426ca8eb7bbfa97c396", + "description": "android: iris: add iris_seqno.{c,h} to Makefile.sources", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e31b703c4232fd59d512ab2a865161c9ce859706" + }, + { + "sha": "c4cdef64ad6d11e82894d24691348c38d7fad6ce", + "description": "ac/surface: fix MSAA crash with FORCE_SWIZZLE_MODE on gfx9", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "3dc2ccc14c0e035368fea6ae3cce8c481f3c4ad2" + }, + { + "sha": "1dcf291e3bf7050a396cee74b6baca99a575d915", + "description": "pan/bit: Add IMATH packing tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8fcc23bf28d2c533151465bdf1d1d0cecb59b8fc", + "description": "pan/bit: Factor out identity swizzle helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36e4ffa382f9f82ff6d7ed1dd4fc66010ed48fea", + "description": "pan/bit: Use swizzle helper for round", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "118d53bf93d8ad39238fca6b43111675b1602725", + "description": "pan/bit: Remove test names", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52cdaaacbbb546abdbf654f06e413eb250bb019d", + "description": "pan/bit: Interpret v4i8 ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66163614dbfc546168bd44036669277f7a4a0209", + "description": "pan/bit: Interpret IMATH", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1799435df0f2782671596dd34c1f1965931943ad", + "description": "pan/bi: Don't schedule <32-bit IMATH to FMA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2925e88996c6b819a6c0330fd61760a8be350837", + "description": "pan/bi: Add SUB.v2i16/SUB.v4i8 opcodes to disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10c18c6f69ed6ea8d82b300dc93c55e3180c1c74", + "description": "pan/bi: Pack ADD IADD/ISUB for 8/16/32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a463b2c2ed452f1ee1497fbc19921c910de93237", + "description": "pan/bi: Pack FMA IADD/ISUB 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf3c3563e0ead2b7050efd12de377b6b6d25dd2d", + "description": "pan/bi: Use IMATH for nir_op_iadd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a94daef589be2e95b12d30733d45b9ffeaad436", + "description": "pan/bi: Rename BI_ISUB to BI_IMATH", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c81f51c3c7fb7aa86cee3d9ba4324b1610c726d", + "description": "freedreno/ir3: Define the bindful uniform/nonuniform desc modes for cat6 a6xx.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "97b21110b8b845bb02596036bdbf9e9562cfa7a5", + "description": "freedreno/ir3: Sync some new changes from envytools.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e5b0c92c549b392065f4856a2e5370b2157d961", + "description": "freedreno/ir3: Add some more tests of cat6 disasm.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b97cc41aa203fd9fb9f5cf5f5aa7fd40f567917d", + "description": "Revert \"ac: reassociate FP expressions for inexact instructions for radeonsi\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "cf2f3c27533d8721abed4cdd4dfb00d4d53e8a0f" + }, + { + "sha": "5f01869f74a497ee56fd01eedbe0b2802beea63c", + "description": "pan/bit: Add ICMP tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bc684cad838094ae7d26a102079862246dc9eab", + "description": "pan/bit: Add more 16-bit fmod tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "041ba62e8766aec2565234d56c0df02517c26ad0", + "description": "pan/bit: Add swizzles to round tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35c806e701d1557db7d97671030f50704bc7b833", + "description": "pan/bi: Don't pack ICMP on FMA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cbdf29b7e931fcda52aa1a0e8a9551cd7309651", + "description": "pan/bi: Pack ADD ICMP 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bd417228020a670e73fc8bf43e9595d1118d085", + "description": "pan/bi: Pack ADD ICMP 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "336d5128f9ba5036834fe1fb6420931b8aaad0ff", + "description": "pan/bi: Structify ADD ICMP 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fdf154d24af8b5a3b3e1dc073e9db329c2e21c5a", + "description": "pan/bi: Pack ADD.DISCARD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a9b9859e72fefb0d994be81e7edb42eeaece969", + "description": "pan/bi: Handle discard/branch in get_component_count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ab5c97895daa86d0ac777b60f31737e491a22f3", + "description": "pan/bi: Fuse conditions into discard_if", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "201a11a13ad7c9fddd621602b729440532c9a11f", + "description": "pan/bi: Add float-only mode to condition fusing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d867f787f9ada8b67e4d8ee6e9e388b6d6bc9bd", + "description": "pan/bi: Emit discard (not if)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9ab73296c1c80796b6a3e8cfc1affb486e4dd81", + "description": "pan/bi: Handle discard_if in NIR->BIR naively", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6627b20de3511da153f2733a649b22c13d9e570a", + "description": "pan/bi: Unwrap BRANCH into CONDITIONAL class", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6e5d2072939617bd6e0abe8b36cfadca83bed6f6", + "description": "pan/bi: Remove BI_GENERIC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20cb039457d79dd88aebff7e92cb223ae20b83d0", + "description": "pan/bi: Structify DISCARD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c03340fd1a12c42fce43ba4060f39706663d541", + "description": "pan/bi: Fix DISCARD ops in disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31a41bb6a62edf77e9c311064c43fffa1c8fcd8f", + "description": "pan/bi: Disable CSEL4 emit for now", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e14e3065a9f037df5c877057cd53587b58208063", + "description": "pan/bi: Fix incorrectly flipped swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8415b3d552328de44d4602b1d85561af48ef302a", + "description": "pan/bi: Fix missing swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9634894a67d40c101f56f0f3e963359f271ed2a", + "description": "pan/bi: Fix double-abs flipping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef9b4b3a0bea318bcc853f9654721b56d7e1c27d", + "description": "pan/bi: Set clause type for gl_FragCoord.z", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47c84ee73546f1b86df808c02aa509840e6158df", + "description": "pan/bi: Lower gl_FragCoord", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5ef35c4334d7a9e6fdc10cbf10d6f90b963e714", + "description": "pan/bi: Passthrough direct ld_var addresses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "513c774d58044447e0c7c6169a30b9537e0f8e64", + "description": "pan/bi: Print bad instruction on src packing fail", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0561fe3a06d61a182679eb43888797af5d8cc217", + "description": "pan/bi: Futureproof COMBINE lowering against non-u32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c48839086dbb04bbf23efc1d9bfee73f21f94561", + "description": "pan/bi: Abort on unhandled intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94e6263c0b0ebb81c511452c7844394802bf37b9", + "description": "pan/bi: Abort on unknown op packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a415259fc7404fa473722ebdb0773c8fec3cd3d", + "description": "pan/bi: Add clause type for gl_FragCoord.zw load", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30f07e0d843935fcb34fc28e62bc4c832219c06b", + "description": "panfrost: Setup gl_FragCoord as sysval on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89a41dae7702731bee298288f3acbcbd56096b30", + "description": "etnaviv: do not use int filter when anisotropic filtering is used", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7aaa0e59086fa2bf9c5fa7db2774cb2aa32f95b9" + }, + { + "sha": "b38e51bd969e212cce90998bc283049e74f9b33c", + "description": "etnaviv: fix SAMP_ANISOTROPY register value", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7aaa0e59086fa2bf9c5fa7db2774cb2aa32f95b9" + }, + { + "sha": "cb1e0db23e3fa17562bb276b125aeab0b85582cb", + "description": "vulkan/wsi: Make wsi_swapchain inherit from vk_object_base", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32f20783a512129fc2d7dd8a11ffa8670cef0068", + "description": "vulkan: Add run-time object type asserts in handle casts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7628585dd709f3ffb18c7ca134fff1f375ad7b6a", + "description": "anv: Refactor setting descriptors with immutable sampler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73fb7cdbe1c8ce476f21cb6d39944a96151ec4b5", + "description": "vulkan,anv: Move the DEFINE_HANDLE_CASTS macros to vk_object.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "682c81bdfb7ea28efccea1e8cbfeb7cfc67d02b8", + "description": "vulkan,anv: Add a base object struct type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "369703774cfa304f4881e0e379eb02ed98933dde", + "description": "anv: Allocate CPU-side memory for events", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ac4e8e11f36cdfa18562804931be59a4fe08544", + "description": "anv: Stop clflushing events", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9158f795143fb8b333e6fe33b25c2a4e4d2da15", + "description": "vulkan,anv: Add a common base object type for VkDevice", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d10bde5a878aac440ea34dfb304812cd00b231c", + "description": "vulkan: Allow destroying NULL debug report callbacks", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "086cfa5652ec202f87c14d11e0f6c959d75987d8" + }, + { + "sha": "46b3cb011fd1c9198aeec33d453206846b579817", + "description": "st/mesa: destroy only own program variants when program is released", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "de3d7dbed521c40344c9f8b2b505b6e2b13a7636" + }, + { + "sha": "7e7bb38bd8b12fec09afc0e515480bb6c5a8475a", + "description": "radeonsi: fix export count", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "17acff01a00109c87d59b9d876fc735dd5fbe3d1" + }, + { + "sha": "af55bdd05d94eda59ee1c9331a50045000da5db5", + "description": "vtn/opencl: native sqrt support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "337ff9c0889c86be398b10a2a962a40c1c2b2840", + "description": "vtn/opencl: native rsqrt support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ab6a58c197ca88d6c7e8a3f9fa841f0a594a96a", + "description": "vtn/opencl: native recip support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a698c2eedba8195a6486cfb3a2a61dd9fcfa31bb", + "description": "vtn/opencl: native powr support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "594c49be08002f2953a7a32bc774ce8f0fbfd6f9", + "description": "vtn/opencl: native divide support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bce8a86b652981db3684da943c6cbb3fd7d7f1ae", + "description": "vtn/opencl: native variants of sin/cos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f76b379a9a68dd71e39a6ca270107384a64f67cd", + "description": "vtn/opencl: add native_tan-support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aab1361d59555ffe084e9da15bb41452d8495f90", + "description": "compiler/nir: move tan-calculation to helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58bb8172574cf9a911af03326903034daa30a481", + "description": "mesa: check draw buffer completeness on glClearBufferfv/glClearBufferuiv", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1a40a26a90d65c8cb2881b9a2679ed089bf2ead", + "description": "Revert \"ac/surface: remove RADEON_SURF_TC_COMPATIBLE_HTILE and assume it's always set\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "f6d87ec8a908250af5e805c8042524ac360094a2" + }, + { + "sha": "ee8f60da19d8e495483e5aa7a84717a8e941dd6f", + "description": "i965: disable shadow batches when batch debugging.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2164320a06f98b5ab49cbcf9d9929c5dd9c9b14", + "description": "i965: add support for gen 5 pipelined pointers to dump", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df9629e593ee7faee617e90b644b52f049801e34", + "description": "radv: Extend tiling flags to 64-bit.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "bfd9e7ff243a48873721fd57d9a159cc82f580d6" + }, + { + "sha": "b5f7b0ce194cb62eba35a95761b7a500478e4950", + "description": "aco: add message to static_assert", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c99107ece02b64916031883889b9e010c99e2435" + }, + { + "sha": "8e02de4d7fc3bc7ac1f7f9faf0e18f33fe0098e3", + "description": "aco: remove use of f-strings", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2ab45f41e08a3892138a1e9b20552621b4e18682" + }, + { + "sha": "49cc9e95266d547b89ea309798d54814d059285e", + "description": "anv: Disable extensions based on Android versions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a77cf797f1209f70925b2bd3ceffce0ad9c3e963", + "description": "anv: Limit vulkan version to 1.1 for Android", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33c61eb2f10526c0b90c5ad376e5b0433aec296d", + "description": "iris: Implement ARB_compute_variable_group_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e645bc6939794a95ecd7b0f5dbd9de07332ef365", + "description": "intel: Let drivers call brw_nir_lower_cs_intrinsics()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2663759af0edb1ebcee3aa1ff63f846911d16076", + "description": "intel/fs: Add and use a new load_simd_width_intel intrinsic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b000b491a49afb12612a3cfeebeca9a528cd5e3", + "description": "intel/fs: Add an option to lower variable group size in backend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0edb58a84eb4a2b74b1ce55fea9dc06386c56bf6", + "description": "intel/fs: Clean up variable group size handling in backend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1800e4b58caaa89acfe45c95d0d22e533b50ee03", + "description": "iris: Implement PIPE_FLUSH_DEFERRED support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df09efe8df40f39dc791f39fde07b37a48157eea", + "description": "iris: Detect DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT kernel support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "615270502c7e7083e41080d3ea3cc57a29458f66", + "description": "intel: Move anv_gem_supports_syncobj_wait to common code.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07fb925ad82f2b61cbdd239956c16e752c64e05d", + "description": "iris: Flush any current work in iris_fence_await before adding deps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3dbde891118af0c64a16c729be5b551447aaae18", + "description": "iris: Store a seqno for each batch in the fence", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd1907efb385a6f668971e9bb93af2f64d7b8cda", + "description": "iris: Convert fences to using lightweight seqno", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e31b703c4232fd59d512ab2a865161c9ce859706", + "description": "iris: Place a seqno at the end of every batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb95ac68552d84d5d4f587edfb38cdc1889ede87", + "description": "iris: Destroy transfer slab after batches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c94379c770e86f66f17d5747e1925bd65bed65c0", + "description": "iris: Give up on not passing ice to iris_init_batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a1ed75b85b91f6eb1a796bf7dceb195bcfc1bcf", + "description": "iris: Rename iris_syncpt to iris_syncobj for clarity.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "812cf5f522abd006fd9f3cb7bbcad797bd8730fb", + "description": "anv: Include linux/sync_file.h instead of cut and pasting contents", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "abf8aed68047c1fa4d28e92a1aa2ccf74e0be5dc", + "description": "iris: Include linux/sync_file.h instead of cut and pasting contents", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a807c9e91d1e80542eb35a68bc1951b85268d0c1", + "description": "panfrost: Update dEQP expectation list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "211dee42d000ef27e573263f4c5d7a65b03dc5bf", + "description": "pan/mdg: Enable nir_opt_algebraic_distribute_src_mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c2d469506c29cb114568b5b74f9e9c3fa00706b", + "description": "pan/mdg: Drop `opt` in name of midgard_opt_cull_dead_branch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba9f3d1702613e3d34eb0d36cb1f1935ef557267", + "description": "pan/mdg: Drop forever todo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23a20cfcf30d3c303b2c08ebc1e7557cb7a2e48b", + "description": "pan/mdg: Move constant switch opts to algebraic pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1628c144a98b5bac11571fbdbb12538ce60dcd2a", + "description": "pan/mdg: Rename .one to .sat_signed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f47c60b41123e19c443968af2a311d84d8c96ac1", + "description": "pan/mdg: Ingest actual isub ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8601110e4ecec7a1578e853f694796fe50b4e94", + "description": "glthread: Add GLAPIENTRY to _mesa_marshal_MultiDrawArrays.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2840bc3065b9e991b2c5880a2ee02e2458a758c4" + }, + { + "sha": "2a05ba541401ace1417aa57fab907abb4288baa2", + "description": "intel/dev: Bail when INTEL_DEVID_OVERRIDE is not valid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65b05ebdda18c1cebd88c72cc8f50530addb80c6", + "description": "anv,iris: Fix input vertex max for tcs on gen12", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "44754279ace72bc36b016f9ca519141ea4cad038" + }, + { + "sha": "8f01fa1fb3bbb94648ab3955860933aefbcb191a", + "description": "freedreno/ir3: Set the FS .msaa flag to true during precompiles.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "812c55b07960918db8bb047031c214f77ab1a37f", + "description": "freedreno: Immediately compile a default variant of shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29f58cfbd07b419bca2cbe1e455232c7319444f4", + "description": "freedreno/ir3: Set up outputs for multi-slot varyings.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88dcfaf0ee24b6c858f13b684212951d3077856c", + "description": "freedreno/ir3: Stop initializing regid of so->outputs during setup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c1c21890969ce0f6e0df28522f04cdcd7dd482f", + "description": "freedreno/ir3: Improve shader key normalization.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f1e3235f246048061d3126757d875d1ec05cccc", + "description": "freedreno: Emit debug messages when doing draw-time recompiles of shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a361567c4607cb1e7d1d440edbf95a1aa87b4d9a", + "description": "freedreno/ir3: Remove unused half precision shader key flag.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05be0659fe33727d2bd26b68994fdff4cad006dd", + "description": "freedreno: Fix assertion failures on GS/tess shaders with shader-db enabled.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f91e49ee29b6c513f7b6837c8c169f0438dd1b50", + "description": "freedreno/ir3: Skip tess epilogue if the program is missing stores.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd8f3b62a4b35a9ef2047cd45e9de3bd62436153", + "description": "freedreno: Stop doing binning shaders other than the VS in shader-db.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b420d04e1f744d15622f89180d1e3e511d92a8ba", + "description": "freedreno/ir3: Fix register allocation assertion failures.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73f34e0d4677aaa705e49ba5bcf498d5e6d673d8", + "description": "freedreno/ir3: Drop hack to clean up split vars", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd8d257a30d94759fdb2891b58ec7552fcca5272", + "description": "freedreno/ir3: Lower GS builtins before lowering IO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79355fd9010888fefd1ce74b88aa1d000a302754", + "description": "freedreno/ir3: Add ir3_nir_lower_to_explicit_input() pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7bfccf08509ada02c3f5c2992f9983b7f5fc7a7", + "description": "freedreno/ir3: Rename ir3_nir_lower_to_explicit_io", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a16ee14f37fca71c0c14d468db690aa410ac4ce4", + "description": "freedreno/ir3: Pass stream output info to ir3_shader_from_nir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07f89126cde6d61825bc3e69aec0b1eed1a83751", + "description": "freedreno/ir3: Fix the a3xx TF outputs stores.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0b8011e3eab048a8a8c1a2f79ae003cddb62284", + "description": "freedreno/ir3: Set up the block predecessors for a3xx TF", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7bd15135a6dc105939a3e1c349217e6346dcf729", + "description": "intel/fs: Update location of Render Target Array Index for gen12", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7eb2bc8f52f0e4aaaac1add6236841484dabeecf", + "description": "pan/decode: Properly print tripped zeroes", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "6148d1be4bb52039ccda57f25a9d27ecb7aa7541" + }, + { + "sha": "3a81abf3b2e6c08dea296d164d6e4429e5230d83", + "description": "panfrost: Add Bifrost texture trampoline BO to batch", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "d3eb23adb50c621f49000191e6c024df01f090b7" + }, + { + "sha": "c46731527a9c73454bd07e0b93986ff1a1193c59", + "description": "pan/bi: Lower for now sincos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3baf2514877f80c3d79c783caf4ab6a1d59479c0", + "description": "panfrost: mali_attr_meta.unknown1 is zero on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c4400b05be1aa68168e924066b9d05401745a879", + "description": "panfrost: GPUs newer than G-71 don't have swizzles...", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c409428006447c5e942bee1bc917ecadababe242", + "description": "pan/decode: Trace to stderr with PANDECODE_DUMP_FILE=stderr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6588b87bf72fc47a9028b1d03d7ed4c93452193", + "description": "panfrost: Update Bifrost fields in mali_shader_meta", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07b31f3437ef60779f0fea83425521da3b7441f9", + "description": "pan/bi: Print shaders only if BIFROST_MESA_DEBUG=shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c7d30fb4a0ca1625d16dffb3ff2359331783fe6", + "description": "pan/bi: Enable lower_mediump_outputs NIR pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7104e286514500bedf495611a20413c0ea4eae2c", + "description": "panfrost: Add a bit more info about some tiler fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d581a4bc6ad51b3c5478d46dd2f52f7c86a2974", + "description": "panfrost: Create additional BO for the checksum of imported BOs (Bifrost)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28902ba87e7166688157ea3ba0593eb1a60a5d9b", + "description": "panfrost: Split bit out of format.unk3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f130e76ea2b7925d6177b3baa57fe1f9dab55f4", + "description": "ci: add lists of expected failures & skipped tests for RAVEN with ACO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "263ed2e7777875b21d3eff6939392a4a28d0ea5f", + "description": "scripts: remove unittest.mock dependency when not used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc2c3b41b86d1dd8d23685c34be030f2607ed11d", + "description": "ci: fix reporting the number of unexpected/flakes", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b8c66aeb9341d695c79a2d69935016919c42f843" + }, + { + "sha": "23daa49d4ca6bdbba989b7dd2e46a979494bf588", + "description": "gitlab-ci: Use YAML anchor for llvmpipe paths in virgl rules", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60912f1ebd36c2bb235856ae7de9b1c3d66ef719", + "description": "freedreno: we don't need aligned vbo's", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a7c179473f7afd126110ce18243b7061b661887", + "description": "freedreno/a6xx: add some more formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f7d94580e3b603cb036bef9a1a235ee6b910bc0", + "description": "pan/decode: Don't crash on missing payload", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bde19c0e7ba575f3c8ca8ea76c916034264a8713", + "description": "panfrost: Fix tiled texture \"stride\"s on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bbecbedb4cae6ea646a2a387378daa086b0a9bde", + "description": "panfrost: Fix norm coords on bifrost sampler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "401409eff31ed4a47b165806a28c870f63498916", + "description": "panfrost: Fix sampler wrap/filter field orders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6148d1be4bb52039ccda57f25a9d27ecb7aa7541", + "description": "panfrost: Fix size of bifrost sampler descriptor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "884f8699925b51f7032b4cd0e8d6ef1a8d48bc8c", + "description": "panfrost: Fix texture field size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d04be375ccdd5612f07689f9c883c4e8c3f159c3", + "description": "pan/bit: Add round tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6bbedf8359a0dfb0c6165277eea2635e73e845f8", + "description": "pan/bit: Interpret ROUND", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1f4f1b816ae4fe6ca0ce96af24f949363ed7715", + "description": "pan/bit: Add framework forinterpreting double vs float", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "130a3fba1c744dce5c052840b4f78437d4c73bf8", + "description": "pan/bi: Pack round opcodes (FMA, either 16 or 32)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f35cdaa8dd78ee97bccdab82ef21b6ad6fe1108", + "description": "pan/bi: Pipe multiple textures through", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc634dc3b2792bd84acb0f5aafd83863ac3c4528", + "description": "pan/bi: Add texture indices to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8424d3b9993d555115a82c7cea4931b89a94fea", + "description": "freedreno/a6xx: fix LRZ hang", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "dfa702e94b96318696314dc3c73b2f934b755583" + }, + { + "sha": "0e51082cfa733b3b8255bbd77fc4af46f4108c1d", + "description": "freedreno/ir3: Leave bools as 1-bit, storing them in full regs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "769adc9546afcd72cbe7bb1caf14add5f6ac9f61", + "description": "freedreno/ir3: Drop redundant IR3_REG_HALF setup in ALU ops.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bdd2f284d90b7f07ac5e878490be8d216d0d23c6", + "description": "radeonsi: revert an accidental change in si_clear_buffer", + "nominated": true, + "nomination_type": 1, + "resolution": 2, + "master_sha": null, + "because_sha": "7b0b085c94347cb9c94d88e11a64a6c341d95477" + }, + { + "sha": "5afec9bc9fbab57afffb1cede8b8eaad8caac491", + "description": "radeonsi: fix si_compute_clear_render_target with render condition enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19db1a540c248e330284a6c9733633d0695677a3", + "description": "radeonsi: add a workaround to fix KHR-GL45.texture_view.view_classes on gfx9", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6acdbd9352bd4175191069139fd5f54cf2cc95f", + "description": "radeonsi: implement and use compute-based DCC decompression on gfx9-10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3da73954a639f8e43e6d22ac3f16a786d5e37cb", + "description": "radeonsi: add SI_IMAGE_ACCESS_DCC_OFF to ignore DCC for shader images", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93d5c860812a2565348d07cc1a00bb973f37f034", + "description": "radeonsi: bind shader images after DCC is disabled for image stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "44d27fd6fba92a3b11c10d06f1b3babeffbcd4bb", + "description": "radeonsi: clean up and deduplicate code around internal compute dispatches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e58dcc47c3bd4d3f22e9d0a943e339b4866bc616", + "description": "radeonsi: unify and align down the max SSBO/TBO/UBO buffer binding size", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "03e2adc990d239119619f22599204c1b37b83134" + }, + { + "sha": "b7ffa1560c3eed9bf89d546a1d86d11476ad5f05", + "description": "tgsi_to_nir: handle TGSI_OPCODE_BARRIER", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d35c3dc80e8ba99bb889b65fba3c28bca6d41128", + "description": "tgsi_to_nir: handle TGSI_SEMANTIC_BLOCK_SIZE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2840bc3065b9e991b2c5880a2ee02e2458a758c4", + "description": "glthread: upload non-VBO vertices and indices for non-Indirect non-IBM draws", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1485a3ff7b52eed21cdc199aaa76ee1692dfa3c8", + "description": "glthread: handle gl{Push,Pop}ClientAttrib{DefaultEXT} for glthread states", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57bf51a97357b0cce293659123a3353d1b726487", + "description": "glthread: handle POS vs GENERIC0 aliasing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09f94632e033805bee57a963ca15df131e7c7f7c", + "description": "glthread: initialize VAOs properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47cf310a671b75b1552a7b5d8accc8baa8ecdefb", + "description": "glthread: track primitive restart state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9037005d6034d6bcbeb508e0f783622e2351b957", + "description": "glthread: track instance divisor changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9c9f57b022ab47251c260825e81241fdad9b3ea", + "description": "glthread: track pointers and strides for Pointer & EXT_dsa attrib functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "befbd54864d2959b83e3d2d46d0825f19cb4fc46", + "description": "glthread: don't use atomics for refcounting to decrease overhead on AMD Zen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f22e0fd29369f478da1d36520049f001cd698d1", + "description": "glthread: do glBufferSubData as unsynchronized upload + GPU copy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70847eb0a95f1e1b0fbd435aa0ef4091ae5bef88", + "description": "mesa: add _mesa_InternalBind{ElementBuffer,VertexBuffers} for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a82889e53733ffe11bf3c7a8be5fe53e382d02aa", + "description": "mesa: add glInternalBufferSubDataCopyMESA for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3707cef4fb3c4a2f5e015ea2525fcd41d8875f0b", + "description": "mesa: inline vbo_context inside gl_context to remove vbo_context dereferences", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42842306d3c7ba71f89022a1ebb09a4454a1b6e0", + "description": "mesa,st/mesa: add a fast path for non-static VAOs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e3a9d78289ace1928e2dc093fc743cad81c911c", + "description": "mesa: don't update shaders on fixed-func state changes if user shaders are bound", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "256d5ca80af0b742d4b2bf156180a22b1976fb80", + "description": "mesa: don't set unnecessary program flags in _mesa_update_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2b4afdc17294d628532593b084bd6105dd995a5", + "description": "mesa: set _NEW_FRAG_CLAMP only when needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21ff963c3a1738b44b4e0fb0b9df193103e4d255", + "description": "mesa: don't call _mesa_update_state for _mesa_get_clamp_fragment_color", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1538002b81493b5e4754746745db565cf6fe810", + "description": "st/mesa: Move _NEW_FRAG_CLAMP to NewFragClamp driver flag.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb04db734429971ddc240ba9dc6726a991cc0a14", + "description": "mesa: optimize glPush/PopClientAttrib by removing malloc overhead", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "beb02a781ca9a4918b7ac777aab65cc31338ee87", + "description": "freedreno/a6xx: don't set SP_FS_CTRL_REG0.VARYING for fragcoord", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "612e35c8d94241b07b32a6010ccd1a3edd473439", + "description": "iris: don't assert on unfinished aux import in copy paths", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d56b8c45547086ce23873a58de58484f59ad3a9a", + "description": "freedreno: sync registers with envytools", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "200765457ec6db1d4fc4aea9e1b98e03efd79b61", + "description": "freedreno/a6xx: more OUT_REG()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f62cad6b7f8d6061dccc1fe548aee1477805d3e8", + "description": "freedreno: scissor vs disabled scissor micro-opt", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "373e9ab27c767b13846c81d1c20102bc583415e4", + "description": "freedreno/a6xx: convert const emit to OUT_PKT()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "710537b19c04492939b11b2a19b010552c937477", + "description": "freedreno/ir3: inline const emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aff93f54190f4c934e25b9210d59db22bdd38ec7", + "description": "freedreno/a6xx: split out const emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58fd1d7ecd38daf5bcbaa225175b43b8c722b0d0", + "description": "freedreno/a6xx: convert draw packet to OUT_PKT()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee293160d7d7341e0ec5f0aaf1ceb6950f785ed8", + "description": "freedreno/a6xx: add OUT_PKT()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a142bb899291ae68d00f552959cf6ee1cbb9e6c2", + "description": "freedreno/a6xx: skip unnecessary MRT blend state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d554987c2b856fe463afab3bd9103c2d1e41b97", + "description": "freedreno/a6xx: combine sample mask into blend state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "880edb9dc5761aa19d89b2a40481e8c2771ff862", + "description": "freedreno/a6xx: move blend-color to stateobj", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfa702e94b96318696314dc3c73b2f934b755583", + "description": "freedreno/a6xx: limit LRZ state emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c268afd296375f678e53fed302eea07a74af741", + "description": "freedreno/a6xx: limit PROG_FB_RAST state emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46e177389fee7f5eed90e5debd122bfebb772ad4", + "description": "freedreno/a6xx: move scissor state to stateobj", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cfa765049d571a95b14ea006f900de8a7bf5cae", + "description": "freedreno/a6xx: move const state to single stateobj", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89dbdb806faaf1a4b3da0ce0ab597f9ced40d549", + "description": "freedreno/a6xx: avoid unnecessary clearing VS DP state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f583dc68e5586fd468475ae833ee3ce8fab5a95b", + "description": "freedreno/a6xx: small query cleanup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3fc8dd0018bbba42325a2bdf378fd96bfd67a6c", + "description": "freedreno/drm: inline the things", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75435d5e2a27466eef6452fda44098405a9aa202", + "description": "freedreno/drm: drop atomic refcnts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4715502975410855cf9997075fa9c598df0c5211", + "description": "freedreno/ir3: Initialize the unused dwords of the immediates consts.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fac55ce0d066d767d6c6c8308f79d0c3e566ec0", + "description": "Revert \"anv/gen12: Temporarily disable VK_KHR_buffer_device_address (and EXT)\"", + "nominated": true, + "nomination_type": 2, + "resolution": 2, + "master_sha": null, + "because_sha": "c61ad77cd260ce7666b257ce411e512e0ca12ec8" + }, + { + "sha": "4985e380dd776ac65c4ae5627138211f9d9f03ce", + "description": "intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0edc29020b2830497f31b06898ca26715ecfd001", + "description": "pan/decode: Use correct printf modifier for long int", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03963febeffadefc4f47adaf0bbd3618d5692b25", + "description": "pan/decode: Check for correct unknown field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc11deb86d8bc037d842a04f8782461a5472ecf1", + "description": "panfrost: Don't leak temporary descriptors array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c98c452f012d20bcca3038af88bcbe7278d9c68", + "description": "panfrost: Emit blend descriptors on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33b13b9fbd2998977f76bfeeacf63900b0ed9cba", + "description": "panfrost: Enumify bifrost blend types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e9ae4043004e5505f3c7e327d38911330c04dcb", + "description": "gitlab-ci: update tracie README after changes in main script", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "90a39af5f65e5fa01beeec526594f7e04143e7cf" + }, + { + "sha": "bd86399db012d93e81d1a7734214ee8d52ad43ce", + "description": ".mailmap: add an alias for Andres Gomez", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cde4c3a0804347cfc5a8a2b4958a6e6a4b69a27", + "description": ".mailmap: add an alias for Iago Toral Quiroga", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a70fee7dc9a212a0ab345b11008212f8d62cad3", + "description": "ci: Add intel to shaderdb runs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f4f1d70bfe96fc9e218ef5c2196bb677aaa251b", + "description": "intel: add stub_gpu tool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c3c1d8a9906f3a45cbe50012b2ca60a3bc21fe8", + "description": "intel/dev: print out error when platform is not found by name", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd3c0146723043f66bd18e376fa6526126ff18c0", + "description": "drm-shim: silence warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "764ef4bf1a6fe0c256859d275c5f922d46217dbc", + "description": "drm-shim: don't create a memfd per BO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b34c8d35f5c8cf8febc1310012de67e6f41b09f", + "description": "drm-shim: move handle lock to shim_fd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f78af3372131e4c2f0344396d3490abe816992cf", + "description": "gallium: extract out logicop helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51a82ec3e437d1d2dc4c688578640d25b3e7f0a2", + "description": "gallivm: fix half to float conversions with llvm 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec6565bb2614afe605afae516a2f1dd8ada5bcad", + "description": "cut 20.1 branch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0842758ec0fe716f6559ca630cb8704cf7fb97bf", + "description": "intel/ir: Update performance analysis parameters for memory fence codegen changes.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f858fa26b4cca8834c8687f01d2ba431fcc8e006" + }, + { + "sha": "82aa4460492200c621a2f35c93519230b69dbc18", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.6", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06b5a646e865200aaee36be2d11aea57eca85aca", + "description": "docs: Add SHA256 sums for 20.0.6", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55bb55e93c72b8519f9f50c9eb78951d67cf68e8", + "description": "docs: Add release notes for 20.0.6", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e70cfe47b3e94684fed79feff7da1cf94ac63e73", + "description": "pan/mdg: Be a bit more pedantic in invert passes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "074815ca0e96c25df55879af9031a13bc4c20106", + "description": "pan/mdg: Track more types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0fe98b478767e90cc5ed976e4e953117fe38308", + "description": "freedreno: fix buffer import", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5a8718f01b3976e1bc82362a907befef68a7f525" + }, + { + "sha": "2efa76f795cb2b2bf00b317c580aeeeddd1e9bc2", + "description": "i965: remove unused variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85fe0e551fc045d03aa7739d0f1d887484ec6d12", + "description": "radv: Fix implicit sync with recent allocation changes.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "bec92850270a046524056b8d43bbd2554ba9f2e0" + }, + { + "sha": "27cafa9a5173cae48781b724da2cc5be682eac83", + "description": "freedreno: switch to simple_mtx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "336a8cd82a4070674ecc056be0af791bf6c7d042", + "description": "freedreno: add screen lock wrappers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8aacaeca685c4e705a3237c2187f2f9bcef23339", + "description": "util/simple_mtx: add assert_locked()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e1b93ec4fa31014c322b970f7d8a057fdec04fe", + "description": "turnip: fix wrong substream size in parse_multisample_and_color_blend", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a92d2e11095d9f1f8bc1188fd3d2b8391acc4591" + }, + { + "sha": "05e6f763e7683c13a59e14f12ce3231d892921c2", + "description": "util/ra: Improve ra_set_finalize() performance.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53ac2dabec45cc329eb033679db6fdd5a8221851", + "description": "util/ra: Use util_dynarray for handling the conflict lists.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57088854e60b1616f3c8a4c793b7d95a87ece9a0", + "description": "util/ra: Use util_dynarray for the adjacency list.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1de267a21acacc4c77bbb94127bfbf1caa4bfc8", + "description": "util/ra: Sanity check that we're adding a valid reg to a class.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bcaf30aba08b718e913b10745df5e92854ed5b6", + "description": "util/ra: Sanity check that the driver selected a valid reg.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc66800032946c53a01c979ee3b69ac2ba989222", + "description": "freedreno/a4xx: enable A405", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "328cc00d39808191529fa359cc21fb935c9acc89", + "description": "iris: handle PIPE_CAP_CLEAR_SCISSORED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c8bcad81a7ce106b37f1ee4a75b817651d6545e", + "description": "gallium: add pipe cap for scissored clears and pass scissor state to clear() hook", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "882928dcaa2133fe07b73e7e962d50625c8e6a03", + "description": "i965: Use correct constant for max_variable_local_size", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5664bd6db383984192cf362884dd9fb17d8ed3a3" + }, + { + "sha": "91375f13ce05ab637aa6275dbb7bcb8a9c4cfdb9", + "description": "iris: move iris_vtable to iris_screen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e581ddeeeecf9475d0634794ee126096d0f23135", + "description": "intel/fs: Don't delete coalesced MOVs if they have a cmod", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fe7d6758a5ddc09bc5d186b8219fb9ad807fad4", + "description": "st/mesa: expose more SPIR-V capabilities", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2542deb63adb3b5536947bcf9610c0ceca9da28", + "description": "mesa: report GL_INVALID_OPERATION for invalid glTextureBuffer target", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "98e64e538afeaa800e1cdcbc7ce5d5093b274fe7" + }, + { + "sha": "ffa314eab32b94956db1e0cd54d833333b050635", + "description": "pan/mdg: Replicate 16-bit swizzles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c571d31b8b268aa22ebeb134589150d08db892b2", + "description": "pan/mdg: Ensure fdot is scalar out in disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95664b177f4effeae9e3e3cc1cc97629a0d1db6d", + "description": "pan/mdg: Move condense_writemask to disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efc9ab6dcced7b8afc8e9dd9f201124ca8d00797", + "description": "pan/mdg: Pass through some types from scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8d7df6f09acb7220c35323449d512146a7cc06d", + "description": "pan/mdg: Don't crash on unknown branch target", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e27fd4b3eccff8fcea2dd381f73bdd1d96075a7d", + "description": "pan/mdg: Make some branch targets more explicit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfa7c26ff829f0b819f089afa64e66f8a9321244", + "description": "pan/mdg: Always print the mask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "459cf59c6114d3dc58a4db7187126b5b17ae6c87", + "description": "pan/mdg: Specialize swizzle to type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62768590d5414a40e6b22a22a7a50e66893d0451", + "description": "pan/mdg: Lower specials to 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb0e85fca472ed57bd01ba13a18f55bce42ab73a", + "description": "pan/mdg: Move sampler_type emission to pack time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08af4c788d3e9b4eb4fcb7477dce0ca7930f974a", + "description": "pan/mdg: Set texture full fields at pack time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fb02174a32aa8c73898dde895af8403aa052e4a", + "description": "pan/mdg: Track texture types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53c183736e3ae3692eecd761c1b82676b429bc59", + "description": "pan/mdg: Track v_mov type (force uint32 for now?)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74fadc8859e9bd147617890e9b6094712e7083fe", + "description": "pan/mdg: Denoise prints", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "714eba87625bbfcca6e943d488de3a6032ce3dc5", + "description": "pan/mdg: Track a primary type for I/O", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04f76ad8aec1dbd61bc5041b434cee4d7ff7c82b", + "description": "pan/mdg: Another goofy comment gone", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ecf946638e0f25d5a083d09b25b8c463b702212b", + "description": "pan/mdg: Track ALU dest type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6757c480ab43d9020fac7a9e6233af6431ad6351", + "description": "pan/mdg: Track ALU src types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "742b272314fa6e202ea43b2f5473aee12bf7350e", + "description": "pan/mdg: Add type fields to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9f7f06a61ead80bf035213a270aca532e34c838", + "description": "pan/bi: Share ALU type printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c08e294c8bb838a4d0242683167b034be4f924b", + "description": "pan/mdg: Set lower_flrp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05f5267f234774578b5365837b401f233d6c9f73", + "description": "pan/mdg: Remove old hack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7f98a87f28ebecc87bb14d0d9b313530a10dcc1", + "description": "pan/mdg: Remove goofy 16-bit comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b10bcd41743d0cff3460b1ca7b961d9cbe45b13", + "description": "pan/mdg: Don't break SSA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23337fd5907e834eab05f5486af8458cc446376d", + "description": "pan/mdg: SSA_FIXED_MINIMUM already covered by PAN_IS_REG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63eec105b27b48ced9d68978881ccebcd18aeb8e", + "description": "pan/mdg: Use PAN_IS_REG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4600c43402b4473a7f0bf741be65ad8835d7f83", + "description": "pan/mdg: Remove nir_alu_src_index", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbbe3d4b75be9e6ac834feda5f5ce12af088b6be", + "description": "pan/bi: Use common IR indices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5860b18665a8d44d164caaf3de080172b91f36e0", + "description": "panfrost: Move Bifrost IR indexing to common", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3062edff42a3afe8029c7bc36136124036d3053", + "description": "panfrost: Fix BO reference counting", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "3283c7f4dadafee97e9af0e6613da43fad3c0019" + }, + { + "sha": "22a4cb4937d40d3dbd34129c8c0e6cf8673c8f95", + "description": "ac: enable displayable DCC on Navi12 & Navi14", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b45631d7aa5131738cd34e341275e0aa797e3eb", + "description": "ac/surface: validate that DCC is enabled correctly on gfx9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e31e4b6971fde00040c7a37f13f2253ae49ad34", + "description": "ac/surface: add code for gfx10 displayable DCC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e2fbba7720a9bcafc8ca1169697e1b985e84e6f1", + "description": "ac/surface: move non-displayable DCC to the end of the buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3dc7fffbb7be0f1b2ac478b16d3acc5662dff66", + "description": "ac/surface: don't compute DCC if it's unsupported by DCN on gfx9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d785f99b75c92220205862e7d6d4d4a2f503143", + "description": "ac/surface: match get_display_flag() with expectations for is_displayable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3dc2ccc14c0e035368fea6ae3cce8c481f3c4ad2", + "description": "ac/surface: replace RADEON_SURF_OPTIMIZE_FOR_SPACE with !FORCE_SWIZZLE_MODE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6d87ec8a908250af5e805c8042524ac360094a2", + "description": "ac/surface: remove RADEON_SURF_TC_COMPATIBLE_HTILE and assume it's always set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25d3cc293e9e4b21a965fe086537a4b448424bd8", + "description": "ac/surface: rename micro tile mode enums like gfx10 uses them", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "298e247776309b4444b4c3ac26872fc1f694568c", + "description": "winsys/svga: Optionally avoid caching buffer maps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "422148de52fc57f9f33e632883400fff49b3ad9d", + "description": "gallium/pipebuffer: Use persistent maps for slabs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4e1a0ac1321730bbdeb4aef89ff14281a0b56eb", + "description": "radv: Use smaller esgs_itemsize for ACO.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee5f04c9c9c02e42739924f9f0b6efd3f9077039", + "description": "aco: Use new default driver locations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efa4976709afbbbfd430235bb8b71e6abb66d8e7", + "description": "radv: Use new linking helper to set default driver locations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7aa61c84fe47f139b96b29d39b3298f30b96c89c", + "description": "nir: Add new linking helper to set linked driver locations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7056714f5039e8f4302075677d962b5dd925e107", + "description": "aco: Set config->lds_size when TES or VS is running on HW ESGS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "baa46878d4533f21d12bc93d5eed09436b3cc9fd", + "description": "aco: Calculate workgroup size of legacy GS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fdbb2968533be9a1caca731cf11c2ed3b46e6043", + "description": "aco: Remember VS/TCS output driver locations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab07c4ea70897d8d8c4d40bd336aee38926278bf", + "description": "aco: Use context variables instead of calculating TCS inputs/outputs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd0248c37bfaa0dabbab11fc3060ebe52443eb05", + "description": "radv: Refactor calculate_tess_lds_size and get_tcs_num_patches.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9392ddab4399d796fdf37602f586965ec17f2b2a", + "description": "aco: consider blocks unreachable if they are in the logical cfg", + "nominated": true, + "nomination_type": 1, + "resolution": 3, + "master_sha": null, + "because_sha": "8d8c864beba399ae4ee2267f680d1f600ad32767" + }, + { + "sha": "98675d34c115e3a8db9b6b74e8eca01af5fff101", + "description": "egl/wayland: Fix zwp_linux_dmabuf usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f0d3874411ec3c0dcb1171cad5930db70fb48b4", + "description": "iris/bufmgr: Check if iris_bo_gem_mmap failed", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5bc3f52dd8c2b5acaae959ccae2e1fb7c769bb22" + }, + { + "sha": "1a33358b274631e0b8b493b0d885091d839f9d13", + "description": "anv: remove assert from GetImageMemoryRequirements[2]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f6648dc3cb989c16fea9d3de968388e0496339b", + "description": "gitlab-ci: add a list of expected failures for FIJI with ACO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e6afbbe5664953ec0df399fd1e1c841ce5b337b", + "description": "radv: advertise VK_EXT_robustness2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f1ead7b5366470dfd834e68b7b62305ac1602a8", + "description": "radv: handle NULL vertex bindings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1ef225d18332ed4800191d686dc1527e8156544", + "description": "radv: handle NULL descriptors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60cc065c7dbf0291c69638fdd6a6597050814e57", + "description": "aco: fix adjusting the sample index with FMASK if value is negative", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a112ec4c11a319ef28451e0fd4cd8320adbf7ae8", + "description": "aco: fix nir_texop_texture_samples with NULL descriptors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa94213781447525e2e5da90ee9c72ad0a57527f", + "description": "ac/llvm: fix nir_texop_texture_samples with NULL descriptors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3cba3c771e68cddb644a3520bcc68bd6dfce07c", + "description": "intel/fs: Only stall after sending all memory fence messages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f858fa26b4cca8834c8687f01d2ba431fcc8e006", + "description": "intel/fs,vec4: Pull stall logic for memory fences up into the IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e96b0d6dd99e80c1ccbc13629ad22a946a74828", + "description": "intel/fs: Allow FS_OPCODE_SCHEDULING_FENCE stall on registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9248b045287658884456b2c77b652a9d8c862719", + "description": "radv: Expose 4G element texel buffers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "506414e837da4b806c6fba1fdb4fe9efedbed94a", + "description": "iris: Fix downcast of bound_vertex_buffers from uint64_t to int", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e2a7e11b460adab4555d3d16a49968fc5542441", + "description": "intel/ir: Remove scheduling-based cycle count estimates.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "486f3b04a59e0ee9c669e6e81197575a36e19442", + "description": "intel/ir: Pass block cycle count information explicitly to disassembler.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6579f562c307d12a2654b511a7ef85f7b4cddeae", + "description": "intel/ir: Use brw::performance object instead of CFG cycle counts for codegen stats.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65342be3aefb1f258714064da4273ed9987f7375", + "description": "intel/fs: Add INTEL_DEBUG=no32 debugging flag.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14f0a5cf64f6b8725ebe8ae68b19b096995ea0fe", + "description": "intel/fs: Implement performance analysis-based SIMD32 heuristic for fragment shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6aa0c261f2d9ccacaa6579432c16c61ca4cb073", + "description": "intel/fs: Heap-allocate fs_visitors in brw_compile_fs().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "188a3659aea6dec9acf1c2fd15fcaecffe4f7d4e", + "description": "intel/ir: Import shader performance analysis pass.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8ce1cfc9c115032aaaede691c5fe6f92c0e6168", + "description": "intel/vec4: Fix constness of vec4_instruction::reads_flag() and ::writes_flag().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bda1d72dd999a819b9645f55c2247bf84292bf34", + "description": "intel/fs: Replace fs_visitor::bank_conflict_cycles() with stand-alone function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2ed74079542dac5668ab057802bc8ede3aca618", + "description": "intel/fs: Fix constness of argument of fs_instruction_scheduler::is_compressed().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6310a05f68ad6de50385246559dd4801b6ac925c", + "description": "intel/fs: Rename half() helpers to quarter(), allow index up to 3.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bdad7f429a7df5dda2098042ecbc892e787da8ee", + "description": "intel/ir: Add missing initialization of backend_reg::offset during construction.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e549e4f6c0c16bddec3dc4d33cc63df4529206f3", + "description": "intel/fs/gen12: Fix Render Target Read header setup for new thread payload layout.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72324035fb4dffcedd17dfc1c8d1f2ee2787e21a", + "description": "intel/fs/gen12: Work around dual-source blending hangs in combination with SIMD32.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6ae079771bc8f5ae3a9e8a333c50a6cacb7a77c", + "description": "intel/fs/gen12: Fix hangs with per-sample SIMD32 fragment shader dispatch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35ee6b3d361b13c6380cf357ef05c9681639cfc1", + "description": "mesa: Follow OpenGL conversion rules for values that exceed storage size", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "53c36dfcfe3eb3749a53267f054870280afb0d71" + }, + { + "sha": "76c5688018931544fc36b55d3968b484cbbd56bf", + "description": "pan/bit: Add BITWISE test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "844c3f94b5b9092c3fa904a0034883def6c177af", + "description": "pan/bit: Interpret BI_BITWISE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a077da627300435eba90248683e778bb12631ed0", + "description": "pan/bi: Handle iand/ior/ixor in NIR->BIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef9582738e5950764dcd33eddef7183e5529e5ff", + "description": "pan/bi: Pack BI_BITWISE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b415bf6a037867432c72294f0f03917f3b06ae8", + "description": "pan/bi: Add bitwise modifiers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6de01faac5a20208422fb75d22f2bd88c53f53d8", + "description": "freedreno/a6xx: invalidate tex state cache entries on rebind", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca05e6b04d2f96a3de09d6940bea8edb5c852dad", + "description": "freedreno: rebind_resource() *before* bo changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d9e56d8a695304a0f2fb109cea6fc46991f98007", + "description": "freedreno: rebind resource in all contexts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f12188ff5264b29f6270c40c7592543aca6c2f4a", + "description": "freedreno: optimize rebind_resource()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e18c58047ef5920dbe442bc6fc42e62dc0edb7d", + "description": "freedreno: mark more state dirty when rebinding resources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf97cc92216a0738b3dee743695496b68c149b54", + "description": "freedreno: don't realloc idle bo's", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "938b6ed64513ac4e0622d31d6fc5fe9f80416312", + "description": "freedreno: small whitespace fix", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a93b728bc61e263b9188a66df501bf108c9b3050", + "description": "gallium/swr: Fix crashes and failures in vertex fetch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de0d3d172675cdc0edaa164169eb9ca2a36a2e7d", + "description": "freedreno/log-parser: support to read gzip'd logs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f561e516c8a01993ea83f5d48e0126d0b7b6528b", + "description": "freedreno/a6xx: pre-calculate expected vsc stream sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99d802ccc77c7897f27d75275d38c702f3db6fd8", + "description": "freedreno: add helper to estimate # of bins per pipe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9c255d70c303436bf4f9b1be08b2679373f17a2", + "description": "freedreno/a6xx+tu: rename VSC_DATA/VSC_DATA2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ee3ad561a29d5429309571db489f95e4ccaec5b", + "description": "aco: fix vgpr nir_op_vecn with sgpr operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5eda3c746b6d6152279d8687cf7885c38a76e93", + "description": "aco: improve clamped integer addition disassembly workaround", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ed83e2f947123e83f0cd1196454403b763c68da", + "description": "aco: add various GFX10 int16 opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43f2ba39ef4962ffe4591560de0babe485d555c6", + "description": "aco: fix sub-dword overwrite check in RA validator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cca8d6ce061d5d45af5eabf631a8eaed366fd4c5", + "description": "aco: fix sub-dword out-of-bounds check in RA validator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "307aca83a278938ec4a4932b7fa7dc6c8e189e60", + "description": "aco: add missing adjust_max_used_regs()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99ca96fbf58975d49e4ad131f907c5b01e12db85", + "description": "aco: improve RA for uneven p_split_vector", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24116a8a561ffce9d55ab3b930d9c7648eda9cbb", + "description": "aco: don't recurse in sub-dword get_reg_simple()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09c584caeb2a1e7446ac2016ce7a7d8f0586774b", + "description": "aco: split self-intersecting copies instead of swapping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be4a34966ceefbaf70cecb56a8300a31c5b0ca46", + "description": "aco: fix neighboring register check in get_reg_simple()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb59ed6bb9d70a410894afa998298f7e0c9160f5", + "description": "aco: check alignment of non-subdword registers in get_reg_specified()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "916cc3e231a34551bc9534943bff8e17e52931bf", + "description": "aco: make RegisterFile::block() take a regclass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b43366497b6c64bae9ac592ba0f6047b478fbe3a", + "description": "anv: Claim VK_EXT_robustness2 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b07d26be65b05784950e6f3ea5e82eb213058ecc", + "description": "anv: Handle null vertex buffer bindings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd817291c7f87985d9ef9015cc086d1b5fd86825", + "description": "anv: Handle NULL descriptors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac581a06a4d33905eedcf5f18ae7be8ca3bca32c", + "description": "nir/combine_stores: Handle volatile", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb9292091b162e0dd7d5069646e94d03e112e3ee", + "description": "nir/dead_write_vars: Handle volatile", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed677171675fe8ee204deac1e2089f480681b1b4", + "description": "nir/copy_prop_vars: Report progress when deleting self-copies", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "62332d139c8f6deb7fd8b72a48b34b4b652df7c1" + }, + { + "sha": "d9af5277b36a01af4cc6870c542a8059848a6e4d", + "description": "nir/copy_prop_vars: Handle volatile better", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "118f045fb7d1c6520e808317235c175833237631", + "description": "vulkan: Update Vulkan XML and headers to 1.2.139", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76d2772472037b2b9922f748170bebbce0b2a1de", + "description": "anv: Allow all clear colors for texturing on Gen11+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e63c662c26a6abfab5abf03a1646a236d6d730c0", + "description": "anv: Use anv_layout_to_aux_usage for color during render passes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30016f6e829a96782b13cfe2a31e8ff21f1dfa4a", + "description": "anv: Split color_attachment_compute_aux_usage in two", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fe45a9b6cd956cf5215d9a382de4dde06eab1a8", + "description": "anv: Rework depth_stencil_attachment_compute_aux_usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26e6da90ab387f50be40ca5ff16f143bc9555cbd", + "description": "anv: Refactor cmd_buffer_setup_attachments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36a74835dfa4e9ae8088d3cb12c3c81964585f12", + "description": "anv: Stop allowing non-zero clear colors in input attachments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf92e96d9c671a94e12f1ada0c7eca1f26a4d54b", + "description": "anv: Disallow fast-clears which require format-reinterpretation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20e72e435c9650bca5da6b0c05a0fcec1fcd517a", + "description": "intel: Move swizzle_color_value from blorp to ISL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "814dc6693593c51d7d89da54ab6191dbf862397e", + "description": "anv: Allocate surface states per-subpass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3d185d091fbacd314d76e702d292a363f3c8b55", + "description": "anv: Split command buffer attachment setup in three", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c195d5516128543bf54a5c758119e0e36763d1f0", + "description": "anv: Mark images written in end_subpass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d5e30872cab029b8df1af70e8d528d77351a6f7e", + "description": "anv: Use ANV_FROM_HANDLE for pInheritanceInfo fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7cbc5fde1350e5d883b5943965038ee2b7600fc9", + "description": "anv: Assert surface states are valid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eaa8f043cdd218ae8f21a38e0a4d052a80aecef1", + "description": "anv: Stop filling out the clear color in compute_aux_usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5808efdf40d2496e14625322c1f9e8bdbe6f8c36", + "description": "anv: Add TRANSFER_SRC to pass usage not subpass usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "513ed7542a4dd510bfc1cd0724ee8da8885f6568", + "description": "anv: Return an error if allocating attachment memory fails", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0549fba3cccc7f2b48dc44819be0cc04be9e07f2", + "description": "radv: advertise VK_AMD_memory_overallocation_behavior", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5832f2b8a34fc5ca50fa0cf590539f2b8c3322f6", + "description": "radv: track memory heaps usage if overallocation is explicitly disallowed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32035cca3fcc1bb49cc75751d8ba324175afb14a", + "description": "radv: remove unused radv_device_memory::map_size field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b869710a1c29b535aceda74220594f12dc3beb0", + "description": "nir/algebraic: Require operands to iand be 32-bit", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "88eb8f190bd69c7f057063f5d88faece59477088" + }, + { + "sha": "656051d735c949021e7eb206b6c4a633cc76936f", + "description": "freedreno/ir3/ra: only assign array base in first pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d8ec9676261740404a06afe992adbffd0123a00", + "description": "freedreno/ir3/ra: split out helper for array assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6313b8d881e810f9ddd7f0499dddd7049e6cf958", + "description": "freedreno/ir3/ra: use ir3_debug_print helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b3ac7084ab71807850416fc1324c5ca0a42e01e", + "description": "freedreno/ir3/ra: remove unused variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "997828e31ba00a023b147648c99e734a5451a7bd", + "description": "freedreno/computer: add script to test widening/narrowing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b551d9f360e45ba4e74867dbe79ae212e4766c5", + "description": "pan/bi: Add initial fcmp test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "778e27b5acf99537f5301fdc1f04ed9467966261", + "description": "pan/bit: Interpret CMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "71501972e91ddc2b9796dab59cb45c5808d3c799", + "description": "pan/bit: Prepare condition evaluation for vectors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b8724c34016430c95ca68dd9a01280eb93c7cec", + "description": "pan/bi: Relax double-abs condition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81156ad55a10b2d1fddaf64ac707279c60eb3d54", + "description": "pan/bi: Pack fma.fcmp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a689470d0f5dc7733780b61ca0822fe629c131a", + "description": "pan/bi: Factor out fp16 abs logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c94d41ad7c92a9549e16f733dcb6a0a0762e811f", + "description": "pan/bi: Pack FMA 32 FCMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1520131d82812c815a08e322d182f7f2dc84f627", + "description": "pan/bi: Fix source mod testing for CMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12ca99f2c1ac70844153362528808858af9aec32", + "description": "pan/bi: Structify ADD ICMP 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ddcefefa7d5ad9fc111ca17495c002a6802dcd95", + "description": "pan/bi: Structify FMA ICMP 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d41468e7df4486a4666e93439f52e0fab467438", + "description": "pan/bi: Structify FMA ICMP 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "527d7303cabb5512e7de2569feaaefc3a3ae4354", + "description": "pan/bi: Structify ADD FCMP16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74795dd3284772b8491683e66de83839880a964d", + "description": "pan/bi: Structify FMA FCMP16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28afe3037a2afc758ab8caecfd89a54f840ac8c6", + "description": "pan/bi Strucitfy ADD FCMP 32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c861292ce20a202d0c18b04257ce55472a89767d", + "description": "pan/bi: Structify FMA FCMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7fe3c145d9728480106e8c5b4e97b289104e50e8", + "description": "pan/bi: Remove bi_round_op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95fc71ece29f97b093d5eb7d1146f1b55c61aae5", + "description": "pan/bi: Deduplicate csel/cmp cond", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df486689c07d3965c9e4efa5d9444b91aecdc208", + "description": "pan/bi(t): Fix SELECT tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "814f2f1d33e0215a207f1dbeed645979daa21745", + "description": "pan/bi: Add CSEL.8 opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e23d191245d4db63aecf83fadd80a9911a3dad40", + "description": "pan/bi: Add FCMP.GL.v2f16 on ADD opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4f2d3a51c988a53ebf5af31c6dd6d84e32d2ee3", + "description": "pan/bi: Add 64-bit int compares", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52cc7165c6a1c7d9338edc9f0add5e8439d8a2c2", + "description": "pan/bi: Add some 8-bit compares", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f286eed2a99da99a2934eeae6f1276894289059", + "description": "pan/bi: Add CSEL.64 opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "100edfe26d947fbecd3cf2ca190478348601eb42", + "description": "pan/bi: Add bool->float opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "523e9603d3fba507be9fe9a70a24edfd0a41792b", + "description": "radv: enable FMASK for color attachments only", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81ac741f8929b90a16a0b4251f3e6da02dde6133", + "description": "anv: Expose CS workgroup sizes based on a maximum of 64 threads", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "cf12faef614ab7cd9996410f1d161558a3853936" + }, + { + "sha": "86f67952d31554b8b4b5a9943b43ace988c6401f", + "description": "intel/devinfo: Compute the correct L3$ size for Gen12", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8125d7960b672bcd597a2687e3078899afc52560" + }, + { + "sha": "7262c743dc84d4efa778658bb77b10850c29e014", + "description": "radv: Determine memory type for import based on fd.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f30983be3a87946083c58100d72717f9e522c949", + "description": "radv/winsys: Add function to get domains/flags from fd.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bec92850270a046524056b8d43bbd2554ba9f2e0", + "description": "radv: Stop using memory type indices.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a8d172d3f81ef981e386f0cc6c259c36818f697", + "description": "radv: Use actual memory type count for setting app-visible bitset.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e03cf15f9516642ba7f7cd7b7a2d7aad835796a", + "description": "radeonsi: Count planes for imported textures.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6747a984f59ea9a2dd74b98d59cb8fdb028969ae", + "description": "r600: Enable tesselation for NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6d4452661ae77f80d16f5c7f8d55b863ec79ab8", + "description": "r600/sfn: Add tesselation shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d77b81ce50ea05736bc0554a1062156caffed358", + "description": "r600/sfn: Add lowering passes for Tesselation IO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b3e103d0bf1c506f9ec413be11af8bd207ad674", + "description": "r600/sfn: Move removing of unused variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74e0a0a72398140a72c2f4a092982d5bef32ad14", + "description": "r600/sfn: Handle LDS output in VS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f102301cc48913a5d61e20705218e49653b737df", + "description": "r600/sfn: derive the GS from the vertex stage for a common interface", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7df2c57a207a386ba0d2130541ac9d0546670e1", + "description": "r600/sfn: extract class to handle the VS export to different stages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38038b369f92aa692188d88e7d14a8c5de209acc", + "description": "r600/sfn: Move some shader base methods to the public interface", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93f5f9e584b3d39fc120d91b1e6276bc18e917d3", + "description": "r600/sfn: Add methods to valuepool to get a vector of values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7cbca9cf64840627afa3f1de588442c5c2d96028", + "description": "r600/sfn: Move emission of barrier from compute shader to shader base", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46a3033b43b9b51cae5c60eea39e7e5af325c4db", + "description": "r600/sfn: Emit some LDS instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a122303711107a72dbc5ec84b8369fd4732f6c9c", + "description": "r600/sfn: Handle umul24 and umad24", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e064659cb9665bb5d78c28156d146e4f8172a33", + "description": "r600/sfn: Add IR instruction to fetch the TESS parameters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "075ea32e485252f0376ee7bbc84ed436e9eb4b65", + "description": "r600/sfn: Add TF write instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "230beac5f8e5366082791b7b505583a5455e5495", + "description": "r600/sfn: Add LDS instruction to assembly conversion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9d175bed260995affc4aea0b511f8b1f0c1440d", + "description": "r600/sfn: Add LDS IO instructions to r600 IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "172868167ea250a077873d9df058692cc8c4cabc", + "description": "r600/sfn: Don't emit inline constants in the r600 IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bc6c135acd59a48d35ce6d7fb619e064af04239", + "description": "r600/sfn: simplify UBO lowering pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "096a026354cb31a1c858e5ffe18ba9a2a02a9d65", + "description": "r600: Handle texcoord semantics in LDS index evaluation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d476a1360d3615f510719174887b9004dfdff48", + "description": "ci: bare-metal: power down device after tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4cc116339f4eec9ddeba2b2efc9b07f06ae6d40", + "description": "panfrost: Fix GL_EXT_vertex_array_bgra", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e135ca2271b12793cc7e3f36d123f4ff0917cc1", + "description": "ci: add llvmpipe paths to virgl rules", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a0a6a718035e1a754972fbbad8b91d19f39fa42", + "description": "radv: do not expose GTT as device local memory mostly for APUs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a523baa00fcf12dabd2e7b054ce73ac238c11a7", + "description": "gallium/swr: Fix LLVM 11 compilation issues", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5082ac007d1758fdbe516649a1b28363ca32456c", + "description": "ci/freedreno: Add a test run of a few driver options.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8c66aeb9341d695c79a2d69935016919c42f843", + "description": "ci: Clean up some excessive use of pipes in dEQP results processing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "951e101fec2dba93e146a4d78d11d7cc6594985d", + "description": "ci: Allow namespacing of dEQP run results files.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69c8dfd49f565283f599b3be9af3f1327ea78803", + "description": "freedreno: Fix calculation of the const buffer cmdstream size.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b4df115d3f3c7e5a1f6105c3737d0fe9a80dfcf2" + }, + { + "sha": "8b221e091427a749499179511d1c8438fd0dcd64", + "description": "ci: Add sanity checking that dEQP gets the expected GL_RENDERER.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9e6a3ecc7de79dbaa8899d77c3b86fd303b513a", + "description": "ci: Enable --compact-display false on all dEQP runs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acc56300dcc0b026d219181854ae475dfc0e926e", + "description": "zink: explicitly unref old fb object when setting new one", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3f0022a43a726af60bcd541162f8005610ba3ff", + "description": "zink: remove framebuffer cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afd9274d4801d93dfaad13591d65c135d1c9e466", + "description": "st/dri: Set next in template instead of after creation. (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1e453f504addc6c1c8b270803c85db394356770", + "description": "mesa/st: call _mesa_initialize() early", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57f4c66028786d0efc4074811db79b784c11f9b8", + "description": "mesa/main: one_time_init() -> _mesa_initialize()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ff94735c93b8fcc2a3e5c6eb668b069ac4e0a1a", + "description": "mesa/main: Do not pass context to one_time_init", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac9d30431e2c670ae134e2619be9817a99101e1d", + "description": "mesa/main: do not init remap-table per api", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bc98778a4792e260e1f6d9000fda54734a48860", + "description": "mesa/main: do not pass context to one-time extension init", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05c69752cfda9e66130c64c01f0c8fac613c83ad", + "description": "mesa/main: do not store unrecognized extensions in context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bc5b2d169d3a3c9d52fc30987eaba52e3b7dc00", + "description": "vulkan: add initial device selection layer. (v6.1)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a42a5058564a1d862e29eee80925ecd8b0ed1a2", + "description": "freedreno/ir3: Add support for disasm of cat2 float32 immediates.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "292231596b24a65861d6f63c1886131b33102527", + "description": "freedreno/ir3: Refactor out print_reg_src().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3bcf819b438938e500acbf602cbacc00a864181f", + "description": "freedreno/ir3: Convert remaining disasm src prints to reginfo.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1462b00391711ff86350e802c1a3e5075a3a1632", + "description": "freedreno/ir3: Add a unit test for our disassembler.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90984ba853297993221027c49ce31959c1634790", + "description": "freedreno/ir3: Print a space after nop counts, like qcom's disasm.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "916629f9d70d479aa1829e631792bf9ddd61004c", + "description": "freedreno/ir3: Fix the disasm of half-float STG dests.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c01152c92cca2b8133e5a116335d0ef3a6cd474", + "description": "ci: Enable GLES 3.1 testing on db820c (a530).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b34ee185f44c6d473e4e343d1e9f406a25dae67f", + "description": "freedreno: Fix derivatives without texturing on a3xx-a5xx.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa49a5032f33802fc136ba7095edaf06df1efa33", + "description": "ci: Enable GLES3 testing on db410c/db820c (freedreno a306 and a530).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c259b3ea128405d2e6cf9c831061298959abeb79", + "description": "ci: Drop redundant freedreno stage specification.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "065068c66aba9b32ac4c65ac91549360b5eb1b7b", + "description": "freedreno/ir3: run nir_lower_pack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42093bb694bdcb6fb5bb762c118520c107456f4a", + "description": "nir: add pack_32_2x16_split/unpack_32_2x16_split lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cbeda7f78e36caa7e4ca775bd848e1c8d38ee5d7", + "description": "radv: Add WSI buffers to BO list only if they can be used.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a61f2a8a9ca17e2d53dded9c1c490c890aa4a74", + "description": "vulkan/wsi: Add callback to set ownership of buffer.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42b1696ef627a5bfee29911a780fa0a4dbf04610", + "description": "ac,radeonsi: fix compilations issues with LLVM 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52aa730d07618513d6c055618069b2f4680974cc", + "description": "gallium/gallivm: remove unused header include for newer LLVM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e2a7436dd10df70ba14d18ab7cf8ad538f80e653", + "description": "gallium/gallivm: fix compilation issues with llvm 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6943eda5c928a323019d48a6d2f401c74a88fb7d", + "description": "ir3: Use shared mediump output lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42c9bbaeed6c814981d7100afda05ab942d88bee", + "description": "nir: Move nir_lower_mediump_outputs from ir3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba8f7f3fa2c62ba8cc31dda5915b4e2a58eef00a", + "description": "nir/algebraic: Detect some kinds of malformed variable names", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc4eb0714cd6ddf3aaeb865ef0694fc6596f6d56", + "description": "pan/bi: Implement 16-bit COMBINE lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "280b65126e8159e62517828eda12ed3789078aff", + "description": "pan/bi: Fix RA wrt 16-bit swizzles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64c33a459fa9a5c8b49a967c0fee75d5f80317d3", + "description": "pan/bit: Add SELECT tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23ffaa16c7e227cd4d82350c223f965c0c8dac8c", + "description": "pan/bit: Interpret BI_SELECT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5bfe591963e2c814cd59bee52e2c1fddbe9686d", + "description": "pan/bi: Force BI_SELECT arguments scalar", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c12081dca1942bce675e73b3604ecb5955a63087", + "description": "pan/bi: Pack ADD SEL16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d31e4879f0b0034f817e60acdd05574ed610e56b", + "description": "pan/bi: Pack FMA SEL8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b31f04bacfdb5420bc953ecdff0591058574f44", + "description": "pan/bi: Pack FMA SEL16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee561f0e6b8be3adeac4306234d7ff5027078e5c", + "description": "pan/bi: Rename BI_SWIZZLE to BI_SELECT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2c6cf2b6db11eb2293f59b42dfeb3d7481477b0", + "description": "pan/bi: Eliminate writemasks in the IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1622478fbdc885d05d43702c14b8d0b4a0e39fe3", + "description": "pan/bi: Fix ADD.v4i8 opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de12311431a6623eb4239fa6c566025174cb016f", + "description": "pan/bi: Add missing BI_VECTOR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "667190d38a7afeeef39889a933b08c348503c071", + "description": "pan/bi: Assign blend descriptor for BLEND op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a8f1a324a22bcd99b31482002d4b380ffb7cb34", + "description": "pan/bi: Passthrough blend types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f953b8f5076ad041af05bb1dce5bcf90297a3ac", + "description": "pan/bi: Passthrough type for ATEST", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "462af10bb785fa99b082207229cd39313ab08773", + "description": "pan/bi: Pack fp16 ATEST", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c50bbfa0ab513a771167b3885fdbb2b5c75d2384", + "description": "mesa: Skip 3-byte array formats in _mesa_array_format_flip_channels", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad5da3e63ee368e3fa420d4785c698273614683b", + "description": "mesa: replace GLenum target with gl_shader_stage in NewProgram", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "531728d6cbf0eb7d87698b9f03d0083ca0e2a7c0", + "description": "drm-uapi,radv,radeonsi: Add amdgpu_drm.h header.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03ba57c6c53214b19aa0fdb66c680f2cadc3bbd9", + "description": "mesa: extend _mesa_bind_vertex_buffer to take ownership of the buffer reference", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9afe045cf5382993da7d31c0bf340def7b97107", + "description": "mesa: add offset_is_int32 param into _mesa_bind_vertex_buffer for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8223244c38ad1c433b33b37bff3f19e7a9d2cfc", + "description": "mesa: add Const.BufferCreateMapUnsynchronizedThreadSafe & MESA_MAP_THREAD_SAFE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19eb89b0f3da8c888177da8e1a59c10ea0abfb34", + "description": "gallium: add PIPE_CAP_MAP_UNSYNCHRONIZED_THREAD_SAFE for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62154658426264c3f4dc5666ea04fc3fdd3d340a", + "description": "glthread: sort variables in marshal structures to pack them optimally", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f8a387b37b3e1cd0374e1f42bc19601174c86d8", + "description": "glthread: use GLenum16 in batch buffers to save space", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6b1ab8d548252f99df6c86cb124faa95abda26f", + "description": "glthread: reduce dereferences of the next batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc4b78f4cc31aa74054933ed65aae5712109bc4e", + "description": "glthread: use 32-bit align instead of 64-bit ALIGN", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "41671ec544dc76e7eb926910e3db69803550c3aa", + "description": "mesa: remove exec=\"dynamic\" from Draw functions that are not really dynamic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00b57915414da01868ff40ecacfe61db9af0d9c5", + "description": "mesa: reset primitive restart state in glClientAttribDefaultEXT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee0263e03fbc897767bf8b787dc0cc917481e826", + "description": "mesa: replace _NEW_EVAL with vbo_exec_update_eval_maps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf2f3c27533d8721abed4cdd4dfb00d4d53e8a0f", + "description": "ac: reassociate FP expressions for inexact instructions for radeonsi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b9370cb0f3a2d9030e827f847f66bdefeaf08fd", + "description": "ac: generate FMA for inexact instructions for radeonsi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2c2a28073dd4ea0d104f284e874d136880a91a5", + "description": "ac: update and document fast math flags used by radeonsi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3bb65c06706a58b124e4a600eeb35a7b3c5c3a23", + "description": "ac: force enable -structurizecfg-skip-uniform-regions for LLVM 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eeab9c93db84e5759145891e8fdde66a5cdcf917", + "description": "st/mesa: Treat vertex inputs absent in inputMapping as zero in mesa_to_tgsi", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d684fb37bfbc47d098158cb03c0672119a4469fe" + }, + { + "sha": "b785ad5853b1f75b2fd0280530e77cb63e71e8e8", + "description": "gitlab-ci: add lists of expected failures for RADV CI", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "574196d5f6f4769c7c5b8e126226dbda4b2f4df9", + "description": "radv: fix robust_buffer_access if enabled via VkPhysicalDeviceFeatures2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8faa0e2c1b295d271a5ca98ac9c46462a1522524", + "description": "gallivm: fix stencil border", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "565df656513acec8c2d5fe915c51b4b901265fa7", + "description": "llvmpipe: clamp color storage for integer types.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "024b5dfc1c3eb7255bbec975d57d4002458096bd", + "description": "llvmpipe: enable stencil only formats. (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65906d133130df5308b32c3fc92fb8690d231abc", + "description": "llvmpipe/setup: add point size clamping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f071db43a676c17c1765434ed0abfcf3d659815", + "description": "llvmpipe: fix d32 unorm depth conversions.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe5a8e1ace61cead276d0293c595536b1b9e48c8", + "description": "draw/tess: fix TES patch vertices in.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b4a7a111754ff0849a1b7a131d359ea0b0cb847", + "description": "llvmpipe: fix ssbo alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93b8d8927579baf89bb6de58f4d3c8b11b142802", + "description": "llvmpipe: bump max images to 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1c006204ff9fc32e66749b4cba9ea33b2c1af0a", + "description": "util/indirect: handle stride less than number of parameters.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23efd323aadaec5370aa9eedf3e8c76c5fe204f4", + "description": "gallivm/nir: add helper invocation support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "13e5f331db77b8c353981469a95e5557e3e2073a", + "description": "gallivm/nir: fix image store conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf3c9d27706dc2362b81aad12eec1f7e48e53ddd", + "description": "tu: Don't invert point coords", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "180f98678f4329355a3abc8c1dc060b76b5afa15", + "description": "ir3: Remove VARYING_SLOT_PNTC remapping hack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "662e9c180176e906406c65871b4c090702ed4c99", + "description": "st/nir: Fix assigning PointCoord location with !PIPE_CAP_TEXCOORD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a64d2661340a659bdd0b729090b3aa6c135e8b4c", + "description": "freedreno/a6xx: Implement PrimID passthrough", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a661d18a394aac9d6e734f93e232d6c6ea069f94", + "description": "tu: Implement PrimID passthrough", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f9839907a8eee15f634ff95577fbe498f1b70c2", + "description": "ir3: Skip missing VS outputs in VS out map when linking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc530858c1e6adb761fca163f49432fbc71348b9", + "description": "freedreno/a6xx: Document PrimID passthrough registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b44582394664087cc8d558f6c928f14e15f4616", + "description": "radv: Pass logical device to si_emit_graphics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf542484ea87290f57e67558bd6ace165d8eb4a2", + "description": "freedreno/ir3: Print @tex write mask using 0x%x", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c801228f0d0567842d20da655e252c1e617b963a", + "description": "freedreno/ir3: Reset lex line number when we start parsing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34e7179dfa2217d003e398cae6f797a0b2dd6aee", + "description": "freedreno/ir3: Parse, but ignore @in, @out and @tex headers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da467817e3e25d201e94326ff876374093a3ba22", + "description": "freedreno/ir3: Move ir3 assembler to backend compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "869d86e664a02de0dc9a7f31defaffaa8ef07f91", + "description": "freedreno/computerator: Decouple ir3 assembler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "375c7a3863d141491bd81ca96605c709af118074", + "description": "Revert \"meson,ci: Disable sparse_array tests on windows\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "6be65b077743fc80efe061b1e05cb13b2ff1a6b1" + }, + { + "sha": "cb055c6ca450768ded778e5a6797dd82b0a3d780", + "description": "gitlab-ci: install winehq-stable to get 5.0 instead of 4.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8ccd63911dcec731d64c72a40ec6a3afe38eaa1", + "description": "etnaviv: Fix depth stencil ops on GC880/GC2000", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c5c2dd48fe0910dc79d3187bed99a52b5ed2848", + "description": "radv/aco: enable 8/16-bit storage and int8/int16 on GFX8+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eeccb1a941e258190b5ba7a425f65599873f92ed", + "description": "aco: lower 8/16-bit integer arithmetic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bcd9467d5cc11709d8c6fcbbcebd90f7e5255533", + "description": "aco: improve sub-dword emit_split_vector() with sgprs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3dc1441f01e575d612fdbf900518a3a81df94ac", + "description": "aco: clobber scc in s_bfe_u32 in get_alu_src()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78389f4cbcf5b8c749dd60d9aed2b56a27b09327", + "description": "aco: handle undef p_create_vector operands in the optimizer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "deea4b7c5aae064145f788cb408001a40526a18d", + "description": "aco: vectorize global loads/stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7db72066311bc3b2e536aabe52369415d5f03958", + "description": "aco: allow 8/16-bit shared loads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48b7beb7b049af8ef4178303caa0a77121ae426d", + "description": "aco: add and use get_buffer_store_op() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "936b70c8cfe37a51a568a211acbc49b5fe997e00", + "description": "aco: refactor visit_store_scratch() to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18817041f7a62806d53abee681d8eaaffcb87834", + "description": "aco: refactor visit_store_global() to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7bd69b3ae9032558532f5af8b1c25ec136293c3", + "description": "aco: refactor visit_store_ssbo() to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f75c830433977a2a6f442457b5b3ccd5f712994a", + "description": "aco: refactor store_vmem_mubuf() to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "98b4cc7110d47ae2c8bab7c530cad757300a5323", + "description": "aco: refactor store_lds() to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "562353e1f1246bfe0f70315083b51d26d60d994b", + "description": "aco: add helpers for splitting stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "211a9f2057321e6a3500d9b7873085621604e336", + "description": "aco: use emit_load helper for VMEM/SMEM loads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57e6886f981ca629a863544df253b9ecb3604eec", + "description": "aco: refactor load_lds to use new helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "542733dbbf7ae49e2df81da6dde31aa2dcd9afe8", + "description": "aco: add emit_load helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b77d638e1bacfdaffd010b72264ab4c0a5745e73", + "description": "aco: add and use RegClass::get() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69b92db131b00fcfb64d77f876e51770d2f0aa5b", + "description": "aco: be more careful about using SMEM for load_global", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03568249f92eb1def932696e6ddc83fa305a7083", + "description": "radv: allocate larger shader memory slabs if needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51363bd4751c2f8e388c229fe91507a7a181517c", + "description": "radv: align buffer descriptor sizes to dword", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62ff2ff8086fd3bbff02004628e0c7498fe3294e", + "description": "aco: Move s_setprio to correct place after the gs_alloc_req.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "277f37d036159d373ec9726fe00148d5e49da875", + "description": "aco: Use 24-bit multiplication for NGG wave id and thread id.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eafc1e7365ec52d7cb979396ff977d6301cb4b7f", + "description": "aco: Use 24-bit multiplication in TCS I/O", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64332a0937f731fe7b090bee7d3e9f813e341e5b", + "description": "aco: Const correctness for aco_print_ir.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c0691d43eb1fd6071e6f7fe535242206cb1706f", + "description": "aco: Const correctness for get_barrier_interaction.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f321dc33c82635b5190845e59553ccb3ccfd332f", + "description": "aco: Abort when RA can't find a register.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2e7aee2440943f13d221fddcb3aabbb5bc3c59a", + "description": "aco: Increase barrier_count to 7 to include barrier_barrier.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25775d346c5bd91c8def493ee6e76d1c8e44b059", + "description": "aco: Only store TCS outputs to VMEM when they are read by TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b779d05d711dedb32f6aca85ba4f9b28be78e7ea", + "description": "radv: Add inputs read by TES to radv_shader_info.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3ef0275c49845f91a3f5d97088954a6d9b877d2", + "description": "turnip: add adreno 650", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa3624b8ab7815e7ac54ba656d4e8ffa6ae25e03", + "description": "turnip: use RESOLVE_TS event", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f81e56c9a06c06780139bc4a19befea73aa10144", + "description": "turnip: remove unused RB_UNKNOWN_8E04_blit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c68313868921f8d7125e46091cd92dbe00f845ec", + "description": "zink: set UBO alignments in nir_intrinsic_load_uniform lowering", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "fb64954d9dd55b45239c27af122bf60c3962d006" + }, + { + "sha": "155033bbb3a577945f75ee8b67999f6393580531", + "description": "freedreno: allow FMT6_8_UNORM as a UBWC format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c2a11430e101b820ce8d605f3cc5b0593bb4c5e", + "description": "spirv: Rewrite CFG construction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80ffbe915fe1cb0b8229d349e2d02f56d17c3a19", + "description": "anv: Add support for HiZ+CCS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "752eefdb3db18389dba56dd7c4f9ca45ebe8fadd", + "description": "intel/isl: Refactor isl_surf_get_ccs_surf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3eb1993625cf14695c352b5996c99bcad041daf2", + "description": "intel/isl: Delete a misleading comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "483a1d5e6c083e76985bb86c3aeae1e4e3b50d40", + "description": "anv/cmd_buffer: Move anv_image_init_aux_tt higher", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65e541ab16a156b0128e4c6917af86d0a6be264c", + "description": "anv: Simplify a case in layout_to_aux_usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cb6c5d11d3eee083bb16942cb294434626cc14c", + "description": "intel/blorp: Allow more HiZ usages in hiz_clear_depth_stencil", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d91dae7f0855aa6cd36d247ee126f7030f75137", + "description": "anv: Generalize some aux usage checks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86ded00c4021e015ac38a3274309414b2e0b6caa", + "description": "anv/blorp: Do less hard-coding of aux usages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54b525caf0aa9966f5c0aa359709f43038bbd5ca", + "description": "anv: Rework anv_layout_to_aux_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb0cede5866f3cbd4b3e956b9670d61822b86e15", + "description": "anv: Be more conservative about image view usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2f3576d335c09313216ee052bf7570da8a61ff1", + "description": "anv: Move vk_image_layout_is_read_only higher", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5de9f4409a6d885755fc9885403423f4ff0650dd", + "description": "anv: Add a vk_image_layout_to_usage_flags helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3ab86c599d50c5b2f440430b10d6700ae0bf0f4", + "description": "anv: Enable HiZ on multi-layer depth buffers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "709f26c47df758cd3d3952c5a9edc40053ffded9", + "description": "etnaviv: support for using generic blit path", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b043c40edd36f9fc1d84dcfb3ab1c2b8de5a6035", + "description": "etnaviv: call util_blitter_save_fragment_constant_buffer_slot(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e731740388ca2822ad6a9a09f49be2d05d0a5c38", + "description": "etnaviv: drop default state for FE_HALTI5_ID_CONFIG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b0a732db33fb2d49c86e372ec9b127dab273550", + "description": "docs/features: mark GL_ARB_texture_filter_anisotropic as done for etnaviv", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73f7f73ef30fdc8178265f8d79ff078cdd50e5cb", + "description": "freedreno/ir3: fix incorrect conversion folding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd49a404106852804544eaf695e46b2f5ccd0a0f", + "description": "freedreno/ir3: set even bit for f2f16_rtne", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edc35c1f54a0f72b6c0f01b2156c10c904459b4f", + "description": "freedreno/ir3: fix 16-bit ssbo access", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ede1c171c550a48a7957af091e7ac84e088bc6ba", + "description": "aco: fix outdated label_vec from p_create_vector labelling", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2dc550202e82c5da198ad0a416a5d24dd89addd8" + }, + { + "sha": "fdf9b674ee3a17c98fd266750dec3475910542f6", + "description": "nir/lower_subgroups: Mask off unused bits in ballot ops", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d10de2530976ed3aba9c5d077e2edb141f71e7dd" + }, + { + "sha": "9c009da208b77496011f149fd1e289656da0f226", + "description": "anv: Drop an assert", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4ef3f7e3d37ece7b4339870282cb52c5e334a68d" + }, + { + "sha": "b520a58cc1434fdc6bf7f9fd9b68c74ebad04ef2", + "description": "radeonsi: use pipe_blend_state::max_rt to update fewer blend registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4fd8f19192695e5ae7d079e2cd42b610a22265e", + "description": "ac,radeonsi: simplify checking for Navi1x chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8443b211e1ea5fad068f78a8b1f4e610be9b676", + "description": "ac: out-of-order rasterization is not supported on gfx10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e43fc003e0ed9ad5ba6e19a1132457233edba6eb", + "description": "turnip: divide cube map depth by 6", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc5c43828970b49a9c6b2d08d9fb7f46a3300ae4", + "description": "spirv: Fix passing combined image/samplers through function calls", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1a08a58025a6026aae0f73b67451198f5befb42", + "description": "nir/opt_deref: Remove certain sampler type casts", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4addfdde39070879ed8b1f08fe3bd85f2b0e392", + "description": "spirv: Use nir_const_value for spec constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6211e79ba5f4be57c088fdf6140854f67c9a37ec", + "description": "turnip: Properly handle all sizes of specialization constants", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4885df9f82130132fe361a547b9e61c96787d61", + "description": "radv: Properly handle all sizes of specialization constants", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a44e63398b045f0a5f56e4d719d25a8501ab53cd", + "description": "anv: Properly handle all sizes of specialization constants", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64e4297629a1c4be501b40fb3529ff11441eff99", + "description": "spirv: Allow constants and NULLs in SpvOpConvertUToPtr", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "fb282a68bc46a1e28eaedb2670be241401f2b9da" + }, + { + "sha": "4dc7b7627671eeaefda55b21bc9a7a5a06b43c30", + "description": "anv/radv: Resolving 'GetInstanceProcAddr' should not require a valid instance", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "665250e8300e2b0f3eae27628a9a6f2666e650dd", + "description": "aco: fix v_or(s_lshl) and v_add(s_lshl) optimizations", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "d1621834f367d41500b7c1a819c046eb429fb8a6" + }, + { + "sha": "58b8fbb824f22ed7009747bdab23b919966a8d7a", + "description": "glsl: remove some duplicate code from the nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffbec55072ed952db034779e3d0505a61c833397", + "description": "glsl: some nir uniform linker fixes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9df1d92bbd5b2c6d6382e5d9bd640313fb279e1a", + "description": "drm-shim: stub syncobj wait ioctl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53f151f422180a1cb9da0f0e2e12bc95abce4ce0", + "description": "drm-shim: provide a valid fake syncobj handle at creation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00f5ea9fdc8f5a8d460767cfa8a10639646fb665", + "description": "meson: Use dependency.partial_dependency()", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "53f9131205a63fa8b282ab2a7e96c48209447da0" + }, + { + "sha": "7aaa0e59086fa2bf9c5fa7db2774cb2aa32f95b9", + "description": "etnaviv: add anisotropic filter support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d4c191572250c559f2c3574a7cb1f8a96ad878d", + "description": "etnaviv: update headers from rnndb", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d77295515ecc204dd5c5f6c6fd39e122ee5ea26", + "description": "etnaviv: anisotropic filtering is supported starting with HALTI0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ee9f851e26bb656cebd3e12f0d9bb860201b8fb", + "description": "spirv: Update the headers from latest Khronos master", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5620c3efd85f42a1301d63d55195704ee8365e5e", + "description": "spirv: Handle instruction aliases in vtn_gather_types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cba1a13fa6baa6fd55a80b5bd1fce6cbb4b12f8", + "description": "gitlab-ci: Test Virgl with traces", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a5316ee1bc2cdc8515819d575418cdc70cb58c5", + "description": "gitlab-ci: Test OpenGL ES 3.1 on virgl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b7c20b315c6bc9b54d844cfc58c80d32e9cd1a6", + "description": "gitlab-ci: Allow test jobs to add options to the dEQP invocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34ed5fff5b109f9fbd359bbaeeb818d48de5539a", + "description": "gitlab-ci: Update virglrenderer in the x86_test-gl image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3d2936a8e9e1c263e5d18b6832c238e7aa6700e", + "description": "panfrost: The texture descriptor has a pointer to a trampoline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36d49b1fb18a9b401c47d53ab75942d496c40e1c", + "description": "panfrost: Identify texture layout field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad4024968eedebc4fd05322c3346c30aa5d4d56d", + "description": "pan/decode: Remove is_zs weirdness", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e41894ba15b4150a8dfd884503ba04c2c33aab6f", + "description": "panfrost: Emit texture descriptor on bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3eb23adb50c621f49000191e6c024df01f090b7", + "description": "panfrost: Emit sampler descriptor on bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "497977bbe612cf023a1157fe2fc1d93f88ffe1f6", + "description": "panfrost: decode textures and samplers on bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0167391a1ac9d6b5a519f67a7d0fb58eef89da0d", + "description": "panfrost: Add tentative bifrost_texture_descriptor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81a31911ddcf640d602ec104f7bbc3188dad3b7b", + "description": "panfrost: Set clear_color_[12] in the extra fb desc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a0b670d633b007e0d6394919fa0afa261614d32", + "description": "panfrost: Clean up a bit the tiler structs for Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d6019302eb772b1c399cecc3f5abf1940f0c0d0", + "description": "vc4: Use NIR shader's num_outputs for generating our new output.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5593d80a2c0db362e80c7733bc4a3f2899c288bf", + "description": "freedreno/ir3: Fix sizing of the inputs/outputs array.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac937bf878c30f1879adfb31f68a4dec15d3b616", + "description": "freedreno/ir3: Fix driver_location of the added vertex_flags varying.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e82ce1852a40d2648c98317da2c0f2cf656d15c7", + "description": "gallium: Fix setup of pstipple frag coord var.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "035fd4fb9f403f281e95b63290fa6e3e6a6ee22f", + "description": "nir/lower_clip: Fix picking of unused driver locations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "91668ae8391d3e4d14f5cfe60d2755385a81a64d", + "description": "nir/lower_two_sided_color: Fix picking of new driver location.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "49ce749d0e25d957c6a38f1165b63a31baed708d", + "description": "nir: Add umad24 and umul24 opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42aa348dadeac7faf21ec8e9d8109255f2adf124", + "description": "nir: Add r600 specific intrinsics for tesselation shader IO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9add0c501c3765cae43ef60ec58404c2340991b", + "description": "drm-shim: Let the driver choose to overwrite the first render node.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a8718f01b3976e1bc82362a907befef68a7f525", + "description": "freedreno: Make the slice pitch be bytes, not pixels.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd76a24fd130bb5a45fea72a3041104ccfb4a8d1", + "description": "freedreno: Introduce a \"cpp_shift\" value for cpp divs/muls.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a6e71524dc8d6795c7d6188538c8496f2f4c025", + "description": "radv: adjust the supported subgroup stages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efdb7fa9a83b0a216b1837a5912b71669bf3f984", + "description": "anv: force whole EU array to be powered for perf queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7998371ed0208583cdffb28a6befc1134a9a27b", + "description": "intel/perf: specify sseu configuration when supported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f152ed101fbf3fad3f914a19d260c3bab556c45", + "description": "intel/perf: store default sseu configuration", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea8cb79742fb061817c11bc8ee7854d3b2583283", + "description": "include/drm-uapi: bump headers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff3f775476a907cd9410572d11e38b6c29e5e062", + "description": "radv: simplify checking for Navi1x chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d9fe0405f5520716703d61544c6d899f051aa8b", + "description": "aco: improve code for 32-bit isign", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1621834f367d41500b7c1a819c046eb429fb8a6", + "description": "aco: combine VALU and SALU into various VOP3 instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "607fb4153da08cdec6845e8505983fafa3cd63b3", + "description": "aco: move call to store_output_to_temps in store_ls_or_es_output earlier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b497b774a5008c5c424b05cdbc3f4e96a6765912", + "description": "aco: remove copy in load_input_from_temps()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2dc550202e82c5da198ad0a416a5d24dd89addd8", + "description": "aco: copy-propagate p_create_vector copies of vectors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4383b5c7f24a20ba16b0bb4f74fe5cecf406ddf", + "description": "aco: decrease the uses of other copy operations after splitting/removing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f17a0a809195cefa2240f55d30f00e3fe0572b3", + "description": "meson: correct windows-version define", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "3aee462781abc0bfcce207fb64f9199b43a57542" + }, + { + "sha": "32d871b48fbf38cb309eaaa13c8b425695141b60", + "description": "nir/algebraic: don't undo lowering of 8/16-bit comparisons to 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d792989924ce79363f181462904fa46692a99b5", + "description": "nir/lower_bit_size: fix lowering of {imul,umul}_high", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "715ef95700c06a09582744f3d873107728615b7f", + "description": "nir/lower_bit_size: fix lowering of shifts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58f25098a0dc4f4976dadacdc4e7a9db42ec0c50", + "description": "radv: Use TRUNC_COORD on samplers", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7086b38c81ebe2f0520461c1bc1a7b92863cf871", + "description": "radv: make sure to export the viewport index if FS needs it", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b424d49ac05563fd7d9f217fae9c25fc602f4330" + }, + { + "sha": "133efa112dfa08f8f28ea2dde42a6072140cd977", + "description": "radeonsi: enable support for AlphaToCoverageDitherControlNV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ede36a2efe4d40c0ec81f465fdde761a7c335290", + "description": "mesa: add support for AlphaToCoverageDitherControlNV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d82f0572180a0d3f4bb87cf49c325439d38ab6e3", + "description": "gallium: prepare framework for supporting AlphaToCoverageDitherControlNV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "227df2a2badf0047a3e6e2c1d196aa1b26427b10", + "description": "turnip: Fix crashes when geometry shader constants aren't used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85f84ea148474554af42ca513b9cb7c43a78a738", + "description": "gallium: add # of MRT to blend state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b88778e2de3a593587e20a8d4f0363a499f91455", + "description": "mesa/st: avoid u_vbuf for GLES", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e1b57a6d964ac58e84ec4ece2951e4e643d6b1a", + "description": "mesa: avoid redundant VBO updates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "155bb74ea965e9b686a6bce89c7a77065f41755f", + "description": "nir: Actually do load/store vectorization beyond vec2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51c1c4d95a05b6eb6fce74e8d624615e4a1b38ab", + "description": "mesa: enable GL_EXT_draw_instanced for gles2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0edff5123c4f27ff0f8e35d29c2c45a230d3f939", + "description": "turnip: Skip unused regs when setting up streamout buffers", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "374406a7c420d266f920461f904864a94dc1b8c8" + }, + { + "sha": "e892733b80fb2ecf4f48787116e47b8230fcf951", + "description": "turnip : Fix wrong offset calculation for xfb buffer.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e34b0d65f9715b9376408a769e2005bb579128f8", + "description": "turnip: Implement and enable VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aff02dd76ba7d5eacda231e386945cff60d6a0c5", + "description": "turnip: make the struct slot_value of queries get 2 values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "259cae4442ba3ad1d1b8e981e47f42493d93ca86", + "description": "intel/compiler: Don't create 64-bit src1 immediates in opt_peephole_sel", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a4b36cd3dd307d75913dbc183cdf2d0c1e97ea0e" + }, + { + "sha": "4459a70a6e86ebe9e6e58510069ed8a499e792e3", + "description": "intel/compiler: Delete abs/neg handling in fsign code", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b7c47c4f7cfd0513ee2b98179cc22f402e5b3817" + }, + { + "sha": "220f0e10d8c27684b1c91c5291bdf8a8012e2a1a", + "description": "intel/compiler: Don't copy prop source mods into PICK_HIGH_32BIT", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b7c47c4f7cfd0513ee2b98179cc22f402e5b3817" + }, + { + "sha": "be33cf8ad04855d37a7f756ba46c1ee49461b908", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "defc6400e132c39955f1e79f84a33e8651a63c97", + "description": "docs: Add sha256 sums for 20.0.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c790e1c642e8bd7267c72fb218f97d90572287e1", + "description": "docs: Add relnotes for 20.0.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad460c5dd6ff3974fdc4c8887d035f9dea0ad01b", + "description": "v3d: support for textureQueryLOD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9fd180394bddbb2abd7edb39852b1419249be450", + "description": "nir: add nir_tex_instr_need_sampler helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "41bfd0812b4aef70a6acd6c4d389c722f45e29c5", + "description": "v3d/packet: fixing TMU_Config_Parameter_2 definition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9967c26ae6c5afffd238a8c33b4e97457283a9ca", + "description": "v3d/tex: Configuration Parameter 1 can be only skipped if P2 can be skipped too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0b644d9f9d9673d3fe28c8c200209f553adeda1", + "description": "v3d/tex: don't configure tmu config 1 if not needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c552b5fd1d106adc04f62fcbe71d650a9a17f7e0", + "description": "turnip: implement VK_EXT_sampler_filter_minmax", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a77e2ac835e5a86965f61a1d628671cf7535890b", + "description": "turnip: enable cube arrays", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9daeb504543ccf3851ed058a860ada7d84de6f22", + "description": "turnip: implement VK_EXT_filter_cubic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a92d2e11095d9f1f8bc1188fd3d2b8391acc4591", + "description": "turnip: implement VK_EXT_sample_locations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83b2f1d8cf53ec7f9defa2acdae313cdacb303af", + "description": "turnip: set shader key msaa field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36e0d2f39b0264e393fd4edab7c87d3e0d5454a4", + "description": "aco: coalesce v_mad's accumulator with definition's affinities", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d000d76f13e24aae701de0d4ab43bc06c3c9b361", + "description": "aco: use upper part of gap in register file if it is beneficial for striding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d666d83be2fa03210f38b06e72410e8d8221b184", + "description": "aco: try to always find a register with stride for even sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a3c1f4f0bfbcc9ea1900891435c28df73b5afa8", + "description": "aco: stop get_reg_simple after reaching max_used_gpr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2796cb4c2481c35b9510c03dad3a5ebe65a82d51", + "description": "aco: refactor get_reg_simple() to return early on exact matches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6792e134f37323d8b56a60b4620e782fc0d673dd", + "description": "aco: don't create vector affinities for operands which are not killed or are duplicates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edc2b57ac14c6f9f3dadd3d7282e9d6ac1bc4304", + "description": "aco: allocate full register for subdword definitions if HW doesn't support it", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "97a870cf88a551cca9a1fd0773d183cddc4b2561", + "description": "aco: move attempt to find strided register into get_reg_simple()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7f97f110c4c42bac54fd37a15e265669c8a2ab8", + "description": "aco: use DefInfo in more places to simplify RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "734f86db6b44544a15dc39f91e9951f411c5207d", + "description": "aco: create and use DefInfo struct in RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b2f628da319cdbc2811ab3964d8031ca962f80c", + "description": "aco: create pseudo dummy instruction in RA to be used for live-range splits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d9f7d1d5cb149cf86667876460fdff2e395c9bb1", + "description": "aco: refactor get_reg() to also handle affinities", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c8f4ebca93f0252419996618a49d507bbfda231", + "description": "aco: refactor get_reg() to take Temp instead of RegClass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a9ed981780c98c33bf14829ef5bbe5a2c409882", + "description": "aco: simplify operand handling in RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5cce9528072ed4ca60c6191471eca9bbe2e08cc", + "description": "turnip: enable VK_FORMAT_S8_UINT as stencil format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "44c6c145daadf618607abb997f20608e820daee0", + "description": "turnip: improve GMEM load/store logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e72201c7873ea22dadf8d1775f97400a435a8b9a", + "description": "turnip: disable depth test for S8_UINT attachment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f13049f48a068b435f3dfb24c9af801475f16fdb", + "description": "aco: implement 64-bit sgpr swaps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ab45f41e08a3892138a1e9b20552621b4e18682", + "description": "aco: implement sub-dword swaps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83fdb1ed3dd13228bcb761a4a4532b67a24a682b", + "description": "aco: add VOP3P_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8fc24f9a45d649fb644ce78f954ae7b6a04e26db", + "description": "aco: fix copy statistic for 64-bit vgpr constant copy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4daa3917a38a6d18ba7cc66071342b9f7fa92f53", + "description": "ir3: Fix bug with shaders that only exit via discard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cfa60eab85bfe08098ddd014da861b9e3f6aca3", + "description": "ir3: Don't double-insert the first block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66229aa16968eb60dd631a8f48f593a4fa8478d5", + "description": "spirv: Expand workaround for OpControlBarrier on old GLSLang", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f402b7c57641dd2ab4d39032574294c03d75b595", + "description": "iris: fail screen creation when kernel support is not there", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bca97abffae221ea8cf402032538a331abf1ca8e", + "description": "gitlab-ci: add a list of excluded tests for RADV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1a12d6855cbbf4fc337df6f299c006b14f3435e", + "description": "meta,i965: Rip GL_EXT_texture_multisample_blit_scaled support out of meta", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6244f9311d0aefc806a0fa14785b3de78312bba", + "description": "panfrost: Assert on unimplemented fragcoord etc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "133c1aba051f838a7651c966bdedc1e972d3e029", + "description": "panfrost: Fix crashes with small BOs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c6952108c337e717df2607632b14275fc76b398", + "description": "pan/bi: Assert out multiple textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3551c138deb631fbe3e4710b8d4a862b79d3e360", + "description": "pan/bi: Pack TEX compact instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd5fe3b9e0265c9a14ce9117b23a214d469f67c1", + "description": "pan/bi: Generate TEX_COMPACT instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0769036a5cdbf654b8924d68a7bc3d40cf2f37be", + "description": "pan/bi: Stub out tex_compact logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f85746af3587bfaa9f9be0a12792e73f63d10258", + "description": "pan/bi: Add normal/compact/dual switch to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93be49b14bcd15b2c6e056a8499c435dc1bbd182", + "description": "pan/bi: Feed data register to BI_TEX", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76d1bb03d5301698204d39a30e7e2afba86383b7", + "description": "pan/bi: Include TEX_COMPACT f16 opcode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bfc06b10dea5a85f64a0c7d12f2c9e748c71d756", + "description": "pan/bi: Structify TEX compact", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf7b952308a349994a24ea27b56e8fc5ae37a70f", + "description": "pan/bi: Disassemble f16 dual tex", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2c735350ff0833a9a1f045a6ec8b163b01c56df", + "description": "pan/bi: Document when dual-tex is triggered", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fe41a12e307c91794c80d7284cff0d7f885b2d4", + "description": "pan/bi: Print tex_compact coordinates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "902c8731f4e16be4c66bb4280550a1c2d9d28537", + "description": "intel/compiler: Put back saturate on [iu]add_sat opcodes", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b7c47c4f7cfd0513ee2b98179cc22f402e5b3817" + }, + { + "sha": "f699bb42af2b4d3959ac04ce86f1a096dc85fe69", + "description": "panfrost: Align Android makefiles with recent changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f4a3c1ca07e63b725c6eb4f013c5ee9efb0c1a0", + "description": "freedreno/ir3: Drop handling FRAG_RESULT_DEPTH writing to .z", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eab73799d16ea142dcbcabb6d5bedf860b5c9af7", + "description": "turnip: fix GMEM resolve in CmdNextSubpass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4521aeafc8254da639feb8a4421a445a88d9fde", + "description": "gitlab-ci: adapt query_traces_yaml to gitlab specific changes", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "acf7e73be54c7f1cc52fcc9be38a9df26849200e" + }, + { + "sha": "0a884d730455c3faf1ea48d4693c14f9f1e0c869", + "description": "egl: simplify client/platform extension handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "013d9e40feed336d983b728357e4ce77b871c36d", + "description": "mesa/gallium: do not use enum for bit-allocated member", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b1c4c4c7f53cde52a8f3b3ec17cd66b89bf04199" + }, + { + "sha": "a842dc154d3cac5af7ff30f7d0501ae42a1d1d7b", + "description": "util/ralloc: fix ralloc alignment on Win64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7c47c4f7cfd0513ee2b98179cc22f402e5b3817", + "description": "intel/compiler: Drop nir_lower_to_source_mods() and related handling.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fdd0ce12ac88e433c7712acd5226fa07dc870057", + "description": "meson: update llvm dependency logic for meson 0.54.0", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "821cf6942a390f5f64d8a2cff9933b24c84f7dc1" + }, + { + "sha": "8e3696137f2cb7b4f5a3824f26186ecbb06f9282", + "description": "remove final imports.h and imports.c bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "289f02d1d5990e052e21eb250f6d40b47d6eb12f", + "description": "dri/nouveau: replace assert with unreachable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3db0936ef3077b656d55208dd5ae10437ddc2f7", + "description": "mesa: move ADD_POINTERS to macros.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf188f3494b85f550a39dddbf059669c2a9ee324", + "description": "mesa|mapi: replace _mesa_[v]snprintf with [v]snprintf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c495c3af26b73cb1d444fdd67cc6c1f0226bd168", + "description": "replace imports memory functions with utils memory functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb560f2d653dd5d080c7f03859936ce50bea5b72", + "description": "util: Add an aligned realloc function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b85775900d084e3d27f269c3bd336b9aa356b98d", + "description": "replace malloc macros in imports.h with u_memory.h versions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ee6e78a8716eed09a088dad2d6153373423a565", + "description": "Replace IS_INF_OR_NAN with util_is_inf_or_nan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "369f00259113d5c157b88d52bd002d292c21fedf", + "description": "move windows strtok_r define to u_string", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53c36dfcfe3eb3749a53267f054870280afb0d71", + "description": "replace IROUND with util functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df3ce8fb77e718ee4371fe7ca9f4a7c889319efb", + "description": "mesa/main: remove unused IROUNDD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64014c8302da52480643cb7711298153cfdb0d51", + "description": "Replace IROUND_POS with _mesa_roundevenf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d9a2819ee0e862f60abf50ba239a341b664845a", + "description": "replace IFLOOR with util_ifloor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72acb66527df6f38c7b8b15fa5062a616d67074b", + "description": "u_math: add x86 optimized version of ifloor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd4e769515345a6b20562310334bc828c0bb6605", + "description": "replace LOG2 with util_fast_log2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8e4542bad7dd9bb97b2990947ef74dbb2ee75e4", + "description": "replace _mesa_logbase2 with util_logbase2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e190e8cef2eaeabc16dda0cbd56addcd81968834", + "description": "replace _mesa_next_pow_two_* with util_next_power_of_two_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e533fad182e7069ee0850154f51316dd779501c3", + "description": "replace _mesa_is_pow_two with util_is_power_of_two_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1e7c1f4224789f0bc4cc847cecde350e2c6d2f2", + "description": "freedreno/drm-shim: Add support for faking other adreno chips.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc239207463916e992367e9f53351883bf82ea06", + "description": "r600/sfn: use new temp register allocation when loading single value temporaries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50b66622f19aa5e3d7c393e9bbff847d16d788de", + "description": "r600/sfn: Count only literals that are not inline to split instruction groups", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c7ce4d76e7b772e9d51dda2532a94d69bd4bee1", + "description": "r600/sfn: Fix using the result of a fetch instruction in next fetch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67495ff9aa6bed9bce37a064b33ef561809fc35c", + "description": "r600/sfn: Fix handling of GS inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58d6cda5f523639f016feddb5d98382f6a160ea5", + "description": "r600/sfn: Handle b2b1 like it was a mov", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de7ea88ff852080429c46da332f38224e01e0e36", + "description": "r600/sfn: Fix null pointer deref in live range evalation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d10e3ec6066239d732d19f69cd95da447e73e32", + "description": "r600/nir: Pin interpolation results to channel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e036fef1f48d9946385b7fc13ee64e613e2264d", + "description": "r600/sfn: Implementing instructions blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b51ced7306ea18df1c5ded700608f01db4f01e6d", + "description": "r600/sfn: Fix setting alignments when lowering UBOs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc9cf6adff663889c3816b590e0b045956164ab0", + "description": "r600/sfn: Reduce array limit for scratch usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fdc75d1c6d84533c8488c712e53e7828de41456", + "description": "r600: Dump a few more variables when requested", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f06e4ab3190a0a715447c4df4017892adb8708dc", + "description": "anv/android: fix assert in anv_import_ahw_memory", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "c79a528d2b58dfa6f3d856271cbf29697a5322a1" + }, + { + "sha": "829013d0cad0fa2513b32ae07cf8d745f6e5c62d", + "description": "st/mesa: Re-assign vs in locations after updating nir info for ffvp/ARB_vp", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d684fb37bfbc47d098158cb03c0672119a4469fe" + }, + { + "sha": "ae169f38cead48a669d39fcd4ab7747da56e19c2", + "description": "tu: Fix the advertised maxFragmentInputComponents", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45ec9c0f3de7795c1cb910718749ad828368ca8a", + "description": "freedreno/a6xx: Expand various varying-count bitfields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d29fea77b9182915c1689634ff2376ac3c8fc21b", + "description": "docs: remove outdated sentence", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "d4cb9ef8260ced0a5693db18dedbdd11cbcfa7e6" + }, + { + "sha": "56f174d14eb4bdac07ae34b0538cfcc217d7eca5", + "description": "st/omx: fix gcc warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07071cac7b97b20ba3b5a633171af7ac8ac09d00", + "description": "gallium/utils: silence strncpy warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbfeec62c3852293d5f029db73ff7d63ff0f14e5", + "description": "mesa: fix crash in find_value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c43b8ce1b82f41e03147f824e87195ca8f1cb49", + "description": "nir: Delete the fnoise opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4386c06770508d86eaa51839871767887f903d1a", + "description": "glsl: Hard-code noise to zero in builtin_functions.cpp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95f555a93a8891ebba2a291eecd984eb2364d636", + "description": "st/glsl_to_nir: make use of nir linker for linking uniforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f79e0f7c62d4ac34e6a4a827b5433402cf7c223", + "description": "glsl: fix gl_nir_set_uniform_initializers() for bindless textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "954644022771f45b1e6f719cab399a949a3fbb22", + "description": "glsl: add bindless support to nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57e65cabd4f030f325fce2ef3e52af77792c4b66", + "description": "pick-ui: show commit sha in the pick list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32451a15ecb6b4bb42a2b77d5125eeb1163beea2", + "description": "pick-ui: make .pick_status.json path relative to the git root instead of the script", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26a26a358484bf03a6498c4116d8cb496f668cc1", + "description": "pick-ui: compute .pick_status.json path only once", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4b36cd3dd307d75913dbc183cdf2d0c1e97ea0e", + "description": "intel/fs: Coalesce when the src live range is contained in the dst", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14b8d979db4bfde66b8c5cac00a4cbabb285b816", + "description": "intel/fs: Rename block to scan_block in can_coalesce_vars", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "064d39e620f0dfd45dbdcea798e266464f769c8f", + "description": "radv: use common nir_convert_ycbcr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7870d714598f80305d9b44a1ed12e981c67feed2", + "description": "anv: use common nir_convert_ycbcr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "71820c6b02d71e40d413e2d080e87108e64cfeeb", + "description": "nir: convert_ycbcr: preserve alpha channel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8558fb1ce770a817a16bde439dd8865931f59a2", + "description": "nir: add common convert_ycbcr for vulkan csc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2d8a4bf176618bcbf37ef93b6ca06ecac1b1856", + "description": "nir/linking: fix issue with two compact variables in a row. (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a24ab26ff72634999ebc262dbdd3404d7b68bfea", + "description": "pick-ui: auto-scroll the feedback window", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b8a99ba567314d5a83633a8ef73a5491976c67c", + "description": "bin/pick-ui: Add a new maintainer script for picking patches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0123b8f63415d3d320929e6112da2be2d837b262", + "description": "bin/gen_release_notes.py: Fix version detection for .0 release", + "nominated": true, + "nomination_type": 1, + "resolution": 2, + "master_sha": null, + "because_sha": "3226b12a09bbcbd25526fd6da6257057d26ddb31" + }, + { + "sha": "4abf0837cdb14b10a58d28766d5c1d3698d8a6d8", + "description": "anv: Add support for new MMAP_OFFSET ioctl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d387da08349e1bdd222efae0657fc74009d9955", + "description": "anv: Add anv_device parameter to anv_gem_munmap.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1c1ead7cd6ea8a9025703f519b09f0e0e8c673c", + "description": "iris/bufmgr: Add support for MMAP_OFFSET ioctl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae6f06c50939b49f7c7407d9bede00a246ab1891", + "description": "i965/bufmgr: Add support for MMAP_OFFSET ioctl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bc3f52dd8c2b5acaae959ccae2e1fb7c769bb22", + "description": "iris/bufmgr: Factor out GEM_MMAP ioctl from mmap_cpu and mmap_wc.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a42d7157849832485e63850abaf341b10c952ea0", + "description": "i965/bufmgr: Factor out GEM_MMAP ioctl from mmap_cpu and mmap_wc.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16be8ff022758edf660baafb08c9084582a2ab5c", + "description": "drm-uapi: Update headers from Linux 5.7-rc1.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1f6ae4744da830b9bf584296dd9738aa3532357", + "description": "spirv: Fix propagation of OpVariable access flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c76f2292b556502018ecc591f3388516c8ded469", + "description": "intel/fs,vec4: Properly account SENDs in IVB memory fence", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3c1f4d6bcc210408f8b180727d269838b38193b", + "description": "aco: move src1 to vgpr instead of using VOP3 for VOP2 instructions during isel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be0bb7e1015ee0c5930b2aabd3e3de7c790be5e0", + "description": "aco: fix 64bit fsub", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "425558bfd595ed3a7a049ad0f47a46b8b3c4691e" + }, + { + "sha": "ad3ef6d0fc01fead5b72c6376387ecf5c48bfef4", + "description": "gitlab-ci: Test virgl driver", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6b7439619c55d317bfe05094a9f503d832c9eb7", + "description": "meson: do not disable incremental linking for debug-builds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed29b24e233a332799eed006f71540ac4c56c5ee", + "description": "gtest: Update to 1.10.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59427b6d1d07a5824272f7d6b562bcfe83d63b2b", + "description": "nir/opt_algebraic: lower 64-bit fmin3/fmax3/fmed3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eed0ace466d05e4ab07e638ac94a821788a8deaa", + "description": "nir/lower_int64: lower imin3/imax3/umin3/umax3/imed3/umed3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17acff01a00109c87d59b9d876fc735dd5fbe3d1", + "description": "radeonsi: skip vs output optimizations for some outputs", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "3ec9975555d1cc5365413ad9062f412904f944a3" + }, + { + "sha": "839818332c18a5cf59584ea3114f46aded792465", + "description": "nir/gcm: dont move movs unless we can replace them later with their src", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4e5beee8a4cc0f7a6b27ce1ea1e04d1177442a1", + "description": "nir/gcm: be more conservative about moving instructions from loops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf4a6c99d242022e6ad42af68682609401ffcd73", + "description": "nir/gcm: allow derivative dependent intrinisics to be moved earlier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50a6dd0d6534f802ca36b607ab5a453b531f4d78", + "description": "nir/gcm: Prefer the instruction's original block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4cf2df01a1f7b77660ea849ba552fd5b2d751d6", + "description": "nir/gcm: Delete dead instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dca3f351e5b70afd1a0088b829f9b45617c2e538", + "description": "nir/gcm: Add a real concept of \"progress\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b1615fdb75659cb02fcbcd684046eabdb807930", + "description": "nir/gcm: Move block choosing into a helper function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f60f1aa3d0853b8374ec384c128eb4731fe4c85", + "description": "nir/gcm: Use an array for storing the early block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6006a9e275750bf762f9f041f9078aaf8af4dd0e", + "description": "nir/gcm: Loop over blocks in pin_instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d083b52c04745d18d8f5446be5805077fa2c51a", + "description": "nir/dominance: Better handle unreachable blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "425558bfd595ed3a7a049ad0f47a46b8b3c4691e", + "description": "aco: use v_subrev_f32 for fsub with an sgpr operand in src1", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "9be4be515f2a08b9c9e5ae1fc4c5dc9a830c2337" + }, + { + "sha": "adeef43d15092a6910dceb3605f5ee3151dd2c47", + "description": "CI: Disable Lima jobs due to lab unhealthiness", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6c7bdc85138abe00bc9db355f302778926b38e5", + "description": "ci/windows: Make Chocolatey installs more reliable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec1b96fdc8bccaf2c1a4e1e3cca32b4aacbe4f7c", + "description": "nir: Lower returns correctly inside nested loops", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "79dec93ead6e3b95b1240a9d843d617a88ee9179" + }, + { + "sha": "969aeb6a93aefd037b130e4b37f58043fef493c3", + "description": "anv: Apply any needed PIPE_CONTROLs before emitting state", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffc84eac0d5a0c30e445fcdb2f0cfd0c5bf5321d", + "description": "anv: Move vb_emit setup closer to where it's used in flush_state", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06c5875fd6b8fa387a103bd0c6fad4fa5ef847a5", + "description": "Fix promotion of floats to doubles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94cb129d514b748db1342c6208ae4b7390bd33da", + "description": "ir3/ra: Fix off-by-one issues with live-range extension", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cdc43775917e301a7ca654fcebb94fad08dc4131", + "description": "util/sparse_free_list: manipulate node pointers using atomic primitives", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e4f01eca3b3cd1701f21cacbb8d29fe688ba42bb" + }, + { + "sha": "36d2a0eed69b6f584c417bdbe0ea0f4623f1b514", + "description": "glsl: only set stage ref when uniforms referenced in stage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6afd0954e12eb75a4ce19580907b1fc4145369b9", + "description": "glsl: pull mark_array_elements_referenced() out into common helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d992b539e977ac688e950866a1d872de5acec18", + "description": "glsl: fix block index in NIR uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5dbebf49822ff3fb3bc3e6123bac30214c432b77", + "description": "glsl: error check max user assignable uniform locations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7355c4fb98cd052951f323b2dd241942000ac21", + "description": "glsl: fix explicit locations for the glsl linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5442712c6dae60529ceed0ea199751da7c9dd8e7", + "description": "Revert \"glsl: fix resizing of the uniform remap table\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "e0aa0a839f9c168784a1f50013c83877cc876094" + }, + { + "sha": "723edf859ff2cf7dd0d4c2cb01a7d941af055265", + "description": "glsl: tidy up uniform storage value count code in NIR linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e2dbb6e7059f3ebe5a6e9f4633e923e29bcc705", + "description": "glsl: fix struct offsets in the nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c19ebca308f30d09949ca66cc10ea63592fe98cf", + "description": "nir: add matrix_layout to nir_variable data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f27c707585ccca0d61b0b688defb59598f37f8b1", + "description": "anv: skip writing perfcntr in results on Gen12+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "086ea1ac7ee41fbb6bd031bfdf281a7e276cfe7a", + "description": "intel/perf: Enable MDAPI queries for Gen12", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "15b7b56eb2fb418263199d5b47774cfb922fa343" + }, + { + "sha": "29fb5451a97a9c98821fa17dc8d7495c85c5e813", + "description": "pan/bit: Add fp16 min/max tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "532dfebc713bac114aa7a0f12f340f2cfae1ed52", + "description": "pan/bit: Add constants test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15fe8d5d7b0d7edc976ad39d93e76aab6ad0e929", + "description": "pan/bit: Add fexp2_fast test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20f255b18e48f61a5a0b22041c7149225f3bed64", + "description": "pan/bit: Add fexp2_fast interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8890fa4050dabee9f3ef4a44871a0bc696afb9c7", + "description": "pan/bit: Add FMA_MSCALE test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7dd5b579dadb02715029b47b284fb2979fa71c8", + "description": "pan/bit: _MSCALE interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e3960a7256cc0bd3cf8d46d9f945372d86e0d7a", + "description": "pan/bit: Add BI_TABLE test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93fffd8a11ae78bfcb67a8ff5c0cd7842cad885f", + "description": "pan/bit: Add log2 helper interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c45b58cebebf7f21035b22a98af8829fc002db0", + "description": "pan/bit: Add FMA_REDUCE test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5546d1958b78ba9d933606960a888b62edbf8d01", + "description": "pan/bit: Add BI_REDUCE_FMA interp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68b4e708f14d116518edc55017dcb8da539328fd", + "description": "pan/bit: Add frexp_log test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36cfe722e5ab6c01216f30cad6909888425d2931", + "description": "pan/bit: Add FREXP interp support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c05860789a858778d1cfcb5e8dec1168b1114cb0", + "description": "pan/bi: Lower special ops to 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d0f94103618a7f351774a2fb3208c8aefe1f315", + "description": "pan/bi: Round constants to 32-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d30df466b57771ab5b28dadf6e113f46222e1b92", + "description": "pan/bi: Dump extra bits for disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "590d66fa0c4af564f0f6f5c294500eb816c9477f", + "description": "pan/bi: Pack MAX.v2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f87403c4c134454d991c1962b746a257f5f11d00", + "description": "pan/bi: Pack ADD.v2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e76c2b806dcef8254f0039ce27d916ca6dc2bf5", + "description": "pan/bi: Structify add and min/max fp16 ADD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1647884cec58577024c0ff5e22d276b9e8e741f6", + "description": "pan/bi: Workaround constant packing errata", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d772bf01011fc41d14093892a46541dcb2b9b6db", + "description": "pan/bi: Try to reuse constants in ALU", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aba7f09902f704819f0bc718a322793b265acd64", + "description": "pan/bi: Handle st_vary with <4 components", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "438e445e17864e91704c5c8a6ed33bfefe329008", + "description": "pan/bi: Fix vec2/3 handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "031ad0ecc2d585c109cbb5a757d07bcae344b8be", + "description": "pan/bi: Implement flog2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e52206dbef63ffe91f5e58043228bb51e41192c", + "description": "pan/bi: Add fexp2 implementation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1d4d8f7432ddff50dce4151603f70c47c01c5f9", + "description": "pan/bi: Fix lower_combine swizzle rewrite", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60f252708fc7d0ad9b844c55b659e9fd3d6b5607", + "description": "pan/bi: Fix packing with low-nibble-set on hi constant", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10fb5fb460b83f0cc2eca24557fbddf32cb1d0a9", + "description": "pan/bi: Fix packing with multiple constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86c2a6b9fe885efa0366b262a226643184ca4ba4", + "description": "pan/bi: Fix bi_get_immediate with multiple imms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df69304ac829ee843cbe70b8fdd13a7a32704a48", + "description": "pan/bi: Ensure CONSTANT srcs have types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f70f4432cb05f96b184ff247cbca9f5c2954aa1", + "description": "pan/bi: Split src/dest index printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db5c1ae8fd17f3113a88029ea90d23f10df293ed", + "description": "pan/bi: Add fexp2_fast packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3eebfeb11cd57998134c0bb903c2447296f43c7", + "description": "pan/bi: Pack FMA_MSCALE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0cb703984e68c426a173df1e0c951591dca17fb8", + "description": "pan/bi: Structify FMA_MSCALE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4570c34fc76c16c149d6040883b92c30abe70531", + "description": "pan/bi: Add _MSCALE flag for FMA/ADD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3643cdd81bf2f1ad6ddf10d80e38d0ddaf9f908", + "description": "pan/bi: Add log2_help packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6039d51e32fe98c4e785b7a9039bfc066720c91a", + "description": "pan/bi: Pack ADD_FREXPM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffa9f6a7896cd1cfdfe0e8e880ab870f3da6033f", + "description": "pan/bi: Add bi_pack_fma_2src helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9904ed170a121bb15350b0882ddc6e54d13a8a8f", + "description": "pan/bi: Add frexp_log packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e067fd7b00c5a9b4ab4b60c98315344d673f5239", + "description": "pan/bi: Add log_frexpe op to IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40befaa96590bb8ce93af6c6db50f054dd021ebe", + "description": "pan/bi: Add FLOG2_U op to disassembler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62c8c3445ed50b73ab3d40d63631cad64f084f12", + "description": "pan/bi: Add op for ADD_FREXPM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc611566260eacc0d4e92d8b3dc3a8a4d7c94b6e", + "description": "pan/bi: Add special op for exp2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af01378dce1873c520c52a536ee7d1731c18105d", + "description": "pan/bi: Add BI_TABLE for fast table accesses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83d961b0c26874622a0c72cebfa40ef4952ae5d3", + "description": "pan/bi: Disable FMA scheduling for CONVERT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86c0ea383db65b5ec019143606189231bdc65066", + "description": "pan/bi: Add disasm for ADD.i8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5deed138a0b4765438135367248f1d8f0649975", + "description": "spirv,nir: Move the SPIR-V vector insert code to NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "feca4396974feb0e94603151eaebb6a7a6a22ce3", + "description": "spirv: Call nir_builder directly for vector_extract", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acaccff4d331ecc4b2794fef3f4b117abdd15abf", + "description": "nir/builder: Handle any bit-size selector in nir_extract", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b160c67764b3f0d0a843f7542d6079aa3a7d8f2", + "description": "spirv: Error if OpCompositeInsert/Extract has OOB indices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c478f8ad6cf57425ffdae56a7c18b62b27985ea4", + "description": "spirv,nir: Add a better vector_insert", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "380bf556bfe34357f802dc49e1e104dc8fdf951a", + "description": "spirv: Handle OOB vector extract operations", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc3a17997b479f91f36b2421e9c41c11a025de47", + "description": "util/sparse_array: don't stomp head's counter on pop operations", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e4f01eca3b3cd1701f21cacbb8d29fe688ba42bb" + }, + { + "sha": "d684fb37bfbc47d098158cb03c0672119a4469fe", + "description": "st/mesa: Update shader info of ffvp/ARB_vp after translation to NIR", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8a0dd0af3f1a6c0310a08daf4220132ec6815b31" + }, + { + "sha": "c4ca9e66ddb507831b2d35e927d6310775006894", + "description": "aco: fix exporting the viewport index if the fragment shader needs it", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b424d49ac05563fd7d9f217fae9c25fc602f4330", + "description": "radv/llvm: fix exporting the viewport index if the fragment shader needs it", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19aa68ae315df7654bd81a4c293aeaf5f7a7f4a6", + "description": "radv: set missing SHARED_VGPR_CNT for NGG VS and ACO", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c24d9522daefce112b7a9d03a6d1abdf60f02656" + }, + { + "sha": "fd6e44236c07583b3b838b2c7ed01fea27002ef9", + "description": "radv: fix geometry shader primitives query with ACO on GFX10", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c24d9522daefce112b7a9d03a6d1abdf60f02656" + }, + { + "sha": "f7d620f47d53d9ad513c41730f3a24b9564e5e74", + "description": "intel/compiler: Fixup operands in fs_builder::emit() that takes array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39ad0c2af8b40c728a91bebf05b365803d68022e", + "description": "intel/compiler: CSEL can do saturate", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5afaa407c1a2a27a23f1827d72d5ebde8b7882fe", + "description": "intel/compiler: Only GE and L modifiers are commutative for SEL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a80e44902f66244d257c523afe77558cf334d624", + "description": "intel/compiler: Silence unused parameter warning in update_inst_scoreboard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c836295dfdeafe9f2a731c4dcd75d59c4494bcf3", + "description": "intel/compiler: Silence unused parameter warning in fs_live_variables::setup_one_read", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62f70a353f86e5ebd2889feed8d1223d11827312", + "description": "intel/compiler: Silence unused parameter warnings in vec4_tcs_visitor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "030e5ceac4446dfc340d447c33222730596030d3", + "description": "intel/blorp: Delete an unused enum", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0d039a4d3f49f3cc89e167b46edae54cf32a6be", + "description": "anv: Emit pushed UBO bounds checking code in the back-end compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb5a10ff63f74f9e052ecc6c7399df8e0d193345", + "description": "intel/cfg: Add first/last_block helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64e3b8d66b9d4103ff815606ff20b39246418122", + "description": "tu: Use tu_cs_add_entries() with non-render-pass secondaries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac0b8d58b963efbeadb4461b39f7d910996c2db7", + "description": "mesa: add interaction between compute derivatives and variable local sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c949b2aa6e0c27e92a05a57eda11672c2bdb157", + "description": "st/mesa: properly guard fallback_copy_texsubimage aginst failed maps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8521acd660764973940fbdc2c85334670d620b45", + "description": "radeonsi: don't assume ctx is always a threaded_context", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "dcb1e8fef8ae60877a696a5bca337eba5475085d" + }, + { + "sha": "791134658e30ed843acd68d39b06156563017d15", + "description": "Revert \"CI: Disable Windows/VS2019 builds\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "460b8b1758d953b2b820443615d73ccdb1455b5e" + }, + { + "sha": "2f009c4b494f5b158525e7ebc01c280f54227402", + "description": "docs: update for recently-added nvc0 features", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ae214ac2eebf6a15ff41112d17940080a87d918", + "description": "nv50,nvc0: update with latest caps", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "029471c3c4eb1fde68fbebc79fdcb8e70090aab8", + "description": "intel/batch_decoder: Stop printing to stdout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8acf9a3d4af33cf8b6b8c870167c2aa348990a0", + "description": "anv: Report correct SLM size", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d787a2d05e20642a7ec52ce80a830c7795a6bdc0" + }, + { + "sha": "e003104605f506333d2ac8a9c2baf9f04eaebb81", + "description": "intel: Add _const versions of prog_data cast helpers", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b17d7caac76e1c2dd4579c198b2e32b762bb656", + "description": "nir: Add some sanity assertions in opt_large_constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33eb43349e8c3e8216f51ec838d9b9ab23361873", + "description": "nir: Add an alignment to nir_intrinsic_load_constant", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f1883cc73d4ea2c6d3a73dfe55c8b346f3ef8ac6" + }, + { + "sha": "8cbeb13704a59034ffe19a7ffef7b3856a1733e8", + "description": "clover: Check if the detected clang libraries are usable", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ff1a3a00cb37d84ab9a563f0aa241714876f56b4" + }, + { + "sha": "839c886b346e0f68707804e17e9088d2e166e6d6", + "description": "aco: add missing scc clobber to nir_op_unpack_32_2x16_split_y", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac74367befcf51917025f9fe2ce1dc431c2875fd", + "description": "aco: implement various 8/16-bit conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0443a4a0af1ac76ec5284cd47c4860b3c5853cd8", + "description": "iris: Enable EXT_depth_bounds_test extension.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "daeff19608e8ef1937167ffa8b086bba5eb3f60e", + "description": "meson: tell flex that we support c99", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0752648a993c6f0fae2f4a072079000b9d84d074", + "description": "vbo: avoid including wingdi.h on win32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b55b033f7624171d82ae1f79f3d3ad058ae0ac56", + "description": "mesa: fixup cast expression", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c55fc3543537fdc7b402584a076eb87d258b4c51", + "description": "util/tests: initialize variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "522bb081314131340804a160d5d53f2fbe9a10be", + "description": "wgl: silence some cast-warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9ad8af6f377a3afb5cbfb8d1844887af6894ea7", + "description": "meson: use override_options to change warning-level", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24378086710bae7eb7b6d0cb4ec0e718d36ba32c", + "description": "turnip: image_view rework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "300d0e2b809644262481c30cf205761abd2234bb", + "description": "turnip: don't limit framebuffer size to image size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6455e9a6a7dcd37483d1ff2193161c3568e74f6", + "description": "turnip: compute render_components/srgb_cntl at renderpass creation time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d80fb024302aa6058945826a79ba0caf9611fcc1", + "description": "winsys/amdgpu: Retrieve WC flags from imported buffers.", + "nominated": true, + "nomination_type": 0, + "resolution": 3, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80797edd7193409d0109d4d3378dd1b8d2597f80", + "description": "st/mesa: fix a crash due to passing a draw vertex shader into the driver", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "bc99b22a305be5e5a5f8fdbf57105d220cf4b0ab" + }, + { + "sha": "7a794b1de40df87edc359a36d14e214daa1bfe60", + "description": "CI: Disable Windows/VS2019 builds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ecd9463de6ba7a9f0648da16ef9371dd8a202f0", + "description": "meson: Make shared-llvm into a tri-state boolean", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c05d46237db3095ef92d61452d886286806a827", + "description": "tu: Align GMEM resolve blit scissor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2e172c03f7170382e8d93b2ad04f3ca92b2020f", + "description": ".mailmap: add an alias for Zhongmin Wu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eafacdc0fa02039cb2d2aebd5911f59d9178c3c0", + "description": ".mailmap: add alias for Zhaowei Yuan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56222c13cf680362213268fbff15762f8fd1bcbf", + "description": ".mailmap: add an alias for Yaakov Selkowitz", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8be72b4c7957c48cba446563a641fae00540f613", + "description": ".mailmap: add an alias for Xavier Bouchoux", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a96e1a2d9f6c983f6ed2ee7211a267a63a19503b", + "description": ".mailmap: specify spelling for Wladimir J. van der Laan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9e1e5857d3d27d9e517fd695b332f1429d4e81f", + "description": ".mailmap: specify spelling for Vivek Kasireddy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47d17238dd5d1d7dc8a2355890d4d8581bfc89da", + "description": ".mailmap: add an alias for Varad Gautam", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55f883b8eabecb935001da088771da436e8bdb9b", + "description": ".mailmap: add an alias for Vadym Shovkoplias", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8982ce84cb7ab7f57b18bad7c7994763468e0d5", + "description": ".mailmap: add an alias for Topi Pohjolainen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0399b4f2987e56963ba22ceba0a4d7a3f51ef171", + "description": ".mailmap: add an alias for Tomasz Figa", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "049ce5f41768de6201fdfc33a085f7e9efb6d487", + "description": ".mailmap: add an alias for Tom Stellard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05b2a4471c50a3f2b02c29b4b4a9bcd96ae8e6e9", + "description": ".mailmap: add an alias for Tim Wiederhake", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e430136cc9f2219033b2ae3ea01ec5a8ac9ba11e", + "description": ".mailmap: add a couple of aliases for Timothy Arceri", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34ab507c1fc762fbbab1e9c13b6ff95c9bbb18ae", + "description": ".mailmap: add an alias for Timo Aaltonen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb177e054a27b79088a9e9fee2c2a6cbba2128d8", + "description": ".mailmap: add an alias for Thierry Reding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "032a603e3c5e4b1ae5be9d1afbcb91ccc2eacd72", + "description": ".mailmap: add a couple of aliases for Suresh Guttula", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d1fe4a68730f241d01b0eaafa0969b584bf3665", + "description": ".mailmap: add a couple of aliases for Steinar H. Gunderson", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ee82189f231551d7e25e00a50ee474bdc550955", + "description": ".mailmap: specify spelling for Sonny Jiang", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3f36056fa9fefbd769dd3cf60d7ab5702a20c5b", + "description": ".mailmap: add an alias for Sergii Romantsov", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00d9496a123a9b685b568a9d6482ad68206b9b4e", + "description": ".mailmap: add an alias for Samuel Li", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a5bd1512a44db4d315110d57a9ec7010d7121aa", + "description": ".mailmap: add an alias for Rodrigo Vivi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6fcca4bd86c8fe7b71f45291edb517bb6f19960", + "description": ".mailmap: add an alias for Rob Clark", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7c1f150c981c881805147d5e5d01a202da0e687", + "description": ".mailmap: add an alias for Renato Caldas", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3dac186704f1fc7f26d97d7778b052d902cb02ea", + "description": ".mailmap: specify spelling for Randy Xu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e904253e47014ffdaf0a3446b1212e496b24870", + "description": ".mailmap: add an alias for Qiang Yu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ba1d912a04d25d82ee636d7c3a095240b00c851", + "description": ".mailmap: add an alias for Plamena Manolova", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b42a25320e6d444f9fd90c2db12d209c4e87f286", + "description": ".mailmap: update aliases for Pierre-Eric Pelloux-Prayer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ffa511d60dbfa163e553d25cf2bcf45a569a777", + "description": ".mailmap: add an alias for Philipp Zabel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b82c50e794d8118de9291e6e04b7a2fa42bd479", + "description": ".mailmap: update aliases for Nicolai H\u00e4hnle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f40c48c0c469580ab8f56e48bfc6ba50a0f2cb23", + "description": ".mailmap: add an alias for Nicholas Bishop", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "004c69fbfa67bb7fab628c3be65d636f65e6242b", + "description": ".mailmap: specify spelling for Nian Wu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ce0e25a988680711982a28f477f95024682b567", + "description": ".mailmap: add an alias for Neil Roberts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ac6946ded222e5dbfce6acd881966bc2b496923", + "description": ".mailmap: add an alias for Neha Bhende", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9ed1085ab0ad5a2f474666496bcc1d0e52113ab", + "description": ".mailmap: add alias for Matthias Gro\u00df", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3223198b4d6b23408eee3d57beab86af71d4d160", + "description": ".mailmap: update aliases for Marc-Andr\u00e9 Lureau", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79bb330904acb945abd8e22dccfa15d77c837f12", + "description": ".mailmap: specify spelling for Liviu Prodea", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25dcbcbb5b08be140c8cf0963468273a6044df9b", + "description": ".mailmap: add an alias for Lionel Landwerlin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e53e65e23a896e258a8a4ffe123821cc2737a69", + "description": ".mailmap: add a few aliases for Kristian H\u00f8gsberg", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69489e48bc24485e01f008290e6f65313e610ddc", + "description": ".mailmap: add a few aliases for Kevin Rogovin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab4c32a50e09d447be128ef3d36c8519eeab29f0", + "description": ".mailmap: add a few aliases for Karol Herbst", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92e45975448b59c7a8d2ce6003b3711e4b29943c", + "description": ".mailmap: add an alias for Julien Isorce", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec001fd3236ce50b4d504585588e86ded4231743", + "description": ".mailmap: clean up aliases for Jeremy Huddleston", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f131469d1471063b503a9f4e5cbf155488723700", + "description": ".mailmap: add an alias for Jan Beich", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8cb7efd30e5a2ac49bbfe7c0f829549a487dffb", + "description": ".mailmap: specify spelling for James Zhu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16ed147cab84bbff4cd507ba7ab397efb016e5b1", + "description": ".mailmap: add an alias for Illia Iorin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7f912f11a09347191bb62d46fd9fe66b59f9c2e", + "description": ".mailmap: add an alias for Igor Gnatenko", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ebaa8765fe92ef247fb4f8eba6cefa0f50a342e3", + "description": ".mailmap: specify spelling for Henri Verbeet", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb96435aafa239ff0b082b8e16d78738210cb1fe", + "description": ".mailmap: specify spelling for Heinrich Fink", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "02b2dc22d30614617574ddb29a2a7008e8058d21", + "description": ".mailmap: add an alias for Harish Krupo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c86bc03b32690d55fbb4ac9ef4dbcdb8360bbda", + "description": ".mailmap: add an alias for Haihao Xiang", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ea2e1044e85d85787fea02f668cdbcc9cb2649e", + "description": ".mailmap: specify spelling for Gurchetan Singh", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "194c9f9982cff810265bc240291d75819b73637b", + "description": ".mailmap: specify spelling for Francesco Ansanelli", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edb4e21e262cb79734a5750776b6948813b2f5dc", + "description": ".mailmap: add an alias for Erik Faye-Lund", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c11e1d4408210c91ce6fd8d9fa92b47666595a41", + "description": ".mailmap: add an alias for Emmanuel Gil Peyrot", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8efee3cea374ee8179b3c25e7e78d73ac78610e2", + "description": ".mailmap: add a couple of aliases for Dylan Noblesmith", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee068df4f58bb4678aaa0c3b7d503ce1f7b4eb0e", + "description": ".mailmap: add an alias for Dylan Baker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7699f92e9e3ad486a89e3a8077bdf99e808cd97", + "description": ".mailmap: add an alias for Dave Airlie", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e793b7b4b837fae0969552b3bd8816b6901c1e5", + "description": ".mailmap: add an alias for Danylo Piliaiev", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa6ad898ba1d39ab374edcfae5afb17ada89da35", + "description": ".mailmap: add an alias for Daniel Sch\u00fcrmann", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e8764917c4bf6cbc287abf183886a608e853b405", + "description": ".mailmap: add an alias for Craig Stout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f4d26b3cd91ca87045f8b282d51f11b6735b99f", + "description": ".mailmap: specify spelling for Constantine Kharlamov", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5437ccda31b60a3f845ac1181c66c0908a3581a8", + "description": ".mailmap: add an alias for Colin McDonald", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a07c11b0fe5d343686d8ea0c1c7c896fd8b43327", + "description": ".mailmap: add a few aliases for Christoph Haag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d6af7f9b133c7c430aed6e44c4f43b656669f79", + "description": ".mailmap: add an alias for Christian Inci", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "832d1f913eeaed108c5994d851f495b575a8e5bf", + "description": ".mailmap: add an alias for Christian Gmeiner", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9278ea292092368feb3b71d29f5332e04f793d7b", + "description": ".mailmap: add alias for Chenglei Ren", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43bdff8a5c28486e4d78b70ca09fa7dcfe5393f3", + "description": ".mailmap: add a couple of aliases for Chandu Babu Namburu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d89e96313395b0a32a836f4675df18b72ca8bbeb", + "description": ".mailmap: add an alias for Chad Versace", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3a2c49e13e23e95a5c2a87df0339e1263ebaedc", + "description": ".mailmap: update aliases for Carl-Philip H\u00e4nsch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88eb6b7d58b3a4fd255c60cb8d36ff5752ac70a3", + "description": ".mailmap: add an alias for Bruce Cherniak", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da8a3098825b8dc5eeea12e91d89ac276a53ceca", + "description": ".mailmap: add an alias for Boris Brezillon", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0efc82183d6e4a84bd7bc5b0ad328ac34cdd6262", + "description": ".mailmap: add an alias for Axel Davy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84a9fe776646998c4b348adfd312997c5bfe170c", + "description": ".mailmap: add an alias for Anuj Phogat", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cf8dc7f54c079e04043d41b7901a3cc402e9631", + "description": ".mailmap: add an alias for Andrii Simiklit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "174e97e969666fe5977cb1c5d045401b8130262f", + "description": ".mailmap: add an alias for Alyssa Rosenzweig", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12ec5b94eae9e440a49d8a919e87d26e39d8b625", + "description": ".mailmap: add an alias for Alan Swanson", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a934c8e7edd820ebb7286d0927090578cd6a3080", + "description": "mesa/st: initialize all winsys_handle fields for memory objects", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3e704c7e7e46dfda820ea2e96fa4a32d63ef796", + "description": "amd/addrlib: Use enum instead of sparse chars to identify dimensions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e58509cdeccc85adcf9127c6d1a462a73e7d5068", + "description": "gbm/dri: Propagate queryDmaBufModifiers return value", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4e3a7dcf6ee4946c46ae8b35e7883a49859ef6fb" + }, + { + "sha": "b5b25ee0327f65d837ff84f59b497232d52ec25d", + "description": "zink: be less picky about tiled resources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "040a2643c08dd0accee6942bc05836b99417ee49", + "description": "st/dri: make sure software color-buffers are linear", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c4f68b089b26918fff55196122309ac43e78e1b", + "description": "virgl: Use ETC2 formats directly when possible.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dcb1e8fef8ae60877a696a5bca337eba5475085d", + "description": "radeonsi: use thread_context::bytes_mapped_limit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15cf7d170b3391ebde58f954cd2b90fff35b1ce5", + "description": "gallium/u_threaded: flush batch when hitting mapping limit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35b396392880871b8cc06172dafff238e67c44cc", + "description": "radv: do not abort with unknown/unimplemented descriptor types", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11faaf646d1397db0d902298a3f0870f79692b68", + "description": "aco: fix emitting stream output with tess eval shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "91aa596ca7ef3411264181f49f58743f5c965710", + "description": "aco: implement nir_op_f2i8/nir_op_f2u8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04a7ec7c8a7ca63fc3e90e5d2fe1290976a77bd6", + "description": "nvc0: enable GL_NV_viewport_array2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd092bf937020984d6008f93eb0d15d647c112c5", + "description": "st/mesa: add support for GL_NV_viewport_array2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0d0a3c916595860749220bcb3a4b1cc408ddd34", + "description": "gallium: add PIPE_CAP_VIEWPORT_MASK", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f191e0c37356e684fa63db76174fdaee74b6982", + "description": "gallium: add TGSI_PROPERTY_LAYER_VIEWPORT_RELATIVE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17308c10141aa54ebf9e595920297888edf6146e", + "description": "gallium: add TGSI_SEMANTIC_VIEWPORT_MASK", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d4787d77e3bd1986381b77105d2ca9d094ad7c0", + "description": "mesa: add NV_viewport_array2 enable, attach to glsl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc6661bfc87f6a8a46455ddaf2e0fb1c1fd332ed", + "description": "glsl: add NV_viewport_array2 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54424a3d13f97bb1714f168f127f40d180f78a6b", + "description": "compiler: add VARYING_SLOT_VIEWPORT_MASK", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3a9e66277af68824fcfa1650d87222f37f0582cf", + "description": "ir3: Handle load_ubo_ir3 when promoting to constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "abcfb6437062f469335d27d5ef60ecf20272dc26", + "description": "ir3: Fix LDC offset units", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d489f76f48095799392a915dcedc074bbb5e52a", + "description": "Revert \"nvc0: fix line width on GM20x+\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "a0e57432b76c32f2109dab0ad3df0ba03967441c" + }, + { + "sha": "26a1adce5bd2f0e44900f21e58ea09fea9f6690f", + "description": "anv: Fix UBO range detection in anv_nir_compute_push_layout", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e03f9652801ad7f70091e084535a3fb6650c3acd" + }, + { + "sha": "b2e4157143439a211d2f8e761dc8afd750fa791d", + "description": "anv: Advertise SEND count through VK_EXT_pipeline_executable_properties", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c82b13c8ff63cc296215b6b5991ac00e6f3d495", + "description": "iris: make BATCH_SZ smaller by BATCH_RESERVED bytes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "103cb32c794e6428d155d88e42cdf6b9a19b2f31", + "description": "iris: remove useless bo->gtt_offset assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c586cb23e0a72e59cac45ad1fb85d0f3c71b6562", + "description": "iris: remove unnecessary forward declaration", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3f5016c0bfe5dbe85015dc1c3f41848b02dd245", + "description": "iris: remove hole from struct iris_bo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0158f73f086e0fe1b29a1e7bba995acce71d6bb9", + "description": "Fix util/process test on Cygwin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "befe2ff3a610c468fb0bbb67624cc5b531a3fefe", + "description": "llvmpipe/nir: free the nir shader", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "18f896e55d96c63b11de7ed0cbe484988a1184c5" + }, + { + "sha": "cb0a2b3df68d33be7c5bfbbb62825608b1b1948e", + "description": "draw/tess: free the NIR", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "0d02a7b8ca794a594c2e9cc5e6d63dc591593105" + }, + { + "sha": "f01c0565bb9ad7335395920aebc7489cb76d1b72", + "description": "draw: free the NIR IR.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "bf12bc2dd7a28844103bb30a07be0440e60c5864" + }, + { + "sha": "13ce637f1b28381e72470763ff5e39dd3c562476", + "description": "freedreno/turnip: Update GRAS_LAYER_CNTL to GRAS_MAX_LAYER_INDEX", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2399e9574591389bb5640744ab77f22699eea23", + "description": "turnip: Emit geometry shader descriptor consts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6d5ee29ab715c08d3b46e270f7eacde28e2e665", + "description": "turnip: Correctly set layer stride for 3D images", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e525d29ab68d927d0f2aa7e3ef21d25e915e76d", + "description": "gallium: initialize viewport swizzle in cso_set_viewport_dims", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4137a79c2a7edb5f0caf0964ab748da7c279b61c" + }, + { + "sha": "1aefe78b47eab9caca6a1cbfe4ecb6b5d4cf87ef", + "description": "mesa: fix enum value of VIEWPORT_SWIZZLE_POSITIVE_W_NV", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ff168b297d94f656899a904ac147a9ce5add65c9" + }, + { + "sha": "e2650db95276662661960f87be5d4d3537d8acf5", + "description": "radv/aco: do not advertise VK_KHR_shader_subgroup_extended_types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ee23709724ff696db6613aba155963a1d8e0a41", + "description": "nvc0: enable ASTC and ETC on GM20B", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22406da75671438bf9de68bc47e2b8871e5fa3e6", + "description": "glx: omit loader_loader() for macOS", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b699d070a6de273fb3a964e05944b203d0b57090" + }, + { + "sha": "471fd41e84e1a19b970c86d8d8a23e1996cd7483", + "description": "clover: expose cl_arm_shared_virtual_memory for devices with SVM support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "657ff3b3b816be96f61a363b4479abfe785678e1", + "description": "clover: implement cl_arm_shared_virtual_memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a218658556a2961af26af9c046aae23b7f58fcc8", + "description": "clover: implement SVM functions for devices with fine grained system SVM support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6754eb92072332fc6e7d22dd98628d22ce76531", + "description": "clover: implement clSetKernelArgSVMPointer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "035e882819bcb853fff7a59c638a0ecbf89cb762", + "description": "clover: implement CL_DEVICE_SVM_CAPABILITIES", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c170c0cfe4bd3c08385953e6e03f4403f5cfb5b9", + "description": "clover: add stubs for SVM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e738967d6e4cfe25d4c14dd3211b27ddb67ed8cb", + "description": "gallium: add PIPE_CAP_SYSTEM_SVM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c818b5c089914f5d3e0d233ab531571c4d5ec13c", + "description": "aco: fix 1D textureGrad() on GFX9", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "6f718edcedd713beb547cda48aa8dc63a0af4c35" + }, + { + "sha": "8ce46f352e9e2ad103a5058895f3ab4ee164ea33", + "description": "iris: drop cache coherent cpu mapping for external BO", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08a396033be1d7ceddf48da0563a7e4d2cb64429", + "description": "aco: fix nir_op_frexp_exp with 16-bit floats and negative exponents", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bf8e923863230914f6bf2a4abcf257cb8778ee7", + "description": "u_blitter: fix stencil blitting", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "381e9fe64a80d98144a4ad75044edd9b878c7de7", + "description": "draw: fix user culling pipeline order. (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30ef6f5137a5a6e9c0b08640a44e30e1aba14270", + "description": "draw/cull: run pipeline for culled points.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc261cdd4238038c91f9fe4232fad7b5a20050d7", + "description": "llvmpipe/setup: move line stats collection earlier.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80fa8304c807b2ef28bf9e37bdd7afadfde216ce", + "description": "draw: fix tessellation stats query", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "335827eade38d6f0647d9a01af2a1a5a1b59dd2d", + "description": "llvmpipe: fix no tokens detections.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ccc6a48ec5be0a748d77d56168ba90e2784b0a33", + "description": "gallivm/draw: calloc prim id toavoid undef", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e20b3b37202eb775009c7a62d98515f49cda6e61", + "description": "gallivm/nir: lower implicit lod to tex.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c494ed0467573b992252fab3217ec158d5b7ccb3", + "description": "gallivm: fix left over shader vote debug", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7690606bf784c35c5318ebfe0f5162eec9b19b82", + "description": "llvmpipe/query: fix transform feedback overflow any queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "96e12ca7d77df365a96336561e32813818268c19", + "description": "llvmpipe: report tessellation shader statistics.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "202bc38ce9e3e52cea1448f22882d17c7e5e9a90", + "description": "draw: collect tessellation invocations statistics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4edc6f8bd89e97047b79a4685640d0546e90a10", + "description": "llvmpipe: fixup context leaks.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "eb5227173f0354aade28e46397a5bb69b2af69fb" + }, + { + "sha": "68b40cfe2728889d4bd86ff073b3b69fb8608e5a", + "description": "swr: Remove Byte Order Mark.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "c6e67f5a9373e916a8d2333585cb5787aa5f7bb7" + }, + { + "sha": "600c91fed89fb6c9389bcbdd41fe50893cfb984a", + "description": "glsl/list: Fix undefined behaviour of foreach_* macros", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23be2160713a713a53519568efa2516b770f07ae", + "description": "freedreno/ir3: don't overwrite wrmask in ir3_SAM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aeb5b9cebf5c5fde2eed9be1d0ec64afd8aa8b6d", + "description": "freedreno/ir3: fix emit_tex_info split_dest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb08f451d01571e0a570e99553624ae7b0d3d075", + "description": "gallium/tgsi_to_nir: Set nir_intrinsic_align_mul to 16 and offset to 0", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e78a7a182524f091e2d77ba97bfbe057c3975cab" + }, + { + "sha": "31988baba48e974240af68aff84f8c5620d2e62a", + "description": "ir3: Fix txs with bindless", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acf7e73be54c7f1cc52fcc9be38a9df26849200e", + "description": "gitlab-ci: make explicit tracie is gitlab specific", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ca91683e22de3f5915081957ab829642e785ffe", + "description": "gitlab-ci: protect usage of shell variables with double quotes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35782b6593a676b5b057c79be69b519309c9ac69", + "description": "gitlab-ci: Vulkan tracie runner to return last command exit code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c6ce826af2bd633563ffd9f798d511c40805bd1", + "description": "gitlab-ci: Check the Mesa version used for tracie tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbd2be3f5db004090a311694aa1c17ff8c9fc5d8", + "description": "aco: clear moved operands in get_reg_create_vector()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52cc1f8237d9ed0042578777af4b28e5b33c6354", + "description": "aco: improve p_create_vector RA for sub-dword operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e18711cda3bf173e03d9773001f641fd85ffd18d", + "description": "aco: fix p_extract_vector validation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "41ac44e1b3ad19b9a889fc3d1052611ae309ffb0", + "description": "aco: improve vector optimization with sub-dword vectors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "849eb0a77634d3ce8f5532ba47946a37fde8c20e", + "description": "radv: use RMW packets for updating the maximum sample distance", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb6ab17d1fffe2f387ce4ec7691f926260091118", + "description": "radv: add radeon_set_context_reg_rmw() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f0b94262c18284dc61755634f01eb78051b4423e", + "description": "scons/windows: Support build with LLVM 10.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c38946e62ddce554d20bf913cebeebf25bf153a4", + "description": "meson: Link Gallium Nine with ld_args_build_id", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4756864cdc5fee9602ab63a9fa2c4b459667a6c2" + }, + { + "sha": "924f3f3de72a05dac1757e29b971cc680349b18b", + "description": "svga: fix build on FreeBSD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ce4db6231462998f9426c74681a464bda928842", + "description": "freedreno/a5xx+: Skip compiling the old gmem blit programs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2513d0257c7698b1686fe26aa5db8b7b2969d845", + "description": "pan/bit: Add BI_CONVERT tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f50b19534a2a534a5cc509516211656f84f2f08", + "description": "pan/bit: Add BI_CONVERT interpretation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "640d69d1662b03f2d28319ede50be0bf3f6e9c4b", + "description": "pan/bi: ADD packing for CONVERT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cfe660326a2fb455bed0499c80f38f6f17f8f0e", + "description": "pan/bi: Rewrite conversion packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b000c54c01930c5637526aa875f5372399f5e55", + "description": "pan/bi: Fix incorrect swizzle packing assert", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0cf8b977c94e2bab0ccc91bb8bad91e2bb71358", + "description": "pan/bi: Set BI_ROUNDMODE for BI_CONVERT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2799353f5b8acc0dc6a7a94090a13dbc76c23a33", + "description": "pan/midgard: Fix f2u naming confusion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4268ffb99279f46d9785bdccb6617022924a6c2", + "description": "meson: Specify the maximum required libdrm in dri.pc", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a3a16d4aa7e5a22816226d8e7417138164b10525" + }, + { + "sha": "4b24b9647d24e8651449971508347129bef9f42b", + "description": "freedreno/ir3/ra: cleanup some leftovers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "751c11a8c7a6f54f87c62e4b49802bf80826ec42", + "description": "freedreno/ir3: rename depth->dce", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf74048fd14eb594dbb23e07d37cf8df44564263", + "description": "freedreno/ir3: better cleanup when removing unused instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "96ff2a4099d0eb5c29255429a0e5284e461ec8d5", + "description": "freedreno/ir3/ra: handle array case for SFU select_reg opt", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "cc82521de4e8e85022a5facb1b5f52d5139d3022" + }, + { + "sha": "b787b353d04e23fdea567186f7cb422fd687bcdd", + "description": "freedreno/ir3: add mov/cov stats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89a78a07dec8f6fab7a80bba951b134a42bb9a2c", + "description": "freedreno/ir3/postsched: avoid moving tex ahead of kill", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "017fdab2172c845eb7dd6e11d2a5c13245374247", + "description": "freedreno/ir3/postsched: remove some leftovers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9701008d64171b1f16be9a8a69555df2b651c37b", + "description": "freedreno/ir3/sched: awareness of partial liveness", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2f4d332dbb552af62fe5caabe67664d98f32229", + "description": "freedreno/ir3: new pre-RA scheduler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f22f85fe73f89b80851bb24936202c9bba97cc6", + "description": "freedreno/ir3: fix location of inserted mov's", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "908044ef4bf00daccfbcb037144c6ebe74d021c5", + "description": "freedreno/ir3: simplify grouping pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "860f5981f0a0635c301cd856892747ac20f793f5", + "description": "freedreno/ir3: make falsedep use's optional", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d09e3afdcc3bd57be8b97e675948ca92b0563abd", + "description": "freedreno/ir3: spiff out disasm a bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40ccbae622d8f09e9513b8837d24f55d877709c6", + "description": "freedreno/computerator: support bindless sampler instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc9a28beed0c7764f2c17dd96a8ac6833af34445", + "description": "freedreno/computerator: support nop prefix", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95d4a956c08acafe0167ff8ba4dcd9b912962a70", + "description": "freedreno/ir3: CSE the up/downconversion of SEL's cond's size.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82375ccaa461c759d4a588a68ed20bcee92edf8e", + "description": "freedreno/ir3: Stop doing b2n on the SEL condition.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d1917da86e813c5609e48dda62f7cf8a049c48f", + "description": "tnl: Code formatting in t_rebase.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "887ae78718832cafde7a9ddca1e410e36b4d84e3", + "description": "tnl: Code formatting in t_draw.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac13258a6eb468b4cc3995b423319b9b62f54d94", + "description": "tnl: Silence unused parameter warnings in _tnl_split_inplace", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a004f7987d179bdfba12ab7e92b0bb9a55b9df4", + "description": "tnl: Silence unused parameter warnings in dump_draw_info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "114e078001b3bade76b80fe99e39d346e88b6a4a", + "description": "tnl: Silence unused parameter warnings in _tnl_draw_prims", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1996f1d3dd2ddd5e894ce608436219c63872570f", + "description": "tnl: Silence unused parameter 'attrib' warning in convert_half_to_float", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a03240b635cd67f345811b86b9faf106f862ec0", + "description": "tnl: Don't dereference NULL obj pointer in t_rebase_prims", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f3cce7087a562f77be7306e70d4e62bc214bb5fa" + }, + { + "sha": "2e43b32e72b2adf7ce865f56cf2647b137a5342a", + "description": "tnl: Don't dereference NULL obj pointer in replay_init", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f3cce7087a562f77be7306e70d4e62bc214bb5fa" + }, + { + "sha": "65f14fd68dac4fb52c765c82f08931d7aa745e61", + "description": "tnl: Don't dereference NULL obj pointer in bind_indices", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f3cce7087a562f77be7306e70d4e62bc214bb5fa" + }, + { + "sha": "28d36d26c2212276e1238fad8f0b12caab97fee8", + "description": "aco: fix p_extract_vector optimization in presence of unequally sized vector operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e4432bfbaef1bca65239848c373cd683f083ee0", + "description": "pan/bi: Lower fsqrt", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3025ea6abe97b42bc85fc7f6e6c47bf916da0738", + "description": "panfrost: Drop dependency on nonexistant write_value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53e4159eaaf692071bf63365eb27a16c97c9a3e5", + "description": "glsl: stop processing function parameters if error happened", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d1fa69ed61d5aebeb69ce8a415c098035a953d48" + }, + { + "sha": "fc1068de0d124b746cc85b58564810c5453feee3", + "description": "aco: fix nir_op_pack_32_2x16_split if one operand is a constant", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4cfaef68d7de7959f6adeaec53077a630b603184", + "description": "aco: implement 16-bit nir_op_f2i64/nir_op_f2u64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "729bdc0d70b0661c197e89b372f1b6d15e2acd69", + "description": "aco: fix f2i64/f2u64 with sgprs if the exponent computation overflow", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a8e5dde6684c477edddb044a38a6618bf38d014", + "description": "gitlab-ci: Use all_paths in .test-manual rules", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e6267b20b1a0c99cc58f3d4910525f9d07d50be", + "description": "nvc0: add NV_viewport_swizzle support for GM200+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90fcb3fef28f79d93f0baf88292c0ba068ede810", + "description": "st/mesa: add NV_viewport_swizzle support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff168b297d94f656899a904ac147a9ce5add65c9", + "description": "mesa: add GL_NV_viewport_swizzle support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4137a79c2a7edb5f0caf0964ab748da7c279b61c", + "description": "gallium: add viewport swizzling state and cap", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e2457bedd389c6799fe99b1e0d6ade36b763c6c3", + "description": "glsl: remove redudant assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e667802a7cb46c0aaeb93d3753d9c356c72b604e", + "description": "mesa: remove redudant assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd241722006def2dfdffaf260daa74dc19332cae", + "description": "mesa: remove redudant check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25a61cce7dc6f5ab965577ebe589ccabf58945fa", + "description": "lima: set offset when export resource", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4094558e8643a266dfc8da9bc073751a3736a2fb", + "description": "i965: share buffer managers across screens", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "865b840a6b7545405a2e28f7c2d3422fadbc5b14", + "description": "i965: store DRM fd on intel_screen", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a497eb1303d23f04ad7d9c28abf953a9105e32a", + "description": "iris: make resources take a ref on the screen object", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7557f1605968c39d680545d5b8457d17eea3b922", + "description": "iris: share buffer managers accross screens", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd3e50545339ffd4f258437d6282e2cfbf113725", + "description": "iris: properly free resources on BO allocation failure", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7aa6720ba4ea8dc107c7b363bcb2a1811a25dc71", + "description": "freedreno/log: better decoding for multiple chunks per batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7aa55f5acbcf7fa472805fd2c155a5cc0b9cb2a8", + "description": "freedreno/log: spiff out parser some more", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5b32387d6fe16ae6ed5d582a165f8f21afee395", + "description": "freedreno/log: android support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "904d5d63b47661950c5eace94e9ba3341bd4cb1c", + "description": "freedreno: Fix leak of binning shader variants.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ec1f264f1f70806fe266606d3376a898a96292d", + "description": "freedreno/ir3: Fix sz vs class confusion", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "90f7d12236c5250bc56699a3071941daef0f515a" + }, + { + "sha": "65e2eaa4d3a7095ac438fafb09d1e36a4210966e", + "description": "pan/decode: Print Bifrost blend descriptor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80dd692813563332b7123b2ba9c9ad25177fa392", + "description": "pan/bi: Let !b2b imply branch_cond", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3439c24bdb529e6223de9cd18824b3d72ff52649", + "description": "panfrost: Fix BI_BLEND packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e34add229ff06203fe852b15cb7ff32cf9e4c344", + "description": "pan/bi: Fix backwards registers ports", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23620d183035b068fe7dbda99fdc629264ca2d9e", + "description": "panfrost: Pass compiler-appropriate options", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e30091bc5162e560320e46abf23be8748cb269ae", + "description": "panfrost: Move uniform_count to pan_assemble", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d10423989ebace442cdbd8016188b44c663883c3", + "description": "panfrost: Move varying linking to cmdstream", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "776697dd349e4d5644a72fd293d8e7e436e6184c", + "description": "pan/midgard: Remove unused max_varying variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90e02db9a13527cc5c64d83201614181cc95c131", + "description": "pan/bi: Fix nondeterministic register packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8016906cf20ab0f1cf84d11923ac38d2230bdba3", + "description": "panfrost: Call the Bifrost compiler on bi devices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a9fa4bcb6875f94700ec55d4f68560713cbba72", + "description": "panfrost: Set mfbd.msaa.sample_locations on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46e4246d495da2fa836baaeec89a1030f3951f2e", + "description": "panfrost: On Bifrost, set the right tiler descriptor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "547f999e2cb668e7cf4d0c30b9c72b45bf081e52", + "description": "panfrost: Don't emit write_value jobs on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30e7027e1c74139b934f9b7768936299d3925fdb", + "description": "panfrost: Pass IS_BIFROST to pandecode_jc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b10d4ece6407db267bf89550b013829d9ad9b20", + "description": "panfrost: Remove most usage of midgard_payload_vertex_tiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b010a6d5f15520677d34015c88ec89046b811372", + "description": "panfrost: Unify vertex/tiler structures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aee68b06c8c1ba48e171b07a8d70606dbf7936c7", + "description": "panfrost: Staticize a few cmdstream functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd09571c7782834db44ae8ebafb24cceb224c2ec", + "description": "panfrost: Populate bifrost-specific structs within mali_shader_meta", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b096a1dbd3caca5d7a67ce139421736fc127cc35", + "description": "panfrost: Add IS_BIFROST quirk", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "693480a581e018bf61ca00889fcc8c479a418319", + "description": "etnaviv: remove the \"active\" member of queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7cb98e02e4e072389986b1f9461e76415e14531c", + "description": "etnaviv: change begin_query(..) to a void function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a9cbb2b6176b723d3ee54f7b4f6c885163da616", + "description": "etnaviv: drop redundant calls to etna_acc_query_suspend(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b06fdb8edd9ef999ee8707335888f7609c144102", + "description": "v3d: Primitive Counts Feedback needs an extra 32-bit padding.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "0f2d1dfe65bfe1ee8f02ce45f100a5508debdfd4" + }, + { + "sha": "38622de2ec3328d601f415b9f910210bf64caf6f", + "description": "aco: make some reg_file helpers private and fix their uses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "331794495ed0e8bbd87cafedfa9ef334bb43b0b7", + "description": "aco: rename aco_lower_bool_phis() -> aco_lower_phis()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d41521b16c73113b5f5318d6e021310760f1e97", + "description": "aco: lower subdword phis with SGPR operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a39df3bfce2c418a251eeafe2e309c9543a9d50d", + "description": "aco: don't constant-propagate into subdword PSEUDO instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1de18708cb5d3adba452f0c7f1aa3e8e6a6fd60b", + "description": "aco: ensure correct bit representation of subdword constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "637f45f3909326d18d6f64ff04eeb3bef205d2f8", + "description": "aco: setup subdword regclasses for ssa_undef & load_const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67b567d0d0c3c53a7fc04d22ea075494aae34cbf", + "description": "aco: implement nir_op_b2f16/nir_op_i2f16/nir_op_u2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3119f978e5a4b7f532a74164866355b1c25238ba", + "description": "aco: implement 16-bit comparisons", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ccf8e23f59850c92f91e54438cbc6f4fa55978f4", + "description": "aco: implement 16-bit nir_op_fmax3/nir_op_fmin3/nir_op_fmed3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "981ced07a542f94615b22762afa679af0605c692", + "description": "aco: implement 16-bit nir_op_ldexp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55537ed9d3e8869eaa9890a254ab35f7ce530ae1", + "description": "aco: implement 16-bit nir_op_f2i32/nir_op_f2u32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68339ff7a7b7766f0111f420c54c4f7516c2d6ec", + "description": "aco: implement 16-bit nir_op_bcsel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0646562a17a9649461b60fd8723e91dbf527e4a6", + "description": "aco: implement 16-bit nir_op_fsign", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6793ae1c5e516c9f5a21414ccc800cde4fb0e7b5", + "description": "aco: implement 16-bit nir_op_fsat", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ecca65d119ba34acbf2a681b78a574bd862785b", + "description": "aco: implement 16-bit nir_op_fmul", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0c60999bc533327fa986ba0d80fe6954a1292ca", + "description": "aco: implement 16-bit nir_op_fcos/nir_op_fsin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9be4be515f2a08b9c9e5ae1fc4c5dc9a830c2337", + "description": "aco: implement 16-bit nir_op_fsub/nir_op_fadd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0b637ca17a54ddef959870cc938e3f872a48977", + "description": "aco: implement 16-bit nir_op_fabs/nir_op_fneg", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acc5912786197b4cbe7a32f76762db0d93f84e2a", + "description": "aco: implement 16-bit nir_op_fmax/nir_op_fmin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66d5bfb09ab067469ec747aa7745f77c302e2df6", + "description": "aco: implement 16-bit nir_op_ffloor/nir_op_fceil", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c097c9f20c8a246ed5ba37fe37108056e6619c77", + "description": "aco: implement 16-bit nir_op_fsqrt/nir_op_frcp/nir_op_frsq", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26ed9fb79e409d4ad6ac3225d3bdaf7640b8cb30", + "description": "aco: implement 16-bit nir_op_ftrunc/nir_op_fround_even", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee96181ad94499c28525961a241eaf443691f5ec", + "description": "aco: implement 16-bit nir_op_fexp2/nir_op_flog2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8486041df77a537c33cb0039833322ed2886354", + "description": "aco: implement 16-bit nir_op_ffract", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a8b45d7034eb482e217133180dff3e62bfb35150", + "description": "aco: implement 16-bit nir_op_frexp_sig/nir_op_frexp_exp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db74ad0696d205e0991281bc0e222290ab1addd5", + "description": "intel/compiler: Remove cs_prog_data->threads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ff55621d9c9d299ac8e4eb2fcfe6db8a7a1b2a1", + "description": "iris: Stop using cs_prog_data->threads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "928f5f54349902c497e9293adeae2580123afbd9", + "description": "anv: Stop using cs_prog_data->threads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5664bd6db383984192cf362884dd9fb17d8ed3a3", + "description": "i965: Implement ARB_compute_variable_group_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c77dc51203a45c8ae82d5a88d3e8fe99c32fc5bc", + "description": "intel/compiler: Add support for variable workgroup size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c54fc0d07b1a92e065000c1301971b93439595e2", + "description": "intel/compiler: Replace cs_prog_data->push.total with a helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0536ca20d757b5ca9fc9f989ba64a545ab8235d7", + "description": "swr/rasterizer: Use private functions for min/max to avoid namespace issues.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "ab5570820071d97c4adfe8cd8a90083f3784fa5e" + }, + { + "sha": "089e1fb287eb9b70c191091128ed5ba7edd2960a", + "description": "tu: Implement descriptor set update templates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1595026f675d5dee721d7fcd7f4c856a357ba96", + "description": "tu: Add missing code for immutable samplers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a07b55443b59572d022924b65c2ab67cd91250e4", + "description": "tu: Emit CP_LOAD_STATE6 for descriptors", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d37843fee15f953e18fa9e87c3dc4761ef054998", + "description": "tu: Switch to the bindless descriptor model", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc850080ee304c2a62f7313c4b7ebe121c3ebb53", + "description": "ir3: Rewrite UBO push analysis to support bindless", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "274f3815a5efaf1b82f6c2d29122f7aac2fc9a19", + "description": "ir3: Plumb through bindless support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d0bc13fcab225c7a129de2e18936fe197003dcc", + "description": "ir3: LDC also has a destination", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1842961e58ccb3e1036bb9657416cf89c3982c50", + "description": "ir3: Also don't propagate immediate offset with LDC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de7d90ef53d585ee3efd165df1bf38b20794b3e6", + "description": "ir3: Plumb through support for a1.x", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8b0f904398cdc30ffc67c162bc3f570bf887ed9", + "description": "ir3: Add bindless instruction encoding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "122a900d7de826dcd1056f2ad2ea4c72d9129c06", + "description": "freedreno/a6xx: Add registers for the bindless model", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e088d82aa6f676fbab30e43514f0d8ddee341836", + "description": "freedreno/a6xx: Add UBO size field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3b7681df28e89311c4149708311de546e0a58cc", + "description": "tu: ir3: Emit push constants directly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63c2e8137d4dca0f5232a3c3a98e182e7ba7f943", + "description": "tu: Dump out shader assembly when requested", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d22e2b3bd0e60a18fda0aa54e72927afdc4b5489", + "description": "aco: RA - move all std::function objects into proper functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5351fee56a7c0e242f8c4344efc557a3aa3bf75c", + "description": "aco: move all needed helper containers to ra_ctx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ae27b96efca6473e9671a22d60f6b9496001413", + "description": "aco: change live_out variables to std::unordered_set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acc10a7e51770919ec215351661d46fa6fc355af", + "description": "aco: change some std::map to std::unordered_map in register_allocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69b6069dd288455cdb2655284c592a85d17df273", + "description": "aco: refactor try_remove_trivial_phi() in RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b66f474121db89ea611cbcdf07a45168481d2590", + "description": "aco: improve speed of live_var_analysis", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09850e0a9402ef876ced7c131bb2e703a6fb9c29", + "description": "aco: during RA only insert into renames table if a variable got renamed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48a74b6815f591454c9d38ca5932fe1ee3654a6f", + "description": "aco: replace assignment hashmap by std::vector in register allocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba482c2e5fdb2a6d76133b17e2ba2283f1e7474c", + "description": "aco: improve register assignment when live-range splits are necessary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb5a7902f20ad1285fa875c93bc719a1499d1cb4", + "description": "aco: improve hashing for value numbering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c99107ece02b64916031883889b9e010c99e2435", + "description": "aco: add explicit padding for all Instruction sub-structs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f962a9362ec8bc8b03b7d44a13b44291e3b599d", + "description": "aco: guarantee that Temp fits in 4 bytes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e084c2cb3699e846753b31bd63ed6cd18cd73f8", + "description": "turnip: new clear/blit implementation with shader path fallback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de6967488aa0b1a1df92741d1c045d8479d3ad7e", + "description": "turnip: add vk_format_is_snorm/is_float", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51fe52d2fdf6dad6de424c8110ab83b90f1ea0aa", + "description": "turnip: rework format helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "009082dcfff5cbbce007d1b455ad33db1954c7aa", + "description": "turnip: use dirty bits for dynamic viewport/scissor state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed83281f0cd1d4c60106410c0b897d6db3280e50", + "description": "turnip: save attachment samples in renderpass state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0637eab6780b98f727ecad3be15011324b2894f5", + "description": "turnip: disable 8x msaa", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f03e63cd9964b8f2577538d02545794c199f1b62", + "description": "turnip: fix nir validate failure from push constant lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86d1a4c907e9adcfc4ea803bb0207e92f98332e2", + "description": "turnip: split up gmem/tile alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f494799a7f09deebacb5696fde7514e3329de246", + "description": "turnip: RB_CCU_CNTL fixes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cca7c299804703e686a3b9f04a978a994871853a", + "description": "freedreno/a6xx: set bypass RB_CCU_CNTL value for blitter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4c05a5335c352b0aeaf1d6fbf34d1b1e0a2ba9a", + "description": "freedreno/registers: add RB_CCU_CNTL bitfields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d8453e6e60fa9771cd655324f7c15c054b6db94", + "description": "radv: allow TC-compat HTILE with GENERAL outside of render loops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4de84c8cbd6f6fe46703a3a8d5283460bbeb50fc", + "description": "radv: only enable TC-compat HTILE for images readable by a shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63f07a30477d3a8cb80e8344c8315a4eb0b4a8a5", + "description": "radv: only expose fp16 control features for chips with double rate fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1e4bd1de9897d05c39deb483a29fbb2a477374f7", + "description": "radv: only expose storageInputOutput16 for chips with double rate fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d74c6565d74188efea8bdef3dd33c1e4aa21f60", + "description": "radv: only expose shaderFloat16 for chips with double rate fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3113e07b90d56a09e53b5bf2f77171d13a049d6", + "description": "ac,radv: add ac_gpu_info::has_double_rate_fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "420ca1e4a1814c86af5fabff8658526a6e2baab1", + "description": "turnip: use buffer size instead of bo size for VFD_FETCH_SIZE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e62f8ae15a34b51a1fe1aa5752034e3037646d33", + "description": "turnip: improve vertex input handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "98743f648a356f55c1b10a439efea15937d4af41", + "description": "radeonsi: fix Segmentation fault during vaapi enc test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7e2efa7c95b78373d8a4b7b88ceb7b3769e6eaa", + "description": "radv: Use correct buffer count with variable descriptor set sizes.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb7e44a23db614e58c997487b8dd5e61edf93f92", + "description": "radv: Whitespace fixup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b42d26132d49e96fadadb097d931fad8e18a094", + "description": "radv: set sparseAddressSpaceSize to RADV_MAX_MEMORY_ALLOCATION_SIZE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc678c9ce934c1c83f6ed26696f896ca96ddefc1", + "description": "radv: check buffer size in vkCreateBuffer()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3682670c821d34a6c6ffdcb0c68d4ed42916f98", + "description": "radv: Consider maximum sample distances for entire grid.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f005f1f850710ea456f9847b1d247aaa8f0d6d2", + "description": "radv: enable lowering of GS intrinsics for the LLVM backend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd99ea7318863aa61f8a4516cd153611de85989e", + "description": "radv: remove radv_layout_has_htile() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffea3e7348e70ad5a9485aefba428d518ca9476e", + "description": "radv: cleanup creating the decompress/resummarize pipelines", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6f6276bd244d9a544baee854dec270be6cb9de6a", + "description": "radv: rename extra graphics pipeline decompress/resummarize fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b7586655fbbcc52de47cc110aa145e3703929cc", + "description": "radv: rename decompress/resummarize depth/stencil functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6a8591f72a9f1ce48dc0eefdb89cc0825e8acf7", + "description": "turnip: fix compute shaders crashing after geometry shader change", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "1af71bee734da7d87e0ef1b71a64e12fa81ed92e" + }, + { + "sha": "52c8bc0130a2031904f8f4e2187baf2f3f8ff6ec", + "description": "nir: make opt_if_loop_terminator() less strict", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f649ff10764b2f47fd69013f390e9286547cc3b", + "description": "radeonsi: don't lower constant arrays to uniforms in GLSL IR", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "3e1dd99adca55454239e197d4e793ee89f44fc6c" + }, + { + "sha": "c682ea598f72ded39903015ec3188969fe468ff7", + "description": "meson: fix debug build on Android", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "940ed5078da594623639580eebefaf75d6ddad4b", + "description": "radv: Store 64-bit availability bools if requested.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff8daa013621019f1606dc0c188b16f1ce34fea7", + "description": "gallivm: Add missing header for powf.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4399cacaf077d7c09f641a0b6cb935ab6e0e45b4", + "description": "turnip: Drop dep_llvm from dependencies", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5789505ab3b62e40fb30663cc973a97f3a7175ba", + "description": "turnip: Make Android platform build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "97578c69e86383fee6030fb9b7333734c670cd5c", + "description": "turnip: Stub out VK_KHR_external_{fence,semaphore}_fd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e99f6f2ea14ce4c3c0d69250b818ea0ab58ce02b", + "description": "turnip: Add missing VKAPI_ATTR annotations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80c13a81b160f73f706bba4315461d77760ced76", + "description": "tracie: Reformat code to fix indentation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efbbf8bb81e97a2b2d2e6e018750ef36cd460676", + "description": "tracie: Print results in a machine readable format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1618159772a087b0914828bdcdfc0e95a2def350", + "description": "freedreno/a6xx: Set a level's pitch based on minified level0 pitch, not width0.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b881d527020b4094ebae850a2dc680c28ccbf97", + "description": "freedreno: Add the outline of a test for a6xx texture layout.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c6bfe8733f3c74f6ccf1238ab79eeb8203b72f7", + "description": "freedreno/a6xx: Drop the \"alignment\" layout temporary.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59a222039860128dc2520fa21d7cddc48c563e2f", + "description": "freedreno/a6xx: Remove the \"aligned_height\" temporary.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cdff81fa9a36b4c38ae40ea4a2666e443dd34b9c", + "description": "freedreno/a6xx: Sink the per-level size temps inside the loop.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4176dfa880d02e6898a9d19d929bfeab8d87388a", + "description": "gitlab-ci: Run merge request pipelines automatically only for Marge Bot", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42fe600c0c27f2605fa616af4ffce157598d7253", + "description": "gitlab-ci: Don't require triggering build/test jobs manually", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27c4ef13977f0f80925f5182bcaaaf3489be7b5a", + "description": "gitlab-ci/lava: Add needs: for container image to test jobs (again)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c12576efbec152d7a5fa9f94daf80fa8aaca4850", + "description": "gitlab-ci: Rename \"paths\" YAML anchor to \"all_paths\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf54785239d3c53eb7baf89e914d7ef3e95ce11e", + "description": "anv/gen12: Lower VK_KHR_multiview using Primitive Replication", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "395de69b1febf4cfca29482e1ff7ddd2ae400d8b", + "description": "intel/fs: Allow multiple slots for position", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afa5447312352cd68d4688d9521cb1de25a9939c", + "description": "intel/gen12: Add XML description for 3DSTATE_PRIMITIVE_REPLICATION", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5dc85abc4fe0a27beb00ef31bb21b79dbdcfec8d", + "description": "nir: Add per_view attribute to nir_variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bc77bcdb2c4f943ac1c946daaeda6295242d059", + "description": "mesa: add support for NV_pixel_buffer_object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1727598a0ec5b33615624f2f3640dee6bf649c3", + "description": "turnip: implement timestamp query", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d64a7d6e69a827633526c0279b3ceaf012e355c9", + "description": "turnip: Enable geometryShader device feature", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bdf6b481d8c8e2aa4767ce238c7df5ba8164e5d5", + "description": "turnip: Enable geometry shaders for CP_DRAWs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b80dc4f5a6e7a43f86f6c7f8f56a26b348f92382", + "description": "turnip: Populate tu_pipeline.active_stages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8eb009631217e415869709faadaf18dbeca8f8b0", + "description": "turnip: Update maxGeometryShaderInvocations to match blob", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3550e20229bfc1872ae041e66958187ee4a97ac6", + "description": "turnip: Selectively configure GRAS_LAYER_CNTL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "475fe500bfe495ffa0715e7a19d044b8d6f4f341", + "description": "turnip: Set up REG_A6XX_SP_GS_CONFIG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fceccc411ac6351f4c30876c349cb56fb62820b9", + "description": "turnip: Configure VFD_CONTROL with gsheader and primitiveid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "012773be26aafb71ab232a5838d8e5e7dcc3dc55", + "description": "turnip: Configure VPC for geometry shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6eabd6bd51406f729689cce6b3b021c2731c69f9", + "description": "turnip: Emit geometry shader obj and related consts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1af71bee734da7d87e0ef1b71a64e12fa81ed92e", + "description": "turnip: Set has_gs in ir3_shader_key", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db2ee3686d396eabd9233e80e368da0e9ae521be", + "description": "radv: Print shader stage before disassembly.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa42b504d6e8a1f7129148a1ca42ef80009b72f3", + "description": "aco: Print shader stage in aco_print_program.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c24d9522daefce112b7a9d03a6d1abdf60f02656", + "description": "radv: Enable ACO for NGG VS/TES, but disable NGG for ACO GS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64225c4f962c2640dd280d3817517b75c7188622", + "description": "aco/ngg: Run GS_ALLOC_REQ on priority 3 for NGG VS and TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4da482d9e94c2ca5935c8cbb864287aa192778a", + "description": "aco/ngg: Schedule position exports of NGG VS/TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c633edad72dbc65fb8ba30a623163fcf1840361a", + "description": "aco/ngg: Implement NGG VS and TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5ed0883fc78d72bd5ec5bf5de4b66a6c905a546", + "description": "aco/ngg: Setup NGG VS and TES stages.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7b4bb3a88540cc04f3ac62d07cf1ab062b784cd", + "description": "aco/ngg: Fix exports for NGG VS and TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec72c504c623c12ea870e451f2d08a482d219469", + "description": "aco/ngg: Initialize exec mask for NGG VS and TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1436c0b8e07898b0d5118f6959db57491a9a2a85", + "description": "aco/ngg: Add new stage for hw_ngg_gs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35e58314d8e4b7346c2523612f6e0f52cc964232", + "description": "aco: Treat s_setprio as a scheduling barrier.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d345bfe1958db162b1ddde85eccd3248f884f231", + "description": "aco: Extract merged_wave_info_to_mask to its own function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90b1047fdf84724a00be36f204e790246ad63a72", + "description": "aco: Print block_kind_export_end.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9cbdb6a457675c33480e446851a7f1df3be22e9", + "description": "aco: Extract uniform if handling to separate functions.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc8a85d05a9cf47e89c6a8c5e6db98caba79e00d", + "description": "aco: Fix crash in insert_wait_states.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eeb626257d486feba814cbd608c7cd0a22159e6e", + "description": "pan/bit: Wire up add/add op+test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e456630bd945cd5d47641bdb060298731383f05a", + "description": "pan/bit: Add fmin/max16 tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc446dc3220f618b485f5364f85651b373db81eb", + "description": "pan/bit: Enable more debug for `run`", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e0f7f110cbf9a39325cf7c2361a611fd8c870b3", + "description": "pan/bit: Add min/max support to interpreter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9967e9f8011553a5ef160eb85d03cd1d3ae31d4", + "description": "pan/bit: Unify test frontends", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f91929e515441aa7ddba804323ab251c5e6d040b", + "description": "pan/bi: Force ADD scheduling for MINMAX", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9279ed1550bfcfe5e12b618d1b73289a51baf24b", + "description": "pan/bi: Fix incorrect abs flip in fma/fadd16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3bbce876e627e724aa85ddaf5ec70d38c8caad53", + "description": "pan/bi: Set BI_MODS for MINMAX", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "822f127fe5b60887f90d8561559d979748049961", + "description": "pan/bi: Add ADD add/min/max fp32 packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36e4c6b267bf1083efd3417beb333e43d28062d7", + "description": "pan/bi: Structify ADD unit add/min/max", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6bd0ec907a6fe61638046bd1286c7b3548fecbd", + "description": "pan/bi: Implement min/max on FMA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "545fc7b26aeef90f5c03d2a900a8e038011758d3", + "description": "pan/bit: Add special unit test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e50d449501d486141db5213027b538cc6d2ecb2", + "description": "pan/bit: Add special op interpreting", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c37c799284d59b445c110d7b5ca4b1ee6fa64492", + "description": "pan/bi: Add fp16 support for frcp/frsq", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7bb7b79a8b0a86ee3fbd21fffa8e3d0c8dd03a2", + "description": "pan/bi: Add 32-bit _FAST packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6ae2d8f940df3d9e0b71b13336ca01e5b6a2c47", + "description": "pan/bi: Remove nontrivial SPECIAL ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20a4b1461bab25af48d73b07ca5bafafc397eb2e", + "description": "aco: zero-initialize Temp", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "85521061d63c47b931e6d3e4a1ea540e46d358a6" + }, + { + "sha": "8dd6a51e8010a0a5b33e1a4c7a7a3251ddaa8e50", + "description": "aco: remove divergence check in sanitize_if()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57557783f6156862b5e946201d833298518dab75", + "description": "nir/lower_amul: fix slot calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4638a16a9302a0e7ebf95dc5e025d2623127cf25", + "description": "nir: add some swizzle helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e78a7a182524f091e2d77ba97bfbe057c3975cab", + "description": "nir: Assert memory loads are aligned", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "068a3bf0d7cda0301b3dfc2e258698c6848ca706", + "description": "util: move and adjust the vertex upload heuristic equation from u_vbuf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d9cb0ec5e611b5ba469a20e27fcd4001e88e841c", + "description": "vbo: expose helper function vbo_get_minmax_index_mapped for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e69e59778c53c7176519c63fb961952e4596dfa0", + "description": "mesa: split _mesa_primitive_restart_index into a function without gl_context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6bc1702f484be1ecb592295c4877281075798be", + "description": "mesa: precompute _mesa_primitive_restart_index during state changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10beee8a7757e956ab0e02ec8402ec1273d8edce", + "description": "mesa: remove no longer needed _mesa_is_bufferobj function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58fab9a6fe258395d934b69c454f1b54bcefedf1", + "description": "mesa: remove NullBufferObj", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54525808aa58b0f94892d3f4e5919cb4ae9493cf", + "description": "mesa: don't ever bind NullBufferObj to glBindBuffer(Base,Range) slots", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3cce7087a562f77be7306e70d4e62bc214bb5fa", + "description": "mesa: don't ever bind NullBufferObj for glBindBuffer targets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e630271e0ec3eee7d921d76d3924873f6ee6b59b", + "description": "mesa: don't ever set NullBufferObj in gl_vertex_array_binding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0a0c68150b4dbba469c62159a327ae9465f6016", + "description": "mesa: optimize initialization of new VAOs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbdd0149ed5b28730a31ebc2bc49f8e955523bbb", + "description": "android: aco: add various compiler statistics", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "b1544352c022953febcc2c2c448ba21551e6b215" + }, + { + "sha": "9f174eb2df128dd89f61ac07b5d394c24668a43c", + "description": "nir: fix wrong assignment to buffer in xfb_varyings_info", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84e845c9696ab673f1d95fda47843028ed0c71a7", + "description": "mesa/st: release variants for active programs before unref", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4822cc97007b0dae4d095c507efc182628510434", + "description": "mesa/st: unbind shader state before deleting it", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82597c46c3df19ba04fcb6694a5f4874cdc916ec", + "description": "pan/bit: Add mode to run unit tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a1c55709e486df80cd05e23a7a1b312d0066c0f", + "description": "pan/bit: Make run more useful", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50476efb619b44e62aee605f083583e8300d24f1", + "description": "pan/bit: Add csel tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b262208b620fea21a7c44fbc74e17b846953ad1", + "description": "pan/bit: Add CSEL to interpreter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "069189ff0f2beb3dd9004a1e37b8cc0cdeac4f23", + "description": "pan/bit: Add FMA tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78ba6d50a42227812a3ba2b20f924f2d2cbf17db", + "description": "pan/bit: Add 16-bit fmod tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40160c576d2c7f11e9690bb542a707a7cf0134af", + "description": "pan/bit: Add verbose printing for tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c887d368e6d72f2b4d189a60a37ccee18b8a8cb", + "description": "pan/bit: Add helper for generating floating mod tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14c534386742b44bc02349684b0a0e3972fec91d", + "description": "pan/bit: Add packing test framework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e3e32e368caabc50b669967b1a81b0f32102194", + "description": "pan/bit: Implement floating source mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbb8a564f2661fe8f665ea0f2e277c19259ba968", + "description": "pan/bit: Implement outmods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab5818560434333647a61e1e27255b4a29677c6c", + "description": "pan/bit: Add preliminary FMA/ADD/MOV implementations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbe504e2217a06930cbd62e775435b8234006a02", + "description": "pan/bit: Handle read/write", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7904a29340e151361421384d05bed0bdf4077b14", + "description": "pan/bit: Stub out BIR interpreter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8eefb2765ab2253fe99ddf3ae32a2a901046d8d1", + "description": "pan/bi: Match CSEL argument order with hw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9114ebbe798e101b6bb2b86219cfd3ba9793068f", + "description": "pan/bi: Add helper to debug port assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ab3f687c012c7e29fbb9da348bec1854ee85fd7", + "description": "pan/bi: Handle BIFROST_FIRST_WRITE_FMA_P2_READ_P3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75aabc6ea1616c44833db44255d3f33a0df368ca", + "description": "pan/bi: Allow BI_FMA to take mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69dde49f809dd00d76ee491e1e6a6d5b7e9b2ff2", + "description": "pan/bi: Don't gobble zero ports", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7a6df4638de16299d9a621db31c9ee68e4addf6", + "description": "pan/bi: Fix negation in ADD.v2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f48caf98ba536b9ed753f445ebf5488dc465f46", + "description": "pan/bi: Fix duplicated source in ADD.v2f16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08fe1081b72b16499329a56a0c5bc8de28168335", + "description": "pan/bi: Export bi_class_name", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c04964c6909544ebcf4c631c2b8b91594cd387fe", + "description": "lima: avoid situations when scissor minx > maxx or miny > maxy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eed5a009897a859ec118ef84c0437be174a49da3", + "description": "etnaviv: convert perfmon queries to acc queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20e0ef88ed9bebc8f34105e1c8bb725009bb7f98", + "description": "etnaviv: move generic perfmon functionality into own file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c111f79b1ce962a9ef138d0d5c894258961e83a5", + "description": "etnaviv: extend acc sample provide with an allocate(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0bc251ef8918dd4fe89604941d8d5a0c482aae7", + "description": "etnaviv: extend result(..) to return if data is ready", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5b0eed0f57fefebcf6ac18b008fa362d6543da9", + "description": "etnaviv: make use of a fixed size array to track of all acc query provider", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6963fcd81fe1a502a68214adc2f3e720a9f9a4c1", + "description": "etnaviv: extend acc query provider with supports(..) function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f47b4eddd93ca538d2ebc90666955b7e2ee06494", + "description": "etnaviv: rework wait/flush logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1697fef1a92c9be74ff7447e1e16a9e8e4314f1", + "description": "etnaviv: reset no_wait_cnt after triggered flush", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "238190403068edf4c6a9b761095193a88bea841d", + "description": "etnaviv: explicitly call resource_written(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2c4892512732bbf08951a188e4a5774951d6f86", + "description": "etnaviv: rework etna_acc_sample_provider", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46096a4cb40885b202efeb1c3986a54548538a49", + "description": "etnaviv: rename hw queries to acc queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7af813d48a55a14401df51870feddfaa418397e1", + "description": "glx: use anonymous namespace to avoid -Wodr issues when building with LTO enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17d783b2ed4f8b18d8577c39897d6c81e1f88876", + "description": "glx: fix 630 times -Wlto-type-mismatch when building with LTO enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0a4df7e4f15ceb18fc0053b4fdd7d0cf567df4d", + "description": "Revert \"spirv: Rewrite CFG construction\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "fa5a36dbd474fb3c755da51553c6ca18dab76a06" + }, + { + "sha": "51492f20f7e8fde5077f5c54165307eeb4cd1f2f", + "description": "Revert \"gallivm: disable rgtc/latc SNORM accellerated fetches\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "4897e70ccd3987d470ec8622d473ee3405f6e96f" + }, + { + "sha": "aa95b6aed5bca8c56bb09d9d0a2c92184f1ba671", + "description": "gallivm/rgtc: enable fast path for snorm types.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03204dadbc1829128f3e0a5e74f4f85851f6e708", + "description": "gallivm/rgtc: fix the truncation to 8-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b06adb75054842294e4dbbe2e5af294470862fb", + "description": "glsl: don't limit fp16 lowering to frag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f054230ea3cb17409233660efd562be28d914127", + "description": "freedreno: limit fp16 to frag and compute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c0d56efa31e37e9ec35cf0b65c578c513976ed32", + "description": "freedreno/ir3: also precompile compute shaders for shaderdb", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37e052c8b0882904d80ab1721ccb1ebed727af9f", + "description": "freedreno: fix missing locking", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "d0b3ccb06076c921e60afbf0810b3b50bbce39e4" + }, + { + "sha": "f8fc690d1c2720d36893daf9beb95ec60e64a34a", + "description": "freedreno/a6xx: add some compute logging", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "629c0cee0a4c05e7096189c6bcd8b3d7d164f5f2", + "description": "freedreno/ir3/cf: use ssa-uses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72f6b03aec42dcfa7b38aa4eb1c94a0143e2aed0", + "description": "freedreno/ir3: add a pass to collect SSA uses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67dbe8088fd97e944ea9adbf7080d63f8343f475", + "description": "freedreno/ir3/cf: skip array load/store", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2d0cc8b8d5bd2f3b194642fd0187283da51d4ae", + "description": "freedreno/ir3: fixup cat3 32b vs 16b", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e73a8a9703b3be23ece12c1092185926522b6c4d", + "description": "freedreno/ir3/cf: handle widening too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf64648864224abe28d883f0c878214530ccf08c", + "description": "nir: fix definition of imadsh_mix16 for vectors", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "c27b3758fa0dcd7fade9e85c5483b8310b8263d7" + }, + { + "sha": "1d293096d0a223ea903125db7756b31aedab451a", + "description": "aco: use MUBUF to load subdword SSBO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cfddc91999965545eb2d973557840354f40a2fa", + "description": "aco: implement 8bit/16bit store_ssbo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3df0a41c75256d0f71d407981962fc803bbd9afc", + "description": "aco: implement 8bit/16bit load_buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c70d01445504ef9c008ed7d80f26bad9d676b61d", + "description": "aco: implement storagePushConstant8 & storagePushConstant16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5718347c2b42ee25e5377d40024aaaa929889c44", + "description": "aco: implement vec2/3/4 with subdword operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85521061d63c47b931e6d3e4a1ea540e46d358a6", + "description": "aco: prepare helper functions for subdword handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe08f0ccf94a7315bded5868b4f6a8bae744de79", + "description": "aco: add byte_align_scalar() & trim_subdword_vector() helper functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23ac24f5b1fdde73cf8ec1ef6cbe08d73d6776f5", + "description": "aco: add missing conversion operations for small bitsizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d223e4e8de8207b320473207764f2a4d6299b2ff", + "description": "aco: don't vectorize 8/16bit load/store_ssbo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bb35376764f0770df8f84e6383487f17328136e", + "description": "aco: don't assume split_vector(create_vector) has the same number of elements when optimizing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c436743b0c43f73b205b8845453fdbaada63f0d8", + "description": "aco: don't propagate SGPRs into subdword PSEUDO instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f1712ba2f833d1b20aff9d2873e41bae1adb92e", + "description": "aco: lower subdword shuffles correctly.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca38c1f1f1cb3d2d25eee2e0806cec452b31d164", + "description": "aco: add builder function for subdword copy()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f779a25187d1f453255ed7dd922304545b73f5c", + "description": "aco: small refactoring of shuffle code lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0680b258f4d58d2948cf12fd04ee141eee023a16", + "description": "aco: align subdword registers during RA when necessary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "031edbc4a54d5685b05e244f8aa1e094ec246eb5", + "description": "aco: adapt register allocation for subdword registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c74fc98b803dfbc4b8970e07daa944e2d591bb9", + "description": "aco: create helper function to collect variables from register area", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aca2bbf9758307716e54fabc49d0e2682b737e8a", + "description": "aco: add notion of subdword registers to register allocator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90811554da8afca7099abe7c7c66e6b1c009e16f", + "description": "aco: remove unnecessary reg_file.fill() operation in get_reg_create_vector()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7de003473cca40e36b8116a39b9457a371fc10fc", + "description": "aco: fix Temp and assignment of renamed operands during RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d957311f114a2999b25ddfbbb5a41235e98d5de", + "description": "aco: print subdword registers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c0c28a1ffbc12b8d389e6db1782253f01cbd9c2", + "description": "aco: validate RA of subdword assignments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "799bb103283d3324075af1277c41d44d90201034", + "description": "aco: validate uninitialized operands", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9374659426197d8d9a686332e2d8c82b124c6fff", + "description": "aco: validate register alignment of subdword operands and definitions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad4e104bb98019d51b1f20798c0754f2e051a8cd", + "description": "aco: validate p_create_vector with subdword elements properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f01bf51a2b5769aa7bb71f3c76f700c3b4257ac1", + "description": "aco: refactor regClass setup for subdword VGPRs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c4223fa512251792beaeee0bd28b7e9cd0b4fe9e", + "description": "aco: add emission support for register-allocated sdwa sels", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8acb38447116aa294da47f17424e329c34eccbd6", + "description": "aco: add sub-dword regclasses", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9915af5ca17f94f58135413cd8034b4733c0abb4", + "description": "aco: print and validate opsel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b84d59af50a53959fcde232ee2682e77569a7da2", + "description": "aco: add SDWA_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00312f3c95d9ef2f545a8479d6ad289bc791974b", + "description": "aco: add comparison operators for PhysReg", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34424b81df6e5ffb2d22c572864ab6f6b4ac1abb", + "description": "aco: make PhysReg in units of bytes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc69738b0fc787fd9183a815ce43c06e005ec4b3", + "description": "nir: fix unpack_64_4x16 in lower_alu_to_scalar()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "373f1eb9deab9d9435fba15be6de2a28bde79b87", + "description": "drm-shim: stub libdrm's use of realpath()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3e305616cbc53317bbace6f1f316c9167f14313", + "description": "drm-shim: return device platform as specified", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa5a36dbd474fb3c755da51553c6ca18dab76a06", + "description": "spirv: Rewrite CFG construction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2de5a41595442a02b5375d13082e236e1475c0bf", + "description": "spirv: Add a parent field to vtn_cf_node", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d94e464a9fc5da334ae224810f855fff6890be50", + "description": "spirv: Make vtn_function a vtn_cf_node", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "255aacbec14c2d11d7756ec94b95244165120ff6", + "description": "spirv: Make vtn_case a vtn_cf_node", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d7fcf1de04b69d8657671220265f8ec5e1cd274", + "description": "spirv: Add cast and loop helpers for vtn_cf_node", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c5c65d0d6a64a3bcc057e5cf61a94eda4b72f86", + "description": "spirv: Add a vtn_block() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "991c4261604b136cac0770c7d6c7345ea134129c", + "description": "intel/nir: Enable load/store vectorization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36a32af00897ee32ace344d020a8522c0a8c1a92", + "description": "nir/load_store_vectorize: Add support for nir_var_mem_global", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6273291b5646887c8488f71b2119709e15e7d0e", + "description": "nir/load_store_vectorize: Use nir_iadd_imm for offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04d08ea149c05e4d5dad819269d74713aac270da", + "description": "nir/load_store_vectorize: Fix shared atomic info", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "ce9205c03bd20d26af23ca891e97a9f848a612d1" + }, + { + "sha": "c1bcb025dba7b73a865916dcda616d0479c94476", + "description": "intel/nir: Lower memory access bit sizes later", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1883cc73d4ea2c6d3a73dfe55c8b346f3ef8ac6", + "description": "iris: Set alignments on cbuf0 and constant reads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c8b1003889bfb0f708d91dc7caa08a37f9caef4", + "description": "anv: Improve brw_nir_lower_mem_access_bit_sizes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6439792287f11f25cb2b62d699f52daefe54a44", + "description": "intel/fs: Choose memory message type based on bit size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6e7645347288c7dd8a4b95d69a4617278aa7b7c3", + "description": "ir3: Disable copy prop for immediate ldlw offsets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea51f8f79ac43ff00d78cd53266f92125b2d5fd4", + "description": "radv: fix null winsys gpu_info array", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "de550805c5d96b17e7b7db4a0c62b29db354fd74" + }, + { + "sha": "319158a814ae0833573d5e4ff7150504aebae59b", + "description": "pan/midgard: Fix a divide by zero in emit_alu_bundle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "355abfeed5fd234433a24ce983e3abc48c2d2b58", + "description": "turnip: Advertise 8 bit subpixel precision", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61566f2ae18c7d4906f24150965aea809b7680f9", + "description": "mesa: update pipeline when re-linking a program in use", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1288ac7632b31a20497a0e75f374f66ce3d5bc3c", + "description": "nv50: don't try to upload MSAA settings for BUFFER textures", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "3bd40073b9803baf62f77ed5ac79979e037d2ed6" + }, + { + "sha": "b38c32a57380c228813ec59823fd5510ee93ce4c", + "description": "intel/aub_viewer: fix access to freed memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e6aec668772eb9cac014d78ba95272a063167fe", + "description": "radv, aco: collect statistics if requested but executables are not", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "507956ed04fcdcfd44419d1b16f032e1d81d0dcb", + "description": "aco: add vmem/smem score statistic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1544352c022953febcc2c2c448ba21551e6b215", + "description": "aco: add various compiler statistics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad2703653f306f0fa751ddfd546d1d93ce348630", + "description": "radv: add code for exposing compiler statistics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bfb9c08e5c5474688611c339135b8feeedc9bdd3", + "description": "EGL: Add eglSetDamageRegionKHR to GLVND dispatch list", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "9827547313c7239486efbd4067529575f98f1622" + }, + { + "sha": "8af2eba4245636ff867743577433cff4009e16c7", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a89b08b7449c6188b8f129f43c84f229b5101b0b", + "description": "docs/relnotes: add sha256sum for 20.0.4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "71e6f15a240fed8c73059794c97c0f4b78f11342", + "description": "docs: add release notes for 20.0.4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43f785419cba3072fdfd3130ce3e51b37485739d", + "description": "util/xmlconfig: fix sha1 comparison code", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "8f48e7b1e991375f6b03fac09a4f4416eb2f1afc" + }, + { + "sha": "655e8449d0194e8482ec25e914ce7dd7fccb4f97", + "description": "radv/llvm: enable 16-bit storage features on GFX6-GFX7", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cd5450df52c6a314979f5dadf2f6f9d83deb533", + "description": "ac/nir: split 16-bit SSBO stores on GFX6", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55fdcc03de8dd7cf62d5b6e3d2369c55e222a822", + "description": "ac/nir: split 16-bit load/store to global memory on GFX6", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7308f2e9121d90de55da57c9e7ec06ff2b2212bc", + "description": "radv/llvm: enable 8-bit storage features on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6bf1597d1e8abf122371118b04a85ee0aa6b3d5", + "description": "ac/nir: split 8-bit SSBO stores on GFX6", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "433f3380eb2ba97363ec8f47bc7d29904a4d355e", + "description": "ac/nir: split 8-bit load/store to global memory on GFX6", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c953292630985cdd0d295f64e880610710bbf50d", + "description": "aco: always optimize v_mad to v_madak in presence of literals", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63b4fcba33848029e7dd1476d9f82070308a7239", + "description": "glsl/lower_precision: Use vector.back() instead of vector.end()[-1]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff1a3a00cb37d84ab9a563f0aa241714876f56b4", + "description": "clover: fix build with single library clang build", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2243f0cd0178fdc5063d049f192b4365122207b9", + "description": "radv: Filter extensions not whitelisted for Android", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6368d404b7d69649b3d606c14dbf9610ba92494", + "description": "st/vdpau: make query test for 2D support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1cc79739a8745f2004917a2b170dba4124ef323", + "description": "st/vdpau: avoid asserting on new VDP_YCBCR_* formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c71c1f44b055c680f073a2608a3bf560b55f8974", + "description": "nir/from_ssa: Only chain movs when a src is also a dest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73e574acb85c06386dd59f11401eea43a2895d5a", + "description": "freedreno: Rename RB_DONE_TS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36133a5434d38d8a4983df3fcd31b7e5dccf00cf", + "description": "freedreno: Cleanup event names", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2077421437038e3bd73544c34f567601b912ce09", + "description": "gallivm: fix stream id fetch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a3a8806707b7ba4fc2e79896b63c35b1d9f80f6", + "description": "gallivm: switch the mask6/mask7 cases for signed rgtc formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ebb5b88a02637908b9f4bfd0644964d6347b23af", + "description": "gallivm: fix rgtc2 format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cc27d59a11ed11081b3f5c9acc3280ec412ebed", + "description": "anv/image: Use align_u64 for image offsets", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e3a7dcf6ee4946c46ae8b35e7883a49859ef6fb", + "description": "gallium: enable EGL_EXT_image_dma_buf_import_modifiers unconditionally", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0aa203fa94a58f12b0ee10ee47ba9c59bbd43f1", + "description": "driconf: whilelist more games for glthread", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0f836e5aef29c5887cb5f3d3cba2b1d7d5d78fd", + "description": "tracie: Switch to using shutil.move for cross filesystem moves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b7dbd4fc832eb67a4afd013f8cb623cedcf0d51", + "description": "wgl: do not create screen from DllMain", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99a0864b481eaf3e8c50e6057628779096747333", + "description": "wgl: move screen-init to a helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "098d4cf25f34183f35482ee40d4ef2c4e810269d", + "description": "wgl: drop unused member", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a8da6102d03d50ca5b220567e78e3fa1f922fba", + "description": "wgl: drop pointless debug_printf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbc86fa3de6aba480f679a36b40227c0fe27c37b", + "description": "radeonsi: dump shader stats when hitting the live cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8306c533fe6b220b7ac3b40084266a47640fcf33", + "description": "gallium/util: let shader live cache users know if a hit occured", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d259768e62591b4e5bb49042d6535f0a76164e3f", + "description": "glsl_to_nir: remove dead code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "191ced539a18e4738e7e6bce7612779dced1625a", + "description": "anv/pipeline: allow more than 16 FS inputs", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "460de2159e2aa8e67f216d9ad8e9ce00cc8c9679", + "description": "intel/compiler: store the FS inputs in WM prog data", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67c7cabd7fa9e6babb423080d53a045980e295ef", + "description": "anv: use urb_setup_attribs in SBE", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ac9f362e0f848ef5bdc6ede12d0389bb7407ff6", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a264edd74c746409ec2d9231c6475361668e5d16", + "description": "docs/relnotes: add sha256sum for 20.0.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e01090b54b721fbfe8d085e9b1dcf69f9f9cd99", + "description": "docs: add release notes for 20.0.3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a2fd4c5308dee51d48630863255f1c6a04768a9", + "description": "gallium/llvmpipe: add an optimised 32-bit memset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c07bbdbe8268a2c80c602f71eb413f0d84920038", + "description": "nir: place aligned members after bitfields in shader_info.tess", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f1dd81ae1047304a4cfb0861cb85c69a2ae776ec" + }, + { + "sha": "90a8b458acea4231a921962bee220ba76e6517a2", + "description": "nir: check shader type before writing to shaderinfo.tess union", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f1dd81ae1047304a4cfb0861cb85c69a2ae776ec" + }, + { + "sha": "e47bf7dadff612694cf61eedbabc7b313766053f", + "description": "anv: Do not sample from 3d depth image with HiZ", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0487130d3477c85abee77018e17d0ffe2d136349", + "description": "gallium/swr: Re-enable scratch space for client-memory buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37b8130bf9ecfc8c9138211ca9d89554bf70d47d", + "description": "gallium/swr: Fix array stride problem.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1e7e83d52d912b9a1ba5009e8f84d737c1e0d21", + "description": "ci: Consistently use -j4 across x86 build jobs and -j8 on ARM.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f424c83e072f6a21d15af1064f6e744e801fbfa", + "description": "aco: only break SMEM clauses if XNACK is enabled (mostly APUs)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68f325b256d96dca923f6c7d84bc6faf43911245", + "description": "Revert \"spirv: Implement OpCopyObject and OpCopyLogical as blind copies\"", + "nominated": false, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "7a53e67816ed9baf7d825ed60ee59f0c05f9df48" + }, + { + "sha": "91478db20d5ccfeb9c35652cf01acdcfcd285414", + "description": "loader: fallback to kernel name, if PCI fails", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf1838838a695f27d9d9c486e608a9412044a598", + "description": "loader: move \"using driver...\" message to loader_get_kernel_driver_name", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3572f977fc662bdecdac4f525f415bdc7e62147", + "description": "loader: simplify codeflow in drm_get_pci_id_for_fd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "164f4a9a4a32b8e638ce3f3c684d147d70a3b1de", + "description": "loader: simplify loader_get_user_preferred_fd()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25b2b325885710dc7dc7539d6d37e77182767088", + "description": "loader: use a maximum of 64 drmDevices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3c91439713ecf025c7fe97aae3a4829b3f1250b", + "description": "Revert \"egl/dri2: Don't dlclose() the driver on dri2_load_driver_common failure\"", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1b87f4058de84d7a0bb4ead0c4f4b024d4cce8fb" + }, + { + "sha": "fa5e800e05ab227786862383d3243e06c06d36d7", + "description": "egl/drm: reinstate (kms_)swrast support", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "47273d7312cb5b5b6b0b9faa814d574bbbce1c01" + }, + { + "sha": "b699d070a6de273fb3a964e05944b203d0b57090", + "description": "glx: set the loader_logger early and for everyone", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d971a4230d54069c996bca78b6ed6a6a23377821" + }, + { + "sha": "06f758b0931794f5b8edb23587633f172e1b685d", + "description": "meson: glx: drop with_glx == dri check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70ac7f5b0c46370075a35067c9f7dfe78e84b16d", + "description": "mesa/main: remove unused macro", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ddd9d454c16959d92e6c785aac77ead83fab0b3", + "description": "mesa/main: clean up extension-check for GL_TEXTURE_EXTERNAL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd6b35c99ee6e4cb1be60799fd653ced2c73940e", + "description": "mesa/main: clean up extension-check for GL_RASTERIZER_DISCARD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0006dfbaed1376b88e1f2eadbd2da3ba3cd86ef3", + "description": "mesa/main: clean up extension-check for GL_TEXTURE_CUBE_MAP_SEAMLESS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "994675b24d74e9092d9b41541436e911ae8faf18", + "description": "mesa/main: clean up extension-check for GL_FRAGMENT_SHADER_ATI", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "541708680f471eb3bd984bdecca7ba36fe12f0aa", + "description": "mesa/main: clean up extension-check for AMD_depth_clamp_separate", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e2dbd31dc0eb69f3a1df262842635c6ff8103d84", + "description": "mesa/main: clean up extension-check for GL_DEPTH_BOUNDS_TEST", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67a7022f83965795c5a5080f3ac5aedb7d9387b3", + "description": "mesa/main: clean up extension-check for GL_STENCIL_TEST_TWO_SIDE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "421a1accf0e7840f476d88e63e5861ec4f8deb76", + "description": "mesa/main: clean up extension-check for GL_TEXTURE_RECTANGLE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81d901aef1582dfaf7e5f27f01fc9928975ad047", + "description": "mesa/main: clean up extension-check for GL_VERTEX_PROGRAM_POINT_SIZE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5e781aa8085ef5f6fcfbf3dc09fc8899e64e13d", + "description": "mesa/main: clean up extension-check for GL_VERTEX_PROGRAM_TWO_SIDE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12e228fc9c7aa06809797d4b706ee05a2eb7c735", + "description": "mesa/main: clean up extension-check for GL_VERTEX_PROGRAM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23570066bfdeebb54953f2c3ac6c7dc73e368f38", + "description": "mesa/main: clean-up extension-checks for point-sprites", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70b6972140e74037109b7f8c57d442c8316d09dc", + "description": "mesa/main: correct extension-checks for GL_BLACKHOLE_RENDER_INTEL", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "74ec39f66d506c78ee62a685b7fa055faa0991b9" + }, + { + "sha": "1e3b74ee73f897bfe50c9bf27458c95870d8c317", + "description": "loader: Warn when we fail to open a device node due to permissions.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15a9f6c07228f59401954bb18b9c2f980297f634", + "description": "svga: Treat forced coherent maps as maps of persistent memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46fdc288fb52345134fd9aacd6d7ff71c7b747bb", + "description": "svga, winsys/svga: Fix persistent memory discard maps", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "71b43490dd04c03d4027230b0939b81ab91650ca" + }, + { + "sha": "1b16d6354bc9f64ed97fc400977e3ffcb4c09268", + "description": "pan/bi: Fix outmod/roundmode flip", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12cf9f43f02ac00b9604e12f1fb26e363941d90b", + "description": "pan/bi: Handle fmov class ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "357b8b59065c50dd4d8eecf437bb721be38092bd", + "description": "pan/bi: Fix unused port swapping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b150fa214b259f0039293c2e727bb77d7417c541", + "description": "pan/bi: Add cmdline option for verbose disassembly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae4f48b2bc3b9e1969be738fe230a7b5a880bd0d", + "description": "pan/bi: Don't set the back-to-back bit yet", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b241c70b6a1f39840aec2fe5db43f0e33221d7b", + "description": "pan/bi: Use STAGE srcs for scheduler nops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2292e2aa10a3ef77ef4d195c09aad334a18bd080", + "description": "pan/bi: Fix writes_component for VECTOR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b033189dd7d0e2c403ed17b5ec5e76b0c93dbb36", + "description": "pan/bit: Wire through I/O", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b26214e9075c5b8dfc24118a1724b5dd3bb5e22b", + "description": "pan/bit: Add `run` mode to the cmdline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb56d5d9f8b6df7f50cb60f734363b3c769d8d8c", + "description": "appveyor: Remove Meson job.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59754409cc6e9c9e8f9f82a4a523e7370c373a07", + "description": "freedreno/log: fix build error", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b097e326b8b066d3697c79aec2c6c32c453757f9", + "description": "nir/algebraic: Remove a redundant fabs pattern", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af1bc7e0c7dd1f3c4f2226f93e819e410fd7a731", + "description": "nir/algebraic: Use value range analysis to convert fmax to fsat", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62795475e8f45f92bb8f467d9e2318fdfdba6297", + "description": "nir/algebraic: Distribute source modifiers into instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c0bdf37c9100c4e473f53defccab4e2ae6b7a7b1", + "description": "nir/algebraic: Change the default cursor location when replacing a unary op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2b4f3f1374c179e066b1fec56875613b7e64945", + "description": "intel/vec4: Allow late copy propagation on vec4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f4a81430e65e09db13d2472fd46105a95ea625d", + "description": "nir: fix crash in varying packing on interface mismatch", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "26aa460940f6222565ad5eb40a21c2377c59c3a6" + }, + { + "sha": "31011c7a39f0d054b97b730c8928176d72c9707c", + "description": "freedreno/turnip: Use the NIR info to decide if we need helper invocations.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "974b9c57c1efec7b58339f23d0d35bae2c6d9890", + "description": "freedreno: Drop an unnecessary include marked \"this should go away\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "127fa5d00c3fbcc23b7f5fea4ab50da061477cba", + "description": "freedreno/ir3: fix android build", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e5339fe4a47c242693962c9f90bbab8b74935cba" + }, + { + "sha": "ae7da1a01706835120bd59ea069e49cf325feaa3", + "description": "util: move ALIGN/ROUND_DOWN_TO to u_math.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a53e67816ed9baf7d825ed60ee59f0c05f9df48", + "description": "spirv: Implement OpCopyObject and OpCopyLogical as blind copies", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88c046a6d3228cc3a667cba96e4fa57c341ab162", + "description": "isl: don't warn in physical extent calculation for yuv formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "015f08dd4301ee6f6c00d76c03a197d1522a1e51", + "description": "isl: set bpb for Y8_UNORM", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f4d9b419a1c931ad468b8b22b8a95b1216891e4", + "description": "scons: prune unused Makefile.sources", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2e92d3381988a85b2a6dcc8d8a8d7158ace9f348" + }, + { + "sha": "d63acce5f43b6feb8ae81d62f5c2d6976384d81c", + "description": "tu: Return the correct alignment for images", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d84c206d85c15cb8bc7e2d3113ab40c2b65f47cc", + "description": "freedreno/fdl: Add base_align", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "896a7c28eb4b993751bb49659de7511b3fc68756", + "description": "anv/allocator: Use util_dynarray for blocks in anv_state_stream", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63bec07e14d1cd8e01bf45bcda341bb364620cfc", + "description": "anv: Account for the header in anv_state_stream_alloc", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "955127db9376b49a41a428f5f89137cafec89b1c" + }, + { + "sha": "6e672074dd1f3c105396a9d7a9bc35ea785569c9", + "description": "st/mesa: add environment variable pin_app_thread for faster glthread on AMD Zen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4df3c7a2079bc6d11149fa42ff0ca27ea70a7942", + "description": "gallium/u_threaded: call the driver to pin threads to L3 immediately", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4de35bed423a9e4204498b83b5be7f16399363bc", + "description": "lima: also check tiled and depth case when import", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e46b2ef7243a7f916b7d77f3495bea26f4f24d62", + "description": "lima: fix buffer import with offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "02ad147c5c80a124630992ae6c5ae705c6c68bed", + "description": "pan/bi: Fix handling of constants with COMBINE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd19e7634027036dfc67633579750f1d45a45b74", + "description": "pan/bi: Handle fp16/abs scheduling restriction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c88f816169cf2efa0bfcbe1e9a5b0c7948fb1ade", + "description": "pan/bi: Handle abs packing for fp16/FMA add/min", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba8e11f0f173fd5e5ec376c6e7e582ea845b7499", + "description": "pan/bi: Handle core faddminmax16 packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12a16f224767e3c235f79aa2dbacf1bfacdc4659", + "description": "pan/bi: Structify fadd/min/max16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c12a208d78203ccd5377b7b3291018c5d2f5b08a", + "description": "pan/bi: Add v2f16 versions of rounding ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f81b67b857bd941ada721aa77ccbf430456eff0c", + "description": "pan/bi: Handle round opcodes in frontend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7170e974234d5c5bd1a6f5f7b2f76ef5acc44c1", + "description": "pan/bi: Assert out i16 related converts for now", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2fd8b2e6d4ca6fd9276baf4002228de99350e8c5", + "description": "pan/bi: Add one-source f32->f16 op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "197c6414ea4dc61fa115b082aed694e7d36b69e5", + "description": "pan/bi: Add bifrost_fma_2src generic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57a8e6e8d0e52f7be6b187bb4ac5112341515f91", + "description": "pan/bi: Handle standard FMA conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "499e97b5196632de4a2c4e461e849df2897ae14b", + "description": "pan/bi: Enumerate conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "902f99a45d3e1a7e1ef85429c0ed4e067b2656f3", + "description": "pan/bi: Expand out FMA conversion opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73715124ea53df1a3ef8cae6097556b98611dbb4", + "description": "pan/bi: Pack outmod and roundmode with FMA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "158f2452f23545005f2213801e58f089ade9b7ed", + "description": "pan/bi: Add FMA16 packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5148b6b490b19b4f525dc07b470cf088e0eead3", + "description": "pan/bi: Fix missing type for fmul", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5eb209a05f61dc9ab5347a0e9dcd2e97c91f1b37", + "description": "pan/bi: Finish FMA structures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "375a7d0f32ff7ea94da9c975aa1a852d848e254b", + "description": "pan/bi: Ignore swizzle in unwritten component", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa77d8128e93e2ea637c7fcacb88f628ecdb1239", + "description": "pan/bi: Handle f2f* opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2a8ef907b528b125ff0827e8ea2588a85022fd6", + "description": "panfrost: Enable PIPE_SHADER_CAP_FP16 on Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "77e04eb2e2e0170343ab424f51ace3fbc175cb77", + "description": "pan/bi: Enable precision lowering in standalone compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "683cd9b6f4f8c7c9531b39c5c5ef6f21a5396ea5", + "description": "pan/bi: Fix off-by-one in scoreboarding packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3726a08743a100c7e163489800fd0560da015b9", + "description": "pan/bi: Fix overzealous write barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d7166fa698f046814eb3803ec9ef5a5438e816f", + "description": "pan/bit: Begin generating a vertex job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0d1be30e1b4bc6f9440851c183ea03609b4f253", + "description": "pan/bit: Submit a WRITE_VALUE job as a sanity check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "97029c773e8c62378b2dae76ac813a8a8b9232b8", + "description": "panfrost: Stub out G31/G52 quirks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf1929e4792ab5d6416b8a707bf2a8c2e694bc62", + "description": "pan/bit: Open up the device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39378eec578c4855dbcad19605242ca038e575ee", + "description": "panfrost: Move device open/close to root panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd18695a2697bf54cf11894959780c2c761a1808", + "description": "pan/bit: Link standalone compiler with en/decoder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f65f00a0dc438350454ba247b453cf80271a671", + "description": "panfrost: Move pan_bo to root panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3283c7f4dadafee97e9af0e6613da43fad3c0019", + "description": "panfrost: Inline reference counting routines", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "02a638cc51491a827d852313e780ca0c35f699c9", + "description": "panfrost: Isolate panfrost_bo_access_for_stage to pan_cmdstream.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca8c62592c71885df653ecb008f5c0bad71420d4", + "description": "panfrost: Split panfrost_device from panfrost_screen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50e3b2e3904074422f5d4d9ceccfc3cce64199eb", + "description": "panfrost: Correctly identify format 0x4c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd87bcb8ac9039dcae675cef977c08eadb75c438", + "description": "panfrost: Add support for R3G3B2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "49a81a431e0b5aef53da2319b327da717603fbc6", + "description": "st/mesa: Fall back on R3G3B2 for R3_G3_B2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81d059c89859841bf6c4c74e29d2dd95f6dd93bf", + "description": "panfrost: Add support for B5G5R5X1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bad6fc48712b761351bdd63e92859456a225d53e", + "description": "panfrost: Mark 64-bit formats as unsupported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9468f0729b1f826a8b8e84e4dea58d4a3bfe46af", + "description": "nir: Handle vec8/16 in nir_shrink_array_vars", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c26bf848ba7c5474ac99ffbe942021d8841e53ed", + "description": "nir: Handle vec8/16 in opt_undef_vecN", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99540edfde8576b751f96ae0c686ea6300c8a5ec", + "description": "nir: Treat vec8/16 as select in opt_peephole_select", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3554a293bfc8364b98745dc5a8d219185a84af9", + "description": "nir: Handle vec8/16 in opt_split_alu_of_phi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2aab7999e45d2dc6c61e4fce094aa114d57e2c7a", + "description": "nir: Handle vec8/16 in lower_regs_to_ssa", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1033255952b4555b4435c6e92cdc8119a353697a", + "description": "nir: Handle vec8/16 in lower_phis_to_scalar", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac7a940eba264d3df556df025162df8cbad5da37", + "description": "nir: Handle vec8/16 in gather_ssa_types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a18c4ee7b07cb0c78b7d93005cc76eded4e8001c", + "description": "nir: Handle vec8/16 in bool_to_bitsize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5bbdf7621f882f3e769ea6941f0b6ad46e032d7", + "description": "nir: Copy propagate through vec8s and vec16s", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "842338e2f0bdf2b7025f2d29851aa90dd2106777", + "description": "nir: Add a nir_op_is_vec helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84ab61160a18edab0e1698e1e54e560b57d5a9ab", + "description": "nir/algebraic: Add downcast-of-pack opts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14a49f31d3977c2b072b9ef2fdeebebca69fe1d7", + "description": "nir/lower_int64: Lower 8 and 16-bit downcasts with nir_lower_mov64", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "62d55f12818e55bdbe83cf164f9af37a1a6d64d8" + }, + { + "sha": "1b3aefad46bda59ff02c0d81c53fd3fbf249d8f4", + "description": "freedreno/log: avoid duplicate ts's", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2bf7dba80bb196bbb557e26017a5297c80fe2428", + "description": "freedreno/a6xx: add some more tracepoints", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31173a7e7afac301eac4ab97c830542a803fa35c", + "description": "freedreno: add some initial fd_log tracepoints", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55839fd41c81701c817edc0b227aef363c3f3e36", + "description": "freedreno/a6xx: timestamp logging support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0ca1462f31747d028abe9106309f6c95c3daabf", + "description": "freedreno: add logging infrastructure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ffd32266780a83695ae5dd8d36b73fe970cfe4dc", + "description": "util: fix u_fifo_pop()", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "6e61d062093a71e267aed02870607fc5a0d7d8f4" + }, + { + "sha": "356b93f1023649b48774f56c7ec127bb327e8272", + "description": "freedreno: remove some obsolete debug options", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b113170559b25a14439264e9f6aa5c0a41045589", + "description": "nir/opt_loop_unroll: Fix has_nested_loop handling", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "87839680c0a48a007bce2aca9f056694ad8bd35d" + }, + { + "sha": "92afe94d28b8e6cb016fdbb59e415ec7257f5512", + "description": "freedreno: Work around UBWC flakiness.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0b3ccb06076c921e60afbf0810b3b50bbce39e4", + "description": "freedreno: Fix detection of being in a blit for acc queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57d54bcf994f2b3a963f73775a3dd756e8d96be3", + "description": "freedreno: Rename \"is_blit\" to \"is_discard_blit\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cdc6c1e4b632cea6934836ca4962a555a1172c6", + "description": "freedreno/a6xx: Fix timestamp queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ef61c1f1008f26a53db5fdfdb39ea1968c40284", + "description": "freedreno: Count blits in GL_TIME_ELAPSED and perf counter queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a0783994810fd00e0a6727e902796dc2abe2b41", + "description": "freedreno: Associate the acc query bo with the batch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36612c96bd2a354b4c31eeb331d2f4bbad2f210e", + "description": "freedreno: Fix acc query handling in the presence of batch reordering.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a99ff933748a7ad00f9fcb5cb0dde536c3bef149", + "description": "freedreno: Remove the \"active\" member of queries.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7fe793869e1f9b0a8013c5c5e161122e326540e", + "description": "freedreno: Remove always-true return from per-gen begin_query.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ef9658906655edb6b2beaf2951a9dc81d93b827", + "description": "util/u_queue: fix race in total_jobs_size access", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d101ca3f5ad85731cedbe7ab399d4323cca1aac6", + "description": "glsl: fix race in instance getters", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5b14d983e5afa1b8f75e6f3692830a1ee46d1df", + "description": "nir: Set UBO alignments in lower_uniforms_to_ubo", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "fb64954d9dd55b45239c27af122bf60c3962d006" + }, + { + "sha": "4a909068ade7125e32e626c870d2197e1f5896c2", + "description": "aco: look at p_{extract,split}_vector's definitions in pred_by_exec_mask()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9197fd59dafefa0c1f8b049ff53381cd1a616ae4", + "description": "CI: Re-enable Windows VS2019 builds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb64954d9dd55b45239c27af122bf60c3962d006", + "description": "nir: Validate that memory load/store ops work on whole bytes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e80151c5d75bf5d4b67b0791c3eb06515345a83", + "description": "anv: Set alignments on descriptor and constant loads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c217ee8d35fcac8ab11e7b5bfd0e053e1fed7df0", + "description": "nir: Insert b2b1s around booleans in nir_lower_to", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2dfcee7f7ebf87dae9570f1c7476eacb6240f83", + "description": "nir: Use b2b opcodes for shared and constant memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16a80ff18a0c6210f9c4c6d2668537dba2349608", + "description": "aco: Implement b2b32 and b2b1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2db84153a75f44daa6c5ca259a62682f714f723", + "description": "nir: Add b2b opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cb9cc56d53c20109c1deccd4e12cf2ee015aafb", + "description": "intel/nir: Run copy-prop and DCE after lower_bool_to_int32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5278e9dea7e6b91fb6a915b775da5e14dcbca811", + "description": "etnaviv: compiled_framebuffer_state: get rid of SE_SCISSOR_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22ee3eabcad25b3819aad7ca168315b540a84769", + "description": "etnaviv: s/scissor_s/scissor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43b4eb394cd8fe6cdf46111152354fc59fb235b0", + "description": "etnaviv: get rid of struct compiled_scissor_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9491c1b04d1d85335b2a9be6dafe86ad38e17075", + "description": "etnaviv: do the left shift by 16 at emit time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ba2d398d8a8e8b8d1bf90aa3f68df87f52c7844", + "description": "etnaviv: rework clippling calculation to be a derived state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95763e20cea3e85e7886421a73be7a68a84b5c80", + "description": "etnaviv: get rid of SE_CLIP_*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27d58a1c20cc39a87fe36455f221f6bd4ef811c0", + "description": "gitlab-ci: Prune all SCons jobs except scons-win64, and allows failures.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3935a729d998274ba78ab70e9eb6dd7dac2c2368", + "description": "nir/algebraic: add fexp2(fmul(flog2(a), 0.5) -> fsqrt(a) optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e92d3381988a85b2a6dcc8d8a8d7158ace9f348", + "description": "scons: Prune out unnecessary targets.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f847b18bc91dced5725169e8c96bef6c077db90", + "description": "aco: Don't store LS VS outputs to LDS when TCS doesn't need them.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "798dd98d6e530afc5dab2f973785fbbd4e598dee", + "description": "aco: When LS and HS invocations are the same, pass LS outputs in temps.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a91c086b8649a65befa3fdf3ef8460761bb87aa", + "description": "aco: Extract store_output_to_temps into a separate function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f35b3795d131517c6dce15d86783dd98951548a", + "description": "aco: Fix workgroup size calculation.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a8d15ab6daf0a07476e9dfabe513c0f1e0f3bf82" + }, + { + "sha": "99ad62ff277df284f4e6a460db7f72a463ddedc5", + "description": "aco: Extract setup_tcs_info to a separate function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ad65f2c55623e8578c39c5837e357f5566780cf", + "description": "aco: Zero-fill undefined elements in create_vec_from_array.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50634ad4a08e0054c778cad14a5522f5d619c0ed", + "description": "aco: Change isel inputs/outputs to a flat array.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4a1b246a41cfbc8829bb19526d2a4604ef94564", + "description": "aco: Treat outputs of the previous stage as inputs of the next stage.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1dd81ae1047304a4cfb0861cb85c69a2ae776ec", + "description": "nir: Collect if shader uses cross-invocation or indirect I/O.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7d733fdab58b7fd08aa79ef7713e7be847377f4", + "description": "aco: Use more optimal sequence at the beginning of merged shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17c779ab9e2bb9329f07299e327ac2c1c81f3cb3", + "description": "aco: Skip 2nd read of merged wave info when TCS in/out vertices are equal.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ec48440a0b9f1045d958659cda3fde3126868c0", + "description": "aco: Allow combining LDS loads when loading tess factors.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ace3833293e5aa49bb76e11aa96ad0a01e9538bf", + "description": "aco: Allow combining TCS output VMEM stores.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e2b1d749b175e6966884598c2df8c451abf8bd98", + "description": "aco: Fix handling of tess factors.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3f6adcaed9775283ce2fb63a39aaee85e1312b0", + "description": "aco: Extract tcs_driver_location_matches_api_mask to separate function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0dff5fd86179b4d265060d5fc6138bb6a50b54d", + "description": "aco: Create null exports in instruction selection instead of assembler.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87839680c0a48a007bce2aca9f056694ad8bd35d", + "description": "nir: Fix breakage of foreach_list_typed_safe assumptions in loop unrolling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "716a065ac05b2347054077aea389d3c877585b6f", + "description": "radeon: switch to 3-spaces style", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7008fe46a8f689ce4ee2b14b61dc39baebccaa8", + "description": "radeonsi: switch to 3-spaces style", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53e5e802f88134492e4c88936f2d7a961cd431a5", + "description": "radeon: fix includes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f52bbb7c026b846fae3ccecbab7d0b2693e5c45", + "description": "ddebug: add missing forward declaration", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04885d61dd228f3da6f88584d3eb1d7e4c228e98", + "description": "meson: Add VS 4624 warning exclusion to remove piles of LLVM warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5127160fb61927c40117f80c3abd291f9d3fa98a", + "description": "meson: disable some more warnings on msvc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2db1d73e5301f74c6e028a67f5b9dd767ff8cfe2", + "description": "CI: Avoid htz4 runner for VS2019", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8970b7839aebefa7207c9535ac34ab4e8cc0ae25", + "description": "intel: drop unused include directories", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "231273d588a84436a96cc9c75ecf1858ca15c30c", + "description": "vulkan: drop unused include directories", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79af30768d6cf8e28c7cf49f99fff6c2b2cb030b", + "description": "meson: inline `inc_common`", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a32dda8e6f5780ed5e0927486f5eb4971d0162b", + "description": "meson: use existing variables in inc_common", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7df75203052fa8a8f2fbd603a4c7553752c57b0f", + "description": "mesa: Change _mesa_exec_malloc argument type.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e5339fe4a47c242693962c9f90bbab8b74935cba" + }, + { + "sha": "fcd3377cfe23e419b9235424cef9db4792fac80b", + "description": "gitlab-ci: Update to current templates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "447890ad64cb64bacafce8402e013b81e09359fe", + "description": "Revert \"gitlab-ci: Disable jobs for Collabora's LAVA lab\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "1351ee03352b12690233a73e160f92da2edecf16" + }, + { + "sha": "e6097375269a4823af3088bc2487d383c90c49f1", + "description": "radeonsi/gfx10: fix descriptors and compute registers for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ef1c8d60bd5f7ee2d8bc7e878d293256b921008", + "description": "radeonsi/gfx10: fix the wave size for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4a0087a1ceba4965b6c391e425d66f887c22de8", + "description": "radeonsi/gfx10: user correct ACQUIRE_MEM packet for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acc5bdf8870e0c47c00028f9f5502de036e79d3d", + "description": "radeonsi/gfx10: fix ds.ordered.add intrinsic for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee4d797d8b074c27de0e055d2e22b82f642f2359", + "description": "radeonsi/gfx10: don't use NGG culling if compute-based culling is used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65e9239977963c5caaef12cfd6b6c6e285f86381", + "description": "radeonsi: add num_vbos_in_user_sgprs into the shader cache key", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be9455bdf767dd851c56075e6dd84b27e95dcfc2", + "description": "radeonsi: always create wait_mem_scratch for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42ce52b904323b1a165cfbc568f708ce7f39fe45", + "description": "radeonsi: set amdgpu-gds-size for mode == 2 of compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3381f2fa06be807c910ddb41aa239606419841d8", + "description": "radeonsi: fix incorrect ordered_wave_id initilization for compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d89b19cfe1bd8bd6f259b6a4d2070e8d08f3b67e", + "description": "radeonsi: remove obsolete TODO comment related to compute-based culling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d45ffbfb6c4db9962f705cddf90acee59f4f24b", + "description": "lima: Implement lima_texture_subdata", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a10397a01248140de1aa92ce826dee88d445a1a", + "description": "gitlab-ci: disable vs2019 build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7d53275fb7e48481de00adfaff16ae8d333dd14", + "description": "freedreno/ir3/ra: re-work a6xx merged register file conflicts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "faf276b4c85f807b4d57cd17a92ebcb421e99ea9", + "description": "freedreno/ir3/ra: split building regs/classes and conflicts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90f7d12236c5250bc56699a3071941daef0f515a", + "description": "freedreno/ir3/ra: pick higher numbered scalars in first pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1da90ca9bf9fc5b317e0b71f3f77f0bacd725969", + "description": "freedreno/ir3/ra: compute register target from liveranges", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2cc92c747c20613190ba330f22f219d26422cfa", + "description": "freedreno/ir3/ra: fix array liveranges", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1b658533e1109b8ff9a6578a7eb4b390454e7876" + }, + { + "sha": "6347c2ea89bde624dd16cff6741db57e89d88ad5", + "description": "freedreno/ir3/ra: add def/use iterators", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf0aa7ed90231540c66328a515928dd8e3324343", + "description": "freedreno/ir3/ra: drop extending output live-ranges", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e7d24b532e1a52cb5b3da36d1ed5b2204fb0acb", + "description": "freedreno/ir3/ra: add helper to map name to array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d99d358389ca95ce23cfccf67150cccc66ff6407", + "description": "freedreno/ir3/ra: fix target register calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d20a06e40199f4082cea73a3636b87823c76ed2b", + "description": "freedreno/ir3/ra: add helper to map name to instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29992a039e9959110139353664b7eb12d991e8d0", + "description": "freedreno/ir3/ra: split-up", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6da53911c15a33bf73fb1423b3e99affaceb0f75", + "description": "freedreno/ir3/ra: add debug option for RA debug msgs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "142f2d45516132dfe577815859179f661828c32b", + "description": "freedreno/ir3: convert debug bitfield to BITFIELD_BIT()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d0905582a3d75882e1fd3846a500934045aa622", + "description": "freedreno/ir3: reformat disasm output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afdb8e390787b8199a554a0fe688cc1150e4c58f", + "description": "freedreno/ir3: fix bogus register footprint with tess/gs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b4b455739dec43fea669509dc8585f6fbaa0487", + "description": "freedreno/ir3: remove unused helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6a879275345c8c0b4885a68cfa8d72c2193fb8a", + "description": "freedreno/ir3: add bary_ij as src for meta:tex_prefetch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0de0db0e44c4a7096f57b6c242c7ec139987aa5", + "description": "freedreno/ir3: small cleanup and comments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d9a794f356beb73f08278df06fa1ef5670d012c", + "description": "freedreno/a6xx: register update", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46a32f0b6bf91279d001a4905babe4e50007696e", + "description": "CI: Disable Panfrost Mali-T820 jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "871bd2819d51bf9720d2ff57522ec31f254431a5", + "description": "util: remove duplicated MALLOC_STRUCT and CALLOC_STRUCT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "71646745002504302bcef51bdb438abbe66961fc", + "description": "util: don't include p_defines.h and u_pointer.h from gallium", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "013b65635f1db45650809d8d802436c1e9d009e3", + "description": "radv: stop including files from mesa/main", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76f79db3f5d8492370c92080b5bbea7e31827b75", + "description": "util: stop including files from mesa/main", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c42fa40a51efcf877915689bf170c67fff7e5600", + "description": "mesa: don't use <> for including internal headers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5339fe4a47c242693962c9f90bbab8b74935cba", + "description": "Move compiler.h and imports.h/c from src/mesa/main into src/util", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6cfe074b8617f92b4538f3006b0c4e55d6681436", + "description": "wgl: use gldrv.h instead of stw_icd.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec201692645e1d7900d2500682e998a635eeb638", + "description": "wgl: add official gldrv.h header-file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9091f1f24394499100b475e232f6e6c54d40650", + "description": "nv50, nvc0: fix must_check warning of util_dynarray_resize_bytes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4a4d4607e33695faf17b98f54535df90289657b", + "description": "nv50: remove unused variable", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c574cda3c6a3f880f99e4e22967fc82e34609942" + }, + { + "sha": "aad0e6f81049c098fd3922d61aa228e4bf791317", + "description": "intel/perf: store the probed i915-perf version", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e7202d45f6da94dcfdf2b8975a2d5a45734a14c", + "description": "intel/perf: document meaning of query field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dde96d31b75f7895900405ab771fb3dd0dd78069", + "description": "intel/perf: move mdapi query definitions to their own file", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33b9c7a7f68bb5a4362751ba7daf1ba2b10ece95", + "description": "intel/perf: break GL query stuff away", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5c5574f427c710fa2ed7413dc970ccb649b16d7", + "description": "intel/perf: move register definition to special file", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9d2b5dcecc278055b0687f588255c7441a9a668", + "description": "gitlab-ci/traces: Add D3D11 sample entry for POLARIS10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07e5b3ad50118fac990fd56a08e5e5bcade5be44", + "description": "gitlab-ci: add Wine and DXVK env variables to Vulkan's tracie runner", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6bae042b3da4d962cd19137ddcde110ff6a88359", + "description": "gitlab-ci: replay apitrace traces in headless mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f4acd465edc1360a1d5ea2646379bd5db3e1598", + "description": "gitlab-ci: add apitrace's DXGI traces support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb8fa83a30a1ec66982854da0a8d7870cf1d2f93", + "description": "gitlab-ci: add Wine, win64's apitrace and DXVK to the Vulkan testing container", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05a3b49308606fbf8c30688ece3177ad6eb17515", + "description": "gitlab-ci: Don't use buster-backports packages by default for x86_test-vk", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a8876b025b7c9aa3ec8283f31e10b835c165980", + "description": "CI: Windows: Fix Docker tag argument inversion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07885cbcdb0b19265379c3941600faadc8a22d71", + "description": "CI: Add native Windows VS2019 build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bc98de4d14f9c099e47a7de6efc3766823ca3f54", + "description": "util/test: Use MAX_PATH on Windows", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f8f1413070ae079443ab31a75679cfd10cb756ed" + }, + { + "sha": "8f573bdaaa7c41b19edf99e891665378b76d8fd4", + "description": "util: fix process_test path", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f8f1413070ae079443ab31a75679cfd10cb756ed" + }, + { + "sha": "1351ee03352b12690233a73e160f92da2edecf16", + "description": "gitlab-ci: Disable jobs for Collabora's LAVA lab", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5e00f5c2bd35920a1aaf9bb676c784d5d18b6fd", + "description": "nir: fix packing of TCS varyings not read by the TES", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "26aa460940f6222565ad5eb40a21c2377c59c3a6" + }, + { + "sha": "8b9ebbcb546816f525298dc24711c1922751e312", + "description": "glsl: fix varying packing for 64bit integers", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba2ec1f369d2c97fc7c54ecd52b0addcfd349a31", + "description": "ac/nir: use llvm.amdgcn.rcp in ac_build_fdiv()", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d548384fc686f4e9cc9e6551f9a582cc740f3233", + "description": "ac/nir: use llvm.amdgcn.rsq for nir_op_frsq", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66426ce119b3b647f9ace62b74f18342cacd43a4", + "description": "ac/nir: use llvm.amdgcn.rcp for nir_op_frcp", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e352e7e792699661422218c1dc8ad06b4bbf6652", + "description": "x86: Add ENDBR at function entries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9899a8e26c5c9063c3627e246981d727321e5ba3", + "description": "mesa: try to fix the android build", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "8a3e2cd9b26beb4d57ea417c4528e22c00c92126" + }, + { + "sha": "36c155a0178d6f05d65a9acfe5b7553d7a522f07", + "description": "intel/fs/gen12: Fix interaction of SWSB dependency combination with EU fusion workaround.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e14529ff3262a527d630cecac655f69c8ae15c3f" + }, + { + "sha": "007e623025bfc1f66686d1e1cd6a3efeff863f69", + "description": "x86_init_func_common: Add ENDBR at function entry", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d0599b1b46968dda34953118e92d8720875162c", + "description": "intel/aub_viewer: Fix format specifier for uint64_t", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b9f1b6ef755a07abcd396b42948ae6bf0a569a6", + "description": "panfrost: Extend the tiled store fast-path to loads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dac1573a3586565b8b78bd6aab3664921cc1adb1", + "description": "mesa/format_utils: Add a fast-path for RGBA to BGRA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0847fe6e7fa6ee07420c6eed95bfee036748bc6a", + "description": "glsl: set error_emitted true if type not ok for assignment", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d1fa69ed61d5aebeb69ce8a415c098035a953d48" + }, + { + "sha": "05069e1f0794aadd40ce9269f858e50c64254388", + "description": "gitlab-ci: Fix traces caching in tracie", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efdce97e4bb0a4b2e5d81d2f9276f3eedd9561b6", + "description": "vtn/opencl: add rint-support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d69ed88f875b1ed0cc0def96067fc2a92f4d0ed", + "description": "vtn/opencl: add native exp2/log2-support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b2bfb6bc4e354cfaf3f9f2774906c54c03cff0e", + "description": "vtn/opencl: add native exp10/log10-support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25cb87bcdd558747f3dd06be033c2a93ab634d53", + "description": "vtn/opencl: add native exp/log-support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c98e745e787f5df9ad45db6472a37d51a03b5fb8", + "description": "compiler/nir: move build_log helper into builtin-builder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f59ae6883853fd7c7ee0965111fc19d20f6acfd2", + "description": "compiler/nir: move build_exp helper into builtin-builder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4821ec6d8fcd0287ee9ea5afdd922da5ab787900", + "description": "vtn/opencl: fully enable OpenCLstd_Clz", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7325f6ac987d295b101372bffcb98799251fe678" + }, + { + "sha": "51831537a2af75adc7d1611e4a7d6b02706eb32b", + "description": "gitlab-ci: re-enable mali400/450 and t820 jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "842f13d8f85c16fec350418e02dc593fbbc156f4", + "description": "gitlab-ci: add FILES_HOST_URL and move FILES_HOST_NAME into jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1238498805bf600292f4663fc996e0396436435", + "description": "gitlab-ci: Serve files for LAVA via separate service", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92f3c51560f9eb2387b1d929f694244c0b7bd577", + "description": "gitlab-ci: Place files from the Mesa repo into the build tarball", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b94c277fd1e7e8008ccc37aa2f4dd547ef92688b", + "description": "radeonsi: enable full out-of-order drawing when allow_draw_out_of_order is set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c053e5faded7b57fdd117ed86d572e0104c06bf", + "description": "mesa: allow out-of-order drawing to optimize immediate mode if it's safe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c6a667d9394c7bf0f5adef4320fd912653950af", + "description": "glsl_to_tgsi: set shader_info::writes_memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85a723975bbd651fa6692655b9a4f14b4405d0ae", + "description": "nir: add and gather shader_info::writes_memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d269fb33b0a88045c42ff0733304ddc1e0d8f6c5", + "description": "radeonsi: Stop exposing PIPE_SHADER_CAP_FP16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "603f38f1715b5dd88c8ef8bbd1192a5e3ce8a7c5", + "description": "util/u_process: Add util_get_process_exec_path for macOS.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f8f1413070ae079443ab31a75679cfd10cb756ed" + }, + { + "sha": "8cdace95acdf83bdab3d1f1a55e77aec1dfdb39e", + "description": "freedreno: ssbo: mark resource read or written depending on usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "061b262a0c2482e10286c4ecc96a81b47546100a", + "description": "freedreno: ssbo: keep track if a buffer gets written", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ed053f03d936d301091dd68e96ced8dbedc83b2", + "description": "freedreno: simplify fd_set_shader_buffers(..)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3340cbd398bb5a74287e794277d2423d11bbbc52", + "description": "freedreno: calculate modified bit mask only once", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3cbcb1b73e5f764ed87fdcd1dea8a921e73bfd82", + "description": "gallium/util: Add back (and rename) util_float_to_half implementation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e78f17b74a862e34891901cde8292f91adeb655", + "description": "etnaviv: Emit PE.ALPHA_COLOR_EXT* on GPUs with half-float support", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "76adf041f25defad204abea1ed49b82fd9c264d1" + }, + { + "sha": "4897e70ccd3987d470ec8622d473ee3405f6e96f", + "description": "gallivm: disable rgtc/latc SNORM accellerated fetches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c30b9d9878b14098d01a4bae5d51f1392e3baff", + "description": "rbug: do not return void-value", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "fb04e5da97d904ab1dc7e0182bcba77071bbe340" + }, + { + "sha": "411d7429c942bc878675ea390c9a5f1eff86ddc6", + "description": "rbug: clean up cast-warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "079cb4949dd3199ea5693cc0c6ac4c3d838ee022", + "description": "pipebuffer: clean up cast-warnings", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1a66ead1c75246224bf43e82a07b4fdb2891959a" + }, + { + "sha": "12711939320e4fcd3a0d86af22da1042ad92035f", + "description": "vulkan/overlay: Add a workaround semaphore for application presenting without one", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5533c41541f57774314517d893045eedfc5b2da1", + "description": "ac: fix ac_build_is_helper_invocation when postponed_kill is null", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "de57ea2a3da2188e1c1d9fb043028c5f823cc755" + }, + { + "sha": "84da4ded4b90d0d13e3d89d97160eead9d5c8886", + "description": "nir: update uses_demote flag in discard_to_demote pass", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ce87da71e93d9eea7e9a2667e3273cab9c97667f" + }, + { + "sha": "fc8432e6d6c1f76621e202c773a590fa99ded730", + "description": "glsl/lower_precision: Lower builtins depending on arguments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7434c0a0608383c13514210e805d15678af5722", + "description": "glsl: Inline builtins in a separate pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ee2ad584c95233b5cdbbed9fa5997533dc80276", + "description": "freedreno/ir3: enable nir_opt_loop_unroll on a6xx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61f7a1dfc566508d505230562a99099ee9c3f70f", + "description": "freedreno/ir3: Lower bools to bitsize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "467c9a0faa0a27a14bbf5fb42bbf806382d2d3f0", + "description": "nir: add a bool bitsize lowering pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75674ed4d4ec868fe116df84be1366b0fcd6c942", + "description": "freedreno: Enable mediump lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc09745714d7c698b6adc48ed63ab6f506603088", + "description": "glsl: Add unit tests for the lower_precision pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32cd3bd85014a6fbbabc3a26a16c05e8e98a8485", + "description": "glsl/standalone: Add an option to lower the precision", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b83f4b9fa23dc3d94ebb885897c19590c750cb83", + "description": "glsl: Add an IR lowering pass to convert mediump operations to 16-bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c525785edc33d36bbb906d3004be213d25b9467b", + "description": "glsl/hierarchical_visitor: Call leave_callback on leaf nodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e1680a1e2aa67b3cb132bdd4f615694ff9454af", + "description": "glsl: Add a method to get precision from a deref instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba56684a14101820a8f8d6ebf8682e12a383288d", + "description": "i965/iris: fix crash when calling GetPerfQueryDataINTEL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a3e2cd9b26beb4d57ea417c4528e22c00c92126", + "description": "glthread: compile marshal_generated.c faster by breaking it up into 8 files", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cadddbd26932a2eb4f8376e748c1cc27741afc33", + "description": "glthread: declare marshal and unmarshal functions as non-static", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03da51eb07552fdaa2431de63235dbcf7a616ad7", + "description": "glthread: inline SET_func and add -O1 to build _mesa_create_marshal_table faster", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "238e2ed2100d4d364fefa23bac058100704c0a44", + "description": "radv: enable VK_KHR_8bit_storage on GFX6-GFX7", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd22a0f710ca48a49948a6463228c0d01d2fa74f", + "description": "util/u_process: fix Windows build", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f8f1413070ae079443ab31a75679cfd10cb756ed" + }, + { + "sha": "6a4fadce129efa5cc13bffc4f0c207ff23830792", + "description": "pan/bi: Rewrite aligned vectors as well", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a3493c536b174030d0c62e0196955d88c74066a", + "description": "pan/bi: Lower combines to rewrites for scalars", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0a51d5308f3a9c6030c4ebc42be6be5c4b9e46a", + "description": "pan/bi: Ingest vecN directly (again)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04509dae7f7ec4d643c75ab57d2d658b928b323f", + "description": "turnip: implement depth clamp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afe27d5345f3addf770a2dee3f74c42f9ab93ae9", + "description": "turnip: fix znear clipping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07a8100aed64c64dee5966c5560f299d866bd0c9", + "description": "freedreno/registers: more GRAS_CL_CNTL bits, Z_CLAMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43918c9a7fc76b56a521d5eea6a8d2b3fb675a15", + "description": "aco: implement 64-bit VGPR constant copies in handle_operands()", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21ba2bc595402180fa52d793bb1e524663788338", + "description": "aco: remove dead code in handle_operands()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f4ba2d2b4a577a7cf6b38e835f0d1f39965bf08", + "description": "nir/gather_info: fix per-vertex handling in try_mask_partial_io", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1cc13727c6875ee5aae7656652dda231267cccf", + "description": "radeonsi: enable workarounds for YoYo engine based games", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f48e7b1e991375f6b03fac09a4f4416eb2f1afc", + "description": "util/xmlconfig: add new sha1 application attribute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8f1413070ae079443ab31a75679cfd10cb756ed", + "description": "util/u_process: add util_get_process_exec_path", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cb965e5b60dbcd767da42360a5e18acd8803f5d", + "description": "util/os_file: extend os_read_file to return the file size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd6234f24be024556a4b83e879bb65b89fea7a12", + "description": "radeonsi: clarify the conditions when FLUSH_AND_INV_DB is needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67a10ea21596b2dff3ea2dc40713e59784e02ef2", + "description": "intel/dump_gpu: Handle a bunch of getparam in the no-HW case", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7fd4184378268f50ff737851a88c0e836367199d", + "description": "intel/dump_gpu: Add an ensure_device_info helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be451f71ab37e6bf1bf2bc24580de202783b4331", + "description": "anv: Stop fetching the timestamp frequency ourselves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d63d0006860ac079f411c1f0d81101741a22af10", + "description": "egl/android: enable/disable KHR_partial_update correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "41412cc4b74e0b4b16f09df8c716adc57df851d7", + "description": "ci: Ban the recent popular freedreno a630 intermittent failure.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "719063d4d07a8f2ab2f256ea697083a8eb07e32a", + "description": "st/mesa: fix use of uninitialized memory due to st_nir_lower_builtin", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17c7f4e30ed8f7a04ae3ad80e39cfbdf8d8ea46c", + "description": "aco: fix boolean undef regclass", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ed12efb58ce194ba6e50e29d6780a5143ed66cb", + "description": "lima: Add missing source file to Android.mk", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1182a3934a5d935b792b801656161da338962004", + "description": "intel/tools/aubinator_error_decode: Decode ring buffers from HEAD to TAIL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84e707e6f2292dd886b8385fab0ced5122e35876", + "description": "docs/features: Update virgl OpenGL 4.5 features GL_ARB_clip_control and GL_KHR_robustness are now expose in the guest.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "49f9a0bb57219d0fa57dd10dfd69e64f6356c9df", + "description": "intel/tools/aubinator_error_decode: read HW Context before other batches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c40acdef52b1da97c90f5cef046ae881511f7d25", + "description": "iris: Set patch count threshold in 3DSTATE_HS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "60c789543e3738f3a39897758d7507da8c044d78", + "description": "anv: Set patch count threshold in 3DSTATE_HS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a5ac646cefaa183ee09b149ea31931d122c0f51", + "description": "intel/compiler: Track patch count threshold", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3dd54fe13b52f3e9a7265ba047135e823c476c7", + "description": "intel/genxml: Add patch count threshold field on gen12", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "39ac87bf500276125407f0e916ad639f938103bd", + "description": "gitlab-ci/traces: Add Vulkan sample entries for POLARIS10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6bca192e12a565b20cee601f51875064a5458d1a", + "description": "gitlab: add bug report template", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d56ed199b911c1085ea558d243ab543af47ac8e", + "description": "aco: emit IR in IF's merge block instead if the other side ends in a jump", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d8c864beba399ae4ee2267f680d1f600ad32767", + "description": "aco: improve check for unreachable loop continue blocks", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46e94fd854e8f209ae662826e1794de4c5da2b80", + "description": "aco: skip NIR in unreachable merge blocks", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "638cbc21a1c01c87f620edc820e913e48aba2287", + "description": "aco: handle when ACO adds new continue edges", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2c4878de9f2acfd7b23ed2deea1af094b781c7d", + "description": "aco: handle missing second predecessors at merge block phis", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1a2e1df7882e9d3816f28d6a0827d4ac66ac8f6", + "description": "aco: set has_divergent_branch for discards in loops", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8bc3d6574c4b0acf9e9b50f6a63648f149c317c3", + "description": "gitlab-ci: add python3-requests to the test-vk container", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "90a39af5f65e5fa01beeec526594f7e04143e7cf" + }, + { + "sha": "7ac8bb33cd6025f805a390e7647506e932f4db0d", + "description": "radv/llvm: fix subgroup shuffle for chips without bpermute", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a70a1d69d3151e6c95111a297e715e887692ce3", + "description": "panfrost: Align Android makefiles with recent changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c8ccbe41b65f21622e10f4de54a6a19dc7d9afa", + "description": "gitlab-ci: add a bunch of new fossils from the Sascha Vulkan demos", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48e920315cc37d3572e49b94605d41159faeb693", + "description": "gitlab-ci: add a new stage for RADV CI", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e22d562c176595181509873680dd0463ad428cf8", + "description": "gitlab-ci: compile fossils with more ASICs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1517e58c1b45f5220b1f637fe4dc209768fce8b2", + "description": "gitlab-ci: compile fossils with both RADV compiler backends (LLVM/ACO)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b3b07afc0b97ecff0431486ca57031150985268", + "description": "gallium/gallivm: Remove workaround disabling AVX code for newer CPUs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de550805c5d96b17e7b7db4a0c62b29db354fd74", + "description": "radv/winsys: spoof some values for num_render_backends in the null winsys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b911af06cd68d09b3813eab672e840ec4e76153b", + "description": "radv/winsys: fix wrong PCI ID for Vega10 in the null winsys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "050ec8ff5344b730c521d07d55aa146eb36185ad", + "description": "glsl: Restore the IsES flag on the shader when reading from cache.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e3efa429442ef4529843b9e7267d0d117ffa15b", + "description": "gallivm: add support for rgtc/latc fetches.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3894e52c2f82112659725944117a59b37cbaa53", + "description": "gallivm/s3tc: split out dxt5 alpha code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f02ae6986740813bf79bde8aad1a0ac5dd7e4d7c", + "description": "intel: Add TGL PCI ID", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c6ef0165f03a8e8c20a2c33a78584166a73487c", + "description": "intel: Update TGL PCI strings", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d9d549ff8837b488f76981f23fa56c42164ee683", + "description": "pan/bi: Pack csel4 opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cdc31abd63302e3da82a1bfee625019e818fc3f", + "description": "pan/bi: Default csel to \"!= 0\" mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46f526eb1e4a3ca2d7f04c50f61523a680c383a4", + "description": "pan/bi: Use bi_lookup_immediate when packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11bccb0564d9e24e50238fb257dd6f724ec31712", + "description": "pan/bi: Respect shift when printing immediates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f786ed10b14ca054e299679af2bfbe8a2dcd5c3", + "description": "pan/bi: Implement csel fusing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a02c871f2367abf7d87569819d7ae4ebb1336d4", + "description": "pan/bi: Add `soft` NIR->BIR condition translation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd7fec782edd3c6d2e154994c15ceee65c3c0dc9", + "description": "pan/bi: Remove hacks for 1-bit booleans in IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12299dead7ee589ee4a84af6058762381ef44c2c", + "description": "pan/bi: Lower bool to ints", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1097c69087d0a9a0ce3548550232f6475d18ac43", + "description": "pan/bi: Pack LD_ATTR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0be1116b818edd56351d0415172015771eea1f44", + "description": "pan/bi: Pack st_vary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9213b2520cdafefbb83f8f495281b0db419f85a8", + "description": "pan/bi: Add store_channels property", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c57ac9d1368e83ce001be4439c73473529e48135", + "description": "pan/bi: Generalize data register setting", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9458b017a946778ef5d065bfd61c47dafdfe3e94", + "description": "pan/bi: Flesh out st_vary IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "409e4f8a49094a60354656a40cd23e38362e9a67", + "description": "pan/bi: Pack ld_var_addr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7321a17c6abcd23a281cc4209562f919b61e7cc5", + "description": "pan/bi: Pack ld_ubo ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "908341ea3fecbb80f070e31e1368d01194a05df4", + "description": "pan/bi: Add bi_load32_components helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8bb16138b658ea7eb5dfaf023463ae78f173de0e", + "description": "pan/bi: Include UBO index for sysval reads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc0b49bb2cae9a0c8074faff680ddc91c8dd4bfe", + "description": "pan/bi: Index out constants in instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2d0de962ed385fd4dc71bf6d142c233b6d0998b", + "description": "pan/bi: Document constant related errata(?)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb590a98d2bc29e6b3fb0792d804d76904af6603", + "description": "pan/bi: Pack a constant quadword", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50d3f4df452d870858ed5165eb917921273f241f", + "description": "pan/bi: Add move lowering pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58a51c49bbf48e92a78355401a07fd3870c1746c", + "description": "pan/bi: Add bi_emit_before helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b7077efda9a8b518c7f55f497504a031c623e54", + "description": "pan/bi: Implement FMA/MOV without modifiers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8bbf44ca4d32889232ced844a1b939b8a86f727", + "description": "etnaviv: nir: add compile_check_limits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "303842b2dbf30e7dd1a4cd463e76aecf81adebb8", + "description": "ac: fix fast division", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55b0a676fdb538095b8d7c6e93a92d702534df39", + "description": "turnip: Instance can be NULL resolving 'GetInstanceProcAddr' entry point", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cc3ab0ba0eed6e730eac869953c052f8b1e9ec2", + "description": "vbo,gallium: make glBegin/End buffer size configurable by drivers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11d3aa5e7bc7dc60f18e43adf35d43082feb759e", + "description": "glthread: remove the marshal_fail XML attribute", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c02a1347e5c68d7f1c68ca6b90d2736e35b1fde5", + "description": "glthread: ignore vertex arrays with user pointers if they're disabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b1dd1859134e71b25ad1124535df96d435e9766", + "description": "glthread: track which vertex array attribs are enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c571dda1e0929e1e8ff1686994df6601f34c7bf8", + "description": "glthread: rename non_vbo helper functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bde4505f61e2964b16b04faadf4062a59e471bfd", + "description": "glthread: handle buffer unbinding via glDeleteBuffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "15b0719ae2d5cc80c063b1748443392f701bcdce", + "description": "mesa: put gl_thread_state inside gl_context to remove pointer indirection", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a4114b9294c8e8f5bb977be47cc7764c9cdf490", + "description": "glthread: rename marshal.h/c to glthread_marshal.h and glthread_shaderobj.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df74163995971607861fa0de06bd3d0f2024e9a0", + "description": "glthread: move buffer functions into glthread_bufferobj.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37725e6c389a1135b288373a4d589806c98af291", + "description": "glthread: autogenerate prototypes for custom-marshalled functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ded23a4add49c887f764c221f1aab5e0019cee2", + "description": "glthread: simplify printing safe_mul in gl_marshal.py", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "01a50e2493dec462b75e827fb09a815a67f027a0", + "description": "glthread: remove _mesa_post_marshal_hook, because it's not very useful", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aee004a7c8900938d1c17f0ac299d40001b383b0", + "description": "util/sparse_array: Stash the node level in the node pointer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6be65b077743fc80efe061b1e05cb13b2ff1a6b1", + "description": "meson,ci: Disable sparse_array tests on windows", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9fcd8bdbfcb556b72378ca6432ddf681b78f5a53", + "description": "util/sparse_array: Add a node_size_log2 temporary", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7893872a6c46a65d2f12bc0ae6bc4bd61fd3f8a1", + "description": "util/sparse_array: Finish the sparse_array in the tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8edaa843ab0f453300b981fd8f3d40b4984a75f2", + "description": "ci: Move db820c and db410c's gles3 tests to manual, like radv did.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "866a8da2a46d1e8722398450384df798619a3ca9", + "description": "tgsi/util: Change boolean for bool", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24e82e453370e4105d73fcdf14cfb2f8922ddb9a", + "description": "util/blob: Add overwrite function for uint8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b49534df2197c59880ee703ff4dd813bc5f5231", + "description": "lima: add support for R and RG formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e763c6778ced36ba5f513391fc26952aab05d8af", + "description": "lima: split pixel and texel format tables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4b0e28f62421d0fb5a5bcb19038b4f6fec622c5", + "description": "zink/spirv: do not use bitwise operations on booleans", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "130c0ba1cc1b800641ed09fe7842c7ef4bce2dfb", + "description": "gitlab-ci: Restrict s390x/ppc64el jobs to packet runners", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "500842399a350481d99c691c1053a6adab095bc8", + "description": "radv/winsys: set has_syncobj_wait_for_submit in the null winsys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58deebe547014e64d8db3f8cc5e963efe7e0f743", + "description": "intel: add new TGL pci ids", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d3223ca90ae946231c1bfbfd1b450e5e96106a3", + "description": "radv: fix optional pSizes parameter when binding streamout buffers", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b4eb029062a944c428d6214447a852318e36016e" + }, + { + "sha": "fdc603292862dd2663b75d18e9abc6096b8020ff", + "description": "mesa/main: Fix overflow in validation of DispatchComputeGroupSizeARB", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "45ab63c0cb274b20a7ae1f390b123e13a5b46c98" + }, + { + "sha": "4ac1d3cc45121b88708ab7bfd8f3e12389a6cdfd", + "description": "driconf: enable glthread for \"From The Depths\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a59d6eaa210dc22dc07de79ed7d8712b59e4042", + "description": "winsys/radeon: change to 3-space indentation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b13d5265cce6b9879af197b629c188577ae0be2a", + "description": "glthread: don't declare unmarshal functions as inline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efaeac9e847a8234b1ea1cf32304c91f92b840a3", + "description": "glthread: clean up debug_print_sync code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b00d219ec0da21c1bab89dc36aa20c9138b92226", + "description": "glthread: remove debug_print_marshal function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "951c6acb074b18da0e595825fe05c77a0fbee96d", + "description": "glthread: don't execute any custom VAO and BindBuffer code in the Core profile", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87f6be4456f3def3ccf6578714a993374ce98e5d", + "description": "glthread: track VAOs created by CreateVertexArrays", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "720f34d5ebd29fe8d8ffaa9098c5100346256418", + "description": "glthread: enable display lists", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4dcdf974f8df23bc0a5e284759a9379ec95192a9", + "description": "glthread: align the batch buffer to 8 bytes for pointers and doubles again", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff0881c686cc250d492d38fd14063e4b18c951c1", + "description": "mesa: remove redundant api_loopback functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "98d11972334186b1b5ede148e32ac7758be56ce4", + "description": "mesa: use vbo_attrib_tmp.h to generate display list vertex attrib functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3252041a7872c49e53bb02ffe8b079b5fc43f15e", + "description": "anv: Only add END_OF_PIPE_SYNC if we actually have AUX_INVAL", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "43dc842cb91c195fe7bb47a7ce324425096bf6f5" + }, + { + "sha": "5b57aa79e2bd244079639bcc696251ce0f7af7c7", + "description": "freedreno: Switch to exposing only half-integer pixel centers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c8ba96a54feaf9bb783bb165bce91ee0c3253f9", + "description": "r600: Fix build error in sfn_nir_lower_fs_out_to_vector.cpp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0df48e5d1f09c81bdbc9cc501c5a382c9175da33", + "description": "vc4_bufmgr: Remove duplicative VC definition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3bbe1fa65ede1e2504510d15b1c439fb81328fa", + "description": "etnaviv: Avoid shift overflow", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "511c6408f48097bb4c04087931e3712ade662525", + "description": "Android.mk: Tweak MESA_ENABLE_LLVM checks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9dbff6f6ce0dea622f98c9d14336148e9afc19ae", + "description": "intel/iris: Always initialize CCS to 0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "507abc395940debf59df19d51fdff071cdb9f6ed", + "description": "isl: drop min row pitch alignment when set by the driver", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a3f6db2c4e927be7e7d40cbc39c8664030d2af59" + }, + { + "sha": "def3470e9bc1c3c4d93b21cf15b7105e4f553dab", + "description": "isl: only apply main surface ccs pitch constraint with CCS", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a3f6db2c4e927be7e7d40cbc39c8664030d2af59" + }, + { + "sha": "dab0aadea9494ebf19a0c3e23a38bd01c857b49c", + "description": "isl: properly filter supported display modifiers on Gen9+", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "157a3cf3ecb6917c26508c5bf641e1b8c58e6228", + "description": "isl: implement linear tiling row pitch requirement for display", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f778c48869fb52c6afc757b307d95376aaabcf50", + "description": "ci: Only run the freedreno baremetal tests when freedreno/core changes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7524717ba291a0df49e802c4fb690f40f7cf1f6c", + "description": "docs/release-calendar: Add calendar for 20.1 Release candidates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf62c2b2ac69637785f55b790fdd601c17e7e9d5", + "description": "radv: call nir_shader_gather_info again", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5193688e1ac696928109ade1b0eb901a91607436", + "description": "nir/gather_info: handle emit_vertex_with_counter", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36ec3cbcf88e9dc4898bbe2319cc4a5a71ba72e1", + "description": "gallium/swr: spin-lock performance improvement", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db5cc6a7ddeddbeb1e360156db520f55a5852b99", + "description": "radeonsi: enable glsl_zero_init for Curse of the Dead Gods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c03718fd7813b9023c286101d4f972aa3390de9", + "description": "nir: fix clip/cull_distance_array_size in nir_lower_clip_cull_distance_arrays", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73812999d92ed9812993f22a8807895d670fa4b8", + "description": "pan/bi: Pack BI_BLEND", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4fb88723e74041b53d3dddda5b08f3ec94510b6", + "description": "pan/bi: Flesh out BI_BLEND", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e06426ea85fa9092e3488c9e4600181f534454b6", + "description": "pan/bi: Add ATEST packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b18d0ef7081540b6c8d60bfd4f13792878ea1b28", + "description": "pan/bi: Flesh out ATEST in IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61260819ba3f08fccf72dfe7d7498516eec413f9", + "description": "pan/bi: Track clause types during scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e323df05a906aa91edfd4895627d28b6f9a12c6d", + "description": "pan/bi: Don't hide SCHED_ADD inside HI_LATENCY", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d797822d31c1a19580de6a357f96405f04ad916a", + "description": "pan/bi: Pretty-print clause types in disassembler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42af9f47c8a91caad6803fdaccf111053e9303c4", + "description": "pan/bi: Route through clause header", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4fbf751cfb863ee4b8e7963c0c37961519da774", + "description": "pan/bi: Skip over data registers in port assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32e5a7e6e91b43105d51047cc315119928ff09ab", + "description": "pan/bi: Emit load_vary ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37f14c9e50ce144cc81bebf5124e7a9cd0ef0288", + "description": "pan/bi: Pass second src for load_vary ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "265169f48ada87fcea8e55dc4176954fb86d1153", + "description": "pan/bi: Generalize bi_get_src a bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c0e786084f865d27b7be9d834855555fb0f049f", + "description": "pan/bi: List ADD classes in bi_pack_add", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6069904bbd46592d13a87520dc256c6006b12c50", + "description": "pan/bi: Pack fadd32", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2afcc6101047b8cfcd5fac3f144e1f3325e6207", + "description": "pan/bi: Pack BI_FMA ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8a3bf3f1a1f13a6f14f849c5cdcdd1874566f88d", + "description": "pan/bi: Add struct bifrost_fma_fma", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd40e189b6769f3cfb18557e3715a3289bebc13c", + "description": "pan/bi: Model 3-bit Bifrost srcs in IR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe379776c7d7eca1bbb26af070710a1a2224b0ff", + "description": "pan/bi: Route through first_instruction field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90ca6a9a6b02087b97c12a6feb68381b41fa89d7", + "description": "pan/bi: Assign registers to ports", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff39f57a48509f8a73655b6f4794cc5b73e4965c", + "description": "pan/bi: Add missing __attribute__((packed))", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9080ea8b57817e385d157c623af1bde87841c304", + "description": "pan/bi: Pack register fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03a271bf15bd4aff587408be09066d2670ae47f8", + "description": "pan/bi: Add packing for register control field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50bce53cd0c44db531b37cb37426e3b087c788da", + "description": "pan/bi: Sketch out instruction word packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9269c85578bd68169681efad0fb2a3563eb280ab", + "description": "pan/bi: Setup initial clause packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c5aab626bb52670267381383c823f4fb204b3d8", + "description": "docs: update calendar, add news item, and link releases notes for 20.0.2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c572fa571f7911a316ad9ef4697c65270c0d7e1", + "description": "docs/relnotes: Add sha256 sums for 20.0.2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "552078aec6bad10ad04b5817611f2ff2f50c6f40", + "description": "Docs: Add release notes for 20.0.2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3210214b67f783363c52c47a21d43a721d9388c9", + "description": "ci: Disable tests that showed intermittent fails on a530 in day 1.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "116a3ac481d4b5949027a5b0a798b0bef52e70b9", + "description": "ci: Ban the recent popular freedreno a630 flakes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56de6f698e3f164d97f132203e8159ef0b8e9bb8", + "description": "radv: remove wrong assert that checks compute subgroup size", + "nominated": true, + "nomination_type": 1, + "resolution": 2, + "master_sha": null, + "because_sha": "672d10619980687acec329742f055f7f3796c1b8" + }, + { + "sha": "46187bb54fe7a0ccfbafa09c5a168fb45da172d4", + "description": "anv: Swizzle fast-clear values", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fb8f1948167c77f05f89b3a0933cbe2a1280b8d", + "description": "intel/blorp: Add support for swizzling fast-clear colors", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf2eb3e0eee39e79f5426dfa18d9d3b7f9dfbcb2", + "description": "soft-fp64: Split a block that was missing a cast on a comparison", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f111d72596c4071ad38a2062699f17702bbd9c6d" + }, + { + "sha": "a8882132f9243e61ca5a5b5f63cbfcca1120ff90", + "description": "soft-fp64/fadd: Common code optimization for differing sign case", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d1216a039889cec8d8dbd994d4e50ed47d9692c", + "description": "soft-fp64/fadd: Move common code out of both branches of an if-statement", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16dfd06472db407aee8a9c6ec761079633c6bdec", + "description": "soft-fp64/fadd: Use absolute value of expDiff", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da3fa01891ec41ced3cbe2b63e8e5c8252e6e7ba", + "description": "soft-fp64/fadd: Rename aFrac and bFrac variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c9ff97215b0c13c82f460dcc59cb61f6b02d78c", + "description": "soft-fp64/fadd: Combine an if-statement into the preceeding else-clause", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "480565812c1472faf440b3a27864c8c34610a0f5", + "description": "soft-fp64/fadd: Reformat after previous commit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9496a67eece5717417b3b44ad1552c57b70b2897", + "description": "soft-fp64/fadd: Delete a redundant condition check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7078105592bf332e1080fbd7049c3a8dd9cde0cc", + "description": "soft-fp64/fadd: Just let the subtraction happen when the result will be zero", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cae36fa217b427f67494bc67b3ca4bd0bbae517e", + "description": "soft-fp64/fadd: Pick zero or non-zero result based on subtraction result", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70be98f17a1b8c66a92daac2a3d4eeb084bbc954", + "description": "soft-fp64/fadd: Massively split the live range of zFrac0 and zFrac1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73fa3a1ca44a5eb7bf1c4c5087fcacd912b62e65", + "description": "soft-fp64/fadd: Instead of tracking \"b < a\", track sign of the difference", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b07f542e5a4ab698becbc238ff2ccc4720418bf", + "description": "soft-fp64: Optimize __fmin64 and __fmax64 by using different evaluation order [v2]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "617a69107ee58e23ace06093bc49fa2c86b7dd4b", + "description": "soft-fp64/ffloor: Simplify the >= 0 comparison", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "abf28d6a70c3219e41c904806f77ea92d31bdb0f", + "description": "soft-fp64: Relax the way NaN is propagated", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8178fa88763a321cb5df853ee219884c2a7eedcc", + "description": "soft-fp64/fsat: Micro-optimize x >= 1 test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6f58b4709c240c7dd17c59674e0f63d70af70e5", + "description": "soft-fp64/fsat: Micro-optimize x < 0 test", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7673dcbd21150e67c5a36bdcc3eee419c025604b", + "description": "soft-fp64/fsat: Correctly handle NaN", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a42163cbbc1abe02b7db4ade74b569f455942d1a" + }, + { + "sha": "b421c0466d6ec28824b297d0545fca537c13a2b7", + "description": "soft-fp64/flt: Perform checks in a different order", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6992bf62440e847ca129ea9f79862fa5ff4c35f", + "description": "soft-fp64/fneg: Don't treat NaN specially", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de4acd8816cb02b65ade3ddafeffc194ff97f35e", + "description": "soft-fp64: Store sign value as 0 or 0x80000000", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "598e2fc6a1834ba8b1e0ee82ecf6c016f6ed7c3e", + "description": "soft-fp64: Pick a single idiom for treating sign value as a Boolean", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "325a21f5ebca90ccac9a7c3c571ed0513c4ec3d2", + "description": "soft-fp64: Simplify __countLeadingZeros32 function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "812230fd94e2661b1e69234f35f3ec0e3bcc9571", + "description": "soft-fp64: Don't open-code umulExtended", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1e0227ef14291242886be48424f723bf60bc439", + "description": "soft-fp64/b2f: Reimplement using bitwise logic ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e3d69ad07d9c439fc99b7c1b1e999ce556d260f", + "description": "nir/algebraic: Simplify a contradiction that can occur in __flt64_nonnan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0cefc5a23a62b0bcf77db469adf1d0eb9ff8165", + "description": "nir/algebraic: Constant reassociation for bitwise operations too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d36af93389e0cdaa36e8b972f328566487bd7d5", + "description": "nir/algebraic: Generalize some and-of-shift-right patterns [v2]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6d63aec18624fe4cbc2e9b06d95f858500257df", + "description": "nir/algebraic: optimize ior(ine(a, 0), ine(b, 0)) to ine(ior(a, b), 0)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88eb8f190bd69c7f057063f5d88faece59477088", + "description": "nir/algebraic: Simplify logic to detect sign of an integer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7f3a8d6959c74f63c877dd8776fe519d54f946f", + "description": "st/mesa: disallow deferred flush if there are multiple contexts", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d17b35e671ae7c6ab6b89973506d12b958d2264d" + }, + { + "sha": "6ee971c8823d23e70ba34088fa5e9fd20eba2047", + "description": "anv: Use isl_drm_modifier_get_default_aux_state()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0905d5a14a790229c49914fc45e1dcd9d5c43c1d", + "description": "intel/isl: Don't align linear images to 64K on Gen12+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94e37859a96cc56cf0c5418a5af00a3e9f5a1bf5", + "description": "radv: fix random depth range unrestricted failures due to a cache issue", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f11ea2266644a016a898744d1283d83ab63f4fb2" + }, + { + "sha": "a6625b15a466e2648a35810c64df882ea869971c", + "description": "turnip: Do gathering xfb info after nir_remove_dead_variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c11a2bc202f3fed542631024c618e7df528d9e02", + "description": "turnip: Fix wrong assignment of xfb output's offset.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "2a1d6b81ed54971d33e83b7f5545da096b13b043" + }, + { + "sha": "25a54554b319ce38dbe11f92cb2447bfb6b5b78f", + "description": "intel/decoder: don't consider header fields past dword0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c41937440276498b76c30657bc8d884ed8220db", + "description": "lima: decode depth/stencil write bits in RSW", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9205762caece0c4b9ecea3d56f72c6980935633a", + "description": "lima: implement zsbuf reload", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbceabed72977ffd49d84f926c59ff97554f349d", + "description": "lima: disable Z16 format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b8af6d398a94cb07015c695fdfdb5c157aa72cf", + "description": "gallium/util: Switch util_float_to_half to _mesa_float_to_half()'s impl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e4e2cedcf53d0f9649d51fc3acccaada96172bb", + "description": "amd/llvm: Fix divergent descriptor regressions with radeonsi.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b83c9aca4a5fd02d920c90c1799137fed52dc1d9" + }, + { + "sha": "040ce9a1b3b596d34e224cf3be42747bdadc7163", + "description": "gallium: fix build with latest meson and gcc10", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8dc5e174c7b96b6d4b5a6923068410f298167a39", + "description": "ac: don't set old denormals flags with LLVM >= 11", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63a5051ea6bf4d72a02594d21a3351e44bd70da7", + "description": "ac: set new LLVM denormal flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56cc10bd27b24d513de88bf7fa94a6c8f43e348f", + "description": "ac: unify denorm setting enforcement", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4959add2f44517b2227521af5aaf2919aaa6c3b", + "description": "gallium/u_vbuf: simplify the first if statement in u_vbuf_upload_buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99a29a20d2e7b931c5ee6478665f0784eca2c0d8", + "description": "gallium/u_threaded: don't sync the thread for all unsychronized mappings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5960dadd1f2494da6ea8fa04a46271beb66dea49", + "description": "freedreno/a5xx: Fix min-vs-mag filtering decisions on non-mipmap tex.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4bc15e78fa51e6c0df491a9fef4f99b2dfad77a9", + "description": "ci: Enable testing GLES2-3 on a530 (Dragonboard 820c).", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8997757c6abfe657a259bc5c681628e70792b67a", + "description": "ci: Enable ccaching of CMake builds as well.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba39cc5e85ef3b2c14803d21f6fe437620432227", + "description": "ci: Enable ccache in the container builds.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af7dca35602be1eda7481176cec596181c8fec41", + "description": "ci: Update the ci-templates commit.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d60375cbc2510ab7ad90b2654c0f6324468415cf", + "description": "anv: Do an end-of-pipe sync before updating AUX table entries", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3dd0d12aa5fefa94123269a541c94cdf57599e34", + "description": "intel/blorp: Plumb the stage through blorp upload_shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c35bc7e612f806d2e8a8a7126ddcdf45597806a", + "description": "zink: zero out zink_render_pass_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c923de68dd0ab10a5a5fb3196f539707d046d897", + "description": "radv/gfx10: fix required ballot size with VK_EXT_subgroup_size_control", + "nominated": true, + "nomination_type": 1, + "resolution": 3, + "master_sha": null, + "because_sha": "fb07fd4e6cb9feb8c9a812dd5f859f165f213465" + }, + { + "sha": "672d10619980687acec329742f055f7f3796c1b8", + "description": "radv/gfx10: fix required subgroup size with VK_EXT_subgroup_size_control", + "nominated": true, + "nomination_type": 1, + "resolution": 3, + "master_sha": null, + "because_sha": "fb07fd4e6cb9feb8c9a812dd5f859f165f213465" + }, + { + "sha": "46e8ba1344e840f9406537ae73c841a357278924", + "description": "radv: only inject implicit subpass dependencies if necessary", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e60de085473174e5a6b5a1e33e39006e62f5c786" + }, + { + "sha": "a0591863db891fd39bdbae199f740b0d4b5f0173", + "description": "gitlab-ci: Enable more Gallium drivers in meson-i386 job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "106bf59ca903bd58c0bd2a9c5eff6b4180df0b24", + "description": "llvmpipe: Use uintptr_t for pointer values", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "264663d55d321225a4962073ce4b7389d3d42287" + }, + { + "sha": "c56f09124b195c5cbaabdd7eadbb0523bede3abb", + "description": "gitlab-ci: Move classic driver testing to a new meson-classic job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3727ae431a132c1334884675f9c6c64ea6472b2", + "description": "gitlab-ci: Fold scons-swr job into scons job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ff437abb3f4ecaef1e3f241392b827cc3fdd202", + "description": "tu: Fix border color with compute shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32eecf58791ea3c5367e5ff4a542ae25fc3ce61d", + "description": "gitlab-ci: Don't use buster-backports packages by default for x86_build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90a39af5f65e5fa01beeec526594f7e04143e7cf", + "description": "ci: Drop the git dependency in tracie", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43873afda4f8faa2b31a2f130fab52fbc24d490f", + "description": "gitlab-ci: Use surfaceless platform also for apitrace", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ca662fb61269e3e3d36f8aab5939bc9dce14b4a", + "description": "gitlab-ci: Update renderdoc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac1dbd5ef81fe9bd389e38f22da32d940c1d9e02", + "description": "lima/gpir: fix crash in schedule_insert_ready_list()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2756b629171f61ca8e162be7b332e91a62c5c978", + "description": "lima/gpir: add better lowering for ftrunc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b7d89476f1e7d0f3b9e751887f42b750a5ec216e", + "description": "lima/gpir: kill dead writes to regs in DCE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c1bcc8555ab17a1df043ebc8c2a3ebcf6c400bc", + "description": "lima/gpir: Optimize nots created from branch lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47dacf3867194fd456f015bc2a54d2ee4af30f4c", + "description": "lima/gpir: Optimize conditional break/continue", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9fb0fda8e7bfa95686fd60ee563938b8e1196437", + "description": "lima/gpir: Make lima_gpir_node_insert_child() useful", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c3f20a25baa6f5b2baa2a6ef693e38095da5508", + "description": "panfrost: Fix gnu-empty-initializer error.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "836686daf36cd8d7c17c909d98f5c9e07549d674" + }, + { + "sha": "2d14a8f23721cba2f66ddecbece09a024dc1b45a", + "description": "aco: fix operand order for LS VGPR init bug workaround", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a952bf394609134ff96f4bebb41bd022c621bfa6" + }, + { + "sha": "ded7a8bb4625b28add06f8550526f2169045e87c", + "description": "aco: fix instruction encoding for LS VGPR init bug workaround", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a952bf394609134ff96f4bebb41bd022c621bfa6" + }, + { + "sha": "ee9e0d1ecae307fa48200d2604d3114070253299", + "description": "aco: set late kill for v_interp_p1_f32 for some APUs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1872759f55384175f8fb4277abe4bd45b85d2d9e", + "description": "aco: add a late kill flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c51348bd9b652aef65b5fd999165ecb8c388e61b", + "description": "aco: move some register demand helpers into aco_live_var_analysis.cpp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1b08b55ff461677f05e827ebeab02918096ba0a", + "description": "radv/sqtt: handle thread trace capture in sqtt_QueuePresentKHR()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4061ac859d0b612a801ae9047afbd352fb74e006", + "description": "anv: Push UBO ranges relative to the start of the binding", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e03f9652801ad7f70091e084535a3fb6650c3acd" + }, + { + "sha": "ae15b4fd733597880fee5357fb2d1f2f100b30c1", + "description": "anv: Fix the comparison in an assert", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "e03f9652801ad7f70091e084535a3fb6650c3acd" + }, + { + "sha": "299fad5585c2b969ceebc5ad2bd4df1ad446ee1a", + "description": "gitlab-ci: bump Vulkan CTS to 1.2.1.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af6d8dea008bfc1a2cc5e82caa2f36ff6657deaa", + "description": "gitlab-ci: do not set the number of deqp-parallel jobs for RADV CTS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4668a08e9d7bf73683f9527920fc544eb95e8497", + "description": "gitlab-ci: allow deqp-runner to use the maximum number of jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "888b41f0ee0cc0f79c299d5a2a9149ae13306c12", + "description": "gitlab-ci: remove useless 'patch' package in the VK test image", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3349fe9a26d10a2d06c3f4a8ad0e5b554da2a243", + "description": "tu: Rewrite border color handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6dad10d0474cc80228636d6deb45b6025583566", + "description": "meson: Avoid duplicate symbols.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b61ad372d51681a3fb41b2dc21d2d58eb2becac", + "description": "Revert \"ci: Remove T820 from CI temporarily\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "089c8f0b8da86a05bde8359c84085e0b795abf17" + }, + { + "sha": "bbdb4b1a6d9c1f211ef7e67f3dcdf92de24c3a67", + "description": "gitlab-ci/lava: fix handling of lava tags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd1436440bd84d0b48fd7282b8f012ad382483ed", + "description": "iris: allow compression conditionally for images on gen12", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d836f3fadfbfe3dfc8bd1b60b2146f676e8ff8a1", + "description": "isl: allow compression for storage images on gen12+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd132a8eed94955332db6c8b553141f1b261066f", + "description": "iris: determine aux usage during predraw and state setup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4c879e69e2e54d3f422367a51dc4a4a82dddf22", + "description": "iris: move existing image format fallback as a helper function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe2baf72e7b8068cc0abce3f0bb9bd81565c8cf8", + "description": "iris: provide dummy iris_image_view_aux_usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e8f0483ec408037ce7b7c6014674f13cc4461079", + "description": "intel/compiler: detect if atomic load store operations are used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6dd654ba419d792806366f43ba9325f52eab9488", + "description": "iris: use the images_used mask in resolve pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5910c938a293c03337911ca3c067b4ecf4b406ee", + "description": "nir/glsl: gather bitmask of images used by program", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51b1b102bd619b6a802807bde5f5228c1dabd1d7", + "description": "st/mesa: Fix signed integer overflow when using util_throttle_memory_usage", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "21ca322e637291b89a445159fc45b8dbf638e6c9" + }, + { + "sha": "b93a1952258ebef6319fd4f4186d704e04b3064c", + "description": "isl: Avoid EXPECT_DEATH in unit tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e523c9265d3a6f092bad17364cf7d7f5ead6e98", + "description": "gallium/swr: use ElementCount type arguments for getSplat()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a19d8c836f2263a2edeca4aea52676ecdbf2e8e9", + "description": "etnaviv: enable shareable shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe204de632da54bc0902313a943374a413023e90", + "description": "etnaviv: get rid of etna_spec in etna_context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4432dd6ea4fef2ae0f9a1bb3240d7df216cddf08", + "description": "anv: Dump push ranges via VK_KHR_pipeline_executable_properties", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "625d8705f02e211e2733c3fe12845505725c37d4", + "description": "aco: don't stop scheduling at exports", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b4c31f814b8ba61c94506f42f609a2f36ab1fbb", + "description": "aco: allow barriers to be skipped during scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "928ac97875c267690c2e15f7de94b9f935afed80", + "description": "aco: add helpers for ensuring correct ordering while scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cd760847a45f4daa7f73a7d0a6a606e3b4f1f07", + "description": "aco: add helpers for moving instructions for scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d295ab3f35acd796826d6f06f798d8618b1d814", + "description": "radv: add llvm_compiler_shader() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d991c2de46794a5a7c10110677d55a498893586", + "description": "radv: remove unnecessary LLVM includes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ea32a6201983596fcabae04cb781d7d1f456636", + "description": "radv: remove radv_shader_variant::aco_used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3fea9481776bae953f718a8630a975e027f1bc5f", + "description": "radv: cleanup occurences of use_aco everywhere", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1305b932747ff4f8ab0253b12dc979dcbfd6777b", + "description": "glsl: do not crash if string literal is used outside of #include/#line", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "67b32190f3c953c5b7091d76ddeff95c0cbfb439" + }, + { + "sha": "f8051f77ea00934cc00d9f55f1a737c50037af65", + "description": "anv: Remove duplicate code in anv_cmd_buffer_bind_descriptor_set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a5053b687e7c9b69ecc743a770dff99952071b3", + "description": "anv: Reduce compute pipeline batch_data size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "925df46b7e00ee7ca4128ef2cc2f3fad68ce8d4e", + "description": "anv: Split graphics and compute bits from anv_pipeline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af33f0d767a72dfd89246947d89d28d3157b4f59", + "description": "anv: Use a separate field in the pipeline for compute shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bff45b6a7f57694bcc0d8bb47fbc55402911113b", + "description": "anv: Decouple flush_descriptor_sets() from pipeline struct", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6df0ac265397420cbe43e2091c0e0da1fa57e9d1", + "description": "anv: Decouple flush_descriptor_sets() helpers from pipeline struct", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1c13f01aa8f231cd377506b3932e6300f429b79", + "description": "anv: Remove redundant check in flush_descriptor_sets() helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eec04c0aae3f6dce9ac35bbe9d75fa81460c74c8", + "description": "anv: Pass the right pipe_state to flush_descriptor_sets()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88df3bf79ae96b0f03332828125c9e2dc653494d", + "description": "anv: Keep the shader stage in anv_shader_bin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9bf044d2541e1612419ff2ba41758e71a6fd9a9c", + "description": "anv: Use a dynamic array for storing executables in pipeline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b0682df82041fe1ba7136a97a74be7ba4c08de7", + "description": "anv: Use pipeline type to decide whether or not lower multiview", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "613c9b78e3a9fa08bda2c671543d2ef0caba9dc2", + "description": "anv: Add a new enum to identify the pipeline type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0a52432b19f1d3e61b20a5dd4ba1a1e5546bd7d", + "description": "glsl/tests: Fix waiting for disk_cache_put() to finish.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "50989f87e62e0b9a4796c565a103ce45c684c673" + }, + { + "sha": "e178bca5cc194ecb3e6b447620045a72f9cc465d", + "description": "glsl/tests: Catch mkdir errors to help explain when they happen.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d54b84d49de991188a6a91bbadf00e89654f2c0", + "description": "intel/fs: Combine adjacent memory barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf432cd831c789c02f9474f836e3259f2a73abd3", + "description": "nir: Add pass to combine adjacent scoped memory barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d31a8ed8fd560fd71be10956d09987dc47f20a01", + "description": "nir: Reorder nir_scopes so wider scope has larger numeric value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67fc88fbb9af53924dbaf8634ec7b1fb069ec340", + "description": "nir: Don't skip a bit in nir_memory_semantics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a46e9f4d9ac827e4e5ecfee4bf24e38cb09e349b", + "description": "radv: use ac_gpu_info::use_late_alloc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "741dd9e32bdfeb32ebccbe131d2eb5e82769b0a5", + "description": "radv: rewrite late alloc computation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74e7b442f21db806a296876b84a332d212cef77b", + "description": "radv: tune primitive binning for small chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22d3e047e570b098729a982901b5338b997c80a0", + "description": "radv: use better tessellation tunables on GFX9+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d27022ce14c2b44b1ac7cbed6b95c7fc8ceb20e", + "description": "radv/gfx10: cache metadata in L2 on small chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6310c666a4339d8e0460dd2e1daa3fad500ae0ca", + "description": "intel/isl: Set DepthStencilResource based on aux usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f047e504a51cde0a0789a848e079ab6b7d470d63", + "description": "intel: Require ISL_AUX_USAGE_STC_CCS for stencil CCS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56e15bf31c0a88d220d5907a533d59ca6341d96a", + "description": "iris: Use ISL_AUX_USAGE_STC_CCS for stencil CCS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69a0150e4e8c3a9c46375dbfb49d3ada9b47b7f1", + "description": "intel/blorp: Allow STC_CCS in blit sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fa92cd015b812140b87e8ce5b4574f76f0f02e0", + "description": "intel/isl: Add a separate ISL_AUX_USAGE_STC_CCS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "05a8e981ad6d359c0d748fe9fdda5e1270d53d78", + "description": "intel/isl: Require ISL_AUX_USAGE_HIZ_CCS_WT for HZ+CCS WT mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff1f0a720d8edcfc09aa41c720ba8de3afe88d72", + "description": "iris: Use ISL_AUX_USAGE_HIZ_CCS_WT to indicate write-through HiZ", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e13ed0e9e548c99e484e9533b0030ce555465fd7", + "description": "intel/blorp: Allow HIZ_CCS_WT in copy sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "98dc7f56b7d17cd56ab43768058a8d9c5a8f2e0f", + "description": "intel/isl: Add a separate ISL_AUX_USAGE_HIZ_CCS_WT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "feaedc1fbe43ed4ad4978b9ff6815711ffd640a2", + "description": "intel/isl: Clean up some aux surface logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84f97a21a669a42d458d9d8c7f52b65c4af355b5", + "description": "ac: disable late alloc on small gfx10 chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ba5e94c50cbc79fddc8c764c6569a0da2092b58", + "description": "ac: add radeon_info::use_late_alloc to control LATE_ALLOC globally", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09295e95eba6f015d1731b589070cf5bbef3d581", + "description": "radeonsi: tune primitive binning for small chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "629b6ddd7106bd31ebb44308bd307be2a5bf6bd4", + "description": "radeonsi: set better tessellation tunables on gfx9 and gfx10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf5b65d0fdfc49a6c2cbdc10fc4b6990f992deea", + "description": "radeonsi/gfx10: cache metadata in L2 on small chips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6e97ea92e07b78494f08197d9d5d1f35e1c0b60", + "description": "radv/sqtt: describe layout transitions with user markers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b229302b96cbd6d1e87ca73e0b41962ad34b7da5", + "description": "radv/sqtt: describe begin/end subpass barriers with user markers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90550b2a3e0809c47f51f97a84a91bb3f61103f7", + "description": "nir/algebraic: coalesce fmod lowering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acd0dd3b4b223a423fbe9ffd118c3fbbf119d993", + "description": "nir/lower_double_ops: relax lower mod()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b83c9aca4a5fd02d920c90c1799137fed52dc1d9", + "description": "amd/llvm: Fix divergent descriptor indexing. (v3)", + "nominated": true, + "nomination_type": 1, + "resolution": 3, + "master_sha": null, + "because_sha": "028ce527395642b68612d10c6030be5d4706a65e" + }, + { + "sha": "ba88e951871ae1df5ba567c8f4071dddbe50e286", + "description": "intel/fs: Fix NULL destinations on 3-source instructions again after late DCE", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ba2fa1ceaf4ccb905e1d841b45f88505449db44e" + }, + { + "sha": "cfa299eadb21893348c60906dfde8feb175c7f14", + "description": "radv: Enable subgroup shuffle on GFX10 when ACO is used.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "967eb2326155eaa7f2f3d3b8c459a2cb82eca1dc", + "description": "radv: Enable lowering dynamic quad broadcasts.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec16535b493b54c8c039576c0303f324242ae3fb", + "description": "nir: Add ability to lower non-const quad broadcasts to const ones.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3aa83d809f6dd61e8052d39e5b3cf048c6fb8223", + "description": "gen_release_notes: resolve ambiguity by renaming `version` to `previous_version` and `next_version` to `this_version`", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64af6b3bcf8f976ce1739798cbdfdbf334f017d9", + "description": "gen_release_notes: fix version in \"you should wait\" message", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "86079447da1e00d49db0cbff9a102eb4e71e8702" + }, + { + "sha": "dcc50f4302d9904e5c433d8bd81af6fcb3159479", + "description": "pan/bi: Interpret register allocation results", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e8139ef6453aa3a8da5a07be74dcb80a35f083e3", + "description": "pan/bi: Add register allocator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "116c541c0745b9eb6dba3ba3d2567a1fde90cf03", + "description": "pan/bi: Fix missing src_types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1d95339254361d4a481b35b3d2adeb4ae417d03", + "description": "pan/bi: Fix vector handling of readmasks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c63105f98860e96257f9c457aa3ef3c8b21edd9d", + "description": "pan/bi: Minor fixes in iteration macros", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "545dedba13a06d28fa05f3e85bd668db8ddfe80a", + "description": "pan/midgard: Remove incorrect comment in RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f06db4d54c1a83005e4e0e00a9d2fdeb5bface29", + "description": "panfrost: Move lcra to panfrost/util", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d0203aa830a6dfabb163514a5a8cd5b5b7bd86e", + "description": "glsl/list: use uintptr_t for exec_node_data()'s subtraction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85d05b3fd713d67aa764dff60467a896a7aa1011", + "description": "aco: fix uninitialized data error in waitcnt pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc320ef9af6b84b6a1f275261b071d05c0ee6a62", + "description": "ac/llvm: add missing optimization barrier for 64-bit readlanes", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "0f45d4dc2b15e137346e1e3f064a24302e1c9048" + }, + { + "sha": "9c53a3bb223592471d38a4efbfca8d58dd47e432", + "description": "iris: toggle on PIPE_CAP_MIXED_COLOR_DEPTH_BITS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1896b44aee637929e1e6deaca7518a7475c4d867", + "description": "turnip: Add tu6_control struct.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4f1697b54453e3ecf132049feff60c518fd7c7c", + "description": "turnip: Enable VK_EXT_transform_feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a45c84672f7eafef23c6210f3d8dd56ae020242", + "description": "turnip: Implement an empty function vkCmdDrawIndirectByteCountEXT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ff1959ca5d24afe48bec20edf7e3d059d254134", + "description": "turnip: Implement stream-out emit and vkApis for transform feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "374406a7c420d266f920461f904864a94dc1b8c8", + "description": "turnip: Setup stream-output when linking program", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82fdb13c25648de2fc4d381699f5bbbd2d8768f4", + "description": "turnip: Define structs for transform feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a1d6b81ed54971d33e83b7f5545da096b13b043", + "description": "turnip: Gather information for transform feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31c420565c713a0398c7b872119acaf4f8dc3978", + "description": "egl/android: set window usage flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf5ba9d409bcae3496d094def5c3b199a8847b3c", + "description": "ci: Make a simple little bare-metal fastboot mode for db410c.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d51da8610f6c5c0a22f2a7b1c9bfd596a7c46949", + "description": "ci: Fix installation of firmware for db410c's nic.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff1183648a69f8d082a3fa0b067ccc66b5a78c23", + "description": "ci: Print the renderer/version that our dEQP invocation is using.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32b7ba66b0156d9fd40b059f20da79a74451f7fd", + "description": "intel/compiler: fix cmod propagation optimisations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b76b3bc09c6db2e218f903e0d1c7fb68c9e6458", + "description": "pan/bi: Fix swizzle for second argument to ST_VARY", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6d96aa962d5497a3fb12b02a47ff9777e5cbfd8", + "description": "pan/bi: Implement nir_op_ffma", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58f91718944a0cabdd907ed87efe7a239e69a55d", + "description": "pan/bi: Add dead code elimination pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56e1c606f89134e7033e25ca65a23478e13365b8", + "description": "pan/bi: Add liveness analysis pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bff6e5e076e5ae7f188b07ce069647ef7eff0c6", + "description": "pan/bi: Add bi_max_temp helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6e0479a6a88656205a1907c8987666f415a7c4a5", + "description": "pan/bi: Add bi_next/prev_op helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e623007eb786ddc5fb06133f3d7c27f9a2eb18f9", + "description": "pan/bi: Add bi_bytemask_of_read_components helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e94754a7c47bd59526de72115576519e015f4d76", + "description": "pan/bi: Paste over bi_has_arg", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b75f410c44053a4fc84715dec473dadedf7aa14", + "description": "panfrost: Sync Midgard/Bifrost control flow", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "933e44dd435f285e652d29389456dbafca121482", + "description": "panfrost: Move liveness analysis to root panfrost/", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5aaaf7b12c037b25f4c0a06af4744a8893c25e50", + "description": "pan/midgard: Subclass midgard_block from pan_block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5dd1d542dea49a19ad3686d26a895395f7f7849", + "description": "pan/midgard: Sync midgard_block field names with Bifrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4998925d6a1225ea4b4ad93e38e7eaaac66fa505", + "description": "pan/midgard: Decontextualize liveness analysis core", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3bbec3bc64378c25f03c841c443b7e7c4222cd83", + "description": "pan/midgard: Localize `visited` tracking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "218785c4a95319145b194db4ca9fe9fbc0713461", + "description": "pan/bi: Implement sysvals", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6f5ae88a7ff758bc9a506488f7930d53b68ab19", + "description": "pan/bi: Switch to panfrost_program", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e610267510199532fe22b3c62c0ec68c12918ad4", + "description": "panfrost: Move Midgard sysval code to common Panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b756a66607d1870f96470dc121e73a95d71f04d4", + "description": "pan/midgard: Remove dest_override sysval argument", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2ff3bb0fea546015755914ac7bdb477c2007da3", + "description": "pan/midgard: Decontextualize midgard_nir_assign_sysval_body", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "674b24dcfd34ea8c6c2ee9a0232f59519ce1d3c1", + "description": "pan/midgard: Remove indexing dependency of sysvals", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c2647f4117a5ec73c0ce12d224318ec5557f31d", + "description": "pan/midgard: Adjust sysval-related prototypes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3f438e0236828839ebf639f8118cb697377bbe1", + "description": "pan/midgard: Remove unused iterators", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3a4524e2fece57502c93318351534129874c37f9", + "description": "panfrost: Promote midgard_program to panfrost/util", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "529c0ba2199852e7d894955e30620885f3bffd47", + "description": "gitlab-ci: build RADV in meson-i386 to avoid 32-bit build failures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f0178f516f5f96c599592d86c3975234264fefc5", + "description": "radv: fix 32-bits build (again)", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "dcfc08f5b8a322cf015f7216fb0aeada117d53a4" + }, + { + "sha": "fb477cc42186d4809b955072a1c2336d64f07944", + "description": "mesa: don't unroll glMultiDrawElements with user indices for gallium", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70298ec4c0e43a9dcda828e74d65d87dc6e3b9d4", + "description": "gallium: add PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "510bd474e6744a44b3cfff7c03b7768f211f8ae9", + "description": "vbo: fix vbo_copy_vertices for GL_PATCHES and adjacency primitive types", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "4c6323c49f1f394f54ea9b9d8e514c2706e3984d" + }, + { + "sha": "218dfd8c1a50515105199704d7060ae1b0d3b101", + "description": "vbo: fix transitions from glVertexN to glVertexM where M < N", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "1f6e53e2437690de9a84e47f5587ff359b8484f2" + }, + { + "sha": "ec7d48afc40343bf314d28a7276500efb1e52361", + "description": "vbo: use vbo_exec_wrap_upgrade_vertex for glVertex in ATTR_UNION", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a398a9d7e7f8fe19eaa0c33b36ab6816472b698c", + "description": "st/mesa: keep serialized NIR instead of nir_shader in st_program", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "86d270cde49f4d74d5c1f45851ce571686ca01cc", + "description": "gitlab-ci: Don't restrict ppc64el/s390x build jobs to gstreamer runners", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bbdf215fbd8bd3b48287079bb7b580aaa870ad2b", + "description": "gitlab-ci: Sort packages to install alphabetically", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5235a5b73116ee285bb7f3d50bb5ad4e1269a11", + "description": "gitlab-ci: Remove unneeded python3-pilkit dependency", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "52c53c4a49547a55cac211e476474fd2fc5614ad", + "description": "gitlab-ci: Fix indentation and dangerous \"\\\" in the last multiline line", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b760ccfedb69c926803a4b8d9e61f800e9fc70ed", + "description": "vc4: Fix query_dmabuf_modifiers mis-reporting external_only property", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61f2e8d9bbde0d4416663fd61c4f63a5a763e96a", + "description": "aco: Don't store TCS outputs to LDS when we're sure that none are read.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b36d8c23ac405be98a0e83ace1bea4d7a95df82", + "description": "aco: Only write TCS outputs to LDS when they are read by the TCS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4dcca269455adb1029334cefb035fd19d9d99d50", + "description": "aco: Store tess factors in VMEM only at the end of the shader.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c3ab49c6b48299935751009c4109a4d2a3b8912", + "description": "aco: Don't generate an if when the first part of a merged HS or GS is empty.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9695013986d3341e5bb74cfc09dc492204129f5", + "description": "radv: Enable ACO on all stages.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cec6a856e53c2a47370652269ec4619e5cf895b9", + "description": "aco: Enable running TES as ES, including merged TES+GS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fe5eadfaeef515af11007d070df785be8831363", + "description": "radv: Enable ACO for TES when there is no GS.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "926bdfae7dcc8bb0c3f5748b5563fb417cd6b5fe", + "description": "aco: Implement loading TES inputs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec56a7093ce21ee63ca3e153613e494872a403f3", + "description": "aco: Enable streamout when TES runs on the HW VS stage.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6047e51430ed423635090a30e965db7f01eb6d72", + "description": "aco: Store TES outputs when TES runs on the HW VS stage.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d9d1cbce9cda14bada3723134c651c226002012", + "description": "aco: Use TES output info when TES runs on the VS stage.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e8f4baede5de386e71c0c04f05ca8f8df813bba", + "description": "aco: Setup tessellation evaluation shader variables.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80d281c6dc9e64f51f7ffb0b649148bc7f71917f", + "description": "radv: Enable ACO for tessellation control shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a952bf394609134ff96f4bebb41bd022c621bfa6", + "description": "aco: Fix LS VGPR init bug on affected hardware.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57a7d58c5d7651ac10a41f08afd02f84064abbb3", + "description": "aco: Store VS outputs correctly when tessellation is used.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b7f196fbc820b4f096f80428fb980dc04243255", + "description": "aco: Implement tessellation control shader input/output.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "655c050119719e185ae41bdafb1e62d71ccc3069", + "description": "aco: Fix combining DS additions in the optimizer.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c70b0d0267234716e94aeaf0e585f27c8a8e21fc", + "description": "aco: Slight fix to lds_store and lds_load.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "db93af5f1b7a04fba0899e45b64204766c6ec4aa", + "description": "aco: Refactor VS output stores in preparation for tessellation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0062bb04accb7fdd6174b45b7db8c0fb962bdf02", + "description": "aco: Refactor load_per_vertex_input in preparation for tessellation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e692d65e1c1572a23a0ad6bbe3fa7fa543c9d20", + "description": "aco: Introduce new helpers for calculating address offsets.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19d5dc9ceea8243596903fed60716a3318d0d653", + "description": "aco: Introduce new VMEM load/store helpers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fc1da208efa6333ce4fa6836d0348313085d635", + "description": "aco: Remove esgs_itemsize from LDS alignment calculation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca342701c578575f842a679680c4be2d79ba1873", + "description": "aco: Extract LDS alignment calculation to a separate function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe80f22470a194f2736e2277f41acf246f27f1c8", + "description": "aco: Remove vertex_geometry_gs assertion from merged shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f53d31fb9b27b490a8773173707b244c5826b5a3", + "description": "aco: Use mesa shader stage when loading inputs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90167112736c603f9f839506e4aa69fe3b8c848d", + "description": "aco: Setup correct HW stages when tessellation is used.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89ff5b1e514e5473a3fa2700517904caf0bfdfa2", + "description": "aco: Implement load_view_index for TCS and TES.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa5eed673c42cfd20bb49410e10c78b46e405590", + "description": "aco: Implement memory_barrier_tcs_patch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a8d15ab6daf0a07476e9dfabe513c0f1e0f3bf82", + "description": "aco: Implement control_barrier for tessellation control shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2489e4dfd183919028d5a346c2dffc6138c7269f", + "description": "aco: Implement load_invocation_id for tessellation control shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5107b0312a9583fa897f0fb82889df62e19803c5", + "description": "aco: Implement load_patch_vertices_in.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6edf6ad130af7cd02bc5beb0f9d68292fee1557d", + "description": "aco: Implement load_primitive_id for tessellation shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "754837f3b5f1548a1d0f5689c3d340092d64704d", + "description": "aco: Implement load_tess_coord.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ca2b254ca23781fba8eb7d1f1ca64519089088b", + "description": "aco: Setup tessellation control shader variables.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b3316f3c9930c1991fbb512897d77001644bfa5", + "description": "aco: Extract setup_gs_variables into a separate function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "346bd0c623fdc9882e00fdb3301b73afb9fd3fe8", + "description": "radv: Move some helper functions to the radv_shader.h header file.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "78d42d41d47f65f20cfd9ce984eebb9282ed96e4", + "description": "vdpau: remove bogus assert", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "24f2b0a8560f34745854bf8263fa7c2d0f95f2bc" + }, + { + "sha": "b6cebf64394858be8e896c73de7d896f2bb071f7", + "description": "radv: do not recursively begin/end render pass for meta operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c78e88e8a60fdc44cd5f95177a00aae480616559", + "description": "lima/gpir: print acc ops even if we have only one source", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "492ef353fb3e48e77469d7787d3fd2d3d8d9202d", + "description": "lima/gpir: improve disassembler output", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bcbc2b61b53ba39df320777ede30117b249d9738", + "description": "lima: print gp uniforms if gp debug is enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f5543990e250d4a83467c8e9d6d663dc20d6d4b", + "description": "gitlab-ci: add rules:changes for RADV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be22995ecf868a90c6b14fce9b907cf302459e71", + "description": "gallium: hud_context: Fix scalar initializer warning.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09fbde830f30c8a316710ef827c323be8e43bc7c", + "description": "panfrost: Move pan_afbc.c file to the the right Makefile.source file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67aae8f98fdda52318c6f314210492a92be7cdfa", + "description": "freedreno: Add ir3_cf.c and ir3_delay.c to Makefile.sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2dc300421d3079d653f106a876263904ba0faacc", + "description": "gallium/cso_context: remove cso_delete_xxx_shader helpers to fix the live cache", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "0db74f479b9c5efe52c5d021fe04ba5ce1f4e1bd" + }, + { + "sha": "1fa259b035c000b590a91a5b51412a2ff1972ab0", + "description": "vulkan/wsi: fix cleanup when dup() fails", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f5433e4d6ce247b86daed741c07aa99f2bd02c0d" + }, + { + "sha": "6e035c01fb95686b9c48f2930104b90c7d12f0f7", + "description": "Revert \"gallium: make handles of set_global_binding 64 bit\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "e1ffb72a05f9b50ee47767aaadbab3e47896ee14" + }, + { + "sha": "e1ffb72a05f9b50ee47767aaadbab3e47896ee14", + "description": "gallium: make handles of set_global_binding 64 bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0541350e3a3cca58484880df04c0db160180b726", + "description": "pan/bi: Implement comparison opcodes via BI_CMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6409896ca70d6f7cbcc95b370118c7fa95b7220f", + "description": "pan/bi: Print source types unconditionally", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20c7d57ede3a5e71aa6e2ef901d42eafcc39f1ae", + "description": "pan/bi: Specify comparison op for BI_CMP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08ab7cecd98ca5e128ed3bfc7cc38b0a24b37181", + "description": "pan/bi: Lower b2f to bcsel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3823551b4cbdeda12ddc65dfb7519258627a02e", + "description": "pan/bi: Implement nir_op_bcsel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3a1baafede488987ea3237c1b8990332c6b82f89", + "description": "pan/bi: Import algebraic pass from midgard", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55f0d811e4aeba74842c8a86e02889ae3e86c994", + "description": "pan/bi: Add isub op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acab788578cd1ccd67b042798786a39394ad5f03", + "description": "pan/bi: Disable lower_sub", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1216a63ff22277fede7d7953019fc3412714a4af", + "description": "pan/bi: Implement fabs, fneg as fmov with mods", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ed79c9ed756a550f98f588abb46099916401cde", + "description": "pan/bi: Handle special ops in NIR->BIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b674e39d7224e228aacaa16beee718f059c31f09", + "description": "pan/bi: Add BI_SPECIAL_* enum", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c862234ab3002b5d257e7ea3d76cddad8ca461ea", + "description": "pan/bi: Add a bunch of ALU ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a5896cd76629ed2a2cd87ecdcc1f8a723280cae", + "description": "pan/bi: Implement fsat as mov.sat", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48e50efd5de7b9ad79f89ba5183a1f45214d501d", + "description": "pan/bi: Allow inlining constants", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "929baf3f88d381313dce7883dfe827305ce55702", + "description": "pan/bi: Add initial handling of ALU ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "330e9a66968056330ac030e4ecb738eb3c8489e9", + "description": "pan/bi: Lower vec* to writemasks in NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69c66ffd8440a6763977641570ffde4fdd5eb3d8", + "description": "pan/bi: Remove bi_load", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9d480ca1bcf9d410535c95cbe2f93c802b82409", + "description": "pan/bi: Introduce writemasks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "795646d8f863ec2200fa8b92c036b0897f2bdd2b", + "description": "pan/bi: Generalize swizzles to avoid extracts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9b8cb9f5aee3428e49d80b2154718cae6c29938c", + "description": "panfrost: Move mir_to_bytemask to common code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba03e308b66b0b88f60b99d9d47851a5e1522e6e", + "description": "freedreno/fdperf: set locale", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30dd0599250c4743ded25663d32c263ab226510c", + "description": "freedreno/computerator: add performance counter support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af68b0d3460259b4d9255c36f31381059e482019", + "description": "vulkan/wsi: Return an error if dup() fails", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34d2637fa76ba4dd1969f06352e191ccb228d8f3", + "description": "vulkan/wsi: Don't leak the FD when GetImageDrmFormatModifierProperties fails", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c96e25de77b4879f54ef5727378dacf13e7e398", + "description": "freedreno/ir3: try to avoid syncs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc82521de4e8e85022a5facb1b5f52d5139d3022", + "description": "freedreno/ir3: round-robin RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2b349096f03803b974d1d942cfff37f77325bee", + "description": "freedreno/ir3: track register usage in first RA pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ae93be8fbdf753e07fbf1b707ce3bc89105b5a9", + "description": "freedreno/ir3: fix has_latency_to_hide", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6eb11295a7fab543d738263798b96883e3a658b", + "description": "freedreno/ir3: split out has_latency_to_hide()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd2e050a8422aed2f4de601b3894cbba94ba30bb", + "description": "util/ra: move NO_REG to header", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36aed70b591f7f4f642b26f46f7928be6d137e7b", + "description": "util/ra: spiff out select_reg_callback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3efa2a4da206112f6c9b5adb2df37c2efe646e6", + "description": "freedreno: fix FD_MESA_DEBUG=inorder", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "2c07e03b792d57ae807a6953f0d8ff5f4bcdffd0" + }, + { + "sha": "752b9985bed171a39bb439421d0e2cd8d0ab82aa", + "description": "freedreno/ir3: add simplified stall estimation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64ae2ef8bbc63750346345e331750f0e0c643103", + "description": "freedreno/ir3: remove extra nops inserted in scheduler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad2ff7a278a80a05021c68fa014731e8d9c10713", + "description": "freedreno/computerator: add hrsq/hlog2/hexp2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a8e4c18d2f088458664363eba37173bd457bab8", + "description": "freedreno/ir3: also lower lowp frag outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3535797e8c991d5159871f1517fde107b6990a96", + "description": "nir/print: show variable precision", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10eee6d8c630a346cb7d531263f69151016946d6", + "description": "intel/tools: Fix compilation with UBSan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74be835a84aaa9637c550ae4d71a2dfc66dc6990", + "description": "i965: Use gl_vertex_format in brw_vertex_element.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e62b82a693e6835dce809e924b4a2335486659d4", + "description": "i965: Make use of the vertex format functions in i965.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf929823bf8253388a863fa495844380060f68fb", + "description": "mesa: Provide gl_vertex_format accessors.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1641c872ed36ee92ce8136385936e19f8535bec3", + "description": "mesa: Remove now unused _mesa_draw_attrib.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "305724dd7b716fc1c577fde6a0dd3fa62fc0f502", + "description": "mesa: Remove now unused _mesa_draw_attrib_and_binding.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ccda7bfd9712f1a9e29b9209e444c8e5652167c", + "description": "i965: Remove glbinding from brw_vertex_element.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38db4f17200cb5bfeb550bb732ccace5052afb04", + "description": "i965: Reorder workaround flags computation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e53fd073beabfa36338fa349dedfa83e0d0a4d92", + "description": "i965: Split merge_inputs and clear_buffers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de579ffba2e4968cf9aca046369c3cbb6684a1e2", + "description": "i965: Test original vertex array pointer to skip array upload.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b684030c3a656ffdbc93581856034e0982db46fd", + "description": "i965: Use the VAOs binding information in array setup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1f2c84282bbcce9d7e94ac99d2ff1b3489f6e67", + "description": "i965: Use 32 bit u_bit_scan for vertex attribute setup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ea3ca3eca4c0c7ff3b41ff09e6cb30b532c8bc5", + "description": "iris: Move down iris_emit_sbe_swiz in profiles.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "630154e77b778ccb594be9e572988b05b0fc28e1", + "description": "i965: Move down genX_upload_sbe in profiles.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1a6a15aafd1bca30753abe626b85f26f029207a", + "description": "panfrost: Get rid of ctx->payloads[]", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "093da77ce68dd7c57e330f817f51ffa63e99783d", + "description": "panfrost: Use ctx->active_prim in panfrost_writes_point_size()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d66ef690d127a37e6832c1d0e9fee0f48e2c6232", + "description": "panfrost: Re-init the VT payloads at draw/launch_grid() time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "836686daf36cd8d7c17c909d98f5c9e07549d674", + "description": "panfrost: Move panfrost_emit_varying_descriptor() to pan_cmdstream.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b95530bef23826d3d6ca2e2e2d0ad2d74e572a47", + "description": "panfrost: Move panfrost_emit_vertex_data() to pan_cmdstream.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "251e685e726173f7256d12bdb6ce882859994793", + "description": "panfrost: Inline panfrost_queue_draw() and panfrost_emit_for_draw()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d9995e82cdbf41efb7fb886922244eb958e4a9d", + "description": "panfrost: Move vertex/tiler payload initialization out of panfrost_draw_vbo()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "13881a4dada8653a5857a1dad969f7c4acf3db0c", + "description": "panfrost: Move streamout offset update out of panfrost_draw_vbo()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "046c15458575f826da430d86a3056876f1bcc638", + "description": "panfrost: Rename panfrost_stage_attributes()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dcc0b1ff01254554dfc0b0b777bab9ba404fe5b8", + "description": "panfrost: Move the mali_attr.src_offset adjustment to a sub-function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "575f62ea02ecd420572fd952cff5da0304ade17c", + "description": "panfrost: Emit attribute descriptors after patching the templates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a2ee61a22f5b408a4ac5377b735df3d9d312f50", + "description": "panfrost: Prepare attribute for builtins at state creation time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b692ab076a72be97bea6bc527ea1c2e55c5f4d3c", + "description": "panfrost: Ignore BO start addr when adjusting src_offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "128820b88681dbcad156138594ca846c95904ee8", + "description": "panfrost: Drop initial mali_attr_meta.src_offset assignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "528384cb6dedfa43980e89ef8dc8c8380d8645a3", + "description": "panfrost: Add an helper to emit a pair of vertex/tiler jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e0a08bc8eb2214bd2ba378a894c0b40568444cc", + "description": "panfrost: Move sampler/tex descs emission helpers to pan_cmdstream.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b946a1d2b5319d2a24c04c79a64706b89efcc9e", + "description": "panfrost: Add a panfrost_sampler_desc_init() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b02f97c87559256086a935836abae3f67ad478c6", + "description": "panfrost: Prepare shader_meta descriptors at emission time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55e014336fa69545b71f15c627bb29a7d7c39f7e", + "description": "panfrost: Prepare things to get rid of panfrost_shader_state.tripipe", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e94076f8f59c25ba1ccb5e3409b9587a9d0845e8", + "description": "panfrost: Add an helper to update the rasterizer part of a tiler job desc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56aeb921e92ace34e84440ff679590b98895b9e7", + "description": "panfrost: Add an helper to update the occclusion query part of a tiler job desc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f043cc7766fc506f8de88d38cf9b60d7a47afbf", + "description": "panfrost: Simplify panfrost_emit_for_draw() and make it private", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ac17139b1c9b5f8b017c389a29fd373ecfd9e55", + "description": "panfrost: Stop using panfrost_emit_for_draw() for compute jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d75eb002e1e3444052eb93046368dddea9b576c", + "description": "panfrost: Move panfrost_attach_vt_framebuffer() to pan_cmdstream.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d33d42b4d42abc9fb8b212222ca5f6d17d94e7d", + "description": "panfrost: Dissociate shader meta patching from the desc emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36725be4d952117f75c112db7eecbd3bc1f3139e", + "description": "panfrost: Move shared mem desc emission out of panfrost_launch_grid()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b735a2d808205c8a3e5e496d85693db85f7716c", + "description": "panfrost: Move the const buf emission logic out of panfrost_emit_for_draw()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a72bab1c3e6b2732f2f4e7f4f0a16ff1a8927ebd", + "description": "panfrost: Move viewport desc emission out of panfrost_emit_for_draw()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79f8850527756dd0d0e861a59d8c7e6700314a5d", + "description": "panfrost: Move the batch stack size adjustment out of panfrost_queue_draw()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b28f4bb67ce385b8e87c8d2c4d29195fe557547c", + "description": "panfrost: Add an helper to retrieve the currently active shader state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0402f79601c69089cc3cefab51adab9c3331669", + "description": "panfrost: Assign primitive_size.pointer only if writes_point_size() returns true", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24db276d11976905b2e8a44965c684bb48c3d49f", + "description": "radv/sqtt: describe pipeline and wait events barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c04e9befc0d3eaa4ec8e04af39a11f98c4a659ba", + "description": "radv/rgp: bump the instrumentation spec version to 1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac0d5b6b119e21b84f687b1b38a22c6f09332a12", + "description": "radv/sqtt: describe render pass color/depthstencil clears", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b829fbb7f0fa55a7a44a27ae2d44f9c986b213b9", + "description": "radv/sqtt: describe draw/dispatch and emit event markers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dcfc08f5b8a322cf015f7216fb0aeada117d53a4", + "description": "radv/sqtt: describe begin/end command buffers with user markers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31ecf0b17dc0dfcc70eb96295e52339e9f176c8b", + "description": "radv: initial implementation of the driver internal layer SQTT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "be700775dc2e97a414d14bc764cd2eb72a639306", + "description": "radv/sqtt: add a helper that emits thread trace userdata markers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4fbcfe8183b7253b411dd7ac2aadecd47bd76e4", + "description": "radv: use device entrypoints from the SQTT layer if enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c88e4a272adaf94082b36b4b94ce0c475e5f779", + "description": "radv/entrypoints: declare a driver internal layer for SQTT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a64599a303ee8ded4bd6b3cef1f720bb7c308127", + "description": "panfrost: Pass the sampler view format when creating a tex descriptor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce845f44e9e8109b9daa74bc15f98063c35bc555", + "description": "Revert \"panfrost: Z24 variants should be sampled as R32UI\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d0ec5b8a6fcf4ac14cb0a2346185aa24207c7d5", + "description": "gallium: Add forgotten docs for new CAPs related to transform feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "251c6991a3a3b6f25239ef746f786e91a7553798", + "description": "lima: enable minmax cache for index buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53d6bb9fc633a4d0ad99c25ac4a9ca09f12d87bf", + "description": "panfrost: split index cache into shared part", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "040a7117c3b404f82a39cf7b2b232a2149ddfeec", + "description": "st/mesa: fix a possible crash with selection and feedback modes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b0e043d486d06a17329b38f41a8e3703c85d356", + "description": "st/mesa: flush the bitmap cache before st/dri and vbo flushes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45d4665dc749fa52cc165d8d22356c8d8b5b3e22", + "description": "intel/fs: Fix workaround for VxH indirect addressing bug under control flow.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f93dfb509cbf9474befae9919dd8c135bbd67093" + }, + { + "sha": "c144875f624d17f3d28141f2ae7311865e7b03c8", + "description": "intel/fs: Allow NOT instructions in conditional discard optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba2fa1ceaf4ccb905e1d841b45f88505449db44e", + "description": "intel/fs: Do cmod prop again after scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "461ee852486da724c79c5145fa2e50bdfa54aa55", + "description": "docs: update calendar, add news item, and link releases notes for 19.3.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b06471b77d5c35e04499c7ac27fce8c21bbadd2d", + "description": "docs: add release notes for 19.3.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ffa6eab88332982466f7ba420eb804ed7f97694", + "description": "st/nine: Fix incompatible-pointer-types-discards-qualifiers errors.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "fdd96578ef2dfe9c4ad5aab5858036298d444a64" + }, + { + "sha": "c1b8e84961066a25e0950e7965285b47df4bb97f", + "description": "radeonsi: determine uses_bindless_samplers correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc65df56519af568c2e5954793c17a8aed858148", + "description": "ac: add a bug workaround for the 100% NGG culling case", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8db00a51f85109e958631ef74a458b0614f37097" + }, + { + "sha": "7481c4be583493374925dbe53703f992616ce91c", + "description": "radeonsi: add a bug workaround for NGG - LATE_ALLOC_GS", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ea2034f583a62f151f105db4eecfdb5f088e81a", + "description": "radeonsi: enable EXT_texture_shadow_lod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3728816afbbe39a563ec02ba65f3b1516d79881", + "description": "egl/android: require ANDROID_native_fence_sync for buffer age", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7976ed43a7faa8a987b7b90e8f05471fde70620", + "description": "radv/sqtt: fix RADV_THREAD_TRACE_BUFFER_SIZE spelling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7bbd10da23a4c9355e78cbbc4abbf024c0c1edaa", + "description": "docs/releasing: add missing
  • tags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68d8606c4c133489a2e95ec26b1f75b8ed5686e8", + "description": "docs: trivial fix for html structure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83e20139db7e55c40a7658a0bdcb115e790bd138", + "description": "glsl/opt_minmax: Add support for float16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3cc81e86cc71259626a52b057d67c4a77c19839", + "description": "glsl/lower_instructions: Handle fp16 for FDIV_TO_MUL_RCP", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fcac46cbd720ec88a6762cf5cda5906eb379c9d", + "description": "glsl/lower_instructions: Handle fp16 for MOD_TO_FLOOR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c1c2b779abfda8c7271240f24e92cb6970106a3", + "description": "glsl/lower_instructions: Use float16 constants when appropriate", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b39bb4fc05638c6c250e9b79c5c8dcf7361229c", + "description": "glsl/validate: Allow float16 in the expression tree", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "198d4a535b9f090f05137d335d3676f3cae1fc1f", + "description": "glsl: Add type queries for fp16+float and fp16+float+double", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad27eb28d95e10b72ec728f52f87364db8184e32", + "description": "glsl: Handle fp16 unary operations when lowering matrix operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b8edffaa5816449436fac981fd27005fcede8c9", + "description": "glsl: Add ir_unop_f2fmp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d6b007da8a36254e03e2ae9e83bd3330f528dc8", + "description": "glsl: Add b2f16 and f162b conversion operations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b9f6caf0676bf875d7b346f9d1838c2b6c6b2f1", + "description": "glsl: Add IR conversion ops for 16-bit float types", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "878a35db9dafa0cd8da19149d6f5c34e652a1459", + "description": "glsl: Expand fp16 to float before constant expression evaluation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "505428f20b082f04787630e6d0e5f4dfbce5efb7", + "description": "glsl: Implement constant propagation for fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83afebf359983b885b22320f4d66ca7c8007593e", + "description": "glsl: Add fp16 case for ir_triop_lrp optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "668ab9f19d86d34672c2e390ff6f412dae9c6191", + "description": "glsl: Add support for float16 types in the IR tree", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4068d6baff78b203477abbd3c3453a0058ecee56", + "description": "glsl: Add ir_constant constructor for fp16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b75a166e6866c4ab3f2c525763a25c9b7ab9fb56", + "description": "freedreno/ir3: Don't fold conversions into sign", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a9d6fdd8c5a94b574e241f9cad5662cbaef54b2", + "description": "gitlab-ci: rules:changes to test on tested drivers changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61fb17e8d74b9b38f54780483157682fe9d3e312", + "description": "amd: join emit_kill() from radv and radeonsi in ac_nir_to_llvm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bdd7587414441920743fe476270560722b6beb18", + "description": "radv: use nir_lower_discard_to_demote to work around game bugs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d64ad2fe791fcdc3c8c8c2115febdea7cd3e1ba", + "description": "radeonsi: lower discard to demote when FS_CORRECT_DERIVS_AFTER_KILL is enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de57ea2a3da2188e1c1d9fb043028c5f823cc755", + "description": "amd/llvm: implement nir_intrinsic_demote(_if) and nir_intrinsic_is_helper_invocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce87da71e93d9eea7e9a2667e3273cab9c97667f", + "description": "nir: add pass to lower discard() to demote()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5adcfa68a935f866dd76f87a189108fbbf226630", + "description": "nir: gather info whether a shader uses demote_to_helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "66bb314cb4f531e06e254b231ae1ded963d447c4", + "description": "docs: fix typo in v20 release notes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4390c232ade997537a083be567b28d3c542f0fb2", + "description": "Revert \"docs/relnotes/19.3: fix vulkan version reported\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "5ff443b8aa9650f907bd9b5524bab21febe42ec9" + }, + { + "sha": "24408acca47619bdb2da1d3f2fa91b1b6e092e16", + "description": "nir: fix compilation warning on glsl_get_internal_ifc_packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad66b25415745383aa9380975f16967bfb1022a8", + "description": "gallium/swr: Fix vcvtph2ps llvm intrinsic compile error", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33b255e107b470b8d2ea5bd96d82c613244aaf47", + "description": "meson: enable -fno-common by default", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "283e815339a15fa99039c69f1e225269790ae955", + "description": "omx: fix build with gcc 10", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e924181ea89e5e261f8aa24564c32ed22941e752", + "description": "intel/compiler: Discount NOPs from instruction counts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb3e7b0fe34e02607e14c812b7aa5a7d67f047d0", + "description": "intel/compiler: Pass shader_stats for each SIMD mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7d0460d585d6df7f2776e7c1c9b0109aed7dbe2", + "description": "intel/compiler: Pass backend_shader * to cfg_t()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "edae75037fe52a88d5f1d6c44484d714fac944d6", + "description": "intel/compiler: Mark visitor parameters to scheduler const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75a33e268ea4eed0391b1f77948337b747834545", + "description": "intel/compiler: Mark some methods and parameters const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03ac90aae517b6275809815a1b0223edd98eccd9", + "description": "intel/compiler: Make instructions_to_schedule a local variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43019c6f2cb6b35589213b3ae07b3859825ab1fe", + "description": "intel/compiler: Remove unnecessary local variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d0821a21676304822d3364e7ba1c064ca523825", + "description": "intel/vec4: Make implied_mrf_writes() a vec4_instruction method", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8f3d0a3a85244450d43da44cb8eed2389969b47", + "description": "etnaviv: implement emit_string_marker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "446062833022a86e2e679ba631931164c9e3467e", + "description": "etnaviv: increase number of supported varyings to 16", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53c6cb1bad7dfd886d937009c363aac09cd5fa1a", + "description": "etnaviv: update headers from rnndb", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84816c22e4cf782bf521a005cff6063932a11872", + "description": "etnaviv: ask kernel for max number of supported varyings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0103f02acb10dcdea23461ba214307a6827a7772", + "description": "gitlab-ci: Always name artifacts archive after the job producing it", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20c09c9c068b7dff6705cf385eac203fd12b806a", + "description": "anv: stop storing prog param data into shader blobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e03f9652801ad7f70091e084535a3fb6650c3acd", + "description": "anv: Bounds-check pushed UBOs when robustBufferAccess = true", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "faea84e2540810feb66ac88359b50cf69f2b3cc6", + "description": "anv: Add an align_down_u32 helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61ac8cf08381f7df05b477cfc6854b3b4b88f03f", + "description": "anv: Align UBO sizes to 32B", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4610d69e37fd9472b88fcc7f1bad6530242aa105", + "description": "anv: Delete some pointless break statements", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28c243e9ece55d0dda0cf065b2496c9f1ff05c79", + "description": "anv: Pass buffer addresses into emit_push_constant*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff5de35127d788584be56b047cb609effca5c80b", + "description": "anv: Mark max_push_range UNUSED and simplify the code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35ca2ad22e20ad3bc3301ee1e9157b8c351d959e", + "description": "anv: Parse VkPhysicalDeviceFeatures2 in CreateDevice", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "022e5c7e5a5a1ff40d7f5e8d3d768345e7746678" + }, + { + "sha": "0e4c001951a3c07d7ea4ddcd7edda69c20aa49ba", + "description": "docs/relnotes/20.0: fix vulkan version reported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ff443b8aa9650f907bd9b5524bab21febe42ec9", + "description": "docs/relnotes/19.3: fix vulkan version reported", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2557d614d36da58ceedfdbb021b8d1f566f7d0e9", + "description": "gen_release_notes: fix vulkan version reported", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4ef3f7e3d37ece7b4339870282cb52c5e334a68d" + }, + { + "sha": "de30a7ae6ea3d1fa90977229bc71afed595a4d5d", + "description": "pan/bi: Fix Android.mk", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b0be49005bf7d66d8f8fc8a9bb39dd5e29ab243", + "description": "pan/bi: Rename next-wait to simply 'wait'", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b329f8c750af96f9efb968045dcf03b0fad1b34e", + "description": "pan/bi: Add dummy scheduler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51e537c9fa4d10bc5b065a60095bf2d85080d3c5", + "description": "pan/bi: Implement load_const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ead0d3488bba096bd697048edf85470d1c5cf20", + "description": "pan/bi: Add preliminary LOAD_UNIFORM implementation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48910e83889a0736f61aca7c4b196d7c6420db9a", + "description": "pan/bi: Implement store_vary for vertex shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d86659ca57ebe9d1752e33ed6ffe1e1b70c5f50d", + "description": "pan/bi: Add helpers for creating temporaries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59b476e11adf1ad2ddfc597a8f742fb23fd1ab80", + "description": "pan/bi: Implement load_input for vertex shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dabb6c6b9fd473b10ae9d63b96e7ef248b1a7ed1", + "description": "pan/bi: Implement store_output for fragment shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79c1af062341266d7ad64a0ac221394d6cbfdfdc", + "description": "pan/bi: Add bi_schedule_barrier helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92a4f26e7f5249df3cb853b3a8cd9e726690d66c", + "description": "pan/bi: Add blend_location to IR for BI_BLEND", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07671826658dfc90ead2773c864a2ba3460a97e2", + "description": "pan/bi: Implement nir_intrsinic_load_interpolated_input", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "806533ba7ff9d52583d6340b9b2b3c1212d77d79", + "description": "pan/bi: Fix destination printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65c8dcca3b35a482c8378e10bb245a92e2e2bfdf", + "description": "pan/bi: Handle jumps (breaks, continues)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "987aea14000ce6524b12d72488dc1275d5e8a991", + "description": "pan/bi: Handle loops when ingesting CFG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a00cf3d1efe336e09906d87a8f5a50cbbe10fd6", + "description": "pan/bi: Add support for if-else blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "977a38c87f5816828fa42d1da02626d69ba1662f", + "description": "pan/bi: Call nir_lower_io_to_temporaries in cmdline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55dab92073f14a9b9c42175af9ddc210277bca5e", + "description": "pan/bi: Add instruction emit/remove helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7fd22c3bbd781ce497304c1270f367b1cd5fd14c", + "description": "pan/bi: Print branch target", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e9b5f8ef4b80e57c9653fcdc5e0867e9dd338a6", + "description": "pan/bi: Don't print types for unconditional branches", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c7ee8a9746b1ae7d852b1ae3e5408378547c156", + "description": "pan/bi: Improve block printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83c4562503cc96ee04d873ee5c814e43b9e61b56", + "description": "pan/bi: Walk through the NIR control flow graph", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d29184f6985b5e88c3a32526850acd7c8f3ab46", + "description": "pan/bi: Lower and optimize NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c652ff8caa8fc7608fc6b98b56324ffc230c118f", + "description": "anv: Flatten the logic add_aux_surface_if_supported (v3)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "615c65ba1ba6a79536cbced85c13dafbd8a33375", + "description": "anv: Refactor creation of aux surfaces (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1b7d80bc358749a4234587e1fda66596f4dd579", + "description": "anv: Add anv_image_plane_needs_shadow_surface() (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1da6b7f8a3b13f44e8d9cc101034319c0b732f08", + "description": "glsl: add subroutine support to nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b1bc24f826f4d08f22efd26f067621de84a100f0", + "description": "glsl: dont try to assign uniform storage for uniform blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "576b5ace9e2e90803d1c6b9f1b1728b1e5e8c4ed", + "description": "glsl: add support for builtins to the nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79127f8d5be7ab95bde0ab30a03eb21e00df70c2", + "description": "glsl: set ShaderStorageBlocksWriteAccess in the nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17f240b874724510d1c2bb57f292024bb8bf5ccf", + "description": "glsl: nir linker fix setting of ssbo top level array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ffd09f3114233f742e8cfd142c74ea3477c4c59", + "description": "glsl: find the base offset for block members from unnamed blocks", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76ce7752403912642cd00905f1d7a5f8bf21d219", + "description": "glsl: correctly set explicit offsets for struct members", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "590a59437fdbc5929934aa55385186154b0ee537", + "description": "glsl: add std140 and std430 layouts to nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "858a49a10d0e44a7e7f019137562c0d4cfbfab85", + "description": "nir: add glsl_get_std430_size() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a005f1a6e7b7f885a6168f6ea94d992d03fafe6d", + "description": "nir: add glsl_get_std430_base_alignment() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1ccfe821b2244d6880b2aac6641f312c7171dc49", + "description": "nir: add glsl_get_std140_size() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "120a26c6f25905474464661e351e36a4c7c76aea", + "description": "nir: add glsl_get_std140_base_alignment() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "262b611a5bb08ebb8d2876bc44a44952d610a248", + "description": "nir: add glsl_get_internal_ifc_packing() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a02d8e040fb6cbf43a75932104e2b49807723280", + "description": "glsl: correctly find block index when linking glsl with nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10b816d27e2e9f744a29beec294774c1d24f4f54", + "description": "glsl: add name support to nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa9b457062cfcdb29a15e0be73bbc1a75305f89e", + "description": "glsl: move get_next_index() earlier in nir link uniforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "219cefe24f757e3b8df4052ae76d132e8956bee6", + "description": "glsl: move add_parameter() earlier in nir link uniforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51898c8ee5edb21ac2d8cf9557dca5416ea9a304", + "description": "glsl: move nir link uniforms struct defs earlier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d5a0ae22cf9ad893ddb10fca48e85e5dbf9c80c", + "description": "lima: gpir: enforce instruction limit earlier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70349a2252a95f181de519be61ea84bd22381e4b", + "description": "intel/compiler: Calculate num_instructions in O(1) during register pressure calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5e4d016b94fc402c328d9a202504d811d2bb5ce", + "description": "intel/compiler: Move register pressure calculation into IR analysis object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6cdf66cd6e2515471c7944f67ddb87881c2366e", + "description": "entel/compiler: Simplify new_idom reduction in dominance tree calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9a608c0907ccdd745c8cb496e982bca68f8e6e4", + "description": "intel/compiler: Move dominance tree data structure into idom_tree object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c2a7eababf568ecd23377408e5f837e3bb2e9943", + "description": "intel/compiler: Move idom tree calculation and related logic into analysis object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2878817197fe94fe0c20efdf2947d63576e3ea8a", + "description": "intel/compiler: Drop invalidate_live_intervals()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acf24df2017598eb23c57599e39738e0ec059438", + "description": "intel/compiler/vec4: Switch liveness analysis to IR analysis framework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea44de6d8c93551be73d91045686b59a5aa42c25", + "description": "intel/compiler/fs: Switch liveness analysis to IR analysis framework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb8cfa6837fe7967cb9b02e32bd2d1aa37631c45", + "description": "intel/compiler/vec4: Add live interval validation pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24535604aa645651987e41a3bce8eee9e0b871bd", + "description": "intel/compiler/fs: Add live interval validation pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9cdc14f602144620c664f7f42ea2ba0eeb58720", + "description": "intel/compiler: Pass single backend_shader argument to the vec4_live_variables constructor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0433971f958be7d38cb96bfe226fbabdd7998e7", + "description": "intel/compiler: Pass single backend_shader argument to the fs_live_variables constructor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7e84cbb0f0530bb3e065bd522e5e1814373f589", + "description": "intel/compiler: Restructure live intervals computation code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48dfb30f9231c22a6af6885dbc9ef86dc2edde1e", + "description": "intel/compiler: Move all live interval analysis results into vec4_live_variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba73e606f63a4633fa9d8bef69f87b2d88851416", + "description": "intel/compiler: Move all live interval analysis results into fs_live_variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ceb496cdf5ef0ccc79e71c8fb856535501a9446", + "description": "intel/compiler: Mark virtual_grf_interferes and vars_interfere as const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab6d7929864b1c80a8de5b7cd58775f02fe1a7ff", + "description": "intel/compiler: Pass detailed dependency classes to invalidate_analysis()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65080dc8df00d006912ade2d69d4a06c3d4c5e0a", + "description": "intel/compiler: Define more detailed analysis dependency classes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d966a6b4c4684bc02647a8fdc69a6c88e5ed00c2", + "description": "intel/compiler: Introduce backend_shader method to propagate IR changes to analysis passes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03eb46f4a74c8df3de6785ffe18e968b876469b8", + "description": "intel/compiler: Introduce simple IR analysis pass framework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27ae3c1f684fe64e47f7a6cd374dc156f15847e0", + "description": "intel/compiler: Reverse inclusion dependency between brw_vec4_live_variables.h and brw_vec4.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6fc88e91bdf9e235aa8a0a0f69f219c051cb1af", + "description": "intel/compiler: Reverse inclusion dependency between brw_fs_live_variables.h and brw_fs.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "06c5c4964621268f2dedd63a614ff89f4307057b", + "description": "intel/compiler: Nest definition of live variables block_data structures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "310aef6b590d3d129b285ff8c50565af8cebacbc", + "description": "intel/compiler: Reverse inclusion dependency between brw_cfg.h and brw_shader.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d46fb2126d9fdd52386b001a140c1b70fec83f9e", + "description": "intel/compiler: Move base IR definitions into a separate header file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74e4cda64b9d114321216eefe536f80644b0f0fd", + "description": "etnaviv: add etna_constbuf_state object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f5802ad3e0cf303892f19d29803bba95eac9102", + "description": "st/va: add check for P010 and P016 encode/decode support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3758035760eadfcfcf4ce1e5dbf43ecd4d4ec29", + "description": "radeon: add support for 10-bit HEVC encoding to VCN 2.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ab31808fd5ba7a2205d755ae98c67055f7f9286", + "description": "radeonsi: add 10-bit HEVC encode support for VCN2.0 devices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ba272135ad43a025b10b24182bc5b3ad7bef024", + "description": "nir/linker: remove reference to just SPIR-V linking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7a70fbb2305604ce75b1a0dbcd03e2ebe71f92a", + "description": "bin/gen_release_notes.py: fix commit list command", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "86079447da1e00d49db0cbff9a102eb4e71e8702" + }, + { + "sha": "894e2863919420a6f3e3ac55d14bc46b222de447", + "description": "docs: fix typos in the release docs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "771f16cf6166a3911d374c3de6c19687605f1fef", + "description": "radeonsi: remove AMD_DEBUG=sisched option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "913d2dcd231ed9c744970875883c185090389ea7", + "description": "nir/lower_input_attachments: remove bogus assert in try_lower_input_texop()", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "84b08971fbdced78dbcb085d633298db7e4bfa7f" + }, + { + "sha": "6dc38cea52ce1e498a5d7a38cadcb9e7e15bde6c", + "description": "radv/rgp: report correct system ram size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eeb09a01e7699cbcf1836b0aba288e0e3c8656ac", + "description": "radv/rgp: report correct cu_mask info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3ece36257dce867e996e2ca1060d3fabc4ea2de", + "description": "ac: add ac_gpu_info::cu_mask to store bitmask of compute units", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6c661de3118873b9a1f933917a08213e2435601", + "description": "radv/sqtt: abort if SQTT is used on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "14283ddc798686c669017f15c3eb0c0272cc6888", + "description": "radv/sqtt: add support for GFX8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7470159353d47fe4cafa03ef0e6eaefafba41e4", + "description": "ac/registers: adjust some definitions for thread trace on GFX8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d55732a61b1a4f3ea6026ad2b9fe238ddc55a7d", + "description": "radv/sqtt: add radv_copy_thread_trace_info_regs() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9baad41469fa4d9fe3580ae60460dec489e6c358", + "description": "radv/sqtt: tidy up radv_emit_thread_trace_{start,stop}", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c91aa7955793c8a4c515d4d3878860defd97ed4", + "description": "radv/sqtt: fix wrong check in radv_is_thread_trace_complete()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ba29c050a3b370ff75d229317a6c2107d736c583", + "description": "radv/winsys: fix missing initializations of shader info in the null device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d07d598423e4015bbc7beb7a2fdc4c657d5e0cf", + "description": "iris: Don't skip fast depth clears if the color changed", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "382b902a6db87a2b0409b760c320555d1f4eadca", + "description": "swr: Fix non-pod-varargs error.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ff8265b64ff19380170b50b7016191c9d53fbd1e" + }, + { + "sha": "ed0bea4495aef3dd50fc0c9b8b05836b58a3cfc1", + "description": "glthread: fall back if a param size is non-zero and a pointer param is NULL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57a9c1ee478c5af8cc2f9ffe78b24917deebb1b3", + "description": "glthread: fix a crash with incorrect glShaderSource parameters", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5825b7b6e734c991c65246aff59c04ea8cde102", + "description": "glthread: add custom marshalling for glNamedBuffer(Sub)DataEXT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8aa5edfc5632e7c7a164566b61c21a6658025b3", + "description": "glthread: merge glBufferSubData and glNamedBufferSubData into 1 set of functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8eb03327497f3f0c0147ceea5c22213c4dfd1b13", + "description": "glthread: merge glBufferData and glNamedBufferData into 1 set of functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93b2ee18a1c00f8b60a60e34cee3743dca45bd47", + "description": "glthread: replace custom glBindBuffer marshalling with generated one", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85276e2c1b8dfdf090a656a7fa1b5613d373515e", + "description": "glthread: sync instead of disabling glthread for non-VBO pointers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28a2ad7ddf76702a5de56a7bc0d8754b7dbd66a0", + "description": "glthread: track for each VAO whether the user has set a user pointer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d510e652d46f471a93eae5a07f7e7508633d1040", + "description": "glthread: add marshal_call_after and remove custom glFlush and glEnable code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4970199d11907833858bbb2700ba313ae12f3a95", + "description": "glthread: don't insert an empty line after (void) cmd;", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9eef27920ca7b670225cdc529f200b30140dc39", + "description": "glthread: add support for glMemoryObjectParameteriv, glSemaphoreParameterui64v", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5c58bbf6ce49199eca076225a7985f3e149ffd3", + "description": "glthread: add support for glCallLists, glPatchParameterfv", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1668a9390321e31ee19292590874e9b16a498936", + "description": "glthread: add support for glClearNamedFramebuffer, glMaterial, glPointParameter", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0a20e753110afaf4336d64928f7a11dc3901a6f", + "description": "glthread: add support for glFog, glLight, glLightModel, glTexEnv, glTexGen", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59e96bc513be3938e2d6dc4357e4d38fa5481f6a", + "description": "glthread: add support for TexParameteri and SamplerParameteri functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "108fdb54c6c1b82ec3131b0c2e00d554b3729cfb", + "description": "glthread: replace custom ClearBuffer marshalling with generated one", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88b5fb18b35e68edf2b187251df9a290f386d91c", + "description": "glthread: check the size of all variable params and clean up the code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "358d923c8b40e71738cb3a3fb0413260361bec9b", + "description": "glthread: handle complex pointer parameters and support GL functions with strings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d00f36ac25b25402c4d81a0229a703a1b84fc40c", + "description": "glthread: add/update count and marshal fields for many GL functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb95a4693f05b8a64d61267409fcdce937dd3383", + "description": "glthread: add GL_DRAW_INDIRECT_BUFFER tracking and generator support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "30b6e8236400deac08aec6b0334f78b7eacb62d9", + "description": "glthread: don't increment variable_data if it's the last variable-size param", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19dc528bbf74cb823c87420ee3e5b8cb6495a199", + "description": "glthread: don't insert _mesa_post_marshal_hook into every function", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c920572f603b5e0ac062501593a4ed6b53bc8f40", + "description": "glthread: simplify repeated function sequences in marshal_generated.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9dbf5ec9f7844dda9d2473a3168e3f8b0009a66d", + "description": "glthread: use int instead of size_t where it's OK", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "313e98fb8111c21fc89d2422d50dc12daec4efc6", + "description": "glthread: reduce pointer dereferences in glthread_unmarshal_batch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19151e2605c95498f9dbc85fa85e10e851df374d", + "description": "glthread: inline _mesa_unmarshal_dispatch_cmd and convert the switch to a table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "245f9593b7967521bd6661d7059096c528cc7f0d", + "description": "glthread: don't prefix variable_data with const", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d93f4faefb0a867ea33b9530e9aa67ae1ed60e93", + "description": "glthread: don't generate the sync fallback if the call size is not variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a19c9290f44e6e73a104067a98420c273d98721b", + "description": "docs: update news, calendar, and link release notes for 20.0.1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b1f94e9f22e9c70da9e96bc490def5b8165c2a9", + "description": "docs: Add sha256sums for 20.0.1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c8766402ec6f43cd45219dc7cac310fc14101b6", + "description": "docs: add relnotes for 20.0.1", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1890b7ad85b74bd92c858919a304dce8f7aae10", + "description": "docs: update releasing to cover updated post_version.py", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5cdaa06221f73fcb9476c30f52e3210f1e3f65b7", + "description": "bin/post_version.py: Make the git commit as well.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3d3abb1bcc67891094f03c924fcc7110848eed2", + "description": "bin/post_version.py: Pretty print the html", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d7ada7d7e06359b01fcb6c25b295f58f10f50682", + "description": "bin/post_version.py: Update the release calendar as well", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4cb9ef8260ced0a5693db18dedbdd11cbcfa7e6", + "description": "docs: Update release notes with current process", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7451eb9a2773a2dcde90ddc93fb5ef361b6f1b1e", + "description": "docs/submittingpatches: Fix confusing typo + missing pronoun", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42a3d821cbd513db9abf31c4b61172dd7717f53c", + "description": "gitlab-ci: add a job that runs Fossilize on RADV/Polaris10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af1cd4585824e3efc941483d74915903439f27ba", + "description": "gitlab-ci: enable building the test image for VK unconditionally", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1cdb6edbe6dcfa3b0d254dc0f1f31e35be3b10ff", + "description": "gitlab-ci: add Fossilize support to detect compiler regressions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93fcc9ad57a7e0e64ae45988e62b24563ff9fdc3", + "description": "gitlab-ci: build Fossilize in the test image for VK", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b088a4b113f530ef8c1ad07df33b8fca8586c5d1", + "description": "aco: only reserve sgprs for vcc if it's used", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c6e0c062daa55269661b190deaec40e9749198bc", + "description": "aco: improve control flow handling in GFX6-9 NOP pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47b7f104a0aa3692e9fb202741406a0c6d9ac8ad", + "description": "aco: consider non-hazard writes in handle_raw_hazard_internal", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38743577f8b47c68ba01a9b9a982db52ef0f605d", + "description": "aco: improve get_wait_states()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f1b537304d4837c907a9299dab3a7acf2518b0b", + "description": "aco: add new NOP insertion pass for GFX6-9", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce196812579d48df8da05118a2a5098d048da83d", + "description": "iris: Enable HiZ and stencil CCS for blorp blit destinations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a0d5c7da18c9ace4a2153d1352ac44cb3bd65425", + "description": "iris: Enable CCS for copies from HiZ+CCS depth buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83b641a038704d7fe834d13dad39b0a9f841baca", + "description": "anv: Enable HiZ for VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6cec618e82aa233eee4e412a211a7bec0c40b090", + "description": "blorp: Write to depth/stencil images as depth/stencil when possible", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4531f0ffcec591e3853e78ce58f5d83cf276fb0d", + "description": "iris: Allow HiZ on blit sources", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f5f4269a66eebfcaa3ae5cd7bdf91d88a7fc69c", + "description": "isl: Set 3DSTATE_DEPTH_BUFFER::Depth correctly for 3D surfaces", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07f1ef5656e0721282d01a8421eaca056348137d", + "description": "docs: Update stable process around using fixes: and gitlab", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55dac91adc40db191c385f9a2ce393f46dd9b859", + "description": "turnip: fix tile->slot calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "036230341f4f2e7b11791708015342cf9385cf76", + "description": "turnip: improve binning pipe layout config", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f9432d56c055b9704a76cad44da88d5e12f825c", + "description": "Revert \"spirv: Use a simpler and more correct implementaiton of tanh()\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "da1c49171d0df185545cfbbd600e287f7c6160fa" + }, + { + "sha": "986e92f0ea803caf014adc40e900bc774af71da3", + "description": "Revert \"glsl: Use a simpler formula for tanh\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "9807f502eb7a023be619a14119388b2a43271b0e" + }, + { + "sha": "bc5724faf40df9aec6c8e2e52f4017db35d21330", + "description": "pan/bi: Add bi_print_shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c152d4c8352aca678386eaf75da83ae95e1bd7b5", + "description": "pan/bi: Add bi_print_block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c316d1553bc27e9f64a14fcce147de96bea430e0", + "description": "pan/bi: Add bi_print_clause", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "919cdf15b3a88cf745e3aed1a52ea45a44846b35", + "description": "pan/bi: Add bi_print_bundle for printing bi_bundle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bde54cb6d319fd9516507c1040d9e5fe8e7b81f2", + "description": "pan/bi: Add bi_instruction printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aef0f00cbc976a29e5b66da4b2abbd2bcd9c0d52", + "description": "pan/bi: Move bi_interp_mode_name to bi_print", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d16a8109c88c869ce17e6b680e2922bb983caa6", + "description": "pan/bi: Add BIR manipulation routines to bir.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f7a3ba872c90afc251035f24f7fc7faf6498fe3", + "description": "pan/bi: Move some print routines out of the disasm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ec671801a8decdd5c733f2fec53726d34666a0b", + "description": "pan/bi: Add IR iteration macros", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b26cb194cc433a9910247051024bd6468d9b05c", + "description": "pan/bi: Add quirks system", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07228a6895b4b57efaf55e7e6b180e308ceab879", + "description": "pan/bi: Add high-latency property for classes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "546c301ff6d12cad678b6feb1c83cf75eb36def1", + "description": "pan/bi: Add CSEL condition", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47451bb9f1c610dc62629d829c378034df83bf57", + "description": "pan/bi: Add bi_branch data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73c91f14c9f94c5b2ffbd1aaaf7d7c60cb7bc3c9", + "description": "pan/bi: Extract bifrost_branch structure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2afddc4433f49eb44654a63b1406181ee3dc25d8", + "description": "pan/bi: Add pred/successors to build CFG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3370bd5a50d8a490a8b57a92853ff203f07711c", + "description": "pan/bi: Add constants to bi_clause", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb3cd8aa56e76afa988429f0373642c53c1b4f92", + "description": "pan/bi: Add EXTRACT, MAKE_VEC synthetic ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8929fe0c84299cedd1ec86f49b795595ff3f90f8", + "description": "pan/bi: Add source type for conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5896db957876c4dc1cd7ecb4e6eef44690b10530", + "description": "pan/bi: Add swizzles", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c70a198f24cbf5127d48673d96ad8f8153dbe729", + "description": "pan/bi: Clarify special op scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fba1d12742db36536b6010807a59884abfb79973", + "description": "pan/bi: Add clause header fields to bi_clause", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "44ebc275fe83c007cb7c881cd5016dc1f6ec368b", + "description": "pan/bi: Add class-specific ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b5bdd894443507964cad63b40c0c598d115c7333", + "description": "pan/bi: Add constant field to bi_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a2c1265dd34a97cfb1abd11fa44d8cf93187c99e", + "description": "pan/bi: Add special indices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c42002d26f4ff59e188891e5ff68d8387d1959d3", + "description": "pan/bi: Add dest_type field to bifrost_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a35854c5eee542c47e8be3c6d85a19d8fad99acc", + "description": "pan/bi: Add bi_clause, bi_bundle abstractions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99f3c1f34c0526a9d0a5177d71d0c4a6042c3409", + "description": "pan/bi: Add PAN_SCHED_* flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9643b9dd5b683d5e18c085cd49bdfe49143b861b", + "description": "pan/bi: Add bi_load_vary structure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a7987aba10aaf05fbe678b3f3ccf5882b687aea", + "description": "pan/bi: Pull out bifrost_load_var", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa2f12de562e38b7b0e154c7d467aa1d85279a32", + "description": "pan/bi: Add bi_load structure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b93aec6df19d6daf3d6c28aad755af1cec52aab7", + "description": "pan/bi: Add bifrost_minmax_mode field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d69bf8db6217b7309ea7a7aec8139c8151b39f3c", + "description": "pan/bi: Add a bifrost_roundmode field", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bbf41ffb00d8d78db1cf43403ab7f6af5a2f9ec3", + "description": "pan/bi: Factor out enum bifrost_minmax_mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34165c7ec0fb3a0a07f3a1ede833b8bbf336e44a", + "description": "pan/bi: Add BI_GENERIC property", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29acd7bd8e50ac83aeeb68471f516ed6525aae99", + "description": "pan/bi: Add modifiers to bi_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ac62121e037f3d9fbd3612d936ff736835e0b1f", + "description": "pan/bi: Add class properties", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "230be61f201d07ac95e32a82e688a05eb4093fcc", + "description": "pan/bi: Add src/dest fields to bifrost_instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7dc2a7b9beeb3fe9af00033d972f89bf436bb68", + "description": "pan/bi: Add the control flow graph", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eceaea43e37e30e9bf7e5059d17cec445e59fbd3", + "description": "pan/bi: Stub out new compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d3a4e31138f1663b0c37b91d7263bba6025fa73", + "description": "pan/bi: Gut old compiler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb15525ab798aea74b02a7160c0fa4b9ec6212be", + "description": "panfrost: Add note about preloaded varyings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7618fe1b484eecc2246202df1e53ee607c6d70c4", + "description": "aco: fix image load/store with lod and 1D images", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4d49a7ac737f298b136ab6447e18c6e4654f8ad5" + }, + { + "sha": "cc9493f78ed3b366aaf1f4933c76984180a81a60", + "description": "gitlab-ci: Distribute jobs across more stages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "71436f9640647e21651ae24bb57db36968a24ba0", + "description": "gitlab-ci: Drop \"test-\" prefix from llvmpipe/softpipe job names", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "53a22c4b89c860316e3c07a9f95ad4871339049e", + "description": "vbo: merge draws even when begin==0 or end==0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab7209fb83ee29b59eb20db2b683eb46b5268a93", + "description": "vbo: merge more primitive types for glBegin/End (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d740e3d6ee226c20870711c8df663b3aa97c8486", + "description": "mesa: deduplicate draw indirect functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7700ac3d80ae70d00e3cca52b6ea3d00122c7893", + "description": "mesa: optimize get_index_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "450152f8d85f9f54498ea5116561f2aefe7378dc", + "description": "mesa: remove _mesa_index_buffer::index_size in favor of index_size_shift", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "df3891e74a72d275aceba91adc94a9e7dc9aa029", + "description": "Revert \"mesa: check for z=0 in _mesa_Vertex3dv()\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "f04d7439a0ad6e13ff2912ff824553b6bcf511a4" + }, + { + "sha": "9c9c314e419e4085250e892be55ab67a71625458", + "description": "vbo: fold code from vbo_exec_fixup_vertex to vbo_exec_wrap_upgrade_vertex", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8205042be60ccfe8d1661767e6f293c97d33e2cd", + "description": "vbo: clean up conditional blocks in ATTR_UNION", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c6323c49f1f394f54ea9b9d8e514c2706e3984d", + "description": "vbo: handle GS and tess primitive types when splitting Begin/End", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f97341a9d6d4377950e313e76f75230d80f6240d", + "description": "vbo: clean up vbo_copy_vertices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1be1ea0b8e2d5eed5202f669d11f5644fb4b5de2", + "description": "vbo: deduplicate copy_vertices functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd8eb634fd93e61e47599fb74513eb0ab0bb3726", + "description": "vbo: don't look at the second draw's count when merging 2 glBegin/End draws", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e92a4f817d1f7a5094066e2a47a246fd5ccf94d6", + "description": "mesa: replace some index_size multiplications and divisions with shifts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87085c673d593e6332ca2f3fb6737b77f7087f66", + "description": "mesa: add index_size_shift = log2(index_size) into _mesa_index_buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f38ffa4659aa985f5d1c78bdd5be5064b792b819", + "description": "android: r600/sfn: Add GDS instructions", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "32d3435a78675ff5ebf933d45b9b99fdc4dc7d82" + }, + { + "sha": "88c68c0ac7618c56ba17755ea6b93827437dd46a", + "description": "android: r600/sfn: fix includes and libmesa_nir dependency", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f718ac62688b555a933c7112f656944288d04edb" + }, + { + "sha": "01778d1e3cc675e6c7627e5b7ef7acd5ad290bfb", + "description": "android: aco: fix PIPE_FORMAT related building errors", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "8d07d66180b1523d7e70681ca142955f896ebda9" + }, + { + "sha": "b20693be416ddeea0bef050edb2acd13016d6532", + "description": "nir: Flush to zero with OOB low exponents in ldexp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec9da8990067332292709a221a168d56bb7a38a2", + "description": "zink. Added storage CISto descriptor pool. Added storage in descriptor pool for combined image samplers as well as uniform buffers. Stops some shaders from running through a pools storage faster than zinks internal tracking.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ac731b1ff96de46998948aa06081efa5140d50e", + "description": "gitlab-ci: Add jobs to be able to test Vulkan", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c65f8b377a6485e6c155dad98cb8af49d594418", + "description": "gitlab-ci: Add gfxreconstruct traces support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d75595da4a72c2ca0eb2b9b88db09a05a873e67", + "description": "gitlab-ci: Change devices format to ", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1b7b8c0ee6b796c90dab54e713ed6d08ef98356", + "description": "gitlab-ci: build VulkanTools into the Vulkan testing container", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "028ab482bfbd9aa703c9d6fa658020951e071b67", + "description": "gitlab-ci: build gfxreconstruct into the Vulkan testing container", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fc2338dc44cab6095a5bb9d5f9c42901f10a493f", + "description": "gitlab-ci: add missing popd to the build-deqp-vk.sh script", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c5e2ef19f6cbb96bb1a21e245e4e866458b0f8a", + "description": "tracie: correct typo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83f54e3c54ab43fc7d7946999dc9dda6a93cf67d", + "description": "etnaviv: fix alpha test on GC3000", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f95fa3d1ac0a8dfbaa95db441c2744f7ad942dec", + "description": "etnaviv: update headers from rnndb", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e5b01183a69d8492a66d0c172ac44b9d9ba5fd59", + "description": "egl/wayland: Don't invalidate buffers on no-op resize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "311c82e1923f63070b198881d90c1098f4ff7a08", + "description": "Revert \"glx: convert glx_config_create_list to one big calloc\"", + "nominated": true, + "nomination_type": 2, + "resolution": 1, + "master_sha": null, + "because_sha": "35fc7bdf0e6ad6547e39099e7060a3d89539b56d" + }, + { + "sha": "6ceda485605b627c5d2f8974a40e092ce00cc08e", + "description": "zink. Don't set incorrect sType in VkImportMemoryFdInfoKHR struct", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3199b8b9e7f0a63075ea082f51fae28daee2bd3a", + "description": "turnip: support indirect draw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a933934efbf343b6df3ea65ac70545bf200986ef", + "description": "android: gallium/auxiliary: fix \"Unused source files\" in tesselator", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "bd0188f9eab3370f023243bffe53431ec3019bb7" + }, + { + "sha": "aea8c9c7b1fb329a7f7d43182c054bf3b67232f3", + "description": "ci: Flip db410c back to docker mode.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f13996262a6d72ca5b5c235647d5257ae961b66", + "description": "intel/gen12+: Disable mid thread preemption.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42ee6ff706f864d7666603af282ab30544ee638a", + "description": "Revert \"gallium/swr: Fix min/max range index draw\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "5e9a2c603f38308f7027d6a5e4575e5fc24c1bd5" + }, + { + "sha": "291f40a4991a7fe6d6091051421290c585bb63a8", + "description": "vc4: fix vc4_yuv_blit overwriting fragment constant buffer slot 0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d1ba8638219c6c9b0aa370b12ca91c6e4b7844d", + "description": "aco: handle v_add_co_u32_e64 in parse_base_offset()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "215df21dea14358cccc1c9d84a186221cf834c7d", + "description": "aco: fix carry-out size for wave32 v_add_co_u32_e64", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e0bcefc3a0a15a8c7ec00cfa53fd8fffcc07342a" + }, + { + "sha": "18675363a35e98e6afa39c77b983ca6f383bfc00", + "description": "gallium/swr: fix corruptions in Unigine Heaven", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ab5c88a0ac0dacac085c17bef23c9c67eaf9e86", + "description": "st/va: GetConfigAttributes: check profile and entrypoint combination", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd40110420b48b3005c9d1d4ea30e2cbcc9a3d40", + "description": "intel/isl: Implement D16_UNORM workarounds.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9fea90ad5170dd64376d22a14ac88c392813c96c", + "description": "aco: keep track of which events are used in a barrier", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "93c8ebfa780ebd1495095e794731881aef29e7d3" + }, + { + "sha": "3f31c54842d4d2e1e78dad6cab57e45cb616b344", + "description": "st/va/postproc: reallocate interlaced destination buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d32248f49ebc25d76eb32d6f7a41bb0fd2c489a", + "description": "panfrost: fix transform feedback", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "585a21ceca03b47f1b00579e43dd105bfd116fd5", + "description": "gallium: add PIPE_CAP_PSIZ_CLAMPED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "babf7357d24b88a548f50aebca74c6ffd8f81d52", + "description": "gallium: add PIPE_CAP_VIEWPORT_TRANSFORM_LOWERED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ce339e74118786893b5138db37c09c4f2d830fd", + "description": "gallium: add PIPE_CAP_PACKED_STREAM_OUTPUT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82dc149254a791de1835e2402ed9a73511f42fdf", + "description": "glsl/linker: add xfb workaround for modified built-in variables", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a329bea44fca8607a3e4538b18fd93864d99c18", + "description": "glsl/linker: handle array/struct members for DisableXfbPacking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00746fa2dab0b55b113e3543420b79f01f91e5c1", + "description": "glsl/linker: add DisableTransformFeedbackPacking workaround", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8b361df9cf73aae45fff7f766da46d585d903c92", + "description": "spirv: fix memory_barrier_tcs_patch emission", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d839addf953630afa52200d5e9922646227d84d", + "description": "spirv: improve creation of memory_barrier", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d713fb66e71bec3d4c9a6324fd0b6b52040ecb2", + "description": "lima: don't disable tiling if there's linear modifier in list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "46a8cab58bc8fa897c66831b17614da4e9a36706", + "description": "ac: rename min_vgpr_alloc to min_wave64_vgpr_alloc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33faef6a34023217b605ca11f519e0c1cd74b51b", + "description": "ac: rename vgpr_alloc_granularity to wave64_vgpr_alloc_granularity", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9432eb3e9ce56e475d29bc59494c83815305aede", + "description": "ac: rename lds_size_per_cu to lds_size_per_workgroup", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69628ababbd79a5865c2c5bd01b4a963eb7800f9", + "description": "turnip: Execute main cs for secondary command buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5715a61fa96e8986da82ef263e06db4ef90fb106", + "description": "turnip: Promote tu_cs_get_size/is_empty to header", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bdf20d324bfec6a6cbabf7492cb4b19f7d9de5ad", + "description": "nvc0: enable EXT_texture_shadow_lod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11a06dfd4ba4351848422eba357a8b41dd3b78df", + "description": "st/mesa: allow TXB2/TXL2 to work with cube array shadow textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d3b0b908880029b06c0b54b8d32c53e7f4d5895", + "description": "nv50,nvc0: add newly added PIPE_CAP's to list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "62f7197fb54d2fbb7bd5646115008d3c27a3dfb9", + "description": "anv: multiply the scratch space by 4 on gen9-10 like iris and i965", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aa78801f0a6cfeaf3d16b4333239c0b862f73c10", + "description": "intel/device: bdw_gt1 actually has 6 eus per subslice", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e5ce30da7fa3f1cc3badfd348e5f8fda1bbacb2", + "description": "intel: fix the gen 12 compute shader scratch IDs", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1efe139cad150072985db02227be947aec532e2b", + "description": "intel: fix the gen 11 compute shader scratch IDs", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0c66869c1f9d454fc1c9adbd762a7a9b2756e86", + "description": "pan/bi: Move some definitions from disasm to bifrost.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "346262ceb6d5c6aab40b325e674a71de8860a062", + "description": "pan/bi: Structify FMA_FADD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fe5b59a96aad32ac1ee02a0fc10e296136e6168", + "description": "pan/bi: Squash LD_ATTR ops together", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee957bc0f3786562293a100cbcf5e07049c522be", + "description": "pan/bi: Combine LOAD_VARYING_ADDRESS instructions by type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36fe378f1cd0076723fc20b2acd08efd58a72d3c", + "description": "pan/bi: Decode ADD_SHIFT properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c79c710d4e1f3e424d5abf1f9abccdfc9a59caa", + "description": "pan/bi: Identify extended FMA opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b51468ed9c402c7bb982370b49dea895ed2cc677", + "description": "pan/bi: Add v4i8 mode to FMA_SHIFT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2db454bbabf07bffbc61e68d51dfb733c11e4976", + "description": "pan/bi: Decode FMA_SHIFT properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67bbaddf7d8eef7eb2b2ab3944a1a4a86d16bded", + "description": "pan/bi: Move notes on ADD ops to notes file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c96bd2dc5a0d3d5c7d620e0fe56f1931db52a3c", + "description": "pan/bi: Introduce CSEL4 class", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19a449e4258174cfba13b9bab70fbab1a700fdfd", + "description": "pan/bi: Move notes on FMA opcodes from disassembler", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dff83476c420f3f408d3d9dcf8c58e6ec89c0b1b", + "description": "pan/bi: Add ICMP.GL.NEQ op", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "178d9d42696876fe244543a2c897e4c702a51cbd", + "description": "pan/bi: Add discard ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3044a37a84086f4965fda46b718ce252c42b1187", + "description": "pan/decode: Skip analysis for Bifrost tiler structures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "acd140c2e20dfe9f18b7b442a7af79e0f221cb57", + "description": "pan/decode: Fix tiler weights printing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f5cd446b25e57344cdb0bbd28d3e36ecdcd11ef", + "description": "pan/decode: Restore bifrost sample_locations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5815f33c6b306be530b145418d31094ee8abe0c2", + "description": "pan/decode: Calm an assert to a pandecode error", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b4ddc6139b9534fb4559948ebcbaf96c76097d55", + "description": "iris: Wait for the GPU to be idle before invalidating the aux table.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7de6f1321a00316a59effeed93365f6979e6c69", + "description": "iris: Split aux map initialization from invalidation.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43dc842cb91c195fe7bb47a7ce324425096bf6f5", + "description": "anv: Wait for the GPU to be idle before invalidating the aux table.", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ca3050de57e60e86c1e3ccfa9d57689ffc5a820", + "description": "anv: Do end-of-pipe sync around MCS/CCS ops instead of CS stall", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2db471953ab57e47f9d950f474c1c8267cb0d456", + "description": "anv: Use a proper end-of-pipe sync instead of just CS stall", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac8d412ba39bf5634a218fff23e55c07d7ac81d6", + "description": "anv: Use the PIPE_CONTROL instead of bits for the CS stall W/A", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb2287ccdf46822dc5e7c889efce116620b9c509", + "description": "gallivm/tessellator: use private functions for min/max to avoid namespace issues", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "bd0188f9eab3370f023243bffe53431ec3019bb7" + }, + { + "sha": "c376865f5eeca535c4aa8e33bcf166052c1ce2f2", + "description": "egl: allow INVALID format for linux_dmabuf", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "646fbb1c4f39f61648c1a1153070df2452153450", + "description": "lima: add RGBA5551 and RGBA4444 formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ede93a32786ff8548ab4c6f48eb8af7a81872ee5", + "description": "ci: Add a disabled-by-default job for GLES3 testing on db410c.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58659446354d2afb7ba1b7b4b7a40b2ed5f44c09", + "description": "ci: Switch testing on db410c over to LAVA.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "adcb365c1dae5e97f6863d320a774753bf76780b", + "description": "r600/sfn: Don't try to catch exceptions, the driver doesn't throw any", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b66170b53785dea3bfc00901277273b7bde9b561", + "description": "r600/sfn: Use static_cast when type is already known", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7780b50b7e7c29fef4065c1b3ba7f63edeed3028", + "description": "r600/sfn: Avoid using dynamic_cast to identify type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3503cb4c28e01b34f3a25546c058150709c22348", + "description": "docs/features: add v3d driver", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "760fe44e8c09e533fb96d327e658aea4bfe3c883", + "description": "aco: pass vars by const &", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5469221e77bd6baf1dfe4a6c1c0149223bf2f890", + "description": "Revert \"gitlab-ci: disable panfrost runners\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "a86662c44d7cb2541c3f613805533064219ad11f" + }, + { + "sha": "2521c81c9e898b6785cb4fc069c329c42c2990c6", + "description": "aco: Minor optimization in spill_ctx constructor", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d555794f3032594dbef3623052103900138d2356", + "description": "radv: update entrypoints generation from ANV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79d4d2807ff60d571c377c7f4aa729601ab873e1", + "description": "radv/sqtt: add support for GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eea3912451f411c3d61d258cda65a27f274bac38", + "description": "ac/registers: add definitions for thread trace on GFX10", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fedbc4c929cb1528db646c2ec24b75f92de3e281", + "description": "radv/sqtt: update SPI_CONFIG_CNTL.EXP_PRIORITY_ORDER value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36768eee9acb4cdd08cd56b0e75c0bb870657bbb", + "description": "radv/sqtt: do not assume that the number of shader engines is 4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b565e56e94a363e49b2311bf57f38ccd548cc5d", + "description": "radv/rgp: adjust trace memory/shader clocks to fix frame duration", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbd61b3fb66bcc3eb0f65da2c869046c24c35dc8", + "description": "mesa/st: fix formats required for EXT_texture_norm16", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "de4eb9a3bb9fb073a5bf5cc157918bfa0f62b394" + }, + { + "sha": "e58bb417b57243d9bf0faa0995522dde5bf3fbfb", + "description": "lima: Add etc1 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37a670d76c245fab238f84dc31ecb281d62531e3", + "description": "doc: Update features.txt for r600 with misc supported features", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85457e350dde0589c96083c75594d089d339fba4", + "description": "intel/tools/dump_gpu: fix getparam values", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "76bf38eaf0b6c839eaa4a36990e4b14b8095b7eb" + }, + { + "sha": "1e43910aa2e018a1819bcfef6916d5ce5e1e7276", + "description": "meson: Enable -Wno-deprecated only for bison > 2.3.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "11a1cb2fa8dadca9d918e8421a3b26a1b176937c" + }, + { + "sha": "5306b662dd59ac8accccf25b81ee97742ce71849", + "description": "mesa: fix _mesa_draw_nonzero_divisor_bits to return nonzero divisors", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "fd6636ebc06d55b59851701c436b8b97f50fd7f4" + }, + { + "sha": "a86662c44d7cb2541c3f613805533064219ad11f", + "description": "gitlab-ci: disable panfrost runners", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "02f3af2ad1eb1732d0bfb781de5e781bf83b400d", + "description": "radv: fix size of sqtt_file_chunk_asic_info on 32-bit system", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33f604a33187cb001bfc389a43a98e8ef634bf86", + "description": "radv: fix 32-bit build failure in radv_queue_internal_submit()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad094433b4180ae5d9a33562a0982d399a0bb4bc", + "description": "glsl: add some error checks to the nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61dc9354c0c9b5dae3f6763c5141ff38e3e72816", + "description": "glsl: fix sampler index calculation in nir linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef47069cc30e0a3337fcb6ff0c740e1bc50879c5", + "description": "glsl: reset next_image_index count for each shader stage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0aa0a839f9c168784a1f50013c83877cc876094", + "description": "glsl: fix resizing of the uniform remap table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "190a1ed170231d6f1db0526a1867a6766ccd4823", + "description": "glsl: set the correct number of images in a shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b232a54df11cf6864e344b08f77c0dab5ddf0baf", + "description": "glsl: set the correct number of samplers in a shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7dafc3050da4ab240c8ecf4ddcfba2c6192113f5", + "description": "glsl: fix possible memory leak in nir uniform linker", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf12faef614ab7cd9996410f1d161558a3853936", + "description": "intel/compiler: Restrict cs_threads to 64", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "932045061b5850368e8a4a5b3e6609eba6ed8d66" + }, + { + "sha": "09323634898ab3efc0150dc7d756bf36b1b89b76", + "description": "st/va: remove unneeded code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8cb9f79413105a8eea98eaf70725cc1e28ae041c", + "description": "freedreno/ir3: add assert", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ac705edd82996b4176de6a991db38e76a11625a7", + "description": "freedreno/ir3: fix assert with getinfo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1f436746182027ce52c59c0efeea3d2a28dea5b", + "description": "freedreno/ir3: don't precolor unassigned inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b8e198fd2b5bfcefccd998a94a1f065c91119d3", + "description": "freedreno/ir3: fix crash with samgq workaround", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56565b7bba54b8298d2c14c66bb87c59930b09ee", + "description": "freedreno/ir3: update SFU delay", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2cf4b5f29edbd7a01590fdf244fead5551db8d3f", + "description": "freedreno/ir3: track half-precision live values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4353b3c1c5ae3927ad7e99b72cdf1ce63023493d", + "description": "freedreno/ir3: don't hide latency when there is none to hide", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d2aaa589cf1c4fc8599f26a033aeeabb595f134", + "description": "freedreno/ir3: rewrite regmask to better support a6xx+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c02cd8afbd6a2f2b1aaaec9d499e6ede55aebe8c", + "description": "freedreno/ir3: remove regmask_set_if_not()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2fa64729db95655d61b92c5e155a4746101c1cfc", + "description": "freedreno: honor FD_MESA_DEBUG=nogrow", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bab9db6c0263e74696f300520db6380ee0803322", + "description": "freedreno/a6xx: enable SKIP_IB2_ENABLE properly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9724a7c1055fb661794cdd65839812974c7e00e6", + "description": "freedreno/a6xx: don't emit YIELD packet", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45771786e4aff33292727e799c850dd9579965c5", + "description": "freedreno/a6xx: whitespace fix", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae3e237db05e365d87f6bd8bd44957922c49046c", + "description": "freedreno/a6xx: emit LRZ clear in sysmem too", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b605804eade2c3701745c3ef447246cfe975413", + "description": "freedreno/a6xx: remove unused param", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "141d0d1c25d031df17c7ec1931c2e78cfc04736a", + "description": "freedreno/ir3: remove from_tgsi", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7ac1bcea0198c86a1e208ece50c1af5da2cb339", + "description": "turnip: increase array sizes in tu_descriptor_map", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d195eef05d2eb063ab82d680950b40eb9127265e", + "description": "turnip: fall back to sysmem when attachments don't fit into gmem", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de3230e0a5d90911f8a4e376629e264278dffc00", + "description": "turnip: remove unnecessary fb size check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf302c9a22fab86da0bc70f377c8f0c43f5d8d77", + "description": "turnip: don't hardcode gmem base for input attachment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6420406f197cc4f1170c340e839701aeb253fdf0", + "description": "turnip: fix srgb MRT", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "59f29fc845ce6425959e8db9b707363b1e273445" + }, + { + "sha": "8f9e1c6047e2542186ac87569934e306301c2996", + "description": "turnip: fix hw binning + render_area offset interaction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de33c23370bf37b2a2fdf5be1daab6007d054c08", + "description": "turnip: minify image_view extent", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b18d6575fee00ed9db3c89b4e8c0756cc52d53e0", + "description": "turnip: remove unecessary MRT_CONTROL fill", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33b2db5fb93348758fac748abe74fdfcdbc811ea", + "description": "turnip: move some constant state to tu6_init_hw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d27a9ffb36909812c249558da2485da44afd89f", + "description": "turnip: check the right alignment requirement on shader iova", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f0662a5515d04f46dac332b69ec324bad16d812", + "description": "turnip: add r5g5b5a1_unorm/b5g5r5a1_unorm formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "80ceebcdd10450d5d465f4a0917196da006ba479", + "description": "turnip: rework format table to support r5g5b5a1_unorm/b5g5r5a1_unorm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89c6ef4233583469f30b266fcec8486f5d7d2b5a", + "description": "util/format: add missing BC4/BC5 vulkan formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "339f127f2b38438f64d6ff846c0a3e8c3dad83f3", + "description": "panfrost: LogicOp fixes and non 8-bit format support", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "068806c9f6be9bab84ed0d4868f16705cf4c7e6d" + }, + { + "sha": "574b03eebfba1a4b9de1a497857f0bbdca76ef19", + "description": "nir: Allow nir_format conversions to work on 32-bit values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf69b9635a7fca9b865fe673073f1baff83bf759", + "description": "r600: add missing include", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb5227173f0354aade28e46397a5bb69b2af69fb", + "description": "llvmpipe: add support for tessellation shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a3257ae7bec68d57e79928eb202cd522d708e720", + "description": "gallium/nir/tgsi: only scan fragment shader inputs for usage_mask", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dacf8f5f5c82c18e841050af37db54ca21c026ee", + "description": "draw: hook up final bits of tessellation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0d02a7b8ca794a594c2e9cc5e6d63dc591593105", + "description": "draw: add main tessellation code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76daf893ea0fdbbb53017d0395be7c23b80c256c", + "description": "draw: add JIT context/functions for tess stages.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3ecd49611792e558ecc093cfb0776c5104b979fa", + "description": "gallivm/nir: add tessellation i/o support.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "70a7603b6396fed615adc9ba06c1f0f09b5ac9ac", + "description": "gallivm/tgsi/swr: add mask vec to the tcs store", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87359d68a980c70e6f3a65ffd528496ee498e366", + "description": "gallivm/nir: align store_var param order with load_var", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7898e37fb4201b4b8c761a6d131d2bdd7b7ef119", + "description": "gallivm/nir: add support for tess system values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c632d806cb5702cae5bedfcb64294aca36978136", + "description": "gallivm/nir: split out 64-bit splitting code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd0188f9eab3370f023243bffe53431ec3019bb7", + "description": "gallium/auxiliary: add the microsoft tessellator and a pipe wrapper.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bf16ff317256c208e21362191bb93200925ea944", + "description": "radv: allow to capture SQTT traces with RADV_THREAD_TRACE=", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed0c852243719c6bac420de50a2f1061250e5b51", + "description": "radv: add initial SQTT files generation support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3ef07db96d181323524788b0ffe0919ec376567", + "description": "radv: emit thread trace markers after every draw/dispatch call", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "768d4f0551e38bd688e3e551d5276ceff5d2316b", + "description": "radv: add initial SQ Thread Trace support for GFX9", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "94099ee64296c60fdd5c3b237eedea0ff6651ea4", + "description": "radv: add a small helper that allows to submit internal CS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dbbf49c8f38e0873584424e3a63a2089bb71f37d", + "description": "ac/registers: add definitions for thread trace", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3de4f6c9f0322830a9a1138e64079228ad410061", + "description": "ac: add more fields to ac_gpu_info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c7c021ffcaa93eaac4f58bf23177b650354de96", + "description": "ci: Enable -Werror on meson-vulkan and meson-testing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9773631d3e79e2310ed0eb274b4dd9426205066", + "description": "aco: Fix signed-vs-unsigned warning.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "dba71de5c63617677fe44558f995d35fad643413" + }, + { + "sha": "2976ae2717a7a4fb1494f4adf92e1972528d24fd", + "description": "gallium/u_vbuf: silence a warning by using unreachable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad192385e3348b12d15f466225c2792adf42e7e7", + "description": "mesa: fix 11 warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d7b076166e6a26f68c33ea75b9f9473fc162738", + "description": "nir: fix 5 warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0e25746dde6c43cc3ff020f3db56517041915c99", + "description": "gallivm: fix 5 warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d18d07c9d786d484a7fc05d17a17f58209f625f9", + "description": "nir: replace GCC unroll with an option that works on GCC < 8.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a61a5b1d4693631a1b6fb7e83c877792dfbf33d", + "description": "mesa: fix incorrect prim.begin/end for glMultiDrawElements", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a1f402344314cb7e773cebb7ef5c77b3c941de13", + "description": "mesa: optimize glMultiDrawArrays, call Draw only once (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e636a062f1a555f8795b3e75d6f32c5042038ab1", + "description": "mesa: don't unroll glMultiDrawElements if one count is 0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c5cd113b82b798634f9a71ef98241d8917c05e1", + "description": "mesa: clean up glMultiDrawElements code, use alloca for small draw count (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b78ab9c741ac758e9d1eb237f7e8ddc3b70235f5", + "description": "mesa: move num_instances and base_instance out of _mesa_prim", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aaa758d3dd4112e6ce52b033cb6dcabfcebadb1f", + "description": "mesa: remove redundant _mesa_prim::is_indexed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c9850e55d11447d79bba1ad070f57ed52e85c96", + "description": "mesa/i965: remove _mesa_prim::indirect_offset", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f55ae2cdbe8eca6a28c9f1a2d8e1a637e9b61103", + "description": "gallium/u_threaded: convert dividing by index_size to a bit shift", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28d75fc286fd761cf1de98aa9c57dd4263d5321a", + "description": "gallium/u_threaded: fix uploading user indices with start != 0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9e4dc8d5e8f5e860f93eb3555a507402506b59a", + "description": "gallium: pass cso_velems_state into cso_context instead of pipe_vertex_element", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c90e39a5b854595e3bbbf30f01aaf7dc798158e", + "description": "gallium/cso_hash: inline struct cso_hash_data", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "505cd5f12b0ee7a5bac353bb53da6e78bc10513d", + "description": "gallium/cso_hash: pack cso_node better", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "950ee0a3700ece300129b15075fe67b56ff45ea6", + "description": "mesa: remove unused \"indirect\" parameter from Driver.Draw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9556805ac4d149d370de97e6a409e99250b94ebc", + "description": "i965: stop using \"indirect\" parameter from Driver.Draw (non-indirect)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dab7a4d82c1a490c93da3c19d34097732ae8a15c", + "description": "anv: Remove unused field `urb.total_size`", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bb25e4713f8d7e1eb58c071d98b081afed9f3dc", + "description": "pan/midgard: Use address analysis for globals, etc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5401cb8866cc74c5b3d0fde278fb1046779b415", + "description": "pan/midgard: Add address analysis framework", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "658541a7450a850c700ad5b8bf1b5425f32773fc", + "description": "pan/midgard: Force address alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "93ca47e046ca1cd1385e5941f3dea731ffe8e5af", + "description": "pan/midgard: Round up bytemasks when promoting uniforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd888d351f60c7dcfaff475c083c9be7bc1be626", + "description": "pan/midgard: Fix load/store argument sizing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee47ce6ac3c74b547c060751f82624205c24ec77", + "description": "pan/midgard: Add LDST_ADDRESS property", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1a2bb78840face41e4329c3180c041ffef64a4e4", + "description": "pan/midgard: Extract nir_ssa_index helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e60dc8f486554656d51d541e10911b7a82a5e80", + "description": "pan/midgard: Partially fix 64-bit swizzle alignment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c59f9f3793bf06f2415cc925ce048fe6ab593ad", + "description": "pan/midgard: Allow fusing inverted sources for inverted ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21c578027fb25421af750b98bc59ecd59bcfeff4", + "description": "pan/midgard: Allow inverted inverted ops", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "995e4371055b93aa7dda3caff252b86494ef5893", + "description": "panfrost: Increase SSBO/image limit from 4->8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1046d73af11c5a90a9d207e4554af8f1ffad62d5", + "description": "etnaviv: disable INT_FILTER for ASTC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "811990dc1c5ee1c02956041ae6f98a770b2c9f8b", + "description": "anv: Remove unused field xfb_used from anv_pipeline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "33f38605e9ac644de66000ec20170f7023f24243", + "description": "ci: Include db410c support in the ARM container.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20659f18947a48714fa8ce0f433304746541e110", + "description": "ci: Shrink the arm64 kernel build a bit.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ed6c1be6b39a519707d297bf1fed2c238dc15f4", + "description": "ci: Stop disabling ACPI in the LAVA arm64 kernel build.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "257415863b8431214f9eefa47df910053007c053", + "description": "ci: Remove LLVM from ARM test drivers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c0bbba85643dbfb170d45adda118b7dfab5c2b9", + "description": "ci: Split out radv build-testing on arm64", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ebfa899089b89c5765914dd9775dcc90bc391b7f", + "description": "gitlab-ci: Skip dEQP-GLES3.functional.shaders.derivate.*", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17d775ca5d43192a450ad7dec26ba083e6c43b72", + "description": "gitlab-ci: Remove GLES3 test from Panfrost fails list", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1fa987ae5ebc5e40fb7a321a85d03c6c0a610065", + "description": "gitlab-ci: Use PAN_MESA_DEBUG=gles3 for Panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5491a13be9b7497e3583d75014f1539858b3a874", + "description": "panfrost: Add PAN_MESA_DEBUG=gles3 option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5b6dfcb18f821cc5425cd6eea013e28148097c4", + "description": "panfrost: Expose PIPE_CAP_PRIMITIVE_RESTART", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2fea44c6361b171c9313a75a7e9ef4cbf97602f0", + "description": "panfrost: Simplify stack shift calculation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "40fd1f9da448b65bbd6491199e9f66cef7fdbe32", + "description": "panfrost: Reserve an extra page for spilling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f37cec3275a3bf9d23dfbc43720bb6831eab0242", + "description": "panfrost: Default to 256 threads for TLS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f6ca7ea551665de4a803f46ce980dfafbcbba1e6", + "description": "panfrost: Fix param getting", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a10cfab7629752059cb56dc1f7e1bef87c72489", + "description": "panfrost: Don't set shared->unk0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "febabb0502605a7fa742cafc85c3c4c414d8001e", + "description": "panfrost: Update spilling comment framebuffer->shared", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03822a27e64e20e97b0f3c043e8bd3f584a3f93e", + "description": "panfrost: Fix padded_vertex_count generation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23c859717216253ee06482051fc08cecb63f7543", + "description": "panfrost: Fix gl_VertexID/InstanceID", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "027944c7c8ccbff940484b1ed7cc5d75b9593640" + }, + { + "sha": "a0b90b45a9bcef058c8fc6ac88ba5e71a8cedade", + "description": "pan/midgard: Don't spill near a branch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed528556807378f376d8e8699a73bf5481c69a38", + "description": "pan/decode: Dump scratchpad size if present", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d385c5840f9f5683e0ca2dcb254b494562838a90", + "description": "panfrost: Implement index buffer cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12db69aa3f6155e9ccb1d783da589ab206dc7239", + "description": "panfrost: Combine get_index_buffer with bound computation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e272b110bb511b3735672a61c58c5b6b944bc37b", + "description": "radeon/jpeg: fix the jpeg dt_pitch with YUYV format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bc71e1bacccf5c001e0685f2bff0dc05602a164", + "description": "st/va: add support YUY2", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2e715e57a49c52a728ff0f9ca84111197a786ac", + "description": "st/va: enable 4:2:2 chroma format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "69aadc493310bb7306d10559bf48412eb5865962", + "description": "radeonsi: fix surf_pitch for subsampled surface", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c4197fbcdde55e93693e5687842605ff70ed3d15", + "description": "gallium/vl: add 4:2:2 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24f2b0a8560f34745854bf8263fa7c2d0f95f2bc", + "description": "gallium/video: remove pipe_video_buffer.chroma_format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87807298a307d4e38195dc04f66c26404e7cb791", + "description": "format: add format_to_chroma_format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb29f0847f87504f8162ebe7b9324244387ff501", + "description": "radeonsi: test subsampled format in testdma", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e5d2a73c5fc12841b62758a035b2bdb191b3f86", + "description": "ac/llvm: flush denorms for nir_op_fmed3 on GFX8 and older gens", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d6a07732c9c155c73f7d2cddc10faa7eab768df9" + }, + { + "sha": "30ac733680c3dfbfd1300c5498dd1b0c0a680905", + "description": "ac/llvm: fix 16-bit fmed3 on GFX8 and older gens", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d6a07732c9c155c73f7d2cddc10faa7eab768df9" + }, + { + "sha": "50b8c2527464dbe18a01ab6412de4465cebf2225", + "description": "ac/llvm: fix 64-bit fmed3", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d6a07732c9c155c73f7d2cddc10faa7eab768df9" + }, + { + "sha": "636656bcd7801c703ebcf9bd4c65197e4e6cbee8", + "description": "mesa: Flush vertices before changing the OpenGL state.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a54f8cd2cf31d0fc952748a998fa63763b3977e", + "description": "mesa: Check for OpenGL state change before flushing vertices.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b155b1086121ec1d6bcd3598a835c68617d9aca", + "description": "gallivm/nir: handle mod 0 better.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "44a6b0107b37ad9644d3435cf6d2d29b6779654f" + }, + { + "sha": "5370c685da4790834671e88bedbb0f15610e9bef", + "description": "gallivm/nir: fix integer divide SIGFPE", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "c717ac1247c3c7243af2f1b2732cccb882b75f11" + }, + { + "sha": "954cf8e86b6e0d52c04098604d2daa4305bf6f70", + "description": "gallivm/tgsi: fix stream id regression", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "163d5fde06696fed2e69e000a7621087c1636749" + }, + { + "sha": "4449611ffbb0087a6d2407fb0d25496806df157b", + "description": "mesa: call FLUSH_VERTICES before updating CoordReplace", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aae09ffb6eee6c41c73962c08f315a545c5e7dfe", + "description": "mesa: remove leftovers from ARB_shadow_ambient", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d98806117226b64001a1e99387431419e174ad4b", + "description": "cube_face_index: Use fabsf instead of fabs since we know it's floats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6db7467b59932fd11e828d7a99de0f23f49aecb9", + "description": "cube_face_coord: Use fabsf instead of fabs since we know it's floats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a70a605ad63d95a6e7ce7cfd61fc1ca4e9616e74", + "description": "iris: Apply the flushes when switching pipelines.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "b8fbb39ab2c962e38f6c9d668de57582faf39b70" + }, + { + "sha": "f6d1dd34d76c1930b6f5223ae7e1c6b7f52ec4cd", + "description": "gallium/hash_table: remove some function wrappers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "502840855acac744fbc8dd090d931adc07755ead", + "description": "gallium/hash_table: turn it into a wrapper around util/hash_table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10d235a84319ed4137fe0c6d22d8bb9a52d3174a", + "description": "gallium/hash_table: use the same callback signatures as util/hash_table", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76dff2fabe065b71f0d336cb43853335dd3eb82a", + "description": "gallium/hash_table: consolidate hash tables with FD keys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a01a875081bd52bc1c3c142a60af678171ce6c33", + "description": "gallium/hash_table: consolidate hash tables with pointer keys", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56f31328f207f310ee9b53f3294a23b25b2687e0", + "description": "amd/addrlib: fix build on non-x86 platforms", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c798aae7390f20e74b8ebb09113e806b410ac7a7", + "description": "tgsi_to_nir: set num_images and num_samplers with holes correctly", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "349898a967f1bf28271454d88b3f1a36a4db2e06", + "description": "nir: Drop nir_tex_instr::texture_array_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ec2f905ca8fda0deaba4fa099a9e47028c11d7e9", + "description": "freedreno/computerator: Fix defined-but-not-used warnings from lex/yacc.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bd53f4f56b2ca93c1fe4f5af29b5040d2b32e88f", + "description": "turnip: Fix compiler warning about casting a nondispatchable handle.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "1c5d84fcae71b40f77891386ac53c8addee4f554" + }, + { + "sha": "ebd071d8cf034f898ce30df8277130d1625c902e", + "description": "gitlab-ci: Move to 5.5 kernel plus fixes for Panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae5e6406df49add1d70088671491f96e442c2d85", + "description": "panfrost: Remove some more prints to stdout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fcd8308b289a0749f2f889e4e04cc4974d7f6af5", + "description": "gitlab-ci: Run GLES3 tests in dEQP on Panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de4eb9a3bb9fb073a5bf5cc157918bfa0f62b394", + "description": "mesa/st: toggle EXT_texture_norm16 based on format support", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7f467d4f73828ba782a2db38c74d33b85646dc85" + }, + { + "sha": "200a83a98394ce292fd1cdbd6e9166502379b5c9", + "description": "i965: toggle on EXT_texture_norm16", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7f467d4f73828ba782a2db38c74d33b85646dc85" + }, + { + "sha": "dc531869a918dc75ffc09b38851b750ba62673f8", + "description": "mesa: introduce boolean toggle for EXT_texture_norm16", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7f467d4f73828ba782a2db38c74d33b85646dc85" + }, + { + "sha": "784c454607be3e8dc395de429d9b99521d5ef8a8", + "description": "nir/lower_double_ops: add note for lowering mod", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d2e4435c205810b71990c27befde2f99ed45b9df", + "description": "radv: fix creating null devices if KHR_display is enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef0abe5404ca3bb983e4afcbf2306291a15579ca", + "description": "gitlab-ci: Add add a set of lima flakes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c03d203965c2e88fb7337cf0af6e691517838a9", + "description": "radv: make use of ac_gpu_info::max_wave64_per_simd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9204ad70f2a49c783a38d926f01a54059bb3461f", + "description": "radv/gfx10: adjust the number of VGPRs used to compute waves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "568f1504098f6204bcc842d66be0126764cb7d13", + "description": "radv/gfx10: adjust the LDS size used to compute waves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ea91b15a31369adafb8c7ecb1e345d24ee8d1948", + "description": "radv/gfx10: adjust SGPRs/VGPRs related info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6df3ef6ecb3ba142b8b78beab7a7757194befad", + "description": "radv/gfx10: adjust the number of simd per compute unit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09d8726187f3f64ee41ae10345b77d4edf4130c9", + "description": "ac: add more ac_gpu_info related shader fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "974c87e449633e6a9bc761cd83ebf07663f4ed3b", + "description": "ac,radeonsi: add ac_gpu_info::lds_size_per_cu", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd6ec2b1abbd96f4456e92613c2232c919bb9023", + "description": "radv: implement a dummy winsys for creating devices without AMDGPU", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f280c00ba6cd3b0d3a01ae2fc3085de89ec867ec", + "description": "egl: Factor out dri2_add_pbuffer_configs_for_visuals {device,surfaceless}.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d32c458de76c9e0cc08c9ee1a7de23c3fca69298", + "description": "egl: Fix A2RGB10 platform_{device,surfaceless} PBuffer configs.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "9acb94b6236f8a76a5558cf1cb60bac976067851" + }, + { + "sha": "87924646db280c8f5f4b227fc610e9e557f15dda", + "description": "turnip: enable fullDrawIndexUint32/independentBlend/dualSrcBlend/logicOp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "708c3a5ffd1a69dc14dbc0e022a4c3848f9138fd", + "description": "turnip: enable sampleRateShading feature", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cb166aea24aff734f520f78c896255ec432f7f4e", + "description": "intel/tools: Do not print type/qualifiers/name for c_literal", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5feea408893d646feb0ba873b7a4155a2fb53fe7", + "description": "intel/tools: Allow i965_disasm to disassemble c_literal input type", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f83daedb19d25744b2b89939713c9ca9e7ad29f", + "description": "intel/tools: Print c_literals 4 byte wide", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0b0e958f4f096863fc29d8acd000caa0f0ff5bc2", + "description": "intel/tools: Add test for state register as source", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31c29f4f55924e59b410b3f17c4b1105451bbc0c", + "description": "intel/tools: Add test for address register as source", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9526e5c359b4cfa724dd8bba52586435e29b75bf", + "description": "intel/tools: Set correct address register file and number in i965_asm", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87d9e78f268a462b96dd489bf8d8e356801306f8", + "description": "intel/tools: Handle STATE_REG in typed source operand", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a75e603652ef8670c1b5433dc435b11ce9309a6", + "description": "intel/tools: Handle illegal instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "11a1cb2fa8dadca9d918e8421a3b26a1b176937c", + "description": "meson: Disable bison's -Wdeprecated since we still support old bison.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5dfd83d7a1ce52a42485c54ca170311449379eb9", + "description": "anv: Always enable the data cache", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4e7a11bc3e33baa311595602719bb449ce51d31", + "description": "intel/aub_dump: stub the waits when overriding the device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31461e2379321cb0d4eeb28fb74fd78d2fd0bff6", + "description": "intel/tools/aub_dump: fix crash when using the default legacy context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76bf38eaf0b6c839eaa4a36990e4b14b8095b7eb", + "description": "intel/tools/aub_dump: move aub file initialization to maybe_init()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3569215d493da6dbb8359f1df1259563b2c61db1", + "description": "lima: expose fragment shader derivatives capability", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "01496e3d1ea0370af03e6645dbd2b864c2ace94c", + "description": "v3d: Sync on last CS when non-compute stage uses resource written by CS", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5de8bc7c758cc86c7008708c2b7f19f40288459e", + "description": "gitlab-ci: Enable the lima job again", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31a8075678f6517278985fe8bbaaec5100d7d826", + "description": "gitlab-ci: lima: Add flaky tests to the skips list", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ab94df0f6a9b2fdf8c053a68486d8be4d254d01", + "description": "nir: fix gl_nir_lower_images for bindless images", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "7342b859afb5a7e7f9fb1813e7ab3a55a1c8a704" + }, + { + "sha": "26d42645f9f7ae260031685ad3e0664e8b94b32b", + "description": "freedreno/computerator: fix build dependency", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84395190ec8cae6158737777c8def7cc3304eb3f", + "description": "glx/drisw: fix shm put image fallback", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "02c3dad0f3b4d26e0faa5cc51d06bc50d693dcdc" + }, + { + "sha": "246e4aeaef4c1f1071c64e9681fc9229aac22020", + "description": "glx/drisw: return false if shmid == -1", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "02c3dad0f3b4d26e0faa5cc51d06bc50d693dcdc" + }, + { + "sha": "8d0bab8a9352bbb780bae6e7a432e73f7204f66a", + "description": "glx/drisw: add getImageShm2 path", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "02c3dad0f3b4d26e0faa5cc51d06bc50d693dcdc" + }, + { + "sha": "466a0b2e4953018646ee344f5f6f6e9e84b66a1a", + "description": "dri: add another get shm variant.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "02c3dad0f3b4d26e0faa5cc51d06bc50d693dcdc" + }, + { + "sha": "a91067d3f5c8357548ca2756023b81daf013e47c", + "description": "ci: Blacklist another freedreno flaky test.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6fbe3f40a912296922e0d8bc0ea525b5381341db", + "description": "intel/isl: Add isl_aux_info.c to Makefile.sources", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "58d4749e5683857d57b8b432559c9d837544ad88" + }, + { + "sha": "9ab0e92cff8bb8336cebd9dc68d02d7451ad78e1", + "description": "intel/blorp: Implement GEN:BUG:1605967699.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36515e295c390edd713d92ce3aef35730a7d12e1", + "description": "gallium/util: remove unused debug_print_foo helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dfea933a2a09b169ff3753de764aa4662de7e4e7", + "description": "gallium/util: do not use debug_print_format", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f0b984cb8fd424facc7b245e4649b38f4815e3c", + "description": "util: move debug_memory_{begin,end} to os_memory_debug.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31a781578522d44e6832893270a7902b001a827d", + "description": "hud: add GALLIUM_HUD_SCALE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ee76b90d5a0b3832b9498ab4809de19a03a63fa", + "description": "turnip: move tile_load_ib/sysmem_clear_ib into draw_cs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a410e64b6812b23e1db8b969516ae7f4ebb8b811", + "description": "turnip: make cond_exec helper easier to use", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ede9749d2e96c788a1ef0bb642d1fbe87304386", + "description": "turnip: remove marker seqno", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf94124e1c28c0c5953454ad1dbe71b9de8adc55", + "description": "turnip: automatically reserve cmdstream space in emit_pkt4/emit_pkt7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b2a7dcd93f285ea88a110afa75cbac5d147e5cb", + "description": "turnip: add tu_device pointer to tu_cs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a9a990a60b4669bd0af920a060d87a8e3551058e", + "description": "turnip: fix COND_EXEC reserved size in tu_query", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2275343ba346b04ebf8bafbaedfcd707702f9c06", + "description": "freedreno/computerator: add computerator", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "568e948d1fda10d825cf99f3cb05609c4ff248bd", + "description": "freedreno/ir3: allow block->predecessors to be null", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f87d412f08265ca6e5c47a5591c0ebbdefdbaf4c", + "description": "freedreno/computerator: rename prefix asm->ir3", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ee68d796e433c2efb3313baf119a8e0ddcac759", + "description": "freedreno/computerator: polish out some of the rust", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3bb340cf4f21e12407720b7f46d2c4486a1ff2a5", + "description": "freedreno/computerator: import parser/lexer from fdre-a3xx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6499738d3dee2c2420f8d2207442f57c432d9510", + "description": "lima: remove its hash table entry when invalidating a resource", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "c64994433c0da03d1dabf7cf561f1f1474d6554f" + }, + { + "sha": "956e4b2d371736e073542cf8556f0c972c197989", + "description": "nir, intel: Move use_scoped_memory_barrier to nir_options", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6be766336a0ebb556f44765e3d6f6af0e67070a1", + "description": "nir/tests: Use nir_scoped_memory_barrier() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ff898a6530e19815e2877f5a5da54f7e6746cfa", + "description": "nir: Add the alias NIR_MEMORY_ACQ_REL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "424737da3e25d9fc3dd33a4b8bdb9ad914e7a5f8", + "description": "nir/builder: Add nir_scoped_memory_barrier()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e4baff90812d799d586296fcad992ddcc553c359", + "description": "freedreno: Switch to using lowered image intrinsics.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e16434acdd549f2721efc4ec80dc11fca1321f6", + "description": "nir: Move intel's intrinsic_image_coordinate_components() to core nir.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7038403204b7f2017cdd390a911f2393a8f6513", + "description": "freedreno/ir3: Fix the arg to ir3_get_num_components_for_image_format()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8aa54e6ed0ba8db984541271404add6e496ed491", + "description": "prog_to_nir: Reuse glsl_get_sampler_dim_coordinate_components().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8644349d1d5e05c74cd763f98ee1d8d2c10903d", + "description": "tgsi_to_nir: Reuse glsl_get_sampler_dim_coordinate_components().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b7de2d6b857cd4f3dd45bbcdf135f70548b1935", + "description": "freedreno/ir3: Reuse glsl_get_sampler_dim_coordinate_components() in tex_info.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d37c6ebd3ce8e20adb57001629f21ac09bee0679", + "description": "spirv_to_nir: Reuse glsl_sampler_dim_coordinate_components().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5072719e66b0f97a572f36e86bd5396ed2ebc915", + "description": "glsl: Factor out the sampler dim coordinate components switch statement.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12cf484d025e3ed980dbbd8d65f2f9b95c6388db", + "description": "v3d: Ask the state tracker to lower image accesses off of derefs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9c90ecf37ffab0978a983e49ecec48faebeb181a", + "description": "gallium: Add a cap for enabling lowering of image load/store intrinsics.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7342b859afb5a7e7f9fb1813e7ab3a55a1c8a704", + "description": "nir: Make image lowering optionally handle the !bindless case as well.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cad2d6583c9475bde584e5062a8de73fe7cbf353", + "description": "nir: Rename gl_nir_lower_bindless_images.c in preparation for extending it.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b62379ac6f699933da52d032e2b3c06ab73f9549", + "description": "i965: Use isl_aux_state_transition_write()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9856fbf3b7ddbe0b77bf984fe7ec4a64ad858bf", + "description": "i965: Use ISL's access preparation functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b00e7a6485799761aa0910b7851982a180602c03", + "description": "iris: Use isl_aux_state_transition_write()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af047794106b2d07e7c7eaa5b35d9790a13fb390", + "description": "iris: Use ISL's access preparation functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fec957900d6612493a6e03d0e5958dbe8c362733", + "description": "iris: Use isl_aux_usage_has_fast_clear()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58d4749e5683857d57b8b432559c9d837544ad88", + "description": "isl: Add a module which manages aux resolves", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "daa4020948867cc2c9b38d7536a1b73bf79d2745", + "description": "freedreno/ir3: Lower output precision", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6c750d9c4d51b07076115eb1d0e1be4a2d568095", + "description": "nir/types: Add glsl_float16_type() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c822460f851ae6f3c74a01b9eec9ea924a0de12d", + "description": "freedreno/ir3: handle half registers for arrays during register allocation.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9e8466a866cbfb4c6745d85e9371b43827d16c8d", + "description": "nir: Add optimization for doing removing f16/f32 conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6ee375f68dabc0dd9d6d9f919b797231aad19eab", + "description": "freedreno/ir3: Add new ir3 pass to fold out fp16 conversions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "125f867d3dc32c7269c17d3426e35a0dcd5aadc4", + "description": "nir/opcodes: Add nir_op_f2fmp", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18124d727865f1c53b0dac644560bce177b7d233", + "description": "glapi/copyimage: Implement CopyImageSubDataNV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ae7bda27a0691d6d89c35c9f732b6e49d726c17f", + "description": "iris: Fix import sync-file into syncobj", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f459c56be6bf33439cccc11e932b2b5b52ba7ad8" + }, + { + "sha": "3a310fbd0b2bb5730fda57643a3e05870e70d248", + "description": "pan/midgard: Implement load/store_shared", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fcbb3d422e40ab0759c550fb044605364c518e51", + "description": "pan/midgard: Implement nir_intrinsic_get_buffer_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3148937ef7c7be7a0685de333de6a5ed31ce3857", + "description": "pan/midgard: Lower SSBOs in NIR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99f2b6144b5c3c4f48096ed0de4b15d1d13afd96", + "description": "turnip/pipeline: Don't assume tu_shader is a valid object", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "12a22da6834f21cb089bf9ecd44c483bd264ccd3", + "description": "radv: add the trace BO to the BO list at submit time", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e9a2c603f38308f7027d6a5e4575e5fc24c1bd5", + "description": "gallium/swr: Fix min/max range index draw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d57a2750485e51b34e0bc413100e4e2787a4e84", + "description": "iris: Set MOCS for constant packets on Gen12+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4bac2fa3c6d30537e444c555f182abd9c739cfd4", + "description": "iris: Fix BLORP vertex buffers to respect ISL MOCS settings", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a4da6008b6a0d8876eaf5a67c95d88038bbf35e6" + }, + { + "sha": "1cdf5abdfaeba5a89574d7cc374e5667be2e2f93", + "description": "iris: Make mocs an inline helper in iris_resource.h", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a4da6008b6a0d8876eaf5a67c95d88038bbf35e6" + }, + { + "sha": "f8ab00776cc0b2a009403a8611c00341d879f9ab", + "description": "ci: Remove a useless filtering of the lava logs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f3f9b2b1920da3badf503a0682de7beb9e86464", + "description": "ci: Don't bother generating deqp junit results since we don't present it.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c372d384a702e2be6887e4b2b0b6e04ab27e052", + "description": "ci: Document how LAVA runners work.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "994e258122663fd4d869b583cb9f4e0e2f36a0aa", + "description": "ci: Make LAVA job fails emit the full list of unexpected test results.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54dbb55ea8203df354c406e67784ef6ca41ca89e", + "description": "ci: Make sure that we have a proper shell prompt for LAVA.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "985343e71aeed5f68697b440e9bcc2dc6a51bc2f", + "description": "ci: prepare-artifacts: Make the indent here match previously in the file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "89a3856714e2410e9ae3e0ee2cafe2fdd86e8b81", + "description": "anv: Add pipe_state_for_stage() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7df5d36078a8e0eeffa935a5d1a267cb431ca4db", + "description": "anv: Use intel_debug_flag_for_shader_stage()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f58b384fbef0cbfd8349c5baa28f2973d079cd7f", + "description": "spirv: Be consistent when checking for Shader/Kernel", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5f3cbbd958d14924dded0e0a0908127f6bfa006d", + "description": "spirv: Remove outdated SPIR-V decoration warnings", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "cd4a14be063957964ae54e0e828174b9e40961e0" + }, + { + "sha": "1598370aca6459ba54915a26683a75bb66f88161", + "description": "nir/builder: Return an integer from nir_get_texture_size", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "f718ac62688b555a933c7112f656944288d04edb" + }, + { + "sha": "265e234e234f75cd5c209f76900009f81e2d6aec", + "description": "nir: Fix the nir_builder include path for nir_builtin_builder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5a8958910f53d924d062cbf024cebe4134f757a", + "description": "util: Change os_same_file_description return type from bool to int", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "228cbdfe67e465dc79558fc76a51d8115251b5e5", + "description": "winsys/amdgpu: Make local variable r signed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "87365e263ec46f4cc3c46d49a09a9c3b27550af7", + "description": "nir/lower_ssbo: handle atomics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7ab4e4dd963028a4620ffc00c38988da4abc1860", + "description": "nir: Add SSBO->global lowering pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b929565ea8d4846b1943f35b6e86d685a616ed54", + "description": "panfrost: Rewrite texture descriptor creation logic", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad44f587a8e3adbfa76aaea88f59e1f0724805b7", + "description": "panfrost: Move format translation to root", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58f14018b4f0e47d72f718f322f8aa0a5b8d0f0a", + "description": "panfrost: Move pan_afbc.c to root", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5ddf7ad9d2098d21e1346b8ceb2756901ae1b0c1", + "description": "panfrost: Move checksum routines to root panfrost", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e3318b151abddd456077ec0eed13f95245ce344", + "description": "util: promote u_debug_memory.c to src/util", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "88c4680b5a50ea8840c38aa0a80acde63ef1677b" + }, + { + "sha": "8021daeb1fb58415af5d0a779368dc6617af947e", + "description": "lima: implement PLB PP stream cache", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7edde3d26bb4119be44a2bdda2c5ca0d2f8adfe9", + "description": "docs: Update index, relnotes, and release-calendar for 20.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0ada39f37add490a2a628236731f7dada421af52", + "description": "Docs: Add 20.0.0 release notes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "740cb3d1939efc3c4d9e23b212074744123e9ad6", + "description": "radv: use RADEON_FLAG_ZERO_VRAM when creating the trace BO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37650bf93803822d6e3aefae7c2f4c7eef5d6171", + "description": "radv/winsys: add a new flag that requests zerovram allocations", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7a73446c513e2218a08ae9425a1bea49c63080a7", + "description": "gallivm: fix crash in emit_get_buffer_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b610aab583211210f189b46904b66c483f8e38b", + "description": "gallivm: fix crash with bptc border color sampling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8291d728dc997e87b4d2e4e451692643a1dba881", + "description": "aco: improve GFX9 1D ddx/ddy assertion", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cc3d29c6e7ccca1ac738cfeafbe4685fbec533f7", + "description": "pan/midgard: Identify clamp(x, -1.0, 1.0) flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0263d2793ce0d061268e83afb9c225a06a4e6f25", + "description": "panfrost: Remove flush_frontbuffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "068806c9f6be9bab84ed0d4868f16705cf4c7e6d", + "description": "panfrost: LogicOp support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bfd363be4c957c1f7b5c1f3069346f2bce2cd5a", + "description": "i965: Do not generate D16 B5G6R5_UNORM configs on gen < 8", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "803ab5d6be6bc63e3eae827d7297e0cd98cc61dd", + "description": "gitlab-ci: Automated testing with OpenGL traces", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50f1950ac0b52d291ac70bc1ce871a03ed88ba4a", + "description": "gitlab-ci: Disable the lima job for now", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7bfb10c69dfe48a91e35523cb5ee641bdbf6988", + "description": "util: remove the dependency on kcmp.h", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f76cbc7901f7d500f5a4f74aedfd29970d1efd00" + }, + { + "sha": "273b8cd1ca286e2f43b4a464a391fdcaac49f077", + "description": "intel/fs: Correctly handle multiply of fsign with a source modifier", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "06d2c116415c0ab163a57ed7f2522342ed43e4d4" + }, + { + "sha": "c81aa15d646215eac38c8e0b6dc1a10b35bc13c3", + "description": "gallium/auxiliary/vl: fix bob compute shaders for deint yuv", + "nominated": false, + "nomination_type": null, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68d1757420be28e99e4e919ed2e0c6062e2460c5", + "description": "radeonsi: Fix compute copies for subsampled formats.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e5167a9276de1f383888714b41d3a9be2b9c1da9" + }, + { + "sha": "d795eb207ff90e4885a278910fdc87e932242da6", + "description": "turnip: add option to force use of hw binning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "97a590af21ec0be1f3faae89a5fe59b2fa6c2d39", + "description": "docs: Mark 20.0.0-rc3 as done", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "772d60385cc17025541f99b3dbd566b942676ab1", + "description": "docs: Mark 19.3.4 as done", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "288e9fd295b14c0ad3c0bbe51dc294a00f3c9056", + "description": "docs: Add SHA256 sum for 19.3.4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3238f4c3abf405c902ec43cb9ff6e81dcc20dbad", + "description": "docs: Add release notes for 19.3.4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8fe9e045f0ea24fe869c980fe4b56f4fb9437ab", + "description": "anv: Drop anv_image.c:get_surface()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "58bdc1c748f2922b3970c3b3a41d1b0977f07886", + "description": "nir/search: Use larger type to hold linearized index", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "fbd5359a0a6f4b6d220c4cea9020ec4665ed4955" + }, + { + "sha": "912ee82521ec0507a00dd108b28bf4d864ce6d95", + "description": "gallium/util: remove unused u_surfaces.c/h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "360ffdf4e23464879748051e57587aff938bd50d", + "description": "main/get: Converted type conversion macros to inline functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f1dc4c9554ce913acf4d3236b4d43b829fb92073", + "description": "Mark a few static inline helpers with ASSERTED", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d46a5cfe781a1f51a1338d994fb8097d9d579d2e", + "description": "mesa/draw: Make sure all the unused fields are initialized to zero", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "a6d31589097ef3fb99477809da7f6d571b85629e" + }, + { + "sha": "6edbb3c6d07a395c3cd0b1b5290ecac3943c4286", + "description": "mesa: Fix FLUSH_VERTICES in SubpixelPrecisionBiasNV.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3160a6177a51035fcdb634e184a29b4ef0619e9", + "description": "panfrost: Remove old hack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7f6f419be93531b5725bfa92ac087703ad13267b", + "description": "panfrost: Remove old comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aed052f7039e7dab1a4d7374512f3f9945677b9f", + "description": "panfrost: Remove dirty tracking", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe5c5507bd6bb7cecb9efcf3381621d1310c2454", + "description": "aco: add some helpers for filling/testing register ranges", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "43497e30e265958e28a0a5912134832a1f5a3ff6", + "description": "aco: add RegisterFile", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e6010106fb3c4eb5436de869183e857243c1006", + "description": "st/vdpau: Only call is_video_format_supported hook if needed", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "5d5b414a7b840a4a90050041fabd88a9af6dca43" + }, + { + "sha": "72154237fb720926d8453e7f43f0ec76a0ce7bb1", + "description": "iris: Do not dereference nullptr with pipe_reference", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d800bcd9b9664dbb0c8476ee628a3eb888802b87", + "description": "glsl/blob: Do not call memcpy if there is nothing to copy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7685f48ece721960f6cbb105fca9ea60d256d200", + "description": "intel/bufmgr: Cast bitshift to unsigned", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d5931f285be246be95683c2fb054694d89d07657", + "description": "intel/compiler: Do not qsort zero sized array", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d596795d4d3fb79c39cf457b7b5cb557edf4d6dd", + "description": "brw_fs: Avoid zero size vla", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d4e395a27d770fdde2a8df438271dffd76384e43", + "description": "brw_nir: Cast bitshift to unsigned", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "82913bac14512dbfdb537d674377133f2daa8bfb", + "description": "docs/envvars: document RADV_TEX_ANISO", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72f7d3d5b0969aaa1c0a538dad0b3da09d9db6b6", + "description": "gallium: Only define PIPE_ALIGNSTACK on x86.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "427870abfd0ebc449ffb1c90e9b5ebb54727ee51", + "description": "llvmpipe: Fix another uninitialized value warning, on init_val.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "81225e1f0349b0b2ddff0fbfcd832752f8177477", + "description": "llvmpipe: Silence uninitialized variable warning about \"scissor\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dc8c5af99b04639d46280719cc59f4481e8bd564", + "description": "llvmpipe: Silence uninitialized variable warning about \"vals\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8d34238a6dc61f6c522e2807bc83d434d27e74f", + "description": "llvmpipe: Fix warning about uninitialized \"op\" in the NIR path.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b32bd704c0a22b6eb691f93158acbbdd4c5d370a", + "description": "llvmpipe: Silence uninitialized variable warning about \"chan\"", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce611935df5e7e6cc731523c7496c33ba6f7ef20", + "description": "llvmpipe: Silence \"possibly uninitialized value\" warning for ssbo_limit.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "45b2ccc6b30c9e4c3382e6b462a2f5357c15d3b8", + "description": "llvmpipe: Fix real uninitialized use of \"atype\" for SEMANTIC_FACE", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "502548a09c5a87d06da97be45a2386bb1e5e800e" + }, + { + "sha": "13a276ed3bad09033c00f0a08fcaad803c8a2173", + "description": "radv: Squelch possibly-undefined warning", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1427f666dcc24e71d1b5c5c8f6d1d568c3bcf210", + "description": "ci: Extend the a630 flake list to reduce spurious failures.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2e05a280b6b6d334388e3824bd82472ccbf33252", + "description": "mesa: fix immediate mode with tessellation and varying patch vertices", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6d31589097ef3fb99477809da7f6d571b85629e", + "description": "mesa: don't use memset in glDrawArrays", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee549c67668289e262243b6549a5faf230aa0fd6", + "description": "mesa: document _mesa_prim::begin/end", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9246282b75500aa4e9fe926930cff1314f49607", + "description": "vbo: remove redundant code in vbo_exec_fixup_vertex", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3eeeb86cb0e8dd8ddb23b2eda0213c8f1a44feec", + "description": "vbo: remove dead code in vbo_can_merge_prims", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2491a2ddeb53f3d44b4633f967bfa5b023946bce", + "description": "st/mesa: try to fix MSVC build failure due to ALWAYS_INLINE", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "11db8e0e00a72884ba9fda953b549dd65119dc73" + }, + { + "sha": "06dc280a57a60e39e21c0c14ace6ada3a4574ea7", + "description": "freedreno/registers: cleanup CP_SET_MARKER", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7b4d6bb1ec65d87316540e076661ddffec26e3a5", + "description": "freedreno: quiet INFO_MSG", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "838ed2885df0f6fbc7dbd3c233974faf7e084cb9", + "description": "freedreno/a6xx: few register updates", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4fc31e7d33606f249bf984378cd49f0dcba6e325", + "description": "freedreno/registers: teach gen_header.py about a3xx_regid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ecca5ef6c380cf837ce3f261631f19dd3f18f51a", + "description": "meson: explicitly disallow unsupported build directory layout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79788b8f7f07460af8467931501380e47b485e36", + "description": "intel/gen12: Take into account opcode when decoding SWSB", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "6154cdf924f4d0d3a6fb0cef38bc62eb4494c69c" + }, + { + "sha": "bee5c9b0dc13dbae0ccf124124eaccebf7f2a435", + "description": "panfrost: Remove enum panfrost_memory_layout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "28e94e0a948ebdc98decb5cdbb2ddfcf31cb2b0b", + "description": "radv: Advertise VK_KHR_shader_non_semantic_info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8004cb256a29e946b23c42ce4ad322b8a4dfd2a8", + "description": "anv: Advertise VK_KHR_shader_non_semantic_info", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2dae89ac36703eca063355affb915c933c316417", + "description": "vulkan: Update the XML and headers to 1.2.133", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7d3c48f131ec84aa759a6290a20e2b0c02ad8834", + "description": "panfrost: Debitfieldize mali_uniform_buffer_meta", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "027944c7c8ccbff940484b1ed7cc5d75b9593640", + "description": "panfrost: Avoid reading GPU memory when packing vertices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4c52e16c9c296988dea283164622373caaf228e5", + "description": "panfrost: Cleanup transfer_map", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "308f9cf104dab55ab4d9f92150bf6952c9813dd4", + "description": "panfrost: Update scoreboarding notes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "88323d1ba0c0e6d0ba3be5e3b14213d2b93c44f6", + "description": "panfrost: Rewrite scoreboarding routines", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "070bc883d34d5a687b800f4615d82eda3f0eb7bb", + "description": "panfrost: Print synced traces to stderr", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c46a090942b3d76b8bcbfde75c5454d449799360", + "description": "panfrost: Implement PAN_DBG_SYNC with pandecode/minimal", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59986461255474cfb11c18e7ea8a6303e2d25afb", + "description": "pan/decode: Cleanup pandecode_jc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4122f747ac67eca4c27ffa8d7e91d7d0c3cb02a8", + "description": "pan/decode: Add `minimal` mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b684ba6ce7fc6cabf42132559192e1065891e67a", + "description": "st/nir: Unify inputs_read/outputs_written before serializing NIR", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "19ed12afd170c97180a9cc7eb6c5589d4c427a40" + }, + { + "sha": "9903f10636566834a7563b6828c52fe40c5b0d71", + "description": "zink: do not convert bools to/from uint", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d016de25052cc28d449538fddbe755aaff85d0a", + "description": "zink/spirv: uint -> raw", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7c1a2cbcadf8d4a366df3f96818c19e082764c56", + "description": "zink/spirv: unit_value -> raw_value", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "16339646f03a5cb527f119ca572c9328fd5d3923", + "description": "zink/spirv: rename functions a bit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6211a42473f8acc7583145dde489864963af0ed", + "description": "zink/spirv: prefer store_dest over store_dest_uint", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e8f7df800d697c0623711996ceac40dab5527ec", + "description": "zink/spirv: do not reinvent store_dest", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "692093fbdc93343dbe500128fdd23167d73036d9", + "description": "zink: confused compilation macro usage for zink in target helpers.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8d46e35d16e3936968958bcab86d61967a673305" + }, + { + "sha": "b7e966dc7feaed8e0830f9f9ab5904966c7356a0", + "description": "zink: do not report texture-samplers for unsupported stages", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a20db70de25233f1c58d76bd17f9564d13fe7b2", + "description": "zink: fix binding-usage", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "1c3f4c07047cef0dfcb9182690b22792b00d5935" + }, + { + "sha": "c095b7d5bd44f807c479d117a51f7495712d61c1", + "description": "radv: add a comment about VK_AMD_mixed_attachment_samples on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4159b24be7c00499c6e45dfb8209d9fc23f0836b", + "description": "radv: enable VK_NV_compute_shader_derivatives on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "83dd0cace6979ac5e597c2aa2e3dce51586df0af", + "description": "radv: enable VK_EXT_sampler_filter_minmax on GFX6", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "170c3a8b7b0165c0c226476563367da3d11ae81d", + "description": "radv: enable shaderStorageImageMultisample on GFX6-GFX7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7617d8908a970124321ce731b43d5996c3c5775", + "description": "egl: Implement getImage/putImage on pbuffer swrast.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d6edccee8da38d4802020d5aa4d9e11bb7aae801" + }, + { + "sha": "6fc0890cd9cf53dc41fee070faf4570314fdcbc3", + "description": "lima: rename lima_submit to lima_job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57d9a51d455900946f3eac18b8d1d77cb03b972c", + "description": "lima: move dump check to macro for lima_dump_command_stream_print", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5502bc83b0e4d4542bc5234fe6592db575658356", + "description": "lima: enable multi submit optimization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "131c50569066152a8e90b948b0873b990d748e98", + "description": "lima: optinal flush submit in lima_clear", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d6ad8e590f27ce0413389f6274fa05106d45dba1", + "description": "lima: use per submit dump file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0dde3de25aca535a35ee58850340a0bfdab9dab", + "description": "lima: move framebuffer info to lima_submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed117ee630f449fe6d2ed9e205a6dc80bed6dfa2", + "description": "lima: move clear into submit (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4b93792274f363983949502315ca693a83fdfb56", + "description": "lima: move damage_rect into lima_submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4b048c046cc4861d9ce7da94eeaeca341b7e2f5", + "description": "lima: move pp_max_stack_size to lima_submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a5b1c62db4cd3ab2dcabe6e32391c18f11dfb17", + "description": "lima: move resolve into lima_submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e5abc11f427b67084ad791a6adab5d99717c064", + "description": "lima: move plbu/vs_cmd_array into lima_submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c64994433c0da03d1dabf7cf561f1f1474d6554f", + "description": "lima: track write submits of context (v3)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "48fc5f841ae9c5b294d9084a274f49045c0dbae5", + "description": "lima: make lima_submit one time use drop data (v3)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "545988c6172e51ea00c87abe966d5ecd03b08e98", + "description": "lima: add lima_submit_get", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0caefb6d9df8f2b53fa0703a7bd7a665dabe77b2", + "description": "lima: use lima_submit_create_stream_bo for plbu/vs_cmd and pp_stack", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed8837f946159c34573a6b924a0ba6f879c7bb60", + "description": "lima: adjust pp_stream to use lima_submit_create_stream_bo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e90d8b6e4d135d58e9346220a652b4835c843d07", + "description": "lima: add lima_submit_create_stream_bo", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c78ba60140ec48996dc4ebcfff0674229a4afd4", + "description": "lima: pass submit parameter for functions in lima_submic.c (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21a2ce71b132739b001442a4f9829de345311c35", + "description": "lima: move flush code to lima_submit.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "29c7235507d52d676ec1eee3ef5f9042317595c1", + "description": "lima: put hardware related info to lima_gpu.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09127641f41c75f16d8fa73129153c9da6df7810", + "description": "lima: merge gp/pp submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "79c65fa56fd0168e3e8548e68f1de81b33742069", + "description": "lima: move syncobj from lima_submit to lima_context", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9003111bb59635ae407b251561f1d9937d434f0", + "description": "lima: add missing resolve check for damage and reload", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47200f5c8dda1e03ae62b8cc658574bf0b2f0fe5", + "description": "lima: add render target to submit by dirty buffer flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32f17339723d76b920e7b16d171feb66d5b807eb", + "description": "lima: delay plbu head command generation to flush stage (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ccfe5f9d288fa36e85ac1140f2eae8429decaae6", + "description": "lima: delay add plb buffer to submit when flush", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "92387ca23646d2d5b8ee4666968dced0c7198f2c", + "description": "lima: pass array as parameter to PLBU and VS command macros", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c3bbe4f7f8bd88ee92179679526664467e83c45a", + "description": "lima: remove lima_ctx_buff_va submit flags (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9f924c795b9520c49572b54bfab07731d4f54e6e", + "description": "lima: always add texture bo to submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c4ff27250710a2f4880d0e527b86a4b39adeb1b", + "description": "lima: use util_copy_framebuffer_state", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c8b53d802086f398e50762762227b9ac4c5c6293", + "description": "lima: remove definition of lima_is_scanout", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c4a70b64d6f916ecdf9055b52078bf4f63e7a97", + "description": "pan/decode: Remove extraneous newline", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ab0bf1f939af480997fafd8bf562644a60df08a", + "description": "pan/midgard: Use fprintf instead of printf for constants", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "968f36d1fc081baf2e95eef1410b06552845440b" + }, + { + "sha": "6af14d3685fac433193b92f9ad6c9f8a3eaf87ff", + "description": "pan/midgard: Don't crash with constants on unknown ops", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "24360966ab31a9bedfe07356413769218fd2e0b6" + }, + { + "sha": "5c06ecd2c64a36496f7a0a1d2811d8a90b1a0620", + "description": "pan/midgard: Identify stack barrier flag", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d3747fb1ebbb1c3d6f62abaf92b100f7f52d0f6b", + "description": "pan/midgard: Set xyzx swizzle for load_compute_arg", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f0ee55ad2a4e63cce88c9d68bfdf7d1c0e7e88b0", + "description": "pan/midgard: Infer tags entirely", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57a84278fda2ce556905f800409658639d642962", + "description": "pan/midgard: Imply next tags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "453c64663ce938952588325ba4c960bc63297582", + "description": "pan/midgard: Overhaul tag handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9168e7a65deefae7bb8a40c583c205c408cbecab", + "description": "pan/midgard: Improve barrier disassembly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d208212f80effe4e2831fa408e976099317230b0", + "description": "pan/midgard: Use dummy tag for empty shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2cab6b6db4244cb95abb5bf13734360df8391ea", + "description": "pan/midgard: Fix 32/64 mixed swizzle packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a55a2e02a54cadcd9466d02021c2c7a0739c373f", + "description": "pan/midgard: Allow jumping out of a shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3f59098d1a7a00d51e2b15e06aba359835c7e1ea", + "description": "pan/midgard: Implement barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4f0b928921dfb3ed63642ab1ce1c925fbac9f51b", + "description": "pan/midgard: Fix swizzles harder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbe1fd3de0aa7c618286ee79082f7bbcd7b8e171", + "description": "pan/midgard: Fix missing prefixes", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "c1952779d68fdaea153d762fe2d83810b346085b" + }, + { + "sha": "521406a069e0f918d57fc29bba85423faf316141", + "description": "pan/midgard: Track pressure when scheduling ld/st", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9603126b74d03bc6974ea116ce4f7d80fb9573aa", + "description": "panfrost: Allocate RAM backing of shared memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "50138abb5a0328b530723dfef5e9a8ac9dea2692", + "description": "panfrost: Rename unknown2_8 to padding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d9ee3e65aea9262a9890fb34032ef7c693aef2d", + "description": "panfrost: Rename bifrost_framebuffer->mali_framebuffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6dc105555b43695e1de49baf946c6179987f2f4a", + "description": "panfrost: Unify bifrost_scratchpad with mali_shared_memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "254f40fd535ef57dee2bcc4afd97840749ce5918", + "description": "panfrost: Identify mali_shared_memory structure", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "418ca5dc1ac01045818ad3222d2e0bc51dc2e904", + "description": "panfrost: Ensure compute shader_meta is zeroed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "058faf5a4bd448e1c188042ea017f8fbfd565b9e", + "description": "panfrost: Update comment about magic number relating to barriers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8f5a252d350cb1abcad13f7d637548a8e0e7da33", + "description": "ci: bump debian image and change llvm deps to 8", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e7375e17958993b90d9f01d76dbc7eacdbf8c195", + "description": "gallivm/s390: fix pass init order on s390 with llvm 8 (v2)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a603822b2fcf61085d2e0b2995bc3f43d8ec164e", + "description": "iris: Trim \"../../src/gallium/drivers/iris/\" out of debug dump filenames", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "96f247d1b33fafd42a02b58d3f5387f9b574badc", + "description": "iris: Dump frame markers with INTEL_DEBUG=submit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e395ce03e92b707bef7080eae3c2daa7d9760e70", + "description": "gallium/cso_hash: remove another layer of pointer indirection", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e0bb7b87e26d6618d75d37a4fe2c4a271d075dbb", + "description": "gallium/cso_hash: cosmetic changes, no behavior changes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "789ed29d59b9c7c8cbef371311bfb3c507ae725a", + "description": "gallium/cso_hash: remove always constant variable nodeSize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a8bbf1054093f638c83a27696b841d053a83ba72", + "description": "gallium/cso_hash: make cso_hash declared within structures instead of alloc'd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8594a06e4a2e65c3fc458d7ddce374e9a093b6e", + "description": "gallium/cso_hash: inline a bunch of functions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cf86f522b255a5603176ec361cb0cfcc2416a41d", + "description": "gallium/u_vbuf: adjust the heuristic for unrolling indices", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "55d8baa285524e01eb241aa70057fb8e637fa14e", + "description": "gallium/u_upload_mgr: don't do align twice in the u_upload_alloc fast path", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19c18d532e848bb129f0b24d694fecafecad07fb", + "description": "gallium/u_upload_mgr: reduce dereferences by adding buffer_size", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "909a2d0ed3878e4254b36d6bf8e125e0c7b1e586", + "description": "st/mesa: simplify releasing the current attrib buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6954efce23e54e2bbe9ea554733685bf24acdd5f", + "description": "st/mesa: make st_setup_current static", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3617fd00b311bd4f32886974c574ae55ba416fd", + "description": "st/mesa: change some loops from while to do..while in st_atom_array.c", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd6636ebc06d55b59851701c436b8b97f50fd7f4", + "description": "st/mesa: simplify determination whether a draw needs min/max index", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d933728024f907236ffe95da5f5f0eabf6052c7", + "description": "st/mesa: simplify determination whether a draw has user vertex buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "61e4c582e0fe246ac0d494a2391573f63e1ee0dc", + "description": "st/mesa: always inline the code setting non-64bit vertex elements", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3c98dccd405cf57cbc5a8534e93d7ba020d0e055", + "description": "mesa: remove unused _mesa_draw_indirect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6448f993b157a6d3757b69d5d4c2424e8af737e", + "description": "mesa: translate into gallium vertex formats in mesa/main", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d3b86e34a7b0f77613c7f5669891e54d76f0cbf", + "description": "intel/fs/gen7+: Implement discard/demote for SIMD32 programs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "04c7d3d4b19610cae8250102fefd0012b7233d9e", + "description": "intel/fs: Return consistent UW types from sample_mask_reg() in fragment shaders.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1c6853a9be28c4193fc44eebf812ecf7a02a04f9", + "description": "intel/fs: Refactor predication on sample mask into helper function.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a792e11f5ccb28f5d2430008d462c79888a077c3", + "description": "intel/fs/gen7+: Swap sample mask flag register and FIND_LIVE_CHANNEL temporary.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "083fd96a97d992b8233587f4626c4d433fbd4045", + "description": "intel/fs: Use helper for discard sample mask flag subregister number.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a6bc11a7893a75aebe7ba1888877dffa9fe0814a", + "description": "intel/fs: Make sample_mask_reg() local to brw_fs.cpp and use it in more places.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b84fa0b31e670d101ff17115519417036581f55b", + "description": "intel/fs/gen11: Work around dual-source blending hangs in combination with SIMD32.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "57dee58c82d631261c953705c32dd2d383ec2f4f", + "description": "intel/fs: Set src0 alpha present bit in header when provided in message payload.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e14529ff3262a527d630cecac655f69c8ae15c3f", + "description": "intel/fs/gen12: Workaround data coherency issues due to broken NoMask control flow.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4e4e8d793f050eac84f2a850ab2e5c24c4c459ac", + "description": "intel/fs/gen12: Fixup/simplify SWSB annotations of SIMD32 scratch writes.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "15e3a0d9d264beccb914432ca749d7c0b8a5d43c" + }, + { + "sha": "a8ac0bd759cbf9a5984df4bc9f553a3dca41a8ab", + "description": "intel/fs/gen12: Workaround unwanted SEND execution due to broken NoMask control flow.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "008f95a043dac909f6e647c3102f37bb978b148c", + "description": "intel/fs: Add virtual instruction to load mask of live channels into flag register.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b8b509fb921a7c2f687b9f7e5075c72847740ac4", + "description": "intel/fs/gen7: Fix fs_inst::flags_written() for SHADER_OPCODE_FIND_LIVE_CHANNEL.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c9e33e5cbf6b8b953932f283e1f0abcb6c77eb1f", + "description": "intel/fs/cse: Make HALT instruction act as CSE barrier.", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fe1b0b7c500af7c489d8dffe72bdb2fadd7cf2ab", + "description": "lima/parser: Extend rsw parsing showing strings instead of numbers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e2b4bf256610cc016202893d7b4b4ef60b25b53", + "description": "radeonsi: don't wait for shader compilation to finish when destroying a context", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7bee388fb50d1bf6fc63f7898ad189c7891a10a1", + "description": "egl: directly access static members instead of using _egl{Get,Set}ConfigKey()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "946eacbafb47c8b94d47e7c9d2a8b02fff5a22fa", + "description": "freedreno/a6xx: document some unknown bits", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75fbe089a6a29b01c2b3425b1c0fc2ee43bc5ff8", + "description": "freedreno: name sysmem color/depth flush events", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c57456aab6974abc86e7e1b0aae958a8ab3dad27", + "description": "panfrost: Simplify swizzle translation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f3490a141c8d562fc29e714c3735f2ae8e3c0512", + "description": "panfrost: Inline panfrost_get_default_swizzle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "efda2cfcf96b1071c18800ff3f878fe627359e8a", + "description": "spirv2nir: Add kernel spirv support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eeb6d6112859c3ba41f1c8914d503bf1a3dd981d", + "description": "spirv2nir: print nir shader if translation succed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e80b03dd15eefb7250b994e311864e720b0b1bb", + "description": "zink: do not use SpvDimRect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f43a3fc28ff8914e4a9391233dfc3b63454c6dac", + "description": "lima: handle early-z and pixel kill better", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "582d0c5f1474ecbe52a666e30cb115144aeda7c0", + "description": "gitlab-ci: Add three more dEQP-GLES31 tests to softpipe skips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3d16bfc42deec0e5002d4a314855a453cf340a49", + "description": "gitlab-ci: Sort random failure softpipe skips", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f86bf2e90a9ab0d76c7f8e322c07eeee4df31a7b", + "description": "docs/new_features: empty the feature list for the 20.1 cycle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "886acbe1c535f0c564dbebdcdbcad38a5fef6c7d", + "description": "radv: remove unnecessary RADV_DEBUG=nobatchchain option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "676869e1d4bb1660430fcdb99443238a7de50eb8", + "description": "glsl: fix gl_nir_set_uniform_initializers() for image arrays", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "2bf91733fcb5ff8f74d71de03feeb5f3d664d199" + }, + { + "sha": "6baeca36899109cd9d8e06d1b5f4b9db8becd5fa", + "description": "intel/tools: Update aubinator_error_decode.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "334788d4cc9bb8a0a6b3166e609638687efa0f3f", + "description": "freedreno: allow INVALID modifier", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "9891062642a3f35dc326b305fca2407f9041915c" + }, + { + "sha": "3547e19bbd0d70fc391f5982ea38a026a5994b1c", + "description": "intel/isl: Switch to R8_UNORM format for compatiblity", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "207a93bbff1ef0c40d8f7da339f03dbb12961a7c", + "description": "intel/isl: Move get_format_encoding function to isl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2a98cf3b2ecea43cea148df7f77d2abadfd1c9db", + "description": "Revert \"gitlab-ci: disable a630 tests as mesa-cheza is down (again)\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "18657c0c0a9074d3dfc0763b396929bcf34f71b4" + }, + { + "sha": "5a82273f09d92a815b1ade0d82d095a9e6b8244e", + "description": "freedreno/a6xx: fix Z24_UNORM_S8_UINT_AS_R8G8B8A8", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "18786cc7d55598e82fe1de45bd2c3ffea136418e" + }, + { + "sha": "4151d843236ab350a70d8e13e4e7c79d11ec7bb6", + "description": "iris: add support INTEL_blackhole_render", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6d35610bd57aacb494e49da692b5331b0e4d11b6", + "description": "st: add support for INTEL_blackhole_render", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d7e9edba1876523f75c74362242aaa56629fba5", + "description": "i965: enable INTEL_blackhole_render", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "74ec39f66d506c78ee62a685b7fa055faa0991b9", + "description": "mesa: add INTEL_blackhole_render", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "08cff938b76b6fe146334e44dc97b6be8274a281", + "description": "Revert \"st/va: Convert interlaced NV12 to progressive\"", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "2add63060b51ea2ae432d10e1bd52d6cc0a4dcbb" + }, + { + "sha": "3a2977e7b5ccead8a3a0e3d7df7823325c64b90b", + "description": "anv: Reject modifiers on depth/stencil formats", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5a593bec16b1e5830bc57462abc0b056342ac876", + "description": "gallium/swr: fix rdtsc debug statistics mechanism", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd16ad107da7ade9c6c0f738626b29cfd875f51f", + "description": "gitlab-ci: remove load_store_vectorizer from expected s390x test failures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "aca2458d1bf5c38f5291efa27712868d45ad8231", + "description": "nir: fix nir_const_value_as_uint bit size in load/store vectorizer tests", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c1ba69a27090d3ef000943b47468705fe0454c4", + "description": "Revert \"nir: Add a couple trivial abs optimizations\"", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e4d346c86db0ae332fcdf55eac0e075cfb99a7eb" + }, + { + "sha": "fdd20be324eabab7da1ba67cf7e379398d771186", + "description": "iris: fix aux buf map failure in 32bits app on Android", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b9e0947a9eedcfbcf1e0955fa430b1cfbc43021f", + "description": "radv: remove unused RADV_HASH_SHADER_IS_GEOM_COPY_SHADER", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2531370c968373c24b0c9814610a686bd63f128", + "description": "radv: remove RADV_DEBUG=nosisched and RADV_PERFTEST=sisched", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa48e7edc23bd1148329a69d6e32ddf5acdb2926", + "description": "radv: remove LLVM sicheduler enable for The Talos Principle", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7d1bf075ab833eb39c35f9e213dc4fc115b46b1", + "description": "glsl: fix a memory leak with resource_set", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a6aedc662ebbcac747475324abe3791ff67fc1a8" + }, + { + "sha": "556c9401495930c23a10ca2a26db098d12663e27", + "description": "radv: implement VK_EXT_line_rasterization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbcf05382baefe6ecdd279e2c98025f15938fadc", + "description": "radv: fix line width range and granularity", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da64c35ff920df58b872619bb6f54ae978cb47c9", + "description": "tu: Force sysmem with mipmapped non-aligned linear stores", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f026982265afc87893015b3438bc73d09f703b92", + "description": "tu: Support input attachments with sysmem", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1b3f9e83297494fb0ea5d41077c60a33daf37c9", + "description": "tu: Support resolve ops with sysmem rendering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8647a24a8d66a0a3663563d4d713a5a5b28d8d64", + "description": "tu: Handle vkCmdClearAttachments() with sysmem", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "07e07daeaeb952963df37a6d0d899e0765e82ba4", + "description": "tu: Add helper for CP_COND_REG_EXEC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6a0c4008bf78e259bb1cb9bcbe39248762b03cc8", + "description": "tu: Sysmem rendering", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "041783d49d3aaf451420ce880dcd886388f4b274", + "description": "tu: Disable linear depth attachments", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab3db20cb5e8ef489b8331235498cd1806e4c6ea", + "description": "tu: Support multisample image clears", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5fb515301251b476800a1560f9f8edc8ff325d6", + "description": "tu/blit: Support blits in secondary cmdstreams", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a94be3da84277701b5d74b326e5364cb854a19f1", + "description": "tu: Properly set UBWC flags in RB_RENDER_CNTL", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "49817cb3eaddf1085dadbdcadf2c3c93b02a8f16", + "description": "tu: Don't emit initial render target state in tile_load_ib", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0660cbf4262891e6380faf0d99217b2d27873051", + "description": "radeonsi: make si_fence_server_signal flush pipe without work", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "787b56ac0e5d62ad07cb0804be5275d885201262", + "description": "turnip: Add a618 support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef5da26089975a6f26096151da9fa94f55fc338f", + "description": "turnip: Add magic register values to tu_physical_device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18786cc7d55598e82fe1de45bd2c3ffea136418e", + "description": "freedreno/a6xx: use single format enum", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c13202af7a86976e6d0400f2cee77afcc60ea828", + "description": "anv: Respect ISL_SURF_USAGE_DISABLE_AUX_BIT in make_surface()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a76fd8b08cf0274a6d8bfe97f05b96014f2a9e98", + "description": "anv: Clarify behavior of anv_image_aspect_to_plane()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "da2b0c6c19b080a1e4d91d3b2248dd974483c9fb", + "description": "anv: Delete anv_image::ccs_e_compatible", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "483d4ec57ce0ca0df26fffea648d52a738c8f4e0", + "description": "aco: improve SCC handling in some SALU combines", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d45e9451cf47014d37816baf2656981a1c715e50", + "description": "aco: disable some instruction combining if it could change an exec operand", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e9f83185a221b446717c96c4df8dc64ced43e24f", + "description": "Rename nir_lower_constant_initializers to nir_lower_variable_initalizers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e459c7f0a14b65617e16b92f42abad2fe5878872", + "description": "compiler/spirv: Add support for non-constant initializers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7acc81056f7ac6a869ef6403573b2572c77dbecf", + "description": "compiler/nir: Add support for variable initialization from a pointer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "461c40e0fd5d67dc8155ae4f6c53ddd9acd995b1", + "description": "radeon/vce: Move global function pointer si_get_pic_param to local encoder structure Multi gpu use case broken when the function was global", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "286141197dac9fc6765da3fa8bba571840bfa61c", + "description": "anv: Rename param make_surface::dev to device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84b791a4bb1b3eb15b9a820e9a5dc61a373392f3", + "description": "anv: Drop unused anv_image_get_surface_for_aspect_mask()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23037627359e739c42b194dec54875aefbb9d00b", + "description": "gitlab-ci: Only use gstreamer runners for the s390x job for now", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8e7728077435c5c5ad8c328761277f8ff3b32112", + "description": "nir: do not use De Morgan's Law rules for flt and fge", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d1ed4ffe0b70762477709e699f95c73602f9dc5a" + }, + { + "sha": "ddd767387f336ed1578f171a2af4ca33c564d7f3", + "description": "aco: fix creating v_madak if v_mad_f32 has two sgpr literals", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd08d9abd76ce0002572639c26c79e051a2a0549", + "description": "radv: set the chip name to GCN-NOOP when RADV_FORCE_FAMILY is set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a8024aaaab9812d018cfa656d0dfce6ffedc123f", + "description": "radv: make sure to not submit any IBs when RADV_FORCE_FAMILY is set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5b335e1599e90dae9a7a1b4a1d9a4498f0d0cedd", + "description": "radv: Do not redundantly set the RB+ regs on pipeline switch.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63345a359656246df83b416743031c1836457d23", + "description": "panfrost: Remove unused anonymous enum variables.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "7e8de5a707f7279929d7396550024b8cdc6a8c61" + }, + { + "sha": "7792d774e0ab4f3d45d758da1e9ac80fef0d7934", + "description": "radv: Optimize emitting index buffer changes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d97d186fbea0de22f75346cba07133b145f95bc", + "description": "nir: Mark fmin and fmax as commutative and associative", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1886dbfe7362baa221009371434f158b97183164", + "description": "Revert \"gallium: Fix big-endian addressing of non-bitmask array formats.\"", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d17ff2f7f1864c81c1e00d04baf20f953c6d276a" + }, + { + "sha": "11db8e0e00a72884ba9fda953b549dd65119dc73", + "description": "st/mesa: optimize st_update_array with ALWAYSINLINE", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "36cc6b105b5299ae07f005feabd8bd6eb34c5257", + "description": "mesa: don't use bitfields in _mesa_prim", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "47d7e21619218348c86ca6909cf695f78c9778c6", + "description": "mesa: remove unused _mesa_prim::is_indirect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "734654a89cf588d88b2932b931612374ef2884eb", + "description": "\u00ed965: don't use _mesa_prim::is_indirect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a7d03103f30bfff532a0c6d6e22fa5e7a24cad27", + "description": "vbo: merge use_buffer_objects into vbo_CreateContext to skip the big malloc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7575a0a25120b9637d50fcf2b55a4859b505f781", + "description": "vbo: clean up resetting vertex attribs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ee5bd8638bd123ab3ea49d513f8bba9e7cd4ae28", + "description": "vbo: also map the immediate mode buffer for read", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27bd241103e8fa1fc34859cac6bd23d5a5fb04fe", + "description": "vbo: delay flagging FLUSH_STORED_VERTICES until glEnd", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ca99fe8a603b08883493c786037ca872c00df145", + "description": "vbo: add/update unlikely statements in ATTR_UNION", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5f72c91e5ac38c82bcc1585e6d6f08b9929dba3", + "description": "vbo: increase the size of the immediate mode buffer to decrease draw count", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2fe771f4e963cbb3a3032f1e148fb594c3c1a2a3", + "description": "vbo: use FlushVertices flags properly and clear NeedFlush correctly", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "63a241fa3283a0c389f671a556f705d1da25dd2a", + "description": "vbo: fix resizing 64-bit vertex attributes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "077a843c27d84d4531074bbc477391e886a7ba71", + "description": "vbo: optimize resizing vertex attributes during immediate mode", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1f6e53e2437690de9a84e47f5587ff359b8484f2", + "description": "vbo: don't store glVertex values temporarily into exec", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cd7241c4f8082dbd07f0bcd268741c527512c66b", + "description": "vbo: pass only either uint32_t or uint64_t into ATTR_UNION", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "afa7f1984a4f1779c42e2dfa5535635d364e92a7", + "description": "vbo: don't set FLUSH_UPDATE_CURRENT for glVertex", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f8b98d48bffacc0a1b5393307c8405f4eda8e27c", + "description": "vbo: keep the immediate mode buffer always mapped for simplicity", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c76ef5b590d5795ec2dafb9304747ed74fd37a2", + "description": "vbo: don't check ctx->NewState twice in glBegin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f2c6de1eecb52ded412a88c4724f042c0c75d5f7", + "description": "vbo: remove a funky recursive call in glBegin", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "653bd14730035d514127b2253a025a1e98db5e75", + "description": "vbo: interleave attrsz, attrtype, and active_sz in memory", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2b22e33c10f98f2f58101881818f55b4c4b73606", + "description": "vbo: remove immediate mode code that doesn't do anything and simplify stuff", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e0d612f5e22fee19aff0e40814db24d63f63103", + "description": "vbo: don't unmap persistent buffer mappings for glBegin/End", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "03ded3d6ce37d3be12776bcc5dcd3c4d91f33248", + "description": "vbo: skip FlushMappedBufferRange for glBegin/End by using a persistent mapping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "10cf7a5113446c85dd39bbb12544dd4ac30a0200", + "description": "vbo: create the immediate mode buffer only in vbo_exec_vtx_map", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f89ee44ab0300b72ab957c3135858ff46187dfb5", + "description": "mesa: import PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET handling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27dada7ce90315d47184c51879a3f67e99f2bab2", + "description": "mesa: remove FLUSH_CURRENT calls that have no effect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7c8fe1cc1a972e59885cc14778dbf5a520f48dd", + "description": "mesa: fix incorrect uses of FLUSH_CURRENT", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "01443dc7383f4634e4a66fa194ed51db74186128", + "description": "glx: print FPS with 2 decimal places", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1082e6fcb87d723986b640b5c077d05692df3511", + "description": "radeonsi: don't update states for the DCC MSAA bug on GFX6-7", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbb27eebc8cab1a5d70ea67a37de8d18f20a88f0", + "description": "radeonsi: fix the DCC MSAA bug workaround", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "897a4a0041e2477aa7ac487f23d85a5fc8900c49", + "description": "r600/sfn: Add some documentation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7413aab3c837813331a706a022f493d0474caa13", + "description": "r600/sfn: Add .editorconfig file", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "110ee7ff93a42624b1e89065ec75b7649047715e", + "description": "r600/sfn: Add support for SSBO load and store", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "148f0ad4f9c4b4c291abcaa1722f5ae91f9c4014", + "description": "r600/sfn: Add support for atomic instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "90a7d2e08fbd94d443fe6aeed093e4c758b169da", + "description": "r600: Make sure LLVM is not used for DRAW", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "37125b7cc220fd9b77e9882268892ca4e79a0627", + "description": "r600/sfn: Add lowering UBO access to r600 specific codes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "32d3435a78675ff5ebf933d45b9b99fdc4dc7d82", + "description": "r600/sfn: Add GDS instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5aadd809d07f6d7ce4d0cae18a410cc111c12c65", + "description": "r600/sfn: Add compute shader skeleton", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7fb5c835f7c9462e2095b6de645a0a75ad118c87", + "description": "r600/sfn: Add VS for TCS shader skeleton", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e17ac0d774b5a48a8d5a8a736e4a7a28554befa7", + "description": "r600/sfn: Add support for geometry shader", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c7124e134395c4fe0dbc442a5b7b94f44d16aee", + "description": "r600/sfn: add emitVertex instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f7ec616bedce226e0f710727d21ba2059b36d66c", + "description": "r600/sfn: Add MemRingOut instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1b17316bf38b7f2c23ce648ddd718e1f48641309", + "description": "r600/sfn: Add a load GDS result instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "31a4dd6451eec9cf96bec6d211e8e9b9f8032706", + "description": "r600/sfn: Add lowering arrays to scratch and according instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5c19013904ef0ae68c582cd6d77fe54331b36baa", + "description": "r600/sfn: add register remapping", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "393655d5cb2ae499783408d36a96e34257473fcf", + "description": "r600/sfn: add live range evaluation for the GPR", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "24f683fe810904ae7355ddb036e1e4f37f1480c4", + "description": "r600/sfn: Add the WaitAck instruction", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e09cdb3f86ca53d4c24aa7b60d9ab44d1d679018", + "description": "r600/sfn: Add the VS in and FS out vectorization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c5d9456d841c54b50616b293a532d106323658ce", + "description": "r600: enable NIR backend DEBUG flag for supported architectures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f718ac62688b555a933c7112f656944288d04edb", + "description": "r600/sfn: Add a basic nir shader backend", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "295be0e8dfcc28366bcd193c44abc7913d5132d8", + "description": "r600: Update state code to accept NIR shaders", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51285bf32ee98dcf92c4c31f7862b18ed2db322c", + "description": "r600: Add NIR compiler options", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "27cacd28ace6d6db5083beaceb35c140d827fe96", + "description": "r600: Increase space for IO values to agree with PIPE_MAX_SHADER_IN/OUTPUTS", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4422ce1b04c117f61394a6834cd7933f06ce4e1f", + "description": "r600: force new CF with TEX only if any texture value is written", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "144561dc5ec3dcbe63cb054f806247bc120b64e4", + "description": "svga: Use pipe_shader_state_from_tgsi to set shader state", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "bf12bc2dd7a28844103bb30a07be0440e60c5864" + }, + { + "sha": "470e73e7f86b4530cf789a779f43674ecec91881", + "description": "svga: fix size of format_conversion_table[]", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "84db6ba740d376b75e60c3a2a4ac0153c5b0e01a" + }, + { + "sha": "689817c9dfde9a0852f2b2489cb0fa93ffbcb215", + "description": "gallium/swr: simplify environmental variabled expansion code", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "34fd894e42ae1ec9d35bf9c4f05364b03dd4a223", + "description": "aco: fix waiting for scalar stores before \"writing back\" data on GFX8-GFX9", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7283c33b981f975361e3bfa62a339c88f2642cbb", + "description": "Vulkan overlay: use the corresponding image index for each swapchain", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eb0195358c4e0376d93f10fb4f90703e51718779", + "description": "zink: only inspect dual-src limit if feature enabled", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e365f83740f1faa0e4d022da7b9aea9ae6dacbda", + "description": "zink: emit blend-target index", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8736ffae2eda9de1ac49200ef399170b428b9f8c", + "description": "zink: replace unset buffer with a dummy-buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "18657c0c0a9074d3dfc0763b396929bcf34f71b4", + "description": "gitlab-ci: disable a630 tests as mesa-cheza is down (again)", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "35961b10da2dee4d3820ab1f250007412b06d876", + "description": "radeonsi: don't report that multi-plane formats are supported", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a554b45d736073bbea4978118c02f7929f75cd77" + }, + { + "sha": "1c3f4c07047cef0dfcb9182690b22792b00d5935", + "description": "zink: fixup sampler-usage", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa915a724fbff0878478ca7dd968207bc9906016", + "description": "zink: lower away fdph", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c36b1c0dbf2dcb8ded2509bc547a026b9624cc0", + "description": "etnaviv: enable texture upload memory throttling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7bddaa61362924ec9dc360a2846cbdc8c647d834", + "description": "freedreno/ir3: Fold const only when the type is float", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "260bd32b58a55ac0d9870497caef3a4602e19d47", + "description": "freedreno/ir3: put the conversion back for half const to the right place.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "cbd1f47433b7d735e3be5c8126f7f2b9343a1cdf" + }, + { + "sha": "d70192e6973aec3bbe2be70192f18b6a2257872a", + "description": "freedreno/ir3: Add cat4 mediump opcodes", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3eca6d9ce14abfc542031248be6a53c31cd113f9", + "description": "freedreno/ir3: fold const conversion into consumer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5e2012d5c7496d04772c83e89d9fa1c9bc4087e2", + "description": "freedreno/ir3: fix printing half constant registers.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d55dfef78237380d3734f2341818daa299a7f330", + "description": "freedreno/ir3: Set IR3_REG_HALF flag on src as well in immediate MOV", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbfc8c35315f666d14c8b1f9ec71cb70d5e57d84", + "description": "docs: Mark 20.0-rc2 as done", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d8bae10bfe0f487dcaec721743cd51441bcc12f5", + "description": "freedreno: android: fix build of perfcounters.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fad99243151725a3bdcab73bfd548adb2c535281", + "description": "freedreno: android: add a6xx-pack.xml.h generation to android build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cad400a59e47461f4965cfd19882c680cc111d94", + "description": "freedreno: android: fix build failure on android due to python version", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ff8265b64ff19380170b50b7016191c9d53fbd1e", + "description": "gallium/swr: Fix llvm11 compilation issues", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f239bb8020df4176ca539bafff327ab5c8da2c2e", + "description": "Vulkan Overlay: Don't try to change the image layout to present twice", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "320b0f66c27407008784da3606e23cb44c70ddf0" + }, + { + "sha": "4b978cd950cef844afce07993ddb697779e5648d", + "description": "aco: do not use ds_{read,write}2 on GFX6", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "4a553212fa171ddaf849d4abb2d67208390cd769" + }, + { + "sha": "da76dfb5159c2ca8ee24d64a5f85a68f28b70c65", + "description": "intel/vec4: fix valgrind errors with vf_values array", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a8ec4082a41830cf67a4fd405402fd2d820722fd" + }, + { + "sha": "1572e8f3e136affb429e34ec2e8e10e206e0f3cd", + "description": "lima/parser: Change value name in RSW parser", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5802259e5438571c799bac2137da8bca505c6a94", + "description": "lima/parser: Extend AUX0 findings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cebfb3169caafddac6ea3e0e7977abe535e3bdaf", + "description": "lima/parser: Fix RSW depth test parsing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "eaa0784fd32a6189b6ad25b72365932b4ec15bc7", + "description": "i965: remove duplicated comment", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "26ab38f1440625b85adc235140574901e60562a1", + "description": "ci: Drop turnip opt-in option", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fbc117cba311ed086ba7885e6df423ce5ae114b7", + "description": "llvmpipe: advertise 4 vertex streams", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7e6690b1a62ff3b8e2576bd35d5f9bc530464032", + "description": "draw: don't emit vertex to streams with no outputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "72154c9075269d2022ede04f233a08bb9751f104", + "description": "draw: emit multiple streams to streamout.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00c066e5a04eba9359411bac36d985d047be6ddf", + "description": "draw/gs: track emitted prims + verts per stream.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0c77007c9db74c3859f75dcd5161396ae5c16772", + "description": "draw: change geom shader output to an array of outputs.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8583fcd8f182a290f000cb303ec2e067688363b8", + "description": "gallivm/nir: add support for multiple vertex streams", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b66884131312cac4438aab89490fd6f33443247a", + "description": "gallivm/swr: add stream_id to geom epilogue emit", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d70002744ca121bff51dd40bfa76b633320652c", + "description": "llvmpipe/query: add support for indexed queries", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "658eb691fc4266cfcb6d2a7eec17469f5eae10b5", + "description": "ci: Bump the GLES CTS version to 3.2.6.1.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b37922dd1ed4bbc19b69b543a83d8fdf49532a32", + "description": "ci: Disable a bunch of tests on freedreno a630.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b3063cbd185b3bb2e01ab4f70ca0c661f38082b4", + "description": "turnip: Drop explicit configure opt-in for turnip", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4ca77f347d2e5791907db77ef1996f01ff9aa386", + "description": "u_tile: Skip the packed temporary and just store tiles directly.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "68bb26af63acad6a42dd5c4d653c61917a69127e", + "description": "broadcom: Fix implicit declaration of ffs for Android build", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce23911b77b026ee1894e012f0249d827047bac6", + "description": "aco: gfx10_wave64_bpermute reduce op to print_ir", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20eb1acb6f404ffa4e502e7de8dec8ac83e7a8a8", + "description": "aco: fix gfx10_wave64_bpermute", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "9254fb4fc72ed289ffded28ef067b4582973e90c" + }, + { + "sha": "1c79afd94620925cb9e0903f24f91c3ab9ecfcb4", + "description": "Correctly wait in the fragment stage until all semaphores are signaled", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "451cf228d53ba8f51beb3dcf04370e126fb7ccb6", + "description": "svga: Fix banded DMA upload", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "287c94ea4987033f9c99a2f91c5750c9083504ca" + }, + { + "sha": "5aec9e84a86f578d0babae1d5c1800578c1c1b53", + "description": "anv: No-op submit and wait calls when no_hw is set", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9febfae416e9fdf39a501ceb53a65c99ca78eed", + "description": "anv: set MOCS on push constants", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "67d2cb3e9367a723d6f6310f75048c6d97afe9d4" + }, + { + "sha": "a140ea1cedc5b979410796b13f33031c7b20671d", + "description": "llvmpipe: Bump test timeout to 180 seconds", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4aa7af9e9a4c19e10afaf4a3c756e62cf4d352c3", + "description": "intel: Load the driver even if I915_PARAM_REVISION is not found.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "96e1c945f2bc4047a603753ae10fc4f27754361c" + }, + { + "sha": "20bcbcd958967f2f5bdb560951c70c71d0939329", + "description": "isl: Fix the android build.", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5bea0cf77956d748ea929e12d12756692f04a33f" + }, + { + "sha": "a92be2fb26e6d470a6d2b179f35f54d75a7a7b90", + "description": "intel/genxml: Drop \"reserved\" enum", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "5bea0cf77956d748ea929e12d12756692f04a33f" + }, + { + "sha": "deb2bbf57ec1d0660dd85b7080bf5ebeb10e8768", + "description": "swr: Fix GCC 4.9 checks.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "f0a22956be4802e01f2b4f3244f011212626f12d" + }, + { + "sha": "205ce0bea5e14a855a86f8b9662ba34cdd372280", + "description": "gallium: let the pipe drivers decide the supported modifiers", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "ac0219cc5b6afa6d0392a164b58e21ce95079930" + }, + { + "sha": "d8569baaed1a38cf3da9e45375fa2267d9a1eeb0", + "description": "iris: handle the failure of converting unsupported yuv formats to isl", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "76f300f2e498f101c58e182eac3ece11e527d297", + "description": "Revert \"egl: put full path to libEGL_mesa.so in GLVND json\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "0021f7dc307f4852955359adb5ac2b7667e6d4ac" + }, + { + "sha": "9595b23a45a6bcb7ee784e2c38085a8ea2e4620a", + "description": "meson: don't bother trying `python2`", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4d34abd15c91ed67414e2e0dc1ae252f53574ef6", + "description": "aco/optimizer: Don't combine uniform bool s_and to s_andn2.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "8a32f57fff56b3b94f1b5589feba38016f39427c" + }, + { + "sha": "a77c3d5eed45ba5abcbacdc3511a93f02adb7673", + "description": "nouveau: Reuse tgsi_get_gl_varying_semantic().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f4f769c851f4130195b5e3b76784df93f457d571", + "description": "nouveau: reuse tgsi_get_gl_frag_result_semantic().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9358f6f765092c821696df8b36bf2cf70b05e49", + "description": "nouveau: Reuse tgsi_get_sysval_semantic().", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e25967d6b81b9ac9dccfe92cc92dc8d977eab592", + "description": "mesa/st: Move the SYSTEM_VALUE -> TGSI_SEMANTIC map to tgsi_from_mesa.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9891062642a3f35dc326b305fca2407f9041915c", + "description": "freedreno/a6xx: Implement layout for DRM_FORMAT_MOD_QCOM_COMPRESSED", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "ecd62ff7665d0a731ead705321e4e1ee0757974d" + }, + { + "sha": "d233c8c914ce819147197b9327bc22d1ea58b2fb", + "description": "freedreno: Add layout_resource_for_modifier screen vfunc", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "af6fb4f0a9ffe3250612acd3eb382f5eb5227e48", + "description": "freedreno: Set up supported modifiers in fd*_resource_screen_init()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d0a7c8f4a8cd375a9448f50d777e2cc9ee95a8d1", + "description": "freedreno/a6xx: Add fd6_resource_screen_init()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d07d66180b1523d7e70681ca142955f896ebda9", + "description": "glsl,nir: Switch the enum representing shader image formats to PIPE_FORMAT.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5bea0cf77956d748ea929e12d12756692f04a33f", + "description": "intel/isl: Move iris's pipe-to-isl format function to isl.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bb615e5fe3f68d0dc8210e6b09ced6913b433103", + "description": "mesa: Clean up some endianness adapters for shader image formats.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "23c137612bea1e319ecdfb894c020b6651f4909a", + "description": "gallium/swr: Fix various asserts and security issues", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "7eaf21cb6f67adbe0e79b80b4feb8c816a98a720", + "description": "pan/midgard: Fix scheduling issue with csel + render target reference", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38c20696a5358d6898c4ee96fb127d603c1e1404", + "description": "panfrost: Set the MALI_WRITES_{Z,S} flags when needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ed94d38b4169e18bf81e956241d1c8674cc2ec6", + "description": "panfrost: Add the MALI_WRITES_{Z,S} flags", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0406ea4856498c25479b1e1b191471f7222747d6", + "description": "panfrost: Z24 variants should be sampled as R32UI", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1ba0cd452cb456e5d06ee22fdecaed451a7a48b", + "description": "pan/midgard: Add nir_intrinsic_store_zs_output_pan support", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f5619f50738c0ea2bee49d982e88f18496d7514a", + "description": "pan/midgard: Turn Z/S stores into zs_output_pan intrinsics", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "59488cbbaca1268841fe5ba42d0a1202b33be23b", + "description": "intel/fs: Don't count integer instructions as being possibly coissue", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e64be391dd065b6a0eabee17ada038db7a28c112" + }, + { + "sha": "8455648ccae92692e4a522d23b1491d5825b030c", + "description": "tu: Move vsc_data and vsc_data2 allocation into the device", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84bd4da468cd21dad5d41110b495b08623e82a0e", + "description": "freedreno: Fix CP_COND_EXEC", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ed5d1c1c471b9a7017625ab7d742f2895ab64b96", + "description": "freedreno: Add CP_REG_WRITE documentation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65197a3ac1cf4303e37927ed3faae47e41ee74e6", + "description": "freedreno: Fix CP_COND_REG_EXEC bit positions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8be81f8a2a9f3f838cc550aeddf79657608e1008", + "description": "gitlab-ci: Build radeonsi & RADV in the ppc64el job", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65610ec774ae095cfee3602b3f06d51d0a199791", + "description": "gitlab-ci: Add ppc64el and s390x cross-build jobs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a443f81f2631efa0881f72472c476d9d368b5d05", + "description": "gitlab-ci: Merge ccache and libxml2-utils into main apt-get install", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a06fc0296d6ab3567320fb4ecab6d297eba08223", + "description": "gitlab-ci: Pass -j4 to make", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "84fefa206c43e6a7b9a1ff34230eed0439f59e35", + "description": "gitlab-ci: Update to latest ci-templates HEAD", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3da91b3327fb93d0364c0ca9d0216f695160831d", + "description": "radeonsi/ngg: add VGT_FLUSH when enabling fast launch", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2799676218249c5b9f1dc0a6367e459a1ad5642e", + "description": "util/disk_cache: check for write() failure in the zstd path", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "a8d941091f72923561a6c58b46ccb264b6a0e205" + }, + { + "sha": "6321e3fb9fd9cc9d817071d435c6b8a59869b8bc", + "description": "dri: delete gen-symbol-redefs.py", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bcb611361b08528b14d3c5827ee2c4b21de1199d", + "description": "anv: implement gen12 post sync pipe control workaround", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8949d27bb8b4385e92049c18f728bdcf0a79b093", + "description": "anv: implement gen9 post sync pipe control workaround", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "19e7bcee1742a40981a0b1c06447bca22646c294", + "description": "iris: implement gen12 post sync pipe control workaround", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2c07e03b792d57ae807a6953f0d8ff5f4bcdffd0", + "description": "freedreno: allow ctx->batch to be NULL", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "22d2cbe6856fea65bf01dc96941b5127f17dacab", + "description": "freedreno: Allow UBWC on textures with multiple mipmap levels.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ecd62ff7665d0a731ead705321e4e1ee0757974d", + "description": "freedreno: Disable UBWC on Z24S8 if not TEXTURE_2D.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ddb0b35b76dfee95a3bd472538bf9510c1cbd2f7", + "description": "freedreno: Blit all array levels when uncompressing UBWC.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6b586d5a48cc11ee216f18ac093a601917861cff", + "description": "freedreno: Swap the whole resource layout in shadowing.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f9f5d3eb554e6096b762744125580a239d3809b0", + "description": "freedreno/a6xx: Disable the core layer-size setup.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "17312b4a10718ee14a80e5c1b4e2e586d8a79920", + "description": "freedreno: Rename the UBWC layer size field and store it as bytes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b6b4118bb06c5a38d7b7bf61ab67551c3129176d", + "description": "freedreno: Include the layer size in layout debug.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "20357dfde85712503ae82aeedcfa9b2bf31f2adc", + "description": "freedreno: Move the layout debug under FD_MESA_DEBUG=layout.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "65a6dc5139fddd5e01eaedcc57fc67e0a6a28c94", + "description": "radv: Do not set SX DISABLE bits for RB+ with unused surfaces.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e9316fdfd4899c269a19e106a6ffa4309ae48b27" + }, + { + "sha": "17303c9851e32194550f899399859944fa5b3009", + "description": "mesa: implement missing display list functions while switching to the template", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "56de59b931ac162de932d650176fbd981cc68aed", + "description": "vbo: move reusable code from vbo_attrib_tmp.h into vbo_util.h", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "052e8f758e68061adaee7f6f95bdbb01ae8f519d", + "description": "vbo: use the template for save GLvertexformat initialization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9ec5e96ec8297216bcab4036deaa6eb714ca4e67", + "description": "vbo: use the template for noop GLvertexformat initialization", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d447a4888fe0089c0c3589ea68daf400e866de2a", + "description": "vbo: move GLvertexformat initialization into a template header file for reuse", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "cae609326678bd00702261f756ce0c16efd530d4", + "description": "freedreno/perfcntrs: fix fd leak", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "5a13507164a26fc796f02c57a24468b834254b4d" + }, + { + "sha": "8a2c507a8abe2537df5bc1f4847c40e6d4314dec", + "description": "util: Drop unpacking from int signed to unsigned and vice versa.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "1d367c3aa57da54f5dbd6871a38b9fc1d6cbcc45", + "description": "gallium: Refactor some single-pixel util_format_read/writes.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ab081970e07d97c4218a68341d8534693d27e474", + "description": "gallium: Add and use a helper for packing uc from a color_union.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b2a2cf492decf35e1e2c622e3c45e98333ec15d7", + "description": "softpipe: Refactor pipe_get/put_tile_rgba_* paths.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8bc56551da9ce64be3e253c5131a572501ad44a7", + "description": "softpipe: Drop the raw_to* part of the tile cache interface.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6cdf523f00398b98c73743f6eb4167409ff065e0", + "description": "gallium/util: Remove pipe_get_tile_z/put_tile_z.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e986f2b7aff6c51e420fbb06553a748f15f55a01", + "description": "mesa/st: Use direct util_format_pack/unpack instead of u_tile.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c574cda3c6a3f880f99e4e22967fc82e34609942", + "description": "util: Make helper functions for pack/unpacking pixel rows.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "333c9d5bb054d5ac5518e830b535e8a4f3f80187", + "description": "clover: add trivial clCreateCommandQueueWithProperties implementation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b064697af1dc8927756986f396c793e0e23c42e9", + "description": "gallium/osmesa: Try to fix the test for big-endian.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "dd899fd43ebc56fee6ad41126db340a9c2f7bc72", + "description": "gallium/osmesa: Fill out other format tests.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0a53918f025af606450ddb9df3d032191a82ce89", + "description": "gallium/osmesa: Fix MakeCurrent of non-8888 contexts.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "655394c6ed2ef15c66ea8234d7ab388901f7e295", + "description": "gallium/osmesa: Fix a typo in the unit test's test names.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "75c50d03422eb6a74f462419015a697f371468d6", + "description": "osmesa/tests: Cover OSMESA_RGB GL_UNSIGNED_BYTE case", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d83abf1d378be059b4f41a6a44a9bf24c7394084", + "description": "st/mesa: Handle the rest renderbuffer formats from OSMesa", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d1165ad18b5e1d8b137daff1b1ad3d11ba4445e4", + "description": "util/os_socket: fix header unavailable on windows", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "e62c3cf350a8b169e6401d5f1e1f17388cdc4b77" + }, + { + "sha": "36126b6211f1ac2da0aa94411608b2320553dbb6", + "description": "i965: Do not set front_buffer_dirty if there is no front buffer", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9afdcd64f2c96f3fcc1a28912987f2e8066aa995", + "description": "gitlab-ci: Switch kernel for LAVA jobs to 5.5", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "162927e43cdb5d6c184a4064fbd0799012fc297e", + "description": "panfrost: Use size0 when calculating the offset to a depth level", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "64541dd69875d043d90525769901d18fdde4b68b", + "description": "panfrost: Only clamp the LOD to disable mipmapping when needed", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "255227ececb4444fdf5cc2925fc1064d729021f4", + "description": "panfrost: Fix decoding of tiled 3D textures", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fd27fb511386615cd6b44b037f9f5117846b51d4", + "description": "st/mesa: use uint-result for sampling stencil buffers", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "642125edd97384b88f491c1383a06c42ed16e11e" + }, + { + "sha": "9cdd89a34b89973fbe646de5976f190ec7bd8d1c", + "description": "pan/midgard: Remove unused variable", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f3eb7989b38239d08d729a488ad6b737ec61ad8", + "description": "pan/midgard: Check for null consts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8ec4028d4057da19708b4f4bce88ee16f96f322a", + "description": "panfrost: Avoid overlapping copy", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c32bd325e7688c781b7e5de58a2d0534c82f00a0", + "description": "etnaviv: Destroy rsc->pending_ctx set in etna_resource_destroy()", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "9e672e4d20fb77e1b39aee29f9e8e76a5c2af68e" + }, + { + "sha": "df6a2a719798d706d60b508106da363311a43469", + "description": "turnip: Be explicit about converting vk compare func to a6xx", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6dd57f0e3811d61c9d6179452a420f28008ca9ba", + "description": "nir: Remove always-true assert", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e3dfa8f4d694e7d64a6401752af1f973b0852aab", + "description": "glsl: Use 'using' to be explicit about visitor overloads", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0bc516fceb742e4c1ce2d47f129d19d8bb005d13", + "description": "spirv/opencl: Cast opcode up front to avoid warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "67dd51606ca1670e21ce414591cc48caaf2e2e9f", + "description": "freedreno/fdperf: Cast away some ignored return values", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2be81a3bfa839e51b9719567236acaff70babd62", + "description": "nir: Make unroll pragma work on clang", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "de856c6170fbfd57f480c04069feb2fac0099060", + "description": "nir: Delete unused is_var_constant() helper", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "42f7e124cae38dc95e219097cc23f48d24e3071a", + "description": "Revert \"gitlab-ci: disable a630 tests as mesa-cheza is down\"", + "nominated": false, + "nomination_type": 2, + "resolution": 4, + "master_sha": null, + "because_sha": "f38851d84c583b1c62ea95edbc42eb5e2ad14fa8" + }, + { + "sha": "0ccda2ebff83816cecf4dcb48f367a0d5c8f5fb1", + "description": "clover: Use explicit conversion from llvm::StringRef to std::string", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "5d83314945d5f286cbe4fff9b07d2756f36a64c6", + "description": "zink: disallow depth-stencil blits with format-change", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "85d4b41f6830253b81b591fa401a9c5aea5e407a", + "description": "zink: be more careful about the mask-check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b550b7ef3b8d12f533b67b1a03159a127a3ff34a", + "description": "panfrost: Fix the damage box clamping logic", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "65ae86b85422ae0d41624dd56f935d85b0164a13" + }, + { + "sha": "2b089e26bfe615cf616926cdddafd8439c835878", + "description": "pan/midgard: Stop leaking instruction objects in mir_schedule_alu()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c7e68d8625e4efcc776b5352de3b16b6951fabf7", + "description": "pan/midgard: Don't check 'branch && branch->writeout' twice in mir_schedule_alu()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ef89a52fe58e85402d7652d863da1cc90b378772", + "description": "pan/midgard: Lower bitfield extract to shifts", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c68cd39eb3797eb34a049950cb34acfd0719cde7", + "description": "pan/midgard: Make sure we pass the right RT id to emit_fragment_store()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25946be4c451fe1cc645a6fd3cb5d59160e93f25", + "description": "pan/midgard: Add an enum to describe the render targets", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e62c3cf350a8b169e6401d5f1e1f17388cdc4b77", + "description": "util/os_socket: Include unistd.h to fix build error", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "ef5266ebd50e7fa65c56bdb623e12ca8c233b470" + }, + { + "sha": "f38851d84c583b1c62ea95edbc42eb5e2ad14fa8", + "description": "gitlab-ci: disable a630 tests as mesa-cheza is down", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a4e627054167ff52742cf45b1aefccffb0de7071", + "description": "nv50: report max lod bias of 15.0", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0021f7dc307f4852955359adb5ac2b7667e6d4ac", + "description": "egl: put full path to libEGL_mesa.so in GLVND json", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d5fd8cd46eeedeabf8647c06489a755aea8f0080", + "description": "radv: Allow non-dedicated linear images and buffer.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "38f963226bdf8277826e80848d2a6b44cbabddd3", + "description": "pan/midgard: Implement mixed-type constant packing", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a12fe52cbc86b2d33cd5a726ce1020cdcd6c064c", + "description": "pan/midgard: Break out one-src read_components", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b74212e70153f3a199a60a95d003208144f5fac9", + "description": "panfrost: Fix non-debug builds", + "nominated": false, + "nomination_type": 1, + "resolution": 4, + "master_sha": null, + "because_sha": "226c1efe9a8b7a4f1802ab13f249dc06b2bd7d3d" + }, + { + "sha": "d7fe9af6202413aa4e6f0f53d89577ed8ea80027", + "description": "anv/blorp: Use the correct size for vkCmdCopyBufferToImage", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "dd92179a72e5263b9db730d92a883e2536aa4474" + }, + { + "sha": "8ff613dc58782eab0fa915056255aedb838e3470", + "description": "VERSION: bump after 20.0 branch point", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "02658df152d1a7fedd8ce61dbe6e84566c8c75d0", + "description": "lima: Fix build with GCC 10.", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "d71cd245d7445121342a4933cc6ed8ce3fc6e568" + }, + { + "sha": "982d61e2cdd5a5e3f82444787634fa45ba2fd44f", + "description": "freedreno/ir3: fix a dirty lie", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "752aeb7b3fe75b3c6ce8d9dadaba9c4111fa7254", + "description": "freedreno/ir3: simplify split from collect", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8d0e7d9a4c1cfe28b3cd2356e94e287e82821e1a", + "description": "freedreno/ir3: create fragcoord instructions in input block", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fb09020ef23cc87c1c3024add572cf0a571e8ddc", + "description": "freedreno/ir3: remove unused tex arg harder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ffe44ec0a5dba18e4a88ca7dd1042e823f9685e", + "description": "freedreno/ir3: add RA sanity check", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2f4f46b7080a1087420939b2f4bf0bea414cd3ce", + "description": "freedreno/a6xx: fix lrz overflow", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3e79c4f0edc4a263c29f8df6169d0ad74aee7c69", + "description": "freedreno/ir3: two pass register allocation", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "b0293af7a5d821776e7e90e3892015581c497810", + "description": "freedreno/ir3: don't precolor unused inputs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ad2587d3c8885ce4aa0403269268a1c0ab8c2cac", + "description": "freedreno/ir3: add is_tex_or_prefetch()", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "4a7a6c9ef0eb6b26d8410591353142207689d085", + "description": "freedreno/ir3: number instructions from one", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "0f78c32492ed096649b015a4967d6d56c18dd14a", + "description": "freedreno/ir3: post-RA sched pass", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3369406e44b0226295e7475e189da2e42efd7f22", + "description": "freedreno/ir3: fix kill scheduling", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a9f78f1f9f0019687eb374aae5abcd3b0617cf4", + "description": "freedreno/ir3/ra: make use()/def() functions instead of macros", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "a5f24f966ae217981cd39e867a0de1fee029e740", + "description": "freedreno/ir3: a bit more optmsgs debug", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "300d1181c72043afe045a155079fc152fcd1283e", + "description": "freedreno/ir3: move atomic fixup after RA", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "304b50c9f8f57a115ac251f022093c8adfb0823d", + "description": "freedreno/ir3: move block-scheduling into legalize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "093c94456bc99308bd80bcc952d1f77ea71a831c", + "description": "freedreno/ir3: move nop padding to legalize", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c803c662f990621acefd2f002d9df0d42ad8a3a0", + "description": "freedreno/ir3: split out delay helpers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "54c795f8297d5087b013777bddac32ed47941cb7", + "description": "freedreno/ir3: fix crash when no non-input instructions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c1194e10b2dc4d14ea21eb9bc8e607056ebaffcd", + "description": "freedreno/ir3: cleanup after lower_locals_to_regs", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f0b792ea0602dd344b8a2dce4ab582b167f6fd35", + "description": "freedreno/ir3: shuffle a few ir3_register fields", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "95831e2f66a16e2334cb1f972c9485b71955900b", + "description": "intel/gen12+: Set way_size_per_bank to 4", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00a84c170a0a495f21008a80557a1d2b8257ea56", + "description": "intel/gen12+: Reserve 4KB of URB space per bank for Compute Engine", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c0d8b373adebbf8396b16537bd4d633ab6659900", + "description": "virgl: Use align_free for align_malloc allocated buffer", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d326d30efefd132255826ff33a9a4c51c216fe39", + "description": "freedreno/drm: readonly cmdstream", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f93dfb509cbf9474befae9919dd8c135bbd67093", + "description": "intel/fs: Write the address register with NoMask for MOV_INDIRECT", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9a95abd0f755331503e283354b44b639865f1329", + "description": "intel/tools: Handle strides better when dumping buffers", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "51d7c42165d2344d0019299d42b34c07f7f5e8d0", + "description": "intel/disasm: SEND has two sources on Gen12+", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fa3ef6a8370bf1ce121806e60ccdedb2ddc6aa83", + "description": "intel/eu/validate: Don't validate regions of sends", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "3b323d66019bcbb56811b66947b39e77a2c7c3e0", + "description": "aco: fix image_atomic_cmp_swap", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "71440ba0f5512fe455be66ca48b253ecc37478a9" + }, + { + "sha": "0d14f41625fa00187f690f283c1eb6a22e354a71", + "description": "aco: fix MUBUF VS input loads when expanding vec3 to vec4 on GFX6", + "nominated": true, + "nomination_type": 1, + "resolution": 1, + "master_sha": null, + "because_sha": "6aecc316c000c343b25963c1356525f95ea6cafe" + }, + { + "sha": "d8410fec4efa4fb8847342a15b021501e3e2341b", + "description": "gallium/swr: Fix gcc 4.8.5 compile error", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8dacf5f9d1df95c768016a1b92465bbabed37b54", + "description": "swr: Fix build with GCC 10.", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "790516db0bfc056df0290c42565214d4148e901a", + "description": "gallium/swr: fix gcc warnings", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8405e1bef0cfa99a2e5e865cf5f933fddbd35222", + "description": "zink: implement support for derivative-control", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "f12b844e7c284f691323d4f77f2fd94c648e37e0", + "description": "zink: implement load_instance_id", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "c0ced1e79b3311cf55f3c8852417825e3fe102ef", + "description": "zink: enable texture-buffer objects", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "00edb82fde2cfebe97457cb7819e7e560c4d3a4c", + "description": "radeonsi: Add support for midstream bitrate change in encoder", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "d902e23d8094a01f752d3404ec484e0c059eb193", + "description": "panfrost: Use DBG macro to avoid noise in the console", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "25042062215c682445a70b4527e8298b30996d93", + "description": "pan/midgard: Handle nir_intrinsic_load_barycentric_centroid", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "226c1efe9a8b7a4f1802ab13f249dc06b2bd7d3d", + "description": "panfrost: Add more info to some assertions", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2d5c433aeeeb083f1a5902d58e520614d2fe35be", + "description": "panfrost: Print intended field when decoding", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + }, + { + "sha": "8c5fd2942b4fb2005b3d01fb4cab86a4162c8a90", + "description": "anv: Always fill out the AUX table even if CCS is disabled", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "2ccdf881aba7c8cd0c7175995e351e783e0fd11d", + "description": "iris: Plumb deref block size through to 3DSTATE_SF", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e6b39850f092b387881c4fb4260c9465971422aa", + "description": "anv: Plumb deref block size through to 3DSTATE_SF", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "ce9c45a60ed51ddb27bd969bdc61336f18121a07", + "description": "intel/blorp: Plumb deref block size through to 3DSTATE_SF", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "fdc0c19328fd8e02e4b1bd5c62b93ce6c4597ca1", + "description": "intel/common: Return the block size from get_urb_config", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e340a79b9c4b6ee35eaa10a685395a67d0b0b440", + "description": "anv: Emit URB setup earlier", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e928676b69bf9cafce1c0304dd473c926b9f2854", + "description": "iris: Consolodate URB emit", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "09e4c33085f15ffa691053143bec9dbf4aecfeaa", + "description": "intel/blorp: Always emit URB config on Gen7+", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73a684964b392c4df84373e8419e355267d57ff5", + "description": "intel: Take a gen_l3_config in gen_get_urb_config", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9d05822cb8b5d3fd066c64722b76b3507a7fd24f", + "description": "i965: Re-emit l3 state before BLORP executes", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "bff7b3c7bd56c25544ea6e3ea9452358374db10a", + "description": "iris: Use the URB size from the L3$ config", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "99f3178a249525d333c5b27d755a0f99a81b3c17", + "description": "iris: Store the L3$ configs in the screen", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "6471bac99ec11c7901d6fc9bda908c047e621f5f", + "description": "iris: Set SLMEnable based on the L3$ config", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "73434b665b2ec50cbd1060ce831aec3b2e21517c", + "description": "intel/genxml: Drop SLMEnable from L3CNTLREG on Gen11", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "e1bdb127b6875df602bd736465d597725f326621", + "description": "anv,iris: Set 3DSTATE_SF::DerefBlockSize to per-poly on Gen12+", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "9da9abf8a7a605cc9b79bd4240ff715b79ac774a", + "description": "genxml: Add a new 3DSTATE_SF field on gen12", + "nominated": true, + "nomination_type": 0, + "resolution": 1, + "master_sha": null, + "because_sha": null + }, + { + "sha": "21dd0a151401956523d7facaccfa8e8cdf915c18", + "description": "docs/release-calendar: 20.0.0-rc1 has been released", + "nominated": false, + "nomination_type": null, + "resolution": 4, + "master_sha": null, + "because_sha": null + } +] \ No newline at end of file diff -Nru mesa-19.2.8/README.rst mesa-20.0.8/README.rst --- mesa-19.2.8/README.rst 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/README.rst 2020-06-12 01:21:16.000000000 +0000 @@ -56,5 +56,4 @@ documentation (`docs/submittingpatches.html `_). -Note that Mesa uses email mailing-lists for patches submission, review and -discussions. +Note that Mesa uses gitlab for patches submission, review and discussions. diff -Nru mesa-19.2.8/REVIEWERS mesa-20.0.8/REVIEWERS --- mesa-19.2.8/REVIEWERS 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/REVIEWERS 2020-06-12 01:21:16.000000000 +0000 @@ -1,30 +1,11 @@ Overview: This file is similar in syntax (or more precisly a subset) of what is - used by the MAINTAINERS file in the linux kernel. Some fields do not - apply, for example, in all cases, send patches to: - - mesa-dev@lists.freedesktop.org - - and in all cases the patchwork instance is: - - https://patchwork.freedesktop.org/project/mesa/ - + used by the MAINTAINERS file in the linux kernel. The purpose is not exactly the same the MAINTAINERS file in the linux kernel, as there are not official/formal maintainers of different subsystems in mesa, but is meant to give an idea of who to CC for - various patches for review, and to allow the use of - scripts/get_reviewer.pl as git --cc-cmd. - -Usage: - - When sending patches: - - git send-email --cc-cmd ./scripts/get_reviewer.pl ... - - Or to configure as default: - - git config sendemail.cccmd ./scripts/get_reviewer.pl + various patches for review. Descriptions of section entries: @@ -36,14 +17,6 @@ F: drivers/net/* all files in drivers/net, but not below F: */net/* all files in "any top level directory"/net One pattern per line. Multiple F: lines acceptable. - N: Files and directories with regex patterns. - N: [^a-z]tegra all files whose path contains the word tegra - One pattern per line. Multiple N: lines acceptable. - scripts/get_maintainer.pl has different behavior for files that - match F: pattern and matches of N: patterns. By default, - get_maintainer will not look at git log history when an F: pattern - match occurs. When an N: match occurs, git log history is used - to also notify the people that have git commit signatures. Maintainers List (try to look for most precise areas first) @@ -135,3 +108,13 @@ R: Eric Engestrom F: src/vulkan/ F: include/vulkan/ + +VMWARE DRIVER +R: Brian Paul +R: Charmaine Lee +F: src/gallium/drivers/svga/ + +VMWARE WINSYS CODE +R: Thomas Hellstrom +R: Deepak Rawat +F: src/gallium/winsys/svga/ diff -Nru mesa-19.2.8/scons/gallium.py mesa-20.0.8/scons/gallium.py --- mesa-19.2.8/scons/gallium.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/scons/gallium.py 2020-06-12 01:21:16.000000000 +0000 @@ -381,6 +381,17 @@ if check_header(env, 'sys/shm.h'): cppdefines += ['HAVE_SYS_SHM_H'] + if check_functions(env, ['strtok_r']): + cppdefines += ['HAVE_STRTOK_R'] + + #FIXME: we should really be checking for the major()/minor() + # functions/macros in these headers, but check_functions()'s + # SConf.CheckFunc() doesn't seem to support macros. + if check_header(env, 'sys/mkdev.h'): + cppdefines += ['MAJOR_IN_MKDEV'] + if check_header(env, 'sys/sysmacros.h'): + cppdefines += ['MAJOR_IN_SYSMACROS'] + if platform == 'windows': cppdefines += [ 'WIN32', @@ -476,9 +487,15 @@ '-fmessage-length=0', # be nice to Eclipse ] cflags += [ - '-Wmissing-prototypes', - '-std=gnu99', + '-Werror=implicit-function-declaration', + '-Werror=missing-prototypes', + '-Werror=return-type', + '-Werror=incompatible-pointer-types', ] + if platform == 'darwin' and host_platform.mac_ver()[0] >= '10.15': + cflags += ['-std=gnu11'] + else: + cflags += ['-std=gnu99'] if icc: cflags += [ '-std=gnu99', diff -Nru mesa-19.2.8/scons/llvm.py mesa-20.0.8/scons/llvm.py --- mesa-19.2.8/scons/llvm.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/scons/llvm.py 2020-06-12 01:21:16.000000000 +0000 @@ -38,7 +38,7 @@ import SCons.Util -required_llvm_version = '3.3' +required_llvm_version = '3.9' def generate(env): @@ -108,9 +108,10 @@ env.AppendUnique(CXXFLAGS = ['-posix']) # LIBS should match the output of `llvm-config --libs engine mcjit bitwriter x86asmprinter irreader` for LLVM<=7.0 - # and `llvm-config --libs engine irreader` for LLVM>=8.0 - # LLVMAggressiveInstCombine library part of engine component can be safely omitted as it's not used. - if llvm_version >= distutils.version.LooseVersion('9.0'): + # and `llvm-config --libs engine coroutines` for LLVM>=8.0 + # LLVMAggressiveInstCombine library part of engine component since LLVM 6 is only needed by Mesa3D for LLVM>=8. + # While not directly needed by Mesa3D, this library is needed by LLVMipo which is part of coroutines component. + if llvm_version >= distutils.version.LooseVersion('10.0'): env.Prepend(LIBS = [ 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', @@ -129,8 +130,11 @@ 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF', 'LLVMBinaryFormat', 'LLVMRemarks', 'LLVMBitstreamReader', 'LLVMDebugInfoDWARF', + 'LLVMAggressiveInstCombine','LLVMLinker', 'LLVMVectorize', + 'LLVMInstrumentation', 'LLVMipo', 'LLVMCoroutines', + 'LLVMCFGuard', 'LLVMTextAPI', ]) - elif llvm_version >= distutils.version.LooseVersion('5.0'): + elif llvm_version >= distutils.version.LooseVersion('9.0'): env.Prepend(LIBS = [ 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', @@ -139,7 +143,7 @@ 'LLVMTransformUtils', 'LLVMBitWriter', 'LLVMX86Desc', 'LLVMMCDisassembler', 'LLVMX86Info', - 'LLVMX86AsmPrinter', 'LLVMX86Utils', + 'LLVMX86Utils', 'LLVMMCJIT', 'LLVMExecutionEngine', 'LLVMTarget', 'LLVMAnalysis', 'LLVMProfileData', 'LLVMRuntimeDyld', 'LLVMObject', 'LLVMMCParser', @@ -148,8 +152,12 @@ 'LLVMIRReader', 'LLVMAsmParser', 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF', 'LLVMBinaryFormat', + 'LLVMRemarks', 'LLVMBitstreamReader', 'LLVMDebugInfoDWARF', + # Add these libraries to enable ompute shaders support. + 'LLVMAggressiveInstCombine','LLVMLinker', 'LLVMVectorize', + 'LLVMInstrumentation', 'LLVMipo', 'LLVMCoroutines', ]) - elif llvm_version >= distutils.version.LooseVersion('4.0'): + elif llvm_version >= distutils.version.LooseVersion('8.0'): env.Prepend(LIBS = [ 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', @@ -166,14 +174,18 @@ 'LLVMSupport', 'LLVMIRReader', 'LLVMAsmParser', 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF', + 'LLVMBinaryFormat', + # Add these libraries to enable ompute shaders support. + 'LLVMAggressiveInstCombine', 'LLVMLinker', 'LLVMVectorize', + 'LLVMInstrumentation', 'LLVMipo', 'LLVMCoroutines', ]) - elif llvm_version >= distutils.version.LooseVersion('3.9'): + elif llvm_version >= distutils.version.LooseVersion('5.0'): env.Prepend(LIBS = [ 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', 'LLVMDebugInfoCodeView', 'LLVMCodeGen', 'LLVMScalarOpts', 'LLVMInstCombine', - 'LLVMInstrumentation', 'LLVMTransformUtils', + 'LLVMTransformUtils', 'LLVMBitWriter', 'LLVMX86Desc', 'LLVMMCDisassembler', 'LLVMX86Info', 'LLVMX86AsmPrinter', 'LLVMX86Utils', @@ -182,56 +194,44 @@ 'LLVMRuntimeDyld', 'LLVMObject', 'LLVMMCParser', 'LLVMBitReader', 'LLVMMC', 'LLVMCore', 'LLVMSupport', - 'LLVMIRReader', 'LLVMASMParser' + 'LLVMIRReader', 'LLVMAsmParser', + 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF', + 'LLVMBinaryFormat', ]) - elif llvm_version >= distutils.version.LooseVersion('3.7'): + elif llvm_version >= distutils.version.LooseVersion('4.0'): env.Prepend(LIBS = [ - 'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser', + 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', - 'LLVMCodeGen', 'LLVMScalarOpts', 'LLVMProfileData', - 'LLVMInstCombine', 'LLVMInstrumentation', 'LLVMTransformUtils', 'LLVMipa', - 'LLVMAnalysis', 'LLVMX86Desc', 'LLVMMCDisassembler', - 'LLVMX86Info', 'LLVMX86AsmPrinter', 'LLVMX86Utils', - 'LLVMMCJIT', 'LLVMTarget', 'LLVMExecutionEngine', + 'LLVMDebugInfoCodeView', 'LLVMCodeGen', + 'LLVMScalarOpts', 'LLVMInstCombine', + 'LLVMTransformUtils', + 'LLVMBitWriter', 'LLVMX86Desc', + 'LLVMMCDisassembler', 'LLVMX86Info', + 'LLVMX86AsmPrinter', 'LLVMX86Utils', + 'LLVMMCJIT', 'LLVMExecutionEngine', 'LLVMTarget', + 'LLVMAnalysis', 'LLVMProfileData', 'LLVMRuntimeDyld', 'LLVMObject', 'LLVMMCParser', - 'LLVMBitReader', 'LLVMMC', 'LLVMCore', 'LLVMSupport' + 'LLVMBitReader', 'LLVMMC', 'LLVMCore', + 'LLVMSupport', + 'LLVMIRReader', 'LLVMAsmParser', + 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF', ]) - elif llvm_version >= distutils.version.LooseVersion('3.6'): + else: env.Prepend(LIBS = [ - 'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser', + 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', 'LLVMSelectionDAG', 'LLVMAsmPrinter', - 'LLVMCodeGen', 'LLVMScalarOpts', 'LLVMProfileData', - 'LLVMInstCombine', 'LLVMTransformUtils', 'LLVMipa', - 'LLVMAnalysis', 'LLVMX86Desc', 'LLVMMCDisassembler', - 'LLVMX86Info', 'LLVMX86AsmPrinter', 'LLVMX86Utils', - 'LLVMMCJIT', 'LLVMTarget', 'LLVMExecutionEngine', + 'LLVMDebugInfoCodeView', 'LLVMCodeGen', + 'LLVMScalarOpts', 'LLVMInstCombine', + 'LLVMInstrumentation', 'LLVMTransformUtils', + 'LLVMBitWriter', 'LLVMX86Desc', + 'LLVMMCDisassembler', 'LLVMX86Info', + 'LLVMX86AsmPrinter', 'LLVMX86Utils', + 'LLVMMCJIT', 'LLVMExecutionEngine', 'LLVMTarget', + 'LLVMAnalysis', 'LLVMProfileData', 'LLVMRuntimeDyld', 'LLVMObject', 'LLVMMCParser', - 'LLVMBitReader', 'LLVMMC', 'LLVMCore', 'LLVMSupport' - ]) - elif llvm_version >= distutils.version.LooseVersion('3.5'): - env.Prepend(LIBS = [ - 'LLVMMCDisassembler', - 'LLVMBitWriter', 'LLVMMCJIT', 'LLVMRuntimeDyld', - 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen', - 'LLVMSelectionDAG', 'LLVMAsmPrinter', 'LLVMX86Desc', - 'LLVMObject', 'LLVMMCParser', 'LLVMBitReader', 'LLVMX86Info', - 'LLVMX86AsmPrinter', 'LLVMX86Utils', 'LLVMJIT', - 'LLVMExecutionEngine', 'LLVMCodeGen', 'LLVMScalarOpts', - 'LLVMInstCombine', 'LLVMTransformUtils', 'LLVMipa', - 'LLVMAnalysis', 'LLVMTarget', 'LLVMMC', 'LLVMCore', - 'LLVMSupport' - ]) - else: - env.Prepend(LIBS = [ - 'LLVMMCDisassembler', - 'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser', - 'LLVMX86CodeGen', 'LLVMX86Desc', 'LLVMSelectionDAG', - 'LLVMAsmPrinter', 'LLVMMCParser', 'LLVMX86AsmPrinter', - 'LLVMX86Utils', 'LLVMX86Info', 'LLVMMCJIT', 'LLVMJIT', - 'LLVMExecutionEngine', 'LLVMCodeGen', 'LLVMScalarOpts', - 'LLVMInstCombine', 'LLVMTransformUtils', 'LLVMipa', - 'LLVMAnalysis', 'LLVMTarget', 'LLVMMC', 'LLVMCore', - 'LLVMSupport', 'LLVMRuntimeDyld', 'LLVMObject' + 'LLVMBitReader', 'LLVMMC', 'LLVMCore', + 'LLVMSupport', + 'LLVMIRReader', 'LLVMASMParser' ]) env.Append(LIBS = [ 'imagehlp', @@ -296,11 +296,13 @@ else: components = ['engine', 'mcjit', 'bitwriter', 'mcdisassembler', 'irreader'] + if llvm_version >= distutils.version.LooseVersion('8.0'): + components.append('coroutines') + env.ParseConfig('%s --libs ' % llvm_config + ' '.join(components)) env.ParseConfig('%s --ldflags' % llvm_config) - if llvm_version >= distutils.version.LooseVersion('3.5'): - env.ParseConfig('%s --system-libs' % llvm_config) - env.Append(CXXFLAGS = ['-std=c++14']) + env.ParseConfig('%s --system-libs' % llvm_config) + env.Append(CXXFLAGS = ['-std=c++14']) except OSError: print('scons: llvm-config version %s failed' % llvm_version) return @@ -311,11 +313,9 @@ print('scons: Found LLVM version %s' % llvm_version) env['LLVM_VERSION'] = llvm_version - # Define HAVE_LLVM macro with the major/minor version number (e.g., 0x0206 for 2.6) - llvm_version_major = int(llvm_version.version[0]) - llvm_version_minor = int(llvm_version.version[1]) - llvm_version_hex = '0x%02x%02x' % (llvm_version_major, llvm_version_minor) - env.Prepend(CPPDEFINES = [('HAVE_LLVM', llvm_version_hex)]) + # Define LLVM_AVAILABLE macro to guard code blocks, and MESA_LLVM_VERSION_STRING + env.Prepend(CPPDEFINES = [('LLVM_AVAILABLE', 1)]) + env.Prepend(CPPDEFINES = [('MESA_LLVM_VERSION_STRING=\\"%s\\"' % llvm_version)]) def exists(env): return True diff -Nru mesa-19.2.8/SConstruct mesa-20.0.8/SConstruct --- mesa-19.2.8/SConstruct 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/SConstruct 2020-06-12 01:21:16.000000000 +0000 @@ -20,6 +20,7 @@ # to get the full list of options. See scons manpage for more info. # +from __future__ import print_function import os import os.path import sys @@ -66,6 +67,26 @@ Help(opts.GenerateHelpText(env)) + +####################################################################### +# Print a deprecation warning for using scons on non-windows + +if common.host_platform != 'windows' and env['platform'] != 'windows': + if env['force_scons']: + print("WARNING: Scons is deprecated for non-windows platforms (including cygwin) " + "please use meson instead.", file=sys.stderr) + else: + print("ERROR: Scons is deprecated for non-windows platforms (including cygwin) " + "please use meson instead. If you really need to use scons you " + "can add `force_scons=1` to the scons command line.", file=sys.stderr) + sys.exit(1) +else: + print("WARNING: Scons support is in the process of being deprecated on " + "on windows platforms (including mingw). If you haven't already " + "please try using meson for windows builds. Be sure to report any " + "issues you run into", file=sys.stderr) + + ####################################################################### # Environment setup @@ -73,7 +94,7 @@ mesa_version = f.read().strip() env.Append(CPPDEFINES = [ ('PACKAGE_VERSION', '\\"%s\\"' % mesa_version), - ('PACKAGE_BUGREPORT', '\\"https://gitlab.freedesktop.org/mesa/mesa/issues\\"'), + ('PACKAGE_BUGREPORT', '\\"https://gitlab.freedesktop.org/mesa/mesa/-/issues\\"'), ]) # Includes diff -Nru mesa-19.2.8/scripts/get_reviewer.pl mesa-20.0.8/scripts/get_reviewer.pl --- mesa-19.2.8/scripts/get_reviewer.pl 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/scripts/get_reviewer.pl 1970-01-01 00:00:00.000000000 +0000 @@ -1,2302 +0,0 @@ -#!/usr/bin/env perl -# (c) 2007, Joe Perches -# created from checkpatch.pl -# -# Print selected REVIEWERS information for -# the files modified in a patch or for a file -# -# usage: perl scripts/get_reviewer.pl [OPTIONS] -# perl scripts/get_reviewer.pl [OPTIONS] -f -# -# A minimally modified version of get_maintainer.pl from the -# Linux source tree, adapted for use in mesa. -# -# Licensed under the terms of the GNU GPL License version 2 - -use strict; -use warnings; - -my $P = $0; -my $V = '0.26'; - -use Getopt::Long qw(:config no_auto_abbrev); -use Cwd; - -my $cur_path = fastgetcwd() . '/'; -my $lk_path = "./"; -my $email = 1; -my $email_usename = 1; -my $email_maintainer = 1; -my $email_reviewer = 1; -my $email_list = 1; -my $email_subscriber_list = 0; -my $email_git_penguin_chiefs = 0; -my $email_git = 0; -my $email_git_all_signature_types = 0; -my $email_git_blame = 0; -my $email_git_blame_signatures = 1; -my $email_git_fallback = 1; -my $email_git_min_signatures = 1; -my $email_git_max_maintainers = 5; -my $email_git_min_percent = 15; -my $email_git_since = "1-year-ago"; -my $email_hg_since = "-365"; -my $interactive = 0; -my $email_remove_duplicates = 1; -my $email_use_mailmap = 1; -my $output_multiline = 1; -my $output_separator = ", "; -my $output_roles = 0; -my $output_rolestats = 1; -my $output_section_maxlen = 50; -my $scm = 0; -my $web = 0; -my $subsystem = 0; -my $status = 0; -my $keywords = 1; -my $sections = 0; -my $file_emails = 0; -my $from_filename = 0; -my $pattern_depth = 0; -my $version = 0; -my $help = 0; - -my $vcs_used = 0; - -my $exit = 0; - -my %commit_author_hash; -my %commit_signer_hash; - -my @penguin_chief = (); -#push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org"); -#Andrew wants in on most everything - 2009/01/14 -#push(@penguin_chief, "Andrew Morton:akpm\@linux-foundation.org"); - -my @penguin_chief_names = (); -foreach my $chief (@penguin_chief) { - if ($chief =~ m/^(.*):(.*)/) { - my $chief_name = $1; - my $chief_addr = $2; - push(@penguin_chief_names, $chief_name); - } -} -my $penguin_chiefs = "\(" . join("|", @penguin_chief_names) . "\)"; - -# Signature types of people who are either -# a) responsible for the code in question, or -# b) familiar enough with it to give relevant feedback -my @signature_tags = (); -push(@signature_tags, "Signed-off-by:"); -push(@signature_tags, "Reviewed-by:"); -push(@signature_tags, "Acked-by:"); - -my $signature_pattern = "\(" . join("|", @signature_tags) . "\)"; - -# rfc822 email address - preloaded methods go here. -my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; -my $rfc822_char = '[\\000-\\377]'; - -# VCS command support: class-like functions and strings - -my %VCS_cmds; - -my %VCS_cmds_git = ( - "execute_cmd" => \&git_execute_cmd, - "available" => '(which("git") ne "") && (-e ".git")', - "find_signers_cmd" => - "git log --no-color --follow --since=\$email_git_since " . - '--numstat --no-merges ' . - '--format="GitCommit: %H%n' . - 'GitAuthor: %an <%ae>%n' . - 'GitDate: %aD%n' . - 'GitSubject: %s%n' . - '%b%n"' . - " -- \$file", - "find_commit_signers_cmd" => - "git log --no-color " . - '--numstat ' . - '--format="GitCommit: %H%n' . - 'GitAuthor: %an <%ae>%n' . - 'GitDate: %aD%n' . - 'GitSubject: %s%n' . - '%b%n"' . - " -1 \$commit", - "find_commit_author_cmd" => - "git log --no-color " . - '--numstat ' . - '--format="GitCommit: %H%n' . - 'GitAuthor: %an <%ae>%n' . - 'GitDate: %aD%n' . - 'GitSubject: %s%n"' . - " -1 \$commit", - "blame_range_cmd" => "git blame -l -L \$diff_start,+\$diff_length \$file", - "blame_file_cmd" => "git blame -l \$file", - "commit_pattern" => "^GitCommit: ([0-9a-f]{40,40})", - "blame_commit_pattern" => "^([0-9a-f]+) ", - "author_pattern" => "^GitAuthor: (.*)", - "subject_pattern" => "^GitSubject: (.*)", - "stat_pattern" => "^(\\d+)\\t(\\d+)\\t\$file\$", -); - -my %VCS_cmds_hg = ( - "execute_cmd" => \&hg_execute_cmd, - "available" => '(which("hg") ne "") && (-d ".hg")', - "find_signers_cmd" => - "hg log --date=\$email_hg_since " . - "--template='HgCommit: {node}\\n" . - "HgAuthor: {author}\\n" . - "HgSubject: {desc}\\n'" . - " -- \$file", - "find_commit_signers_cmd" => - "hg log " . - "--template='HgSubject: {desc}\\n'" . - " -r \$commit", - "find_commit_author_cmd" => - "hg log " . - "--template='HgCommit: {node}\\n" . - "HgAuthor: {author}\\n" . - "HgSubject: {desc|firstline}\\n'" . - " -r \$commit", - "blame_range_cmd" => "", # not supported - "blame_file_cmd" => "hg blame -n \$file", - "commit_pattern" => "^HgCommit: ([0-9a-f]{40,40})", - "blame_commit_pattern" => "^([ 0-9a-f]+):", - "author_pattern" => "^HgAuthor: (.*)", - "subject_pattern" => "^HgSubject: (.*)", - "stat_pattern" => "^(\\d+)\t(\\d+)\t\$file\$", -); - -my $conf = which_conf(".get_maintainer.conf"); -if (-f $conf) { - my @conf_args; - open(my $conffile, '<', "$conf") - or warn "$P: Can't find a readable .get_maintainer.conf file $!\n"; - - while (<$conffile>) { - my $line = $_; - - $line =~ s/\s*\n?$//g; - $line =~ s/^\s*//g; - $line =~ s/\s+/ /g; - - next if ($line =~ m/^\s*#/); - next if ($line =~ m/^\s*$/); - - my @words = split(" ", $line); - foreach my $word (@words) { - last if ($word =~ m/^#/); - push (@conf_args, $word); - } - } - close($conffile); - unshift(@ARGV, @conf_args) if @conf_args; -} - -my @ignore_emails = (); -my $ignore_file = which_conf(".get_maintainer.ignore"); -if (-f $ignore_file) { - open(my $ignore, '<', "$ignore_file") - or warn "$P: Can't find a readable .get_maintainer.ignore file $!\n"; - while (<$ignore>) { - my $line = $_; - - $line =~ s/\s*\n?$//; - $line =~ s/^\s*//; - $line =~ s/\s+$//; - $line =~ s/#.*$//; - - next if ($line =~ m/^\s*$/); - if (rfc822_valid($line)) { - push(@ignore_emails, $line); - } - } - close($ignore); -} - -if (!GetOptions( - 'email!' => \$email, - 'git!' => \$email_git, - 'git-all-signature-types!' => \$email_git_all_signature_types, - 'git-blame!' => \$email_git_blame, - 'git-blame-signatures!' => \$email_git_blame_signatures, - 'git-fallback!' => \$email_git_fallback, - 'git-chief-penguins!' => \$email_git_penguin_chiefs, - 'git-min-signatures=i' => \$email_git_min_signatures, - 'git-max-maintainers=i' => \$email_git_max_maintainers, - 'git-min-percent=i' => \$email_git_min_percent, - 'git-since=s' => \$email_git_since, - 'hg-since=s' => \$email_hg_since, - 'i|interactive!' => \$interactive, - 'remove-duplicates!' => \$email_remove_duplicates, - 'mailmap!' => \$email_use_mailmap, - 'm!' => \$email_maintainer, - 'r!' => \$email_reviewer, - 'n!' => \$email_usename, - 'l!' => \$email_list, - 's!' => \$email_subscriber_list, - 'multiline!' => \$output_multiline, - 'roles!' => \$output_roles, - 'rolestats!' => \$output_rolestats, - 'separator=s' => \$output_separator, - 'subsystem!' => \$subsystem, - 'status!' => \$status, - 'scm!' => \$scm, - 'web!' => \$web, - 'pattern-depth=i' => \$pattern_depth, - 'k|keywords!' => \$keywords, - 'sections!' => \$sections, - 'fe|file-emails!' => \$file_emails, - 'f|file' => \$from_filename, - 'v|version' => \$version, - 'h|help|usage' => \$help, - )) { - die "$P: invalid argument - use --help if necessary\n"; -} - -if ($help != 0) { - usage(); - exit 0; -} - -if ($version != 0) { - print("${P} ${V}\n"); - exit 0; -} - -if (-t STDIN && !@ARGV) { - # We're talking to a terminal, but have no command line arguments. - die "$P: missing patchfile or -f file - use --help if necessary\n"; -} - -$output_multiline = 0 if ($output_separator ne ", "); -$output_rolestats = 1 if ($interactive); -$output_roles = 1 if ($output_rolestats); - -if ($sections) { - $email = 0; - $email_list = 0; - $scm = 0; - $status = 0; - $subsystem = 0; - $web = 0; - $keywords = 0; - $interactive = 0; -} else { - my $selections = $email + $scm + $status + $subsystem + $web; - if ($selections == 0) { - die "$P: Missing required option: email, scm, status, subsystem or web\n"; - } -} - -if ($email && - ($email_maintainer + $email_reviewer + - $email_list + $email_subscriber_list + - $email_git + $email_git_penguin_chiefs + $email_git_blame) == 0) { - die "$P: Please select at least 1 email option\n"; -} - -if (!top_of_mesa_tree($lk_path)) { - die "$P: The current directory does not appear to be " - . "a mesa source tree.\n"; -} - -## Read REVIEWERS for type/value pairs - -my @typevalue = (); -my %keyword_hash; - -open (my $maint, '<', "${lk_path}REVIEWERS") - or die "$P: Can't open REVIEWERS: $!\n"; -while (<$maint>) { - my $line = $_; - - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; - - ##Filename pattern matching - if ($type eq "F" || $type eq "X") { - $value =~ s@\.@\\\.@g; ##Convert . to \. - $value =~ s/\*/\.\*/g; ##Convert * to .* - $value =~ s/\?/\./g; ##Convert ? to . - ##if pattern is a directory and it lacks a trailing slash, add one - if ((-d $value)) { - $value =~ s@([^/])$@$1/@; - } - } elsif ($type eq "K") { - $keyword_hash{@typevalue} = $value; - } - push(@typevalue, "$type:$value"); - } elsif (!/^(\s)*$/) { - $line =~ s/\n$//g; - push(@typevalue, $line); - } -} -close($maint); - - -# -# Read mail address map -# - -my $mailmap; - -read_mailmap(); - -sub read_mailmap { - $mailmap = { - names => {}, - addresses => {} - }; - - return if (!$email_use_mailmap || !(-f "${lk_path}.mailmap")); - - open(my $mailmap_file, '<', "${lk_path}.mailmap") - or warn "$P: Can't open .mailmap: $!\n"; - - while (<$mailmap_file>) { - s/#.*$//; #strip comments - s/^\s+|\s+$//g; #trim - - next if (/^\s*$/); #skip empty lines - #entries have one of the following formats: - # name1 - # - # name1 - # name1 name2 - # (see man git-shortlog) - - if (/^([^<]+)<([^>]+)>$/) { - my $real_name = $1; - my $address = $2; - - $real_name =~ s/\s+$//; - ($real_name, $address) = parse_email("$real_name <$address>"); - $mailmap->{names}->{$address} = $real_name; - - } elsif (/^<([^>]+)>\s*<([^>]+)>$/) { - my $real_address = $1; - my $wrong_address = $2; - - $mailmap->{addresses}->{$wrong_address} = $real_address; - - } elsif (/^(.+)<([^>]+)>\s*<([^>]+)>$/) { - my $real_name = $1; - my $real_address = $2; - my $wrong_address = $3; - - $real_name =~ s/\s+$//; - ($real_name, $real_address) = - parse_email("$real_name <$real_address>"); - $mailmap->{names}->{$wrong_address} = $real_name; - $mailmap->{addresses}->{$wrong_address} = $real_address; - - } elsif (/^(.+)<([^>]+)>\s*(.+)\s*<([^>]+)>$/) { - my $real_name = $1; - my $real_address = $2; - my $wrong_name = $3; - my $wrong_address = $4; - - $real_name =~ s/\s+$//; - ($real_name, $real_address) = - parse_email("$real_name <$real_address>"); - - $wrong_name =~ s/\s+$//; - ($wrong_name, $wrong_address) = - parse_email("$wrong_name <$wrong_address>"); - - my $wrong_email = format_email($wrong_name, $wrong_address, 1); - $mailmap->{names}->{$wrong_email} = $real_name; - $mailmap->{addresses}->{$wrong_email} = $real_address; - } - } - close($mailmap_file); -} - -## use the filenames on the command line or find the filenames in the patchfiles - -my @files = (); -my @range = (); -my @keyword_tvi = (); -my @file_emails = (); - -if (!@ARGV) { - push(@ARGV, "&STDIN"); -} - -foreach my $file (@ARGV) { - if ($file ne "&STDIN") { - ##if $file is a directory and it lacks a trailing slash, add one - if ((-d $file)) { - $file =~ s@([^/])$@$1/@; - } elsif (!(-f $file)) { - die "$P: file '${file}' not found\n"; - } - } - if ($from_filename) { - $file =~ s/^\Q${cur_path}\E//; #strip any absolute path - $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree - push(@files, $file); - if ($file ne "REVIEWERS" && -f $file && ($keywords || $file_emails)) { - open(my $f, '<', $file) - or die "$P: Can't open $file: $!\n"; - my $text = do { local($/) ; <$f> }; - close($f); - if ($keywords) { - foreach my $line (keys %keyword_hash) { - if ($text =~ m/$keyword_hash{$line}/x) { - push(@keyword_tvi, $line); - } - } - } - if ($file_emails) { - my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; - push(@file_emails, clean_file_emails(@poss_addr)); - } - } - } else { - my $file_cnt = @files; - my $lastfile; - - open(my $patch, "< $file") - or die "$P: Can't open $file: $!\n"; - - # We can check arbitrary information before the patch - # like the commit message, mail headers, etc... - # This allows us to match arbitrary keywords against any part - # of a git format-patch generated file (subject tags, etc...) - - my $patch_prefix = ""; #Parsing the intro - - while (<$patch>) { - my $patch_line = $_; - if (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) { - my $filename = $1; - $filename =~ s@^[^/]*/@@; - $filename =~ s@\n@@; - $lastfile = $filename; - push(@files, $filename); - $patch_prefix = "^[+-].*"; #Now parsing the actual patch - } elsif (m/^\@\@ -(\d+),(\d+)/) { - if ($email_git_blame) { - push(@range, "$lastfile:$1:$2"); - } - } elsif ($keywords) { - foreach my $line (keys %keyword_hash) { - if ($patch_line =~ m/${patch_prefix}$keyword_hash{$line}/x) { - push(@keyword_tvi, $line); - } - } - } - } - close($patch); - - if ($file_cnt == @files) { - warn "$P: file '${file}' doesn't appear to be a patch. " - . "Add -f to options?\n"; - } - @files = sort_and_uniq(@files); - } -} - -@file_emails = uniq(@file_emails); - -my %email_hash_name; -my %email_hash_address; -my @email_to = (); -my %hash_list_to; -my @list_to = (); -my @scm = (); -my @web = (); -my @subsystem = (); -my @status = (); -my %deduplicate_name_hash = (); -my %deduplicate_address_hash = (); - -my @maintainers = get_maintainers(); - -if (@maintainers) { - @maintainers = merge_email(@maintainers); - output(@maintainers); -} - -if ($scm) { - @scm = uniq(@scm); - output(@scm); -} - -if ($status) { - @status = uniq(@status); - output(@status); -} - -if ($subsystem) { - @subsystem = uniq(@subsystem); - output(@subsystem); -} - -if ($web) { - @web = uniq(@web); - output(@web); -} - -exit($exit); - -sub ignore_email_address { - my ($address) = @_; - - foreach my $ignore (@ignore_emails) { - return 1 if ($ignore eq $address); - } - - return 0; -} - -sub range_is_maintained { - my ($start, $end) = @_; - - for (my $i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'S') { - if ($value =~ /(maintain|support)/i) { - return 1; - } - } - } - } - return 0; -} - -sub range_has_maintainer { - my ($start, $end) = @_; - - for (my $i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'M') { - return 1; - } - } - } - return 0; -} - -sub get_maintainers { - %email_hash_name = (); - %email_hash_address = (); - %commit_author_hash = (); - %commit_signer_hash = (); - @email_to = (); - %hash_list_to = (); - @list_to = (); - @scm = (); - @web = (); - @subsystem = (); - @status = (); - %deduplicate_name_hash = (); - %deduplicate_address_hash = (); - if ($email_git_all_signature_types) { - $signature_pattern = "(.+?)[Bb][Yy]:"; - } else { - $signature_pattern = "\(" . join("|", @signature_tags) . "\)"; - } - - # Find responsible parties - - my %exact_pattern_match_hash = (); - - foreach my $file (@files) { - - my %hash; - my $tvi = find_first_section(); - while ($tvi < @typevalue) { - my $start = find_starting_index($tvi); - my $end = find_ending_index($tvi); - my $exclude = 0; - my $i; - - #Do not match excluded file patterns - - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'X') { - if (file_match_pattern($file, $value)) { - $exclude = 1; - last; - } - } - } - } - - if (!$exclude) { - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; - if ($type eq 'F') { - if (file_match_pattern($file, $value)) { - my $value_pd = ($value =~ tr@/@@); - my $file_pd = ($file =~ tr@/@@); - $value_pd++ if (substr($value,-1,1) ne "/"); - $value_pd = -1 if ($value =~ /^\.\*/); - if ($value_pd >= $file_pd && - range_is_maintained($start, $end) && - range_has_maintainer($start, $end)) { - $exact_pattern_match_hash{$file} = 1; - } - if ($pattern_depth == 0 || - (($file_pd - $value_pd) < $pattern_depth)) { - $hash{$tvi} = $value_pd; - } - } - } elsif ($type eq 'N') { - if ($file =~ m/$value/x) { - $hash{$tvi} = 0; - } - } - } - } - } - $tvi = $end + 1; - } - - foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { - add_categories($line); - if ($sections) { - my $i; - my $start = find_starting_index($line); - my $end = find_ending_index($line); - for ($i = $start; $i < $end; $i++) { - my $line = $typevalue[$i]; - if ($line =~ /^[FX]:/) { ##Restore file patterns - $line =~ s/([^\\])\.([^\*])/$1\?$2/g; - $line =~ s/([^\\])\.$/$1\?/g; ##Convert . back to ? - $line =~ s/\\\./\./g; ##Convert \. to . - $line =~ s/\.\*/\*/g; ##Convert .* to * - } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); - } - print("\n"); - } - } - } - - if ($keywords) { - @keyword_tvi = sort_and_uniq(@keyword_tvi); - foreach my $line (@keyword_tvi) { - add_categories($line); - } - } - - foreach my $email (@email_to, @list_to) { - $email->[0] = deduplicate_email($email->[0]); - } - - foreach my $file (@files) { - if ($email && - ($email_git || ($email_git_fallback && - !$exact_pattern_match_hash{$file}))) { - vcs_file_signoffs($file); - } - if ($email && $email_git_blame) { - vcs_file_blame($file); - } - } - - if ($email) { - foreach my $chief (@penguin_chief) { - if ($chief =~ m/^(.*):(.*)/) { - my $email_address; - - $email_address = format_email($1, $2, $email_usename); - if ($email_git_penguin_chiefs) { - push(@email_to, [$email_address, 'chief penguin']); - } else { - @email_to = grep($_->[0] !~ /${email_address}/, @email_to); - } - } - } - - foreach my $email (@file_emails) { - my ($name, $address) = parse_email($email); - - my $tmp_email = format_email($name, $address, $email_usename); - push_email_address($tmp_email, ''); - add_role($tmp_email, 'in file'); - } - } - - my @to = (); - if ($email || $email_list) { - if ($email) { - @to = (@to, @email_to); - } - if ($email_list) { - @to = (@to, @list_to); - } - } - - if ($interactive) { - @to = interactive_get_maintainers(\@to); - } - - return @to; -} - -sub file_match_pattern { - my ($file, $pattern) = @_; - if (substr($pattern, -1) eq "/") { - if ($file =~ m@^$pattern@) { - return 1; - } - } else { - if ($file =~ m@^$pattern@) { - my $s1 = ($file =~ tr@/@@); - my $s2 = ($pattern =~ tr@/@@); - if ($s1 == $s2) { - return 1; - } - } - } - return 0; -} - -sub usage { - print < print email address(es) if any - --git => include recent git \*-by: signers - --git-all-signature-types => include signers regardless of signature type - or use only ${signature_pattern} signers (default: $email_git_all_signature_types) - --git-fallback => use git when no exact REVIEWERS pattern (default: $email_git_fallback) - --git-chief-penguins => include ${penguin_chiefs} - --git-min-signatures => number of signatures required (default: $email_git_min_signatures) - --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers) - --git-min-percent => minimum percentage of commits required (default: $email_git_min_percent) - --git-blame => use git blame to find modified commits for patch or file - --git-blame-signatures => when used with --git-blame, also include all commit signers - --git-since => git history to use (default: $email_git_since) - --hg-since => hg history to use (default: $email_hg_since) - --interactive => display a menu (mostly useful if used with the --git option) - --m => include maintainer(s) if any - --r => include reviewer(s) if any - --n => include name 'Full Name ' - --l => include list(s) if any - --s => include subscriber only list(s) if any - --remove-duplicates => minimize duplicate email names/addresses - --roles => show roles (status:subsystem, git-signer, list, etc...) - --rolestats => show roles and statistics (commits/total_commits, %) - --file-emails => add email addresses found in -f file (default: 0 (off)) - --scm => print SCM tree(s) if any - --status => print status if any - --subsystem => print subsystem name if any - --web => print website(s) if any - -Output type options: - --separator [, ] => separator for multiple entries on 1 line - using --separator also sets --nomultiline if --separator is not [, ] - --multiline => print 1 entry per line - -Other options: - --pattern-depth => Number of pattern directory traversals (default: 0 (all)) - --keywords => scan patch for keywords (default: $keywords) - --sections => print all of the subsystem sections with pattern matches - --mailmap => use .mailmap file (default: $email_use_mailmap) - --version => show version - --help => show this help information - -Default options: - [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0 - --remove-duplicates --rolestats] - -Notes: - Using "-f directory" may give unexpected results: - Used with "--git", git signators for _all_ files in and below - directory are examined as git recurses directories. - Any specified X: (exclude) pattern matches are _not_ ignored. - Used with "--nogit", directory is used as a pattern match, - no individual file within the directory or subdirectory - is matched. - Used with "--git-blame", does not iterate all files in directory - Using "--git-blame" is slow and may add old committers and authors - that are no longer active maintainers to the output. - Using "--roles" or "--rolestats" with git send-email --cc-cmd or any - other automated tools that expect only ["name"] - may not work because of additional output after . - Using "--rolestats" and "--git-blame" shows the #/total=% commits, - not the percentage of the entire file authored. # of commits is - not a good measure of amount of code authored. 1 major commit may - contain a thousand lines, 5 trivial commits may modify a single line. - If git is not installed, but mercurial (hg) is installed and an .hg - repository exists, the following options apply to mercurial: - --git, - --git-min-signatures, --git-max-maintainers, --git-min-percent, and - --git-blame - Use --hg-since not --git-since to control date selection - File ".get_maintainer.conf", if it exists in the linux kernel source root - directory, can change whatever get_maintainer defaults are desired. - Entries in this file can be any command line argument. - This file is prepended to any additional command line arguments. - Multiple lines and # comments are allowed. - Most options have both positive and negative forms. - The negative forms for -- are --no and --no-. - -EOT -} - -sub top_of_mesa_tree { - my ($lk_path) = @_; - - if ($lk_path ne "" && substr($lk_path,length($lk_path)-1,1) ne "/") { - $lk_path .= "/"; - } - if ( (-f "${lk_path}docs/mesa.css") - && (-f "${lk_path}docs/features.txt") - && (-f "${lk_path}src/mesa/main/version.c") - && (-f "${lk_path}REVIEWERS") - && (-d "${lk_path}scripts")) { - return 1; - } - return 0; -} - -sub parse_email { - my ($formatted_email) = @_; - - my $name = ""; - my $address = ""; - - if ($formatted_email =~ /^([^<]+)<(.+\@.*)>.*$/) { - $name = $1; - $address = $2; - } elsif ($formatted_email =~ /^\s*<(.+\@\S*)>.*$/) { - $address = $1; - } elsif ($formatted_email =~ /^(.+\@\S*).*$/) { - $address = $1; - } - - $name =~ s/^\s+|\s+$//g; - $name =~ s/^\"|\"$//g; - $address =~ s/^\s+|\s+$//g; - - if ($name =~ /[^\w \-]/i) { ##has "must quote" chars - $name =~ s/(?"; - } - } else { - $formatted_email = $address; - } - - return $formatted_email; -} - -sub find_first_section { - my $index = 0; - - while ($index < @typevalue) { - my $tv = $typevalue[$index]; - if (($tv =~ m/^([A-Z]):\s*(.*)/)) { - last; - } - $index++; - } - - return $index; -} - -sub find_starting_index { - my ($index) = @_; - - while ($index > 0) { - my $tv = $typevalue[$index]; - if (!($tv =~ m/^([A-Z]):\s*(.*)/)) { - last; - } - $index--; - } - - return $index; -} - -sub find_ending_index { - my ($index) = @_; - - while ($index < @typevalue) { - my $tv = $typevalue[$index]; - if (!($tv =~ m/^([A-Z]):\s*(.*)/)) { - last; - } - $index++; - } - - return $index; -} - -sub get_subsystem_name { - my ($index) = @_; - - my $start = find_starting_index($index); - - my $subsystem = $typevalue[$start]; - if ($output_section_maxlen && length($subsystem) > $output_section_maxlen) { - $subsystem = substr($subsystem, 0, $output_section_maxlen - 3); - $subsystem =~ s/\s*$//; - $subsystem = $subsystem . "..."; - } - return $subsystem; -} - -sub get_maintainer_role { - my ($index) = @_; - - my $i; - my $start = find_starting_index($index); - my $end = find_ending_index($index); - - my $role = "unknown"; - my $subsystem = get_subsystem_name($index); - - for ($i = $start + 1; $i < $end; $i++) { - my $tv = $typevalue[$i]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - my $ptype = $1; - my $pvalue = $2; - if ($ptype eq "S") { - $role = $pvalue; - } - } - } - - $role = lc($role); - if ($role eq "supported") { - $role = "supporter"; - } elsif ($role eq "maintained") { - $role = "maintainer"; - } elsif ($role eq "odd fixes") { - $role = "odd fixer"; - } elsif ($role eq "orphan") { - $role = "orphan minder"; - } elsif ($role eq "obsolete") { - $role = "obsolete minder"; - } elsif ($role eq "buried alive in reporters") { - $role = "chief penguin"; - } - - return $role . ":" . $subsystem; -} - -sub get_list_role { - my ($index) = @_; - - my $subsystem = get_subsystem_name($index); - - if ($subsystem eq "THE REST") { - $subsystem = ""; - } - - return $subsystem; -} - -sub add_categories { - my ($index) = @_; - - my $i; - my $start = find_starting_index($index); - my $end = find_ending_index($index); - - push(@subsystem, $typevalue[$start]); - - for ($i = $start + 1; $i < $end; $i++) { - my $tv = $typevalue[$i]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - my $ptype = $1; - my $pvalue = $2; - if ($ptype eq "L") { - my $list_address = $pvalue; - my $list_additional = ""; - my $list_role = get_list_role($i); - - if ($list_role ne "") { - $list_role = ":" . $list_role; - } - if ($list_address =~ m/([^\s]+)\s+(.*)$/) { - $list_address = $1; - $list_additional = $2; - } - if ($list_additional =~ m/subscribers-only/) { - if ($email_subscriber_list) { - if (!$hash_list_to{lc($list_address)}) { - $hash_list_to{lc($list_address)} = 1; - push(@list_to, [$list_address, - "subscriber list${list_role}"]); - } - } - } else { - if ($email_list) { - if (!$hash_list_to{lc($list_address)}) { - $hash_list_to{lc($list_address)} = 1; - if ($list_additional =~ m/moderated/) { - push(@list_to, [$list_address, - "moderated list${list_role}"]); - } else { - push(@list_to, [$list_address, - "open list${list_role}"]); - } - } - } - } - } elsif ($ptype eq "M") { - my ($name, $address) = parse_email($pvalue); - if ($name eq "") { - if ($i > 0) { - my $tv = $typevalue[$i - 1]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - if ($1 eq "P") { - $name = $2; - $pvalue = format_email($name, $address, $email_usename); - } - } - } - } - if ($email_maintainer) { - my $role = get_maintainer_role($i); - push_email_addresses($pvalue, $role); - } - } elsif ($ptype eq "R") { - my ($name, $address) = parse_email($pvalue); - if ($name eq "") { - if ($i > 0) { - my $tv = $typevalue[$i - 1]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - if ($1 eq "P") { - $name = $2; - $pvalue = format_email($name, $address, $email_usename); - } - } - } - } - if ($email_reviewer) { - my $subsystem = get_subsystem_name($i); - push_email_addresses($pvalue, "reviewer:$subsystem"); - } - } elsif ($ptype eq "T") { - push(@scm, $pvalue); - } elsif ($ptype eq "W") { - push(@web, $pvalue); - } elsif ($ptype eq "S") { - push(@status, $pvalue); - } - } - } -} - -sub email_inuse { - my ($name, $address) = @_; - - return 1 if (($name eq "") && ($address eq "")); - return 1 if (($name ne "") && exists($email_hash_name{lc($name)})); - return 1 if (($address ne "") && exists($email_hash_address{lc($address)})); - - return 0; -} - -sub push_email_address { - my ($line, $role) = @_; - - my ($name, $address) = parse_email($line); - - if ($address eq "") { - return 0; - } - - if (!$email_remove_duplicates) { - push(@email_to, [format_email($name, $address, $email_usename), $role]); - } elsif (!email_inuse($name, $address)) { - push(@email_to, [format_email($name, $address, $email_usename), $role]); - $email_hash_name{lc($name)}++ if ($name ne ""); - $email_hash_address{lc($address)}++; - } - - return 1; -} - -sub push_email_addresses { - my ($address, $role) = @_; - - my @address_list = (); - - if (rfc822_valid($address)) { - push_email_address($address, $role); - } elsif (@address_list = rfc822_validlist($address)) { - my $array_count = shift(@address_list); - while (my $entry = shift(@address_list)) { - push_email_address($entry, $role); - } - } else { - if (!push_email_address($address, $role)) { - warn("Invalid REVIEWERS address: '" . $address . "'\n"); - } - } -} - -sub add_role { - my ($line, $role) = @_; - - my ($name, $address) = parse_email($line); - my $email = format_email($name, $address, $email_usename); - - foreach my $entry (@email_to) { - if ($email_remove_duplicates) { - my ($entry_name, $entry_address) = parse_email($entry->[0]); - if (($name eq $entry_name || $address eq $entry_address) - && ($role eq "" || !($entry->[1] =~ m/$role/)) - ) { - if ($entry->[1] eq "") { - $entry->[1] = "$role"; - } else { - $entry->[1] = "$entry->[1],$role"; - } - } - } else { - if ($email eq $entry->[0] - && ($role eq "" || !($entry->[1] =~ m/$role/)) - ) { - if ($entry->[1] eq "") { - $entry->[1] = "$role"; - } else { - $entry->[1] = "$entry->[1],$role"; - } - } - } - } -} - -sub which { - my ($bin) = @_; - - foreach my $path (split(/:/, $ENV{PATH})) { - if (-e "$path/$bin") { - return "$path/$bin"; - } - } - - return ""; -} - -sub which_conf { - my ($conf) = @_; - - foreach my $path (split(/:/, ".:$ENV{HOME}:.scripts")) { - if (-e "$path/$conf") { - return "$path/$conf"; - } - } - - return ""; -} - -sub mailmap_email { - my ($line) = @_; - - my ($name, $address) = parse_email($line); - my $email = format_email($name, $address, 1); - my $real_name = $name; - my $real_address = $address; - - if (exists $mailmap->{names}->{$email} || - exists $mailmap->{addresses}->{$email}) { - if (exists $mailmap->{names}->{$email}) { - $real_name = $mailmap->{names}->{$email}; - } - if (exists $mailmap->{addresses}->{$email}) { - $real_address = $mailmap->{addresses}->{$email}; - } - } else { - if (exists $mailmap->{names}->{$address}) { - $real_name = $mailmap->{names}->{$address}; - } - if (exists $mailmap->{addresses}->{$address}) { - $real_address = $mailmap->{addresses}->{$address}; - } - } - return format_email($real_name, $real_address, 1); -} - -sub mailmap { - my (@addresses) = @_; - - my @mapped_emails = (); - foreach my $line (@addresses) { - push(@mapped_emails, mailmap_email($line)); - } - merge_by_realname(@mapped_emails) if ($email_use_mailmap); - return @mapped_emails; -} - -sub merge_by_realname { - my %address_map; - my (@emails) = @_; - - foreach my $email (@emails) { - my ($name, $address) = parse_email($email); - if (exists $address_map{$name}) { - $address = $address_map{$name}; - $email = format_email($name, $address, 1); - } else { - $address_map{$name} = $address; - } - } -} - -sub git_execute_cmd { - my ($cmd) = @_; - my @lines = (); - - my $output = `$cmd`; - $output =~ s/^\s*//gm; - @lines = split("\n", $output); - - return @lines; -} - -sub hg_execute_cmd { - my ($cmd) = @_; - my @lines = (); - - my $output = `$cmd`; - @lines = split("\n", $output); - - return @lines; -} - -sub extract_formatted_signatures { - my (@signature_lines) = @_; - - my @type = @signature_lines; - - s/\s*(.*):.*/$1/ for (@type); - - # cut -f2- -d":" - s/\s*.*:\s*(.+)\s*/$1/ for (@signature_lines); - -## Reformat email addresses (with names) to avoid badly written signatures - - foreach my $signer (@signature_lines) { - $signer = deduplicate_email($signer); - } - - return (\@type, \@signature_lines); -} - -sub vcs_find_signers { - my ($cmd, $file) = @_; - my $commits; - my @lines = (); - my @signatures = (); - my @authors = (); - my @stats = (); - - @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - - my $pattern = $VCS_cmds{"commit_pattern"}; - my $author_pattern = $VCS_cmds{"author_pattern"}; - my $stat_pattern = $VCS_cmds{"stat_pattern"}; - - $stat_pattern =~ s/(\$\w+)/$1/eeg; #interpolate $stat_pattern - - $commits = grep(/$pattern/, @lines); # of commits - - @authors = grep(/$author_pattern/, @lines); - @signatures = grep(/^[ \t]*${signature_pattern}.*\@.*$/, @lines); - @stats = grep(/$stat_pattern/, @lines); - -# print("stats: <@stats>\n"); - - return (0, \@signatures, \@authors, \@stats) if !@signatures; - - save_commits_by_author(@lines) if ($interactive); - save_commits_by_signer(@lines) if ($interactive); - - if (!$email_git_penguin_chiefs) { - @signatures = grep(!/${penguin_chiefs}/i, @signatures); - } - - my ($author_ref, $authors_ref) = extract_formatted_signatures(@authors); - my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); - - return ($commits, $signers_ref, $authors_ref, \@stats); -} - -sub vcs_find_author { - my ($cmd) = @_; - my @lines = (); - - @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - - if (!$email_git_penguin_chiefs) { - @lines = grep(!/${penguin_chiefs}/i, @lines); - } - - return @lines if !@lines; - - my @authors = (); - foreach my $line (@lines) { - if ($line =~ m/$VCS_cmds{"author_pattern"}/) { - my $author = $1; - my ($name, $address) = parse_email($author); - $author = format_email($name, $address, 1); - push(@authors, $author); - } - } - - save_commits_by_author(@lines) if ($interactive); - save_commits_by_signer(@lines) if ($interactive); - - return @authors; -} - -sub vcs_save_commits { - my ($cmd) = @_; - my @lines = (); - my @commits = (); - - @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - - foreach my $line (@lines) { - if ($line =~ m/$VCS_cmds{"blame_commit_pattern"}/) { - push(@commits, $1); - } - } - - return @commits; -} - -sub vcs_blame { - my ($file) = @_; - my $cmd; - my @commits = (); - - return @commits if (!(-f $file)); - - if (@range && $VCS_cmds{"blame_range_cmd"} eq "") { - my @all_commits = (); - - $cmd = $VCS_cmds{"blame_file_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - @all_commits = vcs_save_commits($cmd); - - foreach my $file_range_diff (@range) { - next if (!($file_range_diff =~ m/(.+):(.+):(.+)/)); - my $diff_file = $1; - my $diff_start = $2; - my $diff_length = $3; - next if ("$file" ne "$diff_file"); - for (my $i = $diff_start; $i < $diff_start + $diff_length; $i++) { - push(@commits, $all_commits[$i]); - } - } - } elsif (@range) { - foreach my $file_range_diff (@range) { - next if (!($file_range_diff =~ m/(.+):(.+):(.+)/)); - my $diff_file = $1; - my $diff_start = $2; - my $diff_length = $3; - next if ("$file" ne "$diff_file"); - $cmd = $VCS_cmds{"blame_range_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - push(@commits, vcs_save_commits($cmd)); - } - } else { - $cmd = $VCS_cmds{"blame_file_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - @commits = vcs_save_commits($cmd); - } - - foreach my $commit (@commits) { - $commit =~ s/^\^//g; - } - - return @commits; -} - -my $printed_novcs = 0; -sub vcs_exists { - %VCS_cmds = %VCS_cmds_git; - return 1 if eval $VCS_cmds{"available"}; - %VCS_cmds = %VCS_cmds_hg; - return 2 if eval $VCS_cmds{"available"}; - %VCS_cmds = (); - if (!$printed_novcs) { - warn("$P: No supported VCS found. Add --nogit to options?\n"); - warn("Using a git repository produces better results.\n"); - $printed_novcs = 1; - } - return 0; -} - -sub vcs_is_git { - vcs_exists(); - return $vcs_used == 1; -} - -sub vcs_is_hg { - return $vcs_used == 2; -} - -sub interactive_get_maintainers { - my ($list_ref) = @_; - my @list = @$list_ref; - - vcs_exists(); - - my %selected; - my %authored; - my %signed; - my $count = 0; - my $maintained = 0; - foreach my $entry (@list) { - $maintained = 1 if ($entry->[1] =~ /^(maintainer|supporter)/i); - $selected{$count} = 1; - $authored{$count} = 0; - $signed{$count} = 0; - $count++; - } - - #menu loop - my $done = 0; - my $print_options = 0; - my $redraw = 1; - while (!$done) { - $count = 0; - if ($redraw) { - printf STDERR "\n%1s %2s %-65s", - "*", "#", "email/list and role:stats"; - if ($email_git || - ($email_git_fallback && !$maintained) || - $email_git_blame) { - print STDERR "auth sign"; - } - print STDERR "\n"; - foreach my $entry (@list) { - my $email = $entry->[0]; - my $role = $entry->[1]; - my $sel = ""; - $sel = "*" if ($selected{$count}); - my $commit_author = $commit_author_hash{$email}; - my $commit_signer = $commit_signer_hash{$email}; - my $authored = 0; - my $signed = 0; - $authored++ for (@{$commit_author}); - $signed++ for (@{$commit_signer}); - printf STDERR "%1s %2d %-65s", $sel, $count + 1, $email; - printf STDERR "%4d %4d", $authored, $signed - if ($authored > 0 || $signed > 0); - printf STDERR "\n %s\n", $role; - if ($authored{$count}) { - my $commit_author = $commit_author_hash{$email}; - foreach my $ref (@{$commit_author}) { - print STDERR " Author: @{$ref}[1]\n"; - } - } - if ($signed{$count}) { - my $commit_signer = $commit_signer_hash{$email}; - foreach my $ref (@{$commit_signer}) { - print STDERR " @{$ref}[2]: @{$ref}[1]\n"; - } - } - - $count++; - } - } - my $date_ref = \$email_git_since; - $date_ref = \$email_hg_since if (vcs_is_hg()); - if ($print_options) { - $print_options = 0; - if (vcs_exists()) { - print STDERR <; - chomp($input); - - $redraw = 1; - my $rerun = 0; - my @wish = split(/[, ]+/, $input); - foreach my $nr (@wish) { - $nr = lc($nr); - my $sel = substr($nr, 0, 1); - my $str = substr($nr, 1); - my $val = 0; - $val = $1 if $str =~ /^(\d+)$/; - - if ($sel eq "y") { - $interactive = 0; - $done = 1; - $output_rolestats = 0; - $output_roles = 0; - last; - } elsif ($nr =~ /^\d+$/ && $nr > 0 && $nr <= $count) { - $selected{$nr - 1} = !$selected{$nr - 1}; - } elsif ($sel eq "*" || $sel eq '^') { - my $toggle = 0; - $toggle = 1 if ($sel eq '*'); - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = $toggle; - } - } elsif ($sel eq "0") { - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = !$selected{$i}; - } - } elsif ($sel eq "t") { - if (lc($str) eq "m") { - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = !$selected{$i} - if ($list[$i]->[1] =~ /^(maintainer|supporter)/i); - } - } elsif (lc($str) eq "g") { - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = !$selected{$i} - if ($list[$i]->[1] =~ /^(author|commit|signer)/i); - } - } elsif (lc($str) eq "l") { - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = !$selected{$i} - if ($list[$i]->[1] =~ /^(open list)/i); - } - } elsif (lc($str) eq "s") { - for (my $i = 0; $i < $count; $i++) { - $selected{$i} = !$selected{$i} - if ($list[$i]->[1] =~ /^(subscriber list)/i); - } - } - } elsif ($sel eq "a") { - if ($val > 0 && $val <= $count) { - $authored{$val - 1} = !$authored{$val - 1}; - } elsif ($str eq '*' || $str eq '^') { - my $toggle = 0; - $toggle = 1 if ($str eq '*'); - for (my $i = 0; $i < $count; $i++) { - $authored{$i} = $toggle; - } - } - } elsif ($sel eq "s") { - if ($val > 0 && $val <= $count) { - $signed{$val - 1} = !$signed{$val - 1}; - } elsif ($str eq '*' || $str eq '^') { - my $toggle = 0; - $toggle = 1 if ($str eq '*'); - for (my $i = 0; $i < $count; $i++) { - $signed{$i} = $toggle; - } - } - } elsif ($sel eq "o") { - $print_options = 1; - $redraw = 1; - } elsif ($sel eq "g") { - if ($str eq "f") { - bool_invert(\$email_git_fallback); - } else { - bool_invert(\$email_git); - } - $rerun = 1; - } elsif ($sel eq "b") { - if ($str eq "s") { - bool_invert(\$email_git_blame_signatures); - } else { - bool_invert(\$email_git_blame); - } - $rerun = 1; - } elsif ($sel eq "c") { - if ($val > 0) { - $email_git_min_signatures = $val; - $rerun = 1; - } - } elsif ($sel eq "x") { - if ($val > 0) { - $email_git_max_maintainers = $val; - $rerun = 1; - } - } elsif ($sel eq "%") { - if ($str ne "" && $val >= 0) { - $email_git_min_percent = $val; - $rerun = 1; - } - } elsif ($sel eq "d") { - if (vcs_is_git()) { - $email_git_since = $str; - } elsif (vcs_is_hg()) { - $email_hg_since = $str; - } - $rerun = 1; - } elsif ($sel eq "t") { - bool_invert(\$email_git_all_signature_types); - $rerun = 1; - } elsif ($sel eq "f") { - bool_invert(\$file_emails); - $rerun = 1; - } elsif ($sel eq "r") { - bool_invert(\$email_remove_duplicates); - $rerun = 1; - } elsif ($sel eq "m") { - bool_invert(\$email_use_mailmap); - read_mailmap(); - $rerun = 1; - } elsif ($sel eq "k") { - bool_invert(\$keywords); - $rerun = 1; - } elsif ($sel eq "p") { - if ($str ne "" && $val >= 0) { - $pattern_depth = $val; - $rerun = 1; - } - } elsif ($sel eq "h" || $sel eq "?") { - print STDERR <[0]; - $address = $deduplicate_name_hash{lc($name)}->[1]; - $matched = 1; - } elsif ($deduplicate_address_hash{lc($address)}) { - $name = $deduplicate_address_hash{lc($address)}->[0]; - $address = $deduplicate_address_hash{lc($address)}->[1]; - $matched = 1; - } - if (!$matched) { - $deduplicate_name_hash{lc($name)} = [ $name, $address ]; - $deduplicate_address_hash{lc($address)} = [ $name, $address ]; - } - $email = format_email($name, $address, 1); - $email = mailmap_email($email); - return $email; -} - -sub save_commits_by_author { - my (@lines) = @_; - - my @authors = (); - my @commits = (); - my @subjects = (); - - foreach my $line (@lines) { - if ($line =~ m/$VCS_cmds{"author_pattern"}/) { - my $author = $1; - $author = deduplicate_email($author); - push(@authors, $author); - } - push(@commits, $1) if ($line =~ m/$VCS_cmds{"commit_pattern"}/); - push(@subjects, $1) if ($line =~ m/$VCS_cmds{"subject_pattern"}/); - } - - for (my $i = 0; $i < @authors; $i++) { - my $exists = 0; - foreach my $ref(@{$commit_author_hash{$authors[$i]}}) { - if (@{$ref}[0] eq $commits[$i] && - @{$ref}[1] eq $subjects[$i]) { - $exists = 1; - last; - } - } - if (!$exists) { - push(@{$commit_author_hash{$authors[$i]}}, - [ ($commits[$i], $subjects[$i]) ]); - } - } -} - -sub save_commits_by_signer { - my (@lines) = @_; - - my $commit = ""; - my $subject = ""; - - foreach my $line (@lines) { - $commit = $1 if ($line =~ m/$VCS_cmds{"commit_pattern"}/); - $subject = $1 if ($line =~ m/$VCS_cmds{"subject_pattern"}/); - if ($line =~ /^[ \t]*${signature_pattern}.*\@.*$/) { - my @signatures = ($line); - my ($types_ref, $signers_ref) = extract_formatted_signatures(@signatures); - my @types = @$types_ref; - my @signers = @$signers_ref; - - my $type = $types[0]; - my $signer = $signers[0]; - - $signer = deduplicate_email($signer); - - my $exists = 0; - foreach my $ref(@{$commit_signer_hash{$signer}}) { - if (@{$ref}[0] eq $commit && - @{$ref}[1] eq $subject && - @{$ref}[2] eq $type) { - $exists = 1; - last; - } - } - if (!$exists) { - push(@{$commit_signer_hash{$signer}}, - [ ($commit, $subject, $type) ]); - } - } - } -} - -sub vcs_assign { - my ($role, $divisor, @lines) = @_; - - my %hash; - my $count = 0; - - return if (@lines <= 0); - - if ($divisor <= 0) { - warn("Bad divisor in " . (caller(0))[3] . ": $divisor\n"); - $divisor = 1; - } - - @lines = mailmap(@lines); - - return if (@lines <= 0); - - @lines = sort(@lines); - - # uniq -c - $hash{$_}++ for @lines; - - # sort -rn - foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { - my $sign_offs = $hash{$line}; - my $percent = $sign_offs * 100 / $divisor; - - $percent = 100 if ($percent > 100); - next if (ignore_email_address($line)); - $count++; - last if ($sign_offs < $email_git_min_signatures || - $count > $email_git_max_maintainers || - $percent < $email_git_min_percent); - push_email_address($line, ''); - if ($output_rolestats) { - my $fmt_percent = sprintf("%.0f", $percent); - add_role($line, "$role:$sign_offs/$divisor=$fmt_percent%"); - } else { - add_role($line, $role); - } - } -} - -sub vcs_file_signoffs { - my ($file) = @_; - - my $authors_ref; - my $signers_ref; - my $stats_ref; - my @authors = (); - my @signers = (); - my @stats = (); - my $commits; - - $vcs_used = vcs_exists(); - return if (!$vcs_used); - - my $cmd = $VCS_cmds{"find_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd - - ($commits, $signers_ref, $authors_ref, $stats_ref) = vcs_find_signers($cmd, $file); - - @signers = @{$signers_ref} if defined $signers_ref; - @authors = @{$authors_ref} if defined $authors_ref; - @stats = @{$stats_ref} if defined $stats_ref; - -# print("commits: <$commits>\nsigners:<@signers>\nauthors: <@authors>\nstats: <@stats>\n"); - - foreach my $signer (@signers) { - $signer = deduplicate_email($signer); - } - - vcs_assign("commit_signer", $commits, @signers); - vcs_assign("authored", $commits, @authors); - if ($#authors == $#stats) { - my $stat_pattern = $VCS_cmds{"stat_pattern"}; - $stat_pattern =~ s/(\$\w+)/$1/eeg; #interpolate $stat_pattern - - my $added = 0; - my $deleted = 0; - for (my $i = 0; $i <= $#stats; $i++) { - if ($stats[$i] =~ /$stat_pattern/) { - $added += $1; - $deleted += $2; - } - } - my @tmp_authors = uniq(@authors); - foreach my $author (@tmp_authors) { - $author = deduplicate_email($author); - } - @tmp_authors = uniq(@tmp_authors); - my @list_added = (); - my @list_deleted = (); - foreach my $author (@tmp_authors) { - my $auth_added = 0; - my $auth_deleted = 0; - for (my $i = 0; $i <= $#stats; $i++) { - if ($author eq deduplicate_email($authors[$i]) && - $stats[$i] =~ /$stat_pattern/) { - $auth_added += $1; - $auth_deleted += $2; - } - } - for (my $i = 0; $i < $auth_added; $i++) { - push(@list_added, $author); - } - for (my $i = 0; $i < $auth_deleted; $i++) { - push(@list_deleted, $author); - } - } - vcs_assign("added_lines", $added, @list_added); - vcs_assign("removed_lines", $deleted, @list_deleted); - } -} - -sub vcs_file_blame { - my ($file) = @_; - - my @signers = (); - my @all_commits = (); - my @commits = (); - my $total_commits; - my $total_lines; - - $vcs_used = vcs_exists(); - return if (!$vcs_used); - - @all_commits = vcs_blame($file); - @commits = uniq(@all_commits); - $total_commits = @commits; - $total_lines = @all_commits; - - if ($email_git_blame_signatures) { - if (vcs_is_hg()) { - my $commit_count; - my $commit_authors_ref; - my $commit_signers_ref; - my $stats_ref; - my @commit_authors = (); - my @commit_signers = (); - my $commit = join(" -r ", @commits); - my $cmd; - - $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd - - ($commit_count, $commit_signers_ref, $commit_authors_ref, $stats_ref) = vcs_find_signers($cmd, $file); - @commit_authors = @{$commit_authors_ref} if defined $commit_authors_ref; - @commit_signers = @{$commit_signers_ref} if defined $commit_signers_ref; - - push(@signers, @commit_signers); - } else { - foreach my $commit (@commits) { - my $commit_count; - my $commit_authors_ref; - my $commit_signers_ref; - my $stats_ref; - my @commit_authors = (); - my @commit_signers = (); - my $cmd; - - $cmd = $VCS_cmds{"find_commit_signers_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd - - ($commit_count, $commit_signers_ref, $commit_authors_ref, $stats_ref) = vcs_find_signers($cmd, $file); - @commit_authors = @{$commit_authors_ref} if defined $commit_authors_ref; - @commit_signers = @{$commit_signers_ref} if defined $commit_signers_ref; - - push(@signers, @commit_signers); - } - } - } - - if ($from_filename) { - if ($output_rolestats) { - my @blame_signers; - if (vcs_is_hg()) {{ # Double brace for last exit - my $commit_count; - my @commit_signers = (); - @commits = uniq(@commits); - @commits = sort(@commits); - my $commit = join(" -r ", @commits); - my $cmd; - - $cmd = $VCS_cmds{"find_commit_author_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd - - my @lines = (); - - @lines = &{$VCS_cmds{"execute_cmd"}}($cmd); - - if (!$email_git_penguin_chiefs) { - @lines = grep(!/${penguin_chiefs}/i, @lines); - } - - last if !@lines; - - my @authors = (); - foreach my $line (@lines) { - if ($line =~ m/$VCS_cmds{"author_pattern"}/) { - my $author = $1; - $author = deduplicate_email($author); - push(@authors, $author); - } - } - - save_commits_by_author(@lines) if ($interactive); - save_commits_by_signer(@lines) if ($interactive); - - push(@signers, @authors); - }} - else { - foreach my $commit (@commits) { - my $i; - my $cmd = $VCS_cmds{"find_commit_author_cmd"}; - $cmd =~ s/(\$\w+)/$1/eeg; #interpolate $cmd - my @author = vcs_find_author($cmd); - next if !@author; - - my $formatted_author = deduplicate_email($author[0]); - - my $count = grep(/$commit/, @all_commits); - for ($i = 0; $i < $count ; $i++) { - push(@blame_signers, $formatted_author); - } - } - } - if (@blame_signers) { - vcs_assign("authored lines", $total_lines, @blame_signers); - } - } - foreach my $signer (@signers) { - $signer = deduplicate_email($signer); - } - vcs_assign("commits", $total_commits, @signers); - } else { - foreach my $signer (@signers) { - $signer = deduplicate_email($signer); - } - vcs_assign("modified commits", $total_commits, @signers); - } -} - -sub uniq { - my (@parms) = @_; - - my %saw; - @parms = grep(!$saw{$_}++, @parms); - return @parms; -} - -sub sort_and_uniq { - my (@parms) = @_; - - my %saw; - @parms = sort @parms; - @parms = grep(!$saw{$_}++, @parms); - return @parms; -} - -sub clean_file_emails { - my (@file_emails) = @_; - my @fmt_emails = (); - - foreach my $email (@file_emails) { - $email =~ s/[\(\<\{]{0,1}([A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+)[\)\>\}]{0,1}/\<$1\>/g; - my ($name, $address) = parse_email($email); - if ($name eq '"[,\.]"') { - $name = ""; - } - - my @nw = split(/[^A-Za-zÀ-ÿ\'\,\.\+-]/, $name); - if (@nw > 2) { - my $first = $nw[@nw - 3]; - my $middle = $nw[@nw - 2]; - my $last = $nw[@nw - 1]; - - if (((length($first) == 1 && $first =~ m/[A-Za-z]/) || - (length($first) == 2 && substr($first, -1) eq ".")) || - (length($middle) == 1 || - (length($middle) == 2 && substr($middle, -1) eq "."))) { - $name = "$first $middle $last"; - } else { - $name = "$middle $last"; - } - } - - if (substr($name, -1) =~ /[,\.]/) { - $name = substr($name, 0, length($name) - 1); - } elsif (substr($name, -2) =~ /[,\.]"/) { - $name = substr($name, 0, length($name) - 2) . '"'; - } - - if (substr($name, 0, 1) =~ /[,\.]/) { - $name = substr($name, 1, length($name) - 1); - } elsif (substr($name, 0, 2) =~ /"[,\.]/) { - $name = '"' . substr($name, 2, length($name) - 2); - } - - my $fmt_email = format_email($name, $address, $email_usename); - push(@fmt_emails, $fmt_email); - } - return @fmt_emails; -} - -sub merge_email { - my @lines; - my %saw; - - for (@_) { - my ($address, $role) = @$_; - if (!$saw{$address}) { - if ($output_roles) { - push(@lines, "$address ($role)"); - } else { - push(@lines, $address); - } - $saw{$address} = 1; - } - } - - return @lines; -} - -sub output { - my (@parms) = @_; - - if ($output_multiline) { - foreach my $line (@parms) { - print("${line}\n"); - } - } else { - print(join($output_separator, @parms)); - print("\n"); - } -} - -my $rfc822re; - -sub make_rfc822re { -# Basic lexical tokens are specials, domain_literal, quoted_string, atom, and -# comment. We must allow for rfc822_lwsp (or comments) after each of these. -# This regexp will only work on addresses which have had comments stripped -# and replaced with rfc822_lwsp. - - my $specials = '()<>@,;:\\\\".\\[\\]'; - my $controls = '\\000-\\037\\177'; - - my $dtext = "[^\\[\\]\\r\\\\]"; - my $domain_literal = "\\[(?:$dtext|\\\\.)*\\]$rfc822_lwsp*"; - - my $quoted_string = "\"(?:[^\\\"\\r\\\\]|\\\\.|$rfc822_lwsp)*\"$rfc822_lwsp*"; - -# Use zero-width assertion to spot the limit of an atom. A simple -# $rfc822_lwsp* causes the regexp engine to hang occasionally. - my $atom = "[^$specials $controls]+(?:$rfc822_lwsp+|\\Z|(?=[\\[\"$specials]))"; - my $word = "(?:$atom|$quoted_string)"; - my $localpart = "$word(?:\\.$rfc822_lwsp*$word)*"; - - my $sub_domain = "(?:$atom|$domain_literal)"; - my $domain = "$sub_domain(?:\\.$rfc822_lwsp*$sub_domain)*"; - - my $addr_spec = "$localpart\@$rfc822_lwsp*$domain"; - - my $phrase = "$word*"; - my $route = "(?:\@$domain(?:,\@$rfc822_lwsp*$domain)*:$rfc822_lwsp*)"; - my $route_addr = "\\<$rfc822_lwsp*$route?$addr_spec\\>$rfc822_lwsp*"; - my $mailbox = "(?:$addr_spec|$phrase$route_addr)"; - - my $group = "$phrase:$rfc822_lwsp*(?:$mailbox(?:,\\s*$mailbox)*)?;\\s*"; - my $address = "(?:$mailbox|$group)"; - - return "$rfc822_lwsp*$address"; -} - -sub rfc822_strip_comments { - my $s = shift; -# Recursively remove comments, and replace with a single space. The simpler -# regexps in the Email Addressing FAQ are imperfect - they will miss escaped -# chars in atoms, for example. - - while ($s =~ s/^((?:[^"\\]|\\.)* - (?:"(?:[^"\\]|\\.)*"(?:[^"\\]|\\.)*)*) - \((?:[^()\\]|\\.)*\)/$1 /osx) {} - return $s; -} - -# valid: returns true if the parameter is an RFC822 valid address -# -sub rfc822_valid { - my $s = rfc822_strip_comments(shift); - - if (!$rfc822re) { - $rfc822re = make_rfc822re(); - } - - return $s =~ m/^$rfc822re$/so && $s =~ m/^$rfc822_char*$/; -} - -# validlist: In scalar context, returns true if the parameter is an RFC822 -# valid list of addresses. -# -# In list context, returns an empty list on failure (an invalid -# address was found); otherwise a list whose first element is the -# number of addresses found and whose remaining elements are the -# addresses. This is needed to disambiguate failure (invalid) -# from success with no addresses found, because an empty string is -# a valid list. - -sub rfc822_validlist { - my $s = rfc822_strip_comments(shift); - - if (!$rfc822re) { - $rfc822re = make_rfc822re(); - } - # * null list items are valid according to the RFC - # * the '1' business is to aid in distinguishing failure from no results - - my @r; - if ($s =~ m/^(?:$rfc822re)?(?:,(?:$rfc822re)?)*$/so && - $s =~ m/^$rfc822_char*$/) { - while ($s =~ m/(?:^|,$rfc822_lwsp*)($rfc822re)/gos) { - push(@r, $1); - } - return wantarray ? (scalar(@r), @r) : 1; - } - return wantarray ? () : 0; -} diff -Nru mesa-19.2.8/src/amd/addrlib/inc/addrinterface.h mesa-20.0.8/src/amd/addrlib/inc/addrinterface.h --- mesa-19.2.8/src/amd/addrlib/inc/addrinterface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/inc/addrinterface.h 2020-06-12 01:21:16.000000000 +0000 @@ -308,7 +308,8 @@ UINT_32 useHtileSliceAlign : 1; ///< Do htile single slice alignment UINT_32 allowLargeThickTile : 1; ///< Allow 64*thickness*bytesPerPixel > rowSize UINT_32 forceDccAndTcCompat : 1; ///< Force enable DCC and TC compatibility - UINT_32 reserved : 24; ///< Reserved bits for future use + UINT_32 nonPower2MemConfig : 1; ///< Physical video memory size is not power of 2 + UINT_32 reserved : 23; ///< Reserved bits for future use }; UINT_32 value; @@ -347,9 +348,6 @@ ///< CI registers------------------------------------------------- const UINT_32* pMacroTileConfig; ///< Global macro tile mode table UINT_32 noOfMacroEntries; ///< Number of entries in pMacroTileConfig - - ///< GFX9 HW parameters - UINT_32 blockVarSizeLog2; ///< SW_VAR_* block size } ADDR_REGISTER_VALUE; /** @@ -3549,12 +3547,14 @@ { struct { - UINT_32 micro : 1; // 256B block for 2D resource - UINT_32 macro4KB : 1; // 4KB for 2D/3D resource - UINT_32 macro64KB : 1; // 64KB for 2D/3D resource - UINT_32 var : 1; // VAR block - UINT_32 linear : 1; // Linear block - UINT_32 reserved : 27; + UINT_32 micro : 1; // 256B block for 2D resource + UINT_32 macroThin4KB : 1; // Thin 4KB for 2D/3D resource + UINT_32 macroThick4KB : 1; // Thick 4KB for 3D resource + UINT_32 macroThin64KB : 1; // Thin 64KB for 2D/3D resource + UINT_32 macroThick64KB : 1; // Thick 64KB for 3D resource + UINT_32 var : 1; // VAR block + UINT_32 linear : 1; // Linear block + UINT_32 reserved : 25; }; UINT_32 value; @@ -3594,38 +3594,38 @@ { struct { - UINT_32 swLinear : 1; - UINT_32 sw256B_S : 1; - UINT_32 sw256B_D : 1; - UINT_32 sw256B_R : 1; - UINT_32 sw4KB_Z : 1; - UINT_32 sw4KB_S : 1; - UINT_32 sw4KB_D : 1; - UINT_32 sw4KB_R : 1; - UINT_32 sw64KB_Z : 1; - UINT_32 sw64KB_S : 1; - UINT_32 sw64KB_D : 1; - UINT_32 sw64KB_R : 1; - UINT_32 swVar_Z : 1; - UINT_32 swVar_S : 1; - UINT_32 swVar_D : 1; - UINT_32 swVar_R : 1; - UINT_32 sw64KB_Z_T : 1; - UINT_32 sw64KB_S_T : 1; - UINT_32 sw64KB_D_T : 1; - UINT_32 sw64KB_R_T : 1; - UINT_32 sw4KB_Z_X : 1; - UINT_32 sw4KB_S_X : 1; - UINT_32 sw4KB_D_X : 1; - UINT_32 sw4KB_R_X : 1; - UINT_32 sw64KB_Z_X : 1; - UINT_32 sw64KB_S_X : 1; - UINT_32 sw64KB_D_X : 1; - UINT_32 sw64KB_R_X : 1; - UINT_32 swVar_Z_X : 1; - UINT_32 swVar_S_X : 1; - UINT_32 swVar_D_X : 1; - UINT_32 swVar_R_X : 1; + UINT_32 swLinear : 1; + UINT_32 sw256B_S : 1; + UINT_32 sw256B_D : 1; + UINT_32 sw256B_R : 1; + UINT_32 sw4KB_Z : 1; + UINT_32 sw4KB_S : 1; + UINT_32 sw4KB_D : 1; + UINT_32 sw4KB_R : 1; + UINT_32 sw64KB_Z : 1; + UINT_32 sw64KB_S : 1; + UINT_32 sw64KB_D : 1; + UINT_32 sw64KB_R : 1; + UINT_32 swReserved0 : 1; + UINT_32 swReserved1 : 1; + UINT_32 swReserved2 : 1; + UINT_32 swReserved3 : 1; + UINT_32 sw64KB_Z_T : 1; + UINT_32 sw64KB_S_T : 1; + UINT_32 sw64KB_D_T : 1; + UINT_32 sw64KB_R_T : 1; + UINT_32 sw4KB_Z_X : 1; + UINT_32 sw4KB_S_X : 1; + UINT_32 sw4KB_D_X : 1; + UINT_32 sw4KB_R_X : 1; + UINT_32 sw64KB_Z_X : 1; + UINT_32 sw64KB_S_X : 1; + UINT_32 sw64KB_D_X : 1; + UINT_32 sw64KB_R_X : 1; + UINT_32 swVar_Z_X : 1; + UINT_32 swReserved4 : 1; + UINT_32 swReserved5 : 1; + UINT_32 swVar_R_X : 1; }; UINT_32 value; diff -Nru mesa-19.2.8/src/amd/addrlib/inc/addrtypes.h mesa-20.0.8/src/amd/addrlib/inc/addrtypes.h --- mesa-19.2.8/src/amd/addrlib/inc/addrtypes.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/inc/addrtypes.h 2020-06-12 01:21:16.000000000 +0000 @@ -87,14 +87,8 @@ #endif #ifndef ADDR_FASTCALL - #if defined(BRAHMA_ARM) - #define ADDR_FASTCALL - #elif defined(__GNUC__) - #if defined(__i386__) - #define ADDR_FASTCALL __attribute__((regparm(0))) - #else - #define ADDR_FASTCALL - #endif + #if defined(__GNUC__) + #define ADDR_FASTCALL __attribute__((regparm(0))) #else #define ADDR_FASTCALL __fastcall #endif @@ -119,7 +113,11 @@ #define ADDR_INLINE __inline #endif // #if defined(__GNUC__) -#define ADDR_API ADDR_FASTCALL //default call convention is fast call +#if defined(__amd64__) || defined(__x86_64__) || defined(__i386__) + #define ADDR_API ADDR_FASTCALL // default call convention is fast call +#else + #define ADDR_API +#endif /** **************************************************************************************************** @@ -203,22 +201,32 @@ /** **************************************************************************************************** * @brief -* Neutral enums that define swizzle modes for Gfx9 ASIC +* Neutral enums that define swizzle modes for Gfx9+ ASIC * @note * -* ADDR_SW_LINEAR linear aligned addressing mode, for 1D/2D/3D resouce -* ADDR_SW_256B_* addressing block aligned size is 256B, for 2D/3D resouce -* ADDR_SW_4KB_* addressing block aligned size is 4KB, for 2D/3D resouce -* ADDR_SW_64KB_* addressing block aligned size is 64KB, for 2D/3D resouce -* ADDR_SW_VAR_* addressing block aligned size is ASIC specific, for 2D/3D resouce -* -* ADDR_SW_*_Z For 2D resouce, represents Z-order swizzle mode for depth/stencil/FMask - For 3D resouce, represents a swizzle mode similar to legacy thick tile mode -* ADDR_SW_*_S represents standard swizzle mode defined by MS -* ADDR_SW_*_D For 2D resouce, represents a swizzle mode for displayable resource -* For 3D resouce, represents a swizzle mode which places each slice in order & pixel +* ADDR_SW_LINEAR linear aligned addressing mode, for 1D/2D/3D resource +* ADDR_SW_256B_* addressing block aligned size is 256B, for 2D/3D resource +* ADDR_SW_4KB_* addressing block aligned size is 4KB, for 2D/3D resource +* ADDR_SW_64KB_* addressing block aligned size is 64KB, for 2D/3D resource +* +* ADDR_SW_*_Z For GFX9: + - for 2D resource, represents Z-order swizzle mode for depth/stencil/FMask + - for 3D resource, represents a swizzle mode similar to legacy thick tile mode + For GFX10: + - represents Z-order swizzle mode for depth/stencil/FMask +* ADDR_SW_*_S For GFX9+: + - represents standard swizzle mode defined by MS +* ADDR_SW_*_D For GFX9: + - for 2D resource, represents a swizzle mode for displayable resource +* - for 3D resource, represents a swizzle mode which places each slice in order & pixel + For GFX10: + - for 2D resource, represents a swizzle mode for displayable resource + - for 3D resource, represents a swizzle mode similar to legacy thick tile mode within slice is placed as 2D ADDR_SW_*_S. Don't use this combination if possible! -* ADDR_SW_*_R For 2D resouce only, represents a swizzle mode for rotated displayable resource +* ADDR_SW_*_R For GFX9: + - 2D resource only, represents a swizzle mode for rotated displayable resource + For GFX10: + - represents a swizzle mode for render target resource * **************************************************************************************************** */ @@ -236,10 +244,10 @@ ADDR_SW_64KB_S = 9, ADDR_SW_64KB_D = 10, ADDR_SW_64KB_R = 11, - ADDR_SW_VAR_Z = 12, - ADDR_SW_VAR_S = 13, - ADDR_SW_VAR_D = 14, - ADDR_SW_VAR_R = 15, + ADDR_SW_RESERVED0 = 12, + ADDR_SW_RESERVED1 = 13, + ADDR_SW_RESERVED2 = 14, + ADDR_SW_RESERVED3 = 15, ADDR_SW_64KB_Z_T = 16, ADDR_SW_64KB_S_T = 17, ADDR_SW_64KB_D_T = 18, @@ -253,17 +261,11 @@ ADDR_SW_64KB_D_X = 26, ADDR_SW_64KB_R_X = 27, ADDR_SW_VAR_Z_X = 28, - ADDR_SW_VAR_S_X = 29, - ADDR_SW_VAR_D_X = 30, + ADDR_SW_RESERVED4 = 29, + ADDR_SW_RESERVED5 = 30, ADDR_SW_VAR_R_X = 31, ADDR_SW_LINEAR_GENERAL = 32, ADDR_SW_MAX_TYPE = 33, - - // Used for represent block with identical size - ADDR_SW_256B = ADDR_SW_256B_S, - ADDR_SW_4KB = ADDR_SW_4KB_S_X, - ADDR_SW_64KB = ADDR_SW_64KB_S_X, - ADDR_SW_VAR = ADDR_SW_VAR_S_X, } AddrSwizzleMode; /** @@ -316,7 +318,9 @@ ADDR_SW_Z = 0, // Resource basic swizzle mode is ZOrder ADDR_SW_S = 1, // Resource basic swizzle mode is Standard ADDR_SW_D = 2, // Resource basic swizzle mode is Display - ADDR_SW_R = 3, // Resource basic swizzle mode is Rotated + ADDR_SW_R = 3, // Resource basic swizzle mode is Rotated/Render optimized + ADDR_SW_L = 4, // Resource basic swizzle mode is Linear + ADDR_SW_MAX_SWTYPE } AddrSwType; /** diff -Nru mesa-19.2.8/src/amd/addrlib/src/addrinterface.cpp mesa-20.0.8/src/amd/addrlib/src/addrinterface.cpp --- mesa-19.2.8/src/amd/addrlib/src/addrinterface.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/addrinterface.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -60,7 +60,6 @@ ADDR_CREATE_OUTPUT* pAddrCreateOut) ///< [out] address lib handle { ADDR_E_RETURNCODE returnCode = ADDR_OK; - { returnCode = Lib::Create(pAddrCreateIn, pAddrCreateOut); } diff -Nru mesa-19.2.8/src/amd/addrlib/src/chip/gfx10/gfx10_gb_reg.h mesa-20.0.8/src/amd/addrlib/src/chip/gfx10/gfx10_gb_reg.h --- mesa-19.2.8/src/amd/addrlib/src/chip/gfx10/gfx10_gb_reg.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/chip/gfx10/gfx10_gb_reg.h 2020-06-12 01:21:16.000000000 +0000 @@ -39,9 +39,9 @@ // #include "util/u_endian.h" -#if defined(PIPE_ARCH_LITTLE_ENDIAN) +#if UTIL_ARCH_LITTLE_ENDIAN #define LITTLEENDIAN_CPU -#elif defined(PIPE_ARCH_BIG_ENDIAN) +#elif UTIL_ARCH_BIG_ENDIAN #define BIGENDIAN_CPU #endif diff -Nru mesa-19.2.8/src/amd/addrlib/src/chip/gfx9/gfx9_gb_reg.h mesa-20.0.8/src/amd/addrlib/src/chip/gfx9/gfx9_gb_reg.h --- mesa-19.2.8/src/amd/addrlib/src/chip/gfx9/gfx9_gb_reg.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/chip/gfx9/gfx9_gb_reg.h 2020-06-12 01:21:16.000000000 +0000 @@ -39,9 +39,9 @@ // #include "util/u_endian.h" -#if defined(PIPE_ARCH_LITTLE_ENDIAN) +#if UTIL_ARCH_LITTLE_ENDIAN #define LITTLEENDIAN_CPU -#elif defined(PIPE_ARCH_BIG_ENDIAN) +#elif UTIL_ARCH_BIG_ENDIAN #define BIGENDIAN_CPU #endif diff -Nru mesa-19.2.8/src/amd/addrlib/src/chip/r800/si_gb_reg.h mesa-20.0.8/src/amd/addrlib/src/chip/r800/si_gb_reg.h --- mesa-19.2.8/src/amd/addrlib/src/chip/r800/si_gb_reg.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/chip/r800/si_gb_reg.h 2020-06-12 01:21:16.000000000 +0000 @@ -40,9 +40,9 @@ // #include "util/u_endian.h" -#if defined(PIPE_ARCH_LITTLE_ENDIAN) +#if UTIL_ARCH_LITTLE_ENDIAN #define LITTLEENDIAN_CPU -#elif defined(PIPE_ARCH_BIG_ENDIAN) +#elif UTIL_ARCH_BIG_ENDIAN #define BIGENDIAN_CPU #endif diff -Nru mesa-19.2.8/src/amd/addrlib/src/core/addrcommon.h mesa-20.0.8/src/amd/addrlib/src/core/addrcommon.h --- mesa-19.2.8/src/amd/addrlib/src/core/addrcommon.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/core/addrcommon.h 2020-06-12 01:21:16.000000000 +0000 @@ -46,12 +46,16 @@ // ADDR_LNX_KERNEL_BUILD is for internal build // Moved from addrinterface.h so __KERNEL__ is not needed any more -#if !defined(__APPLE__) || defined(HAVE_TSERVER) +#if ADDR_LNX_KERNEL_BUILD // || (defined(__GNUC__) && defined(__KERNEL__)) + #include +#elif !defined(__APPLE__) || defined(HAVE_TSERVER) #include #include - #include #endif +#include +#include "util/macros.h" + //////////////////////////////////////////////////////////////////////////////////////////////////// // Platform specific debug break defines //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -64,7 +68,7 @@ #define ADDR_DBG_BREAK() { __debugbreak(); } #endif #else - #define ADDR_DBG_BREAK() + #define ADDR_DBG_BREAK() do {} while(0) #endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -143,24 +147,20 @@ #define ADDRDPF 1 ? (void)0 : (void) -#define ADDR_PRNT(a) +#define ADDR_PRNT(a) do {} while(0) -#define ADDR_DBG_BREAK() +#define ADDR_DBG_BREAK() do {} while(0) -#define ADDR_INFO(cond, a) +#define ADDR_INFO(cond, a) do {} while(0) -#define ADDR_WARN(cond, a) +#define ADDR_WARN(cond, a) do {} while(0) -#define ADDR_EXIT(cond, a) +#define ADDR_EXIT(cond, a) do {} while(0) #endif // DEBUG //////////////////////////////////////////////////////////////////////////////////////////////////// -#if defined(static_assert) -#define ADDR_C_ASSERT(__e) static_assert(__e, "") -#else -#define ADDR_C_ASSERT(__e) typedef char __ADDR_C_ASSERT__[(__e) ? 1 : -1] -#endif +#define ADDR_C_ASSERT(__e) STATIC_ASSERT(__e) namespace Addr { @@ -270,7 +270,8 @@ UINT_32 disableLinearOpt : 1; ///< Disallow tile modes to be optimized to linear UINT_32 use32bppFor422Fmt : 1; ///< View 422 formats as 32 bits per pixel element UINT_32 forceDccAndTcCompat : 1; ///< Force enable DCC and TC compatibility - UINT_32 reserved : 20; ///< Reserved bits for future use + UINT_32 nonPower2MemConfig : 1; ///< Physical video memory size is not power of 2 + UINT_32 reserved : 19; ///< Reserved bits for future use }; UINT_32 value; @@ -926,6 +927,21 @@ return mask; } +/** +**************************************************************************************************** +* ShiftCeil +* +* @brief +* Apply righ-shift with ceiling +**************************************************************************************************** +*/ +static inline UINT_32 ShiftCeil( + UINT_32 a, ///< [in] value to be right-shifted + UINT_32 b) ///< [in] number of bits to shift +{ + return (a >> b) + (((a & ((1 << b) - 1)) != 0) ? 1 : 0); +} + } // Addr #endif // __ADDR_COMMON_H__ diff -Nru mesa-19.2.8/src/amd/addrlib/src/core/addrlib2.cpp mesa-20.0.8/src/amd/addrlib/src/core/addrlib2.cpp --- mesa-19.2.8/src/amd/addrlib/src/core/addrlib2.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/core/addrlib2.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -73,7 +73,8 @@ m_rbPerSeLog2(0), m_maxCompFragLog2(0), m_pipeInterleaveLog2(0), - m_blockVarSizeLog2(0) + m_blockVarSizeLog2(0), + m_numEquations(0) { } @@ -98,7 +99,8 @@ m_rbPerSeLog2(0), m_maxCompFragLog2(0), m_pipeInterleaveLog2(0), - m_blockVarSizeLog2(0) + m_blockVarSizeLog2(0), + m_numEquations(0) { } @@ -1363,30 +1365,20 @@ AddrResourceType resourceType, AddrSwizzleMode swizzleMode) const { - ADDR_E_RETURNCODE returnCode = ComputeBlockDimension(pWidth, - pHeight, - pDepth, - bpp, - resourceType, - swizzleMode); - - if ((returnCode == ADDR_OK) && (numSamples > 1) && IsThin(resourceType, swizzleMode)) - { - const UINT_32 log2blkSize = GetBlockSizeLog2(swizzleMode); - const UINT_32 log2sample = Log2(numSamples); - const UINT_32 q = log2sample >> 1; - const UINT_32 r = log2sample & 1; + ADDR_E_RETURNCODE returnCode = ADDR_OK; - if (log2blkSize & 1) - { - *pWidth >>= q; - *pHeight >>= (q + r); - } - else - { - *pWidth >>= (q + r); - *pHeight >>= q; - } + if (IsThick(resourceType, swizzleMode)) + { + ComputeThickBlockDimension(pWidth, pHeight, pDepth, bpp, resourceType, swizzleMode); + } + else if (IsThin(resourceType, swizzleMode)) + { + ComputeThinBlockDimension(pWidth, pHeight, pDepth, bpp, numSamples, resourceType, swizzleMode); + } + else + { + ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; } return returnCode; @@ -1394,6 +1386,46 @@ /** ************************************************************************************************************************ +* Lib::ComputeThinBlockDimension +* +* @brief +* Internal function to get thin block width/height/depth in element from surface input params. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Lib::ComputeThinBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + UINT_32 numSamples, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const +{ + ADDR_ASSERT(IsThin(resourceType, swizzleMode)); + + // GFX9/GFX10 use different dimension amplifying logic: say for 128KB block + 1xAA + 1BPE, the dimension of thin + // swizzle mode will be [256W * 512H] on GFX9 ASICs and [512W * 256H] on GFX10 ASICs. Since GFX10 is newer HWL so we + // make its implementation into base class (in order to save future change on new HWLs) + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + const UINT_32 log2EleBytes = Log2(bpp >> 3); + const UINT_32 log2Samples = Log2(Max(numSamples, 1u)); + const UINT_32 log2NumEle = log2BlkSize - log2EleBytes - log2Samples; + + // For "1xAA/4xAA cases" or "2xAA/8xAA + odd log2BlkSize cases", width == height or width == 2 * height; + // For other cases, height == width or height == 2 * width + const BOOL_32 widthPrecedent = ((log2Samples & 1) == 0) || ((log2BlkSize & 1) != 0); + const UINT_32 log2Width = (log2NumEle + (widthPrecedent ? 1 : 0)) / 2; + + *pWidth = 1u << log2Width; + *pHeight = 1u << (log2NumEle - log2Width); + *pDepth = 1; +} + +/** +************************************************************************************************************************ * Lib::ComputeBlockDimension * * @brief @@ -1404,42 +1436,22 @@ ************************************************************************************************************************ */ ADDR_E_RETURNCODE Lib::ComputeBlockDimension( - UINT_32* pWidth, - UINT_32* pHeight, - UINT_32* pDepth, - UINT_32 bpp, - AddrResourceType resourceType, - AddrSwizzleMode swizzleMode) const + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const { ADDR_E_RETURNCODE returnCode = ADDR_OK; - UINT_32 eleBytes = bpp >> 3; - UINT_32 microBlockSizeTableIndex = Log2(eleBytes); - UINT_32 log2blkSize = GetBlockSizeLog2(swizzleMode); - - if (IsThin(resourceType, swizzleMode)) + if (IsThick(resourceType, swizzleMode)) { - UINT_32 log2blkSizeIn256B = log2blkSize - 8; - UINT_32 widthAmp = log2blkSizeIn256B / 2; - UINT_32 heightAmp = log2blkSizeIn256B - widthAmp; - - ADDR_ASSERT(microBlockSizeTableIndex < sizeof(Block256_2d) / sizeof(Block256_2d[0])); - - *pWidth = (Block256_2d[microBlockSizeTableIndex].w << widthAmp); - *pHeight = (Block256_2d[microBlockSizeTableIndex].h << heightAmp); - *pDepth = 1; + ComputeThickBlockDimension(pWidth, pHeight, pDepth, bpp, resourceType, swizzleMode); } - else if (IsThick(resourceType, swizzleMode)) + else if (IsThin(resourceType, swizzleMode)) { - UINT_32 log2blkSizeIn1KB = log2blkSize - 10; - UINT_32 averageAmp = log2blkSizeIn1KB / 3; - UINT_32 restAmp = log2blkSizeIn1KB % 3; - - ADDR_ASSERT(microBlockSizeTableIndex < sizeof(Block1K_3d) / sizeof(Block1K_3d[0])); - - *pWidth = Block1K_3d[microBlockSizeTableIndex].w << averageAmp; - *pHeight = Block1K_3d[microBlockSizeTableIndex].h << (averageAmp + (restAmp / 2)); - *pDepth = Block1K_3d[microBlockSizeTableIndex].d << (averageAmp + ((restAmp != 0) ? 1 : 0)); + ComputeThinBlockDimension(pWidth, pHeight, pDepth, bpp, 0, resourceType, swizzleMode); } else { @@ -1452,6 +1464,42 @@ /** ************************************************************************************************************************ +* Lib::ComputeThickBlockDimension +* +* @brief +* Internal function to get block width/height/depth in element for thick swizzle mode +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Lib::ComputeThickBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const +{ + ADDR_ASSERT(IsThick(resourceType, swizzleMode)); + + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + const UINT_32 eleBytes = bpp >> 3; + const UINT_32 microBlockSizeTableIndex = Log2(eleBytes); + + ADDR_ASSERT(microBlockSizeTableIndex < sizeof(Block1K_3d) / sizeof(Block1K_3d[0])); + + const UINT_32 log2blkSizeIn1KB = log2BlkSize - 10; + const UINT_32 averageAmp = log2blkSizeIn1KB / 3; + const UINT_32 restAmp = log2blkSizeIn1KB % 3; + + *pWidth = Block1K_3d[microBlockSizeTableIndex].w << averageAmp; + *pHeight = Block1K_3d[microBlockSizeTableIndex].h << (averageAmp + (restAmp / 2)); + *pDepth = Block1K_3d[microBlockSizeTableIndex].d << (averageAmp + ((restAmp != 0) ? 1 : 0)); +} + +/** +************************************************************************************************************************ * Lib::GetMipTailDim * * @brief @@ -1469,11 +1517,11 @@ UINT_32 blockDepth) const { Dim3d out = {blockWidth, blockHeight, blockDepth}; - UINT_32 log2blkSize = GetBlockSizeLog2(swizzleMode); + UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); if (IsThick(resourceType, swizzleMode)) { - UINT_32 dim = log2blkSize % 3; + UINT_32 dim = log2BlkSize % 3; if (dim == 0) { @@ -1490,11 +1538,22 @@ } else { - if (log2blkSize & 1) + ADDR_ASSERT(IsThin(resourceType, swizzleMode)); + + // GFX9/GFX10 use different dimension shrinking logic for mipmap tail: say for 128KB block + 2BPE, the maximum + // dimension of mipmap tail level will be [256W * 128H] on GFX9 ASICs and [128W * 256H] on GFX10 ASICs. Since + // GFX10 is newer HWL so we make its implementation into base class, in order to save future change on new HWLs. + // And assert log2BlkSize will always be an even value on GFX9, so we never need the logic wrapped by DEBUG... +#if DEBUG + if ((log2BlkSize & 1) && (m_chipFamily == ADDR_CHIP_FAMILY_AI)) { + // Should never go here... + ADDR_ASSERT_ALWAYS(); + out.h >>= 1; } else +#endif { out.w >>= 1; } @@ -1873,7 +1932,52 @@ pOut->pixelHeight <<= 1; // Double size - pOut->surfSize <<= 1; + pOut->surfSize <<= 1; + pOut->sliceSize <<= 1; +} + +/** +************************************************************************************************************************ +* Lib::FilterInvalidEqSwizzleMode +* +* @brief +* Filter out swizzle mode(s) if it doesn't have valid equation index +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Lib::FilterInvalidEqSwizzleMode( + ADDR2_SWMODE_SET& allowedSwModeSet, + AddrResourceType resourceType, + UINT_32 elemLog2 + ) const +{ + if (resourceType != ADDR_RSRC_TEX_1D) + { + UINT_32 allowedSwModeSetVal = allowedSwModeSet.value; + const UINT_32 rsrcTypeIdx = static_cast(resourceType) - 1; + UINT_32 validSwModeSet = allowedSwModeSetVal; + + for (UINT_32 swModeIdx = 0; validSwModeSet != 0; swModeIdx++) + { + if (validSwModeSet & 1) + { + if (m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] == ADDR_INVALID_EQUATION_INDEX) + { + allowedSwModeSetVal &= ~(1u << swModeIdx); + } + } + + validSwModeSet >>= 1; + } + + // Only apply the filtering if at least one valid swizzle mode remains + if (allowedSwModeSetVal != 0) + { + allowedSwModeSet.value = allowedSwModeSetVal; + } + } } } // V2 diff -Nru mesa-19.2.8/src/amd/addrlib/src/core/addrlib2.h mesa-20.0.8/src/amd/addrlib/src/core/addrlib2.h --- mesa-19.2.8/src/amd/addrlib/src/core/addrlib2.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/core/addrlib2.h 2020-06-12 01:21:16.000000000 +0000 @@ -68,6 +68,8 @@ UINT_32 isT : 1; // T mode UINT_32 isRtOpt : 1; // mode opt for render target + + UINT_32 reserved : 20; // Reserved bits }; struct Dim2d @@ -87,25 +89,14 @@ enum AddrBlockType { AddrBlockMicro = 0, // Resource uses 256B block - AddrBlock4KB = 1, // Resource uses 4KB block - AddrBlock64KB = 2, // Resource uses 64KB block - AddrBlockVar = 3, // Resource uses var block, only valid for GFX9 - AddrBlockLinear = 4, // Resource uses linear swizzle mode + AddrBlockThin4KB = 1, // Resource uses thin 4KB block + AddrBlockThick4KB = 2, // Resource uses thick 4KB block + AddrBlockThin64KB = 3, // Resource uses thin 64KB block + AddrBlockThick64KB = 4, // Resource uses thick 64KB block + AddrBlockVar = 5, // Resource uses var block, only valid for GFX9 + AddrBlockLinear = 6, // Resource uses linear swizzle mode - AddrBlockMaxTiledType = AddrBlock64KB + 1, -}; - -enum AddrBlockSet -{ - AddrBlockSetMicro = 1 << AddrBlockMicro, - AddrBlockSetMacro4KB = 1 << AddrBlock4KB, - AddrBlockSetMacro64KB = 1 << AddrBlock64KB, - AddrBlockSetVar = 1 << AddrBlockVar, - AddrBlockSetLinear = 1 << AddrBlockLinear, - - AddrBlockSetMacro = AddrBlockSetMacro4KB | AddrBlockSetMacro64KB, - AddrBlockSet2dGfx10 = AddrBlockSetMicro | AddrBlockSetMacro, - AddrBlockSet3dGfx10 = AddrBlockSetMacro, + AddrBlockMaxTiledType = AddrBlockVar + 1, }; enum AddrSwSet @@ -115,11 +106,17 @@ AddrSwSetD = 1 << ADDR_SW_D, AddrSwSetR = 1 << ADDR_SW_R, - AddrSwSetAll = AddrSwSetZ | AddrSwSetS | AddrSwSetD | AddrSwSetR, - AddrSwSet3dThinGfx10 = AddrSwSetZ | AddrSwSetR, - AddrSwSetColorGfx10 = AddrSwSetS | AddrSwSetD | AddrSwSetR, + AddrSwSetAll = AddrSwSetZ | AddrSwSetS | AddrSwSetD | AddrSwSetR, }; +const UINT_32 Size256 = 256u; +const UINT_32 Size4K = 4096u; +const UINT_32 Size64K = 65536u; + +const UINT_32 Log2Size256 = 8u; +const UINT_32 Log2Size4K = 12u; +const UINT_32 Log2Size64K = 16u; + /** ************************************************************************************************************************ * @brief This class contains asic independent address lib functionalities @@ -237,6 +234,15 @@ static const UINT_32 MaxMipLevels = 16; + BOOL_32 IsValidSwMode(AddrSwizzleMode swizzleMode) const + { + // Don't dereference a reinterpret_cast pointer so as not to break + // strict-aliasing rules. + UINT_32 mode; + memcpy(&mode, &m_swizzleModeTable[swizzleMode], sizeof(UINT_32)); + return mode != 0; + } + // Checking block size BOOL_32 IsBlock256b(AddrSwizzleMode swizzleMode) const { @@ -356,7 +362,7 @@ { blockSizeLog2 = 16; } - else if (IsBlockVariable(swizzleMode)) + else if (IsBlockVariable(swizzleMode) && (m_blockVarSizeLog2 != 0)) { blockSizeLog2 = m_blockVarSizeLog2; } @@ -653,12 +659,29 @@ AddrSwizzleMode swizzleMode) const; ADDR_E_RETURNCODE ComputeBlockDimension( - UINT_32* pWidth, - UINT_32* pHeight, - UINT_32* pDepth, - UINT_32 bpp, - AddrResourceType resourceType, - AddrSwizzleMode swizzleMode) const; + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const; + + virtual VOID ComputeThinBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + UINT_32 numSamples, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const; + + VOID ComputeThickBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const; static UINT_64 ComputePadSize( const Dim3d* pBlkDim, @@ -793,6 +816,11 @@ VOID ComputeQbStereoInfo(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + VOID FilterInvalidEqSwizzleMode( + ADDR2_SWMODE_SET& allowedSwModeSet, + AddrResourceType resourceType, + UINT_32 elemLog2) const; + UINT_32 m_se; ///< Number of shader engine UINT_32 m_rbPerSe; ///< Number of render backend per shader engine UINT_32 m_maxCompFrag; ///< Number of max compressed fragment @@ -809,6 +837,22 @@ SwizzleModeFlags m_swizzleModeTable[ADDR_SW_MAX_TYPE]; ///< Swizzle mode table + // Max number of swizzle mode supported for equation + static const UINT_32 MaxSwModeType = 32; + // Max number of resource type (2D/3D) supported for equation + static const UINT_32 MaxRsrcType = 2; + // Max number of bpp (8bpp/16bpp/32bpp/64bpp/128bpp) + static const UINT_32 MaxElementBytesLog2 = 5; + // Almost all swizzle mode + resource type support equation + static const UINT_32 EquationTableSize = MaxElementBytesLog2 * MaxSwModeType * MaxRsrcType; + // Equation table + ADDR_EQUATION m_equationTable[EquationTableSize]; + + // Number of equation entries in the table + UINT_32 m_numEquations; + // Equation lookup table according to bpp and tile index + UINT_32 m_equationLookupTable[MaxRsrcType][MaxSwModeType][MaxElementBytesLog2]; + private: // Disallow the copy constructor Lib(const Lib& a); diff -Nru mesa-19.2.8/src/amd/addrlib/src/core/addrlib.cpp mesa-20.0.8/src/amd/addrlib/src/core/addrlib.cpp --- mesa-19.2.8/src/amd/addrlib/src/core/addrlib.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/core/addrlib.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -250,6 +250,7 @@ pLib->m_configFlags.useHtileSliceAlign = pCreateIn->createFlags.useHtileSliceAlign; pLib->m_configFlags.allowLargeThickTile = pCreateIn->createFlags.allowLargeThickTile; pLib->m_configFlags.forceDccAndTcCompat = pCreateIn->createFlags.forceDccAndTcCompat; + pLib->m_configFlags.nonPower2MemConfig = pCreateIn->createFlags.nonPower2MemConfig; pLib->m_configFlags.disableLinearOpt = FALSE; pLib->SetChipFamily(pCreateIn->chipFamily, pCreateIn->chipRevision); diff -Nru mesa-19.2.8/src/amd/addrlib/src/core/addrobject.cpp mesa-20.0.8/src/amd/addrlib/src/core/addrobject.cpp --- mesa-19.2.8/src/amd/addrlib/src/core/addrobject.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/core/addrobject.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -119,7 +119,7 @@ size_t objSize ///< [in] Size to allocate ) const { - return ClientAlloc(objSize, &m_client); + return ClientAlloc(objSize, &m_client);; } /** @@ -216,16 +216,20 @@ #if DEBUG if (m_client.callbacks.debugPrint != NULL) { + va_list ap; + + va_start(ap, pDebugString); + ADDR_DEBUGPRINT_INPUT debugPrintInput = {0}; debugPrintInput.size = sizeof(ADDR_DEBUGPRINT_INPUT); debugPrintInput.pDebugString = const_cast(pDebugString); debugPrintInput.hClient = m_client.handle; - va_start(debugPrintInput.ap, pDebugString); + va_copy(debugPrintInput.ap, ap); m_client.callbacks.debugPrint(&debugPrintInput); - va_end(debugPrintInput.ap); + va_end(ap); } #endif } diff -Nru mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp --- mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -33,7 +33,6 @@ #include "gfx10addrlib.h" #include "gfx10_gb_reg.h" -#include "gfx10SwizzlePattern.h" #include "amdgpu_asic_addr.h" @@ -66,63 +65,54 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// const SwizzleModeFlags Gfx10Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = -{//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR - {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S - {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_256B_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S - {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_4KB_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S - {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_64KB_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0}, // ADDR_SW_64KB_S_T - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0}, // ADDR_SW_64KB_D_T - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0}, // ADDR_SW_4KB_S_X - {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0}, // ADDR_SW_4KB_D_X - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0}, // ADDR_SW_64KB_Z_X - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}, // ADDR_SW_64KB_S_X - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0}, // ADDR_SW_64KB_D_X - {0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1}, // ADDR_SW_64KB_R_X - - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL +{//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt Reserved + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR + {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S + {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_D + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S + {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_D + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S + {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_D + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_S_T + {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_D_T + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_S_X + {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_D_X + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_Z_X + {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_S_X + {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_D_X + {0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0}, // ADDR_SW_64KB_R_X + + {0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_VAR_Z_X + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}, // ADDR_SW_VAR_R_X + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL }; const Dim3d Gfx10Lib::Block256_3d[] = {{8, 4, 8}, {4, 4, 8}, {4, 4, 4}, {4, 2, 4}, {2, 2, 4}}; -const Dim3d Gfx10Lib::Block64K_3d[] = {{64, 32, 32}, {32 , 32, 32}, {32, 32, 16}, {32, 16, 16}, {16, 16, 16}}; -const Dim3d Gfx10Lib::Block4K_3d[] = {{16, 16, 16}, {8, 16, 16}, {8, 16, 8}, {8, 8, 8}, {4, 8, 8}}; - -const Dim2d Gfx10Lib::Block64K_2d[] = {{256, 256}, {256 , 128}, {128, 128}, {128, 64}, {64, 64}}; -const Dim2d Gfx10Lib::Block4K_2d[] = {{64, 64}, {64, 32}, {32, 32}, {32, 16}, {16, 16}}; - const Dim3d Gfx10Lib::Block64K_Log2_3d[] = {{6, 5, 5}, {5, 5, 5}, {5, 5, 4}, {5, 4, 4}, {4, 4, 4}}; const Dim3d Gfx10Lib::Block4K_Log2_3d[] = {{4, 4, 4}, {3, 4, 4}, {3, 4, 3}, {3, 3, 3}, {2, 3, 3}}; -const Dim2d Gfx10Lib::Block64K_Log2_2d[] = {{8, 8}, {8, 7}, {7, 7}, {7, 6}, {6, 6}}; -const Dim2d Gfx10Lib::Block4K_Log2_2d[] = {{6, 6}, {6, 5}, {5, 5}, {5, 4}, {4, 4}}; - /** ************************************************************************************************************************ * Gfx10Lib::Gfx10Lib @@ -135,7 +125,9 @@ Gfx10Lib::Gfx10Lib(const Client* pClient) : Lib(pClient), - m_numEquations(0) + m_colorBaseIndex(0), + m_xmaskBaseIndex(0), + m_dccBaseIndex(0) { m_class = AI_ADDRLIB; memset(&m_settings, 0, sizeof(m_settings)); @@ -172,7 +164,8 @@ { ADDR_E_RETURNCODE ret = ADDR_OK; - if ((pIn->swizzleMode != ADDR_SW_64KB_Z_X) || + if (((pIn->swizzleMode != ADDR_SW_64KB_Z_X) && + ((pIn->swizzleMode != ADDR_SW_VAR_Z_X) || (m_blockVarSizeLog2 == 0))) || (pIn->hTileFlags.pipeAligned != TRUE)) { ret = ADDR_INVALIDPARAMS; @@ -182,7 +175,7 @@ Dim3d metaBlk = {0}; const UINT_32 metaBlkSize = GetMetaBlkSize(Gfx10DataDepthStencil, ADDR_RSRC_TEX_2D, - ADDR_SW_64KB_Z_X, + pIn->swizzleMode, 0, 0, TRUE, @@ -281,8 +274,10 @@ { ADDR_E_RETURNCODE ret = ADDR_OK; - if ((pIn->resourceType != ADDR_RSRC_TEX_2D) || - (pIn->cMaskFlags.pipeAligned != TRUE)) + if ((pIn->resourceType != ADDR_RSRC_TEX_2D) || + (pIn->cMaskFlags.pipeAligned != TRUE) || + ((pIn->swizzleMode != ADDR_SW_64KB_Z_X) && + ((pIn->swizzleMode != ADDR_SW_VAR_Z_X) || (m_blockVarSizeLog2 == 0)))) { ret = ADDR_INVALIDPARAMS; } @@ -291,7 +286,7 @@ Dim3d metaBlk = {0}; const UINT_32 metaBlkSize = GetMetaBlkSize(Gfx10DataFmask, ADDR_RSRC_TEX_2D, - ADDR_SW_64KB_Z_X, + pIn->swizzleMode, 0, 0, TRUE, @@ -512,7 +507,10 @@ const ADDR2_COMPUTE_CMASK_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure ADDR2_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) ///< [out] output structure { - ADDR2_COMPUTE_CMASK_INFO_INPUT input = {0}; + // Only support pipe aligned CMask + ADDR_ASSERT(pIn->cMaskFlags.pipeAligned == TRUE); + + ADDR2_COMPUTE_CMASK_INFO_INPUT input = {}; input.size = sizeof(input); input.cMaskFlags = pIn->cMaskFlags; input.colorFlags = pIn->colorFlags; @@ -522,27 +520,23 @@ input.swizzleMode = pIn->swizzleMode; input.resourceType = pIn->resourceType; - ADDR2_COMPUTE_CMASK_INFO_OUTPUT output = {0}; + ADDR2_COMPUTE_CMASK_INFO_OUTPUT output = {}; output.size = sizeof(output); ADDR_E_RETURNCODE returnCode = ComputeCmaskInfo(&input, &output); if (returnCode == ADDR_OK) { - const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags); - const UINT_32 fmaskElemLog2 = Log2(fmaskBpp >> 3); - const UINT_32 numPipeLog2 = m_pipesLog2; - const UINT_32 pipeMask = (1 << numPipeLog2) - 1; - const UINT_32 fmaskBppType = 4; - const UINT_32 numPipeType = 8; - const UINT_32 index = ((m_pipeInterleaveLog2 - 8) * (fmaskBppType * numPipeType)) + - ((numPipeLog2 + 1) * fmaskBppType) + - fmaskElemLog2; + const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags); + const UINT_32 fmaskElemLog2 = Log2(fmaskBpp >> 3); + const UINT_32 pipeMask = (1 << m_pipesLog2) - 1; + const UINT_32 index = m_xmaskBaseIndex + fmaskElemLog2; + const UINT_8* patIdxTable = (pIn->swizzleMode == ADDR_SW_VAR_Z_X) ? CMASK_VAR_RBPLUS_PATIDX : + (m_settings.supportRbPlus ? CMASK_64K_RBPLUS_PATIDX : CMASK_64K_PATIDX); - const UINT_64* pPattern = CMASK_64K[index]; const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 7; const UINT_32 blkMask = (1 << blkSizeLog2) - 1; - const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(pPattern, + const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(CMASK_SW_PATTERN[patIdxTable[index]], blkSizeLog2 + 1, // +1 for nibble offset pIn->x, pIn->y, @@ -605,11 +599,12 @@ { const UINT_32 numSampleLog2 = Log2(pIn->numSamples); const UINT_32 pipeMask = (1 << m_pipesLog2) - 1; - const UINT_32 index = m_htileBaseIndex + numSampleLog2; - const UINT_64* pPattern = HTILE_64K[index]; + const UINT_32 index = m_xmaskBaseIndex + numSampleLog2; + const UINT_8* patIdxTable = m_settings.supportRbPlus ? HTILE_RBPLUS_PATIDX : HTILE_PATIDX; + const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 4; const UINT_32 blkMask = (1 << blkSizeLog2) - 1; - const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(pPattern, + const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(HTILE_SW_PATTERN[patIdxTable[index]], blkSizeLog2 + 1, // +1 for nibble offset pIn->x, pIn->y, @@ -698,27 +693,53 @@ if (returnCode == ADDR_OK) { - const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); - const UINT_32 numPipeLog2 = m_pipesLog2; - const UINT_32 pipeMask = (1 << numPipeLog2) - 1; - const UINT_32 alignPipeType = 7; - const UINT_32 unalignPipeType = 3; - const UINT_32 numPipeType = alignPipeType + unalignPipeType; - UINT_32 index = ((m_pipeInterleaveLog2 - 8) * (MaxNumOfBpp * numPipeType)) + elemLog2; + const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 numPipeLog2 = m_pipesLog2; + const UINT_32 pipeMask = (1 << numPipeLog2) - 1; + UINT_32 index = m_dccBaseIndex + elemLog2; + const UINT_8* patIdxTable; - if (pIn->dccKeyFlags.pipeAligned) + if (m_settings.supportRbPlus) { - index += (numPipeLog2 + unalignPipeType) * MaxNumOfBpp; + patIdxTable = DCC_64K_R_X_RBPLUS_PATIDX; + + if (pIn->dccKeyFlags.pipeAligned) + { + index += MaxNumOfBpp; + + if (m_numPkrLog2 < 2) + { + index += m_pipesLog2 * MaxNumOfBpp; + } + else + { + // 4 groups for "m_numPkrLog2 < 2" case + index += 4 * MaxNumOfBpp; + + const UINT_32 dccPipePerPkr = 3; + + index += (m_numPkrLog2 - 2) * dccPipePerPkr * MaxNumOfBpp + + (m_pipesLog2 - m_numPkrLog2) * MaxNumOfBpp; + } + } } else { - index += Min(numPipeLog2, 2u) * MaxNumOfBpp; + patIdxTable = DCC_64K_R_X_PATIDX; + + if (pIn->dccKeyFlags.pipeAligned) + { + index += (numPipeLog2 + UnalignedDccType) * MaxNumOfBpp; + } + else + { + index += Min(numPipeLog2, UnalignedDccType - 1) * MaxNumOfBpp; + } } - const UINT_64* pPattern = DCC_64K_R_X[index]; const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) + elemLog2 - 8; const UINT_32 blkMask = (1 << blkSizeLog2) - 1; - const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(pPattern, + const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(DCC_64K_R_X_SW_PATTERN[patIdxTable[index]], blkSizeLog2 + 1, // +1 for nibble offset pIn->x, pIn->y, @@ -822,6 +843,7 @@ // Addr::V2::Lib::ComputePipeBankXor()/ComputeSlicePipeBankXor() requires pipe interleave to be exactly 8 bits, and // any larger value requires a post-process (left shift) on the output pipeBankXor bits. + // And more importantly, SW AddrLib doesn't support sw equation/pattern for PI != 256 case. ADDR_ASSERT(m_pipeInterleaveBytes == ADDR_PIPEINTERLEAVE_256B); switch (gbAddrConfig.bits.MAX_COMPRESSED_FRAGS) @@ -848,53 +870,47 @@ break; } - if (m_settings.supportRbPlus) { - m_numPkrLog2 = gbAddrConfig.bits.NUM_PKRS; - m_numSaLog2 = (m_numPkrLog2 > 0) ? (m_numPkrLog2 - 1) : 0; + // Skip unaligned case + m_xmaskBaseIndex += MaxNumOfAA; - ADDR_ASSERT((m_numPkrLog2 <= m_pipesLog2) && ((m_pipesLog2 - m_numPkrLog2) <= 2)); + m_xmaskBaseIndex += m_pipesLog2 * MaxNumOfAA; + m_colorBaseIndex += m_pipesLog2 * MaxNumOfBpp; - const UINT_32 maxPipeInterleaveType = 3; + if (m_settings.supportRbPlus) + { + m_numPkrLog2 = gbAddrConfig.bits.NUM_PKRS; + m_numSaLog2 = (m_numPkrLog2 > 0) ? (m_numPkrLog2 - 1) : 0; - m_colorBaseIndex = sizeof(SW_64K_R_X_1xaa_RBPLUS) / - sizeof(SW_64K_R_X_1xaa_RBPLUS[0]) / - maxPipeInterleaveType * - (m_pipeInterleaveLog2 - 8); - m_htileBaseIndex = sizeof(HTILE_64K_RBPLUS) / - sizeof(HTILE_64K_RBPLUS[0]) / - maxPipeInterleaveType * - (m_pipeInterleaveLog2 - 8); + ADDR_ASSERT((m_numPkrLog2 <= m_pipesLog2) && ((m_pipesLog2 - m_numPkrLog2) <= 2)); - // Skip unaligned case - m_htileBaseIndex += MaxNumOfAA; + ADDR_C_ASSERT(sizeof(HTILE_RBPLUS_PATIDX) / sizeof(HTILE_RBPLUS_PATIDX[0]) == + sizeof(CMASK_64K_RBPLUS_PATIDX) / sizeof(CMASK_64K_RBPLUS_PATIDX[0])); - if (m_numPkrLog2 < 2) - { - m_colorBaseIndex += m_pipesLog2 * MaxNumOfBpp; - m_htileBaseIndex += m_pipesLog2 * MaxNumOfAA; + if (m_numPkrLog2 >= 2) + { + m_colorBaseIndex += (2 * m_numPkrLog2 - 2) * MaxNumOfBpp; + m_xmaskBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfAA; + } } else { - m_colorBaseIndex += (2 * m_numPkrLog2 - 2 + m_pipesLog2) * MaxNumOfBpp; + const UINT_32 numPipeType = static_cast(ADDR_CONFIG_64_PIPE) - + static_cast(ADDR_CONFIG_1_PIPE) + + 1; - const UINT_32 htilePipePerPkr = 4; + ADDR_C_ASSERT(sizeof(HTILE_PATIDX) / sizeof(HTILE_PATIDX[0]) == (numPipeType + 1) * MaxNumOfAA); - m_htileBaseIndex += (m_numPkrLog2 - 1) * htilePipePerPkr * MaxNumOfAA + - (m_pipesLog2 + 1 - m_numPkrLog2) * MaxNumOfAA; + ADDR_C_ASSERT(sizeof(HTILE_PATIDX) / sizeof(HTILE_PATIDX[0]) == + sizeof(CMASK_64K_PATIDX) / sizeof(CMASK_64K_PATIDX[0])); } } - else - { - const UINT_32 numPipeType = static_cast(ADDR_CONFIG_64_PIPE) - - static_cast(ADDR_CONFIG_1_PIPE) + - 1; - m_colorBaseIndex = (m_pipeInterleaveLog2 - 8) * (MaxNumOfBpp * numPipeType) + - (m_pipesLog2 * MaxNumOfBpp); - - m_htileBaseIndex = (m_pipeInterleaveLog2 - 8) * (MaxNumOfAA * (numPipeType + 1)) + - (m_pipesLog2 + 1) * MaxNumOfAA; + if (m_settings.supportRbPlus) + { + // VAR block size = 16K * num_pipes. For 4 pipe configuration, SW_VAR_* mode swizzle patterns are same as the + // corresponding SW_64KB_* mode + m_blockVarSizeLog2 = m_pipesLog2 + 14; } if (valid) @@ -1284,14 +1300,17 @@ ************************************************************************************************************************ */ VOID Gfx10Lib::ConvertSwizzlePatternToEquation( - UINT_32 elemLog2, ///< [in] element bytes log2 - AddrResourceType rsrcType, ///< [in] resource type - AddrSwizzleMode swMode, ///< [in] swizzle mode - const UINT_64* pPattern, ///< [in] swizzle pattern - ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern + UINT_32 elemLog2, ///< [in] element bytes log2 + AddrResourceType rsrcType, ///< [in] resource type + AddrSwizzleMode swMode, ///< [in] swizzle mode + const ADDR_SW_PATINFO* pPatInfo, ///< [in] swizzle pattern infor + ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern const { - const ADDR_BIT_SETTING* pSwizzle = reinterpret_cast(pPattern); + ADDR_BIT_SETTING fullSwizzlePattern[20]; + GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); + + const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode); pEquation->numBits = blockSizeLog2; @@ -1342,10 +1361,13 @@ } else if (IsThin(rsrcType, swMode)) { - const UINT_32 blkXLog2 = (blockSizeLog2 == 12) ? Block4K_Log2_2d[elemLog2].w : Block64K_Log2_2d[elemLog2].w; - const UINT_32 blkYLog2 = (blockSizeLog2 == 12) ? Block4K_Log2_2d[elemLog2].h : Block64K_Log2_2d[elemLog2].h; - const UINT_32 blkXMask = (1 << blkXLog2) - 1; - const UINT_32 blkYMask = (1 << blkYLog2) - 1; + Dim3d dim; + ComputeThinBlockDimension(&dim.w, &dim.h, &dim.d, 8u << elemLog2, 0, rsrcType, swMode); + + const UINT_32 blkXLog2 = Log2(dim.w); + const UINT_32 blkYLog2 = Log2(dim.h); + const UINT_32 blkXMask = dim.w - 1; + const UINT_32 blkYMask = dim.h - 1; ADDR_BIT_SETTING swizzle[ADDR_MAX_EQUATION_BIT]; UINT_32 xMask = 0; @@ -1541,7 +1563,7 @@ ADDR_ASSERT((xMask == blkXMask) && (yMask == blkYMask)); } - else if (IsEquationCompatibleThick(rsrcType, swMode)) + else { const UINT_32 blkXLog2 = (blockSizeLog2 == 12) ? Block4K_Log2_3d[elemLog2].w : Block64K_Log2_3d[elemLog2].w; const UINT_32 blkYLog2 = (blockSizeLog2 == 12) ? Block4K_Log2_3d[elemLog2].h : Block64K_Log2_3d[elemLog2].h; @@ -1823,27 +1845,40 @@ { const AddrResourceType rsrcType = static_cast(rsrcTypeIdx + ADDR_RSRC_TEX_2D); - for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwMode; swModeIdx++) + for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwModeType; swModeIdx++) { const AddrSwizzleMode swMode = static_cast(swModeIdx); for (UINT_32 elemLog2 = 0; elemLog2 < MaxElementBytesLog2; elemLog2++) { - UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; - const UINT_64* pPattern = GetSwizzlePattern(swMode, rsrcType, elemLog2, 1); + UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(swMode, rsrcType, elemLog2, 1); - if (pPattern != NULL) + if (pPatInfo != NULL) { - ADDR_EQUATION equation = {}; + ADDR_ASSERT(IsValidSwMode(swMode)); - ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPattern, &equation); + if (pPatInfo->maxItemCount <= 3) + { + ADDR_EQUATION equation = {}; - equationIndex = m_numEquations; - ADDR_ASSERT(equationIndex < EquationTableSize); + ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPatInfo, &equation); - m_equationTable[equationIndex] = equation; + equationIndex = m_numEquations; + ADDR_ASSERT(equationIndex < EquationTableSize); - m_numEquations++; + m_equationTable[equationIndex] = equation; + + m_numEquations++; + } + else + { + // We only see "ill" equation from 64/128 BPE + 3D resource + SW_64KB_D_X under RB+ case + ADDR_ASSERT((elemLog2 == 3) || (elemLog2 == 4)); + ADDR_ASSERT(rsrcTypeIdx == 1); + ADDR_ASSERT(swMode == ADDR_SW_64KB_D_X); + ADDR_ASSERT(m_settings.supportRbPlus == 1); + } } m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] = equationIndex; @@ -2179,7 +2214,7 @@ { BOOL_32 valid = TRUE; - if (pIn->swizzleMode >= ADDR_SW_MAX_TYPE) + if ((pIn->swizzleMode >= ADDR_SW_MAX_TYPE) || (IsValidSwMode(pIn->swizzleMode) == FALSE)) { ADDR_ASSERT_ALWAYS(); valid = FALSE; @@ -2198,8 +2233,10 @@ const BOOL_32 thin3d = flags.view3dAs2dArray; const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); + const BOOL_32 blkVar = IsBlockVariable(swizzle); const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const BOOL_32 prt = flags.prt; + const BOOL_32 fmask = flags.fmask; // Misc check if ((pIn->numFrags > 1) && @@ -2236,7 +2273,8 @@ else if (tex2d) { if (((swizzleMask & Gfx10Rsrc2dSwModeMask) == 0) || - (prt && ((swizzleMask & Gfx10Rsrc2dPrtSwModeMask) == 0))) + (prt && ((swizzleMask & Gfx10Rsrc2dPrtSwModeMask) == 0)) || + (fmask && ((swizzleMask & Gfx10ZSwModeMask) == 0))) { ADDR_ASSERT_ALWAYS(); valid = FALSE; @@ -2312,6 +2350,14 @@ valid = FALSE; } } + else if (blkVar) + { + if (m_blockVarSizeLog2 == 0) + { + ADDR_ASSERT_ALWAYS(); + valid = FALSE; + } + } return valid; } @@ -2354,13 +2400,89 @@ if (pIn->flags.fmask) { - pOut->swizzleMode = ADDR_SW_64KB_Z_X; - pOut->resourceType = ADDR_RSRC_TEX_2D; - pOut->validBlockSet.value = AddrBlockSetMacro64KB; - pOut->canXor = TRUE; - pOut->validSwTypeSet.value = AddrSwSetZ; - pOut->clientPreferredSwSet = pOut->validSwTypeSet; - pOut->validSwModeSet.value = Gfx10ZSwModeMask; + const BOOL_32 forbid64KbBlockType = pIn->forbiddenBlock.macroThin64KB ? TRUE : FALSE; + const BOOL_32 forbidVarBlockType = ((m_blockVarSizeLog2 == 0) || (pIn->forbiddenBlock.var != 0)); + + if (forbid64KbBlockType && forbidVarBlockType) + { + // Invalid combination... + ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; + } + else + { + pOut->resourceType = ADDR_RSRC_TEX_2D; + pOut->validBlockSet.value = 0; + pOut->validBlockSet.macroThin64KB = forbid64KbBlockType ? 0 : 1; + pOut->validBlockSet.var = forbidVarBlockType ? 0 : 1; + pOut->validSwModeSet.value = 0; + pOut->validSwModeSet.sw64KB_Z_X = forbid64KbBlockType ? 0 : 1; + pOut->validSwModeSet.swVar_Z_X = forbidVarBlockType ? 0 : 1; + pOut->canXor = TRUE; + pOut->validSwTypeSet.value = AddrSwSetZ; + pOut->clientPreferredSwSet = pOut->validSwTypeSet; + + BOOL_32 use64KbBlockType = (forbid64KbBlockType == FALSE); + + if ((forbid64KbBlockType == FALSE) && (forbidVarBlockType == FALSE)) + { + const UINT_8 maxFmaskSwizzleModeType = 2; + const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2); + const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1); + const UINT_32 fmaskBpp = GetFmaskBpp(pIn->numSamples, pIn->numFrags); + const UINT_32 numSlices = Max(pIn->numSlices, 1u); + const UINT_32 width = Max(pIn->width, 1u); + const UINT_32 height = Max(pIn->height, 1u); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (fmaskBpp >> 3), 1u); + + AddrSwizzleMode swMode[maxFmaskSwizzleModeType] = {ADDR_SW_64KB_Z_X, ADDR_SW_VAR_Z_X}; + Dim3d blkDim[maxFmaskSwizzleModeType] = {{0}, {0}}; + Dim3d padDim[maxFmaskSwizzleModeType] = {{0}, {0}}; + UINT_64 padSize[maxFmaskSwizzleModeType] = {0}; + + for (UINT_8 i = 0; i < maxFmaskSwizzleModeType; i++) + { + ComputeBlockDimensionForSurf(&blkDim[i].w, + &blkDim[i].h, + &blkDim[i].d, + fmaskBpp, + 1, + pOut->resourceType, + swMode[i]); + + padSize[i] = ComputePadSize(&blkDim[i], width, height, numSlices, &padDim[i]); + padSize[i] = PowTwoAlign(padSize[i], sizeAlignInElement); + } + + if (GetBlockSizeLog2(swMode[1]) >= GetBlockSizeLog2(swMode[0])) + { + if ((padSize[1] * ratioHi) <= (padSize[0] * ratioLow)) + { + use64KbBlockType = FALSE; + } + } + else + { + if ((padSize[1] * ratioLow) < (padSize[0] * ratioHi)) + { + use64KbBlockType = FALSE; + } + } + } + else if (forbidVarBlockType) + { + use64KbBlockType = TRUE; + } + + if (use64KbBlockType) + { + pOut->swizzleMode = ADDR_SW_64KB_Z_X; + } + else + { + pOut->swizzleMode = ADDR_SW_VAR_Z_X; + } + } } else { @@ -2413,10 +2535,22 @@ { // Forbid swizzle mode(s) by client setting ADDR2_SWMODE_SET allowedSwModeSet = {}; - allowedSwModeSet.value |= pIn->forbiddenBlock.linear ? 0 : Gfx10LinearSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.micro ? 0 : Gfx10Blk256BSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.macro4KB ? 0 : Gfx10Blk4KBSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.macro64KB ? 0 : Gfx10Blk64KBSwModeMask; + allowedSwModeSet.value |= pIn->forbiddenBlock.linear ? 0 : Gfx10LinearSwModeMask; + allowedSwModeSet.value |= pIn->forbiddenBlock.micro ? 0 : Gfx10Blk256BSwModeMask; + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThin4KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? 0 : Gfx10Blk4KBSwModeMask); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThick4KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx10Rsrc3dThick4KBSwModeMask : 0); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThin64KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx10Rsrc3dThin64KBSwModeMask : Gfx10Blk64KBSwModeMask); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThick64KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx10Rsrc3dThick64KBSwModeMask : 0); + allowedSwModeSet.value |= + pIn->forbiddenBlock.var ? 0 : (m_blockVarSizeLog2 ? Gfx10BlkVarSwModeMask : 0); if (pIn->preferredSwSet.value != 0) { @@ -2433,17 +2567,22 @@ if (pIn->maxAlign > 0) { - if (pIn->maxAlign < GetBlockSize(ADDR_SW_64KB)) + if (pIn->maxAlign < (1u << m_blockVarSizeLog2)) + { + allowedSwModeSet.value &= ~Gfx10BlkVarSwModeMask; + } + + if (pIn->maxAlign < Size64K) { allowedSwModeSet.value &= ~Gfx10Blk64KBSwModeMask; } - if (pIn->maxAlign < GetBlockSize(ADDR_SW_4KB)) + if (pIn->maxAlign < Size4K) { allowedSwModeSet.value &= ~Gfx10Blk4KBSwModeMask; } - if (pIn->maxAlign < GetBlockSize(ADDR_SW_256B)) + if (pIn->maxAlign < Size256) { allowedSwModeSet.value &= ~Gfx10Blk256BSwModeMask; } @@ -2463,11 +2602,6 @@ case ADDR_RSRC_TEX_3D: allowedSwModeSet.value &= pIn->flags.prt ? Gfx10Rsrc3dPrtSwModeMask : Gfx10Rsrc3dSwModeMask; - if (m_settings.supportRbPlus) - { - allowedSwModeSet.value &= ~Gfx10DisplaySwModeMask; - } - if (pIn->flags.view3dAs2dArray) { allowedSwModeSet.value &= Gfx10Rsrc3dThinSwModeMask; @@ -2536,7 +2670,7 @@ pOut->resourceType = pIn->resourceType; pOut->validSwModeSet = allowedSwModeSet; pOut->canXor = (allowedSwModeSet.value & Gfx10XorSwModeMask) ? TRUE : FALSE; - pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet); + pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); pOut->validSwTypeSet = GetAllowedSwSet(allowedSwModeSet); pOut->clientPreferredSwSet = pIn->preferredSwSet; @@ -2546,6 +2680,28 @@ pOut->clientPreferredSwSet.value = AddrSwSetAll; } + // Apply optional restrictions + if ((pIn->flags.depth || pIn->flags.stencil) && msaa && m_configFlags.nonPower2MemConfig) + { + if ((allowedSwModeSet.value &= ~Gfx10BlkVarSwModeMask) != 0) + { + // MSAA depth in non power of 2 memory configs would suffer from non-local channel accesses from + // the GL2 in VAR mode, so it should be avoided. + allowedSwModeSet.value &= ~Gfx10BlkVarSwModeMask; + } + else + { + // We should still be able to use VAR for non power of 2 memory configs with MSAA z/stencil. + // But we have to suffer from low performance because there is no other choice... + ADDR_ASSERT_ALWAYS(); + } + } + + if (pIn->flags.needEquation) + { + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + } + if (allowedSwModeSet.value == Gfx10LinearSwModeMask) { pOut->swizzleMode = ADDR_SW_LINEAR; @@ -2555,15 +2711,34 @@ // Always ignore linear swizzle mode if there is other choice. allowedSwModeSet.swLinear = 0; - ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet); + ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); // Determine block size if there is 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) { - const AddrSwizzleMode swMode[AddrBlockMaxTiledType] = {ADDR_SW_256B, ADDR_SW_4KB, ADDR_SW_64KB}; - Dim3d blkDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}}; - Dim3d padDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}}; - UINT_64 padSize[AddrBlockMaxTiledType] = {0}; + AddrSwizzleMode swMode[AddrBlockMaxTiledType] = { ADDR_SW_LINEAR }; + + if (m_blockVarSizeLog2 != 0) + { + swMode[AddrBlockVar] = ADDR_SW_VAR_R_X; + } + + if (pOut->resourceType == ADDR_RSRC_TEX_3D) + { + swMode[AddrBlockThick4KB] = ADDR_SW_4KB_S; + swMode[AddrBlockThin64KB] = ADDR_SW_64KB_R_X; + swMode[AddrBlockThick64KB] = ADDR_SW_64KB_S; + } + else + { + swMode[AddrBlockMicro] = ADDR_SW_256B_S; + swMode[AddrBlockThin4KB] = ADDR_SW_4KB_S; + swMode[AddrBlockThin64KB] = ADDR_SW_64KB_S; + } + + Dim3d blkDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}, {0}, {0}, {0}}; + Dim3d padDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}, {0}, {0}, {0}}; + UINT_64 padSize[AddrBlockMaxTiledType] = {0}; const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1); @@ -2584,14 +2759,37 @@ swMode[i]); padSize[i] = ComputePadSize(&blkDim[i], width, height, numSlices, &padDim[i]); - padSize[i] = PowTwoAlign(padSize[i], sizeAlignInElement); + padSize[i] = PowTwoAlign(padSize[i] * numFrags, sizeAlignInElement); - if ((minSize == 0) || - ((padSize[i] * ratioHi) <= (minSize * ratioLow))) + if (minSize == 0) { minSize = padSize[i]; minSizeBlk = i; } + else + { + // Due to the fact that VAR block size = 16KB * m_pipes, it is possible that VAR + // block size < 64KB. And ratio[Hi/Low] logic implicitly requires iterating from + // smaller block type to bigger block type. So we have to correct comparing logic + // according to the size of existing "minimun block" and size of coming/comparing + // block. The new logic can also be useful to any future change about AddrBlockType. + if (GetBlockSizeLog2(swMode[i]) >= GetBlockSizeLog2(swMode[minSizeBlk])) + { + if ((padSize[i] * ratioHi) <= (minSize * ratioLow)) + { + minSize = padSize[i]; + minSizeBlk = i; + } + } + else + { + if ((padSize[i] * ratioLow) < (minSize * ratioHi)) + { + minSize = padSize[i]; + minSizeBlk = i; + } + } + } } } @@ -2604,21 +2802,38 @@ if (minSizeBlk == AddrBlockMicro) { + ADDR_ASSERT(pOut->resourceType != ADDR_RSRC_TEX_3D); allowedSwModeSet.value &= Gfx10Blk256BSwModeMask; } - else if (minSizeBlk == AddrBlock4KB) + else if (minSizeBlk == AddrBlockThick4KB) { + ADDR_ASSERT(pOut->resourceType == ADDR_RSRC_TEX_3D); + allowedSwModeSet.value &= Gfx10Rsrc3dThick4KBSwModeMask; + } + else if (minSizeBlk == AddrBlockThin4KB) + { + ADDR_ASSERT(pOut->resourceType != ADDR_RSRC_TEX_3D); allowedSwModeSet.value &= Gfx10Blk4KBSwModeMask; } + else if (minSizeBlk == AddrBlockThick64KB) + { + ADDR_ASSERT(pOut->resourceType == ADDR_RSRC_TEX_3D); + allowedSwModeSet.value &= Gfx10Rsrc3dThick64KBSwModeMask; + } + else if (minSizeBlk == AddrBlockThin64KB) + { + allowedSwModeSet.value &= (pOut->resourceType == ADDR_RSRC_TEX_3D) ? + Gfx10Rsrc3dThin64KBSwModeMask : Gfx10Blk64KBSwModeMask; + } else { - ADDR_ASSERT(minSizeBlk == AddrBlock64KB); - allowedSwModeSet.value &= Gfx10Blk64KBSwModeMask; + ADDR_ASSERT(minSizeBlk == AddrBlockVar); + allowedSwModeSet.value &= Gfx10BlkVarSwModeMask; } } // Block type should be determined. - ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet).value)); + ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType).value)); ADDR2_SWTYPE_SET allowedSwSet = GetAllowedSwSet(allowedSwModeSet); @@ -2659,7 +2874,9 @@ } else if (pIn->resourceType == ADDR_RSRC_TEX_3D) { - if (pIn->flags.color && GetAllowedBlockSet(allowedSwModeSet).macro64KB && allowedSwSet.sw_D) + if (pIn->flags.color && + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType).macroThick64KB && + allowedSwSet.sw_D) { allowedSwModeSet.value &= Gfx10DisplaySwModeMask; } @@ -2702,8 +2919,8 @@ // Swizzle type should be determined. ADDR_ASSERT(IsPow2(GetAllowedSwSet(allowedSwModeSet).value)); - // Determine swizzle mode now - always select the "largest" swizzle mode for a given block type + - // swizzle type combination. For example, for AddrBlock64KB + ADDR_SW_S, select SW_64KB_S_X(25) if it's + // Determine swizzle mode now. Always select the "largest" swizzle mode for a given block type + + // swizzle type combination. E.g, for AddrBlockThin64KB + ADDR_SW_S, select SW_64KB_S_X(25) if it's // available, or otherwise select SW_64KB_S_T(17) if it's available, or otherwise select SW_64KB_S(9). pOut->swizzleMode = static_cast(Log2NonPow2(allowedSwModeSet.value)); } @@ -3382,209 +3599,253 @@ /** ************************************************************************************************************************ -* Gfx10Lib::GetSwizzlePattern +* Gfx10Lib::GetSwizzlePatternInfo * * @brief * Get swizzle pattern * * @return -* Swizzle pattern +* Swizzle pattern information ************************************************************************************************************************ */ -const UINT_64* Gfx10Lib::GetSwizzlePattern( +const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( AddrSwizzleMode swizzleMode, ///< Swizzle mode AddrResourceType resourceType, ///< Resource type UINT_32 elemLog2, ///< Element size in bytes log2 UINT_32 numFrag ///< Number of fragment ) const { - const UINT_32 index = IsXor(swizzleMode) ? (m_colorBaseIndex + elemLog2) : elemLog2; - const UINT_64* pSwizzlePattern = NULL; - const UINT_32 swizzleMask = 1 << swizzleMode; + const UINT_32 index = IsXor(swizzleMode) ? (m_colorBaseIndex + elemLog2) : elemLog2; + const ADDR_SW_PATINFO* patInfo = NULL; + const UINT_32 swizzleMask = 1 << swizzleMode; - if (IsLinear(swizzleMode)) + if (IsLinear(swizzleMode) == FALSE) { - pSwizzlePattern = NULL; - } - else if (resourceType == ADDR_RSRC_TEX_3D) - { - ADDR_ASSERT(numFrag == 1); - - if ((swizzleMask & Gfx10Rsrc3dSwModeMask) == 0) + if (IsBlockVariable(swizzleMode)) { - pSwizzlePattern = NULL; - } - else if (IsRtOptSwizzle(swizzleMode)) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_R_X_1xaa_RBPLUS[index] : SW_64K_R_X_1xaa[index]; - } - else if (IsZOrderSwizzle(swizzleMode)) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_Z_X_1xaa_RBPLUS[index] : SW_64K_Z_X_1xaa[index]; - } - else if (IsDisplaySwizzle(resourceType, swizzleMode)) - { - ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_D_X); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_D3_X_RBPLUS[index] : SW_64K_D3_X[index]; - } - else - { - ADDR_ASSERT(IsStandardSwizzle(resourceType, swizzleMode)); - - if (IsBlock4kb(swizzleMode)) - { - if (swizzleMode == ADDR_SW_4KB_S) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_S3_RBPLUS[index] : SW_4K_S3[index]; - } - else - { - ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_S_X); - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_S3_X_RBPLUS[index] : SW_4K_S3_X[index]; - } - } - else + if (m_blockVarSizeLog2 != 0) { - if (swizzleMode == ADDR_SW_64KB_S) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S3_RBPLUS[index] : SW_64K_S3[index]; - } - else if (swizzleMode == ADDR_SW_64KB_S_X) + ADDR_ASSERT(m_settings.supportRbPlus); + + if (IsRtOptSwizzle(swizzleMode)) { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S3_X_RBPLUS[index] : SW_64K_S3_X[index]; + if (numFrag == 1) + { + patInfo = SW_VAR_R_X_1xaa_RBPLUS_PATINFO; + } + else if (numFrag == 2) + { + patInfo = SW_VAR_R_X_2xaa_RBPLUS_PATINFO; + } + else if (numFrag == 4) + { + patInfo = SW_VAR_R_X_4xaa_RBPLUS_PATINFO; + } + else + { + ADDR_ASSERT(numFrag == 8); + patInfo = SW_VAR_R_X_8xaa_RBPLUS_PATINFO; + } } - else + else if (IsZOrderSwizzle(swizzleMode)) { - ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_S_T); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S3_T_RBPLUS[index] : SW_64K_S3_T[index]; + if (numFrag == 1) + { + patInfo = SW_VAR_Z_X_1xaa_RBPLUS_PATINFO; + } + else if (numFrag == 2) + { + patInfo = SW_VAR_Z_X_2xaa_RBPLUS_PATINFO; + } + else if (numFrag == 4) + { + patInfo = SW_VAR_Z_X_4xaa_RBPLUS_PATINFO; + } + else + { + ADDR_ASSERT(numFrag == 8); + patInfo = SW_VAR_Z_X_8xaa_RBPLUS_PATINFO; + } } } } - - } - else - { - if ((swizzleMask & Gfx10Rsrc2dSwModeMask) == 0) - { - pSwizzlePattern = NULL; - } - else if (IsBlock256b(swizzleMode)) - { - if (swizzleMode == ADDR_SW_256B_S) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_256_S_RBPLUS[index] : SW_256_S[index]; - } - else - { - ADDR_ASSERT(swizzleMode == ADDR_SW_256B_D); - pSwizzlePattern = m_settings.supportRbPlus ? SW_256_D_RBPLUS[index] : SW_256_D[index]; - } - } - else if (IsBlock4kb(swizzleMode)) + else if (resourceType == ADDR_RSRC_TEX_3D) { - if (IsStandardSwizzle(resourceType, swizzleMode)) + ADDR_ASSERT(numFrag == 1); + + if ((swizzleMask & Gfx10Rsrc3dSwModeMask) != 0) { - if (swizzleMode == ADDR_SW_4KB_S) + if (IsRtOptSwizzle(swizzleMode)) { - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_S_RBPLUS[index] : SW_4K_S[index]; + patInfo = m_settings.supportRbPlus ? SW_64K_R_X_1xaa_RBPLUS_PATINFO : SW_64K_R_X_1xaa_PATINFO; } - else + else if (IsZOrderSwizzle(swizzleMode)) { - ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_S_X); - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_S_X_RBPLUS[index] : SW_4K_S_X[index]; + patInfo = m_settings.supportRbPlus ? SW_64K_Z_X_1xaa_RBPLUS_PATINFO : SW_64K_Z_X_1xaa_PATINFO; } - } - else - { - if (swizzleMode == ADDR_SW_4KB_D) + else if (IsDisplaySwizzle(resourceType, swizzleMode)) { - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_D_RBPLUS[index] : SW_4K_D[index]; + ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_D_X); + patInfo = m_settings.supportRbPlus ? SW_64K_D3_X_RBPLUS_PATINFO : SW_64K_D3_X_PATINFO; } else { - ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_D_X); - pSwizzlePattern = m_settings.supportRbPlus ? SW_4K_D_X_RBPLUS[index] : SW_4K_D_X[index]; + ADDR_ASSERT(IsStandardSwizzle(resourceType, swizzleMode)); + + if (IsBlock4kb(swizzleMode)) + { + if (swizzleMode == ADDR_SW_4KB_S) + { + patInfo = m_settings.supportRbPlus ? SW_4K_S3_RBPLUS_PATINFO : SW_4K_S3_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_S_X); + patInfo = m_settings.supportRbPlus ? SW_4K_S3_X_RBPLUS_PATINFO : SW_4K_S3_X_PATINFO; + } + } + else + { + if (swizzleMode == ADDR_SW_64KB_S) + { + patInfo = m_settings.supportRbPlus ? SW_64K_S3_RBPLUS_PATINFO : SW_64K_S3_PATINFO; + } + else if (swizzleMode == ADDR_SW_64KB_S_X) + { + patInfo = m_settings.supportRbPlus ? SW_64K_S3_X_RBPLUS_PATINFO : SW_64K_S3_X_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_S_T); + patInfo = m_settings.supportRbPlus ? SW_64K_S3_T_RBPLUS_PATINFO : SW_64K_S3_T_PATINFO; + } + } } } } else { - if (IsRtOptSwizzle(swizzleMode)) - { - if (numFrag == 1) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_R_X_1xaa_RBPLUS[index] : SW_64K_R_X_1xaa[index]; - } - else if (numFrag == 2) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_R_X_2xaa_RBPLUS[index] : SW_64K_R_X_2xaa[index]; - } - else if (numFrag == 4) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_R_X_4xaa_RBPLUS[index] : SW_64K_R_X_4xaa[index]; - } - else - { - ADDR_ASSERT(numFrag == 8); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_R_X_8xaa_RBPLUS[index] : SW_64K_R_X_8xaa[index]; - } - } - else if (IsZOrderSwizzle(swizzleMode)) - { - if (numFrag == 1) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_Z_X_1xaa_RBPLUS[index] : SW_64K_Z_X_1xaa[index]; - } - else if (numFrag == 2) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_Z_X_2xaa_RBPLUS[index] : SW_64K_Z_X_2xaa[index]; - } - else if (numFrag == 4) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_Z_X_4xaa_RBPLUS[index] : SW_64K_Z_X_4xaa[index]; - } - else - { - ADDR_ASSERT(numFrag == 8); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_Z_X_8xaa_RBPLUS[index] : SW_64K_Z_X_8xaa[index]; - } - } - else if (IsDisplaySwizzle(resourceType, swizzleMode)) + if ((swizzleMask & Gfx10Rsrc2dSwModeMask) != 0) { - if (swizzleMode == ADDR_SW_64KB_D) + if (IsBlock256b(swizzleMode)) { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_D_RBPLUS[index] : SW_64K_D[index]; - } - else if (swizzleMode == ADDR_SW_64KB_D_X) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_D_X_RBPLUS[index] : SW_64K_D_X[index]; - } - else - { - ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_D_T); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_D_T_RBPLUS[index] : SW_64K_D_T[index]; - } - } - else - { - if (swizzleMode == ADDR_SW_64KB_S) - { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S_RBPLUS[index] : SW_64K_S[index]; + if (swizzleMode == ADDR_SW_256B_S) + { + patInfo = m_settings.supportRbPlus ? SW_256_S_RBPLUS_PATINFO : SW_256_S_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_256B_D); + patInfo = m_settings.supportRbPlus ? SW_256_D_RBPLUS_PATINFO : SW_256_D_PATINFO; + } } - else if (swizzleMode == ADDR_SW_64KB_S_X) + else if (IsBlock4kb(swizzleMode)) { - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S_X_RBPLUS[index] : SW_64K_S_X[index]; + if (IsStandardSwizzle(resourceType, swizzleMode)) + { + if (swizzleMode == ADDR_SW_4KB_S) + { + patInfo = m_settings.supportRbPlus ? SW_4K_S_RBPLUS_PATINFO : SW_4K_S_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_S_X); + patInfo = m_settings.supportRbPlus ? SW_4K_S_X_RBPLUS_PATINFO : SW_4K_S_X_PATINFO; + } + } + else + { + if (swizzleMode == ADDR_SW_4KB_D) + { + patInfo = m_settings.supportRbPlus ? SW_4K_D_RBPLUS_PATINFO : SW_4K_D_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_D_X); + patInfo = m_settings.supportRbPlus ? SW_4K_D_X_RBPLUS_PATINFO : SW_4K_D_X_PATINFO; + } + } } else { - ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_S_T); - pSwizzlePattern = m_settings.supportRbPlus ? SW_64K_S_T_RBPLUS[index] : SW_64K_S_T[index]; + if (IsRtOptSwizzle(swizzleMode)) + { + if (numFrag == 1) + { + patInfo = m_settings.supportRbPlus ? SW_64K_R_X_1xaa_RBPLUS_PATINFO : SW_64K_R_X_1xaa_PATINFO; + } + else if (numFrag == 2) + { + patInfo = m_settings.supportRbPlus ? SW_64K_R_X_2xaa_RBPLUS_PATINFO : SW_64K_R_X_2xaa_PATINFO; + } + else if (numFrag == 4) + { + patInfo = m_settings.supportRbPlus ? SW_64K_R_X_4xaa_RBPLUS_PATINFO : SW_64K_R_X_4xaa_PATINFO; + } + else + { + ADDR_ASSERT(numFrag == 8); + patInfo = m_settings.supportRbPlus ? SW_64K_R_X_8xaa_RBPLUS_PATINFO : SW_64K_R_X_8xaa_PATINFO; + } + } + else if (IsZOrderSwizzle(swizzleMode)) + { + if (numFrag == 1) + { + patInfo = m_settings.supportRbPlus ? SW_64K_Z_X_1xaa_RBPLUS_PATINFO : SW_64K_Z_X_1xaa_PATINFO; + } + else if (numFrag == 2) + { + patInfo = m_settings.supportRbPlus ? SW_64K_Z_X_2xaa_RBPLUS_PATINFO : SW_64K_Z_X_2xaa_PATINFO; + } + else if (numFrag == 4) + { + patInfo = m_settings.supportRbPlus ? SW_64K_Z_X_4xaa_RBPLUS_PATINFO : SW_64K_Z_X_4xaa_PATINFO; + } + else + { + ADDR_ASSERT(numFrag == 8); + patInfo = m_settings.supportRbPlus ? SW_64K_Z_X_8xaa_RBPLUS_PATINFO : SW_64K_Z_X_8xaa_PATINFO; + } + } + else if (IsDisplaySwizzle(resourceType, swizzleMode)) + { + if (swizzleMode == ADDR_SW_64KB_D) + { + patInfo = m_settings.supportRbPlus ? SW_64K_D_RBPLUS_PATINFO : SW_64K_D_PATINFO; + } + else if (swizzleMode == ADDR_SW_64KB_D_X) + { + patInfo = m_settings.supportRbPlus ? SW_64K_D_X_RBPLUS_PATINFO : SW_64K_D_X_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_D_T); + patInfo = m_settings.supportRbPlus ? SW_64K_D_T_RBPLUS_PATINFO : SW_64K_D_T_PATINFO; + } + } + else + { + if (swizzleMode == ADDR_SW_64KB_S) + { + patInfo = m_settings.supportRbPlus ? SW_64K_S_RBPLUS_PATINFO : SW_64K_S_PATINFO; + } + else if (swizzleMode == ADDR_SW_64KB_S_X) + { + patInfo = m_settings.supportRbPlus ? SW_64K_S_X_RBPLUS_PATINFO : SW_64K_S_X_PATINFO; + } + else + { + ADDR_ASSERT(swizzleMode == ADDR_SW_64KB_S_T); + patInfo = m_settings.supportRbPlus ? SW_64K_S_T_RBPLUS_PATINFO : SW_64K_S_T_PATINFO; + } + } } } } } - return pSwizzlePattern; + return (patInfo != NULL) ? &patInfo[index] : NULL; } /** @@ -3699,23 +3960,29 @@ if (localIn.numFrags > 1) { - const UINT_64* pPattern = GetSwizzlePattern(pIn->swizzleMode, - pIn->resourceType, - elemLog2, - localIn.numFrags); + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(pIn->swizzleMode, + pIn->resourceType, + elemLog2, + localIn.numFrags); - if (pPattern != NULL) + if (pPatInfo != NULL) { const UINT_32 pb = localOut.pitch / localOut.blockWidth; const UINT_32 yb = pIn->y / localOut.blockHeight; const UINT_32 xb = pIn->x / localOut.blockWidth; const UINT_64 blkIdx = yb * pb + xb; - const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(pPattern, - blkSizeLog2, - pIn->x, - pIn->y, - pIn->slice, - pIn->sample); + + ADDR_BIT_SETTING fullSwizzlePattern[20]; + GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); + + const UINT_32 blkOffset = + ComputeOffsetFromSwizzlePattern(reinterpret_cast(fullSwizzlePattern), + blkSizeLog2, + pIn->x, + pIn->y, + pIn->slice, + pIn->sample); + pOut->addr = (localOut.sliceSize * pIn->slice) + (blkIdx << blkSizeLog2) + (blkOffset ^ pipeBankXor); @@ -3775,7 +4042,7 @@ */ UINT_32 Gfx10Lib::HwlComputeMaxBaseAlignments() const { - return GetBlockSize(ADDR_SW_64KB); + return m_blockVarSizeLog2 ? Max(Size64K, 1u << m_blockVarSizeLog2) : Size64K; } /** @@ -3790,26 +4057,47 @@ */ UINT_32 Gfx10Lib::HwlComputeMaxMetaBaseAlignments() const { - // Max base alignment for Htile - Dim3d metaBlk = {0}; - const UINT_32 metaBlkSize = GetMetaBlkSize(Gfx10DataDepthStencil, - ADDR_RSRC_TEX_2D, - ADDR_SW_64KB_Z_X, - 0, - 0, - TRUE, - &metaBlk); - - const UINT_32 maxBaseAlignHtile = Max(metaBlkSize, 1u << (m_pipesLog2 + 11u)); - - // Max base alignment for Cmask - const UINT_32 maxBaseAlignCmask = GetMetaBlkSize(Gfx10DataFmask, - ADDR_RSRC_TEX_2D, - ADDR_SW_64KB_Z_X, - 0, - 0, - TRUE, - &metaBlk); + Dim3d metaBlk; + + const AddrSwizzleMode ValidSwizzleModeForXmask[] = + { + ADDR_SW_64KB_Z_X, + m_blockVarSizeLog2 ? ADDR_SW_VAR_Z_X : ADDR_SW_64KB_Z_X, + }; + + UINT_32 maxBaseAlignHtile = 0; + UINT_32 maxBaseAlignCmask = 0; + + for (UINT_32 swIdx = 0; swIdx < sizeof(ValidSwizzleModeForXmask) / sizeof(ValidSwizzleModeForXmask[0]); swIdx++) + { + for (UINT_32 bppLog2 = 0; bppLog2 < 3; bppLog2++) + { + for (UINT_32 numFragLog2 = 0; numFragLog2 < 4; numFragLog2++) + { + // Max base alignment for Htile + const UINT_32 metaBlkSizeHtile = GetMetaBlkSize(Gfx10DataDepthStencil, + ADDR_RSRC_TEX_2D, + ValidSwizzleModeForXmask[swIdx], + bppLog2, + numFragLog2, + TRUE, + &metaBlk); + + maxBaseAlignHtile = Max(maxBaseAlignHtile, metaBlkSizeHtile); + } + } + + // Max base alignment for Cmask + const UINT_32 metaBlkSizeCmask = GetMetaBlkSize(Gfx10DataFmask, + ADDR_RSRC_TEX_2D, + ValidSwizzleModeForXmask[swIdx], + 0, + 0, + TRUE, + &metaBlk); + + maxBaseAlignCmask = Max(maxBaseAlignCmask, metaBlkSizeCmask); + } // Max base alignment for 2D Dcc const AddrSwizzleMode ValidSwizzleModeForDcc2D[] = @@ -3817,6 +4105,7 @@ ADDR_SW_64KB_S_X, ADDR_SW_64KB_D_X, ADDR_SW_64KB_R_X, + m_blockVarSizeLog2 ? ADDR_SW_VAR_R_X : ADDR_SW_64KB_R_X, }; UINT_32 maxBaseAlignDcc2D = 0; @@ -3847,6 +4136,7 @@ ADDR_SW_64KB_S_X, ADDR_SW_64KB_D_X, ADDR_SW_64KB_R_X, + m_blockVarSizeLog2 ? ADDR_SW_VAR_R_X : ADDR_SW_64KB_R_X, }; UINT_32 maxBaseAlignDcc3D = 0; diff -Nru mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10addrlib.h mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10addrlib.h --- mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10addrlib.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10addrlib.h 2020-06-12 01:21:16.000000000 +0000 @@ -36,6 +36,7 @@ #include "addrlib2.h" #include "coord.h" +#include "gfx10SwizzlePattern.h" namespace Addr { @@ -93,7 +94,11 @@ (1u << ADDR_SW_64KB_D_X) | (1u << ADDR_SW_64KB_R_X); -const UINT_32 Gfx10ZSwModeMask = (1u << ADDR_SW_64KB_Z_X); +const UINT_32 Gfx10BlkVarSwModeMask = (1u << ADDR_SW_VAR_Z_X) | + (1u << ADDR_SW_VAR_R_X); + +const UINT_32 Gfx10ZSwModeMask = (1u << ADDR_SW_64KB_Z_X) | + (1u << ADDR_SW_VAR_Z_X); const UINT_32 Gfx10StandardSwModeMask = (1u << ADDR_SW_256B_S) | (1u << ADDR_SW_4KB_S) | @@ -109,14 +114,16 @@ (1u << ADDR_SW_4KB_D_X) | (1u << ADDR_SW_64KB_D_X); -const UINT_32 Gfx10RenderSwModeMask = (1u << ADDR_SW_64KB_R_X); +const UINT_32 Gfx10RenderSwModeMask = (1u << ADDR_SW_64KB_R_X) | + (1u << ADDR_SW_VAR_R_X); const UINT_32 Gfx10XSwModeMask = (1u << ADDR_SW_4KB_S_X) | (1u << ADDR_SW_4KB_D_X) | (1u << ADDR_SW_64KB_Z_X) | (1u << ADDR_SW_64KB_S_X) | (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_64KB_R_X); + (1u << ADDR_SW_64KB_R_X) | + Gfx10BlkVarSwModeMask; const UINT_32 Gfx10TSwModeMask = (1u << ADDR_SW_64KB_S_T) | (1u << ADDR_SW_64KB_D_T); @@ -131,7 +138,8 @@ const UINT_32 Gfx10Rsrc2dSwModeMask = Gfx10LinearSwModeMask | Gfx10Blk256BSwModeMask | Gfx10Blk4KBSwModeMask | - Gfx10Blk64KBSwModeMask; + Gfx10Blk64KBSwModeMask | + Gfx10BlkVarSwModeMask; const UINT_32 Gfx10Rsrc3dSwModeMask = (1u << ADDR_SW_LINEAR) | (1u << ADDR_SW_4KB_S) | @@ -141,14 +149,23 @@ (1u << ADDR_SW_64KB_Z_X) | (1u << ADDR_SW_64KB_S_X) | (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_64KB_R_X); + (1u << ADDR_SW_64KB_R_X) | + Gfx10BlkVarSwModeMask; const UINT_32 Gfx10Rsrc2dPrtSwModeMask = (Gfx10Blk4KBSwModeMask | Gfx10Blk64KBSwModeMask) & ~Gfx10XSwModeMask; const UINT_32 Gfx10Rsrc3dPrtSwModeMask = Gfx10Rsrc2dPrtSwModeMask & ~Gfx10DisplaySwModeMask; -const UINT_32 Gfx10Rsrc3dThinSwModeMask = (1u << ADDR_SW_64KB_Z_X) | - (1u << ADDR_SW_64KB_R_X); +const UINT_32 Gfx10Rsrc3dThin64KBSwModeMask = (1u << ADDR_SW_64KB_Z_X) | + (1u << ADDR_SW_64KB_R_X); + +const UINT_32 Gfx10Rsrc3dThinSwModeMask = Gfx10Rsrc3dThin64KBSwModeMask | Gfx10BlkVarSwModeMask; + +const UINT_32 Gfx10Rsrc3dThickSwModeMask = Gfx10Rsrc3dSwModeMask & ~(Gfx10Rsrc3dThinSwModeMask | Gfx10LinearSwModeMask); + +const UINT_32 Gfx10Rsrc3dThick4KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10Blk4KBSwModeMask; + +const UINT_32 Gfx10Rsrc3dThick64KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10Blk64KBSwModeMask; const UINT_32 Gfx10MsaaSwModeMask = Gfx10ZSwModeMask | Gfx10RenderSwModeMask; @@ -290,6 +307,14 @@ const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + virtual UINT_32 HwlComputeMaxBaseAlignments() const; + + virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const; + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn); + + virtual ChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision); + // Initialize equation table VOID InitEquationTable(); @@ -309,6 +334,7 @@ const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; +private: UINT_32 ComputeOffsetFromSwizzlePattern( const UINT_64* pPattern, UINT_32 numBits, @@ -351,13 +377,6 @@ return compressBlkDim; } - static UINT_32 ShiftCeil( - UINT_32 a, - UINT_32 b) - { - return (a >> b) + (((a & ((1 << b) - 1)) != 0) ? 1 : 0); - } - static void GetMipSize( UINT_32 mip0Width, UINT_32 mip0Height, @@ -376,18 +395,39 @@ } } - const UINT_64* GetSwizzlePattern( + const ADDR_SW_PATINFO* GetSwizzlePatternInfo( AddrSwizzleMode swizzleMode, AddrResourceType resourceType, UINT_32 log2Elem, UINT_32 numFrag) const; + VOID GetSwizzlePatternFromPatternInfo( + const ADDR_SW_PATINFO* pPatInfo, + ADDR_BIT_SETTING (&pSwizzle)[20]) const + { + memcpy(pSwizzle, + GFX10_SW_PATTERN_NIBBLE01[pPatInfo->nibble01Idx], + sizeof(GFX10_SW_PATTERN_NIBBLE01[pPatInfo->nibble01Idx])); + + memcpy(&pSwizzle[8], + GFX10_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx], + sizeof(GFX10_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx])); + + memcpy(&pSwizzle[12], + GFX10_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx], + sizeof(GFX10_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx])); + + memcpy(&pSwizzle[16], + GFX10_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx], + sizeof(GFX10_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx])); + } + VOID ConvertSwizzlePatternToEquation( - UINT_32 elemLog2, - AddrResourceType rsrcType, - AddrSwizzleMode swMode, - const UINT_64* pPattern, - ADDR_EQUATION* pEquation) const; + UINT_32 elemLog2, + AddrResourceType rsrcType, + AddrSwizzleMode swMode, + const ADDR_SW_PATINFO* pPatInfo, + ADDR_EQUATION* pEquation) const; static INT_32 GetMetaElementSizeLog2(Gfx10DataType dataType); @@ -429,14 +469,6 @@ BOOL_32 pipeAlign, Dim3d* pBlock) const; - BOOL_32 IsEquationCompatibleThick( - AddrResourceType resourceType, - AddrSwizzleMode swizzleMode) const - { - return IsThick(resourceType, swizzleMode) && - ((m_settings.supportRbPlus == 0) || (swizzleMode != ADDR_SW_64KB_D_X)); - } - INT_32 GetPipeRotateAmount( AddrResourceType resourceType, AddrSwizzleMode swizzleMode) const; @@ -460,61 +492,29 @@ } - static const Dim3d Block256_3d[MaxNumOfBpp]; - static const Dim3d Block64K_3d[MaxNumOfBpp]; - static const Dim3d Block4K_3d[MaxNumOfBpp]; - static const Dim3d Block64K_Log2_3d[MaxNumOfBpp]; - static const Dim3d Block4K_Log2_3d[MaxNumOfBpp]; - - static const Dim2d Block64K_2d[MaxNumOfBpp]; - static const Dim2d Block4K_2d[MaxNumOfBpp]; - - static const Dim2d Block64K_Log2_2d[MaxNumOfBpp]; - static const Dim2d Block4K_Log2_2d[MaxNumOfBpp]; - - static const SwizzleModeFlags SwizzleModeTable[ADDR_SW_MAX_TYPE]; - - // Max number of swizzle mode supported for equation - static const UINT_32 MaxSwMode = 32; - // Max number of resource type (2D/3D) supported for equation - static const UINT_32 MaxRsrcType = 2; - // Max number of bpp (8bpp/16bpp/32bpp/64bpp/128bpp) - static const UINT_32 MaxElementBytesLog2 = 5; - // Almost all swizzle mode + resource type support equation - static const UINT_32 EquationTableSize = MaxElementBytesLog2 * MaxSwMode * MaxRsrcType; - // Equation table - ADDR_EQUATION m_equationTable[EquationTableSize]; - - // Number of equation entries in the table - UINT_32 m_numEquations; - // Equation lookup table according to bpp and tile index - UINT_32 m_equationLookupTable[MaxRsrcType][MaxSwMode][MaxElementBytesLog2]; - // Number of packers log2 - UINT_32 m_numPkrLog2; - // Number of shader array log2 - UINT_32 m_numSaLog2; - -private: - virtual UINT_32 HwlComputeMaxBaseAlignments() const; - - virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const; - - virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn); - - virtual ChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision); - BOOL_32 IsValidDisplaySwizzleMode(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; UINT_32 GetMaxNumMipsInTail(UINT_32 blockSizeLog2, BOOL_32 isThin) const; - static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet) + static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet, AddrResourceType rsrcType) { ADDR2_BLOCK_SET allowedBlockSet = {}; - allowedBlockSet.micro = (allowedSwModeSet.value & Gfx10Blk256BSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macro4KB = (allowedSwModeSet.value & Gfx10Blk4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macro64KB = (allowedSwModeSet.value & Gfx10Blk64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.linear = (allowedSwModeSet.value & Gfx10LinearSwModeMask) ? TRUE : FALSE; + allowedBlockSet.micro = (allowedSwModeSet.value & Gfx10Blk256BSwModeMask) ? TRUE : FALSE; + allowedBlockSet.linear = (allowedSwModeSet.value & Gfx10LinearSwModeMask) ? TRUE : FALSE; + allowedBlockSet.var = (allowedSwModeSet.value & Gfx10BlkVarSwModeMask) ? TRUE : FALSE; + + if (rsrcType == ADDR_RSRC_TEX_3D) + { + allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx10Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx10Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx10Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; + } + else + { + allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx10Blk4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx10Blk64KBSwModeMask) ? TRUE : FALSE; + } return allowedBlockSet; } @@ -554,12 +554,26 @@ BOOL_32 ValidateNonSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; BOOL_32 ValidateSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; - static const UINT_32 ColumnBits = 2; - static const UINT_32 BankBits = 4; + static const UINT_32 ColumnBits = 2; + static const UINT_32 BankBits = 4; + static const UINT_32 UnalignedDccType = 3; + + static const Dim3d Block256_3d[MaxNumOfBpp]; + static const Dim3d Block64K_Log2_3d[MaxNumOfBpp]; + static const Dim3d Block4K_Log2_3d[MaxNumOfBpp]; + + static const SwizzleModeFlags SwizzleModeTable[ADDR_SW_MAX_TYPE]; + + // Number of packers log2 + UINT_32 m_numPkrLog2; + // Number of shader array log2 + UINT_32 m_numSaLog2; Gfx10ChipSettings m_settings; - UINT_32 m_colorBaseIndex; - UINT_32 m_htileBaseIndex; + + UINT_32 m_colorBaseIndex; + UINT_32 m_xmaskBaseIndex; + UINT_32 m_dccBaseIndex; }; } // V2 diff -Nru mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h --- mesa-19.2.8/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h 2020-06-12 01:21:16.000000000 +0000 @@ -57,6 +57,20 @@ /** ************************************************************************************************************************ +* @brief Swizzle pattern information +************************************************************************************************************************ +*/ +struct ADDR_SW_PATINFO +{ + UINT_8 maxItemCount; + UINT_8 nibble01Idx; + UINT_16 nibble2Idx; + UINT_16 nibble3Idx; + UINT_8 nibble4Idx; +}; + +/** +************************************************************************************************************************ * InitBit * * @brief @@ -105,7322 +119,5998 @@ const UINT_64 S1 = InitBit(3, 1); const UINT_64 S2 = InitBit(3, 2); -// Color data swizzle pattern -const UINT_64 SW_256_S[][8]= +const ADDR_SW_PATINFO SW_256_S_PATINFO[] = { - {X0, X1, X2, X3, Y0, Y1, Y2, Y3 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2 }, - {0, 0, 0, X0, Y0, Y1, X1, X2 }, - {0, 0, 0, 0, Y0, Y1, X0, X1 }, -}; - -const UINT_64 SW_256_D[][8]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2 }, - {0, 0, 0, X0, Y0, X1, X2, Y1 }, - {0, 0, 0, 0, X0, Y0, X1, Y1 }, -}; - -const UINT_64 SW_4K_S[][12]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, -}; - -const UINT_64 SW_4K_D[][12]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, -}; - -const UINT_64 SW_4K_S_X[][12]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z0 ^ Y2 ^ X3, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z0 ^ X2 ^ Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, -}; - -const UINT_64 SW_4K_D_X[][12]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ Y2 ^ X3, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X2 ^ Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, -}; - -const UINT_64 SW_64K_S[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_D[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_S_T[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_D_T[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_S_X[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z0 ^ Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z0 ^ X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z4 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z4 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ Z5 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z5 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z5 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z5 ^ X8, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z5 ^ X7, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Z5 ^Y10, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z5 ^ Y9, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z5 ^ Y9, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Z5 ^ Y8, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z5 ^ Y8, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ Z5 ^X10, Z4 ^ X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z5 ^X10, Z4 ^ X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ Z5 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ Z5 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ Z5 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, -}; - -const UINT_64 SW_64K_D_X[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z4 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z4 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ Z5 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z5 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z5 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z5 ^ X8, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z5 ^ X7, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Z5 ^Y10, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z5 ^ Y9, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z5 ^ Y9, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Z5 ^ Y8, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z5 ^ Y8, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ Z5 ^X10, Z4 ^ X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z5 ^X10, Z4 ^ X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ Z5 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ Z5 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ Z5 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, -}; - -const UINT_64 SW_64K_R_X_1xaa[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y2, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3, Y4, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X4, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y2, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y4, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Y5, Z0 ^ X3 ^ Y3, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y4, Z0 ^ X3 ^ Y3, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y7, X7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_2xaa[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y2, X4, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y2, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X4, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, Y4, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X4, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Y5, Z0 ^ X3 ^ Y3, X5, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y4, Z0 ^ X3 ^ Y3, X5, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y6, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y7, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y7, S0 ^ Y8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_4xaa[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, X2, Y2, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y2, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, Y4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, X3, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y4, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, Y2, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X2, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Y5, Z0 ^ X3 ^ Y3, X5, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y4, Z0 ^ X3 ^ Y3, X5, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y6, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, S0 ^ Y5, S1 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, S0 ^ Y5, S1 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, X3, Y4, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y3, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X6, S0 ^ Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, S1 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, S1 ^ Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_8xaa[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y2, X3, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, Y2, X3, Y4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, X2, Y2, X3, S0 ^ Y6, S1 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y4, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, Y3, X4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, Y2, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, X2, Y2, X3, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, Y4, S0 ^ Y7, S1 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, X3, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X2, Y2, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, S0 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7, Y4, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y2, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, X3, Y4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, Y2, X3, S0 ^ Y6, S1 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, X4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, X3, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, Y2, X3, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S0 ^ Y7, S1 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, Y4, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, S1 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7, S0 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X4, Y5, Z0 ^ X3 ^ Y3, X5, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y4, Z0 ^ X3 ^ Y3, X5, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y5, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y6, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, S0 ^ Y5, S1 ^ Y6, S2 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, X5, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ Y7, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, X4, Y5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, Y4, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, X3, S0 ^ Y6, S1 ^ Y7 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, Y6, S0 ^ Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, S1 ^ Y8, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, S1 ^ Y7, S2 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, S0 ^ Y6, S1 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, Y4, S0 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, X3, S0 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, S1 ^ Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S2 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S1 ^ Y8 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y4, S0 ^ Y7, S1 ^ Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4, S0 ^ Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_1xaa[][16]= -{ - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y2, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3, Y4, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X2, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X4, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y2, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y4, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X4, Y5, Z0 ^ X3 ^ Y3, X5, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, Y4, Z0 ^ X3 ^ Y3, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y5, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y6, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X5, Y6, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, Y5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y6, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y7, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, X6, Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, X4, Y6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y7, X7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X7, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_2xaa[][16]= -{ - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X2, X3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, X3, Y4, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3, Y4, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, Y3, X4, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y1, X3, Y4, X2 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y4, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X3, Y4, Y1 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, Y4, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y4 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, X3, Y4 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X3, Y4, X2 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y4, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y4, Y1 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7, X4 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y4 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, Y4 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X4, Z0 ^ X3 ^ Y3, Y5, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y5, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X5, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, Y1 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y6, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y2 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y4, X2 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X6, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y1 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Y7, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z5 ^ X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_4xaa[][16]= -{ - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3, X4, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y2, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, X3, X4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z0 ^ X3 ^ Y3, X3, Y2, X4, Y4, X5, X2 ^ Y5, Y1 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, X4, X1 ^ Y5, Y1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X2, Y3, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, X4, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y4, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X2, Y3, X4, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y4, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X4, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y3, X4, Y1 ^ Y6, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, X3, Y4, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, X3, Y4, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y3, X4 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3, X4, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y2, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Z0 ^ X3 ^ Y3, X4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Z0 ^ X3 ^ Y3, Y2, X4, Y4, X5, X2 ^ Y5, Y1 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, X1 ^ Y5, Y1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X4, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, Y3, X4, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y4, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X4, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X4, Y1 ^ Y6, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, Y4, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, Y4, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, X4 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, X4 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3, X4, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y2, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, X4, Z0 ^ X3 ^ Y3, Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y2, Z0 ^ X3 ^ Y3, X4, Y4, X5, X2 ^ Y5, Y1 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, X1 ^ Y5, Y1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y2, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y1 ^ Y5, X1 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y5, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, Y1 ^ Y5, X2 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X4, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y1 ^ Y6, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, X6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z4 ^ X6 ^ Y6, Y2 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X2 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z5 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y3, X4, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_8xaa[][16]= -{ - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y3, X3, Y4, X4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y2, X2, Y3, X3, Y4, Y0 ^ Y5, Y1 ^ Y6, X1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z0 ^ X3 ^ Y3, Y2, X3, Y4, X4, X1 ^ Y5, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z0 ^ X3 ^ Y3, X2, Y2, X3, Y4, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y3, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X2, Y2, X3, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y4, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, Y3, X4, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ X5 ^ Y5, Y2, X3, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y3, X4, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y3, X4, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X3, Y4, X2 ^ Y6, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y3, X4, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X3, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, X3, Y4, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X3, Y4, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, Y3, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, X2 ^ Y7, Y1 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y4 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, X3, Y4 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y3, X3, Y4, X4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y2, X2, Y3, X3, Y4, Y0 ^ Y5, Y1 ^ Y6, X1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z0 ^ X3 ^ Y3, Y4, X4, Y5, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, X1 ^ Y5, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Z0 ^ X3 ^ Y3, Y2, X3, Y4, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y3, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2, X3, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y4, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X4, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ X5 ^ Y5, X3, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, X4, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, X4, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y4, X2 ^ Y6, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X4, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, Y4, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y4, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X4, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ Y7, Y1 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y4 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, Y4 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X4 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, X3, Y4, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X3, Y4, X4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y3, X3, Y4, X4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y3, X3, Y4, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y2, X2, Y3, X3, Y4, Y0 ^ Y5, Y1 ^ Y6, X1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4, Z0 ^ X3 ^ Y3, X4, Y5, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y2, X3, Z0 ^ X3 ^ Y3, Y4, X4, X1 ^ Y5, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, X5, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y5, Y1 ^ Y6, Y2 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X4, Y1 ^ Y5, X1 ^ Y6, Y2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X3, Y0 ^ Y5, X1 ^ Y6, Y1 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X2 ^ Y6, Y2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y2, X3, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ X5 ^ Y5, X1 ^ Y6, Y1 ^ Y7, X2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y2 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2 ^ Y6, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y1 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y1 ^ Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ Y7, Y2 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z4 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, Y2 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y1 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y1 ^ X6 ^ Y6, X2 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, X2 ^ Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y1 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z4 ^ X5 ^ Y8, Z3 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y3, X4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y3, X4, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y2 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y1 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y1 ^ Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y0 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_4K_S3[][12]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, -}; - -const UINT_64 SW_4K_S3_X[][12]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, -}; - -const UINT_64 SW_64K_S3[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, -}; - -const UINT_64 SW_64K_S3_X[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X6 ^ Z6, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X6 ^ Z6, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X5 ^ Z6, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X8 ^ Z8, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X7 ^ Z8, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X7 ^ Z7, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X7 ^ Z7, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X6 ^ Z7, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y8 ^ Z8, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y8 ^ Z8, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z7 ^ Y8, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y7 ^ Z7, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y7 ^ Z7, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y8 ^ X9, Y3 ^ X8 ^ Z8, X4 ^ Y7 ^ Z7, Z4 ^ Y6 ^ X7, Y4 ^ X6 ^ Z6, X5 ^ Y5 ^ Z5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X8 ^ Y8, Y3 ^ X7 ^ Z8, X3 ^ Y7 ^ Z7, Z4 ^ X6 ^ Y6, Y4 ^ X5 ^ Z6, X4 ^ Y5 ^ Z5 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X8 ^ Y8, Y3 ^ X7 ^ Z7, X3 ^ Z6 ^ Y7, Z3 ^ X6 ^ Y6, Y4 ^ X5 ^ Z5, X4 ^ Z4 ^ Y5 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y7 ^ X8, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X7 ^ Y7, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4 }, -}; - -const UINT_64 SW_64K_S3_T[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, -}; - -const UINT_64 SW_64K_D3_X[][16]= -{ - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X3, Z3, Y2, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X2, Z3, Y2, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X2, Z2, Y2, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X2, Z2, Y1, X3, Z3, Y2, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1, Z2, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X4 ^ Y4, Z3, Y2, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X4 ^ Y4, Z3, Y2, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X4 ^ Y4, Z2, Y2, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X4 ^ Y4, Z2, Y1, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X4 ^ Y4, X5 ^ Y5, Z3, Y2, X3, Z4, Y4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X4 ^ Y4, Z3 ^ X5 ^ Y5, Y2, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, Y2, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, Y1, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ X5 ^ Y5, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Z3, Y3, X4, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Z3 ^ X6 ^ Y6, Y3, X4, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Z3 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y3, X4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, X3 ^ Y3, Z3, Y2, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, X3 ^ Y3, Z3, Y2, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, X3 ^ Y3, Z2, Y2, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, X3 ^ Y3, Z2, Y1, X3, Z3, Y2, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, X3 ^ Y3, Z2, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, X3 ^ Y3, X4 ^ Y4, Y2, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, X3 ^ Y3, X4 ^ Y4, Y2, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, X3 ^ Y3, X4 ^ Y4, Y2, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, X3 ^ Y3, X4 ^ Y4, Y1, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, X3 ^ Y3, X1 ^ X4 ^ Y4, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, X3 ^ Y3, X4 ^ Y4, X5 ^ Y5, Y2, X3, Z4, Y4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X3 ^ Y3, X4 ^ Y4, Z3 ^ X5 ^ Y5, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ X5 ^ Y5, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Y3, X4, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Z3 ^ X6 ^ Y6, X4, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Z3 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, X4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Z3, X3 ^ Y3, Y2, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Z3, X3 ^ Y3, Y2, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Z2, X3 ^ Y3, Y2, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Z2, X3 ^ Y3, Y1, X3, Z3, Y2, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Z2, X3 ^ Y3, Y1, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, Y2, X3 ^ Y3, X4 ^ Y4, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, Y2, X3 ^ Y3, X4 ^ Y4, X2, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, Y2, X3 ^ Y3, X4 ^ Y4, X2, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, Y1, X3 ^ Y3, X4 ^ Y4, X2, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, Y1, X3 ^ Y3, X1 ^ X4 ^ Y4, X2, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, Y2, X3 ^ Y3, X4 ^ Y4, X5 ^ Y5, X3, Z4, Y4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, X3 ^ Y3, X4 ^ Y4, Z3 ^ X5 ^ Y5, Z4, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, Z3, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, X3 ^ Y3, X4 ^ Y4, Z2 ^ X5 ^ Y5, Z3, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X2, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ X5 ^ Y5, Z3, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, Y3, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, X4, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Z4, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y3, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Z3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y3, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Z3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y2, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Z3, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y2, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, X4, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Z3 ^ X6 ^ Y6, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, Y3, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, Y3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3, Y2, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3, Y2, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, X4, X3 ^ Y3, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Z3 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, X4, X3 ^ Y3, X4 ^ Y4, Z3 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, X4, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y2 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, X3, X3 ^ Y3, X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, X3, X3 ^ Y3, X1 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, -}; - -// Meta data swizzle pattern -const UINT_64 HTILE_64K[][18]= -{ - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Z0 ^ X3 ^ Y3, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Z0 ^ X3 ^ Y3, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Z0 ^ X3 ^ Y3, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Z0 ^ X3 ^ Y3, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, -}; - -const UINT_64 CMASK_64K[][17]= -{ - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, 0, 0, 0, 0 }, - {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7 }, -}; - -const UINT_64 DCC_64K_R_X[][17]= -{ - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4 ^ Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X4, X5, Y5, X6, Z0 ^ X3 ^ Y3, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, Y9, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X5, Y5, X6, Y6, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y9, 0, 0, 0, 0 }, - {0, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, Y5, X6, Y6, X7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X6, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X6, Y6, X7, Y7, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y4, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X2, X3, Y4, Y6, X7, Y7, Y2, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, Y2, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, X2, Y2, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4 ^ Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, Y9, 0, 0, 0, 0 }, - {0, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X4, X5, Y5, X6, Y6, Z0 ^ X3 ^ Y3, X7, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y9, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X6, Y6, X7, Y7, X8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y2, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X2, X3, Y4, Y6, X7, Y7, Y2, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, Y2, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y4, X7, Y7, X8, Y8, X2, Y2, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4 ^ Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, X3 ^ Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z0 ^ X3 ^ Y3, Y9, 0, 0, 0, 0 }, - {0, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X3 ^ Y3, X9, 0, 0, 0, 0 }, - {0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X3 ^ Y3, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X3 ^ Y3, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0 ^ X3 ^ Y3, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1 ^ X3 ^ Y3, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X2, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2 ^ X3 ^ Y3, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y2, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, X2, Y2, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, Y2, X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, X2, X3, Y4, Y6, X7, Y7, Y2, X8, Y8, X9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3 ^ Y3 ^ Z5, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y4, X7, Y7, X8, Y8, Y2, X9, Y9, X10, X3 ^ Y3 ^ Z4, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y4, X7, Y7, X8, Y8, X2, Y2, X9, Y9, X3 ^ Y3 ^ Z3, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -// Rb plus color data swizzle pattern -const UINT_64 SW_256_S_RBPLUS[][8]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2 }, - {0, 0, 0, X0, Y0, Y1, X1, X2 }, - {0, 0, 0, 0, Y0, Y1, X0, X1 }, -}; - -const UINT_64 SW_256_D_RBPLUS[][8]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2 }, - {0, 0, 0, X0, Y0, X1, X2, Y1 }, - {0, 0, 0, 0, X0, Y0, X1, Y1 }, -}; - -const UINT_64 SW_4K_S_RBPLUS[][12]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, -}; - -const UINT_64 SW_4K_D_RBPLUS[][12]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, -}; - -const UINT_64 SW_4K_S_X_RBPLUS[][12]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z0 ^ Y2 ^ X3, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z0 ^ X2 ^ Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, Z3 ^ X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z3 ^ X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Z3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Z3 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z3 ^ Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z3 ^ Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Y3 ^ Z3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Y3 ^ Z3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, Z3 ^ X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z3 ^ X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Z3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, -}; - -const UINT_64 SW_4K_D_X_RBPLUS[][12]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ Y2 ^ X3, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X2 ^ Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, Z3 ^ X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z3 ^ X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Z3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Z3 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z3 ^ Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z3 ^ Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Y3 ^ Z3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Y3 ^ Z3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, Z3 ^ X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z3 ^ X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Z3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Z3 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z0 ^ X3 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X3 ^ Y3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z0 ^ X2 ^ Y3, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z2 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, Z2 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, Z2 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, Z2 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Z2 ^ Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z2 ^ Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Z2 ^ Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Z2 ^ Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Z2 ^ Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z0 ^ X4 ^ Y4, X4 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z1 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z1 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z1 ^ X4 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z1 ^ X4 ^ Y4 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z1 ^ X3 ^ Y4 }, -}; - -const UINT_64 SW_64K_S_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_D_RBPLUS[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_S_T_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_D_T_RBPLUS[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X4, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X3, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y4, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y4, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6 ^ X7, X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X7, X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5 ^ X6, X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4 ^ X6, X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4 ^ X5, X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, -}; - -const UINT_64 SW_64K_S_X_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z0 ^ Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z0 ^ X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X8, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X7, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X7, X3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X6, X2 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X8, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X8, X3 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X7, X2 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ Z4 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ Z4 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2 ^ X8, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2 ^ X7, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y8, Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y7, Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y7, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y6, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y6, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y9, Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y8, Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y8, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y7, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y7, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^Y10, Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y9, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y8, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y8, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4 ^Y10, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y9, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3 ^ Y9, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3 ^ Y8, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2 ^ Y8, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X8, X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X8, X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X9, X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X9, X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X8, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X8, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X7, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^X10, X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X9, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X9, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X8, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X4, Y5 ^X10, Z4 ^ X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, Z4 ^ X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, Y2, X2, Y3, X3, Y4 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Y1, X1, X2, Y2, X3, Y3 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, Y0, Y1, X0, X1, Y2, X2, Y3 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, -}; - -const UINT_64 SW_64K_D_X_RBPLUS[][16]= -{ - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ Y2 ^ X3, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X2 ^ Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z1 ^ Y2 ^ X4, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z1 ^ Y2 ^ X3, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z2 ^ X5, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z2 ^ X4, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X8, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X8, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X7, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X7, X3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X6, X2 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z3 ^ X6, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z3 ^ X5, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X8, X3 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X7, X2 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ Z4 ^ X7, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ Z4 ^ X6, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2 ^ X8, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2 ^ X7, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X3 ^ Y3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z0 ^ X2 ^ Y3, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z1 ^ X3 ^ Y4, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, Z1 ^ X2 ^ Y4, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y8, Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y7, Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y7, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y6, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y6, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z2 ^ X3 ^ Y5, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z2 ^ Y5, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y9, Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y8, Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y8, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y7, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y7, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z3 ^ Y6, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^Y10, Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y9, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y8, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y8, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Z4 ^ Y7, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4 ^Y10, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4 ^ Y9, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3 ^ Y9, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3 ^ Y8, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2 ^ Y8, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z0 ^ X5 ^ Y5, X5, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z0 ^ Y4 ^ X5, X5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z0 ^ X4 ^ Y4, X4, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ Y3 ^ X4, X4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z0 ^ X3 ^ Y3, X3, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z1 ^ Y3 ^ X5, Z0 ^ X4 ^ Y4, Y4, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z1 ^ Y3 ^ X4, Z0 ^ X3 ^ Y4, Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X8, X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X8, X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X7, X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X7, X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X6, X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z2 ^ Y3 ^ X6, Z1 ^ X4 ^ Y5, Z0 ^ Y4 ^ X5, X5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Z2 ^ Y3 ^ X5, Z1 ^ X3 ^ Y5, Z0 ^ X4 ^ Y4, X4, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X9, X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X9, X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X8, X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X8, X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X7, X3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6, Y6, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ Z3 ^ X7, Z2 ^ X4 ^ Y6, Z1 ^ Y4 ^ X6, Z0 ^ X5 ^ Y5, Y5, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ Z3 ^ X6, Z2 ^ X3 ^ Y6, Z1 ^ Y4 ^ X5, Z0 ^ X4 ^ Y5, Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^X10, X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X9, X4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X8, X3 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Z4 ^ Y5 ^ X9, Z3 ^ X5 ^ Y9, Z2 ^ Y6 ^ X8, Z1 ^ X6 ^ Y8, Z0 ^ X7 ^ Y7, X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^ Z4 ^ X9, Z3 ^ X5 ^ Y8, Z2 ^ Y5 ^ X8, Z1 ^ X6 ^ Y7, Z0 ^ Y6 ^ X7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ Z4 ^ X8, Z3 ^ X4 ^ Y8, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ Z4 ^ X8, Z3 ^ X4 ^ Y7, Z2 ^ Y4 ^ X7, Z1 ^ X5 ^ Y6, Z0 ^ Y5 ^ X6, X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ Z4 ^ X7, X3 ^ Z3 ^ Y7, Z2 ^ Y4 ^ X6, Z1 ^ X4 ^ Y6, Z0 ^ X5 ^ Y5, X5 }, - {X0, X1, X2, Y1, Y0, Y2, X3, Y3, Y4, X4, Y5 ^X10, Z4 ^ X5 ^Y10, Z3 ^ Y6 ^ X9, Z2 ^ X6 ^ Y9, Z1 ^ Y7 ^ X8, Z0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, Z4 ^ X5 ^ Y9, Z3 ^ Y5 ^ X9, Z2 ^ X6 ^ Y8, Z1 ^ Y6 ^ X8, Z0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y3, X3, Y4 ^ X9, X4 ^ Z4 ^ Y9, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X8, Z2 ^ X5 ^ Y7, Z1 ^ Y5 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y2, X2, Y3 ^ X8, X3 ^ Z4 ^ Y8, Z3 ^ Y4 ^ X7, Z2 ^ X4 ^ Y7, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6 }, -}; - -const UINT_64 SW_64K_R_X_1xaa_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y2, X3, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X2, Y2, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X2, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y2, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X2, Y2, X3 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, Y6, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, X6, X7 ^ Y7, Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y2, X3, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6, X7, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y3, Y6, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, X3, Y3, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y2, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, X2, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X6, X7, Y7 ^ X8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3, X6, X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X3, Y3, X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y2, X3, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X2, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, X7, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3, X6, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y2, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, X7, Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, Y3, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y2, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, X2, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, X7, Y7, Z0 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Y3, X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, X3, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7, Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y2, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y2, X3 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y6, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, X7 ^ Y7, Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y2, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, X7, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y3, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X7, Y7 ^ X8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X6, X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3, X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X3, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Y7, Z0 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X5, Z0 ^ X4 ^ Y4, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, X6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X7 ^ Y7, Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, X7, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y7 ^ X8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X7, Y7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_2xaa_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5 ^X10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3, Y5 ^ X8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y2, X3, Y3 ^ X8, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S0 ^ X6 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6, X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, X6, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, X2, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, X6, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X2, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X6, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6, Y7, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y3, Y6, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, X3, Y3, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, X2, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X6, Y7, S0 ^ X7 ^ Y8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3, X6, S0 ^ X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X3, Y3, S0 ^ X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X2, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, Y7, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3, X6, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, Y7, S0 ^ X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, Y3, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y2, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, X2, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y7, S0 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y3, S0 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y7, S0 ^ X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, Y3, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y4, X5, Y5 ^X10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5 ^ X8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3 ^ X8, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S0 ^ X6 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6, X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y7, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y3, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y7, S0 ^ X7 ^ Y8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X6, S0 ^ X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3, S0 ^ X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y7, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, S0 ^ X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, S0 ^ X7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X5, Z0 ^ X4 ^ Y4, Y5 ^X10, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y6 ^ X9, X6 ^ Y9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, X6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5 ^ X8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3 ^ X8, Y5 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S0 ^ X6 ^ Y6, Y3 ^ X6, X3 ^ Y6, X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6 ^ X9, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6 ^ Y8, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X7, X5 ^ Y7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y7 ^ X8, S0 ^ X7 ^ Y8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y6 ^ X8, S0 ^ X7 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y6 ^ X7, S0 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y7 ^ X8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6 ^ X8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y7, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, S0 ^ X7 ^ Y8, Z0 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, S0 ^ X7 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, S0 ^ X6 ^ Y7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, S0 ^ X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, S0 ^ X9 ^ Y9 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, S0 ^ X7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y7, S0 ^ X7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, Z4 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Z0 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_4xaa_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, S0 ^ Y5 ^ X7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, X5 ^ Y7, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5, X6 ^ Y8, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y2, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, X6, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, Y2, X3, Y3 ^ X7, S0 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, X2, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, X6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, X2, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y6, S0 ^ X7, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, Y3, S0 ^ X7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, X3, Y3, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, X2, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X6, S0 ^ X7, S1 ^ Y7 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3, S0 ^ X7, S1 ^ Y6 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X2, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, X6, S0 ^ X7, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, Y3, S0 ^ X7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, S0 ^ X7, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ X9 ^ Y9, Y3, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y2, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, X2, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, S0 ^ X7, S1 ^ Y7, Z3 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Y3, S0 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, S0 ^ X7, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, Y3, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, S0 ^ Y5 ^ X7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, X6 ^ Y8, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, X6, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, X3, Y3 ^ X7, S0 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S0 ^ X7, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S0 ^ X7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, Y3, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X7, S1 ^ Y7 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X7, S1 ^ Y6 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S0 ^ X7, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S0 ^ X7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, X6 ^ X9 ^ Y9, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ X9 ^ Y9, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, S1 ^ Y7, Z3 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, S0 ^ Y5 ^ X7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X5, Z0 ^ X4 ^ Y4, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y9, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y8, Y5 ^ X9, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, X6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, X6 ^ Y8, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5 ^ X8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6 ^ X9, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6 ^ Y8, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X8, S0 ^ X6 ^ Y6, S1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S0 ^ Y6 ^ X8, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, Y3 ^ X7, S0 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8, Z0 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y3 ^ X7, S0 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X7 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X7 ^ Y7, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, S0 ^ X7, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ X8 ^ Y8, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ Y7 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ Y6 ^ X8, Z3 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X6, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ Y7 ^ X8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ Y6 ^ X8 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, S1 ^ Y7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, X6 ^ X9 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Z3 ^ X6 ^ Y6, S1 ^ X9 ^ Y9 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, S1 ^ Y7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, S1 ^ Y7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Z4 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_R_X_8xaa_RBPLUS[][16]= -{ - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, S0 ^ X4 ^ Y7, S1 ^ Y5 ^ X6, S2 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5 ^X10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, Y2, X3 ^ Y7, Y3 ^ X7, S0 ^ X4 ^ Y6, S1 ^ Y4 ^ X6, S2 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X4 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y3, Y5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, X3, Y3, Y5 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, Y2, X3, Y3 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, X2, Y2, Y3 ^ X5, X3 ^ Y6, S0 ^ X4 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S2 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, Y2, X3, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y6, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, Y3, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S2 ^ X7 ^ Y7, X3, Y3, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, Y2, X3, Y3 ^ X6, S0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, X2, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y6, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z2 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, S2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, S1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, X2, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y6, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y3, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, Y2, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, Y6, S0 ^ X6, S1 ^ X7 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, Y3, S0 ^ X6, S1 ^ Y6 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, X3, Y3, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, X2, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S0 ^ X6, S1 ^ Y7, S2 ^ X7 ^ Y8, Z2 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3, S0 ^ X6, S1 ^ X7 ^ Y7, S2 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, X3, Y3, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, Y2, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X2, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S0 ^ X6, S1 ^ Y7, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, Y3, S0 ^ X6, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S2 ^ X9 ^ Y9, S0 ^ X6, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S1 ^ X9 ^ Y9, Y3, S0 ^ X6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y2, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, X2, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S0 ^ X7, S1 ^ Y7, S2 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, Y3, S0 ^ X7, S1 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, X3, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, S0 ^ X7, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7, Y3, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, S0 ^ X4 ^ Y7, S1 ^ Y5 ^ X6, S2 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, Z0 ^ X4 ^ Y4, X5, Y5 ^X10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, X3 ^ Y7, Y3 ^ X7, S0 ^ X4 ^ Y6, S1 ^ Y4 ^ X6, S2 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X4 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, Y5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, Y3, Y5 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, X3, Y3 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, Y2, Y3 ^ X5, X3 ^ Y6, S0 ^ X4 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X3, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S2 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, X3, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S2 ^ X7 ^ Y7, Y3, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, X3, Y3 ^ X6, S0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y2, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z2 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, S2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6, S1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y2, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S0 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S0 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, Y3, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, X3, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, S0 ^ X6, S1 ^ X7 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, S0 ^ X6, S1 ^ Y6 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, Y3, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y2, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S1 ^ Y7, S2 ^ X7 ^ Y8, Z2 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S0 ^ X6, S1 ^ X7 ^ Y7, S2 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, Y3, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, X3, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S1 ^ Y7, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S0 ^ X6, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S2 ^ X9 ^ Y9, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S1 ^ X9 ^ Y9, S0 ^ X6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S1 ^ Y7, S2 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S0 ^ X7, S1 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, Y3, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, S1 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7, S0 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X4, Y4, Y5 ^X10, X5 ^Y10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, S0 ^ X4 ^ Y7, S1 ^ Y5 ^ X6, S2 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y4, X5, Z0 ^ X4 ^ Y4, Y5 ^X10, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5 ^ Y9, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, Y4 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3 ^ Y7, Y3 ^ X7, S0 ^ X4 ^ Y6, S1 ^ Y4 ^ X6, S2 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y6 ^ X9, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X9, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X8, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, S0 ^ X5 ^ Y7, S1 ^ Y5 ^ X7, S2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, Y3 ^ X6, X3 ^ Y7, S0 ^ X4 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y5, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ X6 ^ Y6, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, Y5 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, S2 ^ X6 ^ Y6, Y3 ^ X7, S0 ^ X5 ^ Y6, S1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, S2 ^ X4 ^ Y4, S1 ^ X6 ^ Y6, Y3 ^ X5, X3 ^ Y6, S0 ^ X4 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z0 ^ X5 ^ Y5 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S2 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6, S1 ^ X5 ^ Y5 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, X5, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X6 ^ Y9, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X6 ^ Y8, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S0 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, Y3 ^ X7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, Y3 ^ X6, X3 ^ Y7, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, S0 ^ X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ X7 ^ Y7, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, S2 ^ X7 ^ Y7, S0 ^ X5 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, S2 ^ X5 ^ Y5, S1 ^ X7 ^ Y7, Y3 ^ X6, S0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, S2 ^ X4 ^ Y4, S1 ^ X5 ^ Y5, S0 ^ X7 ^ Y7, Y3 ^ X5, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8, Z2 ^ X5 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7, Z2 ^ X5 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7, S2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, Y3 ^ X7, S0 ^ X6 ^ Y6, S1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, Y3 ^ X6, X3 ^ Y6, S0 ^ X5 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S1 ^ Y7 ^ X8, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S1 ^ Y6 ^ X8, S2 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, S0 ^ X6 ^ Y7, S1 ^ Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, Y3 ^ X7, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, Y3 ^ X6, X3 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, Y6, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, S1 ^ X7 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, S2 ^ X8 ^ Y8, S1 ^ Y6 ^ X7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, S2 ^ X5 ^ Y6, S1 ^ X8 ^ Y8, S0 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X6, S1 ^ X5 ^ Y6, S0 ^ X8 ^ Y8, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X6, S0 ^ X5 ^ Y6, X3 ^ X8 ^ Y8, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, S1 ^ Y7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, S2 ^ X7 ^ Y8, Z2 ^ X6 ^ Y6 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S1 ^ X7 ^ Y7, S2 ^ X6 ^ Y6 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S0 ^ X6 ^ Y7, S1 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, Y3 ^ X6, S0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, S1 ^ Y7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S2 ^ X7 ^ Y8 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S1 ^ X7 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, Y3 ^ X6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X6, S1 ^ Y7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z2 ^ X6 ^ Y6, S2 ^ X9 ^ Y9 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, S2 ^ X6 ^ Y6, S1 ^ X9 ^ Y9 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, S2 ^ X5 ^ Y7, S1 ^ X6 ^ Y6, S0 ^ X9 ^ Y9 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, Y2, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X7, S1 ^ X5 ^ Y7, S0 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Y4 ^ X8 ^ Y8, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X7, S0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ X9 ^ Y9 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, S1 ^ Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, X1, X2, X3, Y0, Y1, Y2, Y3, S0 ^ X7, S1 ^ Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Z2 ^ Y6 ^ X7, S2 ^ X6 ^ Y7 }, - {0, X0, X1, X2, Y0, Y1, Y2, X3, Y3, S0 ^ X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, S2 ^ Y6 ^ X7, S1 ^ X6 ^ Y7 }, - {0, 0, X0, X1, Y0, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, S2 ^ X5 ^ Y8, S1 ^ Y6 ^ X7, S0 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, X2, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, S2 ^ Y5 ^ X8, S1 ^ X5 ^ Y8, S0 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, S2 ^ X4 ^ Y4, S1 ^ Y5 ^ X8, S0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_1xaa_RBPLUS[][16]= -{ - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, Y5, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y2, X3, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X2, Y2, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X2, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y2, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X2, Y2, X3 ^ Y6, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6, Y6, X7 ^ Y8, Y7 ^ X8, X5 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6, X7 ^ Y7, Y6 ^ X8, X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y2, X3, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X2, Y2, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y6, X7, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, Y2, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, X2, Y2, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6, X7, Y7 ^ X8, X6 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X6, X7 ^ Y7, X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y2, X3, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X2, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, X7, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, X6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y2, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X7, Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y3, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, Y2, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8, X2, Y3 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X7, Y7, X6 ^ Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, X7, X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X3, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X7, Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y3, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X3, Y3 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y5, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y2, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y2, X3 ^ Y6, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y6, X7 ^ Y8, Y7 ^ X8, X5 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6, X7 ^ Y7, Y6 ^ X8, X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y2, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2, Y3 ^ X6, X3 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X7, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, Y2, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X7, Y7 ^ X8, X6 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6, X7 ^ Y7, X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, X3, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X7, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y3 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8, Y3 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y7, X6 ^ Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X7, X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y3 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4, X5 ^Y10, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, X5 ^ Y9, Y5 ^ X9, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, X5, Z0 ^ X4 ^ Y4, Y5 ^X10, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, X5 ^ Y9, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y9, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X6 ^ Y8, Y5 ^ X9, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, X5 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y5, X6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X6 ^ Y8, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3 ^ X8, X6 ^ Y6, Y5 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3 ^ X7, X3 ^ Y6, Y5 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8, Z0 ^ X5 ^ Y5 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6 ^ X9, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X6 ^ Y8, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X7, X5 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3 ^ Y7, Y3 ^ X6, X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y6 ^ X8, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3 ^ Y6, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X7 ^ Y8, Y7 ^ X8, X5 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X7 ^ Y7, Y6 ^ X8, X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X7, X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X6, X3 ^ Y6, Z0 ^ X5 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X7 ^ Y8, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X7 ^ Y7, Y6 ^ X8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X7, X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X6, X3 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y6, X7, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y6 ^ X7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, X7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y7 ^ X8, X6 ^ Y6 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X7 ^ Y7, X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Y3 ^ X7, Z0 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3 ^ X6, Y2 ^ X6 ^ Y6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, X7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y7 ^ X8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X7 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, Y3 ^ X7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, Y2, Y3, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X2, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, 0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_2xaa_RBPLUS[][16]= -{ - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X2, X3, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5, Y5 ^ X9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, X3, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X2, X3, Y3 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, X3, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X2, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, X3, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X5, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X5 ^ Y8, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X5 ^ Y8, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, Y2, X3, Y3 ^ X6, X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, X2, Y2, Y3 ^ X5, X3 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6, Y6 ^ X8, X7 ^ Y7, X5 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, X6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X2, X3, Y3 ^ X6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X6, X7 ^ Y7, X6 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, X6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y3, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X3, Y3 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, Y7, X6 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X3, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y3, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X3, Y3 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, S0, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5, Y5 ^ X9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X5, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X5 ^ Y8, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X5 ^ Y8, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, X3, Y3 ^ X6, X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, Y2, Y3 ^ X5, X3 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6, Y6 ^ X8, X7 ^ Y7, X5 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6, Y6 ^ X8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3 ^ X6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6, X7 ^ Y7, X6 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y3 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y7, X6 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y3 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4, Y4 ^X10, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, X4 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4, Z0 ^ X4 ^ Y4, Y5 ^ X9, X5 ^ Y9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, Y4 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, Y4 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X9, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6, Z0 ^ X5 ^ Y5 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y6 ^ X8, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X6, X3 ^ Y7, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X5, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6 ^ Y8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X5 ^ Y8, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X5 ^ Y8, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, Y3 ^ X6, X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, Y3 ^ X5, X3 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y6 ^ X8, X7 ^ Y7, X5 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Y3 ^ X7, Y2 ^ X6 ^ Y6, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3 ^ X7, Y2 ^ X6 ^ Y6, Y1 ^ X5 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y6 ^ X8, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y3 ^ X7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3 ^ X6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X7 ^ Y7, X6 ^ Y6 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y7, X6 ^ Y6 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X6, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X7 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y7, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y7, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, S0, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, 0, S0, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_4xaa_RBPLUS[][16]= -{ - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, X2 ^ X6 ^ Y6, Y1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y4 ^ X9, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, X3, Y3 ^ X8, X5 ^ Y7, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, Y2, Y3 ^ X7, X3 ^ Y7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y2, X3, Y3 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X2, Y2, Y3 ^ X6, X3 ^ Y6, X1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X2, X3, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7, Y1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X2, X3, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X2, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, X2, X3, Y3 ^ X7, Y2 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X1 ^ X5 ^ Y6, X2, X3, Y3 ^ X6, Y2 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, Y2 ^ Y6 ^ X7, X6 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X3, Y3 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X3, Y3, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X6 ^ Y7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X3, Y3 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X6 ^ Y7, X3, Y3 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, S0, S1, X0, Y0, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, X2 ^ X6 ^ Y6, Y1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y4 ^ X9, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, X5 ^ Y7, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2, Y3 ^ X7, X3 ^ Y7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y2, Y3 ^ X6, X3 ^ Y6, X1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7, Y1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X3, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, X3, Y3 ^ X7, Y2 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X1 ^ X5 ^ Y6, X3, Y3 ^ X6, Y2 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, Y2 ^ Y6 ^ X7, X6 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y3 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, Y3 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X6 ^ Y7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y3 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X6 ^ Y7, Y3 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X5 ^ Y7, X2 ^ X6 ^ Y6, Y1 ^ Y5 ^ X7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, X3 ^ Y8, Y3 ^ X8, X4 ^ Y7, Y4 ^ X7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y4 ^ X9, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, X5 ^ Y7, Y4 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, Z0 ^ X4 ^ Y4, Y3 ^ X8, X3 ^ Y7, Y4 ^ X7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, X5 ^ Y7, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X7, X3 ^ Y7, X1 ^ Y5 ^ X6, Y1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y5 ^ X8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X6 ^ Y7, Y5 ^ X8, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, Y2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3 ^ X8, X2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, Y2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y3 ^ X6, X3 ^ Y6, X1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7, Y1 ^ X5 ^ Y5 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3 ^ X8, Y2 ^ Y5 ^ X7, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y3 ^ X7, Y2 ^ X6 ^ Y6, X1 ^ X5 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y6, Y3 ^ X7, Y2 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X2, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X1 ^ X5 ^ Y6, Y3 ^ X6, Y2 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6 ^ Y7, Y6 ^ X7, X5 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X6 ^ Y7, Y2 ^ Y6 ^ X7, X5 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y7, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y6 ^ X7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ Y6 ^ X7, X6 ^ Y6 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X6 ^ Y7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {S0, S1, X0, Y0, X1, Y1, X2, Y2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {0, S0, S1, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X6 ^ Y7 }, - {0, 0, S0, S1, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, X0, Y0, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_64K_Z_X_8xaa_RBPLUS[][16]= -{ - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X1 ^ X5 ^ Y7, Y1 ^ Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, Y0 ^ X4 ^ Y7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Z0 ^ X4 ^ Y4, X3, Y3, Y4 ^ X9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Z0 ^ X4 ^ Y4, X2, X3, Y3 ^ X9, Y4 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Z0 ^ X4 ^ Y4, X2, Y2, X3 ^ Y7, Y3 ^ X7, Y0 ^ X4 ^ Y6, X1 ^ Y4 ^ X6, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X2, X3, Y3 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, X2, X3, Y3 ^ X7, Y1 ^ X4 ^ Y7, Y2 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, X5 ^ Y8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X3, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X2, X3, Y3 ^ X7, Y1 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, X2, X3, Y3 ^ X6, Y1 ^ X4 ^ Y6, Y2 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X3, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X3, Y3, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, X3, Y3, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X3, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X3, Y3, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X3, Y3, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, X3, Y3, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, X3, Y3, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, X3, Y3, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, Y2 ^ X5 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X3, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, X3, Y3, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, X3, Y3, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X3, Y3, Y1 ^ X6 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X3, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, X3, Y3 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y1 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, X3, Y3 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X3, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, X3, Y3 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, S0, S1, S2, X0, Y0, X1, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {0, 0, 0, 0, S0, S1, S2, X0, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3, Y3 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X1 ^ X5 ^ Y7, Y1 ^ Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, Y0 ^ X4 ^ Y7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Z0 ^ X4 ^ Y4, Y3, Y4 ^ X9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Z0 ^ X4 ^ Y4, X3, Y3 ^ X9, Y4 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Z0 ^ X4 ^ Y4, Y2, X3 ^ Y7, Y3 ^ X7, Y0 ^ X4 ^ Y6, X1 ^ Y4 ^ X6, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X3, Y3 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, X3, Y3 ^ X7, Y1 ^ X4 ^ Y7, Y2 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, X5 ^ Y8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, X3, Y3 ^ X7, Y1 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, X3, Y3 ^ X6, Y1 ^ X4 ^ Y6, Y2 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y3, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y3, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y3, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y3, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y3, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y3, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, Y3, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, Y3, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, Y3, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, Y2 ^ X5 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y3, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, Y3, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, Y3, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, Y3, Y1 ^ X6 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y3, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8, Y3 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y1 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8, Y3 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y3, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Y3 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, X4 ^ Y9, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X9, X4 ^ Y9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, Y2, X3, Y3 ^ X9, X4 ^ Y8, Y4 ^ X8, X1 ^ X5 ^ Y7, Y1 ^ Y5 ^ X7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Y3 ^ X8, X3 ^ Y8, Y4 ^ X7, Y0 ^ X4 ^ Y7, X1 ^ X5 ^ Y6, Y1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Z0 ^ X4 ^ Y4, Y4 ^ X9, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, X3, Z0 ^ X4 ^ Y4, Y3 ^ X9, Y4 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, Y2, Z0 ^ X4 ^ Y4, X3 ^ Y7, Y3 ^ X7, Y0 ^ X4 ^ Y6, X1 ^ Y4 ^ X6, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y8, Y5 ^ X8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y5 ^ X8, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y3 ^ X8, Y1 ^ X5 ^ Y7, X1 ^ Y5 ^ X7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, X3, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, Y3 ^ X7, Y1 ^ X4 ^ Y7, Y2 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X5 ^ Y8, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X2, X3, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y2 ^ X5 ^ Y5, Y3 ^ X7, Y1 ^ X5 ^ Y6, X1 ^ Y5 ^ X6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X2, X3, Y4 ^ X5 ^ Y5, Y0 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Y3 ^ X6, Y1 ^ X4 ^ Y6, Y2 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7, Z0 ^ X5 ^ Y5 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Z0 ^ X5 ^ Y5 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7, Y1 ^ X5 ^ Y5 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X1 ^ X5 ^ Y8, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X2 ^ X6 ^ Y7, Y2 ^ Y6 ^ X7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, Y1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X6 ^ Y6, Z0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, X2 ^ X5 ^ Y6, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X6 ^ Y6, Y0 ^ X4 ^ Y4, Y1 ^ X5 ^ Y5, Y2 ^ X5 ^ Y6, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X5 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7, Y2 ^ X5 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7, X1 ^ X5 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, Y1 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X7 ^ Y7, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X7 ^ Y7, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X6, X1 ^ X5 ^ Y6, X2 ^ X6 ^ Y6, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y7, X6 ^ Y6 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, Y1 ^ X6 ^ Y7, X2 ^ X6 ^ Y6 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, X2 ^ X6 ^ Y7, Y2 ^ X6 ^ Y6 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y2 ^ X6 ^ Y8 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y1 ^ X6 ^ Y8 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X8 ^ Y8, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X8 ^ Y8, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X7, X1 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2 ^ X6 ^ Y8 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y1 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {S0, S1, S2, X0, Y0, X1, Y1, X2, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, Y2 ^ X6 ^ Y7 }, - {0, S0, S1, S2, X0, Y0, X1, Y1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, S0, S1, S2, X0, Y0, X1, X3, Y3, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, Y1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, S0, S1, S2, X0, Y0, X3, Y3, Y4 ^ X9 ^ Y9, Z0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, S0, S1, S2, X0, X3, Y3, Y4 ^ X9 ^ Y9, Y0 ^ X4 ^ Y4, Y1 ^ Y5 ^ X8, X1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -const UINT_64 SW_4K_S3_RBPLUS[][12]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, -}; - -const UINT_64 SW_4K_S3_X_RBPLUS[][12]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3 }, -}; - -const UINT_64 SW_64K_S3_RBPLUS[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, -}; - -const UINT_64 SW_64K_S3_X_RBPLUS[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4 ^ Z4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4 ^ Z4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3 ^ Z4, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X6 ^ Z6, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X6 ^ Z6, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X5 ^ Z6, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X5 ^ Z5, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X5 ^ Z5, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X4 ^ Z5, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X6 ^ Z6, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X6 ^ Z6, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X5 ^ Z6, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X8 ^ Z8, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X7 ^ Z8, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X7 ^ Z7, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X7 ^ Z7, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X6 ^ Z7, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X6 ^ Z6, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X6 ^ Z6, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X5 ^ Z6, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X8 ^ Z8, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X7 ^ Z8, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X7 ^ Z7, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X7 ^ Z7, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X6 ^ Z7, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z4 ^ Y5, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y4 ^ Z4, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y4 ^ Z4, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z5 ^ Y6, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y5 ^ Z5, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y5 ^ Z5, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y8 ^ Z8, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y8 ^ Z8, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z7 ^ Y8, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y7 ^ Z7, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y7 ^ Z7, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y7 ^ Z7, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y7 ^ Z7, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z6 ^ Y7, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y6 ^ Z6, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y6 ^ Z6, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y8 ^ Z8, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y8 ^ Z8, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z7 ^ Y8, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y7 ^ Z7, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y7 ^ Z7, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X5 ^ Y5, Y3 ^ X4 ^ Z4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y4 ^ X5, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X4 ^ Y4, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y6 ^ X7, Y3 ^ X6 ^ Z6, X4 ^ Y5 ^ Z5, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X6 ^ Y6, Y3 ^ X5 ^ Z6, X3 ^ Y5 ^ Z5, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X6 ^ Y6, Y3 ^ X5 ^ Z5, X3 ^ Z4 ^ Y5, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y5 ^ X6, Y2 ^ X5 ^ Z5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X5 ^ Y5, Y2 ^ X4 ^ Z5, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y8 ^ X9, Y3 ^ X8 ^ Z8, X4 ^ Y7 ^ Z7, Z4 ^ Y6 ^ X7, Y4 ^ X6 ^ Z6, X5 ^ Y5 ^ Z5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X8 ^ Y8, Y3 ^ X7 ^ Z8, X3 ^ Y7 ^ Z7, Z4 ^ X6 ^ Y6, Y4 ^ X5 ^ Z6, X4 ^ Y5 ^ Z5 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X8 ^ Y8, Y3 ^ X7 ^ Z7, X3 ^ Z6 ^ Y7, Z3 ^ X6 ^ Y6, Y4 ^ X5 ^ Z5, X4 ^ Z4 ^ Y5 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y7 ^ X8, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X7 ^ Y7, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y7 ^ X8, Y3 ^ X7 ^ Z7, X4 ^ Y6 ^ Z6, Z4 ^ Y5 ^ X6, Y4 ^ X5 ^ Z5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X7 ^ Y7, Y3 ^ X6 ^ Z7, X3 ^ Y6 ^ Z6, Z4 ^ X5 ^ Y5, X4 ^ Y4 ^ Z5, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X7 ^ Y7, Y3 ^ X6 ^ Z6, X3 ^ Z5 ^ Y6, Z3 ^ X5 ^ Y5, X4 ^ Y4 ^ Z4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y6 ^ X7, Y2 ^ X6 ^ Z6, X3 ^ Y5 ^ Z5, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X6 ^ Y6, Y2 ^ X5 ^ Z6, X2 ^ Y5 ^ Z5, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y8 ^ X9, Y3 ^ X8 ^ Z8, X4 ^ Y7 ^ Z7, Z4 ^ Y6 ^ X7, Y4 ^ X6 ^ Z6, X5 ^ Y5 ^ Z5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X8 ^ Y8, Y3 ^ X7 ^ Z8, X3 ^ Y7 ^ Z7, Z4 ^ X6 ^ Y6, Y4 ^ X5 ^ Z6, X4 ^ Y5 ^ Z5 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X8 ^ Y8, Y3 ^ X7 ^ Z7, X3 ^ Z6 ^ Y7, Z3 ^ X6 ^ Y6, Y4 ^ X5 ^ Z5, X4 ^ Z4 ^ Y5 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y7 ^ X8, Y2 ^ X7 ^ Z7, X3 ^ Y6 ^ Z6, Z3 ^ Y5 ^ X6, Y3 ^ X5 ^ Z5, X4 ^ Y4 ^ Z4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X7 ^ Y7, Y2 ^ X6 ^ Z7, X2 ^ Y6 ^ Z6, Z3 ^ X5 ^ Y5, Y3 ^ X4 ^ Z5, X3 ^ Y4 ^ Z4 }, -}; - -const UINT_64 SW_64K_S3_T_RBPLUS[][16]= -{ - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, X2 ^ Y2 ^ Z2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X2 ^ Z2, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, X1 ^ Y1 ^ Z2, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X4 ^ Z4, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X3 ^ Z4, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X3 ^ Z3, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X3 ^ Z3, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X2 ^ Z3, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2 ^ X5, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2 ^ X4, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2 ^ X4, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1 ^ X4, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1 ^ X3, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y3 ^ Z3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y3 ^ Z3, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z2 ^ Y3, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y2 ^ Z2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y2 ^ Z2, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3 ^ Y4 ^ Z4, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2 ^ Y4 ^ Z4, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2 ^ Z3 ^ Y4, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2 ^ Y3 ^ Z3, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1 ^ Y3 ^ Z3, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X3 ^ Y3, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Y2 ^ Z2 ^ X3, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, X2 ^ Y2 ^ Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3 ^ Y4 ^ X5, Y3 ^ X4 ^ Z4, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3 ^ X4 ^ Y4, X3 ^ Y3 ^ Z4, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2 ^ X4 ^ Y4, X3 ^ Y3 ^ Z3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2 ^ Y3 ^ X4, Y2 ^ X3 ^ Z3, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2 ^ X3 ^ Y3, X2 ^ Y2 ^ Z3, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X5, X4 ^ Y4 ^ Z4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3 ^ X4, X3 ^ Y4 ^ Z4, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3 ^ X4, X3 ^ Z3 ^ Y4, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2 ^ X4, X3 ^ Y3 ^ Z3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2 ^ X3, X2 ^ Y3 ^ Z3, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Y4 ^ Z4 ^ X5, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, X4 ^ Y4 ^ Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3 ^ X4 ^ Y4, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Y3 ^ Z3 ^ X4, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, X3 ^ Y3 ^ Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4 ^ X5, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, X4 ^ Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, X4 ^ Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3 ^ X4, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, X3 ^ Y3, X3 }, - {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, Z0, Y1, X1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Z0, Y0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, Z0, Y0, X0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, -}; - -const UINT_64 SW_64K_D3_X_RBPLUS[][16]= -{ - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X4 ^ Y4, Y2, Z3, Y3, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X4 ^ Y4, Y2, Z3, Y3, X2, Z4, Y4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X4 ^ Y4, Y2, Z2, Y3, X2, Z3, Y4, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X4 ^ Y4, Y1, Z2, Y2, X2, Z3, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1 ^ X4 ^ Y4, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y2, Z3, Y3, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y2, Z3, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y2, Z2, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Y1, Z2, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, X1, Z2, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, Y2, Y3, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, Y2, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z2 ^ X5, Y2, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Z2 ^ X5, Y1, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, Z2 ^ X5, X1, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2, Y3, Z3, X3, Z4, X5 ^ Y5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Z3, Y3, X2, Z4, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X6 ^ Y6, X4 ^ Y4, Z2, Y3, X2, Z3, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Z2, Y2, X2, Y3, X3, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, Z2, Y2, X2, Y3, X3, X1 ^ X5 ^ Y5 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Y2, Y3, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z3, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z2, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, Z2, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Z2, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Z3 ^ X6, Y2, Y3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z3 ^ X6, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z2 ^ X6, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, Z2 ^ X6, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Z2 ^ X6, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3, Z3, X3, Z4, X5 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3, X2, Z4, X3, Z3 ^ X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3, X2, Z3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Y2, X2, Y3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Y2, X2, Y3, X3, Z2 ^ X5 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Y3, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Z3 ^ X7, Y3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z3 ^ X7, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z2 ^ X7, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Z3, X3, Z4, Y3 ^ X6 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, X2, Z4, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X2, Z3, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X2, Y3, X3, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X2, Y3, X3, Y2 ^ X6 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, Z3 ^ X8, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3 ^ X8, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, X3, Z4, Z3 ^ X6 ^ Y7 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, Z4, X3, X2 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, Z3, X3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Y3, X3, X2 ^ X6 ^ Y7 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X4 ^ Y4, Z3, Y3, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X4 ^ Y4, Z3, Y3, X2, Z4, Y4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X4 ^ Y4, Z2, Y3, X2, Z3, Y4, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X4 ^ Y4, Z2, Y2, X2, Z3, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Y1 ^ X4 ^ Y4, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3, Y3, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z2, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Z2, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, Z2, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, Y3, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z2 ^ X5, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Z2 ^ X5, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, Z2 ^ X5, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y3, Z3, X3, Z4, X5 ^ Y5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y3, X2, Z4, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y3, X2, Z3, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y2, X2, Y3, X3, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, Y2, X2, Y3, X3, X1 ^ X5 ^ Y5 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Y3, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Z3 ^ X6, Y3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z3 ^ X6, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z2 ^ X6, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, Z2 ^ X6, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Z2 ^ X6, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z3, X3, Z4, X5 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X2, Z4, X3, Z3 ^ X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X2, Z3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, X2, Y3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, X2, Y3, X3, Z2 ^ X5 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Z3 ^ X7, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z3 ^ X7, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z2 ^ X7, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, X3, Z4, Y3 ^ X6 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Z4, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Z3, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3, X3, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3, X3, Y2 ^ X6 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, Z3 ^ X8, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3 ^ X8, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, Z4, Z3 ^ X6 ^ Y7 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X3, X2 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X3, X2 ^ X6 ^ Y7 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, X3, Z3, Y3, X4, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, X2, Z3, Y3, X3, Z4, Y4, X4 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, X2, Z2, Y3, X3, Z3, Y4, X4 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, X2, Z2, Y2, X3, Z3, Y3, X4 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y1, X1, Z2, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Z3, X4 ^ Y4, Y3, X3, Z4, Y4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, Z3, X4 ^ Y4, Y3, X2, Z4, Y4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, Z2, X4 ^ Y4, Y3, X2, Z3, Y4, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, Z2, X4 ^ Y4, Y2, X2, Z3, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Z2, Y1 ^ X4 ^ Y4, Y2, X2, Z3, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Z3, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y3, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, Z3, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y3, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, Z2, Y4 ^ X5 ^ Y5, X4 ^ Y4, Y3, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, Z2, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Y2, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Z2, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, Y2, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y3, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, X3, Z4, X5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y2, Y3, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z3 ^ X5, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y2, Y3, Y4 ^ X5 ^ Y5, X4 ^ Y4, Z2 ^ X5, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y1, Y2, Z3 ^ Y4 ^ X5 ^ Y5,X4 ^ Y4, Z2 ^ X5, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X1, Y2, Z3 ^ Y4 ^ X5 ^ Y5,Y1 ^ X4 ^ Y4, Z2 ^ X5, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Z3, X3, Z4, X5 ^ Y5 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, X2, Z4, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, X2, Z3, X3, Y2 ^ X5 ^ Y5 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, Y2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, X2, Y3, X3, Y1 ^ X5 ^ Y5 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, Y2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X2, Y3, X3, X1 ^ X5 ^ Y5 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Z3, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z3, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, X2, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, X2, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Z2, Y2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, X2, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Z2, Y2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, X2, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y2, Y3, Y4 ^ X6 ^ Y6, X4 ^ Y4, X5 ^ Y5, Z3 ^ X6, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, X2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z3 ^ X6, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, X2, Y4 ^ X6 ^ Y6, X4 ^ Y4, Y2 ^ X5 ^ Y5, Z2 ^ X6, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, X2, Z3 ^ Y4 ^ X6 ^ Y6,X4 ^ Y4, Y1 ^ X5 ^ Y5, Z2 ^ X6, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, X2, Z3 ^ Y4 ^ X6 ^ Y6,Y1 ^ X4 ^ Y4, X1 ^ X5 ^ Y5, Z2 ^ X6, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, Z3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X3, Z4, X5 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z4, X3, Z3 ^ X5 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, X2, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Y3, X3, Z2 ^ X5 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, X2, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Y3, X3, Z2 ^ X5 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, Z3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, X3, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Y3, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z3 ^ X5 ^ Y6, Z4, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Y3, X2, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Z3, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y2, X2, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y3, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y2, X2, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Z2 ^ X5 ^ Y6, Y3, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Y3, X3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, X5 ^ Y6, Z3 ^ X7, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Z4, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z3 ^ X7, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Z3, Y4 ^ X7 ^ Y7, X4 ^ Y4, Y2 ^ Y5 ^ X6, Y3 ^ X5 ^ Y6, Z2 ^ X7, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Y3, Z3 ^ Y4 ^ X7 ^ Y7,X4 ^ Y4, Y1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Y3, Z3 ^ Y4 ^ X7 ^ Y7,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X6, Y2 ^ X5 ^ Y6, Z2 ^ X7, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, X3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Z4, Y3 ^ X6 ^ Y6 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Z4, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Z3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X3, Y3 ^ X6 ^ Y6 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Y3, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X3, Y2 ^ X6 ^ Y6 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Y3, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, X3, Y2 ^ X6 ^ Y6 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, Z3, X3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, Z4 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, X2, Z4, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z3 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, X3 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, X2, Z3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y3 ^ X6 ^ Y6, X3 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, X2, Y3, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, X2, Y3, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Z2 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Z4, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, X5 ^ Y7, Y3 ^ X6 ^ Y6, Z3 ^ X8 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, X3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z3 ^ X8 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, X3, Y4 ^ X8 ^ Y8, X4 ^ Y4, Y2 ^ Y5 ^ X7, Y3 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, X3, Z3 ^ Y4 ^ X8 ^ Y8,X4 ^ Y4, Y1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, X3, Z3 ^ Y4 ^ X8 ^ Y8,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X7, Y2 ^ X5 ^ Y7, X2 ^ X6 ^ Y6, Z2 ^ X8 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Z4, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, X3, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, X3, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, X3, Z4, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, X5 ^ Y8, Y3 ^ Y6 ^ X7, Z3 ^ X6 ^ Y7 }, - {0, X0, Z0, Y0, X1, Z1, Y1, Z2, Z4, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z3 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, X0, Y0, X1, Z0, Y1, Z1, Z3, X3, Y4 ^ X9 ^ Y9, X4 ^ Y4, Y2 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y3 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, X0, Y0, Z0, X1, Z1, Y3, X3, Z3 ^ Y4 ^ X9 ^ Y9,X4 ^ Y4, Y1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, - {0, 0, 0, 0, X0, Z0, Y0, Z1, Y3, X3, Z3 ^ Y4 ^ X9 ^ Y9,Y1 ^ X4 ^ Y4, X1 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, -}; - -// Rb plus meta data swizzle pattern -const UINT_64 HTILE_64K_RBPLUS[][18]= -{ - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y7, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X8, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y8, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, X10 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X8, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, Y9, X10 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y8, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X9, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, X10 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, Y8, 0, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, X10 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X9, 0, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, 0 }, - {0, 0, 0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, Y9, 0 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, - {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, X10 }, -}; - -const UINT_64 CMASK_64K_RBPLUS[][17]= -{ - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8, 0 }, - {X3, Y3, X6, X7, Y7, X8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0 }, - {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, X6 ^ Y6, 0 }, - {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X6, X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0 }, - {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, X6 ^ Y8 }, - {X3, Y3, X6, X7, Y7, X8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X7, X5 ^ Y7, X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7, 0 }, - {X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, Z1 ^ X4 ^ Y4, Z0 ^ Y5 ^ X8, X5 ^ Y8, Y6 ^ X7, X6 ^ Y7 }, - {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y6 ^ X7, Z0 ^ X6 ^ Y7 }, -}; - -const UINT_64 DCC_64K_R_X_RBPLUS[][17]= -{ - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, Y5, X6, Z0 ^ X4 ^ Y4, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, Y5, X6, Y6, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y9, 0, 0, 0, 0 }, - {0, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, Y7, 0, 0, 0, 0 }, - {0, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, X6, Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X7, Y7, 0, 0, 0, 0 }, - {0, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y9, 0, 0, 0, 0 }, - {0, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, X6, Y6, X7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, Y6, X7, Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X6, Y6, X7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, Y7, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {0, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0, 0 }, - {0, X2, Y2, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, 0, 0, 0 }, - {0, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {0, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, X7, Y7, X8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {0, X2, X3, Y3, Y4, X6, X7, Y2, Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0, 0, 0 }, - {0, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, X6, X7, Y7, X8, Y8, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X2, X3, Y3, X6, X7, Y7, Y2, X8, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0, 0 }, - {0, Y2, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, 0, 0 }, - {0, X2, Y3, X6, X7, Y7, X8, Y2, Y8, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8, 0, 0 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0, 0 }, - {0, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y2, Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, X2, Y2, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, Y2, X9, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X2, Y2, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, Z0 ^ X4 ^ Y4, X7, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y9, 0, 0, 0, 0 }, - {0, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, X6, Y6, X7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, Y7, 0, 0, 0, 0 }, - {0, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, X6, Y6, X7, Y7, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X6, Y6, X7, Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {0, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {0, Y2, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0, 0 }, - {0, X2, Y2, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, 0, 0 }, - {0, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, X7, Y7, X8, Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X2, X3, Y3, Y4, X6, X7, Y2, Y7, X8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0, 0 }, - {0, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y2, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X2, X3, Y3, X6, X7, Y7, Y2, X8, Y8, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8, 0 }, - {0, Y2, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8, 0 }, - {0, X2, Y3, X6, X7, Y7, X8, Y2, Y8, X9, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8, 0 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y2, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, X2, Y2, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, Y2, X9, Y9, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, X7, Y7, X8, Y8, X2, Y2, X9, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0 }, - {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0 }, - {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0 }, - {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z0 ^ X4 ^ Y4, Y9, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z0 ^ X4 ^ Y4, X9, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0 ^ X4 ^ Y4, Y8, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0 ^ X4 ^ Y4, X8, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0 ^ X4 ^ Y4, Y7, 0, 0, 0, 0 }, - {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, 0, 0, 0, 0 }, - {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {0, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {0, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4 ^ X5 ^ Y5, Z0 ^ X4 ^ Y4, X5 ^ Y5, 0, 0, 0 }, - {0, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, X9, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Y8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X5, X6, Y6, X7, Y7, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0, 0 }, - {0, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, Y2, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X6 ^ Y6, Z1 ^ X4 ^ Y4, Z0 ^ X5 ^ Y5, X5 ^ Y6, 0, 0 }, - {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X2, Y2, X3, Y3, Y4, X6, Y6, X7, Y7, X8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0, 0 }, - {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, 0, 0 }, - {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0 }, - {0, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0 }, - {0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0 }, - {0, Y2, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X6 ^ Y6, 0 }, - {0, X2, Y2, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4 ^ X7 ^ Y7, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X6, Z0 ^ X5 ^ Y6, X3 ^ Y6, 0 }, - {0, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, X10, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, Y2, X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0, 0 }, - {0, X2, X3, Y3, Y4, X6, X7, Y2, Y7, X8, Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0, 0 }, - {0, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, Y2, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, 0 }, - {0, X2, X3, Y3, X6, X7, Y7, Y2, X8, Y8, X9, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y11, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X6 ^ Y8 }, - {0, Y2, Y3, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4 ^ X8 ^ Y8, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X7, Z1 ^ X5 ^ Y7, Z0 ^ X6 ^ Y6, X3 ^ Y8 }, - {0, X2, Y3, X6, X7, Y7, X8, Y2, Y8, X9, Y9, Y4 ^ X8 ^ Y8, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X7, Z0 ^ X5 ^ Y7, Y2 ^ X6 ^ Y6, X3 ^ Y8 }, - {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, Y2, Y8, X9, Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7, 0 }, - {0, X3, Y3, Y4, X7, Y7, X8, X2, Y2, Y8, X9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7, 0 }, - {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y11, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X11, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4 ^ X9 ^ Y9, X4 ^ Y4 ^ Z4, Z3 ^ Y5 ^ X8, Z2 ^ X5 ^ Y8, Z1 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y3, X7, Y7, X8, Y8, Y2, X9, Y9, X10, Y4 ^ X9 ^ Y9, Z3 ^ X4 ^ Y4, Z2 ^ Y5 ^ X8, Z1 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, Z0 ^ X6 ^ Y7 }, - {0, X3, Y3, X7, Y7, X8, Y8, X2, Y2, X9, Y9, Y4 ^ X9 ^ Y9, Z2 ^ X4 ^ Y4, Z1 ^ Y5 ^ X8, Z0 ^ X5 ^ Y8, Y2 ^ Y6 ^ X7, X2 ^ X6 ^ Y7 }, + { 1, 0, 0, 0, 0, } , // 1 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 1 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 1 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 1 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 1 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 2 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 2 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 2 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 2 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 2 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 4 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 4 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 4 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 4 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 4 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 8 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 8 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 8 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 8 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 8 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 16 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 16 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 16 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 16 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 16 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 32 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 32 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 32 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 32 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 32 pipes 16 bpe @ SW_256_S @ Navi1x + { 1, 0, 0, 0, 0, } , // 64 pipes 1 bpe @ SW_256_S @ Navi1x + { 1, 1, 0, 0, 0, } , // 64 pipes 2 bpe @ SW_256_S @ Navi1x + { 1, 2, 0, 0, 0, } , // 64 pipes 4 bpe @ SW_256_S @ Navi1x + { 1, 3, 0, 0, 0, } , // 64 pipes 8 bpe @ SW_256_S @ Navi1x + { 1, 4, 0, 0, 0, } , // 64 pipes 16 bpe @ SW_256_S @ Navi1x +}; + +const ADDR_SW_PATINFO SW_256_D_PATINFO[] = +{ + { 1, 5, 0, 0, 0, } , // 1 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 1 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 1 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 1 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 1 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 2 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 2 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 2 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 2 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 2 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 4 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 4 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 4 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 4 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 4 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 8 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 8 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 8 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 8 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 8 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 16 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 16 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 16 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 16 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 16 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 32 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 32 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 32 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 32 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 32 pipes 16 bpe @ SW_256_D @ Navi1x + { 1, 5, 0, 0, 0, } , // 64 pipes 1 bpe @ SW_256_D @ Navi1x + { 1, 1, 0, 0, 0, } , // 64 pipes 2 bpe @ SW_256_D @ Navi1x + { 1, 2, 0, 0, 0, } , // 64 pipes 4 bpe @ SW_256_D @ Navi1x + { 1, 6, 0, 0, 0, } , // 64 pipes 8 bpe @ SW_256_D @ Navi1x + { 1, 7, 0, 0, 0, } , // 64 pipes 16 bpe @ SW_256_D @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_S_PATINFO[] = +{ + { 1, 0, 1, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_S @ Navi1x + { 1, 0, 1, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_S @ Navi1x + { 1, 1, 2, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_S @ Navi1x + { 1, 2, 3, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_S @ Navi1x + { 1, 3, 4, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_S @ Navi1x + { 1, 4, 5, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_S @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_D_PATINFO[] = +{ + { 1, 5, 1, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_D @ Navi1x + { 1, 5, 1, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_D @ Navi1x + { 1, 1, 2, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_D @ Navi1x + { 1, 2, 3, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_D @ Navi1x + { 1, 6, 4, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_D @ Navi1x + { 1, 7, 5, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_D @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_S_X_PATINFO[] = +{ + { 1, 0, 1, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 1, 1, 2, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 1, 2, 3, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 1, 3, 4, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 1, 4, 5, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 6, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 7, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 8, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 9, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 10, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 11, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 12, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 13, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 14, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 15, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 16, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 17, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 18, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 19, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 20, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 21, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 24, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 25, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 21, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 24, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 25, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_S_X @ Navi1x + { 3, 0, 21, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_S_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_S_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_S_X @ Navi1x + { 3, 3, 24, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_S_X @ Navi1x + { 3, 4, 25, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_S_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_D_X_PATINFO[] = +{ + { 1, 5, 1, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 1, 1, 2, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 1, 2, 3, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 1, 6, 4, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 1, 7, 5, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 6, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 7, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 8, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 9, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 10, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 11, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 12, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 13, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 14, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 15, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 16, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 17, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 18, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 19, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 20, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 21, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 24, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 25, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 21, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 24, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 25, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_D_X @ Navi1x + { 3, 5, 21, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_D_X @ Navi1x + { 3, 1, 22, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_D_X @ Navi1x + { 3, 2, 23, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_D_X @ Navi1x + { 3, 6, 24, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_D_X @ Navi1x + { 3, 7, 25, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_D_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_S3_PATINFO[] = +{ + { 1, 29, 131, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_S3 @ Navi1x + { 1, 29, 131, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_S3 @ Navi1x + { 1, 30, 132, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_S3 @ Navi1x + { 1, 31, 133, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_S3 @ Navi1x + { 1, 32, 134, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_S3 @ Navi1x + { 1, 33, 135, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_S3 @ Navi1x +}; + +const ADDR_SW_PATINFO SW_4K_S3_X_PATINFO[] = +{ + { 1, 29, 131, 0, 0, } , // 1 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 1, 30, 132, 0, 0, } , // 1 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 1, 31, 133, 0, 0, } , // 1 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 1, 32, 134, 0, 0, } , // 1 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 1, 33, 135, 0, 0, } , // 1 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 136, 0, 0, } , // 2 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 137, 0, 0, } , // 2 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 138, 0, 0, } , // 2 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 139, 0, 0, } , // 2 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 140, 0, 0, } , // 2 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 141, 0, 0, } , // 4 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 142, 0, 0, } , // 4 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 143, 0, 0, } , // 4 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 144, 0, 0, } , // 4 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 145, 0, 0, } , // 4 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 146, 0, 0, } , // 8 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 147, 0, 0, } , // 8 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 148, 0, 0, } , // 8 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 149, 0, 0, } , // 8 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 150, 0, 0, } , // 8 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 151, 0, 0, } , // 16 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 152, 0, 0, } , // 16 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 153, 0, 0, } , // 16 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 154, 0, 0, } , // 16 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 155, 0, 0, } , // 16 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 151, 0, 0, } , // 32 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 152, 0, 0, } , // 32 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 153, 0, 0, } , // 32 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 154, 0, 0, } , // 32 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 155, 0, 0, } , // 32 pipes 16 bpe @ SW_4K_S3_X @ Navi1x + { 3, 29, 151, 0, 0, } , // 64 pipes 1 bpe @ SW_4K_S3_X @ Navi1x + { 3, 30, 152, 0, 0, } , // 64 pipes 2 bpe @ SW_4K_S3_X @ Navi1x + { 3, 31, 153, 0, 0, } , // 64 pipes 4 bpe @ SW_4K_S3_X @ Navi1x + { 3, 32, 154, 0, 0, } , // 64 pipes 8 bpe @ SW_4K_S3_X @ Navi1x + { 3, 33, 155, 0, 0, } , // 64 pipes 16 bpe @ SW_4K_S3_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 32 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 32 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 32 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 32 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 32 pipes 16 bpe @ SW_64K_S @ Navi1x + { 1, 0, 1, 1, 0, } , // 64 pipes 1 bpe @ SW_64K_S @ Navi1x + { 1, 1, 2, 2, 0, } , // 64 pipes 2 bpe @ SW_64K_S @ Navi1x + { 1, 2, 3, 3, 0, } , // 64 pipes 4 bpe @ SW_64K_S @ Navi1x + { 1, 3, 4, 4, 0, } , // 64 pipes 8 bpe @ SW_64K_S @ Navi1x + { 1, 4, 5, 5, 0, } , // 64 pipes 16 bpe @ SW_64K_S @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_D_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 32 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 32 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 32 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 32 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 32 pipes 16 bpe @ SW_64K_D @ Navi1x + { 1, 5, 1, 1, 0, } , // 64 pipes 1 bpe @ SW_64K_D @ Navi1x + { 1, 1, 2, 2, 0, } , // 64 pipes 2 bpe @ SW_64K_D @ Navi1x + { 1, 2, 3, 3, 0, } , // 64 pipes 4 bpe @ SW_64K_D @ Navi1x + { 1, 6, 4, 4, 0, } , // 64 pipes 8 bpe @ SW_64K_D @ Navi1x + { 1, 7, 5, 5, 0, } , // 64 pipes 16 bpe @ SW_64K_D @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S_T_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 1, 3, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 1, 4, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 36, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 37, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 38, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 39, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 40, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 41, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 42, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 43, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 44, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 45, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 46, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 47, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 48, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 49, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 50, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 51, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 52, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 53, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 54, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 55, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 56, 16, 0, } , // 32 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 57, 17, 0, } , // 32 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 58, 18, 0, } , // 32 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 59, 19, 0, } , // 32 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 60, 20, 0, } , // 32 pipes 16 bpe @ SW_64K_S_T @ Navi1x + { 2, 0, 1, 21, 0, } , // 64 pipes 1 bpe @ SW_64K_S_T @ Navi1x + { 2, 1, 2, 22, 0, } , // 64 pipes 2 bpe @ SW_64K_S_T @ Navi1x + { 2, 2, 3, 23, 0, } , // 64 pipes 4 bpe @ SW_64K_S_T @ Navi1x + { 2, 3, 4, 24, 0, } , // 64 pipes 8 bpe @ SW_64K_S_T @ Navi1x + { 2, 4, 5, 25, 0, } , // 64 pipes 16 bpe @ SW_64K_S_T @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_D_T_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 1, 6, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 1, 7, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 36, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 37, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 38, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 39, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 40, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 41, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 42, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 43, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 44, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 45, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 46, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 47, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 48, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 49, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 50, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 51, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 52, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 53, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 54, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 55, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 56, 16, 0, } , // 32 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 57, 17, 0, } , // 32 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 58, 18, 0, } , // 32 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 59, 19, 0, } , // 32 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 60, 20, 0, } , // 32 pipes 16 bpe @ SW_64K_D_T @ Navi1x + { 2, 5, 1, 21, 0, } , // 64 pipes 1 bpe @ SW_64K_D_T @ Navi1x + { 2, 1, 2, 22, 0, } , // 64 pipes 2 bpe @ SW_64K_D_T @ Navi1x + { 2, 2, 3, 23, 0, } , // 64 pipes 4 bpe @ SW_64K_D_T @ Navi1x + { 2, 6, 4, 24, 0, } , // 64 pipes 8 bpe @ SW_64K_D_T @ Navi1x + { 2, 7, 5, 25, 0, } , // 64 pipes 16 bpe @ SW_64K_D_T @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S_X_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 1, 3, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 1, 4, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 6, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 7, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 8, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 9, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 10, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 11, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 12, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 13, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 14, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 15, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 16, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 17, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 18, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 19, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 20, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 21, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 22, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 23, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 24, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 25, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 26, 6, 0, } , // 32 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 27, 7, 0, } , // 32 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 28, 8, 0, } , // 32 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 29, 9, 0, } , // 32 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 30, 10, 0, } , // 32 pipes 16 bpe @ SW_64K_S_X @ Navi1x + { 3, 0, 31, 11, 0, } , // 64 pipes 1 bpe @ SW_64K_S_X @ Navi1x + { 3, 1, 32, 12, 0, } , // 64 pipes 2 bpe @ SW_64K_S_X @ Navi1x + { 3, 2, 33, 13, 0, } , // 64 pipes 4 bpe @ SW_64K_S_X @ Navi1x + { 3, 3, 34, 14, 0, } , // 64 pipes 8 bpe @ SW_64K_S_X @ Navi1x + { 3, 4, 35, 15, 0, } , // 64 pipes 16 bpe @ SW_64K_S_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_D_X_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 1, 6, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 1, 7, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 6, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 7, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 8, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 9, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 10, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 11, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 12, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 13, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 14, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 15, 5, 0, } , // 4 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 16, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 17, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 18, 3, 0, } , // 8 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 19, 4, 0, } , // 8 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 20, 5, 0, } , // 8 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 21, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 22, 2, 0, } , // 16 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 23, 3, 0, } , // 16 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 24, 4, 0, } , // 16 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 25, 5, 0, } , // 16 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 26, 6, 0, } , // 32 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 27, 7, 0, } , // 32 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 28, 8, 0, } , // 32 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 29, 9, 0, } , // 32 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 30, 10, 0, } , // 32 pipes 16 bpe @ SW_64K_D_X @ Navi1x + { 3, 5, 31, 11, 0, } , // 64 pipes 1 bpe @ SW_64K_D_X @ Navi1x + { 3, 1, 32, 12, 0, } , // 64 pipes 2 bpe @ SW_64K_D_X @ Navi1x + { 3, 2, 33, 13, 0, } , // 64 pipes 4 bpe @ SW_64K_D_X @ Navi1x + { 3, 6, 34, 14, 0, } , // 64 pipes 8 bpe @ SW_64K_D_X @ Navi1x + { 3, 7, 35, 15, 0, } , // 64 pipes 16 bpe @ SW_64K_D_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_R_X_1xaa_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 1, 1, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 1, 2, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 1, 6, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 1, 7, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 61, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 62, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 8, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 63, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 64, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 65, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 66, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 67, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 68, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 69, 26, 0, } , // 4 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 70, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 71, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 72, 27, 0, } , // 8 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 72, 28, 0, } , // 8 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 73, 29, 0, } , // 8 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 74, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 74, 30, 0, } , // 16 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 74, 31, 0, } , // 16 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 74, 32, 0, } , // 16 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 74, 33, 0, } , // 16 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 75, 6, 0, } , // 32 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 75, 34, 0, } , // 32 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 75, 35, 0, } , // 32 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 75, 36, 0, } , // 32 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 76, 37, 0, } , // 32 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 28, 77, 11, 0, } , // 64 pipes 1 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 1, 77, 38, 0, } , // 64 pipes 2 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 2, 77, 39, 0, } , // 64 pipes 4 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 6, 78, 40, 0, } , // 64 pipes 8 bpe @ SW_64K_R_X 1xaa @ Navi1x + { 3, 7, 79, 41, 0, } , // 64 pipes 16 bpe @ SW_64K_R_X 1xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_R_X_2xaa_PATINFO[] = +{ + { 2, 5, 1, 99, 0, } , // 1 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 2, 1, 2, 100, 0, } , // 1 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 2, 2, 3, 101, 0, } , // 1 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 2, 6, 4, 102, 0, } , // 1 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 2, 7, 5, 103, 0, } , // 1 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 61, 99, 0, } , // 2 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 62, 100, 0, } , // 2 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 8, 101, 0, } , // 2 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 63, 102, 0, } , // 2 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 64, 103, 0, } , // 2 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 65, 99, 0, } , // 4 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 66, 100, 0, } , // 4 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 67, 101, 0, } , // 4 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 68, 102, 0, } , // 4 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 69, 104, 0, } , // 4 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 70, 99, 0, } , // 8 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 71, 100, 0, } , // 8 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 72, 105, 0, } , // 8 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 72, 106, 0, } , // 8 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 73, 107, 0, } , // 8 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 74, 99, 0, } , // 16 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 74, 108, 0, } , // 16 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 74, 109, 0, } , // 16 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 74, 107, 0, } , // 16 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 113, 33, 0, } , // 16 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 75, 110, 0, } , // 32 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 75, 111, 0, } , // 32 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 75, 112, 0, } , // 32 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 76, 113, 0, } , // 32 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 114, 37, 0, } , // 32 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 28, 78, 114, 0, } , // 64 pipes 1 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 1, 78, 115, 0, } , // 64 pipes 2 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 2, 78, 116, 0, } , // 64 pipes 4 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 6, 79, 117, 0, } , // 64 pipes 8 bpe @ SW_64K_R_X 2xaa @ Navi1x + { 3, 7, 115, 41, 0, } , // 64 pipes 16 bpe @ SW_64K_R_X 2xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_R_X_4xaa_PATINFO[] = +{ + { 2, 5, 1, 118, 0, } , // 1 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 2, 1, 2, 119, 0, } , // 1 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 2, 2, 3, 120, 0, } , // 1 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 2, 6, 4, 121, 0, } , // 1 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 2, 7, 5, 122, 0, } , // 1 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 61, 118, 0, } , // 2 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 62, 119, 0, } , // 2 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 8, 120, 0, } , // 2 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 63, 121, 0, } , // 2 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 64, 122, 0, } , // 2 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 65, 118, 0, } , // 4 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 66, 119, 0, } , // 4 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 67, 120, 0, } , // 4 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 68, 121, 0, } , // 4 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 69, 123, 0, } , // 4 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 70, 118, 0, } , // 8 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 71, 119, 0, } , // 8 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 72, 124, 0, } , // 8 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 93, 125, 0, } , // 8 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 116, 107, 0, } , // 8 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 74, 118, 0, } , // 16 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 74, 126, 0, } , // 16 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 74, 127, 0, } , // 16 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 117, 107, 0, } , // 16 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 118, 33, 0, } , // 16 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 76, 128, 0, } , // 32 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 76, 129, 0, } , // 32 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 76, 130, 0, } , // 32 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 119, 113, 0, } , // 32 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 120, 37, 0, } , // 32 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 28, 79, 131, 0, } , // 64 pipes 1 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 1, 79, 132, 0, } , // 64 pipes 2 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 2, 79, 133, 0, } , // 64 pipes 4 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 6, 121, 117, 0, } , // 64 pipes 8 bpe @ SW_64K_R_X 4xaa @ Navi1x + { 3, 7, 122, 41, 0, } , // 64 pipes 16 bpe @ SW_64K_R_X 4xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_R_X_8xaa_PATINFO[] = +{ + { 2, 5, 1, 134, 0, } , // 1 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 2, 1, 2, 135, 0, } , // 1 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 2, 2, 3, 135, 0, } , // 1 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 2, 6, 4, 136, 0, } , // 1 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 2, 7, 5, 136, 0, } , // 1 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 61, 134, 0, } , // 2 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 62, 135, 0, } , // 2 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 8, 135, 0, } , // 2 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 63, 136, 0, } , // 2 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 64, 136, 0, } , // 2 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 65, 134, 0, } , // 4 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 66, 135, 0, } , // 4 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 67, 135, 0, } , // 4 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 68, 136, 0, } , // 4 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 102, 137, 0, } , // 4 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 70, 134, 0, } , // 8 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 71, 135, 0, } , // 8 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 72, 138, 0, } , // 8 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 123, 139, 0, } , // 8 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 124, 140, 0, } , // 8 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 105, 134, 0, } , // 16 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 105, 138, 0, } , // 16 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 125, 127, 0, } , // 16 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 126, 107, 0, } , // 16 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 126, 141, 0, } , // 16 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 107, 142, 0, } , // 32 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 108, 143, 0, } , // 32 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 127, 130, 0, } , // 32 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 128, 113, 0, } , // 32 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 128, 144, 0, } , // 32 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 28, 110, 145, 0, } , // 64 pipes 1 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 1, 111, 146, 0, } , // 64 pipes 2 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 2, 129, 133, 0, } , // 64 pipes 4 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 6, 130, 117, 0, } , // 64 pipes 8 bpe @ SW_64K_R_X 8xaa @ Navi1x + { 3, 7, 130, 147, 0, } , // 64 pipes 16 bpe @ SW_64K_R_X 8xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_1xaa_PATINFO[] = +{ + { 1, 8, 1, 1, 0, } , // 1 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 1, 9, 2, 2, 0, } , // 1 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 1, 10, 3, 3, 0, } , // 1 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 1, 11, 4, 4, 0, } , // 1 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 1, 7, 5, 5, 0, } , // 1 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 61, 1, 0, } , // 2 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 62, 2, 0, } , // 2 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 8, 3, 0, } , // 2 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 63, 4, 0, } , // 2 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 64, 5, 0, } , // 2 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 65, 1, 0, } , // 4 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 66, 2, 0, } , // 4 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 67, 3, 0, } , // 4 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 68, 4, 0, } , // 4 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 69, 26, 0, } , // 4 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 70, 1, 0, } , // 8 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 71, 2, 0, } , // 8 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 72, 27, 0, } , // 8 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 72, 28, 0, } , // 8 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 73, 29, 0, } , // 8 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 74, 1, 0, } , // 16 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 74, 30, 0, } , // 16 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 74, 31, 0, } , // 16 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 74, 32, 0, } , // 16 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 74, 33, 0, } , // 16 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 75, 6, 0, } , // 32 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 75, 34, 0, } , // 32 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 75, 35, 0, } , // 32 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 75, 36, 0, } , // 32 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 76, 37, 0, } , // 32 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 12, 77, 11, 0, } , // 64 pipes 1 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 9, 77, 38, 0, } , // 64 pipes 2 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 10, 77, 39, 0, } , // 64 pipes 4 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 11, 78, 40, 0, } , // 64 pipes 8 bpe @ SW_64K_Z_X 1xaa @ Navi1x + { 3, 7, 79, 41, 0, } , // 64 pipes 16 bpe @ SW_64K_Z_X 1xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_2xaa_PATINFO[] = +{ + { 1, 13, 80, 42, 0, } , // 1 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 1, 14, 3, 3, 0, } , // 1 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 2, 15, 3, 43, 0, } , // 1 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 2, 16, 81, 44, 0, } , // 1 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 2, 17, 5, 45, 0, } , // 1 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 82, 42, 0, } , // 2 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 8, 3, 0, } , // 2 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 8, 43, 0, } , // 2 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 83, 44, 0, } , // 2 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 64, 45, 0, } , // 2 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 84, 42, 0, } , // 4 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 67, 3, 0, } , // 4 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 67, 43, 0, } , // 4 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 85, 44, 0, } , // 4 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 69, 46, 0, } , // 4 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 86, 42, 0, } , // 8 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 72, 27, 0, } , // 8 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 72, 47, 0, } , // 8 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 73, 48, 0, } , // 8 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 73, 49, 0, } , // 8 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 74, 50, 0, } , // 16 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 74, 31, 0, } , // 16 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 74, 51, 0, } , // 16 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 74, 52, 0, } , // 16 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 87, 53, 0, } , // 16 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 75, 54, 0, } , // 32 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 75, 35, 0, } , // 32 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 75, 55, 0, } , // 32 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 76, 56, 0, } , // 32 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 88, 57, 0, } , // 32 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 13, 78, 58, 0, } , // 64 pipes 1 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 14, 78, 59, 0, } , // 64 pipes 2 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 15, 78, 60, 0, } , // 64 pipes 4 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 16, 79, 41, 0, } , // 64 pipes 8 bpe @ SW_64K_Z_X 2xaa @ Navi1x + { 3, 17, 89, 61, 0, } , // 64 pipes 16 bpe @ SW_64K_Z_X 2xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_4xaa_PATINFO[] = +{ + { 1, 18, 3, 3, 0, } , // 1 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 2, 19, 90, 62, 0, } , // 1 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 2, 20, 3, 63, 0, } , // 1 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 2, 21, 4, 64, 0, } , // 1 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 2, 22, 5, 65, 0, } , // 1 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 8, 3, 0, } , // 2 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 91, 62, 0, } , // 2 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 8, 66, 0, } , // 2 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 63, 67, 0, } , // 2 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 64, 68, 0, } , // 2 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 67, 3, 0, } , // 4 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 92, 62, 0, } , // 4 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 67, 63, 0, } , // 4 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 68, 64, 0, } , // 4 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 69, 69, 0, } , // 4 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 72, 27, 0, } , // 8 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 72, 70, 0, } , // 8 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 72, 71, 0, } , // 8 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 93, 72, 0, } , // 8 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 94, 73, 0, } , // 8 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 74, 31, 0, } , // 16 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 74, 74, 0, } , // 16 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 74, 75, 0, } , // 16 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 95, 76, 0, } , // 16 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 96, 76, 0, } , // 16 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 76, 77, 0, } , // 32 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 76, 78, 0, } , // 32 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 76, 56, 0, } , // 32 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 97, 79, 0, } , // 32 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 98, 79, 0, } , // 32 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 18, 79, 80, 0, } , // 64 pipes 1 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 19, 79, 81, 0, } , // 64 pipes 2 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 20, 79, 41, 0, } , // 64 pipes 4 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 21, 99, 82, 0, } , // 64 pipes 8 bpe @ SW_64K_Z_X 4xaa @ Navi1x + { 3, 22, 100, 82, 0, } , // 64 pipes 16 bpe @ SW_64K_Z_X 4xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_8xaa_PATINFO[] = +{ + { 2, 23, 3, 43, 0, } , // 1 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 2, 24, 3, 63, 0, } , // 1 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 2, 25, 3, 83, 0, } , // 1 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 2, 26, 81, 84, 0, } , // 1 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 2, 27, 5, 85, 0, } , // 1 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 8, 43, 0, } , // 2 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 8, 66, 0, } , // 2 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 8, 86, 0, } , // 2 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 101, 87, 0, } , // 2 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 64, 88, 0, } , // 2 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 67, 43, 0, } , // 4 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 67, 63, 0, } , // 4 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 67, 83, 0, } , // 4 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 85, 84, 0, } , // 4 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 102, 89, 0, } , // 4 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 72, 47, 0, } , // 8 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 72, 71, 0, } , // 8 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 72, 90, 0, } , // 8 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 103, 91, 0, } , // 8 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 104, 92, 0, } , // 8 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 105, 51, 0, } , // 16 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 105, 75, 0, } , // 16 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 87, 93, 0, } , // 16 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 96, 76, 0, } , // 16 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 106, 94, 0, } , // 16 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 107, 95, 0, } , // 32 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 108, 56, 0, } , // 32 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 88, 57, 0, } , // 32 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 98, 79, 0, } , // 32 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 109, 96, 0, } , // 32 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 23, 110, 97, 0, } , // 64 pipes 1 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 24, 111, 41, 0, } , // 64 pipes 2 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 25, 89, 61, 0, } , // 64 pipes 4 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 26, 100, 82, 0, } , // 64 pipes 8 bpe @ SW_64K_Z_X 8xaa @ Navi1x + { 3, 27, 112, 98, 0, } , // 64 pipes 16 bpe @ SW_64K_Z_X 8xaa @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S3_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 1 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 1 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 1 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 1 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 2 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 2 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 2 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 2 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 2 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 4 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 4 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 4 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 4 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 4 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 8 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 8 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 8 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 8 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 8 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 16 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 16 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 16 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 16 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 16 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 32 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 32 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 32 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 32 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 32 pipes 16 bpe @ SW_64K_S3 @ Navi1x + { 1, 29, 131, 148, 0, } , // 64 pipes 1 bpe @ SW_64K_S3 @ Navi1x + { 1, 30, 132, 149, 0, } , // 64 pipes 2 bpe @ SW_64K_S3 @ Navi1x + { 1, 31, 133, 150, 0, } , // 64 pipes 4 bpe @ SW_64K_S3 @ Navi1x + { 1, 32, 134, 151, 0, } , // 64 pipes 8 bpe @ SW_64K_S3 @ Navi1x + { 1, 33, 135, 152, 0, } , // 64 pipes 16 bpe @ SW_64K_S3 @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S3_X_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 1, 30, 132, 149, 0, } , // 1 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 1, 31, 133, 150, 0, } , // 1 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 1, 32, 134, 151, 0, } , // 1 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 1, 33, 135, 152, 0, } , // 1 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 136, 148, 0, } , // 2 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 137, 149, 0, } , // 2 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 138, 150, 0, } , // 2 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 139, 151, 0, } , // 2 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 140, 152, 0, } , // 2 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 141, 148, 0, } , // 4 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 142, 149, 0, } , // 4 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 143, 150, 0, } , // 4 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 144, 151, 0, } , // 4 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 145, 152, 0, } , // 4 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 146, 148, 0, } , // 8 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 147, 149, 0, } , // 8 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 148, 150, 0, } , // 8 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 149, 151, 0, } , // 8 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 150, 152, 0, } , // 8 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 151, 148, 0, } , // 16 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 152, 149, 0, } , // 16 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 153, 150, 0, } , // 16 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 154, 151, 0, } , // 16 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 155, 152, 0, } , // 16 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 156, 153, 0, } , // 32 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 157, 154, 0, } , // 32 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 158, 155, 0, } , // 32 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 159, 156, 0, } , // 32 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 160, 157, 0, } , // 32 pipes 16 bpe @ SW_64K_S3_X @ Navi1x + { 3, 29, 161, 158, 0, } , // 64 pipes 1 bpe @ SW_64K_S3_X @ Navi1x + { 3, 30, 162, 159, 0, } , // 64 pipes 2 bpe @ SW_64K_S3_X @ Navi1x + { 3, 31, 163, 160, 0, } , // 64 pipes 4 bpe @ SW_64K_S3_X @ Navi1x + { 3, 32, 164, 161, 0, } , // 64 pipes 8 bpe @ SW_64K_S3_X @ Navi1x + { 3, 33, 165, 162, 0, } , // 64 pipes 16 bpe @ SW_64K_S3_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_S3_T_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 1, 30, 132, 149, 0, } , // 1 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 1, 31, 133, 150, 0, } , // 1 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 1, 32, 134, 151, 0, } , // 1 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 1, 33, 135, 152, 0, } , // 1 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 136, 148, 0, } , // 2 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 137, 149, 0, } , // 2 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 138, 150, 0, } , // 2 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 139, 151, 0, } , // 2 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 140, 152, 0, } , // 2 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 141, 148, 0, } , // 4 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 142, 149, 0, } , // 4 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 143, 150, 0, } , // 4 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 144, 151, 0, } , // 4 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 145, 152, 0, } , // 4 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 166, 148, 0, } , // 8 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 167, 149, 0, } , // 8 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 168, 150, 0, } , // 8 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 169, 151, 0, } , // 8 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 170, 152, 0, } , // 8 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 171, 148, 0, } , // 16 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 172, 149, 0, } , // 16 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 173, 150, 0, } , // 16 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 174, 151, 0, } , // 16 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 175, 152, 0, } , // 16 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 176, 153, 0, } , // 32 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 177, 154, 0, } , // 32 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 178, 155, 0, } , // 32 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 179, 156, 0, } , // 32 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 180, 157, 0, } , // 32 pipes 16 bpe @ SW_64K_S3_T @ Navi1x + { 3, 29, 131, 163, 0, } , // 64 pipes 1 bpe @ SW_64K_S3_T @ Navi1x + { 3, 30, 132, 164, 0, } , // 64 pipes 2 bpe @ SW_64K_S3_T @ Navi1x + { 3, 31, 133, 165, 0, } , // 64 pipes 4 bpe @ SW_64K_S3_T @ Navi1x + { 3, 32, 134, 166, 0, } , // 64 pipes 8 bpe @ SW_64K_S3_T @ Navi1x + { 3, 33, 135, 167, 0, } , // 64 pipes 16 bpe @ SW_64K_S3_T @ Navi1x +}; + +const ADDR_SW_PATINFO SW_64K_D3_X_PATINFO[] = +{ + { 1, 34, 131, 148, 0, } , // 1 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 1, 35, 132, 149, 0, } , // 1 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 1, 36, 133, 150, 0, } , // 1 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 1, 37, 134, 151, 0, } , // 1 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 1, 38, 135, 152, 0, } , // 1 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 2, 34, 181, 148, 0, } , // 2 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 2, 35, 182, 149, 0, } , // 2 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 2, 36, 183, 150, 0, } , // 2 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 2, 37, 184, 168, 0, } , // 2 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 2, 38, 185, 169, 0, } , // 2 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 2, 34, 186, 170, 0, } , // 4 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 2, 35, 186, 171, 0, } , // 4 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 2, 36, 187, 172, 0, } , // 4 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 2, 37, 188, 169, 0, } , // 4 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 3, 38, 189, 169, 0, } , // 4 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 2, 34, 190, 173, 0, } , // 8 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 3, 35, 191, 171, 0, } , // 8 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 3, 36, 192, 172, 0, } , // 8 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 3, 37, 193, 169, 0, } , // 8 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 3, 38, 194, 169, 0, } , // 8 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 3, 34, 195, 174, 0, } , // 16 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 3, 35, 196, 171, 0, } , // 16 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 3, 36, 197, 172, 0, } , // 16 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 3, 37, 198, 169, 0, } , // 16 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 3, 38, 199, 169, 0, } , // 16 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 3, 34, 200, 175, 0, } , // 32 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 3, 35, 201, 176, 0, } , // 32 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 3, 36, 202, 177, 0, } , // 32 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 3, 37, 203, 178, 0, } , // 32 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 3, 38, 204, 178, 0, } , // 32 pipes 16 bpe @ SW_64K_D3_X @ Navi1x + { 3, 34, 205, 179, 0, } , // 64 pipes 1 bpe @ SW_64K_D3_X @ Navi1x + { 3, 35, 206, 180, 0, } , // 64 pipes 2 bpe @ SW_64K_D3_X @ Navi1x + { 3, 36, 207, 181, 0, } , // 64 pipes 4 bpe @ SW_64K_D3_X @ Navi1x + { 3, 37, 208, 182, 0, } , // 64 pipes 8 bpe @ SW_64K_D3_X @ Navi1x + { 3, 38, 209, 182, 0, } , // 64 pipes 16 bpe @ SW_64K_D3_X @ Navi1x +}; + +const ADDR_SW_PATINFO SW_256_S_RBPLUS_PATINFO[] = +{ + { 1, 0, 0, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_256_S @ RbPlus + { 1, 0, 0, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_256_S @ RbPlus + { 1, 1, 0, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_256_S @ RbPlus + { 1, 2, 0, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_256_S @ RbPlus + { 1, 3, 0, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_256_S @ RbPlus + { 1, 4, 0, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_256_S @ RbPlus +}; + +const ADDR_SW_PATINFO SW_256_D_RBPLUS_PATINFO[] = +{ + { 1, 5, 0, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_256_D @ RbPlus + { 1, 5, 0, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_256_D @ RbPlus + { 1, 1, 0, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_256_D @ RbPlus + { 1, 39, 0, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_256_D @ RbPlus + { 1, 6, 0, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_256_D @ RbPlus + { 1, 7, 0, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_256_D @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_S_RBPLUS_PATINFO[] = +{ + { 1, 0, 1, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_S @ RbPlus + { 1, 0, 1, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_S @ RbPlus + { 1, 1, 2, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_S @ RbPlus + { 1, 2, 3, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_S @ RbPlus + { 1, 3, 4, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_S @ RbPlus + { 1, 4, 5, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_S @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_D_RBPLUS_PATINFO[] = +{ + { 1, 5, 1, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_D @ RbPlus + { 1, 5, 1, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_D @ RbPlus + { 1, 1, 2, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_D @ RbPlus + { 1, 39, 3, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_D @ RbPlus + { 1, 6, 4, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_D @ RbPlus + { 1, 7, 5, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_D @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_S_X_RBPLUS_PATINFO[] = +{ + { 1, 0, 1, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 1, 1, 2, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 1, 2, 3, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 1, 3, 4, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 1, 4, 5, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 6, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 7, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 8, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 9, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 10, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 210, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 211, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 212, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 213, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 214, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 215, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 216, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 217, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 218, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 219, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 11, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 12, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 13, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 14, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 15, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 220, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 221, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 222, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 223, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 224, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 225, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 226, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 227, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 228, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 229, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 16, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 17, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 18, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 19, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 20, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 230, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 231, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 232, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 233, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 234, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 235, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 236, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 237, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 238, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 239, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 21, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 22, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 23, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 24, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 25, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 240, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 241, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 242, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 243, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 244, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 245, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 246, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 247, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 248, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 249, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 21, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 22, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 23, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 24, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 25, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus + { 3, 0, 240, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_S_X @ RbPlus + { 3, 1, 241, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_S_X @ RbPlus + { 3, 2, 242, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_S_X @ RbPlus + { 3, 3, 243, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_S_X @ RbPlus + { 3, 4, 244, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_S_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_D_X_RBPLUS_PATINFO[] = +{ + { 1, 5, 1, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 1, 1, 2, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 1, 39, 3, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 1, 6, 4, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 1, 7, 5, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 6, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 7, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 8, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 9, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 10, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 210, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 211, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 212, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 213, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 214, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 215, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 216, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 217, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 218, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 219, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 11, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 12, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 13, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 14, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 15, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 220, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 221, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 222, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 223, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 224, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 225, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 226, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 227, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 228, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 229, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 16, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 17, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 18, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 19, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 20, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 230, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 231, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 232, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 233, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 234, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 235, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 236, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 237, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 238, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 239, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 21, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 22, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 23, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 24, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 25, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 240, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 241, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 242, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 243, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 244, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 245, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 246, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 247, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 248, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 249, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 21, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 22, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 23, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 24, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 25, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus + { 3, 5, 240, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_D_X @ RbPlus + { 3, 1, 241, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_D_X @ RbPlus + { 3, 39, 242, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_D_X @ RbPlus + { 3, 6, 243, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_D_X @ RbPlus + { 3, 7, 244, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_D_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_S3_RBPLUS_PATINFO[] = +{ + { 1, 29, 131, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus + { 1, 29, 131, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_S3 @ RbPlus + { 1, 30, 132, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_S3 @ RbPlus + { 1, 31, 133, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_S3 @ RbPlus + { 1, 32, 134, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_S3 @ RbPlus + { 1, 33, 135, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_S3 @ RbPlus +}; + +const ADDR_SW_PATINFO SW_4K_S3_X_RBPLUS_PATINFO[] = +{ + { 1, 29, 131, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 1, 30, 132, 0, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 1, 31, 133, 0, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 1, 32, 134, 0, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 1, 33, 135, 0, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 136, 0, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 137, 0, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 138, 0, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 139, 0, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 140, 0, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 141, 0, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 142, 0, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 143, 0, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 144, 0, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 145, 0, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 146, 0, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 147, 0, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 148, 0, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 149, 0, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 150, 0, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 141, 0, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 142, 0, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 143, 0, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 144, 0, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 145, 0, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 146, 0, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 147, 0, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 148, 0, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 149, 0, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 150, 0, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 146, 0, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 147, 0, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 148, 0, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 149, 0, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 150, 0, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus + { 3, 29, 151, 0, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_4K_S3_X @ RbPlus + { 3, 30, 152, 0, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_4K_S3_X @ RbPlus + { 3, 31, 153, 0, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_4K_S3_X @ RbPlus + { 3, 32, 154, 0, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_4K_S3_X @ RbPlus + { 3, 33, 155, 0, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_4K_S3_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S_RBPLUS_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S @ RbPlus + { 1, 0, 1, 1, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S @ RbPlus + { 1, 1, 2, 2, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S @ RbPlus + { 1, 2, 3, 3, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S @ RbPlus + { 1, 3, 4, 4, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S @ RbPlus + { 1, 4, 5, 5, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_D_RBPLUS_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_D @ RbPlus + { 1, 5, 1, 1, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_D @ RbPlus + { 1, 1, 2, 2, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_D @ RbPlus + { 1, 39, 3, 3, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_D @ RbPlus + { 1, 6, 4, 4, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_D @ RbPlus + { 1, 7, 5, 5, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_D @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S_T_RBPLUS_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 1, 2, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 1, 3, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 1, 4, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 36, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 37, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 38, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 39, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 40, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 41, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 42, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 43, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 44, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 45, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 46, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 48, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 49, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 50, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 41, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 42, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 43, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 44, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 45, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 46, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 48, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 49, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 50, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 51, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 53, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 54, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 55, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 46, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 48, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 49, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 50, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 51, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 53, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 54, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 55, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 56, 16, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 58, 18, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 59, 19, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 60, 20, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 51, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 53, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 54, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 55, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 56, 16, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 58, 18, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 59, 19, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 60, 20, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 1, 21, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 2, 22, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 3, 23, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 4, 24, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 5, 25, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 56, 16, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 58, 18, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 59, 19, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 60, 20, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus + { 2, 0, 1, 21, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S_T @ RbPlus + { 2, 1, 2, 22, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S_T @ RbPlus + { 2, 2, 3, 23, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S_T @ RbPlus + { 2, 3, 4, 24, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S_T @ RbPlus + { 2, 4, 5, 25, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S_T @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_D_T_RBPLUS_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 1, 39, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 1, 6, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 1, 7, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 36, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 37, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 38, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 39, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 40, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 41, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 42, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 43, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 44, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 45, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 46, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 48, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 49, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 50, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 41, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 42, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 43, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 44, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 45, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 46, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 48, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 49, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 50, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 51, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 53, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 54, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 55, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 46, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 47, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 48, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 49, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 50, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 51, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 53, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 54, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 55, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 56, 16, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 58, 18, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 59, 19, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 60, 20, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 51, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 52, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 53, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 54, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 55, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 56, 16, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 58, 18, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 59, 19, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 60, 20, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 1, 21, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 2, 22, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 3, 23, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 4, 24, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 5, 25, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 56, 16, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 57, 17, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 58, 18, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 59, 19, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 60, 20, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus + { 2, 5, 1, 21, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_D_T @ RbPlus + { 2, 1, 2, 22, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_D_T @ RbPlus + { 2, 39, 3, 23, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_D_T @ RbPlus + { 2, 6, 4, 24, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_D_T @ RbPlus + { 2, 7, 5, 25, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_D_T @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S_X_RBPLUS_PATINFO[] = +{ + { 1, 0, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 1, 2, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 1, 3, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 1, 4, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 6, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 7, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 8, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 9, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 10, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 210, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 211, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 212, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 213, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 214, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 215, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 216, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 217, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 218, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 219, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 11, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 12, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 13, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 14, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 15, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 220, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 221, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 222, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 223, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 224, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 225, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 226, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 227, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 228, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 229, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 16, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 17, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 18, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 19, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 20, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 230, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 231, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 232, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 233, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 234, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 250, 6, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 251, 7, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 252, 8, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 253, 9, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 254, 10, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 21, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 22, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 23, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 24, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 25, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 255, 6, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 256, 7, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 257, 8, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 258, 9, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 259, 10, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 260, 11, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 261, 12, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 262, 13, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 263, 14, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 264, 15, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 26, 6, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 27, 7, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 28, 8, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 29, 9, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 30, 10, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus + { 3, 0, 265, 11, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S_X @ RbPlus + { 3, 1, 266, 12, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S_X @ RbPlus + { 3, 2, 267, 13, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S_X @ RbPlus + { 3, 3, 268, 14, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S_X @ RbPlus + { 3, 4, 269, 15, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_D_X_RBPLUS_PATINFO[] = +{ + { 1, 5, 1, 1, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 1, 1, 2, 2, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 1, 39, 3, 3, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 1, 6, 4, 4, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 1, 7, 5, 5, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 6, 1, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 7, 2, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 8, 3, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 9, 4, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 10, 5, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 210, 1, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 211, 2, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 212, 3, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 213, 4, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 214, 5, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 215, 1, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 216, 2, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 217, 3, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 218, 4, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 219, 5, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 11, 1, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 12, 2, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 13, 3, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 14, 4, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 15, 5, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 220, 1, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 221, 2, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 222, 3, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 223, 4, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 224, 5, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 225, 1, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 226, 2, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 227, 3, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 228, 4, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 229, 5, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 16, 1, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 17, 2, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 18, 3, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 19, 4, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 20, 5, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 230, 1, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 231, 2, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 232, 3, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 233, 4, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 234, 5, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 250, 6, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 251, 7, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 252, 8, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 253, 9, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 254, 10, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 21, 1, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 22, 2, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 23, 3, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 24, 4, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 25, 5, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 255, 6, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 256, 7, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 257, 8, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 258, 9, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 259, 10, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 260, 11, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 261, 12, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 262, 13, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 263, 14, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 264, 15, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 26, 6, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 27, 7, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 28, 8, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 29, 9, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 30, 10, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus + { 3, 5, 265, 11, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_D_X @ RbPlus + { 3, 1, 266, 12, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_D_X @ RbPlus + { 3, 39, 267, 13, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_D_X @ RbPlus + { 3, 6, 268, 14, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_D_X @ RbPlus + { 3, 7, 269, 15, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_D_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_R_X_1xaa_RBPLUS_PATINFO[] = +{ + { 2, 0, 347, 193, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 2, 1, 348, 366, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 2, 39, 349, 195, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 2, 6, 350, 367, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 2, 7, 351, 368, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 352, 193, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 353, 194, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 354, 195, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 355, 369, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 356, 370, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 280, 193, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 281, 194, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 283, 196, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 284, 197, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 394, 219, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 395, 371, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 396, 372, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 397, 373, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 398, 374, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 290, 203, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 291, 204, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 292, 205, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 293, 206, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 294, 207, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 295, 219, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 296, 375, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 297, 376, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 298, 377, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 299, 378, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 399, 379, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 399, 380, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 399, 381, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 399, 382, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 399, 383, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 400, 669, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 401, 670, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 402, 671, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 304, 387, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 305, 388, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 307, 379, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 307, 389, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 307, 381, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 307, 382, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 307, 390, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 307, 672, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 307, 673, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 307, 674, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 307, 675, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 307, 676, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 309, 677, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 309, 678, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 309, 679, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 309, 399, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 323, 400, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 309, 680, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 309, 681, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 309, 682, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 309, 404, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 323, 405, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 309, 505, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 309, 506, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 309, 507, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 309, 683, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 323, 684, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 311, 685, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 311, 686, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 311, 687, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 318, 411, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 324, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 0, 311, 513, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 1, 311, 514, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 39, 311, 515, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 6, 318, 413, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 1xaa @ RbPlus + { 3, 7, 324, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 1xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_R_X_2xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 424, 526, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 348, 527, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 358, 528, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 350, 688, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 359, 689, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 352, 526, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 353, 527, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 354, 528, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 355, 688, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 356, 690, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 280, 526, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 281, 527, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 282, 528, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 283, 529, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 284, 530, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 394, 691, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 395, 692, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 396, 693, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 397, 694, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 425, 695, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 290, 534, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 291, 535, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 292, 536, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 293, 537, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 294, 538, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 295, 691, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 296, 696, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 297, 697, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 298, 698, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 299, 699, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 399, 700, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 399, 701, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 399, 702, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 399, 703, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 426, 429, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 400, 704, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 401, 705, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 402, 706, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 304, 707, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 364, 708, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 307, 700, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 307, 701, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 307, 702, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 307, 703, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 427, 390, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 307, 709, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 307, 710, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 307, 711, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 307, 712, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 427, 676, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 309, 713, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 309, 714, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 309, 715, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 323, 716, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 428, 400, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 309, 717, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 309, 718, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 309, 719, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 323, 720, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 428, 405, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 309, 721, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 309, 722, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 309, 723, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 323, 724, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 428, 684, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 318, 725, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 318, 726, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 318, 727, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 324, 728, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 429, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 0, 318, 729, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 1, 318, 730, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 39, 318, 731, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 6, 324, 732, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 2xaa @ RbPlus + { 3, 7, 429, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 2xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_R_X_4xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 347, 566, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 348, 733, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 349, 568, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 350, 734, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 351, 735, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 352, 566, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 353, 567, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 354, 568, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 355, 736, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 356, 737, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 280, 566, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 281, 567, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 282, 568, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 283, 569, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 284, 570, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 394, 587, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 395, 738, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 396, 739, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 397, 740, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 430, 741, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 290, 576, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 291, 577, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 292, 578, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 293, 579, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 405, 580, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 295, 587, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 296, 742, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 297, 743, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 298, 740, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 431, 699, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 399, 744, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 399, 745, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 399, 746, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 432, 747, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 433, 429, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 400, 748, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 401, 749, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 402, 750, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 434, 707, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 435, 708, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 307, 744, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 307, 751, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 307, 746, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 436, 703, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 437, 390, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 307, 752, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 307, 753, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 307, 754, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 436, 712, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 437, 676, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 323, 755, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 323, 756, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 323, 757, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 438, 716, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 439, 400, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 323, 758, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 323, 759, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 323, 760, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 438, 720, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 439, 405, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 323, 761, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 323, 762, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 323, 763, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 438, 724, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 439, 684, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 324, 764, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 324, 765, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 324, 766, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 440, 728, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 441, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 0, 324, 767, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 1, 324, 768, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 39, 324, 769, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 6, 440, 732, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 4xaa @ RbPlus + { 3, 7, 441, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 4xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_R_X_8xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 424, 619, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 348, 620, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 358, 621, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 350, 770, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 359, 771, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 352, 619, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 353, 620, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 354, 621, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 355, 770, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 378, 772, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 280, 619, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 281, 620, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 282, 621, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 283, 622, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 413, 623, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 394, 773, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 395, 774, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 442, 775, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 443, 776, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 444, 777, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 415, 629, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 291, 630, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 292, 631, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 416, 632, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 417, 580, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 295, 773, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 296, 778, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 297, 779, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 445, 780, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 446, 699, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 399, 781, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 399, 782, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 447, 783, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 448, 784, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 449, 429, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 450, 785, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 302, 786, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 303, 787, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 420, 788, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 451, 708, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 339, 781, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 339, 782, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 422, 746, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 452, 703, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 453, 390, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 339, 789, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 339, 790, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 422, 754, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 452, 712, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 453, 676, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 343, 791, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 341, 792, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 423, 757, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 454, 716, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 455, 400, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 343, 793, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 341, 794, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 423, 760, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 454, 720, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 455, 405, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 343, 795, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 341, 796, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 423, 763, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 454, 724, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 455, 684, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 344, 797, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 345, 798, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 456, 766, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 457, 728, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 458, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 0, 344, 799, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 1, 345, 800, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 39, 456, 769, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 6, 457, 732, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_R_X 8xaa @ RbPlus + { 3, 7, 458, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_R_X 8xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_1xaa_RBPLUS_PATINFO[] = +{ + { 2, 8, 347, 193, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 2, 9, 348, 366, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 2, 10, 349, 195, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 2, 11, 350, 367, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 2, 7, 351, 368, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 352, 193, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 353, 194, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 354, 195, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 355, 369, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 356, 370, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 280, 193, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 281, 194, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 283, 196, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 284, 197, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 285, 219, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 286, 371, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 287, 372, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 288, 373, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 289, 374, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 290, 203, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 291, 204, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 292, 205, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 293, 206, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 294, 207, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 295, 219, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 296, 375, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 297, 376, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 298, 377, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 299, 378, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 300, 379, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 300, 380, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 300, 381, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 300, 382, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 300, 383, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 301, 384, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 302, 385, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 303, 386, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 304, 387, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 305, 388, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 306, 379, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 306, 389, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 306, 381, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 307, 382, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 307, 390, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 306, 391, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 306, 392, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 306, 393, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 307, 394, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 307, 395, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 308, 396, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 308, 397, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 308, 398, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 309, 399, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 323, 400, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 308, 401, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 308, 402, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 308, 403, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 309, 404, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 323, 405, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 308, 240, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 308, 241, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 308, 242, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 309, 406, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 323, 407, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 310, 408, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 310, 409, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 310, 410, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 318, 411, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 324, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 8, 310, 250, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 9, 310, 251, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 10, 310, 252, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 11, 318, 413, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 1xaa @ RbPlus + { 3, 7, 324, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 1xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_2xaa_RBPLUS_PATINFO[] = +{ + { 2, 13, 357, 415, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 2, 14, 349, 195, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 358, 263, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 350, 416, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 359, 417, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 360, 415, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 354, 195, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 354, 263, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 361, 418, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 356, 419, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 281, 262, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 282, 263, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 317, 264, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 284, 265, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 286, 420, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 287, 376, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 287, 421, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 289, 422, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 289, 423, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 291, 268, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 292, 205, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 292, 269, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 293, 270, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 294, 271, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 296, 420, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 297, 376, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 297, 421, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 298, 424, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 299, 423, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 300, 425, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 300, 426, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 300, 427, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 362, 428, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 363, 429, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 302, 430, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 303, 386, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 303, 431, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 305, 432, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 364, 433, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 306, 380, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 306, 381, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 306, 434, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 307, 435, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 365, 435, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 306, 402, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 306, 403, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 306, 436, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 307, 405, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 365, 405, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 308, 397, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 308, 398, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 308, 437, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 323, 438, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 366, 438, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 308, 402, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 308, 403, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 308, 436, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 323, 439, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 366, 439, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 308, 440, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 308, 242, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 308, 441, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 323, 442, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 366, 442, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 310, 443, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 310, 410, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 310, 444, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 324, 412, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 367, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 13, 310, 445, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 14, 310, 252, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 15, 310, 446, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 16, 324, 414, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 2xaa @ RbPlus + { 3, 17, 367, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 2xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_4xaa_RBPLUS_PATINFO[] = +{ + { 2, 18, 349, 195, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 349, 447, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 349, 448, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 350, 449, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 351, 450, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 354, 195, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 368, 451, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 354, 299, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 355, 452, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 356, 453, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 282, 298, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 282, 299, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 283, 300, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 284, 301, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 287, 372, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 287, 454, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 287, 455, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 288, 456, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 331, 457, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 292, 205, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 292, 306, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 292, 307, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 320, 308, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 321, 309, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 297, 376, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 297, 458, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 297, 459, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 299, 460, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 369, 461, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 300, 381, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 300, 462, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 300, 463, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 363, 464, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 370, 465, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 303, 386, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 303, 466, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 303, 467, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 371, 468, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 337, 469, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 306, 381, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 306, 462, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 306, 470, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 372, 470, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 373, 470, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 306, 393, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 306, 471, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 306, 472, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 372, 472, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 373, 472, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 308, 398, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 308, 473, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 308, 438, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 374, 438, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 375, 438, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 308, 403, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 308, 471, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 308, 439, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 374, 439, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 375, 439, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 308, 242, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 308, 441, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 308, 442, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 374, 442, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 375, 442, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 310, 410, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 310, 474, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 310, 412, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 376, 412, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 377, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 18, 310, 252, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 19, 310, 475, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 20, 310, 414, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 21, 376, 414, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 4xaa @ RbPlus + { 3, 22, 377, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 4xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_Z_X_8xaa_RBPLUS_PATINFO[] = +{ + { 3, 23, 358, 263, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 349, 448, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 358, 332, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 350, 476, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 359, 477, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 354, 263, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 354, 299, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 354, 332, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 361, 478, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 378, 479, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 282, 263, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 282, 299, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 282, 332, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 317, 333, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 329, 334, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 287, 421, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 287, 480, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 287, 481, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 379, 482, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 380, 483, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 292, 269, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 292, 307, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 292, 339, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 332, 340, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 333, 341, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 297, 421, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 297, 459, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 297, 481, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 381, 484, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 382, 485, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 300, 434, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 300, 463, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 383, 486, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 384, 487, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 385, 488, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 303, 431, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 303, 467, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 303, 489, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 337, 469, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 386, 469, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 306, 434, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 306, 470, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 387, 490, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 373, 470, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 388, 470, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 306, 436, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 306, 472, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 387, 491, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 373, 472, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 388, 492, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 308, 437, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 308, 438, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 389, 493, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 375, 438, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 390, 438, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 308, 436, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 308, 439, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 391, 494, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 375, 439, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 390, 439, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 308, 441, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 308, 442, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 391, 495, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 375, 442, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 390, 442, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 310, 444, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 310, 412, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 392, 496, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 377, 412, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 393, 412, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 23, 310, 446, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 24, 310, 414, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 25, 367, 414, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 26, 377, 414, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_Z_X 8xaa @ RbPlus + { 3, 27, 393, 414, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_Z_X 8xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S3_RBPLUS_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus + { 1, 29, 131, 148, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S3 @ RbPlus + { 1, 30, 132, 149, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S3 @ RbPlus + { 1, 31, 133, 150, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S3 @ RbPlus + { 1, 32, 134, 151, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S3 @ RbPlus + { 1, 33, 135, 152, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S3 @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S3_X_RBPLUS_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 1, 30, 132, 149, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 1, 31, 133, 150, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 1, 32, 134, 151, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 1, 33, 135, 152, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 136, 148, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 137, 149, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 138, 150, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 139, 151, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 140, 152, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 141, 148, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 142, 149, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 143, 150, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 144, 151, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 145, 152, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 146, 148, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 147, 149, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 148, 150, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 149, 151, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 150, 152, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 141, 148, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 142, 149, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 143, 150, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 144, 151, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 145, 152, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 146, 148, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 147, 149, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 148, 150, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 149, 151, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 150, 152, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 151, 148, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 152, 149, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 153, 150, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 154, 151, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 155, 152, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 146, 148, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 147, 149, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 148, 150, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 149, 151, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 150, 152, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 151, 148, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 152, 149, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 153, 150, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 154, 151, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 155, 152, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 156, 153, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 157, 154, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 158, 155, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 159, 156, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 160, 157, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 151, 148, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 152, 149, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 153, 150, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 154, 151, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 155, 152, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 156, 153, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 157, 154, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 158, 155, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 159, 156, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 160, 157, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 161, 158, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 162, 159, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 163, 160, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 164, 161, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 165, 162, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 156, 153, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 157, 154, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 158, 155, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 159, 156, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 160, 157, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus + { 3, 29, 161, 158, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S3_X @ RbPlus + { 3, 30, 162, 159, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S3_X @ RbPlus + { 3, 31, 163, 160, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S3_X @ RbPlus + { 3, 32, 164, 161, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S3_X @ RbPlus + { 3, 33, 165, 162, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S3_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_S3_T_RBPLUS_PATINFO[] = +{ + { 1, 29, 131, 148, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 1, 30, 132, 149, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 1, 31, 133, 150, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 1, 32, 134, 151, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 1, 33, 135, 152, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 136, 148, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 137, 149, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 138, 150, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 139, 151, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 140, 152, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 141, 148, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 142, 149, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 143, 150, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 144, 151, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 145, 152, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 166, 148, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 167, 149, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 168, 150, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 169, 151, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 170, 152, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 141, 148, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 142, 149, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 143, 150, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 144, 151, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 145, 152, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 166, 148, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 167, 149, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 168, 150, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 169, 151, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 170, 152, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 171, 148, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 172, 149, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 173, 150, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 174, 151, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 175, 152, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 166, 148, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 167, 149, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 168, 150, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 169, 151, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 170, 152, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 171, 148, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 172, 149, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 173, 150, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 174, 151, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 175, 152, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 176, 153, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 177, 154, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 178, 155, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 179, 156, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 180, 157, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 171, 148, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 172, 149, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 173, 150, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 174, 151, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 175, 152, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 176, 153, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 177, 154, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 178, 155, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 179, 156, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 180, 157, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 131, 163, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 132, 164, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 133, 165, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 134, 166, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 135, 167, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 176, 153, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 177, 154, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 178, 155, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 179, 156, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 180, 157, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus + { 3, 29, 131, 163, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_S3_T @ RbPlus + { 3, 30, 132, 164, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_S3_T @ RbPlus + { 3, 31, 133, 165, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_S3_T @ RbPlus + { 3, 32, 134, 166, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_S3_T @ RbPlus + { 3, 33, 135, 167, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_S3_T @ RbPlus +}; + +const ADDR_SW_PATINFO SW_64K_D3_X_RBPLUS_PATINFO[] = +{ + { 1, 34, 131, 148, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 1, 35, 132, 149, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 1, 36, 133, 150, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 1, 37, 134, 151, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 1, 38, 135, 152, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 2, 34, 459, 170, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 2, 35, 459, 801, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 2, 36, 460, 802, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 2, 37, 461, 152, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 3, 38, 462, 152, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 463, 803, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 463, 804, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 464, 805, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 465, 806, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 466, 806, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 467, 803, 0, } , // 8 pipes (2 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 467, 804, 0, } , // 8 pipes (2 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 468, 805, 0, } , // 8 pipes (2 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 469, 806, 0, } , // 8 pipes (2 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 470, 806, 0, } , // 8 pipes (2 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 471, 807, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 472, 808, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 473, 809, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 474, 810, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 475, 811, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 476, 812, 0, } , // 8 pipes (4 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 477, 804, 0, } , // 8 pipes (4 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 478, 805, 0, } , // 8 pipes (4 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 479, 806, 0, } , // 8 pipes (4 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 480, 806, 0, } , // 8 pipes (4 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 481, 813, 0, } , // 16 pipes (4 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 482, 804, 0, } , // 16 pipes (4 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 483, 805, 0, } , // 16 pipes (4 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 484, 806, 0, } , // 16 pipes (4 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 485, 806, 0, } , // 16 pipes (4 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 486, 814, 0, } , // 8 pipes (8 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 486, 815, 0, } , // 8 pipes (8 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 486, 816, 0, } , // 8 pipes (8 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 487, 817, 0, } , // 8 pipes (8 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 488, 817, 0, } , // 8 pipes (8 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 489, 812, 0, } , // 16 pipes (8 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 490, 804, 0, } , // 16 pipes (8 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 491, 805, 0, } , // 16 pipes (8 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 492, 806, 0, } , // 16 pipes (8 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 493, 806, 0, } , // 16 pipes (8 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 489, 818, 0, } , // 32 pipes (8 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 494, 819, 0, } , // 32 pipes (8 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 494, 820, 0, } , // 32 pipes (8 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 495, 821, 0, } , // 32 pipes (8 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 496, 821, 0, } , // 32 pipes (8 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 497, 822, 0, } , // 16 pipes (16 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 498, 823, 0, } , // 16 pipes (16 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 499, 824, 0, } , // 16 pipes (16 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 500, 825, 0, } , // 16 pipes (16 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 501, 825, 0, } , // 16 pipes (16 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 497, 826, 0, } , // 32 pipes (16 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 498, 827, 0, } , // 32 pipes (16 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 499, 828, 0, } , // 32 pipes (16 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 500, 829, 0, } , // 32 pipes (16 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 501, 829, 0, } , // 32 pipes (16 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 497, 830, 0, } , // 64 pipes (16 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 502, 831, 0, } , // 64 pipes (16 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 502, 832, 0, } , // 64 pipes (16 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 503, 833, 0, } , // 64 pipes (16 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 504, 833, 0, } , // 64 pipes (16 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 505, 834, 0, } , // 32 pipes (32 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 506, 835, 0, } , // 32 pipes (32 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 507, 836, 0, } , // 32 pipes (32 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 508, 837, 0, } , // 32 pipes (32 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 509, 837, 0, } , // 32 pipes (32 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus + { 3, 34, 505, 838, 0, } , // 64 pipes (32 PKRs) 1 bpe @ SW_64K_D3_X @ RbPlus + { 3, 35, 506, 839, 0, } , // 64 pipes (32 PKRs) 2 bpe @ SW_64K_D3_X @ RbPlus + { 3, 36, 507, 840, 0, } , // 64 pipes (32 PKRs) 4 bpe @ SW_64K_D3_X @ RbPlus + { 4, 37, 508, 841, 0, } , // 64 pipes (32 PKRs) 8 bpe @ SW_64K_D3_X @ RbPlus + { 4, 38, 509, 841, 0, } , // 64 pipes (32 PKRs) 16 bpe @ SW_64K_D3_X @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_R_X_1xaa_RBPLUS_PATINFO[] = +{ + { 2, 0, 270, 183, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 2, 1, 271, 184, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 2, 39, 272, 185, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 2, 6, 273, 186, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 2, 7, 274, 187, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 275, 188, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 276, 189, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 277, 190, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 278, 191, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 279, 192, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 280, 193, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 281, 194, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 283, 196, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 284, 197, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 394, 198, 1, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 395, 199, 2, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 396, 200, 3, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 397, 201, 4, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 398, 202, 5, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 290, 203, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 291, 204, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 292, 205, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 293, 206, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 294, 207, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 295, 208, 6, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 296, 209, 2, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 297, 210, 7, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 298, 211, 4, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 299, 212, 8, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 399, 213, 9, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 399, 214, 10, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 399, 215, 11, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 399, 216, 12, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 399, 217, 13, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 400, 218, 15, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 401, 219, 15, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 402, 220, 15, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 304, 221, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 305, 222, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 307, 213, 9, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 307, 223, 16, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 307, 215, 11, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 307, 216, 17, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 307, 224, 13, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 307, 497, 18, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 307, 498, 19, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 307, 499, 20, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 307, 500, 21, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 307, 501, 22, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 309, 230, 125, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 309, 231, 126, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 309, 232, 127, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 309, 233, 26, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 309, 234, 27, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 309, 502, 28, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 309, 503, 19, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 309, 504, 29, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 309, 238, 30, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 309, 239, 31, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 309, 505, 32, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 309, 506, 33, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 309, 507, 34, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 309, 508, 35, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 309, 509, 36, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 311, 510, 128, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 311, 511, 129, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 311, 512, 130, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 311, 248, 40, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 311, 249, 41, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 0, 311, 513, 32, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 1, 311, 514, 42, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 39, 311, 515, 34, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 6, 311, 253, 43, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 1xaa @ RbPlus + { 3, 7, 311, 254, 44, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 1xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_R_X_2xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 403, 516, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 271, 517, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 313, 518, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 273, 519, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 314, 520, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 404, 521, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 276, 522, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 315, 523, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 278, 524, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 316, 525, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 280, 526, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 281, 527, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 282, 528, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 283, 529, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 284, 530, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 394, 208, 131, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 395, 531, 132, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 396, 302, 133, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 397, 532, 134, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 398, 533, 135, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 290, 534, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 291, 535, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 292, 536, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 293, 537, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 294, 538, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 295, 208, 131, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 296, 209, 132, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 297, 210, 133, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 298, 211, 134, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 299, 212, 135, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 399, 539, 136, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 399, 214, 137, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 399, 280, 138, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 399, 216, 139, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 399, 224, 140, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 400, 540, 15, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 401, 541, 15, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 402, 542, 15, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 304, 543, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 305, 544, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 307, 539, 136, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 307, 214, 137, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 307, 280, 138, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 307, 216, 139, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 307, 224, 140, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 307, 545, 141, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 307, 498, 142, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 307, 546, 143, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 307, 500, 144, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 307, 547, 145, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 309, 548, 146, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 309, 231, 147, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 309, 285, 148, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 309, 233, 149, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 309, 286, 150, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 309, 502, 141, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 309, 503, 151, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 309, 504, 143, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 309, 238, 152, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 309, 239, 153, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 309, 505, 154, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 309, 506, 155, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 309, 507, 156, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 309, 508, 157, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 309, 509, 158, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 318, 549, 159, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 318, 550, 160, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 318, 551, 161, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 318, 287, 162, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 318, 288, 163, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 0, 318, 552, 154, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 1, 318, 553, 155, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 39, 318, 554, 156, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 6, 318, 555, 157, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 2xaa @ RbPlus + { 3, 7, 318, 290, 158, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 2xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_R_X_4xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 270, 556, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 271, 557, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 272, 558, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 273, 559, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 274, 560, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 275, 561, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 276, 562, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 277, 563, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 278, 564, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 279, 565, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 280, 566, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 281, 567, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 282, 568, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 283, 569, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 284, 570, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 394, 571, 164, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 395, 572, 165, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 396, 573, 166, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 397, 574, 167, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 398, 575, 168, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 290, 576, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 291, 577, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 292, 578, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 293, 579, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 405, 580, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 295, 581, 169, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 296, 582, 165, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 297, 583, 170, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 298, 584, 167, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 299, 585, 168, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 399, 213, 171, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 399, 214, 172, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 399, 215, 173, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 399, 216, 174, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 399, 217, 175, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 400, 586, 15, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 401, 587, 15, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 402, 588, 15, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 304, 589, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 406, 544, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 307, 213, 171, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 307, 223, 176, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 307, 215, 173, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 307, 216, 177, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 307, 224, 175, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 307, 497, 178, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 307, 498, 179, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 307, 499, 180, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 307, 500, 181, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 307, 501, 182, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 323, 590, 183, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 323, 591, 184, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 323, 592, 185, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 323, 593, 186, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 323, 286, 187, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 323, 594, 188, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 323, 595, 179, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 323, 596, 189, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 323, 321, 190, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 323, 322, 191, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 323, 597, 192, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 323, 598, 193, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 323, 599, 194, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 323, 600, 195, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 323, 601, 196, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 324, 602, 197, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 324, 603, 198, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 324, 604, 199, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 324, 605, 200, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 324, 606, 201, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 0, 324, 607, 192, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 1, 324, 608, 202, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 39, 324, 609, 194, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 6, 324, 327, 203, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 4xaa @ RbPlus + { 3, 7, 324, 328, 204, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 4xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_R_X_8xaa_RBPLUS_PATINFO[] = +{ + { 3, 0, 407, 610, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 408, 611, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 409, 612, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 410, 613, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 411, 614, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 404, 615, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 276, 616, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 315, 617, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 278, 618, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 412, 565, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 280, 619, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 281, 620, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 282, 621, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 283, 622, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 413, 623, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 394, 624, 205, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 395, 625, 206, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 396, 626, 207, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 397, 627, 208, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 414, 628, 209, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 415, 629, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 291, 630, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 292, 631, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 416, 632, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 417, 580, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 295, 624, 205, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 296, 633, 206, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 297, 634, 207, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 298, 627, 208, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 418, 635, 210, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 399, 636, 211, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 399, 637, 212, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 399, 638, 213, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 399, 639, 214, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 419, 640, 215, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 301, 641, 216, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 302, 642, 216, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 303, 643, 216, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 420, 589, 105, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 421, 544, 217, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 339, 636, 211, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 339, 637, 212, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 339, 638, 213, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 339, 639, 214, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 422, 224, 175, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 339, 545, 218, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 339, 498, 219, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 339, 546, 220, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 339, 500, 221, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 339, 644, 222, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 343, 645, 223, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 343, 646, 224, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 343, 647, 225, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 341, 648, 226, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 423, 286, 187, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 343, 649, 218, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 343, 650, 227, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 343, 651, 220, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 343, 652, 221, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 341, 653, 228, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 343, 654, 229, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 343, 655, 230, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 343, 656, 231, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 343, 657, 232, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 343, 658, 233, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 346, 659, 234, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 346, 660, 235, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 346, 661, 236, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 344, 662, 237, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 345, 663, 238, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 0, 346, 664, 229, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 1, 346, 665, 230, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 39, 346, 666, 231, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 6, 346, 667, 232, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_R_X 8xaa @ RbPlus + { 3, 7, 344, 668, 204, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_R_X 8xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_Z_X_1xaa_RBPLUS_PATINFO[] = +{ + { 2, 8, 270, 183, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 2, 9, 271, 184, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 2, 10, 272, 185, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 2, 11, 273, 186, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 2, 7, 274, 187, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 275, 188, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 276, 189, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 277, 190, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 278, 191, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 279, 192, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 280, 193, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 281, 194, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 283, 196, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 284, 197, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 285, 198, 1, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 286, 199, 2, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 287, 200, 3, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 288, 201, 4, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 289, 202, 5, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 290, 203, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 291, 204, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 292, 205, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 293, 206, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 294, 207, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 295, 208, 6, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 296, 209, 2, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 297, 210, 7, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 298, 211, 4, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 299, 212, 8, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 300, 213, 9, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 300, 214, 10, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 300, 215, 11, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 300, 216, 12, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 300, 217, 13, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 301, 218, 14, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 302, 219, 14, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 303, 220, 14, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 304, 221, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 305, 222, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 306, 213, 9, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 306, 223, 16, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 306, 215, 11, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 307, 216, 17, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 307, 224, 13, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 306, 225, 18, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 306, 226, 19, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 306, 227, 20, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 307, 228, 21, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 307, 229, 22, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 308, 230, 23, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 308, 231, 24, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 308, 232, 25, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 309, 233, 26, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 309, 234, 27, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 308, 235, 28, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 308, 236, 19, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 308, 237, 29, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 309, 238, 30, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 309, 239, 31, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 308, 240, 32, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 308, 241, 33, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 308, 242, 34, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 309, 243, 35, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 309, 244, 36, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 310, 245, 37, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 310, 246, 38, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 310, 247, 39, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 311, 248, 40, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 311, 249, 41, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 8, 310, 250, 32, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 9, 310, 251, 42, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 10, 310, 252, 34, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 11, 311, 253, 43, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 1xaa @ RbPlus + { 3, 7, 311, 254, 44, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 1xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_Z_X_2xaa_RBPLUS_PATINFO[] = +{ + { 2, 13, 312, 255, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 2, 14, 272, 185, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 313, 256, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 273, 257, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 314, 258, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 276, 189, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 277, 190, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 315, 259, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 278, 260, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 316, 261, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 281, 262, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 282, 263, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 317, 264, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 284, 265, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 286, 209, 2, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 287, 266, 3, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 287, 210, 45, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 288, 211, 46, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 289, 267, 47, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 291, 268, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 292, 205, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 292, 269, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 293, 270, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 294, 271, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 296, 209, 2, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 297, 210, 7, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 297, 210, 45, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 298, 211, 46, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 299, 212, 47, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 300, 272, 48, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 300, 273, 11, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 300, 273, 49, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 300, 274, 50, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 300, 275, 51, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 302, 219, 14, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 303, 220, 14, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 303, 276, 14, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 304, 277, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 305, 278, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 306, 279, 48, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 306, 215, 11, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 306, 280, 49, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 307, 281, 52, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 307, 224, 53, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 306, 236, 19, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 306, 237, 54, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 306, 237, 55, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 307, 282, 56, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 307, 283, 57, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 308, 284, 24, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 308, 232, 25, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 308, 285, 58, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 309, 233, 59, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 309, 286, 60, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 308, 236, 19, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 308, 237, 29, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 308, 237, 55, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 309, 238, 56, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 309, 239, 61, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 308, 241, 62, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 308, 242, 34, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 308, 242, 63, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 309, 243, 64, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 309, 244, 65, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 310, 246, 38, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 310, 247, 39, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 310, 247, 66, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 318, 287, 67, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 318, 288, 68, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 13, 310, 251, 62, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 14, 310, 252, 34, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 15, 310, 252, 63, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 16, 318, 289, 69, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 2xaa @ RbPlus + { 3, 17, 318, 290, 65, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 2xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_Z_X_4xaa_RBPLUS_PATINFO[] = +{ + { 2, 18, 272, 185, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 272, 291, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 272, 292, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 273, 293, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 274, 294, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 277, 190, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 315, 259, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 277, 295, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 319, 296, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 279, 297, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 282, 195, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 282, 298, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 282, 299, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 283, 300, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 284, 301, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 287, 200, 3, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 287, 302, 45, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 287, 303, 70, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 289, 304, 71, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 289, 305, 72, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 292, 205, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 292, 306, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 292, 307, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 320, 308, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 321, 309, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 297, 210, 7, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 297, 210, 45, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 297, 310, 45, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 298, 311, 71, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 299, 312, 47, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 300, 215, 11, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 300, 215, 73, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 300, 215, 74, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 300, 216, 75, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 300, 217, 76, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 303, 220, 14, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 303, 276, 14, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 303, 313, 14, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 305, 314, 15, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 322, 315, 15, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 306, 215, 11, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 306, 232, 77, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 306, 215, 78, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 307, 216, 79, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 307, 224, 80, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 306, 227, 20, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 306, 316, 55, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 306, 227, 81, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 307, 317, 82, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 307, 229, 83, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 308, 232, 25, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 308, 232, 84, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 308, 318, 84, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 323, 319, 85, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 323, 320, 86, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 308, 237, 29, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 308, 237, 55, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 308, 237, 87, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 323, 321, 88, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 323, 322, 89, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 308, 242, 34, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 308, 242, 90, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 308, 242, 91, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 323, 323, 92, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 323, 324, 93, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 310, 247, 39, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 310, 247, 66, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 310, 247, 94, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 324, 325, 95, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 324, 326, 96, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 18, 310, 252, 34, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 19, 310, 252, 97, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 20, 310, 252, 98, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 21, 324, 327, 99, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 4xaa @ RbPlus + { 3, 22, 324, 328, 100, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 4xaa @ RbPlus +}; + +const ADDR_SW_PATINFO SW_VAR_Z_X_8xaa_RBPLUS_PATINFO[] = +{ + { 3, 23, 313, 256, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 272, 292, 0, } , // 1 pipes (1 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 325, 292, 0, } , // 1 pipes (1 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 326, 329, 0, } , // 1 pipes (1 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 327, 294, 0, } , // 1 pipes (1 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 315, 259, 0, } , // 2 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 277, 295, 0, } , // 2 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 315, 330, 0, } , // 2 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 278, 331, 0, } , // 2 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 328, 331, 0, } , // 2 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 282, 263, 0, } , // 4 pipes (1-2 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 282, 299, 0, } , // 4 pipes (1-2 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 282, 332, 0, } , // 4 pipes (1-2 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 317, 333, 0, } , // 4 pipes (1-2 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 329, 334, 0, } , // 4 pipes (1-2 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 287, 210, 45, } , // 8 pipes (2 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 287, 335, 70, } , // 8 pipes (2 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 287, 336, 70, } , // 8 pipes (2 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 330, 337, 72, } , // 8 pipes (2 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 331, 338, 101, } , // 8 pipes (2 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 292, 269, 0, } , // 4 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 292, 307, 0, } , // 4 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 292, 339, 0, } , // 4 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 332, 340, 0, } , // 4 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 333, 341, 0, } , // 4 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 297, 210, 45, } , // 8 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 297, 310, 45, } , // 8 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 297, 342, 45, } , // 8 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 299, 343, 102, } , // 8 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 334, 344, 103, } , // 8 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 300, 273, 49, } , // 16 pipes (4 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 300, 273, 74, } , // 16 pipes (4 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 300, 345, 74, } , // 16 pipes (4 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 335, 346, 76, } , // 16 pipes (4 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 336, 286, 104, } , // 16 pipes (4 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 303, 276, 14, } , // 8 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 303, 313, 14, } , // 8 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 303, 347, 14, } , // 8 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 337, 348, 105, } , // 8 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 338, 349, 106, } , // 8 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 306, 280, 49, } , // 16 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 306, 215, 78, } , // 16 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 306, 350, 74, } , // 16 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 339, 351, 107, } , // 16 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 340, 351, 108, } , // 16 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 306, 237, 55, } , // 32 pipes (8 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 306, 237, 109, } , // 32 pipes (8 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 306, 237, 110, } , // 32 pipes (8 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 339, 352, 111, } , // 32 pipes (8 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 339, 353, 112, } , // 32 pipes (8 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 308, 285, 58, } , // 16 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 308, 318, 84, } , // 16 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 308, 354, 84, } , // 16 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 341, 355, 113, } , // 16 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 342, 356, 114, } , // 16 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 308, 237, 55, } , // 32 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 308, 237, 87, } , // 32 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 308, 237, 115, } , // 32 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 343, 357, 116, } , // 32 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 341, 358, 117, } , // 32 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 308, 242, 63, } , // 64 pipes (16 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 308, 242, 91, } , // 64 pipes (16 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 308, 242, 118, } , // 64 pipes (16 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 343, 359, 119, } , // 64 pipes (16 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 343, 360, 120, } , // 64 pipes (16 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 310, 247, 66, } , // 32 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 310, 247, 94, } , // 32 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 310, 361, 94, } , // 32 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 344, 362, 121, } , // 32 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 345, 363, 122, } , // 32 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 23, 310, 252, 63, } , // 64 pipes (32 PKRs) 1 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 24, 310, 252, 98, } , // 64 pipes (32 PKRs) 2 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 25, 310, 252, 118, } , // 64 pipes (32 PKRs) 4 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 26, 346, 364, 123, } , // 64 pipes (32 PKRs) 8 bpe @ SW_VAR_Z_X 8xaa @ RbPlus + { 3, 27, 344, 365, 124, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus +}; + +const UINT_64 GFX10_SW_PATTERN_NIBBLE01[][8] = +{ + {X0, X1, X2, X3, Y0, Y1, Y2, Y3, }, // 0 + {0, X0, X1, X2, Y0, Y1, Y2, X3, }, // 1 + {0, 0, X0, X1, Y0, Y1, Y2, X2, }, // 2 + {0, 0, 0, X0, Y0, Y1, X1, X2, }, // 3 + {0, 0, 0, 0, Y0, Y1, X0, X1, }, // 4 + {X0, X1, X2, Y1, Y0, Y2, X3, Y3, }, // 5 + {0, 0, 0, X0, Y0, X1, X2, Y1, }, // 6 + {0, 0, 0, 0, X0, Y0, X1, Y1, }, // 7 + {X0, Y0, X1, Y1, X2, Y2, X3, Y3, }, // 8 + {0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 9 + {0, 0, X0, Y0, X1, Y1, X2, Y2, }, // 10 + {0, 0, 0, X0, Y0, X1, Y1, X2, }, // 11 + {X0, Y0, X1, Y1, X2, Y2, X3, Y4, }, // 12 + {S0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 13 + {0, S0, X0, Y0, X1, Y1, X2, Y2, }, // 14 + {0, 0, S0, X0, Y0, X1, Y1, X2, }, // 15 + {0, 0, 0, S0, X0, Y0, X1, Y1, }, // 16 + {0, 0, 0, 0, S0, X0, Y0, X1, }, // 17 + {S0, S1, X0, Y0, X1, Y1, X2, Y2, }, // 18 + {0, S0, S1, X0, Y0, X1, Y1, X2, }, // 19 + {0, 0, S0, S1, X0, Y0, X1, Y1, }, // 20 + {0, 0, 0, S0, S1, X0, Y0, X1, }, // 21 + {0, 0, 0, 0, S0, S1, X0, Y0, }, // 22 + {S0, S1, S2, X0, Y0, X1, Y1, X2, }, // 23 + {0, S0, S1, S2, X0, Y0, X1, Y1, }, // 24 + {0, 0, S0, S1, S2, X0, Y0, X1, }, // 25 + {0, 0, 0, S0, S1, S2, X0, Y0, }, // 26 + {0, 0, 0, 0, S0, S1, S2, X0, }, // 27 + {X0, X1, X2, Y1, Y0, Y2, X3, Y4, }, // 28 + {X0, X1, Z0, Y0, Z1, Y1, X2, Z2, }, // 29 + {0, X0, Z0, Y0, Z1, Y1, X1, Z2, }, // 30 + {0, 0, X0, Y0, Z0, Y1, X1, Z1, }, // 31 + {0, 0, 0, X0, Z0, Y0, X1, Z1, }, // 32 + {0, 0, 0, 0, Z0, Y0, X0, Z1, }, // 33 + {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, }, // 34 + {0, X0, Z0, Y0, X1, Z1, Y1, Z2, }, // 35 + {0, 0, X0, Y0, X1, Z0, Y1, Z1, }, // 36 + {0, 0, 0, X0, Y0, Z0, X1, Z1, }, // 37 + {0, 0, 0, 0, X0, Z0, Y0, Z1, }, // 38 + {0, 0, X0, X1, Y0, Y1, X2, Y2, }, // 39 +}; + +const UINT_64 GFX10_SW_PATTERN_NIBBLE2[][4] = +{ + {0, 0, 0, 0, }, // 0 + {Y4, X4, Y5, X5, }, // 1 + {Y3, X4, Y4, X5, }, // 2 + {Y3, X3, Y4, X4, }, // 3 + {Y2, X3, Y3, X4, }, // 4 + {Y2, X2, Y3, X3, }, // 5 + {Z0^X4^Y4, X4, Y5, X5, }, // 6 + {Z0^Y3^X4, X4, Y4, X5, }, // 7 + {Z0^X3^Y3, X3, Y4, X4, }, // 8 + {Z0^Y2^X3, X3, Y3, X4, }, // 9 + {Z0^X2^Y2, X2, Y3, X3, }, // 10 + {Z1^Y4^X5, Z0^X4^Y5, Y5, X5, }, // 11 + {Z1^Y3^X5, Z0^X4^Y4, Y4, X5, }, // 12 + {Z1^Y3^X4, Z0^X3^Y4, Y4, X4, }, // 13 + {Z1^Y2^X4, Z0^X3^Y3, Y3, X4, }, // 14 + {Z1^Y2^X3, Z0^X2^Y3, Y3, X3, }, // 15 + {Z2^Y4^X6, Z1^X4^Y6, Z0^X5^Y5, X5, }, // 16 + {Z2^Y3^X6, Z1^X4^Y5, Z0^Y4^X5, X5, }, // 17 + {Z2^Y3^X5, Z1^X3^Y5, Z0^X4^Y4, X4, }, // 18 + {Y2^Z2^X5, Z1^X3^Y4, Z0^Y3^X4, X4, }, // 19 + {Y2^Z2^X4, Z1^X2^Y4, Z0^X3^Y3, X3, }, // 20 + {Z3^Y4^X7, Z2^X4^Y7, Z1^Y5^X6, Z0^X5^Y6, }, // 21 + {Y3^Z3^X7, Z2^X4^Y6, Z1^Y4^X6, Z0^X5^Y5, }, // 22 + {Y3^Z3^X6, Z2^X3^Y6, Z1^Y4^X5, Z0^X4^Y5, }, // 23 + {Y2^Z3^X6, Z2^X3^Y5, Z1^Y3^X5, Z0^X4^Y4, }, // 24 + {Y2^Z3^X5, X2^Z2^Y5, Z1^Y3^X4, Z0^X3^Y4, }, // 25 + {Y4^Z4^X8, Z3^X4^Y8, Z2^Y5^X7, Z1^X5^Y7, }, // 26 + {Y3^Z4^X8, Z3^X4^Y7, Z2^Y4^X7, Z1^X5^Y6, }, // 27 + {Y3^Z4^X7, X3^Z3^Y7, Z2^Y4^X6, Z1^X4^Y6, }, // 28 + {Y2^Z4^X7, X3^Z3^Y6, Z2^Y3^X6, Z1^X4^Y5, }, // 29 + {Y2^Z4^X6, X2^Z3^Y6, Z2^Y3^X5, Z1^X3^Y5, }, // 30 + {Y4^Z5^X9, X4^Z4^Y9, Z3^Y5^X8, Z2^X5^Y8, }, // 31 + {Y3^Z5^X9, X4^Z4^Y8, Z3^Y4^X8, Z2^X5^Y7, }, // 32 + {Y3^Z5^X8, X3^Z4^Y8, Z3^Y4^X7, Z2^X4^Y7, }, // 33 + {Y2^Z5^X8, X3^Z4^Y7, Y3^Z3^X7, Z2^X4^Y6, }, // 34 + {Y2^Z5^X7, X2^Z4^Y7, Y3^Z3^X6, Z2^X3^Y6, }, // 35 + {X4^Y4, X4, Y5, X5, }, // 36 + {Y3^X4, X4, Y4, X5, }, // 37 + {X3^Y3, X3, Y4, X4, }, // 38 + {Y2^X3, X3, Y3, X4, }, // 39 + {X2^Y2, X2, Y3, X3, }, // 40 + {Y4^X5, X4^Y5, Y5, X5, }, // 41 + {Y3^X5, X4^Y4, Y4, X5, }, // 42 + {Y3^X4, X3^Y4, Y4, X4, }, // 43 + {Y2^X4, X3^Y3, Y3, X4, }, // 44 + {Y2^X3, X2^Y3, Y3, X3, }, // 45 + {Y4^X6, X4^Y6, X5^Y5, X5, }, // 46 + {Y3^X6, X4^Y5, Y4^X5, X5, }, // 47 + {Y3^X5, X3^Y5, X4^Y4, X4, }, // 48 + {Y2^X5, X3^Y4, Y3^X4, X4, }, // 49 + {Y2^X4, X2^Y4, X3^Y3, X3, }, // 50 + {Y4^X7, X4^Y7, Y5^X6, X5^Y6, }, // 51 + {Y3^X7, X4^Y6, Y4^X6, X5^Y5, }, // 52 + {Y3^X6, X3^Y6, Y4^X5, X4^Y5, }, // 53 + {Y2^X6, X3^Y5, Y3^X5, X4^Y4, }, // 54 + {Y2^X5, X2^Y5, Y3^X4, X3^Y4, }, // 55 + {Y4, X4, Y5^X7, X5^Y7, }, // 56 + {Y3, X4, Y4^X7, X5^Y6, }, // 57 + {Y3, X3, Y4^X6, X4^Y6, }, // 58 + {Y2, X3, Y3^X6, X4^Y5, }, // 59 + {Y2, X2, Y3^X5, X3^Y5, }, // 60 + {Z0^X3^Y3, X4, Y5, X5, }, // 61 + {Z0^X3^Y3, X4, Y4, X5, }, // 62 + {Z0^X3^Y3, X3, Y2, X4, }, // 63 + {Z0^X3^Y3, X2, Y2, X3, }, // 64 + {Z1^X3^Y3, Z0^X4^Y4, Y5, X5, }, // 65 + {Z1^X3^Y3, Z0^X4^Y4, Y4, X5, }, // 66 + {Z1^X3^Y3, Z0^X4^Y4, Y3, X4, }, // 67 + {Z1^X3^Y3, Z0^X4^Y4, Y2, X3, }, // 68 + {Z1^X3^Y3, Z0^X4^Y4, Y2, X2, }, // 69 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X5, }, // 70 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X4, }, // 71 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X3, }, // 72 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X2, }, // 73 + {X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, }, // 74 + {X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, }, // 75 + {X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, }, // 76 + {X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, }, // 77 + {X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, }, // 78 + {X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, }, // 79 + {Y3, Y4, X4, Y5, }, // 80 + {X2, Y3, X3, Y4, }, // 81 + {Z0^X3^Y3, Y4, X4, Y5, }, // 82 + {Z0^X3^Y3, X2, X3, Y4, }, // 83 + {Z1^X3^Y3, Z0^X4^Y4, Y4, Y5, }, // 84 + {Z1^X3^Y3, Z0^X4^Y4, X2, Y3, }, // 85 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y4, }, // 86 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X6, Y2^X5^Y6, }, // 87 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, Y2^X5^Y7, }, // 88 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, Y2^X5^Y8, }, // 89 + {X3, Y3, X4, Y4, }, // 90 + {Z0^X3^Y3, X3, X4, Y4, }, // 91 + {Z1^X3^Y3, Z0^X4^Y4, X3, Y4, }, // 92 + {Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y2, }, // 93 + {Z1^X3^Y3, Z0^X4^Y4, Y2^X5^Y5, X2, }, // 94 + {Z2^X3^Y3, Z1^X4^Y4, Y2^Y5^X6, Z0^X5^Y6, }, // 95 + {Z1^X3^Y3, Z0^X4^Y4, Y2^Y5^X6, X1^X5^Y6, }, // 96 + {Z2^X3^Y3, Z1^X4^Y4, Y2^Y5^X7, Z0^X5^Y7, }, // 97 + {Z1^X3^Y3, Z0^X4^Y4, Y2^Y5^X7, X1^X5^Y7, }, // 98 + {Z2^X3^Y3, Z1^X4^Y4, Y2^Y5^X8, Z0^X5^Y8, }, // 99 + {Z1^X3^Y3, Z0^X4^Y4, Y2^Y5^X8, X1^X5^Y8, }, // 100 + {Z0^X3^Y3, Y2, X3, Y4, }, // 101 + {Z1^X3^Y3, Z0^X4^Y4, X2, Y2, }, // 102 + {Z1^X3^Y3, Z0^X4^Y4, Y2^X5^Y5, Y3, }, // 103 + {Z1^X3^Y3, Z0^X4^Y4, Y0^X5^Y5, Y2, }, // 104 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X6, Z3^X5^Y6, }, // 105 + {Z1^X3^Y3, Z0^X4^Y4, Y0^Y5^X6, X1^X5^Y6, }, // 106 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, Z4^X5^Y7, }, // 107 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, Z3^X5^Y7, }, // 108 + {Z1^X3^Y3, Z0^X4^Y4, Y0^Y5^X7, X1^X5^Y7, }, // 109 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, Z4^X5^Y8, }, // 110 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, Z3^X5^Y8, }, // 111 + {Z1^X3^Y3, Z0^X4^Y4, Y0^Y5^X8, X1^X5^Y8, }, // 112 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X6, S0^X5^Y6, }, // 113 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, S0^X5^Y7, }, // 114 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, S0^X5^Y8, }, // 115 + {Z1^X3^Y3, Z0^X4^Y4, S1^X5^Y5, X2, }, // 116 + {Z2^X3^Y3, Z1^X4^Y4, S1^Y5^X6, Z0^X5^Y6, }, // 117 + {Z1^X3^Y3, Z0^X4^Y4, S1^Y5^X6, S0^X5^Y6, }, // 118 + {Z2^X3^Y3, Z1^X4^Y4, S1^Y5^X7, Z0^X5^Y7, }, // 119 + {Z1^X3^Y3, Z0^X4^Y4, S1^Y5^X7, S0^X5^Y7, }, // 120 + {Z2^X3^Y3, Z1^X4^Y4, S1^Y5^X8, Z0^X5^Y8, }, // 121 + {Z1^X3^Y3, Z0^X4^Y4, S1^Y5^X8, S0^X5^Y8, }, // 122 + {Z1^X3^Y3, Z0^X4^Y4, S2^X5^Y5, Y2, }, // 123 + {Z1^X3^Y3, Z0^X4^Y4, S2^X5^Y5, X2, }, // 124 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X6, S2^X5^Y6, }, // 125 + {Z1^X3^Y3, Z0^X4^Y4, S2^Y5^X6, S1^X5^Y6, }, // 126 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, S2^X5^Y7, }, // 127 + {Z1^X3^Y3, Z0^X4^Y4, S2^Y5^X7, S1^X5^Y7, }, // 128 + {Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, S2^X5^Y8, }, // 129 + {Z1^X3^Y3, Z0^X4^Y4, S2^Y5^X8, S1^X5^Y8, }, // 130 + {Y2, X3, Z3, Y3, }, // 131 + {Y2, X2, Z3, Y3, }, // 132 + {Y2, X2, Z2, Y3, }, // 133 + {Y1, X2, Z2, Y2, }, // 134 + {Y1, X1, Z2, Y2, }, // 135 + {Y2^X3^Z3, X3, Z3, Y3, }, // 136 + {X2^Y2^Z3, X2, Z3, Y3, }, // 137 + {X2^Y2^Z2, X2, Z2, Y3, }, // 138 + {Y1^X2^Z2, X2, Z2, Y2, }, // 139 + {X1^Y1^Z2, X1, Z2, Y2, }, // 140 + {Y2^X4^Z4, X3^Y3^Z3, Z3, Y3, }, // 141 + {Y2^X3^Z4, X2^Y3^Z3, Z3, Y3, }, // 142 + {Y2^X3^Z3, X2^Z2^Y3, Z2, Y3, }, // 143 + {Y1^X3^Z3, X2^Y2^Z2, Z2, Y2, }, // 144 + {Y1^X2^Z3, X1^Y2^Z2, Z2, Y2, }, // 145 + {Y2^X5^Z5, X3^Y4^Z4, Y3^Z3^X4, Y3, }, // 146 + {Y2^X4^Z5, X2^Y4^Z4, X3^Y3^Z3, Y3, }, // 147 + {Y2^X4^Z4, X2^Z3^Y4, Z2^X3^Y3, Y3, }, // 148 + {Y1^X4^Z4, X2^Y3^Z3, Y2^Z2^X3, Y2, }, // 149 + {Y1^X3^Z4, X1^Y3^Z3, X2^Y2^Z2, Y2, }, // 150 + {Y2^X6^Z6, X3^Y5^Z5, Z3^Y4^X5, Y3^X4^Z4, }, // 151 + {Y2^X5^Z6, X2^Y5^Z5, Z3^X4^Y4, X3^Y3^Z4, }, // 152 + {Y2^X5^Z5, X2^Z4^Y5, Z2^X4^Y4, X3^Y3^Z3, }, // 153 + {Y1^X5^Z5, X2^Y4^Z4, Z2^Y3^X4, Y2^X3^Z3, }, // 154 + {Y1^X4^Z5, X1^Y4^Z4, Z2^X3^Y3, X2^Y2^Z3, }, // 155 + {Y2^X7^Z7, X3^Y6^Z6, Z3^Y5^X6, Y3^X5^Z5, }, // 156 + {Y2^X6^Z7, X2^Y6^Z6, Z3^X5^Y5, Y3^X4^Z5, }, // 157 + {Y2^X6^Z6, X2^Z5^Y6, Z2^X5^Y5, Y3^X4^Z4, }, // 158 + {Y1^X6^Z6, X2^Y5^Z5, Z2^Y4^X5, Y2^X4^Z4, }, // 159 + {Y1^X5^Z6, X1^Y5^Z5, Z2^X4^Y4, Y2^X3^Z4, }, // 160 + {Y2^X8^Z8, X3^Y7^Z7, Z3^Y6^X7, Y3^X6^Z6, }, // 161 + {Y2^X7^Z8, X2^Y7^Z7, Z3^X6^Y6, Y3^X5^Z6, }, // 162 + {Y2^X7^Z7, X2^Z6^Y7, Z2^X6^Y6, Y3^X5^Z5, }, // 163 + {Y1^X7^Z7, X2^Y6^Z6, Z2^Y5^X6, Y2^X5^Z5, }, // 164 + {Y1^X6^Z7, X1^Y6^Z6, Z2^X5^Y5, Y2^X4^Z5, }, // 165 + {Y2^X5, X3^Y4^Z4, Y3^Z3^X4, Y3, }, // 166 + {Y2^X4, X2^Y4^Z4, X3^Y3^Z3, Y3, }, // 167 + {Y2^X4, X2^Z3^Y4, Z2^X3^Y3, Y3, }, // 168 + {Y1^X4, X2^Y3^Z3, Y2^Z2^X3, Y2, }, // 169 + {Y1^X3, X1^Y3^Z3, X2^Y2^Z2, Y2, }, // 170 + {Y2, X3, Z3^Y4^X5, Y3^X4^Z4, }, // 171 + {Y2, X2, Z3^X4^Y4, X3^Y3^Z4, }, // 172 + {Y2, X2, Z2^X4^Y4, X3^Y3^Z3, }, // 173 + {Y1, X2, Z2^Y3^X4, Y2^X3^Z3, }, // 174 + {Y1, X1, Z2^X3^Y3, X2^Y2^Z3, }, // 175 + {Y2, X3, Z3, Y3^X5, }, // 176 + {Y2, X2, Z3, Y3^X4, }, // 177 + {Y2, X2, Z2, Y3^X4, }, // 178 + {Y1, X2, Z2, Y2^X4, }, // 179 + {Y1, X1, Z2, Y2^X3, }, // 180 + {X3^Y3, X3, Z3, Y2, }, // 181 + {X3^Y3, X2, Z3, Y2, }, // 182 + {X3^Y3, X2, Z2, Y2, }, // 183 + {X3^Y3, X2, Z2, Y1, }, // 184 + {X3^Y3, X1, Z2, Y1, }, // 185 + {X3^Y3, X4^Y4, Z3, Y2, }, // 186 + {X3^Y3, X4^Y4, Z2, Y2, }, // 187 + {X3^Y3, X4^Y4, Z2, Y1, }, // 188 + {X3^Y3, X1^X4^Y4, Z2, Y1, }, // 189 + {X3^Y3, X4^Y4, X5^Y5, Z3, }, // 190 + {X3^Y3, X4^Y4, Z3^X5^Y5, Y2, }, // 191 + {X3^Y3, X4^Y4, Z2^X5^Y5, Y2, }, // 192 + {X3^Y3, X4^Y4, Z2^X5^Y5, Y1, }, // 193 + {X3^Y3, X1^X4^Y4, Z2^X5^Y5, Y1, }, // 194 + {X3^Y3, X4^Y4, Y2^Y5^X6, X5^Y6, }, // 195 + {X3^Y3, X4^Y4, Z3^Y5^X6, Y2^X5^Y6, }, // 196 + {X3^Y3, X4^Y4, Z2^Y5^X6, Y2^X5^Y6, }, // 197 + {X3^Y3, X4^Y4, Z2^Y5^X6, Y1^X5^Y6, }, // 198 + {X3^Y3, X1^X4^Y4, Z2^Y5^X6, Y1^X5^Y6, }, // 199 + {X3^Y3, X4^Y4, Y2^Y5^X7, X5^Y7, }, // 200 + {X3^Y3, X4^Y4, Z3^Y5^X7, Y2^X5^Y7, }, // 201 + {X3^Y3, X4^Y4, Z2^Y5^X7, Y2^X5^Y7, }, // 202 + {X3^Y3, X4^Y4, Z2^Y5^X7, Y1^X5^Y7, }, // 203 + {X3^Y3, X1^X4^Y4, Z2^Y5^X7, Y1^X5^Y7, }, // 204 + {X3^Y3, X4^Y4, Y2^Y5^X8, X5^Y8, }, // 205 + {X3^Y3, X4^Y4, Z3^Y5^X8, Y2^X5^Y8, }, // 206 + {X3^Y3, X4^Y4, Z2^Y5^X8, Y2^X5^Y8, }, // 207 + {X3^Y3, X4^Y4, Z2^Y5^X8, Y1^X5^Y8, }, // 208 + {X3^Y3, X1^X4^Y4, Z2^Y5^X8, Y1^X5^Y8, }, // 209 + {Y4^X5, Z0^X4^Y5, Y5, X5, }, // 210 + {Y3^X5, Z0^X4^Y4, Y4, X5, }, // 211 + {Y3^X4, Z0^X3^Y4, Y4, X4, }, // 212 + {Y2^X4, Z0^X3^Y3, Y3, X4, }, // 213 + {Y2^X3, Z0^X2^Y3, Y3, X3, }, // 214 + {Y4^X6, X4^Y6, Z0^X5^Y5, X5, }, // 215 + {Y3^X6, X4^Y5, Z0^Y4^X5, X5, }, // 216 + {Y3^X5, X3^Y5, Z0^X4^Y4, X4, }, // 217 + {Y2^X5, X3^Y4, Z0^Y3^X4, X4, }, // 218 + {Y2^X4, X2^Y4, Z0^X3^Y3, X3, }, // 219 + {Y4^X6, Z1^X4^Y6, Z0^X5^Y5, X5, }, // 220 + {Y3^X6, Z1^X4^Y5, Z0^Y4^X5, X5, }, // 221 + {Y3^X5, Z1^X3^Y5, Z0^X4^Y4, X4, }, // 222 + {Y2^X5, Z1^X3^Y4, Z0^Y3^X4, X4, }, // 223 + {Y2^X4, Z1^X2^Y4, Z0^X3^Y3, X3, }, // 224 + {Y4^X7, X4^Y7, Z1^Y5^X6, Z0^X5^Y6, }, // 225 + {Y3^X7, X4^Y6, Z1^Y4^X6, Z0^X5^Y5, }, // 226 + {Y3^X6, X3^Y6, Z1^Y4^X5, Z0^X4^Y5, }, // 227 + {Y2^X6, X3^Y5, Z1^Y3^X5, Z0^X4^Y4, }, // 228 + {Y2^X5, X2^Y5, Z1^Y3^X4, Z0^X3^Y4, }, // 229 + {Y4^X7, Z2^X4^Y7, Z1^Y5^X6, Z0^X5^Y6, }, // 230 + {Y3^X7, Z2^X4^Y6, Z1^Y4^X6, Z0^X5^Y5, }, // 231 + {Y3^X6, Z2^X3^Y6, Z1^Y4^X5, Z0^X4^Y5, }, // 232 + {Y2^X6, Z2^X3^Y5, Z1^Y3^X5, Z0^X4^Y4, }, // 233 + {Y2^X5, X2^Z2^Y5, Z1^Y3^X4, Z0^X3^Y4, }, // 234 + {Y4^X7, X4^Y7, Z2^Y5^X6, Z1^X5^Y6, }, // 235 + {Y3^X7, X4^Y6, Z2^Y4^X6, Z1^X5^Y5, }, // 236 + {Y3^X6, X3^Y6, Z2^Y4^X5, Z1^X4^Y5, }, // 237 + {Y2^X6, X3^Y5, Z2^Y3^X5, Z1^X4^Y4, }, // 238 + {Y2^X5, X2^Y5, Z2^Y3^X4, Z1^X3^Y4, }, // 239 + {Y4^X7, Z3^X4^Y7, Z2^Y5^X6, Z1^X5^Y6, }, // 240 + {Y3^X7, Z3^X4^Y6, Z2^Y4^X6, Z1^X5^Y5, }, // 241 + {Y3^X6, X3^Z3^Y6, Z2^Y4^X5, Z1^X4^Y5, }, // 242 + {Y2^X6, X3^Z3^Y5, Z2^Y3^X5, Z1^X4^Y4, }, // 243 + {Y2^X5, X2^Z3^Y5, Z2^Y3^X4, Z1^X3^Y4, }, // 244 + {Y4^X7, X4^Y7, Z3^Y5^X6, Z2^X5^Y6, }, // 245 + {Y3^X7, X4^Y6, Z3^Y4^X6, Z2^X5^Y5, }, // 246 + {Y3^X6, X3^Y6, Z3^Y4^X5, Z2^X4^Y5, }, // 247 + {Y2^X6, X3^Y5, Y3^Z3^X5, Z2^X4^Y4, }, // 248 + {Y2^X5, X2^Y5, Y3^Z3^X4, Z2^X3^Y4, }, // 249 + {Y4^X8, X4^Y8, Z2^Y5^X7, Z1^X5^Y7, }, // 250 + {Y3^X8, X4^Y7, Z2^Y4^X7, Z1^X5^Y6, }, // 251 + {Y3^X7, X3^Y7, Z2^Y4^X6, Z1^X4^Y6, }, // 252 + {Y2^X7, X3^Y6, Z2^Y3^X6, Z1^X4^Y5, }, // 253 + {Y2^X6, X2^Y6, Z2^Y3^X5, Z1^X3^Y5, }, // 254 + {Y4^X8, Z3^X4^Y8, Z2^Y5^X7, Z1^X5^Y7, }, // 255 + {Y3^X8, Z3^X4^Y7, Z2^Y4^X7, Z1^X5^Y6, }, // 256 + {Y3^X7, X3^Z3^Y7, Z2^Y4^X6, Z1^X4^Y6, }, // 257 + {Y2^X7, X3^Z3^Y6, Z2^Y3^X6, Z1^X4^Y5, }, // 258 + {Y2^X6, X2^Z3^Y6, Z2^Y3^X5, Z1^X3^Y5, }, // 259 + {Y4^X9, X4^Y9, Z3^Y5^X8, Z2^X5^Y8, }, // 260 + {Y3^X9, X4^Y8, Z3^Y4^X8, Z2^X5^Y7, }, // 261 + {Y3^X8, X3^Y8, Z3^Y4^X7, Z2^X4^Y7, }, // 262 + {Y2^X8, X3^Y7, Y3^Z3^X7, Z2^X4^Y6, }, // 263 + {Y2^X7, X2^Y7, Y3^Z3^X6, Z2^X3^Y6, }, // 264 + {Y4^X9, X4^Z4^Y9, Z3^Y5^X8, Z2^X5^Y8, }, // 265 + {Y3^X9, X4^Z4^Y8, Z3^Y4^X8, Z2^X5^Y7, }, // 266 + {Y3^X8, X3^Z4^Y8, Z3^Y4^X7, Z2^X4^Y7, }, // 267 + {Y2^X8, X3^Z4^Y7, Y3^Z3^X7, Z2^X4^Y6, }, // 268 + {Y2^X7, X2^Z4^Y7, Y3^Z3^X6, Z2^X3^Y6, }, // 269 + {X4, Y4, X5^Y8, Y5^X8, }, // 270 + {Y3, X4, Y4^X8, X5^Y7, }, // 271 + {X3, Y3, X4^Y7, Y4^X7, }, // 272 + {Y2, X3, Y3^X7, X4^Y6, }, // 273 + {X2, Y2, X3^Y6, Y3^X6, }, // 274 + {Z0^X4^Y4, Y4, X5, X6^Y8, }, // 275 + {Z0^X4^Y4, Y3, Y4, X5^Y8, }, // 276 + {Z0^X4^Y4, X3, Y3, X5^Y7, }, // 277 + {Z0^X4^Y4, Y2, X3, Y3^X8, }, // 278 + {Z0^X4^Y4, X2, Y2, X3^Y6, }, // 279 + {Y4^X5^Y5, Z0^X4^Y4, X5, Y5, }, // 280 + {Y4^X5^Y5, Z0^X4^Y4, Y3, X5, }, // 281 + {Y4^X5^Y5, Z0^X4^Y4, X3, Y3, }, // 282 + {Y4^X5^Y5, Z0^X4^Y4, Y2, X3, }, // 283 + {Y4^X5^Y5, Z0^X4^Y4, X2, Y2, }, // 284 + {Y4^X5^Y5, Z0^X4^Y4, X5^Y5, Y5, }, // 285 + {Y4^X5^Y5, Z0^X4^Y4, X5^Y5, Y3, }, // 286 + {Y4^X5^Y5, Z0^X4^Y4, X5^Y5, X3, }, // 287 + {Y4^X5^Y5, Z0^X4^Y4, X5^Y5, Y2, }, // 288 + {Y4^X5^Y5, Z0^X4^Y4, X5^Y5, X2, }, // 289 + {Y4^X6^Y6, Z1^X4^Y4, X5, X6, }, // 290 + {Y4^X6^Y6, Z1^X4^Y4, Y3, X5, }, // 291 + {Y4^X6^Y6, Z1^X4^Y4, X3, Y3, }, // 292 + {Y4^X6^Y6, Z1^X4^Y4, Y2, X3, }, // 293 + {Y4^X6^Y6, Z1^X4^Y4, X2, Y2, }, // 294 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5, }, // 295 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y3, }, // 296 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X3, }, // 297 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y2, }, // 298 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X2, }, // 299 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^Y6, }, // 300 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X6, }, // 301 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Y3, }, // 302 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X3, }, // 303 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Y2, }, // 304 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X2, }, // 305 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, }, // 306 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, }, // 307 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, }, // 308 + {Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, }, // 309 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, }, // 310 + {Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, }, // 311 + {Y3, X4, Y4^X8, Y5^X7, }, // 312 + {X3, Y3, Y4^X7, X4^Y7, }, // 313 + {X2, Y2, Y3^X6, X3^Y6, }, // 314 + {Z0^X4^Y4, X3, Y3, Y4^X8, }, // 315 + {Z0^X4^Y4, X2, Y2, Y3^X7, }, // 316 + {Y4^X5^Y5, Z0^X4^Y4, X2, X3, }, // 317 + {Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, }, // 318 + {Z0^X4^Y4, X2, X3, Y3^X8, }, // 319 + {Y4^X6^Y6, Z1^X4^Y4, X2, X3, }, // 320 + {Y4^X6^Y6, Z0^X4^Y4, X2, X3, }, // 321 + {Y4^X7^Y7, Z1^X4^Y4, Y1^Y5^X6, X2, }, // 322 + {Y4^X8^Y8, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, }, // 323 + {Y4^X9^Y9, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, }, // 324 + {X3, Y3, Y4^X7, Y1^X4^Y7, }, // 325 + {Y2, X3, Y3^X7, X1^X4^Y6, }, // 326 + {X2, Y2, Y3^X6, Y0^X3^Y6, }, // 327 + {Y0^X4^Y4, Y2, X3, Y3^X8, }, // 328 + {Y4^X5^Y5, Y0^X4^Y4, X2, X3, }, // 329 + {Y4^X5^Y5, Z0^X4^Y4, X2^X5^Y5, Y2, }, // 330 + {Y4^X5^Y5, Z0^X4^Y4, Y1^X5^Y5, X2, }, // 331 + {Y4^X6^Y6, Z0^X4^Y4, X3, Y3, }, // 332 + {Y4^X6^Y6, Y0^X4^Y4, X3, Y3, }, // 333 + {Y4^X6^Y6, Z0^X4^Y4, Y0^X5^Y5, X2, }, // 334 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X2^X5^Y5, }, // 335 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y1^X5^Y5, }, // 336 + {Y4^X7^Y7, Z0^X4^Y4, Y1^Y5^X6, X3, }, // 337 + {Y4^X7^Y7, Z0^X4^Y4, Y0^Y5^X6, X3, }, // 338 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Z2^X5^Y6, }, // 339 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Y0^X5^Y6, }, // 340 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, Z2^X5^Y7, }, // 341 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, Y0^X5^Y7, }, // 342 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, Z3^X5^Y7, }, // 343 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, Z3^X5^Y8, }, // 344 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, Z2^X5^Y8, }, // 345 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, Z4^X5^Y8, }, // 346 + {X4, Y4, X5^Y10, Y5^X10, }, // 347 + {Y3, X4, Y4^X10, X5^Y9, }, // 348 + {X3, Y3, X4^Y9, Y4^X9, }, // 349 + {Y2, X3, Y3^X9, X4^Y8, }, // 350 + {X2, Y2, X3^Y8, Y3^X8, }, // 351 + {Z0^X4^Y4, Y4, X5, Y5^X10, }, // 352 + {Z0^X4^Y4, Y3, Y4, X5^Y9, }, // 353 + {Z0^X4^Y4, X3, Y3, Y4^X9, }, // 354 + {Z0^X4^Y4, Y2, X3, Y3^X9, }, // 355 + {Z0^X4^Y4, X2, Y2, Y3^X8, }, // 356 + {Y3, X4, Y4^X10, Y5^X9, }, // 357 + {X3, Y3, Y4^X9, X4^Y9, }, // 358 + {X2, Y2, Y3^X8, X3^Y8, }, // 359 + {Z0^X4^Y4, Y3, Y4, Y5^X9, }, // 360 + {Z0^X4^Y4, X2, X3, Y3^X9, }, // 361 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X2^X5^Y6, }, // 362 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y1^X5^Y6, }, // 363 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X2, }, // 364 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Y1^X5^Y6, }, // 365 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, Y1^X5^Y7, }, // 366 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, Y1^X5^Y8, }, // 367 + {Z0^X4^Y4, X3, Y3, X5^Y8, }, // 368 + {Y4^X6^Y6, Z0^X4^Y4, Y1^X5^Y5, X2, }, // 369 + {Y4^X6^Y6, Z0^X4^Y4, Y1^X5^Y5, X1^X5^Y6, }, // 370 + {Y4^X7^Y7, Z1^X4^Y4, Y1^Y5^X6, X3, }, // 371 + {Y4^X7^Y7, Z1^X4^Y4, Y1^Y5^X6, Z0^X5^Y6, }, // 372 + {Y4^X7^Y7, Z0^X4^Y4, Y1^Y5^X6, X1^X5^Y6, }, // 373 + {Y4^X8^Y8, Z1^X4^Y4, Y1^Y5^X7, Z0^X5^Y7, }, // 374 + {Y4^X8^Y8, Z0^X4^Y4, Y1^Y5^X7, X1^X5^Y7, }, // 375 + {Y4^X9^Y9, Z1^X4^Y4, Y1^Y5^X8, Z0^X5^Y8, }, // 376 + {Y4^X9^Y9, Z0^X4^Y4, Y1^Y5^X8, X1^X5^Y8, }, // 377 + {Z0^X4^Y4, X2, Y2, X3^Y7, }, // 378 + {Y4^X5^Y5, Z0^X4^Y4, Y2^X5^Y5, X2, }, // 379 + {Y4^X5^Y5, Y0^X4^Y4, X1^X5^Y5, X2, }, // 380 + {Y4^X6^Y6, Z0^X4^Y4, Y1^X5^Y5, X3, }, // 381 + {Y4^X6^Y6, Y0^X4^Y4, Y1^X5^Y5, X3, }, // 382 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y2^X5^Y6, }, // 383 + {Y4^X6^Y6, Z0^X4^Y4, Y1^X5^Y5, X2^X5^Y6, }, // 384 + {Y4^X6^Y6, Y0^X4^Y4, Y1^X5^Y5, Y2^X5^Y6, }, // 385 + {Y4^X7^Y7, Y0^X4^Y4, Y1^Y5^X6, X3, }, // 386 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Y2^X5^Y6, }, // 387 + {Y4^X7^Y7, Y0^X4^Y4, Y1^Y5^X6, X1^X5^Y6, }, // 388 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, Y2^X5^Y7, }, // 389 + {Y4^X8^Y8, Y0^X4^Y4, Y1^Y5^X7, X1^X5^Y7, }, // 390 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X2^X5^Y7, }, // 391 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X2^X5^Y8, }, // 392 + {Y4^X9^Y9, Y0^X4^Y4, Y1^Y5^X8, X1^X5^Y8, }, // 393 + {Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y5, }, // 394 + {Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y3, }, // 395 + {Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, X3, }, // 396 + {Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y2, }, // 397 + {Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, X2, }, // 398 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, }, // 399 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X6, }, // 400 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Y3, }, // 401 + {Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X3, }, // 402 + {X4, Y4, Y5^X8, X5^Y8, }, // 403 + {Z0^X4^Y4, Y4, X5, Y5^X9, }, // 404 + {Y4^X6^Y6, Z0^X4^Y4, X2, Y2, }, // 405 + {Y4^X7^Y7, Z1^X4^Y4, S1^Y5^X6, X2, }, // 406 + {X4, Y4, Y5^X8, S0^X5^Y8, }, // 407 + {Y3, X4, Y4^X8, S0^X5^Y7, }, // 408 + {X3, Y3, Y4^X7, S0^X4^Y7, }, // 409 + {Y2, X3, Y3^X7, S0^X4^Y6, }, // 410 + {X2, Y2, Y3^X6, S0^X3^Y6, }, // 411 + {S2^X4^Y4, X2, Y2, X3^Y6, }, // 412 + {Y4^X5^Y5, S2^X4^Y4, X2, Y2, }, // 413 + {Y4^X5^Y5, Z0^X4^Y4, X3^X6^Y6, X2, }, // 414 + {Y4^X6^Y6, Z1^X4^Y4, X5, Y6, }, // 415 + {Y4^X6^Y6, Z0^X4^Y4, Y2, X3, }, // 416 + {Y4^X6^Y6, S2^X4^Y4, X2, Y2, }, // 417 + {Y4^X6^Y6, Z0^X4^Y4, S2^X5^Y5, X2, }, // 418 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X3^X7^Y7, }, // 419 + {Y4^X7^Y7, Z0^X4^Y4, S2^Y5^X6, Y2, }, // 420 + {Y4^X7^Y7, Z0^X4^Y4, S2^Y5^X6, X2, }, // 421 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, S2^X5^Y6, }, // 422 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, S2^X5^Y7, }, // 423 + {X4, Y4, Y5^X10, X5^Y10, }, // 424 + {Y4^X5^Y5, Z0^X4^Y4, S0^X6^Y6, X2, }, // 425 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, S0^X7^Y7, }, // 426 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, S0^X5^Y6, }, // 427 + {Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, S0^X5^Y7, }, // 428 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, S0^X5^Y8, }, // 429 + {Y4^X5^Y5, Z0^X4^Y4, S1^X6^Y6, X2, }, // 430 + {Y4^X6^Y6, Z0^X4^Y4, S1^X5^Y5, X2, }, // 431 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, S1^X7^Y7, }, // 432 + {Y4^X6^Y6, Z0^X4^Y4, S1^X5^Y5, S0^X7^Y7, }, // 433 + {Y4^X7^Y7, Z1^X4^Y4, S1^Y5^X6, Y2, }, // 434 + {Y4^X7^Y7, Z0^X4^Y4, S1^Y5^X6, X2, }, // 435 + {Y4^X7^Y7, Z1^X4^Y4, S1^Y5^X6, Z0^X5^Y6, }, // 436 + {Y4^X7^Y7, Z0^X4^Y4, S1^Y5^X6, S0^X5^Y6, }, // 437 + {Y4^X8^Y8, Z1^X4^Y4, S1^Y5^X7, Z0^X5^Y7, }, // 438 + {Y4^X8^Y8, Z0^X4^Y4, S1^Y5^X7, S0^X5^Y7, }, // 439 + {Y4^X9^Y9, Z1^X4^Y4, S1^Y5^X8, Z0^X5^Y8, }, // 440 + {Y4^X9^Y9, Z0^X4^Y4, S1^Y5^X8, S0^X5^Y8, }, // 441 + {Y4^X5^Y5, Z0^X4^Y4, S2^X6^Y6, X3, }, // 442 + {Y4^X5^Y5, Z0^X4^Y4, S2^X6^Y6, Y2, }, // 443 + {Y4^X5^Y5, S2^X4^Y4, S1^X6^Y6, X2, }, // 444 + {Y4^X6^Y6, Z0^X4^Y4, S2^X5^Y5, Y2, }, // 445 + {Y4^X6^Y6, S2^X4^Y4, S1^X5^Y5, X2, }, // 446 + {Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, S2^X7^Y7, }, // 447 + {Y4^X6^Y6, Z0^X4^Y4, S2^X5^Y5, S1^X7^Y7, }, // 448 + {Y4^X6^Y6, S2^X4^Y4, S1^X5^Y5, S0^X7^Y7, }, // 449 + {Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, Y6, }, // 450 + {Y4^X7^Y7, S2^X4^Y4, S1^Y5^X6, X2, }, // 451 + {Y4^X7^Y7, Z0^X4^Y4, S2^Y5^X6, S1^X5^Y6, }, // 452 + {Y4^X7^Y7, S2^X4^Y4, S1^Y5^X6, S0^X5^Y6, }, // 453 + {Y4^X8^Y8, Z0^X4^Y4, S2^Y5^X7, S1^X5^Y7, }, // 454 + {Y4^X8^Y8, S2^X4^Y4, S1^Y5^X7, S0^X5^Y7, }, // 455 + {Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, S2^X5^Y8, }, // 456 + {Y4^X9^Y9, Z0^X4^Y4, S2^Y5^X8, S1^X5^Y8, }, // 457 + {Y4^X9^Y9, S2^X4^Y4, S1^Y5^X8, S0^X5^Y8, }, // 458 + {X4^Y4, Y2, Z3, Y3, }, // 459 + {X4^Y4, Y2, Z2, Y3, }, // 460 + {X4^Y4, Y1, Z2, Y2, }, // 461 + {Y1^X4^Y4, X1, Z2, Y2, }, // 462 + {Y4^X5^Y5, X4^Y4, Y2, Z3, }, // 463 + {Y4^X5^Y5, X4^Y4, Y2, Z2, }, // 464 + {Z3^Y4^X5^Y5, X4^Y4, Y1, Z2, }, // 465 + {Z3^Y4^X5^Y5, Y1^X4^Y4, X1, Z2, }, // 466 + {Y4^X5^Y5, X4^Y4, Z3^X5, Y2, }, // 467 + {Y4^X5^Y5, X4^Y4, Z2^X5, Y2, }, // 468 + {Z3^Y4^X5^Y5, X4^Y4, Z2^X5, Y1, }, // 469 + {Z3^Y4^X5^Y5, Y1^X4^Y4, Z2^X5, X1, }, // 470 + {Y4^X6^Y6, X4^Y4, Y2, Y3, }, // 471 + {Y4^X6^Y6, X4^Y4, Z3, Y3, }, // 472 + {Y4^X6^Y6, X4^Y4, Z2, Y3, }, // 473 + {Z3^Y4^X6^Y6, X4^Y4, Z2, Y2, }, // 474 + {Z3^Y4^X6^Y6, Y1^X4^Y4, Z2, Y2, }, // 475 + {Y4^X6^Y6, X4^Y4, X5^Y5, Y2, }, // 476 + {Y4^X6^Y6, X4^Y4, Y2^X5^Y5, Z3, }, // 477 + {Y4^X6^Y6, X4^Y4, Y2^X5^Y5, Z2, }, // 478 + {Z3^Y4^X6^Y6, X4^Y4, Y1^X5^Y5, Z2, }, // 479 + {Z3^Y4^X6^Y6, Y1^X4^Y4, X1^X5^Y5, Z2, }, // 480 + {Y4^X6^Y6, X4^Y4, X5^Y5, Z3^X6, }, // 481 + {Y4^X6^Y6, X4^Y4, Y2^X5^Y5, Z3^X6, }, // 482 + {Y4^X6^Y6, X4^Y4, Y2^X5^Y5, Z2^X6, }, // 483 + {Z3^Y4^X6^Y6, X4^Y4, Y1^X5^Y5, Z2^X6, }, // 484 + {Z3^Y4^X6^Y6, Y1^X4^Y4, X1^X5^Y5, Z2^X6, }, // 485 + {Y4^X7^Y7, X4^Y4, Y2^Y5^X6, Y3, }, // 486 + {Z3^Y4^X7^Y7, X4^Y4, Y1^Y5^X6, Y2, }, // 487 + {Z3^Y4^X7^Y7, Y1^X4^Y4, X1^Y5^X6, Y2, }, // 488 + {Y4^X7^Y7, X4^Y4, Y2^Y5^X6, X5^Y6, }, // 489 + {Y4^X7^Y7, X4^Y4, Y2^Y5^X6, Z3^X5^Y6, }, // 490 + {Y4^X7^Y7, X4^Y4, Y2^Y5^X6, Z2^X5^Y6, }, // 491 + {Z3^Y4^X7^Y7, X4^Y4, Y1^Y5^X6, Z2^X5^Y6, }, // 492 + {Z3^Y4^X7^Y7, Y1^X4^Y4, X1^Y5^X6, Z2^X5^Y6, }, // 493 + {Y4^X7^Y7, X4^Y4, Y2^Y5^X6, Y3^X5^Y6, }, // 494 + {Z3^Y4^X7^Y7, X4^Y4, Y1^Y5^X6, Y2^X5^Y6, }, // 495 + {Z3^Y4^X7^Y7, Y1^X4^Y4, X1^Y5^X6, Y2^X5^Y6, }, // 496 + {Y4^X8^Y8, X4^Y4, Y2^Y5^X7, X5^Y7, }, // 497 + {Y4^X8^Y8, X4^Y4, Y2^Y5^X7, Z3^X5^Y7, }, // 498 + {Y4^X8^Y8, X4^Y4, Y2^Y5^X7, Z2^X5^Y7, }, // 499 + {Z3^Y4^X8^Y8, X4^Y4, Y1^Y5^X7, Z2^X5^Y7, }, // 500 + {Z3^Y4^X8^Y8, Y1^X4^Y4, X1^Y5^X7, Z2^X5^Y7, }, // 501 + {Y4^X8^Y8, X4^Y4, Y2^Y5^X7, Y3^X5^Y7, }, // 502 + {Z3^Y4^X8^Y8, X4^Y4, Y1^Y5^X7, Y2^X5^Y7, }, // 503 + {Z3^Y4^X8^Y8, Y1^X4^Y4, X1^Y5^X7, Y2^X5^Y7, }, // 504 + {Y4^X9^Y9, X4^Y4, Y2^Y5^X8, X5^Y8, }, // 505 + {Y4^X9^Y9, X4^Y4, Y2^Y5^X8, Z3^X5^Y8, }, // 506 + {Y4^X9^Y9, X4^Y4, Y2^Y5^X8, Z2^X5^Y8, }, // 507 + {Z3^Y4^X9^Y9, X4^Y4, Y1^Y5^X8, Z2^X5^Y8, }, // 508 + {Z3^Y4^X9^Y9, Y1^X4^Y4, X1^Y5^X8, Z2^X5^Y8, }, // 509 +}; + +const UINT_64 GFX10_SW_PATTERN_NIBBLE3[][4] = +{ + {0, 0, 0, 0, }, // 0 + {Y6, X6, Y7, X7, }, // 1 + {Y5, X6, Y6, X7, }, // 2 + {Y5, X5, Y6, X6, }, // 3 + {Y4, X5, Y5, X6, }, // 4 + {Y4, X4, Y5, X5, }, // 5 + {Z0^X6^Y6, X6, Y7, X7, }, // 6 + {Z0^Y5^X6, X6, Y6, X7, }, // 7 + {Z0^X5^Y5, X5, Y6, X6, }, // 8 + {Z0^Y4^X5, X5, Y5, X6, }, // 9 + {Z0^X4^Y4, X4, Y5, X5, }, // 10 + {Z1^Y6^X7, Z0^X6^Y7, Y7, X7, }, // 11 + {Z1^Y5^X7, Z0^X6^Y6, Y6, X7, }, // 12 + {Z1^Y5^X6, Z0^X5^Y6, Y6, X6, }, // 13 + {Z1^Y4^X6, Z0^X5^Y5, Y5, X6, }, // 14 + {Z1^Y4^X5, Z0^X4^Y5, Y5, X5, }, // 15 + {X6^Y6, X6, Y7, X7, }, // 16 + {Y5^X6, X6, Y6, X7, }, // 17 + {X5^Y5, X5, Y6, X6, }, // 18 + {Y4^X5, X5, Y5, X6, }, // 19 + {X4^Y4, X4, Y5, X5, }, // 20 + {Y6^X7, X6^Y7, Y7, X7, }, // 21 + {Y5^X7, X6^Y6, Y6, X7, }, // 22 + {Y5^X6, X5^Y6, Y6, X6, }, // 23 + {Y4^X6, X5^Y5, Y5, X6, }, // 24 + {Y4^X5, X4^Y5, Y5, X5, }, // 25 + {Y3, X4, Y5, X5, }, // 26 + {Y4, X5, Y6, X6, }, // 27 + {Y2, X4, Y5, X6, }, // 28 + {Y2, X3, Y4, X5, }, // 29 + {Y4, X6, Y6, X7, }, // 30 + {Y3, X4, Y6, X6, }, // 31 + {Y2, X3, Y4, X6, }, // 32 + {Y2, X2, Y3, X4, }, // 33 + {Z0^X6^Y6, X4, Y6, X7, }, // 34 + {Z0^X6^Y6, X3, Y4, X6, }, // 35 + {Z0^X6^Y6, Y2, X3, Y4, }, // 36 + {Y2^X6^Y6, X2, Y3, X4, }, // 37 + {Z1^Y6^X7, Z0^X6^Y7, Y4, X7, }, // 38 + {Z1^Y6^X7, Z0^X6^Y7, Y3, X4, }, // 39 + {Y2^Y6^X7, Z0^X6^Y7, Y3, X4, }, // 40 + {Y2^Y6^X7, X2^X6^Y7, Y3, X4, }, // 41 + {X5, Y6, X6, Y7, }, // 42 + {Y5, X5, Y6, Y2^Y7, }, // 43 + {X4, Y5, X5, Y2^Y6, }, // 44 + {Y4, X4, Y5, Y1^Y6, }, // 45 + {Y3, X4, Y5, Y1^Y6, }, // 46 + {Y4, X5, Y6, Y2^Y7, }, // 47 + {X3, Y4, X5, Y2^Y6, }, // 48 + {Y2, X3, Y4, Y1^Y6, }, // 49 + {Y4, Y6, X6, Y7, }, // 50 + {Y3, X4, Y6, Y2^Y7, }, // 51 + {X2, Y3, X4, Y2^Y6, }, // 52 + {Y1, X3, Y4, X2^Y6, }, // 53 + {Z0^X6^Y6, Y4, X6, Y7, }, // 54 + {Z0^X6^Y6, X3, Y4, Y2^Y7, }, // 55 + {Y2^X6^Y6, Y3, X4, X2^Y7, }, // 56 + {X2^X6^Y6, X3, Y4, Y1^Y7, }, // 57 + {Z0^Y6^X7, Z5^X6^Y7, Y4, Y7, }, // 58 + {Z0^Y6^X7, Z5^X6^Y7, Y3, X4, }, // 59 + {Z0^Y6^X7, Y2^X6^Y7, X3, Y4, }, // 60 + {X2^Y6^X7, Y1^X6^Y7, X3, Y4, }, // 61 + {X5, Y5, X6, Y2^Y6, }, // 62 + {Y5, X5, Y2^Y6, X2^Y7, }, // 63 + {Y4, X5, Y1^Y5, X2^Y6, }, // 64 + {Y4, X4, Y1^Y5, X1^Y6, }, // 65 + {Y5, X5, X2^Y6, Y2^Y7, }, // 66 + {Y4, X5, X2^Y5, Y1^Y6, }, // 67 + {Y4, X4, X1^Y5, Y1^Y6, }, // 68 + {Y3, X4, Y1^Y5, X1^Y6, }, // 69 + {X4, Y5, X6, Y2^Y6, }, // 70 + {Y4, X5, X2^Y6, Y2^Y7, }, // 71 + {X3, Y4, Y1^Y5, X2^Y6, }, // 72 + {Y3, X4, X1^Y6, Y1^Y7, }, // 73 + {X3, Y4, X6, Y2^Y6, }, // 74 + {Y3, X4, Y2^Y6, X2^Y7, }, // 75 + {Y3, X4, Y1^Y6, X2^Y7, }, // 76 + {Z4^X6^Y6, X3, Y4, X6, }, // 77 + {Z4^X6^Y6, X3, Y4, Y2^Y6, }, // 78 + {Y1^X6^Y6, Y3, X4, X2^Y7, }, // 79 + {Z5^Y6^X7, Z4^X6^Y7, Y3, X4, }, // 80 + {Y2^Y6^X7, Z4^X6^Y7, Y3, X4, }, // 81 + {Y1^Y6^X7, X2^X6^Y7, Y3, X4, }, // 82 + {Y5, Y1^Y6, Y2^Y7, X2^Y8, }, // 83 + {X4, Y1^Y5, X1^Y6, Y2^Y7, }, // 84 + {Y4, Y0^Y5, Y1^Y6, X1^Y7, }, // 85 + {Y5, Y1^Y6, X2^Y7, Y2^Y8, }, // 86 + {X4, X1^Y5, Y1^Y6, X2^Y7, }, // 87 + {Y4, Y0^Y5, X1^Y6, Y1^Y7, }, // 88 + {X3, Y0^Y5, X1^Y6, Y1^Y7, }, // 89 + {Y4, Y1^Y6, X2^Y7, Y2^Y8, }, // 90 + {X4, X1^Y6, Y1^Y7, X2^Y8, }, // 91 + {X3, X1^Y6, Y1^Y7, X2^Y8, }, // 92 + {X3, Y4, X2^Y6, Y1^Y7, }, // 93 + {X3, Y1^Y6, X2^Y7, Y2^Y8, }, // 94 + {Z3^X6^Y6, X3, Y4, Y2^Y7, }, // 95 + {Y2^X6^Y6, X3, X2^Y7, Y1^Y8, }, // 96 + {Z3^Y6^X7, Y2^X6^Y7, X3, Y4, }, // 97 + {Y2^Y6^X7, X2^X6^Y7, X3, Y1^Y7, }, // 98 + {Y6, X6, Y7, S0^Y8, }, // 99 + {Y5, X6, Y6, S0^Y7, }, // 100 + {Y5, X5, Y6, S0^Y7, }, // 101 + {Y4, X5, Y5, S0^Y6, }, // 102 + {Y4, X4, Y5, S0^Y6, }, // 103 + {Y3, X4, Y5, S0^Y6, }, // 104 + {Y4, X5, Y6, S0^Y7, }, // 105 + {Y2, X4, Y5, S0^Y6, }, // 106 + {Y2, X3, Y4, S0^Y6, }, // 107 + {Y4, X6, Y6, S0^Y7, }, // 108 + {Y3, X4, Y6, S0^Y7, }, // 109 + {Z0^X6^Y6, X6, Y7, S0^Y8, }, // 110 + {Z0^X6^Y6, X4, Y6, S0^Y7, }, // 111 + {Z0^X6^Y6, X3, Y4, S0^Y7, }, // 112 + {S0^X6^Y6, Y2, X3, Y4, }, // 113 + {Z0^Y6^X7, Z5^X6^Y7, Y7, S0^Y8, }, // 114 + {Z0^Y6^X7, Z5^X6^Y7, Y4, S0^Y7, }, // 115 + {Z0^Y6^X7, S0^X6^Y7, Y3, X4, }, // 116 + {S0^Y6^X7, Y2^X6^Y7, X3, Y4, }, // 117 + {Y6, X6, S0^Y7, S1^Y8, }, // 118 + {Y5, X6, S0^Y6, S1^Y7, }, // 119 + {Y5, X5, S0^Y6, S1^Y7, }, // 120 + {Y4, X5, S0^Y5, S1^Y6, }, // 121 + {Y4, X4, S0^Y5, S1^Y6, }, // 122 + {Y3, X4, S0^Y5, S1^Y6, }, // 123 + {Y4, X5, S0^Y6, S1^Y7, }, // 124 + {X3, Y4, S0^Y5, S1^Y6, }, // 125 + {Y4, X6, S0^Y6, S1^Y7, }, // 126 + {Y3, X4, S0^Y6, S1^Y7, }, // 127 + {Z4^X6^Y6, X6, S0^Y7, S1^Y8, }, // 128 + {Z4^X6^Y6, Y4, S0^Y6, S1^Y7, }, // 129 + {S1^X6^Y6, X3, Y4, S0^Y7, }, // 130 + {Z5^Y6^X7, Z4^X6^Y7, S0^Y7, S1^Y8, }, // 131 + {S1^Y6^X7, Z4^X6^Y7, Y4, S0^Y7, }, // 132 + {S1^Y6^X7, S0^X6^Y7, Y3, X4, }, // 133 + {Y6, S0^Y7, S1^Y8, S2^Y9, }, // 134 + {Y5, S0^Y6, S1^Y7, S2^Y8, }, // 135 + {Y4, S0^Y5, S1^Y6, S2^Y7, }, // 136 + {X3, S0^Y5, S1^Y6, S2^Y7, }, // 137 + {Y4, S0^Y6, S1^Y7, S2^Y8, }, // 138 + {X3, Y4, S0^Y6, S1^Y7, }, // 139 + {Y2, X3, S0^Y6, S1^Y7, }, // 140 + {X2, Y2, X3, S0^Y6, }, // 141 + {Z3^X6^Y6, S0^Y7, S1^Y8, S2^Y9, }, // 142 + {S2^X6^Y6, Y4, S0^Y7, S1^Y8, }, // 143 + {S0^X6^Y6, X2, Y2, X3, }, // 144 + {Z3^Y6^X7, S2^X6^Y7, S0^Y7, S1^Y8, }, // 145 + {S2^Y6^X7, S1^X6^Y7, Y4, S0^Y7, }, // 146 + {S0^Y6^X7, X2^X6^Y7, Y2, X3, }, // 147 + {X4, Z4, Y4, X5, }, // 148 + {X3, Z4, Y4, X4, }, // 149 + {X3, Z3, Y4, X4, }, // 150 + {X3, Z3, Y3, X4, }, // 151 + {X2, Z3, Y3, X3, }, // 152 + {X4^Y4^Z4, Z4, Y4, X5, }, // 153 + {X3^Y4^Z4, Z4, Y4, X4, }, // 154 + {X3^Z3^Y4, Z3, Y4, X4, }, // 155 + {X3^Y3^Z3, Z3, Y3, X4, }, // 156 + {X2^Y3^Z3, Z3, Y3, X3, }, // 157 + {X4^Y5^Z5, Y4^Z4^X5, Y4, X5, }, // 158 + {X3^Y5^Z5, X4^Y4^Z4, Y4, X4, }, // 159 + {X3^Z4^Y5, Z3^X4^Y4, Y4, X4, }, // 160 + {X3^Y4^Z4, Y3^Z3^X4, Y3, X4, }, // 161 + {X2^Y4^Z4, X3^Y3^Z3, Y3, X3, }, // 162 + {X4, Y4^Z4^X5, Y4, X5, }, // 163 + {X3, X4^Y4^Z4, Y4, X4, }, // 164 + {X3, Z3^X4^Y4, Y4, X4, }, // 165 + {X3, Y3^Z3^X4, Y3, X4, }, // 166 + {X2, X3^Y3^Z3, Y3, X3, }, // 167 + {X3, Z3, Y2, X4, }, // 168 + {X2, Z3, Y2, X3, }, // 169 + {X3, Z4, Y4, X5, }, // 170 + {X2, Z4, Y3, X4, }, // 171 + {X2, Z3, Y3, X4, }, // 172 + {Y2, X3, Z4, Y4, }, // 173 + {Z3, Y3, X4, Z4, }, // 174 + {Z3^X6^Y6, Y3, X4, Z4, }, // 175 + {X2^X6^Y6, Z4, Y3, X4, }, // 176 + {X2^X6^Y6, Z3, Y3, X4, }, // 177 + {X2^X6^Y6, Z3, Y2, X3, }, // 178 + {Z3^Y6^X7, Z4^X6^Y7, Y3, X4, }, // 179 + {X2^Y6^X7, Z4^X6^Y7, Y3, X4, }, // 180 + {X2^Y6^X7, Z3^X6^Y7, Y3, X4, }, // 181 + {X2^Y6^X7, Z3^X6^Y7, Y2, X3, }, // 182 + {X6^Y7, Y6^X7, 0, 0, }, // 183 + {Y5^X7, X6^Y6, 0, 0, }, // 184 + {X5^Y6, Y5^X6, 0, 0, }, // 185 + {Y4^X6, X5^Y5, 0, 0, }, // 186 + {X4^Y5, Y4^X5, 0, 0, }, // 187 + {Y5^X9, X7^Y7, Y6^X8, 0, }, // 188 + {Y5^X8, X6^Y7, Y6^X7, 0, }, // 189 + {Y4^X8, X6^Y6, Y5^X7, 0, }, // 190 + {Y4^X7, X5^Y6, Y5^X6, 0, }, // 191 + {Y3^X7, X5^Y5, Y4^X6, 0, }, // 192 + {X6^Y9, Y6^X9, X7^Y8, Y7^X8, }, // 193 + {X6^Y8, Y5^X9, X7^Y7, Y6^X8, }, // 194 + {X5^Y8, Y5^X8, X6^Y7, Y6^X7, }, // 195 + {Y3^X8, X5^Y7, X6^Y6, Y5^X7, }, // 196 + {Y3^X7, X3^Y7, X5^Y6, Y5^X6, }, // 197 + {X6, X7^Y9, Y6^X10, X8^Y8, }, // 198 + {Y5, X6^Y9, Y6^X9, X7^Y8, }, // 199 + {Y3, X6^Y8, Y5^X9, X7^Y7, }, // 200 + {X3, Y3^X9, Y5^X8, X6^Y7, }, // 201 + {Y2, X3^Y7, Y3^X8, X6^Y6, }, // 202 + {Y6^X9, X7^Y8, Y7^X8, Z0^X5^Y5, }, // 203 + {X6^Y8, Y6^X8, X7^Y7, Z0^X5^Y5, }, // 204 + {X5^Y8, X6^Y7, Y6^X7, Z0^X5^Y5, }, // 205 + {Y3^X7, X5^Y7, X6^Y6, Z0^X5^Y5, }, // 206 + {X3^Y7, Y3^X6, X5^Y6, Z0^X5^Y5, }, // 207 + {X6, Y6^X10, X7^Y9, Y7^X9, }, // 208 + {X5, X6^Y9, Y6^X9, X7^Y8, }, // 209 + {Y3, X5^Y9, X6^Y8, Y6^X8, }, // 210 + {X3, Y3^X8, X5^Y8, X6^Y7, }, // 211 + {Y2, X3^Y8, Y3^X7, X5^Y7, }, // 212 + {X6, Y6, X7^Y10, Y7^X10, }, // 213 + {Y3, X6, Y6^X10, X7^Y9, }, // 214 + {X3, Y3, X6^Y9, Y6^X9, }, // 215 + {Y2, X3, Y3^X9, X6^Y8, }, // 216 + {X2, Y2, X3^Y8, Y3^X8, }, // 217 + {Y6, X7^Y9, X8^Y8, Y7^X9, }, // 218 + {X6, Y6^X9, X7^Y8, Y7^X8, }, // 219 + {Y3, X6^Y8, X7^Y7, Y6^X8, }, // 220 + {X3, Y3^X8, X6^Y7, Y6^X7, }, // 221 + {Y2, X3^Y7, Y3^X7, X6^Y6, }, // 222 + {Y3, X6, X7^Y9, Y6^X10, }, // 223 + {X2, Y2, Y3^X8, X3^Y8, }, // 224 + {X6^Y6, Y6, X7, X8^Y10, }, // 225 + {X6^Y6, Y3, Y6, X7^Y10, }, // 226 + {X6^Y6, X3, Y3, X7^Y9, }, // 227 + {X6^Y6, Y2, X3, Y3^X10, }, // 228 + {X6^Y6, X2, Y2, X3^Y8, }, // 229 + {X6, X7, Y7^X10, X8^Y9, }, // 230 + {Y3, X6, X7^Y9, Y7^X9, }, // 231 + {X3, Y3, X6^Y9, X7^Y8, }, // 232 + {Y2, X3, Y3^X8, X6^Y8, }, // 233 + {X2, Y2, X3^Y8, Y3^X7, }, // 234 + {X6^Y6, X6, X7, Y7^X11, }, // 235 + {X6^Y6, Y3, X6, X7^Y10, }, // 236 + {X6^Y6, X3, Y3, X6^Y10, }, // 237 + {Z0^X6^Y6, Y2, X3, Y3^X9, }, // 238 + {Z0^X6^Y6, X2, Y2, X3^Y9, }, // 239 + {X6^Y6, X6^Y8, X7, Y7, }, // 240 + {X6^Y6, X6^Y8, Y3, X7, }, // 241 + {X6^Y6, X6^Y8, X3, Y3, }, // 242 + {Z0^X6^Y6, X6^Y8, Y2, X3, }, // 243 + {Z0^X6^Y6, X6^Y8, X2, Y2, }, // 244 + {Y6^X7, X7, Y7, X8^Y10, }, // 245 + {Y6^X7, Y3, X7, Y7^X10, }, // 246 + {Y6^X7, X3, Y3, X7^Y9, }, // 247 + {Z1^Y6^X7, Y2, X3, Y3^X9, }, // 248 + {Z1^Y6^X7, X2, Y2, X3^Y8, }, // 249 + {Y6^X7, X6^Y7, X7, Y7, }, // 250 + {Y6^X7, X6^Y7, Y3, X7, }, // 251 + {Y6^X7, X6^Y7, X3, Y3, }, // 252 + {Z1^Y6^X7, Z0^X6^Y7, Y2, X3, }, // 253 + {Z1^Y6^X7, Z0^X6^Y7, X2, Y2, }, // 254 + {X5^Y7, X6^Y6, 0, 0, }, // 255 + {Y5^X6, Y2^X5^Y6, 0, 0, }, // 256 + {Y4^X6, X2^X5^Y5, 0, 0, }, // 257 + {Y4^X5, Y1^X4^Y5, 0, 0, }, // 258 + {X5^Y7, Y5^X7, Y2^X6^Y6, 0, }, // 259 + {X5^Y6, Y4^X7, X2^Y5^X6, 0, }, // 260 + {X3^Y6, Y4^X6, Y1^X5^Y5, 0, }, // 261 + {Y5^X9, Y6^X8, X6^Y8, X7^Y7, }, // 262 + {Y5^X8, X5^Y8, Y6^X7, Y2^X6^Y7, }, // 263 + {Y3^X8, X5^Y7, Y5^X7, Y2^X6^Y6, }, // 264 + {Y3^X7, X3^Y7, Y5^X6, Y1^X5^Y6, }, // 265 + {Y3, X5^Y9, X6^Y8, X7^Y7, }, // 266 + {Y2, Y3^X7, X3^Y8, X5^Y7, }, // 267 + {Y6^X8, X6^Y8, X7^Y7, Z0^X5^Y5, }, // 268 + {X5^Y8, Y6^X7, Y2^X6^Y7, Z0^X5^Y5, }, // 269 + {Y3^X7, X5^Y7, X2^X6^Y6, Z0^X5^Y5, }, // 270 + {Y3^X6, X3^Y7, Y1^X5^Y6, Z0^X5^Y5, }, // 271 + {Y3, X5, X6^Y10, Y7^X9, }, // 272 + {X3, Y3, X5^Y10, X6^Y9, }, // 273 + {Y2, X3, Y3^X8, X5^Y9, }, // 274 + {X2, Y2, Y3^X7, X3^Y9, }, // 275 + {Y3, X6^Y8, Y6^X8, Y2^X7^Y7, }, // 276 + {X3, Y3^X8, X6^Y7, X2^Y6^X7, }, // 277 + {Y2, Y3^X7, X3^Y7, Y1^X6^Y6, }, // 278 + {Y3, X6, Y6^X10, Y7^X9, }, // 279 + {X3, Y3, Y6^X9, X6^Y9, }, // 280 + {X2, X3, Y3^X9, X6^Y8, }, // 281 + {X6^Y6, Y2, X3, Y3^X9, }, // 282 + {X6^Y6, X2, Y2, Y3^X8, }, // 283 + {Y3, X6, Y7^X9, X7^Y9, }, // 284 + {X3, Y3, X6^Y9, Y7^X8, }, // 285 + {X2, Y2, Y3^X7, X3^Y8, }, // 286 + {Z0^Y6^X7, Y2, X3, Y3^X9, }, // 287 + {Z0^Y6^X7, X2, Y2, Y3^X8, }, // 288 + {Z0^Y6^X7, Z4^X6^Y7, X2, X3, }, // 289 + {Z0^Y6^X7, Z4^X6^Y7, X2, Y2, }, // 290 + {X5^Y6, Y2^Y5^X6, 0, 0, }, // 291 + {X2^X5^Y6, Y2^Y5^X6, 0, 0, }, // 292 + {X2^X5^Y5, Y1^Y4^X6, 0, 0, }, // 293 + {X1^X4^Y5, Y1^Y4^X5, 0, 0, }, // 294 + {Y4^X8, X2^X6^Y6, Y2^Y5^X7, 0, }, // 295 + {Y4^X7, Y2^Y5^X6, Y1^X5^Y6, 0, }, // 296 + {Y3^X7, X1^X5^Y5, Y1^Y4^X6, 0, }, // 297 + {X5^Y8, X6^Y7, Y5^X8, Y2^Y6^X7, }, // 298 + {X5^Y8, Y5^X8, X2^Y6^X7, Y2^X6^Y7, }, // 299 + {Y3^X8, X5^Y7, X2^Y5^X7, Y1^X6^Y6, }, // 300 + {Y3^X7, X3^Y7, X1^Y5^X6, Y1^X5^Y6, }, // 301 + {Y3, Y5^X9, X6^Y8, Y6^X8, }, // 302 + {Y3, X6^Y8, Y5^X9, X2^X7^Y7, }, // 303 + {X3, Y3^X9, Y5^X8, Y2^Y6^X7, }, // 304 + {Y2, X3^Y7, Y3^X8, X1^X6^Y6, }, // 305 + {X5^Y8, X6^Y7, Y2^Y6^X7, Z0^X5^Y5, }, // 306 + {X5^Y8, X2^X6^Y7, Y2^Y6^X7, Z0^X5^Y5, }, // 307 + {Y3^X8, Y2^Y5^X7, Y1^X6^Y6, Z0^X5^Y5, }, // 308 + {Y3^X7, Y2^X6^Y6, X1^X5^Y7, Y1^X5^Y5, }, // 309 + {Y3, X5^Y9, X6^Y8, X2^Y6^X8, }, // 310 + {X3, Y3^X8, X5^Y8, X2^Y6^X7, }, // 311 + {Y2, Y3^X8, X3^Y7, X1^Y5^X7, }, // 312 + {Y3, X6^Y8, X2^X7^Y7, Y2^Y6^X8, }, // 313 + {X3, Y3^X8, Y2^Y6^X7, Y1^X6^Y7, }, // 314 + {X3, Y3^X8, Y2^Y6^X7, X1^X6^Y7, }, // 315 + {X6^Y6, X3, Y3, Y6^X10, }, // 316 + {X6^Y6, X2, X3, Y3^X10, }, // 317 + {X3, Y3, X6^Y9, X2^X7^Y8, }, // 318 + {X2, X3, Y3^X9, Y2^Y6^X8, }, // 319 + {X2, X3, Y3^X8, Y2^X7^Y7, }, // 320 + {Z3^X6^Y6, Y2, X3, Y3^X9, }, // 321 + {Z3^X6^Y6, X2, Y2, Y3^X9, }, // 322 + {Z3^X6^Y6, X6^Y8, Y2, X3, }, // 323 + {Z3^X6^Y6, X6^Y8, X2, Y2, }, // 324 + {Z4^Y6^X7, X2, X3, Y3^X9, }, // 325 + {Y1^Y6^X7, X2, X3, Y3^X9, }, // 326 + {Z4^Y6^X7, Z3^X6^Y7, Y2, X3, }, // 327 + {Z4^Y6^X7, Z3^X6^Y7, X2, Y2, }, // 328 + {Y1^Y4^X6, X2^X5^Y5, 0, 0, }, // 329 + {Y1^X5^Y7, X2^X6^Y6, Y2^Y5^X7, 0, }, // 330 + {X1^X5^Y6, Y1^Y4^X7, X2^Y5^X6, 0, }, // 331 + {Y5^X8, Y1^X5^Y8, X2^X6^Y7, Y2^Y6^X7, }, // 332 + {Y3^X8, Y1^X5^Y7, X1^Y5^X7, Y2^X6^Y6, }, // 333 + {Y3^X7, Y1^X4^Y7, Y2^X5^Y6, X1^Y5^X6, }, // 334 + {Y3, X5^Y9, X6^Y8, X2^X7^Y7, }, // 335 + {Y3, X5^Y9, Y1^X6^Y8, X2^X7^Y7, }, // 336 + {X3, Y3^X8, X5^Y7, X1^X6^Y6, }, // 337 + {Y2, Y3^X7, X3^Y7, Y0^X5^Y6, }, // 338 + {Y1^X5^Y8, X2^X6^Y7, Y2^Y6^X7, Z0^X5^Y5, }, // 339 + {X1^X5^Y8, Y2^Y6^X7, X2^X6^Y7, Y1^X5^Y5, }, // 340 + {X1^X5^Y8, X2^X6^Y7, Y2^Y6^X7, Y1^X5^Y5, }, // 341 + {Y3, X5^Y9, Y1^X6^Y8, X2^Y6^X8, }, // 342 + {X3, Y3^X9, Y1^X6^Y7, X1^Y5^X8, }, // 343 + {X3, Y3^X8, Y1^X5^Y8, Y2^X6^Y7, }, // 344 + {X3, Y3, X5^Y10, Y1^X6^Y9, }, // 345 + {Y2, X3, Y3^X8, X5^Y8, }, // 346 + {Y3, Y1^X6^Y8, X2^X7^Y7, Y2^Y6^X8, }, // 347 + {Y3, X1^X6^Y8, Y2^Y6^X8, X2^X7^Y7, }, // 348 + {Y3, X1^X6^Y8, X2^X7^Y7, Y2^Y6^X8, }, // 349 + {X3, Y3, Y6^X9, Y1^X6^Y9, }, // 350 + {X2, X3, Y3^X9, Y1^X6^Y8, }, // 351 + {X2^X6^Y6, Y2, X3, Y3^X9, }, // 352 + {Y1^X6^Y6, X2, Y2, Y3^X8, }, // 353 + {X3, Y3, Y1^X6^Y9, X2^X7^Y8, }, // 354 + {X3, Y3, X1^X6^Y9, Y2^Y7^X8, }, // 355 + {X3, Y3, X1^X6^Y9, X2^X7^Y8, }, // 356 + {Z2^X6^Y6, X2, X3, Y3^X10, }, // 357 + {Y0^X6^Y6, X2, X3, Y3^X9, }, // 358 + {Z2^X6^Y6, X6^Y8, Y2, X3, }, // 359 + {Z2^X6^Y6, Y1^X6^Y8, X2, Y2, }, // 360 + {Y6^X7, X3, Y3, Y1^X7^Y9, }, // 361 + {Y1^Y6^X7, X3, Y3, X1^X7^Y9, }, // 362 + {Y0^Y6^X7, X3, Y3, X1^X7^Y9, }, // 363 + {Z3^Y6^X7, Z2^X6^Y7, X2, X3, }, // 364 + {Z2^Y6^X7, Y0^X6^Y7, X2, X3, }, // 365 + {Y5^X9, X6^Y8, Y6^X8, X7^Y7, }, // 366 + {Y4^X8, X5^Y7, Y5^X7, X6^Y6, }, // 367 + {X4^Y7, Y4^X7, X5^Y6, Y5^X6, }, // 368 + {X5^Y7, Y4^X8, X6^Y6, Y5^X7, }, // 369 + {X3^Y7, Y4^X7, X5^Y6, Y5^X6, }, // 370 + {Y5, X6^Y8, X7^Y7, Y6^X8, }, // 371 + {Y3, Y5^X8, X6^Y7, Y6^X7, }, // 372 + {X3, Y3^X8, X6^Y6, Y5^X7, }, // 373 + {Y2, Y3^X7, X3^Y6, Y5^X6, }, // 374 + {X5, X6^Y8, Y6^X8, X7^Y7, }, // 375 + {Y3, X5^Y8, X6^Y7, Y6^X7, }, // 376 + {X3, Y3^X7, X5^Y7, X6^Y6, }, // 377 + {Y2, X3^Y7, Y3^X6, X5^Y6, }, // 378 + {X6, Y6, X7^Y8, Y7^X8, }, // 379 + {Y3, X6, Y6^X8, X7^Y7, }, // 380 + {X3, Y3, X6^Y7, Y6^X7, }, // 381 + {Y2, X3, Y3^X7, X6^Y6, }, // 382 + {X2, Y2, X3^Y6, Y3^X6, }, // 383 + {Y6, X7^Y8, Y7^X8, X5^Y6, }, // 384 + {X6, X7^Y7, Y6^X8, X5^Y6, }, // 385 + {Y3, X6^Y7, Y6^X7, X5^Y6, }, // 386 + {X3, Y3^X7, X6^Y6, Z0^X5^Y6, }, // 387 + {Y2, Y3^X6, X3^Y6, Z0^X5^Y6, }, // 388 + {Y3, X6, X7^Y7, Y6^X8, }, // 389 + {X2, Y2, Y3^X6, X3^Y6, }, // 390 + {X6^Y6, Y6, X7, Y7^X8, }, // 391 + {X6^Y6, Y3, Y6, X7^Y7, }, // 392 + {X6^Y6, X3, Y3, Y6^X7, }, // 393 + {X6^Y6, Y2, X3, Y3^X7, }, // 394 + {X3^Y6, X2, Y2, Y3^X6, }, // 395 + {X6, X7, Y7^X8, X6^Y6, }, // 396 + {Y3, X6, X7^Y7, X6^Y6, }, // 397 + {X3, Y3, X6^Y7, X6^Y6, }, // 398 + {Y2, X3, Y3^X7, Z0^X6^Y6, }, // 399 + {X2, X3, Y3^X6, Y2^X6^Y6, }, // 400 + {X6^Y6, X6, X7, Y7^X8, }, // 401 + {X6^Y6, Y3, X6, X7^Y7, }, // 402 + {X6^Y6, X3, Y3, X6^Y7, }, // 403 + {Z0^X6^Y6, Y2, X3, Y3^X7, }, // 404 + {Y2^X6^Y6, X2, X3, Y3^X6, }, // 405 + {Z0^X6^Y6, X3^Y8, Y2, Y3, }, // 406 + {Y2^X6^Y6, X3^Y8, X2, Y3, }, // 407 + {Y6^X7, X7, Y7, X6^Y7, }, // 408 + {Y6^X7, Y3, X7, X6^Y7, }, // 409 + {Y6^X7, X3, Y3, X6^Y7, }, // 410 + {Y2^Y6^X7, X3, Y3, Z0^X6^Y7, }, // 411 + {Y2^Y6^X7, X3, Y3, X2^X6^Y7, }, // 412 + {Y2^Y6^X7, Z0^X6^Y7, X3, Y3, }, // 413 + {Y2^Y6^X7, X2^X6^Y7, X3, Y3, }, // 414 + {X5^Y9, Y6^X8, X6^Y8, X7^Y7, }, // 415 + {Y4^X8, X5^Y7, Y5^X7, X2^X6^Y6, }, // 416 + {Y4^X7, X4^Y7, Y5^X6, Y1^X5^Y6, }, // 417 + {Y4^X8, X5^Y7, Y5^X7, Y2^X6^Y6, }, // 418 + {Y4^X7, X3^Y7, Y5^X6, Y1^X5^Y6, }, // 419 + {X5, Y6^X8, X6^Y8, X7^Y7, }, // 420 + {Y3, X5^Y8, Y6^X7, Y2^X6^Y7, }, // 421 + {X3, Y3^X7, X5^Y7, Y2^X6^Y6, }, // 422 + {Y2, Y3^X6, X3^Y7, Y1^X5^Y6, }, // 423 + {X3, Y3^X7, X5^Y7, X2^X6^Y6, }, // 424 + {Y3, X5, X6^Y8, X7^Y7, }, // 425 + {X3, Y3, X5^Y8, X6^Y7, }, // 426 + {X3, Y3, X5^Y8, Y2^X6^Y7, }, // 427 + {Y2, X3, Y3^X6, X5^Y6, }, // 428 + {X2, Y2, Y3^X5, X3^Y6, }, // 429 + {X6, Y6^X8, X7^Y7, X5^Y6, }, // 430 + {Y3, Y6^X7, Y2^X6^Y7, X5^Y6, }, // 431 + {X3, Y3^X7, Y2^X6^Y6, Z0^X5^Y6, }, // 432 + {X3, Y3^X7, Y2^X6^Y6, Y1^X5^Y6, }, // 433 + {X3, Y3, Y6^X7, Y2^X6^Y7, }, // 434 + {X2, X3, Y3^X7, Y2^X6^Y6, }, // 435 + {X6^Y6, X3, Y3, Y2^X6^Y7, }, // 436 + {X3, Y3, Y2^X6^Y7, X6^Y6, }, // 437 + {X3, Y3, X2^X6^Y7, Y2^X6^Y6, }, // 438 + {Y2^X6^Y6, X3, Y3, X2^X6^Y7, }, // 439 + {X6^Y6, X6^Y8, Y3, Y7, }, // 440 + {X6^Y6, Y2^X6^Y8, X3, Y3, }, // 441 + {Y2^X6^Y6, X2^X6^Y8, X3, Y3, }, // 442 + {Y6^X7, Y3, Y7, X6^Y7, }, // 443 + {Y6^X7, X3, Y3, Y2^X6^Y7, }, // 444 + {Y6^X7, X6^Y7, Y3, Y7, }, // 445 + {Y6^X7, Y2^X6^Y7, X3, Y3, }, // 446 + {X5^Y8, Y5^X8, X6^Y7, Y2^Y6^X7, }, // 447 + {X5^Y8, Y5^X8, X2^X6^Y7, Y2^Y6^X7, }, // 448 + {Y4^X8, X5^Y7, X2^X6^Y6, Y1^Y5^X7, }, // 449 + {X4^Y7, Y4^X7, X1^X5^Y6, Y1^Y5^X6, }, // 450 + {Y4^X9, X6^Y7, Y5^X8, Y2^Y6^X7, }, // 451 + {X5^Y7, Y4^X8, X2^Y5^X7, Y1^X6^Y6, }, // 452 + {X3^Y7, Y4^X7, X1^Y5^X6, Y1^X5^Y6, }, // 453 + {Y3, X6^Y7, Y5^X8, Y2^Y6^X7, }, // 454 + {Y3, Y5^X8, X2^Y6^X7, Y2^X6^Y7, }, // 455 + {X3, Y3^X8, X2^Y5^X7, Y1^X6^Y6, }, // 456 + {Y2, Y3^X6, X3^Y6, X1^X5^Y5, }, // 457 + {Y3, X5^Y8, X6^Y7, Y2^Y6^X7, }, // 458 + {Y3, X5^Y8, X2^X6^Y7, Y2^Y6^X7, }, // 459 + {X3, Y3^X8, Y2^Y5^X7, Y1^X6^Y6, }, // 460 + {X3, Y3^X7, Y2^X6^Y6, X1^X5^Y7, }, // 461 + {X3, Y3, X6^Y7, Y2^Y6^X7, }, // 462 + {X3, Y3, X2^X6^Y7, Y2^Y6^X7, }, // 463 + {X2, X3, Y3^X7, Y2^Y5^X6, }, // 464 + {X2, X3, Y3^X6, Y2^X5^Y6, }, // 465 + {Y3, X6^Y7, Y2^Y6^X7, X5^Y6, }, // 466 + {Y3, X2^Y6^X7, Y2^X6^Y7, X5^Y6, }, // 467 + {Y3, X2^Y6^X7, Y2^X6^Y7, Z0^X5^Y6, }, // 468 + {Y3, X2^Y6^X7, Y2^X6^Y7, X1^X5^Y6, }, // 469 + {X3, Y3, X2^Y6^X7, Y2^X6^Y7, }, // 470 + {X6^Y6, X3, Y3, Y2^Y6^X7, }, // 471 + {Y2^X6^Y6, X3, Y3, X2^X6^Y6, }, // 472 + {X3, Y3, Y2^Y6^X7, X6^Y6, }, // 473 + {Y2^Y6^X7, X3, Y3, X6^Y7, }, // 474 + {Y2^Y6^X7, X6^Y7, X3, Y3, }, // 475 + {Y4^X8, X1^X5^Y7, Y1^Y5^X7, X2^X6^Y6, }, // 476 + {Y4^X7, Y0^X4^Y7, X1^X5^Y6, Y1^Y5^X6, }, // 477 + {Y4^X8, Y1^X5^Y7, X1^Y5^X7, Y2^X6^Y6, }, // 478 + {Y3^X7, Y0^X4^Y6, X1^Y4^X6, Y1^X5^Y5, }, // 479 + {Y3, X5^Y8, X2^Y6^X7, Y2^X6^Y7, }, // 480 + {Y3, Y1^X5^Y8, X2^X6^Y7, Y2^Y6^X7, }, // 481 + {X3, Y3^X7, Y1^X5^Y6, X1^Y5^X6, }, // 482 + {X3, Y3^X6, Y1^X4^Y6, Y2^X5^Y5, }, // 483 + {Y3, X1^X5^Y8, Y2^Y6^X7, X2^X6^Y7, }, // 484 + {Y3, X1^X5^Y8, X2^X6^Y7, Y2^Y6^X7, }, // 485 + {X3, Y3, Y1^X5^Y7, X2^X6^Y6, }, // 486 + {X3, Y3, X1^X5^Y7, Y2^X6^Y6, }, // 487 + {X3, Y3, X1^X5^Y7, X2^X6^Y6, }, // 488 + {Y3, X2^Y6^X7, Y1^X6^Y7, Y2^X5^Y6, }, // 489 + {X3, Y3, X2^Y6^X7, Y1^X6^Y7, }, // 490 + {X2^X6^Y6, X3, Y3, Y1^X6^Y6, }, // 491 + {X2^X6^Y6, X3, Y3, Y2^X6^Y6, }, // 492 + {X3, Y3, Y1^X6^Y7, X2^X6^Y6, }, // 493 + {Y2^X6^Y6, X3, Y3, Y1^X6^Y7, }, // 494 + {Y2^X6^Y6, Y1^X6^Y8, X3, Y3, }, // 495 + {Y2^Y6^X7, X3, Y3, Y1^X6^Y7, }, // 496 + {X6^X8^Y8, Y6, X7, X8^Y10, }, // 497 + {X6^X8^Y8, Y3, Y6, X7^Y10, }, // 498 + {X6^X8^Y8, X3, Y3, X7^Y9, }, // 499 + {X6^X8^Y8, Y2, X3, Y3^X10, }, // 500 + {X6^X8^Y8, X2, Y2, X3^Y8, }, // 501 + {Z0^X6^Y6, X6, X7, Y7^X11, }, // 502 + {Z0^X6^Y6, Y3, X6, X7^Y10, }, // 503 + {Z0^X6^Y6, X3, Y3, X6^Y10, }, // 504 + {Z0^X6^Y6, X6^X9^Y9, X7, Y7, }, // 505 + {Z0^X6^Y6, X6^X9^Y9, Y3, X7, }, // 506 + {Z0^X6^Y6, X6^X9^Y9, X3, Y3, }, // 507 + {Z0^X6^Y6, X6^X9^Y9, Y2, X3, }, // 508 + {Z0^X6^Y6, X6^X9^Y9, X2, Y2, }, // 509 + {Z1^Y6^X7, X7, Y7, X8^Y10, }, // 510 + {Z1^Y6^X7, Y3, X7, Y7^X10, }, // 511 + {Z1^Y6^X7, X3, Y3, X7^Y9, }, // 512 + {Z1^Y6^X7, Z0^X6^Y7, X7, Y7, }, // 513 + {Z1^Y6^X7, Z0^X6^Y7, Y3, X7, }, // 514 + {Z1^Y6^X7, Z0^X6^Y7, X3, Y3, }, // 515 + {Y6^X7, S0^X6^Y7, 0, 0, }, // 516 + {Y5^X7, S0^X6^Y6, 0, 0, }, // 517 + {Y5^X6, S0^X5^Y6, 0, 0, }, // 518 + {Y4^X6, S0^X5^Y5, 0, 0, }, // 519 + {Y4^X5, S0^X4^Y5, 0, 0, }, // 520 + {X6^Y8, Y6^X8, S0^X7^Y7, 0, }, // 521 + {X6^Y7, Y5^X8, S0^Y6^X7, 0, }, // 522 + {X5^Y7, Y5^X7, S0^X6^Y6, 0, }, // 523 + {X5^Y6, Y4^X7, S0^Y5^X6, 0, }, // 524 + {X3^Y6, Y4^X6, S0^X5^Y5, 0, }, // 525 + {Y6^X9, X6^Y9, Y7^X8, S0^X7^Y8, }, // 526 + {Y5^X9, X6^Y8, Y6^X8, S0^X7^Y7, }, // 527 + {Y5^X8, X5^Y8, Y6^X7, S0^X6^Y7, }, // 528 + {Y3^X8, X5^Y7, Y5^X7, S0^X6^Y6, }, // 529 + {Y3^X7, X3^Y7, Y5^X6, S0^X5^Y6, }, // 530 + {Y5, X6^Y9, X7^Y8, Y6^X9, }, // 531 + {X3, Y3^X9, X6^Y7, Y5^X8, }, // 532 + {Y2, Y3^X8, X3^Y7, Y5^X7, }, // 533 + {Y6^X9, Y7^X8, S0^X7^Y8, Z0^X5^Y5, }, // 534 + {X6^Y8, Y6^X8, S0^X7^Y7, Z0^X5^Y5, }, // 535 + {X5^Y8, Y6^X7, S0^X6^Y7, Z0^X5^Y5, }, // 536 + {Y3^X7, X5^Y7, S0^X6^Y6, Z0^X5^Y5, }, // 537 + {Y3^X6, X3^Y7, S0^X5^Y6, Z0^X5^Y5, }, // 538 + {X6, Y6, Y7^X10, X7^Y10, }, // 539 + {Y6, X7^Y9, Y7^X9, S0^X8^Y8, }, // 540 + {X6, X7^Y8, Y6^X9, S0^Y7^X8, }, // 541 + {Y3, X6^Y8, Y6^X8, S0^X7^Y7, }, // 542 + {X3, Y3^X8, X6^Y7, S0^Y6^X7, }, // 543 + {Y2, Y3^X7, X3^Y7, S0^X6^Y6, }, // 544 + {X6^X8^Y8, Y6, X7, Y7^X11, }, // 545 + {X6^X8^Y8, X3, Y3, Y6^X10, }, // 546 + {X6^X8^Y8, X2, Y2, Y3^X9, }, // 547 + {X6, X7, Y7^X10, Y8^X9, }, // 548 + {Z0^Y6^X7, X7, Y7, X8^Y10, }, // 549 + {Z0^Y6^X7, Y3, X7, X8^Y9, }, // 550 + {Z0^Y6^X7, X3, Y3, X7^Y9, }, // 551 + {Z0^Y6^X7, Z4^X6^Y7, X7, Y7, }, // 552 + {Z0^Y6^X7, Z4^X6^Y7, Y3, X7, }, // 553 + {Z0^Y6^X7, Z4^X6^Y7, X3, Y3, }, // 554 + {Z0^Y6^X7, Z4^X6^Y7, Y2, X3, }, // 555 + {S0^X6^Y7, S1^Y6^X7, 0, 0, }, // 556 + {S0^Y5^X7, S1^X6^Y6, 0, 0, }, // 557 + {S0^X5^Y6, S1^Y5^X6, 0, 0, }, // 558 + {S0^Y4^X6, S1^X5^Y5, 0, 0, }, // 559 + {S0^X4^Y5, S1^Y4^X5, 0, 0, }, // 560 + {Y5^X9, S0^X7^Y7, S1^Y6^X8, 0, }, // 561 + {Y5^X8, S0^X6^Y7, S1^Y6^X7, 0, }, // 562 + {Y4^X8, S0^X6^Y6, S1^Y5^X7, 0, }, // 563 + {Y4^X7, S0^X5^Y6, S1^Y5^X6, 0, }, // 564 + {Y3^X7, S0^X5^Y5, S1^Y4^X6, 0, }, // 565 + {X6^Y9, Y6^X9, S0^X7^Y8, S1^Y7^X8, }, // 566 + {X6^Y8, Y5^X9, S0^X7^Y7, S1^Y6^X8, }, // 567 + {X5^Y8, Y5^X8, S0^X6^Y7, S1^Y6^X7, }, // 568 + {Y3^X8, X5^Y7, S0^X6^Y6, S1^Y5^X7, }, // 569 + {Y3^X7, X3^Y7, S0^X5^Y6, S1^Y5^X6, }, // 570 + {X6, X7^Y9, Y6^X10, S0^X8^Y8, }, // 571 + {Y5, X6^Y9, Y6^X9, S0^X7^Y8, }, // 572 + {Y3, X6^Y8, Y5^X9, S0^X7^Y7, }, // 573 + {X3, Y3^X9, Y5^X8, S0^X6^Y7, }, // 574 + {Y2, X3^Y7, Y3^X8, S0^X6^Y6, }, // 575 + {Y6^X9, S0^X7^Y8, S1^Y7^X8, Z0^X5^Y5, }, // 576 + {X6^Y8, S0^Y6^X8, S1^X7^Y7, Z0^X5^Y5, }, // 577 + {X5^Y8, S0^X6^Y7, S1^Y6^X7, Z0^X5^Y5, }, // 578 + {Y3^X8, S0^X6^Y6, S1^Y5^X7, Z0^X5^Y5, }, // 579 + {Y3^X6, X3^Y7, S0^X5^Y6, S1^X5^Y5, }, // 580 + {X6, Y6^X10, X7^Y9, S0^Y7^X9, }, // 581 + {X5, X6^Y9, Y6^X9, S0^X7^Y8, }, // 582 + {Y3, X5^Y9, X6^Y8, S0^Y6^X8, }, // 583 + {X3, Y3^X8, X5^Y8, S0^X6^Y7, }, // 584 + {Y2, Y3^X8, X3^Y7, S0^X6^Y6, }, // 585 + {Y6, X7^Y9, S0^X8^Y8, S1^Y7^X9, }, // 586 + {X6, Y6^X9, S0^X7^Y8, S1^Y7^X8, }, // 587 + {Y3, X6^Y8, S0^X7^Y7, S1^Y6^X8, }, // 588 + {X3, Y3^X8, S0^X6^Y7, S1^Y6^X7, }, // 589 + {X6, X7, Y7^X10, S0^X8^Y9, }, // 590 + {Y3, X6, X7^Y9, S0^Y7^X9, }, // 591 + {X3, Y3, X6^Y9, S0^X7^Y8, }, // 592 + {Y2, X3, Y3^X9, S0^X7^Y7, }, // 593 + {Z3^X6^Y6, X6, X7, Y7^X11, }, // 594 + {Z3^X6^Y6, Y3, X6, X7^Y10, }, // 595 + {Z3^X6^Y6, X3, Y3, X6^Y10, }, // 596 + {Z3^X6^Y6, X6^X9^Y9, X7, Y7, }, // 597 + {Z3^X6^Y6, X6^X9^Y9, Y3, X7, }, // 598 + {Z3^X6^Y6, X6^X9^Y9, X3, Y3, }, // 599 + {Z3^X6^Y6, X6^X9^Y9, Y2, X3, }, // 600 + {Z3^X6^Y6, X6^X9^Y9, X2, Y2, }, // 601 + {Z4^Y6^X7, X7, Y7, X8^Y10, }, // 602 + {Z4^Y6^X7, Y3, X7, Y7^X10, }, // 603 + {Z4^Y6^X7, X3, Y3, X7^Y9, }, // 604 + {Z4^Y6^X7, Y2, X3, Y3^X9, }, // 605 + {S1^Y6^X7, X2, Y2, Y3^X8, }, // 606 + {Z4^Y6^X7, Z3^X6^Y7, X7, Y7, }, // 607 + {Z4^Y6^X7, Z3^X6^Y7, Y3, X7, }, // 608 + {Z4^Y6^X7, Z3^X6^Y7, X3, Y3, }, // 609 + {S1^Y6^X7, S2^X6^Y7, 0, 0, }, // 610 + {S1^Y5^X7, S2^X6^Y6, 0, 0, }, // 611 + {S1^Y5^X6, S2^X5^Y6, 0, 0, }, // 612 + {S1^Y4^X6, S2^X5^Y5, 0, 0, }, // 613 + {S1^Y4^X5, S2^X4^Y5, 0, 0, }, // 614 + {S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, 0, }, // 615 + {S0^X6^Y7, S1^Y5^X8, S2^Y6^X7, 0, }, // 616 + {S0^X5^Y7, S1^Y5^X7, S2^X6^Y6, 0, }, // 617 + {S0^X5^Y6, S1^Y4^X7, S2^Y5^X6, 0, }, // 618 + {Y6^X9, S0^X6^Y9, S1^Y7^X8, S2^X7^Y8, }, // 619 + {Y5^X9, S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, }, // 620 + {Y5^X8, S0^X5^Y8, S1^Y6^X7, S2^X6^Y7, }, // 621 + {Y3^X8, S0^X5^Y7, S1^Y5^X7, S2^X6^Y6, }, // 622 + {Y3^X6, X3^Y7, S0^X4^Y6, S1^X5^Y5, }, // 623 + {X6, Y6^X10, S0^X7^Y9, S1^Y7^X9, }, // 624 + {Y5, X6^Y9, S0^X7^Y8, S1^Y6^X9, }, // 625 + {Y3, Y5^X9, S0^X6^Y8, S1^Y6^X8, }, // 626 + {X3, Y3^X9, S0^X6^Y7, S1^Y5^X8, }, // 627 + {Y2, Y3^X8, S0^X5^Y7, S1^Y5^X7, }, // 628 + {S0^X6^Y9, S1^Y7^X8, S2^X7^Y8, Z0^X5^Y5, }, // 629 + {S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, Z0^X5^Y5, }, // 630 + {S0^X5^Y8, S1^Y6^X7, S2^X6^Y7, Z0^X5^Y5, }, // 631 + {Y3^X7, S0^X5^Y7, S1^X6^Y6, S2^X5^Y5, }, // 632 + {X5, X6^Y9, S0^Y6^X9, S1^X7^Y8, }, // 633 + {Y3, X5^Y9, S0^X6^Y8, S1^Y6^X8, }, // 634 + {Y2, Y3^X7, X3^Y8, S0^X5^Y7, }, // 635 + {X6, Y6, Y7^X10, S0^X7^Y10, }, // 636 + {Y3, X6, Y6^X10, S0^X7^Y9, }, // 637 + {X3, Y3, Y6^X9, S0^X6^Y9, }, // 638 + {Y2, X3, Y3^X9, S0^X6^Y8, }, // 639 + {X2, Y2, Y3^X8, S0^X5^Y8, }, // 640 + {Y6, S0^X7^Y9, S1^Y7^X9, S2^X8^Y8, }, // 641 + {X6, S0^X7^Y8, S1^Y6^X9, S2^Y7^X8, }, // 642 + {Y3, S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, }, // 643 + {X3^X8^Y8, X2, Y2, Y3^X9, }, // 644 + {X6, Y7, S0^X7^Y10, S1^Y8^X9, }, // 645 + {Y3, X6, S0^X7^Y9, S1^Y7^X9, }, // 646 + {X3, Y3, S0^X6^Y9, S1^Y7^X8, }, // 647 + {Y2, X3, Y3^X8, S0^X6^Y8, }, // 648 + {Z2^X6^Y6, X6, X7, Y7^X11, }, // 649 + {Z2^X6^Y6, Y3, X6, X7^Y10, }, // 650 + {Z2^X6^Y6, X3, Y3, X6^Y10, }, // 651 + {Z2^X6^Y6, Y2, X3, Y3^X10, }, // 652 + {S2^X6^Y6, X2, Y2, Y3^X8, }, // 653 + {Z2^X6^Y6, X6^X9^Y9, X7, Y7, }, // 654 + {Z2^X6^Y6, X6^X9^Y9, Y3, X7, }, // 655 + {Z2^X6^Y6, X6^X9^Y9, X3, Y3, }, // 656 + {Z2^X6^Y6, X6^X9^Y9, Y2, X3, }, // 657 + {Z2^X6^Y6, X3^X9^Y9, X2, Y2, }, // 658 + {Z3^Y6^X7, X7, Y7, S0^X8^Y10, }, // 659 + {Z3^Y6^X7, Y3, X7, S0^X8^Y9, }, // 660 + {Z3^Y6^X7, X3, Y3, S0^X7^Y9, }, // 661 + {S2^Y6^X7, Y2, X3, Y3^X9, }, // 662 + {S2^Y6^X7, X2, Y2, Y3^X8, }, // 663 + {Z3^Y6^X7, Z2^X6^Y7, X7, Y7, }, // 664 + {Z3^Y6^X7, Z2^X6^Y7, Y3, X7, }, // 665 + {Z3^Y6^X7, Z2^X6^Y7, X3, Y3, }, // 666 + {Z3^Y6^X7, Z2^X6^Y7, Y2, X3, }, // 667 + {Z2^Y6^X7, S2^X6^Y7, X2, Y2, }, // 668 + {Y6, X7^Y8, Y7^X8, Z0^X5^Y6, }, // 669 + {X6, X7^Y7, Y6^X8, Z0^X5^Y6, }, // 670 + {Y3, X6^Y7, Y6^X7, Z0^X5^Y6, }, // 671 + {X6^X8^Y8, Y6, X7, Y7^X8, }, // 672 + {X6^X8^Y8, Y3, Y6, X7^Y7, }, // 673 + {X6^X8^Y8, X3, Y3, Y6^X7, }, // 674 + {X6^X8^Y8, Y2, X3, Y3^X7, }, // 675 + {X3^X8^Y8, X2, Y2, Y3^X6, }, // 676 + {X6, X7, Y7^X8, Z0^X6^Y6, }, // 677 + {Y3, X6, X7^Y7, Z0^X6^Y6, }, // 678 + {X3, Y3, X6^Y7, Z0^X6^Y6, }, // 679 + {Z0^X6^Y6, X6, X7, Y7^X8, }, // 680 + {Z0^X6^Y6, Y3, X6, X7^Y7, }, // 681 + {Z0^X6^Y6, X3, Y3, X6^Y7, }, // 682 + {Z0^X6^Y6, X3^X9^Y9, Y2, Y3, }, // 683 + {Y2^X6^Y6, X3^X9^Y9, X2, Y3, }, // 684 + {Z1^Y6^X7, X7, Y7, Z0^X6^Y7, }, // 685 + {Z1^Y6^X7, Y3, X7, Z0^X6^Y7, }, // 686 + {Z1^Y6^X7, X3, Y3, Z0^X6^Y7, }, // 687 + {Y4^X8, X5^Y7, Y5^X7, S0^X6^Y6, }, // 688 + {Y4^X7, X4^Y7, Y5^X6, S0^X5^Y6, }, // 689 + {Y4^X7, X3^Y7, Y5^X6, S0^X5^Y6, }, // 690 + {X6, Y6^X9, Y7^X8, S0^X7^Y8, }, // 691 + {Y5, X6^Y8, Y6^X8, S0^X7^Y7, }, // 692 + {Y3, Y5^X8, Y6^X7, S0^X6^Y7, }, // 693 + {X3, Y3^X8, Y5^X7, S0^X6^Y6, }, // 694 + {Y2, Y3^X6, X3^Y6, X5^Y5, }, // 695 + {X5, X6^Y8, Y6^X8, S0^X7^Y7, }, // 696 + {Y3, X5^Y8, Y6^X7, S0^X6^Y7, }, // 697 + {X3, Y3^X7, X5^Y7, S0^X6^Y6, }, // 698 + {Y2, Y3^X6, X3^Y7, S0^X5^Y6, }, // 699 + {X6, Y6, Y7^X8, S0^X7^Y8, }, // 700 + {Y3, X6, Y6^X8, S0^X7^Y7, }, // 701 + {X3, Y3, Y6^X7, S0^X6^Y7, }, // 702 + {Y2, X3, Y3^X7, S0^X6^Y6, }, // 703 + {Y6, Y7^X8, S0^X7^Y8, Z0^X5^Y6, }, // 704 + {X6, Y6^X8, S0^X7^Y7, Z0^X5^Y6, }, // 705 + {Y3, Y6^X7, S0^X6^Y7, Z0^X5^Y6, }, // 706 + {X3, Y3^X7, S0^X6^Y6, Z0^X5^Y6, }, // 707 + {Y2, Y3^X6, X3^Y6, S0^X5^Y6, }, // 708 + {X6^X8^Y8, Y6, Y7, S0^X7^Y8, }, // 709 + {X6^X8^Y8, Y3, Y6, S0^X7^Y7, }, // 710 + {S0^X8^Y8, X3, Y3, X6^Y6, }, // 711 + {S0^X8^Y8, Y2, X3, Y3^X6, }, // 712 + {X6, Y7, S0^X7^Y8, Z0^X6^Y6, }, // 713 + {Y3, X6, S0^X7^Y7, Z0^X6^Y6, }, // 714 + {X3, Y3, S0^X6^Y7, Z0^X6^Y6, }, // 715 + {Y2, X3, Y3^X6, S0^X6^Y6, }, // 716 + {Z0^X6^Y6, X6, Y7, S0^X7^Y8, }, // 717 + {Z0^X6^Y6, Y3, X6, S0^X7^Y7, }, // 718 + {Z0^X6^Y6, X3, Y3, S0^X6^Y7, }, // 719 + {S0^X6^Y6, Y2, X3, Y3^X6, }, // 720 + {Z0^X6^Y6, X6^X9^Y9, Y7, S0^X7, }, // 721 + {Z0^X6^Y6, X6^X9^Y9, Y3, S0^X7, }, // 722 + {Z0^X6^Y6, S0^X9^Y9, X3, Y3, }, // 723 + {S0^X6^Y6, X3^X9^Y9, Y2, Y3, }, // 724 + {Z0^Y6^X7, Y7, S0^X7, Z4^X6^Y7, }, // 725 + {Z0^Y6^X7, Y3, S0^X7, Z4^X6^Y7, }, // 726 + {Z0^Y6^X7, X3, Y3, S0^X6^Y7, }, // 727 + {S0^Y6^X7, X3, Y3, Y2^X6^Y7, }, // 728 + {Z0^Y6^X7, Z4^X6^Y7, Y7, S0^X7, }, // 729 + {Z0^Y6^X7, Z4^X6^Y7, Y3, S0^X7, }, // 730 + {Z0^Y6^X7, S0^X6^Y7, X3, Y3, }, // 731 + {S0^Y6^X7, Y2^X6^Y7, X3, Y3, }, // 732 + {Y5^X9, X6^Y8, S0^Y6^X8, S1^X7^Y7, }, // 733 + {Y4^X8, X5^Y7, S0^Y5^X7, S1^X6^Y6, }, // 734 + {X4^Y7, Y4^X7, S0^X5^Y6, S1^Y5^X6, }, // 735 + {X5^Y7, Y4^X8, S0^X6^Y6, S1^Y5^X7, }, // 736 + {X3^Y7, Y4^X7, S0^X5^Y6, S1^Y5^X6, }, // 737 + {Y5, X6^Y8, S0^X7^Y7, S1^Y6^X8, }, // 738 + {Y3, Y5^X8, S0^X6^Y7, S1^Y6^X7, }, // 739 + {X3, Y3^X8, S0^X6^Y6, S1^Y5^X7, }, // 740 + {Y2, Y3^X6, X3^Y6, S0^X5^Y5, }, // 741 + {X5, X6^Y8, S0^Y6^X8, S1^X7^Y7, }, // 742 + {Y3, X5^Y8, S0^X6^Y7, S1^Y6^X7, }, // 743 + {X6, Y6, S0^X7^Y8, S1^Y7^X8, }, // 744 + {Y3, X6, S0^Y6^X8, S1^X7^Y7, }, // 745 + {X3, Y3, S0^X6^Y7, S1^Y6^X7, }, // 746 + {Y2, X3, Y3^X7, S0^Y5^X6, }, // 747 + {Y6, S0^X7^Y8, S1^Y7^X8, Z0^X5^Y6, }, // 748 + {X6, S0^X7^Y7, S1^Y6^X8, Z0^X5^Y6, }, // 749 + {Y3, S0^X6^Y7, S1^Y6^X7, Z0^X5^Y6, }, // 750 + {Y3, X6, S0^X7^Y7, S1^Y6^X8, }, // 751 + {X6^X8^Y8, Y6, S0^X7, S1^Y7^X8, }, // 752 + {X6^X8^Y8, Y3, S0^X7, S1^Y6^X8, }, // 753 + {S1^X8^Y8, X3, Y3, S0^X6^Y6, }, // 754 + {X6, S0^X7, S1^Y7^X8, Z3^X6^Y6, }, // 755 + {Y3, S0^X7, S1^Y6^X8, Z3^X6^Y6, }, // 756 + {X3, Y3, S0^X6^Y7, S1^X6^Y6, }, // 757 + {Z3^X6^Y6, X6, S0^X7, S1^Y7^X8, }, // 758 + {Z3^X6^Y6, Y3, S0^X7, S1^Y6^X8, }, // 759 + {S1^X6^Y6, X3, Y3, S0^X6^Y7, }, // 760 + {Z3^X6^Y6, X6^X9^Y9, S0^X7, S1^Y7, }, // 761 + {Z3^X6^Y6, S1^X9^Y9, Y3, S0^X7, }, // 762 + {S1^X6^Y6, S0^X9^Y9, X3, Y3, }, // 763 + {Z4^Y6^X7, S0^X7, S1^Y7, Z3^X6^Y7, }, // 764 + {S1^Y6^X7, Y3, S0^X7, Z3^X6^Y7, }, // 765 + {S1^Y6^X7, X3, Y3, S0^X6^Y7, }, // 766 + {Z4^Y6^X7, Z3^X6^Y7, S0^X7, S1^Y7, }, // 767 + {S1^Y6^X7, Z3^X6^Y7, Y3, S0^X7, }, // 768 + {S1^Y6^X7, S0^X6^Y7, X3, Y3, }, // 769 + {Y4^X8, S0^X5^Y7, S1^Y5^X7, S2^X6^Y6, }, // 770 + {Y4^X7, S0^X4^Y7, S1^Y5^X6, S2^X5^Y6, }, // 771 + {Y3^X7, S0^X4^Y6, S1^Y4^X6, S2^X5^Y5, }, // 772 + {Y6, S0^X6^Y9, S1^Y7^X8, S2^X7^Y8, }, // 773 + {Y5, S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, }, // 774 + {Y3, Y5^X7, S0^X5^Y7, S1^X6^Y6, }, // 775 + {X3, Y3^X7, S0^X5^Y6, S1^Y5^X6, }, // 776 + {Y2, Y3^X5, X3^Y6, S0^X4^Y5, }, // 777 + {X5, S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, }, // 778 + {Y3, S0^X5^Y8, S1^Y6^X7, S2^X6^Y7, }, // 779 + {X3, Y3^X7, S0^X5^Y7, S1^X6^Y6, }, // 780 + {Y6, S0^X6, S1^Y7^X8, S2^X7^Y8, }, // 781 + {Y3, S0^X6, S1^Y6^X8, S2^X7^Y7, }, // 782 + {X3, Y3, S0^X5^Y7, S1^X6^Y6, }, // 783 + {Y2, X3, Y3^X6, S0^X5^Y6, }, // 784 + {S0^X6, S1^Y7^X8, S2^X7^Y8, Z2^X5^Y6, }, // 785 + {S0^X6, S1^Y6^X8, S2^X7^Y7, Z2^X5^Y6, }, // 786 + {Y3, S0^X6^Y7, S1^Y6^X7, S2^X5^Y6, }, // 787 + {X3, Y3^X7, S0^X6^Y6, S1^X5^Y6, }, // 788 + {S2^X8^Y8, Y6, S0^X6, S1^X7^Y7, }, // 789 + {S2^X8^Y8, Y3, S0^X6, S1^Y6^X7, }, // 790 + {S0^X6, S1^Y7, S2^X7^Y8, Z2^X6^Y6, }, // 791 + {Y3, S0^X6, S1^X7^Y7, S2^X6^Y6, }, // 792 + {Z2^X6^Y6, S0^X6, S1^Y7, S2^X7^Y8, }, // 793 + {S2^X6^Y6, Y3, S0^X6, S1^X7^Y7, }, // 794 + {Z2^X6^Y6, S2^X9^Y9, S0^X6, S1^Y7, }, // 795 + {S2^X6^Y6, S1^X9^Y9, Y3, S0^X6, }, // 796 + {Z2^Y6^X7, S0^X7, S1^Y7, S2^X6^Y7, }, // 797 + {S2^Y6^X7, Y3, S0^X7, S1^X6^Y7, }, // 798 + {Z2^Y6^X7, S2^X6^Y7, S0^X7, S1^Y7, }, // 799 + {S2^Y6^X7, S1^X6^Y7, Y3, S0^X7, }, // 800 + {X2, Z4, Y4, X3, }, // 801 + {X2, Z3, Y4, X3, }, // 802 + {Y3, X3, Z4, X5, }, // 803 + {Y3, X2, Z4, X3, }, // 804 + {Y3, X2, Z3, X3, }, // 805 + {Y2, X2, Y3, X3, }, // 806 + {Z3, X3, Z4, X5^Y5, }, // 807 + {X2, Z4, X3, Y2^X5^Y5, }, // 808 + {X2, Z3, X3, Y2^X5^Y5, }, // 809 + {X2, Y3, X3, Y1^X5^Y5, }, // 810 + {X2, Y3, X3, X1^X5^Y5, }, // 811 + {Y3, Z3, X3, Z4, }, // 812 + {Y2, Y3, X3, Z4, }, // 813 + {Z3, X3, Z4, X5^Y6, }, // 814 + {X2, Z4, X3, Z3^X5^Y6, }, // 815 + {X2, Z3, X3, Z2^X5^Y6, }, // 816 + {X2, Y3, X3, Z2^X5^Y6, }, // 817 + {Z3^X7, Y3, X3, Z4, }, // 818 + {Z3^X7, X2, Z4, X3, }, // 819 + {Z2^X7, X2, Z3, X3, }, // 820 + {Z2^X7, X2, Y3, X3, }, // 821 + {Z3, X3, Z4, Y3^X6^Y6, }, // 822 + {X2, Z4, X3, Y3^X6^Y6, }, // 823 + {X2, Z3, X3, Y3^X6^Y6, }, // 824 + {X2, Y3, X3, Y2^X6^Y6, }, // 825 + {Y3^X6^Y6, Z3, X3, Z4, }, // 826 + {Y3^X6^Y6, X2, Z4, X3, }, // 827 + {Y3^X6^Y6, X2, Z3, X3, }, // 828 + {Y2^X6^Y6, X2, Y3, X3, }, // 829 + {Y3^X6^Y6, Z3^X8, X3, Z4, }, // 830 + {X2^X6^Y6, Z3^X8, Z4, X3, }, // 831 + {X2^X6^Y6, Z2^X8, Z3, X3, }, // 832 + {X2^X6^Y6, Z2^X8, Y3, X3, }, // 833 + {Y3^Y6^X7, X3, Z4, Z3^X6^Y7, }, // 834 + {Y3^Y6^X7, Z4, X3, X2^X6^Y7, }, // 835 + {Y3^Y6^X7, Z3, X3, X2^X6^Y7, }, // 836 + {Y2^Y6^X7, Y3, X3, X2^X6^Y7, }, // 837 + {Y3^Y6^X7, Z3^X6^Y7, X3, Z4, }, // 838 + {Y3^Y6^X7, X2^X6^Y7, Z4, X3, }, // 839 + {Y3^Y6^X7, X2^X6^Y7, Z3, X3, }, // 840 + {Y2^Y6^X7, X2^X6^Y7, Y3, X3, }, // 841 +}; + +const UINT_64 GFX10_SW_PATTERN_NIBBLE4[][4] = +{ + {0, 0, 0, 0, }, // 0 + {Y7^X9, 0, 0, 0, }, // 1 + {Y7^X8, 0, 0, 0, }, // 2 + {Y6^X8, 0, 0, 0, }, // 3 + {Y6^X7, 0, 0, 0, }, // 4 + {Y5^X7, 0, 0, 0, }, // 5 + {X8^Y8, 0, 0, 0, }, // 6 + {X7^Y7, 0, 0, 0, }, // 7 + {X6^Y6, 0, 0, 0, }, // 8 + {X8^Y9, Y8^X9, 0, 0, }, // 9 + {Y7^X9, X8^Y8, 0, 0, }, // 10 + {X7^Y8, Y7^X8, 0, 0, }, // 11 + {Y6^X8, X7^Y7, 0, 0, }, // 12 + {X6^Y7, Y6^X7, 0, 0, }, // 13 + {X5^Y6, 0, 0, 0, }, // 14 + {Z0^X5^Y6, 0, 0, 0, }, // 15 + {X8^Y8, Y7^X9, 0, 0, }, // 16 + {X7^Y7, Y6^X8, 0, 0, }, // 17 + {Y7^X11, X9^Y9, Y8^X10, 0, }, // 18 + {Y7^X10, X8^Y9, Y8^X9, 0, }, // 19 + {Y6^X10, X8^Y8, Y7^X9, 0, }, // 20 + {Y6^X9, X7^Y8, Y7^X8, 0, }, // 21 + {Y3^X9, X7^Y7, Y6^X8, 0, }, // 22 + {Y8^X9, X6^Y6, 0, 0, }, // 23 + {X8^Y8, X6^Y6, 0, 0, }, // 24 + {Y7^X8, X6^Y6, 0, 0, }, // 25 + {X7^Y7, Z0^X6^Y6, 0, 0, }, // 26 + {X6^Y7, Z0^X6^Y6, 0, 0, }, // 27 + {X8^Y10, Y8^X10, X9^Y9, 0, }, // 28 + {X7^Y9, Y7^X9, X8^Y8, 0, }, // 29 + {X6^Y9, X7^Y8, Y7^X8, 0, }, // 30 + {Y3^X8, X6^Y8, X7^Y7, 0, }, // 31 + {X8^Y11, Y8^X11, X9^Y10, Y9^X10, }, // 32 + {Y7^X11, X8^Y10, Y8^X10, X9^Y9, }, // 33 + {X7^Y10, Y7^X10, X8^Y9, Y8^X9, }, // 34 + {Y3^X10, X7^Y9, Y7^X9, X8^Y8, }, // 35 + {X3^Y9, Y3^X9, X7^Y8, Y7^X8, }, // 36 + {X9^Y9, Y8^X10, X6^Y7, 0, }, // 37 + {X8^Y9, Y8^X9, X6^Y7, 0, }, // 38 + {X8^Y8, Y7^X9, X6^Y7, 0, }, // 39 + {X7^Y8, Y7^X8, Z0^X6^Y7, 0, }, // 40 + {Y3^X8, X7^Y7, Z0^X6^Y7, 0, }, // 41 + {X8^Y10, Y7^X11, X9^Y9, Y8^X10, }, // 42 + {Y3^X10, X7^Y9, X8^Y8, Y7^X9, }, // 43 + {Y3^X9, X3^Y9, X7^Y8, Y7^X8, }, // 44 + {Y2^X7^Y7, 0, 0, 0, }, // 45 + {X2^Y6^X7, 0, 0, 0, }, // 46 + {Y1^X6^Y6, 0, 0, 0, }, // 47 + {X7^Y9, X8^Y8, 0, 0, }, // 48 + {Y7^X8, Y2^X7^Y8, 0, 0, }, // 49 + {X6^Y8, X2^X7^Y7, 0, 0, }, // 50 + {X5^Y8, Y1^X6^Y7, 0, 0, }, // 51 + {Y6^X8, Y2^X7^Y7, 0, 0, }, // 52 + {Y6^X7, Y1^X6^Y7, 0, 0, }, // 53 + {X7^Y9, X8^Y8, Y7^X9, 0, }, // 54 + {X7^Y9, Y7^X9, Y2^X8^Y8, 0, }, // 55 + {X6^Y9, X7^Y8, X2^Y7^X8, 0, }, // 56 + {X3^Y9, X6^Y8, Y1^X7^Y7, 0, }, // 57 + {Y2^X7^Y8, X6^Y6, 0, 0, }, // 58 + {X2^X7^Y7, Z0^X6^Y6, 0, 0, }, // 59 + {Y1^X6^Y7, Z0^X6^Y6, 0, 0, }, // 60 + {Y3^X8, X6^Y8, Y1^X7^Y7, 0, }, // 61 + {Y7^X11, Y8^X10, X8^Y10, X9^Y9, }, // 62 + {Y7^X10, X7^Y10, Y8^X9, Y2^X8^Y9, }, // 63 + {Y3^X10, X7^Y9, Y7^X9, X2^X8^Y8, }, // 64 + {Y3^X9, X3^Y9, Y7^X8, Y1^X7^Y8, }, // 65 + {Y7^X9, Y2^X8^Y8, X6^Y7, 0, }, // 66 + {X7^Y8, X2^Y7^X8, Z4^X6^Y7, 0, }, // 67 + {X3^Y8, Y1^X7^Y7, Z4^X6^Y7, 0, }, // 68 + {Y3^X10, X7^Y9, Y7^X9, Y2^X8^Y8, }, // 69 + {Y2^Y6^X8, 0, 0, 0, }, // 70 + {Y1^X6^Y7, 0, 0, 0, }, // 71 + {Y1^Y5^X7, 0, 0, 0, }, // 72 + {X7^Y8, Y2^Y7^X8, 0, 0, }, // 73 + {X2^X7^Y8, Y2^Y7^X8, 0, 0, }, // 74 + {X2^X7^Y7, Y1^Y6^X8, 0, 0, }, // 75 + {X1^X6^Y7, Y1^Y6^X7, 0, 0, }, // 76 + {Y6^X9, Y2^Y7^X8, 0, 0, }, // 77 + {X2^Y7^X8, Y2^X7^Y8, 0, 0, }, // 78 + {X2^Y6^X8, Y1^X7^Y7, 0, 0, }, // 79 + {X1^Y6^X7, Y1^X6^Y7, 0, 0, }, // 80 + {Y6^X10, X2^X8^Y8, Y2^Y7^X9, 0, }, // 81 + {Y6^X9, Y2^Y7^X8, Y1^X7^Y8, 0, }, // 82 + {Y3^X9, X1^X7^Y7, Y1^Y6^X8, 0, }, // 83 + {Y2^Y7^X8, X6^Y6, 0, 0, }, // 84 + {Y1^X7^Y7, Z3^X6^Y6, 0, 0, }, // 85 + {X1^X6^Y8, Y1^X6^Y6, 0, 0, }, // 86 + {X7^Y9, X2^Y7^X9, Y2^X8^Y8, 0, }, // 87 + {X6^Y9, X2^Y7^X8, Y1^X7^Y8, 0, }, // 88 + {X3^Y8, X1^Y6^X8, Y1^X7^Y7, 0, }, // 89 + {X7^Y10, Y7^X10, X8^Y9, Y2^Y8^X9, }, // 90 + {X7^Y10, Y7^X10, X2^X8^Y9, Y2^Y8^X9, }, // 91 + {Y3^X10, X7^Y9, X2^X8^Y8, Y1^Y7^X9, }, // 92 + {X3^Y9, Y3^X9, X1^X7^Y8, Y1^Y7^X8, }, // 93 + {X2^X8^Y8, Y2^Y7^X9, X6^Y7, 0, }, // 94 + {Y2^Y7^X8, Y1^X7^Y8, Z3^X6^Y7, 0, }, // 95 + {Y2^Y7^X8, X1^X7^Y8, Z3^X6^Y7, 0, }, // 96 + {X7^Y10, X8^Y9, Y7^X10, Y2^Y8^X9, }, // 97 + {X7^Y10, Y7^X10, X2^Y8^X9, Y2^X8^Y9, }, // 98 + {Y3^X10, X7^Y9, X2^Y7^X9, Y1^X8^Y8, }, // 99 + {Y3^X9, X3^Y9, X1^Y7^X8, Y1^X7^Y8, }, // 100 + {X1^Y5^X6, 0, 0, 0, }, // 101 + {Y2^Y6^X7, 0, 0, 0, }, // 102 + {X1^Y6^X7, 0, 0, 0, }, // 103 + {Y0^X5^Y7, X1^X6^Y6, 0, 0, }, // 104 + {Z1^X5^Y6, 0, 0, 0, }, // 105 + {Y1^X5^Y6, 0, 0, 0, }, // 106 + {X1^Y6^X8, Y2^X7^Y7, 0, 0, }, // 107 + {Y2^X7^Y7, X1^Y6^X8, 0, 0, }, // 108 + {X7^Y9, X2^X8^Y8, Y2^Y7^X9, 0, }, // 109 + {Y1^X7^Y9, X2^X8^Y8, Y2^Y7^X9, 0, }, // 110 + {X6^Y8, X1^X7^Y7, Y1^Y6^X8, 0, }, // 111 + {X3^Y8, Y0^X6^Y7, X1^Y6^X7, 0, }, // 112 + {X2^X7^Y8, Y1^X6^Y6, 0, 0, }, // 113 + {Y2^Y7^X8, Y1^X6^Y6, 0, 0, }, // 114 + {Y1^X7^Y9, X2^Y7^X9, Y2^X8^Y8, 0, }, // 115 + {Y1^X7^Y8, X1^Y6^X9, Y2^Y7^X8, 0, }, // 116 + {Y1^X6^Y9, Y2^X7^Y8, X1^Y7^X8, 0, }, // 117 + {Y7^X10, Y1^X7^Y10, X2^X8^Y9, Y2^Y8^X9, }, // 118 + {Y3^X10, X1^X7^Y9, Y1^Y7^X9, X2^X8^Y8, }, // 119 + {Y3^X8, X3^Y9, Y0^X6^Y8, X1^X7^Y7, }, // 120 + {Y2^Y7^X9, X2^X8^Y8, Z2^X6^Y7, 0, }, // 121 + {X2^X8^Y8, Y2^Y7^X9, Y1^X6^Y7, 0, }, // 122 + {Y3^X10, Y1^X7^Y9, X1^Y7^X9, Y2^X8^Y8, }, // 123 + {Y3^X10, Y1^X7^Y9, Y2^X8^Y8, X1^Y7^X9, }, // 124 + {Y8^X9, Z0^X6^Y6, 0, 0, }, // 125 + {X8^Y8, Z0^X6^Y6, 0, 0, }, // 126 + {Y7^X8, Z0^X6^Y6, 0, 0, }, // 127 + {X9^Y9, Y8^X10, Z0^X6^Y7, 0, }, // 128 + {X8^Y9, Y8^X9, Z0^X6^Y7, 0, }, // 129 + {X8^Y8, Y7^X9, Z0^X6^Y7, 0, }, // 130 + {S0^X8^Y8, 0, 0, 0, }, // 131 + {S0^Y7^X8, 0, 0, 0, }, // 132 + {S0^X7^Y7, 0, 0, 0, }, // 133 + {S0^Y6^X7, 0, 0, 0, }, // 134 + {S0^X6^Y6, 0, 0, 0, }, // 135 + {Y8^X9, S0^X8^Y9, 0, 0, }, // 136 + {Y7^X9, S0^X8^Y8, 0, 0, }, // 137 + {Y7^X8, S0^X7^Y8, 0, 0, }, // 138 + {Y6^X8, S0^X7^Y7, 0, 0, }, // 139 + {Y6^X7, S0^X6^Y7, 0, 0, }, // 140 + {X8^Y10, Y8^X10, S0^X9^Y9, 0, }, // 141 + {X8^Y9, Y7^X10, S0^Y8^X9, 0, }, // 142 + {X7^Y9, Y7^X9, S0^X8^Y8, 0, }, // 143 + {X7^Y8, Y6^X9, S0^Y7^X8, 0, }, // 144 + {X3^Y8, Y6^X8, S0^X7^Y7, 0, }, // 145 + {S0^X8^Y9, Z0^X6^Y6, 0, 0, }, // 146 + {S0^X8^Y8, Z0^X6^Y6, 0, 0, }, // 147 + {S0^X7^Y8, Z0^X6^Y6, 0, 0, }, // 148 + {S0^X7^Y7, Z0^X6^Y6, 0, 0, }, // 149 + {S0^X6^Y7, Z0^X6^Y6, 0, 0, }, // 150 + {Y7^X10, X8^Y9, S0^Y8^X9, 0, }, // 151 + {X6^Y9, X7^Y8, S0^Y7^X8, 0, }, // 152 + {Y3^X8, X6^Y8, S0^X7^Y7, 0, }, // 153 + {Y8^X11, X8^Y11, Y9^X10, S0^X9^Y10, }, // 154 + {Y7^X11, X8^Y10, Y8^X10, S0^X9^Y9, }, // 155 + {Y7^X10, X7^Y10, Y8^X9, S0^X8^Y9, }, // 156 + {Y3^X10, X7^Y9, Y7^X9, S0^X8^Y8, }, // 157 + {Y3^X9, X3^Y9, Y7^X8, S0^X7^Y8, }, // 158 + {Y8^X10, S0^X9^Y9, Z4^X6^Y7, 0, }, // 159 + {Y7^X10, S0^Y8^X9, Z4^X6^Y7, 0, }, // 160 + {Y7^X9, S0^X8^Y8, Z4^X6^Y7, 0, }, // 161 + {X7^Y8, S0^Y7^X8, Z4^X6^Y7, 0, }, // 162 + {X3^Y8, S0^X7^Y7, Z4^X6^Y7, 0, }, // 163 + {S1^Y7^X9, 0, 0, 0, }, // 164 + {S1^Y7^X8, 0, 0, 0, }, // 165 + {S1^Y6^X8, 0, 0, 0, }, // 166 + {S1^Y6^X7, 0, 0, 0, }, // 167 + {S1^Y5^X7, 0, 0, 0, }, // 168 + {S1^X8^Y8, 0, 0, 0, }, // 169 + {S1^X7^Y7, 0, 0, 0, }, // 170 + {S0^X8^Y9, S1^Y8^X9, 0, 0, }, // 171 + {S0^Y7^X9, S1^X8^Y8, 0, 0, }, // 172 + {S0^X7^Y8, S1^Y7^X8, 0, 0, }, // 173 + {S0^Y6^X8, S1^X7^Y7, 0, 0, }, // 174 + {S0^X6^Y7, S1^Y6^X7, 0, 0, }, // 175 + {S0^X8^Y8, S1^Y7^X9, 0, 0, }, // 176 + {S0^X7^Y7, S1^Y6^X8, 0, 0, }, // 177 + {Y7^X11, S0^X9^Y9, S1^Y8^X10, 0, }, // 178 + {Y7^X10, S0^X8^Y9, S1^Y8^X9, 0, }, // 179 + {Y6^X10, S0^X8^Y8, S1^Y7^X9, 0, }, // 180 + {Y6^X9, S0^X7^Y8, S1^Y7^X8, 0, }, // 181 + {Y3^X9, S0^X7^Y7, S1^Y6^X8, 0, }, // 182 + {S1^Y8^X9, Z3^X6^Y6, 0, 0, }, // 183 + {S1^X8^Y8, Z3^X6^Y6, 0, 0, }, // 184 + {S1^Y7^X8, Z3^X6^Y6, 0, 0, }, // 185 + {S1^Y6^X8, Z3^X6^Y6, 0, 0, }, // 186 + {S0^X6^Y7, S1^X6^Y6, 0, 0, }, // 187 + {X8^Y10, S0^Y8^X10, S1^X9^Y9, 0, }, // 188 + {X7^Y9, S0^Y7^X9, S1^X8^Y8, 0, }, // 189 + {X6^Y9, S0^X7^Y8, S1^Y7^X8, 0, }, // 190 + {X3^Y8, S0^X7^Y7, S1^Y6^X8, 0, }, // 191 + {X8^Y11, Y8^X11, S0^X9^Y10, S1^Y9^X10, }, // 192 + {Y7^X11, X8^Y10, S0^Y8^X10, S1^X9^Y9, }, // 193 + {X7^Y10, Y7^X10, S0^X8^Y9, S1^Y8^X9, }, // 194 + {Y3^X10, X7^Y9, S0^Y7^X9, S1^X8^Y8, }, // 195 + {X3^Y9, Y3^X9, S0^X7^Y8, S1^Y7^X8, }, // 196 + {S0^X9^Y9, S1^Y8^X10, Z3^X6^Y7, 0, }, // 197 + {S0^X8^Y9, S1^Y8^X9, Z3^X6^Y7, 0, }, // 198 + {S0^X8^Y8, S1^Y7^X9, Z3^X6^Y7, 0, }, // 199 + {S0^X7^Y8, S1^Y7^X8, Z3^X6^Y7, 0, }, // 200 + {X3^Y8, S0^X7^Y7, Z3^X6^Y7, 0, }, // 201 + {X8^Y10, Y7^X11, S0^X9^Y9, S1^Y8^X10, }, // 202 + {Y3^X10, X7^Y9, S0^X8^Y8, S1^Y7^X9, }, // 203 + {Y3^X9, X3^Y9, S0^X7^Y8, S1^Y7^X8, }, // 204 + {S2^X8^Y8, 0, 0, 0, }, // 205 + {S2^Y7^X8, 0, 0, 0, }, // 206 + {S2^X7^Y7, 0, 0, 0, }, // 207 + {S2^Y6^X7, 0, 0, 0, }, // 208 + {S2^X6^Y6, 0, 0, 0, }, // 209 + {S1^X6^Y6, 0, 0, 0, }, // 210 + {S1^Y8^X9, S2^X8^Y9, 0, 0, }, // 211 + {S1^Y7^X9, S2^X8^Y8, 0, 0, }, // 212 + {S1^Y7^X8, S2^X7^Y8, 0, 0, }, // 213 + {S1^Y6^X8, S2^X7^Y7, 0, 0, }, // 214 + {S1^Y6^X7, S2^X6^Y7, 0, 0, }, // 215 + {Z2^X5^Y6, 0, 0, 0, }, // 216 + {S1^X5^Y6, 0, 0, 0, }, // 217 + {S0^X8^Y10, S1^Y8^X10, S2^X9^Y9, 0, }, // 218 + {S0^X8^Y9, S1^Y7^X10, S2^Y8^X9, 0, }, // 219 + {S0^X7^Y9, S1^Y7^X9, S2^X8^Y8, 0, }, // 220 + {S0^X7^Y8, S1^Y6^X9, S2^Y7^X8, 0, }, // 221 + {S0^X6^Y8, S1^Y6^X8, S2^X7^Y7, 0, }, // 222 + {S2^X8^Y9, Z2^X6^Y6, 0, 0, }, // 223 + {S2^X8^Y8, Z2^X6^Y6, 0, 0, }, // 224 + {S2^X7^Y8, Z2^X6^Y6, 0, 0, }, // 225 + {S1^X7^Y7, S2^X6^Y6, 0, 0, }, // 226 + {S0^Y7^X10, S1^X8^Y9, S2^Y8^X9, 0, }, // 227 + {X3^Y9, S0^X6^Y8, S1^X7^Y7, 0, }, // 228 + {Y8^X11, S0^X8^Y11, S1^Y9^X10, S2^X9^Y10, }, // 229 + {Y7^X11, S0^X8^Y10, S1^Y8^X10, S2^X9^Y9, }, // 230 + {Y7^X10, S0^X7^Y10, S1^Y8^X9, S2^X8^Y9, }, // 231 + {Y3^X10, S0^X7^Y9, S1^Y7^X9, S2^X8^Y8, }, // 232 + {Y3^X9, S0^X6^Y9, S1^Y7^X8, S2^X7^Y8, }, // 233 + {S1^Y8^X10, S2^X9^Y9, Z2^X6^Y7, 0, }, // 234 + {S1^Y7^X10, S2^Y8^X9, Z2^X6^Y7, 0, }, // 235 + {S1^Y7^X9, S2^X8^Y8, Z2^X6^Y7, 0, }, // 236 + {S0^X7^Y8, S1^Y7^X8, Z2^X6^Y7, 0, }, // 237 + {X3^Y8, S0^X7^Y7, S1^X6^Y7, 0, }, // 238 +}; + +const UINT_8 DCC_64K_R_X_PATIDX[] = +{ + 0, // 1 pipes 1 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 1, // 1 pipes 2 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 2, // 1 pipes 4 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 3, // 1 pipes 8 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 4, // 1 pipes 16 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 5, // 2 pipes 1 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 6, // 2 pipes 2 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 2, // 2 pipes 4 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 3, // 2 pipes 8 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 4, // 2 pipes 16 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 7, // 4+ pipes 1 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 6, // 4+ pipes 2 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 2, // 4+ pipes 4 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 3, // 4+ pipes 8 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 4, // 4+ pipes 16 bpe ua @ SW_64K_R_X 1xaa @ Navi1x + 0, // 1 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 1, // 1 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 2, // 1 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 3, // 1 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 4, // 1 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 8, // 2 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 9, // 2 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 10, // 2 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 11, // 2 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 12, // 2 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 13, // 4 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 14, // 4 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 15, // 4 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 16, // 4 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 17, // 4 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 18, // 8 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 19, // 8 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 20, // 8 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 21, // 8 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 22, // 8 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 23, // 16 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 24, // 16 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 25, // 16 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 26, // 16 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 27, // 16 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 28, // 32 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 29, // 32 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 30, // 32 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 31, // 32 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 32, // 32 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 33, // 64 pipes 1 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 34, // 64 pipes 2 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 35, // 64 pipes 4 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 36, // 64 pipes 8 bpe pa @ SW_64K_R_X 1xaa @ Navi1x + 37, // 64 pipes 16 bpe pa @ SW_64K_R_X 1xaa @ Navi1x +}; + +const UINT_8 HTILE_PATIDX[] = +{ + 0, // 1xaa ua @ HTILE_64K @ Navi1x + 0, // 2xaa ua @ HTILE_64K @ Navi1x + 0, // 4xaa ua @ HTILE_64K @ Navi1x + 0, // 8xaa ua @ HTILE_64K @ Navi1x + 0, // 1 pipes 1xaa pa @ HTILE_64K @ Navi1x + 0, // 1 pipes 2xaa pa @ HTILE_64K @ Navi1x + 0, // 1 pipes 4xaa pa @ HTILE_64K @ Navi1x + 0, // 1 pipes 8xaa pa @ HTILE_64K @ Navi1x + 1, // 2 pipes 1xaa pa @ HTILE_64K @ Navi1x + 1, // 2 pipes 2xaa pa @ HTILE_64K @ Navi1x + 1, // 2 pipes 4xaa pa @ HTILE_64K @ Navi1x + 1, // 2 pipes 8xaa pa @ HTILE_64K @ Navi1x + 2, // 4 pipes 1xaa pa @ HTILE_64K @ Navi1x + 2, // 4 pipes 2xaa pa @ HTILE_64K @ Navi1x + 2, // 4 pipes 4xaa pa @ HTILE_64K @ Navi1x + 2, // 4 pipes 8xaa pa @ HTILE_64K @ Navi1x + 3, // 8 pipes 1xaa pa @ HTILE_64K @ Navi1x + 3, // 8 pipes 2xaa pa @ HTILE_64K @ Navi1x + 3, // 8 pipes 4xaa pa @ HTILE_64K @ Navi1x + 3, // 8 pipes 8xaa pa @ HTILE_64K @ Navi1x + 4, // 16 pipes 1xaa pa @ HTILE_64K @ Navi1x + 4, // 16 pipes 2xaa pa @ HTILE_64K @ Navi1x + 4, // 16 pipes 4xaa pa @ HTILE_64K @ Navi1x + 5, // 16 pipes 8xaa pa @ HTILE_64K @ Navi1x + 6, // 32 pipes 1xaa pa @ HTILE_64K @ Navi1x + 6, // 32 pipes 2xaa pa @ HTILE_64K @ Navi1x + 7, // 32 pipes 4xaa pa @ HTILE_64K @ Navi1x + 8, // 32 pipes 8xaa pa @ HTILE_64K @ Navi1x + 9, // 64 pipes 1xaa pa @ HTILE_64K @ Navi1x + 10, // 64 pipes 2xaa pa @ HTILE_64K @ Navi1x + 11, // 64 pipes 4xaa pa @ HTILE_64K @ Navi1x + 12, // 64 pipes 8xaa pa @ HTILE_64K @ Navi1x +}; + +const UINT_8 CMASK_64K_PATIDX[] = +{ + 0, // 1 bpe ua @ CMASK_64K @ Navi1x + 0, // 2 bpe ua @ CMASK_64K @ Navi1x + 0, // 4 bpe ua @ CMASK_64K @ Navi1x + 0, // 8 bpe ua @ CMASK_64K @ Navi1x + 0, // 1 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 0, // 1 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 0, // 1 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 0, // 1 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 1, // 2 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 1, // 2 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 1, // 2 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 1, // 2 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 2, // 4 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 2, // 4 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 2, // 4 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 2, // 4 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 3, // 8 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 3, // 8 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 3, // 8 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 3, // 8 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 4, // 16 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 4, // 16 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 4, // 16 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 4, // 16 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 5, // 32 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 5, // 32 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 5, // 32 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 5, // 32 pipes 8 bpe pa @ CMASK_64K @ Navi1x + 6, // 64 pipes 1 bpe pa @ CMASK_64K @ Navi1x + 6, // 64 pipes 2 bpe pa @ CMASK_64K @ Navi1x + 6, // 64 pipes 4 bpe pa @ CMASK_64K @ Navi1x + 7, // 64 pipes 8 bpe pa @ CMASK_64K @ Navi1x +}; + +const UINT_8 DCC_64K_R_X_RBPLUS_PATIDX[] = +{ + 0, // 1 bpe ua @ SW_64K_R_X 1xaa @ RbPlus + 1, // 2 bpe ua @ SW_64K_R_X 1xaa @ RbPlus + 2, // 4 bpe ua @ SW_64K_R_X 1xaa @ RbPlus + 3, // 8 bpe ua @ SW_64K_R_X 1xaa @ RbPlus + 4, // 16 bpe ua @ SW_64K_R_X 1xaa @ RbPlus + 0, // 1 pipes (1 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 1, // 1 pipes (1 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 2, // 1 pipes (1 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 3, // 1 pipes (1 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 4, // 1 pipes (1 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 38, // 2 pipes (1-2 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 39, // 2 pipes (1-2 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 40, // 2 pipes (1-2 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 41, // 2 pipes (1-2 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 42, // 2 pipes (1-2 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 43, // 4 pipes (1-2 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 44, // 4 pipes (1-2 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 45, // 4 pipes (1-2 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 46, // 4 pipes (1-2 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 47, // 4 pipes (1-2 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 48, // 8 pipes (2 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 49, // 8 pipes (2 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 50, // 8 pipes (2 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 51, // 8 pipes (2 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 52, // 8 pipes (2 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 53, // 4 pipes (4 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 54, // 4 pipes (4 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 55, // 4 pipes (4 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 56, // 4 pipes (4 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 57, // 4 pipes (4 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 58, // 8 pipes (4 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 59, // 8 pipes (4 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 60, // 8 pipes (4 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 61, // 8 pipes (4 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 62, // 8 pipes (4 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 63, // 16 pipes (4 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 64, // 16 pipes (4 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 65, // 16 pipes (4 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 66, // 16 pipes (4 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 67, // 16 pipes (4 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 68, // 8 pipes (8 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 69, // 8 pipes (8 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 70, // 8 pipes (8 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 71, // 8 pipes (8 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 72, // 8 pipes (8 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 73, // 16 pipes (8 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 74, // 16 pipes (8 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 75, // 16 pipes (8 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 76, // 16 pipes (8 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 77, // 16 pipes (8 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 78, // 32 pipes (8 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 79, // 32 pipes (8 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 80, // 32 pipes (8 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 81, // 32 pipes (8 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 82, // 32 pipes (8 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 83, // 16 pipes (16 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 84, // 16 pipes (16 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 85, // 16 pipes (16 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 86, // 16 pipes (16 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 87, // 16 pipes (16 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 88, // 32 pipes (16 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 89, // 32 pipes (16 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 90, // 32 pipes (16 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 91, // 32 pipes (16 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 92, // 32 pipes (16 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 93, // 64 pipes (16 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 94, // 64 pipes (16 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 95, // 64 pipes (16 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 96, // 64 pipes (16 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 97, // 64 pipes (16 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 98, // 32 pipes (32 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 99, // 32 pipes (32 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 100, // 32 pipes (32 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 101, // 32 pipes (32 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 102, // 32 pipes (32 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 103, // 64 pipes (32 PKRs) 1 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 104, // 64 pipes (32 PKRs) 2 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 105, // 64 pipes (32 PKRs) 4 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 106, // 64 pipes (32 PKRs) 8 bpe pa @ SW_64K_R_X 1xaa @ RbPlus + 107, // 64 pipes (32 PKRs) 16 bpe pa @ SW_64K_R_X 1xaa @ RbPlus +}; + +const UINT_8 HTILE_RBPLUS_PATIDX[] = +{ + 0, // 1xaa ua @ HTILE_64K @ RbPlus + 0, // 2xaa ua @ HTILE_64K @ RbPlus + 0, // 4xaa ua @ HTILE_64K @ RbPlus + 0, // 8xaa ua @ HTILE_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (1-2 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (1-2 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (1-2 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (1-2 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 14, // 4 pipes (1-2 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 14, // 4 pipes (1-2 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 14, // 4 pipes (1-2 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 14, // 4 pipes (1-2 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 15, // 8 pipes (1-2 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 15, // 8 pipes (1-2 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 15, // 8 pipes (1-2 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 15, // 8 pipes (1-2 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (4 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (4 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (4 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 13, // 2 pipes (4 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 16, // 4 pipes (4 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 16, // 4 pipes (4 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 16, // 4 pipes (4 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 16, // 4 pipes (4 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 17, // 8 pipes (4 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 17, // 8 pipes (4 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 17, // 8 pipes (4 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 17, // 8 pipes (4 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 18, // 16 pipes (4 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 18, // 16 pipes (4 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 18, // 16 pipes (4 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 18, // 16 pipes (4 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 19, // 4 pipes (8 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 19, // 4 pipes (8 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 19, // 4 pipes (8 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 19, // 4 pipes (8 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 20, // 8 pipes (8 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 20, // 8 pipes (8 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 20, // 8 pipes (8 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 20, // 8 pipes (8 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 21, // 16 pipes (8 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 21, // 16 pipes (8 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 21, // 16 pipes (8 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 21, // 16 pipes (8 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 22, // 32 pipes (8 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 22, // 32 pipes (8 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 22, // 32 pipes (8 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 22, // 32 pipes (8 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 23, // 8 pipes (16 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 23, // 8 pipes (16 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 23, // 8 pipes (16 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 23, // 8 pipes (16 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 24, // 16 pipes (16 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 24, // 16 pipes (16 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 24, // 16 pipes (16 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 24, // 16 pipes (16 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 25, // 32 pipes (16 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 25, // 32 pipes (16 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 25, // 32 pipes (16 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 25, // 32 pipes (16 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 26, // 64 pipes (16 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 26, // 64 pipes (16 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 26, // 64 pipes (16 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 26, // 64 pipes (16 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 28, // 32 pipes (32 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 28, // 32 pipes (32 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 28, // 32 pipes (32 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 28, // 32 pipes (32 PKRs) 8xaa pa @ HTILE_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 1xaa pa @ HTILE_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 2xaa pa @ HTILE_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 4xaa pa @ HTILE_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 8xaa pa @ HTILE_64K @ RbPlus +}; + +const UINT_8 CMASK_64K_RBPLUS_PATIDX[] = +{ + 0, // 1 bpe ua @ CMASK_64K @ RbPlus + 0, // 2 bpe ua @ CMASK_64K @ RbPlus + 0, // 4 bpe ua @ CMASK_64K @ RbPlus + 0, // 8 bpe ua @ CMASK_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 0, // 1 pipes (1-2 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (1-2 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (1-2 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (1-2 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (1-2 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 9, // 4 pipes (1-2 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 9, // 4 pipes (1-2 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 9, // 4 pipes (1-2 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 9, // 4 pipes (1-2 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 10, // 8 pipes (1-2 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 10, // 8 pipes (1-2 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 10, // 8 pipes (1-2 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 10, // 8 pipes (1-2 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (4 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (4 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (4 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 8, // 2 pipes (4 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 11, // 4 pipes (4 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 11, // 4 pipes (4 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 11, // 4 pipes (4 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 11, // 4 pipes (4 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 12, // 8 pipes (4 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 12, // 8 pipes (4 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 12, // 8 pipes (4 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 12, // 8 pipes (4 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 13, // 16 pipes (4 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 13, // 16 pipes (4 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 13, // 16 pipes (4 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 13, // 16 pipes (4 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 14, // 4 pipes (8 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 14, // 4 pipes (8 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 14, // 4 pipes (8 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 14, // 4 pipes (8 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 15, // 8 pipes (8 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 15, // 8 pipes (8 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 15, // 8 pipes (8 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 16, // 8 pipes (8 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 15, // 16 pipes (8 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 15, // 16 pipes (8 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 15, // 16 pipes (8 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 17, // 16 pipes (8 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 18, // 32 pipes (8 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 18, // 32 pipes (8 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 18, // 32 pipes (8 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 19, // 32 pipes (8 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 20, // 8 pipes (16 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 20, // 8 pipes (16 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 20, // 8 pipes (16 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 21, // 8 pipes (16 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 22, // 16 pipes (16 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 22, // 16 pipes (16 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 22, // 16 pipes (16 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 23, // 16 pipes (16 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 22, // 32 pipes (16 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 22, // 32 pipes (16 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 22, // 32 pipes (16 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 24, // 32 pipes (16 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 25, // 64 pipes (16 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 25, // 64 pipes (16 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 25, // 64 pipes (16 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 32, // 64 pipes (16 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 27, // 16 pipes (32 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 28, // 16 pipes (32 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 29, // 32 pipes (32 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 29, // 32 pipes (32 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 29, // 32 pipes (32 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 33, // 32 pipes (32 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 1 bpe pa @ CMASK_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 2 bpe pa @ CMASK_64K @ RbPlus + 29, // 64 pipes (32 PKRs) 4 bpe pa @ CMASK_64K @ RbPlus + 34, // 64 pipes (32 PKRs) 8 bpe pa @ CMASK_64K @ RbPlus +}; + +const UINT_8 CMASK_VAR_RBPLUS_PATIDX[] = +{ + 0, // 1 bpe ua @ CMASK_VAR @ RbPlus + 0, // 2 bpe ua @ CMASK_VAR @ RbPlus + 0, // 4 bpe ua @ CMASK_VAR @ RbPlus + 0, // 8 bpe ua @ CMASK_VAR @ RbPlus + 0, // 1 pipes (1-2 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 0, // 1 pipes (1-2 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 0, // 1 pipes (1-2 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 0, // 1 pipes (1-2 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (1-2 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (1-2 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (1-2 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (1-2 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 9, // 4 pipes (1-2 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 9, // 4 pipes (1-2 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 9, // 4 pipes (1-2 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 9, // 4 pipes (1-2 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 10, // 8 pipes (1-2 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 10, // 8 pipes (1-2 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 10, // 8 pipes (1-2 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 10, // 8 pipes (1-2 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (4 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (4 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (4 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 8, // 2 pipes (4 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 11, // 4 pipes (4 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 11, // 4 pipes (4 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 11, // 4 pipes (4 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 11, // 4 pipes (4 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 12, // 8 pipes (4 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 12, // 8 pipes (4 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 12, // 8 pipes (4 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 12, // 8 pipes (4 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 13, // 16 pipes (4 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 13, // 16 pipes (4 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 13, // 16 pipes (4 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 13, // 16 pipes (4 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 14, // 4 pipes (8 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 14, // 4 pipes (8 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 14, // 4 pipes (8 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 14, // 4 pipes (8 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 15, // 8 pipes (8 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 15, // 8 pipes (8 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 15, // 8 pipes (8 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 16, // 8 pipes (8 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 15, // 16 pipes (8 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 15, // 16 pipes (8 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 15, // 16 pipes (8 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 17, // 16 pipes (8 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 18, // 32 pipes (8 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 18, // 32 pipes (8 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 18, // 32 pipes (8 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 19, // 32 pipes (8 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 20, // 8 pipes (16 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 20, // 8 pipes (16 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 20, // 8 pipes (16 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 21, // 8 pipes (16 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 22, // 16 pipes (16 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 22, // 16 pipes (16 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 22, // 16 pipes (16 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 23, // 16 pipes (16 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 22, // 32 pipes (16 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 22, // 32 pipes (16 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 22, // 32 pipes (16 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 24, // 32 pipes (16 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 25, // 64 pipes (16 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 25, // 64 pipes (16 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 25, // 64 pipes (16 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 26, // 64 pipes (16 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 27, // 16 pipes (32 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 27, // 16 pipes (32 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 27, // 16 pipes (32 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 28, // 16 pipes (32 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 29, // 32 pipes (32 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 29, // 32 pipes (32 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 29, // 32 pipes (32 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 30, // 32 pipes (32 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus + 29, // 64 pipes (32 PKRs) 1 bpe pa @ CMASK_VAR @ RbPlus + 29, // 64 pipes (32 PKRs) 2 bpe pa @ CMASK_VAR @ RbPlus + 29, // 64 pipes (32 PKRs) 4 bpe pa @ CMASK_VAR @ RbPlus + 31, // 64 pipes (32 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus +}; + +const UINT_64 DCC_64K_R_X_SW_PATTERN[][17] = +{ + {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0, }, //0 + {0, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0, }, //1 + {0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, 0, 0, 0, 0, }, //2 + {0, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, 0, 0, 0, 0, }, //3 + {0, X2, Y2, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, }, //4 + {0, X3^Y3, X4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0, }, //5 + {0, X3^Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0, }, //6 + {0, X3^Y3, X4^Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0, }, //7 + {0, X4, X5, Y5, X6, Y6, X7, Y7, X8, Z0^X3^Y3, Y8, X9, Y9, 0, 0, 0, 0, }, //8 + {0, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0^X3^Y3, X8, Y8, X9, 0, 0, 0, 0, }, //9 + {0, X3, Y4, X4, X5, Y5, X6, Y6, X7, Z0^X3^Y3, Y7, X8, Y8, 0, 0, 0, 0, }, //10 + {0, Y2, X3, Y4, X4, X5, Y5, X6, Y6, Z0^X3^Y3, X7, Y7, X8, 0, 0, 0, 0, }, //11 + {0, X2, Y2, X3, Y4, X4, X5, Y5, X6, Z0^X3^Y3, Y6, X7, Y7, 0, 0, 0, 0, }, //12 + {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Z1^X3^Y3, Z0^X4^Y4, X9, Y9, 0, 0, 0, 0, }, //13 + {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1^X3^Y3, Z0^X4^Y4, Y8, X9, 0, 0, 0, 0, }, //14 + {0, X3, Y4, X5, Y5, X6, Y6, X7, Y7, Z1^X3^Y3, Z0^X4^Y4, X8, Y8, 0, 0, 0, 0, }, //15 + {0, Y2, X3, Y4, X5, Y5, X6, Y6, X7, Z1^X3^Y3, Z0^X4^Y4, Y7, X8, 0, 0, 0, 0, }, //16 + {0, X2, Y2, X3, Y4, X5, Y5, X6, Y6, Z1^X3^Y3, Z0^X4^Y4, X7, Y7, 0, 0, 0, 0, }, //17 + {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y9, 0, 0, 0, 0, }, //18 + {0, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X9, 0, 0, 0, 0, }, //19 + {0, X3, Y4, Y5, X6, Y6, X7, Y7, X8, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y8, 0, 0, 0, 0, }, //20 + {0, Y2, X3, Y4, Y5, X6, Y6, X7, Y7, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X8, 0, 0, 0, 0, }, //21 + {0, X2, Y2, X3, Y4, Y5, X6, Y6, X7, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y7, 0, 0, 0, 0, }, //22 + {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //23 + {0, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //24 + {0, X3, Y4, X6, Y6, X7, Y7, X8, Y8, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //25 + {0, Y2, X3, Y4, X6, Y6, X7, Y7, X8, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //26 + {0, X2, Y2, X3, Y4, X6, Y6, X7, Y7, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //27 + {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //28 + {0, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //29 + {0, X3, Y4, Y6, X7, Y7, X8, Y8, X9, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //30 + {0, Y2, X3, Y4, Y6, X7, Y7, X8, Y8, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //31 + {0, X2, X3, Y4, Y6, X7, Y7, Y2, X8, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, Y2^X6^Y6, 0, 0, 0, }, //32 + {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //33 + {0, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //34 + {0, X3, Y4, X7, Y7, X8, Y8, X9, Y9, X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //35 + {0, X3, Y4, X7, Y7, X8, Y8, Y2, X9, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y2^Y6^X7, Z0^X6^Y7, 0, 0, }, //36 + {0, X3, Y4, X7, Y7, X8, Y8, X2, Y2, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, Y2^Y6^X7, X2^X6^Y7, 0, 0, }, //37 + {0, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z0^X4^Y4, Y8, X9, Y9, 0, 0, 0, 0, }, //38 + {0, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0^X4^Y4, X8, Y8, X9, 0, 0, 0, 0, }, //39 + {0, X3, Y3, Y4, X5, Y5, X6, Y6, X7, Z0^X4^Y4, Y7, X8, Y8, 0, 0, 0, 0, }, //40 + {0, Y2, X3, Y3, Y4, X5, Y5, X6, Y6, Z0^X4^Y4, X7, Y7, X8, 0, 0, 0, 0, }, //41 + {0, X2, Y2, X3, Y3, Y4, X5, Y5, X6, Z0^X4^Y4, Y6, X7, Y7, 0, 0, 0, 0, }, //42 + {0, X5, Y5, X6, Y6, X7, Y7, X8, Y8, Y4^X5^Y5, Z0^X4^Y4, X9, Y9, 0, 0, 0, 0, }, //43 + {0, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4^X5^Y5, Z0^X4^Y4, Y8, X9, 0, 0, 0, 0, }, //44 + {0, X3, Y3, X5, Y5, X6, Y6, X7, Y7, Y4^X5^Y5, Z0^X4^Y4, X8, Y8, 0, 0, 0, 0, }, //45 + {0, Y2, X3, Y3, X5, Y5, X6, Y6, X7, Y4^X5^Y5, Z0^X4^Y4, Y7, X8, 0, 0, 0, 0, }, //46 + {0, X2, Y2, X3, Y3, X5, Y5, X6, Y6, Y4^X5^Y5, Z0^X4^Y4, X7, Y7, 0, 0, 0, 0, }, //47 + {0, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y9, 0, 0, 0, 0, }, //48 + {0, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, X9, 0, 0, 0, 0, }, //49 + {0, X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y8, 0, 0, 0, 0, }, //50 + {0, Y2, X3, Y3, Y5, X6, Y6, X7, Y7, Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, X8, 0, 0, 0, 0, }, //51 + {0, X2, Y2, X3, Y3, Y5, X6, Y6, X7, Y4^X5^Y5, Z0^X4^Y4, X5^X6^Y6, Y7, 0, 0, 0, 0, }, //52 + {0, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, Y9, 0, 0, 0, 0, }, //53 + {0, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, X9, 0, 0, 0, 0, }, //54 + {0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, Y8, 0, 0, 0, 0, }, //55 + {0, Y2, X3, Y3, X5, X6, Y6, X7, Y7, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, X8, 0, 0, 0, 0, }, //56 + {0, X2, Y2, X3, Y3, X5, X6, Y6, X7, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, Y7, 0, 0, 0, 0, }, //57 + {0, X5, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y9, 0, 0, 0, 0, }, //58 + {0, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X9, 0, 0, 0, 0, }, //59 + {0, X3, Y3, X5, X6, Y6, X7, Y7, X8, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y8, 0, 0, 0, 0, }, //60 + {0, Y2, X3, Y3, X5, X6, Y6, X7, Y7, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X8, 0, 0, 0, 0, }, //61 + {0, X2, Y2, X3, Y3, X5, X6, Y6, X7, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y7, 0, 0, 0, 0, }, //62 + {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, 0, 0, 0, 0, }, //63 + {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, 0, 0, 0, 0, }, //64 + {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, 0, 0, 0, 0, }, //65 + {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, 0, 0, 0, 0, }, //66 + {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^X7^Y7, 0, 0, 0, 0, }, //67 + {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //68 + {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //69 + {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //70 + {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //71 + {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //72 + {0, X6, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //73 + {0, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //74 + {0, X3, Y3, X6, Y6, X7, Y7, X8, Y8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //75 + {0, Y2, X3, Y3, X6, Y6, X7, Y7, X8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //76 + {0, X2, Y2, X3, Y3, X6, Y6, X7, Y7, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //77 + {0, Y6, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X6^X8^Y8, 0, 0, 0, }, //78 + {0, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X6^X8^Y8, 0, 0, 0, }, //79 + {0, X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X6^X8^Y8, 0, 0, 0, }, //80 + {0, Y2, X3, Y3, Y6, X7, Y7, X8, Y8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X6^X8^Y8, 0, 0, 0, }, //81 + {0, X2, Y2, Y3, X6, Y6, X7, Y7, X8, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X3^X8^Y8, 0, 0, 0, }, //82 + {0, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, X6^Y6, 0, 0, 0, }, //83 + {0, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, X6^Y6, 0, 0, 0, }, //84 + {0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, X6^Y6, 0, 0, 0, }, //85 + {0, Y2, X3, Y3, X6, X7, Y7, X8, Y8, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, X6^Y6, 0, 0, 0, }, //86 + {0, X2, X3, Y3, X6, X7, Y7, Y2, X8, Y4^X8^Y8, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, X6^Y6, 0, 0, 0, }, //87 + {0, X6, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //88 + {0, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //89 + {0, X3, Y3, X6, X7, Y7, X8, Y8, X9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //90 + {0, Y2, X3, Y3, X6, X7, Y7, X8, Y8, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //91 + {0, X2, X3, Y3, X6, X7, Y7, Y2, X8, Y4^X8^Y8, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, Y2^X6^Y6, 0, 0, 0, }, //92 + {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X6^X9^Y9, 0, 0, }, //93 + {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X6^X9^Y9, 0, 0, }, //94 + {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X6^X9^Y9, 0, 0, }, //95 + {0, Y2, Y3, X6, X7, Y7, X8, Y8, X9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X3^X9^Y9, 0, 0, }, //96 + {0, X2, Y3, X6, X7, Y7, X8, Y2, Y8, Y4^X8^Y8, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, Y2^X6^Y6, X3^X9^Y9, 0, 0, }, //97 + {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, X6^Y7, 0, 0, }, //98 + {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, X6^Y7, 0, 0, }, //99 + {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, X6^Y7, 0, 0, }, //100 + {0, X3, Y3, X7, Y7, X8, Y8, Y2, X9, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y2^Y6^X7, X6^Y7, 0, 0, }, //101 + {0, X3, Y3, X7, Y7, X8, Y8, X2, Y2, Y4^X9^Y9, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, Y2^Y6^X7, X6^Y7, 0, 0, }, //102 + {0, X7, Y7, X8, Y8, X9, Y9, X10, Y10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //103 + {0, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //104 + {0, X3, Y3, X7, Y7, X8, Y8, X9, Y9, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //105 + {0, X3, Y3, X7, Y7, X8, Y8, Y2, X9, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y2^Y6^X7, Z0^X6^Y7, 0, 0, }, //106 + {0, X3, Y3, X7, Y7, X8, Y8, X2, Y2, Y4^X9^Y9, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, Y2^Y6^X7, X2^X6^Y7, 0, 0, }, //107 +}; + +const UINT_64 HTILE_SW_PATTERN[][18] = +{ + {0, 0, 0, X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, 0, 0, 0, 0, 0, }, //0 + {0, 0, 0, X3, Y4, X4, X5, Y5, X6, Z0^X3^Y3, Y6, X7, Y7, 0, 0, 0, 0, 0, }, //1 + {0, 0, 0, X3, Y4, X5, Y5, X6, Y6, Z1^X3^Y3, Z0^X4^Y4, X7, Y7, X8, 0, 0, 0, 0, }, //2 + {0, 0, 0, X3, Y4, Y5, X6, Y6, X7, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, Y7, X8, Y8, 0, 0, 0, }, //3 + {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X8, Y8, X9, 0, 0, }, //4 + {0, 0, 0, X3, Y4, X6, Y6, X7, Y7, Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, X8, Y8, X9, 0, 0, }, //5 + {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, Y8, X9, Y9, 0, }, //6 + {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X7, Z0^X5^Y7, X6^Y6, Y8, X9, Y9, 0, }, //7 + {0, 0, 0, X3, Y4, Y6, X7, Y7, X8, Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, Y8, X9, Y9, 0, }, //8 + {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, X9, Y9, X10, }, //9 + {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Z0^Y6^X7, X6^Y7, X9, Y9, X10, }, //10 + {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X8, Z0^X5^Y8, Y6^X7, X6^Y7, X9, Y9, X10, }, //11 + {0, 0, 0, X3, Y4, X7, Y7, X8, Y8, Z2^X3^Y3, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X6^Y7, X9, Y9, X10, }, //12 + {0, 0, 0, X3, Y3, Y4, X5, Y5, X6, Z0^X4^Y4, Y6, X7, Y7, 0, 0, 0, 0, 0, }, //13 + {0, 0, 0, X3, Y3, X5, Y5, X6, Y6, Y4^X5^Y5, Z0^X4^Y4, X7, Y7, X8, 0, 0, 0, 0, }, //14 + {0, 0, 0, X3, Y3, Y5, X6, Y6, X7, Y4^X5^Y5, Z0^X4^Y4, X5^Y5, Y7, X8, Y8, 0, 0, 0, }, //15 + {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4^X6^Y6, Z1^X4^Y4, Y7, X8, Y8, X5^Y5, 0, 0, 0, }, //16 + {0, 0, 0, X3, Y3, X5, X6, Y6, X7, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, Y7, X8, Y8, 0, 0, 0, }, //17 + {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^Y6, X8, Y8, X9, 0, 0, }, //18 + {0, 0, 0, X3, Y3, Y4, X5, X6, Y6, Z1^X4^Y4, Z0^X5^Y5, X7, Y7, X8, 0, 0, 0, 0, }, //19 + {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X8, Y8, X9, X5^Y6, 0, 0, }, //20 + {0, 0, 0, X3, Y3, X6, Y6, X7, Y7, Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, X8, Y8, X9, 0, 0, }, //21 + {0, 0, 0, X3, Y3, Y6, X7, Y7, X8, Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, X6^Y6, Y8, X9, Y9, 0, }, //22 + {0, 0, 0, X3, Y3, Y4, X6, Y6, X7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, Y7, X8, Y8, 0, 0, 0, }, //23 + {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, Y8, X9, Y9, X6^Y6, 0, }, //24 + {0, 0, 0, X3, Y3, X6, X7, Y7, X8, Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, Y8, X9, Y9, 0, }, //25 + {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, X6^Y8, X9, Y9, X10, }, //26 + {0, 0, 0, X3, Y3, Y4, X6, X7, Y7, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, X8, Y8, X9, 0, 0, }, //27 + {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X9, Y9, X10, X6^Y7, }, //28 + {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X6^Y7, X9, Y9, X10, }, //29 +}; + +const UINT_64 CMASK_SW_PATTERN[][17] = +{ + {X3, Y3, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, 0, 0, 0, 0, }, //0 + {X3, Y4, X4, X5, Y5, X6, Y6, X7, Y7, Z0^X3^Y3, X8, Y8, X9, 0, 0, 0, 0, }, //1 + {X3, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Z1^X3^Y3, Z0^X4^Y4, Y8, X9, 0, 0, 0, 0, }, //2 + {X3, Y4, Y5, X6, Y6, X7, Y7, X8, Y8, Z2^X3^Y3, Z1^X4^Y4, Z0^X5^Y5, X9, 0, 0, 0, 0, }, //3 + {X3, Y4, X6, Y6, X7, Y7, X8, Y8, X9, X3^Y3^Z3, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //4 + {X3, Y4, Y6, X7, Y7, X8, Y8, X9, Y9, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //5 + {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3^Y3^Z5, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //6 + {X3, Y4, X7, Y7, X8, Y8, X9, Y9, X10, X3^Y3^Z4, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y6^X7, Z0^X6^Y7, 0, 0, }, //7 + {X3, Y3, Y4, X5, Y5, X6, Y6, X7, Y7, Z0^X4^Y4, X8, Y8, X9, 0, 0, 0, 0, }, //8 + {X3, Y3, X5, Y5, X6, Y6, X7, Y7, X8, Y4^X5^Y5, Z0^X4^Y4, Y8, X9, 0, 0, 0, 0, }, //9 + {X3, Y3, Y5, X6, Y6, X7, Y7, X8, Y8, Y4^X5^Y5, Z0^X4^Y4, X5^Y5, X9, 0, 0, 0, 0, }, //10 + {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4^X6^Y6, Z1^X4^Y4, X5^Y5, X9, 0, 0, 0, 0, }, //11 + {X3, Y3, X5, X6, Y6, X7, Y7, X8, Y8, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X9, 0, 0, 0, 0, }, //12 + {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X6^Y6, Z1^X4^Y4, Z0^X5^Y5, X5^Y6, 0, 0, 0, 0, }, //13 + {X3, Y3, Y4, X5, X6, Y6, X7, Y7, X8, Z1^X4^Y4, Z0^X5^Y5, Y8, X9, 0, 0, 0, 0, }, //14 + {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //15 + {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, X5^Y6, 0, 0, 0, 0, }, //16 + {X3, Y3, X6, Y6, X7, Y7, X8, Y8, X9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, 0, 0, 0, 0, }, //17 + {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X7^Y7, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, X6^Y6, 0, 0, 0, }, //18 + {X3, Y3, Y6, X7, Y7, X8, Y8, X9, Y9, Y4^X7^Y7, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X6^Y6, 0, 0, 0, }, //19 + {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z1^X4^Y4, Z0^Y5^X6, X5^Y6, X9, 0, 0, 0, 0, }, //20 + {X3, Y3, Y4, X6, Y6, X7, Y7, X8, Y8, Z2^X4^Y4, Z1^Y5^X6, Z0^X5^Y6, X9, 0, 0, 0, 0, }, //21 + {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, 0, 0, 0, }, //22 + {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, X6^Y6, 0, 0, 0, }, //23 + {X3, Y3, X6, X7, Y7, X8, Y8, X9, Y9, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, }, //24 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X8^Y8, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, X6^Y8, 0, 0, }, //25 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X6^Y8, 0, 0, }, //26 + {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z1^X4^Y4, Z0^Y5^X7, X5^Y7, X6^Y6, 0, 0, 0, 0, }, //27 + {X3, Y3, Y4, X6, X7, Y7, X8, Y8, X9, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, 0, 0, 0, 0, }, //28 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X6^Y7, 0, 0, }, //29 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, X6^Y7, 0, 0, }, //30 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, X4^Y4^Z4, Z3^Y5^X8, Z2^X5^Y8, Z1^Y6^X7, Z0^X6^Y7, 0, 0, }, //31 + {X3, Y3, X6, X7, Y7, X8, X9, Y9, X10, Y4^X8^Y8, Z3^X4^Y4, Z2^Y5^X7, Z1^X5^Y7, Z0^X6^Y6, X3^Y8, 0, 0, }, //32 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y6^X7, X6^Y7, 0, 0, }, //33 + {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y6^X7, Z0^X6^Y7, 0, 0, }, //34 }; } // V2 diff -Nru mesa-19.2.8/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp mesa-20.0.8/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp --- mesa-19.2.8/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -69,51 +69,50 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// const SwizzleModeFlags Gfx9Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = -{//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR - {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S - {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_256B_D - {0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_256B_R - - {0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_Z - {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S - {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_4KB_D - {0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_R - - {0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_Z - {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S - {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_64KB_D - {0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_R - - {0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_VAR_Z - {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_VAR_S - {0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_VAR_D - {0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_VAR_R - - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0}, // ADDR_SW_64KB_Z_T - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0}, // ADDR_SW_64KB_S_T - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0}, // ADDR_SW_64KB_D_T - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0}, // ADDR_SW_64KB_R_T - - {0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0}, // ADDR_SW_4KB_Z_x - {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0}, // ADDR_SW_4KB_S_x - {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0}, // ADDR_SW_4KB_D_x - {0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0}, // ADDR_SW_4KB_R_x - - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0}, // ADDR_SW_64KB_Z_X - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}, // ADDR_SW_64KB_S_X - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0}, // ADDR_SW_64KB_D_X - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_R_X - - {0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0}, // ADDR_SW_VAR_Z_X - {0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0}, // ADDR_SW_VAR_S_X - {0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0}, // ADDR_SW_VAR_D_X - {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, // ADDR_SW_VAR_R_X - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL +{//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt Reserved + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR + {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S + {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_D + {0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_256B_R + + {0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_Z + {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S + {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_D + {0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_4KB_R + + {0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_Z + {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S + {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_D + {0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_64KB_R + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + + {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_Z_T + {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_S_T + {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_D_T + {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0}, // ADDR_SW_64KB_R_T + + {0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_Z_x + {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_S_x + {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_D_x + {0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0}, // ADDR_SW_4KB_R_x + + {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_Z_X + {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_S_X + {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_D_X + {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0}, // ADDR_SW_64KB_R_X + + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL }; -const UINT_32 Gfx9Lib::MipTailOffset256B[] = {2048, 1024, 512, 256, 128, 64, 32, 16, - 8, 6, 5, 4, 3, 2, 1, 0}; +const UINT_32 Gfx9Lib::MipTailOffset256B[] = {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 6, 5, 4, 3, 2, 1, 0}; const Dim3d Gfx9Lib::Block256_3dS[] = {{16, 4, 4}, {8, 4, 4}, {4, 4, 4}, {2, 4, 4}, {1, 4, 4}}; @@ -130,8 +129,7 @@ */ Gfx9Lib::Gfx9Lib(const Client* pClient) : - Lib(pClient), - m_numEquations(0) + Lib(pClient) { m_class = AI_ADDRLIB; memset(&m_settings, 0, sizeof(m_settings)); @@ -281,8 +279,7 @@ ADDR2_COMPUTE_CMASK_INFO_OUTPUT* pOut ///< [out] output structure ) const { -// TODO: Clarify with AddrLib team -// ADDR_ASSERT(pIn->resourceType == ADDR_RSRC_TEX_2D); + ADDR_ASSERT(pIn->resourceType == ADDR_RSRC_TEX_2D); UINT_32 numPipeTotal = GetPipeNumForMetaAddressing(pIn->cMaskFlags.pipeAligned, pIn->swizzleMode); @@ -687,7 +684,7 @@ */ UINT_32 Gfx9Lib::HwlComputeMaxBaseAlignments() const { - return ComputeSurfaceBaseAlignTiled(ADDR_SW_64KB); + return Size64K; } /** @@ -722,7 +719,7 @@ if (m_settings.metaBaseAlignFix) { - maxBaseAlignHtile = Max(maxBaseAlignHtile, GetBlockSize(ADDR_SW_64KB)); + maxBaseAlignHtile = Max(maxBaseAlignHtile, Size64K); } if (m_settings.htileAlignFix) @@ -745,7 +742,7 @@ if (m_settings.metaBaseAlignFix) { - maxBaseAlignDccMsaa = Max(maxBaseAlignDccMsaa, GetBlockSize(ADDR_SW_64KB)); + maxBaseAlignDccMsaa = Max(maxBaseAlignDccMsaa, Size64K); } return Max(maxBaseAlignHtile, Max(maxBaseAlignDccMsaa, maxBaseAlignDcc3D)); @@ -1222,11 +1219,6 @@ break; } - m_blockVarSizeLog2 = pCreateIn->regValue.blockVarSizeLog2; - ADDR_ASSERT((m_blockVarSizeLog2 == 0) || - ((m_blockVarSizeLog2 >= 17u) && (m_blockVarSizeLog2 <= 20u))); - m_blockVarSizeLog2 = Min(Max(17u, m_blockVarSizeLog2), 20u); - if ((m_rbPerSeLog2 == 1) && (((m_pipesLog2 == 1) && ((m_seLog2 == 2) || (m_seLog2 == 3))) || ((m_pipesLog2 == 2) && ((m_seLog2 == 1) || (m_seLog2 == 2))))) @@ -1241,6 +1233,9 @@ m_settings.htileCacheRbConflict = 1; } } + + // For simplicity we never allow VAR swizzle mode for GFX9, the actural value is 18 on GFX9 + m_blockVarSizeLog2 = 0; } else { @@ -2164,6 +2159,7 @@ UINT_32 elementBytesLog2) const { BOOL_32 supported = (elementBytesLog2 < MaxElementBytesLog2) && + (IsValidSwMode(swMode) == TRUE) && (IsLinear(swMode) == FALSE) && (((IsTex2d(rsrcType) == TRUE) && ((elementBytesLog2 < 4) || @@ -2197,7 +2193,7 @@ AddrResourceType rsrcType = static_cast(rsrcTypeIdx + ADDR_RSRC_TEX_2D); // Loop all possible swizzle mode - for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwMode; swModeIdx++) + for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwModeType; swModeIdx++) { AddrSwizzleMode swMode = static_cast(swModeIdx); @@ -2209,7 +2205,7 @@ // Check if the input is supported if (IsEquationSupported(rsrcType, swMode, bppIdx)) { - ADDR_EQUATION equation; + ADDR_EQUATION equation; ADDR_E_RETURNCODE retCode; memset(&equation, 0, sizeof(ADDR_EQUATION)); @@ -2954,14 +2950,10 @@ case ADDR_SW_4KB_R: case ADDR_SW_64KB_D: case ADDR_SW_64KB_R: - case ADDR_SW_VAR_D: - case ADDR_SW_VAR_R: case ADDR_SW_4KB_D_X: case ADDR_SW_4KB_R_X: case ADDR_SW_64KB_D_X: case ADDR_SW_64KB_R_X: - case ADDR_SW_VAR_D_X: - case ADDR_SW_VAR_R_X: support = (pIn->bpp <= 64); break; @@ -2975,22 +2967,18 @@ { case ADDR_SW_4KB_D: case ADDR_SW_64KB_D: - case ADDR_SW_VAR_D: case ADDR_SW_64KB_D_T: case ADDR_SW_4KB_D_X: case ADDR_SW_64KB_D_X: - case ADDR_SW_VAR_D_X: support = (pIn->bpp == 64); break; case ADDR_SW_LINEAR: case ADDR_SW_4KB_S: case ADDR_SW_64KB_S: - case ADDR_SW_VAR_S: case ADDR_SW_64KB_S_T: case ADDR_SW_4KB_S_X: case ADDR_SW_64KB_S_X: - case ADDR_SW_VAR_S_X: support = (pIn->bpp <= 64); break; @@ -3210,7 +3198,7 @@ { BOOL_32 valid = TRUE; - if (pIn->swizzleMode >= ADDR_SW_MAX_TYPE) + if ((pIn->swizzleMode >= ADDR_SW_MAX_TYPE) || (IsValidSwMode(pIn->swizzleMode) == FALSE)) { ADDR_ASSERT_ALWAYS(); valid = FALSE; @@ -3229,7 +3217,6 @@ const AddrSwizzleMode swizzle = pIn->swizzleMode; const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); - const BOOL_32 blkVar = IsBlockVariable(swizzle); const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const ADDR2_SURFACE_FLAGS flags = pIn->flags; @@ -3337,11 +3324,6 @@ valid = FALSE; } } - else if (blkVar) - { - ADDR_ASSERT_ALWAYS(); - valid = FALSE; - } return valid; } @@ -3444,12 +3426,22 @@ if (ValidateNonSwModeParams(&localIn)) { - // Forbid swizzle mode(s) by client setting, for simplicity we never allow VAR swizzle mode for GFX9 + // Forbid swizzle mode(s) by client setting ADDR2_SWMODE_SET allowedSwModeSet = {}; - allowedSwModeSet.value |= pIn->forbiddenBlock.linear ? 0 : Gfx9LinearSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.micro ? 0 : Gfx9Blk256BSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.macro4KB ? 0 : Gfx9Blk4KBSwModeMask; - allowedSwModeSet.value |= pIn->forbiddenBlock.macro64KB ? 0 : Gfx9Blk64KBSwModeMask; + allowedSwModeSet.value |= pIn->forbiddenBlock.linear ? 0 : Gfx9LinearSwModeMask; + allowedSwModeSet.value |= pIn->forbiddenBlock.micro ? 0 : Gfx9Blk256BSwModeMask; + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThin4KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx9Rsrc3dThin4KBSwModeMask : Gfx9Blk4KBSwModeMask); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThick4KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx9Rsrc3dThick4KBSwModeMask : 0); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThin64KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx9Rsrc3dThin64KBSwModeMask : Gfx9Blk64KBSwModeMask); + allowedSwModeSet.value |= + pIn->forbiddenBlock.macroThick64KB ? 0 : + ((pOut->resourceType == ADDR_RSRC_TEX_3D) ? Gfx9Rsrc3dThick64KBSwModeMask : 0); if (pIn->preferredSwSet.value != 0) { @@ -3466,17 +3458,17 @@ if (pIn->maxAlign > 0) { - if (pIn->maxAlign < GetBlockSize(ADDR_SW_64KB)) + if (pIn->maxAlign < Size64K) { allowedSwModeSet.value &= ~Gfx9Blk64KBSwModeMask; } - if (pIn->maxAlign < GetBlockSize(ADDR_SW_4KB)) + if (pIn->maxAlign < Size4K) { allowedSwModeSet.value &= ~Gfx9Blk4KBSwModeMask; } - if (pIn->maxAlign < GetBlockSize(ADDR_SW_256B)) + if (pIn->maxAlign < Size256) { allowedSwModeSet.value &= ~Gfx9Blk256BSwModeMask; } @@ -3583,7 +3575,7 @@ } if ((numFrags > 1) && - (GetBlockSize(ADDR_SW_4KB) < (m_pipeInterleaveBytes * numFrags))) + (Size4K < (m_pipeInterleaveBytes * numFrags))) { // MSAA surface must have blk_bytes/pipe_interleave >= num_samples allowedSwModeSet.value &= Gfx9Blk64KBSwModeMask; @@ -3630,7 +3622,7 @@ pOut->validSwModeSet = allowedSwModeSet; pOut->canXor = (allowedSwModeSet.value & Gfx9XorSwModeMask) ? TRUE : FALSE; - pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet); + pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); pOut->validSwTypeSet = GetAllowedSwSet(allowedSwModeSet); pOut->clientPreferredSwSet = pIn->preferredSwSet; @@ -3640,6 +3632,12 @@ pOut->clientPreferredSwSet.value = AddrSwSetAll; } + // Apply optional restrictions + if (pIn->flags.needEquation) + { + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + } + if (allowedSwModeSet.value == Gfx9LinearSwModeMask) { pOut->swizzleMode = ADDR_SW_LINEAR; @@ -3649,15 +3647,26 @@ // Always ignore linear swizzle mode if there is other choice. allowedSwModeSet.swLinear = 0; - ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet); + ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); // Determine block size if there is 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) { - const AddrSwizzleMode swMode[AddrBlockMaxTiledType] = {ADDR_SW_256B, ADDR_SW_4KB, ADDR_SW_64KB}; - Dim3d blkDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}}; - Dim3d padDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}}; - UINT_64 padSize[AddrBlockMaxTiledType] = {0}; + AddrSwizzleMode swMode[AddrBlockMaxTiledType] = { ADDR_SW_LINEAR }; + + swMode[AddrBlockMicro] = ADDR_SW_256B_D; + swMode[AddrBlockThin4KB] = ADDR_SW_4KB_D; + swMode[AddrBlockThin64KB] = ADDR_SW_64KB_D; + + if (pOut->resourceType == ADDR_RSRC_TEX_3D) + { + swMode[AddrBlockThick4KB] = ADDR_SW_4KB_S; + swMode[AddrBlockThick64KB] = ADDR_SW_64KB_S; + } + + Dim3d blkDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}, {0}, {0}, {0}}; + Dim3d padDim[AddrBlockMaxTiledType] = {{0}, {0}, {0}, {0}, {0}, {0}}; + UINT_64 padSize[AddrBlockMaxTiledType] = {0}; const UINT_32 ratioLow = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = pIn->flags.minimizeAlign ? 1 : (pIn->flags.opt4space ? 2 : 1); @@ -3683,7 +3692,7 @@ } padSize[i] = ComputePadSize(&blkDim[i], width, height, numSlices, &padDim[i]); - padSize[i] = PowTwoAlign(padSize[i], sizeAlignInElement); + padSize[i] = PowTwoAlign(padSize[i] * numFrags, sizeAlignInElement); if ((minSize == 0) || ((padSize[i] * ratioHi) <= (minSize * ratioLow))) @@ -3697,28 +3706,41 @@ if ((allowedBlockSet.micro == TRUE) && (width <= blkDim[AddrBlockMicro].w) && (height <= blkDim[AddrBlockMicro].h) && - (NextPow2(pIn->minSizeAlign) <= GetBlockSize(ADDR_SW_256B))) + (NextPow2(pIn->minSizeAlign) <= Size256)) { minSizeBlk = AddrBlockMicro; } if (minSizeBlk == AddrBlockMicro) { + ADDR_ASSERT(pOut->resourceType != ADDR_RSRC_TEX_3D); allowedSwModeSet.value &= Gfx9Blk256BSwModeMask; } - else if (minSizeBlk == AddrBlock4KB) + else if (minSizeBlk == AddrBlockThick4KB) { - allowedSwModeSet.value &= Gfx9Blk4KBSwModeMask; + ADDR_ASSERT(pOut->resourceType == ADDR_RSRC_TEX_3D); + allowedSwModeSet.value &= Gfx9Rsrc3dThick4KBSwModeMask; + } + else if (minSizeBlk == AddrBlockThin4KB) + { + allowedSwModeSet.value &= (pOut->resourceType == ADDR_RSRC_TEX_3D) ? + Gfx9Rsrc3dThin4KBSwModeMask : Gfx9Blk4KBSwModeMask; + } + else if (minSizeBlk == AddrBlockThick64KB) + { + ADDR_ASSERT(pOut->resourceType == ADDR_RSRC_TEX_3D); + allowedSwModeSet.value &= Gfx9Rsrc3dThick64KBSwModeMask; } else { - ADDR_ASSERT(minSizeBlk == AddrBlock64KB); - allowedSwModeSet.value &= Gfx9Blk64KBSwModeMask; + ADDR_ASSERT(minSizeBlk == AddrBlockThin64KB); + allowedSwModeSet.value &= (pOut->resourceType == ADDR_RSRC_TEX_3D) ? + Gfx9Rsrc3dThin64KBSwModeMask : Gfx9Blk64KBSwModeMask; } } // Block type should be determined. - ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet).value)); + ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType).value)); ADDR2_SWTYPE_SET allowedSwSet = GetAllowedSwSet(allowedSwModeSet); @@ -3775,7 +3797,7 @@ { allowedSwModeSet.value &= Gfx9RotateSwModeMask; } - else if (displayRsrc && allowedSwSet.sw_D) + else if (allowedSwSet.sw_D) { allowedSwModeSet.value &= Gfx9DisplaySwModeMask; } @@ -3794,8 +3816,8 @@ // Swizzle type should be determined. ADDR_ASSERT(IsPow2(GetAllowedSwSet(allowedSwModeSet).value)); - // Determine swizzle mode now - always select the "largest" swizzle mode for a given block type + - // swizzle type combination. For example, for AddrBlock64KB + ADDR_SW_S, select SW_64KB_S_X(25) if it's + // Determine swizzle mode now. Always select the "largest" swizzle mode for a given block type + swizzle + // type combination. For example, for AddrBlockThin64KB + ADDR_SW_S, select SW_64KB_S_X(25) if it's // available, or otherwise select SW_64KB_S_T(17) if it's available, or otherwise select SW_64KB_S(9). pOut->swizzleMode = static_cast(Log2NonPow2(allowedSwModeSet.value)); } @@ -3848,13 +3870,13 @@ const UINT_32 numBankBits = GetBankXorBits(blkSizeLog2); const UINT_32 bppLog2 = Log2(pIn->bpp >> 3); const UINT_32 maxYCoordBlock256 = Log2(Block256_2d[bppLog2].h) - 1; - ASSERTED const ADDR_EQUATION *pEqToCheck = &m_equationTable[eqIndex]; + const ADDR_EQUATION *pEqToCheck = &m_equationTable[eqIndex]; ADDR_ASSERT(maxYCoordBlock256 == - GetMaxValidChannelIndex(&pEqToCheck->addr[0], GetBlockSizeLog2(ADDR_SW_256B), 1)); + GetMaxValidChannelIndex(&pEqToCheck->addr[0], Log2Size256, 1)); const UINT_32 maxYCoordInBaseEquation = - (blkSizeLog2 - GetBlockSizeLog2(ADDR_SW_256B)) / 2 + maxYCoordBlock256; + (blkSizeLog2 - Log2Size256) / 2 + maxYCoordBlock256; ADDR_ASSERT(maxYCoordInBaseEquation == GetMaxValidChannelIndex(&pEqToCheck->addr[0], blkSizeLog2, 1)); @@ -4548,7 +4570,7 @@ // Report mip in tail if Mip0 is already in mip tail BOOL_32 inMipTail = IsInMipTail(resourceType, swizzleMode, tailMaxDim, width, height, depth); - UINT_32 log2blkSize = GetBlockSizeLog2(swizzleMode); + UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); UINT_32 mipIndexInTail = mipId; if (inMipTail == FALSE) @@ -4598,7 +4620,7 @@ if (IsThick(resourceType, swizzleMode)) { - UINT_32 dim = log2blkSize % 3; + UINT_32 dim = log2BlkSize % 3; if (dim == 0) { @@ -4618,7 +4640,7 @@ } else { - if (log2blkSize & 1) + if (log2BlkSize & 1) { inTail = (mipWidthInBlk <= 2) && (mipHeightInBlk == 1); } @@ -4648,7 +4670,7 @@ if (inMipTail) { - UINT_32 index = mipIndexInTail + MaxMacroBits - log2blkSize; + UINT_32 index = mipIndexInTail + MaxMacroBits - log2BlkSize; ADDR_ASSERT(index < sizeof(MipTailOffset256B) / sizeof(UINT_32)); *pMipTailBytesOffset = MipTailOffset256B[index] << 8; } @@ -4729,7 +4751,7 @@ if (IsThin(pIn->resourceType, pIn->swizzleMode)) { UINT_32 blockOffset = 0; - UINT_32 log2blkSize = GetBlockSizeLog2(pIn->swizzleMode); + UINT_32 log2BlkSize = GetBlockSizeLog2(pIn->swizzleMode); if (IsZOrderSwizzle(pIn->swizzleMode)) { @@ -4774,7 +4796,7 @@ MortonGen2d((pIn->x / microBlockDim.w), (pIn->y / microBlockDim.h), 12) << 8; // Sample bits start location - UINT_32 sampleStart = log2blkSize - Log2(pIn->numSamples); + UINT_32 sampleStart = log2BlkSize - Log2(pIn->numSamples); // Join sample bits information to the highest Macro block bits if (IsNonPrtXor(pIn->swizzleMode)) { @@ -4787,7 +4809,7 @@ // after this op, the blockOffset only contains log2 Macro block size bits blockOffset %= (1 << sampleStart); blockOffset |= (pIn->sample << sampleStart); - ADDR_ASSERT((blockOffset >> log2blkSize) == 0); + ADDR_ASSERT((blockOffset >> log2BlkSize) == 0); } } @@ -4796,7 +4818,7 @@ // Mask off bits above Macro block bits to keep page synonyms working for prt if (IsPrt(pIn->swizzleMode)) { - blockOffset &= ((1 << log2blkSize) - 1); + blockOffset &= ((1 << log2BlkSize) - 1); } // Preserve offset inside pipe interleave @@ -4804,13 +4826,13 @@ blockOffset >>= m_pipeInterleaveLog2; // Pipe/Se xor bits - pipeBits = GetPipeXorBits(log2blkSize); + pipeBits = GetPipeXorBits(log2BlkSize); // Pipe xor pipeXor = FoldXor2d(blockOffset, pipeBits); blockOffset >>= pipeBits; // Bank xor bits - bankBits = GetBankXorBits(log2blkSize); + bankBits = GetBankXorBits(log2BlkSize); // Bank Xor bankXor = FoldXor2d(blockOffset, bankBits); blockOffset >>= bankBits; @@ -4825,7 +4847,7 @@ } ADDR_ASSERT((blockOffset | mipTailBytesOffset) == (blockOffset + mipTailBytesOffset)); - ADDR_ASSERT((mipTailBytesOffset == 0u) || (blockOffset < (1u << log2blkSize))); + ADDR_ASSERT((mipTailBytesOffset == 0u) || (blockOffset < (1u << log2BlkSize))); blockOffset |= mipTailBytesOffset; @@ -4840,7 +4862,7 @@ returnCode = ApplyCustomerPipeBankXor(pIn->swizzleMode, pIn->pipeBankXor, bankBits, pipeBits, &blockOffset); - blockOffset %= (1 << log2blkSize); + blockOffset %= (1 << log2BlkSize); UINT_32 pitchInMacroBlock = localOut.mipChainPitch / localOut.blockWidth; UINT_32 paddedHeightInMacroBlock = localOut.mipChainHeight / localOut.blockHeight; @@ -4850,11 +4872,11 @@ ((pIn->y / localOut.blockHeight) + mipStartPos.h) * pitchInMacroBlock + ((pIn->x / localOut.blockWidth) + mipStartPos.w); - pOut->addr = blockOffset | (macroBlockIndex << log2blkSize); + pOut->addr = blockOffset | (macroBlockIndex << log2BlkSize); } else { - UINT_32 log2blkSize = GetBlockSizeLog2(pIn->swizzleMode); + UINT_32 log2BlkSize = GetBlockSizeLog2(pIn->swizzleMode); Dim3d microBlockDim = Block1K_3d[log2ElementBytes]; @@ -4871,7 +4893,7 @@ // Mask off bits above Macro block bits to keep page synonyms working for prt if (IsPrt(pIn->swizzleMode)) { - blockOffset &= ((1 << log2blkSize) - 1); + blockOffset &= ((1 << log2BlkSize) - 1); } // Preserve offset inside pipe interleave @@ -4879,13 +4901,13 @@ blockOffset >>= m_pipeInterleaveLog2; // Pipe/Se xor bits - pipeBits = GetPipeXorBits(log2blkSize); + pipeBits = GetPipeXorBits(log2BlkSize); // Pipe xor pipeXor = FoldXor3d(blockOffset, pipeBits); blockOffset >>= pipeBits; // Bank xor bits - bankBits = GetBankXorBits(log2blkSize); + bankBits = GetBankXorBits(log2BlkSize); // Bank Xor bankXor = FoldXor3d(blockOffset, bankBits); blockOffset >>= bankBits; @@ -4900,13 +4922,13 @@ } ADDR_ASSERT((blockOffset | mipTailBytesOffset) == (blockOffset + mipTailBytesOffset)); - ADDR_ASSERT((mipTailBytesOffset == 0u) || (blockOffset < (1u << log2blkSize))); + ADDR_ASSERT((mipTailBytesOffset == 0u) || (blockOffset < (1u << log2BlkSize))); blockOffset |= mipTailBytesOffset; returnCode = ApplyCustomerPipeBankXor(pIn->swizzleMode, pIn->pipeBankXor, bankBits, pipeBits, &blockOffset); - blockOffset %= (1 << log2blkSize); + blockOffset %= (1 << log2BlkSize); UINT_32 xb = pIn->x / localOut.blockWidth + mipStartPos.w; UINT_32 yb = pIn->y / localOut.blockHeight + mipStartPos.h; @@ -4917,7 +4939,7 @@ (localOut.mipChainHeight / localOut.blockHeight) * pitchInBlock; UINT_64 blockIndex = zb * sliceSizeInBlock + yb * pitchInBlock + xb; - pOut->addr = blockOffset | (blockIndex << log2blkSize); + pOut->addr = blockOffset | (blockIndex << log2BlkSize); } } else @@ -4996,5 +5018,59 @@ return returnCode; } +/** +************************************************************************************************************************ +* Gfx9Lib::ComputeThinBlockDimension +* +* @brief +* Internal function to get thin block width/height/depth in element from surface input params. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Gfx9Lib::ComputeThinBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + UINT_32 numSamples, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const +{ + ADDR_ASSERT(IsThin(resourceType, swizzleMode)); + + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + const UINT_32 eleBytes = bpp >> 3; + const UINT_32 microBlockSizeTableIndex = Log2(eleBytes); + const UINT_32 log2blkSizeIn256B = log2BlkSize - 8; + const UINT_32 widthAmp = log2blkSizeIn256B / 2; + const UINT_32 heightAmp = log2blkSizeIn256B - widthAmp; + + ADDR_ASSERT(microBlockSizeTableIndex < sizeof(Block256_2d) / sizeof(Block256_2d[0])); + + *pWidth = (Block256_2d[microBlockSizeTableIndex].w << widthAmp); + *pHeight = (Block256_2d[microBlockSizeTableIndex].h << heightAmp); + *pDepth = 1; + + if (numSamples > 1) + { + const UINT_32 log2sample = Log2(numSamples); + const UINT_32 q = log2sample >> 1; + const UINT_32 r = log2sample & 1; + + if (log2BlkSize & 1) + { + *pWidth >>= q; + *pHeight >>= (q + r); + } + else + { + *pWidth >>= (q + r); + *pHeight >>= q; + } + } +} + } // V2 } // Addr diff -Nru mesa-19.2.8/src/amd/addrlib/src/gfx9/gfx9addrlib.h mesa-20.0.8/src/amd/addrlib/src/gfx9/gfx9addrlib.h --- mesa-19.2.8/src/amd/addrlib/src/gfx9/gfx9addrlib.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/gfx9/gfx9addrlib.h 2020-06-12 01:21:16.000000000 +0000 @@ -114,49 +114,32 @@ (1u << ADDR_SW_64KB_D_X) | (1u << ADDR_SW_64KB_R_X); -const UINT_32 Gfx9BlkVarSwModeMask = (1u << ADDR_SW_VAR_Z) | - (1u << ADDR_SW_VAR_S) | - (1u << ADDR_SW_VAR_D) | - (1u << ADDR_SW_VAR_R) | - (1u << ADDR_SW_VAR_Z_X) | - (1u << ADDR_SW_VAR_S_X) | - (1u << ADDR_SW_VAR_D_X) | - (1u << ADDR_SW_VAR_R_X); - const UINT_32 Gfx9ZSwModeMask = (1u << ADDR_SW_4KB_Z) | (1u << ADDR_SW_64KB_Z) | - (1u << ADDR_SW_VAR_Z) | (1u << ADDR_SW_64KB_Z_T) | (1u << ADDR_SW_4KB_Z_X) | - (1u << ADDR_SW_64KB_Z_X) | - (1u << ADDR_SW_VAR_Z_X); + (1u << ADDR_SW_64KB_Z_X); const UINT_32 Gfx9StandardSwModeMask = (1u << ADDR_SW_256B_S) | (1u << ADDR_SW_4KB_S) | (1u << ADDR_SW_64KB_S) | - (1u << ADDR_SW_VAR_S) | (1u << ADDR_SW_64KB_S_T) | (1u << ADDR_SW_4KB_S_X) | - (1u << ADDR_SW_64KB_S_X) | - (1u << ADDR_SW_VAR_S_X); + (1u << ADDR_SW_64KB_S_X); const UINT_32 Gfx9DisplaySwModeMask = (1u << ADDR_SW_256B_D) | (1u << ADDR_SW_4KB_D) | (1u << ADDR_SW_64KB_D) | - (1u << ADDR_SW_VAR_D) | (1u << ADDR_SW_64KB_D_T) | (1u << ADDR_SW_4KB_D_X) | - (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_VAR_D_X); + (1u << ADDR_SW_64KB_D_X); const UINT_32 Gfx9RotateSwModeMask = (1u << ADDR_SW_256B_R) | (1u << ADDR_SW_4KB_R) | (1u << ADDR_SW_64KB_R) | - (1u << ADDR_SW_VAR_R) | (1u << ADDR_SW_64KB_R_T) | (1u << ADDR_SW_4KB_R_X) | - (1u << ADDR_SW_64KB_R_X) | - (1u << ADDR_SW_VAR_R_X); + (1u << ADDR_SW_64KB_R_X); const UINT_32 Gfx9XSwModeMask = (1u << ADDR_SW_4KB_Z_X) | (1u << ADDR_SW_4KB_S_X) | @@ -165,11 +148,7 @@ (1u << ADDR_SW_64KB_Z_X) | (1u << ADDR_SW_64KB_S_X) | (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_64KB_R_X) | - (1u << ADDR_SW_VAR_Z_X) | - (1u << ADDR_SW_VAR_S_X) | - (1u << ADDR_SW_VAR_D_X) | - (1u << ADDR_SW_VAR_R_X); + (1u << ADDR_SW_64KB_R_X); const UINT_32 Gfx9TSwModeMask = (1u << ADDR_SW_64KB_Z_T) | (1u << ADDR_SW_64KB_S_T) | @@ -197,6 +176,16 @@ const UINT_32 Gfx9Rsrc3dThinSwModeMask = Gfx9DisplaySwModeMask & ~Gfx9Blk256BSwModeMask; +const UINT_32 Gfx9Rsrc3dThin4KBSwModeMask = Gfx9Rsrc3dThinSwModeMask & Gfx9Blk4KBSwModeMask; + +const UINT_32 Gfx9Rsrc3dThin64KBSwModeMask = Gfx9Rsrc3dThinSwModeMask & Gfx9Blk64KBSwModeMask; + +const UINT_32 Gfx9Rsrc3dThickSwModeMask = Gfx9Rsrc3dSwModeMask & ~(Gfx9Rsrc3dThinSwModeMask | Gfx9LinearSwModeMask); + +const UINT_32 Gfx9Rsrc3dThick4KBSwModeMask = Gfx9Rsrc3dThickSwModeMask & Gfx9Blk4KBSwModeMask; + +const UINT_32 Gfx9Rsrc3dThick64KBSwModeMask = Gfx9Rsrc3dThickSwModeMask & Gfx9Blk64KBSwModeMask; + const UINT_32 Gfx9MsaaSwModeMask = Gfx9AllSwModeMask & ~Gfx9Blk256BSwModeMask & ~Gfx9LinearSwModeMask; const UINT_32 Dce12NonBpp32SwModeMask = (1u << ADDR_SW_LINEAR) | @@ -204,14 +193,10 @@ (1u << ADDR_SW_4KB_R) | (1u << ADDR_SW_64KB_D) | (1u << ADDR_SW_64KB_R) | - (1u << ADDR_SW_VAR_D) | - (1u << ADDR_SW_VAR_R) | (1u << ADDR_SW_4KB_D_X) | (1u << ADDR_SW_4KB_R_X) | (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_64KB_R_X) | - (1u << ADDR_SW_VAR_D_X) | - (1u << ADDR_SW_VAR_R_X); + (1u << ADDR_SW_64KB_R_X); const UINT_32 Dce12Bpp32SwModeMask = (1u << ADDR_SW_256B_D) | (1u << ADDR_SW_256B_R) | @@ -220,19 +205,14 @@ const UINT_32 Dcn1NonBpp64SwModeMask = (1u << ADDR_SW_LINEAR) | (1u << ADDR_SW_4KB_S) | (1u << ADDR_SW_64KB_S) | - (1u << ADDR_SW_VAR_S) | (1u << ADDR_SW_64KB_S_T) | (1u << ADDR_SW_4KB_S_X) | - (1u << ADDR_SW_64KB_S_X) | - (1u << ADDR_SW_VAR_S_X); - + (1u << ADDR_SW_64KB_S_X); const UINT_32 Dcn1Bpp64SwModeMask = (1u << ADDR_SW_4KB_D) | (1u << ADDR_SW_64KB_D) | - (1u << ADDR_SW_VAR_D) | (1u << ADDR_SW_64KB_D_T) | (1u << ADDR_SW_4KB_D_X) | (1u << ADDR_SW_64KB_D_X) | - (1u << ADDR_SW_VAR_D_X) | Dcn1NonBpp64SwModeMask; /** @@ -273,9 +253,6 @@ return (pMem != NULL) ? new (pMem) Gfx9Lib(pClient) : NULL; } - virtual BOOL_32 IsValidDisplaySwizzleMode( - const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; - protected: Gfx9Lib(const Client* pClient); virtual ~Gfx9Lib(); @@ -376,22 +353,6 @@ AddrSwizzleMode swMode, UINT_32 elementBytesLog2) const; - UINT_32 ComputeSurfaceBaseAlignTiled(AddrSwizzleMode swizzleMode) const - { - UINT_32 baseAlign; - - if (IsXor(swizzleMode)) - { - baseAlign = GetBlockSize(swizzleMode); - } - else - { - baseAlign = 256; - } - - return baseAlign; - } - virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( const ADDR2_COMPUTE_PIPEBANKXOR_INPUT* pIn, ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const; @@ -423,6 +384,137 @@ const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + virtual UINT_32 HwlComputeMaxBaseAlignments() const; + + virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const; + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn); + + virtual ChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision); + + virtual VOID ComputeThinBlockDimension( + UINT_32* pWidth, + UINT_32* pHeight, + UINT_32* pDepth, + UINT_32 bpp, + UINT_32 numSamples, + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode) const; + +private: + VOID GetRbEquation(CoordEq* pRbEq, UINT_32 rbPerSeLog2, UINT_32 seLog2) const; + + VOID GetDataEquation(CoordEq* pDataEq, Gfx9DataType dataSurfaceType, + AddrSwizzleMode swizzleMode, AddrResourceType resourceType, + UINT_32 elementBytesLog2, UINT_32 numSamplesLog2) const; + + VOID GetPipeEquation(CoordEq* pPipeEq, CoordEq* pDataEq, + UINT_32 pipeInterleaveLog2, UINT_32 numPipesLog2, + UINT_32 numSamplesLog2, Gfx9DataType dataSurfaceType, + AddrSwizzleMode swizzleMode, AddrResourceType resourceType) const; + + VOID GenMetaEquation(CoordEq* pMetaEq, UINT_32 maxMip, + UINT_32 elementBytesLog2, UINT_32 numSamplesLog2, + ADDR2_META_FLAGS metaFlag, Gfx9DataType dataSurfaceType, + AddrSwizzleMode swizzleMode, AddrResourceType resourceType, + UINT_32 metaBlkWidthLog2, UINT_32 metaBlkHeightLog2, + UINT_32 metaBlkDepthLog2, UINT_32 compBlkWidthLog2, + UINT_32 compBlkHeightLog2, UINT_32 compBlkDepthLog2) const; + + const CoordEq* GetMetaEquation(const MetaEqParams& metaEqParams); + + VOID GetMetaMipInfo(UINT_32 numMipLevels, Dim3d* pMetaBlkDim, + BOOL_32 dataThick, ADDR2_META_MIP_INFO* pInfo, + UINT_32 mip0Width, UINT_32 mip0Height, UINT_32 mip0Depth, + UINT_32* pNumMetaBlkX, UINT_32* pNumMetaBlkY, UINT_32* pNumMetaBlkZ) const; + + BOOL_32 IsValidDisplaySwizzleMode(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + + ADDR_E_RETURNCODE ComputeSurfaceLinearPadding( + const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn, + UINT_32* pMipmap0PaddedWidth, + UINT_32* pSlice0PaddedHeight, + ADDR2_MIP_INFO* pMipInfo = NULL) const; + + static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet, AddrResourceType rsrcType) + { + ADDR2_BLOCK_SET allowedBlockSet = {}; + + allowedBlockSet.micro = (allowedSwModeSet.value & Gfx9Blk256BSwModeMask) ? TRUE : FALSE; + allowedBlockSet.linear = (allowedSwModeSet.value & Gfx9LinearSwModeMask) ? TRUE : FALSE; + + if (rsrcType == ADDR_RSRC_TEX_3D) + { + allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx9Rsrc3dThin4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx9Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx9Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx9Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; + } + else + { + allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx9Blk4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx9Blk64KBSwModeMask) ? TRUE : FALSE; + } + + return allowedBlockSet; + } + + static ADDR2_SWTYPE_SET GetAllowedSwSet(ADDR2_SWMODE_SET allowedSwModeSet) + { + ADDR2_SWTYPE_SET allowedSwSet = {}; + + allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx9ZSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx9StandardSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx9DisplaySwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx9RotateSwModeMask) ? TRUE : FALSE; + + return allowedSwSet; + } + + BOOL_32 IsInMipTail( + AddrResourceType resourceType, + AddrSwizzleMode swizzleMode, + Dim3d mipTailDim, + UINT_32 width, + UINT_32 height, + UINT_32 depth) const + { + BOOL_32 inTail = ((width <= mipTailDim.w) && + (height <= mipTailDim.h) && + (IsThin(resourceType, swizzleMode) || (depth <= mipTailDim.d))); + + return inTail; + } + + BOOL_32 ValidateNonSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + BOOL_32 ValidateSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + + UINT_32 GetBankXorBits(UINT_32 macroBlockBits) const + { + UINT_32 pipeBits = GetPipeXorBits(macroBlockBits); + + // Bank xor bits + UINT_32 bankBits = Min(macroBlockBits - pipeBits - m_pipeInterleaveLog2, m_banksLog2); + + return bankBits; + } + + UINT_32 ComputeSurfaceBaseAlignTiled(AddrSwizzleMode swizzleMode) const + { + UINT_32 baseAlign; + + if (IsXor(swizzleMode)) + { + baseAlign = GetBlockSize(swizzleMode); + } + else + { + baseAlign = 256; + } + + return baseAlign; + } + // Initialize equation table VOID InitEquationTable(); @@ -522,127 +614,17 @@ return compressBlkDim; } - static const UINT_32 MaxSeLog2 = 3; - static const UINT_32 MaxRbPerSeLog2 = 2; + static const UINT_32 MaxSeLog2 = 3; + static const UINT_32 MaxRbPerSeLog2 = 2; - static const Dim3d Block256_3dS[MaxNumOfBpp]; - static const Dim3d Block256_3dZ[MaxNumOfBpp]; + static const Dim3d Block256_3dS[MaxNumOfBpp]; + static const Dim3d Block256_3dZ[MaxNumOfBpp]; - static const UINT_32 MipTailOffset256B[]; + static const UINT_32 MipTailOffset256B[]; static const SwizzleModeFlags SwizzleModeTable[ADDR_SW_MAX_TYPE]; - // Max number of swizzle mode supported for equation - static const UINT_32 MaxSwMode = 32; - // Max number of resource type (2D/3D) supported for equation - static const UINT_32 MaxRsrcType = 2; - // Max number of bpp (8bpp/16bpp/32bpp/64bpp/128bpp) - static const UINT_32 MaxElementBytesLog2 = 5; - // Almost all swizzle mode + resource type support equation - static const UINT_32 EquationTableSize = MaxElementBytesLog2 * MaxSwMode * MaxRsrcType; - // Equation table - ADDR_EQUATION m_equationTable[EquationTableSize]; - - // Number of equation entries in the table - UINT_32 m_numEquations; - // Equation lookup table according to bpp and tile index - UINT_32 m_equationLookupTable[MaxRsrcType][MaxSwMode][MaxElementBytesLog2]; - - static const UINT_32 MaxCachedMetaEq = 2; - -private: - virtual UINT_32 HwlComputeMaxBaseAlignments() const; - - virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const; - - virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn); - - VOID GetRbEquation(CoordEq* pRbEq, UINT_32 rbPerSeLog2, UINT_32 seLog2) const; - - VOID GetDataEquation(CoordEq* pDataEq, Gfx9DataType dataSurfaceType, - AddrSwizzleMode swizzleMode, AddrResourceType resourceType, - UINT_32 elementBytesLog2, UINT_32 numSamplesLog2) const; - - VOID GetPipeEquation(CoordEq* pPipeEq, CoordEq* pDataEq, - UINT_32 pipeInterleaveLog2, UINT_32 numPipesLog2, - UINT_32 numSamplesLog2, Gfx9DataType dataSurfaceType, - AddrSwizzleMode swizzleMode, AddrResourceType resourceType) const; - - VOID GenMetaEquation(CoordEq* pMetaEq, UINT_32 maxMip, - UINT_32 elementBytesLog2, UINT_32 numSamplesLog2, - ADDR2_META_FLAGS metaFlag, Gfx9DataType dataSurfaceType, - AddrSwizzleMode swizzleMode, AddrResourceType resourceType, - UINT_32 metaBlkWidthLog2, UINT_32 metaBlkHeightLog2, - UINT_32 metaBlkDepthLog2, UINT_32 compBlkWidthLog2, - UINT_32 compBlkHeightLog2, UINT_32 compBlkDepthLog2) const; - - const CoordEq* GetMetaEquation(const MetaEqParams& metaEqParams); - - virtual ChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision); - - VOID GetMetaMipInfo(UINT_32 numMipLevels, Dim3d* pMetaBlkDim, - BOOL_32 dataThick, ADDR2_META_MIP_INFO* pInfo, - UINT_32 mip0Width, UINT_32 mip0Height, UINT_32 mip0Depth, - UINT_32* pNumMetaBlkX, UINT_32* pNumMetaBlkY, UINT_32* pNumMetaBlkZ) const; - - ADDR_E_RETURNCODE ComputeSurfaceLinearPadding( - const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn, - UINT_32* pMipmap0PaddedWidth, - UINT_32* pSlice0PaddedHeight, - ADDR2_MIP_INFO* pMipInfo = NULL) const; - - static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet) - { - ADDR2_BLOCK_SET allowedBlockSet = {}; - - allowedBlockSet.micro = (allowedSwModeSet.value & Gfx9Blk256BSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macro4KB = (allowedSwModeSet.value & Gfx9Blk4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macro64KB = (allowedSwModeSet.value & Gfx9Blk64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.var = (allowedSwModeSet.value & Gfx9BlkVarSwModeMask) ? TRUE : FALSE; - allowedBlockSet.linear = (allowedSwModeSet.value & Gfx9LinearSwModeMask) ? TRUE : FALSE; - - return allowedBlockSet; - } - - static ADDR2_SWTYPE_SET GetAllowedSwSet(ADDR2_SWMODE_SET allowedSwModeSet) - { - ADDR2_SWTYPE_SET allowedSwSet = {}; - - allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx9ZSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx9StandardSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx9DisplaySwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx9RotateSwModeMask) ? TRUE : FALSE; - - return allowedSwSet; - } - - BOOL_32 IsInMipTail( - AddrResourceType resourceType, - AddrSwizzleMode swizzleMode, - Dim3d mipTailDim, - UINT_32 width, - UINT_32 height, - UINT_32 depth) const - { - BOOL_32 inTail = ((width <= mipTailDim.w) && - (height <= mipTailDim.h) && - (IsThin(resourceType, swizzleMode) || (depth <= mipTailDim.d))); - - return inTail; - } - - BOOL_32 ValidateNonSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; - BOOL_32 ValidateSwModeParams(const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; - - UINT_32 GetBankXorBits(UINT_32 macroBlockBits) const - { - UINT_32 pipeBits = GetPipeXorBits(macroBlockBits); - - // Bank xor bits - UINT_32 bankBits = Min(macroBlockBits - pipeBits - m_pipeInterleaveLog2, m_banksLog2); - - return bankBits; - } + static const UINT_32 MaxCachedMetaEq = 2; Gfx9ChipSettings m_settings; diff -Nru mesa-19.2.8/src/amd/addrlib/src/r800/ciaddrlib.h mesa-20.0.8/src/amd/addrlib/src/r800/ciaddrlib.h --- mesa-19.2.8/src/amd/addrlib/src/r800/ciaddrlib.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/r800/ciaddrlib.h 2020-06-12 01:21:16.000000000 +0000 @@ -151,7 +151,6 @@ UINT_32 mipLevel, UINT_32 numSamples, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; private: - VOID ReadGbTileMode( UINT_32 regValue, TileConfig* pCfg) const; diff -Nru mesa-19.2.8/src/amd/addrlib/src/r800/siaddrlib.h mesa-20.0.8/src/amd/addrlib/src/r800/siaddrlib.h --- mesa-19.2.8/src/amd/addrlib/src/r800/siaddrlib.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/addrlib/src/r800/siaddrlib.h 2020-06-12 01:21:16.000000000 +0000 @@ -76,7 +76,6 @@ UINT_32 isSpectre : 1; UINT_32 isSpooky : 1; UINT_32 isKalindi : 1; - // Hawaii is GFXIP 7.2 UINT_32 isHawaii : 1; // VI diff -Nru mesa-19.2.8/src/amd/Android.common.mk mesa-20.0.8/src/amd/Android.common.mk --- mesa-19.2.8/src/amd/Android.common.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/Android.common.mk 2020-06-12 01:21:16.000000000 +0000 @@ -30,9 +30,8 @@ LOCAL_SRC_FILES := \ $(AMD_COMMON_FILES) \ - $(AMD_COMPILER_FILES) \ - $(AMD_DEBUG_FILES) \ - $(AMD_NIR_FILES) + $(AMD_COMMON_LLVM_FILES) \ + $(AMD_DEBUG_FILES) LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions @@ -72,6 +71,7 @@ $(MESA_TOP)/include \ $(MESA_TOP)/src \ $(MESA_TOP)/src/amd/common \ + $(MESA_TOP)/src/amd/llvm \ $(MESA_TOP)/src/compiler \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \ $(MESA_TOP)/src/gallium/include \ @@ -81,6 +81,7 @@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(LOCAL_PATH)/common \ + $(LOCAL_PATH)/llvm \ $(intermediates)/common LOCAL_SHARED_LIBRARIES := \ diff -Nru mesa-19.2.8/src/amd/Android.compiler.mk mesa-20.0.8/src/amd/Android.compiler.mk --- mesa-19.2.8/src/amd/Android.compiler.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/Android.compiler.mk 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,93 @@ +# Copyright © 2018 Valve Corporation +# Copyright © 2019 Mauro Rossi issor.oruam@gmail.com + +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +LOCAL_PATH := $(call my-dir) + +include $(LOCAL_PATH)/Makefile.sources + +# --------------------------------------- +# Build libmesa_aco +# --------------------------------------- + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_aco + +# filter-out compiler/aco_instruction_selection_setup.cpp because +# it's already included by compiler/aco_instruction_selection.cpp +LOCAL_SRC_FILES := \ + $(filter-out compiler/aco_instruction_selection_setup.cpp, $(ACO_FILES)) + +LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU # instructs LLVM to declare LLVMInitializeAMDGPU* functions + +LOCAL_CPPFLAGS += -Wall -std=c++14 + +# generate sources +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +intermediates := $(call local-generated-sources-dir) +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, $(ACO_GENERATED_FILES)) + +ACO_OPCODES_H_SCRIPT := $(MESA_TOP)/src/amd/compiler/aco_opcodes_h.py +ACO_OPCODES_CPP_SCRIPT := $(MESA_TOP)/src/amd/compiler/aco_opcodes_cpp.py +ACO_BUILDER_H_SCRIPT := $(MESA_TOP)/src/amd/compiler/aco_builder_h.py + +ACO_DEPS := $(MESA_TOP)/src/amd/compiler/aco_opcodes.py + +$(intermediates)/compiler/aco_opcodes.h: $(ACO_OPCODES_H_SCRIPT) $(ACO_DEPS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(ACO_OPCODES_H_SCRIPT) > $@ || ($(RM) $@; false) + +$(intermediates)/compiler/aco_opcodes.cpp: $(ACO_OPCODES_CPP_SCRIPT) $(ACO_DEPS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(ACO_OPCODES_CPP_SCRIPT) > $@ || ($(RM) $@; false) + +$(intermediates)/compiler/aco_builder.h: $(ACO_BUILDER_H_SCRIPT) $(ACO_DEPS) + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(ACO_BUILDER_H_SCRIPT) > $@ || ($(RM) $@; false) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/amd \ + $(MESA_TOP)/src/amd/common \ + $(MESA_TOP)/src/amd/compiler \ + $(MESA_TOP)/src/compiler/nir \ + $(MESA_TOP)/src/mapi \ + $(MESA_TOP)/src/mesa \ + $(intermediates)/compiler + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/amd/compiler \ + $(intermediates)/compiler + +LOCAL_SHARED_LIBRARIES := \ + libdrm_amdgpu + +LOCAL_STATIC_LIBRARIES := \ + libmesa_amd_common \ + libmesa_nir + +$(call mesa-build-with-llvm) + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff -Nru mesa-19.2.8/src/amd/Android.mk mesa-20.0.8/src/amd/Android.mk --- mesa-19.2.8/src/amd/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -28,5 +28,6 @@ include $(LOCAL_PATH)/Android.addrlib.mk include $(LOCAL_PATH)/Android.common.mk ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),) +include $(LOCAL_PATH)/Android.compiler.mk include $(LOCAL_PATH)/vulkan/Android.mk endif diff -Nru mesa-19.2.8/src/amd/common/ac_binary.c mesa-20.0.8/src/amd/common/ac_binary.c --- mesa-19.2.8/src/amd/common/ac_binary.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_binary.c 2020-06-12 01:21:16.000000000 +0000 @@ -63,11 +63,29 @@ break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B12C_SPI_SHADER_PGM_RSRC2_VS: + conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B22C_SPI_SHADER_PGM_RSRC2_GS: + conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B42C_SPI_SHADER_PGM_RSRC2_HS: + conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; break; case R_00B84C_COMPUTE_PGM_RSRC2: conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); conf->rsrc2 = value; break; + case R_00B8A0_COMPUTE_PGM_RSRC3: + conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value); + conf->rsrc3 = value; + break; case R_0286CC_SPI_PS_INPUT_ENA: conf->spi_ps_input_ena = value; break; diff -Nru mesa-19.2.8/src/amd/common/ac_binary.h mesa-20.0.8/src/amd/common/ac_binary.h --- mesa-19.2.8/src/amd/common/ac_binary.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_binary.h 2020-06-12 01:21:16.000000000 +0000 @@ -35,6 +35,7 @@ struct ac_shader_config { unsigned num_sgprs; unsigned num_vgprs; + unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */ unsigned spilled_sgprs; unsigned spilled_vgprs; unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */ @@ -44,6 +45,7 @@ unsigned scratch_bytes_per_wave; unsigned rsrc1; unsigned rsrc2; + unsigned rsrc3; }; void ac_parse_shader_binary_config(const char *data, size_t nbytes, diff -Nru mesa-19.2.8/src/amd/common/ac_debug.c mesa-20.0.8/src/amd/common/ac_debug.c --- mesa-19.2.8/src/amd/common/ac_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_debug.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include #define VG(x) x #else -#define VG(x) +#define VG(x) ((void)0) #endif #include diff -Nru mesa-19.2.8/src/amd/common/ac_debug.h mesa-20.0.8/src/amd/common/ac_debug.h --- mesa-19.2.8/src/amd/common/ac_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_debug.h 2020-06-12 01:21:16.000000000 +0000 @@ -36,6 +36,10 @@ #define AC_MAX_WAVES_PER_CHIP (64 * 40) +#ifdef __cplusplus +extern "C" { +#endif + struct ac_wave_info { unsigned se; /* shader engine */ unsigned sh; /* shader array */ @@ -67,4 +71,8 @@ unsigned ac_get_wave_info(enum chip_class chip_class, struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]); +#ifdef __cplusplus +} +#endif + #endif diff -Nru mesa-19.2.8/src/amd/common/ac_gpu_info.c mesa-20.0.8/src/amd/common/ac_gpu_info.c --- mesa-19.2.8/src/amd/common/ac_gpu_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_gpu_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -24,6 +24,7 @@ */ #include "ac_gpu_info.h" +#include "addrlib/src/amdgpu_asic_addr.h" #include "sid.h" #include "util/macros.h" @@ -312,24 +313,66 @@ info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */ info->vce_harvest_config = amdinfo->vce_harvest_config; - switch (info->pci_id) { -#define CHIPSET(pci_id, cfamily) \ - case pci_id: \ - info->family = CHIP_##cfamily; \ - info->name = #cfamily; \ +#define identify_chip2(asic, chipname) \ + if (ASICREV_IS(amdinfo->chip_external_rev, asic)) { \ + info->family = CHIP_##chipname; \ + info->name = #chipname; \ + } +#define identify_chip(chipname) identify_chip2(chipname, chipname) + + switch (amdinfo->family_id) { + case FAMILY_SI: + identify_chip(TAHITI); + identify_chip(PITCAIRN); + identify_chip2(CAPEVERDE, VERDE); + identify_chip(OLAND); + identify_chip(HAINAN); + break; + case FAMILY_CI: + identify_chip(BONAIRE); + identify_chip(HAWAII); + break; + case FAMILY_KV: + identify_chip2(SPECTRE, KAVERI); + identify_chip2(SPOOKY, KAVERI); + identify_chip2(KALINDI, KABINI); + identify_chip2(GODAVARI, KABINI); + break; + case FAMILY_VI: + identify_chip(ICELAND); + identify_chip(TONGA); + identify_chip(FIJI); + identify_chip(POLARIS10); + identify_chip(POLARIS11); + identify_chip(POLARIS12); + identify_chip(VEGAM); + break; + case FAMILY_CZ: + identify_chip(CARRIZO); + identify_chip(STONEY); + break; + case FAMILY_AI: + identify_chip(VEGA10); + identify_chip(VEGA12); + identify_chip(VEGA20); + identify_chip(ARCTURUS); + break; + case FAMILY_RV: + identify_chip(RAVEN); + identify_chip(RAVEN2); + identify_chip(RENOIR); + break; + case FAMILY_NV: + identify_chip(NAVI10); + identify_chip(NAVI12); + identify_chip(NAVI14); break; -#include "pci_ids/radeonsi_pci_ids.h" -#undef CHIPSET - - default: - fprintf(stderr, "amdgpu: Invalid PCI ID.\n"); - return false; } - /* Raven2 uses the same PCI IDs as Raven1, but different revision IDs. */ - if (info->family == CHIP_RAVEN && amdinfo->chip_rev >= 0x8) { - info->family = CHIP_RAVEN2; - info->name = "RAVEN2"; + if (!info->name) { + fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n", + amdinfo->family_id, amdinfo->chip_external_rev); + return false; } if (info->family >= CHIP_NAVI10) @@ -367,6 +410,9 @@ else info->max_alloc_size = info->gart_size * 0.7; + /* Set which chips have uncached device memory. */ + info->has_l2_uncached = info->chip_class >= GFX9; + /* Set hardware information. */ info->gds_size = gds.gds_total_size; info->gds_gfx_partition_size = gds.gds_gfx_partition_size; @@ -427,9 +473,14 @@ } if (info->chip_class >= GFX10) { info->tcc_cache_line_size = 128; - /* This is a hack, but it's all we can do without a kernel upgrade. */ - info->tcc_harvested = - (info->vram_size / info->num_tcc_blocks) != 512*1024*1024; + + if (info->drm_minor >= 35) { + info->tcc_harvested = device_info.tcc_disabled_mask != 0; + } else { + /* This is a hack, but it's all we can do without a kernel upgrade. */ + info->tcc_harvested = + (info->vram_size / info->num_tcc_blocks) != 512*1024*1024; + } } else { info->tcc_cache_line_size = 64; } @@ -449,8 +500,65 @@ assert(util_is_power_of_two_or_zero(compute.available_rings + 1)); info->has_graphics = gfx.available_rings > 0; - info->num_sdma_rings = util_bitcount(dma.available_rings); - info->num_compute_rings = util_bitcount(compute.available_rings); + info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings); + info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings); + info->num_rings[RING_DMA] = util_bitcount(dma.available_rings); + info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings); + info->num_rings[RING_VCE] = util_bitcount(vce.available_rings); + info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings); + info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings); + info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings); + info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings); + + /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs + * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc. + * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. + */ + info->has_clear_state = info->chip_class >= GFX7; + + info->has_distributed_tess = info->chip_class >= GFX10 || + (info->chip_class >= GFX8 && info->max_se >= 2); + + info->has_dcc_constant_encode = info->family == CHIP_RAVEN2 || + info->family == CHIP_RENOIR || + info->chip_class >= GFX10; + + info->has_rbplus = info->family == CHIP_STONEY || + info->chip_class >= GFX9; + + /* Some chips have RB+ registers, but don't support RB+. Those must + * always disable it. + */ + info->rbplus_allowed = info->has_rbplus && + (info->family == CHIP_STONEY || + info->family == CHIP_VEGA12 || + info->family == CHIP_RAVEN || + info->family == CHIP_RAVEN2 || + info->family == CHIP_RENOIR); + + info->has_out_of_order_rast = info->chip_class >= GFX8 && + info->max_se >= 2; + + /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ + info->has_load_ctx_reg_pkt = info->chip_class >= GFX9 || + (info->chip_class >= GFX8 && + info->me_fw_feature >= 41); + + info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8; + + info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || + info->family == CHIP_RAVEN; + + info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 && + info->chip_class <= GFX9; + + info->has_msaa_sample_loc_bug = (info->family >= CHIP_POLARIS10 && + info->family <= CHIP_POLARIS12) || + info->family == CHIP_VEGA10 || + info->family == CHIP_RAVEN; + + info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || + info->family == CHIP_RAVEN; /* Get the number of good compute units. */ info->num_good_compute_units = 0; @@ -504,8 +612,61 @@ } info->has_gds_ordered_append = info->chip_class >= GFX7 && - info->drm_minor >= 29 && - HAVE_LLVM >= 0x0800; + info->drm_minor >= 29; + + if (info->chip_class >= GFX9) { + unsigned pc_lines = 0; + + switch (info->family) { + case CHIP_VEGA10: + case CHIP_VEGA12: + case CHIP_VEGA20: + pc_lines = 2048; + break; + case CHIP_RAVEN: + case CHIP_RAVEN2: + case CHIP_RENOIR: + case CHIP_NAVI10: + case CHIP_NAVI12: + pc_lines = 1024; + break; + case CHIP_NAVI14: + pc_lines = 512; + break; + case CHIP_ARCTURUS: + break; + default: + assert(0); + } + + info->pc_lines = pc_lines; + + if (info->chip_class >= GFX10) { + info->pbb_max_alloc_count = pc_lines / 3; + } else { + info->pbb_max_alloc_count = + MIN2(128, pc_lines / (4 * info->max_se)); + } + } + + /* The number of SDPs is the same as the number of TCCs for now. */ + if (info->chip_class >= GFX10) + info->num_sdp_interfaces = device_info.num_tcc_blocks; + + info->max_wave64_per_simd = info->family >= CHIP_POLARIS10 && + info->family <= CHIP_VEGAM ? 8 : 10; + + /* The number is per SIMD. There is enough SGPRs for the maximum number + * of Wave32, which is double the number for Wave64. + */ + if (info->chip_class >= GFX10) + info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd * 2; + else if (info->chip_class >= GFX8) + info->num_physical_sgprs_per_simd = 800; + else + info->num_physical_sgprs_per_simd = 512; + + info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256; return true; } @@ -543,16 +704,42 @@ printf(" pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci_domain, info->pci_bus, info->pci_dev, info->pci_func); + + printf(" name = %s\n", info->name); + printf(" marketing_name = %s\n", info->marketing_name); + printf(" is_pro_graphics = %u\n", info->is_pro_graphics); printf(" pci_id = 0x%x\n", info->pci_id); printf(" family = %i\n", info->family); printf(" chip_class = %i\n", info->chip_class); + printf(" family_id = %i\n", info->family_id); printf(" chip_external_rev = %i\n", info->chip_external_rev); - printf(" num_compute_rings = %u\n", info->num_compute_rings); - printf(" num_sdma_rings = %i\n", info->num_sdma_rings); printf(" clock_crystal_freq = %i\n", info->clock_crystal_freq); - printf(" tcc_cache_line_size = %u\n", info->tcc_cache_line_size); - printf(" tcc_harvested = %u\n", info->tcc_harvested); + printf("Features:\n"); + printf(" has_graphics = %i\n", info->has_graphics); + printf(" num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]); + printf(" num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]); + printf(" num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]); + printf(" num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]); + printf(" num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]); + printf(" num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]); + printf(" num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]); + printf(" num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]); + printf(" num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]); + printf(" has_clear_state = %u\n", info->has_clear_state); + printf(" has_distributed_tess = %u\n", info->has_distributed_tess); + printf(" has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode); + printf(" has_rbplus = %u\n", info->has_rbplus); + printf(" rbplus_allowed = %u\n", info->rbplus_allowed); + printf(" has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt); + printf(" has_out_of_order_rast = %u\n", info->has_out_of_order_rast); + printf(" cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory); + printf(" has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug); + printf(" has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug); + printf(" has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug); + printf(" has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug); + + printf("Display features:\n"); printf(" use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned); printf(" use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit); @@ -569,6 +756,11 @@ printf(" min_alloc_size = %u\n", info->min_alloc_size); printf(" address32_hi = %u\n", info->address32_hi); printf(" has_dedicated_vram = %u\n", info->has_dedicated_vram); + printf(" num_sdp_interfaces = %u\n", info->num_sdp_interfaces); + printf(" num_tcc_blocks = %i\n", info->num_tcc_blocks); + printf(" tcc_cache_line_size = %u\n", info->tcc_cache_line_size); + printf(" tcc_harvested = %u\n", info->tcc_harvested); + printf(" pc_lines = %u\n", info->pc_lines); printf("CP info:\n"); printf(" gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2); @@ -616,9 +808,11 @@ printf(" max_shader_clock = %i\n", info->max_shader_clock); printf(" num_good_compute_units = %i\n", info->num_good_compute_units); printf(" num_good_cu_per_sh = %i\n", info->num_good_cu_per_sh); - printf(" num_tcc_blocks = %i\n", info->num_tcc_blocks); printf(" max_se = %i\n", info->max_se); printf(" max_sh_per_se = %i\n", info->max_sh_per_se); + printf(" max_wave64_per_simd = %i\n", info->max_wave64_per_simd); + printf(" num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd); + printf(" num_physical_wave64_vgprs_per_simd = %i\n", info->num_physical_wave64_vgprs_per_simd); printf("Render backend info:\n"); printf(" pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); @@ -627,9 +821,17 @@ printf(" pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes); printf(" enabled_rb_mask = 0x%x\n", info->enabled_rb_mask); printf(" max_alignment = %u\n", (unsigned)info->max_alignment); + printf(" pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count); - printf("GB_ADDR_CONFIG:\n"); - if (info->chip_class >= GFX9) { + printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config); + if (info->chip_class >= GFX10) { + printf(" num_pipes = %u\n", + 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); + printf(" pipe_interleave_size = %u\n", + 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); + printf(" max_compressed_frags = %u\n", + 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); + } else if (info->chip_class == GFX9) { printf(" num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); printf(" pipe_interleave_size = %u\n", diff -Nru mesa-19.2.8/src/amd/common/ac_gpu_info.h mesa-20.0.8/src/amd/common/ac_gpu_info.h --- mesa-19.2.8/src/amd/common/ac_gpu_info.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_gpu_info.h 2020-06-12 01:21:16.000000000 +0000 @@ -53,13 +53,25 @@ enum chip_class chip_class; uint32_t family_id; uint32_t chip_external_rev; - bool has_graphics; /* false if the chip is compute-only */ - uint32_t num_compute_rings; - uint32_t num_sdma_rings; uint32_t clock_crystal_freq; - uint32_t tcc_cache_line_size; - bool tcc_harvested; + /* Features. */ + bool has_graphics; /* false if the chip is compute-only */ + uint32_t num_rings[NUM_RING_TYPES]; + bool has_clear_state; + bool has_distributed_tess; + bool has_dcc_constant_encode; + bool has_rbplus; /* if RB+ registers exist */ + bool rbplus_allowed; /* if RB+ is allowed */ + bool has_load_ctx_reg_pkt; + bool has_out_of_order_rast; + bool cpdma_prefetch_writes_memory; + bool has_gfx9_scissor_bug; + bool has_tc_compat_zrange_bug; + bool has_msaa_sample_loc_bug; + bool has_ls_vgpr_init_bug; + + /* Display features. */ /* There are 2 display DCC codepaths, because display expects unaligned DCC. */ /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */ bool use_display_dcc_unaligned; @@ -78,7 +90,13 @@ uint32_t min_alloc_size; uint32_t address32_hi; bool has_dedicated_vram; + bool has_l2_uncached; bool r600_has_virtual_memory; + uint32_t num_sdp_interfaces; + uint32_t num_tcc_blocks; + uint32_t tcc_cache_line_size; + bool tcc_harvested; + unsigned pc_lines; /* CP info. */ bool gfx_ib_pad_with_type2; @@ -129,9 +147,11 @@ uint32_t max_shader_clock; uint32_t num_good_compute_units; uint32_t num_good_cu_per_sh; - uint32_t num_tcc_blocks; uint32_t max_se; /* shader engines */ uint32_t max_sh_per_se; /* shader arrays per shader engine */ + uint32_t max_wave64_per_simd; + uint32_t num_physical_sgprs_per_simd; + uint32_t num_physical_wave64_vgprs_per_simd; /* Render backends (color + depth blocks). */ uint32_t r300_num_gb_pipes; @@ -146,6 +166,7 @@ uint32_t pipe_interleave_bytes; uint32_t enabled_rb_mask; /* GCN harvest config */ uint64_t max_alignment; /* from addrlib */ + uint32_t pbb_max_alloc_count; /* Tile modes. */ uint32_t si_tile_mode_array[32]; @@ -174,43 +195,6 @@ unsigned max_waves_per_sh, unsigned threadgroups_per_cu); -static inline unsigned ac_get_max_wave64_per_simd(enum radeon_family family) -{ - - switch (family) { - /* These always have 8 waves: */ - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - return 8; - default: - return 10; - } -} - -static inline unsigned ac_get_num_physical_vgprs(enum chip_class chip_class, - unsigned wave_size) -{ - /* The number is per SIMD. */ - if (chip_class >= GFX10) - return wave_size == 32 ? 1024 : 512; - else - return 256; -} - -static inline uint32_t -ac_get_num_physical_sgprs(const struct radeon_info *info) -{ - /* The number is per SIMD. There is enough SGPRs for the maximum number - * of Wave32, which is double the number for Wave64. - */ - if (info->chip_class >= GFX10) - return 128 * ac_get_max_wave64_per_simd(info->family) * 2; - - return info->chip_class >= GFX8 ? 800 : 512; -} - #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_build.c mesa-20.0.8/src/amd/common/ac_llvm_build.c --- mesa-19.2.8/src/amd/common/ac_llvm_build.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_build.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,4708 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ -/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ -#include "ac_llvm_build.h" - -#include - -#include "c11/threads.h" - -#include -#include - -#include "ac_llvm_util.h" -#include "ac_exp_param.h" -#include "util/bitscan.h" -#include "util/macros.h" -#include "util/u_atomic.h" -#include "util/u_math.h" -#include "sid.h" - -#include "shader_enums.h" - -#define AC_LLVM_INITIAL_CF_DEPTH 4 - -/* Data for if/else/endif and bgnloop/endloop control flow structures. - */ -struct ac_llvm_flow { - /* Loop exit or next part of if/else/endif. */ - LLVMBasicBlockRef next_block; - LLVMBasicBlockRef loop_entry_block; -}; - -/* Initialize module-independent parts of the context. - * - * The caller is responsible for initializing ctx::module and ctx::builder. - */ -void -ac_llvm_context_init(struct ac_llvm_context *ctx, - struct ac_llvm_compiler *compiler, - enum chip_class chip_class, enum radeon_family family, - enum ac_float_mode float_mode, unsigned wave_size, - unsigned ballot_mask_bits) -{ - LLVMValueRef args[1]; - - ctx->context = LLVMContextCreate(); - - ctx->chip_class = chip_class; - ctx->family = family; - ctx->wave_size = wave_size; - ctx->ballot_mask_bits = ballot_mask_bits; - ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32 - : compiler->tm, - ctx->context); - ctx->builder = ac_create_builder(ctx->context, float_mode); - - ctx->voidt = LLVMVoidTypeInContext(ctx->context); - ctx->i1 = LLVMInt1TypeInContext(ctx->context); - ctx->i8 = LLVMInt8TypeInContext(ctx->context); - ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); - ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); - ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); - ctx->intptr = ctx->i32; - ctx->f16 = LLVMHalfTypeInContext(ctx->context); - ctx->f32 = LLVMFloatTypeInContext(ctx->context); - ctx->f64 = LLVMDoubleTypeInContext(ctx->context); - ctx->v2i16 = LLVMVectorType(ctx->i16, 2); - ctx->v2i32 = LLVMVectorType(ctx->i32, 2); - ctx->v3i32 = LLVMVectorType(ctx->i32, 3); - ctx->v4i32 = LLVMVectorType(ctx->i32, 4); - ctx->v2f32 = LLVMVectorType(ctx->f32, 2); - ctx->v3f32 = LLVMVectorType(ctx->f32, 3); - ctx->v4f32 = LLVMVectorType(ctx->f32, 4); - ctx->v8i32 = LLVMVectorType(ctx->i32, 8); - ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); - ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); - - ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); - ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); - ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); - ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); - ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); - ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); - ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); - ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); - ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); - ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); - ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); - ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); - ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); - ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); - - ctx->i1false = LLVMConstInt(ctx->i1, 0, false); - ctx->i1true = LLVMConstInt(ctx->i1, 1, false); - - ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "range", 5); - - ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "invariant.load", 14); - - ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6); - - args[0] = LLVMConstReal(ctx->f32, 2.5); - ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1); - - ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "amdgpu.uniform", 14); - - ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); - ctx->flow = calloc(1, sizeof(*ctx->flow)); -} - -void -ac_llvm_context_dispose(struct ac_llvm_context *ctx) -{ - free(ctx->flow->stack); - free(ctx->flow); - ctx->flow = NULL; -} - -int -ac_get_llvm_num_components(LLVMValueRef value) -{ - LLVMTypeRef type = LLVMTypeOf(value); - unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind - ? LLVMGetVectorSize(type) - : 1; - return num_components; -} - -LLVMValueRef -ac_llvm_extract_elem(struct ac_llvm_context *ac, - LLVMValueRef value, - int index) -{ - if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { - assert(index == 0); - return value; - } - - return LLVMBuildExtractElement(ac->builder, value, - LLVMConstInt(ac->i32, index, false), ""); -} - -int -ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) -{ - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) - return LLVMGetIntTypeWidth(type); - - if (type == ctx->f16) - return 16; - if (type == ctx->f32) - return 32; - if (type == ctx->f64) - return 64; - - unreachable("Unhandled type kind in get_elem_bits"); -} - -unsigned -ac_get_type_size(LLVMTypeRef type) -{ - LLVMTypeKind kind = LLVMGetTypeKind(type); - - switch (kind) { - case LLVMIntegerTypeKind: - return LLVMGetIntTypeWidth(type) / 8; - case LLVMHalfTypeKind: - return 2; - case LLVMFloatTypeKind: - return 4; - case LLVMDoubleTypeKind: - return 8; - case LLVMPointerTypeKind: - if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) - return 4; - return 8; - case LLVMVectorTypeKind: - return LLVMGetVectorSize(type) * - ac_get_type_size(LLVMGetElementType(type)); - case LLVMArrayTypeKind: - return LLVMGetArrayLength(type) * - ac_get_type_size(LLVMGetElementType(type)); - default: - assert(0); - return 0; - } -} - -static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) -{ - if (t == ctx->i8) - return ctx->i8; - else if (t == ctx->f16 || t == ctx->i16) - return ctx->i16; - else if (t == ctx->f32 || t == ctx->i32) - return ctx->i32; - else if (t == ctx->f64 || t == ctx->i64) - return ctx->i64; - else - unreachable("Unhandled integer size"); -} - -LLVMTypeRef -ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) -{ - if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { - LLVMTypeRef elem_type = LLVMGetElementType(t); - return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), - LLVMGetVectorSize(t)); - } - if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { - switch (LLVMGetPointerAddressSpace(t)) { - case AC_ADDR_SPACE_GLOBAL: - return ctx->i64; - case AC_ADDR_SPACE_LDS: - return ctx->i32; - default: - unreachable("unhandled address space"); - } - } - return to_integer_type_scalar(ctx, t); -} - -LLVMValueRef -ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) -{ - LLVMTypeRef type = LLVMTypeOf(v); - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); - } - return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); -} - -LLVMValueRef -ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) -{ - LLVMTypeRef type = LLVMTypeOf(v); - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) - return v; - return ac_to_integer(ctx, v); -} - -static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) -{ - if (t == ctx->i8) - return ctx->i8; - else if (t == ctx->i16 || t == ctx->f16) - return ctx->f16; - else if (t == ctx->i32 || t == ctx->f32) - return ctx->f32; - else if (t == ctx->i64 || t == ctx->f64) - return ctx->f64; - else - unreachable("Unhandled float size"); -} - -LLVMTypeRef -ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) -{ - if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { - LLVMTypeRef elem_type = LLVMGetElementType(t); - return LLVMVectorType(to_float_type_scalar(ctx, elem_type), - LLVMGetVectorSize(t)); - } - return to_float_type_scalar(ctx, t); -} - -LLVMValueRef -ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) -{ - LLVMTypeRef type = LLVMTypeOf(v); - return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); -} - - -LLVMValueRef -ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, - LLVMTypeRef return_type, LLVMValueRef *params, - unsigned param_count, unsigned attrib_mask) -{ - LLVMValueRef function, call; - bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); - - function = LLVMGetNamedFunction(ctx->module, name); - if (!function) { - LLVMTypeRef param_types[32], function_type; - unsigned i; - - assert(param_count <= 32); - - for (i = 0; i < param_count; ++i) { - assert(params[i]); - param_types[i] = LLVMTypeOf(params[i]); - } - function_type = - LLVMFunctionType(return_type, param_types, param_count, 0); - function = LLVMAddFunction(ctx->module, name, function_type); - - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); - - if (!set_callsite_attrs) - ac_add_func_attributes(ctx->context, function, attrib_mask); - } - - call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); - if (set_callsite_attrs) - ac_add_func_attributes(ctx->context, call, attrib_mask); - return call; -} - -/** - * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with - * intrinsic names). - */ -void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) -{ - LLVMTypeRef elem_type = type; - - assert(bufsize >= 8); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - int ret = snprintf(buf, bufsize, "v%u", - LLVMGetVectorSize(type)); - if (ret < 0) { - char *type_name = LLVMPrintTypeToString(type); - fprintf(stderr, "Error building type name for: %s\n", - type_name); - LLVMDisposeMessage(type_name); - return; - } - elem_type = LLVMGetElementType(type); - buf += ret; - bufsize -= ret; - } - switch (LLVMGetTypeKind(elem_type)) { - default: break; - case LLVMIntegerTypeKind: - snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); - break; - case LLVMHalfTypeKind: - snprintf(buf, bufsize, "f16"); - break; - case LLVMFloatTypeKind: - snprintf(buf, bufsize, "f32"); - break; - case LLVMDoubleTypeKind: - snprintf(buf, bufsize, "f64"); - break; - } -} - -/** - * Helper function that builds an LLVM IR PHI node and immediately adds - * incoming edges. - */ -LLVMValueRef -ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, - unsigned count_incoming, LLVMValueRef *values, - LLVMBasicBlockRef *blocks) -{ - LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); - LLVMAddIncoming(phi, values, blocks, count_incoming); - return phi; -} - -void ac_build_s_barrier(struct ac_llvm_context *ctx) -{ - ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, - 0, AC_FUNC_ATTR_CONVERGENT); -} - -/* Prevent optimizations (at least of memory accesses) across the current - * point in the program by emitting empty inline assembly that is marked as - * having side effects. - * - * Optionally, a value can be passed through the inline assembly to prevent - * LLVM from hoisting calls to ReadNone functions. - */ -void -ac_build_optimization_barrier(struct ac_llvm_context *ctx, - LLVMValueRef *pvgpr) -{ - static int counter = 0; - - LLVMBuilderRef builder = ctx->builder; - char code[16]; - - snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); - - if (!pvgpr) { - LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); - LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); - LLVMBuildCall(builder, inlineasm, NULL, 0, ""); - } else { - LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); - LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); - LLVMValueRef vgpr = *pvgpr; - LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr); - unsigned vgpr_size = ac_get_type_size(vgpr_type); - LLVMValueRef vgpr0; - - assert(vgpr_size % 4 == 0); - - vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); - vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); - vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); - vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); - vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); - - *pvgpr = vgpr; - } -} - -LLVMValueRef -ac_build_shader_clock(struct ac_llvm_context *ctx) -{ - const char *intr = HAVE_LLVM >= 0x0900 && ctx->chip_class >= GFX8 ? - "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter"; - LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0); - return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); -} - -LLVMValueRef -ac_build_ballot(struct ac_llvm_context *ctx, - LLVMValueRef value) -{ - const char *name; - - if (HAVE_LLVM >= 0x900) { - if (ctx->wave_size == 64) - name = "llvm.amdgcn.icmp.i64.i32"; - else - name = "llvm.amdgcn.icmp.i32.i32"; - } else { - name = "llvm.amdgcn.icmp.i32"; - } - LLVMValueRef args[3] = { - value, - ctx->i32_0, - LLVMConstInt(ctx->i32, LLVMIntNE, 0) - }; - - /* We currently have no other way to prevent LLVM from lifting the icmp - * calls to a dominating basic block. - */ - ac_build_optimization_barrier(ctx, &args[0]); - - args[0] = ac_to_integer(ctx, args[0]); - - return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); -} - -LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, - LLVMValueRef value) -{ - const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1"; - LLVMValueRef args[3] = { - value, - ctx->i1false, - LLVMConstInt(ctx->i32, LLVMIntNE, 0), - }; - - assert(HAVE_LLVM >= 0x0800); - return ac_build_intrinsic(ctx, name, ctx->i64, args, 3, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); -} - -LLVMValueRef -ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) -{ - LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); - LLVMValueRef vote_set = ac_build_ballot(ctx, value); - return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); -} - -LLVMValueRef -ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) -{ - LLVMValueRef vote_set = ac_build_ballot(ctx, value); - return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, - LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); -} - -LLVMValueRef -ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) -{ - LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); - LLVMValueRef vote_set = ac_build_ballot(ctx, value); - - LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - vote_set, active_set, ""); - LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - vote_set, - LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); - return LLVMBuildOr(ctx->builder, all, none, ""); -} - -LLVMValueRef -ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, - unsigned value_count, unsigned component) -{ - LLVMValueRef vec = NULL; - - if (value_count == 1) { - return values[component]; - } else if (!value_count) - unreachable("value_count is 0"); - - for (unsigned i = component; i < value_count + component; i++) { - LLVMValueRef value = values[i]; - - if (i == component) - vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); - LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false); - vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, ""); - } - return vec; -} - -LLVMValueRef -ac_build_gather_values_extended(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count, - unsigned value_stride, - bool load, - bool always_vector) -{ - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef vec = NULL; - unsigned i; - - if (value_count == 1 && !always_vector) { - if (load) - return LLVMBuildLoad(builder, values[0], ""); - return values[0]; - } else if (!value_count) - unreachable("value_count is 0"); - - for (i = 0; i < value_count; i++) { - LLVMValueRef value = values[i * value_stride]; - if (load) - value = LLVMBuildLoad(builder, value, ""); - - if (!i) - vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); - LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); - vec = LLVMBuildInsertElement(builder, vec, value, index, ""); - } - return vec; -} - -LLVMValueRef -ac_build_gather_values(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count) -{ - return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); -} - -/* Expand a scalar or vector to by filling the remaining - * channels with undef. Extract at most src_channels components from the input. - */ -static LLVMValueRef -ac_build_expand(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned src_channels, - unsigned dst_channels) -{ - LLVMTypeRef elemtype; - LLVMValueRef chan[dst_channels]; - - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { - unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); - - if (src_channels == dst_channels && vec_size == dst_channels) - return value; - - src_channels = MIN2(src_channels, vec_size); - - for (unsigned i = 0; i < src_channels; i++) - chan[i] = ac_llvm_extract_elem(ctx, value, i); - - elemtype = LLVMGetElementType(LLVMTypeOf(value)); - } else { - if (src_channels) { - assert(src_channels == 1); - chan[0] = value; - } - elemtype = LLVMTypeOf(value); - } - - for (unsigned i = src_channels; i < dst_channels; i++) - chan[i] = LLVMGetUndef(elemtype); - - return ac_build_gather_values(ctx, chan, dst_channels); -} - -/* Extract components [start, start + channels) from a vector. - */ -LLVMValueRef -ac_extract_components(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned start, - unsigned channels) -{ - LLVMValueRef chan[channels]; - - for (unsigned i = 0; i < channels; i++) - chan[i] = ac_llvm_extract_elem(ctx, value, i + start); - - return ac_build_gather_values(ctx, chan, channels); -} - -/* Expand a scalar or vector to <4 x type> by filling the remaining channels - * with undef. Extract at most num_channels components from the input. - */ -LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned num_channels) -{ - return ac_build_expand(ctx, value, num_channels, 4); -} - -LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) -{ - unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); - const char *name; - - if (type_size == 2) - name = "llvm.rint.f16"; - else if (type_size == 4) - name = "llvm.rint.f32"; - else - name = "llvm.rint.f64"; - - return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef -ac_build_fdiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef den) -{ - /* If we do (num / den), LLVM >= 7.0 does: - * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f)); - * - * If we do (num * (1 / den)), LLVM does: - * return num * v_rcp_f32(den); - */ - LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); - LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); - LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); - - /* Use v_rcp_f32 instead of precise division. */ - if (!LLVMIsConstant(ret)) - LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); - return ret; -} - -/* See fast_idiv_by_const.h. */ -/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */ -LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment) -{ - LLVMBuilderRef builder = ctx->builder; - - num = LLVMBuildLShr(builder, num, pre_shift, ""); - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildAdd(builder, num, - LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); -} - -/* See fast_idiv_by_const.h. */ -/* If num != UINT_MAX, this more efficient version can be used. */ -/* Set: increment = util_fast_udiv_info::increment; */ -LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment) -{ - LLVMBuilderRef builder = ctx->builder; - - num = LLVMBuildLShr(builder, num, pre_shift, ""); - num = LLVMBuildNUWAdd(builder, num, increment, ""); - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); -} - -/* See fast_idiv_by_const.h. */ -/* Both operands must fit in 31 bits and the divisor must not be 1. */ -LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef post_shift) -{ - LLVMBuilderRef builder = ctx->builder; - - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); -} - -/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 - * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is - * already multiplied by two. id is the cube face number. - */ -struct cube_selection_coords { - LLVMValueRef stc[2]; - LLVMValueRef ma; - LLVMValueRef id; -}; - -static void -build_cube_intrinsic(struct ac_llvm_context *ctx, - LLVMValueRef in[3], - struct cube_selection_coords *out) -{ - LLVMTypeRef f32 = ctx->f32; - - out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", - f32, in, 3, AC_FUNC_ATTR_READNONE); -} - -/** - * Build a manual selection sequence for cube face sc/tc coordinates and - * major axis vector (multiplied by 2 for consistency) for the given - * vec3 \p coords, for the face implied by \p selcoords. - * - * For the major axis, we always adjust the sign to be in the direction of - * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards - * the selcoords major axis. - */ -static void build_cube_select(struct ac_llvm_context *ctx, - const struct cube_selection_coords *selcoords, - const LLVMValueRef *coords, - LLVMValueRef *out_st, - LLVMValueRef *out_ma) -{ - LLVMBuilderRef builder = ctx->builder; - LLVMTypeRef f32 = LLVMTypeOf(coords[0]); - LLVMValueRef is_ma_positive; - LLVMValueRef sgn_ma; - LLVMValueRef is_ma_z, is_not_ma_z; - LLVMValueRef is_ma_y; - LLVMValueRef is_ma_x; - LLVMValueRef sgn; - LLVMValueRef tmp; - - is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, - selcoords->ma, LLVMConstReal(f32, 0.0), ""); - sgn_ma = LLVMBuildSelect(builder, is_ma_positive, - LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), ""); - - is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), ""); - is_not_ma_z = LLVMBuildNot(builder, is_ma_z, ""); - is_ma_y = LLVMBuildAnd(builder, is_not_ma_z, - LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), ""); - is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), ""); - - /* Select sc */ - tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], ""); - sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0), - LLVMBuildSelect(builder, is_ma_z, sgn_ma, - LLVMBuildFNeg(builder, sgn_ma, ""), ""), ""); - out_st[0] = LLVMBuildFMul(builder, tmp, sgn, ""); - - /* Select tc */ - tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], ""); - sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, - LLVMConstReal(f32, -1.0), ""); - out_st[1] = LLVMBuildFMul(builder, tmp, sgn, ""); - - /* Select ma */ - tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], - LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), ""); - tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", - ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE); - *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), ""); -} - -void -ac_prepare_cube_coords(struct ac_llvm_context *ctx, - bool is_deriv, bool is_array, bool is_lod, - LLVMValueRef *coords_arg, - LLVMValueRef *derivs_arg) -{ - - LLVMBuilderRef builder = ctx->builder; - struct cube_selection_coords selcoords; - LLVMValueRef coords[3]; - LLVMValueRef invma; - - if (is_array && !is_lod) { - LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]); - - /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says: - * - * "For Array forms, the array layer used will be - * - * max(0, min(d−1, floor(layer+0.5))) - * - * where d is the depth of the texture array and layer - * comes from the component indicated in the tables below. - * Workaroudn for an issue where the layer is taken from a - * helper invocation which happens to fall on a different - * layer due to extrapolation." - * - * GFX8 and earlier attempt to implement this in hardware by - * clamping the value of coords[2] = (8 * layer) + face. - * Unfortunately, this means that the we end up with the wrong - * face when clamping occurs. - * - * Clamp the layer earlier to work around the issue. - */ - if (ctx->chip_class <= GFX8) { - LLVMValueRef ge0; - ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, ""); - tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, ""); - } - - coords_arg[3] = tmp; - } - - build_cube_intrinsic(ctx, coords_arg, &selcoords); - - invma = ac_build_intrinsic(ctx, "llvm.fabs.f32", - ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE); - invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma); - - for (int i = 0; i < 2; ++i) - coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, ""); - - coords[2] = selcoords.id; - - if (is_deriv && derivs_arg) { - LLVMValueRef derivs[4]; - int axis; - - /* Convert cube derivatives to 2D derivatives. */ - for (axis = 0; axis < 2; axis++) { - LLVMValueRef deriv_st[2]; - LLVMValueRef deriv_ma; - - /* Transform the derivative alongside the texture - * coordinate. Mathematically, the correct formula is - * as follows. Assume we're projecting onto the +Z face - * and denote by dx/dh the derivative of the (original) - * X texture coordinate with respect to horizontal - * window coordinates. The projection onto the +Z face - * plane is: - * - * f(x,z) = x/z - * - * Then df/dh = df/dx * dx/dh + df/dz * dz/dh - * = 1/z * dx/dh - x/z * 1/z * dz/dh. - * - * This motivatives the implementation below. - * - * Whether this actually gives the expected results for - * apps that might feed in derivatives obtained via - * finite differences is anyone's guess. The OpenGL spec - * seems awfully quiet about how textureGrad for cube - * maps should be handled. - */ - build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], - deriv_st, &deriv_ma); - - deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, ""); - - for (int i = 0; i < 2; ++i) - derivs[axis * 2 + i] = - LLVMBuildFSub(builder, - LLVMBuildFMul(builder, deriv_st[i], invma, ""), - LLVMBuildFMul(builder, deriv_ma, coords[i], ""), ""); - } - - memcpy(derivs_arg, derivs, sizeof(derivs)); - } - - /* Shift the texture coordinate. This must be applied after the - * derivative calculation. - */ - for (int i = 0; i < 2; ++i) - coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), ""); - - if (is_array) { - /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ - /* coords_arg.w component - array_index for cube arrays */ - coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]); - } - - memcpy(coords_arg, coords, sizeof(coords)); -} - - -LLVMValueRef -ac_build_fs_interp(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j) -{ - LLVMValueRef args[5]; - LLVMValueRef p1; - - args[0] = i; - args[1] = llvm_chan; - args[2] = attr_number; - args[3] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); - - args[0] = p1; - args[1] = j; - args[2] = llvm_chan; - args[3] = attr_number; - args[4] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef -ac_build_fs_interp_f16(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j) -{ - LLVMValueRef args[6]; - LLVMValueRef p1; - - args[0] = i; - args[1] = llvm_chan; - args[2] = attr_number; - args[3] = ctx->i1false; - args[4] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); - - args[0] = p1; - args[1] = j; - args[2] = llvm_chan; - args[3] = attr_number; - args[4] = ctx->i1false; - args[5] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", - ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef -ac_build_fs_interp_mov(struct ac_llvm_context *ctx, - LLVMValueRef parameter, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params) -{ - LLVMValueRef args[4]; - - args[0] = parameter; - args[1] = llvm_chan; - args[2] = attr_number; - args[3] = params; - - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef -ac_build_gep_ptr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index) -{ - return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); -} - -LLVMValueRef -ac_build_gep0(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index) -{ - LLVMValueRef indices[2] = { - ctx->i32_0, - index, - }; - return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); -} - -LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef index) -{ - return LLVMBuildPointerCast(ctx->builder, - LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""), - LLVMTypeOf(ptr), ""); -} - -void -ac_build_indexed_store(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - LLVMValueRef value) -{ - LLVMBuildStore(ctx->builder, value, - ac_build_gep0(ctx, base_ptr, index)); -} - -/** - * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. - * It's equivalent to doing a load from &base_ptr[index]. - * - * \param base_ptr Where the array starts. - * \param index The element index into the array. - * \param uniform Whether the base_ptr and index can be assumed to be - * dynamically uniform (i.e. load to an SGPR) - * \param invariant Whether the load is invariant (no other opcodes affect it) - * \param no_unsigned_wraparound - * For all possible re-associations and re-distributions of an expression - * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs - * without inbounds in base_ptr), this parameter is true if "addr + offset" - * does not result in an unsigned integer wraparound. This is used for - * optimal code generation of 32-bit pointer arithmetic. - * - * For example, a 32-bit immediate offset that causes a 32-bit unsigned - * integer wraparound can't be an imm offset in s_load_dword, because - * the instruction performs "addr + offset" in 64 bits. - * - * Expected usage for bindless textures by chaining GEPs: - * // possible unsigned wraparound, don't use InBounds: - * ptr1 = LLVMBuildGEP(base_ptr, index); - * image = load(ptr1); // becomes "s_load ptr1, 0" - * - * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); - * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds - */ -static LLVMValueRef -ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index, bool uniform, bool invariant, - bool no_unsigned_wraparound) -{ - LLVMValueRef pointer, result; - - if (no_unsigned_wraparound && - LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) - pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, ""); - else - pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); - - if (uniform) - LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); - result = LLVMBuildLoad(ctx->builder, pointer, ""); - if (invariant) - LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); - return result; -} - -LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index) -{ - return ac_build_load_custom(ctx, base_ptr, index, false, false, false); -} - -LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) -{ - return ac_build_load_custom(ctx, base_ptr, index, false, true, false); -} - -/* This assumes that there is no unsigned integer wraparound during the address - * computation, excluding all GEPs within base_ptr. */ -LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) -{ - return ac_build_load_custom(ctx, base_ptr, index, true, true, true); -} - -/* See ac_build_load_custom() documentation. */ -LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) -{ - return ac_build_load_custom(ctx, base_ptr, index, true, true, false); -} - -static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, - unsigned cache_policy) -{ - return cache_policy | - (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0); -} - -static void -ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool use_format) -{ - LLVMValueRef args[] = { - data, - LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), - vindex ? vindex : ctx->i32_0, - voffset, - LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0) - }; - unsigned func = CLAMP(num_channels, 1, 3) - 1; - - const char *type_names[] = {"f32", "v2f32", "v4f32"}; - char name[256]; - - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s", - type_names[func]); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s", - type_names[func]); - } - - ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args), - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); -} - -static void -ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned num_channels, - LLVMTypeRef return_channel_type, - unsigned cache_policy, - bool use_format, - bool structurized) -{ - LLVMValueRef args[6]; - int idx = 0; - args[idx++] = data; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", - indexing_kind, type_name); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", - indexing_kind, type_name); - } - - ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); -} - -void -ac_build_buffer_store_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy) -{ - if (HAVE_LLVM >= 0x800) { - ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex, - voffset, NULL, num_channels, - ctx->f32, cache_policy, - true, true); - } else { - ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset, - num_channels, cache_policy, - true); - } -} - -/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. - * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), - * or v4i32 (num_channels=3,4). - */ -void -ac_build_buffer_store_dword(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool swizzle_enable_hint) -{ - /* Split 3 channel stores, because only LLVM 9+ support 3-channel - * intrinsics. */ - if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) { - LLVMValueRef v[3], v01; - - for (int i = 0; i < 3; i++) { - v[i] = LLVMBuildExtractElement(ctx->builder, vdata, - LLVMConstInt(ctx->i32, i, 0), ""); - } - v01 = ac_build_gather_values(ctx, v, 2); - - ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, - soffset, inst_offset, cache_policy, - swizzle_enable_hint); - ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, - soffset, inst_offset + 8, - cache_policy, - swizzle_enable_hint); - return; - } - - /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset - * (voffset is swizzled, but soffset isn't swizzled). - * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. - */ - if (!swizzle_enable_hint) { - LLVMValueRef offset = soffset; - - if (inst_offset) - offset = LLVMBuildAdd(ctx->builder, offset, - LLVMConstInt(ctx->i32, inst_offset, 0), ""); - - if (HAVE_LLVM >= 0x800) { - ac_build_llvm8_buffer_store_common(ctx, rsrc, - ac_to_float(ctx, vdata), - ctx->i32_0, - voffset, offset, - num_channels, - ctx->f32, - cache_policy, - false, false); - } else { - if (voffset) - offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); - - ac_build_llvm7_buffer_store_common(ctx, rsrc, - ac_to_float(ctx, vdata), - ctx->i32_0, offset, - num_channels, cache_policy, - false); - } - return; - } - - static const unsigned dfmts[] = { - V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_DATA_FORMAT_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32_32 - }; - unsigned dfmt = dfmts[num_channels - 1]; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy); -} - -static LLVMValueRef -ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate, - bool use_format) -{ - LLVMValueRef args[] = { - LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), - vindex ? vindex : ctx->i32_0, - voffset, - LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0) - }; - unsigned func = CLAMP(num_channels, 1, 3) - 1; - - LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32}; - const char *type_names[] = {"f32", "v2f32", "v4f32"}; - char name[256]; - - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s", - type_names[func]); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", - type_names[func]); - } - - return ac_build_intrinsic(ctx, name, types[func], args, - ARRAY_SIZE(args), - ac_get_load_intr_attribs(can_speculate)); -} - -static LLVMValueRef -ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned num_channels, - LLVMTypeRef channel_type, - unsigned cache_policy, - bool can_speculate, - bool use_format, - bool structurized) -{ - LLVMValueRef args[5]; - int idx = 0; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", - indexing_kind, type_name); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", - indexing_kind, type_name); - } - - return ac_build_intrinsic(ctx, name, type, args, idx, - ac_get_load_intr_attribs(can_speculate)); -} - -LLVMValueRef -ac_build_buffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - int num_channels, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool can_speculate, - bool allow_smem) -{ - LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); - if (voffset) - offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); - if (soffset) - offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); - - if (allow_smem && !(cache_policy & ac_slc) && - (!(cache_policy & ac_glc) || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) { - assert(vindex == NULL); - - LLVMValueRef result[8]; - - for (int i = 0; i < num_channels; i++) { - if (i) { - offset = LLVMBuildAdd(ctx->builder, offset, - LLVMConstInt(ctx->i32, 4, 0), ""); - } - const char *intrname = - HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32" - : "llvm.SI.load.const.v4i32"; - unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2; - LLVMValueRef args[3] = { - rsrc, - offset, - LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), - }; - result[i] = ac_build_intrinsic(ctx, intrname, - ctx->f32, args, num_args, - AC_FUNC_ATTR_READNONE | - (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0)); - } - if (num_channels == 1) - return result[0]; - - if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) - result[num_channels++] = LLVMGetUndef(ctx->f32); - return ac_build_gather_values(ctx, result, num_channels); - } - - if (HAVE_LLVM >= 0x0800) { - return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, - offset, ctx->i32_0, - num_channels, ctx->f32, - cache_policy, - can_speculate, false, - false); - } - - return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset, - num_channels, cache_policy, - can_speculate, false); -} - -LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate) -{ - if (HAVE_LLVM >= 0x800) { - return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, - num_channels, ctx->f32, - cache_policy, can_speculate, true, true); - } - return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset, - num_channels, cache_policy, - can_speculate, true); -} - -LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate) -{ - if (HAVE_LLVM >= 0x800) { - return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, - num_channels, ctx->f32, - cache_policy, can_speculate, true, true); - } - - LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), ""); - LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, ""); - stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), ""); - - LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""), - elem_count, stride, ""); - - LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count, - LLVMConstInt(ctx->i32, 2, 0), ""); - - return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset, - num_channels, cache_policy, - can_speculate, true); -} - -/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format -/// value for LLVM8+ tbuffer intrinsics. -static unsigned -ac_get_tbuffer_format(struct ac_llvm_context *ctx, - unsigned dfmt, unsigned nfmt) -{ - if (ctx->chip_class >= GFX10) { - unsigned format; - switch (dfmt) { - default: unreachable("bad dfmt"); - case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break; - case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break; - } - - // Use the regularity properties of the combined format enum. - // - // Note: float is incompatible with 8-bit data formats, - // [us]{norm,scaled} are incomparible with 32-bit data formats. - // [us]scaled are not writable. - switch (nfmt) { - case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break; - case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break; - case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break; - case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break; - default: unreachable("bad nfmt"); - case V_008F0C_BUF_NUM_FORMAT_UINT: break; - case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break; - case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break; - } - - return format; - } else { - return dfmt | (nfmt << 4); - } -} - -static LLVMValueRef -ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate, - bool structurized) -{ - LLVMValueRef args[6]; - int idx = 0; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0); - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", - indexing_kind, type_name); - - return ac_build_intrinsic(ctx, name, type, args, idx, - ac_get_load_intr_attribs(can_speculate)); -} - -static LLVMValueRef -ac_build_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate, - bool structurized) /* only matters for LLVM 8+ */ -{ - if (HAVE_LLVM >= 0x800) { - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset, - soffset, num_channels, - dfmt, nfmt, cache_policy, - can_speculate, structurized); - } - - LLVMValueRef args[] = { - rsrc, - vindex ? vindex : ctx->i32_0, - voffset, - soffset, - immoffset, - LLVMConstInt(ctx->i32, dfmt, false), - LLVMConstInt(ctx->i32, nfmt, false), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false), - }; - unsigned func = CLAMP(num_channels, 1, 3) - 1; - LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32}; - const char *type_names[] = {"i32", "v2i32", "v4i32"}; - char name[256]; - - snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s", - type_names[func]); - - return ac_build_intrinsic(ctx, name, types[func], args, 9, - ac_get_load_intr_attribs(can_speculate)); -} - -LLVMValueRef -ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate) -{ - return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, - cache_policy, can_speculate, true); -} - -LLVMValueRef -ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate) -{ - return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, - cache_policy, can_speculate, false); -} - -LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy) -{ - LLVMValueRef res; - - if (HAVE_LLVM >= 0x900) { - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL, - voffset, soffset, - 1, ctx->i16, cache_policy, - false, false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, - immoffset, 1, dfmt, nfmt, cache_policy, - false); - - res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); - } - - return res; -} - -LLVMValueRef -ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy) -{ - LLVMValueRef res; - - if (HAVE_LLVM >= 0x900) { - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL, - voffset, soffset, - 1, ctx->i8, cache_policy, - false, false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, - immoffset, 1, dfmt, nfmt, cache_policy, - false); - - res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); - } - - return res; -} - -/** - * Convert an 11- or 10-bit unsigned floating point number to an f32. - * - * The input exponent is expected to be biased analogous to IEEE-754, i.e. by - * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs). - */ -static LLVMValueRef -ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits) -{ - assert(LLVMTypeOf(src) == ctx->i32); - - LLVMValueRef tmp; - LLVMValueRef mantissa; - mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), ""); - - /* Converting normal numbers is just a shift + correcting the exponent bias */ - unsigned normal_shift = 23 - mant_bits; - unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); - LLVMValueRef shifted, normal; - - shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), ""); - normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), ""); - - /* Converting nan/inf numbers is the same, but with a different exponent update */ - LLVMValueRef naninf; - naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), ""); - - /* Converting denormals is the complex case: determine the leading zeros of the - * mantissa to obtain the correct shift for the mantissa and exponent correction. - */ - LLVMValueRef denormal; - LLVMValueRef params[2] = { - mantissa, - ctx->i1true, /* result can be undef when arg is 0 */ - }; - LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, - params, 2, AC_FUNC_ATTR_READNONE); - - /* Shift such that the leading 1 ends up as the LSB of the exponent field. */ - tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), ""); - denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, ""); - - unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; - tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, ""); - tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), ""); - denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, ""); - - /* Select the final result. */ - LLVMValueRef result; - - tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, - LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), ""); - result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, ""); - - tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, - LLVMConstInt(ctx->i32, 1 << mant_bits, false), ""); - result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, ""); - - tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, ""); - result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, ""); - - return ac_to_float(ctx, result); -} - -/** - * Generate a fully general open coded buffer format fetch with all required - * fixups suitable for vertex fetch, using non-format buffer loads. - * - * Some combinations of argument values have special interpretations: - * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT - * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format - * - * \param log_size log(size of channel in bytes) - * \param num_channels number of channels (1 to 4) - * \param format AC_FETCH_FORMAT_xxx value - * \param reverse whether XYZ channels are reversed - * \param known_aligned whether the source is known to be aligned to hardware's - * effective element size for loading the given format - * (note: this means dword alignment for 8_8_8_8, 16_16, etc.) - * \param rsrc buffer resource descriptor - * \return the resulting vector of floats or integers bitcast to <4 x i32> - */ -LLVMValueRef -ac_build_opencoded_load_format(struct ac_llvm_context *ctx, - unsigned log_size, - unsigned num_channels, - unsigned format, - bool reverse, - bool known_aligned, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy, - bool can_speculate) -{ - LLVMValueRef tmp; - unsigned load_log_size = log_size; - unsigned load_num_channels = num_channels; - if (log_size == 3) { - load_log_size = 2; - if (format == AC_FETCH_FORMAT_FLOAT) { - load_num_channels = 2 * num_channels; - } else { - load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */ - } - } - - int log_recombine = 0; - if (ctx->chip_class == GFX6 && !known_aligned) { - /* Avoid alignment restrictions by loading one byte at a time. */ - load_num_channels <<= load_log_size; - log_recombine = load_log_size; - load_log_size = 0; - } else if (load_num_channels == 2 || load_num_channels == 4) { - log_recombine = -util_logbase2(load_num_channels); - load_num_channels = 1; - load_log_size += -log_recombine; - } - - assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900); - - LLVMValueRef loads[32]; /* up to 32 bytes */ - for (unsigned i = 0; i < load_num_channels; ++i) { - tmp = LLVMBuildAdd(ctx->builder, soffset, - LLVMConstInt(ctx->i32, i << load_log_size, false), ""); - if (HAVE_LLVM >= 0x0800) { - LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 : - load_log_size == 1 ? ctx->i16 : ctx->i32; - unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2); - loads[i] = ac_build_llvm8_buffer_load_common( - ctx, rsrc, vindex, voffset, tmp, - num_channels, channel_type, cache_policy, - can_speculate, false, true); - } else { - tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, ""); - loads[i] = ac_build_llvm7_buffer_load_common( - ctx, rsrc, vindex, tmp, - 1 << (load_log_size - 2), cache_policy, can_speculate, false); - } - if (load_log_size >= 2) - loads[i] = ac_to_integer(ctx, loads[i]); - } - - if (log_recombine > 0) { - /* Recombine bytes if necessary (GFX6 only) */ - LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16; - - for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { - LLVMValueRef accum = NULL; - for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { - tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); - if (i == 0) { - accum = tmp; - } else { - tmp = LLVMBuildShl(ctx->builder, tmp, - LLVMConstInt(dst_type, 8 * i, false), ""); - accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); - } - } - loads[dst] = accum; - } - } else if (log_recombine < 0) { - /* Split vectors of dwords */ - if (load_log_size > 2) { - assert(load_num_channels == 1); - LLVMValueRef loaded = loads[0]; - unsigned log_split = load_log_size - 2; - log_recombine += log_split; - load_num_channels = 1 << log_split; - load_log_size = 2; - for (unsigned i = 0; i < load_num_channels; ++i) { - tmp = LLVMConstInt(ctx->i32, i, false); - loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); - } - } - - /* Further split dwords and shorts if required */ - if (log_recombine < 0) { - for (unsigned src = load_num_channels, - dst = load_num_channels << -log_recombine; - src > 0; --src) { - unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); - LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); - LLVMValueRef loaded = loads[src - 1]; - LLVMTypeRef loaded_type = LLVMTypeOf(loaded); - for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { - tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); - tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); - loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); - } - } - } - } - - if (log_size == 3) { - if (format == AC_FETCH_FORMAT_FLOAT) { - for (unsigned i = 0; i < num_channels; ++i) { - tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); - loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); - } - } else if (format == AC_FETCH_FORMAT_FIXED) { - /* 10_11_11_FLOAT */ - LLVMValueRef data = loads[0]; - LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); - LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); - LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); - LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); - - loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); - loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); - loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); - - num_channels = 3; - log_size = 2; - format = AC_FETCH_FORMAT_FLOAT; - } else { - /* 2_10_10_10 data formats */ - LLVMValueRef data = loads[0]; - LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); - LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); - loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); - loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); - loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); - loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); - - num_channels = 4; - } - } - - if (format == AC_FETCH_FORMAT_FLOAT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) { - tmp = ac_to_float(ctx, loads[chan]); - if (log_size == 3) - tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); - else if (log_size == 1) - tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); - loads[chan] = ac_to_integer(ctx, tmp); - } - } - } else if (format == AC_FETCH_FORMAT_UINT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) - loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); - } - } else if (format == AC_FETCH_FORMAT_SINT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) - loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); - } - } else { - bool unsign = format == AC_FETCH_FORMAT_UNORM || - format == AC_FETCH_FORMAT_USCALED || - format == AC_FETCH_FORMAT_UINT; - - for (unsigned chan = 0; chan < num_channels; ++chan) { - if (unsign) { - tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); - } else { - tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); - } - - LLVMValueRef scale = NULL; - if (format == AC_FETCH_FORMAT_FIXED) { - assert(log_size == 2); - scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); - } else if (format == AC_FETCH_FORMAT_UNORM) { - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); - scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); - } else if (format == AC_FETCH_FORMAT_SNORM) { - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); - scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); - } - if (scale) - tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); - - if (format == AC_FETCH_FORMAT_SNORM) { - /* Clamp to [-1, 1] */ - LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); - LLVMValueRef clamp = - LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); - } - - loads[chan] = ac_to_integer(ctx, tmp); - } - } - - while (num_channels < 4) { - if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { - loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; - } else { - loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); - } - num_channels++; - } - - if (reverse) { - tmp = loads[0]; - loads[0] = loads[2]; - loads[2] = tmp; - } - - return ac_build_gather_values(ctx, loads, 4); -} - -static void -ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool structurized) -{ - LLVMValueRef args[7]; - int idx = 0; - args[idx++] = vdata; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0); - args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", - indexing_kind, type_name); - - ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); -} - -static void -ac_build_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool structurized) /* only matters for LLVM 8+ */ -{ - if (HAVE_LLVM >= 0x800) { - voffset = LLVMBuildAdd(ctx->builder, - voffset ? voffset : ctx->i32_0, - immoffset, ""); - - ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, - soffset, num_channels, dfmt, nfmt, - cache_policy, structurized); - } else { - LLVMValueRef params[] = { - vdata, - rsrc, - vindex ? vindex : ctx->i32_0, - voffset ? voffset : ctx->i32_0, - soffset ? soffset : ctx->i32_0, - immoffset, - LLVMConstInt(ctx->i32, dfmt, false), - LLVMConstInt(ctx->i32, nfmt, false), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false), - LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false), - }; - unsigned func = CLAMP(num_channels, 1, 3) - 1; - const char *type_names[] = {"i32", "v2i32", "v4i32"}; - char name[256]; - - snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s", - type_names[func]); - - ac_build_intrinsic(ctx, name, ctx->voidt, params, 10, - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); - } -} - -void -ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy) -{ - ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy, - true); -} - -void -ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy) -{ - ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy, - false); -} - -void -ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy) -{ - vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); - - if (HAVE_LLVM >= 0x900) { - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, 1, - ctx->i16, cache_policy, - false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - ctx->i32_0, 1, dfmt, nfmt, cache_policy); - } -} - -void -ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy) -{ - vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); - - if (HAVE_LLVM >= 0x900) { - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, 1, - ctx->i8, cache_policy, - false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - ctx->i32_0, 1, dfmt, nfmt, cache_policy); - } -} -/** - * Set range metadata on an instruction. This can only be used on load and - * call instructions. If you know an instruction can only produce the values - * 0, 1, 2, you would do set_range_metadata(value, 0, 3); - * \p lo is the minimum value inclusive. - * \p hi is the maximum value exclusive. - */ -static void set_range_metadata(struct ac_llvm_context *ctx, - LLVMValueRef value, unsigned lo, unsigned hi) -{ - LLVMValueRef range_md, md_args[2]; - LLVMTypeRef type = LLVMTypeOf(value); - LLVMContextRef context = LLVMGetTypeContext(type); - - md_args[0] = LLVMConstInt(type, lo, false); - md_args[1] = LLVMConstInt(type, hi, false); - range_md = LLVMMDNodeInContext(context, md_args, 2); - LLVMSetMetadata(value, ctx->range_md_kind, range_md); -} - -LLVMValueRef -ac_get_thread_id(struct ac_llvm_context *ctx) -{ - LLVMValueRef tid; - - LLVMValueRef tid_args[2]; - tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); - tid_args[1] = ctx->i32_0; - tid_args[1] = ac_build_intrinsic(ctx, - "llvm.amdgcn.mbcnt.lo", ctx->i32, - tid_args, 2, AC_FUNC_ATTR_READNONE); - - if (ctx->wave_size == 32) { - tid = tid_args[1]; - } else { - tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", - ctx->i32, tid_args, - 2, AC_FUNC_ATTR_READNONE); - } - set_range_metadata(ctx, tid, 0, ctx->wave_size); - return tid; -} - -/* - * AMD GCN implements derivatives using the local data store (LDS) - * All writes to the LDS happen in all executing threads at - * the same time. TID is the Thread ID for the current - * thread and is a value between 0 and 63, representing - * the thread's position in the wavefront. - * - * For the pixel shader threads are grouped into quads of four pixels. - * The TIDs of the pixels of a quad are: - * - * +------+------+ - * |4n + 0|4n + 1| - * +------+------+ - * |4n + 2|4n + 3| - * +------+------+ - * - * So, masking the TID with 0xfffffffc yields the TID of the top left pixel - * of the quad, masking with 0xfffffffd yields the TID of the top pixel of - * the current pixel's column, and masking with 0xfffffffe yields the TID - * of the left pixel of the current pixel's row. - * - * Adding 1 yields the TID of the pixel to the right of the left pixel, and - * adding 2 yields the TID of the pixel below the top pixel. - */ -LLVMValueRef -ac_build_ddxy(struct ac_llvm_context *ctx, - uint32_t mask, - int idx, - LLVMValueRef val) -{ - unsigned tl_lanes[4], trbl_lanes[4]; - char name[32], type[8]; - LLVMValueRef tl, trbl; - LLVMTypeRef result_type; - LLVMValueRef result; - - result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); - - if (result_type == ctx->f16) - val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); - - for (unsigned i = 0; i < 4; ++i) { - tl_lanes[i] = i & mask; - trbl_lanes[i] = (i & mask) + idx; - } - - tl = ac_build_quad_swizzle(ctx, val, - tl_lanes[0], tl_lanes[1], - tl_lanes[2], tl_lanes[3]); - trbl = ac_build_quad_swizzle(ctx, val, - trbl_lanes[0], trbl_lanes[1], - trbl_lanes[2], trbl_lanes[3]); - - if (result_type == ctx->f16) { - tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); - trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); - } - - tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); - trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); - result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); - - ac_build_type_name_for_intr(result_type, type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); - - return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); -} - -void -ac_build_sendmsg(struct ac_llvm_context *ctx, - uint32_t msg, - LLVMValueRef wave_id) -{ - LLVMValueRef args[2]; - args[0] = LLVMConstInt(ctx->i32, msg, false); - args[1] = wave_id; - ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); -} - -LLVMValueRef -ac_build_imsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type) -{ - LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", - dst_type, &arg, 1, - AC_FUNC_ATTR_READNONE); - - /* The HW returns the last bit index from MSB, but NIR/TGSI wants - * the index from LSB. Invert it by doing "31 - msb". */ - msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), - msb, ""); - - LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); - LLVMValueRef cond = LLVMBuildOr(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - arg, ctx->i32_0, ""), - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - arg, all_ones, ""), ""); - - return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); -} - -LLVMValueRef -ac_build_umsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type) -{ - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef highest_bit; - LLVMValueRef zero; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); - switch (bitsize) { - case 64: - intrin_name = "llvm.ctlz.i64"; - type = ctx->i64; - highest_bit = LLVMConstInt(ctx->i64, 63, false); - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.ctlz.i32"; - type = ctx->i32; - highest_bit = LLVMConstInt(ctx->i32, 31, false); - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.ctlz.i16"; - type = ctx->i16; - highest_bit = LLVMConstInt(ctx->i16, 15, false); - zero = ctx->i16_0; - break; - case 8: - intrin_name = "llvm.ctlz.i8"; - type = ctx->i8; - highest_bit = LLVMConstInt(ctx->i8, 7, false); - zero = ctx->i8_0; - break; - default: - unreachable(!"invalid bitsize"); - break; - } - - LLVMValueRef params[2] = { - arg, - ctx->i1true, - }; - - LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, - params, 2, - AC_FUNC_ATTR_READNONE); - - /* The HW returns the last bit index from MSB, but TGSI/NIR wants - * the index from LSB. Invert it by doing "31 - msb". */ - msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); - - if (bitsize == 64) { - msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); - } else if (bitsize < 32) { - msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); - } - - /* check for zero */ - return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), - LLVMConstInt(ctx->i32, -1, true), msb, ""); -} - -LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - char name[64]; - snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); - LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - char name[64]; - snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); - LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); -} - -LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); -} - -LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); -} - -LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) -{ - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); -} - -LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) -{ - LLVMTypeRef t = LLVMTypeOf(value); - return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), - LLVMConstReal(t, 1.0)); -} - -void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) -{ - LLVMValueRef args[9]; - - args[0] = LLVMConstInt(ctx->i32, a->target, 0); - args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); - - if (a->compr) { - LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context); - LLVMTypeRef v2i16 = LLVMVectorType(i16, 2); - - args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], - v2i16, ""); - args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], - v2i16, ""); - args[4] = LLVMConstInt(ctx->i1, a->done, 0); - args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); - - ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", - ctx->voidt, args, 6, 0); - } else { - args[2] = a->out[0]; - args[3] = a->out[1]; - args[4] = a->out[2]; - args[5] = a->out[3]; - args[6] = LLVMConstInt(ctx->i1, a->done, 0); - args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); - - ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", - ctx->voidt, args, 8, 0); - } -} - -void ac_build_export_null(struct ac_llvm_context *ctx) -{ - struct ac_export_args args; - - args.enabled_channels = 0x0; /* enabled channels */ - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - args.target = V_008DFC_SQ_EXP_NULL; - args.compr = 0; /* COMPR flag (0 = 32-bit export) */ - args.out[0] = LLVMGetUndef(ctx->f32); /* R */ - args.out[1] = LLVMGetUndef(ctx->f32); /* G */ - args.out[2] = LLVMGetUndef(ctx->f32); /* B */ - args.out[3] = LLVMGetUndef(ctx->f32); /* A */ - - ac_build_export(ctx, &args); -} - -static unsigned ac_num_coords(enum ac_image_dim dim) -{ - switch (dim) { - case ac_image_1d: - return 1; - case ac_image_2d: - case ac_image_1darray: - return 2; - case ac_image_3d: - case ac_image_cube: - case ac_image_2darray: - case ac_image_2dmsaa: - return 3; - case ac_image_2darraymsaa: - return 4; - default: - unreachable("ac_num_coords: bad dim"); - } -} - -static unsigned ac_num_derivs(enum ac_image_dim dim) -{ - switch (dim) { - case ac_image_1d: - case ac_image_1darray: - return 2; - case ac_image_2d: - case ac_image_2darray: - case ac_image_cube: - return 4; - case ac_image_3d: - return 6; - case ac_image_2dmsaa: - case ac_image_2darraymsaa: - default: - unreachable("derivatives not supported"); - } -} - -static const char *get_atomic_name(enum ac_atomic_op op) -{ - switch (op) { - case ac_atomic_swap: return "swap"; - case ac_atomic_add: return "add"; - case ac_atomic_sub: return "sub"; - case ac_atomic_smin: return "smin"; - case ac_atomic_umin: return "umin"; - case ac_atomic_smax: return "smax"; - case ac_atomic_umax: return "umax"; - case ac_atomic_and: return "and"; - case ac_atomic_or: return "or"; - case ac_atomic_xor: return "xor"; - case ac_atomic_inc_wrap: return "inc"; - case ac_atomic_dec_wrap: return "dec"; - } - unreachable("bad atomic op"); -} - -LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, - struct ac_image_args *a) -{ - const char *overload[3] = { "", "", "" }; - unsigned num_overloads = 0; - LLVMValueRef args[18]; - unsigned num_args = 0; - enum ac_image_dim dim = a->dim; - - assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || - !a->level_zero); - assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && - a->opcode != ac_image_store_mip) || - a->lod); - assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || - (!a->compare && !a->offset)); - assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || - a->opcode == ac_image_get_lod) || - !a->bias); - assert((a->bias ? 1 : 0) + - (a->lod ? 1 : 0) + - (a->level_zero ? 1 : 0) + - (a->derivs[0] ? 1 : 0) <= 1); - - if (a->opcode == ac_image_get_lod) { - switch (dim) { - case ac_image_1darray: - dim = ac_image_1d; - break; - case ac_image_2darray: - case ac_image_cube: - dim = ac_image_2d; - break; - default: - break; - } - } - - bool sample = a->opcode == ac_image_sample || - a->opcode == ac_image_gather4 || - a->opcode == ac_image_get_lod; - bool atomic = a->opcode == ac_image_atomic || - a->opcode == ac_image_atomic_cmpswap; - bool load = a->opcode == ac_image_sample || - a->opcode == ac_image_gather4 || - a->opcode == ac_image_load || - a->opcode == ac_image_load_mip; - LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; - - if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { - args[num_args++] = a->data[0]; - if (a->opcode == ac_image_atomic_cmpswap) - args[num_args++] = a->data[1]; - } - - if (!atomic) - args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false); - - if (a->offset) - args[num_args++] = ac_to_integer(ctx, a->offset); - if (a->bias) { - args[num_args++] = ac_to_float(ctx, a->bias); - overload[num_overloads++] = ".f32"; - } - if (a->compare) - args[num_args++] = ac_to_float(ctx, a->compare); - if (a->derivs[0]) { - unsigned count = ac_num_derivs(dim); - for (unsigned i = 0; i < count; ++i) - args[num_args++] = ac_to_float(ctx, a->derivs[i]); - overload[num_overloads++] = ".f32"; - } - unsigned num_coords = - a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; - for (unsigned i = 0; i < num_coords; ++i) - args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); - if (a->lod) - args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); - overload[num_overloads++] = sample ? ".f32" : ".i32"; - - args[num_args++] = a->resource; - if (sample) { - args[num_args++] = a->sampler; - args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); - } - - args[num_args++] = ctx->i32_0; /* texfailctrl */ - args[num_args++] = LLVMConstInt(ctx->i32, - load ? get_load_cache_policy(ctx, a->cache_policy) : - a->cache_policy, false); - - const char *name; - const char *atomic_subop = ""; - switch (a->opcode) { - case ac_image_sample: name = "sample"; break; - case ac_image_gather4: name = "gather4"; break; - case ac_image_load: name = "load"; break; - case ac_image_load_mip: name = "load.mip"; break; - case ac_image_store: name = "store"; break; - case ac_image_store_mip: name = "store.mip"; break; - case ac_image_atomic: - name = "atomic."; - atomic_subop = get_atomic_name(a->atomic); - break; - case ac_image_atomic_cmpswap: - name = "atomic."; - atomic_subop = "cmpswap"; - break; - case ac_image_get_lod: name = "getlod"; break; - case ac_image_get_resinfo: name = "getresinfo"; break; - default: unreachable("invalid image opcode"); - } - - const char *dimname; - switch (dim) { - case ac_image_1d: dimname = "1d"; break; - case ac_image_2d: dimname = "2d"; break; - case ac_image_3d: dimname = "3d"; break; - case ac_image_cube: dimname = "cube"; break; - case ac_image_1darray: dimname = "1darray"; break; - case ac_image_2darray: dimname = "2darray"; break; - case ac_image_2dmsaa: dimname = "2dmsaa"; break; - case ac_image_2darraymsaa: dimname = "2darraymsaa"; break; - default: unreachable("invalid dim"); - } - - bool lod_suffix = - a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); - char intr_name[96]; - snprintf(intr_name, sizeof(intr_name), - "llvm.amdgcn.image.%s%s" /* base name */ - "%s%s%s" /* sample/gather modifiers */ - ".%s.%s%s%s%s", /* dimension and type overloads */ - name, atomic_subop, - a->compare ? ".c" : "", - a->bias ? ".b" : - lod_suffix ? ".l" : - a->derivs[0] ? ".d" : - a->level_zero ? ".lz" : "", - a->offset ? ".o" : "", - dimname, - atomic ? "i32" : "v4f32", - overload[0], overload[1], overload[2]); - - LLVMTypeRef retty; - if (atomic) - retty = ctx->i32; - else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) - retty = ctx->voidt; - else - retty = ctx->v4f32; - - LLVMValueRef result = - ac_build_intrinsic(ctx, intr_name, retty, args, num_args, - a->attributes); - if (!sample && retty == ctx->v4f32) { - result = LLVMBuildBitCast(ctx->builder, result, - ctx->v4i32, ""); - } - return result; -} - -LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - LLVMTypeRef v2f16 = - LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2); - - return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16, - args, 2, AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); -} - -LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); -} - -/* The 8-bit and 10-bit clamping is for HW workarounds. */ -LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi) -{ - assert(bits == 8 || bits == 10 || bits == 16); - - LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); - LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); - LLVMValueRef max_alpha = - bits != 10 ? max_rgb : ctx->i32_1; - LLVMValueRef min_alpha = - bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); - - /* Clamp. */ - if (bits != 16) { - for (int i = 0; i < 2; i++) { - bool alpha = hi && i == 1; - args[i] = ac_build_imin(ctx, args[i], - alpha ? max_alpha : max_rgb); - args[i] = ac_build_imax(ctx, args[i], - alpha ? min_alpha : min_rgb); - } - } - - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); -} - -/* The 8-bit and 10-bit clamping is for HW workarounds. */ -LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi) -{ - assert(bits == 8 || bits == 10 || bits == 16); - - LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); - LLVMValueRef max_alpha = - bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); - - /* Clamp. */ - if (bits != 16) { - for (int i = 0; i < 2; i++) { - bool alpha = hi && i == 1; - args[i] = ac_build_umin(ctx, args[i], - alpha ? max_alpha : max_rgb); - } - } - - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); -} - -LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) -{ - return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, - &i1, 1, AC_FUNC_ATTR_READNONE); -} - -void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) -{ - ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, - &i1, 1, 0); -} - -LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, - LLVMValueRef offset, LLVMValueRef width, - bool is_signed) -{ - LLVMValueRef args[] = { - input, - offset, - width, - }; - - LLVMValueRef result = ac_build_intrinsic(ctx, - is_signed ? "llvm.amdgcn.sbfe.i32" : - "llvm.amdgcn.ubfe.i32", - ctx->i32, args, 3, - AC_FUNC_ATTR_READNONE); - - if (HAVE_LLVM < 0x0800) { - /* FIXME: LLVM 7+ returns incorrect result when count is 0. - * https://bugs.freedesktop.org/show_bug.cgi?id=107276 - */ - LLVMValueRef zero = ctx->i32_0; - LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, ""); - result = LLVMBuildSelect(ctx->builder, icond, zero, result, ""); - } - - return result; -} - -LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2) -{ - return LLVMBuildAdd(ctx->builder, - LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); -} - -LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2) -{ - return LLVMBuildFAdd(ctx->builder, - LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); -} - -void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) -{ - if (!wait_flags) - return; - - unsigned lgkmcnt = 63; - unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; - unsigned vscnt = 63; - - if (wait_flags & AC_WAIT_LGKM) - lgkmcnt = 0; - if (wait_flags & AC_WAIT_VLOAD) - vmcnt = 0; - - if (wait_flags & AC_WAIT_VSTORE) { - if (ctx->chip_class >= GFX10) - vscnt = 0; - else - vmcnt = 0; - } - - /* There is no intrinsic for vscnt(0), so use a fence. */ - if ((wait_flags & AC_WAIT_LGKM && - wait_flags & AC_WAIT_VLOAD && - wait_flags & AC_WAIT_VSTORE) || - vscnt == 0) { - LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); - return; - } - - unsigned simm16 = (lgkmcnt << 8) | - (7 << 4) | /* expcnt */ - (vmcnt & 0xf) | - ((vmcnt >> 4) << 14); - - LLVMValueRef args[1] = { - LLVMConstInt(ctx->i32, simm16, false), - }; - ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", - ctx->voidt, args, 1, 0); -} - -LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, - LLVMValueRef src1, LLVMValueRef src2, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.fmed3.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.fmed3.f32"; - type = ctx->f32; - } else { - intr = "llvm.amdgcn.fmed3.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - src1, - src2, - }; - return ac_build_intrinsic(ctx, intr, type, params, 3, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.fract.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.fract.f32"; - type = ctx->f32; - } else { - intr = "llvm.amdgcn.fract.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); - LLVMValueRef zero = LLVMConstInt(type, 0, false); - LLVMValueRef one = LLVMConstInt(type, 1, false); - - LLVMValueRef cmp, val; - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); - val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); - cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); - val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); - return val; -} - -LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMValueRef cmp, val, zero, one; - LLVMTypeRef type; - - if (bitsize == 16) { - type = ctx->f16; - zero = ctx->f16_0; - one = ctx->f16_1; - } else if (bitsize == 32) { - type = ctx->f32; - zero = ctx->f32_0; - one = ctx->f32_1; - } else { - type = ctx->f64; - zero = ctx->f64_0; - one = ctx->f64_1; - } - - cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); - val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); - cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); - val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), ""); - return val; -} - -LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) -{ - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - - switch (bitsize) { - case 64: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - case 8: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - default: - unreachable(!"invalid bitsize"); - break; - } - - return result; -} - -LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - - switch (bitsize) { - case 64: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - case 8: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - default: - unreachable(!"invalid bitsize"); - break; - } - - return result; -} - -#define AC_EXP_TARGET 0 -#define AC_EXP_ENABLED_CHANNELS 1 -#define AC_EXP_OUT0 2 - -enum ac_ir_type { - AC_IR_UNDEF, - AC_IR_CONST, - AC_IR_VALUE, -}; - -struct ac_vs_exp_chan -{ - LLVMValueRef value; - float const_float; - enum ac_ir_type type; -}; - -struct ac_vs_exp_inst { - unsigned offset; - LLVMValueRef inst; - struct ac_vs_exp_chan chan[4]; -}; - -struct ac_vs_exports { - unsigned num; - struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; -}; - -/* Return true if the PARAM export has been eliminated. */ -static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, - uint32_t num_outputs, - struct ac_vs_exp_inst *exp) -{ - unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ - bool is_zero[4] = {}, is_one[4] = {}; - - for (i = 0; i < 4; i++) { - /* It's a constant expression. Undef outputs are eliminated too. */ - if (exp->chan[i].type == AC_IR_UNDEF) { - is_zero[i] = true; - is_one[i] = true; - } else if (exp->chan[i].type == AC_IR_CONST) { - if (exp->chan[i].const_float == 0) - is_zero[i] = true; - else if (exp->chan[i].const_float == 1) - is_one[i] = true; - else - return false; /* other constant */ - } else - return false; - } - - /* Only certain combinations of 0 and 1 can be eliminated. */ - if (is_zero[0] && is_zero[1] && is_zero[2]) - default_val = is_zero[3] ? 0 : 1; - else if (is_one[0] && is_one[1] && is_one[2]) - default_val = is_zero[3] ? 2 : 3; - else - return false; - - /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ - LLVMInstructionEraseFromParent(exp->inst); - - /* Change OFFSET to DEFAULT_VAL. */ - for (i = 0; i < num_outputs; i++) { - if (vs_output_param_offset[i] == exp->offset) { - vs_output_param_offset[i] = - AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val; - break; - } - } - return true; -} - -static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx, - uint8_t *vs_output_param_offset, - uint32_t num_outputs, - struct ac_vs_exports *processed, - struct ac_vs_exp_inst *exp) -{ - unsigned p, copy_back_channels = 0; - - /* See if the output is already in the list of processed outputs. - * The LLVMValueRef comparison relies on SSA. - */ - for (p = 0; p < processed->num; p++) { - bool different = false; - - for (unsigned j = 0; j < 4; j++) { - struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j]; - struct ac_vs_exp_chan *c2 = &exp->chan[j]; - - /* Treat undef as a match. */ - if (c2->type == AC_IR_UNDEF) - continue; - - /* If c1 is undef but c2 isn't, we can copy c2 to c1 - * and consider the instruction duplicated. - */ - if (c1->type == AC_IR_UNDEF) { - copy_back_channels |= 1 << j; - continue; - } - - /* Test whether the channels are not equal. */ - if (c1->type != c2->type || - (c1->type == AC_IR_CONST && - c1->const_float != c2->const_float) || - (c1->type == AC_IR_VALUE && - c1->value != c2->value)) { - different = true; - break; - } - } - if (!different) - break; - - copy_back_channels = 0; - } - if (p == processed->num) - return false; - - /* If a match was found, but the matching export has undef where the new - * one has a normal value, copy the normal value to the undef channel. - */ - struct ac_vs_exp_inst *match = &processed->exp[p]; - - /* Get current enabled channels mask. */ - LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS); - unsigned enabled_channels = LLVMConstIntGetZExtValue(arg); - - while (copy_back_channels) { - unsigned chan = u_bit_scan(©_back_channels); - - assert(match->chan[chan].type == AC_IR_UNDEF); - LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, - exp->chan[chan].value); - match->chan[chan] = exp->chan[chan]; - - /* Update number of enabled channels because the original mask - * is not always 0xf. - */ - enabled_channels |= (1 << chan); - LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS, - LLVMConstInt(ctx->i32, enabled_channels, 0)); - } - - /* The PARAM export is duplicated. Kill it. */ - LLVMInstructionEraseFromParent(exp->inst); - - /* Change OFFSET to the matching export. */ - for (unsigned i = 0; i < num_outputs; i++) { - if (vs_output_param_offset[i] == exp->offset) { - vs_output_param_offset[i] = match->offset; - break; - } - } - return true; -} - -void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, - LLVMValueRef main_fn, - uint8_t *vs_output_param_offset, - uint32_t num_outputs, - uint8_t *num_param_exports) -{ - LLVMBasicBlockRef bb; - bool removed_any = false; - struct ac_vs_exports exports; - - exports.num = 0; - - /* Process all LLVM instructions. */ - bb = LLVMGetFirstBasicBlock(main_fn); - while (bb) { - LLVMValueRef inst = LLVMGetFirstInstruction(bb); - - while (inst) { - LLVMValueRef cur = inst; - inst = LLVMGetNextInstruction(inst); - struct ac_vs_exp_inst exp; - - if (LLVMGetInstructionOpcode(cur) != LLVMCall) - continue; - - LLVMValueRef callee = ac_llvm_get_called_value(cur); - - if (!ac_llvm_is_function(callee)) - continue; - - const char *name = LLVMGetValueName(callee); - unsigned num_args = LLVMCountParams(callee); - - /* Check if this is an export instruction. */ - if ((num_args != 9 && num_args != 8) || - (strcmp(name, "llvm.SI.export") && - strcmp(name, "llvm.amdgcn.exp.f32"))) - continue; - - LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); - unsigned target = LLVMConstIntGetZExtValue(arg); - - if (target < V_008DFC_SQ_EXP_PARAM) - continue; - - target -= V_008DFC_SQ_EXP_PARAM; - - /* Parse the instruction. */ - memset(&exp, 0, sizeof(exp)); - exp.offset = target; - exp.inst = cur; - - for (unsigned i = 0; i < 4; i++) { - LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); - - exp.chan[i].value = v; - - if (LLVMIsUndef(v)) { - exp.chan[i].type = AC_IR_UNDEF; - } else if (LLVMIsAConstantFP(v)) { - LLVMBool loses_info; - exp.chan[i].type = AC_IR_CONST; - exp.chan[i].const_float = - LLVMConstRealGetDouble(v, &loses_info); - } else { - exp.chan[i].type = AC_IR_VALUE; - } - } - - /* Eliminate constant and duplicated PARAM exports. */ - if (ac_eliminate_const_output(vs_output_param_offset, - num_outputs, &exp) || - ac_eliminate_duplicated_output(ctx, - vs_output_param_offset, - num_outputs, &exports, - &exp)) { - removed_any = true; - } else { - exports.exp[exports.num++] = exp; - } - } - bb = LLVMGetNextBasicBlock(bb); - } - - /* Remove holes in export memory due to removed PARAM exports. - * This is done by renumbering all PARAM exports. - */ - if (removed_any) { - uint8_t old_offset[VARYING_SLOT_MAX]; - unsigned out, i; - - /* Make a copy of the offsets. We need the old version while - * we are modifying some of them. */ - memcpy(old_offset, vs_output_param_offset, - sizeof(old_offset)); - - for (i = 0; i < exports.num; i++) { - unsigned offset = exports.exp[i].offset; - - /* Update vs_output_param_offset. Multiple outputs can - * have the same offset. - */ - for (out = 0; out < num_outputs; out++) { - if (old_offset[out] == offset) - vs_output_param_offset[out] = i; - } - - /* Change the PARAM offset in the instruction. */ - LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, - LLVMConstInt(ctx->i32, - V_008DFC_SQ_EXP_PARAM + i, 0)); - } - *num_param_exports = exports.num; - } -} - -void ac_init_exec_full_mask(struct ac_llvm_context *ctx) -{ - LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); - ac_build_intrinsic(ctx, - "llvm.amdgcn.init.exec", ctx->voidt, - &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); -} - -void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) -{ - unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768; - ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, - LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), - "lds"); -} - -LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr) -{ - return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), ""); -} - -void ac_lds_store(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr, - LLVMValueRef value) -{ - value = ac_to_integer(ctx, value); - ac_build_indexed_store(ctx, ctx->lds, - dw_addr, value); -} - -LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, - LLVMTypeRef dst_type, - LLVMValueRef src0) -{ - unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef zero; - - switch (src0_bitsize) { - case 64: - intrin_name = "llvm.cttz.i64"; - type = ctx->i64; - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.cttz.i32"; - type = ctx->i32; - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.cttz.i16"; - type = ctx->i16; - zero = ctx->i16_0; - break; - case 8: - intrin_name = "llvm.cttz.i8"; - type = ctx->i8; - zero = ctx->i8_0; - break; - default: - unreachable(!"invalid bitsize"); - } - - LLVMValueRef params[2] = { - src0, - - /* The value of 1 means that ffs(x=0) = undef, so LLVM won't - * add special code to check for x=0. The reason is that - * the LLVM behavior for x=0 is different from what we - * need here. However, LLVM also assumes that ffs(x) is - * in [0, 31], but GLSL expects that ffs(0) = -1, so - * a conditional assignment to handle 0 is still required. - * - * The hardware already implements the correct behavior. - */ - ctx->i1true, - }; - - LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, - params, 2, - AC_FUNC_ATTR_READNONE); - - if (src0_bitsize == 64) { - lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); - } else if (src0_bitsize < 32) { - lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); - } - - /* TODO: We need an intrinsic to skip this conditional. */ - /* Check for zero: */ - return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, - LLVMIntEQ, src0, - zero, ""), - LLVMConstInt(ctx->i32, -1, 0), lsb, ""); -} - -LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) -{ - return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); -} - -LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) -{ - return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); -} - -static struct ac_llvm_flow * -get_current_flow(struct ac_llvm_context *ctx) -{ - if (ctx->flow->depth > 0) - return &ctx->flow->stack[ctx->flow->depth - 1]; - return NULL; -} - -static struct ac_llvm_flow * -get_innermost_loop(struct ac_llvm_context *ctx) -{ - for (unsigned i = ctx->flow->depth; i > 0; --i) { - if (ctx->flow->stack[i - 1].loop_entry_block) - return &ctx->flow->stack[i - 1]; - } - return NULL; -} - -static struct ac_llvm_flow * -push_flow(struct ac_llvm_context *ctx) -{ - struct ac_llvm_flow *flow; - - if (ctx->flow->depth >= ctx->flow->depth_max) { - unsigned new_max = MAX2(ctx->flow->depth << 1, - AC_LLVM_INITIAL_CF_DEPTH); - - ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); - ctx->flow->depth_max = new_max; - } - - flow = &ctx->flow->stack[ctx->flow->depth]; - ctx->flow->depth++; - - flow->next_block = NULL; - flow->loop_entry_block = NULL; - return flow; -} - -static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, - int label_id) -{ - char buf[32]; - snprintf(buf, sizeof(buf), "%s%d", base, label_id); - LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); -} - -/* Append a basic block at the level of the parent flow. - */ -static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, - const char *name) -{ - assert(ctx->flow->depth >= 1); - - if (ctx->flow->depth >= 2) { - struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; - - return LLVMInsertBasicBlockInContext(ctx->context, - flow->next_block, name); - } - - LLVMValueRef main_fn = - LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); - return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); -} - -/* Emit a branch to the given default target for the current block if - * applicable -- that is, if the current block does not already contain a - * branch from a break or continue. - */ -static void emit_default_branch(LLVMBuilderRef builder, - LLVMBasicBlockRef target) -{ - if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) - LLVMBuildBr(builder, target); -} - -void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) -{ - struct ac_llvm_flow *flow = push_flow(ctx); - flow->loop_entry_block = append_basic_block(ctx, "LOOP"); - flow->next_block = append_basic_block(ctx, "ENDLOOP"); - set_basicblock_name(flow->loop_entry_block, "loop", label_id); - LLVMBuildBr(ctx->builder, flow->loop_entry_block); - LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); -} - -void ac_build_break(struct ac_llvm_context *ctx) -{ - struct ac_llvm_flow *flow = get_innermost_loop(ctx); - LLVMBuildBr(ctx->builder, flow->next_block); -} - -void ac_build_continue(struct ac_llvm_context *ctx) -{ - struct ac_llvm_flow *flow = get_innermost_loop(ctx); - LLVMBuildBr(ctx->builder, flow->loop_entry_block); -} - -void ac_build_else(struct ac_llvm_context *ctx, int label_id) -{ - struct ac_llvm_flow *current_branch = get_current_flow(ctx); - LLVMBasicBlockRef endif_block; - - assert(!current_branch->loop_entry_block); - - endif_block = append_basic_block(ctx, "ENDIF"); - emit_default_branch(ctx->builder, endif_block); - - LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); - set_basicblock_name(current_branch->next_block, "else", label_id); - - current_branch->next_block = endif_block; -} - -void ac_build_endif(struct ac_llvm_context *ctx, int label_id) -{ - struct ac_llvm_flow *current_branch = get_current_flow(ctx); - - assert(!current_branch->loop_entry_block); - - emit_default_branch(ctx->builder, current_branch->next_block); - LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); - set_basicblock_name(current_branch->next_block, "endif", label_id); - - ctx->flow->depth--; -} - -void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) -{ - struct ac_llvm_flow *current_loop = get_current_flow(ctx); - - assert(current_loop->loop_entry_block); - - emit_default_branch(ctx->builder, current_loop->loop_entry_block); - - LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); - set_basicblock_name(current_loop->next_block, "endloop", label_id); - ctx->flow->depth--; -} - -void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) -{ - struct ac_llvm_flow *flow = push_flow(ctx); - LLVMBasicBlockRef if_block; - - if_block = append_basic_block(ctx, "IF"); - flow->next_block = append_basic_block(ctx, "ELSE"); - set_basicblock_name(if_block, "if", label_id); - LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); - LLVMPositionBuilderAtEnd(ctx->builder, if_block); -} - -void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, - int label_id) -{ - LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, - value, ctx->f32_0, ""); - ac_build_ifcc(ctx, cond, label_id); -} - -void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, - int label_id) -{ - LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, - ac_to_integer(ctx, value), - ctx->i32_0, ""); - ac_build_ifcc(ctx, cond, label_id); -} - -LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name) -{ - LLVMBuilderRef builder = ac->builder; - LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(current_block); - LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); - LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); - LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); - LLVMValueRef res; - - if (first_instr) { - LLVMPositionBuilderBefore(first_builder, first_instr); - } else { - LLVMPositionBuilderAtEnd(first_builder, first_block); - } - - res = LLVMBuildAlloca(first_builder, type, name); - LLVMDisposeBuilder(first_builder); - return res; -} - -LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, - LLVMTypeRef type, const char *name) -{ - LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); - LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); - return ptr; -} - -LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMTypeRef type) -{ - int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - return LLVMBuildBitCast(ctx->builder, ptr, - LLVMPointerType(type, addr_space), ""); -} - -LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, - unsigned count) -{ - unsigned num_components = ac_get_llvm_num_components(value); - if (count == num_components) - return value; - - LLVMValueRef masks[MAX2(count, 2)]; - masks[0] = ctx->i32_0; - masks[1] = ctx->i32_1; - for (unsigned i = 2; i < count; i++) - masks[i] = LLVMConstInt(ctx->i32, i, false); - - if (count == 1) - return LLVMBuildExtractElement(ctx->builder, value, masks[0], - ""); - - LLVMValueRef swizzle = LLVMConstVector(masks, count); - return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); -} - -LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, - unsigned rshift, unsigned bitwidth) -{ - LLVMValueRef value = param; - if (rshift) - value = LLVMBuildLShr(ctx->builder, value, - LLVMConstInt(ctx->i32, rshift, false), ""); - - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->builder, value, - LLVMConstInt(ctx->i32, mask, false), ""); - } - return value; -} - -/* Adjust the sample index according to FMASK. - * - * For uncompressed MSAA surfaces, FMASK should return 0x76543210, - * which is the identity mapping. Each nibble says which physical sample - * should be fetched to get that sample. - * - * For example, 0x11111100 means there are only 2 samples stored and - * the second sample covers 3/4 of the pixel. When reading samples 0 - * and 1, return physical sample 0 (determined by the first two 0s - * in FMASK), otherwise return physical sample 1. - * - * The sample index should be adjusted as follows: - * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; - */ -void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, - LLVMValueRef *addr, bool is_array_tex) -{ - struct ac_image_args fmask_load = {}; - fmask_load.opcode = ac_image_load; - fmask_load.resource = fmask; - fmask_load.dmask = 0xf; - fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; - fmask_load.attributes = AC_FUNC_ATTR_READNONE; - - fmask_load.coords[0] = addr[0]; - fmask_load.coords[1] = addr[1]; - if (is_array_tex) - fmask_load.coords[2] = addr[2]; - - LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); - fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, - ac->i32_0, ""); - - /* Apply the formula. */ - unsigned sample_chan = is_array_tex ? 3 : 2; - LLVMValueRef final_sample; - final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], - LLVMConstInt(ac->i32, 4, 0), ""); - final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); - /* Mask the sample index by 0x7, because 0x8 means an unknown value - * with EQAA, so those will map to 0. */ - final_sample = LLVMBuildAnd(ac->builder, final_sample, - LLVMConstInt(ac->i32, 0x7, 0), ""); - - /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK - * resource descriptor is 0 (invalid). - */ - LLVMValueRef tmp; - tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); - tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); - tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); - - /* Replace the MSAA sample index. */ - addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, - addr[sample_chan], ""); -} - -static LLVMValueRef -_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) -{ - ac_build_optimization_barrier(ctx, &src); - return ac_build_intrinsic(ctx, - lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", - LLVMTypeOf(src), (LLVMValueRef []) { - src, lane }, - lane == NULL ? 1 : 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); -} - -/** - * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. - * @param ctx - * @param src - * @param lane - id of the lane or NULL for the first active lane - * @return value of the lane - */ -LLVMValueRef -ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - - if (bits == 32) { - ret = _ac_build_readlane(ctx, src, lane); - } else { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, 0), ""); - LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane); - ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, - LLVMConstInt(ctx->i32, i, 0), ""); - } - } - if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) - return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -LLVMValueRef -ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) -{ - if (HAVE_LLVM >= 0x0800) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, - (LLVMValueRef []) {value, lane, src}, 3, - AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); - } - - LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane, - ac_get_thread_id(ctx), ""); - return LLVMBuildSelect(ctx->builder, pred, value, src, ""); -} - -LLVMValueRef -ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) -{ - if (ctx->wave_size == 32) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, - (LLVMValueRef []) { mask, ctx->i32_0 }, - 2, AC_FUNC_ATTR_READNONE); - } - LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, - LLVMVectorType(ctx->i32, 2), - ""); - LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, - ctx->i32_0, ""); - LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, - ctx->i32_1, ""); - LLVMValueRef val = - ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, - (LLVMValueRef []) { mask_lo, ctx->i32_0 }, - 2, AC_FUNC_ATTR_READNONE); - val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, - (LLVMValueRef []) { mask_hi, val }, - 2, AC_FUNC_ATTR_READNONE); - return val; -} - -enum dpp_ctrl { - _dpp_quad_perm = 0x000, - _dpp_row_sl = 0x100, - _dpp_row_sr = 0x110, - _dpp_row_rr = 0x120, - dpp_wf_sl1 = 0x130, - dpp_wf_rl1 = 0x134, - dpp_wf_sr1 = 0x138, - dpp_wf_rr1 = 0x13C, - dpp_row_mirror = 0x140, - dpp_row_half_mirror = 0x141, - dpp_row_bcast15 = 0x142, - dpp_row_bcast31 = 0x143 -}; - -static inline enum dpp_ctrl -dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) -{ - assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); - return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); -} - -static inline enum dpp_ctrl -dpp_row_sl(unsigned amount) -{ - assert(amount > 0 && amount < 16); - return _dpp_row_sl | amount; -} - -static inline enum dpp_ctrl -dpp_row_sr(unsigned amount) -{ - assert(amount > 0 && amount < 16); - return _dpp_row_sr | amount; -} - -static LLVMValueRef -_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, - enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, - bool bound_ctrl) -{ - return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", - LLVMTypeOf(old), - (LLVMValueRef[]) { - old, src, - LLVMConstInt(ctx->i32, dpp_ctrl, 0), - LLVMConstInt(ctx->i32, row_mask, 0), - LLVMConstInt(ctx->i32, bank_mask, 0), - LLVMConstInt(ctx->i1, bound_ctrl, 0) }, - 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); -} - -static LLVMValueRef -ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, - enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, - bool bound_ctrl) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - old = ac_to_integer(ctx, old); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, - bank_mask, bound_ctrl); - } else { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - LLVMValueRef old_vector = - LLVMBuildBitCast(ctx->builder, old, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - old = LLVMBuildExtractElement(ctx->builder, old_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, - dpp_ctrl, - row_mask, - bank_mask, - bound_ctrl); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, - LLVMConstInt(ctx->i32, i, - 0), ""); - } - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, - bool exchange_rows, bool bound_ctrl) -{ - LLVMValueRef args[6] = { - src, - src, - LLVMConstInt(ctx->i32, sel, false), - LLVMConstInt(ctx->i32, sel >> 32, false), - ctx->i1true, /* fi */ - bound_ctrl ? ctx->i1true : ctx->i1false, - }; - return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" - : "llvm.amdgcn.permlane16", - ctx->i32, args, 6, - AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); -} - -static LLVMValueRef -ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, - bool exchange_rows, bool bound_ctrl) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, - bound_ctrl); - } else { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - LLVMValueRef ret_comp = - _ac_build_permlane16(ctx, src, sel, - exchange_rows, - bound_ctrl); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, - LLVMConstInt(ctx->i32, i, - 0), ""); - } - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static inline unsigned -ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) -{ - assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); - return and_mask | (or_mask << 5) | (xor_mask << 10); -} - -static LLVMValueRef -_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) -{ - return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", - LLVMTypeOf(src), (LLVMValueRef []) { - src, LLVMConstInt(ctx->i32, mask, 0) }, - 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); -} - -LLVMValueRef -ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_ds_swizzle(ctx, src, mask); - } else { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, - mask); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, - LLVMConstInt(ctx->i32, i, - 0), ""); - } - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) -{ - char name[32], type[8]; - ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); - return ac_build_intrinsic(ctx, name, LLVMTypeOf(src), - (LLVMValueRef []) { src }, 1, - AC_FUNC_ATTR_READNONE); -} - -static LLVMValueRef -ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, - LLVMValueRef inactive) -{ - char name[33], type[8]; - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - inactive = ac_to_integer(ctx, inactive); - ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); - LLVMValueRef ret = - ac_build_intrinsic(ctx, name, - LLVMTypeOf(src), (LLVMValueRef []) { - src, inactive }, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) -{ - if (type_size == 4) { - switch (op) { - case nir_op_iadd: return ctx->i32_0; - case nir_op_fadd: return ctx->f32_0; - case nir_op_imul: return ctx->i32_1; - case nir_op_fmul: return ctx->f32_1; - case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0); - case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY); - case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0); - case nir_op_umax: return ctx->i32_0; - case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY); - case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0); - case nir_op_ior: return ctx->i32_0; - case nir_op_ixor: return ctx->i32_0; - default: - unreachable("bad reduction intrinsic"); - } - } else { /* type_size == 64bit */ - switch (op) { - case nir_op_iadd: return ctx->i64_0; - case nir_op_fadd: return ctx->f64_0; - case nir_op_imul: return ctx->i64_1; - case nir_op_fmul: return ctx->f64_1; - case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0); - case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY); - case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0); - case nir_op_umax: return ctx->i64_0; - case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY); - case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0); - case nir_op_ior: return ctx->i64_0; - case nir_op_ixor: return ctx->i64_0; - default: - unreachable("bad reduction intrinsic"); - } - } -} - -static LLVMValueRef -ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) -{ - bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; - switch (op) { - case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); - case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); - case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, ""); - case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); - case nir_op_imin: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_umin: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_fmin: return ac_build_intrinsic(ctx, - _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32", - _64bit ? ctx->f64 : ctx->f32, - (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); - case nir_op_imax: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_umax: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_fmax: return ac_build_intrinsic(ctx, - _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32", - _64bit ? ctx->f64 : ctx->f32, - (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); - case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); - case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); - case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); - default: - unreachable("bad reduction intrinsic"); - } -} - -/** - * \param maxprefix specifies that the result only needs to be correct for a - * prefix of this many threads - * - * TODO: add inclusive and excluse scan functions for GFX6. - */ -static LLVMValueRef -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, - unsigned maxprefix, bool inclusive) -{ - LLVMValueRef result, tmp; - - if (inclusive) { - result = src; - } else if (ctx->chip_class >= GFX10) { - /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ - LLVMValueRef active, tmp1, tmp2; - LLVMValueRef tid = ac_get_thread_id(ctx); - - tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); - - tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); - - if (maxprefix > 32) { - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - - tmp2 = LLVMBuildSelect(ctx->builder, active, - ac_build_readlane(ctx, src, - LLVMConstInt(ctx->i32, 31, false)), - tmp2, ""); - - active = LLVMBuildOr(ctx->builder, active, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - LLVMBuildAnd(ctx->builder, tid, - LLVMConstInt(ctx->i32, 0x1f, false), ""), - LLVMConstInt(ctx->i32, 0x10, false), ""), ""); - src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - } else if (maxprefix > 16) { - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, - LLVMConstInt(ctx->i32, 16, false), ""); - - src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - } - - result = src; - } else if (ctx->chip_class >= GFX8) { - src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); - result = src; - } else { - if (!inclusive) - src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); - result = src; - } - if (maxprefix <= 1) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 2) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 3) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 4) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 8) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 16) - return result; - - if (ctx->chip_class >= GFX10) { - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef active; - - tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); - - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, - LLVMConstInt(ctx->i32, 16, false), ""), - ctx->i32_0, ""); - - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - - result = ac_build_alu_op(ctx, result, tmp, op); - - if (maxprefix <= 32) - return result; - - tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); - - active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - - result = ac_build_alu_op(ctx, result, tmp, op); - return result; - } - - tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 32) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - return result; -} - -LLVMValueRef -ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) -{ - LLVMValueRef result; - - if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { - LLVMBuilderRef builder = ctx->builder; - src = LLVMBuildZExt(builder, src, ctx->i32, ""); - result = ac_build_ballot(ctx, src); - result = ac_build_mbcnt(ctx, result); - result = LLVMBuildAdd(builder, result, src, ""); - return result; - } - - ac_build_optimization_barrier(ctx, &src); - - LLVMValueRef identity = - get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); - - return ac_build_wwm(ctx, result); -} - -LLVMValueRef -ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) -{ - LLVMValueRef result; - - if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { - LLVMBuilderRef builder = ctx->builder; - src = LLVMBuildZExt(builder, src, ctx->i32, ""); - result = ac_build_ballot(ctx, src); - result = ac_build_mbcnt(ctx, result); - return result; - } - - ac_build_optimization_barrier(ctx, &src); - - LLVMValueRef identity = - get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); - - return ac_build_wwm(ctx, result); -} - -LLVMValueRef -ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) -{ - if (cluster_size == 1) return src; - ac_build_optimization_barrier(ctx, &src); - LLVMValueRef result, swap; - LLVMValueRef identity = get_reduction_identity(ctx, op, - ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, - ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 2) return ac_build_wwm(ctx, result); - - swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 4) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) - swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 8) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) - swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 16) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX10) - swap = ac_build_permlane16(ctx, result, 0, true, false); - else if (ctx->chip_class >= GFX8 && cluster_size != 32) - swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 32) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) { - if (ctx->chip_class >= GFX10) - swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); - else - swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); - result = ac_build_alu_op(ctx, result, swap, op); - result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); - return ac_build_wwm(ctx, result); - } else { - swap = ac_build_readlane(ctx, result, ctx->i32_0); - result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); - result = ac_build_alu_op(ctx, result, swap, op); - return ac_build_wwm(ctx, result); - } -} - -/** - * "Top half" of a scan that reduces per-wave values across an entire - * workgroup. - * - * The source value must be present in the highest lane of the wave, and the - * highest lane must be live. - */ -void -ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - if (ws->maxwaves <= 1) - return; - - const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef tmp; - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); - ac_build_ifcc(ctx, tmp, 1000); - LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); - ac_build_endif(ctx, 1000); -} - -/** - * "Bottom half" of a scan that reduces per-wave values across an entire - * workgroup. - * - * The caller must place a barrier between the top and bottom halves. - */ -void -ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - const LLVMTypeRef type = LLVMTypeOf(ws->src); - const LLVMValueRef identity = - get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); - - if (ws->maxwaves <= 1) { - ws->result_reduce = ws->src; - ws->result_inclusive = ws->src; - ws->result_exclusive = identity; - return; - } - assert(ws->maxwaves <= 32); - - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMBasicBlockRef bbs[2]; - LLVMValueRef phivalues_scan[2]; - LLVMValueRef tmp, tmp2; - - bbs[0] = LLVMGetInsertBlock(builder); - phivalues_scan[0] = LLVMGetUndef(type); - - if (ws->enable_reduce) - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); - else if (ws->enable_inclusive) - tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); - else - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); - ac_build_ifcc(ctx, tmp, 1001); - { - tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); - - ac_build_optimization_barrier(ctx, &tmp); - - bbs[1] = LLVMGetInsertBlock(builder); - phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); - } - ac_build_endif(ctx, 1001); - - const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); - - if (ws->enable_reduce) { - tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); - ws->result_reduce = ac_build_readlane(ctx, scan, tmp); - } - if (ws->enable_inclusive) - ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); - if (ws->enable_exclusive) { - tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); - tmp = ac_build_readlane(ctx, scan, tmp); - tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); - ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); - } -} - -/** - * Inclusive scan of a per-wave value across an entire workgroup. - * - * This implies an s_barrier instruction. - * - * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads - * of the workgroup are live. (This requirement cannot easily be relaxed in a - * useful manner because of the barrier in the algorithm.) - */ -void -ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - ac_build_wg_wavescan_top(ctx, ws); - ac_build_s_barrier(ctx); - ac_build_wg_wavescan_bottom(ctx, ws); -} - -/** - * "Top half" of a scan that reduces per-thread values across an entire - * workgroup. - * - * All lanes must be active when this code runs. - */ -void -ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - if (ws->enable_exclusive) { - ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); - if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) - ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); - ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); - } else { - ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); - } - - bool enable_inclusive = ws->enable_inclusive; - bool enable_exclusive = ws->enable_exclusive; - ws->enable_inclusive = false; - ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; - ac_build_wg_wavescan_top(ctx, ws); - ws->enable_inclusive = enable_inclusive; - ws->enable_exclusive = enable_exclusive; -} - -/** - * "Bottom half" of a scan that reduces per-thread values across an entire - * workgroup. - * - * The caller must place a barrier between the top and bottom halves. - */ -void -ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - bool enable_inclusive = ws->enable_inclusive; - bool enable_exclusive = ws->enable_exclusive; - ws->enable_inclusive = false; - ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; - ac_build_wg_wavescan_bottom(ctx, ws); - ws->enable_inclusive = enable_inclusive; - ws->enable_exclusive = enable_exclusive; - - /* ws->result_reduce is already the correct value */ - if (ws->enable_inclusive) - ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); - if (ws->enable_exclusive) - ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); -} - -/** - * A scan that reduces per-thread values across an entire workgroup. - * - * The caller must ensure that all lanes are active when this code runs - * (WWM is insufficient!), because there is an implied barrier. - */ -void -ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - ac_build_wg_scan_top(ctx, ws); - ac_build_s_barrier(ctx); - ac_build_wg_scan_bottom(ctx, ws); -} - -LLVMValueRef -ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, - unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) -{ - unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); - if (ctx->chip_class >= GFX8) { - return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); - } else { - return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); - } -} - -LLVMValueRef -ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) -{ - index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); - return ac_build_intrinsic(ctx, - "llvm.amdgcn.ds.bpermute", ctx->i32, - (LLVMValueRef []) {index, src}, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); -} - -LLVMValueRef -ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.frexp.exp.i16.f16"; - type = ctx->i16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.frexp.exp.i32.f32"; - type = ctx->i32; - } else { - intr = "llvm.amdgcn.frexp.exp.i32.f64"; - type = ctx->i32; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); -} -LLVMValueRef -ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.frexp.mant.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.frexp.mant.f32"; - type = ctx->f32; - } else { - intr = "llvm.amdgcn.frexp.mant.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); -} - -/* - * this takes an I,J coordinate pair, - * and works out the X and Y derivatives. - * it returns DDX(I), DDX(J), DDY(I), DDY(J). - */ -LLVMValueRef -ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) -{ - LLVMValueRef result[4], a; - unsigned i; - - for (i = 0; i < 2; i++) { - a = LLVMBuildExtractElement(ctx->builder, interp_ij, - LLVMConstInt(ctx->i32, i, false), ""); - result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); - result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); - } - return ac_build_gather_values(ctx, result, 4); -} - -LLVMValueRef -ac_build_load_helper_invocation(struct ac_llvm_context *ctx) -{ - LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", - ctx->i1, NULL, 0, - AC_FUNC_ATTR_READNONE); - result = LLVMBuildNot(ctx->builder, result, ""); - return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); -} - -LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, - LLVMValueRef *args, unsigned num_args) -{ - LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); - LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); - return ret; -} diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_build.h mesa-20.0.8/src/amd/common/ac_llvm_build.h --- mesa-19.2.8/src/amd/common/ac_llvm_build.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_build.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,757 +0,0 @@ -/* - * Copyright 2016 Bas Nieuwenhuizen - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ -#ifndef AC_LLVM_BUILD_H -#define AC_LLVM_BUILD_H - -#include -#include -#include "compiler/nir/nir.h" -#include "amd_family.h" - -#ifdef __cplusplus -extern "C" { -#endif - -enum { - AC_ADDR_SPACE_FLAT = 0, /* Slower than global. */ - AC_ADDR_SPACE_GLOBAL = 1, - AC_ADDR_SPACE_GDS = 2, - AC_ADDR_SPACE_LDS = 3, - AC_ADDR_SPACE_CONST = 4, /* Global allowing SMEM. */ - AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */ -}; - -#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */ -#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */ -#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */ - -struct ac_llvm_flow; -struct ac_llvm_compiler; -enum ac_float_mode; - -struct ac_llvm_flow_state { - struct ac_llvm_flow *stack; - unsigned depth_max; - unsigned depth; -}; - -struct ac_llvm_context { - LLVMContextRef context; - LLVMModuleRef module; - LLVMBuilderRef builder; - - LLVMTypeRef voidt; - LLVMTypeRef i1; - LLVMTypeRef i8; - LLVMTypeRef i16; - LLVMTypeRef i32; - LLVMTypeRef i64; - LLVMTypeRef intptr; - LLVMTypeRef f16; - LLVMTypeRef f32; - LLVMTypeRef f64; - LLVMTypeRef v2i16; - LLVMTypeRef v2i32; - LLVMTypeRef v3i32; - LLVMTypeRef v4i32; - LLVMTypeRef v2f32; - LLVMTypeRef v3f32; - LLVMTypeRef v4f32; - LLVMTypeRef v8i32; - LLVMTypeRef iN_wavemask; - LLVMTypeRef iN_ballotmask; - - LLVMValueRef i8_0; - LLVMValueRef i8_1; - LLVMValueRef i16_0; - LLVMValueRef i16_1; - LLVMValueRef i32_0; - LLVMValueRef i32_1; - LLVMValueRef i64_0; - LLVMValueRef i64_1; - LLVMValueRef f16_0; - LLVMValueRef f16_1; - LLVMValueRef f32_0; - LLVMValueRef f32_1; - LLVMValueRef f64_0; - LLVMValueRef f64_1; - LLVMValueRef i1true; - LLVMValueRef i1false; - - /* Since ac_nir_translate makes a local copy of ac_llvm_context, there - * are two ac_llvm_contexts. Declare a pointer here, so that the control - * flow stack is shared by both ac_llvm_contexts. - */ - struct ac_llvm_flow_state *flow; - - unsigned range_md_kind; - unsigned invariant_load_md_kind; - unsigned uniform_md_kind; - unsigned fpmath_md_kind; - LLVMValueRef fpmath_md_2p5_ulp; - LLVMValueRef empty_md; - - enum chip_class chip_class; - enum radeon_family family; - - unsigned wave_size; - unsigned ballot_mask_bits; - - LLVMValueRef lds; -}; - -void -ac_llvm_context_init(struct ac_llvm_context *ctx, - struct ac_llvm_compiler *compiler, - enum chip_class chip_class, enum radeon_family family, - enum ac_float_mode float_mode, unsigned wave_size, - unsigned ballot_mask_bits); - -void -ac_llvm_context_dispose(struct ac_llvm_context *ctx); - -int -ac_get_llvm_num_components(LLVMValueRef value); - -int -ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type); - -LLVMValueRef -ac_llvm_extract_elem(struct ac_llvm_context *ac, - LLVMValueRef value, - int index); - -unsigned ac_get_type_size(LLVMTypeRef type); - -LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t); -LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v); -LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v); -LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t); -LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v); - -LLVMValueRef -ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, - LLVMTypeRef return_type, LLVMValueRef *params, - unsigned param_count, unsigned attrib_mask); - -void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize); - -LLVMValueRef -ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, - unsigned count_incoming, LLVMValueRef *values, - LLVMBasicBlockRef *blocks); - -void ac_build_s_barrier(struct ac_llvm_context *ctx); -void ac_build_optimization_barrier(struct ac_llvm_context *ctx, - LLVMValueRef *pvgpr); - -LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx); - -LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value); -LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, - LLVMValueRef value); - -LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value); - -LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value); - -LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value); - -LLVMValueRef -ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, - unsigned value_count, unsigned component); - -LLVMValueRef -ac_build_gather_values_extended(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count, - unsigned value_stride, - bool load, - bool always_vector); -LLVMValueRef -ac_build_gather_values(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count); - -LLVMValueRef -ac_extract_components(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned start, - unsigned channels); - -LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned num_channels); -LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value); - -LLVMValueRef -ac_build_fdiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef den); - -LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment); -LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment); -LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef post_shift); - -void -ac_prepare_cube_coords(struct ac_llvm_context *ctx, - bool is_deriv, bool is_array, bool is_lod, - LLVMValueRef *coords_arg, - LLVMValueRef *derivs_arg); - - -LLVMValueRef -ac_build_fs_interp(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j); - -LLVMValueRef -ac_build_fs_interp_f16(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j); - -LLVMValueRef -ac_build_fs_interp_mov(struct ac_llvm_context *ctx, - LLVMValueRef parameter, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params); - -LLVMValueRef -ac_build_gep_ptr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index); - -LLVMValueRef -ac_build_gep0(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index); -LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef index); - -void -ac_build_indexed_store(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - LLVMValueRef value); - -LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index); -LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); -LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); -LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); - -void -ac_build_buffer_store_dword(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool swizzle_enable_hint); - -void -ac_build_buffer_store_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy); - -LLVMValueRef -ac_build_buffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - int num_channels, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool can_speculate, - bool allow_smem); - -LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate); - -/* load_format that handles the stride & element count better if idxen is - * disabled by LLVM. */ -LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate); - -LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy); - -LLVMValueRef -ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy); - -LLVMValueRef -ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate); - -LLVMValueRef -ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate); - -/* For ac_build_fetch_format. - * - * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi). - */ -enum { - AC_FETCH_FORMAT_FLOAT = 0, - AC_FETCH_FORMAT_FIXED, - AC_FETCH_FORMAT_UNORM, - AC_FETCH_FORMAT_SNORM, - AC_FETCH_FORMAT_USCALED, - AC_FETCH_FORMAT_SSCALED, - AC_FETCH_FORMAT_UINT, - AC_FETCH_FORMAT_SINT, -}; - -LLVMValueRef -ac_build_opencoded_load_format(struct ac_llvm_context *ctx, - unsigned log_size, - unsigned num_channels, - unsigned format, - bool reverse, - bool known_aligned, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy, - bool can_speculate); - -void -ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy); - -void -ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy); - -void -ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy); - -void -ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy); - -LLVMValueRef -ac_get_thread_id(struct ac_llvm_context *ctx); - -#define AC_TID_MASK_TOP_LEFT 0xfffffffc -#define AC_TID_MASK_TOP 0xfffffffd -#define AC_TID_MASK_LEFT 0xfffffffe - -LLVMValueRef -ac_build_ddxy(struct ac_llvm_context *ctx, - uint32_t mask, - int idx, - LLVMValueRef val); - -#define AC_SENDMSG_GS 2 -#define AC_SENDMSG_GS_DONE 3 -#define AC_SENDMSG_GS_ALLOC_REQ 9 - -#define AC_SENDMSG_GS_OP_NOP (0 << 4) -#define AC_SENDMSG_GS_OP_CUT (1 << 4) -#define AC_SENDMSG_GS_OP_EMIT (2 << 4) -#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) - -void ac_build_sendmsg(struct ac_llvm_context *ctx, - uint32_t msg, - LLVMValueRef wave_id); - -LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type); - -LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type); -LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); -LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); -LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value); - -struct ac_export_args { - LLVMValueRef out[4]; - unsigned target; - unsigned enabled_channels; - bool compr; - bool done; - bool valid_mask; -}; - -void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a); - -void ac_build_export_null(struct ac_llvm_context *ctx); - -enum ac_image_opcode { - ac_image_sample, - ac_image_gather4, - ac_image_load, - ac_image_load_mip, - ac_image_store, - ac_image_store_mip, - ac_image_get_lod, - ac_image_get_resinfo, - ac_image_atomic, - ac_image_atomic_cmpswap, -}; - -enum ac_atomic_op { - ac_atomic_swap, - ac_atomic_add, - ac_atomic_sub, - ac_atomic_smin, - ac_atomic_umin, - ac_atomic_smax, - ac_atomic_umax, - ac_atomic_and, - ac_atomic_or, - ac_atomic_xor, - ac_atomic_inc_wrap, - ac_atomic_dec_wrap, -}; - -enum ac_image_dim { - ac_image_1d, - ac_image_2d, - ac_image_3d, - ac_image_cube, // includes cube arrays - ac_image_1darray, - ac_image_2darray, - ac_image_2dmsaa, - ac_image_2darraymsaa, -}; - -/* These cache policy bits match the definitions used by the LLVM intrinsics. */ -enum ac_image_cache_policy { - ac_glc = 1 << 0, /* per-CU cache control */ - ac_slc = 1 << 1, /* global L2 cache control */ - ac_dlc = 1 << 2, /* per-shader-array cache control */ -}; - -struct ac_image_args { - enum ac_image_opcode opcode : 4; - enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */ - enum ac_image_dim dim : 3; - unsigned dmask : 4; - unsigned cache_policy : 3; - bool unorm : 1; - bool level_zero : 1; - unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ - - LLVMValueRef resource; - LLVMValueRef sampler; - LLVMValueRef data[2]; /* data[0] is source data (vector); data[1] is cmp for cmpswap */ - LLVMValueRef offset; - LLVMValueRef bias; - LLVMValueRef compare; - LLVMValueRef derivs[6]; - LLVMValueRef coords[4]; - LLVMValueRef lod; // also used by ac_image_get_resinfo -}; - -LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, - struct ac_image_args *a); -LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi); -LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi); -LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1); -void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1); -LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, - LLVMValueRef offset, LLVMValueRef width, - bool is_signed); -LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2); -LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2); - -void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags); - -LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); - -LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, - LLVMValueRef src1, LLVMValueRef src2, - unsigned bitsize); - -LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); - -LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); - -LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0); - -LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, - LLVMValueRef src0); - -void ac_optimize_vs_outputs(struct ac_llvm_context *ac, - LLVMValueRef main_fn, - uint8_t *vs_output_param_offset, - uint32_t num_outputs, - uint8_t *num_param_exports); -void ac_init_exec_full_mask(struct ac_llvm_context *ctx); - -void ac_declare_lds_as_pointer(struct ac_llvm_context *ac); -LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr); -void ac_lds_store(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr, LLVMValueRef value); - -LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, - LLVMTypeRef dst_type, - LLVMValueRef src0); - -LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type); -LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type); - -void ac_build_bgnloop(struct ac_llvm_context *ctx, int lable_id); -void ac_build_break(struct ac_llvm_context *ctx); -void ac_build_continue(struct ac_llvm_context *ctx); -void ac_build_else(struct ac_llvm_context *ctx, int lable_id); -void ac_build_endif(struct ac_llvm_context *ctx, int lable_id); -void ac_build_endloop(struct ac_llvm_context *ctx, int lable_id); -void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id); -void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, - int lable_id); -void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, - int lable_id); - -LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name); -LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name); - -LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMTypeRef type); - -LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, - unsigned count); - -LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, - unsigned rshift, unsigned bitwidth); - -void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, - LLVMValueRef *addr, bool is_array_tex); - -LLVMValueRef -ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask); - -LLVMValueRef -ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane); - -LLVMValueRef -ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane); - -LLVMValueRef -ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); - -LLVMValueRef -ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); - -LLVMValueRef -ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); - -LLVMValueRef -ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size); - -/** - * Common arguments for a scan/reduce operation that accumulates per-wave - * values across an entire workgroup, while respecting the order of waves. - */ -struct ac_wg_scan { - bool enable_reduce; - bool enable_exclusive; - bool enable_inclusive; - nir_op op; - LLVMValueRef src; /* clobbered! */ - LLVMValueRef result_reduce; - LLVMValueRef result_exclusive; - LLVMValueRef result_inclusive; - LLVMValueRef extra; - LLVMValueRef waveidx; - LLVMValueRef numwaves; /* only needed for "reduce" operations */ - - /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */ - LLVMValueRef scratch; - unsigned maxwaves; -}; - -void -ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); - -void -ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); - -LLVMValueRef -ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, - unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3); - -LLVMValueRef -ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index); - -LLVMValueRef -ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); - -LLVMValueRef -ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); - -LLVMValueRef -ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); - -LLVMValueRef -ac_build_load_helper_invocation(struct ac_llvm_context *ctx); - -LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, - LLVMValueRef *args, unsigned num_args); - -LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, - LLVMValueRef ptr, LLVMValueRef val, - const char *sync_scope); - -LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef cmp, LLVMValueRef val, - const char *sync_scope); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_cull.c mesa-20.0.8/src/amd/common/ac_llvm_cull.c --- mesa-19.2.8/src/amd/common/ac_llvm_cull.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_cull.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,275 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ - -#include "ac_llvm_cull.h" -#include - -struct ac_position_w_info { - /* If a primitive intersects the W=0 plane, it causes a reflection - * of the determinant used for face culling. Every vertex behind - * the W=0 plane negates the determinant, so having 2 vertices behind - * the plane has no effect. This is i1 true if the determinant should be - * negated. - */ - LLVMValueRef w_reflection; - - /* If we simplify the "-w <= p <= w" view culling equation, we get - * "-w <= w", which can't be satisfied when w is negative. - * In perspective projection, a negative W means that the primitive - * is behind the viewer, but the equation is independent of the type - * of projection. - * - * w_accepted is false when all W are negative and therefore - * the primitive is invisible. - */ - LLVMValueRef w_accepted; - - LLVMValueRef all_w_positive; - LLVMValueRef any_w_negative; -}; - -static void ac_analyze_position_w(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - struct ac_position_w_info *w) -{ - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef all_w_negative = ctx->i1true; - - w->w_reflection = ctx->i1false; - w->any_w_negative = ctx->i1false; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef neg_w; - - neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); - /* If neg_w is true, negate w_reflection. */ - w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); - w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); - all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); - } - w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); - w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); -} - -/* Perform front/back face culling and return true if the primitive is accepted. */ -static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - struct ac_position_w_info *w, - bool cull_front, - bool cull_back, - bool cull_zero_area) -{ - LLVMBuilderRef builder = ctx->builder; - - if (cull_front && cull_back) - return ctx->i1false; - - if (!cull_front && !cull_back && !cull_zero_area) - return ctx->i1true; - - /* Front/back face culling. Also if the determinant == 0, the triangle - * area is 0. - */ - LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); - LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); - LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); - LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); - LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); - LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); - LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); - - /* Negative W negates the determinant. */ - det = LLVMBuildSelect(builder, w->w_reflection, - LLVMBuildFNeg(builder, det, ""), - det, ""); - - LLVMValueRef accepted = NULL; - if (cull_front) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_back) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_zero_area) { - accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); - } - return accepted; -} - -/* Perform view culling and small primitive elimination and return true - * if the primitive is accepted and initially_accepted == true. */ -static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - struct ac_position_w_info *w, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - bool cull_view_xy, - bool cull_view_near_z, - bool cull_view_far_z, - bool cull_small_prims, - bool use_halfz_clip_space) -{ - LLVMBuilderRef builder = ctx->builder; - - if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) - return ctx->i1true; - - /* Skip the culling if the primitive has already been rejected or - * if any W is negative. The bounding box culling doesn't work when - * W is negative. - */ - LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, - w->all_w_positive, ""); - LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); - LLVMBuildStore(builder, initially_accepted, accepted_var); - - ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); - { - LLVMValueRef bbox_min[3], bbox_max[3]; - LLVMValueRef accepted = initially_accepted; - - /* Compute the primitive bounding box for easy culling. */ - for (unsigned chan = 0; chan < 3; chan++) { - bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); - bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); - - bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); - bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); - } - - /* View culling. */ - if (cull_view_xy || cull_view_near_z || cull_view_far_z) { - for (unsigned chan = 0; chan < 3; chan++) { - LLVMValueRef visible; - - if ((cull_view_xy && chan <= 1) || - (cull_view_near_z && chan == 2)) { - float t = chan == 2 && use_halfz_clip_space ? 0 : -1; - visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], - LLVMConstReal(ctx->f32, t), ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - if ((cull_view_xy && chan <= 1) || - (cull_view_far_z && chan == 2)) { - visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], - ctx->f32_1, ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - } - } - - /* Small primitive elimination. */ - if (cull_small_prims) { - /* Assuming a sample position at (0.5, 0.5), if we round - * the bounding box min/max extents and the results of - * the rounding are equal in either the X or Y direction, - * the bounding box does not intersect the sample. - * - * See these GDC slides for pictures: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - LLVMValueRef min, max, not_equal[2], visible; - - for (unsigned chan = 0; chan < 2; chan++) { - /* Convert the position to screen-space coordinates. */ - min = ac_build_fmad(ctx, bbox_min[chan], - vp_scale[chan], vp_translate[chan]); - max = ac_build_fmad(ctx, bbox_max[chan], - vp_scale[chan], vp_translate[chan]); - /* Scale the bounding box according to the precision of - * the rasterizer and the number of MSAA samples. */ - min = LLVMBuildFSub(builder, min, small_prim_precision, ""); - max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); - - /* Determine if the bbox intersects the sample point. - * It also works for MSAA, but vp_scale, vp_translate, - * and small_prim_precision are computed differently. - */ - min = ac_build_round(ctx, min); - max = ac_build_round(ctx, max); - not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); - } - visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - LLVMBuildStore(builder, accepted, accepted_var); - } - ac_build_endif(ctx, 10000000); - - return LLVMBuildLoad(builder, accepted_var, ""); -} - -/** - * Return i1 true if the primitive is accepted (not culled). - * - * \param pos Vertex positions 3x vec4 - * \param initially_accepted AND'ed with the result. Some computations can be - * skipped if this is false. - * \param vp_scale Viewport scale XY. - * For MSAA, multiply them by the number of samples. - * \param vp_translate Viewport translation XY. - * For MSAA, multiply them by the number of samples. - * \param small_prim_precision Precision of small primitive culling. This should - * be the same as or greater than the precision of - * the rasterizer. Set to num_samples / 2^subpixel_bits. - * subpixel_bits are defined by the quantization mode. - * \param options See ac_cull_options. - */ -LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - struct ac_cull_options *options) -{ - struct ac_position_w_info w; - ac_analyze_position_w(ctx, pos, &w); - - /* W culling. */ - LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; - accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); - - /* Face culling. */ - accepted = LLVMBuildAnd(ctx->builder, accepted, - ac_cull_face(ctx, pos, &w, - options->cull_front, - options->cull_back, - options->cull_zero_area), ""); - - /* View culling and small primitive elimination. */ - accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, - small_prim_precision, - options->cull_view_xy, - options->cull_view_near_z, - options->cull_view_far_z, - options->cull_small_prims, - options->use_halfz_clip_space); - return accepted; -} diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_cull.h mesa-20.0.8/src/amd/common/ac_llvm_cull.h --- mesa-19.2.8/src/amd/common/ac_llvm_cull.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_cull.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ - -#ifndef AC_LLVM_CULL_H -#define AC_LLVM_CULL_H - -#include "ac_llvm_build.h" - -struct ac_cull_options { - /* In general, I recommend setting all to true except view Z culling, - * which isn't so effective because W culling is cheaper and partially - * replaces near Z culling, and you don't need to set Position.z - * if Z culling is disabled. - * - * If something doesn't work, turn some of these off to find out what. - */ - bool cull_front; - bool cull_back; - bool cull_view_xy; - bool cull_view_near_z; - bool cull_view_far_z; - bool cull_small_prims; - bool cull_zero_area; - bool cull_w; /* cull primitives with all W < 0 */ - - bool use_halfz_clip_space; -}; - -LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - struct ac_cull_options *options); - -#endif diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_helper.cpp mesa-20.0.8/src/amd/common/ac_llvm_helper.cpp --- mesa-19.2.8/src/amd/common/ac_llvm_helper.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_helper.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,288 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ - -/* based on Marek's patch to lp_bld_misc.cpp */ - -// Workaround http://llvm.org/PR23628 -#pragma push_macro("DEBUG") -#undef DEBUG - -#include - -#include "ac_binary.h" -#include "ac_llvm_util.h" -#include "ac_llvm_build.h" - -#include "util/macros.h" - -#include -#include -#include -#include -#include - -#include - -void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) -{ - llvm::Argument *A = llvm::unwrap(val); - A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes)); -} - -bool ac_is_sgpr_param(LLVMValueRef arg) -{ - llvm::Argument *A = llvm::unwrap(arg); - llvm::AttributeList AS = A->getParent()->getAttributes(); - unsigned ArgNo = A->getArgNo(); - return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); -} - -LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call) -{ - return LLVMGetCalledValue(call); -} - -bool ac_llvm_is_function(LLVMValueRef v) -{ - return LLVMGetValueKind(v) == LLVMFunctionValueKind; -} - -LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx) -{ - llvm::TargetMachine *TM = reinterpret_cast(tm); - LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx); - - llvm::unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple()); - llvm::unwrap(module)->setDataLayout(TM->createDataLayout()); - return module; -} - -LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, - enum ac_float_mode float_mode) -{ - LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); - - llvm::FastMathFlags flags; - - switch (float_mode) { - case AC_FLOAT_MODE_DEFAULT: - break; - case AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH: - flags.setNoSignedZeros(); - llvm::unwrap(builder)->setFastMathFlags(flags); - break; - case AC_FLOAT_MODE_UNSAFE_FP_MATH: - flags.setFast(); - llvm::unwrap(builder)->setFastMathFlags(flags); - break; - } - - return builder; -} - -LLVMTargetLibraryInfoRef -ac_create_target_library_info(const char *triple) -{ - return reinterpret_cast(new llvm::TargetLibraryInfoImpl(llvm::Triple(triple))); -} - -void -ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) -{ - delete reinterpret_cast(library_info); -} - -/* Implementation of raw_pwrite_stream that works on malloc()ed memory for - * better compatibility with C code. */ -struct raw_memory_ostream : public llvm::raw_pwrite_stream { - char *buffer; - size_t written; - size_t bufsize; - - raw_memory_ostream() - { - buffer = NULL; - written = 0; - bufsize = 0; - SetUnbuffered(); - } - - ~raw_memory_ostream() - { - free(buffer); - } - - void clear() - { - written = 0; - } - - void take(char *&out_buffer, size_t &out_size) - { - out_buffer = buffer; - out_size = written; - buffer = NULL; - written = 0; - bufsize = 0; - } - - void flush() = delete; - - void write_impl(const char *ptr, size_t size) override - { - if (unlikely(written + size < written)) - abort(); - if (written + size > bufsize) { - bufsize = MAX3(1024, written + size, bufsize / 3 * 4); - buffer = (char *)realloc(buffer, bufsize); - if (!buffer) { - fprintf(stderr, "amd: out of memory allocating ELF buffer\n"); - abort(); - } - } - memcpy(buffer + written, ptr, size); - written += size; - } - - void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override - { - assert(offset == (size_t)offset && - offset + size >= offset && offset + size <= written); - memcpy(buffer + offset, ptr, size); - } - - uint64_t current_pos() const override - { - return written; - } -}; - -/* The LLVM compiler is represented as a pass manager containing passes for - * optimizations, instruction selection, and code generation. - */ -struct ac_compiler_passes { - raw_memory_ostream ostream; /* ELF shader binary stream */ - llvm::legacy::PassManager passmgr; /* list of passes */ -}; - -struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm) -{ - struct ac_compiler_passes *p = new ac_compiler_passes(); - if (!p) - return NULL; - - llvm::TargetMachine *TM = reinterpret_cast(tm); - - if (TM->addPassesToEmitFile(p->passmgr, p->ostream, - nullptr, - llvm::TargetMachine::CGFT_ObjectFile)) { - fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n"); - delete p; - return NULL; - } - return p; -} - -void ac_destroy_llvm_passes(struct ac_compiler_passes *p) -{ - delete p; -} - -/* This returns false on failure. */ -bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, - char **pelf_buffer, size_t *pelf_size) -{ - p->passmgr.run(*llvm::unwrap(module)); - p->ostream.take(*pelf_buffer, *pelf_size); - return true; -} - -void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr) -{ - llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass()); -} - -void ac_enable_global_isel(LLVMTargetMachineRef tm) -{ - reinterpret_cast(tm)->setGlobalISel(true); -} - -LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, - LLVMValueRef ptr, LLVMValueRef val, - const char *sync_scope) { - llvm::AtomicRMWInst::BinOp binop; - switch (op) { - case LLVMAtomicRMWBinOpXchg: - binop = llvm::AtomicRMWInst::Xchg; - break; - case LLVMAtomicRMWBinOpAdd: - binop = llvm::AtomicRMWInst::Add; - break; - case LLVMAtomicRMWBinOpSub: - binop = llvm::AtomicRMWInst::Sub; - break; - case LLVMAtomicRMWBinOpAnd: - binop = llvm::AtomicRMWInst::And; - break; - case LLVMAtomicRMWBinOpNand: - binop = llvm::AtomicRMWInst::Nand; - break; - case LLVMAtomicRMWBinOpOr: - binop = llvm::AtomicRMWInst::Or; - break; - case LLVMAtomicRMWBinOpXor: - binop = llvm::AtomicRMWInst::Xor; - break; - case LLVMAtomicRMWBinOpMax: - binop = llvm::AtomicRMWInst::Max; - break; - case LLVMAtomicRMWBinOpMin: - binop = llvm::AtomicRMWInst::Min; - break; - case LLVMAtomicRMWBinOpUMax: - binop = llvm::AtomicRMWInst::UMax; - break; - case LLVMAtomicRMWBinOpUMin: - binop = llvm::AtomicRMWInst::UMin; - break; - default: - unreachable(!"invalid LLVMAtomicRMWBinOp"); - break; - } - unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); - return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( - binop, llvm::unwrap(ptr), llvm::unwrap(val), - llvm::AtomicOrdering::SequentiallyConsistent, SSID)); -} - -LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef cmp, LLVMValueRef val, - const char *sync_scope) { - unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); - return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicCmpXchg( - llvm::unwrap(ptr), llvm::unwrap(cmp), llvm::unwrap(val), - llvm::AtomicOrdering::SequentiallyConsistent, - llvm::AtomicOrdering::SequentiallyConsistent, SSID)); -} diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_util.c mesa-20.0.8/src/amd/common/ac_llvm_util.c --- mesa-19.2.8/src/amd/common/ac_llvm_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_util.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,387 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ -/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ -#include "ac_llvm_util.h" -#include "ac_llvm_build.h" -#include "util/bitscan.h" -#include -#include -#include -#include -#include -#include "c11/threads.h" -#include "gallivm/lp_bld_misc.h" -#include "util/u_math.h" - -#include -#include -#include - -static void ac_init_llvm_target() -{ - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); - - /* For inline assembly. */ - LLVMInitializeAMDGPUAsmParser(); - - /* Workaround for bug in llvm 4.0 that causes image intrinsics - * to disappear. - * https://reviews.llvm.org/D26348 - * - * "mesa" is the prefix for error messages. - * - * -global-isel-abort=2 is a no-op unless global isel has been enabled. - * This option tells the backend to fall-back to SelectionDAG and print - * a diagnostic message if global isel fails. - */ - const char *argv[] = { "mesa", "-simplifycfg-sink-common=false", "-global-isel-abort=2" }; - LLVMParseCommandLineOptions(ARRAY_SIZE(argv), argv, NULL); -} - -static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; - -void ac_init_llvm_once(void) -{ - call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); -} - -static LLVMTargetRef ac_get_llvm_target(const char *triple) -{ - LLVMTargetRef target = NULL; - char *err_message = NULL; - - if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { - fprintf(stderr, "Cannot find target for triple %s ", triple); - if (err_message) { - fprintf(stderr, "%s\n", err_message); - } - LLVMDisposeMessage(err_message); - return NULL; - } - return target; -} - -const char *ac_get_llvm_processor_name(enum radeon_family family) -{ - switch (family) { - case CHIP_TAHITI: - return "tahiti"; - case CHIP_PITCAIRN: - return "pitcairn"; - case CHIP_VERDE: - return "verde"; - case CHIP_OLAND: - return "oland"; - case CHIP_HAINAN: - return "hainan"; - case CHIP_BONAIRE: - return "bonaire"; - case CHIP_KABINI: - return "kabini"; - case CHIP_KAVERI: - return "kaveri"; - case CHIP_HAWAII: - return "hawaii"; - case CHIP_TONGA: - return "tonga"; - case CHIP_ICELAND: - return "iceland"; - case CHIP_CARRIZO: - return "carrizo"; - case CHIP_FIJI: - return "fiji"; - case CHIP_STONEY: - return "stoney"; - case CHIP_POLARIS10: - return "polaris10"; - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - return "polaris11"; - case CHIP_VEGA10: - return "gfx900"; - case CHIP_RAVEN: - return "gfx902"; - case CHIP_VEGA12: - return "gfx904"; - case CHIP_VEGA20: - return "gfx906"; - case CHIP_RAVEN2: - case CHIP_RENOIR: - return HAVE_LLVM >= 0x0800 ? "gfx909" : "gfx902"; - case CHIP_ARCTURUS: - return "gfx908"; - case CHIP_NAVI10: - return "gfx1010"; - case CHIP_NAVI12: - return "gfx1011"; - case CHIP_NAVI14: - return "gfx1012"; - default: - return ""; - } -} - -static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, - enum ac_target_machine_options tm_options, - LLVMCodeGenOptLevel level, - const char **out_triple) -{ - assert(family >= CHIP_TAHITI); - char features[256]; - const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--"; - LLVMTargetRef target = ac_get_llvm_target(triple); - - snprintf(features, sizeof(features), - "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s%s", - HAVE_LLVM >= 0x0800 ? "" : ",+vgpr-spilling", - family >= CHIP_NAVI10 && !(tm_options & AC_TM_WAVE32) ? - ",+wavefrontsize64,-wavefrontsize32" : "", - tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "", - tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", - tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", - tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "", - tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : ""); - - LLVMTargetMachineRef tm = LLVMCreateTargetMachine( - target, - triple, - ac_get_llvm_processor_name(family), - features, - level, - LLVMRelocDefault, - LLVMCodeModelDefault); - - if (out_triple) - *out_triple = triple; - if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL) - ac_enable_global_isel(tm); - return tm; -} - -static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, - bool check_ir) -{ - LLVMPassManagerRef passmgr = LLVMCreatePassManager(); - if (!passmgr) - return NULL; - - if (target_library_info) - LLVMAddTargetLibraryInfo(target_library_info, - passmgr); - - if (check_ir) - LLVMAddVerifierPass(passmgr); - LLVMAddAlwaysInlinerPass(passmgr); - /* Normally, the pass manager runs all passes on one function before - * moving onto another. Adding a barrier no-op pass forces the pass - * manager to run the inliner on all functions first, which makes sure - * that the following passes are only run on the remaining non-inline - * function, so it removes useless work done on dead inline functions. - */ - ac_llvm_add_barrier_noop_pass(passmgr); - /* This pass should eliminate all the load and store instructions. */ - LLVMAddPromoteMemoryToRegisterPass(passmgr); - LLVMAddScalarReplAggregatesPass(passmgr); - LLVMAddLICMPass(passmgr); - LLVMAddAggressiveDCEPass(passmgr); - LLVMAddCFGSimplificationPass(passmgr); - /* This is recommended by the instruction combining pass. */ - LLVMAddEarlyCSEMemSSAPass(passmgr); - LLVMAddInstructionCombiningPass(passmgr); - return passmgr; -} - -static const char *attr_to_str(enum ac_func_attr attr) -{ - switch (attr) { - case AC_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; - case AC_FUNC_ATTR_INREG: return "inreg"; - case AC_FUNC_ATTR_NOALIAS: return "noalias"; - case AC_FUNC_ATTR_NOUNWIND: return "nounwind"; - case AC_FUNC_ATTR_READNONE: return "readnone"; - case AC_FUNC_ATTR_READONLY: return "readonly"; - case AC_FUNC_ATTR_WRITEONLY: return "writeonly"; - case AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; - case AC_FUNC_ATTR_CONVERGENT: return "convergent"; - default: - fprintf(stderr, "Unhandled function attribute: %x\n", attr); - return 0; - } -} - -void -ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, - int attr_idx, enum ac_func_attr attr) -{ - const char *attr_name = attr_to_str(attr); - unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, - strlen(attr_name)); - LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); - - if (LLVMIsAFunction(function)) - LLVMAddAttributeAtIndex(function, attr_idx, llvm_attr); - else - LLVMAddCallSiteAttribute(function, attr_idx, llvm_attr); -} - -void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, - unsigned attrib_mask) -{ - attrib_mask |= AC_FUNC_ATTR_NOUNWIND; - attrib_mask &= ~AC_FUNC_ATTR_LEGACY; - - while (attrib_mask) { - enum ac_func_attr attr = 1u << u_bit_scan(&attrib_mask); - ac_add_function_attr(ctx, function, -1, attr); - } -} - -void -ac_dump_module(LLVMModuleRef module) -{ - char *str = LLVMPrintModuleToString(module); - fprintf(stderr, "%s", str); - LLVMDisposeMessage(str); -} - -void -ac_llvm_add_target_dep_function_attr(LLVMValueRef F, - const char *name, unsigned value) -{ - char str[16]; - - snprintf(str, sizeof(str), "0x%x", value); - LLVMAddTargetDependentFunctionAttr(F, name, str); -} - -void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size) -{ - if (!size) - return; - - char str[32]; - snprintf(str, sizeof(str), "%u,%u", size, size); - LLVMAddTargetDependentFunctionAttr(F, "amdgpu-flat-work-group-size", str); -} - -unsigned -ac_count_scratch_private_memory(LLVMValueRef function) -{ - unsigned private_mem_vgprs = 0; - - /* Process all LLVM instructions. */ - LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(function); - while (bb) { - LLVMValueRef next = LLVMGetFirstInstruction(bb); - - while (next) { - LLVMValueRef inst = next; - next = LLVMGetNextInstruction(next); - - if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) - continue; - - LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); - /* No idea why LLVM aligns allocas to 4 elements. */ - unsigned alignment = LLVMGetAlignment(inst); - unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); - private_mem_vgprs += dw_size; - } - bb = LLVMGetNextBasicBlock(bb); - } - - return private_mem_vgprs; -} - -bool -ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, - enum radeon_family family, - enum ac_target_machine_options tm_options) -{ - const char *triple; - memset(compiler, 0, sizeof(*compiler)); - - compiler->tm = ac_create_target_machine(family, tm_options, - LLVMCodeGenLevelDefault, - &triple); - if (!compiler->tm) - return false; - - if (tm_options & AC_TM_CREATE_LOW_OPT) { - compiler->low_opt_tm = - ac_create_target_machine(family, tm_options, - LLVMCodeGenLevelLess, NULL); - if (!compiler->low_opt_tm) - goto fail; - } - - if (family >= CHIP_NAVI10) { - assert(!(tm_options & AC_TM_CREATE_LOW_OPT)); - compiler->tm_wave32 = ac_create_target_machine(family, - tm_options | AC_TM_WAVE32, - LLVMCodeGenLevelDefault, - NULL); - if (!compiler->tm_wave32) - goto fail; - } - - compiler->target_library_info = - ac_create_target_library_info(triple); - if (!compiler->target_library_info) - goto fail; - - compiler->passmgr = ac_create_passmgr(compiler->target_library_info, - tm_options & AC_TM_CHECK_IR); - if (!compiler->passmgr) - goto fail; - - return true; -fail: - ac_destroy_llvm_compiler(compiler); - return false; -} - -void -ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) -{ - ac_destroy_llvm_passes(compiler->passes); - ac_destroy_llvm_passes(compiler->passes_wave32); - ac_destroy_llvm_passes(compiler->low_opt_passes); - - if (compiler->passmgr) - LLVMDisposePassManager(compiler->passmgr); - if (compiler->target_library_info) - ac_dispose_target_library_info(compiler->target_library_info); - if (compiler->low_opt_tm) - LLVMDisposeTargetMachine(compiler->low_opt_tm); - if (compiler->tm) - LLVMDisposeTargetMachine(compiler->tm); - if (compiler->tm_wave32) - LLVMDisposeTargetMachine(compiler->tm_wave32); -} diff -Nru mesa-19.2.8/src/amd/common/ac_llvm_util.h mesa-20.0.8/src/amd/common/ac_llvm_util.h --- mesa-19.2.8/src/amd/common/ac_llvm_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_llvm_util.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,162 +0,0 @@ -/* - * Copyright 2016 Bas Nieuwenhuizen - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - */ - -#ifndef AC_LLVM_UTIL_H -#define AC_LLVM_UTIL_H - -#include -#include - -#include "amd_family.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct ac_compiler_passes; - -enum ac_func_attr { - AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0), - AC_FUNC_ATTR_INREG = (1 << 2), - AC_FUNC_ATTR_NOALIAS = (1 << 3), - AC_FUNC_ATTR_NOUNWIND = (1 << 4), - AC_FUNC_ATTR_READNONE = (1 << 5), - AC_FUNC_ATTR_READONLY = (1 << 6), - AC_FUNC_ATTR_WRITEONLY = (1 << 7), - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = (1 << 8), - AC_FUNC_ATTR_CONVERGENT = (1 << 9), - - /* Legacy intrinsic that needs attributes on function declarations - * and they must match the internal LLVM definition exactly, otherwise - * intrinsic selection fails. - */ - AC_FUNC_ATTR_LEGACY = (1u << 31), -}; - -enum ac_target_machine_options { - AC_TM_SUPPORTS_SPILL = (1 << 0), - AC_TM_SISCHED = (1 << 1), - AC_TM_FORCE_ENABLE_XNACK = (1 << 2), - AC_TM_FORCE_DISABLE_XNACK = (1 << 3), - AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4), - AC_TM_CHECK_IR = (1 << 5), - AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), - AC_TM_CREATE_LOW_OPT = (1 << 7), - AC_TM_NO_LOAD_STORE_OPT = (1 << 8), - AC_TM_WAVE32 = (1 << 9), -}; - -enum ac_float_mode { - AC_FLOAT_MODE_DEFAULT, - AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, - AC_FLOAT_MODE_UNSAFE_FP_MATH, -}; - -/* Per-thread persistent LLVM objects. */ -struct ac_llvm_compiler { - LLVMTargetLibraryInfoRef target_library_info; - LLVMPassManagerRef passmgr; - - /* Default compiler. */ - LLVMTargetMachineRef tm; - struct ac_compiler_passes *passes; - - /* Wave32 compiler for GFX10. */ - LLVMTargetMachineRef tm_wave32; - struct ac_compiler_passes *passes_wave32; - - /* Optional compiler for faster compilation with fewer optimizations. - * LLVM modules can be created with "tm" too. There is no difference. - */ - LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */ - struct ac_compiler_passes *low_opt_passes; -}; - -const char *ac_get_llvm_processor_name(enum radeon_family family); -void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); -bool ac_is_sgpr_param(LLVMValueRef param); -void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, - int attr_idx, enum ac_func_attr attr); -void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, - unsigned attrib_mask); -void ac_dump_module(LLVMModuleRef module); - -LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call); -bool ac_llvm_is_function(LLVMValueRef v); -LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx); - -LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, - enum ac_float_mode float_mode); - -void -ac_llvm_add_target_dep_function_attr(LLVMValueRef F, - const char *name, unsigned value); -void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size); - -static inline unsigned -ac_get_load_intr_attribs(bool can_speculate) -{ - /* READNONE means writes can't affect it, while READONLY means that - * writes can affect it. */ - return can_speculate ? AC_FUNC_ATTR_READNONE : - AC_FUNC_ATTR_READONLY; -} - -unsigned -ac_count_scratch_private_memory(LLVMValueRef function); - -LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple); -void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info); -void ac_init_llvm_once(void); - - -bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, - enum radeon_family family, - enum ac_target_machine_options tm_options); -void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler); - -struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm); -void ac_destroy_llvm_passes(struct ac_compiler_passes *p); -bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, - char **pelf_buffer, size_t *pelf_size); -void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr); -void ac_enable_global_isel(LLVMTargetMachineRef tm); - -static inline bool -ac_has_vec3_support(enum chip_class chip, bool use_format) -{ - if (chip == GFX6 && !use_format) { - /* GFX6 only supports vec3 with load/store format. */ - return false; - } - - return HAVE_LLVM >= 0x900; -} - -#ifdef __cplusplus -} -#endif - -#endif /* AC_LLVM_UTIL_H */ diff -Nru mesa-19.2.8/src/amd/common/ac_nir_to_llvm.c mesa-20.0.8/src/amd/common/ac_nir_to_llvm.c --- mesa-19.2.8/src/amd/common/ac_nir_to_llvm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_nir_to_llvm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,4777 +0,0 @@ -/* - * Copyright © 2016 Bas Nieuwenhuizen - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "ac_nir_to_llvm.h" -#include "ac_llvm_build.h" -#include "ac_llvm_util.h" -#include "ac_binary.h" -#include "sid.h" -#include "nir/nir.h" -#include "nir/nir_deref.h" -#include "util/bitscan.h" -#include "util/u_math.h" -#include "ac_shader_abi.h" -#include "ac_shader_util.h" - -struct ac_nir_context { - struct ac_llvm_context ac; - struct ac_shader_abi *abi; - - gl_shader_stage stage; - shader_info *info; - - LLVMValueRef *ssa_defs; - - struct hash_table *defs; - struct hash_table *phis; - struct hash_table *vars; - - LLVMValueRef main_function; - LLVMBasicBlockRef continue_block; - LLVMBasicBlockRef break_block; - - int num_locals; - LLVMValueRef *locals; -}; - -static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - enum ac_descriptor_type desc_type, - const nir_instr *instr, - bool image, bool write); - -static void -build_store_values_extended(struct ac_llvm_context *ac, - LLVMValueRef *values, - unsigned value_count, - unsigned value_stride, - LLVMValueRef vec) -{ - LLVMBuilderRef builder = ac->builder; - unsigned i; - - for (i = 0; i < value_count; i++) { - LLVMValueRef ptr = values[i * value_stride]; - LLVMValueRef index = LLVMConstInt(ac->i32, i, false); - LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); - LLVMBuildStore(builder, value, ptr); - } -} - -static enum ac_image_dim -get_ac_sampler_dim(const struct ac_llvm_context *ctx, enum glsl_sampler_dim dim, - bool is_array) -{ - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - if (ctx->chip_class == GFX9) - return is_array ? ac_image_2darray : ac_image_2d; - return is_array ? ac_image_1darray : ac_image_1d; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_EXTERNAL: - return is_array ? ac_image_2darray : ac_image_2d; - case GLSL_SAMPLER_DIM_3D: - return ac_image_3d; - case GLSL_SAMPLER_DIM_CUBE: - return ac_image_cube; - case GLSL_SAMPLER_DIM_MS: - return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; - case GLSL_SAMPLER_DIM_SUBPASS: - return ac_image_2darray; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return ac_image_2darraymsaa; - default: - unreachable("bad sampler dim"); - } -} - -static enum ac_image_dim -get_ac_image_dim(const struct ac_llvm_context *ctx, enum glsl_sampler_dim sdim, - bool is_array) -{ - enum ac_image_dim dim = get_ac_sampler_dim(ctx, sdim, is_array); - - if (dim == ac_image_cube || - (ctx->chip_class <= GFX8 && dim == ac_image_3d)) - dim = ac_image_2darray; - - return dim; -} - -static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, - const nir_ssa_def *def) -{ - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size); - if (def->num_components > 1) { - type = LLVMVectorType(type, def->num_components); - } - return type; -} - -static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src) -{ - assert(src.is_ssa); - return nir->ssa_defs[src.ssa->index]; -} - -static LLVMValueRef -get_memory_ptr(struct ac_nir_context *ctx, nir_src src) -{ - LLVMValueRef ptr = get_src(ctx, src); - ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, ""); - int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - - return LLVMBuildBitCast(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, addr_space), ""); -} - -static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, - const struct nir_block *b) -{ - struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b); - return (LLVMBasicBlockRef)entry->data; -} - -static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, - nir_alu_src src, - unsigned num_components) -{ - LLVMValueRef value = get_src(ctx, src.src); - bool need_swizzle = false; - - assert(value); - unsigned src_components = ac_get_llvm_num_components(value); - for (unsigned i = 0; i < num_components; ++i) { - assert(src.swizzle[i] < src_components); - if (src.swizzle[i] != i) - need_swizzle = true; - } - - if (need_swizzle || num_components != src_components) { - LLVMValueRef masks[] = { - LLVMConstInt(ctx->ac.i32, src.swizzle[0], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[1], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[2], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)}; - - if (src_components > 1 && num_components == 1) { - value = LLVMBuildExtractElement(ctx->ac.builder, value, - masks[0], ""); - } else if (src_components == 1 && num_components > 1) { - LLVMValueRef values[] = {value, value, value, value}; - value = ac_build_gather_values(&ctx->ac, values, num_components); - } else { - LLVMValueRef swizzle = LLVMConstVector(masks, num_components); - value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, - swizzle, ""); - } - } - assert(!src.negate); - assert(!src.abs); - return value; -} - -static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, - LLVMIntPredicate pred, LLVMValueRef src0, - LLVMValueRef src1) -{ - LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); - return LLVMBuildSelect(ctx->builder, result, - LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), - ctx->i32_0, ""); -} - -static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, - LLVMRealPredicate pred, LLVMValueRef src0, - LLVMValueRef src1) -{ - LLVMValueRef result; - src0 = ac_to_float(ctx, src0); - src1 = ac_to_float(ctx, src1); - result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); - return LLVMBuildSelect(ctx->builder, result, - LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), - ctx->i32_0, ""); -} - -static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0) -{ - char name[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - }; - - ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); -} - -static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0, LLVMValueRef src1) -{ - char name[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - ac_to_float(ctx, src1), - }; - - ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); -} - -static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) -{ - char name[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - ac_to_float(ctx, src1), - ac_to_float(ctx, src2), - }; - - ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, - ac_get_elem_bits(ctx, result_type)); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); -} - -static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) -{ - assert(LLVMGetTypeKind(LLVMTypeOf(src0)) != LLVMVectorTypeKind); - - LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, - ctx->i32_0, ""); - return LLVMBuildSelect(ctx->builder, v, - ac_to_integer_or_pointer(ctx, src1), - ac_to_integer_or_pointer(ctx, src2), ""); -} - -static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, "")); -} - -static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, - const char *intrin, - LLVMValueRef src0, LLVMValueRef src1) -{ - LLVMTypeRef ret_type; - LLVMTypeRef types[] = { ctx->i32, ctx->i1 }; - LLVMValueRef res; - LLVMValueRef params[] = { src0, src1 }; - ret_type = LLVMStructTypeInContext(ctx->context, types, - 2, true); - - res = ac_build_intrinsic(ctx, intrin, ret_type, - params, 2, AC_FUNC_ATTR_READNONE); - - res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); - res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); - return res; -} - -static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, - LLVMValueRef src0, - unsigned bitsize) -{ - LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, - LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), - ""); - result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); - - switch (bitsize) { - case 16: - return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, ""); - case 32: - return result; - case 64: - return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); - default: - unreachable("Unsupported bit size."); - } -} - -static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - src0 = ac_to_float(ctx, src0); - LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); - return LLVMBuildSExt(ctx->builder, - LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""), - ctx->i32, ""); -} - -static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, - LLVMValueRef src0, - unsigned bitsize) -{ - LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); - - switch (bitsize) { - case 8: - return LLVMBuildTrunc(ctx->builder, result, ctx->i8, ""); - case 16: - return LLVMBuildTrunc(ctx->builder, result, ctx->i16, ""); - case 32: - return result; - case 64: - return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); - default: - unreachable("Unsupported bit size."); - } -} - -static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); - return LLVMBuildSExt(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""), - ctx->i32, ""); -} - -static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - LLVMValueRef result; - LLVMValueRef cond = NULL; - - src0 = ac_to_float(ctx, src0); - result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, ""); - - if (ctx->chip_class >= GFX8) { - LLVMValueRef args[2]; - /* Check if the result is a denormal - and flush to 0 if so. */ - args[0] = result; - args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false); - cond = ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE); - } - - /* need to convert back up to f32 */ - result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); - - if (ctx->chip_class >= GFX8) - result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); - else { - /* for GFX6-GFX7 */ - /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, - * so compare the result and flush to 0 if it's smaller. - */ - LLVMValueRef temp, cond2; - temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result); - cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT, - LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""), - temp, ""); - cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, - temp, ctx->f32_0, ""); - cond = LLVMBuildAnd(ctx->builder, cond, cond2, ""); - result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); - } - return result; -} - -static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1) -{ - LLVMValueRef dst64, result; - src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); - src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); - - dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); - dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); - result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); - return result; -} - -static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1) -{ - LLVMValueRef dst64, result; - src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); - src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); - - dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); - dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); - result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); - return result; -} - -static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, - LLVMValueRef bits, LLVMValueRef offset) -{ - /* mask = ((1 << bits) - 1) << offset */ - return LLVMBuildShl(ctx->builder, - LLVMBuildSub(ctx->builder, - LLVMBuildShl(ctx->builder, - ctx->i32_1, - bits, ""), - ctx->i32_1, ""), - offset, ""); -} - -static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, - LLVMValueRef mask, LLVMValueRef insert, - LLVMValueRef base) -{ - /* Calculate: - * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base)) - * Use the right-hand side, which the LLVM backend can convert to V_BFI. - */ - return LLVMBuildXor(ctx->builder, base, - LLVMBuildAnd(ctx->builder, mask, - LLVMBuildXor(ctx->builder, insert, base, ""), ""), ""); -} - -static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, - LLVMValueRef src0, - LLVMValueRef (*pack)(struct ac_llvm_context *ctx, - LLVMValueRef args[2])) -{ - LLVMValueRef comp[2]; - - src0 = ac_to_float(ctx, src0); - comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, ""); - comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, ""); - - return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, ""); -} - -static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); - LLVMValueRef temps[2], val; - int i; - - for (i = 0; i < 2; i++) { - val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0; - val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, ""); - val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, ""); - temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, ""); - } - return ac_build_gather_values(ctx, temps, 2); -} - -static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, - nir_op op, - LLVMValueRef src0) -{ - unsigned mask; - int idx; - LLVMValueRef result; - - if (op == nir_op_fddx_fine) - mask = AC_TID_MASK_LEFT; - else if (op == nir_op_fddy_fine) - mask = AC_TID_MASK_TOP; - else - mask = AC_TID_MASK_TOP_LEFT; - - /* for DDX we want to next X pixel, DDY next Y pixel. */ - if (op == nir_op_fddx_fine || - op == nir_op_fddx_coarse || - op == nir_op_fddx) - idx = 1; - else - idx = 2; - - result = ac_build_ddxy(&ctx->ac, mask, idx, src0); - return result; -} - -static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) -{ - LLVMValueRef src[4], result = NULL; - unsigned num_components = instr->dest.dest.ssa.num_components; - unsigned src_components; - LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); - - assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); - switch (instr->op) { - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - src_components = 1; - break; - case nir_op_pack_half_2x16: - case nir_op_pack_snorm_2x16: - case nir_op_pack_unorm_2x16: - src_components = 2; - break; - case nir_op_unpack_half_2x16: - src_components = 1; - break; - case nir_op_cube_face_coord: - case nir_op_cube_face_index: - src_components = 3; - break; - default: - src_components = num_components; - break; - } - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - src[i] = get_alu_src(ctx, instr->src[i], src_components); - - switch (instr->op) { - case nir_op_mov: - result = src[0]; - break; - case nir_op_fneg: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFNeg(ctx->ac.builder, src[0], ""); - break; - case nir_op_ineg: - result = LLVMBuildNeg(ctx->ac.builder, src[0], ""); - break; - case nir_op_inot: - result = LLVMBuildNot(ctx->ac.builder, src[0], ""); - break; - case nir_op_iadd: - result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fadd: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fsub: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_isub: - result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_imul: - result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_imod: - result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_umod: - result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fmod: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = ac_build_fdiv(&ctx->ac, src[0], src[1]); - result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", - ac_to_float_type(&ctx->ac, def_type), result); - result = LLVMBuildFMul(ctx->ac.builder, src[1] , result, ""); - result = LLVMBuildFSub(ctx->ac.builder, src[0], result, ""); - break; - case nir_op_frem: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFRem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_irem: - result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_idiv: - result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_udiv: - result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fmul: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_frcp: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]); - break; - case nir_op_iand: - result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ior: - result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ixor: - result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ishl: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ishr: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ushr: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ilt32: - result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]); - break; - case nir_op_ine32: - result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]); - break; - case nir_op_ieq32: - result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]); - break; - case nir_op_ige32: - result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]); - break; - case nir_op_ult32: - result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]); - break; - case nir_op_uge32: - result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]); - break; - case nir_op_feq32: - result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]); - break; - case nir_op_fne32: - result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]); - break; - case nir_op_flt32: - result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]); - break; - case nir_op_fge32: - result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]); - break; - case nir_op_fabs: - result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_iabs: - result = emit_iabs(&ctx->ac, src[0]); - break; - case nir_op_imax: - result = ac_build_imax(&ctx->ac, src[0], src[1]); - break; - case nir_op_imin: - result = ac_build_imin(&ctx->ac, src[0], src[1]); - break; - case nir_op_umax: - result = ac_build_umax(&ctx->ac, src[0], src[1]); - break; - case nir_op_umin: - result = ac_build_umin(&ctx->ac, src[0], src[1]); - break; - case nir_op_isign: - result = ac_build_isign(&ctx->ac, src[0], - instr->dest.dest.ssa.bit_size); - break; - case nir_op_fsign: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fsign(&ctx->ac, src[0], - instr->dest.dest.ssa.bit_size); - break; - case nir_op_ffloor: - result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_ftrunc: - result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fceil: - result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fround_even: - result = emit_intrin_1f_param(&ctx->ac, "llvm.rint", - ac_to_float_type(&ctx->ac, def_type),src[0]); - break; - case nir_op_ffract: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fract(&ctx->ac, src[0], - instr->dest.dest.ssa.bit_size); - break; - case nir_op_fsin: - result = emit_intrin_1f_param(&ctx->ac, "llvm.sin", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fcos: - result = emit_intrin_1f_param(&ctx->ac, "llvm.cos", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fsqrt: - result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fexp2: - result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_flog2: - result = emit_intrin_1f_param(&ctx->ac, "llvm.log2", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_frsq: - result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", - ac_to_float_type(&ctx->ac, def_type), src[0]); - result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result); - break; - case nir_op_frexp_exp: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_frexp_exp(&ctx->ac, src[0], - ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16) - result = LLVMBuildSExt(ctx->ac.builder, result, - ctx->ac.i32, ""); - break; - case nir_op_frexp_sig: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_frexp_mant(&ctx->ac, src[0], - instr->dest.dest.ssa.bit_size); - break; - case nir_op_fpow: - result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - break; - case nir_op_fmax: - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (ctx->ac.chip_class < GFX9 && - instr->dest.dest.ssa.bit_size == 32) { - /* Only pre-GFX9 chips do not flush denorms. */ - result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", - ac_to_float_type(&ctx->ac, def_type), - result); - } - break; - case nir_op_fmin: - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (ctx->ac.chip_class < GFX9 && - instr->dest.dest.ssa.bit_size == 32) { - /* Only pre-GFX9 chips do not flush denorms. */ - result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize", - ac_to_float_type(&ctx->ac, def_type), - result); - } - break; - case nir_op_ffma: - result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); - break; - case nir_op_ldexp: - src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, def_type) == 32) - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); - else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); - else - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); - break; - case nir_op_bfm: - result = emit_bfm(&ctx->ac, src[0], src[1]); - break; - case nir_op_bitfield_select: - result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]); - break; - case nir_op_ubfe: - result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false); - break; - case nir_op_ibfe: - result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true); - break; - case nir_op_bitfield_reverse: - result = ac_build_bitfield_reverse(&ctx->ac, src[0]); - break; - case nir_op_bit_count: - result = ac_build_bit_count(&ctx->ac, src[0]); - break; - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - src[i] = ac_to_integer(&ctx->ac, src[i]); - result = ac_build_gather_values(&ctx->ac, src, num_components); - break; - case nir_op_f2i8: - case nir_op_f2i16: - case nir_op_f2i32: - case nir_op_f2i64: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_f2u8: - case nir_op_f2u16: - case nir_op_f2u32: - case nir_op_f2u64: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_i2f16: - case nir_op_i2f32: - case nir_op_i2f64: - result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_u2f16: - case nir_op_u2f32: - case nir_op_u2f64: - result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_f2f16_rtz: - src[0] = ac_to_float(&ctx->ac, src[0]); - if (LLVMTypeOf(src[0]) == ctx->ac.f64) - src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); - LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; - result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - break; - case nir_op_f2f16_rtne: - case nir_op_f2f16: - case nir_op_f2f32: - case nir_op_f2f64: - src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - else - result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_u2u8: - case nir_op_u2u16: - case nir_op_u2u32: - case nir_op_u2u64: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_i2i8: - case nir_op_i2i16: - case nir_op_i2i32: - case nir_op_i2i64: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_b32csel: - result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]); - break; - case nir_op_find_lsb: - result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]); - break; - case nir_op_ufind_msb: - result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32); - break; - case nir_op_ifind_msb: - result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32); - break; - case nir_op_uadd_carry: - result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]); - break; - case nir_op_usub_borrow: - result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); - break; - case nir_op_b2f16: - case nir_op_b2f32: - case nir_op_b2f64: - result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); - break; - case nir_op_f2b32: - result = emit_f2b(&ctx->ac, src[0]); - break; - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - case nir_op_b2i64: - result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); - break; - case nir_op_i2b32: - result = emit_i2b(&ctx->ac, src[0]); - break; - case nir_op_fquantize2f16: - result = emit_f2f16(&ctx->ac, src[0]); - break; - case nir_op_umul_high: - result = emit_umul_high(&ctx->ac, src[0], src[1]); - break; - case nir_op_imul_high: - result = emit_imul_high(&ctx->ac, src[0], src[1]); - break; - case nir_op_pack_half_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16); - break; - case nir_op_pack_snorm_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16); - break; - case nir_op_pack_unorm_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16); - break; - case nir_op_unpack_half_2x16: - result = emit_unpack_half_2x16(&ctx->ac, src[0]); - break; - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - result = emit_ddxy(ctx, instr->op, src[0]); - break; - - case nir_op_unpack_64_2x32_split_x: { - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i32, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_0, ""); - break; - } - - case nir_op_unpack_64_2x32_split_y: { - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i32, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_1, ""); - break; - } - - case nir_op_pack_64_2x32_split: { - LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); - result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, ""); - break; - } - - case nir_op_pack_32_2x16_split: { - LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); - result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - - case nir_op_unpack_32_2x16_split_x: { - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i16, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_0, ""); - break; - } - - case nir_op_unpack_32_2x16_split_y: { - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i16, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_1, ""); - break; - } - - case nir_op_cube_face_coord: { - src[0] = ac_to_float(&ctx->ac, src[0]); - LLVMValueRef results[2]; - LLVMValueRef in[3]; - for (unsigned chan = 0; chan < 3; chan++) - in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); - results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); - LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); - results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); - results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); - result = ac_build_gather_values(&ctx->ac, results, 2); - break; - } - - case nir_op_cube_face_index: { - src[0] = ac_to_float(&ctx->ac, src[0]); - LLVMValueRef in[3]; - for (unsigned chan = 0; chan < 3; chan++) - in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - break; - } - - case nir_op_fmin3: - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), result, src[2]); - break; - case nir_op_umin3: - result = ac_build_umin(&ctx->ac, src[0], src[1]); - result = ac_build_umin(&ctx->ac, result, src[2]); - break; - case nir_op_imin3: - result = ac_build_imin(&ctx->ac, src[0], src[1]); - result = ac_build_imin(&ctx->ac, result, src[2]); - break; - case nir_op_fmax3: - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), result, src[2]); - break; - case nir_op_umax3: - result = ac_build_umax(&ctx->ac, src[0], src[1]); - result = ac_build_umax(&ctx->ac, result, src[2]); - break; - case nir_op_imax3: - result = ac_build_imax(&ctx->ac, src[0], src[1]); - result = ac_build_imax(&ctx->ac, result, src[2]); - break; - case nir_op_fmed3: { - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - src[2] = ac_to_float(&ctx->ac, src[2]); - result = ac_build_fmed3(&ctx->ac, src[0], src[1], src[2], - instr->dest.dest.ssa.bit_size); - break; - } - case nir_op_imed3: { - LLVMValueRef tmp1 = ac_build_imin(&ctx->ac, src[0], src[1]); - LLVMValueRef tmp2 = ac_build_imax(&ctx->ac, src[0], src[1]); - tmp2 = ac_build_imin(&ctx->ac, tmp2, src[2]); - result = ac_build_imax(&ctx->ac, tmp1, tmp2); - break; - } - case nir_op_umed3: { - LLVMValueRef tmp1 = ac_build_umin(&ctx->ac, src[0], src[1]); - LLVMValueRef tmp2 = ac_build_umax(&ctx->ac, src[0], src[1]); - tmp2 = ac_build_umin(&ctx->ac, tmp2, src[2]); - result = ac_build_umax(&ctx->ac, tmp1, tmp2); - break; - } - - default: - fprintf(stderr, "Unknown NIR alu instr: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - abort(); - } - - if (result) { - assert(instr->dest.dest.is_ssa); - result = ac_to_integer_or_pointer(&ctx->ac, result); - ctx->ssa_defs[instr->dest.dest.ssa.index] = result; - } -} - -static void visit_load_const(struct ac_nir_context *ctx, - const nir_load_const_instr *instr) -{ - LLVMValueRef values[4], value = NULL; - LLVMTypeRef element_type = - LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); - - for (unsigned i = 0; i < instr->def.num_components; ++i) { - switch (instr->def.bit_size) { - case 8: - values[i] = LLVMConstInt(element_type, - instr->value[i].u8, false); - break; - case 16: - values[i] = LLVMConstInt(element_type, - instr->value[i].u16, false); - break; - case 32: - values[i] = LLVMConstInt(element_type, - instr->value[i].u32, false); - break; - case 64: - values[i] = LLVMConstInt(element_type, - instr->value[i].u64, false); - break; - default: - fprintf(stderr, - "unsupported nir load_const bit_size: %d\n", - instr->def.bit_size); - abort(); - } - } - if (instr->def.num_components > 1) { - value = LLVMConstVector(values, instr->def.num_components); - } else - value = values[0]; - - ctx->ssa_defs[instr->def.index] = value; -} - -static LLVMValueRef -get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements) -{ - LLVMValueRef size = - LLVMBuildExtractElement(ctx->ac.builder, descriptor, - LLVMConstInt(ctx->ac.i32, 2, false), ""); - - /* GFX8 only */ - if (ctx->ac.chip_class == GFX8 && in_elements) { - /* On GFX8, the descriptor contains the size in bytes, - * but TXQ must return the size in elements. - * The stride is always non-zero for resources using TXQ. - */ - LLVMValueRef stride = - LLVMBuildExtractElement(ctx->ac.builder, descriptor, - ctx->ac.i32_1, ""); - stride = LLVMBuildLShr(ctx->ac.builder, stride, - LLVMConstInt(ctx->ac.i32, 16, false), ""); - stride = LLVMBuildAnd(ctx->ac.builder, stride, - LLVMConstInt(ctx->ac.i32, 0x3fff, false), ""); - - size = LLVMBuildUDiv(ctx->ac.builder, size, stride, ""); - } - return size; -} - -static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, - nir_variable *var, - struct ac_image_args *args, - const nir_tex_instr *instr) -{ - const struct glsl_type *type = glsl_without_array(var->type); - enum glsl_base_type stype = glsl_get_sampler_result_type(type); - LLVMValueRef half_texel[2]; - LLVMValueRef compare_cube_wa = NULL; - LLVMValueRef result; - - //TODO Rect - { - struct ac_image_args txq_args = { 0 }; - - txq_args.dim = get_ac_sampler_dim(ctx, instr->sampler_dim, instr->is_array); - txq_args.opcode = ac_image_get_resinfo; - txq_args.dmask = 0xf; - txq_args.lod = ctx->i32_0; - txq_args.resource = args->resource; - txq_args.attributes = AC_FUNC_ATTR_READNONE; - LLVMValueRef size = ac_build_image_opcode(ctx, &txq_args); - - for (unsigned c = 0; c < 2; c++) { - half_texel[c] = LLVMBuildExtractElement(ctx->builder, size, - LLVMConstInt(ctx->i32, c, false), ""); - half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); - half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]); - half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c], - LLVMConstReal(ctx->f32, -0.5), ""); - } - } - - LLVMValueRef orig_coords[2] = { args->coords[0], args->coords[1] }; - - for (unsigned c = 0; c < 2; c++) { - LLVMValueRef tmp; - tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, ""); - args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); - } - - /* - * Apparantly cube has issue with integer types that the workaround doesn't solve, - * so this tests if the format is 8_8_8_8 and an integer type do an alternate - * workaround by sampling using a scaled type and converting. - * This is taken from amdgpu-pro shaders. - */ - /* NOTE this produces some ugly code compared to amdgpu-pro, - * LLVM ends up dumping SGPRs into VGPRs to deal with the compare/select, - * and then reads them back. -pro generates two selects, - * one s_cmp for the descriptor rewriting - * one v_cmp for the coordinate and result changes. - */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - LLVMValueRef tmp, tmp2; - - /* workaround 8/8/8/8 uint/sint cube gather bug */ - /* first detect it then change to a scaled read and f2i */ - tmp = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, ""); - tmp2 = tmp; - - /* extract the DATA_FORMAT */ - tmp = ac_build_bfe(ctx, tmp, LLVMConstInt(ctx->i32, 20, false), - LLVMConstInt(ctx->i32, 6, false), false); - - /* is the DATA_FORMAT == 8_8_8_8 */ - compare_cube_wa = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tmp, LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), ""); - - if (stype == GLSL_TYPE_UINT) - /* Create a NUM FORMAT - 0x2 or 0x4 - USCALED or UINT */ - tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0x8000000, false), - LLVMConstInt(ctx->i32, 0x10000000, false), ""); - else - /* Create a NUM FORMAT - 0x3 or 0x5 - SSCALED or SINT */ - tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0xc000000, false), - LLVMConstInt(ctx->i32, 0x14000000, false), ""); - - /* replace the NUM FORMAT in the descriptor */ - tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), ""); - tmp2 = LLVMBuildOr(ctx->builder, tmp2, tmp, ""); - - args->resource = LLVMBuildInsertElement(ctx->builder, args->resource, tmp2, ctx->i32_1, ""); - - /* don't modify the coordinates for this case */ - for (unsigned c = 0; c < 2; ++c) - args->coords[c] = LLVMBuildSelect( - ctx->builder, compare_cube_wa, - orig_coords[c], args->coords[c], ""); - } - - args->attributes = AC_FUNC_ATTR_READNONE; - result = ac_build_image_opcode(ctx, args); - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - LLVMValueRef tmp, tmp2; - - /* if the cube workaround is in place, f2i the result. */ - for (unsigned c = 0; c < 4; c++) { - tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), ""); - if (stype == GLSL_TYPE_UINT) - tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, ""); - else - tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, ""); - tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); - tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, ""); - tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, tmp2, tmp, ""); - tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); - result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), ""); - } - } - return result; -} - -static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr) -{ - nir_deref_instr *texture_deref_instr = NULL; - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - texture_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - default: - break; - } - } - return texture_deref_instr; -} - -static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, - const nir_tex_instr *instr, - struct ac_image_args *args) -{ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); - - if (ctx->abi->gfx9_stride_size_workaround) { - return ac_build_buffer_load_format_gfx9_safe(&ctx->ac, - args->resource, - args->coords[0], - ctx->ac.i32_0, - util_last_bit(mask), - 0, true); - } else { - return ac_build_buffer_load_format(&ctx->ac, - args->resource, - args->coords[0], - ctx->ac.i32_0, - util_last_bit(mask), - 0, true); - } - } - - args->opcode = ac_image_sample; - - switch (instr->op) { - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_samples_identical: - args->opcode = args->level_zero || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? - ac_image_load : ac_image_load_mip; - args->level_zero = false; - break; - case nir_texop_txs: - case nir_texop_query_levels: - args->opcode = ac_image_get_resinfo; - if (!args->lod) - args->lod = ctx->ac.i32_0; - args->level_zero = false; - break; - case nir_texop_tex: - if (ctx->stage != MESA_SHADER_FRAGMENT) { - assert(!args->lod); - args->level_zero = true; - } - break; - case nir_texop_tg4: - args->opcode = ac_image_gather4; - args->level_zero = true; - break; - case nir_texop_lod: - args->opcode = ac_image_get_lod; - break; - default: - break; - } - - if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) { - nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr); - nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr); - const struct glsl_type *type = glsl_without_array(var->type); - enum glsl_base_type stype = glsl_get_sampler_result_type(type); - if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) { - return lower_gather4_integer(&ctx->ac, var, args, instr); - } - } - - /* Fixup for GFX9 which allocates 1D textures as 2D. */ - if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) { - if ((args->dim == ac_image_2darray || - args->dim == ac_image_2d) && !args->coords[1]) { - args->coords[1] = ctx->ac.i32_0; - } - } - - args->attributes = AC_FUNC_ATTR_READNONE; - bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE && - ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE; - if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) { - /* Prevent texture instructions with implicit derivatives from being - * sinked into branches. */ - switch (instr->op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_lod: - args->attributes |= AC_FUNC_ATTR_CONVERGENT; - break; - default: - break; - } - } - - return ac_build_image_opcode(&ctx->ac, args); -} - -static LLVMValueRef visit_vulkan_resource_reindex(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - LLVMValueRef ptr = get_src(ctx, instr->src[0]); - LLVMValueRef index = get_src(ctx, instr->src[1]); - - LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); - LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md); - return result; -} - -static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - LLVMValueRef ptr, addr; - LLVMValueRef src0 = get_src(ctx, instr->src[0]); - unsigned index = nir_intrinsic_base(instr); - - addr = LLVMConstInt(ctx->ac.i32, index, 0); - addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, ""); - - /* Load constant values from user SGPRS when possible, otherwise - * fallback to the default path that loads directly from memory. - */ - if (LLVMIsConstant(src0) && - instr->dest.ssa.bit_size == 32) { - unsigned count = instr->dest.ssa.num_components; - unsigned offset = index; - - offset += LLVMConstIntGetZExtValue(src0); - offset /= 4; - - offset -= ctx->abi->base_inline_push_consts; - - if (offset + count <= ctx->abi->num_inline_push_consts) { - return ac_build_gather_values(&ctx->ac, - ctx->abi->inline_push_consts + offset, - count); - } - } - - ptr = LLVMBuildGEP(ctx->ac.builder, ctx->abi->push_constants, &addr, 1, ""); - - if (instr->dest.ssa.bit_size == 8) { - unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; - LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords); - ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); - LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - - LLVMValueRef params[3]; - if (load_dwords > 1) { - LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), ""); - params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), ""); - params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), ""); - } else { - res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); - params[0] = ctx->ac.i32_0; - params[1] = res; - } - params[2] = addr; - res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); - - res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); - if (instr->dest.ssa.num_components > 1) - res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), ""); - return res; - } else if (instr->dest.ssa.bit_size == 16) { - unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; - LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords); - ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); - LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, ""); - LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, ""); - cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); - LLVMValueRef mask[] = { LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), - LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), - LLVMConstInt(ctx->ac.i32, 4, false)}; - LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components); - LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components); - LLVMValueRef shuffle_aligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, ""); - LLVMValueRef shuffle_unaligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, ""); - res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, ""); - return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), ""); - } - - ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa)); - - return LLVMBuildLoad(ctx->ac.builder, ptr, ""); -} - -static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMValueRef index = get_src(ctx, instr->src[0]); - - return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); -} - -static uint32_t widen_mask(uint32_t mask, unsigned multiplier) -{ - uint32_t new_mask = 0; - for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) - if (mask & (1u << i)) - new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); - return new_mask; -} - -static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src, - unsigned start, unsigned count) -{ - LLVMValueRef mask[] = { - ctx->i32_0, ctx->i32_1, - LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false) }; - - unsigned src_elements = ac_get_llvm_num_components(src); - - if (count == src_elements) { - assert(start == 0); - return src; - } else if (count == 1) { - assert(start < src_elements); - return LLVMBuildExtractElement(ctx->builder, src, mask[start], ""); - } else { - assert(start + count <= src_elements); - assert(count <= 4); - LLVMValueRef swizzle = LLVMConstVector(&mask[start], count); - return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); - } -} - -static unsigned get_cache_policy(struct ac_nir_context *ctx, - enum gl_access_qualifier access, - bool may_store_unaligned, - bool writeonly_memory) -{ - unsigned cache_policy = 0; - - /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All - * store opcodes not aligned to a dword are affected. The only way to - * get unaligned stores is through shader images. - */ - if (((may_store_unaligned && ctx->ac.chip_class == GFX6) || - /* If this is write-only, don't keep data in L1 to prevent - * evicting L1 cache lines that may be needed by other - * instructions. - */ - writeonly_memory || - access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { - cache_policy |= ac_glc; - } - - if (access & ACCESS_STREAM_CACHE_POLICY) - cache_policy |= ac_slc; - - return cache_policy; -} - -static void visit_store_ssbo(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - LLVMValueRef src_data = get_src(ctx, instr->src[0]); - int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; - unsigned writemask = nir_intrinsic_write_mask(instr); - enum gl_access_qualifier access = nir_intrinsic_access(instr); - bool writeonly_memory = access & ACCESS_NON_READABLE; - unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); - - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[1]), true); - LLVMValueRef base_data = src_data; - base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); - LLVMValueRef base_offset = get_src(ctx, instr->src[2]); - - while (writemask) { - int start, count; - LLVMValueRef data, offset; - LLVMTypeRef data_type; - - u_bit_scan_consecutive_range(&writemask, &start, &count); - - /* Due to an LLVM limitation with LLVM < 9, split 3-element - * writes into a 2-element and a 1-element write. */ - if (count == 3 && - (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) { - writemask |= 1 << (start + 2); - count = 2; - } - int num_bytes = count * elem_size_bytes; /* count in bytes */ - - /* we can only store 4 DWords at the same time. - * can only happen for 64 Bit vectors. */ - if (num_bytes > 16) { - writemask |= ((1u << (count - 2)) - 1u) << (start + 2); - count = 2; - num_bytes = 16; - } - - /* check alignment of 16 Bit stores */ - if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) { - writemask |= ((1u << (count - 1)) - 1u) << (start + 1); - count = 1; - num_bytes = 2; - } - data = extract_vector_range(&ctx->ac, base_data, start, count); - - offset = LLVMBuildAdd(ctx->ac.builder, base_offset, - LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); - - if (num_bytes == 1) { - ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, - offset, ctx->ac.i32_0, - cache_policy); - } else if (num_bytes == 2) { - ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, - offset, ctx->ac.i32_0, - cache_policy); - } else { - int num_channels = num_bytes / 4; - - switch (num_bytes) { - case 16: /* v4f32 */ - data_type = ctx->ac.v4f32; - break; - case 12: /* v3f32 */ - data_type = ctx->ac.v3f32; - break; - case 8: /* v2f32 */ - data_type = ctx->ac.v2f32; - break; - case 4: /* f32 */ - data_type = ctx->ac.f32; - break; - default: - unreachable("Malformed vector store."); - } - data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); - - ac_build_buffer_store_dword(&ctx->ac, rsrc, data, - num_channels, offset, - ctx->ac.i32_0, 0, - cache_policy, false); - } - } -} - -static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, - LLVMValueRef descriptor, - LLVMValueRef offset, - LLVMValueRef compare, - LLVMValueRef exchange) -{ - LLVMBasicBlockRef start_block = NULL, then_block = NULL; - if (ctx->abi->robust_buffer_access) { - LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2); - - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); - start_block = LLVMGetInsertBlock(ctx->ac.builder); - - ac_build_ifcc(&ctx->ac, cond, -1); - - then_block = LLVMGetInsertBlock(ctx->ac.builder); - } - - LLVMValueRef ptr_parts[2] = { - ac_llvm_extract_elem(&ctx->ac, descriptor, 0), - LLVMBuildAnd(ctx->ac.builder, - ac_llvm_extract_elem(&ctx->ac, descriptor, 1), - LLVMConstInt(ctx->ac.i32, 65535, 0), "") - }; - - ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, ""); - ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, ""); - - offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, ""); - - LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, ""); - ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL), ""); - - LLVMValueRef result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as"); - result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); - - if (ctx->abi->robust_buffer_access) { - ac_build_endif(&ctx->ac, -1); - - LLVMBasicBlockRef incoming_blocks[2] = { - start_block, - then_block, - }; - - LLVMValueRef incoming_values[2] = { - LLVMConstInt(ctx->ac.i64, 0, 0), - result, - }; - LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, ""); - LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2); - return ret; - } else { - return result; - } -} - -static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); - const char *op; - char name[64], type[8]; - LLVMValueRef params[6], descriptor; - int arg_count = 0; - - switch (instr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - op = "add"; - break; - case nir_intrinsic_ssbo_atomic_imin: - op = "smin"; - break; - case nir_intrinsic_ssbo_atomic_umin: - op = "umin"; - break; - case nir_intrinsic_ssbo_atomic_imax: - op = "smax"; - break; - case nir_intrinsic_ssbo_atomic_umax: - op = "umax"; - break; - case nir_intrinsic_ssbo_atomic_and: - op = "and"; - break; - case nir_intrinsic_ssbo_atomic_or: - op = "or"; - break; - case nir_intrinsic_ssbo_atomic_xor: - op = "xor"; - break; - case nir_intrinsic_ssbo_atomic_exchange: - op = "swap"; - break; - case nir_intrinsic_ssbo_atomic_comp_swap: - op = "cmpswap"; - break; - default: - abort(); - } - - descriptor = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), - true); - - if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && - return_type == ctx->ac.i64) { - return emit_ssbo_comp_swap_64(ctx, descriptor, - get_src(ctx, instr->src[1]), - get_src(ctx, instr->src[2]), - get_src(ctx, instr->src[3])); - } - if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); - } - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - params[arg_count++] = descriptor; - - if (HAVE_LLVM >= 0x900) { - /* XXX: The new raw/struct atomic intrinsics are buggy with - * LLVM 8, see r358579. - */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i32_0; /* soffset */ - params[arg_count++] = ctx->ac.i32_0; /* slc */ - - ac_build_type_name_for_intr(return_type, type, sizeof(type)); - snprintf(name, sizeof(name), - "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); - } else { - params[arg_count++] = ctx->ac.i32_0; /* vindex */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i1false; /* slc */ - - assert(return_type == ctx->ac.i32); - snprintf(name, sizeof(name), - "llvm.amdgcn.buffer.atomic.%s", op); - } - - return ac_build_intrinsic(&ctx->ac, name, return_type, params, - arg_count, 0); -} - -static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - int elem_size_bytes = instr->dest.ssa.bit_size / 8; - int num_components = instr->num_components; - enum gl_access_qualifier access = nir_intrinsic_access(instr); - unsigned cache_policy = get_cache_policy(ctx, access, false, false); - - LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), false); - LLVMValueRef vindex = ctx->ac.i32_0; - - LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); - LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; - - LLVMValueRef results[4]; - for (int i = 0; i < num_components;) { - int num_elems = num_components - i; - if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) - num_elems = 1; - if (num_elems * elem_size_bytes > 16) - num_elems = 16 / elem_size_bytes; - int load_bytes = num_elems * elem_size_bytes; - - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); - - LLVMValueRef ret; - - if (load_bytes == 1) { - ret = ac_build_tbuffer_load_byte(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - cache_policy); - } else if (load_bytes == 2) { - ret = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - cache_policy); - } else { - int num_channels = util_next_power_of_two(load_bytes) / 4; - bool can_speculate = access & ACCESS_CAN_REORDER; - - ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, - vindex, offset, immoffset, 0, - cache_policy, can_speculate, false); - } - - LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); - ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); - ret = ac_trim_vector(&ctx->ac, ret, load_bytes); - - LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems); - ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); - - for (unsigned j = 0; j < num_elems; j++) { - results[i + j] = LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); - } - i += num_elems; - } - - return ac_build_gather_values(&ctx->ac, results, num_components); -} - -static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMValueRef ret; - LLVMValueRef rsrc = get_src(ctx, instr->src[0]); - LLVMValueRef offset = get_src(ctx, instr->src[1]); - int num_components = instr->num_components; - - if (ctx->abi->load_ubo) - rsrc = ctx->abi->load_ubo(ctx->abi, rsrc); - - if (instr->dest.ssa.bit_size == 64) - num_components *= 2; - - if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { - unsigned load_bytes = instr->dest.ssa.bit_size / 8; - LLVMValueRef results[num_components]; - for (unsigned i = 0; i < num_components; ++i) { - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, - load_bytes * i, 0); - - if (load_bytes == 1) { - results[i] = ac_build_tbuffer_load_byte(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - 0); - } else { - assert(load_bytes == 2); - results[i] = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - 0); - } - } - ret = ac_build_gather_values(&ctx->ac, results, num_components); - } else { - ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, - NULL, 0, 0, true, true); - - ret = ac_trim_vector(&ctx->ac, ret, num_components); - } - - return LLVMBuildBitCast(ctx->ac.builder, ret, - get_def_type(ctx, &instr->dest.ssa), ""); -} - -static void -get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, - bool vs_in, unsigned *vertex_index_out, - LLVMValueRef *vertex_index_ref, - unsigned *const_out, LLVMValueRef *indir_out) -{ - nir_variable *var = nir_deref_instr_get_variable(instr); - nir_deref_path path; - unsigned idx_lvl = 1; - - nir_deref_path_init(&path, instr, NULL); - - if (vertex_index_out != NULL || vertex_index_ref != NULL) { - if (vertex_index_ref) { - *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index); - if (vertex_index_out) - *vertex_index_out = 0; - } else { - *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index); - } - ++idx_lvl; - } - - uint32_t const_offset = 0; - LLVMValueRef offset = NULL; - - if (var->data.compact) { - assert(instr->deref_type == nir_deref_type_array); - const_offset = nir_src_as_uint(instr->arr.index); - goto out; - } - - for (; path.path[idx_lvl]; ++idx_lvl) { - const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type; - if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) { - unsigned index = path.path[idx_lvl]->strct.index; - - for (unsigned i = 0; i < index; i++) { - const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); - const_offset += glsl_count_attribute_slots(ft, vs_in); - } - } else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) { - unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in); - LLVMValueRef array_off = LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0), - get_src(ctx, path.path[idx_lvl]->arr.index), ""); - if (offset) - offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, ""); - else - offset = array_off; - } else - unreachable("Uhandled deref type in get_deref_instr_offset"); - } - -out: - nir_deref_path_finish(&path); - - if (const_offset && offset) - offset = LLVMBuildAdd(ctx->ac.builder, offset, - LLVMConstInt(ctx->ac.i32, const_offset, 0), - ""); - - *const_out = const_offset; - *indir_out = offset; -} - -static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr, - bool load_inputs) -{ - LLVMValueRef result; - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; - - nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - - unsigned location = var->data.location; - unsigned driver_location = var->data.driver_location; - const bool is_patch = var->data.patch; - const bool is_compact = var->data.compact; - - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - false, NULL, is_patch ? NULL : &vertex_index, - &const_index, &indir_index); - - LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); - - LLVMTypeRef src_component_type; - if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) - src_component_type = LLVMGetElementType(dest_type); - else - src_component_type = dest_type; - - result = ctx->abi->load_tess_varyings(ctx->abi, src_component_type, - vertex_index, indir_index, - const_index, location, driver_location, - var->data.location_frac, - instr->num_components, - is_patch, is_compact, load_inputs); - if (instr->dest.ssa.bit_size == 16) { - result = ac_to_integer(&ctx->ac, result); - result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); - } - return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); -} - -static unsigned -type_scalar_size_bytes(const struct glsl_type *type) -{ - assert(glsl_type_is_vector_or_scalar(type) || - glsl_type_is_matrix(type)); - return glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; -} - -static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - - LLVMValueRef values[8]; - int idx = 0; - int ve = instr->dest.ssa.num_components; - unsigned comp = 0; - LLVMValueRef indir_index; - LLVMValueRef ret; - unsigned const_index; - unsigned stride = 4; - int mode = deref->mode; - - if (var) { - bool vs_in = ctx->stage == MESA_SHADER_VERTEX && - var->data.mode == nir_var_shader_in; - idx = var->data.driver_location; - comp = var->data.location_frac; - mode = var->data.mode; - - get_deref_offset(ctx, deref, vs_in, NULL, NULL, - &const_index, &indir_index); - - if (var->data.compact) { - stride = 1; - const_index += comp; - comp = 0; - } - } - - if (instr->dest.ssa.bit_size == 64 && - (deref->mode == nir_var_shader_in || - deref->mode == nir_var_shader_out || - deref->mode == nir_var_function_temp)) - ve *= 2; - - switch (mode) { - case nir_var_shader_in: - if (ctx->stage == MESA_SHADER_TESS_CTRL || - ctx->stage == MESA_SHADER_TESS_EVAL) { - return load_tess_varyings(ctx, instr, true); - } - - if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); - LLVMValueRef indir_index; - unsigned const_index, vertex_index; - get_deref_offset(ctx, deref, false, &vertex_index, NULL, - &const_index, &indir_index); - - return ctx->abi->load_inputs(ctx->abi, var->data.location, - var->data.driver_location, - var->data.location_frac, - instr->num_components, vertex_index, const_index, type); - } - - for (unsigned chan = comp; chan < ve + comp; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, - ctx->stage == MESA_SHADER_VERTEX); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->inputs + idx + chan, count, - stride, false, true); - - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else - values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; - } - break; - case nir_var_function_temp: - for (unsigned chan = 0; chan < ve; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->locals + idx + chan, count, - stride, true, true); - - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else { - values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); - } - } - break; - case nir_var_mem_shared: { - LLVMValueRef address = get_src(ctx, instr->src[0]); - LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); - return LLVMBuildBitCast(ctx->ac.builder, val, - get_def_type(ctx, &instr->dest.ssa), - ""); - } - case nir_var_shader_out: - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - return load_tess_varyings(ctx, instr, false); - } - - if (ctx->stage == MESA_SHADER_FRAGMENT && - var->data.fb_fetch_output && - ctx->abi->emit_fbfetch) - return ctx->abi->emit_fbfetch(ctx->abi); - - for (unsigned chan = comp; chan < ve + comp; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->outputs + idx + chan, count, - stride, true, true); - - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else { - values[chan] = LLVMBuildLoad(ctx->ac.builder, - ctx->abi->outputs[idx + chan + const_index * stride], - ""); - } - } - break; - case nir_var_mem_global: { - LLVMValueRef address = get_src(ctx, instr->src[0]); - unsigned explicit_stride = glsl_get_explicit_stride(deref->type); - unsigned natural_stride = type_scalar_size_bytes(deref->type); - unsigned stride = explicit_stride ? explicit_stride : natural_stride; - - LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); - if (stride != natural_stride) { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(result_type), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) { - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0); - values[i] = LLVMBuildLoad(ctx->ac.builder, - ac_build_gep_ptr(&ctx->ac, address, offset), ""); - } - return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components); - } else { - LLVMTypeRef ptr_type = LLVMPointerType(result_type, - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); - return val; - } - } - default: - unreachable("unhandle variable mode"); - } - ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); -} - -static void -visit_store_var(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - - LLVMValueRef temp_ptr, value; - int idx = 0; - unsigned comp = 0; - LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); - int writemask = instr->const_index[0]; - LLVMValueRef indir_index; - unsigned const_index; - - if (var) { - get_deref_offset(ctx, deref, false, - NULL, NULL, &const_index, &indir_index); - idx = var->data.driver_location; - comp = var->data.location_frac; - - if (var->data.compact) { - const_index += comp; - comp = 0; - } - } - - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 && - (deref->mode == nir_var_shader_out || - deref->mode == nir_var_function_temp)) { - - src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), - ""); - - writemask = widen_mask(writemask, 2); - } - - writemask = writemask << comp; - - switch (deref->mode) { - case nir_var_shader_out: - - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - LLVMValueRef vertex_index = NULL; - LLVMValueRef indir_index = NULL; - unsigned const_index = 0; - const bool is_patch = var->data.patch; - - get_deref_offset(ctx, deref, false, NULL, - is_patch ? NULL : &vertex_index, - &const_index, &indir_index); - - ctx->abi->store_tcs_outputs(ctx->abi, var, - vertex_index, indir_index, - const_index, src, writemask); - return; - } - - for (unsigned chan = 0; chan < 8; chan++) { - int stride = 4; - if (!(writemask & (1 << chan))) - continue; - - value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); - - if (var->data.compact) - stride = 1; - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->outputs + idx + chan, count, - stride, true, true); - - tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, - value, indir_index, ""); - build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, - count, stride, tmp_vec); - - } else { - temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride]; - - LLVMBuildStore(ctx->ac.builder, value, temp_ptr); - } - } - break; - case nir_var_function_temp: - for (unsigned chan = 0; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - - value = ac_llvm_extract_elem(&ctx->ac, src, chan); - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->locals + idx + chan, count, - 4, true, true); - - tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, - value, indir_index, ""); - build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, - count, 4, tmp_vec); - } else { - temp_ptr = ctx->locals[idx + chan + const_index * 4]; - - LLVMBuildStore(ctx->ac.builder, value, temp_ptr); - } - } - break; - - case nir_var_mem_global: - case nir_var_mem_shared: { - int writemask = instr->const_index[0]; - LLVMValueRef address = get_src(ctx, instr->src[0]); - LLVMValueRef val = get_src(ctx, instr->src[1]); - - unsigned explicit_stride = glsl_get_explicit_stride(deref->type); - unsigned natural_stride = type_scalar_size_bytes(deref->type); - unsigned stride = explicit_stride ? explicit_stride : natural_stride; - - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && - stride == natural_stride) { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - val = LLVMBuildBitCast(ctx->ac.builder, val, - LLVMGetElementType(LLVMTypeOf(address)), ""); - LLVMBuildStore(ctx->ac.builder, val, address); - } else { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(LLVMTypeOf(val)), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - for (unsigned chan = 0; chan < 4; chan++) { - if (!(writemask & (1 << chan))) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0); - - LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset); - LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, - chan); - src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMGetElementType(LLVMTypeOf(ptr)), ""); - LLVMBuildStore(ctx->ac.builder, src, ptr); - } - } - break; - } - default: - abort(); - break; - } -} - -static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) -{ - switch (dim) { - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_1D: - return array ? 2 : 1; - case GLSL_SAMPLER_DIM_2D: - return array ? 3 : 2; - case GLSL_SAMPLER_DIM_MS: - return array ? 4 : 3; - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_SUBPASS: - return 2; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return 3; - default: - break; - } - return 0; -} - -static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, - LLVMValueRef coord_x, LLVMValueRef coord_y, - LLVMValueRef coord_z, - LLVMValueRef sample_index, - LLVMValueRef fmask_desc_ptr) -{ - unsigned sample_chan = coord_z ? 3 : 2; - LLVMValueRef addr[4] = {coord_x, coord_y, coord_z}; - addr[sample_chan] = sample_index; - - ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL); - return addr[sample_chan]; -} - -static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr) -{ - assert(instr->src[0].is_ssa); - return nir_instr_as_deref(instr->src[0].ssa->parent_instr); -} - -static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - enum ac_descriptor_type desc_type, - bool write) -{ - nir_deref_instr *deref_instr = - instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ? - nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL; - - return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write); -} - -static void get_image_coords(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - struct ac_image_args *args, - enum glsl_sampler_dim dim, - bool is_array) -{ - LLVMValueRef src0 = get_src(ctx, instr->src[1]); - LLVMValueRef masks[] = { - LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), - LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), - }; - LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - - int count; - ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || - dim == GLSL_SAMPLER_DIM_SUBPASS_MS); - bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || - dim == GLSL_SAMPLER_DIM_SUBPASS_MS); - bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; - assert(!add_frag_pos && "Input attachments should be lowered by this point."); - count = image_type_to_components_count(dim, is_array); - - if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || - instr->intrinsic == nir_intrinsic_bindless_image_load)) { - LLVMValueRef fmask_load_address[3]; - - fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); - if (is_array) - fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); - else - fmask_load_address[2] = NULL; - - sample_index = adjust_sample_index_using_fmask(&ctx->ac, - fmask_load_address[0], - fmask_load_address[1], - fmask_load_address[2], - sample_index, - get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - AC_DESC_FMASK, &instr->instr, true, false)); - } - if (count == 1 && !gfx9_1d) { - if (instr->src[1].ssa->num_components) - args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - else - args->coords[0] = src0; - } else { - int chan; - if (is_ms) - count--; - for (chan = 0; chan < count; ++chan) { - args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); - } - - if (gfx9_1d) { - if (is_array) { - args->coords[2] = args->coords[1]; - args->coords[1] = ctx->ac.i32_0; - } else - args->coords[1] = ctx->ac.i32_0; - count++; - } - - if (is_ms) { - args->coords[count] = sample_index; - count++; - } - } -} - -static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool write, bool atomic) -{ - LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write); - if (ctx->abi->gfx9_stride_size_workaround || - (ctx->abi->gfx9_stride_size_workaround_for_atomic && atomic)) { - LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder, - LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""), - elem_count, stride, ""); - - rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count, - LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - return rsrc; -} - -static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) -{ - LLVMValueRef res; - - enum glsl_sampler_dim dim; - enum gl_access_qualifier access; - bool is_array; - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - access = nir_intrinsic_access(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const nir_deref_instr *image_deref = get_image_deref(instr); - const struct glsl_type *type = image_deref->type; - const nir_variable *var = nir_deref_instr_get_variable(image_deref); - dim = glsl_get_sampler_dim(type); - access = var->data.image.access; - is_array = glsl_sampler_type_is_array(type); - } - - struct ac_image_args args = {}; - - args.cache_policy = get_cache_policy(ctx, access, false, false); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); - unsigned num_channels = util_last_bit(mask); - LLVMValueRef rsrc, vindex; - - rsrc = get_image_buffer_descriptor(ctx, instr, false, false); - vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); - - bool can_speculate = access & ACCESS_CAN_REORDER; - res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, - ctx->ac.i32_0, num_channels, - args.cache_policy, - can_speculate); - res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); - - res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); - res = ac_to_integer(&ctx->ac, res); - } else { - args.opcode = ac_image_load; - get_image_coords(ctx, instr, &args, dim, is_array); - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); - args.dim = get_ac_image_dim(&ctx->ac, dim, is_array); - args.dmask = 15; - args.attributes = AC_FUNC_ATTR_READONLY; - - res = ac_build_image_opcode(&ctx->ac, &args); - } - return res; -} - -static void visit_image_store(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr, - bool bindless) -{ - - - enum glsl_sampler_dim dim; - enum gl_access_qualifier access; - bool is_array; - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - access = nir_intrinsic_access(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const nir_deref_instr *image_deref = get_image_deref(instr); - const struct glsl_type *type = image_deref->type; - const nir_variable *var = nir_deref_instr_get_variable(image_deref); - dim = glsl_get_sampler_dim(type); - access = var->data.image.access; - is_array = glsl_sampler_type_is_array(type); - } - - bool writeonly_memory = access & ACCESS_NON_READABLE; - struct ac_image_args args = {}; - - args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false); - LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); - unsigned src_channels = ac_get_llvm_num_components(src); - LLVMValueRef vindex; - - if (src_channels == 3) - src = ac_build_expand_to_vec4(&ctx->ac, src, 3); - - vindex = LLVMBuildExtractElement(ctx->ac.builder, - get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); - - ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, - ctx->ac.i32_0, src_channels, - args.cache_policy); - } else { - args.opcode = ac_image_store; - args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); - get_image_coords(ctx, instr, &args, dim, is_array); - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); - args.dim = get_ac_image_dim(&ctx->ac, dim, is_array); - args.dmask = 15; - - ac_build_image_opcode(&ctx->ac, &args); - } - -} - -static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) -{ - LLVMValueRef params[7]; - int param_count = 0; - - bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; - const char *atomic_name; - char intrinsic_name[64]; - enum ac_atomic_op atomic_subop; - ASSERTED int length; - - enum glsl_sampler_dim dim; - bool is_unsigned = false; - bool is_array; - if (bindless) { - if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_min || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_max) { - const GLenum format = nir_intrinsic_format(instr); - assert(format == GL_R32UI || format == GL_R32I); - is_unsigned = format == GL_R32UI; - } - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const struct glsl_type *type = get_image_deref(instr)->type; - is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT; - dim = glsl_get_sampler_dim(type); - is_array = glsl_sampler_type_is_array(type); - } - - switch (instr->intrinsic) { - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_image_deref_atomic_add: - atomic_name = "add"; - atomic_subop = ac_atomic_add; - break; - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_image_deref_atomic_min: - atomic_name = is_unsigned ? "umin" : "smin"; - atomic_subop = is_unsigned ? ac_atomic_umin : ac_atomic_smin; - break; - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_image_deref_atomic_max: - atomic_name = is_unsigned ? "umax" : "smax"; - atomic_subop = is_unsigned ? ac_atomic_umax : ac_atomic_smax; - break; - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_image_deref_atomic_and: - atomic_name = "and"; - atomic_subop = ac_atomic_and; - break; - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_image_deref_atomic_or: - atomic_name = "or"; - atomic_subop = ac_atomic_or; - break; - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_image_deref_atomic_xor: - atomic_name = "xor"; - atomic_subop = ac_atomic_xor; - break; - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_image_deref_atomic_exchange: - atomic_name = "swap"; - atomic_subop = ac_atomic_swap; - break; - case nir_intrinsic_bindless_image_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_comp_swap: - atomic_name = "cmpswap"; - atomic_subop = 0; /* not used */ - break; - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_inc_wrap: { - atomic_name = "inc"; - atomic_subop = ac_atomic_inc_wrap; - /* ATOMIC_INC instruction does: - * value = (value + 1) % (data + 1) - * but we want: - * value = (value + 1) % data - * So replace 'data' by 'data - 1'. - */ - ctx->ssa_defs[instr->src[3].ssa->index] = - LLVMBuildSub(ctx->ac.builder, - ctx->ssa_defs[instr->src[3].ssa->index], - ctx->ac.i32_1, ""); - break; - } - case nir_intrinsic_bindless_image_atomic_dec_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - atomic_name = "dec"; - atomic_subop = ac_atomic_dec_wrap; - break; - default: - abort(); - } - - if (cmpswap) - params[param_count++] = get_src(ctx, instr->src[4]); - params[param_count++] = get_src(ctx, instr->src[3]); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true); - params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); /* vindex */ - params[param_count++] = ctx->ac.i32_0; /* voffset */ - if (HAVE_LLVM >= 0x900) { - /* XXX: The new raw/struct atomic intrinsics are buggy - * with LLVM 8, see r358579. - */ - params[param_count++] = ctx->ac.i32_0; /* soffset */ - params[param_count++] = ctx->ac.i32_0; /* slc */ - - length = snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name); - } else { - params[param_count++] = ctx->ac.i1false; /* slc */ - - length = snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.buffer.atomic.%s", atomic_name); - } - - assert(length < sizeof(intrinsic_name)); - return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, - params, param_count, 0); - } else { - struct ac_image_args args = {}; - args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic; - args.atomic = atomic_subop; - args.data[0] = params[0]; - if (cmpswap) - args.data[1] = params[1]; - get_image_coords(ctx, instr, &args, dim, is_array); - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true); - args.dim = get_ac_image_dim(&ctx->ac, dim, is_array); - - return ac_build_image_opcode(&ctx->ac, &args); - } -} - -static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) -{ - enum glsl_sampler_dim dim; - bool is_array; - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const struct glsl_type *type = get_image_deref(instr)->type; - dim = glsl_get_sampler_dim(type); - is_array = glsl_sampler_type_is_array(type); - } - - struct ac_image_args args = { 0 }; - args.dim = get_ac_sampler_dim(&ctx->ac, dim, is_array); - args.dmask = 0xf; - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); - args.opcode = ac_image_get_resinfo; - args.lod = ctx->ac.i32_0; - args.attributes = AC_FUNC_ATTR_READNONE; - - return ac_build_image_opcode(&ctx->ac, &args); -} - -static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) -{ - LLVMValueRef res; - - enum glsl_sampler_dim dim; - bool is_array; - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const struct glsl_type *type = get_image_deref(instr)->type; - dim = glsl_get_sampler_dim(type); - is_array = glsl_sampler_type_is_array(type); - } - - if (dim == GLSL_SAMPLER_DIM_BUF) - return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true); - - struct ac_image_args args = { 0 }; - - args.dim = get_ac_image_dim(&ctx->ac, dim, is_array); - args.dmask = 0xf; - args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false); - args.opcode = ac_image_get_resinfo; - args.lod = ctx->ac.i32_0; - args.attributes = AC_FUNC_ATTR_READNONE; - - res = ac_build_image_opcode(&ctx->ac, &args); - - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - - if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { - LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); - LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); - } - if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { - LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, - ctx->ac.i32_1, ""); - - } - return res; -} - -static void emit_membar(struct ac_llvm_context *ac, - const nir_intrinsic_instr *instr) -{ - unsigned wait_flags = 0; - - switch (instr->intrinsic) { - case nir_intrinsic_memory_barrier: - case nir_intrinsic_group_memory_barrier: - wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; - break; - case nir_intrinsic_memory_barrier_atomic_counter: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE; - break; - case nir_intrinsic_memory_barrier_shared: - wait_flags = AC_WAIT_LGKM; - break; - default: - break; - } - - ac_build_waitcnt(ac, wait_flags); -} - -void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) -{ - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) { - ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - ac_build_s_barrier(ac); -} - -static void emit_discard(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMValueRef cond; - - if (instr->intrinsic == nir_intrinsic_discard_if) { - cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - get_src(ctx, instr->src[0]), - ctx->ac.i32_0, ""); - } else { - assert(instr->intrinsic == nir_intrinsic_discard); - cond = ctx->ac.i1false; - } - - ctx->abi->emit_kill(ctx->abi, cond); -} - -static LLVMValueRef -visit_load_local_invocation_index(struct ac_nir_context *ctx) -{ - LLVMValueRef result; - LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac); - result = LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, - LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); - - return LLVMBuildAdd(ctx->ac.builder, result, thread_id, ""); -} - -static LLVMValueRef -visit_load_subgroup_id(struct ac_nir_context *ctx) -{ - if (ctx->stage == MESA_SHADER_COMPUTE) { - LLVMValueRef result; - result = LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, - LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); - return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); - } else { - return LLVMConstInt(ctx->ac.i32, 0, false); - } -} - -static LLVMValueRef -visit_load_num_subgroups(struct ac_nir_context *ctx) -{ - if (ctx->stage == MESA_SHADER_COMPUTE) { - return LLVMBuildAnd(ctx->ac.builder, ctx->abi->tg_size, - LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); - } else { - return LLVMConstInt(ctx->ac.i32, 1, false); - } -} - -static LLVMValueRef -visit_first_invocation(struct ac_nir_context *ctx) -{ - LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1); - const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64"; - - /* The second argument is whether cttz(0) should be defined, but we do not care. */ - LLVMValueRef args[] = {active_set, ctx->ac.i1false}; - LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, - ctx->ac.iN_wavemask, args, 2, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE); - - return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, ""); -} - -static LLVMValueRef -visit_load_shared(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMValueRef values[4], derived_ptr, index, ret; - - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0]); - - for (int chan = 0; chan < instr->num_components; chan++) { - index = LLVMConstInt(ctx->ac.i32, chan, 0); - derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); - values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); - } - - ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); -} - -static void -visit_store_shared(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) -{ - LLVMValueRef derived_ptr, data,index; - LLVMBuilderRef builder = ctx->ac.builder; - - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1]); - LLVMValueRef src = get_src(ctx, instr->src[0]); - - int writemask = nir_intrinsic_write_mask(instr); - for (int chan = 0; chan < 4; chan++) { - if (!(writemask & (1 << chan))) { - continue; - } - data = ac_llvm_extract_elem(&ctx->ac, src, chan); - index = LLVMConstInt(ctx->ac.i32, chan, 0); - derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); - LLVMBuildStore(builder, data, derived_ptr); - } -} - -static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - LLVMValueRef ptr, int src_idx) -{ - LLVMValueRef result; - LLVMValueRef src = get_src(ctx, instr->src[src_idx]); - - const char *sync_scope = HAVE_LLVM >= 0x0900 ? "workgroup-one-as" : "workgroup"; - - if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap || - instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) { - LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); - result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); - result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); - } else { - LLVMAtomicRMWBinOp op; - switch (instr->intrinsic) { - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_deref_atomic_add: - op = LLVMAtomicRMWBinOpAdd; - break; - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_deref_atomic_umin: - op = LLVMAtomicRMWBinOpUMin; - break; - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_deref_atomic_umax: - op = LLVMAtomicRMWBinOpUMax; - break; - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_deref_atomic_imin: - op = LLVMAtomicRMWBinOpMin; - break; - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_deref_atomic_imax: - op = LLVMAtomicRMWBinOpMax; - break; - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_deref_atomic_and: - op = LLVMAtomicRMWBinOpAnd; - break; - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_deref_atomic_or: - op = LLVMAtomicRMWBinOpOr; - break; - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_deref_atomic_xor: - op = LLVMAtomicRMWBinOpXor; - break; - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_deref_atomic_exchange: - op = LLVMAtomicRMWBinOpXchg; - break; - default: - return NULL; - } - - result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope); - } - return result; -} - -static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx) -{ - LLVMValueRef values[2]; - LLVMValueRef pos[2]; - - pos[0] = ac_to_float(&ctx->ac, ctx->abi->frag_pos[0]); - pos[1] = ac_to_float(&ctx->ac, ctx->abi->frag_pos[1]); - - values[0] = ac_build_fract(&ctx->ac, pos[0], 32); - values[1] = ac_build_fract(&ctx->ac, pos[1], 32); - return ac_build_gather_values(&ctx->ac, values, 2); -} - -static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, - unsigned mode) -{ - LLVMValueRef interp_param = ctx->abi->lookup_interp_param(ctx->abi, mode, INTERP_CENTER); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); -} - -static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, - unsigned mode, - LLVMValueRef offset) -{ - LLVMValueRef interp_param = ctx->abi->lookup_interp_param(ctx->abi, mode, INTERP_CENTER); - LLVMValueRef src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, "")); - LLVMValueRef src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, "")); - - LLVMValueRef ij_out[2]; - LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); - - /* - * take the I then J parameters, and the DDX/Y for it, and - * calculate the IJ inputs for the interpolator. - * temp1 = ddx * offset/sample.x + I; - * interp_param.I = ddy * offset/sample.y + temp1; - * temp1 = ddx * offset/sample.x + J; - * interp_param.J = ddy * offset/sample.y + temp1; - */ - for (unsigned i = 0; i < 2; i++) { - LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); - LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); - LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, ix_ll, ""); - LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, iy_ll, ""); - LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, - interp_param, ix_ll, ""); - LLVMValueRef temp1, temp2; - - interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, - ctx->ac.f32, ""); - - temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el); - temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1); - - ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, - temp2, ctx->ac.i32, ""); - } - interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); -} - -static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, - unsigned mode) -{ - LLVMValueRef interp_param = ctx->abi->lookup_interp_param(ctx->abi, mode, INTERP_CENTROID); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); -} - -static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, - unsigned mode, - LLVMValueRef sample_id) -{ - if (ctx->abi->interp_at_sample_force_center) - return barycentric_center(ctx, mode); - - LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); - - /* fetch sample ID */ - LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id); - - LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, ""); - src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); - LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, ""); - src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); - LLVMValueRef coords[] = { src_c0, src_c1 }; - LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2); - - return barycentric_offset(ctx, mode, offset); -} - - -static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, - unsigned mode) -{ - LLVMValueRef interp_param = ctx->abi->lookup_interp_param(ctx->abi, mode, INTERP_SAMPLE); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); -} - -static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, - LLVMValueRef interp_param, - unsigned index, unsigned comp_start, - unsigned num_components, - unsigned bitsize) -{ - LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); - - interp_param = LLVMBuildBitCast(ctx->ac.builder, - interp_param, ctx->ac.v2f32, ""); - LLVMValueRef i = LLVMBuildExtractElement( - ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); - LLVMValueRef j = LLVMBuildExtractElement( - ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); - - LLVMValueRef values[4]; - assert(bitsize == 16 || bitsize == 32); - for (unsigned comp = 0; comp < num_components; comp++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false); - if (bitsize == 16) { - values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); - } else { - values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ctx->abi->prim_mask, i, j); - } - } - - return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); -} - -static LLVMValueRef load_flat_input(struct ac_nir_context *ctx, - unsigned index, unsigned comp_start, - unsigned num_components, - unsigned bit_size) -{ - LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); - - LLVMValueRef values[8]; - - /* Each component of a 64-bit value takes up two GL-level channels. */ - unsigned channels = - bit_size == 64 ? num_components * 2 : num_components; - - for (unsigned chan = 0; chan < channels; chan++) { - if (comp_start + chan > 4) - attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false); - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (comp_start + chan) % 4, false); - values[chan] = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, 2, false), - llvm_chan, - attr_number, - ctx->abi->prim_mask); - values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); - values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], - bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, ""); - } - - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, channels); - if (bit_size == 64) { - LLVMTypeRef type = num_components == 1 ? ctx->ac.i64 : - LLVMVectorType(ctx->ac.i64, num_components); - result = LLVMBuildBitCast(ctx->ac.builder, result, type, ""); - } - return result; -} - -static void visit_intrinsic(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) -{ - LLVMValueRef result = NULL; - - switch (instr->intrinsic) { - case nir_intrinsic_ballot: - result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); - if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) - result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); - break; - case nir_intrinsic_read_invocation: - result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1])); - break; - case nir_intrinsic_read_first_invocation: - result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL); - break; - case nir_intrinsic_load_subgroup_invocation: - result = ac_get_thread_id(&ctx->ac); - break; - case nir_intrinsic_load_work_group_id: { - LLVMValueRef values[3]; - - for (int i = 0; i < 3; i++) { - values[i] = ctx->abi->workgroup_ids[i] ? - ctx->abi->workgroup_ids[i] : ctx->ac.i32_0; - } - - result = ac_build_gather_values(&ctx->ac, values, 3); - break; - } - case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_first_vertex: - result = ctx->abi->load_base_vertex(ctx->abi); - break; - case nir_intrinsic_load_local_group_size: - result = ctx->abi->load_local_group_size(ctx->abi); - break; - case nir_intrinsic_load_vertex_id: - result = LLVMBuildAdd(ctx->ac.builder, ctx->abi->vertex_id, - ctx->abi->base_vertex, ""); - break; - case nir_intrinsic_load_vertex_id_zero_base: { - result = ctx->abi->vertex_id; - break; - } - case nir_intrinsic_load_local_invocation_id: { - result = ctx->abi->local_invocation_ids; - break; - } - case nir_intrinsic_load_base_instance: - result = ctx->abi->start_instance; - break; - case nir_intrinsic_load_draw_id: - result = ctx->abi->draw_id; - break; - case nir_intrinsic_load_view_index: - result = ctx->abi->view_index; - break; - case nir_intrinsic_load_invocation_id: - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - result = ac_unpack_param(&ctx->ac, ctx->abi->tcs_rel_ids, 8, 5); - } else { - if (ctx->ac.chip_class >= GFX10) { - result = LLVMBuildAnd(ctx->ac.builder, - ctx->abi->gs_invocation_id, - LLVMConstInt(ctx->ac.i32, 127, 0), ""); - } else { - result = ctx->abi->gs_invocation_id; - } - } - break; - case nir_intrinsic_load_primitive_id: - if (ctx->stage == MESA_SHADER_GEOMETRY) { - result = ctx->abi->gs_prim_id; - } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { - result = ctx->abi->tcs_patch_id; - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - result = ctx->abi->tes_patch_id; - } else - fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); - break; - case nir_intrinsic_load_sample_id: - result = ac_unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4); - break; - case nir_intrinsic_load_sample_pos: - result = load_sample_pos(ctx); - break; - case nir_intrinsic_load_sample_mask_in: - result = ctx->abi->load_sample_mask_in(ctx->abi); - break; - case nir_intrinsic_load_frag_coord: { - LLVMValueRef values[4] = { - ctx->abi->frag_pos[0], - ctx->abi->frag_pos[1], - ctx->abi->frag_pos[2], - ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3]) - }; - result = ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - break; - } - case nir_intrinsic_load_layer_id: - result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; - break; - case nir_intrinsic_load_front_face: - result = ctx->abi->front_face; - break; - case nir_intrinsic_load_helper_invocation: - result = ac_build_load_helper_invocation(&ctx->ac); - break; - case nir_intrinsic_load_color0: - result = ctx->abi->color0; - break; - case nir_intrinsic_load_color1: - result = ctx->abi->color1; - break; - case nir_intrinsic_load_user_data_amd: - assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32); - result = ctx->abi->user_data; - break; - case nir_intrinsic_load_instance_id: - result = ctx->abi->instance_id; - break; - case nir_intrinsic_load_num_work_groups: - result = ctx->abi->num_work_groups; - break; - case nir_intrinsic_load_local_invocation_index: - result = visit_load_local_invocation_index(ctx); - break; - case nir_intrinsic_load_subgroup_id: - result = visit_load_subgroup_id(ctx); - break; - case nir_intrinsic_load_num_subgroups: - result = visit_load_num_subgroups(ctx); - break; - case nir_intrinsic_first_invocation: - result = visit_first_invocation(ctx); - break; - case nir_intrinsic_load_push_constant: - result = visit_load_push_constant(ctx, instr); - break; - case nir_intrinsic_vulkan_resource_index: { - LLVMValueRef index = get_src(ctx, instr->src[0]); - unsigned desc_set = nir_intrinsic_desc_set(instr); - unsigned binding = nir_intrinsic_binding(instr); - - result = ctx->abi->load_resource(ctx->abi, index, desc_set, - binding); - break; - } - case nir_intrinsic_vulkan_resource_reindex: - result = visit_vulkan_resource_reindex(ctx, instr); - break; - case nir_intrinsic_store_ssbo: - visit_store_ssbo(ctx, instr); - break; - case nir_intrinsic_load_ssbo: - result = visit_load_buffer(ctx, instr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - result = visit_atomic_ssbo(ctx, instr); - break; - case nir_intrinsic_load_ubo: - result = visit_load_ubo_buffer(ctx, instr); - break; - case nir_intrinsic_get_buffer_size: - result = visit_get_buffer_size(ctx, instr); - break; - case nir_intrinsic_load_deref: - result = visit_load_var(ctx, instr); - break; - case nir_intrinsic_store_deref: - visit_store_var(ctx, instr); - break; - case nir_intrinsic_load_shared: - result = visit_load_shared(ctx, instr); - break; - case nir_intrinsic_store_shared: - visit_store_shared(ctx, instr); - break; - case nir_intrinsic_bindless_image_samples: - result = visit_image_samples(ctx, instr, true); - break; - case nir_intrinsic_image_deref_samples: - result = visit_image_samples(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_load: - result = visit_image_load(ctx, instr, true); - break; - case nir_intrinsic_image_deref_load: - result = visit_image_load(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_store: - visit_image_store(ctx, instr, true); - break; - case nir_intrinsic_image_deref_store: - visit_image_store(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_comp_swap: - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_bindless_image_atomic_dec_wrap: - result = visit_image_atomic(ctx, instr, true); - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - result = visit_image_atomic(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_size: - result = visit_image_size(ctx, instr, true); - break; - case nir_intrinsic_image_deref_size: - result = visit_image_size(ctx, instr, false); - break; - case nir_intrinsic_shader_clock: - result = ac_build_shader_clock(&ctx->ac); - break; - case nir_intrinsic_discard: - case nir_intrinsic_discard_if: - emit_discard(ctx, instr); - break; - case nir_intrinsic_memory_barrier: - case nir_intrinsic_group_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - emit_membar(&ctx->ac, instr); - break; - case nir_intrinsic_barrier: - ac_emit_barrier(&ctx->ac, ctx->stage); - break; - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: { - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0]); - result = visit_var_atomic(ctx, instr, ptr, 1); - break; - } - case nir_intrinsic_deref_atomic_add: - case nir_intrinsic_deref_atomic_imin: - case nir_intrinsic_deref_atomic_umin: - case nir_intrinsic_deref_atomic_imax: - case nir_intrinsic_deref_atomic_umax: - case nir_intrinsic_deref_atomic_and: - case nir_intrinsic_deref_atomic_or: - case nir_intrinsic_deref_atomic_xor: - case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: { - LLVMValueRef ptr = get_src(ctx, instr->src[0]); - result = visit_var_atomic(ctx, instr, ptr, 1); - break; - } - case nir_intrinsic_load_barycentric_pixel: - result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_centroid: - result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_sample: - result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_at_offset: { - LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); - result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset); - break; - } - case nir_intrinsic_load_barycentric_at_sample: { - LLVMValueRef sample_id = get_src(ctx, instr->src[0]); - result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id); - break; - } - case nir_intrinsic_load_interpolated_input: { - /* We assume any indirect loads have been lowered away */ - ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]); - assert(offset); - assert(offset[0].i32 == 0); - - LLVMValueRef interp_param = get_src(ctx, instr->src[0]); - unsigned index = nir_intrinsic_base(instr); - unsigned component = nir_intrinsic_component(instr); - result = load_interpolated_input(ctx, interp_param, index, - component, - instr->dest.ssa.num_components, - instr->dest.ssa.bit_size); - break; - } - case nir_intrinsic_load_input: { - /* We only lower inputs for fragment shaders ATM */ - ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[0]); - assert(offset); - assert(offset[0].i32 == 0); - - unsigned index = nir_intrinsic_base(instr); - unsigned component = nir_intrinsic_component(instr); - result = load_flat_input(ctx, index, component, - instr->dest.ssa.num_components, - instr->dest.ssa.bit_size); - break; - } - case nir_intrinsic_emit_vertex: - ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs); - break; - case nir_intrinsic_end_primitive: - ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); - break; - case nir_intrinsic_load_tess_coord: - result = ctx->abi->load_tess_coord(ctx->abi); - break; - case nir_intrinsic_load_tess_level_outer: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); - break; - case nir_intrinsic_load_tess_level_inner: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false); - break; - case nir_intrinsic_load_tess_level_outer_default: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true); - break; - case nir_intrinsic_load_tess_level_inner_default: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true); - break; - case nir_intrinsic_load_patch_vertices_in: - result = ctx->abi->load_patch_vertices_in(ctx->abi); - break; - case nir_intrinsic_vote_all: { - LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); - result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - case nir_intrinsic_vote_any: { - LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); - result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - case nir_intrinsic_shuffle: - result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1])); - break; - case nir_intrinsic_reduce: - result = ac_build_reduce(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0], - instr->const_index[1]); - break; - case nir_intrinsic_inclusive_scan: - result = ac_build_inclusive_scan(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0]); - break; - case nir_intrinsic_exclusive_scan: - result = ac_build_exclusive_scan(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0]); - break; - case nir_intrinsic_quad_broadcast: { - unsigned lane = nir_src_as_uint(instr->src[1]); - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), - lane, lane, lane, lane); - break; - } - case nir_intrinsic_quad_swap_horizontal: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3 ,2); - break; - case nir_intrinsic_quad_swap_vertical: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0 ,1); - break; - case nir_intrinsic_quad_swap_diagonal: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1 ,0); - break; - case nir_intrinsic_quad_swizzle_amd: { - uint32_t mask = nir_intrinsic_swizzle_mask(instr); - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), - mask & 0x3, (mask >> 2) & 0x3, - (mask >> 4) & 0x3, (mask >> 6) & 0x3); - break; - } - case nir_intrinsic_masked_swizzle_amd: { - uint32_t mask = nir_intrinsic_swizzle_mask(instr); - result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask); - break; - } - case nir_intrinsic_write_invocation_amd: - result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1]), - get_src(ctx, instr->src[2])); - break; - case nir_intrinsic_mbcnt_amd: - result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0])); - break; - default: - fprintf(stderr, "Unknown intrinsic: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - break; - } - if (result) { - ctx->ssa_defs[instr->dest.ssa.index] = result; - } -} - -static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, - unsigned base_index, - unsigned constant_index, - LLVMValueRef dynamic_index) -{ - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0); - LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i32, constant_index, 0), ""); - - /* Bindless uniforms are 64bit so multiple index by 8 */ - index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), ""); - offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); - - LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0); - - LLVMValueRef ret = ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, - NULL, 0, 0, true, true); - - return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, ""); -} - -static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - enum ac_descriptor_type desc_type, - const nir_instr *instr, - bool image, bool write) -{ - LLVMValueRef index = NULL; - unsigned constant_index = 0; - unsigned descriptor_set; - unsigned base_index; - bool bindless = false; - - if (!deref_instr) { - descriptor_set = 0; - if (image) { - nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr); - base_index = 0; - bindless = true; - index = get_src(ctx, img_instr->src[0]); - } else { - nir_tex_instr *tex_instr = nir_instr_as_tex(instr); - int sampSrcIdx = nir_tex_instr_src_index(tex_instr, - nir_tex_src_sampler_handle); - if (sampSrcIdx != -1) { - base_index = 0; - bindless = true; - index = get_src(ctx, tex_instr->src[sampSrcIdx].src); - } else { - assert(tex_instr && !image); - base_index = tex_instr->sampler_index; - } - } - } else { - while(deref_instr->deref_type != nir_deref_type_var) { - if (deref_instr->deref_type == nir_deref_type_array) { - unsigned array_size = glsl_get_aoa_size(deref_instr->type); - if (!array_size) - array_size = 1; - - if (nir_src_is_const(deref_instr->arr.index)) { - constant_index += array_size * nir_src_as_uint(deref_instr->arr.index); - } else { - LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index); - - indirect = LLVMBuildMul(ctx->ac.builder, indirect, - LLVMConstInt(ctx->ac.i32, array_size, false), ""); - - if (!index) - index = indirect; - else - index = LLVMBuildAdd(ctx->ac.builder, index, indirect, ""); - } - - deref_instr = nir_src_as_deref(deref_instr->parent); - } else if (deref_instr->deref_type == nir_deref_type_struct) { - unsigned sidx = deref_instr->strct.index; - deref_instr = nir_src_as_deref(deref_instr->parent); - constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx); - } else { - unreachable("Unsupported deref type"); - } - } - descriptor_set = deref_instr->var->data.descriptor_set; - - if (deref_instr->var->data.bindless) { - /* For now just assert on unhandled variable types */ - assert(deref_instr->var->data.mode == nir_var_uniform); - - base_index = deref_instr->var->data.driver_location; - bindless = true; - - index = index ? index : ctx->ac.i32_0; - index = get_bindless_index_from_uniform(ctx, base_index, - constant_index, index); - } else - base_index = deref_instr->var->data.binding; - } - - return ctx->abi->load_sampler_desc(ctx->abi, - descriptor_set, - base_index, - constant_index, index, - desc_type, image, write, bindless); -} - -/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. - * - * GFX6-GFX7: - * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic - * filtering manually. The driver sets img7 to a mask clearing - * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: - * s_and_b32 samp0, samp0, img7 - * - * GFX8: - * The ANISO_OVERRIDE sampler field enables this fix in TA. - */ -static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, - LLVMValueRef res, LLVMValueRef samp) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef img7, samp0; - - if (ctx->ac.chip_class >= GFX8) - return samp; - - img7 = LLVMBuildExtractElement(builder, res, - LLVMConstInt(ctx->ac.i32, 7, 0), ""); - samp0 = LLVMBuildExtractElement(builder, samp, - LLVMConstInt(ctx->ac.i32, 0, 0), ""); - samp0 = LLVMBuildAnd(builder, samp0, img7, ""); - return LLVMBuildInsertElement(builder, samp, samp0, - LLVMConstInt(ctx->ac.i32, 0, 0), ""); -} - -static void tex_fetch_ptrs(struct ac_nir_context *ctx, - nir_tex_instr *instr, - LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, - LLVMValueRef *fmask_ptr) -{ - nir_deref_instr *texture_deref_instr = NULL; - nir_deref_instr *sampler_deref_instr = NULL; - int plane = -1; - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - texture_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - case nir_tex_src_sampler_deref: - sampler_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - case nir_tex_src_plane: - plane = nir_src_as_int(instr->src[i].src); - break; - default: - break; - } - } - - if (!sampler_deref_instr) - sampler_deref_instr = texture_deref_instr; - - enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; - - if (plane >= 0) { - assert(instr->op != nir_texop_txf_ms && - instr->op != nir_texop_samples_identical); - assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); - - main_descriptor = AC_DESC_PLANE_0 + plane; - } - - *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false); - - if (samp_ptr) { - *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false); - if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT) - *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); - } - if (fmask_ptr && (instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical)) - *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false); -} - -static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, - LLVMValueRef coord) -{ - coord = ac_to_float(ctx, coord); - coord = ac_build_round(ctx, coord); - coord = ac_to_integer(ctx, coord); - return coord; -} - -static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) -{ - LLVMValueRef result = NULL; - struct ac_image_args args = { 0 }; - LLVMValueRef fmask_ptr = NULL, sample_index = NULL; - LLVMValueRef ddx = NULL, ddy = NULL; - unsigned offset_src = 0; - - tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr); - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: { - LLVMValueRef coord = get_src(ctx, instr->src[i].src); - for (unsigned chan = 0; chan < instr->coord_components; ++chan) - args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); - break; - } - case nir_tex_src_projector: - break; - case nir_tex_src_comparator: - if (instr->is_shadow) - args.compare = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_offset: - args.offset = get_src(ctx, instr->src[i].src); - offset_src = i; - break; - case nir_tex_src_bias: - if (instr->op == nir_texop_txb) - args.bias = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_lod: { - if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) - args.level_zero = true; - else - args.lod = get_src(ctx, instr->src[i].src); - break; - } - case nir_tex_src_ms_index: - sample_index = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_ms_mcs: - break; - case nir_tex_src_ddx: - ddx = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_ddy: - ddy = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_texture_offset: - case nir_tex_src_sampler_offset: - case nir_tex_src_plane: - default: - break; - } - } - - if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - result = get_buffer_size(ctx, args.resource, true); - goto write_result; - } - - if (instr->op == nir_texop_texture_samples) { - LLVMValueRef res, samples, is_msaa; - res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); - samples = LLVMBuildExtractElement(ctx->ac.builder, res, - LLVMConstInt(ctx->ac.i32, 3, false), ""); - is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 28, false), ""); - is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, - LLVMConstInt(ctx->ac.i32, 0xe, false), ""); - is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa, - LLVMConstInt(ctx->ac.i32, 0xe, false), ""); - - samples = LLVMBuildLShr(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 16, false), ""); - samples = LLVMBuildAnd(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 0xf, false), ""); - samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, - samples, ""); - samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, - ctx->ac.i32_1, ""); - result = samples; - goto write_result; - } - - if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { - LLVMValueRef offset[3], pack; - for (unsigned chan = 0; chan < 3; ++chan) - offset[chan] = ctx->ac.i32_0; - - unsigned num_components = ac_get_llvm_num_components(args.offset); - for (unsigned chan = 0; chan < num_components; chan++) { - offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan); - offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); - if (chan) - offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->ac.i32, chan * 8, false), ""); - } - pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); - pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); - args.offset = pack; - } - - /* TC-compatible HTILE on radeonsi promotes Z16 and Z24 to Z32_FLOAT, - * so the depth comparison value isn't clamped for Z16 and - * Z24 anymore. Do it manually here for GFX8-9; GFX10 has an explicitly - * clamped 32-bit float format. - * - * It's unnecessary if the original texture format was - * Z32_FLOAT, but we don't know that here. - */ - if (args.compare && - ctx->ac.chip_class >= GFX8 && - ctx->ac.chip_class <= GFX9 && - ctx->abi->clamp_shadow_reference) - args.compare = ac_build_clamp(&ctx->ac, ac_to_float(&ctx->ac, args.compare)); - - /* pack derivatives */ - if (ddx || ddy) { - int num_src_deriv_channels, num_dest_deriv_channels; - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - num_src_deriv_channels = 3; - num_dest_deriv_channels = 3; - break; - case GLSL_SAMPLER_DIM_2D: - default: - num_src_deriv_channels = 2; - num_dest_deriv_channels = 2; - break; - case GLSL_SAMPLER_DIM_1D: - num_src_deriv_channels = 1; - if (ctx->ac.chip_class == GFX9) { - num_dest_deriv_channels = 2; - } else { - num_dest_deriv_channels = 1; - } - break; - } - - for (unsigned i = 0; i < num_src_deriv_channels; i++) { - args.derivs[i] = ac_to_float(&ctx->ac, - ac_llvm_extract_elem(&ctx->ac, ddx, i)); - args.derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, - ac_llvm_extract_elem(&ctx->ac, ddy, i)); - } - for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { - args.derivs[i] = ctx->ac.f32_0; - args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; - } - } - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) { - for (unsigned chan = 0; chan < instr->coord_components; chan++) - args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); - if (instr->coord_components == 3) - args.coords[3] = LLVMGetUndef(ctx->ac.f32); - ac_prepare_cube_coords(&ctx->ac, - instr->op == nir_texop_txd, instr->is_array, - instr->op == nir_texop_lod, args.coords, args.derivs); - } - - /* Texture coordinates fixups */ - if (instr->coord_components > 1 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array && - instr->op != nir_texop_txf) { - args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]); - } - - if (instr->coord_components > 2 && - (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->is_array && - instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { - args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); - } - - if (ctx->ac.chip_class == GFX9 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->op != nir_texop_lod) { - LLVMValueRef filler; - if (instr->op == nir_texop_txf) - filler = ctx->ac.i32_0; - else - filler = LLVMConstReal(ctx->ac.f32, 0.5); - - if (instr->is_array) - args.coords[2] = args.coords[1]; - args.coords[1] = filler; - } - - /* Pack sample index */ - if (instr->op == nir_texop_txf_ms && sample_index) - args.coords[instr->coord_components] = sample_index; - - if (instr->op == nir_texop_samples_identical) { - struct ac_image_args txf_args = { 0 }; - memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords)); - - txf_args.dmask = 0xf; - txf_args.resource = fmask_ptr; - txf_args.dim = instr->is_array ? ac_image_2darray : ac_image_2d; - result = build_tex_intrinsic(ctx, instr, &txf_args); - - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0); - goto write_result; - } - - if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && - instr->op != nir_texop_txs) { - unsigned sample_chan = instr->is_array ? 3 : 2; - args.coords[sample_chan] = adjust_sample_index_using_fmask( - &ctx->ac, args.coords[0], args.coords[1], - instr->is_array ? args.coords[2] : NULL, - args.coords[sample_chan], fmask_ptr); - } - - if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { - int num_offsets = instr->src[offset_src].src.ssa->num_components; - num_offsets = MIN2(num_offsets, instr->coord_components); - for (unsigned i = 0; i < num_offsets; ++i) { - args.coords[i] = LLVMBuildAdd( - ctx->ac.builder, args.coords[i], - LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false), ""); - } - args.offset = NULL; - } - - /* DMASK was repurposed for GATHER4. 4 components are always - * returned and DMASK works like a swizzle - it selects - * the component to fetch. The only valid DMASK values are - * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - * (red,red,red,red) etc.) The ISA document doesn't mention - * this. - */ - args.dmask = 0xf; - if (instr->op == nir_texop_tg4) { - if (instr->is_shadow) - args.dmask = 1; - else - args.dmask = 1 << instr->component; - } - - if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) - args.dim = get_ac_sampler_dim(&ctx->ac, instr->sampler_dim, instr->is_array); - result = build_tex_intrinsic(ctx, instr, &args); - - if (instr->op == nir_texop_query_levels) - result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), ""); - else if (instr->is_shadow && instr->is_new_style_shadow && - instr->op != nir_texop_txs && instr->op != nir_texop_lod && - instr->op != nir_texop_tg4) - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - else if (instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && - instr->is_array) { - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); - LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); - z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); - result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, ""); - } else if (ctx->ac.chip_class == GFX9 && - instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array) { - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); - result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, - ctx->ac.i32_1, ""); - } else if (instr->dest.ssa.num_components != 4) - result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); - -write_result: - if (result) { - assert(instr->dest.is_ssa); - result = ac_to_integer(&ctx->ac, result); - ctx->ssa_defs[instr->dest.ssa.index] = result; - } -} - - -static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr) -{ - LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); - LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, ""); - - ctx->ssa_defs[instr->dest.ssa.index] = result; - _mesa_hash_table_insert(ctx->phis, instr, result); -} - -static void visit_post_phi(struct ac_nir_context *ctx, - nir_phi_instr *instr, - LLVMValueRef llvm_phi) -{ - nir_foreach_phi_src(src, instr) { - LLVMBasicBlockRef block = get_block(ctx, src->pred); - LLVMValueRef llvm_src = get_src(ctx, src->src); - - LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); - } -} - -static void phi_post_pass(struct ac_nir_context *ctx) -{ - hash_table_foreach(ctx->phis, entry) { - visit_post_phi(ctx, (nir_phi_instr*)entry->key, - (LLVMValueRef)entry->data); - } -} - - -static void visit_ssa_undef(struct ac_nir_context *ctx, - const nir_ssa_undef_instr *instr) -{ - unsigned num_components = instr->def.num_components; - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); - LLVMValueRef undef; - - if (num_components == 1) - undef = LLVMGetUndef(type); - else { - undef = LLVMGetUndef(LLVMVectorType(type, num_components)); - } - ctx->ssa_defs[instr->def.index] = undef; -} - -static void visit_jump(struct ac_llvm_context *ctx, - const nir_jump_instr *instr) -{ - switch (instr->type) { - case nir_jump_break: - ac_build_break(ctx); - break; - case nir_jump_continue: - ac_build_continue(ctx); - break; - default: - fprintf(stderr, "Unknown NIR jump instr: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - abort(); - } -} - -static LLVMTypeRef -glsl_base_to_llvm_type(struct ac_llvm_context *ac, - enum glsl_base_type type) -{ - switch (type) { - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_BOOL: - case GLSL_TYPE_SUBROUTINE: - return ac->i32; - case GLSL_TYPE_INT8: - case GLSL_TYPE_UINT8: - return ac->i8; - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT16: - return ac->i16; - case GLSL_TYPE_FLOAT: - return ac->f32; - case GLSL_TYPE_FLOAT16: - return ac->f16; - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - return ac->i64; - case GLSL_TYPE_DOUBLE: - return ac->f64; - default: - unreachable("unknown GLSL type"); - } -} - -static LLVMTypeRef -glsl_to_llvm_type(struct ac_llvm_context *ac, - const struct glsl_type *type) -{ - if (glsl_type_is_scalar(type)) { - return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); - } - - if (glsl_type_is_vector(type)) { - return LLVMVectorType( - glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), - glsl_get_vector_elements(type)); - } - - if (glsl_type_is_matrix(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_column_type(type)), - glsl_get_matrix_columns(type)); - } - - if (glsl_type_is_array(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_array_element(type)), - glsl_get_length(type)); - } - - assert(glsl_type_is_struct_or_ifc(type)); - - LLVMTypeRef member_types[glsl_get_length(type)]; - - for (unsigned i = 0; i < glsl_get_length(type); i++) { - member_types[i] = - glsl_to_llvm_type(ac, - glsl_get_struct_field(type, i)); - } - - return LLVMStructTypeInContext(ac->context, member_types, - glsl_get_length(type), false); -} - -static void visit_deref(struct ac_nir_context *ctx, - nir_deref_instr *instr) -{ - if (instr->mode != nir_var_mem_shared && - instr->mode != nir_var_mem_global) - return; - - LLVMValueRef result = NULL; - switch(instr->deref_type) { - case nir_deref_type_var: { - struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var); - result = entry->data; - break; - } - case nir_deref_type_struct: - if (instr->mode == nir_var_mem_global) { - nir_deref_instr *parent = nir_deref_instr_parent(instr); - uint64_t offset = glsl_get_struct_field_offset(parent->type, - instr->strct.index); - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), - LLVMConstInt(ctx->ac.i32, offset, 0)); - } else { - result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), - LLVMConstInt(ctx->ac.i32, instr->strct.index, 0)); - } - break; - case nir_deref_type_array: - if (instr->mode == nir_var_mem_global) { - nir_deref_instr *parent = nir_deref_instr_parent(instr); - unsigned stride = glsl_get_explicit_stride(parent->type); - - if ((glsl_type_is_matrix(parent->type) && - glsl_matrix_type_is_row_major(parent->type)) || - (glsl_type_is_vector(parent->type) && stride == 0)) - stride = type_scalar_size_bytes(parent->type); - - assert(stride > 0); - LLVMValueRef index = get_src(ctx, instr->arr.index); - if (LLVMTypeOf(index) != ctx->ac.i64) - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); - - LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); - - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); - } else { - result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), - get_src(ctx, instr->arr.index)); - } - break; - case nir_deref_type_ptr_as_array: - if (instr->mode == nir_var_mem_global) { - unsigned stride = nir_deref_instr_ptr_as_array_stride(instr); - - LLVMValueRef index = get_src(ctx, instr->arr.index); - if (LLVMTypeOf(index) != ctx->ac.i64) - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); - - LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); - - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); - } else { - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), - get_src(ctx, instr->arr.index)); - } - break; - case nir_deref_type_cast: { - result = get_src(ctx, instr->parent); - - /* We can't use the structs from LLVM because the shader - * specifies its own offsets. */ - LLVMTypeRef pointee_type = ctx->ac.i8; - if (instr->mode == nir_var_mem_shared) - pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); - - unsigned address_space; - - switch(instr->mode) { - case nir_var_mem_shared: - address_space = AC_ADDR_SPACE_LDS; - break; - case nir_var_mem_global: - address_space = AC_ADDR_SPACE_GLOBAL; - break; - default: - unreachable("Unhandled address space"); - } - - LLVMTypeRef type = LLVMPointerType(pointee_type, address_space); - - if (LLVMTypeOf(result) != type) { - if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { - result = LLVMBuildBitCast(ctx->ac.builder, result, - type, ""); - } else { - result = LLVMBuildIntToPtr(ctx->ac.builder, result, - type, ""); - } - } - break; - } - default: - unreachable("Unhandled deref_instr deref type"); - } - - ctx->ssa_defs[instr->dest.ssa.index] = result; -} - -static void visit_cf_list(struct ac_nir_context *ctx, - struct exec_list *list); - -static void visit_block(struct ac_nir_context *ctx, nir_block *block) -{ - nir_foreach_instr(instr, block) - { - switch (instr->type) { - case nir_instr_type_alu: - visit_alu(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_load_const: - visit_load_const(ctx, nir_instr_as_load_const(instr)); - break; - case nir_instr_type_intrinsic: - visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_tex: - visit_tex(ctx, nir_instr_as_tex(instr)); - break; - case nir_instr_type_phi: - visit_phi(ctx, nir_instr_as_phi(instr)); - break; - case nir_instr_type_ssa_undef: - visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - case nir_instr_type_jump: - visit_jump(&ctx->ac, nir_instr_as_jump(instr)); - break; - case nir_instr_type_deref: - visit_deref(ctx, nir_instr_as_deref(instr)); - break; - default: - fprintf(stderr, "Unknown NIR instr type: "); - nir_print_instr(instr, stderr); - fprintf(stderr, "\n"); - abort(); - } - } - - _mesa_hash_table_insert(ctx->defs, block, - LLVMGetInsertBlock(ctx->ac.builder)); -} - -static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt) -{ - LLVMValueRef value = get_src(ctx, if_stmt->condition); - - nir_block *then_block = - (nir_block *) exec_list_get_head(&if_stmt->then_list); - - ac_build_uif(&ctx->ac, value, then_block->index); - - visit_cf_list(ctx, &if_stmt->then_list); - - if (!exec_list_is_empty(&if_stmt->else_list)) { - nir_block *else_block = - (nir_block *) exec_list_get_head(&if_stmt->else_list); - - ac_build_else(&ctx->ac, else_block->index); - visit_cf_list(ctx, &if_stmt->else_list); - } - - ac_build_endif(&ctx->ac, then_block->index); -} - -static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop) -{ - nir_block *first_loop_block = - (nir_block *) exec_list_get_head(&loop->body); - - ac_build_bgnloop(&ctx->ac, first_loop_block->index); - - visit_cf_list(ctx, &loop->body); - - ac_build_endloop(&ctx->ac, first_loop_block->index); -} - -static void visit_cf_list(struct ac_nir_context *ctx, - struct exec_list *list) -{ - foreach_list_typed(nir_cf_node, node, node, list) - { - switch (node->type) { - case nir_cf_node_block: - visit_block(ctx, nir_cf_node_as_block(node)); - break; - - case nir_cf_node_if: - visit_if(ctx, nir_cf_node_as_if(node)); - break; - - case nir_cf_node_loop: - visit_loop(ctx, nir_cf_node_as_loop(node)); - break; - - default: - assert(0); - } - } -} - -void -ac_handle_shader_output_decl(struct ac_llvm_context *ctx, - struct ac_shader_abi *abi, - struct nir_shader *nir, - struct nir_variable *variable, - gl_shader_stage stage) -{ - unsigned output_loc = variable->data.driver_location / 4; - unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - - /* tess ctrl has it's own load/store paths for outputs */ - if (stage == MESA_SHADER_TESS_CTRL) - return; - - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY) { - int idx = variable->data.location + variable->data.index; - if (idx == VARYING_SLOT_CLIP_DIST0) { - int length = nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size; - - if (length > 4) - attrib_count = 2; - else - attrib_count = 1; - } - } - - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32; - for (unsigned i = 0; i < attrib_count; ++i) { - for (unsigned chan = 0; chan < 4; chan++) { - abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] = - ac_build_alloca_undef(ctx, type, ""); - } - } -} - -static void -setup_locals(struct ac_nir_context *ctx, - struct nir_function *func) -{ - int i, j; - ctx->num_locals = 0; - nir_foreach_variable(variable, &func->impl->locals) { - unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - variable->data.driver_location = ctx->num_locals * 4; - variable->data.location_frac = 0; - ctx->num_locals += attrib_count; - } - ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef)); - if (!ctx->locals) - return; - - for (i = 0; i < ctx->num_locals; i++) { - for (j = 0; j < 4; j++) { - ctx->locals[i * 4 + j] = - ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp"); - } - } -} - -static void -setup_shared(struct ac_nir_context *ctx, - struct nir_shader *nir) -{ - nir_foreach_variable(variable, &nir->shared) { - LLVMValueRef shared = - LLVMAddGlobalInAddressSpace( - ctx->ac.module, glsl_to_llvm_type(&ctx->ac, variable->type), - variable->name ? variable->name : "", - AC_ADDR_SPACE_LDS); - _mesa_hash_table_insert(ctx->vars, variable, shared); - } -} - -void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, - struct nir_shader *nir) -{ - struct ac_nir_context ctx = {}; - struct nir_function *func; - - ctx.ac = *ac; - ctx.abi = abi; - - ctx.stage = nir->info.stage; - ctx.info = &nir->info; - - ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); - - nir_foreach_variable(variable, &nir->outputs) - ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, - ctx.stage); - - ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - func = (struct nir_function *)exec_list_get_head(&nir->functions); - - nir_index_ssa_defs(func->impl); - ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); - - setup_locals(&ctx, func); - - if (gl_shader_stage_is_compute(nir->info.stage)) - setup_shared(&ctx, nir); - - visit_cf_list(&ctx, &func->impl->body); - phi_post_pass(&ctx); - - if (!gl_shader_stage_is_compute(nir->info.stage)) - ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, - ctx.abi->outputs); - - free(ctx.locals); - free(ctx.ssa_defs); - ralloc_free(ctx.defs); - ralloc_free(ctx.phis); - ralloc_free(ctx.vars); -} - -void -ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class) -{ - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. - */ - bool llvm_has_working_vgpr_indexing = chip_class != GFX9; - - /* TODO: Indirect indexing of GS inputs is unimplemented. - * - * TCS and TES load inputs directly from LDS or offchip memory, so - * indirect indexing is trivial. - */ - nir_variable_mode indirect_mask = 0; - if (nir->info.stage == MESA_SHADER_GEOMETRY || - (nir->info.stage != MESA_SHADER_TESS_CTRL && - nir->info.stage != MESA_SHADER_TESS_EVAL && - !llvm_has_working_vgpr_indexing)) { - indirect_mask |= nir_var_shader_in; - } - if (!llvm_has_working_vgpr_indexing && - nir->info.stage != MESA_SHADER_TESS_CTRL) - indirect_mask |= nir_var_shader_out; - - /* TODO: We shouldn't need to do this, however LLVM isn't currently - * smart enough to handle indirects without causing excess spilling - * causing the gpu to hang. - * - * See the following thread for more details of the problem: - * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html - */ - indirect_mask |= nir_var_function_temp; - - nir_lower_indirect_derefs(nir, indirect_mask); -} - -static unsigned -get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin) -{ - if (intrin->intrinsic != nir_intrinsic_store_deref) - return 0; - - nir_variable *var = - nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0])); - - if (var->data.mode != nir_var_shader_out) - return 0; - - unsigned writemask = 0; - const int location = var->data.location; - unsigned first_component = var->data.location_frac; - unsigned num_comps = intrin->dest.ssa.num_components; - - if (location == VARYING_SLOT_TESS_LEVEL_INNER) - writemask = ((1 << (num_comps + 1)) - 1) << first_component; - else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) - writemask = (((1 << (num_comps + 1)) - 1) << first_component) << 4; - - return writemask; -} - -static void -scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask, - unsigned *cond_block_tf_writemask, - bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf) -{ - switch (cf_node->type) { - case nir_cf_node_block: { - nir_block *block = nir_cf_node_as_block(cf_node); - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_barrier) { - - /* If we find a barrier in nested control flow put this in the - * too hard basket. In GLSL this is not possible but it is in - * SPIR-V. - */ - if (is_nested_cf) { - *tessfactors_are_def_in_all_invocs = false; - return; - } - - /* The following case must be prevented: - * gl_TessLevelInner = ...; - * barrier(); - * if (gl_InvocationID == 1) - * gl_TessLevelInner = ...; - * - * If you consider disjoint code segments separated by barriers, each - * such segment that writes tess factor channels should write the same - * channels in all codepaths within that segment. - */ - if (upper_block_tf_writemask || cond_block_tf_writemask) { - /* Accumulate the result: */ - *tessfactors_are_def_in_all_invocs &= - !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask)); - - /* Analyze the next code segment from scratch. */ - *upper_block_tf_writemask = 0; - *cond_block_tf_writemask = 0; - } - } else - *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin); - } - - break; - } - case nir_cf_node_if: { - unsigned then_tessfactor_writemask = 0; - unsigned else_tessfactor_writemask = 0; - - nir_if *if_stmt = nir_cf_node_as_if(cf_node); - foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) { - scan_tess_ctrl(nested_node, &then_tessfactor_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) { - scan_tess_ctrl(nested_node, &else_tessfactor_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - if (then_tessfactor_writemask || else_tessfactor_writemask) { - /* If both statements write the same tess factor channels, - * we can say that the upper block writes them too. - */ - *upper_block_tf_writemask |= then_tessfactor_writemask & - else_tessfactor_writemask; - *cond_block_tf_writemask |= then_tessfactor_writemask | - else_tessfactor_writemask; - } - - break; - } - case nir_cf_node_loop: { - nir_loop *loop = nir_cf_node_as_loop(cf_node); - foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) { - scan_tess_ctrl(nested_node, cond_block_tf_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - break; - } - default: - unreachable("unknown cf node type"); - } -} - -bool -ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir) -{ - assert(nir->info.stage == MESA_SHADER_TESS_CTRL); - - /* The pass works as follows: - * If all codepaths write tess factors, we can say that all - * invocations define tess factors. - * - * Each tess factor channel is tracked separately. - */ - unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */ - unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */ - - /* Initial value = true. Here the pass will accumulate results from - * multiple segments surrounded by barriers. If tess factors aren't - * written at all, it's a shader bug and we don't care if this will be - * true. - */ - bool tessfactors_are_def_in_all_invocs = true; - - nir_foreach_function(function, nir) { - if (function->impl) { - foreach_list_typed(nir_cf_node, node, node, &function->impl->body) { - scan_tess_ctrl(node, &main_block_tf_writemask, - &cond_block_tf_writemask, - &tessfactors_are_def_in_all_invocs, - false); - } - } - } - - /* Accumulate the result for the last code segment separated by a - * barrier. - */ - if (main_block_tf_writemask || cond_block_tf_writemask) { - tessfactors_are_def_in_all_invocs &= - !(cond_block_tf_writemask & ~main_block_tf_writemask); - } - - return tessfactors_are_def_in_all_invocs; -} diff -Nru mesa-19.2.8/src/amd/common/ac_nir_to_llvm.h mesa-20.0.8/src/amd/common/ac_nir_to_llvm.h --- mesa-19.2.8/src/amd/common/ac_nir_to_llvm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_nir_to_llvm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -/* - * Copyright © 2016 Bas Nieuwenhuizen - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef AC_NIR_TO_LLVM_H -#define AC_NIR_TO_LLVM_H - -#include -#include "llvm-c/Core.h" -#include "llvm-c/TargetMachine.h" -#include "amd_family.h" -#include "compiler/shader_enums.h" - -struct nir_shader; -struct nir_variable; -struct ac_llvm_context; -struct ac_shader_abi; - -/* Interpolation locations */ -#define INTERP_CENTER 0 -#define INTERP_CENTROID 1 -#define INTERP_SAMPLE 2 - -static inline unsigned ac_llvm_reg_index_soa(unsigned index, unsigned chan) -{ - return (index * 4) + chan; -} - -void ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class); - -bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir); - -void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, - struct nir_shader *nir); - -void -ac_handle_shader_output_decl(struct ac_llvm_context *ctx, - struct ac_shader_abi *abi, - struct nir_shader *nir, - struct nir_variable *variable, - gl_shader_stage stage); - -void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage); - -#endif /* AC_NIR_TO_LLVM_H */ diff -Nru mesa-19.2.8/src/amd/common/ac_rtld.c mesa-20.0.8/src/amd/common/ac_rtld.c --- mesa-19.2.8/src/amd/common/ac_rtld.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_rtld.c 2020-06-12 01:21:16.000000000 +0000 @@ -271,6 +271,7 @@ uint64_t pasted_text_size = 0; uint64_t rx_align = 1; uint64_t rx_size = 0; + uint64_t exec_size = 0; #define report_if(cond) \ do { \ @@ -370,6 +371,8 @@ if (!strcmp(s->name, ".text")) s->is_pasted_text = true; + + exec_size += shdr->sh_size; } if (s->is_pasted_text) { @@ -438,6 +441,7 @@ } binary->rx_size += rx_size; + binary->exec_size = exec_size; if (i.info->chip_class >= GFX10) { /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords diff -Nru mesa-19.2.8/src/amd/common/ac_rtld.h mesa-20.0.8/src/amd/common/ac_rtld.h --- mesa-19.2.8/src/amd/common/ac_rtld.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_rtld.h 2020-06-12 01:21:16.000000000 +0000 @@ -31,6 +31,10 @@ #include "util/u_dynarray.h" #include "compiler/shader_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + struct ac_rtld_part; struct ac_shader_config; struct radeon_info; @@ -57,6 +61,9 @@ /* Required buffer sizes, currently read/executable only. */ uint64_t rx_size; + /* Size of executable code, for reporting purposes. */ + uint64_t exec_size; + uint64_t rx_end_markers; unsigned num_parts; @@ -131,4 +138,8 @@ bool ac_rtld_upload(struct ac_rtld_upload_info *u); +#ifdef __cplusplus +} +#endif + #endif /* AC_RTLD_H */ diff -Nru mesa-19.2.8/src/amd/common/ac_shader_abi.h mesa-20.0.8/src/amd/common/ac_shader_abi.h --- mesa-19.2.8/src/amd/common/ac_shader_abi.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_shader_abi.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -/* - * Copyright 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef AC_SHADER_ABI_H -#define AC_SHADER_ABI_H - -#include - -#include "compiler/shader_enums.h" - -struct nir_variable; - -#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) - -#define AC_MAX_INLINE_PUSH_CONSTS 8 - -enum ac_descriptor_type { - AC_DESC_IMAGE, - AC_DESC_FMASK, - AC_DESC_SAMPLER, - AC_DESC_BUFFER, - AC_DESC_PLANE_0, - AC_DESC_PLANE_1, - AC_DESC_PLANE_2, -}; - -/* Document the shader ABI during compilation. This is what allows radeonsi and - * radv to share a compiler backend. - */ -struct ac_shader_abi { - LLVMValueRef base_vertex; - LLVMValueRef start_instance; - LLVMValueRef draw_id; - LLVMValueRef vertex_id; - LLVMValueRef instance_id; - LLVMValueRef tcs_patch_id; - LLVMValueRef tcs_rel_ids; - LLVMValueRef tes_patch_id; - LLVMValueRef gs_prim_id; - LLVMValueRef gs_invocation_id; - LLVMValueRef frag_pos[4]; - LLVMValueRef front_face; - LLVMValueRef ancillary; - LLVMValueRef sample_coverage; - LLVMValueRef prim_mask; - LLVMValueRef color0; - LLVMValueRef color1; - LLVMValueRef user_data; - /* CS */ - LLVMValueRef local_invocation_ids; - LLVMValueRef num_work_groups; - LLVMValueRef workgroup_ids[3]; - LLVMValueRef tg_size; - - /* Vulkan only */ - LLVMValueRef push_constants; - LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; - unsigned num_inline_push_consts; - unsigned base_inline_push_consts; - LLVMValueRef view_index; - - LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; - - /* For VS and PS: pre-loaded shader inputs. - * - * Currently only used for NIR shaders; indexed by variables' - * driver_location. - */ - LLVMValueRef *inputs; - - /* Varying -> attribute number mapping. Also NIR-only */ - unsigned fs_input_attr_indices[MAX_VARYING]; - - void (*emit_outputs)(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); - - void (*emit_vertex)(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs); - - void (*emit_primitive)(struct ac_shader_abi *abi, - unsigned stream); - - void (*emit_kill)(struct ac_shader_abi *abi, LLVMValueRef visible); - - LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type); - - LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_inputs); - - void (*store_tcs_outputs)(struct ac_shader_abi *abi, - const struct nir_variable *var, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - LLVMValueRef src, - unsigned writemask); - - LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_tess_level)(struct ac_shader_abi *abi, - unsigned varying_id, - bool load_default_state); - - - LLVMValueRef (*load_ubo)(struct ac_shader_abi *abi, LLVMValueRef index); - - /** - * Load the descriptor for the given buffer. - * - * \param buffer the buffer as presented in NIR: this is the descriptor - * in Vulkan, and the buffer index in OpenGL/Gallium - * \param write whether buffer contents will be written - */ - LLVMValueRef (*load_ssbo)(struct ac_shader_abi *abi, - LLVMValueRef buffer, bool write); - - /** - * Load a descriptor associated to a sampler. - * - * \param descriptor_set the descriptor set index (only for Vulkan) - * \param base_index the base index of the sampler variable - * \param constant_index constant part of an array index (or 0, if the - * sampler variable is not an array) - * \param index non-constant part of an array index (may be NULL) - * \param desc_type the type of descriptor to load - * \param image whether the descriptor is loaded for an image operation - */ - LLVMValueRef (*load_sampler_desc)(struct ac_shader_abi *abi, - unsigned descriptor_set, - unsigned base_index, - unsigned constant_index, - LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool image, bool write, - bool bindless); - - /** - * Load a Vulkan-specific resource. - * - * \param index resource index - * \param desc_set descriptor set - * \param binding descriptor set binding - */ - LLVMValueRef (*load_resource)(struct ac_shader_abi *abi, - LLVMValueRef index, - unsigned desc_set, - unsigned binding); - - LLVMValueRef (*lookup_interp_param)(struct ac_shader_abi *abi, - enum glsl_interp_mode interp, - unsigned location); - - LLVMValueRef (*load_sample_position)(struct ac_shader_abi *abi, - LLVMValueRef sample_id); - - LLVMValueRef (*load_local_group_size)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_sample_mask_in)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_base_vertex)(struct ac_shader_abi *abi); - - LLVMValueRef (*emit_fbfetch)(struct ac_shader_abi *abi); - - /* Whether to clamp the shadow reference value to [0,1]on GFX8. Radeonsi currently - * uses it due to promoting D16 to D32, but radv needs it off. */ - bool clamp_shadow_reference; - bool interp_at_sample_force_center; - - /* Whether to workaround GFX9 ignoring the stride for the buffer size if IDXEN=0 - * and LLVM optimizes an indexed load with constant index to IDXEN=0. */ - bool gfx9_stride_size_workaround; - bool gfx9_stride_size_workaround_for_atomic; - - /* Whether bounds checks are required */ - bool robust_buffer_access; -}; - -#endif /* AC_SHADER_ABI_H */ diff -Nru mesa-19.2.8/src/amd/common/ac_shader_args.c mesa-20.0.8/src/amd/common/ac_shader_args.c --- mesa-19.2.8/src/amd/common/ac_shader_args.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_shader_args.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,55 @@ +/* + * Copyright 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "ac_shader_args.h" +#include "nir/nir_builder.h" + +void +ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, + unsigned size, enum ac_arg_type type, struct ac_arg *arg) +{ + assert(info->arg_count < AC_MAX_ARGS); + + unsigned offset; + if (regfile == AC_ARG_SGPR) { + offset = info->num_sgprs_used; + info->num_sgprs_used += size; + } else { + assert(regfile == AC_ARG_VGPR); + offset = info->num_vgprs_used; + info->num_vgprs_used += size; + } + + info->args[info->arg_count].file = regfile; + info->args[info->arg_count].offset = offset; + info->args[info->arg_count].size = size; + info->args[info->arg_count].type = type; + + if (arg) { + arg->arg_index = info->arg_count; + arg->used = true; + } + + info->arg_count++; +} + diff -Nru mesa-19.2.8/src/amd/common/ac_shader_args.h mesa-20.0.8/src/amd/common/ac_shader_args.h --- mesa-19.2.8/src/amd/common/ac_shader_args.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_shader_args.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,114 @@ +/* + * Copyright 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef AC_SHADER_ARGS_H +#define AC_SHADER_ARGS_H + +#include +#include + +#define AC_MAX_INLINE_PUSH_CONSTS 8 + +enum ac_arg_regfile { + AC_ARG_SGPR, + AC_ARG_VGPR, +}; + +enum ac_arg_type { + AC_ARG_FLOAT, + AC_ARG_INT, + AC_ARG_CONST_PTR, /* Pointer to i8 array */ + AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */ + AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */ + AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */ + AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */ +}; + +struct ac_arg { + uint8_t arg_index; + bool used; +}; + + +#define AC_MAX_ARGS 128 + +struct ac_shader_args { + /* Info on how to declare arguments */ + struct { + enum ac_arg_type type; + enum ac_arg_regfile file; + uint8_t offset; + uint8_t size; + bool skip; + } args[AC_MAX_ARGS]; + + uint8_t arg_count; + uint8_t sgpr_count; + uint8_t num_sgprs_used; + uint8_t num_vgprs_used; + + struct ac_arg base_vertex; + struct ac_arg start_instance; + struct ac_arg draw_id; + struct ac_arg vertex_id; + struct ac_arg instance_id; + struct ac_arg tcs_patch_id; + struct ac_arg tcs_rel_ids; + struct ac_arg tes_patch_id; + struct ac_arg gs_prim_id; + struct ac_arg gs_invocation_id; + + /* PS */ + struct ac_arg frag_pos[4]; + struct ac_arg front_face; + struct ac_arg ancillary; + struct ac_arg sample_coverage; + struct ac_arg prim_mask; + struct ac_arg persp_sample; + struct ac_arg persp_center; + struct ac_arg persp_centroid; + struct ac_arg pull_model; + struct ac_arg linear_sample; + struct ac_arg linear_center; + struct ac_arg linear_centroid; + + /* CS */ + struct ac_arg local_invocation_ids; + struct ac_arg num_work_groups; + struct ac_arg workgroup_ids[3]; + struct ac_arg tg_size; + + /* Vulkan only */ + struct ac_arg push_constants; + struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; + unsigned num_inline_push_consts; + unsigned base_inline_push_consts; + struct ac_arg view_index; +}; + +void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, + unsigned registers, enum ac_arg_type type, + struct ac_arg *arg); + +#endif + diff -Nru mesa-19.2.8/src/amd/common/ac_shader_util.c mesa-20.0.8/src/amd/common/ac_shader_util.c --- mesa-19.2.8/src/amd/common/ac_shader_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_shader_util.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,7 +25,6 @@ #include #include -#include "ac_nir_to_llvm.h" #include "ac_shader_util.h" #include "sid.h" @@ -109,71 +108,183 @@ S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0); } -void -ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, - LLVMValueRef stencil, LLVMValueRef samplemask, - struct ac_export_args *args) -{ - unsigned mask = 0; - unsigned format = ac_get_spi_shader_z_format(depth != NULL, - stencil != NULL, - samplemask != NULL); - - assert(depth || stencil || samplemask); - - memset(args, 0, sizeof(*args)); - - args->valid_mask = 1; /* whether the EXEC mask is valid */ - args->done = 1; /* DONE bit */ - - /* Specify the target we are exporting */ - args->target = V_008DFC_SQ_EXP_MRTZ; - - args->compr = 0; /* COMP flag */ - args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ - args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ - args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ - args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ - - if (format == V_028710_SPI_SHADER_UINT16_ABGR) { - assert(!depth); - args->compr = 1; /* COMPR flag */ - - if (stencil) { - /* Stencil should be in X[23:16]. */ - stencil = ac_to_integer(ctx, stencil); - stencil = LLVMBuildShl(ctx->builder, stencil, - LLVMConstInt(ctx->i32, 16, 0), ""); - args->out[0] = ac_to_float(ctx, stencil); - mask |= 0x3; +/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format +/// value for LLVM8+ tbuffer intrinsics. +unsigned +ac_get_tbuffer_format(enum chip_class chip_class, + unsigned dfmt, unsigned nfmt) +{ + // Some games try to access vertex buffers without a valid format. + // This is a game bug, but we should still handle it gracefully. + if (dfmt == V_008F0C_IMG_FORMAT_INVALID) + return V_008F0C_IMG_FORMAT_INVALID; + + if (chip_class >= GFX10) { + unsigned format; + switch (dfmt) { + default: unreachable("bad dfmt"); + case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break; + case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break; } - if (samplemask) { - /* SampleMask should be in Y[15:0]. */ - args->out[1] = samplemask; - mask |= 0xc; + + // Use the regularity properties of the combined format enum. + // + // Note: float is incompatible with 8-bit data formats, + // [us]{norm,scaled} are incomparible with 32-bit data formats. + // [us]scaled are not writable. + switch (nfmt) { + case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break; + case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break; + case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break; + case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break; + default: unreachable("bad nfmt"); + case V_008F0C_BUF_NUM_FORMAT_UINT: break; + case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break; + case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break; } + + return format; } else { - if (depth) { - args->out[0] = depth; - mask |= 0x1; - } - if (stencil) { - args->out[1] = stencil; - mask |= 0x2; - } - if (samplemask) { - args->out[2] = samplemask; - mask |= 0x4; - } + return dfmt | (nfmt << 4); + } +} + +static const struct ac_data_format_info data_format_table[] = { + [V_008F0C_BUF_DATA_FORMAT_INVALID] = { 0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID }, + [V_008F0C_BUF_DATA_FORMAT_8] = { 1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8 }, + [V_008F0C_BUF_DATA_FORMAT_16] = { 2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16 }, + [V_008F0C_BUF_DATA_FORMAT_8_8] = { 2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8 }, + [V_008F0C_BUF_DATA_FORMAT_32] = { 4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32 }, + [V_008F0C_BUF_DATA_FORMAT_16_16] = { 4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16 }, + [V_008F0C_BUF_DATA_FORMAT_10_11_11] = { 4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11 }, + [V_008F0C_BUF_DATA_FORMAT_11_11_10] = { 4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10 }, + [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = { 4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2 }, + [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = { 4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10 }, + [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = { 4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8 }, + [V_008F0C_BUF_DATA_FORMAT_32_32] = { 8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32 }, + [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = { 8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16 }, + [V_008F0C_BUF_DATA_FORMAT_32_32_32] = { 12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32 }, + [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = { 16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32 }, +}; + +const struct ac_data_format_info * +ac_get_data_format_info(unsigned dfmt) +{ + assert(dfmt < ARRAY_SIZE(data_format_table)); + return &data_format_table[dfmt]; +} + +enum ac_image_dim +ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, + bool is_array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + if (chip_class == GFX9) + return is_array ? ac_image_2darray : ac_image_2d; + return is_array ? ac_image_1darray : ac_image_1d; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + return is_array ? ac_image_2darray : ac_image_2d; + case GLSL_SAMPLER_DIM_3D: + return ac_image_3d; + case GLSL_SAMPLER_DIM_CUBE: + return ac_image_cube; + case GLSL_SAMPLER_DIM_MS: + return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; + case GLSL_SAMPLER_DIM_SUBPASS: + return ac_image_2darray; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return ac_image_2darraymsaa; + default: + unreachable("bad sampler dim"); + } +} + +enum ac_image_dim +ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, + bool is_array) +{ + enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array); + + /* Match the resource type set in the descriptor. */ + if (dim == ac_image_cube || + (chip_class <= GFX8 && dim == ac_image_3d)) + dim = ac_image_2darray; + else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) { + /* When a single layer of a 3D texture is bound, the shader + * will refer to a 2D target, but the descriptor has a 3D type. + * Since the HW ignores BASE_ARRAY in this case, we need to + * send 3 coordinates. This doesn't hurt when the underlying + * texture is non-3D. + */ + dim = ac_image_3d; } - /* GFX6 (except OLAND and HAINAN) has a bug that it only looks - * at the X writemask component. */ - if (ctx->chip_class == GFX6 && - ctx->family != CHIP_OLAND && - ctx->family != CHIP_HAINAN) - mask |= 0x1; + return dim; +} + +unsigned +ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, + signed char *face_vgpr_index_ptr, + signed char *ancillary_vgpr_index_ptr) +{ + unsigned num_input_vgprs = 0; + signed char face_vgpr_index = -1; + signed char ancillary_vgpr_index = -1; + + if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) { + face_vgpr_index = num_input_vgprs; + num_input_vgprs += 1; + } + if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) { + ancillary_vgpr_index = num_input_vgprs; + num_input_vgprs += 1; + } + if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + + if (face_vgpr_index_ptr) + *face_vgpr_index_ptr = face_vgpr_index; + if (ancillary_vgpr_index_ptr) + *ancillary_vgpr_index_ptr = ancillary_vgpr_index; - /* Specify which components to enable */ - args->enabled_channels = mask; + return num_input_vgprs; } diff -Nru mesa-19.2.8/src/amd/common/ac_shader_util.h mesa-20.0.8/src/amd/common/ac_shader_util.h --- mesa-19.2.8/src/amd/common/ac_shader_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_shader_util.h 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,30 @@ #include #include "amd_family.h" -#include "ac_llvm_build.h" +#include "ac_binary.h" +#include "compiler/nir/nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum ac_image_dim { + ac_image_1d, + ac_image_2d, + ac_image_3d, + ac_image_cube, // includes cube arrays + ac_image_1darray, + ac_image_2darray, + ac_image_2dmsaa, + ac_image_2darraymsaa, +}; + +struct ac_data_format_info { + uint8_t element_size; + uint8_t num_channels; + uint8_t chan_byte_size; + uint8_t chan_format; +}; unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, @@ -40,9 +63,28 @@ uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class); -void -ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, - LLVMValueRef stencil, LLVMValueRef samplemask, - struct ac_export_args *args); +unsigned +ac_get_tbuffer_format(enum chip_class chip_class, + unsigned dfmt, unsigned nfmt); + +const struct ac_data_format_info * +ac_get_data_format_info(unsigned dfmt); + +enum ac_image_dim +ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, + bool is_array); + +enum ac_image_dim +ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, + bool is_array); + +unsigned +ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, + signed char *face_vgpr_index, + signed char *ancillary_vgpr_index); + +#ifdef __cplusplus +} +#endif #endif diff -Nru mesa-19.2.8/src/amd/common/ac_surface.c mesa-20.0.8/src/amd/common/ac_surface.c --- mesa-19.2.8/src/amd/common/ac_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_surface.c 2020-06-12 01:21:16.000000000 +0000 @@ -85,7 +85,6 @@ if (addrCreateInput.chipFamily >= FAMILY_AI) { addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND; - regValue.blockVarSizeLog2 = 0; } else { regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3; regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2; @@ -207,6 +206,17 @@ AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment); } + /* addrlib assumes the bytes/pixel is a divisor of 64, which is not + * true for r32g32b32 formats. */ + if (AddrSurfInfoIn->bpp == 96) { + assert(config->info.levels == 1); + assert(AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED); + + /* The least common multiple of 64 bytes and 12 bytes/pixel is + * 192 bytes, or 16 pixels. */ + AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, 16); + } + if (config->is_3d) AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level); else if (config->is_cube) @@ -338,11 +348,12 @@ } } - /* TC-compatible HTILE. */ + /* HTILE. */ if (!is_stencil && AddrSurfInfoIn->flags.depth && surf_level->mode == RADEON_SURF_MODE_2D && - level == 0) { + level == 0 && + !(surf->flags & RADEON_SURF_NO_HTILE)) { AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible; AddrHtileIn->pitch = AddrSurfInfoOut->pitch; AddrHtileIn->height = AddrSurfInfoOut->height; @@ -488,7 +499,8 @@ unsigned num_pipes = info->num_tile_pipes; unsigned cl_width, cl_height; - if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) + if (surf->flags & RADEON_SURF_Z_OR_SBUFFER || + (config->info.samples >= 2 && !surf->fmask_size)) return; assert(info->chip_class <= GFX8); @@ -843,7 +855,8 @@ } /* Compute FMASK. */ - if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color) { + if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color && + info->has_graphics && !(surf->flags & RADEON_SURF_NO_FMASK)) { ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0}; ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; ADDR_TILEINFO fmask_tile_info = {}; @@ -958,6 +971,7 @@ /* This is only called when expecting a tiled layout. */ static int gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib, + struct radeon_surf *surf, ADDR2_COMPUTE_SURFACE_INFO_INPUT *in, bool is_fmask, AddrSwizzleMode *swizzle_mode) { @@ -989,6 +1003,19 @@ sin.flags.fmask = 1; } + if (surf->flags & RADEON_SURF_FORCE_MICRO_TILE_MODE) { + sin.forbiddenBlock.linear = 1; + + if (surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) + sin.preferredSwSet.sw_D = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_THIN) + sin.preferredSwSet.sw_S = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH) + sin.preferredSwSet.sw_Z = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_ROTATED) + sin.preferredSwSet.sw_R = 1; + } + ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout); if (ret != ADDR_OK) return ret; @@ -1049,13 +1076,18 @@ surf->surf_alignment = out.baseAlign; if (in->swizzleMode == ADDR_SW_LINEAR) { - for (unsigned i = 0; i < in->numMipLevels; i++) + for (unsigned i = 0; i < in->numMipLevels; i++) { surf->u.gfx9.offset[i] = mip_info[i].offset; + surf->u.gfx9.pitch[i] = mip_info[i].pitch; + } } if (in->flags.depth) { assert(in->swizzleMode != ADDR_SW_LINEAR); + if (surf->flags & RADEON_SURF_NO_HTILE) + return 0; + /* HTILE */ ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0}; ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0}; @@ -1082,7 +1114,10 @@ surf->htile_size = hout.htileBytes; surf->htile_slice_size = hout.sliceSize; surf->htile_alignment = hout.baseAlign; - } else { + return 0; + } + + { /* Compute tile swizzle for the color surface. * All *_X and *_T modes can use the swizzle. */ @@ -1285,14 +1320,15 @@ } /* FMASK */ - if (in->numSamples > 1) { + if (in->numSamples > 1 && info->has_graphics && + !(surf->flags & RADEON_SURF_NO_FMASK)) { ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0}; ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT); fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT); - ret = gfx9_get_preferred_swizzle_mode(addrlib, in, + ret = gfx9_get_preferred_swizzle_mode(addrlib, surf, in, true, &fin.swizzleMode); if (ret != ADDR_OK) return ret; @@ -1343,7 +1379,9 @@ /* CMASK -- on GFX10 only for FMASK */ if (in->swizzleMode != ADDR_SW_LINEAR && - (info->chip_class <= GFX9 || in->numSamples > 1)) { + in->resourceType == ADDR_RSRC_TEX_2D && + ((info->chip_class <= GFX9 && in->numSamples == 1) || + (surf->fmask_size && in->numSamples >= 2))) { ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0}; ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0}; @@ -1512,7 +1550,7 @@ break; } - r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, + r = gfx9_get_preferred_swizzle_mode(addrlib, surf, &AddrSurfInfoIn, false, &AddrSurfInfoIn.swizzleMode); if (r) return r; @@ -1551,7 +1589,7 @@ AddrSurfInfoIn.format = ADDR_FMT_8; if (!AddrSurfInfoIn.flags.depth) { - r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, + r = gfx9_get_preferred_swizzle_mode(addrlib, surf, &AddrSurfInfoIn, false, &AddrSurfInfoIn.swizzleMode); if (r) goto error; @@ -1588,11 +1626,9 @@ case ADDR_SW_256B_S: case ADDR_SW_4KB_S: case ADDR_SW_64KB_S: - case ADDR_SW_VAR_S: case ADDR_SW_64KB_S_T: case ADDR_SW_4KB_S_X: case ADDR_SW_64KB_S_X: - case ADDR_SW_VAR_S_X: surf->micro_tile_mode = RADEON_MICRO_MODE_THIN; break; @@ -1601,11 +1637,9 @@ case ADDR_SW_256B_D: case ADDR_SW_4KB_D: case ADDR_SW_64KB_D: - case ADDR_SW_VAR_D: case ADDR_SW_64KB_D_T: case ADDR_SW_4KB_D_X: case ADDR_SW_64KB_D_X: - case ADDR_SW_VAR_D_X: surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY; break; @@ -1613,7 +1647,6 @@ case ADDR_SW_256B_R: case ADDR_SW_4KB_R: case ADDR_SW_64KB_R: - case ADDR_SW_VAR_R: case ADDR_SW_64KB_R_T: case ADDR_SW_4KB_R_X: case ADDR_SW_64KB_R_X: @@ -1630,7 +1663,6 @@ /* Z = depth. */ case ADDR_SW_4KB_Z: case ADDR_SW_64KB_Z: - case ADDR_SW_VAR_Z: case ADDR_SW_64KB_Z_T: case ADDR_SW_4KB_Z_X: case ADDR_SW_64KB_Z_X: @@ -1662,7 +1694,61 @@ return r; if (info->chip_class >= GFX9) - return gfx9_compute_surface(addrlib, info, config, mode, surf); + r = gfx9_compute_surface(addrlib, info, config, mode, surf); else - return gfx6_compute_surface(addrlib, info, config, mode, surf); + r = gfx6_compute_surface(addrlib, info, config, mode, surf); + + if (r) + return r; + + /* Determine the memory layout of multiple allocations in one buffer. */ + surf->total_size = surf->surf_size; + + if (surf->htile_size) { + surf->htile_offset = align64(surf->total_size, surf->htile_alignment); + surf->total_size = surf->htile_offset + surf->htile_size; + } + + if (surf->fmask_size) { + assert(config->info.samples >= 2); + surf->fmask_offset = align64(surf->total_size, surf->fmask_alignment); + surf->total_size = surf->fmask_offset + surf->fmask_size; + } + + /* Single-sample CMASK is in a separate buffer. */ + if (surf->cmask_size && config->info.samples >= 2) { + surf->cmask_offset = align64(surf->total_size, surf->cmask_alignment); + surf->total_size = surf->cmask_offset + surf->cmask_size; + } + + if (surf->dcc_size && + (info->use_display_dcc_unaligned || + info->use_display_dcc_with_retile_blit || + !(surf->flags & RADEON_SURF_SCANOUT))) { + surf->dcc_offset = align64(surf->total_size, surf->dcc_alignment); + surf->total_size = surf->dcc_offset + surf->dcc_size; + + if (info->chip_class >= GFX9 && + surf->u.gfx9.dcc_retile_num_elements) { + /* Add space for the displayable DCC buffer. */ + surf->display_dcc_offset = + align64(surf->total_size, surf->u.gfx9.display_dcc_alignment); + surf->total_size = surf->display_dcc_offset + + surf->u.gfx9.display_dcc_size; + + /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */ + surf->dcc_retile_map_offset = + align64(surf->total_size, info->tcc_cache_line_size); + + if (surf->u.gfx9.dcc_retile_use_uint16) { + surf->total_size = surf->dcc_retile_map_offset + + surf->u.gfx9.dcc_retile_num_elements * 2; + } else { + surf->total_size = surf->dcc_retile_map_offset + + surf->u.gfx9.dcc_retile_num_elements * 4; + } + } + } + + return 0; } diff -Nru mesa-19.2.8/src/amd/common/ac_surface.h mesa-20.0.8/src/amd/common/ac_surface.h --- mesa-19.2.8/src/amd/common/ac_surface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/ac_surface.h 2020-06-12 01:21:16.000000000 +0000 @@ -71,6 +71,9 @@ #define RADEON_SURF_SHAREABLE (1 << 26) #define RADEON_SURF_NO_RENDER_TARGET (1 << 27) #define RADEON_SURF_FORCE_SWIZZLE_MODE (1 << 28) +#define RADEON_SURF_NO_FMASK (1 << 29) +#define RADEON_SURF_NO_HTILE (1 << 30) +#define RADEON_SURF_FORCE_MICRO_TILE_MODE (1u << 31) struct legacy_surf_level { uint64_t offset; @@ -152,6 +155,8 @@ uint64_t surf_slice_size; /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */ uint32_t offset[RADEON_SURF_MAX_LEVELS]; + /* Mipmap level pitch in elements. Only valid for LINEAR. */ + uint16_t pitch[RADEON_SURF_MAX_LEVELS]; uint64_t stencil_offset; /* separate stencil */ @@ -225,6 +230,15 @@ uint32_t cmask_slice_size; uint32_t cmask_alignment; + /* All buffers combined. */ + uint64_t htile_offset; + uint64_t fmask_offset; + uint64_t cmask_offset; + uint64_t dcc_offset; + uint64_t display_dcc_offset; + uint64_t dcc_retile_map_offset; + uint64_t total_size; + union { /* Return values for GFX8 and older. * diff -Nru mesa-19.2.8/src/amd/common/amd_family.h mesa-20.0.8/src/amd/common/amd_family.h --- mesa-19.2.8/src/amd/common/amd_family.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/amd_family.h 2020-06-12 01:21:16.000000000 +0000 @@ -121,4 +121,17 @@ GFX10, }; +enum ring_type { + RING_GFX = 0, + RING_COMPUTE, + RING_DMA, + RING_UVD, + RING_VCE, + RING_UVD_ENC, + RING_VCN_DEC, + RING_VCN_ENC, + RING_VCN_JPEG, + NUM_RING_TYPES, +}; + #endif diff -Nru mesa-19.2.8/src/amd/common/meson.build mesa-20.0.8/src/amd/common/meson.build --- mesa-19.2.8/src/amd/common/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/common/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -40,18 +40,10 @@ 'ac_binary.c', 'ac_binary.h', 'ac_exp_param.h', - 'ac_llvm_build.c', - 'ac_llvm_build.h', - 'ac_llvm_cull.c', - 'ac_llvm_cull.h', - 'ac_llvm_helper.cpp', - 'ac_llvm_util.c', - 'ac_llvm_util.h', - 'ac_shader_abi.h', + 'ac_shader_args.c', + 'ac_shader_args.h', 'ac_shader_util.c', 'ac_shader_util.h', - 'ac_nir_to_llvm.c', - 'ac_nir_to_llvm.h', 'ac_gpu_info.c', 'ac_gpu_info.h', 'ac_rtld.c', @@ -69,7 +61,7 @@ inc_common, inc_compiler, inc_mesa, inc_mapi, inc_amd, ], dependencies : [ - dep_llvm, dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, + dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, idep_nir_headers, ], c_args : [c_vis_args], diff -Nru mesa-19.2.8/src/amd/compiler/aco_assembler.cpp mesa-20.0.8/src/amd/compiler/aco_assembler.cpp --- mesa-19.2.8/src/amd/compiler/aco_assembler.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_assembler.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,735 @@ +#include +#include + +#include "aco_ir.h" +#include "common/sid.h" +#include "ac_shader_util.h" +#include "util/u_math.h" + +namespace aco { + +struct asm_context { + Program *program; + enum chip_class chip_class; + std::vector> branches; + std::vector constaddrs; + const int16_t* opcode; + // TODO: keep track of branch instructions referring blocks + // and, when emitting the block, correct the offset in instr + asm_context(Program* program) : program(program), chip_class(program->chip_class) { + if (chip_class <= GFX7) + opcode = &instr_info.opcode_gfx7[0]; + else if (chip_class <= GFX9) + opcode = &instr_info.opcode_gfx9[0]; + else if (chip_class == GFX10) + opcode = &instr_info.opcode_gfx10[0]; + } + + int subvector_begin_pos = -1; +}; + +void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) +{ + /* lower remaining pseudo-instructions */ + if (instr->opcode == aco_opcode::p_constaddr) { + unsigned dest = instr->definitions[0].physReg(); + unsigned offset = instr->operands[0].constantValue(); + + /* s_getpc_b64 dest[0:1] */ + uint32_t encoding = (0b101111101 << 23); + uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64]; + if (opcode >= 55 && ctx.chip_class <= GFX9) { + assert(ctx.chip_class == GFX9 && opcode < 60); + opcode = opcode - 4; + } + encoding |= dest << 16; + encoding |= opcode << 8; + out.push_back(encoding); + + /* s_add_u32 dest[0], dest[0], ... */ + encoding = (0b10 << 30); + encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23; + encoding |= dest << 16; + encoding |= dest; + encoding |= 255 << 8; + out.push_back(encoding); + ctx.constaddrs.push_back(out.size()); + out.push_back(offset); + + /* s_addc_u32 dest[1], dest[1], 0 */ + encoding = (0b10 << 30); + encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23; + encoding |= (dest + 1) << 16; + encoding |= dest + 1; + encoding |= 128 << 8; + out.push_back(encoding); + return; + } + + uint32_t opcode = ctx.opcode[(int)instr->opcode]; + if (opcode == (uint32_t)-1) { + fprintf(stderr, "Unsupported opcode: "); + aco_print_instr(instr, stderr); + abort(); + } + + switch (instr->format) { + case Format::SOP2: { + uint32_t encoding = (0b10 << 30); + encoding |= opcode << 23; + encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; + encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0; + encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPK: { + SOPK_instruction *sopk = static_cast(instr); + + if (instr->opcode == aco_opcode::s_subvector_loop_begin) { + assert(ctx.chip_class >= GFX10); + assert(ctx.subvector_begin_pos == -1); + ctx.subvector_begin_pos = out.size(); + } else if (instr->opcode == aco_opcode::s_subvector_loop_end) { + assert(ctx.chip_class >= GFX10); + assert(ctx.subvector_begin_pos != -1); + /* Adjust s_subvector_loop_begin instruction to the address after the end */ + out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos); + /* Adjust s_subvector_loop_end instruction to the address after the beginning */ + sopk->imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size()); + ctx.subvector_begin_pos = -1; + } + + uint32_t encoding = (0b1011 << 28); + encoding |= opcode << 23; + encoding |= + !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? + instr->definitions[0].physReg() << 16 : + !instr->operands.empty() && instr->operands[0].physReg() <= 127 ? + instr->operands[0].physReg() << 16 : 0; + encoding |= sopk->imm; + out.push_back(encoding); + break; + } + case Format::SOP1: { + uint32_t encoding = (0b101111101 << 23); + if (opcode >= 55 && ctx.chip_class <= GFX9) { + assert(ctx.chip_class == GFX9 && opcode < 60); + opcode = opcode - 4; + } + encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; + encoding |= opcode << 8; + encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPC: { + uint32_t encoding = (0b101111110 << 23); + encoding |= opcode << 16; + encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0; + encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPP: { + SOPP_instruction* sopp = static_cast(instr); + uint32_t encoding = (0b101111111 << 23); + encoding |= opcode << 16; + encoding |= (uint16_t) sopp->imm; + if (sopp->block != -1) + ctx.branches.emplace_back(out.size(), sopp); + out.push_back(encoding); + break; + } + case Format::SMEM: { + SMEM_instruction* smem = static_cast(instr); + bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); + bool is_load = !instr->definitions.empty(); + uint32_t encoding = 0; + + if (ctx.chip_class <= GFX7) { + encoding = (0b11000 << 27); + encoding |= opcode << 22; + encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0; + encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0; + if (instr->operands.size() >= 2) { + if (!instr->operands[1].isConstant() || instr->operands[1].constantValue() >= 1024) { + encoding |= instr->operands[1].physReg().reg; + } else { + encoding |= instr->operands[1].constantValue() >> 2; + encoding |= 1 << 8; + } + } + out.push_back(encoding); + /* SMRD instructions can take a literal on GFX6 & GFX7 */ + if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024) + out.push_back(instr->operands[1].constantValue() >> 2); + return; + } + + if (ctx.chip_class <= GFX9) { + encoding = (0b110000 << 26); + assert(!smem->dlc); /* Device-level coherent is not supported on GFX9 and lower */ + encoding |= smem->nv ? 1 << 15 : 0; + } else { + encoding = (0b111101 << 26); + assert(!smem->nv); /* Non-volatile is not supported on GFX10 */ + encoding |= smem->dlc ? 1 << 14 : 0; + } + + encoding |= opcode << 18; + encoding |= smem->glc ? 1 << 16 : 0; + + if (ctx.chip_class <= GFX9) { + if (instr->operands.size() >= 2) + encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */ + } + if (ctx.chip_class == GFX9) { + encoding |= soe ? 1 << 14 : 0; + } + + if (is_load || instr->operands.size() >= 3) { /* SDATA */ + encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6; + } + if (instr->operands.size() >= 1) { /* SBASE */ + encoding |= instr->operands[0].physReg() >> 1; + } + + out.push_back(encoding); + encoding = 0; + + int32_t offset = 0; + uint32_t soffset = ctx.chip_class >= GFX10 + ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ + : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */ + if (instr->operands.size() >= 2) { + const Operand &op_off1 = instr->operands[1]; + if (ctx.chip_class <= GFX9) { + offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg(); + } else { + /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */ + if (op_off1.isConstant()) { + offset = op_off1.constantValue(); + } else { + soffset = op_off1.physReg(); + assert(!soe); /* There is no place to put the other SGPR offset, if any */ + } + } + + if (soe) { + const Operand &op_off2 = instr->operands.back(); + assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */ + assert(!op_off2.isConstant()); + soffset = op_off2.physReg(); + } + } + encoding |= offset; + encoding |= soffset << 25; + + out.push_back(encoding); + return; + } + case Format::VOP2: { + uint32_t encoding = 0; + encoding |= opcode << 25; + encoding |= (0xFF & instr->definitions[0].physReg()) << 17; + encoding |= (0xFF & instr->operands[1].physReg()) << 9; + encoding |= instr->operands[0].physReg(); + out.push_back(encoding); + break; + } + case Format::VOP1: { + uint32_t encoding = (0b0111111 << 25); + if (!instr->definitions.empty()) + encoding |= (0xFF & instr->definitions[0].physReg()) << 17; + encoding |= opcode << 9; + if (!instr->operands.empty()) + encoding |= instr->operands[0].physReg(); + out.push_back(encoding); + break; + } + case Format::VOPC: { + uint32_t encoding = (0b0111110 << 25); + encoding |= opcode << 17; + encoding |= (0xFF & instr->operands[1].physReg()) << 9; + encoding |= instr->operands[0].physReg(); + out.push_back(encoding); + break; + } + case Format::VINTRP: { + Interp_instruction* interp = static_cast(instr); + uint32_t encoding = 0; + + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */ + } else { + encoding = (0b110010 << 26); + } + + assert(encoding); + encoding |= (0xFF & instr->definitions[0].physReg()) << 18; + encoding |= opcode << 16; + encoding |= interp->attribute << 10; + encoding |= interp->component << 8; + if (instr->opcode == aco_opcode::v_interp_mov_f32) + encoding |= (0x3 & instr->operands[0].constantValue()); + else + encoding |= (0xFF & instr->operands[0].physReg()); + out.push_back(encoding); + break; + } + case Format::DS: { + DS_instruction* ds = static_cast(instr); + uint32_t encoding = (0b110110 << 26); + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding |= opcode << 17; + encoding |= (ds->gds ? 1 : 0) << 16; + } else { + encoding |= opcode << 18; + encoding |= (ds->gds ? 1 : 0) << 17; + } + encoding |= ((0xFF & ds->offset1) << 8); + encoding |= (0xFFFF & ds->offset0); + out.push_back(encoding); + encoding = 0; + unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; + encoding |= (0xFF & reg) << 24; + reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0; + encoding |= (0xFF & reg) << 16; + reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0; + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[0].physReg()); + out.push_back(encoding); + break; + } + case Format::MUBUF: { + MUBUF_instruction* mubuf = static_cast(instr); + uint32_t encoding = (0b111000 << 26); + encoding |= opcode << 18; + encoding |= (mubuf->lds ? 1 : 0) << 16; + encoding |= (mubuf->glc ? 1 : 0) << 14; + encoding |= (mubuf->idxen ? 1 : 0) << 13; + assert(!mubuf->addr64 || ctx.chip_class <= GFX7); + if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7) + encoding |= (mubuf->addr64 ? 1 : 0) << 15; + encoding |= (mubuf->offen ? 1 : 0) << 12; + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + assert(!mubuf->dlc); /* Device-level coherent is not supported on GFX9 and lower */ + encoding |= (mubuf->slc ? 1 : 0) << 17; + } else if (ctx.chip_class >= GFX10) { + encoding |= (mubuf->dlc ? 1 : 0) << 15; + } + encoding |= 0x0FFF & mubuf->offset; + out.push_back(encoding); + encoding = 0; + if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) { + encoding |= (mubuf->slc ? 1 : 0) << 22; + } + encoding |= instr->operands[2].physReg() << 24; + encoding |= (mubuf->tfe ? 1 : 0) << 23; + encoding |= (instr->operands[0].physReg() >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[1].physReg()); + out.push_back(encoding); + break; + } + case Format::MTBUF: { + MTBUF_instruction* mtbuf = static_cast(instr); + + uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf->dfmt, mtbuf->nfmt); + uint32_t encoding = (0b111010 << 26); + assert(img_format <= 0x7F); + assert(!mtbuf->dlc || ctx.chip_class >= GFX10); + encoding |= (mtbuf->dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */ + encoding |= (mtbuf->glc ? 1 : 0) << 14; + encoding |= (mtbuf->idxen ? 1 : 0) << 13; + encoding |= (mtbuf->offen ? 1 : 0) << 12; + encoding |= 0x0FFF & mtbuf->offset; + encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */ + + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding |= opcode << 15; + } else { + encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */ + } + + out.push_back(encoding); + encoding = 0; + + encoding |= instr->operands[2].physReg() << 24; + encoding |= (mtbuf->tfe ? 1 : 0) << 23; + encoding |= (mtbuf->slc ? 1 : 0) << 22; + encoding |= (instr->operands[0].physReg() >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[1].physReg()); + + if (ctx.chip_class >= GFX10) { + encoding |= (((opcode & 0x08) >> 4) << 21); /* MSB of 4-bit OPCODE */ + } + + out.push_back(encoding); + break; + } + case Format::MIMG: { + MIMG_instruction* mimg = static_cast(instr); + uint32_t encoding = (0b111100 << 26); + encoding |= mimg->slc ? 1 << 25 : 0; + encoding |= opcode << 18; + encoding |= mimg->lwe ? 1 << 17 : 0; + encoding |= mimg->tfe ? 1 << 16 : 0; + encoding |= mimg->glc ? 1 << 13 : 0; + encoding |= mimg->unrm ? 1 << 12 : 0; + if (ctx.chip_class <= GFX9) { + assert(!mimg->dlc); /* Device-level coherent is not supported on GFX9 and lower */ + assert(!mimg->r128); + encoding |= mimg->a16 ? 1 << 15 : 0; + encoding |= mimg->da ? 1 << 14 : 0; + } else { + encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ + encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */ + encoding |= mimg->dlc ? 1 << 7 : 0; + } + encoding |= (0xF & mimg->dmask) << 8; + out.push_back(encoding); + encoding = (0xFF & instr->operands[2].physReg()); /* VADDR */ + if (!instr->definitions.empty()) { + encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */ + } else if (instr->operands[1].regClass().type() == RegType::vgpr) { + encoding |= (0xFF & instr->operands[1].physReg()) << 8; /* VDATA */ + } + encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */ + if (instr->operands[1].regClass().type() == RegType::sgpr) + encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */ + + assert(!mimg->d16 || ctx.chip_class >= GFX9); + encoding |= mimg->d16 ? 1 << 15 : 0; + if (ctx.chip_class >= GFX10) { + encoding |= mimg->a16 ? 1 << 14 : 0; /* GFX10: A16 still exists, but is in a different place */ + } + + out.push_back(encoding); + break; + } + case Format::FLAT: + case Format::SCRATCH: + case Format::GLOBAL: { + FLAT_instruction *flat = static_cast(instr); + uint32_t encoding = (0b110111 << 26); + encoding |= opcode << 18; + if (ctx.chip_class <= GFX9) { + assert(flat->offset <= 0x1fff); + encoding |= flat->offset & 0x1fff; + } else if (instr->format == Format::FLAT) { + /* GFX10 has a 12-bit immediate OFFSET field, + * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug + */ + assert(flat->offset == 0); + } else { + assert(flat->offset <= 0xfff); + encoding |= flat->offset & 0xfff; + } + if (instr->format == Format::SCRATCH) + encoding |= 1 << 14; + else if (instr->format == Format::GLOBAL) + encoding |= 2 << 14; + encoding |= flat->lds ? 1 << 13 : 0; + encoding |= flat->glc ? 1 << 16 : 0; + encoding |= flat->slc ? 1 << 17 : 0; + if (ctx.chip_class >= GFX10) { + assert(!flat->nv); + encoding |= flat->dlc ? 1 << 12 : 0; + } else { + assert(!flat->dlc); + } + out.push_back(encoding); + encoding = (0xFF & instr->operands[0].physReg()); + if (!instr->definitions.empty()) + encoding |= (0xFF & instr->definitions[0].physReg()) << 24; + if (instr->operands.size() >= 3) + encoding |= (0xFF & instr->operands[2].physReg()) << 8; + if (!instr->operands[1].isUndefined()) { + assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F); + assert(instr->format != Format::FLAT); + encoding |= instr->operands[1].physReg() << 16; + } else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ + if (ctx.chip_class <= GFX9) + encoding |= 0x7F << 16; + else + encoding |= sgpr_null << 16; + } + encoding |= flat->nv ? 1 << 23 : 0; + out.push_back(encoding); + break; + } + case Format::EXP: { + Export_instruction* exp = static_cast(instr); + uint32_t encoding; + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) { + encoding = (0b110001 << 26); + } else { + encoding = (0b111110 << 26); + } + + encoding |= exp->valid_mask ? 0b1 << 12 : 0; + encoding |= exp->done ? 0b1 << 11 : 0; + encoding |= exp->compressed ? 0b1 << 10 : 0; + encoding |= exp->dest << 4; + encoding |= exp->enabled_mask; + out.push_back(encoding); + encoding = 0xFF & exp->operands[0].physReg(); + encoding |= (0xFF & exp->operands[1].physReg()) << 8; + encoding |= (0xFF & exp->operands[2].physReg()) << 16; + encoding |= (0xFF & exp->operands[3].physReg()) << 24; + out.push_back(encoding); + break; + } + case Format::PSEUDO: + case Format::PSEUDO_BARRIER: + unreachable("Pseudo instructions should be lowered before assembly."); + default: + if ((uint16_t) instr->format & (uint16_t) Format::VOP3A) { + VOP3A_instruction* vop3 = static_cast(instr); + + if ((uint16_t) instr->format & (uint16_t) Format::VOP2) { + opcode = opcode + 0x100; + } else if ((uint16_t) instr->format & (uint16_t) Format::VOP1) { + if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) + opcode = opcode + 0x140; + else + opcode = opcode + 0x180; + } else if ((uint16_t) instr->format & (uint16_t) Format::VOPC) { + opcode = opcode + 0x0; + } else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP) { + opcode = opcode + 0x270; + } + + uint32_t encoding; + if (ctx.chip_class <= GFX9) { + encoding = (0b110100 << 26); + } else if (ctx.chip_class == GFX10) { + encoding = (0b110101 << 26); + } else { + unreachable("Unknown chip_class."); + } + + if (ctx.chip_class <= GFX7) { + encoding |= opcode << 17; + encoding |= (vop3->clamp ? 1 : 0) << 11; + } else { + encoding |= opcode << 16; + encoding |= (vop3->clamp ? 1 : 0) << 15; + } + encoding |= vop3->opsel << 11; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->abs[i] << (8+i); + if (instr->definitions.size() == 2) + encoding |= instr->definitions[1].physReg() << 8; + encoding |= (0xFF & instr->definitions[0].physReg()); + out.push_back(encoding); + encoding = 0; + if (instr->opcode == aco_opcode::v_interp_mov_f32) { + encoding = 0x3 & instr->operands[0].constantValue(); + } else { + for (unsigned i = 0; i < instr->operands.size(); i++) + encoding |= instr->operands[i].physReg() << (i * 9); + } + encoding |= vop3->omod << 27; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->neg[i] << (29+i); + out.push_back(encoding); + + } else if (instr->isDPP()){ + assert(ctx.chip_class >= GFX8); + /* first emit the instruction without the DPP operand */ + Operand dpp_op = instr->operands[0]; + instr->operands[0] = Operand(PhysReg{250}, v1); + instr->format = (Format) ((uint32_t) instr->format & ~(1 << 14)); + emit_instruction(ctx, out, instr); + DPP_instruction* dpp = static_cast(instr); + uint32_t encoding = (0xF & dpp->row_mask) << 28; + encoding |= (0xF & dpp->bank_mask) << 24; + encoding |= dpp->abs[1] << 23; + encoding |= dpp->neg[1] << 22; + encoding |= dpp->abs[0] << 21; + encoding |= dpp->neg[0] << 20; + encoding |= dpp->bound_ctrl << 19; + encoding |= dpp->dpp_ctrl << 8; + encoding |= (0xFF) & dpp_op.physReg(); + out.push_back(encoding); + return; + } else { + unreachable("unimplemented instruction format"); + } + break; + } + + /* append literal dword */ + for (const Operand& op : instr->operands) { + if (op.isLiteral()) { + out.push_back(op.constantValue()); + break; + } + } +} + +void emit_block(asm_context& ctx, std::vector& out, Block& block) +{ + for (aco_ptr& instr : block.instructions) { +#if 0 + int start_idx = out.size(); + std::cerr << "Encoding:\t" << std::endl; + aco_print_instr(&*instr, stderr); + std::cerr << std::endl; +#endif + emit_instruction(ctx, out, instr.get()); +#if 0 + for (int i = start_idx; i < out.size(); i++) + std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl; +#endif + } +} + +void fix_exports(asm_context& ctx, std::vector& out, Program* program) +{ + for (Block& block : program->blocks) { + if (!(block.kind & block_kind_export_end)) + continue; + std::vector>::reverse_iterator it = block.instructions.rbegin(); + bool exported = false; + while ( it != block.instructions.rend()) + { + if ((*it)->format == Format::EXP) { + Export_instruction* exp = static_cast((*it).get()); + if (program->stage & hw_vs) { + if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) { + exp->done = true; + exported = true; + break; + } + } else { + exp->done = true; + exp->valid_mask = true; + exported = true; + break; + } + } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec) + break; + ++it; + } + if (exported) + continue; + /* we didn't find an Export instruction and have to insert a null export */ + aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + for (unsigned i = 0; i < 4; i++) + exp->operands[i] = Operand(v1); + exp->enabled_mask = 0; + exp->compressed = false; + exp->done = true; + exp->valid_mask = (program->stage & hw_fs) || program->chip_class >= GFX10; + if (program->stage & hw_fs) + exp->dest = 9; /* NULL */ + else + exp->dest = V_008DFC_SQ_EXP_POS; + /* insert the null export 1 instruction before branch/endpgm */ + block.instructions.insert(block.instructions.end() - 1, std::move(exp)); + } +} + +static void fix_branches_gfx10(asm_context& ctx, std::vector& out) +{ + /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */ + bool gfx10_3f_bug = false; + + do { + auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool { + return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f; + }); + + gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); + + if (gfx10_3f_bug) { + /* Insert an s_nop after the branch */ + constexpr uint32_t s_nop_0 = 0xbf800000u; + int s_nop_pos = buggy_branch_it->first + 1; + auto out_pos = std::next(out.begin(), s_nop_pos); + out.insert(out_pos, s_nop_0); + + /* Update the offset of each affected block */ + for (Block& block : ctx.program->blocks) { + if (block.offset > (unsigned)buggy_branch_it->first) + block.offset++; + } + + /* Update the branches following the current one */ + for (auto branch_it = std::next(buggy_branch_it); branch_it != ctx.branches.end(); ++branch_it) + branch_it->first++; + + /* Find first constant address after the inserted instruction */ + auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [s_nop_pos](const int &caddr_pos) -> bool { + return caddr_pos >= s_nop_pos; + }); + + /* Update the locations of constant addresses */ + for (; caddr_it != ctx.constaddrs.end(); ++caddr_it) + (*caddr_it)++; + + } + } while (gfx10_3f_bug); +} + +void fix_branches(asm_context& ctx, std::vector& out) +{ + if (ctx.chip_class >= GFX10) + fix_branches_gfx10(ctx, out); + + for (std::pair &branch : ctx.branches) { + int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; + out[branch.first] |= (uint16_t) offset; + } +} + +void fix_constaddrs(asm_context& ctx, std::vector& out) +{ + for (unsigned addr : ctx.constaddrs) + out[addr] += (out.size() - addr + 1u) * 4u; +} + +unsigned emit_program(Program* program, + std::vector& code) +{ + asm_context ctx(program); + + if (program->stage & (hw_vs | hw_fs)) + fix_exports(ctx, code, program); + + for (Block& block : program->blocks) { + block.offset = code.size(); + emit_block(ctx, code, block); + } + + fix_branches(ctx, code); + + unsigned exec_size = code.size() * sizeof(uint32_t); + + if (program->chip_class >= GFX10) { + /* Pad output with s_code_end so instruction prefetching doesn't cause + * page faults */ + unsigned final_size = align(code.size() + 3 * 16, 16); + while (code.size() < final_size) + code.push_back(0xbf9f0000u); + } + + fix_constaddrs(ctx, code); + + while (program->constant_data.size() % 4u) + program->constant_data.push_back(0); + /* Copy constant data */ + code.insert(code.end(), (uint32_t*)program->constant_data.data(), + (uint32_t*)(program->constant_data.data() + program->constant_data.size())); + + return exec_size; +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_builder_h.py mesa-20.0.8/src/amd/compiler/aco_builder_h.py --- mesa-19.2.8/src/amd/compiler/aco_builder_h.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_builder_h.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,550 @@ + +template = """\ +/* + * Copyright (c) 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * This file was generated by aco_builder_h.py + */ + +#ifndef _ACO_BUILDER_ +#define _ACO_BUILDER_ + +#include "aco_ir.h" +#include "util/u_math.h" +#include "util/bitscan.h" + +namespace aco { +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +inline dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return (dpp_ctrl)(lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6)); +} + +inline dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return (dpp_ctrl)(((unsigned) _dpp_row_sl) | amount); +} + +inline dpp_ctrl +dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return (dpp_ctrl)(((unsigned) _dpp_row_sr) | amount); +} + +inline unsigned +ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) +{ + assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); + return and_mask | (or_mask << 5) | (xor_mask << 10); +} + +aco_ptr create_s_mov(Definition dst, Operand src); + +enum sendmsg { + sendmsg_none = 0, + _sendmsg_gs = 2, + _sendmsg_gs_done = 3, + sendmsg_save_wave = 4, + sendmsg_stall_wave_gen = 5, + sendmsg_halt_waves = 6, + sendmsg_ordered_ps_done = 7, + sendmsg_early_prim_dealloc = 8, + sendmsg_gs_alloc_req = 9, + sendmsg_id_mask = 0xf, +}; + +inline sendmsg +sendmsg_gs(bool cut, bool emit, unsigned stream) +{ + assert(stream < 4); + return (sendmsg)((unsigned)_sendmsg_gs | (cut << 4) | (emit << 5) | (stream << 8)); +} + +inline sendmsg +sendmsg_gs_done(bool cut, bool emit, unsigned stream) +{ + assert(stream < 4); + return (sendmsg)((unsigned)_sendmsg_gs_done | (cut << 4) | (emit << 5) | (stream << 8)); +} + +class Builder { +public: + struct Result { + Instruction *instr; + + Result(Instruction *instr) : instr(instr) {} + + operator Instruction *() const { + return instr; + } + + operator Temp() const { + return instr->definitions[0].getTemp(); + } + + operator Operand() const { + return Operand((Temp)*this); + } + + Definition& def(unsigned index) const { + return instr->definitions[index]; + } + + aco_ptr get_ptr() const { + return aco_ptr(instr); + } + }; + + struct Op { + Operand op; + Op(Temp tmp) : op(tmp) {} + Op(Operand op_) : op(op_) {} + Op(Result res) : op((Temp)res) {} + }; + + enum WaveSpecificOpcode { + s_cselect = (unsigned) aco_opcode::s_cselect_b64, + s_cmp_lg = (unsigned) aco_opcode::s_cmp_lg_u64, + s_and = (unsigned) aco_opcode::s_and_b64, + s_andn2 = (unsigned) aco_opcode::s_andn2_b64, + s_or = (unsigned) aco_opcode::s_or_b64, + s_orn2 = (unsigned) aco_opcode::s_orn2_b64, + s_not = (unsigned) aco_opcode::s_not_b64, + s_mov = (unsigned) aco_opcode::s_mov_b64, + s_wqm = (unsigned) aco_opcode::s_wqm_b64, + s_and_saveexec = (unsigned) aco_opcode::s_and_saveexec_b64, + s_or_saveexec = (unsigned) aco_opcode::s_or_saveexec_b64, + s_xnor = (unsigned) aco_opcode::s_xnor_b64, + s_xor = (unsigned) aco_opcode::s_xor_b64, + s_bcnt1_i32 = (unsigned) aco_opcode::s_bcnt1_i32_b64, + s_bitcmp1 = (unsigned) aco_opcode::s_bitcmp1_b64, + s_ff1_i32 = (unsigned) aco_opcode::s_ff1_i32_b64, + }; + + Program *program; + bool use_iterator; + bool start; // only when use_iterator == false + RegClass lm; + + std::vector> *instructions; + std::vector>::iterator it; + + Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {} + Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {} + Builder(Program *pgm, std::vector> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {} + + void moveEnd(Block *block) { + instructions = &block->instructions; + } + + void reset() { + use_iterator = false; + start = false; + instructions = NULL; + } + + void reset(Block *block) { + use_iterator = false; + start = false; + instructions = &block->instructions; + } + + void reset(std::vector> *instrs) { + use_iterator = false; + start = false; + instructions = instrs; + } + + void reset(std::vector> *instrs, std::vector>::iterator instr_it) { + use_iterator = true; + start = false; + instructions = instrs; + it = instr_it; + } + + Result insert(aco_ptr instr) { + Instruction *instr_ptr = instr.get(); + if (instructions) { + if (use_iterator) { + it = instructions->emplace(it, std::move(instr)); + it = std::next(it); + } else if (!start) { + instructions->emplace_back(std::move(instr)); + } else { + instructions->emplace(instructions->begin(), std::move(instr)); + } + } + return Result(instr_ptr); + } + + Result insert(Instruction* instr) { + if (instructions) { + if (use_iterator) { + it = instructions->emplace(it, aco_ptr(instr)); + it = std::next(it); + } else if (!start) { + instructions->emplace_back(aco_ptr(instr)); + } else { + instructions->emplace(instructions->begin(), aco_ptr(instr)); + } + } + return Result(instr); + } + + Temp tmp(RegClass rc) { + return (Temp){program->allocateId(), rc}; + } + + Temp tmp(RegType type, unsigned size) { + return (Temp){program->allocateId(), RegClass(type, size)}; + } + + Definition def(RegClass rc) { + return Definition((Temp){program->allocateId(), rc}); + } + + Definition def(RegType type, unsigned size) { + return Definition((Temp){program->allocateId(), RegClass(type, size)}); + } + + Definition def(RegClass rc, PhysReg reg) { + return Definition(program->allocateId(), reg, rc); + } + + inline aco_opcode w64or32(WaveSpecificOpcode opcode) const { + if (program->wave_size == 64) + return (aco_opcode) opcode; + + switch (opcode) { + case s_cselect: + return aco_opcode::s_cselect_b32; + case s_cmp_lg: + return aco_opcode::s_cmp_lg_u32; + case s_and: + return aco_opcode::s_and_b32; + case s_andn2: + return aco_opcode::s_andn2_b32; + case s_or: + return aco_opcode::s_or_b32; + case s_orn2: + return aco_opcode::s_orn2_b32; + case s_not: + return aco_opcode::s_not_b32; + case s_mov: + return aco_opcode::s_mov_b32; + case s_wqm: + return aco_opcode::s_wqm_b32; + case s_and_saveexec: + return aco_opcode::s_and_saveexec_b32; + case s_or_saveexec: + return aco_opcode::s_or_saveexec_b32; + case s_xnor: + return aco_opcode::s_xnor_b32; + case s_xor: + return aco_opcode::s_xor_b32; + case s_bcnt1_i32: + return aco_opcode::s_bcnt1_i32_b32; + case s_bitcmp1: + return aco_opcode::s_bitcmp1_b32; + case s_ff1_i32: + return aco_opcode::s_ff1_i32_b32; + default: + unreachable("Unsupported wave specific opcode."); + } + } + +% for fixed in ['m0', 'vcc', 'exec', 'scc']: + Operand ${fixed}(Temp tmp) { + % if fixed == 'vcc' or fixed == 'exec': + assert(tmp.regClass() == lm); + % endif + Operand op(tmp); + op.setFixed(aco::${fixed}); + return op; + } + + Definition ${fixed}(Definition def) { + % if fixed == 'vcc' or fixed == 'exec': + assert(def.regClass() == lm); + % endif + def.setFixed(aco::${fixed}); + return def; + } + + Definition hint_${fixed}(Definition def) { + % if fixed == 'vcc' or fixed == 'exec': + assert(def.regClass() == lm); + % endif + def.setHint(aco::${fixed}); + return def; + } + +% endfor + /* hand-written helpers */ + Temp as_uniform(Op op) + { + assert(op.op.isTemp()); + if (op.op.getTemp().type() == RegType::vgpr) + return pseudo(aco_opcode::p_as_uniform, def(RegType::sgpr, op.op.size()), op); + else + return op.op.getTemp(); + } + + Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false) + { + assert(tmp.type() == RegType::vgpr); + if (imm == 0) { + return vop1(aco_opcode::v_mov_b32, dst, Operand(0u)); + } else if (imm == 1) { + return copy(dst, Operand(tmp)); + } else if (util_is_power_of_two_or_zero(imm)) { + return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp); + } else if (bits24) { + return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp); + } else { + Temp imm_tmp = copy(def(v1), Operand(imm)); + return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); + } + } + + Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm) + { + return v_mul_imm(dst, tmp, imm, true); + } + + Result copy(Definition dst, Op op_) { + Operand op = op_.op; + if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) { + uint32_t imm = op.constantValue(); + if (imm == 0x3e22f983) { + if (program->chip_class >= GFX8) + op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */ + } else if (imm >= 0xffff8000 || imm <= 0x7fff) { + return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu); + } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) { + uint32_t rev = util_bitreverse(imm); + return dst.regClass() == v1 ? + vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) : + sop1(aco_opcode::s_brev_b32, dst, Operand(rev)); + } else if (imm != 0) { + unsigned start = (ffs(imm) - 1) & 0x1f; + unsigned size = util_bitcount(imm) & 0x1f; + if ((((1u << size) - 1u) << start) == imm) + return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start)); + } + } + + if (dst.regClass() == s2) { + return sop1(aco_opcode::s_mov_b64, dst, op); + } else if (op.size() > 1) { + return pseudo(aco_opcode::p_create_vector, dst, op); + } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) { + return vop1(aco_opcode::v_mov_b32, dst, op); + } else { + assert(dst.regClass() == s1); + return sop1(aco_opcode::s_mov_b32, dst, op); + } + } + + Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2)), bool post_ra=false) { + if (!b.op.isTemp() || b.op.regClass().type() != RegType::vgpr) + std::swap(a, b); + assert((post_ra || b.op.hasRegClass()) && b.op.regClass().type() == RegType::vgpr); + + if (!carry_in.op.isUndefined()) + return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(lm)), a, b, carry_in); + else if (program->chip_class >= GFX10 && carry_out) + return vop3(aco_opcode::v_add_co_u32_e64, Definition(dst), def(lm), a, b); + else if (program->chip_class < GFX9 || carry_out) + return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(lm)), a, b); + else + return vop2(aco_opcode::v_add_u32, Definition(dst), a, b); + } + + Result vsub32(Definition dst, Op a, Op b, bool carry_out=false, Op borrow=Op(Operand(s2))) + { + if (!borrow.op.isUndefined() || program->chip_class < GFX9) + carry_out = true; + + bool reverse = !b.op.isTemp() || b.op.regClass().type() != RegType::vgpr; + if (reverse) + std::swap(a, b); + assert(b.op.isTemp() && b.op.regClass().type() == RegType::vgpr); + + aco_opcode op; + Temp carry; + if (carry_out) { + carry = tmp(s2); + if (borrow.op.isUndefined()) + op = reverse ? aco_opcode::v_subrev_co_u32 : aco_opcode::v_sub_co_u32; + else + op = reverse ? aco_opcode::v_subbrev_co_u32 : aco_opcode::v_subb_co_u32; + } else { + op = reverse ? aco_opcode::v_subrev_u32 : aco_opcode::v_sub_u32; + } + bool vop3 = false; + if (program->chip_class >= GFX10 && op == aco_opcode::v_subrev_co_u32) { + vop3 = true; + op = aco_opcode::v_subrev_co_u32_e64; + } else if (program->chip_class >= GFX10 && op == aco_opcode::v_sub_co_u32) { + vop3 = true; + op = aco_opcode::v_sub_co_u32_e64; + } + + int num_ops = borrow.op.isUndefined() ? 2 : 3; + int num_defs = carry_out ? 2 : 1; + aco_ptr sub; + if (vop3) + sub.reset(create_instruction(op, Format::VOP3B, num_ops, num_defs)); + else + sub.reset(create_instruction(op, Format::VOP2, num_ops, num_defs)); + sub->operands[0] = a.op; + sub->operands[1] = b.op; + if (!borrow.op.isUndefined()) + sub->operands[2] = borrow.op; + sub->definitions[0] = dst; + if (carry_out) { + sub->definitions[1] = Definition(carry); + sub->definitions[1].setHint(aco::vcc); + } + return insert(std::move(sub)); + } + + Result readlane(Definition dst, Op vsrc, Op lane) + { + if (program->chip_class >= GFX8) + return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane); + else + return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane); + } + Result writelane(Definition dst, Op val, Op lane, Op vsrc) { + if (program->chip_class >= GFX8) + return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc); + else + return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc); + } +<% +import itertools +formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]), + ("sop1", [Format.SOP1], 'SOP1_instruction', [(1, 1), (2, 1), (3, 2)]), + ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])), + ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])), + ("sopp", [Format.SOPP], 'SOPP_instruction', [(0, 0), (0, 1)]), + ("sopc", [Format.SOPC], 'SOPC_instruction', [(1, 2)]), + ("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]), + ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]), + ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]), + ("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]), + ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 3), (1, 3)]), + ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]), + ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])), + ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]), + ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2), (3, 4)]), + ("vop1", [Format.VOP1], 'VOP1_instruction', [(1, 1), (2, 2)]), + ("vop2", [Format.VOP2], 'VOP2_instruction', itertools.product([1, 2], [2, 3])), + ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])), + ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]), + ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]), + ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]), + ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_dpp", [Format.VOPC, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2])), + ("vop1_e64", [Format.VOP1, Format.VOP3A], 'VOP3A_instruction', itertools.product([1], [1])), + ("vop2_e64", [Format.VOP2, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])), + ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]), + ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])] +%>\\ +% for name, formats, struct, shapes in formats: + % for num_definitions, num_operands in shapes: + <% + args = ['aco_opcode opcode'] + for i in range(num_definitions): + args.append('Definition def%d' % i) + for i in range(num_operands): + args.append('Op op%d' % i) + for f in formats: + args += f.get_builder_field_decls() + %>\\ + + Result ${name}(${', '.join(args)}) + { + ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), ${num_operands}, ${num_definitions}); + % for i in range(num_definitions): + instr->definitions[${i}] = def${i}; + % endfor + % for i in range(num_operands): + instr->operands[${i}] = op${i}.op; + % endfor + % for f in formats: + % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()): + instr->${dest} = ${field_name}; + % endfor + % endfor + return insert(instr); + } + + % if name == 'sop1' or name == 'sop2' or name == 'sopc': + <% + args[0] = 'WaveSpecificOpcode opcode' + params = [] + for i in range(num_definitions): + params.append('def%d' % i) + for i in range(num_operands): + params.append('op%d' % i) + %>\\ + + inline Result ${name}(${', '.join(args)}) + { + return ${name}(w64or32(opcode), ${', '.join(params)}); + } + + % endif + % endfor +% endfor +}; + +} +#endif /* _ACO_BUILDER_ */""" + +from aco_opcodes import opcodes, Format +from mako.template import Template + +print(Template(template).render(opcodes=opcodes, Format=Format)) diff -Nru mesa-19.2.8/src/amd/compiler/aco_dead_code_analysis.cpp mesa-20.0.8/src/amd/compiler/aco_dead_code_analysis.cpp --- mesa-19.2.8/src/amd/compiler/aco_dead_code_analysis.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_dead_code_analysis.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,108 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +#include + +/* + * Implements an analysis pass to determine the number of uses + * for each SSA-definition. + */ + +namespace aco { +namespace { + +struct dce_ctx { + int current_block; + std::vector uses; + std::vector> live; + + dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) + { + live.reserve(program->blocks.size()); + for (Block& block : program->blocks) + live.emplace_back(block.instructions.size()); + } +}; + +void process_block(dce_ctx& ctx, Block& block) +{ + std::vector& live = ctx.live[block.index]; + assert(live.size() == block.instructions.size()); + bool process_predecessors = false; + for (int idx = block.instructions.size() - 1; idx >= 0; idx--) { + if (live[idx]) + continue; + + aco_ptr& instr = block.instructions[idx]; + if (!is_dead(ctx.uses, instr.get())) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + if (ctx.uses[op.tempId()] == 0) + process_predecessors = true; + ctx.uses[op.tempId()]++; + } + } + live[idx] = true; + } + } + + if (process_predecessors) { + for (unsigned pred_idx : block.linear_preds) + ctx.current_block = std::max(ctx.current_block, (int) pred_idx); + } +} + +} /* end namespace */ + +bool is_dead(const std::vector& uses, Instruction *instr) +{ + if (instr->definitions.empty()) + return false; + if (std::any_of(instr->definitions.begin(), instr->definitions.end(), + [&uses] (const Definition& def) { return uses[def.tempId()];})) + return false; + return !instr_info.is_atomic[(int)instr->opcode]; +} + +std::vector dead_code_analysis(Program *program) { + + dce_ctx ctx(program); + + while (ctx.current_block >= 0) { + unsigned next_block = ctx.current_block--; + process_block(ctx, program->blocks[next_block]); + } + + /* add one use to exec to prevent startpgm from being removed */ + aco_ptr& startpgm = program->blocks[0].instructions[0]; + assert(startpgm->opcode == aco_opcode::p_startpgm); + ctx.uses[startpgm->definitions.back().tempId()]++; + + return ctx.uses; +} + +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_dominance.cpp mesa-20.0.8/src/amd/compiler/aco_dominance.cpp --- mesa-19.2.8/src/amd/compiler/aco_dominance.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_dominance.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,93 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) + * + */ + +#ifndef ACO_DOMINANCE_CPP +#define ACO_DOMINANCE_CPP + +#include "aco_ir.h" + +/* + * Implements the algorithms for computing the dominator tree from + * "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy. + * + * Different from the paper, our CFG allows to compute the dominator tree + * in a single pass as it is guaranteed that the dominating predecessors + * are processed before the current block. + */ + +namespace aco { + +void dominator_tree(Program* program) +{ + program->blocks[0].logical_idom = 0; + program->blocks[0].linear_idom = 0; + + for (unsigned i = 1; i < program->blocks.size(); i++) { + Block& block = program->blocks[i]; + int new_logical_idom = -1; + int new_linear_idom = -1; + for (unsigned pred_idx : block.logical_preds) { + if ((int) program->blocks[pred_idx].logical_idom == -1) + continue; + + if (new_logical_idom == -1) { + new_logical_idom = pred_idx; + continue; + } + + while ((int) pred_idx != new_logical_idom) { + if ((int) pred_idx > new_logical_idom) + pred_idx = program->blocks[pred_idx].logical_idom; + if ((int) pred_idx < new_logical_idom) + new_logical_idom = program->blocks[new_logical_idom].logical_idom; + } + } + + for (unsigned pred_idx : block.linear_preds) { + if ((int) program->blocks[pred_idx].linear_idom == -1) + continue; + + if (new_linear_idom == -1) { + new_linear_idom = pred_idx; + continue; + } + + while ((int) pred_idx != new_linear_idom) { + if ((int) pred_idx > new_linear_idom) + pred_idx = program->blocks[pred_idx].linear_idom; + if ((int) pred_idx < new_linear_idom) + new_linear_idom = program->blocks[new_linear_idom].linear_idom; + } + } + + block.logical_idom = new_logical_idom; + block.linear_idom = new_linear_idom; + } +} + +} +#endif diff -Nru mesa-19.2.8/src/amd/compiler/aco_insert_exec_mask.cpp mesa-20.0.8/src/amd/compiler/aco_insert_exec_mask.cpp --- mesa-19.2.8/src/amd/compiler/aco_insert_exec_mask.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_insert_exec_mask.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1142 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include "aco_builder.h" +#include "util/u_math.h" + +namespace aco { + +namespace { + +enum WQMState : uint8_t { + Unspecified = 0, + Exact = 1 << 0, + WQM = 1 << 1, /* with control flow applied */ + Preserve_WQM = 1 << 2, + Exact_Branch = 1 << 3, +}; + +enum mask_type : uint8_t { + mask_type_global = 1 << 0, + mask_type_exact = 1 << 1, + mask_type_wqm = 1 << 2, + mask_type_loop = 1 << 3, /* active lanes of a loop */ + mask_type_initial = 1 << 4, /* initially active lanes */ +}; + +struct wqm_ctx { + Program* program; + /* state for WQM propagation */ + std::set worklist; + std::vector defined_in; + std::vector needs_wqm; + std::vector branch_wqm; /* true if the branch condition in this block should be in wqm */ + bool loop; + bool wqm; + wqm_ctx(Program* program) : program(program), + defined_in(program->peekAllocationId(), 0xFFFF), + needs_wqm(program->peekAllocationId()), + branch_wqm(program->blocks.size()), + loop(false), + wqm(false) + { + for (unsigned i = 0; i < program->blocks.size(); i++) + worklist.insert(i); + } +}; + +struct loop_info { + Block* loop_header; + uint16_t num_exec_masks; + uint8_t needs; + bool has_divergent_break; + bool has_divergent_continue; + bool has_discard; /* has a discard or demote */ + loop_info(Block* b, uint16_t num, uint8_t needs, bool breaks, bool cont, bool discard) : + loop_header(b), num_exec_masks(num), needs(needs), has_divergent_break(breaks), + has_divergent_continue(cont), has_discard(discard) {} +}; + +struct block_info { + std::vector> exec; + std::vector instr_needs; + uint8_t block_needs; + uint8_t ever_again_needs; + bool logical_end_wqm; + /* more... */ +}; + +struct exec_ctx { + Program *program; + std::vector info; + std::vector loop; + bool handle_wqm = false; + exec_ctx(Program *program) : program(program), info(program->blocks.size()) {} +}; + +bool pred_by_exec_mask(aco_ptr& instr) { + if (instr->isSALU()) + return instr->reads_exec(); + if (instr->format == Format::SMEM || instr->isSALU()) + return false; + if (instr->format == Format::PSEUDO_BARRIER) + return false; + + if (instr->format == Format::PSEUDO) { + switch (instr->opcode) { + case aco_opcode::p_create_vector: + return instr->definitions[0].getTemp().type() == RegType::vgpr; + case aco_opcode::p_extract_vector: + case aco_opcode::p_split_vector: + return instr->operands[0].getTemp().type() == RegType::vgpr; + case aco_opcode::p_spill: + case aco_opcode::p_reload: + return false; + default: + break; + } + } + + if (instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64 || + instr->opcode == aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64) + return false; + + return true; +} + +bool needs_exact(aco_ptr& instr) { + if (instr->format == Format::MUBUF) { + MUBUF_instruction *mubuf = static_cast(instr.get()); + return mubuf->disable_wqm; + } else if (instr->format == Format::MTBUF) { + MTBUF_instruction *mtbuf = static_cast(instr.get()); + return mtbuf->disable_wqm; + } else if (instr->format == Format::MIMG) { + MIMG_instruction *mimg = static_cast(instr.get()); + return mimg->disable_wqm; + } else if (instr->format == Format::FLAT || instr->format == Format::GLOBAL) { + FLAT_instruction *flat = static_cast(instr.get()); + return flat->disable_wqm; + } else { + return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem; + } +} + +void set_needs_wqm(wqm_ctx &ctx, Temp tmp) +{ + if (!ctx.needs_wqm[tmp.id()]) { + ctx.needs_wqm[tmp.id()] = true; + if (ctx.defined_in[tmp.id()] != 0xFFFF) + ctx.worklist.insert(ctx.defined_in[tmp.id()]); + } +} + +void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) +{ + if (ctx.branch_wqm[block_idx]) + return; + + ctx.branch_wqm[block_idx] = true; + Block& block = ctx.program->blocks[block_idx]; + aco_ptr& branch = block.instructions.back(); + + if (branch->opcode != aco_opcode::p_branch) { + assert(!branch->operands.empty() && branch->operands[0].isTemp()); + set_needs_wqm(ctx, branch->operands[0].getTemp()); + } + + /* TODO: this sets more branch conditions to WQM than it needs to + * it should be enough to stop at the "exec mask top level" */ + if (block.kind & block_kind_top_level) + return; + + for (unsigned pred_idx : block.logical_preds) + mark_block_wqm(ctx, pred_idx); +} + +void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) +{ + block_info& info = exec_ctx.info[block->index]; + + std::vector instr_needs(block->instructions.size()); + + if (block->kind & block_kind_top_level) { + if (ctx.loop && ctx.wqm) { + /* mark all break conditions as WQM */ + unsigned block_idx = block->index + 1; + while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) { + if (ctx.program->blocks[block_idx].kind & block_kind_break) + mark_block_wqm(ctx, block_idx); + block_idx++; + } + } else if (ctx.loop && !ctx.wqm) { + /* Ensure a branch never results in an exec mask with only helper + * invocations (which can cause a loop to repeat infinitively if it's + * break branches are done in exact). */ + unsigned block_idx = block->index; + do { + if ((ctx.program->blocks[block_idx].kind & block_kind_branch)) + exec_ctx.info[block_idx].block_needs |= Exact_Branch; + block_idx++; + } while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)); + } + + ctx.loop = false; + ctx.wqm = false; + } + + for (int i = block->instructions.size() - 1; i >= 0; --i) { + aco_ptr& instr = block->instructions[i]; + + WQMState needs = needs_exact(instr) ? Exact : Unspecified; + bool propagate_wqm = instr->opcode == aco_opcode::p_wqm; + bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if; + bool pred_by_exec = pred_by_exec_mask(instr); + for (const Definition& definition : instr->definitions) { + if (!definition.isTemp()) + continue; + const unsigned def = definition.tempId(); + ctx.defined_in[def] = block->index; + if (needs == Unspecified && ctx.needs_wqm[def]) { + needs = pred_by_exec ? WQM : Unspecified; + propagate_wqm = true; + } + } + + if (propagate_wqm) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + set_needs_wqm(ctx, op.getTemp()); + } + } + } else if (preserve_wqm && info.block_needs & WQM) { + needs = Preserve_WQM; + } + + /* ensure the condition controlling the control flow for this phi is in WQM */ + if (needs == WQM && instr->opcode == aco_opcode::p_phi) { + for (unsigned pred_idx : block->logical_preds) { + mark_block_wqm(ctx, pred_idx); + exec_ctx.info[pred_idx].logical_end_wqm = true; + ctx.worklist.insert(pred_idx); + } + } + + if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) || + instr->opcode == aco_opcode::p_wqm) { + assert(needs != Exact); + needs = WQM; + } + + instr_needs[i] = needs; + info.block_needs |= needs; + } + + info.instr_needs = instr_needs; + + /* for "if () " or "while () ", + * should be computed in WQM */ + if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) { + for (unsigned pred_idx : block->logical_preds) + mark_block_wqm(ctx, pred_idx); + ctx.wqm = true; + } + if (block->kind & block_kind_loop_header) + ctx.loop = true; +} + +void calculate_wqm_needs(exec_ctx& exec_ctx) +{ + wqm_ctx ctx(exec_ctx.program); + + while (!ctx.worklist.empty()) { + unsigned block_index = *std::prev(ctx.worklist.end()); + ctx.worklist.erase(std::prev(ctx.worklist.end())); + + get_block_needs(ctx, exec_ctx, &exec_ctx.program->blocks[block_index]); + } + + uint8_t ever_again_needs = 0; + for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) { + exec_ctx.info[i].ever_again_needs = ever_again_needs; + Block& block = exec_ctx.program->blocks[i]; + + if (block.kind & block_kind_needs_lowering) + exec_ctx.info[i].block_needs |= Exact; + + /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */ + if ((block.kind & block_kind_discard || + block.kind & block_kind_uses_discard_if) && + ever_again_needs & WQM) + exec_ctx.info[i].block_needs |= Preserve_WQM; + + ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch; + if (block.kind & block_kind_discard || + block.kind & block_kind_uses_discard_if || + block.kind & block_kind_uses_demote) + ever_again_needs |= Exact; + + /* don't propagate WQM preservation further than the next top_level block */ + if (block.kind & block_kind_top_level) + ever_again_needs &= ~Preserve_WQM; + else + exec_ctx.info[i].block_needs &= ~Preserve_WQM; + } + exec_ctx.handle_wqm = true; +} + +void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) +{ + if (ctx.info[idx].exec.back().second & mask_type_wqm) + return; + if (ctx.info[idx].exec.back().second & mask_type_global) { + Temp exec_mask = ctx.info[idx].exec.back().first; + /* TODO: we might generate better code if we pass the uncopied "exec_mask" + * directly to the s_wqm (we still need to keep this parallelcopy for + * potential later uses of exec_mask though). We currently can't do this + * because of a RA bug. */ + exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), bld.exec(exec_mask)); + ctx.info[idx].exec.back().first = exec_mask; + + exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask); + ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm); + return; + } + /* otherwise, the WQM mask should be one below the current mask */ + ctx.info[idx].exec.pop_back(); + assert(ctx.info[idx].exec.back().second & mask_type_wqm); + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), + ctx.info[idx].exec.back().first); +} + +void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) +{ + if (ctx.info[idx].exec.back().second & mask_type_exact) + return; + /* We can't remove the loop exec mask, because that can cause exec.size() to + * be less than num_exec_masks. The loop exec mask also needs to be kept + * around for various uses. */ + if ((ctx.info[idx].exec.back().second & mask_type_global) && + !(ctx.info[idx].exec.back().second & mask_type_loop)) { + ctx.info[idx].exec.pop_back(); + assert(ctx.info[idx].exec.back().second & mask_type_exact); + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), + ctx.info[idx].exec.back().first); + return; + } + /* otherwise, we create an exact mask and push to the stack */ + Temp wqm = ctx.info[idx].exec.back().first; + Temp exact = bld.tmp(bld.lm); + wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), + bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm)); + ctx.info[idx].exec.back().first = wqm; + ctx.info[idx].exec.emplace_back(exact, mask_type_exact); +} + +unsigned add_coupling_code(exec_ctx& ctx, Block* block, + std::vector>& instructions) +{ + unsigned idx = block->index; + Builder bld(ctx.program, &instructions); + std::vector& preds = block->linear_preds; + + /* start block */ + if (idx == 0) { + aco_ptr& startpgm = block->instructions[0]; + assert(startpgm->opcode == aco_opcode::p_startpgm); + Temp exec_mask = startpgm->definitions.back().getTemp(); + bld.insert(std::move(startpgm)); + + /* exec seems to need to be manually initialized with combined shaders */ + if (util_bitcount(ctx.program->stage & sw_mask) > 1) { + bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX)); + instructions[0]->definitions.pop_back(); + } + + if (ctx.handle_wqm) { + ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial); + /* if this block only needs WQM, initialize already */ + if (ctx.info[0].block_needs == WQM) + transition_to_WQM(ctx, bld, 0); + } else { + uint8_t mask = mask_type_global; + if (ctx.program->needs_wqm) { + exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask)); + mask |= mask_type_wqm; + } else { + mask |= mask_type_exact; + } + ctx.info[0].exec.emplace_back(exec_mask, mask); + } + + return 1; + } + + /* loop entry block */ + if (block->kind & block_kind_loop_header) { + assert(preds[0] == idx - 1); + ctx.info[idx].exec = ctx.info[idx - 1].exec; + loop_info& info = ctx.loop.back(); + while (ctx.info[idx].exec.size() > info.num_exec_masks) + ctx.info[idx].exec.pop_back(); + + /* create ssa names for outer exec masks */ + if (info.has_discard) { + aco_ptr phi; + for (int i = 0; i < info.num_exec_masks - 1; i++) { + phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)); + phi->definitions[0] = bld.def(bld.lm); + phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first); + ctx.info[idx].exec[i].first = bld.insert(std::move(phi)); + } + } + + /* create ssa name for restore mask */ + if (info.has_divergent_break) { + /* this phi might be trivial but ensures a parallelcopy on the loop header */ + aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = bld.def(bld.lm); + phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first); + ctx.info[idx].exec.back().first = bld.insert(std::move(phi)); + } + + /* create ssa name for loop active mask */ + aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + if (info.has_divergent_continue) + phi->definitions[0] = bld.def(bld.lm); + else + phi->definitions[0] = bld.def(bld.lm, exec); + phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first); + Temp loop_active = bld.insert(std::move(phi)); + + if (info.has_divergent_break) { + uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; + ctx.info[idx].exec.emplace_back(loop_active, mask_type); + } else { + ctx.info[idx].exec.back().first = loop_active; + ctx.info[idx].exec.back().second |= mask_type_loop; + } + + /* create a parallelcopy to move the active mask to exec */ + unsigned i = 0; + if (info.has_divergent_continue) { + while (block->instructions[i]->opcode != aco_opcode::p_logical_start) { + bld.insert(std::move(block->instructions[i])); + i++; + } + uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), + ctx.info[idx].exec.back().first), mask_type); + } + + return i; + } + + /* loop exit block */ + if (block->kind & block_kind_loop_exit) { + Block* header = ctx.loop.back().loop_header; + loop_info& info = ctx.loop.back(); + + for (ASSERTED unsigned pred : preds) + assert(ctx.info[pred].exec.size() >= info.num_exec_masks); + + /* fill the loop header phis */ + std::vector& header_preds = header->linear_preds; + int k = 0; + if (info.has_discard) { + while (k < info.num_exec_masks - 1) { + aco_ptr& phi = header->instructions[k]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[k].first); + k++; + } + } + aco_ptr& phi = header->instructions[k++]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); + + if (info.has_divergent_break) { + aco_ptr& phi = header->instructions[k]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); + } + + assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2); + + /* create the loop exit phis if not trivial */ + bool need_parallelcopy = false; + for (unsigned k = 0; k < info.num_exec_masks; k++) { + Temp same = ctx.info[preds[0]].exec[k].first; + uint8_t type = ctx.info[header_preds[0]].exec[k].second; + bool trivial = true; + + for (unsigned i = 1; i < preds.size() && trivial; i++) { + if (ctx.info[preds[i]].exec[k].first != same) + trivial = false; + } + + if (k == info.num_exec_masks - 1u) { + bool all_liveout_exec = true; + bool all_not_liveout_exec = true; + for (unsigned pred : preds) { + all_liveout_exec = all_liveout_exec && same == ctx.program->blocks[pred].live_out_exec; + all_not_liveout_exec = all_not_liveout_exec && same != ctx.program->blocks[pred].live_out_exec; + } + if (!all_liveout_exec && !all_not_liveout_exec) + trivial = false; + else if (all_not_liveout_exec) + need_parallelcopy = true; + + need_parallelcopy |= !trivial; + } + + if (trivial) { + ctx.info[idx].exec.emplace_back(same, type); + } else { + /* create phi for loop footer */ + aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = bld.def(bld.lm); + if (k == info.num_exec_masks - 1u) { + phi->definitions[0].setFixed(exec); + need_parallelcopy = false; + } + for (unsigned i = 0; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first); + ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type); + } + } + assert(ctx.info[idx].exec.size() == info.num_exec_masks); + + /* create a parallelcopy to move the live mask to exec */ + unsigned i = 0; + while (block->instructions[i]->opcode != aco_opcode::p_logical_start) { + bld.insert(std::move(block->instructions[i])); + i++; + } + + if (ctx.handle_wqm) { + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) { + if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 || + (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) { + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + ctx.handle_wqm = false; + } + } + if (ctx.info[idx].block_needs == WQM) + transition_to_WQM(ctx, bld, idx); + else if (ctx.info[idx].block_needs == Exact) + transition_to_Exact(ctx, bld, idx); + } + + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + if (need_parallelcopy) { + /* only create this parallelcopy is needed, since the operand isn't + * fixed to exec which causes the spiller to miscalculate register demand */ + /* TODO: Fix register_demand calculation for spilling on loop exits. + * The problem is only mitigated because the register demand could be + * higher if the exec phi doesn't get assigned to exec. */ + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), + ctx.info[idx].exec.back().first); + } + + ctx.loop.pop_back(); + return i; + } + + if (preds.size() == 1) { + ctx.info[idx].exec = ctx.info[preds[0]].exec; + } else { + assert(preds.size() == 2); + /* if one of the predecessors ends in exact mask, we pop it from stack */ + unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(), + ctx.info[preds[1]].exec.size()); + if (block->kind & block_kind_top_level && !(block->kind & block_kind_merge)) + num_exec_masks = std::min(num_exec_masks, 2u); + + /* create phis for diverged exec masks */ + for (unsigned i = 0; i < num_exec_masks; i++) { + bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge); + if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) { + assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); + ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]); + continue; + } + + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm), + ctx.info[preds[0]].exec[i].first, + ctx.info[preds[1]].exec[i].first); + uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; + ctx.info[idx].exec.emplace_back(phi, mask_type); + } + } + + unsigned i = 0; + while (block->instructions[i]->opcode == aco_opcode::p_phi || + block->instructions[i]->opcode == aco_opcode::p_linear_phi) { + bld.insert(std::move(block->instructions[i])); + i++; + } + + if (block->kind & block_kind_merge) + ctx.info[idx].exec.pop_back(); + + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 3) { + assert(ctx.info[idx].exec.back().second == mask_type_exact); + assert(block->kind & block_kind_merge); + ctx.info[idx].exec.pop_back(); + } + + /* try to satisfy the block's needs */ + if (ctx.handle_wqm) { + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) { + if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 || + (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) { + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + ctx.handle_wqm = false; + } + } + if (ctx.info[idx].block_needs == WQM) + transition_to_WQM(ctx, bld, idx); + else if (ctx.info[idx].block_needs == Exact) + transition_to_Exact(ctx, bld, idx); + } + + if (block->kind & block_kind_merge) { + Temp restore = ctx.info[idx].exec.back().first; + assert(restore.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore); + } + + return i; +} + +void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr& instr, Temp cur_exec) +{ + Operand offset = instr->operands[1]; + if (need_check) { + /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */ + Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u)); + + if (offset.isLiteral()) + offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset); + + offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)), + offset, Operand(UINT32_MAX), bld.scc(nonempty)); + } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) { + offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset); + } + if (!offset.isConstant()) + offset.setFixed(m0); + + switch (instr->operands[2].size()) { + case 1: + instr->opcode = aco_opcode::s_buffer_store_dword; + break; + case 2: + instr->opcode = aco_opcode::s_buffer_store_dwordx2; + break; + case 4: + instr->opcode = aco_opcode::s_buffer_store_dwordx4; + break; + default: + unreachable("Invalid SMEM buffer store size"); + } + instr->operands[1] = offset; + /* as_uniform() needs to be done here so it's done in exact mode and helper + * lanes don't contribute. */ + instr->operands[2] = Operand(bld.as_uniform(instr->operands[2])); +} + +void process_instructions(exec_ctx& ctx, Block* block, + std::vector>& instructions, + unsigned idx) +{ + WQMState state; + if (ctx.info[block->index].exec.back().second & mask_type_wqm) + state = WQM; + else { + assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact); + state = Exact; + } + + /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */ + bool process = (ctx.handle_wqm && + (ctx.info[block->index].block_needs & state) != + (ctx.info[block->index].block_needs & (WQM | Exact))) || + block->kind & block_kind_uses_discard_if || + block->kind & block_kind_uses_demote || + block->kind & block_kind_needs_lowering; + if (!process) { + std::vector>::iterator it = std::next(block->instructions.begin(), idx); + instructions.insert(instructions.end(), + std::move_iterator>::iterator>(it), + std::move_iterator>::iterator>(block->instructions.end())); + return; + } + + Builder bld(ctx.program, &instructions); + + for (; idx < block->instructions.size(); idx++) { + aco_ptr instr = std::move(block->instructions[idx]); + + WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified; + + if (instr->opcode == aco_opcode::p_discard_if) { + if (ctx.info[block->index].block_needs & Preserve_WQM) { + assert(block->kind & block_kind_top_level); + transition_to_WQM(ctx, bld, block->index); + ctx.info[block->index].exec.back().second &= ~mask_type_global; + } + int num = ctx.info[block->index].exec.size(); + assert(num); + Operand cond = instr->operands[0]; + for (int i = num - 1; i >= 0; i--) { + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == num - 1) { + andn2->operands[0].setFixed(exec); + andn2->definitions[0].setFixed(exec); + } + if (i == 0) { + instr->opcode = aco_opcode::p_exit_early_if; + instr->operands[0] = bld.scc(andn2->definitions[1].getTemp()); + } + ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp(); + } + assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); + + } else if (needs == WQM && state != WQM) { + transition_to_WQM(ctx, bld, block->index); + state = WQM; + } else if (needs == Exact && state != Exact) { + transition_to_Exact(ctx, bld, block->index); + state = Exact; + } + + if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) { + Definition dst = instr->definitions[0]; + assert(dst.size() == bld.lm.size()); + if (state == Exact) { + instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1)); + instr->operands[0] = Operand(0u); + instr->definitions[0] = dst; + } else { + std::pair& exact_mask = ctx.info[block->index].exec[0]; + if (instr->opcode == aco_opcode::p_load_helper && + !(ctx.info[block->index].exec[0].second & mask_type_initial)) { + /* find last initial exact mask */ + for (int i = block->index; i >= 0; i--) { + if (ctx.program->blocks[i].kind & block_kind_top_level && + ctx.info[i].exec[0].second & mask_type_initial) { + exact_mask = ctx.info[i].exec[0]; + break; + } + } + } + + assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial); + assert(exact_mask.second & mask_type_exact); + + instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2)); + instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */ + instr->operands[1] = Operand(exact_mask.first); + instr->definitions[0] = dst; + instr->definitions[1] = bld.def(s1, scc); + } + } else if (instr->opcode == aco_opcode::p_demote_to_helper) { + /* turn demote into discard_if with only exact masks */ + assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global)); + ctx.info[block->index].exec[0].second &= ~mask_type_initial; + + int num; + Temp cond, exit_cond; + if (instr->operands[0].isConstant()) { + assert(instr->operands[0].constantValue() == -1u); + /* transition to exact and set exec to zero */ + Temp old_exec = ctx.info[block->index].exec.back().first; + Temp new_exec = bld.tmp(bld.lm); + exit_cond = bld.tmp(s1); + cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)), + bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); + + num = ctx.info[block->index].exec.size() - 2; + if (ctx.info[block->index].exec.back().second & mask_type_exact) { + ctx.info[block->index].exec.back().first = new_exec; + } else { + ctx.info[block->index].exec.back().first = cond; + ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact); + } + } else { + /* demote_if: transition to exact */ + transition_to_Exact(ctx, bld, block->index); + assert(instr->operands[0].isTemp()); + cond = instr->operands[0].getTemp(); + num = ctx.info[block->index].exec.size() - 1; + } + + for (int i = num; i >= 0; i--) { + if (ctx.info[block->index].exec[i].second & mask_type_exact) { + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == (int)ctx.info[block->index].exec.size() - 1) { + andn2->operands[0].setFixed(exec); + andn2->definitions[0].setFixed(exec); + } + + ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp(); + exit_cond = andn2->definitions[1].getTemp(); + } else { + assert(i != 0); + } + } + instr->opcode = aco_opcode::p_exit_early_if; + instr->operands[0] = bld.scc(exit_cond); + state = Exact; + + } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) { + bool need_check = ctx.info[block->index].exec.size() != 1 && + !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact); + lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first); + } + + bld.insert(std::move(instr)); + } +} + +void add_branch_code(exec_ctx& ctx, Block* block) +{ + unsigned idx = block->index; + Builder bld(ctx.program, block); + + if (idx == ctx.program->blocks.size() - 1) + return; + + /* try to disable wqm handling */ + if (ctx.handle_wqm && block->kind & block_kind_top_level) { + if (ctx.info[idx].exec.size() == 3) { + assert(ctx.info[idx].exec[1].second == mask_type_wqm); + ctx.info[idx].exec.pop_back(); + } + assert(ctx.info[idx].exec.size() <= 2); + + if (ctx.info[idx].ever_again_needs == 0 || + ctx.info[idx].ever_again_needs == Exact) { + /* transition to Exact */ + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + bld.insert(std::move(branch)); + ctx.handle_wqm = false; + + } else if (ctx.info[idx].block_needs & Preserve_WQM) { + /* transition to WQM and remove global flag */ + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_WQM(ctx, bld, idx); + ctx.info[idx].exec.back().second &= ~mask_type_global; + bld.insert(std::move(branch)); + } + } + + if (block->kind & block_kind_loop_preheader) { + /* collect information about the succeeding loop */ + bool has_divergent_break = false; + bool has_divergent_continue = false; + bool has_discard = false; + uint8_t needs = 0; + unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth; + + for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) { + Block& loop_block = ctx.program->blocks[i]; + needs |= ctx.info[i].block_needs; + + if (loop_block.kind & block_kind_uses_discard_if || + loop_block.kind & block_kind_discard || + loop_block.kind & block_kind_uses_demote) + has_discard = true; + if (loop_block.loop_nest_depth != loop_nest_depth) + continue; + + if (loop_block.kind & block_kind_uniform) + continue; + else if (loop_block.kind & block_kind_break) + has_divergent_break = true; + else if (loop_block.kind & block_kind_continue) + has_divergent_continue = true; + } + + if (ctx.handle_wqm) { + if (needs & WQM) { + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_WQM(ctx, bld, idx); + bld.insert(std::move(branch)); + } else { + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_Exact(ctx, bld, idx); + bld.insert(std::move(branch)); + } + } + + unsigned num_exec_masks = ctx.info[idx].exec.size(); + if (block->kind & block_kind_top_level) + num_exec_masks = std::min(num_exec_masks, 2u); + + ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], + num_exec_masks, + needs, + has_divergent_break, + has_divergent_continue, + has_discard); + } + + if (block->kind & block_kind_discard) { + + assert(block->instructions.back()->format == Format::PSEUDO_BRANCH); + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + + /* create a discard_if() instruction with the exec mask as condition */ + unsigned num = 0; + if (ctx.loop.size()) { + /* if we're in a loop, only discard from the outer exec masks */ + num = ctx.loop.back().num_exec_masks; + } else { + num = ctx.info[idx].exec.size() - 1; + } + + Temp old_exec = ctx.info[idx].exec.back().first; + Temp new_exec = bld.tmp(bld.lm); + Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), + bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); + ctx.info[idx].exec.back().first = new_exec; + + for (int i = num - 1; i >= 0; i--) { + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == (int)ctx.info[idx].exec.size() - 1) + andn2->definitions[0].setFixed(exec); + if (i == 0) + bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp())); + ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp(); + } + assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); + + if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break) + ctx.info[idx].exec.back().first = cond; + bld.insert(std::move(branch)); + /* no return here as it can be followed by a divergent break */ + } + + if (block->kind & block_kind_continue_or_break) { + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit); + assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + bool need_parallelcopy = false; + while (!(ctx.info[idx].exec.back().second & mask_type_loop)) { + ctx.info[idx].exec.pop_back(); + need_parallelcopy = true; + } + + if (need_parallelcopy) + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first); + bld.branch(aco_opcode::p_cbranch_nz, bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_uniform) { + Pseudo_branch_instruction* branch = static_cast(block->instructions.back().get()); + if (branch->opcode == aco_opcode::p_branch) { + branch->target[0] = block->linear_succs[0]; + } else { + branch->target[0] = block->linear_succs[1]; + branch->target[1] = block->linear_succs[0]; + } + return; + } + + if (block->kind & block_kind_branch) { + + if (ctx.handle_wqm && + ctx.info[idx].exec.size() >= 2 && + ctx.info[idx].exec.back().second == mask_type_exact && + !(ctx.info[idx].block_needs & Exact_Branch) && + ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) { + /* return to wqm before branching */ + ctx.info[idx].exec.pop_back(); + } + + // orig = s_and_saveexec_b64 + assert(block->linear_succs.size() == 2); + assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z); + Temp cond = block->instructions.back()->operands[0].getTemp(); + block->instructions.pop_back(); + + if (ctx.info[idx].block_needs & Exact_Branch) + transition_to_Exact(ctx, bld, idx); + + Temp current_exec = ctx.info[idx].exec.back().first; + uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); + + Temp then_mask = bld.tmp(bld.lm); + Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), + bld.exec(Definition(then_mask)), cond, bld.exec(current_exec)); + + ctx.info[idx].exec.back().first = old_exec; + + /* add next current exec to the stack */ + ctx.info[idx].exec.emplace_back(then_mask, mask_type); + + bld.branch(aco_opcode::p_cbranch_z, bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_invert) { + // exec = s_andn2_b64 (original_exec, exec) + assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz); + block->instructions.pop_back(); + Temp then_mask = ctx.info[idx].exec.back().first; + uint8_t mask_type = ctx.info[idx].exec.back().second; + ctx.info[idx].exec.pop_back(); + Temp orig_exec = ctx.info[idx].exec.back().first; + Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec), + bld.def(s1, scc), orig_exec, bld.exec(then_mask)); + + /* add next current exec to the stack */ + ctx.info[idx].exec.emplace_back(else_mask, mask_type); + + bld.branch(aco_opcode::p_cbranch_z, bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_break) { + // loop_mask = s_andn2_b64 (loop_mask, exec) + assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + Temp current_exec = ctx.info[idx].exec.back().first; + Temp cond = Temp(); + for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { + cond = bld.tmp(s1); + Temp exec_mask = ctx.info[idx].exec[exec_idx].first; + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), + exec_mask, bld.exec(current_exec)); + ctx.info[idx].exec[exec_idx].first = exec_mask; + if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) + break; + } + + /* check if the successor is the merge block, otherwise set exec to 0 */ + // TODO: this could be done better by directly branching to the merge block + unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; + Block& succ = ctx.program->blocks[succ_idx]; + if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); + } + + bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_continue) { + assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + Temp current_exec = ctx.info[idx].exec.back().first; + Temp cond = Temp(); + for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { + if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) + break; + cond = bld.tmp(s1); + Temp exec_mask = ctx.info[idx].exec[exec_idx].first; + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), + exec_mask, bld.exec(current_exec)); + ctx.info[idx].exec[exec_idx].first = exec_mask; + } + assert(cond != Temp()); + + /* check if the successor is the merge block, otherwise set exec to 0 */ + // TODO: this could be done better by directly branching to the merge block + unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; + Block& succ = ctx.program->blocks[succ_idx]; + if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); + } + + bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + return; + } +} + +void process_block(exec_ctx& ctx, Block* block) +{ + std::vector> instructions; + instructions.reserve(block->instructions.size()); + + unsigned idx = add_coupling_code(ctx, block, instructions); + + assert(block->index != ctx.program->blocks.size() - 1 || + ctx.info[block->index].exec.size() <= 2); + + process_instructions(ctx, block, instructions, idx); + + block->instructions = std::move(instructions); + + add_branch_code(ctx, block); + + block->live_out_exec = ctx.info[block->index].exec.back().first; +} + +} /* end namespace */ + + +void insert_exec_mask(Program *program) +{ + exec_ctx ctx(program); + + if (program->needs_wqm && program->needs_exact) + calculate_wqm_needs(ctx); + + for (Block& block : program->blocks) + process_block(ctx, &block); + +} + +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_insert_NOPs.cpp mesa-20.0.8/src/amd/compiler/aco_insert_NOPs.cpp --- mesa-19.2.8/src/amd/compiler/aco_insert_NOPs.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_insert_NOPs.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,653 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include + +#include "aco_ir.h" +#include + +namespace aco { +namespace { + +struct NOP_ctx_gfx8_9 { + enum chip_class chip_class; + unsigned vcc_physical; + + /* just initialize these with something less than max NOPs */ + int VALU_wrexec = -10; + int VALU_wrvcc = -10; + int VALU_wrsgpr = -10; + + NOP_ctx_gfx8_9(Program* program) : chip_class(program->chip_class) { + vcc_physical = program->config->num_sgprs - 2; + } +}; + +struct NOP_ctx_gfx10 { + bool has_VOPC = false; + bool has_nonVALU_exec_read = false; + bool has_VMEM = false; + bool has_branch_after_VMEM = false; + bool has_DS = false; + bool has_branch_after_DS = false; + std::bitset<128> sgprs_read_by_VMEM; + std::bitset<128> sgprs_read_by_SMEM; + + void join(const NOP_ctx_gfx10 &other) { + has_VOPC |= other.has_VOPC; + has_nonVALU_exec_read |= other.has_nonVALU_exec_read; + has_VMEM |= other.has_VMEM; + has_branch_after_VMEM |= other.has_branch_after_VMEM; + has_DS |= other.has_DS; + has_branch_after_DS |= other.has_branch_after_DS; + sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; + sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; + } + + bool operator==(const NOP_ctx_gfx10 &other) + { + return + has_VOPC == other.has_VOPC && + has_nonVALU_exec_read == other.has_nonVALU_exec_read && + has_VMEM == other.has_VMEM && + has_branch_after_VMEM == other.has_branch_after_VMEM && + has_DS == other.has_DS && + has_branch_after_DS == other.has_branch_after_DS && + sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && + sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; + } +}; + +template +bool check_written_regs(const aco_ptr &instr, const std::bitset &check_regs) +{ + return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool { + bool writes_any = false; + for (unsigned i = 0; i < def.size(); i++) { + unsigned def_reg = def.physReg() + i; + writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; + } + return writes_any; + }); +} + +template +void mark_read_regs(const aco_ptr &instr, std::bitset ®_reads) +{ + for (const Operand &op : instr->operands) { + for (unsigned i = 0; i < op.size(); i++) { + unsigned reg = op.physReg() + i; + if (reg < reg_reads.size()) + reg_reads.set(reg); + } + } +} + +bool VALU_writes_sgpr(aco_ptr& instr) +{ + if ((uint32_t) instr->format & (uint32_t) Format::VOPC) + return true; + if (instr->isVOP3() && instr->definitions.size() == 2) + return true; + if (instr->opcode == aco_opcode::v_readfirstlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64) + return true; + return false; +} + +bool instr_writes_exec(const aco_ptr& instr) +{ + return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { + return def.physReg() == exec_lo || def.physReg() == exec_hi; + }); +} + +bool instr_writes_sgpr(const aco_ptr& instr) +{ + return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { + return def.getTemp().type() == RegType::sgpr; + }); +} + +inline bool instr_is_branch(const aco_ptr& instr) +{ + return instr->opcode == aco_opcode::s_branch || + instr->opcode == aco_opcode::s_cbranch_scc0 || + instr->opcode == aco_opcode::s_cbranch_scc1 || + instr->opcode == aco_opcode::s_cbranch_vccz || + instr->opcode == aco_opcode::s_cbranch_vccnz || + instr->opcode == aco_opcode::s_cbranch_execz || + instr->opcode == aco_opcode::s_cbranch_execnz || + instr->opcode == aco_opcode::s_cbranch_cdbgsys || + instr->opcode == aco_opcode::s_cbranch_cdbguser || + instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user || + instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || + instr->opcode == aco_opcode::s_subvector_loop_begin || + instr->opcode == aco_opcode::s_subvector_loop_end || + instr->opcode == aco_opcode::s_setpc_b64 || + instr->opcode == aco_opcode::s_swappc_b64 || + instr->opcode == aco_opcode::s_getpc_b64 || + instr->opcode == aco_opcode::s_call_b64; +} + +bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) +{ + return a_reg > b_reg ? + (a_reg - b_reg < b_size) : + (b_reg - a_reg < a_size); +} + +unsigned handle_SMEM_clause(aco_ptr& instr, int new_idx, + std::vector>& new_instructions) +{ + //TODO: s_dcache_inv needs to be in it's own group on GFX10 (and previous versions?) + const bool is_store = instr->definitions.empty(); + for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) { + aco_ptr& pred = new_instructions[pred_idx]; + if (pred->format != Format::SMEM) + break; + + /* Don't allow clauses with store instructions since the clause's + * instructions may use the same address. */ + if (is_store || pred->definitions.empty()) + return 1; + + Definition& instr_def = instr->definitions[0]; + Definition& pred_def = pred->definitions[0]; + + /* ISA reference doesn't say anything about this, but best to be safe */ + if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size())) + return 1; + + for (const Operand& op : pred->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size())) + return 1; + } + for (const Operand& op : instr->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + return 1; + } + } + + return 0; +} + +int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr& instr, + std::vector>& old_instructions, + std::vector>& new_instructions) +{ + int new_idx = new_instructions.size(); + + // TODO: setreg / getreg / m0 writes + // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles + + + if (instr->format == Format::SMEM) { + if (ctx.chip_class == GFX6) { + bool is_buffer_load = instr->operands.size() && instr->operands[0].size() > 2; + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) { + aco_ptr& pred = new_instructions[pred_idx]; + /* A read of an SGPR by SMRD instruction requires 4 wait states + * when the SGPR was written by a VALU instruction. */ + if (VALU_writes_sgpr(pred)) { + Definition pred_def = pred->definitions[pred->definitions.size() - 1]; + for (const Operand& op : instr->operands) { + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + return 4 + pred_idx - new_idx + 1; + } + } + /* According to LLVM, this is an undocumented hardware behavior */ + if (is_buffer_load && pred->isSALU() && pred->definitions.size()) { + Definition pred_def = pred->definitions[0]; + Operand& op = instr->operands[0]; + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + return 4 + pred_idx - new_idx + 1; + } + } + } + + /* break off from prevous SMEM clause if needed */ + return handle_SMEM_clause(instr, new_idx, new_instructions); + + } else if (instr->isVALU() || instr->format == Format::VINTRP) { + int NOPs = 0; + + if (instr->isDPP()) { + /* VALU does not forward EXEC to DPP. */ + if (ctx.VALU_wrexec + 5 >= new_idx) + NOPs = 5 + ctx.VALU_wrexec - new_idx + 1; + + /* VALU DPP reads VGPR written by VALU */ + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) { + aco_ptr& pred = new_instructions[pred_idx]; + if ((pred->isVALU() || pred->format == Format::VINTRP) && + !pred->definitions.empty() && + pred->definitions[0].physReg() == instr->operands[0].physReg()) { + NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1); + break; + } + } + } + + /* SALU writes M0 */ + if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) { + aco_ptr& pred = new_instructions.back(); + if (pred->isSALU() && + !pred->definitions.empty() && + pred->definitions[0].physReg() == m0) + NOPs = std::max(NOPs, 1); + } + + for (const Operand& op : instr->operands) { + /* VALU which uses VCCZ */ + if (op.physReg() == PhysReg{251} && + ctx.VALU_wrvcc + 5 >= new_idx) + NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1); + + /* VALU which uses EXECZ */ + if (op.physReg() == PhysReg{252} && + ctx.VALU_wrexec + 5 >= new_idx) + NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1); + + /* VALU which reads VCC as a constant */ + if (ctx.VALU_wrvcc + 1 >= new_idx) { + for (unsigned k = 0; k < op.size(); k++) { + unsigned reg = op.physReg() + k; + if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1) + NOPs = std::max(NOPs, 1); + } + } + } + + switch (instr->opcode) { + case aco_opcode::v_readlane_b32: + case aco_opcode::v_readlane_b32_e64: + case aco_opcode::v_writelane_b32: + case aco_opcode::v_writelane_b32_e64: { + if (ctx.VALU_wrsgpr + 4 < new_idx) + break; + PhysReg reg = instr->operands[1].physReg(); + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) { + aco_ptr& pred = new_instructions[pred_idx]; + if (!pred->isVALU() || !VALU_writes_sgpr(pred)) + continue; + for (const Definition& def : pred->definitions) { + if (def.physReg() == reg) + NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1); + } + } + break; + } + case aco_opcode::v_div_fmas_f32: + case aco_opcode::v_div_fmas_f64: { + if (ctx.VALU_wrvcc + 4 >= new_idx) + NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1); + break; + } + default: + break; + } + + /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */ + // FIXME: handle case if the last instruction of a block without branch is such store + if (new_idx > 0) { + aco_ptr& pred = new_instructions.back(); + /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ + bool consider_buf = (pred->format == Format::MUBUF || pred->format == Format::MTBUF) && + pred->operands.size() == 4 && + pred->operands[3].size() > 2 && + pred->operands[2].physReg() >= 128; + /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */ + bool consider_mimg = pred->format == Format::MIMG && + pred->operands[1].regClass().type() == RegType::vgpr && + pred->operands[1].size() > 2 && + pred->operands[0].size() == 4; + /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ + bool consider_flat = (pred->isFlatOrGlobal() || pred->format == Format::SCRATCH) && + pred->operands.size() == 3 && + pred->operands[2].size() > 2; + if (consider_buf || consider_mimg || consider_flat) { + PhysReg wrdata = pred->operands[consider_flat ? 2 : 3].physReg(); + unsigned size = pred->operands[consider_flat ? 2 : 3].size(); + assert(wrdata >= 256); + for (const Definition& def : instr->definitions) { + if (regs_intersect(def.physReg(), def.size(), wrdata, size)) + NOPs = std::max(NOPs, 1); + } + } + } + + if (VALU_writes_sgpr(instr)) { + for (const Definition& def : instr->definitions) { + if (def.physReg() == vcc) + ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1; + else if (def.physReg() == exec) + ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1; + else if (def.physReg() <= 102) + ctx.VALU_wrsgpr = NOPs ? new_idx : new_idx + 1; + } + } + + /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* + * is followed by a read with v_readfirstlane or v_readlane to fix GPU + * hangs on GFX6. Note that v_writelane_* is apparently not affected. + * This hazard isn't documented anywhere but AMD confirmed that hazard. + */ + if (ctx.chip_class == GFX6 && + !new_instructions.empty() && + (instr->opcode == aco_opcode::v_readfirstlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32)) { + aco_ptr& pred = new_instructions.back(); + if (pred->format == Format::VINTRP) { + Definition pred_def = pred->definitions[0]; + Operand& op = instr->operands[0]; + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + NOPs = std::max(NOPs, 1); + } + } + return NOPs; + } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) { + /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) { + aco_ptr& pred = new_instructions[pred_idx]; + // TODO: break if something else writes the SGPR + if (!(pred->isVALU() && VALU_writes_sgpr(pred))) + continue; + + for (const Definition& def : pred->definitions) { + if (def.physReg() > 102) + continue; + + for (const Operand& op : instr->operands) { + if (regs_intersect(op.physReg(), op.size(), def.physReg(), def.size())) + return 5 + pred_idx - new_idx + 1; + + } + } + } + } else if (instr->format == Format::SOPP) { + if (instr->opcode == aco_opcode::s_sendmsg && new_idx > 0) { + aco_ptr& pred = new_instructions.back(); + if (pred->isSALU() && + !pred->definitions.empty() && + pred->definitions[0].physReg() == m0) + return 1; + } + } + + return 0; +} + +void handle_block_gfx8_9(NOP_ctx_gfx8_9& ctx, Block& block) +{ + std::vector> instructions; + instructions.reserve(block.instructions.size()); + for (unsigned i = 0; i < block.instructions.size(); i++) { + aco_ptr& instr = block.instructions[i]; + unsigned NOPs = handle_instruction_gfx8_9(ctx, instr, block.instructions, instructions); + if (NOPs) { + // TODO: try to move the instruction down + /* create NOP */ + aco_ptr nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)}; + nop->imm = NOPs - 1; + nop->block = -1; + instructions.emplace_back(std::move(nop)); + } + + instructions.emplace_back(std::move(instr)); + } + + ctx.VALU_wrvcc -= instructions.size(); + ctx.VALU_wrexec -= instructions.size(); + ctx.VALU_wrsgpr -= instructions.size(); + block.instructions = std::move(instructions); +} + +void insert_NOPs_gfx8_9(Program* program) +{ + NOP_ctx_gfx8_9 ctx(program); + + for (Block& block : program->blocks) { + if (block.instructions.empty()) + continue; + + handle_block_gfx8_9(ctx, block); + } +} + +void handle_instruction_gfx10(Program *program, NOP_ctx_gfx10 &ctx, aco_ptr& instr, + std::vector>& old_instructions, + std::vector>& new_instructions) +{ + /* VMEMtoScalarWriteHazard + * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between. + */ + if (instr->isVMEM() || instr->format == Format::FLAT || instr->format == Format::GLOBAL || + instr->format == Format::SCRATCH || instr->format == Format::DS) { + /* Remember all SGPRs that are read by the VMEM instruction */ + mark_read_regs(instr, ctx.sgprs_read_by_VMEM); + ctx.sgprs_read_by_VMEM.set(exec); + if (program->wave_size == 64) + ctx.sgprs_read_by_VMEM.set(exec_hi); + } else if (instr->isSALU() || instr->format == Format::SMEM) { + /* Check if SALU writes an SGPR that was previously read by the VALU */ + if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) { + ctx.sgprs_read_by_VMEM.reset(); + + /* Insert v_nop to mitigate the problem */ + aco_ptr nop{create_instruction(aco_opcode::v_nop, Format::VOP1, 0, 0)}; + new_instructions.emplace_back(std::move(nop)); + } + } else if (instr->opcode == aco_opcode::s_waitcnt) { + /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */ + uint16_t imm = static_cast(instr.get())->imm; + unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); + if (vmcnt == 0) + ctx.sgprs_read_by_VMEM.reset(); + } else if (instr->isVALU()) { + /* Hazard is mitigated by any VALU instruction */ + ctx.sgprs_read_by_VMEM.reset(); + } + + /* VcmpxPermlaneHazard + * Handle any permlane following a VOPC instruction, insert v_mov between them. + */ + if (instr->format == Format::VOPC) { + ctx.has_VOPC = true; + } else if (ctx.has_VOPC && + (instr->opcode == aco_opcode::v_permlane16_b32 || + instr->opcode == aco_opcode::v_permlanex16_b32)) { + ctx.has_VOPC = false; + + /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ + aco_ptr v_mov{create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; + v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); + v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); + new_instructions.emplace_back(std::move(v_mov)); + } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) { + ctx.has_VOPC = false; + } + + /* VcmpxExecWARHazard + * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction. + */ + if (!instr->isVALU() && instr->reads_exec()) { + ctx.has_nonVALU_exec_read = true; + } else if (instr->isVALU()) { + if (instr_writes_exec(instr)) { + ctx.has_nonVALU_exec_read = false; + + /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ + aco_ptr depctr{create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; + depctr->imm = 0xfffe; + depctr->block = -1; + new_instructions.emplace_back(std::move(depctr)); + } else if (instr_writes_sgpr(instr)) { + /* Any VALU instruction that writes an SGPR mitigates the problem */ + ctx.has_nonVALU_exec_read = false; + } + } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { + /* s_waitcnt_depctr can mitigate the problem if it has a magic imm */ + const SOPP_instruction *sopp = static_cast(instr.get()); + if ((sopp->imm & 0xfffe) == 0xfffe) + ctx.has_nonVALU_exec_read = false; + } + + /* SMEMtoVectorWriteHazard + * Handle any VALU instruction writing an SGPR after an SMEM reads it. + */ + if (instr->format == Format::SMEM) { + /* Remember all SGPRs that are read by the SMEM instruction */ + mark_read_regs(instr, ctx.sgprs_read_by_SMEM); + } else if (VALU_writes_sgpr(instr)) { + /* Check if VALU writes an SGPR that was previously read by SMEM */ + if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) { + ctx.sgprs_read_by_SMEM.reset(); + + /* Insert s_mov to mitigate the problem */ + aco_ptr s_mov{create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; + s_mov->definitions[0] = Definition(sgpr_null, s1); + s_mov->operands[0] = Operand(0u); + new_instructions.emplace_back(std::move(s_mov)); + } + } else if (instr->isSALU()) { + if (instr->format != Format::SOPP) { + /* SALU can mitigate the hazard */ + ctx.sgprs_read_by_SMEM.reset(); + } else { + /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ + const SOPP_instruction *sopp = static_cast(instr.get()); + if (sopp->opcode == aco_opcode::s_waitcnt_lgkmcnt) { + if (sopp->imm == 0 && sopp->definitions[0].physReg() == sgpr_null) + ctx.sgprs_read_by_SMEM.reset(); + } else if (sopp->opcode == aco_opcode::s_waitcnt) { + unsigned lgkm = (sopp->imm >> 8) & 0x3f; + if (lgkm == 0) + ctx.sgprs_read_by_SMEM.reset(); + } + } + } + + /* LdsBranchVmemWARHazard + * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns. + */ + if (instr->isVMEM() || instr->format == Format::GLOBAL || instr->format == Format::SCRATCH) { + ctx.has_VMEM = true; + ctx.has_branch_after_VMEM = false; + /* Mitigation for DS is needed only if there was already a branch after */ + ctx.has_DS = ctx.has_branch_after_DS; + } else if (instr->format == Format::DS) { + ctx.has_DS = true; + ctx.has_branch_after_DS = false; + /* Mitigation for VMEM is needed only if there was already a branch after */ + ctx.has_VMEM = ctx.has_branch_after_VMEM; + } else if (instr_is_branch(instr)) { + ctx.has_branch_after_VMEM = ctx.has_VMEM; + ctx.has_branch_after_DS = ctx.has_DS; + } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { + /* Only s_waitcnt_vscnt can mitigate the hazard */ + const SOPK_instruction *sopk = static_cast(instr.get()); + if (sopk->definitions[0].physReg() == sgpr_null && sopk->imm == 0) + ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; + } + if ((ctx.has_VMEM && ctx.has_branch_after_DS) || (ctx.has_DS && ctx.has_branch_after_VMEM)) { + ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; + + /* Insert s_waitcnt_vscnt to mitigate the problem */ + aco_ptr wait{create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; + wait->definitions[0] = Definition(sgpr_null, s1); + wait->imm = 0; + new_instructions.emplace_back(std::move(wait)); + } +} + +void handle_block_gfx10(Program *program, NOP_ctx_gfx10& ctx, Block& block) +{ + if (block.instructions.empty()) + return; + + std::vector> instructions; + instructions.reserve(block.instructions.size()); + + for (aco_ptr& instr : block.instructions) { + handle_instruction_gfx10(program, ctx, instr, block.instructions, instructions); + instructions.emplace_back(std::move(instr)); + } + + block.instructions = std::move(instructions); +} + +void mitigate_hazards_gfx10(Program *program) +{ + NOP_ctx_gfx10 all_ctx[program->blocks.size()]; + std::stack loop_header_indices; + + for (unsigned i = 0; i < program->blocks.size(); i++) { + Block& block = program->blocks[i]; + NOP_ctx_gfx10 &ctx = all_ctx[i]; + + if (block.kind & block_kind_loop_header) { + loop_header_indices.push(i); + } else if (block.kind & block_kind_loop_exit) { + /* Go through the whole loop again */ + for (unsigned idx = loop_header_indices.top(); idx < i; idx++) { + NOP_ctx_gfx10 loop_block_ctx; + for (unsigned b : program->blocks[idx].linear_preds) + loop_block_ctx.join(all_ctx[b]); + + handle_block_gfx10(program, loop_block_ctx, program->blocks[idx]); + + /* We only need to continue if the loop header context changed */ + if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx]) + break; + + all_ctx[idx] = loop_block_ctx; + } + + loop_header_indices.pop(); + } + + for (unsigned b : block.linear_preds) + ctx.join(all_ctx[b]); + + handle_block_gfx10(program, ctx, block); + } +} + +} /* end namespace */ + +void insert_NOPs(Program* program) +{ + if (program->chip_class >= GFX10) + mitigate_hazards_gfx10(program); + else + insert_NOPs_gfx8_9(program); +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_insert_waitcnt.cpp mesa-20.0.8/src/amd/compiler/aco_insert_waitcnt.cpp --- mesa-19.2.8/src/amd/compiler/aco_insert_waitcnt.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_insert_waitcnt.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,845 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include +#include +#include + +#include "aco_ir.h" +#include "vulkan/radv_shader.h" + +namespace aco { + +namespace { + +/** + * The general idea of this pass is: + * The CFG is traversed in reverse postorder (forward) and loops are processed + * several times until no progress is made. + * Per BB two wait_ctx is maintained: an in-context and out-context. + * The in-context is the joined out-contexts of the predecessors. + * The context contains a map: gpr -> wait_entry + * consisting of the information about the cnt values to be waited for. + * Note: After merge-nodes, it might occur that for the same register + * multiple cnt values are to be waited for. + * + * The values are updated according to the encountered instructions: + * - additional events increment the counter of waits of the same type + * - or erase gprs with counters higher than to be waited for. + */ + +// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load + +/* Instructions of the same event will finish in-order except for smem + * and maybe flat. Instructions of different events may not finish in-order. */ +enum wait_event : uint16_t { + event_smem = 1 << 0, + event_lds = 1 << 1, + event_gds = 1 << 2, + event_vmem = 1 << 3, + event_vmem_store = 1 << 4, /* GFX10+ */ + event_flat = 1 << 5, + event_exp_pos = 1 << 6, + event_exp_param = 1 << 7, + event_exp_mrt_null = 1 << 8, + event_gds_gpr_lock = 1 << 9, + event_vmem_gpr_lock = 1 << 10, + event_sendmsg = 1 << 11, +}; + +enum counter_type : uint8_t { + counter_exp = 1 << 0, + counter_lgkm = 1 << 1, + counter_vm = 1 << 2, + counter_vs = 1 << 3, +}; + +static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; +static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; +static const uint16_t vm_events = event_vmem | event_flat; +static const uint16_t vs_events = event_vmem_store; + +uint8_t get_counters_for_event(wait_event ev) +{ + switch (ev) { + case event_smem: + case event_lds: + case event_gds: + case event_sendmsg: + return counter_lgkm; + case event_vmem: + return counter_vm; + case event_vmem_store: + return counter_vs; + case event_flat: + return counter_vm | counter_lgkm; + case event_exp_pos: + case event_exp_param: + case event_exp_mrt_null: + case event_gds_gpr_lock: + case event_vmem_gpr_lock: + return counter_exp; + default: + return 0; + } +} + +struct wait_imm { + static const uint8_t unset_counter = 0xff; + + uint8_t vm; + uint8_t exp; + uint8_t lgkm; + uint8_t vs; + + wait_imm() : + vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {} + wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : + vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + + wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) + { + vm = packed & 0xf; + if (chip >= GFX9) + vm |= (packed >> 10) & 0x30; + + exp = (packed >> 4) & 0x7; + + lgkm = (packed >> 8) & 0xf; + if (chip >= GFX10) + lgkm |= (packed >> 8) & 0x30; + } + + uint16_t pack(enum chip_class chip) const + { + uint16_t imm = 0; + assert(exp == unset_counter || exp <= 0x7); + switch (chip) { + case GFX10: + assert(lgkm == unset_counter || lgkm <= 0x3f); + assert(vm == unset_counter || vm <= 0x3f); + imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + case GFX9: + assert(lgkm == unset_counter || lgkm <= 0xf); + assert(vm == unset_counter || vm <= 0x3f); + imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + default: + assert(lgkm == unset_counter || lgkm <= 0xf); + assert(vm == unset_counter || vm <= 0xf); + imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + } + if (chip < GFX9 && vm == wait_imm::unset_counter) + imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */ + if (chip < GFX10 && lgkm == wait_imm::unset_counter) + imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */ + return imm; + } + + bool combine(const wait_imm& other) + { + bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; + vm = std::min(vm, other.vm); + exp = std::min(exp, other.exp); + lgkm = std::min(lgkm, other.lgkm); + vs = std::min(vs, other.vs); + return changed; + } + + bool empty() const + { + return vm == unset_counter && exp == unset_counter && + lgkm == unset_counter && vs == unset_counter; + } +}; + +struct wait_entry { + wait_imm imm; + uint16_t events; /* use wait_event notion */ + uint8_t counters; /* use counter_type notion */ + bool wait_on_read:1; + bool logical:1; + + wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read) + : imm(imm), events(event), counters(get_counters_for_event(event)), + wait_on_read(wait_on_read), logical(logical) {} + + bool join(const wait_entry& other) + { + bool changed = (other.events & ~events) || + (other.counters & ~counters) || + (other.wait_on_read && !wait_on_read); + events |= other.events; + counters |= other.counters; + changed |= imm.combine(other.imm); + wait_on_read = wait_on_read || other.wait_on_read; + assert(logical == other.logical); + return changed; + } + + void remove_counter(counter_type counter) + { + counters &= ~counter; + + if (counter == counter_lgkm) { + imm.lgkm = wait_imm::unset_counter; + events &= ~(event_smem | event_lds | event_gds | event_sendmsg); + } + + if (counter == counter_vm) { + imm.vm = wait_imm::unset_counter; + events &= ~event_vmem; + } + + if (counter == counter_exp) { + imm.exp = wait_imm::unset_counter; + events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock); + } + + if (counter == counter_vs) { + imm.vs = wait_imm::unset_counter; + events &= ~event_vmem_store; + } + + if (!(counters & counter_lgkm) && !(counters & counter_vm)) + events &= ~event_flat; + } +}; + +struct wait_ctx { + Program *program; + enum chip_class chip_class; + uint16_t max_vm_cnt; + uint16_t max_exp_cnt; + uint16_t max_lgkm_cnt; + uint16_t max_vs_cnt; + uint16_t unordered_events = event_smem | event_flat; + + uint8_t vm_cnt = 0; + uint8_t exp_cnt = 0; + uint8_t lgkm_cnt = 0; + uint8_t vs_cnt = 0; + bool pending_flat_lgkm = false; + bool pending_flat_vm = false; + bool pending_s_buffer_store = false; /* GFX10 workaround */ + + wait_imm barrier_imm[barrier_count]; + uint16_t barrier_events[barrier_count]; /* use wait_event notion */ + + std::map gpr_map; + + wait_ctx() {} + wait_ctx(Program *program_) + : program(program_), + chip_class(program_->chip_class), + max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), + max_exp_cnt(6), + max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), + max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), + unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {} + + bool join(const wait_ctx* other, bool logical) + { + bool changed = other->exp_cnt > exp_cnt || + other->vm_cnt > vm_cnt || + other->lgkm_cnt > lgkm_cnt || + other->vs_cnt > vs_cnt || + (other->pending_flat_lgkm && !pending_flat_lgkm) || + (other->pending_flat_vm && !pending_flat_vm); + + exp_cnt = std::max(exp_cnt, other->exp_cnt); + vm_cnt = std::max(vm_cnt, other->vm_cnt); + lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt); + vs_cnt = std::max(vs_cnt, other->vs_cnt); + pending_flat_lgkm |= other->pending_flat_lgkm; + pending_flat_vm |= other->pending_flat_vm; + pending_s_buffer_store |= other->pending_s_buffer_store; + + for (std::pair entry : other->gpr_map) + { + std::map::iterator it = gpr_map.find(entry.first); + if (entry.second.logical != logical) + continue; + + if (it != gpr_map.end()) { + changed |= it->second.join(entry.second); + } else { + gpr_map.insert(entry); + changed = true; + } + } + + for (unsigned i = 0; i < barrier_count; i++) { + changed |= barrier_imm[i].combine(other->barrier_imm[i]); + changed |= other->barrier_events[i] & ~barrier_events[i]; + barrier_events[i] |= other->barrier_events[i]; + } + + return changed; + } +}; + +wait_imm check_instr(Instruction* instr, wait_ctx& ctx) +{ + wait_imm wait; + + for (const Operand op : instr->operands) { + if (op.isConstant() || op.isUndefined()) + continue; + + /* check consecutively read gprs */ + for (unsigned j = 0; j < op.size(); j++) { + PhysReg reg{op.physReg() + j}; + std::map::iterator it = ctx.gpr_map.find(reg); + if (it == ctx.gpr_map.end() || !it->second.wait_on_read) + continue; + + wait.combine(it->second.imm); + } + } + + for (const Definition& def : instr->definitions) { + /* check consecutively written gprs */ + for (unsigned j = 0; j < def.getTemp().size(); j++) + { + PhysReg reg{def.physReg() + j}; + + std::map::iterator it = ctx.gpr_map.find(reg); + if (it == ctx.gpr_map.end()) + continue; + + /* Vector Memory reads and writes return in the order they were issued */ + if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) { + it->second.remove_counter(counter_vm); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + continue; + } + + /* LDS reads and writes return in the order they were issued. same for GDS */ + if (instr->format == Format::DS) { + bool gds = static_cast(instr)->gds; + if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) { + it->second.remove_counter(counter_lgkm); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + continue; + } + } + + wait.combine(it->second.imm); + } + } + + return wait; +} + +wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_waitcnt_vscnt && + instr->definitions[0].physReg() == sgpr_null) { + wait_imm imm; + imm.vs = std::min(imm.vs, static_cast(instr)->imm); + return imm; + } else if (instr->opcode == aco_opcode::s_waitcnt) { + return wait_imm(ctx.chip_class, static_cast(instr)->imm); + } + return wait_imm(); +} + +wait_imm kill(Instruction* instr, wait_ctx& ctx) +{ + wait_imm imm; + if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) + imm.combine(check_instr(instr, ctx)); + + imm.combine(parse_wait_instr(ctx, instr)); + + + /* It's required to wait for scalar stores before "writing back" data. + * It shouldn't cost anything anyways since we're about to do s_endpgm. + */ + if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) { + assert(ctx.chip_class >= GFX8); + imm.lgkm = 0; + } + + if (ctx.chip_class >= GFX10 && instr->format == Format::SMEM) { + /* GFX10: A store followed by a load at the same address causes a problem because + * the load doesn't load the correct values unless we wait for the store first. + * This is NOT mitigated by an s_nop. + * + * TODO: Refine this when we have proper alias analysis. + */ + SMEM_instruction *smem = static_cast(instr); + if (ctx.pending_s_buffer_store && + !smem->definitions.empty() && + !smem->can_reorder && smem->barrier == barrier_buffer) { + imm.lgkm = 0; + } + } + + if (instr->format == Format::PSEUDO_BARRIER) { + uint32_t workgroup_size = UINT32_MAX; + if (ctx.program->stage & sw_cs) { + unsigned* bsize = ctx.program->info->cs.block_size; + workgroup_size = bsize[0] * bsize[1] * bsize[2]; + } + switch (instr->opcode) { + case aco_opcode::p_memory_barrier_common: + imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); + imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); + imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); + if (workgroup_size > ctx.program->wave_size) + imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); + break; + case aco_opcode::p_memory_barrier_atomic: + imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); + break; + /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */ + case aco_opcode::p_memory_barrier_buffer: + case aco_opcode::p_memory_barrier_image: + imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); + imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); + break; + case aco_opcode::p_memory_barrier_shared: + if (workgroup_size > ctx.program->wave_size) + imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); + break; + case aco_opcode::p_memory_barrier_gs_data: + imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]); + break; + case aco_opcode::p_memory_barrier_gs_sendmsg: + imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]); + break; + default: + assert(false); + break; + } + } + + if (!imm.empty()) { + if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) + imm.vm = 0; + if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter) + imm.lgkm = 0; + + /* reset counters */ + ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp); + ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm); + ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm); + ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs); + + /* update barrier wait imms */ + for (unsigned i = 0; i < barrier_count; i++) { + wait_imm& bar = ctx.barrier_imm[i]; + uint16_t& bar_ev = ctx.barrier_events[i]; + if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) { + bar.exp = wait_imm::unset_counter; + bar_ev &= ~exp_events; + } + if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) { + bar.vm = wait_imm::unset_counter; + bar_ev &= ~(vm_events & ~event_flat); + } + if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) { + bar.lgkm = wait_imm::unset_counter; + bar_ev &= ~(lgkm_events & ~event_flat); + } + if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) { + bar.vs = wait_imm::unset_counter; + bar_ev &= ~vs_events; + } + if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter) + bar_ev &= ~event_flat; + } + + /* remove all gprs with higher counter from map */ + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) + { + if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) + it->second.remove_counter(counter_exp); + if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) + it->second.remove_counter(counter_vm); + if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm) + it->second.remove_counter(counter_lgkm); + if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) + it->second.remove_counter(counter_vs); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + else + it++; + } + } + + if (imm.vm == 0) + ctx.pending_flat_vm = false; + if (imm.lgkm == 0) { + ctx.pending_flat_lgkm = false; + ctx.pending_s_buffer_store = false; + } + + return imm; +} + +void update_barrier_counter(uint8_t *ctr, unsigned max) +{ + if (*ctr != wait_imm::unset_counter && *ctr < max) + (*ctr)++; +} + +void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier) +{ + for (unsigned i = 0; i < barrier_count; i++) { + wait_imm& bar = ctx.barrier_imm[i]; + uint16_t& bar_ev = ctx.barrier_events[i]; + if (barrier & (1 << i)) { + bar_ev |= event; + if (counters & counter_lgkm) + bar.lgkm = 0; + if (counters & counter_vm) + bar.vm = 0; + if (counters & counter_exp) + bar.exp = 0; + if (counters & counter_vs) + bar.vs = 0; + } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) { + if (counters & counter_lgkm && (bar_ev & lgkm_events) == event) + update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt); + if (counters & counter_vm && (bar_ev & vm_events) == event) + update_barrier_counter(&bar.vm, ctx.max_vm_cnt); + if (counters & counter_exp && (bar_ev & exp_events) == event) + update_barrier_counter(&bar.exp, ctx.max_exp_cnt); + if (counters & counter_vs && (bar_ev & vs_events) == event) + update_barrier_counter(&bar.vs, ctx.max_vs_cnt); + } + } +} + +void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none) +{ + uint8_t counters = get_counters_for_event(event); + + if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt) + ctx.lgkm_cnt++; + if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt) + ctx.vm_cnt++; + if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt) + ctx.exp_cnt++; + if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt) + ctx.vs_cnt++; + + update_barrier_imm(ctx, counters, event, barrier); + + if (ctx.unordered_events & event) + return; + + if (ctx.pending_flat_lgkm) + counters &= ~counter_lgkm; + if (ctx.pending_flat_vm) + counters &= ~counter_vm; + + for (std::pair& e : ctx.gpr_map) { + wait_entry& entry = e.second; + + if (entry.events & ctx.unordered_events) + continue; + + assert(entry.events); + + if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt) + entry.imm.exp++; + if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt) + entry.imm.lgkm++; + if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt) + entry.imm.vm++; + if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt) + entry.imm.vs++; + } +} + +void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none) +{ + assert(ctx.chip_class < GFX10); + + if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt) + ctx.lgkm_cnt++; + if (ctx.vm_cnt <= ctx.max_vm_cnt) + ctx.vm_cnt++; + + update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier); + + for (std::pair e : ctx.gpr_map) + { + if (e.second.counters & counter_vm) + e.second.imm.vm = 0; + if (e.second.counters & counter_lgkm) + e.second.imm.lgkm = 0; + } + ctx.pending_flat_lgkm = true; + ctx.pending_flat_vm = true; +} + +void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read) +{ + uint16_t counters = get_counters_for_event(event); + wait_imm imm; + if (counters & counter_lgkm) + imm.lgkm = 0; + if (counters & counter_vm) + imm.vm = 0; + if (counters & counter_exp) + imm.exp = 0; + if (counters & counter_vs) + imm.vs = 0; + + wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read); + + for (unsigned i = 0; i < rc.size(); i++) { + auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry); + if (!it.second) + it.first->second.join(new_entry); + } +} + +void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event) +{ + if (!op.isConstant() && !op.isUndefined()) + insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false); +} + +void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event) +{ + insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true); +} + +void gen(Instruction* instr, wait_ctx& ctx) +{ + switch (instr->format) { + case Format::EXP: { + Export_instruction* exp_instr = static_cast(instr); + + wait_event ev; + if (exp_instr->dest <= 9) + ev = event_exp_mrt_null; + else if (exp_instr->dest <= 15) + ev = event_exp_pos; + else + ev = event_exp_param; + update_counters(ctx, ev); + + /* insert new entries for exported vgprs */ + for (unsigned i = 0; i < 4; i++) + { + if (exp_instr->enabled_mask & (1 << i)) { + unsigned idx = exp_instr->compressed ? i >> 1 : i; + assert(idx < exp_instr->operands.size()); + insert_wait_entry(ctx, exp_instr->operands[idx], ev); + + } + } + insert_wait_entry(ctx, exec, s2, ev, false); + break; + } + case Format::FLAT: { + if (ctx.chip_class < GFX10 && !instr->definitions.empty()) + update_counters_for_flat_load(ctx, barrier_buffer); + else + update_counters(ctx, event_flat, barrier_buffer); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], event_flat); + break; + } + case Format::SMEM: { + SMEM_instruction *smem = static_cast(instr); + update_counters(ctx, event_smem, static_cast(instr)->barrier); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], event_smem); + else if (ctx.chip_class >= GFX10 && + !smem->can_reorder && + smem->barrier == barrier_buffer) + ctx.pending_s_buffer_store = true; + + break; + } + case Format::DS: { + bool gds = static_cast(instr)->gds; + update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared); + if (gds) + update_counters(ctx, event_gds_gpr_lock); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds); + + if (gds) { + for (const Operand& op : instr->operands) + insert_wait_entry(ctx, op, event_gds_gpr_lock); + insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false); + } + break; + } + case Format::MUBUF: + case Format::MTBUF: + case Format::MIMG: + case Format::GLOBAL: { + wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store; + update_counters(ctx, ev, get_barrier_interaction(instr)); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], ev); + + if (ctx.chip_class == GFX6 && + instr->format != Format::MIMG && + instr->operands.size() == 4) { + ctx.exp_cnt++; + update_counters(ctx, event_vmem_gpr_lock); + insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); + } else if (ctx.chip_class == GFX6 && + instr->format == Format::MIMG && + instr->operands[1].regClass().type() == RegType::vgpr) { + ctx.exp_cnt++; + update_counters(ctx, event_vmem_gpr_lock); + insert_wait_entry(ctx, instr->operands[1], event_vmem_gpr_lock); + } + + break; + } + case Format::SOPP: { + if (instr->opcode == aco_opcode::s_sendmsg || + instr->opcode == aco_opcode::s_sendmsghalt) + update_counters(ctx, event_sendmsg, get_barrier_interaction(instr)); + } + default: + break; + } +} + +void emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm imm) +{ + if (imm.vs != wait_imm::unset_counter) { + assert(ctx.chip_class >= GFX10); + SOPK_instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1); + waitcnt_vs->definitions[0] = Definition(sgpr_null, s1); + waitcnt_vs->imm = imm.vs; + instructions.emplace_back(waitcnt_vs); + imm.vs = wait_imm::unset_counter; + } + if (!imm.empty()) { + SOPP_instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); + waitcnt->imm = imm.pack(ctx.chip_class); + waitcnt->block = -1; + instructions.emplace_back(waitcnt); + } +} + +void handle_block(Program *program, Block& block, wait_ctx& ctx) +{ + std::vector> new_instructions; + + wait_imm queued_imm; + for (aco_ptr& instr : block.instructions) { + bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); + + queued_imm.combine(kill(instr.get(), ctx)); + + gen(instr.get(), ctx); + + if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { + if (!queued_imm.empty()) { + emit_waitcnt(ctx, new_instructions, queued_imm); + queued_imm = wait_imm(); + } + new_instructions.emplace_back(std::move(instr)); + } + } + + if (!queued_imm.empty()) + emit_waitcnt(ctx, new_instructions, queued_imm); + + block.instructions.swap(new_instructions); +} + +} /* end namespace */ + +void insert_wait_states(Program* program) +{ + /* per BB ctx */ + std::vector done(program->blocks.size()); + wait_ctx in_ctx[program->blocks.size()]; + wait_ctx out_ctx[program->blocks.size()]; + for (unsigned i = 0; i < program->blocks.size(); i++) + in_ctx[i] = wait_ctx(program); + std::stack loop_header_indices; + unsigned loop_progress = 0; + + for (unsigned i = 0; i < program->blocks.size();) { + Block& current = program->blocks[i++]; + wait_ctx ctx = in_ctx[current.index]; + + if (current.kind & block_kind_loop_header) { + loop_header_indices.push(current.index); + } else if (current.kind & block_kind_loop_exit) { + bool repeat = false; + if (loop_progress == loop_header_indices.size()) { + i = loop_header_indices.top(); + repeat = true; + } + loop_header_indices.pop(); + loop_progress = std::min(loop_progress, loop_header_indices.size()); + if (repeat) + continue; + } + + bool changed = false; + for (unsigned b : current.linear_preds) + changed |= ctx.join(&out_ctx[b], false); + for (unsigned b : current.logical_preds) + changed |= ctx.join(&out_ctx[b], true); + + in_ctx[current.index] = ctx; + + if (done[current.index] && !changed) + continue; + + if (current.instructions.empty()) { + out_ctx[current.index] = ctx; + continue; + } + + loop_progress = std::max(loop_progress, current.loop_nest_depth); + done[current.index] = true; + + handle_block(program, current, ctx); + + out_ctx[current.index] = ctx; + } +} + +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_instruction_selection.cpp mesa-20.0.8/src/amd/compiler/aco_instruction_selection.cpp --- mesa-19.2.8/src/amd/compiler/aco_instruction_selection.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_instruction_selection.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,9171 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include "ac_shader_util.h" +#include "aco_ir.h" +#include "aco_builder.h" +#include "aco_interface.h" +#include "aco_instruction_selection_setup.cpp" +#include "util/fast_idiv_by_const.h" + +namespace aco { +namespace { + +class loop_info_RAII { + isel_context* ctx; + unsigned header_idx_old; + Block* exit_old; + bool divergent_cont_old; + bool divergent_branch_old; + bool divergent_if_old; + +public: + loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit) + : ctx(ctx), + header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit), + divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue), + divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch), + divergent_if_old(ctx->cf_info.parent_if.is_divergent) + { + ctx->cf_info.parent_loop.header_idx = loop_header_idx; + ctx->cf_info.parent_loop.exit = loop_exit; + ctx->cf_info.parent_loop.has_divergent_continue = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + ctx->cf_info.parent_if.is_divergent = false; + ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1; + } + + ~loop_info_RAII() + { + ctx->cf_info.parent_loop.header_idx = header_idx_old; + ctx->cf_info.parent_loop.exit = exit_old; + ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old; + ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old; + ctx->cf_info.parent_if.is_divergent = divergent_if_old; + ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1; + if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = false; + } +}; + +struct if_context { + Temp cond; + + bool divergent_old; + bool exec_potentially_empty_discard_old; + bool exec_potentially_empty_break_old; + uint16_t exec_potentially_empty_break_depth_old; + + unsigned BB_if_idx; + unsigned invert_idx; + bool then_branch_divergent; + Block BB_invert; + Block BB_endif; +}; + +static bool visit_cf_list(struct isel_context *ctx, + struct exec_list *list); + +static void add_logical_edge(unsigned pred_idx, Block *succ) +{ + succ->logical_preds.emplace_back(pred_idx); +} + + +static void add_linear_edge(unsigned pred_idx, Block *succ) +{ + succ->linear_preds.emplace_back(pred_idx); +} + +static void add_edge(unsigned pred_idx, Block *succ) +{ + add_logical_edge(pred_idx, succ); + add_linear_edge(pred_idx, succ); +} + +static void append_logical_start(Block *b) +{ + Builder(NULL, b).pseudo(aco_opcode::p_logical_start); +} + +static void append_logical_end(Block *b) +{ + Builder(NULL, b).pseudo(aco_opcode::p_logical_end); +} + +Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def) +{ + assert(ctx->allocated[def->index].id()); + return ctx->allocated[def->index]; +} + +Temp emit_mbcnt(isel_context *ctx, Definition dst, + Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1)) +{ + Builder bld(ctx->program, ctx->block); + Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1); + Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u)); + + if (ctx->program->wave_size == 32) { + return thread_id_lo; + } else { + Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo); + return thread_id_hi; + } +} + +Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false) +{ + Builder bld(ctx->program, ctx->block); + + if (!dst.id()) + dst = bld.tmp(src.regClass()); + + assert(src.size() == dst.size()); + + if (ctx->stage != fragment_fs) { + if (!dst.id()) + return src; + + bld.copy(Definition(dst), src); + return dst; + } + + bld.pseudo(aco_opcode::p_wqm, Definition(dst), src); + ctx->program->needs_wqm |= program_needs_wqm; + return dst; +} + +static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data) +{ + if (index.regClass() == s1) + return bld.readlane(bld.def(s1), data, index); + + Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); + + /* Currently not implemented on GFX6-7 */ + assert(ctx->options->chip_class >= GFX8); + + if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) { + return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); + } + + /* GFX10, wave64 mode: + * The bpermute instruction is limited to half-wave operation, which means that it can't + * properly support subgroup shuffle like older generations (or wave32 mode), so we + * emulate it here. + */ + if (!ctx->has_gfx10_wave64_bpermute) { + ctx->has_gfx10_wave64_bpermute = true; + ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ + ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ + } + + Temp lane_id = emit_mbcnt(ctx, bld.def(v1)); + Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id); + Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index); + Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi); + + return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), + bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute); +} + +Temp as_vgpr(isel_context *ctx, Temp val) +{ + if (val.type() == RegType::sgpr) { + Builder bld(ctx->program, ctx->block); + return bld.copy(bld.def(RegType::vgpr, val.size()), val); + } + assert(val.type() == RegType::vgpr); + return val; +} + +//assumes a != 0xffffffff +void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) +{ + assert(b != 0); + Builder bld(ctx->program, ctx->block); + + if (util_is_power_of_two_or_zero(b)) { + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a); + return; + } + + util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32); + + assert(info.multiplier <= 0xffffffff); + + bool pre_shift = info.pre_shift != 0; + bool increment = info.increment != 0; + bool multiply = true; + bool post_shift = info.post_shift != 0; + + if (!pre_shift && !increment && !multiply && !post_shift) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a); + return; + } + + Temp pre_shift_dst = a; + if (pre_shift) { + pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst; + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a); + } + + Temp increment_dst = pre_shift_dst; + if (increment) { + increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; + bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst); + } + + Temp multiply_dst = increment_dst; + if (multiply) { + multiply_dst = post_shift ? bld.tmp(v1) : dst; + bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst, + bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier))); + } + + if (post_shift) { + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst); + } +} + +void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx)); +} + + +Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) +{ + /* no need to extract the whole vector */ + if (src.regClass() == dst_rc) { + assert(idx == 0); + return src; + } + assert(src.size() > idx); + Builder bld(ctx->program, ctx->block); + auto it = ctx->allocated_vec.find(src.id()); + /* the size check needs to be early because elements other than 0 may be garbage */ + if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) { + if (it->second[idx].regClass() == dst_rc) { + return it->second[idx]; + } else { + assert(dst_rc.size() == it->second[idx].regClass().size()); + assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr); + return bld.copy(bld.def(dst_rc), it->second[idx]); + } + } + + if (src.size() == dst_rc.size()) { + assert(idx == 0); + return bld.copy(bld.def(dst_rc), src); + } else { + Temp dst = bld.tmp(dst_rc); + emit_extract_vector(ctx, src, idx, dst); + return dst; + } +} + +void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) +{ + if (num_components == 1) + return; + if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end()) + return; + aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; + split->operands[0] = Operand(vec_src); + std::array elems; + for (unsigned i = 0; i < num_components; i++) { + elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)}; + split->definitions[i] = Definition(elems[i]); + } + ctx->block->instructions.emplace_back(std::move(split)); + ctx->allocated_vec.emplace(vec_src.id(), elems); +} + +/* This vector expansion uses a mask to determine which elements in the new vector + * come from the original vector. The other elements are undefined. */ +void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +{ + emit_split_vector(ctx, vec_src, util_bitcount(mask)); + + if (vec_src == dst) + return; + + Builder bld(ctx->program, ctx->block); + if (num_components == 1) { + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + else + bld.copy(Definition(dst), vec_src); + return; + } + + unsigned component_size = dst.size() / num_components; + std::array elems; + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + vec->definitions[0] = Definition(dst); + unsigned k = 0; + for (unsigned i = 0; i < num_components; i++) { + if (mask & (1 << i)) { + Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); + if (dst.type() == RegType::sgpr) + src = bld.as_uniform(src); + vec->operands[i] = Operand(src); + } else { + vec->operands[i] = Operand(0u); + } + elems[i] = vec->operands[i].getTemp(); + } + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); +} + +Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2)) +{ + Builder bld(ctx->program, ctx->block); + if (!dst.id()) + dst = bld.tmp(bld.lm); + + assert(val.regClass() == s1); + assert(dst.regClass() == bld.lm); + + return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); +} + +Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) +{ + Builder bld(ctx->program, ctx->block); + if (!dst.id()) + dst = bld.tmp(s1); + + assert(val.regClass() == bld.lm); + assert(dst.regClass() == s1); + + /* if we're currently in WQM mode, ensure that the source is also computed in WQM */ + Temp tmp = bld.tmp(s1); + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm)); + return emit_wqm(ctx, tmp, dst); +} + +Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) +{ + if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1) + return get_ssa_temp(ctx, src.src.ssa); + + if (src.src.ssa->num_components == size) { + bool identity_swizzle = true; + for (unsigned i = 0; identity_swizzle && i < size; i++) { + if (src.swizzle[i] != i) + identity_swizzle = false; + } + if (identity_swizzle) + return get_ssa_temp(ctx, src.src.ssa); + } + + Temp vec = get_ssa_temp(ctx, src.src.ssa); + unsigned elem_size = vec.size() / src.src.ssa->num_components; + assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */ + assert(vec.size() % elem_size == 0); + + RegClass elem_rc = RegClass(vec.type(), elem_size); + if (size == 1) { + return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); + } else { + assert(size <= 4); + std::array elems; + aco_ptr vec_instr{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + for (unsigned i = 0; i < size; ++i) { + elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); + vec_instr->operands[i] = Operand{elems[i]}; + } + Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)}; + vec_instr->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec_instr)); + ctx->allocated_vec.emplace(dst.id(), elems); + return dst; + } +} + +Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr) +{ + if (ptr.size() == 2) + return ptr; + Builder bld(ctx->program, ctx->block); + if (ptr.type() == RegType::vgpr) + ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), + ptr, Operand((unsigned)ctx->options->address32_hi)); +} + +void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc) +{ + aco_ptr sop2{create_instruction(op, Format::SOP2, 2, writes_scc ? 2 : 1)}; + sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); + sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); + sop2->definitions[0] = Definition(dst); + if (writes_scc) + sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); + ctx->block->instructions.emplace_back(std::move(sop2)); +} + +void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, + bool commutative, bool swap_srcs=false, bool flush_denorms = false) +{ + Builder bld(ctx->program, ctx->block); + Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); + Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); + if (src1.type() == RegType::sgpr) { + if (commutative && src0.type() == RegType::vgpr) { + Temp t = src0; + src0 = src1; + src1 = t; + } else if (src0.type() == RegType::vgpr && + op != aco_opcode::v_madmk_f32 && + op != aco_opcode::v_madak_f32 && + op != aco_opcode::v_madmk_f16 && + op != aco_opcode::v_madak_f16) { + /* If the instruction is not commutative, we emit a VOP3A instruction */ + bld.vop2_e64(op, Definition(dst), src0, src1); + return; + } else { + src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr + } + } + + if (flush_denorms && ctx->program->chip_class < GFX9) { + assert(dst.size() == 1); + Temp tmp = bld.vop2(op, bld.def(v1), src0, src1); + bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp); + } else { + bld.vop2(op, Definition(dst), src0, src1); + } +} + +void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, + bool flush_denorms = false) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + Temp src2 = get_alu_src(ctx, instr->src[2]); + + /* ensure that the instruction has at most 1 sgpr operand + * The optimizer will inline constants for us */ + if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) + src0 = as_vgpr(ctx, src0); + if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr) + src1 = as_vgpr(ctx, src1); + if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr) + src2 = as_vgpr(ctx, src2); + + Builder bld(ctx->program, ctx->block); + if (flush_denorms && ctx->program->chip_class < GFX9) { + assert(dst.size() == 1); + Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2); + bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp); + } else { + bld.vop3(op, Definition(dst), src0, src1, src2); + } +} + +void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); +} + +void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + assert(src0.size() == src1.size()); + + aco_ptr vopc; + if (src1.type() == RegType::sgpr) { + if (src0.type() == RegType::vgpr) { + /* to swap the operands, we might also have to change the opcode */ + switch (op) { + case aco_opcode::v_cmp_lt_f32: + op = aco_opcode::v_cmp_gt_f32; + break; + case aco_opcode::v_cmp_ge_f32: + op = aco_opcode::v_cmp_le_f32; + break; + case aco_opcode::v_cmp_lt_i32: + op = aco_opcode::v_cmp_gt_i32; + break; + case aco_opcode::v_cmp_ge_i32: + op = aco_opcode::v_cmp_le_i32; + break; + case aco_opcode::v_cmp_lt_u32: + op = aco_opcode::v_cmp_gt_u32; + break; + case aco_opcode::v_cmp_ge_u32: + op = aco_opcode::v_cmp_le_u32; + break; + case aco_opcode::v_cmp_lt_f64: + op = aco_opcode::v_cmp_gt_f64; + break; + case aco_opcode::v_cmp_ge_f64: + op = aco_opcode::v_cmp_le_f64; + break; + case aco_opcode::v_cmp_lt_i64: + op = aco_opcode::v_cmp_gt_i64; + break; + case aco_opcode::v_cmp_ge_i64: + op = aco_opcode::v_cmp_le_i64; + break; + case aco_opcode::v_cmp_lt_u64: + op = aco_opcode::v_cmp_gt_u64; + break; + case aco_opcode::v_cmp_ge_u64: + op = aco_opcode::v_cmp_le_u64; + break; + default: /* eq and ne are commutative */ + break; + } + Temp t = src0; + src0 = src1; + src1 = t; + } else { + src1 = as_vgpr(ctx, src1); + } + } + + Builder bld(ctx->program, ctx->block); + bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1); +} + +void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + Builder bld(ctx->program, ctx->block); + + assert(dst.regClass() == bld.lm); + assert(src0.type() == RegType::sgpr); + assert(src1.type() == RegType::sgpr); + assert(src0.regClass() == src1.regClass()); + + /* Emit the SALU comparison instruction */ + Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); + /* Turn the result into a per-lane bool */ + bool_to_vector_condition(ctx, cmp, dst); +} + +void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, + aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes) +{ + aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op; + aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op; + bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index]; + bool use_valu = s_op == aco_opcode::num_opcodes || + divergent_vals || + ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr || + ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr; + aco_opcode op = use_valu ? v_op : s_op; + assert(op != aco_opcode::num_opcodes); + assert(dst.regClass() == ctx->program->lane_mask); + + if (use_valu) + emit_vopc_instruction(ctx, instr, op, dst); + else + emit_sopc_instruction(ctx, instr, op, dst); +} + +void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + assert(dst.regClass() == bld.lm); + assert(src0.regClass() == bld.lm); + assert(src1.regClass() == bld.lm); + + bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); +} + +void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp cond = get_alu_src(ctx, instr->src[0]); + Temp then = get_alu_src(ctx, instr->src[1]); + Temp els = get_alu_src(ctx, instr->src[2]); + + assert(cond.regClass() == bld.lm); + + if (dst.type() == RegType::vgpr) { + aco_ptr bcsel; + if (dst.size() == 1) { + then = as_vgpr(ctx, then); + els = as_vgpr(ctx, els); + + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); + } else if (dst.size() == 2) { + Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); + Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els); + + Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond); + Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + return; + } + + if (instr->dest.dest.ssa.bit_size == 1) { + assert(dst.regClass() == bld.lm); + assert(then.regClass() == bld.lm); + assert(els.regClass() == bld.lm); + } + + if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */ + if (dst.regClass() == s1 || dst.regClass() == s2) { + assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + assert(dst.size() == then.size()); + aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; + bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); + } else { + fprintf(stderr, "Unimplemented uniform bcsel bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + return; + } + + /* divergent boolean bcsel + * this implements bcsel on bools: dst = s0 ? s1 : s2 + * are going to be: dst = (s0 & s1) | (~s0 & s2) */ + assert(instr->dest.dest.ssa.bit_size == 1); + + if (cond.id() != then.id()) + then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); + + if (cond.id() == els.id()) + bld.sop1(Builder::s_mov, Definition(dst), then); + else + bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); +} + +void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, + aco_opcode op, uint32_t undo) +{ + /* multiply by 16777216 to handle denormals */ + Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), + as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); + Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); + scaled = bld.vop1(op, bld.def(v1), scaled); + scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled); + + Temp not_scaled = bld.vop1(op, bld.def(v1), val); + + bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); +} + +void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_rcp_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u); +} + +void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_rsq_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); +} + +void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_sqrt_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); +} + +void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_log_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); +} + +Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->chip_class >= GFX7) + return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); + + /* GFX6 doesn't support V_TRUNC_F64, lower it. */ + /* TODO: create more efficient code! */ + if (val.type() == RegType::sgpr) + val = as_vgpr(ctx, val); + + /* Split the input value. */ + Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); + + /* Extract the exponent and compute the unbiased value. */ + Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u)); + exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); + + /* Extract the fractional part. */ + Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); + fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); + + Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask); + + Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); + Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); + fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); + tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); + fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); + + /* Get the sign bit. */ + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); + + /* Decide the operation to apply depending on the unbiased exponent. */ + Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); + Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0); + Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); + Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u)); + dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); + dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); + + return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); +} + +Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->chip_class >= GFX7) + return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); + + /* GFX6 doesn't support V_FLOOR_F64, lower it. */ + Temp src0 = as_vgpr(ctx, val); + + Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ + Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); + + Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); + Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); + Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); + + Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); + Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); + + Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); + Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); + + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v); + static_cast(add)->neg[1] = true; + + return add->definitions[0].getTemp(); +} + +void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) +{ + if (!instr->dest.dest.is_ssa) { + fprintf(stderr, "nir alu dst not in ssa: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); + switch(instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: { + std::array elems; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; + for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) { + elems[i] = get_alu_src(ctx, instr->src[i]); + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + break; + } + case nir_op_mov: { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_ptr mov; + if (dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + else if (src.regClass() == s1) + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + else if (src.regClass() == s2) + bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src); + else + unreachable("wrong src register class for nir_op_imov"); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); + } else if (dst.regClass() == v2) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + } else { + nir_print_instr(&instr->instr, stderr); + unreachable("Should have been lowered to scalar."); + } + break; + } + case nir_op_inot: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->dest.dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + /* Don't use s_andn2 here, this allows the optimizer to make a better decision */ + Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); + } else if (dst.type() == RegType::sgpr) { + aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64; + bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ineg: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v1) { + bld.vsub32(Definition(dst), Operand(0u), Operand(src)); + } else if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src); + } else if (dst.size() == 2) { + Temp src0 = bld.tmp(dst.type(), 1); + Temp src1 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src); + + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + Temp lower = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp(); + Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iabs: { + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v1) { + Temp src = get_alu_src(ctx, instr->src[0]); + bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_isign: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u)); + bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp); + } else if (dst.regClass() == s2) { + Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); + Temp neqz; + if (ctx->program->chip_class >= GFX8) + neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u)); + else + neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp(); + /* SCC gets zero-extended to 64 bit */ + bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); + } else if (dst.regClass() == v1) { + Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz); + } else if (dst.regClass() == v2) { + Temp upper = emit_extract_vector(ctx, src, 1, v1); + Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imax: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umax: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imin: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umin: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ior: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_or, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iand: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_and, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ixor: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ushr: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); + } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { + bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_lshr_b64, Definition(dst), + get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ishl: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true); + } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { + bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_lshl_b64, Definition(dst), + get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ishr: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); + } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { + bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_ashr_i64, Definition(dst), + get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_find_lsb: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); + } else if (src.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ufind_msb: + case nir_op_ifind_msb: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1 || src.regClass() == s2) { + aco_opcode op = src.regClass() == s2 ? + (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) : + (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32); + Temp msb_rev = bld.sop1(op, bld.def(s1), src); + + Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), + Operand(src.size() * 32u - 1u), msb_rev); + Temp msb = sub.def(0).getTemp(); + Temp carry = sub.def(1).getTemp(); + + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry)); + } else if (src.regClass() == v1) { + aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + Temp msb_rev = bld.tmp(v1); + emit_vop1_instruction(ctx, instr, op, msb_rev); + Temp msb = bld.tmp(v1); + Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_bitfield_reverse: { + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iadd: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); + break; + } + + assert(src0.size() == 2 && src1.size() == 2); + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp dst0 = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); + Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_uadd_sat: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), + src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry)); + } else if (dst.regClass() == v1) { + if (ctx->options->chip_class >= GFX9) { + aco_ptr add{create_instruction(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)}; + add->operands[0] = Operand(src0); + add->operands[1] = Operand(src1); + add->definitions[0] = Definition(dst); + add->clamp = 1; + ctx->block->instructions.emplace_back(std::move(add)); + } else { + if (src1.regClass() != v1) + std::swap(src0, src1); + assert(src1.regClass() == v1); + Temp tmp = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_uadd_carry: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } + if (dst.regClass() == v1) { + Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); + } else if (dst.regClass() == v2) { + Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); + carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); + carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_isub: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + bld.vsub32(Definition(dst), src0, src1); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp lower = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); + Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_usub_borrow: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } else if (dst.regClass() == v1) { + Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp borrow = bld.tmp(s1); + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); + } else if (dst.regClass() == v2) { + Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); + borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); + borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imul: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst), + get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umul_high: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { + bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imul_high: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { + bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmul: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fadd: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsub: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.size() == 1) { + if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); + } else if (dst.size() == 2) { + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + VOP3A_instruction* sub = static_cast(add); + sub->neg[1] = true; + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmax: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) { + Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp); + } else { + bld.vop3(aco_opcode::v_max_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmin: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) { + Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp); + } else { + bld.vop3(aco_opcode::v_min_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_cube_face_coord: { + Temp in = get_alu_src(ctx, instr->src[0], 3); + Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), + emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1) }; + Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); + ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); + Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); + Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); + sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/)); + tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); + break; + } + case nir_op_cube_face_index: { + Temp in = get_alu_src(ctx, instr->src[0], 3); + Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), + emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1) }; + bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); + break; + } + case nir_op_bcsel: { + emit_bcsel(ctx, instr, dst); + break; + } + case nir_op_frsq: { + if (dst.size() == 1) { + emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fneg: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + if (ctx->block->fp_mode.must_flush_denorms32) + src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src)); + } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fabs: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + if (ctx->block->fp_mode.must_flush_denorms32) + src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src)); + } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsat: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ + // TODO: confirm that this holds under any circumstances + } else if (dst.size() == 2) { + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u)); + VOP3A_instruction* vop3 = static_cast(add); + vop3->clamp = true; + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_flog2: { + if (dst.size() == 1) { + emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frcp: { + if (dst.size() == 1) { + emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fexp2: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsqrt: { + if (dst.size() == 1) { + emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ffract: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ffloor: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); + } else if (dst.size() == 2) { + emit_floor_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fceil: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); + } else if (dst.size() == 2) { + if (ctx->options->chip_class >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); + } else { + /* GFX6 doesn't support V_CEIL_F64, lower it. */ + Temp src0 = get_alu_src(ctx, instr->src[0]); + + /* trunc = trunc(src0) + * if (src0 > 0.0 && src0 != trunc) + * trunc += 1.0 + */ + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); + Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u)); + Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1); + Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); + add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ftrunc: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); + } else if (dst.size() == 2) { + emit_trunc_f64(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fround_even: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); + } else if (dst.size() == 2) { + if (ctx->options->chip_class >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); + } else { + /* GFX6 doesn't support V_RNDNE_F64, lower it. */ + Temp src0 = get_alu_src(ctx, instr->src[0]); + + Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); + + Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); + Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); + Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + static_cast(sub)->neg[1] = true; + tmp = sub->definitions[0].getTemp(); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu)); + Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); + static_cast(vop3)->abs[0] = true; + Temp cond = vop3->definitions[0].getTemp(); + + Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); + Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond); + Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsin: + case nir_op_fcos: { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_ptr norm; + if (dst.size() == 1) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); + Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src)); + + /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ + if (ctx->options->chip_class < GFX9) + tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp); + + aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; + bld.vop1(opcode, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ldexp: { + if (dst.size() == 1) { + bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), + as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), + get_alu_src(ctx, instr->src[1])); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), + as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), + get_alu_src(ctx, instr->src[1])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frexp_sig: { + if (dst.size() == 1) { + bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else if (dst.size() == 2) { + bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frexp_exp: { + if (instr->src[0].src.ssa->bit_size == 32) { + bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else if (instr->src[0].src.ssa->bit_size == 64) { + bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsign: { + Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); + if (dst.size() == 1) { + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); + cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond); + } else if (dst.size() == 2) { + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u)); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); + + cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u)); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2f32: { + if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_i2f32: { + assert(dst.size() == 1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst); + break; + } + case nir_op_i2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp src = get_alu_src(ctx, instr->src[0]); + RegClass rc = RegClass(src.type(), 1); + Temp lower = bld.tmp(rc), upper = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); + upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); + upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u)); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2f32: { + assert(dst.size() == 1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst); + break; + } + case nir_op_u2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp src = get_alu_src(ctx, instr->src[0]); + RegClass rc = RegClass(src.type(), 1); + Temp lower = bld.tmp(rc), upper = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); + upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); + upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u)); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src)); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src)); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2u32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src)); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src)); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2i64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); + exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); + Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); + Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); + mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); + mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); + Temp new_exponent = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); + if (ctx->program->chip_class >= GFX8) + mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); + else + mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent); + Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu)); + Temp lower = bld.tmp(v1), upper = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); + lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); + upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); + Temp new_lower = bld.tmp(v1); + borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp(); + Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper); + + } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + src = bld.as_uniform(src); + Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); + Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); + mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); + exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); + mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); + Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64 + Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu)); + mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); + Temp lower = bld.tmp(s1), upper = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower); + upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); + Temp borrow = bld.tmp(s1); + lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); + upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); + Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); + Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); + Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); + Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor); + if (dst.type() == RegType::sgpr) { + lower = bld.as_uniform(lower); + upper = bld.as_uniform(upper); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2u64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); + exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); + Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); + mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); + Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent); + Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); + Temp new_exponent = bld.tmp(v1); + Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); + if (ctx->program->chip_class >= GFX8) + mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); + else + mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent); + Temp lower = bld.tmp(v1), upper = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); + upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + src = bld.as_uniform(src); + Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); + Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); + Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); + Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); + mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large); + Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent); + mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); + Temp lower = bld.tmp(s1), upper = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); + lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); + upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); + Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); + Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); + Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); + Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor); + if (dst.type() == RegType::sgpr) { + lower = bld.as_uniform(lower); + upper = bld.as_uniform(upper); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_b2f32: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + src = bool_to_scalar_condition(ctx, src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); + } else if (dst.regClass() == v1) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + } else { + unreachable("Wrong destination register class for nir_op_b2f32."); + } + break; + } + case nir_op_b2f64: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s2) { + src = bool_to_scalar_condition(ctx, src); + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src)); + } else if (dst.regClass() == v2) { + Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u)); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper); + } else { + unreachable("Wrong destination register class for nir_op_b2f64."); + } + break; + } + case nir_op_i2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 64) { + /* we can actually just say dst = src, as it would map the lower register */ + emit_extract_vector(ctx, src, 0, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2u32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) { + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src); + } else { + // TODO: do better with SDWA + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src); + } + } else if (instr->src[0].src.ssa->bit_size == 64) { + /* we can actually just say dst = src, as it would map the lower register */ + emit_extract_vector(ctx, src, 0, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_i2i64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high); + } else if (src.regClass() == v1) { + Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2u64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_b2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ + bool_to_scalar_condition(ctx, src, dst); + } else if (dst.regClass() == v1) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src); + } else { + unreachable("Invalid register class for b2i32"); + } + break; + } + case nir_op_i2b1: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(dst.regClass() == bld.lm); + + if (src.type() == RegType::vgpr) { + assert(src.regClass() == v1 || src.regClass() == v2); + assert(dst.regClass() == bld.lm); + bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, + Definition(dst), Operand(0u), src).def(0).setHint(vcc); + } else { + assert(src.regClass() == s1 || src.regClass() == s2); + Temp tmp; + if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) { + tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp(); + } else { + tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, + bld.scc(bld.def(s1)), Operand(0u), src); + } + bool_to_vector_condition(ctx, tmp, dst); + } + break; + } + case nir_op_pack_64_2x32_split: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + break; + } + case nir_op_unpack_64_2x32_split_x: + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + break; + case nir_op_unpack_64_2x32_split_y: + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + break; + case nir_op_pack_half_2x16: { + Temp src = get_alu_src(ctx, instr->src[0], 2); + + if (dst.regClass() == v1) { + Temp src0 = bld.tmp(v1); + Temp src1 = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src); + if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz) + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); + else + bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst), + bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0), + bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_unpack_half_2x16_split_x: { + if (dst.regClass() == v1) { + Builder bld(ctx->program, ctx->block); + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_unpack_half_2x16_split_y: { + if (dst.regClass() == v1) { + Builder bld(ctx->program, ctx->block); + /* TODO: use SDWA here */ + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), + bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fquantize2f16: { + Temp src = get_alu_src(ctx, instr->src[0]); + Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); + Temp f32, cmp_res; + + if (ctx->program->chip_class >= GFX8) { + Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ + cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); + f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); + } else { + /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, + * so compare the result and flush to 0 if it's smaller. + */ + f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); + Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u)); + Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest); + static_cast(vop3)->abs[0] = true; + cmp_res = vop3->definitions[0].getTemp(); + } + + if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) { + Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); + } else { + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); + } + break; + } + case nir_op_bfm: { + Temp bits = get_alu_src(ctx, instr->src[0]); + Temp offset = get_alu_src(ctx, instr->src[1]); + + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_bitfield_select: { + /* (mask & insert) | (~mask & base) */ + Temp bitmask = get_alu_src(ctx, instr->src[0]); + Temp insert = get_alu_src(ctx, instr->src[1]); + Temp base = get_alu_src(ctx, instr->src[2]); + + /* dst = (insert & bitmask) | (base & ~bitmask) */ + if (dst.regClass() == s1) { + aco_ptr sop2; + nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); + nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src); + Operand lhs; + if (const_insert && const_bitmask) { + lhs = Operand(const_insert->u32 & const_bitmask->u32); + } else { + insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); + lhs = Operand(insert); + } + + Operand rhs; + nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); + if (const_base && const_bitmask) { + rhs = Operand(const_base->u32 & ~const_bitmask->u32); + } else { + base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); + rhs = Operand(base); + } + + bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); + + } else if (dst.regClass() == v1) { + if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr))) + base = as_vgpr(ctx, base); + if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr) + insert = as_vgpr(ctx, insert); + + bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ubfe: + case nir_op_ibfe: { + Temp base = get_alu_src(ctx, instr->src[0]); + Temp offset = get_alu_src(ctx, instr->src[1]); + Temp bits = get_alu_src(ctx, instr->src[2]); + + if (dst.type() == RegType::sgpr) { + Operand extract; + nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); + nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); + if (const_offset && const_bits) { + uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32; + extract = Operand(const_extract); + } else { + Operand width; + if (const_bits) { + width = Operand(const_bits->u32 << 16); + } else { + width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u)); + } + extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width); + } + + aco_opcode opcode; + if (dst.regClass() == s1) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::s_bfe_u32; + else + opcode = aco_opcode::s_bfe_i32; + } else if (dst.regClass() == s2) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::s_bfe_u64; + else + opcode = aco_opcode::s_bfe_i64; + } else { + unreachable("Unsupported BFE bit size"); + } + + bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract); + + } else { + aco_opcode opcode; + if (dst.regClass() == v1) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::v_bfe_u32; + else + opcode = aco_opcode::v_bfe_i32; + } else { + unreachable("Unsupported BFE bit size"); + } + + emit_vop3a_instruction(ctx, instr, opcode, dst); + } + break; + } + case nir_op_bit_count: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); + } else if (src.regClass() == v1) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u)); + } else if (src.regClass() == v2) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), + emit_extract_vector(ctx, src, 1, v1), + bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), + emit_extract_vector(ctx, src, 0, v1), Operand(0u))); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_flt: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64); + break; + } + case nir_op_fge: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64); + break; + } + case nir_op_feq: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64); + break; + } + case nir_op_fne: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64); + break; + } + case nir_op_ilt: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); + break; + } + case nir_op_ige: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); + break; + } + case nir_op_ieq: { + if (instr->src[0].src.ssa->bit_size == 1) + emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); + else + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); + break; + } + case nir_op_ine: { + if (instr->src[0].src.ssa->bit_size == 1) + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); + else + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); + break; + } + case nir_op_ult: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); + break; + } + case nir_op_uge: { + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); + break; + } + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: { + Temp src = get_alu_src(ctx, instr->src[0]); + uint16_t dpp_ctrl1, dpp_ctrl2; + if (instr->op == nir_op_fddx_fine) { + dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2); + dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3); + } else if (instr->op == nir_op_fddy_fine) { + dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1); + dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3); + } else { + dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0); + if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse) + dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1); + else + dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); + } + + Temp tmp; + if (ctx->program->chip_class >= GFX8) { + Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1); + tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2); + } else { + Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1); + Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2); + tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl); + } + emit_wqm(ctx, tmp, dst, true); + break; + } + default: + fprintf(stderr, "Unknown NIR ALU instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } +} + +void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->def); + + // TODO: we really want to have the resulting type as this would allow for 64bit literals + // which get truncated the lsb if double and msb if int + // for now, we only use s_mov_b64 with 64bit inline constants + assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar."); + assert(dst.type() == RegType::sgpr); + + Builder bld(ctx->program, ctx->block); + + if (instr->def.bit_size == 1) { + assert(dst.regClass() == bld.lm); + int val = instr->value[0].b ? -1 : 0; + Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); + bld.sop1(Builder::s_mov, Definition(dst), op); + } else if (dst.size() == 1) { + bld.copy(Definition(dst), Operand(instr->value[0].u32)); + } else { + assert(dst.size() != 1); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + if (instr->def.bit_size == 64) + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)}; + else { + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand{instr->value[i].u32}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +uint32_t widen_mask(uint32_t mask, unsigned multiplier) +{ + uint32_t new_mask = 0; + for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; +} + +Operand load_lds_size_m0(isel_context *ctx) +{ + /* TODO: m0 does not need to be initialized on GFX9+ */ + Builder bld(ctx->program, ctx->block); + return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff)); +} + +void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, + Temp address, unsigned base_offset, unsigned align) +{ + assert(util_is_power_of_two_nonzero(align) && align >= 4); + + Builder bld(ctx->program, ctx->block); + + Operand m = load_lds_size_m0(ctx); + + unsigned num_components = dst.size() * 4u / elem_size_bytes; + unsigned bytes_read = 0; + unsigned result_size = 0; + unsigned total_bytes = num_components * elem_size_bytes; + std::array result; + bool large_ds_read = ctx->options->chip_class >= GFX7; + bool usable_read2 = ctx->options->chip_class >= GFX7; + + while (bytes_read < total_bytes) { + unsigned todo = total_bytes - bytes_read; + bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0; + bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0; + + aco_opcode op = aco_opcode::last_opcode; + bool read2 = false; + if (todo >= 16 && aligned16 && large_ds_read) { + op = aco_opcode::ds_read_b128; + todo = 16; + } else if (todo >= 16 && aligned8 && usable_read2) { + op = aco_opcode::ds_read2_b64; + read2 = true; + todo = 16; + } else if (todo >= 12 && aligned16 && large_ds_read) { + op = aco_opcode::ds_read_b96; + todo = 12; + } else if (todo >= 8 && aligned8) { + op = aco_opcode::ds_read_b64; + todo = 8; + } else if (todo >= 8 && usable_read2) { + op = aco_opcode::ds_read2_b32; + read2 = true; + todo = 8; + } else if (todo >= 4) { + op = aco_opcode::ds_read_b32; + todo = 4; + } else { + assert(false); + } + assert(todo % elem_size_bytes == 0); + unsigned num_elements = todo / elem_size_bytes; + unsigned offset = base_offset + bytes_read; + unsigned max_offset = read2 ? 1019 : 65535; + + Temp address_offset = address; + if (offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); + offset = bytes_read; + } + assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */ + + Temp res; + if (num_components == 1 && dst.type() == RegType::vgpr) + res = dst; + else + res = bld.tmp(RegClass(RegType::vgpr, todo / 4)); + + if (read2) + res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1); + else + res = bld.ds(op, Definition(res), address_offset, m, offset); + + if (num_components == 1) { + assert(todo == total_bytes); + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res); + return; + } + + if (dst.type() == RegType::sgpr) { + Temp new_res = bld.tmp(RegType::sgpr, res.size()); + expand_vector(ctx, res, new_res, res.size(), (1 << res.size()) - 1); + res = new_res; + } + + if (num_elements == 1) { + result[result_size++] = res; + } else { + assert(res != dst && res.size() % num_elements == 0); + aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)}; + split->operands[0] = Operand(res); + for (unsigned i = 0; i < num_elements; i++) + split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4)); + ctx->block->instructions.emplace_back(std::move(split)); + } + + bytes_read += todo; + } + + assert(result_size == num_components && result_size > 1); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)}; + for (unsigned i = 0; i < result_size; i++) + vec->operands[i] = Operand(result[i]); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), result); +} + +Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type) +{ + if (start == 0 && size == data.size()) + return type == RegType::vgpr ? as_vgpr(ctx, data) : data; + + unsigned size_hint = 1; + auto it = ctx->allocated_vec.find(data.id()); + if (it != ctx->allocated_vec.end()) + size_hint = it->second[0].size(); + if (size % size_hint || start % size_hint) + size_hint = 1; + + start /= size_hint; + size /= size_hint; + + Temp elems[size]; + for (unsigned i = 0; i < size; i++) + elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint)); + + if (size == 1) + return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0]; + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + for (unsigned i = 0; i < size; i++) + vec->operands[i] = Operand(elems[i]); + Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)}; + vec->definitions[0] = Definition(res); + ctx->block->instructions.emplace_back(std::move(vec)); + return res; +} + +void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align) +{ + Builder bld(ctx->program, ctx->block); + unsigned bytes_written = 0; + bool large_ds_write = ctx->options->chip_class >= GFX7; + bool usable_write2 = ctx->options->chip_class >= GFX7; + + while (bytes_written < total_size * 4) { + unsigned todo = total_size * 4 - bytes_written; + bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0; + bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0; + + aco_opcode op = aco_opcode::last_opcode; + bool write2 = false; + unsigned size = 0; + if (todo >= 16 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b128; + size = 4; + } else if (todo >= 16 && aligned8 && usable_write2) { + op = aco_opcode::ds_write2_b64; + write2 = true; + size = 4; + } else if (todo >= 12 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b96; + size = 3; + } else if (todo >= 8 && aligned8) { + op = aco_opcode::ds_write_b64; + size = 2; + } else if (todo >= 8 && usable_write2) { + op = aco_opcode::ds_write2_b32; + write2 = true; + size = 2; + } else if (todo >= 4) { + op = aco_opcode::ds_write_b32; + size = 1; + } else { + assert(false); + } + + unsigned offset = offset0 + offset1 + bytes_written; + unsigned max_offset = write2 ? 1020 : 65535; + Temp address_offset = address; + if (offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset); + offset = offset1 + bytes_written; + } + assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */ + + if (write2) { + Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr); + Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr); + bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1); + } else { + Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr); + bld.ds(op, address_offset, val, m, offset); + } + + bytes_written += size * 4; + } +} + +void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, + Temp address, unsigned base_offset, unsigned align) +{ + assert(util_is_power_of_two_nonzero(align) && align >= 4); + + Operand m = load_lds_size_m0(ctx); + + /* we need at most two stores for 32bit variables */ + int start[2], count[2]; + u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]); + u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]); + assert(wrmask == 0); + + /* one combined store is sufficient */ + if (count[0] == count[1]) { + Builder bld(ctx->program, ctx->block); + + Temp address_offset = address; + if ((base_offset >> 2) + start[1] > 255) { + address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); + base_offset = 0; + } + + assert(count[0] == 1); + Temp val0 = emit_extract_vector(ctx, data, start[0], v1); + Temp val1 = emit_extract_vector(ctx, data, start[1], v1); + aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; + base_offset = base_offset / elem_size_bytes; + bld.ds(op, address_offset, val0, val1, m, + base_offset + start[0], base_offset + start[1]); + return; + } + + for (unsigned i = 0; i < 2; i++) { + if (count[i] == 0) + continue; + + unsigned elem_size_words = elem_size_bytes / 4; + ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words, + base_offset, start[i] * elem_size_bytes, align); + } + return; +} + +void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned idx = (nir_intrinsic_base(instr) + component) * 4u; + Operand offset(s1); + Builder bld(ctx->program, ctx->block); + + nir_instr *off_instr = instr->src[1].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) + offset = bld.v_mul24_imm(bld.def(v1), get_ssa_temp(ctx, instr->src[1].ssa), 16u); + else + idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 16u; + + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u; + if (ctx->stage == vertex_es) { + Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u)); + + Temp elems[NIR_MAX_VEC_COMPONENTS * 2]; + if (elem_size_bytes == 8) { + for (unsigned i = 0; i < src.size() / 2; i++) { + Temp elem = emit_extract_vector(ctx, src, i, v2); + elems[i*2] = bld.tmp(v1); + elems[i*2+1] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(elems[i*2]), Definition(elems[i*2+1]), elem); + } + write_mask = widen_mask(write_mask, 2); + elem_size_bytes /= 2u; + } else { + for (unsigned i = 0; i < src.size(); i++) + elems[i] = emit_extract_vector(ctx, src, i, v1); + } + + while (write_mask) { + unsigned index = u_bit_scan(&write_mask); + unsigned offset = index * elem_size_bytes; + Temp elem = emit_extract_vector(ctx, src, index, RegClass(RegType::vgpr, elem_size_bytes / 4)); + + Operand vaddr_offset(v1); + unsigned const_offset = idx + offset; + if (const_offset >= 4096u) { + vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u)); + const_offset %= 4096u; + } + + aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; + mtbuf->operands[0] = Operand(esgs_ring); + mtbuf->operands[1] = vaddr_offset; + mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->es2gs_offset)); + mtbuf->operands[3] = Operand(elem); + mtbuf->offen = !vaddr_offset.isUndefined(); + mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32; + mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + mtbuf->offset = const_offset; + mtbuf->glc = true; + mtbuf->slc = true; + mtbuf->barrier = barrier_none; + mtbuf->can_reorder = true; + bld.insert(std::move(mtbuf)); + } + } else { + unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize; + + Temp vertex_idx = emit_mbcnt(ctx, bld.def(v1)); + Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24)); + vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), vertex_idx, + bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size)); + + Temp lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize); + if (!offset.isUndefined()) + lds_base = bld.vadd32(bld.def(v1), offset, lds_base); + + unsigned align = 1 << (ffs(itemsize) - 1); + if (idx) + align = std::min(align, 1u << (ffs(idx) - 1)); + + store_lds(ctx, elem_size_bytes, src, write_mask, lds_base, idx, align); + } +} + +void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->stage == vertex_vs || + ctx->stage == fragment_fs || + ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { + unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned idx = nir_intrinsic_base(instr) + component; + + nir_instr *off_instr = instr->src[1].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u; + + if (instr->src[0].ssa->bit_size == 64) + write_mask = widen_mask(write_mask, 2); + + for (unsigned i = 0; i < 8; ++i) { + if (write_mask & (1 << i)) { + ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); + ctx->outputs.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1); + } + idx++; + } + } else if (ctx->stage == vertex_es || + (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX)) { + visit_store_vsgs_output(ctx, instr); + } else { + unreachable("Shader stage not implemented"); + } +} + +void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask) +{ + Temp coord1 = emit_extract_vector(ctx, src, 0, v1); + Temp coord2 = emit_extract_vector(ctx, src, 1, v1); + + Builder bld(ctx->program, ctx->block); + Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component); +} + +void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) +{ + aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); + for (unsigned i = 0; i < num_components; i++) + vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); + if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { + assert(num_components == 4); + Builder bld(ctx->program, ctx->block); + vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); + } + + for (Operand& op : vec->operands) + op = op.isUndefined() ? Operand(0u) : op; + + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + emit_split_vector(ctx, dst, num_components); + return; +} + +void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned idx = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); + + nir_const_value* offset = nir_src_as_const_value(instr->src[1]); + if (offset) { + assert(offset->u32 == 0); + } else { + /* the lower 15bit of the prim_mask contain the offset into LDS + * while the upper bits contain the number of prims */ + Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa); + assert(offset_src.regClass() == s1 && "TODO: divergent offsets..."); + Builder bld(ctx->program, ctx->block); + Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u)); + stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride); + stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u)); + offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src); + prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask); + } + + if (instr->dest.ssa.num_components == 1) { + emit_interp_instr(ctx, idx, component, coords, dst, prim_mask); + } else { + aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); + for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) + { + Temp tmp = {ctx->program->allocateId(), v1}; + emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask); + vec->operands[i] = Operand(tmp); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info, + unsigned offset, unsigned stride, unsigned channels) +{ + unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; + if (vtx_info->chan_byte_size != 4 && channels == 3) + return false; + return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) || + (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0); +} + +uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info, + unsigned offset, unsigned stride, unsigned *channels) +{ + if (!vtx_info->chan_byte_size) { + *channels = vtx_info->num_channels; + return vtx_info->chan_format; + } + + unsigned num_channels = *channels; + if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) { + unsigned new_channels = num_channels + 1; + /* first, assume more loads is worse and try using a larger data format */ + while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) { + new_channels++; + /* don't make the attribute potentially out-of-bounds */ + if (offset + new_channels * vtx_info->chan_byte_size > stride) + new_channels = 5; + } + + if (new_channels == 5) { + /* then try decreasing load size (at the cost of more loads) */ + new_channels = *channels; + while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) + new_channels--; + } + + if (new_channels < *channels) + *channels = new_channels; + num_channels = new_channels; + } + + switch (vtx_info->chan_format) { + case V_008F0C_BUF_DATA_FORMAT_8: + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, + V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; + case V_008F0C_BUF_DATA_FORMAT_16: + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, + V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; + case V_008F0C_BUF_DATA_FORMAT_32: + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; + } + unreachable("shouldn't reach here"); + return V_008F0C_BUF_DATA_FORMAT_INVALID; +} + +/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. + * so we may need to fix it up. */ +Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha) +{ + Builder bld(ctx->program, ctx->block); + + if (adjustment == RADV_ALPHA_ADJUST_SSCALED) + alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha); + alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha); + + /* Convert back to the right type. */ + if (adjustment == RADV_ALPHA_ADJUST_SNORM) { + alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); + Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha); + alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp); + } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) { + alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); + } + + return alpha; +} + +void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (ctx->stage & sw_vs) { + + nir_instr *off_instr = instr->src[0].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32; + + Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers)); + + unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; + unsigned component = nir_intrinsic_component(instr); + unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; + uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; + uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; + unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; + + unsigned dfmt = attrib_format & 0xf; + unsigned nfmt = (attrib_format >> 4) & 0x7; + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt); + + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; + unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); + unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3; + bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location); + if (post_shuffle) + num_channels = MAX2(num_channels, 3); + + Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u)); + Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off); + + Temp index; + if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { + uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; + Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance); + if (divisor) { + Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id); + if (divisor != 1) { + Temp divided = bld.tmp(v1); + emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor); + index = bld.vadd32(bld.def(v1), start_instance, divided); + } else { + index = bld.vadd32(bld.def(v1), start_instance, instance_id); + } + } else { + index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance); + } + } else { + index = bld.vadd32(bld.def(v1), + get_arg(ctx, ctx->args->ac.base_vertex), + get_arg(ctx, ctx->args->ac.vertex_id)); + } + + Temp channels[num_channels]; + unsigned channel_start = 0; + bool direct_fetch = false; + + /* skip unused channels at the start */ + if (vtx_info->chan_byte_size && !post_shuffle) { + channel_start = ffs(mask) - 1; + for (unsigned i = 0; i < channel_start; i++) + channels[i] = Temp(0, s1); + } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) { + num_channels = 3 - (ffs(mask) - 1); + } + + /* load channels */ + while (channel_start < num_channels) { + unsigned fetch_size = num_channels - channel_start; + unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; + bool expanded = false; + + /* use MUBUF when possible to avoid possible alignment issues */ + /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ + bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || + nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || + nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && + vtx_info->chan_byte_size == 4; + unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; + if (!use_mubuf) { + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + } else { + if (fetch_size == 3 && ctx->options->chip_class == GFX6) { + /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ + fetch_size = 4; + expanded = true; + } + } + + Temp fetch_index = index; + if (attrib_stride != 0 && fetch_offset > attrib_stride) { + fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); + fetch_offset = fetch_offset % attrib_stride; + } + + Operand soffset(0u); + if (fetch_offset >= 4096) { + soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096)); + fetch_offset %= 4096; + } + + aco_opcode opcode; + switch (fetch_size) { + case 1: + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + break; + case 2: + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + break; + case 3: + assert(ctx->options->chip_class >= GFX7 || + (!use_mubuf && ctx->options->chip_class == GFX6)); + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; + break; + case 4: + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; + break; + default: + unreachable("Unimplemented load_input vector size"); + } + + Temp fetch_dst; + if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle && + !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE || + num_channels <= 3)) { + direct_fetch = true; + fetch_dst = dst; + } else { + fetch_dst = bld.tmp(RegType::vgpr, fetch_size); + } + + if (use_mubuf) { + Instruction *mubuf = bld.mubuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_offset, false, true).instr; + static_cast(mubuf)->can_reorder = true; + } else { + Instruction *mtbuf = bld.mtbuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; + static_cast(mtbuf)->can_reorder = true; + } + + emit_split_vector(ctx, fetch_dst, fetch_dst.size()); + + if (fetch_size == 1) { + channels[channel_start] = fetch_dst; + } else { + for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++) + channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1); + } + + channel_start += fetch_size; + } + + if (!direct_fetch) { + bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && + nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; + + static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; + static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; + const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal; + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + std::array elems; + unsigned num_temp = 0; + for (unsigned i = 0; i < dst.size(); i++) { + unsigned idx = i + component; + if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) { + Temp channel = channels[swizzle[idx]]; + if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE) + channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel); + vec->operands[i] = Operand(channel); + + num_temp++; + elems[i] = channel; + } else if (is_float && idx == 3) { + vec->operands[i] = Operand(0x3f800000u); + } else if (!is_float && idx == 3) { + vec->operands[i] = Operand(1u); + } else { + vec->operands[i] = Operand(0u); + } + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + emit_split_vector(ctx, dst, dst.size()); + + if (num_temp == dst.size()) + ctx->allocated_vec.emplace(dst.id(), elems); + } + } else if (ctx->stage == fragment_fs) { + unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1; + nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const || + nir_instr_as_load_const(off_instr)->value[0].u32 != 0) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + + Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); + nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]); + if (offset) { + assert(offset->u32 == 0); + } else { + /* the lower 15bit of the prim_mask contain the offset into LDS + * while the upper bits contain the number of prims */ + Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa); + assert(offset_src.regClass() == s1 && "TODO: divergent offsets..."); + Builder bld(ctx->program, ctx->block); + Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u)); + stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride); + stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u)); + offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src); + prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask); + } + + unsigned idx = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned vertex_id = 2; /* P0 */ + + if (instr->intrinsic == nir_intrinsic_load_input_vertex) { + nir_const_value* src0 = nir_src_as_const_value(instr->src[0]); + switch (src0->u32) { + case 0: + vertex_id = 2; /* P0 */ + break; + case 1: + vertex_id = 0; /* P10 */ + break; + case 2: + vertex_id = 1; /* P20 */ + break; + default: + unreachable("invalid vertex index"); + } + } + + if (dst.size() == 1) { + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component); + } else { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i); + vec->definitions[0] = Definition(dst); + bld.insert(std::move(vec)); + } + + } else { + unreachable("Shader stage not implemented"); + } +} + +void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->stage == vertex_geometry_gs || ctx->stage == geometry_gs); + assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY); + + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + Temp offset = Temp(); + if (instr->src[0].ssa->parent_instr->type != nir_instr_type_load_const) { + /* better code could be created, but this case probably doesn't happen + * much in practice */ + Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) { + Temp elem; + if (ctx->stage == vertex_geometry_gs) { + elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]); + if (i % 2u) + elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem); + } else { + elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]); + } + if (offset.id()) { + Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(s2)), + Operand(i), indirect_vertex); + offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), offset, elem, cond); + } else { + offset = elem; + } + } + if (ctx->stage == vertex_geometry_gs) + offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), offset); + } else { + unsigned vertex = nir_src_as_uint(instr->src[0]); + if (ctx->stage == vertex_geometry_gs) + offset = bld.vop3( + aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]), + Operand((vertex % 2u) * 16u), Operand(16u)); + else + offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]); + } + + unsigned const_offset = nir_intrinsic_base(instr); + const_offset += nir_intrinsic_component(instr); + + nir_instr *off_instr = instr->src[1].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) { + Temp indirect_offset = get_ssa_temp(ctx, instr->src[1].ssa); + offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), + bld.vadd32(bld.def(v1), indirect_offset, offset)); + } else { + const_offset += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u; + } + const_offset *= 4u; + + offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), offset); + + unsigned itemsize = ctx->program->info->vs.es_info.esgs_itemsize; + + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + if (ctx->stage == geometry_gs) { + Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u)); + + const_offset *= ctx->program->wave_size; + + std::array elems; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)}; + for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) { + Temp subelems[2]; + for (unsigned j = 0; j < elem_size_bytes / 4; j++) { + Operand soffset(0u); + if (const_offset >= 4096u) + soffset = bld.copy(bld.def(s1), Operand(const_offset / 4096u * 4096u)); + + aco_ptr mubuf{create_instruction(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)}; + mubuf->definitions[0] = bld.def(v1); + subelems[j] = mubuf->definitions[0].getTemp(); + mubuf->operands[0] = Operand(esgs_ring); + mubuf->operands[1] = Operand(offset); + mubuf->operands[2] = Operand(soffset); + mubuf->offen = true; + mubuf->offset = const_offset % 4096u; + mubuf->glc = true; + mubuf->dlc = ctx->options->chip_class >= GFX10; + mubuf->barrier = barrier_none; + mubuf->can_reorder = true; + bld.insert(std::move(mubuf)); + + const_offset += ctx->program->wave_size * 4u; + } + + if (elem_size_bytes == 4) + elems[i] = subelems[0]; + else + elems[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), subelems[0], subelems[1]); + vec->operands[i] = Operand(elems[i]); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + } else { + unsigned align = 16; /* alignment of indirect offset */ + align = std::min(align, 1u << (ffs(itemsize) - 1)); + if (const_offset) + align = std::min(align, 1u << (ffs(const_offset) - 1)); + + load_lds(ctx, elem_size_bytes, dst, offset, const_offset, align); + } +} + +Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) +{ + if (ctx->program->info->need_indirect_descriptor_sets) { + Builder bld(ctx->program, ctx->block); + Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); + Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2)); + return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false); + } + + return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); +} + + +void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp index = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) + index = bld.as_uniform(index); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + + Temp desc_ptr; + radv_pipeline_layout *pipeline_layout = ctx->options->layout; + radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; + unsigned offset = layout->binding[binding].offset; + unsigned stride; + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { + unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; + desc_ptr = get_arg(ctx, ctx->args->ac.push_constants); + offset = pipeline_layout->push_constant_size + 16 * idx; + stride = 16; + } else { + desc_ptr = load_desc_ptr(ctx, desc_set); + stride = layout->binding[binding].size; + } + + nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]); + unsigned const_index = nir_const_index ? nir_const_index->u32 : 0; + if (stride != 1) { + if (nir_const_index) { + const_index = const_index * stride; + } else if (index.type() == RegType::vgpr) { + bool index24bit = layout->binding[binding].array_size <= 0x1000000; + index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit); + } else { + index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index)); + } + } + if (offset) { + if (nir_const_index) { + const_index = const_index + offset; + } else if (index.type() == RegType::vgpr) { + index = bld.vadd32(bld.def(v1), Operand(offset), index); + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index)); + } + } + + if (nir_const_index && const_index == 0) { + index = desc_ptr; + } else if (index.type() == RegType::vgpr) { + index = bld.vadd32(bld.def(v1), + nir_const_index ? Operand(const_index) : Operand(index), + Operand(desc_ptr)); + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + nir_const_index ? Operand(const_index) : Operand(index), + Operand(desc_ptr)); + } + + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index); +} + +void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, + Temp rsrc, Temp offset, bool glc=false, bool readonly=true) +{ + Builder bld(ctx->program, ctx->block); + + unsigned num_bytes = dst.size() * 4; + bool dlc = glc && ctx->options->chip_class >= GFX10; + + aco_opcode op; + if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) { + Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + unsigned const_offset = 0; + + Temp lower = Temp(); + if (num_bytes > 16) { + assert(num_components == 3 || num_components == 4); + op = aco_opcode::buffer_load_dwordx4; + lower = bld.tmp(v4); + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->definitions[0] = Definition(lower); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = vaddr; + mubuf->operands[2] = soffset; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = glc; + mubuf->dlc = dlc; + mubuf->barrier = readonly ? barrier_none : barrier_buffer; + mubuf->can_reorder = readonly; + bld.insert(std::move(mubuf)); + emit_split_vector(ctx, lower, 2); + num_bytes -= 16; + const_offset = 16; + } else if (num_bytes == 12 && ctx->options->chip_class == GFX6) { + /* GFX6 doesn't support loading vec3, expand to vec4. */ + num_bytes = 16; + } + + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_load_dword; + break; + case 8: + op = aco_opcode::buffer_load_dwordx2; + break; + case 12: + assert(ctx->options->chip_class > GFX6); + op = aco_opcode::buffer_load_dwordx3; + break; + case 16: + op = aco_opcode::buffer_load_dwordx4; + break; + default: + unreachable("Load SSBO not implemented for this size."); + } + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = vaddr; + mubuf->operands[2] = soffset; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = glc; + mubuf->dlc = dlc; + mubuf->barrier = readonly ? barrier_none : barrier_buffer; + mubuf->can_reorder = readonly; + mubuf->offset = const_offset; + aco_ptr instr = std::move(mubuf); + + if (dst.size() > 4) { + assert(lower != Temp()); + Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size()); + instr->definitions[0] = Definition(upper); + bld.insert(std::move(instr)); + if (dst.size() == 8) + emit_split_vector(ctx, upper, 2); + instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1)); + instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2)); + instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2)); + instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2)); + if (dst.size() == 8) + instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2)); + } else if (dst.size() == 3 && ctx->options->chip_class == GFX6) { + Temp vec = bld.tmp(v4); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + emit_split_vector(ctx, vec, 4); + + instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1)); + instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1)); + instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1)); + instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1)); + } + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1); + } else { + instr->definitions[0] = Definition(dst); + bld.insert(std::move(instr)); + emit_split_vector(ctx, dst, num_components); + } + } else { + switch (num_bytes) { + case 4: + op = aco_opcode::s_buffer_load_dword; + break; + case 8: + op = aco_opcode::s_buffer_load_dwordx2; + break; + case 12: + case 16: + op = aco_opcode::s_buffer_load_dwordx4; + break; + case 24: + case 32: + op = aco_opcode::s_buffer_load_dwordx8; + break; + default: + unreachable("Load SSBO not implemented for this size."); + } + aco_ptr load{create_instruction(op, Format::SMEM, 2, 1)}; + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(bld.as_uniform(offset)); + assert(load->operands[1].getTemp().type() == RegType::sgpr); + load->definitions[0] = Definition(dst); + load->glc = glc; + load->dlc = dlc; + load->barrier = readonly ? barrier_none : barrier_buffer; + load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works + assert(ctx->options->chip_class >= GFX8 || !glc); + + /* trim vector */ + if (dst.size() == 3) { + Temp vec = bld.tmp(s4); + load->definitions[0] = Definition(vec); + bld.insert(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s1), + emit_extract_vector(ctx, vec, 1, s1), + emit_extract_vector(ctx, vec, 2, s1)); + } else if (dst.size() == 6) { + Temp vec = bld.tmp(s8); + load->definitions[0] = Definition(vec); + bld.insert(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s2), + emit_extract_vector(ctx, vec, 1, s2), + emit_extract_vector(ctx, vec, 2, s2)); + } else { + bld.insert(std::move(load)); + } + emit_split_vector(ctx, dst, num_components); + } +} + +void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); + + Builder bld(ctx->program, ctx->block); + + nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + unsigned desc_set = nir_intrinsic_desc_set(idx_instr); + unsigned binding = nir_intrinsic_binding(idx_instr); + radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout; + + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + if (ctx->options->chip_class >= GFX10) { + desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3), + Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)), + Operand(0xFFFFFFFFu), + Operand(desc_type)); + rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + rsrc, upper_dwords); + } else { + rsrc = convert_pointer_to_64_bit(ctx, rsrc); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + } + + load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa)); +} + +void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + unsigned offset = nir_intrinsic_base(instr); + nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]); + if (index_cv && instr->dest.ssa.bit_size == 32) { + + unsigned count = instr->dest.ssa.num_components; + unsigned start = (offset + index_cv->u32) / 4u; + start -= ctx->args->ac.base_inline_push_consts; + if (start + count <= ctx->args->ac.num_inline_push_consts) { + std::array elems; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (unsigned i = 0; i < count; ++i) { + elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + return; + } + } + + Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); + if (offset != 0) // TODO check if index != 0 as well + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); + Temp vec = dst; + bool trim = false; + aco_opcode op; + + switch (dst.size()) { + case 1: + op = aco_opcode::s_load_dword; + break; + case 2: + op = aco_opcode::s_load_dwordx2; + break; + case 3: + vec = bld.tmp(s4); + trim = true; + case 4: + op = aco_opcode::s_load_dwordx4; + break; + case 6: + vec = bld.tmp(s8); + trim = true; + case 8: + op = aco_opcode::s_load_dwordx8; + break; + default: + unreachable("unimplemented or forbidden load_push_constant."); + } + + bld.smem(op, Definition(vec), ptr, index); + + if (trim) { + emit_split_vector(ctx, vec, 4); + RegClass rc = dst.size() == 3 ? s1 : s2; + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, rc), + emit_extract_vector(ctx, vec, 1, rc), + emit_extract_vector(ctx, vec, 2, rc)); + + } + emit_split_vector(ctx, dst, instr->dest.ssa.num_components); +} + +void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + Builder bld(ctx->program, ctx->block); + + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + if (ctx->options->chip_class >= GFX10) { + desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + unsigned base = nir_intrinsic_base(instr); + unsigned range = nir_intrinsic_range(instr); + + Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); + if (base && offset.type() == RegType::sgpr) + offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + else if (base && offset.type() == RegType::vgpr) + offset = bld.vadd32(bld.def(v1), Operand(base), offset); + + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), + Operand(MIN2(base + range, ctx->shader->constant_data_size)), + Operand(desc_type)); + + load_buffer(ctx, instr->num_components, dst, rsrc, offset); +} + +void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; + + ctx->program->needs_exact = true; + + // TODO: optimize uniform conditions + Builder bld(ctx->program, ctx->block); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + assert(src.regClass() == bld.lm); + src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + bld.pseudo(aco_opcode::p_discard_if, src); + ctx->block->kind |= block_kind_uses_discard_if; + return; +} + +void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; + + bool divergent = ctx->cf_info.parent_if.is_divergent || + ctx->cf_info.parent_loop.has_divergent_continue; + + if (ctx->block->loop_nest_depth && + ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) { + /* we handle discards the same way as jump instructions */ + append_logical_end(ctx->block); + + /* in loops, discard behaves like break */ + Block *linear_target = ctx->cf_info.parent_loop.exit; + ctx->block->kind |= block_kind_discard; + + if (!divergent) { + /* uniform discard - loop ends here */ + assert(nir_instr_is_last(&instr->instr)); + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(ctx->block->index, linear_target); + return; + } + + /* we add a break right behind the discard() instructions */ + ctx->block->kind |= block_kind_break; + unsigned idx = ctx->block->index; + + ctx->cf_info.parent_loop.has_divergent_branch = true; + ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx; + + /* remove critical edges from linear CFG */ + bld.branch(aco_opcode::p_branch); + Block* break_block = ctx->program->create_and_insert_block(); + break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + break_block->kind |= block_kind_uniform; + add_linear_edge(idx, break_block); + add_linear_edge(break_block->index, linear_target); + bld.reset(break_block); + bld.branch(aco_opcode::p_branch); + + Block* continue_block = ctx->program->create_and_insert_block(); + continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_linear_edge(idx, continue_block); + append_logical_start(continue_block); + ctx->block = continue_block; + + return; + } + + /* it can currently happen that NIR doesn't remove the unreachable code */ + if (!nir_instr_is_last(&instr->instr)) { + ctx->program->needs_exact = true; + /* save exec somewhere temporarily so that it doesn't get + * overwritten before the discard from outer exec masks */ + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm)); + bld.pseudo(aco_opcode::p_discard_if, cond); + ctx->block->kind |= block_kind_uses_discard_if; + return; + } + + /* This condition is incorrect for uniformly branched discards in a loop + * predicated by a divergent condition, but the above code catches that case + * and the discard would end up turning into a discard_if. + * For example: + * if (divergent) { + * while (...) { + * if (uniform) { + * discard; + * } + * } + * } + */ + if (!ctx->cf_info.parent_if.is_divergent) { + /* program just ends here */ + ctx->block->kind |= block_kind_uniform; + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + 0 /* enabled mask */, 9 /* dest */, + false /* compressed */, true/* done */, true /* valid mask */); + bld.sopp(aco_opcode::s_endpgm); + // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis + } else { + ctx->block->kind |= block_kind_discard; + /* branch and linear edge is added by visit_if() */ + } +} + +enum aco_descriptor_type { + ACO_DESC_IMAGE, + ACO_DESC_FMASK, + ACO_DESC_SAMPLER, + ACO_DESC_BUFFER, + ACO_DESC_PLANE_0, + ACO_DESC_PLANE_1, + ACO_DESC_PLANE_2, +}; + +static bool +should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) { + if (sampler_dim == GLSL_SAMPLER_DIM_BUF) + return false; + ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array); + return dim == ac_image_cube || + dim == ac_image_1darray || + dim == ac_image_2darray || + dim == ac_image_2darraymsaa; +} + +Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, + enum aco_descriptor_type desc_type, + const nir_tex_instr *tex_instr, bool image, bool write) +{ +/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc + std::unordered_map::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index); + if (it != ctx->tex_desc.end()) + return it->second; +*/ + Temp index = Temp(); + bool index_set = false; + unsigned constant_index = 0; + unsigned descriptor_set; + unsigned base_index; + Builder bld(ctx->program, ctx->block); + + if (!deref_instr) { + assert(tex_instr && !image); + descriptor_set = 0; + base_index = tex_instr->sampler_index; + } else { + while(deref_instr->deref_type != nir_deref_type_var) { + unsigned array_size = glsl_get_aoa_size(deref_instr->type); + if (!array_size) + array_size = 1; + + assert(deref_instr->deref_type == nir_deref_type_array); + nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index); + if (const_value) { + constant_index += array_size * const_value->u32; + } else { + Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa); + if (indirect.type() == RegType::vgpr) + indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect); + + if (array_size != 1) + indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); + + if (!index_set) { + index = indirect; + index_set = true; + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); + } + } + + deref_instr = nir_src_as_deref(deref_instr->parent); + } + descriptor_set = deref_instr->var->data.descriptor_set; + base_index = deref_instr->var->data.binding; + } + + Temp list = load_desc_ptr(ctx, descriptor_set); + list = convert_pointer_to_64_bit(ctx, list); + + struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; + unsigned offset = binding->offset; + unsigned stride = binding->size; + aco_opcode opcode; + RegClass type; + + assert(base_index < layout->binding_count); + + switch (desc_type) { + case ACO_DESC_IMAGE: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + break; + case ACO_DESC_FMASK: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + offset += 32; + break; + case ACO_DESC_SAMPLER: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + offset += radv_combined_image_descriptor_sampler_offset(binding); + break; + case ACO_DESC_BUFFER: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + break; + case ACO_DESC_PLANE_0: + case ACO_DESC_PLANE_1: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + offset += 32 * (desc_type - ACO_DESC_PLANE_0); + break; + case ACO_DESC_PLANE_2: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + offset += 64; + break; + default: + unreachable("invalid desc_type\n"); + } + + offset += constant_index * stride; + + if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset && + (!index_set || binding->immutable_samplers_equal)) { + if (binding->immutable_samplers_equal) + constant_index = 0; + + const uint32_t *samplers = radv_immutable_samplers(layout, binding); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + Operand(samplers[constant_index * 4 + 0]), + Operand(samplers[constant_index * 4 + 1]), + Operand(samplers[constant_index * 4 + 2]), + Operand(samplers[constant_index * 4 + 3])); + } + + Operand off; + if (!index_set) { + off = bld.copy(bld.def(s1), Operand(offset)); + } else { + off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); + } + + Temp res = bld.smem(opcode, bld.def(type), list, off); + + if (desc_type == ACO_DESC_PLANE_2) { + Temp components[8]; + for (unsigned i = 0; i < 8; i++) + components[i] = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, + Definition(components[0]), + Definition(components[1]), + Definition(components[2]), + Definition(components[3]), + res); + + Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write); + bld.pseudo(aco_opcode::p_split_vector, + bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), + Definition(components[4]), + Definition(components[5]), + Definition(components[6]), + Definition(components[7]), + desc2); + + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), + components[0], components[1], components[2], components[3], + components[4], components[5], components[6], components[7]); + } + + return res; +} + +static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 3 : 2; + case GLSL_SAMPLER_DIM_MS: + return array ? 4 : 3; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return 3; + default: + break; + } + return 0; +} + + +/* Adjust the sample index according to FMASK. + * + * For uncompressed MSAA surfaces, FMASK should return 0x76543210, + * which is the identity mapping. Each nibble says which physical sample + * should be fetched to get that sample. + * + * For example, 0x11111100 means there are only 2 samples stored and + * the second sample covers 3/4 of the pixel. When reading samples 0 + * and 1, return physical sample 0 (determined by the first two 0s + * in FMASK), otherwise return physical sample 1. + * + * The sample index should be adjusted as follows: + * sample_index = (fmask >> (sample_index * 4)) & 0xF; + */ +static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector& coords, Operand sample_index, Temp fmask_desc_ptr) +{ + Builder bld(ctx->program, ctx->block); + Temp fmask = bld.tmp(v1); + unsigned dim = ctx->options->chip_class >= GFX10 + ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) + : 0; + + Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) : + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]); + aco_ptr load{create_instruction(aco_opcode::image_load, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(fmask_desc_ptr); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coord); + load->definitions[0] = Definition(fmask); + load->glc = false; + load->dlc = false; + load->dmask = 0x1; + load->unrm = true; + load->da = da; + load->dim = dim; + load->can_reorder = true; /* fmask images shouldn't be modified */ + ctx->block->instructions.emplace_back(std::move(load)); + + Operand sample_index4; + if (sample_index.isConstant() && sample_index.constantValue() < 16) { + sample_index4 = Operand(sample_index.constantValue() << 2); + } else if (sample_index.regClass() == s1) { + sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); + } else { + assert(sample_index.regClass() == v1); + sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index); + } + + Temp final_sample; + if (sample_index4.isConstant() && sample_index4.constantValue() == 0) + final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask); + else if (sample_index4.isConstant() && sample_index4.constantValue() == 28) + final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask); + else + final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid), + */ + Temp compare = bld.tmp(bld.lm); + bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), + Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); + + Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index); + + /* Replace the MSAA sample index. */ + return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare); +} + +static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type) +{ + + Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); + enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + assert(!add_frag_pos && "Input attachments should be lowered."); + bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; + int count = image_type_to_components_count(dim, is_array); + std::vector coords(count); + Builder bld(ctx->program, ctx->block); + + if (is_ms) { + count--; + Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa); + /* get sample index */ + if (instr->intrinsic == nir_intrinsic_image_deref_load) { + nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); + Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); + std::vector fmask_load_address; + for (unsigned i = 0; i < (is_array ? 3 : 2); i++) + fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1)); + + Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false); + coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr); + } else { + coords[count] = emit_extract_vector(ctx, src2, 0, v1); + } + } + + if (gfx9_1d) { + coords[0] = emit_extract_vector(ctx, src0, 0, v1); + coords.resize(coords.size() + 1); + coords[1] = bld.copy(bld.def(v1), Operand(0u)); + if (is_array) + coords[2] = emit_extract_vector(ctx, src0, 1, v1); + } else { + for (int i = 0; i < count; i++) + coords[i] = emit_extract_vector(ctx, src0, i, v1); + } + + if (instr->intrinsic == nir_intrinsic_image_deref_load || + instr->intrinsic == nir_intrinsic_image_deref_store) { + int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4; + bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; + + if (!level_zero) + coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); + } + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + for (unsigned i = 0; i < coords.size(); i++) + vec->operands[i] = Operand(coords[i]); + Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())}; + vec->definitions[0] = Definition(res); + ctx->block->instructions.emplace_back(std::move(vec)); + return res; +} + + +void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + + aco_opcode opcode; + switch (num_channels) { + case 1: + opcode = aco_opcode::buffer_load_format_x; + break; + case 2: + opcode = aco_opcode::buffer_load_format_xy; + break; + case 3: + opcode = aco_opcode::buffer_load_format_xyz; + break; + case 4: + opcode = aco_opcode::buffer_load_format_xyzw; + break; + default: + unreachable(">4 channel buffer image load"); + } + aco_ptr load{create_instruction(opcode, Format::MUBUF, 3, 1)}; + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(vindex); + load->operands[2] = Operand((uint32_t) 0); + Temp tmp; + if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp = dst; + else + tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)}; + load->definitions[0] = Definition(tmp); + load->idxen = true; + load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT); + load->dlc = load->glc && ctx->options->chip_class >= GFX10; + load->barrier = barrier_image; + ctx->block->instructions.emplace_back(std::move(load)); + + expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1); + return; + } + + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + + unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_components = util_bitcount(dmask); + Temp tmp; + if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp = dst; + else + tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)}; + + bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; + aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; + + aco_ptr load{create_instruction(opcode, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(resource); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coords); + load->definitions[0] = Definition(tmp); + load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; + load->dlc = load->glc && ctx->options->chip_class >= GFX10; + load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); + load->dmask = dmask; + load->unrm = true; + load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + load->barrier = barrier_image; + ctx->block->instructions.emplace_back(std::move(load)); + + expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask); + return; +} + +void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) +{ + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + + bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0; + + if (dim == GLSL_SAMPLER_DIM_BUF) { + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + aco_opcode opcode; + switch (data.size()) { + case 1: + opcode = aco_opcode::buffer_store_format_x; + break; + case 2: + opcode = aco_opcode::buffer_store_format_xy; + break; + case 3: + opcode = aco_opcode::buffer_store_format_xyz; + break; + case 4: + opcode = aco_opcode::buffer_store_format_xyzw; + break; + default: + unreachable(">4 channel buffer image store"); + } + aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(rsrc); + store->operands[1] = Operand(vindex); + store->operands[2] = Operand((uint32_t) 0); + store->operands[3] = Operand(data); + store->idxen = true; + store->glc = glc; + store->dlc = false; + store->disable_wqm = true; + store->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + return; + } + + assert(data.type() == RegType::vgpr); + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + + bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; + aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip; + + aco_ptr store{create_instruction(opcode, Format::MIMG, 3, 0)}; + store->operands[0] = Operand(resource); + store->operands[1] = Operand(data); + store->operands[2] = Operand(coords); + store->glc = glc; + store->dlc = false; + store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); + store->dmask = (1 << data.size()) - 1; + store->unrm = true; + store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + store->disable_wqm = true; + store->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + return; +} + +void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + Builder bld(ctx->program, ctx->block); + + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + assert(data.size() == 1 && "64bit ssbo atomics not yet implemented."); + + if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data); + + aco_opcode buf_op, image_op; + switch (instr->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + buf_op = aco_opcode::buffer_atomic_add; + image_op = aco_opcode::image_atomic_add; + break; + case nir_intrinsic_image_deref_atomic_umin: + buf_op = aco_opcode::buffer_atomic_umin; + image_op = aco_opcode::image_atomic_umin; + break; + case nir_intrinsic_image_deref_atomic_imin: + buf_op = aco_opcode::buffer_atomic_smin; + image_op = aco_opcode::image_atomic_smin; + break; + case nir_intrinsic_image_deref_atomic_umax: + buf_op = aco_opcode::buffer_atomic_umax; + image_op = aco_opcode::image_atomic_umax; + break; + case nir_intrinsic_image_deref_atomic_imax: + buf_op = aco_opcode::buffer_atomic_smax; + image_op = aco_opcode::image_atomic_smax; + break; + case nir_intrinsic_image_deref_atomic_and: + buf_op = aco_opcode::buffer_atomic_and; + image_op = aco_opcode::image_atomic_and; + break; + case nir_intrinsic_image_deref_atomic_or: + buf_op = aco_opcode::buffer_atomic_or; + image_op = aco_opcode::image_atomic_or; + break; + case nir_intrinsic_image_deref_atomic_xor: + buf_op = aco_opcode::buffer_atomic_xor; + image_op = aco_opcode::image_atomic_xor; + break; + case nir_intrinsic_image_deref_atomic_exchange: + buf_op = aco_opcode::buffer_atomic_swap; + image_op = aco_opcode::image_atomic_swap; + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + buf_op = aco_opcode::buffer_atomic_cmpswap; + image_op = aco_opcode::image_atomic_cmpswap; + break; + default: + unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions."); + } + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); + aco_ptr mubuf{create_instruction(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(vindex); + mubuf->operands[2] = Operand((uint32_t)0); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->offset = 0; + mubuf->idxen = true; + mubuf->glc = return_previous; + mubuf->dlc = false; /* Not needed for atomics */ + mubuf->disable_wqm = true; + mubuf->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + return; + } + + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + aco_ptr mimg{create_instruction(image_op, Format::MIMG, 3, return_previous ? 1 : 0)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(data); + mimg->operands[2] = Operand(coords); + if (return_previous) + mimg->definitions[0] = Definition(dst); + mimg->glc = return_previous; + mimg->dlc = false; /* Not needed for atomics */ + mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); + mimg->dmask = (1 << data.size()) - 1; + mimg->unrm = true; + mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + mimg->disable_wqm = true; + mimg->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mimg)); + return; +} + +void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements) +{ + if (in_elements && ctx->options->chip_class == GFX8) { + /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ + Builder bld(ctx->program, ctx->block); + + Temp size = emit_extract_vector(ctx, desc, 2, s1); + + Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); + size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u)); + + Temp stride = emit_extract_vector(ctx, desc, 1, s1); + stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u)); + + Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u)); + size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); + + Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst; + bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), + size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); + if (dst.type() == RegType::vgpr) + bld.copy(Definition(dst), shr_dst); + + /* TODO: we can probably calculate this faster with v_skip when stride != 12 */ + } else { + emit_extract_vector(ctx, desc, 2, dst); + } +} + +void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) +{ + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + Builder bld(ctx->program, ctx->block); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false); + return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true); + } + + /* LOD */ + Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + + /* Resource */ + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_ptr mimg{create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(s4); /* no sampler */ + mimg->operands[2] = Operand(lod); + uint8_t& dmask = mimg->dmask; + mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); + mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; + mimg->da = glsl_sampler_type_is_array(type); + mimg->can_reorder = true; + Definition& def = mimg->definitions[0]; + ctx->block->instructions.emplace_back(std::move(mimg)); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && + glsl_sampler_type_is_array(type)) { + + assert(instr->dest.ssa.num_components == 3); + Temp tmp = {ctx->program->allocateId(), v3}; + def = Definition(tmp); + emit_split_vector(ctx, tmp, 3); + + /* divide 3rd value by 6 by multiplying with magic number */ + Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, tmp, 0, v1), + emit_extract_vector(ctx, tmp, 1, v1), + by_6); + + } else if (ctx->options->chip_class == GFX9 && + glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D && + glsl_sampler_type_is_array(type)) { + assert(instr->dest.ssa.num_components == 2); + def = Definition(dst); + dmask = 0x5; + } else { + def = Definition(dst); + } + + emit_split_vector(ctx, dst, instr->dest.ssa.num_components); +} + +void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned num_components = instr->num_components; + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false); +} + +void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp data = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + Temp offset = get_ssa_temp(ctx, instr->src[2].ssa); + + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && + ctx->options->chip_class >= GFX8; + if (smem) + offset = bld.as_uniform(offset); + bool smem_nonfs = smem && ctx->stage != fragment_fs; + + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + if (count == 3 && (smem || ctx->options->chip_class == GFX6)) { + /* GFX6 doesn't support storing vec3, split it. */ + writemask |= 1u << (start + 2); + count = 2; + } + int num_bytes = count * elem_size_bytes; + + if (num_bytes > 16) { + assert(elem_size_bytes == 8); + writemask |= (((count - 2) << 1) - 1) << (start + 2); + count = 2; + num_bytes = 16; + } + + // TODO: check alignment of sub-dword stores + // TODO: split 3 bytes. there is no store instruction for that + + Temp write_data; + if (count != instr->num_components) { + emit_split_vector(ctx, data, instr->num_components); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) { + Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4)); + vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem); + } + write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } else if (!smem && data.type() != RegType::vgpr) { + assert(num_bytes % 4 == 0); + write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data); + } else if (smem_nonfs && data.type() == RegType::vgpr) { + assert(num_bytes % 4 == 0); + write_data = bld.as_uniform(data); + } else { + write_data = data; + } + + aco_opcode vmem_op, smem_op; + switch (num_bytes) { + case 4: + vmem_op = aco_opcode::buffer_store_dword; + smem_op = aco_opcode::s_buffer_store_dword; + break; + case 8: + vmem_op = aco_opcode::buffer_store_dwordx2; + smem_op = aco_opcode::s_buffer_store_dwordx2; + break; + case 12: + vmem_op = aco_opcode::buffer_store_dwordx3; + smem_op = aco_opcode::last_opcode; + assert(!smem && ctx->options->chip_class > GFX6); + break; + case 16: + vmem_op = aco_opcode::buffer_store_dwordx4; + smem_op = aco_opcode::s_buffer_store_dwordx4; + break; + default: + unreachable("Store SSBO not implemented for this size."); + } + if (ctx->stage == fragment_fs) + smem_op = aco_opcode::p_fs_buffer_store_smem; + + if (smem) { + aco_ptr store{create_instruction(smem_op, Format::SMEM, 3, 0)}; + store->operands[0] = Operand(rsrc); + if (start) { + Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset, Operand(start * elem_size_bytes)); + store->operands[1] = Operand(off); + } else { + store->operands[1] = Operand(offset); + } + if (smem_op != aco_opcode::p_fs_buffer_store_smem) + store->operands[1].setFixed(m0); + store->operands[2] = Operand(write_data); + store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->dlc = false; + store->disable_wqm = true; + store->barrier = barrier_buffer; + ctx->block->instructions.emplace_back(std::move(store)); + ctx->program->wb_smem_l1_on_end = true; + if (smem_op == aco_opcode::p_fs_buffer_store_smem) { + ctx->block->kind |= block_kind_needs_lowering; + ctx->program->needs_exact = true; + } + } else { + aco_ptr store{create_instruction(vmem_op, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(rsrc); + store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + store->operands[3] = Operand(write_data); + store->offset = start * elem_size_bytes; + store->offen = (offset.type() == RegType::vgpr); + store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->dlc = false; + store->disable_wqm = true; + store->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + } + } +} + +void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + Builder bld(ctx->program, ctx->block); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), + get_ssa_temp(ctx, instr->src[3].ssa), data); + + Temp offset = get_ssa_temp(ctx, instr->src[1].ssa); + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode op32, op64; + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_ssbo_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_ssbo_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_ssbo_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_ssbo_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_ssbo_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_ssbo_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_ssbo_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); + } + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->offset = 0; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = return_previous; + mubuf->dlc = false; /* Not needed for atomics */ + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); +} + +void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) { + + Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Builder bld(ctx->program, ctx->block); + Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u)); + get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false); +} + +Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) +{ + uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + if (addr.type() == RegType::vgpr) + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf)); +} + +void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned num_components = instr->num_components; + unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8; + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + bool dlc = glc && ctx->options->chip_class >= GFX10; + aco_opcode op; + if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { + bool global = ctx->options->chip_class >= GFX9; + + if (ctx->options->chip_class >= GFX7) { + aco_opcode op; + switch (num_bytes) { + case 4: + op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + break; + case 8: + op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + break; + case 12: + op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; + break; + case 16: + op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->glc = glc; + flat->dlc = dlc; + flat->barrier = barrier_buffer; + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + flat->definitions[0] = Definition(vec); + ctx->block->instructions.emplace_back(std::move(flat)); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + flat->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(flat)); + } + emit_split_vector(ctx, dst, num_components); + } else { + assert(ctx->options->chip_class == GFX6); + + /* GFX6 doesn't support loading vec3, expand to vec4. */ + num_bytes = num_bytes == 12 ? 16 : num_bytes; + + aco_opcode op; + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_load_dword; + break; + case 8: + op = aco_opcode::buffer_load_dwordx2; + break; + case 16: + op = aco_opcode::buffer_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->glc = glc; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = false; + mubuf->barrier = barrier_buffer; + aco_ptr instr = std::move(mubuf); + + /* expand vector */ + if (dst.size() == 3) { + Temp vec = bld.tmp(v4); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + emit_split_vector(ctx, vec, 4); + + instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1)); + instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1)); + instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1)); + instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1)); + } + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + instr->definitions[0] = Definition(dst); + bld.insert(std::move(instr)); + emit_split_vector(ctx, dst, num_components); + } + } + } else { + switch (num_bytes) { + case 4: + op = aco_opcode::s_load_dword; + break; + case 8: + op = aco_opcode::s_load_dwordx2; + break; + case 12: + case 16: + op = aco_opcode::s_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + aco_ptr load{create_instruction(op, Format::SMEM, 2, 1)}; + load->operands[0] = Operand(addr); + load->operands[1] = Operand(0u); + load->definitions[0] = Definition(dst); + load->glc = glc; + load->dlc = dlc; + load->barrier = barrier_buffer; + assert(ctx->options->chip_class >= GFX8 || !glc); + + if (dst.size() == 3) { + /* trim vector */ + Temp vec = bld.tmp(s4); + load->definitions[0] = Definition(vec); + ctx->block->instructions.emplace_back(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s1), + emit_extract_vector(ctx, vec, 1, s1), + emit_extract_vector(ctx, vec, 2, s1)); + } else { + ctx->block->instructions.emplace_back(std::move(load)); + } + } +} + +void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); + + if (ctx->options->chip_class >= GFX7) + addr = as_vgpr(ctx, addr); + + unsigned writemask = nir_intrinsic_write_mask(instr); + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + if (count == 3 && ctx->options->chip_class == GFX6) { + /* GFX6 doesn't support storing vec3, split it. */ + writemask |= 1u << (start + 2); + count = 2; + } + unsigned num_bytes = count * elem_size_bytes; + + Temp write_data = data; + if (count != instr->num_components) { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) + vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1)); + write_data = bld.tmp(RegType::vgpr, count); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + unsigned offset = start * elem_size_bytes; + + if (ctx->options->chip_class >= GFX7) { + if (offset > 0 && ctx->options->chip_class < GFX9) { + Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); + Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); + Temp carry = bld.tmp(bld.lm); + bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); + + bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), + Operand(offset), addr0); + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), + Operand(0u), addr1, + carry).def(1).setHint(vcc); + + addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); + + offset = 0; + } + + bool global = ctx->options->chip_class >= GFX9; + aco_opcode op; + switch (num_bytes) { + case 4: + op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; + break; + case 8: + op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; + break; + case 12: + op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; + break; + case 16: + op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; + break; + default: + unreachable("store_global not implemented for this size."); + } + + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(data); + flat->glc = glc; + flat->dlc = false; + flat->offset = offset; + flat->disable_wqm = true; + flat->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(flat)); + } else { + assert(ctx->options->chip_class == GFX6); + + aco_opcode op; + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_store_dword; + break; + case 8: + op = aco_opcode::buffer_store_dwordx2; + break; + case 16: + op = aco_opcode::buffer_store_dwordx4; + break; + default: + unreachable("store_global not implemented for this size."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->operands[3] = Operand(write_data); + mubuf->glc = glc; + mubuf->dlc = false; + mubuf->offset = offset; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + } + } +} + +void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + Builder bld(ctx->program, ctx->block); + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + + if (ctx->options->chip_class >= GFX7) + addr = as_vgpr(ctx, addr); + + if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), + get_ssa_temp(ctx, instr->src[2].ssa), data); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode op32, op64; + + if (ctx->options->chip_class >= GFX7) { + bool global = ctx->options->chip_class >= GFX9; + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; + op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; + op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; + op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; + op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; + op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; + op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; + op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; + op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; + op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; + op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(data); + if (return_previous) + flat->definitions[0] = Definition(dst); + flat->glc = return_previous; + flat->dlc = false; /* Not needed for atomics */ + flat->offset = 0; + flat->disable_wqm = true; + flat->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(flat)); + } else { + assert(ctx->options->chip_class == GFX6); + + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->glc = return_previous; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + } +} + +void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { + Builder bld(ctx->program, ctx->block); + switch(instr->intrinsic) { + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + bld.barrier(aco_opcode::p_memory_barrier_common); + break; + case nir_intrinsic_memory_barrier_buffer: + bld.barrier(aco_opcode::p_memory_barrier_buffer); + break; + case nir_intrinsic_memory_barrier_image: + bld.barrier(aco_opcode::p_memory_barrier_image); + break; + case nir_intrinsic_memory_barrier_shared: + bld.barrier(aco_opcode::p_memory_barrier_shared); + break; + default: + unreachable("Unimplemented memory barrier intrinsic"); + break; + } +} + +void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) +{ + // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared."); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Builder bld(ctx->program, ctx->block); + + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes; + load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align); +} + +void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned writemask = nir_intrinsic_write_mask(instr); + Temp data = get_ssa_temp(ctx, instr->src[0].ssa); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported."); + + unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes; + store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align); +} + +void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = nir_intrinsic_base(instr); + Operand m = load_lds_size_m0(ctx); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + + unsigned num_operands = 3; + aco_opcode op32, op64, op32_rtn, op64_rtn; + switch(instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + op32 = aco_opcode::ds_add_u32; + op64 = aco_opcode::ds_add_u64; + op32_rtn = aco_opcode::ds_add_rtn_u32; + op64_rtn = aco_opcode::ds_add_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imin: + op32 = aco_opcode::ds_min_i32; + op64 = aco_opcode::ds_min_i64; + op32_rtn = aco_opcode::ds_min_rtn_i32; + op64_rtn = aco_opcode::ds_min_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umin: + op32 = aco_opcode::ds_min_u32; + op64 = aco_opcode::ds_min_u64; + op32_rtn = aco_opcode::ds_min_rtn_u32; + op64_rtn = aco_opcode::ds_min_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imax: + op32 = aco_opcode::ds_max_i32; + op64 = aco_opcode::ds_max_i64; + op32_rtn = aco_opcode::ds_max_rtn_i32; + op64_rtn = aco_opcode::ds_max_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umax: + op32 = aco_opcode::ds_max_u32; + op64 = aco_opcode::ds_max_u64; + op32_rtn = aco_opcode::ds_max_rtn_u32; + op64_rtn = aco_opcode::ds_max_rtn_u64; + break; + case nir_intrinsic_shared_atomic_and: + op32 = aco_opcode::ds_and_b32; + op64 = aco_opcode::ds_and_b64; + op32_rtn = aco_opcode::ds_and_rtn_b32; + op64_rtn = aco_opcode::ds_and_rtn_b64; + break; + case nir_intrinsic_shared_atomic_or: + op32 = aco_opcode::ds_or_b32; + op64 = aco_opcode::ds_or_b64; + op32_rtn = aco_opcode::ds_or_rtn_b32; + op64_rtn = aco_opcode::ds_or_rtn_b64; + break; + case nir_intrinsic_shared_atomic_xor: + op32 = aco_opcode::ds_xor_b32; + op64 = aco_opcode::ds_xor_b64; + op32_rtn = aco_opcode::ds_xor_rtn_b32; + op64_rtn = aco_opcode::ds_xor_rtn_b64; + break; + case nir_intrinsic_shared_atomic_exchange: + op32 = aco_opcode::ds_write_b32; + op64 = aco_opcode::ds_write_b64; + op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; + op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64; + break; + case nir_intrinsic_shared_atomic_comp_swap: + op32 = aco_opcode::ds_cmpst_b32; + op64 = aco_opcode::ds_cmpst_b64; + op32_rtn = aco_opcode::ds_cmpst_rtn_b32; + op64_rtn = aco_opcode::ds_cmpst_rtn_b64; + num_operands = 4; + break; + default: + unreachable("Unhandled shared atomic intrinsic"); + } + + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + aco_opcode op; + if (data.size() == 1) { + assert(instr->dest.ssa.bit_size == 32); + op = return_previous ? op32_rtn : op32; + } else { + assert(instr->dest.ssa.bit_size == 64); + op = return_previous ? op64_rtn : op64; + } + + if (offset > 65535) { + Builder bld(ctx->program, ctx->block); + address = bld.vadd32(bld.def(v1), Operand(offset), address); + offset = 0; + } + + aco_ptr ds; + ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0)); + ds->operands[0] = Operand(address); + ds->operands[1] = Operand(data); + if (num_operands == 4) + ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa)); + ds->operands[num_operands - 1] = m; + ds->offset0 = offset; + if (return_previous) + ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); + ctx->block->instructions.emplace_back(std::move(ds)); +} + +Temp get_scratch_resource(isel_context *ctx) +{ + Builder bld(ctx->program, ctx->block); + Temp scratch_addr = ctx->program->private_segment_buffer; + if (ctx->stage != compute_cs) + scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u)); + + uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | + S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);; + + if (ctx->program->chip_class >= GFX10) { + rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + /* older generations need element size = 16 bytes. element size removed in GFX9 */ + if (ctx->program->chip_class <= GFX8) + rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); + + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); +} + +void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { + assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64); + Builder bld(ctx->program, ctx->block); + Temp rsrc = get_scratch_resource(ctx); + Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode op; + switch (dst.size()) { + case 1: + op = aco_opcode::buffer_load_dword; + break; + case 2: + op = aco_opcode::buffer_load_dwordx2; + break; + case 3: + op = aco_opcode::buffer_load_dwordx3; + break; + case 4: + op = aco_opcode::buffer_load_dwordx4; + break; + case 6: + case 8: { + std::array elems; + Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4, + bld.def(v4), rsrc, offset, + ctx->program->scratch_offset, 0, true); + Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 : + aco_opcode::buffer_load_dwordx4, + dst.size() == 6 ? bld.def(v2) : bld.def(v4), + rsrc, offset, ctx->program->scratch_offset, 16, true); + emit_split_vector(ctx, lower, 2); + elems[0] = emit_extract_vector(ctx, lower, 0, v2); + elems[1] = emit_extract_vector(ctx, lower, 1, v2); + if (dst.size() == 8) { + emit_split_vector(ctx, upper, 2); + elems[2] = emit_extract_vector(ctx, upper, 0, v2); + elems[3] = emit_extract_vector(ctx, upper, 1, v2); + } else { + elems[2] = upper; + } + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, dst.size() / 2, 1)}; + for (unsigned i = 0; i < dst.size() / 2; i++) + vec->operands[i] = Operand(elems[i]); + vec->definitions[0] = Definition(dst); + bld.insert(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + return; + } + default: + unreachable("Wrong dst size for nir_intrinsic_load_scratch"); + } + + bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true); + emit_split_vector(ctx, dst, instr->num_components); +} + +void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { + assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64); + Builder bld(ctx->program, ctx->block); + Temp rsrc = get_scratch_resource(ctx); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + int num_bytes = count * elem_size_bytes; + + if (num_bytes > 16) { + assert(elem_size_bytes == 8); + writemask |= (((count - 2) << 1) - 1) << (start + 2); + count = 2; + num_bytes = 16; + } + + // TODO: check alignment of sub-dword stores + // TODO: split 3 bytes. there is no store instruction for that + + Temp write_data; + if (count != instr->num_components) { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) { + Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4)); + vec->operands[i] = Operand(elem); + } + write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4)); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } else { + write_data = data; + } + + aco_opcode op; + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_store_dword; + break; + case 8: + op = aco_opcode::buffer_store_dwordx2; + break; + case 12: + op = aco_opcode::buffer_store_dwordx3; + break; + case 16: + op = aco_opcode::buffer_store_dwordx4; + break; + default: + unreachable("Invalid data size for nir_intrinsic_store_scratch."); + } + + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true); + } +} + +void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { + uint8_t log2_ps_iter_samples; + if (ctx->program->info->ps.force_persample) { + log2_ps_iter_samples = + util_logbase2(ctx->options->key.fs.num_samples); + } else { + log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + } + + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const unsigned ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, + 0x1111, + 0x0101, + 0x0001, + }; + assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks)); + + Builder bld(ctx->program, ctx->block); + + Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), + get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); + Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples])); + Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage)); +} + +void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) { + Builder bld(ctx->program, ctx->block); + + unsigned stream = nir_intrinsic_stream_id(instr); + Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u); + nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]); + + /* get GSVS ring */ + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); + + unsigned num_components = + ctx->program->info->gs.num_stream_output_components[stream]; + assert(num_components); + + unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out; + unsigned stream_offset = 0; + for (unsigned i = 0; i < stream; i++) { + unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out; + stream_offset += prev_stride * ctx->program->wave_size; + } + + /* Limit on the stride field for <= GFX7. */ + assert(stride < (1 << 14)); + + Temp gsvs_dwords[4]; + for (unsigned i = 0; i < 4; i++) + gsvs_dwords[i] = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, + Definition(gsvs_dwords[0]), + Definition(gsvs_dwords[1]), + Definition(gsvs_dwords[2]), + Definition(gsvs_dwords[3]), + gsvs_ring); + + if (stream_offset) { + Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset)); + + Temp carry = bld.tmp(s1); + gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry)); + } + + gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride))); + gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size)); + + gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]); + + unsigned offset = 0; + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { + if (ctx->program->info->gs.output_streams[i] != stream) + continue; + + for (unsigned j = 0; j < 4; j++) { + if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j))) + continue; + + if (ctx->outputs.mask[i] & (1 << j)) { + Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex); + unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u; + if (const_offset >= 4096u) { + if (vaddr_offset.isUndefined()) + vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u)); + else + vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); + const_offset %= 4096u; + } + + aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; + mtbuf->operands[0] = Operand(gsvs_ring); + mtbuf->operands[1] = vaddr_offset; + mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset)); + mtbuf->operands[3] = Operand(ctx->outputs.outputs[i][j]); + mtbuf->offen = !vaddr_offset.isUndefined(); + mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32; + mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + mtbuf->offset = const_offset; + mtbuf->glc = true; + mtbuf->slc = true; + mtbuf->barrier = barrier_gs_data; + mtbuf->can_reorder = true; + bld.insert(std::move(mtbuf)); + } + + offset += ctx->shader->info.gs.vertices_out; + } + + /* outputs for the next vertex are undefined and keeping them around can + * create invalid IR with control flow */ + ctx->outputs.mask[i] = 0; + } + + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); +} + +Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) +{ + Builder bld(ctx->program, ctx->block); + + if (cluster_size == 1) { + return src; + } if (op == nir_op_iand && cluster_size == 4) { + //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), + bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); + } else if (op == nir_op_ior && cluster_size == 4) { + //subgroupClusteredOr(val, 4) -> wqm(val & exec) + return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); + } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { + //subgroupAnd(val) -> (exec & ~val) == 0 + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp)); + return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); + } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { + //subgroupOr(val) -> (val & exec) != 0 + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); + return bool_to_vector_condition(ctx, tmp); + } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) { + //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); + tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp(); + return bool_to_vector_condition(ctx, tmp); + } else { + //subgroupClustered{And,Or,Xor}(val, n) -> + //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32 + //cluster_offset = ~(n - 1) & lane_id + //cluster_mask = ((1 << n) - 1) + //subgroupClusteredAnd(): + // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask + //subgroupClusteredOr(): + // return ((val & exec) >> cluster_offset) & cluster_mask != 0 + //subgroupClusteredXor(): + // return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 + Temp lane_id = emit_mbcnt(ctx, bld.def(v1)); + Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id); + + Temp tmp; + if (op == nir_op_iand) + tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + else + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + + uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u; + + if (ctx->program->chip_class <= GFX7) + tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset); + else if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp); + tmp = emit_extract_vector(ctx, tmp, 0, v1); + if (cluster_mask != 0xffffffff) + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp); + + Definition cmp_def = Definition(); + if (op == nir_op_iand) { + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0); + } else if (op == nir_op_ior) { + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); + } else if (op == nir_op_ixor) { + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), + bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u))); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); + } + cmp_def.setHint(vcc); + return cmp_def.getTemp(); + } +} + +Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) +{ + Builder bld(ctx->program, ctx->block); + + //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 + //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 + //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 + Temp tmp; + if (op == nir_op_iand) + tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + else + tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm)); + + Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp); + Temp lo = lohi.def(0).getTemp(); + Temp hi = lohi.def(1).getTemp(); + Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi)); + + Definition cmp_def = Definition(); + if (op == nir_op_iand) + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); + else if (op == nir_op_ior) + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); + else if (op == nir_op_ixor) + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0); + cmp_def.setHint(vcc); + return cmp_def.getTemp(); +} + +Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) +{ + Builder bld(ctx->program, ctx->block); + + //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val + //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val + //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); + if (op == nir_op_iand) + return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); + else if (op == nir_op_ior) + return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src); + else if (op == nir_op_ixor) + return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src); + + assert(false); + return Temp(); +} + +void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src) +{ + Builder bld(ctx->program, ctx->block); + Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); + if (src.regClass().type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_as_uniform, dst, src); + } else if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, dst, src); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_mov_b64, dst, src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } +} + +void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) +{ + Builder bld(ctx->program, ctx->block); + Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center); + Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1); + Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1); + + Temp ddx_1, ddx_2, ddy_1, ddy_2; + uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0); + uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1); + uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); + + /* Build DD X/Y */ + if (ctx->program->chip_class >= GFX8) { + Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0); + ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1); + ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2); + Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0); + ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1); + ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2); + } else { + Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0); + ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1); + ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1); + ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2); + ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1); + Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0); + ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1); + ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2); + ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2); + ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2); + } + + /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ + Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1); + Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2); + tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1); + tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2); + Temp wqm1 = bld.tmp(v1); + emit_wqm(ctx, tmp1, wqm1, true); + Temp wqm2 = bld.tmp(v1); + emit_wqm(ctx, tmp2, wqm2, true); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2); + return; +} + +void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + switch(instr->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); + Temp bary = Temp(0, s2); + switch (mode) { + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) + bary = get_arg(ctx, ctx->args->ac.persp_center); + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + bary = ctx->persp_centroid; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + bary = get_arg(ctx, ctx->args->ac.persp_sample); + break; + case INTERP_MODE_NOPERSPECTIVE: + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) + bary = get_arg(ctx, ctx->args->ac.linear_center); + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + bary = ctx->linear_centroid; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + bary = get_arg(ctx, ctx->args->ac.linear_sample); + break; + default: + break; + } + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp p1 = emit_extract_vector(ctx, bary, 0, v1); + Temp p2 = emit_extract_vector(ctx, bary, 1, v1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + Operand(p1), Operand(p2)); + emit_split_vector(ctx, dst, 2); + break; + } + case nir_intrinsic_load_barycentric_model: { + Temp model = get_arg(ctx, ctx->args->ac.pull_model); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp p1 = emit_extract_vector(ctx, model, 0, v1); + Temp p2 = emit_extract_vector(ctx, model, 1, v1); + Temp p3 = emit_extract_vector(ctx, model, 2, v1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + Operand(p1), Operand(p2), Operand(p3)); + emit_split_vector(ctx, dst, 3); + break; + } + case nir_intrinsic_load_barycentric_at_sample: { + uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; + switch (ctx->options->key.fs.num_samples) { + case 2: sample_pos_offset += 1 << 3; break; + case 4: sample_pos_offset += 3 << 3; break; + case 8: sample_pos_offset += 7 << 3; break; + default: break; + } + Temp sample_pos; + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); + nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); + Temp private_segment_buffer = ctx->program->private_segment_buffer; + if (addr.type() == RegType::sgpr) { + Operand offset; + if (const_addr) { + sample_pos_offset += const_addr->u32 << 3; + offset = Operand(sample_pos_offset); + } else if (ctx->options->chip_class >= GFX9) { + offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); + } else { + offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); + offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); + } + + Operand off = bld.copy(bld.def(s1), Operand(offset)); + sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); + + } else if (ctx->options->chip_class >= GFX9) { + addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); + sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset); + } else if (ctx->options->chip_class >= GFX7) { + /* addr += private_segment_buffer + sample_pos_offset */ + Temp tmp0 = bld.tmp(s1); + Temp tmp1 = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer); + Definition scc_tmp = bld.def(s1, scc); + tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); + tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp())); + addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); + Temp pck0 = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); + tmp1 = as_vgpr(ctx, tmp1); + Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); + addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); + + /* sample_pos = flat_load_dwordx2 addr */ + sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1)); + } else { + assert(ctx->options->chip_class == GFX6); + + uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf)); + + addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); + addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u)); + + sample_pos = bld.tmp(v2); + + aco_ptr load{create_instruction(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; + load->definitions[0] = Definition(sample_pos); + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(addr); + load->operands[2] = Operand(0u); + load->offset = sample_pos_offset; + load->offen = 0; + load->addr64 = true; + load->glc = false; + load->dlc = false; + load->disable_wqm = false; + load->barrier = barrier_none; + load->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(load)); + } + + /* sample_pos -= 0.5 */ + Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1)); + Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1)); + bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos); + pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u)); + pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u)); + + emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2); + break; + } + case nir_intrinsic_load_barycentric_at_offset: { + Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); + RegClass rc = RegClass(offset.type(), 1); + Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset); + emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2); + break; + } + case nir_intrinsic_load_front_face: { + bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc); + break; + } + case nir_intrinsic_load_view_index: + case nir_intrinsic_load_layer_id: { + if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & (sw_vs | sw_gs))) { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); + break; + } + + unsigned idx = nir_intrinsic_base(instr); + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0); + break; + } + case nir_intrinsic_load_frag_coord: { + emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4); + break; + } + case nir_intrinsic_load_sample_pos: { + Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]); + Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]); + bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u), + posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); + break; + } + case nir_intrinsic_load_interpolated_input: + visit_load_interpolated_input(ctx, instr); + break; + case nir_intrinsic_store_output: + visit_store_output(ctx, instr); + break; + case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: + visit_load_input(ctx, instr); + break; + case nir_intrinsic_load_per_vertex_input: + visit_load_per_vertex_input(ctx, instr); + break; + case nir_intrinsic_load_ubo: + visit_load_ubo(ctx, instr); + break; + case nir_intrinsic_load_push_constant: + visit_load_push_constant(ctx, instr); + break; + case nir_intrinsic_load_constant: + visit_load_constant(ctx, instr); + break; + case nir_intrinsic_vulkan_resource_index: + visit_load_resource(ctx, instr); + break; + case nir_intrinsic_discard: + visit_discard(ctx, instr); + break; + case nir_intrinsic_discard_if: + visit_discard_if(ctx, instr); + break; + case nir_intrinsic_load_shared: + visit_load_shared(ctx, instr); + break; + case nir_intrinsic_store_shared: + visit_store_shared(ctx, instr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + visit_shared_atomic(ctx, instr); + break; + case nir_intrinsic_image_deref_load: + visit_image_load(ctx, instr); + break; + case nir_intrinsic_image_deref_store: + visit_image_store(ctx, instr); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + visit_image_atomic(ctx, instr); + break; + case nir_intrinsic_image_deref_size: + visit_image_size(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + visit_load_ssbo(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_global: + visit_load_global(ctx, instr); + break; + case nir_intrinsic_store_global: + visit_store_global(ctx, instr); + break; + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + visit_global_atomic(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_scratch: + visit_load_scratch(ctx, instr); + break; + case nir_intrinsic_store_scratch: + visit_store_scratch(ctx, instr); + break; + case nir_intrinsic_get_buffer_size: + visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_control_barrier: { + unsigned* bsize = ctx->program->info->cs.block_size; + unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; + if (workgroup_size > ctx->program->wave_size) + bld.sopp(aco_opcode::s_barrier); + break; + } + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_memory_barrier(ctx, instr); + break; + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_load_num_work_groups: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); + emit_split_vector(ctx, dst, 3); + break; + } + case nir_intrinsic_load_local_invocation_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids))); + emit_split_vector(ctx, dst, 3); + break; + } + case nir_intrinsic_load_work_group_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + struct ac_arg *args = ctx->args->ac.workgroup_ids; + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u), + args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u), + args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u)); + emit_split_vector(ctx, dst, 3); + break; + } + case nir_intrinsic_load_local_invocation_index: { + Temp id = emit_mbcnt(ctx, bld.def(v1)); + + /* The tg_size bits [6:11] contain the subgroup id, + * we need this multiplied by the wave size, and then OR the thread id to it. + */ + if (ctx->program->wave_size == 64) { + /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */ + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), + get_arg(ctx, ctx->args->ac.tg_size)); + bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id); + } else { + /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ + Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); + bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id); + } + break; + } + case nir_intrinsic_load_subgroup_id: { + if (ctx->stage == compute_cs) { + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); + } else { + bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u)); + } + break; + } + case nir_intrinsic_load_subgroup_invocation: { + emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa))); + break; + } + case nir_intrinsic_load_num_subgroups: { + if (ctx->stage == compute_cs) + bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), + get_arg(ctx, ctx->args->ac.tg_size)); + else + bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u)); + break; + } + case nir_intrinsic_ballot: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Definition tmp = bld.def(dst.regClass()); + Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass()); + if (instr->src[0].ssa->bit_size == 1) { + assert(src.regClass() == bld.lm); + bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src); + } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { + bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src); + } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { + bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + if (dst.size() != bld.lm.size()) { + /* Wave32 with ballot size set to 64 */ + bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u)); + } + emit_wqm(ctx, tmp.getTemp(), dst); + break; + } + case nir_intrinsic_shuffle: + case nir_intrinsic_read_invocation: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->src[0].ssa->index]) { + emit_uniform_subgroup(ctx, instr, src); + } else { + Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); + if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index]) + tid = bld.as_uniform(tid); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (src.regClass() == v1) { + emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst); + } else if (src.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo)); + hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) { + assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid); + bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); + } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) { + assert(src.regClass() == bld.lm); + Temp tmp; + if (ctx->program->chip_class <= GFX7) + tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid); + else if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); + tmp = emit_extract_vector(ctx, tmp, 0, v1); + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); + emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + } + break; + } + case nir_intrinsic_load_sample_id: { + bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); + break; + } + case nir_intrinsic_load_sample_mask_in: { + visit_load_sample_mask_in(ctx, instr); + break; + } + case nir_intrinsic_read_first_invocation: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (src.regClass() == v1) { + emit_wqm(ctx, + bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), + dst); + } else if (src.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo)); + hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, + bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); + bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); + } else if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + } else if (src.regClass() == s2) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_vote_all: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp)); + bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); + break; + } + case nir_intrinsic_vote_any: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + + Temp tmp = bool_to_scalar_condition(ctx, src); + bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); + break; + } + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + nir_op op = (nir_op) nir_intrinsic_reduction_op(instr); + unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ? + nir_intrinsic_cluster_size(instr) : 0; + cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); + + if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) { + emit_uniform_subgroup(ctx, instr, src); + } else if (instr->dest.ssa.bit_size == 1) { + if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) + op = nir_op_iand; + else if (op == nir_op_iadd) + op = nir_op_ixor; + else if (op == nir_op_umax || op == nir_op_imax) + op = nir_op_ior; + assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor); + + switch (instr->intrinsic) { + case nir_intrinsic_reduce: + emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst); + break; + case nir_intrinsic_exclusive_scan: + emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst); + break; + case nir_intrinsic_inclusive_scan: + emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst); + break; + default: + assert(false); + } + } else if (cluster_size == 1) { + bld.copy(Definition(dst), src); + } else { + src = as_vgpr(ctx, src); + + ReduceOp reduce_op; + switch (op) { + #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break; + CASE(iadd) + CASE(imul) + CASE(fadd) + CASE(fmul) + CASE(imin) + CASE(umin) + CASE(fmin) + CASE(imax) + CASE(umax) + CASE(fmax) + CASE(iand) + CASE(ior) + CASE(ixor) + default: + unreachable("unknown reduction op"); + #undef CASE + } + + aco_opcode aco_op; + switch (instr->intrinsic) { + case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; + case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; + case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; + default: + unreachable("unknown reduce intrinsic"); + } + + aco_ptr reduce{create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, 5)}; + reduce->operands[0] = Operand(src); + // filled in by aco_reduce_assign.cpp, used internally as part of the + // reduce sequence + assert(dst.size() == 1 || dst.size() == 2); + reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear()); + reduce->operands[2] = Operand(v1.as_linear()); + + Temp tmp_dst = bld.tmp(dst.regClass()); + reduce->definitions[0] = Definition(tmp_dst); + reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally + reduce->definitions[2] = Definition(); + reduce->definitions[3] = Definition(scc, s1); + reduce->definitions[4] = Definition(); + reduce->reduce_op = reduce_op; + reduce->cluster_size = cluster_size; + ctx->block->instructions.emplace_back(std::move(reduce)); + + emit_wqm(ctx, tmp_dst, dst); + } + break; + } + case nir_intrinsic_quad_broadcast: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + } else { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + unsigned lane = nir_src_as_const_value(instr->src[1])->u32; + uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane); + + if (instr->dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + uint32_t half_mask = 0x11111111u << lane; + Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); + Temp tmp = bld.tmp(bld.lm); + bld.sop1(Builder::s_wqm, Definition(tmp), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 32) { + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst); + } else if (instr->dest.ssa.bit_size == 64) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + if (ctx->program->chip_class >= GFX8) { + lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl)); + hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl)); + } else { + lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl)); + hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl)); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + } + break; + } + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + break; + } + uint16_t dpp_ctrl = 0; + switch (instr->intrinsic) { + case nir_intrinsic_quad_swap_horizontal: + dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); + break; + case nir_intrinsic_quad_swap_vertical: + dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); + break; + case nir_intrinsic_quad_swap_diagonal: + dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); + break; + case nir_intrinsic_quad_swizzle_amd: + dpp_ctrl = nir_intrinsic_swizzle_mask(instr); + break; + default: + break; + } + if (ctx->program->chip_class < GFX8) + dpp_ctrl |= (1 << 15); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (instr->dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + if (ctx->program->chip_class >= GFX8) + src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + else + src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 32) { + Temp tmp; + if (ctx->program->chip_class >= GFX8) + tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + else + tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 64) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + if (ctx->program->chip_class >= GFX8) { + lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl)); + hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl)); + } else { + lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl)); + hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl)); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_masked_swizzle_amd: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + break; + } + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + if (dst.regClass() == v1) { + emit_wqm(ctx, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false), + dst); + } else if (dst.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false)); + hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_write_invocation_amd: { + Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); + Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (dst.regClass() == v1) { + /* src2 is ignored for writelane. RA assigns the same reg for dst */ + emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst); + } else if (dst.regClass() == v2) { + Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1); + Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src); + bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); + Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi)); + Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_mbcnt_amd: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + RegClass rc = RegClass(src.type(), 1); + Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi)); + emit_wqm(ctx, wqm_tmp, dst); + break; + } + case nir_intrinsic_load_helper_invocation: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.pseudo(aco_opcode::p_load_helper, Definition(dst)); + ctx->block->kind |= block_kind_needs_lowering; + ctx->program->needs_exact = true; + break; + } + case nir_intrinsic_is_helper_invocation: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.pseudo(aco_opcode::p_is_helper, Definition(dst)); + ctx->block->kind |= block_kind_needs_lowering; + ctx->program->needs_exact = true; + break; + } + case nir_intrinsic_demote: + bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u)); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; + ctx->block->kind |= block_kind_uses_demote; + ctx->program->needs_exact = true; + break; + case nir_intrinsic_demote_if: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + assert(src.regClass() == bld.lm); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + bld.pseudo(aco_opcode::p_demote_to_helper, cond); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; + ctx->block->kind |= block_kind_uses_demote; + ctx->program->needs_exact = true; + break; + } + case nir_intrinsic_first_invocation: { + emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)), + get_ssa_temp(ctx, &instr->dest.ssa)); + break; + } + case nir_intrinsic_shader_clock: + bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); + break; + case nir_intrinsic_load_vertex_id_zero_base: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); + break; + } + case nir_intrinsic_load_first_vertex: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex)); + break; + } + case nir_intrinsic_load_base_instance: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance)); + break; + } + case nir_intrinsic_load_instance_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id)); + break; + } + case nir_intrinsic_load_draw_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id)); + break; + } + case nir_intrinsic_load_invocation_id: { + assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (ctx->options->chip_class >= GFX10) + bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + else + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + break; + } + case nir_intrinsic_load_primitive_id: { + assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); + break; + } + case nir_intrinsic_emit_vertex_with_counter: { + visit_emit_vertex_with_counter(ctx, instr); + break; + } + case nir_intrinsic_end_primitive_with_counter: { + unsigned stream = nir_intrinsic_stream_id(instr); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream)); + break; + } + case nir_intrinsic_set_vertex_count: { + /* unused, the HW keeps track of this for us */ + break; + } + default: + fprintf(stderr, "Unimplemented intrinsic instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + + break; + } +} + + +void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, + Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr, + enum glsl_base_type *stype) +{ + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + int plane = -1; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_plane: + plane = nir_src_as_int(instr->src[i].src); + break; + default: + break; + } + } + + *stype = glsl_get_sampler_result_type(texture_deref_instr->type); + + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + if (plane >= 0) { + assert(instr->op != nir_texop_txf_ms && + instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false); + } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false); + } else if (instr->op == nir_texop_fragment_mask_fetch) { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false); + } else { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false); + } + if (samp_ptr) { + *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false); + + if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) { + /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */ + Builder bld(ctx->program, ctx->block); + + /* to avoid unnecessary moves, we split and recombine sampler and image */ + Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), + bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; + Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; + bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]), + Definition(img[2]), Definition(img[3]), Definition(img[4]), + Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr); + bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]), + Definition(samp[2]), Definition(samp[3]), *samp_ptr); + + samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]); + *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), + img[0], img[1], img[2], img[3], + img[4], img[5], img[6], img[7]); + *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + samp[0], samp[1], samp[2], samp[3]); + } + } + if (fmask_ptr && (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) + *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false); +} + +void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, + Temp *out_ma, Temp *out_sc, Temp *out_tc) +{ + Builder bld(ctx->program, ctx->block); + + Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1); + Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1); + Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1); + + Operand neg_one(0xbf800000u); + Operand one(0x3f800000u); + Operand two(0x40000000u); + Operand four(0x40800000u); + + Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); + Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); + Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); + + Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); + Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id); + is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); + + // select sc + Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); + Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), + one, is_ma_y); + *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); + + // select tc + tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y); + sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y); + *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); + + // select ma + tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y), + deriv_z, is_ma_z); + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp); + *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); +} + +void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) +{ + Builder bld(ctx->program, ctx->block); + Temp ma, tc, sc, id; + + if (is_array) { + coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]); + + // see comment in ac_prepare_cube_coords() + if (ctx->options->chip_class <= GFX8) + coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]); + } + + ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); + + aco_ptr vop3a{create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; + vop3a->operands[0] = Operand(ma); + vop3a->abs[0] = true; + Temp invma = bld.tmp(v1); + vop3a->definitions[0] = Definition(invma); + ctx->block->instructions.emplace_back(std::move(vop3a)); + + sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); + if (!is_deriv) + sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/)); + + tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); + if (!is_deriv) + tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/)); + + id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); + + if (is_deriv) { + sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma); + tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); + + for (unsigned i = 0; i < 2; i++) { + // see comment in ac_prepare_cube_coords() + Temp deriv_ma; + Temp deriv_sc, deriv_tc; + build_cube_select(ctx, ma, id, i ? *ddy : *ddx, + &deriv_ma, &deriv_sc, &deriv_tc); + + deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); + + Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); + Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); + *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); + } + + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc); + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc); + } + + if (is_array) + id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/)); + coords.resize(3); + coords[0] = sc; + coords[1] = tc; + coords[2] = id; +} + +void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4]) +{ + if (vec->parent_instr->type != nir_instr_type_alu) + return; + nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr); + if (vec_instr->op != nir_op_vec(vec->num_components)) + return; + + for (unsigned i = 0; i < vec->num_components; i++) { + cv[i] = vec_instr->src[i].swizzle[0] == 0 ? + nir_src_as_const_value(vec_instr->src[i].src) : NULL; + } +} + +void visit_tex(isel_context *ctx, nir_tex_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; + Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(); + std::vector coords; + std::vector derivs; + nir_const_value *sample_index_cv = NULL; + nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL}; + enum glsl_base_type stype; + tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype); + + bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && + (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); + bool tg4_integer_cube_workaround = tg4_integer_workarounds && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa); + for (unsigned i = 0; i < coord.size(); i++) + coords.emplace_back(emit_extract_vector(ctx, coord, i, v1)); + break; + } + case nir_tex_src_bias: + if (instr->op == nir_texop_txb) { + bias = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_bias = true; + } + break; + case nir_tex_src_lod: { + nir_const_value *val = nir_src_as_const_value(instr->src[i].src); + + if (val && val->f32 <= 0.0) { + level_zero = true; + } else { + lod = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_lod = true; + } + break; + } + case nir_tex_src_comparator: + if (instr->is_shadow) { + compare = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_compare = true; + } + break; + case nir_tex_src_offset: + offset = get_ssa_temp(ctx, instr->src[i].src.ssa); + get_const_vec(instr->src[i].src.ssa, const_offset); + has_offset = true; + break; + case nir_tex_src_ddx: + ddx = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_ddx = true; + break; + case nir_tex_src_ddy: + ddy = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_ddy = true; + break; + case nir_tex_src_ms_index: + sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa); + sample_index_cv = nir_src_as_const_value(instr->src[i].src); + has_sample_index = true; + break; + case nir_tex_src_texture_offset: + case nir_tex_src_sampler_offset: + default: + break; + } + } + + if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true); + + if (instr->op == nir_texop_texture_samples) { + Temp dword3 = emit_extract_vector(ctx, resource, 3, s1); + + Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); + Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); + Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); + Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); + + bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + samples, Operand(1u), bld.scc(is_msaa)); + return; + } + + if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + aco_ptr tmp_instr; + Temp acc, pack = Temp(); + + uint32_t pack_const = 0; + for (unsigned i = 0; i < offset.size(); i++) { + if (!const_offset[i]) + continue; + pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i); + } + + if (offset.type() == RegType::sgpr) { + for (unsigned i = 0; i < offset.size(); i++) { + if (const_offset[i]) + continue; + + acc = emit_extract_vector(ctx, offset, i, s1); + acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu)); + + if (i) { + acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i)); + } + + if (pack == Temp()) { + pack = acc; + } else { + pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc); + } + } + + if (pack_const && pack != Temp()) + pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack); + } else { + for (unsigned i = 0; i < offset.size(); i++) { + if (const_offset[i]) + continue; + + acc = emit_extract_vector(ctx, offset, i, v1); + acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc); + + if (i) { + acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc); + } + + if (pack == Temp()) { + pack = acc; + } else { + pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc); + } + } + + if (pack_const && pack != Temp()) + pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack); + } + if (pack_const && pack == Temp()) + offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const)); + else if (pack == Temp()) + has_offset = false; + else + offset = pack; + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components) + prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod); + + /* pack derivatives */ + if (has_ddx || has_ddy) { + if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) { + assert(has_ddx && has_ddy && ddy.size() == 1 && ddy.size() == 1); + Temp zero = bld.copy(bld.def(v1), Operand(0u)); + derivs = {ddx, zero, ddy, zero}; + } else { + for (unsigned i = 0; has_ddx && i < ddx.size(); i++) + derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1)); + for (unsigned i = 0; has_ddy && i < ddy.size(); i++) + derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1)); + } + has_derivs = true; + } + + if (instr->coord_components > 1 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && + instr->op != nir_texop_txf) + coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]); + + if (instr->coord_components > 2 && + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && + instr->op != nir_texop_txf && + instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) + coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]); + + if (ctx->options->chip_class == GFX9 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->op != nir_texop_lod && instr->coord_components) { + assert(coords.size() > 0 && coords.size() < 3); + + coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ? + Operand((uint32_t) 0) : + Operand((uint32_t) 0x3f000000))); + } + + bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); + + if (instr->op == nir_texop_samples_identical) + resource = fmask_ptr; + + else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->op != nir_texop_txs && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { + assert(has_sample_index); + Operand op(sample_index); + if (sample_index_cv) + op = Operand(sample_index_cv->u32); + sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr); + } + + if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) { + Temp off = emit_extract_vector(ctx, offset, i, v1); + coords[i] = bld.vadd32(bld.def(v1), coords[i], off); + } + has_offset = false; + } + + /* Build tex instruction */ + unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF + ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) + : 0; + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp tmp_dst = dst; + + /* gather4 selects the component by dmask and always returns vec4 */ + if (instr->op == nir_texop_tg4) { + assert(instr->dest.ssa.num_components == 4); + if (instr->is_shadow) + dmask = 1; + else + dmask = 1 << instr->component; + if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr) + tmp_dst = bld.tmp(v4); + } else if (instr->op == nir_texop_samples_identical) { + tmp_dst = bld.tmp(v1); + } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { + tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask))); + } + + aco_ptr tex; + if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) { + if (!has_lod) + lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + + bool div_by_6 = instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array && + (dmask & (1 << 2)); + if (tmp_dst.id() == dst.id() && div_by_6) + tmp_dst = bld.tmp(tmp_dst.regClass()); + + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(as_vgpr(ctx,lod)); + if (ctx->options->chip_class == GFX9 && + instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array) { + tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); + } else if (instr->op == nir_texop_query_levels) { + tex->dmask = 1 << 3; + } else { + tex->dmask = dmask; + } + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->dim = dim; + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (div_by_6) { + /* divide 3rd value by 6 by multiplying with magic number */ + emit_split_vector(ctx, tmp_dst, tmp_dst.size()); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c); + assert(instr->dest.ssa.num_components == 3); + Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), + emit_extract_vector(ctx, tmp_dst, 0, v1), + emit_extract_vector(ctx, tmp_dst, 1, v1), + by_6); + + } + + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); + return; + } + + Temp tg4_compare_cube_wa64 = Temp(); + + if (tg4_integer_workarounds) { + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + tex->dim = dim; + tex->dmask = 0x3; + tex->da = da; + Temp size = bld.tmp(v2); + tex->definitions[0] = Definition(size); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + emit_split_vector(ctx, size, size.size()); + + Temp half_texel[2]; + for (unsigned i = 0; i < 2; i++) { + half_texel[i] = emit_extract_vector(ctx, size, i, v1); + half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); + half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); + half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]); + } + + Temp new_coords[2] = { + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1]) + }; + + if (tg4_integer_cube_workaround) { + // see comment in ac_nir_to_llvm.c's lower_gather4_integer() + Temp desc[resource.size()]; + aco_ptr split{create_instruction(aco_opcode::p_split_vector, + Format::PSEUDO, 1, resource.size())}; + split->operands[0] = Operand(resource); + for (unsigned i = 0; i < resource.size(); i++) { + desc[i] = bld.tmp(s1); + split->definitions[i] = Definition(desc[i]); + } + ctx->block->instructions.emplace_back(std::move(split)); + + Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16))); + Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, + Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8)); + + Temp nfmt; + if (stype == GLSL_TYPE_UINT) { + nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), + bld.scc(compare_cube_wa)); + } else { + nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), + bld.scc(compare_cube_wa)); + } + tg4_compare_cube_wa64 = bld.tmp(bld.lm); + bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); + + nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u)); + + desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1], + Operand((uint32_t)C_008F14_NUM_FORMAT)); + desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, resource.size(), 1)}; + for (unsigned i = 0; i < resource.size(); i++) + vec->operands[i] = Operand(desc[i]); + resource = bld.tmp(resource.regClass()); + vec->definitions[0] = Definition(resource); + ctx->block->instructions.emplace_back(std::move(vec)); + + new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + new_coords[0], coords[0], tg4_compare_cube_wa64); + new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + new_coords[1], coords[1], tg4_compare_cube_wa64); + } + coords[0] = new_coords[0]; + coords[1] = new_coords[1]; + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe() + + assert(coords.size() == 1); + unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa)); + aco_opcode op; + switch (last_bit) { + case 1: + op = aco_opcode::buffer_load_format_x; break; + case 2: + op = aco_opcode::buffer_load_format_xy; break; + case 3: + op = aco_opcode::buffer_load_format_xyz; break; + case 4: + op = aco_opcode::buffer_load_format_xyzw; break; + default: + unreachable("Tex instruction loads more than 4 components."); + } + + /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */ + if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp_dst = dst; + else + tmp_dst = bld.tmp(RegType::vgpr, last_bit); + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(coords[0]); + mubuf->operands[2] = Operand((uint32_t) 0); + mubuf->definitions[0] = Definition(tmp_dst); + mubuf->idxen = true; + mubuf->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1); + return; + } + + /* gather MIMG address components */ + std::vector args; + if (has_offset) + args.emplace_back(offset); + if (has_bias) + args.emplace_back(bias); + if (has_compare) + args.emplace_back(compare); + if (has_derivs) + args.insert(args.end(), derivs.begin(), derivs.end()); + + args.insert(args.end(), coords.begin(), coords.end()); + if (has_sample_index) + args.emplace_back(sample_index); + if (has_lod) + args.emplace_back(lod); + + Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size())); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; + vec->definitions[0] = Definition(arg); + for (unsigned i = 0; i < args.size(); i++) + vec->operands[i] = Operand(args[i]); + ctx->block->instructions.emplace_back(std::move(vec)); + + + if (instr->op == nir_texop_txf || + instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical || + instr->op == nir_texop_fragment_fetch || + instr->op == nir_texop_fragment_mask_fetch) { + aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; + tex.reset(create_instruction(op, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(arg); + tex->dim = dim; + tex->dmask = dmask; + tex->unrm = true; + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (instr->op == nir_texop_samples_identical) { + assert(dmask == 1 && dst.regClass() == v1); + assert(dst.id() != tmp_dst.id()); + + Temp tmp = bld.tmp(bld.lm); + bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp); + + } else { + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); + } + return; + } + + // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. + aco_opcode opcode = aco_opcode::image_sample; + if (has_offset) { /* image_sample_*_o */ + if (has_compare) { + opcode = aco_opcode::image_sample_c_o; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_o; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_o; + if (level_zero) + opcode = aco_opcode::image_sample_c_lz_o; + if (has_lod) + opcode = aco_opcode::image_sample_c_l_o; + } else { + opcode = aco_opcode::image_sample_o; + if (has_derivs) + opcode = aco_opcode::image_sample_d_o; + if (has_bias) + opcode = aco_opcode::image_sample_b_o; + if (level_zero) + opcode = aco_opcode::image_sample_lz_o; + if (has_lod) + opcode = aco_opcode::image_sample_l_o; + } + } else { /* no offset */ + if (has_compare) { + opcode = aco_opcode::image_sample_c; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d; + if (has_bias) + opcode = aco_opcode::image_sample_c_b; + if (level_zero) + opcode = aco_opcode::image_sample_c_lz; + if (has_lod) + opcode = aco_opcode::image_sample_c_l; + } else { + opcode = aco_opcode::image_sample; + if (has_derivs) + opcode = aco_opcode::image_sample_d; + if (has_bias) + opcode = aco_opcode::image_sample_b; + if (level_zero) + opcode = aco_opcode::image_sample_lz; + if (has_lod) + opcode = aco_opcode::image_sample_l; + } + } + + if (instr->op == nir_texop_tg4) { + if (has_offset) { + opcode = aco_opcode::image_gather4_lz_o; + if (has_compare) + opcode = aco_opcode::image_gather4_c_lz_o; + } else { + opcode = aco_opcode::image_gather4_lz; + if (has_compare) + opcode = aco_opcode::image_gather4_c_lz; + } + } else if (instr->op == nir_texop_lod) { + opcode = aco_opcode::image_get_lod; + } + + /* we don't need the bias, sample index, compare value or offset to be + * computed in WQM but if the p_create_vector copies the coordinates, then it + * needs to be in WQM */ + if (ctx->stage == fragment_fs && + !has_derivs && !has_lod && !level_zero && + instr->sampler_dim != GLSL_SAMPLER_DIM_MS && + instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS) + arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true); + + tex.reset(create_instruction(opcode, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(sampler); + tex->operands[2] = Operand(arg); + tex->dim = dim; + tex->dmask = dmask; + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (tg4_integer_cube_workaround) { + assert(tmp_dst.id() != dst.id()); + assert(tmp_dst.size() == dst.size() && dst.size() == 4); + + emit_split_vector(ctx, tmp_dst, tmp_dst.size()); + Temp val[4]; + for (unsigned i = 0; i < dst.size(); i++) { + val[i] = emit_extract_vector(ctx, tmp_dst, i, v1); + Temp cvt_val; + if (stype == GLSL_TYPE_UINT) + cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); + else + cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); + val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64); + } + Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), + val[0], val[1], val[2], val[3]); + } + unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask; + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); + +} + + +Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa) +{ + Temp tmp = get_ssa_temp(ctx, ssa); + if (ssa->parent_instr->type == nir_instr_type_ssa_undef) + return Operand(tmp.regClass()); + else + return Operand(tmp); +} + +void visit_phi(isel_context *ctx, nir_phi_instr *instr) +{ + aco_ptr phi; + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); + + bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; + logical |= ctx->block->kind & block_kind_merge; + aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi; + + /* we want a sorted list of sources, since the predecessor list is also sorted */ + std::map phi_src; + nir_foreach_phi_src(src, instr) + phi_src[src->pred->index] = src->src.ssa; + + std::vector& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; + unsigned num_operands = 0; + Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())]; + unsigned num_defined = 0; + unsigned cur_pred_idx = 0; + for (std::pair src : phi_src) { + if (cur_pred_idx < preds.size()) { + /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */ + unsigned block = ctx->cf_info.nir_to_aco[src.first]; + unsigned skipped = 0; + while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block) + skipped++; + if (cur_pred_idx + skipped < preds.size()) { + for (unsigned i = 0; i < skipped; i++) + operands[num_operands++] = Operand(dst.regClass()); + cur_pred_idx += skipped; + } else { + continue; + } + } + /* Handle missing predecessors at the end. This shouldn't happen with loop + * headers and we can't ignore these sources for loop header phis. */ + if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) + continue; + cur_pred_idx++; + Operand op = get_phi_operand(ctx, src.second); + operands[num_operands++] = op; + num_defined += !op.isUndefined(); + } + /* handle block_kind_continue_or_break at loop exit blocks */ + while (cur_pred_idx++ < preds.size()) + operands[num_operands++] = Operand(dst.regClass()); + + if (num_defined == 0) { + Builder bld(ctx->program, ctx->block); + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u)); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u)); + } else { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand(0u); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } + return; + } + + /* we can use a linear phi in some cases if one src is undef */ + if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) { + phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1)); + + Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; + Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]]; + assert(invert->kind & block_kind_invert); + + unsigned then_block = invert->linear_preds[0]; + + Block* insert_block = NULL; + for (unsigned i = 0; i < num_operands; i++) { + Operand op = operands[i]; + if (op.isUndefined()) + continue; + insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block; + phi->operands[0] = op; + break; + } + assert(insert_block); /* should be handled by the "num_defined == 0" case above */ + phi->operands[1] = Operand(dst.regClass()); + phi->definitions[0] = Definition(dst); + insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi)); + return; + } + + /* try to scalarize vector phis */ + if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) { + // TODO: scalarize linear phis on divergent ifs + bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge)); + std::array new_vec; + for (unsigned i = 0; can_scalarize && (i < num_operands); i++) { + Operand src = operands[i]; + if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) + can_scalarize = false; + } + if (can_scalarize) { + unsigned num_components = instr->dest.ssa.num_components; + assert(dst.size() % num_components == 0); + RegClass rc = RegClass(dst.type(), dst.size() / num_components); + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned k = 0; k < num_components; k++) { + phi.reset(create_instruction(opcode, Format::PSEUDO, num_operands, 1)); + for (unsigned i = 0; i < num_operands; i++) { + Operand src = operands[i]; + phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc); + } + Temp phi_dst = {ctx->program->allocateId(), rc}; + phi->definitions[0] = Definition(phi_dst); + ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); + new_vec[k] = phi_dst; + vec->operands[k] = Operand(phi_dst); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), new_vec); + return; + } + } + + phi.reset(create_instruction(opcode, Format::PSEUDO, num_operands, 1)); + for (unsigned i = 0; i < num_operands; i++) + phi->operands[i] = operands[i]; + phi->definitions[0] = Definition(dst); + ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); +} + + +void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->def); + + assert(dst.type() == RegType::sgpr); + + if (dst.size() == 1) { + Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u)); + } else { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand(0u); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +void visit_jump(isel_context *ctx, nir_jump_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Block *logical_target; + append_logical_end(ctx->block); + unsigned idx = ctx->block->index; + + switch (instr->type) { + case nir_jump_break: + logical_target = ctx->cf_info.parent_loop.exit; + add_logical_edge(idx, logical_target); + ctx->block->kind |= block_kind_break; + + if (!ctx->cf_info.parent_if.is_divergent && + !ctx->cf_info.parent_loop.has_divergent_continue) { + /* uniform break - directly jump out of the loop */ + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(idx, logical_target); + return; + } + ctx->cf_info.parent_loop.has_divergent_branch = true; + ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index; + break; + case nir_jump_continue: + logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; + add_logical_edge(idx, logical_target); + ctx->block->kind |= block_kind_continue; + + if (ctx->cf_info.parent_if.is_divergent) { + /* for potential uniform breaks after this continue, + we must ensure that they are handled correctly */ + ctx->cf_info.parent_loop.has_divergent_continue = true; + ctx->cf_info.parent_loop.has_divergent_branch = true; + ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index; + } else { + /* uniform continue - directly jump to the loop header */ + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(idx, logical_target); + return; + } + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) { + ctx->cf_info.exec_potentially_empty_break = true; + ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth; + } + + /* remove critical edges from linear CFG */ + bld.branch(aco_opcode::p_branch); + Block* break_block = ctx->program->create_and_insert_block(); + break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + break_block->kind |= block_kind_uniform; + add_linear_edge(idx, break_block); + /* the loop_header pointer might be invalidated by this point */ + if (instr->type == nir_jump_continue) + logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; + add_linear_edge(break_block->index, logical_target); + bld.reset(break_block); + bld.branch(aco_opcode::p_branch); + + Block* continue_block = ctx->program->create_and_insert_block(); + continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_linear_edge(idx, continue_block); + append_logical_start(continue_block); + ctx->block = continue_block; + return; +} + +void visit_block(isel_context *ctx, nir_block *block) +{ + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu_instr(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_deref: + break; + case nir_instr_type_jump: + visit_jump(ctx, nir_instr_as_jump(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + //abort(); + } + } + + if (!ctx->cf_info.parent_loop.has_divergent_branch) + ctx->cf_info.nir_to_aco[block->index] = ctx->block->index; +} + + + +static void visit_loop(isel_context *ctx, nir_loop *loop) +{ + //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; + Builder bld(ctx->program, ctx->block); + bld.branch(aco_opcode::p_branch); + unsigned loop_preheader_idx = ctx->block->index; + + Block loop_exit = Block(); + loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth; + loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level)); + + Block* loop_header = ctx->program->create_and_insert_block(); + loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1; + loop_header->kind |= block_kind_loop_header; + add_edge(loop_preheader_idx, loop_header); + ctx->block = loop_header; + + /* emit loop body */ + unsigned loop_header_idx = loop_header->index; + loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit); + append_logical_start(ctx->block); + bool unreachable = visit_cf_list(ctx, &loop->body); + + //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken? + if (!ctx->cf_info.has_branch) { + append_logical_end(ctx->block); + if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) { + /* Discards can result in code running with an empty exec mask. + * This would result in divergent breaks not ever being taken. As a + * workaround, break the loop when the loop mask is empty instead of + * always continuing. */ + ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform); + unsigned block_idx = ctx->block->index; + + /* create helper blocks to avoid critical edges */ + Block *break_block = ctx->program->create_and_insert_block(); + break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + break_block->kind = block_kind_uniform; + bld.reset(break_block); + bld.branch(aco_opcode::p_branch); + add_linear_edge(block_idx, break_block); + add_linear_edge(break_block->index, &loop_exit); + + Block *continue_block = ctx->program->create_and_insert_block(); + continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + continue_block->kind = block_kind_uniform; + bld.reset(continue_block); + bld.branch(aco_opcode::p_branch); + add_linear_edge(block_idx, continue_block); + add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]); + + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]); + ctx->block = &ctx->program->blocks[block_idx]; + } else { + ctx->block->kind |= (block_kind_continue | block_kind_uniform); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); + else + add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); + } + + bld.reset(ctx->block); + bld.branch(aco_opcode::p_branch); + } + + /* Fixup phis in loop header from unreachable blocks. + * has_branch/has_divergent_branch also indicates if the loop ends with a + * break/continue instruction, but we don't emit those if unreachable=true */ + if (unreachable) { + assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch); + bool linear = ctx->cf_info.has_branch; + bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch; + for (aco_ptr& instr : ctx->program->blocks[loop_header_idx].instructions) { + if ((logical && instr->opcode == aco_opcode::p_phi) || + (linear && instr->opcode == aco_opcode::p_linear_phi)) { + /* the last operand should be the one that needs to be removed */ + instr->operands.pop_back(); + } else if (!is_phi(instr)) { + break; + } + } + } + + ctx->cf_info.has_branch = false; + + // TODO: if the loop has not a single exit, we must add one °° + /* emit loop successor block */ + ctx->block = ctx->program->insert_block(std::move(loop_exit)); + append_logical_start(ctx->block); + + #if 0 + // TODO: check if it is beneficial to not branch on continues + /* trim linear phis in loop header */ + for (auto&& instr : loop_entry->instructions) { + if (instr->opcode == aco_opcode::p_linear_phi) { + aco_ptr new_phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)}; + new_phi->definitions[0] = instr->definitions[0]; + for (unsigned i = 0; i < new_phi->operands.size(); i++) + new_phi->operands[i] = instr->operands[i]; + /* check that the remaining operands are all the same */ + for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++) + assert(instr->operands[i].tempId() == instr->operands.back().tempId()); + instr.swap(new_phi); + } else if (instr->opcode == aco_opcode::p_phi) { + continue; + } else { + break; + } + } + #endif +} + +static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond) +{ + ic->cond = cond; + + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_branch; + + /* branch to linear then block */ + assert(cond.regClass() == ctx->program->lane_mask); + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(cond); + ctx->block->instructions.push_back(std::move(branch)); + + ic->BB_if_idx = ctx->block->index; + ic->BB_invert = Block(); + ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth; + /* Invert blocks are intentionally not marked as top level because they + * are not part of the logical cfg. */ + ic->BB_invert.kind |= block_kind_invert; + ic->BB_endif = Block(); + ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; + ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level)); + + ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard; + ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break; + ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth; + ic->divergent_old = ctx->cf_info.parent_if.is_divergent; + ctx->cf_info.parent_if.is_divergent = true; + + /* divergent branches use cbranch_execz */ + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + + /** emit logical then block */ + Block* BB_then_logical = ctx->program->create_and_insert_block(); + BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(ic->BB_if_idx, BB_then_logical); + ctx->block = BB_then_logical; + append_logical_start(BB_then_logical); +} + +static void begin_divergent_if_else(isel_context *ctx, if_context *ic) +{ + Block *BB_then_logical = ctx->block; + append_logical_end(BB_then_logical); + /* branch from logical then block to invert block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then_logical->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then_logical->index, &ic->BB_invert); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_then_logical->index, &ic->BB_endif); + BB_then_logical->kind |= block_kind_uniform; + assert(!ctx->cf_info.has_branch); + ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit linear then block */ + Block* BB_then_linear = ctx->program->create_and_insert_block(); + BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_then_linear->kind |= block_kind_uniform; + add_linear_edge(ic->BB_if_idx, BB_then_linear); + /* branch from linear then block to invert block */ + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then_linear->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then_linear->index, &ic->BB_invert); + + /** emit invert merge block */ + ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); + ic->invert_idx = ctx->block->index; + + /* branch to linear else block (skip else) */ + branch.reset(create_instruction(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(ic->cond); + ctx->block->instructions.push_back(std::move(branch)); + + ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; + ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; + ic->exec_potentially_empty_break_depth_old = + std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + /* divergent branches use cbranch_execz */ + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + + /** emit logical else block */ + Block* BB_else_logical = ctx->program->create_and_insert_block(); + BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_logical_edge(ic->BB_if_idx, BB_else_logical); + add_linear_edge(ic->invert_idx, BB_else_logical); + ctx->block = BB_else_logical; + append_logical_start(BB_else_logical); +} + +static void end_divergent_if(isel_context *ctx, if_context *ic) +{ + Block *BB_else_logical = ctx->block; + append_logical_end(BB_else_logical); + + /* branch from logical else block to endif block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else_logical->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else_logical->index, &ic->BB_endif); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_else_logical->index, &ic->BB_endif); + BB_else_logical->kind |= block_kind_uniform; + + assert(!ctx->cf_info.has_branch); + ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; + + + /** emit linear else block */ + Block* BB_else_linear = ctx->program->create_and_insert_block(); + BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_else_linear->kind |= block_kind_uniform; + add_linear_edge(ic->invert_idx, BB_else_linear); + + /* branch from linear else block to endif block */ + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else_linear->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else_linear->index, &ic->BB_endif); + + + /** emit endif merge block */ + ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); + append_logical_start(ctx->block); + + + ctx->cf_info.parent_if.is_divergent = ic->divergent_old; + ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; + ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; + ctx->cf_info.exec_potentially_empty_break_depth = + std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth && + !ctx->cf_info.parent_if.is_divergent) { + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + } + /* uniform control flow never has an empty exec-mask */ + if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) { + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + } +} + +static bool visit_if(isel_context *ctx, nir_if *if_stmt) +{ + Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); + Builder bld(ctx->program, ctx->block); + aco_ptr branch; + + if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ + /** + * Uniform conditionals are represented in the following way*) : + * + * The linear and logical CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_ELSE (logical) + * \ / + * BB_ENDIF + * + * *) Exceptions may be due to break and continue statements within loops + * If a break/continue happens within uniform control flow, it branches + * to the loop exit/entry block. Otherwise, it branches to the next + * merge block. + **/ + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_uniform; + + /* emit branch */ + assert(cond.regClass() == bld.lm); + // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction + cond = bool_to_scalar_condition(ctx, cond); + + branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(cond); + branch->operands[0].setFixed(scc); + ctx->block->instructions.emplace_back(std::move(branch)); + + unsigned BB_if_idx = ctx->block->index; + Block BB_endif = Block(); + BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_endif.kind |= ctx->block->kind & block_kind_top_level; + + /** emit then block */ + Block* BB_then = ctx->program->create_and_insert_block(); + BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(BB_if_idx, BB_then); + append_logical_start(BB_then); + ctx->block = BB_then; + visit_cf_list(ctx, &if_stmt->then_list); + BB_then = ctx->block; + bool then_branch = ctx->cf_info.has_branch; + bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; + + if (!then_branch) { + append_logical_end(BB_then); + /* branch from then block to endif block */ + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then->index, &BB_endif); + if (!then_branch_divergent) + add_logical_edge(BB_then->index, &BB_endif); + BB_then->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit else block */ + Block* BB_else = ctx->program->create_and_insert_block(); + BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(BB_if_idx, BB_else); + append_logical_start(BB_else); + ctx->block = BB_else; + visit_cf_list(ctx, &if_stmt->else_list); + BB_else = ctx->block; + + if (!ctx->cf_info.has_branch) { + append_logical_end(BB_else); + /* branch from then block to endif block */ + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else->index, &BB_endif); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_else->index, &BB_endif); + BB_else->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch &= then_branch; + ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent; + + /** emit endif merge block */ + if (!ctx->cf_info.has_branch) { + ctx->block = ctx->program->insert_block(std::move(BB_endif)); + append_logical_start(ctx->block); + } + } else { /* non-uniform condition */ + /** + * To maintain a logical and linear CFG without critical edges, + * non-uniform conditionals are represented in the following way*) : + * + * The linear CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_THEN (linear) + * \ / + * BB_INVERT (linear) + * / \ + * BB_ELSE (logical) BB_ELSE (linear) + * \ / + * BB_ENDIF + * + * The logical CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_ELSE (logical) + * \ / + * BB_ENDIF + * + * *) Exceptions may be due to break and continue statements within loops + **/ + + if_context ic; + + begin_divergent_if_then(ctx, &ic, cond); + visit_cf_list(ctx, &if_stmt->then_list); + + begin_divergent_if_else(ctx, &ic); + visit_cf_list(ctx, &if_stmt->else_list); + + end_divergent_if(ctx, &ic); + } + + return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); +} + +static bool visit_cf_list(isel_context *ctx, + struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + if (!visit_if(ctx, nir_cf_node_as_if(node))) + return true; + break; + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + default: + unreachable("unimplemented cf list type"); + } + } + return false; +} + +static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +{ + int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; + uint64_t mask = ctx->outputs.mask[slot]; + if (!is_pos && !mask) + return; + if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) + return; + aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->enabled_mask = mask; + for (unsigned i = 0; i < 4; ++i) { + if (mask & (1 << i)) + exp->operands[i] = Operand(ctx->outputs.outputs[slot][i]); + else + exp->operands[i] = Operand(v1); + } + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. + */ + exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0; + exp->done = false; + exp->compressed = false; + if (is_pos) + exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; + else + exp->dest = V_008DFC_SQ_EXP_PARAM + offset; + ctx->block->instructions.emplace_back(std::move(exp)); +} + +static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) +{ + aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->enabled_mask = 0; + for (unsigned i = 0; i < 4; ++i) + exp->operands[i] = Operand(v1); + if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) { + exp->operands[0] = Operand(ctx->outputs.outputs[VARYING_SLOT_PSIZ][0]); + exp->enabled_mask |= 0x1; + } + if (ctx->outputs.mask[VARYING_SLOT_LAYER]) { + exp->operands[2] = Operand(ctx->outputs.outputs[VARYING_SLOT_LAYER][0]); + exp->enabled_mask |= 0x4; + } + if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) { + if (ctx->options->chip_class < GFX9) { + exp->operands[3] = Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0]); + exp->enabled_mask |= 0x8; + } else { + Builder bld(ctx->program, ctx->block); + + Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), + Operand(ctx->outputs.outputs[VARYING_SLOT_VIEWPORT][0])); + if (exp->operands[2].isTemp()) + out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]); + + exp->operands[2] = Operand(out); + exp->enabled_mask |= 0x4; + } + } + exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0; + exp->done = false; + exp->compressed = false; + exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; + ctx->block->instructions.emplace_back(std::move(exp)); +} + +static void create_vs_exports(isel_context *ctx) +{ + radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + + if (outinfo->export_prim_id) { + ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; + ctx->outputs.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id); + } + + if (ctx->options->key.has_multiview_view_index) { + ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1; + ctx->outputs.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); + } + + /* the order these position exports are created is important */ + int next_pos = 0; + export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); + if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { + export_vs_psiz_layer_viewport(ctx, &next_pos); + } + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); + + if (ctx->export_clip_dists) { + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); + } + + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { + if (i < VARYING_SLOT_VAR0 && + i != VARYING_SLOT_LAYER && + i != VARYING_SLOT_PRIMITIVE_ID && + i != VARYING_SLOT_VIEWPORT) + continue; + + export_vs_varying(ctx, i, false, NULL); + } +} + +static void export_fs_mrt_z(isel_context *ctx) +{ + Builder bld(ctx->program, ctx->block); + unsigned enabled_channels = 0; + bool compr = false; + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + values[i] = Operand(v1); + } + + /* Both stencil and sample mask only need 16-bits. */ + if (!ctx->program->info->ps.writes_z && + (ctx->program->info->ps.writes_stencil || + ctx->program->info->ps.writes_sample_mask)) { + compr = true; /* COMPR flag */ + + if (ctx->program->info->ps.writes_stencil) { + /* Stencil should be in X[23:16]. */ + values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]); + values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]); + enabled_channels |= 0x3; + } + + if (ctx->program->info->ps.writes_sample_mask) { + /* SampleMask should be in Y[15:0]. */ + values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]); + enabled_channels |= 0xc; + } + } else { + if (ctx->program->info->ps.writes_z) { + values[0] = Operand(ctx->outputs.outputs[FRAG_RESULT_DEPTH][0]); + enabled_channels |= 0x1; + } + + if (ctx->program->info->ps.writes_stencil) { + values[1] = Operand(ctx->outputs.outputs[FRAG_RESULT_STENCIL][0]); + enabled_channels |= 0x2; + } + + if (ctx->program->info->ps.writes_sample_mask) { + values[2] = Operand(ctx->outputs.outputs[FRAG_RESULT_SAMPLE_MASK][0]); + enabled_channels |= 0x4; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X + * writemask component. + */ + if (ctx->options->chip_class == GFX6 && + ctx->options->family != CHIP_OLAND && + ctx->options->family != CHIP_HAINAN) { + enabled_channels |= 0x1; + } + + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], + enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr); +} + +static void export_fs_mrt_color(isel_context *ctx, int slot) +{ + Builder bld(ctx->program, ctx->block); + unsigned write_mask = ctx->outputs.mask[slot]; + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + if (write_mask & (1 << i)) { + values[i] = Operand(ctx->outputs.outputs[slot][i]); + } else { + values[i] = Operand(v1); + } + } + + unsigned target, col_format; + unsigned enabled_channels = 0; + aco_opcode compr_op = (aco_opcode)0; + + slot -= FRAG_RESULT_DATA0; + target = V_008DFC_SQ_EXP_MRT + slot; + col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf; + + bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; + bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + + switch (col_format) + { + case V_028714_SPI_SHADER_ZERO: + enabled_channels = 0; /* writemask */ + target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + enabled_channels = 1; + break; + + case V_028714_SPI_SHADER_32_GR: + enabled_channels = 0x3; + break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->options->chip_class >= GFX10) { + /* Special case: on GFX10, the outputs are different for 32_AR */ + enabled_channels = 0x3; + values[1] = values[3]; + values[3] = Operand(v1); + } else { + enabled_channels = 0x9; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: { + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_u16_u32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; + Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); + + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), + i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), + values[i]); + } + } + } + break; + } + + case V_028714_SPI_SHADER_SINT16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_i16_i32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; + uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0; + Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); + Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb)); + + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), + values[i]); + values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), + values[i]); + } + } + } + break; + + case V_028714_SPI_SHADER_32_ABGR: + enabled_channels = 0xF; + break; + + default: + break; + } + + if (target == V_008DFC_SQ_EXP_NULL) + return; + + if ((bool) compr_op) { + for (int i = 0; i < 2; i++) { + /* check if at least one of the values to be compressed is enabled */ + unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1; + if (enabled) { + enabled_channels |= enabled << (i*2); + values[i] = bld.vop3(compr_op, bld.def(v1), + values[i*2].isUndefined() ? Operand(0u) : values[i*2], + values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + } else { + values[i] = Operand(v1); + } + } + values[2] = Operand(v1); + values[3] = Operand(v1); + } else { + for (int i = 0; i < 4; i++) + values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); + } + + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], + enabled_channels, target, (bool) compr_op); +} + +static void create_fs_exports(isel_context *ctx) +{ + /* Export depth, stencil and sample mask. */ + if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || + ctx->outputs.mask[FRAG_RESULT_STENCIL] || + ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) { + export_fs_mrt_z(ctx); + } + + /* Export all color render targets. */ + for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) { + if (ctx->outputs.mask[i]) + export_fs_mrt_color(ctx, i); + } +} + +static void emit_stream_output(isel_context *ctx, + Temp const *so_buffers, + Temp const *so_write_offset, + const struct radv_stream_output *output) +{ + unsigned num_comps = util_bitcount(output->component_mask); + unsigned writemask = (1 << num_comps) - 1; + unsigned loc = output->location; + unsigned buf = output->buffer; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + unsigned start = ffs(output->component_mask) - 1; + + Temp out[4]; + bool all_undef = true; + assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs); + for (unsigned i = 0; i < num_comps; i++) { + out[i] = ctx->outputs.outputs[loc][start + i]; + all_undef = all_undef && !out[i].id(); + } + if (all_undef) + return; + + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + if (count == 3 && ctx->options->chip_class == GFX6) { + /* GFX6 doesn't support storing vec3, split it. */ + writemask |= 1u << (start + 2); + count = 2; + } + + unsigned offset = output->offset + start * 4; + + Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)}; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; ++i) + vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + + aco_opcode opcode; + switch (count) { + case 1: + opcode = aco_opcode::buffer_store_dword; + break; + case 2: + opcode = aco_opcode::buffer_store_dwordx2; + break; + case 3: + opcode = aco_opcode::buffer_store_dwordx3; + break; + case 4: + opcode = aco_opcode::buffer_store_dwordx4; + break; + default: + unreachable("Unsupported dword count."); + } + + aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(so_buffers[buf]); + store->operands[1] = Operand(so_write_offset[buf]); + store->operands[2] = Operand((uint32_t) 0); + store->operands[3] = Operand(write_data); + if (offset > 4095) { + /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */ + Builder bld(ctx->program, ctx->block); + store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); + } else { + store->offset = offset; + } + store->offen = true; + store->glc = true; + store->dlc = false; + store->slc = true; + store->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(store)); + } +} + +static void emit_streamout(isel_context *ctx, unsigned stream) +{ + Builder bld(ctx->program, ctx->block); + + Temp so_buffers[4]; + Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); + for (unsigned i = 0; i < 4; i++) { + unsigned stride = ctx->program->info->so.strides[i]; + if (!stride) + continue; + + Operand off = bld.copy(bld.def(s1), Operand(i * 16u)); + so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off); + } + + Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u)); + + Temp tid = emit_mbcnt(ctx, bld.def(v1)); + + Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid); + + if_context ic; + begin_divergent_if_then(ctx, &ic, can_emit); + + bld.reset(ctx->block); + + Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid); + + Temp so_write_offset[4]; + + for (unsigned i = 0; i < 4; i++) { + unsigned stride = ctx->program->info->so.strides[i]; + if (!stride) + continue; + + if (stride == 1) { + Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->streamout_write_idx), + get_arg(ctx, ctx->args->streamout_offset[i])); + Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); + + so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); + } else { + Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); + Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), + get_arg(ctx, ctx->args->streamout_offset[i])); + so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2); + } + } + + for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { + struct radv_stream_output *output = + &ctx->program->info->so.outputs[i]; + if (stream != output->stream) + continue; + + emit_stream_output(ctx, so_buffers, so_write_offset, output); + } + + begin_divergent_if_else(ctx, &ic); + end_divergent_if(ctx, &ic); +} + +} /* end namespace */ + +void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) +{ + /* Split all arguments except for the first (ring_offsets) and the last + * (exec) so that the dead channels don't stay live throughout the program. + */ + for (int i = 1; i < startpgm->definitions.size() - 1; i++) { + if (startpgm->definitions[i].regClass().size() > 1) { + emit_split_vector(ctx, startpgm->definitions[i].getTemp(), + startpgm->definitions[i].regClass().size()); + } + } +} + +void handle_bc_optimize(isel_context *ctx) +{ + /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ + Builder bld(ctx->program, ctx->block); + uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; + bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); + bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); + ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); + if (uses_center && uses_centroid) { + Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), + get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u)); + + if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { + Temp new_coord[2]; + for (unsigned i = 0; i < 2; i++) { + Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); + Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + persp_centroid, persp_center, sel); + } + ctx->persp_centroid = bld.tmp(v2); + bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), + Operand(new_coord[0]), Operand(new_coord[1])); + emit_split_vector(ctx, ctx->persp_centroid, 2); + } + + if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { + Temp new_coord[2]; + for (unsigned i = 0; i < 2; i++) { + Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); + Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + linear_centroid, linear_center, sel); + } + ctx->linear_centroid = bld.tmp(v2); + bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid), + Operand(new_coord[0]), Operand(new_coord[1])); + emit_split_vector(ctx, ctx->linear_centroid, 2); + } + } +} + +void setup_fp_mode(isel_context *ctx, nir_shader *shader) +{ + Program *program = ctx->program; + + unsigned float_controls = shader->info.float_controls_execution_mode; + + program->next_fp_mode.preserve_signed_zero_inf_nan32 = + float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32; + program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = + float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 | + FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64); + + program->next_fp_mode.must_flush_denorms32 = + float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; + program->next_fp_mode.must_flush_denorms16_64 = + float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); + + program->next_fp_mode.care_about_round32 = + float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); + + program->next_fp_mode.care_about_round16_64 = + float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); + + /* default to preserving fp16 and fp64 denorms, since it's free */ + if (program->next_fp_mode.must_flush_denorms16_64) + program->next_fp_mode.denorm16_64 = 0; + else + program->next_fp_mode.denorm16_64 = fp_denorm_keep; + + /* preserving fp32 denorms is expensive, so only do it if asked */ + if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) + program->next_fp_mode.denorm32 = fp_denorm_keep; + else + program->next_fp_mode.denorm32 = 0; + + if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) + program->next_fp_mode.round32 = fp_round_tz; + else + program->next_fp_mode.round32 = fp_round_ne; + + if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + program->next_fp_mode.round16_64 = fp_round_tz; + else + program->next_fp_mode.round16_64 = fp_round_ne; + + ctx->block->fp_mode = program->next_fp_mode; +} + +void cleanup_cfg(Program *program) +{ + /* create linear_succs/logical_succs */ + for (Block& BB : program->blocks) { + for (unsigned idx : BB.linear_preds) + program->blocks[idx].linear_succs.emplace_back(BB.index); + for (unsigned idx : BB.logical_preds) + program->blocks[idx].logical_succs.emplace_back(BB.index); + } +} + +void select_program(Program *program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + struct radv_shader_args *args) +{ + isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); + + for (unsigned i = 0; i < shader_count; i++) { + nir_shader *nir = shaders[i]; + init_context(&ctx, nir); + + setup_fp_mode(&ctx, nir); + + if (!i) { + /* needs to be after init_context() for FS */ + Pseudo_instruction *startpgm = add_startpgm(&ctx); + append_logical_start(ctx.block); + split_arguments(&ctx, startpgm); + } + + if_context ic; + if (shader_count >= 2) { + Builder bld(ctx.program, ctx.block); + Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | (i * 8u))); + Temp thread_id = emit_mbcnt(&ctx, bld.def(v1)); + Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id); + + begin_divergent_if_then(&ctx, &ic, cond); + } + + if (i) { + Builder bld(ctx.program, ctx.block); + assert(ctx.stage == vertex_geometry_gs); + bld.barrier(aco_opcode::p_memory_barrier_shared); + bld.sopp(aco_opcode::s_barrier); + + ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u)); + } else if (ctx.stage == geometry_gs) + ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id); + + if (ctx.stage == fragment_fs) + handle_bc_optimize(&ctx); + + nir_function_impl *func = nir_shader_get_entrypoint(nir); + visit_cf_list(&ctx, &func->body); + + if (ctx.program->info->so.num_outputs && ctx.stage == vertex_vs) + emit_streamout(&ctx, 0); + + if (ctx.stage == vertex_vs) { + create_vs_exports(&ctx); + } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { + Builder bld(ctx.program, ctx.block); + bld.barrier(aco_opcode::p_memory_barrier_gs_data); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); + } + + if (ctx.stage == fragment_fs) + create_fs_exports(&ctx); + + if (shader_count >= 2) { + begin_divergent_if_else(&ctx, &ic); + end_divergent_if(&ctx, &ic); + } + + ralloc_free(ctx.divergent_vals); + } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform | block_kind_export_end; + Builder bld(ctx.program, ctx.block); + if (ctx.program->wb_smem_l1_on_end) + bld.smem(aco_opcode::s_dcache_wb, false); + bld.sopp(aco_opcode::s_endpgm); + + cleanup_cfg(program); +} + +void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, + ac_shader_config* config, + struct radv_shader_args *args) +{ + isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); + + program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; + program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; + program->next_fp_mode.must_flush_denorms32 = false; + program->next_fp_mode.must_flush_denorms16_64 = false; + program->next_fp_mode.care_about_round32 = false; + program->next_fp_mode.care_about_round16_64 = false; + program->next_fp_mode.denorm16_64 = fp_denorm_keep; + program->next_fp_mode.denorm32 = 0; + program->next_fp_mode.round32 = fp_round_ne; + program->next_fp_mode.round16_64 = fp_round_ne; + ctx.block->fp_mode = program->next_fp_mode; + + add_startpgm(&ctx); + append_logical_start(ctx.block); + + Builder bld(ctx.program, ctx.block); + + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u)); + + Operand stream_id(0u); + if (args->shader_info->so.num_outputs) + stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u)); + + Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id)); + + std::stack endif_blocks; + + for (unsigned stream = 0; stream < 4; stream++) { + if (stream_id.isConstant() && stream != stream_id.constantValue()) + continue; + + unsigned num_components = args->shader_info->gs.num_stream_output_components[stream]; + if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs)) + continue; + + memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask)); + + unsigned BB_if_idx = ctx.block->index; + Block BB_endif = Block(); + if (!stream_id.isConstant()) { + /* begin IF */ + Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream)); + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + bld.branch(aco_opcode::p_cbranch_z, cond); + + BB_endif.kind |= ctx.block->kind & block_kind_top_level; + + ctx.block = ctx.program->create_and_insert_block(); + add_edge(BB_if_idx, ctx.block); + bld.reset(ctx.block); + append_logical_start(ctx.block); + } + + unsigned offset = 0; + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { + if (args->shader_info->gs.output_streams[i] != stream) + continue; + + unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i]; + unsigned length = util_last_bit(output_usage_mask); + for (unsigned j = 0; j < length; ++j) { + if (!(output_usage_mask & (1 << j))) + continue; + + unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4; + Temp voffset = vtx_offset; + if (const_offset >= 4096u) { + voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset); + const_offset %= 4096u; + } + + aco_ptr mubuf{create_instruction(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)}; + mubuf->definitions[0] = bld.def(v1); + mubuf->operands[0] = Operand(gsvs_ring); + mubuf->operands[1] = Operand(voffset); + mubuf->operands[2] = Operand(0u); + mubuf->offen = true; + mubuf->offset = const_offset; + mubuf->glc = true; + mubuf->slc = true; + mubuf->dlc = args->options->chip_class >= GFX10; + mubuf->barrier = barrier_none; + mubuf->can_reorder = true; + + ctx.outputs.mask[i] |= 1 << j; + ctx.outputs.outputs[i][j] = mubuf->definitions[0].getTemp(); + + bld.insert(std::move(mubuf)); + + offset++; + } + } + + if (args->shader_info->so.num_outputs) { + emit_streamout(&ctx, stream); + bld.reset(ctx.block); + } + + if (stream == 0) { + create_vs_exports(&ctx); + ctx.block->kind |= block_kind_export_end; + } + + if (!stream_id.isConstant()) { + append_logical_end(ctx.block); + + /* branch from then block to endif block */ + bld.branch(aco_opcode::p_branch); + add_edge(ctx.block->index, &BB_endif); + ctx.block->kind |= block_kind_uniform; + + /* emit else block */ + ctx.block = ctx.program->create_and_insert_block(); + add_edge(BB_if_idx, ctx.block); + bld.reset(ctx.block); + append_logical_start(ctx.block); + + endif_blocks.push(std::move(BB_endif)); + } + } + + while (!endif_blocks.empty()) { + Block BB_endif = std::move(endif_blocks.top()); + endif_blocks.pop(); + + Block *BB_else = ctx.block; + + append_logical_end(BB_else); + /* branch from else block to endif block */ + bld.branch(aco_opcode::p_branch); + add_edge(BB_else->index, &BB_endif); + BB_else->kind |= block_kind_uniform; + + /** emit endif merge block */ + ctx.block = program->insert_block(std::move(BB_endif)); + bld.reset(ctx.block); + append_logical_start(ctx.block); + } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + bld.sopp(aco_opcode::s_endpgm); + + cleanup_cfg(program); +} +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_instruction_selection_setup.cpp mesa-20.0.8/src/amd/compiler/aco_instruction_selection_setup.cpp --- mesa-19.2.8/src/amd/compiler/aco_instruction_selection_setup.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_instruction_selection_setup.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1142 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include +#include +#include "aco_ir.h" +#include "nir.h" +#include "nir_control_flow.h" +#include "vulkan/radv_shader.h" +#include "vulkan/radv_descriptor_set.h" +#include "vulkan/radv_shader_args.h" +#include "sid.h" +#include "ac_exp_param.h" +#include "ac_shader_util.h" + +#include "util/u_math.h" + +#define MAX_INLINE_PUSH_CONSTS 8 + +namespace aco { + +struct output_state { + uint8_t mask[VARYING_SLOT_VAR31 + 1]; + Temp outputs[VARYING_SLOT_VAR31 + 1][4]; +}; + +struct isel_context { + const struct radv_nir_compiler_options *options; + struct radv_shader_args *args; + Program *program; + nir_shader *shader; + uint32_t constant_data_offset; + Block *block; + bool *divergent_vals; + std::unique_ptr allocated; + std::unordered_map> allocated_vec; + Stage stage; /* Stage */ + bool has_gfx10_wave64_bpermute = false; + struct { + bool has_branch; + uint16_t loop_nest_depth = 0; + struct { + unsigned header_idx; + Block* exit; + bool has_divergent_continue = false; + bool has_divergent_branch = false; + } parent_loop; + struct { + bool is_divergent = false; + } parent_if; + bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ + uint16_t exec_potentially_empty_break_depth = UINT16_MAX; + /* Set to false when loop_nest_depth==exec_potentially_empty_break_depth + * and parent_if.is_divergent==false. Called _break but it's also used for + * loop continues. */ + bool exec_potentially_empty_break = false; + std::unique_ptr nir_to_aco; /* NIR block index to ACO block index */ + } cf_info; + + Temp arg_temps[AC_MAX_ARGS]; + + /* FS inputs */ + Temp persp_centroid, linear_centroid; + + /* GS inputs */ + Temp gs_wave_id; + + /* gathered information */ + uint64_t input_masks[MESA_SHADER_COMPUTE]; + uint64_t output_masks[MESA_SHADER_COMPUTE]; + + /* VS output information */ + bool export_clip_dists; + unsigned num_clip_distances; + unsigned num_cull_distances; + + /* VS, FS or GS output information */ + output_state outputs; +}; + +Temp get_arg(isel_context *ctx, struct ac_arg arg) +{ + assert(arg.used); + return ctx->arg_temps[arg.arg_index]; +} + +unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) +{ + switch (interp) { + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (intrin == nir_intrinsic_load_barycentric_pixel || + intrin == nir_intrinsic_load_barycentric_at_sample || + intrin == nir_intrinsic_load_barycentric_at_offset) + return S_0286CC_PERSP_CENTER_ENA(1); + else if (intrin == nir_intrinsic_load_barycentric_centroid) + return S_0286CC_PERSP_CENTROID_ENA(1); + else if (intrin == nir_intrinsic_load_barycentric_sample) + return S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case INTERP_MODE_NOPERSPECTIVE: + if (intrin == nir_intrinsic_load_barycentric_pixel) + return S_0286CC_LINEAR_CENTER_ENA(1); + else if (intrin == nir_intrinsic_load_barycentric_centroid) + return S_0286CC_LINEAR_CENTROID_ENA(1); + else if (intrin == nir_intrinsic_load_barycentric_sample) + return S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + default: + break; + } + return 0; +} + +/* If one side of a divergent IF ends in a branch and the other doesn't, we + * might have to emit the contents of the side without the branch at the merge + * block instead. This is so that we can use any SGPR live-out of the side + * without the branch without creating a linear phi in the invert or merge block. */ +bool +sanitize_if(nir_function_impl *impl, bool *divergent, nir_if *nif) +{ + if (!divergent[nif->condition.ssa->index]) + return false; + + nir_block *then_block = nir_if_last_then_block(nif); + nir_block *else_block = nir_if_last_else_block(nif); + bool then_jump = nir_block_ends_in_jump(then_block) || nir_block_is_unreachable(then_block); + bool else_jump = nir_block_ends_in_jump(else_block) || nir_block_is_unreachable(else_block); + if (then_jump == else_jump) + return false; + + /* If the continue from block is empty then return as there is nothing to + * move. + */ + if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list)) + return false; + + /* Even though this if statement has a jump on one side, we may still have + * phis afterwards. Single-source phis can be produced by loop unrolling + * or dead control-flow passes and are perfectly legal. Run a quick phi + * removal on the block after the if to clean up any such phis. + */ + nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))); + + /* Finally, move the continue from branch after the if-statement. */ + nir_block *last_continue_from_blk = else_jump ? then_block : else_block; + nir_block *first_continue_from_blk = else_jump ? + nir_if_first_then_block(nif) : nir_if_first_else_block(nif); + + nir_cf_list tmp; + nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk), + nir_after_block(last_continue_from_blk)); + nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node)); + + /* nir_cf_extract() invalidates dominance metadata, but it should still be + * correct because of the specific type of transformation we did. Block + * indices are not valid except for block_0's, which is all we care about for + * nir_block_is_unreachable(). */ + impl->valid_metadata = + (nir_metadata)(impl->valid_metadata | nir_metadata_dominance | nir_metadata_block_index); + + return true; +} + +bool +sanitize_cf_list(nir_function_impl *impl, bool *divergent, struct exec_list *cf_list) +{ + bool progress = false; + foreach_list_typed(nir_cf_node, cf_node, node, cf_list) { + switch (cf_node->type) { + case nir_cf_node_block: + break; + case nir_cf_node_if: { + nir_if *nif = nir_cf_node_as_if(cf_node); + progress |= sanitize_cf_list(impl, divergent, &nif->then_list); + progress |= sanitize_cf_list(impl, divergent, &nif->else_list); + progress |= sanitize_if(impl, divergent, nif); + break; + } + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(cf_node); + progress |= sanitize_cf_list(impl, divergent, &loop->body); + break; + } + case nir_cf_node_function: + unreachable("Invalid cf type"); + } + } + + return progress; +} + +void init_context(isel_context *ctx, nir_shader *shader) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + unsigned lane_mask_size = ctx->program->lane_mask.size(); + + ctx->shader = shader; + ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform); + + /* sanitize control flow */ + nir_metadata_require(impl, nir_metadata_dominance); + sanitize_cf_list(impl, ctx->divergent_vals, &impl->body); + nir_metadata_preserve(impl, (nir_metadata)~nir_metadata_block_index); + + /* we'll need this for isel */ + nir_metadata_require(impl, nir_metadata_block_index); + + if (!(ctx->stage & sw_gs_copy) && ctx->options->dump_preoptir) { + fprintf(stderr, "NIR shader before instruction selection:\n"); + nir_print_shader(shader, stderr); + } + + std::unique_ptr allocated{new Temp[impl->ssa_alloc]()}; + + unsigned spi_ps_inputs = 0; + + std::unique_ptr nir_to_aco{new unsigned[impl->num_blocks]()}; + + bool done = false; + while (!done) { + done = true; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + switch(instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu_instr = nir_instr_as_alu(instr); + unsigned size = alu_instr->dest.dest.ssa.num_components; + if (alu_instr->dest.dest.ssa.bit_size == 64) + size *= 2; + RegType type = RegType::sgpr; + switch(alu_instr->op) { + case nir_op_fmul: + case nir_op_fadd: + case nir_op_fsub: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fmax3: + case nir_op_fmin3: + case nir_op_fmed3: + case nir_op_fneg: + case nir_op_fabs: + case nir_op_fsat: + case nir_op_fsign: + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_ffract: + case nir_op_ffloor: + case nir_op_fceil: + case nir_op_ftrunc: + case nir_op_fround_even: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_f2f32: + case nir_op_f2f64: + case nir_op_u2f32: + case nir_op_u2f64: + case nir_op_i2f32: + case nir_op_i2f64: + case nir_op_pack_half_2x16: + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + case nir_op_fquantize2f16: + case nir_op_ldexp: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + case nir_op_cube_face_index: + case nir_op_cube_face_coord: + type = RegType::vgpr; + break; + case nir_op_flt: + case nir_op_fge: + case nir_op_feq: + case nir_op_fne: + case nir_op_ilt: + case nir_op_ige: + case nir_op_ult: + case nir_op_uge: + case nir_op_ieq: + case nir_op_ine: + case nir_op_i2b1: + size = lane_mask_size; + break; + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_b2i32: + case nir_op_b2f32: + case nir_op_f2i32: + case nir_op_f2u32: + type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; + break; + case nir_op_bcsel: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + size = lane_mask_size; + } else { + if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { + type = RegType::vgpr; + } else { + if (allocated[alu_instr->src[1].src.ssa->index].type() == RegType::vgpr || + allocated[alu_instr->src[2].src.ssa->index].type() == RegType::vgpr) { + type = RegType::vgpr; + } + } + if (alu_instr->src[1].src.ssa->num_components == 1 && alu_instr->src[2].src.ssa->num_components == 1) { + assert(allocated[alu_instr->src[1].src.ssa->index].size() == allocated[alu_instr->src[2].src.ssa->index].size()); + size = allocated[alu_instr->src[1].src.ssa->index].size(); + } + } + break; + case nir_op_mov: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + size = lane_mask_size; + } else { + type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; + } + break; + default: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + size = lane_mask_size; + } else { + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + } + break; + } + allocated[alu_instr->dest.dest.ssa.index] = Temp(0, RegClass(type, size)); + break; + } + case nir_instr_type_load_const: { + unsigned size = nir_instr_as_load_const(instr)->def.num_components; + if (nir_instr_as_load_const(instr)->def.bit_size == 64) + size *= 2; + else if (nir_instr_as_load_const(instr)->def.bit_size == 1) + size *= lane_mask_size; + allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) + break; + unsigned size = intrinsic->dest.ssa.num_components; + if (intrinsic->dest.ssa.bit_size == 64) + size *= 2; + RegType type = RegType::sgpr; + switch(intrinsic->intrinsic) { + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_num_work_groups: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_get_buffer_size: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_read_invocation: + case nir_intrinsic_first_invocation: + type = RegType::sgpr; + if (intrinsic->dest.ssa.bit_size == 1) + size = lane_mask_size; + break; + case nir_intrinsic_ballot: + type = RegType::sgpr; + break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_model: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_layer_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_write_invocation_amd: + case nir_intrinsic_mbcnt_amd: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_primitive_id: + type = RegType::vgpr; + break; + case nir_intrinsic_shuffle: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: + case nir_intrinsic_masked_swizzle_amd: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + if (intrinsic->dest.ssa.bit_size == 1) { + size = lane_mask_size; + type = RegType::sgpr; + } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) { + type = RegType::sgpr; + } else { + type = RegType::vgpr; + } + break; + case nir_intrinsic_load_view_index: + type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; + break; + case nir_intrinsic_load_front_face: + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_is_helper_invocation: + type = RegType::sgpr; + size = lane_mask_size; + break; + case nir_intrinsic_reduce: + if (intrinsic->dest.ssa.bit_size == 1) { + size = lane_mask_size; + type = RegType::sgpr; + } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) { + type = RegType::sgpr; + } else { + type = RegType::vgpr; + } + break; + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + case nir_intrinsic_vulkan_resource_index: + type = ctx->divergent_vals[intrinsic->dest.ssa.index] ? RegType::vgpr : RegType::sgpr; + break; + /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */ + case nir_intrinsic_load_shared: + if (ctx->divergent_vals[intrinsic->dest.ssa.index]) + type = RegType::vgpr; + else + type = RegType::sgpr; + break; + default: + for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) { + if (allocated[intrinsic->src[i].ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + break; + } + allocated[intrinsic->dest.ssa.index] = Temp(0, RegClass(type, size)); + + switch(intrinsic->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); + spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); + break; + } + case nir_intrinsic_load_barycentric_model: + spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); + break; + case nir_intrinsic_load_front_face: + spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); + break; + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: { + uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; + + } + break; + } + case nir_intrinsic_load_sample_id: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + break; + case nir_intrinsic_load_sample_mask_in: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); + break; + default: + break; + } + break; + } + case nir_instr_type_tex: { + nir_tex_instr* tex = nir_instr_as_tex(instr); + unsigned size = tex->dest.ssa.num_components; + + if (tex->dest.ssa.bit_size == 64) + size *= 2; + if (tex->op == nir_texop_texture_samples) + assert(!ctx->divergent_vals[tex->dest.ssa.index]); + if (ctx->divergent_vals[tex->dest.ssa.index]) + allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::vgpr, size)); + else + allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_parallel_copy: { + nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) { + allocated[entry->dest.ssa.index] = allocated[entry->src.ssa->index]; + } + break; + } + case nir_instr_type_ssa_undef: { + unsigned size = nir_instr_as_ssa_undef(instr)->def.num_components; + if (nir_instr_as_ssa_undef(instr)->def.bit_size == 64) + size *= 2; + else if (nir_instr_as_ssa_undef(instr)->def.bit_size == 1) + size *= lane_mask_size; + allocated[nir_instr_as_ssa_undef(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_phi: { + nir_phi_instr* phi = nir_instr_as_phi(instr); + RegType type; + unsigned size = phi->dest.ssa.num_components; + + if (phi->dest.ssa.bit_size == 1) { + assert(size == 1 && "multiple components not yet supported on boolean phis."); + type = RegType::sgpr; + size *= lane_mask_size; + allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size)); + break; + } + + if (ctx->divergent_vals[phi->dest.ssa.index]) { + type = RegType::vgpr; + } else { + type = RegType::sgpr; + nir_foreach_phi_src (src, phi) { + if (allocated[src->src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + if (allocated[src->src.ssa->index].type() == RegType::none) + done = false; + } + } + + size *= phi->dest.ssa.bit_size == 64 ? 2 : 1; + RegClass rc = RegClass(type, size); + if (rc != allocated[phi->dest.ssa.index].regClass()) { + done = false; + } else { + nir_foreach_phi_src(src, phi) + assert(allocated[src->src.ssa->index].size() == rc.size()); + } + allocated[phi->dest.ssa.index] = Temp(0, rc); + break; + } + default: + break; + } + } + } + } + + if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) { + /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ + spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); + } + + if (!(spi_ps_inputs & 0x7F)) { + /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ + spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); + } + + ctx->program->config->spi_ps_input_ena = spi_ps_inputs; + ctx->program->config->spi_ps_input_addr = spi_ps_inputs; + + for (unsigned i = 0; i < impl->ssa_alloc; i++) + allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass()); + + ctx->allocated.reset(allocated.release()); + ctx->cf_info.nir_to_aco.reset(nir_to_aco.release()); +} + +Pseudo_instruction *add_startpgm(struct isel_context *ctx) +{ + unsigned arg_count = ctx->args->ac.arg_count; + if (ctx->stage == fragment_fs) { + /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr + * itself and then communicates the results back via the ELF binary. + * Mirror what LLVM does by re-mapping the VGPR arguments here. + * + * TODO: If we made the FS input scanning code into a separate pass that + * could run before argument setup, then this wouldn't be necessary + * anymore. + */ + struct ac_shader_args *args = &ctx->args->ac; + arg_count = 0; + for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) { + if (args->args[i].file != AC_ARG_VGPR) { + arg_count++; + continue; + } + + if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) { + args->args[i].skip = true; + } else { + args->args[i].offset = vgpr_reg; + vgpr_reg += args->args[i].size; + arg_count++; + } + vgpr_arg++; + } + } + + aco_ptr startpgm{create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count + 1)}; + for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { + if (ctx->args->ac.args[i].skip) + continue; + + enum ac_arg_regfile file = ctx->args->ac.args[i].file; + unsigned size = ctx->args->ac.args[i].size; + unsigned reg = ctx->args->ac.args[i].offset; + RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); + Temp dst = Temp{ctx->program->allocateId(), type}; + ctx->arg_temps[i] = dst; + startpgm->definitions[arg] = Definition(dst); + startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); + arg++; + } + startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask}; + Pseudo_instruction *instr = startpgm.get(); + ctx->block->instructions.push_back(std::move(startpgm)); + + /* Stash these in the program so that they can be accessed later when + * handling spilling. + */ + ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); + ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset); + + return instr; +} + +int +type_size(const struct glsl_type *type, bool bindless) +{ + // TODO: don't we need type->std430_base_alignment() here? + return glsl_count_attribute_slots(type, false); +} + +void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; +} + +static bool +mem_vectorize_callback(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high) +{ + if ((bit_size != 32 && bit_size != 64) || num_components > 4) + return false; + + /* >128 bit loads are split except with SMEM */ + if (bit_size * num_components > 128) + return false; + + switch (low->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_load_push_constant: + return align % 4 == 0; + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared); + /* fallthrough */ + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */ + return align % 16 == 0; + else + return align % 4 == 0; + default: + return false; + } + return false; +} + +void +setup_vs_output_info(isel_context *ctx, nir_shader *nir, + bool export_prim_id, bool export_clip_dists, + radv_vs_output_info *outinfo) +{ + memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(outinfo->vs_output_param_offset)); + + outinfo->param_exports = 0; + int pos_written = 0x1; + if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer) + pos_written |= 1 << 1; + + uint64_t mask = ctx->output_masks[nir->info.stage]; + while (mask) { + int idx = u_bit_scan64(&mask); + if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || + idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT || + ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) { + if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED) + outinfo->vs_output_param_offset[idx] = outinfo->param_exports++; + } + } + if (outinfo->writes_layer && + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) { + /* when ctx->options->key.has_multiview_view_index = true, the layer + * variable isn't declared in NIR and it's isel's job to get the layer */ + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++; + } + + if (export_prim_id) { + assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED); + outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++; + } + + ctx->export_clip_dists = export_clip_dists; + ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask); + ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask); + + assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8); + + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + pos_written |= 1 << 2; + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + pos_written |= 1 << 3; + + outinfo->pos_exports = util_bitcount(pos_written); +} + +void +setup_vs_variables(isel_context *ctx, nir_shader *nir) +{ + nir_foreach_variable(variable, &nir->inputs) + { + variable->data.driver_location = variable->data.location * 4; + } + nir_foreach_variable(variable, &nir->outputs) + { + if (ctx->stage == vertex_geometry_gs) + variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; + else if (ctx->stage == vertex_es) + //TODO: make this more compact + variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot)variable->data.location) * 4; + else + variable->data.driver_location = variable->data.location * 4; + } + + if (ctx->stage == vertex_vs) { + radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + setup_vs_output_info(ctx, nir, outinfo->export_prim_id, + ctx->options->key.vs_common_out.export_clip_dists, outinfo); + } else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) { + /* TODO: radv_nir_shader_info_pass() already sets this but it's larger + * than it needs to be in order to set it better, we have to improve + * radv_nir_shader_info_pass() because gfx9_get_gs_info() uses + * esgs_itemsize and has to be done before compilation + */ + /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info; + outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */ + } +} + +void +setup_variables(isel_context *ctx, nir_shader *nir) +{ + switch (nir->info.stage) { + case MESA_SHADER_FRAGMENT: { + nir_foreach_variable(variable, &nir->outputs) + { + int idx = variable->data.location + variable->data.index; + variable->data.driver_location = idx * 4; + } + break; + } + case MESA_SHADER_COMPUTE: { + ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) / + ctx->program->lds_alloc_granule; + break; + } + case MESA_SHADER_VERTEX: { + setup_vs_variables(ctx, nir); + break; + } + case MESA_SHADER_GEOMETRY: { + assert(ctx->stage == vertex_geometry_gs || ctx->stage == geometry_gs); + if (ctx->stage == vertex_geometry_gs) { + nir_foreach_variable(variable, &nir->inputs) { + variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4; + } + } else { + //TODO: make this more compact + nir_foreach_variable(variable, &nir->inputs) { + variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot)variable->data.location) * 4; + } + } + nir_foreach_variable(variable, &nir->outputs) { + variable->data.driver_location = variable->data.location * 4; + } + if (ctx->stage == vertex_geometry_gs) + ctx->program->info->gs.es_type = MESA_SHADER_VERTEX; /* tesselation shaders are not yet supported */ + break; + } + default: + unreachable("Unhandled shader stage."); + } +} + +void +get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const *shaders) +{ + for (unsigned i = 0; i < shader_count; i++) { + nir_shader *nir = shaders[i]; + if (nir->info.stage == MESA_SHADER_COMPUTE) + continue; + + uint64_t output_mask = 0; + nir_foreach_variable(variable, &nir->outputs) { + const glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) + type = type->fields.array; + unsigned slots = type->count_attribute_slots(false); + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + type->length; + slots = (component_count + 3) / 4; + } + output_mask |= ((1ull << slots) - 1) << variable->data.location; + } + + uint64_t input_mask = 0; + nir_foreach_variable(variable, &nir->inputs) { + const glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) + type = type->fields.array; + unsigned slots = type->count_attribute_slots(false); + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + type->length; + slots = (component_count + 3) / 4; + } + input_mask |= ((1ull << slots) - 1) << variable->data.location; + } + + ctx->output_masks[nir->info.stage] |= output_mask; + if (i + 1 < shader_count) + ctx->input_masks[shaders[i + 1]->info.stage] |= output_mask; + + ctx->input_masks[nir->info.stage] |= input_mask; + if (i) + ctx->output_masks[shaders[i - 1]->info.stage] |= input_mask; + } +} + +void +setup_nir(isel_context *ctx, nir_shader *nir) +{ + Program *program = ctx->program; + + /* align and copy constant data */ + while (program->constant_data.size() % 4u) + program->constant_data.push_back(0); + ctx->constant_data_offset = program->constant_data.size(); + program->constant_data.insert(program->constant_data.end(), + (uint8_t*)nir->constant_data, + (uint8_t*)nir->constant_data + nir->constant_data_size); + + /* the variable setup has to be done before lower_io / CSE */ + setup_variables(ctx, nir); + + /* optimize and lower memory operations */ + bool lower_to_scalar = false; + bool lower_pack = false; + if (nir_opt_load_store_vectorize(nir, + (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo | + nir_var_mem_push_const | nir_var_mem_shared), + mem_vectorize_callback)) { + lower_to_scalar = true; + lower_pack = true; + } + if (nir->info.stage != MESA_SHADER_COMPUTE) + nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0); + nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global); + + if (lower_to_scalar) + nir_lower_alu_to_scalar(nir, NULL, NULL); + if (lower_pack) + nir_lower_pack(nir); + + /* lower ALU operations */ + // TODO: implement logic64 in aco, it's more effective for sgprs + nir_lower_int64(nir, nir->options->lower_int64_options); + + nir_opt_idiv_const(nir, 32); + nir_lower_idiv(nir, nir_lower_idiv_precise); + + /* optimize the lowered ALU operations */ + bool more_algebraic = true; + while (more_algebraic) { + more_algebraic = false; + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_dce); + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS(more_algebraic, nir, nir_opt_algebraic); + } + + /* Do late algebraic optimization to turn add(a, neg(b)) back into + * subs, then the mandatory cleanup after algebraic. Note that it may + * produce fnegs, and if so then we need to keep running to squash + * fneg(fneg(a)). + */ + bool more_late_algebraic = true; + while (more_late_algebraic) { + more_late_algebraic = false; + NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late); + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_dce); + NIR_PASS_V(nir, nir_opt_cse); + } + + /* cleanup passes */ + nir_lower_load_const_to_scalar(nir); + nir_opt_shrink_load(nir); + nir_move_options move_opts = (nir_move_options)( + nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | + nir_move_comparisons | nir_move_copies); + nir_opt_sink(nir, move_opts); + nir_opt_move(nir, move_opts); + nir_convert_to_lcssa(nir, true, false); + nir_lower_phis_to_scalar(nir); + + nir_function_impl *func = nir_shader_get_entrypoint(nir); + nir_index_ssa_defs(func); +} + +isel_context +setup_isel_context(Program* program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + struct radv_shader_args *args, + bool is_gs_copy_shader) +{ + program->stage = 0; + for (unsigned i = 0; i < shader_count; i++) { + switch (shaders[i]->info.stage) { + case MESA_SHADER_VERTEX: + program->stage |= sw_vs; + break; + case MESA_SHADER_TESS_CTRL: + program->stage |= sw_tcs; + break; + case MESA_SHADER_TESS_EVAL: + program->stage |= sw_tes; + break; + case MESA_SHADER_GEOMETRY: + program->stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs; + break; + case MESA_SHADER_FRAGMENT: + program->stage |= sw_fs; + break; + case MESA_SHADER_COMPUTE: + program->stage |= sw_cs; + break; + default: + unreachable("Shader stage not implemented"); + } + } + bool gfx9_plus = args->options->chip_class >= GFX9; + bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10; + if (program->stage == sw_vs && args->shader_info->vs.as_es) + program->stage |= hw_es; + else if (program->stage == sw_vs && !args->shader_info->vs.as_ls) + program->stage |= hw_vs; + else if (program->stage == sw_gs) + program->stage |= hw_gs; + else if (program->stage == sw_fs) + program->stage |= hw_fs; + else if (program->stage == sw_cs) + program->stage |= hw_cs; + else if (program->stage == sw_gs_copy) + program->stage |= hw_vs; + else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg) + program->stage |= hw_gs; + else + unreachable("Shader stage not implemented"); + + program->config = config; + program->info = args->shader_info; + program->chip_class = args->options->chip_class; + program->family = args->options->family; + program->wave_size = args->shader_info->wave_size; + program->lane_mask = program->wave_size == 32 ? s1 : s2; + + program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256; + program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768; + program->vgpr_limit = 256; + program->vgpr_alloc_granule = 3; + + if (args->options->chip_class >= GFX10) { + program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ + program->sgpr_alloc_granule = 127; + program->sgpr_limit = 106; + program->vgpr_alloc_granule = program->wave_size == 32 ? 7 : 3; + } else if (program->chip_class >= GFX8) { + program->physical_sgprs = 800; + program->sgpr_alloc_granule = 15; + if (args->options->family == CHIP_TONGA || args->options->family == CHIP_ICELAND) + program->sgpr_limit = 94; /* workaround hardware bug */ + else + program->sgpr_limit = 102; + } else { + program->physical_sgprs = 512; + program->sgpr_alloc_granule = 7; + program->sgpr_limit = 104; + } + + /* TODO: we don't have to allocate VCC if we don't need it */ + program->needs_vcc = true; + + calc_min_waves(program); + program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + + isel_context ctx = {}; + ctx.program = program; + ctx.args = args; + ctx.options = args->options; + ctx.stage = program->stage; + + get_io_masks(&ctx, shader_count, shaders); + + unsigned scratch_size = 0; + if (program->stage == gs_copy_vs) { + assert(shader_count == 1); + setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo); + } else { + for (unsigned i = 0; i < shader_count; i++) { + nir_shader *nir = shaders[i]; + setup_nir(&ctx, nir); + } + + for (unsigned i = 0; i < shader_count; i++) + scratch_size = std::max(scratch_size, shaders[i]->scratch_size); + } + + ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024); + + ctx.block = ctx.program->create_and_insert_block(); + ctx.block->loop_nest_depth = 0; + ctx.block->kind = block_kind_top_level; + + return ctx; +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_interface.cpp mesa-20.0.8/src/amd/compiler/aco_interface.cpp --- mesa-19.2.8/src/amd/compiler/aco_interface.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_interface.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,187 @@ +/* + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "aco_interface.h" +#include "aco_ir.h" +#include "vulkan/radv_shader.h" +#include "vulkan/radv_shader_args.h" +#include "c11/threads.h" +#include "util/debug.h" + +#include +#include + +namespace aco { +uint64_t debug_flags = 0; + +static const struct debug_control aco_debug_options[] = { + {"validateir", DEBUG_VALIDATE}, + {"validatera", DEBUG_VALIDATE_RA}, + {"perfwarn", DEBUG_PERFWARN}, + {NULL, 0} +}; + +static once_flag init_once_flag = ONCE_FLAG_INIT; + +static void init() +{ + debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); + + #ifndef NDEBUG + /* enable some flags by default on debug builds */ + debug_flags |= aco::DEBUG_VALIDATE; + #endif +} +} + +void aco_compile_shader(unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary **binary, + struct radv_shader_args *args) +{ + call_once(&aco::init_once_flag, aco::init); + + ac_shader_config config = {0}; + std::unique_ptr program{new aco::Program}; + + /* Instruction Selection */ + if (args->is_gs_copy_shader) + aco::select_gs_copy_shader(program.get(), shaders[0], &config, args); + else + aco::select_program(program.get(), shader_count, shaders, &config, args); + if (args->options->dump_preoptir) { + std::cerr << "After Instruction Selection:\n"; + aco_print_program(program.get(), stderr); + } + aco::validate(program.get(), stderr); + + /* Boolean phi lowering */ + aco::lower_bool_phis(program.get()); + //std::cerr << "After Boolean Phi Lowering:\n"; + //aco_print_program(program.get(), stderr); + + aco::dominator_tree(program.get()); + + /* Optimization */ + aco::value_numbering(program.get()); + aco::optimize(program.get()); + aco::validate(program.get(), stderr); + + aco::setup_reduce_temp(program.get()); + aco::insert_exec_mask(program.get()); + aco::validate(program.get(), stderr); + + aco::live live_vars = aco::live_var_analysis(program.get(), args->options); + aco::spill(program.get(), live_vars, args->options); + + //std::cerr << "Before Schedule:\n"; + //aco_print_program(program.get(), stderr); + aco::schedule_program(program.get(), live_vars); + + std::string llvm_ir; + if (args->options->record_ir) { + char *data = NULL; + size_t size = 0; + FILE *f = open_memstream(&data, &size); + if (f) { + aco_print_program(program.get(), f); + fputc(0, f); + fclose(f); + } + + llvm_ir = std::string(data, data + size); + free(data); + } + + /* Register Allocation */ + aco::register_allocation(program.get(), live_vars.live_out); + if (args->options->dump_shader) { + std::cerr << "After RA:\n"; + aco_print_program(program.get(), stderr); + } + + if (aco::validate_ra(program.get(), args->options, stderr)) { + std::cerr << "Program after RA validation failure:\n"; + aco_print_program(program.get(), stderr); + abort(); + } + + aco::ssa_elimination(program.get()); + /* Lower to HW Instructions */ + aco::lower_to_hw_instr(program.get()); + //std::cerr << "After Eliminate Pseudo Instr:\n"; + //aco_print_program(program.get(), stderr); + + /* Insert Waitcnt */ + aco::insert_wait_states(program.get()); + aco::insert_NOPs(program.get()); + + //std::cerr << "After Insert-Waitcnt:\n"; + //aco_print_program(program.get(), stderr); + + /* Assembly */ + std::vector code; + unsigned exec_size = aco::emit_program(program.get(), code); + + bool get_disasm = args->options->dump_shader || args->options->record_ir; + + size_t size = llvm_ir.size(); + + std::string disasm; + if (get_disasm) { + std::ostringstream stream; + aco::print_asm(program.get(), code, exec_size / 4u, stream); + stream << '\0'; + disasm = stream.str(); + size += disasm.size(); + } + + size += code.size() * sizeof(uint32_t) + sizeof(radv_shader_binary_legacy); + /* We need to calloc to prevent unintialized data because this will be used + * directly for the disk cache. Uninitialized data can appear because of + * padding in the struct or because legacy_binary->data can be at an offset + * from the start less than sizeof(radv_shader_binary_legacy). */ + radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1); + + legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY; + legacy_binary->base.stage = shaders[shader_count-1]->info.stage; + legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader; + legacy_binary->base.total_size = size; + + memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t)); + legacy_binary->exec_size = exec_size; + legacy_binary->code_size = code.size() * sizeof(uint32_t); + + legacy_binary->config = config; + legacy_binary->disasm_size = 0; + legacy_binary->ir_size = llvm_ir.size(); + + llvm_ir.copy((char*) legacy_binary->data + legacy_binary->code_size, llvm_ir.size()); + + if (get_disasm) { + disasm.copy((char*) legacy_binary->data + legacy_binary->code_size + llvm_ir.size(), disasm.size()); + legacy_binary->disasm_size = disasm.size(); + } + + *binary = (radv_shader_binary*) legacy_binary; +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_interface.h mesa-20.0.8/src/amd/compiler/aco_interface.h --- mesa-19.2.8/src/amd/compiler/aco_interface.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_interface.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,44 @@ +/* + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ACO_INTERFACE_H +#define ACO_INTERFACE_H + +#include "nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ac_shader_config; + +void aco_compile_shader(unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary** binary, + struct radv_shader_args *args); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru mesa-19.2.8/src/amd/compiler/aco_ir.h mesa-20.0.8/src/amd/compiler/aco_ir.h --- mesa-19.2.8/src/amd/compiler/aco_ir.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_ir.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1320 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef ACO_IR_H +#define ACO_IR_H + +#include +#include +#include +#include + +#include "nir.h" +#include "ac_binary.h" +#include "amd_family.h" +#include "aco_opcodes.h" +#include "aco_util.h" + +struct radv_nir_compiler_options; +struct radv_shader_args; +struct radv_shader_info; + +namespace aco { + +extern uint64_t debug_flags; + +enum { + DEBUG_VALIDATE = 0x1, + DEBUG_VALIDATE_RA = 0x2, + DEBUG_PERFWARN = 0x4, +}; + +/** + * Representation of the instruction's microcode encoding format + * Note: Some Vector ALU Formats can be combined, such that: + * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding + * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive. + * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing. + * + * (*) The same is applicable for VOP1 and VOPC instructions. + */ +enum class Format : std::uint16_t { + /* Pseudo Instruction Format */ + PSEUDO = 0, + /* Scalar ALU & Control Formats */ + SOP1 = 1, + SOP2 = 2, + SOPK = 3, + SOPP = 4, + SOPC = 5, + /* Scalar Memory Format */ + SMEM = 6, + /* LDS/GDS Format */ + DS = 8, + /* Vector Memory Buffer Formats */ + MTBUF = 9, + MUBUF = 10, + /* Vector Memory Image Format */ + MIMG = 11, + /* Export Format */ + EXP = 12, + /* Flat Formats */ + FLAT = 13, + GLOBAL = 14, + SCRATCH = 15, + + PSEUDO_BRANCH = 16, + PSEUDO_BARRIER = 17, + PSEUDO_REDUCTION = 18, + + /* Vector ALU Formats */ + VOP1 = 1 << 8, + VOP2 = 1 << 9, + VOPC = 1 << 10, + VOP3 = 1 << 11, + VOP3A = 1 << 11, + VOP3B = 1 << 11, + VOP3P = 1 << 12, + /* Vector Parameter Interpolation Format */ + VINTRP = 1 << 13, + DPP = 1 << 14, + SDWA = 1 << 15, +}; + +enum barrier_interaction : uint8_t { + barrier_none = 0, + barrier_buffer = 0x1, + barrier_image = 0x2, + barrier_atomic = 0x4, + barrier_shared = 0x8, + /* used for geometry shaders to ensure vertex data writes are before the + * GS_DONE s_sendmsg. */ + barrier_gs_data = 0x10, + /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */ + barrier_gs_sendmsg = 0x20, + barrier_count = 6, +}; + +enum fp_round { + fp_round_ne = 0, + fp_round_pi = 1, + fp_round_ni = 2, + fp_round_tz = 3, +}; + +enum fp_denorm { + /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and + * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */ + fp_denorm_flush = 0x0, + fp_denorm_keep = 0x3, +}; + +struct float_mode { + /* matches encoding of the MODE register */ + union { + struct { + fp_round round32:2; + fp_round round16_64:2; + unsigned denorm32:2; + unsigned denorm16_64:2; + }; + uint8_t val = 0; + }; + /* if false, optimizations which may remove infs/nan/-0.0 can be done */ + bool preserve_signed_zero_inf_nan32:1; + bool preserve_signed_zero_inf_nan16_64:1; + /* if false, optimizations which may remove denormal flushing can be done */ + bool must_flush_denorms32:1; + bool must_flush_denorms16_64:1; + bool care_about_round32:1; + bool care_about_round16_64:1; + + /* Returns true if instructions using the mode "other" can safely use the + * current one instead. */ + bool canReplace(float_mode other) const noexcept { + return val == other.val && + (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) && + (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) && + (must_flush_denorms32 || !other.must_flush_denorms32) && + (must_flush_denorms16_64 || !other.must_flush_denorms16_64) && + (care_about_round32 || !other.care_about_round32) && + (care_about_round16_64 || !other.care_about_round16_64); + } +}; + +constexpr Format asVOP3(Format format) { + return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); +}; + +enum class RegType { + none = 0, + sgpr, + vgpr, + linear_vgpr, +}; + +struct RegClass { + + enum RC : uint8_t { + s1 = 1, + s2 = 2, + s3 = 3, + s4 = 4, + s6 = 6, + s8 = 8, + s16 = 16, + v1 = s1 | (1 << 5), + v2 = s2 | (1 << 5), + v3 = s3 | (1 << 5), + v4 = s4 | (1 << 5), + v5 = 5 | (1 << 5), + v6 = 6 | (1 << 5), + v7 = 7 | (1 << 5), + v8 = 8 | (1 << 5), + /* these are used for WWM and spills to vgpr */ + v1_linear = v1 | (1 << 6), + v2_linear = v2 | (1 << 6), + }; + + RegClass() = default; + constexpr RegClass(RC rc) + : rc(rc) {} + constexpr RegClass(RegType type, unsigned size) + : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {} + + constexpr operator RC() const { return rc; } + explicit operator bool() = delete; + + constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } + constexpr unsigned size() const { return (unsigned) rc & 0x1F; } + constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } + constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } + +private: + RC rc; +}; + +/* transitional helper expressions */ +static constexpr RegClass s1{RegClass::s1}; +static constexpr RegClass s2{RegClass::s2}; +static constexpr RegClass s3{RegClass::s3}; +static constexpr RegClass s4{RegClass::s4}; +static constexpr RegClass s8{RegClass::s8}; +static constexpr RegClass s16{RegClass::s16}; +static constexpr RegClass v1{RegClass::v1}; +static constexpr RegClass v2{RegClass::v2}; +static constexpr RegClass v3{RegClass::v3}; +static constexpr RegClass v4{RegClass::v4}; +static constexpr RegClass v5{RegClass::v5}; +static constexpr RegClass v6{RegClass::v6}; +static constexpr RegClass v7{RegClass::v7}; +static constexpr RegClass v8{RegClass::v8}; + +/** + * Temp Class + * Each temporary virtual register has a + * register class (i.e. size and type) + * and SSA id. + */ +struct Temp { + Temp() = default; + constexpr Temp(uint32_t id, RegClass cls) noexcept + : id_(id), reg_class(cls) {} + + constexpr uint32_t id() const noexcept { return id_; } + constexpr RegClass regClass() const noexcept { return reg_class; } + + constexpr unsigned size() const noexcept { return reg_class.size(); } + constexpr RegType type() const noexcept { return reg_class.type(); } + constexpr bool is_linear() const noexcept { return reg_class.is_linear(); } + + constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } + constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } + constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } + +private: + uint32_t id_:24; + RegClass reg_class; +}; + +/** + * PhysReg + * Represents the physical register for each + * Operand and Definition. + */ +struct PhysReg { + constexpr PhysReg() = default; + explicit constexpr PhysReg(unsigned r) : reg(r) {} + constexpr operator unsigned() const { return reg; } + + uint16_t reg = 0; +}; + +/* helper expressions for special registers */ +static constexpr PhysReg m0{124}; +static constexpr PhysReg vcc{106}; +static constexpr PhysReg sgpr_null{125}; /* GFX10+ */ +static constexpr PhysReg exec{126}; +static constexpr PhysReg exec_lo{126}; +static constexpr PhysReg exec_hi{127}; +static constexpr PhysReg scc{253}; + +/** + * Operand Class + * Initially, each Operand refers to either + * a temporary virtual register + * or to a constant value + * Temporary registers get mapped to physical register during RA + * Constant values are inlined into the instruction sequence. + */ +class Operand final +{ +public: + constexpr Operand() + : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), + isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {} + + explicit Operand(Temp r) noexcept + { + data_.temp = r; + if (r.id()) { + isTemp_ = true; + } else { + isUndef_ = true; + setFixed(PhysReg{128}); + } + }; + explicit Operand(uint32_t v, bool is64bit = false) noexcept + { + data_.i = v; + isConstant_ = true; + is64BitConst_ = is64bit; + if (v <= 64) + setFixed(PhysReg{128 + v}); + else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ + setFixed(PhysReg{192 - v}); + else if (v == 0x3f000000) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xbf000000) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3f800000) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xbf800000) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x40000000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xc0000000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x40800000) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xc0800000) /* -4.0 */ + setFixed(PhysReg{247}); + else { /* Literal Constant */ + assert(!is64bit && "attempt to create a 64-bit literal constant"); + setFixed(PhysReg{255}); + } + }; + explicit Operand(uint64_t v) noexcept + { + isConstant_ = true; + is64BitConst_ = true; + if (v <= 64) { + data_.i = (uint32_t) v; + setFixed(PhysReg{128 + (uint32_t) v}); + } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */ + data_.i = (uint32_t) v; + setFixed(PhysReg{192 - (uint32_t) v}); + } else if (v == 0x3FE0000000000000) { /* 0.5 */ + data_.i = 0x3f000000; + setFixed(PhysReg{240}); + } else if (v == 0xBFE0000000000000) { /* -0.5 */ + data_.i = 0xbf000000; + setFixed(PhysReg{241}); + } else if (v == 0x3FF0000000000000) { /* 1.0 */ + data_.i = 0x3f800000; + setFixed(PhysReg{242}); + } else if (v == 0xBFF0000000000000) { /* -1.0 */ + data_.i = 0xbf800000; + setFixed(PhysReg{243}); + } else if (v == 0x4000000000000000) { /* 2.0 */ + data_.i = 0x40000000; + setFixed(PhysReg{244}); + } else if (v == 0xC000000000000000) { /* -2.0 */ + data_.i = 0xc0000000; + setFixed(PhysReg{245}); + } else if (v == 0x4010000000000000) { /* 4.0 */ + data_.i = 0x40800000; + setFixed(PhysReg{246}); + } else if (v == 0xC010000000000000) { /* -4.0 */ + data_.i = 0xc0800000; + setFixed(PhysReg{247}); + } else { /* Literal Constant: we don't know if it is a long or double.*/ + isConstant_ = 0; + assert(false && "attempt to create a 64-bit literal constant"); + } + }; + explicit Operand(RegClass type) noexcept + { + isUndef_ = true; + data_.temp = Temp(0, type); + setFixed(PhysReg{128}); + }; + explicit Operand(PhysReg reg, RegClass type) noexcept + { + data_.temp = Temp(0, type); + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return isTemp_; + } + + constexpr void setTemp(Temp t) noexcept { + assert(!isConstant_); + isTemp_ = true; + data_.temp = t; + } + + constexpr Temp getTemp() const noexcept + { + return data_.temp; + } + + constexpr uint32_t tempId() const noexcept + { + return data_.temp.id(); + } + + constexpr bool hasRegClass() const noexcept + { + return isTemp() || isUndefined(); + } + + constexpr RegClass regClass() const noexcept + { + return data_.temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + if (isConstant()) + return is64BitConst_ ? 2 : 1; + else + return data_.temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = reg != unsigned(-1); + reg_ = reg; + } + + constexpr bool isConstant() const noexcept + { + return isConstant_; + } + + constexpr bool isLiteral() const noexcept + { + return isConstant() && reg_ == 255; + } + + constexpr bool isUndefined() const noexcept + { + return isUndef_; + } + + constexpr uint32_t constantValue() const noexcept + { + return data_.i; + } + + constexpr bool constantEquals(uint32_t cmp) const noexcept + { + return isConstant() && constantValue() == cmp; + } + + constexpr uint64_t constantValue64(bool signext=false) const noexcept + { + if (is64BitConst_) { + if (reg_.reg <= 192) + return reg_.reg - 128; + else if (reg_.reg <= 208) + return 0xFFFFFFFFFFFFFFFF - (reg_.reg - 193); + + switch (reg_.reg) { + case 240: + return 0x3FE0000000000000; + case 241: + return 0xBFE0000000000000; + case 242: + return 0x3FF0000000000000; + case 243: + return 0xBFF0000000000000; + case 244: + return 0x4000000000000000; + case 245: + return 0xC000000000000000; + case 246: + return 0x4010000000000000; + case 247: + return 0xC010000000000000; + } + } + return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + if (!flag) + setFirstKill(false); + } + + constexpr bool isKill() const noexcept + { + return isKill_ || isFirstKill(); + } + + constexpr void setFirstKill(bool flag) noexcept + { + isFirstKill_ = flag; + if (flag) + setKill(flag); + } + + /* When there are multiple operands killing the same temporary, + * isFirstKill() is only returns true for the first one. */ + constexpr bool isFirstKill() const noexcept + { + return isFirstKill_; + } + +private: + union { + uint32_t i; + float f; + Temp temp = Temp(0, s1); + } data_; + PhysReg reg_; + union { + struct { + uint8_t isTemp_:1; + uint8_t isFixed_:1; + uint8_t isConstant_:1; + uint8_t isKill_:1; + uint8_t isUndef_:1; + uint8_t isFirstKill_:1; + uint8_t is64BitConst_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +/** + * Definition Class + * Definitions are the results of Instructions + * and refer to temporary virtual registers + * which are later mapped to physical registers + */ +class Definition final +{ +public: + constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {} + Definition(uint32_t index, RegClass type) noexcept + : temp(index, type) {} + explicit Definition(Temp tmp) noexcept + : temp(tmp) {} + Definition(PhysReg reg, RegClass type) noexcept + : temp(Temp(0, type)) + { + setFixed(reg); + } + Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept + : temp(Temp(tmpId, type)) + { + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return tempId() > 0; + } + + constexpr Temp getTemp() const noexcept + { + return temp; + } + + constexpr uint32_t tempId() const noexcept + { + return temp.id(); + } + + constexpr void setTemp(Temp t) noexcept { + temp = t; + } + + constexpr RegClass regClass() const noexcept + { + return temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + return temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = 1; + reg_ = reg; + } + + constexpr void setHint(PhysReg reg) noexcept + { + hasHint_ = 1; + reg_ = reg; + } + + constexpr bool hasHint() const noexcept + { + return hasHint_; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + } + + constexpr bool isKill() const noexcept + { + return isKill_; + } + +private: + Temp temp = Temp(0, s1); + PhysReg reg_; + union { + struct { + uint8_t isFixed_:1; + uint8_t hasHint_:1; + uint8_t isKill_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +class Block; + +struct Instruction { + aco_opcode opcode; + Format format; + uint32_t pass_flags; + + aco::span operands; + aco::span definitions; + + constexpr bool isVALU() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1 + || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2 + || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC + || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A + || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B + || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P; + } + + constexpr bool isSALU() const noexcept + { + return format == Format::SOP1 || + format == Format::SOP2 || + format == Format::SOPC || + format == Format::SOPK || + format == Format::SOPP; + } + + constexpr bool isVMEM() const noexcept + { + return format == Format::MTBUF || + format == Format::MUBUF || + format == Format::MIMG; + } + + constexpr bool isDPP() const noexcept + { + return (uint16_t) format & (uint16_t) Format::DPP; + } + + constexpr bool isVOP3() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP3A) || + ((uint16_t) format & (uint16_t) Format::VOP3B) || + format == Format::VOP3P; + } + + constexpr bool isSDWA() const noexcept + { + return (uint16_t) format & (uint16_t) Format::SDWA; + } + + constexpr bool isFlatOrGlobal() const noexcept + { + return format == Format::FLAT || format == Format::GLOBAL; + } + + constexpr bool usesModifiers() const noexcept; + + constexpr bool reads_exec() const noexcept + { + for (const Operand& op : operands) { + if (op.isFixed() && op.physReg() == exec) + return true; + } + return false; + } +}; + +struct SOPK_instruction : public Instruction { + uint16_t imm; +}; + +struct SOPP_instruction : public Instruction { + uint32_t imm; + int block; +}; + +struct SOPC_instruction : public Instruction { +}; + +struct SOP1_instruction : public Instruction { +}; + +struct SOP2_instruction : public Instruction { +}; + +/** + * Scalar Memory Format: + * For s_(buffer_)load_dword*: + * Operand(0): SBASE - SGPR-pair which provides base address + * Operand(1): Offset - immediate (un)signed offset or SGPR + * Operand(2) / Definition(0): SDATA - SGPR for read / write result + * Operand(n-1): SOffset - SGPR offset (Vega only) + * + * Having no operands is also valid for instructions such as s_dcache_inv. + * + */ +struct SMEM_instruction : public Instruction { + bool glc : 1; /* VI+: globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool nv : 1; /* VEGA only: Non-volatile */ + bool can_reorder : 1; + bool disable_wqm : 1; + barrier_interaction barrier; +}; + +struct VOP1_instruction : public Instruction { +}; + +struct VOP2_instruction : public Instruction { +}; + +struct VOPC_instruction : public Instruction { +}; + +struct VOP3A_instruction : public Instruction { + bool abs[3]; + bool neg[3]; + uint8_t opsel : 4; + uint8_t omod : 2; + bool clamp : 1; +}; + +/** + * Data Parallel Primitives Format: + * This format can be used for VOP1, VOP2 or VOPC instructions. + * The swizzle applies to the src0 operand. + * + */ +struct DPP_instruction : public Instruction { + bool abs[2]; + bool neg[2]; + uint16_t dpp_ctrl; + uint8_t row_mask : 4; + uint8_t bank_mask : 4; + bool bound_ctrl : 1; +}; + +struct Interp_instruction : public Instruction { + uint8_t attribute; + uint8_t component; +}; + +/** + * Local and Global Data Sharing instructions + * Operand(0): ADDR - VGPR which supplies the address. + * Operand(1): DATA0 - First data VGPR. + * Operand(2): DATA1 - Second data VGPR. + * Operand(n-1): M0 - LDS size. + * Definition(0): VDST - Destination VGPR when results returned to VGPRs. + * + */ +struct DS_instruction : public Instruction { + int16_t offset0; + int8_t offset1; + bool gds; +}; + +/** + * Vector Memory Untyped-buffer Instructions + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MUBUF_instruction : public Instruction { + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool lds : 1; /* Return read-data to LDS instead of VGPRs */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + barrier_interaction barrier; +}; + +/** + * Vector Memory Typed-buffer Instructions + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MTBUF_instruction : public Instruction { + uint16_t offset; /* Unsigned byte offset - 12 bit */ + uint8_t dfmt : 4; /* Data Format of data in memory buffer */ + uint8_t nfmt : 3; /* Numeric format of data in memory */ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + barrier_interaction barrier; +}; + +/** + * Vector Memory Image Instructions + * Operand(0) SRSRC - Scalar GPR that specifies the resource constant. + * Operand(1): SSAMP - Scalar GPR that specifies sampler constant. + * or VDATA - Vector GPR for write data. + * Operand(2): VADDR - Address source. Can carry an offset or an index. + * Definition(0): VDATA - Vector GPR for read result. + * + */ +struct MIMG_instruction : public Instruction { + uint8_t dmask; /* Data VGPR enable mask */ + uint8_t dim : 3; /* NAVI: dimensionality */ + bool unrm : 1; /* Force address to be un-normalized */ + bool dlc : 1; /* NAVI: device level coherent */ + bool glc : 1; /* globally coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool da : 1; /* declare an array */ + bool lwe : 1; /* Force data to be un-normalized */ + bool r128 : 1; /* NAVI: Texture resource size */ + bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ + bool d16 : 1; /* Convert 32-bit data to 16-bit data */ + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + barrier_interaction barrier; +}; + +/** + * Flat/Scratch/Global Instructions + * Operand(0): ADDR + * Operand(1): SADDR + * Operand(2) / Definition(0): DATA/VDST + * + */ +struct FLAT_instruction : public Instruction { + uint16_t offset; /* Vega/Navi only */ + bool slc : 1; /* system level coherent */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool lds : 1; + bool nv : 1; + bool disable_wqm : 1; /* Require an exec mask without helper invocations */ + bool can_reorder : 1; + barrier_interaction barrier; +}; + +struct Export_instruction : public Instruction { + uint8_t enabled_mask; + uint8_t dest; + bool compressed : 1; + bool done : 1; + bool valid_mask : 1; +}; + +struct Pseudo_instruction : public Instruction { + bool tmp_in_scc; + PhysReg scratch_sgpr; /* might not be valid if it's not needed */ +}; + +struct Pseudo_branch_instruction : public Instruction { + /* target[0] is the block index of the branch target. + * For conditional branches, target[1] contains the fall-through alternative. + * A value of 0 means the target has not been initialized (BB0 cannot be a branch target). + */ + uint32_t target[2]; +}; + +struct Pseudo_barrier_instruction : public Instruction { +}; + +enum ReduceOp { + iadd32, iadd64, + imul32, imul64, + fadd32, fadd64, + fmul32, fmul64, + imin32, imin64, + imax32, imax64, + umin32, umin64, + umax32, umax64, + fmin32, fmin64, + fmax32, fmax64, + iand32, iand64, + ior32, ior64, + ixor32, ixor64, + gfx10_wave64_bpermute +}; + +/** + * Subgroup Reduction Instructions, everything except for the data to be + * reduced and the result as inserted by setup_reduce_temp(). + * Operand(0): data to be reduced + * Operand(1): reduce temporary + * Operand(2): vector temporary + * Definition(0): result + * Definition(1): scalar temporary + * Definition(2): scalar identity temporary (not used to store identity on GFX10) + * Definition(3): scc clobber + * Definition(4): vcc clobber + * + */ +struct Pseudo_reduction_instruction : public Instruction { + ReduceOp reduce_op; + unsigned cluster_size; // must be 0 for scans +}; + +struct instr_deleter_functor { + void operator()(void* p) { + free(p); + } +}; + +template +using aco_ptr = std::unique_ptr; + +template +T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions) +{ + std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); + char *data = (char*) calloc(1, size); + T* inst = (T*) data; + + inst->opcode = opcode; + inst->format = format; + + uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands; + inst->operands = aco::span(operands_offset, num_operands); + uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions; + inst->definitions = aco::span(definitions_offset, num_definitions); + + return inst; +} + +constexpr bool Instruction::usesModifiers() const noexcept +{ + if (isDPP() || isSDWA()) + return true; + if (!isVOP3()) + return false; + const VOP3A_instruction *vop3 = static_cast(this); + for (unsigned i = 0; i < operands.size(); i++) { + if (vop3->abs[i] || vop3->neg[i]) + return true; + } + return vop3->opsel || vop3->clamp || vop3->omod; +} + +constexpr bool is_phi(Instruction* instr) +{ + return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; +} + +static inline bool is_phi(aco_ptr& instr) +{ + return is_phi(instr.get()); +} + +barrier_interaction get_barrier_interaction(Instruction* instr); + +bool is_dead(const std::vector& uses, Instruction *instr); + +enum block_kind { + /* uniform indicates that leaving this block, + * all actives lanes stay active */ + block_kind_uniform = 1 << 0, + block_kind_top_level = 1 << 1, + block_kind_loop_preheader = 1 << 2, + block_kind_loop_header = 1 << 3, + block_kind_loop_exit = 1 << 4, + block_kind_continue = 1 << 5, + block_kind_break = 1 << 6, + block_kind_continue_or_break = 1 << 7, + block_kind_discard = 1 << 8, + block_kind_branch = 1 << 9, + block_kind_merge = 1 << 10, + block_kind_invert = 1 << 11, + block_kind_uses_discard_if = 1 << 12, + block_kind_needs_lowering = 1 << 13, + block_kind_uses_demote = 1 << 14, + block_kind_export_end = 1 << 15, +}; + + +struct RegisterDemand { + constexpr RegisterDemand() = default; + constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept + : vgpr{v}, sgpr{s} {} + int16_t vgpr = 0; + int16_t sgpr = 0; + + constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { + return a.vgpr == b.vgpr && a.sgpr == b.sgpr; + } + + constexpr bool exceeds(const RegisterDemand other) const noexcept { + return vgpr > other.vgpr || sgpr > other.sgpr; + } + + constexpr RegisterDemand operator+(const Temp t) const noexcept { + if (t.type() == RegType::sgpr) + return RegisterDemand( vgpr, sgpr + t.size() ); + else + return RegisterDemand( vgpr + t.size(), sgpr ); + } + + constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); + } + + constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); + } + + constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { + vgpr += other.vgpr; + sgpr += other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { + vgpr -= other.vgpr; + sgpr -= other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator+=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr += t.size(); + else + vgpr += t.size(); + return *this; + } + + constexpr RegisterDemand& operator-=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr -= t.size(); + else + vgpr -= t.size(); + return *this; + } + + constexpr void update(const RegisterDemand other) noexcept { + vgpr = std::max(vgpr, other.vgpr); + sgpr = std::max(sgpr, other.sgpr); + } + +}; + +/* CFG */ +struct Block { + float_mode fp_mode; + unsigned index; + unsigned offset = 0; + std::vector> instructions; + std::vector logical_preds; + std::vector linear_preds; + std::vector logical_succs; + std::vector linear_succs; + RegisterDemand register_demand = RegisterDemand(); + uint16_t loop_nest_depth = 0; + uint16_t kind = 0; + int logical_idom = -1; + int linear_idom = -1; + Temp live_out_exec = Temp(); + + /* this information is needed for predecessors to blocks with phis when + * moving out of ssa */ + bool scc_live_out = false; + PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */ + + Block(unsigned idx) : index(idx) {} + Block() : index(0) {} +}; + +using Stage = uint16_t; + +/* software stages */ +static constexpr Stage sw_vs = 1 << 0; +static constexpr Stage sw_gs = 1 << 1; +static constexpr Stage sw_tcs = 1 << 2; +static constexpr Stage sw_tes = 1 << 3; +static constexpr Stage sw_fs = 1 << 4; +static constexpr Stage sw_cs = 1 << 5; +static constexpr Stage sw_gs_copy = 1 << 6; +static constexpr Stage sw_mask = 0x7f; + +/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */ +static constexpr Stage hw_vs = 1 << 7; +static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_gs = 1 << 9; +static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_hs = 1 << 11; +static constexpr Stage hw_fs = 1 << 12; +static constexpr Stage hw_cs = 1 << 13; +static constexpr Stage hw_mask = 0x7f << 7; + +/* possible settings of Program::stage */ +static constexpr Stage vertex_vs = sw_vs | hw_vs; +static constexpr Stage fragment_fs = sw_fs | hw_fs; +static constexpr Stage compute_cs = sw_cs | hw_cs; +static constexpr Stage tess_eval_vs = sw_tes | hw_vs; +static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs; +/* GFX10/NGG */ +static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs; +static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs; +static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs; +static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs; +/* GFX9 (and GFX10 if NGG isn't used) */ +static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs; +static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs; +static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs; +/* pre-GFX9 */ +static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */ +static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */ +static constexpr Stage tess_control_hs = sw_tcs | hw_hs; +static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before geometry */ +static constexpr Stage geometry_gs = sw_gs | hw_gs; + +class Program final { +public: + float_mode next_fp_mode; + std::vector blocks; + RegisterDemand max_reg_demand = RegisterDemand(); + uint16_t num_waves = 0; + uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */ + ac_shader_config* config; + struct radv_shader_info *info; + enum chip_class chip_class; + enum radeon_family family; + unsigned wave_size; + RegClass lane_mask; + Stage stage; /* Stage */ + bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ + bool needs_wqm = false; /* there exists a p_wqm instruction */ + bool wb_smem_l1_on_end = false; + + std::vector constant_data; + Temp private_segment_buffer; + Temp scratch_offset; + + uint16_t min_waves = 0; + uint16_t lds_alloc_granule; + uint32_t lds_limit; /* in bytes */ + uint16_t vgpr_limit; + uint16_t sgpr_limit; + uint16_t physical_sgprs; + uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ + uint16_t vgpr_alloc_granule; /* minus one. must be power of two */ + + bool needs_vcc = false; + bool needs_xnack_mask = false; + bool needs_flat_scr = false; + + uint32_t allocateId() + { + assert(allocationID <= 16777215); + return allocationID++; + } + + uint32_t peekAllocationId() + { + return allocationID; + } + + void setAllocationId(uint32_t id) + { + allocationID = id; + } + + Block* create_and_insert_block() { + blocks.emplace_back(blocks.size()); + blocks.back().fp_mode = next_fp_mode; + return &blocks.back(); + } + + Block* insert_block(Block&& block) { + block.index = blocks.size(); + block.fp_mode = next_fp_mode; + blocks.emplace_back(std::move(block)); + return &blocks.back(); + } + +private: + uint32_t allocationID = 1; +}; + +struct live { + /* live temps out per block */ + std::vector> live_out; + /* register demand (sgpr/vgpr) per instruction per block */ + std::vector> register_demand; +}; + +void select_program(Program *program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + struct radv_shader_args *args); +void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, + ac_shader_config* config, + struct radv_shader_args *args); + +void lower_wqm(Program* program, live& live_vars, + const struct radv_nir_compiler_options *options); +void lower_bool_phis(Program* program); +void calc_min_waves(Program* program); +void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); +live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options); +std::vector dead_code_analysis(Program *program); +void dominator_tree(Program* program); +void insert_exec_mask(Program *program); +void value_numbering(Program* program); +void optimize(Program* program); +void setup_reduce_temp(Program* program); +void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); +void register_allocation(Program *program, std::vector> live_out_per_block); +void ssa_elimination(Program* program); +void lower_to_hw_instr(Program* program); +void schedule_program(Program* program, live& live_vars); +void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); +void insert_wait_states(Program* program); +void insert_NOPs(Program* program); +unsigned emit_program(Program* program, std::vector& code); +void print_asm(Program *program, std::vector& binary, + unsigned exec_size, std::ostream& out); +void validate(Program* program, FILE *output); +bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output); +#ifndef NDEBUG +void perfwarn(bool cond, const char *msg, Instruction *instr=NULL); +#else +#define perfwarn(program, cond, msg, ...) do {} while(0) +#endif + +void aco_print_instr(Instruction *instr, FILE *output); +void aco_print_program(Program *program, FILE *output); + +/* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ +uint16_t get_extra_sgprs(Program *program); + +/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */ +uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); +uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs); + +/* return number of addressable sgprs/vgprs for max_waves */ +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); +uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves); + +typedef struct { + const int16_t opcode_gfx7[static_cast(aco_opcode::num_opcodes)]; + const int16_t opcode_gfx9[static_cast(aco_opcode::num_opcodes)]; + const int16_t opcode_gfx10[static_cast(aco_opcode::num_opcodes)]; + const std::bitset(aco_opcode::num_opcodes)> can_use_input_modifiers; + const std::bitset(aco_opcode::num_opcodes)> can_use_output_modifiers; + const std::bitset(aco_opcode::num_opcodes)> is_atomic; + const char *name[static_cast(aco_opcode::num_opcodes)]; + const aco::Format format[static_cast(aco_opcode::num_opcodes)]; +} Info; + +extern const Info instr_info; + +} + +#endif /* ACO_IR_H */ + diff -Nru mesa-19.2.8/src/amd/compiler/aco_live_var_analysis.cpp mesa-20.0.8/src/amd/compiler/aco_live_var_analysis.cpp --- mesa-19.2.8/src/amd/compiler/aco_live_var_analysis.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_live_var_analysis.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,385 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) + * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl) + * + */ + +#include "aco_ir.h" +#include "util/u_math.h" + +#include +#include + +#include "vulkan/radv_shader.h" + +namespace aco { +namespace { + +void process_live_temps_per_block(Program *program, live& lives, Block* block, + std::set& worklist, std::vector& phi_sgpr_ops) +{ + std::vector& register_demand = lives.register_demand[block->index]; + RegisterDemand new_demand; + + register_demand.resize(block->instructions.size()); + block->register_demand = RegisterDemand(); + + std::set live_sgprs; + std::set live_vgprs; + + /* add the live_out_exec to live */ + bool exec_live = false; + if (block->live_out_exec != Temp()) { + live_sgprs.insert(block->live_out_exec); + new_demand.sgpr += program->lane_mask.size(); + exec_live = true; + } + + /* split the live-outs from this block into the temporary sets */ + std::vector>& live_temps = lives.live_out; + for (const Temp temp : live_temps[block->index]) { + const bool inserted = temp.is_linear() + ? live_sgprs.insert(temp).second + : live_vgprs.insert(temp).second; + if (inserted) { + new_demand += temp; + } + } + new_demand.sgpr -= phi_sgpr_ops[block->index]; + + /* traverse the instructions backwards */ + int idx; + for (idx = block->instructions.size() -1; idx >= 0; idx--) { + Instruction *insn = block->instructions[idx].get(); + if (is_phi(insn)) + break; + + /* substract the 1 or 2 sgprs from exec */ + if (exec_live) + assert(new_demand.sgpr >= (int16_t) program->lane_mask.size()); + register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0)); + + /* KILL */ + for (Definition& definition : insn->definitions) { + if (!definition.isTemp()) { + continue; + } + + const Temp temp = definition.getTemp(); + size_t n = 0; + if (temp.is_linear()) + n = live_sgprs.erase(temp); + else + n = live_vgprs.erase(temp); + + if (n) { + new_demand -= temp; + definition.setKill(false); + } else { + register_demand[idx] += temp; + definition.setKill(true); + } + + if (definition.isFixed() && definition.physReg() == exec) + exec_live = false; + } + + /* GEN */ + if (insn->opcode == aco_opcode::p_logical_end) { + new_demand.sgpr += phi_sgpr_ops[block->index]; + } else { + /* we need to do this in a separate loop because the next one can + * setKill() for several operands at once and we don't want to + * overwrite that in a later iteration */ + for (Operand& op : insn->operands) + op.setKill(false); + + for (unsigned i = 0; i < insn->operands.size(); ++i) + { + Operand& operand = insn->operands[i]; + if (!operand.isTemp()) { + continue; + } + const Temp temp = operand.getTemp(); + const bool inserted = temp.is_linear() + ? live_sgprs.insert(temp).second + : live_vgprs.insert(temp).second; + if (inserted) { + operand.setFirstKill(true); + for (unsigned j = i + 1; j < insn->operands.size(); ++j) { + if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) { + insn->operands[j].setFirstKill(false); + insn->operands[j].setKill(true); + } + } + new_demand += temp; + } + + if (operand.isFixed() && operand.physReg() == exec) + exec_live = true; + } + } + + block->register_demand.update(register_demand[idx]); + } + + /* update block's register demand for a last time */ + if (exec_live) + assert(new_demand.sgpr >= (int16_t) program->lane_mask.size()); + new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0; + block->register_demand.update(new_demand); + + /* handle phi definitions */ + int phi_idx = idx; + while (phi_idx >= 0) { + register_demand[phi_idx] = new_demand; + Instruction *insn = block->instructions[phi_idx].get(); + + assert(is_phi(insn)); + assert(insn->definitions.size() == 1 && insn->definitions[0].isTemp()); + Definition& definition = insn->definitions[0]; + const Temp temp = definition.getTemp(); + size_t n = 0; + + if (temp.is_linear()) + n = live_sgprs.erase(temp); + else + n = live_vgprs.erase(temp); + + if (n) + definition.setKill(false); + else + definition.setKill(true); + + phi_idx--; + } + + /* now, we have the live-in sets and need to merge them into the live-out sets */ + for (unsigned pred_idx : block->logical_preds) { + for (Temp vgpr : live_vgprs) { + auto it = live_temps[pred_idx].insert(vgpr); + if (it.second) + worklist.insert(pred_idx); + } + } + + for (unsigned pred_idx : block->linear_preds) { + for (Temp sgpr : live_sgprs) { + auto it = live_temps[pred_idx].insert(sgpr); + if (it.second) + worklist.insert(pred_idx); + } + } + + /* handle phi operands */ + phi_idx = idx; + while (phi_idx >= 0) { + Instruction *insn = block->instructions[phi_idx].get(); + assert(is_phi(insn)); + /* directly insert into the predecessors live-out set */ + std::vector& preds = insn->opcode == aco_opcode::p_phi + ? block->logical_preds + : block->linear_preds; + for (unsigned i = 0; i < preds.size(); ++i) { + Operand &operand = insn->operands[i]; + if (!operand.isTemp()) { + continue; + } + /* check if we changed an already processed block */ + const bool inserted = live_temps[preds[i]].insert(operand.getTemp()).second; + if (inserted) { + operand.setKill(true); + worklist.insert(preds[i]); + if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr) + phi_sgpr_ops[preds[i]] += operand.size(); + } + } + phi_idx--; + } + + if ((block->logical_preds.empty() && !live_vgprs.empty()) || + (block->linear_preds.empty() && !live_sgprs.empty())) { + aco_print_program(program, stderr); + fprintf(stderr, "These temporaries are never defined or are defined after use:\n"); + for (Temp vgpr : live_vgprs) + fprintf(stderr, "%%%d\n", vgpr.id()); + for (Temp sgpr : live_sgprs) + fprintf(stderr, "%%%d\n", sgpr.id()); + abort(); + } + + assert(block->index != 0 || new_demand == RegisterDemand()); +} + +unsigned calc_waves_per_workgroup(Program *program) +{ + unsigned workgroup_size = program->wave_size; + if (program->stage == compute_cs) { + unsigned* bsize = program->info->cs.block_size; + workgroup_size = bsize[0] * bsize[1] * bsize[2]; + } + return align(workgroup_size, program->wave_size) / program->wave_size; +} +} /* end namespace */ + +uint16_t get_extra_sgprs(Program *program) +{ + if (program->chip_class >= GFX10) { + assert(!program->needs_flat_scr); + assert(!program->needs_xnack_mask); + return 2; + } else if (program->chip_class >= GFX8) { + if (program->needs_flat_scr) + return 6; + else if (program->needs_xnack_mask) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } else { + assert(!program->needs_xnack_mask); + if (program->needs_flat_scr) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } +} + +uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs) +{ + assert(addressable_sgprs <= program->sgpr_limit); + uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); + uint16_t granule = program->sgpr_alloc_granule + 1; + return align(std::max(sgprs, granule), granule); +} + +uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs) +{ + assert(addressable_vgprs <= program->vgpr_limit); + uint16_t granule = program->vgpr_alloc_granule + 1; + return align(std::max(addressable_vgprs, granule), granule); +} + +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) +{ + uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule; + sgprs -= get_extra_sgprs(program); + return std::min(sgprs, program->sgpr_limit); +} + +uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves) +{ + uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule; + return std::min(vgprs, program->vgpr_limit); +} + +void calc_min_waves(Program* program) +{ + unsigned waves_per_workgroup = calc_waves_per_workgroup(program); + /* currently min_waves is in wave64 waves */ + if (program->wave_size == 32) + waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2); + + unsigned simd_per_cu = 4; /* TODO: different on Navi */ + bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */ + unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; + + program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); +} + +void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) +{ + /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */ + unsigned max_waves_per_simd = 10; + unsigned simd_per_cu = 4; + + bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */ + unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; + unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit; + + /* this won't compile, register pressure reduction necessary */ + if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) { + program->num_waves = 0; + program->max_reg_demand = new_demand; + } else { + program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); + program->num_waves = std::min(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr)); + program->max_waves = max_waves_per_simd; + + /* adjust max_waves for workgroup and LDS limits */ + unsigned waves_per_workgroup = calc_waves_per_workgroup(program); + unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup; + if (program->config->lds_size) { + unsigned lds = program->config->lds_size * program->lds_alloc_granule; + workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds); + } + if (waves_per_workgroup > 1 && program->chip_class < GFX10) + workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ + + /* in cases like waves_per_workgroup=3 or lds=65536 and + * waves_per_workgroup=1, we want the maximum possible number of waves per + * SIMD and not the minimum. so DIV_ROUND_UP is used */ + program->max_waves = std::min(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); + + /* incorporate max_waves and calculate max_reg_demand */ + program->num_waves = std::min(program->num_waves, program->max_waves); + program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); + program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); + } +} + +live live_var_analysis(Program* program, + const struct radv_nir_compiler_options *options) +{ + live result; + result.live_out.resize(program->blocks.size()); + result.register_demand.resize(program->blocks.size()); + std::set worklist; + std::vector phi_sgpr_ops(program->blocks.size()); + RegisterDemand new_demand; + + /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */ + for (Block& block : program->blocks) + worklist.insert(block.index); + while (!worklist.empty()) { + std::set::reverse_iterator b_it = worklist.rbegin(); + unsigned block_idx = *b_it; + worklist.erase(block_idx); + process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops); + new_demand.update(program->blocks[block_idx].register_demand); + } + + /* calculate the program's register demand and number of waves */ + update_vgpr_sgpr_demand(program, new_demand); + + return result; +} + +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_lower_bool_phis.cpp mesa-20.0.8/src/amd/compiler/aco_lower_bool_phis.cpp --- mesa-19.2.8/src/amd/compiler/aco_lower_bool_phis.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_lower_bool_phis.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,207 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Rhys Perry (pendingchaos02@gmail.com) + * + */ + +#include + +#include "aco_ir.h" +#include "aco_builder.h" +#include + + +namespace aco { + +struct phi_use { + Block *block; + unsigned phi_def; + + bool operator<(const phi_use& other) const { + return std::make_tuple(block, phi_def) < + std::make_tuple(other.block, other.phi_def); + } +}; + +struct ssa_state { + std::map latest; + std::map> phis; +}; + +Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state) +{ + while (true) { + auto pos = state->latest.find(block_idx); + if (pos != state->latest.end()) + return Operand(Temp(pos->second, program->lane_mask)); + + Block& block = program->blocks[block_idx]; + size_t pred = block.linear_preds.size(); + if (pred == 0) { + return Operand(program->lane_mask); + } else if (pred == 1) { + block_idx = block.linear_preds[0]; + continue; + } else { + unsigned res = program->allocateId(); + state->latest[block_idx] = res; + + aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; + for (unsigned i = 0; i < pred; i++) { + phi->operands[i] = get_ssa(program, block.linear_preds[i], state); + if (phi->operands[i].isTemp()) { + assert(i < 64); + state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i; + } + } + phi->definitions[0] = Definition(Temp{res, program->lane_mask}); + block.instructions.emplace(block.instructions.begin(), std::move(phi)); + + return Operand(Temp(res, program->lane_mask)); + } + } +} + +void update_phi(Program *program, ssa_state *state, Block *block, unsigned phi_def, uint64_t operand_mask) { + for (auto& phi : block->instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + if (phi->opcode != aco_opcode::p_linear_phi) + continue; + if (phi->definitions[0].tempId() != phi_def) + continue; + assert(ffsll(operand_mask) <= phi->operands.size()); + + uint64_t operands = operand_mask; + while (operands) { + unsigned operand = u_bit_scan64(&operands); + Operand new_operand = get_ssa(program, block->linear_preds[operand], state); + phi->operands[operand] = new_operand; + if (!new_operand.isUndefined()) + state->phis[new_operand.tempId()][(phi_use){block, phi_def}] |= (uint64_t)1 << operand; + } + return; + } + assert(false); +} + +Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previous) { + unsigned id = program->allocateId(); + state->latest[block->index] = id; + + /* update phis */ + if (previous) { + std::map phis; + phis.swap(state->phis[previous]); + for (auto& phi : phis) + update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second); + } + + return {id, program->lane_mask}; +} + +void insert_before_logical_end(Block *block, aco_ptr instr) +{ + auto IsLogicalEnd = [] (const aco_ptr& instr) -> bool { + return instr->opcode == aco_opcode::p_logical_end; + }; + auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd); + + if (it == block->instructions.crend()) { + assert(block->instructions.back()->format == Format::PSEUDO_BRANCH); + block->instructions.insert(std::prev(block->instructions.end()), std::move(instr)); + } + else + block->instructions.insert(std::prev(it.base()), std::move(instr)); +} + +void lower_divergent_bool_phi(Program *program, Block *block, aco_ptr& phi) +{ + Builder bld(program); + + ssa_state state; + state.latest[block->index] = phi->definitions[0].tempId(); + for (unsigned i = 0; i < phi->operands.size(); i++) { + Block *pred = &program->blocks[block->logical_preds[i]]; + + if (phi->operands[i].isUndefined()) + continue; + + assert(phi->operands[i].isTemp()); + Temp phi_src = phi->operands[i].getTemp(); + assert(phi_src.regClass() == bld.lm); + + Operand cur = get_ssa(program, pred->index, &state); + assert(cur.regClass() == bld.lm); + Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0); + assert(new_cur.regClass() == bld.lm); + + if (cur.isUndefined()) { + insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr()); + } else { + Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); + insert_before_logical_end(pred, + bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), + cur, Operand(exec, bld.lm)).get_ptr()); + insert_before_logical_end(pred, + bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), + phi_src, Operand(exec, bld.lm)).get_ptr()); + insert_before_logical_end(pred, + bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc), + tmp1, tmp2).get_ptr()); + } + } + + unsigned num_preds = block->linear_preds.size(); + if (phi->operands.size() != num_preds) { + Pseudo_instruction* new_phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)}; + new_phi->definitions[0] = phi->definitions[0]; + phi.reset(new_phi); + } else { + phi->opcode = aco_opcode::p_linear_phi; + } + assert(phi->operands.size() == num_preds); + + for (unsigned i = 0; i < num_preds; i++) + phi->operands[i] = get_ssa(program, block->linear_preds[i], &state); + + return; +} + +void lower_bool_phis(Program* program) +{ + for (Block& block : program->blocks) { + for (aco_ptr& phi : block.instructions) { + if (phi->opcode == aco_opcode::p_phi) { + assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2); + if (phi->definitions[0].regClass() == program->lane_mask) + lower_divergent_bool_phi(program, &block, phi); + } else if (!is_phi(phi)) { + break; + } + } + } +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_lower_to_cssa.cpp mesa-20.0.8/src/amd/compiler/aco_lower_to_cssa.cpp --- mesa-19.2.8/src/amd/compiler/aco_lower_to_cssa.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_lower_to_cssa.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,212 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include +#include "aco_ir.h" +#include "aco_builder.h" + +/* + * Implements an algorithm to lower to Concentional SSA Form (CSSA). + * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency" + * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon, + * + * By lowering the IR to CSSA, the insertion of parallelcopies is separated from + * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling. + * The algorithm tries to find beneficial insertion points by checking if a basic block + * is empty and if the variable already has a new definition in a dominating block. + */ + + +namespace aco { +namespace { + +typedef std::map>> phi_info; + +struct cssa_ctx { + Program* program; + live& live_vars; + phi_info logical_phi_info; + phi_info linear_phi_info; + + cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {} +}; + +bool collect_phi_info(cssa_ctx& ctx) +{ + bool progress = false; + for (Block& block : ctx.program->blocks) { + for (aco_ptr& phi : block.instructions) { + bool is_logical; + if (phi->opcode == aco_opcode::p_phi) + is_logical = true; + else if (phi->opcode == aco_opcode::p_linear_phi) + is_logical = false; + else + break; + + /* no CSSA for the exec mask as we don't spill it anyway */ + if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec) + continue; + std::vector& preds = is_logical ? block.logical_preds : block.linear_preds; + + /* collect definition's block per Operand */ + std::vector def_points(phi->operands.size()); + for (unsigned i = 0; i < phi->operands.size(); i++) { + Operand& op = phi->operands[i]; + if (op.isUndefined()) { + def_points[i] = preds[i]; + } else if (op.isConstant()) { + /* in theory, we could insert the definition there... */ + def_points[i] = 0; + } else { + assert(op.isTemp()); + unsigned pred = preds[i]; + do { + def_points[i] = pred; + pred = is_logical ? + ctx.program->blocks[pred].logical_idom : + ctx.program->blocks[pred].linear_idom; + } while (def_points[i] != pred && + ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end()); + } + } + + /* check live-range intersections */ + for (unsigned i = 0; i < phi->operands.size(); i++) { + Operand op = phi->operands[i]; + if (op.isUndefined()) + continue; + /* check if the operand comes from the exec mask of a predecessor */ + if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec) + op.setFixed(exec); + + bool interferes = false; + unsigned idom = is_logical ? + ctx.program->blocks[def_points[i]].logical_idom : + ctx.program->blocks[def_points[i]].linear_idom; + /* live-through operands definitely interfere */ + if (op.isTemp() && !op.isKill()) { + interferes = true; + /* create copies for constants to ease spilling */ + } else if (op.isConstant()) { + interferes = true; + /* create copies for SGPR -> VGPR moves */ + } else if (op.regClass() != phi->definitions[0].regClass()) { + interferes = true; + /* operand might interfere with any phi-def*/ + } else if (def_points[i] == block.index) { + interferes = true; + /* operand might interfere with phi-def */ + } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) { + interferes = true; + /* else check for interferences with other operands */ + } else { + for (unsigned j = 0; !interferes && j < phi->operands.size(); j++) { + /* don't care about other register classes */ + if (!phi->operands[j].isTemp() || phi->operands[j].regClass() != phi->definitions[0].regClass()) + continue; + /* same operands cannot interfere */ + if (op.getTemp() == phi->operands[j].getTemp()) + continue; + /* if def_points[i] dominates any other def_point, assume they interfere. + * As live-through operands are checked above, only test up the current block. */ + unsigned other_def_point = def_points[j]; + while (def_points[i] < other_def_point && other_def_point != block.index) + other_def_point = is_logical ? + ctx.program->blocks[other_def_point].logical_idom : + ctx.program->blocks[other_def_point].linear_idom; + interferes = def_points[i] == other_def_point; + } + } + + if (!interferes) + continue; + + progress = true; + + /* create new temporary and rename operands */ + Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()}; + if (is_logical) + ctx.logical_phi_info[preds[i]].emplace_back(Definition(new_tmp), op); + else + ctx.linear_phi_info[preds[i]].emplace_back(Definition(new_tmp), op); + phi->operands[i] = Operand(new_tmp); + phi->operands[i].setKill(true); + def_points[i] = preds[i]; + } + } + } + return progress; +} + +void insert_parallelcopies(cssa_ctx& ctx) +{ + /* insert the parallelcopies from logical phis before p_logical_end */ + for (auto&& entry : ctx.logical_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + unsigned idx = block.instructions.size() - 1; + while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) { + assert(idx > 0); + idx--; + } + + Builder bld(ctx.program); + bld.reset(&block.instructions, std::next(block.instructions.begin(), idx)); + for (std::pair& pair : entry.second) + bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second); + } + + /* insert parallelcopies for the linear phis at the end of blocks just before the branch */ + for (auto&& entry : ctx.linear_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + std::vector>::iterator it = block.instructions.end(); + --it; + assert((*it)->format == Format::PSEUDO_BRANCH); + + Builder bld(ctx.program); + bld.reset(&block.instructions, it); + for (std::pair& pair : entry.second) + bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second); + } +} + +} /* end namespace */ + + +void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options) +{ + cssa_ctx ctx = {program, live_vars}; + /* collect information about all interfering phi operands */ + bool progress = collect_phi_info(ctx); + + if (!progress) + return; + + insert_parallelcopies(ctx); + + /* update live variable information */ + live_vars = live_var_analysis(program, options); +} +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_lower_to_hw_instr.cpp mesa-20.0.8/src/amd/compiler/aco_lower_to_hw_instr.cpp --- mesa-19.2.8/src/amd/compiler/aco_lower_to_hw_instr.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_lower_to_hw_instr.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1179 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) + * + */ + +#include + +#include "aco_ir.h" +#include "aco_builder.h" +#include "util/u_math.h" +#include "sid.h" +#include "vulkan/radv_shader.h" + + +namespace aco { + +struct lower_context { + Program *program; + std::vector> instructions; +}; + +aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { + switch (op) { + case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32; + case imul32: return aco_opcode::v_mul_lo_u32; + case fadd32: return aco_opcode::v_add_f32; + case fmul32: return aco_opcode::v_mul_f32; + case imax32: return aco_opcode::v_max_i32; + case imin32: return aco_opcode::v_min_i32; + case umin32: return aco_opcode::v_min_u32; + case umax32: return aco_opcode::v_max_u32; + case fmin32: return aco_opcode::v_min_f32; + case fmax32: return aco_opcode::v_max_f32; + case iand32: return aco_opcode::v_and_b32; + case ixor32: return aco_opcode::v_xor_b32; + case ior32: return aco_opcode::v_or_b32; + case iadd64: return aco_opcode::num_opcodes; + case imul64: return aco_opcode::num_opcodes; + case fadd64: return aco_opcode::v_add_f64; + case fmul64: return aco_opcode::v_mul_f64; + case imin64: return aco_opcode::num_opcodes; + case imax64: return aco_opcode::num_opcodes; + case umin64: return aco_opcode::num_opcodes; + case umax64: return aco_opcode::num_opcodes; + case fmin64: return aco_opcode::v_min_f64; + case fmax64: return aco_opcode::v_max_f64; + case iand64: return aco_opcode::num_opcodes; + case ior64: return aco_opcode::num_opcodes; + case ixor64: return aco_opcode::num_opcodes; + default: return aco_opcode::num_opcodes; + } +} + +void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) +{ + Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); + if (instr->definitions.size() >= 2) { + assert(instr->definitions[1].regClass() == bld.lm); + instr->definitions[1].setFixed(vcc); + } +} + +void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, + PhysReg vtmp_reg, ReduceOp op, + unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, + Operand *identity=NULL) +{ + Builder bld(ctx->program, &ctx->instructions); + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; + Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)}; + Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Operand src1_64 = Operand(src1_reg, v2); + Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)}; + Operand vtmp_op64 = Operand(vtmp_reg, v2); + if (op == iadd64) { + if (ctx->program->chip_class >= GFX10) { + if (identity) + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]); + } else { + bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } + bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm), + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } else if (op == iand64) { + bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } else if (op == ior64) { + bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } else if (op == ixor64) { + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { + aco_opcode cmp = aco_opcode::num_opcodes; + switch (op) { + case umin64: + cmp = aco_opcode::v_cmp_gt_u64; + break; + case umax64: + cmp = aco_opcode::v_cmp_lt_u64; + break; + case imin64: + cmp = aco_opcode::v_cmp_gt_i64; + break; + case imax64: + cmp = aco_opcode::v_cmp_lt_i64; + break; + default: + break; + } + + if (identity) { + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]); + } + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + + bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64); + bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm)); + } else if (op == imul64) { + /* t4 = dpp(x_hi) + * t1 = umul_lo(t4, y_lo) + * t3 = dpp(x_lo) + * t0 = umul_lo(t3, y_hi) + * t2 = iadd(t0, t1) + * t5 = umul_hi(t3, y_lo) + * res_hi = iadd(t2, t5) + * res_lo = umul_lo(t3, y_lo) + * Requires that res_hi != src0[0] and res_hi != src1[0] + * and that vtmp[0] != res_hi. + */ + if (identity) + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]); + if (identity) + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]); + emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]); + if (identity) + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]); + emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]); + if (identity) + bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]); + } +} + +void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op) +{ + Builder bld(ctx->program, &ctx->instructions); + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; + RegClass src0_rc = src0_reg.reg >= 256 ? v1 : s1; + Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Operand src0_64 = Operand(src0_reg, src0_reg.reg >= 256 ? v2 : s2); + Operand src1_64 = Operand(src1_reg, v2); + + if (src0_rc == s1 && + (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) { + assert(vtmp.reg != 0); + bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); + src0_reg = vtmp; + src0[0] = Operand(vtmp, v1); + src0[1] = Operand(PhysReg{vtmp+1}, v1); + src0_64 = Operand(vtmp, v2); + } else if (src0_rc == s1 && op == iadd64) { + assert(vtmp.reg != 0); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); + src0[1] = Operand(PhysReg{vtmp+1}, v1); + } + + if (op == iadd64) { + if (ctx->program->chip_class >= GFX10) { + bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); + } else { + bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); + } + bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm)); + } else if (op == iand64) { + bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]); + bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]); + } else if (op == ior64) { + bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]); + bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]); + } else if (op == ixor64) { + bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]); + bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]); + } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { + aco_opcode cmp = aco_opcode::num_opcodes; + switch (op) { + case umin64: + cmp = aco_opcode::v_cmp_gt_u64; + break; + case umax64: + cmp = aco_opcode::v_cmp_lt_u64; + break; + case imin64: + cmp = aco_opcode::v_cmp_gt_i64; + break; + case imax64: + cmp = aco_opcode::v_cmp_lt_i64; + break; + default: + break; + } + + bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64); + bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm)); + } else if (op == imul64) { + if (src1_reg == dst_reg) { + /* it's fine if src0==dst but not if src1==dst */ + std::swap(src0_reg, src1_reg); + std::swap(src0[0], src1[0]); + std::swap(src0[1], src1[1]); + std::swap(src0_64, src1_64); + } + assert(!(src0_reg == src1_reg)); + /* t1 = umul_lo(x_hi, y_lo) + * t0 = umul_lo(x_lo, y_hi) + * t2 = iadd(t0, t1) + * t5 = umul_hi(x_lo, y_lo) + * res_hi = iadd(t2, t5) + * res_lo = umul_lo(x_lo, y_lo) + * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp + */ + Definition tmp0_def(PhysReg{src0_reg+1}, v1); + Definition tmp1_def(PhysReg{src1_reg+1}, v1); + Operand tmp0_op = src0[1]; + Operand tmp1_op = src1[1]; + bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]); + bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]); + emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op); + bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]); + emit_vadd32(bld, dst[1], tmp0_op, tmp1_op); + bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]); + } +} + +void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, + PhysReg vtmp, ReduceOp op, unsigned size, + unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, + Operand *identity=NULL) /* for VOP3 with sparse writes */ +{ + Builder bld(ctx->program, &ctx->instructions); + RegClass rc = RegClass(RegType::vgpr, size); + Definition dst(dst_reg, rc); + Operand src0(src0_reg, rc); + Operand src1(src1_reg, rc); + + aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op); + bool vop3 = op == imul32 || size == 2; + + if (!vop3) { + if (opcode == aco_opcode::v_add_co_u32) + bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + else + bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + return; + } + + if (opcode == aco_opcode::num_opcodes) { + emit_int64_dpp_op(ctx, dst_reg ,src0_reg, src1_reg, vtmp, op, + dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity); + return; + } + + if (identity) + bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]); + if (identity && size >= 2) + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]); + + for (unsigned i = 0; i < size; i++) + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1), + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + + bld.vop3(opcode, dst, Operand(vtmp, rc), src1); +} + +void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, + PhysReg vtmp, ReduceOp op, unsigned size) +{ + Builder bld(ctx->program, &ctx->instructions); + RegClass rc = RegClass(RegType::vgpr, size); + Definition dst(dst_reg, rc); + Operand src0(src0_reg, RegClass(src0_reg.reg >= 256 ? RegType::vgpr : RegType::sgpr, size)); + Operand src1(src1_reg, rc); + + aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op); + bool vop3 = op == imul32 || size == 2; + + if (opcode == aco_opcode::num_opcodes) { + emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op); + return; + } + + if (vop3) { + bld.vop3(opcode, dst, src0, src1); + } else if (opcode == aco_opcode::v_add_co_u32) { + bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1); + } else { + bld.vop2(opcode, dst, src0, src1); + } +} + +void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size, + unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl) +{ + Builder bld(ctx->program, &ctx->instructions); + for (unsigned i = 0; i < size; i++) { + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1), + dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } +} + +uint32_t get_reduction_identity(ReduceOp op, unsigned idx) +{ + switch (op) { + case iadd32: + case iadd64: + case fadd32: + case fadd64: + case ior32: + case ior64: + case ixor32: + case ixor64: + case umax32: + case umax64: + return 0; + case imul32: + case imul64: + return idx ? 0 : 1; + case fmul32: + return 0x3f800000u; /* 1.0 */ + case fmul64: + return idx ? 0x3ff00000u : 0u; /* 1.0 */ + case imin32: + return INT32_MAX; + case imin64: + return idx ? 0x7fffffffu : 0xffffffffu; + case imax32: + return INT32_MIN; + case imax64: + return idx ? 0x80000000u : 0; + case umin32: + case umin64: + case iand32: + case iand64: + return 0xffffffffu; + case fmin32: + return 0x7f800000u; /* infinity */ + case fmin64: + return idx ? 0x7ff00000u : 0u; /* infinity */ + case fmax32: + return 0xff800000u; /* negative infinity */ + case fmax64: + return idx ? 0xfff00000u : 0u; /* negative infinity */ + default: + unreachable("Invalid reduction operation"); + break; + } + return 0; +} + +void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern) +{ + for (unsigned i = 0; i < size; i++) { + bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1), + Operand(PhysReg{src+i}, v1), ds_pattern); + } +} + +void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp, + PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) +{ + assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce); + assert(cluster_size <= ctx->program->wave_size); + + Builder bld(ctx->program, &ctx->instructions); + + Operand identity[2]; + identity[0] = Operand(get_reduction_identity(reduce_op, 0)); + identity[1] = Operand(get_reduction_identity(reduce_op, 1)); + Operand vcndmask_identity[2] = {identity[0], identity[1]}; + + /* First, copy the source to tmp and set inactive lanes to the identity */ + bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); + + for (unsigned i = 0; i < src.size(); i++) { + /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 + * except on GFX10, where v_writelane_b32 can take a literal. */ + if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) { + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]); + identity[i] = Operand(PhysReg{sitmp+i}, s1); + + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + } else if (identity[i].isLiteral()) { + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + } + } + + for (unsigned i = 0; i < src.size(); i++) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1), + vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1), + Operand(stmp, bld.lm)); + } + + bool reduction_needs_last_op = false; + switch (op) { + case aco_opcode::p_reduce: + if (cluster_size == 1) break; + + if (ctx->program->chip_class <= GFX7) { + reduction_needs_last_op = true; + emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2)); + if (cluster_size == 2) break; + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1)); + if (cluster_size == 4) break; + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04)); + if (cluster_size == 8) break; + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08)); + if (cluster_size == 16) break; + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10)); + if (cluster_size == 32) break; + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + for (unsigned i = 0; i < src.size(); i++) + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u)); + // TODO: it would be more effective to do the last reduction step on SALU + emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); + reduction_needs_last_op = false; + break; + } + + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false); + if (cluster_size == 2) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false); + if (cluster_size == 4) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false); + if (cluster_size == 8) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false); + if (cluster_size == 16) break; + + if (ctx->program->chip_class >= GFX10) { + /* GFX10+ doesn't support row_bcast15 and row_bcast31 */ + for (unsigned i = 0; i < src.size(); i++) + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u)); + + if (cluster_size == 32) { + reduction_needs_last_op = true; + break; + } + + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + for (unsigned i = 0; i < src.size(); i++) + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); + // TODO: it would be more effective to do the last reduction step on SALU + emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); + break; + } + + if (cluster_size == 32) { + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10)); + reduction_needs_last_op = true; + break; + } + assert(cluster_size == 64); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false); + break; + case aco_opcode::p_exclusive_scan: + if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */ + /* shift rows right */ + emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true); + + /* fill in the gaps in rows 1 and 3 */ + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u)); + for (unsigned i = 0; i < src.size(); i++) { + Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, + Definition(PhysReg{vtmp+i}, v1), + Operand(PhysReg{tmp+i}, v1), + Operand(0xffffffffu), Operand(0xffffffffu)).instr; + static_cast(perm)->opsel = 1; /* FI (Fetch Inactive) */ + } + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX)); + + if (ctx->program->wave_size == 64) { + /* fill in the gap in row 2 */ + for (unsigned i = 0; i < src.size(); i++) { + bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + } + } + std::swap(tmp, vtmp); + } else if (ctx->program->chip_class >= GFX8) { + emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true); + } else { + // TODO: use LDS on CS with a single write and shifted read + /* wavefront shift_right by 1 on SI/CI */ + emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); + emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */ + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + for (unsigned i = 0; i < src.size(); i++) + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */ + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + for (unsigned i = 0; i < src.size(); i++) + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */ + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u)); + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u)); + for (unsigned i = 0; i < src.size(); i++) + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + for (unsigned i = 0; i < src.size(); i++) { + bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1)); + bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); + bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + identity[i] = Operand(0u); /* prevent further uses of identity */ + } + std::swap(tmp, vtmp); + } + + for (unsigned i = 0; i < src.size(); i++) { + if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */ + if (ctx->program->chip_class < GFX10) + assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i}); + bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1)); + } + } + /* fall through */ + case aco_opcode::p_inclusive_scan: + assert(cluster_size == ctx->program->wave_size); + if (ctx->program->chip_class <= GFX7) { + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xAAAAAAAAu)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xCCCCCCCCu)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xF0F0F0F0u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xFF00FF00u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00)); + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u)); + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u)); + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + for (unsigned i = 0; i < src.size(); i++) + bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); + emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); + break; + } + + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_sr(1), 0xf, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_sr(2), 0xf, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_sr(4), 0xf, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_sr(8), 0xf, 0xf, false, identity); + if (ctx->program->chip_class >= GFX10) { + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u)); + bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u)); + for (unsigned i = 0; i < src.size(); i++) { + Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, + Definition(PhysReg{vtmp+i}, v1), + Operand(PhysReg{tmp+i}, v1), + Operand(0xffffffffu), Operand(0xffffffffu)).instr; + static_cast(perm)->opsel = 1; /* FI (Fetch Inactive) */ + } + emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + + if (ctx->program->wave_size == 64) { + bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); + for (unsigned i = 0; i < src.size(); i++) + bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); + } + } else { + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_bcast15, 0xa, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), + dpp_row_bcast31, 0xc, 0xf, false, identity); + } + break; + default: + unreachable("Invalid reduction mode"); + } + + + if (op == aco_opcode::p_reduce) { + if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) { + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); + emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size()); + return; + } + + if (reduction_needs_last_op) + emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); + } + + /* restore exec */ + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); + + if (dst.regClass().type() == RegType::sgpr) { + for (unsigned k = 0; k < src.size(); k++) { + bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), + Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1)); + } + } else if (dst.physReg() != tmp) { + for (unsigned k = 0; k < src.size(); k++) { + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1), + Operand(PhysReg{tmp + k}, v1)); + } + } +} + +struct copy_operation { + Operand op; + Definition def; + unsigned uses; + unsigned size; +}; + +void handle_operands(std::map& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi) +{ + Builder bld(ctx->program, &ctx->instructions); + aco_ptr mov; + std::map::iterator it = copy_map.begin(); + std::map::iterator target; + bool writes_scc = false; + + /* count the number of uses for each dst reg */ + while (it != copy_map.end()) { + if (it->second.op.isConstant()) { + ++it; + continue; + } + + if (it->second.def.physReg() == scc) + writes_scc = true; + + assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr)); + + /* if src and dst reg are the same, remove operation */ + if (it->first == it->second.op.physReg()) { + it = copy_map.erase(it); + continue; + } + /* check if the operand reg may be overwritten by another copy operation */ + target = copy_map.find(it->second.op.physReg()); + if (target != copy_map.end()) { + target->second.uses++; + } + + ++it; + } + + /* first, handle paths in the location transfer graph */ + bool preserve_scc = pi->tmp_in_scc && !writes_scc; + it = copy_map.begin(); + while (it != copy_map.end()) { + + /* the target reg is not used as operand for any other copy */ + if (it->second.uses == 0) { + + /* try to coalesce 32-bit sgpr copies to 64-bit copies */ + if (it->second.def.getTemp().type() == RegType::sgpr && it->second.size == 1 && + !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) { + + PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1}; + PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1}; + std::map::iterator other = copy_map.find(other_def_reg); + + if (other != copy_map.end() && !other->second.uses && other->second.size == 1 && + other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) { + std::map::iterator to_erase = it->first % 2 ? it : other; + it = it->first % 2 ? other : it; + copy_map.erase(to_erase); + it->second.size = 2; + } + } + + if (it->second.def.physReg() == scc) { + bld.sopc(aco_opcode::s_cmp_lg_i32, it->second.def, it->second.op, Operand(0u)); + preserve_scc = true; + } else if (it->second.size == 2 && it->second.def.getTemp().type() == RegType::sgpr) { + bld.sop1(aco_opcode::s_mov_b64, it->second.def, Operand(it->second.op.physReg(), s2)); + } else if (it->second.size == 2 && it->second.op.isConstant()) { + uint64_t val = it->second.op.constantValue64(); + bld.vop1(aco_opcode::v_mov_b32, it->second.def, Operand((uint32_t)val)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{it->second.def.physReg() + 1}, v1), + Operand((uint32_t)(val >> 32))); + } else { + bld.copy(it->second.def, it->second.op); + } + + /* reduce the number of uses of the operand reg by one */ + if (!it->second.op.isConstant()) { + for (unsigned i = 0; i < it->second.size; i++) { + target = copy_map.find(PhysReg{it->second.op.physReg() + i}); + if (target != copy_map.end()) + target->second.uses--; + } + } + + copy_map.erase(it); + it = copy_map.begin(); + continue; + } else { + /* the target reg is used as operand, check the next entry */ + ++it; + } + } + + if (copy_map.empty()) + return; + + /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */ + bool constants = false; + for (it = copy_map.begin(); it != copy_map.end(); ++it) { + assert(it->second.op.isFixed()); + if (it->first == it->second.op.physReg()) + continue; + /* do constants later */ + if (it->second.op.isConstant()) { + constants = true; + continue; + } + + if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr) + assert(!(it->second.def.physReg() == pi->scratch_sgpr)); + + /* to resolve the cycle, we have to swap the src reg with the dst reg */ + copy_operation swap = it->second; + assert(swap.op.regClass() == swap.def.regClass()); + Operand def_as_op = Operand(swap.def.physReg(), swap.def.regClass()); + Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass()); + if (chip_class >= GFX9 && swap.def.getTemp().type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op); + } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) { + /* we need to swap scc and another sgpr */ + assert(!preserve_scc); + + PhysReg other = swap.op.physReg() == scc ? swap.def.physReg() : swap.op.physReg(); + + bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1)); + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1)); + } else if (swap.def.getTemp().type() == RegType::sgpr) { + if (preserve_scc) { + bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op); + bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op); + bld.sop1(aco_opcode::s_mov_b32, swap.def, Operand(pi->scratch_sgpr, s1)); + } else { + bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op); + bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op); + bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op); + } + } else { + bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op); + } + + /* change the operand reg of the target's use */ + assert(swap.uses == 1); + target = it; + for (++target; target != copy_map.end(); ++target) { + if (target->second.op.physReg() == it->first) { + target->second.op.setFixed(swap.op.physReg()); + break; + } + } + } + + /* copy constants into a registers which were operands */ + if (constants) { + for (it = copy_map.begin(); it != copy_map.end(); ++it) { + if (!it->second.op.isConstant()) + continue; + if (it->second.def.physReg() == scc) { + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(0u), Operand(it->second.op.constantValue() ? 1u : 0u)); + } else { + bld.copy(it->second.def, it->second.op); + } + } + } +} + +void lower_to_hw_instr(Program* program) +{ + Block *discard_block = NULL; + + for (size_t i = 0; i < program->blocks.size(); i++) + { + Block *block = &program->blocks[i]; + lower_context ctx; + ctx.program = program; + Builder bld(program, &ctx.instructions); + + bool set_mode = i == 0 && block->fp_mode.val != program->config->float_mode; + for (unsigned pred : block->linear_preds) { + if (program->blocks[pred].fp_mode.val != block->fp_mode.val) { + set_mode = true; + break; + } + } + if (set_mode) { + /* only allow changing modes at top-level blocks so this doesn't break + * the "jump over empty blocks" optimization */ + assert(block->kind & block_kind_top_level); + uint32_t mode = block->fp_mode.val; + /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */ + bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1); + } + + for (size_t j = 0; j < block->instructions.size(); j++) { + aco_ptr& instr = block->instructions[j]; + aco_ptr mov; + if (instr->format == Format::PSEUDO) { + Pseudo_instruction *pi = (Pseudo_instruction*)instr.get(); + + switch (instr->opcode) + { + case aco_opcode::p_extract_vector: + { + unsigned reg = instr->operands[0].physReg() + instr->operands[1].constantValue() * instr->definitions[0].size(); + RegClass rc = RegClass(instr->operands[0].getTemp().type(), 1); + RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1); + if (reg == instr->definitions[0].physReg()) + break; + + std::map copy_operations; + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, rc_def); + copy_operations[def.physReg()] = {Operand(PhysReg{reg + i}, rc), def, 0, 1}; + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_create_vector: + { + std::map copy_operations; + RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1); + unsigned reg_idx = 0; + for (const Operand& op : instr->operands) { + if (op.isConstant()) { + const PhysReg reg = PhysReg{instr->definitions[0].physReg() + reg_idx}; + const Definition def = Definition(reg, rc_def); + copy_operations[reg] = {op, def, 0, op.size()}; + reg_idx++; + continue; + } + + RegClass rc_op = RegClass(op.getTemp().type(), 1); + for (unsigned j = 0; j < op.size(); j++) + { + const Operand copy_op = Operand(PhysReg{op.physReg() + j}, rc_op); + const Definition def = Definition(PhysReg{instr->definitions[0].physReg() + reg_idx}, rc_def); + copy_operations[def.physReg()] = {copy_op, def, 0, 1}; + reg_idx++; + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_split_vector: + { + std::map copy_operations; + RegClass rc_op = instr->operands[0].isConstant() ? s1 : RegClass(instr->operands[0].regClass().type(), 1); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + unsigned k = instr->definitions[i].size(); + RegClass rc_def = RegClass(instr->definitions[i].getTemp().type(), 1); + for (unsigned j = 0; j < k; j++) { + Operand op = Operand(PhysReg{instr->operands[0].physReg() + (i*k+j)}, rc_op); + Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, rc_def); + copy_operations[def.physReg()] = {op, def, 0, op.size()}; + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_parallelcopy: + case aco_opcode::p_wqm: + { + std::map copy_operations; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + Operand operand = instr->operands[i]; + if (operand.isConstant() || operand.size() == 1) { + assert(instr->definitions[i].size() == operand.size()); + copy_operations[instr->definitions[i].physReg()] = {operand, instr->definitions[i], 0, operand.size()}; + } else { + RegClass def_rc = RegClass(instr->definitions[i].regClass().type(), 1); + RegClass op_rc = RegClass(operand.getTemp().type(), 1); + for (unsigned j = 0; j < operand.size(); j++) + { + Operand op = Operand(PhysReg{instr->operands[i].physReg() + j}, op_rc); + Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, def_rc); + copy_operations[def.physReg()] = {op, def, 0, 1}; + } + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_exit_early_if: + { + /* don't bother with an early exit at the end of the program */ + if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end && + block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) { + break; + } + + if (!discard_block) { + discard_block = program->create_and_insert_block(); + block = &program->blocks[i]; + + bld.reset(discard_block); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + 0, V_008DFC_SQ_EXP_NULL, false, true, true); + if (program->wb_smem_l1_on_end) + bld.smem(aco_opcode::s_dcache_wb); + bld.sopp(aco_opcode::s_endpgm); + + bld.reset(&ctx.instructions); + } + + //TODO: exec can be zero here with block_kind_discard + + assert(instr->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index); + + discard_block->linear_preds.push_back(block->index); + block->linear_succs.push_back(discard_block->index); + break; + } + case aco_opcode::p_spill: + { + assert(instr->operands[0].regClass() == v1.as_linear()); + for (unsigned i = 0; i < instr->operands[2].size(); i++) + bld.writelane(bld.def(v1, instr->operands[0].physReg()), + Operand(PhysReg{instr->operands[2].physReg() + i}, s1), + Operand(instr->operands[1].constantValue() + i), + instr->operands[0]); + break; + } + case aco_opcode::p_reload: + { + assert(instr->operands[0].regClass() == v1.as_linear()); + for (unsigned i = 0; i < instr->definitions[0].size(); i++) + bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), + instr->operands[0], + Operand(instr->operands[1].constantValue() + i)); + break; + } + case aco_opcode::p_as_uniform: + { + if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) { + std::map copy_operations; + Operand operand = instr->operands[0]; + if (operand.isConstant() || operand.size() == 1) { + assert(instr->definitions[0].size() == 1); + copy_operations[instr->definitions[0].physReg()] = {operand, instr->definitions[0], 0, operand.size()}; + } else { + for (unsigned i = 0; i < operand.size(); i++) + { + Operand op = Operand(PhysReg{operand.physReg() + i}, s1); + Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, s1); + copy_operations[def.physReg()] = {op, def, 0, 1}; + } + } + + handle_operands(copy_operations, &ctx, program->chip_class, pi); + } else { + assert(instr->operands[0].regClass().type() == RegType::vgpr); + assert(instr->definitions[0].regClass().type() == RegType::sgpr); + assert(instr->operands[0].size() == instr->definitions[0].size()); + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + bld.vop1(aco_opcode::v_readfirstlane_b32, + bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), + Operand(PhysReg{instr->operands[0].physReg() + i}, v1)); + } + } + break; + } + default: + break; + } + } else if (instr->format == Format::PSEUDO_BRANCH) { + Pseudo_branch_instruction* branch = static_cast(instr.get()); + /* check if all blocks from current to target are empty */ + bool can_remove = block->index < branch->target[0]; + for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) { + if (program->blocks[i].instructions.size()) + can_remove = false; + } + if (can_remove) + continue; + + switch (instr->opcode) { + case aco_opcode::p_branch: + assert(block->linear_succs[0] == branch->target[0]); + bld.sopp(aco_opcode::s_branch, branch->target[0]); + break; + case aco_opcode::p_cbranch_nz: + assert(block->linear_succs[1] == branch->target[0]); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]); + } + break; + case aco_opcode::p_cbranch_z: + assert(block->linear_succs[1] == branch->target[0]); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]); + } + break; + default: + unreachable("Unknown Pseudo branch instruction!"); + } + + } else if (instr->format == Format::PSEUDO_REDUCTION) { + Pseudo_reduction_instruction* reduce = static_cast(instr.get()); + if (reduce->reduce_op == gfx10_wave64_bpermute) { + /* Only makes sense on GFX10 wave64 */ + assert(program->chip_class >= GFX10); + assert(program->info->wave_size == 64); + assert(instr->definitions[0].regClass() == v1); /* Destination */ + assert(instr->definitions[1].regClass() == s2); /* Temp EXEC */ + assert(instr->definitions[1].physReg() != vcc); + assert(instr->definitions[2].physReg() == scc); /* SCC clobber */ + assert(instr->operands[0].physReg() == vcc); /* Compare */ + assert(instr->operands[1].regClass() == v2.as_linear()); /* Temp VGPR pair */ + assert(instr->operands[2].regClass() == v1); /* Indices x4 */ + assert(instr->operands[3].regClass() == v1); /* Input data */ + + PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256); + PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1); + Operand compare = instr->operands[0]; + Operand tmp1(instr->operands[1].physReg(), v1); + Operand tmp2(PhysReg(instr->operands[1].physReg() + 1), v1); + Operand index_x4 = instr->operands[2]; + Operand input_data = instr->operands[3]; + Definition shared_vgpr_lo(shared_vgpr_reg_lo, v1); + Definition shared_vgpr_hi(shared_vgpr_reg_hi, v1); + Definition def_temp1(tmp1.physReg(), v1); + Definition def_temp2(tmp2.physReg(), v1); + + /* Save EXEC and set it for all lanes */ + bld.sop1(aco_opcode::s_or_saveexec_b64, instr->definitions[1], instr->definitions[2], + Definition(exec, s2), Operand((uint64_t)-1), Operand(exec, s2)); + + /* HI: Copy data from high lanes 32-63 to shared vgpr */ + bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_hi, input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + + /* LO: Copy data from low lanes 0-31 to shared vgpr */ + bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_lo, input_data, dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); + /* LO: Copy shared vgpr (high lanes' data) to output vgpr */ + bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); + + /* HI: Copy shared vgpr (low lanes' data) to output vgpr */ + bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + + /* Permute the original input */ + bld.ds(aco_opcode::ds_bpermute_b32, def_temp2, index_x4, input_data); + /* Permute the swapped input */ + bld.ds(aco_opcode::ds_bpermute_b32, def_temp1, index_x4, tmp1); + + /* Restore saved EXEC */ + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(instr->definitions[1].physReg(), s2)); + /* Choose whether to use the original or swapped */ + bld.vop2(aco_opcode::v_cndmask_b32, instr->definitions[0], tmp1, tmp2, compare); + } else { + emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size, + reduce->operands[1].physReg(), // tmp + reduce->definitions[1].physReg(), // stmp + reduce->operands[2].physReg(), // vtmp + reduce->definitions[2].physReg(), // sitmp + reduce->operands[0], reduce->definitions[0]); + } + } else { + ctx.instructions.emplace_back(std::move(instr)); + } + + } + block->instructions.swap(ctx.instructions); + } +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_opcodes_cpp.py mesa-20.0.8/src/amd/compiler/aco_opcodes_cpp.py --- mesa-19.2.8/src/amd/compiler/aco_opcodes_cpp.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_opcodes_cpp.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,76 @@ + +template = """\ +/* + * Copyright (c) 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +namespace aco { + + +<% +opcode_names = sorted(opcodes.keys()) +can_use_input_modifiers = "".join([opcodes[name].input_mod for name in reversed(opcode_names)]) +can_use_output_modifiers = "".join([opcodes[name].output_mod for name in reversed(opcode_names)]) +is_atomic = "".join([opcodes[name].is_atomic for name in reversed(opcode_names)]) +%> + +extern const aco::Info instr_info = { + .opcode_gfx7 = { + % for name in opcode_names: + ${opcodes[name].opcode_gfx7}, + % endfor + }, + .opcode_gfx9 = { + % for name in opcode_names: + ${opcodes[name].opcode_gfx9}, + % endfor + }, + .opcode_gfx10 = { + % for name in opcode_names: + ${opcodes[name].opcode_gfx10}, + % endfor + }, + .can_use_input_modifiers = std::bitset<${len(opcode_names)}>("${can_use_input_modifiers}"), + .can_use_output_modifiers = std::bitset<${len(opcode_names)}>("${can_use_output_modifiers}"), + .is_atomic = std::bitset<${len(opcode_names)}>("${is_atomic}"), + .name = { + % for name in opcode_names: + "${name}", + % endfor + }, + .format = { + % for name in opcode_names: + aco::Format::${str(opcodes[name].format.name)}, + % endfor + }, +}; + +} +""" + +from aco_opcodes import opcodes +from mako.template import Template + +print(Template(template).render(opcodes=opcodes)) diff -Nru mesa-19.2.8/src/amd/compiler/aco_opcodes_h.py mesa-20.0.8/src/amd/compiler/aco_opcodes_h.py --- mesa-19.2.8/src/amd/compiler/aco_opcodes_h.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_opcodes_h.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,47 @@ + +template = """\ +/* + * Copyright (c) 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schuermann (daniel.schuermann@campus.tu-berlin.de) + */ + +#ifndef _ACO_OPCODES_ +#define _ACO_OPCODES_ + +<% opcode_names = sorted(opcodes.keys()) %> + +enum class aco_opcode : std::uint16_t { +% for name in opcode_names: + ${name}, +% endfor + last_opcode = ${opcode_names[-1]}, + num_opcodes = last_opcode + 1 +}; + +#endif /* _ACO_OPCODES_ */""" + +from aco_opcodes import opcodes +from mako.template import Template + +print(Template(template).render(opcodes=opcodes)) diff -Nru mesa-19.2.8/src/amd/compiler/aco_opcodes.py mesa-20.0.8/src/amd/compiler/aco_opcodes.py --- mesa-19.2.8/src/amd/compiler/aco_opcodes.py 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_opcodes.py 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1588 @@ +# +# Copyright (c) 2018 Valve Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# Authors: +# Daniel Schuermann (daniel.schuermann@campus.tu-berlin.de) + + +# Class that represents all the information we have about the opcode +# NOTE: this must be kept in sync with aco_op_info + +import sys +from enum import Enum + +class Format(Enum): + PSEUDO = 0 + SOP1 = 1 + SOP2 = 2 + SOPK = 3 + SOPP = 4 + SOPC = 5 + SMEM = 6 + DS = 8 + MTBUF = 9 + MUBUF = 10 + MIMG = 11 + EXP = 12 + FLAT = 13 + GLOBAL = 14 + SCRATCH = 15 + PSEUDO_BRANCH = 16 + PSEUDO_BARRIER = 17 + PSEUDO_REDUCTION = 18 + VOP1 = 1 << 8 + VOP2 = 1 << 9 + VOPC = 1 << 10 + VOP3A = 1 << 11 + VOP3B = 1 << 11 + VOP3P = 1 << 12 + VINTRP = 1 << 13 + DPP = 1 << 14 + SDWA = 1 << 15 + + def get_builder_fields(self): + if self == Format.SOPK: + return [('uint16_t', 'imm', None)] + elif self == Format.SOPP: + return [('uint32_t', 'block', '-1'), + ('uint32_t', 'imm', '0')] + elif self == Format.SMEM: + return [('bool', 'can_reorder', 'true'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'nv', 'false')] + elif self == Format.DS: + return [('int16_t', 'offset0', '0'), + ('int8_t', 'offset1', '0'), + ('bool', 'gds', 'false')] + elif self == Format.MTBUF: + return [('unsigned', 'dfmt', None), + ('unsigned', 'nfmt', None), + ('unsigned', 'offset', None), + ('bool', 'offen', None), + ('bool', 'idxen', 'false'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false')] + elif self == Format.MUBUF: + return [('unsigned', 'offset', None), + ('bool', 'offen', None), + ('bool', 'idxen', 'false'), + ('bool', 'addr64', 'false'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false'), + ('bool', 'lds', 'false')] + elif self == Format.MIMG: + return [('unsigned', 'dmask', '0xF'), + ('bool', 'da', 'false'), + ('bool', 'unrm', 'true'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false'), + ('bool', 'lwe', 'false'), + ('bool', 'r128_a16', 'false', 'r128'), + ('bool', 'd16', 'false')] + return [('unsigned', 'attribute', None), + ('unsigned', 'component', None)] + elif self == Format.EXP: + return [('unsigned', 'enabled_mask', None), + ('unsigned', 'dest', None), + ('bool', 'compr', 'false', 'compressed'), + ('bool', 'done', 'false'), + ('bool', 'vm', 'false', 'valid_mask')] + elif self == Format.PSEUDO_BRANCH: + return [('uint32_t', 'target0', '0', 'target[0]'), + ('uint32_t', 'target1', '0', 'target[1]')] + elif self == Format.PSEUDO_REDUCTION: + return [('ReduceOp', 'op', None, 'reduce_op'), + ('unsigned', 'cluster_size', '0')] + elif self == Format.VINTRP: + return [('unsigned', 'attribute', None), + ('unsigned', 'component', None)] + elif self == Format.DPP: + return [('uint16_t', 'dpp_ctrl', None), + ('uint8_t', 'row_mask', '0xF'), + ('uint8_t', 'bank_mask', '0xF'), + ('bool', 'bound_ctrl', 'false')] + elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: + return [('uint16_t', 'offset', 0), + ('bool', 'can_reorder', 'true'), + ('bool', 'glc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'lds', 'false'), + ('bool', 'nv', 'false')] + else: + return [] + + def get_builder_field_names(self): + return [f[1] for f in self.get_builder_fields()] + + def get_builder_field_dests(self): + return [(f[3] if len(f) >= 4 else f[1]) for f in self.get_builder_fields()] + + def get_builder_field_decls(self): + return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()] + + +class Opcode(object): + """Class that represents all the information we have about the opcode + NOTE: this must be kept in sync with aco_op_info + """ + def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic): + """Parameters: + + - name is the name of the opcode (prepend nir_op_ for the enum name) + - all types are strings that get nir_type_ prepended to them + - input_types is a list of types + - algebraic_properties is a space-seperated string, where nir_op_is_ is + prepended before each entry + - const_expr is an expression or series of statements that computes the + constant value of the opcode given the constant values of its inputs. + """ + assert isinstance(name, str) + assert isinstance(opcode_gfx7, int) + assert isinstance(opcode_gfx9, int) + assert isinstance(opcode_gfx10, int) + assert isinstance(format, Format) + assert isinstance(input_mod, bool) + assert isinstance(output_mod, bool) + + self.name = name + self.opcode_gfx7 = opcode_gfx7 + self.opcode_gfx9 = opcode_gfx9 + self.opcode_gfx10 = opcode_gfx10 + self.input_mod = "1" if input_mod else "0" + self.output_mod = "1" if output_mod else "0" + self.is_atomic = "1" if is_atomic else "0" + self.format = format + + +# global dictionary of opcodes +opcodes = {} + +def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False): + assert name not in opcodes + opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic) + +opcode("exp", 0, 0, 0, format = Format.EXP) +opcode("p_parallelcopy") +opcode("p_startpgm") +opcode("p_phi") +opcode("p_linear_phi") +opcode("p_as_uniform") + +opcode("p_create_vector") +opcode("p_extract_vector") +opcode("p_split_vector") + +# start/end the parts where we can use exec based instructions +# implicitly +opcode("p_logical_start") +opcode("p_logical_end") + +# e.g. subgroupMin() in SPIR-V +opcode("p_reduce", format=Format.PSEUDO_REDUCTION) +# e.g. subgroupInclusiveMin() +opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION) +# e.g. subgroupExclusiveMin() +opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION) +# simulates proper bpermute behavior on GFX10 wave64 +opcode("p_wave64_bpermute", format=Format.PSEUDO_REDUCTION) + +opcode("p_branch", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) + +opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared +opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER) + +opcode("p_spill") +opcode("p_reload") + +# start/end linear vgprs +opcode("p_start_linear_vgpr") +opcode("p_end_linear_vgpr") + +opcode("p_wqm") +opcode("p_discard_if") +opcode("p_load_helper") +opcode("p_demote_to_helper") +opcode("p_is_helper") +opcode("p_exit_early_if") + +opcode("p_fs_buffer_store_smem", format=Format.SMEM) + + +# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) +SOP2 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_add_u32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_sub_u32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_add_i32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_sub_i32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_addc_u32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_subb_u32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_min_i32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_min_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_max_i32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_max_u32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cselect_b32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cselect_b64"), + (0x0e, 0x0e, 0x0c, 0x0c, 0x0e, "s_and_b32"), + (0x0f, 0x0f, 0x0d, 0x0d, 0x0f, "s_and_b64"), + (0x10, 0x10, 0x0e, 0x0e, 0x10, "s_or_b32"), + (0x11, 0x11, 0x0f, 0x0f, 0x11, "s_or_b64"), + (0x12, 0x12, 0x10, 0x10, 0x12, "s_xor_b32"), + (0x13, 0x13, 0x11, 0x11, 0x13, "s_xor_b64"), + (0x14, 0x14, 0x12, 0x12, 0x14, "s_andn2_b32"), + (0x15, 0x15, 0x13, 0x13, 0x15, "s_andn2_b64"), + (0x16, 0x16, 0x14, 0x14, 0x16, "s_orn2_b32"), + (0x17, 0x17, 0x15, 0x15, 0x17, "s_orn2_b64"), + (0x18, 0x18, 0x16, 0x16, 0x18, "s_nand_b32"), + (0x19, 0x19, 0x17, 0x17, 0x19, "s_nand_b64"), + (0x1a, 0x1a, 0x18, 0x18, 0x1a, "s_nor_b32"), + (0x1b, 0x1b, 0x19, 0x19, 0x1b, "s_nor_b64"), + (0x1c, 0x1c, 0x1a, 0x1a, 0x1c, "s_xnor_b32"), + (0x1d, 0x1d, 0x1b, 0x1b, 0x1d, "s_xnor_b64"), + (0x1e, 0x1e, 0x1c, 0x1c, 0x1e, "s_lshl_b32"), + (0x1f, 0x1f, 0x1d, 0x1d, 0x1f, "s_lshl_b64"), + (0x20, 0x20, 0x1e, 0x1e, 0x20, "s_lshr_b32"), + (0x21, 0x21, 0x1f, 0x1f, 0x21, "s_lshr_b64"), + (0x22, 0x22, 0x20, 0x20, 0x22, "s_ashr_i32"), + (0x23, 0x23, 0x21, 0x21, 0x23, "s_ashr_i64"), + (0x24, 0x24, 0x22, 0x22, 0x24, "s_bfm_b32"), + (0x25, 0x25, 0x23, 0x23, 0x25, "s_bfm_b64"), + (0x26, 0x26, 0x24, 0x24, 0x26, "s_mul_i32"), + (0x27, 0x27, 0x25, 0x25, 0x27, "s_bfe_u32"), + (0x28, 0x28, 0x26, 0x26, 0x28, "s_bfe_i32"), + (0x29, 0x29, 0x27, 0x27, 0x29, "s_bfe_u64"), + (0x2a, 0x2a, 0x28, 0x28, 0x2a, "s_bfe_i64"), + (0x2b, 0x2b, 0x29, 0x29, -1, "s_cbranch_g_fork"), + (0x2c, 0x2c, 0x2a, 0x2a, 0x2c, "s_absdiff_i32"), + ( -1, -1, 0x2b, 0x2b, -1, "s_rfe_restore_b64"), + ( -1, -1, -1, 0x2e, 0x2e, "s_lshl1_add_u32"), + ( -1, -1, -1, 0x2f, 0x2f, "s_lshl2_add_u32"), + ( -1, -1, -1, 0x30, 0x30, "s_lshl3_add_u32"), + ( -1, -1, -1, 0x31, 0x31, "s_lshl4_add_u32"), + ( -1, -1, -1, 0x32, 0x32, "s_pack_ll_b32_b16"), + ( -1, -1, -1, 0x33, 0x33, "s_pack_lh_b32_b16"), + ( -1, -1, -1, 0x34, 0x34, "s_pack_hh_b32_b16"), + ( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_u32"), + ( -1, -1, -1, 0x2d, 0x36, "s_mul_hi_i32"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2: + opcode(name, gfx7, gfx9, gfx10, Format.SOP2) + + +# SOPK instructions: 0 input (+ imm), 1 output + optional scc +SOPK = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_movk_i32"), + ( -1, -1, -1, -1, 0x01, "s_version"), # GFX10+ + (0x02, 0x02, 0x01, 0x01, 0x02, "s_cmovk_i32"), # GFX8_GFX9 + (0x03, 0x03, 0x02, 0x02, 0x03, "s_cmpk_eq_i32"), + (0x04, 0x04, 0x03, 0x03, 0x04, "s_cmpk_lg_i32"), + (0x05, 0x05, 0x04, 0x04, 0x05, "s_cmpk_gt_i32"), + (0x06, 0x06, 0x05, 0x05, 0x06, "s_cmpk_ge_i32"), + (0x07, 0x07, 0x06, 0x06, 0x07, "s_cmpk_lt_i32"), + (0x08, 0x08, 0x07, 0x07, 0x08, "s_cmpk_le_i32"), + (0x09, 0x09, 0x08, 0x08, 0x09, "s_cmpk_eq_u32"), + (0x0a, 0x0a, 0x09, 0x09, 0x0a, "s_cmpk_lg_u32"), + (0x0b, 0x0b, 0x0a, 0x0a, 0x0b, "s_cmpk_gt_u32"), + (0x0c, 0x0c, 0x0b, 0x0b, 0x0c, "s_cmpk_ge_u32"), + (0x0d, 0x0d, 0x0c, 0x0c, 0x0d, "s_cmpk_lt_u32"), + (0x0e, 0x0e, 0x0d, 0x0d, 0x0e, "s_cmpk_le_u32"), + (0x0f, 0x0f, 0x0e, 0x0e, 0x0f, "s_addk_i32"), + (0x10, 0x10, 0x0f, 0x0f, 0x10, "s_mulk_i32"), + (0x11, 0x11, 0x10, 0x10, -1, "s_cbranch_i_fork"), + (0x12, 0x12, 0x11, 0x11, 0x12, "s_getreg_b32"), + (0x13, 0x13, 0x12, 0x12, 0x13, "s_setreg_b32"), + (0x15, 0x15, 0x14, 0x14, 0x15, "s_setreg_imm32_b32"), # requires 32bit literal + ( -1, -1, 0x15, 0x15, 0x16, "s_call_b64"), + ( -1, -1, -1, -1, 0x17, "s_waitcnt_vscnt"), + ( -1, -1, -1, -1, 0x18, "s_waitcnt_vmcnt"), + ( -1, -1, -1, -1, 0x19, "s_waitcnt_expcnt"), + ( -1, -1, -1, -1, 0x1a, "s_waitcnt_lgkmcnt"), + ( -1, -1, -1, -1, 0x1b, "s_subvector_loop_begin"), + ( -1, -1, -1, -1, 0x1c, "s_subvector_loop_end"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK: + opcode(name, gfx7, gfx9, gfx10, Format.SOPK) + + +# SOP1 instructions: 1 input, 1 output (+optional SCC) +SOP1 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x03, 0x03, 0x00, 0x00, 0x03, "s_mov_b32"), + (0x04, 0x04, 0x01, 0x01, 0x04, "s_mov_b64"), + (0x05, 0x05, 0x02, 0x02, 0x05, "s_cmov_b32"), + (0x06, 0x06, 0x03, 0x03, 0x06, "s_cmov_b64"), + (0x07, 0x07, 0x04, 0x04, 0x07, "s_not_b32"), + (0x08, 0x08, 0x05, 0x05, 0x08, "s_not_b64"), + (0x09, 0x09, 0x06, 0x06, 0x09, "s_wqm_b32"), + (0x0a, 0x0a, 0x07, 0x07, 0x0a, "s_wqm_b64"), + (0x0b, 0x0b, 0x08, 0x08, 0x0b, "s_brev_b32"), + (0x0c, 0x0c, 0x09, 0x09, 0x0c, "s_brev_b64"), + (0x0d, 0x0d, 0x0a, 0x0a, 0x0d, "s_bcnt0_i32_b32"), + (0x0e, 0x0e, 0x0b, 0x0b, 0x0e, "s_bcnt0_i32_b64"), + (0x0f, 0x0f, 0x0c, 0x0c, 0x0f, "s_bcnt1_i32_b32"), + (0x10, 0x10, 0x0d, 0x0d, 0x10, "s_bcnt1_i32_b64"), + (0x11, 0x11, 0x0e, 0x0e, 0x11, "s_ff0_i32_b32"), + (0x12, 0x12, 0x0f, 0x0f, 0x12, "s_ff0_i32_b64"), + (0x13, 0x13, 0x10, 0x10, 0x13, "s_ff1_i32_b32"), + (0x14, 0x14, 0x11, 0x11, 0x14, "s_ff1_i32_b64"), + (0x15, 0x15, 0x12, 0x12, 0x15, "s_flbit_i32_b32"), + (0x16, 0x16, 0x13, 0x13, 0x16, "s_flbit_i32_b64"), + (0x17, 0x17, 0x14, 0x14, 0x17, "s_flbit_i32"), + (0x18, 0x18, 0x15, 0x15, 0x18, "s_flbit_i32_i64"), + (0x19, 0x19, 0x16, 0x16, 0x19, "s_sext_i32_i8"), + (0x1a, 0x1a, 0x17, 0x17, 0x1a, "s_sext_i32_i16"), + (0x1b, 0x1b, 0x18, 0x18, 0x1b, "s_bitset0_b32"), + (0x1c, 0x1c, 0x19, 0x19, 0x1c, "s_bitset0_b64"), + (0x1d, 0x1d, 0x1a, 0x1a, 0x1d, "s_bitset1_b32"), + (0x1e, 0x1e, 0x1b, 0x1b, 0x1e, "s_bitset1_b64"), + (0x1f, 0x1f, 0x1c, 0x1c, 0x1f, "s_getpc_b64"), + (0x20, 0x20, 0x1d, 0x1d, 0x20, "s_setpc_b64"), + (0x21, 0x21, 0x1e, 0x1e, 0x21, "s_swappc_b64"), + (0x22, 0x22, 0x1f, 0x1f, 0x22, "s_rfe_b64"), + (0x24, 0x24, 0x20, 0x20, 0x24, "s_and_saveexec_b64"), + (0x25, 0x25, 0x21, 0x21, 0x25, "s_or_saveexec_b64"), + (0x26, 0x26, 0x22, 0x22, 0x26, "s_xor_saveexec_b64"), + (0x27, 0x27, 0x23, 0x23, 0x27, "s_andn2_saveexec_b64"), + (0x28, 0x28, 0x24, 0x24, 0x28, "s_orn2_saveexec_b64"), + (0x29, 0x29, 0x25, 0x25, 0x29, "s_nand_saveexec_b64"), + (0x2a, 0x2a, 0x26, 0x26, 0x2a, "s_nor_saveexec_b64"), + (0x2b, 0x2b, 0x27, 0x27, 0x2b, "s_xnor_saveexec_b64"), + (0x2c, 0x2c, 0x28, 0x28, 0x2c, "s_quadmask_b32"), + (0x2d, 0x2d, 0x29, 0x29, 0x2d, "s_quadmask_b64"), + (0x2e, 0x2e, 0x2a, 0x2a, 0x2e, "s_movrels_b32"), + (0x2f, 0x2f, 0x2b, 0x2b, 0x2f, "s_movrels_b64"), + (0x30, 0x30, 0x2c, 0x2c, 0x30, "s_movreld_b32"), + (0x31, 0x31, 0x2d, 0x2d, 0x31, "s_movreld_b64"), + (0x32, 0x32, 0x2e, 0x2e, -1, "s_cbranch_join"), + (0x34, 0x34, 0x30, 0x30, 0x34, "s_abs_i32"), + (0x35, 0x35, -1, -1, 0x35, "s_mov_fed_b32"), + ( -1, -1, 0x32, 0x32, -1, "s_set_gpr_idx_idx"), + ( -1, -1, -1, 0x33, 0x37, "s_andn1_saveexec_b64"), + ( -1, -1, -1, 0x34, 0x38, "s_orn1_saveexec_b64"), + ( -1, -1, -1, 0x35, 0x39, "s_andn1_wrexec_b64"), + ( -1, -1, -1, 0x36, 0x3a, "s_andn2_wrexec_b64"), + ( -1, -1, -1, 0x37, 0x3b, "s_bitreplicate_b64_b32"), + ( -1, -1, -1, -1, 0x3c, "s_and_saveexec_b32"), + ( -1, -1, -1, -1, 0x3d, "s_or_saveexec_b32"), + ( -1, -1, -1, -1, 0x3e, "s_xor_saveexec_b32"), + ( -1, -1, -1, -1, 0x3f, "s_andn2_saveexec_b32"), + ( -1, -1, -1, -1, 0x40, "s_orn2_saveexec_b32"), + ( -1, -1, -1, -1, 0x41, "s_nand_saveexec_b32"), + ( -1, -1, -1, -1, 0x42, "s_nor_saveexec_b32"), + ( -1, -1, -1, -1, 0x43, "s_xnor_saveexec_b32"), + ( -1, -1, -1, -1, 0x44, "s_andn1_saveexec_b32"), + ( -1, -1, -1, -1, 0x45, "s_orn1_saveexec_b32"), + ( -1, -1, -1, -1, 0x46, "s_andn1_wrexec_b32"), + ( -1, -1, -1, -1, 0x47, "s_andn2_wrexec_b32"), + ( -1, -1, -1, -1, 0x49, "s_movrelsd_2_b32"), + # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1. + ( -1, -1, -1, -1, -1, "p_constaddr"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1: + opcode(name, gfx7, gfx9, gfx10, Format.SOP1) + + +# SOPC instructions: 2 inputs and 0 outputs (+SCC) +SOPC = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_cmp_eq_i32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_cmp_lg_i32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_cmp_gt_i32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_cmp_ge_i32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_cmp_lt_i32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_cmp_le_i32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_cmp_eq_u32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_cmp_lg_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_cmp_gt_u32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_cmp_ge_u32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cmp_lt_u32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cmp_le_u32"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_bitcmp0_b32"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_bitcmp1_b32"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_bitcmp0_b64"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_bitcmp1_b64"), + (0x10, 0x10, 0x10, 0x10, -1, "s_setvskip"), + ( -1, -1, 0x11, 0x11, -1, "s_set_gpr_idx_on"), + ( -1, -1, 0x12, 0x12, 0x12, "s_cmp_eq_u64"), + ( -1, -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC: + opcode(name, gfx7, gfx9, gfx10, Format.SOPC) + + +# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs +SOPP = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_nop"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_endpgm"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_branch"), + ( -1, -1, 0x03, 0x03, 0x03, "s_wakeup"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_cbranch_scc0"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_cbranch_scc1"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_cbranch_vccz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_cbranch_vccnz"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_cbranch_execz"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_cbranch_execnz"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_barrier"), + ( -1, 0x0b, 0x0b, 0x0b, 0x0b, "s_setkill"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_waitcnt"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_sethalt"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_sleep"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_setprio"), + (0x10, 0x10, 0x10, 0x10, 0x10, "s_sendmsg"), + (0x11, 0x11, 0x11, 0x11, 0x11, "s_sendmsghalt"), + (0x12, 0x12, 0x12, 0x12, 0x12, "s_trap"), + (0x13, 0x13, 0x13, 0x13, 0x13, "s_icache_inv"), + (0x14, 0x14, 0x14, 0x14, 0x14, "s_incperflevel"), + (0x15, 0x15, 0x15, 0x15, 0x15, "s_decperflevel"), + (0x16, 0x16, 0x16, 0x16, 0x16, "s_ttracedata"), + ( -1, 0x17, 0x17, 0x17, 0x17, "s_cbranch_cdbgsys"), + ( -1, 0x18, 0x18, 0x18, 0x18, "s_cbranch_cdbguser"), + ( -1, 0x19, 0x19, 0x19, 0x19, "s_cbranch_cdbgsys_or_user"), + ( -1, 0x1a, 0x1a, 0x1a, 0x1a, "s_cbranch_cdbgsys_and_user"), + ( -1, -1, 0x1b, 0x1b, 0x1b, "s_endpgm_saved"), + ( -1, -1, 0x1c, 0x1c, -1, "s_set_gpr_idx_off"), + ( -1, -1, 0x1d, 0x1d, -1, "s_set_gpr_idx_mode"), + ( -1, -1, -1, 0x1e, 0x1e, "s_endpgm_ordered_ps_done"), + ( -1, -1, -1, -1, 0x1f, "s_code_end"), + ( -1, -1, -1, -1, 0x20, "s_inst_prefetch"), + ( -1, -1, -1, -1, 0x21, "s_clause"), + ( -1, -1, -1, -1, 0x22, "s_wait_idle"), + ( -1, -1, -1, -1, 0x23, "s_waitcnt_depctr"), + ( -1, -1, -1, -1, 0x24, "s_round_mode"), + ( -1, -1, -1, -1, 0x25, "s_denorm_mode"), + ( -1, -1, -1, -1, 0x26, "s_ttracedata_imm"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP: + opcode(name, gfx7, gfx9, gfx10, Format.SOPP) + + +# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output +SMEM = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_load_dword"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_load_dwordx2"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_load_dwordx4"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_load_dwordx8"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_load_dwordx16"), + ( -1, -1, -1, 0x05, 0x05, "s_scratch_load_dword"), + ( -1, -1, -1, 0x06, 0x06, "s_scratch_load_dwordx2"), + ( -1, -1, -1, 0x07, 0x07, "s_scratch_load_dwordx4"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_buffer_load_dword"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_buffer_load_dwordx2"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_buffer_load_dwordx4"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_buffer_load_dwordx8"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_buffer_load_dwordx16"), + ( -1, -1, 0x10, 0x10, 0x10, "s_store_dword"), + ( -1, -1, 0x11, 0x11, 0x11, "s_store_dwordx2"), + ( -1, -1, 0x12, 0x12, 0x12, "s_store_dwordx4"), + ( -1, -1, -1, 0x15, 0x15, "s_scratch_store_dword"), + ( -1, -1, -1, 0x16, 0x16, "s_scratch_store_dwordx2"), + ( -1, -1, -1, 0x17, 0x17, "s_scratch_store_dwordx4"), + ( -1, -1, 0x18, 0x18, 0x18, "s_buffer_store_dword"), + ( -1, -1, 0x19, 0x19, 0x19, "s_buffer_store_dwordx2"), + ( -1, -1, 0x1a, 0x1a, 0x1a, "s_buffer_store_dwordx4"), + ( -1, -1, 0x1f, 0x1f, 0x1f, "s_gl1_inv"), + (0x1f, 0x1f, 0x20, 0x20, 0x20, "s_dcache_inv"), + ( -1, -1, 0x21, 0x21, 0x21, "s_dcache_wb"), + ( -1, 0x1d, 0x22, 0x22, -1, "s_dcache_inv_vol"), + ( -1, -1, 0x23, 0x23, -1, "s_dcache_wb_vol"), + (0x1e, 0x1e, 0x24, 0x24, 0x24, "s_memtime"), + ( -1, -1, 0x25, 0x25, 0x25, "s_memrealtime"), + ( -1, -1, 0x26, 0x26, 0x26, "s_atc_probe"), + ( -1, -1, 0x27, 0x27, 0x27, "s_atc_probe_buffer"), + ( -1, -1, -1, 0x28, 0x28, "s_dcache_discard"), + ( -1, -1, -1, 0x29, 0x29, "s_dcache_discard_x2"), + ( -1, -1, -1, -1, 0x2a, "s_get_waveid_in_workgroup"), + ( -1, -1, -1, 0x40, 0x40, "s_buffer_atomic_swap"), + ( -1, -1, -1, 0x41, 0x41, "s_buffer_atomic_cmpswap"), + ( -1, -1, -1, 0x42, 0x42, "s_buffer_atomic_add"), + ( -1, -1, -1, 0x43, 0x43, "s_buffer_atomic_sub"), + ( -1, -1, -1, 0x44, 0x44, "s_buffer_atomic_smin"), + ( -1, -1, -1, 0x45, 0x45, "s_buffer_atomic_umin"), + ( -1, -1, -1, 0x46, 0x46, "s_buffer_atomic_smax"), + ( -1, -1, -1, 0x47, 0x47, "s_buffer_atomic_umax"), + ( -1, -1, -1, 0x48, 0x48, "s_buffer_atomic_and"), + ( -1, -1, -1, 0x49, 0x49, "s_buffer_atomic_or"), + ( -1, -1, -1, 0x4a, 0x4a, "s_buffer_atomic_xor"), + ( -1, -1, -1, 0x4b, 0x4b, "s_buffer_atomic_inc"), + ( -1, -1, -1, 0x4c, 0x4c, "s_buffer_atomic_dec"), + ( -1, -1, -1, 0x60, 0x60, "s_buffer_atomic_swap_x2"), + ( -1, -1, -1, 0x61, 0x61, "s_buffer_atomic_cmpswap_x2"), + ( -1, -1, -1, 0x62, 0x62, "s_buffer_atomic_add_x2"), + ( -1, -1, -1, 0x63, 0x63, "s_buffer_atomic_sub_x2"), + ( -1, -1, -1, 0x64, 0x64, "s_buffer_atomic_smin_x2"), + ( -1, -1, -1, 0x65, 0x65, "s_buffer_atomic_umin_x2"), + ( -1, -1, -1, 0x66, 0x66, "s_buffer_atomic_smax_x2"), + ( -1, -1, -1, 0x67, 0x67, "s_buffer_atomic_umax_x2"), + ( -1, -1, -1, 0x68, 0x68, "s_buffer_atomic_and_x2"), + ( -1, -1, -1, 0x69, 0x69, "s_buffer_atomic_or_x2"), + ( -1, -1, -1, 0x6a, 0x6a, "s_buffer_atomic_xor_x2"), + ( -1, -1, -1, 0x6b, 0x6b, "s_buffer_atomic_inc_x2"), + ( -1, -1, -1, 0x6c, 0x6c, "s_buffer_atomic_dec_x2"), + ( -1, -1, -1, 0x80, 0x80, "s_atomic_swap"), + ( -1, -1, -1, 0x81, 0x81, "s_atomic_cmpswap"), + ( -1, -1, -1, 0x82, 0x82, "s_atomic_add"), + ( -1, -1, -1, 0x83, 0x83, "s_atomic_sub"), + ( -1, -1, -1, 0x84, 0x84, "s_atomic_smin"), + ( -1, -1, -1, 0x85, 0x85, "s_atomic_umin"), + ( -1, -1, -1, 0x86, 0x86, "s_atomic_smax"), + ( -1, -1, -1, 0x87, 0x87, "s_atomic_umax"), + ( -1, -1, -1, 0x88, 0x88, "s_atomic_and"), + ( -1, -1, -1, 0x89, 0x89, "s_atomic_or"), + ( -1, -1, -1, 0x8a, 0x8a, "s_atomic_xor"), + ( -1, -1, -1, 0x8b, 0x8b, "s_atomic_inc"), + ( -1, -1, -1, 0x8c, 0x8c, "s_atomic_dec"), + ( -1, -1, -1, 0xa0, 0xa0, "s_atomic_swap_x2"), + ( -1, -1, -1, 0xa1, 0xa1, "s_atomic_cmpswap_x2"), + ( -1, -1, -1, 0xa2, 0xa2, "s_atomic_add_x2"), + ( -1, -1, -1, 0xa3, 0xa3, "s_atomic_sub_x2"), + ( -1, -1, -1, 0xa4, 0xa4, "s_atomic_smin_x2"), + ( -1, -1, -1, 0xa5, 0xa5, "s_atomic_umin_x2"), + ( -1, -1, -1, 0xa6, 0xa6, "s_atomic_smax_x2"), + ( -1, -1, -1, 0xa7, 0xa7, "s_atomic_umax_x2"), + ( -1, -1, -1, 0xa8, 0xa8, "s_atomic_and_x2"), + ( -1, -1, -1, 0xa9, 0xa9, "s_atomic_or_x2"), + ( -1, -1, -1, 0xaa, 0xaa, "s_atomic_xor_x2"), + ( -1, -1, -1, 0xab, 0xab, "s_atomic_inc_x2"), + ( -1, -1, -1, 0xac, 0xac, "s_atomic_dec_x2"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM: + opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name) + + +# VOP2 instructions: 2 inputs, 1 output (+ optional vcc) +# TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8 +VOP2 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers + (0x01, 0x01, -1, -1, -1, "v_readlane_b32", False), + (0x02, 0x02, -1, -1, -1, "v_writelane_b32", False), + (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True), + (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True), + (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True), + (0x06, 0x06, -1, -1, 0x06, "v_mac_legacy_f32", True), + (0x07, 0x07, 0x04, 0x04, 0x07, "v_mul_legacy_f32", True), + (0x08, 0x08, 0x05, 0x05, 0x08, "v_mul_f32", True), + (0x09, 0x09, 0x06, 0x06, 0x09, "v_mul_i32_i24", False), + (0x0a, 0x0a, 0x07, 0x07, 0x0a, "v_mul_hi_i32_i24", False), + (0x0b, 0x0b, 0x08, 0x08, 0x0b, "v_mul_u32_u24", False), + (0x0c, 0x0c, 0x09, 0x09, 0x0c, "v_mul_hi_u32_u24", False), + (0x0d, 0x0d, -1, -1, -1, "v_min_legacy_f32", True), + (0x0e, 0x0e, -1, -1, -1, "v_max_legacy_f32", True), + (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, "v_min_f32", True), + (0x10, 0x10, 0x0b, 0x0b, 0x10, "v_max_f32", True), + (0x11, 0x11, 0x0c, 0x0c, 0x11, "v_min_i32", False), + (0x12, 0x12, 0x0d, 0x0d, 0x12, "v_max_i32", False), + (0x13, 0x13, 0x0e, 0x0e, 0x13, "v_min_u32", False), + (0x14, 0x14, 0x0f, 0x0f, 0x14, "v_max_u32", False), + (0x15, 0x15, -1, -1, -1, "v_lshr_b32", False), + (0x16, 0x16, 0x10, 0x10, 0x16, "v_lshrrev_b32", False), + (0x17, 0x17, -1, -1, -1, "v_ashr_i32", False), + (0x18, 0x18, 0x11, 0x11, 0x18, "v_ashrrev_i32", False), + (0x19, 0x19, -1, -1, -1, "v_lshl_b32", False), + (0x1a, 0x1a, 0x12, 0x12, 0x1a, "v_lshlrev_b32", False), + (0x1b, 0x1b, 0x13, 0x13, 0x1b, "v_and_b32", False), + (0x1c, 0x1c, 0x14, 0x14, 0x1c, "v_or_b32", False), + (0x1d, 0x1d, 0x15, 0x15, 0x1d, "v_xor_b32", False), + ( -1, -1, -1, -1, 0x1e, "v_xnor_b32", False), + (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True), + (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False), + (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False), + (0x25, 0x25, 0x19, 0x19, -1, "v_add_co_u32", False), # VOP3B only in RDNA + (0x26, 0x26, 0x1a, 0x1a, -1, "v_sub_co_u32", False), # VOP3B only in RDNA + (0x27, 0x27, 0x1b, 0x1b, -1, "v_subrev_co_u32", False), # VOP3B only in RDNA + (0x28, 0x28, 0x1c, 0x1c, 0x28, "v_addc_co_u32", False), # v_add_co_ci_u32 in RDNA + (0x29, 0x29, 0x1d, 0x1d, 0x29, "v_subb_co_u32", False), # v_sub_co_ci_u32 in RDNA + (0x2a, 0x2a, 0x1e, 0x1e, 0x2a, "v_subbrev_co_u32", False), # v_subrev_co_ci_u32 in RDNA + ( -1, -1, -1, -1, 0x2b, "v_fmac_f32", True), + ( -1, -1, -1, -1, 0x2c, "v_fmamk_f32", True), + ( -1, -1, -1, -1, 0x2d, "v_fmaak_f32", True), + ( -1, -1, 0x1f, 0x1f, 0x32, "v_add_f16", True), + ( -1, -1, 0x20, 0x20, 0x33, "v_sub_f16", True), + ( -1, -1, 0x21, 0x21, 0x34, "v_subrev_f16", True), + ( -1, -1, 0x22, 0x22, 0x35, "v_mul_f16", True), + ( -1, -1, 0x23, 0x23, -1, "v_mac_f16", True), + ( -1, -1, 0x24, 0x24, -1, "v_madmk_f16", False), + ( -1, -1, 0x25, 0x25, -1, "v_madak_f16", False), + ( -1, -1, 0x26, 0x26, -1, "v_add_u16", False), + ( -1, -1, 0x27, 0x27, -1, "v_sub_u16", False), + ( -1, -1, 0x28, 0x28, -1, "v_subrev_u16", False), + ( -1, -1, 0x29, 0x29, -1, "v_mul_lo_u16", False), + ( -1, -1, 0x2a, 0x2a, -1, "v_lshlrev_b16", False), + ( -1, -1, 0x2b, 0x2b, -1, "v_lshrrev_b16", False), + ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_b16", False), + ( -1, -1, 0x2d, 0x2d, 0x39, "v_max_f16", True), + ( -1, -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True), + ( -1, -1, 0x2f, 0x2f, -1, "v_max_u16", False), + ( -1, -1, 0x30, 0x30, -1, "v_max_i16", False), + ( -1, -1, 0x31, 0x31, -1, "v_min_u16", False), + ( -1, -1, 0x32, 0x32, -1, "v_min_i16", False), + ( -1, -1, 0x33, 0x33, 0x3b, "v_ldexp_f16", False), + ( -1, -1, 0x34, 0x34, 0x25, "v_add_u32", False), # v_add_nc_u32 in RDNA + ( -1, -1, 0x35, 0x35, 0x26, "v_sub_u32", False), # v_sub_nc_u32 in RDNA + ( -1, -1, 0x36, 0x36, 0x27, "v_subrev_u32", False), # v_subrev_nc_u32 in RDNA + ( -1, -1, -1, -1, 0x36, "v_fmac_f16", False), + ( -1, -1, -1, -1, 0x37, "v_fmamk_f16", False), + ( -1, -1, -1, -1, 0x38, "v_fmaak_f16", False), + ( -1, -1, -1, -1, 0x3c, "v_pk_fmac_f16", False), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2: + opcode(name, gfx7, gfx9, gfx10, Format.VOP2, modifiers, modifiers) + +if True: + # v_cndmask_b32 can use input modifiers but not output modifiers + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32") + opcode(name, gfx7, gfx9, gfx10, Format.VOP2, True, False) + + +# VOP1 instructions: instructions with 1 input and 1 output +VOP1 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers, output_modifiers + (0x00, 0x00, 0x00, 0x00, 0x00, "v_nop", False, False), + (0x01, 0x01, 0x01, 0x01, 0x01, "v_mov_b32", False, False), + (0x02, 0x02, 0x02, 0x02, 0x02, "v_readfirstlane_b32", False, False), + (0x03, 0x03, 0x03, 0x03, 0x03, "v_cvt_i32_f64", True, False), + (0x04, 0x04, 0x04, 0x04, 0x04, "v_cvt_f64_i32", False, True), + (0x05, 0x05, 0x05, 0x05, 0x05, "v_cvt_f32_i32", False, True), + (0x06, 0x06, 0x06, 0x06, 0x06, "v_cvt_f32_u32", False, True), + (0x07, 0x07, 0x07, 0x07, 0x07, "v_cvt_u32_f32", True, False), + (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False), + (0x09, 0x09, -1, -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9 + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "v_cvt_off_f32_i4", False, True), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "v_cvt_f32_f64", True, True), + (0x10, 0x10, 0x10, 0x10, 0x10, "v_cvt_f64_f32", True, True), + (0x11, 0x11, 0x11, 0x11, 0x11, "v_cvt_f32_ubyte0", False, True), + (0x12, 0x12, 0x12, 0x12, 0x12, "v_cvt_f32_ubyte1", False, True), + (0x13, 0x13, 0x13, 0x13, 0x13, "v_cvt_f32_ubyte2", False, True), + (0x14, 0x14, 0x14, 0x14, 0x14, "v_cvt_f32_ubyte3", False, True), + (0x15, 0x15, 0x15, 0x15, 0x15, "v_cvt_u32_f64", True, False), + (0x16, 0x16, 0x16, 0x16, 0x16, "v_cvt_f64_u32", False, True), + ( -1, 0x17, 0x17, 0x17, 0x17, "v_trunc_f64", True, True), + ( -1, 0x18, 0x18, 0x18, 0x18, "v_ceil_f64", True, True), + ( -1, 0x19, 0x19, 0x19, 0x19, "v_rndne_f64", True, True), + ( -1, 0x1a, 0x1a, 0x1a, 0x1a, "v_floor_f64", True, True), + ( -1, -1, -1, -1, 0x1b, "v_pipeflush", False, False), + (0x20, 0x20, 0x1b, 0x1b, 0x20, "v_fract_f32", True, True), + (0x21, 0x21, 0x1c, 0x1c, 0x21, "v_trunc_f32", True, True), + (0x22, 0x22, 0x1d, 0x1d, 0x22, "v_ceil_f32", True, True), + (0x23, 0x23, 0x1e, 0x1e, 0x23, "v_rndne_f32", True, True), + (0x24, 0x24, 0x1f, 0x1f, 0x24, "v_floor_f32", True, True), + (0x25, 0x25, 0x20, 0x20, 0x25, "v_exp_f32", True, True), + (0x26, 0x26, -1, -1, -1, "v_log_clamp_f32", True, True), + (0x27, 0x27, 0x21, 0x21, 0x27, "v_log_f32", True, True), + (0x28, 0x28, -1, -1, -1, "v_rcp_clamp_f32", True, True), + (0x29, 0x29, -1, -1, -1, "v_rcp_legacy_f32", True, True), + (0x2a, 0x2a, 0x22, 0x22, 0x2a, "v_rcp_f32", True, True), + (0x2b, 0x2b, 0x23, 0x23, 0x2b, "v_rcp_iflag_f32", True, True), + (0x2c, 0x2c, -1, -1, -1, "v_rsq_clamp_f32", True, True), + (0x2d, 0x2d, -1, -1, -1, "v_rsq_legacy_f32", True, True), + (0x2e, 0x2e, 0x24, 0x24, 0x2e, "v_rsq_f32", True, True), + (0x2f, 0x2f, 0x25, 0x25, 0x2f, "v_rcp_f64", True, True), + (0x30, 0x30, -1, -1, -1, "v_rcp_clamp_f64", True, True), + (0x31, 0x31, 0x26, 0x26, 0x31, "v_rsq_f64", True, True), + (0x32, 0x32, -1, -1, -1, "v_rsq_clamp_f64", True, True), + (0x33, 0x33, 0x27, 0x27, 0x33, "v_sqrt_f32", True, True), + (0x34, 0x34, 0x28, 0x28, 0x34, "v_sqrt_f64", True, True), + (0x35, 0x35, 0x29, 0x29, 0x35, "v_sin_f32", True, True), + (0x36, 0x36, 0x2a, 0x2a, 0x36, "v_cos_f32", True, True), + (0x37, 0x37, 0x2b, 0x2b, 0x37, "v_not_b32", False, False), + (0x38, 0x38, 0x2c, 0x2c, 0x38, "v_bfrev_b32", False, False), + (0x39, 0x39, 0x2d, 0x2d, 0x39, "v_ffbh_u32", False, False), + (0x3a, 0x3a, 0x2e, 0x2e, 0x3a, "v_ffbl_b32", False, False), + (0x3b, 0x3b, 0x2f, 0x2f, 0x3b, "v_ffbh_i32", False, False), + (0x3c, 0x3c, 0x30, 0x30, 0x3c, "v_frexp_exp_i32_f64", True, False), + (0x3d, 0x3d, 0x31, 0x31, 0x3d, "v_frexp_mant_f64", True, False), + (0x3e, 0x3e, 0x32, 0x32, 0x3e, "v_fract_f64", True, True), + (0x3f, 0x3f, 0x33, 0x33, 0x3f, "v_frexp_exp_i32_f32", True, False), + (0x40, 0x40, 0x34, 0x34, 0x40, "v_frexp_mant_f32", True, False), + (0x41, 0x41, 0x35, 0x35, 0x41, "v_clrexcp", False, False), + (0x42, 0x42, 0x36, -1, 0x42, "v_movreld_b32", False, False), + (0x43, 0x43, 0x37, -1, 0x43, "v_movrels_b32", False, False), + (0x44, 0x44, 0x38, -1, 0x44, "v_movrelsd_b32", False, False), + ( -1, -1, -1, -1, 0x48, "v_movrelsd_2_b32", False, False), + ( -1, -1, -1, 0x37, -1, "v_screen_partition_4se_b32", False, False), + ( -1, -1, 0x39, 0x39, 0x50, "v_cvt_f16_u16", False, True), + ( -1, -1, 0x3a, 0x3a, 0x51, "v_cvt_f16_i16", False, True), + ( -1, -1, 0x3b, 0x3b, 0x52, "v_cvt_u16_f16", True, False), + ( -1, -1, 0x3c, 0x3c, 0x53, "v_cvt_i16_f16", True, False), + ( -1, -1, 0x3d, 0x3d, 0x54, "v_rcp_f16", True, True), + ( -1, -1, 0x3e, 0x3e, 0x55, "v_sqrt_f16", True, True), + ( -1, -1, 0x3f, 0x3f, 0x56, "v_rsq_f16", True, True), + ( -1, -1, 0x40, 0x40, 0x57, "v_log_f16", True, True), + ( -1, -1, 0x41, 0x41, 0x58, "v_exp_f16", True, True), + ( -1, -1, 0x42, 0x42, 0x59, "v_frexp_mant_f16", True, False), + ( -1, -1, 0x43, 0x43, 0x5a, "v_frexp_exp_i16_f16", True, False), + ( -1, -1, 0x44, 0x44, 0x5b, "v_floor_f16", True, True), + ( -1, -1, 0x45, 0x45, 0x5c, "v_ceil_f16", True, True), + ( -1, -1, 0x46, 0x46, 0x5d, "v_trunc_f16", True, True), + ( -1, -1, 0x47, 0x47, 0x5e, "v_rndne_f16", True, True), + ( -1, -1, 0x48, 0x48, 0x5f, "v_fract_f16", True, True), + ( -1, -1, 0x49, 0x49, 0x60, "v_sin_f16", True, True), + ( -1, -1, 0x4a, 0x4a, 0x61, "v_cos_f16", True, True), + ( -1, 0x46, 0x4b, 0x4b, -1, "v_exp_legacy_f32", True, True), + ( -1, 0x45, 0x4c, 0x4c, -1, "v_log_legacy_f32", True, True), + ( -1, -1, -1, 0x4f, 0x62, "v_sat_pk_u8_i16", False, False), + ( -1, -1, -1, 0x4d, 0x63, "v_cvt_norm_i16_f16", True, False), + ( -1, -1, -1, 0x4e, 0x64, "v_cvt_norm_u16_f16", True, False), + ( -1, -1, -1, 0x51, 0x65, "v_swap_b32", False, False), + ( -1, -1, -1, -1, 0x68, "v_swaprel_b32", False, False), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP1: + opcode(name, gfx7, gfx9, gfx10, Format.VOP1, in_mod, out_mod) + + +# VOPC instructions: + +VOPC_CLASS = { + (0x88, 0x88, 0x10, 0x10, 0x88, "v_cmp_class_f32"), + ( -1, -1, 0x14, 0x14, 0x8f, "v_cmp_class_f16"), + (0x98, 0x98, 0x11, 0x11, 0x98, "v_cmpx_class_f32"), + ( -1, -1, 0x15, 0x15, 0x9f, "v_cmpx_class_f16"), + (0xa8, 0xa8, 0x12, 0x12, 0xa8, "v_cmp_class_f64"), + (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in VOPC_CLASS: + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + +COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"] + +for i in range(8): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + +for i in range(16): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) + # GFX_6_7 + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x60+i, 0x60+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f64") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x70+i, 0x70+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f64") + +COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"] + +# GFX_8_9 +for i in [0,7]: # only 0 and 7 + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + +for i in range(1, 7): # [1..6] + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + +for i in range(8): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64") + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) + + +# VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output +VOPP = { + (0x00, "v_pk_mad_i16"), + (0x01, "v_pk_mul_lo_u16"), + (0x02, "v_pk_add_i16"), + (0x03, "v_pk_sub_i16"), + (0x04, "v_pk_lshlrev_b16"), + (0x05, "v_pk_lshrrev_b16"), + (0x06, "v_pk_ashrrev_i16"), + (0x07, "v_pk_max_i16"), + (0x08, "v_pk_min_i16"), + (0x09, "v_pk_mad_u16"), + (0x0a, "v_pk_add_u16"), + (0x0b, "v_pk_sub_u16"), + (0x0c, "v_pk_max_u16"), + (0x0d, "v_pk_min_u16"), + (0x0e, "v_pk_fma_f16"), + (0x0f, "v_pk_add_f16"), + (0x10, "v_pk_mul_f16"), + (0x11, "v_pk_min_f16"), + (0x12, "v_pk_max_f16"), + (0x20, "v_pk_fma_mix_f32"), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA + (0x21, "v_pk_fma_mixlo_f16"), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA + (0x22, "v_pk_fma_mixhi_f16"), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA +} +# note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name) +for (code, name) in VOPP: + opcode(name, -1, code, code, Format.VOP3P) + + +# VINTERP instructions: +VINTRP = { + (0x00, "v_interp_p1_f32"), + (0x01, "v_interp_p2_f32"), + (0x02, "v_interp_mov_f32"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in VINTRP: + opcode(name, code, code, code, Format.VINTRP) + +# VOP3 instructions: 3 inputs, 1 output +# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out +VOP3 = { + (0x140, 0x140, 0x1c0, 0x1c0, 0x140, "v_mad_legacy_f32", True, True), + (0x141, 0x141, 0x1c1, 0x1c1, 0x141, "v_mad_f32", True, True), + (0x142, 0x142, 0x1c2, 0x1c2, 0x142, "v_mad_i32_i24", False, False), + (0x143, 0x143, 0x1c3, 0x1c3, 0x143, "v_mad_u32_u24", False, False), + (0x144, 0x144, 0x1c4, 0x1c4, 0x144, "v_cubeid_f32", True, True), + (0x145, 0x145, 0x1c5, 0x1c5, 0x145, "v_cubesc_f32", True, True), + (0x146, 0x146, 0x1c6, 0x1c6, 0x146, "v_cubetc_f32", True, True), + (0x147, 0x147, 0x1c7, 0x1c7, 0x147, "v_cubema_f32", True, True), + (0x148, 0x148, 0x1c8, 0x1c8, 0x148, "v_bfe_u32", False, False), + (0x149, 0x149, 0x1c9, 0x1c9, 0x149, "v_bfe_i32", False, False), + (0x14a, 0x14a, 0x1ca, 0x1ca, 0x14a, "v_bfi_b32", False, False), + (0x14b, 0x14b, 0x1cb, 0x1cb, 0x14b, "v_fma_f32", True, True), + (0x14c, 0x14c, 0x1cc, 0x1cc, 0x14c, "v_fma_f64", True, True), + (0x14d, 0x14d, 0x1cd, 0x1cd, 0x14d, "v_lerp_u8", False, False), + (0x14e, 0x14e, 0x1ce, 0x1ce, 0x14e, "v_alignbit_b32", False, False), + (0x14f, 0x14f, 0x1cf, 0x1cf, 0x14f, "v_alignbyte_b32", False, False), + (0x150, 0x150, -1, -1, 0x150, "v_mullit_f32", True, True), + (0x151, 0x151, 0x1d0, 0x1d0, 0x151, "v_min3_f32", True, True), + (0x152, 0x152, 0x1d1, 0x1d1, 0x152, "v_min3_i32", False, False), + (0x153, 0x153, 0x1d2, 0x1d2, 0x153, "v_min3_u32", False, False), + (0x154, 0x154, 0x1d3, 0x1d3, 0x154, "v_max3_f32", True, True), + (0x155, 0x155, 0x1d4, 0x1d4, 0x155, "v_max3_i32", False, False), + (0x156, 0x156, 0x1d5, 0x1d5, 0x156, "v_max3_u32", False, False), + (0x157, 0x157, 0x1d6, 0x1d6, 0x157, "v_med3_f32", True, True), + (0x158, 0x158, 0x1d7, 0x1d7, 0x158, "v_med3_i32", False, False), + (0x159, 0x159, 0x1d8, 0x1d8, 0x159, "v_med3_u32", False, False), + (0x15a, 0x15a, 0x1d9, 0x1d9, 0x15a, "v_sad_u8", False, False), + (0x15b, 0x15b, 0x1da, 0x1da, 0x15b, "v_sad_hi_u8", False, False), + (0x15c, 0x15c, 0x1db, 0x1db, 0x15c, "v_sad_u16", False, False), + (0x15d, 0x15d, 0x1dc, 0x1dc, 0x15d, "v_sad_u32", False, False), + (0x15e, 0x15e, 0x1dd, 0x1dd, 0x15e, "v_cvt_pk_u8_f32", True, False), + (0x15f, 0x15f, 0x1de, 0x1de, 0x15f, "v_div_fixup_f32", True, True), + (0x160, 0x160, 0x1df, 0x1df, 0x160, "v_div_fixup_f64", True, True), + (0x161, 0x161, -1, -1, -1, "v_lshl_b64", False, False), + (0x162, 0x162, -1, -1, -1, "v_lshr_b64", False, False), + (0x163, 0x163, -1, -1, -1, "v_ashr_i64", False, False), + (0x164, 0x164, 0x280, 0x280, 0x164, "v_add_f64", True, True), + (0x165, 0x165, 0x281, 0x281, 0x165, "v_mul_f64", True, True), + (0x166, 0x166, 0x282, 0x282, 0x166, "v_min_f64", True, True), + (0x167, 0x167, 0x283, 0x283, 0x167, "v_max_f64", True, True), + (0x168, 0x168, 0x284, 0x284, 0x168, "v_ldexp_f64", False, True), # src1 can take input modifiers + (0x169, 0x169, 0x285, 0x285, 0x169, "v_mul_lo_u32", False, False), + (0x16a, 0x16a, 0x286, 0x286, 0x16a, "v_mul_hi_u32", False, False), + (0x16b, 0x16b, 0x285, 0x285, 0x16b, "v_mul_lo_i32", False, False), # identical to v_mul_lo_u32 + (0x16c, 0x16c, 0x287, 0x287, 0x16c, "v_mul_hi_i32", False, False), + (0x16d, 0x16d, 0x1e0, 0x1e0, 0x16d, "v_div_scale_f32", True, True), # writes to VCC + (0x16e, 0x16e, 0x1e1, 0x1e1, 0x16e, "v_div_scale_f64", True, True), # writes to VCC + (0x16f, 0x16f, 0x1e2, 0x1e2, 0x16f, "v_div_fmas_f32", True, True), # takes VCC input + (0x170, 0x170, 0x1e3, 0x1e3, 0x170, "v_div_fmas_f64", True, True), # takes VCC input + (0x171, 0x171, 0x1e4, 0x1e4, 0x171, "v_msad_u8", False, False), + (0x172, 0x172, 0x1e5, 0x1e5, 0x172, "v_qsad_pk_u16_u8", False, False), + (0x172, -1, -1, -1, -1, "v_qsad_u8", False, False), # what's the difference? + (0x173, 0x173, 0x1e6, 0x1e6, 0x173, "v_mqsad_pk_u16_u8", False, False), + (0x173, -1, -1, -1, -1, "v_mqsad_u8", False, False), # what's the difference? + (0x174, 0x174, 0x292, 0x292, 0x174, "v_trig_preop_f64", False, False), + ( -1, 0x175, 0x1e7, 0x1e7, 0x175, "v_mqsad_u32_u8", False, False), + ( -1, 0x176, 0x1e8, 0x1e8, 0x176, "v_mad_u64_u32", False, False), + ( -1, 0x177, 0x1e9, 0x1e9, 0x177, "v_mad_i64_i32", False, False), + ( -1, -1, 0x1ea, 0x1ea, -1, "v_mad_legacy_f16", True, True), + ( -1, -1, 0x1eb, 0x1eb, -1, "v_mad_legacy_u16", False, False), + ( -1, -1, 0x1ec, 0x1ec, -1, "v_mad_legacy_i16", False, False), + ( -1, -1, 0x1ed, 0x1ed, 0x344, "v_perm_b32", False, False), + ( -1, -1, 0x1ee, 0x1ee, -1, "v_fma_legacy_f16", True, True), + ( -1, -1, 0x1ef, 0x1ef, -1, "v_div_fixup_legacy_f16", True, True), + (0x12c, 0x12c, 0x1f0, 0x1f0, -1, "v_cvt_pkaccum_u8_f32", True, False), + ( -1, -1, -1, 0x1f1, 0x373, "v_mad_u32_u16", False, False), + ( -1, -1, -1, 0x1f2, 0x375, "v_mad_i32_i16", False, False), + ( -1, -1, -1, 0x1f3, 0x345, "v_xad_u32", False, False), + ( -1, -1, -1, 0x1f4, 0x351, "v_min3_f16", True, True), + ( -1, -1, -1, 0x1f5, 0x352, "v_min3_i16", False, False), + ( -1, -1, -1, 0x1f6, 0x353, "v_min3_u16", False, False), + ( -1, -1, -1, 0x1f7, 0x354, "v_max3_f16", True, True), + ( -1, -1, -1, 0x1f8, 0x355, "v_max3_i16", False, False), + ( -1, -1, -1, 0x1f9, 0x356, "v_max3_u16", False, False), + ( -1, -1, -1, 0x1fa, 0x357, "v_med3_f16", True, True), + ( -1, -1, -1, 0x1fb, 0x358, "v_med3_i16", False, False), + ( -1, -1, -1, 0x1fc, 0x359, "v_med3_u16", False, False), + ( -1, -1, -1, 0x1fd, 0x346, "v_lshl_add_u32", False, False), + ( -1, -1, -1, 0x1fe, 0x347, "v_add_lshl_u32", False, False), + ( -1, -1, -1, 0x1ff, 0x36d, "v_add3_u32", False, False), + ( -1, -1, -1, 0x200, 0x36f, "v_lshl_or_b32", False, False), + ( -1, -1, -1, 0x201, 0x371, "v_and_or_b32", False, False), + ( -1, -1, -1, 0x202, 0x372, "v_or3_b32", False, False), + ( -1, -1, -1, 0x203, -1, "v_mad_f16", True, True), + ( -1, -1, -1, 0x204, 0x340, "v_mad_u16", False, False), + ( -1, -1, -1, 0x205, 0x35e, "v_mad_i16", False, False), + ( -1, -1, -1, 0x206, 0x34b, "v_fma_f16", True, True), + ( -1, -1, -1, 0x207, 0x35f, "v_div_fixup_f16", True, True), + ( -1, -1, 0x274, 0x274, 0x342, "v_interp_p1ll_f16", True, True), + ( -1, -1, 0x275, 0x275, 0x343, "v_interp_p1lv_f16", True, True), + ( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True), + ( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True), + (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True), + ( -1, -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False), + ( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False), + (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False), + (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False), + (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False), + ( -1, -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False), + ( -1, -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False), + ( -1, -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False), + (0x11e, 0x11e, 0x293, 0x293, 0x363, "v_bfm_b32", False, False), + (0x12d, 0x12d, 0x294, 0x294, 0x368, "v_cvt_pknorm_i16_f32", True, False), + (0x12e, 0x12e, 0x295, 0x295, 0x369, "v_cvt_pknorm_u16_f32", True, False), + (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32", True, False), # GFX6_7_10 is VOP2 with opcode 0x02f + (0x130, 0x130, 0x297, 0x297, 0x36a, "v_cvt_pk_u16_u32", False, False), + (0x131, 0x131, 0x298, 0x298, 0x36b, "v_cvt_pk_i16_i32", False, False), + ( -1, -1, -1, 0x299, 0x312, "v_cvt_pknorm_i16_f16", True, False), + ( -1, -1, -1, 0x29a, 0x313, "v_cvt_pknorm_u16_f16", True, False), + ( -1, -1, -1, 0x29c, 0x37f, "v_add_i32", False, False), + ( -1, -1, -1, 0x29d, 0x376, "v_sub_i32", False, False), + ( -1, -1, -1, 0x29e, 0x30d, "v_add_i16", False, False), + ( -1, -1, -1, 0x29f, 0x30e, "v_sub_i16", False, False), + ( -1, -1, -1, 0x2a0, 0x311, "v_pack_b32_f16", True, False), + ( -1, -1, -1, -1, 0x178, "v_xor3_b32", False, False), + ( -1, -1, -1, -1, 0x377, "v_permlane16_b32", False, False), + ( -1, -1, -1, -1, 0x378, "v_permlanex16_b32", False, False), + ( -1, -1, -1, -1, 0x30f, "v_add_co_u32_e64", False, False), + ( -1, -1, -1, -1, 0x310, "v_sub_co_u32_e64", False, False), + ( -1, -1, -1, -1, 0x319, "v_subrev_co_u32_e64", False, False), +# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10 +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3: + opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod) + + +# DS instructions: 3 inputs (1 addr, 2 data), 1 output +DS = { + (0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "ds_sub_u32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "ds_rsub_u32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "ds_inc_u32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "ds_dec_u32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "ds_min_i32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "ds_max_i32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "ds_min_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "ds_max_u32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "ds_and_b32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "ds_or_b32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "ds_xor_b32"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "ds_mskor_b32"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "ds_write_b32"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "ds_write2_b32"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "ds_write2st64_b32"), + (0x10, 0x10, 0x10, 0x10, 0x10, "ds_cmpst_b32"), + (0x11, 0x11, 0x11, 0x11, 0x11, "ds_cmpst_f32"), + (0x12, 0x12, 0x12, 0x12, 0x12, "ds_min_f32"), + (0x13, 0x13, 0x13, 0x13, 0x13, "ds_max_f32"), + ( -1, 0x14, 0x14, 0x14, 0x14, "ds_nop"), + ( -1, -1, 0x15, 0x15, 0x15, "ds_add_f32"), + ( -1, -1, 0x1d, 0x1d, 0xb0, "ds_write_addtid_b32"), + (0x1e, 0x1e, 0x1e, 0x1e, 0x1e, "ds_write_b8"), + (0x1f, 0x1f, 0x1f, 0x1f, 0x1f, "ds_write_b16"), + (0x20, 0x20, 0x20, 0x20, 0x20, "ds_add_rtn_u32"), + (0x21, 0x21, 0x21, 0x21, 0x21, "ds_sub_rtn_u32"), + (0x22, 0x22, 0x22, 0x22, 0x22, "ds_rsub_rtn_u32"), + (0x23, 0x23, 0x23, 0x23, 0x23, "ds_inc_rtn_u32"), + (0x24, 0x24, 0x24, 0x24, 0x24, "ds_dec_rtn_u32"), + (0x25, 0x25, 0x25, 0x25, 0x25, "ds_min_rtn_i32"), + (0x26, 0x26, 0x26, 0x26, 0x26, "ds_max_rtn_i32"), + (0x27, 0x27, 0x27, 0x27, 0x27, "ds_min_rtn_u32"), + (0x28, 0x28, 0x28, 0x28, 0x28, "ds_max_rtn_u32"), + (0x29, 0x29, 0x29, 0x29, 0x29, "ds_and_rtn_b32"), + (0x2a, 0x2a, 0x2a, 0x2a, 0x2a, "ds_or_rtn_b32"), + (0x2b, 0x2b, 0x2b, 0x2b, 0x2b, "ds_xor_rtn_b32"), + (0x2c, 0x2c, 0x2c, 0x2c, 0x2c, "ds_mskor_rtn_b32"), + (0x2d, 0x2d, 0x2d, 0x2d, 0x2d, "ds_wrxchg_rtn_b32"), + (0x2e, 0x2e, 0x2e, 0x2e, 0x2e, "ds_wrxchg2_rtn_b32"), + (0x2f, 0x2f, 0x2f, 0x2f, 0x2f, "ds_wrxchg2st64_rtn_b32"), + (0x30, 0x30, 0x30, 0x30, 0x30, "ds_cmpst_rtn_b32"), + (0x31, 0x31, 0x31, 0x31, 0x31, "ds_cmpst_rtn_f32"), + (0x32, 0x32, 0x32, 0x32, 0x32, "ds_min_rtn_f32"), + (0x33, 0x33, 0x33, 0x33, 0x33, "ds_max_rtn_f32"), + ( -1, 0x34, 0x34, 0x34, 0x34, "ds_wrap_rtn_b32"), + ( -1, -1, 0x35, 0x35, 0x55, "ds_add_rtn_f32"), + (0x36, 0x36, 0x36, 0x36, 0x36, "ds_read_b32"), + (0x37, 0x37, 0x37, 0x37, 0x37, "ds_read2_b32"), + (0x38, 0x38, 0x38, 0x38, 0x38, "ds_read2st64_b32"), + (0x39, 0x39, 0x39, 0x39, 0x39, "ds_read_i8"), + (0x3a, 0x3a, 0x3a, 0x3a, 0x3a, "ds_read_u8"), + (0x3b, 0x3b, 0x3b, 0x3b, 0x3b, "ds_read_i16"), + (0x3c, 0x3c, 0x3c, 0x3c, 0x3c, "ds_read_u16"), + (0x35, 0x35, 0x3d, 0x3d, 0x35, "ds_swizzle_b32"), #data1 & offset, no addr/data2 + ( -1, -1, 0x3e, 0x3e, 0xb2, "ds_permute_b32"), + ( -1, -1, 0x3f, 0x3f, 0xb3, "ds_bpermute_b32"), + (0x40, 0x40, 0x40, 0x40, 0x40, "ds_add_u64"), + (0x41, 0x41, 0x41, 0x41, 0x41, "ds_sub_u64"), + (0x42, 0x42, 0x42, 0x42, 0x42, "ds_rsub_u64"), + (0x43, 0x43, 0x43, 0x43, 0x43, "ds_inc_u64"), + (0x44, 0x44, 0x44, 0x44, 0x44, "ds_dec_u64"), + (0x45, 0x45, 0x45, 0x45, 0x45, "ds_min_i64"), + (0x46, 0x46, 0x46, 0x46, 0x46, "ds_max_i64"), + (0x47, 0x47, 0x47, 0x47, 0x47, "ds_min_u64"), + (0x48, 0x48, 0x48, 0x48, 0x48, "ds_max_u64"), + (0x49, 0x49, 0x49, 0x49, 0x49, "ds_and_b64"), + (0x4a, 0x4a, 0x4a, 0x4a, 0x4a, "ds_or_b64"), + (0x4b, 0x4b, 0x4b, 0x4b, 0x4b, "ds_xor_b64"), + (0x4c, 0x4c, 0x4c, 0x4c, 0x4c, "ds_mskor_b64"), + (0x4d, 0x4d, 0x4d, 0x4d, 0x4d, "ds_write_b64"), + (0x4e, 0x4e, 0x4e, 0x4e, 0x4e, "ds_write2_b64"), + (0x4f, 0x4f, 0x4f, 0x4f, 0x4f, "ds_write2st64_b64"), + (0x50, 0x50, 0x50, 0x50, 0x50, "ds_cmpst_b64"), + (0x51, 0x51, 0x51, 0x51, 0x51, "ds_cmpst_f64"), + (0x52, 0x52, 0x52, 0x52, 0x52, "ds_min_f64"), + (0x53, 0x53, 0x53, 0x53, 0x53, "ds_max_f64"), + ( -1, -1, 0x54, 0x54, 0xa0, "ds_write_b8_d16_hi"), + ( -1, -1, 0x55, 0x55, 0xa1, "ds_write_b16_d16_hi"), + ( -1, -1, 0x56, 0x56, 0xa2, "ds_read_u8_d16"), + ( -1, -1, 0x57, 0x57, 0xa3, "ds_read_u8_d16_hi"), + ( -1, -1, 0x58, 0x58, 0xa4, "ds_read_i8_d16"), + ( -1, -1, 0x59, 0x59, 0xa5, "ds_read_i8_d16_hi"), + ( -1, -1, 0x5a, 0x5a, 0xa6, "ds_read_u16_d16"), + ( -1, -1, 0x5b, 0x5b, 0xa7, "ds_read_u16_d16_hi"), + (0x60, 0x60, 0x60, 0x60, 0x60, "ds_add_rtn_u64"), + (0x61, 0x61, 0x61, 0x61, 0x61, "ds_sub_rtn_u64"), + (0x62, 0x62, 0x62, 0x62, 0x62, "ds_rsub_rtn_u64"), + (0x63, 0x63, 0x63, 0x63, 0x63, "ds_inc_rtn_u64"), + (0x64, 0x64, 0x64, 0x64, 0x64, "ds_dec_rtn_u64"), + (0x65, 0x65, 0x65, 0x65, 0x65, "ds_min_rtn_i64"), + (0x66, 0x66, 0x66, 0x66, 0x66, "ds_max_rtn_i64"), + (0x67, 0x67, 0x67, 0x67, 0x67, "ds_min_rtn_u64"), + (0x68, 0x68, 0x68, 0x68, 0x68, "ds_max_rtn_u64"), + (0x69, 0x69, 0x69, 0x69, 0x69, "ds_and_rtn_b64"), + (0x6a, 0x6a, 0x6a, 0x6a, 0x6a, "ds_or_rtn_b64"), + (0x6b, 0x6b, 0x6b, 0x6b, 0x6b, "ds_xor_rtn_b64"), + (0x6c, 0x6c, 0x6c, 0x6c, 0x6c, "ds_mskor_rtn_b64"), + (0x6d, 0x6d, 0x6d, 0x6d, 0x6d, "ds_wrxchg_rtn_b64"), + (0x6e, 0x6e, 0x6e, 0x6e, 0x6e, "ds_wrxchg2_rtn_b64"), + (0x6f, 0x6f, 0x6f, 0x6f, 0x6f, "ds_wrxchg2st64_rtn_b64"), + (0x70, 0x70, 0x70, 0x70, 0x70, "ds_cmpst_rtn_b64"), + (0x71, 0x71, 0x71, 0x71, 0x71, "ds_cmpst_rtn_f64"), + (0x72, 0x72, 0x72, 0x72, 0x72, "ds_min_rtn_f64"), + (0x73, 0x73, 0x73, 0x73, 0x73, "ds_max_rtn_f64"), + (0x76, 0x76, 0x76, 0x76, 0x76, "ds_read_b64"), + (0x77, 0x77, 0x77, 0x77, 0x77, "ds_read2_b64"), + (0x78, 0x78, 0x78, 0x78, 0x78, "ds_read2st64_b64"), + ( -1, 0x7e, 0x7e, 0x7e, 0x7e, "ds_condxchg32_rtn_b64"), + (0x80, 0x80, 0x80, 0x80, 0x80, "ds_add_src2_u32"), + (0x81, 0x81, 0x81, 0x81, 0x81, "ds_sub_src2_u32"), + (0x82, 0x82, 0x82, 0x82, 0x82, "ds_rsub_src2_u32"), + (0x83, 0x83, 0x83, 0x83, 0x83, "ds_inc_src2_u32"), + (0x84, 0x84, 0x84, 0x84, 0x84, "ds_dec_src2_u32"), + (0x85, 0x85, 0x85, 0x85, 0x85, "ds_min_src2_i32"), + (0x86, 0x86, 0x86, 0x86, 0x86, "ds_max_src2_i32"), + (0x87, 0x87, 0x87, 0x87, 0x87, "ds_min_src2_u32"), + (0x88, 0x88, 0x88, 0x88, 0x88, "ds_max_src2_u32"), + (0x89, 0x89, 0x89, 0x89, 0x89, "ds_and_src2_b32"), + (0x8a, 0x8a, 0x8a, 0x8a, 0x8a, "ds_or_src2_b32"), + (0x8b, 0x8b, 0x8b, 0x8b, 0x8b, "ds_xor_src2_b32"), + (0x8d, 0x8d, 0x8d, 0x8d, 0x8d, "ds_write_src2_b32"), + (0x92, 0x92, 0x92, 0x92, 0x92, "ds_min_src2_f32"), + (0x93, 0x93, 0x93, 0x93, 0x93, "ds_max_src2_f32"), + ( -1, -1, 0x95, 0x95, 0x95, "ds_add_src2_f32"), + ( -1, 0x18, 0x98, 0x98, 0x18, "ds_gws_sema_release_all"), + (0x19, 0x19, 0x99, 0x99, 0x19, "ds_gws_init"), + (0x1a, 0x1a, 0x9a, 0x9a, 0x1a, "ds_gws_sema_v"), + (0x1b, 0x1b, 0x9b, 0x9b, 0x1b, "ds_gws_sema_br"), + (0x1c, 0x1c, 0x9c, 0x9c, 0x1c, "ds_gws_sema_p"), + (0x1d, 0x1d, 0x9d, 0x9d, 0x1d, "ds_gws_barrier"), + ( -1, -1, 0xb6, 0xb6, 0xb1, "ds_read_addtid_b32"), + (0x3d, 0x3d, 0xbd, 0xbd, 0x3d, "ds_consume"), + (0x3e, 0x3e, 0xbe, 0xbe, 0x3e, "ds_append"), + (0x3f, 0x3f, 0xbf, 0xbf, 0x3f, "ds_ordered_count"), + (0xc0, 0xc0, 0xc0, 0xc0, 0xc0, "ds_add_src2_u64"), + (0xc1, 0xc1, 0xc1, 0xc1, 0xc1, "ds_sub_src2_u64"), + (0xc2, 0xc2, 0xc2, 0xc2, 0xc2, "ds_rsub_src2_u64"), + (0xc3, 0xc3, 0xc3, 0xc3, 0xc3, "ds_inc_src2_u64"), + (0xc4, 0xc4, 0xc4, 0xc4, 0xc4, "ds_dec_src2_u64"), + (0xc5, 0xc5, 0xc5, 0xc5, 0xc5, "ds_min_src2_i64"), + (0xc6, 0xc6, 0xc6, 0xc6, 0xc6, "ds_max_src2_i64"), + (0xc7, 0xc7, 0xc7, 0xc7, 0xc7, "ds_min_src2_u64"), + (0xc8, 0xc8, 0xc8, 0xc8, 0xc8, "ds_max_src2_u64"), + (0xc9, 0xc9, 0xc9, 0xc9, 0xc9, "ds_and_src2_b64"), + (0xca, 0xca, 0xca, 0xca, 0xca, "ds_or_src2_b64"), + (0xcb, 0xcb, 0xcb, 0xcb, 0xcb, "ds_xor_src2_b64"), + (0xcd, 0xcd, 0xcd, 0xcd, 0xcd, "ds_write_src2_b64"), + (0xd2, 0xd2, 0xd2, 0xd2, 0xd2, "ds_min_src2_f64"), + (0xd3, 0xd3, 0xd3, 0xd3, 0xd3, "ds_max_src2_f64"), + ( -1, 0xde, 0xde, 0xde, 0xde, "ds_write_b96"), + ( -1, 0xdf, 0xdf, 0xdf, 0xdf, "ds_write_b128"), + ( -1, 0xfd, 0xfd, -1, -1, "ds_condxchg32_rtn_b128"), + ( -1, 0xfe, 0xfe, 0xfe, 0xfe, "ds_read_b96"), + ( -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS: + opcode(name, gfx7, gfx9, gfx10, Format.DS) + +# MUBUF instructions: +MUBUF = { + (0x00, 0x00, 0x00, 0x00, 0x00, "buffer_load_format_x"), + (0x01, 0x01, 0x01, 0x01, 0x01, "buffer_load_format_xy"), + (0x02, 0x02, 0x02, 0x02, 0x02, "buffer_load_format_xyz"), + (0x03, 0x03, 0x03, 0x03, 0x03, "buffer_load_format_xyzw"), + (0x04, 0x04, 0x04, 0x04, 0x04, "buffer_store_format_x"), + (0x05, 0x05, 0x05, 0x05, 0x05, "buffer_store_format_xy"), + (0x06, 0x06, 0x06, 0x06, 0x06, "buffer_store_format_xyz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "buffer_store_format_xyzw"), + ( -1, -1, 0x08, 0x08, 0x80, "buffer_load_format_d16_x"), + ( -1, -1, 0x09, 0x09, 0x81, "buffer_load_format_d16_xy"), + ( -1, -1, 0x0a, 0x0a, 0x82, "buffer_load_format_d16_xyz"), + ( -1, -1, 0x0b, 0x0b, 0x83, "buffer_load_format_d16_xyzw"), + ( -1, -1, 0x0c, 0x0c, 0x84, "buffer_store_format_d16_x"), + ( -1, -1, 0x0d, 0x0d, 0x85, "buffer_store_format_d16_xy"), + ( -1, -1, 0x0e, 0x0e, 0x86, "buffer_store_format_d16_xyz"), + ( -1, -1, 0x0f, 0x0f, 0x87, "buffer_store_format_d16_xyzw"), + (0x08, 0x08, 0x10, 0x10, 0x08, "buffer_load_ubyte"), + (0x09, 0x09, 0x11, 0x11, 0x09, "buffer_load_sbyte"), + (0x0a, 0x0a, 0x12, 0x12, 0x0a, "buffer_load_ushort"), + (0x0b, 0x0b, 0x13, 0x13, 0x0b, "buffer_load_sshort"), + (0x0c, 0x0c, 0x14, 0x14, 0x0c, "buffer_load_dword"), + (0x0d, 0x0d, 0x15, 0x15, 0x0d, "buffer_load_dwordx2"), + ( -1, 0x0f, 0x16, 0x16, 0x0f, "buffer_load_dwordx3"), + (0x0f, 0x0e, 0x17, 0x17, 0x0e, "buffer_load_dwordx4"), + (0x18, 0x18, 0x18, 0x18, 0x18, "buffer_store_byte"), + ( -1, -1, -1, 0x19, 0x19, "buffer_store_byte_d16_hi"), + (0x1a, 0x1a, 0x1a, 0x1a, 0x1a, "buffer_store_short"), + ( -1, -1, -1, 0x1b, 0x1b, "buffer_store_short_d16_hi"), + (0x1c, 0x1c, 0x1c, 0x1c, 0x1c, "buffer_store_dword"), + (0x1d, 0x1d, 0x1d, 0x1d, 0x1d, "buffer_store_dwordx2"), + ( -1, 0x1f, 0x1e, 0x1e, 0x1f, "buffer_store_dwordx3"), + (0x1e, 0x1e, 0x1f, 0x1f, 0x1e, "buffer_store_dwordx4"), + ( -1, -1, -1, 0x20, 0x20, "buffer_load_ubyte_d16"), + ( -1, -1, -1, 0x21, 0x21, "buffer_load_ubyte_d16_hi"), + ( -1, -1, -1, 0x22, 0x22, "buffer_load_sbyte_d16"), + ( -1, -1, -1, 0x23, 0x23, "buffer_load_sbyte_d16_hi"), + ( -1, -1, -1, 0x24, 0x24, "buffer_load_short_d16"), + ( -1, -1, -1, 0x25, 0x25, "buffer_load_short_d16_hi"), + ( -1, -1, -1, 0x26, 0x26, "buffer_load_format_d16_hi_x"), + ( -1, -1, -1, 0x27, 0x27, "buffer_store_format_d16_hi_x"), + ( -1, -1, 0x3d, 0x3d, -1, "buffer_store_lds_dword"), + (0x71, 0x71, 0x3e, 0x3e, -1, "buffer_wbinvl1"), + (0x70, 0x70, 0x3f, 0x3f, -1, "buffer_wbinvl1_vol"), + (0x30, 0x30, 0x40, 0x40, 0x30, "buffer_atomic_swap"), + (0x31, 0x31, 0x41, 0x41, 0x31, "buffer_atomic_cmpswap"), + (0x32, 0x32, 0x42, 0x42, 0x32, "buffer_atomic_add"), + (0x33, 0x33, 0x43, 0x43, 0x33, "buffer_atomic_sub"), + (0x34, -1, -1, -1, -1, "buffer_atomic_rsub"), + (0x35, 0x35, 0x44, 0x44, 0x35, "buffer_atomic_smin"), + (0x36, 0x36, 0x45, 0x45, 0x36, "buffer_atomic_umin"), + (0x37, 0x37, 0x46, 0x46, 0x37, "buffer_atomic_smax"), + (0x38, 0x38, 0x47, 0x47, 0x38, "buffer_atomic_umax"), + (0x39, 0x39, 0x48, 0x48, 0x39, "buffer_atomic_and"), + (0x3a, 0x3a, 0x49, 0x49, 0x3a, "buffer_atomic_or"), + (0x3b, 0x3b, 0x4a, 0x4a, 0x3b, "buffer_atomic_xor"), + (0x3c, 0x3c, 0x4b, 0x4b, 0x3c, "buffer_atomic_inc"), + (0x3d, 0x3d, 0x4c, 0x4c, 0x3d, "buffer_atomic_dec"), + (0x3e, 0x3e, -1, -1, 0x3e, "buffer_atomic_fcmpswap"), + (0x3f, 0x3f, -1, -1, 0x3f, "buffer_atomic_fmin"), + (0x40, 0x40, -1, -1, 0x40, "buffer_atomic_fmax"), + (0x50, 0x50, 0x60, 0x60, 0x50, "buffer_atomic_swap_x2"), + (0x51, 0x51, 0x61, 0x61, 0x51, "buffer_atomic_cmpswap_x2"), + (0x52, 0x52, 0x62, 0x62, 0x52, "buffer_atomic_add_x2"), + (0x53, 0x53, 0x63, 0x63, 0x53, "buffer_atomic_sub_x2"), + (0x54, -1, -1, -1, -1, "buffer_atomic_rsub_x2"), + (0x55, 0x55, 0x64, 0x64, 0x55, "buffer_atomic_smin_x2"), + (0x56, 0x56, 0x65, 0x65, 0x56, "buffer_atomic_umin_x2"), + (0x57, 0x57, 0x66, 0x66, 0x57, "buffer_atomic_smax_x2"), + (0x58, 0x58, 0x67, 0x67, 0x58, "buffer_atomic_umax_x2"), + (0x59, 0x59, 0x68, 0x68, 0x59, "buffer_atomic_and_x2"), + (0x5a, 0x5a, 0x69, 0x69, 0x5a, "buffer_atomic_or_x2"), + (0x5b, 0x5b, 0x6a, 0x6a, 0x5b, "buffer_atomic_xor_x2"), + (0x5c, 0x5c, 0x6b, 0x6b, 0x5c, "buffer_atomic_inc_x2"), + (0x5d, 0x5d, 0x6c, 0x6c, 0x5d, "buffer_atomic_dec_x2"), + (0x5e, 0x5e, -1, -1, 0x5e, "buffer_atomic_fcmpswap_x2"), + (0x5f, 0x5f, -1, -1, 0x5f, "buffer_atomic_fmin_x2"), + (0x60, 0x60, -1, -1, 0x60, "buffer_atomic_fmax_x2"), + ( -1, -1, -1, -1, 0x71, "buffer_gl0_inv"), + ( -1, -1, -1, -1, 0x72, "buffer_gl1_inv"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF: + opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name) + +MTBUF = { + (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"), + (0x01, 0x01, 0x01, 0x01, 0x01, "tbuffer_load_format_xy"), + (0x02, 0x02, 0x02, 0x02, 0x02, "tbuffer_load_format_xyz"), + (0x03, 0x03, 0x03, 0x03, 0x03, "tbuffer_load_format_xyzw"), + (0x04, 0x04, 0x04, 0x04, 0x04, "tbuffer_store_format_x"), + (0x05, 0x05, 0x05, 0x05, 0x05, "tbuffer_store_format_xy"), + (0x06, 0x06, 0x06, 0x06, 0x06, "tbuffer_store_format_xyz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "tbuffer_store_format_xyzw"), + ( -1, -1, 0x08, 0x08, 0x08, "tbuffer_load_format_d16_x"), + ( -1, -1, 0x09, 0x09, 0x09, "tbuffer_load_format_d16_xy"), + ( -1, -1, 0x0a, 0x0a, 0x0a, "tbuffer_load_format_d16_xyz"), + ( -1, -1, 0x0b, 0x0b, 0x0b, "tbuffer_load_format_d16_xyzw"), + ( -1, -1, 0x0c, 0x0c, 0x0c, "tbuffer_store_format_d16_x"), + ( -1, -1, 0x0d, 0x0d, 0x0d, "tbuffer_store_format_d16_xy"), + ( -1, -1, 0x0e, 0x0e, 0x0e, "tbuffer_store_format_d16_xyz"), + ( -1, -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF: + opcode(name, gfx7, gfx9, gfx10, Format.MTBUF) + + +IMAGE = { + (0x00, "image_load"), + (0x01, "image_load_mip"), + (0x02, "image_load_pck"), + (0x03, "image_load_pck_sgn"), + (0x04, "image_load_mip_pck"), + (0x05, "image_load_mip_pck_sgn"), + (0x08, "image_store"), + (0x09, "image_store_mip"), + (0x0a, "image_store_pck"), + (0x0b, "image_store_mip_pck"), + (0x0e, "image_get_resinfo"), + (0x60, "image_get_lod"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE: + opcode(name, code, code, code, Format.MIMG) + +IMAGE_ATOMIC = { + (0x0f, 0x0f, 0x10, "image_atomic_swap"), + (0x10, 0x10, 0x11, "image_atomic_cmpswap"), + (0x11, 0x11, 0x12, "image_atomic_add"), + (0x12, 0x12, 0x13, "image_atomic_sub"), + (0x13, -1, -1, "image_atomic_rsub"), + (0x14, 0x14, 0x14, "image_atomic_smin"), + (0x15, 0x15, 0x15, "image_atomic_umin"), + (0x16, 0x16, 0x16, "image_atomic_smax"), + (0x17, 0x17, 0x17, "image_atomic_umax"), + (0x18, 0x18, 0x18, "image_atomic_and"), + (0x19, 0x19, 0x19, "image_atomic_or"), + (0x1a, 0x1a, 0x1a, "image_atomic_xor"), + (0x1b, 0x1b, 0x1b, "image_atomic_inc"), + (0x1c, 0x1c, 0x1c, "image_atomic_dec"), + (0x1d, 0x1d, -1, "image_atomic_fcmpswap"), + (0x1e, 0x1e, -1, "image_atomic_fmin"), + (0x1f, 0x1f, -1, "image_atomic_fmax"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name) +# gfx7 and gfx10 opcodes are the same here +for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC: + opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True) + +IMAGE_SAMPLE = { + (0x20, "image_sample"), + (0x21, "image_sample_cl"), + (0x22, "image_sample_d"), + (0x23, "image_sample_d_cl"), + (0x24, "image_sample_l"), + (0x25, "image_sample_b"), + (0x26, "image_sample_b_cl"), + (0x27, "image_sample_lz"), + (0x28, "image_sample_c"), + (0x29, "image_sample_c_cl"), + (0x2a, "image_sample_c_d"), + (0x2b, "image_sample_c_d_cl"), + (0x2c, "image_sample_c_l"), + (0x2d, "image_sample_c_b"), + (0x2e, "image_sample_c_b_cl"), + (0x2f, "image_sample_c_lz"), + (0x30, "image_sample_o"), + (0x31, "image_sample_cl_o"), + (0x32, "image_sample_d_o"), + (0x33, "image_sample_d_cl_o"), + (0x34, "image_sample_l_o"), + (0x35, "image_sample_b_o"), + (0x36, "image_sample_b_cl_o"), + (0x37, "image_sample_lz_o"), + (0x38, "image_sample_c_o"), + (0x39, "image_sample_c_cl_o"), + (0x3a, "image_sample_c_d_o"), + (0x3b, "image_sample_c_d_cl_o"), + (0x3c, "image_sample_c_l_o"), + (0x3d, "image_sample_c_b_o"), + (0x3e, "image_sample_c_b_cl_o"), + (0x3f, "image_sample_c_lz_o"), + (0x68, "image_sample_cd"), + (0x69, "image_sample_cd_cl"), + (0x6a, "image_sample_c_cd"), + (0x6b, "image_sample_c_cd_cl"), + (0x6c, "image_sample_cd_o"), + (0x6d, "image_sample_cd_cl_o"), + (0x6e, "image_sample_c_cd_o"), + (0x6f, "image_sample_c_cd_cl_o"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE_SAMPLE: + opcode(name, code, code, code, Format.MIMG) + +IMAGE_GATHER4 = { + (0x40, "image_gather4"), + (0x41, "image_gather4_cl"), + #(0x42, "image_gather4h"), VEGA only? + (0x44, "image_gather4_l"), # following instructions have different opcodes according to ISA sheet. + (0x45, "image_gather4_b"), + (0x46, "image_gather4_b_cl"), + (0x47, "image_gather4_lz"), + (0x48, "image_gather4_c"), + (0x49, "image_gather4_c_cl"), # previous instructions have different opcodes according to ISA sheet. + #(0x4a, "image_gather4h_pck"), VEGA only? + #(0x4b, "image_gather8h_pck"), VGEA only? + (0x4c, "image_gather4_c_l"), + (0x4d, "image_gather4_c_b"), + (0x4e, "image_gather4_c_b_cl"), + (0x4f, "image_gather4_c_lz"), + (0x50, "image_gather4_o"), + (0x51, "image_gather4_cl_o"), + (0x54, "image_gather4_l_o"), + (0x55, "image_gather4_b_o"), + (0x56, "image_gather4_b_cl_o"), + (0x57, "image_gather4_lz_o"), + (0x58, "image_gather4_c_o"), + (0x59, "image_gather4_c_cl_o"), + (0x5c, "image_gather4_c_l_o"), + (0x5d, "image_gather4_c_b_o"), + (0x5e, "image_gather4_c_b_cl_o"), + (0x5f, "image_gather4_c_lz_o"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE_GATHER4: + opcode(name, code, code, code, Format.MIMG) + + +FLAT = { + #GFX7, GFX8_9, GFX10 + (0x08, 0x10, 0x08, "flat_load_ubyte"), + (0x09, 0x11, 0x09, "flat_load_sbyte"), + (0x0a, 0x12, 0x0a, "flat_load_ushort"), + (0x0b, 0x13, 0x0b, "flat_load_sshort"), + (0x0c, 0x14, 0x0c, "flat_load_dword"), + (0x0d, 0x15, 0x0d, "flat_load_dwordx2"), + (0x0f, 0x16, 0x0f, "flat_load_dwordx3"), + (0x0e, 0x17, 0x0e, "flat_load_dwordx4"), + (0x18, 0x18, 0x18, "flat_store_byte"), + ( -1, 0x19, 0x19, "flat_store_byte_d16_hi"), + (0x1a, 0x1a, 0x1a, "flat_store_short"), + ( -1, 0x1b, 0x1b, "flat_store_short_d16_hi"), + (0x1c, 0x1c, 0x1c, "flat_store_dword"), + (0x1d, 0x1d, 0x1d, "flat_store_dwordx2"), + (0x1f, 0x1e, 0x1f, "flat_store_dwordx3"), + (0x1e, 0x1f, 0x1e, "flat_store_dwordx4"), + ( -1, 0x20, 0x20, "flat_load_ubyte_d16"), + ( -1, 0x21, 0x21, "flat_load_ubyte_d16_hi"), + ( -1, 0x22, 0x22, "flat_load_sbyte_d16"), + ( -1, 0x23, 0x23, "flat_load_sbyte_d16_hi"), + ( -1, 0x24, 0x24, "flat_load_short_d16"), + ( -1, 0x25, 0x25, "flat_load_short_d16_hi"), + (0x30, 0x40, 0x30, "flat_atomic_swap"), + (0x31, 0x41, 0x31, "flat_atomic_cmpswap"), + (0x32, 0x42, 0x32, "flat_atomic_add"), + (0x33, 0x43, 0x33, "flat_atomic_sub"), + (0x35, 0x44, 0x35, "flat_atomic_smin"), + (0x36, 0x45, 0x36, "flat_atomic_umin"), + (0x37, 0x46, 0x37, "flat_atomic_smax"), + (0x38, 0x47, 0x38, "flat_atomic_umax"), + (0x39, 0x48, 0x39, "flat_atomic_and"), + (0x3a, 0x49, 0x3a, "flat_atomic_or"), + (0x3b, 0x4a, 0x3b, "flat_atomic_xor"), + (0x3c, 0x4b, 0x3c, "flat_atomic_inc"), + (0x3d, 0x4c, 0x3d, "flat_atomic_dec"), + (0x3e, -1, 0x3e, "flat_atomic_fcmpswap"), + (0x3f, -1, 0x3f, "flat_atomic_fmin"), + (0x40, -1, 0x40, "flat_atomic_fmax"), + (0x50, 0x60, 0x50, "flat_atomic_swap_x2"), + (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"), + (0x52, 0x62, 0x52, "flat_atomic_add_x2"), + (0x53, 0x63, 0x53, "flat_atomic_sub_x2"), + (0x55, 0x64, 0x55, "flat_atomic_smin_x2"), + (0x56, 0x65, 0x56, "flat_atomic_umin_x2"), + (0x57, 0x66, 0x57, "flat_atomic_smax_x2"), + (0x58, 0x67, 0x58, "flat_atomic_umax_x2"), + (0x59, 0x68, 0x59, "flat_atomic_and_x2"), + (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"), + (0x5b, 0x6a, 0x5b, "flat_atomic_xor_x2"), + (0x5c, 0x6b, 0x5c, "flat_atomic_inc_x2"), + (0x5d, 0x6c, 0x5d, "flat_atomic_dec_x2"), + (0x5e, -1, 0x5e, "flat_atomic_fcmpswap_x2"), + (0x5f, -1, 0x5f, "flat_atomic_fmin_x2"), + (0x60, -1, 0x60, "flat_atomic_fmax_x2"), +} +for (gfx7, gfx8, gfx10, name) in FLAT: + opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name) + +GLOBAL = { + #GFX8_9, GFX10 + (0x10, 0x08, "global_load_ubyte"), + (0x11, 0x09, "global_load_sbyte"), + (0x12, 0x0a, "global_load_ushort"), + (0x13, 0x0b, "global_load_sshort"), + (0x14, 0x0c, "global_load_dword"), + (0x15, 0x0d, "global_load_dwordx2"), + (0x16, 0x0f, "global_load_dwordx3"), + (0x17, 0x0e, "global_load_dwordx4"), + (0x18, 0x18, "global_store_byte"), + (0x19, 0x19, "global_store_byte_d16_hi"), + (0x1a, 0x1a, "global_store_short"), + (0x1b, 0x1b, "global_store_short_d16_hi"), + (0x1c, 0x1c, "global_store_dword"), + (0x1d, 0x1d, "global_store_dwordx2"), + (0x1e, 0x1f, "global_store_dwordx3"), + (0x1f, 0x1e, "global_store_dwordx4"), + (0x20, 0x20, "global_load_ubyte_d16"), + (0x21, 0x21, "global_load_ubyte_d16_hi"), + (0x22, 0x22, "global_load_sbyte_d16"), + (0x23, 0x23, "global_load_sbyte_d16_hi"), + (0x24, 0x24, "global_load_short_d16"), + (0x25, 0x25, "global_load_short_d16_hi"), + (0x40, 0x30, "global_atomic_swap"), + (0x41, 0x31, "global_atomic_cmpswap"), + (0x42, 0x32, "global_atomic_add"), + (0x43, 0x33, "global_atomic_sub"), + (0x44, 0x35, "global_atomic_smin"), + (0x45, 0x36, "global_atomic_umin"), + (0x46, 0x37, "global_atomic_smax"), + (0x47, 0x38, "global_atomic_umax"), + (0x48, 0x39, "global_atomic_and"), + (0x49, 0x3a, "global_atomic_or"), + (0x4a, 0x3b, "global_atomic_xor"), + (0x4b, 0x3c, "global_atomic_inc"), + (0x4c, 0x3d, "global_atomic_dec"), + ( -1, 0x3e, "global_atomic_fcmpswap"), + ( -1, 0x3f, "global_atomic_fmin"), + ( -1, 0x40, "global_atomic_fmax"), + (0x60, 0x50, "global_atomic_swap_x2"), + (0x61, 0x51, "global_atomic_cmpswap_x2"), + (0x62, 0x52, "global_atomic_add_x2"), + (0x63, 0x53, "global_atomic_sub_x2"), + (0x64, 0x55, "global_atomic_smin_x2"), + (0x65, 0x56, "global_atomic_umin_x2"), + (0x66, 0x57, "global_atomic_smax_x2"), + (0x67, 0x58, "global_atomic_umax_x2"), + (0x68, 0x59, "global_atomic_and_x2"), + (0x69, 0x5a, "global_atomic_or_x2"), + (0x6a, 0x5b, "global_atomic_xor_x2"), + (0x6b, 0x5c, "global_atomic_inc_x2"), + (0x6c, 0x5d, "global_atomic_dec_x2"), + ( -1, 0x5e, "global_atomic_fcmpswap_x2"), + ( -1, 0x5f, "global_atomic_fmin_x2"), + ( -1, 0x60, "global_atomic_fmax_x2"), +} +for (gfx8, gfx10, name) in GLOBAL: + opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name) + +SCRATCH = { + #GFX8_9, GFX10 + (0x10, 0x08, "scratch_load_ubyte"), + (0x11, 0x09, "scratch_load_sbyte"), + (0x12, 0x0a, "scratch_load_ushort"), + (0x13, 0x0b, "scratch_load_sshort"), + (0x14, 0x0c, "scratch_load_dword"), + (0x15, 0x0d, "scratch_load_dwordx2"), + (0x16, 0x0f, "scratch_load_dwordx3"), + (0x17, 0x0e, "scratch_load_dwordx4"), + (0x18, 0x18, "scratch_store_byte"), + (0x19, 0x19, "scratch_store_byte_d16_hi"), + (0x1a, 0x1a, "scratch_store_short"), + (0x1b, 0x1b, "scratch_store_short_d16_hi"), + (0x1c, 0x1c, "scratch_store_dword"), + (0x1d, 0x1d, "scratch_store_dwordx2"), + (0x1e, 0x1f, "scratch_store_dwordx3"), + (0x1f, 0x1e, "scratch_store_dwordx4"), + (0x20, 0x20, "scratch_load_ubyte_d16"), + (0x21, 0x21, "scratch_load_ubyte_d16_hi"), + (0x22, 0x22, "scratch_load_sbyte_d16"), + (0x23, 0x23, "scratch_load_sbyte_d16_hi"), + (0x24, 0x24, "scratch_load_short_d16"), + (0x25, 0x25, "scratch_load_short_d16_hi"), +} +for (gfx8, gfx10, name) in SCRATCH: + opcode(name, -1, gfx8, gfx10, Format.SCRATCH) + +# check for duplicate opcode numbers +for ver in ['gfx9', 'gfx10']: + op_to_name = {} + for op in opcodes.values(): + if op.format in [Format.PSEUDO, Format.PSEUDO_BRANCH, Format.PSEUDO_BARRIER, Format.PSEUDO_REDUCTION]: + continue + + num = getattr(op, 'opcode_' + ver) + if num == -1: + continue + + key = (op.format, num) + + if key in op_to_name: + # exceptions + names = set([op_to_name[key], op.name]) + if ver in ['gfx8', 'gfx9'] and names == set(['v_mul_lo_i32', 'v_mul_lo_u32']): + continue + + print('%s and %s share the same opcode number (%s)' % (op_to_name[key], op.name, ver)) + sys.exit(1) + else: + op_to_name[key] = op.name diff -Nru mesa-19.2.8/src/amd/compiler/aco_optimizer.cpp mesa-20.0.8/src/amd/compiler/aco_optimizer.cpp --- mesa-19.2.8/src/amd/compiler/aco_optimizer.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_optimizer.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,2826 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) + * + */ + +#include +#include + +#include "aco_ir.h" +#include "util/half_float.h" +#include "util/u_math.h" + +namespace aco { + +/** + * The optimizer works in 4 phases: + * (1) The first pass collects information for each ssa-def, + * propagates reg->reg operands of the same type, inline constants + * and neg/abs input modifiers. + * (2) The second pass combines instructions like mad, omod, clamp and + * propagates sgpr's on VALU instructions. + * This pass depends on information collected in the first pass. + * (3) The third pass goes backwards, and selects instructions, + * i.e. decides if a mad instruction is profitable and eliminates dead code. + * (4) The fourth pass cleans up the sequence: literals get applied and dead + * instructions are removed from the sequence. + */ + + +struct mad_info { + aco_ptr add_instr; + uint32_t mul_temp_id; + uint32_t literal_idx; + bool check_literal; + + mad_info(aco_ptr instr, uint32_t id) + : add_instr(std::move(instr)), mul_temp_id(id), check_literal(false) {} +}; + +enum Label { + label_vec = 1 << 0, + label_constant = 1 << 1, + label_abs = 1 << 2, + label_neg = 1 << 3, + label_mul = 1 << 4, + label_temp = 1 << 5, + label_literal = 1 << 6, + label_mad = 1 << 7, + label_omod2 = 1 << 8, + label_omod4 = 1 << 9, + label_omod5 = 1 << 10, + label_omod_success = 1 << 11, + label_clamp = 1 << 12, + label_clamp_success = 1 << 13, + label_undefined = 1 << 14, + label_vcc = 1 << 15, + label_b2f = 1 << 16, + label_add_sub = 1 << 17, + label_bitwise = 1 << 18, + label_minmax = 1 << 19, + label_fcmp = 1 << 20, + label_uniform_bool = 1 << 21, + label_constant_64bit = 1 << 22, + label_uniform_bitwise = 1 << 23, + label_scc_invert = 1 << 24, + label_vcc_hint = 1 << 25, + label_scc_needed = 1 << 26, +}; + +static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | + label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp; +static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | + label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert; +static constexpr uint32_t val_labels = label_constant | label_constant_64bit | label_literal | label_mad; + +struct ssa_info { + uint32_t val; + union { + Temp temp; + Instruction* instr; + }; + uint32_t label; + + void add_label(Label new_label) + { + /* Since all labels which use "instr" use it for the same thing + * (indicating the defining instruction), there is no need to clear + * any other instr labels. */ + if (new_label & instr_labels) + label &= ~temp_labels; /* instr and temp alias */ + + if (new_label & temp_labels) { + label &= ~temp_labels; + label &= ~instr_labels; /* instr and temp alias */ + } + + if (new_label & val_labels) + label &= ~val_labels; + + label |= new_label; + } + + void set_vec(Instruction* vec) + { + add_label(label_vec); + instr = vec; + } + + bool is_vec() + { + return label & label_vec; + } + + void set_constant(uint32_t constant) + { + add_label(label_constant); + val = constant; + } + + bool is_constant() + { + return label & label_constant; + } + + void set_constant_64bit(uint32_t constant) + { + add_label(label_constant_64bit); + val = constant; + } + + bool is_constant_64bit() + { + return label & label_constant_64bit; + } + + void set_abs(Temp abs_temp) + { + add_label(label_abs); + temp = abs_temp; + } + + bool is_abs() + { + return label & label_abs; + } + + void set_neg(Temp neg_temp) + { + add_label(label_neg); + temp = neg_temp; + } + + bool is_neg() + { + return label & label_neg; + } + + void set_neg_abs(Temp neg_abs_temp) + { + add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg)); + temp = neg_abs_temp; + } + + void set_mul(Instruction* mul) + { + add_label(label_mul); + instr = mul; + } + + bool is_mul() + { + return label & label_mul; + } + + void set_temp(Temp tmp) + { + add_label(label_temp); + temp = tmp; + } + + bool is_temp() + { + return label & label_temp; + } + + void set_literal(uint32_t lit) + { + add_label(label_literal); + val = lit; + } + + bool is_literal() + { + return label & label_literal; + } + + void set_mad(Instruction* mad, uint32_t mad_info_idx) + { + add_label(label_mad); + val = mad_info_idx; + instr = mad; + } + + bool is_mad() + { + return label & label_mad; + } + + void set_omod2(Temp def) + { + add_label(label_omod2); + temp = def; + } + + bool is_omod2() + { + return label & label_omod2; + } + + void set_omod4(Temp def) + { + add_label(label_omod4); + temp = def; + } + + bool is_omod4() + { + return label & label_omod4; + } + + void set_omod5(Temp def) + { + add_label(label_omod5); + temp = def; + } + + bool is_omod5() + { + return label & label_omod5; + } + + void set_omod_success(Instruction* omod_instr) + { + add_label(label_omod_success); + instr = omod_instr; + } + + bool is_omod_success() + { + return label & label_omod_success; + } + + void set_clamp(Temp def) + { + add_label(label_clamp); + temp = def; + } + + bool is_clamp() + { + return label & label_clamp; + } + + void set_clamp_success(Instruction* clamp_instr) + { + add_label(label_clamp_success); + instr = clamp_instr; + } + + bool is_clamp_success() + { + return label & label_clamp_success; + } + + void set_undefined() + { + add_label(label_undefined); + } + + bool is_undefined() + { + return label & label_undefined; + } + + void set_vcc(Temp vcc) + { + add_label(label_vcc); + temp = vcc; + } + + bool is_vcc() + { + return label & label_vcc; + } + + bool is_constant_or_literal() + { + return is_constant() || is_literal(); + } + + void set_b2f(Temp val) + { + add_label(label_b2f); + temp = val; + } + + bool is_b2f() + { + return label & label_b2f; + } + + void set_add_sub(Instruction *add_sub_instr) + { + add_label(label_add_sub); + instr = add_sub_instr; + } + + bool is_add_sub() + { + return label & label_add_sub; + } + + void set_bitwise(Instruction *bitwise_instr) + { + add_label(label_bitwise); + instr = bitwise_instr; + } + + bool is_bitwise() + { + return label & label_bitwise; + } + + void set_uniform_bitwise() + { + add_label(label_uniform_bitwise); + } + + bool is_uniform_bitwise() + { + return label & label_uniform_bitwise; + } + + void set_minmax(Instruction *minmax_instr) + { + add_label(label_minmax); + instr = minmax_instr; + } + + bool is_minmax() + { + return label & label_minmax; + } + + void set_fcmp(Instruction *fcmp_instr) + { + add_label(label_fcmp); + instr = fcmp_instr; + } + + bool is_fcmp() + { + return label & label_fcmp; + } + + void set_scc_needed() + { + add_label(label_scc_needed); + } + + bool is_scc_needed() + { + return label & label_scc_needed; + } + + void set_scc_invert(Temp scc_inv) + { + add_label(label_scc_invert); + temp = scc_inv; + } + + bool is_scc_invert() + { + return label & label_scc_invert; + } + + void set_uniform_bool(Temp uniform_bool) + { + add_label(label_uniform_bool); + temp = uniform_bool; + } + + bool is_uniform_bool() + { + return label & label_uniform_bool; + } + + void set_vcc_hint() + { + add_label(label_vcc_hint); + } + + bool is_vcc_hint() + { + return label & label_vcc_hint; + } +}; + +struct opt_ctx { + Program* program; + std::vector> instructions; + ssa_info* info; + std::pair last_literal; + std::vector mad_infos; + std::vector uses; +}; + +bool can_swap_operands(aco_ptr& instr) +{ + if (instr->operands[0].isConstant() || + (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) + return false; + + switch (instr->opcode) { + case aco_opcode::v_add_f32: + case aco_opcode::v_mul_f32: + case aco_opcode::v_or_b32: + case aco_opcode::v_and_b32: + case aco_opcode::v_xor_b32: + case aco_opcode::v_max_f32: + case aco_opcode::v_min_f32: + case aco_opcode::v_max_i32: + case aco_opcode::v_min_i32: + case aco_opcode::v_max_u32: + case aco_opcode::v_min_u32: + case aco_opcode::v_cmp_eq_f32: + case aco_opcode::v_cmp_lg_f32: + return true; + case aco_opcode::v_sub_f32: + instr->opcode = aco_opcode::v_subrev_f32; + return true; + case aco_opcode::v_cmp_lt_f32: + instr->opcode = aco_opcode::v_cmp_gt_f32; + return true; + case aco_opcode::v_cmp_ge_f32: + instr->opcode = aco_opcode::v_cmp_le_f32; + return true; + case aco_opcode::v_cmp_lt_i32: + instr->opcode = aco_opcode::v_cmp_gt_i32; + return true; + default: + return false; + } +} + +bool can_use_VOP3(opt_ctx& ctx, aco_ptr& instr) +{ + if (instr->isVOP3()) + return true; + + if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10) + return false; + + if (instr->isDPP() || instr->isSDWA()) + return false; + + return instr->opcode != aco_opcode::v_madmk_f32 && + instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && + instr->opcode != aco_opcode::v_madak_f16 && + instr->opcode != aco_opcode::v_fmamk_f32 && + instr->opcode != aco_opcode::v_fmaak_f32 && + instr->opcode != aco_opcode::v_fmamk_f16 && + instr->opcode != aco_opcode::v_fmaak_f16 && + instr->opcode != aco_opcode::v_readlane_b32 && + instr->opcode != aco_opcode::v_writelane_b32 && + instr->opcode != aco_opcode::v_readfirstlane_b32; +} + +bool can_apply_sgprs(aco_ptr& instr) +{ + return instr->opcode != aco_opcode::v_readfirstlane_b32 && + instr->opcode != aco_opcode::v_readlane_b32 && + instr->opcode != aco_opcode::v_readlane_b32_e64 && + instr->opcode != aco_opcode::v_writelane_b32 && + instr->opcode != aco_opcode::v_writelane_b32_e64; +} + +void to_VOP3(opt_ctx& ctx, aco_ptr& instr) +{ + if (instr->isVOP3()) + return; + + aco_ptr tmp = std::move(instr); + Format format = asVOP3(tmp->format); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + instr->definitions[i] = tmp->definitions[i]; + if (instr->definitions[i].isTemp()) { + ssa_info& info = ctx.info[instr->definitions[i].tempId()]; + if (info.label & instr_labels && info.instr == tmp.get()) + info.instr = instr.get(); + } + } +} + +/* only covers special cases */ +bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) +{ + switch (opcode) { + case aco_opcode::v_interp_p2_f32: + case aco_opcode::v_mac_f32: + case aco_opcode::v_writelane_b32: + case aco_opcode::v_writelane_b32_e64: + case aco_opcode::v_cndmask_b32: + return operand != 2; + case aco_opcode::s_addk_i32: + case aco_opcode::s_mulk_i32: + case aco_opcode::p_wqm: + case aco_opcode::p_extract_vector: + case aco_opcode::p_split_vector: + case aco_opcode::v_readlane_b32: + case aco_opcode::v_readlane_b32_e64: + case aco_opcode::v_readfirstlane_b32: + return operand != 0; + default: + return true; + } +} + +bool valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) +{ + if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 || + instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) + return operand != 1; + return true; +} + +/* check constant bus and literal limitations */ +bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) +{ + int limit = ctx.program->chip_class >= GFX10 ? 2 : 1; + Operand literal32(s1); + Operand literal64(s2); + unsigned num_sgprs = 0; + unsigned sgpr[] = {0, 0}; + + for (unsigned i = 0; i < num_operands; i++) { + Operand op = operands[i]; + + if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) { + /* two reads of the same SGPR count as 1 to the limit */ + if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) { + if (num_sgprs < 2) + sgpr[num_sgprs++] = op.tempId(); + limit--; + if (limit < 0) + return false; + } + } else if (op.isLiteral()) { + if (ctx.program->chip_class < GFX10) + return false; + + if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue()) + return false; + if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue()) + return false; + + /* Any number of 32-bit literals counts as only 1 to the limit. Same + * (but separately) for 64-bit literals. */ + if (op.size() == 1 && literal32.isUndefined()) { + limit--; + literal32 = op; + } else if (op.size() == 2 && literal64.isUndefined()) { + limit--; + literal64 = op; + } + + if (limit < 0) + return false; + } + } + + return true; +} + +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +{ + Operand op = instr->operands[op_index]; + + if (!op.isTemp()) + return false; + Temp tmp = op.getTemp(); + if (!ctx.info[tmp.id()].is_add_sub()) + return false; + + Instruction *add_instr = ctx.info[tmp.id()].instr; + + switch (add_instr->opcode) { + case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::s_add_i32: + case aco_opcode::s_add_u32: + break; + default: + return false; + } + + if (add_instr->usesModifiers()) + return false; + + for (unsigned i = 0; i < 2; i++) { + if (add_instr->operands[i].isConstant()) { + *offset = add_instr->operands[i].constantValue(); + } else if (add_instr->operands[i].isTemp() && + ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal()) { + *offset = ctx.info[add_instr->operands[i].tempId()].val; + } else { + continue; + } + if (!add_instr->operands[!i].isTemp()) + continue; + + uint32_t offset2 = 0; + if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + *offset += offset2; + } else { + *base = add_instr->operands[!i].getTemp(); + } + return true; + } + + return false; +} + +Operand get_constant_op(opt_ctx &ctx, uint32_t val, bool is64bit = false) +{ + // TODO: this functions shouldn't be needed if we store Operand instead of value. + Operand op(val, is64bit); + if (val == 0x3e22f983 && ctx.program->chip_class >= GFX8) + op.setFixed(PhysReg{248}); /* 1/2 PI can be an inline constant on GFX8+ */ + return op; +} + +void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) +{ + if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) { + ASSERTED bool all_const = false; + for (Operand& op : instr->operands) + all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal()); + perfwarn(all_const, "All instruction operands are constant", instr.get()); + } + + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (!instr->operands[i].isTemp()) + continue; + + ssa_info info = ctx.info[instr->operands[i].tempId()]; + /* propagate undef */ + if (info.is_undefined() && is_phi(instr)) + instr->operands[i] = Operand(instr->operands[i].regClass()); + /* propagate reg->reg of same type */ + if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) { + instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp); + info = ctx.info[info.temp.id()]; + } + + /* SALU / PSEUDO: propagate inline constants */ + if (instr->isSALU() || instr->format == Format::PSEUDO) { + if (info.is_temp() && info.temp.type() == RegType::sgpr) { + instr->operands[i].setTemp(info.temp); + info = ctx.info[info.temp.id()]; + } else if (info.is_temp() && info.temp.type() == RegType::vgpr) { + /* propagate vgpr if it can take it */ + switch (instr->opcode) { + case aco_opcode::p_create_vector: + case aco_opcode::p_split_vector: + case aco_opcode::p_extract_vector: + case aco_opcode::p_phi: { + const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(), + [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;}); + if (all_vgpr) { + instr->operands[i] = Operand(info.temp); + info = ctx.info[info.temp.id()]; + } + break; + } + default: + break; + } + } + if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) && + !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) { + instr->operands[i] = get_constant_op(ctx, info.val, info.is_constant_64bit()); + continue; + } + } + + /* VALU: propagate neg, abs & inline constants */ + else if (instr->isVALU()) { + if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { + instr->operands[i].setTemp(info.temp); + info = ctx.info[info.temp.id()]; + } + if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + if (!instr->isDPP()) + to_VOP3(ctx, instr); + instr->operands[i] = Operand(info.temp); + if (instr->isDPP()) + static_cast(instr.get())->abs[i] = true; + else + static_cast(instr.get())->abs[i] = true; + } + if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) { + instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; + instr->operands[i].setTemp(info.temp); + continue; + } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + if (!instr->isDPP()) + to_VOP3(ctx, instr); + instr->operands[i].setTemp(info.temp); + if (instr->isDPP()) + static_cast(instr.get())->neg[i] = true; + else + static_cast(instr.get())->neg[i] = true; + continue; + } + if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) { + Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit()); + perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); + if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { + instr->operands[i] = op; + continue; + } else if (!instr->isVOP3() && can_swap_operands(instr)) { + instr->operands[i] = instr->operands[0]; + instr->operands[0] = op; + continue; + } else if (can_use_VOP3(ctx, instr)) { + to_VOP3(ctx, instr); + instr->operands[i] = op; + continue; + } + } + } + + /* MUBUF: propagate constants and combine additions */ + else if (instr->format == Format::MUBUF) { + MUBUF_instruction *mubuf = static_cast(instr.get()); + Temp base; + uint32_t offset; + while (info.is_temp()) + info = ctx.info[info.temp.id()]; + + if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + assert(!mubuf->idxen); + instr->operands[1] = Operand(v1); + mubuf->offset += info.val; + mubuf->offen = false; + continue; + } else if (i == 2 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + instr->operands[2] = Operand((uint32_t) 0); + mubuf->offset += info.val; + continue; + } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + assert(!mubuf->idxen); + instr->operands[1].setTemp(base); + mubuf->offset += offset; + continue; + } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { + instr->operands[i].setTemp(base); + mubuf->offset += offset; + continue; + } + } + + /* DS: combine additions */ + else if (instr->format == Format::DS) { + + DS_instruction *ds = static_cast(instr.get()); + Temp base; + uint32_t offset; + bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; + if (has_usable_ds_offset && + i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && + base.regClass() == instr->operands[i].regClass() && + instr->opcode != aco_opcode::ds_swizzle_b32) { + if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || + instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) { + if (offset % 4 == 0 && + ds->offset0 + (offset >> 2) <= 255 && + ds->offset1 + (offset >> 2) <= 255) { + instr->operands[i].setTemp(base); + ds->offset0 += offset >> 2; + ds->offset1 += offset >> 2; + } + } else { + if (ds->offset0 + offset <= 65535) { + instr->operands[i].setTemp(base); + ds->offset0 += offset; + } + } + } + } + + /* SMEM: propagate constants and combine additions */ + else if (instr->format == Format::SMEM) { + + SMEM_instruction *smem = static_cast(instr.get()); + Temp base; + uint32_t offset; + if (i == 1 && info.is_constant_or_literal() && + ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) || + (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) || + (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { + instr->operands[i] = Operand(info.val); + continue; + } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); + if (soe && + (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal() || + ctx.info[smem->operands.back().tempId()].val != 0)) { + continue; + } + if (soe) { + smem->operands[1] = Operand(offset); + smem->operands.back() = Operand(base); + } else { + SMEM_instruction *new_instr = create_instruction(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size()); + new_instr->operands[0] = smem->operands[0]; + new_instr->operands[1] = Operand(offset); + if (smem->definitions.empty()) + new_instr->operands[2] = smem->operands[2]; + new_instr->operands.back() = Operand(base); + if (!smem->definitions.empty()) + new_instr->definitions[0] = smem->definitions[0]; + new_instr->can_reorder = smem->can_reorder; + new_instr->barrier = smem->barrier; + new_instr->glc = smem->glc; + new_instr->dlc = smem->dlc; + new_instr->nv = smem->nv; + new_instr->disable_wqm = smem->disable_wqm; + instr.reset(new_instr); + smem = static_cast(instr.get()); + } + continue; + } + } + + else if (instr->format == Format::PSEUDO_BRANCH) { + if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) { + /* Flip the branch instruction to get rid of the scc_invert instruction */ + instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z; + instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp); + } + } + } + + /* if this instruction doesn't define anything, return */ + if (instr->definitions.empty()) + return; + + switch (instr->opcode) { + case aco_opcode::p_create_vector: { + unsigned num_ops = instr->operands.size(); + for (const Operand& op : instr->operands) { + if (op.isTemp() && ctx.info[op.tempId()].is_vec()) + num_ops += ctx.info[op.tempId()].instr->operands.size() - 1; + } + if (num_ops != instr->operands.size()) { + aco_ptr old_vec = std::move(instr); + instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_ops, 1)); + instr->definitions[0] = old_vec->definitions[0]; + unsigned k = 0; + for (Operand& old_op : old_vec->operands) { + if (old_op.isTemp() && ctx.info[old_op.tempId()].is_vec()) { + for (unsigned j = 0; j < ctx.info[old_op.tempId()].instr->operands.size(); j++) { + Operand op = ctx.info[old_op.tempId()].instr->operands[j]; + if (op.isTemp() && ctx.info[op.tempId()].is_temp() && + ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type()) + op.setTemp(ctx.info[op.tempId()].temp); + instr->operands[k++] = op; + } + } else { + instr->operands[k++] = old_op; + } + } + assert(k == num_ops); + } + if (instr->operands.size() == 1 && instr->operands[0].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + else if (instr->definitions[0].getTemp().size() == instr->operands.size()) + ctx.info[instr->definitions[0].tempId()].set_vec(instr.get()); + break; + } + case aco_opcode::p_split_vector: { + if (!ctx.info[instr->operands[0].tempId()].is_vec()) + break; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; + assert(instr->definitions.size() == vec->operands.size()); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Operand vec_op = vec->operands[i]; + if (vec_op.isConstant()) { + if (vec_op.isLiteral()) + ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue()); + else if (vec_op.size() == 1) + ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue()); + else if (vec_op.size() == 2) + ctx.info[instr->definitions[i].tempId()].set_constant_64bit(vec_op.constantValue()); + } else { + assert(vec_op.isTemp()); + ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp()); + } + } + break; + } + case aco_opcode::p_extract_vector: { /* mov */ + if (!ctx.info[instr->operands[0].tempId()].is_vec()) + break; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; + if (vec->definitions[0].getTemp().size() == vec->operands.size() && /* TODO: what about 64bit or other combinations? */ + vec->operands[0].size() == instr->definitions[0].size()) { + + /* convert this extract into a mov instruction */ + Operand vec_op = vec->operands[instr->operands[1].constantValue()]; + bool is_vgpr = instr->definitions[0].getTemp().type() == RegType::vgpr; + aco_opcode opcode = is_vgpr ? aco_opcode::v_mov_b32 : aco_opcode::s_mov_b32; + Format format = is_vgpr ? Format::VOP1 : Format::SOP1; + instr->opcode = opcode; + instr->format = format; + while (instr->operands.size() > 1) + instr->operands.pop_back(); + instr->operands[0] = vec_op; + + if (vec_op.isConstant()) { + if (vec_op.isLiteral()) + ctx.info[instr->definitions[0].tempId()].set_literal(vec_op.constantValue()); + else if (vec_op.size() == 1) + ctx.info[instr->definitions[0].tempId()].set_constant(vec_op.constantValue()); + else if (vec_op.size() == 2) + ctx.info[instr->definitions[0].tempId()].set_constant_64bit(vec_op.constantValue()); + + } else { + assert(vec_op.isTemp()); + ctx.info[instr->definitions[0].tempId()].set_temp(vec_op.getTemp()); + } + } + break; + } + case aco_opcode::s_mov_b32: /* propagate */ + case aco_opcode::s_mov_b64: + case aco_opcode::v_mov_b32: + case aco_opcode::p_as_uniform: + if (instr->definitions[0].isFixed()) { + /* don't copy-propagate copies into fixed registers */ + } else if (instr->usesModifiers()) { + // TODO + } else if (instr->operands[0].isConstant()) { + if (instr->operands[0].isLiteral()) + ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue()); + else if (instr->operands[0].size() == 1) + ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue()); + else if (instr->operands[0].size() == 2) + ctx.info[instr->definitions[0].tempId()].set_constant_64bit(instr->operands[0].constantValue()); + } else if (instr->operands[0].isTemp()) { + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + } else { + assert(instr->operands[0].isFixed()); + } + break; + case aco_opcode::p_is_helper: + if (!ctx.program->needs_wqm) + ctx.info[instr->definitions[0].tempId()].set_constant(0u); + break; + case aco_opcode::s_movk_i32: { + uint32_t v = static_cast(instr.get())->imm; + v = v & 0x8000 ? (v | 0xffff0000) : v; + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + break; + } + case aco_opcode::v_bfrev_b32: + case aco_opcode::s_brev_b32: { + if (instr->operands[0].isConstant()) { + uint32_t v = util_bitreverse(instr->operands[0].constantValue()); + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + } + break; + } + case aco_opcode::s_bfm_b32: { + if (instr->operands[0].isConstant() && instr->operands[1].isConstant()) { + unsigned size = instr->operands[0].constantValue() & 0x1f; + unsigned start = instr->operands[1].constantValue() & 0x1f; + uint32_t v = ((1u << size) - 1u) << start; + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + } + } + case aco_opcode::v_mul_f32: { /* omod */ + /* TODO: try to move the negate/abs modifier to the consumer instead */ + if (instr->usesModifiers()) + break; + + for (unsigned i = 0; i < 2; i++) { + if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { + if (instr->operands[!i].constantValue() == 0x40000000) { /* 2.0 */ + ctx.info[instr->operands[i].tempId()].set_omod2(instr->definitions[0].getTemp()); + } else if (instr->operands[!i].constantValue() == 0x40800000) { /* 4.0 */ + ctx.info[instr->operands[i].tempId()].set_omod4(instr->definitions[0].getTemp()); + } else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */ + ctx.info[instr->operands[i].tempId()].set_omod5(instr->definitions[0].getTemp()); + } else if (instr->operands[!i].constantValue() == 0x3f800000 && + !block.fp_mode.must_flush_denorms32) { /* 1.0 */ + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); + } else { + continue; + } + break; + } + } + break; + } + case aco_opcode::v_and_b32: /* abs */ + if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x7FFFFFFF) && + instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr) + ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp()); + else + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + break; + case aco_opcode::v_xor_b32: { /* neg */ + if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x80000000u) && instr->operands[1].isTemp()) { + if (ctx.info[instr->operands[1].tempId()].is_neg()) { + ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + } else if (instr->operands[1].getTemp().type() == RegType::vgpr) { + if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */ + instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp); + instr->opcode = aco_opcode::v_or_b32; + ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp()); + } else { + ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp()); + } + } + } else { + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + } + break; + } + case aco_opcode::v_med3_f32: { /* clamp */ + VOP3A_instruction* vop3 = static_cast(instr.get()); + if (vop3->abs[0] || vop3->abs[1] || vop3->abs[2] || + vop3->neg[0] || vop3->neg[1] || vop3->neg[2] || + vop3->omod != 0 || vop3->opsel != 0) + break; + + unsigned idx = 0; + bool found_zero = false, found_one = false; + for (unsigned i = 0; i < 3; i++) + { + if (instr->operands[i].constantEquals(0)) + found_zero = true; + else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + found_one = true; + else + idx = i; + } + if (found_zero && found_one && instr->operands[idx].isTemp()) { + ctx.info[instr->operands[idx].tempId()].set_clamp(instr->definitions[0].getTemp()); + } + break; + } + case aco_opcode::v_cndmask_b32: + if (instr->operands[0].constantEquals(0) && + instr->operands[1].constantEquals(0xFFFFFFFF) && + instr->operands[2].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp()); + else if (instr->operands[0].constantEquals(0) && + instr->operands[1].constantEquals(0x3f800000u) && + instr->operands[2].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp()); + + ctx.info[instr->operands[2].tempId()].set_vcc_hint(); + break; + case aco_opcode::v_cmp_lg_u32: + if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */ + instr->operands[0].constantEquals(0) && + instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc()) + ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + break; + case aco_opcode::p_phi: + case aco_opcode::p_linear_phi: { + /* lower_bool_phis() can create phis like this */ + bool all_same_temp = instr->operands[0].isTemp(); + /* this check is needed when moving uniform loop counters out of a divergent loop */ + if (all_same_temp) + all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass(); + for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) { + if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId()) + all_same_temp = false; + } + if (all_same_temp) { + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + } else { + bool all_undef = instr->operands[0].isUndefined(); + for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) { + if (!instr->operands[i].isUndefined()) + all_undef = false; + } + if (all_undef) + ctx.info[instr->definitions[0].tempId()].set_undefined(); + } + break; + } + case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::s_add_i32: + case aco_opcode::s_add_u32: + ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get()); + break; + case aco_opcode::s_not_b32: + case aco_opcode::s_not_b64: + if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { + ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); + ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp); + } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { + ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); + ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + } + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + break; + case aco_opcode::s_and_b32: + case aco_opcode::s_and_b64: + if (instr->operands[1].isFixed() && instr->operands[1].physReg() == exec && instr->operands[0].isTemp()) { + if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { + /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */ + ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp); + break; + } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { + /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */ + ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + break; + } + } + /* fallthrough */ + case aco_opcode::s_or_b32: + case aco_opcode::s_or_b64: + case aco_opcode::s_xor_b32: + case aco_opcode::s_xor_b64: + if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) { + return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise()); + })) { + ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); + } + /* fallthrough */ + case aco_opcode::s_lshl_b32: + case aco_opcode::v_or_b32: + case aco_opcode::v_lshlrev_b32: + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + break; + case aco_opcode::v_min_f32: + case aco_opcode::v_min_f16: + case aco_opcode::v_min_u32: + case aco_opcode::v_min_i32: + case aco_opcode::v_min_u16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_f32: + case aco_opcode::v_max_f16: + case aco_opcode::v_max_u32: + case aco_opcode::v_max_i32: + case aco_opcode::v_max_u16: + case aco_opcode::v_max_i16: + ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); + break; + case aco_opcode::v_cmp_lt_f32: + case aco_opcode::v_cmp_eq_f32: + case aco_opcode::v_cmp_le_f32: + case aco_opcode::v_cmp_gt_f32: + case aco_opcode::v_cmp_lg_f32: + case aco_opcode::v_cmp_ge_f32: + case aco_opcode::v_cmp_o_f32: + case aco_opcode::v_cmp_u_f32: + case aco_opcode::v_cmp_nge_f32: + case aco_opcode::v_cmp_nlg_f32: + case aco_opcode::v_cmp_ngt_f32: + case aco_opcode::v_cmp_nle_f32: + case aco_opcode::v_cmp_neq_f32: + case aco_opcode::v_cmp_nlt_f32: + ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); + break; + case aco_opcode::s_cselect_b64: + case aco_opcode::s_cselect_b32: + if (instr->operands[0].constantEquals((unsigned) -1) && + instr->operands[1].constantEquals(0)) { + /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */ + ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp()); + } + if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) { + /* Flip the operands to get rid of the scc_invert instruction */ + std::swap(instr->operands[0], instr->operands[1]); + instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp); + } + break; + case aco_opcode::p_wqm: + if (instr->operands[0].isTemp() && + ctx.info[instr->operands[0].tempId()].is_scc_invert()) { + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + } + break; + default: + break; + } +} + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse) +{ + *ordered = *unordered = op; + switch (op) { + #define CMP(ord, unord) \ + case aco_opcode::v_cmp_##ord##_f32:\ + case aco_opcode::v_cmp_n##unord##_f32:\ + *ordered = aco_opcode::v_cmp_##ord##_f32;\ + *unordered = aco_opcode::v_cmp_n##unord##_f32;\ + *inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\ + return true; + CMP(lt, /*n*/ge) + CMP(eq, /*n*/lg) + CMP(le, /*n*/gt) + CMP(gt, /*n*/le) + CMP(lg, /*n*/eq) + CMP(ge, /*n*/lt) + #undef CMP + default: + return false; + } +} + +aco_opcode get_ordered(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::last_opcode; +} + +aco_opcode get_unordered(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::last_opcode; +} + +aco_opcode get_inverse(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? inverse : aco_opcode::last_opcode; +} + +bool is_cmp(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse); +} + +unsigned original_temp_id(opt_ctx &ctx, Temp tmp) +{ + if (ctx.info[tmp.id()].is_temp()) + return ctx.info[tmp.id()].temp.id(); + else + return tmp.id(); +} + +void decrease_uses(opt_ctx &ctx, Instruction* instr) +{ + if (!--ctx.uses[instr->definitions[0].tempId()]) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) + ctx.uses[op.tempId()]--; + } + } +} + +Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) +{ + if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_labels)) + return nullptr; + if (!ignore_uses && ctx.uses[op.tempId()] > 1) + return nullptr; + + Instruction *instr = ctx.info[op.tempId()].instr; + + if (instr->definitions.size() == 2) { + assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId()); + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return nullptr; + } + + return instr; +} + +/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b) + * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */ +bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) +{ + if (instr->definitions[0].regClass() != ctx.program->lane_mask) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; + + bool neg[2] = {false, false}; + bool abs[2] = {false, false}; + uint8_t opsel = 0; + Instruction *op_instr[2]; + Temp op[2]; + + for (unsigned i = 0; i < 2; i++) { + op_instr[i] = follow_operand(ctx, instr->operands[i], true); + if (!op_instr[i]) + return false; + + aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + + if (op_instr[i]->opcode != expected_cmp) + return false; + if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp()) + return false; + + if (op_instr[i]->isVOP3()) { + VOP3A_instruction *vop3 = static_cast(op_instr[i]); + if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2) + return false; + neg[i] = vop3->neg[0]; + abs[i] = vop3->abs[0]; + opsel |= (vop3->opsel & 1) << i; + } + + Temp op0 = op_instr[i]->operands[0].getTemp(); + Temp op1 = op_instr[i]->operands[1].getTemp(); + if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1)) + return false; + + op[i] = op1; + } + + if (op[1].type() == RegType::sgpr) + std::swap(op[0], op[1]); + unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr); + if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1)) + return false; + + ctx.uses[op[0].id()]++; + ctx.uses[op[1].id()]++; + decrease_uses(ctx, op_instr[0]); + decrease_uses(ctx, op_instr[1]); + + aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + Instruction *new_instr; + if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { + VOP3A_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + for (unsigned i = 0; i < 2; i++) { + vop3->neg[i] = neg[i]; + vop3->abs[i] = abs[i]; + } + vop3->opsel = opsel; + new_instr = static_cast(vop3); + } else { + new_instr = create_instruction(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = Operand(op[0]); + new_instr->operands[1] = Operand(op[1]); + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b) + * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */ +bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +{ + if (instr->definitions[0].regClass() != ctx.program->lane_mask) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; + aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + + Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + if (!nan_test || !cmp) + return false; + + if (cmp->opcode == expected_nan_test) + std::swap(nan_test, cmp); + else if (nan_test->opcode != expected_nan_test) + return false; + + if (!is_cmp(cmp->opcode)) + return false; + + if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) + return false; + if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp()) + return false; + + unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp()); + unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp()); + unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp()); + unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp()); + if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1) + return false; + if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1) + return false; + + ctx.uses[cmp->operands[0].tempId()]++; + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, nan_test); + decrease_uses(ctx, cmp); + + aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode); + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_vop3->opsel = cmp_vop3->opsel; + new_instr = new_vop3; + } else { + new_instr = create_instruction(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b) + * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */ +bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +{ + if (instr->definitions[0].regClass() != ctx.program->lane_mask) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; + + Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + + if (!nan_test || !cmp) + return false; + + aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + if (cmp->opcode == expected_nan_test) + std::swap(nan_test, cmp); + else if (nan_test->opcode != expected_nan_test) + return false; + + if (!is_cmp(cmp->opcode)) + return false; + + if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) + return false; + if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp()) + return false; + + unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp()); + unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp()); + if (prop_nan0 != prop_nan1) + return false; + + if (nan_test->isVOP3()) { + VOP3A_instruction *vop3 = static_cast(nan_test); + if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel == 1 || vop3->opsel == 2) + return false; + } + + int constant_operand = -1; + for (unsigned i = 0; i < 2; i++) { + if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { + constant_operand = !i; + break; + } + } + if (constant_operand == -1) + return false; + + uint32_t constant; + if (cmp->operands[constant_operand].isConstant()) { + constant = cmp->operands[constant_operand].constantValue(); + } else if (cmp->operands[constant_operand].isTemp()) { + Temp tmp = cmp->operands[constant_operand].getTemp(); + unsigned id = original_temp_id(ctx, tmp); + if (!ctx.info[id].is_constant() && !ctx.info[id].is_literal()) + return false; + constant = ctx.info[id].val; + } else { + return false; + } + + float constantf; + memcpy(&constantf, &constant, 4); + if (isnan(constantf)) + return false; + + if (cmp->operands[0].isTemp()) + ctx.uses[cmp->operands[0].tempId()]++; + if (cmp->operands[1].isTemp()) + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, nan_test); + decrease_uses(ctx, cmp); + + aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode); + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_vop3->opsel = cmp_vop3->opsel; + new_instr = new_vop3; + } else { + new_instr = create_instruction(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_not_b64(cmp(a, b) -> get_inverse(cmp)(a, b) */ +bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) +{ + if (instr->opcode != aco_opcode::s_not_b64) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + if (!instr->operands[0].isTemp()) + return false; + + Instruction *cmp = follow_operand(ctx, instr->operands[0]); + if (!cmp) + return false; + + aco_opcode new_opcode = get_inverse(cmp->opcode); + if (new_opcode == aco_opcode::last_opcode) + return false; + + if (cmp->operands[0].isTemp()) + ctx.uses[cmp->operands[0].tempId()]++; + if (cmp->operands[1].isTemp()) + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, cmp); + + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction(new_opcode, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_vop3->opsel = cmp_vop3->opsel; + new_instr = new_vop3; + } else { + new_instr = create_instruction(new_opcode, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* op1(op2(1, 2), 0) if swap = false + * op1(0, op2(1, 2)) if swap = true */ +bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, + Instruction* op1_instr, bool swap, const char *shuffle_str, + Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel, + bool *op1_clamp, uint8_t *op1_omod, + bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel) +{ + /* checks */ + if (op1_instr->opcode != op1) + return false; + + Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]); + if (!op2_instr || op2_instr->opcode != op2) + return false; + + VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast(op1_instr) : NULL; + VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? static_cast(op2_instr) : NULL; + + /* don't support inbetween clamp/omod */ + if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod)) + return false; + + /* get operands and modifiers and check inbetween modifiers */ + *op1_clamp = op1_vop3 ? op1_vop3->clamp : false; + *op1_omod = op1_vop3 ? op1_vop3->omod : 0u; + + if (inbetween_neg) + *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false; + else if (op1_vop3 && op1_vop3->neg[swap]) + return false; + + if (inbetween_abs) + *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false; + else if (op1_vop3 && op1_vop3->abs[swap]) + return false; + + if (inbetween_opsel) + *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << swap) : false; + else if (op1_vop3 && op1_vop3->opsel & (1 << swap)) + return false; + + int shuffle[3]; + shuffle[shuffle_str[0] - '0'] = 0; + shuffle[shuffle_str[1] - '0'] = 1; + shuffle[shuffle_str[2] - '0'] = 2; + + operands[shuffle[0]] = op1_instr->operands[!swap]; + neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false; + abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false; + if (op1_vop3 && op1_vop3->opsel & (1 << !swap)) + *opsel |= 1 << shuffle[0]; + + for (unsigned i = 0; i < 2; i++) { + operands[shuffle[i + 1]] = op2_instr->operands[i]; + neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false; + abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false; + if (op2_vop3 && op2_vop3->opsel & (1 << i)) + *opsel |= 1 << shuffle[i + 1]; + } + + /* check operands */ + if (!check_vop3_operands(ctx, 3, operands)) + return false; + + return true; +} + +void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr, + Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, + bool clamp, unsigned omod) +{ + VOP3A_instruction *new_instr = create_instruction(opcode, Format::VOP3A, 3, 1); + memcpy(new_instr->abs, abs, sizeof(bool[3])); + memcpy(new_instr->neg, neg, sizeof(bool[3])); + new_instr->clamp = clamp; + new_instr->omod = omod; + new_instr->opsel = opsel; + new_instr->operands[0] = operands[0]; + new_instr->operands[1] = operands[1]; + new_instr->operands[2] = operands[2]; + new_instr->definitions[0] = instr->definitions[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + instr.reset(new_instr); +} + +bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) +{ + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + for (unsigned swap = 0; swap < 2; swap++) { + if (!((1 << swap) & ops)) + continue; + + Operand operands[3]; + bool neg[3], abs[3], clamp; + uint8_t opsel = 0, omod = 0; + if (match_op3_for_vop3(ctx, instr->opcode, op2, + instr.get(), swap, shuffle, + operands, neg, abs, &opsel, + &clamp, &omod, NULL, NULL, NULL)) { + ctx.uses[instr->operands[swap].tempId()]--; + create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + return true; + } + } + return false; +} + +bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposite, aco_opcode minmax3) +{ + if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2)) + return true; + + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + /* min(-max(a, b), c) -> min3(-a, -b, c) * + * max(-min(a, b), c) -> max3(-a, -b, c) */ + for (unsigned swap = 0; swap < 2; swap++) { + Operand operands[3]; + bool neg[3], abs[3], clamp; + uint8_t opsel = 0, omod = 0; + bool inbetween_neg; + if (match_op3_for_vop3(ctx, instr->opcode, opposite, + instr.get(), swap, "012", + operands, neg, abs, &opsel, + &clamp, &omod, &inbetween_neg, NULL, NULL) && + inbetween_neg) { + ctx.uses[instr->operands[swap].tempId()]--; + neg[1] = true; + neg[2] = true; + create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + return true; + } + } + return false; +} + +/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b) + * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b) + * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b) + * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b) + * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b) + * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */ +bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) +{ + /* checks */ + if (!instr->operands[0].isTemp()) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + Instruction *op2_instr = follow_operand(ctx, instr->operands[0]); + if (!op2_instr) + return false; + switch (op2_instr->opcode) { + case aco_opcode::s_and_b32: + case aco_opcode::s_or_b32: + case aco_opcode::s_xor_b32: + case aco_opcode::s_and_b64: + case aco_opcode::s_or_b64: + case aco_opcode::s_xor_b64: + break; + default: + return false; + } + + /* create instruction */ + std::swap(instr->definitions[0], op2_instr->definitions[0]); + std::swap(instr->definitions[1], op2_instr->definitions[1]); + ctx.uses[instr->operands[0].tempId()]--; + ctx.info[op2_instr->definitions[0].tempId()].label = 0; + + switch (op2_instr->opcode) { + case aco_opcode::s_and_b32: + op2_instr->opcode = aco_opcode::s_nand_b32; + break; + case aco_opcode::s_or_b32: + op2_instr->opcode = aco_opcode::s_nor_b32; + break; + case aco_opcode::s_xor_b32: + op2_instr->opcode = aco_opcode::s_xnor_b32; + break; + case aco_opcode::s_and_b64: + op2_instr->opcode = aco_opcode::s_nand_b64; + break; + case aco_opcode::s_or_b64: + op2_instr->opcode = aco_opcode::s_nor_b64; + break; + case aco_opcode::s_xor_b64: + op2_instr->opcode = aco_opcode::s_xnor_b64; + break; + default: + break; + } + + return true; +} + +/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b) + * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b) + * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b) + * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */ +bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) +{ + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool()) + return false; + + for (unsigned i = 0; i < 2; i++) { + Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64)) + continue; + + if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() && + instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue()) + continue; + + ctx.uses[instr->operands[i].tempId()]--; + instr->operands[0] = instr->operands[!i]; + instr->operands[1] = op2_instr->operands[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + switch (instr->opcode) { + case aco_opcode::s_and_b32: + instr->opcode = aco_opcode::s_andn2_b32; + break; + case aco_opcode::s_or_b32: + instr->opcode = aco_opcode::s_orn2_b32; + break; + case aco_opcode::s_and_b64: + instr->opcode = aco_opcode::s_andn2_b64; + break; + case aco_opcode::s_or_b64: + instr->opcode = aco_opcode::s_orn2_b64; + break; + default: + break; + } + + return true; + } + return false; +} + +/* s_add_{i32,u32}(a, s_lshl_b32(b, )) -> s_lshl_add_u32(a, b) */ +bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) +{ + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + for (unsigned i = 0; i < 2; i++) { + Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 || !op2_instr->operands[1].isConstant()) + continue; + + uint32_t shift = op2_instr->operands[1].constantValue(); + if (shift < 1 || shift > 4) + continue; + + if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() && + instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue()) + continue; + + ctx.uses[instr->operands[i].tempId()]--; + instr->operands[1] = instr->operands[!i]; + instr->operands[0] = op2_instr->operands[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32, + aco_opcode::s_lshl2_add_u32, + aco_opcode::s_lshl3_add_u32, + aco_opcode::s_lshl4_add_u32})[shift - 1]; + + return true; + } + return false; +} + +bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only) +{ + switch (op) { + #define MINMAX(type, gfx9) \ + case aco_opcode::v_min_##type:\ + case aco_opcode::v_max_##type:\ + case aco_opcode::v_med3_##type:\ + *min = aco_opcode::v_min_##type;\ + *max = aco_opcode::v_max_##type;\ + *med3 = aco_opcode::v_med3_##type;\ + *min3 = aco_opcode::v_min3_##type;\ + *max3 = aco_opcode::v_max3_##type;\ + *some_gfx9_only = gfx9;\ + return true; + MINMAX(f32, false) + MINMAX(u32, false) + MINMAX(i32, false) + MINMAX(f16, true) + MINMAX(u16, true) + MINMAX(i16, true) + #undef MINMAX + default: + return false; + } +} + +/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb + * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */ +bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, + aco_opcode min, aco_opcode max, aco_opcode med) +{ + /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's + * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if + * minVal > maxVal, which means we can always select it to a v_med3_f32 */ + aco_opcode other_op; + if (instr->opcode == min) + other_op = max; + else if (instr->opcode == max) + other_op = min; + else + return false; + + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + for (unsigned swap = 0; swap < 2; swap++) { + Operand operands[3]; + bool neg[3], abs[3], clamp; + uint8_t opsel = 0, omod = 0; + if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, + "012", operands, neg, abs, &opsel, + &clamp, &omod, NULL, NULL, NULL)) { + int const0_idx = -1, const1_idx = -1; + uint32_t const0 = 0, const1 = 0; + for (int i = 0; i < 3; i++) { + uint32_t val; + if (operands[i].isConstant()) { + val = operands[i].constantValue(); + } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal()) { + val = ctx.info[operands[i].tempId()].val; + } else { + continue; + } + if (const0_idx >= 0) { + const1_idx = i; + const1 = val; + } else { + const0_idx = i; + const0 = val; + } + } + if (const0_idx < 0 || const1_idx < 0) + continue; + + if (opsel & (1 << const0_idx)) + const0 >>= 16; + if (opsel & (1 << const1_idx)) + const1 >>= 16; + + int lower_idx = const0_idx; + switch (min) { + case aco_opcode::v_min_f32: + case aco_opcode::v_min_f16: { + float const0_f, const1_f; + if (min == aco_opcode::v_min_f32) { + memcpy(&const0_f, &const0, 4); + memcpy(&const1_f, &const1, 4); + } else { + const0_f = _mesa_half_to_float(const0); + const1_f = _mesa_half_to_float(const1); + } + if (abs[const0_idx]) const0_f = fabsf(const0_f); + if (abs[const1_idx]) const1_f = fabsf(const1_f); + if (neg[const0_idx]) const0_f = -const0_f; + if (neg[const1_idx]) const1_f = -const1_f; + lower_idx = const0_f < const1_f ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_u32: { + lower_idx = const0 < const1 ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_u16: { + lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_i32: { + int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; + int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; + lower_idx = const0_i < const1_i ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_i16: { + int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0; + int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1; + lower_idx = const0_i < const1_i ? const0_idx : const1_idx; + break; + } + default: + break; + } + int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx; + + if (instr->opcode == min) { + if (upper_idx != 0 || lower_idx == 0) + return false; + } else { + if (upper_idx == 0 || lower_idx != 0) + return false; + } + + ctx.uses[instr->operands[swap].tempId()]--; + create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + + return true; + } + } + + return false; +} + + +void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) +{ + bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || + instr->opcode == aco_opcode::v_lshrrev_b64 || + instr->opcode == aco_opcode::v_ashrrev_i64; + + /* find candidates and create the set of sgprs already read */ + unsigned sgpr_ids[2] = {0, 0}; + uint32_t operand_mask = 0; + bool has_literal = false; + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isLiteral()) + has_literal = true; + if (!instr->operands[i].isTemp()) + continue; + if (instr->operands[i].getTemp().type() == RegType::sgpr) { + if (instr->operands[i].tempId() != sgpr_ids[0]) + sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId(); + } + ssa_info& info = ctx.info[instr->operands[i].tempId()]; + if (info.is_temp() && info.temp.type() == RegType::sgpr) + operand_mask |= 1u << i; + } + unsigned max_sgprs = 1; + if (ctx.program->chip_class >= GFX10 && !is_shift64) + max_sgprs = 2; + if (has_literal) + max_sgprs--; + + unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1]; + + /* keep on applying sgprs until there is nothing left to be done */ + while (operand_mask) { + uint32_t sgpr_idx = 0; + uint32_t sgpr_info_id = 0; + uint32_t mask = operand_mask; + /* choose a sgpr */ + while (mask) { + unsigned i = u_bit_scan(&mask); + uint16_t uses = ctx.uses[instr->operands[i].tempId()]; + if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) { + sgpr_idx = i; + sgpr_info_id = instr->operands[i].tempId(); + } + } + operand_mask &= ~(1u << sgpr_idx); + + /* Applying two sgprs require making it VOP3, so don't do it unless it's + * definitively beneficial. + * TODO: this is too conservative because later the use count could be reduced to 1 */ + if (num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3()) + break; + + Temp sgpr = ctx.info[sgpr_info_id].temp; + bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1]; + if (new_sgpr && num_sgprs >= max_sgprs) + continue; + + if (sgpr_idx == 0 || instr->isVOP3()) { + instr->operands[sgpr_idx] = Operand(sgpr); + } else if (can_swap_operands(instr)) { + instr->operands[sgpr_idx] = instr->operands[0]; + instr->operands[0] = Operand(sgpr); + /* swap bits using a 4-entry LUT */ + uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf; + operand_mask = (operand_mask & ~0x3) | swapped; + } else if (can_use_VOP3(ctx, instr)) { + to_VOP3(ctx, instr); + instr->operands[sgpr_idx] = Operand(sgpr); + } else { + continue; + } + + if (new_sgpr) + sgpr_ids[num_sgprs++] = sgpr.id(); + ctx.uses[sgpr_info_id]--; + ctx.uses[sgpr.id()]++; + } +} + +bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) +{ + /* check if we could apply omod on predecessor */ + if (instr->opcode == aco_opcode::v_mul_f32) { + bool op0 = instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_omod_success(); + bool op1 = instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_omod_success(); + if (op0 || op1) { + unsigned idx = op0 ? 0 : 1; + /* omod was successfully applied */ + /* if the omod instruction is v_mad, we also have to change the original add */ + if (ctx.info[instr->operands[idx].tempId()].is_mad()) { + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get(); + if (ctx.info[instr->definitions[0].tempId()].is_clamp()) + static_cast(add_instr)->clamp = true; + add_instr->definitions[0] = instr->definitions[0]; + } + + Instruction* omod_instr = ctx.info[instr->operands[idx].tempId()].instr; + /* check if we have an additional clamp modifier */ + if (ctx.info[instr->definitions[0].tempId()].is_clamp() && ctx.uses[instr->definitions[0].tempId()] == 1 && + ctx.uses[ctx.info[instr->definitions[0].tempId()].temp.id()]) { + static_cast(omod_instr)->clamp = true; + ctx.info[instr->definitions[0].tempId()].set_clamp_success(omod_instr); + } + /* change definition ssa-id of modified instruction */ + omod_instr->definitions[0] = instr->definitions[0]; + + /* change the definition of instr to something unused, e.g. the original omod def */ + instr->definitions[0] = Definition(instr->operands[idx].getTemp()); + ctx.uses[instr->definitions[0].tempId()] = 0; + return true; + } + if (!ctx.info[instr->definitions[0].tempId()].label) { + /* in all other cases, label this instruction as option for multiply-add */ + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + } + } + + /* check if we could apply clamp on predecessor */ + if (instr->opcode == aco_opcode::v_med3_f32) { + unsigned idx = 0; + bool found_zero = false, found_one = false; + for (unsigned i = 0; i < 3; i++) + { + if (instr->operands[i].constantEquals(0)) + found_zero = true; + else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + found_one = true; + else + idx = i; + } + if (found_zero && found_one && instr->operands[idx].isTemp() && + ctx.info[instr->operands[idx].tempId()].is_clamp_success()) { + /* clamp was successfully applied */ + /* if the clamp instruction is v_mad, we also have to change the original add */ + if (ctx.info[instr->operands[idx].tempId()].is_mad()) { + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get(); + add_instr->definitions[0] = instr->definitions[0]; + } + Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr; + /* change definition ssa-id of modified instruction */ + clamp_instr->definitions[0] = instr->definitions[0]; + + /* change the definition of instr to something unused, e.g. the original omod def */ + instr->definitions[0] = Definition(instr->operands[idx].getTemp()); + ctx.uses[instr->definitions[0].tempId()] = 0; + return true; + } + } + + /* omod has no effect if denormals are enabled */ + bool can_use_omod = block.fp_mode.denorm32 == 0; + + /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ + if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && + can_use_VOP3(ctx, instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { + ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; + if (can_use_omod && def_info.is_omod2() && ctx.uses[def_info.temp.id()]) { + to_VOP3(ctx, instr); + static_cast(instr.get())->omod = 1; + def_info.set_omod_success(instr.get()); + } else if (can_use_omod && def_info.is_omod4() && ctx.uses[def_info.temp.id()]) { + to_VOP3(ctx, instr); + static_cast(instr.get())->omod = 2; + def_info.set_omod_success(instr.get()); + } else if (can_use_omod && def_info.is_omod5() && ctx.uses[def_info.temp.id()]) { + to_VOP3(ctx, instr); + static_cast(instr.get())->omod = 3; + def_info.set_omod_success(instr.get()); + } else if (def_info.is_clamp() && ctx.uses[def_info.temp.id()]) { + to_VOP3(ctx, instr); + static_cast(instr.get())->clamp = true; + def_info.set_clamp_success(instr.get()); + } + } + + return false; +} + +// TODO: we could possibly move the whole label_instruction pass to combine_instruction: +// this would mean that we'd have to fix the instruction uses while value propagation + +void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) +{ + if (instr->definitions.empty() || is_dead(ctx.uses, instr.get())) + return; + + if (instr->isVALU()) { + if (can_apply_sgprs(instr)) + apply_sgprs(ctx, instr); + if (apply_omod_clamp(ctx, block, instr)) + return; + } + + if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) { + instr->definitions[0].setHint(vcc); + } + + /* TODO: There are still some peephole optimizations that could be done: + * - abs(a - b) -> s_absdiff_i32 + * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32 + * - patterns for v_alignbit_b32 and v_alignbyte_b32 + * These aren't probably too interesting though. + * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but + * probably more useful than the previously mentioned optimizations. + * The various comparison optimizations also currently only work with 32-bit + * floats. */ + + /* neg(mul(a, b)) -> mul(neg(a), b) */ + if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) { + Temp val = ctx.info[instr->definitions[0].tempId()].temp; + + if (!ctx.info[val.id()].is_mul()) + return; + + Instruction* mul_instr = ctx.info[val.id()].instr; + + if (mul_instr->operands[0].isLiteral()) + return; + if (mul_instr->isVOP3() && static_cast(mul_instr)->clamp) + return; + + /* convert to mul(neg(a), b) */ + ctx.uses[mul_instr->definitions[0].tempId()]--; + Definition def = instr->definitions[0]; + /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */ + bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); + instr.reset(create_instruction(aco_opcode::v_mul_f32, asVOP3(Format::VOP2), 2, 1)); + instr->operands[0] = mul_instr->operands[0]; + instr->operands[1] = mul_instr->operands[1]; + instr->definitions[0] = def; + VOP3A_instruction* new_mul = static_cast(instr.get()); + if (mul_instr->isVOP3()) { + VOP3A_instruction* mul = static_cast(mul_instr); + new_mul->neg[0] = mul->neg[0] && !is_abs; + new_mul->neg[1] = mul->neg[1] && !is_abs; + new_mul->abs[0] = mul->abs[0] || is_abs; + new_mul->abs[1] = mul->abs[1] || is_abs; + new_mul->omod = mul->omod; + } + new_mul->neg[0] ^= true; + new_mul->clamp = false; + + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + return; + } + /* combine mul+add -> mad */ + else if ((instr->opcode == aco_opcode::v_add_f32 || + instr->opcode == aco_opcode::v_sub_f32 || + instr->opcode == aco_opcode::v_subrev_f32) && + block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) { + //TODO: we could use fma instead when denormals are enabled if the NIR isn't marked as precise + + uint32_t uses_src0 = UINT32_MAX; + uint32_t uses_src1 = UINT32_MAX; + Instruction* mul_instr = nullptr; + unsigned add_op_idx; + /* check if any of the operands is a multiplication */ + if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_mul()) + uses_src0 = ctx.uses[instr->operands[0].tempId()]; + if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_mul()) + uses_src1 = ctx.uses[instr->operands[1].tempId()]; + + /* find the 'best' mul instruction to combine with the add */ + if (uses_src0 < uses_src1) { + mul_instr = ctx.info[instr->operands[0].tempId()].instr; + add_op_idx = 1; + } else if (uses_src1 < uses_src0) { + mul_instr = ctx.info[instr->operands[1].tempId()].instr; + add_op_idx = 0; + } else if (uses_src0 != UINT32_MAX) { + /* tiebreaker: quite random what to pick */ + if (ctx.info[instr->operands[0].tempId()].instr->operands[0].isLiteral()) { + mul_instr = ctx.info[instr->operands[1].tempId()].instr; + add_op_idx = 0; + } else { + mul_instr = ctx.info[instr->operands[0].tempId()].instr; + add_op_idx = 1; + } + } + if (mul_instr) { + Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)}; + bool neg[3] = {false, false, false}; + bool abs[3] = {false, false, false}; + unsigned omod = 0; + bool clamp = false; + op[0] = mul_instr->operands[0]; + op[1] = mul_instr->operands[1]; + op[2] = instr->operands[add_op_idx]; + // TODO: would be better to check this before selecting a mul instr? + if (!check_vop3_operands(ctx, 3, op)) + return; + + if (mul_instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast (mul_instr); + neg[0] = vop3->neg[0]; + neg[1] = vop3->neg[1]; + abs[0] = vop3->abs[0]; + abs[1] = vop3->abs[1]; + /* we cannot use these modifiers between mul and add */ + if (vop3->clamp || vop3->omod) + return; + } + + /* convert to mad */ + ctx.uses[mul_instr->definitions[0].tempId()]--; + if (ctx.uses[mul_instr->definitions[0].tempId()]) { + if (op[0].isTemp()) + ctx.uses[op[0].tempId()]++; + if (op[1].isTemp()) + ctx.uses[op[1].tempId()]++; + } + + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast (instr.get()); + neg[2] = vop3->neg[add_op_idx]; + abs[2] = vop3->abs[add_op_idx]; + omod = vop3->omod; + clamp = vop3->clamp; + /* abs of the multiplication result */ + if (vop3->abs[1 - add_op_idx]) { + neg[0] = false; + neg[1] = false; + abs[0] = true; + abs[1] = true; + } + /* neg of the multiplication result */ + neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx]; + } + if (instr->opcode == aco_opcode::v_sub_f32) + neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; + else if (instr->opcode == aco_opcode::v_subrev_f32) + neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; + + aco_ptr mad{create_instruction(aco_opcode::v_mad_f32, Format::VOP3A, 3, 1)}; + for (unsigned i = 0; i < 3; i++) + { + mad->operands[i] = op[i]; + mad->neg[i] = neg[i]; + mad->abs[i] = abs[i]; + } + mad->omod = omod; + mad->clamp = clamp; + mad->definitions[0] = instr->definitions[0]; + + /* mark this ssa_def to be re-checked for profitability and literals */ + ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId()); + ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1); + instr.reset(mad.release()); + return; + } + } + /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */ + else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) { + for (unsigned i = 0; i < 2; i++) { + if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && + ctx.uses[instr->operands[i].tempId()] == 1 && + instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { + ctx.uses[instr->operands[i].tempId()]--; + ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++; + + aco_ptr new_instr{create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; + new_instr->operands[0] = Operand(0u); + new_instr->operands[1] = instr->operands[!i]; + new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); + new_instr->definitions[0] = instr->definitions[0]; + instr.reset(new_instr.release()); + ctx.info[instr->definitions[0].tempId()].label = 0; + return; + } + } + } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) { + if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ; + else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2); + } else if (instr->opcode == aco_opcode::v_add_u32 && ctx.program->chip_class >= GFX9) { + if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; + else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2); + } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) { + combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2); + } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) { + combine_salu_lshl_add(ctx, instr); + } else if (instr->opcode == aco_opcode::s_not_b32) { + combine_salu_not_bitwise(ctx, instr); + } else if (instr->opcode == aco_opcode::s_not_b64) { + if (combine_inverse_comparison(ctx, instr)) ; + else combine_salu_not_bitwise(ctx, instr); + } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 || + instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) { + if (combine_ordering_test(ctx, instr)) ; + else if (combine_comparison_ordering(ctx, instr)) ; + else if (combine_constant_comparison_ordering(ctx, instr)) ; + else combine_salu_n2(ctx, instr); + } else { + aco_opcode min, max, min3, max3, med3; + bool some_gfx9_only; + if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && + (!some_gfx9_only || ctx.program->chip_class >= GFX9)) { + if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ; + else combine_clamp(ctx, instr, min, max, med3); + } + } +} + +bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) +{ + switch (instr->opcode) { + case aco_opcode::s_and_b32: + case aco_opcode::s_and_b64: + instr->opcode = aco_opcode::s_and_b32; + break; + case aco_opcode::s_or_b32: + case aco_opcode::s_or_b64: + instr->opcode = aco_opcode::s_or_b32; + break; + case aco_opcode::s_xor_b32: + case aco_opcode::s_xor_b64: + instr->opcode = aco_opcode::s_absdiff_i32; + break; + default: + /* Don't transform other instructions. They are very unlikely to appear here. */ + return false; + } + + for (Operand &op : instr->operands) { + ctx.uses[op.tempId()]--; + + if (ctx.info[op.tempId()].is_uniform_bool()) { + /* Just use the uniform boolean temp. */ + op.setTemp(ctx.info[op.tempId()].temp); + } else if (ctx.info[op.tempId()].is_uniform_bitwise()) { + /* Use the SCC definition of the predecessor instruction. + * This allows the predecessor to get picked up by the same optimization (if it has no divergent users), + * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed. + */ + Instruction *pred_instr = ctx.info[op.tempId()].instr; + assert(pred_instr->definitions.size() >= 2); + assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc); + op.setTemp(pred_instr->definitions[1].getTemp()); + } else { + unreachable("Invalid operand on uniform bitwise instruction."); + } + + ctx.uses[op.tempId()]++; + } + + instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1)); + assert(instr->operands[0].regClass() == s1); + assert(instr->operands[1].regClass() == s1); + return true; +} + +void select_instruction(opt_ctx &ctx, aco_ptr& instr) +{ + const uint32_t threshold = 4; + + if (is_dead(ctx.uses, instr.get())) { + instr.reset(); + return; + } + + /* convert split_vector into a copy or extract_vector if only one definition is ever used */ + if (instr->opcode == aco_opcode::p_split_vector) { + unsigned num_used = 0; + unsigned idx = 0; + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ctx.uses[instr->definitions[i].tempId()]) { + num_used++; + idx = i; + } + } + bool done = false; + if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() && + ctx.uses[instr->operands[0].tempId()] == 1) { + Instruction *vec = ctx.info[instr->operands[0].tempId()].instr; + + unsigned off = 0; + Operand op; + for (Operand& vec_op : vec->operands) { + if (off == idx * instr->definitions[0].size()) { + op = vec_op; + break; + } + off += vec_op.size(); + } + if (off != instr->operands[0].size()) { + ctx.uses[instr->operands[0].tempId()]--; + for (Operand& vec_op : vec->operands) { + if (vec_op.isTemp()) + ctx.uses[vec_op.tempId()]--; + } + if (op.isTemp()) + ctx.uses[op.tempId()]++; + + aco_ptr extract{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; + extract->operands[0] = op; + extract->definitions[0] = instr->definitions[idx]; + instr.reset(extract.release()); + + done = true; + } + } + + if (!done && num_used == 1) { + aco_ptr extract{create_instruction(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; + extract->operands[0] = instr->operands[0]; + extract->operands[1] = Operand((uint32_t) idx); + extract->definitions[0] = instr->definitions[idx]; + instr.reset(extract.release()); + } + } + + mad_info* mad_info = NULL; + if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + /* re-check mad instructions */ + if (ctx.uses[mad_info->mul_temp_id]) { + ctx.uses[mad_info->mul_temp_id]++; + if (instr->operands[0].isTemp()) + ctx.uses[instr->operands[0].tempId()]--; + if (instr->operands[1].isTemp()) + ctx.uses[instr->operands[1].tempId()]--; + instr.swap(mad_info->add_instr); + mad_info = NULL; + } + /* check literals */ + else if (!instr->usesModifiers()) { + bool sgpr_used = false; + uint32_t literal_idx = 0; + uint32_t literal_uses = UINT32_MAX; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (instr->operands[i].isConstant() && i > 0) { + literal_uses = UINT32_MAX; + break; + } + if (!instr->operands[i].isTemp()) + continue; + /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */ + if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) { + if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal()) { + literal_uses = ctx.uses[instr->operands[i].tempId()]; + literal_idx = i; + } else { + literal_uses = UINT32_MAX; + } + sgpr_used = true; + /* don't break because we still need to check constants */ + } else if (!sgpr_used && + ctx.info[instr->operands[i].tempId()].is_literal() && + ctx.uses[instr->operands[i].tempId()] < literal_uses) { + literal_uses = ctx.uses[instr->operands[i].tempId()]; + literal_idx = i; + } + } + if (literal_uses < threshold) { + ctx.uses[instr->operands[literal_idx].tempId()]--; + mad_info->check_literal = true; + mad_info->literal_idx = literal_idx; + return; + } + } + } + + /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */ + if (instr->format == Format::PSEUDO_BRANCH && + instr->operands.size() && + instr->operands[0].isTemp()) { + ctx.info[instr->operands[0].tempId()].set_scc_needed(); + return; + } else if ((instr->opcode == aco_opcode::s_cselect_b64 || + instr->opcode == aco_opcode::s_cselect_b32) && + instr->operands[2].isTemp()) { + ctx.info[instr->operands[2].tempId()].set_scc_needed(); + } + + /* check for literals */ + if (!instr->isSALU() && !instr->isVALU()) + return; + + /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */ + if (instr->definitions.size() && + ctx.uses[instr->definitions[0].tempId()] == 0 && + ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) { + bool transform_done = to_uniform_bool_instr(ctx, instr); + + if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) { + /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */ + uint32_t def0_id = instr->definitions[0].getTemp().id(); + uint32_t def1_id = instr->definitions[1].getTemp().id(); + instr->definitions[0].setTemp(Temp(def1_id, s1)); + instr->definitions[1].setTemp(Temp(def0_id, s1)); + } + + return; + } + + if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10)) + return; /* some encodings can't ever take literals */ + + /* we do not apply the literals yet as we don't know if it is profitable */ + Operand current_literal(s1); + + unsigned literal_id = 0; + unsigned literal_uses = UINT32_MAX; + Operand literal(s1); + unsigned num_operands = 1; + if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr))) + num_operands = instr->operands.size(); + /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */ + else if (instr->isVALU() && instr->operands.size() >= 3) + return; + + unsigned sgpr_ids[2] = {0, 0}; + bool is_literal_sgpr = false; + uint32_t mask = 0; + + /* choose a literal to apply */ + for (unsigned i = 0; i < num_operands; i++) { + Operand op = instr->operands[i]; + + if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr && + op.tempId() != sgpr_ids[0]) + sgpr_ids[!!sgpr_ids[0]] = op.tempId(); + + if (op.isLiteral()) { + current_literal = op; + continue; + } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal()) { + continue; + } + + if (!alu_can_accept_constant(instr->opcode, i)) + continue; + + if (ctx.uses[op.tempId()] < literal_uses) { + is_literal_sgpr = op.getTemp().type() == RegType::sgpr; + mask = 0; + literal = Operand(ctx.info[op.tempId()].val); + literal_uses = ctx.uses[op.tempId()]; + literal_id = op.tempId(); + } + + mask |= (op.tempId() == literal_id) << i; + } + + + /* don't go over the constant bus limit */ + bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || + instr->opcode == aco_opcode::v_lshrrev_b64 || + instr->opcode == aco_opcode::v_ashrrev_i64; + unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX; + if (ctx.program->chip_class >= GFX10 && !is_shift64) + const_bus_limit = 2; + + unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1]; + if (num_sgprs == const_bus_limit && !is_literal_sgpr) + return; + + if (literal_id && literal_uses < threshold && + (current_literal.isUndefined() || + (current_literal.size() == literal.size() && + current_literal.constantValue() == literal.constantValue()))) { + /* mark the literal to be applied */ + while (mask) { + unsigned i = u_bit_scan(&mask); + if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id) + ctx.uses[instr->operands[i].tempId()]--; + } + } +} + + +void apply_literals(opt_ctx &ctx, aco_ptr& instr) +{ + /* Cleanup Dead Instructions */ + if (!instr) + return; + + /* apply literals on MAD */ + if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + if (info->check_literal && ctx.uses[instr->operands[info->literal_idx].tempId()] == 0) { + aco_ptr new_mad; + if (info->literal_idx == 2) { /* add literal -> madak */ + new_mad.reset(create_instruction(aco_opcode::v_madak_f32, Format::VOP2, 3, 1)); + new_mad->operands[0] = instr->operands[0]; + new_mad->operands[1] = instr->operands[1]; + } else { /* mul literal -> madmk */ + new_mad.reset(create_instruction(aco_opcode::v_madmk_f32, Format::VOP2, 3, 1)); + new_mad->operands[0] = instr->operands[1 - info->literal_idx]; + new_mad->operands[1] = instr->operands[2]; + } + new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val); + new_mad->definitions[0] = instr->definitions[0]; + ctx.instructions.emplace_back(std::move(new_mad)); + return; + } + } + + /* apply literals on other SALU/VALU */ + if (instr->isSALU() || instr->isVALU()) { + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand op = instr->operands[i]; + if (op.isTemp() && ctx.info[op.tempId()].is_literal() && ctx.uses[op.tempId()] == 0) { + Operand literal(ctx.info[op.tempId()].val); + if (instr->isVALU() && i > 0) + to_VOP3(ctx, instr); + instr->operands[i] = literal; + } + } + } + + ctx.instructions.emplace_back(std::move(instr)); +} + + +void optimize(Program* program) +{ + opt_ctx ctx; + ctx.program = program; + std::vector info(program->peekAllocationId()); + ctx.info = info.data(); + + /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */ + for (Block& block : program->blocks) { + for (aco_ptr& instr : block.instructions) + label_instruction(ctx, block, instr); + } + + ctx.uses = std::move(dead_code_analysis(program)); + + /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ + for (Block& block : program->blocks) { + for (aco_ptr& instr : block.instructions) + combine_instruction(ctx, block, instr); + } + + /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */ + for (std::vector::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) { + Block* block = &(*it); + for (std::vector>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it) + select_instruction(ctx, *it); + } + + /* 4. Add literals to instructions */ + for (Block& block : program->blocks) { + ctx.instructions.clear(); + for (aco_ptr& instr : block.instructions) + apply_literals(ctx, instr); + block.instructions.swap(ctx.instructions); + } + +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_opt_value_numbering.cpp mesa-20.0.8/src/amd/compiler/aco_opt_value_numbering.cpp --- mesa-19.2.8/src/amd/compiler/aco_opt_value_numbering.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_opt_value_numbering.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,409 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include +#include +#include "aco_ir.h" + +/* + * Implements the algorithm for dominator-tree value numbering + * from "Value Numbering" by Briggs, Cooper, and Simpson. + */ + +namespace aco { +namespace { + +struct InstrHash { + std::size_t operator()(Instruction* instr) const + { + uint64_t hash = (uint64_t) instr->opcode + (uint64_t) instr->format; + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand op = instr->operands[i]; + uint64_t val = op.isTemp() ? op.tempId() : op.isFixed() ? op.physReg() : op.constantValue(); + hash |= val << (i+1) * 8; + } + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast(instr); + for (unsigned i = 0; i < 3; i++) { + hash ^= vop3->abs[i] << (i*3 + 0); + hash ^= vop3->neg[i] << (i*3 + 2); + } + hash ^= vop3->opsel * 13; + hash ^= (vop3->clamp << 28) * 13; + hash += vop3->omod << 19; + } + switch (instr->format) { + case Format::SMEM: + break; + case Format::VINTRP: { + Interp_instruction* interp = static_cast(instr); + hash ^= interp->attribute << 13; + hash ^= interp->component << 27; + break; + } + case Format::DS: + break; + default: + break; + } + + return hash; + } +}; + +struct InstrPred { + bool operator()(Instruction* a, Instruction* b) const + { + if (a->format != b->format) + return false; + if (a->opcode != b->opcode) + return false; + if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size()) + return false; /* possible with pseudo-instructions */ + for (unsigned i = 0; i < a->operands.size(); i++) { + if (a->operands[i].isConstant()) { + if (!b->operands[i].isConstant()) + return false; + if (a->operands[i].constantValue() != b->operands[i].constantValue()) + return false; + } + else if (a->operands[i].isTemp()) { + if (!b->operands[i].isTemp()) + return false; + if (a->operands[i].tempId() != b->operands[i].tempId()) + return false; + } + else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) + return false; + if (a->operands[i].isFixed()) { + if (!b->operands[i].isFixed()) + return false; + if (a->operands[i].physReg() != b->operands[i].physReg()) + return false; + if (a->operands[i].physReg() == exec && a->pass_flags != b->pass_flags) + return false; + } + } + for (unsigned i = 0; i < a->definitions.size(); i++) { + if (a->definitions[i].isTemp()) { + if (!b->definitions[i].isTemp()) + return false; + if (a->definitions[i].regClass() != b->definitions[i].regClass()) + return false; + } + if (a->definitions[i].isFixed()) { + if (!b->definitions[i].isFixed()) + return false; + if (a->definitions[i].physReg() != b->definitions[i].physReg()) + return false; + if (a->definitions[i].physReg() == exec) + return false; + } + } + + if (a->opcode == aco_opcode::v_readfirstlane_b32) + return a->pass_flags == b->pass_flags; + + /* The results of VOPC depend on the exec mask if used for subgroup operations. */ + if ((uint32_t) a->format & (uint32_t) Format::VOPC && a->pass_flags != b->pass_flags) + return false; + + if (a->isVOP3()) { + VOP3A_instruction* a3 = static_cast(a); + VOP3A_instruction* b3 = static_cast(b); + for (unsigned i = 0; i < 3; i++) { + if (a3->abs[i] != b3->abs[i] || + a3->neg[i] != b3->neg[i]) + return false; + } + return a3->clamp == b3->clamp && + a3->omod == b3->omod && + a3->opsel == b3->opsel; + } + if (a->isDPP()) { + DPP_instruction* aDPP = static_cast(a); + DPP_instruction* bDPP = static_cast(b); + return aDPP->pass_flags == bDPP->pass_flags && + aDPP->dpp_ctrl == bDPP->dpp_ctrl && + aDPP->bank_mask == bDPP->bank_mask && + aDPP->row_mask == bDPP->row_mask && + aDPP->bound_ctrl == bDPP->bound_ctrl && + aDPP->abs[0] == bDPP->abs[0] && + aDPP->abs[1] == bDPP->abs[1] && + aDPP->neg[0] == bDPP->neg[0] && + aDPP->neg[1] == bDPP->neg[1]; + } + + switch (a->format) { + case Format::SOPK: { + SOPK_instruction* aK = static_cast(a); + SOPK_instruction* bK = static_cast(b); + return aK->imm == bK->imm; + } + case Format::SMEM: { + SMEM_instruction* aS = static_cast(a); + SMEM_instruction* bS = static_cast(b); + return aS->can_reorder && bS->can_reorder && + aS->glc == bS->glc && aS->nv == bS->nv; + } + case Format::VINTRP: { + Interp_instruction* aI = static_cast(a); + Interp_instruction* bI = static_cast(b); + if (aI->attribute != bI->attribute) + return false; + if (aI->component != bI->component) + return false; + return true; + } + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction *aR = static_cast(a); + Pseudo_reduction_instruction *bR = static_cast(b); + return aR->pass_flags == bR->pass_flags && + aR->reduce_op == bR->reduce_op && + aR->cluster_size == bR->cluster_size; + } + case Format::MTBUF: { + MTBUF_instruction* aM = static_cast(a); + MTBUF_instruction* bM = static_cast(b); + return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && + aM->dfmt == bM->dfmt && + aM->nfmt == bM->nfmt && + aM->offset == bM->offset && + aM->offen == bM->offen && + aM->idxen == bM->idxen && + aM->glc == bM->glc && + aM->dlc == bM->dlc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->disable_wqm == bM->disable_wqm; + } + case Format::MUBUF: { + MUBUF_instruction* aM = static_cast(a); + MUBUF_instruction* bM = static_cast(b); + return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && + aM->offset == bM->offset && + aM->offen == bM->offen && + aM->idxen == bM->idxen && + aM->glc == bM->glc && + aM->dlc == bM->dlc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->lds == bM->lds && + aM->disable_wqm == bM->disable_wqm; + } + /* we want to optimize these in NIR and don't hassle with load-store dependencies */ + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: + return false; + case Format::DS: { + if (a->opcode != aco_opcode::ds_bpermute_b32 && + a->opcode != aco_opcode::ds_permute_b32 && + a->opcode != aco_opcode::ds_swizzle_b32) + return false; + DS_instruction* aD = static_cast(a); + DS_instruction* bD = static_cast(b); + return aD->pass_flags == bD->pass_flags && + aD->gds == bD->gds && + aD->offset0 == bD->offset0 && + aD->offset1 == bD->offset1; + } + case Format::MIMG: { + MIMG_instruction* aM = static_cast(a); + MIMG_instruction* bM = static_cast(b); + return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && + aM->dmask == bM->dmask && + aM->unrm == bM->unrm && + aM->glc == bM->glc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->da == bM->da && + aM->lwe == bM->lwe && + aM->r128 == bM->r128 && + aM->a16 == bM->a16 && + aM->d16 == bM->d16 && + aM->disable_wqm == bM->disable_wqm; + } + default: + return true; + } + } +}; + +using expr_set = std::unordered_map; + +struct vn_ctx { + Program* program; + expr_set expr_values; + std::map renames; + + /* The exec id should be the same on the same level of control flow depth. + * Together with the check for dominator relations, it is safe to assume + * that the same exec_id also means the same execution mask. + * Discards increment the exec_id, so that it won't return to the previous value. + */ + uint32_t exec_id = 1; + + vn_ctx(Program* program) : program(program) {} +}; + + +/* dominates() returns true if the parent block dominates the child block and + * if the parent block is part of the same loop or has a smaller loop nest depth. + */ +bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) +{ + unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth; + while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth) + child = ctx.program->blocks[child].logical_idom; + + return parent == child; +} + +void process_block(vn_ctx& ctx, Block& block) +{ + std::vector> new_instructions; + new_instructions.reserve(block.instructions.size()); + + for (aco_ptr& instr : block.instructions) { + /* first, rename operands */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + auto it = ctx.renames.find(op.tempId()); + if (it != ctx.renames.end()) + op.setTemp(it->second); + } + + if (instr->opcode == aco_opcode::p_discard_if || + instr->opcode == aco_opcode::p_demote_to_helper) + ctx.exec_id++; + + if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) { + new_instructions.emplace_back(std::move(instr)); + continue; + } + + /* simple copy-propagation through renaming */ + if ((instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32) && + !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && instr->operands[0].regClass() == instr->definitions[0].regClass() && + !instr->isDPP() && !((int)instr->format & (int)Format::SDWA)) { + ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); + } + + instr->pass_flags = ctx.exec_id; + std::pair res = ctx.expr_values.emplace(instr.get(), block.index); + + /* if there was already an expression with the same value number */ + if (!res.second) { + Instruction* orig_instr = res.first->first; + assert(instr->definitions.size() == orig_instr->definitions.size()); + /* check if the original instruction dominates the current one */ + if (dominates(ctx, res.first->second, block.index) && + ctx.program->blocks[res.first->second].fp_mode.canReplace(block.fp_mode)) { + for (unsigned i = 0; i < instr->definitions.size(); i++) { + assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); + assert(instr->definitions[i].isTemp()); + ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); + } + } else { + ctx.expr_values.erase(res.first); + ctx.expr_values.emplace(instr.get(), block.index); + new_instructions.emplace_back(std::move(instr)); + } + } else { + new_instructions.emplace_back(std::move(instr)); + } + } + + block.instructions = std::move(new_instructions); +} + +void rename_phi_operands(Block& block, std::map& renames) +{ + for (aco_ptr& phi : block.instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + + for (Operand& op : phi->operands) { + if (!op.isTemp()) + continue; + auto it = renames.find(op.tempId()); + if (it != renames.end()) + op.setTemp(it->second); + } + } +} +} /* end namespace */ + + +void value_numbering(Program* program) +{ + vn_ctx ctx(program); + std::vector loop_headers; + + for (Block& block : program->blocks) { + assert(ctx.exec_id > 0); + /* decrement exec_id when leaving nested control flow */ + if (block.kind & block_kind_loop_header) + loop_headers.push_back(block.index); + if (block.kind & block_kind_merge) { + ctx.exec_id--; + } else if (block.kind & block_kind_loop_exit) { + ctx.exec_id -= program->blocks[loop_headers.back()].linear_preds.size(); + ctx.exec_id -= block.linear_preds.size(); + loop_headers.pop_back(); + } + + if (block.logical_idom != -1) + process_block(ctx, block); + else + rename_phi_operands(block, ctx.renames); + + /* increment exec_id when entering nested control flow */ + if (block.kind & block_kind_branch || + block.kind & block_kind_loop_preheader || + block.kind & block_kind_break || + block.kind & block_kind_continue || + block.kind & block_kind_discard) + ctx.exec_id++; + else if (block.kind & block_kind_continue_or_break) + ctx.exec_id += 2; + } + + /* rename loop header phi operands */ + for (Block& block : program->blocks) { + if (block.kind & block_kind_loop_header) + rename_phi_operands(block, ctx.renames); + } +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_print_asm.cpp mesa-20.0.8/src/amd/compiler/aco_print_asm.cpp --- mesa-19.2.8/src/amd/compiler/aco_print_asm.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_print_asm.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,190 @@ +#include +#include +#include "aco_ir.h" +#include "llvm-c/Disassembler.h" +#include "ac_llvm_util.h" + +#include + +namespace aco { + +/* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm + * for GFX6-GFX7 if found on the system, this is better than nothing. +*/ +void print_asm_gfx6_gfx7(Program *program, std::vector& binary, + std::ostream& out) +{ + char path[] = "/tmp/fileXXXXXX"; + char line[2048], command[128]; + const char *gpu_type; + FILE *p; + int fd; + + /* Dump the binary into a temporary file. */ + fd = mkstemp(path); + if (fd < 0) + return; + + for (uint32_t w : binary) + { + if (write(fd, &w, sizeof(w)) == -1) + goto fail; + } + + /* Determine the GPU type for CLRXdisasm. Use the family for GFX6 chips + * because it doesn't allow to use gfx600 directly. + */ + switch (program->chip_class) { + case GFX6: + switch (program->family) { + case CHIP_TAHITI: + gpu_type = "tahiti"; + break; + case CHIP_PITCAIRN: + gpu_type = "pitcairn"; + break; + case CHIP_VERDE: + gpu_type = "capeverde"; + break; + case CHIP_OLAND: + gpu_type = "oland"; + break; + case CHIP_HAINAN: + gpu_type = "hainan"; + break; + default: + unreachable("Invalid GFX6 family!"); + } + break; + case GFX7: + gpu_type = "gfx700"; + break; + default: + unreachable("Invalid chip class!"); + } + + sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path); + + p = popen(command, "r"); + if (p) { + while (fgets(line, sizeof(line), p)) + out << line; + pclose(p); + } + +fail: + close(fd); + unlink(path); +} + +void print_asm(Program *program, std::vector& binary, + unsigned exec_size, std::ostream& out) +{ + if (program->chip_class <= GFX7) { + print_asm_gfx6_gfx7(program, binary, out); + return; + } + + std::vector referenced_blocks(program->blocks.size()); + referenced_blocks[0] = true; + for (Block& block : program->blocks) { + for (unsigned succ : block.linear_succs) + referenced_blocks[succ] = true; + } + + std::vector> symbols; + std::vector> block_names; + block_names.reserve(program->blocks.size()); + for (Block& block : program->blocks) { + if (!referenced_blocks[block.index]) + continue; + std::array name; + sprintf(name.data(), "BB%u", block.index); + block_names.push_back(name); + symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0); + } + + const char *features = ""; + if (program->chip_class >= GFX10 && program->wave_size == 64) { + features = "+wavefrontsize64"; + } + + LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", + ac_get_llvm_processor_name(program->family), + features, + &symbols, 0, NULL, NULL); + + char outline[1024]; + size_t pos = 0; + bool invalid = false; + unsigned next_block = 0; + while (pos < exec_size) { + while (next_block < program->blocks.size() && pos == program->blocks[next_block].offset) { + if (referenced_blocks[next_block]) + out << "BB" << std::dec << next_block << ":" << std::endl; + next_block++; + } + + /* mask out src2 on v_writelane_b32 */ + if (((program->chip_class == GFX8 || program->chip_class == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) || + (program->chip_class == GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) { + binary[pos+1] = binary[pos+1] & 0xF803FFFF; + } + + size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos], + (exec_size - pos) * sizeof(uint32_t), pos * 4, + outline, sizeof(outline)); + + size_t new_pos; + const int align_width = 60; + if (!l && program->chip_class == GFX9 && ((binary[pos] & 0xffff8000) == 0xd1348000)) { /* not actually an invalid instruction */ + out << std::left << std::setw(align_width) << std::setfill(' ') << "\tv_add_u32_e64 + clamp"; + new_pos = pos + 2; + } else if (!l) { + out << std::left << std::setw(align_width) << std::setfill(' ') << "(invalid instruction)"; + new_pos = pos + 1; + invalid = true; + } else { + out << std::left << std::setw(align_width) << std::setfill(' ') << outline; + assert(l % 4 == 0); + new_pos = pos + l / 4; + } + out << std::right; + + out << " ;"; + for (; pos < new_pos; pos++) + out << " " << std::setfill('0') << std::setw(8) << std::hex << binary[pos]; + out << std::endl; + } + out << std::setfill(' ') << std::setw(0) << std::dec; + assert(next_block == program->blocks.size()); + + LLVMDisasmDispose(disasm); + + if (program->constant_data.size()) { + out << std::endl << "/* constant data */" << std::endl; + for (unsigned i = 0; i < program->constant_data.size(); i += 32) { + out << '[' << std::setw(6) << std::setfill('0') << std::dec << i << ']'; + unsigned line_size = std::min(program->constant_data.size() - i, 32); + for (unsigned j = 0; j < line_size; j += 4) { + unsigned size = std::min(program->constant_data.size() - (i + j), 4); + uint32_t v = 0; + memcpy(&v, &program->constant_data[i + j], size); + out << " " << std::setw(8) << std::setfill('0') << std::hex << v; + } + out << std::endl; + } + } + + out << std::setfill(' ') << std::setw(0) << std::dec; + + if (invalid) { + /* Invalid instructions usually lead to GPU hangs, which can make + * getting the actual invalid instruction hard. Abort here so that we + * can find the problem. + */ + abort(); + } +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_print_ir.cpp mesa-20.0.8/src/amd/compiler/aco_print_ir.cpp --- mesa-19.2.8/src/amd/compiler/aco_print_ir.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_print_ir.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,658 @@ +#include "aco_ir.h" +#include "aco_builder.h" + +#include "sid.h" +#include "ac_shader_util.h" + +namespace aco { + +static const char *reduce_ops[] = { + [iadd32] = "iadd32", + [iadd64] = "iadd64", + [imul32] = "imul32", + [imul64] = "imul64", + [fadd32] = "fadd32", + [fadd64] = "fadd64", + [fmul32] = "fmul32", + [fmul64] = "fmul64", + [imin32] = "imin32", + [imin64] = "imin64", + [imax32] = "imax32", + [imax64] = "imax64", + [umin32] = "umin32", + [umin64] = "umin64", + [umax32] = "umax32", + [umax64] = "umax64", + [fmin32] = "fmin32", + [fmin64] = "fmin64", + [fmax32] = "fmax32", + [fmax64] = "fmax64", + [iand32] = "iand32", + [iand64] = "iand64", + [ior32] = "ior32", + [ior64] = "ior64", + [ixor32] = "ixor32", + [ixor64] = "ixor64", +}; + +static void print_reg_class(const RegClass rc, FILE *output) +{ + switch (rc) { + case RegClass::s1: fprintf(output, " s1: "); return; + case RegClass::s2: fprintf(output, " s2: "); return; + case RegClass::s3: fprintf(output, " s3: "); return; + case RegClass::s4: fprintf(output, " s4: "); return; + case RegClass::s6: fprintf(output, " s6: "); return; + case RegClass::s8: fprintf(output, " s8: "); return; + case RegClass::s16: fprintf(output, "s16: "); return; + case RegClass::v1: fprintf(output, " v1: "); return; + case RegClass::v2: fprintf(output, " v2: "); return; + case RegClass::v3: fprintf(output, " v3: "); return; + case RegClass::v4: fprintf(output, " v4: "); return; + case RegClass::v5: fprintf(output, " v5: "); return; + case RegClass::v6: fprintf(output, " v6: "); return; + case RegClass::v7: fprintf(output, " v7: "); return; + case RegClass::v8: fprintf(output, " v8: "); return; + case RegClass::v1_linear: fprintf(output, " v1: "); return; + case RegClass::v2_linear: fprintf(output, " v2: "); return; + } +} + +void print_physReg(unsigned reg, unsigned size, FILE *output) +{ + if (reg == 124) { + fprintf(output, ":m0"); + } else if (reg == 106) { + fprintf(output, ":vcc"); + } else if (reg == 253) { + fprintf(output, ":scc"); + } else if (reg == 126) { + fprintf(output, ":exec"); + } else { + bool is_vgpr = reg / 256; + reg = reg % 256; + fprintf(output, ":%c[%d", is_vgpr ? 'v' : 's', reg); + if (size > 1) + fprintf(output, "-%d]", reg + size -1); + else + fprintf(output, "]"); + } +} + +static void print_constant(uint8_t reg, FILE *output) +{ + if (reg >= 128 && reg <= 192) { + fprintf(output, "%d", reg - 128); + return; + } else if (reg >= 192 && reg <= 208) { + fprintf(output, "%d", 192 - reg); + return; + } + + switch (reg) { + case 240: + fprintf(output, "0.5"); + break; + case 241: + fprintf(output, "-0.5"); + break; + case 242: + fprintf(output, "1.0"); + break; + case 243: + fprintf(output, "-1.0"); + break; + case 244: + fprintf(output, "2.0"); + break; + case 245: + fprintf(output, "-2.0"); + break; + case 246: + fprintf(output, "4.0"); + break; + case 247: + fprintf(output, "-4.0"); + break; + case 248: + fprintf(output, "1/(2*PI)"); + break; + } +} + +static void print_operand(const Operand *operand, FILE *output) +{ + if (operand->isLiteral()) { + fprintf(output, "0x%x", operand->constantValue()); + } else if (operand->isConstant()) { + print_constant(operand->physReg().reg, output); + } else if (operand->isUndefined()) { + print_reg_class(operand->regClass(), output); + fprintf(output, "undef"); + } else { + fprintf(output, "%%%d", operand->tempId()); + + if (operand->isFixed()) + print_physReg(operand->physReg(), operand->size(), output); + } +} + +static void print_definition(const Definition *definition, FILE *output) +{ + print_reg_class(definition->regClass(), output); + fprintf(output, "%%%d", definition->tempId()); + + if (definition->isFixed()) + print_physReg(definition->physReg(), definition->size(), output); +} + +static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output) +{ + if (can_reorder) + fprintf(output, " reorder"); + + if (barrier & barrier_buffer) + fprintf(output, " buffer"); + if (barrier & barrier_image) + fprintf(output, " image"); + if (barrier & barrier_atomic) + fprintf(output, " atomic"); + if (barrier & barrier_shared) + fprintf(output, " shared"); + if (barrier & barrier_gs_data) + fprintf(output, " gs_data"); + if (barrier & barrier_gs_sendmsg) + fprintf(output, " gs_sendmsg"); +} + +static void print_instr_format_specific(struct Instruction *instr, FILE *output) +{ + switch (instr->format) { + case Format::SOPK: { + SOPK_instruction* sopk = static_cast(instr); + fprintf(output, " imm:%d", sopk->imm & 0x8000 ? (sopk->imm - 65536) : sopk->imm); + break; + } + case Format::SOPP: { + SOPP_instruction* sopp = static_cast(instr); + uint16_t imm = sopp->imm; + switch (instr->opcode) { + case aco_opcode::s_waitcnt: { + /* we usually should check the chip class for vmcnt/lgkm, but + * insert_waitcnt() should fill it in regardless. */ + unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); + if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt); + if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); + if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); + break; + } + case aco_opcode::s_endpgm: + case aco_opcode::s_endpgm_saved: + case aco_opcode::s_endpgm_ordered_ps_done: + case aco_opcode::s_wakeup: + case aco_opcode::s_barrier: + case aco_opcode::s_icache_inv: + case aco_opcode::s_ttracedata: + case aco_opcode::s_set_gpr_idx_off: { + break; + } + case aco_opcode::s_sendmsg: { + unsigned id = imm & sendmsg_id_mask; + switch (id) { + case sendmsg_none: + fprintf(output, " sendmsg(MSG_NONE)"); + break; + case _sendmsg_gs: + fprintf(output, " sendmsg(gs%s%s, %u)", + imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); + break; + case _sendmsg_gs_done: + fprintf(output, " sendmsg(gs_done%s%s, %u)", + imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); + break; + case sendmsg_save_wave: + fprintf(output, " sendmsg(save_wave)"); + break; + case sendmsg_stall_wave_gen: + fprintf(output, " sendmsg(stall_wave_gen)"); + break; + case sendmsg_halt_waves: + fprintf(output, " sendmsg(halt_waves)"); + break; + case sendmsg_ordered_ps_done: + fprintf(output, " sendmsg(ordered_ps_done)"); + break; + case sendmsg_early_prim_dealloc: + fprintf(output, " sendmsg(early_prim_dealloc)"); + break; + case sendmsg_gs_alloc_req: + fprintf(output, " sendmsg(gs_alloc_req)"); + break; + } + break; + } + default: { + if (imm) + fprintf(output, " imm:%u", imm); + break; + } + } + if (sopp->block != -1) + fprintf(output, " block:BB%d", sopp->block); + break; + } + case Format::SMEM: { + SMEM_instruction* smem = static_cast(instr); + if (smem->glc) + fprintf(output, " glc"); + if (smem->dlc) + fprintf(output, " dlc"); + if (smem->nv) + fprintf(output, " nv"); + print_barrier_reorder(smem->can_reorder, smem->barrier, output); + break; + } + case Format::VINTRP: { + Interp_instruction* vintrp = static_cast(instr); + fprintf(output, " attr%d.%c", vintrp->attribute, "xyzw"[vintrp->component]); + break; + } + case Format::DS: { + DS_instruction* ds = static_cast(instr); + if (ds->offset0) + fprintf(output, " offset0:%u", ds->offset0); + if (ds->offset1) + fprintf(output, " offset1:%u", ds->offset1); + if (ds->gds) + fprintf(output, " gds"); + break; + } + case Format::MUBUF: { + MUBUF_instruction* mubuf = static_cast(instr); + if (mubuf->offset) + fprintf(output, " offset:%u", mubuf->offset); + if (mubuf->offen) + fprintf(output, " offen"); + if (mubuf->idxen) + fprintf(output, " idxen"); + if (mubuf->addr64) + fprintf(output, " addr64"); + if (mubuf->glc) + fprintf(output, " glc"); + if (mubuf->dlc) + fprintf(output, " dlc"); + if (mubuf->slc) + fprintf(output, " slc"); + if (mubuf->tfe) + fprintf(output, " tfe"); + if (mubuf->lds) + fprintf(output, " lds"); + if (mubuf->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output); + break; + } + case Format::MIMG: { + MIMG_instruction* mimg = static_cast(instr); + unsigned identity_dmask = !instr->definitions.empty() ? + (1 << instr->definitions[0].size()) - 1 : + 0xf; + if ((mimg->dmask & identity_dmask) != identity_dmask) + fprintf(output, " dmask:%s%s%s%s", + mimg->dmask & 0x1 ? "x" : "", + mimg->dmask & 0x2 ? "y" : "", + mimg->dmask & 0x4 ? "z" : "", + mimg->dmask & 0x8 ? "w" : ""); + switch (mimg->dim) { + case ac_image_1d: + fprintf(output, " 1d"); + break; + case ac_image_2d: + fprintf(output, " 2d"); + break; + case ac_image_3d: + fprintf(output, " 3d"); + break; + case ac_image_cube: + fprintf(output, " cube"); + break; + case ac_image_1darray: + fprintf(output, " 1darray"); + break; + case ac_image_2darray: + fprintf(output, " 2darray"); + break; + case ac_image_2dmsaa: + fprintf(output, " 2dmsaa"); + break; + case ac_image_2darraymsaa: + fprintf(output, " 2darraymsaa"); + break; + } + if (mimg->unrm) + fprintf(output, " unrm"); + if (mimg->glc) + fprintf(output, " glc"); + if (mimg->dlc) + fprintf(output, " dlc"); + if (mimg->slc) + fprintf(output, " slc"); + if (mimg->tfe) + fprintf(output, " tfe"); + if (mimg->da) + fprintf(output, " da"); + if (mimg->lwe) + fprintf(output, " lwe"); + if (mimg->r128 || mimg->a16) + fprintf(output, " r128/a16"); + if (mimg->d16) + fprintf(output, " d16"); + if (mimg->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mimg->can_reorder, mimg->barrier, output); + break; + } + case Format::EXP: { + Export_instruction* exp = static_cast(instr); + unsigned identity_mask = exp->compressed ? 0x5 : 0xf; + if ((exp->enabled_mask & identity_mask) != identity_mask) + fprintf(output, " en:%c%c%c%c", + exp->enabled_mask & 0x1 ? 'r' : '*', + exp->enabled_mask & 0x2 ? 'g' : '*', + exp->enabled_mask & 0x4 ? 'b' : '*', + exp->enabled_mask & 0x8 ? 'a' : '*'); + if (exp->compressed) + fprintf(output, " compr"); + if (exp->done) + fprintf(output, " done"); + if (exp->valid_mask) + fprintf(output, " vm"); + + if (exp->dest <= V_008DFC_SQ_EXP_MRT + 7) + fprintf(output, " mrt%d", exp->dest - V_008DFC_SQ_EXP_MRT); + else if (exp->dest == V_008DFC_SQ_EXP_MRTZ) + fprintf(output, " mrtz"); + else if (exp->dest == V_008DFC_SQ_EXP_NULL) + fprintf(output, " null"); + else if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= V_008DFC_SQ_EXP_POS + 3) + fprintf(output, " pos%d", exp->dest - V_008DFC_SQ_EXP_POS); + else if (exp->dest >= V_008DFC_SQ_EXP_PARAM && exp->dest <= V_008DFC_SQ_EXP_PARAM + 31) + fprintf(output, " param%d", exp->dest - V_008DFC_SQ_EXP_PARAM); + break; + } + case Format::PSEUDO_BRANCH: { + Pseudo_branch_instruction* branch = static_cast(instr); + /* Note: BB0 cannot be a branch target */ + if (branch->target[0] != 0) + fprintf(output, " BB%d", branch->target[0]); + if (branch->target[1] != 0) + fprintf(output, ", BB%d", branch->target[1]); + break; + } + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction* reduce = static_cast(instr); + fprintf(output, " op:%s", reduce_ops[reduce->reduce_op]); + if (reduce->cluster_size) + fprintf(output, " cluster_size:%u", reduce->cluster_size); + break; + } + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: { + FLAT_instruction* flat = static_cast(instr); + if (flat->offset) + fprintf(output, " offset:%u", flat->offset); + if (flat->glc) + fprintf(output, " glc"); + if (flat->dlc) + fprintf(output, " dlc"); + if (flat->slc) + fprintf(output, " slc"); + if (flat->lds) + fprintf(output, " lds"); + if (flat->nv) + fprintf(output, " nv"); + if (flat->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(flat->can_reorder, flat->barrier, output); + break; + } + case Format::MTBUF: { + MTBUF_instruction* mtbuf = static_cast(instr); + fprintf(output, " dfmt:"); + switch (mtbuf->dfmt) { + case V_008F0C_BUF_DATA_FORMAT_8: fprintf(output, "8"); break; + case V_008F0C_BUF_DATA_FORMAT_16: fprintf(output, "16"); break; + case V_008F0C_BUF_DATA_FORMAT_8_8: fprintf(output, "8_8"); break; + case V_008F0C_BUF_DATA_FORMAT_32: fprintf(output, "32"); break; + case V_008F0C_BUF_DATA_FORMAT_16_16: fprintf(output, "16_16"); break; + case V_008F0C_BUF_DATA_FORMAT_10_11_11: fprintf(output, "10_11_11"); break; + case V_008F0C_BUF_DATA_FORMAT_11_11_10: fprintf(output, "11_11_10"); break; + case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: fprintf(output, "10_10_10_2"); break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: fprintf(output, "2_10_10_10"); break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: fprintf(output, "8_8_8_8"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32: fprintf(output, "32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: fprintf(output, "16_16_16_16"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32: fprintf(output, "32_32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: fprintf(output, "32_32_32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_RESERVED_15: fprintf(output, "reserved15"); break; + } + fprintf(output, " nfmt:"); + switch (mtbuf->nfmt) { + case V_008F0C_BUF_NUM_FORMAT_UNORM: fprintf(output, "unorm"); break; + case V_008F0C_BUF_NUM_FORMAT_SNORM: fprintf(output, "snorm"); break; + case V_008F0C_BUF_NUM_FORMAT_USCALED: fprintf(output, "uscaled"); break; + case V_008F0C_BUF_NUM_FORMAT_SSCALED: fprintf(output, "sscaled"); break; + case V_008F0C_BUF_NUM_FORMAT_UINT: fprintf(output, "uint"); break; + case V_008F0C_BUF_NUM_FORMAT_SINT: fprintf(output, "sint"); break; + case V_008F0C_BUF_NUM_FORMAT_SNORM_OGL: fprintf(output, "snorm"); break; + case V_008F0C_BUF_NUM_FORMAT_FLOAT: fprintf(output, "float"); break; + } + if (mtbuf->offset) + fprintf(output, " offset:%u", mtbuf->offset); + if (mtbuf->offen) + fprintf(output, " offen"); + if (mtbuf->idxen) + fprintf(output, " idxen"); + if (mtbuf->glc) + fprintf(output, " glc"); + if (mtbuf->dlc) + fprintf(output, " dlc"); + if (mtbuf->slc) + fprintf(output, " slc"); + if (mtbuf->tfe) + fprintf(output, " tfe"); + if (mtbuf->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output); + break; + } + default: { + break; + } + } + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast(instr); + switch (vop3->omod) { + case 1: + fprintf(output, " *2"); + break; + case 2: + fprintf(output, " *4"); + break; + case 3: + fprintf(output, " *0.5"); + break; + } + if (vop3->clamp) + fprintf(output, " clamp"); + } else if (instr->isDPP()) { + DPP_instruction* dpp = static_cast(instr); + if (dpp->dpp_ctrl <= 0xff) { + fprintf(output, " quad_perm:[%d,%d,%d,%d]", + dpp->dpp_ctrl & 0x3, (dpp->dpp_ctrl >> 2) & 0x3, + (dpp->dpp_ctrl >> 4) & 0x3, (dpp->dpp_ctrl >> 6) & 0x3); + } else if (dpp->dpp_ctrl >= 0x101 && dpp->dpp_ctrl <= 0x10f) { + fprintf(output, " row_shl:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl >= 0x111 && dpp->dpp_ctrl <= 0x11f) { + fprintf(output, " row_shr:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl >= 0x121 && dpp->dpp_ctrl <= 0x12f) { + fprintf(output, " row_ror:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl == dpp_wf_sl1) { + fprintf(output, " wave_shl:1"); + } else if (dpp->dpp_ctrl == dpp_wf_rl1) { + fprintf(output, " wave_rol:1"); + } else if (dpp->dpp_ctrl == dpp_wf_sr1) { + fprintf(output, " wave_shr:1"); + } else if (dpp->dpp_ctrl == dpp_wf_rr1) { + fprintf(output, " wave_ror:1"); + } else if (dpp->dpp_ctrl == dpp_row_mirror) { + fprintf(output, " row_mirror"); + } else if (dpp->dpp_ctrl == dpp_row_half_mirror) { + fprintf(output, " row_half_mirror"); + } else if (dpp->dpp_ctrl == dpp_row_bcast15) { + fprintf(output, " row_bcast:15"); + } else if (dpp->dpp_ctrl == dpp_row_bcast31) { + fprintf(output, " row_bcast:31"); + } else { + fprintf(output, " dpp_ctrl:0x%.3x", dpp->dpp_ctrl); + } + if (dpp->row_mask != 0xf) + fprintf(output, " row_mask:0x%.1x", dpp->row_mask); + if (dpp->bank_mask != 0xf) + fprintf(output, " bank_mask:0x%.1x", dpp->bank_mask); + if (dpp->bound_ctrl) + fprintf(output, " bound_ctrl:1"); + } else if ((int)instr->format & (int)Format::SDWA) { + fprintf(output, " (printing unimplemented)"); + } +} + +void aco_print_instr(struct Instruction *instr, FILE *output) +{ + if (!instr->definitions.empty()) { + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + print_definition(&instr->definitions[i], output); + if (i + 1 != instr->definitions.size()) + fprintf(output, ", "); + } + fprintf(output, " = "); + } + fprintf(output, "%s", instr_info.name[(int)instr->opcode]); + if (instr->operands.size()) { + bool abs[instr->operands.size()]; + bool neg[instr->operands.size()]; + if ((int)instr->format & (int)Format::VOP3A) { + VOP3A_instruction* vop3 = static_cast(instr); + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = vop3->abs[i]; + neg[i] = vop3->neg[i]; + } + } else if (instr->isDPP()) { + DPP_instruction* dpp = static_cast(instr); + assert(instr->operands.size() <= 2); + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = dpp->abs[i]; + neg[i] = dpp->neg[i]; + } + } else { + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = false; + neg[i] = false; + } + } + for (unsigned i = 0; i < instr->operands.size(); ++i) { + if (i) + fprintf(output, ", "); + else + fprintf(output, " "); + + if (neg[i]) + fprintf(output, "-"); + if (abs[i]) + fprintf(output, "|"); + print_operand(&instr->operands[i], output); + if (abs[i]) + fprintf(output, "|"); + } + } + print_instr_format_specific(instr, output); +} + +static void print_block_kind(uint16_t kind, FILE *output) +{ + if (kind & block_kind_uniform) + fprintf(output, "uniform, "); + if (kind & block_kind_top_level) + fprintf(output, "top-level, "); + if (kind & block_kind_loop_preheader) + fprintf(output, "loop-preheader, "); + if (kind & block_kind_loop_header) + fprintf(output, "loop-header, "); + if (kind & block_kind_loop_exit) + fprintf(output, "loop-exit, "); + if (kind & block_kind_continue) + fprintf(output, "continue, "); + if (kind & block_kind_break) + fprintf(output, "break, "); + if (kind & block_kind_continue_or_break) + fprintf(output, "continue_or_break, "); + if (kind & block_kind_discard) + fprintf(output, "discard, "); + if (kind & block_kind_branch) + fprintf(output, "branch, "); + if (kind & block_kind_merge) + fprintf(output, "merge, "); + if (kind & block_kind_invert) + fprintf(output, "invert, "); + if (kind & block_kind_uses_discard_if) + fprintf(output, "discard_if, "); + if (kind & block_kind_needs_lowering) + fprintf(output, "needs_lowering, "); + if (kind & block_kind_uses_demote) + fprintf(output, "uses_demote, "); +} + +void aco_print_block(const struct Block* block, FILE *output) +{ + fprintf(output, "BB%d\n", block->index); + fprintf(output, "/* logical preds: "); + for (unsigned pred : block->logical_preds) + fprintf(output, "BB%d, ", pred); + fprintf(output, "/ linear preds: "); + for (unsigned pred : block->linear_preds) + fprintf(output, "BB%d, ", pred); + fprintf(output, "/ kind: "); + print_block_kind(block->kind, output); + fprintf(output, "*/\n"); + for (auto const& instr : block->instructions) { + fprintf(output, "\t"); + aco_print_instr(instr.get(), output); + fprintf(output, "\n"); + } +} + +void aco_print_program(Program *program, FILE *output) +{ + for (Block const& block : program->blocks) + aco_print_block(&block, output); + + if (program->constant_data.size()) { + fprintf(output, "\n/* constant data */\n"); + for (unsigned i = 0; i < program->constant_data.size(); i += 32) { + fprintf(output, "[%06d] ", i); + unsigned line_size = std::min(program->constant_data.size() - i, 32); + for (unsigned j = 0; j < line_size; j += 4) { + unsigned size = std::min(program->constant_data.size() - (i + j), 4); + uint32_t v = 0; + memcpy(&v, &program->constant_data[i + j], size); + fprintf(output, " %08x", v); + } + fprintf(output, "\n"); + } + } + + fprintf(output, "\n"); +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_reduce_assign.cpp mesa-20.0.8/src/amd/compiler/aco_reduce_assign.cpp --- mesa-19.2.8/src/amd/compiler/aco_reduce_assign.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_reduce_assign.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,188 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include "aco_builder.h" + +/* + * Insert p_linear_start instructions right before RA to correctly allocate + * temporaries for reductions that have to disrespect EXEC by executing in + * WWM. + */ + +namespace aco { + +void setup_reduce_temp(Program* program) +{ + unsigned last_top_level_block_idx = 0; + unsigned maxSize = 0; + + std::vector hasReductions(program->blocks.size()); + for (Block& block : program->blocks) { + for (aco_ptr& instr : block.instructions) { + if (instr->format != Format::PSEUDO_REDUCTION) + continue; + + maxSize = MAX2(maxSize, instr->operands[0].size()); + hasReductions[block.index] = true; + } + } + + if (maxSize == 0) + return; + + assert(maxSize == 1 || maxSize == 2); + Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear()); + Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear()); + int inserted_at = -1; + int vtmp_inserted_at = -1; + bool reduceTmp_in_loop = false; + bool vtmp_in_loop = false; + + for (Block& block : program->blocks) { + + /* insert p_end_linear_vgpr after the outermost loop */ + if (reduceTmp_in_loop && block.loop_nest_depth == 0) { + assert(inserted_at == (int)last_top_level_block_idx); + + aco_ptr end{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)}; + end->operands[0] = Operand(reduceTmp); + if (vtmp_in_loop) + end->operands[1] = Operand(vtmp); + /* insert after the phis of the loop exit block */ + std::vector>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(end)); + reduceTmp_in_loop = false; + } + + if (block.kind & block_kind_top_level) + last_top_level_block_idx = block.index; + + if (!hasReductions[block.index]) + continue; + + std::vector>::iterator it; + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + Instruction *instr = (*it).get(); + if (instr->format != Format::PSEUDO_REDUCTION) + continue; + + ReduceOp op = static_cast(instr)->reduce_op; + reduceTmp_in_loop |= block.loop_nest_depth > 0; + + if ((int)last_top_level_block_idx != inserted_at) { + reduceTmp = {program->allocateId(), reduceTmp.regClass()}; + aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(reduceTmp); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + it = block.instructions.insert(it, std::move(create)); + it++; + /* inserted_at is intentionally not updated here, so later blocks + * would insert at the end instead of using this one. */ + } else { + assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + inserted_at = last_top_level_block_idx; + } + } + + if (op == gfx10_wave64_bpermute) { + instr->operands[1] = Operand(reduceTmp); + continue; + } + + /* same as before, except for the vector temporary instead of the reduce temporary */ + unsigned cluster_size = static_cast(instr)->cluster_size; + bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || + op == fmin64 || op == fmax64 || op == umin64 || + op == umax64 || op == imin64 || op == imax64 || + op == imul64; + + if (program->chip_class >= GFX10 && cluster_size == 64) + need_vtmp = true; + if (program->chip_class >= GFX10 && op == iadd64) + need_vtmp = true; + if (program->chip_class <= GFX7) + need_vtmp = true; + + need_vtmp |= cluster_size == 32; + + vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; + if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { + vtmp = {program->allocateId(), vtmp.regClass()}; + aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(vtmp); + if (last_top_level_block_idx == block.index) { + it = block.instructions.insert(it, std::move(create)); + it++; + } else { + assert(last_top_level_block_idx < block.index); + std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + vtmp_inserted_at = last_top_level_block_idx; + } + } + + instr->operands[1] = Operand(reduceTmp); + if (need_vtmp) + instr->operands[2] = Operand(vtmp); + + /* scalar temporary */ + Builder bld(program); + instr->definitions[1] = bld.def(s2); + + /* scalar identity temporary */ + bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce; + if (instr->opcode == aco_opcode::p_exclusive_scan) { + need_sitmp |= + (op == imin32 || op == imin64 || op == imax32 || op == imax64 || + op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 || + op == fmul64); + } + if (need_sitmp) { + instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size())); + } + + /* vcc clobber */ + bool clobber_vcc = false; + if ((op == iadd32 || op == imul64) && program->chip_class < GFX9) + clobber_vcc = true; + if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64) + clobber_vcc = true; + + if (clobber_vcc) + instr->definitions[4] = Definition(vcc, bld.lm); + } + } +} + +}; + diff -Nru mesa-19.2.8/src/amd/compiler/aco_register_allocation.cpp mesa-20.0.8/src/amd/compiler/aco_register_allocation.cpp --- mesa-19.2.8/src/amd/compiler/aco_register_allocation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_register_allocation.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1978 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) + * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl) + * + */ + +#include +#include +#include +#include +#include + +#include "aco_ir.h" +#include "sid.h" +#include "util/u_math.h" + +namespace aco { +namespace { + +struct ra_ctx { + std::bitset<512> war_hint; + Program* program; + std::unordered_map> assignments; + std::map orig_names; + unsigned max_used_sgpr = 0; + unsigned max_used_vgpr = 0; + std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ + + ra_ctx(Program* program) : program(program) {} +}; + + +/* helper function for debugging */ +#if 0 +void print_regs(ra_ctx& ctx, bool vgprs, std::array& reg_file) +{ + unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr; + unsigned lb = vgprs ? 256 : 0; + unsigned ub = lb + max; + char reg_char = vgprs ? 'v' : 's'; + + /* print markers */ + printf(" "); + for (unsigned i = lb; i < ub; i += 3) { + printf("%.2u ", i - lb); + } + printf("\n"); + + /* print usage */ + printf("%cgprs: ", reg_char); + unsigned free_regs = 0; + unsigned prev = 0; + bool char_select = false; + for (unsigned i = lb; i < ub; i++) { + if (reg_file[i] == 0xFFFF) { + printf("~"); + } else if (reg_file[i]) { + if (reg_file[i] != prev) { + prev = reg_file[i]; + char_select = !char_select; + } + printf(char_select ? "#" : "@"); + } else { + free_regs++; + printf("."); + } + } + printf("\n"); + + printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max); + + /* print assignments */ + prev = 0; + unsigned size = 0; + for (unsigned i = lb; i < ub; i++) { + if (reg_file[i] != prev) { + if (prev && size > 1) + printf("-%d]\n", i - 1 - lb); + else if (prev) + printf("]\n"); + prev = reg_file[i]; + if (prev && prev != 0xFFFF) { + if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i]) + printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb); + else + printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb); + } + size = 1; + } else { + size++; + } + } + if (prev && size > 1) + printf("-%d]\n", ub - lb - 1); + else if (prev) + printf("]\n"); +} +#endif + + +void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) +{ + unsigned max_addressible_sgpr = ctx.program->sgpr_limit; + unsigned size = rc.size(); + if (rc.type() == RegType::vgpr) { + assert(reg >= 256); + unsigned hi = reg - 256 + size - 1; + ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); + } else if (reg + rc.size() <= max_addressible_sgpr) { + unsigned hi = reg + size - 1; + ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); + } +} + + +void update_renames(ra_ctx& ctx, std::array& reg_file, + std::vector>& parallelcopies, + aco_ptr& instr) +{ + /* allocate id's and rename operands: this is done transparently here */ + for (std::pair& copy : parallelcopies) { + /* the definitions with id are not from this function and already handled */ + if (copy.second.isTemp()) + continue; + + /* check if we we moved another parallelcopy definition */ + for (std::pair& other : parallelcopies) { + if (!other.second.isTemp()) + continue; + if (copy.first.getTemp() == other.second.getTemp()) { + copy.first.setTemp(other.first.getTemp()); + copy.first.setFixed(other.first.physReg()); + } + } + // FIXME: if a definition got moved, change the target location and remove the parallelcopy + copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass())); + ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()}; + for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++) + reg_file[i] = copy.second.tempId(); + /* check if we moved an operand */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.tempId() == copy.first.tempId()) { + bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKill(); + for (std::pair& pc : parallelcopies) { + PhysReg def_reg = pc.second.physReg(); + omit_renaming &= def_reg > copy.first.physReg() ? + (copy.first.physReg() + copy.first.size() <= def_reg.reg) : + (def_reg + pc.second.size() <= copy.first.physReg().reg); + } + if (omit_renaming) + continue; + op.setTemp(copy.second.getTemp()); + op.setFixed(copy.second.physReg()); + } + } + } +} + +std::pair get_reg_simple(ra_ctx& ctx, + std::array& reg_file, + uint32_t lb, uint32_t ub, + uint32_t size, uint32_t stride, + RegClass rc) +{ + /* best fit algorithm: find the smallest gap to fit in the variable */ + if (stride == 1) { + unsigned best_pos = 0xFFFF; + unsigned gap_size = 0xFFFF; + unsigned next_pos = 0xFFFF; + + for (unsigned current_reg = lb; current_reg < ub; current_reg++) { + if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) { + if (next_pos == 0xFFFF) + continue; + + /* check if the variable fits */ + if (next_pos + size > current_reg) { + next_pos = 0xFFFF; + continue; + } + + /* check if the tested gap is smaller */ + if (current_reg - next_pos < gap_size) { + best_pos = next_pos; + gap_size = current_reg - next_pos; + } + next_pos = 0xFFFF; + continue; + } + + if (next_pos == 0xFFFF) + next_pos = current_reg; + } + + /* final check */ + if (next_pos != 0xFFFF && + next_pos + size <= ub && + ub - next_pos < gap_size) { + best_pos = next_pos; + gap_size = ub - next_pos; + } + if (best_pos != 0xFFFF) { + adjust_max_used_regs(ctx, rc, best_pos); + return {PhysReg{best_pos}, true}; + } + return {{}, false}; + } + + bool found = false; + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + while (!found && reg_lo + size <= ub) { + if (reg_file[reg_lo] != 0) { + reg_lo += stride; + continue; + } + reg_hi = reg_lo + size - 1; + found = true; + for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) { + if (reg_file[reg] != 0 || ctx.war_hint[reg]) + found = false; + } + if (found) { + adjust_max_used_regs(ctx, rc, reg_lo); + return {PhysReg{reg_lo}, true}; + } + + reg_lo += stride; + } + + return {{}, false}; +} + +bool get_regs_for_copies(ra_ctx& ctx, + std::array& reg_file, + std::vector>& parallelcopies, + std::set> vars, + uint32_t lb, uint32_t ub, + aco_ptr& instr, + uint32_t def_reg_lo, + uint32_t def_reg_hi) +{ + + /* variables are sorted from small sized to large */ + /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */ + for (std::set>::reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) { + unsigned id = it->second; + std::pair var = ctx.assignments[id]; + uint32_t size = it->first; + uint32_t stride = 1; + if (var.second.type() == RegType::sgpr) { + if (size == 2) + stride = 2; + if (size > 3) + stride = 4; + } + + /* check if this is a dead operand, then we can re-use the space from the definition */ + bool is_dead_operand = false; + for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isKill() && instr->operands[i].tempId() == id) + is_dead_operand = true; + } + + std::pair res; + if (is_dead_operand) { + if (instr->opcode == aco_opcode::p_create_vector) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { + for (unsigned j = 0; j < size; j++) + assert(reg_file[def_reg_lo + offset + j] == 0); + res = {PhysReg{def_reg_lo + offset}, true}; + break; + } + } + } else { + res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second); + } + } else { + res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second); + if (!res.second) { + unsigned lb = (def_reg_hi + stride) & ~(stride - 1); + res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second); + } + } + + if (res.second) { + /* mark the area as blocked */ + for (unsigned i = res.first.reg; i < res.first + size; i++) + reg_file[i] = 0xFFFFFFFF; + /* create parallelcopy pair (without definition id) */ + Temp tmp = Temp(id, var.second); + Operand pc_op = Operand(tmp); + pc_op.setFixed(var.first); + Definition pc_def = Definition(res.first, pc_op.regClass()); + parallelcopies.emplace_back(pc_op, pc_def); + continue; + } + + unsigned best_pos = lb; + unsigned num_moves = 0xFF; + unsigned num_vars = 0; + + /* we use a sliding window to find potential positions */ + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { + if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) || + (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi))) + continue; + + /* second, check that we have at most k=num_moves elements in the window + * and no element is larger than the currently processed one */ + unsigned k = 0; + unsigned n = 0; + unsigned last_var = 0; + bool found = true; + for (unsigned j = reg_lo; found && j <= reg_hi; j++) { + if (reg_file[j] == 0 || reg_file[j] == last_var) + continue; + + /* 0xFFFF signals that this area is already blocked! */ + if (reg_file[j] == 0xFFFFFFFF || k > num_moves) { + found = false; + break; + } + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].second & (1 << 6)) { + found = false; + break; + } + bool is_kill = false; + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isKill() && op.tempId() == reg_file[j]) { + is_kill = true; + break; + } + } + if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) { + found = false; + break; + } + + k += ctx.assignments[reg_file[j]].second.size(); + last_var = reg_file[j]; + n++; + if (k > num_moves || (k == num_moves && n <= num_vars)) { + found = false; + break; + } + } + + if (found) { + best_pos = reg_lo; + num_moves = k; + num_vars = n; + } + } + + /* FIXME: we messed up and couldn't find space for the variables to be copied */ + if (num_moves == 0xFF) + return false; + + reg_lo = best_pos; + reg_hi = best_pos + size - 1; + + /* collect variables and block reg file */ + std::set> new_vars; + for (unsigned j = reg_lo; j <= reg_hi; j++) { + if (reg_file[j] != 0) { + unsigned size = ctx.assignments[reg_file[j]].second.size(); + unsigned id = reg_file[j]; + new_vars.emplace(size, id); + for (unsigned k = 0; k < size; k++) + reg_file[ctx.assignments[id].first + k] = 0; + } + } + + /* mark the area as blocked */ + for (unsigned i = reg_lo; i <= reg_hi; i++) + reg_file[i] = 0xFFFFFFFF; + + if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi)) + return false; + + adjust_max_used_regs(ctx, var.second, reg_lo); + + /* create parallelcopy pair (without definition id) */ + Temp tmp = Temp(id, var.second); + Operand pc_op = Operand(tmp); + pc_op.setFixed(var.first); + Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass()); + parallelcopies.emplace_back(pc_op, pc_def); + } + + return true; +} + + +std::pair get_reg_impl(ra_ctx& ctx, + std::array& reg_file, + std::vector>& parallelcopies, + uint32_t lb, uint32_t ub, + uint32_t size, uint32_t stride, + RegClass rc, + aco_ptr& instr) +{ + unsigned regs_free = 0; + /* check how many free regs we have */ + for (unsigned j = lb; j < ub; j++) { + if (reg_file[j] == 0) + regs_free++; + } + + /* mark and count killed operands */ + unsigned killed_ops = 0; + for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { + if (instr->operands[j].isTemp() && + instr->operands[j].isFirstKill() && + instr->operands[j].physReg() >= lb && + instr->operands[j].physReg() < ub) { + assert(instr->operands[j].isFixed()); + assert(reg_file[instr->operands[j].physReg().reg] == 0); + for (unsigned k = 0; k < instr->operands[j].size(); k++) + reg_file[instr->operands[j].physReg() + k] = 0xFFFFFFFF; + killed_ops += instr->operands[j].getTemp().size(); + } + } + + assert(regs_free >= size); + /* we might have to move dead operands to dst in order to make space */ + unsigned op_moves = 0; + + if (size > (regs_free - killed_ops)) + op_moves = size - (regs_free - killed_ops); + + /* find the best position to place the definition */ + unsigned best_pos = lb; + unsigned num_moves = 0xFF; + unsigned num_vars = 0; + + /* we use a sliding window to check potential positions */ + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { + /* first check the edges: this is what we have to fix to allow for num_moves > size */ + if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + continue; + if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + continue; + + /* second, check that we have at most k=num_moves elements in the window + * and no element is larger than the currently processed one */ + unsigned k = op_moves; + unsigned n = 0; + unsigned remaining_op_moves = op_moves; + unsigned last_var = 0; + bool found = true; + bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; + for (unsigned j = reg_lo; found && j <= reg_hi; j++) { + if (reg_file[j] == 0 || reg_file[j] == last_var) + continue; + + /* dead operands effectively reduce the number of estimated moves */ + if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) { + k--; + remaining_op_moves--; + continue; + } + + if (ctx.assignments[reg_file[j]].second.size() >= size) { + found = false; + break; + } + + + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].second & (1 << 6)) { + found = false; + break; + } + + k += ctx.assignments[reg_file[j]].second.size(); + n++; + last_var = reg_file[j]; + } + + if (!found || k > num_moves) + continue; + if (k == num_moves && n < num_vars) + continue; + if (!aligned && k == num_moves && n == num_vars) + continue; + + if (found) { + best_pos = reg_lo; + num_moves = k; + num_vars = n; + } + } + + if (num_moves == 0xFF) { + /* remove killed operands from reg_file once again */ + for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill()) { + for (unsigned k = 0; k < instr->operands[i].getTemp().size(); k++) + reg_file[instr->operands[i].physReg() + k] = 0; + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + return {{}, false}; + } + + std::array register_file = reg_file; + + /* now, we figured the placement for our definition */ + std::set> vars; + for (unsigned j = best_pos; j < best_pos + size; j++) { + if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0) + vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]); + reg_file[j] = 0; + } + + if (instr->opcode == aco_opcode::p_create_vector) { + /* move killed operands which aren't yet at the correct position */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && + instr->operands[i].getTemp().type() == rc.type()) { + + if (instr->operands[i].physReg() != best_pos + offset) { + vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = 0; + } else { + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); + } + } + } + } else { + /* re-enable the killed operands */ + for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { + if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill()) { + for (unsigned k = 0; k < instr->operands[j].getTemp().size(); k++) + reg_file[instr->operands[j].physReg() + k] = instr->operands[j].tempId(); + } + } + } + + std::vector> pc; + if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) { + reg_file = std::move(register_file); + /* remove killed operands from reg_file once again */ + if (!is_phi(instr)) { + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned k = 0; k < op.getTemp().size(); k++) + reg_file[op.physReg() + k] = 0; + } + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + return {{}, false}; + } + + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); + + /* we set the definition regs == 0. the actual caller is responsible for correct setting */ + for (unsigned i = 0; i < size; i++) + reg_file[best_pos + i] = 0; + + update_renames(ctx, reg_file, parallelcopies, instr); + + /* remove killed operands from reg_file once again */ + for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { + if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed()) + continue; + assert(!instr->operands[i].isUndefined()); + if (instr->operands[i].isFirstKill()) { + for (unsigned j = 0; j < instr->operands[i].getTemp().size(); j++) + reg_file[instr->operands[i].physReg() + j] = 0; + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + + adjust_max_used_regs(ctx, rc, best_pos); + return {PhysReg{best_pos}, true}; +} + +PhysReg get_reg(ra_ctx& ctx, + std::array& reg_file, + RegClass rc, + std::vector>& parallelcopies, + aco_ptr& instr) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + + std::pair res = {{}, false}; + /* try to find space without live-range splits */ + if (rc.type() == RegType::vgpr && (size == 4 || size == 8)) + res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc); + if (!res.second) + res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); + if (res.second) + return res.first; + + /* try to find space with live-range splits */ + res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr); + + if (res.second) + return res.first; + + unsigned regs_free = 0; + for (unsigned i = lb; i < ub; i++) { + if (!reg_file[i]) + regs_free++; + } + + /* We should only fail here because keeping under the limit would require + * too many moves. */ + assert(regs_free >= size); + + /* try using more registers */ + uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; + uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; + if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + } + + //FIXME: if nothing helps, shift-rotate the registers to make space + + unreachable("did not find a register"); +} + + +std::pair get_reg_vec(ra_ctx& ctx, + std::array& reg_file, + RegClass rc) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); +} + + +PhysReg get_reg_create_vector(ra_ctx& ctx, + std::array& reg_file, + RegClass rc, + std::vector>& parallelcopies, + aco_ptr& instr) +{ + /* create_vector instructions have different costs w.r.t. register coalescing */ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + + unsigned best_pos = -1; + unsigned num_moves = 0xFF; + bool best_war_hint = true; + + /* test for each operand which definition placement causes the least shuffle instructions */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + // TODO: think about, if we can alias live operands on the same register + if (!instr->operands[i].isTemp() || !instr->operands[i].isKill() || instr->operands[i].getTemp().type() != rc.type()) + continue; + + if (offset > instr->operands[i].physReg()) + continue; + + unsigned reg_lo = instr->operands[i].physReg() - offset; + unsigned reg_hi = reg_lo + size - 1; + unsigned k = 0; + + /* no need to check multiple times */ + if (reg_lo == best_pos) + continue; + + /* check borders */ + // TODO: this can be improved */ + if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0) + continue; + if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + continue; + if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + continue; + + /* count variables to be moved and check war_hint */ + bool war_hint = false; + bool linear_vgpr = false; + for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) { + if (reg_file[j] != 0) { + k++; + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].second & (1 << 6)) + linear_vgpr = true; + } + war_hint |= ctx.war_hint[j]; + } + if (linear_vgpr || (war_hint && !best_war_hint)) + continue; + + /* count operands in wrong positions */ + for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) { + if (j == i || + !instr->operands[j].isTemp() || + instr->operands[j].getTemp().type() != rc.type()) + continue; + if (instr->operands[j].physReg() != reg_lo + offset) + k += instr->operands[j].size(); + } + bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; + if (k > num_moves || (!aligned && k == num_moves)) + continue; + + best_pos = reg_lo; + num_moves = k; + best_war_hint = war_hint; + } + + if (num_moves >= size) + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + + /* collect variables to be moved */ + std::set> vars; + for (unsigned i = best_pos; i < best_pos + size; i++) { + if (reg_file[i] != 0) + vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]); + reg_file[i] = 0; + } + + /* move killed operands which aren't yet at the correct position */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) { + if (instr->operands[i].physReg() != best_pos + offset) { + vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); + } else { + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); + } + } + } + + ASSERTED bool success = false; + success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1); + assert(success); + + update_renames(ctx, reg_file, parallelcopies, instr); + adjust_max_used_regs(ctx, rc, best_pos); + return PhysReg{best_pos}; +} + +bool get_reg_specified(ra_ctx& ctx, + std::array& reg_file, + RegClass rc, + std::vector>& parallelcopies, + aco_ptr& instr, + PhysReg reg) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + if (reg % stride != 0) + return false; + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + } + + uint32_t reg_lo = reg.reg; + uint32_t reg_hi = reg + (size - 1); + + if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi) + return false; + + for (unsigned i = reg_lo; i <= reg_hi; i++) { + if (reg_file[i] != 0) + return false; + } + adjust_max_used_regs(ctx, rc, reg_lo); + return true; +} + +void handle_pseudo(ra_ctx& ctx, + const std::array& reg_file, + Instruction* instr) +{ + if (instr->format != Format::PSEUDO) + return; + + /* all instructions which use handle_operands() need this information */ + switch (instr->opcode) { + case aco_opcode::p_extract_vector: + case aco_opcode::p_create_vector: + case aco_opcode::p_split_vector: + case aco_opcode::p_parallelcopy: + case aco_opcode::p_wqm: + break; + default: + return; + } + + /* if all definitions are vgpr, no need to care for SCC */ + bool writes_sgpr = false; + for (Definition& def : instr->definitions) { + if (def.getTemp().type() == RegType::sgpr) { + writes_sgpr = true; + break; + } + } + /* if all operands are constant, no need to care either */ + bool reads_sgpr = false; + for (Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { + reads_sgpr = true; + break; + } + } + if (!(writes_sgpr && reads_sgpr)) + return; + + Pseudo_instruction *pi = (Pseudo_instruction *)instr; + if (reg_file[scc.reg]) { + pi->tmp_in_scc = true; + + int reg = ctx.max_used_sgpr; + for (; reg >= 0 && reg_file[reg]; reg--) + ; + if (reg < 0) { + reg = ctx.max_used_sgpr + 1; + for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++) + ; + assert(reg < ctx.program->max_reg_demand.sgpr); + } + + adjust_max_used_regs(ctx, s1, reg); + pi->scratch_sgpr = PhysReg{(unsigned)reg}; + } else { + pi->tmp_in_scc = false; + } +} + +bool operand_can_use_reg(aco_ptr& instr, unsigned idx, PhysReg reg) +{ + switch (instr->format) { + case Format::SMEM: + return reg != scc && + reg != exec && + (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ + (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */ + default: + // TODO: there are more instructions with restrictions on registers + return true; + } +} + +} /* end namespace */ + + +void register_allocation(Program *program, std::vector> live_out_per_block) +{ + ra_ctx ctx(program); + + std::vector> renames(program->blocks.size()); + + struct phi_info { + Instruction* phi; + unsigned block_idx; + std::set uses; + }; + + bool filled[program->blocks.size()]; + bool sealed[program->blocks.size()]; + memset(filled, 0, sizeof filled); + memset(sealed, 0, sizeof sealed); + std::vector> incomplete_phis(program->blocks.size()); + std::map phi_map; + std::map affinities; + std::function read_variable; + std::function handle_live_in; + std::function::iterator)> try_remove_trivial_phi; + + read_variable = [&](Temp val, unsigned block_idx) -> Temp { + std::unordered_map::iterator it = renames[block_idx].find(val.id()); + assert(it != renames[block_idx].end()); + return it->second; + }; + + handle_live_in = [&](Temp val, Block *block) -> Temp { + std::vector& preds = val.is_linear() ? block->linear_preds : block->logical_preds; + if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) { + renames[block->index][val.id()] = val; + return val; + } + assert(preds.size() > 0); + + Temp new_val; + if (!sealed[block->index]) { + /* consider rename from already processed predecessor */ + Temp tmp = read_variable(val, preds[0]); + + /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */ + new_val = Temp{program->allocateId(), val.regClass()}; + aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = Definition(new_val); + for (unsigned i = 0; i < preds.size(); i++) + phi->operands[i] = Operand(val); + if (tmp.regClass() == new_val.regClass()) + affinities[new_val.id()] = tmp.id(); + + phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); + incomplete_phis[block->index].emplace_back(phi.get()); + block->instructions.insert(block->instructions.begin(), std::move(phi)); + + } else if (preds.size() == 1) { + /* if the block has only one predecessor, just look there for the name */ + new_val = read_variable(val, preds[0]); + } else { + /* there are multiple predecessors and the block is sealed */ + Temp ops[preds.size()]; + + /* we start assuming that the name is the same from all predecessors */ + renames[block->index][val.id()] = val; + bool needs_phi = false; + + /* get the rename from each predecessor and check if they are the same */ + for (unsigned i = 0; i < preds.size(); i++) { + ops[i] = read_variable(val, preds[i]); + if (i == 0) + new_val = ops[i]; + else + needs_phi |= !(new_val == ops[i]); + } + + if (needs_phi) { + /* the variable has been renamed differently in the predecessors: we need to insert a phi */ + aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + new_val = Temp{program->allocateId(), val.regClass()}; + phi->definitions[0] = Definition(new_val); + for (unsigned i = 0; i < preds.size(); i++) { + phi->operands[i] = Operand(ops[i]); + phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first); + if (ops[i].regClass() == new_val.regClass()) + affinities[new_val.id()] = ops[i].id(); + } + phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); + block->instructions.insert(block->instructions.begin(), std::move(phi)); + } + } + + renames[block->index][val.id()] = new_val; + renames[block->index][new_val.id()] = new_val; + ctx.orig_names[new_val.id()] = val; + return new_val; + }; + + try_remove_trivial_phi = [&] (std::map::iterator info) -> Temp { + assert(info->second.block_idx != 0); + Instruction* phi = info->second.phi; + Temp same = Temp(); + + Definition def = phi->definitions[0]; + /* a phi node is trivial if all operands are the same as the definition of the phi */ + for (const Operand& op : phi->operands) { + const Temp t = op.getTemp(); + if (t == same || t == def.getTemp()) + continue; + if (!(same == Temp()) || !(op.physReg() == def.physReg())) { + /* phi is not trivial */ + return def.getTemp(); + } + same = t; + } + assert(!(same == Temp() || same == def.getTemp())); + + /* reroute all uses to same and remove phi */ + std::vector::iterator> phi_users; + std::map::iterator same_phi_info = phi_map.find(same.id()); + for (Instruction* instr : info->second.uses) { + assert(phi != instr); + /* recursively try to remove trivial phis */ + if (is_phi(instr)) { + /* ignore if the phi was already flagged trivial */ + if (instr->definitions.empty()) + continue; + + std::map::iterator it = phi_map.find(instr->definitions[0].tempId()); + if (it != phi_map.end() && it != info) + phi_users.emplace_back(it); + } + for (Operand& op : instr->operands) { + if (op.isTemp() && op.tempId() == def.tempId()) { + op.setTemp(same); + if (same_phi_info != phi_map.end()) + same_phi_info->second.uses.emplace(instr); + } + } + } + + auto it = ctx.orig_names.find(same.id()); + unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id(); + for (unsigned i = 0; i < program->blocks.size(); i++) { + auto it = renames[i].find(orig_var); + if (it != renames[i].end() && it->second == def.getTemp()) + renames[i][orig_var] = same; + } + + unsigned block_idx = info->second.block_idx; + phi->definitions.clear(); /* this indicates that the phi can be removed */ + phi_map.erase(info); + for (auto it : phi_users) { + if (sealed[it->second.block_idx]) + try_remove_trivial_phi(it); + } + + /* due to the removal of other phis, the name might have changed once again! */ + return renames[block_idx][orig_var]; + }; + + std::map vectors; + std::vector> phi_ressources; + std::map temp_to_phi_ressources; + + for (std::vector::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) { + Block& block = *it; + + /* first, compute the death points of all live vars within the block */ + std::set& live = live_out_per_block[block.index]; + + std::vector>::reverse_iterator rit; + for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { + aco_ptr& instr = *rit; + if (is_phi(instr)) { + live.erase(instr->definitions[0].getTemp()); + if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) + continue; + /* collect information about affinity-related temporaries */ + std::vector affinity_related; + /* affinity_related[0] is the last seen affinity-related temp */ + affinity_related.emplace_back(instr->definitions[0].getTemp()); + affinity_related.emplace_back(instr->definitions[0].getTemp()); + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) { + affinity_related.emplace_back(op.getTemp()); + temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); + } + } + phi_ressources.emplace_back(std::move(affinity_related)); + continue; + } + + /* add vector affinities */ + if (instr->opcode == aco_opcode::p_create_vector) { + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type()) + vectors[op.tempId()] = instr.get(); + } + } + + /* add operands to live variables */ + for (const Operand& op : instr->operands) { + if (op.isTemp()) + live.emplace(op.getTemp()); + } + + /* erase definitions from live */ + for (unsigned i = 0; i < instr->definitions.size(); i++) { + const Definition& def = instr->definitions[i]; + if (!def.isTemp()) + continue; + live.erase(def.getTemp()); + /* mark last-seen phi operand */ + std::map::iterator it = temp_to_phi_ressources.find(def.tempId()); + if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) { + phi_ressources[it->second][0] = def.getTemp(); + /* try to coalesce phi affinities with parallelcopies */ + if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) { + Operand op = instr->operands[i]; + if (op.isTemp() && op.isFirstKill() && def.regClass() == op.regClass()) { + phi_ressources[it->second].emplace_back(op.getTemp()); + temp_to_phi_ressources[op.tempId()] = it->second; + } + } + } + } + } + } + /* create affinities */ + for (std::vector& vec : phi_ressources) { + assert(vec.size() > 1); + for (unsigned i = 1; i < vec.size(); i++) + if (vec[i].id() != vec[0].id()) + affinities[vec[i].id()] = vec[0].id(); + } + + /* state of register file after phis */ + std::vector> sgpr_live_in(program->blocks.size()); + + for (Block& block : program->blocks) { + std::set& live = live_out_per_block[block.index]; + /* initialize register file */ + assert(block.index != 0 || live.empty()); + std::array register_file = {0}; + ctx.war_hint.reset(); + + for (Temp t : live) { + Temp renamed = handle_live_in(t, &block); + if (ctx.assignments.find(renamed.id()) != ctx.assignments.end()) { + for (unsigned i = 0; i < t.size(); i++) + register_file[ctx.assignments[renamed.id()].first + i] = renamed.id(); + } + } + + std::vector> instructions; + std::vector>::iterator it; + + /* this is a slight adjustment from the paper as we already have phi nodes: + * We consider them incomplete phis and only handle the definition. */ + + /* handle fixed phi definitions */ + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + aco_ptr& phi = *it; + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (!definition.isFixed()) + continue; + + /* check if a dead exec mask phi is needed */ + if (definition.isKill()) { + for (Operand& op : phi->operands) { + assert(op.isTemp()); + if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() || + ctx.assignments[op.tempId()].first != exec) { + definition.setKill(false); + break; + } + } + } + + if (definition.isKill()) + continue; + + assert(definition.physReg() == exec); + for (unsigned i = 0; i < definition.size(); i++) { + assert(register_file[definition.physReg() + i] == 0); + register_file[definition.physReg() + i] = definition.tempId(); + } + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + + /* look up the affinities */ + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + aco_ptr& phi = *it; + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (definition.isKill() || definition.isFixed()) + continue; + + if (affinities.find(definition.tempId()) != affinities.end() && + ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { + assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass()); + PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; + bool try_use_special_reg = reg == scc || reg == exec; + if (try_use_special_reg) { + for (const Operand& op : phi->operands) { + if (!op.isTemp() || + ctx.assignments.find(op.tempId()) == ctx.assignments.end() || + !(ctx.assignments[op.tempId()].first == reg)) { + try_use_special_reg = false; + break; + } + } + if (!try_use_special_reg) + continue; + } + bool reg_free = true; + for (unsigned i = reg.reg; reg_free && i < reg + definition.size(); i++) { + if (register_file[i] != 0) + reg_free = false; + } + /* only assign if register is still free */ + if (reg_free) { + definition.setFixed(reg); + for (unsigned i = 0; i < definition.size(); i++) + register_file[definition.physReg() + i] = definition.tempId(); + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + } + } + + /* find registers for phis without affinity or where the register was blocked */ + for (it = block.instructions.begin();it != block.instructions.end(); ++it) { + aco_ptr& phi = *it; + if (!is_phi(phi)) + break; + + Definition& definition = phi->definitions[0]; + if (definition.isKill()) + continue; + + renames[block.index][definition.tempId()] = definition.getTemp(); + + if (!definition.isFixed()) { + std::vector> parallelcopy; + /* try to find a register that is used by at least one operand */ + for (const Operand& op : phi->operands) { + if (!op.isTemp() || + ctx.assignments.find(op.tempId()) == ctx.assignments.end()) + continue; + PhysReg reg = ctx.assignments[op.tempId()].first; + /* we tried this already on the previous loop */ + if (reg == scc || reg == exec) + continue; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) { + definition.setFixed(reg); + break; + } + } + if (!definition.isFixed()) + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi)); + + /* process parallelcopy */ + for (std::pair pc : parallelcopy) { + /* see if it's a copy from a different phi */ + //TODO: prefer moving some previous phis over live-ins + //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec) + Instruction *prev_phi = NULL; + std::vector>::iterator phi_it; + for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { + if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) + prev_phi = phi_it->get(); + } + phi_it = it; + while (!prev_phi && is_phi(*++phi_it)) { + if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) + prev_phi = phi_it->get(); + } + if (prev_phi) { + /* if so, just update that phi's register */ + prev_phi->definitions[0].setFixed(pc.second.physReg()); + ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()}; + for (unsigned reg = pc.second.physReg(); reg < pc.second.physReg() + pc.second.size(); reg++) + register_file[reg] = prev_phi->definitions[0].tempId(); + continue; + } + + /* rename */ + std::map::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); + Temp orig = pc.first.getTemp(); + if (orig_it != ctx.orig_names.end()) + orig = orig_it->second; + else + ctx.orig_names[pc.second.tempId()] = orig; + renames[block.index][orig.id()] = pc.second.getTemp(); + renames[block.index][pc.second.tempId()] = pc.second.getTemp(); + + /* otherwise, this is a live-in and we need to create a new phi + * to move it in this block's predecessors */ + aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + std::vector& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + new_phi->definitions[0] = pc.second; + for (unsigned i = 0; i < preds.size(); i++) + new_phi->operands[i] = Operand(pc.first); + instructions.emplace_back(std::move(new_phi)); + } + + for (unsigned i = 0; i < definition.size(); i++) + register_file[definition.physReg() + i] = definition.tempId(); + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + live.emplace(definition.getTemp()); + + /* update phi affinities */ + for (const Operand& op : phi->operands) { + if (op.isTemp() && op.regClass() == phi->definitions[0].regClass()) + affinities[op.tempId()] = definition.tempId(); + } + + instructions.emplace_back(std::move(*it)); + } + + /* fill in sgpr_live_in */ + for (unsigned i = 0; i <= ctx.max_used_sgpr; i++) + sgpr_live_in[block.index][i] = register_file[i]; + sgpr_live_in[block.index][127] = register_file[scc.reg]; + + /* Handle all other instructions of the block */ + for (; it != block.instructions.end(); ++it) { + aco_ptr& instr = *it; + + /* parallelcopies from p_phi are inserted here which means + * live ranges of killed operands end here as well */ + if (instr->opcode == aco_opcode::p_logical_end) { + /* no need to process this instruction any further */ + if (block.logical_succs.size() != 1) { + instructions.emplace_back(std::move(instr)); + continue; + } + + Block& succ = program->blocks[block.logical_succs[0]]; + unsigned idx = 0; + for (; idx < succ.logical_preds.size(); idx++) { + if (succ.logical_preds[idx] == block.index) + break; + } + for (aco_ptr& phi : succ.instructions) { + if (phi->opcode == aco_opcode::p_phi) { + if (phi->operands[idx].isTemp() && + phi->operands[idx].getTemp().type() == RegType::sgpr && + phi->operands[idx].isFirstKill()) { + Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index); + PhysReg reg = ctx.assignments[phi_op.id()].first; + assert(register_file[reg] == phi_op.id()); + register_file[reg] = 0; + } + } else if (phi->opcode != aco_opcode::p_linear_phi) { + break; + } + } + instructions.emplace_back(std::move(instr)); + continue; + } + + std::vector> parallelcopy; + + assert(!is_phi(instr)); + + /* handle operands */ + for (unsigned i = 0; i < instr->operands.size(); ++i) { + auto& operand = instr->operands[i]; + if (!operand.isTemp()) + continue; + + /* rename operands */ + operand.setTemp(read_variable(operand.getTemp(), block.index)); + + /* check if the operand is fixed */ + if (operand.isFixed()) { + + if (operand.physReg() == ctx.assignments[operand.tempId()].first) { + /* we are fine: the operand is already assigned the correct reg */ + + } else { + /* check if target reg is blocked, and move away the blocking var */ + if (register_file[operand.physReg().reg]) { + uint32_t blocking_id = register_file[operand.physReg().reg]; + RegClass rc = ctx.assignments[blocking_id].second; + Operand pc_op = Operand(Temp{blocking_id, rc}); + pc_op.setFixed(operand.physReg()); + Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()}); + /* find free reg */ + PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr); + pc_def.setFixed(reg); + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + parallelcopy.emplace_back(pc_op, pc_def); + + /* handle renames of previous operands */ + for (unsigned j = 0; j < i; j++) { + Operand& op = instr->operands[j]; + if (op.isTemp() && op.tempId() == blocking_id) { + op.setTemp(pc_def.getTemp()); + op.setFixed(reg); + } + } + } + /* move operand to fixed reg and create parallelcopy pair */ + Operand pc_op = operand; + Temp tmp = Temp{program->allocateId(), operand.regClass()}; + Definition pc_def = Definition(tmp); + pc_def.setFixed(operand.physReg()); + pc_op.setFixed(ctx.assignments[operand.tempId()].first); + operand.setTemp(tmp); + ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()}; + operand.setFixed(pc_def.physReg()); + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = tmp.id(); + } + parallelcopy.emplace_back(pc_op, pc_def); + } + } else { + assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end()); + PhysReg reg = ctx.assignments[operand.tempId()].first; + + if (operand_can_use_reg(instr, i, reg)) { + operand.setFixed(ctx.assignments[operand.tempId()].first); + } else { + Operand pc_op = operand; + pc_op.setFixed(reg); + PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr); + Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass()); + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + parallelcopy.emplace_back(pc_op, pc_def); + operand.setFixed(new_reg); + } + + if (instr->format == Format::EXP || + (instr->isVMEM() && i == 3 && program->chip_class == GFX6) || + (instr->format == Format::DS && static_cast(instr.get())->gds)) { + for (unsigned j = 0; j < operand.size(); j++) + ctx.war_hint.set(operand.physReg().reg + j); + } + } + std::map::iterator phi = phi_map.find(operand.getTemp().id()); + if (phi != phi_map.end()) + phi->second.uses.emplace(instr.get()); + + } + /* remove dead vars from register file */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0; + } + + /* try to optimize v_mad_f32 -> v_mac_f32 */ + if (instr->opcode == aco_opcode::v_mad_f32 && + instr->operands[2].isTemp() && + instr->operands[2].isKill() && + instr->operands[2].getTemp().type() == RegType::vgpr && + instr->operands[1].isTemp() && + instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */ + VOP3A_instruction* vop3 = static_cast(instr.get()); + bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] || + vop3->neg[0] || vop3->neg[1] || vop3->neg[2] || + vop3->clamp || vop3->omod || vop3->opsel); + if (can_use_mac) { + instr->format = Format::VOP2; + instr->opcode = aco_opcode::v_mac_f32; + } + } + + /* handle definitions which must have the same register as an operand */ + if (instr->opcode == aco_opcode::v_interp_p2_f32 || + instr->opcode == aco_opcode::v_mac_f32 || + instr->opcode == aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64) { + instr->definitions[0].setFixed(instr->operands[2].physReg()); + } else if (instr->opcode == aco_opcode::s_addk_i32 || + instr->opcode == aco_opcode::s_mulk_i32) { + instr->definitions[0].setFixed(instr->operands[0].physReg()); + } else if (instr->format == Format::MUBUF && + instr->definitions.size() == 1 && + instr->operands.size() == 4) { + instr->definitions[0].setFixed(instr->operands[3].physReg()); + } else if (instr->format == Format::MIMG && + instr->definitions.size() == 1 && + instr->operands[1].regClass().type() == RegType::vgpr) { + instr->definitions[0].setFixed(instr->operands[1].physReg()); + } + + ctx.defs_done.reset(); + + /* handle fixed definitions first */ + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + auto& definition = instr->definitions[i]; + if (!definition.isFixed()) + continue; + + adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); + /* check if the target register is blocked */ + if (register_file[definition.physReg().reg] != 0) { + /* create parallelcopy pair to move blocking var */ + Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second}; + Operand pc_op = Operand(tmp); + pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first); + RegClass rc = pc_op.regClass(); + tmp = Temp{program->allocateId(), rc}; + Definition pc_def = Definition(tmp); + + /* re-enable the killed operands, so that we don't move the blocking var there */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + + /* find a new register for the blocking variable */ + PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr); + /* once again, disable killed operands */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0; + } + for (unsigned k = 0; k < i; k++) { + if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill()) + for (unsigned j = 0; j < instr->definitions[k].size(); j++) + register_file[instr->definitions[k].physReg() + j] = instr->definitions[k].tempId(); + } + pc_def.setFixed(reg); + + /* finish assignment of parallelcopy */ + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + parallelcopy.emplace_back(pc_op, pc_def); + + /* add changes to reg_file */ + for (unsigned i = 0; i < pc_op.size(); i++) { + register_file[pc_op.physReg() + i] = 0x0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + } + ctx.defs_done.set(i); + + if (!definition.isTemp()) + continue; + + /* set live if it has a kill point */ + if (!definition.isKill()) + live.emplace(definition.getTemp()); + + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + renames[block.index][definition.tempId()] = definition.getTemp(); + for (unsigned j = 0; j < definition.size(); j++) + register_file[definition.physReg() + j] = definition.tempId(); + } + + /* handle all other definitions */ + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + auto& definition = instr->definitions[i]; + + if (definition.isFixed() || !definition.isTemp()) + continue; + + /* find free reg */ + if (definition.hasHint() && register_file[definition.physReg().reg] == 0) + definition.setFixed(definition.physReg()); + else if (instr->opcode == aco_opcode::p_split_vector) { + PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()}; + if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_wqm) { + PhysReg reg; + if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) { + reg = instr->operands[0].physReg(); + assert(register_file[reg.reg] == 0); + } else { + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_extract_vector) { + PhysReg reg; + if (instr->operands[0].isKill() && + instr->operands[0].getTemp().type() == definition.getTemp().type()) { + reg = instr->operands[0].physReg(); + reg.reg += definition.size() * instr->operands[1].constantValue(); + assert(register_file[reg.reg] == 0); + } else { + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_create_vector) { + PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(), + parallelcopy, instr); + definition.setFixed(reg); + } else if (affinities.find(definition.tempId()) != affinities.end() && + ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { + PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) + definition.setFixed(reg); + else + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); + + } else if (vectors.find(definition.tempId()) != vectors.end()) { + Instruction* vec = vectors[definition.tempId()]; + unsigned offset = 0; + for (const Operand& op : vec->operands) { + if (op.isTemp() && op.tempId() == definition.tempId()) + break; + else + offset += op.size(); + } + unsigned k = 0; + for (const Operand& op : vec->operands) { + if (op.isTemp() && + op.tempId() != definition.tempId() && + op.getTemp().type() == definition.getTemp().type() && + ctx.assignments.find(op.tempId()) != ctx.assignments.end()) { + PhysReg reg = ctx.assignments[op.tempId()].first; + reg.reg = reg - k + offset; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) { + definition.setFixed(reg); + break; + } + } + k += op.size(); + } + if (!definition.isFixed()) { + std::pair res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass()); + PhysReg reg = res.first; + if (res.second) { + reg.reg += offset; + } else { + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } + } else + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); + + assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) || + (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256))); + ctx.defs_done.set(i); + + /* set live if it has a kill point */ + if (!definition.isKill()) + live.emplace(definition.getTemp()); + + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + renames[block.index][definition.tempId()] = definition.getTemp(); + for (unsigned j = 0; j < definition.size(); j++) + register_file[definition.physReg() + j] = definition.tempId(); + } + + handle_pseudo(ctx, register_file, instr.get()); + + /* kill definitions */ + for (const Definition& def : instr->definitions) { + if (def.isTemp() && def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0; + } + } + } + + /* emit parallelcopy */ + if (!parallelcopy.empty()) { + aco_ptr pc; + pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); + bool temp_in_scc = register_file[scc.reg]; + bool sgpr_operands_alias_defs = false; + uint64_t sgpr_operands[4] = {0, 0, 0, 0}; + for (unsigned i = 0; i < parallelcopy.size(); i++) { + if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { + if (!sgpr_operands_alias_defs) { + unsigned reg = parallelcopy[i].first.physReg().reg; + unsigned size = parallelcopy[i].first.getTemp().size(); + sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u); + + reg = parallelcopy[i].second.physReg().reg; + size = parallelcopy[i].second.getTemp().size(); + if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u)) + sgpr_operands_alias_defs = true; + } + } + + pc->operands[i] = parallelcopy[i].first; + pc->definitions[i] = parallelcopy[i].second; + assert(pc->operands[i].size() == pc->definitions[i].size()); + + /* it might happen that the operand is already renamed. we have to restore the original name. */ + std::map::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); + Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp(); + ctx.orig_names[pc->definitions[i].tempId()] = orig; + renames[block.index][orig.id()] = pc->definitions[i].getTemp(); + renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp(); + + std::map::iterator phi = phi_map.find(pc->operands[i].tempId()); + if (phi != phi_map.end()) + phi->second.uses.emplace(pc.get()); + } + + if (temp_in_scc && sgpr_operands_alias_defs) { + /* disable definitions and re-enable operands */ + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0x0; + } + } + } + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + } + + handle_pseudo(ctx, register_file, pc.get()); + + /* re-enable live vars */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0x0; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = def.tempId(); + } + } + } + } else { + pc->tmp_in_scc = false; + } + + instructions.emplace_back(std::move(pc)); + } + + /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ + bool instr_needs_vop3 = !instr->isVOP3() && + ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || + (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_add_co_u32 || + instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_sub_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subrev_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->definitions[1].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->operands[2].physReg() == vcc))); + if (instr_needs_vop3) { + + /* if the first operand is a literal, we have to move it to a reg */ + if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) { + bool can_sgpr = true; + /* check, if we have to move to vgpr */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { + can_sgpr = false; + break; + } + } + aco_ptr mov; + if (can_sgpr) + mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); + else + mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); + mov->operands[0] = instr->operands[0]; + Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1}; + mov->definitions[0] = Definition(tmp); + /* disable definitions and re-enable operands */ + for (const Definition& def : instr->definitions) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0x0; + } + } + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + } + mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov)); + instr->operands[0] = Operand(tmp); + instr->operands[0].setFixed(mov->definitions[0].physReg()); + instructions.emplace_back(std::move(mov)); + /* re-enable live vars */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0x0; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = def.tempId(); + } + } + } + } + + /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ + aco_ptr tmp = std::move(instr); + Format format = asVOP3(tmp->format); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& operand = tmp->operands[i]; + instr->operands[i] = operand; + /* keep phi_map up to date */ + if (operand.isTemp()) { + std::map::iterator phi = phi_map.find(operand.tempId()); + if (phi != phi_map.end()) { + phi->second.uses.erase(tmp.get()); + phi->second.uses.emplace(instr.get()); + } + } + } + std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); + } + instructions.emplace_back(std::move(*it)); + + } /* end for Instr */ + + block.instructions = std::move(instructions); + + filled[block.index] = true; + for (unsigned succ_idx : block.linear_succs) { + Block& succ = program->blocks[succ_idx]; + /* seal block if all predecessors are filled */ + bool all_filled = true; + for (unsigned pred_idx : succ.linear_preds) { + if (!filled[pred_idx]) { + all_filled = false; + break; + } + } + if (all_filled) { + /* finish incomplete phis and check if they became trivial */ + for (Instruction* phi : incomplete_phis[succ_idx]) { + std::vector preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds; + for (unsigned i = 0; i < phi->operands.size(); i++) { + phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i])); + phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first); + } + try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId())); + } + /* complete the original phi nodes, but no need to check triviality */ + for (aco_ptr& instr : succ.instructions) { + if (!is_phi(instr)) + break; + std::vector preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds; + + for (unsigned i = 0; i < instr->operands.size(); i++) { + auto& operand = instr->operands[i]; + if (!operand.isTemp()) + continue; + operand.setTemp(read_variable(operand.getTemp(), preds[i])); + operand.setFixed(ctx.assignments[operand.tempId()].first); + std::map::iterator phi = phi_map.find(operand.getTemp().id()); + if (phi != phi_map.end()) + phi->second.uses.emplace(instr.get()); + } + } + sealed[succ_idx] = true; + } + } + } /* end for BB */ + + /* remove trivial phis */ + for (Block& block : program->blocks) { + auto end = std::find_if(block.instructions.begin(), block.instructions.end(), + [](aco_ptr& instr) { return !is_phi(instr);}); + auto middle = std::remove_if(block.instructions.begin(), end, + [](const aco_ptr& instr) { return instr->definitions.empty();}); + block.instructions.erase(middle, end); + } + + /* find scc spill registers which may be needed for parallelcopies created by phis */ + for (Block& block : program->blocks) { + if (block.linear_preds.size() <= 1) + continue; + + std::bitset<128> regs = sgpr_live_in[block.index]; + if (!regs[127]) + continue; + + /* choose a register */ + int16_t reg = 0; + for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) + ; + assert(reg < ctx.program->max_reg_demand.sgpr); + adjust_max_used_regs(ctx, s1, reg); + + /* update predecessors */ + for (unsigned& pred_index : block.linear_preds) { + Block& pred = program->blocks[pred_index]; + pred.scc_live_out = true; + pred.scratch_sgpr = PhysReg{(uint16_t)reg}; + } + } + + /* num_gpr = rnd_up(max_used_gpr + 1) */ + program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4); + if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */ + program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit); + else + program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8); +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_scheduler.cpp mesa-20.0.8/src/amd/compiler/aco_scheduler.cpp --- mesa-19.2.8/src/amd/compiler/aco_scheduler.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_scheduler.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1015 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include "aco_builder.h" +#include +#include + +#include "vulkan/radv_shader.h" // for radv_nir_compiler_options +#include "amdgfxregs.h" + +#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) +#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) +#define POS_EXP_WINDOW_SIZE 512 +#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) +#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8) +/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ +#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8) +#define POS_EXP_MAX_MOVES 512 + +namespace aco { + +struct sched_ctx { + std::vector depends_on; + std::vector RAR_dependencies; + /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the + * instructions in the clause, since new instructions in the clause are not + * moved past any other instructions in the clause. */ + std::vector new_RAR_dependencies; + + RegisterDemand max_registers; + int16_t num_waves; + int16_t last_SMEM_stall; + int last_SMEM_dep_idx; +}; + +/* This scheduler is a simple bottom-up pass based on ideas from + * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler" + * from Xiaohua Shi and Peng Guo. + * The basic approach is to iterate over all instructions. When a memory instruction + * is encountered it tries to move independent instructions from above and below + * between the memory instruction and it's first user. + * The novelty is that this scheduler cares for the current register pressure: + * Instructions will only be moved if the register pressure won't exceed a certain bound. + */ + +template +void move_element(T& list, size_t idx, size_t before) { + if (idx < before) { + auto begin = std::next(list.begin(), idx); + auto end = std::next(list.begin(), before); + std::rotate(begin, begin + 1, end); + } else if (idx > before) { + auto begin = std::next(list.begin(), before); + auto end = std::next(list.begin(), idx + 1); + std::rotate(begin, end - 1, end); + } +} + +static RegisterDemand getLiveChanges(aco_ptr& instr) +{ + RegisterDemand changes; + for (const Definition& def : instr->definitions) { + if (!def.isTemp() || def.isKill()) + continue; + changes += def.getTemp(); + } + + for (const Operand& op : instr->operands) { + if (!op.isTemp() || !op.isFirstKill()) + continue; + changes -= op.getTemp(); + } + + return changes; +} + +static RegisterDemand getTempRegisters(aco_ptr& instr) +{ + RegisterDemand temp_registers; + for (const Definition& def : instr->definitions) { + if (!def.isTemp() || !def.isKill()) + continue; + temp_registers += def.getTemp(); + } + return temp_registers; +} + +static bool is_spill_reload(aco_ptr& instr) +{ + return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload; +} + +bool can_reorder(Instruction* candidate) +{ + switch (candidate->format) { + case Format::SMEM: + return static_cast(candidate)->can_reorder; + case Format::MUBUF: + return static_cast(candidate)->can_reorder; + case Format::MIMG: + return static_cast(candidate)->can_reorder; + case Format::MTBUF: + return static_cast(candidate)->can_reorder; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return static_cast(candidate)->can_reorder; + default: + return true; + } +} + +bool is_gs_or_done_sendmsg(Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_sendmsg) { + uint16_t imm = static_cast(instr)->imm; + return (imm & sendmsg_id_mask) == _sendmsg_gs || + (imm & sendmsg_id_mask) == _sendmsg_gs_done; + } + return false; +} + +bool is_done_sendmsg(Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_sendmsg) { + uint16_t imm = static_cast(instr)->imm; + return (imm & sendmsg_id_mask) == _sendmsg_gs_done; + } + return false; +} + +barrier_interaction get_barrier_interaction(Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast(instr)->barrier; + case Format::MUBUF: + return static_cast(instr)->barrier; + case Format::MIMG: + return static_cast(instr)->barrier; + case Format::MTBUF: + return static_cast(instr)->barrier; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return static_cast(instr)->barrier; + case Format::DS: + return barrier_shared; + case Format::SOPP: + if (is_done_sendmsg(instr)) + return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg); + else if (is_gs_or_done_sendmsg(instr)) + return barrier_gs_sendmsg; + else + return barrier_none; + default: + return barrier_none; + } +} + +bool can_move_instr(aco_ptr& instr, Instruction* current, int moving_interaction) +{ + /* don't move exports so that they stay closer together */ + if (instr->format == Format::EXP) + return false; + + /* don't move s_memtime/s_memrealtime */ + if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime) + return false; + + /* handle barriers */ + + /* TODO: instead of stopping, maybe try to move the barriers and any + * instructions interacting with them instead? */ + if (instr->format != Format::PSEUDO_BARRIER) { + if (instr->opcode == aco_opcode::s_barrier) { + return can_reorder(current) && moving_interaction == barrier_none; + } else if (is_gs_or_done_sendmsg(instr.get())) { + int interaction = get_barrier_interaction(current); + interaction |= moving_interaction; + return !(interaction & get_barrier_interaction(instr.get())); + } else { + return true; + } + } + + int interaction = get_barrier_interaction(current); + interaction |= moving_interaction; + + switch (instr->opcode) { + case aco_opcode::p_memory_barrier_atomic: + return !(interaction & barrier_atomic); + /* For now, buffer and image barriers are treated the same. this is because of + * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp + * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. + * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it + * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. + * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? + * Either way, this solution should work. */ + case aco_opcode::p_memory_barrier_buffer: + case aco_opcode::p_memory_barrier_image: + return !(interaction & (barrier_image | barrier_buffer)); + case aco_opcode::p_memory_barrier_shared: + return !(interaction & barrier_shared); + case aco_opcode::p_memory_barrier_common: + return !(interaction & (barrier_image | barrier_buffer | barrier_shared | barrier_atomic)); + case aco_opcode::p_memory_barrier_gs_data: + return !(interaction & barrier_gs_data); + case aco_opcode::p_memory_barrier_gs_sendmsg: + return !(interaction & barrier_gs_sendmsg); + default: + return false; + } +} + +void schedule_SMEM(sched_ctx& ctx, Block* block, + std::vector& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = SMEM_WINDOW_SIZE; + int max_moves = SMEM_MAX_MOVES; + int16_t k = 0; + bool can_reorder_cur = can_reorder(current); + + /* don't move s_memtime/s_memrealtime */ + if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime) + return; + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + for (const Operand& op : current->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int insert_idx = idx + 1; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr& candidate = block->instructions[candidate_idx]; + bool can_reorder_candidate = can_reorder(candidate.get()); + + /* break if we'd make the previous SMEM instruction stall */ + bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) + break; + + /* break when encountering another MEM instruction, logical_start or barriers */ + if (!can_reorder_candidate && !can_reorder_cur) + break; + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + if (candidate->isVMEM()) + break; + register_pressure.update(register_demand[candidate_idx]); + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = true; + bool writes_exec = false; + for (const Definition& def : candidate->definitions) { + if (def.isTemp() && ctx.depends_on[def.tempId()]) + can_move_down = false; + if (def.isFixed() && def.physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + can_reorder_cur &= can_reorder_candidate; + continue; + } + + bool register_pressure_unknown = false; + /* check if one of candidate's operands is killed by depending instruction */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.depends_on[op.tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + can_reorder_cur &= can_reorder_candidate; + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is increased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand tempDemand = getTempRegisters(candidate); + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the memory load */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure -= candidate_diff; + + if (candidate_idx < ctx.last_SMEM_dep_idx) + ctx.last_SMEM_stall++; + insert_idx--; + k++; + } + + /* create the initial set of values which depend on current */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + for (const Definition& def : current->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + + /* find the first instruction depending on current or find another MEM */ + insert_idx = idx + 1; + moving_interaction = barrier_none; + moving_spill = false; + can_reorder_cur = true; + + bool found_dependency = false; + /* second, check if we have instructions after current to move up */ + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx < (int) block->instructions.size()); + aco_ptr& candidate = block->instructions[candidate_idx]; + bool can_reorder_candidate = can_reorder(candidate.get()); + + if (candidate->opcode == aco_opcode::p_logical_end) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), + [](const Definition& def) { return def.isFixed() && def.physReg() == exec;}); + if (writes_exec) + break; + + /* check if candidate depends on current */ + bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(), + [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];}); + /* no need to steal from following VMEM instructions */ + if (is_dependency && candidate->isVMEM()) + break; + if (moving_spill && is_spill_reload(candidate)) + is_dependency = true; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + is_dependency = true; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (is_dependency) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + if (!found_dependency) { + insert_idx = candidate_idx; + found_dependency = true; + /* init register pressure */ + register_pressure = register_demand[insert_idx - 1]; + } + } + + if (!can_reorder_candidate && !can_reorder_cur) + break; + + if (!found_dependency) { + k++; + continue; + } + + /* update register pressure */ + register_pressure.update(register_demand[candidate_idx - 1]); + + if (is_dependency) { + can_reorder_cur &= can_reorder_candidate; + continue; + } + assert(insert_idx != idx); + + // TODO: correctly calculate register pressure for this case + bool register_pressure_unknown = false; + /* check if candidate uses/kills an operand which is used by a dependency */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) + register_pressure_unknown = true; + } + if (register_pressure_unknown) { + if (candidate->isVMEM()) + break; + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.RAR_dependencies[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + can_reorder_cur &= can_reorder_candidate; + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate); + if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + + /* move the candidate above the insert_idx */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = insert_idx + 1; i <= candidate_idx; i++) { + register_demand[i] += candidate_diff; + } + register_demand[insert_idx] = new_demand; + register_pressure += candidate_diff; + insert_idx++; + k++; + } + + ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0; + ctx.last_SMEM_stall = 10 - ctx.num_waves - k; +} + +void schedule_VMEM(sched_ctx& ctx, Block* block, + std::vector& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = VMEM_WINDOW_SIZE; + int max_moves = VMEM_MAX_MOVES; + int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST; + int16_t k = 0; + /* initially true as we don't pull other VMEM instructions + * through the current instruction */ + bool can_reorder_vmem = true; + bool can_reorder_smem = true; + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false); + for (const Operand& op : current->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) + ctx.RAR_dependencies[op.tempId()] = true; + } + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure_indep = register_demand[idx]; + RegisterDemand register_pressure_clause = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int indep_insert_idx = idx + 1; + int clause_insert_idx = idx; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr& candidate = block->instructions[candidate_idx]; + bool can_reorder_candidate = can_reorder(candidate.get()); + bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal(); + + /* break when encountering another VMEM instruction, logical_start or barriers */ + if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate) + break; + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + /* break if we'd make the previous SMEM instruction stall */ + bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) + break; + register_pressure_indep.update(register_demand[candidate_idx]); + + bool part_of_clause = false; + if (current->isVMEM() == candidate->isVMEM()) { + bool same_resource = true; + if (current->isVMEM()) + same_resource = candidate->operands[0].tempId() == current->operands[0].tempId(); + bool can_reorder = can_reorder_vmem || can_reorder_candidate; + int grab_dist = clause_insert_idx - candidate_idx; + /* We can't easily tell how much this will decrease the def-to-use + * distances, so just use how far it will be moved as a heuristic. */ + part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist; + } + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = !is_vmem || part_of_clause; + bool writes_exec = false; + for (const Definition& def : candidate->definitions) { + if (def.isTemp() && ctx.depends_on[def.tempId()]) + can_move_down = false; + if (def.isFixed() && def.physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) { + ctx.RAR_dependencies[op.tempId()] = true; + ctx.new_RAR_dependencies[op.tempId()] = true; + } + } + } + register_pressure_clause.update(register_demand[candidate_idx]); + can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate; + can_reorder_vmem &= !is_vmem || can_reorder_candidate; + continue; + } + + if (part_of_clause) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) + ctx.RAR_dependencies[op.tempId()] = true; + } + } + } + + bool register_pressure_unknown = false; + std::vector& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies; + /* check if one of candidate's operands is killed by depending instruction */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && RAR_deps[op.tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) { + ctx.RAR_dependencies[op.tempId()] = true; + ctx.new_RAR_dependencies[op.tempId()] = true; + } + } + } + register_pressure_clause.update(register_demand[candidate_idx]); + can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate; + can_reorder_vmem &= !is_vmem || can_reorder_candidate; + continue; + } + + int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx; + RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep; + + /* check if register pressure is low enough: the diff is negative if register pressure is increased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate);; + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the memory load */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure_clause -= candidate_diff; + clause_insert_idx--; + if (!part_of_clause) { + register_pressure_indep -= candidate_diff; + indep_insert_idx--; + } + k++; + if (candidate_idx < ctx.last_SMEM_dep_idx) + ctx.last_SMEM_stall++; + } + + /* create the initial set of values which depend on current */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + for (const Definition& def : current->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + + /* find the first instruction depending on current or find another VMEM */ + RegisterDemand register_pressure; + int insert_idx = idx; + moving_interaction = barrier_none; + moving_spill = false; + // TODO: differentiate between loads and stores (load-load can always reorder) + can_reorder_vmem = true; + can_reorder_smem = true; + + bool found_dependency = false; + /* second, check if we have instructions after current to move up */ + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx < (int) block->instructions.size()); + aco_ptr& candidate = block->instructions[candidate_idx]; + bool can_reorder_candidate = can_reorder(candidate.get()); + bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal(); + + if (candidate->opcode == aco_opcode::p_logical_end) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), + [](const Definition& def) {return def.isFixed() && def.physReg() == exec; }); + if (writes_exec) + break; + + /* check if candidate depends on current */ + bool is_dependency = false; + if (candidate->format == Format::SMEM) + is_dependency = !can_reorder_smem && !can_reorder_candidate; + if (is_vmem) + is_dependency = !can_reorder_vmem && !can_reorder_candidate; + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.depends_on[op.tempId()]) { + is_dependency = true; + break; + } + } + if (moving_spill && is_spill_reload(candidate)) + is_dependency = true; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + is_dependency = true; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (is_dependency) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + /* update flag whether we can reorder other memory instructions */ + can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate; + can_reorder_vmem &= !is_vmem || can_reorder_candidate; + + if (!found_dependency) { + insert_idx = candidate_idx; + found_dependency = true; + /* init register pressure */ + register_pressure = register_demand[insert_idx - 1]; + continue; + } + + } else if (is_vmem) { + /* don't move up dependencies of other VMEM instructions */ + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + } + + /* update register pressure */ + register_pressure.update(register_demand[candidate_idx - 1]); + + if (is_dependency || !found_dependency) + continue; + assert(insert_idx != idx); + + bool register_pressure_unknown = false; + /* check if candidate uses/kills an operand which is used by a dependency */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()]) + register_pressure_unknown = true; + } + if (register_pressure_unknown) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate; + can_reorder_vmem &= !is_vmem || can_reorder_candidate; + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate); + if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + + /* move the candidate above the insert_idx */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = insert_idx + 1; i <= candidate_idx; i++) { + register_demand[i] += candidate_diff; + } + register_demand[insert_idx] = new_demand; + register_pressure += candidate_diff; + insert_idx++; + k++; + } +} + +void schedule_position_export(sched_ctx& ctx, Block* block, + std::vector& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = POS_EXP_WINDOW_SIZE; + int max_moves = POS_EXP_MAX_MOVES; + int16_t k = 0; + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + for (const Operand& op : current->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) + ctx.RAR_dependencies[op.tempId()] = true; + } + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int insert_idx = idx + 1; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr& candidate = block->instructions[candidate_idx]; + + /* break when encountering logical_start or barriers */ + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; + if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal()) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + register_pressure.update(register_demand[candidate_idx]); + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = true; + bool writes_exec = false; + for (unsigned i = 0; i < candidate->definitions.size(); i++) { + if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()]) + can_move_down = false; + if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) + ctx.RAR_dependencies[op.tempId()] = true; + } + } + continue; + } + + bool register_pressure_unknown = false; + /* check if one of candidate's operands is killed by depending instruction */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) { + ctx.depends_on[op.tempId()] = true; + if (op.isFirstKill()) + ctx.RAR_dependencies[op.tempId()] = true; + } + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is increased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate);; + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the export */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure -= candidate_diff; + insert_idx--; + k++; + } +} + +void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars) +{ + ctx.last_SMEM_dep_idx = 0; + ctx.last_SMEM_stall = INT16_MIN; + + /* go through all instructions and find memory loads */ + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + Instruction* current = block->instructions[idx].get(); + + if (current->definitions.empty()) + continue; + + if (current->isVMEM() || current->isFlatOrGlobal()) + schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx); + if (current->format == Format::SMEM) + schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx); + } + + if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) { + /* Try to move position exports as far up as possible, to reduce register + * usage and because ISA reference guides say so. */ + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + Instruction* current = block->instructions[idx].get(); + + if (current->format == Format::EXP) { + unsigned target = static_cast(current)->dest; + if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) + schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx); + } + } + } + + /* resummarize the block's register demand */ + block->register_demand = RegisterDemand(); + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + block->register_demand.update(live_vars.register_demand[block->index][idx]); + } +} + + +void schedule_program(Program *program, live& live_vars) +{ + sched_ctx ctx; + ctx.depends_on.resize(program->peekAllocationId()); + ctx.RAR_dependencies.resize(program->peekAllocationId()); + ctx.new_RAR_dependencies.resize(program->peekAllocationId()); + /* Allowing the scheduler to reduce the number of waves to as low as 5 + * improves performance of Thrones of Britannia significantly and doesn't + * seem to hurt anything else. */ + if (program->num_waves <= 5) + ctx.num_waves = program->num_waves; + else if (program->max_reg_demand.vgpr >= 32) + ctx.num_waves = 5; + else if (program->max_reg_demand.vgpr >= 28) + ctx.num_waves = 6; + else if (program->max_reg_demand.vgpr >= 24) + ctx.num_waves = 7; + else + ctx.num_waves = 8; + ctx.num_waves = std::max(ctx.num_waves, program->min_waves); + + assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves); + ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))}; + + for (Block& block : program->blocks) + schedule_block(ctx, program, &block, live_vars); + + /* update max_reg_demand and num_waves */ + RegisterDemand new_demand; + for (Block& block : program->blocks) { + new_demand.update(block.register_demand); + } + update_vgpr_sgpr_demand(program, new_demand); + + /* if enabled, this code asserts that register_demand is updated correctly */ + #if 0 + int prev_num_waves = program->num_waves; + const RegisterDemand prev_max_demand = program->max_reg_demand; + + std::vector demands(program->blocks.size()); + for (unsigned j = 0; j < program->blocks.size(); j++) { + demands[j] = program->blocks[j].register_demand; + } + + struct radv_nir_compiler_options options; + options.chip_class = program->chip_class; + live live_vars2 = aco::live_var_analysis(program, &options); + + for (unsigned j = 0; j < program->blocks.size(); j++) { + Block &b = program->blocks[j]; + for (unsigned i = 0; i < b.instructions.size(); i++) + assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]); + assert(b.register_demand == demands[j]); + } + + assert(program->max_reg_demand == prev_max_demand); + assert(program->num_waves == prev_num_waves); + #endif +} + +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_spill.cpp mesa-20.0.8/src/amd/compiler/aco_spill.cpp --- mesa-19.2.8/src/amd/compiler/aco_spill.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_spill.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1789 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include "aco_builder.h" +#include "sid.h" + +#include +#include + +/* + * Implements the spilling algorithm on SSA-form from + * "Register Spilling and Live-Range Splitting for SSA-Form Programs" + * by Matthias Braun and Sebastian Hack. + */ + +namespace aco { + +namespace { + +struct remat_info { + Instruction *instr; +}; + +struct spill_ctx { + RegisterDemand target_pressure; + Program* program; + std::vector> register_demand; + std::vector> renames; + std::vector> spills_entry; + std::vector> spills_exit; + std::vector processed; + std::stack loop_header; + std::vector>> next_use_distances_start; + std::vector>> next_use_distances_end; + std::vector>> interferences; + std::vector> affinities; + std::vector is_reloaded; + std::map remat; + std::map remat_used; + unsigned wave_size; + + spill_ctx(const RegisterDemand target_pressure, Program* program, + std::vector> register_demand) + : target_pressure(target_pressure), program(program), + register_demand(register_demand), renames(program->blocks.size()), + spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), + processed(program->blocks.size(), false), wave_size(program->wave_size) {} + + void add_affinity(uint32_t first, uint32_t second) + { + unsigned found_first = affinities.size(); + unsigned found_second = affinities.size(); + for (unsigned i = 0; i < affinities.size(); i++) { + std::vector& vec = affinities[i]; + for (uint32_t entry : vec) { + if (entry == first) + found_first = i; + else if (entry == second) + found_second = i; + } + } + if (found_first == affinities.size() && found_second == affinities.size()) { + affinities.emplace_back(std::vector({first, second})); + } else if (found_first < affinities.size() && found_second == affinities.size()) { + affinities[found_first].push_back(second); + } else if (found_second < affinities.size() && found_first == affinities.size()) { + affinities[found_second].push_back(first); + } else if (found_first != found_second) { + /* merge second into first */ + affinities[found_first].insert(affinities[found_first].end(), affinities[found_second].begin(), affinities[found_second].end()); + affinities.erase(std::next(affinities.begin(), found_second)); + } else { + assert(found_first == found_second); + } + } + + uint32_t allocate_spill_id(RegClass rc) + { + interferences.emplace_back(rc, std::set()); + is_reloaded.push_back(false); + return next_spill_id++; + } + + uint32_t next_spill_id = 0; +}; + +int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) +{ + + if (idx_a == -1) + return idx_b; + if (idx_b == -1) + return idx_a; + if (is_linear) { + while (idx_a != idx_b) { + if (idx_a > idx_b) + idx_a = program->blocks[idx_a].linear_idom; + else + idx_b = program->blocks[idx_b].linear_idom; + } + } else { + while (idx_a != idx_b) { + if (idx_a > idx_b) + idx_a = program->blocks[idx_a].logical_idom; + else + idx_b = program->blocks[idx_b].logical_idom; + } + } + assert(idx_a != -1); + return idx_a; +} + +void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) +{ + Block* block = &ctx.program->blocks[block_idx]; + std::map> next_uses = ctx.next_use_distances_end[block_idx]; + + /* to compute the next use distance at the beginning of the block, we have to add the block's size */ + for (std::map>::iterator it = next_uses.begin(); it != next_uses.end(); ++it) + it->second.second = it->second.second + block->instructions.size(); + + int idx = block->instructions.size() - 1; + while (idx >= 0) { + aco_ptr& instr = block->instructions[idx]; + + if (instr->opcode == aco_opcode::p_linear_phi || + instr->opcode == aco_opcode::p_phi) + break; + + for (const Definition& def : instr->definitions) { + if (def.isTemp()) + next_uses.erase(def.getTemp()); + } + + for (const Operand& op : instr->operands) { + /* omit exec mask */ + if (op.isFixed() && op.physReg() == exec) + continue; + if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) + continue; + if (op.isTemp()) + next_uses[op.getTemp()] = {block_idx, idx}; + } + idx--; + } + + assert(block_idx != 0 || next_uses.empty()); + ctx.next_use_distances_start[block_idx] = next_uses; + while (idx >= 0) { + aco_ptr& instr = block->instructions[idx]; + assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi); + + for (unsigned i = 0; i < instr->operands.size(); i++) { + unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? + block->logical_preds[i] : + block->linear_preds[i]; + if (instr->operands[i].isTemp()) { + if (instr->operands[i].getTemp() == ctx.program->blocks[pred_idx].live_out_exec) + continue; + if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() || + ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != std::pair{block_idx, 0}) + worklist.insert(pred_idx); + ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = {block_idx, 0}; + } + } + next_uses.erase(instr->definitions[0].getTemp()); + idx--; + } + + /* all remaining live vars must be live-out at the predecessors */ + for (std::pair> pair : next_uses) { + Temp temp = pair.first; + uint32_t distance = pair.second.second; + uint32_t dom = pair.second.first; + std::vector& preds = temp.is_linear() ? block->linear_preds : block->logical_preds; + for (unsigned pred_idx : preds) { + if (temp == ctx.program->blocks[pred_idx].live_out_exec) + continue; + if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) + distance += 0xFFFF; + if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) { + dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear()); + distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); + } + if (ctx.next_use_distances_end[pred_idx][temp] != std::pair{dom, distance}) + worklist.insert(pred_idx); + ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; + } + } + +} + +void compute_global_next_uses(spill_ctx& ctx, std::vector>& live_out) +{ + ctx.next_use_distances_start.resize(ctx.program->blocks.size()); + ctx.next_use_distances_end.resize(ctx.program->blocks.size()); + std::set worklist; + for (Block& block : ctx.program->blocks) + worklist.insert(block.index); + + while (!worklist.empty()) { + std::set::reverse_iterator b_it = worklist.rbegin(); + unsigned block_idx = *b_it; + worklist.erase(block_idx); + next_uses_per_block(ctx, block_idx, worklist); + } +} + +bool should_rematerialize(aco_ptr& instr) +{ + /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ + if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK) + return false; + /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */ + if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector) + return false; + if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32) + return false; + + for (const Operand& op : instr->operands) { + /* TODO: rematerialization using temporaries isn't yet supported */ + if (op.isTemp()) + return false; + } + + /* TODO: rematerialization with multiple definitions isn't yet supported */ + if (instr->definitions.size() > 1) + return false; + + return true; +} + +aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) +{ + std::map::iterator remat = ctx.remat.find(tmp); + if (remat != ctx.remat.end()) { + Instruction *instr = remat->second.instr; + assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported"); + assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported"); + assert(instr->definitions.size() == 1 && "unsupported"); + + aco_ptr res; + if (instr->format == Format::VOP1) { + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::SOP1) { + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::PSEUDO) { + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::SOPK) { + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + static_cast(res.get())->imm = static_cast(instr)->imm; + } + for (unsigned i = 0; i < instr->operands.size(); i++) { + res->operands[i] = instr->operands[i]; + if (instr->operands[i].isTemp()) { + assert(false && "unsupported"); + if (ctx.remat.count(instr->operands[i].getTemp())) + ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true; + } + } + res->definitions[0] = Definition(new_name); + return res; + } else { + aco_ptr reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; + reload->operands[0] = Operand(spill_id); + reload->definitions[0] = Definition(new_name); + ctx.is_reloaded[spill_id] = true; + return reload; + } +} + +void get_rematerialize_info(spill_ctx& ctx) +{ + for (Block& block : ctx.program->blocks) { + bool logical = false; + for (aco_ptr& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_logical_start) + logical = true; + else if (instr->opcode == aco_opcode::p_logical_end) + logical = false; + if (logical && should_rematerialize(instr)) { + for (const Definition& def : instr->definitions) { + if (def.isTemp()) { + ctx.remat[def.getTemp()] = (remat_info){instr.get()}; + ctx.remat_used[instr.get()] = false; + } + } + } + } + } +} + +std::vector> local_next_uses(spill_ctx& ctx, Block* block) +{ + std::vector> local_next_uses(block->instructions.size()); + + std::map next_uses; + for (std::pair> pair : ctx.next_use_distances_end[block->index]) + next_uses[pair.first] = pair.second.second + block->instructions.size(); + + for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { + aco_ptr& instr = block->instructions[idx]; + if (!instr) + break; + if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) + break; + + for (const Operand& op : instr->operands) { + if (op.isFixed() && op.physReg() == exec) + continue; + if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) + continue; + if (op.isTemp()) + next_uses[op.getTemp()] = idx; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp()) + next_uses.erase(def.getTemp()); + } + local_next_uses[idx] = next_uses; + } + return local_next_uses; +} + + +RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) +{ + RegisterDemand spilled_registers; + + /* first block, nothing was spilled before */ + if (block_idx == 0) + return {0, 0}; + + /* loop header block */ + if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) { + assert(block->linear_preds[0] == block_idx - 1); + assert(block->logical_preds[0] == block_idx - 1); + + /* create new loop_info */ + ctx.loop_header.emplace(block); + + /* check how many live-through variables should be spilled */ + RegisterDemand new_demand; + unsigned i = block_idx; + while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) { + assert(ctx.program->blocks.size() > i); + new_demand.update(ctx.program->blocks[i].register_demand); + i++; + } + unsigned loop_end = i; + + /* select live-through vgpr variables */ + while (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair> pair : ctx.next_use_distances_end[block_idx - 1]) { + if (pair.first.type() == RegType::vgpr && + pair.second.first >= loop_end && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + if (distance == 0) + break; + + uint32_t spill_id; + if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + spill_id = ctx.allocate_spill_id(to_spill.regClass()); + } else { + spill_id = ctx.spills_exit[block_idx - 1][to_spill]; + } + + ctx.spills_entry[block_idx][to_spill] = spill_id; + spilled_registers.vgpr += to_spill.size(); + } + + /* select live-through sgpr variables */ + while (new_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair> pair : ctx.next_use_distances_end[block_idx - 1]) { + if (pair.first.type() == RegType::sgpr && + pair.second.first >= loop_end && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + if (distance == 0) + break; + + uint32_t spill_id; + if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + spill_id = ctx.allocate_spill_id(to_spill.regClass()); + } else { + spill_id = ctx.spills_exit[block_idx - 1][to_spill]; + } + + ctx.spills_entry[block_idx][to_spill] = spill_id; + spilled_registers.sgpr += to_spill.size(); + } + + + + /* shortcut */ + if (!RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) + return spilled_registers; + + /* if reg pressure is too high at beginning of loop, add variables with furthest use */ + unsigned idx = 0; + while (block->instructions[idx]->opcode == aco_opcode::p_phi || block->instructions[idx]->opcode == aco_opcode::p_linear_phi) + idx++; + + assert(idx != 0 && "loop without phis: TODO"); + idx--; + RegisterDemand reg_pressure = ctx.register_demand[block_idx][idx] - spilled_registers; + while (reg_pressure.sgpr > ctx.target_pressure.sgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + if (pair.first.type() == RegType::sgpr && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + spilled_registers.sgpr += to_spill.size(); + reg_pressure.sgpr -= to_spill.size(); + } + while (reg_pressure.vgpr > ctx.target_pressure.vgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + if (pair.first.type() == RegType::vgpr && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + assert(distance != 0); + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + spilled_registers.vgpr += to_spill.size(); + reg_pressure.vgpr -= to_spill.size(); + } + + return spilled_registers; + } + + /* branch block */ + if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) { + /* keep variables spilled if they are alive and not used in the current block */ + unsigned pred_idx = block->linear_preds[0]; + for (std::pair pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::sgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) { + ctx.spills_entry[block_idx].insert(pair); + spilled_registers.sgpr += pair.first.size(); + } + } + if (block->logical_preds.size() == 1) { + pred_idx = block->logical_preds[0]; + for (std::pair pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::vgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) { + ctx.spills_entry[block_idx].insert(pair); + spilled_registers.vgpr += pair.first.size(); + } + } + } + + /* if register demand is still too high, we just keep all spilled live vars and process the block */ + if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { + pred_idx = block->linear_preds[0]; + for (std::pair pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::sgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.spills_entry[block_idx].insert(pair).second) { + spilled_registers.sgpr += pair.first.size(); + } + } + } + if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) { + pred_idx = block->logical_preds[0]; + for (std::pair pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::vgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.spills_entry[block_idx].insert(pair).second) { + spilled_registers.vgpr += pair.first.size(); + } + } + } + + return spilled_registers; + } + + /* else: merge block */ + std::set partial_spills; + + /* keep variables spilled on all incoming paths */ + for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + std::vector& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it. + * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors. + * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. */ + /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */ + bool remat = ctx.remat.count(pair.first); + bool spill = !remat; + uint32_t spill_id = 0; + for (unsigned pred_idx : preds) { + /* variable is not even live at the predecessor: probably from a phi */ + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) { + spill = false; + break; + } + if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) { + if (!remat) + spill = false; + } else { + partial_spills.insert(pair.first); + /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */ + spill_id = ctx.spills_exit[pred_idx][pair.first]; + if (remat) + spill = true; + } + } + if (spill) { + ctx.spills_entry[block_idx][pair.first] = spill_id; + partial_spills.erase(pair.first); + spilled_registers += pair.first; + } + } + + /* same for phis */ + unsigned idx = 0; + while (block->instructions[idx]->opcode == aco_opcode::p_linear_phi || + block->instructions[idx]->opcode == aco_opcode::p_phi) { + aco_ptr& phi = block->instructions[idx]; + std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + bool spill = true; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (phi->operands[i].isUndefined()) + continue; + assert(phi->operands[i].isTemp()); + if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end()) + spill = false; + else + partial_spills.insert(phi->definitions[0].getTemp()); + } + if (spill) { + ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass()); + partial_spills.erase(phi->definitions[0].getTemp()); + spilled_registers += phi->definitions[0].getTemp(); + } + + idx++; + } + + /* if reg pressure at first instruction is still too high, add partially spilled variables */ + RegisterDemand reg_pressure; + if (idx == 0) { + for (const Definition& def : block->instructions[idx]->definitions) { + if (def.isTemp()) { + reg_pressure -= def.getTemp(); + } + } + for (const Operand& op : block->instructions[idx]->operands) { + if (op.isTemp() && op.isFirstKill()) { + reg_pressure += op.getTemp(); + } + } + } else { + idx--; + } + reg_pressure += ctx.register_demand[block_idx][idx] - spilled_registers; + + while (reg_pressure.sgpr > ctx.target_pressure.sgpr) { + assert(!partial_spills.empty()); + + std::set::iterator it = partial_spills.begin(); + Temp to_spill = *it; + unsigned distance = ctx.next_use_distances_start[block_idx][*it].second; + while (it != partial_spills.end()) { + assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + + if (it->type() == RegType::sgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) { + distance = ctx.next_use_distances_start[block_idx][*it].second; + to_spill = *it; + } + ++it; + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + partial_spills.erase(to_spill); + spilled_registers.sgpr += to_spill.size(); + reg_pressure.sgpr -= to_spill.size(); + } + + while (reg_pressure.vgpr > ctx.target_pressure.vgpr) { + assert(!partial_spills.empty()); + + std::set::iterator it = partial_spills.begin(); + Temp to_spill = *it; + unsigned distance = ctx.next_use_distances_start[block_idx][*it].second; + while (it != partial_spills.end()) { + assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + + if (it->type() == RegType::vgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) { + distance = ctx.next_use_distances_start[block_idx][*it].second; + to_spill = *it; + } + ++it; + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + partial_spills.erase(to_spill); + spilled_registers.vgpr += to_spill.size(); + reg_pressure.vgpr -= to_spill.size(); + } + + return spilled_registers; +} + + +RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) +{ + if (idx == 0) { + RegisterDemand demand_before = ctx.register_demand[block_idx][idx]; + aco_ptr& instr = ctx.program->blocks[block_idx].instructions[idx]; + for (const Definition& def : instr->definitions) + demand_before -= def.getTemp(); + for (const Operand& op : instr->operands) { + if (op.isFirstKill()) + demand_before += op.getTemp(); + } + return demand_before; + } else { + return ctx.register_demand[block_idx][idx - 1]; + } +} + +void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) +{ + /* no coupling code necessary */ + if (block->linear_preds.size() == 0) + return; + + std::vector> instructions; + /* branch block: TODO take other branch into consideration */ + if (block->linear_preds.size() == 1 && !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { + assert(ctx.processed[block->linear_preds[0]]); + assert(ctx.register_demand[block_idx].size() == block->instructions.size()); + std::vector reg_demand; + unsigned insert_idx = 0; + unsigned pred_idx = block->linear_preds[0]; + RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); + + for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + if (!live.first.is_linear()) + continue; + /* still spilled */ + if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + continue; + + /* in register at end of predecessor */ + if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + std::map::iterator it = ctx.renames[pred_idx].find(live.first); + if (it != ctx.renames[pred_idx].end()) + ctx.renames[block_idx].insert(*it); + continue; + } + + /* variable is spilled at predecessor and live at current block: create reload instruction */ + Temp new_name = {ctx.program->allocateId(), live.first.regClass()}; + aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + instructions.emplace_back(std::move(reload)); + reg_demand.push_back(demand_before); + ctx.renames[block_idx][live.first] = new_name; + } + + if (block->logical_preds.size() == 1) { + do { + assert(insert_idx < block->instructions.size()); + instructions.emplace_back(std::move(block->instructions[insert_idx])); + reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]); + insert_idx++; + } while (instructions.back()->opcode != aco_opcode::p_logical_start); + + unsigned pred_idx = block->logical_preds[0]; + for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + if (live.first.is_linear()) + continue; + /* still spilled */ + if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + continue; + + /* in register at end of predecessor */ + if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + std::map::iterator it = ctx.renames[pred_idx].find(live.first); + if (it != ctx.renames[pred_idx].end()) + ctx.renames[block_idx].insert(*it); + continue; + } + + /* variable is spilled at predecessor and live at current block: create reload instruction */ + Temp new_name = {ctx.program->allocateId(), live.first.regClass()}; + aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + instructions.emplace_back(std::move(reload)); + reg_demand.emplace_back(reg_demand.back()); + ctx.renames[block_idx][live.first] = new_name; + } + } + + /* combine new reload instructions with original block */ + if (!instructions.empty()) { + reg_demand.insert(reg_demand.end(), std::next(ctx.register_demand[block->index].begin(), insert_idx), + ctx.register_demand[block->index].end()); + ctx.register_demand[block_idx] = std::move(reg_demand); + instructions.insert(instructions.end(), + std::move_iterator>::iterator>(std::next(block->instructions.begin(), insert_idx)), + std::move_iterator>::iterator>(block->instructions.end())); + block->instructions = std::move(instructions); + } + return; + } + + /* loop header and merge blocks: check if all (linear) predecessors have been processed */ + for (ASSERTED unsigned pred : block->linear_preds) + assert(ctx.processed[pred]); + + /* iterate the phi nodes for which operands to spill at the predecessor */ + for (aco_ptr& phi : block->instructions) { + if (phi->opcode != aco_opcode::p_phi && + phi->opcode != aco_opcode::p_linear_phi) + break; + + /* if the phi is not spilled, add to instructions */ + if (ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) { + instructions.emplace_back(std::move(phi)); + continue; + } + + std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()]; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (phi->operands[i].isUndefined()) + continue; + + unsigned pred_idx = preds[i]; + assert(phi->operands[i].isTemp() && phi->operands[i].isKill()); + Temp var = phi->operands[i].getTemp(); + + /* build interferences between the phi def and all spilled variables at the predecessor blocks */ + for (std::pair pair : ctx.spills_exit[pred_idx]) { + if (var == pair.first) + continue; + ctx.interferences[def_spill_id].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(def_spill_id); + } + + /* check if variable is already spilled at predecessor */ + std::map::iterator spilled = ctx.spills_exit[pred_idx].find(var); + if (spilled != ctx.spills_exit[pred_idx].end()) { + if (spilled->second != def_spill_id) + ctx.add_affinity(def_spill_id, spilled->second); + continue; + } + + /* rename if necessary */ + std::map::iterator rename_it = ctx.renames[pred_idx].find(var); + if (rename_it != ctx.renames[pred_idx].end()) { + var = rename_it->second; + ctx.renames[pred_idx].erase(rename_it); + } + + uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass()); + ctx.add_affinity(def_spill_id, spill_id); + aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(var); + spill->operands[1] = Operand(spill_id); + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector>::iterator it = std::next(pred.instructions.begin(), idx); + pred.instructions.insert(it, std::move(spill)); + ctx.spills_exit[pred_idx][phi->operands[i].getTemp()] = spill_id; + } + + /* remove phi from instructions */ + phi.reset(); + } + + /* iterate all (other) spilled variables for which to spill at the predecessor */ + // TODO: would be better to have them sorted: first vgprs and first with longest distance + for (std::pair pair : ctx.spills_entry[block_idx]) { + std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + + for (unsigned pred_idx : preds) { + /* variable is already spilled at predecessor */ + std::map::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first); + if (spilled != ctx.spills_exit[pred_idx].end()) { + if (spilled->second != pair.second) + ctx.add_affinity(pair.second, spilled->second); + continue; + } + + /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + continue; + + /* add interferences between spilled variable and predecessors exit spills */ + for (std::pair exit_spill : ctx.spills_exit[pred_idx]) { + if (exit_spill.first == pair.first) + continue; + ctx.interferences[exit_spill.second].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(exit_spill.second); + } + + /* variable is in register at predecessor and has to be spilled */ + /* rename if necessary */ + Temp var = pair.first; + std::map::iterator rename_it = ctx.renames[pred_idx].find(var); + if (rename_it != ctx.renames[pred_idx].end()) { + var = rename_it->second; + ctx.renames[pred_idx].erase(rename_it); + } + + aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(var); + spill->operands[1] = Operand(pair.second); + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector>::iterator it = std::next(pred.instructions.begin(), idx); + pred.instructions.insert(it, std::move(spill)); + ctx.spills_exit[pred.index][pair.first] = pair.second; + } + } + + /* iterate phis for which operands to reload */ + for (aco_ptr& phi : instructions) { + assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi); + assert(ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()); + + std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (!phi->operands[i].isTemp()) + continue; + unsigned pred_idx = preds[i]; + + /* rename operand */ + if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) { + std::map::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp()); + if (it != ctx.renames[pred_idx].end()) + phi->operands[i].setTemp(it->second); + continue; + } + + Temp tmp = phi->operands[i].getTemp(); + + /* reload phi operand at end of predecessor block */ + Temp new_name = {ctx.program->allocateId(), tmp.regClass()}; + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector>::iterator it = std::next(pred.instructions.begin(), idx); + + aco_ptr reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); + pred.instructions.insert(it, std::move(reload)); + + ctx.spills_exit[pred_idx].erase(tmp); + ctx.renames[pred_idx][tmp] = new_name; + phi->operands[i].setTemp(new_name); + } + } + + /* iterate live variables for which to reload */ + // TODO: reload at current block if variable is spilled on all predecessors + for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + /* skip spilled variables */ + if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) + continue; + std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + + /* variable is dead at predecessor, it must be from a phi */ + bool is_dead = false; + for (unsigned pred_idx : preds) { + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + is_dead = true; + } + if (is_dead) + continue; + for (unsigned pred_idx : preds) { + /* the variable is not spilled at the predecessor */ + if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) + continue; + + /* variable is spilled at predecessor and has to be reloaded */ + Temp new_name = {ctx.program->allocateId(), pair.first.regClass()}; + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector>::iterator it = std::next(pred.instructions.begin(), idx); + + aco_ptr reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); + pred.instructions.insert(it, std::move(reload)); + + ctx.spills_exit[pred.index].erase(pair.first); + ctx.renames[pred.index][pair.first] = new_name; + } + + /* check if we have to create a new phi for this variable */ + Temp rename = Temp(); + bool is_same = true; + for (unsigned pred_idx : preds) { + if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) { + if (rename == Temp()) + rename = pair.first; + else + is_same = rename == pair.first; + } else { + if (rename == Temp()) + rename = ctx.renames[pred_idx][pair.first]; + else + is_same = rename == ctx.renames[pred_idx][pair.first]; + } + + if (!is_same) + break; + } + + if (!is_same) { + /* the variable was renamed differently in the predecessors: we have to create a phi */ + aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + rename = {ctx.program->allocateId(), pair.first.regClass()}; + for (unsigned i = 0; i < phi->operands.size(); i++) { + Temp tmp; + if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) + tmp = ctx.renames[preds[i]][pair.first]; + else if (preds[i] >= block_idx) + tmp = rename; + else + tmp = pair.first; + phi->operands[i] = Operand(tmp); + } + phi->definitions[0] = Definition(rename); + instructions.emplace_back(std::move(phi)); + } + + /* the variable was renamed: add new name to renames */ + if (!(rename == Temp() || rename == pair.first)) + ctx.renames[block_idx][pair.first] = rename; + } + + /* combine phis with instructions */ + unsigned idx = 0; + while (!block->instructions[idx]) { + idx++; + } + + if (!ctx.processed[block_idx]) { + assert(!(block->kind & block_kind_loop_header)); + RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx); + ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); + ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), demand_before); + } + + std::vector>::iterator start = std::next(block->instructions.begin(), idx); + instructions.insert(instructions.end(), std::move_iterator>::iterator>(start), + std::move_iterator>::iterator>(block->instructions.end())); + block->instructions = std::move(instructions); +} + +void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, + std::map ¤t_spills, RegisterDemand spilled_registers) +{ + assert(!ctx.processed[block_idx]); + + std::vector> local_next_use_distance; + std::vector> instructions; + unsigned idx = 0; + + /* phis are handled separetely */ + while (block->instructions[idx]->opcode == aco_opcode::p_phi || + block->instructions[idx]->opcode == aco_opcode::p_linear_phi) { + aco_ptr& instr = block->instructions[idx]; + for (const Operand& op : instr->operands) { + /* prevent it's definining instruction from being DCE'd if it could be rematerialized */ + if (op.isTemp() && ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + } + instructions.emplace_back(std::move(instr)); + idx++; + } + + if (block->register_demand.exceeds(ctx.target_pressure)) + local_next_use_distance = local_next_uses(ctx, block); + + while (idx < block->instructions.size()) { + aco_ptr& instr = block->instructions[idx]; + + std::map> reloads; + std::map spills; + /* rename and reload operands */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (current_spills.find(op.getTemp()) == current_spills.end()) { + /* the Operand is in register: check if it was renamed */ + if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end()) + op.setTemp(ctx.renames[block_idx][op.getTemp()]); + /* prevent it's definining instruction from being DCE'd if it could be rematerialized */ + if (ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + continue; + } + /* the Operand is spilled: add it to reloads */ + Temp new_tmp = {ctx.program->allocateId(), op.regClass()}; + ctx.renames[block_idx][op.getTemp()] = new_tmp; + reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]); + current_spills.erase(op.getTemp()); + op.setTemp(new_tmp); + spilled_registers -= new_tmp; + } + + /* check if register demand is low enough before and after the current instruction */ + if (block->register_demand.exceeds(ctx.target_pressure)) { + + RegisterDemand new_demand = ctx.register_demand[block_idx][idx]; + new_demand.update(get_demand_before(ctx, block_idx, idx)); + + assert(!local_next_use_distance.empty()); + + /* if reg pressure is too high, spill variable with furthest next use */ + while (RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) { + unsigned distance = 0; + Temp to_spill; + bool do_rematerialize = false; + if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) { + for (std::pair pair : local_next_use_distance[idx]) { + bool can_rematerialize = ctx.remat.count(pair.first); + if (pair.first.type() == RegType::vgpr && + ((pair.second > distance && can_rematerialize == do_rematerialize) || + (can_rematerialize && !do_rematerialize && pair.second > idx)) && + current_spills.find(pair.first) == current_spills.end() && + ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + to_spill = pair.first; + distance = pair.second; + do_rematerialize = can_rematerialize; + } + } + } else { + for (std::pair pair : local_next_use_distance[idx]) { + bool can_rematerialize = ctx.remat.count(pair.first); + if (pair.first.type() == RegType::sgpr && + ((pair.second > distance && can_rematerialize == do_rematerialize) || + (can_rematerialize && !do_rematerialize && pair.second > idx)) && + current_spills.find(pair.first) == current_spills.end() && + ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + to_spill = pair.first; + distance = pair.second; + do_rematerialize = can_rematerialize; + } + } + } + + assert(distance != 0 && distance > idx); + uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass()); + + /* add interferences with currently spilled variables */ + for (std::pair pair : current_spills) { + ctx.interferences[spill_id].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(spill_id); + } + for (std::pair> pair : reloads) { + ctx.interferences[spill_id].second.emplace(pair.second.second); + ctx.interferences[pair.second.second].second.emplace(spill_id); + } + + current_spills[to_spill] = spill_id; + spilled_registers += to_spill; + + /* rename if necessary */ + if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) { + to_spill = ctx.renames[block_idx][to_spill]; + } + + /* add spill to new instructions */ + aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(to_spill); + spill->operands[1] = Operand(spill_id); + instructions.emplace_back(std::move(spill)); + } + } + + /* add reloads and instruction to new instructions */ + for (std::pair> pair : reloads) { + aco_ptr reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second); + instructions.emplace_back(std::move(reload)); + } + instructions.emplace_back(std::move(instr)); + idx++; + } + + block->instructions = std::move(instructions); + ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); +} + +void spill_block(spill_ctx& ctx, unsigned block_idx) +{ + Block* block = &ctx.program->blocks[block_idx]; + + /* determine set of variables which are spilled at the beginning of the block */ + RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx); + + /* add interferences for spilled variables */ + for (std::pair x : ctx.spills_entry[block_idx]) { + for (std::pair y : ctx.spills_entry[block_idx]) + if (x.second != y.second) + ctx.interferences[x.second].second.emplace(y.second); + } + + bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx; + if (!is_loop_header) { + /* add spill/reload code on incoming control flow edges */ + add_coupling_code(ctx, block, block_idx); + } + + std::map current_spills = ctx.spills_entry[block_idx]; + + /* check conditions to process this block */ + bool process = RegisterDemand(block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || + !ctx.renames[block_idx].empty() || + ctx.remat_used.size(); + + std::map::iterator it = current_spills.begin(); + while (!process && it != current_spills.end()) { + if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx) + process = true; + ++it; + } + + if (process) + process_block(ctx, block_idx, block, current_spills, spilled_registers); + else + ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); + + ctx.processed[block_idx] = true; + + /* check if the next block leaves the current loop */ + if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) + return; + + Block* loop_header = ctx.loop_header.top(); + + /* preserve original renames at end of loop header block */ + std::map renames = std::move(ctx.renames[loop_header->index]); + + /* add coupling code to all loop header predecessors */ + add_coupling_code(ctx, loop_header, loop_header->index); + + /* update remat_used for phis added in add_coupling_code() */ + for (aco_ptr& instr : loop_header->instructions) { + if (!is_phi(instr)) + break; + for (const Operand& op : instr->operands) { + if (op.isTemp() && ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + } + } + + /* propagate new renames through loop: i.e. repair the SSA */ + renames.swap(ctx.renames[loop_header->index]); + for (std::pair rename : renames) { + for (unsigned idx = loop_header->index; idx <= block_idx; idx++) { + Block& current = ctx.program->blocks[idx]; + std::vector>::iterator instr_it = current.instructions.begin(); + + /* first rename phis */ + while (instr_it != current.instructions.end()) { + aco_ptr& phi = *instr_it; + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + /* no need to rename the loop header phis once again. this happened in add_coupling_code() */ + if (idx == loop_header->index) { + instr_it++; + continue; + } + + for (Operand& op : phi->operands) { + if (!op.isTemp()) + continue; + if (op.getTemp() == rename.first) + op.setTemp(rename.second); + } + instr_it++; + } + + std::map>::iterator it = ctx.next_use_distances_start[idx].find(rename.first); + + /* variable is not live at beginning of this block */ + if (it == ctx.next_use_distances_start[idx].end()) + continue; + + /* if the variable is live at the block's exit, add rename */ + if (ctx.next_use_distances_end[idx].find(rename.first) != ctx.next_use_distances_end[idx].end()) + ctx.renames[idx].insert(rename); + + /* rename all uses in this block */ + bool renamed = false; + while (!renamed && instr_it != current.instructions.end()) { + aco_ptr& instr = *instr_it; + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.getTemp() == rename.first) { + op.setTemp(rename.second); + /* we can stop with this block as soon as the variable is spilled */ + if (instr->opcode == aco_opcode::p_spill) + renamed = true; + } + } + instr_it++; + } + } + } + + /* remove loop header info from stack */ + ctx.loop_header.pop(); +} + +Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, + std::vector>& instructions, + unsigned offset, bool is_top_level) +{ + Builder bld(ctx.program); + if (is_top_level) { + bld.reset(&instructions); + } else { + /* find p_logical_end */ + unsigned idx = instructions.size() - 1; + while (instructions[idx]->opcode != aco_opcode::p_logical_end) + idx--; + bld.reset(&instructions, std::next(instructions.begin(), idx)); + } + + Temp private_segment_buffer = ctx.program->private_segment_buffer; + if (ctx.program->stage != compute_cs) + private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u)); + + if (offset) + scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset)); + + uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | + S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2); + + if (ctx.program->chip_class >= GFX10) { + rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + /* older generations need element size = 4 bytes. element size removed in GFX9 */ + if (ctx.program->chip_class <= GFX8) + rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); + + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + private_segment_buffer, Operand(-1u), + Operand(rsrc_conf)); +} + +void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { + std::map sgpr_slot; + std::map vgpr_slot; + std::vector is_assigned(ctx.interferences.size()); + + /* first, handle affinities: just merge all interferences into both spill ids */ + for (std::vector& vec : ctx.affinities) { + for (unsigned i = 0; i < vec.size(); i++) { + for (unsigned j = i + 1; j < vec.size(); j++) { + assert(vec[i] != vec[j]); + for (uint32_t id : ctx.interferences[vec[i]].second) + ctx.interferences[id].second.insert(vec[j]); + for (uint32_t id : ctx.interferences[vec[j]].second) + ctx.interferences[id].second.insert(vec[i]); + ctx.interferences[vec[i]].second.insert(ctx.interferences[vec[j]].second.begin(), ctx.interferences[vec[j]].second.end()); + ctx.interferences[vec[j]].second.insert(ctx.interferences[vec[i]].second.begin(), ctx.interferences[vec[i]].second.end()); + + bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]]; + ctx.is_reloaded[vec[i]] = reloaded; + ctx.is_reloaded[vec[j]] = reloaded; + } + } + } + for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++) + for (ASSERTED uint32_t id : ctx.interferences[i].second) + assert(i != id); + + /* for each spill slot, assign as many spill ids as possible */ + std::vector> spill_slot_interferences; + unsigned slot_idx = 0; + bool done = false; + + /* assign sgpr spill slots */ + while (!done) { + done = true; + for (unsigned id = 0; id < ctx.interferences.size(); id++) { + if (is_assigned[id] || !ctx.is_reloaded[id]) + continue; + if (ctx.interferences[id].first.type() != RegType::sgpr) + continue; + + /* check interferences */ + bool interferes = false; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { + if (i == spill_slot_interferences.size()) + spill_slot_interferences.emplace_back(std::set()); + if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / ctx.wave_size != slot_idx / ctx.wave_size) { + interferes = true; + break; + } + } + if (interferes) { + done = false; + continue; + } + + /* we found a spill id which can be assigned to current spill slot */ + sgpr_slot[id] = slot_idx; + is_assigned[id] = true; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) + spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); + + /* add all affinities: there are no additional interferences */ + for (std::vector& vec : ctx.affinities) { + bool found_affinity = false; + for (uint32_t entry : vec) { + if (entry == id) { + found_affinity = true; + break; + } + } + if (!found_affinity) + continue; + for (uint32_t entry : vec) { + sgpr_slot[entry] = slot_idx; + is_assigned[entry] = true; + } + } + } + slot_idx++; + } + + unsigned sgpr_spill_slots = spill_slot_interferences.size(); + spill_slot_interferences.clear(); + slot_idx = 0; + done = false; + + /* assign vgpr spill slots */ + while (!done) { + done = true; + for (unsigned id = 0; id < ctx.interferences.size(); id++) { + if (is_assigned[id] || !ctx.is_reloaded[id]) + continue; + if (ctx.interferences[id].first.type() != RegType::vgpr) + continue; + + /* check interferences */ + bool interferes = false; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { + if (i == spill_slot_interferences.size()) + spill_slot_interferences.emplace_back(std::set()); + /* check for interference and ensure that vector regs are stored next to each other */ + if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end()) { + interferes = true; + break; + } + } + if (interferes) { + done = false; + continue; + } + + /* we found a spill id which can be assigned to current spill slot */ + vgpr_slot[id] = slot_idx; + is_assigned[id] = true; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) + spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); + + /* add all affinities: there are no additional interferences */ + for (std::vector& vec : ctx.affinities) { + bool found_affinity = false; + for (uint32_t entry : vec) { + if (entry == id) { + found_affinity = true; + break; + } + } + if (!found_affinity) + continue; + for (uint32_t entry : vec) { + vgpr_slot[entry] = slot_idx; + is_assigned[entry] = true; + } + } + } + slot_idx++; + } + + unsigned vgpr_spill_slots = spill_slot_interferences.size(); + + for (unsigned id = 0; id < is_assigned.size(); id++) + assert(is_assigned[id] || !ctx.is_reloaded[id]); + + for (std::vector& vec : ctx.affinities) { + for (unsigned i = 0; i < vec.size(); i++) { + for (unsigned j = i + 1; j < vec.size(); j++) { + assert(is_assigned[vec[i]] == is_assigned[vec[j]]); + if (!is_assigned[vec[i]]) + continue; + assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]); + assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type()); + if (ctx.interferences[vec[i]].first.type() == RegType::sgpr) + assert(sgpr_slot[vec[i]] == sgpr_slot[vec[j]]); + else + assert(vgpr_slot[vec[i]] == vgpr_slot[vec[j]]); + } + } + } + + /* hope, we didn't mess up */ + std::vector vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size); + assert(vgpr_spill_temps.size() <= spills_to_vgpr); + + /* replace pseudo instructions with actual hardware instructions */ + Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp(); + unsigned last_top_level_block_idx = 0; + std::vector reload_in_loop(vgpr_spill_temps.size()); + for (Block& block : ctx.program->blocks) { + + /* after loops, we insert a user if there was a reload inside the loop */ + if (block.loop_nest_depth == 0) { + int end_vgprs = 0; + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (reload_in_loop[i]) + end_vgprs++; + } + + if (end_vgprs > 0) { + aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; + int k = 0; + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (reload_in_loop[i]) + destr->operands[k++] = Operand(vgpr_spill_temps[i]); + reload_in_loop[i] = false; + } + /* find insertion point */ + std::vector>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(destr)); + } + } + + if (block.kind & block_kind_top_level && !block.linear_preds.empty()) { + last_top_level_block_idx = block.index; + + /* check if any spilled variables use a created linear vgpr, otherwise destroy them */ + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (vgpr_spill_temps[i] == Temp()) + continue; + + bool can_destroy = true; + for (std::pair pair : ctx.spills_exit[block.linear_preds[0]]) { + + if (sgpr_slot.find(pair.second) != sgpr_slot.end() && + sgpr_slot[pair.second] / ctx.wave_size == i) { + can_destroy = false; + break; + } + } + if (can_destroy) + vgpr_spill_temps[i] = Temp(); + } + } + + std::vector>::iterator it; + std::vector> instructions; + instructions.reserve(block.instructions.size()); + Builder bld(ctx.program, &instructions); + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + + if ((*it)->opcode == aco_opcode::p_spill) { + uint32_t spill_id = (*it)->operands[1].constantValue(); + + if (!ctx.is_reloaded[spill_id]) { + /* never reloaded, so don't spill */ + } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + /* spill vgpr */ + ctx.program->config->spilled_vgprs += (*it)->operands[0].size(); + uint32_t spill_slot = vgpr_slot[spill_id]; + bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; + unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + + /* check if the scratch resource descriptor already exists */ + if (scratch_rsrc == Temp()) { + unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource(ctx, scratch_offset, + last_top_level_block_idx == block.index ? + instructions : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, + last_top_level_block_idx == block.index); + } + + unsigned offset = base_offset + spill_slot * 4; + aco_opcode opcode = aco_opcode::buffer_store_dword; + assert((*it)->operands[0].isTemp()); + Temp temp = (*it)->operands[0].getTemp(); + assert(temp.type() == RegType::vgpr && !temp.is_linear()); + if (temp.size() > 1) { + Instruction* split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; + split->operands[0] = Operand(temp); + for (unsigned i = 0; i < temp.size(); i++) + split->definitions[i] = bld.def(v1); + bld.insert(split); + for (unsigned i = 0; i < temp.size(); i++) + bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); + } else { + bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, temp, offset, false); + } + } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { + ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); + + uint32_t spill_slot = sgpr_slot[spill_id]; + + /* check if the linear vgpr already exists */ + if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { + Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; + vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; + aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(linear_vgpr); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + instructions.emplace_back(std::move(create)); + } else { + assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + } + } + + /* spill sgpr: just add the vgpr temp to operands */ + Pseudo_instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); + spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); + spill->operands[1] = Operand(spill_slot % ctx.wave_size); + spill->operands[2] = (*it)->operands[0]; + instructions.emplace_back(aco_ptr(spill)); + } else { + unreachable("No spill slot assigned for spill id"); + } + + } else if ((*it)->opcode == aco_opcode::p_reload) { + uint32_t spill_id = (*it)->operands[0].constantValue(); + assert(ctx.is_reloaded[spill_id]); + + if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + /* reload vgpr */ + uint32_t spill_slot = vgpr_slot[spill_id]; + bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; + unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + + /* check if the scratch resource descriptor already exists */ + if (scratch_rsrc == Temp()) { + unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource(ctx, scratch_offset, + last_top_level_block_idx == block.index ? + instructions : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, + last_top_level_block_idx == block.index); + } + + unsigned offset = base_offset + spill_slot * 4; + aco_opcode opcode = aco_opcode::buffer_load_dword; + Definition def = (*it)->definitions[0]; + if (def.size() > 1) { + Instruction* vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; + vec->definitions[0] = def; + for (unsigned i = 0; i < def.size(); i++) { + Temp tmp = bld.tmp(v1); + vec->operands[i] = Operand(tmp); + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(), scratch_offset, offset + i * 4, false); + } + bld.insert(vec); + } else { + bld.mubuf(opcode, def, scratch_rsrc, Operand(), scratch_offset, offset, false); + } + } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { + uint32_t spill_slot = sgpr_slot[spill_id]; + reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0; + + /* check if the linear vgpr already exists */ + if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { + Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; + vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; + aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(linear_vgpr); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + instructions.emplace_back(std::move(create)); + } else { + assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + } + } + + /* reload sgpr: just add the vgpr temp to operands */ + Pseudo_instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); + reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); + reload->operands[1] = Operand(spill_slot % ctx.wave_size); + reload->definitions[0] = (*it)->definitions[0]; + instructions.emplace_back(aco_ptr(reload)); + } else { + unreachable("No spill slot assigned for spill id"); + } + } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { + instructions.emplace_back(std::move(*it)); + } + + } + block.instructions = std::move(instructions); + } + + /* update required scratch memory */ + ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024); + + /* SSA elimination inserts copies for logical phis right before p_logical_end + * So if a linear vgpr is used between that p_logical_end and the branch, + * we need to ensure logical phis don't choose a definition which aliases + * the linear vgpr. + * TODO: Moving the spills and reloads to before p_logical_end might produce + * slightly better code. */ + for (Block& block : ctx.program->blocks) { + /* loops exits are already handled */ + if (block.logical_preds.size() <= 1) + continue; + + bool has_logical_phis = false; + for (aco_ptr& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_phi) { + has_logical_phis = true; + break; + } else if (instr->opcode != aco_opcode::p_linear_phi) { + break; + } + } + if (!has_logical_phis) + continue; + + std::set vgprs; + for (unsigned pred_idx : block.logical_preds) { + Block& pred = ctx.program->blocks[pred_idx]; + for (int i = pred.instructions.size() - 1; i >= 0; i--) { + aco_ptr& pred_instr = pred.instructions[i]; + if (pred_instr->opcode == aco_opcode::p_logical_end) { + break; + } else if (pred_instr->opcode == aco_opcode::p_spill || + pred_instr->opcode == aco_opcode::p_reload) { + vgprs.insert(pred_instr->operands[0].getTemp()); + } + } + } + if (!vgprs.size()) + continue; + + aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; + int k = 0; + for (Temp tmp : vgprs) { + destr->operands[k++] = Operand(tmp); + } + /* find insertion point */ + std::vector>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(destr)); + } +} + +} /* end namespace */ + + +void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options) +{ + program->config->spilled_vgprs = 0; + program->config->spilled_sgprs = 0; + + /* no spilling when register pressure is low enough */ + if (program->num_waves > 0) + return; + + /* lower to CSSA before spilling to ensure correctness w.r.t. phis */ + lower_to_cssa(program, live_vars, options); + + /* calculate target register demand */ + RegisterDemand register_target = program->max_reg_demand; + if (register_target.sgpr > program->sgpr_limit) + register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; + register_target.sgpr = program->sgpr_limit; + + if (register_target.vgpr > program->vgpr_limit) + register_target.sgpr = program->sgpr_limit - 5; + int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size; + register_target.vgpr = program->vgpr_limit - spills_to_vgpr; + + /* initialize ctx */ + spill_ctx ctx(register_target, program, live_vars.register_demand); + compute_global_next_uses(ctx, live_vars.live_out); + get_rematerialize_info(ctx); + + /* create spills and reloads */ + for (unsigned i = 0; i < program->blocks.size(); i++) + spill_block(ctx, i); + + /* assign spill slots and DCE rematerialized code */ + assign_spill_slots(ctx, spills_to_vgpr); + + /* update live variable information */ + live_vars = live_var_analysis(program, options); + + assert(program->num_waves > 0); +} + +} + diff -Nru mesa-19.2.8/src/amd/compiler/aco_ssa_elimination.cpp mesa-20.0.8/src/amd/compiler/aco_ssa_elimination.cpp --- mesa-19.2.8/src/amd/compiler/aco_ssa_elimination.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_ssa_elimination.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,302 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + + +#include "aco_ir.h" + +#include + +namespace aco { +namespace { + +/* map: block-id -> pair (dest, src) to store phi information */ +typedef std::map>> phi_info; + +struct ssa_elimination_ctx { + phi_info logical_phi_info; + phi_info linear_phi_info; + std::vector empty_blocks; + Program* program; + + ssa_elimination_ctx(Program* program) : empty_blocks(program->blocks.size(), true), program(program) {} +}; + +void collect_phi_info(ssa_elimination_ctx& ctx) +{ + for (Block& block : ctx.program->blocks) { + for (aco_ptr& phi : block.instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (phi->operands[i].isUndefined()) + continue; + if (phi->operands[i].isTemp() && phi->operands[i].physReg() == phi->definitions[0].physReg()) + continue; + + std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; + phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info; + const auto result = info.emplace(preds[i], std::vector>()); + assert(phi->definitions[0].size() == phi->operands[i].size()); + result.first->second.emplace_back(phi->definitions[0], phi->operands[i]); + ctx.empty_blocks[preds[i]] = false; + } + } + } +} + +void insert_parallelcopies(ssa_elimination_ctx& ctx) +{ + /* insert the parallelcopies from logical phis before p_logical_end */ + for (auto&& entry : ctx.logical_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + unsigned idx = block.instructions.size() - 1; + while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) { + assert(idx > 0); + idx--; + } + + std::vector>::iterator it = std::next(block.instructions.begin(), idx); + aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())}; + unsigned i = 0; + for (std::pair& pair : entry.second) + { + pc->definitions[i] = pair.first; + pc->operands[i] = pair.second; + i++; + } + /* this shouldn't be needed since we're only copying vgprs */ + pc->tmp_in_scc = false; + block.instructions.insert(it, std::move(pc)); + } + + /* insert parallelcopies for the linear phis at the end of blocks just before the branch */ + for (auto&& entry : ctx.linear_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + std::vector>::iterator it = block.instructions.end(); + --it; + assert((*it)->format == Format::PSEUDO_BRANCH); + aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())}; + unsigned i = 0; + for (std::pair& pair : entry.second) + { + pc->definitions[i] = pair.first; + pc->operands[i] = pair.second; + i++; + } + pc->tmp_in_scc = block.scc_live_out; + pc->scratch_sgpr = block.scratch_sgpr; + block.instructions.insert(it, std::move(pc)); + } +} + +bool is_empty_block(Block* block, bool ignore_exec_writes) +{ + /* check if this block is empty and the exec mask is not needed */ + for (aco_ptr& instr : block->instructions) { + switch (instr->opcode) { + case aco_opcode::p_linear_phi: + case aco_opcode::p_phi: + case aco_opcode::p_logical_start: + case aco_opcode::p_logical_end: + case aco_opcode::p_branch: + break; + case aco_opcode::p_parallelcopy: + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ignore_exec_writes && instr->definitions[i].physReg() == exec) + continue; + if (instr->definitions[i].physReg() != instr->operands[i].physReg()) + return false; + } + break; + case aco_opcode::s_andn2_b64: + case aco_opcode::s_andn2_b32: + if (ignore_exec_writes && instr->definitions[0].physReg() == exec) + break; + default: + return false; + } + } + return true; +} + +void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) +{ + /* check if the successor is another merge block which restores exec */ + // TODO: divergent loops also restore exec + if (block->linear_succs.size() != 1 || + !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge)) + return; + + /* check if this block is empty */ + if (!is_empty_block(block, true)) + return; + + /* keep the branch instruction and remove the rest */ + aco_ptr branch = std::move(block->instructions.back()); + block->instructions.clear(); + block->instructions.emplace_back(std::move(branch)); +} + +void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) +{ + assert(block->linear_succs.size() == 2); + /* only remove this block if the successor got removed as well */ + if (block->linear_succs[0] != block->linear_succs[1]) + return; + + /* check if block is otherwise empty */ + if (!is_empty_block(block, true)) + return; + + unsigned succ_idx = block->linear_succs[0]; + assert(block->linear_preds.size() == 2); + for (unsigned i = 0; i < 2; i++) { + Block *pred = &ctx.program->blocks[block->linear_preds[i]]; + pred->linear_succs[0] = succ_idx; + ctx.program->blocks[succ_idx].linear_preds[i] = pred->index; + + Pseudo_branch_instruction *branch = static_cast(pred->instructions.back().get()); + assert(branch->format == Format::PSEUDO_BRANCH); + branch->target[0] = succ_idx; + branch->target[1] = succ_idx; + } + + block->instructions.clear(); + block->linear_preds.clear(); + block->linear_succs.clear(); +} + +void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) +{ + if (!is_empty_block(block, false)) + return; + + Block& pred = ctx.program->blocks[block->linear_preds[0]]; + Block& succ = ctx.program->blocks[block->linear_succs[0]]; + Pseudo_branch_instruction* branch = static_cast(pred.instructions.back().get()); + if (branch->opcode == aco_opcode::p_branch) { + branch->target[0] = succ.index; + branch->target[1] = succ.index; + } else if (branch->target[0] == block->index) { + branch->target[0] = succ.index; + } else if (branch->target[0] == succ.index) { + assert(branch->target[1] == block->index); + branch->target[1] = succ.index; + branch->opcode = aco_opcode::p_branch; + } else if (branch->target[1] == block->index) { + /* check if there is a fall-through path from block to succ */ + bool falls_through = block->index < succ.index; + for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) { + assert(ctx.program->blocks[j].index == j); + if (!ctx.program->blocks[j].instructions.empty()) + falls_through = false; + } + if (falls_through) { + branch->target[1] = succ.index; + } else { + /* check if there is a fall-through path for the alternative target */ + if (block->index >= branch->target[0]) + return; + for (unsigned j = block->index + 1; j < branch->target[0]; j++) { + if (!ctx.program->blocks[j].instructions.empty()) + return; + } + + /* This is a (uniform) break or continue block. The branch condition has to be inverted. */ + if (branch->opcode == aco_opcode::p_cbranch_z) + branch->opcode = aco_opcode::p_cbranch_nz; + else if (branch->opcode == aco_opcode::p_cbranch_nz) + branch->opcode = aco_opcode::p_cbranch_z; + else + assert(false); + /* also invert the linear successors */ + pred.linear_succs[0] = pred.linear_succs[1]; + pred.linear_succs[1] = succ.index; + branch->target[1] = branch->target[0]; + branch->target[0] = succ.index; + } + } else { + assert(false); + } + + if (branch->target[0] == branch->target[1]) + branch->opcode = aco_opcode::p_branch; + + for (unsigned i = 0; i < pred.linear_succs.size(); i++) + if (pred.linear_succs[i] == block->index) + pred.linear_succs[i] = succ.index; + + for (unsigned i = 0; i < succ.linear_preds.size(); i++) + if (succ.linear_preds[i] == block->index) + succ.linear_preds[i] = pred.index; + + block->instructions.clear(); + block->linear_preds.clear(); + block->linear_succs.clear(); +} + +void jump_threading(ssa_elimination_ctx& ctx) +{ + for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { + Block* block = &ctx.program->blocks[i]; + + if (!ctx.empty_blocks[i]) + continue; + + if (block->kind & block_kind_invert) { + try_remove_invert_block(ctx, block); + continue; + } + + if (block->linear_succs.size() > 1) + continue; + + if (block->kind & block_kind_merge || + block->kind & block_kind_loop_exit) + try_remove_merge_block(ctx, block); + + if (block->linear_preds.size() == 1) + try_remove_simple_block(ctx, block); + } +} + +} /* end namespace */ + + +void ssa_elimination(Program* program) +{ + ssa_elimination_ctx ctx(program); + + /* Collect information about every phi-instruction */ + collect_phi_info(ctx); + + /* eliminate empty blocks */ + jump_threading(ctx); + + /* insert parallelcopies from SSA elimination */ + insert_parallelcopies(ctx); + +} +} diff -Nru mesa-19.2.8/src/amd/compiler/aco_util.h mesa-20.0.8/src/amd/compiler/aco_util.h --- mesa-19.2.8/src/amd/compiler/aco_util.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_util.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,233 @@ +/* + * Copyright Michael Schellenberger Costa + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef ACO_UTIL_H +#define ACO_UTIL_H + +#include +#include + +namespace aco { + +/*! \brief Definition of a span object +* +* \details A "span" is an "array view" type for holding a view of contiguous +* data. The "span" object does not own the data itself. +*/ +template +class span { +public: + using value_type = T; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using size_type = uint16_t; + using difference_type = ptrdiff_t; + + /*! \brief Compiler generated default constructor + */ + constexpr span() = default; + + /*! \brief Constructor taking a pointer and the length of the span + * \param[in] data Pointer to the underlying data array + * \param[in] length The size of the span + */ + constexpr span(uint16_t offset, const size_type length) + : offset{ offset } , length{ length } {} + + /*! \brief Returns an iterator to the begin of the span + * \return data + */ + constexpr iterator begin() noexcept { + return (pointer)((uintptr_t)this + offset); + } + + /*! \brief Returns a const_iterator to the begin of the span + * \return data + */ + constexpr const_iterator begin() const noexcept { + return (const_pointer)((uintptr_t)this + offset); + } + + /*! \brief Returns an iterator to the end of the span + * \return data + length + */ + constexpr iterator end() noexcept { + return std::next(begin(), length); + } + + /*! \brief Returns a const_iterator to the end of the span + * \return data + length + */ + constexpr const_iterator end() const noexcept { + return std::next(begin(), length); + } + + /*! \brief Returns a const_iterator to the begin of the span + * \return data + */ + constexpr const_iterator cbegin() const noexcept { + return begin(); + } + + /*! \brief Returns a const_iterator to the end of the span + * \return data + length + */ + constexpr const_iterator cend() const noexcept { + return std::next(begin(), length); + } + + /*! \brief Returns a reverse_iterator to the end of the span + * \return reverse_iterator(end()) + */ + constexpr reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + + /*! \brief Returns a const_reverse_iterator to the end of the span + * \return reverse_iterator(end()) + */ + constexpr const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + + /*! \brief Returns a reverse_iterator to the begin of the span + * \return reverse_iterator(begin()) + */ + constexpr reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + + /*! \brief Returns a const_reverse_iterator to the begin of the span + * \return reverse_iterator(begin()) + */ + constexpr const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } + + /*! \brief Returns a const_reverse_iterator to the end of the span + * \return rbegin() + */ + constexpr const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(cend()); + } + + /*! \brief Returns a const_reverse_iterator to the begin of the span + * \return rend() + */ + constexpr const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(cbegin()); + } + + /*! \brief Unchecked access operator + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr reference operator[](const size_type index) noexcept { + assert(length > index); + return *(std::next(begin(), index)); + } + + /*! \brief Unchecked const access operator + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr const_reference operator[](const size_type index) const noexcept { + assert(length > index); + return *(std::next(begin(), index)); + } + + /*! \brief Returns a reference to the last element of the span + * \return *(std::next(data, length - 1)) + */ + constexpr reference back() noexcept { + assert(length > 0); + return *(std::next(begin(), length - 1)); + } + + /*! \brief Returns a const_reference to the last element of the span + * \return *(std::next(data, length - 1)) + */ + constexpr const_reference back() const noexcept { + assert(length > 0); + return *(std::next(begin(), length - 1)); + } + + /*! \brief Returns a reference to the first element of the span + * \return *begin() + */ + constexpr reference front() noexcept { + assert(length > 0); + return *begin(); + } + + /*! \brief Returns a const_reference to the first element of the span + * \return *cbegin() + */ + constexpr const_reference front() const noexcept { + assert(length > 0); + return *cbegin(); + } + + /*! \brief Returns true if the span is empty + * \return length == 0 + */ + constexpr bool empty() const noexcept { + return length == 0; + } + + /*! \brief Returns the size of the span + * \return length == 0 + */ + constexpr size_type size() const noexcept { + return length; + } + + /*! \brief Decreases the size of the span by 1 + */ + constexpr void pop_back() noexcept { + assert(length > 0); + --length; + } + + /*! \brief Clears the span + */ + constexpr void clear() noexcept { + offset = 0; + length = 0; + } + +private: + uint16_t offset{ 0 }; //!> Byte offset from span to data + size_type length{ 0 }; //!> Size of the span +}; + +} // namespace aco + +#endif // ACO_UTIL_H diff -Nru mesa-19.2.8/src/amd/compiler/aco_validate.cpp mesa-20.0.8/src/amd/compiler/aco_validate.cpp --- mesa-19.2.8/src/amd/compiler/aco_validate.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/aco_validate.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,529 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +#include +#include + +namespace aco { + +#ifndef NDEBUG +void perfwarn(bool cond, const char *msg, Instruction *instr) +{ + if (cond) { + fprintf(stderr, "ACO performance warning: %s\n", msg); + if (instr) { + fprintf(stderr, "instruction: "); + aco_print_instr(instr, stderr); + fprintf(stderr, "\n"); + } + + if (debug_flags & DEBUG_PERFWARN) + exit(1); + } +} +#endif + +void validate(Program* program, FILE * output) +{ + if (!(debug_flags & DEBUG_VALIDATE)) + return; + + bool is_valid = true; + auto check = [&output, &is_valid](bool check, const char * msg, aco::Instruction * instr) -> void { + if (!check) { + fprintf(output, "%s: ", msg); + aco_print_instr(instr, output); + fprintf(output, "\n"); + is_valid = false; + } + }; + auto check_block = [&output, &is_valid](bool check, const char * msg, aco::Block * block) -> void { + if (!check) { + fprintf(output, "%s: BB%u\n", msg, block->index); + is_valid = false; + } + }; + + for (Block& block : program->blocks) { + for (aco_ptr& instr : block.instructions) { + + /* check base format */ + Format base_format = instr->format; + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA); + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP); + if ((uint32_t)base_format & (uint32_t)Format::VOP1) + base_format = Format::VOP1; + else if ((uint32_t)base_format & (uint32_t)Format::VOP2) + base_format = Format::VOP2; + else if ((uint32_t)base_format & (uint32_t)Format::VOPC) + base_format = Format::VOPC; + else if ((uint32_t)base_format & (uint32_t)Format::VINTRP) + base_format = Format::VINTRP; + check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get()); + + /* check VOP3 modifiers */ + if (((uint32_t)instr->format & (uint32_t)Format::VOP3) && instr->format != Format::VOP3) { + check(base_format == Format::VOP2 || + base_format == Format::VOP1 || + base_format == Format::VOPC || + base_format == Format::VINTRP, + "Format cannot have VOP3A/VOP3B applied", instr.get()); + } + + /* check for undefs */ + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isUndefined()) { + bool flat = instr->format == Format::FLAT || instr->format == Format::SCRATCH || instr->format == Format::GLOBAL; + bool can_be_undef = is_phi(instr) || instr->format == Format::EXP || + instr->format == Format::PSEUDO_REDUCTION || + (flat && i == 1) || (instr->format == Format::MIMG && i == 1) || + ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 1); + check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); + } + } + + if (instr->isSALU() || instr->isVALU()) { + /* check literals */ + Operand literal(s1); + for (unsigned i = 0; i < instr->operands.size(); i++) + { + Operand op = instr->operands[i]; + if (!op.isLiteral()) + continue; + + check(instr->format == Format::SOP1 || + instr->format == Format::SOP2 || + instr->format == Format::SOPC || + instr->format == Format::VOP1 || + instr->format == Format::VOP2 || + instr->format == Format::VOPC || + (instr->isVOP3() && program->chip_class >= GFX10), + "Literal applied on wrong instruction format", instr.get()); + + check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == op.constantValue()), "Only 1 Literal allowed", instr.get()); + literal = op; + check(!instr->isVALU() || instr->isVOP3() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get()); + } + + /* check num sgprs for VALU */ + if (instr->isVALU()) { + bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || + instr->opcode == aco_opcode::v_lshrrev_b64 || + instr->opcode == aco_opcode::v_ashrrev_i64; + unsigned const_bus_limit = 1; + if (program->chip_class >= GFX10 && !is_shift64) + const_bus_limit = 2; + + check(instr->definitions[0].getTemp().type() == RegType::vgpr || + (int) instr->format & (int) Format::VOPC || + instr->opcode == aco_opcode::v_readfirstlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64, + "Wrong Definition type for VALU instruction", instr.get()); + unsigned num_sgprs = 0; + unsigned sgpr[] = {0, 0}; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + Operand op = instr->operands[i]; + if (instr->opcode == aco_opcode::v_readfirstlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64 || + instr->opcode == aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64) { + check(!op.isLiteral(), "No literal allowed on VALU instruction", instr.get()); + check(i == 1 || (op.isTemp() && op.regClass() == v1), "Wrong Operand type for VALU instruction", instr.get()); + continue; + } + if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) { + check(i != 1 || instr->isVOP3(), "Wrong source position for SGPR argument", instr.get()); + + if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) { + if (num_sgprs < 2) + sgpr[num_sgprs++] = op.tempId(); + } + } + + if (op.isConstant() && !op.isLiteral()) + check(i == 0 || instr->isVOP3(), "Wrong source position for constant argument", instr.get()); + } + check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get()); + } + + if (instr->format == Format::SOP1 || instr->format == Format::SOP2) { + check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get()); + for (const Operand& op : instr->operands) { + check(op.isConstant() || op.regClass().type() <= RegType::sgpr, + "Wrong Operand type for SALU instruction", instr.get()); + } + } + } + + switch (instr->format) { + case Format::PSEUDO: { + if (instr->opcode == aco_opcode::p_create_vector) { + unsigned size = 0; + for (const Operand& op : instr->operands) { + size += op.size(); + } + check(size == instr->definitions[0].size(), "Definition size does not match operand sizes", instr.get()); + if (instr->definitions[0].getTemp().type() == RegType::sgpr) { + for (const Operand& op : instr->operands) { + check(op.isConstant() || op.regClass().type() == RegType::sgpr, + "Wrong Operand type for scalar vector", instr.get()); + } + } + } else if (instr->opcode == aco_opcode::p_extract_vector) { + check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get()); + check(instr->operands[1].constantValue() < instr->operands[0].size(), "Index out of range", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr, + "Cannot extract SGPR value from VGPR vector", instr.get()); + } else if (instr->opcode == aco_opcode::p_parallelcopy) { + check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get()); + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp()) + check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) || + (instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr), + "Operand and Definition types do not match", instr.get()); + } + } else if (instr->opcode == aco_opcode::p_phi) { + check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == program->lane_mask, "Logical Phi Definition must be vgpr or divergent boolean", instr.get()); + } else if (instr->opcode == aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) + check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get()); + check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); + } + break; + } + case Format::SMEM: { + if (instr->operands.size() >= 1) + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "SMEM operands must be sgpr", instr.get()); + if (instr->operands.size() >= 2) + check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr), + "SMEM offset must be constant or sgpr", instr.get()); + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get()); + break; + } + case Format::MTBUF: + case Format::MUBUF: { + check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get()); + check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr, + "VADDR must be in vgpr for VMEM instructions", instr.get()); + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get()); + check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get()); + break; + } + case Format::MIMG: { + check(instr->operands.size() == 3, "MIMG instructions must have exactly 3 operands", instr.get()); + check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8), + "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get()); + if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr) + check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get()); + else if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr) + check((instr->definitions.empty() || instr->definitions[0].regClass() == instr->operands[1].regClass() || + instr->opcode == aco_opcode::image_atomic_cmpswap || instr->opcode == aco_opcode::image_atomic_fcmpswap), + "MIMG operands[1] (VDATA) must be the same as definitions[0] for atomics", instr.get()); + check(instr->operands[2].hasRegClass() && instr->operands[2].regClass().type() == RegType::vgpr, + "MIMG operands[2] (VADDR) must be VGPR", instr.get()); + check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr), + "MIMG definitions[0] (VDATA) must be VGPR", instr.get()); + break; + } + case Format::DS: { + for (const Operand& op : instr->operands) { + check((op.isTemp() && op.regClass().type() == RegType::vgpr) || op.physReg() == m0, + "Only VGPRs are valid DS instruction operands", instr.get()); + } + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get()); + break; + } + case Format::EXP: { + for (unsigned i = 0; i < 4; i++) + check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr, + "Only VGPRs are valid Export arguments", instr.get()); + break; + } + case Format::FLAT: + check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get()); + /* fallthrough */ + case Format::GLOBAL: + case Format::SCRATCH: { + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get()); + check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr, + "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get()); + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get()); + else + check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get()); + break; + } + default: + break; + } + } + } + + /* validate CFG */ + for (unsigned i = 0; i < program->blocks.size(); i++) { + Block& block = program->blocks[i]; + check_block(block.index == i, "block.index must match actual index", &block); + + /* predecessors/successors should be sorted */ + for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++) + check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block); + for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++) + check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block); + for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++) + check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block); + for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++) + check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block); + + /* critical edges are not allowed */ + if (block.linear_preds.size() > 1) { + for (unsigned pred : block.linear_preds) + check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]); + for (unsigned pred : block.logical_preds) + check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]); + } + } + + assert(is_valid); +} + +/* RA validation */ +namespace { + +struct Location { + Location() : block(NULL), instr(NULL) {} + + Block *block; + Instruction *instr; //NULL if it's the block's live-in +}; + +struct Assignment { + Location defloc; + Location firstloc; + PhysReg reg; +}; + +bool ra_fail(FILE *output, Location loc, Location loc2, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + char msg[1024]; + vsprintf(msg, fmt, args); + va_end(args); + + fprintf(stderr, "RA error found at instruction in BB%d:\n", loc.block->index); + if (loc.instr) { + aco_print_instr(loc.instr, stderr); + fprintf(stderr, "\n%s", msg); + } else { + fprintf(stderr, "%s", msg); + } + if (loc2.block) { + fprintf(stderr, " in BB%d:\n", loc2.block->index); + aco_print_instr(loc2.instr, stderr); + } + fprintf(stderr, "\n\n"); + + return true; +} + +} /* end namespace */ + +bool validate_ra(Program *program, const struct radv_nir_compiler_options *options, FILE *output) { + if (!(debug_flags & DEBUG_VALIDATE_RA)) + return false; + + bool err = false; + aco::live live_vars = aco::live_var_analysis(program, options); + std::vector> phi_sgpr_ops(program->blocks.size()); + + std::map assignments; + for (Block& block : program->blocks) { + Location loc; + loc.block = █ + for (aco_ptr& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_phi) { + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && + instr->operands[i].getTemp().type() == RegType::sgpr && + instr->operands[i].isFirstKill()) + phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp()); + } + } + + loc.instr = instr.get(); + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& op = instr->operands[i]; + if (!op.isTemp()) + continue; + if (!op.isFixed()) + err |= ra_fail(output, loc, Location(), "Operand %d is not assigned a register", i); + if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg()) + err |= ra_fail(output, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i); + if ((op.getTemp().type() == RegType::vgpr && op.physReg() + op.size() > 256 + program->config->num_vgprs) || + (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit)) + err |= ra_fail(output, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i); + if (!assignments[op.tempId()].firstloc.block) + assignments[op.tempId()].firstloc = loc; + if (!assignments[op.tempId()].defloc.block) + assignments[op.tempId()].reg = op.physReg(); + } + + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (!def.isTemp()) + continue; + if (!def.isFixed()) + err |= ra_fail(output, loc, Location(), "Definition %d is not assigned a register", i); + if (assignments[def.tempId()].defloc.block) + err |= ra_fail(output, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId()); + if ((def.getTemp().type() == RegType::vgpr && def.physReg() + def.size() > 256 + program->config->num_vgprs) || + (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit)) + err |= ra_fail(output, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i); + if (!assignments[def.tempId()].firstloc.block) + assignments[def.tempId()].firstloc = loc; + assignments[def.tempId()].defloc = loc; + assignments[def.tempId()].reg = def.physReg(); + } + } + } + + for (Block& block : program->blocks) { + Location loc; + loc.block = █ + + std::array regs; + regs.fill(0); + + std::set live; + live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end()); + /* remove killed p_phi sgpr operands */ + for (Temp tmp : phi_sgpr_ops[block.index]) + live.erase(tmp); + + /* check live out */ + for (Temp tmp : live) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) { + if (regs[reg + i]) { + err |= ra_fail(output, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg + i]); + } + regs[reg + i] = tmp.id(); + } + } + regs.fill(0); + + for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); ++it) { + aco_ptr& instr = *it; + + /* check killed p_phi sgpr operands */ + if (instr->opcode == aco_opcode::p_logical_end) { + for (Temp tmp : phi_sgpr_ops[block.index]) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) { + if (regs[reg + i]) + err |= ra_fail(output, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg + i]); + } + live.emplace(tmp); + } + } + + for (const Definition& def : instr->definitions) { + if (!def.isTemp()) + continue; + live.erase(def.getTemp()); + } + + /* don't count phi operands as live-in, since they are actually + * killed when they are copied at the predecessor */ + if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + live.insert(op.getTemp()); + } + } + } + + for (Temp tmp : live) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) + regs[reg + i] = tmp.id(); + } + + for (aco_ptr& instr : block.instructions) { + loc.instr = instr.get(); + + /* remove killed p_phi operands from regs */ + if (instr->opcode == aco_opcode::p_logical_end) { + for (Temp tmp : phi_sgpr_ops[block.index]) { + PhysReg reg = assignments.at(tmp.id()).reg; + regs[reg] = 0; + } + } + + if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.isFirstKill()) { + for (unsigned j = 0; j < op.getTemp().size(); j++) + regs[op.physReg() + j] = 0; + } + } + } + + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (!def.isTemp()) + continue; + Temp tmp = def.getTemp(); + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned j = 0; j < tmp.size(); j++) { + if (regs[reg + j]) + err |= ra_fail(output, loc, assignments.at(regs[reg + i]).defloc, "Assignment of element %d of %%%d already taken by %%%d from instruction", i, tmp.id(), regs[reg + j]); + regs[reg + j] = tmp.id(); + } + } + + for (const Definition& def : instr->definitions) { + if (!def.isTemp()) + continue; + if (def.isKill()) { + for (unsigned j = 0; j < def.getTemp().size(); j++) + regs[def.physReg() + j] = 0; + } + } + } + } + + return err; +} +} diff -Nru mesa-19.2.8/src/amd/compiler/meson.build mesa-20.0.8/src/amd/compiler/meson.build --- mesa-19.2.8/src/amd/compiler/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,107 @@ +# Copyright © 2018 Valve Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +aco_depends = files('aco_opcodes.py') + +aco_opcodes_h = custom_target( + 'aco_opcodes.h', + input : 'aco_opcodes_h.py', + output : 'aco_opcodes.h', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +aco_opcodes_c = custom_target( + 'aco_opcodes.cpp', + input : 'aco_opcodes_cpp.py', + output : 'aco_opcodes.cpp', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +aco_builder_h = custom_target( + 'aco_builder.h', + input : 'aco_builder_h.py', + output : 'aco_builder.h', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +# Headers-only dependency +idep_aco_headers = declare_dependency( + sources : [aco_opcodes_h], + include_directories : include_directories('.'), +) + +libaco_files = files( + 'aco_dead_code_analysis.cpp', + 'aco_dominance.cpp', + 'aco_instruction_selection.cpp', + 'aco_instruction_selection_setup.cpp', + 'aco_interface.cpp', + 'aco_interface.h', + 'aco_ir.h', + 'aco_assembler.cpp', + 'aco_insert_exec_mask.cpp', + 'aco_insert_NOPs.cpp', + 'aco_insert_waitcnt.cpp', + 'aco_reduce_assign.cpp', + 'aco_register_allocation.cpp', + 'aco_live_var_analysis.cpp', + 'aco_lower_bool_phis.cpp', + 'aco_lower_to_cssa.cpp', + 'aco_lower_to_hw_instr.cpp', + 'aco_optimizer.cpp', + 'aco_opt_value_numbering.cpp', + 'aco_print_asm.cpp', + 'aco_print_ir.cpp', + 'aco_scheduler.cpp', + 'aco_ssa_elimination.cpp', + 'aco_spill.cpp', + 'aco_util.h', + 'aco_validate.cpp', +) + +_libaco = static_library( + 'aco', + [libaco_files, aco_opcodes_c, aco_opcodes_h, aco_builder_h], + include_directories : [ + inc_common, inc_compiler, inc_mesa, inc_mapi, inc_amd, inc_amd_common, inc_amd_common_llvm, + ], + link_with : [ + libamd_common + ], + dependencies : [ + dep_llvm, dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, + idep_nir_headers, idep_amdgfxregs_h, + ], + c_args : [c_vis_args], + cpp_args : [cpp_vis_args], + build_by_default : true, +) + +# Also link with aco +idep_aco = declare_dependency( + dependencies : idep_aco_headers, + link_with : _libaco, +) diff -Nru mesa-19.2.8/src/amd/compiler/README.md mesa-20.0.8/src/amd/compiler/README.md --- mesa-19.2.8/src/amd/compiler/README.md 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/compiler/README.md 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,219 @@ +# Unofficial GCN/RDNA ISA reference errata + +## v_sad_u32 + +The Vega ISA reference writes it's behaviour as: +``` +D.u = abs(S0.i - S1.i) + S2.u. +``` +This is incorrect. The actual behaviour is what is written in the GCN3 reference +guide: +``` +ABS_DIFF (A,B) = (A>B) ? (A-B) : (B-A) +D.u = ABS_DIFF (S0.u,S1.u) + S2.u +``` +The instruction doesn't subtract the S0 and S1 and use the absolute value (the +_signed_ distance), it uses the _unsigned_ distance between the operands. So +`v_sad_u32(-5, 0, 0)` would return `4294967291` (`-5` interpreted as unsigned), +not `5`. + +## s_bfe_* + +Both the Vega and GCN3 ISA references write that these instructions don't write +SCC. They do. + +## v_bcnt_u32_b32 + +The Vega ISA reference writes it's behaviour as: +``` +D.u = 0; +for i in 0 ... 31 do +D.u += (S0.u[i] == 1 ? 1 : 0); +endfor. +``` +This is incorrect. The actual behaviour (and number of operands) is what +is written in the GCN3 reference guide: +``` +D.u = CountOneBits(S0.u) + S1.u. +``` + +## SMEM stores + +The Vega ISA references doesn't say this (or doesn't make it clear), but +the offset for SMEM stores must be in m0 if IMM == 0. + +The RDNA ISA doesn't mention SMEM stores at all, but they seem to be supported +by the chip and are present in LLVM. AMD devs however highly recommend avoiding +these instructions. + +## SMEM atomics + +RDNA ISA: same as the SMEM stores, the ISA pretends they don't exist, but they +are there in LLVM. + +## VMEM stores + +All reference guides say (under "Vector Memory Instruction Data Dependencies"): +> When a VM instruction is issued, the address is immediately read out of VGPRs +> and sent to the texture cache. Any texture or buffer resources and samplers +> are also sent immediately. However, write-data is not immediately sent to the +> texture cache. +Reading that, one might think that waitcnts need to be added when writing to +the registers used for a VMEM store's data. Experimentation has shown that this +does not seem to be the case on GFX8 and GFX9 (GFX6 and GFX7 are untested). It +also seems unlikely, since NOPs are apparently needed in a subset of these +situations. + +## MIMG opcodes on GFX8/GCN3 + +The `image_atomic_{swap,cmpswap,add,sub}` opcodes in the GCN3 ISA reference +guide are incorrect. The Vega ISA reference guide has the correct ones. + +## VINTRP encoding + +VEGA ISA doc says the encoding should be `110010` but `110101` works. + +## VOP1 instructions encoded as VOP3 + +RDNA ISA doc says that `0x140` should be added to the opcode, but that doesn't +work. What works is adding `0x180`, which LLVM also does. + +## FLAT, Scratch, Global instructions + +The NV bit was removed in RDNA, but some parts of the doc still mention it. + +RDNA ISA doc 13.8.1 says that SADDR should be set to 0x7f when ADDR is used, but +9.3.1 says it should be set to NULL. We assume 9.3.1 is correct and set it to +SGPR_NULL. + +## Legacy instructions + +Some instructions have a `_LEGACY` variant which implements "DX9 rules", in which +the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA +mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA. + +## RDNA L0, L1 cache and DLC, GLC bits + +The old L1 cache was renamed to L0, and a new L1 cache was added to RDNA. The +L1 cache is 1 cache per shader array. Some instruction encodings have DLC and +GLC bits that interact with the cache. + +* DLC ("device level coherent") bit: controls the L1 cache +* GLC ("globally coherent") bit: controls the L0 cache + +The recommendation from AMD devs is to always set these two bits at the same time, +as it doesn't make too much sense to set them independently, aside from some +circumstances (eg. we needn't set DLC when only one shader array is used). + +Stores and atomics always bypass the L1 cache, so they don't support the DLC bit, +and it shouldn't be set in these cases. Setting the DLC for these cases can result +in graphical glitches. + +## RDNA S_DCACHE_WB + +The S_DCACHE_WB is not mentioned in the RDNA ISA doc, but it is needed in order +to achieve correct behavior in some SSBO CTS tests. + +## RDNA subvector mode + +The documentation of S_SUBVECTOR_LOOP_BEGIN and S_SUBVECTOR_LOOP_END is not clear +on what sort of addressing should be used, but it says that it +"is equivalent to an S_CBRANCH with extra math", so the subvector loop handling +in ACO is done according to the S_CBRANCH doc. + +# Hardware Bugs + +## SMEM corrupts VCCZ on SI/CI + +https://github.com/llvm/llvm-project/blob/acb089e12ae48b82c0b05c42326196a030df9b82/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp#L580-L616 +After issuing a SMEM instructions, we need to wait for the SMEM instructions to +finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz + +Currently, we don't do this. + +## GCN / GFX6 hazards + +### VINTRP followed by a read with v_readfirstlane or v_readlane + +It's required to insert 1 wait state if the dst VGPR of any v_interp_* is +followed by a read with v_readfirstlane or v_readlane to fix GPU hangs on GFX6. +Note that v_writelane_* is apparently not affected. This hazard isn't +documented anywhere but AMD confirmed it. + +## RDNA / GFX10 hazards + +### SMEM store followed by a load with the same address + +We found that an `s_buffer_load` will produce incorrect results if it is preceded +by an `s_buffer_store` with the same address. Inserting an `s_nop` between them +does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted. +This is not mentioned by LLVM among the other GFX10 bugs, but LLVM doesn't use +SMEM stores, so it's not surprising that they didn't notice it. + +### VMEMtoScalarWriteHazard + +Triggered by: +VMEM/FLAT/GLOBAL/SCRATCH/DS instruction reads an SGPR (or EXEC, or M0). +Then, a SALU/SMEM instruction writes the same SGPR. + +Mitigated by: +A VALU instruction or an `s_waitcnt vmcnt(0)` between the two instructions. + +### SMEMtoVectorWriteHazard + +Triggered by: +An SMEM instruction reads an SGPR. Then, a VALU instruction writes that same SGPR. + +Mitigated by: +Any non-SOPP SALU instruction (except `s_setvskip`, `s_version`, and any non-lgkmcnt `s_waitcnt`). + +### Offset3fBug + +Any branch that is located at offset 0x3f will be buggy. Just insert some NOPs to make sure no branch +is located at this offset. + +### InstFwdPrefetchBug + +According to LLVM, the `s_inst_prefetch` instruction can cause a hang. +There are no further details. + +### LdsMisalignedBug + +When there is a misaligned multi-dword FLAT load/store instruction in WGP mode, +it needs to be split into multiple single-dword FLAT instructions. + +ACO doesn't use FLAT load/store on GFX10, so is unaffected. + +### FlatSegmentOffsetBug + +The 12-bit immediate OFFSET field of FLAT instructions must always be 0. +GLOBAL and SCRATCH are unaffected. + +ACO doesn't use FLAT load/store on GFX10, so is unaffected. + +### VcmpxPermlaneHazard + +Triggered by: +Any permlane instruction that follows any VOPC instruction. +Confirmed by AMD devs that despite the name, this doesn't only affect v_cmpx. + +Mitigated by: any VALU instruction except `v_nop`. + +### VcmpxExecWARHazard + +Triggered by: +Any non-VALU instruction reads the EXEC mask. Then, any VALU instruction writes the EXEC mask. + +Mitigated by: +A VALU instruction that writes an SGPR (or has a valid SDST operand), or `s_waitcnt_depctr 0xfffe`. +Note: `s_waitcnt_depctr` is an internal instruction, so there is no further information +about what it does or what its operand means. + +### LdsBranchVmemWARHazard + +Triggered by: +VMEM/GLOBAL/SCRATCH instruction, then a branch, then a DS instruction, +or vice versa: DS instruction, then a branch, then a VMEM/GLOBAL/SCRATCH instruction. + +Mitigated by: +Only `s_waitcnt_vscnt null, 0`. Needed even if the first instruction is a load. diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_build.c mesa-20.0.8/src/amd/llvm/ac_llvm_build.c --- mesa-19.2.8/src/amd/llvm/ac_llvm_build.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_build.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,5069 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ +#include "ac_llvm_build.h" + +#include +#include + +#include "c11/threads.h" + +#include +#include + +#include "ac_llvm_util.h" +#include "ac_shader_util.h" +#include "ac_exp_param.h" +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/u_atomic.h" +#include "util/u_math.h" +#include "sid.h" + +#include "shader_enums.h" + +#define AC_LLVM_INITIAL_CF_DEPTH 4 + +/* Data for if/else/endif and bgnloop/endloop control flow structures. + */ +struct ac_llvm_flow { + /* Loop exit or next part of if/else/endif. */ + LLVMBasicBlockRef next_block; + LLVMBasicBlockRef loop_entry_block; +}; + +/* Initialize module-independent parts of the context. + * + * The caller is responsible for initializing ctx::module and ctx::builder. + */ +void +ac_llvm_context_init(struct ac_llvm_context *ctx, + struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits) +{ + ctx->context = LLVMContextCreate(); + + ctx->chip_class = chip_class; + ctx->family = family; + ctx->wave_size = wave_size; + ctx->ballot_mask_bits = ballot_mask_bits; + ctx->float_mode = float_mode; + ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32 + : compiler->tm, + ctx->context); + ctx->builder = ac_create_builder(ctx->context, float_mode); + + ctx->voidt = LLVMVoidTypeInContext(ctx->context); + ctx->i1 = LLVMInt1TypeInContext(ctx->context); + ctx->i8 = LLVMInt8TypeInContext(ctx->context); + ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); + ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); + ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->i128 = LLVMIntTypeInContext(ctx->context, 128); + ctx->intptr = ctx->i32; + ctx->f16 = LLVMHalfTypeInContext(ctx->context); + ctx->f32 = LLVMFloatTypeInContext(ctx->context); + ctx->f64 = LLVMDoubleTypeInContext(ctx->context); + ctx->v2i16 = LLVMVectorType(ctx->i16, 2); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v3i32 = LLVMVectorType(ctx->i32, 3); + ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v2f32 = LLVMVectorType(ctx->f32, 2); + ctx->v3f32 = LLVMVectorType(ctx->f32, 3); + ctx->v4f32 = LLVMVectorType(ctx->f32, 4); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); + ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); + + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); + ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); + ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); + ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); + ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); + ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); + ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false); + ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false); + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); + ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); + ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); + ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); + ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); + + ctx->i1false = LLVMConstInt(ctx->i1, 0, false); + ctx->i1true = LLVMConstInt(ctx->i1, 1, false); + + ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "range", 5); + + ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "invariant.load", 14); + + ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, + "amdgpu.uniform", 14); + + ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); + ctx->flow = calloc(1, sizeof(*ctx->flow)); +} + +void +ac_llvm_context_dispose(struct ac_llvm_context *ctx) +{ + free(ctx->flow->stack); + free(ctx->flow); + ctx->flow = NULL; +} + +int +ac_get_llvm_num_components(LLVMValueRef value) +{ + LLVMTypeRef type = LLVMTypeOf(value); + unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind + ? LLVMGetVectorSize(type) + : 1; + return num_components; +} + +LLVMValueRef +ac_llvm_extract_elem(struct ac_llvm_context *ac, + LLVMValueRef value, + int index) +{ + if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { + assert(index == 0); + return value; + } + + return LLVMBuildExtractElement(ac->builder, value, + LLVMConstInt(ac->i32, index, false), ""); +} + +int +ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) +{ + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) + return LLVMGetIntTypeWidth(type); + + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS) + return 32; + } + + if (type == ctx->f16) + return 16; + if (type == ctx->f32) + return 32; + if (type == ctx->f64) + return 64; + + unreachable("Unhandled type kind in get_elem_bits"); +} + +unsigned +ac_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMHalfTypeKind: + return 2; + case LLVMFloatTypeKind: + return 4; + case LLVMDoubleTypeKind: + return 8; + case LLVMPointerTypeKind: + if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) + return 4; + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + ac_get_type_size(LLVMGetElementType(type)); + case LLVMArrayTypeKind: + return LLVMGetArrayLength(type) * + ac_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->f16 || t == ctx->i16) + return ctx->i16; + else if (t == ctx->f32 || t == ctx->i32) + return ctx->i32; + else if (t == ctx->f64 || t == ctx->i64) + return ctx->i64; + else + unreachable("Unhandled integer size"); +} + +LLVMTypeRef +ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { + switch (LLVMGetPointerAddressSpace(t)) { + case AC_ADDR_SPACE_GLOBAL: + return ctx->i64; + case AC_ADDR_SPACE_CONST_32BIT: + case AC_ADDR_SPACE_LDS: + return ctx->i32; + default: + unreachable("unhandled address space"); + } + } + return to_integer_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); + } + return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); +} + +LLVMValueRef +ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) + return v; + return ac_to_integer(ctx, v); +} + +static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->i16 || t == ctx->f16) + return ctx->f16; + else if (t == ctx->i32 || t == ctx->f32) + return ctx->f32; + else if (t == ctx->i64 || t == ctx->f64) + return ctx->f64; + else + unreachable("Unhandled float size"); +} + +LLVMTypeRef +ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_float_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + return to_float_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); +} + + +LLVMValueRef +ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, unsigned attrib_mask) +{ + LLVMValueRef function, call; + bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); + + function = LLVMGetNamedFunction(ctx->module, name); + if (!function) { + LLVMTypeRef param_types[32], function_type; + unsigned i; + + assert(param_count <= 32); + + for (i = 0; i < param_count; ++i) { + assert(params[i]); + param_types[i] = LLVMTypeOf(params[i]); + } + function_type = + LLVMFunctionType(return_type, param_types, param_count, 0); + function = LLVMAddFunction(ctx->module, name, function_type); + + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); + + if (!set_callsite_attrs) + ac_add_func_attributes(ctx->context, function, attrib_mask); + } + + call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); + if (set_callsite_attrs) + ac_add_func_attributes(ctx->context, call, attrib_mask); + return call; +} + +/** + * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with + * intrinsic names). + */ +void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) +{ + LLVMTypeRef elem_type = type; + + assert(bufsize >= 8); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + int ret = snprintf(buf, bufsize, "v%u", + LLVMGetVectorSize(type)); + if (ret < 0) { + char *type_name = LLVMPrintTypeToString(type); + fprintf(stderr, "Error building type name for: %s\n", + type_name); + LLVMDisposeMessage(type_name); + return; + } + elem_type = LLVMGetElementType(type); + buf += ret; + bufsize -= ret; + } + switch (LLVMGetTypeKind(elem_type)) { + default: break; + case LLVMIntegerTypeKind: + snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); + break; + case LLVMHalfTypeKind: + snprintf(buf, bufsize, "f16"); + break; + case LLVMFloatTypeKind: + snprintf(buf, bufsize, "f32"); + break; + case LLVMDoubleTypeKind: + snprintf(buf, bufsize, "f64"); + break; + } +} + +/** + * Helper function that builds an LLVM IR PHI node and immediately adds + * incoming edges. + */ +LLVMValueRef +ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, + unsigned count_incoming, LLVMValueRef *values, + LLVMBasicBlockRef *blocks) +{ + LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); + LLVMAddIncoming(phi, values, blocks, count_incoming); + return phi; +} + +void ac_build_s_barrier(struct ac_llvm_context *ctx) +{ + ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, + 0, AC_FUNC_ATTR_CONVERGENT); +} + +/* Prevent optimizations (at least of memory accesses) across the current + * point in the program by emitting empty inline assembly that is marked as + * having side effects. + * + * Optionally, a value can be passed through the inline assembly to prevent + * LLVM from hoisting calls to ReadNone functions. + */ +void +ac_build_optimization_barrier(struct ac_llvm_context *ctx, + LLVMValueRef *pvgpr) +{ + static int counter = 0; + + LLVMBuilderRef builder = ctx->builder; + char code[16]; + + snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); + + if (!pvgpr) { + LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); + LLVMBuildCall(builder, inlineasm, NULL, 0, ""); + } else { + LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); + LLVMTypeRef type = LLVMTypeOf(*pvgpr); + unsigned bitsize = ac_get_elem_bits(ctx, type); + LLVMValueRef vgpr = *pvgpr; + LLVMTypeRef vgpr_type; + unsigned vgpr_size; + LLVMValueRef vgpr0; + + if (bitsize < 32) + vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, ""); + + vgpr_type = LLVMTypeOf(vgpr); + vgpr_size = ac_get_type_size(vgpr_type); + + assert(vgpr_size % 4 == 0); + + vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); + vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); + vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); + vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); + vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); + + if (bitsize < 32) + vgpr = LLVMBuildTrunc(builder, vgpr, type, ""); + + *pvgpr = vgpr; + } +} + +LLVMValueRef +ac_build_shader_clock(struct ac_llvm_context *ctx) +{ + const char *intr = LLVM_VERSION_MAJOR >= 9 && ctx->chip_class >= GFX8 ? + "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter"; + LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0); + return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); +} + +LLVMValueRef +ac_build_ballot(struct ac_llvm_context *ctx, + LLVMValueRef value) +{ + const char *name; + + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i32"; + else + name = "llvm.amdgcn.icmp.i32.i32"; + } else { + name = "llvm.amdgcn.icmp.i32"; + } + LLVMValueRef args[3] = { + value, + ctx->i32_0, + LLVMConstInt(ctx->i32, LLVMIntNE, 0) + }; + + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. + */ + ac_build_optimization_barrier(ctx, &args[0]); + + args[0] = ac_to_integer(ctx, args[0]); + + return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, + LLVMValueRef value) +{ + const char *name; + + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i1"; + else + name = "llvm.amdgcn.icmp.i32.i1"; + } else { + name = "llvm.amdgcn.icmp.i1"; + } + LLVMValueRef args[3] = { + value, + ctx->i1false, + LLVMConstInt(ctx->i32, LLVMIntNE, 0), + }; + + return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); +} + +LLVMValueRef +ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, + LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); +} + +LLVMValueRef +ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + + LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, active_set, ""); + LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, + LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); + return LLVMBuildOr(ctx->builder, all, none, ""); +} + +LLVMValueRef +ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned component) +{ + LLVMValueRef vec = NULL; + + if (value_count == 1) { + return values[component]; + } else if (!value_count) + unreachable("value_count is 0"); + + for (unsigned i = component; i < value_count + component; i++) { + LLVMValueRef value = values[i]; + + if (i == component) + vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); + LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false); + vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, ""); + } + return vec; +} + +LLVMValueRef +ac_build_gather_values_extended(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + bool load, + bool always_vector) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef vec = NULL; + unsigned i; + + if (value_count == 1 && !always_vector) { + if (load) + return LLVMBuildLoad(builder, values[0], ""); + return values[0]; + } else if (!value_count) + unreachable("value_count is 0"); + + for (i = 0; i < value_count; i++) { + LLVMValueRef value = values[i * value_stride]; + if (load) + value = LLVMBuildLoad(builder, value, ""); + + if (!i) + vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count)); + LLVMValueRef index = LLVMConstInt(ctx->i32, i, false); + vec = LLVMBuildInsertElement(builder, vec, value, index, ""); + } + return vec; +} + +LLVMValueRef +ac_build_gather_values(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count) +{ + return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); +} + +/* Expand a scalar or vector to by filling the remaining + * channels with undef. Extract at most src_channels components from the input. + */ +static LLVMValueRef +ac_build_expand(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned src_channels, + unsigned dst_channels) +{ + LLVMTypeRef elemtype; + LLVMValueRef chan[dst_channels]; + + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { + unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); + + if (src_channels == dst_channels && vec_size == dst_channels) + return value; + + src_channels = MIN2(src_channels, vec_size); + + for (unsigned i = 0; i < src_channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i); + + elemtype = LLVMGetElementType(LLVMTypeOf(value)); + } else { + if (src_channels) { + assert(src_channels == 1); + chan[0] = value; + } + elemtype = LLVMTypeOf(value); + } + + for (unsigned i = src_channels; i < dst_channels; i++) + chan[i] = LLVMGetUndef(elemtype); + + return ac_build_gather_values(ctx, chan, dst_channels); +} + +/* Extract components [start, start + channels) from a vector. + */ +LLVMValueRef +ac_extract_components(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned start, + unsigned channels) +{ + LLVMValueRef chan[channels]; + + for (unsigned i = 0; i < channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i + start); + + return ac_build_gather_values(ctx, chan, channels); +} + +/* Expand a scalar or vector to <4 x type> by filling the remaining channels + * with undef. Extract at most num_channels components from the input. + */ +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels) +{ + return ac_build_expand(ctx, value, num_channels, 4); +} + +LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); + const char *name; + + if (type_size == 2) + name = "llvm.rint.f16"; + else if (type_size == 4) + name = "llvm.rint.f32"; + else + name = "llvm.rint.f64"; + + return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fdiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef den) +{ + unsigned type_size = ac_get_type_size(LLVMTypeOf(den)); + const char *name; + + if (type_size == 2) + name = "llvm.amdgcn.rcp.f16"; + else if (type_size == 4) + name = "llvm.amdgcn.rcp.f32"; + else + name = "llvm.amdgcn.rcp.f64"; + + LLVMValueRef rcp = ac_build_intrinsic(ctx, name, LLVMTypeOf(den), + &den, 1, AC_FUNC_ATTR_READNONE); + + return LLVMBuildFMul(ctx->builder, num, rcp, ""); +} + +/* See fast_idiv_by_const.h. */ +/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */ +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildAdd(builder, num, + LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* See fast_idiv_by_const.h. */ +/* If num != UINT_MAX, this more efficient version can be used. */ +/* Set: increment = util_fast_udiv_info::increment; */ +LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildNUWAdd(builder, num, increment, ""); + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* See fast_idiv_by_const.h. */ +/* Both operands must fit in 31 bits and the divisor must not be 1. */ +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef post_shift) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildMul(builder, + LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); +} + +/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 + * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is + * already multiplied by two. id is the cube face number. + */ +struct cube_selection_coords { + LLVMValueRef stc[2]; + LLVMValueRef ma; + LLVMValueRef id; +}; + +static void +build_cube_intrinsic(struct ac_llvm_context *ctx, + LLVMValueRef in[3], + struct cube_selection_coords *out) +{ + LLVMTypeRef f32 = ctx->f32; + + out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", + f32, in, 3, AC_FUNC_ATTR_READNONE); +} + +/** + * Build a manual selection sequence for cube face sc/tc coordinates and + * major axis vector (multiplied by 2 for consistency) for the given + * vec3 \p coords, for the face implied by \p selcoords. + * + * For the major axis, we always adjust the sign to be in the direction of + * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards + * the selcoords major axis. + */ +static void build_cube_select(struct ac_llvm_context *ctx, + const struct cube_selection_coords *selcoords, + const LLVMValueRef *coords, + LLVMValueRef *out_st, + LLVMValueRef *out_ma) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef f32 = LLVMTypeOf(coords[0]); + LLVMValueRef is_ma_positive; + LLVMValueRef sgn_ma; + LLVMValueRef is_ma_z, is_not_ma_z; + LLVMValueRef is_ma_y; + LLVMValueRef is_ma_x; + LLVMValueRef sgn; + LLVMValueRef tmp; + + is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, + selcoords->ma, LLVMConstReal(f32, 0.0), ""); + sgn_ma = LLVMBuildSelect(builder, is_ma_positive, + LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), ""); + + is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), ""); + is_not_ma_z = LLVMBuildNot(builder, is_ma_z, ""); + is_ma_y = LLVMBuildAnd(builder, is_not_ma_z, + LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), ""); + is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), ""); + + /* Select sc */ + tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], ""); + sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0), + LLVMBuildSelect(builder, is_ma_z, sgn_ma, + LLVMBuildFNeg(builder, sgn_ma, ""), ""), ""); + out_st[0] = LLVMBuildFMul(builder, tmp, sgn, ""); + + /* Select tc */ + tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], ""); + sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, + LLVMConstReal(f32, -1.0), ""); + out_st[1] = LLVMBuildFMul(builder, tmp, sgn, ""); + + /* Select ma */ + tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], + LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), ""); + tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", + ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE); + *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), ""); +} + +void +ac_prepare_cube_coords(struct ac_llvm_context *ctx, + bool is_deriv, bool is_array, bool is_lod, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg) +{ + + LLVMBuilderRef builder = ctx->builder; + struct cube_selection_coords selcoords; + LLVMValueRef coords[3]; + LLVMValueRef invma; + + if (is_array && !is_lod) { + LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]); + + /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says: + * + * "For Array forms, the array layer used will be + * + * max(0, min(d−1, floor(layer+0.5))) + * + * where d is the depth of the texture array and layer + * comes from the component indicated in the tables below. + * Workaroudn for an issue where the layer is taken from a + * helper invocation which happens to fall on a different + * layer due to extrapolation." + * + * GFX8 and earlier attempt to implement this in hardware by + * clamping the value of coords[2] = (8 * layer) + face. + * Unfortunately, this means that the we end up with the wrong + * face when clamping occurs. + * + * Clamp the layer earlier to work around the issue. + */ + if (ctx->chip_class <= GFX8) { + LLVMValueRef ge0; + ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, ""); + tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, ""); + } + + coords_arg[3] = tmp; + } + + build_cube_intrinsic(ctx, coords_arg, &selcoords); + + invma = ac_build_intrinsic(ctx, "llvm.fabs.f32", + ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE); + invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma); + + for (int i = 0; i < 2; ++i) + coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, ""); + + coords[2] = selcoords.id; + + if (is_deriv && derivs_arg) { + LLVMValueRef derivs[4]; + int axis; + + /* Convert cube derivatives to 2D derivatives. */ + for (axis = 0; axis < 2; axis++) { + LLVMValueRef deriv_st[2]; + LLVMValueRef deriv_ma; + + /* Transform the derivative alongside the texture + * coordinate. Mathematically, the correct formula is + * as follows. Assume we're projecting onto the +Z face + * and denote by dx/dh the derivative of the (original) + * X texture coordinate with respect to horizontal + * window coordinates. The projection onto the +Z face + * plane is: + * + * f(x,z) = x/z + * + * Then df/dh = df/dx * dx/dh + df/dz * dz/dh + * = 1/z * dx/dh - x/z * 1/z * dz/dh. + * + * This motivatives the implementation below. + * + * Whether this actually gives the expected results for + * apps that might feed in derivatives obtained via + * finite differences is anyone's guess. The OpenGL spec + * seems awfully quiet about how textureGrad for cube + * maps should be handled. + */ + build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], + deriv_st, &deriv_ma); + + deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, ""); + + for (int i = 0; i < 2; ++i) + derivs[axis * 2 + i] = + LLVMBuildFSub(builder, + LLVMBuildFMul(builder, deriv_st[i], invma, ""), + LLVMBuildFMul(builder, deriv_ma, coords[i], ""), ""); + } + + memcpy(derivs_arg, derivs, sizeof(derivs)); + } + + /* Shift the texture coordinate. This must be applied after the + * derivative calculation. + */ + for (int i = 0; i < 2; ++i) + coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), ""); + + if (is_array) { + /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ + /* coords_arg.w component - array_index for cube arrays */ + coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]); + } + + memcpy(coords_arg, coords, sizeof(coords)); +} + + +LLVMValueRef +ac_build_fs_interp(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[5]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_fs_interp_mov(struct ac_llvm_context *ctx, + LLVMValueRef parameter, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params) +{ + LLVMValueRef args[4]; + + args[0] = parameter; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", + ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); +} + +LLVMValueRef +ac_build_gep0(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + LLVMValueRef indices[2] = { + ctx->i32_0, + index, + }; + return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); +} + +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index) +{ + return LLVMBuildPointerCast(ctx->builder, + LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""), + LLVMTypeOf(ptr), ""); +} + +void +ac_build_indexed_store(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value) +{ + LLVMBuildStore(ctx->builder, value, + ac_build_gep0(ctx, base_ptr, index)); +} + +/** + * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. + * It's equivalent to doing a load from &base_ptr[index]. + * + * \param base_ptr Where the array starts. + * \param index The element index into the array. + * \param uniform Whether the base_ptr and index can be assumed to be + * dynamically uniform (i.e. load to an SGPR) + * \param invariant Whether the load is invariant (no other opcodes affect it) + * \param no_unsigned_wraparound + * For all possible re-associations and re-distributions of an expression + * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs + * without inbounds in base_ptr), this parameter is true if "addr + offset" + * does not result in an unsigned integer wraparound. This is used for + * optimal code generation of 32-bit pointer arithmetic. + * + * For example, a 32-bit immediate offset that causes a 32-bit unsigned + * integer wraparound can't be an imm offset in s_load_dword, because + * the instruction performs "addr + offset" in 64 bits. + * + * Expected usage for bindless textures by chaining GEPs: + * // possible unsigned wraparound, don't use InBounds: + * ptr1 = LLVMBuildGEP(base_ptr, index); + * image = load(ptr1); // becomes "s_load ptr1, 0" + * + * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); + * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds + */ +static LLVMValueRef +ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index, bool uniform, bool invariant, + bool no_unsigned_wraparound) +{ + LLVMValueRef pointer, result; + + if (no_unsigned_wraparound && + LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) + pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, ""); + else + pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); + + if (uniform) + LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); + result = LLVMBuildLoad(ctx->builder, pointer, ""); + if (invariant) + LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); + return result; +} + +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, false, false, false); +} + +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, false, true, false); +} + +/* This assumes that there is no unsigned integer wraparound during the address + * computation, excluding all GEPs within base_ptr. */ +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, true, true, true); +} + +/* See ac_build_load_custom() documentation. */ +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, true, true, false); +} + +static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, + unsigned cache_policy) +{ + return cache_policy | + (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0); +} + +static void +ac_build_buffer_store_common(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned num_channels, + LLVMTypeRef return_channel_type, + unsigned cache_policy, + bool use_format, + bool structurized) +{ + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = data; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", + indexing_kind, type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", + indexing_kind, type_name); + } + + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); +} + +void +ac_build_buffer_store_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy) +{ + ac_build_buffer_store_common(ctx, rsrc, data, vindex, + voffset, NULL, num_channels, + ctx->f32, cache_policy, + true, true); +} + +/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. + * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), + * or v4i32 (num_channels=3,4). + */ +void +ac_build_buffer_store_dword(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy) +{ + /* Split 3 channel stores, because only LLVM 9+ support 3-channel + * intrinsics. */ + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) { + LLVMValueRef v[3], v01; + + for (int i = 0; i < 3; i++) { + v[i] = LLVMBuildExtractElement(ctx->builder, vdata, + LLVMConstInt(ctx->i32, i, 0), ""); + } + v01 = ac_build_gather_values(ctx, v, 2); + + ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, + soffset, inst_offset, cache_policy); + ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, + soffset, inst_offset + 8, + cache_policy); + return; + } + + /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset + * (voffset is swizzled, but soffset isn't swizzled). + * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. + */ + if (!(cache_policy & ac_swizzled)) { + LLVMValueRef offset = soffset; + + if (inst_offset) + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, inst_offset, 0), ""); + + ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), + ctx->i32_0, voffset, offset, + num_channels, ctx->f32, + cache_policy, false, false); + return; + } + + static const unsigned dfmts[] = { + V_008F0C_BUF_DATA_FORMAT_32, + V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32_32 + }; + unsigned dfmt = dfmts[num_channels - 1]; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy); +} + +static LLVMValueRef +ac_build_buffer_load_common(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned num_channels, + LLVMTypeRef channel_type, + unsigned cache_policy, + bool can_speculate, + bool use_format, + bool structurized) +{ + LLVMValueRef args[5]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", + indexing_kind, type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", + indexing_kind, type_name); + } + + return ac_build_intrinsic(ctx, name, type, args, idx, + ac_get_load_intr_attribs(can_speculate)); +} + +LLVMValueRef +ac_build_buffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + int num_channels, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool can_speculate, + bool allow_smem) +{ + LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); + if (voffset) + offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); + if (soffset) + offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); + + if (allow_smem && !(cache_policy & ac_slc) && + (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) { + assert(vindex == NULL); + + LLVMValueRef result[8]; + + for (int i = 0; i < num_channels; i++) { + if (i) { + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, 4, 0), ""); + } + LLVMValueRef args[3] = { + rsrc, + offset, + LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), + }; + result[i] = ac_build_intrinsic(ctx, + "llvm.amdgcn.s.buffer.load.f32", + ctx->f32, args, 3, + AC_FUNC_ATTR_READNONE); + } + if (num_channels == 1) + return result[0]; + + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) + result[num_channels++] = LLVMGetUndef(ctx->f32); + return ac_build_gather_values(ctx, result, num_channels); + } + + return ac_build_buffer_load_common(ctx, rsrc, vindex, + offset, ctx->i32_0, + num_channels, ctx->f32, + cache_policy, + can_speculate, false, false); +} + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, + ctx->i32_0, num_channels, ctx->f32, + cache_policy, can_speculate, + true, true); +} + +static LLVMValueRef +ac_build_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate, + bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", + indexing_kind, type_name); + + return ac_build_intrinsic(ctx, name, type, args, idx, + ac_get_load_intr_attribs(can_speculate)); +} + +LLVMValueRef +ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, + cache_policy, can_speculate, true); +} + +LLVMValueRef +ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, + cache_policy, can_speculate, false); +} + +LLVMValueRef +ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, + voffset, soffset, + 1, ctx->i16, cache_policy, + false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, + immoffset, 1, dfmt, nfmt, cache_policy, + false); + + res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); + } + + return res; +} + +LLVMValueRef +ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, + voffset, soffset, + 1, ctx->i8, cache_policy, + false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, + immoffset, 1, dfmt, nfmt, cache_policy, + false); + + res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); + } + + return res; +} + +/** + * Convert an 11- or 10-bit unsigned floating point number to an f32. + * + * The input exponent is expected to be biased analogous to IEEE-754, i.e. by + * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs). + */ +static LLVMValueRef +ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits) +{ + assert(LLVMTypeOf(src) == ctx->i32); + + LLVMValueRef tmp; + LLVMValueRef mantissa; + mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), ""); + + /* Converting normal numbers is just a shift + correcting the exponent bias */ + unsigned normal_shift = 23 - mant_bits; + unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); + LLVMValueRef shifted, normal; + + shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), ""); + normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), ""); + + /* Converting nan/inf numbers is the same, but with a different exponent update */ + LLVMValueRef naninf; + naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), ""); + + /* Converting denormals is the complex case: determine the leading zeros of the + * mantissa to obtain the correct shift for the mantissa and exponent correction. + */ + LLVMValueRef denormal; + LLVMValueRef params[2] = { + mantissa, + ctx->i1true, /* result can be undef when arg is 0 */ + }; + LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, + params, 2, AC_FUNC_ATTR_READNONE); + + /* Shift such that the leading 1 ends up as the LSB of the exponent field. */ + tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), ""); + denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, ""); + + unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; + tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, ""); + tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), ""); + denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, ""); + + /* Select the final result. */ + LLVMValueRef result; + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, + LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), ""); + result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, ""); + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, + LLVMConstInt(ctx->i32, 1 << mant_bits, false), ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, ""); + + tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, ""); + + return ac_to_float(ctx, result); +} + +/** + * Generate a fully general open coded buffer format fetch with all required + * fixups suitable for vertex fetch, using non-format buffer loads. + * + * Some combinations of argument values have special interpretations: + * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT + * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format + * + * \param log_size log(size of channel in bytes) + * \param num_channels number of channels (1 to 4) + * \param format AC_FETCH_FORMAT_xxx value + * \param reverse whether XYZ channels are reversed + * \param known_aligned whether the source is known to be aligned to hardware's + * effective element size for loading the given format + * (note: this means dword alignment for 8_8_8_8, 16_16, etc.) + * \param rsrc buffer resource descriptor + * \return the resulting vector of floats or integers bitcast to <4 x i32> + */ +LLVMValueRef +ac_build_opencoded_load_format(struct ac_llvm_context *ctx, + unsigned log_size, + unsigned num_channels, + unsigned format, + bool reverse, + bool known_aligned, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy, + bool can_speculate) +{ + LLVMValueRef tmp; + unsigned load_log_size = log_size; + unsigned load_num_channels = num_channels; + if (log_size == 3) { + load_log_size = 2; + if (format == AC_FETCH_FORMAT_FLOAT) { + load_num_channels = 2 * num_channels; + } else { + load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */ + } + } + + int log_recombine = 0; + if (ctx->chip_class == GFX6 && !known_aligned) { + /* Avoid alignment restrictions by loading one byte at a time. */ + load_num_channels <<= load_log_size; + log_recombine = load_log_size; + load_log_size = 0; + } else if (load_num_channels == 2 || load_num_channels == 4) { + log_recombine = -util_logbase2(load_num_channels); + load_num_channels = 1; + load_log_size += -log_recombine; + } + + assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9); + + LLVMValueRef loads[32]; /* up to 32 bytes */ + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = LLVMBuildAdd(ctx->builder, soffset, + LLVMConstInt(ctx->i32, i << load_log_size, false), ""); + LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 : + load_log_size == 1 ? ctx->i16 : ctx->i32; + unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2); + loads[i] = ac_build_buffer_load_common( + ctx, rsrc, vindex, voffset, tmp, + num_channels, channel_type, cache_policy, + can_speculate, false, true); + if (load_log_size >= 2) + loads[i] = ac_to_integer(ctx, loads[i]); + } + + if (log_recombine > 0) { + /* Recombine bytes if necessary (GFX6 only) */ + LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16; + + for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { + LLVMValueRef accum = NULL; + for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { + tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); + if (i == 0) { + accum = tmp; + } else { + tmp = LLVMBuildShl(ctx->builder, tmp, + LLVMConstInt(dst_type, 8 * i, false), ""); + accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); + } + } + loads[dst] = accum; + } + } else if (log_recombine < 0) { + /* Split vectors of dwords */ + if (load_log_size > 2) { + assert(load_num_channels == 1); + LLVMValueRef loaded = loads[0]; + unsigned log_split = load_log_size - 2; + log_recombine += log_split; + load_num_channels = 1 << log_split; + load_log_size = 2; + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = LLVMConstInt(ctx->i32, i, false); + loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); + } + } + + /* Further split dwords and shorts if required */ + if (log_recombine < 0) { + for (unsigned src = load_num_channels, + dst = load_num_channels << -log_recombine; + src > 0; --src) { + unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); + LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); + LLVMValueRef loaded = loads[src - 1]; + LLVMTypeRef loaded_type = LLVMTypeOf(loaded); + for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { + tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); + tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); + loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); + } + } + } + } + + if (log_size == 3) { + if (format == AC_FETCH_FORMAT_FLOAT) { + for (unsigned i = 0; i < num_channels; ++i) { + tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); + loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); + } + } else if (format == AC_FETCH_FORMAT_FIXED) { + /* 10_11_11_FLOAT */ + LLVMValueRef data = loads[0]; + LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); + LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); + LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); + LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); + + loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); + loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); + loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); + + num_channels = 3; + log_size = 2; + format = AC_FETCH_FORMAT_FLOAT; + } else { + /* 2_10_10_10 data formats */ + LLVMValueRef data = loads[0]; + LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); + LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); + loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); + loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); + loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); + loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); + + num_channels = 4; + } + } + + if (format == AC_FETCH_FORMAT_FLOAT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) { + tmp = ac_to_float(ctx, loads[chan]); + if (log_size == 3) + tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); + else if (log_size == 1) + tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); + loads[chan] = ac_to_integer(ctx, tmp); + } + } + } else if (format == AC_FETCH_FORMAT_UINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else if (format == AC_FETCH_FORMAT_SINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else { + bool unsign = format == AC_FETCH_FORMAT_UNORM || + format == AC_FETCH_FORMAT_USCALED || + format == AC_FETCH_FORMAT_UINT; + + for (unsigned chan = 0; chan < num_channels; ++chan) { + if (unsign) { + tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } else { + tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } + + LLVMValueRef scale = NULL; + if (format == AC_FETCH_FORMAT_FIXED) { + assert(log_size == 2); + scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); + } else if (format == AC_FETCH_FORMAT_UNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); + } else if (format == AC_FETCH_FORMAT_SNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); + } + if (scale) + tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); + + if (format == AC_FETCH_FORMAT_SNORM) { + /* Clamp to [-1, 1] */ + LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); + LLVMValueRef clamp = + LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); + } + + loads[chan] = ac_to_integer(ctx, tmp); + } + } + + while (num_channels < 4) { + if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { + loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; + } else { + loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); + } + num_channels++; + } + + if (reverse) { + tmp = loads[0]; + loads[0] = loads[2]; + loads[2] = tmp; + } + + return ac_build_gather_values(ctx, loads, 4); +} + +static void +ac_build_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, + immoffset, ""); + + LLVMValueRef args[7]; + int idx = 0; + args[idx++] = vdata; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", + indexing_kind, type_name); + + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); +} + +void +ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy, + true); +} + +void +ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, + immoffset, num_channels, dfmt, nfmt, cache_policy, + false); +} + +void +ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, + voffset, soffset, 1, + ctx->i16, cache_policy, + false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + ctx->i32_0, 1, dfmt, nfmt, cache_policy); + } +} + +void +ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, + voffset, soffset, 1, + ctx->i8, cache_policy, + false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, + ctx->i32_0, 1, dfmt, nfmt, cache_policy); + } +} +/** + * Set range metadata on an instruction. This can only be used on load and + * call instructions. If you know an instruction can only produce the values + * 0, 1, 2, you would do set_range_metadata(value, 0, 3); + * \p lo is the minimum value inclusive. + * \p hi is the maximum value exclusive. + */ +static void set_range_metadata(struct ac_llvm_context *ctx, + LLVMValueRef value, unsigned lo, unsigned hi) +{ + LLVMValueRef range_md, md_args[2]; + LLVMTypeRef type = LLVMTypeOf(value); + LLVMContextRef context = LLVMGetTypeContext(type); + + md_args[0] = LLVMConstInt(type, lo, false); + md_args[1] = LLVMConstInt(type, hi, false); + range_md = LLVMMDNodeInContext(context, md_args, 2); + LLVMSetMetadata(value, ctx->range_md_kind, range_md); +} + +LLVMValueRef +ac_get_thread_id(struct ac_llvm_context *ctx) +{ + LLVMValueRef tid; + + LLVMValueRef tid_args[2]; + tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); + tid_args[1] = ctx->i32_0; + tid_args[1] = ac_build_intrinsic(ctx, + "llvm.amdgcn.mbcnt.lo", ctx->i32, + tid_args, 2, AC_FUNC_ATTR_READNONE); + + if (ctx->wave_size == 32) { + tid = tid_args[1]; + } else { + tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", + ctx->i32, tid_args, + 2, AC_FUNC_ATTR_READNONE); + } + set_range_metadata(ctx, tid, 0, ctx->wave_size); + return tid; +} + +/* + * AMD GCN implements derivatives using the local data store (LDS) + * All writes to the LDS happen in all executing threads at + * the same time. TID is the Thread ID for the current + * thread and is a value between 0 and 63, representing + * the thread's position in the wavefront. + * + * For the pixel shader threads are grouped into quads of four pixels. + * The TIDs of the pixels of a quad are: + * + * +------+------+ + * |4n + 0|4n + 1| + * +------+------+ + * |4n + 2|4n + 3| + * +------+------+ + * + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of + * the current pixel's column, and masking with 0xfffffffe yields the TID + * of the left pixel of the current pixel's row. + * + * Adding 1 yields the TID of the pixel to the right of the left pixel, and + * adding 2 yields the TID of the pixel below the top pixel. + */ +LLVMValueRef +ac_build_ddxy(struct ac_llvm_context *ctx, + uint32_t mask, + int idx, + LLVMValueRef val) +{ + unsigned tl_lanes[4], trbl_lanes[4]; + char name[32], type[8]; + LLVMValueRef tl, trbl; + LLVMTypeRef result_type; + LLVMValueRef result; + + result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); + + if (result_type == ctx->f16) + val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + + for (unsigned i = 0; i < 4; ++i) { + tl_lanes[i] = i & mask; + trbl_lanes[i] = (i & mask) + idx; + } + + tl = ac_build_quad_swizzle(ctx, val, + tl_lanes[0], tl_lanes[1], + tl_lanes[2], tl_lanes[3]); + trbl = ac_build_quad_swizzle(ctx, val, + trbl_lanes[0], trbl_lanes[1], + trbl_lanes[2], trbl_lanes[3]); + + if (result_type == ctx->f16) { + tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); + trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); + } + + tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); + result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); + + ac_build_type_name_for_intr(result_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); + + return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); +} + +void +ac_build_sendmsg(struct ac_llvm_context *ctx, + uint32_t msg, + LLVMValueRef wave_id) +{ + LLVMValueRef args[2]; + args[0] = LLVMConstInt(ctx->i32, msg, false); + args[1] = wave_id; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); +} + +LLVMValueRef +ac_build_imsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type) +{ + LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", + dst_type, &arg, 1, + AC_FUNC_ATTR_READNONE); + + /* The HW returns the last bit index from MSB, but NIR/TGSI wants + * the index from LSB. Invert it by doing "31 - msb". */ + msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), + msb, ""); + + LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); + LLVMValueRef cond = LLVMBuildOr(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + arg, ctx->i32_0, ""), + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + arg, all_ones, ""), ""); + + return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); +} + +LLVMValueRef +ac_build_umsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type) +{ + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef highest_bit; + LLVMValueRef zero; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); + switch (bitsize) { + case 64: + intrin_name = "llvm.ctlz.i64"; + type = ctx->i64; + highest_bit = LLVMConstInt(ctx->i64, 63, false); + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.ctlz.i32"; + type = ctx->i32; + highest_bit = LLVMConstInt(ctx->i32, 31, false); + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.ctlz.i16"; + type = ctx->i16; + highest_bit = LLVMConstInt(ctx->i16, 15, false); + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.ctlz.i8"; + type = ctx->i8; + highest_bit = LLVMConstInt(ctx->i8, 7, false); + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + LLVMValueRef params[2] = { + arg, + ctx->i1true, + }; + + LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, + params, 2, + AC_FUNC_ATTR_READNONE); + + /* The HW returns the last bit index from MSB, but TGSI/NIR wants + * the index from LSB. Invert it by doing "31 - msb". */ + msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); + + if (bitsize == 64) { + msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); + } else if (bitsize < 32) { + msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); + } + + /* check for zero */ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), + LLVMConstInt(ctx->i32, -1, true), msb, ""); +} + +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + char name[64]; + snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + char name[64]; + snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + +LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMTypeRef t = LLVMTypeOf(value); + return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), + LLVMConstReal(t, 1.0)); +} + +void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) +{ + LLVMValueRef args[9]; + + args[0] = LLVMConstInt(ctx->i32, a->target, 0); + args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); + + if (a->compr) { + LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context); + LLVMTypeRef v2i16 = LLVMVectorType(i16, 2); + + args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], + v2i16, ""); + args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], + v2i16, ""); + args[4] = LLVMConstInt(ctx->i1, a->done, 0); + args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", + ctx->voidt, args, 6, 0); + } else { + args[2] = a->out[0]; + args[3] = a->out[1]; + args[4] = a->out[2]; + args[5] = a->out[3]; + args[6] = LLVMConstInt(ctx->i1, a->done, 0); + args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", + ctx->voidt, args, 8, 0); + } +} + +void ac_build_export_null(struct ac_llvm_context *ctx) +{ + struct ac_export_args args; + + args.enabled_channels = 0x0; /* enabled channels */ + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + args.target = V_008DFC_SQ_EXP_NULL; + args.compr = 0; /* COMPR flag (0 = 32-bit export) */ + args.out[0] = LLVMGetUndef(ctx->f32); /* R */ + args.out[1] = LLVMGetUndef(ctx->f32); /* G */ + args.out[2] = LLVMGetUndef(ctx->f32); /* B */ + args.out[3] = LLVMGetUndef(ctx->f32); /* A */ + + ac_build_export(ctx, &args); +} + +static unsigned ac_num_coords(enum ac_image_dim dim) +{ + switch (dim) { + case ac_image_1d: + return 1; + case ac_image_2d: + case ac_image_1darray: + return 2; + case ac_image_3d: + case ac_image_cube: + case ac_image_2darray: + case ac_image_2dmsaa: + return 3; + case ac_image_2darraymsaa: + return 4; + default: + unreachable("ac_num_coords: bad dim"); + } +} + +static unsigned ac_num_derivs(enum ac_image_dim dim) +{ + switch (dim) { + case ac_image_1d: + case ac_image_1darray: + return 2; + case ac_image_2d: + case ac_image_2darray: + case ac_image_cube: + return 4; + case ac_image_3d: + return 6; + case ac_image_2dmsaa: + case ac_image_2darraymsaa: + default: + unreachable("derivatives not supported"); + } +} + +static const char *get_atomic_name(enum ac_atomic_op op) +{ + switch (op) { + case ac_atomic_swap: return "swap"; + case ac_atomic_add: return "add"; + case ac_atomic_sub: return "sub"; + case ac_atomic_smin: return "smin"; + case ac_atomic_umin: return "umin"; + case ac_atomic_smax: return "smax"; + case ac_atomic_umax: return "umax"; + case ac_atomic_and: return "and"; + case ac_atomic_or: return "or"; + case ac_atomic_xor: return "xor"; + case ac_atomic_inc_wrap: return "inc"; + case ac_atomic_dec_wrap: return "dec"; + } + unreachable("bad atomic op"); +} + +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, + struct ac_image_args *a) +{ + const char *overload[3] = { "", "", "" }; + unsigned num_overloads = 0; + LLVMValueRef args[18]; + unsigned num_args = 0; + enum ac_image_dim dim = a->dim; + + assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || + !a->level_zero); + assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && + a->opcode != ac_image_store_mip) || + a->lod); + assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + (!a->compare && !a->offset)); + assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod) || + !a->bias); + assert((a->bias ? 1 : 0) + + (a->lod ? 1 : 0) + + (a->level_zero ? 1 : 0) + + (a->derivs[0] ? 1 : 0) <= 1); + + if (a->opcode == ac_image_get_lod) { + switch (dim) { + case ac_image_1darray: + dim = ac_image_1d; + break; + case ac_image_2darray: + case ac_image_cube: + dim = ac_image_2d; + break; + default: + break; + } + } + + bool sample = a->opcode == ac_image_sample || + a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod; + bool atomic = a->opcode == ac_image_atomic || + a->opcode == ac_image_atomic_cmpswap; + bool load = a->opcode == ac_image_sample || + a->opcode == ac_image_gather4 || + a->opcode == ac_image_load || + a->opcode == ac_image_load_mip; + LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; + + if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { + args[num_args++] = a->data[0]; + if (a->opcode == ac_image_atomic_cmpswap) + args[num_args++] = a->data[1]; + } + + if (!atomic) + args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false); + + if (a->offset) + args[num_args++] = ac_to_integer(ctx, a->offset); + if (a->bias) { + args[num_args++] = ac_to_float(ctx, a->bias); + overload[num_overloads++] = ".f32"; + } + if (a->compare) + args[num_args++] = ac_to_float(ctx, a->compare); + if (a->derivs[0]) { + unsigned count = ac_num_derivs(dim); + for (unsigned i = 0; i < count; ++i) + args[num_args++] = ac_to_float(ctx, a->derivs[i]); + overload[num_overloads++] = ".f32"; + } + unsigned num_coords = + a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; + for (unsigned i = 0; i < num_coords; ++i) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); + if (a->lod) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); + overload[num_overloads++] = sample ? ".f32" : ".i32"; + + args[num_args++] = a->resource; + if (sample) { + args[num_args++] = a->sampler; + args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); + } + + args[num_args++] = ctx->i32_0; /* texfailctrl */ + args[num_args++] = LLVMConstInt(ctx->i32, + load ? get_load_cache_policy(ctx, a->cache_policy) : + a->cache_policy, false); + + const char *name; + const char *atomic_subop = ""; + switch (a->opcode) { + case ac_image_sample: name = "sample"; break; + case ac_image_gather4: name = "gather4"; break; + case ac_image_load: name = "load"; break; + case ac_image_load_mip: name = "load.mip"; break; + case ac_image_store: name = "store"; break; + case ac_image_store_mip: name = "store.mip"; break; + case ac_image_atomic: + name = "atomic."; + atomic_subop = get_atomic_name(a->atomic); + break; + case ac_image_atomic_cmpswap: + name = "atomic."; + atomic_subop = "cmpswap"; + break; + case ac_image_get_lod: name = "getlod"; break; + case ac_image_get_resinfo: name = "getresinfo"; break; + default: unreachable("invalid image opcode"); + } + + const char *dimname; + switch (dim) { + case ac_image_1d: dimname = "1d"; break; + case ac_image_2d: dimname = "2d"; break; + case ac_image_3d: dimname = "3d"; break; + case ac_image_cube: dimname = "cube"; break; + case ac_image_1darray: dimname = "1darray"; break; + case ac_image_2darray: dimname = "2darray"; break; + case ac_image_2dmsaa: dimname = "2dmsaa"; break; + case ac_image_2darraymsaa: dimname = "2darraymsaa"; break; + default: unreachable("invalid dim"); + } + + bool lod_suffix = + a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); + char intr_name[96]; + snprintf(intr_name, sizeof(intr_name), + "llvm.amdgcn.image.%s%s" /* base name */ + "%s%s%s" /* sample/gather modifiers */ + ".%s.%s%s%s%s", /* dimension and type overloads */ + name, atomic_subop, + a->compare ? ".c" : "", + a->bias ? ".b" : + lod_suffix ? ".l" : + a->derivs[0] ? ".d" : + a->level_zero ? ".lz" : "", + a->offset ? ".o" : "", + dimname, + atomic ? "i32" : "v4f32", + overload[0], overload[1], overload[2]); + + LLVMTypeRef retty; + if (atomic) + retty = ctx->i32; + else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) + retty = ctx->voidt; + else + retty = ctx->v4f32; + + LLVMValueRef result = + ac_build_intrinsic(ctx, intr_name, retty, args, num_args, + a->attributes); + if (!sample && retty == ctx->v4f32) { + result = LLVMBuildBitCast(ctx->builder, result, + ctx->v4i32, ""); + } + return result; +} + +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, + LLVMValueRef rsrc) +{ + LLVMValueRef samples; + + /* Read the samples from the descriptor directly. + * Hardware doesn't have any instruction for this. + */ + samples = LLVMBuildExtractElement(ctx->builder, rsrc, + LLVMConstInt(ctx->i32, 3, 0), ""); + samples = LLVMBuildLShr(ctx->builder, samples, + LLVMConstInt(ctx->i32, 16, 0), ""); + samples = LLVMBuildAnd(ctx->builder, samples, + LLVMConstInt(ctx->i32, 0xf, 0), ""); + samples = LLVMBuildShl(ctx->builder, ctx->i32_1, + samples, ""); + return samples; +} + +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMTypeRef v2f16 = + LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2); + + return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16, + args, 2, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]) +{ + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +/* The 8-bit and 10-bit clamping is for HW workarounds. */ +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi) +{ + assert(bits == 8 || bits == 10 || bits == 16); + + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); + LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); + LLVMValueRef max_alpha = + bits != 10 ? max_rgb : ctx->i32_1; + LLVMValueRef min_alpha = + bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); + + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_imin(ctx, args[i], + alpha ? max_alpha : max_rgb); + args[i] = ac_build_imax(ctx, args[i], + alpha ? min_alpha : min_rgb); + } + } + + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +/* The 8-bit and 10-bit clamping is for HW workarounds. */ +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi) +{ + assert(bits == 8 || bits == 10 || bits == 16); + + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, + bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); + LLVMValueRef max_alpha = + bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); + + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_umin(ctx, args[i], + alpha ? max_alpha : max_rgb); + } + } + + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", + ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, + &i1, 1, AC_FUNC_ATTR_READNONE); +} + +void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) +{ + ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, + &i1, 1, 0); +} + +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, + LLVMValueRef offset, LLVMValueRef width, + bool is_signed) +{ + LLVMValueRef args[] = { + input, + offset, + width, + }; + + return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : + "llvm.amdgcn.ubfe.i32", + ctx->i32, args, 3, AC_FUNC_ATTR_READNONE); + +} + +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2) +{ + return LLVMBuildAdd(ctx->builder, + LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); +} + +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2) +{ + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ + if (ctx->chip_class >= GFX10) { + return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, + (LLVMValueRef []) {s0, s1, s2}, 3, + AC_FUNC_ATTR_READNONE); + } + + return LLVMBuildFAdd(ctx->builder, + LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); +} + +void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) +{ + if (!wait_flags) + return; + + unsigned lgkmcnt = 63; + unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; + unsigned vscnt = 63; + + if (wait_flags & AC_WAIT_LGKM) + lgkmcnt = 0; + if (wait_flags & AC_WAIT_VLOAD) + vmcnt = 0; + + if (wait_flags & AC_WAIT_VSTORE) { + if (ctx->chip_class >= GFX10) + vscnt = 0; + else + vmcnt = 0; + } + + /* There is no intrinsic for vscnt(0), so use a fence. */ + if ((wait_flags & AC_WAIT_LGKM && + wait_flags & AC_WAIT_VLOAD && + wait_flags & AC_WAIT_VSTORE) || + vscnt == 0) { + LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); + return; + } + + unsigned simm16 = (lgkmcnt << 8) | + (7 << 4) | /* expcnt */ + (vmcnt & 0xf) | + ((vmcnt >> 4) << 14); + + LLVMValueRef args[1] = { + LLVMConstInt(ctx->i32, simm16, false), + }; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", + ctx->voidt, args, 1, 0); +} + +LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1, LLVMValueRef src2, + unsigned bitsize) +{ + LLVMValueRef result; + + if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) { + /* Lower 64-bit fmed because LLVM doesn't expose an intrinsic, + * or lower 16-bit fmed because it's only supported on GFX9+. + */ + LLVMValueRef min1, min2, max1; + + min1 = ac_build_fmin(ctx, src0, src1); + max1 = ac_build_fmax(ctx, src0, src1); + min2 = ac_build_fmin(ctx, max1, src2); + + result = ac_build_fmax(ctx, min2, min1); + } else { + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.fmed3.f16"; + type = ctx->f16; + } else { + assert(bitsize == 32); + intr = "llvm.amdgcn.fmed3.f32"; + type = ctx->f32; + } + + LLVMValueRef params[] = { + src0, + src1, + src2, + }; + + result = ac_build_intrinsic(ctx, intr, type, params, 3, + AC_FUNC_ATTR_READNONE); + } + + if (ctx->chip_class < GFX9 && bitsize == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = ac_build_canonicalize(ctx, result, bitsize); + } + + return result; +} + +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.fract.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.fract.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.fract.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize); + LLVMValueRef zero = LLVMConstInt(type, 0, false); + LLVMValueRef one = LLVMConstInt(type, 1, false); + + LLVMValueRef cmp, val; + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), ""); + return val; +} + +LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef cmp, val, zero, one; + LLVMTypeRef type; + + if (bitsize == 16) { + type = ctx->f16; + zero = ctx->f16_0; + one = ctx->f16_1; + } else if (bitsize == 32) { + type = ctx->f32; + zero = ctx->f32_0; + one = ctx->f32_1; + } else { + type = ctx->f64; + zero = ctx->f64_0; + one = ctx->f64_1; + } + + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, one, src0, ""); + cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, ""); + val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), ""); + return val; +} + +LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) +{ + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 128: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 64: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + case 8: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 64: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + case 8: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +#define AC_EXP_TARGET 0 +#define AC_EXP_ENABLED_CHANNELS 1 +#define AC_EXP_OUT0 2 + +enum ac_ir_type { + AC_IR_UNDEF, + AC_IR_CONST, + AC_IR_VALUE, +}; + +struct ac_vs_exp_chan +{ + LLVMValueRef value; + float const_float; + enum ac_ir_type type; +}; + +struct ac_vs_exp_inst { + unsigned offset; + LLVMValueRef inst; + struct ac_vs_exp_chan chan[4]; +}; + +struct ac_vs_exports { + unsigned num; + struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; +}; + +/* Return true if the PARAM export has been eliminated. */ +static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, + uint32_t num_outputs, + struct ac_vs_exp_inst *exp) +{ + unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ + bool is_zero[4] = {}, is_one[4] = {}; + + for (i = 0; i < 4; i++) { + /* It's a constant expression. Undef outputs are eliminated too. */ + if (exp->chan[i].type == AC_IR_UNDEF) { + is_zero[i] = true; + is_one[i] = true; + } else if (exp->chan[i].type == AC_IR_CONST) { + if (exp->chan[i].const_float == 0) + is_zero[i] = true; + else if (exp->chan[i].const_float == 1) + is_one[i] = true; + else + return false; /* other constant */ + } else + return false; + } + + /* Only certain combinations of 0 and 1 can be eliminated. */ + if (is_zero[0] && is_zero[1] && is_zero[2]) + default_val = is_zero[3] ? 0 : 1; + else if (is_one[0] && is_one[1] && is_one[2]) + default_val = is_zero[3] ? 2 : 3; + else + return false; + + /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ + LLVMInstructionEraseFromParent(exp->inst); + + /* Change OFFSET to DEFAULT_VAL. */ + for (i = 0; i < num_outputs; i++) { + if (vs_output_param_offset[i] == exp->offset) { + vs_output_param_offset[i] = + AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val; + break; + } + } + return true; +} + +static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + struct ac_vs_exports *processed, + struct ac_vs_exp_inst *exp) +{ + unsigned p, copy_back_channels = 0; + + /* See if the output is already in the list of processed outputs. + * The LLVMValueRef comparison relies on SSA. + */ + for (p = 0; p < processed->num; p++) { + bool different = false; + + for (unsigned j = 0; j < 4; j++) { + struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j]; + struct ac_vs_exp_chan *c2 = &exp->chan[j]; + + /* Treat undef as a match. */ + if (c2->type == AC_IR_UNDEF) + continue; + + /* If c1 is undef but c2 isn't, we can copy c2 to c1 + * and consider the instruction duplicated. + */ + if (c1->type == AC_IR_UNDEF) { + copy_back_channels |= 1 << j; + continue; + } + + /* Test whether the channels are not equal. */ + if (c1->type != c2->type || + (c1->type == AC_IR_CONST && + c1->const_float != c2->const_float) || + (c1->type == AC_IR_VALUE && + c1->value != c2->value)) { + different = true; + break; + } + } + if (!different) + break; + + copy_back_channels = 0; + } + if (p == processed->num) + return false; + + /* If a match was found, but the matching export has undef where the new + * one has a normal value, copy the normal value to the undef channel. + */ + struct ac_vs_exp_inst *match = &processed->exp[p]; + + /* Get current enabled channels mask. */ + LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS); + unsigned enabled_channels = LLVMConstIntGetZExtValue(arg); + + while (copy_back_channels) { + unsigned chan = u_bit_scan(©_back_channels); + + assert(match->chan[chan].type == AC_IR_UNDEF); + LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, + exp->chan[chan].value); + match->chan[chan] = exp->chan[chan]; + + /* Update number of enabled channels because the original mask + * is not always 0xf. + */ + enabled_channels |= (1 << chan); + LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS, + LLVMConstInt(ctx->i32, enabled_channels, 0)); + } + + /* The PARAM export is duplicated. Kill it. */ + LLVMInstructionEraseFromParent(exp->inst); + + /* Change OFFSET to the matching export. */ + for (unsigned i = 0; i < num_outputs; i++) { + if (vs_output_param_offset[i] == exp->offset) { + vs_output_param_offset[i] = match->offset; + break; + } + } + return true; +} + +void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, + LLVMValueRef main_fn, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + uint32_t skip_output_mask, + uint8_t *num_param_exports) +{ + LLVMBasicBlockRef bb; + bool removed_any = false; + struct ac_vs_exports exports; + + exports.num = 0; + + /* Process all LLVM instructions. */ + bb = LLVMGetFirstBasicBlock(main_fn); + while (bb) { + LLVMValueRef inst = LLVMGetFirstInstruction(bb); + + while (inst) { + LLVMValueRef cur = inst; + inst = LLVMGetNextInstruction(inst); + struct ac_vs_exp_inst exp; + + if (LLVMGetInstructionOpcode(cur) != LLVMCall) + continue; + + LLVMValueRef callee = ac_llvm_get_called_value(cur); + + if (!ac_llvm_is_function(callee)) + continue; + + const char *name = LLVMGetValueName(callee); + unsigned num_args = LLVMCountParams(callee); + + /* Check if this is an export instruction. */ + if ((num_args != 9 && num_args != 8) || + (strcmp(name, "llvm.SI.export") && + strcmp(name, "llvm.amdgcn.exp.f32"))) + continue; + + LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); + unsigned target = LLVMConstIntGetZExtValue(arg); + + if (target < V_008DFC_SQ_EXP_PARAM) + continue; + + target -= V_008DFC_SQ_EXP_PARAM; + + /* Parse the instruction. */ + memset(&exp, 0, sizeof(exp)); + exp.offset = target; + exp.inst = cur; + + for (unsigned i = 0; i < 4; i++) { + LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); + + exp.chan[i].value = v; + + if (LLVMIsUndef(v)) { + exp.chan[i].type = AC_IR_UNDEF; + } else if (LLVMIsAConstantFP(v)) { + LLVMBool loses_info; + exp.chan[i].type = AC_IR_CONST; + exp.chan[i].const_float = + LLVMConstRealGetDouble(v, &loses_info); + } else { + exp.chan[i].type = AC_IR_VALUE; + } + } + + /* Eliminate constant and duplicated PARAM exports. */ + if (!((1u << target) & skip_output_mask) && + (ac_eliminate_const_output(vs_output_param_offset, + num_outputs, &exp) || + ac_eliminate_duplicated_output(ctx, + vs_output_param_offset, + num_outputs, &exports, + &exp))) { + removed_any = true; + } else { + exports.exp[exports.num++] = exp; + } + } + bb = LLVMGetNextBasicBlock(bb); + } + + /* Remove holes in export memory due to removed PARAM exports. + * This is done by renumbering all PARAM exports. + */ + if (removed_any) { + uint8_t old_offset[VARYING_SLOT_MAX]; + unsigned out, i; + + /* Make a copy of the offsets. We need the old version while + * we are modifying some of them. */ + memcpy(old_offset, vs_output_param_offset, + sizeof(old_offset)); + + for (i = 0; i < exports.num; i++) { + unsigned offset = exports.exp[i].offset; + + /* Update vs_output_param_offset. Multiple outputs can + * have the same offset. + */ + for (out = 0; out < num_outputs; out++) { + if (old_offset[out] == offset) + vs_output_param_offset[out] = i; + } + + /* Change the PARAM offset in the instruction. */ + LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, + LLVMConstInt(ctx->i32, + V_008DFC_SQ_EXP_PARAM + i, 0)); + } + *num_param_exports = exports.num; + } +} + +void ac_init_exec_full_mask(struct ac_llvm_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + ac_build_intrinsic(ctx, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); +} + +void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) +{ + unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768; + ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, + LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), + "lds"); +} + +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr) +{ + return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), ""); +} + +void ac_lds_store(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr, + LLVMValueRef value) +{ + value = ac_to_integer(ctx, value); + ac_build_indexed_store(ctx, ctx->lds, + dw_addr, value); +} + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, + LLVMTypeRef dst_type, + LLVMValueRef src0) +{ + unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef zero; + + switch (src0_bitsize) { + case 64: + intrin_name = "llvm.cttz.i64"; + type = ctx->i64; + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.cttz.i32"; + type = ctx->i32; + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.cttz.i16"; + type = ctx->i16; + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.cttz.i8"; + type = ctx->i8; + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + } + + LLVMValueRef params[2] = { + src0, + + /* The value of 1 means that ffs(x=0) = undef, so LLVM won't + * add special code to check for x=0. The reason is that + * the LLVM behavior for x=0 is different from what we + * need here. However, LLVM also assumes that ffs(x) is + * in [0, 31], but GLSL expects that ffs(0) = -1, so + * a conditional assignment to handle 0 is still required. + * + * The hardware already implements the correct behavior. + */ + ctx->i1true, + }; + + LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, + params, 2, + AC_FUNC_ATTR_READNONE); + + if (src0_bitsize == 64) { + lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); + } else if (src0_bitsize < 32) { + lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); + } + + /* TODO: We need an intrinsic to skip this conditional. */ + /* Check for zero: */ + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, + LLVMIntEQ, src0, + zero, ""), + LLVMConstInt(ctx->i32, -1, 0), lsb, ""); +} + +LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) +{ + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); +} + +LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) +{ + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); +} + +static struct ac_llvm_flow * +get_current_flow(struct ac_llvm_context *ctx) +{ + if (ctx->flow->depth > 0) + return &ctx->flow->stack[ctx->flow->depth - 1]; + return NULL; +} + +static struct ac_llvm_flow * +get_innermost_loop(struct ac_llvm_context *ctx) +{ + for (unsigned i = ctx->flow->depth; i > 0; --i) { + if (ctx->flow->stack[i - 1].loop_entry_block) + return &ctx->flow->stack[i - 1]; + } + return NULL; +} + +static struct ac_llvm_flow * +push_flow(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow; + + if (ctx->flow->depth >= ctx->flow->depth_max) { + unsigned new_max = MAX2(ctx->flow->depth << 1, + AC_LLVM_INITIAL_CF_DEPTH); + + ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); + ctx->flow->depth_max = new_max; + } + + flow = &ctx->flow->stack[ctx->flow->depth]; + ctx->flow->depth++; + + flow->next_block = NULL; + flow->loop_entry_block = NULL; + return flow; +} + +static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, + int label_id) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%s%d", base, label_id); + LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); +} + +/* Append a basic block at the level of the parent flow. + */ +static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, + const char *name) +{ + assert(ctx->flow->depth >= 1); + + if (ctx->flow->depth >= 2) { + struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; + + return LLVMInsertBasicBlockInContext(ctx->context, + flow->next_block, name); + } + + LLVMValueRef main_fn = + LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); + return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); +} + +/* Emit a branch to the given default target for the current block if + * applicable -- that is, if the current block does not already contain a + * branch from a break or continue. + */ +static void emit_default_branch(LLVMBuilderRef builder, + LLVMBasicBlockRef target) +{ + if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) + LLVMBuildBr(builder, target); +} + +void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *flow = push_flow(ctx); + flow->loop_entry_block = append_basic_block(ctx, "LOOP"); + flow->next_block = append_basic_block(ctx, "ENDLOOP"); + set_basicblock_name(flow->loop_entry_block, "loop", label_id); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); + LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); +} + +void ac_build_break(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->next_block); +} + +void ac_build_continue(struct ac_llvm_context *ctx) +{ + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); +} + +void ac_build_else(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_branch = get_current_flow(ctx); + LLVMBasicBlockRef endif_block; + + assert(!current_branch->loop_entry_block); + + endif_block = append_basic_block(ctx, "ENDIF"); + emit_default_branch(ctx->builder, endif_block); + + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + set_basicblock_name(current_branch->next_block, "else", label_id); + + current_branch->next_block = endif_block; +} + +void ac_build_endif(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_branch = get_current_flow(ctx); + + assert(!current_branch->loop_entry_block); + + emit_default_branch(ctx->builder, current_branch->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + set_basicblock_name(current_branch->next_block, "endif", label_id); + + ctx->flow->depth--; +} + +void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) +{ + struct ac_llvm_flow *current_loop = get_current_flow(ctx); + + assert(current_loop->loop_entry_block); + + emit_default_branch(ctx->builder, current_loop->loop_entry_block); + + LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); + set_basicblock_name(current_loop->next_block, "endloop", label_id); + ctx->flow->depth--; +} + +void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) +{ + struct ac_llvm_flow *flow = push_flow(ctx); + LLVMBasicBlockRef if_block; + + if_block = append_basic_block(ctx, "IF"); + flow->next_block = append_basic_block(ctx, "ELSE"); + set_basicblock_name(if_block, "if", label_id); + LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, if_block); +} + +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, + int label_id) +{ + LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, + value, ctx->f32_0, ""); + ac_build_ifcc(ctx, cond, label_id); +} + +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, + int label_id) +{ + LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, + ac_to_integer(ctx, value), + ctx->i32_0, ""); + ac_build_ifcc(ctx, cond, label_id); +} + +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name) +{ + LLVMBuilderRef builder = ac->builder; + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); + LLVMValueRef function = LLVMGetBasicBlockParent(current_block); + LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); + LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); + LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); + LLVMValueRef res; + + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } + + res = LLVMBuildAlloca(first_builder, type, name); + LLVMDisposeBuilder(first_builder); + return res; +} + +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, + LLVMTypeRef type, const char *name) +{ + LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); + LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); + return ptr; +} + +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMTypeRef type) +{ + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + return LLVMBuildBitCast(ctx->builder, ptr, + LLVMPointerType(type, addr_space), ""); +} + +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned count) +{ + unsigned num_components = ac_get_llvm_num_components(value); + if (count == num_components) + return value; + + LLVMValueRef masks[MAX2(count, 2)]; + masks[0] = ctx->i32_0; + masks[1] = ctx->i32_1; + for (unsigned i = 2; i < count; i++) + masks[i] = LLVMConstInt(ctx->i32, i, false); + + if (count == 1) + return LLVMBuildExtractElement(ctx->builder, value, masks[0], + ""); + + LLVMValueRef swizzle = LLVMConstVector(masks, count); + return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); +} + +LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, + unsigned rshift, unsigned bitwidth) +{ + LLVMValueRef value = param; + if (rshift) + value = LLVMBuildLShr(ctx->builder, value, + LLVMConstInt(ctx->i32, rshift, false), ""); + + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->builder, value, + LLVMConstInt(ctx->i32, mask, false), ""); + } + return value; +} + +/* Adjust the sample index according to FMASK. + * + * For uncompressed MSAA surfaces, FMASK should return 0x76543210, + * which is the identity mapping. Each nibble says which physical sample + * should be fetched to get that sample. + * + * For example, 0x11111100 means there are only 2 samples stored and + * the second sample covers 3/4 of the pixel. When reading samples 0 + * and 1, return physical sample 0 (determined by the first two 0s + * in FMASK), otherwise return physical sample 1. + * + * The sample index should be adjusted as follows: + * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; + */ +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, + LLVMValueRef *addr, bool is_array_tex) +{ + struct ac_image_args fmask_load = {}; + fmask_load.opcode = ac_image_load; + fmask_load.resource = fmask; + fmask_load.dmask = 0xf; + fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; + fmask_load.attributes = AC_FUNC_ATTR_READNONE; + + fmask_load.coords[0] = addr[0]; + fmask_load.coords[1] = addr[1]; + if (is_array_tex) + fmask_load.coords[2] = addr[2]; + + LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); + fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, + ac->i32_0, ""); + + /* Apply the formula. */ + unsigned sample_chan = is_array_tex ? 3 : 2; + LLVMValueRef final_sample; + final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], + LLVMConstInt(ac->i32, 4, 0), ""); + final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); + /* Mask the sample index by 0x7, because 0x8 means an unknown value + * with EQAA, so those will map to 0. */ + final_sample = LLVMBuildAnd(ac->builder, final_sample, + LLVMConstInt(ac->i32, 0x7, 0), ""); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid). + */ + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); + tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); + tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); + + /* Replace the MSAA sample index. */ + addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, + addr[sample_chan], ""); +} + +static LLVMValueRef +_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane, bool with_opt_barrier) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + if (with_opt_barrier) + ac_build_optimization_barrier(ctx, &src); + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + if (lane) + lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, ""); + + result = ac_build_intrinsic(ctx, + lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", + ctx->i32, (LLVMValueRef []) { src, lane }, + lane == NULL ? 1 : 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +static LLVMValueRef +ac_build_readlane_common(struct ac_llvm_context *ctx, + LLVMValueRef src, LLVMValueRef lane, + bool with_opt_barrier) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + LLVMValueRef ret_comp; + + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, 0), ""); + + ret_comp = _ac_build_readlane(ctx, src, lane, + with_opt_barrier); + + ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, + LLVMConstInt(ctx->i32, i, 0), ""); + } + } else { + ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier); + } + + if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) + return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +/** + * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. + * + * The optimization barrier is not needed if the value is the same in all lanes + * or if this is called in the outermost block. + * + * @param ctx + * @param src + * @param lane - id of the lane or NULL for the first active lane + * @return value of the lane + */ +LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, + LLVMValueRef src, LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, false); +} + + +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, true); +} + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, + (LLVMValueRef []) {value, lane, src}, 3, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) +{ + if (ctx->wave_size == 32) { + return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + } + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, + LLVMVectorType(ctx->i32, 2), + ""); + LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_0, ""); + LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_1, ""); + LLVMValueRef val = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask_lo, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, + (LLVMValueRef []) { mask_hi, val }, + 2, AC_FUNC_ATTR_READNONE); + return val; +} + +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +static inline enum dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); +} + +static inline enum dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sl | amount; +} + +static inline enum dpp_ctrl +dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sr | amount; +} + +static LLVMValueRef +_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef res; + + old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + res = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, + (LLVMValueRef[]) { + old, src, + LLVMConstInt(ctx->i32, dpp_ctrl, 0), + LLVMConstInt(ctx->i32, row_mask, 0), + LLVMConstInt(ctx->i32, bank_mask, 0), + LLVMConstInt(ctx->i1, bound_ctrl, 0) }, + 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, res, type, ""); +} + +static LLVMValueRef +ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + old = ac_to_integer(ctx, old); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + LLVMValueRef old_vector = + LLVMBuildBitCast(ctx->builder, old, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + old = LLVMBuildExtractElement(ctx->builder, old_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, + dpp_ctrl, + row_mask, + bank_mask, + bound_ctrl); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } else { + ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, + bank_mask, bound_ctrl); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, + bool exchange_rows, bool bound_ctrl) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + LLVMValueRef args[6] = { + src, + src, + LLVMConstInt(ctx->i32, sel, false), + LLVMConstInt(ctx->i32, sel >> 32, false), + ctx->i1true, /* fi */ + bound_ctrl ? ctx->i1true : ctx->i1false, + }; + + result = ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" + : "llvm.amdgcn.permlane16", + ctx->i32, args, 6, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +static LLVMValueRef +ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, + bool exchange_rows, bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = + _ac_build_permlane16(ctx, src, sel, + exchange_rows, + bound_ctrl); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } else { + ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, + bound_ctrl); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static inline unsigned +ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) +{ + assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); + return and_mask | (or_mask << 5) | (xor_mask << 10); +} + +static LLVMValueRef +_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + LLVMValueRef ret; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, + (LLVMValueRef []) { + src, LLVMConstInt(ctx->i32, mask, 0) }, + 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, ret, src_type, ""); +} + +LLVMValueRef +ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, + mask); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } else { + ret = _ac_build_ds_swizzle(ctx, src, mask); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); + char name[32], type[8]; + LLVMValueRef ret; + + src = ac_to_integer(ctx, src); + + if (bitsize < 32) + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); + ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), + (LLVMValueRef []) { src }, 1, + AC_FUNC_ATTR_READNONE); + + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, + ac_to_integer_type(ctx, src_type), ""); + + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef inactive) +{ + char name[33], type[8]; + LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); + src = ac_to_integer(ctx, src); + inactive = ac_to_integer(ctx, inactive); + + if (bitsize < 32) { + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, ""); + } + + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); + LLVMValueRef ret = + ac_build_intrinsic(ctx, name, + LLVMTypeOf(src), (LLVMValueRef []) { + src, inactive }, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, src_type, ""); + + return ret; +} + +static LLVMValueRef +get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) +{ + if (type_size == 1) { + switch (op) { + case nir_op_iadd: return ctx->i8_0; + case nir_op_imul: return ctx->i8_1; + case nir_op_imin: return LLVMConstInt(ctx->i8, INT8_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i8, UINT8_MAX, 0); + case nir_op_imax: return LLVMConstInt(ctx->i8, INT8_MIN, 0); + case nir_op_umax: return ctx->i8_0; + case nir_op_iand: return LLVMConstInt(ctx->i8, -1, 0); + case nir_op_ior: return ctx->i8_0; + case nir_op_ixor: return ctx->i8_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 2) { + switch (op) { + case nir_op_iadd: return ctx->i16_0; + case nir_op_fadd: return ctx->f16_0; + case nir_op_imul: return ctx->i16_1; + case nir_op_fmul: return ctx->f16_1; + case nir_op_imin: return LLVMConstInt(ctx->i16, INT16_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i16, UINT16_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f16, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i16, INT16_MIN, 0); + case nir_op_umax: return ctx->i16_0; + case nir_op_fmax: return LLVMConstReal(ctx->f16, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i16, -1, 0); + case nir_op_ior: return ctx->i16_0; + case nir_op_ixor: return ctx->i16_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 4) { + switch (op) { + case nir_op_iadd: return ctx->i32_0; + case nir_op_fadd: return ctx->f32_0; + case nir_op_imul: return ctx->i32_1; + case nir_op_fmul: return ctx->f32_1; + case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0); + case nir_op_umax: return ctx->i32_0; + case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0); + case nir_op_ior: return ctx->i32_0; + case nir_op_ixor: return ctx->i32_0; + default: + unreachable("bad reduction intrinsic"); + } + } else { /* type_size == 64bit */ + switch (op) { + case nir_op_iadd: return ctx->i64_0; + case nir_op_fadd: return ctx->f64_0; + case nir_op_imul: return ctx->i64_1; + case nir_op_fmul: return ctx->f64_1; + case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0); + case nir_op_umax: return ctx->i64_0; + case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0); + case nir_op_ior: return ctx->i64_0; + case nir_op_ixor: return ctx->i64_0; + default: + unreachable("bad reduction intrinsic"); + } + } +} + +static LLVMValueRef +ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) +{ + bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; + bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; + switch (op) { + case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); + case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); + case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, ""); + case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); + case nir_op_imin: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umin: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmin: return ac_build_intrinsic(ctx, + _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, + (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); + case nir_op_imax: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umax: return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmax: return ac_build_intrinsic(ctx, + _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, + (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); + case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); + case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); + case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); + default: + unreachable("bad reduction intrinsic"); + } +} + +/** + * \param src The value to shift. + * \param identity The value to use the first lane. + * \param maxprefix specifies that the result only needs to be correct for a + * prefix of this many threads + * \return src, shifted 1 lane up, and identity shifted into lane 0. + */ +static LLVMValueRef +ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef identity, unsigned maxprefix) +{ + if (ctx->chip_class >= GFX10) { + /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + + tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + + tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); + + if (maxprefix > 32) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp2 = LLVMBuildSelect(ctx->builder, active, + ac_build_readlane(ctx, src, + LLVMConstInt(ctx->i32, 31, false)), + tmp2, ""); + + active = LLVMBuildOr(ctx->builder, active, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 0x1f, false), ""), + LLVMConstInt(ctx->i32, 0x10, false), ""), ""); + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } else if (maxprefix > 16) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 16, false), ""); + + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } + } else if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); + } + + /* wavefront shift_right by 1 on SI/CI */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""), + LLVMConstInt(ctx->i32, 0x4, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""), + LLVMConstInt(ctx->i32, 0x8, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""), + LLVMConstInt(ctx->i32, 0x10, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), ""); + return LLVMBuildSelect(ctx->builder, active, identity, tmp1, ""); +} + +/** + * \param maxprefix specifies that the result only needs to be correct for a + * prefix of this many threads + */ +static LLVMValueRef +ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, + unsigned maxprefix, bool inclusive) +{ + LLVMValueRef result, tmp; + + if (!inclusive) + src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix); + + result = src; + + if (ctx->chip_class <= GFX7) { + assert(maxprefix == 64); + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef active; + tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; + } + + if (maxprefix <= 1) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 2) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 3) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 4) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 8) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 16) + return result; + + if (ctx->chip_class >= GFX10) { + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef active; + + tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); + + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 16, false), ""), + ctx->i32_0, ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + + if (maxprefix <= 32) + return result; + + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + + active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + return result; + } + + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; +} + +LLVMValueRef +ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + result = LLVMBuildAdd(builder, result, src, ""); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = + get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = + get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef +ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) +{ + if (cluster_size == 1) return src; + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result, swap; + LLVMValueRef identity = get_reduction_identity(ctx, op, + ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, + ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 2) return ac_build_wwm(ctx, result); + + swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 4) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 8) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 16) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX10) + swap = ac_build_permlane16(ctx, result, 0, true, false); + else if (ctx->chip_class >= GFX8 && cluster_size != 32) + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 32) return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) { + if (ctx->wave_size == 64) { + if (ctx->chip_class >= GFX10) + swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + else + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, swap, op); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + } + + return ac_build_wwm(ctx, result); + } else { + swap = ac_build_readlane(ctx, result, ctx->i32_0); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); + result = ac_build_alu_op(ctx, result, swap, op); + return ac_build_wwm(ctx, result); + } +} + +/** + * "Top half" of a scan that reduces per-wave values across an entire + * workgroup. + * + * The source value must be present in the highest lane of the wave, and the + * highest lane must be live. + */ +void +ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + if (ws->maxwaves <= 1) + return; + + const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); + ac_build_ifcc(ctx, tmp, 1000); + LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); + ac_build_endif(ctx, 1000); +} + +/** + * "Bottom half" of a scan that reduces per-wave values across an entire + * workgroup. + * + * The caller must place a barrier between the top and bottom halves. + */ +void +ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + const LLVMTypeRef type = LLVMTypeOf(ws->src); + const LLVMValueRef identity = + get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); + + if (ws->maxwaves <= 1) { + ws->result_reduce = ws->src; + ws->result_inclusive = ws->src; + ws->result_exclusive = identity; + return; + } + assert(ws->maxwaves <= 32); + + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMBasicBlockRef bbs[2]; + LLVMValueRef phivalues_scan[2]; + LLVMValueRef tmp, tmp2; + + bbs[0] = LLVMGetInsertBlock(builder); + phivalues_scan[0] = LLVMGetUndef(type); + + if (ws->enable_reduce) + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); + else if (ws->enable_inclusive) + tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); + else + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); + ac_build_ifcc(ctx, tmp, 1001); + { + tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); + + ac_build_optimization_barrier(ctx, &tmp); + + bbs[1] = LLVMGetInsertBlock(builder); + phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); + } + ac_build_endif(ctx, 1001); + + const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); + + if (ws->enable_reduce) { + tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); + ws->result_reduce = ac_build_readlane(ctx, scan, tmp); + } + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); + if (ws->enable_exclusive) { + tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); + tmp = ac_build_readlane(ctx, scan, tmp); + tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); + ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); + } +} + +/** + * Inclusive scan of a per-wave value across an entire workgroup. + * + * This implies an s_barrier instruction. + * + * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads + * of the workgroup are live. (This requirement cannot easily be relaxed in a + * useful manner because of the barrier in the algorithm.) + */ +void +ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + ac_build_wg_wavescan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_wavescan_bottom(ctx, ws); +} + +/** + * "Top half" of a scan that reduces per-thread values across an entire + * workgroup. + * + * All lanes must be active when this code runs. + */ +void +ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + if (ws->enable_exclusive) { + ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); + if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) + ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); + ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); + } else { + ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); + } + + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_top(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; +} + +/** + * "Bottom half" of a scan that reduces per-thread values across an entire + * workgroup. + * + * The caller must place a barrier between the top and bottom halves. + */ +void +ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_bottom(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; + + /* ws->result_reduce is already the correct value */ + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); + if (ws->enable_exclusive) + ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); +} + +/** + * A scan that reduces per-thread values across an entire workgroup. + * + * The caller must ensure that all lanes are active when this code runs + * (WWM is insufficient!), because there is an implied barrier. + */ +void +ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + ac_build_wg_scan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_scan_bottom(ctx, ws); +} + +LLVMValueRef +ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); + if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); + } else { + return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); + } +} + +LLVMValueRef +ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + result = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, + (LLVMValueRef []) {index, src}, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +LLVMValueRef +ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.exp.i16.f16"; + type = ctx->i16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.exp.i32.f32"; + type = ctx->i32; + } else { + intr = "llvm.amdgcn.frexp.exp.i32.f64"; + type = ctx->i32; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} +LLVMValueRef +ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.mant.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.mant.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.frexp.mant.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.canonicalize.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.canonicalize.f32"; + type = ctx->f32; + } else { + intr = "llvm.canonicalize.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, + AC_FUNC_ATTR_READNONE); +} + +/* + * this takes an I,J coordinate pair, + * and works out the X and Y derivatives. + * it returns DDX(I), DDX(J), DDY(I), DDY(J). + */ +LLVMValueRef +ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) +{ + LLVMValueRef result[4], a; + unsigned i; + + for (i = 0; i < 2; i++) { + a = LLVMBuildExtractElement(ctx->builder, interp_ij, + LLVMConstInt(ctx->i32, i, false), ""); + result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); + result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); + } + return ac_build_gather_values(ctx, result, 4); +} + +LLVMValueRef +ac_build_load_helper_invocation(struct ac_llvm_context *ctx) +{ + LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", + ctx->i1, NULL, 0, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildNot(ctx->builder, result, ""); + return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); +} + +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, + LLVMValueRef *args, unsigned num_args) +{ + LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); + LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); + return ret; +} + +void +ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, + LLVMValueRef stencil, LLVMValueRef samplemask, + struct ac_export_args *args) +{ + unsigned mask = 0; + unsigned format = ac_get_spi_shader_z_format(depth != NULL, + stencil != NULL, + samplemask != NULL); + + assert(depth || stencil || samplemask); + + memset(args, 0, sizeof(*args)); + + args->valid_mask = 1; /* whether the EXEC mask is valid */ + args->done = 1; /* DONE bit */ + + /* Specify the target we are exporting */ + args->target = V_008DFC_SQ_EXP_MRTZ; + + args->compr = 0; /* COMP flag */ + args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ + args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ + args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ + args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ + + if (format == V_028710_SPI_SHADER_UINT16_ABGR) { + assert(!depth); + args->compr = 1; /* COMPR flag */ + + if (stencil) { + /* Stencil should be in X[23:16]. */ + stencil = ac_to_integer(ctx, stencil); + stencil = LLVMBuildShl(ctx->builder, stencil, + LLVMConstInt(ctx->i32, 16, 0), ""); + args->out[0] = ac_to_float(ctx, stencil); + mask |= 0x3; + } + if (samplemask) { + /* SampleMask should be in Y[15:0]. */ + args->out[1] = samplemask; + mask |= 0xc; + } + } else { + if (depth) { + args->out[0] = depth; + mask |= 0x1; + } + if (stencil) { + args->out[1] = stencil; + mask |= 0x2; + } + if (samplemask) { + args->out[2] = samplemask; + mask |= 0x4; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks + * at the X writemask component. */ + if (ctx->chip_class == GFX6 && + ctx->family != CHIP_OLAND && + ctx->family != CHIP_HAINAN) + mask |= 0x1; + + /* Specify which components to enable */ + args->enabled_channels = mask; +} + +/* Send GS Alloc Req message from the first wave of the group to SPI. + * Message payload is: + * - bits 0..10: vertices in group + * - bits 12..22: primitives in group + */ +void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, + LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp; + bool export_dummy_prim = false; + + /* HW workaround for a GPU hang with 100% culling. + * We always have to export at least 1 primitive. + * Export a degenerate triangle using vertex 0 for all 3 vertices. + */ + if (prim_cnt == ctx->i32_0 && + (ctx->family == CHIP_NAVI10 || + ctx->family == CHIP_NAVI12 || + ctx->family == CHIP_NAVI14)) { + assert(vtx_cnt == ctx->i32_0); + prim_cnt = ctx->i32_1; + vtx_cnt = ctx->i32_1; + export_dummy_prim = true; + } + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); + + tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false),""); + tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); + ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); + + if (export_dummy_prim) { + struct ac_ngg_prim prim = {}; + /* The vertex indices are 0,0,0. */ + prim.passthrough = ctx->i32_0; + + struct ac_export_args pos = {}; + pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0; + pos.target = V_008DFC_SQ_EXP_POS; + pos.enabled_channels = 0xf; + pos.done = true; + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), + ctx->i32_0, ""), 5021); + ac_build_export_prim(ctx, &prim); + ac_build_export(ctx, &pos); + ac_build_endif(ctx, 5021); + } + + ac_build_endif(ctx, 5020); +} + +LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim) +{ + /* The prim export format is: + * - bits 0..8: index 0 + * - bit 9: edge flag 0 + * - bits 10..18: index 1 + * - bit 19: edge flag 1 + * - bits 20..28: index 2 + * - bit 29: edge flag 2 + * - bit 31: null primitive (skip) + */ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); + LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + + for (unsigned i = 0; i < prim->num_vertices; ++i) { + tmp = LLVMBuildShl(builder, prim->index[i], + LLVMConstInt(ctx->i32, 10 * i, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); + tmp = LLVMBuildShl(builder, tmp, + LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + } + return result; +} + +void ac_build_export_prim(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim) +{ + struct ac_export_args args; + + if (prim->passthrough) { + args.out[0] = prim->passthrough; + } else { + args.out[0] = ac_pack_prim_export(ctx, prim); + } + + args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); + args.out[1] = LLVMGetUndef(ctx->f32); + args.out[2] = LLVMGetUndef(ctx->f32); + args.out[3] = LLVMGetUndef(ctx->f32); + + args.target = V_008DFC_SQ_EXP_PRIM; + args.enabled_channels = 1; + args.done = true; + args.valid_mask = false; + args.compr = false; + + ac_build_export(ctx, &args); +} + +static LLVMTypeRef +arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) +{ + if (type == AC_ARG_FLOAT) { + return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size); + } else if (type == AC_ARG_INT) { + return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size); + } else { + LLVMTypeRef ptr_type; + switch (type) { + case AC_ARG_CONST_PTR: + ptr_type = ctx->i8; + break; + case AC_ARG_CONST_FLOAT_PTR: + ptr_type = ctx->f32; + break; + case AC_ARG_CONST_PTR_PTR: + ptr_type = ac_array_in_const32_addr_space(ctx->i8); + break; + case AC_ARG_CONST_DESC_PTR: + ptr_type = ctx->v4i32; + break; + case AC_ARG_CONST_IMAGE_PTR: + ptr_type = ctx->v8i32; + break; + default: + unreachable("unknown arg type"); + } + if (size == 1) { + return ac_array_in_const32_addr_space(ptr_type); + } else { + assert(size == 2); + return ac_array_in_const_addr_space(ptr_type); + } + } +} + +LLVMValueRef +ac_build_main(const struct ac_shader_args *args, + struct ac_llvm_context *ctx, + enum ac_llvm_calling_convention convention, + const char *name, LLVMTypeRef ret_type, + LLVMModuleRef module) +{ + LLVMTypeRef arg_types[AC_MAX_ARGS]; + + for (unsigned i = 0; i < args->arg_count; i++) { + arg_types[i] = arg_llvm_type(args->args[i].type, + args->args[i].size, ctx); + } + + LLVMTypeRef main_function_type = + LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); + + LLVMValueRef main_function = + LLVMAddFunction(module, name, main_function_type); + LLVMBasicBlockRef main_function_body = + LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); + LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); + + LLVMSetFunctionCallConv(main_function, convention); + for (unsigned i = 0; i < args->arg_count; ++i) { + LLVMValueRef P = LLVMGetParam(main_function, i); + + if (args->args[i].file != AC_ARG_SGPR) + continue; + + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); + + if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); + ac_add_attr_dereferenceable(P, UINT64_MAX); + } + } + + ctx->main_function = main_function; + return main_function; +} + +void ac_build_s_endpgm(struct ac_llvm_context *ctx) +{ + LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); + LLVMBuildCall(ctx->builder, code, NULL, 0, ""); +} + +LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, + LLVMValueRef mask, LLVMValueRef index) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef type = LLVMTypeOf(mask); + + LLVMValueRef bit = LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), + LLVMBuildZExt(builder, index, type, ""), ""); + LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), ""); + LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, ""); + return ac_build_bit_count(ctx, prefix_mask); +} + +/* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */ +LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, + LLVMValueRef mask[2], LLVMValueRef index) +{ + LLVMBuilderRef builder = ctx->builder; +#if 0 + /* Reference version using i128. */ + LLVMValueRef input_mask = + LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, ""); + + return ac_prefix_bitcount(ctx, input_mask, index); +#else + /* Optimized version using 2 64-bit masks. */ + LLVMValueRef is_hi, is_0, c64, c128, all_bits; + LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2]; + + /* Compute the 128-bit prefix mask. */ + c64 = LLVMConstInt(ctx->i32, 64, 0); + c128 = LLVMConstInt(ctx->i32, 128, 0); + all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0); + /* The first index that can have non-zero high bits in the prefix mask is 65. */ + is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, ""); + is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, ""); + mask_bcnt0 = ac_build_bit_count(ctx, mask[0]); + + for (unsigned i = 0; i < 2; i++) { + shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, ""); + /* For i==0, index==0, the right shift by 64 doesn't give the desired result, + * so we handle it by the is_0 select. + * For i==1, index==64, same story, so we handle it by the last is_hi select. + * For i==0, index==64, we shift by 0, which is what we want. + */ + prefix_mask[i] = LLVMBuildLShr(builder, all_bits, + LLVMBuildZExt(builder, shift[i], ctx->i64, ""), ""); + prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], ""); + prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]); + } + + prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], ""); + prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], ""); + prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, ""); + + return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], ""); +#endif +} + +/** + * Convert triangle strip indices to triangle indices. This is used to decompose + * triangle strips into triangles. + */ +void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, + LLVMValueRef is_odd, + LLVMValueRef flatshade_first, + LLVMValueRef index[3]) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef out[3]; + + /* We need to change the vertex order for odd triangles to get correct + * front/back facing by swapping 2 vertex indices, but we also have to + * keep the provoking vertex in the same place. + * + * If the first vertex is provoking, swap index 1 and 2. + * If the last vertex is provoking, swap index 0 and 1. + */ + out[0] = LLVMBuildSelect(builder, flatshade_first, + index[0], + LLVMBuildSelect(builder, is_odd, + index[1], index[0], ""), ""); + out[1] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, + index[2], index[1], ""), + LLVMBuildSelect(builder, is_odd, + index[0], index[1], ""), ""); + out[2] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, + index[1], index[2], ""), + index[2], ""); + memcpy(index, out, sizeof(out)); +} diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_build.h mesa-20.0.8/src/amd/llvm/ac_llvm_build.h --- mesa-19.2.8/src/amd/llvm/ac_llvm_build.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_build.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,806 @@ +/* + * Copyright 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +#ifndef AC_LLVM_BUILD_H +#define AC_LLVM_BUILD_H + +#include +#include +#include "compiler/nir/nir.h" +#include "amd_family.h" +#include "ac_shader_util.h" +#include "ac_shader_args.h" +#include "ac_shader_abi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + AC_ADDR_SPACE_FLAT = 0, /* Slower than global. */ + AC_ADDR_SPACE_GLOBAL = 1, + AC_ADDR_SPACE_GDS = 2, + AC_ADDR_SPACE_LDS = 3, + AC_ADDR_SPACE_CONST = 4, /* Global allowing SMEM. */ + AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */ +}; + +#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */ +#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */ +#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */ + +struct ac_llvm_flow; +struct ac_llvm_compiler; +enum ac_float_mode; + +struct ac_llvm_flow_state { + struct ac_llvm_flow *stack; + unsigned depth_max; + unsigned depth; +}; + +struct ac_llvm_context { + LLVMContextRef context; + LLVMModuleRef module; + LLVMBuilderRef builder; + + LLVMValueRef main_function; + + LLVMTypeRef voidt; + LLVMTypeRef i1; + LLVMTypeRef i8; + LLVMTypeRef i16; + LLVMTypeRef i32; + LLVMTypeRef i64; + LLVMTypeRef i128; + LLVMTypeRef intptr; + LLVMTypeRef f16; + LLVMTypeRef f32; + LLVMTypeRef f64; + LLVMTypeRef v2i16; + LLVMTypeRef v2i32; + LLVMTypeRef v3i32; + LLVMTypeRef v4i32; + LLVMTypeRef v2f32; + LLVMTypeRef v3f32; + LLVMTypeRef v4f32; + LLVMTypeRef v8i32; + LLVMTypeRef iN_wavemask; + LLVMTypeRef iN_ballotmask; + + LLVMValueRef i8_0; + LLVMValueRef i8_1; + LLVMValueRef i16_0; + LLVMValueRef i16_1; + LLVMValueRef i32_0; + LLVMValueRef i32_1; + LLVMValueRef i64_0; + LLVMValueRef i64_1; + LLVMValueRef i128_0; + LLVMValueRef i128_1; + LLVMValueRef f16_0; + LLVMValueRef f16_1; + LLVMValueRef f32_0; + LLVMValueRef f32_1; + LLVMValueRef f64_0; + LLVMValueRef f64_1; + LLVMValueRef i1true; + LLVMValueRef i1false; + + /* Since ac_nir_translate makes a local copy of ac_llvm_context, there + * are two ac_llvm_contexts. Declare a pointer here, so that the control + * flow stack is shared by both ac_llvm_contexts. + */ + struct ac_llvm_flow_state *flow; + + unsigned range_md_kind; + unsigned invariant_load_md_kind; + unsigned uniform_md_kind; + LLVMValueRef empty_md; + + enum chip_class chip_class; + enum radeon_family family; + + unsigned wave_size; + unsigned ballot_mask_bits; + + unsigned float_mode; + + LLVMValueRef lds; +}; + +void +ac_llvm_context_init(struct ac_llvm_context *ctx, + struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits); + +void +ac_llvm_context_dispose(struct ac_llvm_context *ctx); + +int +ac_get_llvm_num_components(LLVMValueRef value); + +int +ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type); + +LLVMValueRef +ac_llvm_extract_elem(struct ac_llvm_context *ac, + LLVMValueRef value, + int index); + +unsigned ac_get_type_size(LLVMTypeRef type); + +LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t); +LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v); +LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v); +LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t); +LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v); + +LLVMValueRef +ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, + unsigned param_count, unsigned attrib_mask); + +void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize); + +LLVMValueRef +ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, + unsigned count_incoming, LLVMValueRef *values, + LLVMBasicBlockRef *blocks); + +void ac_build_s_barrier(struct ac_llvm_context *ctx); +void ac_build_optimization_barrier(struct ac_llvm_context *ctx, + LLVMValueRef *pvgpr); + +LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx); + +LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value); +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, + LLVMValueRef value); + +LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef +ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned component); + +LLVMValueRef +ac_build_gather_values_extended(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + bool load, + bool always_vector); +LLVMValueRef +ac_build_gather_values(struct ac_llvm_context *ctx, + LLVMValueRef *values, + unsigned value_count); + +LLVMValueRef +ac_extract_components(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned start, + unsigned channels); + +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels); +LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value); + +LLVMValueRef +ac_build_fdiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef den); + +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment); +LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef pre_shift, + LLVMValueRef post_shift, + LLVMValueRef increment); +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, + LLVMValueRef num, + LLVMValueRef multiplier, + LLVMValueRef post_shift); + +void +ac_prepare_cube_coords(struct ac_llvm_context *ctx, + bool is_deriv, bool is_array, bool is_lod, + LLVMValueRef *coords_arg, + LLVMValueRef *derivs_arg); + + +LLVMValueRef +ac_build_fs_interp(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef +ac_build_fs_interp_mov(struct ac_llvm_context *ctx, + LLVMValueRef parameter, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params); + +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); + +LLVMValueRef +ac_build_gep0(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index); + +void +ac_build_indexed_store(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value); + +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index); +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); + +void +ac_build_buffer_store_dword(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + unsigned num_channels, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy); + +void +ac_build_buffer_store_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef data, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy); + +LLVMValueRef +ac_build_buffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + int num_channels, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned inst_offset, + unsigned cache_policy, + bool can_speculate, + bool allow_smem); + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + unsigned num_channels, + unsigned cache_policy, + bool can_speculate); + +LLVMValueRef +ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy); + +LLVMValueRef +ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned cache_policy); + +LLVMValueRef +ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate); + +LLVMValueRef +ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy, + bool can_speculate); + +/* For ac_build_fetch_format. + * + * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi). + */ +enum { + AC_FETCH_FORMAT_FLOAT = 0, + AC_FETCH_FORMAT_FIXED, + AC_FETCH_FORMAT_UNORM, + AC_FETCH_FORMAT_SNORM, + AC_FETCH_FORMAT_USCALED, + AC_FETCH_FORMAT_SSCALED, + AC_FETCH_FORMAT_UINT, + AC_FETCH_FORMAT_SINT, +}; + +LLVMValueRef +ac_build_opencoded_load_format(struct ac_llvm_context *ctx, + unsigned log_size, + unsigned num_channels, + unsigned format, + bool reverse, + bool known_aligned, + LLVMValueRef rsrc, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy, + bool can_speculate); + +void +ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy); + +void +ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + unsigned cache_policy); + +void +ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef vindex, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy); + +void +ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, + LLVMValueRef rsrc, + LLVMValueRef vdata, + LLVMValueRef voffset, + LLVMValueRef soffset, + LLVMValueRef immoffset, + unsigned num_channels, + unsigned dfmt, + unsigned nfmt, + unsigned cache_policy); + +LLVMValueRef +ac_get_thread_id(struct ac_llvm_context *ctx); + +#define AC_TID_MASK_TOP_LEFT 0xfffffffc +#define AC_TID_MASK_TOP 0xfffffffd +#define AC_TID_MASK_LEFT 0xfffffffe + +LLVMValueRef +ac_build_ddxy(struct ac_llvm_context *ctx, + uint32_t mask, + int idx, + LLVMValueRef val); + +#define AC_SENDMSG_GS 2 +#define AC_SENDMSG_GS_DONE 3 +#define AC_SENDMSG_GS_ALLOC_REQ 9 + +#define AC_SENDMSG_GS_OP_NOP (0 << 4) +#define AC_SENDMSG_GS_OP_CUT (1 << 4) +#define AC_SENDMSG_GS_OP_EMIT (2 << 4) +#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) + +void ac_build_sendmsg(struct ac_llvm_context *ctx, + uint32_t msg, + LLVMValueRef wave_id); + +LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type); + +LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, + LLVMValueRef arg, + LLVMTypeRef dst_type); +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b); +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value); + +struct ac_export_args { + LLVMValueRef out[4]; + unsigned target; + unsigned enabled_channels; + bool compr; + bool done; + bool valid_mask; +}; + +void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a); + +void ac_build_export_null(struct ac_llvm_context *ctx); + +enum ac_image_opcode { + ac_image_sample, + ac_image_gather4, + ac_image_load, + ac_image_load_mip, + ac_image_store, + ac_image_store_mip, + ac_image_get_lod, + ac_image_get_resinfo, + ac_image_atomic, + ac_image_atomic_cmpswap, +}; + +enum ac_atomic_op { + ac_atomic_swap, + ac_atomic_add, + ac_atomic_sub, + ac_atomic_smin, + ac_atomic_umin, + ac_atomic_smax, + ac_atomic_umax, + ac_atomic_and, + ac_atomic_or, + ac_atomic_xor, + ac_atomic_inc_wrap, + ac_atomic_dec_wrap, +}; + +/* These cache policy bits match the definitions used by the LLVM intrinsics. */ +enum ac_image_cache_policy { + ac_glc = 1 << 0, /* per-CU cache control */ + ac_slc = 1 << 1, /* global L2 cache control */ + ac_dlc = 1 << 2, /* per-shader-array cache control */ + ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */ +}; + +struct ac_image_args { + enum ac_image_opcode opcode : 4; + enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */ + enum ac_image_dim dim : 3; + unsigned dmask : 4; + unsigned cache_policy : 3; + bool unorm : 1; + bool level_zero : 1; + unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ + + LLVMValueRef resource; + LLVMValueRef sampler; + LLVMValueRef data[2]; /* data[0] is source data (vector); data[1] is cmp for cmpswap */ + LLVMValueRef offset; + LLVMValueRef bias; + LLVMValueRef compare; + LLVMValueRef derivs[6]; + LLVMValueRef coords[4]; + LLVMValueRef lod; // also used by ac_image_get_resinfo +}; + +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, + struct ac_image_args *a); +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, + LLVMValueRef rsrc); +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi); +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, + LLVMValueRef args[2], unsigned bits, bool hi); +LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1); +void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1); +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, + LLVMValueRef offset, LLVMValueRef width, + bool is_signed); +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2); +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, + LLVMValueRef s1, LLVMValueRef s2); + +void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags); + +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1, LLVMValueRef src2, + unsigned bitsize); + +LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0); + +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, + LLVMValueRef src0); + +void ac_optimize_vs_outputs(struct ac_llvm_context *ac, + LLVMValueRef main_fn, + uint8_t *vs_output_param_offset, + uint32_t num_outputs, + uint32_t skip_output_mask, + uint8_t *num_param_exports); +void ac_init_exec_full_mask(struct ac_llvm_context *ctx); + +void ac_declare_lds_as_pointer(struct ac_llvm_context *ac); +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr); +void ac_lds_store(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr, LLVMValueRef value); + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, + LLVMTypeRef dst_type, + LLVMValueRef src0); + +LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type); +LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type); + +void ac_build_bgnloop(struct ac_llvm_context *ctx, int lable_id); +void ac_build_break(struct ac_llvm_context *ctx); +void ac_build_continue(struct ac_llvm_context *ctx); +void ac_build_else(struct ac_llvm_context *ctx, int lable_id); +void ac_build_endif(struct ac_llvm_context *ctx, int lable_id); +void ac_build_endloop(struct ac_llvm_context *ctx, int lable_id); +void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id); +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, + int lable_id); +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, + int lable_id); + +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name); +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, + const char *name); + +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMTypeRef type); + +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned count); + +LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, + unsigned rshift, unsigned bitwidth); + +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, + LLVMValueRef *addr, bool is_array_tex); + +LLVMValueRef +ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask); + +LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, + LLVMValueRef src, LLVMValueRef lane); + +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane); + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane); + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); + +LLVMValueRef +ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); + +LLVMValueRef +ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); + +LLVMValueRef +ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size); + +/** + * Common arguments for a scan/reduce operation that accumulates per-wave + * values across an entire workgroup, while respecting the order of waves. + */ +struct ac_wg_scan { + bool enable_reduce; + bool enable_exclusive; + bool enable_inclusive; + nir_op op; + LLVMValueRef src; /* clobbered! */ + LLVMValueRef result_reduce; + LLVMValueRef result_exclusive; + LLVMValueRef result_inclusive; + LLVMValueRef extra; + LLVMValueRef waveidx; + LLVMValueRef numwaves; /* only needed for "reduce" operations */ + + /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */ + LLVMValueRef scratch; + unsigned maxwaves; +}; + +void +ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); + +void +ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void +ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); + +LLVMValueRef +ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3); + +LLVMValueRef +ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index); + +LLVMValueRef +ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef +ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef +ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); + +LLVMValueRef +ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); + +LLVMValueRef +ac_build_load_helper_invocation(struct ac_llvm_context *ctx); + +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, + LLVMValueRef *args, unsigned num_args); + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, + LLVMValueRef ptr, LLVMValueRef val, + const char *sync_scope); + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope); + +void +ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, + LLVMValueRef stencil, LLVMValueRef samplemask, + struct ac_export_args *args); + +void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, + LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt); + +struct ac_ngg_prim { + unsigned num_vertices; + LLVMValueRef isnull; + LLVMValueRef index[3]; + LLVMValueRef edgeflag[3]; + LLVMValueRef passthrough; +}; + +LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim); +void ac_build_export_prim(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim); + +static inline LLVMValueRef +ac_get_arg(struct ac_llvm_context *ctx, struct ac_arg arg) +{ + assert(arg.used); + return LLVMGetParam(ctx->main_function, arg.arg_index); +} + +enum ac_llvm_calling_convention { + AC_LLVM_AMDGPU_VS = 87, + AC_LLVM_AMDGPU_GS = 88, + AC_LLVM_AMDGPU_PS = 89, + AC_LLVM_AMDGPU_CS = 90, + AC_LLVM_AMDGPU_HS = 93, +}; + +LLVMValueRef ac_build_main(const struct ac_shader_args *args, + struct ac_llvm_context *ctx, + enum ac_llvm_calling_convention convention, + const char *name, LLVMTypeRef ret_type, + LLVMModuleRef module); +void ac_build_s_endpgm(struct ac_llvm_context *ctx); + +LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, + LLVMValueRef mask, LLVMValueRef index); +LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, + LLVMValueRef mask[2], LLVMValueRef index); +void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, + LLVMValueRef is_odd, + LLVMValueRef flatshade_first, + LLVMValueRef index[3]); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_cull.c mesa-20.0.8/src/amd/llvm/ac_llvm_cull.c --- mesa-19.2.8/src/amd/llvm/ac_llvm_cull.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_cull.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,275 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ + +#include "ac_llvm_cull.h" +#include + +struct ac_position_w_info { + /* If a primitive intersects the W=0 plane, it causes a reflection + * of the determinant used for face culling. Every vertex behind + * the W=0 plane negates the determinant, so having 2 vertices behind + * the plane has no effect. This is i1 true if the determinant should be + * negated. + */ + LLVMValueRef w_reflection; + + /* If we simplify the "-w <= p <= w" view culling equation, we get + * "-w <= w", which can't be satisfied when w is negative. + * In perspective projection, a negative W means that the primitive + * is behind the viewer, but the equation is independent of the type + * of projection. + * + * w_accepted is false when all W are negative and therefore + * the primitive is invisible. + */ + LLVMValueRef w_accepted; + + LLVMValueRef all_w_positive; + LLVMValueRef any_w_negative; +}; + +static void ac_analyze_position_w(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef all_w_negative = ctx->i1true; + + w->w_reflection = ctx->i1false; + w->any_w_negative = ctx->i1false; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef neg_w; + + neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); + /* If neg_w is true, negate w_reflection. */ + w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); + w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); + all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); + } + w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); + w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); +} + +/* Perform front/back face culling and return true if the primitive is accepted. */ +static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w, + bool cull_front, + bool cull_back, + bool cull_zero_area) +{ + LLVMBuilderRef builder = ctx->builder; + + if (cull_front && cull_back) + return ctx->i1false; + + if (!cull_front && !cull_back && !cull_zero_area) + return ctx->i1true; + + /* Front/back face culling. Also if the determinant == 0, the triangle + * area is 0. + */ + LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); + LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); + LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); + LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); + LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); + LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); + LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); + + /* Negative W negates the determinant. */ + det = LLVMBuildSelect(builder, w->w_reflection, + LLVMBuildFNeg(builder, det, ""), + det, ""); + + LLVMValueRef accepted = NULL; + if (cull_front) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_back) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_zero_area) { + accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); + } + return accepted; +} + +/* Perform view culling and small primitive elimination and return true + * if the primitive is accepted and initially_accepted == true. */ +static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + struct ac_position_w_info *w, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + bool cull_view_xy, + bool cull_view_near_z, + bool cull_view_far_z, + bool cull_small_prims, + bool use_halfz_clip_space) +{ + LLVMBuilderRef builder = ctx->builder; + + if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) + return initially_accepted; + + /* Skip the culling if the primitive has already been rejected or + * if any W is negative. The bounding box culling doesn't work when + * W is negative. + */ + LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, + w->all_w_positive, ""); + LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); + LLVMBuildStore(builder, initially_accepted, accepted_var); + + ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); + { + LLVMValueRef bbox_min[3], bbox_max[3]; + LLVMValueRef accepted = initially_accepted; + + /* Compute the primitive bounding box for easy culling. */ + for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) { + bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + + bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } + + /* View culling. */ + if (cull_view_xy || cull_view_near_z || cull_view_far_z) { + for (unsigned chan = 0; chan < 3; chan++) { + LLVMValueRef visible; + + if ((cull_view_xy && chan <= 1) || + (cull_view_near_z && chan == 2)) { + float t = chan == 2 && use_halfz_clip_space ? 0 : -1; + visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], + LLVMConstReal(ctx->f32, t), ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + if ((cull_view_xy && chan <= 1) || + (cull_view_far_z && chan == 2)) { + visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], + ctx->f32_1, ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + } + } + + /* Small primitive elimination. */ + if (cull_small_prims) { + /* Assuming a sample position at (0.5, 0.5), if we round + * the bounding box min/max extents and the results of + * the rounding are equal in either the X or Y direction, + * the bounding box does not intersect the sample. + * + * See these GDC slides for pictures: + * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf + */ + LLVMValueRef min, max, not_equal[2], visible; + + for (unsigned chan = 0; chan < 2; chan++) { + /* Convert the position to screen-space coordinates. */ + min = ac_build_fmad(ctx, bbox_min[chan], + vp_scale[chan], vp_translate[chan]); + max = ac_build_fmad(ctx, bbox_max[chan], + vp_scale[chan], vp_translate[chan]); + /* Scale the bounding box according to the precision of + * the rasterizer and the number of MSAA samples. */ + min = LLVMBuildFSub(builder, min, small_prim_precision, ""); + max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); + + /* Determine if the bbox intersects the sample point. + * It also works for MSAA, but vp_scale, vp_translate, + * and small_prim_precision are computed differently. + */ + min = ac_build_round(ctx, min); + max = ac_build_round(ctx, max); + not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); + } + visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + LLVMBuildStore(builder, accepted, accepted_var); + } + ac_build_endif(ctx, 10000000); + + return LLVMBuildLoad(builder, accepted_var, ""); +} + +/** + * Return i1 true if the primitive is accepted (not culled). + * + * \param pos Vertex positions 3x vec4 + * \param initially_accepted AND'ed with the result. Some computations can be + * skipped if this is false. + * \param vp_scale Viewport scale XY. + * For MSAA, multiply them by the number of samples. + * \param vp_translate Viewport translation XY. + * For MSAA, multiply them by the number of samples. + * \param small_prim_precision Precision of small primitive culling. This should + * be the same as or greater than the precision of + * the rasterizer. Set to num_samples / 2^subpixel_bits. + * subpixel_bits are defined by the quantization mode. + * \param options See ac_cull_options. + */ +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options) +{ + struct ac_position_w_info w; + ac_analyze_position_w(ctx, pos, &w); + + /* W culling. */ + LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; + accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); + + /* Face culling. */ + accepted = LLVMBuildAnd(ctx->builder, accepted, + ac_cull_face(ctx, pos, &w, + options->cull_front, + options->cull_back, + options->cull_zero_area), ""); + + /* View culling and small primitive elimination. */ + accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, + small_prim_precision, + options->cull_view_xy, + options->cull_view_near_z, + options->cull_view_far_z, + options->cull_small_prims, + options->use_halfz_clip_space); + return accepted; +} diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_cull.h mesa-20.0.8/src/amd/llvm/ac_llvm_cull.h --- mesa-19.2.8/src/amd/llvm/ac_llvm_cull.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_cull.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,59 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ + +#ifndef AC_LLVM_CULL_H +#define AC_LLVM_CULL_H + +#include "ac_llvm_build.h" + +struct ac_cull_options { + /* In general, I recommend setting all to true except view Z culling, + * which isn't so effective because W culling is cheaper and partially + * replaces near Z culling, and you don't need to set Position.z + * if Z culling is disabled. + * + * If something doesn't work, turn some of these off to find out what. + */ + bool cull_front; + bool cull_back; + bool cull_view_xy; + bool cull_view_near_z; + bool cull_view_far_z; + bool cull_small_prims; + bool cull_zero_area; + bool cull_w; /* cull primitives with all W < 0 */ + + bool use_halfz_clip_space; +}; + +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options); + +#endif diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_helper.cpp mesa-20.0.8/src/amd/llvm/ac_llvm_helper.cpp --- mesa-19.2.8/src/amd/llvm/ac_llvm_helper.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_helper.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,283 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ + +#include + +#include "ac_binary.h" +#include "ac_llvm_util.h" +#include "ac_llvm_build.h" + +#include "util/macros.h" + +#include +#include +#include +#include +#include + +#include + +void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) +{ + llvm::Argument *A = llvm::unwrap(val); + A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes)); +} + +bool ac_is_sgpr_param(LLVMValueRef arg) +{ + llvm::Argument *A = llvm::unwrap(arg); + llvm::AttributeList AS = A->getParent()->getAttributes(); + unsigned ArgNo = A->getArgNo(); + return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); +} + +LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call) +{ + return LLVMGetCalledValue(call); +} + +bool ac_llvm_is_function(LLVMValueRef v) +{ + return LLVMGetValueKind(v) == LLVMFunctionValueKind; +} + +LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx) +{ + llvm::TargetMachine *TM = reinterpret_cast(tm); + LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx); + + llvm::unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple()); + llvm::unwrap(module)->setDataLayout(TM->createDataLayout()); + return module; +} + +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, + enum ac_float_mode float_mode) +{ + LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); + + llvm::FastMathFlags flags; + + switch (float_mode) { + case AC_FLOAT_MODE_DEFAULT: + case AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO: + break; + case AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH: + flags.setNoSignedZeros(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + } + + return builder; +} + +LLVMTargetLibraryInfoRef +ac_create_target_library_info(const char *triple) +{ + return reinterpret_cast(new llvm::TargetLibraryInfoImpl(llvm::Triple(triple))); +} + +void +ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) +{ + delete reinterpret_cast(library_info); +} + +/* Implementation of raw_pwrite_stream that works on malloc()ed memory for + * better compatibility with C code. */ +struct raw_memory_ostream : public llvm::raw_pwrite_stream { + char *buffer; + size_t written; + size_t bufsize; + + raw_memory_ostream() + { + buffer = NULL; + written = 0; + bufsize = 0; + SetUnbuffered(); + } + + ~raw_memory_ostream() + { + free(buffer); + } + + void clear() + { + written = 0; + } + + void take(char *&out_buffer, size_t &out_size) + { + out_buffer = buffer; + out_size = written; + buffer = NULL; + written = 0; + bufsize = 0; + } + + void flush() = delete; + + void write_impl(const char *ptr, size_t size) override + { + if (unlikely(written + size < written)) + abort(); + if (written + size > bufsize) { + bufsize = MAX3(1024, written + size, bufsize / 3 * 4); + buffer = (char *)realloc(buffer, bufsize); + if (!buffer) { + fprintf(stderr, "amd: out of memory allocating ELF buffer\n"); + abort(); + } + } + memcpy(buffer + written, ptr, size); + written += size; + } + + void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override + { + assert(offset == (size_t)offset && + offset + size >= offset && offset + size <= written); + memcpy(buffer + offset, ptr, size); + } + + uint64_t current_pos() const override + { + return written; + } +}; + +/* The LLVM compiler is represented as a pass manager containing passes for + * optimizations, instruction selection, and code generation. + */ +struct ac_compiler_passes { + raw_memory_ostream ostream; /* ELF shader binary stream */ + llvm::legacy::PassManager passmgr; /* list of passes */ +}; + +struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm) +{ + struct ac_compiler_passes *p = new ac_compiler_passes(); + if (!p) + return NULL; + + llvm::TargetMachine *TM = reinterpret_cast(tm); + + if (TM->addPassesToEmitFile(p->passmgr, p->ostream, + nullptr, +#if LLVM_VERSION_MAJOR >= 10 + llvm::CGFT_ObjectFile)) { +#else + llvm::TargetMachine::CGFT_ObjectFile)) { +#endif + fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n"); + delete p; + return NULL; + } + return p; +} + +void ac_destroy_llvm_passes(struct ac_compiler_passes *p) +{ + delete p; +} + +/* This returns false on failure. */ +bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, + char **pelf_buffer, size_t *pelf_size) +{ + p->passmgr.run(*llvm::unwrap(module)); + p->ostream.take(*pelf_buffer, *pelf_size); + return true; +} + +void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr) +{ + llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass()); +} + +void ac_enable_global_isel(LLVMTargetMachineRef tm) +{ + reinterpret_cast(tm)->setGlobalISel(true); +} + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, + LLVMValueRef ptr, LLVMValueRef val, + const char *sync_scope) { + llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( + binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope) { + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicCmpXchg( + llvm::unwrap(ptr), llvm::unwrap(cmp), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_util.c mesa-20.0.8/src/amd/llvm/ac_llvm_util.c --- mesa-19.2.8/src/amd/llvm/ac_llvm_util.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_util.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,397 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* based on pieces from si_pipe.c and radeon_llvm_emit.c */ +#include "ac_llvm_util.h" +#include "ac_llvm_build.h" +#include "util/bitscan.h" +#include +#include +#include +#include +#include +#include "c11/threads.h" +#include "gallivm/lp_bld_misc.h" +#include "util/u_math.h" + +#include +#include +#include + +static void ac_init_llvm_target() +{ + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + + /* For inline assembly. */ + LLVMInitializeAMDGPUAsmParser(); + + /* For ACO disassembly. */ + LLVMInitializeAMDGPUDisassembler(); + + /* Workaround for bug in llvm 4.0 that causes image intrinsics + * to disappear. + * https://reviews.llvm.org/D26348 + * + * "mesa" is the prefix for error messages. + * + * -global-isel-abort=2 is a no-op unless global isel has been enabled. + * This option tells the backend to fall-back to SelectionDAG and print + * a diagnostic message if global isel fails. + */ + const char *argv[] = { + "mesa", + "-simplifycfg-sink-common=false", + "-global-isel-abort=2", +#if LLVM_VERSION_MAJOR >= 10 + /* Atomic optimizations require LLVM 10.0 for gfx10 support. */ + "-amdgpu-atomic-optimizations=true", +#endif + }; + LLVMParseCommandLineOptions(ARRAY_SIZE(argv), argv, NULL); +} + +static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; + +void ac_init_llvm_once(void) +{ + call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); +} + +static LLVMTargetRef ac_get_llvm_target(const char *triple) +{ + LLVMTargetRef target = NULL; + char *err_message = NULL; + + if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { + fprintf(stderr, "Cannot find target for triple %s ", triple); + if (err_message) { + fprintf(stderr, "%s\n", err_message); + } + LLVMDisposeMessage(err_message); + return NULL; + } + return target; +} + +const char *ac_get_llvm_processor_name(enum radeon_family family) +{ + switch (family) { + case CHIP_TAHITI: + return "tahiti"; + case CHIP_PITCAIRN: + return "pitcairn"; + case CHIP_VERDE: + return "verde"; + case CHIP_OLAND: + return "oland"; + case CHIP_HAINAN: + return "hainan"; + case CHIP_BONAIRE: + return "bonaire"; + case CHIP_KABINI: + return "kabini"; + case CHIP_KAVERI: + return "kaveri"; + case CHIP_HAWAII: + return "hawaii"; + case CHIP_TONGA: + return "tonga"; + case CHIP_ICELAND: + return "iceland"; + case CHIP_CARRIZO: + return "carrizo"; + case CHIP_FIJI: + return "fiji"; + case CHIP_STONEY: + return "stoney"; + case CHIP_POLARIS10: + return "polaris10"; + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + return "polaris11"; + case CHIP_VEGA10: + return "gfx900"; + case CHIP_RAVEN: + return "gfx902"; + case CHIP_VEGA12: + return "gfx904"; + case CHIP_VEGA20: + return "gfx906"; + case CHIP_RAVEN2: + case CHIP_RENOIR: + return "gfx909"; + case CHIP_ARCTURUS: + return "gfx908"; + case CHIP_NAVI10: + return "gfx1010"; + case CHIP_NAVI12: + return "gfx1011"; + case CHIP_NAVI14: + return "gfx1012"; + default: + return ""; + } +} + +static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, + enum ac_target_machine_options tm_options, + LLVMCodeGenOptLevel level, + const char **out_triple) +{ + assert(family >= CHIP_TAHITI); + char features[256]; + const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--"; + LLVMTargetRef target = ac_get_llvm_target(triple); + + snprintf(features, sizeof(features), + "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s", + family >= CHIP_NAVI10 && !(tm_options & AC_TM_WAVE32) ? + ",+wavefrontsize64,-wavefrontsize32" : "", + tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "", + tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", + tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", + tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "", + tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : ""); + + LLVMTargetMachineRef tm = LLVMCreateTargetMachine( + target, + triple, + ac_get_llvm_processor_name(family), + features, + level, + LLVMRelocDefault, + LLVMCodeModelDefault); + + if (out_triple) + *out_triple = triple; + if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL) + ac_enable_global_isel(tm); + return tm; +} + +static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, + bool check_ir) +{ + LLVMPassManagerRef passmgr = LLVMCreatePassManager(); + if (!passmgr) + return NULL; + + if (target_library_info) + LLVMAddTargetLibraryInfo(target_library_info, + passmgr); + + if (check_ir) + LLVMAddVerifierPass(passmgr); + LLVMAddAlwaysInlinerPass(passmgr); + /* Normally, the pass manager runs all passes on one function before + * moving onto another. Adding a barrier no-op pass forces the pass + * manager to run the inliner on all functions first, which makes sure + * that the following passes are only run on the remaining non-inline + * function, so it removes useless work done on dead inline functions. + */ + ac_llvm_add_barrier_noop_pass(passmgr); + /* This pass should eliminate all the load and store instructions. */ + LLVMAddPromoteMemoryToRegisterPass(passmgr); + LLVMAddScalarReplAggregatesPass(passmgr); + LLVMAddLICMPass(passmgr); + LLVMAddAggressiveDCEPass(passmgr); + LLVMAddCFGSimplificationPass(passmgr); + /* This is recommended by the instruction combining pass. */ + LLVMAddEarlyCSEMemSSAPass(passmgr); + LLVMAddInstructionCombiningPass(passmgr); + return passmgr; +} + +static const char *attr_to_str(enum ac_func_attr attr) +{ + switch (attr) { + case AC_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; + case AC_FUNC_ATTR_INREG: return "inreg"; + case AC_FUNC_ATTR_NOALIAS: return "noalias"; + case AC_FUNC_ATTR_NOUNWIND: return "nounwind"; + case AC_FUNC_ATTR_READNONE: return "readnone"; + case AC_FUNC_ATTR_READONLY: return "readonly"; + case AC_FUNC_ATTR_WRITEONLY: return "writeonly"; + case AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; + case AC_FUNC_ATTR_CONVERGENT: return "convergent"; + default: + fprintf(stderr, "Unhandled function attribute: %x\n", attr); + return 0; + } +} + +void +ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, + int attr_idx, enum ac_func_attr attr) +{ + const char *attr_name = attr_to_str(attr); + unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, + strlen(attr_name)); + LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); + + if (LLVMIsAFunction(function)) + LLVMAddAttributeAtIndex(function, attr_idx, llvm_attr); + else + LLVMAddCallSiteAttribute(function, attr_idx, llvm_attr); +} + +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, + unsigned attrib_mask) +{ + attrib_mask |= AC_FUNC_ATTR_NOUNWIND; + attrib_mask &= ~AC_FUNC_ATTR_LEGACY; + + while (attrib_mask) { + enum ac_func_attr attr = 1u << u_bit_scan(&attrib_mask); + ac_add_function_attr(ctx, function, -1, attr); + } +} + +void +ac_dump_module(LLVMModuleRef module) +{ + char *str = LLVMPrintModuleToString(module); + fprintf(stderr, "%s", str); + LLVMDisposeMessage(str); +} + +void +ac_llvm_add_target_dep_function_attr(LLVMValueRef F, + const char *name, unsigned value) +{ + char str[16]; + + snprintf(str, sizeof(str), "0x%x", value); + LLVMAddTargetDependentFunctionAttr(F, name, str); +} + +void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size) +{ + if (!size) + return; + + char str[32]; + snprintf(str, sizeof(str), "%u,%u", size, size); + LLVMAddTargetDependentFunctionAttr(F, "amdgpu-flat-work-group-size", str); +} + +unsigned +ac_count_scratch_private_memory(LLVMValueRef function) +{ + unsigned private_mem_vgprs = 0; + + /* Process all LLVM instructions. */ + LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(function); + while (bb) { + LLVMValueRef next = LLVMGetFirstInstruction(bb); + + while (next) { + LLVMValueRef inst = next; + next = LLVMGetNextInstruction(next); + + if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) + continue; + + LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); + /* No idea why LLVM aligns allocas to 4 elements. */ + unsigned alignment = LLVMGetAlignment(inst); + unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); + private_mem_vgprs += dw_size; + } + bb = LLVMGetNextBasicBlock(bb); + } + + return private_mem_vgprs; +} + +bool +ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, + enum radeon_family family, + enum ac_target_machine_options tm_options) +{ + const char *triple; + memset(compiler, 0, sizeof(*compiler)); + + compiler->tm = ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelDefault, + &triple); + if (!compiler->tm) + return false; + + if (tm_options & AC_TM_CREATE_LOW_OPT) { + compiler->low_opt_tm = + ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelLess, NULL); + if (!compiler->low_opt_tm) + goto fail; + } + + if (family >= CHIP_NAVI10) { + assert(!(tm_options & AC_TM_CREATE_LOW_OPT)); + compiler->tm_wave32 = ac_create_target_machine(family, + tm_options | AC_TM_WAVE32, + LLVMCodeGenLevelDefault, + NULL); + if (!compiler->tm_wave32) + goto fail; + } + + compiler->target_library_info = + ac_create_target_library_info(triple); + if (!compiler->target_library_info) + goto fail; + + compiler->passmgr = ac_create_passmgr(compiler->target_library_info, + tm_options & AC_TM_CHECK_IR); + if (!compiler->passmgr) + goto fail; + + return true; +fail: + ac_destroy_llvm_compiler(compiler); + return false; +} + +void +ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) +{ + ac_destroy_llvm_passes(compiler->passes); + ac_destroy_llvm_passes(compiler->passes_wave32); + ac_destroy_llvm_passes(compiler->low_opt_passes); + + if (compiler->passmgr) + LLVMDisposePassManager(compiler->passmgr); + if (compiler->target_library_info) + ac_dispose_target_library_info(compiler->target_library_info); + if (compiler->low_opt_tm) + LLVMDisposeTargetMachine(compiler->low_opt_tm); + if (compiler->tm) + LLVMDisposeTargetMachine(compiler->tm); + if (compiler->tm_wave32) + LLVMDisposeTargetMachine(compiler->tm_wave32); +} diff -Nru mesa-19.2.8/src/amd/llvm/ac_llvm_util.h mesa-20.0.8/src/amd/llvm/ac_llvm_util.h --- mesa-19.2.8/src/amd/llvm/ac_llvm_util.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_llvm_util.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,163 @@ +/* + * Copyright 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ + +#ifndef AC_LLVM_UTIL_H +#define AC_LLVM_UTIL_H + +#include +#include +#include + +#include "amd_family.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ac_compiler_passes; + +enum ac_func_attr { + AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0), + AC_FUNC_ATTR_INREG = (1 << 2), + AC_FUNC_ATTR_NOALIAS = (1 << 3), + AC_FUNC_ATTR_NOUNWIND = (1 << 4), + AC_FUNC_ATTR_READNONE = (1 << 5), + AC_FUNC_ATTR_READONLY = (1 << 6), + AC_FUNC_ATTR_WRITEONLY = (1 << 7), + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = (1 << 8), + AC_FUNC_ATTR_CONVERGENT = (1 << 9), + + /* Legacy intrinsic that needs attributes on function declarations + * and they must match the internal LLVM definition exactly, otherwise + * intrinsic selection fails. + */ + AC_FUNC_ATTR_LEGACY = (1u << 31), +}; + +enum ac_target_machine_options { + AC_TM_SUPPORTS_SPILL = (1 << 0), + AC_TM_SISCHED = (1 << 1), + AC_TM_FORCE_ENABLE_XNACK = (1 << 2), + AC_TM_FORCE_DISABLE_XNACK = (1 << 3), + AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4), + AC_TM_CHECK_IR = (1 << 5), + AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), + AC_TM_CREATE_LOW_OPT = (1 << 7), + AC_TM_NO_LOAD_STORE_OPT = (1 << 8), + AC_TM_WAVE32 = (1 << 9), +}; + +enum ac_float_mode { + AC_FLOAT_MODE_DEFAULT, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO, +}; + +/* Per-thread persistent LLVM objects. */ +struct ac_llvm_compiler { + LLVMTargetLibraryInfoRef target_library_info; + LLVMPassManagerRef passmgr; + + /* Default compiler. */ + LLVMTargetMachineRef tm; + struct ac_compiler_passes *passes; + + /* Wave32 compiler for GFX10. */ + LLVMTargetMachineRef tm_wave32; + struct ac_compiler_passes *passes_wave32; + + /* Optional compiler for faster compilation with fewer optimizations. + * LLVM modules can be created with "tm" too. There is no difference. + */ + LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */ + struct ac_compiler_passes *low_opt_passes; +}; + +const char *ac_get_llvm_processor_name(enum radeon_family family); +void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); +bool ac_is_sgpr_param(LLVMValueRef param); +void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, + int attr_idx, enum ac_func_attr attr); +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, + unsigned attrib_mask); +void ac_dump_module(LLVMModuleRef module); + +LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call); +bool ac_llvm_is_function(LLVMValueRef v); +LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx); + +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, + enum ac_float_mode float_mode); + +void +ac_llvm_add_target_dep_function_attr(LLVMValueRef F, + const char *name, unsigned value); +void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size); + +static inline unsigned +ac_get_load_intr_attribs(bool can_speculate) +{ + /* READNONE means writes can't affect it, while READONLY means that + * writes can affect it. */ + return can_speculate ? AC_FUNC_ATTR_READNONE : + AC_FUNC_ATTR_READONLY; +} + +unsigned +ac_count_scratch_private_memory(LLVMValueRef function); + +LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple); +void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info); +void ac_init_llvm_once(void); + + +bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, + enum radeon_family family, + enum ac_target_machine_options tm_options); +void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler); + +struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm); +void ac_destroy_llvm_passes(struct ac_compiler_passes *p); +bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, + char **pelf_buffer, size_t *pelf_size); +void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr); +void ac_enable_global_isel(LLVMTargetMachineRef tm); + +static inline bool +ac_has_vec3_support(enum chip_class chip, bool use_format) +{ + if (chip == GFX6 && !use_format) { + /* GFX6 only supports vec3 with load/store format. */ + return false; + } + + return LLVM_VERSION_MAJOR >= 9; +} + +#ifdef __cplusplus +} +#endif + +#endif /* AC_LLVM_UTIL_H */ diff -Nru mesa-19.2.8/src/amd/llvm/ac_nir_to_llvm.c mesa-20.0.8/src/amd/llvm/ac_nir_to_llvm.c --- mesa-19.2.8/src/amd/llvm/ac_nir_to_llvm.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_nir_to_llvm.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,5371 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "ac_nir_to_llvm.h" +#include "ac_llvm_build.h" +#include "ac_llvm_util.h" +#include "ac_binary.h" +#include "sid.h" +#include "nir/nir.h" +#include "nir/nir_deref.h" +#include "util/bitscan.h" +#include "util/u_math.h" +#include "ac_shader_abi.h" +#include "ac_shader_util.h" + +struct ac_nir_context { + struct ac_llvm_context ac; + struct ac_shader_abi *abi; + const struct ac_shader_args *args; + + gl_shader_stage stage; + shader_info *info; + + LLVMValueRef *ssa_defs; + + LLVMValueRef scratch; + LLVMValueRef constant_data; + + struct hash_table *defs; + struct hash_table *phis; + struct hash_table *vars; + + LLVMValueRef main_function; + LLVMBasicBlockRef continue_block; + LLVMBasicBlockRef break_block; + + int num_locals; + LLVMValueRef *locals; +}; + +static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + const nir_instr *instr, + bool image); + +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, + const nir_instr *instr, + LLVMValueRef index, + bool image, bool write); + +static void +build_store_values_extended(struct ac_llvm_context *ac, + LLVMValueRef *values, + unsigned value_count, + unsigned value_stride, + LLVMValueRef vec) +{ + LLVMBuilderRef builder = ac->builder; + unsigned i; + + for (i = 0; i < value_count; i++) { + LLVMValueRef ptr = values[i * value_stride]; + LLVMValueRef index = LLVMConstInt(ac->i32, i, false); + LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); + LLVMBuildStore(builder, value, ptr); + } +} + +static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, + const nir_ssa_def *def) +{ + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size); + if (def->num_components > 1) { + type = LLVMVectorType(type, def->num_components); + } + return type; +} + +static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src) +{ + assert(src.is_ssa); + return nir->ssa_defs[src.ssa->index]; +} + +static LLVMValueRef +get_memory_ptr(struct ac_nir_context *ctx, nir_src src, unsigned bit_size) +{ + LLVMValueRef ptr = get_src(ctx, src); + ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, ""); + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, bit_size); + + return LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(type, addr_space), ""); +} + +static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, + const struct nir_block *b) +{ + struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b); + return (LLVMBasicBlockRef)entry->data; +} + +static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, + nir_alu_src src, + unsigned num_components) +{ + LLVMValueRef value = get_src(ctx, src.src); + bool need_swizzle = false; + + assert(value); + unsigned src_components = ac_get_llvm_num_components(value); + for (unsigned i = 0; i < num_components; ++i) { + assert(src.swizzle[i] < src_components); + if (src.swizzle[i] != i) + need_swizzle = true; + } + + if (need_swizzle || num_components != src_components) { + LLVMValueRef masks[] = { + LLVMConstInt(ctx->ac.i32, src.swizzle[0], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[1], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[2], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)}; + + if (src_components > 1 && num_components == 1) { + value = LLVMBuildExtractElement(ctx->ac.builder, value, + masks[0], ""); + } else if (src_components == 1 && num_components > 1) { + LLVMValueRef values[] = {value, value, value, value}; + value = ac_build_gather_values(&ctx->ac, values, num_components); + } else { + LLVMValueRef swizzle = LLVMConstVector(masks, num_components); + value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, + swizzle, ""); + } + } + assert(!src.negate); + assert(!src.abs); + return value; +} + +static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, + LLVMIntPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); +} + +static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, + LLVMRealPredicate pred, LLVMValueRef src0, + LLVMValueRef src1) +{ + LLVMValueRef result; + src0 = ac_to_float(ctx, src0); + src1 = ac_to_float(ctx, src1); + result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); +} + +static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0, LLVMValueRef src1) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, + const char *intrin, + LLVMTypeRef result_type, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + char name[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + ac_to_float(ctx, src2), + }; + + ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin, + ac_get_elem_bits(ctx, result_type)); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); +} + +static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +{ + LLVMTypeRef src1_type = LLVMTypeOf(src1); + LLVMTypeRef src2_type = LLVMTypeOf(src2); + + assert(LLVMGetTypeKind(LLVMTypeOf(src0)) != LLVMVectorTypeKind); + + if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src2_type) != LLVMPointerTypeKind) { + src2 = LLVMBuildIntToPtr(ctx->builder, src2, src1_type, ""); + } else if (LLVMGetTypeKind(src2_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { + src1 = LLVMBuildIntToPtr(ctx->builder, src1, src2_type, ""); + } + + LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, + ctx->i32_0, ""); + return LLVMBuildSelect(ctx->builder, v, + ac_to_integer_or_pointer(ctx, src1), + ac_to_integer_or_pointer(ctx, src2), ""); +} + +static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, "")); +} + +static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, + const char *intrin, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMTypeRef ret_type; + LLVMTypeRef types[] = { ctx->i32, ctx->i1 }; + LLVMValueRef res; + LLVMValueRef params[] = { src0, src1 }; + ret_type = LLVMStructTypeInContext(ctx->context, types, + 2, true); + + res = ac_build_intrinsic(ctx, intrin, ret_type, + params, 2, AC_FUNC_ATTR_READNONE); + + res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); + res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); + return res; +} + +static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, + LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), + ""); + result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); + + switch (bitsize) { + case 16: + return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, ""); + case 32: + return result; + case 64: + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + default: + unreachable("Unsupported bit size."); + } +} + +static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + src0 = ac_to_float(ctx, src0); + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, + LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""), + ctx->i32, ""); +} + +static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + + switch (bitsize) { + case 8: + return LLVMBuildTrunc(ctx->builder, result, ctx->i8, ""); + case 16: + return LLVMBuildTrunc(ctx->builder, result, ctx->i16, ""); + case 32: + return result; + case 64: + return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); + default: + unreachable("Unsupported bit size."); + } +} + +static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""), + ctx->i32, ""); +} + +static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef result; + LLVMValueRef cond = NULL; + + src0 = ac_to_float(ctx, src0); + result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, ""); + + if (ctx->chip_class >= GFX8) { + LLVMValueRef args[2]; + /* Check if the result is a denormal - and flush to 0 if so. */ + args[0] = result; + args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false); + cond = ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE); + } + + /* need to convert back up to f32 */ + result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); + + if (ctx->chip_class >= GFX8) + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + else { + /* for GFX6-GFX7 */ + /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, + * so compare the result and flush to 0 if it's smaller. + */ + LLVMValueRef temp, cond2; + temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result); + cond = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, + LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""), + temp, ""); + cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealONE, + temp, ctx->f32_0, ""); + cond = LLVMBuildAnd(ctx->builder, cond, cond2, ""); + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + } + return result; +} + +static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, + LLVMValueRef src0, LLVMValueRef src1) +{ + LLVMValueRef dst64, result; + src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; +} + +static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, + LLVMValueRef bits, LLVMValueRef offset) +{ + /* mask = ((1 << bits) - 1) << offset */ + return LLVMBuildShl(ctx->builder, + LLVMBuildSub(ctx->builder, + LLVMBuildShl(ctx->builder, + ctx->i32_1, + bits, ""), + ctx->i32_1, ""), + offset, ""); +} + +static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, + LLVMValueRef mask, LLVMValueRef insert, + LLVMValueRef base) +{ + /* Calculate: + * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base)) + * Use the right-hand side, which the LLVM backend can convert to V_BFI. + */ + return LLVMBuildXor(ctx->builder, base, + LLVMBuildAnd(ctx->builder, mask, + LLVMBuildXor(ctx->builder, insert, base, ""), ""), ""); +} + +static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, + LLVMValueRef src0, + LLVMValueRef (*pack)(struct ac_llvm_context *ctx, + LLVMValueRef args[2])) +{ + LLVMValueRef comp[2]; + + src0 = ac_to_float(ctx, src0); + comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, ""); + comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, ""); + + return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, ""); +} + +static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, + LLVMValueRef src0) +{ + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + LLVMValueRef temps[2], val; + int i; + + for (i = 0; i < 2; i++) { + val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0; + val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, ""); + val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, ""); + temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, ""); + } + return ac_build_gather_values(ctx, temps, 2); +} + +static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, + nir_op op, + LLVMValueRef src0) +{ + unsigned mask; + int idx; + LLVMValueRef result; + + if (op == nir_op_fddx_fine) + mask = AC_TID_MASK_LEFT; + else if (op == nir_op_fddy_fine) + mask = AC_TID_MASK_TOP; + else + mask = AC_TID_MASK_TOP_LEFT; + + /* for DDX we want to next X pixel, DDY next Y pixel. */ + if (op == nir_op_fddx_fine || + op == nir_op_fddx_coarse || + op == nir_op_fddx) + idx = 1; + else + idx = 2; + + result = ac_build_ddxy(&ctx->ac, mask, idx, src0); + return result; +} + +struct waterfall_context { + LLVMBasicBlockRef phi_bb[2]; + bool use_waterfall; +}; + +/* To deal with divergent descriptors we can create a loop that handles all + * lanes with the same descriptor on a given iteration (henceforth a + * waterfall loop). + * + * These helper create the begin and end of the loop leaving the caller + * to implement the body. + * + * params: + * - ctx is the usal nir context + * - wctx is a temporary struct containing some loop info. Can be left uninitialized. + * - value is the possibly divergent value for which we built the loop + * - divergent is whether value is actually divergent. If false we just pass + * things through. + */ +static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + LLVMValueRef value, bool divergent) +{ + /* If the app claims the value is divergent but it is constant we can + * end up with a dynamic index of NULL. */ + if (!value) + divergent = false; + + wctx->use_waterfall = divergent; + if (!divergent) + return value; + + ac_build_bgnloop(&ctx->ac, 6000); + + LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL); + + LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value, + scalar_value, "uniform_active"); + + wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder); + ac_build_ifcc(&ctx->ac, active, 6001); + + return scalar_value; +} + +static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + LLVMValueRef value) +{ + LLVMValueRef ret = NULL; + LLVMValueRef phi_src[2]; + LLVMValueRef cc_phi_src[2] = { + LLVMConstInt(ctx->ac.i32, 0, false), + LLVMConstInt(ctx->ac.i32, 0xffffffff, false), + }; + + if (!wctx->use_waterfall) + return value; + + wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder); + + ac_build_endif(&ctx->ac, 6001); + + if (value) { + phi_src[0] = LLVMGetUndef(LLVMTypeOf(value)); + phi_src[1] = value; + + ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb); + } + + /* + * By using the optimization barrier on the exit decision, we decouple + * the operations from the break, and hence avoid LLVM hoisting the + * opteration into the break block. + */ + LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb); + ac_build_optimization_barrier(&ctx->ac, &cc); + + LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2"); + ac_build_ifcc(&ctx->ac, active, 6002); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 6002); + + ac_build_endloop(&ctx->ac, 6000); + return ret; +} + +static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) +{ + LLVMValueRef src[4], result = NULL; + unsigned num_components = instr->dest.dest.ssa.num_components; + unsigned src_components; + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); + + assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + src_components = 1; + break; + case nir_op_pack_half_2x16: + case nir_op_pack_snorm_2x16: + case nir_op_pack_unorm_2x16: + src_components = 2; + break; + case nir_op_unpack_half_2x16: + src_components = 1; + break; + case nir_op_cube_face_coord: + case nir_op_cube_face_index: + src_components = 3; + break; + default: + src_components = num_components; + break; + } + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = get_alu_src(ctx, instr->src[i], src_components); + + switch (instr->op) { + case nir_op_mov: + result = src[0]; + break; + case nir_op_fneg: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFNeg(ctx->ac.builder, src[0], ""); + if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { + /* fneg will be optimized by backend compiler with sign + * bit removed via XOR. This is probably a LLVM bug. + */ + result = ac_build_canonicalize(&ctx->ac, result, + instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_ineg: + result = LLVMBuildNeg(ctx->ac.builder, src[0], ""); + break; + case nir_op_inot: + result = LLVMBuildNot(ctx->ac.builder, src[0], ""); + break; + case nir_op_iadd: + result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fadd: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fsub: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_isub: + result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imul: + result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imod: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_umod: + result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fmod: + /* lower_fmod only lower 16-bit and 32-bit fmod */ + assert(instr->dest.dest.ssa.bit_size == 64); + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = ac_build_fdiv(&ctx->ac, src[0], src[1]); + result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", + ac_to_float_type(&ctx->ac, def_type), result); + result = LLVMBuildFMul(ctx->ac.builder, src[1] , result, ""); + result = LLVMBuildFSub(ctx->ac.builder, src[0], result, ""); + break; + case nir_op_irem: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_idiv: + result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_udiv: + result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fmul: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_frcp: + result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_iand: + result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ior: + result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ixor: + result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishl: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ushr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], + LLVMTypeOf(src[0]), ""); + result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ilt32: + result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]); + break; + case nir_op_ine32: + result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]); + break; + case nir_op_ieq32: + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]); + break; + case nir_op_ige32: + result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]); + break; + case nir_op_ult32: + result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]); + break; + case nir_op_uge32: + result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]); + break; + case nir_op_feq32: + result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]); + break; + case nir_op_fne32: + result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]); + break; + case nir_op_flt32: + result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]); + break; + case nir_op_fge32: + result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]); + break; + case nir_op_fabs: + result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs", + ac_to_float_type(&ctx->ac, def_type), src[0]); + if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { + /* fabs will be optimized by backend compiler with sign + * bit removed via AND. + */ + result = ac_build_canonicalize(&ctx->ac, result, + instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_iabs: + result = emit_iabs(&ctx->ac, src[0]); + break; + case nir_op_imax: + result = ac_build_imax(&ctx->ac, src[0], src[1]); + break; + case nir_op_imin: + result = ac_build_imin(&ctx->ac, src[0], src[1]); + break; + case nir_op_umax: + result = ac_build_umax(&ctx->ac, src[0], src[1]); + break; + case nir_op_umin: + result = ac_build_umin(&ctx->ac, src[0], src[1]); + break; + case nir_op_isign: + result = ac_build_isign(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fsign: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fsign(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_ffloor: + result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_ftrunc: + result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fceil: + result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fround_even: + result = emit_intrin_1f_param(&ctx->ac, "llvm.rint", + ac_to_float_type(&ctx->ac, def_type),src[0]); + break; + case nir_op_ffract: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fract(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fsin: + result = emit_intrin_1f_param(&ctx->ac, "llvm.sin", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fcos: + result = emit_intrin_1f_param(&ctx->ac, "llvm.cos", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fsqrt: + result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fexp2: + result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_flog2: + result = emit_intrin_1f_param(&ctx->ac, "llvm.log2", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_frsq: + result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_frexp_exp: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_exp(&ctx->ac, src[0], + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16) + result = LLVMBuildSExt(ctx->ac.builder, result, + ctx->ac.i32, ""); + break; + case nir_op_frexp_sig: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_mant(&ctx->ac, src[0], + instr->dest.dest.ssa.bit_size); + break; + case nir_op_fpow: + result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + break; + case nir_op_fmax: + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = ac_build_canonicalize(&ctx->ac, result, + instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_fmin: + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && + instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = ac_build_canonicalize(&ctx->ac, result, + instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_ffma: + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ + result = emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); + break; + case nir_op_ldexp: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, def_type) == 32) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); + else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); + else + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); + break; + case nir_op_bfm: + result = emit_bfm(&ctx->ac, src[0], src[1]); + break; + case nir_op_bitfield_select: + result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_ubfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false); + break; + case nir_op_ibfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true); + break; + case nir_op_bitfield_reverse: + result = ac_build_bitfield_reverse(&ctx->ac, src[0]); + break; + case nir_op_bit_count: + result = ac_build_bit_count(&ctx->ac, src[0]); + break; + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = ac_to_integer(&ctx->ac, src[i]); + result = ac_build_gather_values(&ctx->ac, src, num_components); + break; + case nir_op_f2i8: + case nir_op_f2i16: + case nir_op_f2i32: + case nir_op_f2i64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_f2u8: + case nir_op_f2u16: + case nir_op_f2u32: + case nir_op_f2u64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2f64: + result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2f64: + result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_f2f16_rtz: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); + LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + break; + case nir_op_f2f16_rtne: + case nir_op_f2f16: + case nir_op_f2f32: + case nir_op_f2f64: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + else + result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_u2u64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); + else + result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + case nir_op_i2i64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, ""); + else + result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_b32csel: + result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_find_lsb: + result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]); + break; + case nir_op_ufind_msb: + result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_ifind_msb: + result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_uadd_carry: + result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]); + break; + case nir_op_usub_borrow: + result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); + break; + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2f64: + result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_f2b32: + result = emit_f2b(&ctx->ac, src[0]); + break; + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_i2b32: + result = emit_i2b(&ctx->ac, src[0]); + break; + case nir_op_fquantize2f16: + result = emit_f2f16(&ctx->ac, src[0]); + break; + case nir_op_umul_high: + result = emit_umul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_imul_high: + result = emit_imul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_pack_half_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16); + break; + case nir_op_pack_snorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16); + break; + case nir_op_pack_unorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16); + break; + case nir_op_unpack_half_2x16: + result = emit_unpack_half_2x16(&ctx->ac, src[0]); + break; + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + result = emit_ddxy(ctx, instr->op, src[0]); + break; + + case nir_op_unpack_64_2x32_split_x: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i32, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_64_2x32_split_y: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i32, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_1, ""); + break; + } + + case nir_op_pack_64_2x32_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, ""); + break; + } + + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], + ctx->ac.v2i16, + ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, + ctx->ac.i32_1, ""); + break; + } + + case nir_op_cube_face_coord: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef results[2]; + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); + result = ac_build_gather_values(&ctx->ac, results, 2); + break; + } + + case nir_op_cube_face_index: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + break; + } + + case nir_op_fmin3: + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", + ac_to_float_type(&ctx->ac, def_type), result, src[2]); + break; + case nir_op_umin3: + result = ac_build_umin(&ctx->ac, src[0], src[1]); + result = ac_build_umin(&ctx->ac, result, src[2]); + break; + case nir_op_imin3: + result = ac_build_imin(&ctx->ac, src[0], src[1]); + result = ac_build_imin(&ctx->ac, result, src[2]); + break; + case nir_op_fmax3: + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", + ac_to_float_type(&ctx->ac, def_type), result, src[2]); + break; + case nir_op_umax3: + result = ac_build_umax(&ctx->ac, src[0], src[1]); + result = ac_build_umax(&ctx->ac, result, src[2]); + break; + case nir_op_imax3: + result = ac_build_imax(&ctx->ac, src[0], src[1]); + result = ac_build_imax(&ctx->ac, result, src[2]); + break; + case nir_op_fmed3: { + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + src[2] = ac_to_float(&ctx->ac, src[2]); + result = ac_build_fmed3(&ctx->ac, src[0], src[1], src[2], + instr->dest.dest.ssa.bit_size); + break; + } + case nir_op_imed3: { + LLVMValueRef tmp1 = ac_build_imin(&ctx->ac, src[0], src[1]); + LLVMValueRef tmp2 = ac_build_imax(&ctx->ac, src[0], src[1]); + tmp2 = ac_build_imin(&ctx->ac, tmp2, src[2]); + result = ac_build_imax(&ctx->ac, tmp1, tmp2); + break; + } + case nir_op_umed3: { + LLVMValueRef tmp1 = ac_build_umin(&ctx->ac, src[0], src[1]); + LLVMValueRef tmp2 = ac_build_umax(&ctx->ac, src[0], src[1]); + tmp2 = ac_build_umin(&ctx->ac, tmp2, src[2]); + result = ac_build_umax(&ctx->ac, tmp1, tmp2); + break; + } + + default: + fprintf(stderr, "Unknown NIR alu instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + if (result) { + assert(instr->dest.dest.is_ssa); + result = ac_to_integer_or_pointer(&ctx->ac, result); + ctx->ssa_defs[instr->dest.dest.ssa.index] = result; + } +} + +static void visit_load_const(struct ac_nir_context *ctx, + const nir_load_const_instr *instr) +{ + LLVMValueRef values[4], value = NULL; + LLVMTypeRef element_type = + LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + + for (unsigned i = 0; i < instr->def.num_components; ++i) { + switch (instr->def.bit_size) { + case 8: + values[i] = LLVMConstInt(element_type, + instr->value[i].u8, false); + break; + case 16: + values[i] = LLVMConstInt(element_type, + instr->value[i].u16, false); + break; + case 32: + values[i] = LLVMConstInt(element_type, + instr->value[i].u32, false); + break; + case 64: + values[i] = LLVMConstInt(element_type, + instr->value[i].u64, false); + break; + default: + fprintf(stderr, + "unsupported nir load_const bit_size: %d\n", + instr->def.bit_size); + abort(); + } + } + if (instr->def.num_components > 1) { + value = LLVMConstVector(values, instr->def.num_components); + } else + value = values[0]; + + ctx->ssa_defs[instr->def.index] = value; +} + +static LLVMValueRef +get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements) +{ + LLVMValueRef size = + LLVMBuildExtractElement(ctx->ac.builder, descriptor, + LLVMConstInt(ctx->ac.i32, 2, false), ""); + + /* GFX8 only */ + if (ctx->ac.chip_class == GFX8 && in_elements) { + /* On GFX8, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(ctx->ac.builder, descriptor, + ctx->ac.i32_1, ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, + LLVMConstInt(ctx->ac.i32, 16, false), ""); + stride = LLVMBuildAnd(ctx->ac.builder, stride, + LLVMConstInt(ctx->ac.i32, 0x3fff, false), ""); + + size = LLVMBuildUDiv(ctx->ac.builder, size, stride, ""); + } + return size; +} + +/* Gather4 should follow the same rules as bilinear filtering, but the hardware + * incorrectly forces nearest filtering if the texture format is integer. + * The only effect it has on Gather4, which always returns 4 texels for + * bilinear filtering, is that the final coordinates are off by 0.5 of + * the texel size. + * + * The workaround is to subtract 0.5 from the unnormalized coordinates, + * or (0.5 / size) from the normalized coordinates. + * + * However, cube textures with 8_8_8_8 data formats require a different + * workaround of overriding the num format to USCALED/SSCALED. This would lose + * precision in 32-bit data formats, so it needs to be applied dynamically at + * runtime. In this case, return an i1 value that indicates whether the + * descriptor was overridden (and hence a fixup of the sampler result is needed). + */ +static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, + nir_variable *var, + struct ac_image_args *args, + const nir_tex_instr *instr) +{ + const struct glsl_type *type = glsl_without_array(var->type); + enum glsl_base_type stype = glsl_get_sampler_result_type(type); + LLVMValueRef wa_8888 = NULL; + LLVMValueRef half_texel[2]; + LLVMValueRef result; + + assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef formats; + LLVMValueRef data_format; + LLVMValueRef wa_formats; + + formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, ""); + + data_format = LLVMBuildLShr(ctx->builder, formats, + LLVMConstInt(ctx->i32, 20, false), ""); + data_format = LLVMBuildAnd(ctx->builder, data_format, + LLVMConstInt(ctx->i32, (1u << 6) - 1, false), ""); + wa_8888 = LLVMBuildICmp( + ctx->builder, LLVMIntEQ, data_format, + LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), + ""); + + uint32_t wa_num_format = + stype == GLSL_TYPE_UINT ? + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) : + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED); + wa_formats = LLVMBuildAnd(ctx->builder, formats, + LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), + ""); + wa_formats = LLVMBuildOr(ctx->builder, wa_formats, + LLVMConstInt(ctx->i32, wa_num_format, false), ""); + + formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, ""); + args->resource = LLVMBuildInsertElement( + ctx->builder, args->resource, formats, ctx->i32_1, ""); + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + assert(!wa_8888); + half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); + } else { + struct ac_image_args resinfo = {}; + LLVMBasicBlockRef bbs[2]; + + LLVMValueRef unnorm = NULL; + LLVMValueRef default_offset = ctx->f32_0; + if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && + !instr->is_array) { + /* In vulkan, whether the sampler uses unnormalized + * coordinates or not is a dynamic property of the + * sampler. Hence, to figure out whether or not we + * need to divide by the texture size, we need to test + * the sampler at runtime. This tests the bit set by + * radv_init_sampler(). + */ + LLVMValueRef sampler0 = + LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, ""); + sampler0 = LLVMBuildLShr(ctx->builder, sampler0, + LLVMConstInt(ctx->i32, 15, false), ""); + sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, ""); + unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, ""); + default_offset = LLVMConstReal(ctx->f32, -0.5); + } + + bbs[0] = LLVMGetInsertBlock(ctx->builder); + if (wa_8888 || unnorm) { + assert(!(wa_8888 && unnorm)); + LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm; + /* Skip the texture size query entirely if we don't need it. */ + ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000); + bbs[1] = LLVMGetInsertBlock(ctx->builder); + } + + /* Query the texture size. */ + resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array); + resinfo.opcode = ac_image_get_resinfo; + resinfo.dmask = 0xf; + resinfo.lod = ctx->i32_0; + resinfo.resource = args->resource; + resinfo.attributes = AC_FUNC_ATTR_READNONE; + LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo); + + /* Compute -0.5 / size. */ + for (unsigned c = 0; c < 2; c++) { + half_texel[c] = + LLVMBuildExtractElement(ctx->builder, size, + LLVMConstInt(ctx->i32, c, 0), ""); + half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); + half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]); + half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c], + LLVMConstReal(ctx->f32, -0.5), ""); + } + + if (wa_8888 || unnorm) { + ac_build_endif(ctx, 2000); + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef values[2] = { default_offset, half_texel[c] }; + half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, + values, bbs); + } + } + } + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, ""); + args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); + } + + args->attributes = AC_FUNC_ATTR_READNONE; + result = ac_build_image_opcode(ctx, args); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef tmp, tmp2; + + /* if the cube workaround is in place, f2i the result. */ + for (unsigned c = 0; c < 4; c++) { + tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), ""); + if (stype == GLSL_TYPE_UINT) + tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, ""); + else + tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); + tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, ""); + tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); + result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), ""); + } + } + return result; +} + +static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr) +{ + nir_deref_instr *texture_deref_instr = NULL; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + default: + break; + } + } + return texture_deref_instr; +} + +static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, + const nir_tex_instr *instr, + struct ac_image_args *args) +{ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + + return ac_build_buffer_load_format(&ctx->ac, + args->resource, + args->coords[0], + ctx->ac.i32_0, + util_last_bit(mask), + 0, true); + } + + args->opcode = ac_image_sample; + + switch (instr->op) { + case nir_texop_txf: + case nir_texop_txf_ms: + case nir_texop_samples_identical: + args->opcode = args->level_zero || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? + ac_image_load : ac_image_load_mip; + args->level_zero = false; + break; + case nir_texop_txs: + case nir_texop_query_levels: + args->opcode = ac_image_get_resinfo; + if (!args->lod) + args->lod = ctx->ac.i32_0; + args->level_zero = false; + break; + case nir_texop_tex: + if (ctx->stage != MESA_SHADER_FRAGMENT) { + assert(!args->lod); + args->level_zero = true; + } + break; + case nir_texop_tg4: + args->opcode = ac_image_gather4; + args->level_zero = true; + break; + case nir_texop_lod: + args->opcode = ac_image_get_lod; + break; + case nir_texop_fragment_fetch: + case nir_texop_fragment_mask_fetch: + args->opcode = ac_image_load; + args->level_zero = false; + break; + default: + break; + } + + if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) { + nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr); + nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr); + const struct glsl_type *type = glsl_without_array(var->type); + enum glsl_base_type stype = glsl_get_sampler_result_type(type); + if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) { + return lower_gather4_integer(&ctx->ac, var, args, instr); + } + } + + /* Fixup for GFX9 which allocates 1D textures as 2D. */ + if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) { + if ((args->dim == ac_image_2darray || + args->dim == ac_image_2d) && !args->coords[1]) { + args->coords[1] = ctx->ac.i32_0; + } + } + + args->attributes = AC_FUNC_ATTR_READNONE; + bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE && + ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE; + if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) { + /* Prevent texture instructions with implicit derivatives from being + * sinked into branches. */ + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + break; + default: + break; + } + } + + return ac_build_image_opcode(&ctx->ac, args); +} + +static LLVMValueRef visit_vulkan_resource_reindex(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[1]); + + LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md); + return result; +} + +static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef ptr, addr; + LLVMValueRef src0 = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); + + addr = LLVMConstInt(ctx->ac.i32, index, 0); + addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, ""); + + /* Load constant values from user SGPRS when possible, otherwise + * fallback to the default path that loads directly from memory. + */ + if (LLVMIsConstant(src0) && + instr->dest.ssa.bit_size == 32) { + unsigned count = instr->dest.ssa.num_components; + unsigned offset = index; + + offset += LLVMConstIntGetZExtValue(src0); + offset /= 4; + + offset -= ctx->args->base_inline_push_consts; + + unsigned num_inline_push_consts = ctx->args->num_inline_push_consts; + if (offset + count <= num_inline_push_consts) { + LLVMValueRef push_constants[num_inline_push_consts]; + for (unsigned i = 0; i < num_inline_push_consts; i++) + push_constants[i] = ac_get_arg(&ctx->ac, + ctx->args->inline_push_consts[i]); + return ac_build_gather_values(&ctx->ac, + push_constants + offset, + count); + } + } + + ptr = LLVMBuildGEP(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->push_constants), &addr, 1, ""); + + if (instr->dest.ssa.bit_size == 8) { + unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + + LLVMValueRef params[3]; + if (load_dwords > 1) { + LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), ""); + params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), ""); + params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), ""); + } else { + res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); + params[0] = ctx->ac.i32_0; + params[1] = res; + } + params[2] = addr; + res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); + + res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); + if (instr->dest.ssa.num_components > 1) + res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), ""); + return res; + } else if (instr->dest.ssa.bit_size == 16) { + unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; + LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, ""); + LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, ""); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + LLVMValueRef mask[] = { LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), + LLVMConstInt(ctx->ac.i32, 4, false)}; + LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components); + LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components); + LLVMValueRef shuffle_aligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, ""); + LLVMValueRef shuffle_unaligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, ""); + res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, ""); + return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), ""); + } + + ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa)); + + return LLVMBuildLoad(ctx->ac.builder, ptr, ""); +} + +static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef index = get_src(ctx, instr->src[0]); + + return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); +} + +static uint32_t widen_mask(uint32_t mask, unsigned multiplier) +{ + uint32_t new_mask = 0; + for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; +} + +static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned start, unsigned count) +{ + LLVMValueRef mask[] = { + ctx->i32_0, ctx->i32_1, + LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false) }; + + unsigned src_elements = ac_get_llvm_num_components(src); + + if (count == src_elements) { + assert(start == 0); + return src; + } else if (count == 1) { + assert(start < src_elements); + return LLVMBuildExtractElement(ctx->builder, src, mask[start], ""); + } else { + assert(start + count <= src_elements); + assert(count <= 4); + LLVMValueRef swizzle = LLVMConstVector(&mask[start], count); + return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); + } +} + +static unsigned get_cache_policy(struct ac_nir_context *ctx, + enum gl_access_qualifier access, + bool may_store_unaligned, + bool writeonly_memory) +{ + unsigned cache_policy = 0; + + /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All + * store opcodes not aligned to a dword are affected. The only way to + * get unaligned stores is through shader images. + */ + if (((may_store_unaligned && ctx->ac.chip_class == GFX6) || + /* If this is write-only, don't keep data in L1 to prevent + * evicting L1 cache lines that may be needed by other + * instructions. + */ + writeonly_memory || + access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { + cache_policy |= ac_glc; + } + + if (access & ACCESS_STREAM_CACHE_POLICY) + cache_policy |= ac_slc; + + return cache_policy; +} + +static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr, + nir_src src) +{ + return enter_waterfall(ctx, wctx, get_src(ctx, src), + nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); +} + +static void visit_store_ssbo(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef src_data = get_src(ctx, instr->src[0]); + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + enum gl_access_qualifier access = nir_intrinsic_access(instr); + bool writeonly_memory = access & ACCESS_NON_READABLE; + unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); + + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]); + + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true); + LLVMValueRef base_data = src_data; + base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); + LLVMValueRef base_offset = get_src(ctx, instr->src[2]); + + while (writemask) { + int start, count; + LLVMValueRef data, offset; + LLVMTypeRef data_type; + + u_bit_scan_consecutive_range(&writemask, &start, &count); + + /* Due to an LLVM limitation with LLVM < 9, split 3-element + * writes into a 2-element and a 1-element write. */ + if (count == 3 && + (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) { + writemask |= 1 << (start + 2); + count = 2; + } + int num_bytes = count * elem_size_bytes; /* count in bytes */ + + /* we can only store 4 DWords at the same time. + * can only happen for 64 Bit vectors. */ + if (num_bytes > 16) { + writemask |= ((1u << (count - 2)) - 1u) << (start + 2); + count = 2; + num_bytes = 16; + } + + /* check alignment of 16 Bit stores */ + if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = 2; + } + + /* Due to alignment issues, split stores of 8-bit/16-bit + * vectors. + */ + if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = elem_size_bytes; + } + + data = extract_vector_range(&ctx->ac, base_data, start, count); + + offset = LLVMBuildAdd(ctx->ac.builder, base_offset, + LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); + + if (num_bytes == 1) { + ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, + offset, ctx->ac.i32_0, + cache_policy); + } else if (num_bytes == 2) { + ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, + offset, ctx->ac.i32_0, + cache_policy); + } else { + int num_channels = num_bytes / 4; + + switch (num_bytes) { + case 16: /* v4f32 */ + data_type = ctx->ac.v4f32; + break; + case 12: /* v3f32 */ + data_type = ctx->ac.v3f32; + break; + case 8: /* v2f32 */ + data_type = ctx->ac.v2f32; + break; + case 4: /* f32 */ + data_type = ctx->ac.f32; + break; + default: + unreachable("Malformed vector store."); + } + data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); + + ac_build_buffer_store_dword(&ctx->ac, rsrc, data, + num_channels, offset, + ctx->ac.i32_0, 0, + cache_policy); + } + } + + exit_waterfall(ctx, &wctx, NULL); +} + +static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, + LLVMValueRef descriptor, + LLVMValueRef offset, + LLVMValueRef compare, + LLVMValueRef exchange) +{ + LLVMBasicBlockRef start_block = NULL, then_block = NULL; + if (ctx->abi->robust_buffer_access) { + LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2); + + LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); + start_block = LLVMGetInsertBlock(ctx->ac.builder); + + ac_build_ifcc(&ctx->ac, cond, -1); + + then_block = LLVMGetInsertBlock(ctx->ac.builder); + } + + LLVMValueRef ptr_parts[2] = { + ac_llvm_extract_elem(&ctx->ac, descriptor, 0), + LLVMBuildAnd(ctx->ac.builder, + ac_llvm_extract_elem(&ctx->ac, descriptor, 1), + LLVMConstInt(ctx->ac.i32, 65535, 0), "") + }; + + ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, ""); + ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, ""); + + offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, ""); + + LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, ""); + ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL), ""); + + LLVMValueRef result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as"); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + + if (ctx->abi->robust_buffer_access) { + ac_build_endif(&ctx->ac, -1); + + LLVMBasicBlockRef incoming_blocks[2] = { + start_block, + then_block, + }; + + LLVMValueRef incoming_values[2] = { + LLVMConstInt(ctx->ac.i64, 0, 0), + result, + }; + LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, ""); + LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2); + return ret; + } else { + return result; + } +} + +static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); + const char *op; + char name[64], type[8]; + LLVMValueRef params[6], descriptor; + int arg_count = 0; + + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + op = "add"; + break; + case nir_intrinsic_ssbo_atomic_imin: + op = "smin"; + break; + case nir_intrinsic_ssbo_atomic_umin: + op = "umin"; + break; + case nir_intrinsic_ssbo_atomic_imax: + op = "smax"; + break; + case nir_intrinsic_ssbo_atomic_umax: + op = "umax"; + break; + case nir_intrinsic_ssbo_atomic_and: + op = "and"; + break; + case nir_intrinsic_ssbo_atomic_or: + op = "or"; + break; + case nir_intrinsic_ssbo_atomic_xor: + op = "xor"; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op = "swap"; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op = "cmpswap"; + break; + default: + abort(); + } + + descriptor = ctx->abi->load_ssbo(ctx->abi, + rsrc_base, + true); + + LLVMValueRef result; + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && + return_type == ctx->ac.i64) { + result = emit_ssbo_comp_swap_64(ctx, descriptor, + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2]), + get_src(ctx, instr->src[3])); + } else { + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); + } + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + params[arg_count++] = descriptor; + + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy with + * LLVM 8, see r358579. + */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i32_0; /* soffset */ + params[arg_count++] = ctx->ac.i32_0; /* slc */ + + ac_build_type_name_for_intr(return_type, type, sizeof(type)); + snprintf(name, sizeof(name), + "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); + } else { + params[arg_count++] = ctx->ac.i32_0; /* vindex */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i1false; /* slc */ + + assert(return_type == ctx->ac.i32); + snprintf(name, sizeof(name), + "llvm.amdgcn.buffer.atomic.%s", op); + } + + result = ac_build_intrinsic(&ctx->ac, name, return_type, params, + arg_count, 0); + } + + return exit_waterfall(ctx, &wctx, result); +} + +static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + + int elem_size_bytes = instr->dest.ssa.bit_size / 8; + int num_components = instr->num_components; + enum gl_access_qualifier access = nir_intrinsic_access(instr); + unsigned cache_policy = get_cache_policy(ctx, access, false, false); + + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false); + LLVMValueRef vindex = ctx->ac.i32_0; + + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; + + LLVMValueRef results[4]; + for (int i = 0; i < num_components;) { + int num_elems = num_components - i; + if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) + num_elems = 1; + if (num_elems * elem_size_bytes > 16) + num_elems = 16 / elem_size_bytes; + int load_bytes = num_elems * elem_size_bytes; + + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); + + LLVMValueRef ret; + + if (load_bytes == 1) { + ret = ac_build_tbuffer_load_byte(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + cache_policy); + } else if (load_bytes == 2) { + ret = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + cache_policy); + } else { + int num_channels = util_next_power_of_two(load_bytes) / 4; + bool can_speculate = access & ACCESS_CAN_REORDER; + + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, + vindex, offset, immoffset, 0, + cache_policy, can_speculate, false); + } + + LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); + ret = ac_trim_vector(&ctx->ac, ret, load_bytes); + + LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); + + for (unsigned j = 0; j < num_elems; j++) { + results[i + j] = LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); + } + i += num_elems; + } + + LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components); + return exit_waterfall(ctx, &wctx, ret); +} + +static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) +{ + return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]), + nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); +} + +static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr); + + LLVMValueRef ret; + LLVMValueRef rsrc = rsrc_base; + LLVMValueRef offset = get_src(ctx, instr->src[1]); + int num_components = instr->num_components; + + if (ctx->abi->load_ubo) + rsrc = ctx->abi->load_ubo(ctx->abi, rsrc); + + if (instr->dest.ssa.bit_size == 64) + num_components *= 2; + + if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { + unsigned load_bytes = instr->dest.ssa.bit_size / 8; + LLVMValueRef results[num_components]; + for (unsigned i = 0; i < num_components; ++i) { + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, + load_bytes * i, 0); + + if (load_bytes == 1) { + results[i] = ac_build_tbuffer_load_byte(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + 0); + } else { + assert(load_bytes == 2); + results[i] = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + offset, + ctx->ac.i32_0, + immoffset, + 0); + } + } + ret = ac_build_gather_values(&ctx->ac, results, num_components); + } else { + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, + NULL, 0, 0, true, true); + + ret = ac_trim_vector(&ctx->ac, ret, num_components); + } + + ret = LLVMBuildBitCast(ctx->ac.builder, ret, + get_def_type(ctx, &instr->dest.ssa), ""); + + return exit_waterfall(ctx, &wctx, ret); +} + +static void +get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, + bool vs_in, unsigned *vertex_index_out, + LLVMValueRef *vertex_index_ref, + unsigned *const_out, LLVMValueRef *indir_out) +{ + nir_variable *var = nir_deref_instr_get_variable(instr); + nir_deref_path path; + unsigned idx_lvl = 1; + + nir_deref_path_init(&path, instr, NULL); + + if (vertex_index_out != NULL || vertex_index_ref != NULL) { + if (vertex_index_ref) { + *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index); + if (vertex_index_out) + *vertex_index_out = 0; + } else { + *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index); + } + ++idx_lvl; + } + + uint32_t const_offset = 0; + LLVMValueRef offset = NULL; + + if (var->data.compact) { + assert(instr->deref_type == nir_deref_type_array); + const_offset = nir_src_as_uint(instr->arr.index); + goto out; + } + + for (; path.path[idx_lvl]; ++idx_lvl) { + const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type; + if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) { + unsigned index = path.path[idx_lvl]->strct.index; + + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + const_offset += glsl_count_attribute_slots(ft, vs_in); + } + } else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) { + unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in); + if (nir_src_is_const(path.path[idx_lvl]->arr.index)) { + const_offset += size * + nir_src_as_uint(path.path[idx_lvl]->arr.index); + } else { + LLVMValueRef array_off = LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0), + get_src(ctx, path.path[idx_lvl]->arr.index), ""); + if (offset) + offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, ""); + else + offset = array_off; + } + } else + unreachable("Uhandled deref type in get_deref_instr_offset"); + } + +out: + nir_deref_path_finish(&path); + + if (const_offset && offset) + offset = LLVMBuildAdd(ctx->ac.builder, offset, + LLVMConstInt(ctx->ac.i32, const_offset, 0), + ""); + + *const_out = const_offset; + *indir_out = offset; +} + +static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr, + bool load_inputs) +{ + LLVMValueRef result; + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + + nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + + unsigned location = var->data.location; + unsigned driver_location = var->data.driver_location; + const bool is_patch = var->data.patch || + var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + const bool is_compact = var->data.compact; + + get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + false, NULL, is_patch ? NULL : &vertex_index, + &const_index, &indir_index); + + LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); + + LLVMTypeRef src_component_type; + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) + src_component_type = LLVMGetElementType(dest_type); + else + src_component_type = dest_type; + + result = ctx->abi->load_tess_varyings(ctx->abi, src_component_type, + vertex_index, indir_index, + const_index, location, driver_location, + var->data.location_frac, + instr->num_components, + is_patch, is_compact, load_inputs); + if (instr->dest.ssa.bit_size == 16) { + result = ac_to_integer(&ctx->ac, result); + result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); + } + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); +} + +static unsigned +type_scalar_size_bytes(const struct glsl_type *type) +{ + assert(glsl_type_is_vector_or_scalar(type) || + glsl_type_is_matrix(type)); + return glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; +} + +static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef values[8]; + int idx = 0; + int ve = instr->dest.ssa.num_components; + unsigned comp = 0; + LLVMValueRef indir_index; + LLVMValueRef ret; + unsigned const_index; + unsigned stride = 4; + int mode = deref->mode; + + if (var) { + bool vs_in = ctx->stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in; + idx = var->data.driver_location; + comp = var->data.location_frac; + mode = var->data.mode; + + get_deref_offset(ctx, deref, vs_in, NULL, NULL, + &const_index, &indir_index); + + if (var->data.compact) { + stride = 1; + const_index += comp; + comp = 0; + } + } + + if (instr->dest.ssa.bit_size == 64 && + (deref->mode == nir_var_shader_in || + deref->mode == nir_var_shader_out || + deref->mode == nir_var_function_temp)) + ve *= 2; + + switch (mode) { + case nir_var_shader_in: + if (ctx->stage == MESA_SHADER_TESS_CTRL || + ctx->stage == MESA_SHADER_TESS_EVAL) { + return load_tess_varyings(ctx, instr, true); + } + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMValueRef indir_index; + unsigned const_index, vertex_index; + get_deref_offset(ctx, deref, false, &vertex_index, NULL, + &const_index, &indir_index); + assert(indir_index == NULL); + + return ctx->abi->load_inputs(ctx->abi, var->data.location, + var->data.driver_location, + var->data.location_frac, + instr->num_components, vertex_index, const_index, type); + } + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, + ctx->stage == MESA_SHADER_VERTEX); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->inputs + idx + chan, count, + stride, false, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else + values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, + stride, true, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); + } + } + break; + case nir_var_shader_out: + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + return load_tess_varyings(ctx, instr, false); + } + + if (ctx->stage == MESA_SHADER_FRAGMENT && + var->data.fb_fetch_output && + ctx->abi->emit_fbfetch) + return ctx->abi->emit_fbfetch(ctx->abi); + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, + stride, true, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, + tmp_vec, + indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->ac.builder, + ctx->abi->outputs[idx + chan + const_index * stride], + ""); + } + } + break; + case nir_var_mem_global: { + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? explicit_stride : natural_stride; + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8; + bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; + + if (stride != natural_stride || split_loads) { + if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind) + result_type = LLVMGetElementType(result_type); + + LLVMTypeRef ptr_type = LLVMPointerType(result_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) { + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0); + values[i] = LLVMBuildLoad(ctx->ac.builder, + ac_build_gep_ptr(&ctx->ac, address, offset), ""); + } + return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components); + } else { + LLVMTypeRef ptr_type = LLVMPointerType(result_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); + return val; + } + } + default: + unreachable("unhandle variable mode"); + } + ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +visit_store_var(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef temp_ptr, value; + int idx = 0; + unsigned comp = 0; + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); + int writemask = instr->const_index[0]; + LLVMValueRef indir_index; + unsigned const_index; + + if (var) { + get_deref_offset(ctx, deref, false, + NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + + if (var->data.compact) { + const_index += comp; + comp = 0; + } + } + + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 && + (deref->mode == nir_var_shader_out || + deref->mode == nir_var_function_temp)) { + + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), + ""); + + writemask = widen_mask(writemask, 2); + } + + writemask = writemask << comp; + + switch (deref->mode) { + case nir_var_shader_out: + + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + const bool is_patch = var->data.patch || + var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + + get_deref_offset(ctx, deref, false, NULL, + is_patch ? NULL : &vertex_index, + &const_index, &indir_index); + + ctx->abi->store_tcs_outputs(ctx->abi, var, + vertex_index, indir_index, + const_index, src, writemask); + return; + } + + for (unsigned chan = 0; chan < 8; chan++) { + int stride = 4; + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); + + if (var->data.compact) + stride = 1; + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, + stride, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, + value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, + count, stride, tmp_vec); + + } else { + temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan); + if (indir_index) { + unsigned count = glsl_count_attribute_slots( + var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, + 4, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, + value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, + count, 4, tmp_vec); + } else { + temp_ptr = ctx->locals[idx + chan + const_index * 4]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + + case nir_var_mem_global: { + int writemask = instr->const_index[0]; + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMValueRef val = get_src(ctx, instr->src[1]); + + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? explicit_stride : natural_stride; + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8; + bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; + + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && + stride == natural_stride && !split_stores) { + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + + val = LLVMBuildBitCast(ctx->ac.builder, val, + LLVMGetElementType(LLVMTypeOf(address)), ""); + LLVMBuildStore(ctx->ac.builder, val, address); + } else { + LLVMTypeRef val_type = LLVMTypeOf(val); + if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind) + val_type = LLVMGetElementType(val_type); + + LLVMTypeRef ptr_type = LLVMPointerType(val_type, + LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); + for (unsigned chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0); + + LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset); + LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, + chan); + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMGetElementType(LLVMTypeOf(ptr)), ""); + LLVMBuildStore(ctx->ac.builder, src, ptr); + } + } + break; + } + default: + abort(); + break; + } +} + +static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 3 : 2; + case GLSL_SAMPLER_DIM_MS: + return array ? 4 : 3; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return 3; + default: + break; + } + return 0; +} + +static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, + LLVMValueRef coord_x, LLVMValueRef coord_y, + LLVMValueRef coord_z, + LLVMValueRef sample_index, + LLVMValueRef fmask_desc_ptr) +{ + unsigned sample_chan = coord_z ? 3 : 2; + LLVMValueRef addr[4] = {coord_x, coord_y, coord_z}; + addr[sample_chan] = sample_index; + + ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL); + return addr[sample_chan]; +} + +static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr) +{ + assert(instr->src[0].is_ssa); + return nir_instr_as_deref(instr->src[0].ssa->parent_instr); +} + +static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, + bool write) +{ + nir_deref_instr *deref_instr = + instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ? + nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL; + + return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write); +} + +static void get_image_coords(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_desc_index, + struct ac_image_args *args, + enum glsl_sampler_dim dim, + bool is_array) +{ + LLVMValueRef src0 = get_src(ctx, instr->src[1]); + LLVMValueRef masks[] = { + LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), + }; + LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + + int count; + ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || + dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; + assert(!add_frag_pos && "Input attachments should be lowered by this point."); + count = image_type_to_components_count(dim, is_array); + + if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || + instr->intrinsic == nir_intrinsic_bindless_image_load)) { + LLVMValueRef fmask_load_address[3]; + + fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); + if (is_array) + fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); + else + fmask_load_address[2] = NULL; + + sample_index = adjust_sample_index_using_fmask(&ctx->ac, + fmask_load_address[0], + fmask_load_address[1], + fmask_load_address[2], + sample_index, + get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false)); + } + if (count == 1 && !gfx9_1d) { + if (instr->src[1].ssa->num_components) + args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + else + args->coords[0] = src0; + } else { + int chan; + if (is_ms) + count--; + for (chan = 0; chan < count; ++chan) { + args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); + } + + if (gfx9_1d) { + if (is_array) { + args->coords[2] = args->coords[1]; + args->coords[1] = ctx->ac.i32_0; + } else + args->coords[1] = ctx->ac.i32_0; + count++; + } + if (ctx->ac.chip_class == GFX9 && + dim == GLSL_SAMPLER_DIM_2D && + !is_array) { + /* The hw can't bind a slice of a 3D image as a 2D + * image, because it ignores BASE_ARRAY if the target + * is 3D. The workaround is to read BASE_ARRAY and set + * it as the 3rd address operand for all 2D images. + */ + LLVMValueRef first_layer, const5, mask; + + const5 = LLVMConstInt(ctx->ac.i32, 5, 0); + mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0); + first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, ""); + first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, ""); + + args->coords[count] = first_layer; + count++; + } + + + if (is_ms) { + args->coords[count] = sample_index; + count++; + } + } +} + +static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_index, + bool write, bool atomic) +{ + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write); + if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) { + LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder, + LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""), + elem_count, stride, ""); + + rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count, + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + } + return rsrc; +} + +static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx, + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref_instr = NULL; + + if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) + deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + + LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true); + return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); +} + +static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef res; + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + access = nir_intrinsic_access(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access = var->data.access; + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, false, false); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + LLVMValueRef rsrc, vindex; + + rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false); + vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); + + bool can_speculate = access & ACCESS_CAN_REORDER; + res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, + ctx->ac.i32_0, num_channels, + args.cache_policy, + can_speculate); + res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); + + res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); + res = ac_to_integer(&ctx->ac, res); + } else { + bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; + + args.opcode = level_zero ? ac_image_load : ac_image_load_mip; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + if (!level_zero) + args.lod = get_src(ctx, instr->src[3]); + args.dmask = 15; + args.attributes = AC_FUNC_ATTR_READONLY; + + res = ac_build_image_opcode(&ctx->ac, &args); + } + return exit_waterfall(ctx, &wctx, res); +} + +static void visit_image_store(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access; + bool is_array; + + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + access = nir_intrinsic_access(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access = var->data.access; + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + bool writeonly_memory = access & ACCESS_NON_READABLE; + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + unsigned src_channels = ac_get_llvm_num_components(src); + LLVMValueRef vindex; + + if (src_channels == 3) + src = ac_build_expand_to_vec4(&ctx->ac, src, 3); + + vindex = LLVMBuildExtractElement(ctx->ac.builder, + get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); + + ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, + ctx->ac.i32_0, src_channels, + args.cache_policy); + } else { + bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; + + args.opcode = level_zero ? ac_image_store : ac_image_store_mip; + args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + if (!level_zero) + args.lod = get_src(ctx, instr->src[4]); + args.dmask = 15; + + ac_build_image_opcode(&ctx->ac, &args); + } + + exit_waterfall(ctx, &wctx, NULL); +} + +static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef params[7]; + int param_count = 0; + + bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; + const char *atomic_name; + char intrinsic_name[64]; + enum ac_atomic_op atomic_subop; + ASSERTED int length; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_imin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_imax || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umax) { + ASSERTED const GLenum format = nir_intrinsic_format(instr); + assert(format == GL_R32UI || format == GL_R32I); + } + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_image_deref_atomic_add: + atomic_name = "add"; + atomic_subop = ac_atomic_add; + break; + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_image_deref_atomic_imin: + atomic_name = "smin"; + atomic_subop = ac_atomic_smin; + break; + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_umin: + atomic_name = "umin"; + atomic_subop = ac_atomic_umin; + break; + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_image_deref_atomic_imax: + atomic_name = "smax"; + atomic_subop = ac_atomic_smax; + break; + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_image_deref_atomic_umax: + atomic_name = "umax"; + atomic_subop = ac_atomic_umax; + break; + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_image_deref_atomic_and: + atomic_name = "and"; + atomic_subop = ac_atomic_and; + break; + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_image_deref_atomic_or: + atomic_name = "or"; + atomic_subop = ac_atomic_or; + break; + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_image_deref_atomic_xor: + atomic_name = "xor"; + atomic_subop = ac_atomic_xor; + break; + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_image_deref_atomic_exchange: + atomic_name = "swap"; + atomic_subop = ac_atomic_swap; + break; + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_comp_swap: + atomic_name = "cmpswap"; + atomic_subop = 0; /* not used */ + break; + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_inc_wrap: { + atomic_name = "inc"; + atomic_subop = ac_atomic_inc_wrap; + /* ATOMIC_INC instruction does: + * value = (value + 1) % (data + 1) + * but we want: + * value = (value + 1) % data + * So replace 'data' by 'data - 1'. + */ + ctx->ssa_defs[instr->src[3].ssa->index] = + LLVMBuildSub(ctx->ac.builder, + ctx->ssa_defs[instr->src[3].ssa->index], + ctx->ac.i32_1, ""); + break; + } + case nir_intrinsic_bindless_image_atomic_dec_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + atomic_name = "dec"; + atomic_subop = ac_atomic_dec_wrap; + break; + default: + abort(); + } + + if (cmpswap) + params[param_count++] = get_src(ctx, instr->src[4]); + params[param_count++] = get_src(ctx, instr->src[3]); + + LLVMValueRef result; + if (dim == GLSL_SAMPLER_DIM_BUF) { + params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true); + params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); /* vindex */ + params[param_count++] = ctx->ac.i32_0; /* voffset */ + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy + * with LLVM 8, see r358579. + */ + params[param_count++] = ctx->ac.i32_0; /* soffset */ + params[param_count++] = ctx->ac.i32_0; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name); + } else { + params[param_count++] = ctx->ac.i1false; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.buffer.atomic.%s", atomic_name); + } + + assert(length < sizeof(intrinsic_name)); + result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, + params, param_count, 0); + } else { + struct ac_image_args args = {}; + args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic; + args.atomic = atomic_subop; + args.data[0] = params[0]; + if (cmpswap) + args.data[1] = params[1]; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + + result = ac_build_image_opcode(&ctx->ac, &args); + } + + return exit_waterfall(ctx, &wctx, result); +} + +static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + + LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc); + + return exit_waterfall(ctx, &wctx, ret); +} + +static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + bool bindless) +{ + LLVMValueRef res; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + res = get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true); + } else { + + struct ac_image_args args = { 0 }; + + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 0xf; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + args.opcode = ac_image_get_resinfo; + args.lod = ctx->ac.i32_0; + args.attributes = AC_FUNC_ATTR_READNONE; + + res = ac_build_image_opcode(&ctx->ac, &args); + + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + + if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); + } + + if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, + ctx->ac.i32_1, ""); + } + } + return exit_waterfall(ctx, &wctx, res); +} + +static void emit_membar(struct ac_llvm_context *ac, + const nir_intrinsic_instr *instr) +{ + unsigned wait_flags = 0; + + switch (instr->intrinsic) { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_shared: + wait_flags = AC_WAIT_LGKM; + break; + default: + break; + } + + ac_build_waitcnt(ac, wait_flags); +} + +void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) +{ + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) { + ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + ac_build_s_barrier(ac); +} + +static void emit_discard(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef cond; + + if (instr->intrinsic == nir_intrinsic_discard_if) { + cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + get_src(ctx, instr->src[0]), + ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_discard); + cond = ctx->ac.i1false; + } + + ctx->abi->emit_kill(ctx->abi, cond); +} + +static LLVMValueRef +visit_load_local_invocation_index(struct ac_nir_context *ctx) +{ + LLVMValueRef result; + LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac); + result = LLVMBuildAnd(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + + if (ctx->ac.wave_size == 32) + result = LLVMBuildLShr(ctx->ac.builder, result, + LLVMConstInt(ctx->ac.i32, 1, false), ""); + + return LLVMBuildAdd(ctx->ac.builder, result, thread_id, ""); +} + +static LLVMValueRef +visit_load_subgroup_id(struct ac_nir_context *ctx) +{ + if (ctx->stage == MESA_SHADER_COMPUTE) { + LLVMValueRef result; + result = LLVMBuildAnd(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 0, false); + } +} + +static LLVMValueRef +visit_load_num_subgroups(struct ac_nir_context *ctx) +{ + if (ctx->stage == MESA_SHADER_COMPUTE) { + return LLVMBuildAnd(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 1, false); + } +} + +static LLVMValueRef +visit_first_invocation(struct ac_nir_context *ctx) +{ + LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1); + const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64"; + + /* The second argument is whether cttz(0) should be defined, but we do not care. */ + LLVMValueRef args[] = {active_set, ctx->ac.i1false}; + LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, + ctx->ac.iN_wavemask, args, 2, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE); + + return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, ""); +} + +static LLVMValueRef +visit_load_shared(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef values[4], derived_ptr, index, ret; + + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], + instr->dest.ssa.bit_size); + + for (int chan = 0; chan < instr->num_components; chan++) { + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); + } + + ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); +} + +static void +visit_store_shared(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr) +{ + LLVMValueRef derived_ptr, data,index; + LLVMBuilderRef builder = ctx->ac.builder; + + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1], + instr->src[0].ssa->bit_size); + LLVMValueRef src = get_src(ctx, instr->src[0]); + + int writemask = nir_intrinsic_write_mask(instr); + for (int chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) { + continue; + } + data = ac_llvm_extract_elem(&ctx->ac, src, chan); + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); + LLVMBuildStore(builder, data, derived_ptr); + } +} + +static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, + const nir_intrinsic_instr *instr, + LLVMValueRef ptr, int src_idx) +{ + LLVMValueRef result; + LLVMValueRef src = get_src(ctx, instr->src[src_idx]); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) { + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + if (deref->mode == nir_var_mem_global) { + /* use "singlethread" sync scope to implement relaxed ordering */ + sync_scope = LLVM_VERSION_MAJOR >= 9 ? "singlethread-one-as" : "singlethread"; + + LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(src), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr))); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type , ""); + } + } + + if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) { + LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); + result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + } else { + LLVMAtomicRMWBinOp op; + switch (instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_deref_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_deref_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_deref_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_deref_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_deref_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_deref_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_deref_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_deref_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_deref_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; + default: + return NULL; + } + + result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope); + } + return result; +} + +static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx) +{ + LLVMValueRef values[2]; + LLVMValueRef pos[2]; + + pos[0] = ac_to_float(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->frag_pos[0])); + pos[1] = ac_to_float(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->frag_pos[1])); + + values[0] = ac_build_fract(&ctx->ac, pos[0], 32); + values[1] = ac_build_fract(&ctx->ac, pos[1], 32); + return ac_build_gather_values(&ctx->ac, values, 2); +} + +static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, + enum glsl_interp_mode interp, unsigned location) +{ + switch (interp) { + case INTERP_MODE_FLAT: + default: + return NULL; + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (location == INTERP_CENTER) + return ac_get_arg(&ctx->ac, ctx->args->persp_center); + else if (location == INTERP_CENTROID) + return ctx->abi->persp_centroid; + else if (location == INTERP_SAMPLE) + return ac_get_arg(&ctx->ac, ctx->args->persp_sample); + break; + case INTERP_MODE_NOPERSPECTIVE: + if (location == INTERP_CENTER) + return ac_get_arg(&ctx->ac, ctx->args->linear_center); + else if (location == INTERP_CENTROID) + return ctx->abi->linear_centroid; + else if (location == INTERP_SAMPLE) + return ac_get_arg(&ctx->ac, ctx->args->linear_sample); + break; + } + return NULL; +} + +static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, + unsigned mode, + LLVMValueRef offset) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + LLVMValueRef src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, "")); + LLVMValueRef src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, "")); + + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. + * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); + LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); + LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, + ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, + ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, + interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, + ctx->ac.f32, ""); + + temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el); + temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1); + + ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, + temp2, ctx->ac.i32, ""); + } + interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, + unsigned mode, + LLVMValueRef sample_id) +{ + if (ctx->abi->interp_at_sample_force_center) + return barycentric_center(ctx, mode); + + LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); + + /* fetch sample ID */ + LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id); + + LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, ""); + src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); + LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, ""); + src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); + LLVMValueRef coords[] = { src_c0, src_c1 }; + LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2); + + return barycentric_offset(ctx, mode, offset); +} + + +static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, + unsigned mode) +{ + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); +} + +static LLVMValueRef barycentric_model(struct ac_nir_context *ctx) +{ + return LLVMBuildBitCast(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->pull_model), + ctx->ac.v3i32, ""); +} + +static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, + LLVMValueRef interp_param, + unsigned index, unsigned comp_start, + unsigned num_components, + unsigned bitsize) +{ + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + + interp_param = LLVMBuildBitCast(ctx->ac.builder, + interp_param, ctx->ac.v2f32, ""); + LLVMValueRef i = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); + LLVMValueRef j = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + + LLVMValueRef values[4]; + assert(bitsize == 16 || bitsize == 32); + for (unsigned comp = 0; comp < num_components; comp++) { + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false); + if (bitsize == 16) { + values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number, + ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); + } else { + values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, + ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); + } + } + + return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); +} + +static LLVMValueRef load_input(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1; + + /* We only lower inputs for fragment shaders ATM */ + ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[offset_idx]); + assert(offset); + assert(offset[0].i32 == 0); + + unsigned component = nir_intrinsic_component(instr); + unsigned index = nir_intrinsic_base(instr); + unsigned vertex_id = 2; /* P0 */ + + if (instr->intrinsic == nir_intrinsic_load_input_vertex) { + nir_const_value *src0 = nir_src_as_const_value(instr->src[0]); + + switch (src0[0].i32) { + case 0: + vertex_id = 2; + break; + case 1: + vertex_id = 0; + break; + case 2: + vertex_id = 1; + break; + default: + unreachable("Invalid vertex index"); + } + } + + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + LLVMValueRef values[8]; + + /* Each component of a 64-bit value takes up two GL-level channels. */ + unsigned num_components = instr->dest.ssa.num_components; + unsigned bit_size = instr->dest.ssa.bit_size; + unsigned channels = + bit_size == 64 ? num_components * 2 : num_components; + + for (unsigned chan = 0; chan < channels; chan++) { + if (component + chan > 4) + attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false); + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false); + values[chan] = ac_build_fs_interp_mov(&ctx->ac, + LLVMConstInt(ctx->ac.i32, vertex_id, false), + llvm_chan, + attr_number, + ac_get_arg(&ctx->ac, ctx->args->prim_mask)); + values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); + values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], + bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, ""); + } + + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, channels); + if (bit_size == 64) { + LLVMTypeRef type = num_components == 1 ? ctx->ac.i64 : + LLVMVectorType(ctx->ac.i64, num_components); + result = LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + } + return result; +} + +static void visit_intrinsic(struct ac_nir_context *ctx, + nir_intrinsic_instr *instr) +{ + LLVMValueRef result = NULL; + + switch (instr->intrinsic) { + case nir_intrinsic_ballot: + result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); + if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) + result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); + break; + case nir_intrinsic_read_invocation: + result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1])); + break; + case nir_intrinsic_read_first_invocation: + result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL); + break; + case nir_intrinsic_load_subgroup_invocation: + result = ac_get_thread_id(&ctx->ac); + break; + case nir_intrinsic_load_work_group_id: { + LLVMValueRef values[3]; + + for (int i = 0; i < 3; i++) { + values[i] = ctx->args->workgroup_ids[i].used ? + ac_get_arg(&ctx->ac, ctx->args->workgroup_ids[i]) : ctx->ac.i32_0; + } + + result = ac_build_gather_values(&ctx->ac, values, 3); + break; + } + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + result = ctx->abi->load_base_vertex(ctx->abi); + break; + case nir_intrinsic_load_local_group_size: + result = ctx->abi->load_local_group_size(ctx->abi); + break; + case nir_intrinsic_load_vertex_id: + result = LLVMBuildAdd(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->vertex_id), + ac_get_arg(&ctx->ac, ctx->args->base_vertex), ""); + break; + case nir_intrinsic_load_vertex_id_zero_base: { + result = ctx->abi->vertex_id; + break; + } + case nir_intrinsic_load_local_invocation_id: { + result = ac_get_arg(&ctx->ac, ctx->args->local_invocation_ids); + break; + } + case nir_intrinsic_load_base_instance: + result = ac_get_arg(&ctx->ac, ctx->args->start_instance); + break; + case nir_intrinsic_load_draw_id: + result = ac_get_arg(&ctx->ac, ctx->args->draw_id); + break; + case nir_intrinsic_load_view_index: + result = ac_get_arg(&ctx->ac, ctx->args->view_index); + break; + case nir_intrinsic_load_invocation_id: + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), + 8, 5); + } else { + if (ctx->ac.chip_class >= GFX10) { + result = LLVMBuildAnd(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 127, 0), ""); + } else { + result = ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id); + } + } + break; + case nir_intrinsic_load_primitive_id: + if (ctx->stage == MESA_SHADER_GEOMETRY) { + result = ac_get_arg(&ctx->ac, ctx->args->gs_prim_id); + } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id); + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + result = ac_get_arg(&ctx->ac, ctx->args->tes_patch_id); + } else + fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); + break; + case nir_intrinsic_load_sample_id: + result = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->ancillary), + 8, 4); + break; + case nir_intrinsic_load_sample_pos: + result = load_sample_pos(ctx); + break; + case nir_intrinsic_load_sample_mask_in: + result = ctx->abi->load_sample_mask_in(ctx->abi); + break; + case nir_intrinsic_load_frag_coord: { + LLVMValueRef values[4] = { + ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]), + ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]), + ac_get_arg(&ctx->ac, ctx->args->frag_pos[2]), + ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, + ac_get_arg(&ctx->ac, ctx->args->frag_pos[3])) + }; + result = ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + break; + } + case nir_intrinsic_load_layer_id: + result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; + break; + case nir_intrinsic_load_front_face: + result = ac_get_arg(&ctx->ac, ctx->args->front_face); + break; + case nir_intrinsic_load_helper_invocation: + result = ac_build_load_helper_invocation(&ctx->ac); + break; + case nir_intrinsic_load_color0: + result = ctx->abi->color0; + break; + case nir_intrinsic_load_color1: + result = ctx->abi->color1; + break; + case nir_intrinsic_load_user_data_amd: + assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32); + result = ctx->abi->user_data; + break; + case nir_intrinsic_load_instance_id: + result = ctx->abi->instance_id; + break; + case nir_intrinsic_load_num_work_groups: + result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); + break; + case nir_intrinsic_load_local_invocation_index: + result = visit_load_local_invocation_index(ctx); + break; + case nir_intrinsic_load_subgroup_id: + result = visit_load_subgroup_id(ctx); + break; + case nir_intrinsic_load_num_subgroups: + result = visit_load_num_subgroups(ctx); + break; + case nir_intrinsic_first_invocation: + result = visit_first_invocation(ctx); + break; + case nir_intrinsic_load_push_constant: + result = visit_load_push_constant(ctx, instr); + break; + case nir_intrinsic_vulkan_resource_index: { + LLVMValueRef index = get_src(ctx, instr->src[0]); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + + result = ctx->abi->load_resource(ctx->abi, index, desc_set, + binding); + break; + } + case nir_intrinsic_vulkan_resource_reindex: + result = visit_vulkan_resource_reindex(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + result = visit_load_buffer(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + result = visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ubo: + result = visit_load_ubo_buffer(ctx, instr); + break; + case nir_intrinsic_get_buffer_size: + result = visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_load_deref: + result = visit_load_var(ctx, instr); + break; + case nir_intrinsic_store_deref: + visit_store_var(ctx, instr); + break; + case nir_intrinsic_load_shared: + result = visit_load_shared(ctx, instr); + break; + case nir_intrinsic_store_shared: + visit_store_shared(ctx, instr); + break; + case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_image_deref_samples: + result = visit_image_samples(ctx, instr); + break; + case nir_intrinsic_bindless_image_load: + result = visit_image_load(ctx, instr, true); + break; + case nir_intrinsic_image_deref_load: + result = visit_image_load(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_store: + visit_image_store(ctx, instr, true); + break; + case nir_intrinsic_image_deref_store: + visit_image_store(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_bindless_image_atomic_dec_wrap: + result = visit_image_atomic(ctx, instr, true); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + result = visit_image_atomic(ctx, instr, false); + break; + case nir_intrinsic_bindless_image_size: + result = visit_image_size(ctx, instr, true); + break; + case nir_intrinsic_image_deref_size: + result = visit_image_size(ctx, instr, false); + break; + case nir_intrinsic_shader_clock: + result = ac_build_shader_clock(&ctx->ac); + break; + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + emit_discard(ctx, instr); + break; + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_membar(&ctx->ac, instr); + break; + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_control_barrier: + ac_emit_barrier(&ctx->ac, ctx->stage); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: { + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], + instr->src[1].ssa->bit_size); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: { + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_load_barycentric_pixel: + result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_centroid: + result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_sample: + result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_model: + result = barycentric_model(ctx); + break; + case nir_intrinsic_load_barycentric_at_offset: { + LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); + result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset); + break; + } + case nir_intrinsic_load_barycentric_at_sample: { + LLVMValueRef sample_id = get_src(ctx, instr->src[0]); + result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id); + break; + } + case nir_intrinsic_load_interpolated_input: { + /* We assume any indirect loads have been lowered away */ + ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]); + assert(offset); + assert(offset[0].i32 == 0); + + LLVMValueRef interp_param = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + result = load_interpolated_input(ctx, interp_param, index, + component, + instr->dest.ssa.num_components, + instr->dest.ssa.bit_size); + break; + } + case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: + result = load_input(ctx, instr); + break; + case nir_intrinsic_emit_vertex: + ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs); + break; + case nir_intrinsic_end_primitive: + ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); + break; + case nir_intrinsic_load_tess_coord: + result = ctx->abi->load_tess_coord(ctx->abi); + break; + case nir_intrinsic_load_tess_level_outer: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); + break; + case nir_intrinsic_load_tess_level_inner: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false); + break; + case nir_intrinsic_load_tess_level_outer_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true); + break; + case nir_intrinsic_load_tess_level_inner_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true); + break; + case nir_intrinsic_load_patch_vertices_in: + result = ctx->abi->load_patch_vertices_in(ctx->abi); + break; + case nir_intrinsic_vote_all: { + LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_vote_any: { + LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_shuffle: + if (ctx->ac.chip_class == GFX8 || + ctx->ac.chip_class == GFX9 || + (ctx->ac.chip_class == GFX10 && ctx->ac.wave_size == 32)) { + result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1])); + } else { + LLVMValueRef src = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[1]); + LLVMTypeRef type = LLVMTypeOf(src); + struct waterfall_context wctx; + LLVMValueRef index_val; + + index_val = enter_waterfall(ctx, &wctx, index, true); + + src = LLVMBuildZExt(ctx->ac.builder, src, + ctx->ac.i32, ""); + + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane", + ctx->ac.i32, + (LLVMValueRef []) { src, index_val }, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + + result = LLVMBuildTrunc(ctx->ac.builder, result, type, ""); + + result = exit_waterfall(ctx, &wctx, result); + } + break; + case nir_intrinsic_reduce: + result = ac_build_reduce(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0], + instr->const_index[1]); + break; + case nir_intrinsic_inclusive_scan: + result = ac_build_inclusive_scan(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0]); + break; + case nir_intrinsic_exclusive_scan: + result = ac_build_exclusive_scan(&ctx->ac, + get_src(ctx, instr->src[0]), + instr->const_index[0]); + break; + case nir_intrinsic_quad_broadcast: { + unsigned lane = nir_src_as_uint(instr->src[1]); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), + lane, lane, lane, lane); + break; + } + case nir_intrinsic_quad_swap_horizontal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3 ,2); + break; + case nir_intrinsic_quad_swap_vertical: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0 ,1); + break; + case nir_intrinsic_quad_swap_diagonal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1 ,0); + break; + case nir_intrinsic_quad_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), + mask & 0x3, (mask >> 2) & 0x3, + (mask >> 4) & 0x3, (mask >> 6) & 0x3); + break; + } + case nir_intrinsic_masked_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask); + break; + } + case nir_intrinsic_write_invocation_amd: + result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2])); + break; + case nir_intrinsic_mbcnt_amd: + result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0])); + break; + case nir_intrinsic_load_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[0]); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = + instr->dest.ssa.num_components == 1 ? comp_type : + LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + case nir_intrinsic_store_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(comp_type, addr_space), ""); + LLVMValueRef src = get_src(ctx, instr->src[0]); + unsigned wrmask = nir_intrinsic_write_mask(instr); + while (wrmask) { + int start, count; + u_bit_scan_consecutive_range(&wrmask, &start, &count); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false); + LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, ""); + LLVMTypeRef vec_type = + count == 1 ? comp_type : LLVMVectorType(comp_type, count); + offset_ptr = LLVMBuildBitCast(ctx->ac.builder, + offset_ptr, + LLVMPointerType(vec_type, addr_space), + ""); + LLVMValueRef offset_src = + ac_extract_components(&ctx->ac, src, start, count); + LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr); + } + break; + } + case nir_intrinsic_load_constant: { + unsigned base = nir_intrinsic_base(instr); + unsigned range = nir_intrinsic_range(instr); + + LLVMValueRef offset = get_src(ctx, instr->src[0]); + offset = LLVMBuildAdd(ctx->ac.builder, offset, + LLVMConstInt(ctx->ac.i32, base, false), ""); + + /* Clamp the offset to avoid out-of-bound access because global + * instructions can't handle them. + */ + LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false); + LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + offset, size, ""); + offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, + offset); + LLVMTypeRef comp_type = + LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = + instr->dest.ssa.num_components == 1 ? comp_type : + LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, + LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } + if (result) { + ctx->ssa_defs[instr->dest.ssa.index] = result; + } +} + +static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, + unsigned base_index, + unsigned constant_index, + LLVMValueRef dynamic_index) +{ + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0); + LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, + LLVMConstInt(ctx->ac.i32, constant_index, 0), ""); + + /* Bindless uniforms are 64bit so multiple index by 8 */ + index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); + + LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0); + + LLVMValueRef ret = ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, + NULL, 0, 0, true, true); + + return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, ""); +} + +struct sampler_desc_address { + unsigned descriptor_set; + unsigned base_index; /* binding in vulkan */ + unsigned constant_index; + LLVMValueRef dynamic_index; + bool image; + bool bindless; +}; + +static struct sampler_desc_address +get_sampler_desc_internal(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + const nir_instr *instr, + bool image) +{ + LLVMValueRef index = NULL; + unsigned constant_index = 0; + unsigned descriptor_set; + unsigned base_index; + bool bindless = false; + + if (!deref_instr) { + descriptor_set = 0; + if (image) { + nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr); + base_index = 0; + bindless = true; + index = get_src(ctx, img_instr->src[0]); + } else { + nir_tex_instr *tex_instr = nir_instr_as_tex(instr); + int sampSrcIdx = nir_tex_instr_src_index(tex_instr, + nir_tex_src_sampler_handle); + if (sampSrcIdx != -1) { + base_index = 0; + bindless = true; + index = get_src(ctx, tex_instr->src[sampSrcIdx].src); + } else { + assert(tex_instr && !image); + base_index = tex_instr->sampler_index; + } + } + } else { + while(deref_instr->deref_type != nir_deref_type_var) { + if (deref_instr->deref_type == nir_deref_type_array) { + unsigned array_size = glsl_get_aoa_size(deref_instr->type); + if (!array_size) + array_size = 1; + + if (nir_src_is_const(deref_instr->arr.index)) { + constant_index += array_size * nir_src_as_uint(deref_instr->arr.index); + } else { + LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index); + + indirect = LLVMBuildMul(ctx->ac.builder, indirect, + LLVMConstInt(ctx->ac.i32, array_size, false), ""); + + if (!index) + index = indirect; + else + index = LLVMBuildAdd(ctx->ac.builder, index, indirect, ""); + } + + deref_instr = nir_src_as_deref(deref_instr->parent); + } else if (deref_instr->deref_type == nir_deref_type_struct) { + unsigned sidx = deref_instr->strct.index; + deref_instr = nir_src_as_deref(deref_instr->parent); + constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx); + } else { + unreachable("Unsupported deref type"); + } + } + descriptor_set = deref_instr->var->data.descriptor_set; + + if (deref_instr->var->data.bindless) { + /* For now just assert on unhandled variable types */ + assert(deref_instr->var->data.mode == nir_var_uniform); + + base_index = deref_instr->var->data.driver_location; + bindless = true; + + index = index ? index : ctx->ac.i32_0; + index = get_bindless_index_from_uniform(ctx, base_index, + constant_index, index); + } else + base_index = deref_instr->var->data.binding; + } + return (struct sampler_desc_address) { + .descriptor_set = descriptor_set, + .base_index = base_index, + .constant_index = constant_index, + .dynamic_index = index, + .image = image, + .bindless = bindless, + }; +} + +/* Extract any possibly divergent index into a separate value that can be fed + * into get_sampler_desc with the same arguments. */ +static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + const nir_instr *instr, + bool image) +{ + struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); + return addr.dynamic_index; +} + +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, + const nir_instr *instr, + LLVMValueRef index, + bool image, bool write) +{ + struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); + return ctx->abi->load_sampler_desc(ctx->abi, + addr.descriptor_set, + addr.base_index, + addr.constant_index, index, + desc_type, addr.image, write, addr.bindless); +} + +/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. + * + * GFX6-GFX7: + * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic + * filtering manually. The driver sets img7 to a mask clearing + * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: + * s_and_b32 samp0, samp0, img7 + * + * GFX8: + * The ANISO_OVERRIDE sampler field enables this fix in TA. + */ +static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, + LLVMValueRef res, LLVMValueRef samp) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef img7, samp0; + + if (ctx->ac.chip_class >= GFX8) + return samp; + + img7 = LLVMBuildExtractElement(builder, res, + LLVMConstInt(ctx->ac.i32, 7, 0), ""); + samp0 = LLVMBuildExtractElement(builder, samp, + LLVMConstInt(ctx->ac.i32, 0, 0), ""); + samp0 = LLVMBuildAnd(builder, samp0, img7, ""); + return LLVMBuildInsertElement(builder, samp, samp0, + LLVMConstInt(ctx->ac.i32, 0, 0), ""); +} + +static void tex_fetch_ptrs(struct ac_nir_context *ctx, + nir_tex_instr *instr, + struct waterfall_context *wctx, + LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, + LLVMValueRef *fmask_ptr) +{ + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + int plane = -1; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_plane: + plane = nir_src_as_int(instr->src[i].src); + break; + default: + break; + } + } + + LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr, + &instr->instr, false); + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr, + &instr->instr, false); + if (instr->texture_non_uniform) + texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true); + + if (instr->sampler_non_uniform) + sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true); + + enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + + if (plane >= 0) { + assert(instr->op != nir_texop_txf_ms && + instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + + main_descriptor = AC_DESC_PLANE_0 + plane; + } + + if (instr->op == nir_texop_fragment_mask_fetch) { + /* The fragment mask is fetched from the compressed + * multisampled surface. + */ + main_descriptor = AC_DESC_FMASK; + } + + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, + texture_dynamic_index, false, false); + + if (samp_ptr) { + *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, + sampler_dynamic_index, false, false); + if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT) + *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); + } + if (fmask_ptr && (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) + *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, + &instr->instr, texture_dynamic_index, false, false); +} + +static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, + LLVMValueRef coord) +{ + coord = ac_to_float(ctx, coord); + coord = ac_build_round(ctx, coord); + coord = ac_to_integer(ctx, coord); + return coord; +} + +static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) +{ + LLVMValueRef result = NULL; + struct ac_image_args args = { 0 }; + LLVMValueRef fmask_ptr = NULL, sample_index = NULL; + LLVMValueRef ddx = NULL, ddy = NULL; + unsigned offset_src = 0; + struct waterfall_context wctx[2] = {{{0}}}; + + tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + LLVMValueRef coord = get_src(ctx, instr->src[i].src); + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); + break; + } + case nir_tex_src_projector: + break; + case nir_tex_src_comparator: + if (instr->is_shadow) { + args.compare = get_src(ctx, instr->src[i].src); + args.compare = ac_to_float(&ctx->ac, args.compare); + } + break; + case nir_tex_src_offset: + args.offset = get_src(ctx, instr->src[i].src); + offset_src = i; + break; + case nir_tex_src_bias: + if (instr->op == nir_texop_txb) + args.bias = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_lod: { + if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) + args.level_zero = true; + else + args.lod = get_src(ctx, instr->src[i].src); + break; + } + case nir_tex_src_ms_index: + sample_index = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ms_mcs: + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_texture_offset: + case nir_tex_src_sampler_offset: + case nir_tex_src_plane: + default: + break; + } + } + + if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + result = get_buffer_size(ctx, args.resource, true); + goto write_result; + } + + if (instr->op == nir_texop_texture_samples) { + LLVMValueRef res, samples, is_msaa; + res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); + samples = LLVMBuildExtractElement(ctx->ac.builder, res, + LLVMConstInt(ctx->ac.i32, 3, false), ""); + is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 28, false), ""); + is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, + LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa, + LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + + samples = LLVMBuildLShr(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 16, false), ""); + samples = LLVMBuildAnd(ctx->ac.builder, samples, + LLVMConstInt(ctx->ac.i32, 0xf, false), ""); + samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, + samples, ""); + samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, + ctx->ac.i32_1, ""); + result = samples; + goto write_result; + } + + if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + LLVMValueRef offset[3], pack; + for (unsigned chan = 0; chan < 3; ++chan) + offset[chan] = ctx->ac.i32_0; + + unsigned num_components = ac_get_llvm_num_components(args.offset); + for (unsigned chan = 0; chan < num_components; chan++) { + offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan); + offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], + LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + if (chan) + offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], + LLVMConstInt(ctx->ac.i32, chan * 8, false), ""); + } + pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); + pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); + args.offset = pack; + } + + /* Section 8.23.1 (Depth Texture Comparison Mode) of the + * OpenGL 4.5 spec says: + * + * "If the texture’s internal format indicates a fixed-point + * depth texture, then D_t and D_ref are clamped to the + * range [0, 1]; otherwise no clamping is performed." + * + * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, + * so the depth comparison value isn't clamped for Z16 and + * Z24 anymore. Do it manually here for GFX8-9; GFX10 has + * an explicitly clamped 32-bit float format. + */ + if (args.compare && + ctx->ac.chip_class >= GFX8 && + ctx->ac.chip_class <= GFX9 && + ctx->abi->clamp_shadow_reference) { + LLVMValueRef upgraded, clamped; + + upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, + LLVMConstInt(ctx->ac.i32, 3, false), ""); + upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, + LLVMConstInt(ctx->ac.i32, 29, false), ""); + upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, ""); + clamped = ac_build_clamp(&ctx->ac, args.compare); + args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, + args.compare, ""); + } + + /* pack derivatives */ + if (ddx || ddy) { + int num_src_deriv_channels, num_dest_deriv_channels; + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + num_src_deriv_channels = 3; + num_dest_deriv_channels = 3; + break; + case GLSL_SAMPLER_DIM_2D: + default: + num_src_deriv_channels = 2; + num_dest_deriv_channels = 2; + break; + case GLSL_SAMPLER_DIM_1D: + num_src_deriv_channels = 1; + if (ctx->ac.chip_class == GFX9) { + num_dest_deriv_channels = 2; + } else { + num_dest_deriv_channels = 1; + } + break; + } + + for (unsigned i = 0; i < num_src_deriv_channels; i++) { + args.derivs[i] = ac_to_float(&ctx->ac, + ac_llvm_extract_elem(&ctx->ac, ddx, i)); + args.derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, + ac_llvm_extract_elem(&ctx->ac, ddy, i)); + } + for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { + args.derivs[i] = ctx->ac.f32_0; + args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; + } + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) { + for (unsigned chan = 0; chan < instr->coord_components; chan++) + args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); + if (instr->coord_components == 3) + args.coords[3] = LLVMGetUndef(ctx->ac.f32); + ac_prepare_cube_coords(&ctx->ac, + instr->op == nir_texop_txd, instr->is_array, + instr->op == nir_texop_lod, args.coords, args.derivs); + } + + /* Texture coordinates fixups */ + if (instr->coord_components > 1 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && + instr->op != nir_texop_txf) { + args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]); + } + + if (instr->coord_components > 2 && + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && + instr->op != nir_texop_txf && + instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { + args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); + } + + if (ctx->ac.chip_class == GFX9 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->op != nir_texop_lod) { + LLVMValueRef filler; + if (instr->op == nir_texop_txf) + filler = ctx->ac.i32_0; + else + filler = LLVMConstReal(ctx->ac.f32, 0.5); + + if (instr->is_array) + args.coords[2] = args.coords[1]; + args.coords[1] = filler; + } + + /* Pack sample index */ + if (sample_index && (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_fragment_fetch)) + args.coords[instr->coord_components] = sample_index; + + if (instr->op == nir_texop_samples_identical) { + struct ac_image_args txf_args = { 0 }; + memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords)); + + txf_args.dmask = 0xf; + txf_args.resource = fmask_ptr; + txf_args.dim = instr->is_array ? ac_image_2darray : ac_image_2d; + result = build_tex_intrinsic(ctx, instr, &txf_args); + + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0); + goto write_result; + } + + if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && + instr->op != nir_texop_txs && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { + unsigned sample_chan = instr->is_array ? 3 : 2; + args.coords[sample_chan] = adjust_sample_index_using_fmask( + &ctx->ac, args.coords[0], args.coords[1], + instr->is_array ? args.coords[2] : NULL, + args.coords[sample_chan], fmask_ptr); + } + + if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + int num_offsets = instr->src[offset_src].src.ssa->num_components; + num_offsets = MIN2(num_offsets, instr->coord_components); + for (unsigned i = 0; i < num_offsets; ++i) { + args.coords[i] = LLVMBuildAdd( + ctx->ac.builder, args.coords[i], + LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false), ""); + } + args.offset = NULL; + } + + /* DMASK was repurposed for GATHER4. 4 components are always + * returned and DMASK works like a swizzle - it selects + * the component to fetch. The only valid DMASK values are + * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + * (red,red,red,red) etc.) The ISA document doesn't mention + * this. + */ + args.dmask = 0xf; + if (instr->op == nir_texop_tg4) { + if (instr->is_shadow) + args.dmask = 1; + else + args.dmask = 1 << instr->component; + } + + if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) { + args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array); + args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT; + } + + /* Adjust the number of coordinates because we only need (x,y) for 2D + * multisampled images and (x,y,layer) for 2D multisampled layered + * images or for multisampled input attachments. + */ + if (instr->op == nir_texop_fragment_mask_fetch) { + if (args.dim == ac_image_2dmsaa) { + args.dim = ac_image_2d; + } else { + assert(args.dim == ac_image_2darraymsaa); + args.dim = ac_image_2darray; + } + } + + result = build_tex_intrinsic(ctx, instr, &args); + + if (instr->op == nir_texop_query_levels) + result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), ""); + else if (instr->is_shadow && instr->is_new_style_shadow && + instr->op != nir_texop_txs && instr->op != nir_texop_lod && + instr->op != nir_texop_tg4) + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + else if (instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, ""); + } else if (ctx->ac.chip_class == GFX9 && + instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, + ctx->ac.i32_1, ""); + } else if (instr->dest.ssa.num_components != 4) + result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); + +write_result: + if (result) { + assert(instr->dest.is_ssa); + result = ac_to_integer(&ctx->ac, result); + + for (int i = ARRAY_SIZE(wctx); --i >= 0;) { + result = exit_waterfall(ctx, wctx + i, result); + } + + ctx->ssa_defs[instr->dest.ssa.index] = result; + } +} + +static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr) +{ + LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); + LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, ""); + + ctx->ssa_defs[instr->dest.ssa.index] = result; + _mesa_hash_table_insert(ctx->phis, instr, result); +} + +static void visit_post_phi(struct ac_nir_context *ctx, + nir_phi_instr *instr, + LLVMValueRef llvm_phi) +{ + nir_foreach_phi_src(src, instr) { + LLVMBasicBlockRef block = get_block(ctx, src->pred); + LLVMValueRef llvm_src = get_src(ctx, src->src); + + LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); + } +} + +static void phi_post_pass(struct ac_nir_context *ctx) +{ + hash_table_foreach(ctx->phis, entry) { + visit_post_phi(ctx, (nir_phi_instr*)entry->key, + (LLVMValueRef)entry->data); + } +} + + +static void visit_ssa_undef(struct ac_nir_context *ctx, + const nir_ssa_undef_instr *instr) +{ + unsigned num_components = instr->def.num_components; + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + LLVMValueRef undef; + + if (num_components == 1) + undef = LLVMGetUndef(type); + else { + undef = LLVMGetUndef(LLVMVectorType(type, num_components)); + } + ctx->ssa_defs[instr->def.index] = undef; +} + +static void visit_jump(struct ac_llvm_context *ctx, + const nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + ac_build_break(ctx); + break; + case nir_jump_continue: + ac_build_continue(ctx); + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static LLVMTypeRef +glsl_base_to_llvm_type(struct ac_llvm_context *ac, + enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } +} + +static LLVMTypeRef +glsl_to_llvm_type(struct ac_llvm_context *ac, + const struct glsl_type *type) +{ + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType( + glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct_or_ifc(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = + glsl_to_llvm_type(ac, + glsl_get_struct_field(type, i)); + } + + return LLVMStructTypeInContext(ac->context, member_types, + glsl_get_length(type), false); +} + +static void visit_deref(struct ac_nir_context *ctx, + nir_deref_instr *instr) +{ + if (instr->mode != nir_var_mem_shared && + instr->mode != nir_var_mem_global) + return; + + LLVMValueRef result = NULL; + switch(instr->deref_type) { + case nir_deref_type_var: { + struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var); + result = entry->data; + break; + } + case nir_deref_type_struct: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + uint64_t offset = glsl_get_struct_field_offset(parent->type, + instr->strct.index); + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, offset, 0)); + } else { + result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, instr->strct.index, 0)); + } + break; + case nir_deref_type_array: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + unsigned stride = glsl_get_explicit_stride(parent->type); + + if ((glsl_type_is_matrix(parent->type) && + glsl_matrix_type_is_row_major(parent->type)) || + (glsl_type_is_vector(parent->type) && stride == 0)) + stride = type_scalar_size_bytes(parent->type); + + assert(stride > 0); + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_ptr_as_array: + if (instr->mode == nir_var_mem_global) { + unsigned stride = nir_deref_instr_ptr_as_array_stride(instr); + + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_cast: { + result = get_src(ctx, instr->parent); + + /* We can't use the structs from LLVM because the shader + * specifies its own offsets. */ + LLVMTypeRef pointee_type = ctx->ac.i8; + if (instr->mode == nir_var_mem_shared) + pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + + unsigned address_space; + + switch(instr->mode) { + case nir_var_mem_shared: + address_space = AC_ADDR_SPACE_LDS; + break; + case nir_var_mem_global: + address_space = AC_ADDR_SPACE_GLOBAL; + break; + default: + unreachable("Unhandled address space"); + } + + LLVMTypeRef type = LLVMPointerType(pointee_type, address_space); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, + type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, + type, ""); + } + } + break; + } + default: + unreachable("Unhandled deref_instr deref type"); + } + + ctx->ssa_defs[instr->dest.ssa.index] = result; +} + +static void visit_cf_list(struct ac_nir_context *ctx, + struct exec_list *list); + +static void visit_block(struct ac_nir_context *ctx, nir_block *block) +{ + nir_foreach_instr(instr, block) + { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_jump: + visit_jump(&ctx->ac, nir_instr_as_jump(instr)); + break; + case nir_instr_type_deref: + visit_deref(ctx, nir_instr_as_deref(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } + + _mesa_hash_table_insert(ctx->defs, block, + LLVMGetInsertBlock(ctx->ac.builder)); +} + +static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt) +{ + LLVMValueRef value = get_src(ctx, if_stmt->condition); + + nir_block *then_block = + (nir_block *) exec_list_get_head(&if_stmt->then_list); + + ac_build_uif(&ctx->ac, value, then_block->index); + + visit_cf_list(ctx, &if_stmt->then_list); + + if (!exec_list_is_empty(&if_stmt->else_list)) { + nir_block *else_block = + (nir_block *) exec_list_get_head(&if_stmt->else_list); + + ac_build_else(&ctx->ac, else_block->index); + visit_cf_list(ctx, &if_stmt->else_list); + } + + ac_build_endif(&ctx->ac, then_block->index); +} + +static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop) +{ + nir_block *first_loop_block = + (nir_block *) exec_list_get_head(&loop->body); + + ac_build_bgnloop(&ctx->ac, first_loop_block->index); + + visit_cf_list(ctx, &loop->body); + + ac_build_endloop(&ctx->ac, first_loop_block->index); +} + +static void visit_cf_list(struct ac_nir_context *ctx, + struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) + { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + visit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + default: + assert(0); + } + } +} + +void +ac_handle_shader_output_decl(struct ac_llvm_context *ctx, + struct ac_shader_abi *abi, + struct nir_shader *nir, + struct nir_variable *variable, + gl_shader_stage stage) +{ + unsigned output_loc = variable->data.driver_location / 4; + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + + /* tess ctrl has it's own load/store paths for outputs */ + if (stage == MESA_SHADER_TESS_CTRL) + return; + + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY) { + int idx = variable->data.location + variable->data.index; + if (idx == VARYING_SLOT_CLIP_DIST0) { + int length = nir->info.clip_distance_array_size + + nir->info.cull_distance_array_size; + + if (length > 4) + attrib_count = 2; + else + attrib_count = 1; + } + } + + bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); + LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32; + for (unsigned i = 0; i < attrib_count; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] = + ac_build_alloca_undef(ctx, type, ""); + } + } +} + +static void +setup_locals(struct ac_nir_context *ctx, + struct nir_function *func) +{ + int i, j; + ctx->num_locals = 0; + nir_foreach_variable(variable, &func->impl->locals) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); + variable->data.driver_location = ctx->num_locals * 4; + variable->data.location_frac = 0; + ctx->num_locals += attrib_count; + } + ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef)); + if (!ctx->locals) + return; + + for (i = 0; i < ctx->num_locals; i++) { + for (j = 0; j < 4; j++) { + ctx->locals[i * 4 + j] = + ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp"); + } + } +} + +static void +setup_scratch(struct ac_nir_context *ctx, + struct nir_shader *shader) +{ + if (shader->scratch_size == 0) + return; + + ctx->scratch = ac_build_alloca_undef(&ctx->ac, + LLVMArrayType(ctx->ac.i8, shader->scratch_size), + "scratch"); +} + +static void +setup_constant_data(struct ac_nir_context *ctx, + struct nir_shader *shader) +{ + if (!shader->constant_data) + return; + + LLVMValueRef data = + LLVMConstStringInContext(ctx->ac.context, + shader->constant_data, + shader->constant_data_size, + true); + LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size); + + /* We want to put the constant data in the CONST address space so that + * we can use scalar loads. However, LLVM versions before 10 put these + * variables in the same section as the code, which is unacceptable + * for RadeonSI as it needs to relocate all the data sections after + * the code sections. See https://reviews.llvm.org/D65813. + */ + unsigned address_space = + LLVM_VERSION_MAJOR < 10 ? AC_ADDR_SPACE_GLOBAL : AC_ADDR_SPACE_CONST; + + LLVMValueRef global = + LLVMAddGlobalInAddressSpace(ctx->ac.module, type, + "const_data", + address_space); + + LLVMSetInitializer(global, data); + LLVMSetGlobalConstant(global, true); + LLVMSetVisibility(global, LLVMHiddenVisibility); + ctx->constant_data = global; +} + +static void +setup_shared(struct ac_nir_context *ctx, + struct nir_shader *nir) +{ + if (ctx->ac.lds) + return; + + LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, + nir->info.cs.shared_size); + + LLVMValueRef lds = + LLVMAddGlobalInAddressSpace(ctx->ac.module, type, + "compute_lds", + AC_ADDR_SPACE_LDS); + LLVMSetAlignment(lds, 64 * 1024); + + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, lds, + LLVMPointerType(ctx->ac.i8, + AC_ADDR_SPACE_LDS), ""); +} + +void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, + const struct ac_shader_args *args, struct nir_shader *nir) +{ + struct ac_nir_context ctx = {}; + struct nir_function *func; + + ctx.ac = *ac; + ctx.abi = abi; + ctx.args = args; + + ctx.stage = nir->info.stage; + ctx.info = &nir->info; + + ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + + nir_foreach_variable(variable, &nir->outputs) + ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, + ctx.stage); + + ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + func = (struct nir_function *)exec_list_get_head(&nir->functions); + + nir_index_ssa_defs(func->impl); + ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); + + setup_locals(&ctx, func); + setup_scratch(&ctx, nir); + setup_constant_data(&ctx, nir); + + if (gl_shader_stage_is_compute(nir->info.stage)) + setup_shared(&ctx, nir); + + visit_cf_list(&ctx, &func->impl->body); + phi_post_pass(&ctx); + + if (!gl_shader_stage_is_compute(nir->info.stage)) + ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, + ctx.abi->outputs); + + free(ctx.locals); + free(ctx.ssa_defs); + ralloc_free(ctx.defs); + ralloc_free(ctx.phis); + ralloc_free(ctx.vars); +} + +bool +ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class) +{ + bool progress = false; + + /* Lower large variables to scratch first so that we won't bloat the + * shader by generating large if ladders for them. We later lower + * scratch to alloca's, assuming LLVM won't generate VGPR indexing. + */ + NIR_PASS(progress, nir, nir_lower_vars_to_scratch, + nir_var_function_temp, + 256, + glsl_get_natural_size_align_bytes); + + /* While it would be nice not to have this flag, we are constrained + * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. + */ + bool llvm_has_working_vgpr_indexing = chip_class != GFX9; + + /* TODO: Indirect indexing of GS inputs is unimplemented. + * + * TCS and TES load inputs directly from LDS or offchip memory, so + * indirect indexing is trivial. + */ + nir_variable_mode indirect_mask = 0; + if (nir->info.stage == MESA_SHADER_GEOMETRY || + (nir->info.stage != MESA_SHADER_TESS_CTRL && + nir->info.stage != MESA_SHADER_TESS_EVAL && + !llvm_has_working_vgpr_indexing)) { + indirect_mask |= nir_var_shader_in; + } + if (!llvm_has_working_vgpr_indexing && + nir->info.stage != MESA_SHADER_TESS_CTRL) + indirect_mask |= nir_var_shader_out; + + /* TODO: We shouldn't need to do this, however LLVM isn't currently + * smart enough to handle indirects without causing excess spilling + * causing the gpu to hang. + * + * See the following thread for more details of the problem: + * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html + */ + indirect_mask |= nir_var_function_temp; + + progress |= nir_lower_indirect_derefs(nir, indirect_mask); + return progress; +} + +static unsigned +get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin) +{ + if (intrin->intrinsic != nir_intrinsic_store_deref) + return 0; + + nir_variable *var = + nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0])); + + if (var->data.mode != nir_var_shader_out) + return 0; + + unsigned writemask = 0; + const int location = var->data.location; + unsigned first_component = var->data.location_frac; + unsigned num_comps = intrin->dest.ssa.num_components; + + if (location == VARYING_SLOT_TESS_LEVEL_INNER) + writemask = ((1 << (num_comps + 1)) - 1) << first_component; + else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) + writemask = (((1 << (num_comps + 1)) - 1) << first_component) << 4; + + return writemask; +} + +static void +scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask, + unsigned *cond_block_tf_writemask, + bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf) +{ + switch (cf_node->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(cf_node); + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_control_barrier) { + + /* If we find a barrier in nested control flow put this in the + * too hard basket. In GLSL this is not possible but it is in + * SPIR-V. + */ + if (is_nested_cf) { + *tessfactors_are_def_in_all_invocs = false; + return; + } + + /* The following case must be prevented: + * gl_TessLevelInner = ...; + * barrier(); + * if (gl_InvocationID == 1) + * gl_TessLevelInner = ...; + * + * If you consider disjoint code segments separated by barriers, each + * such segment that writes tess factor channels should write the same + * channels in all codepaths within that segment. + */ + if (upper_block_tf_writemask || cond_block_tf_writemask) { + /* Accumulate the result: */ + *tessfactors_are_def_in_all_invocs &= + !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask)); + + /* Analyze the next code segment from scratch. */ + *upper_block_tf_writemask = 0; + *cond_block_tf_writemask = 0; + } + } else + *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin); + } + + break; + } + case nir_cf_node_if: { + unsigned then_tessfactor_writemask = 0; + unsigned else_tessfactor_writemask = 0; + + nir_if *if_stmt = nir_cf_node_as_if(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) { + scan_tess_ctrl(nested_node, &then_tessfactor_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) { + scan_tess_ctrl(nested_node, &else_tessfactor_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + if (then_tessfactor_writemask || else_tessfactor_writemask) { + /* If both statements write the same tess factor channels, + * we can say that the upper block writes them too. + */ + *upper_block_tf_writemask |= then_tessfactor_writemask & + else_tessfactor_writemask; + *cond_block_tf_writemask |= then_tessfactor_writemask | + else_tessfactor_writemask; + } + + break; + } + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) { + scan_tess_ctrl(nested_node, cond_block_tf_writemask, + cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + break; + } + default: + unreachable("unknown cf node type"); + } +} + +bool +ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir) +{ + assert(nir->info.stage == MESA_SHADER_TESS_CTRL); + + /* The pass works as follows: + * If all codepaths write tess factors, we can say that all + * invocations define tess factors. + * + * Each tess factor channel is tracked separately. + */ + unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */ + unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */ + + /* Initial value = true. Here the pass will accumulate results from + * multiple segments surrounded by barriers. If tess factors aren't + * written at all, it's a shader bug and we don't care if this will be + * true. + */ + bool tessfactors_are_def_in_all_invocs = true; + + nir_foreach_function(function, nir) { + if (function->impl) { + foreach_list_typed(nir_cf_node, node, node, &function->impl->body) { + scan_tess_ctrl(node, &main_block_tf_writemask, + &cond_block_tf_writemask, + &tessfactors_are_def_in_all_invocs, + false); + } + } + } + + /* Accumulate the result for the last code segment separated by a + * barrier. + */ + if (main_block_tf_writemask || cond_block_tf_writemask) { + tessfactors_are_def_in_all_invocs &= + !(cond_block_tf_writemask & ~main_block_tf_writemask); + } + + return tessfactors_are_def_in_all_invocs; +} diff -Nru mesa-19.2.8/src/amd/llvm/ac_nir_to_llvm.h mesa-20.0.8/src/amd/llvm/ac_nir_to_llvm.h --- mesa-19.2.8/src/amd/llvm/ac_nir_to_llvm.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_nir_to_llvm.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,65 @@ +/* + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef AC_NIR_TO_LLVM_H +#define AC_NIR_TO_LLVM_H + +#include +#include "llvm-c/Core.h" +#include "llvm-c/TargetMachine.h" +#include "amd_family.h" +#include "compiler/shader_enums.h" + +struct nir_shader; +struct nir_variable; +struct ac_llvm_context; +struct ac_shader_abi; +struct ac_shader_args; + +/* Interpolation locations */ +#define INTERP_CENTER 0 +#define INTERP_CENTROID 1 +#define INTERP_SAMPLE 2 + +static inline unsigned ac_llvm_reg_index_soa(unsigned index, unsigned chan) +{ + return (index * 4) + chan; +} + +bool ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class); + +bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir); + +void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, + const struct ac_shader_args *args, struct nir_shader *nir); + +void +ac_handle_shader_output_decl(struct ac_llvm_context *ctx, + struct ac_shader_abi *abi, + struct nir_shader *nir, + struct nir_variable *variable, + gl_shader_stage stage); + +void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage); + +#endif /* AC_NIR_TO_LLVM_H */ diff -Nru mesa-19.2.8/src/amd/llvm/ac_shader_abi.h mesa-20.0.8/src/amd/llvm/ac_shader_abi.h --- mesa-19.2.8/src/amd/llvm/ac_shader_abi.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/ac_shader_abi.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,188 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef AC_SHADER_ABI_H +#define AC_SHADER_ABI_H + +#include +#include +#include "ac_shader_args.h" + +#include "compiler/shader_enums.h" + +struct nir_variable; + +#define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) + +#define AC_MAX_INLINE_PUSH_CONSTS 8 + +enum ac_descriptor_type { + AC_DESC_IMAGE, + AC_DESC_FMASK, + AC_DESC_SAMPLER, + AC_DESC_BUFFER, + AC_DESC_PLANE_0, + AC_DESC_PLANE_1, + AC_DESC_PLANE_2, +}; + +/* Document the shader ABI during compilation. This is what allows radeonsi and + * radv to share a compiler backend. + */ +struct ac_shader_abi { + LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; + + /* These input registers sometimes need to be fixed up. */ + LLVMValueRef vertex_id; + LLVMValueRef instance_id; + LLVMValueRef persp_centroid, linear_centroid; + LLVMValueRef color0, color1; + LLVMValueRef user_data; + + /* For VS and PS: pre-loaded shader inputs. + * + * Currently only used for NIR shaders; indexed by variables' + * driver_location. + */ + LLVMValueRef *inputs; + + /* Varying -> attribute number mapping. Also NIR-only */ + unsigned fs_input_attr_indices[MAX_VARYING]; + + void (*emit_outputs)(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs); + + void (*emit_vertex)(struct ac_shader_abi *abi, + unsigned stream, + LLVMValueRef *addrs); + + void (*emit_primitive)(struct ac_shader_abi *abi, + unsigned stream); + + void (*emit_kill)(struct ac_shader_abi *abi, LLVMValueRef visible); + + LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + unsigned vertex_index, + unsigned const_index, + LLVMTypeRef type); + + LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, + LLVMTypeRef type, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact, + bool load_inputs); + + void (*store_tcs_outputs)(struct ac_shader_abi *abi, + const struct nir_variable *var, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + LLVMValueRef src, + unsigned writemask); + + LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_tess_level)(struct ac_shader_abi *abi, + unsigned varying_id, + bool load_default_state); + + + LLVMValueRef (*load_ubo)(struct ac_shader_abi *abi, LLVMValueRef index); + + /** + * Load the descriptor for the given buffer. + * + * \param buffer the buffer as presented in NIR: this is the descriptor + * in Vulkan, and the buffer index in OpenGL/Gallium + * \param write whether buffer contents will be written + */ + LLVMValueRef (*load_ssbo)(struct ac_shader_abi *abi, + LLVMValueRef buffer, bool write); + + /** + * Load a descriptor associated to a sampler. + * + * \param descriptor_set the descriptor set index (only for Vulkan) + * \param base_index the base index of the sampler variable + * \param constant_index constant part of an array index (or 0, if the + * sampler variable is not an array) + * \param index non-constant part of an array index (may be NULL) + * \param desc_type the type of descriptor to load + * \param image whether the descriptor is loaded for an image operation + */ + LLVMValueRef (*load_sampler_desc)(struct ac_shader_abi *abi, + unsigned descriptor_set, + unsigned base_index, + unsigned constant_index, + LLVMValueRef index, + enum ac_descriptor_type desc_type, + bool image, bool write, + bool bindless); + + /** + * Load a Vulkan-specific resource. + * + * \param index resource index + * \param desc_set descriptor set + * \param binding descriptor set binding + */ + LLVMValueRef (*load_resource)(struct ac_shader_abi *abi, + LLVMValueRef index, + unsigned desc_set, + unsigned binding); + + LLVMValueRef (*load_sample_position)(struct ac_shader_abi *abi, + LLVMValueRef sample_id); + + LLVMValueRef (*load_local_group_size)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_sample_mask_in)(struct ac_shader_abi *abi); + + LLVMValueRef (*load_base_vertex)(struct ac_shader_abi *abi); + + LLVMValueRef (*emit_fbfetch)(struct ac_shader_abi *abi); + + /* Whether to clamp the shadow reference value to [0,1]on GFX8. Radeonsi currently + * uses it due to promoting D16 to D32, but radv needs it off. */ + bool clamp_shadow_reference; + bool interp_at_sample_force_center; + + /* Whether bounds checks are required */ + bool robust_buffer_access; +}; + +#endif /* AC_SHADER_ABI_H */ diff -Nru mesa-19.2.8/src/amd/llvm/meson.build mesa-20.0.8/src/amd/llvm/meson.build --- mesa-19.2.8/src/amd/llvm/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/llvm/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,50 @@ +# Copyright © 2019 Valve Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +amd_common_llvm_files = files( + 'ac_llvm_build.c', + 'ac_llvm_build.h', + 'ac_llvm_cull.c', + 'ac_llvm_cull.h', + 'ac_llvm_helper.cpp', + 'ac_llvm_util.c', + 'ac_llvm_util.h', + 'ac_nir_to_llvm.c', + 'ac_nir_to_llvm.h', + 'ac_shader_abi.h', +) + +libamd_common_llvm = static_library( + 'amd_common_llvm', + [amd_common_llvm_files], + include_directories : [ + inc_common, inc_compiler, inc_mesa, inc_mapi, inc_amd, inc_amd_common + ], + link_with: [ + libamd_common + ], + dependencies : [ + dep_llvm, dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, + idep_nir_headers, idep_amdgfxregs_h, + ], + c_args : [c_vis_args], + cpp_args : [cpp_vis_args], +) + diff -Nru mesa-19.2.8/src/amd/Makefile.sources mesa-20.0.8/src/amd/Makefile.sources --- mesa-19.2.8/src/amd/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/Makefile.sources 2020-06-12 01:21:16.000000000 +0000 @@ -36,33 +36,32 @@ addrlib/src/r800/siaddrlib.cpp \ addrlib/src/r800/siaddrlib.h -AMD_COMPILER_FILES = \ +AMD_COMMON_FILES = \ common/ac_binary.c \ common/ac_binary.h \ common/ac_exp_param.h \ - common/ac_llvm_build.c \ - common/ac_llvm_build.h \ - common/ac_llvm_cull.c \ - common/ac_llvm_cull.h \ - common/ac_llvm_helper.cpp \ - common/ac_llvm_util.c \ - common/ac_llvm_util.h \ + common/ac_gpu_info.c \ + common/ac_gpu_info.h \ + common/ac_surface.c \ + common/ac_surface.h \ common/ac_rtld.c \ common/ac_rtld.h \ - common/ac_shader_abi.h \ + common/ac_shader_args.c \ + common/ac_shader_args.h \ common/ac_shader_util.c \ common/ac_shader_util.h - -AMD_NIR_FILES = \ - common/ac_nir_to_llvm.c \ - common/ac_nir_to_llvm.h - -AMD_COMMON_FILES = \ - common/ac_gpu_info.c \ - common/ac_gpu_info.h \ - common/ac_surface.c \ - common/ac_surface.h +AMD_COMMON_LLVM_FILES = \ + llvm/ac_llvm_build.c \ + llvm/ac_llvm_build.h \ + llvm/ac_llvm_cull.c \ + llvm/ac_llvm_cull.h \ + llvm/ac_llvm_helper.cpp \ + llvm/ac_llvm_util.c \ + llvm/ac_llvm_util.h \ + llvm/ac_shader_abi.h \ + llvm/ac_nir_to_llvm.c \ + llvm/ac_nir_to_llvm.h AMD_DEBUG_FILES = \ common/ac_debug.c \ @@ -71,3 +70,36 @@ AMD_GENERATED_FILES = \ common/amdgfxregs.h \ common/sid_tables.h + +ACO_FILES = \ + compiler/aco_dead_code_analysis.cpp \ + compiler/aco_dominance.cpp \ + compiler/aco_instruction_selection.cpp \ + compiler/aco_instruction_selection_setup.cpp \ + compiler/aco_interface.cpp \ + compiler/aco_interface.h \ + compiler/aco_ir.h \ + compiler/aco_assembler.cpp \ + compiler/aco_insert_exec_mask.cpp \ + compiler/aco_insert_NOPs.cpp \ + compiler/aco_insert_waitcnt.cpp \ + compiler/aco_reduce_assign.cpp \ + compiler/aco_register_allocation.cpp \ + compiler/aco_live_var_analysis.cpp \ + compiler/aco_lower_bool_phis.cpp \ + compiler/aco_lower_to_cssa.cpp \ + compiler/aco_lower_to_hw_instr.cpp \ + compiler/aco_optimizer.cpp \ + compiler/aco_opt_value_numbering.cpp \ + compiler/aco_print_asm.cpp \ + compiler/aco_print_ir.cpp \ + compiler/aco_scheduler.cpp \ + compiler/aco_ssa_elimination.cpp \ + compiler/aco_spill.cpp \ + compiler/aco_util.h \ + compiler/aco_validate.cpp + +ACO_GENERATED_FILES = \ + compiler/aco_builder.h \ + compiler/aco_opcodes.cpp \ + compiler/aco_opcodes.h diff -Nru mesa-19.2.8/src/amd/meson.build mesa-20.0.8/src/amd/meson.build --- mesa-19.2.8/src/amd/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -22,6 +22,8 @@ subdir('addrlib') subdir('common') +subdir('llvm') if with_amd_vk + subdir('compiler') subdir('vulkan') endif diff -Nru mesa-19.2.8/src/amd/registers/gfx10-rsrc.json mesa-20.0.8/src/amd/registers/gfx10-rsrc.json --- mesa-19.2.8/src/amd/registers/gfx10-rsrc.json 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/registers/gfx10-rsrc.json 2020-06-12 01:21:16.000000000 +0000 @@ -183,6 +183,14 @@ {"name": "BC_SWIZZLE_ZYXW", "value": 4}, {"name": "BC_SWIZZLE_YXWZ", "value": 5} ] + }, + "SQ_BUF_RSRC_WORD3__OOB_SELECT": { + "entries": [ + {"name": "OOB_SELECT_STRUCTURED_WITH_OFFSET", "value": 0}, + {"name": "OOB_SELECT_STRUCTURED", "value": 1}, + {"name": "OOB_SELECT_DISABLED", "value": 2}, + {"name": "OOB_SELECT_RAW", "value": 3} + ] } }, "register_mappings": [ @@ -304,7 +312,7 @@ {"bits": [21, 22], "name": "INDEX_STRIDE"}, {"bits": [23, 23], "name": "ADD_TID_ENABLE"}, {"bits": [24, 24], "comment": "must be 1", "name": "RESOURCE_LEVEL"}, - {"bits": [28, 29], "name": "OOB_SELECT"}, + {"bits": [28, 29], "enum_ref": "SQ_BUF_RSRC_WORD3__OOB_SELECT", "name": "OOB_SELECT"}, {"bits": [30, 31], "comment": "must be 0", "name": "TYPE"} ] }, diff -Nru mesa-19.2.8/src/amd/vulkan/Android.mk mesa-20.0.8/src/amd/vulkan/Android.mk --- mesa-19.2.8/src/amd/vulkan/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -30,6 +30,7 @@ RADV_COMMON_INCLUDES := \ $(MESA_TOP)/include \ $(MESA_TOP)/src/ \ + $(MESA_TOP)/src/amd/vulkan \ $(MESA_TOP)/src/vulkan/wsi \ $(MESA_TOP)/src/vulkan/util \ $(MESA_TOP)/src/amd \ @@ -67,6 +68,7 @@ LOCAL_C_INCLUDES := $(RADV_COMMON_INCLUDES) LOCAL_STATIC_LIBRARIES := \ + libmesa_aco \ libmesa_amd_common \ libmesa_nir \ libmesa_util \ @@ -167,7 +169,8 @@ libmesa_amdgpu_addrlib \ libmesa_amd_common \ libmesa_radv_common \ - libmesa_vulkan_util + libmesa_vulkan_util \ + libmesa_aco LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog diff -Nru mesa-19.2.8/src/amd/vulkan/gfx10_format_table.py mesa-20.0.8/src/amd/vulkan/gfx10_format_table.py --- mesa-19.2.8/src/amd/vulkan/gfx10_format_table.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/gfx10_format_table.py 2020-06-12 01:21:16.000000000 +0000 @@ -21,7 +21,7 @@ # USE OR OTHER DEALINGS IN THE SOFTWARE. # """ -Script that generates the mapping from Gallium PIPE_FORMAT_xxx to gfx10 +Script that generates the mapping from Vulkan VK_FORMAT_xxx to gfx10 IMG_FORMAT_xxx enums. """ @@ -34,12 +34,10 @@ import sys AMD_REGISTERS = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../registers")) -#GALLIUM_UTIL = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../../auxiliary/util")) sys.path.extend([AMD_REGISTERS]) from regdb import Object, RegisterDatabase from vk_format_parse import * -#from u_format_parse import * # ---------------------------------------------------------------------------- # Hard-coded mappings @@ -68,6 +66,11 @@ 'VK_FORMAT_BC6H_SFLOAT_BLOCK': hardcoded_format('BC6_SFLOAT'), 'VK_FORMAT_BC7_UNORM_BLOCK': hardcoded_format('BC7_UNORM'), 'VK_FORMAT_BC7_SRGB_BLOCK': hardcoded_format('BC7_SRGB'), + + # DS + 'VK_FORMAT_D16_UNORM_S8_UINT': hardcoded_format('INVALID'), + 'VK_FORMAT_D24_UNORM_S8_UINT': hardcoded_format('8_24_UNORM'), + 'VK_FORMAT_D32_SFLOAT_S8_UINT': hardcoded_format('X24_8_32_FLOAT'), } @@ -82,11 +85,11 @@ ##__VA_ARGS__ } static const struct gfx10_format gfx10_format_table[VK_FORMAT_RANGE_SIZE] = { -% for pipe_format, args in formats: +% for vk_format, args in formats: % if args is not None: - [${pipe_format}] = FMT(${args}), + [${vk_format}] = FMT(${args}), % else: -/* ${pipe_format} is not supported */ +/* ${vk_format} is not supported */ % endif % endfor }; @@ -114,8 +117,8 @@ class Gfx10FormatMapping(object): - def __init__(self, pipe_formats, gfx10_formats): - self.pipe_formats = pipe_formats + def __init__(self, vk_formats, gfx10_formats): + self.vk_formats = vk_formats self.gfx10_formats = gfx10_formats self.plain_gfx10_formats = dict( @@ -219,17 +222,17 @@ if __name__ == '__main__': - pipe_formats = parse(sys.argv[1]) + vk_formats = parse(sys.argv[1]) with open(sys.argv[2], 'r') as filp: db = RegisterDatabase.from_json(json.load(filp)) gfx10_formats = [Gfx10Format(entry) for entry in db.enum('IMG_FORMAT').entries] - mapping = Gfx10FormatMapping(pipe_formats, gfx10_formats) + mapping = Gfx10FormatMapping(vk_formats, gfx10_formats) formats = [] - for fmt in pipe_formats: + for fmt in vk_formats: if fmt.name in HARDCODED: obj = HARDCODED[fmt.name] else: diff -Nru mesa-19.2.8/src/amd/vulkan/Makefile.sources mesa-20.0.8/src/amd/vulkan/Makefile.sources --- mesa-19.2.8/src/amd/vulkan/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/Makefile.sources 2020-06-12 01:21:16.000000000 +0000 @@ -63,6 +63,8 @@ radv_private.h \ radv_radeon_winsys.h \ radv_shader.c \ + radv_shader_args.c \ + radv_shader_args.h \ radv_shader_info.c \ radv_shader.h \ radv_shader_helper.h \ diff -Nru mesa-19.2.8/src/amd/vulkan/meson.build mesa-20.0.8/src/amd/vulkan/meson.build --- mesa-19.2.8/src/amd/vulkan/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -67,6 +67,7 @@ 'winsys/amdgpu/radv_amdgpu_winsys.c', 'winsys/amdgpu/radv_amdgpu_winsys.h', 'winsys/amdgpu/radv_amdgpu_winsys_public.h', + 'radv_android.c', 'radv_cmd_buffer.c', 'radv_cs.h', 'radv_debug.c', @@ -100,6 +101,8 @@ 'radv_radeon_winsys.h', 'radv_shader.c', 'radv_shader.h', + 'radv_shader_args.c', + 'radv_shader_args.h', 'radv_shader_helper.h', 'radv_shader_info.c', 'radv_query.c', @@ -139,24 +142,24 @@ endif if with_platform_android + radv_deps += dep_android radv_flags += [ '-DVK_USE_PLATFORM_ANDROID_KHR' ] - libradv_files += files('radv_android.c') endif libvulkan_radeon = shared_library( 'vulkan_radeon', - [libradv_files, radv_entrypoints, radv_extensions_c, amd_vk_format_table_c, sha1_h, xmlpool_options_h, radv_gfx10_format_table_h], + [libradv_files, radv_entrypoints, radv_extensions_c, amd_vk_format_table_c, sha1_h, radv_gfx10_format_table_h], include_directories : [ - inc_common, inc_amd, inc_amd_common, inc_compiler, inc_util, inc_vulkan_wsi, + inc_common, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util, inc_vulkan_wsi, ], link_with : [ - libamd_common, libamdgpu_addrlib, libvulkan_wsi, + libamd_common, libamd_common_llvm, libamdgpu_addrlib, libvulkan_wsi, ], dependencies : [ dep_llvm, dep_libdrm_amdgpu, dep_thread, dep_elf, dep_dl, dep_m, - dep_valgrind, radv_deps, + dep_valgrind, radv_deps, idep_aco, idep_mesautil, idep_nir, idep_vulkan_util, idep_amdgfxregs_h, idep_xmlconfig, ], c_args : [c_vis_args, no_override_init_args, radv_flags], @@ -165,6 +168,19 @@ install : true, ) +if with_symbols_check + test( + 'radv symbols check', + symbols_check, + args : [ + '--lib', libvulkan_radeon, + '--symbols-file', vulkan_icd_symbols, + symbols_check_args, + ], + suite : ['amd'], + ) +endif + radeon_icd = custom_target( 'radeon_icd', input : 'radv_icd.py', diff -Nru mesa-19.2.8/src/amd/vulkan/radv_android.c mesa-20.0.8/src/amd/vulkan/radv_android.c --- mesa-19.2.8/src/amd/vulkan/radv_android.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_android.c 2020-06-12 01:21:16.000000000 +0000 @@ -21,14 +21,20 @@ * IN THE SOFTWARE. */ +#ifdef ANDROID #include #include #include +#include #include #include #include +#endif #include "radv_private.h" +#include "vk_util.h" + +#ifdef ANDROID static int radv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev); static int radv_hal_close(struct hw_device_t *dev); @@ -372,3 +378,404 @@ } return VK_SUCCESS; } +#endif + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + +enum { + /* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */ + AHARDWAREBUFFER_USAGE_CAMERA_MASK = 0x00060000U, +}; + +static inline VkFormat +vk_format_from_android(unsigned android_format, unsigned android_usage) +{ + switch (android_format) { + case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM: + case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM: + return VK_FORMAT_R8G8B8A8_UNORM; + case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM: + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM: + return VK_FORMAT_R5G6B5_UNORM_PACK16; + case AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT: + return VK_FORMAT_R16G16B16A16_SFLOAT; + case AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420: + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED: + if (android_usage & AHARDWAREBUFFER_USAGE_CAMERA_MASK) + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + else + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_BLOB: + default: + return VK_FORMAT_UNDEFINED; + } +} + +static inline unsigned +android_format_from_vk(unsigned vk_format) +{ + switch (vk_format) { + case VK_FORMAT_R8G8B8A8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_R8G8B8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM; + case VK_FORMAT_R5G6B5_UNORM_PACK16: + return AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM; + case VK_FORMAT_R16G16B16A16_SFLOAT: + return AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + return AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420; + default: + return AHARDWAREBUFFER_FORMAT_BLOB; + } +} + +uint64_t +radv_ahb_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage) +{ + uint64_t ahb_usage = 0; + if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) + ahb_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + ahb_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + ahb_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT; + + if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) + ahb_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP; + + if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT) + ahb_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT; + + /* No usage bits set - set at least one GPU usage. */ + if (ahb_usage == 0) + ahb_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + return ahb_usage; +} + +static VkResult +get_ahb_buffer_format_properties( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatPropertiesANDROID *pProperties) +{ + RADV_FROM_HANDLE(radv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + const uint64_t gpu_usage = + AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. */ + VkAndroidHardwareBufferFormatPropertiesANDROID *p = pProperties; + + p->format = vk_format_from_android(desc.format, desc.usage); + p->externalFormat = (uint64_t) (uintptr_t) p->format; + + VkFormatProperties format_properties; + radv_GetPhysicalDeviceFormatProperties( + radv_physical_device_to_handle(device->physical_device), + p->format, &format_properties); + + if (desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER) + p->formatFeatures = format_properties.linearTilingFeatures; + else + p->formatFeatures = format_properties.optimalTilingFeatures; + + /* "Images can be created with an external format even if the Android hardware + * buffer has a format which has an equivalent Vulkan format to enable + * consistent handling of images from sources that might use either category + * of format. However, all images created with an external format are subject + * to the valid usage requirements associated with external formats, even if + * the Android hardware buffer’s format has a Vulkan equivalent." + * + * "The formatFeatures member *must* include + * VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT and at least one of + * VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT or + * VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT" + */ + assert(p->formatFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT); + + p->formatFeatures |= VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT; + + /* "Implementations may not always be able to determine the color model, + * numerical range, or chroma offsets of the image contents, so the values + * in VkAndroidHardwareBufferFormatPropertiesANDROID are only suggestions. + * Applications should treat these values as sensible defaults to use in + * the absence of more reliable information obtained through some other + * means." + */ + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + return VK_SUCCESS; +} + +VkResult +radv_GetAndroidHardwareBufferPropertiesANDROID( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferPropertiesANDROID *pProperties) +{ + RADV_FROM_HANDLE(radv_device, dev, device_h); + struct radv_physical_device *pdevice = dev->physical_device; + + VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop = + vk_find_struct(pProperties->pNext, + ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID); + + /* Fill format properties of an Android hardware buffer. */ + if (format_prop) + get_ahb_buffer_format_properties(device_h, buffer, format_prop); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(buffer); + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* All memory types. */ + uint32_t memory_types = (1u << pdevice->memory_properties.memoryTypeCount) - 1; + + pProperties->allocationSize = lseek(dma_buf, 0, SEEK_END); + pProperties->memoryTypeBits = memory_types; + + return VK_SUCCESS; +} + +VkResult +radv_GetMemoryAndroidHardwareBufferANDROID( + VkDevice device_h, + const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo, + struct AHardwareBuffer **pBuffer) +{ + RADV_FROM_HANDLE(radv_device_memory, mem, pInfo->memory); + + /* This should always be set due to the export handle types being set on + * allocation. */ + assert(mem->android_hardware_buffer); + + /* Some quotes from Vulkan spec: + * + * "If the device memory was created by importing an Android hardware + * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same + * Android hardware buffer object." + * + * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must + * have been included in VkExportMemoryAllocateInfo::handleTypes when + * memory was created." + */ + *pBuffer = mem->android_hardware_buffer; + /* Increase refcount. */ + AHardwareBuffer_acquire(mem->android_hardware_buffer); + return VK_SUCCESS; +} + +#endif + +VkFormat +radv_select_android_external_format(const void *next, VkFormat default_format) +{ +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + const VkExternalFormatANDROID *android_format = + vk_find_struct_const(next, EXTERNAL_FORMAT_ANDROID); + + if (android_format && android_format->externalFormat) { + return (VkFormat)android_format->externalFormat; + } +#endif + + return default_format; +} + + +VkResult +radv_import_ahb_memory(struct radv_device *device, + struct radv_device_memory *mem, + unsigned priority, + const VkImportAndroidHardwareBufferInfoANDROID *info) +{ +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + /* Import from AHardwareBuffer to radv_device_memory. */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(info->buffer); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + uint64_t alloc_size = 0; + mem->bo = device->ws->buffer_from_fd(device->ws, dma_buf, + priority, &alloc_size); + if (!mem->bo) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + if (mem->image) { + struct radeon_bo_metadata metadata; + device->ws->buffer_get_metadata(mem->bo, &metadata); + + struct radv_image_create_info create_info = { + .no_metadata_planes = true, + .bo_metadata = &metadata + }; + + VkResult result = radv_image_create_layout(device, create_info, mem->image); + if (result != VK_SUCCESS) { + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + return result; + } + + if (alloc_size < mem->image->size) { + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } + } else if (mem->buffer) { + if (alloc_size < mem->buffer->size) { + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } + } + + /* "If the vkAllocateMemory command succeeds, the implementation must + * acquire a reference to the imported hardware buffer, which it must + * release when the device memory object is freed. If the command fails, + * the implementation must not retain a reference." + */ + AHardwareBuffer_acquire(info->buffer); + mem->android_hardware_buffer = info->buffer; + + return VK_SUCCESS; +#else /* RADV_SUPPORT_ANDROID_HARDWARE_BUFFER */ + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif +} + +VkResult +radv_create_ahb_memory(struct radv_device *device, + struct radv_device_memory *mem, + unsigned priority, + const VkMemoryAllocateInfo *pAllocateInfo) +{ +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + const VkMemoryDedicatedAllocateInfo *dedicated_info = + vk_find_struct_const(pAllocateInfo->pNext, + MEMORY_DEDICATED_ALLOCATE_INFO); + + uint32_t w = 0; + uint32_t h = 1; + uint32_t layers = 1; + uint32_t format = 0; + uint64_t usage = 0; + + /* If caller passed dedicated information. */ + if (dedicated_info && dedicated_info->image) { + RADV_FROM_HANDLE(radv_image, image, dedicated_info->image); + w = image->info.width; + h = image->info.height; + layers = image->info.array_size; + format = android_format_from_vk(image->vk_format); + usage = radv_ahb_usage_from_vk_usage(image->flags, image->usage); + } else if (dedicated_info && dedicated_info->buffer) { + RADV_FROM_HANDLE(radv_buffer, buffer, dedicated_info->buffer); + w = buffer->size; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } else { + w = pAllocateInfo->allocationSize; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } + + struct AHardwareBuffer *android_hardware_buffer = NULL; + struct AHardwareBuffer_Desc desc = { + .width = w, + .height = h, + .layers = layers, + .format = format, + .usage = usage, + }; + + if (AHardwareBuffer_allocate(&desc, &android_hardware_buffer) != 0) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + mem->android_hardware_buffer = android_hardware_buffer; + + const struct VkImportAndroidHardwareBufferInfoANDROID import_info = { + .buffer = mem->android_hardware_buffer, + }; + + VkResult result = radv_import_ahb_memory(device, mem, priority, &import_info); + if (result != VK_SUCCESS) + AHardwareBuffer_release(mem->android_hardware_buffer); + return result; +#else /* RADV_SUPPORT_ANDROID_HARDWARE_BUFFER */ + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif +} + +bool radv_android_gralloc_supports_format(VkFormat format, VkImageUsageFlagBits usage) { +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + /* Ideally we check Gralloc for what it supports and then merge that with the radv + format support, but there is no easy gralloc query besides just creating an image. + That seems a bit on the expensive side, so just hardcode for now. */ + /* TODO: Add multi-plane formats after confirming everything works between radeonsi + and radv. */ + switch(format) { + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_R5G6B5_UNORM_PACK16: + return true; + case VK_FORMAT_R8_UNORM: + case VK_FORMAT_R8G8_UNORM: + return !(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + default: + return false; + } +#else + (void)format; + (void)usage; + return false; +#endif +} diff -Nru mesa-19.2.8/src/amd/vulkan/radv_cmd_buffer.c mesa-20.0.8/src/amd/vulkan/radv_cmd_buffer.c --- mesa-19.2.8/src/amd/vulkan/radv_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_cmd_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -222,10 +222,11 @@ struct radv_streamout_state *so = &cmd_buffer->state.streamout; struct radv_shader_info *info; - if (!pipeline->streamout_shader) + if (!pipeline->streamout_shader || + cmd_buffer->device->physical_device->use_ngg_streamout) return; - info = &pipeline->streamout_shader->info.info; + info = &pipeline->streamout_shader->info; for (int i = 0; i < MAX_SO_BUFFERS; i++) so->stride_in_dw[i] = info->so.strides[i]; @@ -331,11 +332,15 @@ } cmd_buffer->push_constant_stages = 0; - cmd_buffer->scratch_size_needed = 0; - cmd_buffer->compute_scratch_size_needed = 0; + cmd_buffer->scratch_size_per_wave_needed = 0; + cmd_buffer->scratch_waves_wanted = 0; + cmd_buffer->compute_scratch_size_per_wave_needed = 0; + cmd_buffer->compute_scratch_waves_wanted = 0; cmd_buffer->esgs_ring_size_needed = 0; cmd_buffer->gsvs_ring_size_needed = 0; cmd_buffer->tess_rings_needed = false; + cmd_buffer->gds_needed = false; + cmd_buffer->gds_oa_needed = false; cmd_buffer->sample_positions_needed = false; if (cmd_buffer->upload.upload_bo) @@ -554,8 +559,9 @@ assert(!"invalid ring type"); } - data[0] = (uintptr_t)pipeline; - data[1] = (uintptr_t)pipeline >> 32; + uint64_t pipeline_address = (uintptr_t)pipeline; + data[0] = pipeline_address; + data[1] = pipeline_address >> 32; radv_emit_write_data_packet(cmd_buffer, va, 2, data); } @@ -787,10 +793,12 @@ num_samples); /* Compute the maximum sample distance from the specified locations. */ - for (uint32_t i = 0; i < num_samples; i++) { - VkOffset2D offset = sample_locs[0][i]; - max_sample_dist = MAX2(max_sample_dist, - MAX2(abs(offset.x), abs(offset.y))); + for (unsigned i = 0; i < 4; ++i) { + for (uint32_t j = 0; j < num_samples; j++) { + VkOffset2D offset = sample_locs[i][j]; + max_sample_dist = MAX2(max_sample_dist, + MAX2(abs(offset.x), abs(offset.y))); + } } /* Emit the specified user sample locations. */ @@ -860,29 +868,16 @@ struct radv_pipeline *pipeline) { int num_samples = pipeline->graphics.ms.num_samples; - struct radv_multisample_state *ms = &pipeline->graphics.ms; struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline; - if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions) cmd_buffer->sample_positions_needed = true; if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples) return; - radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2); - radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); - radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config); - - radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0); - radv_emit_default_sample_locations(cmd_buffer->cs, num_samples); - /* GFX9: Flush DFSM when the AA mode changes. */ - if (cmd_buffer->device->dfsm_allowed) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - cmd_buffer->state.context_roll_without_scissor_emitted = true; } @@ -988,7 +983,7 @@ static void radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer) { - if (!cmd_buffer->device->physical_device->rbplus_allowed) + if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed) return; struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; @@ -1003,8 +998,9 @@ for (unsigned i = 0; i < subpass->color_count; ++i) { if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { - sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); - sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); + /* We don't set the DISABLE bits, because the HW can't have holes, + * so the SPI color format is set to 32-bit 1-component. */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); continue; } @@ -1120,10 +1116,10 @@ } } - for (unsigned i = subpass->color_count; i < 8; ++i) { - sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); - sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); - } + /* Do not set the DISABLE bits for the unused attachments, as that + * breaks dual source blending in SkQP and does not seem to improve + * performance. */ + /* TODO: avoid redundantly setting context registers */ radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3); radeon_emit(cmd_buffer->cs, sx_ps_downconvert); @@ -1134,6 +1130,33 @@ } static void +radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer) +{ + if (!cmd_buffer->device->pbb_allowed) + return; + + struct radv_binning_settings settings = + radv_get_binning_settings(cmd_buffer->device->physical_device); + bool break_for_new_ps = + (!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] != + cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) && + (settings.context_states_per_bin > 1 || + settings.persistent_states_per_bin > 1); + bool break_for_new_cb_target_mask = + (!cmd_buffer->state.emitted_pipeline || + cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask != + cmd_buffer->state.pipeline->graphics.cb_target_mask) && + settings.context_states_per_bin > 1; + + if (!break_for_new_ps && !break_for_new_cb_target_mask) + return; + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); +} + +static void radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; @@ -1144,9 +1167,10 @@ radv_update_multisample_state(cmd_buffer, pipeline); radv_update_binning_state(cmd_buffer, pipeline); - cmd_buffer->scratch_size_needed = - MAX2(cmd_buffer->scratch_size_needed, - pipeline->max_waves * pipeline->scratch_bytes_per_wave); + cmd_buffer->scratch_size_per_wave_needed = MAX2(cmd_buffer->scratch_size_per_wave_needed, + pipeline->scratch_bytes_per_wave); + cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, + pipeline->max_waves); if (!cmd_buffer->state.emitted_pipeline || cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband != @@ -1164,6 +1188,8 @@ cmd_buffer->state.context_roll_without_scissor_emitted = true; } + radv_emit_batch_break_on_new_ps(cmd_buffer); + for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { if (!pipeline->shaders[i]) continue; @@ -1398,13 +1424,15 @@ static void radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, - struct radv_image *image, VkImageLayout layout, + const struct radv_image_view *iview, + VkImageLayout layout, bool in_render_loop, bool requires_cond_exec) { + const struct radv_image *image = iview->image; uint32_t db_z_info = ds->db_z_info; uint32_t db_z_info_reg; - if (!cmd_buffer->device->physical_device->has_tc_compat_zrange_bug || + if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || !radv_image_is_tc_compat_htile(image)) return; @@ -1428,8 +1456,7 @@ * SET_CONTEXT_REG packet. */ if (requires_cond_exec) { - uint64_t va = radv_buffer_get_va(image->bo); - va += image->offset + image->tc_compat_zrange_offset; + uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip); radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0)); radeon_emit(cmd_buffer->cs, va); @@ -1444,10 +1471,11 @@ static void radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, - struct radv_image *image, + struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop) { + const struct radv_image *image = iview->image; uint32_t db_z_info = ds->db_z_info; uint32_t db_stencil_info = ds->db_stencil_info; @@ -1519,7 +1547,8 @@ } /* Update the ZRANGE_PRECISION value for the TC-compat bug. */ - radv_update_zrange_precision(cmd_buffer, ds, image, layout, in_render_loop, true); + radv_update_zrange_precision(cmd_buffer, ds, iview, layout, + in_render_loop, true); radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, ds->pa_su_poly_offset_db_fmt_cntl); @@ -1531,11 +1560,12 @@ */ static void radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, + const struct radv_image_view *iview, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) { const struct radv_subpass *subpass = cmd_buffer->state.subpass; + const struct radv_image *image = iview->image; struct radeon_cmdbuf *cs = cmd_buffer->cs; uint32_t att_idx; @@ -1571,8 +1601,8 @@ VkImageLayout layout = subpass->depth_stencil_attachment->layout; bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; - radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, image, - layout, in_render_loop, false); + radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, + iview, layout, in_render_loop, false); } cmd_buffer->state.context_roll_without_scissor_emitted = true; @@ -1584,34 +1614,51 @@ static void radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, + const VkImageSubresourceRange *range, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) { struct radeon_cmdbuf *cs = cmd_buffer->cs; - uint64_t va = radv_buffer_get_va(image->bo); - unsigned reg_offset = 0, reg_count = 0; + uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel); + uint32_t level_count = radv_get_levelCount(image, range); - va += image->offset + image->clear_value_offset; + if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)) { + /* Use the fastest way when both aspects are used. */ + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); - if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { - ++reg_count; + for (uint32_t l = 0; l < level_count; l++) { + radeon_emit(cs, ds_clear_value.stencil); + radeon_emit(cs, fui(ds_clear_value.depth)); + } } else { - ++reg_offset; - va += 4; - } - if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) - ++reg_count; + /* Otherwise we need one WRITE_DATA packet per level. */ + for (uint32_t l = 0; l < level_count; l++) { + uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l); + unsigned value; + + if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { + value = fui(ds_clear_value.depth); + va += 4; + } else { + assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); + value = ds_clear_value.stencil; + } - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating)); - radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(V_370_PFP)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) - radeon_emit(cs, ds_clear_value.stencil); - if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) - radeon_emit(cs, fui(ds_clear_value.depth)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, value); + } + } } /** @@ -1620,30 +1667,40 @@ static void radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, + const VkImageSubresourceRange *range, uint32_t value) { struct radeon_cmdbuf *cs = cmd_buffer->cs; - uint64_t va = radv_buffer_get_va(image->bo); - if (!cmd_buffer->device->physical_device->has_tc_compat_zrange_bug) + if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug) return; - va += image->offset + image->tc_compat_zrange_offset; + uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel); + uint32_t level_count = radv_get_levelCount(image, range); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); - radeon_emit(cs, value); + + for (uint32_t l = 0; l < level_count; l++) + radeon_emit(cs, value); } static void radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, + const struct radv_image_view *iview, VkClearDepthStencilValue ds_clear_value) { + VkImageSubresourceRange range = { + .aspectMask = iview->aspect_mask, + .baseMipLevel = iview->base_mip, + .levelCount = iview->level_count, + .baseArrayLayer = iview->base_layer, + .layerCount = iview->layer_count, + }; uint32_t cond_val; /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last @@ -1651,7 +1708,8 @@ */ cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0; - radv_set_tc_compat_zrange_metadata(cmd_buffer, image, cond_val); + radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, + cond_val); } /** @@ -1659,22 +1717,32 @@ */ void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, + const struct radv_image_view *iview, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) { + VkImageSubresourceRange range = { + .aspectMask = iview->aspect_mask, + .baseMipLevel = iview->base_mip, + .levelCount = iview->level_count, + .baseArrayLayer = iview->base_layer, + .layerCount = iview->layer_count, + }; + struct radv_image *image = iview->image; + assert(radv_image_has_htile(image)); - radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects); + radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, + ds_clear_value, aspects); if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { - radv_update_tc_compat_zrange_metadata(cmd_buffer, image, + radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value); } - radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value, - aspects); + radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, + aspects); } /** @@ -1682,15 +1750,14 @@ */ static void radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image) + const struct radv_image_view *iview) { struct radeon_cmdbuf *cs = cmd_buffer->cs; + const struct radv_image *image = iview->image; VkImageAspectFlags aspects = vk_format_aspects(image->vk_format); - uint64_t va = radv_buffer_get_va(image->bo); + uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip); unsigned reg_offset = 0, reg_count = 0; - va += image->offset + image->clear_value_offset; - if (!radv_image_has_htile(image)) return; @@ -1705,7 +1772,7 @@ uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; - if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { + if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); @@ -1889,7 +1956,7 @@ uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; - if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { + if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); @@ -1946,7 +2013,8 @@ int idx = subpass->depth_stencil_attachment->attachment; VkImageLayout layout = subpass->depth_stencil_attachment->layout; bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; - struct radv_image *image = cmd_buffer->state.attachments[idx].iview->image; + struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; + struct radv_image *image = iview->image; radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.attachments[idx].iview->bo); ASSERTED uint32_t queue_mask = radv_image_queue_family_mask(image, cmd_buffer->queue_family_index, @@ -1955,13 +2023,13 @@ assert(radv_layout_has_htile(image, layout, in_render_loop, queue_mask) == radv_layout_is_htile_compressed(image, layout, in_render_loop, queue_mask)); - radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, image, layout, in_render_loop); + radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, in_render_loop); if (cmd_buffer->state.attachments[idx].ds.offset_scale != cmd_buffer->state.offset_scale) { cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; cmd_buffer->state.offset_scale = cmd_buffer->state.attachments[idx].ds.offset_scale; } - radv_load_ds_clear_metadata(cmd_buffer, image); + radv_load_ds_clear_metadata(cmd_buffer, iview); } else { if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); @@ -1977,7 +2045,7 @@ if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) { bool disable_constant_encode = - cmd_buffer->device->physical_device->has_dcc_constant_encode; + cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode; enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; uint8_t watermark = chip_class >= GFX10 ? 6 : 4; @@ -1988,7 +2056,7 @@ S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode)); } - if (cmd_buffer->device->pbb_allowed) { + if (cmd_buffer->device->dfsm_allowed) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } @@ -2097,7 +2165,7 @@ radv_emit_viewport(cmd_buffer); if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) && - !cmd_buffer->device->physical_device->has_scissor_bug) + !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) radv_emit_scissor(cmd_buffer); if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) @@ -2285,11 +2353,11 @@ if (!shader) continue; - need_push_constants |= shader->info.info.loads_push_constants; - need_push_constants |= shader->info.info.loads_dynamic_offsets; + need_push_constants |= shader->info.loads_push_constants; + need_push_constants |= shader->info.loads_dynamic_offsets; - uint8_t base = shader->info.info.base_inline_push_consts; - uint8_t count = shader->info.info.num_inline_push_consts; + uint8_t base = shader->info.base_inline_push_consts; + uint8_t count = shader->info.num_inline_push_consts; radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, @@ -2340,8 +2408,7 @@ if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && cmd_buffer->state.pipeline->num_vertex_bindings && - radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) { - struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements; + radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.has_vertex_buffers) { unsigned vb_offset; void *vb_ptr; uint32_t i = 0; @@ -2358,6 +2425,7 @@ uint32_t offset; struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer; uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i]; + unsigned num_records; if (!buffer) continue; @@ -2366,20 +2434,28 @@ offset = cmd_buffer->vertex_bindings[i].offset; va += offset + buffer->offset; + + num_records = buffer->size - offset; + if (cmd_buffer->device->physical_device->rad_info.chip_class != GFX8 && stride) + num_records /= stride; + desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); - if (cmd_buffer->device->physical_device->rad_info.chip_class <= GFX7 && stride) - desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1; - else - desc[2] = buffer->size - offset; + desc[2] = num_records; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW; + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(1) | + S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | @@ -2466,9 +2542,18 @@ * the buffer will be considered not bound and store * instructions will be no-ops. */ + uint32_t size = 0xffffffff; + + /* Compute the correct buffer size for NGG streamout + * because it's used to determine the max emit per + * buffer. + */ + if (cmd_buffer->device->physical_device->use_ngg_streamout) + size = buffer->size - sb[i].offset; + desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); - desc[2] = 0xffffffff; + desc[2] = size; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | @@ -2476,7 +2561,7 @@ if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); @@ -2493,12 +2578,42 @@ } static void +radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_userdata_info *loc; + uint32_t ngg_gs_state = 0; + uint32_t base_reg; + + if (!radv_pipeline_has_gs(pipeline) || + !radv_pipeline_has_ngg(pipeline)) + return; + + /* By default NGG GS queries are disabled but they are enabled if the + * command buffer has active GDS queries or if it's a secondary command + * buffer that inherits the number of generated primitives. + */ + if (cmd_buffer->state.active_pipeline_gds_queries || + (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) + ngg_gs_state = 1; + + loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, + AC_UD_NGG_GS_STATE); + base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY]; + assert(loc->sgpr_idx != -1); + + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, + ngg_gs_state); +} + +static void radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) { radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty); radv_flush_streamout_descriptors(cmd_buffer); radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS); + radv_flush_ngg_gs_state(cmd_buffer); } struct radv_draw_info { @@ -2805,6 +2920,11 @@ break; case VK_ACCESS_SHADER_READ_BIT: flush_bits |= RADV_CMD_FLAG_INV_VCACHE; + /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to + * invalidate the scalar cache. */ + if (cmd_buffer->device->physical_device->use_aco && + cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) + flush_bits |= RADV_CMD_FLAG_INV_SCACHE; if (!image_is_coherent) flush_bits |= RADV_CMD_FLAG_INV_L2; @@ -2900,7 +3020,7 @@ struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview; struct radv_sample_locations_state *sample_locs; VkImageSubresourceRange range; - range.aspectMask = 0; + range.aspectMask = view->aspect_mask; range.baseMipLevel = view->base_mip; range.levelCount = 1; range.baseArrayLayer = view->base_layer; @@ -2923,14 +3043,48 @@ sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass); - radv_handle_image_transition(cmd_buffer, - view->image, - cmd_buffer->state.attachments[idx].current_layout, - cmd_buffer->state.attachments[idx].current_in_render_loop, - att.layout, att.in_render_loop, - 0, 0, &range, sample_locs); + /* Determine if the subpass uses separate depth/stencil layouts. */ + bool uses_separate_depth_stencil_layouts = false; + if ((cmd_buffer->state.attachments[idx].current_layout != + cmd_buffer->state.attachments[idx].current_stencil_layout) || + (att.layout != att.stencil_layout)) { + uses_separate_depth_stencil_layouts = true; + } + + /* For separate layouts, perform depth and stencil transitions + * separately. + */ + if (uses_separate_depth_stencil_layouts && + (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT))) { + /* Depth-only transitions. */ + range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + radv_handle_image_transition(cmd_buffer, + view->image, + cmd_buffer->state.attachments[idx].current_layout, + cmd_buffer->state.attachments[idx].current_in_render_loop, + att.layout, att.in_render_loop, + 0, 0, &range, sample_locs); + + /* Stencil-only transitions. */ + range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; + radv_handle_image_transition(cmd_buffer, + view->image, + cmd_buffer->state.attachments[idx].current_stencil_layout, + cmd_buffer->state.attachments[idx].current_in_render_loop, + att.stencil_layout, att.in_render_loop, + 0, 0, &range, sample_locs); + } else { + radv_handle_image_transition(cmd_buffer, + view->image, + cmd_buffer->state.attachments[idx].current_layout, + cmd_buffer->state.attachments[idx].current_in_render_loop, + att.layout, att.in_render_loop, + 0, 0, &range, sample_locs); + } cmd_buffer->state.attachments[idx].current_layout = att.layout; + cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout; cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop; @@ -3032,11 +3186,11 @@ const VkRenderPassBeginInfo *info) { struct radv_cmd_state *state = &cmd_buffer->state; - const struct VkRenderPassAttachmentBeginInfoKHR *attachment_info = NULL; + const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL; if (info) { attachment_info = vk_find_struct_const(info->pNext, - RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR); + RENDER_PASS_ATTACHMENT_BEGIN_INFO); } @@ -3087,6 +3241,7 @@ } state->attachments[i].current_layout = att->initial_layout; + state->attachments[i].current_stencil_layout = att->stencil_initial_layout; state->attachments[i].sample_location.count = 0; struct radv_image_view *iview; @@ -3120,7 +3275,7 @@ for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - if (!list_empty(&pool->free_cmd_buffers)) { + if (!list_is_empty(&pool->free_cmd_buffers)) { struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link); list_del(&cmd_buffer->pool_link); @@ -3227,6 +3382,9 @@ return result; } + cmd_buffer->state.inherited_pipeline_statistics = + pBeginInfo->pInheritanceInfo->pipelineStatistics; + radv_cmd_buffer_set_subpass(cmd_buffer, subpass); } @@ -3355,7 +3513,7 @@ assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); if (!cmd_buffer->device->use_global_bo_list) { - for (unsigned j = 0; j < set->layout->buffer_count; ++j) + for (unsigned j = 0; j < set->buffer_count; ++j) if (set->descriptors[j]) radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]); } @@ -3410,7 +3568,7 @@ if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) { dst[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -3577,6 +3735,13 @@ */ cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; + /* Since NGG streamout uses GDS, we need to make GDS idle when + * we leave the IB, otherwise another process might overwrite + * it while our shaders are busy. + */ + if (cmd_buffer->gds_needed) + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; + si_emit_cache_flush(cmd_buffer); } @@ -3611,9 +3776,10 @@ radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw); radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); - cmd_buffer->compute_scratch_size_needed = - MAX2(cmd_buffer->compute_scratch_size_needed, - pipeline->max_waves * pipeline->scratch_bytes_per_wave); + cmd_buffer->compute_scratch_size_per_wave_needed = MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, + pipeline->scratch_bytes_per_wave); + cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, + pipeline->max_waves); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[MESA_SHADER_COMPUTE]->bo); @@ -3942,10 +4108,14 @@ for (uint32_t i = 0; i < commandBufferCount; i++) { RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); - primary->scratch_size_needed = MAX2(primary->scratch_size_needed, - secondary->scratch_size_needed); - primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed, - secondary->compute_scratch_size_needed); + primary->scratch_size_per_wave_needed = MAX2(primary->scratch_size_per_wave_needed, + secondary->scratch_size_per_wave_needed); + primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted, + secondary->scratch_waves_wanted); + primary->compute_scratch_size_per_wave_needed = MAX2(primary->compute_scratch_size_per_wave_needed, + secondary->compute_scratch_size_per_wave_needed); + primary->compute_scratch_waves_wanted = MAX2(primary->compute_scratch_waves_wanted, + secondary->compute_scratch_waves_wanted); if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed) primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; @@ -3955,6 +4125,8 @@ primary->tess_rings_needed = true; if (secondary->sample_positions_needed) primary->sample_positions_needed = true; + if (secondary->gds_needed) + primary->gds_needed = true; if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) { @@ -4155,7 +4327,8 @@ continue; VkImageLayout layout = state->pass->attachments[a].final_layout; - struct radv_subpass_attachment att = { a, layout }; + VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout; + struct radv_subpass_attachment att = { a, layout, stencil_layout }; radv_handle_subpass_image_transition(cmd_buffer, att, false); } } @@ -4185,10 +4358,10 @@ radv_cmd_buffer_begin_subpass(cmd_buffer, 0); } -void radv_CmdBeginRenderPass2KHR( +void radv_CmdBeginRenderPass2( VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBeginInfo, - const VkSubpassBeginInfoKHR* pSubpassBeginInfo) + const VkSubpassBeginInfo* pSubpassBeginInfo) { radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo, pSubpassBeginInfo->contents); @@ -4205,10 +4378,10 @@ radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); } -void radv_CmdNextSubpass2KHR( +void radv_CmdNextSubpass2( VkCommandBuffer commandBuffer, - const VkSubpassBeginInfoKHR* pSubpassBeginInfo, - const VkSubpassEndInfoKHR* pSubpassEndInfo) + const VkSubpassBeginInfo* pSubpassBeginInfo, + const VkSubpassEndInfo* pSubpassEndInfo) { radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents); } @@ -4270,7 +4443,7 @@ struct radeon_cmdbuf *cs = cmd_buffer->cs; unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id; + bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id; uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr; bool predicating = cmd_buffer->state.predicating; assert(base_reg); @@ -4441,7 +4614,7 @@ { struct radv_cmd_state *state = &cmd_buffer->state; - if (!cmd_buffer->device->physical_device->has_scissor_bug) + if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) return false; if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer) @@ -4694,7 +4867,7 @@ radv_draw(cmd_buffer, &info); } -void radv_CmdDrawIndirectCountKHR( +void radv_CmdDrawIndirectCount( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -4718,7 +4891,7 @@ radv_draw(cmd_buffer, &info); } -void radv_CmdDrawIndexedIndirectCountKHR( +void radv_CmdDrawIndexedIndirectCount( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -4783,6 +4956,11 @@ ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25); + if (compute_shader->info.wave_size == 32) { + assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); + dispatch_initiator |= S_00B800_CS_W32_EN(1); + } + if (info->indirect) { uint64_t va = radv_buffer_get_va(info->indirect->bo); @@ -5042,9 +5220,9 @@ cmd_buffer->state.subpass_sample_locs = NULL; } -void radv_CmdEndRenderPass2KHR( +void radv_CmdEndRenderPass2( VkCommandBuffer commandBuffer, - const VkSubpassEndInfoKHR* pSubpassEndInfo) + const VkSubpassEndInfo* pSubpassEndInfo) { radv_CmdEndRenderPass(commandBuffer); } @@ -5058,26 +5236,26 @@ */ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - const VkImageSubresourceRange *range, - uint32_t clear_word) + const VkImageSubresourceRange *range) { assert(range->baseMipLevel == 0); assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS); VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT; struct radv_cmd_state *state = &cmd_buffer->state; + uint32_t htile_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f; VkClearDepthStencilValue value = {}; state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, clear_word); + state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value); state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; if (vk_format_is_stencil(image->vk_format)) aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects); + radv_set_ds_clear_metadata(cmd_buffer, image, range, value, aspects); if (radv_image_is_tc_compat_htile(image)) { /* Initialize the TC-compat metada value to 0 because by @@ -5085,7 +5263,7 @@ * need have to conditionally update its value when performing * a fast depth clear. */ - radv_set_tc_compat_zrange_metadata(cmd_buffer, image, 0); + radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0); } } @@ -5104,30 +5282,17 @@ return; if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { - uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f; - - if (radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, - dst_queue_mask)) { - clear_value = 0; - } - - radv_initialize_htile(cmd_buffer, image, range, clear_value); + radv_initialize_htile(cmd_buffer, image, range); } else if (!radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) && radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) { - uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f; - radv_initialize_htile(cmd_buffer, image, range, clear_value); + radv_initialize_htile(cmd_buffer, image, range); } else if (radv_layout_is_htile_compressed(image, src_layout, src_render_loop, src_queue_mask) && !radv_layout_is_htile_compressed(image, dst_layout, dst_render_loop, dst_queue_mask)) { - VkImageSubresourceRange local_range = *range; - local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - local_range.baseMipLevel = 0; - local_range.levelCount = 1; - cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - radv_decompress_depth_image_inplace(cmd_buffer, image, - &local_range, sample_locs); + radv_decompress_depth_image_inplace(cmd_buffer, image, range, + sample_locs); cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; @@ -5742,7 +5907,12 @@ sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]); sb[idx].offset = pOffsets[i]; - sb[idx].size = pSizes[i]; + + if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) { + sb[idx].size = sb[idx].buffer->size - sb[idx].offset; + } else { + sb[idx].size = pSizes[i]; + } radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo); @@ -5788,9 +5958,15 @@ (so->enabled_mask << 8) | (so->enabled_mask << 12); - if ((old_streamout_enabled != so->streamout_enabled) || - (old_hw_enabled_mask != so->hw_enabled_mask)) + if (!cmd_buffer->device->physical_device->use_ngg_streamout && + ((old_streamout_enabled != so->streamout_enabled) || + (old_hw_enabled_mask != so->hw_enabled_mask))) radv_emit_streamout_enable(cmd_buffer); + + if (cmd_buffer->device->physical_device->use_ngg_streamout) { + cmd_buffer->gds_needed = true; + cmd_buffer->gds_oa_needed = true; + } } static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) @@ -5884,6 +6060,62 @@ radv_set_streamout_enable(cmd_buffer, true); } +static void +gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + unsigned last_target = util_last_bit(so->enabled_mask) - 1; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t i; + + assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); + assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); + + /* Sync because the next streamout operation will overwrite GDS and we + * have to make sure it's idle. + * TODO: Improve by tracking if there is a streamout operation in + * flight. + */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; + si_emit_cache_flush(cmd_buffer); + + for_each_bit(i, so->enabled_mask) { + int32_t counter_buffer_idx = i - firstCounterBuffer; + if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) + counter_buffer_idx = -1; + + bool append = counter_buffer_idx >= 0 && + pCounterBuffers && pCounterBuffers[counter_buffer_idx]; + uint64_t va = 0; + + if (append) { + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); + + va += radv_buffer_get_va(buffer->bo); + va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); + } + + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | + S_411_DST_SEL(V_411_GDS) | + S_411_CP_SYNC(i == last_target)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 4 * i); /* destination in GDS */ + radeon_emit(cs, 0); + radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | + S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + } + + radv_set_streamout_enable(cmd_buffer, true); +} + void radv_CmdBeginTransformFeedbackEXT( VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, @@ -5893,9 +6125,15 @@ { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_emit_streamout_begin(cmd_buffer, - firstCounterBuffer, counterBufferCount, - pCounterBuffers, pCounterBufferOffsets); + if (cmd_buffer->device->physical_device->use_ngg_streamout) { + gfx10_emit_streamout_begin(cmd_buffer, + firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets); + } else { + radv_emit_streamout_begin(cmd_buffer, + firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets); + } } static void @@ -5950,6 +6188,47 @@ radv_set_streamout_enable(cmd_buffer, false); } +static void +gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + struct radv_streamout_state *so = &cmd_buffer->state.streamout; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t i; + + assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); + assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); + + for_each_bit(i, so->enabled_mask) { + int32_t counter_buffer_idx = i - firstCounterBuffer; + if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) + counter_buffer_idx = -1; + + if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { + /* The array of counters buffer is optional. */ + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); + uint64_t va = radv_buffer_get_va(buffer->bo); + + va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; + + si_cs_emit_write_event_eop(cs, + cmd_buffer->device->physical_device->rad_info.chip_class, + radv_cmd_buffer_uses_mec(cmd_buffer), + V_028A90_PS_DONE, 0, + EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, + va, EOP_DATA_GDS(i, 1), 0); + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); + } + } + + radv_set_streamout_enable(cmd_buffer, false); +} + void radv_CmdEndTransformFeedbackEXT( VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, @@ -5959,9 +6238,15 @@ { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_emit_streamout_end(cmd_buffer, - firstCounterBuffer, counterBufferCount, - pCounterBuffers, pCounterBufferOffsets); + if (cmd_buffer->device->physical_device->use_ngg_streamout) { + gfx10_emit_streamout_end(cmd_buffer, + firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets); + } else { + radv_emit_streamout_end(cmd_buffer, + firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets); + } } void radv_CmdDrawIndirectByteCountEXT( diff -Nru mesa-19.2.8/src/amd/vulkan/radv_constants.h mesa-20.0.8/src/amd/vulkan/radv_constants.h --- mesa-19.2.8/src/amd/vulkan/radv_constants.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_constants.h 2020-06-12 01:21:16.000000000 +0000 @@ -53,6 +53,7 @@ #define MAX_INLINE_UNIFORM_BLOCK_COUNT 64 #define NUM_DEPTH_CLEAR_PIPELINES 3 +#define NUM_DEPTH_DECOMPRESS_PIPELINES 3 /* * This is the point we switch from using CP to compute shader @@ -77,5 +78,16 @@ #define RADV_NUM_PHYSICAL_VGPRS 256 +/* Make sure everything is addressable by a signed 32-bit int, and + * our largest descriptors are 96 bytes. + */ +#define RADV_MAX_PER_SET_DESCRIPTORS ((1ull << 31 ) / 96) + +/* Our buffer size fields allow only this much */ +#define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFFull + +/* Number of invocations in each subgroup. */ +#define RADV_SUBGROUP_SIZE 64 + #endif /* RADV_CONSTANTS_H */ diff -Nru mesa-19.2.8/src/amd/vulkan/radv_debug.c mesa-20.0.8/src/amd/vulkan/radv_debug.c --- mesa-19.2.8/src/amd/vulkan/radv_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_debug.c 2020-06-12 01:21:16.000000000 +0000 @@ -145,37 +145,6 @@ fprintf(f, "\n"); } -static const char * -radv_get_descriptor_name(enum VkDescriptorType type) -{ - switch (type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - return "SAMPLER"; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - return "COMBINED_IMAGE_SAMPLER"; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - return "SAMPLED_IMAGE"; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - return "STORAGE_IMAGE"; - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - return "UNIFORM_TEXEL_BUFFER"; - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - return "STORAGE_TEXEL_BUFFER"; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - return "UNIFORM_BUFFER"; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - return "STORAGE_BUFFER"; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - return "UNIFORM_BUFFER_DYNAMIC"; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - return "STORAGE_BUFFER_DYNAMIC"; - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - return "INPUT_ATTACHMENT"; - default: - return "UNKNOWN"; - } -} - static void radv_dump_buffer_descriptor(enum chip_class chip_class, const uint32_t *desc, FILE *f) @@ -224,9 +193,10 @@ } static void -radv_dump_descriptor_set(enum chip_class chip_class, +radv_dump_descriptor_set(struct radv_device *device, struct radv_descriptor_set *set, unsigned id, FILE *f) { + enum chip_class chip_class = device->physical_device->rad_info.chip_class; const struct radv_descriptor_set_layout *layout; int i; @@ -234,52 +204,10 @@ return; layout = set->layout; - fprintf(f, "** descriptor set (%d) **\n", id); - fprintf(f, "va: 0x%"PRIx64"\n", set->va); - fprintf(f, "size: %d\n", set->size); - fprintf(f, "mapped_ptr:\n"); - - for (i = 0; i < set->size / 4; i++) { - fprintf(f, "\t[0x%x] = 0x%08x\n", i, set->mapped_ptr[i]); - } - fprintf(f, "\n"); - - fprintf(f, "\t*** layout ***\n"); - fprintf(f, "\tbinding_count: %d\n", layout->binding_count); - fprintf(f, "\tsize: %d\n", layout->size); - fprintf(f, "\tshader_stages: %x\n", layout->shader_stages); - fprintf(f, "\tdynamic_shader_stages: %x\n", - layout->dynamic_shader_stages); - fprintf(f, "\tbuffer_count: %d\n", layout->buffer_count); - fprintf(f, "\tdynamic_offset_count: %d\n", - layout->dynamic_offset_count); - fprintf(f, "\n"); - for (i = 0; i < set->layout->binding_count; i++) { uint32_t *desc = set->mapped_ptr + layout->binding[i].offset / 4; - fprintf(f, "\t\t**** binding layout (%d) ****\n", i); - fprintf(f, "\t\ttype: %s\n", - radv_get_descriptor_name(layout->binding[i].type)); - fprintf(f, "\t\tarray_size: %d\n", - layout->binding[i].array_size); - fprintf(f, "\t\toffset: %d\n", - layout->binding[i].offset); - fprintf(f, "\t\tbuffer_offset: %d\n", - layout->binding[i].buffer_offset); - fprintf(f, "\t\tdynamic_offset_offset: %d\n", - layout->binding[i].dynamic_offset_offset); - fprintf(f, "\t\tdynamic_offset_count: %d\n", - layout->binding[i].dynamic_offset_count); - fprintf(f, "\t\tsize: %d\n", - layout->binding[i].size); - fprintf(f, "\t\timmutable_samplers_offset: %d\n", - layout->binding[i].immutable_samplers_offset); - fprintf(f, "\t\timmutable_samplers_equal: %d\n", - layout->binding[i].immutable_samplers_equal); - fprintf(f, "\n"); - switch (layout->binding[i].type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: @@ -312,19 +240,17 @@ } static void -radv_dump_descriptors(struct radv_pipeline *pipeline, FILE *f) +radv_dump_descriptors(struct radv_device *device, FILE *f) { - struct radv_device *device = pipeline->device; - enum chip_class chip_class = device->physical_device->rad_info.chip_class; uint64_t *ptr = (uint64_t *)device->trace_id_ptr; int i; - fprintf(f, "List of descriptors:\n"); + fprintf(f, "Descriptors:\n"); for (i = 0; i < MAX_SETS; i++) { struct radv_descriptor_set *set = - (struct radv_descriptor_set *)ptr[i + 3]; + *(struct radv_descriptor_set **)(ptr + i + 3); - radv_dump_descriptor_set(chip_class, set, i, f); + radv_dump_descriptor_set(device, set, i, f); } } @@ -507,7 +433,7 @@ fprintf(f, "NIR:\n%s\n", shader->nir_string); } - fprintf(f, "LLVM IR:\n%s\n", shader->llvm_ir_string); + fprintf(f, "LLVM IR:\n%s\n", shader->ir_string); fprintf(f, "DISASM:\n%s\n", shader->disasm_string); radv_shader_dump_stats(pipeline->device, shader, stage, f); @@ -531,11 +457,11 @@ { radv_dump_shaders(pipeline, active_stages, f); radv_dump_annotated_shaders(pipeline, active_stages, f); - radv_dump_descriptors(pipeline, f); } static void -radv_dump_graphics_state(struct radv_pipeline *graphics_pipeline, +radv_dump_graphics_state(struct radv_device *device, + struct radv_pipeline *graphics_pipeline, struct radv_pipeline *compute_pipeline, FILE *f) { VkShaderStageFlagBits active_stages; @@ -549,10 +475,13 @@ active_stages = VK_SHADER_STAGE_COMPUTE_BIT; radv_dump_pipeline_state(compute_pipeline, active_stages, f); } + + radv_dump_descriptors(device, f); } static void -radv_dump_compute_state(struct radv_pipeline *compute_pipeline, FILE *f) +radv_dump_compute_state(struct radv_device *device, + struct radv_pipeline *compute_pipeline, FILE *f) { VkShaderStageFlagBits active_stages = VK_SHADER_STAGE_COMPUTE_BIT; @@ -560,6 +489,7 @@ return; radv_dump_pipeline_state(compute_pipeline, active_stages, f); + radv_dump_descriptors(device, f); } static struct radv_pipeline * @@ -567,7 +497,7 @@ { uint64_t *ptr = (uint64_t *)device->trace_id_ptr; - return (struct radv_pipeline *)ptr[1]; + return *(struct radv_pipeline **)(ptr + 1); } static struct radv_pipeline * @@ -575,7 +505,7 @@ { uint64_t *ptr = (uint64_t *)device->trace_id_ptr; - return (struct radv_pipeline *)ptr[2]; + return *(struct radv_pipeline **)(ptr + 2); } static void @@ -677,6 +607,8 @@ graphics_pipeline = radv_get_saved_graphics_pipeline(device); compute_pipeline = radv_get_saved_compute_pipeline(device); + radv_dump_trace(queue->device, cs); + fprintf(stderr, "GPU hang report:\n\n"); radv_dump_device_name(device, stderr); @@ -692,23 +624,26 @@ switch (ring) { case RING_GFX: - radv_dump_graphics_state(graphics_pipeline, compute_pipeline, + fprintf(stderr, "RING_GFX:\n"); + radv_dump_graphics_state(queue->device, + graphics_pipeline, compute_pipeline, stderr); break; case RING_COMPUTE: - radv_dump_compute_state(compute_pipeline, stderr); + fprintf(stderr, "RING_COMPUTE:\n"); + radv_dump_compute_state(queue->device, + compute_pipeline, stderr); break; default: assert(0); break; } - radv_dump_trace(queue->device, cs); abort(); } void -radv_print_spirv(uint32_t *data, uint32_t size, FILE *fp) +radv_print_spirv(const char *data, uint32_t size, FILE *fp) { char path[] = "/tmp/fileXXXXXX"; char line[2048], command[128]; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_debug.h mesa-20.0.8/src/amd/vulkan/radv_debug.h --- mesa-19.2.8/src/amd/vulkan/radv_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_debug.h 2020-06-12 01:21:16.000000000 +0000 @@ -26,6 +26,7 @@ #include "radv_private.h" +/* Please keep docs/envvars.html up-to-date when you add/remove options. */ enum { RADV_DEBUG_NO_FAST_CLEARS = 0x1, RADV_DEBUG_NO_DCC = 0x2, @@ -34,40 +35,43 @@ RADV_DEBUG_DUMP_SHADER_STATS = 0x10, RADV_DEBUG_NO_HIZ = 0x20, RADV_DEBUG_NO_COMPUTE_QUEUE = 0x40, - RADV_DEBUG_UNSAFE_MATH = 0x80, - RADV_DEBUG_ALL_BOS = 0x100, - RADV_DEBUG_NO_IBS = 0x200, - RADV_DEBUG_DUMP_SPIRV = 0x400, - RADV_DEBUG_VM_FAULTS = 0x800, - RADV_DEBUG_ZERO_VRAM = 0x1000, - RADV_DEBUG_SYNC_SHADERS = 0x2000, - RADV_DEBUG_NO_SISCHED = 0x4000, - RADV_DEBUG_PREOPTIR = 0x8000, - RADV_DEBUG_NO_DYNAMIC_BOUNDS = 0x10000, - RADV_DEBUG_NO_OUT_OF_ORDER = 0x20000, - RADV_DEBUG_INFO = 0x40000, - RADV_DEBUG_ERRORS = 0x80000, - RADV_DEBUG_STARTUP = 0x100000, - RADV_DEBUG_CHECKIR = 0x200000, - RADV_DEBUG_NOTHREADLLVM = 0x400000, - RADV_DEBUG_NOBINNING = 0x800000, - RADV_DEBUG_NO_LOAD_STORE_OPT = 0x1000000, - RADV_DEBUG_NO_NGG = 0x2000000, - RADV_DEBUG_NO_SHADER_BALLOT = 0x4000000, + RADV_DEBUG_ALL_BOS = 0x80, + RADV_DEBUG_NO_IBS = 0x100, + RADV_DEBUG_DUMP_SPIRV = 0x200, + RADV_DEBUG_VM_FAULTS = 0x400, + RADV_DEBUG_ZERO_VRAM = 0x800, + RADV_DEBUG_SYNC_SHADERS = 0x1000, + RADV_DEBUG_NO_SISCHED = 0x2000, + RADV_DEBUG_PREOPTIR = 0x4000, + RADV_DEBUG_NO_DYNAMIC_BOUNDS = 0x8000, + RADV_DEBUG_NO_OUT_OF_ORDER = 0x10000, + RADV_DEBUG_INFO = 0x20000, + RADV_DEBUG_ERRORS = 0x40000, + RADV_DEBUG_STARTUP = 0x80000, + RADV_DEBUG_CHECKIR = 0x100000, + RADV_DEBUG_NOTHREADLLVM = 0x200000, + RADV_DEBUG_NOBINNING = 0x400000, + RADV_DEBUG_NO_LOAD_STORE_OPT = 0x800000, + RADV_DEBUG_NO_NGG = 0x1000000, + RADV_DEBUG_NO_SHADER_BALLOT = 0x2000000, + RADV_DEBUG_ALL_ENTRYPOINTS = 0x4000000, + RADV_DEBUG_DUMP_META_SHADERS = 0x8000000, + RADV_DEBUG_NO_MEMORY_CACHE = 0x10000000, }; enum { - RADV_PERFTEST_NO_BATCHCHAIN = 0x1, - RADV_PERFTEST_SISCHED = 0x2, - RADV_PERFTEST_LOCAL_BOS = 0x4, - RADV_PERFTEST_OUT_OF_ORDER = 0x8, - RADV_PERFTEST_DCC_MSAA = 0x10, - RADV_PERFTEST_BO_LIST = 0x20, - RADV_PERFTEST_SHADER_BALLOT = 0x40, - RADV_PERFTEST_TC_COMPAT_CMASK = 0x80, - RADV_PERFTEST_CS_WAVE_32 = 0x100, - RADV_PERFTEST_PS_WAVE_32 = 0x200, - RADV_PERFTEST_GE_WAVE_32 = 0x400, + RADV_PERFTEST_NO_BATCHCHAIN = 0x1, + RADV_PERFTEST_SISCHED = 0x2, + RADV_PERFTEST_LOCAL_BOS = 0x4, + RADV_PERFTEST_DCC_MSAA = 0x8, + RADV_PERFTEST_BO_LIST = 0x10, + RADV_PERFTEST_SHADER_BALLOT = 0x20, + RADV_PERFTEST_TC_COMPAT_CMASK = 0x40, + RADV_PERFTEST_CS_WAVE_32 = 0x80, + RADV_PERFTEST_PS_WAVE_32 = 0x100, + RADV_PERFTEST_GE_WAVE_32 = 0x200, + RADV_PERFTEST_DFSM = 0x400, + RADV_PERFTEST_ACO = 0x800, }; bool @@ -77,7 +81,7 @@ radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs); void -radv_print_spirv(uint32_t *data, uint32_t size, FILE *fp); +radv_print_spirv(const char *data, uint32_t size, FILE *fp); void radv_dump_enabled_options(struct radv_device *device, FILE *f); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_descriptor_set.c mesa-20.0.8/src/amd/vulkan/radv_descriptor_set.c --- mesa-19.2.8/src/amd/vulkan/radv_descriptor_set.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_descriptor_set.c 2020-06-12 01:21:16.000000000 +0000 @@ -78,8 +78,8 @@ struct radv_descriptor_set_layout *set_layout; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO); - const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags = - vk_find_struct_const(pCreateInfo->pNext, DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT); + const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags = + vk_find_struct_const(pCreateInfo->pNext, DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); uint32_t max_binding = 0; uint32_t immutable_sampler_count = 0; @@ -150,7 +150,7 @@ for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { const VkDescriptorSetLayoutBinding *binding = bindings + j; uint32_t b = binding->binding; - uint32_t alignment; + uint32_t alignment = 0; unsigned binding_buffer_count = 0; uint32_t descriptor_count = binding->descriptorCount; bool has_ycbcr_sampler = false; @@ -214,7 +214,6 @@ descriptor_count = 1; break; default: - unreachable("unknown descriptor type\n"); break; } @@ -310,10 +309,10 @@ return; } - const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags = - vk_find_struct_const(pCreateInfo->pNext, DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT); - VkDescriptorSetVariableDescriptorCountLayoutSupportEXT *variable_count = - vk_find_struct((void*)pCreateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT_EXT); + const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags = + vk_find_struct_const(pCreateInfo->pNext, DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + VkDescriptorSetVariableDescriptorCountLayoutSupport *variable_count = + vk_find_struct((void*)pCreateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT); if (variable_count) { variable_count->maxVariableDescriptorCount = 0; } @@ -363,7 +362,6 @@ descriptor_count = 1; break; default: - unreachable("unknown descriptor type\n"); break; } @@ -512,6 +510,7 @@ } set->layout = layout; + set->buffer_count = buffer_count; uint32_t layout_size = layout->size; if (variable_count) { assert(layout->has_variable_descriptors); @@ -672,7 +671,6 @@ bo_size += pCreateInfo->pPoolSizes[i].descriptorCount; break; default: - unreachable("unknown descriptor type\n"); break; } } @@ -770,8 +768,8 @@ uint32_t i; struct radv_descriptor_set *set = NULL; - const VkDescriptorSetVariableDescriptorCountAllocateInfoEXT *variable_counts = - vk_find_struct_const(pAllocateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO_EXT); + const VkDescriptorSetVariableDescriptorCountAllocateInfo *variable_counts = + vk_find_struct_const(pAllocateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); const uint32_t zero = 0; /* allocate a set of buffers for each shader to contain descriptors */ @@ -864,7 +862,7 @@ if (device->physical_device->rad_info.chip_class >= GFX10) { dst[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1049,7 +1047,6 @@ } break; default: - unreachable("unimplemented descriptor type"); break; } ptr += binding_layout->size / 4; @@ -1153,7 +1150,18 @@ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); templ->entry_count = entry_count; - templ->bind_point = pCreateInfo->pipelineBindPoint; + + if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) { + RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->pipelineLayout); + + /* descriptorSetLayout should be ignored for push descriptors + * and instead it refers to pipelineLayout and set. + */ + assert(pCreateInfo->set < MAX_SETS); + set_layout = pipeline_layout->set[pCreateInfo->set].layout; + + templ->bind_point = pCreateInfo->pipelineBindPoint; + } for (i = 0; i < entry_count; i++) { const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i]; @@ -1291,7 +1299,6 @@ memcpy(pDst, templ->entry[i].immutable_samplers + 4 * j, 16); break; default: - unreachable("unimplemented descriptor type"); break; } pSrc += templ->entry[i].src_stride; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_device.c mesa-20.0.8/src/amd/vulkan/radv_device.c --- mesa-19.2.8/src/amd/vulkan/radv_device.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_device.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,10 +25,24 @@ * IN THE SOFTWARE. */ +#include "dirent.h" +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include #include #include +#include + #include "radv_debug.h" #include "radv_private.h" #include "radv_shader.h" @@ -46,9 +60,29 @@ #include "util/build_id.h" #include "util/debug.h" #include "util/mesa-sha1.h" +#include "util/timespec.h" +#include "util/u_atomic.h" #include "compiler/glsl_types.h" #include "util/xmlpool.h" +static struct radv_timeline_point * +radv_timeline_find_point_at_least_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p); + +static struct radv_timeline_point * +radv_timeline_add_point_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p); + +static void +radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline, + struct list_head *processing_list); + +static +void radv_destroy_semaphore_part(struct radv_device *device, + struct radv_semaphore_part *part); + static int radv_device_get_cache_uuid(enum radeon_family family, void *uuid) { @@ -83,44 +117,6 @@ ac_compute_device_uuid(info, uuid, VK_UUID_SIZE); } -static void -radv_get_device_name(enum radeon_family family, char *name, size_t name_len) -{ - const char *chip_string; - - switch (family) { - case CHIP_TAHITI: chip_string = "AMD RADV TAHITI"; break; - case CHIP_PITCAIRN: chip_string = "AMD RADV PITCAIRN"; break; - case CHIP_VERDE: chip_string = "AMD RADV CAPE VERDE"; break; - case CHIP_OLAND: chip_string = "AMD RADV OLAND"; break; - case CHIP_HAINAN: chip_string = "AMD RADV HAINAN"; break; - case CHIP_BONAIRE: chip_string = "AMD RADV BONAIRE"; break; - case CHIP_KAVERI: chip_string = "AMD RADV KAVERI"; break; - case CHIP_KABINI: chip_string = "AMD RADV KABINI"; break; - case CHIP_HAWAII: chip_string = "AMD RADV HAWAII"; break; - case CHIP_TONGA: chip_string = "AMD RADV TONGA"; break; - case CHIP_ICELAND: chip_string = "AMD RADV ICELAND"; break; - case CHIP_CARRIZO: chip_string = "AMD RADV CARRIZO"; break; - case CHIP_FIJI: chip_string = "AMD RADV FIJI"; break; - case CHIP_POLARIS10: chip_string = "AMD RADV POLARIS10"; break; - case CHIP_POLARIS11: chip_string = "AMD RADV POLARIS11"; break; - case CHIP_POLARIS12: chip_string = "AMD RADV POLARIS12"; break; - case CHIP_STONEY: chip_string = "AMD RADV STONEY"; break; - case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break; - case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break; - case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break; - case CHIP_VEGA20: chip_string = "AMD RADV VEGA20"; break; - case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break; - case CHIP_RAVEN2: chip_string = "AMD RADV RAVEN2"; break; - case CHIP_NAVI10: chip_string = "AMD RADV NAVI10"; break; - case CHIP_NAVI12: chip_string = "AMD RADV NAVI12"; break; - case CHIP_NAVI14: chip_string = "AMD RADV NAVI14"; break; - default: chip_string = "AMD RADV unknown"; break; - } - - snprintf(name, name_len, "%s (LLVM " MESA_LLVM_VERSION_STRING ")", chip_string); -} - static uint64_t radv_get_visible_vram_size(struct radv_physical_device *device) { @@ -133,6 +129,42 @@ return device->rad_info.vram_size - radv_get_visible_vram_size(device); } +static bool +radv_is_mem_type_vram(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM || + type == RADV_MEM_TYPE_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_vram_visible(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM_CPU_ACCESS || + type == RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED; +} +static bool +radv_is_mem_type_gtt_wc(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_GTT_WRITE_COMBINE || + type == RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_gtt_cached(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_GTT_CACHED || + type == RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; +} + +static bool +radv_is_mem_type_uncached(enum radv_mem_type type) +{ + return type == RADV_MEM_TYPE_VRAM_UNCACHED || + type == RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED || + type == RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED || + type == RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; +} + static void radv_physical_device_init_mem_types(struct radv_physical_device *device) { @@ -213,6 +245,46 @@ }; } device->memory_properties.memoryTypeCount = type_count; + + if (device->rad_info.has_l2_uncached) { + for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) { + VkMemoryType mem_type = device->memory_properties.memoryTypes[i]; + + if ((mem_type.propertyFlags & (VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) || + mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) { + enum radv_mem_type mem_type_id; + + switch (device->mem_type_indices[i]) { + case RADV_MEM_TYPE_VRAM: + mem_type_id = RADV_MEM_TYPE_VRAM_UNCACHED; + break; + case RADV_MEM_TYPE_VRAM_CPU_ACCESS: + mem_type_id = RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED; + break; + case RADV_MEM_TYPE_GTT_WRITE_COMBINE: + mem_type_id = RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED; + break; + case RADV_MEM_TYPE_GTT_CACHED: + mem_type_id = RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED; + break; + default: + unreachable("invalid memory type"); + } + + VkMemoryPropertyFlags property_flags = mem_type.propertyFlags | + VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | + VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD; + + device->mem_type_indices[type_count] = mem_type_id; + device->memory_properties.memoryTypes[type_count++] = (VkMemoryType) { + .propertyFlags = property_flags, + .heapIndex = mem_type.heapIndex, + }; + } + } + device->memory_properties.memoryTypeCount = type_count; + } } static void @@ -298,7 +370,7 @@ device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags, instance->perftest_flags); if (!device->ws) { - result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + result = vk_error(instance, VK_ERROR_INITIALIZATION_FAILED); goto fail; } @@ -325,7 +397,11 @@ radv_handle_env_var_force_family(device); - radv_get_device_name(device->rad_info.family, device->name, sizeof(device->name)); + device->use_aco = instance->perftest_flags & RADV_PERFTEST_ACO; + + snprintf(device->name, sizeof(device->name), + "AMD RADV%s %s (LLVM " MESA_LLVM_VERSION_STRING ")", device->use_aco ? "/ACO" : "", + device->rad_info.name); if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) { device->ws->destroy(device->ws); @@ -337,7 +413,7 @@ /* These flags affect shader compilation. */ uint64_t shader_env_flags = (device->instance->perftest_flags & RADV_PERFTEST_SISCHED ? 0x1 : 0) | - (device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH ? 0x2 : 0); + (device->use_aco ? 0x2 : 0); /* The gpu id is already embedded in the uuid so we just pass "radv" * when creating the cache. @@ -346,56 +422,30 @@ disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2); device->disk_cache = disk_cache_create(device->name, buf, shader_env_flags); - if (device->rad_info.chip_class < GFX8 || - device->rad_info.chip_class > GFX9) + if (device->rad_info.chip_class < GFX8) fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); radv_get_driver_uuid(&device->driver_uuid); radv_get_device_uuid(&device->rad_info, &device->device_uuid); - if (device->rad_info.family == CHIP_STONEY || - device->rad_info.chip_class >= GFX9) { - device->has_rbplus = true; - device->rbplus_allowed = device->rad_info.family == CHIP_STONEY || - device->rad_info.family == CHIP_VEGA12 || - device->rad_info.family == CHIP_RAVEN || - device->rad_info.family == CHIP_RAVEN2 || - device->rad_info.family == CHIP_RENOIR; - } - - /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs - * on GFX6. - */ - device->has_clear_state = device->rad_info.chip_class >= GFX7; - - device->cpdma_prefetch_writes_memory = device->rad_info.chip_class <= GFX8; - - /* Vega10/Raven need a special workaround for a hardware bug. */ - device->has_scissor_bug = device->rad_info.family == CHIP_VEGA10 || - device->rad_info.family == CHIP_RAVEN; - - device->has_tc_compat_zrange_bug = device->rad_info.chip_class < GFX10; - - /* Out-of-order primitive rasterization. */ - device->has_out_of_order_rast = device->rad_info.chip_class >= GFX8 && - device->rad_info.max_se >= 2; - device->out_of_order_rast_allowed = device->has_out_of_order_rast && + device->out_of_order_rast_allowed = device->rad_info.has_out_of_order_rast && !(device->instance->debug_flags & RADV_DEBUG_NO_OUT_OF_ORDER); device->dcc_msaa_allowed = (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA); - /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ - device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 || - (device->rad_info.chip_class >= GFX8 && - device->rad_info.me_fw_feature >= 41); - - device->has_dcc_constant_encode = device->rad_info.family == CHIP_RAVEN2 || - device->rad_info.family == CHIP_RENOIR || - device->rad_info.chip_class >= GFX10; + device->use_shader_ballot = (device->use_aco && device->rad_info.chip_class >= GFX8) || + (device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT); + + device->use_ngg = device->rad_info.chip_class >= GFX10 && + device->rad_info.family != CHIP_NAVI14 && + !(device->instance->debug_flags & RADV_DEBUG_NO_NGG); + if (device->use_aco && device->use_ngg) { + fprintf(stderr, "WARNING: disabling NGG because ACO is used.\n"); + device->use_ngg = false; + } - device->use_shader_ballot = device->rad_info.chip_class >= GFX8 && - device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT; + device->use_ngg_streamout = false; /* Determine the number of threads per wave for all stages. */ device->cs_wave_size = 64; @@ -488,7 +538,6 @@ {"shaderstats", RADV_DEBUG_DUMP_SHADER_STATS}, {"nohiz", RADV_DEBUG_NO_HIZ}, {"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE}, - {"unsafemath", RADV_DEBUG_UNSAFE_MATH}, {"allbos", RADV_DEBUG_ALL_BOS}, {"noibs", RADV_DEBUG_NO_IBS}, {"spirv", RADV_DEBUG_DUMP_SPIRV}, @@ -508,6 +557,9 @@ {"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT}, {"nongg", RADV_DEBUG_NO_NGG}, {"noshaderballot", RADV_DEBUG_NO_SHADER_BALLOT}, + {"allentrypoints", RADV_DEBUG_ALL_ENTRYPOINTS}, + {"metashaders", RADV_DEBUG_DUMP_META_SHADERS}, + {"nomemorycache", RADV_DEBUG_NO_MEMORY_CACHE}, {NULL, 0} }; @@ -529,6 +581,8 @@ {"cswave32", RADV_PERFTEST_CS_WAVE_32}, {"pswave32", RADV_PERFTEST_PS_WAVE_32}, {"gewave32", RADV_PERFTEST_GE_WAVE_32}, + {"dfsm", RADV_PERFTEST_DFSM}, + {"aco", RADV_PERFTEST_ACO}, {NULL, 0} }; @@ -544,44 +598,49 @@ const VkApplicationInfo *info) { const char *name = info ? info->pApplicationName : NULL; + const char *engine_name = info ? info->pEngineName : NULL; - if (!name) - return; - - if (!strcmp(name, "Talos - Linux - 32bit") || - !strcmp(name, "Talos - Linux - 64bit")) { - if (!(instance->debug_flags & RADV_DEBUG_NO_SISCHED)) { - /* Force enable LLVM sisched for Talos because it looks - * safe and it gives few more FPS. + if (name) { + if (!strcmp(name, "DOOM_VFR")) { + /* Work around a Doom VFR game bug */ + instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS; + } else if (!strcmp(name, "MonsterHunterWorld.exe")) { + /* Workaround for a WaW hazard when LLVM moves/merges + * load/store memory operations. + * See https://reviews.llvm.org/D61313 + */ + if (LLVM_VERSION_MAJOR < 9) + instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT; + } else if (!strcmp(name, "Wolfenstein: Youngblood")) { + if (!(instance->debug_flags & RADV_DEBUG_NO_SHADER_BALLOT) && + !(instance->perftest_flags & RADV_PERFTEST_ACO)) { + /* Force enable VK_AMD_shader_ballot because it looks + * safe and it gives a nice boost (+20% on Vega 56 at + * this time). It also prevents corruption on LLVM. + */ + instance->perftest_flags |= RADV_PERFTEST_SHADER_BALLOT; + } + } else if (!strcmp(name, "Fledge")) { + /* + * Zero VRAM for "The Surge 2" + * + * This avoid a hang when when rendering any level. Likely + * uninitialized data in an indirect draw. */ - instance->perftest_flags |= RADV_PERFTEST_SISCHED; + instance->debug_flags |= RADV_DEBUG_ZERO_VRAM; + } else if (!strcmp(name, "DOOMEternal")) { + /* Zero VRAM for Doom Eternal to fix rendering issues. */ + instance->debug_flags |= RADV_DEBUG_ZERO_VRAM; } - } else if (!strcmp(name, "DOOM_VFR")) { - /* Work around a Doom VFR game bug */ - instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS; - } else if (!strcmp(name, "MonsterHunterWorld.exe")) { - /* Workaround for a WaW hazard when LLVM moves/merges - * load/store memory operations. - * See https://reviews.llvm.org/D61313 - */ - if (HAVE_LLVM < 0x900) - instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT; - } else if (!strcmp(name, "Wolfenstein: Youngblood")) { - if (!(instance->debug_flags & RADV_DEBUG_NO_SHADER_BALLOT)) { - /* Force enable VK_AMD_shader_ballot because it looks - * safe and it gives a nice boost (+20% on Vega 56 at - * this time). + } + + if (engine_name) { + if (!strcmp(engine_name, "vkd3d")) { + /* Zero VRAM for all VKD3D (DX12->VK) games to fix + * rendering issues. */ - instance->perftest_flags |= RADV_PERFTEST_SHADER_BALLOT; + instance->debug_flags |= RADV_DEBUG_ZERO_VRAM; } - } else if (!strcmp(name, "Fledge")) { - /* - * Zero VRAM for "The Surge 2" - * - * This avoid a hang when when rendering any level. Likely - * uninitialized data in an indirect draw. - */ - instance->debug_flags |= RADV_DEBUG_ZERO_VRAM; } } @@ -601,6 +660,10 @@ DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) DRI_CONF_VK_X11_STRICT_IMAGE_COUNT("false") DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG + DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST("false") + DRI_CONF_SECTION_END DRI_CONF_END; static void radv_init_dri_options(struct radv_instance *instance) @@ -653,12 +716,24 @@ instance->apiVersion = client_version; instance->physicalDeviceCount = -1; + /* Get secure compile thread count. NOTE: We cap this at 32 */ +#define MAX_SC_PROCS 32 + char *num_sc_threads = getenv("RADV_SECURE_COMPILE_THREADS"); + if (num_sc_threads) + instance->num_sc_threads = MIN2(strtoul(num_sc_threads, NULL, 10), MAX_SC_PROCS); + instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"), radv_debug_options); + /* Disable memory cache when secure compile is set */ + if (radv_device_use_secure_compile(instance)) + instance->debug_flags |= RADV_DEBUG_NO_MEMORY_CACHE; + instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"), radv_perftest_options); + if (instance->perftest_flags & RADV_PERFTEST_ACO) + fprintf(stderr, "WARNING: Experimental compiler backend enabled. Here be dragons! Incorrect rendering, GPU hangs and/or resets are likely\n"); if (instance->debug_flags & RADV_DEBUG_STARTUP) radv_logi("Created an instance"); @@ -729,7 +804,7 @@ { /* TODO: Check for more devices ? */ drmDevicePtr devices[8]; - VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER; + VkResult result = VK_SUCCESS; int max_devices; instance->physicalDeviceCount = 0; @@ -740,7 +815,7 @@ radv_logi("Found %d drm nodes", max_devices); if (max_devices < 1) - return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + return vk_error(instance, VK_SUCCESS); for (unsigned i = 0; i < (unsigned)max_devices; i++) { if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER && @@ -751,14 +826,22 @@ instance->physicalDeviceCount, instance, devices[i]); - if (result == VK_SUCCESS) - ++instance->physicalDeviceCount; - else if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + /* Incompatible DRM device, skip. */ + if (result == VK_ERROR_INCOMPATIBLE_DRIVER) { + result = VK_SUCCESS; + continue; + } + + /* Error creating the physical device, report the error. */ + if (result != VK_SUCCESS) break; + + ++instance->physicalDeviceCount; } } drmFreeDevices(devices, max_devices); + /* If we successfully enumerated any devices, call it success */ return result; } @@ -772,8 +855,7 @@ if (instance->physicalDeviceCount < 0) { result = radv_enumerate_devices(instance); - if (result != VK_SUCCESS && - result != VK_ERROR_INCOMPATIBLE_DRIVER) + if (result != VK_SUCCESS) return result; } @@ -799,8 +881,7 @@ if (instance->physicalDeviceCount < 0) { result = radv_enumerate_devices(instance); - if (result != VK_SUCCESS && - result != VK_ERROR_INCOMPATIBLE_DRIVER) + if (result != VK_SUCCESS) return result; } @@ -867,7 +948,7 @@ .shaderCullDistance = true, .shaderFloat64 = true, .shaderInt64 = true, - .shaderInt16 = pdevice->rad_info.chip_class >= GFX9, + .shaderInt16 = pdevice->rad_info.chip_class >= GFX9 && !pdevice->use_aco, .sparseBinding = true, .variableMultisampleRate = true, .inheritedQueries = true, @@ -909,11 +990,10 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { VkPhysicalDevice16BitStorageFeatures *features = (VkPhysicalDevice16BitStorageFeatures*)ext; - bool enabled = pdevice->rad_info.chip_class >= GFX8; - features->storageBuffer16BitAccess = enabled; - features->uniformAndStorageBuffer16BitAccess = enabled; - features->storagePushConstant16 = enabled; - features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900; + features->storageBuffer16BitAccess = !pdevice->use_aco; + features->uniformAndStorageBuffer16BitAccess = !pdevice->use_aco; + features->storagePushConstant16 = !pdevice->use_aco; + features->storageInputOutput16 = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco && LLVM_VERSION_MAJOR >= 9; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { @@ -922,9 +1002,9 @@ features->samplerYcbcrConversion = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: { - VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features = - (VkPhysicalDeviceDescriptorIndexingFeaturesEXT*)ext; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES: { + VkPhysicalDeviceDescriptorIndexingFeatures *features = + (VkPhysicalDeviceDescriptorIndexingFeatures*)ext; features->shaderInputAttachmentArrayDynamicIndexing = true; features->shaderUniformTexelBufferArrayDynamicIndexing = true; features->shaderStorageTexelBufferArrayDynamicIndexing = true; @@ -957,27 +1037,27 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext; - features->vertexAttributeInstanceRateDivisor = VK_TRUE; - features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE; + features->vertexAttributeInstanceRateDivisor = true; + features->vertexAttributeInstanceRateZeroDivisor = true; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: { VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = (VkPhysicalDeviceTransformFeedbackFeaturesEXT*)ext; features->transformFeedback = true; - features->geometryStreams = true; + features->geometryStreams = !pdevice->use_ngg_streamout; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT: { - VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *features = - (VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *)ext; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES: { + VkPhysicalDeviceScalarBlockLayoutFeatures *features = + (VkPhysicalDeviceScalarBlockLayoutFeatures *)ext; features->scalarBlockLayout = pdevice->rad_info.chip_class >= GFX7; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT: { VkPhysicalDeviceMemoryPriorityFeaturesEXT *features = (VkPhysicalDeviceMemoryPriorityFeaturesEXT *)ext; - features->memoryPriority = VK_TRUE; + features->memoryPriority = true; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: { @@ -988,39 +1068,52 @@ features->bufferDeviceAddressMultiDevice = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES: { + VkPhysicalDeviceBufferDeviceAddressFeatures *features = + (VkPhysicalDeviceBufferDeviceAddressFeatures *)ext; + features->bufferDeviceAddress = true; + features->bufferDeviceAddressCaptureReplay = false; + features->bufferDeviceAddressMultiDevice = false; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: { VkPhysicalDeviceDepthClipEnableFeaturesEXT *features = (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext; features->depthClipEnable = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT: { - VkPhysicalDeviceHostQueryResetFeaturesEXT *features = - (VkPhysicalDeviceHostQueryResetFeaturesEXT *)ext; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES: { + VkPhysicalDeviceHostQueryResetFeatures *features = + (VkPhysicalDeviceHostQueryResetFeatures *)ext; features->hostQueryReset = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { - VkPhysicalDevice8BitStorageFeaturesKHR *features = - (VkPhysicalDevice8BitStorageFeaturesKHR*)ext; - bool enabled = pdevice->rad_info.chip_class >= GFX8; - features->storageBuffer8BitAccess = enabled; - features->uniformAndStorageBuffer8BitAccess = enabled; - features->storagePushConstant8 = enabled; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { - VkPhysicalDeviceFloat16Int8FeaturesKHR *features = - (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext; - features->shaderFloat16 = pdevice->rad_info.chip_class >= GFX8 && HAVE_LLVM >= 0x0800; - features->shaderInt8 = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR: { - VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *features = - (VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *)ext; - features->shaderBufferInt64Atomics = HAVE_LLVM >= 0x0900; - features->shaderSharedInt64Atomics = HAVE_LLVM >= 0x0900; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES: { + VkPhysicalDevice8BitStorageFeatures *features = + (VkPhysicalDevice8BitStorageFeatures *)ext; + features->storageBuffer8BitAccess = !pdevice->use_aco; + features->uniformAndStorageBuffer8BitAccess = !pdevice->use_aco; + features->storagePushConstant8 = !pdevice->use_aco; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES: { + VkPhysicalDeviceShaderFloat16Int8Features *features = + (VkPhysicalDeviceShaderFloat16Int8Features*)ext; + features->shaderFloat16 = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco; + features->shaderInt8 = !pdevice->use_aco; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES: { + VkPhysicalDeviceShaderAtomicInt64Features *features = + (VkPhysicalDeviceShaderAtomicInt64Features *)ext; + features->shaderBufferInt64Atomics = LLVM_VERSION_MAJOR >= 9; + features->shaderSharedInt64Atomics = LLVM_VERSION_MAJOR >= 9; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT: { + VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *features = + (VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *)ext; + features->shaderDemoteToHelperInvocation = pdevice->use_aco; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: { @@ -1044,9 +1137,9 @@ features->ycbcrImageArrays = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: { - VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features = - (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES: { + VkPhysicalDeviceUniformBufferStandardLayoutFeatures *features = + (VkPhysicalDeviceUniformBufferStandardLayoutFeatures *)ext; features->uniformBufferStandardLayout = true; break; } @@ -1056,9 +1149,9 @@ features->indexTypeUint8 = pdevice->rad_info.chip_class >= GFX8; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR: { - VkPhysicalDeviceImagelessFramebufferFeaturesKHR *features = - (VkPhysicalDeviceImagelessFramebufferFeaturesKHR *)ext; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES: { + VkPhysicalDeviceImagelessFramebufferFeatures *features = + (VkPhysicalDeviceImagelessFramebufferFeatures *)ext; features->imagelessFramebuffer = true; break; } @@ -1068,6 +1161,119 @@ features->pipelineExecutableInfo = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: { + VkPhysicalDeviceShaderClockFeaturesKHR *features = + (VkPhysicalDeviceShaderClockFeaturesKHR *)ext; + features->shaderSubgroupClock = true; + features->shaderDeviceClock = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features = + (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext; + features->texelBufferAlignment = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: { + VkPhysicalDeviceTimelineSemaphoreFeatures *features = + (VkPhysicalDeviceTimelineSemaphoreFeatures *) ext; + features->timelineSemaphore = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: { + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *features = + (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *)ext; + features->subgroupSizeControl = true; + features->computeFullSubgroups = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD: { + VkPhysicalDeviceCoherentMemoryFeaturesAMD *features = + (VkPhysicalDeviceCoherentMemoryFeaturesAMD *)ext; + features->deviceCoherentMemory = pdevice->rad_info.has_l2_uncached; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES: { + VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures *features = + (VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures *)ext; + features->shaderSubgroupExtendedTypes = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR: { + VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *features = + (VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *)ext; + features->separateDepthStencilLayouts = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { + VkPhysicalDeviceVulkan11Features *features = + (VkPhysicalDeviceVulkan11Features *)ext; + features->storageBuffer16BitAccess = !pdevice->use_aco; + features->uniformAndStorageBuffer16BitAccess = !pdevice->use_aco; + features->storagePushConstant16 = !pdevice->use_aco; + features->storageInputOutput16 = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco && LLVM_VERSION_MAJOR >= 9; + features->multiview = true; + features->multiviewGeometryShader = true; + features->multiviewTessellationShader = true; + features->variablePointersStorageBuffer = true; + features->variablePointers = true; + features->protectedMemory = false; + features->samplerYcbcrConversion = true; + features->shaderDrawParameters = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES: { + VkPhysicalDeviceVulkan12Features *features = + (VkPhysicalDeviceVulkan12Features *)ext; + features->samplerMirrorClampToEdge = true; + features->drawIndirectCount = true; + features->storageBuffer8BitAccess = !pdevice->use_aco; + features->uniformAndStorageBuffer8BitAccess = !pdevice->use_aco; + features->storagePushConstant8 = !pdevice->use_aco; + features->shaderBufferInt64Atomics = LLVM_VERSION_MAJOR >= 9; + features->shaderSharedInt64Atomics = LLVM_VERSION_MAJOR >= 9; + features->shaderFloat16 = pdevice->rad_info.chip_class >= GFX8 && !pdevice->use_aco; + features->shaderInt8 = !pdevice->use_aco; + features->descriptorIndexing = true; + features->shaderInputAttachmentArrayDynamicIndexing = true; + features->shaderUniformTexelBufferArrayDynamicIndexing = true; + features->shaderStorageTexelBufferArrayDynamicIndexing = true; + features->shaderUniformBufferArrayNonUniformIndexing = true; + features->shaderSampledImageArrayNonUniformIndexing = true; + features->shaderStorageBufferArrayNonUniformIndexing = true; + features->shaderStorageImageArrayNonUniformIndexing = true; + features->shaderInputAttachmentArrayNonUniformIndexing = true; + features->shaderUniformTexelBufferArrayNonUniformIndexing = true; + features->shaderStorageTexelBufferArrayNonUniformIndexing = true; + features->descriptorBindingUniformBufferUpdateAfterBind = true; + features->descriptorBindingSampledImageUpdateAfterBind = true; + features->descriptorBindingStorageImageUpdateAfterBind = true; + features->descriptorBindingStorageBufferUpdateAfterBind = true; + features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; + features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; + features->descriptorBindingUpdateUnusedWhilePending = true; + features->descriptorBindingPartiallyBound = true; + features->descriptorBindingVariableDescriptorCount = true; + features->runtimeDescriptorArray = true; + features->samplerFilterMinmax = pdevice->rad_info.chip_class >= GFX7; + features->scalarBlockLayout = pdevice->rad_info.chip_class >= GFX7; + features->imagelessFramebuffer = true; + features->uniformBufferStandardLayout = true; + features->shaderSubgroupExtendedTypes = true; + features->separateDepthStencilLayouts = true; + features->hostQueryReset = true; + features->timelineSemaphore = pdevice->rad_info.has_syncobj_wait_for_submit; + features->bufferDeviceAddress = true; + features->bufferDeviceAddressCaptureReplay = false; + features->bufferDeviceAddressMultiDevice = false; + features->vulkanMemoryModel = false; + features->vulkanMemoryModelDeviceScope = false; + features->vulkanMemoryModelAvailabilityVisibilityChains = false; + features->shaderOutputViewportIndex = true; + features->shaderOutputLayer = true; + features->subgroupBroadcastDynamicId = true; + break; + } default: break; } @@ -1156,11 +1362,11 @@ .maxFragmentCombinedOutputResources = 8, .maxComputeSharedMemorySize = 32768, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, - .maxComputeWorkGroupInvocations = 2048, + .maxComputeWorkGroupInvocations = 1024, .maxComputeWorkGroupSize = { - 2048, - 2048, - 2048 + 1024, + 1024, + 1024 }, .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 8, @@ -1193,7 +1399,7 @@ .framebufferNoAttachmentsSampleCounts = sample_counts, .maxColorAttachments = MAX_RTS, .sampledImageColorSampleCounts = sample_counts, - .sampledImageIntegerSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .sampledImageIntegerSampleCounts = sample_counts, .sampledImageDepthSampleCounts = sample_counts, .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = pdevice->rad_info.chip_class >= GFX8 ? sample_counts : VK_SAMPLE_COUNT_1_BIT, @@ -1229,6 +1435,149 @@ memcpy(pProperties->pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE); } +static void +radv_get_physical_device_properties_1_1(struct radv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES); + + memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + memset(p->deviceLUID, 0, VK_LUID_SIZE); + /* The LUID is for Windows. */ + p->deviceLUIDValid = false; + p->deviceNodeMask = 0; + + p->subgroupSize = RADV_SUBGROUP_SIZE; + p->subgroupSupportedStages = VK_SHADER_STAGE_ALL; + p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + + if (((pdevice->rad_info.chip_class == GFX6 || + pdevice->rad_info.chip_class == GFX7) && !pdevice->use_aco) || + pdevice->rad_info.chip_class >= GFX8) { + p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT; + } + p->subgroupQuadOperationsInAllStages = true; + + p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; + p->maxMultiviewViewCount = MAX_VIEWS; + p->maxMultiviewInstanceIndex = INT_MAX; + p->protectedNoFault = false; + p->maxPerSetDescriptors = RADV_MAX_PER_SET_DESCRIPTORS; + p->maxMemoryAllocationSize = RADV_MAX_MEMORY_ALLOCATION_SIZE; +} + +static void +radv_get_physical_device_properties_1_2(struct radv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES); + + p->driverID = VK_DRIVER_ID_MESA_RADV; + snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE, "radv"); + snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1 + " (LLVM " MESA_LLVM_VERSION_STRING ")"); + p->conformanceVersion = (VkConformanceVersion) { + .major = 1, + .minor = 2, + .subminor = 0, + .patch = 0, + }; + + /* On AMD hardware, denormals and rounding modes for fp16/fp64 are + * controlled by the same config register. + */ + p->denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR; + p->roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY_KHR; + + /* Do not allow both preserving and flushing denorms because different + * shaders in the same pipeline can have different settings and this + * won't work for merged shaders. To make it work, this requires LLVM + * support for changing the register. The same logic applies for the + * rounding modes because they are configured with the same config + * register. TODO: we can enable a lot of these for ACO when it + * supports all stages. + */ + p->shaderDenormFlushToZeroFloat32 = true; + p->shaderDenormPreserveFloat32 = false; + p->shaderRoundingModeRTEFloat32 = true; + p->shaderRoundingModeRTZFloat32 = false; + p->shaderSignedZeroInfNanPreserveFloat32 = true; + + p->shaderDenormFlushToZeroFloat16 = false; + p->shaderDenormPreserveFloat16 = pdevice->rad_info.chip_class >= GFX8; + p->shaderRoundingModeRTEFloat16 = pdevice->rad_info.chip_class >= GFX8; + p->shaderRoundingModeRTZFloat16 = false; + p->shaderSignedZeroInfNanPreserveFloat16 = pdevice->rad_info.chip_class >= GFX8; + + p->shaderDenormFlushToZeroFloat64 = false; + p->shaderDenormPreserveFloat64 = pdevice->rad_info.chip_class >= GFX8; + p->shaderRoundingModeRTEFloat64 = pdevice->rad_info.chip_class >= GFX8; + p->shaderRoundingModeRTZFloat64 = false; + p->shaderSignedZeroInfNanPreserveFloat64 = pdevice->rad_info.chip_class >= GFX8; + + p->maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX / 64; + p->shaderUniformBufferArrayNonUniformIndexingNative = false; + p->shaderSampledImageArrayNonUniformIndexingNative = false; + p->shaderStorageBufferArrayNonUniformIndexingNative = false; + p->shaderStorageImageArrayNonUniformIndexingNative = false; + p->shaderInputAttachmentArrayNonUniformIndexingNative = false; + p->robustBufferAccessUpdateAfterBind = false; + p->quadDivergentImplicitLod = false; + + size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS - + MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) / + (32 /* uniform buffer, 32 due to potential space wasted on alignment */ + + 32 /* storage buffer, 32 due to potential space wasted on alignment */ + + 32 /* sampler, largest when combined with image */ + + 64 /* sampled image */ + + 64 /* storage image */); + p->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size; + p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size; + p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size; + p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size; + p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size; + p->maxPerStageDescriptorUpdateAfterBindInputAttachments = max_descriptor_set_size; + p->maxPerStageUpdateAfterBindResources = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS; + p->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS; + p->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size; + p->maxDescriptorSetUpdateAfterBindInputAttachments = max_descriptor_set_size; + + /* We support all of the depth resolve modes */ + p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | + VK_RESOLVE_MODE_AVERAGE_BIT_KHR | + VK_RESOLVE_MODE_MIN_BIT_KHR | + VK_RESOLVE_MODE_MAX_BIT_KHR; + + /* Average doesn't make sense for stencil so we don't support that */ + p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | + VK_RESOLVE_MODE_MIN_BIT_KHR | + VK_RESOLVE_MODE_MAX_BIT_KHR; + + p->independentResolveNone = true; + p->independentResolve = true; + + /* GFX6-8 only support single channel min/max filter. */ + p->filterMinmaxImageComponentMapping = pdevice->rad_info.chip_class >= GFX9; + p->filterMinmaxSingleComponentFormats = true; + + p->maxTimelineSemaphoreValueDifference = UINT64_MAX; + + p->framebufferIntegerColorSampleCounts = VK_SAMPLE_COUNT_1_BIT; +} + void radv_GetPhysicalDeviceProperties2( VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2 *pProperties) @@ -1236,6 +1585,23 @@ RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); radv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + VkPhysicalDeviceVulkan11Properties core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES, + }; + radv_get_physical_device_properties_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Properties core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES, + }; + radv_get_physical_device_properties_1_2(pdevice, &core_1_2); + +#define CORE_RENAMED_PROPERTY(major, minor, ext_property, core_property) \ + memcpy(&properties->ext_property, &core_##major##_##minor.core_property, \ + sizeof(core_##major##_##minor.core_property)) + +#define CORE_PROPERTY(major, minor, property) \ + CORE_RENAMED_PROPERTY(major, minor, property, property) + vk_foreach_struct(ext, pProperties->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { @@ -1246,21 +1612,22 @@ } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { VkPhysicalDeviceIDProperties *properties = (VkPhysicalDeviceIDProperties*)ext; - memcpy(properties->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); - memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); - properties->deviceLUIDValid = false; + CORE_PROPERTY(1, 1, deviceUUID); + CORE_PROPERTY(1, 1, driverUUID); + CORE_PROPERTY(1, 1, deviceLUID); + CORE_PROPERTY(1, 1, deviceLUIDValid); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { VkPhysicalDeviceMultiviewProperties *properties = (VkPhysicalDeviceMultiviewProperties*)ext; - properties->maxMultiviewViewCount = MAX_VIEWS; - properties->maxMultiviewInstanceIndex = INT_MAX; + CORE_PROPERTY(1, 1, maxMultiviewViewCount); + CORE_PROPERTY(1, 1, maxMultiviewInstanceIndex); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { VkPhysicalDevicePointClippingProperties *properties = (VkPhysicalDevicePointClippingProperties*)ext; - properties->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; + CORE_PROPERTY(1, 1, pointClippingBehavior); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DISCARD_RECTANGLE_PROPERTIES_EXT: { @@ -1278,38 +1645,27 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { VkPhysicalDeviceSubgroupProperties *properties = (VkPhysicalDeviceSubgroupProperties*)ext; - properties->subgroupSize = 64; - properties->supportedStages = VK_SHADER_STAGE_ALL; - properties->supportedOperations = - VK_SUBGROUP_FEATURE_BASIC_BIT | - VK_SUBGROUP_FEATURE_BALLOT_BIT | - VK_SUBGROUP_FEATURE_QUAD_BIT | - VK_SUBGROUP_FEATURE_VOTE_BIT; - if (pdevice->rad_info.chip_class >= GFX8) { - properties->supportedOperations |= - VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT; - } - properties->quadOperationsInAllStages = true; + CORE_PROPERTY(1, 1, subgroupSize); + CORE_RENAMED_PROPERTY(1, 1, supportedStages, + subgroupSupportedStages); + CORE_RENAMED_PROPERTY(1, 1, supportedOperations, + subgroupSupportedOperations); + CORE_RENAMED_PROPERTY(1, 1, quadOperationsInAllStages, + subgroupQuadOperationsInAllStages); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { VkPhysicalDeviceMaintenance3Properties *properties = (VkPhysicalDeviceMaintenance3Properties*)ext; - /* Make sure everything is addressable by a signed 32-bit int, and - * our largest descriptors are 96 bytes. */ - properties->maxPerSetDescriptors = (1ull << 31) / 96; - /* Our buffer size fields allow only this much */ - properties->maxMemoryAllocationSize = 0xFFFFFFFFull; + CORE_PROPERTY(1, 1, maxPerSetDescriptors); + CORE_PROPERTY(1, 1, maxMemoryAllocationSize); break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: { - VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *properties = - (VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *)ext; - /* GFX6-8 only support single channel min/max filter. */ - properties->filterMinmaxImageComponentMapping = pdevice->rad_info.chip_class >= GFX9; - properties->filterMinmaxSingleComponentFormats = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES: { + VkPhysicalDeviceSamplerFilterMinmaxProperties *properties = + (VkPhysicalDeviceSamplerFilterMinmaxProperties *)ext; + CORE_PROPERTY(1, 2, filterMinmaxImageComponentMapping); + CORE_PROPERTY(1, 2, filterMinmaxSingleComponentFormats); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_AMD: { @@ -1335,7 +1691,7 @@ /* SGPR. */ properties->sgprsPerSimd = - ac_get_num_physical_sgprs(&pdevice->rad_info); + pdevice->rad_info.num_physical_sgprs_per_simd; properties->minSgprAllocation = pdevice->rad_info.chip_class >= GFX8 ? 16 : 8; properties->maxSgprAllocation = @@ -1351,46 +1707,53 @@ properties->vgprAllocationGranularity = 4; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_2_AMD: { + VkPhysicalDeviceShaderCoreProperties2AMD *properties = + (VkPhysicalDeviceShaderCoreProperties2AMD *)ext; + + properties->shaderCoreFeatures = 0; + properties->activeComputeUnitCount = + pdevice->rad_info.num_good_compute_units; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *properties = (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; properties->maxVertexAttribDivisor = UINT32_MAX; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: { - VkPhysicalDeviceDescriptorIndexingPropertiesEXT *properties = - (VkPhysicalDeviceDescriptorIndexingPropertiesEXT*)ext; - properties->maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX / 64; - properties->shaderUniformBufferArrayNonUniformIndexingNative = false; - properties->shaderSampledImageArrayNonUniformIndexingNative = false; - properties->shaderStorageBufferArrayNonUniformIndexingNative = false; - properties->shaderStorageImageArrayNonUniformIndexingNative = false; - properties->shaderInputAttachmentArrayNonUniformIndexingNative = false; - properties->robustBufferAccessUpdateAfterBind = false; - properties->quadDivergentImplicitLod = false; - - size_t max_descriptor_set_size = radv_max_descriptor_set_size(); - properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size; - properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size; - properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size; - properties->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size; - properties->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size; - properties->maxPerStageDescriptorUpdateAfterBindInputAttachments = max_descriptor_set_size; - properties->maxPerStageUpdateAfterBindResources = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS; - properties->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS; - properties->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size; - properties->maxDescriptorSetUpdateAfterBindInputAttachments = max_descriptor_set_size; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES: { + VkPhysicalDeviceDescriptorIndexingProperties *properties = + (VkPhysicalDeviceDescriptorIndexingProperties*)ext; + CORE_PROPERTY(1, 2, maxUpdateAfterBindDescriptorsInAllPools); + CORE_PROPERTY(1, 2, shaderUniformBufferArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderSampledImageArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderStorageBufferArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderStorageImageArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderInputAttachmentArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, robustBufferAccessUpdateAfterBind); + CORE_PROPERTY(1, 2, quadDivergentImplicitLod); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSamplers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindUniformBuffers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageBuffers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSampledImages); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageImages); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindInputAttachments); + CORE_PROPERTY(1, 2, maxPerStageUpdateAfterBindResources); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSamplers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffersDynamic); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffersDynamic); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSampledImages); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageImages); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindInputAttachments); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { VkPhysicalDeviceProtectedMemoryProperties *properties = (VkPhysicalDeviceProtectedMemoryProperties *)ext; - properties->protectedNoFault = false; + CORE_PROPERTY(1, 1, protectedNoFault); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: { @@ -1399,12 +1762,12 @@ properties->primitiveOverestimationSize = 0; properties->maxExtraPrimitiveOverestimationSize = 0; properties->extraPrimitiveOverestimationSizeGranularity = 0; - properties->primitiveUnderestimation = VK_FALSE; - properties->conservativePointAndLineRasterization = VK_FALSE; - properties->degenerateTrianglesRasterized = VK_FALSE; - properties->degenerateLinesRasterized = VK_FALSE; - properties->fullyCoveredFragmentShaderInputVariable = VK_FALSE; - properties->conservativeRasterizationPostDepthCoverage = VK_FALSE; + properties->primitiveUnderestimation = false; + properties->conservativePointAndLineRasterization = false; + properties->degenerateTrianglesRasterized = false; + properties->degenerateLinesRasterized = false; + properties->fullyCoveredFragmentShaderInputVariable = false; + properties->conservativeRasterizationPostDepthCoverage = false; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: { @@ -1416,22 +1779,13 @@ properties->pciFunction = pdevice->bus_info.func; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR: { - VkPhysicalDeviceDriverPropertiesKHR *driver_props = - (VkPhysicalDeviceDriverPropertiesKHR *) ext; - - driver_props->driverID = VK_DRIVER_ID_MESA_RADV_KHR; - snprintf(driver_props->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR, "radv"); - snprintf(driver_props->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR, - "Mesa " PACKAGE_VERSION MESA_GIT_SHA1 - " (LLVM " MESA_LLVM_VERSION_STRING ")"); - - driver_props->conformanceVersion = (VkConformanceVersionKHR) { - .major = 1, - .minor = 1, - .subminor = 2, - .patch = 0, - }; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES: { + VkPhysicalDeviceDriverProperties *properties = + (VkPhysicalDeviceDriverProperties *) ext; + CORE_PROPERTY(1, 2, driverID); + CORE_PROPERTY(1, 2, driverName); + CORE_PROPERTY(1, 2, driverInfo); + CORE_PROPERTY(1, 2, conformanceVersion); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { @@ -1443,8 +1797,8 @@ properties->maxTransformFeedbackStreamDataSize = 512; properties->maxTransformFeedbackBufferDataSize = UINT32_MAX; properties->maxTransformFeedbackBufferDataStride = 512; - properties->transformFeedbackQueries = true; - properties->transformFeedbackStreamsLinesTriangles = true; + properties->transformFeedbackQueries = !pdevice->use_ngg_streamout; + properties->transformFeedbackStreamsLinesTriangles = !pdevice->use_ngg_streamout; properties->transformFeedbackRasterizationStreamSelect = false; properties->transformFeedbackDraw = true; break; @@ -1470,30 +1824,76 @@ properties->sampleLocationCoordinateRange[0] = 0.0f; properties->sampleLocationCoordinateRange[1] = 0.9375f; properties->sampleLocationSubPixelBits = 4; - properties->variableSampleLocations = VK_FALSE; + properties->variableSampleLocations = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR: { - VkPhysicalDeviceDepthStencilResolvePropertiesKHR *properties = - (VkPhysicalDeviceDepthStencilResolvePropertiesKHR *)ext; - - /* We support all of the depth resolve modes */ - properties->supportedDepthResolveModes = - VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | - VK_RESOLVE_MODE_AVERAGE_BIT_KHR | - VK_RESOLVE_MODE_MIN_BIT_KHR | - VK_RESOLVE_MODE_MAX_BIT_KHR; - - /* Average doesn't make sense for stencil so we don't support that */ - properties->supportedStencilResolveModes = - VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | - VK_RESOLVE_MODE_MIN_BIT_KHR | - VK_RESOLVE_MODE_MAX_BIT_KHR; - - properties->independentResolveNone = VK_TRUE; - properties->independentResolve = VK_TRUE; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES: { + VkPhysicalDeviceDepthStencilResolveProperties *properties = + (VkPhysicalDeviceDepthStencilResolveProperties *)ext; + CORE_PROPERTY(1, 2, supportedDepthResolveModes); + CORE_PROPERTY(1, 2, supportedStencilResolveModes); + CORE_PROPERTY(1, 2, independentResolveNone); + CORE_PROPERTY(1, 2, independentResolve); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *properties = + (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *)ext; + properties->storageTexelBufferOffsetAlignmentBytes = 4; + properties->storageTexelBufferOffsetSingleTexelAlignment = true; + properties->uniformTexelBufferOffsetAlignmentBytes = 4; + properties->uniformTexelBufferOffsetSingleTexelAlignment = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES : { + VkPhysicalDeviceFloatControlsProperties *properties = + (VkPhysicalDeviceFloatControlsProperties *)ext; + CORE_PROPERTY(1, 2, denormBehaviorIndependence); + CORE_PROPERTY(1, 2, roundingModeIndependence); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat16); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat16); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat16); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat16); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat16); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat32); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat32); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat32); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat32); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat32); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat64); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat64); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat64); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat64); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat64); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES: { + VkPhysicalDeviceTimelineSemaphoreProperties *properties = + (VkPhysicalDeviceTimelineSemaphoreProperties *) ext; + CORE_PROPERTY(1, 2, maxTimelineSemaphoreValueDifference); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: { + VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *props = + (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *)ext; + props->minSubgroupSize = 64; + props->maxSubgroupSize = 64; + props->maxComputeWorkgroupSubgroups = UINT32_MAX; + props->requiredSubgroupSizeStages = 0; + + if (pdevice->rad_info.chip_class >= GFX10) { + /* Only GFX10+ supports wave32. */ + props->minSubgroupSize = 32; + props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT; + } break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES: + radv_get_physical_device_properties_1_1(pdevice, (void *)ext); + break; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES: + radv_get_physical_device_properties_1_2(pdevice, (void *)ext); + break; default: break; } @@ -1507,7 +1907,7 @@ { int num_queue_families = 1; int idx; - if (pdevice->rad_info.num_compute_rings > 0 && + if (pdevice->rad_info.num_rings[RING_COMPUTE] > 0 && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) num_queue_families++; @@ -1533,14 +1933,14 @@ idx++; } - if (pdevice->rad_info.num_compute_rings > 0 && + if (pdevice->rad_info.num_rings[RING_COMPUTE] > 0 && !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) { if (*pCount > idx) { *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) { .queueFlags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT, - .queueCount = pdevice->rad_info.num_compute_rings, + .queueCount = pdevice->rad_info.num_rings[RING_COMPUTE], .timestampValidBits = 64, .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 }, }; @@ -1620,8 +2020,7 @@ for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) { uint32_t heap_index = device->memory_properties.memoryTypes[i].heapIndex; - switch (device->mem_type_indices[i]) { - case RADV_MEM_TYPE_VRAM: + if (radv_is_mem_type_vram(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_VRAM); @@ -1631,8 +2030,7 @@ memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - case RADV_MEM_TYPE_VRAM_CPU_ACCESS: + } else if (radv_is_mem_type_vram_visible(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_VRAM_VIS); @@ -1642,8 +2040,7 @@ memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - case RADV_MEM_TYPE_GTT_WRITE_COMBINE: + } else if (radv_is_mem_type_gtt_wc(device->mem_type_indices[i])) { heap_usage = device->ws->query_value(device->ws, RADEON_ALLOCATED_GTT); @@ -1653,9 +2050,6 @@ memoryBudget->heapBudget[heap_index] = heap_budget; memoryBudget->heapUsage[heap_index] = heap_usage; - break; - default: - break; } } @@ -1697,7 +2091,7 @@ const struct radv_physical_device *physical_device = device->physical_device; uint32_t memoryTypeBits = 0; for (int i = 0; i < physical_device->memory_properties.memoryTypeCount; i++) { - if (physical_device->mem_type_indices[i] == RADV_MEM_TYPE_GTT_CACHED) { + if (radv_is_mem_type_gtt_cached(physical_device->mem_type_indices[i])) { memoryTypeBits = (1 << i); break; } @@ -1744,10 +2138,14 @@ queue->queue_idx = idx; queue->priority = radv_get_queue_global_priority(global_priority); queue->flags = flags; + queue->hw_ctx = NULL; - queue->hw_ctx = device->ws->ctx_create(device->ws, queue->priority); - if (!queue->hw_ctx) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + VkResult result = device->ws->ctx_create(device->ws, queue->priority, &queue->hw_ctx); + if (result != VK_SUCCESS) + return vk_error(device->instance, result); + + list_inithead(&queue->pending_submissions); + pthread_mutex_init(&queue->pending_mutex, NULL); return VK_SUCCESS; } @@ -1755,6 +2153,8 @@ static void radv_queue_finish(struct radv_queue *queue) { + pthread_mutex_destroy(&queue->pending_mutex); + if (queue->hw_ctx) queue->device->ws->ctx_destroy(queue->hw_ctx); @@ -1774,6 +2174,10 @@ queue->device->ws->buffer_destroy(queue->gsvs_ring_bo); if (queue->tess_rings_bo) queue->device->ws->buffer_destroy(queue->tess_rings_bo); + if (queue->gds_bo) + queue->device->ws->buffer_destroy(queue->gds_bo); + if (queue->gds_oa_bo) + queue->device->ws->buffer_destroy(queue->gds_oa_bo); if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); } @@ -1883,88 +2287,636 @@ return result; } -VkResult radv_CreateDevice( - VkPhysicalDevice physicalDevice, - const VkDeviceCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkDevice* pDevice) +static int install_seccomp_filter() { + + struct sock_filter filter[] = { + /* Check arch is 64bit x86 */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, arch))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 0, 12), + + /* Futex is required for mutex locks */ + #if defined __NR__newselect + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR__newselect, 11, 0), + #elif defined __NR_select + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_select, 11, 0), + #else + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_pselect6, 11, 0), + #endif + + /* Allow system exit calls for the forked process */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_exit_group, 9, 0), + + /* Allow system read calls */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_read, 7, 0), + + /* Allow system write calls */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_write, 5, 0), + + /* Allow system brk calls (we need this for malloc) */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_brk, 3, 0), + + /* Futex is required for mutex locks */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_futex, 1, 0), + + /* Return error if we hit a system call not on the whitelist */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)), + + /* Allow whitelisted system calls */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) + return -1; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) + return -1; + + return 0; +} + +/* Helper function with timeout support for reading from the pipe between + * processes used for secure compile. + */ +bool radv_sc_read(int fd, void *buf, size_t size, bool timeout) { - RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); - VkResult result; - struct radv_device *device; + fd_set fds; + struct timeval tv; - bool keep_shader_info = false; + FD_ZERO(&fds); + FD_SET(fd, &fds); - /* Check enabled features */ - if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - radv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + while (true) { + /* We can't rely on the value of tv after calling select() so + * we must reset it on each iteration of the loop. + */ + tv.tv_sec = 5; + tv.tv_usec = 0; + + int rval = select(fd + 1, &fds, NULL, NULL, timeout ? &tv : NULL); + + if (rval == -1) { + /* select error */ + return false; + } else if (rval) { + ssize_t bytes_read = read(fd, buf, size); + if (bytes_read < 0) + return false; + + buf += bytes_read; + size -= bytes_read; + if (size == 0) + return true; + } else { + /* select timeout */ + return false; } } +} - device = vk_zalloc2(&physical_device->instance->alloc, pAllocator, - sizeof(*device), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!device) - return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); +static bool radv_close_all_fds(const int *keep_fds, int keep_fd_count) +{ + DIR *d; + struct dirent *dir; + d = opendir("/proc/self/fd"); + if (!d) + return false; + int dir_fd = dirfd(d); - device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; - device->instance = physical_device->instance; - device->physical_device = physical_device; + while ((dir = readdir(d)) != NULL) { + if (dir->d_name[0] == '.') + continue; - device->ws = physical_device->ws; - if (pAllocator) - device->alloc = *pAllocator; - else - device->alloc = physical_device->instance->alloc; + int fd = atoi(dir->d_name); + if (fd == dir_fd) + continue; - for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { - const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i]; - int index = radv_get_device_extension_index(ext_name); - if (index < 0 || !physical_device->supported_extensions.extensions[index]) { - vk_free(&device->alloc, device); - return vk_error(physical_device->instance, VK_ERROR_EXTENSION_NOT_PRESENT); - } + bool keep = false; + for (int i = 0; !keep && i < keep_fd_count; ++i) + if (keep_fds[i] == fd) + keep = true; - device->enabled_extensions.extensions[index] = true; + if (keep) + continue; + + close(fd); } + closedir(d); + return true; +} - keep_shader_info = device->enabled_extensions.AMD_shader_info; +static bool secure_compile_open_fifo_fds(struct radv_secure_compile_state *sc, + int *fd_server, int *fd_client, + unsigned process, bool make_fifo) +{ + bool result = false; + char *fifo_server_path = NULL; + char *fifo_client_path = NULL; - /* With update after bind we can't attach bo's to the command buffer - * from the descriptor set anymore, so we have to use a global BO list. - */ - device->use_global_bo_list = - (device->instance->perftest_flags & RADV_PERFTEST_BO_LIST) || - device->enabled_extensions.EXT_descriptor_indexing || - device->enabled_extensions.EXT_buffer_device_address; + if (asprintf(&fifo_server_path, "/tmp/radv_server_%s_%u", sc->uid, process) == -1) + goto open_fifo_exit; - device->robust_buffer_access = pCreateInfo->pEnabledFeatures && - pCreateInfo->pEnabledFeatures->robustBufferAccess; + if (asprintf(&fifo_client_path, "/tmp/radv_client_%s_%u", sc->uid, process) == -1) + goto open_fifo_exit; - mtx_init(&device->shader_slab_mutex, mtx_plain); - list_inithead(&device->shader_slabs); + if (make_fifo) { + int file1 = mkfifo(fifo_server_path, 0666); + if(file1 < 0) + goto open_fifo_exit; - radv_bo_list_init(&device->bo_list); + int file2 = mkfifo(fifo_client_path, 0666); + if(file2 < 0) + goto open_fifo_exit; + } - for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { - const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; - uint32_t qfi = queue_create->queueFamilyIndex; - const VkDeviceQueueGlobalPriorityCreateInfoEXT *global_priority = - vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_EXT); + *fd_server = open(fifo_server_path, O_RDWR); + if(*fd_server < 1) + goto open_fifo_exit; - assert(!global_priority || device->physical_device->rad_info.has_ctx_priority); + *fd_client = open(fifo_client_path, O_RDWR); + if(*fd_client < 1) { + close(*fd_server); + goto open_fifo_exit; + } - device->queues[qfi] = vk_alloc(&device->alloc, - queue_create->queueCount * sizeof(struct radv_queue), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!device->queues[qfi]) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; + result = true; + +open_fifo_exit: + free(fifo_server_path); + free(fifo_client_path); + + return result; +} + +static void run_secure_compile_device(struct radv_device *device, unsigned process, + int fd_idle_device_output) +{ + int fd_secure_input; + int fd_secure_output; + bool fifo_result = secure_compile_open_fifo_fds(device->sc_state, + &fd_secure_input, + &fd_secure_output, + process, false); + + enum radv_secure_compile_type sc_type; + + const int needed_fds[] = { + fd_secure_input, + fd_secure_output, + fd_idle_device_output, + }; + + if (!fifo_result || !radv_close_all_fds(needed_fds, ARRAY_SIZE(needed_fds)) || + install_seccomp_filter() == -1) { + sc_type = RADV_SC_TYPE_INIT_FAILURE; + } else { + sc_type = RADV_SC_TYPE_INIT_SUCCESS; + device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output; + } + + write(fd_idle_device_output, &sc_type, sizeof(sc_type)); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE) + goto secure_compile_exit; + + while (true) { + radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false); + + if (sc_type == RADV_SC_TYPE_COMPILE_PIPELINE) { + struct radv_pipeline *pipeline; + bool sc_read = true; + + pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + pipeline->device = device; + + /* Read pipeline layout */ + struct radv_pipeline_layout layout; + sc_read = radv_sc_read(fd_secure_input, &layout, sizeof(struct radv_pipeline_layout), true); + sc_read &= radv_sc_read(fd_secure_input, &layout.num_sets, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + for (uint32_t set = 0; set < layout.num_sets; set++) { + uint32_t layout_size; + sc_read &= radv_sc_read(fd_secure_input, &layout_size, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + layout.set[set].layout = malloc(layout_size); + layout.set[set].layout->layout_size = layout_size; + sc_read &= radv_sc_read(fd_secure_input, layout.set[set].layout, + layout.set[set].layout->layout_size, true); + } + + pipeline->layout = &layout; + + /* Read pipeline key */ + struct radv_pipeline_key key; + sc_read &= radv_sc_read(fd_secure_input, &key, sizeof(struct radv_pipeline_key), true); + + /* Read pipeline create flags */ + VkPipelineCreateFlags flags; + sc_read &= radv_sc_read(fd_secure_input, &flags, sizeof(VkPipelineCreateFlags), true); + + /* Read stage and shader information */ + uint32_t num_stages; + const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, }; + sc_read &= radv_sc_read(fd_secure_input, &num_stages, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + for (uint32_t i = 0; i < num_stages; i++) { + + /* Read stage */ + gl_shader_stage stage; + sc_read &= radv_sc_read(fd_secure_input, &stage, sizeof(gl_shader_stage), true); + + VkPipelineShaderStageCreateInfo *pStage = calloc(1, sizeof(VkPipelineShaderStageCreateInfo)); + + /* Read entry point name */ + size_t name_size; + sc_read &= radv_sc_read(fd_secure_input, &name_size, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + char *ep_name = malloc(name_size); + sc_read &= radv_sc_read(fd_secure_input, ep_name, name_size, true); + pStage->pName = ep_name; + + /* Read shader module */ + size_t module_size; + sc_read &= radv_sc_read(fd_secure_input, &module_size, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + struct radv_shader_module *module = malloc(module_size); + sc_read &= radv_sc_read(fd_secure_input, module, module_size, true); + pStage->module = radv_shader_module_to_handle(module); + + /* Read specialization info */ + bool has_spec_info; + sc_read &= radv_sc_read(fd_secure_input, &has_spec_info, sizeof(bool), true); + if (!sc_read) + goto secure_compile_exit; + + if (has_spec_info) { + VkSpecializationInfo *specInfo = malloc(sizeof(VkSpecializationInfo)); + pStage->pSpecializationInfo = specInfo; + + sc_read &= radv_sc_read(fd_secure_input, &specInfo->dataSize, sizeof(size_t), true); + if (!sc_read) + goto secure_compile_exit; + + void *si_data = malloc(specInfo->dataSize); + sc_read &= radv_sc_read(fd_secure_input, si_data, specInfo->dataSize, true); + specInfo->pData = si_data; + + sc_read &= radv_sc_read(fd_secure_input, &specInfo->mapEntryCount, sizeof(uint32_t), true); + if (!sc_read) + goto secure_compile_exit; + + VkSpecializationMapEntry *mapEntries = malloc(sizeof(VkSpecializationMapEntry) * specInfo->mapEntryCount); + for (uint32_t j = 0; j < specInfo->mapEntryCount; j++) { + sc_read &= radv_sc_read(fd_secure_input, &mapEntries[j], sizeof(VkSpecializationMapEntry), true); + if (!sc_read) + goto secure_compile_exit; + } + + specInfo->pMapEntries = mapEntries; + } + + pStages[stage] = pStage; + } + + /* Compile the shaders */ + VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = { 0 }; + radv_create_shaders(pipeline, device, NULL, &key, pStages, flags, NULL, stage_feedbacks); + + /* free memory allocated above */ + for (uint32_t set = 0; set < layout.num_sets; set++) + free(layout.set[set].layout); + + for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { + if (!pStages[i]) + continue; + + free((void *) pStages[i]->pName); + free(radv_shader_module_from_handle(pStages[i]->module)); + if (pStages[i]->pSpecializationInfo) { + free((void *) pStages[i]->pSpecializationInfo->pData); + free((void *) pStages[i]->pSpecializationInfo->pMapEntries); + free((void *) pStages[i]->pSpecializationInfo); + } + free((void *) pStages[i]); + } + + vk_free(&device->alloc, pipeline); + + sc_type = RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED; + write(fd_secure_output, &sc_type, sizeof(sc_type)); + + } else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) { + goto secure_compile_exit; + } + } + +secure_compile_exit: + close(fd_secure_input); + close(fd_secure_output); + close(fd_idle_device_output); + _exit(0); +} + +static enum radv_secure_compile_type fork_secure_compile_device(struct radv_device *device, unsigned process) +{ + int fd_secure_input[2]; + int fd_secure_output[2]; + + /* create pipe descriptors (used to communicate between processes) */ + if (pipe(fd_secure_input) == -1 || pipe(fd_secure_output) == -1) + return RADV_SC_TYPE_INIT_FAILURE; + + + int sc_pid; + if ((sc_pid = fork()) == 0) { + device->sc_state->secure_compile_thread_counter = process; + run_secure_compile_device(device, process, fd_secure_output[1]); + } else { + if (sc_pid == -1) + return RADV_SC_TYPE_INIT_FAILURE; + + /* Read the init result returned from the secure process */ + enum radv_secure_compile_type sc_type; + bool sc_read = radv_sc_read(fd_secure_output[0], &sc_type, sizeof(sc_type), true); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE || !sc_read) { + close(fd_secure_input[0]); + close(fd_secure_input[1]); + close(fd_secure_output[1]); + close(fd_secure_output[0]); + int status; + waitpid(sc_pid, &status, 0); + + return RADV_SC_TYPE_INIT_FAILURE; + } else { + assert(sc_type == RADV_SC_TYPE_INIT_SUCCESS); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, &sc_type, sizeof(sc_type)); + + close(fd_secure_input[0]); + close(fd_secure_input[1]); + close(fd_secure_output[1]); + close(fd_secure_output[0]); + + int status; + waitpid(sc_pid, &status, 0); + } + } + + return RADV_SC_TYPE_INIT_SUCCESS; +} + +/* Run a bare bones fork of a device that was forked right after its creation. + * This device will have low overhead when it is forked again before each + * pipeline compilation. This device sits idle and its only job is to fork + * itself. + */ +static void run_secure_compile_idle_device(struct radv_device *device, unsigned process, + int fd_secure_input, int fd_secure_output) +{ + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_INIT_SUCCESS; + device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output; + + write(fd_secure_output, &sc_type, sizeof(sc_type)); + + while (true) { + radv_sc_read(fd_secure_input, &sc_type, sizeof(sc_type), false); + + if (sc_type == RADV_SC_TYPE_FORK_DEVICE) { + sc_type = fork_secure_compile_device(device, process); + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE) + goto secure_compile_exit; + + } else if (sc_type == RADV_SC_TYPE_DESTROY_DEVICE) { + goto secure_compile_exit; + } + } + +secure_compile_exit: + close(fd_secure_input); + close(fd_secure_output); + _exit(0); +} + +static void destroy_secure_compile_device(struct radv_device *device, unsigned process) +{ + int fd_secure_input = device->sc_state->secure_compile_processes[process].fd_secure_input; + + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_DESTROY_DEVICE; + write(fd_secure_input, &sc_type, sizeof(sc_type)); + + close(device->sc_state->secure_compile_processes[process].fd_secure_input); + close(device->sc_state->secure_compile_processes[process].fd_secure_output); + + int status; + waitpid(device->sc_state->secure_compile_processes[process].sc_pid, &status, 0); +} + +static VkResult fork_secure_compile_idle_device(struct radv_device *device) +{ + device->sc_state = vk_zalloc(&device->alloc, + sizeof(struct radv_secure_compile_state), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + mtx_init(&device->sc_state->secure_compile_mutex, mtx_plain); + + pid_t upid = getpid(); + time_t seconds = time(NULL); + + char *uid; + if (asprintf(&uid, "%ld_%ld", (long) upid, (long) seconds) == -1) + return VK_ERROR_INITIALIZATION_FAILED; + + device->sc_state->uid = uid; + + uint8_t sc_threads = device->instance->num_sc_threads; + int fd_secure_input[MAX_SC_PROCS][2]; + int fd_secure_output[MAX_SC_PROCS][2]; + + /* create pipe descriptors (used to communicate between processes) */ + for (unsigned i = 0; i < sc_threads; i++) { + if (pipe(fd_secure_input[i]) == -1 || + pipe(fd_secure_output[i]) == -1) { + return VK_ERROR_INITIALIZATION_FAILED; + } + } + + device->sc_state->secure_compile_processes = vk_zalloc(&device->alloc, + sizeof(struct radv_secure_compile_process) * sc_threads, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + for (unsigned process = 0; process < sc_threads; process++) { + if ((device->sc_state->secure_compile_processes[process].sc_pid = fork()) == 0) { + device->sc_state->secure_compile_thread_counter = process; + run_secure_compile_idle_device(device, process, fd_secure_input[process][0], fd_secure_output[process][1]); + } else { + if (device->sc_state->secure_compile_processes[process].sc_pid == -1) + return VK_ERROR_INITIALIZATION_FAILED; + + /* Read the init result returned from the secure process */ + enum radv_secure_compile_type sc_type; + bool sc_read = radv_sc_read(fd_secure_output[process][0], &sc_type, sizeof(sc_type), true); + + bool fifo_result; + if (sc_read && sc_type == RADV_SC_TYPE_INIT_SUCCESS) { + fifo_result = secure_compile_open_fifo_fds(device->sc_state, + &device->sc_state->secure_compile_processes[process].fd_server, + &device->sc_state->secure_compile_processes[process].fd_client, + process, true); + + device->sc_state->secure_compile_processes[process].fd_secure_input = fd_secure_input[process][1]; + device->sc_state->secure_compile_processes[process].fd_secure_output = fd_secure_output[process][0]; + } + + if (sc_type == RADV_SC_TYPE_INIT_FAILURE || !sc_read || !fifo_result) { + close(fd_secure_input[process][0]); + close(fd_secure_input[process][1]); + close(fd_secure_output[process][1]); + close(fd_secure_output[process][0]); + int status; + waitpid(device->sc_state->secure_compile_processes[process].sc_pid, &status, 0); + + /* Destroy any forks that were created sucessfully */ + for (unsigned i = 0; i < process; i++) { + destroy_secure_compile_device(device, i); + } + + return VK_ERROR_INITIALIZATION_FAILED; + } + } + } + return VK_SUCCESS; +} + +static VkResult +radv_create_pthread_cond(pthread_cond_t *cond) +{ + pthread_condattr_t condattr; + if (pthread_condattr_init(&condattr)) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) { + pthread_condattr_destroy(&condattr); + return VK_ERROR_INITIALIZATION_FAILED; + } + if (pthread_cond_init(cond, &condattr)) { + pthread_condattr_destroy(&condattr); + return VK_ERROR_INITIALIZATION_FAILED; + } + pthread_condattr_destroy(&condattr); + return VK_SUCCESS; +} + +VkResult radv_CreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); + VkResult result; + struct radv_device *device; + + bool keep_shader_info = false; + + /* Check enabled features */ + if (pCreateInfo->pEnabledFeatures) { + VkPhysicalDeviceFeatures supported_features; + radv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); + VkBool32 *supported_feature = (VkBool32 *)&supported_features; + VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; + unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); + for (uint32_t i = 0; i < num_features; i++) { + if (enabled_feature[i] && !supported_feature[i]) + return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + } + } + + device = vk_zalloc2(&physical_device->instance->alloc, pAllocator, + sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device) + return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + device->instance = physical_device->instance; + device->physical_device = physical_device; + + device->ws = physical_device->ws; + if (pAllocator) + device->alloc = *pAllocator; + else + device->alloc = physical_device->instance->alloc; + + for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { + const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i]; + int index = radv_get_device_extension_index(ext_name); + if (index < 0 || !physical_device->supported_extensions.extensions[index]) { + vk_free(&device->alloc, device); + return vk_error(physical_device->instance, VK_ERROR_EXTENSION_NOT_PRESENT); + } + + device->enabled_extensions.extensions[index] = true; + } + + keep_shader_info = device->enabled_extensions.AMD_shader_info; + + /* With update after bind we can't attach bo's to the command buffer + * from the descriptor set anymore, so we have to use a global BO list. + */ + device->use_global_bo_list = + (device->instance->perftest_flags & RADV_PERFTEST_BO_LIST) || + device->enabled_extensions.EXT_descriptor_indexing || + device->enabled_extensions.EXT_buffer_device_address || + device->enabled_extensions.KHR_buffer_device_address; + + device->robust_buffer_access = pCreateInfo->pEnabledFeatures && + pCreateInfo->pEnabledFeatures->robustBufferAccess; + + mtx_init(&device->shader_slab_mutex, mtx_plain); + list_inithead(&device->shader_slabs); + + radv_bo_list_init(&device->bo_list); + + for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; + uint32_t qfi = queue_create->queueFamilyIndex; + const VkDeviceQueueGlobalPriorityCreateInfoEXT *global_priority = + vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_EXT); + + assert(!global_priority || device->physical_device->rad_info.has_ctx_priority); + + device->queues[qfi] = vk_alloc(&device->alloc, + queue_create->queueCount * sizeof(struct radv_queue), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->queues[qfi]) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail; } @@ -1984,14 +2936,11 @@ device->pbb_allowed = device->physical_device->rad_info.chip_class >= GFX9 && !(device->instance->debug_flags & RADV_DEBUG_NOBINNING); + /* Disable DFSM by default. As of 2019-09-15 Talos on Low is still 3% slower on Raven. */ device->dfsm_allowed = device->pbb_allowed && - (device->physical_device->rad_info.family == CHIP_RAVEN || - device->physical_device->rad_info.family == CHIP_RAVEN2 || - device->physical_device->rad_info.family == CHIP_RENOIR); + (device->instance->perftest_flags & RADV_PERFTEST_DFSM); -#ifdef ANDROID device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit; -#endif /* The maximum number of scratch waves. Scratch space isn't divided * evenly between CUs. The number is only a function of the number of CUs. @@ -2009,8 +2958,7 @@ device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, max_threads_per_block / 64); - device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32); + device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); if (device->physical_device->rad_info.chip_class >= GFX7) { /* If the KMD allows it (there is a KMD hw register for it), @@ -2023,9 +2971,6 @@ device->tess_offchip_block_dw_size = device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192; - device->has_distributed_tess = - device->physical_device->rad_info.chip_class >= GFX8 && - device->physical_device->rad_info.max_se >= 2; if (getenv("RADV_TRACE_FILE")) { const char *filename = getenv("RADV_TRACE_FILE"); @@ -2043,8 +2988,12 @@ radv_dump_enabled_options(device, stderr); } - device->keep_shader_info = keep_shader_info; + /* Temporarily disable secure compile while we create meta shaders, etc */ + uint8_t sc_threads = device->instance->num_sc_threads; + if (sc_threads) + device->instance->num_sc_threads = 0; + device->keep_shader_info = keep_shader_info; result = radv_device_init_meta(device); if (result != VK_SUCCESS) goto fail; @@ -2084,6 +3033,10 @@ device->mem_cache = radv_pipeline_cache_from_handle(pc); + result = radv_create_pthread_cond(&device->timeline_cond); + if (result != VK_SUCCESS) + goto fail_mem_cache; + device->force_aniso = MIN2(16, radv_get_int_debug_option("RADV_TEX_ANISO", -1)); if (device->force_aniso >= 0) { @@ -2091,9 +3044,20 @@ 1 << util_logbase2(device->force_aniso)); } + /* Fork device for secure compile as required */ + device->instance->num_sc_threads = sc_threads; + if (radv_device_use_secure_compile(device->instance)) { + + result = fork_secure_compile_idle_device(device); + if (result != VK_SUCCESS) + goto fail_meta; + } + *pDevice = radv_device_to_handle(device); return VK_SUCCESS; +fail_mem_cache: + radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL); fail_meta: radv_device_finish_meta(device); fail: @@ -2146,7 +3110,19 @@ radv_destroy_shader_slabs(device); + pthread_cond_destroy(&device->timeline_cond); radv_bo_list_finish(&device->bo_list); + if (radv_device_use_secure_compile(device->instance)) { + for (unsigned i = 0; i < device->instance->num_sc_threads; i++ ) { + destroy_secure_compile_device(device, i); + } + } + + if (device->sc_state) { + free(device->sc_state->uid); + vk_free(&device->alloc, device->sc_state->secure_compile_processes); + } + vk_free(&device->alloc, device->sc_state); vk_free(&device->alloc, device); } @@ -2250,7 +3226,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2271,7 +3247,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[7] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[7] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2297,7 +3273,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2320,7 +3296,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[7] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[7] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2346,7 +3322,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2363,7 +3339,7 @@ if (queue->device->physical_device->rad_info.chip_class >= GFX10) { desc[7] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc[7] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -2525,8 +3501,27 @@ } static void +radv_emit_graphics_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, + uint32_t size_per_wave, uint32_t waves, + struct radeon_winsys_bo *scratch_bo) +{ + if (queue->queue_family_index != RADV_QUEUE_GENERAL) + return; + + if (!scratch_bo) + return; + + radv_cs_add_buffer(queue->device->ws, cs, scratch_bo); + + radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, + S_0286E8_WAVES(waves) | + S_0286E8_WAVESIZE(round_up_u32(size_per_wave, 1024))); +} + +static void radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, - struct radeon_winsys_bo *compute_scratch_bo) + uint32_t size_per_wave, uint32_t waves, + struct radeon_winsys_bo *compute_scratch_bo) { uint64_t scratch_va; @@ -2541,6 +3536,10 @@ radeon_emit(cs, scratch_va); radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1)); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(waves) | + S_00B860_WAVESIZE(round_up_u32(size_per_wave, 1024))); } static void @@ -2621,11 +3620,15 @@ static VkResult radv_get_preamble_cs(struct radv_queue *queue, - uint32_t scratch_size, - uint32_t compute_scratch_size, + uint32_t scratch_size_per_wave, + uint32_t scratch_waves, + uint32_t compute_scratch_size_per_wave, + uint32_t compute_scratch_waves, uint32_t esgs_ring_size, uint32_t gsvs_ring_size, bool needs_tess_rings, + bool needs_gds, + bool needs_gds_oa, bool needs_sample_positions, struct radeon_cmdbuf **initial_full_flush_preamble_cs, struct radeon_cmdbuf **initial_preamble_cs, @@ -2637,8 +3640,10 @@ struct radeon_winsys_bo *esgs_ring_bo = NULL; struct radeon_winsys_bo *gsvs_ring_bo = NULL; struct radeon_winsys_bo *tess_rings_bo = NULL; + struct radeon_winsys_bo *gds_bo = NULL; + struct radeon_winsys_bo *gds_oa_bo = NULL; struct radeon_cmdbuf *dest_cs[3] = {0}; - bool add_tess_rings = false, add_sample_positions = false; + bool add_tess_rings = false, add_gds = false, add_gds_oa = false, add_sample_positions = false; unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; unsigned max_offchip_buffers; unsigned hs_offchip_param = 0; @@ -2648,6 +3653,14 @@ if (needs_tess_rings) add_tess_rings = true; } + if (!queue->has_gds) { + if (needs_gds) + add_gds = true; + } + if (!queue->has_gds_oa) { + if (needs_gds_oa) + add_gds_oa = true; + } if (!queue->has_sample_positions) { if (needs_sample_positions) add_sample_positions = true; @@ -2659,22 +3672,39 @@ tess_offchip_ring_size = max_offchip_buffers * queue->device->tess_offchip_block_dw_size * 4; - if (scratch_size <= queue->scratch_size && - compute_scratch_size <= queue->compute_scratch_size && + scratch_size_per_wave = MAX2(scratch_size_per_wave, queue->scratch_size_per_wave); + if (scratch_size_per_wave) + scratch_waves = MIN2(scratch_waves, UINT32_MAX / scratch_size_per_wave); + else + scratch_waves = 0; + + compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave, queue->compute_scratch_size_per_wave); + if (compute_scratch_size_per_wave) + compute_scratch_waves = MIN2(compute_scratch_waves, UINT32_MAX / compute_scratch_size_per_wave); + else + compute_scratch_waves = 0; + + if (scratch_size_per_wave <= queue->scratch_size_per_wave && + scratch_waves <= queue->scratch_waves && + compute_scratch_size_per_wave <= queue->compute_scratch_size_per_wave && + compute_scratch_waves <= queue->compute_scratch_waves && esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size && - !add_tess_rings && !add_sample_positions && + !add_tess_rings && !add_gds && !add_gds_oa && !add_sample_positions && queue->initial_preamble_cs) { *initial_full_flush_preamble_cs = queue->initial_full_flush_preamble_cs; *initial_preamble_cs = queue->initial_preamble_cs; *continue_preamble_cs = queue->continue_preamble_cs; - if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size && - !needs_tess_rings && !needs_sample_positions) + if (!scratch_size_per_wave && !compute_scratch_size_per_wave && + !esgs_ring_size && !gsvs_ring_size && !needs_tess_rings && + !needs_gds && !needs_gds_oa && !needs_sample_positions) *continue_preamble_cs = NULL; return VK_SUCCESS; } - if (scratch_size > queue->scratch_size) { + uint32_t scratch_size = scratch_size_per_wave * scratch_waves; + uint32_t queue_scratch_size = queue->scratch_size_per_wave * queue->scratch_waves; + if (scratch_size > queue_scratch_size) { scratch_bo = queue->device->ws->buffer_create(queue->device->ws, scratch_size, 4096, @@ -2686,7 +3716,9 @@ } else scratch_bo = queue->scratch_bo; - if (compute_scratch_size > queue->compute_scratch_size) { + uint32_t compute_scratch_size = compute_scratch_size_per_wave * compute_scratch_waves; + uint32_t compute_queue_scratch_size = queue->compute_scratch_size_per_wave * queue->compute_scratch_waves; + if (compute_scratch_size > compute_queue_scratch_size) { compute_scratch_bo = queue->device->ws->buffer_create(queue->device->ws, compute_scratch_size, 4096, @@ -2740,6 +3772,37 @@ tess_rings_bo = queue->tess_rings_bo; } + if (add_gds) { + assert(queue->device->physical_device->rad_info.chip_class >= GFX10); + + /* 4 streamout GDS counters. + * We need 256B (64 dw) of GDS, otherwise streamout hangs. + */ + gds_bo = queue->device->ws->buffer_create(queue->device->ws, + 256, 4, + RADEON_DOMAIN_GDS, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_bo) + goto fail; + } else { + gds_bo = queue->gds_bo; + } + + if (add_gds_oa) { + assert(queue->device->physical_device->rad_info.chip_class >= GFX10); + + gds_oa_bo = queue->device->ws->buffer_create(queue->device->ws, + 4, 1, + RADEON_DOMAIN_OA, + ring_bo_flags, + RADV_BO_PRIORITY_SCRATCH); + if (!gds_oa_bo) + goto fail; + } else { + gds_oa_bo = queue->gds_oa_bo; + } + if (scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo || gsvs_ring_bo != queue->gsvs_ring_bo || @@ -2828,7 +3891,15 @@ radv_emit_tess_factor_ring(queue, cs, hs_offchip_param, tess_factor_ring_size, tess_rings_bo); radv_emit_global_shader_pointers(queue, cs, descriptor_bo); - radv_emit_compute_scratch(queue, cs, compute_scratch_bo); + radv_emit_compute_scratch(queue, cs, compute_scratch_size_per_wave, + compute_scratch_waves, compute_scratch_bo); + radv_emit_graphics_scratch(queue, cs, scratch_size_per_wave, + scratch_waves, scratch_bo); + + if (gds_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_bo); + if (gds_oa_bo) + radv_cs_add_buffer(queue->device->ws, cs, gds_oa_bo); if (i == 0) { si_cs_emit_cache_flush(cs, @@ -2876,15 +3947,17 @@ if (queue->scratch_bo) queue->device->ws->buffer_destroy(queue->scratch_bo); queue->scratch_bo = scratch_bo; - queue->scratch_size = scratch_size; } + queue->scratch_size_per_wave = scratch_size_per_wave; + queue->scratch_waves = scratch_waves; if (compute_scratch_bo != queue->compute_scratch_bo) { if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->compute_scratch_bo); queue->compute_scratch_bo = compute_scratch_bo; - queue->compute_scratch_size = compute_scratch_size; } + queue->compute_scratch_size_per_wave = compute_scratch_size_per_wave; + queue->compute_scratch_waves = compute_scratch_waves; if (esgs_ring_bo != queue->esgs_ring_bo) { if (queue->esgs_ring_bo) @@ -2905,6 +3978,16 @@ queue->has_tess_rings = true; } + if (gds_bo != queue->gds_bo) { + queue->gds_bo = gds_bo; + queue->has_gds = true; + } + + if (gds_oa_bo != queue->gds_oa_bo) { + queue->gds_oa_bo = gds_oa_bo; + queue->has_gds_oa = true; + } + if (descriptor_bo != queue->descriptor_bo) { if (queue->descriptor_bo) queue->device->ws->buffer_destroy(queue->descriptor_bo); @@ -2937,15 +4020,21 @@ queue->device->ws->buffer_destroy(gsvs_ring_bo); if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo) queue->device->ws->buffer_destroy(tess_rings_bo); + if (gds_bo && gds_bo != queue->gds_bo) + queue->device->ws->buffer_destroy(gds_bo); + if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo) + queue->device->ws->buffer_destroy(gds_oa_bo); + return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); } -static VkResult radv_alloc_sem_counts(struct radv_instance *instance, +static VkResult radv_alloc_sem_counts(struct radv_device *device, struct radv_winsys_sem_counts *counts, int num_sems, - const VkSemaphore *sems, + struct radv_semaphore_part **sems, + const uint64_t *timeline_values, VkFence _fence, - bool reset_temp) + bool is_signal) { int syncobj_idx = 0, sem_idx = 0; @@ -2953,12 +4042,19 @@ return VK_SUCCESS; for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); - - if (sem->temp_syncobj || sem->syncobj) + switch(sems[i]->kind) { + case RADV_SEMAPHORE_SYNCOBJ: counts->syncobj_count++; - else + break; + case RADV_SEMAPHORE_WINSYS: counts->sem_count++; + break; + case RADV_SEMAPHORE_NONE: + break; + case RADV_SEMAPHORE_TIMELINE: + counts->syncobj_count++; + break; + } } if (_fence != VK_NULL_HANDLE) { @@ -2970,28 +4066,48 @@ if (counts->syncobj_count) { counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count); if (!counts->syncobj) - return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } if (counts->sem_count) { counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count); if (!counts->sem) { free(counts->syncobj); - return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } } for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); + switch(sems[i]->kind) { + case RADV_SEMAPHORE_NONE: + unreachable("Empty semaphore"); + break; + case RADV_SEMAPHORE_SYNCOBJ: + counts->syncobj[syncobj_idx++] = sems[i]->syncobj; + break; + case RADV_SEMAPHORE_WINSYS: + counts->sem[sem_idx++] = sems[i]->ws_sem; + break; + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&sems[i]->timeline.mutex); + struct radv_timeline_point *point = NULL; + if (is_signal) { + point = radv_timeline_add_point_locked(device, &sems[i]->timeline, timeline_values[i]); + } else { + point = radv_timeline_find_point_at_least_locked(device, &sems[i]->timeline, timeline_values[i]); + } - if (sem->temp_syncobj) { - counts->syncobj[syncobj_idx++] = sem->temp_syncobj; + pthread_mutex_unlock(&sems[i]->timeline.mutex); + + if (point) { + counts->syncobj[syncobj_idx++] = point->syncobj; + } else { + /* Explicitly remove the semaphore so we might not find + * a point later post-submit. */ + sems[i] = NULL; + } + break; } - else if (sem->syncobj) - counts->syncobj[syncobj_idx++] = sem->syncobj; - else { - assert(sem->sem); - counts->sem[sem_idx++] = sem->sem; } } @@ -3003,6 +4119,9 @@ counts->syncobj[syncobj_idx++] = fence->syncobj; } + assert(syncobj_idx <= counts->syncobj_count); + counts->syncobj_count = syncobj_idx; + return VK_SUCCESS; } @@ -3018,34 +4137,31 @@ static void radv_free_temp_syncobjs(struct radv_device *device, int num_sems, - const VkSemaphore *sems) + struct radv_semaphore_part *sems) { for (uint32_t i = 0; i < num_sems; i++) { - RADV_FROM_HANDLE(radv_semaphore, sem, sems[i]); - - if (sem->temp_syncobj) { - device->ws->destroy_syncobj(device->ws, sem->temp_syncobj); - sem->temp_syncobj = 0; - } + radv_destroy_semaphore_part(device, sems + i); } } static VkResult -radv_alloc_sem_info(struct radv_instance *instance, +radv_alloc_sem_info(struct radv_device *device, struct radv_winsys_sem_info *sem_info, int num_wait_sems, - const VkSemaphore *wait_sems, + struct radv_semaphore_part **wait_sems, + const uint64_t *wait_values, int num_signal_sems, - const VkSemaphore *signal_sems, + struct radv_semaphore_part **signal_sems, + const uint64_t *signal_values, VkFence fence) { VkResult ret; memset(sem_info, 0, sizeof(*sem_info)); - ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true); + ret = radv_alloc_sem_counts(device, &sem_info->wait, num_wait_sems, wait_sems, wait_values, VK_NULL_HANDLE, false); if (ret) return ret; - ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence, false); + ret = radv_alloc_sem_counts(device, &sem_info->signal, num_signal_sems, signal_sems, signal_values, fence, true); if (ret) radv_free_sem_info(sem_info); @@ -3055,116 +4171,397 @@ return ret; } -/* Signals fence as soon as all the work currently put on queue is done. */ -static VkResult radv_signal_fence(struct radv_queue *queue, - struct radv_fence *fence) +static void +radv_finalize_timelines(struct radv_device *device, + uint32_t num_wait_sems, + struct radv_semaphore_part **wait_sems, + const uint64_t *wait_values, + uint32_t num_signal_sems, + struct radv_semaphore_part **signal_sems, + const uint64_t *signal_values, + struct list_head *processing_list) +{ + for (uint32_t i = 0; i < num_wait_sems; ++i) { + if (wait_sems[i] && wait_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&wait_sems[i]->timeline.mutex); + struct radv_timeline_point *point = + radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]); + point->wait_count -= 2; + pthread_mutex_unlock(&wait_sems[i]->timeline.mutex); + } + } + for (uint32_t i = 0; i < num_signal_sems; ++i) { + if (signal_sems[i] && signal_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&signal_sems[i]->timeline.mutex); + struct radv_timeline_point *point = + radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]); + signal_sems[i]->timeline.highest_submitted = + MAX2(signal_sems[i]->timeline.highest_submitted, point->value); + point->wait_count -= 2; + radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list); + pthread_mutex_unlock(&signal_sems[i]->timeline.mutex); + } + } +} + +static void +radv_sparse_buffer_bind_memory(struct radv_device *device, + const VkSparseBufferMemoryBindInfo *bind) { - int ret; - VkResult result; - struct radv_winsys_sem_info sem_info; + RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer); - result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL, - radv_fence_to_handle(fence)); - if (result != VK_SUCCESS) - return result; + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; - ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, &sem_info, NULL, - false, fence->fence); - radv_free_sem_info(&sem_info); + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - if (ret) - return vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); + device->ws->buffer_virtual_bind(buffer->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } +} - return VK_SUCCESS; +static void +radv_sparse_image_opaque_bind_memory(struct radv_device *device, + const VkSparseImageOpaqueMemoryBindInfo *bind) +{ + RADV_FROM_HANDLE(radv_image, image, bind->image); + + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct radv_device_memory *mem = NULL; + + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = radv_device_memory_from_handle(bind->pBinds[i].memory); + + device->ws->buffer_virtual_bind(image->bo, + bind->pBinds[i].resourceOffset, + bind->pBinds[i].size, + mem ? mem->bo : NULL, + bind->pBinds[i].memoryOffset); + } } -VkResult radv_QueueSubmit( - VkQueue _queue, - uint32_t submitCount, - const VkSubmitInfo* pSubmits, - VkFence _fence) +static VkResult +radv_get_preambles(struct radv_queue *queue, + const VkCommandBuffer *cmd_buffers, + uint32_t cmd_buffer_count, + struct radeon_cmdbuf **initial_full_flush_preamble_cs, + struct radeon_cmdbuf **initial_preamble_cs, + struct radeon_cmdbuf **continue_preamble_cs) { - RADV_FROM_HANDLE(radv_queue, queue, _queue); - RADV_FROM_HANDLE(radv_fence, fence, _fence); - struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; - struct radeon_winsys_ctx *ctx = queue->hw_ctx; - int ret; - uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; - uint32_t scratch_size = 0; - uint32_t compute_scratch_size = 0; + uint32_t scratch_size_per_wave = 0, waves_wanted = 0; + uint32_t compute_scratch_size_per_wave = 0, compute_waves_wanted = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; - struct radeon_cmdbuf *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL; - VkResult result; - bool fence_emitted = false; bool tess_rings_needed = false; + bool gds_needed = false; + bool gds_oa_needed = false; bool sample_positions_needed = false; - /* Do this first so failing to allocate scratch buffers can't result in - * partially executed submissions. */ - for (uint32_t i = 0; i < submitCount; i++) { - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, - pSubmits[i].pCommandBuffers[j]); - - scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed); - compute_scratch_size = MAX2(compute_scratch_size, - cmd_buffer->compute_scratch_size_needed); - esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); - gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); - tess_rings_needed |= cmd_buffer->tess_rings_needed; - sample_positions_needed |= cmd_buffer->sample_positions_needed; + for (uint32_t j = 0; j < cmd_buffer_count; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, + cmd_buffers[j]); + + scratch_size_per_wave = MAX2(scratch_size_per_wave, cmd_buffer->scratch_size_per_wave_needed); + waves_wanted = MAX2(waves_wanted, cmd_buffer->scratch_waves_wanted); + compute_scratch_size_per_wave = MAX2(compute_scratch_size_per_wave, + cmd_buffer->compute_scratch_size_per_wave_needed); + compute_waves_wanted = MAX2(compute_waves_wanted, + cmd_buffer->compute_scratch_waves_wanted); + esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed); + gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); + tess_rings_needed |= cmd_buffer->tess_rings_needed; + gds_needed |= cmd_buffer->gds_needed; + gds_oa_needed |= cmd_buffer->gds_oa_needed; + sample_positions_needed |= cmd_buffer->sample_positions_needed; + } + + return radv_get_preamble_cs(queue, scratch_size_per_wave, waves_wanted, + compute_scratch_size_per_wave, compute_waves_wanted, + esgs_ring_size, gsvs_ring_size, tess_rings_needed, + gds_needed, gds_oa_needed, sample_positions_needed, + initial_full_flush_preamble_cs, + initial_preamble_cs, continue_preamble_cs); +} + +struct radv_deferred_queue_submission { + struct radv_queue *queue; + VkCommandBuffer *cmd_buffers; + uint32_t cmd_buffer_count; + + /* Sparse bindings that happen on a queue. */ + VkSparseBufferMemoryBindInfo *buffer_binds; + uint32_t buffer_bind_count; + VkSparseImageOpaqueMemoryBindInfo *image_opaque_binds; + uint32_t image_opaque_bind_count; + + bool flush_caches; + VkShaderStageFlags wait_dst_stage_mask; + struct radv_semaphore_part **wait_semaphores; + uint32_t wait_semaphore_count; + struct radv_semaphore_part **signal_semaphores; + uint32_t signal_semaphore_count; + VkFence fence; + + uint64_t *wait_values; + uint64_t *signal_values; + + struct radv_semaphore_part *temporary_semaphore_parts; + uint32_t temporary_semaphore_part_count; + + struct list_head queue_pending_list; + uint32_t submission_wait_count; + struct radv_timeline_waiter *wait_nodes; + + struct list_head processing_list; +}; + +struct radv_queue_submission { + const VkCommandBuffer *cmd_buffers; + uint32_t cmd_buffer_count; + + /* Sparse bindings that happen on a queue. */ + const VkSparseBufferMemoryBindInfo *buffer_binds; + uint32_t buffer_bind_count; + const VkSparseImageOpaqueMemoryBindInfo *image_opaque_binds; + uint32_t image_opaque_bind_count; + + bool flush_caches; + VkPipelineStageFlags wait_dst_stage_mask; + const VkSemaphore *wait_semaphores; + uint32_t wait_semaphore_count; + const VkSemaphore *signal_semaphores; + uint32_t signal_semaphore_count; + VkFence fence; + + const uint64_t *wait_values; + uint32_t wait_value_count; + const uint64_t *signal_values; + uint32_t signal_value_count; +}; + +static VkResult +radv_create_deferred_submission(struct radv_queue *queue, + const struct radv_queue_submission *submission, + struct radv_deferred_queue_submission **out) +{ + struct radv_deferred_queue_submission *deferred = NULL; + size_t size = sizeof(struct radv_deferred_queue_submission); + + uint32_t temporary_count = 0; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->wait_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) + ++temporary_count; + } + + size += submission->cmd_buffer_count * sizeof(VkCommandBuffer); + size += submission->buffer_bind_count * sizeof(VkSparseBufferMemoryBindInfo); + size += submission->image_opaque_bind_count * sizeof(VkSparseImageOpaqueMemoryBindInfo); + size += submission->wait_semaphore_count * sizeof(struct radv_semaphore_part *); + size += temporary_count * sizeof(struct radv_semaphore_part); + size += submission->signal_semaphore_count * sizeof(struct radv_semaphore_part *); + size += submission->wait_value_count * sizeof(uint64_t); + size += submission->signal_value_count * sizeof(uint64_t); + size += submission->wait_semaphore_count * sizeof(struct radv_timeline_waiter); + + deferred = calloc(1, size); + if (!deferred) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + deferred->queue = queue; + + deferred->cmd_buffers = (void*)(deferred + 1); + deferred->cmd_buffer_count = submission->cmd_buffer_count; + memcpy(deferred->cmd_buffers, submission->cmd_buffers, + submission->cmd_buffer_count * sizeof(*deferred->cmd_buffers)); + + deferred->buffer_binds = (void*)(deferred->cmd_buffers + submission->cmd_buffer_count); + deferred->buffer_bind_count = submission->buffer_bind_count; + memcpy(deferred->buffer_binds, submission->buffer_binds, + submission->buffer_bind_count * sizeof(*deferred->buffer_binds)); + + deferred->image_opaque_binds = (void*)(deferred->buffer_binds + submission->buffer_bind_count); + deferred->image_opaque_bind_count = submission->image_opaque_bind_count; + memcpy(deferred->image_opaque_binds, submission->image_opaque_binds, + submission->image_opaque_bind_count * sizeof(*deferred->image_opaque_binds)); + + deferred->flush_caches = submission->flush_caches; + deferred->wait_dst_stage_mask = submission->wait_dst_stage_mask; + + deferred->wait_semaphores = (void*)(deferred->image_opaque_binds + deferred->image_opaque_bind_count); + deferred->wait_semaphore_count = submission->wait_semaphore_count; + + deferred->signal_semaphores = (void*)(deferred->wait_semaphores + deferred->wait_semaphore_count); + deferred->signal_semaphore_count = submission->signal_semaphore_count; + + deferred->fence = submission->fence; + + deferred->temporary_semaphore_parts = (void*)(deferred->signal_semaphores + deferred->signal_semaphore_count); + deferred->temporary_semaphore_part_count = temporary_count; + + uint32_t temporary_idx = 0; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->wait_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) { + deferred->wait_semaphores[i] = &deferred->temporary_semaphore_parts[temporary_idx]; + deferred->temporary_semaphore_parts[temporary_idx] = semaphore->temporary; + semaphore->temporary.kind = RADV_SEMAPHORE_NONE; + ++temporary_idx; + } else + deferred->wait_semaphores[i] = &semaphore->permanent; + } + + for (uint32_t i = 0; i < submission->signal_semaphore_count; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, submission->signal_semaphores[i]); + if (semaphore->temporary.kind != RADV_SEMAPHORE_NONE) { + deferred->signal_semaphores[i] = &semaphore->temporary; + } else { + deferred->signal_semaphores[i] = &semaphore->permanent; } } - result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, - esgs_ring_size, gsvs_ring_size, tess_rings_needed, - sample_positions_needed, &initial_flush_preamble_cs, - &initial_preamble_cs, &continue_preamble_cs); - if (result != VK_SUCCESS) - return result; + deferred->wait_values = (void*)(deferred->temporary_semaphore_parts + temporary_count); + memcpy(deferred->wait_values, submission->wait_values, submission->wait_value_count * sizeof(uint64_t)); + deferred->signal_values = deferred->wait_values + submission->wait_value_count; + memcpy(deferred->signal_values, submission->signal_values, submission->signal_value_count * sizeof(uint64_t)); - for (uint32_t i = 0; i < submitCount; i++) { - struct radeon_cmdbuf **cs_array; - bool do_flush = !i || pSubmits[i].pWaitDstStageMask; - bool can_patch = true; - uint32_t advance; - struct radv_winsys_sem_info sem_info; - - result = radv_alloc_sem_info(queue->device->instance, - &sem_info, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pWaitSemaphores, - pSubmits[i].signalSemaphoreCount, - pSubmits[i].pSignalSemaphores, - _fence); - if (result != VK_SUCCESS) - return result; + deferred->wait_nodes = (void*)(deferred->signal_values + submission->signal_value_count); + /* This is worst-case. radv_queue_enqueue_submission will fill in further, but this + * ensure the submission is not accidentally triggered early when adding wait timelines. */ + deferred->submission_wait_count = 1 + submission->wait_semaphore_count; - if (!pSubmits[i].commandBufferCount) { - if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) { - ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, - &sem_info, NULL, - false, base_fence); - if (ret) { - radv_loge("failed to submit CS %d\n", i); - abort(); - } - fence_emitted = true; + *out = deferred; + return VK_SUCCESS; +} + +static void +radv_queue_enqueue_submission(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + uint32_t wait_cnt = 0; + struct radv_timeline_waiter *waiter = submission->wait_nodes; + for (uint32_t i = 0; i < submission->wait_semaphore_count; ++i) { + if (submission->wait_semaphores[i]->kind == RADV_SEMAPHORE_TIMELINE) { + pthread_mutex_lock(&submission->wait_semaphores[i]->timeline.mutex); + if (submission->wait_semaphores[i]->timeline.highest_submitted < submission->wait_values[i]) { + ++wait_cnt; + waiter->value = submission->wait_values[i]; + waiter->submission = submission; + list_addtail(&waiter->list, &submission->wait_semaphores[i]->timeline.waiters); + ++waiter; } - radv_free_sem_info(&sem_info); - continue; + pthread_mutex_unlock(&submission->wait_semaphores[i]->timeline.mutex); } + } - cs_array = malloc(sizeof(struct radeon_cmdbuf *) * - (pSubmits[i].commandBufferCount)); + pthread_mutex_lock(&submission->queue->pending_mutex); - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, - pSubmits[i].pCommandBuffers[j]); + bool is_first = list_is_empty(&submission->queue->pending_submissions); + list_addtail(&submission->queue_pending_list, &submission->queue->pending_submissions); + + pthread_mutex_unlock(&submission->queue->pending_mutex); + + /* If there is already a submission in the queue, that will decrement the counter by 1 when + * submitted, but if the queue was empty, we decrement ourselves as there is no previous + * submission. */ + uint32_t decrement = submission->wait_semaphore_count - wait_cnt + (is_first ? 1 : 0); + if (__atomic_sub_fetch(&submission->submission_wait_count, decrement, __ATOMIC_ACQ_REL) == 0) { + list_addtail(&submission->processing_list, processing_list); + } +} + +static void +radv_queue_submission_update_queue(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + pthread_mutex_lock(&submission->queue->pending_mutex); + list_del(&submission->queue_pending_list); + + /* trigger the next submission in the queue. */ + if (!list_is_empty(&submission->queue->pending_submissions)) { + struct radv_deferred_queue_submission *next_submission = + list_first_entry(&submission->queue->pending_submissions, + struct radv_deferred_queue_submission, + queue_pending_list); + if (p_atomic_dec_zero(&next_submission->submission_wait_count)) { + list_addtail(&next_submission->processing_list, processing_list); + } + } + pthread_mutex_unlock(&submission->queue->pending_mutex); + + pthread_cond_broadcast(&submission->queue->device->timeline_cond); +} + +static VkResult +radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission, + struct list_head *processing_list) +{ + RADV_FROM_HANDLE(radv_fence, fence, submission->fence); + struct radv_queue *queue = submission->queue; + struct radeon_winsys_ctx *ctx = queue->hw_ctx; + uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; + struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; + bool do_flush = submission->flush_caches || submission->wait_dst_stage_mask; + bool can_patch = true; + uint32_t advance; + struct radv_winsys_sem_info sem_info; + VkResult result; + int ret; + struct radeon_cmdbuf *initial_preamble_cs = NULL; + struct radeon_cmdbuf *initial_flush_preamble_cs = NULL; + struct radeon_cmdbuf *continue_preamble_cs = NULL; + + result = radv_get_preambles(queue, submission->cmd_buffers, + submission->cmd_buffer_count, + &initial_preamble_cs, + &initial_flush_preamble_cs, + &continue_preamble_cs); + if (result != VK_SUCCESS) + goto fail; + + result = radv_alloc_sem_info(queue->device, + &sem_info, + submission->wait_semaphore_count, + submission->wait_semaphores, + submission->wait_values, + submission->signal_semaphore_count, + submission->signal_semaphores, + submission->signal_values, + submission->fence); + if (result != VK_SUCCESS) + goto fail; + + for (uint32_t i = 0; i < submission->buffer_bind_count; ++i) { + radv_sparse_buffer_bind_memory(queue->device, + submission->buffer_binds + i); + } + + for (uint32_t i = 0; i < submission->image_opaque_bind_count; ++i) { + radv_sparse_image_opaque_bind_memory(queue->device, + submission->image_opaque_binds + i); + } + + if (!submission->cmd_buffer_count) { + ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, + &queue->device->empty_cs[queue->queue_family_index], + 1, NULL, NULL, + &sem_info, NULL, + false, base_fence); + if (ret) { + radv_loge("failed to submit CS\n"); + abort(); + } + + goto success; + } else { + struct radeon_cmdbuf **cs_array = malloc(sizeof(struct radeon_cmdbuf *) * + (submission->cmd_buffer_count)); + + for (uint32_t j = 0; j < submission->cmd_buffer_count; j++) { + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, submission->cmd_buffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); cs_array[j] = cmd_buffer->cs; @@ -3174,18 +4571,18 @@ cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING; } - for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) { + for (uint32_t j = 0; j < submission->cmd_buffer_count; j += advance) { struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs; const struct radv_winsys_bo_list *bo_list = NULL; advance = MIN2(max_cs_submission, - pSubmits[i].commandBufferCount - j); + submission->cmd_buffer_count - j); if (queue->device->trace_bo) *queue->device->trace_id_ptr = 0; sem_info.cs_emit_wait = j == 0; - sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount; + sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count; if (unlikely(queue->device->use_global_bo_list)) { pthread_mutex_lock(&queue->device->bo_list.mutex); @@ -3193,36 +4590,155 @@ } ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, - advance, initial_preamble, continue_preamble_cs, - &sem_info, bo_list, - can_patch, base_fence); + advance, initial_preamble, continue_preamble_cs, + &sem_info, bo_list, + can_patch, base_fence); if (unlikely(queue->device->use_global_bo_list)) pthread_mutex_unlock(&queue->device->bo_list.mutex); - if (ret) { - radv_loge("failed to submit CS %d\n", i); - abort(); - } - fence_emitted = true; - if (queue->device->trace_bo) { - radv_check_gpu_hangs(queue, cs_array[j]); - } - } + if (ret) { + radv_loge("failed to submit CS\n"); + abort(); + } + if (queue->device->trace_bo) { + radv_check_gpu_hangs(queue, cs_array[j]); + } + } + + free(cs_array); + } + +success: + radv_free_temp_syncobjs(queue->device, + submission->temporary_semaphore_part_count, + submission->temporary_semaphore_parts); + radv_finalize_timelines(queue->device, + submission->wait_semaphore_count, + submission->wait_semaphores, + submission->wait_values, + submission->signal_semaphore_count, + submission->signal_semaphores, + submission->signal_values, + processing_list); + /* Has to happen after timeline finalization to make sure the + * condition variable is only triggered when timelines and queue have + * been updated. */ + radv_queue_submission_update_queue(submission, processing_list); + radv_free_sem_info(&sem_info); + free(submission); + return VK_SUCCESS; + +fail: + radv_free_temp_syncobjs(queue->device, + submission->temporary_semaphore_part_count, + submission->temporary_semaphore_parts); + free(submission); + return VK_ERROR_DEVICE_LOST; +} + +static VkResult +radv_process_submissions(struct list_head *processing_list) +{ + while(!list_is_empty(processing_list)) { + struct radv_deferred_queue_submission *submission = + list_first_entry(processing_list, struct radv_deferred_queue_submission, processing_list); + list_del(&submission->processing_list); + + VkResult result = radv_queue_submit_deferred(submission, processing_list); + if (result != VK_SUCCESS) + return result; + } + return VK_SUCCESS; +} + +static VkResult radv_queue_submit(struct radv_queue *queue, + const struct radv_queue_submission *submission) +{ + struct radv_deferred_queue_submission *deferred = NULL; + + VkResult result = radv_create_deferred_submission(queue, submission, &deferred); + if (result != VK_SUCCESS) + return result; + + struct list_head processing_list; + list_inithead(&processing_list); + + radv_queue_enqueue_submission(deferred, &processing_list); + return radv_process_submissions(&processing_list); +} + +/* Signals fence as soon as all the work currently put on queue is done. */ +static VkResult radv_signal_fence(struct radv_queue *queue, + VkFence fence) +{ + return radv_queue_submit(queue, &(struct radv_queue_submission) { + .fence = fence + }); +} + +static bool radv_submit_has_effects(const VkSubmitInfo *info) +{ + return info->commandBufferCount || + info->waitSemaphoreCount || + info->signalSemaphoreCount; +} + +VkResult radv_QueueSubmit( + VkQueue _queue, + uint32_t submitCount, + const VkSubmitInfo* pSubmits, + VkFence fence) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + VkResult result; + uint32_t fence_idx = 0; + bool flushed_caches = false; + + if (fence != VK_NULL_HANDLE) { + for (uint32_t i = 0; i < submitCount; ++i) + if (radv_submit_has_effects(pSubmits + i)) + fence_idx = i; + } else + fence_idx = UINT32_MAX; + + for (uint32_t i = 0; i < submitCount; i++) { + if (!radv_submit_has_effects(pSubmits + i) && fence_idx != i) + continue; + + VkPipelineStageFlags wait_dst_stage_mask = 0; + for (unsigned j = 0; j < pSubmits[i].waitSemaphoreCount; ++j) { + wait_dst_stage_mask |= pSubmits[i].pWaitDstStageMask[j]; + } + + const VkTimelineSemaphoreSubmitInfo *timeline_info = + vk_find_struct_const(pSubmits[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO); + + result = radv_queue_submit(queue, &(struct radv_queue_submission) { + .cmd_buffers = pSubmits[i].pCommandBuffers, + .cmd_buffer_count = pSubmits[i].commandBufferCount, + .wait_dst_stage_mask = wait_dst_stage_mask, + .flush_caches = !flushed_caches, + .wait_semaphores = pSubmits[i].pWaitSemaphores, + .wait_semaphore_count = pSubmits[i].waitSemaphoreCount, + .signal_semaphores = pSubmits[i].pSignalSemaphores, + .signal_semaphore_count = pSubmits[i].signalSemaphoreCount, + .fence = i == fence_idx ? fence : VK_NULL_HANDLE, + .wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL, + .wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0, + .signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL, + .signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? timeline_info->signalSemaphoreValueCount : 0, + }); + if (result != VK_SUCCESS) + return result; - radv_free_temp_syncobjs(queue->device, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pWaitSemaphores); - radv_free_sem_info(&sem_info); - free(cs_array); + flushed_caches = true; } - if (fence) { - if (!fence_emitted) { - result = radv_signal_fence(queue, fence); - if (result != VK_SUCCESS) - return result; - } + if (fence != VK_NULL_HANDLE && !submitCount) { + result = radv_signal_fence(queue, fence); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -3233,6 +4749,12 @@ { RADV_FROM_HANDLE(radv_queue, queue, _queue); + pthread_mutex_lock(&queue->pending_mutex); + while (!list_is_empty(&queue->pending_submissions)) { + pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex); + } + pthread_mutex_unlock(&queue->pending_mutex); + queue->device->ws->ctx_wait_idle(queue->hw_ctx, radv_queue_family_to_ring(queue->queue_family_index), queue->queue_idx); @@ -3295,11 +4817,16 @@ const char* pName) { RADV_FROM_HANDLE(radv_instance, instance, _instance); + bool unchecked = instance ? instance->debug_flags & RADV_DEBUG_ALL_ENTRYPOINTS : false; - return radv_lookup_entrypoint_checked(pName, - instance ? instance->apiVersion : 0, - instance ? &instance->enabled_extensions : NULL, - NULL); + if (unchecked) { + return radv_lookup_entrypoint_unchecked(pName); + } else { + return radv_lookup_entrypoint_checked(pName, + instance ? instance->apiVersion : 0, + instance ? &instance->enabled_extensions : NULL, + NULL); + } } /* The loader wants us to expose a second GetInstanceProcAddr function @@ -3340,11 +4867,16 @@ const char* pName) { RADV_FROM_HANDLE(radv_device, device, _device); + bool unchecked = device ? device->instance->debug_flags & RADV_DEBUG_ALL_ENTRYPOINTS : false; - return radv_lookup_entrypoint_checked(pName, - device->instance->apiVersion, - &device->instance->enabled_extensions, - &device->enabled_extensions); + if (unchecked) { + return radv_lookup_entrypoint_unchecked(pName); + } else { + return radv_lookup_entrypoint_checked(pName, + device->instance->apiVersion, + &device->instance->enabled_extensions, + &device->enabled_extensions); + } } bool radv_get_memory_fd(struct radv_device *device, @@ -3362,6 +4894,28 @@ pFD); } + +static void radv_free_memory(struct radv_device *device, + const VkAllocationCallbacks* pAllocator, + struct radv_device_memory *mem) +{ + if (mem == NULL) + return; + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + if (mem->android_hardware_buffer) + AHardwareBuffer_release(mem->android_hardware_buffer); +#endif + + if (mem->bo) { + radv_bo_list_remove(device, mem->bo); + device->ws->buffer_destroy(mem->bo); + mem->bo = NULL; + } + + vk_free2(&device->alloc, pAllocator, mem); +} + static VkResult radv_alloc_memory(struct radv_device *device, const VkMemoryAllocateInfo* pAllocateInfo, const VkAllocationCallbacks* pAllocator, @@ -3375,25 +4929,29 @@ assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - if (pAllocateInfo->allocationSize == 0) { - /* Apparently, this is allowed */ - *pMem = VK_NULL_HANDLE; - return VK_SUCCESS; - } - const VkImportMemoryFdInfoKHR *import_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); const VkMemoryDedicatedAllocateInfo *dedicate_info = vk_find_struct_const(pAllocateInfo->pNext, MEMORY_DEDICATED_ALLOCATE_INFO); const VkExportMemoryAllocateInfo *export_info = vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + const struct VkImportAndroidHardwareBufferInfoANDROID *ahb_import_info = + vk_find_struct_const(pAllocateInfo->pNext, + IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID); const VkImportMemoryHostPointerInfoEXT *host_ptr_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_HOST_POINTER_INFO_EXT); const struct wsi_memory_allocate_info *wsi_info = vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA); - mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, + if (pAllocateInfo->allocationSize == 0 && !ahb_import_info && + !(export_info && (export_info->handleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID))) { + /* Apparently, this is allowed */ + *pMem = VK_NULL_HANDLE; + return VK_SUCCESS; + } + + mem = vk_zalloc2(&device->alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (mem == NULL) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -3420,14 +4978,27 @@ (int)(priority_float * RADV_BO_PRIORITY_APPLICATION_MAX)); mem->user_ptr = NULL; + mem->bo = NULL; + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + mem->android_hardware_buffer = NULL; +#endif - if (import_info) { + if (ahb_import_info) { + result = radv_import_ahb_memory(device, mem, priority, ahb_import_info); + if (result != VK_SUCCESS) + goto fail; + } else if(export_info && (export_info->handleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID)) { + result = radv_create_ahb_memory(device, mem, priority, pAllocateInfo); + if (result != VK_SUCCESS) + goto fail; + } else if (import_info) { assert(import_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || import_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); mem->bo = device->ws->buffer_from_fd(device->ws, import_info->fd, - priority, NULL, NULL); + priority, NULL); if (!mem->bo) { result = VK_ERROR_INVALID_EXTERNAL_HANDLE; goto fail; @@ -3436,7 +5007,7 @@ } } else if (host_ptr_info) { assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT); - assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED); + assert(radv_is_mem_type_gtt_cached(mem_type_index)); mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer, pAllocateInfo->allocationSize, priority); @@ -3448,18 +5019,18 @@ } } else { uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096); - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE || - mem_type_index == RADV_MEM_TYPE_GTT_CACHED) + if (radv_is_mem_type_gtt_wc(mem_type_index) || + radv_is_mem_type_gtt_cached(mem_type_index)) domain = RADEON_DOMAIN_GTT; else domain = RADEON_DOMAIN_VRAM; - if (mem_type_index == RADV_MEM_TYPE_VRAM) + if (radv_is_mem_type_vram(mem_type_index)) flags |= RADEON_FLAG_NO_CPU_ACCESS; else flags |= RADEON_FLAG_CPU_ACCESS; - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE) + if (radv_is_mem_type_gtt_wc(mem_type_index)) flags |= RADEON_FLAG_GTT_WC; if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes)) { @@ -3469,6 +5040,11 @@ } } + if (radv_is_mem_type_uncached(mem_type_index)) { + assert(device->physical_device->rad_info.has_l2_uncached); + flags |= RADEON_FLAG_VA_UNCACHED; + } + mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment, domain, flags, priority); @@ -3481,16 +5057,14 @@ result = radv_bo_list_add(device, mem->bo); if (result != VK_SUCCESS) - goto fail_bo; + goto fail; *pMem = radv_device_memory_to_handle(mem); return VK_SUCCESS; -fail_bo: - device->ws->buffer_destroy(mem->bo); fail: - vk_free2(&device->alloc, pAllocator, mem); + radv_free_memory(device, pAllocator,mem); return result; } @@ -3513,14 +5087,7 @@ RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_device_memory, mem, _mem); - if (mem == NULL) - return; - - radv_bo_list_remove(device, mem->bo); - device->ws->buffer_destroy(mem->bo); - mem->bo = NULL; - - vk_free2(&device->alloc, pAllocator, mem); + radv_free_memory(device, pAllocator, mem); } VkResult radv_MapMemory( @@ -3758,107 +5325,63 @@ return radv_BindImageMemory2(device, 1, &info); } - -static void -radv_sparse_buffer_bind_memory(struct radv_device *device, - const VkSparseBufferMemoryBindInfo *bind) -{ - RADV_FROM_HANDLE(radv_buffer, buffer, bind->buffer); - - for (uint32_t i = 0; i < bind->bindCount; ++i) { - struct radv_device_memory *mem = NULL; - - if (bind->pBinds[i].memory != VK_NULL_HANDLE) - mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - - device->ws->buffer_virtual_bind(buffer->bo, - bind->pBinds[i].resourceOffset, - bind->pBinds[i].size, - mem ? mem->bo : NULL, - bind->pBinds[i].memoryOffset); - } -} - -static void -radv_sparse_image_opaque_bind_memory(struct radv_device *device, - const VkSparseImageOpaqueMemoryBindInfo *bind) +static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info) { - RADV_FROM_HANDLE(radv_image, image, bind->image); - - for (uint32_t i = 0; i < bind->bindCount; ++i) { - struct radv_device_memory *mem = NULL; - - if (bind->pBinds[i].memory != VK_NULL_HANDLE) - mem = radv_device_memory_from_handle(bind->pBinds[i].memory); - - device->ws->buffer_virtual_bind(image->bo, - bind->pBinds[i].resourceOffset, - bind->pBinds[i].size, - mem ? mem->bo : NULL, - bind->pBinds[i].memoryOffset); - } + return info->bufferBindCount || + info->imageOpaqueBindCount || + info->imageBindCount || + info->waitSemaphoreCount || + info->signalSemaphoreCount; } VkResult radv_QueueBindSparse( VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo* pBindInfo, - VkFence _fence) + VkFence fence) { - RADV_FROM_HANDLE(radv_fence, fence, _fence); RADV_FROM_HANDLE(radv_queue, queue, _queue); - struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; - bool fence_emitted = false; VkResult result; - int ret; + uint32_t fence_idx = 0; - for (uint32_t i = 0; i < bindInfoCount; ++i) { - struct radv_winsys_sem_info sem_info; - for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) { - radv_sparse_buffer_bind_memory(queue->device, - pBindInfo[i].pBufferBinds + j); - } - - for (uint32_t j = 0; j < pBindInfo[i].imageOpaqueBindCount; ++j) { - radv_sparse_image_opaque_bind_memory(queue->device, - pBindInfo[i].pImageOpaqueBinds + j); - } - - VkResult result; - result = radv_alloc_sem_info(queue->device->instance, - &sem_info, - pBindInfo[i].waitSemaphoreCount, - pBindInfo[i].pWaitSemaphores, - pBindInfo[i].signalSemaphoreCount, - pBindInfo[i].pSignalSemaphores, - _fence); - if (result != VK_SUCCESS) - return result; + if (fence != VK_NULL_HANDLE) { + for (uint32_t i = 0; i < bindInfoCount; ++i) + if (radv_sparse_bind_has_effects(pBindInfo + i)) + fence_idx = i; + } else + fence_idx = UINT32_MAX; - if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) { - ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], - 1, NULL, NULL, - &sem_info, NULL, - false, base_fence); - if (ret) { - radv_loge("failed to submit CS %d\n", i); - abort(); - } + for (uint32_t i = 0; i < bindInfoCount; ++i) { + if (i != fence_idx && !radv_sparse_bind_has_effects(pBindInfo + i)) + continue; - fence_emitted = true; - } + const VkTimelineSemaphoreSubmitInfo *timeline_info = + vk_find_struct_const(pBindInfo[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO); - radv_free_sem_info(&sem_info); + VkResult result = radv_queue_submit(queue, &(struct radv_queue_submission) { + .buffer_binds = pBindInfo[i].pBufferBinds, + .buffer_bind_count = pBindInfo[i].bufferBindCount, + .image_opaque_binds = pBindInfo[i].pImageOpaqueBinds, + .image_opaque_bind_count = pBindInfo[i].imageOpaqueBindCount, + .wait_semaphores = pBindInfo[i].pWaitSemaphores, + .wait_semaphore_count = pBindInfo[i].waitSemaphoreCount, + .signal_semaphores = pBindInfo[i].pSignalSemaphores, + .signal_semaphore_count = pBindInfo[i].signalSemaphoreCount, + .fence = i == fence_idx ? fence : VK_NULL_HANDLE, + .wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL, + .wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0, + .signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL, + .signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? timeline_info->signalSemaphoreValueCount : 0, + }); + if (result != VK_SUCCESS) + return result; } - if (fence) { - if (!fence_emitted) { - result = radv_signal_fence(queue, fence); - if (result != VK_SUCCESS) - return result; - } + if (fence != VK_NULL_HANDLE && !bindInfoCount) { + result = radv_signal_fence(queue, fence); + if (result != VK_SUCCESS) + return result; } return VK_SUCCESS; @@ -4137,6 +5660,197 @@ // Queue semaphore functions +static void +radv_create_timeline(struct radv_timeline *timeline, uint64_t value) +{ + timeline->highest_signaled = value; + timeline->highest_submitted = value; + list_inithead(&timeline->points); + list_inithead(&timeline->free_points); + list_inithead(&timeline->waiters); + pthread_mutex_init(&timeline->mutex, NULL); +} + +static void +radv_destroy_timeline(struct radv_device *device, + struct radv_timeline *timeline) +{ + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->free_points, list) { + list_del(&point->list); + device->ws->destroy_syncobj(device->ws, point->syncobj); + free(point); + } + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->points, list) { + list_del(&point->list); + device->ws->destroy_syncobj(device->ws, point->syncobj); + free(point); + } + pthread_mutex_destroy(&timeline->mutex); +} + +static void +radv_timeline_gc_locked(struct radv_device *device, + struct radv_timeline *timeline) +{ + list_for_each_entry_safe(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->wait_count || point->value > timeline->highest_submitted) + return; + + if (device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, 0)) { + timeline->highest_signaled = point->value; + list_del(&point->list); + list_add(&point->list, &timeline->free_points); + } + } +} + +static struct radv_timeline_point * +radv_timeline_find_point_at_least_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p) +{ + radv_timeline_gc_locked(device, timeline); + + if (p <= timeline->highest_signaled) + return NULL; + + list_for_each_entry(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->value >= p) { + ++point->wait_count; + return point; + } + } + return NULL; +} + +static struct radv_timeline_point * +radv_timeline_add_point_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t p) +{ + radv_timeline_gc_locked(device, timeline); + + struct radv_timeline_point *ret = NULL; + struct radv_timeline_point *prev = NULL; + + if (p <= timeline->highest_signaled) + return NULL; + + list_for_each_entry(struct radv_timeline_point, point, + &timeline->points, list) { + if (point->value == p) { + return NULL; + } + + if (point->value < p) + prev = point; + } + + if (list_is_empty(&timeline->free_points)) { + ret = malloc(sizeof(struct radv_timeline_point)); + device->ws->create_syncobj(device->ws, &ret->syncobj); + } else { + ret = list_first_entry(&timeline->free_points, struct radv_timeline_point, list); + list_del(&ret->list); + + device->ws->reset_syncobj(device->ws, ret->syncobj); + } + + ret->value = p; + ret->wait_count = 1; + + if (prev) { + list_add(&ret->list, &prev->list); + } else { + list_addtail(&ret->list, &timeline->points); + } + return ret; +} + + +static VkResult +radv_timeline_wait_locked(struct radv_device *device, + struct radv_timeline *timeline, + uint64_t value, + uint64_t abs_timeout) +{ + while(timeline->highest_submitted < value) { + struct timespec abstime; + timespec_from_nsec(&abstime, abs_timeout); + + pthread_cond_timedwait(&device->timeline_cond, &timeline->mutex, &abstime); + + if (radv_get_current_time() >= abs_timeout && timeline->highest_submitted < value) + return VK_TIMEOUT; + } + + struct radv_timeline_point *point = radv_timeline_find_point_at_least_locked(device, timeline, value); + if (!point) + return VK_SUCCESS; + + pthread_mutex_unlock(&timeline->mutex); + + bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout); + + pthread_mutex_lock(&timeline->mutex); + point->wait_count--; + return success ? VK_SUCCESS : VK_TIMEOUT; +} + +static void +radv_timeline_trigger_waiters_locked(struct radv_timeline *timeline, + struct list_head *processing_list) +{ + list_for_each_entry_safe(struct radv_timeline_waiter, waiter, + &timeline->waiters, list) { + if (waiter->value > timeline->highest_submitted) + continue; + + if (p_atomic_dec_zero(&waiter->submission->submission_wait_count)) { + list_addtail(&waiter->submission->processing_list, processing_list); + } + list_del(&waiter->list); + } +} + +static +void radv_destroy_semaphore_part(struct radv_device *device, + struct radv_semaphore_part *part) +{ + switch(part->kind) { + case RADV_SEMAPHORE_NONE: + break; + case RADV_SEMAPHORE_WINSYS: + device->ws->destroy_sem(part->ws_sem); + break; + case RADV_SEMAPHORE_TIMELINE: + radv_destroy_timeline(device, &part->timeline); + break; + case RADV_SEMAPHORE_SYNCOBJ: + device->ws->destroy_syncobj(device->ws, part->syncobj); + break; + } + part->kind = RADV_SEMAPHORE_NONE; +} + +static VkSemaphoreTypeKHR +radv_get_semaphore_type(const void *pNext, uint64_t *initial_value) +{ + const VkSemaphoreTypeCreateInfo *type_info = + vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO); + + if (!type_info) + return VK_SEMAPHORE_TYPE_BINARY; + + if (initial_value) + *initial_value = type_info->initialValue; + return type_info->semaphoreType; +} + VkResult radv_CreateSemaphore( VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo, @@ -4148,6 +5862,8 @@ vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); VkExternalSemaphoreHandleTypeFlags handleTypes = export ? export->handleTypes : 0; + uint64_t initial_value = 0; + VkSemaphoreTypeKHR type = radv_get_semaphore_type(pCreateInfo->pNext, &initial_value); struct radv_semaphore *sem = vk_alloc2(&device->alloc, pAllocator, sizeof(*sem), 8, @@ -4155,23 +5871,27 @@ if (!sem) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - sem->temp_syncobj = 0; - /* create a syncobject if we are going to export this semaphore */ - if (device->always_use_syncobj || handleTypes) { + sem->temporary.kind = RADV_SEMAPHORE_NONE; + sem->permanent.kind = RADV_SEMAPHORE_NONE; + + if (type == VK_SEMAPHORE_TYPE_TIMELINE) { + radv_create_timeline(&sem->permanent.timeline, initial_value); + sem->permanent.kind = RADV_SEMAPHORE_TIMELINE; + } else if (device->always_use_syncobj || handleTypes) { assert (device->physical_device->rad_info.has_syncobj); - int ret = device->ws->create_syncobj(device->ws, &sem->syncobj); + int ret = device->ws->create_syncobj(device->ws, &sem->permanent.syncobj); if (ret) { vk_free2(&device->alloc, pAllocator, sem); return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } - sem->sem = NULL; + sem->permanent.kind = RADV_SEMAPHORE_SYNCOBJ; } else { - sem->sem = device->ws->create_sem(device->ws); - if (!sem->sem) { + sem->permanent.ws_sem = device->ws->create_sem(device->ws); + if (!sem->permanent.ws_sem) { vk_free2(&device->alloc, pAllocator, sem); return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } - sem->syncobj = 0; + sem->permanent.kind = RADV_SEMAPHORE_WINSYS; } *pSemaphore = radv_semaphore_to_handle(sem); @@ -4188,13 +5908,115 @@ if (!_semaphore) return; - if (sem->syncobj) - device->ws->destroy_syncobj(device->ws, sem->syncobj); - else - device->ws->destroy_sem(sem->sem); + radv_destroy_semaphore_part(device, &sem->temporary); + radv_destroy_semaphore_part(device, &sem->permanent); vk_free2(&device->alloc, pAllocator, sem); } +VkResult +radv_GetSemaphoreCounterValue(VkDevice _device, + VkSemaphore _semaphore, + uint64_t* pValue) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore); + + struct radv_semaphore_part *part = + semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent; + + switch (part->kind) { + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&part->timeline.mutex); + radv_timeline_gc_locked(device, &part->timeline); + *pValue = part->timeline.highest_signaled; + pthread_mutex_unlock(&part->timeline.mutex); + return VK_SUCCESS; + } + case RADV_SEMAPHORE_NONE: + case RADV_SEMAPHORE_SYNCOBJ: + case RADV_SEMAPHORE_WINSYS: + unreachable("Invalid semaphore type"); + } + unreachable("Unhandled semaphore type"); +} + + +static VkResult +radv_wait_timelines(struct radv_device *device, + const VkSemaphoreWaitInfo* pWaitInfo, + uint64_t abs_timeout) +{ + if ((pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR) && pWaitInfo->semaphoreCount > 1) { + for (;;) { + for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); + pthread_mutex_lock(&semaphore->permanent.timeline.mutex); + VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], 0); + pthread_mutex_unlock(&semaphore->permanent.timeline.mutex); + + if (result == VK_SUCCESS) + return VK_SUCCESS; + } + if (radv_get_current_time() > abs_timeout) + return VK_TIMEOUT; + } + } + + for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) { + RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); + pthread_mutex_lock(&semaphore->permanent.timeline.mutex); + VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], abs_timeout); + pthread_mutex_unlock(&semaphore->permanent.timeline.mutex); + + if (result != VK_SUCCESS) + return result; + } + return VK_SUCCESS; +} +VkResult +radv_WaitSemaphores(VkDevice _device, + const VkSemaphoreWaitInfo* pWaitInfo, + uint64_t timeout) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + uint64_t abs_timeout = radv_get_absolute_timeout(timeout); + return radv_wait_timelines(device, pWaitInfo, abs_timeout); +} + +VkResult +radv_SignalSemaphore(VkDevice _device, + const VkSemaphoreSignalInfo* pSignalInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_semaphore, semaphore, pSignalInfo->semaphore); + + struct radv_semaphore_part *part = + semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent; + + switch(part->kind) { + case RADV_SEMAPHORE_TIMELINE: { + pthread_mutex_lock(&part->timeline.mutex); + radv_timeline_gc_locked(device, &part->timeline); + part->timeline.highest_submitted = MAX2(part->timeline.highest_submitted, pSignalInfo->value); + part->timeline.highest_signaled = MAX2(part->timeline.highest_signaled, pSignalInfo->value); + + struct list_head processing_list; + list_inithead(&processing_list); + radv_timeline_trigger_waiters_locked(&part->timeline, &processing_list); + pthread_mutex_unlock(&part->timeline.mutex); + + return radv_process_submissions(&processing_list); + } + case RADV_SEMAPHORE_NONE: + case RADV_SEMAPHORE_SYNCOBJ: + case RADV_SEMAPHORE_WINSYS: + unreachable("Invalid semaphore type"); + } + return VK_SUCCESS; +} + + + VkResult radv_CreateEvent( VkDevice _device, const VkEventCreateInfo* pCreateInfo, @@ -4328,15 +6150,27 @@ vk_free2(&device->alloc, pAllocator, buffer); } -VkDeviceAddress radv_GetBufferDeviceAddressEXT( +VkDeviceAddress radv_GetBufferDeviceAddress( VkDevice device, - const VkBufferDeviceAddressInfoEXT* pInfo) + const VkBufferDeviceAddressInfo* pInfo) { RADV_FROM_HANDLE(radv_buffer, buffer, pInfo->buffer); return radv_buffer_get_va(buffer->bo) + buffer->offset; } +uint64_t radv_GetBufferOpaqueCaptureAddress(VkDevice device, + const VkBufferDeviceAddressInfo* pInfo) +{ + return 0; +} + +uint64_t radv_GetDeviceMemoryOpaqueCaptureAddress(VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo) +{ + return 0; +} + static inline unsigned si_tile_mode_index(const struct radv_image_plane *plane, unsigned level, bool stencil) { @@ -4866,9 +6700,9 @@ { RADV_FROM_HANDLE(radv_device, device, _device); struct radv_framebuffer *framebuffer; - const VkFramebufferAttachmentsCreateInfoKHR *imageless_create_info = + const VkFramebufferAttachmentsCreateInfo *imageless_create_info = vk_find_struct_const(pCreateInfo->pNext, - FRAMEBUFFER_ATTACHMENTS_CREATE_INFO_KHR); + FRAMEBUFFER_ATTACHMENTS_CREATE_INFO); assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO); @@ -4886,7 +6720,7 @@ framebuffer->layers = pCreateInfo->layers; if (imageless_create_info) { for (unsigned i = 0; i < imageless_create_info->attachmentImageInfoCount; ++i) { - const VkFramebufferAttachmentImageInfoKHR *attachment = + const VkFramebufferAttachmentImageInfo *attachment = imageless_create_info->pAttachmentImageInfos + i; framebuffer->width = MIN2(framebuffer->width, attachment->width); framebuffer->height = MIN2(framebuffer->height, attachment->height); @@ -5029,7 +6863,7 @@ } static unsigned -radv_tex_filter_mode(VkSamplerReductionModeEXT mode) +radv_tex_filter_mode(VkSamplerReductionMode mode) { switch (mode) { case VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT: @@ -5068,24 +6902,30 @@ bool compat_mode = device->physical_device->rad_info.chip_class == GFX8 || device->physical_device->rad_info.chip_class == GFX9; unsigned filter_mode = V_008F30_SQ_IMG_FILTER_MODE_BLEND; + unsigned depth_compare_func = V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + bool trunc_coord = pCreateInfo->minFilter == VK_FILTER_NEAREST && pCreateInfo->magFilter == VK_FILTER_NEAREST; - const struct VkSamplerReductionModeCreateInfoEXT *sampler_reduction = + const struct VkSamplerReductionModeCreateInfo *sampler_reduction = vk_find_struct_const(pCreateInfo->pNext, - SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT); + SAMPLER_REDUCTION_MODE_CREATE_INFO); if (sampler_reduction) filter_mode = radv_tex_filter_mode(sampler_reduction->reductionMode); + if (pCreateInfo->compareEnable) + depth_compare_func = radv_tex_compare(pCreateInfo->compareOp); + sampler->state[0] = (S_008F30_CLAMP_X(radv_tex_wrap(pCreateInfo->addressModeU)) | S_008F30_CLAMP_Y(radv_tex_wrap(pCreateInfo->addressModeV)) | S_008F30_CLAMP_Z(radv_tex_wrap(pCreateInfo->addressModeW)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | - S_008F30_DEPTH_COMPARE_FUNC(radv_tex_compare(pCreateInfo->compareOp)) | + S_008F30_DEPTH_COMPARE_FUNC(depth_compare_func) | S_008F30_FORCE_UNNORMALIZED(pCreateInfo->unnormalizedCoordinates ? 1 : 0) | S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | S_008F30_DISABLE_CUBE_WRAP(0) | S_008F30_COMPAT_MODE(compat_mode) | - S_008F30_FILTER_MODE(filter_mode)); + S_008F30_FILTER_MODE(filter_mode) | + S_008F30_TRUNC_COORD(trunc_coord)); sampler->state[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(pCreateInfo->minLod, 0, 15), 8)) | S_008F34_MAX_LOD(S_FIXED(CLAMP(pCreateInfo->maxLod, 0, 15), 8)) | S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); @@ -5222,7 +7062,7 @@ switch (handleType) { case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: - pMemoryFdProperties->memoryTypeBits = (1 << RADV_MEM_TYPE_COUNT) - 1; + pMemoryFdProperties->memoryTypeBits = (1 << device->physical_device->memory_properties.memoryTypeCount) - 1; return VK_SUCCESS; default: @@ -5289,22 +7129,34 @@ { RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_semaphore, sem, pImportSemaphoreFdInfo->semaphore); - uint32_t *syncobj_dst = NULL; + VkResult result; + struct radv_semaphore_part *dst = NULL; if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT) { - syncobj_dst = &sem->temp_syncobj; + dst = &sem->temporary; } else { - syncobj_dst = &sem->syncobj; + dst = &sem->permanent; } + uint32_t syncobj = dst->kind == RADV_SEMAPHORE_SYNCOBJ ? dst->syncobj : 0; + switch(pImportSemaphoreFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - return radv_import_opaque_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst); + result = radv_import_opaque_fd(device, pImportSemaphoreFdInfo->fd, &syncobj); + break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - return radv_import_sync_fd(device, pImportSemaphoreFdInfo->fd, syncobj_dst); + result = radv_import_sync_fd(device, pImportSemaphoreFdInfo->fd, &syncobj); + break; default: unreachable("Unhandled semaphore handle type"); } + + if (result == VK_SUCCESS) { + dst->syncobj = syncobj; + dst->kind = RADV_SEMAPHORE_SYNCOBJ; + } + + return result; } VkResult radv_GetSemaphoreFdKHR(VkDevice _device, @@ -5316,10 +7168,13 @@ int ret; uint32_t syncobj_handle; - if (sem->temp_syncobj) - syncobj_handle = sem->temp_syncobj; - else - syncobj_handle = sem->syncobj; + if (sem->temporary.kind != RADV_SEMAPHORE_NONE) { + assert(sem->temporary.kind == RADV_SEMAPHORE_SYNCOBJ); + syncobj_handle = sem->temporary.syncobj; + } else { + assert(sem->permanent.kind == RADV_SEMAPHORE_SYNCOBJ); + syncobj_handle = sem->permanent.syncobj; + } switch(pGetFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: @@ -5328,9 +7183,8 @@ case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: ret = device->ws->export_syncobj_to_sync_file(device->ws, syncobj_handle, pFd); if (!ret) { - if (sem->temp_syncobj) { - close (sem->temp_syncobj); - sem->temp_syncobj = 0; + if (sem->temporary.kind != RADV_SEMAPHORE_NONE) { + radv_destroy_semaphore_part(device, &sem->temporary); } else { device->ws->reset_syncobj(device->ws, syncobj_handle); } @@ -5351,11 +7205,17 @@ VkExternalSemaphoreProperties *pExternalSemaphoreProperties) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); + VkSemaphoreTypeKHR type = radv_get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL); + + if (type == VK_SEMAPHORE_TYPE_TIMELINE) { + pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; + pExternalSemaphoreProperties->compatibleHandleTypes = 0; + pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; /* Require has_syncobj_wait_for_submit for the syncobj signal ioctl introduced at virtually the same time */ - if (pdevice->rad_info.has_syncobj_wait_for_submit && - (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || - pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) { + } else if (pdevice->rad_info.has_syncobj_wait_for_submit && + (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || + pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) { pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT | diff -Nru mesa-19.2.8/src/amd/vulkan/radv_extensions.py mesa-20.0.8/src/amd/vulkan/radv_extensions.py --- mesa-19.2.8/src/amd/vulkan/radv_extensions.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_extensions.py 2020-06-12 01:21:16.000000000 +0000 @@ -31,7 +31,7 @@ from mako.template import Template -MAX_API_VERSION = '1.1.107' +MAX_API_VERSION = '1.2.128' class Extension: def __init__(self, name, ext_version, enable): @@ -50,9 +50,11 @@ # the those extension strings, then tests dEQP-VK.api.info.instance.extensions # and dEQP-VK.api.info.device fail due to the duplicated strings. EXTENSIONS = [ + Extension('VK_ANDROID_external_memory_android_hardware_buffer', 3, 'RADV_SUPPORT_ANDROID_HARDWARE_BUFFER && device->rad_info.has_syncobj_wait_for_submit'), Extension('VK_ANDROID_native_buffer', 5, 'ANDROID && device->rad_info.has_syncobj_wait_for_submit'), - Extension('VK_KHR_16bit_storage', 1, True), + Extension('VK_KHR_16bit_storage', 1, '!device->use_aco'), Extension('VK_KHR_bind_memory2', 1, True), + Extension('VK_KHR_buffer_device_address', 1, True), Extension('VK_KHR_create_renderpass2', 1, True), Extension('VK_KHR_dedicated_allocation', 1, True), Extension('VK_KHR_depth_stencil_resolve', 1, True), @@ -85,13 +87,20 @@ Extension('VK_KHR_relaxed_block_layout', 1, True), Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), - Extension('VK_KHR_shader_atomic_int64', 1, 'HAVE_LLVM >= 0x0900'), + Extension('VK_KHR_separate_depth_stencil_layouts', 1, True), + Extension('VK_KHR_shader_atomic_int64', 1, 'LLVM_VERSION_MAJOR >= 9'), + Extension('VK_KHR_shader_clock', 1, True), Extension('VK_KHR_shader_draw_parameters', 1, True), - Extension('VK_KHR_shader_float16_int8', 1, True), + Extension('VK_KHR_shader_float_controls', 1, True), + Extension('VK_KHR_shader_float16_int8', 1, '!device->use_aco'), + Extension('VK_KHR_shader_subgroup_extended_types', 1, True), + Extension('VK_KHR_spirv_1_4', 1, True), Extension('VK_KHR_storage_buffer_storage_class', 1, True), Extension('VK_KHR_surface', 25, 'RADV_HAS_SURFACE'), Extension('VK_KHR_surface_protected_capabilities', 1, 'RADV_HAS_SURFACE'), Extension('VK_KHR_swapchain', 68, 'RADV_HAS_SURFACE'), + Extension('VK_KHR_swapchain_mutable_format', 1, 'RADV_HAS_SURFACE'), + Extension('VK_KHR_timeline_semaphore', 2, 'device->rad_info.has_syncobj_wait_for_submit'), Extension('VK_KHR_uniform_buffer_standard_layout', 1, True), Extension('VK_KHR_variable_pointers', 1, True), Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'), @@ -99,7 +108,7 @@ Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'), Extension('VK_KHR_multiview', 1, True), Extension('VK_KHR_display', 23, 'VK_USE_PLATFORM_DISPLAY_KHR'), - Extension('VK_KHR_8bit_storage', 1, 'device->rad_info.chip_class >= GFX8'), + Extension('VK_KHR_8bit_storage', 1, '!device->use_aco'), Extension('VK_EXT_direct_mode_display', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_acquire_xlib_display', 1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'), Extension('VK_EXT_buffer_device_address', 1, True), @@ -125,28 +134,39 @@ Extension('VK_EXT_pipeline_creation_feedback', 1, True), Extension('VK_EXT_post_depth_coverage', 1, 'device->rad_info.chip_class >= GFX10'), Extension('VK_EXT_queue_family_foreign', 1, True), - Extension('VK_EXT_sample_locations', 1, True), + # Disable sample locations on GFX10 until the CTS failures have been resolved. + Extension('VK_EXT_sample_locations', 1, 'device->rad_info.chip_class < GFX10'), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= GFX7'), Extension('VK_EXT_scalar_block_layout', 1, 'device->rad_info.chip_class >= GFX7'), + Extension('VK_EXT_shader_demote_to_helper_invocation',1, 'device->use_aco'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, True), Extension('VK_EXT_shader_subgroup_ballot', 1, True), Extension('VK_EXT_shader_subgroup_vote', 1, True), + Extension('VK_EXT_subgroup_size_control', 2, '!device->use_aco'), + Extension('VK_EXT_texel_buffer_alignment', 1, True), Extension('VK_EXT_transform_feedback', 1, True), Extension('VK_EXT_vertex_attribute_divisor', 3, True), Extension('VK_EXT_ycbcr_image_arrays', 1, True), Extension('VK_AMD_buffer_marker', 1, True), + Extension('VK_AMD_device_coherent_memory', 1, True), Extension('VK_AMD_draw_indirect_count', 1, True), Extension('VK_AMD_gcn_shader', 1, True), - Extension('VK_AMD_gpu_shader_half_float', 1, 'device->rad_info.chip_class >= GFX9 && HAVE_LLVM >= 0x0800'), - Extension('VK_AMD_gpu_shader_int16', 1, 'device->rad_info.chip_class >= GFX9'), - Extension('VK_AMD_rasterization_order', 1, 'device->has_out_of_order_rast'), + Extension('VK_AMD_gpu_shader_half_float', 1, '!device->use_aco && device->rad_info.chip_class >= GFX9'), + Extension('VK_AMD_gpu_shader_int16', 1, '!device->use_aco && device->rad_info.chip_class >= GFX9'), + Extension('VK_AMD_mixed_attachment_samples', 1, 'device->rad_info.chip_class >= GFX8'), + Extension('VK_AMD_rasterization_order', 1, 'device->rad_info.has_out_of_order_rast'), Extension('VK_AMD_shader_ballot', 1, 'device->use_shader_ballot'), Extension('VK_AMD_shader_core_properties', 1, True), + Extension('VK_AMD_shader_core_properties2', 1, True), + Extension('VK_AMD_shader_explicit_vertex_parameter', 1, True), + Extension('VK_AMD_shader_image_load_store_lod', 1, True), + Extension('VK_AMD_shader_fragment_mask', 1, True), Extension('VK_AMD_shader_info', 1, True), Extension('VK_AMD_shader_trinary_minmax', 1, True), Extension('VK_GOOGLE_decorate_string', 1, True), Extension('VK_GOOGLE_hlsl_functionality1', 1, True), + Extension('VK_GOOGLE_user_type', 1, True), Extension('VK_NV_compute_shader_derivatives', 1, 'device->rad_info.chip_class >= GFX8'), ] @@ -349,8 +369,13 @@ { uint32_t override = vk_get_version_override(); uint32_t version = VK_MAKE_VERSION(1, 0, 68); - if (dev->rad_info.has_syncobj_wait_for_submit) - version = ${MAX_API_VERSION.c_vk_version()}; + if (dev->rad_info.has_syncobj_wait_for_submit) { + if (ANDROID) { + version = VK_MAKE_VERSION(1, 1, 107); + } else { + version = ${MAX_API_VERSION.c_vk_version()}; + } + } return override ? MIN2(override, version) : version; } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_formats.c mesa-20.0.8/src/amd/vulkan/radv_formats.c --- mesa-19.2.8/src/amd/vulkan/radv_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_formats.c 2020-06-12 01:21:16.000000000 +0000 @@ -595,7 +595,7 @@ /* From the Vulkan spec 1.1.71: * * "The following formats must support the - * VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT feature with + * VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT feature with * VK_IMAGE_TILING_OPTIMAL, if they support * VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT." */ @@ -694,7 +694,7 @@ VK_FORMAT_FEATURE_TRANSFER_DST_BIT; if (radv_is_filter_minmax_format_supported(format)) - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; + tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT; /* Don't support blitting surfaces with depth/stencil. */ if (vk_format_is_depth(format) && vk_format_is_stencil(format)) @@ -712,7 +712,7 @@ VK_FORMAT_FEATURE_BLIT_SRC_BIT; if (radv_is_filter_minmax_format_supported(format)) - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; + tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT; if (linear_sampling) { linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; @@ -1104,6 +1104,7 @@ static VkResult radv_get_image_format_properties(struct radv_physical_device *physical_device, const VkPhysicalDeviceImageFormatInfo2 *info, + VkFormat format, VkImageFormatProperties *pImageFormatProperties) { @@ -1113,10 +1114,10 @@ uint32_t maxMipLevels; uint32_t maxArraySize; VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT; - const struct vk_format_description *desc = vk_format_description(info->format); + const struct vk_format_description *desc = vk_format_description(format); enum chip_class chip_class = physical_device->rad_info.chip_class; - radv_physical_device_get_format_properties(physical_device, info->format, + radv_physical_device_get_format_properties(physical_device, format, &format_props); if (info->tiling == VK_IMAGE_TILING_LINEAR) { format_feature_flags = format_props.linearTilingFeatures; @@ -1129,7 +1130,7 @@ if (format_feature_flags == 0) goto unsupported; - if (info->type != VK_IMAGE_TYPE_2D && vk_format_is_depth_or_stencil(info->format)) + if (info->type != VK_IMAGE_TYPE_2D && vk_format_is_depth_or_stencil(format)) goto unsupported; switch (info->type) { @@ -1179,9 +1180,9 @@ } if (info->tiling == VK_IMAGE_TILING_LINEAR && - (info->format == VK_FORMAT_R32G32B32_SFLOAT || - info->format == VK_FORMAT_R32G32B32_SINT || - info->format == VK_FORMAT_R32G32B32_UINT)) { + (format == VK_FORMAT_R32G32B32_SFLOAT || + format == VK_FORMAT_R32G32B32_SINT || + format == VK_FORMAT_R32G32B32_UINT)) { /* R32G32B32 is a weird format and the driver currently only * supports the barely minimum. * TODO: Implement more if we really need to. @@ -1196,8 +1197,8 @@ /* We can't create 3d compressed 128bpp images that can be rendered to on GFX9 */ if (physical_device->rad_info.chip_class >= GFX9 && info->type == VK_IMAGE_TYPE_3D && - vk_format_get_blocksizebits(info->format) == 128 && - vk_format_is_compressed(info->format) && + vk_format_get_blocksizebits(format) == 128 && + vk_format_is_compressed(format) && (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) && ((info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) || (info->usage & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT))) { @@ -1247,6 +1248,12 @@ } } + /* Sparse resources with multi-planar formats are unsupported. */ + if (info->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) { + if (desc->plane_count > 1) + goto unsupported; + } + *pImageFormatProperties = (VkImageFormatProperties) { .maxExtent = maxExtent, .maxMipLevels = maxMipLevels, @@ -1293,14 +1300,16 @@ .flags = createFlags, }; - return radv_get_image_format_properties(physical_device, &info, + return radv_get_image_format_properties(physical_device, &info, format, pImageFormatProperties); } static void -get_external_image_format_properties(const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, +get_external_image_format_properties(struct radv_physical_device *physical_device, + const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, VkExternalMemoryHandleTypeFlagBits handleType, - VkExternalMemoryProperties *external_properties) + VkExternalMemoryProperties *external_properties, + VkImageFormatProperties *format_properties) { VkExternalMemoryFeatureFlagBits flags = 0; VkExternalMemoryHandleTypeFlags export_flags = 0; @@ -1322,6 +1331,24 @@ break; } break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (!physical_device->supported_extensions.ANDROID_external_memory_android_hardware_buffer) + break; + + if (!radv_android_gralloc_supports_format(pImageFormatInfo->format, + pImageFormatInfo->usage)) + break; + + if (pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + break; + + format_properties->maxMipLevels = MIN2(1, format_properties->maxMipLevels); + format_properties->maxArrayLayers = MIN2(1, format_properties->maxArrayLayers); + format_properties->sampleCounts &= VK_SAMPLE_COUNT_1_BIT; + + flags = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT|VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT|VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; + compat_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + break; case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: flags = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; compat_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; @@ -1345,10 +1372,12 @@ RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; VkExternalImageFormatProperties *external_props = NULL; + struct VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkResult result; + VkFormat format = radv_select_android_external_format(base_info->pNext, base_info->format); - result = radv_get_image_format_properties(physical_device, base_info, + result = radv_get_image_format_properties(physical_device, base_info, format, &base_props->imageFormatProperties); if (result != VK_SUCCESS) return result; @@ -1373,11 +1402,23 @@ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: ycbcr_props = (void *) s; break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *) s; + break; default: break; } } + bool ahb_supported = physical_device->supported_extensions.ANDROID_external_memory_android_hardware_buffer; + if (android_usage && ahb_supported) { +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + android_usage->androidHardwareBufferUsage = + radv_ahb_usage_from_vk_usage(base_info->flags, + base_info->usage); +#endif + } + /* From the Vulkan 1.0.97 spec: * * If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 will @@ -1385,8 +1426,9 @@ * present and VkExternalImageFormatProperties will be ignored. */ if (external_info && external_info->handleType != 0) { - get_external_image_format_properties(base_info, external_info->handleType, - &external_props->externalMemoryProperties); + get_external_image_format_properties(physical_device, base_info, external_info->handleType, + &external_props->externalMemoryProperties, + &base_props->imageFormatProperties); if (!external_props->externalMemoryProperties.externalMemoryFeatures) { /* From the Vulkan 1.0.97 spec: * @@ -1403,7 +1445,7 @@ } if (ycbcr_props) { - ycbcr_props->combinedImageSamplerDescriptorCount = vk_format_get_plane_count(base_info->format); + ycbcr_props->combinedImageSamplerDescriptorCount = vk_format_get_plane_count(format); } return VK_SUCCESS; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_image.c mesa-20.0.8/src/amd/vulkan/radv_image.c --- mesa-19.2.8/src/amd/vulkan/radv_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_image.c 2020-06-12 01:21:16.000000000 +0000 @@ -36,17 +36,16 @@ static unsigned radv_choose_tiling(struct radv_device *device, - const struct radv_image_create_info *create_info) + const VkImageCreateInfo *pCreateInfo, + VkFormat format) { - const VkImageCreateInfo *pCreateInfo = create_info->vk_info; - if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) { assert(pCreateInfo->samples <= 1); return RADEON_SURF_MODE_LINEAR_ALIGNED; } - if (!vk_format_is_compressed(pCreateInfo->format) && - !vk_format_is_depth_or_stencil(pCreateInfo->format) + if (!vk_format_is_compressed(format) && + !vk_format_is_depth_or_stencil(format) && device->physical_device->rad_info.chip_class <= GFX8) { /* this causes hangs in some VK CTS tests on GFX9. */ /* Textures with a very small height are recommended to be linear. */ @@ -66,14 +65,14 @@ static bool radv_use_tc_compat_htile_for_image(struct radv_device *device, - const VkImageCreateInfo *pCreateInfo) + const VkImageCreateInfo *pCreateInfo, + VkFormat format) { /* TC-compat HTILE is only available for GFX8+. */ if (device->physical_device->rad_info.chip_class < GFX8) return false; - if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT) || - (pCreateInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)) + if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT)) return false; if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) @@ -86,8 +85,8 @@ * tests - disable for now. On GFX10 D32_SFLOAT is affected as well. */ if (pCreateInfo->samples >= 2 && - (pCreateInfo->format == VK_FORMAT_D32_SFLOAT_S8_UINT || - (pCreateInfo->format == VK_FORMAT_D32_SFLOAT && + (format == VK_FORMAT_D32_SFLOAT_S8_UINT || + (format == VK_FORMAT_D32_SFLOAT && device->physical_device->rad_info.chip_class == GFX10))) return false; @@ -95,16 +94,16 @@ * supports 32-bit. Though, it's possible to enable TC-compat for * 16-bit depth surfaces if no Z planes are compressed. */ - if (pCreateInfo->format != VK_FORMAT_D32_SFLOAT_S8_UINT && - pCreateInfo->format != VK_FORMAT_D32_SFLOAT && - pCreateInfo->format != VK_FORMAT_D16_UNORM) + if (format != VK_FORMAT_D32_SFLOAT_S8_UINT && + format != VK_FORMAT_D32_SFLOAT && + format != VK_FORMAT_D16_UNORM) return false; if (pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) { - const struct VkImageFormatListCreateInfoKHR *format_list = - (const struct VkImageFormatListCreateInfoKHR *) + const struct VkImageFormatListCreateInfo *format_list = + (const struct VkImageFormatListCreateInfo *) vk_find_struct_const(pCreateInfo->pNext, - IMAGE_FORMAT_LIST_CREATE_INFO_KHR); + IMAGE_FORMAT_LIST_CREATE_INFO); /* We have to ignore the existence of the list if viewFormatCount = 0 */ if (format_list && format_list->viewFormatCount) { @@ -115,7 +114,7 @@ if (format_list->pViewFormats[i] == VK_FORMAT_UNDEFINED) continue; - if (pCreateInfo->format != format_list->pViewFormats[i]) + if (format != format_list->pViewFormats[i]) return false; } } else { @@ -129,24 +128,21 @@ static bool radv_surface_has_scanout(struct radv_device *device, const struct radv_image_create_info *info) { - if (info->scanout) - return true; - - if (!info->bo_metadata) - return false; - - if (device->physical_device->rad_info.chip_class >= GFX9) { - return info->bo_metadata->u.gfx9.swizzle_mode == 0 || info->bo_metadata->u.gfx9.swizzle_mode % 4 == 2; - } else { - return info->bo_metadata->u.legacy.scanout; + if (info->bo_metadata) { + if (device->physical_device->rad_info.chip_class >= GFX9) + return info->bo_metadata->u.gfx9.scanout; + else + return info->bo_metadata->u.legacy.scanout; } + + return info->scanout; } static bool radv_use_dcc_for_image(struct radv_device *device, const struct radv_image *image, - const struct radv_image_create_info *create_info, - const VkImageCreateInfo *pCreateInfo) + const VkImageCreateInfo *pCreateInfo, + VkFormat format) { bool dcc_compatible_formats; bool blendable; @@ -162,15 +158,14 @@ return false; /* TODO: Enable DCC for storage images. */ - if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT) || - (pCreateInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)) + if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT)) return false; if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR) return false; - if (vk_format_is_subsampled(pCreateInfo->format) || - vk_format_get_plane_count(pCreateInfo->format) > 1) + if (vk_format_is_subsampled(format) || + vk_format_get_plane_count(format) > 1) return false; /* TODO: Enable DCC for mipmaps on GFX9+. */ @@ -182,9 +177,6 @@ if (pCreateInfo->arrayLayers > 1 && pCreateInfo->mipLevels > 1) return false; - if (radv_surface_has_scanout(device, create_info)) - return false; - /* FIXME: DCC for MSAA with 4x and 8x samples doesn't work yet, while * 2x can be enabled with an option. */ @@ -195,14 +187,14 @@ /* Determine if the formats are DCC compatible. */ dcc_compatible_formats = - radv_is_colorbuffer_format_supported(pCreateInfo->format, + radv_is_colorbuffer_format_supported(format, &blendable); if (pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) { - const struct VkImageFormatListCreateInfoKHR *format_list = - (const struct VkImageFormatListCreateInfoKHR *) + const struct VkImageFormatListCreateInfo *format_list = + (const struct VkImageFormatListCreateInfo *) vk_find_struct_const(pCreateInfo->pNext, - IMAGE_FORMAT_LIST_CREATE_INFO_KHR); + IMAGE_FORMAT_LIST_CREATE_INFO); /* We have to ignore the existence of the list if viewFormatCount = 0 */ if (format_list && format_list->viewFormatCount) { @@ -212,7 +204,7 @@ if (format_list->pViewFormats[i] == VK_FORMAT_UNDEFINED) continue; - if (!radv_dcc_formats_compatible(pCreateInfo->format, + if (!radv_dcc_formats_compatible(format, format_list->pViewFormats[i])) dcc_compatible_formats = false; } @@ -250,12 +242,32 @@ return true; } +static uint32_t si_get_bo_metadata_word1(const struct radv_device *device) +{ + return (ATI_VENDOR_ID << 16) | device->physical_device->rad_info.pci_id; +} + +static bool +radv_is_valid_opaque_metadata(const struct radv_device *device, + const struct radeon_bo_metadata *md) +{ + if (md->metadata[0] != 1 || + md->metadata[1] != si_get_bo_metadata_word1(device)) + return false; + + if (md->size_metadata < 40) + return false; + + return true; +} + static void -radv_prefill_surface_from_metadata(struct radv_device *device, - struct radeon_surf *surface, - const struct radv_image_create_info *create_info) +radv_patch_surface_from_metadata(struct radv_device *device, + struct radeon_surf *surface, + const struct radeon_bo_metadata *md) { - const struct radeon_bo_metadata *md = create_info->bo_metadata; + surface->flags = RADEON_SURF_CLR(surface->flags, MODE); + if (device->physical_device->rad_info.chip_class >= GFX9) { if (md->u.gfx9.swizzle_mode > 0) surface->flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE); @@ -281,16 +293,105 @@ } } +static VkResult +radv_patch_image_dimensions(struct radv_device *device, + struct radv_image *image, + const struct radv_image_create_info *create_info, + struct ac_surf_info *image_info) +{ + unsigned width = image->info.width; + unsigned height = image->info.height; + + /* + * minigbm sometimes allocates bigger images which is going to result in + * weird strides and other properties. Lets be lenient where possible and + * fail it on GFX10 (as we cannot cope there). + * + * Example hack: https://chromium-review.googlesource.com/c/chromiumos/platform/minigbm/+/1457777/ + */ + if (create_info->bo_metadata && + radv_is_valid_opaque_metadata(device, create_info->bo_metadata)) { + const struct radeon_bo_metadata *md = create_info->bo_metadata; + + if (device->physical_device->rad_info.chip_class >= GFX10) { + width = G_00A004_WIDTH_LO(md->metadata[3]) + + (G_00A008_WIDTH_HI(md->metadata[4]) << 2) + 1; + height = S_00A008_HEIGHT(md->metadata[4]) + 1; + } else { + width = G_008F18_WIDTH(md->metadata[4]) + 1; + height = G_008F18_HEIGHT(md->metadata[4]) + 1; + } + } + + if (image->info.width == width && image->info.height == height) + return VK_SUCCESS; + + if (width < image->info.width || height < image->info.height) { + fprintf(stderr, + "The imported image has smaller dimensions than the internal\n" + "dimensions. Using it is going to fail badly, so we reject\n" + "this import.\n" + "(internal dimensions: %d x %d, external dimensions: %d x %d)\n", + image->info.width, image->info.height, width, height); + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } else if (device->physical_device->rad_info.chip_class >= GFX10) { + fprintf(stderr, + "Tried to import an image with inconsistent width on GFX10.\n" + "As GFX10 has no separate stride fields we cannot cope with\n" + "an inconsistency in width and will fail this import.\n" + "(internal dimensions: %d x %d, external dimensions: %d x %d)\n", + image->info.width, image->info.height, width, height); + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } else { + fprintf(stderr, + "Tried to import an image with inconsistent width on pre-GFX10.\n" + "As GFX10 has no separate stride fields we cannot cope with\n" + "an inconsistency and would fail on GFX10.\n" + "(internal dimensions: %d x %d, external dimensions: %d x %d)\n", + image->info.width, image->info.height, width, height); + } + image_info->width = width; + image_info->height = height; + + return VK_SUCCESS; +} + +static VkResult +radv_patch_image_from_extra_info(struct radv_device *device, + struct radv_image *image, + const struct radv_image_create_info *create_info, + struct ac_surf_info *image_info) +{ + VkResult result = radv_patch_image_dimensions(device, image, create_info, image_info); + if (result != VK_SUCCESS) + return result; + + for (unsigned plane = 0; plane < image->plane_count; ++plane) { + if (create_info->bo_metadata) { + radv_patch_surface_from_metadata(device, &image->planes[plane].surface, + create_info->bo_metadata); + } + + if (radv_surface_has_scanout(device, create_info)) { + image->planes[plane].surface.flags |= RADEON_SURF_SCANOUT; + image->planes[plane].surface.flags |= RADEON_SURF_DISABLE_DCC; + + image->info.surf_index = NULL; + } + } + return VK_SUCCESS; +} + static int radv_init_surface(struct radv_device *device, const struct radv_image *image, struct radeon_surf *surface, unsigned plane_id, - const struct radv_image_create_info *create_info) + const VkImageCreateInfo *pCreateInfo, + VkFormat image_format) { - const VkImageCreateInfo *pCreateInfo = create_info->vk_info; - unsigned array_mode = radv_choose_tiling(device, create_info); - VkFormat format = vk_format_get_plane_format(pCreateInfo->format, plane_id); + unsigned array_mode = radv_choose_tiling(device, pCreateInfo, image_format); + VkFormat format = vk_format_get_plane_format(image_format, plane_id); const struct vk_format_description *desc = vk_format_description(format); bool is_depth, is_stencil; @@ -305,11 +406,8 @@ if (surface->bpe == 3) { surface->bpe = 4; } - if (create_info->bo_metadata) { - radv_prefill_surface_from_metadata(device, surface, create_info); - } else { - surface->flags = RADEON_SURF_SET(array_mode, MODE); - } + + surface->flags = RADEON_SURF_SET(array_mode, MODE); switch (pCreateInfo->imageType){ case VK_IMAGE_TYPE_1D: @@ -333,7 +431,7 @@ if (is_depth) { surface->flags |= RADEON_SURF_ZBUFFER; - if (radv_use_tc_compat_htile_for_image(device, pCreateInfo)) + if (radv_use_tc_compat_htile_for_image(device, pCreateInfo, image_format)) surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; } @@ -342,26 +440,18 @@ if (device->physical_device->rad_info.chip_class >= GFX9 && pCreateInfo->imageType == VK_IMAGE_TYPE_3D && - vk_format_get_blocksizebits(pCreateInfo->format) == 128 && - vk_format_is_compressed(pCreateInfo->format)) + vk_format_get_blocksizebits(image_format) == 128 && + vk_format_is_compressed(image_format)) surface->flags |= RADEON_SURF_NO_RENDER_TARGET; surface->flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; - if (!radv_use_dcc_for_image(device, image, create_info, pCreateInfo)) + if (!radv_use_dcc_for_image(device, image, pCreateInfo, image_format)) surface->flags |= RADEON_SURF_DISABLE_DCC; - if (radv_surface_has_scanout(device, create_info)) - surface->flags |= RADEON_SURF_SCANOUT; - return 0; } -static uint32_t si_get_bo_metadata_word1(struct radv_device *device) -{ - return (ATI_VENDOR_ID << 16) | device->physical_device->rad_info.pci_id; -} - static inline unsigned si_tile_mode_index(const struct radv_image_plane *plane, unsigned level, bool stencil) { @@ -433,7 +523,7 @@ * else: swizzle_address >= NUM_RECORDS */ state[3] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_OOB_SELECT(0) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | S_008F0C_RESOURCE_LEVEL(1); } else { num_format = radv_translate_buffer_numformat(desc, first_non_void); @@ -1057,6 +1147,7 @@ if (device->physical_device->rad_info.chip_class >= GFX9) { metadata->u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; + metadata->u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; } else { metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; @@ -1157,15 +1248,15 @@ /* + 8 for storing the clear values */ image->clear_value_offset = image->htile_offset + image->planes[0].surface.htile_size; - image->size = image->clear_value_offset + 8; + image->size = image->clear_value_offset + image->info.levels * 8; if (radv_image_is_tc_compat_htile(image) && - device->physical_device->has_tc_compat_zrange_bug) { + device->physical_device->rad_info.has_tc_compat_zrange_bug) { /* Metadata for the TC-compatible HTILE hardware bug which * have to be fixed by updating ZRANGE_PRECISION when doing * fast depth clears to 0.0f. */ image->tc_compat_zrange_offset = image->size; - image->size = image->tc_compat_zrange_offset + 4; + image->size = image->tc_compat_zrange_offset + image->info.levels * 4; } image->alignment = align64(image->alignment, image->planes[0].surface.htile_alignment); } @@ -1256,72 +1347,29 @@ } VkResult -radv_image_create(VkDevice _device, - const struct radv_image_create_info *create_info, - const VkAllocationCallbacks* alloc, - VkImage *pImage) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - const VkImageCreateInfo *pCreateInfo = create_info->vk_info; - struct radv_image *image = NULL; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); - - const unsigned plane_count = vk_format_get_plane_count(pCreateInfo->format); - const size_t image_struct_size = sizeof(*image) + sizeof(struct radv_image_plane) * plane_count; +radv_image_create_layout(struct radv_device *device, + struct radv_image_create_info create_info, + struct radv_image *image) +{ + /* Check that we did not initialize things earlier */ + assert(!image->planes[0].surface.surf_size); + + /* Clear the pCreateInfo pointer so we catch issues in the delayed case when we test in the + * common internal case. */ + create_info.vk_info = NULL; + + struct ac_surf_info image_info = image->info; + VkResult result = radv_patch_image_from_extra_info(device, image, &create_info, &image_info); + if (result != VK_SUCCESS) + return result; - radv_assert(pCreateInfo->mipLevels > 0); - radv_assert(pCreateInfo->arrayLayers > 0); - radv_assert(pCreateInfo->samples > 0); - radv_assert(pCreateInfo->extent.width > 0); - radv_assert(pCreateInfo->extent.height > 0); - radv_assert(pCreateInfo->extent.depth > 0); - - image = vk_zalloc2(&device->alloc, alloc, image_struct_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!image) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - image->type = pCreateInfo->imageType; - image->info.width = pCreateInfo->extent.width; - image->info.height = pCreateInfo->extent.height; - image->info.depth = pCreateInfo->extent.depth; - image->info.samples = pCreateInfo->samples; - image->info.storage_samples = pCreateInfo->samples; - image->info.array_size = pCreateInfo->arrayLayers; - image->info.levels = pCreateInfo->mipLevels; - image->info.num_channels = vk_format_get_nr_components(pCreateInfo->format); - - image->vk_format = pCreateInfo->format; - image->tiling = pCreateInfo->tiling; - image->usage = pCreateInfo->usage; - image->flags = pCreateInfo->flags; - - image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE; - if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) { - for (uint32_t i = 0; i < pCreateInfo->queueFamilyIndexCount; ++i) - if (pCreateInfo->pQueueFamilyIndices[i] == VK_QUEUE_FAMILY_EXTERNAL || - pCreateInfo->pQueueFamilyIndices[i] == VK_QUEUE_FAMILY_FOREIGN_EXT) - image->queue_family_mask |= (1u << RADV_MAX_QUEUE_FAMILIES) - 1u; - else - image->queue_family_mask |= 1u << pCreateInfo->pQueueFamilyIndices[i]; - } - - image->shareable = vk_find_struct_const(pCreateInfo->pNext, - EXTERNAL_MEMORY_IMAGE_CREATE_INFO) != NULL; - if (!vk_format_is_depth_or_stencil(pCreateInfo->format) && - !radv_surface_has_scanout(device, create_info) && !image->shareable) { - image->info.surf_index = &device->image_mrt_offset_counter; - } - - image->plane_count = plane_count; image->size = 0; image->alignment = 1; - for (unsigned plane = 0; plane < plane_count; ++plane) { - struct ac_surf_info info = image->info; - radv_init_surface(device, image, &image->planes[plane].surface, plane, create_info); + for (unsigned plane = 0; plane < image->plane_count; ++plane) { + struct ac_surf_info info = image_info; if (plane) { - const struct vk_format_description *desc = vk_format_description(pCreateInfo->format); + const struct vk_format_description *desc = vk_format_description(image->vk_format); assert(info.width % desc->width_divisor == 0); assert(info.height % desc->height_divisor == 0); @@ -1338,7 +1386,7 @@ image->planes[plane].format = vk_format_get_plane_format(image->vk_format, plane); } - if (!create_info->no_metadata_planes) { + if (!create_info.no_metadata_planes) { /* Try to enable DCC first. */ if (radv_image_can_enable_dcc(device, image)) { radv_image_alloc_dcc(image); @@ -1378,7 +1426,90 @@ radv_image_disable_htile(image); } - if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) { + assert(image->planes[0].surface.surf_size); + return VK_SUCCESS; +} + +VkResult +radv_image_create(VkDevice _device, + const struct radv_image_create_info *create_info, + const VkAllocationCallbacks* alloc, + VkImage *pImage) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + struct radv_image *image = NULL; + VkFormat format = radv_select_android_external_format(pCreateInfo->pNext, + pCreateInfo->format); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); + + const unsigned plane_count = vk_format_get_plane_count(format); + const size_t image_struct_size = sizeof(*image) + sizeof(struct radv_image_plane) * plane_count; + + radv_assert(pCreateInfo->mipLevels > 0); + radv_assert(pCreateInfo->arrayLayers > 0); + radv_assert(pCreateInfo->samples > 0); + radv_assert(pCreateInfo->extent.width > 0); + radv_assert(pCreateInfo->extent.height > 0); + radv_assert(pCreateInfo->extent.depth > 0); + + image = vk_zalloc2(&device->alloc, alloc, image_struct_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + image->type = pCreateInfo->imageType; + image->info.width = pCreateInfo->extent.width; + image->info.height = pCreateInfo->extent.height; + image->info.depth = pCreateInfo->extent.depth; + image->info.samples = pCreateInfo->samples; + image->info.storage_samples = pCreateInfo->samples; + image->info.array_size = pCreateInfo->arrayLayers; + image->info.levels = pCreateInfo->mipLevels; + image->info.num_channels = vk_format_get_nr_components(format); + + image->vk_format = format; + image->tiling = pCreateInfo->tiling; + image->usage = pCreateInfo->usage; + image->flags = pCreateInfo->flags; + image->plane_count = plane_count; + + image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE; + if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) { + for (uint32_t i = 0; i < pCreateInfo->queueFamilyIndexCount; ++i) + if (pCreateInfo->pQueueFamilyIndices[i] == VK_QUEUE_FAMILY_EXTERNAL || + pCreateInfo->pQueueFamilyIndices[i] == VK_QUEUE_FAMILY_FOREIGN_EXT) + image->queue_family_mask |= (1u << RADV_MAX_QUEUE_FAMILIES) - 1u; + else + image->queue_family_mask |= 1u << pCreateInfo->pQueueFamilyIndices[i]; + } + + const VkExternalMemoryImageCreateInfo *external_info = + vk_find_struct_const(pCreateInfo->pNext, + EXTERNAL_MEMORY_IMAGE_CREATE_INFO) ; + + image->shareable = external_info; + if (!vk_format_is_depth_or_stencil(format) && !image->shareable) { + image->info.surf_index = &device->image_mrt_offset_counter; + } + + for (unsigned plane = 0; plane < image->plane_count; ++plane) { + radv_init_surface(device, image, &image->planes[plane].surface, plane, pCreateInfo, format); + } + + bool delay_layout = external_info && + (external_info->handleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID); + + if (delay_layout) { + *pImage = radv_image_to_handle(image); + assert (!(image->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT)); + return VK_SUCCESS; + } + + ASSERTED VkResult result = radv_image_create_layout(device, *create_info, image); + assert(result == VK_SUCCESS); + + if (image->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) { image->alignment = MAX2(image->alignment, 4096); image->size = align64(image->size, image->alignment); image->offset = 0; @@ -1515,8 +1646,14 @@ iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask); iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; iview->multiple_planes = vk_format_get_plane_count(image->vk_format) > 1 && iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT; + iview->vk_format = pCreateInfo->format; + /* If the image has an Android external format, pCreateInfo->format will be + * VK_FORMAT_UNDEFINED. */ + if (iview->vk_format == VK_FORMAT_UNDEFINED) + iview->vk_format = image->vk_format; + if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) { iview->vk_format = vk_format_stencil_only(iview->vk_format); } else if (iview->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) { @@ -1616,6 +1753,8 @@ return radv_image_has_htile(image) && (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL_KHR || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR || (layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && queue_mask == (1u << RADV_QUEUE_GENERAL))); } @@ -1630,6 +1769,8 @@ return radv_image_has_htile(image) && (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL_KHR || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR || (layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && queue_mask == (1u << RADV_QUEUE_GENERAL))); } @@ -1733,7 +1874,9 @@ struct radeon_surf *surface = &plane->surface; if (device->physical_device->rad_info.chip_class >= GFX9) { - pLayout->offset = plane->offset + surface->u.gfx9.offset[level] + surface->u.gfx9.surf_slice_size * layer; + uint64_t level_offset = surface->is_linear ? surface->u.gfx9.offset[level] : 0; + + pLayout->offset = plane->offset + level_offset + surface->u.gfx9.surf_slice_size * layer; if (image->vk_format == VK_FORMAT_R32G32B32_UINT || image->vk_format == VK_FORMAT_R32G32B32_SINT || image->vk_format == VK_FORMAT_R32G32B32_SFLOAT) { @@ -1743,8 +1886,10 @@ */ pLayout->rowPitch = surface->u.gfx9.surf_pitch * surface->bpe / 3; } else { + uint32_t pitch = surface->is_linear ? surface->u.gfx9.pitch[level] : surface->u.gfx9.surf_pitch; + assert(util_is_power_of_two_nonzero(surface->bpe)); - pLayout->rowPitch = surface->u.gfx9.surf_pitch * surface->bpe; + pLayout->rowPitch = pitch * surface->bpe; } pLayout->arrayPitch = surface->u.gfx9.surf_slice_size; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_blit2d.c mesa-20.0.8/src/amd/vulkan/radv_meta_blit2d.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_blit2d.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_blit2d.c 2020-06-12 01:21:16.000000000 +0000 @@ -817,7 +817,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, &device->meta_state.blit2d_render_passes[fs_key][dst_layout]); } } @@ -988,7 +1008,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, &device->meta_state.blit2d_depth_only_rp[ds_layout]); } } @@ -1158,7 +1198,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, &device->meta_state.blit2d_stencil_only_rp[ds_layout]); } } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_blit.c mesa-20.0.8/src/amd/vulkan/radv_meta_blit.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_blit.c 2020-06-12 01:21:16.000000000 +0000 @@ -959,7 +959,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, &device->meta_state.blit.render_pass[key][j]); if (result != VK_SUCCESS) goto fail; @@ -1019,7 +1039,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, &device->meta_state.blit.depth_only_rp[ds_layout]); if (result != VK_SUCCESS) goto fail; @@ -1076,7 +1116,28 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, + }, &device->meta_state.alloc, &device->meta_state.blit.stencil_only_rp[ds_layout]); } if (result != VK_SUCCESS) diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_buffer.c mesa-20.0.8/src/amd/vulkan/radv_meta_buffer.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -52,6 +52,7 @@ store->src[2] = nir_src_for_ssa(offset); nir_intrinsic_set_write_mask(store, 0xf); nir_intrinsic_set_access(store, ACCESS_NON_READABLE); + nir_intrinsic_set_align(store, 16, 0); store->num_components = 4; nir_builder_instr_insert(&b, &store->instr); @@ -104,6 +105,7 @@ load->src[1] = nir_src_for_ssa(offset); nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); load->num_components = 4; + nir_intrinsic_set_align(load, 16, 0); nir_builder_instr_insert(&b, &load->instr); nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); @@ -112,6 +114,7 @@ store->src[2] = nir_src_for_ssa(offset); nir_intrinsic_set_write_mask(store, 0xf); nir_intrinsic_set_access(store, ACCESS_NON_READABLE); + nir_intrinsic_set_align(store, 16, 0); store->num_components = 4; nir_builder_instr_insert(&b, &store->instr); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_bufimage.c mesa-20.0.8/src/amd/vulkan/radv_meta_bufimage.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_bufimage.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_bufimage.c 2020-06-12 01:21:16.000000000 +0000 @@ -121,6 +121,7 @@ store->src[1] = nir_src_for_ssa(coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; @@ -348,6 +349,7 @@ store->src[1] = nir_src_for_ssa(img_coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; @@ -591,6 +593,7 @@ store->src[1] = nir_src_for_ssa(coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(nir_channel(&b, outval, chan)); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); } @@ -772,6 +775,7 @@ store->src[1] = nir_src_for_ssa(dst_coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; @@ -1018,6 +1022,7 @@ store->src[1] = nir_src_for_ssa(dst_coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(nir_channel(&b, outval, 0)); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); } @@ -1179,6 +1184,7 @@ store->src[1] = nir_src_for_ssa(global_id); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(&clear_val->dest.ssa); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; @@ -1377,6 +1383,7 @@ store->src[1] = nir_src_for_ssa(coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(nir_channel(&b, &clear_val->dest.ssa, chan)); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta.c mesa-20.0.8/src/amd/vulkan/radv_meta.c --- mesa-19.2.8/src/amd/vulkan/radv_meta.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta.c 2020-06-12 01:21:16.000000000 +0000 @@ -262,7 +262,8 @@ strcpy(path, pwd.pw_dir); strcat(path, "/.cache"); - mkdir(path, 0755); + if (mkdir(path, 0755) && errno != EEXIST) + return false; ret = snprintf(path, PATH_MAX + 1, "%s%s%zd", pwd.pw_dir, suffix2, sizeof(void *) * 8); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_clear.c mesa-20.0.8/src/amd/vulkan/radv_meta_clear.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_clear.c 2020-06-12 01:21:16.000000000 +0000 @@ -235,7 +235,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, pass); mtx_unlock(&device->meta_state.mtx); return result; @@ -344,6 +364,16 @@ radv_DestroyPipeline(radv_device_to_handle(device), state->clear[i].depthstencil_pipeline[j], &state->alloc); + + radv_DestroyPipeline(radv_device_to_handle(device), + state->clear[i].depth_only_unrestricted_pipeline[j], + &state->alloc); + radv_DestroyPipeline(radv_device_to_handle(device), + state->clear[i].stencil_only_unrestricted_pipeline[j], + &state->alloc); + radv_DestroyPipeline(radv_device_to_handle(device), + state->clear[i].depthstencil_unrestricted_pipeline[j], + &state->alloc); } radv_DestroyRenderPass(radv_device_to_handle(device), state->clear[i].depthstencil_rp, @@ -355,6 +385,9 @@ radv_DestroyPipelineLayout(radv_device_to_handle(device), state->clear_depth_p_layout, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->clear_depth_unrestricted_p_layout, + &state->alloc); finish_meta_clear_htile_mask_state(device); } @@ -470,15 +503,21 @@ static void -build_depthstencil_shader(struct nir_shader **out_vs, struct nir_shader **out_fs) +build_depthstencil_shader(struct nir_shader **out_vs, + struct nir_shader **out_fs, + bool unrestricted) { nir_builder vs_b, fs_b; nir_builder_init_simple_shader(&vs_b, NULL, MESA_SHADER_VERTEX, NULL); nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT, NULL); - vs_b.shader->info.name = ralloc_strdup(vs_b.shader, "meta_clear_depthstencil_vs"); - fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "meta_clear_depthstencil_fs"); + vs_b.shader->info.name = ralloc_strdup(vs_b.shader, + unrestricted ? "meta_clear_depthstencil_unrestricted_vs" + : "meta_clear_depthstencil_vs"); + fs_b.shader->info.name = ralloc_strdup(fs_b.shader, + unrestricted ? "meta_clear_depthstencil_unrestricted_fs" + : "meta_clear_depthstencil_fs"); const struct glsl_type *position_out_type = glsl_vec4_type(); nir_variable *vs_out_pos = @@ -486,15 +525,36 @@ "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_intrinsic_instr *in_color_load = nir_intrinsic_instr_create(vs_b.shader, nir_intrinsic_load_push_constant); - nir_intrinsic_set_base(in_color_load, 0); - nir_intrinsic_set_range(in_color_load, 4); - in_color_load->src[0] = nir_src_for_ssa(nir_imm_int(&vs_b, 0)); - in_color_load->num_components = 1; - nir_ssa_dest_init(&in_color_load->instr, &in_color_load->dest, 1, 32, "depth value"); - nir_builder_instr_insert(&vs_b, &in_color_load->instr); + nir_ssa_def *z; + if (unrestricted) { + nir_intrinsic_instr *in_color_load = nir_intrinsic_instr_create(fs_b.shader, nir_intrinsic_load_push_constant); + nir_intrinsic_set_base(in_color_load, 0); + nir_intrinsic_set_range(in_color_load, 4); + in_color_load->src[0] = nir_src_for_ssa(nir_imm_int(&fs_b, 0)); + in_color_load->num_components = 1; + nir_ssa_dest_init(&in_color_load->instr, &in_color_load->dest, 1, 32, "depth value"); + nir_builder_instr_insert(&fs_b, &in_color_load->instr); + + nir_variable *fs_out_depth = + nir_variable_create(fs_b.shader, nir_var_shader_out, + glsl_int_type(), "f_depth"); + fs_out_depth->data.location = FRAG_RESULT_DEPTH; + nir_store_var(&fs_b, fs_out_depth, &in_color_load->dest.ssa, 0x1); - nir_ssa_def *outvec = radv_meta_gen_rect_vertices_comp2(&vs_b, &in_color_load->dest.ssa); + z = nir_imm_float(&vs_b, 0.0); + } else { + nir_intrinsic_instr *in_color_load = nir_intrinsic_instr_create(vs_b.shader, nir_intrinsic_load_push_constant); + nir_intrinsic_set_base(in_color_load, 0); + nir_intrinsic_set_range(in_color_load, 4); + in_color_load->src[0] = nir_src_for_ssa(nir_imm_int(&vs_b, 0)); + in_color_load->num_components = 1; + nir_ssa_dest_init(&in_color_load->instr, &in_color_load->dest, 1, 32, "depth value"); + nir_builder_instr_insert(&vs_b, &in_color_load->instr); + + z = &in_color_load->dest.ssa; + } + + nir_ssa_def *outvec = radv_meta_gen_rect_vertices_comp2(&vs_b, z); nir_store_var(&vs_b, vs_out_pos, outvec, 0xf); const struct glsl_type *layer_type = glsl_int_type(); @@ -550,7 +610,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + } }, &device->meta_state.alloc, render_pass); mtx_unlock(&device->meta_state.mtx); return result; @@ -561,6 +641,7 @@ VkImageAspectFlags aspects, uint32_t samples, int index, + bool unrestricted, VkPipeline *pipeline, VkRenderPass render_pass) { @@ -573,7 +654,7 @@ return VK_SUCCESS; } - build_depthstencil_shader(&vs_nir, &fs_nir); + build_depthstencil_shader(&vs_nir, &fs_nir, unrestricted); const VkPipelineVertexInputStateCreateInfo vi_state = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, @@ -671,6 +752,7 @@ { bool fast = depth_view_can_fast_clear(cmd_buffer, iview, aspects, layout, in_render_loop, clear_rect, clear_value); + bool unrestricted = cmd_buffer->device->enabled_extensions.EXT_depth_range_unrestricted; int index = DEPTH_CLEAR_SLOW; VkPipeline *pipeline; @@ -682,13 +764,19 @@ switch (aspects) { case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: - pipeline = &meta_state->clear[samples_log2].depthstencil_pipeline[index]; + pipeline = unrestricted ? + &meta_state->clear[samples_log2].depthstencil_unrestricted_pipeline[index] : + &meta_state->clear[samples_log2].depthstencil_pipeline[index]; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - pipeline = &meta_state->clear[samples_log2].depth_only_pipeline[index]; + pipeline = unrestricted ? + &meta_state->clear[samples_log2].depth_only_unrestricted_pipeline[index] : + &meta_state->clear[samples_log2].depth_only_pipeline[index]; break; case VK_IMAGE_ASPECT_STENCIL_BIT: - pipeline = &meta_state->clear[samples_log2].stencil_only_pipeline[index]; + pipeline = unrestricted ? + &meta_state->clear[samples_log2].stencil_only_unrestricted_pipeline[index] : + &meta_state->clear[samples_log2].stencil_only_pipeline[index]; break; default: unreachable("expected depth or stencil aspect"); @@ -704,7 +792,7 @@ } if (*pipeline == VK_NULL_HANDLE) { - VkResult ret = create_depthstencil_pipeline(cmd_buffer->device, aspects, 1u << samples_log2, index, + VkResult ret = create_depthstencil_pipeline(cmd_buffer->device, aspects, 1u << samples_log2, index, unrestricted, pipeline, cmd_buffer->device->meta_state.clear[samples_log2].depthstencil_rp); if (ret != VK_SUCCESS) { cmd_buffer->record_result = ret; @@ -749,10 +837,17 @@ if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) clear_value.depth = 1.0f; - radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), - device->meta_state.clear_depth_p_layout, - VK_SHADER_STAGE_VERTEX_BIT, 0, 4, - &clear_value.depth); + if (cmd_buffer->device->enabled_extensions.EXT_depth_range_unrestricted) { + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.clear_depth_unrestricted_p_layout, + VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4, + &clear_value.depth); + } else { + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.clear_depth_p_layout, + VK_SHADER_STAGE_VERTEX_BIT, 0, 4, + &clear_value.depth); + } uint32_t prev_reference = cmd_buffer->state.dynamic.stencil_reference.front; if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { @@ -786,7 +881,7 @@ if (depth_view_can_fast_clear(cmd_buffer, iview, aspects, ds_att->layout, ds_att->in_render_loop, clear_rect, clear_value)) - radv_update_ds_clear_metadata(cmd_buffer, iview->image, + radv_update_ds_clear_metadata(cmd_buffer, iview, clear_value, aspects); radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) { @@ -1014,12 +1109,6 @@ if (!view_mask && clear_rect->layerCount != iview->image->info.array_size) return false; - if (cmd_buffer->device->physical_device->rad_info.chip_class != GFX9 && - (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT) || - ((vk_format_aspects(iview->image->vk_format) & VK_IMAGE_ASPECT_STENCIL_BIT) && - !(aspects & VK_IMAGE_ASPECT_STENCIL_BIT)))) - return false; - if (((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && !radv_is_fast_clear_depth_allowed(clear_value)) || ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && @@ -1039,10 +1128,8 @@ VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; VkImageAspectFlags aspects = clear_att->aspectMask; uint32_t clear_word, flush_bits; - uint32_t htile_mask; clear_word = radv_get_htile_fast_clear_value(iview->image, clear_value); - htile_mask = radv_get_htile_mask(iview->image, aspects); if (pre_flush) { cmd_buffer->state.flush_bits |= (RADV_CMD_FLAG_FLUSH_AND_INV_DB | @@ -1050,22 +1137,27 @@ *pre_flush |= cmd_buffer->state.flush_bits; } - if (htile_mask == UINT_MAX) { - /* Clear the whole HTILE buffer. */ - flush_bits = radv_fill_buffer(cmd_buffer, iview->image->bo, - iview->image->offset + iview->image->htile_offset, - iview->image->planes[0].surface.htile_size, clear_word); - } else { - /* Only clear depth or stencil bytes in the HTILE buffer. */ - /* TODO: Implement that path for GFX10. */ - assert(cmd_buffer->device->physical_device->rad_info.chip_class == GFX9); - flush_bits = clear_htile_mask(cmd_buffer, iview->image->bo, - iview->image->offset + iview->image->htile_offset, - iview->image->planes[0].surface.htile_size, clear_word, - htile_mask); + struct VkImageSubresourceRange range = { + .aspectMask = aspects, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }; + + flush_bits = radv_clear_htile(cmd_buffer, iview->image, &range, clear_word); + + if (iview->image->planes[0].surface.has_stencil && + !(aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) { + /* Synchronize after performing a depth-only or a stencil-only + * fast clear because the driver uses an optimized path which + * performs a read-modify-write operation, and the two separate + * aspects might use the same HTILE memory. + */ + cmd_buffer->state.flush_bits |= flush_bits; } - radv_update_ds_clear_metadata(cmd_buffer, iview->image, clear_value, aspects); + radv_update_ds_clear_metadata(cmd_buffer, iview, clear_value, aspects); if (post_flush) { *post_flush |= flush_bits; } @@ -1121,6 +1213,7 @@ load->src[1] = nir_src_for_ssa(offset); nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); load->num_components = 4; + nir_intrinsic_set_align(load, 16, 0); nir_builder_instr_insert(&b, &load->instr); /* data = (data & ~htile_mask) | (htile_value & htile_mask) */ @@ -1136,6 +1229,7 @@ store->src[2] = nir_src_for_ssa(offset); nir_intrinsic_set_write_mask(store, 0xf); nir_intrinsic_set_access(store, ACCESS_NON_READABLE); + nir_intrinsic_set_align(store, 16, 0); store->num_components = 4; nir_builder_instr_insert(&b, &store->instr); @@ -1249,6 +1343,20 @@ if (res != VK_SUCCESS) goto fail; + VkPipelineLayoutCreateInfo pl_depth_unrestricted_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 0, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_FRAGMENT_BIT, 0, 4}, + }; + + res = radv_CreatePipelineLayout(radv_device_to_handle(device), + &pl_depth_unrestricted_create_info, + &device->meta_state.alloc, + &device->meta_state.clear_depth_unrestricted_p_layout); + if (res != VK_SUCCESS) + goto fail; + res = init_meta_clear_htile_mask_state(device); if (res != VK_SUCCESS) goto fail; @@ -1286,6 +1394,7 @@ VK_IMAGE_ASPECT_DEPTH_BIT, samples, j, + false, &state->clear[i].depth_only_pipeline[j], state->clear[i].depthstencil_rp); if (res != VK_SUCCESS) @@ -1295,6 +1404,7 @@ VK_IMAGE_ASPECT_STENCIL_BIT, samples, j, + false, &state->clear[i].stencil_only_pipeline[j], state->clear[i].depthstencil_rp); if (res != VK_SUCCESS) @@ -1305,10 +1415,42 @@ VK_IMAGE_ASPECT_STENCIL_BIT, samples, j, + false, &state->clear[i].depthstencil_pipeline[j], state->clear[i].depthstencil_rp); if (res != VK_SUCCESS) goto fail; + + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_DEPTH_BIT, + samples, + j, + true, + &state->clear[i].depth_only_unrestricted_pipeline[j], + state->clear[i].depthstencil_rp); + if (res != VK_SUCCESS) + goto fail; + + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_STENCIL_BIT, + samples, + j, + true, + &state->clear[i].stencil_only_unrestricted_pipeline[j], + state->clear[i].depthstencil_rp); + if (res != VK_SUCCESS) + goto fail; + + res = create_depthstencil_pipeline(device, + VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT, + samples, + j, + true, + &state->clear[i].depthstencil_unrestricted_pipeline[j], + state->clear[i].depthstencil_rp); + if (res != VK_SUCCESS) + goto fail; } } return VK_SUCCESS; @@ -1427,15 +1569,30 @@ } uint32_t -radv_clear_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - const VkImageSubresourceRange *range, uint32_t value) +radv_clear_htile(struct radv_cmd_buffer *cmd_buffer, + const struct radv_image *image, + const VkImageSubresourceRange *range, + uint32_t value) { unsigned layer_count = radv_get_layerCount(image, range); uint64_t size = image->planes[0].surface.htile_slice_size * layer_count; uint64_t offset = image->offset + image->htile_offset + image->planes[0].surface.htile_slice_size * range->baseArrayLayer; + uint32_t htile_mask, flush_bits; - return radv_fill_buffer(cmd_buffer, image->bo, offset, size, value); + htile_mask = radv_get_htile_mask(image, range->aspectMask); + + if (htile_mask == UINT_MAX) { + /* Clear the whole HTILE buffer. */ + flush_bits = radv_fill_buffer(cmd_buffer, image->bo, offset, + size, value); + } else { + /* Only clear depth or stencil bytes in the HTILE buffer. */ + flush_bits = clear_htile_mask(cmd_buffer, image->bo, offset, + size, value, htile_mask); + } + + return flush_bits; } enum { @@ -1971,7 +2128,28 @@ .pAttachments = &att_desc, .subpassCount = 1, .pSubpasses = &subpass_desc, - }, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + } + }, &cmd_buffer->pool->alloc, &pass); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_decompress.c mesa-20.0.8/src/amd/vulkan/radv_meta_decompress.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_decompress.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_decompress.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,17 @@ #include "radv_private.h" #include "sid.h" +enum radv_depth_op { + DEPTH_DECOMPRESS, + DEPTH_RESUMMARIZE, +}; + +enum radv_depth_decompress { + DECOMPRESS_DEPTH_STENCIL, + DECOMPRESS_DEPTH, + DECOMPRESS_STENCIL, +}; + static VkResult create_pass(struct radv_device *device, uint32_t samples, @@ -67,7 +78,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, alloc, pass); @@ -98,15 +129,16 @@ uint32_t samples, VkRenderPass pass, VkPipelineLayout layout, - VkPipeline *decompress_pipeline, - VkPipeline *resummarize_pipeline) + enum radv_depth_op op, + enum radv_depth_decompress decompress, + VkPipeline *pipeline) { VkResult result; VkDevice device_h = radv_device_to_handle(device); struct radv_shader_module vs_module = {0}; mtx_lock(&device->meta_state.mtx); - if (*decompress_pipeline) { + if (*pipeline) { mtx_unlock(&device->meta_state.mtx); return VK_SUCCESS; } @@ -207,34 +239,20 @@ .subpass = 0, }; - result = radv_graphics_pipeline_create(device_h, - radv_pipeline_cache_to_handle(&device->meta_state.cache), - &pipeline_create_info, - &(struct radv_graphics_pipeline_create_info) { - .use_rectlist = true, - .db_flush_depth_inplace = true, - .db_flush_stencil_inplace = true, - }, - &device->meta_state.alloc, - decompress_pipeline); - if (result != VK_SUCCESS) - goto cleanup; + struct radv_graphics_pipeline_create_info extra = { + .use_rectlist = true, + .db_flush_depth_inplace = decompress == DECOMPRESS_DEPTH_STENCIL || + decompress == DECOMPRESS_DEPTH, + .db_flush_stencil_inplace = decompress == DECOMPRESS_DEPTH_STENCIL || + decompress == DECOMPRESS_STENCIL, + .db_resummarize = op == DEPTH_RESUMMARIZE, + }; result = radv_graphics_pipeline_create(device_h, radv_pipeline_cache_to_handle(&device->meta_state.cache), - &pipeline_create_info, - &(struct radv_graphics_pipeline_create_info) { - .use_rectlist = true, - .db_flush_depth_inplace = true, - .db_flush_stencil_inplace = true, - .db_resummarize = true, - }, + &pipeline_create_info, &extra, &device->meta_state.alloc, - resummarize_pipeline); - if (result != VK_SUCCESS) - goto cleanup; - - goto cleanup; + pipeline); cleanup: ralloc_free(fs_module.nir); @@ -256,9 +274,12 @@ radv_DestroyPipelineLayout(radv_device_to_handle(device), state->depth_decomp[i].p_layout, &state->alloc); - radv_DestroyPipeline(radv_device_to_handle(device), - state->depth_decomp[i].decompress_pipeline, - &state->alloc); + + for (uint32_t j = 0; j < NUM_DEPTH_DECOMPRESS_PIPELINES; j++) { + radv_DestroyPipeline(radv_device_to_handle(device), + state->depth_decomp[i].decompress_pipeline[j], + &state->alloc); + } radv_DestroyPipeline(radv_device_to_handle(device), state->depth_decomp[i].resummarize_pipeline, &state->alloc); @@ -295,10 +316,22 @@ if (on_demand) continue; + for (uint32_t j = 0; j < NUM_DEPTH_DECOMPRESS_PIPELINES; j++) { + res = create_pipeline(device, vs_module_h, samples, + state->depth_decomp[i].pass, + state->depth_decomp[i].p_layout, + DEPTH_DECOMPRESS, + j, + &state->depth_decomp[i].decompress_pipeline[j]); + if (res != VK_SUCCESS) + goto fail; + } + res = create_pipeline(device, vs_module_h, samples, state->depth_decomp[i].pass, state->depth_decomp[i].p_layout, - &state->depth_decomp[i].decompress_pipeline, + DEPTH_RESUMMARIZE, + 0, /* unused */ &state->depth_decomp[i].resummarize_pipeline); if (res != VK_SUCCESS) goto fail; @@ -315,27 +348,47 @@ return res; } -enum radv_depth_op { - DEPTH_DECOMPRESS, - DEPTH_RESUMMARIZE, -}; - static VkPipeline * radv_get_depth_pipeline(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, enum radv_depth_op op) + struct radv_image *image, + const VkImageSubresourceRange *subresourceRange, + enum radv_depth_op op) { struct radv_meta_state *state = &cmd_buffer->device->meta_state; uint32_t samples = image->info.samples; uint32_t samples_log2 = ffs(samples) - 1; + enum radv_depth_decompress decompress; VkPipeline *pipeline; - if (!state->depth_decomp[samples_log2].decompress_pipeline) { + if (subresourceRange->aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT) { + decompress = DECOMPRESS_DEPTH; + } else if (subresourceRange->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + decompress = DECOMPRESS_STENCIL; + } else { + decompress = DECOMPRESS_DEPTH_STENCIL; + } + + if (!state->depth_decomp[samples_log2].decompress_pipeline[decompress]) { VkResult ret; + for (uint32_t i = 0; i < NUM_DEPTH_DECOMPRESS_PIPELINES; i++) { + ret = create_pipeline(cmd_buffer->device, VK_NULL_HANDLE, samples, + state->depth_decomp[samples_log2].pass, + state->depth_decomp[samples_log2].p_layout, + DEPTH_DECOMPRESS, + i, + &state->depth_decomp[samples_log2].decompress_pipeline[i]); + if (ret != VK_SUCCESS) { + cmd_buffer->record_result = ret; + return NULL; + } + } + ret = create_pipeline(cmd_buffer->device, VK_NULL_HANDLE, samples, state->depth_decomp[samples_log2].pass, state->depth_decomp[samples_log2].p_layout, - &state->depth_decomp[samples_log2].decompress_pipeline, + DEPTH_RESUMMARIZE, + 0, /* unused */ &state->depth_decomp[samples_log2].resummarize_pipeline); if (ret != VK_SUCCESS) { cmd_buffer->record_result = ret; @@ -345,7 +398,7 @@ switch (op) { case DEPTH_DECOMPRESS: - pipeline = &state->depth_decomp[samples_log2].decompress_pipeline; + pipeline = &state->depth_decomp[samples_log2].decompress_pipeline[decompress]; break; case DEPTH_RESUMMARIZE: pipeline = &state->depth_decomp[samples_log2].resummarize_pipeline; @@ -357,22 +410,85 @@ return pipeline; } +static void +radv_process_depth_image_layer(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + const VkImageSubresourceRange *range, + int level, int layer) +{ + struct radv_device *device = cmd_buffer->device; + struct radv_meta_state *state = &device->meta_state; + uint32_t samples_log2 = ffs(image->info.samples) - 1; + struct radv_image_view iview; + uint32_t width, height; + + width = radv_minify(image->info.width, range->baseMipLevel + level); + height = radv_minify(image->info.height, range->baseMipLevel + level); + + radv_image_view_init(&iview, device, + &(VkImageViewCreateInfo) { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .viewType = radv_meta_get_view_type(image), + .format = image->vk_format, + .subresourceRange = { + .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, + .baseMipLevel = range->baseMipLevel + level, + .levelCount = 1, + .baseArrayLayer = range->baseArrayLayer + layer, + .layerCount = 1, + }, + }, NULL); + + + VkFramebuffer fb_h; + radv_CreateFramebuffer(radv_device_to_handle(device), + &(VkFramebufferCreateInfo) { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkImageView[]) { + radv_image_view_to_handle(&iview) + }, + .width = width, + .height = height, + .layers = 1 + }, &cmd_buffer->pool->alloc, &fb_h); + + radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer), + &(VkRenderPassBeginInfo) { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = state->depth_decomp[samples_log2].pass, + .framebuffer = fb_h, + .renderArea = { + .offset = { + 0, + 0, + }, + .extent = { + width, + height, + } + }, + .clearValueCount = 0, + .pClearValues = NULL, + }, + VK_SUBPASS_CONTENTS_INLINE); + + radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0); + radv_CmdEndRenderPass(radv_cmd_buffer_to_handle(cmd_buffer)); + + radv_DestroyFramebuffer(radv_device_to_handle(device), fb_h, + &cmd_buffer->pool->alloc); +} + static void radv_process_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - VkImageSubresourceRange *subresourceRange, + const VkImageSubresourceRange *subresourceRange, struct radv_sample_locations_state *sample_locs, enum radv_depth_op op) { struct radv_meta_saved_state saved_state; - VkDevice device_h = radv_device_to_handle(cmd_buffer->device); VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); - uint32_t width = radv_minify(image->info.width, - subresourceRange->baseMipLevel); - uint32_t height = radv_minify(image->info.height, - subresourceRange->baseMipLevel); - uint32_t samples = image->info.samples; - uint32_t samples_log2 = ffs(samples) - 1; - struct radv_meta_state *meta_state = &cmd_buffer->device->meta_state; VkPipeline *pipeline; if (!radv_image_has_htile(image)) @@ -383,25 +499,12 @@ RADV_META_SAVE_SAMPLE_LOCATIONS | RADV_META_SAVE_PASS); - pipeline = radv_get_depth_pipeline(cmd_buffer, image, op); + pipeline = radv_get_depth_pipeline(cmd_buffer, image, + subresourceRange, op); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline); - radv_CmdSetViewport(cmd_buffer_h, 0, 1, &(VkViewport) { - .x = 0, - .y = 0, - .width = width, - .height = height, - .minDepth = 0.0f, - .maxDepth = 1.0f - }); - - radv_CmdSetScissor(cmd_buffer_h, 0, 1, &(VkRect2D) { - .offset = { 0, 0 }, - .extent = { width, height }, - }); - if (sample_locs) { assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT); @@ -417,72 +520,42 @@ }); } - for (uint32_t layer = 0; layer < radv_get_layerCount(image, subresourceRange); layer++) { - struct radv_image_view iview; - - radv_image_view_init(&iview, cmd_buffer->device, - &(VkImageViewCreateInfo) { - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .image = radv_image_to_handle(image), - .viewType = radv_meta_get_view_type(image), - .format = image->vk_format, - .subresourceRange = { - .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, - .baseMipLevel = subresourceRange->baseMipLevel, - .levelCount = 1, - .baseArrayLayer = subresourceRange->baseArrayLayer + layer, - .layerCount = 1, - }, - }, NULL); - - - VkFramebuffer fb_h; - radv_CreateFramebuffer(device_h, - &(VkFramebufferCreateInfo) { - .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, - .attachmentCount = 1, - .pAttachments = (VkImageView[]) { - radv_image_view_to_handle(&iview) - }, - .width = width, - .height = height, - .layers = 1 - }, - &cmd_buffer->pool->alloc, - &fb_h); - - radv_CmdBeginRenderPass(cmd_buffer_h, - &(VkRenderPassBeginInfo) { - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .renderPass = meta_state->depth_decomp[samples_log2].pass, - .framebuffer = fb_h, - .renderArea = { - .offset = { - 0, - 0, - }, - .extent = { - width, - height, - } - }, - .clearValueCount = 0, - .pClearValues = NULL, - }, - VK_SUBPASS_CONTENTS_INLINE); - - radv_CmdDraw(cmd_buffer_h, 3, 1, 0, 0); - radv_CmdEndRenderPass(cmd_buffer_h); - - radv_DestroyFramebuffer(device_h, fb_h, - &cmd_buffer->pool->alloc); + for (uint32_t l = 0; l < radv_get_levelCount(image, subresourceRange); ++l) { + uint32_t width = + radv_minify(image->info.width, + subresourceRange->baseMipLevel + l); + uint32_t height = + radv_minify(image->info.height, + subresourceRange->baseMipLevel + l); + + radv_CmdSetViewport(cmd_buffer_h, 0, 1, + &(VkViewport) { + .x = 0, + .y = 0, + .width = width, + .height = height, + .minDepth = 0.0f, + .maxDepth = 1.0f + }); + + radv_CmdSetScissor(cmd_buffer_h, 0, 1, + &(VkRect2D) { + .offset = { 0, 0 }, + .extent = { width, height }, + }); + + for (uint32_t s = 0; s < radv_get_layerCount(image, subresourceRange); s++) { + radv_process_depth_image_layer(cmd_buffer, image, + subresourceRange, l, s); + } } + radv_meta_restore(&saved_state, cmd_buffer); } void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - VkImageSubresourceRange *subresourceRange, + const VkImageSubresourceRange *subresourceRange, struct radv_sample_locations_state *sample_locs) { assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL); @@ -492,7 +565,7 @@ void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - VkImageSubresourceRange *subresourceRange, + const VkImageSubresourceRange *subresourceRange, struct radv_sample_locations_state *sample_locs) { assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_fast_clear.c mesa-20.0.8/src/amd/vulkan/radv_meta_fast_clear.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_fast_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_fast_clear.c 2020-06-12 01:21:16.000000000 +0000 @@ -87,7 +87,7 @@ nir_intrinsic_instr *membar = nir_intrinsic_instr_create(b.shader, nir_intrinsic_memory_barrier); nir_builder_instr_insert(&b, &membar->instr); - nir_intrinsic_instr *bar = nir_intrinsic_instr_create(b.shader, nir_intrinsic_barrier); + nir_intrinsic_instr *bar = nir_intrinsic_instr_create(b.shader, nir_intrinsic_control_barrier); nir_builder_instr_insert(&b, &bar->instr); nir_ssa_def *outval = &tex->dest.ssa; @@ -97,6 +97,7 @@ store->src[1] = nir_src_for_ssa(global_id); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; @@ -222,7 +223,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, alloc, &device->meta_state.fast_clear_flush.pass); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_fmask_expand.c mesa-20.0.8/src/amd/vulkan/radv_meta_fmask_expand.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_fmask_expand.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_fmask_expand.c 2020-06-12 01:21:16.000000000 +0000 @@ -52,7 +52,7 @@ img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 0; - output_img->data.image.access = ACCESS_NON_READABLE; + output_img->data.access = ACCESS_NON_READABLE; nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); nir_ssa_def *wg_id = nir_load_work_group_id(&b); @@ -98,6 +98,7 @@ store->src[1] = nir_src_for_ssa(global_id); store->src[2] = nir_src_for_ssa(nir_imm_int(&b, i)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta.h mesa-20.0.8/src/amd/vulkan/radv_meta.h --- mesa-19.2.8/src/amd/vulkan/radv_meta.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta.h 2020-06-12 01:21:16.000000000 +0000 @@ -169,11 +169,11 @@ void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - VkImageSubresourceRange *subresourceRange, + const VkImageSubresourceRange *subresourceRange, struct radv_sample_locations_state *sample_locs); void radv_resummarize_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - VkImageSubresourceRange *subresourceRange, + const VkImageSubresourceRange *subresourceRange, struct radv_sample_locations_state *sample_locs); void radv_fast_clear_flush_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, @@ -221,7 +221,7 @@ struct radv_image *image, const VkImageSubresourceRange *range, uint32_t value); uint32_t radv_clear_htile(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, + const struct radv_image *image, const VkImageSubresourceRange *range, uint32_t value); /** diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_resolve.c mesa-20.0.8/src/amd/vulkan/radv_meta_resolve.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_resolve.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_resolve.c 2020-06-12 01:21:16.000000000 +0000 @@ -97,7 +97,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, alloc, pass); @@ -649,6 +669,9 @@ struct radv_image_view *dst_iview = cmd_buffer->state.attachments[dst_att.attachment].iview; + /* Make sure to not clear the depth/stencil attachment after resolves. */ + cmd_buffer->state.attachments[dst_att.attachment].pending_clear_aspects = 0; + radv_pick_resolve_method_images(cmd_buffer->device, src_iview->image, src_iview->vk_format, @@ -799,7 +822,7 @@ struct radv_image *src_image = src_iview->image; VkImageResolve region = {}; - region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + region.srcSubresource.aspectMask = src_iview->aspect_mask; region.srcSubresource.mipLevel = 0; region.srcSubresource.baseArrayLayer = src_iview->base_layer; region.srcSubresource.layerCount = layer_count; @@ -814,7 +837,7 @@ struct radv_image *src_image = src_iview->image; VkImageResolve region = {}; - region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + region.srcSubresource.aspectMask = src_iview->aspect_mask; region.srcSubresource.mipLevel = 0; region.srcSubresource.baseArrayLayer = src_iview->base_layer; region.srcSubresource.layerCount = layer_count; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_resolve_cs.c mesa-20.0.8/src/amd/vulkan/radv_meta_resolve_cs.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_resolve_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_resolve_cs.c 2020-06-12 01:21:16.000000000 +0000 @@ -135,6 +135,7 @@ store->src[1] = nir_src_for_ssa(coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; } @@ -145,7 +146,7 @@ }; static const char * -get_resolve_mode_str(VkResolveModeFlagBitsKHR resolve_mode) +get_resolve_mode_str(VkResolveModeFlagBits resolve_mode) { switch (resolve_mode) { case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR: @@ -164,7 +165,7 @@ static nir_shader * build_depth_stencil_resolve_compute_shader(struct radv_device *dev, int samples, int index, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { nir_builder b; char name[64]; @@ -295,6 +296,7 @@ store->src[1] = nir_src_for_ssa(coord); store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); store->src[3] = nir_src_for_ssa(outval); + store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_builder_instr_insert(&b, &store->instr); return b.shader; } @@ -411,7 +413,7 @@ create_depth_stencil_resolve_pipeline(struct radv_device *device, int samples, int index, - VkResolveModeFlagBitsKHR resolve_mode, + VkResolveModeFlagBits resolve_mode, VkPipeline *pipeline) { VkResult result; @@ -705,7 +707,7 @@ const VkOffset2D *dest_offset, const VkExtent2D *resolve_extent, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { struct radv_device *device = cmd_buffer->device; const uint32_t samples = src_iview->image->info.samples; @@ -959,7 +961,7 @@ void radv_depth_stencil_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; @@ -1043,7 +1045,7 @@ if (radv_image_has_htile(dst_image)) { if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { VkImageSubresourceRange range = {}; - range.aspectMask = aspects; + range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; range.baseMipLevel = dst_iview->base_mip; range.levelCount = 1; range.baseArrayLayer = dst_iview->base_layer; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_meta_resolve_fs.c mesa-20.0.8/src/amd/vulkan/radv_meta_resolve_fs.c --- mesa-19.2.8/src/amd/vulkan/radv_meta_resolve_fs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_meta_resolve_fs.c 2020-06-12 01:21:16.000000000 +0000 @@ -231,7 +231,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, rp + dst_layout); } @@ -318,7 +338,7 @@ }; static const char * -get_resolve_mode_str(VkResolveModeFlagBitsKHR resolve_mode) +get_resolve_mode_str(VkResolveModeFlagBits resolve_mode) { switch (resolve_mode) { case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR: @@ -337,7 +357,7 @@ static nir_shader * build_depth_stencil_resolve_fragment_shader(struct radv_device *dev, int samples, int index, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { nir_builder b; char name[64]; @@ -454,7 +474,7 @@ create_depth_stencil_resolve_pipeline(struct radv_device *device, int samples_log2, int index, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { VkRenderPass *render_pass; VkPipeline *pipeline; @@ -555,7 +575,27 @@ .preserveAttachmentCount = 0, .pPreserveAttachments = NULL, }, - .dependencyCount = 0, + .dependencyCount = 2, + .pDependencies = (VkSubpassDependency[]) { + { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + }, + { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = 0, + .dstAccessMask = 0, + .dependencyFlags = 0 + } + }, }, &device->meta_state.alloc, render_pass); } @@ -608,7 +648,12 @@ .cullMode = VK_CULL_MODE_NONE, .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE }, - .pMultisampleState = NULL, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = 1, + .sampleShadingEnable = false, + .pSampleMask = (VkSampleMask[]) { UINT32_MAX }, + }, .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, .attachmentCount = 0, @@ -880,7 +925,7 @@ const VkOffset2D *dst_offset, const VkExtent2D *resolve_extent, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { struct radv_device *device = cmd_buffer->device; const uint32_t samples = src_iview->image->info.samples; @@ -1176,7 +1221,7 @@ void radv_depth_stencil_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode) + VkResolveModeFlagBits resolve_mode) { struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; const struct radv_subpass *subpass = cmd_buffer->state.subpass; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_nir_to_llvm.c mesa-20.0.8/src/amd/vulkan/radv_nir_to_llvm.c --- mesa-19.2.8/src/amd/vulkan/radv_nir_to_llvm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_nir_to_llvm.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,7 @@ #include "radv_private.h" #include "radv_shader.h" #include "radv_shader_helper.h" +#include "radv_shader_args.h" #include "nir/nir.h" #include @@ -47,37 +48,22 @@ struct radv_shader_context { struct ac_llvm_context ac; - const struct radv_nir_compiler_options *options; - struct radv_shader_variant_info *shader_info; + const struct nir_shader *shader; struct ac_shader_abi abi; + const struct radv_shader_args *args; + + gl_shader_stage stage; unsigned max_workgroup_size; LLVMContextRef context; LLVMValueRef main_function; LLVMValueRef descriptor_sets[MAX_SETS]; + LLVMValueRef ring_offsets; - LLVMValueRef vertex_buffers; LLVMValueRef rel_auto_id; - LLVMValueRef vs_prim_id; - LLVMValueRef es2gs_offset; - LLVMValueRef oc_lds; - LLVMValueRef merged_wave_info; - LLVMValueRef tess_factor_offset; - LLVMValueRef tes_rel_patch_id; - LLVMValueRef tes_u; - LLVMValueRef tes_v; - - /* HW GS */ - /* On gfx10: - * - bits 0..10: ordered_wave_id - * - bits 12..20: number of vertices in group - * - bits 22..30: number of primitives in group - */ - LLVMValueRef gs_tg_info; - LLVMValueRef gs2vs_offset; LLVMValueRef gs_wave_id; LLVMValueRef gs_vtx_offset[6]; @@ -86,41 +72,18 @@ LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; - LLVMValueRef persp_sample, persp_center, persp_centroid; - LLVMValueRef linear_sample, linear_center, linear_centroid; - - /* Streamout */ - LLVMValueRef streamout_buffers; - LLVMValueRef streamout_write_idx; - LLVMValueRef streamout_config; - LLVMValueRef streamout_offset[4]; - - gl_shader_stage stage; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; - uint64_t float16_shaded_mask; - uint64_t input_mask; uint64_t output_mask; - bool is_gs_copy_shader; LLVMValueRef gs_next_vertex[4]; LLVMValueRef gs_curprim_verts[4]; LLVMValueRef gs_generated_prims[4]; LLVMValueRef gs_ngg_emit; LLVMValueRef gs_ngg_scratch; - unsigned gs_max_out_vertices; - unsigned gs_output_prim; - - unsigned tes_primitive_mode; - uint32_t tcs_patch_outputs_read; - uint64_t tcs_outputs_read; - uint32_t tcs_vertices_per_patch; uint32_t tcs_num_inputs; uint32_t tcs_num_patches; - uint32_t max_gsvs_emit_size; - uint32_t gsvs_vertex_size; LLVMValueRef vertexptr; /* GFX10 only */ }; @@ -132,14 +95,6 @@ unsigned usage_mask; }; -enum radeon_llvm_calling_convention { - RADEON_LLVM_AMDGPU_VS = 87, - RADEON_LLVM_AMDGPU_GS = 88, - RADEON_LLVM_AMDGPU_PS = 89, - RADEON_LLVM_AMDGPU_CS = 90, - RADEON_LLVM_AMDGPU_HS = 93, -}; - static inline struct radv_shader_context * radv_shader_context_from_abi(struct ac_shader_abi *abi) { @@ -147,105 +102,15 @@ return container_of(abi, ctx, abi); } -struct ac_build_if_state -{ - struct radv_shader_context *ctx; - LLVMValueRef condition; - LLVMBasicBlockRef entry_block; - LLVMBasicBlockRef true_block; - LLVMBasicBlockRef false_block; - LLVMBasicBlockRef merge_block; -}; - -static LLVMBasicBlockRef -ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name) -{ - LLVMBasicBlockRef current_block; - LLVMBasicBlockRef next_block; - LLVMBasicBlockRef new_block; - - /* get current basic block */ - current_block = LLVMGetInsertBlock(ctx->ac.builder); - - /* chqeck if there's another block after this one */ - next_block = LLVMGetNextBasicBlock(current_block); - if (next_block) { - /* insert the new block before the next block */ - new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name); - } - else { - /* append new block after current block */ - LLVMValueRef function = LLVMGetBasicBlockParent(current_block); - new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name); - } - return new_block; -} - -static void -ac_nir_build_if(struct ac_build_if_state *ifthen, - struct radv_shader_context *ctx, - LLVMValueRef condition) -{ - LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder); - - memset(ifthen, 0, sizeof *ifthen); - ifthen->ctx = ctx; - ifthen->condition = condition; - ifthen->entry_block = block; - - /* create endif/merge basic block for the phi functions */ - ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block"); - - /* create/insert true_block before merge_block */ - ifthen->true_block = - LLVMInsertBasicBlockInContext(ctx->context, - ifthen->merge_block, - "if-true-block"); - - /* successive code goes into the true block */ - LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block); -} - -/** - * End a conditional. - */ -static void -ac_nir_build_endif(struct ac_build_if_state *ifthen) -{ - LLVMBuilderRef builder = ifthen->ctx->ac.builder; - - /* Insert branch to the merge block from current block */ - LLVMBuildBr(builder, ifthen->merge_block); - - /* - * Now patch in the various branch instructions. - */ - - /* Insert the conditional branch instruction at the end of entry_block */ - LLVMPositionBuilderAtEnd(builder, ifthen->entry_block); - if (ifthen->false_block) { - /* we have an else clause */ - LLVMBuildCondBr(builder, ifthen->condition, - ifthen->true_block, ifthen->false_block); - } - else { - /* no else clause */ - LLVMBuildCondBr(builder, ifthen->condition, - ifthen->true_block, ifthen->merge_block); - } - - /* Resume building code at end of the ifthen->merge_block */ - LLVMPositionBuilderAtEnd(builder, ifthen->merge_block); -} - - static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) { switch (ctx->stage) { case MESA_SHADER_TESS_CTRL: - return ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + return ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids), + 0, 8); case MESA_SHADER_TESS_EVAL: - return ctx->tes_rel_patch_id; + return ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id); break; default: unreachable("Illegal stage"); @@ -255,14 +120,14 @@ static unsigned get_tcs_num_patches(struct radv_shader_context *ctx) { - unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch; + unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices; + unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; unsigned num_patches; unsigned hardware_lds_size; @@ -282,20 +147,20 @@ * * Test: dEQP-VK.tessellation.shader_input_output.barrier */ - if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY) + if (ctx->args->options->chip_class >= GFX7 && ctx->args->options->family != CHIP_STONEY) hardware_lds_size = 65536; num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); /* Make sure the output data fits in the offchip buffer */ - num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); + num_patches = MIN2(num_patches, (ctx->args->options->tess_offchip_block_dw_size * 4) / output_patch_size); /* Not necessary for correctness, but improves performance. The * specific value is taken from the proprietary driver. */ num_patches = MIN2(num_patches, 40); /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ - if (ctx->options->chip_class == GFX6) { - unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp); + if (ctx->args->options->chip_class == GFX6) { + unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); num_patches = MIN2(num_patches, one_wave); } return num_patches; @@ -304,7 +169,7 @@ static unsigned calculate_tess_lds_size(struct radv_shader_context *ctx) { - unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; + unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices; unsigned num_tcs_output_cp; unsigned num_tcs_outputs, num_tcs_patch_outputs; unsigned input_vertex_size, output_vertex_size; @@ -314,9 +179,9 @@ unsigned num_patches; unsigned lds_size; - num_tcs_output_cp = ctx->tcs_vertices_per_patch; - num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; + num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); input_vertex_size = ctx->tcs_num_inputs * 16; output_vertex_size = num_tcs_outputs * 16; @@ -356,9 +221,9 @@ static LLVMValueRef get_tcs_in_patch_stride(struct radv_shader_context *ctx) { - assert (ctx->stage == MESA_SHADER_TESS_CTRL); + assert(ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; input_patch_size /= 4; return LLVMConstInt(ctx->ac.i32, input_patch_size, false); @@ -367,10 +232,10 @@ static LLVMValueRef get_tcs_out_patch_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; output_patch_size /= 4; return LLVMConstInt(ctx->ac.i32, output_patch_size, false); @@ -379,7 +244,7 @@ static LLVMValueRef get_tcs_out_vertex_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; output_vertex_size /= 4; return LLVMConstInt(ctx->ac.i32, output_vertex_size, false); @@ -390,7 +255,7 @@ { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; unsigned num_patches = ctx->tcs_num_patches; @@ -404,12 +269,12 @@ { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned num_patches = ctx->tcs_num_patches; output_patch0_offset *= num_patches; @@ -450,87 +315,16 @@ patch0_patch_data_offset); } -#define MAX_ARGS 64 -struct arg_info { - LLVMTypeRef types[MAX_ARGS]; - LLVMValueRef *assign[MAX_ARGS]; - uint8_t count; - uint8_t sgpr_count; - uint8_t num_sgprs_used; - uint8_t num_vgprs_used; -}; - -enum ac_arg_regfile { - ARG_SGPR, - ARG_VGPR, -}; - -static void -add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type, - LLVMValueRef *param_ptr) -{ - assert(info->count < MAX_ARGS); - - info->assign[info->count] = param_ptr; - info->types[info->count] = type; - info->count++; - - if (regfile == ARG_SGPR) { - info->num_sgprs_used += ac_get_type_size(type) / 4; - info->sgpr_count++; - } else { - assert(regfile == ARG_VGPR); - info->num_vgprs_used += ac_get_type_size(type) / 4; - } -} - -static void assign_arguments(LLVMValueRef main_function, - struct arg_info *info) -{ - unsigned i; - for (i = 0; i < info->count; i++) { - if (info->assign[i]) - *info->assign[i] = LLVMGetParam(main_function, i); - } -} - static LLVMValueRef -create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, - LLVMBuilderRef builder, LLVMTypeRef *return_types, - unsigned num_return_elems, - struct arg_info *args, +create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, + LLVMBuilderRef builder, + const struct ac_shader_args *args, + enum ac_llvm_calling_convention convention, unsigned max_workgroup_size, const struct radv_nir_compiler_options *options) { - LLVMTypeRef main_function_type, ret_type; - LLVMBasicBlockRef main_function_body; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx, return_types, - num_return_elems, true); - else - ret_type = LLVMVoidTypeInContext(ctx); - - /* Setup the function */ - main_function_type = - LLVMFunctionType(ret_type, args->types, args->count, 0); LLVMValueRef main_function = - LLVMAddFunction(module, "main", main_function_type); - main_function_body = - LLVMAppendBasicBlockInContext(ctx, main_function, "main_body"); - LLVMPositionBuilderAtEnd(builder, main_function_body); - - LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); - for (unsigned i = 0; i < args->sgpr_count; ++i) { - LLVMValueRef P = LLVMGetParam(main_function, i); - - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); - - if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); - ac_add_attr_dereferenceable(P, UINT64_MAX); - } - } + ac_build_main(args, ctx, convention, "main", ctx->voidt, module); if (options->address32_hi) { ac_llvm_add_target_dep_function_attr(main_function, @@ -540,469 +334,57 @@ ac_llvm_set_workgroup_size(main_function, max_workgroup_size); - if (options->unsafe_math) { - /* These were copied from some LLVM test. */ - LLVMAddTargetDependentFunctionAttr(main_function, - "less-precise-fpmad", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-infs-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-nans-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "unsafe-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-signed-zeros-fp-math", - "true"); - } return main_function; } - -static void -set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - ud_info->sgpr_idx = *sgpr_idx; - ud_info->num_sgprs = num_sgprs; - *sgpr_idx += num_sgprs; -} - -static void -set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - struct radv_userdata_info *ud_info = - &ctx->shader_info->user_sgprs_locs.shader_data[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, num_sgprs); -} - -static void -set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; - - set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); -} - static void -set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - struct radv_userdata_locations *locs = - &ctx->shader_info->user_sgprs_locs; - struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, 1); - - locs->descriptor_sets_enabled |= 1 << idx; -} - -struct user_sgpr_info { - bool need_ring_offsets; - bool indirect_all_descriptor_sets; - uint8_t remaining_sgprs; -}; - -static bool needs_view_index_sgpr(struct radv_shader_context *ctx, - gl_shader_stage stage) -{ - switch (stage) { - case MESA_SHADER_VERTEX: - if (ctx->shader_info->info.needs_multiview_view_index || - (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_TESS_CTRL: - if (ctx->shader_info->info.needs_multiview_view_index) - return true; - break; - default: - break; - } - return false; -} - -static uint8_t -count_vs_user_sgprs(struct radv_shader_context *ctx) -{ - uint8_t count = 0; - - if (ctx->shader_info->info.vs.has_vertex_buffers) - count++; - count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2; - - return count; -} - -static void allocate_inline_push_consts(struct radv_shader_context *ctx, - struct user_sgpr_info *user_sgpr_info) +load_descriptor_sets(struct radv_shader_context *ctx) { - uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; - - /* Only supported if shaders use push constants. */ - if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX) - return; - - /* Only supported if shaders don't have indirect push constants. */ - if (ctx->shader_info->info.has_indirect_push_constants) - return; - - /* Only supported for 32-bit push constants. */ - if (!ctx->shader_info->info.has_only_32bit_push_constants) - return; - - uint8_t num_push_consts = - (ctx->shader_info->info.max_push_constant_used - - ctx->shader_info->info.min_push_constant_used) / 4; - - /* Check if the number of user SGPRs is large enough. */ - if (num_push_consts < remaining_sgprs) { - ctx->shader_info->info.num_inline_push_consts = num_push_consts; - } else { - ctx->shader_info->info.num_inline_push_consts = remaining_sgprs; - } - - /* Clamp to the maximum number of allowed inlined push constants. */ - if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - - if (ctx->shader_info->info.num_inline_push_consts == num_push_consts && - !ctx->shader_info->info.loads_dynamic_offsets) { - /* Disable the default push constants path if all constants are - * inlined and if shaders don't use dynamic descriptors. - */ - ctx->shader_info->info.loads_push_constants = false; - } - - ctx->shader_info->info.base_inline_push_consts = - ctx->shader_info->info.min_push_constant_used / 4; -} - -static void allocate_user_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - bool needs_view_index, - struct user_sgpr_info *user_sgpr_info) -{ - uint8_t user_sgpr_count = 0; - - memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); - - /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ - if (stage == MESA_SHADER_GEOMETRY || - stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_CTRL || - stage == MESA_SHADER_TESS_EVAL || - ctx->is_gs_copy_shader) - user_sgpr_info->need_ring_offsets = true; - - if (stage == MESA_SHADER_FRAGMENT && - ctx->shader_info->info.ps.needs_sample_positions) - user_sgpr_info->need_ring_offsets = true; - - /* 2 user sgprs will nearly always be allocated for scratch/rings */ - if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) { - user_sgpr_count += 2; - } - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->info.cs.uses_grid_size) - user_sgpr_count += 3; - break; - case MESA_SHADER_FRAGMENT: - user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions; - break; - case MESA_SHADER_VERTEX: - if (!ctx->is_gs_copy_shader) - user_sgpr_count += count_vs_user_sgprs(ctx); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - user_sgpr_count += count_vs_user_sgprs(ctx); - } - break; - case MESA_SHADER_TESS_EVAL: - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) { - user_sgpr_count += count_vs_user_sgprs(ctx); - } - } - break; - default: - break; - } - - if (needs_view_index) - user_sgpr_count++; - - if (ctx->shader_info->info.loads_push_constants) - user_sgpr_count++; - - if (ctx->shader_info->info.so.num_outputs) - user_sgpr_count++; - - uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; - uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; - uint32_t num_desc_set = - util_bitcount(ctx->shader_info->info.desc_set_used_mask); - - if (remaining_sgprs < num_desc_set) { - user_sgpr_info->indirect_all_descriptor_sets = true; - user_sgpr_info->remaining_sgprs = remaining_sgprs - 1; - } else { - user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; - } - - allocate_inline_push_consts(ctx, user_sgpr_info); -} - -static void -declare_global_input_sgprs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - struct arg_info *args, - LLVMValueRef *desc_sets) -{ - LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8); - - /* 1 for each descriptor set */ - if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = ctx->shader_info->info.desc_set_used_mask; - + uint32_t mask = ctx->args->shader_info->desc_set_used_mask; + if (ctx->args->shader_info->need_indirect_descriptor_sets) { + LLVMValueRef desc_sets = + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]); while (mask) { int i = u_bit_scan(&mask); - add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]); - } - } else { - add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type), - desc_sets); - } - - if (ctx->shader_info->info.loads_push_constants) { - /* 1 for push constants and dynamic descriptors */ - add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); - } - - for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) { - add_arg(args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.inline_push_consts[i]); - } - ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts; - ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts; - - if (ctx->shader_info->info.so.num_outputs) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->streamout_buffers); - } -} - -static void -declare_vs_specific_input_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - struct arg_info *args) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->info.vs.has_vertex_buffers) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->vertex_buffers); - } - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex); - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance); - if (ctx->shader_info->info.vs.needs_draw_id) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id); - } - } -} - -static void -declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id); - if (!ctx->is_gs_copy_shader) { - if (ctx->options->key.vs_common_out.as_ls) { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); - if (ctx->ac.chip_class >= GFX10) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } else { - if (ctx->ac.chip_class >= GFX10) { - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } - } -} - -static void -declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, - struct arg_info *args) -{ - int i; - - /* Streamout SGPRs. */ - if (ctx->shader_info->info.so.num_outputs) { - assert(stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL); - - if (stage != MESA_SHADER_TESS_EVAL) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config); - } else { - args->assign[args->count - 1] = &ctx->streamout_config; - args->types[args->count - 1] = ctx->ac.i32; - } - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx); - } - - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (i = 0; i < 4; i++) { - if (!ctx->shader_info->info.so.strides[i]) - continue; - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]); - } -} - -static void -declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u); - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id); -} - -static void -set_global_input_locs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - LLVMValueRef desc_sets, uint8_t *user_sgpr_idx) -{ - uint32_t mask = ctx->shader_info->info.desc_set_used_mask; - - if (!user_sgpr_info->indirect_all_descriptor_sets) { - while (mask) { - int i = u_bit_scan(&mask); + ctx->descriptor_sets[i] = + ac_build_load_to_sgpr(&ctx->ac, desc_sets, + LLVMConstInt(ctx->ac.i32, i, false)); - set_loc_desc(ctx, i, user_sgpr_idx); } } else { - set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, - user_sgpr_idx); - while (mask) { int i = u_bit_scan(&mask); ctx->descriptor_sets[i] = - ac_build_load_to_sgpr(&ctx->ac, desc_sets, - LLVMConstInt(ctx->ac.i32, i, false)); - - } - - ctx->shader_info->need_indirect_descriptor_sets = true; - } - - if (ctx->shader_info->info.loads_push_constants) { - set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); - } - - if (ctx->shader_info->info.num_inline_push_consts) { - set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, - ctx->shader_info->info.num_inline_push_consts); - } - - if (ctx->streamout_buffers) { - set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, - user_sgpr_idx); - } -} - -static void -set_vs_specific_input_locs(struct radv_shader_context *ctx, - gl_shader_stage stage, bool has_previous_stage, - gl_shader_stage previous_stage, - uint8_t *user_sgpr_idx) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->info.vs.has_vertex_buffers) { - set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, - user_sgpr_idx); + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]); } - - unsigned vs_num = 2; - if (ctx->shader_info->info.vs.needs_draw_id) - vs_num++; - - set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, - user_sgpr_idx, vs_num); } } -static void set_llvm_calling_convention(LLVMValueRef func, - gl_shader_stage stage) +static enum ac_llvm_calling_convention +get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage) { - enum radeon_llvm_calling_convention calling_conv; - switch (stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - calling_conv = RADEON_LLVM_AMDGPU_VS; + return AC_LLVM_AMDGPU_VS; break; case MESA_SHADER_GEOMETRY: - calling_conv = RADEON_LLVM_AMDGPU_GS; + return AC_LLVM_AMDGPU_GS; break; case MESA_SHADER_TESS_CTRL: - calling_conv = RADEON_LLVM_AMDGPU_HS; + return AC_LLVM_AMDGPU_HS; break; case MESA_SHADER_FRAGMENT: - calling_conv = RADEON_LLVM_AMDGPU_PS; + return AC_LLVM_AMDGPU_PS; break; case MESA_SHADER_COMPUTE: - calling_conv = RADEON_LLVM_AMDGPU_CS; + return AC_LLVM_AMDGPU_CS; break; default: unreachable("Unhandle shader type"); } - - LLVMSetFunctionCallConv(func, calling_conv); } /* Returns whether the stage is a stage that can be directly before the GS */ @@ -1013,350 +395,56 @@ static void create_function(struct radv_shader_context *ctx, gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage) + bool has_previous_stage) { - uint8_t user_sgpr_idx; - struct user_sgpr_info user_sgpr_info; - struct arg_info args = {}; - LLVMValueRef desc_sets; - bool needs_view_index = needs_view_index_sgpr(ctx, stage); - if (ctx->ac.chip_class >= GFX10) { - if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) { + if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) { /* On GFX10, VS is merged into GS for NGG. */ - previous_stage = stage; stage = MESA_SHADER_GEOMETRY; has_previous_stage = true; } } - allocate_user_sgprs(ctx, stage, has_previous_stage, - previous_stage, needs_view_index, &user_sgpr_info); + ctx->main_function = create_llvm_function( + &ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac, + get_llvm_calling_convention(ctx->main_function, stage), + ctx->max_workgroup_size, + ctx->args->options); + + ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", + LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), + NULL, 0, AC_FUNC_ATTR_READNONE); + ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, + ac_array_in_const_addr_space(ctx->ac.v4i32), ""); + + load_descriptor_sets(ctx); - if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) { - add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32), - &ctx->ring_offsets); + if (stage == MESA_SHADER_TESS_CTRL || + (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) || + /* GFX9 has the ESGS ring buffer in LDS. */ + (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { + ac_declare_lds_as_pointer(&ctx->ac); } - switch (stage) { - case MESA_SHADER_COMPUTE: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); +} - if (ctx->shader_info->info.cs.uses_grid_size) { - add_arg(&args, ARG_SGPR, ctx->ac.v3i32, - &ctx->abi.num_work_groups); - } - for (int i = 0; i < 3; i++) { - ctx->abi.workgroup_ids[i] = NULL; - if (ctx->shader_info->info.cs.uses_block_id[i]) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.workgroup_ids[i]); - } - } - - if (ctx->shader_info->info.cs.uses_local_invocation_idx) - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, - &ctx->abi.local_invocation_ids); - break; - case MESA_SHADER_VERTEX: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else if (ctx->options->key.vs_common_out.as_ls) { - /* no extra parameters */ - } else { - declare_streamout_sgprs(ctx, stage, &args); - } - - declare_vs_input_vgprs(ctx, &args); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - // First 6 system regs - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - - declare_vs_input_vgprs(ctx, &args); - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - } - break; - case MESA_SHADER_TESS_EVAL: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - declare_streamout_sgprs(ctx, stage, &args); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - } - declare_tes_input_vgprs(ctx, &args); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - // First 6 system regs - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs_tg_info); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs2vs_offset); - } - - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (previous_stage != MESA_SHADER_TESS_EVAL) { - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &args); - } - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - - if (previous_stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &args); - } else { - declare_tes_input_vgprs(ctx, &args); - } - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[1]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[5]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - } - break; - case MESA_SHADER_FRAGMENT: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */ - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL); /* line stipple tex */ - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage); - add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL); /* fixed pt */ - break; - default: - unreachable("Shader stage not implemented"); - } - - ctx->main_function = create_llvm_function( - ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args, - ctx->max_workgroup_size, ctx->options); - set_llvm_calling_convention(ctx->main_function, stage); - - - ctx->shader_info->num_input_vgprs = 0; - ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0; - - ctx->shader_info->num_input_sgprs += args.num_sgprs_used; - - if (ctx->stage != MESA_SHADER_FRAGMENT) - ctx->shader_info->num_input_vgprs = args.num_vgprs_used; - - assign_arguments(ctx->main_function, &args); - - user_sgpr_idx = 0; - - if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) { - set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, - &user_sgpr_idx); - if (ctx->options->supports_spill) { - ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), - NULL, 0, AC_FUNC_ATTR_READNONE); - ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, - ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - } - } - - /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including - * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ - if (has_previous_stage) - user_sgpr_idx = 0; - - set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx); - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->info.cs.uses_grid_size) { - set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, - &user_sgpr_idx, 3); - } - break; - case MESA_SHADER_VERTEX: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_CTRL: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - set_vs_specific_input_locs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_idx); - } - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_FRAGMENT: - break; - default: - unreachable("Shader stage not implemented"); - } - - if (stage == MESA_SHADER_TESS_CTRL || - (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) || - /* GFX9 has the ESGS ring buffer in LDS. */ - (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { - ac_declare_lds_as_pointer(&ctx->ac); - } - - ctx->shader_info->num_user_sgprs = user_sgpr_idx; -} - - -static LLVMValueRef -radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, - unsigned desc_set, unsigned binding) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; - struct radv_pipeline_layout *pipeline_layout = ctx->options->layout; - struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; - unsigned base_offset = layout->binding[binding].offset; - LLVMValueRef offset, stride; +static LLVMValueRef +radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, + unsigned desc_set, unsigned binding) +{ + struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; + struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; + struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; + unsigned base_offset = layout->binding[binding].offset; + LLVMValueRef offset, stride; if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; - desc_ptr = ctx->abi.push_constants; + desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants); base_offset = pipeline_layout->push_constant_size + 16 * idx; stride = LLVMConstInt(ctx->ac.i32, 16, false); } else @@ -1380,7 +468,7 @@ if (ctx->ac.chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1389,7 +477,7 @@ LLVMValueRef desc_components[4] = { LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), false), /* High limit to support variable sizes. */ LLVMConstInt(ctx->ac.i32, 0xffffffff, false), LLVMConstInt(ctx->ac.i32, desc_type, false), @@ -1425,12 +513,12 @@ uint32_t num_patches = ctx->tcs_num_patches; uint32_t num_tcs_outputs; if (ctx->stage == MESA_SHADER_TESS_CTRL) - num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); else - num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs; + num_tcs_outputs = ctx->args->options->key.tes.tcs_num_outputs; uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false); } @@ -1440,7 +528,7 @@ { LLVMValueRef param_stride; if (vertex_index) - param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch * ctx->tcs_num_patches, false); + param_stride = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out * ctx->tcs_num_patches, false); else param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false); return param_stride; @@ -1453,7 +541,7 @@ LLVMValueRef base_addr; LLVMValueRef param_stride, constant16; LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false); + LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out, false); constant16 = LLVMConstInt(ctx->ac.i32, 16, false); param_stride = calc_param_stride(ctx, vertex_index); if (vertex_index) { @@ -1596,14 +684,15 @@ LLVMValueRef dw_addr; LLVMValueRef stride = NULL; LLVMValueRef buf_addr = NULL; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param; bool store_lds = true; if (is_patch) { - if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0)))) + if (!(ctx->shader->info.patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0)))) store_lds = false; } else { - if (!(ctx->tcs_outputs_read & (1ULL << location))) + if (!(ctx->shader->info.outputs_read & (1ULL << location))) store_lds = false; } @@ -1652,14 +741,14 @@ if (!is_tess_factor && writemask != 0xF) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1, - buf_addr, ctx->oc_lds, - 4 * (base + chan), ac_glc, false); + buf_addr, oc_lds, + 4 * (base + chan), ac_glc); } if (writemask == 0xF) { ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4, - buf_addr, ctx->oc_lds, - (base * 4), ac_glc, false); + buf_addr, oc_lds, + (base * 4), ac_glc); } } @@ -1680,6 +769,7 @@ struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef buf_addr; LLVMValueRef result; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param = shader_io_get_unique_index(location); if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { @@ -1698,7 +788,7 @@ buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, ""); result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL, - buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false); + buf_addr, oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false); result = ac_trim_vector(&ctx->ac, result, num_components); return result; } @@ -1797,36 +887,6 @@ ac_build_kill_if_false(&ctx->ac, visible); } -static LLVMValueRef lookup_interp_param(struct ac_shader_abi *abi, - enum glsl_interp_mode interp, unsigned location) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - - switch (interp) { - case INTERP_MODE_FLAT: - default: - return NULL; - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (location == INTERP_CENTER) - return ctx->persp_center; - else if (location == INTERP_CENTROID) - return ctx->persp_centroid; - else if (location == INTERP_SAMPLE) - return ctx->persp_sample; - break; - case INTERP_MODE_NOPERSPECTIVE: - if (location == INTERP_CENTER) - return ctx->linear_center; - else if (location == INTERP_CENTROID) - return ctx->linear_centroid; - else if (location == INTERP_SAMPLE) - return ctx->linear_sample; - break; - } - return NULL; -} - static uint32_t radv_get_sample_pos_offset(uint32_t num_samples) { @@ -1861,7 +921,7 @@ ac_array_in_const_addr_space(ctx->ac.v2f32), ""); uint32_t sample_pos_offset = - radv_get_sample_pos_offset(ctx->options->key.fs.num_samples); + radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples); sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, @@ -1877,11 +937,11 @@ struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); uint8_t log2_ps_iter_samples; - if (ctx->shader_info->info.ps.force_persample) { + if (ctx->args->shader_info->ps.force_persample) { log2_ps_iter_samples = - util_logbase2(ctx->options->key.fs.num_samples); + util_logbase2(ctx->args->options->key.fs.num_samples); } else { - log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples; } /* The bit pattern matches that used by fixed function fragment @@ -1898,9 +958,10 @@ uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples]; LLVMValueRef result, sample_id; - sample_id = ac_unpack_param(&ctx->ac, abi->ancillary, 8, 4); + sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4); sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, ""); - result = LLVMBuildAnd(ctx->ac.builder, sample_id, abi->sample_coverage, ""); + result = LLVMBuildAnd(ctx->ac.builder, sample_id, + ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), ""); return result; } @@ -1917,7 +978,7 @@ unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); return; } @@ -1928,19 +989,23 @@ ""); /* If this thread has already emitted the declared maximum number of - * vertices, kill it: excessive vertex emissions are not supposed to - * have any effect, and GS threads have no externally observable - * effects other than emitting vertices. + * vertices, don't emit any more: excessive vertex emissions are not + * supposed to have any effect. */ can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); + + bool use_kill = !ctx->args->shader_info->gs.writes_memory; + if (use_kill) + ac_build_kill_if_false(&ctx->ac, can_emit); + else + ac_build_ifcc(&ctx->ac, can_emit, 6505); for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->info.gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -1956,7 +1021,7 @@ out_ptr[j], ""); LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, offset * - ctx->gs_max_out_vertices, false); + ctx->shader->info.gs.vertices_out, false); offset++; @@ -1969,8 +1034,10 @@ ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, - voffset, ctx->gs2vs_offset, 0, - ac_glc | ac_slc, true); + voffset, + ac_get_arg(&ctx->ac, + ctx->args->gs2vs_offset), + 0, ac_glc | ac_slc | ac_swizzled); } } @@ -1981,6 +1048,9 @@ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); } static void @@ -1988,7 +1058,7 @@ { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); return; } @@ -2002,13 +1072,13 @@ struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef coord[4] = { - ctx->tes_u, - ctx->tes_v, + ac_get_arg(&ctx->ac, ctx->args->tes_u), + ac_get_arg(&ctx->ac, ctx->args->tes_v), ctx->ac.f32_0, ctx->ac.f32_0, }; - if (ctx->tes_primitive_mode == GL_TRIANGLES) + if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); @@ -2019,13 +1089,14 @@ load_patch_vertices_in(struct ac_shader_abi *abi) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false); + return LLVMConstInt(ctx->ac.i32, ctx->args->options->key.tcs.input_vertices, false); } static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi) { - return abi->base_vertex; + struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex); } static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi, @@ -2071,7 +1142,7 @@ { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef list = ctx->descriptor_sets[descriptor_set]; - struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_layout *layout = ctx->args->options->layout->set[descriptor_set].layout; struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; @@ -2209,35 +1280,6 @@ return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); } -static unsigned -get_num_channels_from_data_format(unsigned data_format) -{ - switch (data_format) { - case V_008F0C_BUF_DATA_FORMAT_8: - case V_008F0C_BUF_DATA_FORMAT_16: - case V_008F0C_BUF_DATA_FORMAT_32: - return 1; - case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32: - return 2; - case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - case V_008F0C_BUF_DATA_FORMAT_32_32_32: - return 3; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - return 4; - default: - break; - } - - return 4; -} - static LLVMValueRef radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, @@ -2259,10 +1301,8 @@ for (unsigned i = 0; i < num_channels; i++) chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); } else { - if (num_channels) { - assert(num_channels == 1); - chan[0] = value; - } + assert(num_channels == 1); + chan[0] = value; } for (unsigned i = num_channels; i < 4; i++) { @@ -2277,14 +1317,14 @@ handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable) { - LLVMValueRef t_list_ptr = ctx->vertex_buffers; + LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->vertex_buffers); LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef input; LLVMValueRef buffer_index; unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); uint8_t input_usage_mask = - ctx->shader_info->info.vs.input_usage_mask[variable->data.location]; + ctx->args->shader_info->vs.input_usage_mask[variable->data.location]; unsigned num_input_channels = util_last_bit(input_usage_mask); variable->data.driver_location = variable->data.location * 4; @@ -2293,14 +1333,14 @@ for (unsigned i = 0; i < attrib_count; ++i) { LLVMValueRef output[4]; unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0; - unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index]; + unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index]; unsigned data_format = attrib_format & 0x0f; unsigned num_format = (attrib_format >> 4) & 0x07; bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; - if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { - uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index]; + if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { + uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index]; if (divisor) { buffer_index = ctx->abi.instance_id; @@ -2313,50 +1353,100 @@ buffer_index = ctx->ac.i32_0; } - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, ""); - } else - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, - ctx->abi.base_vertex, ""); + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ac_get_arg(&ctx->ac, + ctx->args->ac.start_instance),\ + buffer_index, ""); + } else { + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + ac_get_arg(&ctx->ac, + ctx->args->ac.base_vertex), ""); + } + + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); /* Adjust the number of channels to load based on the vertex * attribute format. */ - unsigned num_format_channels = get_num_channels_from_data_format(data_format); - unsigned num_channels = MIN2(num_input_channels, num_format_channels); - unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; - unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; - unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; + unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); + unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { /* Always load, at least, 3 channels for formats that * need to be shuffled because X<->Z. */ num_channels = MAX2(num_channels, 3); } - if (attrib_stride != 0 && attrib_offset > attrib_stride) { - LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, - attrib_offset / attrib_stride, false); + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - buffer_index = LLVMBuildAdd(ctx->ac.builder, - buffer_index, - buffer_offset, ""); + /* Perform per-channel vertex fetch operations if unaligned + * access are detected. Only GFX6 and GFX10 are affected. + */ + bool unaligned_vertex_fetches = false; + if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) && + vtx_info->chan_format != data_format && + ((attrib_offset % vtx_info->element_size) || + (attrib_stride % vtx_info->element_size))) + unaligned_vertex_fetches = true; - attrib_offset = attrib_offset % attrib_stride; - } + if (unaligned_vertex_fetches) { + unsigned chan_format = vtx_info->chan_format; + LLVMValueRef values[4]; - t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + assert(ctx->ac.chip_class == GFX6 || + ctx->ac.chip_class == GFX10); - input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, - buffer_index, - LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, - num_channels, - data_format, num_format, 0, true); + for (unsigned chan = 0; chan < num_channels; chan++) { + unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; + LLVMValueRef chan_index = buffer_index; + + if (attrib_stride != 0 && chan_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + chan_offset / attrib_stride, false); + + chan_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + chan_offset = chan_offset % attrib_stride; + } + + values[chan] = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + chan_index, + LLVMConstInt(ctx->ac.i32, chan_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, 1, + chan_format, num_format, 0, true); + } + + input = ac_build_gather_values(&ctx->ac, values, num_channels); + } else { + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); + + buffer_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } + + input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, 0, true); + } + + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { LLVMValueRef c[4]; c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); @@ -2378,7 +1468,7 @@ } } - unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; + unsigned alpha_adjust = (ctx->args->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]); for (unsigned chan = 0; chan < 4; chan++) { @@ -2415,10 +1505,21 @@ uses_center = true; } + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid); + if (uses_center && uses_centroid) { - LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, ""); - ctx->persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->persp_center, ctx->persp_centroid, ""); - ctx->linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->linear_center, ctx->linear_centroid, ""); + LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, + ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask), + ctx->ac.i32_0, ""); + ctx->abi.persp_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.persp_center), + ctx->abi.persp_centroid, ""); + ctx->abi.linear_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.linear_center), + ctx->abi.linear_centroid, ""); } } @@ -2445,27 +1546,6 @@ } mask_attribs = ((1ull << attrib_count) - 1) << idx; - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY) { - if (idx == VARYING_SLOT_CLIP_DIST0) { - if (stage == MESA_SHADER_VERTEX) { - ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; - } - if (stage == MESA_SHADER_TESS_EVAL) { - ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; - ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; - ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; - } - if (stage == MESA_SHADER_GEOMETRY) { - ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; - } - } - } ctx->output_mask |= mask_attribs; } @@ -2503,9 +1583,9 @@ bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { unsigned index = target - V_008DFC_SQ_EXP_MRT; - unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; - bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; - bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; + unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf; + bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1; + bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1; unsigned chan; LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; @@ -2701,19 +1781,19 @@ ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf], ctx->ac.i32_0, offset, - ac_glc | ac_slc, false); + ac_glc | ac_slc); } static void radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) { - struct ac_build_if_state if_ctx; int i; /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - assert(ctx->streamout_config); + assert(ctx->args->streamout_config.used); LLVMValueRef so_vtx_count = - ac_build_bfe(&ctx->ac, ctx->streamout_config, + ac_build_bfe(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->streamout_config), LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false); @@ -2727,14 +1807,15 @@ * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. */ - ac_nir_build_if(&if_ctx, ctx, can_emit); + ac_build_ifcc(&ctx->ac, can_emit, 6501); { /* The buffer offset is computed as follows: * ByteOffset = streamout_offset[buffer_id]*4 + * (streamout_write_index + thread_id)*stride[buffer_id] + * attrib_offset */ - LLVMValueRef so_write_index = ctx->streamout_write_idx; + LLVMValueRef so_write_index = + ac_get_arg(&ctx->ac, ctx->args->streamout_write_idx); /* Compute (streamout_write_index + thread_id). */ so_write_index = @@ -2745,10 +1826,10 @@ */ LLVMValueRef so_write_offset[4] = {}; LLVMValueRef so_buffers[4] = {}; - LLVMValueRef buf_ptr = ctx->streamout_buffers; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); for (i = 0; i < 4; i++) { - uint16_t stride = ctx->shader_info->info.so.strides[i]; + uint16_t stride = ctx->args->shader_info->so.strides[i]; if (!stride) continue; @@ -2759,7 +1840,8 @@ so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - LLVMValueRef so_offset = ctx->streamout_offset[i]; + LLVMValueRef so_offset = + ac_get_arg(&ctx->ac, ctx->args->streamout_offset[i]); so_offset = LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), ""); @@ -2772,10 +1854,10 @@ } /* Write streamout data. */ - for (i = 0; i < ctx->shader_info->info.so.num_outputs; i++) { + for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) { struct radv_shader_output_values shader_out = {}; struct radv_stream_output *output = - &ctx->shader_info->info.so.outputs[i]; + &ctx->args->shader_info->so.outputs[i]; if (stream != output->stream) continue; @@ -2789,7 +1871,7 @@ output, &shader_out); } } - ac_nir_build_endif(&if_ctx); + ac_build_endif(&ctx->ac, 6501); } static void @@ -2807,6 +1889,7 @@ if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID && + slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 && slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0) @@ -2900,7 +1983,7 @@ if (outinfo->writes_layer == true) pos_args[1].out[2] = layer_value; if (outinfo->writes_viewport_index == true) { - if (ctx->options->chip_class >= GFX9) { + if (ctx->args->options->chip_class >= GFX9) { /* GFX9 has the layer in out.z[10:0] and the viewport * index in out.z[19:16]. */ @@ -2962,7 +2045,7 @@ struct radv_shader_output_values *outputs; unsigned noutput = 0; - if (ctx->options->key.has_multiview_view_index) { + if (ctx->args->options->key.has_multiview_view_index) { LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; if(!*tmp_out) { for(unsigned i = 0; i < 4; ++i) @@ -2970,7 +2053,8 @@ ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); } - LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index), *tmp_out); + LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index); + LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out); ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; } @@ -2978,20 +2062,9 @@ sizeof(outinfo->vs_output_param_offset)); outinfo->pos_exports = 0; - if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { - outinfo->writes_pointsize = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { - outinfo->writes_layer = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { - outinfo->writes_viewport_index = true; - } - - if (ctx->shader_info->info.so.num_outputs && - !ctx->is_gs_copy_shader) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs && + !ctx->args->is_gs_copy_shader) { /* The GS copy shader emission already emits streamout. */ radv_emit_streamout(ctx, 0); } @@ -3008,16 +2081,16 @@ outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { + !ctx->args->is_gs_copy_shader) { outputs[noutput].usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { outputs[noutput].usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } else { - assert(ctx->is_gs_copy_shader); + assert(ctx->args->is_gs_copy_shader); outputs[noutput].usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; } for (unsigned j = 0; j < 4; j++) { @@ -3030,12 +2103,11 @@ /* Export PrimitiveID. */ if (export_prim_id) { - outinfo->export_prim_id = true; - outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ctx->vs_prim_id; + outputs[noutput].values[0] = + ac_get_arg(&ctx->ac, ctx->args->vs_prim_id); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; @@ -3051,26 +2123,14 @@ struct radv_es_output_info *outinfo) { int j; - uint64_t max_output_written = 0; LLVMValueRef lds_base = NULL; - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - int param_index; - - if (!(ctx->output_mask & (1ull << i))) - continue; - - param_index = shader_io_get_unique_index(i); - - max_output_written = MAX2(param_index, max_output_written); - } - - outinfo->esgs_itemsize = (max_output_written + 1) * 16; - if (ctx->ac.chip_class >= GFX9) { unsigned itemsize_dw = outinfo->esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); + LLVMValueRef wave_idx = + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, LLVMConstInt(ctx->ac.i32, @@ -3090,11 +2150,11 @@ if (ctx->stage == MESA_SHADER_VERTEX) { output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } param_index = shader_io_get_unique_index(i); @@ -3124,9 +2184,10 @@ ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, - NULL, ctx->es2gs_offset, + NULL, + ac_get_arg(&ctx->ac, ctx->args->es2gs_offset), (4 * param_index + j) * 4, - ac_glc | ac_slc, true); + ac_glc | ac_slc | ac_swizzled); } } } @@ -3136,239 +2197,749 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) { LLVMValueRef vertex_id = ctx->rel_auto_id; - uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->info.vs.ls_outputs_written); + uint32_t num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false); LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; + + if (!(ctx->output_mask & (1ull << i))) + continue; + + int param = shader_io_get_unique_index(i); + LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, + LLVMConstInt(ctx->ac.i32, param * 4, false), + ""); + for (unsigned j = 0; j < 4; j++) { + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); + ac_lds_store(&ctx->ac, dw_addr, value); + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, ""); + } + } +} + +static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) +{ + return ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); +} + +static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) +{ + return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 28, 4); +} + +static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); + return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); +} + +static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + LLVMConstInt(ctx->ac.i32, 12, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + LLVMConstInt(ctx->ac.i32, 22, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + ctx->ac.i32_0, + LLVMConstInt(ctx->ac.i32, 12, false), + false); +} + +static LLVMValueRef +ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) +{ + unsigned num_outputs = util_bitcount64(ctx->output_mask); + + if (ctx->args->options->key.has_multiview_view_index) + num_outputs++; + + LLVMTypeRef elements[2] = { + LLVMArrayType(ctx->ac.i32, 4 * num_outputs), + LLVMArrayType(ctx->ac.i8, 4), + }; + LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); + type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); + return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); +} + +/** + * Return a pointer to the LDS storage reserved for the N'th vertex, where N + * is in emit order; that is: + * - during the epilogue, N is the threadidx (relative to the entire threadgroup) + * - during vertex emit, i.e. while the API GS shader invocation is running, + * N = threadidx * gs_max_out_vertices + emitidx + * + * Goals of the LDS memory layout: + * 1. Eliminate bank conflicts on write for geometry shaders that have all emits + * in uniform control flow + * 2. Eliminate bank conflicts on read for export if, additionally, there is no + * culling + * 3. Agnostic to the number of waves (since we don't know it before compiling) + * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) + * 5. Avoid wasting memory. + * + * We use an AoS layout due to point 4 (this also helps point 3). In an AoS + * layout, elimination of bank conflicts requires that each vertex occupy an + * odd number of dwords. We use the additional dword to store the output stream + * index as well as a flag to indicate whether this vertex ends a primitive + * for rasterization. + * + * Swizzling is required to satisfy points 1 and 2 simultaneously. + * + * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). + * Indices are swizzled in groups of 32, which ensures point 1 without + * disturbing point 2. + * + * \return an LDS pointer to type {[N x i32], [4 x i8]} + */ +static LLVMValueRef +ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); + + /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ + unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1; + if (write_stride_2exp) { + LLVMValueRef row = + LLVMBuildLShr(builder, vertexidx, + LLVMConstInt(ctx->ac.i32, 5, false), ""); + LLVMValueRef swizzle = + LLVMBuildAnd(builder, row, + LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, + false), ""); + vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); + } + + return ac_build_gep0(&ctx->ac, storage, vertexidx); +} + +static LLVMValueRef +ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, + LLVMValueRef emitidx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + + tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false); + tmp = LLVMBuildMul(builder, tmp, gsthread, ""); + const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); + return ngg_gs_vertex_ptr(ctx, vertexidx); +} + +static LLVMValueRef +ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned out_idx) +{ + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); +} + +static LLVMValueRef +ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned stream) +{ + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); +} + +static struct radv_stream_output * +radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location) +{ + for (unsigned i = 0; i < so->num_outputs; ++i) { + if (so->outputs[i].location == location) + return &so->outputs[i]; + } + + return NULL; +} + +static void build_streamout_vertex(struct radv_shader_context *ctx, + LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw, + unsigned stream, LLVMValueRef offset_vtx, + LLVMValueRef vertexptr) +{ + struct radv_streamout_info *so = &ctx->args->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset[4] = {}; + LLVMValueRef tmp; + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (!wg_offset_dw[buffer]) + continue; + + tmp = LLVMBuildMul(builder, offset_vtx, + LLVMConstInt(ctx->ac.i32, so->strides[buffer], false), ""); + tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); + offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); + } + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS]; + unsigned noutput = 0; + unsigned out_idx = 0; + + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + unsigned output_usage_mask = + ctx->args->shader_info->gs.output_usage_mask[i]; + uint8_t output_stream = + output_stream = ctx->args->shader_info->gs.output_streams[i]; + + if (!(ctx->output_mask & (1ull << i)) || + output_stream != stream) + continue; + + outputs[noutput].slot_name = i; + outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; + outputs[noutput].usage_mask = output_usage_mask; + + int length = util_last_bit(output_usage_mask); + + for (unsigned j = 0; j < length; j++, out_idx++) { + if (!(output_usage_mask & (1 << j))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, out_idx, false)); + outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, ""); + } + + for (unsigned j = length; j < 4; j++) + outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); + + noutput++; + } + + for (unsigned i = 0; i < noutput; i++) { + struct radv_stream_output *output = + radv_get_stream_output_by_loc(so, outputs[i].slot_name); + + if (!output || + output->stream != stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned j = 0; j < 4; j++) { + out.values[j] = outputs[i].values[j]; + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } else { + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->args->shader_info->so.outputs[i]; + + if (stream != output->stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + out.values[comp] = LLVMBuildLoad(builder, tmp, ""); + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } +} + +struct ngg_streamout { + LLVMValueRef num_vertices; + + /* per-thread data */ + LLVMValueRef prim_enable[4]; /* i1 per stream */ + LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ + + /* Output */ + LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ +}; + +/** + * Build streamout logic. + * + * Implies a barrier. + * + * Writes number of emitted primitives to gs_ngg_scratch[4:7]. + * + * Clobbers gs_ngg_scratch[8:]. + */ +static void build_streamout(struct radv_shader_context *ctx, + struct ngg_streamout *nggso) +{ + struct radv_streamout_info *so = &ctx->args->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef cond, tmp, tmp2; + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); + LLVMValueRef so_buffer[4] = {}; + unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + + (nggso->vertices[2] ? 1 : 0); + LLVMValueRef prim_stride_dw[4] = {}; + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); + int stream_for_buffer[4] = { -1, -1, -1, -1 }; + unsigned bufmask_for_stream[4] = {}; + bool isgs = ctx->stage == MESA_SHADER_GEOMETRY; + unsigned scratch_emit_base = isgs ? 4 : 0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; + unsigned scratch_offset_base = isgs ? 8 : 4; + LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + /* Determine the mapping of streamout buffers to vertex streams. */ + for (unsigned i = 0; i < so->num_outputs; ++i) { + unsigned buf = so->outputs[i].buffer; + unsigned stream = so->outputs[i].stream; + assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); + stream_for_buffer[buf] = stream; + bufmask_for_stream[stream] |= 1 << buf; + } + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + assert(so->strides[buffer]); + + LLVMValueRef stride_for_buffer = + LLVMConstInt(ctx->ac.i32, so->strides[buffer], false); + prim_stride_dw[buffer] = + LLVMBuildMul(builder, stride_for_buffer, + nggso->num_vertices, ""); + prim_stride_dw_vgpr = ac_build_writelane( + &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], + LLVMConstInt(ctx->ac.i32, buffer, false)); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false); + so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + offset); + } + + cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, cond, 5200); + { + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + /* Advance the streamout offsets in GDS. */ + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5210); + { + /* Fetch the number of generated primitives and store + * it in GDS for later use. + */ + if (isgs) { + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); + tmp = LLVMBuildLoad(builder, tmp, ""); + } else { + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, + ngg_get_prim_cnt(ctx), ctx->ac.i32_0); + } + LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); + + unsigned swizzle[4]; + int unused_stream = -1; + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) { + unused_stream = stream; + break; + } + } + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + swizzle[buffer] = stream_for_buffer[buffer]; + } else { + assert(unused_stream >= 0); + swizzle[buffer] = unused_stream; + } + } + + tmp = ac_build_quad_swizzle(&ctx->ac, tmp, + swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), + tmp, + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope + ctx->ac.i1false, // isVolatile + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index + ctx->ac.i1true, // wave release + ctx->ac.i1true, // wave done + }; + + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", + ctx->ac.i32, args, ARRAY_SIZE(args), 0); + + /* Keep offsets in a VGPR for quick retrieval via readlane by + * the first wave for bounds checking, and also store in LDS + * for retrieval by all waves later. */ + LLVMBuildStore(builder, tmp, offsets_vgpr); + + tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_offset_basev, ""); + tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); + LLVMBuildStore(builder, tmp, tmp2); + } + ac_build_endif(&ctx->ac, 5210); + + /* Determine the max emit per buffer. This is done via the SALU, in part + * because LLVM can't generate divide-by-multiply if we try to do this + * via VALU with one lane per buffer. + */ + LLVMValueRef max_emit[4] = {}; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + /* Compute the streamout buffer size in DWORD. */ + LLVMValueRef bufsize_dw = + LLVMBuildLShr(builder, + LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), + i32_2, ""); + + /* Load the streamout buffer offset from GDS. */ + tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); + LLVMValueRef offset_dw = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, buffer, false)); + + /* Compute the remaining size to emit. */ + LLVMValueRef remaining_dw = + LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); + tmp = LLVMBuildUDiv(builder, remaining_dw, + prim_stride_dw[buffer], ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, + bufsize_dw, offset_dw, ""); + max_emit[buffer] = LLVMBuildSelect(builder, cond, + ctx->ac.i32_0, tmp, ""); + } + + /* Determine the number of emitted primitives per stream and fixup the + * GDS counter if necessary. + * + * This is complicated by the fact that a single stream can emit to + * multiple buffers (but luckily not vice versa). + */ + LLVMValueRef emit_vgpr = ctx->ac.i32_0; + + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + /* Load the number of generated primitives from GDS and + * determine that number for the given stream. + */ + tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); + LLVMValueRef generated = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, stream, false)); + + + /* Compute the number of emitted primitives. */ + LLVMValueRef emit = generated; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == stream) + emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); + } + + /* Store the number of emitted primitives for that + * stream. + */ + emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, + LLVMConstInt(ctx->ac.i32, stream, false)); - if (!(ctx->output_mask & (1ull << i))) - continue; + /* Fixup the offset using a plain GDS atomic if we overflowed. */ + cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); + ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */ + tmp = LLVMBuildLShr(builder, + LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), + ac_get_thread_id(&ctx->ac), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5222); + { + tmp = LLVMBuildSub(builder, generated, emit, ""); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, + LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5222); + ac_build_endif(&ctx->ac, 5221); + } - int param = shader_io_get_unique_index(i); - LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, - LLVMConstInt(ctx->ac.i32, param * 4, false), - ""); - for (unsigned j = 0; j < 4; j++) { - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - value = ac_to_integer(&ctx->ac, value); - value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); - ac_lds_store(&ctx->ac, dw_addr, value); - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, ""); + /* Store the number of emitted primitives to LDS for later use. */ + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5225); + { + tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_emit_basev, ""); + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); + LLVMBuildStore(builder, emit_vgpr, tmp); + } + ac_build_endif(&ctx->ac, 5225); + } + ac_build_endif(&ctx->ac, 5200); + + /* Determine the workgroup-relative per-thread / primitive offset into + * the streamout buffers */ + struct ac_wg_scan primemit_scan[4] = {}; + + if (isgs) { + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + primemit_scan[stream].enable_exclusive = true; + primemit_scan[stream].op = nir_op_iadd; + primemit_scan[stream].src = nggso->prim_enable[stream]; + primemit_scan[stream].scratch = + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); + primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); + primemit_scan[stream].numwaves = get_tgsize(ctx); + primemit_scan[stream].maxwaves = 8; + ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); } } -} -static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) -{ - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); -} + ac_build_s_barrier(&ctx->ac); -static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) -{ - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4); -} + /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */ + LLVMValueRef wgoffset_dw[4] = {}; -static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); - return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); -} + { + LLVMValueRef scratch_vgpr; -static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) -{ - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 12, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); -} + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); + scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); -static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) -{ - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 22, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); -} + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + wgoffset_dw[buffer] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); + } + } -static LLVMValueRef -ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) -{ - unsigned num_outputs = util_bitcount64(ctx->output_mask); + for (unsigned stream = 0; stream < 4; ++stream) { + if (ctx->args->shader_info->gs.num_stream_output_components[stream]) { + nggso->emit[stream] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); + } + } + } - LLVMTypeRef elements[2] = { - LLVMArrayType(ctx->ac.i32, 4 * num_outputs), - LLVMArrayType(ctx->ac.i8, 4), - }; - LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); - type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); - return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); -} + /* Write out primitive data */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; -/** - * Return a pointer to the LDS storage reserved for the N'th vertex, where N - * is in emit order; that is: - * - during the epilogue, N is the threadidx (relative to the entire threadgroup) - * - during vertex emit, i.e. while the API GS shader invocation is running, - * N = threadidx * gs_max_out_vertices + emitidx - * - * Goals of the LDS memory layout: - * 1. Eliminate bank conflicts on write for geometry shaders that have all emits - * in uniform control flow - * 2. Eliminate bank conflicts on read for export if, additionally, there is no - * culling - * 3. Agnostic to the number of waves (since we don't know it before compiling) - * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) - * 5. Avoid wasting memory. - * - * We use an AoS layout due to point 4 (this also helps point 3). In an AoS - * layout, elimination of bank conflicts requires that each vertex occupy an - * odd number of dwords. We use the additional dword to store the output stream - * index as well as a flag to indicate whether this vertex ends a primitive - * for rasterization. - * - * Swizzling is required to satisfy points 1 and 2 simultaneously. - * - * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). - * Indices are swizzled in groups of 32, which ensures point 1 without - * disturbing point 2. - * - * \return an LDS pointer to type {[N x i32], [4 x i8]} - */ -static LLVMValueRef -ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); + if (isgs) { + ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); + } else { + primemit_scan[stream].result_exclusive = tid; + } - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - unsigned write_stride_2exp = ffs(ctx->gs_max_out_vertices) - 1; - if (write_stride_2exp) { - LLVMValueRef row = - LLVMBuildLShr(builder, vertexidx, - LLVMConstInt(ctx->ac.i32, 5, false), ""); - LLVMValueRef swizzle = - LLVMBuildAnd(builder, row, - LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, - false), ""); - vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); + cond = LLVMBuildICmp(builder, LLVMIntULT, + primemit_scan[stream].result_exclusive, + nggso->emit[stream], ""); + cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], ""); + ac_build_ifcc(&ctx->ac, cond, 5240); + { + LLVMValueRef offset_vtx = + LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, + nggso->num_vertices, ""); + + for (unsigned i = 0; i < max_num_vertices; ++i) { + cond = LLVMBuildICmp(builder, LLVMIntULT, + LLVMConstInt(ctx->ac.i32, i, false), + nggso->num_vertices, ""); + ac_build_ifcc(&ctx->ac, cond, 5241); + build_streamout_vertex(ctx, so_buffer, wgoffset_dw, + stream, offset_vtx, nggso->vertices[i]); + ac_build_endif(&ctx->ac, 5241); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); + } + } + ac_build_endif(&ctx->ac, 5240); } - - return ac_build_gep0(&ctx->ac, storage, vertexidx); } -static LLVMValueRef -ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, - LLVMValueRef emitidx) +static unsigned ngg_nogs_vertex_size(struct radv_shader_context *ctx) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; + unsigned lds_vertex_size = 0; - tmp = LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false); - tmp = LLVMBuildMul(builder, tmp, gsthread, ""); - const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); - return ngg_gs_vertex_ptr(ctx, vertexidx); + if (ctx->args->shader_info->so.num_outputs) + lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1; + + return lds_vertex_size; } -/* Send GS Alloc Req message from the first wave of the group to SPI. - * Message payload is: - * - bits 0..10: vertices in group - * - bits 12..22: primitives in group +/** + * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage + * for the vertex outputs. */ -static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx, - LLVMValueRef vtx_cnt, - LLVMValueRef prim_cnt) +static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, + LLVMValueRef vtxid) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; + /* The extra dword is used to avoid LDS bank conflicts. */ + unsigned vertex_size = ngg_nogs_vertex_size(ctx); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); + LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); + return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); +} - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5020); +static void +handle_ngg_outputs_post_1(struct radv_shader_context *ctx) +{ + struct radv_streamout_info *so = &ctx->args->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vertex_ptr = NULL; + LLVMValueRef tmp, tmp2; - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); + assert((ctx->stage == MESA_SHADER_VERTEX || + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - ac_build_endif(&ctx->ac, 5020); -} + if (!ctx->args->shader_info->so.num_outputs) + return; -struct ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; -}; + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); -static void build_export_prim(struct radv_shader_context *ctx, - const struct ngg_prim *prim) -{ - LLVMBuilderRef builder = ctx->ac.builder; - struct ac_export_args args; - LLVMValueRef tmp; + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->args->shader_info->so.outputs[i]; - tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); - args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); + unsigned loc = output->location; - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildShl(builder, prim->index[i], - LLVMConstInt(ctx->ac.i32, 10 * i, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); - } - - args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); - args.out[1] = LLVMGetUndef(ctx->ac.f32); - args.out[2] = LLVMGetUndef(ctx->ac.f32); - args.out[3] = LLVMGetUndef(ctx->ac.f32); - - args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; - ac_build_export(&ctx->ac, &args); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + tmp2 = LLVMBuildLoad(builder, + ctx->abi.outputs[4 * loc + comp], ""); + tmp2 = ac_to_integer(&ctx->ac, tmp2); + LLVMBuildStore(builder, tmp2, tmp); + } + } } static void -handle_ngg_outputs_post(struct radv_shader_context *ctx) +handle_ngg_outputs_post_2(struct radv_shader_context *ctx) { LLVMBuilderRef builder = ctx->ac.builder; - struct ac_build_if_state if_state; - unsigned num_vertices = 3; LLVMValueRef tmp; assert((ctx->stage == MESA_SHADER_VERTEX || - ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); - LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8); + LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); + LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8); LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, ""); LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); LLVMValueRef vtxindex[] = { - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 16, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[2]), 0, 16), }; - /* TODO: streamout */ + /* Determine the number of vertices per primitive. */ + unsigned num_vertices; + LLVMValueRef num_vertices_val; + + if (ctx->stage == MESA_SHADER_VERTEX) { + LLVMValueRef outprim_val = + LLVMConstInt(ctx->ac.i32, + ctx->args->options->key.vs.outprim, false); + num_vertices_val = LLVMBuildAdd(builder, outprim_val, + ctx->ac.i32_1, ""); + num_vertices = 3; /* TODO: optimize for points & lines */ + } else { + assert(ctx->stage == MESA_SHADER_TESS_EVAL); + + if (ctx->shader->info.tess.point_mode) + num_vertices = 1; + else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES) + num_vertices = 2; + else + num_vertices = 3; + + num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false); + } + + /* Streamout */ + if (ctx->args->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = num_vertices_val; + nggso.prim_enable[0] = is_gs_thread; + + for (unsigned i = 0; i < num_vertices; ++i) + nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + + build_streamout(ctx, &nggso); + } /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. */ if (ctx->stage == MESA_SHADER_VERTEX && - ctx->options->key.vs_common_out.export_prim_id) { - /* TODO: streamout */ + ctx->args->options->key.vs_common_out.export_prim_id) { + if (ctx->args->shader_info->so.num_outputs) + ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. */ @@ -3380,24 +2951,18 @@ LLVMValueRef provoking_vtx_index = LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); - LLVMBuildStore(builder, ctx->abi.gs_prim_id, + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id), ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); ac_build_endif(&ctx->ac, 5400); } /* TODO: primitive culling */ - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); /* TODO: streamout queries */ - /* Export primitive data to the index buffer. Format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) + /* Export primitive data to the index buffer. * * For the first version, we will always build up all three indices * independent of the primitive type. The additional garbage data @@ -3406,37 +2971,43 @@ * TODO: culling depends on the primitive type, so can have some * interaction here. */ - ac_nir_build_if(&if_state, ctx, is_gs_thread); + ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); { - struct ngg_prim prim = {}; - - prim.num_vertices = num_vertices; - prim.isnull = ctx->ac.i1false; - memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + struct ac_ngg_prim prim = {}; - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]); + } else { + prim.num_vertices = num_vertices; + prim.isnull = ctx->ac.i1false; + memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + + for (unsigned i = 0; i < num_vertices; ++i) { + tmp = LLVMBuildLShr(builder, + ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); + prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + } } - build_export_prim(ctx, &prim); + ac_build_export_prim(&ctx->ac, &prim); } - ac_nir_build_endif(&if_state); + ac_build_endif(&ctx->ac, 6001); /* Export per-vertex data (positions and parameters). */ - ac_nir_build_if(&if_state, ctx, is_es_thread); + ac_build_ifcc(&ctx->ac, is_es_thread, 6002); { struct radv_vs_output_info *outinfo = - ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo; + ctx->stage == MESA_SHADER_TESS_EVAL ? + &ctx->args->shader_info->tes.outinfo : &ctx->args->shader_info->vs.outinfo; /* Exporting the primitive ID is handled below. */ /* TODO: use the new VS export path */ handle_vs_outputs_post(ctx, false, - ctx->options->key.vs_common_out.export_clip_dists, + ctx->args->options->key.vs_common_out.export_clip_dists, outinfo); - if (ctx->options->key.vs_common_out.export_prim_id) { + if (ctx->args->options->key.vs_common_out.export_prim_id) { unsigned param_count = outinfo->param_exports; LLVMValueRef values[4]; @@ -3449,7 +3020,7 @@ values[0] = LLVMBuildLoad(builder, tmp, ""); } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); - values[0] = ctx->abi.tes_patch_id; + values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id); } values[0] = ac_to_float(&ctx->ac, values[0]); @@ -3459,11 +3030,10 @@ radv_export_param(ctx, param_count, values, 0x1); outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; - outinfo->export_prim_id = true; outinfo->param_exports = param_count; } } - ac_nir_build_endif(&if_state); + ac_build_endif(&ctx->ac, 6002); } static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx) @@ -3511,7 +3081,7 @@ unsigned num_components; num_components = - ctx->shader_info->info.gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -3522,7 +3092,7 @@ const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, - LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), ""); + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); ac_build_ifcc(&ctx->ac, tmp, 5101); ac_build_break(&ctx->ac); ac_build_endif(&ctx->ac, 5101); @@ -3531,21 +3101,40 @@ LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second entry of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - LLVMBuildStore(builder, i8_0, tmp); + LLVMBuildStore(builder, i8_0, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); ac_build_endloop(&ctx->ac, 5100); } + + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + + num_components = + ctx->args->shader_info->gs.num_stream_output_components[stream]; + if (!num_components) + continue; + + LLVMValueRef numprims = + LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } } static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) { - const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->gs_output_prim); + const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive); LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp, tmp2; @@ -3554,7 +3143,60 @@ const LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - /* TODO: streamout */ + /* Streamout */ + if (ctx->args->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); + + LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); + } + + for (unsigned i = 0; i < verts_per_prim; ++i) { + tmp = LLVMBuildSub(builder, tid, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + tmp = ngg_gs_vertex_ptr(ctx, tmp); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + } + + build_streamout(ctx, &nggso); + } + + /* Write shader query data. */ + tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + /* Use a plain GDS atomic to accumulate the number of generated + * primitives. + */ + ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, + tmp, sync_scope); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); /* TODO: culling */ @@ -3576,13 +3218,8 @@ /* Load primitive liveness */ tmp = ngg_gs_vertex_ptr(ctx, primidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* stream 0 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -3628,7 +3265,8 @@ * there are 4 or more contiguous null primitives in the export * (in the common case of single-dword prim exports). */ - build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + vertlive_scan.result_reduce, num_emit_threads); /* Setup the reverse vertex compaction permutation. We re-use stream 1 * of the primitive liveness flags, relying on the fact that each @@ -3636,14 +3274,9 @@ ac_build_ifcc(&ctx->ac, vertlive, 5130); { tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, tmp); + LLVMBuildStore(builder, tmp2, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); } ac_build_endif(&ctx->ac, 5130); @@ -3653,19 +3286,14 @@ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5140); { - struct ngg_prim prim = {}; + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; prim.num_vertices = verts_per_prim; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* primflag */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, - LLVMConstInt(ctx->ac.i8, 0, false), ""); + flags = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, @@ -3673,7 +3301,25 @@ prim.edgeflag[i] = ctx->ac.i1false; } - build_export_prim(ctx, &prim); + /* Geometry shaders output triangle strips, but NGG expects + * triangles. We need to change the vertex order for odd + * triangles to get correct front/back facing by swapping 2 + * vertex indices, but we also have to keep the provoking + * vertex in the same place. + */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + + struct ac_ngg_prim in = prim; + prim.index[0] = in.index[0]; + prim.index[1] = LLVMBuildSelect(builder, is_odd, + in.index[2], in.index[1], ""); + prim.index[2] = LLVMBuildSelect(builder, is_odd, + in.index[1], in.index[2], ""); + } + + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 5140); @@ -3681,8 +3327,8 @@ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); ac_build_ifcc(&ctx->ac, tmp, 5145); { - struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo; - bool export_view_index = ctx->options->key.has_multiview_view_index; + struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo; + bool export_view_index = ctx->args->options->key.has_multiview_view_index; struct radv_shader_output_values *outputs; unsigned noutput = 0; @@ -3695,43 +3341,29 @@ outinfo->pos_exports = 0; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1: source data index */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); - if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { - outinfo->writes_pointsize = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { - outinfo->writes_layer = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { - outinfo->writes_viewport_index = true; - } - unsigned out_idx = 0; - gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + unsigned output_usage_mask = + ctx->args->shader_info->gs.output_usage_mask[i]; + int length = util_last_bit(output_usage_mask); + if (!(ctx->output_mask & (1ull << i))) continue; outputs[noutput].slot_name = i; outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; - - outputs[noutput].usage_mask = ctx->shader_info->info.gs.output_usage_mask[i]; - int length = util_last_bit(outputs[noutput].usage_mask); + outputs[noutput].usage_mask = output_usage_mask; for (unsigned j = 0; j < length; j++, out_idx++) { - gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + if (!(output_usage_mask & (1 << j))) + continue; + + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); tmp = LLVMBuildLoad(builder, tmp, ""); LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); @@ -3751,19 +3383,18 @@ /* Export ViewIndex. */ if (export_view_index) { - outinfo->writes_layer = true; - outputs[noutput].slot_name = VARYING_SLOT_LAYER; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index); + outputs[noutput].values[0] = + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index)); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; } radv_llvm_export_vs(ctx, outputs, noutput, outinfo, - ctx->options->key.vs_common_out.export_clip_dists); + ctx->args->options->key.vs_common_out.export_clip_dists); FREE(outputs); } ac_build_endif(&ctx->ac, 5145); @@ -3784,8 +3415,8 @@ */ const LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); + ac_build_ifcc(&ctx->ac, can_emit, 9001); tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); @@ -3796,9 +3427,9 @@ unsigned out_idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->info.gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -3812,61 +3443,69 @@ LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first entry of struct */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - out_val = ac_to_integer(&ctx->ac, out_val); out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); - LLVMBuildStore(builder, out_val, ptr); + LLVMBuildStore(builder, out_val, + ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } - assert(out_idx * 4 <= ctx->gsvs_vertex_size); + assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); - tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->gs_output_prim) - 1, false); + tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false); const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && + si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - const LLVMValueRef primflagptr = - LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildOr(builder, tmp, + LLVMBuildShl(builder, + LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), + ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); + + ac_build_endif(&ctx->ac, 9001); } static void write_tess_factors(struct radv_shader_context *ctx) { unsigned stride, outer_comps, inner_comps; - struct ac_build_if_state if_ctx, inner_if_ctx; - LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); - LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + LLVMValueRef tcs_rel_ids = ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids); + LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 8, 5); + LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 0, 8); unsigned tess_inner_index = 0, tess_outer_index; LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer; LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; int i; ac_emit_barrier(&ctx->ac, ctx->stage); - switch (ctx->options->key.tcs.primitive_mode) { + switch (ctx->args->options->key.tcs.primitive_mode) { case GL_ISOLINES: stride = 2; outer_comps = 2; @@ -3886,9 +3525,9 @@ return; } - ac_nir_build_if(&if_ctx, ctx, + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - invocation_id, ctx->ac.i32_0, "")); + invocation_id, ctx->ac.i32_0, ""), 6503); lds_base = get_tcs_out_current_patch_data_offset(ctx); @@ -3908,7 +3547,7 @@ } // LINES reversal - if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) { + if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) { outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer); lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer, ctx->ac.i32_1, ""); @@ -3937,37 +3576,37 @@ buffer = ctx->hs_ring_tess_factor; - tf_base = ctx->tess_factor_offset; + tf_base = ac_get_arg(&ctx->ac, ctx->args->tess_factor_offset); byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, false), ""); unsigned tf_offset = 0; - if (ctx->options->chip_class <= GFX8) { - ac_nir_build_if(&inner_if_ctx, ctx, + if (ctx->ac.chip_class <= GFX8) { + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - rel_patch_id, ctx->ac.i32_0, "")); + rel_patch_id, ctx->ac.i32_0, ""), 6504); /* Store the dynamic HS control word. */ ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, false), 1, ctx->ac.i32_0, tf_base, - 0, ac_glc, false); + 0, ac_glc); tf_offset += 4; - ac_nir_build_endif(&inner_if_ctx); + ac_build_endif(&ctx->ac, 6504); } /* Store the tessellation factors. */ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - tf_offset, ac_glc, false); + tf_offset, ac_glc); if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 16 + tf_offset, ac_glc, false); + 16 + tf_offset, ac_glc); //store to offchip for TES to read - only if TES reads them - if (ctx->options->key.tcs.tes_reads_tess_factors) { + if (ctx->args->options->key.tcs.tes_reads_tess_factors) { LLVMValueRef inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; @@ -3981,7 +3620,8 @@ ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec, outer_comps, tf_outer_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); if (inner_comps) { param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER); tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL, @@ -3991,10 +3631,12 @@ ac_build_gather_values(&ctx->ac, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec, inner_comps, tf_inner_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); } } - ac_nir_build_endif(&if_ctx); + + ac_build_endif(&ctx->ac, 6503); } static void @@ -4057,15 +3699,15 @@ } /* Process depth, stencil, samplemask. */ - if (ctx->shader_info->info.ps.writes_z) { + if (ctx->args->shader_info->ps.writes_z) { depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); } - if (ctx->shader_info->info.ps.writes_stencil) { + if (ctx->args->shader_info->ps.writes_stencil) { stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); } - if (ctx->shader_info->info.ps.writes_sample_mask) { + if (ctx->args->shader_info->ps.writes_sample_mask) { samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } @@ -4074,9 +3716,9 @@ * exported. */ if (index > 0 && - !ctx->shader_info->info.ps.writes_z && - !ctx->shader_info->info.ps.writes_stencil && - !ctx->shader_info->info.ps.writes_sample_mask) { + !ctx->args->shader_info->ps.writes_z && + !ctx->args->shader_info->ps.writes_stencil && + !ctx->args->shader_info->ps.writes_sample_mask) { unsigned last = index - 1; color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ @@ -4096,7 +3738,7 @@ static void emit_gs_epilogue(struct radv_shader_context *ctx) { - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_1(ctx); return; } @@ -4115,16 +3757,16 @@ switch (ctx->stage) { case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls) + if (ctx->args->options->key.vs_common_out.as_ls) handle_ls_outputs_post(ctx); - else if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + else if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->vs.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) + handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->vs.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->vs.outinfo); break; case MESA_SHADER_FRAGMENT: handle_fs_outputs_post(ctx); @@ -4136,14 +3778,14 @@ handle_tcs_outputs_post(ctx); break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->tes.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) + handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->tes.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->tes.outinfo); break; default: break; @@ -4172,15 +3814,15 @@ case MESA_SHADER_GEOMETRY: return; case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls || - ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_ls || + ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->vs.outinfo; + outinfo = &ctx->args->shader_info->vs.outinfo; break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->tes.outinfo; + outinfo = &ctx->args->shader_info->tes.outinfo; break; default: unreachable("Unhandled shader type"); @@ -4189,16 +3831,16 @@ ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset, - VARYING_SLOT_MAX, + VARYING_SLOT_MAX, 0, &outinfo->param_exports); } static void ac_setup_rings(struct radv_shader_context *ctx) { - if (ctx->options->chip_class <= GFX8 && + if (ctx->args->options->chip_class <= GFX8 && (ctx->stage == MESA_SHADER_GEOMETRY || - ctx->options->key.vs_common_out.as_es || ctx->options->key.vs_common_out.as_es)) { + ctx->args->options->key.vs_common_out.as_es || ctx->args->options->key.vs_common_out.as_es)) { unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS; LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false); @@ -4208,7 +3850,7 @@ offset); } - if (ctx->is_gs_copy_shader) { + if (ctx->args->is_gs_copy_shader) { ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, @@ -4239,12 +3881,12 @@ LLVMValueRef ring, tmp; num_components = - ctx->shader_info->info.gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; - stride = 4 * num_components * ctx->gs_max_out_vertices; + stride = 4 * num_components * ctx->shader->info.gs.vertices_out; /* Limit on the stride field for <= GFX7. */ assert(stride < (1 << 14)); @@ -4294,28 +3936,49 @@ const struct nir_shader *nir) { const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1}; - return radv_get_max_workgroup_size(chip_class, stage, nir ? nir->info.cs.local_size : backup_sizes); + unsigned sizes[3]; + for (unsigned i = 0; i < 3; i++) + sizes[i] = nir ? nir->info.cs.local_size[i] : backup_sizes[i]; + return radv_get_max_workgroup_size(chip_class, stage, sizes); } /* Fixup the HW not emitting the TCS regs if there are no HS threads. */ static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx) { - LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); + LLVMValueRef count = + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, ""); - ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, ""); - ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, ""); - ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, ""); -} - -static void prepare_gs_input_vgprs(struct radv_shader_context *ctx) -{ - for(int i = 5; i >= 0; --i) { - ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1], - (i & 1) * 16, 16); + ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->rel_auto_id), + ctx->abi.instance_id, ""); + ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids), + ctx->rel_auto_id, + ""); + ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id), + ctx->abi.vertex_id, ""); +} + +static void prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged) +{ + if (merged) { + for(int i = 5; i >= 0; --i) { + ctx->gs_vtx_offset[i] = + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i & ~1]), + (i & 1) * 16, 16); + } + + ctx->gs_wave_id = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), + 16, 8); + } else { + for (int i = 0; i < 6; i++) + ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i]); + ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->gs_wave_id); } - - ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8); } /* Ensure that the esgs ring is declared. @@ -4342,50 +4005,39 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, int shader_count, - struct radv_shader_variant_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - unsigned i; - ctx.options = options; - ctx.shader_info = shader_info; - - enum ac_float_mode float_mode = - options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : - AC_FLOAT_MODE_DEFAULT; - - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, options->wave_size, - options->wave_size); - ctx.context = ctx.ac.context; + ctx.args = args; - radv_nir_shader_info_init(&shader_info->info); + enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT; - for(int i = 0; i < shader_count; ++i) - radv_nir_shader_info_pass(shaders[i], options, &shader_info->info); + if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { + float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO; + } - for (i = 0; i < MAX_SETS; i++) - shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; - for (i = 0; i < AC_UD_MAX_UD; i++) - shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, float_mode, + args->shader_info->wave_size, + args->shader_info->ballot_bit_size); + ctx.context = ctx.ac.context; ctx.max_workgroup_size = 0; for (int i = 0; i < shader_count; ++i) { ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size, - radv_nir_get_max_workgroup_size(ctx.options->chip_class, + radv_nir_get_max_workgroup_size(args->options->chip_class, shaders[i]->info.stage, shaders[i])); } if (ctx.ac.chip_class >= GFX10) { if (is_pre_gs_stage(shaders[0]->info.stage) && - options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { ctx.max_workgroup_size = 128; } } - create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2, - shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX); + create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2); ctx.abi.inputs = &ctx.inputs[0]; ctx.abi.emit_outputs = handle_shader_outputs_post; @@ -4395,26 +4047,49 @@ ctx.abi.load_sampler_desc = radv_get_sampler_desc; ctx.abi.load_resource = radv_load_resource; ctx.abi.clamp_shadow_reference = false; - ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800; - ctx.abi.robust_buffer_access = options->robust_buffer_access; - - /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8, - * we fallback to the old intrinsics for atomic buffer image operations - * and thus we need to apply the indexing workaround... - */ - ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900; + ctx.abi.robust_buffer_access = args->options->robust_buffer_access; - bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs_common_out.as_ngg; + bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg; if (shader_count >= 2 || is_ngg) ac_init_exec_full_mask(&ctx.ac); - if ((ctx.ac.family == CHIP_VEGA10 || - ctx.ac.family == CHIP_RAVEN) && + if (args->ac.vertex_id.used) + ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id); + if (args->rel_auto_id.used) + ctx.rel_auto_id = ac_get_arg(&ctx.ac, args->rel_auto_id); + if (args->ac.instance_id.used) + ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id); + + if (args->options->has_ls_vgpr_init_bug && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); + if (is_ngg) { + /* Declare scratch space base for streamout and vertex + * compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + if (!args->options->key.vs_common_out.as_ngg_passthrough) + declare_esgs_ring(&ctx); + + /* This is really only needed when streamout and / or vertex + * compaction is enabled. + */ + if (args->shader_info->so.num_outputs) { + LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + } + } + for(int i = 0; i < shader_count; ++i) { ctx.stage = shaders[i]->info.stage; + ctx.shader = shaders[i]; ctx.output_mask = 0; if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { @@ -4422,7 +4097,7 @@ ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - if (ctx.options->key.vs_common_out.as_ngg) { + if (args->options->key.vs_common_out.as_ngg) { for (unsigned i = 0; i < 4; ++i) { ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); @@ -4430,56 +4105,50 @@ ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - /* TODO: streamout */ + unsigned scratch_size = 8; + if (args->shader_info->so.num_outputs) + scratch_size = 44; - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size); ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); LLVMSetAlignment(ctx.gs_ngg_scratch, 4); - ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0, - LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS), - "ngg_emit"); + ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module, + LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx.gs_ngg_emit, 4); } - ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out; - ctx.gs_output_prim = shaders[i]->info.gs.output_primitive; ctx.abi.load_inputs = load_gs_input; ctx.abi.emit_primitive = visit_end_primitive; } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { - ctx.tcs_outputs_read = shaders[i]->info.outputs_read; - ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read; ctx.abi.load_tess_varyings = load_tcs_varyings; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; - ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out; if (shader_count == 1) - ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs; + ctx.tcs_num_inputs = args->options->key.tcs.num_inputs; else - ctx.tcs_num_inputs = util_last_bit64(shader_info->info.vs.ls_outputs_written); + ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written); ctx.tcs_num_patches = get_tcs_num_patches(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { - ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode; ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; - ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out; - ctx.tcs_num_patches = ctx.options->key.tes.num_patches; + ctx.tcs_num_patches = args->options->key.tes.num_patches; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { - shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard; - ctx.abi.lookup_interp_param = lookup_interp_param; ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; ctx.abi.emit_kill = radv_emit_kill; } if (shaders[i]->info.stage == MESA_SHADER_VERTEX && - ctx.options->key.vs_common_out.as_ngg && - ctx.options->key.vs_common_out.export_prim_id) { + args->options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.export_prim_id) { declare_esgs_ring(&ctx); } @@ -4487,7 +4156,7 @@ if (i) { if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_prologue(&ctx); nested_barrier = false; } else { @@ -4519,14 +4188,6 @@ nir_foreach_variable(variable, &shaders[i]->outputs) scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage); - if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { - unsigned addclip = shaders[i]->info.clip_distance_array_size + - shaders[i]->info.cull_distance_array_size > 4; - ctx.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16; - ctx.max_gsvs_emit_size = ctx.gsvs_vertex_size * - shaders[i]->info.gs.vertices_out; - } - ac_setup_rings(&ctx); LLVMBasicBlockRef merge_block; @@ -4535,7 +4196,10 @@ LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); - LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8); + LLVMValueRef count = + ac_unpack_param(&ctx.ac, + ac_get_arg(&ctx.ac, args->merged_wave_info), + 8 * i, 8); LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac); LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, ""); @@ -4548,10 +4212,10 @@ prepare_interp_optimize(&ctx, shaders[i]); else if(shaders[i]->info.stage == MESA_SHADER_VERTEX) handle_vs_inputs(&ctx, shaders[i]); - else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY) - prepare_gs_input_vgprs(&ctx); + else if(shaders[i]->info.stage == MESA_SHADER_GEOMETRY) + prepare_gs_input_vgprs(&ctx, shader_count >= 2); - ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]); + ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[i]); if (shader_count >= 2 || is_ngg) { LLVMBuildBr(ctx.ac.builder, merge_block); @@ -4561,40 +4225,37 @@ /* This needs to be outside the if wrapping the shader body, as sometimes * the HW generates waves with 0 es/vs threads. */ if (is_pre_gs_stage(shaders[i]->info.stage) && - ctx.options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.as_ngg && i == shader_count - 1) { - handle_ngg_outputs_post(&ctx); + handle_ngg_outputs_post_2(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_2(&ctx); } - if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { - shader_info->gs.gsvs_vertex_size = ctx.gsvs_vertex_size; - shader_info->gs.max_gsvs_emit_size = ctx.max_gsvs_emit_size; - } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { - shader_info->tcs.num_patches = ctx.tcs_num_patches; - shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); + if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { + args->shader_info->tcs.num_patches = ctx.tcs_num_patches; + args->shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); } } LLVMBuildRetVoid(ctx.ac.builder); - if (options->dump_preoptir) { + if (args->options->dump_preoptir) { fprintf(stderr, "%s LLVM IR:\n\n", - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage)); ac_dump_module(ctx.ac.module); fprintf(stderr, "\n"); } - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); if (shader_count == 1) ac_nir_eliminate_const_vs_outputs(&ctx); - if (options->dump_shader) { - ctx.shader_info->private_mem_vgprs = + if (args->options->dump_shader) { + args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function); } @@ -4638,7 +4299,6 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, gl_shader_stage stage, const char *name, const struct radv_nir_compiler_options *options) @@ -4653,7 +4313,7 @@ fprintf(stderr, "\n"); } - if (options->record_llvm_ir) { + if (options->record_ir) { char *llvm_ir = LLVMPrintModuleToString(llvm_module); llvm_ir_string = strdup(llvm_ir); LLVMDisposeMessage(llvm_ir); @@ -4686,92 +4346,49 @@ free(elf_buffer); } -static void -ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_shader *nir, const struct radv_nir_compiler_options *options) -{ - switch (nir->info.stage) { - case MESA_SHADER_COMPUTE: - for (int i = 0; i < 3; ++i) - shader_info->cs.block_size[i] = nir->info.cs.local_size[i]; - break; - case MESA_SHADER_FRAGMENT: - shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests; - shader_info->fs.post_depth_coverage = nir->info.fs.post_depth_coverage; - break; - case MESA_SHADER_GEOMETRY: - shader_info->gs.vertices_in = nir->info.gs.vertices_in; - shader_info->gs.vertices_out = nir->info.gs.vertices_out; - shader_info->gs.output_prim = nir->info.gs.output_primitive; - shader_info->gs.invocations = nir->info.gs.invocations; - break; - case MESA_SHADER_TESS_EVAL: - shader_info->tes.primitive_mode = nir->info.tess.primitive_mode; - shader_info->tes.spacing = nir->info.tess.spacing; - shader_info->tes.ccw = nir->info.tess.ccw; - shader_info->tes.point_mode = nir->info.tess.point_mode; - shader_info->tes.as_es = options->key.vs_common_out.as_es; - shader_info->tes.export_prim_id = options->key.vs_common_out.export_prim_id; - shader_info->is_ngg = options->key.vs_common_out.as_ngg; - break; - case MESA_SHADER_TESS_CTRL: - shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out; - break; - case MESA_SHADER_VERTEX: - shader_info->vs.as_es = options->key.vs_common_out.as_es; - shader_info->vs.as_ls = options->key.vs_common_out.as_ls; - shader_info->vs.export_prim_id = options->key.vs_common_out.export_prim_id; - shader_info->is_ngg = options->key.vs_common_out.as_ngg; - break; - default: - break; - } -} - void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, + const struct radv_shader_args *args, struct nir_shader *const *nir, - int nir_count, - const struct radv_nir_compiler_options *options) + int nir_count) { LLVMModuleRef llvm_module; - llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info, - options); + llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args); - ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, shader_info, + ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage, - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage), - options); - - for (int i = 0; i < nir_count; ++i) - ac_fill_shader_info(shader_info, nir[i], options); + args->options); /* Determine the ES type (VS or TES) for the GS on GFX9. */ - if (options->chip_class >= GFX9) { + if (args->options->chip_class >= GFX9) { if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) { - shader_info->gs.es_type = nir[0]->info.stage; + args->shader_info->gs.es_type = nir[0]->info.stage; } } - shader_info->info.wave_size = options->wave_size; } static void ac_gs_copy_shader_emit(struct radv_shader_context *ctx) { LLVMValueRef vtx_offset = - LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id, + LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id), LLVMConstInt(ctx->ac.i32, 4, false), ""); LLVMValueRef stream_id; /* Fetch the vertex stream ID. */ - if (ctx->shader_info->info.so.num_outputs) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) { stream_id = - ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2); + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, + ctx->args->streamout_config), + 24, 2); } else { stream_id = ctx->ac.i32_0; } @@ -4785,14 +4402,14 @@ for (unsigned stream = 0; stream < 4; stream++) { unsigned num_components = - ctx->shader_info->info.gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; LLVMBasicBlockRef bb; unsigned offset; if (stream > 0 && !num_components) continue; - if (stream > 0 && !ctx->shader_info->info.so.num_outputs) + if (stream > 0 && !ctx->args->shader_info->so.num_outputs) continue; bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); @@ -4802,9 +4419,9 @@ offset = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; unsigned output_stream = - ctx->shader_info->info.gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i)) || @@ -4819,7 +4436,7 @@ soffset = LLVMConstInt(ctx->ac.i32, offset * - ctx->gs_max_out_vertices * 16 * 4, false); + ctx->shader->info.gs.vertices_out * 16 * 4, false); offset++; @@ -4840,12 +4457,13 @@ } } - if (ctx->shader_info->info.so.num_outputs) + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { handle_vs_outputs_post(ctx, false, true, - &ctx->shader_info->vs.outinfo); + &ctx->args->shader_info->vs.outinfo); } LLVMBuildBr(ctx->ac.builder, end_bb); @@ -4858,29 +4476,22 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - ctx.options = options; - ctx.shader_info = shader_info; + ctx.args = args; - enum ac_float_mode float_mode = - options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : - AC_FLOAT_MODE_DEFAULT; + assert(args->is_gs_copy_shader); - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, 64, 64); + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, AC_FLOAT_MODE_DEFAULT, 64, 64); ctx.context = ctx.ac.context; - ctx.is_gs_copy_shader = true; ctx.stage = MESA_SHADER_VERTEX; + ctx.shader = geom_shader; - radv_nir_shader_info_pass(geom_shader, options, &shader_info->info); - - create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX); + create_function(&ctx, MESA_SHADER_VERTEX, false); - ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out; ac_setup_rings(&ctx); nir_foreach_variable(variable, &geom_shader->outputs) { @@ -4893,10 +4504,10 @@ LLVMBuildRetVoid(ctx.ac.builder); - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); - ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, shader_info, - MESA_SHADER_VERTEX, "GS Copy Shader", options); + ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, + MESA_SHADER_VERTEX, "GS Copy Shader", args->options); (*rbinary)->is_gs_copy_shader = true; } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_pass.c mesa-20.0.8/src/amd/vulkan/radv_pass.c --- mesa-19.2.8/src/amd/vulkan/radv_pass.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_pass.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ static void radv_render_pass_add_subpass_dep(struct radv_render_pass *pass, - const VkSubpassDependency2KHR *dep) + const VkSubpassDependency2 *dep) { uint32_t src = dep->srcSubpass; uint32_t dst = dep->dstSubpass; @@ -59,6 +59,126 @@ } } +static bool +radv_pass_has_layout_transitions(const struct radv_render_pass *pass) +{ + for (unsigned i = 0; i < pass->subpass_count; i++) { + const struct radv_subpass *subpass = &pass->subpasses[i]; + for (unsigned j = 0; j < subpass->attachment_count; j++) { + const uint32_t a = subpass->attachments[j].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + + uint32_t initial_layout = pass->attachments[a].initial_layout; + uint32_t stencil_initial_layout = pass->attachments[a].stencil_initial_layout; + uint32_t final_layout = pass->attachments[a].final_layout; + uint32_t stencil_final_layout = pass->attachments[a].stencil_final_layout; + + if (subpass->attachments[j].layout != initial_layout || + subpass->attachments[j].layout != stencil_initial_layout || + subpass->attachments[j].layout != final_layout || + subpass->attachments[j].layout != stencil_final_layout) + return true; + } + } + + return false; +} + +static void +radv_render_pass_add_implicit_deps(struct radv_render_pass *pass, + bool has_ingoing_dep, bool has_outgoing_dep) +{ + /* From the Vulkan 1.0.39 spec: + * + * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the + * first subpass that uses an attachment, then an implicit subpass + * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is + * used in. The implicit subpass dependency only exists if there + * exists an automatic layout transition away from initialLayout. + * The subpass dependency operates as if defined with the + * following parameters: + * + * VkSubpassDependency implicitDependency = { + * .srcSubpass = VK_SUBPASS_EXTERNAL; + * .dstSubpass = firstSubpass; // First subpass attachment is used in + * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + * .srcAccessMask = 0; + * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + * .dependencyFlags = 0; + * }; + * + * Similarly, if there is no subpass dependency from the last subpass + * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit + * subpass dependency exists from the last subpass it is used in to + * VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists + * if there exists an automatic layout transition into finalLayout. + * The subpass dependency operates as if defined with the following + * parameters: + * + * VkSubpassDependency implicitDependency = { + * .srcSubpass = lastSubpass; // Last subpass attachment is used in + * .dstSubpass = VK_SUBPASS_EXTERNAL; + * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + * .dstAccessMask = 0; + * .dependencyFlags = 0; + * }; + */ + + /* Implicit subpass dependencies only make sense if automatic layout + * transitions are performed. + */ + if (!radv_pass_has_layout_transitions(pass)) + return; + + if (!has_ingoing_dep) { + const VkSubpassDependency2KHR implicit_ingoing_dep = { + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dependencyFlags = 0, + }; + + radv_render_pass_add_subpass_dep(pass, &implicit_ingoing_dep); + } + + if (!has_outgoing_dep) { + const VkSubpassDependency2KHR implicit_outgoing_dep = { + .srcSubpass = 0, + .dstSubpass = VK_SUBPASS_EXTERNAL, + .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dstAccessMask = 0, + .dependencyFlags = 0, + }; + + radv_render_pass_add_subpass_dep(pass, &implicit_outgoing_dep); + } +} + static void radv_render_pass_compile(struct radv_render_pass *pass) { @@ -134,6 +254,8 @@ subpass->max_sample_count = MAX2(color_sample_count, depth_sample_count); + subpass->color_sample_count = color_sample_count; + subpass->depth_sample_count = depth_sample_count; /* We have to handle resolve attachments specially */ subpass->has_color_resolve = false; @@ -226,6 +348,8 @@ att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; att->final_layout = pCreateInfo->pAttachments[i].finalLayout; + att->stencil_initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->stencil_final_layout = pCreateInfo->pAttachments[i].finalLayout; // att->store_op = pCreateInfo->pAttachments[i].storeOp; // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; } @@ -269,6 +393,7 @@ subpass->input_attachments[j] = (struct radv_subpass_attachment) { .attachment = desc->pInputAttachments[j].attachment, .layout = desc->pInputAttachments[j].layout, + .stencil_layout = desc->pInputAttachments[j].layout, }; } } @@ -293,6 +418,7 @@ subpass->resolve_attachments[j] = (struct radv_subpass_attachment) { .attachment = desc->pResolveAttachments[j].attachment, .layout = desc->pResolveAttachments[j].layout, + .stencil_layout = desc->pResolveAttachments[j].layout, }; } } @@ -303,13 +429,17 @@ *subpass->depth_stencil_attachment = (struct radv_subpass_attachment) { .attachment = desc->pDepthStencilAttachment->attachment, .layout = desc->pDepthStencilAttachment->layout, + .stencil_layout = desc->pDepthStencilAttachment->layout, }; } } + bool has_ingoing_dep = false; + bool has_outgoing_dep = false; + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { - /* Convert to a Dependency2KHR */ - struct VkSubpassDependency2KHR dep2 = { + /* Convert to a Dependency2 */ + struct VkSubpassDependency2 dep2 = { .srcSubpass = pCreateInfo->pDependencies[i].srcSubpass, .dstSubpass = pCreateInfo->pDependencies[i].dstSubpass, .srcStageMask = pCreateInfo->pDependencies[i].srcStageMask, @@ -319,8 +449,19 @@ .dependencyFlags = pCreateInfo->pDependencies[i].dependencyFlags, }; radv_render_pass_add_subpass_dep(pass, &dep2); + + /* Determine if the subpass has explicit dependencies from/to + * VK_SUBPASS_EXTERNAL. + */ + if (pCreateInfo->pDependencies[i].srcSubpass == VK_SUBPASS_EXTERNAL) + has_ingoing_dep = true; + if (pCreateInfo->pDependencies[i].dstSubpass == VK_SUBPASS_EXTERNAL) + has_outgoing_dep = true; } + radv_render_pass_add_implicit_deps(pass, + has_ingoing_dep, has_outgoing_dep); + radv_render_pass_compile(pass); *pRenderPass = radv_render_pass_to_handle(pass); @@ -329,11 +470,11 @@ } static unsigned -radv_num_subpass_attachments2(const VkSubpassDescription2KHR *desc) +radv_num_subpass_attachments2(const VkSubpassDescription2 *desc) { - const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve = + const VkSubpassDescriptionDepthStencilResolve *ds_resolve = vk_find_struct_const(desc->pNext, - SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR); + SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); return desc->inputAttachmentCount + desc->colorAttachmentCount + @@ -342,9 +483,9 @@ (ds_resolve && ds_resolve->pDepthStencilResolveAttachment); } -VkResult radv_CreateRenderPass2KHR( +VkResult radv_CreateRenderPass2( VkDevice _device, - const VkRenderPassCreateInfo2KHR* pCreateInfo, + const VkRenderPassCreateInfo2* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass) { @@ -353,7 +494,7 @@ size_t size; size_t attachments_offset; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2); size = sizeof(*pass); size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); @@ -372,6 +513,9 @@ for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { struct radv_render_pass_attachment *att = &pass->attachments[i]; + const VkAttachmentDescriptionStencilLayoutKHR *stencil_layout = + vk_find_struct_const(pCreateInfo->pAttachments[i].pNext, + ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR); att->format = pCreateInfo->pAttachments[i].format; att->samples = pCreateInfo->pAttachments[i].samples; @@ -379,6 +523,12 @@ att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; att->final_layout = pCreateInfo->pAttachments[i].finalLayout; + att->stencil_initial_layout = (stencil_layout ? + stencil_layout->stencilInitialLayout : + pCreateInfo->pAttachments[i].initialLayout); + att->stencil_final_layout = (stencil_layout ? + stencil_layout->stencilFinalLayout : + pCreateInfo->pAttachments[i].finalLayout); // att->store_op = pCreateInfo->pAttachments[i].storeOp; // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; } @@ -403,7 +553,7 @@ p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i]; + const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; struct radv_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; @@ -417,9 +567,16 @@ p += desc->inputAttachmentCount; for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { + const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment = + vk_find_struct_const(desc->pInputAttachments[j].pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + subpass->input_attachments[j] = (struct radv_subpass_attachment) { .attachment = desc->pInputAttachments[j].attachment, .layout = desc->pInputAttachments[j].layout, + .stencil_layout = (stencil_attachment ? + stencil_attachment->stencilLayout : + desc->pInputAttachments[j].layout), }; } } @@ -451,22 +608,36 @@ if (desc->pDepthStencilAttachment) { subpass->depth_stencil_attachment = p++; + const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment = + vk_find_struct_const(desc->pDepthStencilAttachment->pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + *subpass->depth_stencil_attachment = (struct radv_subpass_attachment) { .attachment = desc->pDepthStencilAttachment->attachment, .layout = desc->pDepthStencilAttachment->layout, + .stencil_layout = (stencil_attachment ? + stencil_attachment->stencilLayout : + desc->pDepthStencilAttachment->layout), }; } - const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve = + const VkSubpassDescriptionDepthStencilResolve *ds_resolve = vk_find_struct_const(desc->pNext, - SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR); + SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) { subpass->ds_resolve_attachment = p++; + const VkAttachmentReferenceStencilLayoutKHR *stencil_resolve_attachment = + vk_find_struct_const(ds_resolve->pDepthStencilResolveAttachment->pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + *subpass->ds_resolve_attachment = (struct radv_subpass_attachment) { .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment, .layout = ds_resolve->pDepthStencilResolveAttachment->layout, + .stencil_layout = (stencil_resolve_attachment ? + stencil_resolve_attachment->stencilLayout : + ds_resolve->pDepthStencilResolveAttachment->layout), }; subpass->depth_resolve_mode = ds_resolve->depthResolveMode; @@ -474,11 +645,25 @@ } } + bool has_ingoing_dep = false; + bool has_outgoing_dep = false; + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { radv_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]); + + /* Determine if the subpass has explicit dependencies from/to + * VK_SUBPASS_EXTERNAL. + */ + if (pCreateInfo->pDependencies[i].srcSubpass == VK_SUBPASS_EXTERNAL) + has_ingoing_dep = true; + if (pCreateInfo->pDependencies[i].dstSubpass == VK_SUBPASS_EXTERNAL) + has_outgoing_dep = true; } + radv_render_pass_add_implicit_deps(pass, + has_ingoing_dep, has_outgoing_dep); + radv_render_pass_compile(pass); *pRenderPass = radv_render_pass_to_handle(pass); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_pipeline.c mesa-20.0.8/src/amd/vulkan/radv_pipeline.c --- mesa-19.2.8/src/amd/vulkan/radv_pipeline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_pipeline.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,6 +25,7 @@ * IN THE SOFTWARE. */ +#include "util/disk_cache.h" #include "util/mesa-sha1.h" #include "util/u_atomic.h" #include "radv_debug.h" @@ -90,22 +91,49 @@ uint32_t tf_param; }; -struct radv_gs_state { - uint32_t vgt_gs_onchip_cntl; - uint32_t vgt_gs_max_prims_per_subgroup; - uint32_t vgt_esgs_ring_itemsize; - uint32_t lds_size; -}; +static const VkPipelineMultisampleStateCreateInfo * +radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable) + return pCreateInfo->pMultisampleState; + return NULL; +} -struct radv_ngg_state { - uint16_t ngg_emit_size; /* in dwords */ - uint32_t hw_max_esverts; - uint32_t max_gsprims; - uint32_t max_out_verts; - uint32_t prim_amp_factor; - uint32_t vgt_esgs_ring_itemsize; - bool max_vert_out_per_gs_instance; -}; +static const VkPipelineTessellationStateCreateInfo * +radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { + if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT || + pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) { + return pCreateInfo->pTessellationState; + } + } + return NULL; +} + +static const VkPipelineDepthStencilStateCreateInfo * +radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + + if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && + subpass->depth_stencil_attachment) + return pCreateInfo->pDepthStencilState; + return NULL; +} + +static const VkPipelineColorBlendStateCreateInfo * +radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + + if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && + subpass->has_color_att) + return pCreateInfo->pColorBlendState; + return NULL; +} bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline) { @@ -121,6 +149,22 @@ return variant->info.is_ngg; } +bool radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline) +{ + assert(radv_pipeline_has_ngg(pipeline)); + + struct radv_shader_variant *variant = NULL; + if (pipeline->shaders[MESA_SHADER_GEOMETRY]) + variant = pipeline->shaders[MESA_SHADER_GEOMETRY]; + else if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) + variant = pipeline->shaders[MESA_SHADER_TESS_EVAL]; + else if (pipeline->shaders[MESA_SHADER_VERTEX]) + variant = pipeline->shaders[MESA_SHADER_VERTEX]; + else + return false; + return variant->info.is_ngg_passthrough; +} + bool radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline) { if (!radv_pipeline_has_gs(pipeline)) @@ -172,8 +216,6 @@ { uint32_t hash_flags = 0; - if (device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH) - hash_flags |= RADV_HASH_SHADER_UNSAFE_MATH; if (device->instance->debug_flags & RADV_DEBUG_NO_NGG) hash_flags |= RADV_HASH_SHADER_NO_NGG; if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) @@ -184,6 +226,8 @@ hash_flags |= RADV_HASH_SHADER_PS_WAVE32; if (device->physical_device->ge_wave_size == 32) hash_flags |= RADV_HASH_SHADER_GE_WAVE32; + if (device->physical_device->use_aco) + hash_flags |= RADV_HASH_SHADER_ACO; return hash_flags; } @@ -196,7 +240,8 @@ unsigned min_waves = 1; for (int i = 0; i < MESA_SHADER_STAGES; ++i) { - if (pipeline->shaders[i]) { + if (pipeline->shaders[i] && + pipeline->shaders[i]->config.scratch_bytes_per_wave) { unsigned max_stage_waves = device->scratch_waves; scratch_bytes_per_wave = MAX2(scratch_bytes_per_wave, @@ -216,14 +261,6 @@ min_waves = MAX2(min_waves, round_up_u32(group_size, 64)); } - if (scratch_bytes_per_wave) - max_waves = MIN2(max_waves, 0xffffffffu / scratch_bytes_per_wave); - - if (scratch_bytes_per_wave && max_waves < min_waves) { - /* Not really true at this moment, but will be true on first - * execution. Avoid having hanging shaders. */ - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); - } pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave; pipeline->max_waves = max_waves; return VK_SUCCESS; @@ -721,24 +758,24 @@ const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct radv_graphics_pipeline_create_info *extra) { - const VkPipelineColorBlendStateCreateInfo *vkblend = pCreateInfo->pColorBlendState; - const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState; + const VkPipelineColorBlendStateCreateInfo *vkblend = radv_pipeline_get_color_blend_state(pCreateInfo); + const VkPipelineMultisampleStateCreateInfo *vkms = radv_pipeline_get_multisample_state(pCreateInfo); struct radv_blend_state blend = {0}; unsigned mode = V_028808_CB_NORMAL; int i; - if (!vkblend) - return blend; - if (extra && extra->custom_blend_mode) { blend.single_cb_enable = true; mode = extra->custom_blend_mode; } + blend.cb_color_control = 0; - if (vkblend->logicOpEnable) - blend.cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp)); - else - blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY); + if (vkblend) { + if (vkblend->logicOpEnable) + blend.cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp)); + else + blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY); + } blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | @@ -752,120 +789,122 @@ } blend.cb_target_mask = 0; - for (i = 0; i < vkblend->attachmentCount; i++) { - const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i]; - unsigned blend_cntl = 0; - unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; - VkBlendOp eqRGB = att->colorBlendOp; - VkBlendFactor srcRGB = att->srcColorBlendFactor; - VkBlendFactor dstRGB = att->dstColorBlendFactor; - VkBlendOp eqA = att->alphaBlendOp; - VkBlendFactor srcA = att->srcAlphaBlendFactor; - VkBlendFactor dstA = att->dstAlphaBlendFactor; + if (vkblend) { + for (i = 0; i < vkblend->attachmentCount; i++) { + const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i]; + unsigned blend_cntl = 0; + unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; + VkBlendOp eqRGB = att->colorBlendOp; + VkBlendFactor srcRGB = att->srcColorBlendFactor; + VkBlendFactor dstRGB = att->dstColorBlendFactor; + VkBlendOp eqA = att->alphaBlendOp; + VkBlendFactor srcA = att->srcAlphaBlendFactor; + VkBlendFactor dstA = att->dstAlphaBlendFactor; - blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); - if (!att->colorWriteMask) - continue; + if (!att->colorWriteMask) + continue; + + blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i); + blend.cb_target_enabled_4bit |= 0xf << (4 * i); + if (!att->blendEnable) { + blend.cb_blend_control[i] = blend_cntl; + continue; + } - blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i); - blend.cb_target_enabled_4bit |= 0xf << (4 * i); - if (!att->blendEnable) { + if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA)) + if (i == 0) + blend.mrt0_is_dual_src = true; + + if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) { + srcRGB = VK_BLEND_FACTOR_ONE; + dstRGB = VK_BLEND_FACTOR_ONE; + } + if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) { + srcA = VK_BLEND_FACTOR_ONE; + dstA = VK_BLEND_FACTOR_ONE; + } + + radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, + 0x7 << (4 * i)); + radv_blend_check_commutativity(&blend, eqA, srcA, dstA, + 0x8 << (4 * i)); + + /* Blending optimizations for RB+. + * These transformations don't change the behavior. + * + * First, get rid of DST in the blend factors: + * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) + */ + si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, + VK_BLEND_FACTOR_DST_COLOR, + VK_BLEND_FACTOR_SRC_COLOR); + + si_blend_remove_dst(&eqA, &srcA, &dstA, + VK_BLEND_FACTOR_DST_COLOR, + VK_BLEND_FACTOR_SRC_COLOR); + + si_blend_remove_dst(&eqA, &srcA, &dstA, + VK_BLEND_FACTOR_DST_ALPHA, + VK_BLEND_FACTOR_SRC_ALPHA); + + /* Look up the ideal settings from tables. */ + srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); + dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); + srcA_opt = si_translate_blend_opt_factor(srcA, true); + dstA_opt = si_translate_blend_opt_factor(dstA, true); + + /* Handle interdependencies. */ + if (si_blend_factor_uses_dst(srcRGB)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + if (si_blend_factor_uses_dst(srcA)) + dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + + if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE && + (dstRGB == VK_BLEND_FACTOR_ZERO || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + + /* Set the final value. */ + blend.sx_mrt_blend_opt[i] = + S_028760_COLOR_SRC_OPT(srcRGB_opt) | + S_028760_COLOR_DST_OPT(dstRGB_opt) | + S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | + S_028760_ALPHA_SRC_OPT(srcA_opt) | + S_028760_ALPHA_DST_OPT(dstA_opt) | + S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); + blend_cntl |= S_028780_ENABLE(1); + + blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } blend.cb_blend_control[i] = blend_cntl; - continue; - } - if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA)) - if (i == 0) - blend.mrt0_is_dual_src = true; - - if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) { - srcRGB = VK_BLEND_FACTOR_ONE; - dstRGB = VK_BLEND_FACTOR_ONE; - } - if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) { - srcA = VK_BLEND_FACTOR_ONE; - dstA = VK_BLEND_FACTOR_ONE; - } - - radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, - 0x7 << (4 * i)); - radv_blend_check_commutativity(&blend, eqA, srcA, dstA, - 0x8 << (4 * i)); + blend.blend_enable_4bit |= 0xfu << (i * 4); - /* Blending optimizations for RB+. - * These transformations don't change the behavior. - * - * First, get rid of DST in the blend factors: - * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) - */ - si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, - VK_BLEND_FACTOR_DST_COLOR, - VK_BLEND_FACTOR_SRC_COLOR); - - si_blend_remove_dst(&eqA, &srcA, &dstA, - VK_BLEND_FACTOR_DST_COLOR, - VK_BLEND_FACTOR_SRC_COLOR); - - si_blend_remove_dst(&eqA, &srcA, &dstA, - VK_BLEND_FACTOR_DST_ALPHA, - VK_BLEND_FACTOR_SRC_ALPHA); - - /* Look up the ideal settings from tables. */ - srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); - dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); - srcA_opt = si_translate_blend_opt_factor(srcA, true); - dstA_opt = si_translate_blend_opt_factor(dstA, true); - - /* Handle interdependencies. */ - if (si_blend_factor_uses_dst(srcRGB)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - if (si_blend_factor_uses_dst(srcA)) - dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - - if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE && - (dstRGB == VK_BLEND_FACTOR_ZERO || - dstRGB == VK_BLEND_FACTOR_SRC_ALPHA || - dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - - /* Set the final value. */ - blend.sx_mrt_blend_opt[i] = - S_028760_COLOR_SRC_OPT(srcRGB_opt) | - S_028760_COLOR_DST_OPT(dstRGB_opt) | - S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | - S_028760_ALPHA_SRC_OPT(srcA_opt) | - S_028760_ALPHA_DST_OPT(dstA_opt) | - S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); - blend_cntl |= S_028780_ENABLE(1); - - blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); - blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); - blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); - if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { - blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); - blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); - blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); - blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); - } - blend.cb_blend_control[i] = blend_cntl; - - blend.blend_enable_4bit |= 0xfu << (i * 4); - - if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || - dstRGB == VK_BLEND_FACTOR_SRC_ALPHA || - srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || - dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || - srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || - dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) - blend.need_src_alpha |= 1 << i; - } - for (i = vkblend->attachmentCount; i < 8; i++) { - blend.cb_blend_control[i] = 0; - blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA || + srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || + dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || + srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || + dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) + blend.need_src_alpha |= 1 << i; + } + for (i = vkblend->attachmentCount; i < 8; i++) { + blend.cb_blend_control[i] = 0; + blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + } } - if (pipeline->device->physical_device->has_rbplus) { + if (pipeline->device->physical_device->rad_info.has_rbplus) { /* Disable RB+ blend optimizations for dual source blending. */ if (blend.mrt0_is_dual_src) { for (i = 0; i < 8; i++) { @@ -878,7 +917,8 @@ /* RB+ doesn't work with dual source blending, logic op and * RESOLVE. */ - if (blend.mrt0_is_dual_src || vkblend->logicOpEnable || + if (blend.mrt0_is_dual_src || + (vkblend && vkblend->logicOpEnable) || mode == V_028808_CB_RESOLVE) blend.cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1); } @@ -931,10 +971,27 @@ } } -static uint8_t radv_pipeline_get_ps_iter_samples(const VkPipelineMultisampleStateCreateInfo *vkms) +static uint8_t radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo) { - uint32_t num_samples = vkms->rasterizationSamples; + const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState; + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass]; uint32_t ps_iter_samples = 1; + uint32_t num_samples; + + /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading: + * + * "If the VK_AMD_mixed_attachment_samples extension is enabled and the + * subpass uses color attachments, totalSamples is the number of + * samples of the color attachments. Otherwise, totalSamples is the + * value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples + * specified at pipeline creation time." + */ + if (subpass->has_color_att) { + num_samples = subpass->color_sample_count; + } else { + num_samples = vkms->rasterizationSamples; + } if (vkms->sampleShadingEnable) { ps_iter_samples = ceil(vkms->minSampleShading * num_samples); @@ -1011,13 +1068,15 @@ { RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + const VkPipelineDepthStencilStateCreateInfo *vkds = radv_pipeline_get_depth_stencil_state(pCreateInfo); + const VkPipelineColorBlendStateCreateInfo *vkblend = radv_pipeline_get_color_blend_state(pCreateInfo); unsigned colormask = blend->cb_target_enabled_4bit; if (!pipeline->device->physical_device->out_of_order_rast_allowed) return false; /* Be conservative if a logic operation is enabled with color buffers. */ - if (colormask && pCreateInfo->pColorBlendState->logicOpEnable) + if (colormask && vkblend && vkblend->logicOpEnable) return false; /* Default depth/stencil invariance when no attachment is bound. */ @@ -1025,10 +1084,7 @@ .zs = true, .pass_set = true }; - if (pCreateInfo->pDepthStencilState && - subpass->depth_stencil_attachment) { - const VkPipelineDepthStencilStateCreateInfo *vkds = - pCreateInfo->pDepthStencilState; + if (vkds) { struct radv_render_pass_attachment *attachment = pass->attachments + subpass->depth_stencil_attachment->attachment; bool has_stencil = vk_format_is_stencil(attachment->format); @@ -1077,8 +1133,8 @@ * except when early Z/S tests are requested. */ if (ps && - ps->info.info.ps.writes_memory && - ps->info.fs.early_fragment_test && + ps->info.ps.writes_memory && + ps->info.ps.early_fragment_test && !dsa_order_invariant.pass_set) return false; @@ -1115,7 +1171,7 @@ struct radv_blend_state *blend, const VkGraphicsPipelineCreateInfo *pCreateInfo) { - const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState; + const VkPipelineMultisampleStateCreateInfo *vkms = radv_pipeline_get_multisample_state(pCreateInfo); struct radv_multisample_state *ms = &pipeline->graphics.ms; unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes; bool out_of_order_rast = false; @@ -1141,10 +1197,10 @@ * * Otherwise, sample shading is considered disabled." */ - if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.force_persample) { + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) { ps_iter_samples = ms->num_samples; } else { - ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms); + ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo); } } else { ms->num_samples = 1; @@ -1187,11 +1243,15 @@ S_028A48_VPORT_SCISSOR_ENABLE(1); if (ms->num_samples > 1) { + RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); + struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass]; + uint32_t z_samples = subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples; unsigned log_samples = util_logbase2(ms->num_samples); + unsigned log_z_samples = util_logbase2(z_samples); unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1); ms->pa_sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); /* CM_R_028BDC_PA_SC_LINE_CNTL */ - ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_samples) | + ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); @@ -1527,23 +1587,25 @@ pipeline->dynamic_state.mask = states; } -static struct radv_gs_state -calculate_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo, - const struct radv_pipeline *pipeline) +static void +gfx9_get_gs_info(const struct radv_pipeline_key *key, + const struct radv_pipeline *pipeline, + nir_shader **nir, + struct radv_shader_info *infos, + struct gfx9_gs_info *out) { - struct radv_gs_state gs = {0}; - struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info; + struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY]; struct radv_es_output_info *es_info; if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) - es_info = radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info; + es_info = nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info; else - es_info = radv_pipeline_has_tess(pipeline) ? - &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.es_info : - &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.es_info; + es_info = nir[MESA_SHADER_TESS_CTRL] ? + &infos[MESA_SHADER_TESS_EVAL].tes.es_info : + &infos[MESA_SHADER_VERTEX].vs.es_info; unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1); bool uses_adjacency; - switch(pCreateInfo->pInputAssemblyState->topology) { + switch(key->topology) { case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: @@ -1638,15 +1700,13 @@ uint32_t gs_prims_per_subgroup = gs_prims; uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out; - gs.lds_size = align(esgs_lds_size, 128) / 128; - gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) | + out->lds_size = align(esgs_lds_size, 128) / 128; + out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) | S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) | S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup); - gs.vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); - gs.vgt_esgs_ring_itemsize = esgs_itemsize; + out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); + out->vgt_esgs_ring_itemsize = esgs_itemsize; assert(max_prims_per_subgroup <= max_out_prims); - - return gs; } static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, @@ -1659,21 +1719,20 @@ } static unsigned -radv_get_num_input_vertices(struct radv_pipeline *pipeline) +radv_get_num_input_vertices(nir_shader **nir) { - if (radv_pipeline_has_gs(pipeline)) { - struct radv_shader_variant *gs = - radv_get_shader(pipeline, MESA_SHADER_GEOMETRY); + if (nir[MESA_SHADER_GEOMETRY]) { + nir_shader *gs = nir[MESA_SHADER_GEOMETRY]; return gs->info.gs.vertices_in; } - if (radv_pipeline_has_tess(pipeline)) { - struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL); + if (nir[MESA_SHADER_TESS_CTRL]) { + nir_shader *tes = nir[MESA_SHADER_TESS_EVAL]; - if (tes->info.tes.point_mode) + if (tes->info.tess.point_mode) return 1; - if (tes->info.tes.primitive_mode == GL_ISOLINES) + if (tes->info.tess.primitive_mode == GL_ISOLINES) return 2; return 3; } @@ -1681,21 +1740,23 @@ return 3; } -static struct radv_ngg_state -calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo, - struct radv_pipeline *pipeline) +static void +gfx10_get_ngg_info(const struct radv_pipeline_key *key, + struct radv_pipeline *pipeline, + nir_shader **nir, + struct radv_shader_info *infos, + struct gfx10_ngg_info *ngg) { - struct radv_ngg_state ngg = {0}; - struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info; + struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY]; struct radv_es_output_info *es_info = - radv_pipeline_has_tess(pipeline) ? &gs_info->tes.es_info : &gs_info->vs.es_info; - unsigned gs_type = radv_pipeline_has_gs(pipeline) ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX; - unsigned max_verts_per_prim = radv_get_num_input_vertices(pipeline); + nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info; + unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX; + unsigned max_verts_per_prim = radv_get_num_input_vertices(nir); unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; - unsigned gs_num_invocations = radv_pipeline_has_gs(pipeline) ? MAX2(gs_info->gs.invocations, 1) : 1; + unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1; bool uses_adjacency; - switch(pCreateInfo->pInputAssemblyState->topology) { + switch(key->topology) { case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: @@ -1711,17 +1772,11 @@ /* We can't allow using the whole LDS, because GS waves compete with * other shader stages for LDS space. * - * Streamout can increase the ESGS buffer size later on, so be more - * conservative with streamout and use 4K dwords. This may be suboptimal. - * - * Otherwise, use the limit of 7K dwords. The reason is that we need - * to leave some headroom for the max_esverts increase at the end. - * * TODO: We should really take the shader's internal LDS use into * account. The linker will fail if the size is greater than * 8K dwords. */ - const unsigned max_lds_size = (0 /*gs_info->info.so.num_outputs*/ ? 4 : 7) * 1024 - 128; + const unsigned max_lds_size = 8 * 1024 - 768; const unsigned target_lds_size = max_lds_size; unsigned esvert_lds_size = 0; unsigned gsprim_lds_size = 0; @@ -1762,12 +1817,22 @@ esvert_lds_size = es_info->esgs_itemsize / 4; gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; } else { - /* TODO: This needs to be adjusted once LDS use for compaction - * after culling is implemented. */ - /* - if (es_info->info.so.num_outputs) - esvert_lds_size = 4 * es_info->info.so.num_outputs + 1; - */ + /* VS and TES. */ + /* LDS size for passing data from GS to ES. */ + struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL] + ? &infos[MESA_SHADER_TESS_EVAL].so + : &infos[MESA_SHADER_VERTEX].so; + + if (so_info->num_outputs) + esvert_lds_size = 4 * so_info->num_outputs + 1; + + /* GS stores Primitive IDs (one DWORD) into LDS at the address + * corresponding to the ES thread of the provoking vertex. All + * ES threads load and export PrimitiveID for their thread. + */ + if (!nir[MESA_SHADER_TESS_CTRL] && + infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id) + esvert_lds_size = MAX2(esvert_lds_size, 1); } unsigned max_gsprims = max_gsprims_base; @@ -1805,9 +1870,18 @@ /* Round up towards full wave sizes for better ALU utilization. */ if (!max_vert_out_per_gs_instance) { - const unsigned wavesize = pipeline->device->physical_device->ge_wave_size; unsigned orig_max_esverts; unsigned orig_max_gsprims; + unsigned wavesize; + + if (gs_type == MESA_SHADER_GEOMETRY) { + wavesize = gs_info->wave_size; + } else { + wavesize = nir[MESA_SHADER_TESS_CTRL] + ? infos[MESA_SHADER_TESS_EVAL].wave_size + : infos[MESA_SHADER_VERTEX].wave_size; + } + do { orig_max_esverts = max_esverts; orig_max_gsprims = max_gsprims; @@ -1854,28 +1928,28 @@ * this check passes, there is enough space for a full primitive without * vertex reuse. */ - ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; - ngg.max_gsprims = max_gsprims; - ngg.max_out_verts = max_out_vertices; - ngg.prim_amp_factor = prim_amp_factor; - ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; - ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1; + ngg->max_gsprims = max_gsprims; + ngg->max_out_verts = max_out_vertices; + ngg->prim_amp_factor = prim_amp_factor; + ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + ngg->ngg_emit_size = max_gsprims * gsprim_lds_size; + ngg->esgs_ring_size = 4 * max_esverts * esvert_lds_size; if (gs_type == MESA_SHADER_GEOMETRY) { - ngg.vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4; + ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4; } else { - ngg.vgt_esgs_ring_itemsize = 1; + ngg->vgt_esgs_ring_itemsize = 1; } - pipeline->graphics.esgs_ring_size = 4 * max_esverts * esvert_lds_size; - - assert(ngg.hw_max_esverts >= 24); /* HW limitation */ + pipeline->graphics.esgs_ring_size = ngg->esgs_ring_size; - return ngg; + assert(ngg->hw_max_esverts >= 24); /* HW limitation */ } static void -calculate_gs_ring_sizes(struct radv_pipeline *pipeline, const struct radv_gs_state *gs) +calculate_gs_ring_sizes(struct radv_pipeline *pipeline, + const struct gfx9_gs_info *gs) { struct radv_device *device = pipeline->device; unsigned num_se = device->physical_device->rad_info.max_se; @@ -1889,7 +1963,7 @@ unsigned alignment = 256 * num_se; /* The maximum size is 63.999 MB per SE. */ unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; - struct radv_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info; + struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info; /* Calculate the minimum size. */ unsigned min_esgs_ring_size = align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * @@ -2027,7 +2101,7 @@ else topology = V_028B6C_OUTPUT_TRIANGLE_CW; - if (pipeline->device->has_distributed_tess) { + if (pipeline->device->physical_device->rad_info.has_distributed_tess) { if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI || pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10) distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; @@ -2274,14 +2348,16 @@ } } - if (pCreateInfo->pTessellationState) - key.tess_input_vertices = pCreateInfo->pTessellationState->patchControlPoints; - - - if (pCreateInfo->pMultisampleState && - pCreateInfo->pMultisampleState->rasterizationSamples > 1) { - uint32_t num_samples = pCreateInfo->pMultisampleState->rasterizationSamples; - uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo->pMultisampleState); + const VkPipelineTessellationStateCreateInfo *tess = + radv_pipeline_get_tessellation_state(pCreateInfo); + if (tess) + key.tess_input_vertices = tess->patchControlPoints; + + const VkPipelineMultisampleStateCreateInfo *vkms = + radv_pipeline_get_multisample_state(pCreateInfo); + if (vkms && vkms->rasterizationSamples > 1) { + uint32_t num_samples = vkms->rasterizationSamples; + uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo); key.num_samples = num_samples; key.log2_ps_iter_samples = util_logbase2(ps_iter_samples); } @@ -2290,6 +2366,9 @@ if (pipeline->device->physical_device->rad_info.chip_class < GFX8) radv_pipeline_compute_get_int_clamp(pCreateInfo, &key.is_int8, &key.is_int10); + if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) + key.topology = pCreateInfo->pInputAssemblyState->topology; + return key; } @@ -2319,6 +2398,7 @@ keys[MESA_SHADER_VERTEX].vs.vertex_attribute_offsets[i] = key->vertex_attribute_offsets[i]; keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i]; } + keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology); if (nir[MESA_SHADER_TESS_CTRL]) { keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true; @@ -2336,9 +2416,7 @@ keys[MESA_SHADER_VERTEX].vs_common_out.as_es = true; } - if (device->physical_device->rad_info.chip_class >= GFX10 && - device->physical_device->rad_info.family != CHIP_NAVI14 && - !(device->instance->debug_flags & RADV_DEBUG_NO_NGG)) { + if (device->physical_device->use_ngg) { if (nir[MESA_SHADER_TESS_CTRL]) { keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = true; } else { @@ -2357,27 +2435,6 @@ keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false; } - /* - * Disable NGG with geometry shaders. There are a bunch of - * issues still: - * * GS primitives in pipeline statistic queries do not get - * updates. See dEQP-VK.query_pool.statistics_query.geometry_shader_primitives - * * dEQP-VK.clipping.user_defined.clip_cull_distance_dynamic_index.*geom* failures - * * Interactions with tessellation failing: - * dEQP-VK.tessellation.geometry_interaction.passthrough.tessellate_isolines_passthrough_geometry_no_change - * * General issues with the last primitive missing/corrupt: - * https://bugs.freedesktop.org/show_bug.cgi?id=111248 - * - * Furthermore, XGL/AMDVLK also disables this as of 9b632ef. - */ - if (nir[MESA_SHADER_GEOMETRY]) { - if (nir[MESA_SHADER_TESS_CTRL]) - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false; - else - keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false; - } - - /* TODO: Implement streamout support for NGG. */ gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { @@ -2385,13 +2442,30 @@ last_xfb_stage = i; } - if (nir[last_xfb_stage] && - radv_nir_stage_uses_xfb(nir[last_xfb_stage])) { + bool uses_xfb = nir[last_xfb_stage] && + radv_nir_stage_uses_xfb(nir[last_xfb_stage]); + + if (!device->physical_device->use_ngg_streamout && uses_xfb) { if (nir[MESA_SHADER_TESS_CTRL]) keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false; else keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false; } + + /* Determine if the pipeline is eligible for the NGG passthrough + * mode. It can't be enabled for geometry shaders, for NGG + * streamout or for vertex shaders that export the primitive ID + * (this is checked later because we don't have the info here.) + */ + if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) { + if (nir[MESA_SHADER_TESS_CTRL] && + keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg) { + keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg_passthrough = true; + } else if (nir[MESA_SHADER_VERTEX] && + keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) { + keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = true; + } + } } for(int i = 0; i < MESA_SHADER_STAGES; ++i) @@ -2402,6 +2476,170 @@ keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10; keys[MESA_SHADER_FRAGMENT].fs.log2_ps_iter_samples = key->log2_ps_iter_samples; keys[MESA_SHADER_FRAGMENT].fs.num_samples = key->num_samples; + + if (nir[MESA_SHADER_COMPUTE]) { + keys[MESA_SHADER_COMPUTE].cs.subgroup_size = key->compute_subgroup_size; + } +} + +static uint8_t +radv_get_wave_size(struct radv_device *device, + const VkPipelineShaderStageCreateInfo *pStage, + gl_shader_stage stage, + const struct radv_shader_variant_key *key) +{ + if (stage == MESA_SHADER_GEOMETRY && !key->vs_common_out.as_ngg) + return 64; + else if (stage == MESA_SHADER_COMPUTE) { + if (key->cs.subgroup_size) { + /* Return the required subgroup size if specified. */ + return key->cs.subgroup_size; + } + return device->physical_device->cs_wave_size; + } + else if (stage == MESA_SHADER_FRAGMENT) + return device->physical_device->ps_wave_size; + else + return device->physical_device->ge_wave_size; +} + +static uint8_t +radv_get_ballot_bit_size(struct radv_device *device, + const VkPipelineShaderStageCreateInfo *pStage, + gl_shader_stage stage, + const struct radv_shader_variant_key *key) +{ + if (stage == MESA_SHADER_COMPUTE && key->cs.subgroup_size) + return key->cs.subgroup_size; + return 64; +} + +static void +radv_fill_shader_info(struct radv_pipeline *pipeline, + const VkPipelineShaderStageCreateInfo **pStages, + struct radv_shader_variant_key *keys, + struct radv_shader_info *infos, + nir_shader **nir) +{ + unsigned active_stages = 0; + unsigned filled_stages = 0; + + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + if (nir[i]) + active_stages |= (1 << i); + } + + if (nir[MESA_SHADER_FRAGMENT]) { + radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]); + radv_nir_shader_info_pass(nir[MESA_SHADER_FRAGMENT], + pipeline->layout, + &keys[MESA_SHADER_FRAGMENT], + &infos[MESA_SHADER_FRAGMENT]); + + /* TODO: These are no longer used as keys we should refactor this */ + keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id = + infos[MESA_SHADER_FRAGMENT].ps.prim_id_input; + keys[MESA_SHADER_VERTEX].vs_common_out.export_layer_id = + infos[MESA_SHADER_FRAGMENT].ps.layer_input; + keys[MESA_SHADER_VERTEX].vs_common_out.export_clip_dists = + !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls; + keys[MESA_SHADER_VERTEX].vs_common_out.export_viewport_index = + infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input; + keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_prim_id = + infos[MESA_SHADER_FRAGMENT].ps.prim_id_input; + keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_layer_id = + infos[MESA_SHADER_FRAGMENT].ps.layer_input; + keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_clip_dists = + !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls; + keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_viewport_index = + infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input; + + /* NGG passthrough mode can't be enabled for vertex shaders + * that export the primitive ID. + * + * TODO: I should really refactor the keys logic. + */ + if (nir[MESA_SHADER_VERTEX] && + keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id) { + keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = false; + } + + filled_stages |= (1 << MESA_SHADER_FRAGMENT); + } + + if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && + nir[MESA_SHADER_TESS_CTRL]) { + struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; + struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL]; + key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs; + + radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]); + + for (int i = 0; i < 2; i++) { + radv_nir_shader_info_pass(combined_nir[i], + pipeline->layout, &key, + &infos[MESA_SHADER_TESS_CTRL]); + } + + keys[MESA_SHADER_TESS_EVAL].tes.num_patches = + infos[MESA_SHADER_TESS_CTRL].tcs.num_patches; + keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = + util_last_bit64(infos[MESA_SHADER_TESS_CTRL].tcs.outputs_written); + + filled_stages |= (1 << MESA_SHADER_VERTEX); + filled_stages |= (1 << MESA_SHADER_TESS_CTRL); + } + + if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && + nir[MESA_SHADER_GEOMETRY]) { + gl_shader_stage pre_stage = nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; + struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]}; + + radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]); + + for (int i = 0; i < 2; i++) { + radv_nir_shader_info_pass(combined_nir[i], + pipeline->layout, + &keys[pre_stage], + &infos[MESA_SHADER_GEOMETRY]); + } + + filled_stages |= (1 << pre_stage); + filled_stages |= (1 << MESA_SHADER_GEOMETRY); + } + + active_stages ^= filled_stages; + while (active_stages) { + int i = u_bit_scan(&active_stages); + + if (i == MESA_SHADER_TESS_CTRL) { + keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs = + util_last_bit64(infos[MESA_SHADER_VERTEX].vs.ls_outputs_written); + } + + if (i == MESA_SHADER_TESS_EVAL) { + keys[MESA_SHADER_TESS_EVAL].tes.num_patches = + infos[MESA_SHADER_TESS_CTRL].tcs.num_patches; + keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = + util_last_bit64(infos[MESA_SHADER_TESS_CTRL].tcs.outputs_written); + } + + radv_nir_shader_info_init(&infos[i]); + radv_nir_shader_info_pass(nir[i], pipeline->layout, + &keys[i], &infos[i]); + } + + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + if (nir[i]) { + infos[i].wave_size = + radv_get_wave_size(pipeline->device, pStages[i], + i, &keys[i]); + infos[i].ballot_bit_size = + radv_get_ballot_bit_size(pipeline->device, + pStages[i], i, + &keys[i]); + } + } } static void @@ -2482,6 +2720,14 @@ } static +bool radv_aco_supported_stage(gl_shader_stage stage, bool has_ts) +{ + return (stage == MESA_SHADER_VERTEX && !has_ts) || + (stage == MESA_SHADER_GEOMETRY && !has_ts) || + stage == MESA_SHADER_FRAGMENT || + stage == MESA_SHADER_COMPUTE; +} + void radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, struct radv_pipeline_cache *cache, @@ -2496,6 +2742,7 @@ nir_shader *nir[MESA_SHADER_STAGES] = {0}; struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL}; struct radv_shader_variant_key keys[MESA_SHADER_STAGES] = {{{{{0}}}}}; + struct radv_shader_info infos[MESA_SHADER_STAGES] = {0}; unsigned char hash[20], gs_copy_hash[20]; bool keep_executable_info = (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) || device->keep_shader_info; @@ -2541,18 +2788,33 @@ modules[MESA_SHADER_FRAGMENT] = &fs_m; } + bool has_ts = modules[MESA_SHADER_TESS_CTRL] || modules[MESA_SHADER_TESS_EVAL]; + bool use_aco = device->physical_device->use_aco; + for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) { const VkPipelineShaderStageCreateInfo *stage = pStages[i]; + unsigned subgroup_size = 64, ballot_bit_size = 64; if (!modules[i]) continue; radv_start_feedback(stage_feedbacks[i]); + if (key->compute_subgroup_size) { + /* Only compute shaders currently support requiring a + * specific subgroup size. + */ + assert(i == MESA_SHADER_COMPUTE); + subgroup_size = key->compute_subgroup_size; + ballot_bit_size = key->compute_subgroup_size; + } + + bool aco = use_aco && radv_aco_supported_stage(i, has_ts); nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i, stage ? stage->pSpecializationInfo : NULL, - flags, pipeline->layout); + flags, pipeline->layout, aco, + subgroup_size, ballot_bit_size); /* We don't want to alter meta shaders IR directly so clone it * first. @@ -2574,45 +2836,103 @@ for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (nir[i]) { - NIR_PASS_V(nir[i], nir_lower_non_uniform_access, - nir_lower_non_uniform_ubo_access | - nir_lower_non_uniform_ssbo_access | - nir_lower_non_uniform_texture_access | - nir_lower_non_uniform_image_access); - NIR_PASS_V(nir[i], nir_lower_bool_to_int32); + bool aco = use_aco && radv_aco_supported_stage(i, has_ts); + if (aco) { + NIR_PASS_V(nir[i], nir_lower_non_uniform_access, + nir_lower_non_uniform_ubo_access | + nir_lower_non_uniform_ssbo_access | + nir_lower_non_uniform_texture_access | + nir_lower_non_uniform_image_access); + } else + NIR_PASS_V(nir[i], nir_lower_bool_to_int32); } + } + if (nir[MESA_SHADER_FRAGMENT]) + radv_lower_fs_io(nir[MESA_SHADER_FRAGMENT]); + + for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (radv_can_dump_shader(device, modules[i], false)) nir_print_shader(nir[i], stderr); } radv_fill_shader_keys(device, keys, key, nir); + radv_fill_shader_info(pipeline, pStages, keys, infos, nir); + + if ((nir[MESA_SHADER_VERTEX] && + keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) || + (nir[MESA_SHADER_TESS_EVAL] && + keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg)) { + struct gfx10_ngg_info *ngg_info; + + if (nir[MESA_SHADER_GEOMETRY]) + ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info; + else if (nir[MESA_SHADER_TESS_CTRL]) + ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info; + else + ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info; + + gfx10_get_ngg_info(key, pipeline, nir, infos, ngg_info); + } else if (nir[MESA_SHADER_GEOMETRY]) { + struct gfx9_gs_info *gs_info = + &infos[MESA_SHADER_GEOMETRY].gs_ring_info; + + gfx9_get_gs_info(key, pipeline, nir, infos, gs_info); + } + + if(modules[MESA_SHADER_GEOMETRY]) { + struct radv_shader_binary *gs_copy_binary = NULL; + if (!pipeline->gs_copy_shader && + !radv_pipeline_has_ngg(pipeline)) { + struct radv_shader_info info = {}; + struct radv_shader_variant_key key = {}; + + key.has_multiview_view_index = + keys[MESA_SHADER_GEOMETRY].has_multiview_view_index; + + radv_nir_shader_info_pass(nir[MESA_SHADER_GEOMETRY], + pipeline->layout, &key, + &info); + info.wave_size = 64; /* Wave32 not supported. */ + info.ballot_bit_size = 64; + + pipeline->gs_copy_shader = radv_create_gs_copy_shader( + device, nir[MESA_SHADER_GEOMETRY], &info, + &gs_copy_binary, keep_executable_info, + keys[MESA_SHADER_GEOMETRY].has_multiview_view_index, + use_aco); + } + + if (!keep_executable_info && pipeline->gs_copy_shader) { + struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL}; + struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0}; + + binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary; + variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader; + + radv_pipeline_cache_insert_shaders(device, cache, + gs_copy_hash, + variants, + binaries); + } + free(gs_copy_binary); + } + if (nir[MESA_SHADER_FRAGMENT]) { if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) { radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]); + bool aco = use_aco && radv_aco_supported_stage(MESA_SHADER_FRAGMENT, has_ts); pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile(device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline->layout, keys + MESA_SHADER_FRAGMENT, - keep_executable_info, &binaries[MESA_SHADER_FRAGMENT]); + infos + MESA_SHADER_FRAGMENT, + keep_executable_info, aco, + &binaries[MESA_SHADER_FRAGMENT]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false); } - - /* TODO: These are no longer used as keys we should refactor this */ - keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id = - pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.prim_id_input; - keys[MESA_SHADER_VERTEX].vs_common_out.export_layer_id = - pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.layer_input; - keys[MESA_SHADER_VERTEX].vs_common_out.export_clip_dists = - !!pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.num_input_clips_culls; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_prim_id = - pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.prim_id_input; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_layer_id = - pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.layer_input; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_clip_dists = - !!pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.num_input_clips_culls; } if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) { @@ -2625,14 +2945,14 @@ pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile(device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline->layout, - &key, keep_executable_info, - &binaries[MESA_SHADER_TESS_CTRL]); + &key, &infos[MESA_SHADER_TESS_CTRL], keep_executable_info, + false, &binaries[MESA_SHADER_TESS_CTRL]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false); } modules[MESA_SHADER_VERTEX] = NULL; keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; - keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.tcs.outputs_written); + keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.outputs_written); } if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) { @@ -2642,10 +2962,11 @@ radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]); + bool aco = use_aco && radv_aco_supported_stage(MESA_SHADER_GEOMETRY, has_ts); pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile(device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline->layout, - &keys[pre_stage], keep_executable_info, - &binaries[MESA_SHADER_GEOMETRY]); + &keys[pre_stage], &infos[MESA_SHADER_GEOMETRY], keep_executable_info, + aco, &binaries[MESA_SHADER_GEOMETRY]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false); } @@ -2655,49 +2976,25 @@ for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if(modules[i] && !pipeline->shaders[i]) { if (i == MESA_SHADER_TESS_CTRL) { - keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs = util_last_bit64(pipeline->shaders[MESA_SHADER_VERTEX]->info.info.vs.ls_outputs_written); + keys[MESA_SHADER_TESS_CTRL].tcs.num_inputs = util_last_bit64(pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.ls_outputs_written); } if (i == MESA_SHADER_TESS_EVAL) { keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; - keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.tcs.outputs_written); + keys[MESA_SHADER_TESS_EVAL].tes.tcs_num_outputs = util_last_bit64(pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.outputs_written); } radv_start_feedback(stage_feedbacks[i]); + bool aco = use_aco && radv_aco_supported_stage(i, has_ts); pipeline->shaders[i] = radv_shader_variant_compile(device, modules[i], &nir[i], 1, pipeline->layout, - keys + i, keep_executable_info, - &binaries[i]); + keys + i, infos + i,keep_executable_info, + aco, &binaries[i]); radv_stop_feedback(stage_feedbacks[i], false); } } - if(modules[MESA_SHADER_GEOMETRY]) { - struct radv_shader_binary *gs_copy_binary = NULL; - if (!pipeline->gs_copy_shader && - !radv_pipeline_has_ngg(pipeline)) { - pipeline->gs_copy_shader = radv_create_gs_copy_shader( - device, nir[MESA_SHADER_GEOMETRY], &gs_copy_binary, - keep_executable_info, - keys[MESA_SHADER_GEOMETRY].has_multiview_view_index); - } - - if (!keep_executable_info && pipeline->gs_copy_shader) { - struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL}; - struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0}; - - binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary; - variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader; - - radv_pipeline_cache_insert_shaders(device, cache, - gs_copy_hash, - variants, - binaries); - } - free(gs_copy_binary); - } - if (!keep_executable_info) { radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries); @@ -3010,7 +3307,8 @@ unsigned effective_samples = total_samples; unsigned color_bytes_per_pixel = 0; - const VkPipelineColorBlendStateCreateInfo *vkblend = pCreateInfo->pColorBlendState; + const VkPipelineColorBlendStateCreateInfo *vkblend = + radv_pipeline_get_color_blend_state(pCreateInfo); if (vkblend) { for (unsigned i = 0; i < subpass->color_count; i++) { if (!vkblend->pAttachments[i].colorWriteMask) @@ -3061,20 +3359,6 @@ struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; VkExtent2D extent = {512, 512}; - unsigned sdp_interface_count; - - switch(pipeline->device->physical_device->rad_info.family) { - case CHIP_NAVI10: - case CHIP_NAVI12: - sdp_interface_count = 16; - break; - case CHIP_NAVI14: - sdp_interface_count = 8; - break; - default: - unreachable("Unhandled GFX10 chip"); - } - const unsigned db_tag_size = 64; const unsigned db_tag_count = 312; const unsigned color_tag_size = 1024; @@ -3083,7 +3367,7 @@ const unsigned fmask_tag_count = 44; const unsigned rb_count = pipeline->device->physical_device->rad_info.num_render_backends; - const unsigned pipe_count = MAX2(rb_count, sdp_interface_count); + const unsigned pipe_count = MAX2(rb_count, pipeline->device->physical_device->rad_info.num_sdp_interfaces); const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count; const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count; @@ -3095,7 +3379,8 @@ unsigned color_bytes_per_pixel = 0; unsigned fmask_bytes_per_pixel = 0; - const VkPipelineColorBlendStateCreateInfo *vkblend = pCreateInfo->pColorBlendState; + const VkPipelineColorBlendStateCreateInfo *vkblend = + radv_pipeline_get_color_blend_state(pCreateInfo); if (vkblend) { for (unsigned i = 0; i < subpass->color_count; i++) { if (!vkblend->pAttachments[i].colorWriteMask) @@ -3108,6 +3393,7 @@ color_bytes_per_pixel += vk_format_get_blocksize(format); if (total_samples > 1) { + assert(samples_log <= 3); const unsigned fmask_array[] = {0, 1, 1, 4}; fmask_bytes_per_pixel += fmask_array[samples_log]; } @@ -3171,7 +3457,8 @@ if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; - const VkPipelineColorBlendStateCreateInfo *vkblend = pCreateInfo->pColorBlendState; + const VkPipelineColorBlendStateCreateInfo *vkblend = + radv_pipeline_get_color_blend_state(pCreateInfo); unsigned min_bytes_per_pixel = 0; if (vkblend) { @@ -3202,10 +3489,33 @@ pipeline->graphics.binning.db_dfsm_control = db_dfsm_control; } +struct radv_binning_settings +radv_get_binning_settings(const struct radv_physical_device *pdev) +{ + struct radv_binning_settings settings; + if (pdev->rad_info.has_dedicated_vram) { + settings.context_states_per_bin = 1; + settings.persistent_states_per_bin = 1; + settings.fpovs_per_batch = 63; + } else { + /* The context states are affected by the scissor bug. */ + settings.context_states_per_bin = 6; + /* 32 causes hangs for RAVEN. */ + settings.persistent_states_per_bin = 16; + settings.fpovs_per_batch = 63; + } + + if (pdev->rad_info.has_gfx9_scissor_bug) + settings.context_states_per_bin = 1; + + return settings; +} + static void radv_pipeline_generate_binning_state(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline, - const VkGraphicsPipelineCreateInfo *pCreateInfo) + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_blend_state *blend) { if (pipeline->device->physical_device->rad_info.chip_class < GFX9) return; @@ -3219,20 +3529,20 @@ unreachable("Unhandled generation for binning bin size calculation"); if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) { - unsigned context_states_per_bin; /* allowed range: [1, 6] */ - unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ - unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ - - if (pipeline->device->physical_device->rad_info.has_dedicated_vram) { - context_states_per_bin = 1; - persistent_states_per_bin = 1; - fpovs_per_batch = 63; - } else { - /* The context states are affected by the scissor bug. */ - context_states_per_bin = pipeline->device->physical_device->has_scissor_bug ? 1 : 6; - /* 32 causes hangs for RAVEN. */ - persistent_states_per_bin = 16; - fpovs_per_batch = 63; + struct radv_binning_settings settings = + radv_get_binning_settings(pipeline->device->physical_device); + + bool disable_start_of_prim = true; + uint32_t db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF); + + const struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; + + if (pipeline->device->dfsm_allowed && ps && + !ps->info.ps.can_discard && + !ps->info.ps.writes_memory && + blend->cb_target_enabled_4bit) { + db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_AUTO); + disable_start_of_prim = (blend->blend_enable_4bit & blend->cb_target_enabled_4bit) != 0; } const uint32_t pa_sc_binner_cntl_0 = @@ -3241,14 +3551,12 @@ S_028C44_BIN_SIZE_Y(bin_size.height == 16) | S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) | S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) | - S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | - S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) | - S_028C44_DISABLE_START_OF_PRIM(1) | - S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | + S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) | + S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) | + S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | + S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1); - uint32_t db_dfsm_control = S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF); - pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0; pipeline->graphics.binning.db_dfsm_control = db_dfsm_control; } else @@ -3262,9 +3570,10 @@ const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct radv_graphics_pipeline_create_info *extra) { - const VkPipelineDepthStencilStateCreateInfo *vkds = pCreateInfo->pDepthStencilState; + const VkPipelineDepthStencilStateCreateInfo *vkds = radv_pipeline_get_depth_stencil_state(pCreateInfo); RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; + struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; struct radv_render_pass_attachment *attachment = NULL; uint32_t db_depth_control = 0, db_stencil_control = 0; uint32_t db_render_control = 0, db_render_override2 = 0; @@ -3313,7 +3622,8 @@ db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE); - if (!pCreateInfo->pRasterizationState->depthClampEnable) { + if (!pCreateInfo->pRasterizationState->depthClampEnable && + ps->info.ps.writes_z) { /* From VK_EXT_depth_range_unrestricted spec: * * "The behavior described in Primitive Clipping still applies. @@ -3346,7 +3656,7 @@ radeon_set_context_reg(ctx_cs, R_028808_CB_COLOR_CONTROL, blend->cb_color_control); radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask); - if (pipeline->device->physical_device->has_rbplus) { + if (pipeline->device->physical_device->rad_info.has_rbplus) { radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8); radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8); @@ -3467,7 +3777,10 @@ radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]); radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa); + radeon_set_context_reg(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0); radeon_set_context_reg(ctx_cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1); + radeon_set_context_reg(ctx_cs, R_028BDC_PA_SC_LINE_CNTL, ms->pa_sc_line_cntl); + radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config); /* The exclusion bits can be set to improve rasterization efficiency * if no sample lies on the pixel boundary (-8 sample offset). It's @@ -3477,6 +3790,12 @@ radeon_set_context_reg(ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + + /* GFX9: Flush DFSM when the AA mode changes. */ + if (pipeline->device->dfsm_allowed) { + radeon_emit(ctx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(ctx_cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } } static void @@ -3500,7 +3819,7 @@ vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out, pipeline->device->physical_device->rad_info.chip_class); - } else if (outinfo->export_prim_id || vs->info.info.uses_prim_id) { + } else if (outinfo->export_prim_id || vs->info.uses_prim_id) { vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A); vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1); } @@ -3618,14 +3937,14 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, struct radv_pipeline *pipeline, - struct radv_shader_variant *shader, - const struct radv_ngg_state *ngg_state) + struct radv_shader_variant *shader) { uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; gl_shader_stage es_type = radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL ? pipeline->shaders[MESA_SHADER_TESS_EVAL] : pipeline->shaders[MESA_SHADER_VERTEX]; + const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info; radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2); radeon_emit(cs, va >> 8); @@ -3643,7 +3962,7 @@ outinfo->writes_layer || outinfo->writes_viewport_index; bool es_enable_prim_id = outinfo->export_prim_id || - (es && es->info.info.uses_prim_id); + (es && es->info.uses_prim_id); bool break_wave_at_eoi = false; unsigned ge_cntl; unsigned nparams; @@ -3652,7 +3971,7 @@ struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY]; - if (es_enable_prim_id || (gs && gs->info.info.uses_prim_id)) + if (es_enable_prim_id || (gs && gs->info.uses_prim_id)) break_wave_at_eoi = true; } @@ -3693,7 +4012,7 @@ radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | - S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id)); + S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id)); radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, ngg_state->vgt_esgs_ring_itemsize); @@ -3729,7 +4048,7 @@ !radv_pipeline_has_gs(pipeline))); ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) | - S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); /* Bug workaround for a possible hang with non-tessellation cases. @@ -3795,8 +4114,7 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, struct radv_pipeline *pipeline, - const struct radv_tessellation_state *tess, - const struct radv_ngg_state *ngg) + const struct radv_tessellation_state *tess) { struct radv_shader_variant *vs; @@ -3810,7 +4128,7 @@ else if (vs->info.vs.as_es) radv_pipeline_generate_hw_es(cs, pipeline, vs); else if (vs->info.is_ngg) - radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs, ngg); + radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs); else radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs); } @@ -3819,8 +4137,7 @@ radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, struct radv_pipeline *pipeline, - const struct radv_tessellation_state *tess, - const struct radv_ngg_state *ngg) + const struct radv_tessellation_state *tess) { if (!radv_pipeline_has_tess(pipeline)) return; @@ -3832,7 +4149,7 @@ if (tes) { if (tes->info.is_ngg) { - radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes, ngg); + radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes); } else if (tes->info.tes.as_es) radv_pipeline_generate_hw_es(cs, pipeline, tes); else @@ -3864,9 +4181,9 @@ radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, struct radv_pipeline *pipeline, - struct radv_shader_variant *gs, - const struct radv_gs_state *gs_state) + struct radv_shader_variant *gs) { + const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info; unsigned gs_max_out_vertices; uint8_t *num_components; uint8_t max_stream; @@ -3874,8 +4191,8 @@ uint64_t va; gs_max_out_vertices = gs->info.gs.vertices_out; - max_stream = gs->info.info.gs.max_stream; - num_components = gs->info.info.gs.num_stream_output_components; + max_stream = gs->info.gs.max_stream; + num_components = gs->info.gs.num_stream_output_components; offset = num_components[0] * gs_max_out_vertices; @@ -3938,9 +4255,7 @@ static void radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs, - struct radv_pipeline *pipeline, - const struct radv_gs_state *gs_state, - const struct radv_ngg_state *ngg_state) + struct radv_pipeline *pipeline) { struct radv_shader_variant *gs; @@ -3949,21 +4264,28 @@ return; if (gs->info.is_ngg) - radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs, ngg_state); + radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs); else - radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs, gs_state); + radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs); radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out); } -static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16) +static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, + bool explicit, bool float16) { uint32_t ps_input_cntl; if (offset <= AC_EXP_PARAM_OFFSET_31) { ps_input_cntl = S_028644_OFFSET(offset); - if (flat_shade) + if (flat_shade || explicit) ps_input_cntl |= S_028644_FLAT_SHADE(1); + if (explicit) { + /* Force parameter cache to be read in passthrough + * mode. + */ + ps_input_cntl |= S_028644_OFFSET(1 << 5); + } if (float16) { ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1); @@ -3989,53 +4311,63 @@ unsigned ps_offset = 0; - if (ps->info.info.ps.prim_id_input) { + if (ps->info.ps.prim_id_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); ++ps_offset; } } - if (ps->info.info.ps.layer_input || - ps->info.info.needs_multiview_view_index) { + if (ps->info.ps.layer_input || + ps->info.needs_multiview_view_index) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); + else + ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false); + ++ps_offset; + } + + if (ps->info.ps.viewport_index_input) { + unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT]; + if (vs_offset != AC_EXP_PARAM_UNDEFINED) + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); else - ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false); ++ps_offset; } - if (ps->info.info.ps.has_pcoord) { + if (ps->info.ps.has_pcoord) { unsigned val; val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20); ps_input_cntl[ps_offset] = val; ps_offset++; } - if (ps->info.info.ps.num_input_clips_culls) { + if (ps->info.ps.num_input_clips_culls) { unsigned vs_offset; vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false); ++ps_offset; } vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1]; if (vs_offset != AC_EXP_PARAM_UNDEFINED && - ps->info.info.ps.num_input_clips_culls > 4) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); + ps->info.ps.num_input_clips_culls > 4) { + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false); ++ps_offset; } } - for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { + for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) { unsigned vs_offset; bool flat_shade; + bool explicit; bool float16; - if (!(ps->info.fs.input_mask & (1u << i))) + if (!(ps->info.ps.input_mask & (1u << i))) continue; vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i]; @@ -4045,10 +4377,11 @@ continue; } - flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); - float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset)); + flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset)); + explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset)); + float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset)); - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16); ++ps_offset; } @@ -4066,29 +4399,29 @@ const struct radv_shader_variant *ps) { unsigned z_order; - if (ps->info.fs.early_fragment_test || !ps->info.info.ps.writes_memory) + if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory) z_order = V_02880C_EARLY_Z_THEN_LATE_Z; else z_order = V_02880C_LATE_Z; - bool disable_rbplus = device->physical_device->has_rbplus && - !device->physical_device->rbplus_allowed; + bool disable_rbplus = device->physical_device->rad_info.has_rbplus && + !device->physical_device->rad_info.rbplus_allowed; /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled * but this appears to break Project Cars (DXVK). See * https://bugs.freedesktop.org/show_bug.cgi?id=109401 */ - bool mask_export_enable = ps->info.info.ps.writes_sample_mask; + bool mask_export_enable = ps->info.ps.writes_sample_mask; - return S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) | - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) | - S_02880C_KILL_ENABLE(!!ps->info.fs.can_discard) | + return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) | + S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) | S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) | S_02880C_Z_ORDER(z_order) | - S_02880C_DEPTH_BEFORE_SHADER(ps->info.fs.early_fragment_test) | - S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.fs.post_depth_coverage) | - S_02880C_EXEC_ON_HIER_FAIL(ps->info.info.ps.writes_memory) | - S_02880C_EXEC_ON_NOOP(ps->info.info.ps.writes_memory) | + S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) | + S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) | + S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) | + S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) | S_02880C_DUAL_QUAD_DISABLE(disable_rbplus); } @@ -4121,15 +4454,15 @@ ps->config.spi_ps_input_addr); radeon_set_context_reg(ctx_cs, R_0286D8_SPI_PS_IN_CONTROL, - S_0286D8_NUM_INTERP(ps->info.fs.num_interp) | - S_0286D8_PS_W32_EN(ps->info.info.wave_size == 32)); + S_0286D8_NUM_INTERP(ps->info.ps.num_interp) | + S_0286D8_PS_W32_EN(ps->info.wave_size == 32)); radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl); radeon_set_context_reg(ctx_cs, R_028710_SPI_SHADER_Z_FORMAT, - ac_get_spi_shader_z_format(ps->info.info.ps.writes_z, - ps->info.info.ps.writes_stencil, - ps->info.info.ps.writes_sample_mask)); + ac_get_spi_shader_z_format(ps->info.ps.writes_z, + ps->info.ps.writes_stencil, + ps->info.ps.writes_sample_mask)); if (pipeline->device->dfsm_allowed) { /* optimise this? */ @@ -4179,6 +4512,10 @@ if (radv_pipeline_has_ngg(pipeline)) { stages |= S_028B54_PRIMGEN_EN(1); + if (pipeline->streamout_shader) + stages |= S_028B54_NGG_WAVE_ID_EN(1); + if (radv_pipeline_has_ngg_passthrough(pipeline)) + stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1); } else if (radv_pipeline_has_gs(pipeline)) { stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); } @@ -4190,16 +4527,16 @@ uint8_t hs_size = 64, gs_size = 64, vs_size = 64; if (radv_pipeline_has_tess(pipeline)) - hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.wave_size; + hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size; if (pipeline->shaders[MESA_SHADER_GEOMETRY]) { - vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.info.wave_size; + vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size; if (pipeline->gs_copy_shader) - vs_size = pipeline->gs_copy_shader->info.info.wave_size; + vs_size = pipeline->gs_copy_shader->info.wave_size; } else if (pipeline->shaders[MESA_SHADER_TESS_EVAL]) - vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.info.wave_size; + vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size; else if (pipeline->shaders[MESA_SHADER_VERTEX]) - vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.info.wave_size; + vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size; if (radv_pipeline_has_ngg(pipeline)) gs_size = vs_size; @@ -4247,28 +4584,26 @@ static void gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline, - const struct radv_tessellation_state *tess, - const struct radv_gs_state *gs_state) + const struct radv_tessellation_state *tess) { bool break_wave_at_eoi = false; unsigned primgroup_size; - unsigned vertgroup_size; + unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */ if (radv_pipeline_has_tess(pipeline)) { primgroup_size = tess->num_patches; /* must be a multiple of NUM_PATCHES */ - vertgroup_size = 0; } else if (radv_pipeline_has_gs(pipeline)) { + const struct gfx9_gs_info *gs_state = + &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info; unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl; primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); - vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl); } else { primgroup_size = 128; /* recommended without a GS and tess */ - vertgroup_size = 0; } if (radv_pipeline_has_tess(pipeline)) { - if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.uses_prim_id || - radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.info.uses_prim_id) + if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id || + radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id) break_wave_at_eoi = true; } @@ -4285,8 +4620,6 @@ const struct radv_graphics_pipeline_create_info *extra, const struct radv_blend_state *blend, const struct radv_tessellation_state *tess, - const struct radv_gs_state *gs, - const struct radv_ngg_state *ngg, unsigned prim, unsigned gs_out) { struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs; @@ -4302,20 +4635,16 @@ radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo); radv_pipeline_generate_multisample_state(ctx_cs, pipeline); radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline); - radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess, ngg); - radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess, ngg); - radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs, ngg); + radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess); + radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess); + radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline); radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline); radv_pipeline_generate_ps_inputs(ctx_cs, pipeline); radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline); - radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo); + radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo, blend); if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 && !radv_pipeline_has_ngg(pipeline)) - gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess, gs); - - radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE, - S_0286E8_WAVES(pipeline->max_waves) | - S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); + gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline, tess); radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, radv_compute_vgt_shader_stages_en(pipeline)); @@ -4374,15 +4703,15 @@ } ia_multi_vgt_param.ia_switch_on_eoi = false; - if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.prim_id_input) + if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input) ia_multi_vgt_param.ia_switch_on_eoi = true; if (radv_pipeline_has_gs(pipeline) && - pipeline->shaders[MESA_SHADER_GEOMETRY]->info.info.uses_prim_id) + pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id) ia_multi_vgt_param.ia_switch_on_eoi = true; if (radv_pipeline_has_tess(pipeline)) { /* SWITCH_ON_EOI must be set if PrimID is used. */ - if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.info.uses_prim_id || - radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.info.uses_prim_id) + if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id || + radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id) ia_multi_vgt_param.ia_switch_on_eoi = true; } @@ -4395,7 +4724,7 @@ radv_pipeline_has_gs(pipeline)) ia_multi_vgt_param.partial_vs_wave = true; /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */ - if (device->has_distributed_tess) { + if (device->physical_device->rad_info.has_distributed_tess) { if (radv_pipeline_has_gs(pipeline)) { if (device->physical_device->rad_info.chip_class <= GFX8) ia_multi_vgt_param.partial_es_wave = true; @@ -4485,7 +4814,7 @@ struct radv_shader_variant *shader = radv_get_shader(pipeline, i); - if (shader && shader->info.info.so.num_outputs > 0) + if (shader && shader->info.so.num_outputs > 0) return shader; } @@ -4493,6 +4822,197 @@ } static VkResult +radv_secure_compile(struct radv_pipeline *pipeline, + struct radv_device *device, + const struct radv_pipeline_key *key, + const VkPipelineShaderStageCreateInfo **pStages, + const VkPipelineCreateFlags flags, + unsigned num_stages) +{ + uint8_t allowed_pipeline_hashes[2][20]; + radv_hash_shaders(allowed_pipeline_hashes[0], pStages, + pipeline->layout, key, get_hash_flags(device)); + + /* Generate the GC copy hash */ + memcpy(allowed_pipeline_hashes[1], allowed_pipeline_hashes[0], 20); + allowed_pipeline_hashes[1][0] ^= 1; + + uint8_t allowed_hashes[2][20]; + for (unsigned i = 0; i < 2; ++i) { + disk_cache_compute_key(device->physical_device->disk_cache, + allowed_pipeline_hashes[i], 20, + allowed_hashes[i]); + } + + /* Do an early exit if all cache entries are already there. */ + bool may_need_copy_shader = pStages[MESA_SHADER_GEOMETRY]; + void *main_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[0], NULL); + void *copy_entry = NULL; + if (may_need_copy_shader) + copy_entry = disk_cache_get(device->physical_device->disk_cache, allowed_hashes[1], NULL); + + bool has_all_cache_entries = main_entry && (!may_need_copy_shader || copy_entry); + free(main_entry); + free(copy_entry); + + if(has_all_cache_entries) + return VK_SUCCESS; + + unsigned process = 0; + uint8_t sc_threads = device->instance->num_sc_threads; + while (true) { + mtx_lock(&device->sc_state->secure_compile_mutex); + if (device->sc_state->secure_compile_thread_counter < sc_threads) { + device->sc_state->secure_compile_thread_counter++; + for (unsigned i = 0; i < sc_threads; i++) { + if (!device->sc_state->secure_compile_processes[i].in_use) { + device->sc_state->secure_compile_processes[i].in_use = true; + process = i; + break; + } + } + mtx_unlock(&device->sc_state->secure_compile_mutex); + break; + } + mtx_unlock(&device->sc_state->secure_compile_mutex); + } + + int fd_secure_input = device->sc_state->secure_compile_processes[process].fd_secure_input; + int fd_secure_output = device->sc_state->secure_compile_processes[process].fd_secure_output; + + /* Fork a copy of the slim untainted secure compile process */ + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_FORK_DEVICE; + write(fd_secure_input, &sc_type, sizeof(sc_type)); + + if (!radv_sc_read(fd_secure_output, &sc_type, sizeof(sc_type), true) || + sc_type != RADV_SC_TYPE_INIT_SUCCESS) + return VK_ERROR_DEVICE_LOST; + + fd_secure_input = device->sc_state->secure_compile_processes[process].fd_server; + fd_secure_output = device->sc_state->secure_compile_processes[process].fd_client; + + /* Write pipeline / shader module out to secure process via pipe */ + sc_type = RADV_SC_TYPE_COMPILE_PIPELINE; + write(fd_secure_input, &sc_type, sizeof(sc_type)); + + /* Write pipeline layout out to secure process */ + struct radv_pipeline_layout *layout = pipeline->layout; + write(fd_secure_input, layout, sizeof(struct radv_pipeline_layout)); + write(fd_secure_input, &layout->num_sets, sizeof(uint32_t)); + for (uint32_t set = 0; set < layout->num_sets; set++) { + write(fd_secure_input, &layout->set[set].layout->layout_size, sizeof(uint32_t)); + write(fd_secure_input, layout->set[set].layout, layout->set[set].layout->layout_size); + } + + /* Write pipeline key out to secure process */ + write(fd_secure_input, key, sizeof(struct radv_pipeline_key)); + + /* Write pipeline create flags out to secure process */ + write(fd_secure_input, &flags, sizeof(VkPipelineCreateFlags)); + + /* Write stage and shader information out to secure process */ + write(fd_secure_input, &num_stages, sizeof(uint32_t)); + for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { + if (!pStages[i]) + continue; + + /* Write stage out to secure process */ + gl_shader_stage stage = ffs(pStages[i]->stage) - 1; + write(fd_secure_input, &stage, sizeof(gl_shader_stage)); + + /* Write entry point name out to secure process */ + size_t name_size = strlen(pStages[i]->pName) + 1; + write(fd_secure_input, &name_size, sizeof(size_t)); + write(fd_secure_input, pStages[i]->pName, name_size); + + /* Write shader module out to secure process */ + struct radv_shader_module *module = radv_shader_module_from_handle(pStages[i]->module); + assert(!module->nir); + size_t module_size = sizeof(struct radv_shader_module) + module->size; + write(fd_secure_input, &module_size, sizeof(size_t)); + write(fd_secure_input, module, module_size); + + /* Write specialization info out to secure process */ + const VkSpecializationInfo *specInfo = pStages[i]->pSpecializationInfo; + bool has_spec_info = specInfo ? true : false; + write(fd_secure_input, &has_spec_info, sizeof(bool)); + if (specInfo) { + write(fd_secure_input, &specInfo->dataSize, sizeof(size_t)); + write(fd_secure_input, specInfo->pData, specInfo->dataSize); + + write(fd_secure_input, &specInfo->mapEntryCount, sizeof(uint32_t)); + for (uint32_t j = 0; j < specInfo->mapEntryCount; j++) + write(fd_secure_input, &specInfo->pMapEntries[j], sizeof(VkSpecializationMapEntry)); + } + } + + /* Read the data returned from the secure process */ + while (sc_type != RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED) { + if (!radv_sc_read(fd_secure_output, &sc_type, sizeof(sc_type), true)) + return VK_ERROR_DEVICE_LOST; + + if (sc_type == RADV_SC_TYPE_WRITE_DISK_CACHE) { + assert(device->physical_device->disk_cache); + + uint8_t disk_sha1[20]; + if (!radv_sc_read(fd_secure_output, disk_sha1, sizeof(uint8_t) * 20, true)) + return VK_ERROR_DEVICE_LOST; + + if (memcmp(disk_sha1, allowed_hashes[0], 20) && + memcmp(disk_sha1, allowed_hashes[1], 20)) + return VK_ERROR_DEVICE_LOST; + + uint32_t entry_size; + if (!radv_sc_read(fd_secure_output, &entry_size, sizeof(uint32_t), true)) + return VK_ERROR_DEVICE_LOST; + + struct cache_entry *entry = malloc(entry_size); + if (!radv_sc_read(fd_secure_output, entry, entry_size, true)) + return VK_ERROR_DEVICE_LOST; + + disk_cache_put(device->physical_device->disk_cache, + disk_sha1, entry, entry_size, + NULL); + + free(entry); + } else if (sc_type == RADV_SC_TYPE_READ_DISK_CACHE) { + uint8_t disk_sha1[20]; + if (!radv_sc_read(fd_secure_output, disk_sha1, sizeof(uint8_t) * 20, true)) + return VK_ERROR_DEVICE_LOST; + + if (memcmp(disk_sha1, allowed_hashes[0], 20) && + memcmp(disk_sha1, allowed_hashes[1], 20)) + return VK_ERROR_DEVICE_LOST; + + size_t size; + struct cache_entry *entry = (struct cache_entry *) + disk_cache_get(device->physical_device->disk_cache, + disk_sha1, &size); + + uint8_t found = entry ? 1 : 0; + write(fd_secure_input, &found, sizeof(uint8_t)); + + if (found) { + write(fd_secure_input, &size, sizeof(size_t)); + write(fd_secure_input, entry, size); + } + + free(entry); + } + } + + sc_type = RADV_SC_TYPE_DESTROY_DEVICE; + write(fd_secure_input, &sc_type, sizeof(sc_type)); + + mtx_lock(&device->sc_state->secure_compile_mutex); + device->sc_state->secure_compile_thread_counter--; + device->sc_state->secure_compile_processes[process].in_use = false; + mtx_unlock(&device->sc_state->secure_compile_mutex); + + return VK_SUCCESS; +} + +static VkResult radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, struct radv_pipeline_cache *cache, @@ -4529,7 +5049,11 @@ } struct radv_pipeline_key key = radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend, has_view_index); - radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks); + if (radv_device_use_secure_compile(device->instance)) { + return radv_secure_compile(pipeline, device, &key, pStages, pCreateInfo->flags, pCreateInfo->stageCount); + } else { + radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks); + } pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo); @@ -4580,11 +5104,11 @@ */ struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 || - ps->info.fs.can_discard) && + ps->info.ps.can_discard) && !blend.spi_shader_col_format) { - if (!ps->info.info.ps.writes_z && - !ps->info.info.ps.writes_stencil && - !ps->info.info.ps.writes_sample_mask) + if (!ps->info.ps.writes_z && + !ps->info.ps.writes_stencil && + !ps->info.ps.writes_sample_mask) blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R; } @@ -4594,14 +5118,11 @@ } } - struct radv_ngg_state ngg = {0}; - struct radv_gs_state gs = {0}; + if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) { + struct radv_shader_variant *gs = + pipeline->shaders[MESA_SHADER_GEOMETRY]; - if (radv_pipeline_has_ngg(pipeline)) { - ngg = calculate_ngg_info(pCreateInfo, pipeline); - } else if (radv_pipeline_has_gs(pipeline)) { - gs = calculate_gs_info(pCreateInfo, pipeline); - calculate_gs_ring_sizes(pipeline, &gs); + calculate_gs_ring_sizes(pipeline, &gs->info.gs_ring_info); } struct radv_tessellation_state tess = {0}; @@ -4625,7 +5146,7 @@ if (loc->sgpr_idx != -1) { pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX]; pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4; - if (radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id) + if (radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id) pipeline->graphics.vtx_emit_num = 3; else pipeline->graphics.vtx_emit_num = 2; @@ -4635,7 +5156,7 @@ pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline); result = radv_pipeline_scratch_init(device, pipeline); - radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, &gs, &ngg, prim, gs_out); + radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend, &tess, prim, gs_out); return result; } @@ -4709,8 +5230,8 @@ unsigned max_waves_per_sh = 0; uint64_t va; - pipeline->cs.buf = malloc(20 * 4); - pipeline->cs.max_dw = 20; + pipeline->cs.max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 22 : 20; + pipeline->cs.buf = malloc(pipeline->cs.max_dw * 4); compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE]; va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset; @@ -4722,17 +5243,16 @@ radeon_set_sh_reg_seq(&pipeline->cs, R_00B848_COMPUTE_PGM_RSRC1, 2); radeon_emit(&pipeline->cs, compute_shader->config.rsrc1); radeon_emit(&pipeline->cs, compute_shader->config.rsrc2); - - radeon_set_sh_reg(&pipeline->cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(pipeline->max_waves) | - S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); + if (device->physical_device->rad_info.chip_class >= GFX10) { + radeon_set_sh_reg(&pipeline->cs, R_00B8A0_COMPUTE_PGM_RSRC3, compute_shader->config.rsrc3); + } /* Calculate best compute resource limits. */ threads_per_threadgroup = compute_shader->info.cs.block_size[0] * compute_shader->info.cs.block_size[1] * compute_shader->info.cs.block_size[2]; waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, - device->physical_device->cs_wave_size); + compute_shader->info.wave_size); if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1) @@ -4759,12 +5279,23 @@ radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline, const VkComputePipelineCreateInfo *pCreateInfo) { + const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage; struct radv_pipeline_key key; memset(&key, 0, sizeof(key)); if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) key.optimisations_disabled = 1; + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size = + vk_find_struct_const(stage->pNext, + PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT); + + if (subgroup_size) { + assert(subgroup_size->requiredSubgroupSize == 32 || + subgroup_size->requiredSubgroupSize == 64); + key.compute_subgroup_size = subgroup_size->requiredSubgroupSize; + } + return key; } @@ -4804,7 +5335,14 @@ struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo); - radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks); + if (radv_device_use_secure_compile(device->instance)) { + result = radv_secure_compile(pipeline, device, &key, pStages, pCreateInfo->flags, 1); + *pPipeline = radv_pipeline_to_handle(pipeline); + + return result; + } else { + radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, pipeline_feedback, stage_feedbacks); + } pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class); pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets; @@ -4966,6 +5504,7 @@ break; } + pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size; desc_copy(pProperties[executable_idx].name, name); desc_copy(pProperties[executable_idx].description, description); @@ -4977,6 +5516,7 @@ break; pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT; + pProperties[executable_idx].subgroupSize = 64; desc_copy(pProperties[executable_idx].name, "GS Copy Shader"); desc_copy(pProperties[executable_idx].description, "Extra shader stage that loads the GS output ringbuffer into the rasterizer"); @@ -4985,9 +5525,6 @@ } } - for (unsigned i = 0; i < count; ++i) - pProperties[i].subgroupSize = 64; - VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS; *pExecutableCount = count; return result; @@ -5056,7 +5593,7 @@ desc_copy(s->name, "Code size"); desc_copy(s->description, "Code size in bytes"); s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - s->value.u64 = shader->code_size; + s->value.u64 = shader->exec_size; } ++s; @@ -5136,12 +5673,17 @@ } ++p; - /* LLVM IR */ + /* backend IR */ if (p < end) { p->isText = true; - desc_copy(p->name, "LLVM IR"); - desc_copy(p->description, "The LLVM IR after some optimizations"); - if (radv_copy_representation(p->pData, &p->dataSize, shader->llvm_ir_string) != VK_SUCCESS) + if (shader->aco_used) { + desc_copy(p->name, "ACO IR"); + desc_copy(p->description, "The ACO IR after some optimizations"); + } else { + desc_copy(p->name, "LLVM IR"); + desc_copy(p->description, "The LLVM IR after some optimizations"); + } + if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS) result = VK_INCOMPLETE; } ++p; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_pipeline_cache.c mesa-20.0.8/src/amd/vulkan/radv_pipeline_cache.c --- mesa-19.2.8/src/amd/vulkan/radv_pipeline_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_pipeline_cache.c 2020-06-12 01:21:16.000000000 +0000 @@ -243,6 +243,67 @@ return (device->instance->debug_flags & RADV_DEBUG_NO_CACHE); } +/* + * Secure compiles cannot open files so we get the parent process to load the + * cache entry for us. + */ +static struct cache_entry * +radv_sc_read_from_disk_cache(struct radv_device *device, uint8_t *disk_sha1) +{ + struct cache_entry *entry; + unsigned process = device->sc_state->secure_compile_thread_counter; + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_READ_DISK_CACHE; + + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + &sc_type, sizeof(enum radv_secure_compile_type)); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + disk_sha1, sizeof(uint8_t) * 20); + + uint8_t found_cache_entry; + if (!radv_sc_read(device->sc_state->secure_compile_processes[process].fd_secure_input, + &found_cache_entry, sizeof(uint8_t), true)) + return NULL; + + if (found_cache_entry) { + size_t entry_size; + if (!radv_sc_read(device->sc_state->secure_compile_processes[process].fd_secure_input, + &entry_size, sizeof(size_t), true)) + return NULL; + + entry = malloc(entry_size); + if (!radv_sc_read(device->sc_state->secure_compile_processes[process].fd_secure_input, + entry, entry_size, true)) + return NULL; + + return entry; + } + + return NULL; +} + +/* + * Secure compiles cannot open files so we get the parent process to write to + * the disk cache for us. + */ +static void +radv_sc_write_to_disk_cache(struct radv_device *device, uint8_t *disk_sha1, + struct cache_entry *entry) +{ + unsigned process = device->sc_state->secure_compile_thread_counter; + enum radv_secure_compile_type sc_type = RADV_SC_TYPE_WRITE_DISK_CACHE; + + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + &sc_type, sizeof(enum radv_secure_compile_type)); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + disk_sha1, sizeof(uint8_t) * 20); + + uint32_t size = entry_size(entry); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + &size, sizeof(uint32_t)); + write(device->sc_state->secure_compile_processes[process].fd_secure_output, + entry, size); +} + bool radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, struct radv_pipeline_cache *cache, @@ -275,9 +336,15 @@ uint8_t disk_sha1[20]; disk_cache_compute_key(device->physical_device->disk_cache, sha1, 20, disk_sha1); - entry = (struct cache_entry *) - disk_cache_get(device->physical_device->disk_cache, - disk_sha1, NULL); + + if (radv_device_use_secure_compile(device->instance)) { + entry = radv_sc_read_from_disk_cache(device, disk_sha1); + } else { + entry = (struct cache_entry *) + disk_cache_get(device->physical_device->disk_cache, + disk_sha1, NULL); + } + if (!entry) { pthread_mutex_unlock(&cache->mutex); return false; @@ -295,7 +362,9 @@ free(entry); entry = new_entry; - radv_pipeline_cache_add_entry(cache, new_entry); + if (!(device->instance->debug_flags & RADV_DEBUG_NO_MEMORY_CACHE) || + cache != device->mem_cache) + radv_pipeline_cache_add_entry(cache, new_entry); } } @@ -314,11 +383,17 @@ } - for (int i = 0; i < MESA_SHADER_STAGES; ++i) - if (entry->variants[i]) - p_atomic_inc(&entry->variants[i]->ref_count); - memcpy(variants, entry->variants, sizeof(entry->variants)); + + if (device->instance->debug_flags & RADV_DEBUG_NO_MEMORY_CACHE && + cache == device->mem_cache) + vk_free(&cache->alloc, entry); + else { + for (int i = 0; i < MESA_SHADER_STAGES; ++i) + if (entry->variants[i]) + p_atomic_inc(&entry->variants[i]->ref_count); + } + pthread_mutex_unlock(&cache->mutex); return true; } @@ -394,8 +469,24 @@ uint8_t disk_sha1[20]; disk_cache_compute_key(device->physical_device->disk_cache, sha1, 20, disk_sha1); - disk_cache_put(device->physical_device->disk_cache, - disk_sha1, entry, entry_size(entry), NULL); + + /* Write the cache item out to the parent of this forked + * process. + */ + if (radv_device_use_secure_compile(device->instance)) { + radv_sc_write_to_disk_cache(device, disk_sha1, entry); + } else { + disk_cache_put(device->physical_device->disk_cache, + disk_sha1, entry, entry_size(entry), + NULL); + } + } + + if (device->instance->debug_flags & RADV_DEBUG_NO_MEMORY_CACHE && + cache == device->mem_cache) { + vk_free2(&cache->alloc, NULL, entry); + pthread_mutex_unlock(&cache->mutex); + return; } /* We delay setting the variant so we have reproducible disk cache diff -Nru mesa-19.2.8/src/amd/vulkan/radv_private.h mesa-20.0.8/src/amd/vulkan/radv_private.h --- mesa-19.2.8/src/amd/vulkan/radv_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_private.h 2020-06-12 01:21:16.000000000 +0000 @@ -40,7 +40,7 @@ #include #define VG(x) x #else -#define VG(x) +#define VG(x) ((void)0) #endif #include "c11/threads.h" @@ -76,6 +76,7 @@ #include #include +#include #include #include @@ -84,6 +85,19 @@ #include "wsi_common.h" #include "wsi_common_display.h" +/* Helper to determine if we should compile + * any of the Android AHB support. + * + * To actually enable the ext we also need + * the necessary kernel support. + */ +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 +#define RADV_SUPPORT_ANDROID_HARDWARE_BUFFER 1 +#else +#define RADV_SUPPORT_ANDROID_HARDWARE_BUFFER 0 +#endif + + struct gfx10_format { unsigned img_format:9; @@ -107,9 +121,25 @@ RADV_MEM_TYPE_GTT_WRITE_COMBINE, RADV_MEM_TYPE_VRAM_CPU_ACCESS, RADV_MEM_TYPE_GTT_CACHED, + RADV_MEM_TYPE_VRAM_UNCACHED, + RADV_MEM_TYPE_GTT_WRITE_COMBINE_VRAM_UNCACHED, + RADV_MEM_TYPE_VRAM_CPU_ACCESS_UNCACHED, + RADV_MEM_TYPE_GTT_CACHED_VRAM_UNCACHED, RADV_MEM_TYPE_COUNT }; +enum radv_secure_compile_type { + RADV_SC_TYPE_INIT_SUCCESS, + RADV_SC_TYPE_INIT_FAILURE, + RADV_SC_TYPE_COMPILE_PIPELINE, + RADV_SC_TYPE_COMPILE_PIPELINE_FINISHED, + RADV_SC_TYPE_READ_DISK_CACHE, + RADV_SC_TYPE_WRITE_DISK_CACHE, + RADV_SC_TYPE_FORK_DEVICE, + RADV_SC_TYPE_DESTROY_DEVICE, + RADV_SC_TYPE_COUNT +}; + #define radv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) static inline uint32_t @@ -240,7 +270,7 @@ fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ }) #else -#define radv_assert(x) +#define radv_assert(x) do {} while(0) #endif #define stub_return(v) \ @@ -280,33 +310,28 @@ int master_fd; struct wsi_device wsi_device; - bool has_rbplus; /* if RB+ register exist */ - bool rbplus_allowed; /* if RB+ is allowed */ - bool has_clear_state; - bool cpdma_prefetch_writes_memory; - bool has_scissor_bug; - bool has_tc_compat_zrange_bug; - - bool has_out_of_order_rast; bool out_of_order_rast_allowed; /* Whether DCC should be enabled for MSAA textures. */ bool dcc_msaa_allowed; - /* Whether LOAD_CONTEXT_REG packets are supported. */ - bool has_load_ctx_reg_pkt; - /* Whether to enable the AMD_shader_ballot extension */ bool use_shader_ballot; - /* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */ - bool has_dcc_constant_encode; + /* Whether to enable NGG. */ + bool use_ngg; + + /* Whether to enable NGG streamout. */ + bool use_ngg_streamout; /* Number of threads per wave. */ uint8_t ps_wave_size; uint8_t cs_wave_size; uint8_t ge_wave_size; + /* Whether to use the experimental compiler backend */ + bool use_aco; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ @@ -334,6 +359,7 @@ uint64_t debug_flags; uint64_t perftest_flags; + uint8_t num_sc_threads; struct vk_debug_report_instance debug_report_callbacks; @@ -343,6 +369,12 @@ struct driOptionCache available_dri_options; }; +static inline +bool radv_device_use_secure_compile(struct radv_instance *instance) +{ + return instance->num_sc_threads; +} + VkResult radv_init_wsi(struct radv_physical_device *physical_device); void radv_finish_wsi(struct radv_physical_device *physical_device); @@ -383,6 +415,12 @@ uint8_t num_samples; uint32_t has_multiview_view_index : 1; uint32_t optimisations_disabled : 1; + uint8_t topology; + + /* Non-zero if a required subgroup size is specified via + * VK_EXT_subgroup_size_control. + */ + uint8_t compute_subgroup_size; }; struct radv_shader_binary; @@ -465,10 +503,15 @@ VkPipeline depth_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; VkPipeline stencil_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; VkPipeline depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + + VkPipeline depth_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + VkPipeline stencil_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; + VkPipeline depthstencil_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES]; } clear[MAX_SAMPLES_LOG2]; VkPipelineLayout clear_color_p_layout; VkPipelineLayout clear_depth_p_layout; + VkPipelineLayout clear_depth_unrestricted_p_layout; /* Optimized compute fast HTILE clear for stencil or depth only. */ VkPipeline clear_htile_mask_pipeline; @@ -610,7 +653,7 @@ struct { VkPipelineLayout p_layout; - VkPipeline decompress_pipeline; + VkPipeline decompress_pipeline[NUM_DEPTH_DECOMPRESS_PIPELINES]; VkPipeline resummarize_pipeline; VkRenderPass pass; } depth_decomp[MAX_SAMPLES_LOG2]; @@ -642,6 +685,7 @@ VkPipeline occlusion_query_pipeline; VkPipeline pipeline_statistics_query_pipeline; VkPipeline tfb_query_pipeline; + VkPipeline timestamp_query_pipeline; } query; struct { @@ -669,11 +713,15 @@ int queue_idx; VkDeviceQueueCreateFlags flags; - uint32_t scratch_size; - uint32_t compute_scratch_size; + uint32_t scratch_size_per_wave; + uint32_t scratch_waves; + uint32_t compute_scratch_size_per_wave; + uint32_t compute_scratch_waves; uint32_t esgs_ring_size; uint32_t gsvs_ring_size; bool has_tess_rings; + bool has_gds; + bool has_gds_oa; bool has_sample_positions; struct radeon_winsys_bo *scratch_bo; @@ -682,9 +730,14 @@ struct radeon_winsys_bo *esgs_ring_bo; struct radeon_winsys_bo *gsvs_ring_bo; struct radeon_winsys_bo *tess_rings_bo; + struct radeon_winsys_bo *gds_bo; + struct radeon_winsys_bo *gds_oa_bo; struct radeon_cmdbuf *initial_preamble_cs; struct radeon_cmdbuf *initial_full_flush_preamble_cs; struct radeon_cmdbuf *continue_preamble_cs; + + struct list_head pending_submissions; + pthread_mutex_t pending_mutex; }; struct radv_bo_list { @@ -693,6 +746,36 @@ pthread_mutex_t mutex; }; +struct radv_secure_compile_process { + /* Secure process file descriptors. Used to communicate between the + * user facing device and the idle forked device used to fork a clean + * process for each new pipeline compile. + */ + int fd_secure_input; + int fd_secure_output; + + /* FIFO file descriptors used to communicate between the user facing + * device and the secure process that does the actual secure compile. + */ + int fd_server; + int fd_client; + + /* Secure compile process id */ + pid_t sc_pid; + + /* Is the secure compile process currently in use by a thread */ + bool in_use; +}; + +struct radv_secure_compile_state { + struct radv_secure_compile_process *secure_compile_processes; + uint32_t secure_compile_thread_counter; + mtx_t secure_compile_mutex; + + /* Unique process ID used to build name for FIFO file descriptor */ + char *uid; +}; + struct radv_device { VK_LOADER_DATA _loader_data; @@ -708,7 +791,6 @@ struct radeon_cmdbuf *empty_cs[RADV_MAX_QUEUE_FAMILIES]; bool always_use_syncobj; - bool has_distributed_tess; bool pbb_allowed; bool dfsm_allowed; uint32_t tess_offchip_block_dw_size; @@ -764,6 +846,12 @@ /* Whether anisotropy is forced with RADV_TEX_ANISO (-1 is disabled). */ int force_aniso; + + struct radv_secure_compile_state *sc_state; + + /* Condition variable for legacy timelines, to notify waiters when a + * new point gets submitted. */ + pthread_cond_t timeline_cond; }; struct radv_device_memory { @@ -775,6 +863,10 @@ VkDeviceSize map_size; void * map; void * user_ptr; + +#if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER + struct AHardwareBuffer * android_hardware_buffer; +#endif }; @@ -786,6 +878,7 @@ struct radv_descriptor_set { const struct radv_descriptor_set_layout *layout; uint32_t size; + uint32_t buffer_count; struct radeon_winsys_bo *bo; uint64_t va; @@ -1091,6 +1184,9 @@ struct radv_ds_buffer_info *ds, struct radv_image_view *iview); +bool +radv_sc_read(int fd, void *buf, size_t size, bool timeout); + /** * Attachment state when recording a renderpass instance. * @@ -1101,6 +1197,7 @@ uint32_t cleared_views; VkClearValue clear_value; VkImageLayout current_layout; + VkImageLayout current_stencil_layout; bool current_in_render_loop; struct radv_sample_locations_state sample_location; @@ -1164,6 +1261,7 @@ unsigned active_occlusion_queries; bool perfect_occlusion_queries_enabled; unsigned active_pipeline_queries; + unsigned active_pipeline_gds_queries; float offset_scale; uint32_t trace_id; uint32_t last_ia_multi_vgt_param; @@ -1179,6 +1277,9 @@ int predication_type; /* -1: disabled, 0: normal, 1: inverted */ uint64_t predication_va; + /* Inheritance info. */ + VkQueryPipelineStatisticFlags inherited_pipeline_statistics; + bool context_roll_without_scissor_emitted; }; @@ -1230,11 +1331,15 @@ struct radv_cmd_buffer_upload upload; - uint32_t scratch_size_needed; - uint32_t compute_scratch_size_needed; + uint32_t scratch_size_per_wave_needed; + uint32_t scratch_waves_wanted; + uint32_t compute_scratch_size_per_wave_needed; + uint32_t compute_scratch_waves_wanted; uint32_t esgs_ring_size_needed; uint32_t gsvs_ring_size_needed; bool tess_rings_needed; + bool gds_needed; /* for GFX10 streamout and NGG GS queries */ + bool gds_oa_needed; /* for GFX10 streamout */ bool sample_positions_needed; VkResult record_result; @@ -1324,17 +1429,17 @@ void radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer); void radv_depth_stencil_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode); + VkResolveModeFlagBits resolve_mode); void radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer); void radv_depth_stencil_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer, VkImageAspectFlags aspects, - VkResolveModeFlagBitsKHR resolve_mode); + VkResolveModeFlagBits resolve_mode); void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples); unsigned radv_get_default_max_sample_dist(int log_samples); void radv_device_init_msaa(struct radv_device *device); void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, - struct radv_image *image, + const struct radv_image_view *iview, VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects); @@ -1424,11 +1529,11 @@ #define RADV_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0) #define RADV_HASH_SHADER_SISCHED (1 << 1) -#define RADV_HASH_SHADER_UNSAFE_MATH (1 << 2) -#define RADV_HASH_SHADER_NO_NGG (1 << 3) -#define RADV_HASH_SHADER_CS_WAVE32 (1 << 4) -#define RADV_HASH_SHADER_PS_WAVE32 (1 << 5) -#define RADV_HASH_SHADER_GE_WAVE32 (1 << 6) +#define RADV_HASH_SHADER_NO_NGG (1 << 2) +#define RADV_HASH_SHADER_CS_WAVE32 (1 << 3) +#define RADV_HASH_SHADER_PS_WAVE32 (1 << 4) +#define RADV_HASH_SHADER_GE_WAVE32 (1 << 5) +#define RADV_HASH_SHADER_ACO (1 << 6) void radv_hash_shaders(unsigned char *hash, @@ -1558,6 +1663,8 @@ bool radv_pipeline_has_ngg(const struct radv_pipeline *pipeline); +bool radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline); + bool radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline); struct radv_userdata_info *radv_lookup_user_sgpr(struct radv_pipeline *pipeline, @@ -1587,6 +1694,15 @@ const VkAllocationCallbacks *alloc, VkPipeline *pPipeline); +struct radv_binning_settings { + unsigned context_states_per_bin; /* allowed range: [1, 6] */ + unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ + unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ +}; + +struct radv_binning_settings +radv_get_binning_settings(const struct radv_physical_device *pdev); + struct vk_format_description; uint32_t radv_translate_buffer_dataformat(const struct vk_format_description *desc, int first_non_void); @@ -1808,6 +1924,24 @@ return va; } +static inline uint64_t +radv_get_tc_compat_zrange_va(const struct radv_image *image, + uint32_t base_level) +{ + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->tc_compat_zrange_offset + base_level * 4; + return va; +} + +static inline uint64_t +radv_get_ds_clear_value_va(const struct radv_image *image, + uint32_t base_level) +{ + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->clear_value_offset + base_level * 8; + return va; +} + unsigned radv_image_queue_family_mask(const struct radv_image *image, uint32_t family, uint32_t queue_family); static inline uint32_t @@ -1877,6 +2011,11 @@ const struct radeon_bo_metadata *bo_metadata; }; +VkResult +radv_image_create_layout(struct radv_device *device, + struct radv_image_create_info create_info, + struct radv_image *image); + VkResult radv_image_create(VkDevice _device, const struct radv_image_create_info *info, const VkAllocationCallbacks* alloc, @@ -1890,6 +2029,24 @@ const VkNativeBufferANDROID *gralloc_info, const VkAllocationCallbacks *alloc, VkImage *out_image_h); +uint64_t +radv_ahb_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage); +VkResult +radv_import_ahb_memory(struct radv_device *device, + struct radv_device_memory *mem, + unsigned priority, + const VkImportAndroidHardwareBufferInfoANDROID *info); +VkResult +radv_create_ahb_memory(struct radv_device *device, + struct radv_device_memory *mem, + unsigned priority, + const VkMemoryAllocateInfo *pAllocateInfo); + +VkFormat +radv_select_android_external_format(const void *next, VkFormat default_format); + +bool radv_android_gralloc_supports_format(VkFormat format, VkImageUsageFlagBits usage); struct radv_image_view_extra_create_info { bool disable_compression; @@ -1990,6 +2147,7 @@ struct radv_subpass_attachment { uint32_t attachment; VkImageLayout layout; + VkImageLayout stencil_layout; bool in_render_loop; }; @@ -2004,8 +2162,8 @@ struct radv_subpass_attachment * resolve_attachments; struct radv_subpass_attachment * depth_stencil_attachment; struct radv_subpass_attachment * ds_resolve_attachment; - VkResolveModeFlagBitsKHR depth_resolve_mode; - VkResolveModeFlagBitsKHR stencil_resolve_mode; + VkResolveModeFlagBits depth_resolve_mode; + VkResolveModeFlagBits stencil_resolve_mode; /** Subpass has at least one color resolve attachment */ bool has_color_resolve; @@ -2016,6 +2174,9 @@ struct radv_subpass_barrier start_barrier; uint32_t view_mask; + + VkSampleCountFlagBits color_sample_count; + VkSampleCountFlagBits depth_sample_count; VkSampleCountFlagBits max_sample_count; }; @@ -2029,6 +2190,8 @@ VkAttachmentLoadOp stencil_load_op; VkImageLayout initial_layout; VkImageLayout final_layout; + VkImageLayout stencil_initial_layout; + VkImageLayout stencil_final_layout; /* The subpass id in which the attachment will be used first/last. */ uint32_t first_subpass_idx; @@ -2057,11 +2220,62 @@ uint32_t pipeline_stats_mask; }; -struct radv_semaphore { - /* use a winsys sem for non-exportable */ - struct radeon_winsys_sem *sem; +typedef enum { + RADV_SEMAPHORE_NONE, + RADV_SEMAPHORE_WINSYS, + RADV_SEMAPHORE_SYNCOBJ, + RADV_SEMAPHORE_TIMELINE, +} radv_semaphore_kind; + +struct radv_deferred_queue_submission; + +struct radv_timeline_waiter { + struct list_head list; + struct radv_deferred_queue_submission *submission; + uint64_t value; +}; + +struct radv_timeline_point { + struct list_head list; + + uint64_t value; uint32_t syncobj; - uint32_t temp_syncobj; + + /* Separate from the list to accomodate CPU wait being async, as well + * as prevent point deletion during submission. */ + unsigned wait_count; +}; + +struct radv_timeline { + /* Using a pthread mutex to be compatible with condition variables. */ + pthread_mutex_t mutex; + + uint64_t highest_signaled; + uint64_t highest_submitted; + + struct list_head points; + + /* Keep free points on hand so we do not have to recreate syncobjs all + * the time. */ + struct list_head free_points; + + /* Submissions that are deferred waiting for a specific value to be + * submitted. */ + struct list_head waiters; +}; + +struct radv_semaphore_part { + radv_semaphore_kind kind; + union { + uint32_t syncobj; + struct radeon_winsys_sem *ws_sem; + struct radv_timeline timeline; + }; +}; + +struct radv_semaphore { + struct radv_semaphore_part permanent; + struct radv_semaphore_part temporary; }; void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, @@ -2109,21 +2323,18 @@ }; /* radv_nir_to_llvm.c */ -struct radv_shader_variant_info; -struct radv_nir_compiler_options; +struct radv_shader_args; void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, - const struct radv_nir_compiler_options *option); + const struct radv_shader_args *args); void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, + const struct radv_shader_args *args, struct nir_shader *const *nir, - int nir_count, - const struct radv_nir_compiler_options *options); + int nir_count); unsigned radv_nir_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, @@ -2131,9 +2342,11 @@ /* radv_shader_info.h */ struct radv_shader_info; +struct radv_shader_variant_key; void radv_nir_shader_info_pass(const struct nir_shader *nir, - const struct radv_nir_compiler_options *options, + const struct radv_pipeline_layout *layout, + const struct radv_shader_variant_key *key, struct radv_shader_info *info); void radv_nir_shader_info_init(struct radv_shader_info *info); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_query.c mesa-20.0.8/src/amd/vulkan/radv_query.c --- mesa-19.2.8/src/amd/vulkan/radv_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_query.c 2020-06-12 01:21:16.000000000 +0000 @@ -40,6 +40,14 @@ static const int pipelinestat_block_size = 11 * 8; static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10}; +static unsigned +radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag) +{ + int offset = ffs(flag) - 1; + assert(offset < ARRAY_SIZE(pipeline_statistics_indices)); + return pipeline_statistics_indices[offset]; +} + static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag) { return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag))); @@ -76,6 +84,49 @@ return &flags->dest.ssa; } +static void +radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf, + nir_ssa_def *offset, nir_ssa_def *value32) +{ + nir_ssa_def *result_is_64bit = nir_test_flag(b, flags, VK_QUERY_RESULT_64_BIT); + nir_if *availability_if = nir_if_create(b->shader); + availability_if->condition = nir_src_for_ssa(nir_test_flag(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); + nir_cf_node_insert(b->cursor, &availability_if->cf_node); + + b->cursor = nir_after_cf_list(&availability_if->then_list); + + + nir_if *store_64bit_if = nir_if_create(b->shader); + store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); + nir_cf_node_insert(b->cursor, &store_64bit_if->cf_node); + + b->cursor = nir_after_cf_list(&store_64bit_if->then_list); + + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(nir_vec2(b, value32, nir_imm_int(b, 0))); + store->src[1] = nir_src_for_ssa(dst_buf); + store->src[2] = nir_src_for_ssa(offset); + nir_intrinsic_set_write_mask(store, 0x3); + nir_intrinsic_set_align(store, 8, 0); + store->num_components = 2; + nir_builder_instr_insert(b, &store->instr); + + b->cursor = nir_after_cf_list(&store_64bit_if->else_list); + + store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(value32); + store->src[1] = nir_src_for_ssa(dst_buf); + store->src[2] = nir_src_for_ssa(offset); + nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 4, 0); + store->num_components = 1; + nir_builder_instr_insert(b, &store->instr); + + b->cursor = nir_after_cf_node(&store_64bit_if->cf_node); + + b->cursor = nir_after_cf_node(&availability_if->cf_node); +} + static nir_shader * build_occlusion_query_shader(struct radv_device *device) { /* the shader this builds is roughly @@ -196,6 +247,7 @@ load->src[1] = nir_src_for_ssa(load_offset); nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL); load->num_components = 2; + nir_intrinsic_set_align(load, 16, 0); nir_builder_instr_insert(&b, &load->instr); nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1); @@ -243,6 +295,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_base); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 8, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); @@ -253,26 +306,15 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_base); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 4, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); b.cursor = nir_after_cf_node(&store_if->cf_node); - /* Store the availability bit if requested. */ - - nir_if *availability_if = nir_if_create(b.shader); - availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); - nir_cf_node_insert(b.cursor, &availability_if->cf_node); - - b.cursor = nir_after_cf_list(&availability_if->then_list); - - store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available))); - store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); - store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base)); - nir_intrinsic_set_write_mask(store, 0x1); - store->num_components = 1; - nir_builder_instr_insert(&b, &store->instr); + radv_store_availability(&b, flags, &dst_buf->dest.ssa, + nir_iadd(&b, result_size, output_base), + nir_b2i32(&b, nir_load_var(&b, available))); return b.shader; } @@ -376,6 +418,7 @@ load->src[1] = nir_src_for_ssa(avail_offset); nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); load->num_components = 1; + nir_intrinsic_set_align(load, 4, 0); nir_builder_instr_insert(&b, &load->instr); nir_ssa_def *available32 = &load->dest.ssa; @@ -383,23 +426,9 @@ nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16)); - /* Store the availability bit if requested. */ - - nir_if *availability_if = nir_if_create(b.shader); - availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); - nir_cf_node_insert(b.cursor, &availability_if->cf_node); - - b.cursor = nir_after_cf_list(&availability_if->then_list); - - nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(available32); - store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); - store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size))); - nir_intrinsic_set_write_mask(store, 0x1); - store->num_components = 1; - nir_builder_instr_insert(&b, &store->instr); - - b.cursor = nir_after_cf_node(&availability_if->cf_node); + radv_store_availability(&b, flags, &dst_buf->dest.ssa, + nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)), + available32); nir_if *available_if = nir_if_create(b.shader); available_if->condition = nir_src_for_ssa(nir_i2b(&b, available32)); @@ -421,6 +450,7 @@ nir_imm_int(&b, pipeline_statistics_indices[i] * 8))); nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL); load->num_components = 1; + nir_intrinsic_set_align(load, 8, 0); nir_builder_instr_insert(&b, &load->instr); nir_ssa_def *start = &load->dest.ssa; @@ -430,6 +460,7 @@ nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size))); nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL); load->num_components = 1; + nir_intrinsic_set_align(load, 8, 0); nir_builder_instr_insert(&b, &load->instr); nir_ssa_def *end = &load->dest.ssa; @@ -447,6 +478,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset)); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 8, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); @@ -457,6 +489,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset)); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 4, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); @@ -498,11 +531,12 @@ b.cursor = nir_after_cf_list(&store_64bit_if->then_list); - store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0)); store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_elem); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 8, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); @@ -513,6 +547,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_elem); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 4, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); @@ -621,6 +656,7 @@ load1->src[1] = nir_src_for_ssa(input_base); nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL); load1->num_components = 4; + nir_intrinsic_set_align(load1, 32, 0); nir_builder_instr_insert(&b, &load1->instr); nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); @@ -628,6 +664,7 @@ load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16))); nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL); load2->num_components = 4; + nir_intrinsic_set_align(load2, 16, 0); nir_builder_instr_insert(&b, &load2->instr); /* Check if result is available. */ @@ -703,6 +740,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_base); nir_intrinsic_set_write_mask(store, 0x3); + nir_intrinsic_set_align(store, 8, 0); store->num_components = 2; nir_builder_instr_insert(&b, &store->instr); @@ -713,6 +751,7 @@ store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); store->src[2] = nir_src_for_ssa(output_base); nir_intrinsic_set_write_mask(store, 0x3); + nir_intrinsic_set_align(store, 4, 0); store->num_components = 2; nir_builder_instr_insert(&b, &store->instr); @@ -720,23 +759,181 @@ b.cursor = nir_after_cf_node(&store_if->cf_node); - /* Store the availability bit if requested. */ - nir_if *availability_if = nir_if_create(b.shader); - availability_if->condition = - nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)); - nir_cf_node_insert(b.cursor, &availability_if->cf_node); + radv_store_availability(&b, flags, &dst_buf->dest.ssa, + nir_iadd(&b, result_size, output_base), + nir_b2i32(&b, nir_load_var(&b, available))); + + return b.shader; +} + +static nir_shader * +build_timestamp_query_shader(struct radv_device *device) +{ + /* the shader this builds is roughly + * + * uint32_t src_stride = 8; + * + * location(binding = 0) buffer dst_buf; + * location(binding = 1) buffer src_buf; + * + * void main() { + * uint64_t result = 0; + * bool available = false; + * uint64_t src_offset = src_stride * global_id.x; + * uint64_t dst_offset = dst_stride * global_id.x; + * uint64_t timestamp = src_buf[src_offset]; + * if (timestamp != TIMESTAMP_NOT_READY) { + * result = timestamp; + * available = true; + * } + * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4; + * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { + * if (flags & VK_QUERY_RESULT_64_BIT) { + * dst_buf[dst_offset] = result; + * } else { + * dst_buf[dst_offset] = (uint32_t)result; + * } + * } + * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + * dst_buf[dst_offset + result_size] = available; + * } + * } + */ + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "timestamp_query"); + b.shader->info.cs.local_size[0] = 64; + b.shader->info.cs.local_size[1] = 1; + b.shader->info.cs.local_size[2] = 1; + + /* Create and initialize local variables. */ + nir_variable *result = + nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result"); + nir_variable *available = + nir_local_variable_create(b.impl, glsl_bool_type(), "available"); + + nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1); + nir_store_var(&b, available, nir_imm_false(&b), 0x1); + + nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags"); + + /* Load resources. */ + nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + dst_buf->num_components = 1; + nir_intrinsic_set_desc_set(dst_buf, 0); + nir_intrinsic_set_binding(dst_buf, 0); + nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL); + nir_builder_instr_insert(&b, &dst_buf->instr); + + nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + src_buf->num_components = 1; + nir_intrinsic_set_desc_set(src_buf, 0); + nir_intrinsic_set_binding(src_buf, 1); + nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL); + nir_builder_instr_insert(&b, &src_buf->instr); + + /* Compute global ID. */ + nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); + nir_ssa_def *wg_id = nir_load_work_group_id(&b); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + global_id = nir_channel(&b, global_id, 0); // We only care about x here. + + /* Compute src/dst strides. */ + nir_ssa_def *input_stride = nir_imm_int(&b, 8); + nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); + nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride"); + nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); + + /* Load data from the query pool. */ + nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); + load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); + load->src[1] = nir_src_for_ssa(input_base); + nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL); + load->num_components = 2; + nir_intrinsic_set_align(load, 8, 0); + nir_builder_instr_insert(&b, &load->instr); + + /* Pack the timestamp. */ + nir_ssa_def *timestamp; + timestamp = nir_pack_64_2x32(&b, nir_vec2(&b, + nir_channel(&b, &load->dest.ssa, 0), + nir_channel(&b, &load->dest.ssa, 1))); + + /* Check if result is available. */ + nir_ssa_def *result_is_available = + nir_i2b(&b, nir_ine(&b, timestamp, + nir_imm_int64(&b, TIMESTAMP_NOT_READY))); + + /* Only store result if available. */ + nir_if *available_if = nir_if_create(b.shader); + available_if->condition = nir_src_for_ssa(result_is_available); + nir_cf_node_insert(b.cursor, &available_if->cf_node); + + b.cursor = nir_after_cf_list(&available_if->then_list); + + nir_store_var(&b, result, timestamp, 0x1); + nir_store_var(&b, available, nir_imm_true(&b), 0x1); - b.cursor = nir_after_cf_list(&availability_if->then_list); + b.cursor = nir_after_cf_node(&available_if->cf_node); + + /* Determine if result is 64 or 32 bit. */ + nir_ssa_def *result_is_64bit = + nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT); + nir_ssa_def *result_size = + nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), + nir_imm_int(&b, 4)); + + /* Store the result if complete or partial results have been requested. */ + nir_if *store_if = nir_if_create(b.shader); + store_if->condition = + nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), + nir_load_var(&b, available))); + nir_cf_node_insert(b.cursor, &store_if->cf_node); + + b.cursor = nir_after_cf_list(&store_if->then_list); + + /* Store result. */ + nir_if *store_64bit_if = nir_if_create(b.shader); + store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); + nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node); + + b.cursor = nir_after_cf_list(&store_64bit_if->then_list); + + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(nir_load_var(&b, result)); + store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); + store->src[2] = nir_src_for_ssa(output_base); + nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 8, 0); + store->num_components = 1; + nir_builder_instr_insert(&b, &store->instr); + + b.cursor = nir_after_cf_list(&store_64bit_if->else_list); store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available))); + store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result))); store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); - store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base)); + store->src[2] = nir_src_for_ssa(output_base); nir_intrinsic_set_write_mask(store, 0x1); + nir_intrinsic_set_align(store, 4, 0); store->num_components = 1; nir_builder_instr_insert(&b, &store->instr); - b.cursor = nir_after_cf_node(&availability_if->cf_node); + b.cursor = nir_after_cf_node(&store_64bit_if->cf_node); + + b.cursor = nir_after_cf_node(&store_if->cf_node); + + radv_store_availability(&b, flags, &dst_buf->dest.ssa, + nir_iadd(&b, result_size, output_base), + nir_b2i32(&b, nir_load_var(&b, available))); return b.shader; } @@ -747,6 +944,7 @@ struct radv_shader_module occlusion_cs = { .nir = NULL }; struct radv_shader_module pipeline_statistics_cs = { .nir = NULL }; struct radv_shader_module tfb_cs = { .nir = NULL }; + struct radv_shader_module timestamp_cs = { .nir = NULL }; mtx_lock(&device->meta_state.mtx); if (device->meta_state.query.pipeline_statistics_query_pipeline) { @@ -756,6 +954,7 @@ occlusion_cs.nir = build_occlusion_query_shader(device); pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device); tfb_cs.nir = build_tfb_query_shader(device); + timestamp_cs.nir = build_timestamp_query_shader(device); VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -864,12 +1063,36 @@ radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline); + if (result != VK_SUCCESS) + goto fail; + + VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = radv_shader_module_to_handle(×tamp_cs), + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo timestamp_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = timestamp_pipeline_shader_stage, + .flags = 0, + .layout = device->meta_state.query.p_layout, + }; + + result = radv_CreateComputePipelines(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + 1, ×tamp_pipeline_info, NULL, + &device->meta_state.query.timestamp_query_pipeline); + fail: if (result != VK_SUCCESS) radv_device_finish_meta_query_state(device); ralloc_free(occlusion_cs.nir); ralloc_free(pipeline_statistics_cs.nir); ralloc_free(tfb_cs.nir); + ralloc_free(timestamp_cs.nir); mtx_unlock(&device->meta_state.mtx); return result; } @@ -899,6 +1122,11 @@ device->meta_state.query.occlusion_query_pipeline, &device->meta_state.alloc); + if (device->meta_state.query.timestamp_query_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.query.timestamp_query_pipeline, + &device->meta_state.alloc); + if (device->meta_state.query.p_layout) radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout, @@ -1026,6 +1254,22 @@ radv_meta_restore(&saved_state, cmd_buffer); } +static bool +radv_query_pool_needs_gds(struct radv_device *device, + struct radv_query_pool *pool) +{ + /* The number of primitives generated by geometry shader invocations is + * only counted by the hardware if GS uses the legacy path. When NGG GS + * is used, the hardware can't know the number of generated primitives + * and we have to it manually inside the shader. To achieve that, the + * driver does a plain GDS atomic to accumulate that value. + * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end + * query. + */ + return device->physical_device->use_ngg && + (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); +} + VkResult radv_CreateQueryPool( VkDevice _device, const VkQueryPoolCreateInfo* pCreateInfo, @@ -1308,7 +1552,6 @@ RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); struct radeon_cmdbuf *cs = cmd_buffer->cs; - unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4; uint64_t va = radv_buffer_get_va(pool->bo); uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo); dest_va += dst_buffer->offset + dstOffset; @@ -1368,14 +1611,13 @@ pool->availability_offset + 4 * firstQuery); break; case VK_QUERY_TYPE_TIMESTAMP: - for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { - unsigned query = firstQuery + i; - uint64_t local_src_va = va + query * pool->stride; - - ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 19); + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) { + unsigned query = firstQuery + i; + uint64_t local_src_va = va + query * pool->stride; + radeon_check_space(cmd_buffer->device->ws, cs, 7); - if (flags & VK_QUERY_RESULT_WAIT_BIT) { /* Wait on the high 32 bits of the timestamp in * case the low part is 0xffffffff. */ @@ -1384,30 +1626,14 @@ TIMESTAMP_NOT_READY >> 32, 0xffffffff); } - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { - uint64_t avail_dest_va = dest_va + elem_size; - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM)); - radeon_emit(cs, local_src_va); - radeon_emit(cs, local_src_va >> 32); - radeon_emit(cs, avail_dest_va); - radeon_emit(cs, avail_dest_va >> 32); - } - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) | - ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0)); - radeon_emit(cs, local_src_va); - radeon_emit(cs, local_src_va >> 32); - radeon_emit(cs, dest_va); - radeon_emit(cs, dest_va >> 32); - - - assert(cs->cdw <= cdw_max); } + + radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline, + pool->bo, dst_buffer->bo, + firstQuery * pool->stride, + dst_buffer->offset + dstOffset, + pool->stride, stride, + queryCount, flags, 0, 0); break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: if (flags & VK_QUERY_RESULT_WAIT_BIT) { @@ -1474,7 +1700,7 @@ } } -void radv_ResetQueryPoolEXT( +void radv_ResetQueryPool( VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, @@ -1508,6 +1734,7 @@ } static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, + struct radv_query_pool *pool, uint64_t va, VkQueryType query_type, VkQueryControlFlags flags, @@ -1559,6 +1786,30 @@ radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + + if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { + int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + /* Make sure GDS is idle before copying the value. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_L2; + si_emit_cache_flush(cmd_buffer); + + va += 8 * idx; + + si_cs_emit_write_event_eop(cs, + cmd_buffer->device->physical_device->rad_info.chip_class, + radv_cmd_buffer_uses_mec(cmd_buffer), + V_028A90_PS_DONE, 0, + EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, + va, EOP_DATA_GDS(0, 1), 0); + + /* Record that the command buffer needs GDS. */ + cmd_buffer->gds_needed = true; + + cmd_buffer->state.active_pipeline_gds_queries++; + } break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: radeon_check_space(cmd_buffer->device->ws, cs, 4); @@ -1577,6 +1828,7 @@ } static void emit_end_query(struct radv_cmd_buffer *cmd_buffer, + struct radv_query_pool *pool, uint64_t va, uint64_t avail_va, VkQueryType query_type, uint32_t index) { @@ -1624,6 +1876,27 @@ EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va); + + if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) { + int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + /* Make sure GDS is idle before copying the value. */ + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_L2; + si_emit_cache_flush(cmd_buffer); + + va += 8 * idx; + + si_cs_emit_write_event_eop(cs, + cmd_buffer->device->physical_device->rad_info.chip_class, + radv_cmd_buffer_uses_mec(cmd_buffer), + V_028A90_PS_DONE, 0, + EOP_DST_SEL_TC_L2, + EOP_DATA_SEL_GDS, + va, EOP_DATA_GDS(0, 1), 0); + + cmd_buffer->state.active_pipeline_gds_queries--; + } break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: radeon_check_space(cmd_buffer->device->ws, cs, 4); @@ -1667,7 +1940,7 @@ va += pool->stride * query; - emit_begin_query(cmd_buffer, va, pool->type, flags, index); + emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index); } void radv_CmdBeginQuery( @@ -1694,7 +1967,7 @@ /* Do not need to add the pool BO to the list because the query must * currently be active, which means the BO is already in the list. */ - emit_end_query(cmd_buffer, va, avail_va, pool->type, index); + emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index); /* * For multiview we have to emit a query for each bit in the mask, @@ -1711,8 +1984,8 @@ for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) { va += pool->stride; avail_va += 4; - emit_begin_query(cmd_buffer, va, pool->type, 0, 0); - emit_end_query(cmd_buffer, va, avail_va, pool->type, 0); + emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0); + emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0); } } } @@ -1773,5 +2046,15 @@ } query_va += pool->stride; } + + cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_L2 | + RADV_CMD_FLAG_INV_VCACHE; + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB; + } + assert(cmd_buffer->cs->cdw <= cdw_max); } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_radeon_winsys.h mesa-20.0.8/src/amd/vulkan/radv_radeon_winsys.h --- mesa-19.2.8/src/amd/vulkan/radv_radeon_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_radeon_winsys.h 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,8 @@ #include #include #include "main/macros.h" +#include +#include #include "amd_family.h" struct radeon_info; @@ -45,7 +47,9 @@ enum radeon_bo_domain { /* bitfield */ RADEON_DOMAIN_GTT = 2, RADEON_DOMAIN_VRAM = 4, - RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT + RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT, + RADEON_DOMAIN_GDS = 8, + RADEON_DOMAIN_OA = 16, }; enum radeon_bo_flag { /* bitfield */ @@ -67,15 +71,6 @@ RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE }; -enum ring_type { - RING_GFX = 0, - RING_COMPUTE, - RING_DMA, - RING_UVD, - RING_VCE, - RING_LAST, -}; - enum radeon_ctx_priority { RADEON_CTX_PRIORITY_INVALID = -1, RADEON_CTX_PRIORITY_LOW = 0, @@ -151,6 +146,7 @@ struct { /* surface flags */ unsigned swizzle_mode:5; + bool scanout; } gfx9; } u; @@ -162,8 +158,8 @@ uint32_t metadata[64]; }; -uint32_t syncobj_handle; struct radeon_winsys_fence; +struct radeon_winsys_ctx; struct radeon_winsys_bo { uint64_t va; @@ -242,7 +238,7 @@ struct radeon_winsys_bo *(*buffer_from_fd)(struct radeon_winsys *ws, int fd, unsigned priority, - unsigned *stride, unsigned *offset); + uint64_t *alloc_size); bool (*buffer_get_fd)(struct radeon_winsys *ws, struct radeon_winsys_bo *bo, @@ -258,8 +254,9 @@ void (*buffer_virtual_bind)(struct radeon_winsys_bo *parent, uint64_t offset, uint64_t size, struct radeon_winsys_bo *bo, uint64_t bo_offset); - struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws, - enum radeon_ctx_priority priority); + VkResult (*ctx_create)(struct radeon_winsys *ws, + enum radeon_ctx_priority priority, + struct radeon_winsys_ctx **ctx); void (*ctx_destroy)(struct radeon_winsys_ctx *ctx); bool (*ctx_wait_idle)(struct radeon_winsys_ctx *ctx, diff -Nru mesa-19.2.8/src/amd/vulkan/radv_shader_args.c mesa-20.0.8/src/amd/vulkan/radv_shader_args.c --- mesa-19.2.8/src/amd/vulkan/radv_shader_args.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_shader_args.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,762 @@ +/* + * Copyright © 2019 Valve Corporation. + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based in part on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_private.h" +#include "radv_shader.h" +#include "radv_shader_args.h" + +static void +set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, + uint8_t num_sgprs) +{ + ud_info->sgpr_idx = *sgpr_idx; + ud_info->num_sgprs = num_sgprs; + *sgpr_idx += num_sgprs; +} + +static void +set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, + uint8_t num_sgprs) +{ + struct radv_userdata_info *ud_info = + &args->shader_info->user_sgprs_locs.shader_data[idx]; + assert(ud_info); + + set_loc(ud_info, sgpr_idx, num_sgprs); +} + +static void +set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) +{ + bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; + + set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); +} + +static void +set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) +{ + struct radv_userdata_locations *locs = + &args->shader_info->user_sgprs_locs; + struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; + assert(ud_info); + + set_loc(ud_info, sgpr_idx, 1); + + locs->descriptor_sets_enabled |= 1 << idx; +} + +struct user_sgpr_info { + bool indirect_all_descriptor_sets; + uint8_t remaining_sgprs; +}; + +static bool needs_view_index_sgpr(struct radv_shader_args *args, + gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + if (args->shader_info->needs_multiview_view_index || + (!args->options->key.vs_common_out.as_es && !args->options->key.vs_common_out.as_ls && args->options->key.has_multiview_view_index)) + return true; + break; + case MESA_SHADER_TESS_EVAL: + if (args->shader_info->needs_multiview_view_index || (!args->options->key.vs_common_out.as_es && args->options->key.has_multiview_view_index)) + return true; + break; + case MESA_SHADER_GEOMETRY: + case MESA_SHADER_TESS_CTRL: + if (args->shader_info->needs_multiview_view_index) + return true; + break; + default: + break; + } + return false; +} + +static uint8_t +count_vs_user_sgprs(struct radv_shader_args *args) +{ + uint8_t count = 0; + + if (args->shader_info->vs.has_vertex_buffers) + count++; + count += args->shader_info->vs.needs_draw_id ? 3 : 2; + + return count; +} + +static void allocate_inline_push_consts(struct radv_shader_args *args, + struct user_sgpr_info *user_sgpr_info) +{ + uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; + + /* Only supported if shaders use push constants. */ + if (args->shader_info->min_push_constant_used == UINT8_MAX) + return; + + /* Only supported if shaders don't have indirect push constants. */ + if (args->shader_info->has_indirect_push_constants) + return; + + /* Only supported for 32-bit push constants. */ + if (!args->shader_info->has_only_32bit_push_constants) + return; + + uint8_t num_push_consts = + (args->shader_info->max_push_constant_used - + args->shader_info->min_push_constant_used) / 4; + + /* Check if the number of user SGPRs is large enough. */ + if (num_push_consts < remaining_sgprs) { + args->shader_info->num_inline_push_consts = num_push_consts; + } else { + args->shader_info->num_inline_push_consts = remaining_sgprs; + } + + /* Clamp to the maximum number of allowed inlined push constants. */ + if (args->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) + args->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; + + if (args->shader_info->num_inline_push_consts == num_push_consts && + !args->shader_info->loads_dynamic_offsets) { + /* Disable the default push constants path if all constants are + * inlined and if shaders don't use dynamic descriptors. + */ + args->shader_info->loads_push_constants = false; + } + + args->shader_info->base_inline_push_consts = + args->shader_info->min_push_constant_used / 4; +} + +static void allocate_user_sgprs(struct radv_shader_args *args, + gl_shader_stage stage, + bool has_previous_stage, + gl_shader_stage previous_stage, + bool needs_view_index, + struct user_sgpr_info *user_sgpr_info) +{ + uint8_t user_sgpr_count = 0; + + memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); + + /* 2 user sgprs will always be allocated for scratch/rings */ + user_sgpr_count += 2; + + switch (stage) { + case MESA_SHADER_COMPUTE: + if (args->shader_info->cs.uses_grid_size) + user_sgpr_count += 3; + break; + case MESA_SHADER_FRAGMENT: + user_sgpr_count += args->shader_info->ps.needs_sample_positions; + break; + case MESA_SHADER_VERTEX: + if (!args->is_gs_copy_shader) + user_sgpr_count += count_vs_user_sgprs(args); + break; + case MESA_SHADER_TESS_CTRL: + if (has_previous_stage) { + if (previous_stage == MESA_SHADER_VERTEX) + user_sgpr_count += count_vs_user_sgprs(args); + } + break; + case MESA_SHADER_TESS_EVAL: + break; + case MESA_SHADER_GEOMETRY: + if (has_previous_stage) { + if (previous_stage == MESA_SHADER_VERTEX) { + user_sgpr_count += count_vs_user_sgprs(args); + } + } + break; + default: + break; + } + + if (needs_view_index) + user_sgpr_count++; + + if (args->shader_info->loads_push_constants) + user_sgpr_count++; + + if (args->shader_info->so.num_outputs) + user_sgpr_count++; + + uint32_t available_sgprs = args->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; + uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; + uint32_t num_desc_set = + util_bitcount(args->shader_info->desc_set_used_mask); + + if (remaining_sgprs < num_desc_set) { + user_sgpr_info->indirect_all_descriptor_sets = true; + user_sgpr_info->remaining_sgprs = remaining_sgprs - 1; + } else { + user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; + } + + allocate_inline_push_consts(args, user_sgpr_info); +} + +static void +declare_global_input_sgprs(struct radv_shader_args *args, + const struct user_sgpr_info *user_sgpr_info) +{ + /* 1 for each descriptor set */ + if (!user_sgpr_info->indirect_all_descriptor_sets) { + uint32_t mask = args->shader_info->desc_set_used_mask; + + while (mask) { + int i = u_bit_scan(&mask); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, + &args->descriptor_sets[i]); + } + } else { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR_PTR, + &args->descriptor_sets[0]); + } + + if (args->shader_info->loads_push_constants) { + /* 1 for push constants and dynamic descriptors */ + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, + &args->ac.push_constants); + } + + for (unsigned i = 0; i < args->shader_info->num_inline_push_consts; i++) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.inline_push_consts[i]); + } + args->ac.num_inline_push_consts = args->shader_info->num_inline_push_consts; + args->ac.base_inline_push_consts = args->shader_info->base_inline_push_consts; + + if (args->shader_info->so.num_outputs) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &args->streamout_buffers); + } +} + +static void +declare_vs_specific_input_sgprs(struct radv_shader_args *args, + gl_shader_stage stage, + bool has_previous_stage, + gl_shader_stage previous_stage) +{ + if (!args->is_gs_copy_shader && + (stage == MESA_SHADER_VERTEX || + (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { + if (args->shader_info->vs.has_vertex_buffers) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &args->vertex_buffers); + } + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); + if (args->shader_info->vs.needs_draw_id) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); + } + } +} + +static void +declare_vs_input_vgprs(struct radv_shader_args *args) +{ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id); + if (!args->is_gs_copy_shader) { + if (args->options->key.vs_common_out.as_ls) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->rel_auto_id); + if (args->options->chip_class >= GFX10) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + } else { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + } else { + if (args->options->chip_class >= GFX10) { + if (args->options->key.vs_common_out.as_ngg) { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + } else { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->vs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + } + } else { + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->vs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + } + } +} + +static void +declare_streamout_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +{ + int i; + + if (args->options->use_ngg_streamout) { + if (stage == MESA_SHADER_TESS_EVAL) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return; + } + + /* Streamout SGPRs. */ + if (args->shader_info->so.num_outputs) { + assert(stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_config); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_write_idx); + } else if (stage == MESA_SHADER_TESS_EVAL) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + } + + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (i = 0; i < 4; i++) { + if (!args->shader_info->so.strides[i]) + continue; + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->streamout_offset[i]); + } +} + +static void +declare_tes_input_vgprs(struct radv_shader_args *args) +{ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->tes_u); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->tes_v); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->tes_rel_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id); +} + +static void +set_global_input_locs(struct radv_shader_args *args, + const struct user_sgpr_info *user_sgpr_info, + uint8_t *user_sgpr_idx) +{ + uint32_t mask = args->shader_info->desc_set_used_mask; + + if (!user_sgpr_info->indirect_all_descriptor_sets) { + while (mask) { + int i = u_bit_scan(&mask); + + set_loc_desc(args, i, user_sgpr_idx); + } + } else { + set_loc_shader_ptr(args, AC_UD_INDIRECT_DESCRIPTOR_SETS, + user_sgpr_idx); + + args->shader_info->need_indirect_descriptor_sets = true; + } + + if (args->shader_info->loads_push_constants) { + set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); + } + + if (args->shader_info->num_inline_push_consts) { + set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, + args->shader_info->num_inline_push_consts); + } + + if (args->streamout_buffers.used) { + set_loc_shader_ptr(args, AC_UD_STREAMOUT_BUFFERS, + user_sgpr_idx); + } +} + +static void +set_vs_specific_input_locs(struct radv_shader_args *args, + gl_shader_stage stage, bool has_previous_stage, + gl_shader_stage previous_stage, + uint8_t *user_sgpr_idx) +{ + if (!args->is_gs_copy_shader && + (stage == MESA_SHADER_VERTEX || + (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { + if (args->shader_info->vs.has_vertex_buffers) { + set_loc_shader_ptr(args, AC_UD_VS_VERTEX_BUFFERS, + user_sgpr_idx); + } + + unsigned vs_num = 2; + if (args->shader_info->vs.needs_draw_id) + vs_num++; + + set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, + user_sgpr_idx, vs_num); + } +} + +/* Returns whether the stage is a stage that can be directly before the GS */ +static bool is_pre_gs_stage(gl_shader_stage stage) +{ + return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL; +} + +void +radv_declare_shader_args(struct radv_shader_args *args, + gl_shader_stage stage, + bool has_previous_stage, + gl_shader_stage previous_stage) +{ + struct user_sgpr_info user_sgpr_info; + bool needs_view_index = needs_view_index_sgpr(args, stage); + + if (args->options->chip_class >= GFX10) { + if (is_pre_gs_stage(stage) && args->options->key.vs_common_out.as_ngg) { + /* On GFX10, VS is merged into GS for NGG. */ + previous_stage = stage; + stage = MESA_SHADER_GEOMETRY; + has_previous_stage = true; + } + } + + for (int i = 0; i < MAX_SETS; i++) + args->shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; + for (int i = 0; i < AC_UD_MAX_UD; i++) + args->shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; + + + allocate_user_sgprs(args, stage, has_previous_stage, + previous_stage, needs_view_index, &user_sgpr_info); + + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, + &args->ring_offsets); + } + + switch (stage) { + case MESA_SHADER_COMPUTE: + declare_global_input_sgprs(args, &user_sgpr_info); + + if (args->shader_info->cs.uses_grid_size) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, + &args->ac.num_work_groups); + } + + for (int i = 0; i < 3; i++) { + if (args->shader_info->cs.uses_block_id[i]) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.workgroup_ids[i]); + } + } + + if (args->shader_info->cs.uses_local_invocation_idx) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.tg_size); + } + + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + + ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, + &args->ac.local_invocation_ids); + break; + case MESA_SHADER_VERTEX: + declare_global_input_sgprs(args, &user_sgpr_info); + + declare_vs_specific_input_sgprs(args, stage, has_previous_stage, + previous_stage); + + if (needs_view_index) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + } + + if (args->options->key.vs_common_out.as_es) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->es2gs_offset); + } else if (args->options->key.vs_common_out.as_ls) { + /* no extra parameters */ + } else { + declare_streamout_sgprs(args, stage); + } + + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + + declare_vs_input_vgprs(args); + break; + case MESA_SHADER_TESS_CTRL: + if (has_previous_stage) { + // First 6 system regs + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->merged_wave_info); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->tess_factor_offset); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->scratch_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown + + declare_global_input_sgprs(args, &user_sgpr_info); + + declare_vs_specific_input_sgprs(args, stage, + has_previous_stage, + previous_stage); + + if (needs_view_index) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + } + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.tcs_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.tcs_rel_ids); + + declare_vs_input_vgprs(args); + } else { + declare_global_input_sgprs(args, &user_sgpr_info); + + if (needs_view_index) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + } + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->tess_factor_offset); + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.tcs_patch_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.tcs_rel_ids); + } + break; + case MESA_SHADER_TESS_EVAL: + declare_global_input_sgprs(args, &user_sgpr_info); + + if (needs_view_index) + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + + if (args->options->key.vs_common_out.as_es) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->es2gs_offset); + } else { + declare_streamout_sgprs(args, stage); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds); + } + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + declare_tes_input_vgprs(args); + break; + case MESA_SHADER_GEOMETRY: + if (has_previous_stage) { + // First 6 system regs + if (args->options->key.vs_common_out.as_ngg) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->gs_tg_info); + } else { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->gs2vs_offset); + } + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->merged_wave_info); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->oc_lds); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->scratch_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown + + declare_global_input_sgprs(args, &user_sgpr_info); + + if (previous_stage != MESA_SHADER_TESS_EVAL) { + declare_vs_specific_input_sgprs(args, stage, + has_previous_stage, + previous_stage); + } + + if (needs_view_index) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + } + + if (args->options->key.vs_common_out.as_ngg) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ngg_gs_state); + } + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.gs_invocation_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[4]); + + if (previous_stage == MESA_SHADER_VERTEX) { + declare_vs_input_vgprs(args); + } else { + declare_tes_input_vgprs(args); + } + } else { + declare_global_input_sgprs(args, &user_sgpr_info); + + if (needs_view_index) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->ac.view_index); + } + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs2vs_offset); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_wave_id); + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.gs_prim_id); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[3]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[4]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->gs_vtx_offset[5]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, + &args->ac.gs_invocation_id); + } + break; + case MESA_SHADER_FRAGMENT: + declare_global_input_sgprs(args, &user_sgpr_info); + + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask); + if (args->options->explicit_scratch_args) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, + &args->scratch_offset); + } + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid); + ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.pull_model); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); /* line stipple tex */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.sample_coverage); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* fixed pt */ + break; + default: + unreachable("Shader stage not implemented"); + } + + args->shader_info->num_input_vgprs = 0; + args->shader_info->num_input_sgprs = 2; + args->shader_info->num_input_sgprs += args->ac.num_sgprs_used; + args->shader_info->num_input_vgprs = args->ac.num_vgprs_used; + + uint8_t user_sgpr_idx = 0; + + set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, + &user_sgpr_idx); + + /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including + * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ + if (has_previous_stage) + user_sgpr_idx = 0; + + set_global_input_locs(args, &user_sgpr_info, &user_sgpr_idx); + + switch (stage) { + case MESA_SHADER_COMPUTE: + if (args->shader_info->cs.uses_grid_size) { + set_loc_shader(args, AC_UD_CS_GRID_SIZE, + &user_sgpr_idx, 3); + } + break; + case MESA_SHADER_VERTEX: + set_vs_specific_input_locs(args, stage, has_previous_stage, + previous_stage, &user_sgpr_idx); + if (args->ac.view_index.used) + set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + break; + case MESA_SHADER_TESS_CTRL: + set_vs_specific_input_locs(args, stage, has_previous_stage, + previous_stage, &user_sgpr_idx); + if (args->ac.view_index.used) + set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + break; + case MESA_SHADER_TESS_EVAL: + if (args->ac.view_index.used) + set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + break; + case MESA_SHADER_GEOMETRY: + if (has_previous_stage) { + if (previous_stage == MESA_SHADER_VERTEX) + set_vs_specific_input_locs(args, stage, + has_previous_stage, + previous_stage, + &user_sgpr_idx); + } + if (args->ac.view_index.used) + set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + + if (args->ngg_gs_state.used) + set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1); + break; + case MESA_SHADER_FRAGMENT: + break; + default: + unreachable("Shader stage not implemented"); + } + + args->shader_info->num_user_sgprs = user_sgpr_idx; +} + diff -Nru mesa-19.2.8/src/amd/vulkan/radv_shader_args.h mesa-20.0.8/src/amd/vulkan/radv_shader_args.h --- mesa-19.2.8/src/amd/vulkan/radv_shader_args.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_shader_args.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,85 @@ +/* + * Copyright © 2019 Valve Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ac_shader_args.h" +#include "radv_constants.h" +#include "util/list.h" +#include "compiler/shader_enums.h" +#include "amd_family.h" + +struct radv_shader_args { + struct ac_shader_args ac; + struct radv_shader_info *shader_info; + const struct radv_nir_compiler_options *options; + + struct ac_arg descriptor_sets[MAX_SETS]; + struct ac_arg ring_offsets; + struct ac_arg scratch_offset; + + struct ac_arg vertex_buffers; + struct ac_arg rel_auto_id; + struct ac_arg vs_prim_id; + struct ac_arg es2gs_offset; + + struct ac_arg oc_lds; + struct ac_arg merged_wave_info; + struct ac_arg tess_factor_offset; + struct ac_arg tes_rel_patch_id; + struct ac_arg tes_u; + struct ac_arg tes_v; + + /* HW GS */ + /* On gfx10: + * - bits 0..11: ordered_wave_id + * - bits 12..20: number of vertices in group + * - bits 22..30: number of primitives in group + */ + struct ac_arg gs_tg_info; + struct ac_arg gs2vs_offset; + struct ac_arg gs_wave_id; + struct ac_arg gs_vtx_offset[6]; + + /* Streamout */ + struct ac_arg streamout_buffers; + struct ac_arg streamout_write_idx; + struct ac_arg streamout_config; + struct ac_arg streamout_offset[4]; + + /* NGG GS */ + struct ac_arg ngg_gs_state; + + bool is_gs_copy_shader; +}; + +static inline struct radv_shader_args * +radv_shader_args_from_ac(struct ac_shader_args *args) +{ + struct radv_shader_args *radv_args = NULL; + return (struct radv_shader_args *) container_of(args, radv_args, ac); +} + +void radv_declare_shader_args(struct radv_shader_args *args, + gl_shader_stage stage, + bool has_previous_stage, + gl_shader_stage previous_stage); + diff -Nru mesa-19.2.8/src/amd/vulkan/radv_shader.c mesa-20.0.8/src/amd/vulkan/radv_shader.c --- mesa-19.2.8/src/amd/vulkan/radv_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_shader.c 2020-06-12 01:21:16.000000000 +0000 @@ -31,6 +31,7 @@ #include "radv_private.h" #include "radv_shader.h" #include "radv_shader_helper.h" +#include "radv_shader_args.h" #include "nir/nir.h" #include "nir/nir_builder.h" #include "spirv/nir_spirv.h" @@ -48,9 +49,11 @@ #include "util/debug.h" #include "ac_exp_param.h" +#include "aco_interface.h" + #include "util/string_buffer.h" -static const struct nir_shader_compiler_options nir_options = { +static const struct nir_shader_compiler_options nir_options_llvm = { .vertex_id_zero_based = true, .lower_scmp = true, .lower_flrp16 = true, @@ -59,6 +62,7 @@ .lower_device_index_to_zero = true, .lower_fsat = true, .lower_fdiv = true, + .lower_fmod = true, .lower_bitfield_insert_to_bitfield_select = true, .lower_bitfield_extract = true, .lower_sub = true, @@ -78,6 +82,51 @@ .lower_rotate = true, .max_unroll_iterations = 32, .use_interpolated_input_intrinsics = true, + /* nir_lower_int64() isn't actually called for the LLVM backend, but + * this helps the loop unrolling heuristics. */ + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_minmax64 | + nir_lower_iabs64, +}; + +static const struct nir_shader_compiler_options nir_options_aco = { + .vertex_id_zero_based = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, + .lower_fdiv = true, + .lower_fmod = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_ffma = true, + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + .lower_int64_options = nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_logic64 | + nir_lower_minmax64 | + nir_lower_iabs64, }; bool @@ -87,9 +136,11 @@ { if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) return false; + if (module) + return !module->nir || + (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS); - /* Only dump non-meta shaders, useful for debugging purposes. */ - return (module && !module->nir) || is_gs_copy_shader; + return is_gs_copy_shader; } bool @@ -101,29 +152,6 @@ module && !module->nir; } -unsigned shader_io_get_unique_index(gl_varying_slot slot) -{ - /* handle patch indices separate */ - if (slot == VARYING_SLOT_TESS_LEVEL_OUTER) - return 0; - if (slot == VARYING_SLOT_TESS_LEVEL_INNER) - return 1; - if (slot >= VARYING_SLOT_PATCH0 && slot <= VARYING_SLOT_TESS_MAX) - return 2 + (slot - VARYING_SLOT_PATCH0); - if (slot == VARYING_SLOT_POS) - return 0; - if (slot == VARYING_SLOT_PSIZ) - return 1; - if (slot == VARYING_SLOT_CLIP_DIST0) - return 2; - if (slot == VARYING_SLOT_CLIP_DIST1) - return 3; - /* 3 is reserved for clip dist as well */ - if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31) - return 4 + (slot - VARYING_SLOT_VAR0); - unreachable("illegal slot in get unique index\n"); -} - VkResult radv_CreateShaderModule( VkDevice _device, const VkShaderModuleCreateInfo* pCreateInfo, @@ -198,9 +226,9 @@ NIR_PASS(progress, shader, nir_opt_copy_prop_vars); NIR_PASS(progress, shader, nir_opt_dead_write_vars); NIR_PASS(progress, shader, nir_remove_dead_variables, - nir_var_function_temp); + nir_var_function_temp | nir_var_shader_in | nir_var_shader_out); - NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL); + NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(shader, nir_lower_phis_to_scalar); NIR_PASS(progress, shader, nir_copy_prop); @@ -250,6 +278,17 @@ NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo); } +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; +} + nir_shader * radv_shader_compile_to_nir(struct radv_device *device, struct radv_shader_module *module, @@ -257,15 +296,19 @@ gl_shader_stage stage, const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags, - const struct radv_pipeline_layout *layout) + const struct radv_pipeline_layout *layout, + bool use_aco, + unsigned subgroup_size, unsigned ballot_bit_size) { nir_shader *nir; + const nir_shader_compiler_options *nir_options = use_aco ? &nir_options_aco : + &nir_options_llvm; if (module->nir) { /* Some things such as our meta clear/blit code will give us a NIR * shader directly. In that case, we just ignore the SPIR-V entirely * and just use the NIR shader */ nir = module->nir; - nir->options = &nir_options; + nir->options = nir_options; nir_validate_shader(nir, "in internal shader"); assert(exec_list_length(&nir->functions) == 1); @@ -274,7 +317,7 @@ assert(module->size % 4 == 0); if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV) - radv_print_spirv(spirv, module->size, stderr); + radv_print_spirv(module->data, module->size, stderr); uint32_t num_spec_entries = 0; struct nir_spirv_specialization *spec_entries = NULL; @@ -287,41 +330,61 @@ assert(data + entry.size <= spec_info->pData + spec_info->dataSize); spec_entries[i].id = spec_info->pMapEntries[i].constantID; - if (spec_info->dataSize == 8) + switch (entry.size) { + case 8: spec_entries[i].data64 = *(const uint64_t *)data; - else + break; + case 4: spec_entries[i].data32 = *(const uint32_t *)data; + break; + case 2: + spec_entries[i].data32 = *(const uint16_t *)data; + break; + case 1: + spec_entries[i].data32 = *(const uint8_t *)data; + break; + default: + assert(!"Invalid spec constant size"); + break; + } } } const struct spirv_to_nir_options spirv_options = { .lower_ubo_ssbo_access_to_offsets = true, .caps = { + .amd_fragment_mask = true, .amd_gcn_shader = true, + .amd_image_read_write_lod = true, .amd_shader_ballot = device->physical_device->use_shader_ballot, + .amd_shader_explicit_vertex_parameter = true, .amd_trinary_minmax = true, + .demote_to_helper_invocation = device->physical_device->use_aco, .derivative_group = true, .descriptor_array_dynamic_indexing = true, .descriptor_array_non_uniform_indexing = true, .descriptor_indexing = true, .device_group = true, .draw_parameters = true, - .float16 = true, + .float_controls = true, + .float16 = !device->physical_device->use_aco, .float64 = true, .geometry_streams = true, + .image_ms_array = true, .image_read_without_format = true, .image_write_without_format = true, - .int8 = true, - .int16 = true, + .int8 = !device->physical_device->use_aco, + .int16 = !device->physical_device->use_aco, .int64 = true, .int64_atomics = true, .multiview = true, .physical_storage_buffer_address = true, .post_depth_coverage = true, .runtime_descriptor_array = true, + .shader_clock = true, .shader_viewport_index_layer = true, .stencil_export = true, - .storage_8bit = true, - .storage_16bit = true, + .storage_8bit = !device->physical_device->use_aco, + .storage_16bit = !device->physical_device->use_aco, .storage_image_ms = true, .subgroup_arithmetic = true, .subgroup_ballot = true, @@ -343,7 +406,7 @@ nir = spirv_to_nir(spirv, module->size / 4, spec_entries, num_spec_entries, stage, entrypoint_name, - &spirv_options, &nir_options); + &spirv_options, nir_options); assert(nir->info.stage == stage); nir_validate_shader(nir, "after spirv_to_nir"); @@ -383,11 +446,13 @@ NIR_PASS_V(nir, nir_split_var_copies); NIR_PASS_V(nir, nir_split_per_member_structs); + if (nir->info.stage == MESA_SHADER_FRAGMENT && use_aco) + NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); if (nir->info.stage == MESA_SHADER_FRAGMENT) NIR_PASS_V(nir, nir_lower_input_attachments, true); NIR_PASS_V(nir, nir_remove_dead_variables, - nir_var_shader_in | nir_var_shader_out | nir_var_system_value); + nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared); NIR_PASS_V(nir, nir_propagate_invariant); @@ -401,6 +466,9 @@ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + if (nir->info.stage == MESA_SHADER_GEOMETRY && use_aco) + nir_lower_gs_intrinsics(nir, true); + static const nir_lower_tex_options tex_options = { .lower_txp = ~0, .lower_tg4_offsets = true, @@ -424,14 +492,17 @@ nir_lower_global_vars_to_local(nir); nir_remove_dead_variables(nir, nir_var_function_temp); + bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7; nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) { - .subgroup_size = 64, - .ballot_bit_size = 64, + .subgroup_size = subgroup_size, + .ballot_bit_size = ballot_bit_size, .lower_to_scalar = 1, .lower_subgroup_masks = 1, .lower_shuffle = 1, .lower_shuffle_to_32bit = 1, .lower_vote_eq_to_ballot = 1, + .lower_quad_broadcast_dynamic = 1, + .lower_quad_broadcast_dynamic_to_const = gfx7minus, }); nir_lower_load_const_to_scalar(nir); @@ -444,6 +515,21 @@ */ nir_lower_var_copies(nir); + /* Lower deref operations for compute shared memory. */ + if (nir->info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_var_info); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + } + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + */ + NIR_PASS_V(nir, nir_opt_large_constants, + glsl_get_natural_size_align_bytes, 16); + /* Indirect lowering must be called after the radv_optimize_nir() loop * has been called at least once. Otherwise indirect lowering can * bloat the instruction count of the loop and cause it to be @@ -455,53 +541,6 @@ return nir; } -static void mark_16bit_fs_input(struct radv_shader_variant_info *shader_info, - const struct glsl_type *type, - int location) -{ - if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { - unsigned attrib_count = glsl_count_attribute_slots(type, false); - if (glsl_type_is_16bit(type)) { - shader_info->fs.float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; - } - } else if (glsl_type_is_array(type)) { - unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); - for (unsigned i = 0; i < glsl_get_length(type); ++i) { - mark_16bit_fs_input(shader_info, glsl_get_array_element(type), location + i * stride); - } - } else { - assert(glsl_type_is_struct_or_ifc(type)); - for (unsigned i = 0; i < glsl_get_length(type); i++) { - mark_16bit_fs_input(shader_info, glsl_get_struct_field(type, i), location); - location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); - } - } -} - -static void -handle_fs_input_decl(struct radv_shader_variant_info *shader_info, - struct nir_variable *variable) -{ - unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - - if (variable->data.compact) { - unsigned component_count = variable->data.location_frac + - glsl_get_length(variable->type); - attrib_count = (component_count + 3) / 4; - } else { - mark_16bit_fs_input(shader_info, variable->type, - variable->data.driver_location); - } - - uint64_t mask = ((1ull << attrib_count) - 1); - - if (variable->data.interpolation == INTERP_MODE_FLAT) - shader_info->fs.flat_shaded_mask |= mask << variable->data.driver_location; - - if (variable->data.location >= VARYING_SLOT_VAR0) - shader_info->fs.input_mask |= mask << (variable->data.location - VARYING_SLOT_VAR0); -} - static int type_size_vec4(const struct glsl_type *type, bool bindless) { @@ -569,28 +608,13 @@ return progress; } -/* Gather information needed to setup the vs<->ps linking registers in - * radv_pipeline_generate_ps_inputs(). - */ - -static void -handle_fs_inputs(nir_shader *nir, struct radv_shader_variant_info *shader_info) -{ - shader_info->fs.num_interp = nir->num_inputs; - - nir_foreach_variable(variable, &nir->inputs) - handle_fs_input_decl(shader_info, variable); -} - -static void -lower_fs_io(nir_shader *nir, struct radv_shader_variant_info *shader_info) +void +radv_lower_fs_io(nir_shader *nir) { NIR_PASS_V(nir, lower_view_index); nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs, MESA_SHADER_FRAGMENT); - handle_fs_inputs(nir, shader_info); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); /* This pass needs actual constants */ @@ -633,7 +657,7 @@ slab->bo = device->ws->buffer_create(device->ws, slab->size, 256, RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING | - (device->physical_device->cpdma_prefetch_writes_memory ? + (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0 : RADEON_FLAG_READ_ONLY), RADV_BO_PRIORITY_SHADER); slab->ptr = (char*)device->ws->buffer_map(slab->bo); @@ -671,7 +695,7 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, const struct ac_shader_config *config_in, - const struct radv_shader_variant_info *info, + const struct radv_shader_info *info, gl_shader_stage stage, struct ac_shader_config *config_out) { @@ -680,73 +704,36 @@ unsigned num_input_vgprs = info->num_input_vgprs; if (stage == MESA_SHADER_FRAGMENT) { - num_input_vgprs = 0; - if (G_0286CC_PERSP_SAMPLE_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTER_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTROID_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_PULL_MODEL_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 3; - if (G_0286CC_LINEAR_SAMPLE_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTER_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTROID_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINE_STIPPLE_TEX_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_X_FLOAT_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_Y_FLOAT_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_Z_FLOAT_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_W_FLOAT_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_FRONT_FACE_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_ANCILLARY_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_SAMPLE_COVERAGE_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_FIXED_PT_ENA(config_in->spi_ps_input_addr)) - num_input_vgprs += 1; + num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL); } unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs); /* +3 for scratch wave offset and VCC */ unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3); + unsigned num_shared_vgprs = config_in->num_shared_vgprs; + /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */ + assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) + || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0)); + unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8; *config_out = *config_in; config_out->num_vgprs = num_vgprs; config_out->num_sgprs = num_sgprs; - - /* Enable 64-bit and 16-bit denormals, because there is no performance - * cost. - * - * If denormals are enabled, all floating-point output modifiers are - * ignored. - * - * Don't enable denormals for 32-bit floats, because: - * - Floating-point output modifiers would be ignored by the hw. - * - Some opcodes don't support denormals, such as v_mad_f32. We would - * have to stop using those. - * - GFX6 & GFX7 would be very slow. - */ - config_out->float_mode |= V_00B028_FP_64_DENORMS; + config_out->num_shared_vgprs = num_shared_vgprs; config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) | - S_00B12C_SCRATCH_EN(scratch_enabled) | - S_00B12C_SO_BASE0_EN(!!info->info.so.strides[0]) | - S_00B12C_SO_BASE1_EN(!!info->info.so.strides[1]) | - S_00B12C_SO_BASE2_EN(!!info->info.so.strides[2]) | - S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) | - S_00B12C_SO_EN(!!info->info.so.num_outputs); + S_00B12C_SCRATCH_EN(scratch_enabled); + + if (!pdevice->use_ngg_streamout) { + config_out->rsrc2 |= S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | + S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) | + S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | + S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) | + S_00B12C_SO_EN(!!info->so.num_outputs); + } config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / - (info->info.wave_size == 32 ? 8 : 4)) | + (info->wave_size == 32 ? 8 : 4)) | S_00B848_DX10_CLAMP(1) | S_00B848_FLOAT_MODE(config_out->float_mode); @@ -764,16 +751,17 @@ config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1); } else if (info->tes.as_es) { assert(pdevice->rad_info.chip_class <= GFX8); - vgpr_comp_cnt = info->info.uses_prim_id ? 3 : 2; + vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); } else { - bool enable_prim_id = info->tes.export_prim_id || info->info.uses_prim_id; + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; vgpr_comp_cnt = enable_prim_id ? 3 : 2; config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); } + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_TESS_CTRL: if (pdevice->rad_info.chip_class >= GFX9) { @@ -782,15 +770,16 @@ * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. */ if (pdevice->rad_info.chip_class >= GFX10) { - vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 1; + vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1; } else { - vgpr_comp_cnt = info->info.vs.needs_instance_id ? 2 : 1; + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; } } else { config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1); } config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_VERTEX: if (info->is_ngg) { @@ -801,47 +790,52 @@ * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID). * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. */ - vgpr_comp_cnt = info->info.vs.needs_instance_id ? 2 : 1; + vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1; } else if (info->vs.as_es) { assert(pdevice->rad_info.chip_class <= GFX8); /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ - vgpr_comp_cnt = info->info.vs.needs_instance_id ? 1 : 0; + vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0; } else { /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID) * If PrimID is disabled. InstanceID / StepRate1 is loaded instead. * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. */ - if (info->info.vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { + if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { vgpr_comp_cnt = 3; } else if (info->vs.export_prim_id) { vgpr_comp_cnt = 2; - } else if (info->info.vs.needs_instance_id) { + } else if (info->vs.needs_instance_id) { vgpr_comp_cnt = 1; } else { vgpr_comp_cnt = 0; } config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); } break; case MESA_SHADER_FRAGMENT: config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_GEOMETRY: config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); + config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks); break; case MESA_SHADER_COMPUTE: config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10); config_out->rsrc2 |= - S_00B84C_TGID_X_EN(info->info.cs.uses_block_id[0]) | - S_00B84C_TGID_Y_EN(info->info.cs.uses_block_id[1]) | - S_00B84C_TGID_Z_EN(info->info.cs.uses_block_id[2]) | - S_00B84C_TIDIG_COMP_CNT(info->info.cs.uses_thread_id[2] ? 2 : - info->info.cs.uses_thread_id[1] ? 1 : 0) | - S_00B84C_TG_SIZE_EN(info->info.cs.uses_local_invocation_idx) | + S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) | + S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) | + S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) | + S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 2 : + info->cs.uses_thread_id[1] ? 1 : 0) | + S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) | S_00B84C_LDS_SIZE(config_in->lds_size); + config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks); + break; default: unreachable("unsupported shader type"); @@ -857,18 +851,18 @@ /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */ if (es_stage == MESA_SHADER_VERTEX) { - es_vgpr_comp_cnt = info->info.vs.needs_instance_id ? 3 : 0; + es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0; } else if (es_stage == MESA_SHADER_TESS_EVAL) { - bool enable_prim_id = info->tes.export_prim_id || info->info.uses_prim_id; + bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; es_vgpr_comp_cnt = enable_prim_id ? 3 : 2; } else unreachable("Unexpected ES shader stage"); bool tes_triangles = stage == MESA_SHADER_TESS_EVAL && info->tes.primitive_mode >= 4; /* GL_TRIANGLES */ - if (info->info.uses_invocation_id || stage == MESA_SHADER_VERTEX) { + if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) { gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - } else if (info->info.uses_prim_id) { + } else if (info->uses_prim_id) { gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ } else if (info->gs.vertices_in >= 3 || tes_triangles) { gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ @@ -888,13 +882,13 @@ if (es_type == MESA_SHADER_VERTEX) { /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ - if (info->info.vs.needs_instance_id) { + if (info->vs.needs_instance_id) { es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1; } else { es_vgpr_comp_cnt = 0; } } else if (es_type == MESA_SHADER_TESS_EVAL) { - es_vgpr_comp_cnt = info->info.uses_prim_id ? 3 : 2; + es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2; } else { unreachable("invalid shader ES type"); } @@ -902,9 +896,9 @@ /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and * VGPR[0:4] are always loaded. */ - if (info->info.uses_invocation_id) { + if (info->uses_invocation_id) { gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - } else if (info->info.uses_prim_id) { + } else if (info->uses_prim_id) { gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ } else if (info->gs.vertices_in >= 3) { gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ @@ -923,45 +917,6 @@ } } -static void radv_init_llvm_target() -{ - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); - - /* For inline assembly. */ - LLVMInitializeAMDGPUAsmParser(); - - /* Workaround for bug in llvm 4.0 that causes image intrinsics - * to disappear. - * https://reviews.llvm.org/D26348 - * - * Workaround for bug in llvm that causes the GPU to hang in presence - * of nested loops because there is an exec mask issue. The proper - * solution is to fix LLVM but this might require a bunch of work. - * https://bugs.llvm.org/show_bug.cgi?id=37744 - * - * "mesa" is the prefix for error messages. - */ - if (HAVE_LLVM >= 0x0800) { - const char *argv[2] = { "mesa", "-simplifycfg-sink-common=false" }; - LLVMParseCommandLineOptions(2, argv, NULL); - - } else { - const char *argv[3] = { "mesa", "-simplifycfg-sink-common=false", - "-amdgpu-skip-threshold=1" }; - LLVMParseCommandLineOptions(3, argv, NULL); - } -} - -static once_flag radv_init_llvm_target_once_flag = ONCE_FLAG_INIT; - -static void radv_init_llvm_once(void) -{ - call_once(&radv_init_llvm_target_once_flag, radv_init_llvm_target); -} - struct radv_shader_variant * radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary, @@ -976,50 +931,35 @@ variant->ref_count = 1; if (binary->type == RADV_BINARY_TYPE_RTLD) { - struct ac_rtld_symbol lds_symbols[1]; + struct ac_rtld_symbol lds_symbols[2]; unsigned num_lds_symbols = 0; const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data; size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size; - unsigned esgs_ring_size = 0; if (device->physical_device->rad_info.chip_class >= GFX9 && - binary->stage == MESA_SHADER_GEOMETRY && !binary->is_gs_copy_shader) { - /* TODO: Do not hardcode this value */ - esgs_ring_size = 32 * 1024; - } - - if (binary->variant_info.is_ngg) { - /* GS stores Primitive IDs into LDS at the address - * corresponding to the ES thread of the provoking - * vertex. All ES threads load and export PrimitiveID - * for their thread. - */ - if (binary->stage == MESA_SHADER_VERTEX && - binary->variant_info.vs.export_prim_id) { - /* TODO: Do not harcode this value */ - esgs_ring_size = 256 /* max_out_verts */ * 4; - } - } - - if (esgs_ring_size) { + (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) && + !binary->is_gs_copy_shader) { /* We add this symbol even on LLVM <= 8 to ensure that * shader->config.lds_size is set correctly below. */ struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; - sym->size = esgs_ring_size; + sym->size = binary->info.ngg_info.esgs_ring_size; sym->align = 64 * 1024; + } - /* Make sure to have LDS space for NGG scratch. */ - /* TODO: Compute this correctly somehow? */ - if (binary->variant_info.is_ngg) - sym->size -= 32; + if (binary->info.is_ngg && + binary->stage == MESA_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = binary->info.ngg_info.ngg_emit_size * 4; + sym->align = 4; } struct ac_rtld_open_info open_info = { .info = &device->physical_device->rad_info, .shader_type = binary->stage, - .wave_size = binary->variant_info.info.wave_size, + .wave_size = binary->info.wave_size, .num_parts = 1, .elf_ptrs = &elf_data, .elf_sizes = &elf_size, @@ -1038,22 +978,45 @@ return NULL; } + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * If denormals are enabled, all floating-point output modifiers are + * ignored. + * + * Don't enable denormals for 32-bit floats, because: + * - Floating-point output modifiers would be ignored by the hw. + * - Some opcodes don't support denormals, such as v_mad_f32. We would + * have to stop using those. + * - GFX6 & GFX7 would be very slow. + */ + config.float_mode |= V_00B028_FP_64_DENORMS; + if (rtld_binary.lds_size > 0) { unsigned alloc_granularity = device->physical_device->rad_info.chip_class >= GFX7 ? 512 : 256; config.lds_size = align(rtld_binary.lds_size, alloc_granularity) / alloc_granularity; } variant->code_size = rtld_binary.rx_size; + variant->exec_size = rtld_binary.exec_size; } else { assert(binary->type == RADV_BINARY_TYPE_LEGACY); config = ((struct radv_shader_binary_legacy *)binary)->config; - variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); + variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); + variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size; } - variant->info = binary->variant_info; - radv_postprocess_config(device->physical_device, &config, &binary->variant_info, + variant->info = binary->info; + radv_postprocess_config(device->physical_device, &config, &binary->info, binary->stage, &variant->config); - + + if (radv_device_use_secure_compile(device->instance)) { + if (binary->type == RADV_BINARY_TYPE_RTLD) + ac_rtld_close(&rtld_binary); + + return variant; + } + void *dest_ptr = radv_alloc_shader_memory(device, variant); if (binary->type == RADV_BINARY_TYPE_RTLD) { @@ -1080,7 +1043,7 @@ return NULL; } - variant->llvm_ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL; + variant->ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->elf_size)) : NULL; variant->disasm_string = malloc(disasm_size + 1); memcpy(variant->disasm_string, disasm_data, disasm_size); variant->disasm_string[disasm_size] = 0; @@ -1096,8 +1059,8 @@ for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++) ptr32[i] = DEBUGGER_END_OF_CODE_MARKER; - variant->llvm_ir_string = bin->llvm_ir_size ? strdup((const char*)(bin->data + bin->code_size)) : NULL; - variant->disasm_string = bin->disasm_size ? strdup((const char*)(bin->data + bin->code_size + bin->llvm_ir_size)) : NULL; + variant->ir_string = bin->ir_size ? strdup((const char*)(bin->data + bin->code_size)) : NULL; + variant->disasm_string = bin->disasm_size ? strdup((const char*)(bin->data + bin->code_size + bin->ir_size)) : NULL; } return variant; } @@ -1131,67 +1094,76 @@ struct nir_shader * const *shaders, int shader_count, gl_shader_stage stage, + struct radv_shader_info *info, struct radv_nir_compiler_options *options, bool gs_copy_shader, bool keep_shader_info, + bool use_aco, struct radv_shader_binary **binary_out) { enum radeon_family chip_family = device->physical_device->rad_info.family; - enum ac_target_machine_options tm_options = 0; - struct ac_llvm_compiler ac_llvm; struct radv_shader_binary *binary = NULL; - struct radv_shader_variant_info variant_info = {0}; - bool thread_compiler; - - if (shaders[0]->info.stage == MESA_SHADER_FRAGMENT) - lower_fs_io(shaders[0], &variant_info); options->family = chip_family; options->chip_class = device->physical_device->rad_info.chip_class; options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader); options->dump_preoptir = options->dump_shader && device->instance->debug_flags & RADV_DEBUG_PREOPTIR; - options->record_llvm_ir = keep_shader_info; + options->record_ir = keep_shader_info; options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; options->address32_hi = device->physical_device->rad_info.address32_hi; + options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug; + options->use_ngg_streamout = device->physical_device->use_ngg_streamout; - if ((stage == MESA_SHADER_GEOMETRY && !options->key.vs_common_out.as_ngg) || - gs_copy_shader) - options->wave_size = 64; - else if (stage == MESA_SHADER_COMPUTE) - options->wave_size = device->physical_device->cs_wave_size; - else if (stage == MESA_SHADER_FRAGMENT) - options->wave_size = device->physical_device->ps_wave_size; - else - options->wave_size = device->physical_device->ge_wave_size; + struct radv_shader_args args = {}; + args.options = options; + args.shader_info = info; + args.is_gs_copy_shader = gs_copy_shader; + radv_declare_shader_args(&args, + gs_copy_shader ? MESA_SHADER_VERTEX + : shaders[shader_count - 1]->info.stage, + shader_count >= 2, + shader_count >= 2 ? shaders[shader_count - 2]->info.stage + : MESA_SHADER_VERTEX); + + if (!use_aco || options->dump_shader || options->record_ir) + ac_init_llvm_once(); + + if (use_aco) { + aco_compile_shader(shader_count, shaders, &binary, &args); + binary->info = *info; + } else { + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; - if (options->supports_spill) tm_options |= AC_TM_SUPPORTS_SPILL; - if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) - tm_options |= AC_TM_SISCHED; - if (options->check_ir) - tm_options |= AC_TM_CHECK_IR; - if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) - tm_options |= AC_TM_NO_LOAD_STORE_OPT; - - thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); - radv_init_llvm_once(); - radv_init_llvm_compiler(&ac_llvm, - thread_compiler, - chip_family, tm_options, - options->wave_size); - if (gs_copy_shader) { - assert(shader_count == 1); - radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary, - &variant_info, options); - } else { - radv_compile_nir_shader(&ac_llvm, &binary, &variant_info, - shaders, shader_count, options); - } - binary->variant_info = variant_info; + if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED) + tm_options |= AC_TM_SISCHED; + if (options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + radv_init_llvm_compiler(&ac_llvm, + thread_compiler, + chip_family, tm_options, + info->wave_size); + + if (gs_copy_shader) { + assert(shader_count == 1); + radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary, + &args); + } else { + radv_compile_nir_shader(&ac_llvm, &binary, &args, + shaders, shader_count); + } - radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); + binary->info = *info; + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); + } struct radv_shader_variant *variant = radv_shader_variant_create(device, binary, keep_shader_info); @@ -1199,6 +1171,7 @@ free(binary); return NULL; } + variant->aco_used = use_aco; if (options->dump_shader) { fprintf(stderr, "disasm:\n%s\n", variant->disasm_string); @@ -1208,7 +1181,14 @@ if (keep_shader_info) { variant->nir_string = radv_dump_nir_shaders(shaders, shader_count); if (!gs_copy_shader && !module->nir) { - variant->spirv = (uint32_t *)module->data; + variant->spirv = malloc(module->size); + if (!variant->spirv) { + free(variant); + free(binary); + return NULL; + } + + memcpy(variant->spirv, module->data, module->size); variant->spirv_size = module->size; } } @@ -1228,7 +1208,9 @@ int shader_count, struct radv_pipeline_layout *layout, const struct radv_shader_variant_key *key, + struct radv_shader_info *info, bool keep_shader_info, + bool use_aco, struct radv_shader_binary **binary_out) { struct radv_nir_compiler_options options = {0}; @@ -1237,27 +1219,28 @@ if (key) options.key = *key; - options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH); - options.supports_spill = true; + options.explicit_scratch_args = use_aco; options.robust_buffer_access = device->robust_buffer_access; - return shader_variant_compile(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, - &options, false, keep_shader_info, binary_out); + return shader_variant_compile(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage, info, + &options, false, keep_shader_info, use_aco, binary_out); } struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader, + struct radv_shader_info *info, struct radv_shader_binary **binary_out, bool keep_shader_info, - bool multiview) + bool multiview, bool use_aco) { struct radv_nir_compiler_options options = {0}; + options.explicit_scratch_args = use_aco; options.key.has_multiview_view_index = multiview; return shader_variant_compile(device, NULL, &shader, 1, MESA_SHADER_VERTEX, - &options, true, keep_shader_info, binary_out); + info, &options, true, keep_shader_info, use_aco, binary_out); } void @@ -1271,14 +1254,15 @@ list_del(&variant->slab_list); mtx_unlock(&device->shader_slab_mutex); + free(variant->spirv); free(variant->nir_string); free(variant->disasm_string); - free(variant->llvm_ir_string); + free(variant->ir_string); free(variant); } const char * -radv_get_shader_name(struct radv_shader_variant_info *info, +radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage) { switch (stage) { @@ -1338,16 +1322,16 @@ { enum chip_class chip_class = device->physical_device->rad_info.chip_class; unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; - uint8_t wave_size = variant->info.info.wave_size; + uint8_t wave_size = variant->info.wave_size; struct ac_shader_config *conf = &variant->config; unsigned max_simd_waves; unsigned lds_per_wave = 0; - max_simd_waves = ac_get_max_wave64_per_simd(device->physical_device->rad_info.family); + max_simd_waves = device->physical_device->rad_info.max_wave64_per_simd; if (stage == MESA_SHADER_FRAGMENT) { lds_per_wave = conf->lds_size * lds_increment + - align(variant->info.fs.num_interp * 48, + align(variant->info.ps.num_interp * 48, lds_increment); } else if (stage == MESA_SHADER_COMPUTE) { unsigned max_workgroup_size = @@ -1356,16 +1340,20 @@ DIV_ROUND_UP(max_workgroup_size, wave_size); } - if (conf->num_sgprs) + if (conf->num_sgprs) { + unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8); max_simd_waves = MIN2(max_simd_waves, - ac_get_num_physical_sgprs(&device->physical_device->rad_info) / - conf->num_sgprs); + device->physical_device->rad_info.num_physical_sgprs_per_simd / + sgprs); + } - if (conf->num_vgprs) + if (conf->num_vgprs) { + unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4); max_simd_waves = MIN2(max_simd_waves, - RADV_NUM_PHYSICAL_VGPRS / conf->num_vgprs); + RADV_NUM_PHYSICAL_VGPRS / vgprs); + } /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD * that PS can use. @@ -1405,7 +1393,7 @@ "********************\n\n\n", conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, - variant->info.private_mem_vgprs, variant->code_size, + variant->info.private_mem_vgprs, variant->exec_size, conf->lds_size, conf->scratch_bytes_per_wave, max_simd_waves); } @@ -1457,7 +1445,7 @@ VkShaderStatisticsInfoAMD statistics = {}; statistics.shaderStageMask = shaderStage; statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS; - statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(&device->physical_device->rad_info); + statistics.numPhysicalSgprs = device->physical_device->rad_info.num_physical_sgprs_per_simd; statistics.numAvailableSgprs = statistics.numPhysicalSgprs; if (stage == MESA_SHADER_COMPUTE) { @@ -1494,7 +1482,7 @@ buf = _mesa_string_buffer_create(NULL, 1024); _mesa_string_buffer_printf(buf, "%s:\n", radv_get_shader_name(&variant->info, stage)); - _mesa_string_buffer_printf(buf, "%s\n\n", variant->llvm_ir_string); + _mesa_string_buffer_printf(buf, "%s\n\n", variant->ir_string); _mesa_string_buffer_printf(buf, "%s\n\n", variant->disasm_string); generate_shader_stats(device, variant, stage, buf); diff -Nru mesa-19.2.8/src/amd/vulkan/radv_shader.h mesa-20.0.8/src/amd/vulkan/radv_shader.h --- mesa-19.2.8/src/amd/vulkan/radv_shader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_shader.h 2020-06-12 01:21:16.000000000 +0000 @@ -55,9 +55,11 @@ uint32_t as_es:1; uint32_t as_ls:1; uint32_t as_ngg:1; + uint32_t as_ngg_passthrough:1; uint32_t export_prim_id:1; uint32_t export_layer_id:1; uint32_t export_clip_dists:1; + uint32_t export_viewport_index:1; }; struct radv_vs_variant_key { @@ -76,6 +78,9 @@ /* For some formats the channels have to be shuffled. */ uint32_t post_shuffle; + + /* Output primitive type. */ + uint8_t outprim; }; struct radv_tes_variant_key { @@ -101,12 +106,17 @@ uint32_t is_int10; }; +struct radv_cs_variant_key { + uint8_t subgroup_size; +}; + struct radv_shader_variant_key { union { struct radv_vs_variant_key vs; struct radv_fs_variant_key fs; struct radv_tes_variant_key tes; struct radv_tcs_variant_key tcs; + struct radv_cs_variant_key cs; /* A common prefix of the vs and tes keys. */ struct radv_vs_out_key vs_common_out; @@ -117,19 +127,19 @@ struct radv_nir_compiler_options { struct radv_pipeline_layout *layout; struct radv_shader_variant_key key; - bool unsafe_math; - bool supports_spill; + bool explicit_scratch_args; bool clamp_shadow_reference; bool robust_buffer_access; bool dump_shader; bool dump_preoptir; - bool record_llvm_ir; + bool record_ir; bool check_ir; + bool has_ls_vgpr_init_bug; + bool use_ngg_streamout; enum radeon_family family; enum chip_class chip_class; uint32_t tess_offchip_block_dw_size; uint32_t address32_hi; - uint8_t wave_size; }; enum radv_ud_index { @@ -139,7 +149,8 @@ AC_UD_INDIRECT_DESCRIPTOR_SETS = 3, AC_UD_VIEW_INDEX = 4, AC_UD_STREAMOUT_BUFFERS = 5, - AC_UD_SHADER_START = 6, + AC_UD_NGG_GS_STATE = 6, + AC_UD_SHADER_START = 7, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_MAX_UD, @@ -167,6 +178,51 @@ uint32_t enabled_stream_buffers_mask; }; +struct radv_userdata_info { + int8_t sgpr_idx; + uint8_t num_sgprs; +}; + +struct radv_userdata_locations { + struct radv_userdata_info descriptor_sets[MAX_SETS]; + struct radv_userdata_info shader_data[AC_UD_MAX_UD]; + uint32_t descriptor_sets_enabled; +}; + +struct radv_vs_output_info { + uint8_t vs_output_param_offset[VARYING_SLOT_MAX]; + uint8_t clip_dist_mask; + uint8_t cull_dist_mask; + uint8_t param_exports; + bool writes_pointsize; + bool writes_layer; + bool writes_viewport_index; + bool export_prim_id; + unsigned pos_exports; +}; + +struct radv_es_output_info { + uint32_t esgs_itemsize; +}; + +struct gfx9_gs_info { + uint32_t vgt_gs_onchip_cntl; + uint32_t vgt_gs_max_prims_per_subgroup; + uint32_t vgt_esgs_ring_itemsize; + uint32_t lds_size; +}; + +struct gfx10_ngg_info { + uint16_t ngg_emit_size; /* in dwords */ + uint32_t hw_max_esverts; + uint32_t max_gsprims; + uint32_t max_out_verts; + uint32_t prim_amp_factor; + uint32_t vgt_esgs_ring_itemsize; + uint32_t esgs_ring_size; + bool max_vert_out_per_gs_instance; +}; + struct radv_shader_info { bool loads_push_constants; bool loads_dynamic_offsets; @@ -181,6 +237,15 @@ bool uses_invocation_id; bool uses_prim_id; uint8_t wave_size; + uint8_t ballot_bit_size; + struct radv_userdata_locations user_sgprs_locs; + unsigned num_user_sgprs; + unsigned num_input_sgprs; + unsigned num_input_vgprs; + unsigned private_mem_vgprs; + bool need_indirect_descriptor_sets; + bool is_ngg; + bool is_ngg_passthrough; struct { uint64_t ls_outputs_written; uint8_t input_usage_mask[VERT_ATTRIB_MAX]; @@ -188,15 +253,36 @@ bool has_vertex_buffers; /* needs vertex buffers and base/start */ bool needs_draw_id; bool needs_instance_id; + struct radv_vs_output_info outinfo; + struct radv_es_output_info es_info; + bool as_es; + bool as_ls; + bool export_prim_id; } vs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; uint8_t num_stream_output_components[4]; uint8_t output_streams[VARYING_SLOT_VAR31 + 1]; uint8_t max_stream; + bool writes_memory; + unsigned gsvs_vertex_size; + unsigned max_gsvs_emit_size; + unsigned vertices_in; + unsigned vertices_out; + unsigned output_prim; + unsigned invocations; + unsigned es_type; /* GFX9: VS or TES */ } gs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; + struct radv_vs_output_info outinfo; + struct radv_es_output_info es_info; + bool as_es; + unsigned primitive_mode; + enum gl_tess_spacing spacing; + bool ccw; + bool point_mode; + bool export_prim_id; } tes; struct { bool force_persample; @@ -208,103 +294,38 @@ bool has_pcoord; bool prim_id_input; bool layer_input; + bool viewport_index_input; uint8_t num_input_clips_culls; + uint32_t input_mask; + uint32_t flat_shaded_mask; + uint32_t explicit_shaded_mask; + uint32_t float16_shaded_mask; + uint32_t num_interp; + bool can_discard; + bool early_fragment_test; + bool post_depth_coverage; } ps; struct { bool uses_grid_size; bool uses_block_id[3]; bool uses_thread_id[3]; bool uses_local_invocation_idx; + unsigned block_size[3]; } cs; struct { uint64_t outputs_written; uint64_t patch_outputs_written; + unsigned tcs_vertices_out; + uint32_t num_patches; + uint32_t lds_size; } tcs; struct radv_streamout_info so; -}; - -struct radv_userdata_info { - int8_t sgpr_idx; - uint8_t num_sgprs; -}; - -struct radv_userdata_locations { - struct radv_userdata_info descriptor_sets[MAX_SETS]; - struct radv_userdata_info shader_data[AC_UD_MAX_UD]; - uint32_t descriptor_sets_enabled; -}; - -struct radv_vs_output_info { - uint8_t vs_output_param_offset[VARYING_SLOT_MAX]; - uint8_t clip_dist_mask; - uint8_t cull_dist_mask; - uint8_t param_exports; - bool writes_pointsize; - bool writes_layer; - bool writes_viewport_index; - bool export_prim_id; - unsigned pos_exports; -}; -struct radv_es_output_info { - uint32_t esgs_itemsize; -}; + struct gfx9_gs_info gs_ring_info; + struct gfx10_ngg_info ngg_info; -struct radv_shader_variant_info { - struct radv_userdata_locations user_sgprs_locs; - struct radv_shader_info info; - unsigned num_user_sgprs; - unsigned num_input_sgprs; - unsigned num_input_vgprs; - unsigned private_mem_vgprs; - bool need_indirect_descriptor_sets; - bool is_ngg; - struct { - struct { - struct radv_vs_output_info outinfo; - struct radv_es_output_info es_info; - bool as_es; - bool as_ls; - bool export_prim_id; - } vs; - struct { - unsigned num_interp; - uint32_t input_mask; - uint32_t flat_shaded_mask; - uint32_t float16_shaded_mask; - bool can_discard; - bool early_fragment_test; - bool post_depth_coverage; - } fs; - struct { - unsigned block_size[3]; - } cs; - struct { - unsigned vertices_in; - unsigned vertices_out; - unsigned output_prim; - unsigned invocations; - unsigned gsvs_vertex_size; - unsigned max_gsvs_emit_size; - unsigned es_type; /* GFX9: VS or TES */ - } gs; - struct { - unsigned tcs_vertices_out; - uint32_t num_patches; - uint32_t lds_size; - } tcs; - struct { - struct radv_vs_output_info outinfo; - struct radv_es_output_info es_info; - bool as_es; - unsigned primitive_mode; - enum gl_tess_spacing spacing; - bool ccw; - bool point_mode; - bool export_prim_id; - } tes; - }; + unsigned float_controls_mode; }; enum radv_shader_binary_type { @@ -317,7 +338,7 @@ gl_shader_stage stage; bool is_gs_copy_shader; - struct radv_shader_variant_info variant_info; + struct radv_shader_info info; /* Self-referential size so we avoid consistency issues. */ uint32_t total_size; @@ -327,10 +348,11 @@ struct radv_shader_binary base; struct ac_shader_config config; unsigned code_size; - unsigned llvm_ir_size; + unsigned exec_size; + unsigned ir_size; unsigned disasm_size; - /* data has size of code_size + llvm_ir_size + disasm_size + 2, where + /* data has size of code_size + ir_size + disasm_size + 2, where * the +2 is for 0 of the ir strings. */ uint8_t data[0]; }; @@ -349,14 +371,16 @@ uint64_t bo_offset; struct ac_shader_config config; uint32_t code_size; - struct radv_shader_variant_info info; + uint32_t exec_size; + struct radv_shader_info info; /* debug only */ - uint32_t *spirv; + bool aco_used; + char *spirv; uint32_t spirv_size; char *nir_string; char *disasm_string; - char *llvm_ir_string; + char *ir_string; struct list_head slab_list; }; @@ -383,7 +407,9 @@ gl_shader_stage stage, const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags, - const struct radv_pipeline_layout *layout); + const struct radv_pipeline_layout *layout, + bool use_aco, + unsigned subgroup_size, unsigned ballot_bit_size); void * radv_alloc_shader_memory(struct radv_device *device, @@ -392,6 +418,16 @@ void radv_destroy_shader_slabs(struct radv_device *device); +void +radv_create_shaders(struct radv_pipeline *pipeline, + struct radv_device *device, + struct radv_pipeline_cache *cache, + const struct radv_pipeline_key *key, + const VkPipelineShaderStageCreateInfo **pStages, + const VkPipelineCreateFlags flags, + VkPipelineCreationFeedbackEXT *pipeline_feedback, + VkPipelineCreationFeedbackEXT **stage_feedbacks); + struct radv_shader_variant * radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary, @@ -403,13 +439,17 @@ int shader_count, struct radv_pipeline_layout *layout, const struct radv_shader_variant_key *key, + struct radv_shader_info *info, bool keep_shader_info, + bool use_aco, struct radv_shader_binary **binary_out); struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, + struct radv_shader_info *info, struct radv_shader_binary **binary_out, - bool multiview, bool keep_shader_info); + bool multiview, bool keep_shader_info, + bool use_aco); void radv_shader_variant_destroy(struct radv_device *device, @@ -427,7 +467,7 @@ const unsigned *sizes); const char * -radv_get_shader_name(struct radv_shader_variant_info *info, +radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage); void @@ -445,7 +485,31 @@ radv_can_dump_shader_stats(struct radv_device *device, struct radv_shader_module *module); -unsigned -shader_io_get_unique_index(gl_varying_slot slot); +static inline unsigned +shader_io_get_unique_index(gl_varying_slot slot) +{ + /* handle patch indices separate */ + if (slot == VARYING_SLOT_TESS_LEVEL_OUTER) + return 0; + if (slot == VARYING_SLOT_TESS_LEVEL_INNER) + return 1; + if (slot >= VARYING_SLOT_PATCH0 && slot <= VARYING_SLOT_TESS_MAX) + return 2 + (slot - VARYING_SLOT_PATCH0); + if (slot == VARYING_SLOT_POS) + return 0; + if (slot == VARYING_SLOT_PSIZ) + return 1; + if (slot == VARYING_SLOT_CLIP_DIST0) + return 2; + if (slot == VARYING_SLOT_CLIP_DIST1) + return 3; + /* 3 is reserved for clip dist as well */ + if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31) + return 4 + (slot - VARYING_SLOT_VAR0); + unreachable("illegal slot in get unique index\n"); +} + +void +radv_lower_fs_io(nir_shader *nir); #endif diff -Nru mesa-19.2.8/src/amd/vulkan/radv_shader_info.c mesa-20.0.8/src/amd/vulkan/radv_shader_info.c --- mesa-19.2.8/src/amd/vulkan/radv_shader_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_shader_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -156,6 +156,8 @@ { if (nir->info.stage == MESA_SHADER_FRAGMENT) info->ps.writes_memory = true; + else if (nir->info.stage == MESA_SHADER_GEOMETRY) + info->gs.writes_memory = true; } static void @@ -291,8 +293,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -304,8 +308,10 @@ if (instr->intrinsic == nir_intrinsic_image_deref_store || instr->intrinsic == nir_intrinsic_image_deref_atomic_add || - instr->intrinsic == nir_intrinsic_image_deref_atomic_min || - instr->intrinsic == nir_intrinsic_image_deref_atomic_max || + instr->intrinsic == nir_intrinsic_image_deref_atomic_imin || + instr->intrinsic == nir_intrinsic_image_deref_atomic_umin || + instr->intrinsic == nir_intrinsic_image_deref_atomic_imax || + instr->intrinsic == nir_intrinsic_image_deref_atomic_umax || instr->intrinsic == nir_intrinsic_image_deref_atomic_and || instr->intrinsic == nir_intrinsic_image_deref_atomic_or || instr->intrinsic == nir_intrinsic_image_deref_atomic_xor || @@ -392,7 +398,7 @@ static void gather_info_input_decl_vs(const nir_shader *nir, const nir_variable *var, struct radv_shader_info *info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_variant_key *key) { unsigned attrib_count = glsl_count_attribute_slots(var->type, true); int idx = var->data.location; @@ -403,12 +409,34 @@ for (unsigned i = 0; i < attrib_count; ++i) { unsigned attrib_index = var->data.location + i - VERT_ATTRIB_GENERIC0; - if (options->key.vs.instance_rate_inputs & (1u << attrib_index)) + if (key->vs.instance_rate_inputs & (1u << attrib_index)) info->vs.needs_instance_id = true; } } static void +mark_16bit_ps_input(struct radv_shader_info *info, const struct glsl_type *type, + int location) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { + unsigned attrib_count = glsl_count_attribute_slots(type, false); + if (glsl_type_is_16bit(type)) { + info->ps.float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; + } + } else if (glsl_type_is_array(type)) { + unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); + for (unsigned i = 0; i < glsl_get_length(type); ++i) { + mark_16bit_ps_input(info, glsl_get_array_element(type), location + i * stride); + } + } else { + assert(glsl_type_is_struct_or_ifc(type)); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + mark_16bit_ps_input(info, glsl_get_struct_field(type, i), location); + location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); + } + } +} +static void gather_info_input_decl_ps(const nir_shader *nir, const nir_variable *var, struct radv_shader_info *info) { @@ -430,6 +458,9 @@ case VARYING_SLOT_CLIP_DIST1: info->ps.num_input_clips_culls += attrib_count; break; + case VARYING_SLOT_VIEWPORT: + info->ps.viewport_index_input = true; + break; default: break; } @@ -438,16 +469,34 @@ if (var->data.sample) info->ps.force_persample = true; } + + if (var->data.compact) { + unsigned component_count = var->data.location_frac + + glsl_get_length(var->type); + attrib_count = (component_count + 3) / 4; + } else { + mark_16bit_ps_input(info, var->type, var->data.driver_location); + } + + uint64_t mask = ((1ull << attrib_count) - 1); + + if (var->data.interpolation == INTERP_MODE_FLAT) + info->ps.flat_shaded_mask |= mask << var->data.driver_location; + if (var->data.interpolation == INTERP_MODE_EXPLICIT) + info->ps.explicit_shaded_mask |= mask << var->data.driver_location; + + if (var->data.location >= VARYING_SLOT_VAR0) + info->ps.input_mask |= mask << (var->data.location - VARYING_SLOT_VAR0); } static void gather_info_input_decl(const nir_shader *nir, const nir_variable *var, struct radv_shader_info *info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_variant_key *key) { switch (nir->info.stage) { case MESA_SHADER_VERTEX: - gather_info_input_decl_vs(nir, var, info, options); + gather_info_input_decl_vs(nir, var, info, key); break; case MESA_SHADER_FRAGMENT: gather_info_input_decl_ps(nir, var, info); @@ -508,22 +557,58 @@ static void gather_info_output_decl(const nir_shader *nir, const nir_variable *var, struct radv_shader_info *info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_variant_key *key) { + struct radv_vs_output_info *vs_info = NULL; + switch (nir->info.stage) { case MESA_SHADER_FRAGMENT: gather_info_output_decl_ps(nir, var, info); break; case MESA_SHADER_VERTEX: - if (options->key.vs_common_out.as_ls) + if (!key->vs_common_out.as_ls && + !key->vs_common_out.as_es) + vs_info = &info->vs.outinfo; + + if (key->vs_common_out.as_ls) gather_info_output_decl_ls(nir, var, info); + else if (key->vs_common_out.as_ngg) + gather_info_output_decl_gs(nir, var, info); break; case MESA_SHADER_GEOMETRY: + vs_info = &info->vs.outinfo; gather_info_output_decl_gs(nir, var, info); break; + case MESA_SHADER_TESS_EVAL: + if (!key->vs_common_out.as_es) + vs_info = &info->tes.outinfo; + break; default: break; } + + if (vs_info) { + switch (var->data.location) { + case VARYING_SLOT_CLIP_DIST0: + vs_info->clip_dist_mask = + (1 << nir->info.clip_distance_array_size) - 1; + vs_info->cull_dist_mask = + (1 << nir->info.cull_distance_array_size) - 1; + vs_info->cull_dist_mask <<= nir->info.clip_distance_array_size; + break; + case VARYING_SLOT_PSIZ: + vs_info->writes_pointsize = true; + break; + case VARYING_SLOT_VIEWPORT: + vs_info->writes_viewport_index = true; + break; + case VARYING_SLOT_LAYER: + vs_info->writes_layer = true; + break; + default: + break; + } + } } static void @@ -569,27 +654,28 @@ void radv_nir_shader_info_pass(const struct nir_shader *nir, - const struct radv_nir_compiler_options *options, + const struct radv_pipeline_layout *layout, + const struct radv_shader_variant_key *key, struct radv_shader_info *info) { struct nir_function *func = (struct nir_function *)exec_list_get_head_const(&nir->functions); - if (options->layout && options->layout->dynamic_offset_count && - (options->layout->dynamic_shader_stages & mesa_to_vk_shader_stage(nir->info.stage))) { + if (layout && layout->dynamic_offset_count && + (layout->dynamic_shader_stages & mesa_to_vk_shader_stage(nir->info.stage))) { info->loads_push_constants = true; info->loads_dynamic_offsets = true; } nir_foreach_variable(variable, &nir->inputs) - gather_info_input_decl(nir, variable, info, options); + gather_info_input_decl(nir, variable, info, key); nir_foreach_block(block, func->impl) { gather_info_block(nir, block, info); } nir_foreach_variable(variable, &nir->outputs) - gather_info_output_decl(nir, variable, info, options); + gather_info_output_decl(nir, variable, info, key); if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL || @@ -597,7 +683,7 @@ gather_xfb_info(nir, info); /* Make sure to export the LayerID if the fragment shader needs it. */ - if (options->key.vs_common_out.export_layer_id) { + if (key->vs_common_out.export_layer_id) { switch (nir->info.stage) { case MESA_SHADER_VERTEX: info->vs.output_usage_mask[VARYING_SLOT_LAYER] |= 0x1; @@ -612,4 +698,128 @@ break; } } + + /* Make sure to export the LayerID if the subpass has multiviews. */ + if (key->has_multiview_view_index) { + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + info->vs.outinfo.writes_layer = true; + break; + case MESA_SHADER_TESS_EVAL: + info->tes.outinfo.writes_layer = true; + break; + case MESA_SHADER_GEOMETRY: + info->vs.outinfo.writes_layer = true; + break; + default: + break; + } + } + + /* Make sure to export the PrimitiveID if the fragment shader needs it. */ + if (key->vs_common_out.export_prim_id) { + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + info->vs.outinfo.export_prim_id = true; + break; + case MESA_SHADER_TESS_EVAL: + info->tes.outinfo.export_prim_id = true; + break; + case MESA_SHADER_GEOMETRY: + info->vs.outinfo.export_prim_id = true; + break; + default: + break; + } + } + + /* Make sure to export the ViewportIndex if the fragment shader needs it. */ + if (key->vs_common_out.export_viewport_index) { + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + info->vs.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; + break; + case MESA_SHADER_TESS_EVAL: + info->tes.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; + break; + case MESA_SHADER_GEOMETRY: + info->gs.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; + break; + default: + break; + } + } + + if (nir->info.stage == MESA_SHADER_FRAGMENT) + info->ps.num_interp = nir->num_inputs; + + switch (nir->info.stage) { + case MESA_SHADER_COMPUTE: + for (int i = 0; i < 3; ++i) + info->cs.block_size[i] = nir->info.cs.local_size[i]; + break; + case MESA_SHADER_FRAGMENT: + info->ps.can_discard = nir->info.fs.uses_discard; + info->ps.early_fragment_test = nir->info.fs.early_fragment_tests; + info->ps.post_depth_coverage = nir->info.fs.post_depth_coverage; + break; + case MESA_SHADER_GEOMETRY: + info->gs.vertices_in = nir->info.gs.vertices_in; + info->gs.vertices_out = nir->info.gs.vertices_out; + info->gs.output_prim = nir->info.gs.output_primitive; + info->gs.invocations = nir->info.gs.invocations; + break; + case MESA_SHADER_TESS_EVAL: + info->tes.primitive_mode = nir->info.tess.primitive_mode; + info->tes.spacing = nir->info.tess.spacing; + info->tes.ccw = nir->info.tess.ccw; + info->tes.point_mode = nir->info.tess.point_mode; + info->tes.as_es = key->vs_common_out.as_es; + info->tes.export_prim_id = key->vs_common_out.export_prim_id; + info->is_ngg = key->vs_common_out.as_ngg; + info->is_ngg_passthrough = key->vs_common_out.as_ngg_passthrough; + break; + case MESA_SHADER_TESS_CTRL: + info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out; + break; + case MESA_SHADER_VERTEX: + info->vs.as_es = key->vs_common_out.as_es; + info->vs.as_ls = key->vs_common_out.as_ls; + info->vs.export_prim_id = key->vs_common_out.export_prim_id; + info->is_ngg = key->vs_common_out.as_ngg; + info->is_ngg_passthrough = key->vs_common_out.as_ngg_passthrough; + break; + default: + break; + } + + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + unsigned add_clip = nir->info.clip_distance_array_size + + nir->info.cull_distance_array_size > 4; + info->gs.gsvs_vertex_size = + (util_bitcount64(nir->info.outputs_written) + add_clip) * 16; + info->gs.max_gsvs_emit_size = + info->gs.gsvs_vertex_size * nir->info.gs.vertices_out; + } + + /* Compute the ESGS item size for VS or TES as ES. */ + if ((nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_TESS_EVAL) && + key->vs_common_out.as_es) { + struct radv_es_output_info *es_info = + nir->info.stage == MESA_SHADER_VERTEX ? &info->vs.es_info : &info->tes.es_info; + uint32_t max_output_written = 0; + + uint64_t output_mask = nir->info.outputs_written; + while (output_mask) { + const int i = u_bit_scan64(&output_mask); + unsigned param_index = shader_io_get_unique_index(i); + + max_output_written = MAX2(param_index, max_output_written); + } + + es_info->esgs_itemsize = (max_output_written + 1) * 16; + } + + info->float_controls_mode = nir->info.float_controls_execution_mode; } diff -Nru mesa-19.2.8/src/amd/vulkan/radv_wsi.c mesa-20.0.8/src/amd/vulkan/radv_wsi.c --- mesa-19.2.8/src/amd/vulkan/radv_wsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_wsi.c 2020-06-12 01:21:16.000000000 +0000 @@ -231,19 +231,39 @@ RADV_FROM_HANDLE(radv_device, device, _device); struct radv_physical_device *pdevice = device->physical_device; RADV_FROM_HANDLE(radv_fence, fence, pAcquireInfo->fence); + RADV_FROM_HANDLE(radv_semaphore, semaphore, pAcquireInfo->semaphore); VkResult result = wsi_common_acquire_next_image2(&pdevice->wsi_device, _device, pAcquireInfo, pImageIndex); - if (fence && (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR)) { - if (fence->fence) - device->ws->signal_fence(fence->fence); - if (fence->temp_syncobj) { - device->ws->signal_syncobj(device->ws, fence->temp_syncobj); - } else if (fence->syncobj) { - device->ws->signal_syncobj(device->ws, fence->syncobj); + if (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR) { + if (fence) { + if (fence->fence) + device->ws->signal_fence(fence->fence); + if (fence->temp_syncobj) { + device->ws->signal_syncobj(device->ws, fence->temp_syncobj); + } else if (fence->syncobj) { + device->ws->signal_syncobj(device->ws, fence->syncobj); + } + } + if (semaphore) { + struct radv_semaphore_part *part = + semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (part->kind) { + case RADV_SEMAPHORE_NONE: + case RADV_SEMAPHORE_WINSYS: + /* Do not need to do anything. */ + break; + case RADV_SEMAPHORE_TIMELINE: + unreachable("WSI only allows binary semaphores."); + case RADV_SEMAPHORE_SYNCOBJ: + device->ws->signal_syncobj(device->ws, part->syncobj); + break; + } } } return result; diff -Nru mesa-19.2.8/src/amd/vulkan/radv_wsi_display.c mesa-20.0.8/src/amd/vulkan/radv_wsi_display.c --- mesa-19.2.8/src/amd/vulkan/radv_wsi_display.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/radv_wsi_display.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,7 +34,6 @@ #include #include #include "winsys/amdgpu/radv_amdgpu_winsys_public.h" -#include "ac_llvm_util.h" #include "vk_format.h" #include "sid.h" #include "util/debug.h" diff -Nru mesa-19.2.8/src/amd/vulkan/si_cmd_buffer.c mesa-20.0.8/src/amd/vulkan/si_cmd_buffer.c --- mesa-19.2.8/src/amd/vulkan/si_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/si_cmd_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -159,13 +159,14 @@ si_emit_graphics(struct radv_physical_device *physical_device, struct radeon_cmdbuf *cs) { + bool has_clear_state = physical_device->rad_info.has_clear_state; int i; radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1)); radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - if (physical_device->has_clear_state) { + if (has_clear_state) { radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0)); radeon_emit(cs, 0); } @@ -174,7 +175,7 @@ si_set_raster_config(physical_device, cs); radeon_set_context_reg(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); - if (!physical_device->has_clear_state) + if (!has_clear_state) radeon_set_context_reg(cs, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); /* FIXME calculate these values somehow ??? */ @@ -183,7 +184,7 @@ radeon_set_context_reg(cs, R_028A58_VGT_ES_PER_GS, 0x40); } - if (!physical_device->has_clear_state) { + if (!has_clear_state) { radeon_set_context_reg(cs, R_028A5C_VGT_GS_PER_VS, 0x2); radeon_set_context_reg(cs, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); radeon_set_context_reg(cs, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); @@ -191,19 +192,19 @@ if (physical_device->rad_info.chip_class <= GFX9) radeon_set_context_reg(cs, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (!physical_device->has_clear_state) + if (!has_clear_state) radeon_set_context_reg(cs, R_028AB8_VGT_VTX_CNT_EN, 0x0); if (physical_device->rad_info.chip_class < GFX7) radeon_set_config_reg(cs, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); - if (!physical_device->has_clear_state) + if (!has_clear_state) radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); /* CLEAR_STATE doesn't clear these correctly on certain generations. * I don't know why. Deduced by trial and error. */ - if (physical_device->rad_info.chip_class <= GFX7 || !physical_device->has_clear_state) { + if (physical_device->rad_info.chip_class <= GFX7 || !has_clear_state) { radeon_set_context_reg(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); radeon_set_context_reg(cs, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); @@ -216,14 +217,14 @@ S_028034_BR_X(16384) | S_028034_BR_Y(16384)); } - if (!physical_device->has_clear_state) { + if (!has_clear_state) { for (i = 0; i < 16; i++) { radeon_set_context_reg(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + i*8, 0); radeon_set_context_reg(cs, R_0282D4_PA_SC_VPORT_ZMAX_0 + i*8, fui(1.0)); } } - if (!physical_device->has_clear_state) { + if (!has_clear_state) { radeon_set_context_reg(cs, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF); radeon_set_context_reg(cs, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA); /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on GFX6 */ @@ -326,8 +327,11 @@ } } - /* Don't use late alloc for NGG on Navi14 due to a hw bug. */ - if (physical_device->rad_info.family == CHIP_NAVI14) { + /* Don't use late alloc for NGG on Navi14 due to a hw bug. + * If NGG is never used, enable all CUs. + */ + if (!physical_device->use_ngg || + physical_device->rad_info.family == CHIP_NAVI14) { late_alloc_limit_gs = 0; cu_mask_gs = 0xffff; } @@ -417,44 +421,14 @@ radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); - } else if (!physical_device->has_clear_state) { + } else if (!has_clear_state) { radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); } if (physical_device->rad_info.chip_class >= GFX9) { - unsigned num_se = physical_device->rad_info.max_se; - unsigned pc_lines = 0; - unsigned max_alloc_count = 0; - - switch (physical_device->rad_info.family) { - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - pc_lines = 4096; - break; - case CHIP_RAVEN: - case CHIP_RAVEN2: - case CHIP_RENOIR: - case CHIP_NAVI10: - case CHIP_NAVI12: - pc_lines = 1024; - break; - case CHIP_NAVI14: - pc_lines = 512; - break; - default: - assert(0); - } - - if (physical_device->rad_info.chip_class >= GFX10) { - max_alloc_count = pc_lines / 3; - } else { - max_alloc_count = MIN2(128, pc_lines / (4 * num_se)); - } - radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1, - S_028C48_MAX_ALLOC_COUNT(max_alloc_count - 1) | + S_028C48_MAX_ALLOC_COUNT(physical_device->rad_info.pbb_max_alloc_count - 1) | S_028C48_MAX_PRIM_PER_BATCH(1023)); radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); @@ -468,7 +442,7 @@ radeon_emit(cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) | S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2))); - if (!physical_device->has_clear_state) { + if (!has_clear_state) { radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL, S_028004_ZPASS_INCREMENT_DISABLE(1)); } @@ -912,15 +886,19 @@ gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); if (flush_bits & RADV_CMD_FLAG_INV_L2) { /* Writeback and invalidate everything in L2. */ - gcr_cntl |= S_586_GL2_INV(1) | S_586_GLM_INV(1); + gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | + S_586_GLM_INV(1) | S_586_GLM_WB(1); } else if (flush_bits & RADV_CMD_FLAG_WB_L2) { - /* Writeback but do not invalidate. */ - gcr_cntl |= S_586_GL2_WB(1); + /* Writeback but do not invalidate. + * GLM doesn't support WB alone. If WB is set, INV must be set too. + */ + gcr_cntl |= S_586_GL2_WB(1) | + S_586_GLM_WB(1) | S_586_GLM_INV(1); } /* TODO: Implement this new flag for GFX9+. - if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) - gcr_cntl |= S_586_GLM_INV(1); + else if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA) + gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); */ if (flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) { diff -Nru mesa-19.2.8/src/amd/vulkan/vk_format_table.py mesa-20.0.8/src/amd/vulkan/vk_format_table.py --- mesa-19.2.8/src/amd/vulkan/vk_format_table.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/vk_format_table.py 2020-06-12 01:21:16.000000000 +0000 @@ -80,7 +80,7 @@ if format.nr_channels() <= 1: func(format.le_channels, format.le_swizzles) else: - print('#ifdef PIPE_ARCH_BIG_ENDIAN') + print('#if UTIL_ARCH_BIG_ENDIAN') func(format.be_channels, format.be_swizzles) print('#else') func(format.le_channels, format.le_swizzles) diff -Nru mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c --- mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,6 +38,9 @@ #include "util/u_atomic.h" +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 1 + static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo); static int @@ -262,7 +265,7 @@ } else { if (bo->ws->debug_all_bos) { pthread_mutex_lock(&bo->ws->global_bo_list_lock); - LIST_DEL(&bo->global_list_item); + list_del(&bo->global_list_item); bo->ws->num_buffers--; pthread_mutex_unlock(&bo->ws->global_bo_list_lock); } @@ -291,7 +294,7 @@ if (bo->ws->debug_all_bos) { pthread_mutex_lock(&ws->global_bo_list_lock); - LIST_ADDTAIL(&bo->global_list_item, &ws->global_bo_list); + list_addtail(&bo->global_list_item, &ws->global_bo_list); ws->num_buffers++; pthread_mutex_unlock(&ws->global_bo_list_lock); } @@ -356,6 +359,10 @@ request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; if (initial_domain & RADEON_DOMAIN_GTT) request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; + if (initial_domain & RADEON_DOMAIN_GDS) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS; + if (initial_domain & RADEON_DOMAIN_OA) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA; if (flags & RADEON_FLAG_CPU_ACCESS) { bo->base.vram_cpu_access = initial_domain & RADEON_DOMAIN_VRAM; @@ -530,8 +537,7 @@ static struct radeon_winsys_bo * radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws, int fd, unsigned priority, - unsigned *stride, - unsigned *offset) + uint64_t *alloc_size) { struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); struct radv_amdgpu_winsys_bo *bo; @@ -554,6 +560,10 @@ if (r) goto error_query; + if (alloc_size) { + *alloc_size = info.alloc_size; + } + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, result.alloc_size, 1 << 20, 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH); @@ -656,10 +666,11 @@ { struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); struct amdgpu_bo_metadata metadata = {0}; - uint32_t tiling_flags = 0; + uint64_t tiling_flags = 0; if (bo->ws->info.chip_class >= GFX9) { tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode); + tiling_flags |= AMDGPU_TILING_SET(SCANOUT, md->u.gfx9.scanout); } else { if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ @@ -704,6 +715,7 @@ if (bo->ws->info.chip_class >= GFX9) { md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE); + md->u.gfx9.scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT); } else { md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; diff -Nru mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c --- mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 2020-06-12 01:21:16.000000000 +0000 @@ -163,7 +163,7 @@ fence->fence.ip_instance = req->ip_instance; fence->fence.ring = req->ring; fence->fence.fence = req->seq_no; - fence->user_ptr = (volatile uint64_t*)(ctx->fence_map + (req->ip_type * MAX_RINGS_PER_TYPE + req->ring) * sizeof(uint64_t)); + fence->user_ptr = (volatile uint64_t*)(ctx->fence_map + req->ip_type * MAX_RINGS_PER_TYPE + req->ring); } static struct radeon_winsys_fence *radv_amdgpu_create_fence() @@ -1119,6 +1119,7 @@ ibs[j].size = size; ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]); + ibs[j].flags = 0; } cnt++; @@ -1163,6 +1164,7 @@ ibs[0].size = size; ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]); + ibs[0].flags = 0; } r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, @@ -1315,20 +1317,26 @@ } } -static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_ws, - enum radeon_ctx_priority priority) +static VkResult radv_amdgpu_ctx_create(struct radeon_winsys *_ws, + enum radeon_ctx_priority priority, + struct radeon_winsys_ctx **rctx) { struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws); struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx); uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority); + VkResult result; int r; if (!ctx) - return NULL; + return VK_ERROR_OUT_OF_HOST_MEMORY; r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx); - if (r) { + if (r && r == -EACCES) { + result = VK_ERROR_NOT_PERMITTED_EXT; + goto error_create; + } else if (r) { fprintf(stderr, "amdgpu: radv_amdgpu_cs_ctx_create2 failed. (%i)\n", r); + result = VK_ERROR_OUT_OF_HOST_MEMORY; goto error_create; } ctx->ws = ws; @@ -1343,10 +1351,12 @@ ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo); if (ctx->fence_map) memset(ctx->fence_map, 0, 4096); - return (struct radeon_winsys_ctx *)ctx; + + *rctx = (struct radeon_winsys_ctx *)ctx; + return VK_SUCCESS; error_create: FREE(ctx); - return NULL; + return result; } static void radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) diff -Nru mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c --- mesa-19.2.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c 2020-06-12 01:21:16.000000000 +0000 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "radv_amdgpu_cs.h" @@ -46,7 +47,7 @@ return false; /* LLVM 9.0 is required for GFX10. */ - if (ws->info.chip_class == GFX10 && HAVE_LLVM < 0x0900) { + if (ws->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { fprintf(stderr, "radv: Navi family support requires LLVM 9 or higher\n"); return false; } @@ -61,8 +62,8 @@ return false; } - ws->info.num_sdma_rings = MIN2(ws->info.num_sdma_rings, MAX_RINGS_PER_TYPE); - ws->info.num_compute_rings = MIN2(ws->info.num_compute_rings, MAX_RINGS_PER_TYPE); + ws->info.num_rings[RING_DMA] = MIN2(ws->info.num_rings[RING_DMA], MAX_RINGS_PER_TYPE); + ws->info.num_rings[RING_COMPUTE] = MIN2(ws->info.num_rings[RING_COMPUTE], MAX_RINGS_PER_TYPE); ws->use_ib_bos = ws->info.chip_class >= GFX7; return true; @@ -189,7 +190,7 @@ ws->use_local_bos = perftest_flags & RADV_PERFTEST_LOCAL_BOS; ws->zero_all_vram_allocs = debug_flags & RADV_DEBUG_ZERO_VRAM; ws->batchchain = !(perftest_flags & RADV_PERFTEST_NO_BATCHCHAIN); - LIST_INITHEAD(&ws->global_bo_list); + list_inithead(&ws->global_bo_list); pthread_mutex_init(&ws->global_bo_list_lock, NULL); ws->base.query_info = radv_amdgpu_winsys_query_info; ws->base.query_value = radv_amdgpu_winsys_query_value; diff -Nru mesa-19.2.8/src/amd/vulkan/winsys/null/radv_null_cs.c mesa-20.0.8/src/amd/vulkan/winsys/null/radv_null_cs.c --- mesa-19.2.8/src/amd/vulkan/winsys/null/radv_null_cs.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/amd/vulkan/winsys/null/radv_null_cs.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,101 @@ +/* + * Copyright © 2020 Valve Corporation + * + * based on amdgpu winsys. + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "radv_null_cs.h" +#include "util/u_memory.h" + +struct radv_null_cs { + struct radeon_cmdbuf base; + struct radv_null_winsys *ws; +}; + +static inline struct radv_null_cs * +radv_null_cs(struct radeon_cmdbuf *base) +{ + return (struct radv_null_cs*)base; +} + +static VkResult radv_null_ctx_create(struct radeon_winsys *_ws, + enum radeon_ctx_priority priority, + struct radeon_winsys_ctx **rctx) +{ + struct radv_null_ctx *ctx = CALLOC_STRUCT(radv_null_ctx); + + if (!ctx) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + *rctx = (struct radeon_winsys_ctx *)ctx; + return VK_SUCCESS; +} + +static void radv_null_ctx_destroy(struct radeon_winsys_ctx *rwctx) +{ + struct radv_null_ctx *ctx = (struct radv_null_ctx *)rwctx; + FREE(ctx); +} + +static struct radeon_cmdbuf * +radv_null_cs_create(struct radeon_winsys *ws, + enum ring_type ring_type) +{ + struct radv_null_cs *cs = calloc(1, sizeof(struct radv_null_cs)); + if (!cs) + return NULL; + + cs->ws = radv_null_winsys(ws); + + cs->base.buf = malloc(16384); + cs->base.max_dw = 4096; + if (!cs->base.buf) { + FREE(cs); + return NULL; + } + + return &cs->base; +} + +static bool radv_null_cs_finalize(struct radeon_cmdbuf *_cs) +{ + return true; +} + +static void radv_null_cs_destroy(struct radeon_cmdbuf *rcs) +{ + struct radv_null_cs *cs = radv_null_cs(rcs); + FREE(cs->base.buf); + FREE(cs); +} + +void radv_null_cs_init_functions(struct radv_null_winsys *ws) +{ + ws->base.ctx_create = radv_null_ctx_create; + ws->base.ctx_destroy = radv_null_ctx_destroy; + ws->base.cs_create = radv_null_cs_create; + ws->base.cs_finalize = radv_null_cs_finalize; + ws->base.cs_destroy = radv_null_cs_destroy; + +} diff -Nru mesa-19.2.8/src/broadcom/cle/gen_pack_header.py mesa-20.0.8/src/broadcom/cle/gen_pack_header.py --- mesa-19.2.8/src/broadcom/cle/gen_pack_header.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/cle/gen_pack_header.py 2020-06-12 01:21:16.000000000 +0000 @@ -62,7 +62,6 @@ '=': '', '>': '', '#': '', - 'α': 'alpha', '&': '', '*': '', '"': '', diff -Nru mesa-19.2.8/src/broadcom/cle/v3d_packet_helpers.h mesa-20.0.8/src/broadcom/cle/v3d_packet_helpers.h --- mesa-19.2.8/src/broadcom/cle/v3d_packet_helpers.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/cle/v3d_packet_helpers.h 2020-06-12 01:21:16.000000000 +0000 @@ -39,7 +39,7 @@ #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) #endif #else -#define VG(x) +#define VG(x) ((void)0) #endif #ifndef __gen_validate_value diff -Nru mesa-19.2.8/src/broadcom/cle/v3d_packet_v33.xml mesa-20.0.8/src/broadcom/cle/v3d_packet_v33.xml --- mesa-19.2.8/src/broadcom/cle/v3d_packet_v33.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/cle/v3d_packet_v33.xml 2020-06-12 01:21:16.000000000 +0000 @@ -174,6 +174,41 @@ + @@ -254,6 +289,7 @@ + @@ -1265,24 +1301,39 @@ - - - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + @@ -1313,6 +1364,11 @@ + + + @@ -1338,7 +1394,7 @@ - + diff -Nru mesa-19.2.8/src/broadcom/common/v3d_debug.c mesa-20.0.8/src/broadcom/common/v3d_debug.c --- mesa-19.2.8/src/broadcom/common/v3d_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/common/v3d_debug.c 2020-06-12 01:21:16.000000000 +0000 @@ -51,6 +51,7 @@ { "perf", V3D_DEBUG_PERF}, { "norast", V3D_DEBUG_NORAST}, { "fs", V3D_DEBUG_FS}, + { "gs", V3D_DEBUG_GS}, { "vs", V3D_DEBUG_VS}, { "cs", V3D_DEBUG_CS}, { "always_flush", V3D_DEBUG_ALWAYS_FLUSH}, @@ -65,7 +66,7 @@ [MESA_SHADER_VERTEX] = V3D_DEBUG_VS, [MESA_SHADER_TESS_CTRL] = 0, [MESA_SHADER_TESS_EVAL] = 0, - [MESA_SHADER_GEOMETRY] = 0, + [MESA_SHADER_GEOMETRY] = V3D_DEBUG_GS, [MESA_SHADER_FRAGMENT] = V3D_DEBUG_FS, [MESA_SHADER_COMPUTE] = V3D_DEBUG_CS, }; diff -Nru mesa-19.2.8/src/broadcom/common/v3d_debug.h mesa-20.0.8/src/broadcom/common/v3d_debug.h --- mesa-19.2.8/src/broadcom/common/v3d_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/common/v3d_debug.h 2020-06-12 01:21:16.000000000 +0000 @@ -47,15 +47,16 @@ #define V3D_DEBUG_VIR (1 << 3) #define V3D_DEBUG_QPU (1 << 4) #define V3D_DEBUG_FS (1 << 5) -#define V3D_DEBUG_VS (1 << 6) -#define V3D_DEBUG_CS (1 << 7) -#define V3D_DEBUG_CL (1 << 8) -#define V3D_DEBUG_SURFACE (1 << 9) -#define V3D_DEBUG_PERF (1 << 10) -#define V3D_DEBUG_NORAST (1 << 11) -#define V3D_DEBUG_ALWAYS_FLUSH (1 << 12) -#define V3D_DEBUG_CLIF (1 << 13) -#define V3D_DEBUG_PRECOMPILE (1 << 14) +#define V3D_DEBUG_GS (1 << 6) +#define V3D_DEBUG_VS (1 << 7) +#define V3D_DEBUG_CS (1 << 8) +#define V3D_DEBUG_CL (1 << 9) +#define V3D_DEBUG_SURFACE (1 << 10) +#define V3D_DEBUG_PERF (1 << 11) +#define V3D_DEBUG_NORAST (1 << 12) +#define V3D_DEBUG_ALWAYS_FLUSH (1 << 13) +#define V3D_DEBUG_CLIF (1 << 14) +#define V3D_DEBUG_PRECOMPILE (1 << 15) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "BROADCOM-MESA" diff -Nru mesa-19.2.8/src/broadcom/common/v3d_limits.h mesa-20.0.8/src/broadcom/common/v3d_limits.h --- mesa-19.2.8/src/broadcom/common/v3d_limits.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/common/v3d_limits.h 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,11 @@ #define V3D_CHANNELS 16 #define V3D_MAX_FS_INPUTS 64 +#define V3D_MAX_GS_INPUTS 64 #define V3D_MAX_VS_INPUTS 64 +#define V3D_MAX_ANY_STAGE_INPUTS MAX3(V3D_MAX_VS_INPUTS, \ + V3D_MAX_GS_INPUTS, \ + V3D_MAX_FS_INPUTS) /* Not specifically a hardware limit, just coordination between compiler and * driver. diff -Nru mesa-19.2.8/src/broadcom/compiler/nir_to_vir.c mesa-20.0.8/src/broadcom/compiler/nir_to_vir.c --- mesa-19.2.8/src/broadcom/compiler/nir_to_vir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/nir_to_vir.c 2020-06-12 01:21:16.000000000 +0000 @@ -22,7 +22,7 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -208,6 +208,9 @@ instr->intrinsic == nir_intrinsic_load_scratch || instr->intrinsic == nir_intrinsic_load_shared); + if (!is_load) + c->tmu_dirty_rcl = true; + bool has_index = !is_shared_or_scratch; int offset_src; @@ -440,7 +443,7 @@ struct qreg result) { struct qinst *last_inst = NULL; - if (!list_empty(&c->cur_block->instructions)) + if (!list_is_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; assert((result.file == QFILE_TEMP && @@ -1364,11 +1367,20 @@ vir_emit_tlb_color_write(c, rt); } +static inline void +vir_VPM_WRITE_indirect(struct v3d_compile *c, + struct qreg val, + struct qreg vpm_index) +{ + assert(c->devinfo->ver >= 40); + vir_STVPMV(c, vpm_index, val); +} + static void vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); + vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index)); } else { /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); @@ -1384,6 +1396,15 @@ vir_VPMWT(c); } +static void +emit_geom_end(struct v3d_compile *c) +{ + /* GFXH-1684: VPM writes need to be complete by the end of the shader. + */ + if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) + vir_VPMWT(c); +} + void v3d_optimize_nir(struct nir_shader *s) { @@ -1397,7 +1418,7 @@ progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); @@ -1471,7 +1492,7 @@ } static void -ntq_setup_vpm_inputs(struct v3d_compile *c) +ntq_setup_vs_inputs(struct v3d_compile *c) { /* Figure out how many components of each vertex attribute the shader * uses. Each variable should have been split to individual @@ -1562,27 +1583,69 @@ } static void -ntq_setup_fs_inputs(struct v3d_compile *c) +get_sorted_input_variables(struct v3d_compile *c, + unsigned *num_entries, + nir_variable ***vars) { - unsigned num_entries = 0; - unsigned num_components = 0; - nir_foreach_variable(var, &c->s->inputs) { - num_entries++; - num_components += glsl_get_components(var->type); - } + *num_entries = 0; + nir_foreach_variable(var, &c->s->inputs) + (*num_entries)++; - nir_variable *vars[num_entries]; + *vars = ralloc_array(c, nir_variable *, *num_entries); unsigned i = 0; nir_foreach_variable(var, &c->s->inputs) - vars[i++] = var; + (*vars)[i++] = var; /* Sort the variables so that we emit the input setup in * driver_location order. This is required for VPM reads, whose data * is fetched into the VPM in driver_location (TGSI register index) * order. */ - qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare); +} + +static void +ntq_setup_gs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + + /* All GS inputs are arrays with as many entries as vertices + * in the input primitive, but here we only care about the + * per-vertex input type. + */ + const struct glsl_type *type = glsl_without_array(var->type); + unsigned array_len = MAX2(glsl_get_length(type), 1); + unsigned loc = var->data.driver_location; + + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + array_len) * 4); + + for (unsigned j = 0; j < array_len; j++) { + unsigned num_elements = glsl_get_vector_elements(type); + for (unsigned k = 0; k < num_elements; k++) { + unsigned chan = var->data.location_frac + k; + unsigned input_idx = c->num_inputs++; + struct v3d_varying_slot slot = + v3d_slot_from_slot_and_component(var->data.location + j, chan); + c->input_slots[input_idx] = slot; + } + } + } +} + + +static void +ntq_setup_fs_inputs(struct v3d_compile *c) +{ + nir_variable **vars; + unsigned num_entries; + get_sorted_input_variables(c, &num_entries, &vars); for (unsigned i = 0; i < num_entries; i++) { nir_variable *var = vars[i]; @@ -1949,6 +2012,55 @@ } static void +emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + assert(instr->num_components == 1); + + uint32_t base_offset = nir_intrinsic_base(instr); + struct qreg src_offset = ntq_get_src(c, instr->src[1], 0); + struct qreg offset = + vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset); + + /* Usually, for VS or FS, we only emit outputs once at program end so + * our VPM writes are never in non-uniform control flow, but this + * is not true for GS, where we are emitting multiple vertices. + */ + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + + vir_VPM_WRITE_indirect(c, ntq_get_src(c, instr->src[0], 0), offset); + + if (vir_in_nonuniform_control_flow(c)) { + struct qinst *last_inst = + (struct qinst *)c->cur_block->instructions.prev; + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + } +} + +static void +ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + /* XXX perf: Use stvpmv with uniform non-constant offsets and + * stvpmd with non-uniform offsets and enable + * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + ntq_emit_color_write(c, instr); + } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) { + emit_store_output_gs(c, instr); + } else { + assert(c->s->info.stage == MESA_SHADER_VERTEX); + assert(instr->num_components == 1); + + vir_VPM_WRITE(c, + ntq_get_src(c, instr->src[0], 0), + nir_intrinsic_base(instr)); + } +} + +static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -1995,8 +2107,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -2088,19 +2202,7 @@ break; case nir_intrinsic_store_output: - /* XXX perf: Use stvpmv with uniform non-constant offsets and - * stvpmd with non-uniform offsets and enable - * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. - */ - if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - ntq_emit_color_write(c, instr); - } else { - assert(instr->num_components == 1); - - vir_VPM_WRITE(c, - ntq_get_src(c, instr->src[0], 0), - nir_intrinsic_base(instr)); - } + ntq_emit_store_output(c, instr); break; case nir_intrinsic_image_deref_size: @@ -2141,10 +2243,10 @@ } case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_group_memory_barrier: /* We don't do any instruction scheduling of these NIR * instructions between each other, so we just need to make @@ -2155,7 +2257,7 @@ */ break; - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: /* Emit a TSY op to get all invocations in the workgroup * (actually supergroup) to block until the last invocation * reaches the TSY op. @@ -2212,6 +2314,43 @@ ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); break; + case nir_intrinsic_load_per_vertex_input: { + /* col: vertex index, row = varying index */ + struct qreg col = ntq_get_src(c, instr->src[0], 0); + uint32_t row_idx = nir_intrinsic_base(instr) * 4 + + nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + struct qreg row = vir_uniform_ui(c, row_idx++); + ntq_store_dest(c, &instr->dest, i, + vir_LDVPMG_IN(c, row, col)); + } + break; + } + + case nir_intrinsic_emit_vertex: + case nir_intrinsic_end_primitive: + unreachable("Should have been lowered in v3d_nir_lower_io"); + break; + + case nir_intrinsic_load_primitive_id: { + /* gl_PrimitiveIdIn is written by the GBG in the first word of + * VPM output header. According to docs, we should read this + * using ldvpm(v,d)_in (See Table 71). + */ + ntq_store_dest(c, &instr->dest, 0, + vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); + break; + } + + case nir_intrinsic_load_invocation_id: + ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); + break; + + case nir_intrinsic_load_fb_layers_v3d: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -2634,10 +2773,21 @@ c->spill_size += V3D_CHANNELS * c->s->scratch_size; } - if (c->s->info.stage == MESA_SHADER_FRAGMENT) + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + ntq_setup_vs_inputs(c); + break; + case MESA_SHADER_GEOMETRY: + ntq_setup_gs_inputs(c); + break; + case MESA_SHADER_FRAGMENT: ntq_setup_fs_inputs(c); - else - ntq_setup_vpm_inputs(c); + break; + case MESA_SHADER_COMPUTE: + break; + default: + unreachable("unsupported shader stage"); + } ntq_setup_outputs(c); @@ -2681,6 +2831,7 @@ .lower_mul_high = true, .lower_wpos_pntc = true, .lower_rotate = true, + .lower_to_scalar = true, }; /** @@ -2782,6 +2933,9 @@ case MESA_SHADER_FRAGMENT: emit_frag_end(c); break; + case MESA_SHADER_GEOMETRY: + emit_geom_end(c); + break; case MESA_SHADER_VERTEX: emit_vert_end(c); break; diff -Nru mesa-19.2.8/src/broadcom/compiler/qpu_schedule.c mesa-20.0.8/src/broadcom/compiler/qpu_schedule.c --- mesa-19.2.8/src/broadcom/compiler/qpu_schedule.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/qpu_schedule.c 2020-06-12 01:21:16.000000000 +0000 @@ -1299,7 +1299,7 @@ const struct v3d_device_info *devinfo = c->devinfo; uint32_t time = 0; - while (!list_empty(&scoreboard->dag->heads)) { + while (!list_is_empty(&scoreboard->dag->heads)) { struct schedule_node *chosen = choose_instruction_to_schedule(devinfo, scoreboard, @@ -1439,7 +1439,7 @@ list_inithead(&setup_list); /* Wrap each instruction in a scheduler structure. */ - while (!list_empty(&block->instructions)) { + while (!list_is_empty(&block->instructions)) { struct qinst *qinst = (struct qinst *)block->instructions.next; struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); diff -Nru mesa-19.2.8/src/broadcom/compiler/qpu_validate.c mesa-20.0.8/src/broadcom/compiler/qpu_validate.c --- mesa-19.2.8/src/broadcom/compiler/qpu_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/qpu_validate.c 2020-06-12 01:21:16.000000000 +0000 @@ -258,8 +258,10 @@ fail_instr(state, "RF write after THREND"); } - if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { fail_instr(state, "RF write after THREND"); + } /* GFXH-1625: No TMUWT in the last instruction */ if (state->last_thrsw_ip - state->ip == 2 && diff -Nru mesa-19.2.8/src/broadcom/compiler/v3d40_tex.c mesa-20.0.8/src/broadcom/compiler/v3d40_tex.c --- mesa-19.2.8/src/broadcom/compiler/v3d40_tex.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/v3d40_tex.c 2020-06-12 01:21:16.000000000 +0000 @@ -252,9 +252,13 @@ return V3D_TMU_OP_REGULAR; case nir_intrinsic_image_deref_atomic_add: return v3d_get_op_for_atomic_add(instr, 3); - case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_imin: + return V3D_TMU_OP_WRITE_SMIN; + case nir_intrinsic_image_deref_atomic_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imax: + return V3D_TMU_OP_WRITE_SMAX; + case nir_intrinsic_image_deref_atomic_umax: return V3D_TMU_OP_WRITE_UMAX; case nir_intrinsic_image_deref_atomic_and: return V3D_TMU_OP_WRITE_AND_READ_INC; @@ -418,4 +422,7 @@ if (nir_intrinsic_dest_components(instr) == 0) vir_TMUWT(c); + + if (instr->intrinsic != nir_intrinsic_image_deref_load) + c->tmu_dirty_rcl = true; } diff -Nru mesa-19.2.8/src/broadcom/compiler/v3d_compiler.h mesa-20.0.8/src/broadcom/compiler/v3d_compiler.h --- mesa-19.2.8/src/broadcom/compiler/v3d_compiler.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/v3d_compiler.h 2020-06-12 01:21:16.000000000 +0000 @@ -279,6 +279,14 @@ * L2T cache will effectively be the shared memory area. */ QUNIFORM_SHARED_OFFSET, + + /** + * Returns the number of layers in the framebuffer. + * + * This is used to cap gl_Layer in geometry shaders to avoid + * out-of-bounds accesses into the tile state during binning. + */ + QUNIFORM_FB_LAYERS, }; static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value) @@ -329,6 +337,7 @@ bool clamp_r:1; } tex[V3D_MAX_TEXTURE_SAMPLERS]; uint8_t ucp_enables; + bool is_last_geometry_stage; }; struct v3d_fs_key { @@ -371,11 +380,21 @@ struct pipe_rt_blend_state blend; }; +struct v3d_gs_key { + struct v3d_key base; + + struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS]; + uint8_t num_used_outputs; + + bool is_coord; + bool per_vertex_point_size; +}; + struct v3d_vs_key { struct v3d_key base; - struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS]; - uint8_t num_fs_inputs; + struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS]; + uint8_t num_used_outputs; bool is_coord; bool per_vertex_point_size; @@ -552,6 +571,7 @@ int local_invocation_index_bits; uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; + uint8_t gs_input_sizes[V3D_MAX_GS_INPUTS]; uint32_t vpm_output_size; /* Size in bytes of registers that have been spilled. This is how much @@ -586,6 +606,7 @@ struct pipe_shader_state *shader_state; struct v3d_key *key; struct v3d_fs_key *fs_key; + struct v3d_gs_key *gs_key; struct v3d_vs_key *vs_key; /* Live ranges of temps. */ @@ -639,6 +660,8 @@ bool lock_scoreboard_on_first_thrsw; bool failed; + + bool tmu_dirty_rcl; }; struct v3d_uniform_list { @@ -658,6 +681,8 @@ * after-final-THRSW state. */ bool single_seg; + + bool tmu_dirty_rcl; }; struct v3d_vs_prog_data { @@ -683,6 +708,36 @@ uint8_t vcm_cache_size; }; +struct v3d_gs_prog_data { + struct v3d_prog_data base; + + /* Whether the program reads gl_PrimitiveIDIn */ + bool uses_pid; + + /* Number of components read from each input varying. */ + uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4]; + + /* Number of inputs */ + uint8_t num_inputs; + struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS]; + + /* Total number of components written, for the shader state record. */ + uint32_t vpm_output_size; + + /* Maximum SIMD dispatch width to not exceed VPM output size limits + * in the geometry shader. Notice that the final dispatch width has to + * be decided at draw time and could be lower based on the VPM pressure + * added by other shader stages. + */ + uint8_t simd_width; + + /* Output primitive type */ + uint8_t out_prim_type; + + /* Number of GS invocations */ + uint8_t num_invocations; +}; + struct v3d_fs_prog_data { struct v3d_prog_data base; @@ -994,8 +1049,13 @@ VIR_A_ALU0(EIDX) VIR_A_ALU1(LDVPMV_IN) VIR_A_ALU1(LDVPMV_OUT) +VIR_A_ALU1(LDVPMD_IN) +VIR_A_ALU1(LDVPMD_OUT) +VIR_A_ALU2(LDVPMG_IN) +VIR_A_ALU2(LDVPMG_OUT) VIR_A_ALU0(TMUWT) +VIR_A_ALU0(IID) VIR_A_ALU0(FXCD) VIR_A_ALU0(XCD) VIR_A_ALU0(FYCD) diff -Nru mesa-19.2.8/src/broadcom/compiler/v3d_nir_lower_io.c mesa-20.0.8/src/broadcom/compiler/v3d_nir_lower_io.c --- mesa-19.2.8/src/broadcom/compiler/v3d_nir_lower_io.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/v3d_nir_lower_io.c 2020-06-12 01:21:16.000000000 +0000 @@ -45,22 +45,46 @@ int psiz_vpm_offset; int varyings_vpm_offset; - BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + /* Geometry shader state */ + struct { + /* VPM offset for the current vertex data output */ + nir_variable *output_offset_var; + /* VPM offset for the current vertex header */ + nir_variable *header_offset_var; + /* VPM header for the current vertex */ + nir_variable *header_var; + + /* Size of the complete VPM output header */ + uint32_t output_header_size; + /* Size of the output data for a single vertex */ + uint32_t output_vertex_data_size; + } gs; + + BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)]; nir_ssa_def *pos[4]; }; static void -v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan) +v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + struct v3d_nir_lower_io_state *state); + +static void +v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset, + nir_ssa_def *chan) { nir_intrinsic_instr *intr = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output); + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_store_output); nir_ssa_dest_init(&intr->instr, &intr->dest, 1, intr->dest.ssa.bit_size, NULL); intr->num_components = 1; intr->src[0] = nir_src_for_ssa(chan); - intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); + if (offset) + intr->src[1] = nir_src_for_ssa(offset); + else + intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); nir_intrinsic_set_base(intr, base); nir_intrinsic_set_write_mask(intr, 0x1); @@ -91,8 +115,23 @@ { int component = var->data.location_frac + chan; - for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { - struct v3d_varying_slot slot = c->vs_key->fs_inputs[i]; + uint32_t num_used_outputs = 0; + struct v3d_varying_slot *used_outputs = NULL; + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + num_used_outputs = c->vs_key->num_used_outputs; + used_outputs = c->vs_key->used_outputs; + break; + case MESA_SHADER_GEOMETRY: + num_used_outputs = c->gs_key->num_used_outputs; + used_outputs = c->gs_key->used_outputs; + break; + default: + unreachable("Unsupported shader stage"); + } + + for (int i = 0; i < num_used_outputs; i++) { + struct v3d_varying_slot slot = used_outputs[i]; if (v3d_slot_get_slot(slot) == var->data.location && v3d_slot_get_component(slot) == component) { @@ -105,6 +144,9 @@ /* Lowers a store_output(gallium driver location) to a series of store_outputs * with a driver_location equal to the offset in the VPM. + * + * For geometry shaders we need to emit multiple vertices so the VPM offsets + * need to be computed in the shader code based on the current vertex index. */ static void v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, @@ -113,6 +155,13 @@ { b->cursor = nir_before_instr(&intr->instr); + /* If this is a geometry shader we need to emit our outputs + * to the current vertex offset in the VPM. + */ + nir_ssa_def *offset_reg = + c->s->info.stage == MESA_SHADER_GEOMETRY ? + nir_load_var(b, state->gs.output_offset_var) : NULL; + int start_comp = nir_intrinsic_component(intr); nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], intr->num_components); @@ -127,6 +176,7 @@ } var = scan_var; } + assert(var); /* Save off the components of the position for the setup of VPM inputs * read by fixed function HW. @@ -140,7 +190,47 @@ /* Just psiz to the position in the FF header right now. */ if (var->data.location == VARYING_SLOT_PSIZ && state->psiz_vpm_offset != -1) { - v3d_nir_store_output(b, state->psiz_vpm_offset, src); + v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src); + } + + if (var->data.location == VARYING_SLOT_LAYER) { + assert(c->s->info.stage == MESA_SHADER_GEOMETRY); + nir_ssa_def *header = nir_load_var(b, state->gs.header_var); + header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff)); + + /* From the GLES 3.2 spec: + * + * "When fragments are written to a layered framebuffer, the + * fragment’s layer number selects an image from the array + * of images at each attachment (...). If the fragment’s + * layer number is negative, or greater than or equal to + * the minimum number of layers of any attachment, the + * effects of the fragment on the framebuffer contents are + * undefined." + * + * This suggests we can just ignore that situation, however, + * for V3D an out-of-bounds layer index means that the binner + * might do out-of-bounds writes access to the tile state. The + * simulator has an assert to catch this, so we play safe here + * and we make sure that doesn't happen by setting gl_Layer + * to 0 in that case (we always allocate tile state for at + * least one layer). + */ + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_fb_layers_v3d); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); + nir_builder_instr_insert(b, &load->instr); + nir_ssa_def *fb_layers = &load->dest.ssa; + + nir_ssa_def *cond = nir_ige(b, src, fb_layers); + nir_ssa_def *layer_id = + nir_bcsel(b, cond, + nir_imm_int(b, 0), + nir_ishl(b, src, nir_imm_int(b, 16))); + header = nir_ior(b, header, layer_id); + nir_store_var(b, state->gs.header_var, header, 0x1); } /* Scalarize outputs if it hasn't happened already, since we want to @@ -160,12 +250,73 @@ BITSET_SET(state->varyings_stored, vpm_offset); v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset, - nir_channel(b, src, i)); + offset_reg, nir_channel(b, src, i)); } nir_instr_remove(&intr->instr); } +static inline void +reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state) +{ + const uint8_t NEW_PRIMITIVE_OFFSET = 0; + const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8; + + uint32_t vertex_data_size = state->gs.output_vertex_data_size; + assert((vertex_data_size & 0xffffff00) == 0); + + uint32_t header; + header = 1 << NEW_PRIMITIVE_OFFSET; + header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET; + nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1); +} + +static void +v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *instr, + struct v3d_nir_lower_io_state *state) +{ + b->cursor = nir_before_instr(&instr->instr); + + nir_ssa_def *header = nir_load_var(b, state->gs.header_var); + nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var); + nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var); + + /* Emit fixed function outputs */ + v3d_nir_emit_ff_vpm_outputs(c, b, state); + + /* Emit vertex header */ + v3d_nir_store_output(b, 0, header_offset, header); + + /* Update VPM offset for next vertex output data and header */ + output_offset = + nir_iadd(b, output_offset, + nir_imm_int(b, state->gs.output_vertex_data_size)); + + header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1)); + + /* Reset the New Primitive bit */ + header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe)); + + nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1); + nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1); + nir_store_var(b, state->gs.header_var, header, 0x1); + + nir_instr_remove(&instr->instr); +} + +static void +v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *instr, + struct v3d_nir_lower_io_state *state) +{ + assert(state->gs.header_var); + b->cursor = nir_before_instr(&instr->instr); + reset_gs_header(b, state); + + nir_instr_remove(&instr->instr); +} + static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -181,8 +332,18 @@ break; case nir_intrinsic_store_output: - if (c->s->info.stage == MESA_SHADER_VERTEX) + if (c->s->info.stage == MESA_SHADER_VERTEX || + c->s->info.stage == MESA_SHADER_GEOMETRY) { v3d_nir_lower_vpm_output(c, b, intr, state); + } + break; + + case nir_intrinsic_emit_vertex: + v3d_nir_lower_emit_vertex(c, b, intr, state); + break; + + case nir_intrinsic_end_primitive: + v3d_nir_lower_end_primitive(c, b, intr, state); break; default: @@ -225,12 +386,64 @@ } static void -v3d_nir_setup_vpm_layout(struct v3d_compile *c, - struct v3d_nir_lower_io_state *state) +v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) { uint32_t vpm_offset = 0; - if (c->vs_key->is_coord) { + state->pos_vpm_offset = -1; + state->vp_vpm_offset = -1; + state->zs_vpm_offset = -1; + state->rcp_wc_vpm_offset = -1; + state->psiz_vpm_offset = -1; + + bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage; + if (needs_ff_outputs) { + if (c->vs_key->is_coord) { + state->pos_vpm_offset = vpm_offset; + vpm_offset += 4; + } + + state->vp_vpm_offset = vpm_offset; + vpm_offset += 2; + + if (!c->vs_key->is_coord) { + state->zs_vpm_offset = vpm_offset++; + state->rcp_wc_vpm_offset = vpm_offset++; + } + + if (c->vs_key->per_vertex_point_size) + state->psiz_vpm_offset = vpm_offset++; + } + + state->varyings_vpm_offset = vpm_offset; + + c->vpm_output_size = MAX2(1, vpm_offset + c->vs_key->num_used_outputs); +} + +static void +v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) +{ + /* 1 header slot for number of output vertices */ + uint32_t vpm_offset = 1; + + /* 1 header slot per output vertex */ + const uint32_t num_vertices = c->s->info.gs.vertices_out; + vpm_offset += num_vertices; + + state->gs.output_header_size = vpm_offset; + + /* Vertex data: here we only compute offsets into a generic vertex data + * elements. When it is time to actually write a particular vertex to + * the VPM, we will add the offset for that vertex into the VPM output + * to these offsets. + * + * If geometry shaders are present, they are always the last shader + * stage before rasterization, so we always emit fixed function outputs. + */ + vpm_offset = 0; + if (c->gs_key->is_coord) { state->pos_vpm_offset = vpm_offset; vpm_offset += 4; } else { @@ -240,7 +453,7 @@ state->vp_vpm_offset = vpm_offset; vpm_offset += 2; - if (!c->vs_key->is_coord) { + if (!c->gs_key->is_coord) { state->zs_vpm_offset = vpm_offset++; state->rcp_wc_vpm_offset = vpm_offset++; } else { @@ -248,20 +461,34 @@ state->rcp_wc_vpm_offset = -1; } - if (c->vs_key->per_vertex_point_size) + /* Mesa enables OES_geometry_shader_point_size automatically with + * OES_geometry_shader so we always need to handle point size + * writes if present. + */ + if (c->gs_key->per_vertex_point_size) state->psiz_vpm_offset = vpm_offset++; - else - state->psiz_vpm_offset = -1; state->varyings_vpm_offset = vpm_offset; - c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs; + state->gs.output_vertex_data_size = + state->varyings_vpm_offset + c->gs_key->num_used_outputs; + + c->vpm_output_size = + state->gs.output_header_size + + state->gs.output_vertex_data_size * num_vertices; } static void v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, struct v3d_nir_lower_io_state *state) { + /* If this is a geometry shader we need to emit our fixed function + * outputs to the current vertex offset in the VPM. + */ + nir_ssa_def *offset_reg = + c->s->info.stage == MESA_SHADER_GEOMETRY ? + nir_load_var(b, state->gs.output_offset_var) : NULL; + for (int i = 0; i < 4; i++) { if (!state->pos[i]) state->pos[i] = nir_ssa_undef(b, 1, 32); @@ -272,23 +499,25 @@ if (state->pos_vpm_offset != -1) { for (int i = 0; i < 4; i++) { v3d_nir_store_output(b, state->pos_vpm_offset + i, - state->pos[i]); + offset_reg, state->pos[i]); } } - for (int i = 0; i < 2; i++) { - nir_ssa_def *pos; - nir_ssa_def *scale; - pos = state->pos[i]; - if (i == 0) - scale = nir_load_viewport_x_scale(b); - else - scale = nir_load_viewport_y_scale(b); - pos = nir_fmul(b, pos, scale); - pos = nir_fmul(b, pos, rcp_wc); - pos = nir_f2i32(b, nir_fround_even(b, pos)); - v3d_nir_store_output(b, state->vp_vpm_offset + i, - pos); + if (state->vp_vpm_offset != -1) { + for (int i = 0; i < 2; i++) { + nir_ssa_def *pos; + nir_ssa_def *scale; + pos = state->pos[i]; + if (i == 0) + scale = nir_load_viewport_x_scale(b); + else + scale = nir_load_viewport_y_scale(b); + pos = nir_fmul(b, pos, scale); + pos = nir_fmul(b, pos, rcp_wc); + pos = nir_f2i32(b, nir_fround_even(b, pos)); + v3d_nir_store_output(b, state->vp_vpm_offset + i, + offset_reg, pos); + } } if (state->zs_vpm_offset != -1) { @@ -296,38 +525,118 @@ z = nir_fmul(b, z, nir_load_viewport_z_scale(b)); z = nir_fmul(b, z, rcp_wc); z = nir_fadd(b, z, nir_load_viewport_z_offset(b)); - v3d_nir_store_output(b, state->zs_vpm_offset, z); + v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z); } - if (state->rcp_wc_vpm_offset != -1) - v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc); + if (state->rcp_wc_vpm_offset != -1) { + v3d_nir_store_output(b, state->rcp_wc_vpm_offset, + offset_reg, rcp_wc); + } - /* Store 0 to varyings requested by the FS but not stored in the VS. - * This should be undefined behavior, but glsl-routing seems to rely - * on it. + /* Store 0 to varyings requested by the FS but not stored by the + * previous stage. This should be undefined behavior, but + * glsl-routing seems to rely on it. */ - for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + uint32_t num_used_outputs; + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: + num_used_outputs = c->vs_key->num_used_outputs; + break; + case MESA_SHADER_GEOMETRY: + num_used_outputs = c->gs_key->num_used_outputs; + break; + default: + unreachable("Unsupported shader stage"); + } + + for (int i = 0; i < num_used_outputs; i++) { if (!BITSET_TEST(state->varyings_stored, i)) { v3d_nir_store_output(b, state->varyings_vpm_offset + i, - nir_imm_int(b, 0)); + offset_reg, nir_imm_int(b, 0)); } } } +static void +emit_gs_prolog(struct v3d_compile *c, nir_builder *b, + nir_function_impl *impl, + struct v3d_nir_lower_io_state *state) +{ + nir_block *first = nir_start_block(impl); + b->cursor = nir_before_block(first); + + const struct glsl_type *uint_type = glsl_uint_type(); + + assert(!state->gs.output_offset_var); + state->gs.output_offset_var = + nir_local_variable_create(impl, uint_type, "output_offset"); + nir_store_var(b, state->gs.output_offset_var, + nir_imm_int(b, state->gs.output_header_size), 0x1); + + assert(!state->gs.header_offset_var); + state->gs.header_offset_var = + nir_local_variable_create(impl, uint_type, "header_offset"); + nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1); + + assert(!state->gs.header_var); + state->gs.header_var = + nir_local_variable_create(impl, uint_type, "header"); + reset_gs_header(b, state); +} + +static void +emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b, + struct v3d_nir_lower_io_state *state) +{ + const uint8_t VERTEX_COUNT_OFFSET = 16; + + /* Our GS header has 1 generic header slot (at VPM offset 0) and then + * one slot per output vertex after it. This means we don't need to + * have a variable just to keep track of the number of vertices we + * emitted and instead we can just compute it here from the header + * offset variable by removing the one generic header slot that always + * goes at the begining of out header. + */ + nir_ssa_def *header_offset = + nir_load_var(b, state->gs.header_offset_var); + nir_ssa_def *vertex_count = + nir_isub(b, header_offset, nir_imm_int(b, 1)); + nir_ssa_def *header = + nir_ior(b, nir_imm_int(b, state->gs.output_header_size), + nir_ishl(b, vertex_count, + nir_imm_int(b, VERTEX_COUNT_OFFSET))); + + v3d_nir_store_output(b, 0, NULL, header); +} + void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) { struct v3d_nir_lower_io_state state = { 0 }; /* Set up the layout of the VPM outputs. */ - if (s->info.stage == MESA_SHADER_VERTEX) - v3d_nir_setup_vpm_layout(c, &state); + switch (s->info.stage) { + case MESA_SHADER_VERTEX: + v3d_nir_setup_vpm_layout_vs(c, &state); + break; + case MESA_SHADER_GEOMETRY: + v3d_nir_setup_vpm_layout_gs(c, &state); + break; + case MESA_SHADER_FRAGMENT: + case MESA_SHADER_COMPUTE: + break; + default: + unreachable("Unsupported shader stage"); + } nir_foreach_function(function, s) { if (function->impl) { nir_builder b; nir_builder_init(&b, function->impl); + if (c->s->info.stage == MESA_SHADER_GEOMETRY) + emit_gs_prolog(c, &b, function->impl, &state); + nir_foreach_block(block, function->impl) { nir_foreach_instr_safe(instr, block) v3d_nir_lower_io_instr(c, &b, instr, @@ -336,8 +645,11 @@ nir_block *last = nir_impl_last_block(function->impl); b.cursor = nir_after_block(last); - if (s->info.stage == MESA_SHADER_VERTEX) + if (s->info.stage == MESA_SHADER_VERTEX) { v3d_nir_emit_ff_vpm_outputs(c, &b, &state); + } else if (s->info.stage == MESA_SHADER_GEOMETRY) { + emit_gs_vpm_output_header_prolog(c, &b, &state); + } nir_metadata_preserve(function->impl, nir_metadata_block_index | @@ -345,6 +657,8 @@ } } - if (s->info.stage == MESA_SHADER_VERTEX) + if (s->info.stage == MESA_SHADER_VERTEX || + s->info.stage == MESA_SHADER_GEOMETRY) { v3d_nir_lower_io_update_output_var_base(c, &state); + } } diff -Nru mesa-19.2.8/src/broadcom/compiler/v3d_nir_lower_logic_ops.c mesa-20.0.8/src/broadcom/compiler/v3d_nir_lower_logic_ops.c --- mesa-19.2.8/src/broadcom/compiler/v3d_nir_lower_logic_ops.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/v3d_nir_lower_logic_ops.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ * appropriately. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" #include "v3d_compiler.h" diff -Nru mesa-19.2.8/src/broadcom/compiler/vir.c mesa-20.0.8/src/broadcom/compiler/vir.c --- mesa-19.2.8/src/broadcom/compiler/vir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/vir.c 2020-06-12 01:21:16.000000000 +0000 @@ -23,6 +23,7 @@ #include "broadcom/common/v3d_device_info.h" #include "v3d_compiler.h" +#include "util/u_prim.h" int vir_get_nsrc(struct qinst *inst) @@ -661,6 +662,47 @@ } static void +v3d_gs_set_prog_data(struct v3d_compile *c, + struct v3d_gs_prog_data *prog_data) +{ + prog_data->num_inputs = c->num_inputs; + memcpy(prog_data->input_slots, c->input_slots, + c->num_inputs * sizeof(*c->input_slots)); + + /* gl_PrimitiveIdIn is written by the GBG into the first word of the + * VPM output header automatically and the shader will overwrite + * it after reading it if necessary, so it doesn't add to the VPM + * size requirements. + */ + prog_data->uses_pid = (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_PRIMITIVE_ID)); + + /* Output segment size is in sectors (8 rows of 32 bits per channel) */ + prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; + + /* Compute SIMD dispatch width and update VPM output size accordingly + * to ensure we can fit our program in memory. Available widths are + * 16, 8, 4, 1. + * + * Notice that at draw time we will have to consider VPM memory + * requirements from other stages and choose a smaller dispatch + * width if needed to fit the program in VPM memory. + */ + prog_data->simd_width = 16; + while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) || + prog_data->simd_width == 2) { + prog_data->simd_width >>= 1; + prog_data->vpm_output_size = + align(prog_data->vpm_output_size, 2) / 2; + } + assert(prog_data->vpm_output_size <= 16); + assert(prog_data->simd_width != 2); + + prog_data->out_prim_type = c->s->info.gs.output_primitive; + prog_data->num_invocations = c->s->info.gs.invocations; +} + +static void v3d_set_fs_prog_data_inputs(struct v3d_compile *c, struct v3d_fs_prog_data *prog_data) { @@ -710,16 +752,25 @@ prog_data->threads = c->threads; prog_data->single_seg = !c->last_thrsw; prog_data->spill_size = c->spill_size; + prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl; v3d_set_prog_data_uniforms(c, prog_data); - if (c->s->info.stage == MESA_SHADER_COMPUTE) { - v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data); - } else if (c->s->info.stage == MESA_SHADER_VERTEX) { + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data); - } else { - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); + break; + case MESA_SHADER_GEOMETRY: + v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data); + break; + case MESA_SHADER_FRAGMENT: v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data); + break; + case MESA_SHADER_COMPUTE: + v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data); + break; + default: + unreachable("unsupported shader stage"); } } @@ -748,9 +799,9 @@ NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, nir_var_shader_in | nir_var_shader_out); uint64_t used_outputs[4] = {0}; - for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { - int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]); - int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]); + for (int i = 0; i < c->vs_key->num_used_outputs; i++) { + int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]); + int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]); used_outputs[comp] |= 1ull << slot; } NIR_PASS_V(c->s, nir_remove_unused_io_vars, @@ -771,6 +822,37 @@ } static void +v3d_nir_lower_gs_early(struct v3d_compile *c) +{ + /* Split our I/O vars and dead code eliminate the unused + * components. + */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar_early, + nir_var_shader_in | nir_var_shader_out); + uint64_t used_outputs[4] = {0}; + for (int i = 0; i < c->gs_key->num_used_outputs; i++) { + int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]); + int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]); + used_outputs[comp] |= 1ull << slot; + } + NIR_PASS_V(c->s, nir_remove_unused_io_vars, + &c->s->outputs, used_outputs, NULL); /* demotes to globals */ + NIR_PASS_V(c->s, nir_lower_global_vars_to_local); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in); + + /* This must go before nir_lower_io */ + if (c->gs_key->per_vertex_point_size) + NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + + NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + type_size_vec4, + (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains */ + NIR_PASS_V(c->s, nir_opt_dce); +} + +static void v3d_fixup_fs_output_types(struct v3d_compile *c) { nir_foreach_variable(var, &c->s->outputs) { @@ -812,13 +894,24 @@ * enabling early_fragment_tests even if the user didn't. */ if (!(c->s->info.num_images || - c->s->info.num_ssbos || - c->s->info.num_abos)) { + c->s->info.num_ssbos)) { c->s->info.fs.early_fragment_tests = true; } } static void +v3d_nir_lower_gs_late(struct v3d_compile *c) +{ + if (c->key->ucp_enables) { + NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables, + false, NULL); + } + + /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); +} + +static void v3d_nir_lower_vs_late(struct v3d_compile *c) { if (c->vs_key->clamp_color) @@ -826,7 +919,7 @@ if (c->key->ucp_enables) { NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables, - false); + false, false, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); } @@ -847,11 +940,12 @@ if (c->fs_key->alpha_test) { NIR_PASS_V(c->s, nir_lower_alpha_test, c->fs_key->alpha_test_func, - false); + false, NULL); } if (c->key->ucp_enables) - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables); + NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, + false); /* Note: FS input scalarizing must happen after * nir_lower_two_sided_color, which only handles a vec4 at a time. @@ -906,6 +1000,10 @@ c->vs_key = (struct v3d_vs_key *)key; prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data)); break; + case MESA_SHADER_GEOMETRY: + c->gs_key = (struct v3d_gs_key *)key; + prog_data = rzalloc_size(NULL, sizeof(struct v3d_gs_prog_data)); + break; case MESA_SHADER_FRAGMENT: c->fs_key = (struct v3d_fs_key *)key; prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data)); @@ -918,31 +1016,67 @@ unreachable("unsupported shader stage"); } - if (c->s->info.stage == MESA_SHADER_VERTEX) { + + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: v3d_nir_lower_vs_early(c); - } else if (c->s->info.stage != MESA_SHADER_COMPUTE) { - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); + break; + case MESA_SHADER_GEOMETRY: + v3d_nir_lower_gs_early(c); + break; + case MESA_SHADER_FRAGMENT: v3d_nir_lower_fs_early(c); + break; + default: + break; } v3d_lower_nir(c); - if (c->s->info.stage == MESA_SHADER_VERTEX) { + switch (c->s->info.stage) { + case MESA_SHADER_VERTEX: v3d_nir_lower_vs_late(c); - } else if (c->s->info.stage != MESA_SHADER_COMPUTE) { - assert(c->s->info.stage == MESA_SHADER_FRAGMENT); + break; + case MESA_SHADER_GEOMETRY: + v3d_nir_lower_gs_late(c); + break; + case MESA_SHADER_FRAGMENT: v3d_nir_lower_fs_late(c); + break; + default: + break; } NIR_PASS_V(c->s, v3d_nir_lower_io, c); NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); - NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast); v3d_optimize_nir(c->s); + + /* Do late algebraic optimization to turn add(a, neg(b)) back into + * subs, then the mandatory cleanup after algebraic. Note that it may + * produce fnegs, and if so then we need to keep running to squash + * fneg(fneg(a)). + */ + bool more_late_algebraic = true; + while (more_late_algebraic) { + more_late_algebraic = false; + NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); + NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS_V(c->s, nir_copy_prop); + NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS_V(c->s, nir_opt_cse); + } + NIR_PASS_V(c->s, nir_lower_bool_to_int32); NIR_PASS_V(c->s, nir_convert_from_ssa, true); + /* Schedule for about half our register space, to enable more shaders + * to hit 4 threads. + */ + NIR_PASS_V(c->s, nir_schedule, 24); + v3d_nir_to_vir(c); v3d_set_prog_data(c, prog_data); @@ -1017,7 +1151,7 @@ c->cursor.link = NULL; vir_for_each_block(block, c) { - while (!list_empty(&block->instructions)) { + while (!list_is_empty(&block->instructions)) { struct qinst *qinst = list_first_entry(&block->instructions, struct qinst, link); @@ -1112,7 +1246,9 @@ vir_get_stage_name(struct v3d_compile *c) { if (c->vs_key && c->vs_key->is_coord) - return "MESA_SHADER_COORD"; + return "MESA_SHADER_VERTEX_BIN"; + else if (c->gs_key && c->gs_key->is_coord) + return "MESA_SHADER_GEOMETRY_BIN"; else return gl_shader_stage_name(c->s->info.stage); } diff -Nru mesa-19.2.8/src/broadcom/compiler/vir_live_variables.c mesa-20.0.8/src/broadcom/compiler/vir_live_variables.c --- mesa-19.2.8/src/broadcom/compiler/vir_live_variables.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/vir_live_variables.c 2020-06-12 01:21:16.000000000 +0000 @@ -33,18 +33,6 @@ uint8_t channels; }; -static uint32_t -int_hash(const void *key) -{ - return _mesa_hash_data(key, sizeof(int)); -} - -static bool -int_compare(const void *key1, const void *key2) -{ - return *(const int *)key1 == *(const int *)key2; -} - static int vir_reg_to_var(struct qreg reg) { @@ -197,7 +185,7 @@ vir_setup_def_use(struct v3d_compile *c) { struct hash_table *partial_update_ht = - _mesa_hash_table_create(c, int_hash, int_compare); + _mesa_hash_table_create(c, _mesa_hash_int, _mesa_key_int_equal); int ip = 0; vir_for_each_block(block, c) { diff -Nru mesa-19.2.8/src/broadcom/compiler/vir_register_allocate.c mesa-20.0.8/src/broadcom/compiler/vir_register_allocate.c --- mesa-19.2.8/src/broadcom/compiler/vir_register_allocate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/compiler/vir_register_allocate.c 2020-06-12 01:21:16.000000000 +0000 @@ -270,6 +270,7 @@ vir_emit_thrsw(c); vir_TMUWT(c); c->spills++; + c->tmu_dirty_rcl = true; } } diff -Nru mesa-19.2.8/src/broadcom/qpu/qpu_pack.c mesa-20.0.8/src/broadcom/qpu/qpu_pack.c --- mesa-19.2.8/src/broadcom/qpu/qpu_pack.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/broadcom/qpu/qpu_pack.c 2020-06-12 01:21:16.000000000 +0000 @@ -521,7 +521,9 @@ { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, + { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, + { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, @@ -529,6 +531,7 @@ { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, + { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, /* FIXME: MORE COMPLICATED */ /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ diff -Nru mesa-19.2.8/src/compiler/blob.c mesa-20.0.8/src/compiler/blob.c --- mesa-19.2.8/src/compiler/blob.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/blob.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,391 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include - -#include "main/macros.h" -#include "blob.h" - -#ifdef HAVE_VALGRIND -#include -#include -#define VG(x) x -#else -#define VG(x) -#endif - -#define BLOB_INITIAL_SIZE 4096 - -/* Ensure that \blob will be able to fit an additional object of size - * \additional. The growing (if any) will occur by doubling the existing - * allocation. - */ -static bool -grow_to_fit(struct blob *blob, size_t additional) -{ - size_t to_allocate; - uint8_t *new_data; - - if (blob->out_of_memory) - return false; - - if (blob->size + additional <= blob->allocated) - return true; - - if (blob->fixed_allocation) { - blob->out_of_memory = true; - return false; - } - - if (blob->allocated == 0) - to_allocate = BLOB_INITIAL_SIZE; - else - to_allocate = blob->allocated * 2; - - to_allocate = MAX2(to_allocate, blob->allocated + additional); - - new_data = realloc(blob->data, to_allocate); - if (new_data == NULL) { - blob->out_of_memory = true; - return false; - } - - blob->data = new_data; - blob->allocated = to_allocate; - - return true; -} - -/* Align the blob->size so that reading or writing a value at (blob->data + - * blob->size) will result in an access aligned to a granularity of \alignment - * bytes. - * - * \return True unless allocation fails - */ -static bool -align_blob(struct blob *blob, size_t alignment) -{ - const size_t new_size = ALIGN(blob->size, alignment); - - if (blob->size < new_size) { - if (!grow_to_fit(blob, new_size - blob->size)) - return false; - - if (blob->data) - memset(blob->data + blob->size, 0, new_size - blob->size); - blob->size = new_size; - } - - return true; -} - -static void -align_blob_reader(struct blob_reader *blob, size_t alignment) -{ - blob->current = blob->data + ALIGN(blob->current - blob->data, alignment); -} - -void -blob_init(struct blob *blob) -{ - blob->data = NULL; - blob->allocated = 0; - blob->size = 0; - blob->fixed_allocation = false; - blob->out_of_memory = false; -} - -void -blob_init_fixed(struct blob *blob, void *data, size_t size) -{ - blob->data = data; - blob->allocated = size; - blob->size = 0; - blob->fixed_allocation = true; - blob->out_of_memory = false; -} - -bool -blob_overwrite_bytes(struct blob *blob, - size_t offset, - const void *bytes, - size_t to_write) -{ - /* Detect an attempt to overwrite data out of bounds. */ - if (offset + to_write < offset || blob->size < offset + to_write) - return false; - - VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write)); - - if (blob->data) - memcpy(blob->data + offset, bytes, to_write); - - return true; -} - -bool -blob_write_bytes(struct blob *blob, const void *bytes, size_t to_write) -{ - if (! grow_to_fit(blob, to_write)) - return false; - - VG(VALGRIND_CHECK_MEM_IS_DEFINED(bytes, to_write)); - - if (blob->data) - memcpy(blob->data + blob->size, bytes, to_write); - blob->size += to_write; - - return true; -} - -intptr_t -blob_reserve_bytes(struct blob *blob, size_t to_write) -{ - intptr_t ret; - - if (! grow_to_fit (blob, to_write)) - return -1; - - ret = blob->size; - blob->size += to_write; - - return ret; -} - -intptr_t -blob_reserve_uint32(struct blob *blob) -{ - align_blob(blob, sizeof(uint32_t)); - return blob_reserve_bytes(blob, sizeof(uint32_t)); -} - -intptr_t -blob_reserve_intptr(struct blob *blob) -{ - align_blob(blob, sizeof(intptr_t)); - return blob_reserve_bytes(blob, sizeof(intptr_t)); -} - -bool -blob_write_uint32(struct blob *blob, uint32_t value) -{ - align_blob(blob, sizeof(value)); - - return blob_write_bytes(blob, &value, sizeof(value)); -} - -#define ASSERT_ALIGNED(_offset, _align) \ - assert(ALIGN((_offset), (_align)) == (_offset)) - -bool -blob_overwrite_uint32 (struct blob *blob, - size_t offset, - uint32_t value) -{ - ASSERT_ALIGNED(offset, sizeof(value)); - return blob_overwrite_bytes(blob, offset, &value, sizeof(value)); -} - -bool -blob_write_uint64(struct blob *blob, uint64_t value) -{ - align_blob(blob, sizeof(value)); - - return blob_write_bytes(blob, &value, sizeof(value)); -} - -bool -blob_write_intptr(struct blob *blob, intptr_t value) -{ - align_blob(blob, sizeof(value)); - - return blob_write_bytes(blob, &value, sizeof(value)); -} - -bool -blob_overwrite_intptr (struct blob *blob, - size_t offset, - intptr_t value) -{ - ASSERT_ALIGNED(offset, sizeof(value)); - return blob_overwrite_bytes(blob, offset, &value, sizeof(value)); -} - -bool -blob_write_string(struct blob *blob, const char *str) -{ - return blob_write_bytes(blob, str, strlen(str) + 1); -} - -void -blob_reader_init(struct blob_reader *blob, const void *data, size_t size) -{ - blob->data = data; - blob->end = blob->data + size; - blob->current = data; - blob->overrun = false; -} - -/* Check that an object of size \size can be read from this blob. - * - * If not, set blob->overrun to indicate that we attempted to read too far. - */ -static bool -ensure_can_read(struct blob_reader *blob, size_t size) -{ - if (blob->overrun) - return false; - - if (blob->current <= blob->end && blob->end - blob->current >= size) - return true; - - blob->overrun = true; - - return false; -} - -const void * -blob_read_bytes(struct blob_reader *blob, size_t size) -{ - const void *ret; - - if (! ensure_can_read (blob, size)) - return NULL; - - ret = blob->current; - - blob->current += size; - - return ret; -} - -void -blob_copy_bytes(struct blob_reader *blob, void *dest, size_t size) -{ - const void *bytes; - - bytes = blob_read_bytes(blob, size); - if (bytes == NULL) - return; - - memcpy(dest, bytes, size); -} - -void -blob_skip_bytes(struct blob_reader *blob, size_t size) -{ - if (ensure_can_read (blob, size)) - blob->current += size; -} - -/* These next three read functions have identical form. If we add any beyond - * these first three we should probably switch to generating these with a - * preprocessor macro. -*/ -uint32_t -blob_read_uint32(struct blob_reader *blob) -{ - uint32_t ret; - int size = sizeof(ret); - - align_blob_reader(blob, size); - - if (! ensure_can_read(blob, size)) - return 0; - - ret = *((uint32_t*) blob->current); - - blob->current += size; - - return ret; -} - -uint64_t -blob_read_uint64(struct blob_reader *blob) -{ - uint64_t ret; - int size = sizeof(ret); - - align_blob_reader(blob, size); - - if (! ensure_can_read(blob, size)) - return 0; - - ret = *((uint64_t*) blob->current); - - blob->current += size; - - return ret; -} - -intptr_t -blob_read_intptr(struct blob_reader *blob) -{ - intptr_t ret; - int size = sizeof(ret); - - align_blob_reader(blob, size); - - if (! ensure_can_read(blob, size)) - return 0; - - ret = *((intptr_t *) blob->current); - - blob->current += size; - - return ret; -} - -char * -blob_read_string(struct blob_reader *blob) -{ - int size; - char *ret; - uint8_t *nul; - - /* If we're already at the end, then this is an overrun. */ - if (blob->current >= blob->end) { - blob->overrun = true; - return NULL; - } - - /* Similarly, if there is no zero byte in the data remaining in this blob, - * we also consider that an overrun. - */ - nul = memchr(blob->current, 0, blob->end - blob->current); - - if (nul == NULL) { - blob->overrun = true; - return NULL; - } - - size = nul - blob->current + 1; - - assert(ensure_can_read(blob, size)); - - ret = (char *) blob->current; - - blob->current += size; - - return ret; -} diff -Nru mesa-19.2.8/src/compiler/blob.h mesa-20.0.8/src/compiler/blob.h --- mesa-19.2.8/src/compiler/blob.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/blob.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,358 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef BLOB_H -#define BLOB_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* The blob functions implement a simple, low-level API for serializing and - * deserializing. - * - * All objects written to a blob will be serialized directly, (without any - * additional meta-data to describe the data written). Therefore, it is the - * caller's responsibility to ensure that any data can be read later, (either - * by knowing exactly what data is expected, or by writing to the blob - * sufficient meta-data to describe what has been written). - * - * A blob is efficient in that it dynamically grows by doubling in size, so - * allocation costs are logarithmic. - */ - -struct blob { - /* The data actually written to the blob. */ - uint8_t *data; - - /** Number of bytes that have been allocated for \c data. */ - size_t allocated; - - /** The number of bytes that have actual data written to them. */ - size_t size; - - /** True if \c data a fixed allocation that we cannot resize - * - * \see blob_init_fixed - */ - bool fixed_allocation; - - /** - * True if we've ever failed to realloc or if we go pas the end of a fixed - * allocation blob. - */ - bool out_of_memory; -}; - -/* When done reading, the caller can ensure that everything was consumed by - * checking the following: - * - * 1. blob->current should be equal to blob->end, (if not, too little was - * read). - * - * 2. blob->overrun should be false, (otherwise, too much was read). - */ -struct blob_reader { - const uint8_t *data; - const uint8_t *end; - const uint8_t *current; - bool overrun; -}; - -/** - * Init a new, empty blob. - */ -void -blob_init(struct blob *blob); - -/** - * Init a new, fixed-size blob. - * - * A fixed-size blob has a fixed block of data that will not be freed on - * blob_finish and will never be grown. If we hit the end, we simply start - * returning false from the write functions. - * - * If a fixed-size blob has a NULL data pointer then the data is written but - * it otherwise operates normally. This can be used to determine the size - * that will be required to write a given data structure. - */ -void -blob_init_fixed(struct blob *blob, void *data, size_t size); - -/** - * Finish a blob and free its memory. - * - * If \blob was initialized with blob_init_fixed, the data pointer is - * considered to be owned by the user and will not be freed. - */ -static inline void -blob_finish(struct blob *blob) -{ - if (!blob->fixed_allocation) - free(blob->data); -} - -/** - * Add some unstructured, fixed-size data to a blob. - * - * \return True unless allocation failed. - */ -bool -blob_write_bytes(struct blob *blob, const void *bytes, size_t to_write); - -/** - * Reserve space in \blob for a number of bytes. - * - * Space will be allocated within the blob for these byes, but the bytes will - * be left uninitialized. The caller is expected to use \sa - * blob_overwrite_bytes to write to these bytes. - * - * \return An offset to space allocated within \blob to which \to_write bytes - * can be written, (or -1 in case of any allocation error). - */ -intptr_t -blob_reserve_bytes(struct blob *blob, size_t to_write); - -/** - * Similar to \sa blob_reserve_bytes, but only reserves an uint32_t worth of - * space. Note that this must be used if later reading with \sa - * blob_read_uint32, since it aligns the offset correctly. - */ -intptr_t -blob_reserve_uint32(struct blob *blob); - -/** - * Similar to \sa blob_reserve_bytes, but only reserves an intptr_t worth of - * space. Note that this must be used if later reading with \sa - * blob_read_intptr, since it aligns the offset correctly. - */ -intptr_t -blob_reserve_intptr(struct blob *blob); - -/** - * Overwrite some data previously written to the blob. - * - * Writes data to an existing portion of the blob at an offset of \offset. - * This data range must have previously been written to the blob by one of the - * blob_write_* calls. - * - * For example usage, see blob_overwrite_uint32 - * - * \return True unless the requested offset or offset+to_write lie outside - * the current blob's size. - */ -bool -blob_overwrite_bytes(struct blob *blob, - size_t offset, - const void *bytes, - size_t to_write); - -/** - * Add a uint32_t to a blob. - * - * \note This function will only write to a uint32_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be added to the - * blob if this write follows some unaligned write (such as - * blob_write_string). - * - * \return True unless allocation failed. - */ -bool -blob_write_uint32(struct blob *blob, uint32_t value); - -/** - * Overwrite a uint32_t previously written to the blob. - * - * Writes a uint32_t value to an existing portion of the blob at an offset of - * \offset. This data range must have previously been written to the blob by - * one of the blob_write_* calls. - * - * - * The expected usage is something like the following pattern: - * - * size_t offset; - * - * offset = blob_reserve_uint32(blob); - * ... various blob write calls, writing N items ... - * blob_overwrite_uint32 (blob, offset, N); - * - * \return True unless the requested position or position+to_write lie outside - * the current blob's size. - */ -bool -blob_overwrite_uint32(struct blob *blob, - size_t offset, - uint32_t value); - -/** - * Add a uint64_t to a blob. - * - * \note This function will only write to a uint64_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be added to the - * blob if this write follows some unaligned write (such as - * blob_write_string). - * - * \return True unless allocation failed. - */ -bool -blob_write_uint64(struct blob *blob, uint64_t value); - -/** - * Add an intptr_t to a blob. - * - * \note This function will only write to an intptr_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be added to the - * blob if this write follows some unaligned write (such as - * blob_write_string). - * - * \return True unless allocation failed. - */ -bool -blob_write_intptr(struct blob *blob, intptr_t value); - -/** - * Overwrite an intptr_t previously written to the blob. - * - * Writes a intptr_t value to an existing portion of the blob at an offset of - * \offset. This data range must have previously been written to the blob by - * one of the blob_write_* calls. - * - * For example usage, see blob_overwrite_uint32 - * - * \return True unless the requested position or position+to_write lie outside - * the current blob's size. - */ -bool -blob_overwrite_intptr(struct blob *blob, - size_t offset, - intptr_t value); - -/** - * Add a NULL-terminated string to a blob, (including the NULL terminator). - * - * \return True unless allocation failed. - */ -bool -blob_write_string(struct blob *blob, const char *str); - -/** - * Start reading a blob, (initializing the contents of \blob for reading). - * - * After this call, the caller can use the various blob_read_* functions to - * read elements from the data array. - * - * For all of the blob_read_* functions, if there is insufficient data - * remaining, the functions will do nothing, (perhaps returning default values - * such as 0). The caller can detect this by noting that the blob_reader's - * current value is unchanged before and after the call. - */ -void -blob_reader_init(struct blob_reader *blob, const void *data, size_t size); - -/** - * Read some unstructured, fixed-size data from the current location, (and - * update the current location to just past this data). - * - * \note The memory returned belongs to the data underlying the blob reader. The - * caller must copy the data in order to use it after the lifetime of the data - * underlying the blob reader. - * - * \return The bytes read (see note above about memory lifetime). - */ -const void * -blob_read_bytes(struct blob_reader *blob, size_t size); - -/** - * Read some unstructured, fixed-size data from the current location, copying - * it to \dest (and update the current location to just past this data) - */ -void -blob_copy_bytes(struct blob_reader *blob, void *dest, size_t size); - -/** - * Skip \size bytes within the blob. - */ -void -blob_skip_bytes(struct blob_reader *blob, size_t size); - -/** - * Read a uint32_t from the current location, (and update the current location - * to just past this uint32_t). - * - * \note This function will only read from a uint32_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be skipped. - * - * \return The uint32_t read - */ -uint32_t -blob_read_uint32(struct blob_reader *blob); - -/** - * Read a uint64_t from the current location, (and update the current location - * to just past this uint64_t). - * - * \note This function will only read from a uint64_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be skipped. - * - * \return The uint64_t read - */ -uint64_t -blob_read_uint64(struct blob_reader *blob); - -/** - * Read an intptr_t value from the current location, (and update the - * current location to just past this intptr_t). - * - * \note This function will only read from an intptr_t-aligned offset from the - * beginning of the blob's data, so some padding bytes may be skipped. - * - * \return The intptr_t read - */ -intptr_t -blob_read_intptr(struct blob_reader *blob); - -/** - * Read a NULL-terminated string from the current location, (and update the - * current location to just past this string). - * - * \note The memory returned belongs to the data underlying the blob reader. The - * caller must copy the string in order to use the string after the lifetime - * of the data underlying the blob reader. - * - * \return The string read (see note above about memory lifetime). However, if - * there is no NULL byte remaining within the blob, this function returns - * NULL. - */ -char * -blob_read_string(struct blob_reader *blob); - -#ifdef __cplusplus -} -#endif - -#endif /* BLOB_H */ diff -Nru mesa-19.2.8/src/compiler/glsl/ast_function.cpp mesa-20.0.8/src/compiler/glsl/ast_function.cpp --- mesa-19.2.8/src/compiler/glsl/ast_function.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ast_function.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -49,6 +49,13 @@ ast->set_is_lhs(true); ir_rvalue *result = ast->hir(instructions, state); + /* Error happened processing function parameter */ + if (!result) { + actual_parameters->push_tail(ir_rvalue::error_value(mem_ctx)); + count++; + continue; + } + ir_constant *const constant = result->constant_expression_value(mem_ctx); @@ -664,7 +671,6 @@ } /* Local shader has no exact candidates; check the built-ins. */ - _mesa_glsl_initialize_builtin_functions(); sig = _mesa_glsl_find_builtin_function(state, name, actual_parameters); /* if _mesa_glsl_find_builtin_function failed, fall back to the result diff -Nru mesa-19.2.8/src/compiler/glsl/ast.h mesa-20.0.8/src/compiler/glsl/ast.h --- mesa-19.2.8/src/compiler/glsl/ast.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ast.h 2020-06-12 01:21:16.000000000 +0000 @@ -77,6 +77,7 @@ { struct YYLTYPE locp; + locp.path = this->location.path; locp.source = this->location.source; locp.first_line = this->location.first_line; locp.first_column = this->location.first_column; @@ -93,6 +94,7 @@ */ void set_location(const struct YYLTYPE &locp) { + this->location.path = locp.path; this->location.source = locp.source; this->location.first_line = locp.first_line; this->location.first_column = locp.first_column; @@ -107,6 +109,7 @@ */ void set_location_range(const struct YYLTYPE &begin, const struct YYLTYPE &end) { + this->location.path = begin.path; this->location.source = begin.source; this->location.first_line = begin.first_line; this->location.last_line = end.last_line; @@ -118,6 +121,7 @@ * Source location of the AST node. */ struct { + char *path; /**< GLSL shader include path. */ unsigned source; /**< GLSL source number. */ unsigned first_line; /**< First line number within the source string. */ unsigned first_column; /**< First column in the first line. */ @@ -1213,6 +1217,16 @@ }; +class ast_demote_statement : public ast_node { +public: + ast_demote_statement(void) {} + virtual void print(void) const; + + virtual ir_rvalue *hir(exec_list *instructions, + struct _mesa_glsl_parse_state *state); +}; + + class ast_function_definition : public ast_node { public: ast_function_definition() : prototype(NULL), body(NULL) diff -Nru mesa-19.2.8/src/compiler/glsl/ast_to_hir.cpp mesa-20.0.8/src/compiler/glsl/ast_to_hir.cpp --- mesa-19.2.8/src/compiler/glsl/ast_to_hir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ast_to_hir.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -1689,8 +1689,11 @@ /* Break out if operand types were not parsed successfully. */ if ((op[0]->type == glsl_type::error_type || - op[1]->type == glsl_type::error_type)) + op[1]->type == glsl_type::error_type)) { + type = glsl_type::error_type; + error_emitted = true; break; + } type = arithmetic_result_type(op[0], op[1], (this->oper == ast_mul_assign), @@ -2131,7 +2134,7 @@ } } type = NULL; /* use result->type, not type. */ - assert(result != NULL || !needs_rvalue); + assert(error_emitted || (result != NULL || !needs_rvalue)); if (result && result->type->is_error() && !error_emitted) _mesa_glsl_error(& loc, state, "type mismatch"); @@ -5195,7 +5198,8 @@ apply_layout_qualifier_to_variable(&this->type->qualifier, var, state, &loc); - if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_temporary) + if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_temporary + || var->data.mode == ir_var_shader_out) && (var->type->is_numeric() || var->type->is_boolean()) && state->zero_init) { const ir_constant_data data = { { 0 } }; @@ -6015,6 +6019,19 @@ name); } + /* Get the precision for the return type */ + unsigned return_precision; + + if (state->es_shader) { + YYLTYPE loc = this->get_location(); + return_precision = + select_gles_precision(this->return_type->qualifier.precision, + return_type, + state, + &loc); + } else { + return_precision = GLSL_PRECISION_NONE; + } /* Create an ir_function if one doesn't already exist. */ f = state->symbols->get_function(name); @@ -6043,7 +6060,6 @@ */ if (state->es_shader) { /* Local shader has no exact candidates; check the built-ins. */ - _mesa_glsl_initialize_builtin_functions(); if (state->language_version >= 300 && _mesa_glsl_has_builtin_function(state, name)) { YYLTYPE loc = this->get_location(); @@ -6086,6 +6102,13 @@ "match prototype", name); } + if (sig->return_precision != return_precision) { + YYLTYPE loc = this->get_location(); + + _mesa_glsl_error(&loc, state, "function `%s' return type precision " + "doesn't match prototype", name); + } + if (sig->is_defined) { if (is_definition) { YYLTYPE loc = this->get_location(); @@ -6130,6 +6153,7 @@ */ if (sig == NULL) { sig = new(ctx) ir_function_signature(return_type); + sig->return_precision = return_precision; f->add_signature(sig); } @@ -6443,6 +6467,25 @@ return NULL; } + +ir_rvalue * +ast_demote_statement::hir(exec_list *instructions, + struct _mesa_glsl_parse_state *state) +{ + void *ctx = state; + + if (state->stage != MESA_SHADER_FRAGMENT) { + YYLTYPE loc = this->get_location(); + + _mesa_glsl_error(& loc, state, + "`demote' may only appear in a fragment shader"); + } + + instructions->push_tail(new(ctx) ir_demote); + + return NULL; +} + ir_rvalue * ast_selection_statement::hir(exec_list *instructions, diff -Nru mesa-19.2.8/src/compiler/glsl/builtin_functions.cpp mesa-20.0.8/src/compiler/glsl/builtin_functions.cpp --- mesa-19.2.8/src/compiler/glsl/builtin_functions.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/builtin_functions.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -819,6 +819,37 @@ { return state->INTEL_shader_atomic_float_minmax_enable; } + +static bool +demote_to_helper_invocation(const _mesa_glsl_parse_state *state) +{ + return state->EXT_demote_to_helper_invocation_enable; +} + +static bool +shader_integer_functions2(const _mesa_glsl_parse_state *state) +{ + return state->INTEL_shader_integer_functions2_enable; +} + +static bool +shader_integer_functions2_int64(const _mesa_glsl_parse_state *state) +{ + return state->INTEL_shader_integer_functions2_enable && state->has_int64(); +} + +static bool +is_nir(const _mesa_glsl_parse_state *state) +{ + return state->ctx->Const.ShaderCompilerOptions[state->stage].NirOptions; +} + +static bool +is_not_nir(const _mesa_glsl_parse_state *state) +{ + return !is_nir(state); +} + /** @} */ /******************************************************************************/ @@ -949,6 +980,8 @@ B1(acos) B1(atan2) B1(atan) + B1(atan2_op) + B1(atan_op) B1(sinh) B1(cosh) B1(tanh) @@ -1094,13 +1127,21 @@ B1(bitCount) B1(findLSB) B1(findMSB) + BA1(countLeadingZeros) + BA1(countTrailingZeros) BA1(fma) B2(ldexp) B2(frexp) B2(dfrexp) B1(uaddCarry) B1(usubBorrow) + BA1(addSaturate) + BA1(subtractSaturate) + BA1(absoluteDifference) + BA1(average) + BA1(averageRounded) B1(mulExtended) + BA1(multiply32x16) B1(interpolateAtCentroid) B1(interpolateAtOffset) B1(interpolateAtSample) @@ -1182,6 +1223,9 @@ ir_function_signature *_vote(const char *intrinsic_name, builtin_available_predicate avail); + ir_function_signature *_helper_invocation_intrinsic(); + ir_function_signature *_helper_invocation(); + #undef B0 #undef B1 #undef B2 @@ -1254,6 +1298,8 @@ if (mem_ctx != NULL) return; + glsl_type_singleton_init_or_ref(); + mem_ctx = ralloc_context(NULL); create_shader(); create_intrinsics(); @@ -1268,6 +1314,8 @@ ralloc_free(shader); shader = NULL; + + glsl_type_singleton_decref(); } void @@ -1487,6 +1535,8 @@ _read_first_invocation_intrinsic(glsl_type::uvec4_type), NULL); + add_function("__intrinsic_helper_invocation", + _helper_invocation_intrinsic(), NULL); } /** @@ -1713,6 +1763,14 @@ _atan2(glsl_type::vec2_type), _atan2(glsl_type::vec3_type), _atan2(glsl_type::vec4_type), + _atan_op(glsl_type::float_type), + _atan_op(glsl_type::vec2_type), + _atan_op(glsl_type::vec3_type), + _atan_op(glsl_type::vec4_type), + _atan2_op(glsl_type::float_type), + _atan2_op(glsl_type::vec2_type), + _atan2_op(glsl_type::vec3_type), + _atan2_op(glsl_type::vec4_type), NULL); F(sinh) @@ -4228,6 +4286,8 @@ _vote("__intrinsic_vote_eq", v460_desktop), NULL); + add_function("helperInvocationEXT", _helper_invocation(), NULL); + add_function("__builtin_idiv64", generate_ir::idiv64(mem_ctx, integer_functions_supported), NULL); @@ -4252,6 +4312,227 @@ generate_ir::umul64(mem_ctx, integer_functions_supported), NULL); + add_function("countLeadingZeros", + _countLeadingZeros(shader_integer_functions2, + glsl_type::uint_type), + _countLeadingZeros(shader_integer_functions2, + glsl_type::uvec2_type), + _countLeadingZeros(shader_integer_functions2, + glsl_type::uvec3_type), + _countLeadingZeros(shader_integer_functions2, + glsl_type::uvec4_type), + NULL); + + add_function("countTrailingZeros", + _countTrailingZeros(shader_integer_functions2, + glsl_type::uint_type), + _countTrailingZeros(shader_integer_functions2, + glsl_type::uvec2_type), + _countTrailingZeros(shader_integer_functions2, + glsl_type::uvec3_type), + _countTrailingZeros(shader_integer_functions2, + glsl_type::uvec4_type), + NULL); + + add_function("absoluteDifference", + _absoluteDifference(shader_integer_functions2, + glsl_type::int_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::ivec2_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::ivec3_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::ivec4_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::uint_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::uvec2_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::uvec3_type), + _absoluteDifference(shader_integer_functions2, + glsl_type::uvec4_type), + + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::int64_t_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::i64vec2_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::i64vec3_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::i64vec4_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::uint64_t_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::u64vec2_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::u64vec3_type), + _absoluteDifference(shader_integer_functions2_int64, + glsl_type::u64vec4_type), + NULL); + + add_function("addSaturate", + _addSaturate(shader_integer_functions2, + glsl_type::int_type), + _addSaturate(shader_integer_functions2, + glsl_type::ivec2_type), + _addSaturate(shader_integer_functions2, + glsl_type::ivec3_type), + _addSaturate(shader_integer_functions2, + glsl_type::ivec4_type), + _addSaturate(shader_integer_functions2, + glsl_type::uint_type), + _addSaturate(shader_integer_functions2, + glsl_type::uvec2_type), + _addSaturate(shader_integer_functions2, + glsl_type::uvec3_type), + _addSaturate(shader_integer_functions2, + glsl_type::uvec4_type), + + _addSaturate(shader_integer_functions2_int64, + glsl_type::int64_t_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::i64vec2_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::i64vec3_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::i64vec4_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::uint64_t_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::u64vec2_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::u64vec3_type), + _addSaturate(shader_integer_functions2_int64, + glsl_type::u64vec4_type), + NULL); + + add_function("average", + _average(shader_integer_functions2, + glsl_type::int_type), + _average(shader_integer_functions2, + glsl_type::ivec2_type), + _average(shader_integer_functions2, + glsl_type::ivec3_type), + _average(shader_integer_functions2, + glsl_type::ivec4_type), + _average(shader_integer_functions2, + glsl_type::uint_type), + _average(shader_integer_functions2, + glsl_type::uvec2_type), + _average(shader_integer_functions2, + glsl_type::uvec3_type), + _average(shader_integer_functions2, + glsl_type::uvec4_type), + + _average(shader_integer_functions2_int64, + glsl_type::int64_t_type), + _average(shader_integer_functions2_int64, + glsl_type::i64vec2_type), + _average(shader_integer_functions2_int64, + glsl_type::i64vec3_type), + _average(shader_integer_functions2_int64, + glsl_type::i64vec4_type), + _average(shader_integer_functions2_int64, + glsl_type::uint64_t_type), + _average(shader_integer_functions2_int64, + glsl_type::u64vec2_type), + _average(shader_integer_functions2_int64, + glsl_type::u64vec3_type), + _average(shader_integer_functions2_int64, + glsl_type::u64vec4_type), + NULL); + + add_function("averageRounded", + _averageRounded(shader_integer_functions2, + glsl_type::int_type), + _averageRounded(shader_integer_functions2, + glsl_type::ivec2_type), + _averageRounded(shader_integer_functions2, + glsl_type::ivec3_type), + _averageRounded(shader_integer_functions2, + glsl_type::ivec4_type), + _averageRounded(shader_integer_functions2, + glsl_type::uint_type), + _averageRounded(shader_integer_functions2, + glsl_type::uvec2_type), + _averageRounded(shader_integer_functions2, + glsl_type::uvec3_type), + _averageRounded(shader_integer_functions2, + glsl_type::uvec4_type), + + _averageRounded(shader_integer_functions2_int64, + glsl_type::int64_t_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::i64vec2_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::i64vec3_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::i64vec4_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::uint64_t_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::u64vec2_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::u64vec3_type), + _averageRounded(shader_integer_functions2_int64, + glsl_type::u64vec4_type), + NULL); + + add_function("subtractSaturate", + _subtractSaturate(shader_integer_functions2, + glsl_type::int_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::ivec2_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::ivec3_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::ivec4_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::uint_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::uvec2_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::uvec3_type), + _subtractSaturate(shader_integer_functions2, + glsl_type::uvec4_type), + + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::int64_t_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::i64vec2_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::i64vec3_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::i64vec4_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::uint64_t_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::u64vec2_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::u64vec3_type), + _subtractSaturate(shader_integer_functions2_int64, + glsl_type::u64vec4_type), + NULL); + + add_function("multiply32x16", + _multiply32x16(shader_integer_functions2, + glsl_type::int_type), + _multiply32x16(shader_integer_functions2, + glsl_type::ivec2_type), + _multiply32x16(shader_integer_functions2, + glsl_type::ivec3_type), + _multiply32x16(shader_integer_functions2, + glsl_type::ivec4_type), + _multiply32x16(shader_integer_functions2, + glsl_type::uint_type), + _multiply32x16(shader_integer_functions2, + glsl_type::uvec2_type), + _multiply32x16(shader_integer_functions2, + glsl_type::uvec3_type), + _multiply32x16(shader_integer_functions2, + glsl_type::uvec4_type), + NULL); + #undef F #undef FI #undef FIUD_VEC @@ -4712,7 +4993,7 @@ const unsigned n = type->vector_elements; ir_variable *y = in_var(type, "y"); ir_variable *x = in_var(type, "x"); - MAKE_SIG(type, always_available, 2, y, x); + MAKE_SIG(type, is_not_nir, 2, y, x); /* If we're on the left half-plane rotate the coordinates π/2 clock-wise * for the y=0 discontinuity to end up aligned with the vertical @@ -4849,7 +5130,7 @@ builtin_builder::_atan(const glsl_type *type) { ir_variable *y_over_x = in_var(type, "y_over_x"); - MAKE_SIG(type, always_available, 1, y_over_x); + MAKE_SIG(type, is_not_nir, 1, y_over_x); ir_variable *tmp = body.make_temp(type, "tmp"); do_atan(body, type, tmp, y_over_x); @@ -4888,19 +5169,17 @@ ir_variable *x = in_var(type, "x"); MAKE_SIG(type, v130, 1, x); - /* tanh(x) := (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x))) - * - * With a little algebra this reduces to (e^2x - 1) / (e^2x + 1) - * - * Clamp x to (-inf, +10] to avoid precision problems. When x > 10, e^2x - * is so much larger than 1.0 that 1.0 gets flushed to zero in the - * computation e^2x +/- 1 so it can be ignored. + /* Clamp x to [-10, +10] to avoid precision problems. + * When x > 10, e^(-x) is so small relative to e^x that it gets flushed to + * zero in the computation e^x + e^(-x). The same happens in the other + * direction when x < -10. */ ir_variable *t = body.make_temp(type, "tmp"); - body.emit(assign(t, min2(x, imm(10.0f)))); + body.emit(assign(t, min2(max2(x, imm(-10.0f)), imm(10.0f)))); - body.emit(ret(div(sub(exp(mul(t, imm(2.0f))), imm(1.0f)), - add(exp(mul(t, imm(2.0f))), imm(1.0f))))); + /* (e^x - e^(-x)) / (e^x + e^(-x)) */ + body.emit(ret(div(sub(exp(t), exp(neg(t))), + add(exp(t), exp(neg(t)))))); return sig; } @@ -4952,6 +5231,7 @@ UNOP(log, ir_unop_log, always_available) UNOP(exp2, ir_unop_exp2, always_available) UNOP(log2, ir_unop_log2, always_available) +UNOP(atan_op, ir_unop_atan, is_nir) UNOPA(sqrt, ir_unop_sqrt) UNOPA(inversesqrt, ir_unop_rsq) @@ -5152,6 +5432,12 @@ } ir_function_signature * +builtin_builder::_atan2_op(const glsl_type *x_type) +{ + return binop(is_nir, ir_binop_atan2, x_type, x_type, x_type); +} + +ir_function_signature * builtin_builder::_floatBitsToInt(const glsl_type *type) { ir_variable *x = in_var(type, "x"); @@ -6552,6 +6838,28 @@ } ir_function_signature * +builtin_builder::_countLeadingZeros(builtin_available_predicate avail, + const glsl_type *type) +{ + return unop(avail, ir_unop_clz, + glsl_type::uvec(type->vector_elements), type); +} + +ir_function_signature * +builtin_builder::_countTrailingZeros(builtin_available_predicate avail, + const glsl_type *type) +{ + ir_variable *a = in_var(type, "a"); + MAKE_SIG(glsl_type::uvec(type->vector_elements), avail, 1, a); + + body.emit(ret(ir_builder::min2( + ir_builder::i2u(ir_builder::expr(ir_unop_find_lsb, a)), + imm(32u)))); + + return sig; +} + +ir_function_signature * builtin_builder::_fma(builtin_available_predicate avail, const glsl_type *type) { ir_variable *a = in_var(type, "a"); @@ -6647,6 +6955,13 @@ } ir_function_signature * +builtin_builder::_addSaturate(builtin_available_predicate avail, + const glsl_type *type) +{ + return binop(avail, ir_binop_add_sat, type, type, type); +} + +ir_function_signature * builtin_builder::_usubBorrow(const glsl_type *type) { ir_variable *x = in_var(type, "x"); @@ -6660,6 +6975,40 @@ return sig; } +ir_function_signature * +builtin_builder::_subtractSaturate(builtin_available_predicate avail, + const glsl_type *type) +{ + return binop(avail, ir_binop_sub_sat, type, type, type); +} + +ir_function_signature * +builtin_builder::_absoluteDifference(builtin_available_predicate avail, + const glsl_type *type) +{ + /* absoluteDifference returns an unsigned type that has the same number of + * bits and number of vector elements as the type of the operands. + */ + return binop(avail, ir_binop_abs_sub, + glsl_type::get_instance(glsl_unsigned_base_type_of(type->base_type), + type->vector_elements, 1), + type, type); +} + +ir_function_signature * +builtin_builder::_average(builtin_available_predicate avail, + const glsl_type *type) +{ + return binop(avail, ir_binop_avg, type, type, type); +} + +ir_function_signature * +builtin_builder::_averageRounded(builtin_available_predicate avail, + const glsl_type *type) +{ + return binop(avail, ir_binop_avg_round, type, type, type); +} + /** * For both imulExtended() and umulExtended() built-ins. */ @@ -6707,6 +7056,13 @@ } ir_function_signature * +builtin_builder::_multiply32x16(builtin_available_predicate avail, + const glsl_type *type) +{ + return binop(avail, ir_binop_mul_32x16, type, type, type); +} + +ir_function_signature * builtin_builder::_interpolateAtCentroid(const glsl_type *type) { ir_variable *interpolant = in_var(type, "interpolant"); @@ -7272,6 +7628,28 @@ return sig; } +ir_function_signature * +builtin_builder::_helper_invocation_intrinsic() +{ + MAKE_INTRINSIC(glsl_type::bool_type, ir_intrinsic_helper_invocation, + demote_to_helper_invocation, 0); + return sig; +} + +ir_function_signature * +builtin_builder::_helper_invocation() +{ + MAKE_SIG(glsl_type::bool_type, demote_to_helper_invocation, 0); + + ir_variable *retval = body.make_temp(glsl_type::bool_type, "retval"); + + body.emit(call(shader->symbols->get_function("__intrinsic_helper_invocation"), + retval, sig->parameters)); + body.emit(ret(retval)); + + return sig; +} + /** @} */ /******************************************************************************/ @@ -7279,24 +7657,28 @@ /* The singleton instance of builtin_builder. */ static builtin_builder builtins; static mtx_t builtins_lock = _MTX_INITIALIZER_NP; +static uint32_t builtin_users = 0; /** * External API (exposing the built-in module to the rest of the compiler): * @{ */ -void -_mesa_glsl_initialize_builtin_functions() +extern "C" void +_mesa_glsl_builtin_functions_init_or_ref() { mtx_lock(&builtins_lock); - builtins.initialize(); + if (builtin_users++ == 0) + builtins.initialize(); mtx_unlock(&builtins_lock); } -void -_mesa_glsl_release_builtin_functions() +extern "C" void +_mesa_glsl_builtin_functions_decref() { mtx_lock(&builtins_lock); - builtins.release(); + assert(builtin_users != 0); + if (--builtin_users == 0) + builtins.release(); mtx_unlock(&builtins_lock); } diff -Nru mesa-19.2.8/src/compiler/glsl/builtin_functions.h mesa-20.0.8/src/compiler/glsl/builtin_functions.h --- mesa-19.2.8/src/compiler/glsl/builtin_functions.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/builtin_functions.h 2020-06-12 01:21:16.000000000 +0000 @@ -26,8 +26,19 @@ struct gl_shader; -extern void -_mesa_glsl_initialize_builtin_functions(); +#ifdef __cplusplus +extern "C" { +#endif + +void +_mesa_glsl_builtin_functions_init_or_ref(); + +void +_mesa_glsl_builtin_functions_decref(void); + +#ifdef __cplusplus + +} /* extern "C" */ extern ir_function_signature * _mesa_glsl_find_builtin_function(_mesa_glsl_parse_state *state, @@ -43,9 +54,6 @@ extern ir_function_signature * _mesa_get_main_function_signature(glsl_symbol_table *symbols); -extern void -_mesa_glsl_release_builtin_functions(void); - namespace generate_ir { ir_function_signature * @@ -71,4 +79,6 @@ } +#endif /* __cplusplus */ + #endif /* BULITIN_FUNCTIONS_H */ diff -Nru mesa-19.2.8/src/compiler/glsl/builtin_variables.cpp mesa-20.0.8/src/compiler/glsl/builtin_variables.cpp --- mesa-19.2.8/src/compiler/glsl/builtin_variables.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/builtin_variables.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -1252,10 +1252,13 @@ add_input(VARYING_SLOT_POS, vec4_t, frag_coord_precision, "gl_FragCoord"); } - if (this->state->ctx->Const.GLSLFrontFacingIsSysVal) - add_system_value(SYSTEM_VALUE_FRONT_FACE, bool_t, "gl_FrontFacing"); - else - add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing"); + if (this->state->ctx->Const.GLSLFrontFacingIsSysVal) { + var = add_system_value(SYSTEM_VALUE_FRONT_FACE, bool_t, "gl_FrontFacing"); + var->data.interpolation = INTERP_MODE_FLAT; + } else { + var = add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing"); + var->data.interpolation = INTERP_MODE_FLAT; + } if (state->is_version(120, 100)) { if (this->state->ctx->Const.GLSLPointCoordIsSysVal) @@ -1435,6 +1438,9 @@ void builtin_variable_generator::generate_varyings() { + struct gl_shader_compiler_options *options = + &state->ctx->Const.ShaderCompilerOptions[state->stage]; + /* gl_Position and gl_PointSize are not visible from fragment shaders. */ if (state->stage != MESA_SHADER_FRAGMENT) { add_varying(VARYING_SLOT_POS, vec4_t, GLSL_PRECISION_HIGH, "gl_Position"); @@ -1526,6 +1532,9 @@ var->data.sample = fields[i].sample; var->data.patch = fields[i].patch; var->init_interface_type(per_vertex_out_type); + + var->data.invariant = fields[i].location == VARYING_SLOT_POS && + options->PositionAlwaysInvariant; } } } diff -Nru mesa-19.2.8/src/compiler/glsl/float64.glsl mesa-20.0.8/src/compiler/glsl/float64.glsl --- mesa-19.2.8/src/compiler/glsl/float64.glsl 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/float64.glsl 2020-06-12 01:21:16.000000000 +0000 @@ -221,10 +221,11 @@ uint64_t __fsat64(uint64_t __a) { - if (__flt64(__a, 0ul)) + /* fsat(NaN) should be zero. */ + if (__is_nan(__a) || __flt64_nonnan(__a, 0ul)) return 0ul; - if (__fge64(__a, 0x3FF0000000000000ul /* 1.0 */)) + if (!__flt64_nonnan(__a, 0x3FF0000000000000ul /* 1.0 */)) return 0x3FF0000000000000ul; return __a; @@ -437,23 +438,25 @@ } return __packFloat64(zSign, 0x7FF, 0u, 0u); } - if (zExp < 0) { - __shift64ExtraRightJamming( - zFrac0, zFrac1, zFrac2, -zExp, zFrac0, zFrac1, zFrac2); - zExp = 0; - if (roundNearestEven) { - increment = zFrac2 < 0u; + } + + if (zExp < 0) { + __shift64ExtraRightJamming( + zFrac0, zFrac1, zFrac2, -zExp, zFrac0, zFrac1, zFrac2); + zExp = 0; + if (roundNearestEven) { + increment = zFrac2 < 0u; + } else { + if (zSign != 0u) { + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && + (zFrac2 != 0u); } else { - if (zSign != 0u) { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) && - (zFrac2 != 0u); - } else { - increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && - (zFrac2 != 0u); - } + increment = (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP) && + (zFrac2 != 0u); } } } + if (increment) { __add64(zFrac0, zFrac1, 0u, 1u, zFrac0, zFrac1); zFrac1 &= ~((zFrac2 + uint(zFrac2 == 0u)) & uint(roundNearestEven)); @@ -1299,43 +1302,35 @@ float __uint64_to_fp32(uint64_t __a) { - uint zFrac = 0u; uvec2 aFrac = unpackUint2x32(__a); - int shiftCount = __countLeadingZeros32(mix(aFrac.y, aFrac.x, aFrac.y == 0u)); - shiftCount -= mix(40, 8, aFrac.y == 0u); + int shiftCount = mix(__countLeadingZeros32(aFrac.y) - 33, + __countLeadingZeros32(aFrac.x) - 1, + aFrac.y == 0u); - if (0 <= shiftCount) { + if (0 <= shiftCount) __shortShift64Left(aFrac.y, aFrac.x, shiftCount, aFrac.y, aFrac.x); - bool is_zero = (aFrac.y | aFrac.x) == 0u; - return mix(__packFloat32(0u, 0x95 - shiftCount, aFrac.x), 0, is_zero); - } + else + __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x); - shiftCount += 7; - __shift64RightJamming(aFrac.y, aFrac.x, -shiftCount, aFrac.y, aFrac.x); - zFrac = mix(aFrac.x<include{HSPACE}+["<][]^./ _A-Za-z0-9+*%[(){}|&~=!:;,?-]+[">] { + BEGIN INITIAL; + RETURN_STRING_TOKEN (INCLUDE); +} + line{HSPACE}+ { BEGIN INITIAL; RETURN_TOKEN (LINE); @@ -558,6 +564,10 @@ } } +{PATH} { + RETURN_STRING_TOKEN (PATH); +} + /* We preserve all newlines, even between #if 0..#endif, so no skipping.. */ <*>{NEWLINE} { diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/glcpp-parse.y mesa-20.0.8/src/compiler/glsl/glcpp/glcpp-parse.y --- mesa-19.2.8/src/compiler/glsl/glcpp/glcpp-parse.y 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/glcpp-parse.y 2020-06-12 01:21:16.000000000 +0000 @@ -30,6 +30,17 @@ #include "glcpp.h" #include "main/mtypes.h" +#include "util/strndup.h" + +const char * +_mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check); + +size_t +_mesa_get_shader_include_cursor(struct gl_shared_state *shared); + +void +_mesa_set_shader_include_cursor(struct gl_shared_state *shared, size_t cursor); static void yyerror(YYLTYPE *locp, glcpp_parser_t *parser, const char *error); @@ -149,6 +160,14 @@ static void glcpp_parser_lex_from(glcpp_parser_t *parser, token_list_t *list); +struct define_include { + glcpp_parser_t *parser; + YYLTYPE *loc; +}; + +static void +glcpp_parser_copy_defines(const void *key, void *data, void *closure); + static void add_builtin_define(glcpp_parser_t *parser, const char *name, int value); @@ -174,11 +193,11 @@ /* We use HASH_TOKEN, DEFINE_TOKEN and VERSION_TOKEN (as opposed to * HASH, DEFINE, and VERSION) to avoid conflicts with other symbols, * (such as the and start conditions in the lexer). */ -%token DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS +%token DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS PATH INCLUDE %token PASTE %type INTEGER operator SPACE integer_constant version_constant %type expression -%type IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER INTEGER_STRING OTHER ERROR_TOKEN PRAGMA +%type IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER INTEGER_STRING OTHER ERROR_TOKEN PRAGMA PATH INCLUDE %type identifier_list %type preprocessing_token %type pp_tokens replacement_list text_line @@ -238,6 +257,13 @@ "#line %" PRIiMAX " %" PRIiMAX "\n", $2, $3); } +| LINE_EXPANDED integer_constant PATH NEWLINE { + parser->has_new_line_number = 1; + parser->new_line_number = $2; + _mesa_string_buffer_printf(parser->output, + "#line %" PRIiMAX " %s\n", + $2, $3); + } ; define: @@ -323,6 +349,80 @@ _mesa_hash_table_remove (parser->defines, entry); } } +| HASH_TOKEN INCLUDE NEWLINE { + size_t include_cursor = _mesa_get_shader_include_cursor(parser->gl_ctx->Shared); + + /* Remove leading and trailing "" or <> */ + char *start = strchr($2, '"'); + if (!start) { + _mesa_set_shader_include_cursor(parser->gl_ctx->Shared, 0); + start = strchr($2, '<'); + } + char *path = strndup(start + 1, strlen(start + 1) - 1); + + const char *shader = + _mesa_lookup_shader_include(parser->gl_ctx, path, false); + free(path); + + if (!shader) + glcpp_error(&@1, parser, "%s not found", $2); + else { + /* Create a temporary parser with the same settings */ + glcpp_parser_t *tmp_parser = + glcpp_parser_create(parser->gl_ctx, parser->extensions, parser->state); + tmp_parser->version_set = true; + tmp_parser->version = parser->version; + + /* Set the shader source and run the lexer */ + glcpp_lex_set_source_string(tmp_parser, shader); + + /* Copy any existing define macros to the temporary + * shade include parser. + */ + struct define_include di; + di.parser = tmp_parser; + di.loc = &@1; + + hash_table_call_foreach(parser->defines, + glcpp_parser_copy_defines, + &di); + + /* Print out '#include' to the glsl parser. We do this + * so that it can do the error checking require to + * make sure the ARB_shading_language_include + * extension is enabled. + */ + _mesa_string_buffer_printf(parser->output, "#include\n"); + + /* Parse the include string before adding to the + * preprocessor output. + */ + glcpp_parser_parse(tmp_parser); + _mesa_string_buffer_printf(parser->info_log, "%s", + tmp_parser->info_log->buf); + _mesa_string_buffer_printf(parser->output, "%s", + tmp_parser->output->buf); + + /* Copy any new define macros to the parent parser + * and steal the memory of our temp parser so we don't + * free these new defines before they are no longer + * needed. + */ + di.parser = parser; + di.loc = &@1; + ralloc_steal(parser, tmp_parser); + + hash_table_call_foreach(tmp_parser->defines, + glcpp_parser_copy_defines, + &di); + + /* Destroy tmp parser memory we no longer need */ + glcpp_lex_destroy(tmp_parser->scanner); + _mesa_hash_table_destroy(tmp_parser->defines, NULL); + } + + _mesa_set_shader_include_cursor(parser->gl_ctx->Shared, include_cursor); + } | HASH_TOKEN IF pp_tokens NEWLINE { /* Be careful to only evaluate the 'if' expression if * we are not skipping. When we are skipping, we @@ -706,6 +806,10 @@ $$ = _token_create_str (parser, INTEGER_STRING, $1); $$->location = yylloc; } +| PATH { + $$ = _token_create_str (parser, PATH, $1); + $$->location = yylloc; + } | operator { $$ = _token_create_ival (parser, $1, $1); $$->location = yylloc; @@ -1144,6 +1248,7 @@ break; case IDENTIFIER: case INTEGER_STRING: + case PATH: case OTHER: _mesa_string_buffer_append(out, token->value.str); break; @@ -1357,15 +1462,15 @@ #define INITIAL_PP_OUTPUT_BUF_SIZE 4048 glcpp_parser_t * -glcpp_parser_create(const struct gl_extensions *extension_list, - glcpp_extension_iterator extensions, void *state, gl_api api) +glcpp_parser_create(struct gl_context *gl_ctx, + glcpp_extension_iterator extensions, void *state) { glcpp_parser_t *parser; parser = ralloc (NULL, glcpp_parser_t); glcpp_lex_init_extra (parser, &parser->scanner); - parser->defines = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + parser->defines = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); parser->linalloc = linear_alloc_parent(parser, 0); parser->active = NULL; @@ -1392,10 +1497,11 @@ INITIAL_PP_OUTPUT_BUF_SIZE); parser->error = 0; + parser->gl_ctx = gl_ctx; parser->extensions = extensions; - parser->extension_list = extension_list; + parser->extension_list = &gl_ctx->Extensions; parser->state = state; - parser->api = api; + parser->api = gl_ctx->API; parser->version = 0; parser->version_set = false; @@ -2409,3 +2515,29 @@ _glcpp_parser_handle_version_declaration(parser, language_version, NULL, false); } + +static void +glcpp_parser_copy_defines(const void *key, void *data, void *closure) +{ + struct define_include *di = (struct define_include *) closure; + macro_t *macro = (macro_t *) data; + + /* If we hit an error on a previous pass, just return */ + if (di->parser->error) + return; + + const char *identifier = macro->identifier; + struct hash_entry *entry = _mesa_hash_table_search(di->parser->defines, + identifier); + + macro_t *previous = entry ? entry->data : NULL; + if (previous) { + if (_macro_equal(macro, previous)) { + return; + } + glcpp_error(di->loc, di->parser, "Redefinition of macro %s\n", + identifier); + } + + _mesa_hash_table_insert(di->parser->defines, identifier, macro); +} diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/meson.build mesa-20.0.8/src/compiler/glsl/glcpp/meson.build --- mesa-19.2.8/src/compiler/glsl/glcpp/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -35,27 +35,48 @@ command : [prog_flex, '-o', '@OUTPUT@', '@INPUT@'], ) +_extra_args = [] +if cpp.get_id() == 'msvc' + # Flex relies on __STDC_VERSION__>=199901L to decide when to include C99 + # inttypes.h. We always have inttypes.h available with MSVC (either the one + # bundled with MSVC 2013, or the one we bundle ourselves), but we can't just + # define __STDC_VERSION__ without breaking stuff, as MSVC doesn't fully + # support C99. There's also no way to premptively include stdint. + _extra_args += '-FIinttypes.h' +endif + libglcpp = static_library( 'glcpp', [glcpp_lex, glcpp_parse, files('glcpp.h', 'pp.c')], dependencies : idep_mesautil, include_directories : [inc_common], - c_args : [c_vis_args, no_override_init_args, c_msvc_compat_args], - cpp_args : [cpp_vis_args, cpp_msvc_compat_args], + c_args : [c_vis_args, no_override_init_args, c_msvc_compat_args, _extra_args], + cpp_args : [cpp_vis_args, cpp_msvc_compat_args, _extra_args], + build_by_default : false, +) + +libglcpp_standalone = static_library( + 'glcpp_standalone', + 'pp_standalone_scaffolding.c', + link_with : libglcpp, + include_directories : [inc_common], + c_args : [c_vis_args, no_override_init_args, c_msvc_compat_args, _extra_args], + cpp_args : [cpp_vis_args, cpp_msvc_compat_args, _extra_args], build_by_default : false, ) glcpp = executable( 'glcpp', 'glcpp.c', - dependencies : [dep_m], + dependencies : [dep_m, idep_getopt], include_directories : [inc_common], - link_with : [libglcpp, libglsl_util], + link_with : [libglcpp_standalone, libglsl_util], c_args : [c_vis_args, no_override_init_args, c_msvc_compat_args], build_by_default : false, ) -if with_any_opengl and with_tests +# FIXME: these fail on windows due to whitespace differences +if with_any_opengl and with_tests and host_machine.system() != 'windows' modes = ['unix', 'windows', 'oldmac', 'bizarro'] if dep_valgrind.found() modes += ['valgrind'] diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/pp.c mesa-20.0.8/src/compiler/glsl/glcpp/pp.c --- mesa-19.2.8/src/compiler/glsl/glcpp/pp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/pp.c 2020-06-12 01:21:16.000000000 +0000 @@ -228,7 +228,7 @@ { int errors; glcpp_parser_t *parser = - glcpp_parser_create(&gl_ctx->Extensions, extensions, state, gl_ctx->API); + glcpp_parser_create(gl_ctx, extensions, state); if (! gl_ctx->Const.DisableGLSLLineContinuations) *shader = remove_line_continuations(parser, *shader); diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.c mesa-20.0.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.c --- mesa-19.2.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,57 @@ +/* + * Copyright © 2019 Timothy Arceri + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* This file declares stripped-down versions of functions that + * normally exist outside of the glsl folder, so that they can be used + * when running the GLSL compiler standalone (for unit testing or + * compiling builtins). + */ + +#include "pp_standalone_scaffolding.h" + +const char * +_mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check) +{ + (void) ctx; + (void) path; + (void) error_check; + + return NULL; +} + +size_t +_mesa_get_shader_include_cursor(struct gl_shared_state *shared) +{ + (void) shared; + + return 0; +} + +void +_mesa_set_shader_include_cursor(struct gl_shared_state *shared, + size_t cursor) +{ + (void) shared; + (void) cursor; +} diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.h mesa-20.0.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.h --- mesa-19.2.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/pp_standalone_scaffolding.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,47 @@ +/* + * Copyright © 2019 Timothy Arceri + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* This file declares stripped-down versions of functions that + * normally exist outside of the glcpp folder, so that they can be used + * when running the GLSL compiler standalone (for unit testing or + * compiling builtins). + */ + +#ifndef PP_STANDALONE_SCAFFOLDING_H +#define PP_STANDALONE_SCAFFOLDING_H + +#include +#include "main/mtypes.h" + +const char * +_mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check); + +size_t +_mesa_get_shader_include_cursor(struct gl_shared_state *shared); + +void +_mesa_set_shader_include_cursor(struct gl_shared_state *shared, + size_t cursor); + +#endif /* PP_STANDALONE_SCAFFOLDING_H */ diff -Nru mesa-19.2.8/src/compiler/glsl/glcpp/tests/glcpp_test.py mesa-20.0.8/src/compiler/glsl/glcpp/tests/glcpp_test.py --- mesa-19.2.8/src/compiler/glsl/glcpp/tests/glcpp_test.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glcpp/tests/glcpp_test.py 2020-06-12 01:21:16.000000000 +0000 @@ -24,12 +24,20 @@ from __future__ import print_function import argparse import difflib +import errno import io import os import subprocess import sys import tempfile +# The meson version handles windows paths better, but if it's not available +# fall back to shlex +try: + from meson.mesonlib import split_args +except ImportError: + from shlex import split as split_args + def arg_parser(): parser = argparse.ArgumentParser() @@ -61,7 +69,7 @@ with open(filename, 'rb') as f: proc = subprocess.Popen( - [glcpp] + extra_args, + glcpp + extra_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) @@ -85,7 +93,7 @@ os.close(fd) with open(filename, 'rb') as f: proc = subprocess.Popen( - ['valgrind', '--error-exitcode=31', '--log-file', tmpfile, glcpp] + extra_args, + ['valgrind', '--error-exitcode=31', '--log-file', tmpfile] + glcpp + extra_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) @@ -216,17 +224,30 @@ def main(): args = arg_parser() + wrapper = os.environ.get('MESON_EXE_WRAPPER') + if wrapper is not None: + args.glcpp = split_args(wrapper) + [args.glcpp] + else: + args.glcpp = [args.glcpp] + success = True - if args.unix: - success = success and test_unix(args) - if args.windows: - success = success and test_windows(args) - if args.oldmac: - success = success and test_oldmac(args) - if args.bizarro: - success = success and test_bizarro(args) - if args.valgrind: - success = success and test_valgrind(args) + try: + if args.unix: + success = success and test_unix(args) + if args.windows: + success = success and test_windows(args) + if args.oldmac: + success = success and test_oldmac(args) + if args.bizarro: + success = success and test_bizarro(args) + if args.valgrind: + success = success and test_valgrind(args) + except OSError as e: + if e.errno == errno.ENOEXEC: + print('Skipping due to inability to run host binaries.', + file=sys.stderr) + sys.exit(77) + raise exit(0 if success else 1) diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir.h mesa-20.0.8/src/compiler/glsl/gl_nir.h --- mesa-19.2.8/src/compiler/glsl/gl_nir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir.h 2020-06-12 01:21:16.000000000 +0000 @@ -44,8 +44,6 @@ bool gl_nir_lower_buffers(nir_shader *shader, const struct gl_shader_program *shader_program); -bool gl_nir_opt_access(nir_shader *shader); - #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_link_atomics.c mesa-20.0.8/src/compiler/glsl/gl_nir_link_atomics.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_link_atomics.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_link_atomics.c 2020-06-12 01:21:16.000000000 +0000 @@ -170,6 +170,26 @@ return buffers; } +static bool +check_atomic_counters_overlap(const nir_variable *x, const nir_variable *y) +{ + return ((x->data.offset >= y->data.offset && + x->data.offset < y->data.offset + glsl_atomic_size(y->type)) || + (y->data.offset >= x->data.offset && + y->data.offset < x->data.offset + glsl_atomic_size(x->type))); +} + +static int +cmp_active_counter_offsets(const void *a, const void *b) +{ + const struct active_atomic_counter_uniform *const first = + (struct active_atomic_counter_uniform *) a; + const struct active_atomic_counter_uniform *const second = + (struct active_atomic_counter_uniform *) b; + + return first->var->data.offset - second->var->data.offset; +} + void gl_nir_link_assign_atomic_counter_resources(struct gl_context *ctx, struct gl_shader_program *prog) @@ -280,3 +300,75 @@ ralloc_free(abs); } + +void +gl_nir_link_check_atomic_counter_resources(struct gl_context *ctx, + struct gl_shader_program *prog) +{ + unsigned num_buffers; + struct active_atomic_buffer *abs = + find_active_atomic_counters(ctx, prog, &num_buffers); + unsigned atomic_counters[MESA_SHADER_STAGES] = {0}; + unsigned atomic_buffers[MESA_SHADER_STAGES] = {0}; + unsigned total_atomic_counters = 0; + unsigned total_atomic_buffers = 0; + + /* Sum the required resources. Note that this counts buffers and + * counters referenced by several shader stages multiple times + * against the combined limit -- That's the behavior the spec + * requires. + */ + for (unsigned i = 0; i < ctx->Const.MaxAtomicBufferBindings; i++) { + if (abs[i].size == 0) + continue; + + qsort(abs[i].uniforms, abs[i].num_uniforms, + sizeof(struct active_atomic_counter_uniform), + cmp_active_counter_offsets); + + for (unsigned j = 1; j < abs[i].num_uniforms; j++) { + /* If an overlapping counter found, it must be a reference to the + * same counter from a different shader stage. + */ + if (check_atomic_counters_overlap(abs[i].uniforms[j-1].var, + abs[i].uniforms[j].var) + && strcmp(abs[i].uniforms[j-1].var->name, + abs[i].uniforms[j].var->name) != 0) { + linker_error(prog, "Atomic counter %s declared at offset %d " + "which is already in use.", + abs[i].uniforms[j].var->name, + abs[i].uniforms[j].var->data.offset); + } + } + + for (unsigned j = 0; j < MESA_SHADER_STAGES; ++j) { + const unsigned n = abs[i].stage_counter_references[j]; + + if (n) { + atomic_counters[j] += n; + total_atomic_counters += n; + atomic_buffers[j]++; + total_atomic_buffers++; + } + } + } + + /* Check that they are within the supported limits. */ + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + if (atomic_counters[i] > ctx->Const.Program[i].MaxAtomicCounters) + linker_error(prog, "Too many %s shader atomic counters", + _mesa_shader_stage_to_string(i)); + + if (atomic_buffers[i] > ctx->Const.Program[i].MaxAtomicBuffers) + linker_error(prog, "Too many %s shader atomic counter buffers", + _mesa_shader_stage_to_string(i)); + } + + if (total_atomic_counters > ctx->Const.MaxCombinedAtomicCounters) + linker_error(prog, "Too many combined atomic counters"); + + if (total_atomic_buffers > ctx->Const.MaxCombinedAtomicBuffers) + linker_error(prog, "Too many combined atomic buffers"); + + ralloc_free(abs); +} diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_linker.c mesa-20.0.8/src/compiler/glsl/gl_nir_linker.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_linker.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_linker.c 2020-06-12 01:21:16.000000000 +0000 @@ -22,9 +22,11 @@ */ #include "nir.h" +#include "gl_nir.h" #include "gl_nir_linker.h" #include "linker_util.h" #include "main/mtypes.h" +#include "main/shaderobj.h" #include "ir_uniform.h" /* for gl_uniform_storage */ /* This file included general link methods, using NIR, instead of IR as @@ -33,6 +35,250 @@ * Also note that this is tailored for ARB_gl_spirv needs and particularities */ +/** + * Built-in / reserved GL variables names start with "gl_" + */ +static inline bool +is_gl_identifier(const char *s) +{ + return s && s[0] == 'g' && s[1] == 'l' && s[2] == '_'; +} + +static bool +inout_has_same_location(const nir_variable *var, unsigned stage) +{ + if (!var->data.patch && + ((var->data.mode == nir_var_shader_out && + stage == MESA_SHADER_TESS_CTRL) || + (var->data.mode == nir_var_shader_in && + (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY)))) + return true; + else + return false; +} + +/** + * Create gl_shader_variable from nir_variable. + */ +static struct gl_shader_variable * +create_shader_variable(struct gl_shader_program *shProg, + const nir_variable *in, + const char *name, const struct glsl_type *type, + const struct glsl_type *interface_type, + bool use_implicit_location, int location, + const struct glsl_type *outermost_struct_type) +{ + /* Allocate zero-initialized memory to ensure that bitfield padding + * is zero. + */ + struct gl_shader_variable *out = rzalloc(shProg, + struct gl_shader_variable); + if (!out) + return NULL; + + /* Since gl_VertexID may be lowered to gl_VertexIDMESA, but applications + * expect to see gl_VertexID in the program resource list. Pretend. + */ + if (in->data.mode == nir_var_system_value && + in->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) { + out->name = ralloc_strdup(shProg, "gl_VertexID"); + } else if ((in->data.mode == nir_var_shader_out && + in->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) || + (in->data.mode == nir_var_system_value && + in->data.location == SYSTEM_VALUE_TESS_LEVEL_OUTER)) { + out->name = ralloc_strdup(shProg, "gl_TessLevelOuter"); + type = glsl_array_type(glsl_float_type(), 4, 0); + } else if ((in->data.mode == nir_var_shader_out && + in->data.location == VARYING_SLOT_TESS_LEVEL_INNER) || + (in->data.mode == nir_var_system_value && + in->data.location == SYSTEM_VALUE_TESS_LEVEL_INNER)) { + out->name = ralloc_strdup(shProg, "gl_TessLevelInner"); + type = glsl_array_type(glsl_float_type(), 2, 0); + } else { + out->name = ralloc_strdup(shProg, name); + } + + if (!out->name) + return NULL; + + /* The ARB_program_interface_query spec says: + * + * "Not all active variables are assigned valid locations; the + * following variables will have an effective location of -1: + * + * * uniforms declared as atomic counters; + * + * * members of a uniform block; + * + * * built-in inputs, outputs, and uniforms (starting with "gl_"); and + * + * * inputs or outputs not declared with a "location" layout + * qualifier, except for vertex shader inputs and fragment shader + * outputs." + */ + if (glsl_get_base_type(in->type) == GLSL_TYPE_ATOMIC_UINT || + is_gl_identifier(in->name) || + !(in->data.explicit_location || use_implicit_location)) { + out->location = -1; + } else { + out->location = location; + } + + out->type = type; + out->outermost_struct_type = outermost_struct_type; + out->interface_type = interface_type; + out->component = in->data.location_frac; + out->index = in->data.index; + out->patch = in->data.patch; + out->mode = in->data.mode; + out->interpolation = in->data.interpolation; + out->precision = in->data.precision; + out->explicit_location = in->data.explicit_location; + + return out; +} + +static bool +add_shader_variable(const struct gl_context *ctx, + struct gl_shader_program *shProg, + struct set *resource_set, + unsigned stage_mask, + GLenum programInterface, nir_variable *var, + const char *name, const struct glsl_type *type, + bool use_implicit_location, int location, + bool inouts_share_location, + const struct glsl_type *outermost_struct_type) +{ + const struct glsl_type *interface_type = var->interface_type; + + if (outermost_struct_type == NULL) { + if (var->data.from_named_ifc_block) { + const char *interface_name = glsl_get_type_name(interface_type); + + if (glsl_type_is_array(interface_type)) { + /* Issue #16 of the ARB_program_interface_query spec says: + * + * "* If a variable is a member of an interface block without an + * instance name, it is enumerated using just the variable name. + * + * * If a variable is a member of an interface block with an + * instance name, it is enumerated as "BlockName.Member", where + * "BlockName" is the name of the interface block (not the + * instance name) and "Member" is the name of the variable." + * + * In particular, it indicates that it should be "BlockName", + * not "BlockName[array length]". The conformance suite and + * dEQP both require this behavior. + * + * Here, we unwrap the extra array level added by named interface + * block array lowering so we have the correct variable type. We + * also unwrap the interface type when constructing the name. + * + * We leave interface_type the same so that ES 3.x SSO pipeline + * validation can enforce the rules requiring array length to + * match on interface blocks. + */ + type = glsl_get_array_element(type); + + interface_name = + glsl_get_type_name(glsl_get_array_element(interface_type)); + } + + name = ralloc_asprintf(shProg, "%s.%s", interface_name, name); + } + } + + switch (glsl_get_base_type(type)) { + case GLSL_TYPE_STRUCT: { + /* The ARB_program_interface_query spec says: + * + * "For an active variable declared as a structure, a separate entry + * will be generated for each active structure member. The name of + * each entry is formed by concatenating the name of the structure, + * the "." character, and the name of the structure member. If a + * structure member to enumerate is itself a structure or array, + * these enumeration rules are applied recursively." + */ + if (outermost_struct_type == NULL) + outermost_struct_type = type; + + unsigned field_location = location; + for (unsigned i = 0; i < glsl_get_length(type); i++) { + const struct glsl_type *field_type = glsl_get_struct_field(type, i); + const struct glsl_struct_field *field = + glsl_get_struct_field_data(type, i); + + char *field_name = ralloc_asprintf(shProg, "%s.%s", name, field->name); + if (!add_shader_variable(ctx, shProg, resource_set, + stage_mask, programInterface, + var, field_name, field_type, + use_implicit_location, field_location, + false, outermost_struct_type)) + return false; + + field_location += glsl_count_attribute_slots(field_type, false); + } + return true; + } + + case GLSL_TYPE_ARRAY: { + /* The ARB_program_interface_query spec says: + * + * "For an active variable declared as an array of basic types, a + * single entry will be generated, with its name string formed by + * concatenating the name of the array and the string "[0]"." + * + * "For an active variable declared as an array of an aggregate data + * type (structures or arrays), a separate entry will be generated + * for each active array element, unless noted immediately below. + * The name of each entry is formed by concatenating the name of + * the array, the "[" character, an integer identifying the element + * number, and the "]" character. These enumeration rules are + * applied recursively, treating each enumerated array element as a + * separate active variable." + */ + const struct glsl_type *array_type = glsl_get_array_element(type); + if (glsl_get_base_type(array_type) == GLSL_TYPE_STRUCT || + glsl_get_base_type(array_type) == GLSL_TYPE_ARRAY) { + unsigned elem_location = location; + unsigned stride = inouts_share_location ? 0 : + glsl_count_attribute_slots(array_type, false); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + char *elem = ralloc_asprintf(shProg, "%s[%d]", name, i); + if (!add_shader_variable(ctx, shProg, resource_set, + stage_mask, programInterface, + var, elem, array_type, + use_implicit_location, elem_location, + false, outermost_struct_type)) + return false; + elem_location += stride; + } + return true; + } + /* fallthrough */ + } + + default: { + /* The ARB_program_interface_query spec says: + * + * "For an active variable declared as a single instance of a basic + * type, a single entry will be generated, using the variable name + * from the shader source." + */ + struct gl_shader_variable *sha_v = + create_shader_variable(shProg, var, name, type, interface_type, + use_implicit_location, location, + outermost_struct_type); + if (!sha_v) + return false; + + return link_util_add_program_resource(shProg, resource_set, + programInterface, sha_v, stage_mask); + } + } +} + static bool add_vars_from_list(const struct gl_context *ctx, struct gl_shader_program *prog, struct set *resource_set, @@ -65,22 +311,48 @@ if (var->data.patch) loc_bias = VARYING_SLOT_PATCH0; - struct gl_shader_variable *sh_var = - rzalloc(prog, struct gl_shader_variable); + if (prog->data->spirv) { + struct gl_shader_variable *sh_var = + rzalloc(prog, struct gl_shader_variable); + + /* In the ARB_gl_spirv spec, names are considered optional debug info, so + * the linker needs to work without them. Returning them is optional. + * For simplicity, we ignore names. + */ + sh_var->name = NULL; + sh_var->type = var->type; + sh_var->location = var->data.location - loc_bias; + sh_var->index = var->data.index; + + if (!link_util_add_program_resource(prog, resource_set, + programInterface, + sh_var, 1 << stage)) { + return false; + } + } else { + /* Skip packed varyings, packed varyings are handled separately + * by add_packed_varyings in the GLSL IR + * build_program_resource_list() call. + * TODO: handle packed varyings here instead. We likely want a NIR + * based packing pass first. + */ + if (strncmp(var->name, "packed:", 7) == 0) + continue; - /* In the ARB_gl_spirv spec, names are considered optional debug info, so - * the linker needs to work without them. Returning them is optional. - * For simplicity, we ignore names. - */ - sh_var->name = NULL; - sh_var->type = var->type; - sh_var->location = var->data.location - loc_bias; - sh_var->index = var->data.index; - - if (!link_util_add_program_resource(prog, resource_set, - programInterface, - sh_var, 1 << stage)) { - return false; + const bool vs_input_or_fs_output = + (stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in) || + (stage == MESA_SHADER_FRAGMENT && + var->data.mode == nir_var_shader_out); + + if (!add_shader_variable(ctx, prog, resource_set, + 1 << stage, programInterface, + var, var->name, var->type, + vs_input_or_fs_output, + var->data.location - loc_bias, + inout_has_same_location(var, stage), + NULL)) + return false; } } @@ -125,10 +397,11 @@ */ void nir_build_program_resource_list(struct gl_context *ctx, - struct gl_shader_program *prog) + struct gl_shader_program *prog, + bool rebuild_resourse_list) { /* Rebuild resource list. */ - if (prog->data->ProgramResourceList) { + if (prog->data->ProgramResourceList && rebuild_resourse_list) { ralloc_free(prog->data->ProgramResourceList); prog->data->ProgramResourceList = NULL; prog->data->NumProgramResourceList = 0; @@ -204,9 +477,22 @@ for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) { struct gl_uniform_storage *uniform = &prog->data->UniformStorage[i]; - /* Do not add uniforms internally used by Mesa. */ - if (uniform->hidden) + if (uniform->hidden) { + for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) { + if (!uniform->opaque[j].active || + glsl_get_base_type(uniform->type) != GLSL_TYPE_SUBROUTINE) + continue; + + GLenum type = + _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j); + /* add shader subroutines */ + if (!link_util_add_program_resource(prog, resource_set, + type, uniform, 0)) + return; + } + continue; + } if (!link_util_should_add_buffer_variable(prog, uniform, top_level_array_base_offset, @@ -262,5 +548,90 @@ return; } + unsigned mask = prog->data->linked_stages; + while (mask) { + const int i = u_bit_scan(&mask); + struct gl_program *p = prog->_LinkedShaders[i]->Program; + + GLuint type = _mesa_shader_stage_to_subroutine((gl_shader_stage)i); + for (unsigned j = 0; j < p->sh.NumSubroutineFunctions; j++) { + if (!link_util_add_program_resource(prog, resource_set, + type, + &p->sh.SubroutineFunctions[j], + 0)) + return; + } + } + _mesa_set_destroy(resource_set, NULL); } + +bool +gl_nir_link_spirv(struct gl_context *ctx, struct gl_shader_program *prog, + const struct gl_nir_linker_options *options) +{ + if (!gl_nir_link_uniform_blocks(ctx, prog)) + return false; + + if (!gl_nir_link_uniforms(ctx, prog, options->fill_parameters)) + return false; + + gl_nir_link_assign_atomic_counter_resources(ctx, prog); + gl_nir_link_assign_xfb_resources(ctx, prog); + + return true; +} + +/** + * Validate shader image resources. + */ +static void +check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) +{ + unsigned total_image_units = 0; + unsigned fragment_outputs = 0; + unsigned total_shader_storage_blocks = 0; + + if (!ctx->Extensions.ARB_shader_image_load_store) + return; + + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + struct gl_linked_shader *sh = prog->_LinkedShaders[i]; + if (!sh) + continue; + + total_image_units += sh->Program->info.num_images; + total_shader_storage_blocks += sh->Program->info.num_ssbos; + } + + if (total_image_units > ctx->Const.MaxCombinedImageUniforms) + linker_error(prog, "Too many combined image uniforms\n"); + + struct gl_linked_shader *frag_sh = + prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; + if (frag_sh) { + uint64_t frag_outputs_written = frag_sh->Program->info.outputs_written; + fragment_outputs = util_bitcount64(frag_outputs_written); + } + + if (total_image_units + fragment_outputs + total_shader_storage_blocks > + ctx->Const.MaxCombinedShaderOutputResources) + linker_error(prog, "Too many combined image uniforms, shader storage " + " buffers and fragment outputs\n"); +} + +bool +gl_nir_link_glsl(struct gl_context *ctx, struct gl_shader_program *prog) +{ + link_util_calculate_subroutine_compat(prog); + link_util_check_uniform_resources(ctx, prog); + link_util_check_subroutine_resources(prog); + check_image_resources(ctx, prog); + gl_nir_link_assign_atomic_counter_resources(ctx, prog); + gl_nir_link_check_atomic_counter_resources(ctx, prog); + + if (prog->data->LinkStatus == LINKING_FAILURE) + return false; + + return true; +} diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_linker.h mesa-20.0.8/src/compiler/glsl/gl_nir_linker.h --- mesa-19.2.8/src/compiler/glsl/gl_nir_linker.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_linker.h 2020-06-12 01:21:16.000000000 +0000 @@ -31,18 +31,33 @@ struct gl_context; struct gl_shader_program; +struct gl_nir_linker_options { + bool fill_parameters; +}; + +bool gl_nir_link_spirv(struct gl_context *ctx, + struct gl_shader_program *prog, + const struct gl_nir_linker_options *options); + +bool gl_nir_link_glsl(struct gl_context *ctx, struct gl_shader_program *prog); + bool gl_nir_link_uniforms(struct gl_context *ctx, - struct gl_shader_program *prog); + struct gl_shader_program *prog, + bool fill_parameters); void gl_nir_set_uniform_initializers(struct gl_context *ctx, struct gl_shader_program *prog); void nir_build_program_resource_list(struct gl_context *ctx, - struct gl_shader_program *prog); + struct gl_shader_program *prog, + bool rebuild_resourse_list); void gl_nir_link_assign_atomic_counter_resources(struct gl_context *ctx, struct gl_shader_program *prog); +void gl_nir_link_check_atomic_counter_resources(struct gl_context *ctx, + struct gl_shader_program *prog); + void gl_nir_link_assign_xfb_resources(struct gl_context *ctx, struct gl_shader_program *prog); diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_link_uniform_initializers.c mesa-20.0.8/src/compiler/glsl/gl_nir_link_uniform_initializers.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_link_uniform_initializers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_link_uniform_initializers.c 2020-06-12 01:21:16.000000000 +0000 @@ -87,7 +87,7 @@ storage->storage[i].i; } } - } else if (glsl_type_is_image(type)) { + } else if (glsl_type_is_image(storage->type)) { for (unsigned i = 0; i < elements; i++) { const unsigned index = storage->opaque[sh].index + i; diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_link_uniforms.c mesa-20.0.8/src/compiler/glsl/gl_nir_link_uniforms.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_link_uniforms.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_link_uniforms.c 2020-06-12 01:21:16.000000000 +0000 @@ -163,7 +163,7 @@ if (nir_variable_is_in_block(var)) { struct gl_uniform_storage *uniform = NULL; - unsigned num_blks = nir_variable_is_in_ubo(var) ? + ASSERTED unsigned num_blks = nir_variable_is_in_ubo(var) ? prog->data->NumUniformBlocks : prog->data->NumShaderStorageBlocks; @@ -249,6 +249,7 @@ unsigned num_shader_uniform_components; unsigned shader_samplers_used; unsigned shader_shadow_samplers; + struct gl_program_parameter_list *params; /* per-variable */ nir_variable *current_var; @@ -342,6 +343,59 @@ return index; } +static void +add_parameter(struct gl_uniform_storage *uniform, + struct gl_context *ctx, + struct gl_shader_program *prog, + const struct glsl_type *type, + struct nir_link_uniforms_state *state) +{ + if (!state->params || uniform->is_shader_storage || glsl_contains_opaque(type)) + return; + + unsigned num_params = glsl_get_aoa_size(type); + num_params = MAX2(num_params, 1); + num_params *= glsl_get_matrix_columns(glsl_without_array(type)); + + bool is_dual_slot = glsl_type_is_dual_slot(glsl_without_array(type)); + if (is_dual_slot) + num_params *= 2; + + struct gl_program_parameter_list *params = state->params; + int base_index = params->NumParameters; + _mesa_reserve_parameter_storage(params, num_params); + + if (ctx->Const.PackedDriverUniformStorage) { + for (unsigned i = 0; i < num_params; i++) { + unsigned dmul = glsl_type_is_64bit(glsl_without_array(type)) ? 2 : 1; + unsigned comps = glsl_get_vector_elements(glsl_without_array(type)) * dmul; + if (is_dual_slot) { + if (i & 0x1) + comps -= 4; + else + comps = 4; + } + + _mesa_add_parameter(params, PROGRAM_UNIFORM, NULL, comps, + glsl_get_gl_type(type), NULL, NULL, false); + } + } else { + for (unsigned i = 0; i < num_params; i++) { + _mesa_add_parameter(params, PROGRAM_UNIFORM, NULL, 4, + glsl_get_gl_type(type), NULL, NULL, true); + } + } + + /* Each Parameter will hold the index to the backing uniform storage. + * This avoids relying on names to match parameters and uniform + * storages. + */ + for (unsigned i = 0; i < num_params; i++) { + struct gl_program_parameter *param = ¶ms->Parameters[base_index + i]; + param->UniformStorageIndex = uniform - prog->data->UniformStorage; + param->MainUniformStorageIndex = state->current_var->data.location; + } +} /** * Creates the neccessary entries in UniformStorage for the uniform. Returns @@ -500,11 +554,9 @@ uniform->array_stride = glsl_type_is_array(type) ? glsl_get_explicit_stride(type) : 0; - if (glsl_type_is_matrix(type)) { - assert(parent_type); - uniform->matrix_stride = glsl_get_explicit_stride(type); - - uniform->row_major = glsl_matrix_type_is_row_major(type); + if (glsl_type_is_matrix(uniform->type)) { + uniform->matrix_stride = glsl_get_explicit_stride(uniform->type); + uniform->row_major = glsl_matrix_type_is_row_major(uniform->type); } else { uniform->matrix_stride = 0; } @@ -549,6 +601,7 @@ uniform->num_compatible_subroutines = 0; unsigned entries = MAX2(1, uniform->array_elements); + unsigned values = glsl_get_component_slots(type); if (glsl_type_is_sampler(type_no_array)) { int sampler_index = @@ -569,6 +622,8 @@ state->shader_samplers_used |= 1U << i; state->shader_shadow_samplers |= shadow << i; } + + state->num_values += values; } else if (glsl_type_is_image(type_no_array)) { /* @FIXME: image_index should match that of the same image * uniform in other shaders. This means we need to match image @@ -585,7 +640,7 @@ /* Set image access qualifiers */ enum gl_access_qualifier image_access = - state->current_var->data.image.access; + state->current_var->data.access; const GLenum access = (image_access & ACCESS_NON_WRITEABLE) ? ((image_access & ACCESS_NON_READABLE) ? GL_NONE : @@ -597,23 +652,33 @@ i++) { stage_program->sh.ImageAccess[i] = access; } - } - unsigned values = glsl_get_component_slots(type); - state->num_shader_uniform_components += values; - state->num_values += values; + if (!uniform->is_shader_storage) { + state->num_shader_uniform_components += values; + state->num_values += values; + } + } else { + if (!state->var_is_in_block) { + state->num_shader_uniform_components += values; + state->num_values += values; + } + } if (uniform->remap_location != UNMAPPED_UNIFORM_LOC && state->max_uniform_location < uniform->remap_location + entries) state->max_uniform_location = uniform->remap_location + entries; + if (!state->var_is_in_block) + add_parameter(uniform, ctx, prog, type, state); + return MAX2(uniform->array_elements, 1); } } bool gl_nir_link_uniforms(struct gl_context *ctx, - struct gl_shader_program *prog) + struct gl_shader_program *prog, + bool fill_parameters) { /* First free up any previous UniformStorage items */ ralloc_free(prog->data->UniformStorage); @@ -636,10 +701,13 @@ state.num_shader_uniform_components = 0; state.shader_samplers_used = 0; state.shader_shadow_samplers = 0; + state.params = fill_parameters ? sh->Program->Parameters : NULL; nir_foreach_variable(var, &nir->uniforms) { struct gl_uniform_storage *uniform = NULL; + state.current_var = var; + /* Check if the uniform has been processed already for * other stage. If so, validate they are compatible and update * the active stage mask. @@ -648,6 +716,9 @@ if (uniform) { var->data.location = uniform - prog->data->UniformStorage; + if (!state.var_is_in_block) + add_parameter(uniform, ctx, prog, var->type, &state); + continue; } @@ -655,7 +726,6 @@ /* From now on the variable’s location will be its uniform index */ var->data.location = prog->data->NumUniformStorage; - state.current_var = var; state.offset = 0; state.var_is_in_block = nir_variable_is_in_block(var); state.top_level_array_size = 0; diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_lower_bindless_images.c mesa-20.0.8/src/compiler/glsl/gl_nir_lower_bindless_images.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_lower_bindless_images.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_lower_bindless_images.c 2020-06-12 01:21:16.000000000 +0000 @@ -48,8 +48,10 @@ switch (intrinsic->intrinsic) { case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_lower_buffers.c mesa-20.0.8/src/compiler/glsl/gl_nir_lower_buffers.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_lower_buffers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_lower_buffers.c 2020-06-12 01:21:16.000000000 +0000 @@ -49,7 +49,6 @@ if (nir_src_is_const(deref->arr.index)) { unsigned arr_index = nir_src_as_uint(deref->arr.index); - arr_index = MIN2(arr_index, arr_size - 1); /* We're walking the deref from the tail so prepend the array index */ block_name = ralloc_asprintf(b->shader, "[%u]%s", arr_index, @@ -59,7 +58,7 @@ } else { nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); arr_index = nir_umin(b, arr_index, nir_imm_int(b, arr_size - 1)); - nir_ssa_def *arr_offset = nir_imul_imm(b, arr_index, array_elements); + nir_ssa_def *arr_offset = nir_amul_imm(b, arr_index, array_elements); if (nonconst_index) nonconst_index = nir_iadd(b, nonconst_index, arr_offset); else diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c mesa-20.0.8/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c 2020-06-12 01:21:16.000000000 +0000 @@ -168,7 +168,7 @@ return deref; } - uint32_t hash = _mesa_key_hash_string(name); + uint32_t hash = _mesa_hash_string(name); struct hash_entry *h = _mesa_hash_table_search_pre_hashed(state->remap_table, hash, name); @@ -271,8 +271,10 @@ if (instr->intrinsic == nir_intrinsic_image_deref_load || instr->intrinsic == nir_intrinsic_image_deref_store || instr->intrinsic == nir_intrinsic_image_deref_atomic_add || - instr->intrinsic == nir_intrinsic_image_deref_atomic_min || - instr->intrinsic == nir_intrinsic_image_deref_atomic_max || + instr->intrinsic == nir_intrinsic_image_deref_atomic_imin || + instr->intrinsic == nir_intrinsic_image_deref_atomic_umin || + instr->intrinsic == nir_intrinsic_image_deref_atomic_imax || + instr->intrinsic == nir_intrinsic_image_deref_atomic_umax || instr->intrinsic == nir_intrinsic_image_deref_atomic_and || instr->intrinsic == nir_intrinsic_image_deref_atomic_or || instr->intrinsic == nir_intrinsic_image_deref_atomic_xor || @@ -323,12 +325,9 @@ state.shader = shader; state.shader_program = shader_program; - state.remap_table = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + state.remap_table = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); - shader->info.textures_used = 0; - shader->info.textures_used_by_txf = 0; - nir_foreach_function(function, shader) { if (function->impl) progress |= lower_impl(function->impl, &state); diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_lower_samplers.c mesa-20.0.8/src/compiler/glsl/gl_nir_lower_samplers.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_lower_samplers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_lower_samplers.c 2020-06-12 01:21:16.000000000 +0000 @@ -24,142 +24,16 @@ */ #include "compiler/nir/nir.h" -#include "compiler/nir/nir_builder.h" #include "gl_nir.h" -#include "ir_uniform.h" - -#include "main/compiler.h" -#include "main/mtypes.h" - -static void -lower_tex_src_to_offset(nir_builder *b, - nir_tex_instr *instr, unsigned src_idx) -{ - nir_ssa_def *index = NULL; - unsigned base_index = 0; - unsigned array_elements = 1; - nir_tex_src *src = &instr->src[src_idx]; - bool is_sampler = src->src_type == nir_tex_src_sampler_deref; - - /* We compute first the offsets */ - nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); - while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); - nir_deref_instr *parent = - nir_instr_as_deref(deref->parent.ssa->parent_instr); - - assert(deref->deref_type == nir_deref_type_array); - - if (nir_src_is_const(deref->arr.index) && index == NULL) { - /* We're still building a direct index */ - base_index += nir_src_as_uint(deref->arr.index) * array_elements; - } else { - if (index == NULL) { - /* We used to be direct but not anymore */ - index = nir_imm_int(b, base_index); - base_index = 0; - } - - index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); - } - - array_elements *= glsl_get_length(parent->type); - - deref = parent; - } - - if (index) - index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); - - /* We hit the deref_var. This is the end of the line */ - assert(deref->deref_type == nir_deref_type_var); - - base_index += deref->var->data.binding; - - /* We have the offsets, we apply them, rewriting the source or removing - * instr if needed - */ - if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); - - src->src_type = is_sampler ? - nir_tex_src_sampler_offset : - nir_tex_src_texture_offset; - - instr->texture_array_size = array_elements; - } else { - nir_tex_instr_remove_src(instr, src_idx); - } - - if (is_sampler) { - instr->sampler_index = base_index; - } else { - instr->texture_index = base_index; - instr->texture_array_size = array_elements; - } -} - -static bool -lower_sampler(nir_builder *b, nir_tex_instr *instr) -{ - int texture_idx = - nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); - - if (texture_idx >= 0) { - b->cursor = nir_before_instr(&instr->instr); - - lower_tex_src_to_offset(b, instr, texture_idx); - } - - int sampler_idx = - nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - - if (sampler_idx >= 0) { - lower_tex_src_to_offset(b, instr, sampler_idx); - } - - if (texture_idx < 0 && sampler_idx < 0) - return false; - - return true; -} - -static bool -lower_impl(nir_function_impl *impl) -{ - nir_builder b; - nir_builder_init(&b, impl); - bool progress = false; - - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - if (instr->type == nir_instr_type_tex) - progress |= lower_sampler(&b, nir_instr_as_tex(instr)); - } - } - - return progress; -} bool gl_nir_lower_samplers(nir_shader *shader, const struct gl_shader_program *shader_program) { - bool progress = false; - /* First, use gl_nir_lower_samplers_as_derefs to set var->data.binding * based on the uniforms, and split structures to simplify derefs. */ gl_nir_lower_samplers_as_deref(shader, shader_program); - /* Next, lower derefs to offsets. */ - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lower_impl(function->impl); - } - - return progress; + return nir_lower_samplers(shader); } diff -Nru mesa-19.2.8/src/compiler/glsl/gl_nir_opt_access.c mesa-20.0.8/src/compiler/glsl/gl_nir_opt_access.c --- mesa-19.2.8/src/compiler/glsl/gl_nir_opt_access.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/gl_nir_opt_access.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,333 +0,0 @@ -/* - * Copyright © 2019 Valve Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "nir/nir.h" -#include "gl_nir.h" - -/* This pass optimizes GL access qualifiers. So far it does two things: - * - * - Infer readonly when it's missing. - * - Infer ACCESS_CAN_REORDER when the following are true: - * - Either there are no writes, or ACCESS_NON_WRITEABLE and ACCESS_RESTRICT - * are both set. In either case there are no writes to the underlying - * memory. - * - If ACCESS_COHERENT is set, then there must be no memory barriers - * involving the access. Coherent accesses may return different results - * before and after barriers. - * - ACCESS_VOLATILE is not set. - * - * If these conditions are true, then image and buffer reads may be treated as - * if they were uniform buffer reads, i.e. they may be arbitrarily moved, - * combined, rematerialized etc. - */ - -struct access_state { - struct set *vars_written; - bool images_written; - bool buffers_written; - bool image_barriers; - bool buffer_barriers; -}; - -static void -gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) -{ - nir_variable *var; - switch (instr->intrinsic) { - case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_fadd: - var = nir_intrinsic_get_var(instr, 0); - - /* In OpenGL, buffer images use normal buffer objects, whereas other - * image types use textures which cannot alias with buffer objects. - * Therefore we have to group buffer samplers together with SSBO's. - */ - if (glsl_get_sampler_dim(glsl_without_array(var->type)) == - GLSL_SAMPLER_DIM_BUF) - state->buffers_written = true; - else - state->images_written = true; - - if (var->data.mode == nir_var_uniform) - _mesa_set_add(state->vars_written, var); - break; - - case nir_intrinsic_bindless_image_store: - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_comp_swap: - case nir_intrinsic_bindless_image_atomic_fadd: - if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) - state->buffers_written = true; - else - state->images_written = true; - break; - - case nir_intrinsic_store_deref: - case nir_intrinsic_deref_atomic_add: - case nir_intrinsic_deref_atomic_imin: - case nir_intrinsic_deref_atomic_umin: - case nir_intrinsic_deref_atomic_imax: - case nir_intrinsic_deref_atomic_umax: - case nir_intrinsic_deref_atomic_and: - case nir_intrinsic_deref_atomic_or: - case nir_intrinsic_deref_atomic_xor: - case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: - case nir_intrinsic_deref_atomic_fadd: - case nir_intrinsic_deref_atomic_fmin: - case nir_intrinsic_deref_atomic_fmax: - case nir_intrinsic_deref_atomic_fcomp_swap: - var = nir_intrinsic_get_var(instr, 0); - if (var->data.mode != nir_var_mem_ssbo) - break; - - _mesa_set_add(state->vars_written, var); - state->buffers_written = true; - - case nir_intrinsic_memory_barrier: - state->buffer_barriers = true; - state->image_barriers = true; - break; - - case nir_intrinsic_memory_barrier_buffer: - state->buffer_barriers = true; - break; - - case nir_intrinsic_memory_barrier_image: - state->image_barriers = true; - break; - - default: - break; - } -} - -static bool -process_variable(struct access_state *state, nir_variable *var) -{ - if (var->data.mode != nir_var_mem_ssbo && - !(var->data.mode == nir_var_uniform && - glsl_type_is_image(var->type))) - return false; - - /* Ignore variables we've already marked */ - if (var->data.image.access & ACCESS_CAN_REORDER) - return false; - - if (!(var->data.image.access & ACCESS_NON_WRITEABLE) && - !_mesa_set_search(state->vars_written, var)) { - var->data.image.access |= ACCESS_NON_WRITEABLE; - return true; - } - - return false; -} - -static bool -can_reorder(struct access_state *state, enum gl_access_qualifier access, - bool is_buffer, bool is_ssbo) -{ - bool is_any_written = is_buffer ? state->buffers_written : - state->images_written; - - /* Can we guarantee that the underlying memory is never written? */ - if (!is_any_written || - ((access & ACCESS_NON_WRITEABLE) && - (access & ACCESS_RESTRICT))) { - /* Note: memoryBarrierBuffer() is only guaranteed to flush buffer - * variables and not imageBuffer's, so we only consider the GL-level - * type here. - */ - bool is_any_barrier = is_ssbo ? - state->buffer_barriers : state->image_barriers; - - return (!is_any_barrier || !(access & ACCESS_COHERENT)) && - !(access & ACCESS_VOLATILE); - } - - return false; -} - -static bool -process_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) -{ - switch (instr->intrinsic) { - case nir_intrinsic_bindless_image_load: - if (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) - return false; - - /* We have less information about bindless intrinsics, since we can't - * always trace uses back to the variable. Don't try and infer if it's - * read-only, unless there are no image writes at all. - */ - bool progress = false; - bool is_buffer = - nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF; - - bool is_any_written = - is_buffer ? state->buffers_written : state->images_written; - - if (!(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && - !is_any_written) { - progress = true; - nir_intrinsic_set_access(instr, - nir_intrinsic_access(instr) | - ACCESS_NON_WRITEABLE); - } - - if (can_reorder(state, nir_intrinsic_access(instr), is_buffer, false)) { - progress = true; - nir_intrinsic_set_access(instr, - nir_intrinsic_access(instr) | - ACCESS_CAN_REORDER); - } - - return progress; - - case nir_intrinsic_load_deref: - case nir_intrinsic_image_deref_load: { - nir_variable *var = nir_intrinsic_get_var(instr, 0); - - if (instr->intrinsic == nir_intrinsic_load_deref && - var->data.mode != nir_var_mem_ssbo) - return false; - - if (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) - return false; - - bool progress = false; - - /* Check if we were able to mark the whole variable non-writeable */ - if (!(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && - var->data.image.access & ACCESS_NON_WRITEABLE) { - progress = true; - nir_intrinsic_set_access(instr, - nir_intrinsic_access(instr) | - ACCESS_NON_WRITEABLE); - } - - bool is_ssbo = var->data.mode == nir_var_mem_ssbo; - - bool is_buffer = is_ssbo || - glsl_get_sampler_dim(glsl_without_array(var->type)) == GLSL_SAMPLER_DIM_BUF; - - if (can_reorder(state, nir_intrinsic_access(instr), is_buffer, is_ssbo)) { - progress = true; - nir_intrinsic_set_access(instr, - nir_intrinsic_access(instr) | - ACCESS_CAN_REORDER); - } - - return progress; - } - - default: - return false; - } -} - -static bool -opt_access_impl(struct access_state *state, - nir_function_impl *impl) -{ - bool progress = false; - - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - if (instr->type == nir_instr_type_intrinsic) - progress |= process_intrinsic(state, - nir_instr_as_intrinsic(instr)); - } - } - - if (progress) { - nir_metadata_preserve(impl, - nir_metadata_block_index | - nir_metadata_dominance | - nir_metadata_live_ssa_defs | - nir_metadata_loop_analysis); - } - - - return progress; -} - -bool -gl_nir_opt_access(nir_shader *shader) -{ - struct access_state state = { - .vars_written = _mesa_pointer_set_create(NULL), - }; - - bool var_progress = false; - bool progress = false; - - nir_foreach_function(func, shader) { - if (func->impl) { - nir_foreach_block(block, func->impl) { - nir_foreach_instr(instr, block) { - if (instr->type == nir_instr_type_intrinsic) - gather_intrinsic(&state, nir_instr_as_intrinsic(instr)); - } - } - } - } - - nir_foreach_variable(var, &shader->uniforms) - var_progress |= process_variable(&state, var); - - nir_foreach_function(func, shader) { - if (func->impl) { - progress |= opt_access_impl(&state, func->impl); - - /* If we make a change to the uniforms, update all the impls. */ - if (var_progress) { - nir_metadata_preserve(func->impl, - nir_metadata_block_index | - nir_metadata_dominance | - nir_metadata_live_ssa_defs | - nir_metadata_loop_analysis); - } - } - } - - progress |= var_progress; - - _mesa_set_destroy(state.vars_written, NULL); - return progress; -} - diff -Nru mesa-19.2.8/src/compiler/glsl/glsl_lexer.ll mesa-20.0.8/src/compiler/glsl/glsl_lexer.ll --- mesa-19.2.8/src/compiler/glsl/glsl_lexer.ll 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glsl_lexer.ll 2020-06-12 01:21:16.000000000 +0000 @@ -45,7 +45,8 @@ yylloc->last_column = yycolumn + 1; \ } while(0); -#define YY_USER_INIT yylineno = 0; yycolumn = 0; yylloc->source = 0; +#define YY_USER_INIT yylineno = 0; yycolumn = 0; yylloc->source = 0; \ + yylloc->path = NULL; /* A macro for handling reserved words and keywords across language versions. * @@ -226,6 +227,7 @@ SPC [ \t]* SPCP [ \t]+ HASH ^{SPC}#{SPC} +PATH ["][./ _A-Za-z0-9]*["] %% [ \r\t]+ ; @@ -234,6 +236,14 @@ ^[ \t]*#[ \t]*$ ; ^[ \t]*#[ \t]*version { BEGIN PP; return VERSION_TOK; } ^[ \t]*#[ \t]*extension { BEGIN PP; return EXTENSION; } +{HASH}include { + if (!yyextra->ARB_shading_language_include_enable) { + struct _mesa_glsl_parse_state *state = yyextra; + _mesa_glsl_error(yylloc, state, + "ARB_shading_language_include required " + "to use #include"); + } +} {HASH}line{SPCP}{INT}{SPCP}{INT}{SPC}$ { /* Eat characters until the first digit is * encountered @@ -257,7 +267,50 @@ yylineno--; yylloc->source = strtol(ptr, NULL, 0); + yylloc->path = NULL; } +{HASH}line{SPCP}{INT}{SPCP}{PATH}{SPC}$ { + if (!yyextra->ARB_shading_language_include_enable) { + struct _mesa_glsl_parse_state *state = yyextra; + _mesa_glsl_error(yylloc, state, + "ARB_shading_language_include required " + "to use #line \"\""); + } + + /* Eat characters until the first digit is + * encountered + */ + char *ptr = yytext; + while (!isdigit(*ptr)) + ptr++; + + /* Subtract one from the line number because + * yylineno is zero-based instead of + * one-based. + */ + yylineno = strtol(ptr, &ptr, 0) - 1; + + /* From GLSL 3.30 and GLSL ES on, after processing the + * line directive (including its new-line), the implementation + * will behave as if it is compiling at the line number passed + * as argument. It was line number + 1 in older specifications. + */ + if (yyextra->is_version(330, 100)) + yylineno--; + + while (isspace(*ptr)) + ptr++; + + /* Skip over leading " */ + ptr++; + + char *end = strrchr(ptr, '"'); + int path_len = (end - ptr) + 1; + void *mem_ctx = yyextra->linalloc; + yylloc->path = (char *) linear_alloc_child(mem_ctx, path_len); + memcpy(yylloc->path, ptr, path_len); + yylloc->path[path_len - 1] = '\0'; + } {HASH}line{SPCP}{INT}{SPC}$ { /* Eat characters until the first digit is * encountered @@ -356,6 +409,7 @@ if return IF; discard return DISCARD; return return RETURN; +demote KEYWORD_WITH_ALT(0, 0, 0, 0, yyextra->EXT_demote_to_helper_invocation_enable, DEMOTE); bvec2 { yylval->type = glsl_type::bvec2_type; return BASIC_TYPE_TOK; } bvec3 { yylval->type = glsl_type::bvec3_type; return BASIC_TYPE_TOK; } diff -Nru mesa-19.2.8/src/compiler/glsl/glsl_parser_extras.cpp mesa-20.0.8/src/compiler/glsl/glsl_parser_extras.cpp --- mesa-19.2.8/src/compiler/glsl/glsl_parser_extras.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glsl_parser_extras.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -495,11 +495,15 @@ /* Get the offset that the new message will be written to. */ int msg_offset = strlen(state->info_log); - ralloc_asprintf_append(&state->info_log, "%u:%u(%u): %s: ", - locp->source, - locp->first_line, - locp->first_column, - error ? "error" : "warning"); + if (locp->path) { + ralloc_asprintf_append(&state->info_log, "\"%s\"", locp->path); + } else { + ralloc_asprintf_append(&state->info_log, "%u", locp->source); + } + ralloc_asprintf_append(&state->info_log, ":%u(%u): %s: ", + locp->first_line, locp->first_column, + error ? "error" : "warning"); + ralloc_vasprintf_append(&state->info_log, fmt, ap); const char *const msg = &state->info_log[msg_offset]; @@ -665,6 +669,7 @@ EXT(ARB_shader_texture_lod), EXT(ARB_shader_viewport_layer_array), EXT(ARB_shading_language_420pack), + EXT(ARB_shading_language_include), EXT(ARB_shading_language_packing), EXT(ARB_tessellation_shader), EXT(ARB_texture_cube_map_array), @@ -713,6 +718,7 @@ EXT(AMD_vertex_shader_viewport_index), EXT(ANDROID_extension_pack_es31a), EXT(EXT_blend_func_extended), + EXT(EXT_demote_to_helper_invocation), EXT(EXT_frag_depth), EXT(EXT_draw_buffers), EXT(EXT_clip_cull_distance), @@ -739,6 +745,7 @@ EXT(EXT_texture_shadow_lod), EXT(INTEL_conservative_rasterization), EXT(INTEL_shader_atomic_float_minmax), + EXT(INTEL_shader_integer_functions2), EXT(MESA_shader_integer_functions), EXT(NV_compute_shader_derivatives), EXT(NV_fragment_shader_interlock), @@ -1512,6 +1519,13 @@ void +ast_demote_statement::print(void) const +{ + printf("demote; "); +} + + +void ast_selection_statement::print(void) const { printf("if ( "); @@ -2095,13 +2109,11 @@ shader->symbols); } -void -_mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader, - bool dump_ast, bool dump_hir, bool force_recompile) +static bool +can_skip_compile(struct gl_context *ctx, struct gl_shader *shader, + const char *source, bool force_recompile, + bool source_has_shader_include) { - const char *source = force_recompile && shader->FallbackSource ? - shader->FallbackSource : shader->Source; - if (!force_recompile) { if (ctx->Cache) { char buf[41]; @@ -2116,28 +2128,69 @@ shader->CompileStatus = COMPILE_SKIPPED; free((void *)shader->FallbackSource); - shader->FallbackSource = NULL; - return; + + /* Copy pre-processed shader include to fallback source otherwise + * we have no guarantee the shader include source tree has not + * changed. + */ + shader->FallbackSource = source_has_shader_include ? + strdup(source) : NULL; + return true; } } } else { /* We should only ever end up here if a re-compile has been forced by a * shader cache miss. In which case we can skip the compile if its - * already be done by a previous fallback or the initial compile call. + * already been done by a previous fallback or the initial compile call. */ if (shader->CompileStatus == COMPILE_SUCCESS) - return; + return true; } - struct _mesa_glsl_parse_state *state = + return false; +} + +void +_mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader, + bool dump_ast, bool dump_hir, bool force_recompile) +{ + const char *source = force_recompile && shader->FallbackSource ? + shader->FallbackSource : shader->Source; + + /* Note this will be true for shaders the have #include inside comments + * however that should be rare enough not to worry about. + */ + bool source_has_shader_include = + strstr(source, "#include") == NULL ? false : true; + + /* If there was no shader include we can check the shader cache and skip + * compilation before we run the preprocessor. We never skip compiling + * shaders that use ARB_shading_language_include because we would need to + * keep duplicate copies of the shader include source tree and paths. + */ + if (!source_has_shader_include && + can_skip_compile(ctx, shader, source, force_recompile, false)) + return; + + struct _mesa_glsl_parse_state *state = new(shader) _mesa_glsl_parse_state(ctx, shader->Stage, shader); if (ctx->Const.GenerateTemporaryNames) (void) p_atomic_cmpxchg(&ir_variable::temporaries_allocate_names, false, true); - state->error = glcpp_preprocess(state, &source, &state->info_log, - add_builtin_defines, state, ctx); + if (!source_has_shader_include || !force_recompile) { + state->error = glcpp_preprocess(state, &source, &state->info_log, + add_builtin_defines, state, ctx); + } + + /* Now that we have run the preprocessor we can check the shader cache and + * skip compilation if possible for those shaders that contained a shader + * include. + */ + if (source_has_shader_include && + can_skip_compile(ctx, shader, source, force_recompile, true)) + return; if (!state->error) { _mesa_glsl_lexer_ctor(state, source); @@ -2187,7 +2240,12 @@ if (!force_recompile) { free((void *)shader->FallbackSource); - shader->FallbackSource = NULL; + + /* Copy pre-processed shader include to fallback source otherwise we + * have no guarantee the shader include source tree has not changed. + */ + shader->FallbackSource = source_has_shader_include ? + strdup(source) : NULL; } delete state->symbols; @@ -2287,7 +2345,20 @@ OPT(lower_vector_insert, ir, false); OPT(optimize_swizzles, ir); - OPT(optimize_split_arrays, ir, linked); + /* Some drivers only call do_common_optimization() once rather than in a + * loop, and split arrays causes each element of a constant array to + * dereference is own copy of the entire array initilizer. This IR is not + * something that can be generated manually in a shader and is not + * accounted for by NIR optimisations, the result is an exponential slow + * down in compilation speed as a constant arrays element count grows. To + * avoid that here we make sure to always clean up the mess split arrays + * causes to constant arrays. + */ + bool array_split = optimize_split_arrays(ir, linked); + if (array_split) + do_constant_propagation(ir); + progress |= array_split; + OPT(optimize_redundant_jumps, ir); if (options->MaxUnrollIterations) { @@ -2326,49 +2397,3 @@ return progress; } - -extern "C" { - -/** - * To be called at GL context ctor. - */ -void -_mesa_init_shader_compiler_types(void) -{ - glsl_type_singleton_init_or_ref(); -} - -/** - * To be called at GL context dtor. - */ -void -_mesa_destroy_shader_compiler_types(void) -{ - glsl_type_singleton_decref(); -} - -/** - * To be called at GL teardown time, this frees compiler datastructures. - * - * After calling this, any previously compiled shaders and shader - * programs would be invalid. So this should happen at approximately - * program exit. - */ -void -_mesa_destroy_shader_compiler(void) -{ - _mesa_destroy_shader_compiler_caches(); -} - -/** - * Releases compiler caches to trade off performance for memory. - * - * Intended to be used with glReleaseShaderCompiler(). - */ -void -_mesa_destroy_shader_compiler_caches(void) -{ - _mesa_glsl_release_builtin_functions(); -} - -} diff -Nru mesa-19.2.8/src/compiler/glsl/glsl_parser_extras.h mesa-20.0.8/src/compiler/glsl/glsl_parser_extras.h --- mesa-19.2.8/src/compiler/glsl/glsl_parser_extras.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glsl_parser_extras.h 2020-06-12 01:21:16.000000000 +0000 @@ -68,6 +68,8 @@ int last_line; int last_column; unsigned source; + /* Path for ARB_shading_language_include include source */ + char *path; } YYLTYPE; # define YYLTYPE_IS_DECLARED 1 # define YYLTYPE_IS_TRIVIAL 1 @@ -727,6 +729,8 @@ bool ARB_shader_viewport_layer_array_warn; bool ARB_shading_language_420pack_enable; bool ARB_shading_language_420pack_warn; + bool ARB_shading_language_include_enable; + bool ARB_shading_language_include_warn; bool ARB_shading_language_packing_enable; bool ARB_shading_language_packing_warn; bool ARB_tessellation_shader_enable; @@ -816,6 +820,8 @@ bool EXT_blend_func_extended_warn; bool EXT_clip_cull_distance_enable; bool EXT_clip_cull_distance_warn; + bool EXT_demote_to_helper_invocation_enable; + bool EXT_demote_to_helper_invocation_warn; bool EXT_draw_buffers_enable; bool EXT_draw_buffers_warn; bool EXT_frag_depth_enable; @@ -866,6 +872,8 @@ bool INTEL_conservative_rasterization_warn; bool INTEL_shader_atomic_float_minmax_enable; bool INTEL_shader_atomic_float_minmax_warn; + bool INTEL_shader_integer_functions2_enable; + bool INTEL_shader_integer_functions2_warn; bool MESA_shader_integer_functions_enable; bool MESA_shader_integer_functions_warn; bool NV_compute_shader_derivatives_enable; @@ -956,6 +964,7 @@ (Current).first_column = YYRHSLOC(Rhs, 1).first_column; \ (Current).last_line = YYRHSLOC(Rhs, N).last_line; \ (Current).last_column = YYRHSLOC(Rhs, N).last_column; \ + (Current).path = YYRHSLOC(Rhs, N).path; \ } \ else \ { \ @@ -963,6 +972,7 @@ YYRHSLOC(Rhs, 0).last_line; \ (Current).first_column = (Current).last_column = \ YYRHSLOC(Rhs, 0).last_column; \ + (Current).path = YYRHSLOC(Rhs, 0).path; \ } \ (Current).source = 0; \ } while (0) @@ -1024,11 +1034,6 @@ struct _mesa_glsl_parse_state *state, struct gl_context *gl_ctx); -extern void _mesa_init_shader_compiler_types(void); -extern void _mesa_destroy_shader_compiler_types(void); -extern void _mesa_destroy_shader_compiler(void); -extern void _mesa_destroy_shader_compiler_caches(void); - extern void _mesa_glsl_copy_symbols_from_table(struct exec_list *shader_ir, struct glsl_symbol_table *src, diff -Nru mesa-19.2.8/src/compiler/glsl/glsl_parser.yy mesa-20.0.8/src/compiler/glsl/glsl_parser.yy --- mesa-19.2.8/src/compiler/glsl/glsl_parser.yy 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glsl_parser.yy 2020-06-12 01:21:16.000000000 +0000 @@ -91,6 +91,7 @@ @$.last_line = 1; @$.last_column = 1; @$.source = 0; + @$.path = NULL; } %lex-param {struct _mesa_glsl_parse_state *state} @@ -139,7 +140,7 @@ %token ATTRIBUTE CONST_TOK %token BASIC_TYPE_TOK -%token BREAK BUFFER CONTINUE DO ELSE FOR IF DISCARD RETURN SWITCH CASE DEFAULT +%token BREAK BUFFER CONTINUE DO ELSE FOR IF DEMOTE DISCARD RETURN SWITCH CASE DEFAULT %token CENTROID IN_TOK OUT_TOK INOUT_TOK UNIFORM VARYING SAMPLE %token NOPERSPECTIVE FLAT SMOOTH %token IMAGE1DSHADOW IMAGE2DSHADOW IMAGE1DARRAYSHADOW IMAGE2DARRAYSHADOW @@ -256,6 +257,7 @@ %type declaration %type declaration_statement %type jump_statement +%type demote_statement %type interface_block %type basic_interface_block %type struct_specifier @@ -2510,6 +2512,7 @@ | switch_statement | iteration_statement | jump_statement + | demote_statement ; compound_statement: @@ -2807,6 +2810,15 @@ $$->set_location(@1); } ; + +demote_statement: + DEMOTE ';' + { + void *ctx = state->linalloc; + $$ = new(ctx) ast_demote_statement(); + $$->set_location(@1); + } + ; external_declaration: function_definition { $$ = $1; } diff -Nru mesa-19.2.8/src/compiler/glsl/glsl_to_nir.cpp mesa-20.0.8/src/compiler/glsl/glsl_to_nir.cpp --- mesa-19.2.8/src/compiler/glsl/glsl_to_nir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/glsl_to_nir.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,7 @@ #include "program.h" #include "compiler/nir/nir_control_flow.h" #include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_builtin_builder.h" #include "compiler/nir/nir_deref.h" #include "main/errors.h" #include "main/imports.h" @@ -63,6 +64,7 @@ virtual void visit(ir_loop *); virtual void visit(ir_if *); virtual void visit(ir_discard *); + virtual void visit(ir_demote *); virtual void visit(ir_loop_jump *); virtual void visit(ir_return *); virtual void visit(ir_call *); @@ -164,6 +166,12 @@ } } + if (!glsl_type_is_vector_or_scalar(ir->return_type) && + !ir->return_type->is_void()) { + unsupported = true; + return visit_stop; + } + return visit_continue; } @@ -233,13 +241,6 @@ } } - /* Remap the locations to slots so those requiring two slots will occupy - * two locations. For instance, if we have in the IR code a dvec3 attr0 in - * location 0 and vec4 attr1 in location 1, in NIR attr0 will use - * locations/slots 0 and 1, and attr1 will use location/slot 2 */ - if (shader->info.stage == MESA_SHADER_VERTEX) - nir_remap_dual_slot_attributes(shader, &sh->Program->DualSlotInputs); - shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); if (shader_prog->Label) shader->info.label = ralloc_strdup(shader, shader_prog->Label); @@ -415,6 +416,15 @@ return glsl_type::get_array_instance(elem_type, array_type->length); } +static unsigned +get_nir_how_declared(unsigned how_declared) +{ + if (how_declared == ir_var_hidden) + return nir_var_hidden; + + return nir_var_declared_normally; +} + void nir_visitor::visit(ir_variable *ir) { @@ -440,9 +450,16 @@ var->data.centroid = ir->data.centroid; var->data.sample = ir->data.sample; var->data.patch = ir->data.patch; + var->data.how_declared = get_nir_how_declared(ir->data.how_declared); var->data.invariant = ir->data.invariant; var->data.location = ir->data.location; var->data.stream = ir->data.stream; + if (ir->data.stream & (1u << 31)) + var->data.stream |= NIR_STREAM_PACKED; + + var->data.precision = ir->data.precision; + var->data.explicit_location = ir->data.explicit_location; + var->data.from_named_ifc_block = ir->data.from_named_ifc_block; var->data.compact = false; switch(ir->data.mode) { @@ -516,23 +533,27 @@ unreachable("not reached"); } - unsigned image_access = 0; + unsigned mem_access = 0; if (ir->data.memory_read_only) - image_access |= ACCESS_NON_WRITEABLE; + mem_access |= ACCESS_NON_WRITEABLE; if (ir->data.memory_write_only) - image_access |= ACCESS_NON_READABLE; + mem_access |= ACCESS_NON_READABLE; if (ir->data.memory_coherent) - image_access |= ACCESS_COHERENT; + mem_access |= ACCESS_COHERENT; if (ir->data.memory_volatile) - image_access |= ACCESS_VOLATILE; + mem_access |= ACCESS_VOLATILE; if (ir->data.memory_restrict) - image_access |= ACCESS_RESTRICT; + mem_access |= ACCESS_RESTRICT; + + var->interface_type = ir->get_interface_type(); /* For UBO and SSBO variables, we need explicit types */ if (var->data.mode & (nir_var_mem_ubo | nir_var_mem_ssbo)) { const glsl_type *explicit_ifc_type = ir->get_interface_type()->get_explicit_interface_type(supports_std430); + var->interface_type = explicit_ifc_type; + if (ir->type->without_array()->is_interface()) { /* If the type contains the interface, wrap the explicit type in the * right number of arrays. @@ -549,15 +570,15 @@ var->type = field->type; if (field->memory_read_only) - image_access |= ACCESS_NON_WRITEABLE; + mem_access |= ACCESS_NON_WRITEABLE; if (field->memory_write_only) - image_access |= ACCESS_NON_READABLE; + mem_access |= ACCESS_NON_READABLE; if (field->memory_coherent) - image_access |= ACCESS_COHERENT; + mem_access |= ACCESS_COHERENT; if (field->memory_volatile) - image_access |= ACCESS_VOLATILE; + mem_access |= ACCESS_VOLATILE; if (field->memory_restrict) - image_access |= ACCESS_RESTRICT; + mem_access |= ACCESS_RESTRICT; found = true; break; @@ -595,15 +616,18 @@ var->data.explicit_binding = ir->data.explicit_binding; var->data.bindless = ir->data.bindless; var->data.offset = ir->data.offset; + var->data.access = (gl_access_qualifier)mem_access; - var->data.image.access = (gl_access_qualifier)image_access; - var->data.image.format = ir->data.image_format; + if (var->type->without_array()->is_image()) { + var->data.image.format = ir->data.image_format; + } else if (var->data.mode == nir_var_shader_out) { + var->data.xfb.buffer = ir->data.xfb_buffer; + var->data.xfb.stride = ir->data.xfb_stride; + } var->data.fb_fetch_output = ir->data.fb_fetch_output; var->data.explicit_xfb_buffer = ir->data.explicit_xfb_buffer; var->data.explicit_xfb_stride = ir->data.explicit_xfb_stride; - var->data.xfb_buffer = ir->data.xfb_buffer; - var->data.xfb_stride = ir->data.xfb_stride; var->num_state_slots = ir->get_num_state_slots(); if (var->num_state_slots > 0) { @@ -622,8 +646,6 @@ var->constant_initializer = constant_copy(ir->constant_initializer, var); - var->interface_type = ir->get_interface_type(); - if (var->data.mode == nir_var_function_temp) nir_function_impl_add_variable(impl, var); else @@ -776,6 +798,15 @@ } void +nir_visitor::visit(ir_demote *ir) +{ + nir_intrinsic_instr *demote = + nir_intrinsic_instr_create(this->shader, nir_intrinsic_demote); + + nir_builder_instr_insert(&b, &demote->instr); +} + +void nir_visitor::visit(ir_emit_vertex *ir) { nir_intrinsic_instr *instr = @@ -846,7 +877,7 @@ nir_deref_path path; nir_deref_path_init(&path, deref, NULL); - unsigned qualifiers = path.path[0]->var->data.image.access; + unsigned qualifiers = path.path[0]->var->data.access; const glsl_type *parent_type = path.path[0]->type; for (nir_deref_instr **cur_ptr = &path.path[1]; *cur_ptr; cur_ptr++) { @@ -970,10 +1001,20 @@ : nir_intrinsic_image_deref_atomic_fadd; break; case ir_intrinsic_image_atomic_min: - op = nir_intrinsic_image_deref_atomic_min; + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_image_deref_atomic_imin; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_image_deref_atomic_umin; + else + unreachable("Invalid type"); break; case ir_intrinsic_image_atomic_max: - op = nir_intrinsic_image_deref_atomic_max; + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_image_deref_atomic_imax; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_image_deref_atomic_umax; + else + unreachable("Invalid type"); break; case ir_intrinsic_image_atomic_and: op = nir_intrinsic_image_deref_atomic_and; @@ -1146,6 +1187,9 @@ case ir_intrinsic_read_first_invocation: op = nir_intrinsic_read_first_invocation; break; + case ir_intrinsic_helper_invocation: + op = nir_intrinsic_is_helper_invocation; + break; default: unreachable("not reached"); } @@ -1255,8 +1299,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -1335,13 +1381,18 @@ instr->src[3] = nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param)); param = param->get_next(); + } else if (op == nir_intrinsic_image_deref_load) { + instr->src[3] = nir_src_for_ssa(nir_imm_int(&b, 0)); /* LOD */ } if (!param->is_tail_sentinel()) { instr->src[4] = nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param)); param = param->get_next(); + } else if (op == nir_intrinsic_image_deref_store) { + instr->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); /* LOD */ } + nir_builder_instr_insert(&b, &instr->instr); break; } @@ -1619,6 +1670,12 @@ nir_builder_instr_insert(&b, &instr->instr); break; } + case nir_intrinsic_is_helper_invocation: { + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 1, NULL); + instr->num_components = 1; + nir_builder_instr_insert(&b, &instr->instr); + break; + } default: unreachable("not reached"); } @@ -1946,6 +2003,9 @@ result = type_is_float(types[0]) ? nir_fabs(&b, srcs[0]) : nir_iabs(&b, srcs[0]); break; + case ir_unop_clz: + result = nir_uclz(&b, srcs[0]); + break; case ir_unop_saturate: assert(type_is_float(types[0])); result = nir_fsat(&b, srcs[0]); @@ -2156,14 +2216,45 @@ return; } + case ir_unop_atan: + result = nir_atan(&b, srcs[0]); + break; + case ir_binop_add: result = type_is_float(out_type) ? nir_fadd(&b, srcs[0], srcs[1]) : nir_iadd(&b, srcs[0], srcs[1]); break; + case ir_binop_add_sat: + result = type_is_signed(out_type) ? nir_iadd_sat(&b, srcs[0], srcs[1]) + : nir_uadd_sat(&b, srcs[0], srcs[1]); + break; case ir_binop_sub: result = type_is_float(out_type) ? nir_fsub(&b, srcs[0], srcs[1]) : nir_isub(&b, srcs[0], srcs[1]); break; + case ir_binop_sub_sat: + result = type_is_signed(out_type) ? nir_isub_sat(&b, srcs[0], srcs[1]) + : nir_usub_sat(&b, srcs[0], srcs[1]); + break; + case ir_binop_abs_sub: + /* out_type is always unsigned for ir_binop_abs_sub, so we have to key + * on the type of the sources. + */ + result = type_is_signed(types[0]) ? nir_uabs_isub(&b, srcs[0], srcs[1]) + : nir_uabs_usub(&b, srcs[0], srcs[1]); + break; + case ir_binop_avg: + result = type_is_signed(out_type) ? nir_ihadd(&b, srcs[0], srcs[1]) + : nir_uhadd(&b, srcs[0], srcs[1]); + break; + case ir_binop_avg_round: + result = type_is_signed(out_type) ? nir_irhadd(&b, srcs[0], srcs[1]) + : nir_urhadd(&b, srcs[0], srcs[1]); + break; + case ir_binop_mul_32x16: + result = type_is_signed(out_type) ? nir_imul_32x16(&b, srcs[0], srcs[1]) + : nir_umul_32x16(&b, srcs[0], srcs[1]); + break; case ir_binop_mul: if (type_is_float(out_type)) result = nir_fmul(&b, srcs[0], srcs[1]); @@ -2319,6 +2410,10 @@ break; } + case ir_binop_atan2: + result = nir_atan2(&b, srcs[0], srcs[1]); + break; + case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break; case ir_triop_fma: result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]); @@ -2644,8 +2739,20 @@ void nir_visitor::visit(ir_barrier *) { + if (shader->info.stage == MESA_SHADER_COMPUTE) { + nir_intrinsic_instr *shared_barrier = + nir_intrinsic_instr_create(this->shader, + nir_intrinsic_memory_barrier_shared); + nir_builder_instr_insert(&b, &shared_barrier->instr); + } else if (shader->info.stage == MESA_SHADER_TESS_CTRL) { + nir_intrinsic_instr *patch_barrier = + nir_intrinsic_instr_create(this->shader, + nir_intrinsic_memory_barrier_tcs_patch); + nir_builder_instr_insert(&b, &patch_barrier->instr); + } + nir_intrinsic_instr *instr = - nir_intrinsic_instr_create(this->shader, nir_intrinsic_barrier); + nir_intrinsic_instr_create(this->shader, nir_intrinsic_control_barrier); nir_builder_instr_insert(&b, &instr->instr); } diff -Nru mesa-19.2.8/src/compiler/glsl/ir_clone.cpp mesa-20.0.8/src/compiler/glsl/ir_clone.cpp --- mesa-19.2.8/src/compiler/glsl/ir_clone.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_clone.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -102,6 +102,12 @@ return new(mem_ctx) ir_discard(new_condition); } +ir_demote * +ir_demote::clone(void *mem_ctx, struct hash_table *ht) const +{ + return new(mem_ctx) ir_demote(); +} + ir_loop_jump * ir_loop_jump::clone(void *mem_ctx, struct hash_table *ht) const { diff -Nru mesa-19.2.8/src/compiler/glsl/ir_constant_expression.cpp mesa-20.0.8/src/compiler/glsl/ir_constant_expression.cpp --- mesa-19.2.8/src/compiler/glsl/ir_constant_expression.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_constant_expression.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -73,7 +73,8 @@ static float bitcast_u2f(unsigned int u) { - assert(sizeof(float) == sizeof(unsigned int)); + static_assert(sizeof(float) == sizeof(unsigned int), + "float and unsigned int size mismatch"); float f; memcpy(&f, &u, sizeof(f)); return f; @@ -82,7 +83,8 @@ static unsigned int bitcast_f2u(float f) { - assert(sizeof(float) == sizeof(unsigned int)); + static_assert(sizeof(float) == sizeof(unsigned int), + "float and unsigned int size mismatch"); unsigned int u; memcpy(&u, &f, sizeof(f)); return u; @@ -91,7 +93,8 @@ static double bitcast_u642d(uint64_t u) { - assert(sizeof(double) == sizeof(uint64_t)); + static_assert(sizeof(double) == sizeof(uint64_t), + "double and uint64_t size mismatch"); double d; memcpy(&d, &u, sizeof(d)); return d; @@ -100,7 +103,8 @@ static double bitcast_i642d(int64_t i) { - assert(sizeof(double) == sizeof(int64_t)); + static_assert(sizeof(double) == sizeof(int64_t), + "double and int64_t size mismatch"); double d; memcpy(&d, &i, sizeof(d)); return d; @@ -109,7 +113,8 @@ static uint64_t bitcast_d2u64(double d) { - assert(sizeof(double) == sizeof(uint64_t)); + static_assert(sizeof(double) == sizeof(uint64_t), + "double and uint64_t size mismatch"); uint64_t u; memcpy(&u, &d, sizeof(d)); return u; @@ -118,7 +123,8 @@ static int64_t bitcast_d2i64(double d) { - assert(sizeof(double) == sizeof(int64_t)); + static_assert(sizeof(double) == sizeof(int64_t), + "double and int64_t size mismatch"); int64_t i; memcpy(&i, &d, sizeof(d)); return i; @@ -410,6 +416,42 @@ return _mesa_half_to_float(u); } +static int32_t +iadd_saturate(int32_t a, int32_t b) +{ + return CLAMP(int64_t(a) + int64_t(b), INT32_MIN, INT32_MAX); +} + +static int64_t +iadd64_saturate(int64_t a, int64_t b) +{ + if (a < 0 && b < INT64_MIN - a) + return INT64_MIN; + + if (a > 0 && b > INT64_MAX - a) + return INT64_MAX; + + return a + b; +} + +static int32_t +isub_saturate(int32_t a, int32_t b) +{ + return CLAMP(int64_t(a) - int64_t(b), INT32_MIN, INT32_MAX); +} + +static int64_t +isub64_saturate(int64_t a, int64_t b) +{ + if (b > 0 && a < INT64_MIN + b) + return INT64_MIN; + + if (b < 0 && a > INT64_MAX + b) + return INT64_MAX; + + return a - b; +} + /** * Get the constant that is ultimately referenced by an r-value, in a constant * expression evaluation context. diff -Nru mesa-19.2.8/src/compiler/glsl/ir.cpp mesa-20.0.8/src/compiler/glsl/ir.cpp --- mesa-19.2.8/src/compiler/glsl/ir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -257,7 +257,9 @@ case ir_unop_dFdy_fine: case ir_unop_bitfield_reverse: case ir_unop_interpolate_at_centroid: + case ir_unop_clz: case ir_unop_saturate: + case ir_unop_atan: this->type = op0->type; break; @@ -452,6 +454,7 @@ case ir_binop_mul: case ir_binop_div: case ir_binop_mod: + case ir_binop_atan2: if (op0->type->is_scalar()) { this->type = op1->type; } else if (op1->type->is_scalar()) { @@ -498,6 +501,7 @@ break; case ir_binop_imul_high: + case ir_binop_mul_32x16: case ir_binop_carry: case ir_binop_borrow: case ir_binop_lshift: @@ -508,6 +512,44 @@ this->type = op0->type; break; + case ir_binop_add_sat: + case ir_binop_sub_sat: + case ir_binop_avg: + case ir_binop_avg_round: + assert(op0->type == op1->type); + this->type = op0->type; + break; + + case ir_binop_abs_sub: { + enum glsl_base_type base; + + assert(op0->type == op1->type); + + switch (op0->type->base_type) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + base = GLSL_TYPE_UINT; + break; + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + base = GLSL_TYPE_UINT8; + break; + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + base = GLSL_TYPE_UINT16; + break; + case GLSL_TYPE_UINT64: + case GLSL_TYPE_INT64: + base = GLSL_TYPE_UINT64; + break; + default: + unreachable(!"Invalid base type."); + } + + this->type = glsl_type::get_instance(base, op0->type->vector_elements, 1); + break; + } + case ir_binop_vector_extract: this->type = op0->type->get_scalar_type(); break; @@ -1827,6 +1869,7 @@ builtin_available_predicate b) : ir_instruction(ir_type_function_signature), return_type(return_type), is_defined(false), + return_precision(GLSL_PRECISION_NONE), intrinsic_id(ir_intrinsic_invalid), builtin_avail(b), _function(NULL) { this->origin = NULL; diff -Nru mesa-19.2.8/src/compiler/glsl/ir_expression_operation.py mesa-20.0.8/src/compiler/glsl/ir_expression_operation.py --- mesa-19.2.8/src/compiler/glsl/ir_expression_operation.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_expression_operation.py 2020-06-12 01:21:16.000000000 +0000 @@ -512,6 +512,7 @@ # Trigonometric operations. operation("sin", 1, source_types=(float_type,), c_expression="sinf({src0})"), operation("cos", 1, source_types=(float_type,), c_expression="cosf({src0})"), + operation("atan", 1, source_types=(float_type,), c_expression="atan({src0})"), # Partial derivatives. operation("dFdx", 1, source_types=(float_type,), c_expression="0.0f"), @@ -538,6 +539,7 @@ operation("bit_count", 1, source_types=(uint_type, int_type), dest_type=int_type, c_expression="util_bitcount({src0})"), operation("find_msb", 1, source_types=(uint_type, int_type), dest_type=int_type, c_expression={'u': "find_msb_uint({src0})", 'i': "find_msb_int({src0})"}), operation("find_lsb", 1, source_types=(uint_type, int_type), dest_type=int_type, c_expression="find_msb_uint({src0} & -{src0})"), + operation("clz", 1, source_types=(uint_type,), dest_type=uint_type, c_expression="(unsigned)(31 - find_msb_uint({src0}))"), operation("saturate", 1, printable_name="sat", source_types=(float_type,), c_expression="CLAMP({src0}, 0.0f, 1.0f)"), @@ -583,8 +585,33 @@ operation("add", 2, printable_name="+", source_types=numeric_types, c_expression="{src0} + {src1}", flags=vector_scalar_operation), operation("sub", 2, printable_name="-", source_types=numeric_types, c_expression="{src0} - {src1}", flags=vector_scalar_operation), + operation("add_sat", 2, printable_name="add_sat", source_types=integer_types, c_expression={ + 'u': "({src0} + {src1}) < {src0} ? UINT32_MAX : ({src0} + {src1})", + 'i': "iadd_saturate({src0}, {src1})", + 'u64': "({src0} + {src1}) < {src0} ? UINT64_MAX : ({src0} + {src1})", + 'i64': "iadd64_saturate({src0}, {src1})" + }), + operation("sub_sat", 2, printable_name="sub_sat", source_types=integer_types, c_expression={ + 'u': "({src1} > {src0}) ? 0 : {src0} - {src1}", + 'i': "isub_saturate({src0}, {src1})", + 'u64': "({src1} > {src0}) ? 0 : {src0} - {src1}", + 'i64': "isub64_saturate({src0}, {src1})" + }), + operation("abs_sub", 2, printable_name="abs_sub", source_types=integer_types, c_expression={ + 'u': "({src1} > {src0}) ? {src1} - {src0} : {src0} - {src1}", + 'i': "({src1} > {src0}) ? (unsigned){src1} - (unsigned){src0} : (unsigned){src0} - (unsigned){src1}", + 'u64': "({src1} > {src0}) ? {src1} - {src0} : {src0} - {src1}", + 'i64': "({src1} > {src0}) ? (uint64_t){src1} - (uint64_t){src0} : (uint64_t){src0} - (uint64_t){src1}", + }), + operation("avg", 2, printable_name="average", source_types=integer_types, c_expression="({src0} >> 1) + ({src1} >> 1) + (({src0} & {src1}) & 1)"), + operation("avg_round", 2, printable_name="average_rounded", source_types=integer_types, c_expression="({src0} >> 1) + ({src1} >> 1) + (({src0} | {src1}) & 1)"), + # "Floating-point or low 32-bit integer multiply." operation("mul", 2, printable_name="*", source_types=numeric_types, c_expression="{src0} * {src1}"), + operation("mul_32x16", 2, printable_name="*", source_types=(uint_type, int_type), c_expression={ + 'u': "{src0} * (uint16_t){src1}", + 'i': "{src0} * (int16_t){src0}" + }), operation("imul_high", 2), # Calculates the high 32-bits of a 64-bit multiply. operation("div", 2, printable_name="/", source_types=numeric_types, c_expression={'u': "{src1} == 0 ? 0 : {src0} / {src1}", 'i': "{src1} == 0 ? 0 : {src0} / {src1}", 'u64': "{src1} == 0 ? 0 : {src0} / {src1}", 'i64': "{src1} == 0 ? 0 : {src0} / {src1}", 'default': "{src0} / {src1}"}, flags=vector_scalar_operation), @@ -664,6 +691,8 @@ # operand1 is the sample ID operation("interpolate_at_sample", 2), + operation("atan2", 2, source_types=(float_type,), c_expression="atan2({src0}, {src1})"), + # Fused floating-point multiply-add, part of ARB_gpu_shader5. operation("fma", 3, source_types=real_types, c_expression="{src0} * {src1} + {src2}"), diff -Nru mesa-19.2.8/src/compiler/glsl/ir.h mesa-20.0.8/src/compiler/glsl/ir.h --- mesa-19.2.8/src/compiler/glsl/ir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir.h 2020-06-12 01:21:16.000000000 +0000 @@ -74,6 +74,7 @@ ir_type_loop_jump, ir_type_return, ir_type_discard, + ir_type_demote, ir_type_emit_vertex, ir_type_end_primitive, ir_type_barrier, @@ -904,7 +905,7 @@ * * For array types, this represents the binding point for the first element. */ - int16_t binding; + uint16_t binding; /** * Storage location of the base of this variable @@ -1137,6 +1138,8 @@ ir_intrinsic_read_invocation, ir_intrinsic_read_first_invocation, + ir_intrinsic_helper_invocation, + ir_intrinsic_shared_load, ir_intrinsic_shared_store = MAKE_INTRINSIC_FOR_TYPE(store, shared), ir_intrinsic_shared_atomic_add = MAKE_INTRINSIC_FOR_TYPE(atomic_add, shared), @@ -1222,7 +1225,7 @@ /** * Function return type. * - * \note This discards the optional precision qualifier. + * \note The precision qualifier is stored separately in return_precision. */ const struct glsl_type *return_type; @@ -1237,6 +1240,13 @@ /** Whether or not this function has a body (which may be empty). */ unsigned is_defined:1; + /* + * Precision qualifier for the return type. + * + * See the comment for ir_variable_data::precision for more details. + */ + unsigned return_precision:2; + /** Whether or not this function signature is a built-in. */ bool is_builtin() const; @@ -1798,6 +1808,28 @@ /** + * IR instruction representing demote statements from + * GL_EXT_demote_to_helper_invocation. + */ +class ir_demote : public ir_instruction { +public: + ir_demote() + : ir_instruction(ir_type_demote) + { + } + + virtual ir_demote *clone(void *mem_ctx, struct hash_table *ht) const; + + virtual void accept(ir_visitor *v) + { + v->visit(this); + } + + virtual ir_visitor_status accept(ir_hierarchical_visitor *); +}; + + +/** * Texture sampling opcodes used in ir_texture */ enum ir_texture_opcode { diff -Nru mesa-19.2.8/src/compiler/glsl/ir_hierarchical_visitor.cpp mesa-20.0.8/src/compiler/glsl/ir_hierarchical_visitor.cpp --- mesa-19.2.8/src/compiler/glsl/ir_hierarchical_visitor.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_hierarchical_visitor.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -305,6 +305,24 @@ } ir_visitor_status +ir_hierarchical_visitor::visit_enter(ir_demote *ir) +{ + if (this->callback_enter != NULL) + this->callback_enter(ir, this->data_enter); + + return visit_continue; +} + +ir_visitor_status +ir_hierarchical_visitor::visit_leave(ir_demote *ir) +{ + if (this->callback_leave != NULL) + this->callback_leave(ir, this->data_leave); + + return visit_continue; +} + +ir_visitor_status ir_hierarchical_visitor::visit_enter(ir_if *ir) { if (this->callback_enter != NULL) diff -Nru mesa-19.2.8/src/compiler/glsl/ir_hierarchical_visitor.h mesa-20.0.8/src/compiler/glsl/ir_hierarchical_visitor.h --- mesa-19.2.8/src/compiler/glsl/ir_hierarchical_visitor.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_hierarchical_visitor.h 2020-06-12 01:21:16.000000000 +0000 @@ -133,6 +133,8 @@ virtual ir_visitor_status visit_leave(class ir_return *); virtual ir_visitor_status visit_enter(class ir_discard *); virtual ir_visitor_status visit_leave(class ir_discard *); + virtual ir_visitor_status visit_enter(class ir_demote *); + virtual ir_visitor_status visit_leave(class ir_demote *); virtual ir_visitor_status visit_enter(class ir_if *); virtual ir_visitor_status visit_leave(class ir_if *); virtual ir_visitor_status visit_enter(class ir_emit_vertex *); diff -Nru mesa-19.2.8/src/compiler/glsl/ir_hv_accept.cpp mesa-20.0.8/src/compiler/glsl/ir_hv_accept.cpp --- mesa-19.2.8/src/compiler/glsl/ir_hv_accept.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_hv_accept.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -379,6 +379,18 @@ ir_visitor_status +ir_demote::accept(ir_hierarchical_visitor *v) +{ + ir_visitor_status s = v->visit_enter(this); + + if (s != visit_continue) + return (s == visit_continue_with_parent) ? visit_continue : s; + + return v->visit_leave(this); +} + + +ir_visitor_status ir_if::accept(ir_hierarchical_visitor *v) { ir_visitor_status s = v->visit_enter(this); diff -Nru mesa-19.2.8/src/compiler/glsl/ir_optimization.h mesa-20.0.8/src/compiler/glsl/ir_optimization.h --- mesa-19.2.8/src/compiler/glsl/ir_optimization.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_optimization.h 2020-06-12 01:21:16.000000000 +0000 @@ -140,7 +140,7 @@ exec_list *instructions, bool lower_input, bool lower_output, bool lower_temp, bool lower_uniform); bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz); -bool lower_const_arrays_to_uniforms(exec_list *instructions, unsigned stage); +bool lower_const_arrays_to_uniforms(exec_list *instructions, unsigned stage, unsigned max_uniform_components); bool lower_clip_cull_distance(struct gl_shader_program *prog, gl_linked_shader *shader); void lower_output_reads(unsigned stage, exec_list *instructions); diff -Nru mesa-19.2.8/src/compiler/glsl/ir_print_visitor.cpp mesa-20.0.8/src/compiler/glsl/ir_print_visitor.cpp --- mesa-19.2.8/src/compiler/glsl/ir_print_visitor.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_print_visitor.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -213,7 +213,7 @@ "in ", "out ", "inout ", "const_in ", "sys ", "temporary " }; STATIC_ASSERT(ARRAY_SIZE(mode) == ir_var_mode_count); - const char *const interp[] = { "", "smooth", "flat", "noperspective" }; + const char *const interp[] = { "", "smooth", "flat", "noperspective", "explicit" }; STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_MODE_COUNT); fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ", @@ -564,6 +564,13 @@ } +void +ir_print_visitor::visit(ir_demote *ir) +{ + fprintf(f, "(demote)"); +} + + void ir_print_visitor::visit(ir_if *ir) { diff -Nru mesa-19.2.8/src/compiler/glsl/ir_print_visitor.h mesa-20.0.8/src/compiler/glsl/ir_print_visitor.h --- mesa-19.2.8/src/compiler/glsl/ir_print_visitor.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_print_visitor.h 2020-06-12 01:21:16.000000000 +0000 @@ -63,6 +63,7 @@ virtual void visit(ir_call *); virtual void visit(ir_return *); virtual void visit(ir_discard *); + virtual void visit(ir_demote *); virtual void visit(ir_if *); virtual void visit(ir_loop *); virtual void visit(ir_loop_jump *); diff -Nru mesa-19.2.8/src/compiler/glsl/ir_validate.cpp mesa-20.0.8/src/compiler/glsl/ir_validate.cpp --- mesa-19.2.8/src/compiler/glsl/ir_validate.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_validate.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -546,6 +546,11 @@ assert(ir->type->base_type == GLSL_TYPE_INT); break; + case ir_unop_clz: + assert(ir->operands[0]->type == ir->type); + assert(ir->type->base_type == GLSL_TYPE_UINT); + break; + case ir_unop_noise: /* XXX what can we assert here? */ break; @@ -610,6 +615,12 @@ assert(ir->type->base_type == GLSL_TYPE_INT); break; + case ir_unop_atan: + assert(ir->operands[0]->type->is_float() || + ir->operands[0]->type->is_double()); + assert(ir->type == ir->operands[0]->type); + break; + case ir_binop_add: case ir_binop_sub: case ir_binop_mul: @@ -643,6 +654,25 @@ } break; + case ir_binop_abs_sub: + assert(ir->operands[0]->type == ir->operands[1]->type); + assert(ir->operands[0]->type->is_integer_32_64()); + assert(ir->operands[0]->type->vector_elements == + ir->type->vector_elements); + assert(ir->type->base_type == GLSL_TYPE_UINT || + ir->type->base_type == GLSL_TYPE_UINT64); + break; + + case ir_binop_add_sat: + case ir_binop_sub_sat: + case ir_binop_avg: + case ir_binop_avg_round: + assert(ir->type == ir->operands[0]->type); + assert(ir->type == ir->operands[1]->type); + assert(ir->type->is_integer_32_64()); + break; + + case ir_binop_mul_32x16: case ir_binop_imul_high: assert(ir->type == ir->operands[0]->type); assert(ir->type == ir->operands[1]->type); @@ -761,6 +791,13 @@ assert(ir->operands[1]->type == glsl_type::int_type); break; + case ir_binop_atan2: + assert(ir->operands[0]->type->is_float() || + ir->operands[0]->type->is_double()); + assert(ir->operands[1]->type == ir->operands[0]->type); + assert(ir->type == ir->operands[0]->type); + break; + case ir_triop_fma: assert(ir->type->is_float() || ir->type->is_double()); @@ -1052,7 +1089,8 @@ _mesa_set_add(ir_set, ir); } -ASSERTED static void +#ifdef DEBUG +static void check_node_type(ir_instruction *ir, void *data) { (void) data; @@ -1065,6 +1103,7 @@ if (value != NULL) assert(value->type != glsl_type::error_type); } +#endif void validate_ir_tree(exec_list *instructions) diff -Nru mesa-19.2.8/src/compiler/glsl/ir_visitor.h mesa-20.0.8/src/compiler/glsl/ir_visitor.h --- mesa-19.2.8/src/compiler/glsl/ir_visitor.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/ir_visitor.h 2020-06-12 01:21:16.000000000 +0000 @@ -59,6 +59,7 @@ virtual void visit(class ir_call *) = 0; virtual void visit(class ir_return *) = 0; virtual void visit(class ir_discard *) = 0; + virtual void visit(class ir_demote *) = 0; virtual void visit(class ir_if *) = 0; virtual void visit(class ir_loop *) = 0; virtual void visit(class ir_loop_jump *) = 0; @@ -83,6 +84,7 @@ virtual void visit(class ir_assignment *) {} virtual void visit(class ir_constant *) {} virtual void visit(class ir_call *) {} + virtual void visit(class ir_demote *) {} virtual void visit(class ir_emit_vertex *) {} virtual void visit(class ir_end_primitive *) {} virtual void visit(class ir_barrier *) {} diff -Nru mesa-19.2.8/src/compiler/glsl/linker.cpp mesa-20.0.8/src/compiler/glsl/linker.cpp --- mesa-19.2.8/src/compiler/glsl/linker.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/linker.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -587,11 +587,10 @@ analyze_clip_cull_usage(struct gl_shader_program *prog, struct gl_linked_shader *shader, struct gl_context *ctx, - GLuint *clip_distance_array_size, - GLuint *cull_distance_array_size) + struct shader_info *info) { - *clip_distance_array_size = 0; - *cull_distance_array_size = 0; + info->clip_distance_array_size = 0; + info->cull_distance_array_size = 0; if (prog->data->Version >= (prog->IsES ? 300 : 130)) { /* From section 7.1 (Vertex Shader Special Variables) of the @@ -643,13 +642,13 @@ ir_variable *clip_distance_var = shader->symbols->get_variable("gl_ClipDistance"); assert(clip_distance_var); - *clip_distance_array_size = clip_distance_var->type->length; + info->clip_distance_array_size = clip_distance_var->type->length; } if (gl_CullDistance.found) { ir_variable *cull_distance_var = shader->symbols->get_variable("gl_CullDistance"); assert(cull_distance_var); - *cull_distance_array_size = cull_distance_var->type->length; + info->cull_distance_array_size = cull_distance_var->type->length; } /* From the ARB_cull_distance spec: * @@ -658,7 +657,7 @@ * gl_CullDistance arrays to be larger than * gl_MaxCombinedClipAndCullDistances. */ - if ((*clip_distance_array_size + *cull_distance_array_size) > + if ((uint32_t)(info->clip_distance_array_size + info->cull_distance_array_size) > ctx->Const.MaxClipPlanes) { linker_error(prog, "%s shader: the combined size of " "'gl_ClipDistance' and 'gl_CullDistance' size cannot " @@ -729,9 +728,7 @@ } } - analyze_clip_cull_usage(prog, shader, ctx, - &shader->Program->info.clip_distance_array_size, - &shader->Program->info.cull_distance_array_size); + analyze_clip_cull_usage(prog, shader, ctx, &shader->Program->info); } static void @@ -742,9 +739,7 @@ if (shader == NULL) return; - analyze_clip_cull_usage(prog, shader, ctx, - &shader->Program->info.clip_distance_array_size, - &shader->Program->info.cull_distance_array_size); + analyze_clip_cull_usage(prog, shader, ctx, &shader->Program->info); } @@ -791,9 +786,7 @@ vertices_per_prim(shader->Program->info.gs.input_primitive); prog->Geom.VerticesIn = num_vertices; - analyze_clip_cull_usage(prog, shader, ctx, - &shader->Program->info.clip_distance_array_size, - &shader->Program->info.cull_distance_array_size); + analyze_clip_cull_usage(prog, shader, ctx, &shader->Program->info); } /** @@ -2509,6 +2502,22 @@ link_uniform_blocks(mem_ctx, ctx, prog, linked, &ubo_blocks, &num_ubo_blocks, &ssbo_blocks, &num_ssbo_blocks); + const unsigned max_uniform_blocks = + ctx->Const.Program[linked->Stage].MaxUniformBlocks; + if (num_ubo_blocks > max_uniform_blocks) { + linker_error(prog, "Too many %s uniform blocks (%d/%d)\n", + _mesa_shader_stage_to_string(linked->Stage), + num_ubo_blocks, max_uniform_blocks); + } + + const unsigned max_shader_storage_blocks = + ctx->Const.Program[linked->Stage].MaxShaderStorageBlocks; + if (num_ssbo_blocks > max_shader_storage_blocks) { + linker_error(prog, "Too many %s shader storage blocks (%d/%d)\n", + _mesa_shader_stage_to_string(linked->Stage), + num_ssbo_blocks, max_shader_storage_blocks); + } + if (!prog->data->LinkStatus) { _mesa_delete_linked_shader(ctx, linked); return NULL; @@ -3313,157 +3322,6 @@ } /** - * Validate the resources used by a program versus the implementation limits - */ -static void -check_resources(struct gl_context *ctx, struct gl_shader_program *prog) -{ - unsigned total_uniform_blocks = 0; - unsigned total_shader_storage_blocks = 0; - - for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - struct gl_linked_shader *sh = prog->_LinkedShaders[i]; - - if (sh == NULL) - continue; - - if (sh->Program->info.num_textures > - ctx->Const.Program[i].MaxTextureImageUnits) { - linker_error(prog, "Too many %s shader texture samplers\n", - _mesa_shader_stage_to_string(i)); - } - - if (sh->num_uniform_components > - ctx->Const.Program[i].MaxUniformComponents) { - if (ctx->Const.GLSLSkipStrictMaxUniformLimitCheck) { - linker_warning(prog, "Too many %s shader default uniform block " - "components, but the driver will try to optimize " - "them out; this is non-portable out-of-spec " - "behavior\n", - _mesa_shader_stage_to_string(i)); - } else { - linker_error(prog, "Too many %s shader default uniform block " - "components\n", - _mesa_shader_stage_to_string(i)); - } - } - - if (sh->num_combined_uniform_components > - ctx->Const.Program[i].MaxCombinedUniformComponents) { - if (ctx->Const.GLSLSkipStrictMaxUniformLimitCheck) { - linker_warning(prog, "Too many %s shader uniform components, " - "but the driver will try to optimize them out; " - "this is non-portable out-of-spec behavior\n", - _mesa_shader_stage_to_string(i)); - } else { - linker_error(prog, "Too many %s shader uniform components\n", - _mesa_shader_stage_to_string(i)); - } - } - - total_shader_storage_blocks += sh->Program->info.num_ssbos; - total_uniform_blocks += sh->Program->info.num_ubos; - - const unsigned max_uniform_blocks = - ctx->Const.Program[i].MaxUniformBlocks; - if (max_uniform_blocks < sh->Program->info.num_ubos) { - linker_error(prog, "Too many %s uniform blocks (%d/%d)\n", - _mesa_shader_stage_to_string(i), - sh->Program->info.num_ubos, max_uniform_blocks); - } - - const unsigned max_shader_storage_blocks = - ctx->Const.Program[i].MaxShaderStorageBlocks; - if (max_shader_storage_blocks < sh->Program->info.num_ssbos) { - linker_error(prog, "Too many %s shader storage blocks (%d/%d)\n", - _mesa_shader_stage_to_string(i), - sh->Program->info.num_ssbos, max_shader_storage_blocks); - } - } - - if (total_uniform_blocks > ctx->Const.MaxCombinedUniformBlocks) { - linker_error(prog, "Too many combined uniform blocks (%d/%d)\n", - total_uniform_blocks, ctx->Const.MaxCombinedUniformBlocks); - } - - if (total_shader_storage_blocks > ctx->Const.MaxCombinedShaderStorageBlocks) { - linker_error(prog, "Too many combined shader storage blocks (%d/%d)\n", - total_shader_storage_blocks, - ctx->Const.MaxCombinedShaderStorageBlocks); - } - - for (unsigned i = 0; i < prog->data->NumUniformBlocks; i++) { - if (prog->data->UniformBlocks[i].UniformBufferSize > - ctx->Const.MaxUniformBlockSize) { - linker_error(prog, "Uniform block %s too big (%d/%d)\n", - prog->data->UniformBlocks[i].Name, - prog->data->UniformBlocks[i].UniformBufferSize, - ctx->Const.MaxUniformBlockSize); - } - } - - for (unsigned i = 0; i < prog->data->NumShaderStorageBlocks; i++) { - if (prog->data->ShaderStorageBlocks[i].UniformBufferSize > - ctx->Const.MaxShaderStorageBlockSize) { - linker_error(prog, "Shader storage block %s too big (%d/%d)\n", - prog->data->ShaderStorageBlocks[i].Name, - prog->data->ShaderStorageBlocks[i].UniformBufferSize, - ctx->Const.MaxShaderStorageBlockSize); - } - } -} - -static void -link_calculate_subroutine_compat(struct gl_shader_program *prog) -{ - unsigned mask = prog->data->linked_stages; - while (mask) { - const int i = u_bit_scan(&mask); - struct gl_program *p = prog->_LinkedShaders[i]->Program; - - for (unsigned j = 0; j < p->sh.NumSubroutineUniformRemapTable; j++) { - if (p->sh.SubroutineUniformRemapTable[j] == INACTIVE_UNIFORM_EXPLICIT_LOCATION) - continue; - - struct gl_uniform_storage *uni = p->sh.SubroutineUniformRemapTable[j]; - - if (!uni) - continue; - - int count = 0; - if (p->sh.NumSubroutineFunctions == 0) { - linker_error(prog, "subroutine uniform %s defined but no valid functions found\n", uni->type->name); - continue; - } - for (unsigned f = 0; f < p->sh.NumSubroutineFunctions; f++) { - struct gl_subroutine_function *fn = &p->sh.SubroutineFunctions[f]; - for (int k = 0; k < fn->num_compat_types; k++) { - if (fn->types[k] == uni->type) { - count++; - break; - } - } - } - uni->num_compatible_subroutines = count; - } - } -} - -static void -check_subroutine_resources(struct gl_shader_program *prog) -{ - unsigned mask = prog->data->linked_stages; - while (mask) { - const int i = u_bit_scan(&mask); - struct gl_program *p = prog->_LinkedShaders[i]->Program; - - if (p->sh.NumSubroutineUniformRemapTable > MAX_SUBROUTINE_UNIFORM_LOCATIONS) { - linker_error(prog, "Too many %s shader subroutine uniforms\n", - _mesa_shader_stage_to_string(i)); - } - } -} -/** * Validate shader image resources. */ static void @@ -3480,12 +3338,6 @@ struct gl_linked_shader *sh = prog->_LinkedShaders[i]; if (sh) { - if (sh->Program->info.num_images > ctx->Const.Program[i].MaxImageUniforms) - linker_error(prog, "Too many %s shader image uniforms (%u > %u)\n", - _mesa_shader_stage_to_string(i), - sh->Program->info.num_images, - ctx->Const.Program[i].MaxImageUniforms); - total_image_units += sh->Program->info.num_images; total_shader_storage_blocks += sh->Program->info.num_ssbos; @@ -4137,226 +3989,14 @@ return true; } -static char* -get_top_level_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - const char *first_square_bracket = strchr(name, '['); - int name_size = 0; - - /* The ARB_program_interface_query spec says: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying - * the number of active array elements of the top-level shader storage - * block member containing to the active variable is written to - * . If the top-level block member is not declared as an - * array, the value one is written to . If the top-level block - * member is an array with no declared size, the value zero is written - * to ." - */ - - /* The buffer variable is on top level.*/ - if (!first_square_bracket && !first_dot) - name_size = strlen(name); - else if ((!first_square_bracket || - (first_dot && first_dot < first_square_bracket))) - name_size = first_dot - name; - else - name_size = first_square_bracket - name; - - return strndup(name, name_size); -} - -static char* -get_var_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - - if (!first_dot) - return strdup(name); - - return strndup(first_dot+1, strlen(first_dot) - 1); -} - -static bool -is_top_level_shader_storage_block_member(const char* name, - const char* interface_name, - const char* field_name) -{ - bool result = false; - - /* If the given variable is already a top-level shader storage - * block member, then return array_size = 1. - * We could have two possibilities: if we have an instanced - * shader storage block or not instanced. - * - * For the first, we check create a name as it was in top level and - * compare it with the real name. If they are the same, then - * the variable is already at top-level. - * - * Full instanced name is: interface name + '.' + var name + - * NULL character - */ - int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; - char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); - if (!full_instanced_name) { - fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); - return false; - } - - snprintf(full_instanced_name, name_length, "%s.%s", - interface_name, field_name); - - /* Check if its top-level shader storage block member of an - * instanced interface block, or of a unnamed interface block. - */ - if (strcmp(name, full_instanced_name) == 0 || - strcmp(name, field_name) == 0) - result = true; - - free(full_instanced_name); - return result; -} - -static int -get_array_size(struct gl_uniform_storage *uni, const glsl_struct_field *field, - char *interface_name, char *var_name) -{ - /* The ARB_program_interface_query spec says: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying - * the number of active array elements of the top-level shader storage - * block member containing to the active variable is written to - * . If the top-level block member is not declared as an - * array, the value one is written to . If the top-level block - * member is an array with no declared size, the value zero is written - * to ." - */ - if (is_top_level_shader_storage_block_member(uni->name, - interface_name, - var_name)) - return 1; - else if (field->type->is_unsized_array()) - return 0; - else if (field->type->is_array()) - return field->type->length; - - return 1; -} - -static int -get_array_stride(struct gl_context *ctx, struct gl_uniform_storage *uni, - const glsl_type *iface, const glsl_struct_field *field, - char *interface_name, char *var_name) -{ - /* The ARB_program_interface_query spec says: - * - * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer - * identifying the stride between array elements of the top-level - * shader storage block member containing the active variable is - * written to . For top-level block members declared as - * arrays, the value written is the difference, in basic machine units, - * between the offsets of the active variable for consecutive elements - * in the top-level array. For top-level block members not declared as - * an array, zero is written to ." - */ - if (field->type->is_array()) { - const enum glsl_matrix_layout matrix_layout = - glsl_matrix_layout(field->matrix_layout); - bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; - const glsl_type *array_type = field->type->fields.array; - - if (is_top_level_shader_storage_block_member(uni->name, - interface_name, - var_name)) - return 0; - - if (GLSL_INTERFACE_PACKING_STD140 == - iface-> - get_internal_ifc_packing(ctx->Const.UseSTD430AsDefaultPacking)) { - if (array_type->is_struct() || array_type->is_array()) - return glsl_align(array_type->std140_size(row_major), 16); - else - return MAX2(array_type->std140_base_alignment(row_major), 16); - } else { - return array_type->std430_array_stride(row_major); - } - } - return 0; -} - -static void -calculate_array_size_and_stride(struct gl_context *ctx, - struct gl_shader_program *shProg, - struct gl_uniform_storage *uni) -{ - int block_index = uni->block_index; - int array_size = -1; - int array_stride = -1; - char *var_name = get_top_level_name(uni->name); - char *interface_name = - get_top_level_name(uni->is_shader_storage ? - shProg->data->ShaderStorageBlocks[block_index].Name : - shProg->data->UniformBlocks[block_index].Name); - - if (strcmp(var_name, interface_name) == 0) { - /* Deal with instanced array of SSBOs */ - char *temp_name = get_var_name(uni->name); - if (!temp_name) { - linker_error(shProg, "Out of memory during linking.\n"); - goto write_top_level_array_size_and_stride; - } - free(var_name); - var_name = get_top_level_name(temp_name); - free(temp_name); - if (!var_name) { - linker_error(shProg, "Out of memory during linking.\n"); - goto write_top_level_array_size_and_stride; - } - } - - for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - const gl_linked_shader *sh = shProg->_LinkedShaders[i]; - if (sh == NULL) - continue; - - foreach_in_list(ir_instruction, node, sh->ir) { - ir_variable *var = node->as_variable(); - if (!var || !var->get_interface_type() || - var->data.mode != ir_var_shader_storage) - continue; - - const glsl_type *iface = var->get_interface_type(); - - if (strcmp(interface_name, iface->name) != 0) - continue; - - for (unsigned i = 0; i < iface->length; i++) { - const glsl_struct_field *field = &iface->fields.structure[i]; - if (strcmp(field->name, var_name) != 0) - continue; - - array_stride = get_array_stride(ctx, uni, iface, field, - interface_name, var_name); - array_size = get_array_size(uni, field, interface_name, var_name); - goto write_top_level_array_size_and_stride; - } - } - } -write_top_level_array_size_and_stride: - free(interface_name); - free(var_name); - uni->top_level_array_stride = array_stride; - uni->top_level_array_size = array_size; -} - /** * Builds up a list of program resources that point to existing * resource data. */ void build_program_resource_list(struct gl_context *ctx, - struct gl_shader_program *shProg) + struct gl_shader_program *shProg, + bool add_packed_varyings_only) { /* Rebuild resource list. */ if (shProg->data->ProgramResourceList) { @@ -4396,6 +4036,11 @@ return; } + if (add_packed_varyings_only) { + _mesa_set_destroy(resource_set, NULL); + return; + } + if (!add_fragdata_arrays(ctx, shProg, resource_set)) return; @@ -4445,20 +4090,8 @@ if (shProg->data->UniformStorage[i].hidden) continue; - uint8_t stageref = - build_stageref(shProg, shProg->data->UniformStorage[i].name, - ir_var_uniform); - - /* Add stagereferences for uniforms in a uniform block. */ bool is_shader_storage = shProg->data->UniformStorage[i].is_shader_storage; - int block_index = shProg->data->UniformStorage[i].block_index; - if (block_index != -1) { - stageref |= is_shader_storage ? - shProg->data->ShaderStorageBlocks[block_index].stageref : - shProg->data->UniformBlocks[block_index].stageref; - } - GLenum type = is_shader_storage ? GL_BUFFER_VARIABLE : GL_UNIFORM; if (!link_util_should_add_buffer_variable(shProg, &shProg->data->UniformStorage[i], @@ -4469,9 +4102,6 @@ continue; if (is_shader_storage) { - calculate_array_size_and_stride(ctx, shProg, - &shProg->data->UniformStorage[i]); - /* From the OpenGL 4.6 specification, 7.3.1.1 Naming Active Resources: * * "For an active shader storage block member declared as an array @@ -4504,6 +4134,7 @@ buffer_block_index = shProg->data->UniformStorage[i].block_index; } + uint8_t stageref = shProg->data->UniformStorage[i].active_shader_mask; if (!link_util_add_program_resource(shProg, resource_set, type, &shProg->data->UniformStorage[i], stageref)) return; @@ -4772,12 +4403,17 @@ update_array_sizes(prog); link_assign_uniform_locations(prog, ctx); - link_assign_atomic_counter_resources(ctx, prog); - link_calculate_subroutine_compat(prog); - check_resources(ctx, prog); - check_subroutine_resources(prog); - check_image_resources(ctx, prog); - link_check_atomic_counter_resources(ctx, prog); + if (prog->data->LinkStatus == LINKING_FAILURE) + return; + + if (!ctx->Const.UseNIRGLSLLinker) { + link_util_calculate_subroutine_compat(prog); + link_util_check_uniform_resources(ctx, prog); + link_util_check_subroutine_resources(prog); + check_image_resources(ctx, prog); + link_assign_atomic_counter_resources(ctx, prog); + link_check_atomic_counter_resources(ctx, prog); + } } static bool @@ -5211,10 +4847,11 @@ linker_optimisation_loop(ctx, prog->_LinkedShaders[i]->ir, i); /* Call opts after lowering const arrays to copy propagate things. */ - if (lower_const_arrays_to_uniforms(prog->_LinkedShaders[i]->ir, i)) + if (ctx->Const.GLSLLowerConstArrays && + lower_const_arrays_to_uniforms(prog->_LinkedShaders[i]->ir, i, + ctx->Const.Program[i].MaxUniformComponents)) linker_optimisation_loop(ctx, prog->_LinkedShaders[i]->ir, i); - propagate_invariance(prog->_LinkedShaders[i]->ir); } /* Validation for special cases where we allow sampler array indexing diff -Nru mesa-19.2.8/src/compiler/glsl/linker_util.cpp mesa-20.0.8/src/compiler/glsl/linker_util.cpp --- mesa-19.2.8/src/compiler/glsl/linker_util.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/linker_util.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -22,7 +22,9 @@ * */ #include "main/mtypes.h" +#include "glsl_types.h" #include "linker_util.h" +#include "util/bitscan.h" #include "util/set.h" #include "ir_uniform.h" /* for gl_uniform_storage */ @@ -154,3 +156,134 @@ } } } + +void +link_util_check_subroutine_resources(struct gl_shader_program *prog) +{ + unsigned mask = prog->data->linked_stages; + while (mask) { + const int i = u_bit_scan(&mask); + struct gl_program *p = prog->_LinkedShaders[i]->Program; + + if (p->sh.NumSubroutineUniformRemapTable > MAX_SUBROUTINE_UNIFORM_LOCATIONS) { + linker_error(prog, "Too many %s shader subroutine uniforms\n", + _mesa_shader_stage_to_string(i)); + } + } +} + +/** + * Validate uniform resources used by a program versus the implementation limits + */ +void +link_util_check_uniform_resources(struct gl_context *ctx, + struct gl_shader_program *prog) +{ + unsigned total_uniform_blocks = 0; + unsigned total_shader_storage_blocks = 0; + + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + struct gl_linked_shader *sh = prog->_LinkedShaders[i]; + + if (sh == NULL) + continue; + + if (sh->num_uniform_components > + ctx->Const.Program[i].MaxUniformComponents) { + if (ctx->Const.GLSLSkipStrictMaxUniformLimitCheck) { + linker_warning(prog, "Too many %s shader default uniform block " + "components, but the driver will try to optimize " + "them out; this is non-portable out-of-spec " + "behavior\n", + _mesa_shader_stage_to_string(i)); + } else { + linker_error(prog, "Too many %s shader default uniform block " + "components\n", + _mesa_shader_stage_to_string(i)); + } + } + + if (sh->num_combined_uniform_components > + ctx->Const.Program[i].MaxCombinedUniformComponents) { + if (ctx->Const.GLSLSkipStrictMaxUniformLimitCheck) { + linker_warning(prog, "Too many %s shader uniform components, " + "but the driver will try to optimize them out; " + "this is non-portable out-of-spec behavior\n", + _mesa_shader_stage_to_string(i)); + } else { + linker_error(prog, "Too many %s shader uniform components\n", + _mesa_shader_stage_to_string(i)); + } + } + + total_shader_storage_blocks += sh->Program->info.num_ssbos; + total_uniform_blocks += sh->Program->info.num_ubos; + } + + if (total_uniform_blocks > ctx->Const.MaxCombinedUniformBlocks) { + linker_error(prog, "Too many combined uniform blocks (%d/%d)\n", + total_uniform_blocks, ctx->Const.MaxCombinedUniformBlocks); + } + + if (total_shader_storage_blocks > ctx->Const.MaxCombinedShaderStorageBlocks) { + linker_error(prog, "Too many combined shader storage blocks (%d/%d)\n", + total_shader_storage_blocks, + ctx->Const.MaxCombinedShaderStorageBlocks); + } + + for (unsigned i = 0; i < prog->data->NumUniformBlocks; i++) { + if (prog->data->UniformBlocks[i].UniformBufferSize > + ctx->Const.MaxUniformBlockSize) { + linker_error(prog, "Uniform block %s too big (%d/%d)\n", + prog->data->UniformBlocks[i].Name, + prog->data->UniformBlocks[i].UniformBufferSize, + ctx->Const.MaxUniformBlockSize); + } + } + + for (unsigned i = 0; i < prog->data->NumShaderStorageBlocks; i++) { + if (prog->data->ShaderStorageBlocks[i].UniformBufferSize > + ctx->Const.MaxShaderStorageBlockSize) { + linker_error(prog, "Shader storage block %s too big (%d/%d)\n", + prog->data->ShaderStorageBlocks[i].Name, + prog->data->ShaderStorageBlocks[i].UniformBufferSize, + ctx->Const.MaxShaderStorageBlockSize); + } + } +} + +void +link_util_calculate_subroutine_compat(struct gl_shader_program *prog) +{ + unsigned mask = prog->data->linked_stages; + while (mask) { + const int i = u_bit_scan(&mask); + struct gl_program *p = prog->_LinkedShaders[i]->Program; + + for (unsigned j = 0; j < p->sh.NumSubroutineUniformRemapTable; j++) { + if (p->sh.SubroutineUniformRemapTable[j] == INACTIVE_UNIFORM_EXPLICIT_LOCATION) + continue; + + struct gl_uniform_storage *uni = p->sh.SubroutineUniformRemapTable[j]; + + if (!uni) + continue; + + int count = 0; + if (p->sh.NumSubroutineFunctions == 0) { + linker_error(prog, "subroutine uniform %s defined but no valid functions found\n", uni->type->name); + continue; + } + for (unsigned f = 0; f < p->sh.NumSubroutineFunctions; f++) { + struct gl_subroutine_function *fn = &p->sh.SubroutineFunctions[f]; + for (int k = 0; k < fn->num_compat_types; k++) { + if (fn->types[k] == uni->type) { + count++; + break; + } + } + } + uni->num_compatible_subroutines = count; + } + } +} diff -Nru mesa-19.2.8/src/compiler/glsl/linker_util.h mesa-20.0.8/src/compiler/glsl/linker_util.h --- mesa-19.2.8/src/compiler/glsl/linker_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/linker_util.h 2020-06-12 01:21:16.000000000 +0000 @@ -24,6 +24,7 @@ #ifndef GLSL_LINKER_UTIL_H #define GLSL_LINKER_UTIL_H +struct gl_context; struct gl_shader_program; struct gl_uniform_storage; @@ -70,6 +71,16 @@ void link_util_update_empty_uniform_locations(struct gl_shader_program *prog); +void +link_util_check_subroutine_resources(struct gl_shader_program *prog); + +void +link_util_check_uniform_resources(struct gl_context *ctx, + struct gl_shader_program *prog); + +void +link_util_calculate_subroutine_compat(struct gl_shader_program *prog); + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/compiler/glsl/link_interface_blocks.cpp mesa-20.0.8/src/compiler/glsl/link_interface_blocks.cpp --- mesa-19.2.8/src/compiler/glsl/link_interface_blocks.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_interface_blocks.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -220,7 +220,7 @@ public: interface_block_definitions() : mem_ctx(ralloc_context(NULL)), - ht(_mesa_hash_table_create(NULL, _mesa_key_hash_string, + ht(_mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal)) { } @@ -401,12 +401,56 @@ return; } + /* Desktop OpenGL requires redeclaration of the built-in interfaces for + * SSO programs. Passes above implement following rules: + * + * From Section 7.4 (Program Pipeline Objects) of the OpenGL 4.6 Core + * spec: + * + * "To use any built-in input or output in the gl_PerVertex and + * gl_PerFragment blocks in separable program objects, shader code + * must redeclare those blocks prior to use. A separable program + * will fail to link if: + * + * it contains multiple shaders of a single type with different + * redeclarations of these built-in input and output blocks; or + * + * any shader uses a built-in block member not found in the + * redeclaration of that block." + * + * ARB_separate_shader_objects issues section (issue #28) states that + * redeclaration is not required for GLSL shaders using #version 140 or + * earlier (since interface blocks are not possible with older versions). + * + * From Section 7.4.1 (Shader Interface Matching) of the OpenGL ES 3.1 + * spec: + * + * "Built-in inputs or outputs do not affect interface matching." + * + * GL_OES_shader_io_blocks adds following: + * + * "When using any built-in input or output in the gl_PerVertex block + * in separable program objects, shader code may redeclare that block + * prior to use. If the shader does not redeclare the block, the + * intrinsically declared definition of that block will be used." + */ + /* Add output interfaces from the producer to the symbol table. */ foreach_in_list(ir_instruction, node, producer->ir) { ir_variable *var = node->as_variable(); if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_out) continue; + /* Built-in interface redeclaration check. */ + if (prog->SeparateShader && !prog->IsES && prog->data->Version >= 150 && + var->data.how_declared == ir_var_declared_implicitly && + var->data.used && !producer_iface) { + linker_error(prog, "missing output builtin block %s redeclaration " + "in separable shader program", + var->get_interface_type()->name); + return; + } + definitions.store(var); } @@ -418,6 +462,16 @@ ir_variable *producer_def = definitions.lookup(var); + /* Built-in interface redeclaration check. */ + if (prog->SeparateShader && !prog->IsES && prog->data->Version >= 150 && + var->data.how_declared == ir_var_declared_implicitly && + var->data.used && !producer_iface) { + linker_error(prog, "missing input builtin block %s redeclaration " + "in separable shader program", + var->get_interface_type()->name); + return; + } + /* The producer doesn't generate this input: fail to link. Skip built-in * 'gl_in[]' since that may not be present if the producer does not * write to any of the pre-defined outputs (e.g. if the vertex shader diff -Nru mesa-19.2.8/src/compiler/glsl/link_uniform_block_active_visitor.cpp mesa-20.0.8/src/compiler/glsl/link_uniform_block_active_visitor.cpp --- mesa-19.2.8/src/compiler/glsl/link_uniform_block_active_visitor.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_uniform_block_active_visitor.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -103,6 +103,8 @@ if (*ub_array_ptr == NULL) { *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements); (*ub_array_ptr)->ir = ir; + (*ub_array_ptr)->aoa_size = + ir->array->type->arrays_of_arrays_size(); } struct uniform_block_array_elements *ub_array = *ub_array_ptr; @@ -199,6 +201,7 @@ (*ub_array)->array_elements, unsigned, (*ub_array)->num_array_elements); + (*ub_array)->aoa_size = type->arrays_of_arrays_size(); for (unsigned i = 0; i < (*ub_array)->num_array_elements; i++) { (*ub_array)->array_elements[i] = i; diff -Nru mesa-19.2.8/src/compiler/glsl/link_uniform_block_active_visitor.h mesa-20.0.8/src/compiler/glsl/link_uniform_block_active_visitor.h --- mesa-19.2.8/src/compiler/glsl/link_uniform_block_active_visitor.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_uniform_block_active_visitor.h 2020-06-12 01:21:16.000000000 +0000 @@ -30,6 +30,15 @@ struct uniform_block_array_elements { unsigned *array_elements; unsigned num_array_elements; + /** + * Size of the array before array-trimming optimizations. + * + * Locations are only assigned to active array elements, but the location + * values are calculated as if all elements are active. The total number + * of elements in an array including the elements in arrays of arrays before + * inactive elements are removed is needed to be perform that calculation. + */ + unsigned aoa_size; ir_dereference_array *ir; diff -Nru mesa-19.2.8/src/compiler/glsl/link_uniform_blocks.cpp mesa-20.0.8/src/compiler/glsl/link_uniform_blocks.cpp --- mesa-19.2.8/src/compiler/glsl/link_uniform_blocks.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_uniform_blocks.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -222,7 +222,7 @@ gl_uniform_buffer_variable *variables, const struct link_uniform_block_active *const b, unsigned *block_index, - unsigned *binding_offset, + unsigned binding_offset, unsigned linearized_index, struct gl_context *ctx, struct gl_shader_program *prog); @@ -237,26 +237,28 @@ size_t name_length, gl_uniform_block *blocks, ubo_visitor *parcel, gl_uniform_buffer_variable *variables, const struct link_uniform_block_active *const b, - unsigned *block_index, unsigned *binding_offset, + unsigned *block_index, unsigned binding_offset, struct gl_context *ctx, struct gl_shader_program *prog, unsigned first_index) { for (unsigned j = 0; j < ub_array->num_array_elements; j++) { size_t new_length = name_length; + unsigned int element_idx = ub_array->array_elements[j]; /* Append the subscript to the current variable name */ - ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", - ub_array->array_elements[j]); + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", element_idx); if (ub_array->array) { + unsigned binding_stride = binding_offset + (element_idx * + ub_array->array->aoa_size); process_block_array(ub_array->array, name, new_length, blocks, parcel, variables, b, block_index, - binding_offset, ctx, prog, first_index); + binding_stride, ctx, prog, first_index); } else { process_block_array_leaf(*name, blocks, parcel, variables, b, block_index, - binding_offset, *block_index - first_index, - ctx, prog); + binding_offset + element_idx, + *block_index - first_index, ctx, prog); } } } @@ -266,7 +268,7 @@ gl_uniform_block *blocks, ubo_visitor *parcel, gl_uniform_buffer_variable *variables, const struct link_uniform_block_active *const b, - unsigned *block_index, unsigned *binding_offset, + unsigned *block_index, unsigned binding_offset, unsigned linearized_index, struct gl_context *ctx, struct gl_shader_program *prog) { @@ -283,7 +285,7 @@ * block binding and each subsequent element takes the next consecutive * uniform block binding point. */ - blocks[i].Binding = (b->has_binding) ? b->binding + *binding_offset : 0; + blocks[i].Binding = (b->has_binding) ? b->binding + binding_offset : 0; blocks[i].UniformBufferSize = 0; blocks[i]._Packing = glsl_interface_packing(type->interface_packing); @@ -307,7 +309,6 @@ (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms); *block_index = *block_index + 1; - *binding_offset = *binding_offset + 1; } /* This function resizes the array types of the block so that later we can use @@ -370,7 +371,6 @@ if ((create_ubo_blocks && !b->is_shader_storage) || (!create_ubo_blocks && b->is_shader_storage)) { - unsigned binding_offset = 0; if (b->array != NULL) { char *name = ralloc_strdup(NULL, block_type->without_array()->name); @@ -378,12 +378,12 @@ assert(b->has_instance_name); process_block_array(b->array, &name, name_length, blocks, &parcel, - variables, b, &i, &binding_offset, ctx, prog, + variables, b, &i, 0, ctx, prog, i); ralloc_free(name); } else { process_block_array_leaf(block_type->name, blocks, &parcel, - variables, b, &i, &binding_offset, + variables, b, &i, 0, 0, ctx, prog); } } @@ -409,7 +409,7 @@ * the hash is organized by block-name. */ struct hash_table *block_hash = - _mesa_hash_table_create(mem_ctx, _mesa_key_hash_string, + _mesa_hash_table_create(mem_ctx, _mesa_hash_string, _mesa_key_string_equal); if (block_hash == NULL) { @@ -440,6 +440,7 @@ GLSL_INTERFACE_PACKING_PACKED)) { b->type = resize_block_array(b->type, b->array); b->var->type = b->type; + b->var->data.max_array_access = b->type->length - 1; } block_size.num_active_uniforms = 0; diff -Nru mesa-19.2.8/src/compiler/glsl/link_uniforms.cpp mesa-20.0.8/src/compiler/glsl/link_uniforms.cpp --- mesa-19.2.8/src/compiler/glsl/link_uniforms.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_uniforms.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,9 @@ #include "program.h" #include "string_to_uint_map.h" #include "ir_array_refcount.h" + #include "main/mtypes.h" +#include "util/strndup.h" /** * \file link_uniforms.cpp @@ -42,6 +44,219 @@ */ #define UNMAPPED_UNIFORM_LOC ~0u +static char* +get_top_level_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + const char *first_square_bracket = strchr(name, '['); + int name_size = 0; + + /* The ARB_program_interface_query spec says: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying + * the number of active array elements of the top-level shader storage + * block member containing to the active variable is written to + * . If the top-level block member is not declared as an + * array, the value one is written to . If the top-level block + * member is an array with no declared size, the value zero is written + * to ." + */ + + /* The buffer variable is on top level.*/ + if (!first_square_bracket && !first_dot) + name_size = strlen(name); + else if ((!first_square_bracket || + (first_dot && first_dot < first_square_bracket))) + name_size = first_dot - name; + else + name_size = first_square_bracket - name; + + return strndup(name, name_size); +} + +static char* +get_var_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + + if (!first_dot) + return strdup(name); + + return strndup(first_dot+1, strlen(first_dot) - 1); +} + +static bool +is_top_level_shader_storage_block_member(const char* name, + const char* interface_name, + const char* field_name) +{ + bool result = false; + + /* If the given variable is already a top-level shader storage + * block member, then return array_size = 1. + * We could have two possibilities: if we have an instanced + * shader storage block or not instanced. + * + * For the first, we check create a name as it was in top level and + * compare it with the real name. If they are the same, then + * the variable is already at top-level. + * + * Full instanced name is: interface name + '.' + var name + + * NULL character + */ + int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; + char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); + if (!full_instanced_name) { + fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); + return false; + } + + snprintf(full_instanced_name, name_length, "%s.%s", + interface_name, field_name); + + /* Check if its top-level shader storage block member of an + * instanced interface block, or of a unnamed interface block. + */ + if (strcmp(name, full_instanced_name) == 0 || + strcmp(name, field_name) == 0) + result = true; + + free(full_instanced_name); + return result; +} + +static int +get_array_size(struct gl_uniform_storage *uni, const glsl_struct_field *field, + char *interface_name, char *var_name) +{ + /* The ARB_program_interface_query spec says: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying + * the number of active array elements of the top-level shader storage + * block member containing to the active variable is written to + * . If the top-level block member is not declared as an + * array, the value one is written to . If the top-level block + * member is an array with no declared size, the value zero is written + * to ." + */ + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) + return 1; + else if (field->type->is_array()) + return field->type->length; + + return 1; +} + +static int +get_array_stride(struct gl_uniform_storage *uni, const glsl_type *iface, + const glsl_struct_field *field, char *interface_name, + char *var_name, bool use_std430_as_default) +{ + /* The ARB_program_interface_query spec says: + * + * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer + * identifying the stride between array elements of the top-level + * shader storage block member containing the active variable is + * written to . For top-level block members declared as + * arrays, the value written is the difference, in basic machine units, + * between the offsets of the active variable for consecutive elements + * in the top-level array. For top-level block members not declared as + * an array, zero is written to ." + */ + if (field->type->is_array()) { + const enum glsl_matrix_layout matrix_layout = + glsl_matrix_layout(field->matrix_layout); + bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; + const glsl_type *array_type = field->type->fields.array; + + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) + return 0; + + if (GLSL_INTERFACE_PACKING_STD140 == + iface->get_internal_ifc_packing(use_std430_as_default)) { + if (array_type->is_struct() || array_type->is_array()) + return glsl_align(array_type->std140_size(row_major), 16); + else + return MAX2(array_type->std140_base_alignment(row_major), 16); + } else { + return array_type->std430_array_stride(row_major); + } + } + return 0; +} + +static void +calculate_array_size_and_stride(struct gl_shader_program *shProg, + struct gl_uniform_storage *uni, + bool use_std430_as_default) +{ + if (!uni->is_shader_storage) + return; + + int block_index = uni->block_index; + int array_size = -1; + int array_stride = -1; + char *var_name = get_top_level_name(uni->name); + char *interface_name = + get_top_level_name(uni->is_shader_storage ? + shProg->data->ShaderStorageBlocks[block_index].Name : + shProg->data->UniformBlocks[block_index].Name); + + if (strcmp(var_name, interface_name) == 0) { + /* Deal with instanced array of SSBOs */ + char *temp_name = get_var_name(uni->name); + if (!temp_name) { + linker_error(shProg, "Out of memory during linking.\n"); + goto write_top_level_array_size_and_stride; + } + free(var_name); + var_name = get_top_level_name(temp_name); + free(temp_name); + if (!var_name) { + linker_error(shProg, "Out of memory during linking.\n"); + goto write_top_level_array_size_and_stride; + } + } + + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + const gl_linked_shader *sh = shProg->_LinkedShaders[i]; + if (sh == NULL) + continue; + + foreach_in_list(ir_instruction, node, sh->ir) { + ir_variable *var = node->as_variable(); + if (!var || !var->get_interface_type() || + var->data.mode != ir_var_shader_storage) + continue; + + const glsl_type *iface = var->get_interface_type(); + + if (strcmp(interface_name, iface->name) != 0) + continue; + + for (unsigned i = 0; i < iface->length; i++) { + const glsl_struct_field *field = &iface->fields.structure[i]; + if (strcmp(field->name, var_name) != 0) + continue; + + array_stride = get_array_stride(uni, iface, field, interface_name, + var_name, use_std430_as_default); + array_size = get_array_size(uni, field, interface_name, var_name); + goto write_top_level_array_size_and_stride; + } + } + } +write_top_level_array_size_and_stride: + free(interface_name); + free(var_name); + uni->top_level_array_stride = array_stride; + uni->top_level_array_size = array_size; +} + void program_resource_visitor::process(const glsl_type *type, const char *name, bool use_std430_as_default) @@ -844,7 +1059,8 @@ this->uniforms[id].opaque[shader_type].index = ~0; this->uniforms[id].opaque[shader_type].active = false; - this->uniforms[id].active_shader_mask |= 1 << shader_type; + if (current_var->data.used || base_type->is_subroutine()) + this->uniforms[id].active_shader_mask |= 1 << shader_type; /* This assigns uniform indices to sampler and image uniforms. */ handle_samplers(base_type, &this->uniforms[id], name); @@ -951,6 +1167,9 @@ !this->uniforms[id].is_shader_storage && this->buffer_block_index == -1) this->values += type->component_slots(); + + calculate_array_size_and_stride(prog, &this->uniforms[id], + use_std430_as_default); } /** @@ -1505,6 +1724,22 @@ uniform_size.process(var); } + if (uniform_size.num_shader_samplers > + ctx->Const.Program[i].MaxTextureImageUnits) { + linker_error(prog, "Too many %s shader texture samplers\n", + _mesa_shader_stage_to_string(i)); + continue; + } + + if (uniform_size.num_shader_images > + ctx->Const.Program[i].MaxImageUniforms) { + linker_error(prog, "Too many %s shader image uniforms (%u > %u)\n", + _mesa_shader_stage_to_string(i), + sh->Program->info.num_images, + ctx->Const.Program[i].MaxImageUniforms); + continue; + } + sh->Program->info.num_textures = uniform_size.num_shader_samplers; sh->Program->info.num_images = uniform_size.num_shader_images; sh->num_uniform_components = uniform_size.num_shader_uniform_components; @@ -1516,6 +1751,11 @@ } } + if (prog->data->LinkStatus == LINKING_FAILURE) { + delete hiddenUniforms; + return; + } + prog->data->NumUniformStorage = uniform_size.num_active_uniforms; prog->data->NumHiddenUniforms = uniform_size.num_hidden_uniforms; diff -Nru mesa-19.2.8/src/compiler/glsl/link_varyings.cpp mesa-20.0.8/src/compiler/glsl/link_varyings.cpp --- mesa-19.2.8/src/compiler/glsl/link_varyings.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/link_varyings.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -2051,7 +2051,7 @@ const glsl_type *type = get_varying_type(producer_var, producer_stage); if (type->is_array() || type->is_matrix() || type->is_struct() || - type->is_double()) { + type->is_64bit()) { unsigned comp_slots = type->component_slots() + offset; unsigned slots = comp_slots / 4; if (comp_slots % 4) @@ -2583,13 +2583,13 @@ consumer ? consumer->Stage : MESA_SHADER_NONE); void *hash_table_ctx = ralloc_context(NULL); hash_table *tfeedback_candidates = - _mesa_hash_table_create(hash_table_ctx, _mesa_key_hash_string, + _mesa_hash_table_create(hash_table_ctx, _mesa_hash_string, _mesa_key_string_equal); hash_table *consumer_inputs = - _mesa_hash_table_create(hash_table_ctx, _mesa_key_hash_string, + _mesa_hash_table_create(hash_table_ctx, _mesa_hash_string, _mesa_key_string_equal); hash_table *consumer_interface_inputs = - _mesa_hash_table_create(hash_table_ctx, _mesa_key_hash_string, + _mesa_hash_table_create(hash_table_ctx, _mesa_hash_string, _mesa_key_string_equal); ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX] = { NULL, diff -Nru mesa-19.2.8/src/compiler/glsl/loop_analysis.cpp mesa-20.0.8/src/compiler/glsl/loop_analysis.cpp --- mesa-19.2.8/src/compiler/glsl/loop_analysis.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/loop_analysis.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -88,7 +88,7 @@ static int calculate_iterations(ir_rvalue *from, ir_rvalue *to, ir_rvalue *increment, enum ir_expression_operation op, bool continue_from_then, - bool swap_compare_operands) + bool swap_compare_operands, bool inc_before_terminator) { if (from == NULL || to == NULL || increment == NULL) return -1; @@ -118,6 +118,32 @@ int iter_value = iter->get_int_component(0); + /* Code after this block works under assumption that iterator will be + * incremented or decremented until it hits the limit, + * however the loop condition can be false on the first iteration. + * Handle such loops first. + */ + { + ir_rvalue *first_value = from; + if (inc_before_terminator) { + first_value = + new(mem_ctx) ir_expression(ir_binop_add, from->type, from, increment); + } + + ir_expression *cmp = swap_compare_operands + ? new(mem_ctx) ir_expression(op, glsl_type::bool_type, to, first_value) + : new(mem_ctx) ir_expression(op, glsl_type::bool_type, first_value, to); + if (continue_from_then) + cmp = new(mem_ctx) ir_expression(ir_unop_logic_not, cmp); + + ir_constant *const cmp_result = cmp->constant_expression_value(mem_ctx); + assert(cmp_result != NULL); + if (cmp_result->get_bool_component(0)) { + ralloc_free(mem_ctx); + return 0; + } + } + /* Make sure that the calculated number of iterations satisfies the exit * condition. This is needed to catch off-by-one errors and some types of * ill-formed loops. For example, we need to detect that the following @@ -172,6 +198,11 @@ } ralloc_free(mem_ctx); + + if (inc_before_terminator) { + iter_value--; + } + return (valid_loop) ? iter_value : -1; } @@ -611,13 +642,13 @@ loop_variable *lv = ls->get(var); if (lv != NULL && lv->is_induction_var()) { + bool inc_before_terminator = + incremented_before_terminator(ir, var, t->ir); + t->iterations = calculate_iterations(init, limit, lv->increment, cmp, t->continue_from_then, - swap_compare_operands); - - if (incremented_before_terminator(ir, var, t->ir)) { - t->iterations--; - } + swap_compare_operands, + inc_before_terminator); if (t->iterations >= 0 && (ls->limiting_terminator == NULL || diff -Nru mesa-19.2.8/src/compiler/glsl/loop_unroll.cpp mesa-20.0.8/src/compiler/glsl/loop_unroll.cpp --- mesa-19.2.8/src/compiler/glsl/loop_unroll.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/loop_unroll.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -390,17 +390,10 @@ return visit_continue; } - if (ls->limiting_terminator != NULL) { - /* If the limiting terminator has an iteration count of zero, then we've - * proven that the loop cannot run, so delete it. - */ - int iterations = ls->limiting_terminator->iterations; - if (iterations == 0) { - ir->remove(); - this->progress = true; - return visit_continue; - } - } + /* Limiting terminator may have iteration count of zero, + * this is a valid case because the loop may break during + * the first iteration. + */ /* Remove the conditional break statements associated with all terminators * that are associated with a fixed iteration count, except for the one diff -Nru mesa-19.2.8/src/compiler/glsl/lower_const_arrays_to_uniforms.cpp mesa-20.0.8/src/compiler/glsl/lower_const_arrays_to_uniforms.cpp --- mesa-19.2.8/src/compiler/glsl/lower_const_arrays_to_uniforms.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/lower_const_arrays_to_uniforms.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -45,11 +45,13 @@ namespace { class lower_const_array_visitor : public ir_rvalue_visitor { public: - lower_const_array_visitor(exec_list *insts, unsigned s) + lower_const_array_visitor(exec_list *insts, unsigned s, + unsigned available_uni_components) { instructions = insts; stage = s; const_count = 0; + free_uni_components = available_uni_components; progress = false; } @@ -66,6 +68,7 @@ exec_list *instructions; unsigned stage; unsigned const_count; + unsigned free_uni_components; bool progress; }; @@ -85,6 +88,15 @@ if (!con || !con->type->is_array()) return; + /* How many uniform component slots are required? */ + unsigned component_slots = con->type->component_slots(); + + /* We would utilize more than is available, bail out. */ + if (component_slots > free_uni_components) + return; + + free_uni_components -= component_slots; + void *mem_ctx = ralloc_parent(con); /* In the very unlikely event of 4294967295 constant arrays in a single @@ -116,9 +128,30 @@ } /* anonymous namespace */ + +static unsigned +count_uniforms(exec_list *instructions) +{ + unsigned total = 0; + + foreach_in_list(ir_instruction, node, instructions) { + ir_variable *const var = node->as_variable(); + + if (!var || var->data.mode != ir_var_uniform) + continue; + + total += var->type->component_slots(); + } + return total; +} + bool -lower_const_arrays_to_uniforms(exec_list *instructions, unsigned stage) +lower_const_arrays_to_uniforms(exec_list *instructions, unsigned stage, + unsigned max_uniform_components) { - lower_const_array_visitor v(instructions, stage); + unsigned uniform_components = count_uniforms(instructions); + unsigned free_uniform_slots = max_uniform_components - uniform_components; + + lower_const_array_visitor v(instructions, stage, free_uniform_slots); return v.run(); } diff -Nru mesa-19.2.8/src/compiler/glsl/lower_int64.cpp mesa-20.0.8/src/compiler/glsl/lower_int64.cpp --- mesa-19.2.8/src/compiler/glsl/lower_int64.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/lower_int64.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -73,7 +73,7 @@ function_list(), added_functions(&function_list, mem_ctx) { functions = _mesa_hash_table_create(mem_ctx, - _mesa_key_hash_string, + _mesa_hash_string, _mesa_key_string_equal); foreach_in_list(ir_instruction, node, instructions) { diff -Nru mesa-19.2.8/src/compiler/glsl/lower_named_interface_blocks.cpp mesa-20.0.8/src/compiler/glsl/lower_named_interface_blocks.cpp --- mesa-19.2.8/src/compiler/glsl/lower_named_interface_blocks.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/lower_named_interface_blocks.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -125,7 +125,7 @@ void flatten_named_interface_blocks_declarations::run(exec_list *instructions) { - interface_namespace = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + interface_namespace = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); /* First pass: adjust instance block variables with an instance name diff -Nru mesa-19.2.8/src/compiler/glsl/lower_output_reads.cpp mesa-20.0.8/src/compiler/glsl/lower_output_reads.cpp --- mesa-19.2.8/src/compiler/glsl/lower_output_reads.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/lower_output_reads.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -72,7 +72,7 @@ hash_table_var_hash(const void *key) { const ir_variable * var = static_cast(key); - return _mesa_key_hash_string(var->name); + return _mesa_hash_string(var->name); } output_read_remover::output_read_remover(unsigned stage) diff -Nru mesa-19.2.8/src/compiler/glsl/meson.build mesa-20.0.8/src/compiler/glsl/meson.build --- mesa-19.2.8/src/compiler/glsl/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -85,7 +85,6 @@ 'gl_nir_link_xfb.c', 'gl_nir_linker.c', 'gl_nir_linker.h', - 'gl_nir_opt_access.c', 'gl_nir.h', 'glsl_parser_extras.cpp', 'glsl_parser_extras.h', @@ -229,7 +228,7 @@ c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], link_with : libglcpp, - include_directories : [inc_common, inc_compiler, inc_nir], + include_directories : [inc_common, inc_compiler], dependencies : idep_nir, build_by_default : false, ) @@ -240,8 +239,8 @@ c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], include_directories : [inc_common], - link_with : [libglsl, libglsl_util], - dependencies : idep_mesautil, + link_with : [libglsl, libglsl_util, libglcpp_standalone], + dependencies : [idep_mesautil, idep_getopt], build_by_default : false, ) @@ -250,7 +249,7 @@ 'main.cpp', c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], - dependencies : [dep_clock, dep_thread], + dependencies : [dep_clock, dep_thread, idep_getopt], include_directories : [inc_common], link_with : [libglsl_standalone], build_by_default : with_tools.contains('glsl'), @@ -264,7 +263,7 @@ c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], include_directories : [inc_common], - dependencies : [dep_clock, dep_thread], + dependencies : [dep_clock, dep_thread, idep_getopt], link_with : [libglsl, libglsl_standalone, libglsl_util], build_by_default : with_tools.contains('glsl'), install : with_tools.contains('glsl'), diff -Nru mesa-19.2.8/src/compiler/glsl/opt_dead_builtin_varyings.cpp mesa-20.0.8/src/compiler/glsl/opt_dead_builtin_varyings.cpp --- mesa-19.2.8/src/compiler/glsl/opt_dead_builtin_varyings.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/opt_dead_builtin_varyings.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -539,7 +539,8 @@ tfeedback_decl *tfeedback_decls) { /* Lower the gl_FragData array to separate variables. */ - if (consumer && consumer->Stage == MESA_SHADER_FRAGMENT) { + if (consumer && consumer->Stage == MESA_SHADER_FRAGMENT && + !ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) { lower_fragdata_array(consumer); } diff -Nru mesa-19.2.8/src/compiler/glsl/opt_dead_code.cpp mesa-20.0.8/src/compiler/glsl/opt_dead_code.cpp --- mesa-19.2.8/src/compiler/glsl/opt_dead_code.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/opt_dead_code.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -144,8 +144,15 @@ */ if (entry->var->is_in_buffer_block()) { if (entry->var->get_interface_type_packing() != - GLSL_INTERFACE_PACKING_PACKED) + GLSL_INTERFACE_PACKING_PACKED) { + /* Set used to false so it doesn't get set as referenced by + * the shader in the program resource list. This will also + * help avoid the state being unnecessarily flushed for the + * shader stage. + */ + entry->var->data.used = false; continue; + } } if (entry->var->type->is_subroutine()) diff -Nru mesa-19.2.8/src/compiler/glsl/program.h mesa-20.0.8/src/compiler/glsl/program.h --- mesa-19.2.8/src/compiler/glsl/program.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/program.h 2020-06-12 01:21:16.000000000 +0000 @@ -46,7 +46,8 @@ extern void build_program_resource_list(struct gl_context *ctx, - struct gl_shader_program *shProg); + struct gl_shader_program *shProg, + bool add_packed_varyings_only); extern long parse_program_resource_name(const GLchar *name, diff -Nru mesa-19.2.8/src/compiler/glsl/serialize.cpp mesa-20.0.8/src/compiler/glsl/serialize.cpp --- mesa-19.2.8/src/compiler/glsl/serialize.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/serialize.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -568,23 +568,48 @@ { remap_type_inactive_explicit_location, remap_type_null_ptr, - remap_type_uniform_offset + remap_type_uniform_offset, + remap_type_uniform_offsets_equal, }; static void -write_uniform_remap_table_entry(struct blob *metadata, - gl_uniform_storage *uniform_storage, - gl_uniform_storage *entry) -{ - if (entry == INACTIVE_UNIFORM_EXPLICIT_LOCATION) { - blob_write_uint32(metadata, remap_type_inactive_explicit_location); - } else if (entry == NULL) { - blob_write_uint32(metadata, remap_type_null_ptr); - } else { - blob_write_uint32(metadata, remap_type_uniform_offset); +write_uniform_remap_table(struct blob *metadata, + unsigned num_entries, + gl_uniform_storage *uniform_storage, + gl_uniform_storage **remap_table) +{ + blob_write_uint32(metadata, num_entries); + for (unsigned i = 0; i < num_entries; i++) { + gl_uniform_storage *entry = remap_table[i]; uint32_t offset = entry - uniform_storage; - blob_write_uint32(metadata, offset); + + if (entry == INACTIVE_UNIFORM_EXPLICIT_LOCATION) { + blob_write_uint32(metadata, remap_type_inactive_explicit_location); + } else if (entry == NULL) { + blob_write_uint32(metadata, remap_type_null_ptr); + } else if (i+1 < num_entries && entry == remap_table[i+1]) { + blob_write_uint32(metadata, remap_type_uniform_offsets_equal); + + /* If many offsets are equal, write only one offset and the number + * of consecutive entries being equal. + */ + unsigned count = 1; + for (unsigned j = i + 1; j < num_entries; j++) { + if (entry != remap_table[j]) + break; + + count++; + } + + blob_write_uint32(metadata, offset); + blob_write_uint32(metadata, count); + i += count - 1; + } else { + blob_write_uint32(metadata, remap_type_uniform_offset); + + blob_write_uint32(metadata, offset); + } } } @@ -592,80 +617,74 @@ write_uniform_remap_tables(struct blob *metadata, struct gl_shader_program *prog) { - blob_write_uint32(metadata, prog->NumUniformRemapTable); - - for (unsigned i = 0; i < prog->NumUniformRemapTable; i++) { - write_uniform_remap_table_entry(metadata, prog->data->UniformStorage, - prog->UniformRemapTable[i]); - } + write_uniform_remap_table(metadata, prog->NumUniformRemapTable, + prog->data->UniformStorage, + prog->UniformRemapTable); for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { struct gl_linked_shader *sh = prog->_LinkedShaders[i]; if (sh) { - struct gl_program *glprog = sh->Program; - blob_write_uint32(metadata, glprog->sh.NumSubroutineUniformRemapTable); - - for (unsigned j = 0; j < glprog->sh.NumSubroutineUniformRemapTable; j++) { - write_uniform_remap_table_entry(metadata, - prog->data->UniformStorage, - glprog->sh.SubroutineUniformRemapTable[j]); - } + write_uniform_remap_table(metadata, + sh->Program->sh.NumSubroutineUniformRemapTable, + prog->data->UniformStorage, + sh->Program->sh.SubroutineUniformRemapTable); } } } -static void -read_uniform_remap_table_entry(struct blob_reader *metadata, - gl_uniform_storage *uniform_storage, - gl_uniform_storage **entry, - enum uniform_remap_type type) -{ - if (type == remap_type_inactive_explicit_location) { - *entry = INACTIVE_UNIFORM_EXPLICIT_LOCATION; - } else if (type == remap_type_null_ptr) { - *entry = NULL; - } else { - uint32_t uni_offset = blob_read_uint32(metadata); - *entry = uniform_storage + uni_offset; - } -} - -static void -read_uniform_remap_tables(struct blob_reader *metadata, - struct gl_shader_program *prog) +static struct gl_uniform_storage ** +read_uniform_remap_table(struct blob_reader *metadata, + struct gl_shader_program *prog, + unsigned *num_entries, + gl_uniform_storage *uniform_storage) { - prog->NumUniformRemapTable = blob_read_uint32(metadata); + unsigned num = blob_read_uint32(metadata); + *num_entries = num; - prog->UniformRemapTable = rzalloc_array(prog, struct gl_uniform_storage *, - prog->NumUniformRemapTable); + struct gl_uniform_storage **remap_table = + rzalloc_array(prog, struct gl_uniform_storage *, num); - for (unsigned i = 0; i < prog->NumUniformRemapTable; i++) { + for (unsigned i = 0; i < num; i++) { enum uniform_remap_type type = (enum uniform_remap_type) blob_read_uint32(metadata); - read_uniform_remap_table_entry(metadata, prog->data->UniformStorage, - &prog->UniformRemapTable[i], type); + if (type == remap_type_inactive_explicit_location) { + remap_table[i] = INACTIVE_UNIFORM_EXPLICIT_LOCATION; + } else if (type == remap_type_null_ptr) { + remap_table[i] = NULL; + } else if (type == remap_type_uniform_offsets_equal) { + uint32_t uni_offset = blob_read_uint32(metadata); + uint32_t count = blob_read_uint32(metadata); + struct gl_uniform_storage *entry = uniform_storage + uni_offset; + + for (unsigned j = 0; j < count; j++) + remap_table[i+j] = entry; + i += count - 1; + } else { + uint32_t uni_offset = blob_read_uint32(metadata); + remap_table[i] = uniform_storage + uni_offset; + } } + return remap_table; +} + +static void +read_uniform_remap_tables(struct blob_reader *metadata, + struct gl_shader_program *prog) +{ + prog->UniformRemapTable = + read_uniform_remap_table(metadata, prog, &prog->NumUniformRemapTable, + prog->data->UniformStorage); for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { struct gl_linked_shader *sh = prog->_LinkedShaders[i]; if (sh) { struct gl_program *glprog = sh->Program; - glprog->sh.NumSubroutineUniformRemapTable = blob_read_uint32(metadata); glprog->sh.SubroutineUniformRemapTable = - rzalloc_array(glprog, struct gl_uniform_storage *, - glprog->sh.NumSubroutineUniformRemapTable); - - for (unsigned j = 0; j < glprog->sh.NumSubroutineUniformRemapTable; j++) { - enum uniform_remap_type type = - (enum uniform_remap_type) blob_read_uint32(metadata); - - read_uniform_remap_table_entry(metadata, - prog->data->UniformStorage, - &glprog->sh.SubroutineUniformRemapTable[j], - type); - } + read_uniform_remap_table(metadata, prog, + &glprog->sh.NumSubroutineUniformRemapTable, + prog->data->UniformStorage); } } } @@ -1015,6 +1034,8 @@ blob_write_uint32(metadata, param->DataType); blob_write_bytes(metadata, param->StateIndexes, sizeof(param->StateIndexes)); + blob_write_uint32(metadata, param->UniformStorageIndex); + blob_write_uint32(metadata, param->MainUniformStorageIndex); i++; } @@ -1046,6 +1067,10 @@ _mesa_add_parameter(params, type, name, size, data_type, NULL, state_indexes, padded); + gl_program_parameter *param = ¶ms->Parameters[i]; + param->UniformStorageIndex = blob_read_uint32(metadata); + param->MainUniformStorageIndex = blob_read_uint32(metadata); + i++; } diff -Nru mesa-19.2.8/src/compiler/glsl/standalone.cpp mesa-20.0.8/src/compiler/glsl/standalone.cpp --- mesa-19.2.8/src/compiler/glsl/standalone.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/standalone.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -134,7 +134,7 @@ initialize_context(struct gl_context *ctx, gl_api api) { initialize_context_to_defaults(ctx, api); - glsl_type_singleton_init_or_ref(); + _mesa_glsl_builtin_functions_init_or_ref(); /* The standalone compiler needs to claim support for almost * everything in order to compile the built-in functions. @@ -620,6 +620,5 @@ delete whole_program->FragDataIndexBindings; ralloc_free(whole_program); - glsl_type_singleton_decref(); - _mesa_glsl_release_builtin_functions(); + _mesa_glsl_builtin_functions_decref(); } diff -Nru mesa-19.2.8/src/compiler/glsl/string_to_uint_map.h mesa-20.0.8/src/compiler/glsl/string_to_uint_map.h --- mesa-19.2.8/src/compiler/glsl/string_to_uint_map.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/string_to_uint_map.h 2020-06-12 01:21:16.000000000 +0000 @@ -61,7 +61,7 @@ public: string_to_uint_map() { - this->ht = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + this->ht = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); } diff -Nru mesa-19.2.8/src/compiler/glsl/tests/array_refcount_test.cpp mesa-20.0.8/src/compiler/glsl/tests/array_refcount_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/array_refcount_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/array_refcount_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -93,6 +93,8 @@ void array_refcount_test::SetUp() { + glsl_type_singleton_init_or_ref(); + mem_ctx = ralloc_context(NULL); instructions.make_empty(); @@ -117,6 +119,8 @@ ralloc_free(mem_ctx); mem_ctx = NULL; + + glsl_type_singleton_decref(); } static operand diff -Nru mesa-19.2.8/src/compiler/glsl/tests/blob_test.c mesa-20.0.8/src/compiler/glsl/tests/blob_test.c --- mesa-19.2.8/src/compiler/glsl/tests/blob_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/blob_test.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,328 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/* A collection of unit tests for blob.c */ - -#include -#include -#include -#include -#include -#ifdef _MSC_VER -#include -typedef SSIZE_T ssize_t; -#endif - -#include "util/ralloc.h" -#include "blob.h" - -#define bytes_test_str "bytes_test" -#define reserve_test_str "reserve_test" - -/* This placeholder must be the same length as the next overwrite_test_str */ -#define placeholder_str "XXXXXXXXXXXXXX" -#define overwrite_test_str "overwrite_test" -#define uint32_test 0x12345678 -#define uint32_placeholder 0xDEADBEEF -#define uint32_overwrite 0xA1B2C3D4 -#define uint64_test 0x1234567890ABCDEF -#define string_test_str "string_test" - -bool error = false; - -static void -expect_equal(uint64_t expected, uint64_t actual, const char *test) -{ - if (actual != expected) { - fprintf(stderr, - "Error: Test '%s' failed: " - "Expected=%" PRIu64 ", " - "Actual=%" PRIu64 "\n", - test, expected, actual); - error = true; - } -} - -static void -expect_unequal(uint64_t expected, uint64_t actual, const char *test) -{ - if (actual == expected) { - fprintf(stderr, - "Error: Test '%s' failed: Result=%" PRIu64 ", " - "but expected something different.\n", - test, actual); - error = true; - } -} - -static void -expect_equal_str(const char *expected, const char *actual, const char *test) -{ - if (strcmp(expected, actual)) { - fprintf (stderr, "Error: Test '%s' failed:\n\t" - "Expected=\"%s\", Actual=\"%s\"\n", - test, expected, actual); - error = true; - } -} - -static void -expect_equal_bytes(uint8_t *expected, const uint8_t *actual, - size_t num_bytes, const char *test) -{ - size_t i; - - if (memcmp(expected, actual, num_bytes)) { - fprintf (stderr, "Error: Test '%s' failed:\n\t", test); - - fprintf (stderr, "Expected=["); - for (i = 0; i < num_bytes; i++) { - if (i != 0) - fprintf(stderr, ", "); - fprintf(stderr, "0x%02x", expected[i]); - } - fprintf (stderr, "]"); - - fprintf (stderr, "Actual=["); - for (i = 0; i < num_bytes; i++) { - if (i != 0) - fprintf(stderr, ", "); - fprintf(stderr, "0x%02x", actual[i]); - } - fprintf (stderr, "]\n"); - - error = true; - } -} - -/* Test at least one call of each blob_write_foo and blob_read_foo function, - * verifying that we read out everything we wrote, that every bytes is - * consumed, and that the overrun bit is not set. - */ -static void -test_write_and_read_functions (void) -{ - struct blob blob; - struct blob_reader reader; - ssize_t reserved; - size_t str_offset, uint_offset; - uint8_t reserve_buf[sizeof(reserve_test_str)]; - - blob_init(&blob); - - /*** Test blob by writing one of every possible kind of value. */ - - blob_write_bytes(&blob, bytes_test_str, sizeof(bytes_test_str)); - - reserved = blob_reserve_bytes(&blob, sizeof(reserve_test_str)); - blob_overwrite_bytes(&blob, reserved, reserve_test_str, sizeof(reserve_test_str)); - - /* Write a placeholder, (to be replaced later via overwrite_bytes) */ - str_offset = blob.size; - blob_write_bytes(&blob, placeholder_str, sizeof(placeholder_str)); - - blob_write_uint32(&blob, uint32_test); - - /* Write a placeholder, (to be replaced later via overwrite_uint32) */ - uint_offset = blob.size; - blob_write_uint32(&blob, uint32_placeholder); - - blob_write_uint64(&blob, uint64_test); - - blob_write_intptr(&blob, (intptr_t) &blob); - - blob_write_string(&blob, string_test_str); - - /* Finally, overwrite our placeholders. */ - blob_overwrite_bytes(&blob, str_offset, overwrite_test_str, - sizeof(overwrite_test_str)); - blob_overwrite_uint32(&blob, uint_offset, uint32_overwrite); - - /*** Now read each value and verify. */ - blob_reader_init(&reader, blob.data, blob.size); - - expect_equal_str(bytes_test_str, - blob_read_bytes(&reader, sizeof(bytes_test_str)), - "blob_write/read_bytes"); - - blob_copy_bytes(&reader, reserve_buf, sizeof(reserve_buf)); - expect_equal_str(reserve_test_str, (char *) reserve_buf, - "blob_reserve_bytes/blob_copy_bytes"); - - expect_equal_str(overwrite_test_str, - blob_read_bytes(&reader, sizeof(overwrite_test_str)), - "blob_overwrite_bytes"); - - expect_equal(uint32_test, blob_read_uint32(&reader), - "blob_write/read_uint32"); - expect_equal(uint32_overwrite, blob_read_uint32(&reader), - "blob_overwrite_uint32"); - expect_equal(uint64_test, blob_read_uint64(&reader), - "blob_write/read_uint64"); - expect_equal((intptr_t) &blob, blob_read_intptr(&reader), - "blob_write/read_intptr"); - expect_equal_str(string_test_str, blob_read_string(&reader), - "blob_write/read_string"); - - expect_equal(reader.end - reader.data, reader.current - reader.data, - "read_consumes_all_bytes"); - expect_equal(false, reader.overrun, "read_does_not_overrun"); - - blob_finish(&blob); -} - -/* Test that data values are written and read with proper alignment. */ -static void -test_alignment(void) -{ - struct blob blob; - struct blob_reader reader; - uint8_t bytes[] = "ABCDEFGHIJKLMNOP"; - size_t delta, last, num_bytes; - - blob_init(&blob); - - /* First, write an intptr value to the blob and capture that size. This is - * the expected offset between any pair of intptr values (if written with - * alignment). - */ - blob_write_intptr(&blob, (intptr_t) &blob); - - delta = blob.size; - last = blob.size; - - /* Then loop doing the following: - * - * 1. Write an unaligned number of bytes - * 2. Verify that write results in an unaligned size - * 3. Write an intptr_t value - * 2. Verify that that write results in an aligned size - */ - for (num_bytes = 1; num_bytes < sizeof(intptr_t); num_bytes++) { - blob_write_bytes(&blob, bytes, num_bytes); - - expect_unequal(delta, blob.size - last, "unaligned write of bytes"); - - blob_write_intptr(&blob, (intptr_t) &blob); - - expect_equal(2 * delta, blob.size - last, "aligned write of intptr"); - - last = blob.size; - } - - /* Finally, test that reading also does proper alignment. Since we know - * that values were written with all the right alignment, all we have to do - * here is verify that correct values are read. - */ - blob_reader_init(&reader, blob.data, blob.size); - - expect_equal((intptr_t) &blob, blob_read_intptr(&reader), - "read of initial, aligned intptr_t"); - - for (num_bytes = 1; num_bytes < sizeof(intptr_t); num_bytes++) { - expect_equal_bytes(bytes, blob_read_bytes(&reader, num_bytes), - num_bytes, "unaligned read of bytes"); - expect_equal((intptr_t) &blob, blob_read_intptr(&reader), - "aligned read of intptr_t"); - } - - blob_finish(&blob); -} - -/* Test that we detect overrun. */ -static void -test_overrun(void) -{ - struct blob blob; - struct blob_reader reader; - uint32_t value = 0xdeadbeef; - - blob_init(&blob); - - blob_write_uint32(&blob, value); - - blob_reader_init(&reader, blob.data, blob.size); - - expect_equal(value, blob_read_uint32(&reader), "read before overrun"); - expect_equal(false, reader.overrun, "overrun flag not set"); - expect_equal(0, blob_read_uint32(&reader), "read at overrun"); - expect_equal(true, reader.overrun, "overrun flag set"); - - blob_finish(&blob); -} - -/* Test that we can read and write some large objects, (exercising the code in - * the blob_write functions to realloc blob->data. - */ -static void -test_big_objects(void) -{ - void *ctx = ralloc_context(NULL); - struct blob blob; - struct blob_reader reader; - int size = 1000; - int count = 1000; - size_t i; - char *buf; - - blob_init(&blob); - - /* Initialize our buffer. */ - buf = ralloc_size(ctx, size); - for (i = 0; i < size; i++) { - buf[i] = i % 256; - } - - /* Write it many times. */ - for (i = 0; i < count; i++) { - blob_write_bytes(&blob, buf, size); - } - - blob_reader_init(&reader, blob.data, blob.size); - - /* Read and verify it many times. */ - for (i = 0; i < count; i++) { - expect_equal_bytes((uint8_t *) buf, blob_read_bytes(&reader, size), size, - "read of large objects"); - } - - expect_equal(reader.end - reader.data, reader.current - reader.data, - "number of bytes read reading large objects"); - - expect_equal(false, reader.overrun, - "overrun flag not set reading large objects"); - - blob_finish(&blob); - ralloc_free(ctx); -} - -int -main (void) -{ - test_write_and_read_functions (); - test_alignment (); - test_overrun (); - test_big_objects (); - - return error ? 1 : 0; -} diff -Nru mesa-19.2.8/src/compiler/glsl/tests/builtin_variable_test.cpp mesa-20.0.8/src/compiler/glsl/tests/builtin_variable_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/builtin_variable_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/builtin_variable_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -57,6 +57,8 @@ void common_builtin::SetUp() { + glsl_type_singleton_init_or_ref(); + this->mem_ctx = ralloc_context(NULL); this->ir.make_empty(); @@ -79,6 +81,8 @@ { ralloc_free(this->mem_ctx); this->mem_ctx = NULL; + + glsl_type_singleton_decref(); } void diff -Nru mesa-19.2.8/src/compiler/glsl/tests/cache_test.c mesa-20.0.8/src/compiler/glsl/tests/cache_test.c --- mesa-19.2.8/src/compiler/glsl/tests/cache_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/cache_test.c 2020-06-12 01:21:16.000000000 +0000 @@ -162,26 +162,6 @@ return false; } -static void -wait_until_file_written(struct disk_cache *cache, const cache_key key) -{ - struct timespec req; - struct timespec rem; - - /* Set 100ms delay */ - req.tv_sec = 0; - req.tv_nsec = 100000000; - - unsigned retries = 0; - while (retries++ < 20) { - if (does_cache_contain(cache, key)) { - break; - } - - nanosleep(&req, &rem); - } -} - static void * cache_exists(struct disk_cache *cache) { @@ -192,7 +172,7 @@ return NULL; disk_cache_put(cache, dummy_key, data, sizeof(data), NULL); - wait_until_file_written(cache, dummy_key); + disk_cache_wait_for_idle(cache); return disk_cache_get(cache, dummy_key, NULL); } @@ -286,10 +266,8 @@ /* Simple test of put and get. */ disk_cache_put(cache, blob_key, blob, sizeof(blob), NULL); - /* disk_cache_put() hands things off to a thread give it some time to - * finish. - */ - wait_until_file_written(cache, blob_key); + /* disk_cache_put() hands things off to a thread so wait for it. */ + disk_cache_wait_for_idle(cache); result = disk_cache_get(cache, blob_key, &size); expect_equal_str(blob, result, "disk_cache_get of existing item (pointer)"); @@ -301,10 +279,8 @@ disk_cache_compute_key(cache, string, sizeof(string), string_key); disk_cache_put(cache, string_key, string, sizeof(string), NULL); - /* disk_cache_put() hands things off to a thread give it some time to - * finish. - */ - wait_until_file_written(cache, string_key); + /* disk_cache_put() hands things off to a thread so wait for it. */ + disk_cache_wait_for_idle(cache); result = disk_cache_get(cache, string_key, &size); expect_equal_str(result, string, "2nd disk_cache_get of existing item (pointer)"); @@ -344,10 +320,8 @@ free(one_KB); - /* disk_cache_put() hands things off to a thread give it some time to - * finish. - */ - wait_until_file_written(cache, one_KB_key); + /* disk_cache_put() hands things off to a thread so wait for it. */ + disk_cache_wait_for_idle(cache); result = disk_cache_get(cache, one_KB_key, &size); expect_non_null(result, "3rd disk_cache_get of existing item (pointer)"); @@ -386,11 +360,8 @@ disk_cache_put(cache, blob_key, blob, sizeof(blob), NULL); disk_cache_put(cache, string_key, string, sizeof(string), NULL); - /* disk_cache_put() hands things off to a thread give it some time to - * finish. - */ - wait_until_file_written(cache, blob_key); - wait_until_file_written(cache, string_key); + /* disk_cache_put() hands things off to a thread so wait for it. */ + disk_cache_wait_for_idle(cache); count = 0; if (does_cache_contain(cache, blob_key)) @@ -414,10 +385,8 @@ free(one_MB); - /* disk_cache_put() hands things off to a thread give it some time to - * finish. - */ - wait_until_file_written(cache, one_MB_key); + /* disk_cache_put() hands things off to a thread so wait for it. */ + disk_cache_wait_for_idle(cache); bool contains_1MB_file = false; count = 0; diff -Nru mesa-19.2.8/src/compiler/glsl/tests/copy_constant_to_storage_tests.cpp mesa-20.0.8/src/compiler/glsl/tests/copy_constant_to_storage_tests.cpp --- mesa-19.2.8/src/compiler/glsl/tests/copy_constant_to_storage_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/copy_constant_to_storage_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -54,6 +54,8 @@ void copy_constant_to_storage::SetUp() { + glsl_type_singleton_init_or_ref(); + this->mem_ctx = ralloc_context(NULL); } @@ -62,6 +64,8 @@ { ralloc_free(this->mem_ctx); this->mem_ctx = NULL; + + glsl_type_singleton_decref(); } void diff -Nru mesa-19.2.8/src/compiler/glsl/tests/general_ir_test.cpp mesa-20.0.8/src/compiler/glsl/tests/general_ir_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/general_ir_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/general_ir_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -26,7 +26,25 @@ #include "main/macros.h" #include "ir.h" -TEST(ir_variable_constructor, interface) +class ir_variable_constructor : public ::testing::Test { +public: + virtual void SetUp(); + virtual void TearDown(); +}; + +void +ir_variable_constructor::SetUp() +{ + glsl_type_singleton_init_or_ref(); +} + +void +ir_variable_constructor::TearDown() +{ + glsl_type_singleton_decref(); +} + +TEST_F(ir_variable_constructor, interface) { void *mem_ctx = ralloc_context(NULL); @@ -52,7 +70,7 @@ EXPECT_EQ(iface, v->get_interface_type()); } -TEST(ir_variable_constructor, interface_array) +TEST_F(ir_variable_constructor, interface_array) { void *mem_ctx = ralloc_context(NULL); diff -Nru mesa-19.2.8/src/compiler/glsl/tests/invalidate_locations_test.cpp mesa-20.0.8/src/compiler/glsl/tests/invalidate_locations_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/invalidate_locations_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/invalidate_locations_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -46,6 +46,8 @@ void invalidate_locations::SetUp() { + glsl_type_singleton_init_or_ref(); + this->mem_ctx = ralloc_context(NULL); this->ir.make_empty(); } @@ -55,6 +57,8 @@ { ralloc_free(this->mem_ctx); this->mem_ctx = NULL; + + glsl_type_singleton_decref(); } TEST_F(invalidate_locations, simple_vertex_in_generic) diff -Nru mesa-19.2.8/src/compiler/glsl/tests/lower_int64_test.cpp mesa-20.0.8/src/compiler/glsl/tests/lower_int64_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/lower_int64_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/lower_int64_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -57,6 +57,8 @@ void expand_source::SetUp() { + glsl_type_singleton_init_or_ref(); + mem_ctx = ralloc_context(NULL); memset(expanded_src, 0, sizeof(expanded_src)); @@ -72,6 +74,8 @@ ralloc_free(mem_ctx); mem_ctx = NULL; + + glsl_type_singleton_decref(); } static ir_dereference_variable * diff -Nru mesa-19.2.8/src/compiler/glsl/tests/meson.build mesa-20.0.8/src/compiler/glsl/tests/meson.build --- mesa-19.2.8/src/compiler/glsl/tests/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -18,18 +18,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -test( - 'blob_test', - executable( - 'blob_test', - 'blob_test.c', - c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], - include_directories : [inc_common, inc_compiler], - link_with : [libglsl], - ), - suite : ['compiler', 'glsl'], -) - if with_shader_cache test( 'cache_test', diff -Nru mesa-19.2.8/src/compiler/glsl/tests/opt_add_neg_to_sub_test.cpp mesa-20.0.8/src/compiler/glsl/tests/opt_add_neg_to_sub_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/opt_add_neg_to_sub_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/opt_add_neg_to_sub_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -44,6 +44,8 @@ void add_neg_to_sub::SetUp() { + glsl_type_singleton_init_or_ref(); + mem_ctx = ralloc_context(NULL); instructions.make_empty(); @@ -70,6 +72,8 @@ ralloc_free(mem_ctx); mem_ctx = NULL; + + glsl_type_singleton_decref(); } TEST_F(add_neg_to_sub, a_plus_b) diff -Nru mesa-19.2.8/src/compiler/glsl/tests/optimization_test.py mesa-20.0.8/src/compiler/glsl/tests/optimization_test.py --- mesa-19.2.8/src/compiler/glsl/tests/optimization_test.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/optimization_test.py 2020-06-12 01:21:16.000000000 +0000 @@ -24,12 +24,21 @@ from __future__ import print_function import argparse import difflib +import errno +import os import subprocess import sys import sexps import lower_jump_cases +# The meson version handles windows paths better, but if it's not available +# fall back to shlex +try: + from meson.mesonlib import split_args +except ImportError: + from shlex import split as split_args + def arg_parser(): parser = argparse.ArgumentParser() @@ -54,6 +63,14 @@ return difflib.unified_diff(expected.splitlines(), actual.splitlines()) +def get_test_runner(runner): + """Wrap the test runner in the exe wrapper if necessary.""" + wrapper = os.environ.get('MESON_EXE_WRAPPER', None) + if wrapper is None: + return [runner] + return split_args(wrapper) + [runner] + + def main(): """Generate each test and report pass or fail.""" args = arg_parser() @@ -61,12 +78,14 @@ total = 0 passes = 0 + runner = get_test_runner(args.test_runner) + for gen in lower_jump_cases.CASES: for name, opt, source, expected in gen(): total += 1 print('{}: '.format(name), end='') proc = subprocess.Popen( - [args.test_runner, 'optpass', '--quiet', '--input-ir', opt], + runner + ['optpass', '--quiet', '--input-ir', opt], stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) @@ -93,4 +112,10 @@ if __name__ == '__main__': - main() + try: + main() + except OSError as e: + if e.errno == errno.ENOEXEC: + print('Skipping due to inability to run host binaries', file=sys.stderr) + sys.exit(77) + raise diff -Nru mesa-19.2.8/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp mesa-20.0.8/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp --- mesa-19.2.8/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -68,6 +68,8 @@ void set_uniform_initializer::SetUp() { + glsl_type_singleton_init_or_ref(); + this->mem_ctx = ralloc_context(NULL); this->prog = rzalloc(NULL, struct gl_shader_program); this->prog->data = rzalloc(this->prog, struct gl_shader_program_data); @@ -86,6 +88,8 @@ ralloc_free(this->prog); this->prog = NULL; + + glsl_type_singleton_decref(); } /** diff -Nru mesa-19.2.8/src/compiler/glsl/tests/sexps.py mesa-20.0.8/src/compiler/glsl/tests/sexps.py --- mesa-19.2.8/src/compiler/glsl/tests/sexps.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/sexps.py 2020-06-12 01:21:16.000000000 +0000 @@ -52,7 +52,7 @@ into a sexp represented as nested lists containing strings. """ sexp_token_regexp = re.compile( - '[a-zA-Z_]+(@[0-9]+)?|[0-9]+(\\.[0-9]+)?|[^ \n]') + '[a-zA-Z_]+(@[0-9]+)?|[0-9]+(\\.[0-9]+)?|[^ \r?\n]') stack = [[]] for match in sexp_token_regexp.finditer(sexp): token = match.group(0) diff -Nru mesa-19.2.8/src/compiler/glsl/tests/varyings_test.cpp mesa-20.0.8/src/compiler/glsl/tests/varyings_test.cpp --- mesa-19.2.8/src/compiler/glsl/tests/varyings_test.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/varyings_test.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -51,8 +51,6 @@ class link_varyings : public ::testing::Test { public: - link_varyings(); - virtual void SetUp(); virtual void TearDown(); @@ -73,33 +71,33 @@ ir_variable *junk[VARYING_SLOT_TESS_MAX]; }; -link_varyings::link_varyings() -{ - static const glsl_struct_field f[] = { - glsl_struct_field(glsl_type::vec(4), "v") - }; - - this->simple_interface = - glsl_type::get_interface_instance(f, - ARRAY_SIZE(f), - GLSL_INTERFACE_PACKING_STD140, - false, - "simple_interface"); -} - void link_varyings::SetUp() { + glsl_type_singleton_init_or_ref(); + this->mem_ctx = ralloc_context(NULL); this->ir.make_empty(); this->consumer_inputs = - _mesa_hash_table_create(NULL, _mesa_key_hash_string, + _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); this->consumer_interface_inputs = - _mesa_hash_table_create(NULL, _mesa_key_hash_string, + _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); + + /* Needs to happen after glsl type initialization */ + static const glsl_struct_field f[] = { + glsl_struct_field(glsl_type::vec(4), "v") + }; + + this->simple_interface = + glsl_type::get_interface_instance(f, + ARRAY_SIZE(f), + GLSL_INTERFACE_PACKING_STD140, + false, + "simple_interface"); } void @@ -112,6 +110,8 @@ this->consumer_inputs = NULL; _mesa_hash_table_destroy(this->consumer_interface_inputs, NULL); this->consumer_interface_inputs = NULL; + + glsl_type_singleton_decref(); } TEST_F(link_varyings, single_simple_input) diff -Nru mesa-19.2.8/src/compiler/glsl/tests/warnings_test.py mesa-20.0.8/src/compiler/glsl/tests/warnings_test.py --- mesa-19.2.8/src/compiler/glsl/tests/warnings_test.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl/tests/warnings_test.py 2020-06-12 01:21:16.000000000 +0000 @@ -21,8 +21,17 @@ from __future__ import print_function import argparse +import errno import os import subprocess +import sys + +# The meson version handles windows paths better, but if it's not available +# fall back to shlex +try: + from meson.mesonlib import split_args +except ImportError: + from shlex import split as split_args def arg_parser(): @@ -38,6 +47,14 @@ return parser.parse_args() +def get_test_runner(runner): + """Wrap the test runner in the exe wrapper if necessary.""" + wrapper = os.environ.get('MESON_EXE_WRAPPER', None) + if wrapper is None: + return [runner] + return split_args(wrapper) + [runner] + + def main(): args = arg_parser() files = [f for f in os.listdir(args.test_directory) if f.endswith('.vert')] @@ -47,17 +64,19 @@ print('Could not find any tests') exit(1) + runner = get_test_runner(args.glsl_compiler) + print('====== Testing compilation output ======') for file in files: print('Testing {} ...'.format(file), end='') file = os.path.join(args.test_directory, file) with open('{}.expected'.format(file), 'rb') as f: - expected = f.read().strip() + expected = f.read().splitlines() actual = subprocess.check_output( - [args.glsl_compiler, '--just-log', '--version', '150', file] - ).strip() + runner + ['--just-log', '--version', '150', file] + ).splitlines() if actual == expected: print('PASS') @@ -70,4 +89,10 @@ if __name__ == '__main__': - main() + try: + main() + except OSError as e: + if e.errno == errno.ENOEXEC: + print('Skipping due to inability to run host binaries', file=sys.stderr) + sys.exit(77) + raise diff -Nru mesa-19.2.8/src/compiler/glsl_types.cpp mesa-20.0.8/src/compiler/glsl_types.cpp --- mesa-19.2.8/src/compiler/glsl_types.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl_types.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -486,7 +486,6 @@ glsl_type_singleton_decref() { mtx_lock(&glsl_type::hash_mutex); - assert(glsl_type_users > 0); /* Do not release glsl_types if they are still used. */ @@ -639,10 +638,11 @@ explicit_stride, row_major ? "RM" : ""); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (explicit_matrix_types == NULL) { explicit_matrix_types = - _mesa_hash_table_create(NULL, _mesa_key_hash_string, + _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); } @@ -663,9 +663,11 @@ assert(((glsl_type *) entry->data)->matrix_columns == columns); assert(((glsl_type *) entry->data)->explicit_stride == explicit_stride); + const glsl_type *t = (const glsl_type *) entry->data; + mtx_unlock(&glsl_type::hash_mutex); - return (const glsl_type *) entry->data; + return t; } assert(!row_major); @@ -1004,9 +1006,10 @@ explicit_stride); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (array_types == NULL) { - array_types = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + array_types = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); } @@ -1023,9 +1026,11 @@ assert(((glsl_type *) entry->data)->length == array_size); assert(((glsl_type *) entry->data)->fields.array == base); + glsl_type *t = (glsl_type *) entry->data; + mtx_unlock(&glsl_type::hash_mutex); - return (glsl_type *) entry->data; + return t; } bool @@ -1204,6 +1209,7 @@ const glsl_type key(fields, num_fields, name, packed); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (struct_types == NULL) { struct_types = _mesa_hash_table_create(NULL, record_key_hash, @@ -1223,9 +1229,11 @@ assert(strcmp(((glsl_type *) entry->data)->name, name) == 0); assert(((glsl_type *) entry->data)->packed == packed); + glsl_type *t = (glsl_type *) entry->data; + mtx_unlock(&glsl_type::hash_mutex); - return (glsl_type *) entry->data; + return t; } @@ -1239,6 +1247,7 @@ const glsl_type key(fields, num_fields, packing, row_major, block_name); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (interface_types == NULL) { interface_types = _mesa_hash_table_create(NULL, record_key_hash, @@ -1258,9 +1267,11 @@ assert(((glsl_type *) entry->data)->length == num_fields); assert(strcmp(((glsl_type *) entry->data)->name, block_name) == 0); + glsl_type *t = (glsl_type *) entry->data; + mtx_unlock(&glsl_type::hash_mutex); - return (glsl_type *) entry->data; + return t; } const glsl_type * @@ -1269,6 +1280,7 @@ const glsl_type key(subroutine_name); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (subroutine_types == NULL) { subroutine_types = _mesa_hash_table_create(NULL, record_key_hash, @@ -1286,9 +1298,11 @@ assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_SUBROUTINE); assert(strcmp(((glsl_type *) entry->data)->name, subroutine_name) == 0); + glsl_type *t = (glsl_type *) entry->data; + mtx_unlock(&glsl_type::hash_mutex); - return (glsl_type *) entry->data; + return t; } @@ -1322,6 +1336,7 @@ const glsl_type key(return_type, params, num_params); mtx_lock(&glsl_type::hash_mutex); + assert(glsl_type_users > 0); if (function_types == NULL) { function_types = _mesa_hash_table_create(NULL, function_key_hash, @@ -2427,7 +2442,7 @@ } unsigned -glsl_type::count_attribute_slots(bool is_gl_vertex_input) const +glsl_type::count_vec4_slots(bool is_gl_vertex_input, bool is_bindless) const { /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec: * @@ -2464,8 +2479,6 @@ case GLSL_TYPE_FLOAT: case GLSL_TYPE_FLOAT16: case GLSL_TYPE_BOOL: - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: return this->matrix_columns; case GLSL_TYPE_DOUBLE: case GLSL_TYPE_UINT64: @@ -2480,7 +2493,7 @@ for (unsigned i = 0; i < this->length; i++) { const glsl_type *member_type = this->fields.structure[i].type; - size += member_type->count_attribute_slots(is_gl_vertex_input); + size += member_type->count_vec4_slots(is_gl_vertex_input, is_bindless); } return size; @@ -2488,9 +2501,17 @@ case GLSL_TYPE_ARRAY: { const glsl_type *element = this->fields.array; - return this->length * element->count_attribute_slots(is_gl_vertex_input); + return this->length * element->count_vec4_slots(is_gl_vertex_input, + is_bindless); } + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: + if (!is_bindless) + return 0; + else + return 1; + case GLSL_TYPE_SUBROUTINE: return 1; @@ -2506,6 +2527,58 @@ return 0; } +unsigned +glsl_type::count_dword_slots(bool is_bindless) const +{ + switch (this->base_type) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_BOOL: + return this->components(); + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + case GLSL_TYPE_FLOAT16: + return DIV_ROUND_UP(this->components(), 2); + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + return DIV_ROUND_UP(this->components(), 4); + case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SAMPLER: + if (!is_bindless) + return 0; + /* FALLTHROUGH */ + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_INT64: + return this->components() * 2; + case GLSL_TYPE_ARRAY: + return this->fields.array->count_dword_slots(is_bindless) * + this->length; + + case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_STRUCT: { + unsigned size = 0; + for (unsigned i = 0; i < this->length; i++) { + size += this->fields.structure[i].type->count_dword_slots(is_bindless); + } + return size; + } + + case GLSL_TYPE_ATOMIC_UINT: + return 0; + case GLSL_TYPE_SUBROUTINE: + return 1; + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_FUNCTION: + default: + unreachable("invalid type in st_glsl_type_dword_size()"); + } + + return 0; +} + int glsl_type::coordinate_components() const { @@ -2568,16 +2641,49 @@ sizeof(((glsl_struct_field *)0)->name); } +union packed_type { + uint32_t u32; + struct { + unsigned base_type:5; + unsigned interface_row_major:1; + unsigned vector_elements:3; + unsigned matrix_columns:3; + unsigned explicit_stride:20; + } basic; + struct { + unsigned base_type:5; + unsigned dimensionality:4; + unsigned shadow:1; + unsigned array:1; + unsigned sampled_type:2; + unsigned _pad:19; + } sampler; + struct { + unsigned base_type:5; + unsigned length:13; + unsigned explicit_stride:14; + } array; + struct { + unsigned base_type:5; + unsigned interface_packing_or_packed:2; + unsigned interface_row_major:1; + unsigned length:24; + } strct; +}; + void encode_type_to_blob(struct blob *blob, const glsl_type *type) { - uint32_t encoding; - if (!type) { blob_write_uint32(blob, 0); return; } + STATIC_ASSERT(sizeof(union packed_type) == 4); + union packed_type encoded; + encoded.u32 = 0; + encoded.basic.base_type = type->base_type; + switch (type->base_type) { case GLSL_TYPE_UINT: case GLSL_TYPE_INT: @@ -2591,45 +2697,68 @@ case GLSL_TYPE_UINT64: case GLSL_TYPE_INT64: case GLSL_TYPE_BOOL: - encoding = (type->base_type << 24) | - (type->interface_row_major << 10) | - (type->vector_elements << 4) | - (type->matrix_columns); - blob_write_uint32(blob, encoding); - blob_write_uint32(blob, type->explicit_stride); + encoded.basic.interface_row_major = type->interface_row_major; + assert(type->matrix_columns < 8); + if (type->vector_elements <= 4) + encoded.basic.vector_elements = type->vector_elements; + else if (type->vector_elements == 8) + encoded.basic.vector_elements = 5; + else if (type->vector_elements == 16) + encoded.basic.vector_elements = 6; + encoded.basic.matrix_columns = type->matrix_columns; + encoded.basic.explicit_stride = MIN2(type->explicit_stride, 0xfffff); + blob_write_uint32(blob, encoded.u32); + /* If we don't have enough bits for explicit_stride, store it + * separately. + */ + if (encoded.basic.explicit_stride == 0xfffff) + blob_write_uint32(blob, type->explicit_stride); return; case GLSL_TYPE_SAMPLER: - encoding = (type->base_type) << 24 | - (type->sampler_dimensionality << 4) | - (type->sampler_shadow << 3) | - (type->sampler_array << 2) | - (type->sampled_type); + encoded.sampler.dimensionality = type->sampler_dimensionality; + encoded.sampler.shadow = type->sampler_shadow; + encoded.sampler.array = type->sampler_array; + encoded.sampler.sampled_type = type->sampled_type; break; case GLSL_TYPE_SUBROUTINE: - encoding = type->base_type << 24; - blob_write_uint32(blob, encoding); + blob_write_uint32(blob, encoded.u32); blob_write_string(blob, type->name); return; case GLSL_TYPE_IMAGE: - encoding = (type->base_type) << 24 | - (type->sampler_dimensionality << 3) | - (type->sampler_array << 2) | - (type->sampled_type); + encoded.sampler.dimensionality = type->sampler_dimensionality; + encoded.sampler.array = type->sampler_array; + encoded.sampler.sampled_type = type->sampled_type; break; case GLSL_TYPE_ATOMIC_UINT: - encoding = (type->base_type << 24); break; case GLSL_TYPE_ARRAY: - blob_write_uint32(blob, (type->base_type) << 24); - blob_write_uint32(blob, type->length); - blob_write_uint32(blob, type->explicit_stride); + encoded.array.length = MIN2(type->length, 0x1fff); + encoded.array.explicit_stride = MIN2(type->explicit_stride, 0x3fff); + blob_write_uint32(blob, encoded.u32); + /* If we don't have enough bits for length or explicit_stride, store it + * separately. + */ + if (encoded.array.length == 0x1fff) + blob_write_uint32(blob, type->length); + if (encoded.array.explicit_stride == 0x3fff) + blob_write_uint32(blob, type->explicit_stride); encode_type_to_blob(blob, type->fields.array); return; case GLSL_TYPE_STRUCT: case GLSL_TYPE_INTERFACE: - blob_write_uint32(blob, (type->base_type) << 24); + encoded.strct.length = MIN2(type->length, 0xffffff); + if (type->is_interface()) { + encoded.strct.interface_packing_or_packed = type->interface_packing; + encoded.strct.interface_row_major = type->interface_row_major; + } else { + encoded.strct.interface_packing_or_packed = type->packed; + } + blob_write_uint32(blob, encoded.u32); blob_write_string(blob, type->name); - blob_write_uint32(blob, type->length); + + /* If we don't have enough bits for length, store it separately. */ + if (encoded.strct.length == 0xffffff) + blob_write_uint32(blob, type->length); size_t s_field_size, s_field_ptrs; get_struct_type_field_and_pointer_sizes(&s_field_size, &s_field_ptrs); @@ -2643,37 +2772,30 @@ ((char *)&type->fields.structure[i]) + s_field_ptrs, s_field_size - s_field_ptrs); } - - if (type->is_interface()) { - blob_write_uint32(blob, type->interface_packing); - blob_write_uint32(blob, type->interface_row_major); - } else { - blob_write_uint32(blob, type->packed); - } return; case GLSL_TYPE_VOID: - encoding = (type->base_type << 24); break; case GLSL_TYPE_ERROR: default: assert(!"Cannot encode type!"); - encoding = 0; + encoded.u32 = 0; break; } - blob_write_uint32(blob, encoding); + blob_write_uint32(blob, encoded.u32); } const glsl_type * decode_type_from_blob(struct blob_reader *blob) { - uint32_t u = blob_read_uint32(blob); + union packed_type encoded; + encoded.u32 = blob_read_uint32(blob); - if (u == 0) { + if (encoded.u32 == 0) { return NULL; } - glsl_base_type base_type = (glsl_base_type) (u >> 24); + glsl_base_type base_type = (glsl_base_type)encoded.basic.base_type; switch (base_type) { case GLSL_TYPE_UINT: @@ -2688,33 +2810,48 @@ case GLSL_TYPE_UINT64: case GLSL_TYPE_INT64: case GLSL_TYPE_BOOL: { - unsigned explicit_stride = blob_read_uint32(blob); - return glsl_type::get_instance(base_type, (u >> 4) & 0x0f, u & 0x0f, - explicit_stride, (u >> 10) & 0x1); + unsigned explicit_stride = encoded.basic.explicit_stride; + if (explicit_stride == 0xfffff) + explicit_stride = blob_read_uint32(blob); + uint32_t vector_elements = encoded.basic.vector_elements; + if (vector_elements == 5) + vector_elements = 8; + else if (vector_elements == 6) + vector_elements = 16; + return glsl_type::get_instance(base_type, encoded.basic.vector_elements, + encoded.basic.matrix_columns, + explicit_stride, + encoded.basic.interface_row_major); } case GLSL_TYPE_SAMPLER: - return glsl_type::get_sampler_instance((enum glsl_sampler_dim) ((u >> 4) & 0x0f), - (u >> 3) & 0x01, - (u >> 2) & 0x01, - (glsl_base_type) ((u >> 0) & 0x03)); + return glsl_type::get_sampler_instance((enum glsl_sampler_dim)encoded.sampler.dimensionality, + encoded.sampler.shadow, + encoded.sampler.array, + (glsl_base_type) encoded.sampler.sampled_type); case GLSL_TYPE_SUBROUTINE: return glsl_type::get_subroutine_instance(blob_read_string(blob)); case GLSL_TYPE_IMAGE: - return glsl_type::get_image_instance((enum glsl_sampler_dim) ((u >> 3) & 0x0f), - (u >> 2) & 0x01, - (glsl_base_type) ((u >> 0) & 0x03)); + return glsl_type::get_image_instance((enum glsl_sampler_dim)encoded.sampler.dimensionality, + encoded.sampler.array, + (glsl_base_type) encoded.sampler.sampled_type); case GLSL_TYPE_ATOMIC_UINT: return glsl_type::atomic_uint_type; case GLSL_TYPE_ARRAY: { - unsigned length = blob_read_uint32(blob); - unsigned explicit_stride = blob_read_uint32(blob); + unsigned length = encoded.array.length; + if (length == 0x1fff) + length = blob_read_uint32(blob); + unsigned explicit_stride = encoded.array.explicit_stride; + if (explicit_stride == 0x3fff) + explicit_stride = blob_read_uint32(blob); return glsl_type::get_array_instance(decode_type_from_blob(blob), length, explicit_stride); } case GLSL_TYPE_STRUCT: case GLSL_TYPE_INTERFACE: { char *name = blob_read_string(blob); - unsigned num_fields = blob_read_uint32(blob); + unsigned num_fields = encoded.strct.length; + if (num_fields == 0xffffff) + num_fields = blob_read_uint32(blob); size_t s_field_size, s_field_ptrs; get_struct_type_field_and_pointer_sizes(&s_field_size, &s_field_ptrs); @@ -2732,12 +2869,12 @@ const glsl_type *t; if (base_type == GLSL_TYPE_INTERFACE) { enum glsl_interface_packing packing = - (glsl_interface_packing) blob_read_uint32(blob); - bool row_major = blob_read_uint32(blob); + (glsl_interface_packing) encoded.strct.interface_packing_or_packed; + bool row_major = encoded.strct.interface_row_major; t = glsl_type::get_interface_instance(fields, num_fields, packing, row_major, name); } else { - unsigned packed = blob_read_uint32(blob); + unsigned packed = encoded.strct.interface_packing_or_packed; t = glsl_type::get_struct_instance(fields, num_fields, name, packed); } diff -Nru mesa-19.2.8/src/compiler/glsl_types.h mesa-20.0.8/src/compiler/glsl_types.h --- mesa-19.2.8/src/compiler/glsl_types.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/glsl_types.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,8 +29,8 @@ #include #include "shader_enums.h" -#include "blob.h" #include "c11/threads.h" +#include "util/blob.h" #include "util/macros.h" #ifdef __cplusplus @@ -196,6 +196,27 @@ return 0; } +static inline enum glsl_base_type +glsl_unsigned_base_type_of(enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + return GLSL_TYPE_UINT; + case GLSL_TYPE_INT8: + return GLSL_TYPE_UINT8; + case GLSL_TYPE_INT16: + return GLSL_TYPE_UINT16; + case GLSL_TYPE_INT64: + return GLSL_TYPE_UINT64; + default: + assert(type == GLSL_TYPE_UINT || + type == GLSL_TYPE_UINT8 || + type == GLSL_TYPE_UINT16 || + type == GLSL_TYPE_UINT64); + return type; + } +} + enum glsl_sampler_dim { GLSL_SAMPLER_DIM_1D = 0, GLSL_SAMPLER_DIM_2D, @@ -473,6 +494,23 @@ unsigned varying_count() const; /** + * Calculate the number of vec4 slots required to hold this type. + * + * This is the underlying recursive type_size function for + * count_attribute_slots() (vertex inputs and varyings) but also for + * gallium's !PIPE_CAP_PACKED_UNIFORMS case. + */ + unsigned count_vec4_slots(bool is_gl_vertex_input, bool bindless) const; + + /** + * Calculate the number of vec4 slots required to hold this type. + * + * This is the underlying recursive type_size function for + * gallium's PIPE_CAP_PACKED_UNIFORMS case. + */ + unsigned count_dword_slots(bool bindless) const; + + /** * Calculate the number of attribute slots required to hold this type * * This implements the language rules of GLSL 1.50 for counting the number @@ -487,7 +525,9 @@ * Vulkan doesn’t make this distinction so the argument should always be * false. */ - unsigned count_attribute_slots(bool is_gl_vertex_input) const; + unsigned count_attribute_slots(bool is_gl_vertex_input) const { + return count_vec4_slots(is_gl_vertex_input, true); + } /** * Alignment in bytes of the start of this type in a std140 uniform @@ -1214,7 +1254,7 @@ * For interface blocks, the interpolation mode (as in * ir_variable::interpolation). 0 otherwise. */ - unsigned interpolation:2; + unsigned interpolation:3; /** * For interface blocks, 1 if this variable uses centroid interpolation (as diff -Nru mesa-19.2.8/src/compiler/Makefile.sources mesa-20.0.8/src/compiler/Makefile.sources --- mesa-19.2.8/src/compiler/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/Makefile.sources 2020-06-12 01:21:16.000000000 +0000 @@ -1,6 +1,4 @@ LIBCOMPILER_FILES = \ - blob.c \ - blob.h \ builtin_type_macros.h \ glsl_types.cpp \ glsl_types.h \ @@ -37,7 +35,6 @@ glsl/gl_nir_link_xfb.c \ glsl/gl_nir_linker.c \ glsl/gl_nir_linker.h \ - glsl/gl_nir_opt_access.c \ glsl/gl_nir.h \ glsl/glsl_parser_extras.cpp \ glsl/glsl_parser_extras.h \ @@ -235,6 +232,7 @@ nir/nir_lower_alpha_test.c \ nir/nir_lower_alu.c \ nir/nir_lower_alu_to_scalar.c \ + nir/nir_lower_amul.c \ nir/nir_lower_array_deref_of_vec.c \ nir/nir_lower_atomics_to_ssbo.c \ nir/nir_lower_bitmap.c \ @@ -244,10 +242,12 @@ nir/nir_lower_clamp_color_outputs.c \ nir/nir_lower_clip.c \ nir/nir_lower_clip_cull_distance_arrays.c \ + nir/nir_lower_clip_halfz.c \ nir/nir_lower_constant_initializers.c \ nir/nir_lower_double_ops.c \ nir/nir_lower_drawpixels.c \ nir/nir_lower_fb_read.c \ + nir/nir_lower_flatshade.c \ nir/nir_lower_flrp.c \ nir/nir_lower_fragcoord_wtrans.c \ nir/nir_lower_frexp.c \ @@ -271,8 +271,11 @@ nir/nir_lower_passthrough_edgeflags.c \ nir/nir_lower_patch_vertices.c \ nir/nir_lower_phis_to_scalar.c \ + nir/nir_lower_point_size.c \ + nir/nir_lower_point_size_mov.c \ nir/nir_lower_regs_to_ssa.c \ nir/nir_lower_returns.c \ + nir/nir_lower_samplers.c \ nir/nir_lower_scratch.c \ nir/nir_lower_subgroups.c \ nir/nir_lower_system_values.c \ @@ -289,6 +292,7 @@ nir/nir_metadata.c \ nir/nir_move_vec_src_uses_to_dest.c \ nir/nir_normalize_cubemap_coords.c \ + nir/nir_opt_access.c \ nir/nir_opt_combine_stores.c \ nir/nir_opt_comparison_pre.c \ nir/nir_opt_conditional_discard.c \ @@ -306,6 +310,7 @@ nir/nir_opt_intrinsics.c \ nir/nir_opt_loop_unroll.c \ nir/nir_opt_large_constants.c \ + nir/nir_opt_load_store_vectorize.c \ nir/nir_opt_move.c \ nir/nir_opt_peephole_select.c \ nir/nir_opt_rematerialize_compares.c \ @@ -323,12 +328,12 @@ nir/nir_range_analysis.h \ nir/nir_remove_dead_variables.c \ nir/nir_repair_ssa.c \ + nir/nir_schedule.c \ nir/nir_search.c \ nir/nir_search.h \ nir/nir_search_helpers.h \ nir/nir_serialize.c \ nir/nir_serialize.h \ - nir/nir_strip.c \ nir/nir_split_per_member_structs.c \ nir/nir_split_var_copies.c \ nir/nir_split_vars.c \ diff -Nru mesa-19.2.8/src/compiler/meson.build mesa-20.0.8/src/compiler/meson.build --- mesa-19.2.8/src/compiler/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -19,13 +19,10 @@ # SOFTWARE. inc_compiler = include_directories('.') -inc_nir = include_directories('nir') inc_glsl = include_directories('glsl') inc_spirv = include_directories('spirv') files_libcompiler = files( - 'blob.c', - 'blob.h', 'builtin_type_macros.h', 'glsl_types.cpp', 'glsl_types.h', @@ -64,7 +61,7 @@ 'spirv2nir', files('spirv/spirv2nir.c'), dependencies : [dep_m, idep_nir, idep_mesautil], - include_directories : [inc_common, inc_nir, include_directories('spirv')], + include_directories : [inc_common, include_directories('spirv')], c_args : [c_vis_args, c_msvc_compat_args, no_override_init_args], build_by_default : with_tools.contains('nir'), install : with_tools.contains('nir'), diff -Nru mesa-19.2.8/src/compiler/nir/meson.build mesa-20.0.8/src/compiler/nir/meson.build --- mesa-19.2.8/src/compiler/nir/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -114,6 +114,7 @@ 'nir_lower_alu.c', 'nir_lower_alu_to_scalar.c', 'nir_lower_alpha_test.c', + 'nir_lower_amul.c', 'nir_lower_array_deref_of_vec.c', 'nir_lower_atomics_to_ssbo.c', 'nir_lower_bitmap.c', @@ -122,10 +123,12 @@ 'nir_lower_clamp_color_outputs.c', 'nir_lower_clip.c', 'nir_lower_clip_cull_distance_arrays.c', + 'nir_lower_clip_halfz.c', 'nir_lower_constant_initializers.c', 'nir_lower_double_ops.c', 'nir_lower_drawpixels.c', 'nir_lower_fb_read.c', + 'nir_lower_flatshade.c', 'nir_lower_flrp.c', 'nir_lower_fragcoord_wtrans.c', 'nir_lower_frexp.c', @@ -150,8 +153,10 @@ 'nir_lower_patch_vertices.c', 'nir_lower_phis_to_scalar.c', 'nir_lower_point_size.c', + 'nir_lower_point_size_mov.c', 'nir_lower_regs_to_ssa.c', 'nir_lower_returns.c', + 'nir_lower_samplers.c', 'nir_lower_scratch.c', 'nir_lower_subgroups.c', 'nir_lower_system_values.c', @@ -169,6 +174,7 @@ 'nir_metadata.c', 'nir_move_vec_src_uses_to_dest.c', 'nir_normalize_cubemap_coords.c', + 'nir_opt_access.c', 'nir_opt_combine_stores.c', 'nir_opt_comparison_pre.c', 'nir_opt_conditional_discard.c', @@ -185,6 +191,7 @@ 'nir_opt_if.c', 'nir_opt_intrinsics.c', 'nir_opt_large_constants.c', + 'nir_opt_load_store_vectorize.c', 'nir_opt_loop_unroll.c', 'nir_opt_move.c', 'nir_opt_peephole_select.c', @@ -203,12 +210,12 @@ 'nir_range_analysis.h', 'nir_remove_dead_variables.c', 'nir_repair_ssa.c', + 'nir_schedule.c', 'nir_search.c', 'nir_search.h', 'nir_search_helpers.h', 'nir_serialize.c', 'nir_serialize.h', - 'nir_strip.c', 'nir_split_per_member_structs.c', 'nir_split_var_copies.c', 'nir_split_vars.c', @@ -237,7 +244,7 @@ '../spirv/vtn_variables.c', ) -libnir = static_library( +_libnir = static_library( 'nir', [files_libnir, spirv_info_c, nir_opt_algebraic_c, nir_opcodes_c, nir_opcodes_h, nir_constant_expressions_c, nir_builder_opcodes_h, @@ -257,13 +264,25 @@ # Also link with nir idep_nir = declare_dependency( dependencies : idep_nir_headers, - link_with : libnir, + link_with : _libnir, ) nir_algebraic_py = files('nir_algebraic.py') if with_tests test( + 'nir_builder', + executable( + 'nir_builder_test', + files('tests/builder_tests.cpp'), + cpp_args : [cpp_vis_args, cpp_msvc_compat_args], + include_directories : [inc_common], + dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], + ), + suite : ['compiler', 'nir'], + ) + + test( 'nir_control_flow', executable( 'nir_control_flow_test', @@ -317,6 +336,30 @@ include_directories : [inc_common], dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], ), + suite : ['compiler', 'nir'], + ) + + test( + 'load_store_vectorizer', + executable( + 'load_store_vectorizer', + files('tests/load_store_vectorizer_tests.cpp'), + cpp_args : [cpp_vis_args, cpp_msvc_compat_args], + include_directories : [inc_common], + dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], + ), + suite : ['compiler', 'nir'], + ) + + test( + 'nir_serialize_test', + executable( + 'nir_serialize_test', + files('tests/serialize_tests.cpp'), + cpp_args : [cpp_vis_args, cpp_msvc_compat_args], + include_directories : [inc_common], + dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], + ), suite : ['compiler', 'nir'], ) endif diff -Nru mesa-19.2.8/src/compiler/nir/nir_algebraic.py mesa-20.0.8/src/compiler/nir/nir_algebraic.py --- mesa-19.2.8/src/compiler/nir/nir_algebraic.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_algebraic.py 2020-06-12 01:21:16.000000000 +0000 @@ -1054,30 +1054,6 @@ % endfor */ -#ifndef NIR_OPT_ALGEBRAIC_STRUCT_DEFS -#define NIR_OPT_ALGEBRAIC_STRUCT_DEFS - -struct transform { - const nir_search_expression *search; - const nir_search_value *replace; - unsigned condition_offset; -}; - -struct per_op_table { - const uint16_t *filter; - unsigned num_filtered_states; - const uint16_t *table; -}; - -/* Note: these must match the start states created in - * TreeAutomaton._build_table() - */ - -/* WILDCARD_STATE = 0 is set by zeroing the state array */ -static const uint16_t CONST_STATE = 1; - -#endif - <% cache = {} %> % for xform in xforms: ${xform.search.render(cache)} @@ -1118,117 +1094,25 @@ % endfor }; -static void -${pass_name}_pre_block(nir_block *block, uint16_t *states) -{ - nir_foreach_instr(instr, block) { - switch (instr->type) { - case nir_instr_type_alu: { - nir_alu_instr *alu = nir_instr_as_alu(instr); - nir_op op = alu->op; - uint16_t search_op = nir_search_op_for_nir_op(op); - const struct per_op_table *tbl = &${pass_name}_table[search_op]; - if (tbl->num_filtered_states == 0) - continue; - - /* Calculate the index into the transition table. Note the index - * calculated must match the iteration order of Python's - * itertools.product(), which was used to emit the transition - * table. - */ - uint16_t index = 0; - for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) { - index *= tbl->num_filtered_states; - index += tbl->filter[states[alu->src[i].src.ssa->index]]; - } - states[alu->dest.dest.ssa.index] = tbl->table[index]; - break; - } - - case nir_instr_type_load_const: { - nir_load_const_instr *load_const = nir_instr_as_load_const(instr); - states[load_const->def.index] = CONST_STATE; - break; - } - - default: - break; - } - } -} - -static bool -${pass_name}_block(nir_builder *build, nir_block *block, - const uint16_t *states, const bool *condition_flags) -{ - bool progress = false; - - nir_foreach_instr_reverse_safe(instr, block) { - if (instr->type != nir_instr_type_alu) - continue; - - nir_alu_instr *alu = nir_instr_as_alu(instr); - if (!alu->dest.dest.is_ssa) - continue; - - switch (states[alu->dest.dest.ssa.index]) { +const struct transform *${pass_name}_transforms[] = { % for i in range(len(automaton.state_patterns)): - case ${i}: - % if automaton.state_patterns[i]: - for (unsigned i = 0; i < ARRAY_SIZE(${pass_name}_state${i}_xforms); i++) { - const struct transform *xform = &${pass_name}_state${i}_xforms[i]; - if (condition_flags[xform->condition_offset] && - nir_replace_instr(build, alu, xform->search, xform->replace)) { - progress = true; - break; - } - } - % endif - break; + % if automaton.state_patterns[i]: + ${pass_name}_state${i}_xforms, + % else: + NULL, + % endif % endfor - default: assert(0); - } - } - - return progress; -} - -static bool -${pass_name}_impl(nir_function_impl *impl, const bool *condition_flags) -{ - bool progress = false; - - nir_builder build; - nir_builder_init(&build, impl); - - /* Note: it's important here that we're allocating a zeroed array, since - * state 0 is the default state, which means we don't have to visit - * anything other than constants and ALU instructions. - */ - uint16_t *states = calloc(impl->ssa_alloc, sizeof(*states)); - - nir_foreach_block(block, impl) { - ${pass_name}_pre_block(block, states); - } - - nir_foreach_block_reverse(block, impl) { - progress |= ${pass_name}_block(&build, block, states, condition_flags); - } - - free(states); - - if (progress) { - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); - } else { -#ifndef NDEBUG - impl->valid_metadata &= ~nir_metadata_not_properly_reset; -#endif - } - - return progress; -} +}; +const uint16_t ${pass_name}_transform_counts[] = { +% for i in range(len(automaton.state_patterns)): + % if automaton.state_patterns[i]: + (uint16_t)ARRAY_SIZE(${pass_name}_state${i}_xforms), + % else: + 0, + % endif +% endfor +}; bool ${pass_name}(nir_shader *shader) @@ -1245,8 +1129,12 @@ % endfor nir_foreach_function(function, shader) { - if (function->impl) - progress |= ${pass_name}_impl(function->impl, condition_flags); + if (function->impl) { + progress |= nir_algebraic_impl(function->impl, condition_flags, + ${pass_name}_transforms, + ${pass_name}_transform_counts, + ${pass_name}_table); + } } return progress; diff -Nru mesa-19.2.8/src/compiler/nir/nir_builder.h mesa-20.0.8/src/compiler/nir/nir_builder.h --- mesa-19.2.8/src/compiler/nir/nir_builder.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_builder.h 2020-06-12 01:21:16.000000000 +0000 @@ -458,6 +458,16 @@ nir_mov_alu(nir_builder *build, nir_alu_src src, unsigned num_components) { assert(!src.abs && !src.negate); + if (src.src.is_ssa && src.src.ssa->num_components == num_components) { + bool any_swizzles = false; + for (unsigned i = 0; i < num_components; i++) { + if (src.swizzle[i] != i) + any_swizzles = true; + } + if (!any_swizzles) + return src.src.ssa; + } + nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_mov); nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, nir_src_bit_size(src.src), NULL); @@ -636,7 +646,7 @@ } static inline nir_ssa_def * -nir_imul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +_nir_mul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y, bool amul) { assert(x->bit_size <= 64); if (x->bit_size < 64) @@ -646,14 +656,29 @@ return nir_imm_intN_t(build, 0, x->bit_size); } else if (y == 1) { return x; - } else if (util_is_power_of_two_or_zero64(y)) { + } else if (!build->shader->options->lower_bitops && + util_is_power_of_two_or_zero64(y)) { return nir_ishl(build, x, nir_imm_int(build, ffsll(y) - 1)); + } else if (amul) { + return nir_amul(build, x, nir_imm_intN_t(build, y, x->bit_size)); } else { return nir_imul(build, x, nir_imm_intN_t(build, y, x->bit_size)); } } static inline nir_ssa_def * +nir_imul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +{ + return _nir_mul_imm(build, x, y, false); +} + +static inline nir_ssa_def * +nir_amul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +{ + return _nir_mul_imm(build, x, y, true); +} + +static inline nir_ssa_def * nir_fadd_imm(nir_builder *build, nir_ssa_def *x, double y) { return nir_fadd(build, x, nir_imm_floatN_t(build, y, x->bit_size)); @@ -733,6 +758,85 @@ return nir_vec(b, dest_comps, dest_num_components); } +/** + * Treats srcs as if it's one big blob of bits and extracts the range of bits + * given by + * + * [first_bit, first_bit + dest_num_components * dest_bit_size) + * + * The range can have any alignment or size as long as it's an integer number + * of destination components and fits inside the concatenated sources. + * + * TODO: The one caveat here is that we can't handle byte alignment if 64-bit + * values are involved because that would require pack/unpack to/from a vec8 + * which NIR currently does not support. + */ +static inline nir_ssa_def * +nir_extract_bits(nir_builder *b, nir_ssa_def **srcs, unsigned num_srcs, + unsigned first_bit, + unsigned dest_num_components, unsigned dest_bit_size) +{ + const unsigned num_bits = dest_num_components * dest_bit_size; + + /* Figure out the common bit size */ + unsigned common_bit_size = dest_bit_size; + for (unsigned i = 0; i < num_srcs; i++) + common_bit_size = MIN2(common_bit_size, srcs[i]->bit_size); + if (first_bit > 0) + common_bit_size = MIN2(common_bit_size, (1u << (ffs(first_bit) - 1))); + + /* We don't want to have to deal with 1-bit values */ + assert(common_bit_size >= 8); + + nir_ssa_def *common_comps[NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t)]; + assert(num_bits / common_bit_size <= ARRAY_SIZE(common_comps)); + + /* First, unpack to the common bit size and select the components from the + * source. + */ + int src_idx = -1; + unsigned src_start_bit = 0; + unsigned src_end_bit = 0; + for (unsigned i = 0; i < num_bits / common_bit_size; i++) { + const unsigned bit = first_bit + (i * common_bit_size); + while (bit >= src_end_bit) { + src_idx++; + assert(src_idx < (int) num_srcs); + src_start_bit = src_end_bit; + src_end_bit += srcs[src_idx]->bit_size * + srcs[src_idx]->num_components; + } + assert(bit >= src_start_bit); + assert(bit + common_bit_size <= src_end_bit); + const unsigned rel_bit = bit - src_start_bit; + const unsigned src_bit_size = srcs[src_idx]->bit_size; + + nir_ssa_def *comp = nir_channel(b, srcs[src_idx], + rel_bit / src_bit_size); + if (srcs[src_idx]->bit_size > common_bit_size) { + nir_ssa_def *unpacked = nir_unpack_bits(b, comp, common_bit_size); + comp = nir_channel(b, unpacked, (rel_bit % src_bit_size) / + common_bit_size); + } + common_comps[i] = comp; + } + + /* Now, re-pack the destination if we have to */ + if (dest_bit_size > common_bit_size) { + unsigned common_per_dest = dest_bit_size / common_bit_size; + nir_ssa_def *dest_comps[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < dest_num_components; i++) { + nir_ssa_def *unpacked = nir_vec(b, common_comps + i * common_per_dest, + common_per_dest); + dest_comps[i] = nir_pack_bits(b, unpacked, dest_bit_size); + } + return nir_vec(b, dest_comps, dest_num_components); + } else { + assert(dest_bit_size == common_bit_size); + return nir_vec(b, common_comps, dest_num_components); + } +} + static inline nir_ssa_def * nir_bitcast_vector(nir_builder *b, nir_ssa_def *src, unsigned dest_bit_size) { @@ -741,43 +845,7 @@ (src->bit_size * src->num_components) / dest_bit_size; assert(dest_num_components <= NIR_MAX_VEC_COMPONENTS); - if (src->bit_size > dest_bit_size) { - assert(src->bit_size % dest_bit_size == 0); - if (src->num_components == 1) { - return nir_unpack_bits(b, src, dest_bit_size); - } else { - const unsigned divisor = src->bit_size / dest_bit_size; - assert(src->num_components * divisor == dest_num_components); - nir_ssa_def *dest[NIR_MAX_VEC_COMPONENTS]; - for (unsigned i = 0; i < src->num_components; i++) { - nir_ssa_def *unpacked = - nir_unpack_bits(b, nir_channel(b, src, i), dest_bit_size); - assert(unpacked->num_components == divisor); - for (unsigned j = 0; j < divisor; j++) - dest[i * divisor + j] = nir_channel(b, unpacked, j); - } - return nir_vec(b, dest, dest_num_components); - } - } else if (src->bit_size < dest_bit_size) { - assert(dest_bit_size % src->bit_size == 0); - if (dest_num_components == 1) { - return nir_pack_bits(b, src, dest_bit_size); - } else { - const unsigned divisor = dest_bit_size / src->bit_size; - assert(src->num_components == dest_num_components * divisor); - nir_ssa_def *dest[NIR_MAX_VEC_COMPONENTS]; - for (unsigned i = 0; i < dest_num_components; i++) { - nir_component_mask_t src_mask = - ((1 << divisor) - 1) << (i * divisor); - dest[i] = nir_pack_bits(b, nir_channels(b, src, src_mask), - dest_bit_size); - } - return nir_vec(b, dest, dest_num_components); - } - } else { - assert(src->bit_size == dest_bit_size); - return src; - } + return nir_extract_bits(b, &src, 1, 0, dest_num_components, dest_bit_size); } /** @@ -807,7 +875,7 @@ static inline nir_ssa_def * nir_ssa_for_alu_src(nir_builder *build, nir_alu_instr *instr, unsigned srcn) { - static uint8_t trivial_swizzle[] = { 0, 1, 2, 3 }; + static uint8_t trivial_swizzle[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; STATIC_ASSERT(ARRAY_SIZE(trivial_swizzle) == NIR_MAX_VEC_COMPONENTS); nir_alu_src *src = &instr->src[srcn]; @@ -1182,8 +1250,9 @@ nir_load_barycentric(nir_builder *build, nir_intrinsic_op op, unsigned interp_mode) { + unsigned num_components = op == nir_intrinsic_load_barycentric_model ? 3 : 2; nir_intrinsic_instr *bary = nir_intrinsic_instr_create(build->shader, op); - nir_ssa_dest_init(&bary->instr, &bary->dest, 2, 32, NULL); + nir_ssa_dest_init(&bary->instr, &bary->dest, num_components, 32, NULL); nir_intrinsic_set_interp_mode(bary, interp_mode); nir_builder_instr_insert(build, &bary->instr); return &bary->dest.ssa; diff -Nru mesa-19.2.8/src/compiler/nir/nir_builder_opcodes_h.py mesa-20.0.8/src/compiler/nir/nir_builder_opcodes_h.py --- mesa-19.2.8/src/compiler/nir/nir_builder_opcodes_h.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_builder_opcodes_h.py 2020-06-12 01:21:16.000000000 +0000 @@ -31,14 +31,22 @@ return ', '.join('nir_ssa_def *src' + str(i) for i in range(num_srcs)) def src_list(num_srcs): - return ', '.join('src' + str(i) if i < num_srcs else 'NULL' for i in range(4)) + if num_srcs <= 4: + return ', '.join('src' + str(i) if i < num_srcs else 'NULL' for i in range(4)) + else: + return ', '.join('src' + str(i) for i in range(num_srcs)) %> % for name, opcode in sorted(opcodes.items()): static inline nir_ssa_def * nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)}) { +% if opcode.num_inputs <= 4: return nir_build_alu(build, nir_op_${name}, ${src_list(opcode.num_inputs)}); +% else: + nir_ssa_def *srcs[${opcode.num_inputs}] = {${src_list(opcode.num_inputs)}}; + return nir_build_alu_src_arr(build, nir_op_${name}, srcs); +% endif } % endfor diff -Nru mesa-19.2.8/src/compiler/nir/nir_builtin_builder.c mesa-20.0.8/src/compiler/nir/nir_builtin_builder.c --- mesa-19.2.8/src/compiler/nir/nir_builtin_builder.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_builtin_builder.c 2020-06-12 01:21:16.000000000 +0000 @@ -1,5 +1,6 @@ /* * Copyright © 2018 Red Hat Inc. + * Copyright © 2015 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -173,3 +174,258 @@ return nir_vec(b, res, lo->num_components); } + +/** + * Compute xs[0] + xs[1] + xs[2] + ... using fadd. + */ +static nir_ssa_def * +build_fsum(nir_builder *b, nir_ssa_def **xs, int terms) +{ + nir_ssa_def *accum = xs[0]; + + for (int i = 1; i < terms; i++) + accum = nir_fadd(b, accum, xs[i]); + + return accum; +} + +nir_ssa_def * +nir_atan(nir_builder *b, nir_ssa_def *y_over_x) +{ + const uint32_t bit_size = y_over_x->bit_size; + + nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x); + nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size); + + /* + * range-reduction, first step: + * + * / y_over_x if |y_over_x| <= 1.0; + * x = < + * \ 1.0 / y_over_x otherwise + */ + nir_ssa_def *x = nir_fdiv(b, nir_fmin(b, abs_y_over_x, one), + nir_fmax(b, abs_y_over_x, one)); + + /* + * approximate atan by evaluating polynomial: + * + * x * 0.9999793128310355 - x^3 * 0.3326756418091246 + + * x^5 * 0.1938924977115610 - x^7 * 0.1173503194786851 + + * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444 + */ + nir_ssa_def *x_2 = nir_fmul(b, x, x); + nir_ssa_def *x_3 = nir_fmul(b, x_2, x); + nir_ssa_def *x_5 = nir_fmul(b, x_3, x_2); + nir_ssa_def *x_7 = nir_fmul(b, x_5, x_2); + nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2); + nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2); + + nir_ssa_def *polynomial_terms[] = { + nir_fmul_imm(b, x, 0.9999793128310355f), + nir_fmul_imm(b, x_3, -0.3326756418091246f), + nir_fmul_imm(b, x_5, 0.1938924977115610f), + nir_fmul_imm(b, x_7, -0.1173503194786851f), + nir_fmul_imm(b, x_9, 0.0536813784310406f), + nir_fmul_imm(b, x_11, -0.0121323213173444f), + }; + + nir_ssa_def *tmp = + build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms)); + + /* range-reduction fixup */ + tmp = nir_fadd(b, tmp, + nir_fmul(b, nir_b2f(b, nir_flt(b, one, abs_y_over_x), bit_size), + nir_fadd_imm(b, nir_fmul_imm(b, tmp, -2.0f), M_PI_2))); + + /* sign fixup */ + return nir_fmul(b, tmp, nir_fsign(b, y_over_x)); +} + +nir_ssa_def * +nir_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x) +{ + assert(y->bit_size == x->bit_size); + const uint32_t bit_size = x->bit_size; + + nir_ssa_def *zero = nir_imm_floatN_t(b, 0, bit_size); + nir_ssa_def *one = nir_imm_floatN_t(b, 1, bit_size); + + /* If we're on the left half-plane rotate the coordinates π/2 clock-wise + * for the y=0 discontinuity to end up aligned with the vertical + * discontinuity of atan(s/t) along t=0. This also makes sure that we + * don't attempt to divide by zero along the vertical line, which may give + * unspecified results on non-GLSL 4.1-capable hardware. + */ + nir_ssa_def *flip = nir_fge(b, zero, x); + nir_ssa_def *s = nir_bcsel(b, flip, nir_fabs(b, x), y); + nir_ssa_def *t = nir_bcsel(b, flip, y, nir_fabs(b, x)); + + /* If the magnitude of the denominator exceeds some huge value, scale down + * the arguments in order to prevent the reciprocal operation from flushing + * its result to zero, which would cause precision problems, and for s + * infinite would cause us to return a NaN instead of the correct finite + * value. + * + * If fmin and fmax are respectively the smallest and largest positive + * normalized floating point values representable by the implementation, + * the constants below should be in agreement with: + * + * huge <= 1 / fmin + * scale <= 1 / fmin / fmax (for |t| >= huge) + * + * In addition scale should be a negative power of two in order to avoid + * loss of precision. The values chosen below should work for most usual + * floating point representations with at least the dynamic range of ATI's + * 24-bit representation. + */ + const double huge_val = bit_size >= 32 ? 1e18 : 16384; + nir_ssa_def *huge = nir_imm_floatN_t(b, huge_val, bit_size); + nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge), + nir_imm_floatN_t(b, 0.25, bit_size), one); + nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale)); + nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t); + + /* For |x| = |y| assume tan = 1 even if infinite (i.e. pretend momentarily + * that ∞/∞ = 1) in order to comply with the rather artificial rules + * inherited from IEEE 754-2008, namely: + * + * "atan2(±∞, −∞) is ±3π/4 + * atan2(±∞, +∞) is ±π/4" + * + * Note that this is inconsistent with the rules for the neighborhood of + * zero that are based on iterated limits: + * + * "atan2(±0, −0) is ±π + * atan2(±0, +0) is ±0" + * + * but GLSL specifically allows implementations to deviate from IEEE rules + * at (0,0), so we take that license (i.e. pretend that 0/0 = 1 here as + * well). + */ + nir_ssa_def *tan = nir_bcsel(b, nir_feq(b, nir_fabs(b, x), nir_fabs(b, y)), + one, nir_fabs(b, s_over_t)); + + /* Calculate the arctangent and fix up the result if we had flipped the + * coordinate system. + */ + nir_ssa_def *arc = + nir_fadd(b, nir_fmul_imm(b, nir_b2f(b, flip, bit_size), M_PI_2), + nir_atan(b, tan)); + + /* Rather convoluted calculation of the sign of the result. When x < 0 we + * cannot use fsign because we need to be able to distinguish between + * negative and positive zero. We don't use bitwise arithmetic tricks for + * consistency with the GLSL front-end. When x >= 0 rcp_scaled_t will + * always be non-negative so this won't be able to distinguish between + * negative and positive zero, but we don't care because atan2 is + * continuous along the whole positive y = 0 half-line, so it won't affect + * the result significantly. + */ + return nir_bcsel(b, nir_flt(b, nir_fmin(b, y, rcp_scaled_t), zero), + nir_fneg(b, arc), arc); +} + +nir_ssa_def * +nir_get_texture_size(nir_builder *b, nir_tex_instr *tex) +{ + b->cursor = nir_before_instr(&tex->instr); + + nir_tex_instr *txs; + + unsigned num_srcs = 1; /* One for the LOD */ + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_texture_deref || + tex->src[i].src_type == nir_tex_src_sampler_deref || + tex->src[i].src_type == nir_tex_src_texture_offset || + tex->src[i].src_type == nir_tex_src_sampler_offset || + tex->src[i].src_type == nir_tex_src_texture_handle || + tex->src[i].src_type == nir_tex_src_sampler_handle) + num_srcs++; + } + + txs = nir_tex_instr_create(b->shader, num_srcs); + txs->op = nir_texop_txs; + txs->sampler_dim = tex->sampler_dim; + txs->is_array = tex->is_array; + txs->is_shadow = tex->is_shadow; + txs->is_new_style_shadow = tex->is_new_style_shadow; + txs->texture_index = tex->texture_index; + txs->sampler_index = tex->sampler_index; + txs->dest_type = nir_type_int; + + unsigned idx = 0; + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_texture_deref || + tex->src[i].src_type == nir_tex_src_sampler_deref || + tex->src[i].src_type == nir_tex_src_texture_offset || + tex->src[i].src_type == nir_tex_src_sampler_offset || + tex->src[i].src_type == nir_tex_src_texture_handle || + tex->src[i].src_type == nir_tex_src_sampler_handle) { + nir_src_copy(&txs->src[idx].src, &tex->src[i].src, txs); + txs->src[idx].src_type = tex->src[i].src_type; + idx++; + } + } + /* Add in an LOD because some back-ends require it */ + txs->src[idx].src = nir_src_for_ssa(nir_imm_int(b, 0)); + txs->src[idx].src_type = nir_tex_src_lod; + + nir_ssa_dest_init(&txs->instr, &txs->dest, + nir_tex_instr_dest_size(txs), 32, NULL); + nir_builder_instr_insert(b, &txs->instr); + + return nir_i2f32(b, &txs->dest.ssa); +} + +nir_ssa_def * +nir_get_texture_lod(nir_builder *b, nir_tex_instr *tex) +{ + b->cursor = nir_before_instr(&tex->instr); + + nir_tex_instr *tql; + + unsigned num_srcs = 0; + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_coord || + tex->src[i].src_type == nir_tex_src_texture_deref || + tex->src[i].src_type == nir_tex_src_sampler_deref || + tex->src[i].src_type == nir_tex_src_texture_offset || + tex->src[i].src_type == nir_tex_src_sampler_offset || + tex->src[i].src_type == nir_tex_src_texture_handle || + tex->src[i].src_type == nir_tex_src_sampler_handle) + num_srcs++; + } + + tql = nir_tex_instr_create(b->shader, num_srcs); + tql->op = nir_texop_lod; + tql->coord_components = tex->coord_components; + tql->sampler_dim = tex->sampler_dim; + tql->is_array = tex->is_array; + tql->is_shadow = tex->is_shadow; + tql->is_new_style_shadow = tex->is_new_style_shadow; + tql->texture_index = tex->texture_index; + tql->sampler_index = tex->sampler_index; + tql->dest_type = nir_type_float; + + unsigned idx = 0; + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_coord || + tex->src[i].src_type == nir_tex_src_texture_deref || + tex->src[i].src_type == nir_tex_src_sampler_deref || + tex->src[i].src_type == nir_tex_src_texture_offset || + tex->src[i].src_type == nir_tex_src_sampler_offset || + tex->src[i].src_type == nir_tex_src_texture_handle || + tex->src[i].src_type == nir_tex_src_sampler_handle) { + nir_src_copy(&tql->src[idx].src, &tex->src[i].src, tql); + tql->src[idx].src_type = tex->src[i].src_type; + idx++; + } + } + + nir_ssa_dest_init(&tql->instr, &tql->dest, 2, 32, NULL); + nir_builder_instr_insert(b, &tql->instr); + + /* The LOD is the y component of the result */ + return nir_channel(b, &tql->dest.ssa, 1); +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_builtin_builder.h mesa-20.0.8/src/compiler/nir/nir_builtin_builder.h --- mesa-19.2.8/src/compiler/nir/nir_builtin_builder.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_builtin_builder.h 2020-06-12 01:21:16.000000000 +0000 @@ -24,8 +24,13 @@ #ifndef NIR_BUILTIN_BUILDER_H #define NIR_BUILTIN_BUILDER_H +#include "util/u_math.h" #include "nir/nir_builder.h" +#ifdef __cplusplus +extern "C" { +#endif + /* * Functions are sorted alphabetically with removed type and "fast" prefix. * Definitions for functions in the C file come first. @@ -41,6 +46,14 @@ nir_ssa_def* nir_smoothstep(nir_builder *b, nir_ssa_def *edge0, nir_ssa_def *edge1, nir_ssa_def *x); nir_ssa_def* nir_upsample(nir_builder *b, nir_ssa_def *hi, nir_ssa_def *lo); +nir_ssa_def* nir_atan(nir_builder *b, nir_ssa_def *y_over_x); +nir_ssa_def* nir_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x); + +nir_ssa_def * +nir_get_texture_lod(nir_builder *b, nir_tex_instr *tex); + +nir_ssa_def * +nir_get_texture_size(nir_builder *b, nir_tex_instr *tex); static inline nir_ssa_def * nir_nan_check2(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *res) @@ -76,6 +89,43 @@ } static inline nir_ssa_def * +nir_umul24(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) +{ + nir_ssa_def *mask = nir_imm_int(b, 0xffffff); + nir_ssa_def *x_24 = nir_iand(b, x, mask); + nir_ssa_def *y_24 = nir_iand(b, y, mask); + return nir_imul(b, x_24, y_24); +} + +static inline nir_ssa_def * +nir_umad24(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *z) +{ + nir_ssa_def *temp = nir_umul24(b, x, y); + return nir_iadd(b, temp, z); +} + +static inline nir_ssa_def * +nir_imad24(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *z) +{ + nir_ssa_def *temp = nir_imul24(b, x, y); + return nir_iadd(b, temp, z); +} + +static inline nir_ssa_def * +nir_imad_hi(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *z) +{ + nir_ssa_def *temp = nir_imul_high(b, x, y); + return nir_iadd(b, temp, z); +} + +static inline nir_ssa_def * +nir_umad_hi(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *z) +{ + nir_ssa_def *temp = nir_umul_high(b, x, y); + return nir_iadd(b, temp, z); +} + +static inline nir_ssa_def * nir_bitselect(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, nir_ssa_def *s) { return nir_ior(b, nir_iand(b, nir_inot(b, s), x), nir_iand(b, s, y)); @@ -209,4 +259,16 @@ return nir_bcsel(b, nir_ieq(b, s, nir_imm_intN_t(b, 0, s->bit_size)), x, y); } +static inline nir_ssa_def * +nir_clz_u(nir_builder *b, nir_ssa_def *a) +{ + nir_ssa_def *val; + val = nir_isub(b, nir_imm_intN_t(b, a->bit_size - 1, 32), nir_ufind_msb(b, a)); + return nir_u2u(b, val, a->bit_size); +} + +#ifdef __cplusplus +} +#endif + #endif /* NIR_BUILTIN_BUILDER_H */ diff -Nru mesa-19.2.8/src/compiler/nir/nir.c mesa-20.0.8/src/compiler/nir/nir.c --- mesa-19.2.8/src/compiler/nir/nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -108,6 +108,7 @@ nir_shader_add_variable(nir_shader *shader, nir_variable *var) { switch (var->data.mode) { + case nir_num_variable_modes: case nir_var_all: assert(!"invalid mode"); break; @@ -146,6 +147,10 @@ case nir_var_system_value: exec_list_push_tail(&shader->system_values, &var->node); break; + + case nir_var_mem_push_const: + assert(!"nir_var_push_constant is not supposed to be used for variables"); + break; } } @@ -1416,7 +1421,7 @@ { if (dest->is_ssa) { /* We can only overwrite an SSA destination if it has no uses. */ - assert(list_empty(&dest->ssa.uses) && list_empty(&dest->ssa.if_uses)); + assert(list_is_empty(&dest->ssa.uses) && list_is_empty(&dest->ssa.if_uses)); } else { list_del(&dest->reg.def_link); if (dest->reg.indirect) @@ -1547,7 +1552,7 @@ } } - if (!list_empty(&def->if_uses)) + if (!list_is_empty(&def->if_uses)) read_mask |= 1; return read_mask; @@ -1787,6 +1792,39 @@ return index; } +static void +index_var_list(struct exec_list *list) +{ + unsigned next_index = 0; + nir_foreach_variable(var, list) + var->index = next_index++; +} + +void +nir_index_vars(nir_shader *shader, nir_function_impl *impl, nir_variable_mode modes) +{ + if ((modes & nir_var_function_temp) && impl) + index_var_list(&impl->locals); + + if (modes & nir_var_shader_temp) + index_var_list(&shader->globals); + + if (modes & nir_var_shader_in) + index_var_list(&shader->inputs); + + if (modes & nir_var_shader_out) + index_var_list(&shader->outputs); + + if (modes & (nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo)) + index_var_list(&shader->uniforms); + + if (modes & nir_var_mem_shared) + index_var_list(&shader->shared); + + if (modes & nir_var_system_value) + index_var_list(&shader->system_values); +} + static nir_instr * cursor_next_instr(nir_cursor cursor) { @@ -1823,9 +1861,10 @@ unreachable("Inavlid cursor option"); } -static bool +ASSERTED static bool dest_is_ssa(nir_dest *dest, void *_state) { + (void) _state; return dest->is_ssa; } @@ -1888,7 +1927,7 @@ list_for_each_entry_safe(nir_src, use_src, &old_if_uses, use_link) nir_if_rewrite_condition(use_src->parent_if, new_src); - if (list_empty(&old_def->uses) && list_empty(&old_def->if_uses)) { + if (list_is_empty(&old_def->uses) && list_is_empty(&old_def->if_uses)) { iter = nir_instr_remove(instr); } else { iter = nir_after_instr(instr); @@ -2185,8 +2224,10 @@ CASE(load) CASE(store) CASE(atomic_add) - CASE(atomic_min) - CASE(atomic_max) + CASE(atomic_imin) + CASE(atomic_umin) + CASE(atomic_imax) + CASE(atomic_umax) CASE(atomic_and) CASE(atomic_or) CASE(atomic_xor) @@ -2207,7 +2248,7 @@ nir_intrinsic_set_image_dim(intrin, glsl_get_sampler_dim(deref->type)); nir_intrinsic_set_image_array(intrin, glsl_sampler_type_is_array(deref->type)); - nir_intrinsic_set_access(intrin, access | var->data.image.access); + nir_intrinsic_set_access(intrin, access | var->data.access); nir_intrinsic_set_format(intrin, var->data.image.format); nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], diff -Nru mesa-19.2.8/src/compiler/nir/nir_clone.c mesa-20.0.8/src/compiler/nir/nir_clone.c --- mesa-19.2.8/src/compiler/nir/nir_clone.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_clone.c 2020-06-12 01:21:16.000000000 +0000 @@ -422,6 +422,9 @@ ntex->texture_array_size = tex->texture_array_size; ntex->sampler_index = tex->sampler_index; + ntex->texture_non_uniform = tex->texture_non_uniform; + ntex->sampler_non_uniform = tex->sampler_non_uniform; + return ntex; } @@ -629,7 +632,7 @@ list_addtail(&src->src.use_link, &src->src.reg.reg->uses); } } - assert(list_empty(&state->phi_srcs)); + assert(list_is_empty(&state->phi_srcs)); } void @@ -669,7 +672,7 @@ clone_reg_list(state, &nfi->registers, &fi->registers); nfi->reg_alloc = fi->reg_alloc; - assert(list_empty(&state->phi_srcs)); + assert(list_is_empty(&state->phi_srcs)); clone_cf_list(state, &nfi->body, &fi->body); @@ -706,8 +709,10 @@ add_remap(state, nfxn, fxn); nfxn->num_params = fxn->num_params; - nfxn->params = ralloc_array(state->ns, nir_parameter, fxn->num_params); - memcpy(nfxn->params, fxn->params, sizeof(nir_parameter) * fxn->num_params); + if (fxn->num_params) { + nfxn->params = ralloc_array(state->ns, nir_parameter, fxn->num_params); + memcpy(nfxn->params, fxn->params, sizeof(nir_parameter) * fxn->num_params); + } nfxn->is_entrypoint = fxn->is_entrypoint; /* At first glance, it looks like we should clone the function_impl here. diff -Nru mesa-19.2.8/src/compiler/nir/nir_constant_expressions.h mesa-20.0.8/src/compiler/nir/nir_constant_expressions.h --- mesa-19.2.8/src/compiler/nir/nir_constant_expressions.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_constant_expressions.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,6 +32,7 @@ void nir_eval_const_opcode(nir_op op, nir_const_value *dest, unsigned num_components, unsigned bit_size, - nir_const_value **src); + nir_const_value **src, + unsigned float_controls_execution_mode); #endif /* NIR_CONSTANT_EXPRESSIONS_H */ diff -Nru mesa-19.2.8/src/compiler/nir/nir_constant_expressions.py mesa-20.0.8/src/compiler/nir/nir_constant_expressions.py --- mesa-19.2.8/src/compiler/nir/nir_constant_expressions.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_constant_expressions.py 2020-06-12 01:21:16.000000000 +0000 @@ -63,12 +63,35 @@ #include #include "util/rounding.h" /* for _mesa_roundeven */ #include "util/half_float.h" +#include "util/double.h" +#include "util/softfloat.h" #include "util/bigmath.h" #include "nir_constant_expressions.h" #define MAX_UINT_FOR_SIZE(bits) (UINT64_MAX >> (64 - (bits))) /** + * \brief Checks if the provided value is a denorm and flushes it to zero. + */ +static void +constant_denorm_flush_to_zero(nir_const_value *value, unsigned bit_size) +{ + switch(bit_size) { + case 64: + if (0 == (value->u64 & 0x7ff0000000000000)) + value->u64 &= 0x8000000000000000; + break; + case 32: + if (0 == (value->u32 & 0x7f800000)) + value->u32 &= 0x80000000; + break; + case 16: + if (0 == (value->u16 & 0x7c00)) + value->u16 &= 0x8000; + } +} + +/** * Evaluate one component of packSnorm4x8. */ static uint8_t @@ -235,6 +258,17 @@ * Evaluate one component of unpackHalf2x16. */ static float +unpack_half_1x16_flush_to_zero(uint16_t u) +{ + if (0 == (u & 0x7c00)) + u &= 0x8000; + return _mesa_half_to_float(u); +} + +/** + * Evaluate one component of unpackHalf2x16. + */ +static float unpack_half_1x16(uint16_t u) { return _mesa_half_to_float(u); @@ -258,11 +292,23 @@ ${type}${width}_t y; ${type}${width}_t z; ${type}${width}_t w; + ${type}${width}_t e; + ${type}${width}_t f; + ${type}${width}_t g; + ${type}${width}_t h; + ${type}${width}_t i; + ${type}${width}_t j; + ${type}${width}_t k; + ${type}${width}_t l; + ${type}${width}_t m; + ${type}${width}_t n; + ${type}${width}_t o; + ${type}${width}_t p; }; % endfor % endfor -<%def name="evaluate_op(op, bit_size)"> +<%def name="evaluate_op(op, bit_size, execution_mode)"> <% output_type = type_add_size(op.output_type, bit_size) input_types = [type_add_size(type_, bit_size) for type_ in op.input_types] @@ -290,7 +336,7 @@ _src[${j}][${k}].${get_const_field(input_types[j])}, % endif % endfor - % for k in range(op.input_sizes[j], 4): + % for k in range(op.input_sizes[j], 16): 0, % endfor }; @@ -341,10 +387,26 @@ ## Sanitize the C value to a proper NIR 0/-1 bool _dst_val[_i].${get_const_field(output_type)} = -(int)dst; % elif output_type == "float16": - _dst_val[_i].u16 = _mesa_float_to_half(dst); + if (nir_is_rounding_mode_rtz(execution_mode, 16)) { + _dst_val[_i].u16 = _mesa_float_to_float16_rtz(dst); + } else { + _dst_val[_i].u16 = _mesa_float_to_float16_rtne(dst); + } % else: _dst_val[_i].${get_const_field(output_type)} = dst; % endif + + % if op.name != "fquantize2f16" and type_base_type(output_type) == "float": + % if type_has_size(output_type): + if (nir_is_denorm_flush_to_zero(execution_mode, ${type_size(output_type)})) { + constant_denorm_flush_to_zero(&_dst_val[_i], ${type_size(output_type)}); + } + % else: + if (nir_is_denorm_flush_to_zero(execution_mode, ${bit_size})) { + constant_denorm_flush_to_zero(&_dst_val[i], bit_size); + } + %endif + % endif } % else: ## In the non-per-component case, create a struct dst with @@ -368,14 +430,30 @@ % for k in range(op.output_size): % if output_type == "int1" or output_type == "uint1": /* 1-bit integers get truncated */ - _dst_val[${k}].b = dst.${"xyzw"[k]} & 1; + _dst_val[${k}].b = dst.${"xyzwefghijklmnop"[k]} & 1; % elif output_type.startswith("bool"): ## Sanitize the C value to a proper NIR 0/-1 bool - _dst_val[${k}].${get_const_field(output_type)} = -(int)dst.${"xyzw"[k]}; + _dst_val[${k}].${get_const_field(output_type)} = -(int)dst.${"xyzwefghijklmnop"[k]}; % elif output_type == "float16": - _dst_val[${k}].u16 = _mesa_float_to_half(dst.${"xyzw"[k]}); + if (nir_is_rounding_mode_rtz(execution_mode, 16)) { + _dst_val[${k}].u16 = _mesa_float_to_float16_rtz(dst.${"xyzwefghijklmnop"[k]}); + } else { + _dst_val[${k}].u16 = _mesa_float_to_float16_rtne(dst.${"xyzwefghijklmnop"[k]}); + } % else: - _dst_val[${k}].${get_const_field(output_type)} = dst.${"xyzw"[k]}; + _dst_val[${k}].${get_const_field(output_type)} = dst.${"xyzwefghijklmnop"[k]}; + % endif + + % if op.name != "fquantize2f16" and type_base_type(output_type) == "float": + % if type_has_size(output_type): + if (nir_is_denorm_flush_to_zero(execution_mode, ${type_size(output_type)})) { + constant_denorm_flush_to_zero(&_dst_val[${k}], ${type_size(output_type)}); + } + % else: + if (nir_is_denorm_flush_to_zero(execution_mode, ${bit_size})) { + constant_denorm_flush_to_zero(&_dst_val[${k}], bit_size); + } + % endif % endif % endfor % endif @@ -386,13 +464,14 @@ evaluate_${name}(nir_const_value *_dst_val, UNUSED unsigned num_components, ${"UNUSED" if op_bit_sizes(op) is None else ""} unsigned bit_size, - UNUSED nir_const_value **_src) + UNUSED nir_const_value **_src, + UNUSED unsigned execution_mode) { % if op_bit_sizes(op) is not None: switch (bit_size) { % for bit_size in op_bit_sizes(op): case ${bit_size}: { - ${evaluate_op(op, bit_size)} + ${evaluate_op(op, bit_size, execution_mode)} break; } % endfor @@ -401,7 +480,7 @@ unreachable("unknown bit width"); } % else: - ${evaluate_op(op, 0)} + ${evaluate_op(op, 0, execution_mode)} % endif } % endfor @@ -409,12 +488,13 @@ void nir_eval_const_opcode(nir_op op, nir_const_value *dest, unsigned num_components, unsigned bit_width, - nir_const_value **src) + nir_const_value **src, + unsigned float_controls_execution_mode) { switch (op) { % for name in sorted(opcodes.keys()): case nir_op_${name}: - evaluate_${name}(dest, num_components, bit_width, src); + evaluate_${name}(dest, num_components, bit_width, src, float_controls_execution_mode); return; % endfor default: @@ -425,6 +505,8 @@ from mako.template import Template print(Template(template).render(opcodes=opcodes, type_sizes=type_sizes, + type_base_type=type_base_type, + type_size=type_size, type_has_size=type_has_size, type_add_size=type_add_size, op_bit_sizes=op_bit_sizes, diff -Nru mesa-19.2.8/src/compiler/nir/nir_deref.c mesa-20.0.8/src/compiler/nir/nir_deref.c --- mesa-19.2.8/src/compiler/nir/nir_deref.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_deref.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,6 +26,19 @@ #include "nir_deref.h" #include "util/hash_table.h" +static bool +is_trivial_deref_cast(nir_deref_instr *cast) +{ + nir_deref_instr *parent = nir_src_as_deref(cast->parent); + if (!parent) + return false; + + return cast->mode == parent->mode && + cast->type == parent->type && + cast->dest.ssa.num_components == parent->dest.ssa.num_components && + cast->dest.ssa.bit_size == parent->dest.ssa.bit_size; +} + void nir_deref_path_init(nir_deref_path *path, nir_deref_instr *deref, void *mem_ctx) @@ -44,6 +57,8 @@ *tail = NULL; for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) { + if (d->deref_type == nir_deref_type_cast && is_trivial_deref_cast(d)) + continue; count++; if (count <= max_short_path_len) *(--head) = d; @@ -64,8 +79,11 @@ path->path = ralloc_array(mem_ctx, nir_deref_instr *, count + 1); head = tail = path->path + count; *tail = NULL; - for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) + for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) { + if (d->deref_type == nir_deref_type_cast && is_trivial_deref_cast(d)) + continue; *(--head) = d; + } done: assert(head == path->path); @@ -92,7 +110,7 @@ for (nir_deref_instr *d = instr; d; d = nir_deref_instr_parent(d)) { /* If anyone is using this deref, leave it alone */ assert(d->dest.is_ssa); - if (!list_empty(&d->dest.ssa.uses)) + if (!list_is_empty(&d->dest.ssa.uses)) break; nir_instr_remove(&d->instr); @@ -292,12 +310,12 @@ assert(path.path[0]->deref_type == nir_deref_type_var); - nir_ssa_def *offset = nir_imm_int(b, 0); + nir_ssa_def *offset = nir_imm_intN_t(b, 0, deref->dest.ssa.bit_size); for (nir_deref_instr **p = &path.path[1]; *p; p++) { if ((*p)->deref_type == nir_deref_type_array) { nir_ssa_def *index = nir_ssa_for_src(b, (*p)->arr.index, 1); int stride = type_get_array_stride((*p)->type, size_align); - offset = nir_iadd(b, offset, nir_imul_imm(b, index, stride)); + offset = nir_iadd(b, offset, nir_amul_imm(b, index, stride)); } else if ((*p)->deref_type == nir_deref_type_struct) { /* p starts at path[1], so this is safe */ nir_deref_instr *parent = *(p - 1); @@ -401,7 +419,7 @@ { assert(path->path[0]->deref_type == nir_deref_type_var); - if (path->path[0]->var->data.image.access & ACCESS_COHERENT) + if (path->path[0]->var->data.access & ACCESS_COHERENT) return true; for (nir_deref_instr **p = &path->path[1]; *p; p++) { @@ -644,6 +662,7 @@ break; case nir_deref_type_array: + case nir_deref_type_ptr_as_array: assert(!nir_src_as_deref(deref->arr.index)); nir_src_copy(&new_deref->arr.index, &deref->arr.index, new_deref); break; @@ -734,17 +753,40 @@ return state.progress; } -static bool -is_trivial_deref_cast(nir_deref_instr *cast) +static void +nir_deref_instr_fixup_child_types(nir_deref_instr *parent) { - nir_deref_instr *parent = nir_src_as_deref(cast->parent); - if (!parent) - return false; + nir_foreach_use(use, &parent->dest.ssa) { + if (use->parent_instr->type != nir_instr_type_deref) + continue; - return cast->mode == parent->mode && - cast->type == parent->type && - cast->dest.ssa.num_components == parent->dest.ssa.num_components && - cast->dest.ssa.bit_size == parent->dest.ssa.bit_size; + nir_deref_instr *child = nir_instr_as_deref(use->parent_instr); + switch (child->deref_type) { + case nir_deref_type_var: + unreachable("nir_deref_type_var cannot be a child"); + + case nir_deref_type_array: + case nir_deref_type_array_wildcard: + child->type = glsl_get_array_element(parent->type); + break; + + case nir_deref_type_ptr_as_array: + child->type = parent->type; + break; + + case nir_deref_type_struct: + child->type = glsl_get_struct_field(parent->type, + child->strct.index); + break; + + case nir_deref_type_cast: + /* We stop the recursion here */ + continue; + } + + /* Recurse into children */ + nir_deref_instr_fixup_child_types(child); + } } static bool @@ -794,6 +836,44 @@ return true; } +static bool +opt_remove_sampler_cast(nir_deref_instr *cast) +{ + assert(cast->deref_type == nir_deref_type_cast); + nir_deref_instr *parent = nir_src_as_deref(cast->parent); + if (parent == NULL) + return false; + + /* Strip both types down to their non-array type and bail if there are any + * discrepancies in array lengths. + */ + const struct glsl_type *parent_type = parent->type; + const struct glsl_type *cast_type = cast->type; + while (glsl_type_is_array(parent_type) && glsl_type_is_array(cast_type)) { + if (glsl_get_length(parent_type) != glsl_get_length(cast_type)) + return false; + parent_type = glsl_get_array_element(parent_type); + cast_type = glsl_get_array_element(cast_type); + } + + if (glsl_type_is_array(parent_type) || glsl_type_is_array(cast_type)) + return false; + + if (!glsl_type_is_sampler(parent_type) || + cast_type != glsl_bare_sampler_type()) + return false; + + /* We're a cast from a more detailed sampler type to a bare sampler */ + nir_ssa_def_rewrite_uses(&cast->dest.ssa, + nir_src_for_ssa(&parent->dest.ssa)); + nir_instr_remove(&cast->instr); + + /* Recursively crawl the deref tree and clean up types */ + nir_deref_instr_fixup_child_types(parent); + + return true; +} + /** * Is this casting a struct to a contained struct. * struct a { struct b field0 }; @@ -833,6 +913,9 @@ if (opt_replace_struct_wrapper_cast(b, cast)) return true; + if (opt_remove_sampler_cast(cast)) + return true; + progress = opt_remove_cast_cast(cast); if (!is_trivial_deref_cast(cast)) return progress; @@ -855,9 +938,11 @@ } /* If uses would be a bit crazy */ - assert(list_empty(&cast->dest.ssa.if_uses)); + assert(list_is_empty(&cast->dest.ssa.if_uses)); + + if (nir_deref_instr_remove_if_unused(cast)) + progress = true; - nir_deref_instr_remove_if_unused(cast); return progress; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_divergence_analysis.c mesa-20.0.8/src/compiler/nir/nir_divergence_analysis.c --- mesa-19.2.8/src/compiler/nir/nir_divergence_analysis.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_divergence_analysis.c 2020-06-12 01:21:16.000000000 +0000 @@ -125,6 +125,11 @@ else is_divergent = true; break; + case nir_intrinsic_load_input_vertex: + is_divergent = divergent[instr->src[1].ssa->index]; + assert(stage == MESA_SHADER_FRAGMENT); + is_divergent |= !(options & nir_divergence_single_prim_per_subgroup); + break; case nir_intrinsic_load_output: assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT); is_divergent = divergent[instr->src[0].ssa->index]; @@ -157,6 +162,8 @@ is_divergent = !(options & nir_divergence_single_patch_per_tcs_subgroup); else if (stage == MESA_SHADER_TESS_EVAL) is_divergent = !(options & nir_divergence_single_patch_per_tes_subgroup); + else if (stage == MESA_SHADER_GEOMETRY) + is_divergent = true; else unreachable("Invalid stage for load_primitive_id"); break; @@ -200,7 +207,6 @@ case nir_intrinsic_ballot_find_lsb: case nir_intrinsic_ballot_find_msb: case nir_intrinsic_ballot_bit_count_reduce: - case nir_intrinsic_shuffle: case nir_intrinsic_shuffle_xor: case nir_intrinsic_shuffle_up: case nir_intrinsic_shuffle_down: @@ -247,6 +253,11 @@ break; } + case nir_intrinsic_shuffle: + is_divergent = divergent[instr->src[0].ssa->index] && + divergent[instr->src[1].ssa->index]; + break; + /* Intrinsics which are always divergent */ case nir_intrinsic_load_color0: case nir_intrinsic_load_color1: @@ -258,11 +269,13 @@ case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_model: case nir_intrinsic_load_barycentric_at_sample: case nir_intrinsic_load_barycentric_at_offset: case nir_intrinsic_interp_deref_at_offset: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_vertex: case nir_intrinsic_load_tess_coord: case nir_intrinsic_load_point_coord: case nir_intrinsic_load_frag_coord: @@ -310,8 +323,10 @@ case nir_intrinsic_ssbo_atomic_fmin: case nir_intrinsic_ssbo_atomic_fcomp_swap: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -319,8 +334,10 @@ case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_fadd: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -328,8 +345,10 @@ case nir_intrinsic_image_atomic_comp_swap: case nir_intrinsic_image_atomic_fadd: case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: diff -Nru mesa-19.2.8/src/compiler/nir/nir_from_ssa.c mesa-20.0.8/src/compiler/nir/nir_from_ssa.c --- mesa-19.2.8/src/compiler/nir/nir_from_ssa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_from_ssa.c 2020-06-12 01:21:16.000000000 +0000 @@ -495,7 +495,7 @@ } nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg)); - assert(list_empty(&def->uses) && list_empty(&def->if_uses)); + assert(list_is_empty(&def->uses) && list_is_empty(&def->if_uses)); if (def->parent_instr->type == nir_instr_type_ssa_undef) { /* If it's an ssa_undef instruction, remove it since we know we just got @@ -961,7 +961,7 @@ } } - if (!list_empty(&def->if_uses)) + if (!list_is_empty(&def->if_uses)) return false; return true; diff -Nru mesa-19.2.8/src/compiler/nir/nir_gather_info.c mesa-20.0.8/src/compiler/nir/nir_gather_info.c --- mesa-19.2.8/src/compiler/nir/nir_gather_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_gather_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -102,12 +102,15 @@ } static unsigned -get_io_offset(nir_deref_instr *deref, bool is_vertex_input) +get_io_offset(nir_deref_instr *deref, bool is_vertex_input, bool per_vertex) { unsigned offset = 0; for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) { if (d->deref_type == nir_deref_type_array) { + if (per_vertex && nir_deref_instr_parent(d)->deref_type == nir_deref_type_var) + break; + if (!nir_src_is_const(d->arr.index)) return -1; @@ -132,8 +135,9 @@ nir_deref_instr *deref, bool is_output_read) { const struct glsl_type *type = var->type; + bool per_vertex = nir_is_per_vertex_io(var, shader->info.stage); - if (nir_is_per_vertex_io(var, shader->info.stage)) { + if (per_vertex) { assert(glsl_type_is_array(type)); type = glsl_get_array_element(type); } @@ -157,7 +161,7 @@ return false; } - unsigned offset = get_io_offset(deref, false); + unsigned offset = get_io_offset(deref, false, per_vertex); if (offset == -1) return false; @@ -209,6 +213,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: case nir_intrinsic_load_deref: case nir_intrinsic_store_deref:{ nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); @@ -280,6 +285,7 @@ /* fall through */ case nir_intrinsic_emit_vertex: + case nir_intrinsic_emit_vertex_with_counter: if (nir_intrinsic_stream_id(instr) > 0) shader->info.gs.uses_streams = true; @@ -360,9 +366,19 @@ { shader->info.num_textures = 0; shader->info.num_images = 0; + shader->info.last_msaa_image = -1; nir_foreach_variable(var, &shader->uniforms) { + /* Bindless textures and images don't use non-bindless slots. */ + if (var->data.bindless) + continue; + shader->info.num_textures += glsl_type_get_sampler_count(var->type); shader->info.num_images += glsl_type_get_image_count(var->type); + + /* Assuming image slots don't have holes (e.g. OpenGL) */ + if (glsl_type_is_image(var->type) && + glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_MS) + shader->info.last_msaa_image = shader->info.num_images - 1; } shader->info.inputs_read = 0; @@ -377,6 +393,8 @@ } if (shader->info.stage == MESA_SHADER_FRAGMENT) { shader->info.fs.uses_sample_qualifier = false; + shader->info.fs.uses_discard = false; + shader->info.fs.needs_helper_invocations = false; } void *dead_ctx = ralloc_context(NULL); diff -Nru mesa-19.2.8/src/compiler/nir/nir_gather_xfb_info.c mesa-20.0.8/src/compiler/nir/nir_gather_xfb_info.c --- mesa-19.2.8/src/compiler/nir/nir_gather_xfb_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_gather_xfb_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ static void add_var_xfb_varying(nir_xfb_info *xfb, nir_xfb_varyings_info *varyings, - nir_variable *var, + unsigned buffer, unsigned offset, const struct glsl_type *type) { @@ -38,9 +38,9 @@ nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++]; varying->type = type; - varying->buffer = var->data.xfb_buffer; + varying->buffer = buffer; varying->offset = offset; - xfb->buffers[var->data.xfb_buffer].varying_count++; + xfb->buffers[buffer].varying_count++; } @@ -83,7 +83,7 @@ if (!glsl_type_is_array(child_type) && !glsl_type_is_struct(child_type)) { - add_var_xfb_varying(xfb, varyings, var, *offset, type); + add_var_xfb_varying(xfb, varyings, buffer, *offset, type); varying_added = true; } @@ -100,11 +100,11 @@ } else { assert(buffer < NIR_MAX_XFB_BUFFERS); if (xfb->buffers_written & (1 << buffer)) { - assert(xfb->buffers[buffer].stride == var->data.xfb_stride); + assert(xfb->buffers[buffer].stride == var->data.xfb.stride); assert(xfb->buffer_to_stream[buffer] == var->data.stream); } else { xfb->buffers_written |= (1 << buffer); - xfb->buffers[buffer].stride = var->data.xfb_stride; + xfb->buffers[buffer].stride = var->data.xfb.stride; xfb->buffer_to_stream[buffer] = var->data.stream; } @@ -138,7 +138,7 @@ unsigned comp_offset = var->data.location_frac; if (!varying_added) { - add_var_xfb_varying(xfb, varyings, var, *offset, type); + add_var_xfb_varying(xfb, varyings, buffer, *offset, type); } while (comp_mask) { @@ -235,7 +235,7 @@ if (var->data.explicit_offset && !is_array_block) { unsigned offset = var->data.offset; - add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb_buffer, + add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer, &location, &offset, var->type, false); } else if (is_array_block) { assert(glsl_type_is_struct_or_ifc(var->interface_type)); @@ -253,7 +253,7 @@ } unsigned offset = foffset; - add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb_buffer + b, + add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b, &location, &offset, ftype, false); } } diff -Nru mesa-19.2.8/src/compiler/nir/nir.h mesa-20.0.8/src/compiler/nir/nir.h --- mesa-19.2.8/src/compiler/nir/nir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir.h 2020-06-12 01:21:16.000000000 +0000 @@ -58,9 +58,19 @@ #define NIR_FALSE 0u #define NIR_TRUE (~0u) -#define NIR_MAX_VEC_COMPONENTS 4 +#define NIR_MAX_VEC_COMPONENTS 16 #define NIR_MAX_MATRIX_COLUMNS 4 -typedef uint8_t nir_component_mask_t; +#define NIR_STREAM_PACKED (1 << 8) +typedef uint16_t nir_component_mask_t; + +static inline bool +nir_num_components_valid(unsigned num_components) +{ + return (num_components >= 1 && + num_components <= 4) || + num_components == 8 || + num_components == 16; +} /** Defines a cast function * @@ -92,7 +102,7 @@ */ typedef struct { gl_state_index16 tokens[STATE_LENGTH]; - int swizzle; + uint16_t swizzle; } nir_state_slot; typedef enum { @@ -106,7 +116,9 @@ nir_var_mem_ssbo = (1 << 7), nir_var_mem_shared = (1 << 8), nir_var_mem_global = (1 << 9), - nir_var_all = ~0, + nir_var_mem_push_const = (1 << 10), /* not actually used for variables */ + nir_num_variable_modes = 11, + nir_var_all = (1 << nir_num_variable_modes) - 1, } nir_variable_mode; /** @@ -212,7 +224,7 @@ } } -static inline int64_t +static inline uint64_t nir_const_value_as_uint(nir_const_value value, unsigned bit_size) { switch (bit_size) { @@ -313,7 +325,7 @@ * * \sa nir_variable_mode */ - nir_variable_mode mode; + nir_variable_mode mode:11; /** * Is the variable read-only? @@ -327,6 +339,19 @@ unsigned patch:1; unsigned invariant:1; + /** + * Precision qualifier. + * + * In desktop GLSL we do not care about precision qualifiers at all, in + * fact, the spec says that precision qualifiers are ignored. + * + * To make things easy, we make it so that this field is always + * GLSL_PRECISION_NONE on desktop shaders. This way all the variables + * have the same precision value and the checks we add in the compiler + * for this field will never break a desktop shader compile. + */ + unsigned precision:2; + /** * Can this variable be coalesced with another? * @@ -353,7 +378,7 @@ * * \sa glsl_interp_mode */ - unsigned interpolation:2; + unsigned interpolation:3; /** * If non-zero, then this variable may be packed along with other variables @@ -390,6 +415,15 @@ unsigned explicit_binding:1; /** + * Was the location explicitly set in the shader? + * + * If the location is explicitly set in the shader, it \b cannot be changed + * by the linker or by the API (e.g., calls to \c glBindAttribLocation have + * no effect). + */ + unsigned explicit_location:1; + + /** * Was a transfer feedback buffer set in the shader? */ unsigned explicit_xfb_buffer:1; @@ -405,98 +439,118 @@ unsigned explicit_offset:1; /** - * \brief Layout qualifier for gl_FragDepth. - * - * This is not equal to \c ir_depth_layout_none if and only if this - * variable is \c gl_FragDepth and a layout qualifier is specified. + * Non-zero if this variable was created by lowering a named interface + * block. */ - nir_depth_layout depth_layout; + unsigned from_named_ifc_block:1; /** - * Storage location of the base of this variable - * - * The precise meaning of this field depends on the nature of the variable. - * - * - Vertex shader input: one of the values from \c gl_vert_attrib. - * - Vertex shader output: one of the values from \c gl_varying_slot. - * - Geometry shader input: one of the values from \c gl_varying_slot. - * - Geometry shader output: one of the values from \c gl_varying_slot. - * - Fragment shader input: one of the values from \c gl_varying_slot. - * - Fragment shader output: one of the values from \c gl_frag_result. - * - Uniforms: Per-stage uniform slot number for default uniform block. - * - Uniforms: Index within the uniform block definition for UBO members. - * - Non-UBO Uniforms: uniform slot number. - * - Other: This field is not currently used. + * How the variable was declared. See nir_var_declaration_type. * - * If the variable is a uniform, shader input, or shader output, and the - * slot has not been assigned, the value will be -1. + * This is used to detect variables generated by the compiler, so should + * not be visible via the API. */ - int location; + unsigned how_declared:2; /** - * The actual location of the variable in the IR. Only valid for inputs - * and outputs. + * \brief Layout qualifier for gl_FragDepth. + * + * This is not equal to \c ir_depth_layout_none if and only if this + * variable is \c gl_FragDepth and a layout qualifier is specified. */ - unsigned int driver_location; + nir_depth_layout depth_layout:3; /** * Vertex stream output identifier. * - * For packed outputs, bit 31 is set and bits [2*i+1,2*i] indicate the - * stream of the i-th component. + * For packed outputs, NIR_STREAM_PACKED is set and bits [2*i+1,2*i] + * indicate the stream of the i-th component. */ - unsigned stream; + unsigned stream:9; /** - * output index for dual source blending. + * Access flags for memory variables (SSBO/global), image uniforms, and + * bindless images in uniforms/inputs/outputs. */ - int index; + enum gl_access_qualifier access:8; /** * Descriptor set binding for sampler or UBO. */ - int descriptor_set; + unsigned descriptor_set:5; + + /** + * output index for dual source blending. + */ + unsigned index; /** * Initial binding point for a sampler or UBO. * * For array types, this represents the binding point for the first element. */ - int binding; + unsigned binding; /** - * Location an atomic counter or transform feedback is stored at. + * Storage location of the base of this variable + * + * The precise meaning of this field depends on the nature of the variable. + * + * - Vertex shader input: one of the values from \c gl_vert_attrib. + * - Vertex shader output: one of the values from \c gl_varying_slot. + * - Geometry shader input: one of the values from \c gl_varying_slot. + * - Geometry shader output: one of the values from \c gl_varying_slot. + * - Fragment shader input: one of the values from \c gl_varying_slot. + * - Fragment shader output: one of the values from \c gl_frag_result. + * - Uniforms: Per-stage uniform slot number for default uniform block. + * - Uniforms: Index within the uniform block definition for UBO members. + * - Non-UBO Uniforms: uniform slot number. + * - Other: This field is not currently used. + * + * If the variable is a uniform, shader input, or shader output, and the + * slot has not been assigned, the value will be -1. */ - unsigned offset; + int location; /** - * Transform feedback buffer. + * The actual location of the variable in the IR. Only valid for inputs, + * outputs, and uniforms (including samplers and images). */ - unsigned xfb_buffer; + unsigned driver_location; /** - * Transform feedback stride. + * Location an atomic counter or transform feedback is stored at. */ - unsigned xfb_stride; + unsigned offset; - /** - * How the variable was declared. See nir_var_declaration_type. - * - * This is used to detect variables generated by the compiler, so should - * not be visible via the API. - */ - unsigned how_declared:2; + union { + struct { + /** Image internal format if specified explicitly, otherwise GL_NONE. */ + uint16_t format; /* GLenum */ + } image; + + struct { + /** + * Transform feedback buffer. + */ + uint16_t buffer:2; + + /** + * Transform feedback stride. + */ + uint16_t stride; + } xfb; + }; + } data; - /** - * ARB_shader_image_load_store qualifiers. - */ - struct { - enum gl_access_qualifier access; + /** + * Identifier for this variable generated by nir_index_vars() that is unique + * among other variables in the same exec_list. + */ + unsigned index; - /** Image internal format if specified explicitly, otherwise GL_NONE. */ - GLenum format; - } image; - } data; + /* Number of nir_variable_data members */ + uint16_t num_members; /** * Built-in state that backs this uniform @@ -510,7 +564,7 @@ * \c state_slots will be \c NULL. */ /*@{*/ - unsigned num_state_slots; /**< Number of state slots used */ + uint16_t num_state_slots; /**< Number of state slots used */ nir_state_slot *state_slots; /**< State descriptors. */ /*@}*/ @@ -539,7 +593,6 @@ * inputs each with their own layout specifier. This is only allowed on * variables with a struct or array of array of struct type. */ - unsigned num_members; struct nir_variable_data *members; } nir_variable; @@ -881,6 +934,8 @@ nir_type_bool = 6, nir_type_float = 128, nir_type_bool1 = 1 | nir_type_bool, + nir_type_bool8 = 8 | nir_type_bool, + nir_type_bool16 = 16 | nir_type_bool, nir_type_bool32 = 32 | nir_type_bool, nir_type_int1 = 1 | nir_type_int, nir_type_int8 = 8 | nir_type_int, @@ -984,10 +1039,93 @@ case 2: return nir_op_vec2; case 3: return nir_op_vec3; case 4: return nir_op_vec4; + case 8: return nir_op_vec8; + case 16: return nir_op_vec16; default: unreachable("bad component count"); } } +static inline bool +nir_is_float_control_signed_zero_inf_nan_preserve(unsigned execution_mode, unsigned bit_size) +{ + return (16 == bit_size && execution_mode & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16) || + (32 == bit_size && execution_mode & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32) || + (64 == bit_size && execution_mode & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64); +} + +static inline bool +nir_is_denorm_flush_to_zero(unsigned execution_mode, unsigned bit_size) +{ + return (16 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) || + (32 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) || + (64 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); +} + +static inline bool +nir_is_denorm_preserve(unsigned execution_mode, unsigned bit_size) +{ + return (16 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) || + (32 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) || + (64 == bit_size && execution_mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64); +} + +static inline bool +nir_is_rounding_mode_rtne(unsigned execution_mode, unsigned bit_size) +{ + return (16 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) || + (32 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) || + (64 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); +} + +static inline bool +nir_is_rounding_mode_rtz(unsigned execution_mode, unsigned bit_size) +{ + return (16 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) || + (32 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) || + (64 == bit_size && execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64); +} + +static inline bool +nir_has_any_rounding_mode_rtz(unsigned execution_mode) +{ + return (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) || + (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) || + (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64); +} + +static inline bool +nir_has_any_rounding_mode_rtne(unsigned execution_mode) +{ + return (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) || + (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) || + (execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); +} + +static inline nir_rounding_mode +nir_get_rounding_mode_from_float_controls(unsigned execution_mode, + nir_alu_type type) +{ + if (nir_alu_type_get_base_type(type) != nir_type_float) + return nir_rounding_mode_undef; + + unsigned bit_size = nir_alu_type_get_type_size(type); + + if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) + return nir_rounding_mode_rtz; + if (nir_is_rounding_mode_rtne(execution_mode, bit_size)) + return nir_rounding_mode_rtne; + return nir_rounding_mode_undef; +} + +static inline bool +nir_has_any_rounding_mode_enabled(unsigned execution_mode) +{ + bool result = + nir_has_any_rounding_mode_rtne(execution_mode) || + nir_has_any_rounding_mode_rtz(execution_mode); + return result; +} + typedef enum { /** * Operation where the first two sources are commutative. @@ -1297,6 +1435,24 @@ return nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[i])); } +typedef enum { + /* Memory ordering. */ + NIR_MEMORY_ACQUIRE = 1 << 0, + NIR_MEMORY_RELEASE = 1 << 1, + + /* Memory visibility operations. */ + NIR_MEMORY_MAKE_AVAILABLE = 1 << 3, + NIR_MEMORY_MAKE_VISIBLE = 1 << 4, +} nir_memory_semantics; + +typedef enum { + NIR_SCOPE_DEVICE, + NIR_SCOPE_QUEUE_FAMILY, + NIR_SCOPE_WORKGROUP, + NIR_SCOPE_SUBGROUP, + NIR_SCOPE_INVOCATION, +} nir_scope; + /** * \name NIR intrinsics semantic flags * @@ -1443,6 +1599,24 @@ NIR_INTRINSIC_SRC_ACCESS, NIR_INTRINSIC_DST_ACCESS, + /* Driver location for nir_load_patch_location_ir3 */ + NIR_INTRINSIC_DRIVER_LOCATION, + + /** + * Mask of nir_memory_semantics, includes ordering and visibility. + */ + NIR_INTRINSIC_MEMORY_SEMANTICS, + + /** + * Mask of nir_variable_modes affected by the memory operation. + */ + NIR_INTRINSIC_MEMORY_MODES, + + /** + * Value of nir_scope. + */ + NIR_INTRINSIC_MEMORY_SCOPE, + NIR_INTRINSIC_NUM_INDEX_FLAGS, } nir_intrinsic_index_flag; @@ -1551,6 +1725,10 @@ INTRINSIC_IDX_ACCESSORS(desc_type, DESC_TYPE, unsigned) INTRINSIC_IDX_ACCESSORS(type, TYPE, nir_alu_type) INTRINSIC_IDX_ACCESSORS(swizzle_mask, SWIZZLE_MASK, unsigned) +INTRINSIC_IDX_ACCESSORS(driver_location, DRIVER_LOCATION, unsigned) +INTRINSIC_IDX_ACCESSORS(memory_semantics, MEMORY_SEMANTICS, nir_memory_semantics) +INTRINSIC_IDX_ACCESSORS(memory_modes, MEMORY_MODES, nir_variable_mode) +INTRINSIC_IDX_ACCESSORS(memory_scope, MEMORY_SCOPE, nir_scope) static inline void nir_intrinsic_set_align(nir_intrinsic_instr *intrin, @@ -1651,6 +1829,9 @@ nir_texop_samples_identical, /**< Query whether all samples are definitely * identical. */ + nir_texop_tex_prefetch, /**< Regular texture look-up, eligible for pre-dispatch */ + nir_texop_fragment_fetch, /**< Multisample fragment color texture fetch */ + nir_texop_fragment_mask_fetch,/**< Multisample fragment mask texture fetch */ } nir_texop; typedef struct { @@ -1747,6 +1928,7 @@ case nir_texop_texture_samples: case nir_texop_query_levels: case nir_texop_samples_identical: + case nir_texop_fragment_mask_fetch: return 1; default: @@ -1912,11 +2094,6 @@ nir_const_value value[]; } nir_load_const_instr; -#define nir_const_load_to_arr(arr, l, m) \ -{ \ - nir_const_value_to_array(arr, l->value, l->def.num_components, m); \ -} while (false); - typedef enum { nir_jump_return, nir_jump_break, @@ -2489,6 +2666,7 @@ nir_lower_shift64 = (1 << 11), nir_lower_imul_2x32_64 = (1 << 12), nir_lower_extract64 = (1 << 13), + nir_lower_ufind_msb64 = (1 << 14), } nir_lower_int64_options; typedef enum { @@ -2597,11 +2775,13 @@ bool lower_ldexp; bool lower_pack_half_2x16; + bool lower_pack_half_2x16_split; bool lower_pack_unorm_2x16; bool lower_pack_snorm_2x16; bool lower_pack_unorm_4x8; bool lower_pack_snorm_4x8; bool lower_unpack_half_2x16; + bool lower_unpack_half_2x16_split; bool lower_unpack_unorm_2x16; bool lower_unpack_snorm_2x16; bool lower_unpack_unorm_4x8; @@ -2652,14 +2832,59 @@ /* Set if nir_lower_wpos_ytransform() should also invert gl_PointCoord. */ bool lower_wpos_pntc; + /** + * Set if nir_op_[iu]hadd and nir_op_[iu]rhadd instructions should be + * lowered to simple arithmetic. + * + * If this flag is set, the lowering will be applied to all bit-sizes of + * these instructions. + * + * \sa ::lower_hadd64 + */ bool lower_hadd; + + /** + * Set if only 64-bit nir_op_[iu]hadd and nir_op_[iu]rhadd instructions + * should be lowered to simple arithmetic. + * + * If this flag is set, the lowering will be applied to only 64-bit + * versions of these instructions. + * + * \sa ::lower_hadd + */ + bool lower_hadd64; + + /** + * Set if nir_op_add_sat and nir_op_usub_sat should be lowered to simple + * arithmetic. + * + * If this flag is set, the lowering will be applied to all bit-sizes of + * these instructions. + * + * \sa ::lower_usub_sat64 + */ bool lower_add_sat; /** + * Set if only 64-bit nir_op_usub_sat should be lowered to simple + * arithmetic. + * + * \sa ::lower_add_sat + */ + bool lower_usub_sat64; + + /** * Should IO be re-vectorized? Some scalar ISAs still operate on vec4's * for IO purposes and would prefer loads/stores be vectorized. */ bool vectorize_io; + bool lower_to_scalar; + + /** + * Should the linker unify inputs_read/outputs_written between adjacent + * shader stages which are linked into a single program? + */ + bool unify_interfaces; /** * Should nir_lower_io() create load_interpolated_input intrinsics? @@ -2676,6 +2901,14 @@ bool lower_rotate; /** + * Backend supports imul24, and would like to use it (when possible) + * for address/offset calculation. If true, driver should call + * nir_lower_amul(). (If not set, amul will automatically be lowered + * to imul.) + */ + bool has_imul24; + + /** * Is this the Intel vec4 backend? * * Used to inhibit algebraic optimizations that are known to be harmful on @@ -3204,6 +3437,8 @@ void nir_index_blocks(nir_function_impl *impl); +void nir_index_vars(nir_shader *shader, nir_function_impl *impl, nir_variable_mode modes); + void nir_print_shader(nir_shader *shader, FILE *fp); void nir_print_shader_annotated(nir_shader *shader, FILE *fp, struct hash_table *errors); void nir_print_instr(const nir_instr *instr, FILE *fp); @@ -3439,6 +3674,8 @@ int size_threshold, glsl_type_size_align_func size_align); +void nir_lower_clip_halfz(nir_shader *shader); + void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint); void nir_gather_ssa_types(nir_function_impl *impl, @@ -3458,6 +3695,8 @@ void nir_link_xfb_varyings(nir_shader *producer, nir_shader *consumer); bool nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer); +bool nir_lower_amul(nir_shader *shader, + int (*type_size)(const struct glsl_type *, bool)); void nir_assign_io_var_locations(struct exec_list *var_list, unsigned *size, @@ -3606,13 +3845,14 @@ bool nir_move_vec_src_uses_to_dest(nir_shader *shader); bool nir_lower_vec_to_movs(nir_shader *shader); void nir_lower_alpha_test(nir_shader *shader, enum compare_func func, - bool alpha_to_one); + bool alpha_to_one, + const gl_state_index16 *alpha_ref_state_tokens); bool nir_lower_alu(nir_shader *shader); bool nir_lower_flrp(nir_shader *shader, unsigned lowering_mask, bool always_precise, bool have_ffma); -bool nir_lower_alu_to_scalar(nir_shader *shader, BITSET_WORD *lower_set); +bool nir_lower_alu_to_scalar(nir_shader *shader, nir_instr_filter_cb cb, const void *data); bool nir_lower_bool_to_float(nir_shader *shader); bool nir_lower_bool_to_int32(nir_shader *shader); bool nir_lower_int_to_float(nir_shader *shader); @@ -3640,6 +3880,8 @@ bool lower_shuffle:1; bool lower_shuffle_to_32bit:1; bool lower_quad:1; + bool lower_quad_broadcast_dynamic:1; + bool lower_quad_broadcast_dynamic_to_const:1; } nir_lower_subgroups_options; bool nir_lower_subgroups(nir_shader *shader, @@ -3824,21 +4066,44 @@ bool nir_lower_non_uniform_access(nir_shader *shader, enum nir_lower_non_uniform_access_type); -bool nir_lower_idiv(nir_shader *shader); +enum nir_lower_idiv_path { + /* This path is based on NV50LegalizeSSA::handleDIV(). It is the faster of + * the two but it is not exact in some cases (for example, 1091317713u / + * 1034u gives 5209173 instead of 1055432) */ + nir_lower_idiv_fast, + /* This path is based on AMDGPUTargetLowering::LowerUDIVREM() and + * AMDGPUTargetLowering::LowerSDIVREM(). It requires more instructions than + * the nv50 path and many of them are integer multiplications, so it is + * probably slower. It should always return the correct result, though. */ + nir_lower_idiv_precise, +}; + +bool nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path); bool nir_lower_input_attachments(nir_shader *shader, bool use_fragcoord_sysval); -bool nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, bool use_vars); -bool nir_lower_clip_gs(nir_shader *shader, unsigned ucp_enables); -bool nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables); +bool nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, + bool use_vars, + bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]); +bool nir_lower_clip_gs(nir_shader *shader, unsigned ucp_enables, + bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]); +bool nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables, + bool use_clipdist_array); bool nir_lower_clip_cull_distance_arrays(nir_shader *nir); +void nir_lower_point_size_mov(nir_shader *shader, + const gl_state_index16 *pointsize_state_tokens); + bool nir_lower_frexp(nir_shader *nir); void nir_lower_two_sided_color(nir_shader *shader); bool nir_lower_clamp_color_outputs(nir_shader *shader); +bool nir_lower_flatshade(nir_shader *shader); + void nir_lower_passthrough_edgeflags(nir_shader *shader); bool nir_lower_patch_vertices(nir_shader *nir, unsigned static_count, const gl_state_index16 *uniform_state_tokens); @@ -3877,7 +4142,7 @@ void nir_lower_bitmap(nir_shader *shader, const nir_lower_bitmap_options *options); -bool nir_lower_atomics_to_ssbo(nir_shader *shader, unsigned ssbo_offset); +bool nir_lower_atomics_to_ssbo(nir_shader *shader); typedef enum { nir_lower_int_source_mods = 1 << 0, @@ -3889,7 +4154,7 @@ bool nir_lower_to_source_mods(nir_shader *shader, nir_lower_to_source_mods_flags options); -bool nir_lower_gs_intrinsics(nir_shader *shader); +bool nir_lower_gs_intrinsics(nir_shader *shader, bool per_stream); typedef unsigned (*nir_lower_bit_size_callback)(const nir_alu_instr *, void *); @@ -3944,11 +4209,14 @@ bool nir_lower_ssa_defs_to_regs_block(nir_block *block); bool nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl); +bool nir_lower_samplers(nir_shader *shader); + /* This is here for unit tests. */ bool nir_opt_comparison_pre_impl(nir_function_impl *impl); bool nir_opt_comparison_pre(nir_shader *shader); +bool nir_opt_access(nir_shader *shader); bool nir_opt_algebraic(nir_shader *shader); bool nir_opt_algebraic_before_ffma(nir_shader *shader); bool nir_opt_algebraic_late(nir_shader *shader); @@ -3992,6 +4260,7 @@ nir_move_load_ubo = (1 << 1), nir_move_load_input = (1 << 2), nir_move_comparisons = (1 << 3), + nir_move_copies = (1 << 4), } nir_move_options; bool nir_can_move_instr(nir_instr *instr, nir_move_options options); @@ -4018,6 +4287,15 @@ bool nir_opt_conditional_discard(nir_shader *shader); +typedef bool (*nir_should_vectorize_mem_func)(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high); + +bool nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes, + nir_should_vectorize_mem_func callback); + +void nir_schedule(nir_shader *shader, int threshold); + void nir_strip(nir_shader *shader); void nir_sweep(nir_shader *shader); diff -Nru mesa-19.2.8/src/compiler/nir/nir_instr_set.c mesa-20.0.8/src/compiler/nir/nir_instr_set.c --- mesa-19.2.8/src/compiler/nir/nir_instr_set.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_instr_set.c 2020-06-12 01:21:16.000000000 +0000 @@ -276,6 +276,8 @@ hash = HASH(hash, instr->texture_index); hash = HASH(hash, instr->texture_array_size); hash = HASH(hash, instr->sampler_index); + hash = HASH(hash, instr->texture_non_uniform); + hash = HASH(hash, instr->sampler_non_uniform); return hash; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_intrinsics.py mesa-20.0.8/src/compiler/nir/nir_intrinsics.py --- mesa-19.2.8/src/compiler/nir/nir_intrinsics.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_intrinsics.py 2020-06-12 01:21:16.000000000 +0000 @@ -124,6 +124,14 @@ TYPE = "NIR_INTRINSIC_TYPE" # The swizzle mask for quad_swizzle_amd & masked_swizzle_amd SWIZZLE_MASK = "NIR_INTRINSIC_SWIZZLE_MASK" +# Driver location of attribute +DRIVER_LOCATION = "NIR_INTRINSIC_DRIVER_LOCATION" +# Ordering and visibility of a memory operation +MEMORY_SEMANTICS = "NIR_INTRINSIC_MEMORY_SEMANTICS" +# Modes affected by a memory operation +MEMORY_MODES = "NIR_INTRINSIC_MEMORY_MODES" +# Scope of a memory operation +MEMORY_SCOPE = "NIR_INTRINSIC_MEMORY_SCOPE" # # Possible flags: @@ -162,9 +170,9 @@ # Interpolation of input. The interp_deref_at* intrinsics are similar to the # load_var intrinsic acting on a shader input except that they interpolate the -# input differently. The at_sample and at_offset intrinsics take an -# additional source that is an integer sample id or a vec2 position offset -# respectively. +# input differently. The at_sample, at_offset and at_vertex intrinsics take an +# additional source that is an integer sample id, a vec2 position offset, or a +# vertex ID respectively. intrinsic("interp_deref_at_centroid", dest_comp=0, src_comp=[1], flags=[ CAN_ELIMINATE, CAN_REORDER]) @@ -172,6 +180,8 @@ flags=[CAN_ELIMINATE, CAN_REORDER]) intrinsic("interp_deref_at_offset", src_comp=[1, 2], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER]) +intrinsic("interp_deref_at_vertex", src_comp=[1, 1], dest_comp=0, + flags=[CAN_ELIMINATE, CAN_REORDER]) # Gets the length of an unsized array at the end of a buffer intrinsic("deref_buffer_array_length", src_comp=[-1], dest_comp=1, @@ -187,7 +197,6 @@ def barrier(name): intrinsic(name) -barrier("barrier") barrier("discard") # Demote fragment shader invocation to a helper invocation. Any stores to @@ -199,11 +208,23 @@ barrier("demote") intrinsic("is_helper_invocation", dest_comp=1, flags=[CAN_ELIMINATE]) +# A workgroup-level control barrier. Any thread which hits this barrier will +# pause until all threads within the current workgroup have also hit the +# barrier. For compute shaders, the workgroup is defined as the local group. +# For tessellation control shaders, the workgroup is defined as the current +# patch. This intrinsic does not imply any sort of memory barrier. +barrier("control_barrier") # Memory barrier with semantics analogous to the memoryBarrier() GLSL # intrinsic. barrier("memory_barrier") +# Memory barrier with explicit scope. Follows the semantics of SPIR-V +# OpMemoryBarrier, used to implement Vulkan Memory Model. Storage that the +# barrierr applies is represented using NIR variable modes. +intrinsic("scoped_memory_barrier", + indices=[MEMORY_SEMANTICS, MEMORY_MODES, MEMORY_SCOPE]) + # Shader clock intrinsic with semantics analogous to the clock2x32ARB() # GLSL intrinsic. # The latter can be used as code motion barrier, which is currently not @@ -241,6 +262,9 @@ barrier("begin_invocation_interlock") barrier("end_invocation_interlock") +# Memory barrier for synchronizing TCS patch outputs +barrier("memory_barrier_tcs_patch") + # A conditional discard/demote, with a single boolean source. intrinsic("discard_if", src_comp=[1]) intrinsic("demote_if", src_comp=[1]) @@ -365,11 +389,13 @@ intrinsic("bindless_image_" + name, src_comp=[1] + src_comp, indices=[IMAGE_DIM, IMAGE_ARRAY, FORMAT, ACCESS], **kwargs) -image("load", src_comp=[4, 1], dest_comp=0, flags=[CAN_ELIMINATE]) -image("store", src_comp=[4, 1, 0]) +image("load", src_comp=[4, 1, 1], dest_comp=0, flags=[CAN_ELIMINATE]) +image("store", src_comp=[4, 1, 0, 1]) image("atomic_add", src_comp=[4, 1, 1], dest_comp=1) -image("atomic_min", src_comp=[4, 1, 1], dest_comp=1) -image("atomic_max", src_comp=[4, 1, 1], dest_comp=1) +image("atomic_imin", src_comp=[4, 1, 1], dest_comp=1) +image("atomic_umin", src_comp=[4, 1, 1], dest_comp=1) +image("atomic_imax", src_comp=[4, 1, 1], dest_comp=1) +image("atomic_umax", src_comp=[4, 1, 1], dest_comp=1) image("atomic_and", src_comp=[4, 1, 1], dest_comp=1) image("atomic_or", src_comp=[4, 1, 1], dest_comp=1) image("atomic_xor", src_comp=[4, 1, 1], dest_comp=1) @@ -622,9 +648,10 @@ # Barycentric coordinate intrinsics. # # These set up the barycentric coordinates for a particular interpolation. -# The first three are for the simple cases: pixel, centroid, or per-sample -# (at gl_SampleID). The next two handle interpolating at a specified -# sample location, or interpolating with a vec2 offset, +# The first four are for the simple cases: pixel, centroid, per-sample +# (at gl_SampleID), or pull model (1/W, 1/I, 1/J) at the pixel center. The next +# three two handle interpolating at a specified sample location, or +# interpolating with a vec2 offset, # # The interp_mode index should be either the INTERP_MODE_SMOOTH or # INTERP_MODE_NOPERSPECTIVE enum values. @@ -632,18 +659,19 @@ # The vec2 value produced by these intrinsics is intended for use as the # barycoord source of a load_interpolated_input intrinsic. -def barycentric(name, src_comp=[]): - intrinsic("load_barycentric_" + name, src_comp=src_comp, dest_comp=2, +def barycentric(name, dst_comp, src_comp=[]): + intrinsic("load_barycentric_" + name, src_comp=src_comp, dest_comp=dst_comp, indices=[INTERP_MODE], flags=[CAN_ELIMINATE, CAN_REORDER]) # no sources. -barycentric("pixel") -barycentric("centroid") -barycentric("sample") +barycentric("pixel", 2) +barycentric("centroid", 2) +barycentric("sample", 2) +barycentric("model", 3) # src[] = { sample_id }. -barycentric("at_sample", [1]) +barycentric("at_sample", 2, [1]) # src[] = { offset.xy }. -barycentric("at_offset", [2]) +barycentric("at_offset", 2, [2]) # Load sample position: # @@ -698,6 +726,8 @@ load("ubo", 2, [ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE, CAN_REORDER]) # src[] = { offset }. load("input", 1, [BASE, COMPONENT, TYPE], [CAN_ELIMINATE, CAN_REORDER]) +# src[] = { vertex_id, offset }. +load("input_vertex", 2, [BASE, COMPONENT, TYPE], [CAN_ELIMINATE, CAN_REORDER]) # src[] = { vertex, offset }. load("per_vertex_input", 2, [BASE, COMPONENT], [CAN_ELIMINATE, CAN_REORDER]) # src[] = { barycoord, offset }. @@ -770,6 +800,45 @@ intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1) intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1) +# System values for freedreno geometry shaders. +system_value("vs_primitive_stride_ir3", 1) +system_value("vs_vertex_stride_ir3", 1) +system_value("gs_header_ir3", 1) +system_value("primitive_location_ir3", 1, indices=[DRIVER_LOCATION]) + +# System values for freedreno tessellation shaders. +system_value("hs_patch_stride_ir3", 1) +system_value("tess_factor_base_ir3", 2) +system_value("tess_param_base_ir3", 2) +system_value("tcs_header_ir3", 1) + +# IR3-specific intrinsics for tessellation control shaders. cond_end_ir3 end +# the shader when src0 is false and is used to narrow down the TCS shader to +# just thread 0 before writing out tessellation levels. +intrinsic("cond_end_ir3", src_comp=[1]) +# end_patch_ir3 is used just before thread 0 exist the TCS and presumably +# signals the TE that the patch is complete and can be tessellated. +intrinsic("end_patch_ir3") + +# IR3-specific load/store intrinsics. These access a buffer used to pass data +# between geometry stages - perhaps it's explicit access to the vertex cache. + +# src[] = { value, offset }. +store("shared_ir3", 2, [BASE, WRMASK, ALIGN_MUL, ALIGN_OFFSET]) +# src[] = { offset }. +load("shared_ir3", 1, [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE]) + +# IR3-specific load/store global intrinsics. They take a 64-bit base address +# and a 32-bit offset. The hardware will add the base and the offset, which +# saves us from doing 64-bit math on the base address. + +# src[] = { value, address(vec2 of hi+lo uint32_t), offset }. +# const_index[] = { write_mask, align_mul, align_offset } +intrinsic("store_global_ir3", [0, 2, 1], indices=[WRMASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +# src[] = { address(vec2 of hi+lo uint32_t), offset }. +# const_index[] = { access, align_mul, align_offset } +intrinsic("load_global_ir3", [2, 1], dest_comp=0, indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) + # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs @@ -781,10 +850,20 @@ # One notable divergence is sRGB, which is asymmetric: raw_input_pan requires # an sRGB->linear conversion, but linear values should be written to # raw_output_pan and the hardware handles linear->sRGB. +# +# We also have format-specific Midgard intrinsics. There are rather +# here-be-dragons. load_output_u8_as_fp16_pan does the equivalent of +# load_raw_out_pan on an RGBA8 UNORM framebuffer followed by u2u16 -> fp16 -> +# division by 255. # src[] = { value } store("raw_output_pan", 1, []) load("raw_output_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER]) +load("output_u8_as_fp16_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER]) + +# Loads the sampler paramaters +# src[] = { sampler_index } +load("sampler_lod_parameters_pan", 1, [CAN_ELIMINATE, CAN_REORDER]) # V3D-specific instrinc for tile buffer color reads. # @@ -803,3 +882,7 @@ # src[] = { value, render_target } # BASE = sample index store("tlb_sample_color_v3d", 2, [BASE, COMPONENT, TYPE], []) + +# V3D-specific intrinsic to load the number of layers attached to +# the target framebuffer +intrinsic("load_fb_layers_v3d", dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER]) diff -Nru mesa-19.2.8/src/compiler/nir/nir_linking_helpers.c mesa-20.0.8/src/compiler/nir/nir_linking_helpers.c --- mesa-19.2.8/src/compiler/nir/nir_linking_helpers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_linking_helpers.c 2020-06-12 01:21:16.000000000 +0000 @@ -443,6 +443,7 @@ uint8_t interp_loc; bool is_32bit; bool is_patch; + bool is_intra_stage_only; bool initialised; }; @@ -456,6 +457,12 @@ if (comp1->is_patch != comp2->is_patch) return comp1->is_patch ? 1 : -1; + /* We want to try to group together TCS outputs that are only read by other + * TCS invocations and not consumed by the follow stage. + */ + if (comp1->is_intra_stage_only != comp2->is_intra_stage_only) + return comp1->is_intra_stage_only ? 1 : -1; + /* We can only pack varyings with matching interpolation types so group * them together. */ @@ -471,7 +478,7 @@ } static void -gather_varying_component_info(nir_shader *consumer, +gather_varying_component_info(nir_shader *producer, nir_shader *consumer, struct varying_component **varying_comp_info, unsigned *varying_comp_info_size, bool default_to_smooth_interp) @@ -482,7 +489,7 @@ /* Count the number of varying that can be packed and create a mapping * of those varyings to the array we will pass to qsort. */ - nir_foreach_variable(var, &consumer->inputs) { + nir_foreach_variable(var, &producer->outputs) { /* Only remap things that aren't builtins. */ if (var->data.location >= VARYING_SLOT_VAR0 && @@ -493,7 +500,7 @@ continue; const struct glsl_type *type = var->type; - if (nir_is_per_vertex_io(var, consumer->info.stage)) { + if (nir_is_per_vertex_io(var, producer->info.stage)) { assert(glsl_type_is_array(type)); type = glsl_get_array_element(type); } @@ -523,7 +530,8 @@ if (intr->intrinsic != nir_intrinsic_load_deref && intr->intrinsic != nir_intrinsic_interp_deref_at_centroid && intr->intrinsic != nir_intrinsic_interp_deref_at_sample && - intr->intrinsic != nir_intrinsic_interp_deref_at_offset) + intr->intrinsic != nir_intrinsic_interp_deref_at_offset && + intr->intrinsic != nir_intrinsic_interp_deref_at_vertex) continue; nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); @@ -560,9 +568,86 @@ vc_info->interp_loc = get_interp_loc(in_var); vc_info->is_32bit = glsl_type_is_32bit(type); vc_info->is_patch = in_var->data.patch; + vc_info->is_intra_stage_only = false; + vc_info->initialised = true; + } + } + } + + /* Walk over the shader and populate the varying component info array + * for varyings which are read by other TCS instances but are not consumed + * by the TES. + */ + if (producer->info.stage == MESA_SHADER_TESS_CTRL) { + impl = nir_shader_get_entrypoint(producer); + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + if (deref->mode != nir_var_shader_out) + continue; + + /* We only remap things that aren't builtins. */ + nir_variable *out_var = nir_deref_instr_get_variable(deref); + if (out_var->data.location < VARYING_SLOT_VAR0) + continue; + + unsigned location = out_var->data.location - VARYING_SLOT_VAR0; + if (location >= MAX_VARYINGS_INCL_PATCH) + continue; + + unsigned var_info_idx = + store_varying_info_idx[location][out_var->data.location_frac]; + if (!var_info_idx) { + /* Something went wrong, the shader interfaces didn't match, so + * abandon packing. This can happen for example when the + * inputs are scalars but the outputs are struct members. + */ + *varying_comp_info_size = 0; + break; + } + + struct varying_component *vc_info = + &(*varying_comp_info)[var_info_idx-1]; + + if (!vc_info->initialised) { + const struct glsl_type *type = out_var->type; + if (nir_is_per_vertex_io(out_var, producer->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + vc_info->var = out_var; + vc_info->interp_type = + get_interp_type(out_var, type, default_to_smooth_interp); + vc_info->interp_loc = get_interp_loc(out_var); + vc_info->is_32bit = glsl_type_is_32bit(type); + vc_info->is_patch = out_var->data.patch; + vc_info->is_intra_stage_only = true; + vc_info->initialised = true; + } } } } + + for (unsigned i = 0; i < *varying_comp_info_size; i++ ) { + struct varying_component *vc_info = &(*varying_comp_info)[i]; + if (!vc_info->initialised) { + /* Something went wrong, the shader interfaces didn't match, so + * abandon packing. This can happen for example when the outputs are + * scalars but the inputs are struct members. + */ + *varying_comp_info_size = 0; + break; + } + } } static void @@ -647,7 +732,7 @@ unsigned varying_comp_info_size; /* Gather varying component info */ - gather_varying_component_info(consumer, &varying_comp_info, + gather_varying_component_info(producer, consumer, &varying_comp_info, &varying_comp_info_size, default_to_smooth_interp); @@ -1007,9 +1092,6 @@ sort_varyings(var_list); - const int base = stage == MESA_SHADER_FRAGMENT ? - (int) FRAG_RESULT_DATA0 : (int) VARYING_SLOT_VAR0; - int UNUSED last_loc = 0; bool last_partial = false; nir_foreach_variable(var, var_list) { @@ -1019,6 +1101,15 @@ type = glsl_get_array_element(type); } + int base; + if (var->data.mode == nir_var_shader_in && stage == MESA_SHADER_VERTEX) + base = VERT_ATTRIB_GENERIC0; + else if (var->data.mode == nir_var_shader_out && + stage == MESA_SHADER_FRAGMENT) + base = FRAG_RESULT_DATA0; + else + base = VARYING_SLOT_VAR0; + unsigned var_size; if (var->data.compact) { /* compact variables must be arrays of scalars */ @@ -1079,7 +1170,7 @@ if (last_slot_location > location) { unsigned num_unallocated_slots = last_slot_location - location; unsigned first_unallocated_slot = var_size - num_unallocated_slots; - for (unsigned i = first_unallocated_slot; i < num_unallocated_slots; i++) { + for (unsigned i = first_unallocated_slot; i < var_size; i++) { assigned_locations[var->data.location + i] = location; location++; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_loop_analyze.c mesa-20.0.8/src/compiler/nir/nir_loop_analyze.c --- mesa-19.2.8/src/compiler/nir/nir_loop_analyze.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_loop_analyze.c 2020-06-12 01:21:16.000000000 +0000 @@ -589,29 +589,32 @@ } static nir_const_value -eval_const_unop(nir_op op, unsigned bit_size, nir_const_value src0) +eval_const_unop(nir_op op, unsigned bit_size, nir_const_value src0, + unsigned execution_mode) { assert(nir_op_infos[op].num_inputs == 1); nir_const_value dest; nir_const_value *src[1] = { &src0 }; - nir_eval_const_opcode(op, &dest, 1, bit_size, src); + nir_eval_const_opcode(op, &dest, 1, bit_size, src, execution_mode); return dest; } static nir_const_value eval_const_binop(nir_op op, unsigned bit_size, - nir_const_value src0, nir_const_value src1) + nir_const_value src0, nir_const_value src1, + unsigned execution_mode) { assert(nir_op_infos[op].num_inputs == 2); nir_const_value dest; nir_const_value *src[2] = { &src0, &src1 }; - nir_eval_const_opcode(op, &dest, 1, bit_size, src); + nir_eval_const_opcode(op, &dest, 1, bit_size, src, execution_mode); return dest; } static int32_t get_iteration(nir_op cond_op, nir_const_value initial, nir_const_value step, - nir_const_value limit, unsigned bit_size) + nir_const_value limit, unsigned bit_size, + unsigned execution_mode) { nir_const_value span, iter; @@ -620,23 +623,29 @@ case nir_op_ilt: case nir_op_ieq: case nir_op_ine: - span = eval_const_binop(nir_op_isub, bit_size, limit, initial); - iter = eval_const_binop(nir_op_idiv, bit_size, span, step); + span = eval_const_binop(nir_op_isub, bit_size, limit, initial, + execution_mode); + iter = eval_const_binop(nir_op_idiv, bit_size, span, step, + execution_mode); break; case nir_op_uge: case nir_op_ult: - span = eval_const_binop(nir_op_isub, bit_size, limit, initial); - iter = eval_const_binop(nir_op_udiv, bit_size, span, step); + span = eval_const_binop(nir_op_isub, bit_size, limit, initial, + execution_mode); + iter = eval_const_binop(nir_op_udiv, bit_size, span, step, + execution_mode); break; case nir_op_fge: case nir_op_flt: case nir_op_feq: case nir_op_fne: - span = eval_const_binop(nir_op_fsub, bit_size, limit, initial); - iter = eval_const_binop(nir_op_fdiv, bit_size, span, step); - iter = eval_const_unop(nir_op_f2i64, bit_size, iter); + span = eval_const_binop(nir_op_fsub, bit_size, limit, initial, + execution_mode); + iter = eval_const_binop(nir_op_fdiv, bit_size, span, + step, execution_mode); + iter = eval_const_unop(nir_op_f2i64, bit_size, iter, execution_mode); break; default: @@ -654,7 +663,8 @@ nir_op cond_op, unsigned bit_size, nir_const_value initial, nir_const_value limit, - bool limit_rhs, bool invert_cond) + bool limit_rhs, bool invert_cond, + unsigned execution_mode) { if (trip_offset == 1) { nir_op add_op; @@ -670,7 +680,8 @@ unreachable("Unhandled induction variable base type!"); } - initial = eval_const_binop(add_op, bit_size, initial, step); + initial = eval_const_binop(add_op, bit_size, initial, step, + execution_mode); } nir_const_value *src[2]; @@ -679,7 +690,7 @@ /* Evaluate the loop exit condition */ nir_const_value result; - nir_eval_const_opcode(cond_op, &result, 1, bit_size, src); + nir_eval_const_opcode(cond_op, &result, 1, bit_size, src, execution_mode); return invert_cond ? !result.b : result.b; } @@ -688,7 +699,8 @@ test_iterations(int32_t iter_int, nir_const_value step, nir_const_value limit, nir_op cond_op, unsigned bit_size, nir_alu_type induction_base_type, - nir_const_value initial, bool limit_rhs, bool invert_cond) + nir_const_value initial, bool limit_rhs, bool invert_cond, + unsigned execution_mode) { assert(nir_op_infos[cond_op].num_inputs == 2); @@ -715,11 +727,11 @@ * step the induction variable each iteration. */ nir_const_value mul_result = - eval_const_binop(mul_op, bit_size, iter_src, step); + eval_const_binop(mul_op, bit_size, iter_src, step, execution_mode); /* Add the initial value to the accumulated induction variable total */ nir_const_value add_result = - eval_const_binop(add_op, bit_size, mul_result, initial); + eval_const_binop(add_op, bit_size, mul_result, initial, execution_mode); nir_const_value *src[2]; src[limit_rhs ? 0 : 1] = &add_result; @@ -727,7 +739,7 @@ /* Evaluate the loop exit condition */ nir_const_value result; - nir_eval_const_opcode(cond_op, &result, 1, bit_size, src); + nir_eval_const_opcode(cond_op, &result, 1, bit_size, src, execution_mode); return invert_cond ? !result.b : result.b; } @@ -736,7 +748,7 @@ calculate_iterations(nir_const_value initial, nir_const_value step, nir_const_value limit, nir_alu_instr *alu, nir_ssa_scalar cond, nir_op alu_op, bool limit_rhs, - bool invert_cond) + bool invert_cond, unsigned execution_mode) { /* nir_op_isub should have been lowered away by this point */ assert(alu->op != nir_op_isub); @@ -786,11 +798,13 @@ */ if (will_break_on_first_iteration(step, induction_base_type, trip_offset, alu_op, bit_size, initial, - limit, limit_rhs, invert_cond)) { + limit, limit_rhs, invert_cond, + execution_mode)) { return 0; } - int iter_int = get_iteration(alu_op, initial, step, limit, bit_size); + int iter_int = get_iteration(alu_op, initial, step, limit, bit_size, + execution_mode); /* If iter_int is negative the loop is ill-formed or is the conditional is * unsigned with a huge iteration count so don't bother going any further. @@ -812,7 +826,7 @@ if (test_iterations(iter_bias, step, limit, alu_op, bit_size, induction_base_type, initial, - limit_rhs, invert_cond)) { + limit_rhs, invert_cond, execution_mode)) { return iter_bias > 0 ? iter_bias - trip_offset : iter_bias; } } @@ -950,7 +964,7 @@ * loop. */ static void -find_trip_count(loop_info_state *state) +find_trip_count(loop_info_state *state, unsigned execution_mode) { bool trip_count_known = true; bool guessed_trip_count = false; @@ -1063,7 +1077,8 @@ int iterations = calculate_iterations(initial_val, step_val, limit_val, ind_var->alu, cond, alu_op, limit_rhs, - terminator->continue_from_then); + terminator->continue_from_then, + execution_mode); /* Where we not able to calculate the iteration count */ if (iterations == -1) { @@ -1203,7 +1218,7 @@ return; /* Run through each of the terminators and try to compute a trip-count */ - find_trip_count(state); + find_trip_count(state, impl->function->shader->info.float_controls_execution_mode); nir_foreach_block_in_cf_node(block, &state->loop->cf_node) { if (force_unroll_heuristics(state, block)) { diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_alpha_test.c mesa-20.0.8/src/compiler/nir/nir_lower_alpha_test.c --- mesa-19.2.8/src/compiler/nir/nir_lower_alpha_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_alpha_test.c 2020-06-12 01:21:16.000000000 +0000 @@ -37,7 +37,8 @@ void nir_lower_alpha_test(nir_shader *shader, enum compare_func func, - bool alpha_to_one) + bool alpha_to_one, + const gl_state_index16 *alpha_ref_state_tokens) { assert(shader->info.stage == MESA_SHADER_FRAGMENT); @@ -93,9 +94,23 @@ 3); } + nir_ssa_def *alpha_ref; + if (alpha_ref_state_tokens) { + nir_variable *var = nir_variable_create(shader, + nir_var_uniform, + glsl_float_type(), + "gl_AlphaRefMESA"); + var->num_state_slots = 1; + var->state_slots = ralloc_array(var, nir_state_slot, 1); + memcpy(var->state_slots[0].tokens, + alpha_ref_state_tokens, + sizeof(var->state_slots[0].tokens)); + alpha_ref = nir_load_var(&b, var); + } else + alpha_ref = nir_load_alpha_ref_float(&b); + nir_ssa_def *condition = - nir_compare_func(&b, func, - alpha, nir_load_alpha_ref_float(&b)); + nir_compare_func(&b, func, alpha, alpha_ref); nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b.shader, diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_alu_to_scalar.c mesa-20.0.8/src/compiler/nir/nir_lower_alu_to_scalar.c --- mesa-19.2.8/src/compiler/nir/nir_lower_alu_to_scalar.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_alu_to_scalar.c 2020-06-12 01:21:16.000000000 +0000 @@ -24,6 +24,11 @@ #include "nir.h" #include "nir_builder.h" +struct alu_to_scalar_data { + nir_instr_filter_cb cb; + const void *data; +}; + /** @file nir_lower_alu_to_scalar.c * * Replaces nir_alu_instr operations with more than one channel used in the @@ -89,9 +94,9 @@ } static nir_ssa_def * -lower_alu_instr_scalar(nir_builder *b, nir_instr *instr, void *_state) +lower_alu_instr_scalar(nir_builder *b, nir_instr *instr, void *_data) { - BITSET_WORD *lower_set = _state; + struct alu_to_scalar_data *data = _data; nir_alu_instr *alu = nir_instr_as_alu(instr); unsigned num_src = nir_op_infos[alu->op].num_inputs; unsigned i, chan; @@ -102,7 +107,7 @@ b->cursor = nir_before_instr(&alu->instr); b->exact = alu->exact; - if (lower_set && !BITSET_TEST(lower_set, alu->op)) + if (data->cb && !data->cb(instr, data->data)) return NULL; #define LOWER_REDUCTION(name, chan, merge) \ @@ -112,6 +117,8 @@ return lower_reduction(alu, chan, merge, b); \ switch (alu->op) { + case nir_op_vec16: + case nir_op_vec8: case nir_op_vec4: case nir_op_vec3: case nir_op_vec2: @@ -140,13 +147,23 @@ */ return NULL; + case nir_op_unpack_half_2x16_flush_to_zero: case nir_op_unpack_half_2x16: { if (!b->shader->options->lower_unpack_half_2x16) return NULL; nir_ssa_def *packed = nir_ssa_for_alu_src(b, alu, 0); - return nir_vec2(b, nir_unpack_half_2x16_split_x(b, packed), - nir_unpack_half_2x16_split_y(b, packed)); + if (alu->op == nir_op_unpack_half_2x16_flush_to_zero) { + return nir_vec2(b, + nir_unpack_half_2x16_split_x_flush_to_zero(b, + packed), + nir_unpack_half_2x16_split_y_flush_to_zero(b, + packed)); + } else { + return nir_vec2(b, + nir_unpack_half_2x16_split_x(b, packed), + nir_unpack_half_2x16_split_y(b, packed)); + } } case nir_op_pack_uvec2_to_uint: { @@ -199,6 +216,14 @@ LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand); LOWER_REDUCTION(nir_op_bany_fnequal, nir_op_fne, nir_op_ior); LOWER_REDUCTION(nir_op_bany_inequal, nir_op_ine, nir_op_ior); + LOWER_REDUCTION(nir_op_b8all_fequal, nir_op_feq8, nir_op_iand); + LOWER_REDUCTION(nir_op_b8all_iequal, nir_op_ieq8, nir_op_iand); + LOWER_REDUCTION(nir_op_b8any_fnequal, nir_op_fne8, nir_op_ior); + LOWER_REDUCTION(nir_op_b8any_inequal, nir_op_ine8, nir_op_ior); + LOWER_REDUCTION(nir_op_b16all_fequal, nir_op_feq16, nir_op_iand); + LOWER_REDUCTION(nir_op_b16all_iequal, nir_op_ieq16, nir_op_iand); + LOWER_REDUCTION(nir_op_b16any_fnequal, nir_op_fne16, nir_op_ior); + LOWER_REDUCTION(nir_op_b16any_inequal, nir_op_ine16, nir_op_ior); LOWER_REDUCTION(nir_op_b32all_fequal, nir_op_feq32, nir_op_iand); LOWER_REDUCTION(nir_op_b32all_iequal, nir_op_ieq32, nir_op_iand); LOWER_REDUCTION(nir_op_b32any_fnequal, nir_op_fne32, nir_op_ior); @@ -246,10 +271,15 @@ } bool -nir_lower_alu_to_scalar(nir_shader *shader, BITSET_WORD *lower_set) +nir_lower_alu_to_scalar(nir_shader *shader, nir_instr_filter_cb cb, const void *_data) { + struct alu_to_scalar_data data = { + .cb = cb, + .data = _data, + }; + return nir_shader_lower_instructions(shader, inst_is_vector_alu, lower_alu_instr_scalar, - lower_set); + &data); } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_amul.c mesa-20.0.8/src/compiler/nir/nir_lower_amul.c --- mesa-19.2.8/src/compiler/nir/nir_lower_amul.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_amul.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,316 @@ +/* + * Copyright © 2019 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nir.h" +#include "nir_vla.h" + +/* Lowering for amul instructions, for drivers that support imul24. + * This pass will analyze indirect derefs, and convert corresponding + * amul instructions to either imul or imul24, depending on the + * required range. + * + * 1) Analyze the uniform variables and build a table of UBOs and SSBOs + * that are either too large, or might be too large (unknown size) + * for imul24 + * + * 2) Loop thru looking at all the intrinsics, finding dereferences of + * large variables, and recursively replacing all amul instructions + * used with imul + * + * 3) Finally loop again thru all instructions replacing any remaining + * amul with imul24. At this point any remaining amul instructions + * are not involved in calculating an offset into a large variable, + * thanks to the 2nd step, so they can be safely replace with imul24. + * + * Using two passes over all the instructions lets us handle the case + * where, due to CSE, an amul is used to calculate an offset into both + * a large and small variable. + */ + +typedef struct { + int (*type_size)(const struct glsl_type *, bool); + + /* Tables of UBOs and SSBOs mapping driver_location/base whether + * they are too large to use imul24: + */ + bool *large_ubos; + bool *large_ssbos; + + /* for cases that we cannot determine UBO/SSBO index, track if *any* + * UBO/SSBO is too large for imul24: + */ + bool has_large_ubo; + bool has_large_ssbo; +} lower_state; + +/* Lower 'amul's in offset src of large variables to 'imul': */ +static bool +lower_large_src(nir_src *src, void *s) +{ + lower_state *state = s; + + assert(src->is_ssa); + + nir_instr *parent = src->ssa->parent_instr; + + /* No need to visit instructions we've already visited.. this also + * avoids infinite recursion when phi's are involved: + */ + if (parent->pass_flags) + return false; + + bool progress = nir_foreach_src(parent, lower_large_src, state); + + if (parent->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(parent); + if (alu->op == nir_op_amul) { + alu->op = nir_op_imul; + progress = true; + } + } + + parent->pass_flags = 1; + + return progress; +} + +static bool +large_ubo(lower_state *state, nir_src src) +{ + if (!nir_src_is_const(src)) + return state->has_large_ubo; + return state->large_ubos[nir_src_as_uint(src)]; +} + +static bool +large_ssbo(lower_state *state, nir_src src) +{ + if (!nir_src_is_const(src)) + return state->has_large_ssbo; + return state->large_ssbos[nir_src_as_uint(src)]; +} + +static bool +lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + //# src[] = { buffer_index, offset }. + if (large_ubo(state, intr->src[0])) + return lower_large_src(&intr->src[1], state); + return false; + + case nir_intrinsic_load_ssbo: + //# src[] = { buffer_index, offset }. + if (large_ssbo(state, intr->src[0])) + return lower_large_src(&intr->src[1], state); + return false; + + case nir_intrinsic_store_ssbo: + //# src[] = { value, block_index, offset } + if (large_ssbo(state, intr->src[1])) + return lower_large_src(&intr->src[2], state); + return false; + + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fadd: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: + /* 0: SSBO index + * 1: offset + */ + if (large_ssbo(state, intr->src[0])) + return lower_large_src(&intr->src[1], state); + return false; + + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_fadd: + case nir_intrinsic_global_atomic_fmin: + case nir_intrinsic_global_atomic_fmax: + case nir_intrinsic_global_atomic_fcomp_swap: + /* just assume we that 24b is not sufficient: */ + return lower_large_src(&intr->src[0], state); + + /* These should all be small enough to unconditionally use imul24: */ + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_shared_atomic_fmax: + case nir_intrinsic_shared_atomic_fcomp_swap: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_input: + case nir_intrinsic_load_output: + case nir_intrinsic_store_output: + default: + return false; + } +} + +static bool +lower_instr(lower_state *state, nir_instr *instr) +{ + bool progress = false; + + if (instr->type == nir_instr_type_intrinsic) { + progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr)); + } + + return progress; +} + +static bool +is_large(lower_state *state, nir_variable *var) +{ + unsigned size = state->type_size(var->type, false); + + /* if size is not known (ie. VLA) then assume the worst: */ + if (!size) + return true; + + return size >= (1 << 23); +} + +bool +nir_lower_amul(nir_shader *shader, + int (*type_size)(const struct glsl_type *, bool)) +{ + assert(shader->options->has_imul24); + assert(type_size); + + /* uniforms list actually includes ubo's and ssbo's: */ + int num_uniforms = exec_list_length(&shader->uniforms); + + NIR_VLA_FILL(bool, large_ubos, num_uniforms, 0); + NIR_VLA_FILL(bool, large_ssbos, num_uniforms, 0); + + lower_state state = { + .type_size = type_size, + .large_ubos = large_ubos, + .large_ssbos = large_ssbos, + }; + + /* Figure out which UBOs or SSBOs are large enough to be + * disqualified from imul24: + */ + nir_foreach_variable(var, &shader->uniforms) { + if (var->data.mode == nir_var_mem_ubo) { + assert(var->data.driver_location < num_uniforms); + if (is_large(&state, var)) { + state.has_large_ubo = true; + state.large_ubos[var->data.driver_location] = true; + } + } else if (var->data.mode == nir_var_mem_ssbo) { + assert(var->data.driver_location < num_uniforms); + if (is_large(&state, var)) { + state.has_large_ssbo = true; + state.large_ssbos[var->data.driver_location] = true; + } + } + } + + /* clear pass flags: */ + nir_foreach_function(function, shader) { + nir_function_impl *impl = function->impl; + if (!impl) + continue; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + instr->pass_flags = 0; + } + } + } + + bool progress = false; + nir_foreach_function(function, shader) { + nir_function_impl *impl = function->impl; + + if (!impl) + continue; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + progress |= lower_instr(&state, instr); + } + } + } + + /* At this point, all 'amul's used in calculating an offset into + * a large variable have been replaced with 'imul'. So remaining + * 'amul's can be replaced with 'imul24': + */ + nir_foreach_function(function, shader) { + nir_function_impl *impl = function->impl; + + if (!impl) + continue; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_alu) + continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + if (alu->op != nir_op_amul) + continue; + + alu->op = nir_op_imul24; + progress |= true; + } + } + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + } + + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_array_deref_of_vec.c mesa-20.0.8/src/compiler/nir/nir_lower_array_deref_of_vec.c --- mesa-19.2.8/src/compiler/nir/nir_lower_array_deref_of_vec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_array_deref_of_vec.c 2020-06-12 01:21:16.000000000 +0000 @@ -80,6 +80,7 @@ intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid && intrin->intrinsic != nir_intrinsic_interp_deref_at_sample && intrin->intrinsic != nir_intrinsic_interp_deref_at_offset && + intrin->intrinsic != nir_intrinsic_interp_deref_at_vertex && intrin->intrinsic != nir_intrinsic_store_deref) continue; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_atomics_to_ssbo.c mesa-20.0.8/src/compiler/nir/nir_lower_atomics_to_ssbo.c --- mesa-19.2.8/src/compiler/nir/nir_lower_atomics_to_ssbo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_atomics_to_ssbo.c 2020-06-12 01:21:16.000000000 +0000 @@ -32,47 +32,25 @@ #endif /* - * Remap atomic counters to SSBOs. Atomic counters get remapped to - * SSBO binding points [0..ssbo_offset) and the original SSBOs are - * remapped to [ssbo_offset..n) (mostly to align with what mesa/st - * does. + * Remap atomic counters to SSBOs, starting from the shader's next SSBO slot + * (info.num_ssbos). */ static bool lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b) { nir_intrinsic_op op; - int idx_src; b->cursor = nir_before_instr(&instr->instr); switch (instr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_ssbo_atomic_fadd: - case nir_intrinsic_ssbo_atomic_fmin: - case nir_intrinsic_ssbo_atomic_fmax: - case nir_intrinsic_ssbo_atomic_fcomp_swap: - case nir_intrinsic_store_ssbo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_get_buffer_size: - /* easy case, keep same opcode and just remap SSBO buffer index: */ - op = instr->intrinsic; - idx_src = (op == nir_intrinsic_store_ssbo) ? 1 : 0; - nir_ssa_def *old_idx = nir_ssa_for_src(b, instr->src[idx_src], 1); - nir_ssa_def *new_idx = nir_iadd(b, old_idx, nir_imm_int(b, ssbo_offset)); - nir_instr_rewrite_src(&instr->instr, - &instr->src[idx_src], - nir_src_for_ssa(new_idx)); + case nir_intrinsic_memory_barrier_atomic_counter: + /* Atomic counters are now SSBOs so memoryBarrierAtomicCounter() is now + * memoryBarrierBuffer(). + */ + instr->intrinsic = nir_intrinsic_memory_barrier_buffer; return true; + case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_add: case nir_intrinsic_atomic_counter_pre_dec: @@ -108,7 +86,7 @@ return false; } - nir_ssa_def *buffer = nir_imm_int(b, nir_intrinsic_base(instr)); + nir_ssa_def *buffer = nir_imm_int(b, ssbo_offset + nir_intrinsic_base(instr)); nir_ssa_def *temp = NULL; nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(ralloc_parent(instr), op); @@ -184,8 +162,9 @@ } bool -nir_lower_atomics_to_ssbo(nir_shader *shader, unsigned ssbo_offset) +nir_lower_atomics_to_ssbo(nir_shader *shader) { + unsigned ssbo_offset = shader->info.num_ssbos; bool progress = false; nir_foreach_function(function, shader) { @@ -224,7 +203,21 @@ snprintf(name, sizeof(name), "counter%d", var->data.binding); ssbo = nir_variable_create(shader, nir_var_mem_ssbo, type, name); - ssbo->data.binding = var->data.binding; + ssbo->data.binding = ssbo_offset + var->data.binding; + + /* We can't use num_abos, because it only represents the number of + * active atomic counters, and currently unlike SSBO's they aren't + * compacted so num_abos actually isn't a bound on the index passed + * to nir_intrinsic_atomic_counter_*. e.g. if we have a single atomic + * counter declared like: + * + * layout(binding=1) atomic_uint counter0; + * + * then when we lower accesses to it the atomic_counter_* intrinsics + * will have 1 as the index but num_abos will still be 1. + */ + shader->info.num_ssbos = MAX2(shader->info.num_ssbos, + ssbo->data.binding + 1); struct glsl_struct_field field = { .type = type, @@ -239,6 +232,8 @@ replaced |= (1 << var->data.binding); } } + + shader->info.num_abos = 0; } return progress; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_bool_to_float.c mesa-20.0.8/src/compiler/nir/nir_lower_bool_to_float.c --- mesa-19.2.8/src/compiler/nir/nir_lower_bool_to_float.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_bool_to_float.c 2020-06-12 01:21:16.000000000 +0000 @@ -56,6 +56,8 @@ case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: /* These we expect to have booleans but the opcode doesn't change */ break; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_bool_to_int32.c mesa-20.0.8/src/compiler/nir/nir_lower_bool_to_int32.c --- mesa-19.2.8/src/compiler/nir/nir_lower_bool_to_int32.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_bool_to_int32.c 2020-06-12 01:21:16.000000000 +0000 @@ -53,6 +53,8 @@ case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: case nir_op_inot: case nir_op_iand: case nir_op_ior: diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_clip.c mesa-20.0.8/src/compiler/nir/nir_lower_clip.c --- mesa-19.2.8/src/compiler/nir/nir_lower_clip.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_clip.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,17 +42,23 @@ static nir_variable * create_clipdist_var(nir_shader *shader, unsigned drvloc, - bool output, gl_varying_slot slot) + bool output, gl_varying_slot slot, unsigned array_size) { nir_variable *var = rzalloc(shader, nir_variable); var->data.driver_location = drvloc; - var->type = glsl_vec4_type(); var->data.mode = output ? nir_var_shader_out : nir_var_shader_in; var->name = ralloc_asprintf(var, "clipdist_%d", drvloc); var->data.index = 0; var->data.location = slot; + if (array_size > 0) { + var->type = glsl_array_type(glsl_float_type(), array_size, + sizeof(float)); + var->data.compact = 1; + } else + var->type = glsl_vec4_type(); + if (output) { exec_list_push_tail(&shader->outputs, &var->node); shader->num_outputs++; /* TODO use type_size() */ @@ -66,16 +72,24 @@ static void create_clipdist_vars(nir_shader *shader, nir_variable **io_vars, - unsigned ucp_enables, int *drvloc, bool output) + unsigned ucp_enables, int *drvloc, bool output, + bool use_clipdist_array) { - if (ucp_enables & 0x0f) + if (use_clipdist_array) { io_vars[0] = - create_clipdist_var(shader, ++(*drvloc), output, - VARYING_SLOT_CLIP_DIST0); - if (ucp_enables & 0xf0) - io_vars[1] = - create_clipdist_var(shader, ++(*drvloc), output, - VARYING_SLOT_CLIP_DIST1); + create_clipdist_var(shader, ++(*drvloc), true, + VARYING_SLOT_CLIP_DIST0, + util_last_bit(ucp_enables)); + } else { + if (ucp_enables & 0x0f) + io_vars[0] = + create_clipdist_var(shader, ++(*drvloc), output, + VARYING_SLOT_CLIP_DIST0, 0); + if (ucp_enables & 0xf0) + io_vars[1] = + create_clipdist_var(shader, ++(*drvloc), output, + VARYING_SLOT_CLIP_DIST1, 0); + } } static void @@ -189,10 +203,35 @@ return *clipvertex || *position; } +static nir_ssa_def * +get_ucp(nir_builder *b, int plane, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]) +{ + if (clipplane_state_tokens) { + char tmp[100]; + snprintf(tmp, ARRAY_SIZE(tmp), "gl_ClipPlane%dMESA", plane); + nir_variable *var = nir_variable_create(b->shader, + nir_var_uniform, + glsl_vec4_type(), + tmp); + + var->num_state_slots = 1; + var->state_slots = ralloc_array(var, nir_state_slot, 1); + memcpy(var->state_slots[0].tokens, + clipplane_state_tokens[plane], + sizeof(var->state_slots[0].tokens)); + return nir_load_var(b, var); + } else + return nir_load_user_clip_plane(b, plane); +} + + static void lower_clip_outputs(nir_builder *b, nir_variable *position, nir_variable *clipvertex, nir_variable **out, - unsigned ucp_enables, bool use_vars) + unsigned ucp_enables, bool use_vars, + bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]) { nir_ssa_def *clipdist[MAX_CLIP_PLANES]; nir_ssa_def *cv; @@ -204,6 +243,7 @@ exec_node_remove(&clipvertex->node); clipvertex->data.mode = nir_var_shader_temp; exec_list_push_tail(&b->shader->globals, &clipvertex->node); + nir_fixup_deref_modes(b->shader); } } else { if (clipvertex) @@ -216,7 +256,7 @@ for (int plane = 0; plane < MAX_CLIP_PLANES; plane++) { if (ucp_enables & (1 << plane)) { - nir_ssa_def *ucp = nir_load_user_clip_plane(b, plane); + nir_ssa_def *ucp = get_ucp(b, plane, clipplane_state_tokens); /* calculate clipdist[plane] - dot(ucp, cv): */ clipdist[plane] = nir_fdot4(b, ucp, cv); @@ -224,18 +264,28 @@ /* 0.0 == don't-clip == disabled: */ clipdist[plane] = nir_imm_float(b, 0.0); } + if (use_clipdist_array && plane < util_last_bit(ucp_enables)) { + assert(use_vars); + nir_deref_instr *deref; + deref = nir_build_deref_array_imm(b, + nir_build_deref_var(b, out[0]), + plane); + nir_store_deref(b, deref, clipdist[plane], 1); + } } - if (use_vars) { - if (ucp_enables & 0x0f) - nir_store_var(b, out[0], nir_vec(b, clipdist, 4), 0xf); - if (ucp_enables & 0xf0) - nir_store_var(b, out[1], nir_vec(b, &clipdist[4], 4), 0xf); - } else { - if (ucp_enables & 0x0f) - store_clipdist_output(b, out[0], &clipdist[0]); - if (ucp_enables & 0xf0) - store_clipdist_output(b, out[1], &clipdist[4]); + if (!use_clipdist_array) { + if (use_vars) { + if (ucp_enables & 0x0f) + nir_store_var(b, out[0], nir_vec(b, clipdist, 4), 0xf); + if (ucp_enables & 0xf0) + nir_store_var(b, out[1], nir_vec(b, &clipdist[4], 4), 0xf); + } else { + if (ucp_enables & 0x0f) + store_clipdist_output(b, out[0], &clipdist[0]); + if (ucp_enables & 0xf0) + store_clipdist_output(b, out[1], &clipdist[4]); + } } } @@ -248,9 +298,14 @@ * * If use_vars is true, the pass will use variable loads and stores instead * of working with store_output intrinsics. + * + * If use_clipdist_array is true, the pass will use compact arrays for the + * clipdist output instead of two vec4s. */ bool -nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, bool use_vars) +nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables, bool use_vars, + bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); nir_builder b; @@ -292,9 +347,11 @@ return false; /* insert CLIPDIST outputs */ - create_clipdist_vars(shader, out, ucp_enables, &maxloc, true); + create_clipdist_vars(shader, out, ucp_enables, &maxloc, true, + use_clipdist_array); - lower_clip_outputs(&b, position, clipvertex, out, ucp_enables, use_vars); + lower_clip_outputs(&b, position, clipvertex, out, ucp_enables, use_vars, + use_clipdist_array, clipplane_state_tokens); nir_metadata_preserve(impl, nir_metadata_dominance); @@ -304,7 +361,8 @@ static void lower_clip_in_gs_block(nir_builder *b, nir_block *block, nir_variable *position, nir_variable *clipvertex, nir_variable **out, - unsigned ucp_enables) + unsigned ucp_enables, bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) @@ -315,7 +373,8 @@ case nir_intrinsic_emit_vertex_with_counter: case nir_intrinsic_emit_vertex: b->cursor = nir_before_instr(instr); - lower_clip_outputs(b, position, clipvertex, out, ucp_enables, true); + lower_clip_outputs(b, position, clipvertex, out, ucp_enables, true, + use_clipdist_array, clipplane_state_tokens); break; default: /* not interesting; skip this */ @@ -329,7 +388,9 @@ */ bool -nir_lower_clip_gs(nir_shader *shader, unsigned ucp_enables) +nir_lower_clip_gs(nir_shader *shader, unsigned ucp_enables, + bool use_clipdist_array, + const gl_state_index16 clipplane_state_tokens[][STATE_LENGTH]) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); nir_builder b; @@ -346,12 +407,15 @@ return false; /* insert CLIPDIST outputs */ - create_clipdist_vars(shader, out, ucp_enables, &maxloc, true); + create_clipdist_vars(shader, out, ucp_enables, &maxloc, true, + use_clipdist_array); nir_builder_init(&b, impl); nir_foreach_block(block, impl) - lower_clip_in_gs_block(&b, block, position, clipvertex, out, ucp_enables); + lower_clip_in_gs_block(&b, block, position, clipvertex, out, + ucp_enables, use_clipdist_array, + clipplane_state_tokens); nir_metadata_preserve(impl, nir_metadata_dominance); @@ -399,7 +463,8 @@ /* insert conditional kill based on interpolated CLIPDIST */ bool -nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables) +nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables, + bool use_clipdist_array) { nir_variable *in[2]; int maxloc = -1; @@ -421,7 +486,8 @@ * must add our own: */ /* insert CLIPDIST inputs */ - create_clipdist_vars(shader, in, ucp_enables, &maxloc, false); + create_clipdist_vars(shader, in, ucp_enables, &maxloc, false, + use_clipdist_array); nir_foreach_function(function, shader) { if (!strcmp(function->name, "main")) diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c mesa-20.0.8/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c --- mesa-19.2.8/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c 2020-06-12 01:21:16.000000000 +0000 @@ -72,8 +72,17 @@ cull = var; } - if (!cull && !clip) + if (!cull && !clip) { + /* If this is run after optimizations and the variables have been + * eliminated, we should update the shader info, because no other + * place does that. + */ + if (store_info) { + nir->info.clip_distance_array_size = 0; + nir->info.cull_distance_array_size = 0; + } return false; + } if (!cull && clip) { /* The GLSL IR lowering pass must have converted these to vectors */ diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_clip_halfz.c mesa-20.0.8/src/compiler/nir/nir_lower_clip_halfz.c --- mesa-19.2.8/src/compiler/nir/nir_lower_clip_halfz.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_clip_halfz.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,77 @@ +/* + * Copyright 2018-2019 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "nir_builder.h" + +static void +lower_pos_write(nir_builder *b, struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_deref) + return; + + nir_variable *var = nir_intrinsic_get_var(intr, 0); + if (var->data.mode != nir_var_shader_out || + var->data.location != VARYING_SLOT_POS) + return; + + b->cursor = nir_before_instr(&intr->instr); + + nir_ssa_def *pos = nir_ssa_for_src(b, intr->src[1], 4); + nir_ssa_def *def = nir_vec4(b, + nir_channel(b, pos, 0), + nir_channel(b, pos, 1), + nir_fmul_imm(b, + nir_fadd(b, + nir_channel(b, pos, 2), + nir_channel(b, pos, 3)), + 0.5), + nir_channel(b, pos, 3)); + nir_instr_rewrite_src(&intr->instr, intr->src + 1, nir_src_for_ssa(def)); +} + +void +nir_lower_clip_halfz(nir_shader *shader) +{ + if (shader->info.stage != MESA_SHADER_VERTEX) + return; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + lower_pos_write(&b, instr); + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_double_ops.c mesa-20.0.8/src/compiler/nir/nir_lower_double_ops.c --- mesa-19.2.8/src/compiler/nir/nir_lower_double_ops.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_double_ops.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,6 +26,8 @@ #include "nir_builder.h" #include "c99_math.h" +#include + /* * Lowers some unsupported double operations, using only: * @@ -289,9 +291,20 @@ * 0 -> 0 and * +inf -> +inf */ - res = nir_bcsel(b, nir_ior(b, nir_feq(b, src, nir_imm_double(b, 0.0)), + const bool preserve_denorms = + b->shader->info.float_controls_execution_mode & + FLOAT_CONTROLS_DENORM_PRESERVE_FP64; + nir_ssa_def *src_flushed = src; + if (!preserve_denorms) { + src_flushed = nir_bcsel(b, + nir_flt(b, nir_fabs(b, src), + nir_imm_double(b, DBL_MIN)), + nir_imm_double(b, 0.0), + src); + } + res = nir_bcsel(b, nir_ior(b, nir_feq(b, src_flushed, nir_imm_double(b, 0.0)), nir_feq(b, src, nir_imm_double(b, INFINITY))), - src, res); + src_flushed, res); } else { res = fix_inv_result(b, res, src, new_exp); } @@ -723,7 +736,14 @@ * inlining. */ nir_opt_deref_impl(impl); - } + } else if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } else { +#ifndef NDEBUG + impl->valid_metadata &= ~nir_metadata_not_properly_reset; +#endif + } return progress; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_drawpixels.c mesa-20.0.8/src/compiler/nir/nir_lower_drawpixels.c --- mesa-19.2.8/src/compiler/nir/nir_lower_drawpixels.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_drawpixels.c 2020-06-12 01:21:16.000000000 +0000 @@ -35,7 +35,7 @@ const nir_lower_drawpixels_options *options; nir_shader *shader; nir_builder b; - nir_variable *texcoord, *scale, *bias, *tex, *pixelmap; + nir_variable *texcoord, *texcoord_const, *scale, *bias, *tex, *pixelmap; } lower_drawpixels_state; static nir_ssa_def * @@ -104,11 +104,12 @@ static nir_ssa_def * get_texcoord_const(lower_drawpixels_state *state) { - if (state->bias == NULL) { - state->bias = create_uniform(state->shader, "gl_MultiTexCoord0", + if (state->texcoord_const == NULL) { + state->texcoord_const = create_uniform(state->shader, + "gl_MultiTexCoord0", state->options->texcoord_state_tokens); } - return nir_load_var(&state->b, state->bias); + return nir_load_var(&state->b, state->texcoord_const); } static void @@ -239,7 +240,9 @@ nir_foreach_instr_safe(instr, block) { if (instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic == nir_intrinsic_load_deref) { + + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: { nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); nir_variable *var = nir_deref_instr_get_variable(deref); @@ -252,6 +255,29 @@ assert(deref->deref_type == nir_deref_type_var); lower_texcoord(state, intr); } + break; + } + + case nir_intrinsic_load_color0: + lower_color(state, intr); + break; + + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_input: { + /* The intrinsic doesn't carry the variable. We need to find it + * manually. + */ + nir_foreach_variable(var, &state->b.shader->inputs) { + if ((var->data.driver_location == nir_intrinsic_base(intr)) && + (nir_intrinsic_component(intr) >= var->data.location_frac && + nir_intrinsic_component(intr) < + (var->data.location_frac + glsl_get_components(var->type)))) + lower_texcoord(state, intr); + } + break; + } + default: + break; } } } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_flatshade.c mesa-20.0.8/src/compiler/nir/nir_lower_flatshade.c --- mesa-19.2.8/src/compiler/nir/nir_lower_flatshade.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_flatshade.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,50 @@ +/* + * Copyright © 2015 Red Hat + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "nir.h" +#include "nir_builder.h" + +static bool +lower_input(nir_shader *shader, nir_variable *var) +{ + if (var->data.interpolation == INTERP_MODE_NONE && + (var->data.location == VARYING_SLOT_COL0 || + var->data.location == VARYING_SLOT_COL1 || + var->data.location == VARYING_SLOT_BFC0 || + var->data.location == VARYING_SLOT_BFC1)) + var->data.interpolation = INTERP_MODE_FLAT; + return true; +} + +bool +nir_lower_flatshade(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_variable(var, &shader->inputs) { + progress |= lower_input(shader, var); + } + + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_global_vars_to_local.c mesa-20.0.8/src/compiler/nir/nir_lower_global_vars_to_local.c --- mesa-19.2.8/src/compiler/nir/nir_lower_global_vars_to_local.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_global_vars_to_local.c 2020-06-12 01:21:16.000000000 +0000 @@ -83,8 +83,11 @@ } } - hash_table_foreach(var_func_table, entry) { - nir_variable *var = (void *)entry->key; + nir_foreach_variable_safe(var, &shader->globals) { + struct hash_entry *entry = _mesa_hash_table_search(var_func_table, var); + if (!entry) + continue; + nir_function_impl *impl = entry->data; assert(var->data.mode == nir_var_shader_temp); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_gs_intrinsics.c mesa-20.0.8/src/compiler/nir/nir_lower_gs_intrinsics.c --- mesa-19.2.8/src/compiler/nir/nir_lower_gs_intrinsics.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_gs_intrinsics.c 2020-06-12 01:21:16.000000000 +0000 @@ -23,6 +23,7 @@ #include "nir.h" #include "nir_builder.h" +#include "nir_xfb_info.h" /** * \file nir_lower_gs_intrinsics.c @@ -55,7 +56,7 @@ struct state { nir_builder *builder; - nir_variable *vertex_count_var; + nir_variable *vertex_count_vars[NIR_MAX_XFB_STREAMS]; bool progress; }; @@ -71,10 +72,11 @@ rewrite_emit_vertex(nir_intrinsic_instr *intrin, struct state *state) { nir_builder *b = state->builder; + unsigned stream = nir_intrinsic_stream_id(intrin); /* Load the vertex count */ b->cursor = nir_before_instr(&intrin->instr); - nir_ssa_def *count = nir_load_var(b, state->vertex_count_var); + nir_ssa_def *count = nir_load_var(b, state->vertex_count_vars[stream]); nir_ssa_def *max_vertices = nir_imm_int(b, b->shader->info.gs.vertices_out); @@ -89,12 +91,12 @@ nir_intrinsic_instr *lowered = nir_intrinsic_instr_create(b->shader, nir_intrinsic_emit_vertex_with_counter); - nir_intrinsic_set_stream_id(lowered, nir_intrinsic_stream_id(intrin)); + nir_intrinsic_set_stream_id(lowered, stream); lowered->src[0] = nir_src_for_ssa(count); nir_builder_instr_insert(b, &lowered->instr); /* Increment the vertex count by 1 */ - nir_store_var(b, state->vertex_count_var, + nir_store_var(b, state->vertex_count_vars[stream], nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */ @@ -112,14 +114,15 @@ rewrite_end_primitive(nir_intrinsic_instr *intrin, struct state *state) { nir_builder *b = state->builder; + unsigned stream = nir_intrinsic_stream_id(intrin); b->cursor = nir_before_instr(&intrin->instr); - nir_ssa_def *count = nir_load_var(b, state->vertex_count_var); + nir_ssa_def *count = nir_load_var(b, state->vertex_count_vars[stream]); nir_intrinsic_instr *lowered = nir_intrinsic_instr_create(b->shader, nir_intrinsic_end_primitive_with_counter); - nir_intrinsic_set_stream_id(lowered, nir_intrinsic_stream_id(intrin)); + nir_intrinsic_set_stream_id(lowered, stream); lowered->src[0] = nir_src_for_ssa(count); nir_builder_instr_insert(b, &lowered->instr); @@ -169,7 +172,7 @@ nir_block *pred = (nir_block *) entry->key; b->cursor = nir_after_block_before_jump(pred); - nir_ssa_def *count = nir_load_var(b, state->vertex_count_var); + nir_ssa_def *count = nir_load_var(b, state->vertex_count_vars[0]); nir_intrinsic_instr *set_vertex_count = nir_intrinsic_instr_create(shader, nir_intrinsic_set_vertex_count); @@ -180,7 +183,7 @@ } bool -nir_lower_gs_intrinsics(nir_shader *shader) +nir_lower_gs_intrinsics(nir_shader *shader, bool per_stream) { struct state state; state.progress = false; @@ -192,18 +195,29 @@ nir_builder_init(&b, impl); state.builder = &b; - /* Create the counter variable */ - state.vertex_count_var = - nir_local_variable_create(impl, glsl_uint_type(), "vertex_count"); - /* initialize to 0 */ + /* Create the counter variables */ b.cursor = nir_before_cf_list(&impl->body); - nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1); + unsigned num_counters = per_stream && shader->info.gs.uses_streams ? + NIR_MAX_XFB_STREAMS : 1; + for (unsigned i = 0; i < num_counters; i++) { + state.vertex_count_vars[i] = + nir_local_variable_create(impl, glsl_uint_type(), "vertex_count"); + /* initialize to 0 */ + nir_store_var(&b, state.vertex_count_vars[i], nir_imm_int(&b, 0), 0x1); + } + /* If per_stream is false, we only have one counter which we want to use + * for all streams. Duplicate the counter pointer so all streams use the + * same counter. + */ + for (unsigned i = num_counters; i < NIR_MAX_XFB_STREAMS; i++) + state.vertex_count_vars[i] = state.vertex_count_vars[0]; nir_foreach_block_safe(block, impl) rewrite_intrinsics(block, &state); /* This only works because we have a single main() function. */ - append_set_vertex_count(impl->end_block, &state); + if (!per_stream) + append_set_vertex_count(impl->end_block, &state); nir_metadata_preserve(impl, 0); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_idiv.c mesa-20.0.8/src/compiler/nir/nir_lower_idiv.c --- mesa-19.2.8/src/compiler/nir/nir_lower_idiv.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_idiv.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,13 +27,17 @@ #include "nir.h" #include "nir_builder.h" -/* Lowers idiv/udiv/umod - * Based on NV50LegalizeSSA::handleDIV() +/* Has two paths + * One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on + * NV50LegalizeSSA::handleDIV() * - * Note that this is probably not enough precision for compute shaders. - * Perhaps we want a second higher precision (looping) version of this? - * Or perhaps we assume if you can do compute shaders you can also - * branch out to a pre-optimized shader library routine.. + * Note that this path probably does not have not enough precision for + * compute shaders. Perhaps we want a second higher precision (looping) + * version of this? Or perhaps we assume if you can do compute shaders you + * can also branch out to a pre-optimized shader library routine.. + * + * The other path (nir_lower_idiv_precise) is based off of code used by LLVM's + * AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly. */ static bool @@ -130,8 +134,109 @@ return true; } +/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */ +static nir_ssa_def * +emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo) +{ + nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom)); + rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294967296.0)); + nir_ssa_def *rcp_lo = nir_imul(bld, rcp, denom); + nir_ssa_def *rcp_hi = nir_umul_high(bld, rcp, denom); + nir_ssa_def *rcp_hi_ne_zero = nir_ine(bld, rcp_hi, nir_imm_int(bld, 0)); + nir_ssa_def *neg_rcp_lo = nir_ineg(bld, rcp_lo); + nir_ssa_def *abs_rcp_lo = nir_bcsel(bld, rcp_hi_ne_zero, rcp_lo, neg_rcp_lo); + nir_ssa_def *e = nir_umul_high(bld, abs_rcp_lo, rcp); + nir_ssa_def *rcp_plus_e = nir_iadd(bld, rcp, e); + nir_ssa_def *rcp_minus_e = nir_isub(bld, rcp, e); + nir_ssa_def *tmp0 = nir_bcsel(bld, rcp_hi_ne_zero, rcp_minus_e, rcp_plus_e); + nir_ssa_def *quotient = nir_umul_high(bld, tmp0, numer); + nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom); + nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder); + nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom); + nir_ssa_def *remainder_ge_zero = nir_uge(bld, numer, num_s_remainder); + nir_ssa_def *tmp1 = nir_iand(bld, remainder_ge_den, remainder_ge_zero); + + if (modulo) { + nir_ssa_def *rem = nir_bcsel(bld, tmp1, + nir_isub(bld, remainder, denom), remainder); + return nir_bcsel(bld, remainder_ge_zero, + rem, nir_iadd(bld, remainder, denom)); + } else { + nir_ssa_def *one = nir_imm_int(bld, 1); + nir_ssa_def *div = nir_bcsel(bld, tmp1, + nir_iadd(bld, quotient, one), quotient); + return nir_bcsel(bld, remainder_ge_zero, + div, nir_isub(bld, quotient, one)); + } +} + +/* ported from LLVM's AMDGPUTargetLowering::LowerSDIVREM */ +static nir_ssa_def * +emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op) +{ + nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0)); + nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0)); + lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0)); + rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0)); + + nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign); + nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign); + lhs = nir_ixor(bld, lhs, lh_sign); + rhs = nir_ixor(bld, rhs, rh_sign); + + if (op == nir_op_idiv) { + nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign); + nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false); + res = nir_ixor(bld, res, d_sign); + return nir_isub(bld, res, d_sign); + } else { + nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true); + res = nir_ixor(bld, res, lh_sign); + res = nir_isub(bld, res, lh_sign); + if (op == nir_op_imod) { + nir_ssa_def *cond = nir_ieq(bld, res, nir_imm_int(bld, 0)); + cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond); + res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom)); + } + return res; + } +} + +static bool +convert_instr_precise(nir_builder *bld, nir_alu_instr *alu) +{ + nir_op op = alu->op; + + if ((op != nir_op_idiv) && + (op != nir_op_imod) && + (op != nir_op_irem) && + (op != nir_op_udiv) && + (op != nir_op_umod)) + return false; + + if (alu->dest.dest.ssa.bit_size != 32) + return false; + + bld->cursor = nir_before_instr(&alu->instr); + + nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0); + nir_ssa_def *denom = nir_ssa_for_alu_src(bld, alu, 1); + + nir_ssa_def *res = NULL; + + if (op == nir_op_udiv || op == nir_op_umod) + res = emit_udiv(bld, numer, denom, op == nir_op_umod); + else + res = emit_idiv(bld, numer, denom, op); + + assert(alu->dest.dest.is_ssa); + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res)); + + return true; +} + static bool -convert_impl(nir_function_impl *impl) +convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path) { nir_builder b; nir_builder_init(&b, impl); @@ -139,7 +244,9 @@ nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { - if (instr->type == nir_instr_type_alu) + if (instr->type == nir_instr_type_alu && path == nir_lower_idiv_precise) + progress |= convert_instr_precise(&b, nir_instr_as_alu(instr)); + else if (instr->type == nir_instr_type_alu) progress |= convert_instr(&b, nir_instr_as_alu(instr)); } } @@ -151,13 +258,13 @@ } bool -nir_lower_idiv(nir_shader *shader) +nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) - progress |= convert_impl(function->impl); + progress |= convert_impl(function->impl, path); } return progress; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_indirect_derefs.c mesa-20.0.8/src/compiler/nir/nir_lower_indirect_derefs.c --- mesa-19.2.8/src/compiler/nir/nir_lower_indirect_derefs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_indirect_derefs.c 2020-06-12 01:21:16.000000000 +0000 @@ -126,6 +126,7 @@ intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid && intrin->intrinsic != nir_intrinsic_interp_deref_at_sample && intrin->intrinsic != nir_intrinsic_interp_deref_at_offset && + intrin->intrinsic != nir_intrinsic_interp_deref_at_vertex && intrin->intrinsic != nir_intrinsic_store_deref) continue; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_input_attachments.c mesa-20.0.8/src/compiler/nir/nir_lower_input_attachments.c --- mesa-19.2.8/src/compiler/nir/nir_lower_input_attachments.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_input_attachments.c 2020-06-12 01:21:16.000000000 +0000 @@ -115,6 +115,8 @@ tex->src[3].src = load->src[2]; } + tex->texture_non_uniform = nir_intrinsic_access(load) & ACCESS_NON_UNIFORM; + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); nir_builder_instr_insert(&b, &tex->instr); @@ -124,6 +126,34 @@ return true; } +static bool +try_lower_input_texop(nir_function_impl *impl, nir_tex_instr *tex, + bool use_fragcoord_sysval) +{ + nir_deref_instr *deref = nir_src_as_deref(tex->src[0].src); + + if (glsl_get_sampler_dim(deref->type) != GLSL_SAMPLER_DIM_SUBPASS_MS) + return false; + + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_before_instr(&tex->instr); + + nir_ssa_def *frag_coord = use_fragcoord_sysval ? nir_load_frag_coord(&b) + : load_frag_coord(&b); + frag_coord = nir_f2i32(&b, frag_coord); + + nir_ssa_def *layer = nir_load_layer_id(&b); + nir_ssa_def *coord = nir_vec3(&b, nir_channel(&b, frag_coord, 0), + nir_channel(&b, frag_coord, 1), layer); + + tex->coord_components = 3; + + nir_instr_rewrite_src(&tex->instr, &tex->src[1].src, nir_src_for_ssa(coord)); + + return true; +} + bool nir_lower_input_attachments(nir_shader *shader, bool use_fragcoord_sysval) { @@ -136,16 +166,29 @@ nir_foreach_block(block, function->impl) { nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); - - if (load->intrinsic != nir_intrinsic_image_deref_load) - continue; - - progress |= try_lower_input_load(function->impl, load, - use_fragcoord_sysval); + switch (instr->type) { + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (tex->op == nir_texop_fragment_mask_fetch || + tex->op == nir_texop_fragment_fetch) { + progress |= try_lower_input_texop(function->impl, tex, + use_fragcoord_sysval); + } + break; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + + if (load->intrinsic == nir_intrinsic_image_deref_load) { + progress |= try_lower_input_load(function->impl, load, + use_fragcoord_sysval); + } + break; + } + default: + break; + } } } } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_int64.c mesa-20.0.8/src/compiler/nir/nir_lower_int64.c --- mesa-19.2.8/src/compiler/nir/nir_lower_int64.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_int64.c 2020-06-12 01:21:16.000000000 +0000 @@ -657,6 +657,19 @@ return lower_u2u64(b, extract32); } +static nir_ssa_def * +lower_ufind_msb64(nir_builder *b, nir_ssa_def *x) +{ + + nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); + nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); + nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo); + nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi); + nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0)); + nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count); + return nir_bcsel(b, valid_hi_bits, hi_res, lo_count); +} + nir_lower_int64_options nir_lower_int64_op_to_options_mask(nir_op opcode) { @@ -679,8 +692,12 @@ return nir_lower_divmod64; case nir_op_b2i64: case nir_op_i2b1: + case nir_op_i2i8: + case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: + case nir_op_u2u8: + case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: case nir_op_bcsel: @@ -718,6 +735,8 @@ case nir_op_extract_u16: case nir_op_extract_i16: return nir_lower_extract64; + case nir_op_ufind_msb: + return nir_lower_ufind_msb64; default: return 0; } @@ -819,6 +838,9 @@ case nir_op_extract_u16: case nir_op_extract_i16: return lower_extract(b, alu->op, src[0], src[1]); + case nir_op_ufind_msb: + return lower_ufind_msb64(b, src[0]); + break; default: unreachable("Invalid ALU opcode to lower"); } @@ -837,7 +859,11 @@ switch (alu->op) { case nir_op_i2b1: + case nir_op_i2i8: + case nir_op_i2i16: case nir_op_i2i32: + case nir_op_u2u8: + case nir_op_u2u16: case nir_op_u2u32: assert(alu->src[0].src.is_ssa); if (alu->src[0].src.ssa->bit_size != 64) @@ -864,6 +890,11 @@ if (alu->src[0].src.ssa->bit_size != 64) return false; break; + case nir_op_ufind_msb: + assert(alu->src[0].src.is_ssa); + if (alu->src[0].src.ssa->bit_size != 64) + return false; + break; default: assert(alu->dest.dest.is_ssa); if (alu->dest.dest.ssa.bit_size != 64) diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_io_arrays_to_elements.c mesa-20.0.8/src/compiler/nir/nir_lower_io_arrays_to_elements.c --- mesa-19.2.8/src/compiler/nir/nir_lower_io_arrays_to_elements.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_io_arrays_to_elements.c 2020-06-12 01:21:16.000000000 +0000 @@ -167,7 +167,8 @@ intr->num_components, intr->dest.ssa.bit_size, NULL); if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset || - intr->intrinsic == nir_intrinsic_interp_deref_at_sample) { + intr->intrinsic == nir_intrinsic_interp_deref_at_sample || + intr->intrinsic == nir_intrinsic_interp_deref_at_vertex) { nir_src_copy(&element_intr->src[1], &intr->src[1], &element_intr->instr); } @@ -212,8 +213,8 @@ * indirect indexing. */ static void -create_indirects_mask(nir_shader *shader, uint64_t *indirects, - uint64_t *patch_indirects, nir_variable_mode mode) +create_indirects_mask(nir_shader *shader, + BITSET_WORD *indirects, nir_variable_mode mode) { nir_foreach_function(function, shader) { if (function->impl) { @@ -232,7 +233,8 @@ intr->intrinsic != nir_intrinsic_store_deref && intr->intrinsic != nir_intrinsic_interp_deref_at_centroid && intr->intrinsic != nir_intrinsic_interp_deref_at_sample && - intr->intrinsic != nir_intrinsic_interp_deref_at_offset) + intr->intrinsic != nir_intrinsic_interp_deref_at_offset && + intr->intrinsic != nir_intrinsic_interp_deref_at_vertex) continue; nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); @@ -244,14 +246,9 @@ nir_deref_path path; nir_deref_path_init(&path, deref, NULL); - uint64_t loc_mask = ((uint64_t)1) << var->data.location; - if (var->data.patch) { - if (deref_has_indirect(&b, var, &path)) - patch_indirects[var->data.location_frac] |= loc_mask; - } else { - if (deref_has_indirect(&b, var, &path)) - indirects[var->data.location_frac] |= loc_mask; - } + int loc = var->data.location * 4 + var->data.location_frac; + if (deref_has_indirect(&b, var, &path)) + BITSET_SET(indirects, loc); nir_deref_path_finish(&path); } @@ -262,7 +259,7 @@ static void lower_io_arrays_to_elements(nir_shader *shader, nir_variable_mode mask, - uint64_t *indirects, uint64_t *patch_indirects, + BITSET_WORD *indirects, struct hash_table *varyings, bool after_cross_stage_opts) { @@ -282,7 +279,8 @@ intr->intrinsic != nir_intrinsic_store_deref && intr->intrinsic != nir_intrinsic_interp_deref_at_centroid && intr->intrinsic != nir_intrinsic_interp_deref_at_sample && - intr->intrinsic != nir_intrinsic_interp_deref_at_offset) + intr->intrinsic != nir_intrinsic_interp_deref_at_offset && + intr->intrinsic != nir_intrinsic_interp_deref_at_vertex) continue; nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); @@ -296,14 +294,9 @@ continue; /* Skip indirects */ - uint64_t loc_mask = ((uint64_t)1) << var->data.location; - if (var->data.patch) { - if (patch_indirects[var->data.location_frac] & loc_mask) - continue; - } else { - if (indirects[var->data.location_frac] & loc_mask) - continue; - } + int loc = var->data.location * 4 + var->data.location_frac; + if (BITSET_TEST(indirects, loc)) + continue; nir_variable_mode mode = var->data.mode; @@ -337,6 +330,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: case nir_intrinsic_load_deref: case nir_intrinsic_store_deref: if ((mask & nir_var_shader_in && mode == nir_var_shader_in) || @@ -359,14 +353,14 @@ struct hash_table *split_inputs = _mesa_pointer_hash_table_create(NULL); struct hash_table *split_outputs = _mesa_pointer_hash_table_create(NULL); - uint64_t indirects[4] = {0}, patch_indirects[4] = {0}; + BITSET_DECLARE(indirects, 4 * VARYING_SLOT_TESS_MAX) = {0}; - lower_io_arrays_to_elements(shader, nir_var_shader_out, indirects, - patch_indirects, split_outputs, true); + lower_io_arrays_to_elements(shader, nir_var_shader_out, + indirects, split_outputs, true); if (!outputs_only) { - lower_io_arrays_to_elements(shader, nir_var_shader_in, indirects, - patch_indirects, split_inputs, true); + lower_io_arrays_to_elements(shader, nir_var_shader_in, + indirects, split_inputs, true); /* Remove old input from the shaders inputs list */ hash_table_foreach(split_inputs, entry) { @@ -397,17 +391,16 @@ struct hash_table *split_inputs = _mesa_pointer_hash_table_create(NULL); struct hash_table *split_outputs = _mesa_pointer_hash_table_create(NULL); - uint64_t indirects[4] = {0}, patch_indirects[4] = {0}; - create_indirects_mask(producer, indirects, patch_indirects, - nir_var_shader_out); - create_indirects_mask(consumer, indirects, patch_indirects, - nir_var_shader_in); + BITSET_DECLARE(indirects, 4 * VARYING_SLOT_TESS_MAX) = {0}; + + create_indirects_mask(producer, indirects, nir_var_shader_out); + create_indirects_mask(consumer, indirects, nir_var_shader_in); - lower_io_arrays_to_elements(producer, nir_var_shader_out, indirects, - patch_indirects, split_outputs, false); + lower_io_arrays_to_elements(producer, nir_var_shader_out, + indirects, split_outputs, false); - lower_io_arrays_to_elements(consumer, nir_var_shader_in, indirects, - patch_indirects, split_inputs, false); + lower_io_arrays_to_elements(consumer, nir_var_shader_in, + indirects, split_inputs, false); /* Remove old input from the shaders inputs list */ hash_table_foreach(split_inputs, entry) { diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_io.c mesa-20.0.8/src/compiler/nir/nir_lower_io.c --- mesa-19.2.8/src/compiler/nir/nir_lower_io.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_io.c 2020-06-12 01:21:16.000000000 +0000 @@ -206,7 +206,7 @@ unsigned size = type_size((*p)->type, bts); nir_ssa_def *mul = - nir_imul_imm(b, nir_ssa_for_src(b, (*p)->arr.index, 1), size); + nir_amul_imm(b, nir_ssa_for_src(b, (*p)->arr.index, 1), size); offset = nir_iadd(b, offset, mul); } else if ((*p)->deref_type == nir_deref_type_struct) { @@ -245,20 +245,25 @@ if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->options->use_interpolated_input_intrinsics && var->data.interpolation != INTERP_MODE_FLAT) { - assert(vertex_index == NULL); - - nir_intrinsic_op bary_op; - if (var->data.sample || - (state->options & nir_lower_io_force_sample_interpolation)) - bary_op = nir_intrinsic_load_barycentric_sample; - else if (var->data.centroid) - bary_op = nir_intrinsic_load_barycentric_centroid; - else - bary_op = nir_intrinsic_load_barycentric_pixel; - - barycentric = nir_load_barycentric(&state->builder, bary_op, - var->data.interpolation); - op = nir_intrinsic_load_interpolated_input; + if (var->data.interpolation == INTERP_MODE_EXPLICIT) { + assert(vertex_index != NULL); + op = nir_intrinsic_load_input_vertex; + } else { + assert(vertex_index == NULL); + + nir_intrinsic_op bary_op; + if (var->data.sample || + (state->options & nir_lower_io_force_sample_interpolation)) + bary_op = nir_intrinsic_load_barycentric_sample; + else if (var->data.centroid) + bary_op = nir_intrinsic_load_barycentric_centroid; + else + bary_op = nir_intrinsic_load_barycentric_pixel; + + barycentric = nir_load_barycentric(&state->builder, bary_op, + var->data.interpolation); + op = nir_intrinsic_load_interpolated_input; + } } else { op = vertex_index ? nir_intrinsic_load_per_vertex_input : nir_intrinsic_load_input; @@ -291,6 +296,7 @@ state->type_size(var->type, var->data.bindless)); if (load->intrinsic == nir_intrinsic_load_input || + load->intrinsic == nir_intrinsic_load_input_vertex || load->intrinsic == nir_intrinsic_load_uniform) nir_intrinsic_set_type(load, type); @@ -489,9 +495,20 @@ nir_builder *b = &state->builder; assert(var->data.mode == nir_var_shader_in); - /* Ignore interpolateAt() for flat variables - flat is flat. */ - if (var->data.interpolation == INTERP_MODE_FLAT) - return lower_load(intrin, state, NULL, var, offset, component, type); + /* Ignore interpolateAt() for flat variables - flat is flat. Lower + * interpolateAtVertex() for explicit variables. + */ + if (var->data.interpolation == INTERP_MODE_FLAT || + var->data.interpolation == INTERP_MODE_EXPLICIT) { + nir_ssa_def *vertex_index = NULL; + + if (var->data.interpolation == INTERP_MODE_EXPLICIT) { + assert(intrin->intrinsic == nir_intrinsic_interp_deref_at_vertex); + vertex_index = intrin->src[1].ssa; + } + + return lower_load(intrin, state, vertex_index, var, offset, component, type); + } /* None of the supported APIs allow interpolation on 64-bit things */ assert(intrin->dest.is_ssa && intrin->dest.ssa.bit_size <= 32); @@ -520,7 +537,8 @@ nir_intrinsic_set_interp_mode(bary_setup, var->data.interpolation); if (intrin->intrinsic == nir_intrinsic_interp_deref_at_sample || - intrin->intrinsic == nir_intrinsic_interp_deref_at_offset) + intrin->intrinsic == nir_intrinsic_interp_deref_at_offset || + intrin->intrinsic == nir_intrinsic_interp_deref_at_vertex) nir_src_copy(&bary_setup->src[0], &intrin->src[1], bary_setup); nir_builder_instr_insert(b, &bary_setup->instr); @@ -581,6 +599,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: /* We can optionally lower these to load_interpolated_input */ if (options->use_interpolated_input_intrinsics) break; @@ -653,6 +672,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: assert(vertex_index == NULL); replacement = lower_interpolate_at(intrin, state, var, offset, component_offset, deref->type); @@ -876,7 +896,7 @@ load->src[1] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); } - if (mode != nir_var_mem_ubo && mode != nir_var_shader_in && mode != nir_var_mem_shared) + if (mode != nir_var_shader_in && mode != nir_var_mem_shared) nir_intrinsic_set_access(load, nir_intrinsic_access(intrin)); unsigned bit_size = intrin->dest.ssa.bit_size; @@ -1094,7 +1114,7 @@ nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); index = nir_i2i(b, index, base_addr->bit_size); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_ptr_as_array: { @@ -1102,7 +1122,7 @@ index = nir_i2i(b, index, base_addr->bit_size); unsigned stride = nir_deref_instr_ptr_as_array_stride(deref); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_array_wildcard: @@ -1191,8 +1211,8 @@ * one deref which could break our list walking since we walk the list * backwards. */ - assert(list_empty(&deref->dest.ssa.if_uses)); - if (list_empty(&deref->dest.ssa.uses)) { + assert(list_is_empty(&deref->dest.ssa.if_uses)); + if (list_is_empty(&deref->dest.ssa.uses)) { nir_instr_remove(&deref->instr); return; } @@ -1432,7 +1452,8 @@ * - compact shader inputs/outputs * - interface types */ - nir_variable_mode supported = nir_var_mem_shared | nir_var_shader_temp | nir_var_function_temp; + ASSERTED nir_variable_mode supported = nir_var_mem_shared | + nir_var_shader_temp | nir_var_function_temp; assert(!(modes & ~supported) && "unsupported"); bool progress = false; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_io_to_scalar.c mesa-20.0.8/src/compiler/nir/nir_lower_io_to_scalar.c --- mesa-19.2.8/src/compiler/nir/nir_lower_io_to_scalar.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_io_to_scalar.c 2020-06-12 01:21:16.000000000 +0000 @@ -217,7 +217,8 @@ chan_intr->src[0] = nir_src_for_ssa(&deref->dest.ssa); if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset || - intr->intrinsic == nir_intrinsic_interp_deref_at_sample) + intr->intrinsic == nir_intrinsic_interp_deref_at_sample || + intr->intrinsic == nir_intrinsic_interp_deref_at_vertex) nir_src_copy(&chan_intr->src[1], &intr->src[1], &chan_intr->instr); nir_builder_instr_insert(b, &chan_intr->instr); @@ -311,7 +312,8 @@ intr->intrinsic != nir_intrinsic_store_deref && intr->intrinsic != nir_intrinsic_interp_deref_at_centroid && intr->intrinsic != nir_intrinsic_interp_deref_at_sample && - intr->intrinsic != nir_intrinsic_interp_deref_at_offset) + intr->intrinsic != nir_intrinsic_interp_deref_at_offset && + intr->intrinsic != nir_intrinsic_interp_deref_at_vertex) continue; nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); @@ -350,6 +352,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: case nir_intrinsic_load_deref: if ((mask & nir_var_shader_in && mode == nir_var_shader_in) || (mask & nir_var_shader_out && mode == nir_var_shader_out)) diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_io_to_temporaries.c mesa-20.0.8/src/compiler/nir/nir_lower_io_to_temporaries.c --- mesa-19.2.8/src/compiler/nir/nir_lower_io_to_temporaries.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_io_to_temporaries.c 2020-06-12 01:21:16.000000000 +0000 @@ -199,7 +199,8 @@ new_interp->src[0] = nir_src_for_ssa(&new_interp_deref->dest.ssa); if (interp->intrinsic == nir_intrinsic_interp_deref_at_sample || - interp->intrinsic == nir_intrinsic_interp_deref_at_offset) { + interp->intrinsic == nir_intrinsic_interp_deref_at_offset || + interp->intrinsic == nir_intrinsic_interp_deref_at_vertex) { new_interp->src[1] = interp->src[1]; } @@ -262,7 +263,8 @@ if (interp->intrinsic == nir_intrinsic_interp_deref_at_centroid || interp->intrinsic == nir_intrinsic_interp_deref_at_sample || - interp->intrinsic == nir_intrinsic_interp_deref_at_offset) { + interp->intrinsic == nir_intrinsic_interp_deref_at_offset || + interp->intrinsic == nir_intrinsic_interp_deref_at_vertex) { fixup_interpolation_instr(state, interp, b); } } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_io_to_vector.c mesa-20.0.8/src/compiler/nir/nir_lower_io_to_vector.c --- mesa-19.2.8/src/compiler/nir/nir_lower_io_to_vector.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_io_to_vector.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,35 @@ * when all is said and done. */ +/* FRAG_RESULT_MAX+1 instead of just FRAG_RESULT_MAX because of how this pass + * handles dual source blending */ +#define MAX_SLOTS MAX2(VARYING_SLOT_TESS_MAX, FRAG_RESULT_MAX+1) + +static unsigned +get_slot(const nir_variable *var) +{ + /* This handling of dual-source blending might not be correct when more than + * one render target is supported, but it seems no driver supports more than + * one. */ + return var->data.location + var->data.index; +} + +static const struct glsl_type * +get_per_vertex_type(const nir_shader *shader, const nir_variable *var, + unsigned *num_vertices) +{ + if (nir_is_per_vertex_io(var, shader->info.stage)) { + assert(glsl_type_is_array(var->type)); + if (num_vertices) + *num_vertices = glsl_get_length(var->type); + return glsl_get_array_element(var->type); + } else { + if (num_vertices) + *num_vertices = 0; + return var->type; + } +} + static const struct glsl_type * resize_array_vec_type(const struct glsl_type *type, unsigned num_components) { @@ -48,47 +77,48 @@ } static bool -variable_can_rewrite(const nir_variable *var) +variables_can_merge(const nir_shader *shader, + const nir_variable *a, const nir_variable *b, + bool same_array_structure) { - /* Only touch user defined varyings as these are the only ones we split */ - if (var->data.location < VARYING_SLOT_VAR0) + if (a->data.compact || b->data.compact) return false; - /* Skip complex types we don't split in the first place */ - if (!glsl_type_is_vector_or_scalar(glsl_without_array(var->type))) - return false; - - /* TODO: add 64/16bit support ? */ - if (glsl_get_bit_size(glsl_without_array(var->type)) != 32) - return false; - - return true; -} - -static bool -variables_can_merge(nir_shader *shader, - const nir_variable *a, const nir_variable *b) -{ const struct glsl_type *a_type_tail = a->type; const struct glsl_type *b_type_tail = b->type; + if (nir_is_per_vertex_io(a, shader->info.stage) != + nir_is_per_vertex_io(b, shader->info.stage)) + return false; + /* They must have the same array structure */ - while (glsl_type_is_array(a_type_tail)) { - if (!glsl_type_is_array(b_type_tail)) - return false; + if (same_array_structure) { + while (glsl_type_is_array(a_type_tail)) { + if (!glsl_type_is_array(b_type_tail)) + return false; - if (glsl_get_length(a_type_tail) != glsl_get_length(b_type_tail)) - return false; + if (glsl_get_length(a_type_tail) != glsl_get_length(b_type_tail)) + return false; - a_type_tail = glsl_get_array_element(a_type_tail); - b_type_tail = glsl_get_array_element(b_type_tail); + a_type_tail = glsl_get_array_element(a_type_tail); + b_type_tail = glsl_get_array_element(b_type_tail); + } + if (glsl_type_is_array(b_type_tail)) + return false; + } else { + a_type_tail = glsl_without_array(a_type_tail); + b_type_tail = glsl_without_array(b_type_tail); } if (!glsl_type_is_vector_or_scalar(a_type_tail) || !glsl_type_is_vector_or_scalar(b_type_tail)) return false; - if (glsl_get_base_type(a->type) != glsl_get_base_type(b->type)) + if (glsl_get_base_type(a_type_tail) != glsl_get_base_type(b_type_tail)) + return false; + + /* TODO: add 64/16bit support ? */ + if (glsl_get_bit_size(a_type_tail) != 32) return false; assert(a->data.mode == b->data.mode); @@ -97,31 +127,87 @@ a->data.interpolation != b->data.interpolation) return false; + if (shader->info.stage == MESA_SHADER_FRAGMENT && + a->data.mode == nir_var_shader_out && + a->data.index != b->data.index) + return false; + return true; } +static const struct glsl_type * +get_flat_type(const nir_shader *shader, nir_variable *old_vars[MAX_SLOTS][4], + unsigned *loc, nir_variable **first_var, unsigned *num_vertices) +{ + unsigned todo = 1; + unsigned slots = 0; + unsigned num_vars = 0; + enum glsl_base_type base; + *num_vertices = 0; + *first_var = NULL; + + while (todo) { + assert(*loc < MAX_SLOTS); + for (unsigned frac = 0; frac < 4; frac++) { + nir_variable *var = old_vars[*loc][frac]; + if (!var) + continue; + if ((*first_var && + !variables_can_merge(shader, var, *first_var, false)) || + var->data.compact) { + (*loc)++; + return NULL; + } + + if (!*first_var) { + if (!glsl_type_is_vector_or_scalar(glsl_without_array(var->type))) { + (*loc)++; + return NULL; + } + *first_var = var; + base = glsl_get_base_type( + glsl_without_array(get_per_vertex_type(shader, var, NULL))); + } + + bool vs_in = shader->info.stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in; + unsigned var_slots = glsl_count_attribute_slots( + get_per_vertex_type(shader, var, num_vertices), vs_in); + todo = MAX2(todo, var_slots); + num_vars++; + } + todo--; + slots++; + (*loc)++; + } + + if (num_vars <= 1) + return NULL; + + if (slots == 1) + return glsl_vector_type(base, 4); + else + return glsl_array_type(glsl_vector_type(base, 4), slots, 0); +} + static bool create_new_io_vars(nir_shader *shader, struct exec_list *io_list, - nir_variable *old_vars[MAX_VARYINGS_INCL_PATCH][4], - nir_variable *new_vars[MAX_VARYINGS_INCL_PATCH][4]) + nir_variable *new_vars[MAX_SLOTS][4], + bool flat_vars[MAX_SLOTS]) { if (exec_list_is_empty(io_list)) return false; + nir_variable *old_vars[MAX_SLOTS][4] = {{0}}; + nir_foreach_variable(var, io_list) { - if (variable_can_rewrite(var)) { - unsigned loc = var->data.location - VARYING_SLOT_VAR0; - unsigned frac = var->data.location_frac; - old_vars[loc][frac] = var; - } + unsigned frac = var->data.location_frac; + old_vars[get_slot(var)][frac] = var; } bool merged_any_vars = false; - /* We don't handle combining vars of different type e.g. different array - * lengths. - */ - for (unsigned loc = 0; loc < MAX_VARYINGS_INCL_PATCH; loc++) { + for (unsigned loc = 0; loc < MAX_SLOTS; loc++) { unsigned frac = 0; while (frac < 4) { nir_variable *first_var = old_vars[loc][frac]; @@ -139,7 +225,7 @@ break; if (var != first_var) { - if (!variables_can_merge(shader, first_var, var)) + if (!variables_can_merge(shader, first_var, var, true)) break; found_merge = true; @@ -147,6 +233,11 @@ const unsigned num_components = glsl_get_components(glsl_without_array(var->type)); + if (!num_components) { + assert(frac == 0); + frac++; + break; /* The type was a struct. */ + } /* We had better not have any overlapping vars */ for (unsigned i = 1; i < num_components; i++) @@ -165,9 +256,42 @@ var->type = resize_array_vec_type(var->type, frac - first); nir_shader_add_variable(shader, var); - for (unsigned i = first; i < frac; i++) + for (unsigned i = first; i < frac; i++) { new_vars[loc][i] = var; + old_vars[loc][i] = NULL; + } + + old_vars[loc][first] = var; + } + } + + /* "flat" mode: tries to ensure there is at most one variable per slot by + * merging variables into vec4s + */ + for (unsigned loc = 0; loc < MAX_SLOTS;) { + nir_variable *first_var; + unsigned num_vertices; + unsigned new_loc = loc; + const struct glsl_type *flat_type = + get_flat_type(shader, old_vars, &new_loc, &first_var, &num_vertices); + if (flat_type) { + merged_any_vars = true; + + nir_variable *var = nir_variable_clone(first_var, shader); + var->data.location_frac = 0; + if (num_vertices) + var->type = glsl_array_type(flat_type, num_vertices, 0); + else + var->type = flat_type; + + nir_shader_add_variable(shader, var); + for (unsigned i = 0; i < glsl_get_length(flat_type); i++) { + for (unsigned j = 0; j < 4; j++) + new_vars[loc + i][j] = var; + flat_vars[loc + i] = true; + } } + loc = new_loc; } return merged_any_vars; @@ -186,6 +310,48 @@ return nir_build_deref_follower(b, parent, leader); } +static nir_ssa_def * +build_array_index(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *base, + bool vs_in) +{ + switch (deref->deref_type) { + case nir_deref_type_var: + return base; + case nir_deref_type_array: { + nir_ssa_def *index = nir_i2i(b, deref->arr.index.ssa, + deref->dest.ssa.bit_size); + return nir_iadd( + b, build_array_index(b, nir_deref_instr_parent(deref), base, vs_in), + nir_amul_imm(b, index, glsl_count_attribute_slots(deref->type, vs_in))); + } + default: + unreachable("Invalid deref instruction type"); + } +} + +static nir_deref_instr * +build_array_deref_of_new_var_flat(nir_shader *shader, + nir_builder *b, nir_variable *new_var, + nir_deref_instr *leader, unsigned base) +{ + nir_deref_instr *deref = nir_build_deref_var(b, new_var); + + if (nir_is_per_vertex_io(new_var, shader->info.stage)) { + assert(leader->deref_type == nir_deref_type_array); + nir_ssa_def *index = leader->arr.index.ssa; + leader = nir_deref_instr_parent(leader); + deref = nir_build_deref_array(b, deref, index); + } + + if (!glsl_type_is_array(deref->type)) + return deref; + + bool vs_in = shader->info.stage == MESA_SHADER_VERTEX && + new_var->data.mode == nir_var_shader_in; + return nir_build_deref_array( + b, deref, build_array_index(b, leader, nir_imm_int(b, base), vs_in)); +} + static bool nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes) { @@ -197,10 +363,10 @@ nir_metadata_require(impl, nir_metadata_dominance); nir_shader *shader = impl->function->shader; - nir_variable *old_inputs[MAX_VARYINGS_INCL_PATCH][4] = {{0}}; - nir_variable *new_inputs[MAX_VARYINGS_INCL_PATCH][4] = {{0}}; - nir_variable *old_outputs[MAX_VARYINGS_INCL_PATCH][4] = {{0}}; - nir_variable *new_outputs[MAX_VARYINGS_INCL_PATCH][4] = {{0}}; + nir_variable *new_inputs[MAX_SLOTS][4] = {{0}}; + nir_variable *new_outputs[MAX_SLOTS][4] = {{0}}; + bool flat_inputs[MAX_SLOTS] = {0}; + bool flat_outputs[MAX_SLOTS] = {0}; if (modes & nir_var_shader_in) { /* Vertex shaders support overlapping inputs. We don't do those */ @@ -210,21 +376,16 @@ * so we don't bother doing extra non-work. */ if (!create_new_io_vars(shader, &shader->inputs, - old_inputs, new_inputs)) + new_inputs, flat_inputs)) modes &= ~nir_var_shader_in; } if (modes & nir_var_shader_out) { - /* Fragment shader outputs are always vec4. You shouldn't have - * scalarized them and it doesn't make sense to vectorize them. - */ - assert(b.shader->info.stage != MESA_SHADER_FRAGMENT); - /* If we don't actually merge any variables, remove that bit from modes * so we don't bother doing extra non-work. */ if (!create_new_io_vars(shader, &shader->outputs, - old_outputs, new_outputs)) + new_outputs, flat_outputs)) modes &= ~nir_var_shader_out; } @@ -250,27 +411,28 @@ case nir_intrinsic_load_deref: case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: - case nir_intrinsic_interp_deref_at_offset: { + case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: { nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); if (!(old_deref->mode & modes)) break; if (old_deref->mode == nir_var_shader_out) - assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL); + assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL || + b.shader->info.stage == MESA_SHADER_FRAGMENT); nir_variable *old_var = nir_deref_instr_get_variable(old_deref); - if (old_var->data.location < VARYING_SLOT_VAR0) - break; - const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned loc = get_slot(old_var); const unsigned old_frac = old_var->data.location_frac; nir_variable *new_var = old_deref->mode == nir_var_shader_in ? new_inputs[loc][old_frac] : new_outputs[loc][old_frac]; + bool flat = old_deref->mode == nir_var_shader_in ? + flat_inputs[loc] : flat_outputs[loc]; if (!new_var) break; - assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); const unsigned new_frac = new_var->data.location_frac; nir_component_mask_t vec4_comp_mask = @@ -281,9 +443,15 @@ /* Rewrite the load to use the new variable and only select a * portion of the result. */ - nir_deref_instr *new_deref = - build_array_deref_of_new_var(&b, new_var, old_deref); - assert(glsl_type_is_vector(new_deref->type)); + nir_deref_instr *new_deref; + if (flat) { + new_deref = build_array_deref_of_new_var_flat( + shader, &b, new_var, old_deref, loc - get_slot(new_var)); + } else { + assert(get_slot(new_var) == loc); + new_deref = build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + } nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(&new_deref->dest.ssa)); @@ -309,24 +477,28 @@ break; nir_variable *old_var = nir_deref_instr_get_variable(old_deref); - if (old_var->data.location < VARYING_SLOT_VAR0) - break; - const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned loc = get_slot(old_var); const unsigned old_frac = old_var->data.location_frac; nir_variable *new_var = new_outputs[loc][old_frac]; + bool flat = flat_outputs[loc]; if (!new_var) break; - assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); const unsigned new_frac = new_var->data.location_frac; b.cursor = nir_before_instr(&intrin->instr); /* Rewrite the store to be a masked store to the new variable */ - nir_deref_instr *new_deref = - build_array_deref_of_new_var(&b, new_var, old_deref); - assert(glsl_type_is_vector(new_deref->type)); + nir_deref_instr *new_deref; + if (flat) { + new_deref = build_array_deref_of_new_var_flat( + shader, &b, new_var, old_deref, loc - get_slot(new_var)); + } else { + assert(get_slot(new_var) == loc); + new_deref = build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + } nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(&new_deref->dest.ssa)); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_non_uniform_access.c mesa-20.0.8/src/compiler/nir/nir_lower_non_uniform_access.c --- mesa-19.2.8/src/compiler/nir/nir_lower_non_uniform_access.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_non_uniform_access.c 2020-06-12 01:21:16.000000000 +0000 @@ -46,17 +46,22 @@ /* We can have at most one texture and one sampler handle */ nir_ssa_def *handles[2]; + nir_deref_instr *parent_derefs[2]; + int texture_deref_handle = -1; + int sampler_deref_handle = -1; unsigned handle_count = 0; for (unsigned i = 0; i < tex->num_srcs; i++) { switch (tex->src[i].src_type) { case nir_tex_src_texture_offset: case nir_tex_src_texture_handle: + case nir_tex_src_texture_deref: if (!tex->texture_non_uniform) continue; break; case nir_tex_src_sampler_offset: case nir_tex_src_sampler_handle: + case nir_tex_src_sampler_deref: if (!tex->sampler_non_uniform) continue; break; @@ -65,10 +70,33 @@ continue; } - assert(tex->src[i].src.is_ssa); - assert(tex->src[i].src.ssa->num_components == 1); assert(handle_count < 2); - handles[handle_count++] = tex->src[i].src.ssa; + assert(tex->src[i].src.is_ssa); + nir_ssa_def *handle = tex->src[i].src.ssa; + if (handle->parent_instr->type == nir_instr_type_deref) { + nir_deref_instr *deref = nir_instr_as_deref(handle->parent_instr); + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (deref->deref_type == nir_deref_type_var) + continue; + + assert(parent->deref_type == nir_deref_type_var); + assert(deref->deref_type == nir_deref_type_array); + + /* If it's constant, it's automatically uniform; don't bother. */ + if (nir_src_is_const(deref->arr.index)) + continue; + + handle = deref->arr.index.ssa; + + parent_derefs[handle_count] = parent; + if (tex->src[i].src_type == nir_tex_src_texture_deref) + texture_deref_handle = handle_count; + else + sampler_deref_handle = handle_count; + } + assert(handle->num_components == 1); + + handles[handle_count++] = handle; } if (handle_count == 0) @@ -79,14 +107,30 @@ nir_push_loop(b); nir_ssa_def *all_equal_first = nir_imm_true(b); + nir_ssa_def *first[2]; for (unsigned i = 0; i < handle_count; i++) { - nir_ssa_def *equal_first = - nir_ieq(b, read_first_invocation(b, handles[i]), handles[i]); + first[i] = read_first_invocation(b, handles[i]); + nir_ssa_def *equal_first = nir_ieq(b, first[i], handles[i]); all_equal_first = nir_iand(b, all_equal_first, equal_first); } nir_push_if(b, all_equal_first); + /* Replicate the derefs. */ + if (texture_deref_handle >= 0) { + int src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + nir_deref_instr *deref = parent_derefs[texture_deref_handle]; + deref = nir_build_deref_array(b, deref, first[texture_deref_handle]); + tex->src[src_idx].src = nir_src_for_ssa(&deref->dest.ssa); + } + + if (sampler_deref_handle >= 0) { + int src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); + nir_deref_instr *deref = parent_derefs[sampler_deref_handle]; + deref = nir_build_deref_array(b, deref, first[sampler_deref_handle]); + tex->src[src_idx].src = nir_src_for_ssa(&deref->dest.ssa); + } + nir_builder_instr_insert(b, &tex->instr); nir_jump(b, nir_jump_break); @@ -100,19 +144,39 @@ if (!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) return false; + assert(intrin->src[handle_src].is_ssa); + nir_ssa_def *handle = intrin->src[handle_src].ssa; + nir_deref_instr *parent_deref = NULL; + if (handle->parent_instr->type == nir_instr_type_deref) { + nir_deref_instr *deref = nir_instr_as_deref(handle->parent_instr); + parent_deref = nir_deref_instr_parent(deref); + if (deref->deref_type == nir_deref_type_var) + return false; + + assert(parent_deref->deref_type == nir_deref_type_var); + assert(deref->deref_type == nir_deref_type_array); + + handle = deref->arr.index.ssa; + } + /* If it's constant, it's automatically uniform; don't bother. */ - if (nir_src_is_const(intrin->src[handle_src])) + if (handle->parent_instr->type == nir_instr_type_load_const) return false; b->cursor = nir_instr_remove(&intrin->instr); nir_push_loop(b); - assert(intrin->src[handle_src].is_ssa); - assert(intrin->src[handle_src].ssa->num_components == 1); - nir_ssa_def *handle = intrin->src[handle_src].ssa; + assert(handle->num_components == 1); + + nir_ssa_def *first = read_first_invocation(b, handle); + nir_push_if(b, nir_ieq(b, first, handle)); - nir_push_if(b, nir_ieq(b, read_first_invocation(b, handle), handle)); + /* Replicate the deref. */ + if (parent_deref) { + nir_deref_instr *deref = nir_build_deref_array(b, parent_deref, first); + intrin->src[handle_src] = nir_src_for_ssa(&deref->dest.ssa); + } nir_builder_instr_insert(b, &intrin->instr); nir_jump(b, nir_jump_break); @@ -179,8 +243,10 @@ case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -192,8 +258,10 @@ case nir_intrinsic_bindless_image_load: case nir_intrinsic_bindless_image_store: case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: @@ -202,6 +270,20 @@ case nir_intrinsic_bindless_image_atomic_fadd: case nir_intrinsic_bindless_image_size: case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: if ((types & nir_lower_non_uniform_image_access) && lower_non_uniform_access_intrin(&b, intrin, 0)) progress = true; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_passthrough_edgeflags.c mesa-20.0.8/src/compiler/nir/nir_lower_passthrough_edgeflags.c --- mesa-19.2.8/src/compiler/nir/nir_lower_passthrough_edgeflags.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_passthrough_edgeflags.c 2020-06-12 01:21:16.000000000 +0000 @@ -39,6 +39,11 @@ glsl_vec4_type(), "edgeflag_in"); in->data.location = VERT_ATTRIB_EDGEFLAG; + /* The edge flag is the last input in st/mesa. */ + assert(shader->num_inputs == util_bitcount64(shader->info.inputs_read)); + in->data.driver_location = shader->num_inputs++; + shader->info.inputs_read |= BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG); + out = nir_variable_create(shader, nir_var_shader_out, glsl_vec4_type(), "edgeflag_out"); out->data.location = VARYING_SLOT_EDGE; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_phis_to_scalar.c mesa-20.0.8/src/compiler/nir/nir_lower_phis_to_scalar.c --- mesa-19.2.8/src/compiler/nir/nir_lower_phis_to_scalar.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_phis_to_scalar.c 2020-06-12 01:21:16.000000000 +0000 @@ -100,6 +100,7 @@ case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: case nir_intrinsic_load_uniform: case nir_intrinsic_load_ubo: case nir_intrinsic_load_ssbo: diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_point_size.c mesa-20.0.8/src/compiler/nir/nir_lower_point_size.c --- mesa-19.2.8/src/compiler/nir/nir_lower_point_size.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_point_size.c 2020-06-12 01:21:16.000000000 +0000 @@ -41,12 +41,9 @@ nir_intrinsic_instr *instr = nir_instr_as_intrinsic(psiz_instr); - /* Some fixed function vertex programs generate PSIZ as a vec4 - * instead of a scalar, where the actual point size is stored in the - * first component. - */ assert(instr->src[1].is_ssa); - nir_ssa_def *psiz = nir_channel(b, instr->src[1].ssa, 0); + assert(instr->src[1].ssa->num_components == 1); + nir_ssa_def *psiz = instr->src[1].ssa; if (min > 0.0f) psiz = nir_fmax(b, psiz, nir_imm_float(b, min)); @@ -54,15 +51,7 @@ if (max > 0.0f) psiz = nir_fmin(b, psiz, nir_imm_float(b, max)); - nir_ssa_def *src_chans[4]; - src_chans[0] = psiz; - for (int i = 1; i < instr->src[1].ssa->num_components; i++) - src_chans[i] = nir_channel(b, instr->src[1].ssa, i); - nir_ssa_def *lowered_src = - nir_vec(b, src_chans, instr->src[1].ssa->num_components); - - nir_instr_rewrite_src(&instr->instr, &instr->src[1], - nir_src_for_ssa(lowered_src)); + nir_instr_rewrite_src(&instr->instr, &instr->src[1], nir_src_for_ssa(psiz)); } static bool diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_point_size_mov.c mesa-20.0.8/src/compiler/nir/nir_lower_point_size_mov.c --- mesa-19.2.8/src/compiler/nir/nir_lower_point_size_mov.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_point_size_mov.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,83 @@ +/* + * Copyright © 2019 Collabora Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" + +/** nir_lower_point_size_mov.c + * + * This pass lowers glPointSize into gl_PointSize, by adding a uniform + * and a move from that uniform to VARYING_SLOT_PSIZ. This is useful for + * OpenGL ES level hardware that lack constant point-size hardware state. + */ + +static bool +lower_impl(nir_function_impl *impl, + const gl_state_index16 *pointsize_state_tokens, + nir_variable *out) +{ + nir_shader *shader = impl->function->shader; + nir_builder b; + nir_variable *in; + + nir_builder_init(&b, impl); + b.cursor = nir_before_cf_list(&impl->body); + + in = nir_variable_create(shader, nir_var_uniform, + glsl_float_type(), "gl_PointSizeClampedMESA"); + in->num_state_slots = 1; + in->state_slots = ralloc_array(in, nir_state_slot, 1); + memcpy(in->state_slots[0].tokens, + pointsize_state_tokens, + sizeof(in->state_slots[0].tokens)); + + if (!out) { + out = nir_variable_create(shader, nir_var_shader_out, + glsl_float_type(), "gl_PointSize"); + out->data.location = VARYING_SLOT_PSIZ; + } + + nir_copy_var(&b, out, in); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + return true; +} + +void +nir_lower_point_size_mov(nir_shader *shader, + const gl_state_index16 *pointsize_state_tokens) +{ + assert(shader->info.stage == MESA_SHADER_VERTEX); + + nir_variable *out = NULL; + nir_foreach_variable(var, &shader->outputs) { + if (var->data.location == VARYING_SLOT_PSIZ) { + out = var; + break; + } + } + + lower_impl(nir_shader_get_entrypoint(shader), pointsize_state_tokens, + out); +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_regs_to_ssa.c mesa-20.0.8/src/compiler/nir/nir_lower_regs_to_ssa.c --- mesa-19.2.8/src/compiler/nir/nir_lower_regs_to_ssa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_regs_to_ssa.c 2020-06-12 01:21:16.000000000 +0000 @@ -300,9 +300,9 @@ nir_foreach_register_safe(reg, &impl->registers) { if (state.values[reg->index]) { - assert(list_empty(®->uses)); - assert(list_empty(®->if_uses)); - assert(list_empty(®->defs)); + assert(list_is_empty(®->uses)); + assert(list_is_empty(®->if_uses)); + assert(list_is_empty(®->defs)); exec_node_remove(®->node); } } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_returns.c mesa-20.0.8/src/compiler/nir/nir_lower_returns.c --- mesa-19.2.8/src/compiler/nir/nir_lower_returns.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_returns.c 2020-06-12 01:21:16.000000000 +0000 @@ -50,7 +50,7 @@ nir_builder *b = &state->builder; b->cursor = nir_after_cf_node_and_phis(node); - if (nir_cursors_equal(b->cursor, nir_after_cf_list(state->cf_list))) + if (!state->loop && nir_cursors_equal(b->cursor, nir_after_cf_list(state->cf_list))) return; /* Nothing to predicate */ assert(state->return_flag); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_samplers.c mesa-20.0.8/src/compiler/nir/nir_lower_samplers.c --- mesa-19.2.8/src/compiler/nir/nir_lower_samplers.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_samplers.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2005-2007 Brian Paul All Rights Reserved. + * Copyright (C) 2008 VMware, Inc. All Rights Reserved. + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "nir/nir.h" +#include "nir_builder.h" + +static void +lower_tex_src_to_offset(nir_builder *b, + nir_tex_instr *instr, unsigned src_idx) +{ + nir_ssa_def *index = NULL; + unsigned base_index = 0; + unsigned array_elements = 1; + nir_tex_src *src = &instr->src[src_idx]; + bool is_sampler = src->src_type == nir_tex_src_sampler_deref; + + /* We compute first the offsets */ + nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); + while (deref->deref_type != nir_deref_type_var) { + assert(deref->parent.is_ssa); + nir_deref_instr *parent = + nir_instr_as_deref(deref->parent.ssa->parent_instr); + + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index) && index == NULL) { + /* We're still building a direct index */ + base_index += nir_src_as_uint(deref->arr.index) * array_elements; + } else { + if (index == NULL) { + /* We used to be direct but not anymore */ + index = nir_imm_int(b, base_index); + base_index = 0; + } + + index = nir_iadd(b, index, + nir_imul(b, nir_imm_int(b, array_elements), + nir_ssa_for_src(b, deref->arr.index, 1))); + } + + array_elements *= glsl_get_length(parent->type); + + deref = parent; + } + + if (index) + index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); + + /* We hit the deref_var. This is the end of the line */ + assert(deref->deref_type == nir_deref_type_var); + + base_index += deref->var->data.binding; + + /* We have the offsets, we apply them, rewriting the source or removing + * instr if needed + */ + if (index) { + nir_instr_rewrite_src(&instr->instr, &src->src, + nir_src_for_ssa(index)); + + src->src_type = is_sampler ? + nir_tex_src_sampler_offset : + nir_tex_src_texture_offset; + + instr->texture_array_size = array_elements; + } else { + nir_tex_instr_remove_src(instr, src_idx); + } + + if (is_sampler) { + instr->sampler_index = base_index; + } else { + instr->texture_index = base_index; + instr->texture_array_size = array_elements; + } +} + +static bool +lower_sampler(nir_builder *b, nir_tex_instr *instr) +{ + int texture_idx = + nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); + + if (texture_idx >= 0) { + b->cursor = nir_before_instr(&instr->instr); + + lower_tex_src_to_offset(b, instr, texture_idx); + } + + int sampler_idx = + nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); + + if (sampler_idx >= 0) { + lower_tex_src_to_offset(b, instr, sampler_idx); + } + + if (texture_idx < 0 && sampler_idx < 0) + return false; + + return true; +} + +static bool +lower_impl(nir_function_impl *impl) +{ + nir_builder b; + nir_builder_init(&b, impl); + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type == nir_instr_type_tex) + progress |= lower_sampler(&b, nir_instr_as_tex(instr)); + } + } + + return progress; +} + +bool +nir_lower_samplers(nir_shader *shader) +{ + bool progress = false; + + /* Next, lower derefs to offsets. */ + nir_foreach_function(function, shader) { + if (function->impl) + progress |= lower_impl(function->impl); + } + + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_scratch.c mesa-20.0.8/src/compiler/nir/nir_lower_scratch.c --- mesa-19.2.8/src/compiler/nir/nir_lower_scratch.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_scratch.c 2020-06-12 01:21:16.000000000 +0000 @@ -57,24 +57,25 @@ load->num_components = intrin->num_components; load->src[0] = nir_src_for_ssa(offset); nir_intrinsic_set_align(load, align, 0); + unsigned bit_size = intrin->dest.ssa.bit_size; nir_ssa_dest_init(&load->instr, &load->dest, intrin->dest.ssa.num_components, - intrin->dest.ssa.bit_size, NULL); + bit_size == 1 ? 32 : bit_size, NULL); nir_builder_instr_insert(b, &load->instr); nir_ssa_def *value = &load->dest.ssa; - if (glsl_type_is_boolean(deref->type)) - value = nir_b2i32(b, value); + if (bit_size == 1) + value = nir_i2b1(b, value); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, - nir_src_for_ssa(&load->dest.ssa)); + nir_src_for_ssa(value)); } else { assert(intrin->intrinsic == nir_intrinsic_store_deref); assert(intrin->src[1].is_ssa); nir_ssa_def *value = intrin->src[1].ssa; - if (glsl_type_is_boolean(deref->type)) - value = nir_i2b(b, value); + if (value->bit_size == 1) + value = nir_b2i32(b, value); nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_scratch); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_subgroups.c mesa-20.0.8/src/compiler/nir/nir_lower_subgroups.c --- mesa-19.2.8/src/compiler/nir/nir_lower_subgroups.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_subgroups.c 2020-06-12 01:21:16.000000000 +0000 @@ -302,6 +302,46 @@ } static nir_ssa_def * +lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin, + const nir_lower_subgroups_options *options) +{ + if (!options->lower_quad_broadcast_dynamic_to_const) + return lower_shuffle(b, intrin, options->lower_to_scalar, false); + + nir_ssa_def *dst = NULL; + + for (unsigned i = 0; i < 4; ++i) { + nir_intrinsic_instr *qbcst = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast); + + qbcst->num_components = intrin->num_components; + qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i)); + nir_src_copy(&qbcst->src[0], &intrin->src[0], qbcst); + nir_ssa_dest_init(&qbcst->instr, &qbcst->dest, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, NULL); + + nir_ssa_def *qbcst_dst = NULL; + + if (options->lower_to_scalar && qbcst->num_components > 1) { + qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false); + } else { + nir_builder_instr_insert(b, &qbcst->instr); + qbcst_dst = &qbcst->dest.ssa; + } + + if (i) + dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa, + nir_src_for_ssa(nir_imm_int(b, i)).ssa), + qbcst_dst, dst); + else + dst = qbcst_dst; + } + + return dst; +} + +static nir_ssa_def * lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options) { const nir_lower_subgroups_options *options = _options; @@ -406,6 +446,32 @@ assert(intrin->src[0].is_ssa); nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa, options->ballot_bit_size); + + if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract && + intrin->intrinsic != nir_intrinsic_ballot_find_lsb) { + /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says: + * + * "Find the most significant bit set to 1 in Value, considering + * only the bits in Value required to represent all bits of the + * group’s invocations. If none of the considered bits is set to + * 1, the result is undefined." + * + * It has similar text for the other three. This means that, in case + * the subgroup size is less than 32, we have to mask off the unused + * bits. If the subgroup size is fixed and greater than or equal to + * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete + * the iand. + * + * We only have to worry about this for BitCount and FindMSB because + * FindLSB counts from the bottom and BitfieldExtract selects + * individual bits. In either case, if run outside the range of + * valid bits, we hit the undefined results case and we can return + * anything we want. + */ + int_val = nir_iand(b, int_val, + build_subgroup_mask(b, options->ballot_bit_size, options)); + } + switch (intrin->intrinsic) { case nir_intrinsic_ballot_bitfield_extract: assert(intrin->src[1].is_ssa); @@ -473,8 +539,11 @@ case nir_intrinsic_quad_swap_horizontal: case nir_intrinsic_quad_swap_vertical: case nir_intrinsic_quad_swap_diagonal: - if (options->lower_quad) - return lower_shuffle(b, intrin, options->lower_to_scalar, false); + if (options->lower_quad || + (options->lower_quad_broadcast_dynamic && + intrin->intrinsic == nir_intrinsic_quad_broadcast && + !nir_src_is_const(intrin->src[1]))) + return lower_dynamic_quad_broadcast(b, intrin, options); else if (options->lower_to_scalar && intrin->num_components > 1) return lower_subgroup_op_to_scalar(b, intrin, false); break; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_system_values.c mesa-20.0.8/src/compiler/nir/nir_lower_system_values.c --- mesa-19.2.8/src/compiler/nir/nir_lower_system_values.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_system_values.c 2020-06-12 01:21:16.000000000 +0000 @@ -222,6 +222,10 @@ return NULL; } + case nir_intrinsic_load_num_work_groups: + case nir_intrinsic_load_work_group_id: + return sanitize_32bit_sysval(b, intrin); + case nir_intrinsic_load_deref: { nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); if (deref->mode != nir_var_system_value) @@ -266,6 +270,34 @@ case SYSTEM_VALUE_GLOBAL_GROUP_SIZE: return build_global_group_size(b, bit_size); + case SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel, + INTERP_MODE_NOPERSPECTIVE); + + case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid, + INTERP_MODE_NOPERSPECTIVE); + + case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample, + INTERP_MODE_NOPERSPECTIVE); + + case SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel, + INTERP_MODE_SMOOTH); + + case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid, + INTERP_MODE_SMOOTH); + + case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample, + INTERP_MODE_SMOOTH); + + case SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL: + return nir_load_barycentric(b, nir_intrinsic_load_barycentric_model, + INTERP_MODE_NONE); + default: break; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_tex.c mesa-20.0.8/src/compiler/nir/nir_lower_tex.c --- mesa-19.2.8/src/compiler/nir/nir_lower_tex.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_tex.c 2020-06-12 01:21:16.000000000 +0000 @@ -37,6 +37,7 @@ #include "nir.h" #include "nir_builder.h" +#include "nir_builtin_builder.h" #include "nir_format_convert.h" static bool @@ -103,110 +104,6 @@ return true; } -static nir_ssa_def * -get_texture_size(nir_builder *b, nir_tex_instr *tex) -{ - b->cursor = nir_before_instr(&tex->instr); - - nir_tex_instr *txs; - - unsigned num_srcs = 1; /* One for the LOD */ - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_texture_deref || - tex->src[i].src_type == nir_tex_src_sampler_deref || - tex->src[i].src_type == nir_tex_src_texture_offset || - tex->src[i].src_type == nir_tex_src_sampler_offset || - tex->src[i].src_type == nir_tex_src_texture_handle || - tex->src[i].src_type == nir_tex_src_sampler_handle) - num_srcs++; - } - - txs = nir_tex_instr_create(b->shader, num_srcs); - txs->op = nir_texop_txs; - txs->sampler_dim = tex->sampler_dim; - txs->is_array = tex->is_array; - txs->is_shadow = tex->is_shadow; - txs->is_new_style_shadow = tex->is_new_style_shadow; - txs->texture_index = tex->texture_index; - txs->sampler_index = tex->sampler_index; - txs->dest_type = nir_type_int; - - unsigned idx = 0; - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_texture_deref || - tex->src[i].src_type == nir_tex_src_sampler_deref || - tex->src[i].src_type == nir_tex_src_texture_offset || - tex->src[i].src_type == nir_tex_src_sampler_offset || - tex->src[i].src_type == nir_tex_src_texture_handle || - tex->src[i].src_type == nir_tex_src_sampler_handle) { - nir_src_copy(&txs->src[idx].src, &tex->src[i].src, txs); - txs->src[idx].src_type = tex->src[i].src_type; - idx++; - } - } - /* Add in an LOD because some back-ends require it */ - txs->src[idx].src = nir_src_for_ssa(nir_imm_int(b, 0)); - txs->src[idx].src_type = nir_tex_src_lod; - - nir_ssa_dest_init(&txs->instr, &txs->dest, - nir_tex_instr_dest_size(txs), 32, NULL); - nir_builder_instr_insert(b, &txs->instr); - - return nir_i2f32(b, &txs->dest.ssa); -} - -static nir_ssa_def * -get_texture_lod(nir_builder *b, nir_tex_instr *tex) -{ - b->cursor = nir_before_instr(&tex->instr); - - nir_tex_instr *tql; - - unsigned num_srcs = 0; - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_coord || - tex->src[i].src_type == nir_tex_src_texture_deref || - tex->src[i].src_type == nir_tex_src_sampler_deref || - tex->src[i].src_type == nir_tex_src_texture_offset || - tex->src[i].src_type == nir_tex_src_sampler_offset || - tex->src[i].src_type == nir_tex_src_texture_handle || - tex->src[i].src_type == nir_tex_src_sampler_handle) - num_srcs++; - } - - tql = nir_tex_instr_create(b->shader, num_srcs); - tql->op = nir_texop_lod; - tql->coord_components = tex->coord_components; - tql->sampler_dim = tex->sampler_dim; - tql->is_array = tex->is_array; - tql->is_shadow = tex->is_shadow; - tql->is_new_style_shadow = tex->is_new_style_shadow; - tql->texture_index = tex->texture_index; - tql->sampler_index = tex->sampler_index; - tql->dest_type = nir_type_float; - - unsigned idx = 0; - for (unsigned i = 0; i < tex->num_srcs; i++) { - if (tex->src[i].src_type == nir_tex_src_coord || - tex->src[i].src_type == nir_tex_src_texture_deref || - tex->src[i].src_type == nir_tex_src_sampler_deref || - tex->src[i].src_type == nir_tex_src_texture_offset || - tex->src[i].src_type == nir_tex_src_sampler_offset || - tex->src[i].src_type == nir_tex_src_texture_handle || - tex->src[i].src_type == nir_tex_src_sampler_handle) { - nir_src_copy(&tql->src[idx].src, &tex->src[i].src, tql); - tql->src[idx].src_type = tex->src[i].src_type; - idx++; - } - } - - nir_ssa_dest_init(&tql->instr, &tql->dest, 2, 32, NULL); - nir_builder_instr_insert(b, &tql->instr); - - /* The LOD is the y component of the result */ - return nir_channel(b, &tql->dest.ssa, 1); -} - static bool lower_offset(nir_builder *b, nir_tex_instr *tex) { @@ -229,7 +126,7 @@ if (tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) { offset_coord = nir_fadd(b, coord, nir_i2f32(b, offset)); } else { - nir_ssa_def *txs = get_texture_size(b, tex); + nir_ssa_def *txs = nir_get_texture_size(b, tex); nir_ssa_def *scale = nir_frcp(b, txs); offset_coord = nir_fadd(b, coord, @@ -271,7 +168,7 @@ */ tex->sampler_dim = GLSL_SAMPLER_DIM_2D; - nir_ssa_def *txs = get_texture_size(b, tex); + nir_ssa_def *txs = nir_get_texture_size(b, tex); nir_ssa_def *scale = nir_frcp(b, txs); /* Walk through the sources normalizing the requested arguments. */ @@ -297,7 +194,7 @@ b->cursor = nir_before_instr(&tex->instr); - nir_ssa_def *lod = get_texture_lod(b, tex); + nir_ssa_def *lod = nir_get_texture_lod(b, tex); int bias_idx = nir_tex_instr_src_index(tex, nir_tex_src_bias); if (bias_idx >= 0) { @@ -508,7 +405,7 @@ assert(tex->dest.is_ssa); /* Use textureSize() to get the width and height of LOD 0 */ - nir_ssa_def *size = get_texture_size(b, tex); + nir_ssa_def *size = nir_get_texture_size(b, tex); /* Cubemap texture lookups first generate a texture coordinate normalized * to [-1, 1] on the appropiate face. The appropiate face is determined @@ -675,7 +572,7 @@ } nir_ssa_def *size = - nir_channels(b, get_texture_size(b, tex), component_mask); + nir_channels(b, nir_get_texture_size(b, tex), component_mask); /* Scale the gradients by width and height. Effectively, the incoming * gradients are s'(x,y), t'(x,y), and r'(x,y) from equation 3.19 in the @@ -737,7 +634,7 @@ /* non-normalized texture coords, so clamp to texture * size rather than [0.0, 1.0] */ - nir_ssa_def *txs = get_texture_size(b, tex); + nir_ssa_def *txs = nir_get_texture_size(b, tex); comp[j] = nir_fmax(b, comp[j], nir_imm_float(b, 0.0)); comp[j] = nir_fmin(b, comp[j], nir_channel(b, txs, j)); } else { @@ -1061,7 +958,8 @@ progress = lower_offset(b, tex) || progress; } - if ((tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) && options->lower_rect) { + if ((tex->sampler_dim == GLSL_SAMPLER_DIM_RECT) && options->lower_rect && + tex->op != nir_texop_txf && !nir_tex_instr_is_query(tex)) { lower_rect(b, tex); progress = true; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_to_source_mods.c mesa-20.0.8/src/compiler/nir/nir_lower_to_source_mods.c --- mesa-19.2.8/src/compiler/nir/nir_lower_to_source_mods.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_to_source_mods.c 2020-06-12 01:21:16.000000000 +0000 @@ -122,8 +122,8 @@ alu->src[i].swizzle[j] = parent->src[0].swizzle[alu->src[i].swizzle[j]]; } - if (list_empty(&parent->dest.dest.ssa.uses) && - list_empty(&parent->dest.dest.ssa.if_uses)) + if (list_is_empty(&parent->dest.dest.ssa.uses) && + list_is_empty(&parent->dest.dest.ssa.if_uses)) nir_instr_remove(&parent->instr); progress = true; @@ -144,7 +144,7 @@ if (!(options & nir_lower_float_source_mods)) continue; - if (!list_empty(&alu->dest.dest.ssa.if_uses)) + if (!list_is_empty(&alu->dest.dest.ssa.if_uses)) continue; bool all_children_are_sat = true; diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_two_sided_color.c mesa-20.0.8/src/compiler/nir/nir_lower_two_sided_color.c --- mesa-19.2.8/src/compiler/nir/nir_lower_two_sided_color.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_two_sided_color.c 2020-06-12 01:21:16.000000000 +0000 @@ -138,18 +138,28 @@ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_input) - continue; - int idx; - for (idx = 0; idx < state->colors_count; idx++) { - unsigned drvloc = - state->colors[idx].front->data.driver_location; - if (nir_intrinsic_base(intr) == drvloc) { - assert(nir_src_is_const(intr->src[0])); - break; + if (intr->intrinsic == nir_intrinsic_load_input) { + for (idx = 0; idx < state->colors_count; idx++) { + unsigned drvloc = + state->colors[idx].front->data.driver_location; + if (nir_intrinsic_base(intr) == drvloc) { + assert(nir_src_is_const(intr->src[0])); + break; + } } - } + } else if (intr->intrinsic == nir_intrinsic_load_deref) { + nir_variable *var = nir_intrinsic_get_var(intr, 0); + if (var->data.mode != nir_var_shader_in) + continue; + + for (idx = 0; idx < state->colors_count; idx++) { + unsigned loc = state->colors[idx].front->data.location; + if (var->data.location == loc) + break; + } + } else + continue; if (idx == state->colors_count) continue; @@ -162,8 +172,14 @@ * 32-bit value by default. */ nir_ssa_def *face = nir_load_front_face(b, 1); - nir_ssa_def *front = load_input(b, state->colors[idx].front); - nir_ssa_def *back = load_input(b, state->colors[idx].back); + nir_ssa_def *front, *back; + if (intr->intrinsic == nir_intrinsic_load_deref) { + front = nir_load_var(b, state->colors[idx].front); + back = nir_load_var(b, state->colors[idx].back); + } else { + front = load_input(b, state->colors[idx].front); + back = load_input(b, state->colors[idx].back); + } nir_ssa_def *color = nir_bcsel(b, face, front, back); assert(intr->dest.is_ssa); diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_uniforms_to_ubo.c mesa-20.0.8/src/compiler/nir/nir_lower_uniforms_to_ubo.c --- mesa-19.2.8/src/compiler/nir/nir_lower_uniforms_to_ubo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_uniforms_to_ubo.c 2020-06-12 01:21:16.000000000 +0000 @@ -43,7 +43,9 @@ { b->cursor = nir_before_instr(&instr->instr); - if (instr->intrinsic == nir_intrinsic_load_ubo) { + /* Increase all UBO binding points by 1. */ + if (instr->intrinsic == nir_intrinsic_load_ubo && + !b->shader->info.first_ubo_is_default_ubo) { nir_ssa_def *old_idx = nir_ssa_for_src(b, instr->src[0], 1); nir_ssa_def *new_idx = nir_iadd(b, old_idx, nir_imm_int(b, 1)); nir_instr_rewrite_src(&instr->instr, &instr->src[0], @@ -99,6 +101,7 @@ } } + shader->info.first_ubo_is_default_ubo = true; return progress; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_lower_vec_to_movs.c mesa-20.0.8/src/compiler/nir/nir_lower_vec_to_movs.c --- mesa-19.2.8/src/compiler/nir/nir_lower_vec_to_movs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_lower_vec_to_movs.c 2020-06-12 01:21:16.000000000 +0000 @@ -140,7 +140,7 @@ return 0; } - if (!list_empty(&vec->src[start_idx].src.ssa->if_uses)) + if (!list_is_empty(&vec->src[start_idx].src.ssa->if_uses)) return 0; if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu) diff -Nru mesa-19.2.8/src/compiler/nir/nir_opcodes.py mesa-20.0.8/src/compiler/nir/nir_opcodes.py --- mesa-19.2.8/src/compiler/nir/nir_opcodes.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opcodes.py 2020-06-12 01:21:16.000000000 +0000 @@ -75,7 +75,7 @@ assert isinstance(algebraic_properties, str) assert isinstance(const_expr, str) assert len(input_sizes) == len(input_types) - assert 0 <= output_size <= 4 + assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16) for size in input_sizes: assert 0 <= size <= 4 if output_size != 0: @@ -95,6 +95,8 @@ tint = "int" tbool = "bool" tbool1 = "bool1" +tbool8 = "bool8" +tbool16 = "bool16" tbool32 = "bool32" tuint = "uint" tuint16 = "uint16" @@ -123,7 +125,7 @@ if type_has_size(type_): return [type_size(type_)] elif type_ == 'bool': - return [1, 32] + return [1, 8, 16, 32] elif type_ == 'float': return [16, 32, 64] else: @@ -217,17 +219,51 @@ dst_types = [tint, tuint, tfloat, tbool] for dst_t in dst_types: - for bit_size in type_sizes(dst_t): - if bit_size == 16 and dst_t == tfloat and src_t == tfloat: + for dst_bit_size in type_sizes(dst_t): + if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat: rnd_modes = ['_rtne', '_rtz', ''] for rnd_mode in rnd_modes: - unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0], - bit_size, rnd_mode), - dst_t + str(bit_size), src_t, "src0") + if rnd_mode == '_rtne': + conv_expr = """ + if (bit_size > 16) { + dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0)); + } else { + dst = src0; + } + """ + elif rnd_mode == '_rtz': + conv_expr = """ + if (bit_size > 16) { + dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0)); + } else { + dst = src0; + } + """ + else: + conv_expr = "src0" + + unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], + dst_t[0], + dst_bit_size, + rnd_mode), + dst_t + str(dst_bit_size), + src_t, conv_expr) + elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat: + conv_expr = """ + if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) { + dst = _mesa_double_to_float_rtz(src0); + } else { + dst = src0; + } + """ + unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], + dst_bit_size), + dst_t + str(dst_bit_size), src_t, conv_expr) else: conv_expr = "src0 != 0" if dst_t == tbool else "src0" - unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size), - dst_t + str(bit_size), src_t, conv_expr) + unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], + dst_bit_size), + dst_t + str(dst_bit_size), src_t, conv_expr) # Unary floating-point rounding operations. @@ -333,14 +369,23 @@ unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32, "dst.x = src0.x; dst.y = src0.x >> 16;") -# Lowered floating point unpacking operations. +unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """ +dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff)); +dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16)); +""") +# Lowered floating point unpacking operations. unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32, "unpack_half_1x16((uint16_t)(src0 & 0xffff))") unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32, "unpack_half_1x16((uint16_t)(src0 >> 16))") +unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32, + "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))") +unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32, + "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))") + unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0") unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16") @@ -374,6 +419,15 @@ } """) +unop("uclz", tuint32, """ +int bit; +for (bit = bit_size - 1; bit >= 0; bit--) { + if ((src0 & (1u << bit)) != 0) + break; +} +dst = (unsigned)(31 - bit); +""") + unop("ifind_msb", tint32, """ dst = -1; for (int bit = 31; bit >= 0; bit--) { @@ -452,9 +506,21 @@ def binop_compare(name, ty, alg_props, const_expr): binop_convert(name, tbool1, ty, alg_props, const_expr) +def binop_compare8(name, ty, alg_props, const_expr): + binop_convert(name, tbool8, ty, alg_props, const_expr) + +def binop_compare16(name, ty, alg_props, const_expr): + binop_convert(name, tbool16, ty, alg_props, const_expr) + def binop_compare32(name, ty, alg_props, const_expr): binop_convert(name, tbool32, ty, alg_props, const_expr) +def binop_compare_all_sizes(name, ty, alg_props, const_expr): + binop_compare(name, ty, alg_props, const_expr) + binop_compare8(name + "8", ty, alg_props, const_expr) + binop_compare16(name + "16", ty, alg_props, const_expr) + binop_compare32(name + "32", ty, alg_props, const_expr) + def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size, src2_type, const_expr): opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type], @@ -482,7 +548,27 @@ [4, 4], [src_type, src_type], False, _2src_commutative, final(reduce_(reduce_(src0, src1), reduce_(src2, src3)))) -binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1") +def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr, + reduce_expr, final_expr): + binop_reduce(name, output_size, tbool1, src_type, + prereduce_expr, reduce_expr, final_expr) + binop_reduce("b8" + name[1:], output_size, tbool8, src_type, + prereduce_expr, reduce_expr, final_expr) + binop_reduce("b16" + name[1:], output_size, tbool16, src_type, + prereduce_expr, reduce_expr, final_expr) + binop_reduce("b32" + name[1:], output_size, tbool32, src_type, + prereduce_expr, reduce_expr, final_expr) + +binop("fadd", tfloat, _2src_commutative + associative,""" +if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) { + if (bit_size == 64) + dst = _mesa_double_add_rtz(src0, src1); + else + dst = _mesa_double_to_float_rtz((double)src0 + (double)src1); +} else { + dst = src0 + src1; +} +""") binop("iadd", tint, _2src_commutative + associative, "src0 + src1") binop("iadd_sat", tint, _2src_commutative, """ src1 > 0 ? @@ -498,10 +584,33 @@ """) binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1") -binop("fsub", tfloat, "", "src0 - src1") +binop("fsub", tfloat, "", """ +if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) { + if (bit_size == 64) + dst = _mesa_double_sub_rtz(src0, src1); + else + dst = _mesa_double_to_float_rtz((double)src0 - (double)src1); +} else { + dst = src0 - src1; +} +""") binop("isub", tint, "", "src0 - src1") - -binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1") +binop_convert("uabs_isub", tuint, tint, "", """ + src1 > src0 ? (uint64_t) src1 - (uint64_t) src0 + : (uint64_t) src0 - (uint64_t) src1 +""") +binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)") + +binop("fmul", tfloat, _2src_commutative + associative, """ +if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) { + if (bit_size == 64) + dst = _mesa_double_mul_rtz(src0, src1); + else + dst = _mesa_double_to_float_rtz((double)src0 * (double)src1); +} else { + dst = src0 * src1; +} +""") # low 32-bits of signed/unsigned integer multiply binop("imul", tint, _2src_commutative + associative, "src0 * src1") @@ -558,6 +667,9 @@ dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask); """) +# Multiply 32-bits with low 16-bits. +binop("imul_32x16", tint32, "", "src0 * (int16_t) src1") +binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1") binop("fdiv", tfloat, "", "src0 / src1") binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)") @@ -623,46 +735,27 @@ # these integer-aware comparisons return a boolean (0 or ~0) -binop_compare("flt", tfloat, "", "src0 < src1") -binop_compare("fge", tfloat, "", "src0 >= src1") -binop_compare("feq", tfloat, _2src_commutative, "src0 == src1") -binop_compare("fne", tfloat, _2src_commutative, "src0 != src1") -binop_compare("ilt", tint, "", "src0 < src1") -binop_compare("ige", tint, "", "src0 >= src1") -binop_compare("ieq", tint, _2src_commutative, "src0 == src1") -binop_compare("ine", tint, _2src_commutative, "src0 != src1") -binop_compare("ult", tuint, "", "src0 < src1") -binop_compare("uge", tuint, "", "src0 >= src1") -binop_compare32("flt32", tfloat, "", "src0 < src1") -binop_compare32("fge32", tfloat, "", "src0 >= src1") -binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1") -binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1") -binop_compare32("ilt32", tint, "", "src0 < src1") -binop_compare32("ige32", tint, "", "src0 >= src1") -binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1") -binop_compare32("ine32", tint, _2src_commutative, "src0 != src1") -binop_compare32("ult32", tuint, "", "src0 < src1") -binop_compare32("uge32", tuint, "", "src0 >= src1") +binop_compare_all_sizes("flt", tfloat, "", "src0 < src1") +binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1") +binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1") +binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1") +binop_compare_all_sizes("ilt", tint, "", "src0 < src1") +binop_compare_all_sizes("ige", tint, "", "src0 >= src1") +binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1") +binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1") +binop_compare_all_sizes("ult", tuint, "", "src0 < src1") +binop_compare_all_sizes("uge", tuint, "", "src0 >= src1") # integer-aware GLSL-style comparisons that compare floats and ints -binop_reduce("ball_fequal", 1, tbool1, tfloat, "{src0} == {src1}", - "{src0} && {src1}", "{src}") -binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}", - "{src0} || {src1}", "{src}") -binop_reduce("ball_iequal", 1, tbool1, tint, "{src0} == {src1}", - "{src0} && {src1}", "{src}") -binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}", - "{src0} || {src1}", "{src}") - -binop_reduce("b32all_fequal", 1, tbool32, tfloat, "{src0} == {src1}", - "{src0} && {src1}", "{src}") -binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}", - "{src0} || {src1}", "{src}") -binop_reduce("b32all_iequal", 1, tbool32, tint, "{src0} == {src1}", - "{src0} && {src1}", "{src}") -binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}", - "{src0} || {src1}", "{src}") +binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}", + "{src0} && {src1}", "{src}") +binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}", + "{src0} || {src1}", "{src}") +binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}", + "{src0} && {src1}", "{src}") +binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}", + "{src0} || {src1}", "{src}") # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0 @@ -722,10 +815,10 @@ opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "", "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w") -binop("fmin", tfloat, "", "fminf(src0, src1)") +binop("fmin", tfloat, "", "fmin(src0, src1)") binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1") binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1") -binop("fmax", tfloat, "", "fmaxf(src0, src1)") +binop("fmax", tfloat, "", "fmax(src0, src1)") binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0") binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0") @@ -824,7 +917,21 @@ [src1_size, src2_size, src3_size], [tuint, tuint, tuint], False, "", const_expr) -triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2") +triop("ffma", tfloat, _2src_commutative, """ +if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) { + if (bit_size == 64) + dst = _mesa_double_fma_rtz(src0, src1, src2); + else if (bit_size == 32) + dst = _mesa_float_fma_rtz(src0, src1, src2); + else + dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2)); +} else { + if (bit_size == 32) + dst = fmaf(src0, src1, src2); + else + dst = fma(src0, src1, src2); +} +""") triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2") @@ -851,7 +958,11 @@ triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))") opcode("bcsel", 0, tuint, [0, 0, 0], - [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2") + [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2") +opcode("b8csel", 0, tuint, [0, 0, 0], + [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2") +opcode("b16csel", 0, tuint, [0, 0, 0], + [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2") opcode("b32csel", 0, tuint, [0, 0, 0], [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2") @@ -963,10 +1074,66 @@ dst.w = src3.x; """) +opcode("vec8", 8, tuint, + [1] * 8, [tuint] * 8, + False, "", """ +dst.x = src0.x; +dst.y = src1.x; +dst.z = src2.x; +dst.w = src3.x; +dst.e = src4.x; +dst.f = src5.x; +dst.g = src6.x; +dst.h = src7.x; +""") + +opcode("vec16", 16, tuint, + [1] * 16, [tuint] * 16, + False, "", """ +dst.x = src0.x; +dst.y = src1.x; +dst.z = src2.x; +dst.w = src3.x; +dst.e = src4.x; +dst.f = src5.x; +dst.g = src6.x; +dst.h = src7.x; +dst.i = src8.x; +dst.j = src9.x; +dst.k = src10.x; +dst.l = src11.x; +dst.m = src12.x; +dst.n = src13.x; +dst.o = src14.x; +dst.p = src15.x; +""") + +# An integer multiply instruction for address calculation. This is +# similar to imul, except that the results are undefined in case of +# overflow. Overflow is defined according to the size of the variable +# being dereferenced. +# +# This relaxed definition, compared to imul, allows an optimization +# pass to propagate bounds (ie, from an load/store intrinsic) to the +# sources, such that lower precision integer multiplies can be used. +# This is useful on hw that has 24b or perhaps 16b integer multiply +# instructions. +binop("amul", tint, _2src_commutative + associative, "src0 * src1") + # ir3-specific instruction that maps directly to mul-add shift high mix, # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer # multiplication (imul) on Freedreno backend.. -opcode("imadsh_mix16", 1, tint32, - [1, 1, 1], [tint32, tint32, tint32], False, "", """ -dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x; +opcode("imadsh_mix16", 0, tint32, + [0, 0, 0], [tint32, tint32, tint32], False, "", """ +dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2; """) + +# ir3-specific instruction that maps directly to ir3 mad.s24. +# +# 24b multiply into 32b result (with sign extension) plus 32b int +triop("imad24_ir3", tint32, _2src_commutative, + "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2") + +# 24b multiply into 32b result (with sign extension) +binop("imul24", tint32, _2src_commutative + associative, + "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)") diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_access.c mesa-20.0.8/src/compiler/nir/nir_opt_access.c --- mesa-19.2.8/src/compiler/nir/nir_opt_access.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_access.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,346 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" + +/* This pass optimizes GL access qualifiers. So far it does two things: + * + * - Infer readonly when it's missing. + * - Infer ACCESS_CAN_REORDER when the following are true: + * - Either there are no writes, or ACCESS_NON_WRITEABLE and ACCESS_RESTRICT + * are both set. In either case there are no writes to the underlying + * memory. + * - If ACCESS_COHERENT is set, then there must be no memory barriers + * involving the access. Coherent accesses may return different results + * before and after barriers. + * - ACCESS_VOLATILE is not set. + * + * If these conditions are true, then image and buffer reads may be treated as + * if they were uniform buffer reads, i.e. they may be arbitrarily moved, + * combined, rematerialized etc. + */ + +struct access_state { + struct set *vars_written; + bool images_written; + bool buffers_written; + bool image_barriers; + bool buffer_barriers; +}; + +static void +gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) +{ + nir_variable *var; + switch (instr->intrinsic) { + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + var = nir_intrinsic_get_var(instr, 0); + + /* In OpenGL, buffer images use normal buffer objects, whereas other + * image types use textures which cannot alias with buffer objects. + * Therefore we have to group buffer samplers together with SSBO's. + */ + if (glsl_get_sampler_dim(glsl_without_array(var->type)) == + GLSL_SAMPLER_DIM_BUF) + state->buffers_written = true; + else + state->images_written = true; + + if (var->data.mode == nir_var_uniform) + _mesa_set_add(state->vars_written, var); + break; + + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_fadd: + if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) + state->buffers_written = true; + else + state->images_written = true; + break; + + case nir_intrinsic_store_deref: + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_deref_atomic_fadd: + case nir_intrinsic_deref_atomic_fmin: + case nir_intrinsic_deref_atomic_fmax: + case nir_intrinsic_deref_atomic_fcomp_swap: + var = nir_intrinsic_get_var(instr, 0); + if (var->data.mode != nir_var_mem_ssbo) + break; + + _mesa_set_add(state->vars_written, var); + state->buffers_written = true; + + case nir_intrinsic_memory_barrier: + state->buffer_barriers = true; + state->image_barriers = true; + break; + + case nir_intrinsic_memory_barrier_buffer: + state->buffer_barriers = true; + break; + + case nir_intrinsic_memory_barrier_image: + state->image_barriers = true; + break; + + case nir_intrinsic_scoped_memory_barrier: + /* TODO: Could be more granular if we had nir_var_mem_image. */ + if (nir_intrinsic_memory_modes(instr) & (nir_var_mem_ubo | + nir_var_mem_ssbo | + nir_var_uniform | + nir_var_mem_global)) { + state->buffer_barriers = true; + state->image_barriers = true; + } + break; + + default: + break; + } +} + +static bool +process_variable(struct access_state *state, nir_variable *var) +{ + if (var->data.mode != nir_var_mem_ssbo && + !(var->data.mode == nir_var_uniform && + glsl_type_is_image(var->type))) + return false; + + /* Ignore variables we've already marked */ + if (var->data.access & ACCESS_CAN_REORDER) + return false; + + if (!(var->data.access & ACCESS_NON_WRITEABLE) && + !_mesa_set_search(state->vars_written, var)) { + var->data.access |= ACCESS_NON_WRITEABLE; + return true; + } + + return false; +} + +static bool +can_reorder(struct access_state *state, enum gl_access_qualifier access, + bool is_buffer, bool is_ssbo) +{ + bool is_any_written = is_buffer ? state->buffers_written : + state->images_written; + + /* Can we guarantee that the underlying memory is never written? */ + if (!is_any_written || + ((access & ACCESS_NON_WRITEABLE) && + (access & ACCESS_RESTRICT))) { + /* Note: memoryBarrierBuffer() is only guaranteed to flush buffer + * variables and not imageBuffer's, so we only consider the GL-level + * type here. + */ + bool is_any_barrier = is_ssbo ? + state->buffer_barriers : state->image_barriers; + + return (!is_any_barrier || !(access & ACCESS_COHERENT)) && + !(access & ACCESS_VOLATILE); + } + + return false; +} + +static bool +process_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_load: + if (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) + return false; + + /* We have less information about bindless intrinsics, since we can't + * always trace uses back to the variable. Don't try and infer if it's + * read-only, unless there are no image writes at all. + */ + bool progress = false; + bool is_buffer = + nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF; + + bool is_any_written = + is_buffer ? state->buffers_written : state->images_written; + + if (!(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && + !is_any_written) { + progress = true; + nir_intrinsic_set_access(instr, + nir_intrinsic_access(instr) | + ACCESS_NON_WRITEABLE); + } + + if (can_reorder(state, nir_intrinsic_access(instr), is_buffer, false)) { + progress = true; + nir_intrinsic_set_access(instr, + nir_intrinsic_access(instr) | + ACCESS_CAN_REORDER); + } + + return progress; + + case nir_intrinsic_load_deref: + case nir_intrinsic_image_deref_load: { + nir_variable *var = nir_intrinsic_get_var(instr, 0); + + if (instr->intrinsic == nir_intrinsic_load_deref && + var->data.mode != nir_var_mem_ssbo) + return false; + + if (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) + return false; + + bool progress = false; + + /* Check if we were able to mark the whole variable non-writeable */ + if (!(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && + var->data.access & ACCESS_NON_WRITEABLE) { + progress = true; + nir_intrinsic_set_access(instr, + nir_intrinsic_access(instr) | + ACCESS_NON_WRITEABLE); + } + + bool is_ssbo = var->data.mode == nir_var_mem_ssbo; + + bool is_buffer = is_ssbo || + glsl_get_sampler_dim(glsl_without_array(var->type)) == GLSL_SAMPLER_DIM_BUF; + + if (can_reorder(state, nir_intrinsic_access(instr), is_buffer, is_ssbo)) { + progress = true; + nir_intrinsic_set_access(instr, + nir_intrinsic_access(instr) | + ACCESS_CAN_REORDER); + } + + return progress; + } + + default: + return false; + } +} + +static bool +opt_access_impl(struct access_state *state, + nir_function_impl *impl) +{ + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type == nir_instr_type_intrinsic) + progress |= process_intrinsic(state, + nir_instr_as_intrinsic(instr)); + } + } + + if (progress) { + nir_metadata_preserve(impl, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } + + + return progress; +} + +bool +nir_opt_access(nir_shader *shader) +{ + struct access_state state = { + .vars_written = _mesa_pointer_set_create(NULL), + }; + + bool var_progress = false; + bool progress = false; + + nir_foreach_function(func, shader) { + if (func->impl) { + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) { + if (instr->type == nir_instr_type_intrinsic) + gather_intrinsic(&state, nir_instr_as_intrinsic(instr)); + } + } + } + } + + nir_foreach_variable(var, &shader->uniforms) + var_progress |= process_variable(&state, var); + + nir_foreach_function(func, shader) { + if (func->impl) { + progress |= opt_access_impl(&state, func->impl); + + /* If we make a change to the uniforms, update all the impls. */ + if (var_progress) { + nir_metadata_preserve(func->impl, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } + } + } + + progress |= var_progress; + + _mesa_set_destroy(state.vars_written, NULL); + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_algebraic.py mesa-20.0.8/src/compiler/nir/nir_opt_algebraic.py --- mesa-19.2.8/src/compiler/nir/nir_opt_algebraic.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_algebraic.py 2020-06-12 01:21:16.000000000 +0000 @@ -29,6 +29,7 @@ import nir_algebraic from nir_opcodes import type_sizes import itertools +import struct from math import pi # Convenience variables @@ -84,6 +85,9 @@ x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0) return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x) +def intBitsToFloat(i): + return struct.unpack('!f', struct.pack('!I', i))[0] + optimizations = [ (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), @@ -103,7 +107,7 @@ (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'), (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))), - (('fneg', ('fneg', a)), a), + (('~fneg', ('fneg', a)), a), (('ineg', ('ineg', a)), a), (('fabs', ('fabs', a)), ('fabs', a)), (('fabs', ('fneg', a)), ('fabs', a)), @@ -129,7 +133,7 @@ (('imul', a, 0), 0), (('umul_unorm_4x8', a, 0), 0), (('umul_unorm_4x8', a, ~0), a), - (('fmul', a, 1.0), a), + (('~fmul', a, 1.0), a), (('imul', a, 1), a), (('fmul', a, -1.0), ('fneg', a)), (('imul', a, -1), ('ineg', a)), @@ -251,6 +255,22 @@ (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))), ]) +# Optimize a pattern of address calculation created by DXVK where the offset is +# divided by 4 and then multipled by 4. This can be turned into an iand and the +# additions before can be reassociated to CSE the iand instruction. +for log2 in range(1, 7): # powers of two from 2 to 64 + v = 1 << log2 + mask = 0xffffffff & ~(v - 1) + b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v) + + optimizations.extend([ + # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)' + (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)), + + # Reassociate for improved CSE + (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)), + ]) + optimizations.extend([ # This is common for address calculations. Reassociating may enable the # 'a<lower_fsat'), (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), - (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'), - (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'), + (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), + (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))), (('fsat', ('b2f', a)), ('b2f', a)), (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), (('fsat', ('fsat', a)), ('fsat', a)), - (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_negate && !options->lower_fsat'), - (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_negate && !options->lower_fsat'), + (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'), + (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'), (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'), (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), @@ -670,7 +689,6 @@ # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). (('ineg', ('b2i32', 'a@32')), a), (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. - (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. # Comparison with the same args. Note that these are not done for # the float versions because NaN always returns false on float # inequalities. @@ -742,6 +760,7 @@ (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), + (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), # Division and reciprocal (('~fdiv', 1.0, a), ('frcp', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), @@ -770,7 +789,7 @@ (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))), (('bcsel', a, b, b), b), - (('fcsel', a, b, b), b), + (('~fcsel', a, b, b), b), # D3D Boolean emulation (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), @@ -784,6 +803,7 @@ (('ine', ('ineg', ('b2i', 'a@1')), 0), a), (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), + (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), # SM5 32-bit shifts are defined to use the 5 least significant bits (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)), @@ -798,6 +818,11 @@ (('i2b', ('iabs', a)), ('i2b', a)), (('inot', ('f2b1', a)), ('feq', a, 0.0)), + # The C spec says, "If the value of the integral part cannot be represented + # by the integer type, the behavior is undefined." "Undefined" can mean + # "the conversion doesn't happen at all." + (('~i2f32', ('f2i32', 'a@32')), ('ftrunc', a)), + # Ironically, mark these as imprecise because removing the conversions may # preserve more precision than doing the conversions (e.g., # uint(float(0x81818181u)) == 0x81818200). @@ -840,31 +865,41 @@ (('fne', 'a(is_not_zero)', 0.0), True), (('feq', 'a(is_not_zero)', 0.0), False), + # In this chart, + means value > 0 and - means value < 0. + # + # + >= + -> unknown 0 >= + -> false - >= + -> false + # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false + # + >= - -> true 0 >= - -> true - >= - -> unknown + # + # Using grouping conceptually similar to a Karnaugh map... + # + # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true + # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false + # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false + # + # The flt / ilt cases just invert the expected result. + # # The results expecting true, must be marked imprecise. The results # expecting false are fine because NaN compared >= or < anything is false. (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True), - (('fge', 'b(is_not_positive)', 'a(is_gt_zero)'), False), + (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), - (('~fge', 'b(is_not_negative)', 'a(is_not_positive)'), True), (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), - (('~flt', 'b(is_not_positive)', 'a(is_gt_zero)'), True), + (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), (('~flt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), - (('flt', 'b(is_not_negative)', 'a(is_not_positive)'), False), (('ine', 'a(is_not_zero)', 0), True), (('ieq', 'a(is_not_zero)', 0), False), (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), - (('ige', 'b(is_not_positive)', 'a(is_gt_zero)'), False), + (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), - (('ige', 'b(is_not_negative)', 'a(is_not_positive)'), True), (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), - (('ilt', 'b(is_not_positive)', 'a(is_gt_zero)'), True), + (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), - (('ilt', 'b(is_not_negative)', 'a(is_not_positive)'), False), (('ult', 0, 'a(is_gt_zero)'), True), @@ -898,6 +933,15 @@ (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), + + # Optimize half packing + (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), + (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), + + (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), + ('pack_half_2x16', ('vec2', a, b))), + (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), + ('pack_half_2x16', ('vec2', a, b))), ]) # After the ('extract_u8', a, 0) pattern, above, triggers, there will be @@ -925,18 +969,14 @@ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), # Subtracts - (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), - (('isub', a, ('isub', 0, b)), ('iadd', a, b)), (('ussub_4x8', a, 0), a), (('ussub_4x8', a, ~0), 0), - (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'), - (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'), - (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), - (('ineg', a), ('isub', 0, a), 'options->lower_negate'), - (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), - (('iadd', a, ('isub', 0, b)), ('isub', a, b)), - (('fabs', ('fsub', 0.0, a)), ('fabs', a)), - (('iabs', ('isub', 0, a)), ('iabs', a)), + # Lower all Subtractions first - they can get recombined later + (('fsub', a, b), ('fadd', a, ('fneg', b))), + (('isub', a, b), ('iadd', a, ('ineg', b))), + (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), + # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. + (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), # Propagate negation up multiplication chains (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), @@ -986,8 +1026,83 @@ (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), + (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + + (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'), (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'), + (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + + # int64_t sum = a + b; + # + # if (a < 0 && b < 0 && a < sum) + # sum = INT64_MIN; + # } else if (a >= 0 && b >= 0 && sum < a) + # sum = INT64_MAX; + # } + # + # A couple optimizations are applied. + # + # 1. a < sum => sum >= 0. This replacement works because it is known that + # a < 0 and b < 0, so sum should also be < 0 unless there was + # underflow. + # + # 2. sum < a => sum < 0. This replacement works because it is known that + # a >= 0 and b >= 0, so sum should also be >= 0 unless there was + # overflow. + # + # 3. Invert the second if-condition and swap the order of parameters for + # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= + # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) + # + # On Intel Gen11, this saves ~11 instructions. + (('iadd_sat@64', a, b), ('bcsel', + ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), + 0x8000000000000000, + ('bcsel', + ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), + ('iadd', a, b), + 0x7fffffffffffffff)), + '(options->lower_int64_options & nir_lower_iadd64) != 0'), + + # int64_t sum = a - b; + # + # if (a < 0 && b >= 0 && a < sum) + # sum = INT64_MIN; + # } else if (a >= 0 && b < 0 && a >= sum) + # sum = INT64_MAX; + # } + # + # Optimizations similar to the iadd_sat case are applied here. + (('isub_sat@64', a, b), ('bcsel', + ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), + 0x8000000000000000, + ('bcsel', + ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), + ('isub', a, b), + 0x7fffffffffffffff)), + '(options->lower_int64_options & nir_lower_iadd64) != 0'), + + # These are done here instead of in the backend because the int64 lowering + # pass will make a mess of the patterns. The first patterns are + # conditioned on nir_lower_minmax64 because it was not clear that it was + # always an improvement on platforms that have real int64 support. No + # shaders in shader-db hit this, so it was hard to say one way or the + # other. + (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + + (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + # 0u < uint(a) <=> uint(a) != 0u + (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), # Alternative lowering that doesn't rely on bfi. (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), @@ -1110,8 +1225,41 @@ 127.0))), 'options->lower_unpack_snorm_4x8'), + (('pack_half_2x16_split', 'a@32', 'b@32'), + ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), + 'options->lower_pack_half_2x16_split'), + + (('unpack_half_2x16_split_x', 'a@32'), + ('f2f32', ('u2u16', a)), + 'options->lower_unpack_half_2x16_split'), + + (('unpack_half_2x16_split_y', 'a@32'), + ('f2f32', ('u2u16', ('ushr', a, 16))), + 'options->lower_unpack_half_2x16_split'), + (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'), + + # Address/offset calculations: + # Drivers supporting imul24 should use the nir_lower_amul() pass, this + # rule converts everyone else to imul: + (('amul', a, b), ('imul', a, b), '!options->has_imul24'), + + (('imad24_ir3', a, b, 0), ('imul24', a, b)), + (('imad24_ir3', a, 0, c), (c)), + (('imad24_ir3', a, 1, c), ('iadd', a, c)), + + # if first two srcs are const, crack apart the imad so constant folding + # can clean up the imul: + # TODO ffma should probably get a similar rule: + (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)), + + # These will turn 24b address/offset calc back into 32b shifts, but + # it should be safe to get back some of the bits of precision that we + # already decided were no necessary: + (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), + (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), + (('imul24', a, 0), (0)), ]) # bit_size dependent lowerings @@ -1129,7 +1277,7 @@ ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'), ] -invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')]) +invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')]) for left, right in itertools.combinations_with_replacement(invert.keys(), 2): optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))), @@ -1391,6 +1539,44 @@ (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), ] +# These kinds of sequences can occur after nir_opt_peephole_select. +# +# NOTE: fadd is not handled here because that gets in the way of ffma +# generation in the i965 driver. Instead, fadd and ffma are handled in +# late_optimizations. + +for op in ['flrp']: + optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), + (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), + (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), + (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), + (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), + (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), + ] + +for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: + optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), + ] + +for op in ['fpow']: + optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), + (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), + ] + +for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']: + optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))), + (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))), + ] + # This section contains "late" optimizations that should be run before # creating ffmas and calling regular optimizations for the final time. # Optimizations should go here if they help code generation and conflict @@ -1434,6 +1620,12 @@ # optimization loop can prevent other optimizations. (('fneg', ('fneg', a)), a), + # Subtractions get lowered during optimization, so we need to recombine them + (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'), + (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'), + (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), + (('ineg', a), ('isub', 0, a), 'options->lower_negate'), + # These are duplicated from the main optimizations table. The late # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create # new patterns like these. The patterns that compare with zero are removed @@ -1520,6 +1712,11 @@ (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))), + # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), + # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. + (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), + ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), + # Things that look like DPH in the source shader may get expanded to # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets # to NIR. After FFMA is generated, this can look like: @@ -1542,6 +1739,21 @@ ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), ] +for op in ['fadd']: + late_optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), + (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), + ] + +for op in ['ffma']: + late_optimizations += [ + (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), + (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), + + (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), + (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), + ] + print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", before_ffma_optimizations).render()) diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_combine_stores.c mesa-20.0.8/src/compiler/nir/nir_opt_combine_stores.c --- mesa-19.2.8/src/compiler/nir/nir_opt_combine_stores.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_combine_stores.c 2020-06-12 01:21:16.000000000 +0000 @@ -84,7 +84,7 @@ alloc_combined_store(struct combine_stores_state *state) { struct combined_store *result; - if (list_empty(&state->freelist)) { + if (list_is_empty(&state->freelist)) { result = linear_zalloc_child(state->lin_ctx, sizeof(*result)); } else { result = list_first_entry(&state->freelist, @@ -287,10 +287,11 @@ nir_foreach_instr_safe(instr, block) { if (instr->type == nir_instr_type_call) { combine_stores_with_modes(state, nir_var_shader_out | - nir_var_shader_temp | - nir_var_function_temp | - nir_var_mem_ssbo | - nir_var_mem_shared); + nir_var_shader_temp | + nir_var_function_temp | + nir_var_mem_ssbo | + nir_var_mem_shared | + nir_var_mem_global); continue; } @@ -303,17 +304,33 @@ update_combined_store(state, intrin); break; - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: + combine_stores_with_modes(state, nir_var_shader_out | + nir_var_mem_ssbo | + nir_var_mem_shared | + nir_var_mem_global); + break; + case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: + combine_stores_with_modes(state, nir_var_mem_ssbo | + nir_var_mem_global); + break; + case nir_intrinsic_memory_barrier_shared: - /* TODO: Be more granular depending on the barrier. */ - combine_stores_with_modes(state, nir_var_shader_out | - nir_var_mem_ssbo | - nir_var_mem_shared); + combine_stores_with_modes(state, nir_var_mem_shared); + break; + + case nir_intrinsic_memory_barrier_tcs_patch: + combine_stores_with_modes(state, nir_var_shader_out); + break; + + case nir_intrinsic_scoped_memory_barrier: + if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) { + combine_stores_with_modes(state, + nir_intrinsic_memory_modes(intrin)); + } break; case nir_intrinsic_emit_vertex: diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_comparison_pre.c mesa-20.0.8/src/compiler/nir/nir_opt_comparison_pre.c --- mesa-19.2.8/src/compiler/nir/nir_opt_comparison_pre.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_comparison_pre.c 2020-06-12 01:21:16.000000000 +0000 @@ -325,8 +325,8 @@ * and neither operand is immediate value 0, add it to the set. */ if (is_used_by_if(alu) && - is_not_const_zero(alu, 0, 1, swizzle) && - is_not_const_zero(alu, 1, 1, swizzle)) + is_not_const_zero(NULL, alu, 0, 1, swizzle) && + is_not_const_zero(NULL, alu, 1, 1, swizzle)) add_instruction_for_block(bi, alu); break; diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_constant_folding.c mesa-20.0.8/src/compiler/nir/nir_opt_constant_folding.c --- mesa-19.2.8/src/compiler/nir/nir_opt_constant_folding.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_constant_folding.c 2020-06-12 01:21:16.000000000 +0000 @@ -33,13 +33,14 @@ */ struct constant_fold_state { - void *mem_ctx; - nir_function_impl *impl; - bool progress; + nir_shader *shader; + unsigned execution_mode; + bool has_load_constant; + bool has_indirect_load_const; }; static bool -constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) +constant_fold_alu_instr(struct constant_fold_state *state, nir_alu_instr *instr) { nir_const_value src[NIR_MAX_VEC_COMPONENTS][NIR_MAX_VEC_COMPONENTS]; @@ -94,10 +95,10 @@ for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; ++i) srcs[i] = src[i]; nir_eval_const_opcode(instr->op, dest, instr->dest.dest.ssa.num_components, - bit_size, srcs); + bit_size, srcs, state->execution_mode); nir_load_const_instr *new_instr = - nir_load_const_instr_create(mem_ctx, + nir_load_const_instr_create(state->shader, instr->dest.dest.ssa.num_components, instr->dest.dest.ssa.bit_size); @@ -115,7 +116,7 @@ } static bool -constant_fold_intrinsic_instr(nir_intrinsic_instr *instr) +constant_fold_intrinsic_instr(struct constant_fold_state *state, nir_intrinsic_instr *instr) { bool progress = false; @@ -123,19 +124,10 @@ instr->intrinsic == nir_intrinsic_discard_if) && nir_src_is_const(instr->src[0])) { if (nir_src_as_bool(instr->src[0])) { - /* This method of getting a nir_shader * from a nir_instr is - * admittedly gross, but given the rarity of hitting this case I think - * it's preferable to plumbing an otherwise unused nir_shader * - * parameter through four functions to get here. - */ - nir_cf_node *cf_node = &instr->instr.block->cf_node; - nir_function_impl *impl = nir_cf_node_get_function(cf_node); - nir_shader *shader = impl->function->shader; - nir_intrinsic_op op = instr->intrinsic == nir_intrinsic_discard_if ? nir_intrinsic_discard : nir_intrinsic_demote; - nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(shader, op); + nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(state->shader, op); nir_instr_insert_before(&instr->instr, &new_instr->instr); nir_instr_remove(&instr->instr); progress = true; @@ -144,24 +136,68 @@ nir_instr_remove(&instr->instr); progress = true; } + } else if (instr->intrinsic == nir_intrinsic_load_constant) { + state->has_load_constant = true; + + if (!nir_src_is_const(instr->src[0])) { + state->has_indirect_load_const = true; + return progress; + } + + unsigned offset = nir_src_as_uint(instr->src[0]); + unsigned base = nir_intrinsic_base(instr); + unsigned range = nir_intrinsic_range(instr); + assert(base + range <= state->shader->constant_data_size); + + nir_instr *new_instr = NULL; + if (offset >= range) { + nir_ssa_undef_instr *undef = + nir_ssa_undef_instr_create(state->shader, + instr->num_components, + instr->dest.ssa.bit_size); + + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&undef->def)); + new_instr = &undef->instr; + } else { + nir_load_const_instr *load_const = + nir_load_const_instr_create(state->shader, + instr->num_components, + instr->dest.ssa.bit_size); + + uint8_t *data = (uint8_t*)state->shader->constant_data + base; + for (unsigned i = 0; i < instr->num_components; i++) { + unsigned bytes = instr->dest.ssa.bit_size / 8; + bytes = MIN2(bytes, range - offset); + + memcpy(&load_const->value[i].u64, data + offset, bytes); + offset += bytes; + } + + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&load_const->def)); + new_instr = &load_const->instr; + } + + nir_instr_insert_before(&instr->instr, new_instr); + nir_instr_remove(&instr->instr); + progress = true; } return progress; } static bool -constant_fold_block(nir_block *block, void *mem_ctx) +constant_fold_block(struct constant_fold_state *state, nir_block *block) { bool progress = false; nir_foreach_instr_safe(instr, block) { switch (instr->type) { case nir_instr_type_alu: - progress |= constant_fold_alu_instr(nir_instr_as_alu(instr), mem_ctx); + progress |= constant_fold_alu_instr(state, nir_instr_as_alu(instr)); break; case nir_instr_type_intrinsic: progress |= - constant_fold_intrinsic_instr(nir_instr_as_intrinsic(instr)); + constant_fold_intrinsic_instr(state, nir_instr_as_intrinsic(instr)); break; default: /* Don't know how to constant fold */ @@ -173,13 +209,12 @@ } static bool -nir_opt_constant_folding_impl(nir_function_impl *impl) +nir_opt_constant_folding_impl(struct constant_fold_state *state, nir_function_impl *impl) { - void *mem_ctx = ralloc_parent(impl); bool progress = false; nir_foreach_block(block, impl) { - progress |= constant_fold_block(block, mem_ctx); + progress |= constant_fold_block(state, block); } if (progress) { @@ -198,10 +233,25 @@ nir_opt_constant_folding(nir_shader *shader) { bool progress = false; + struct constant_fold_state state; + state.shader = shader; + state.execution_mode = shader->info.float_controls_execution_mode; + state.has_load_constant = false; + state.has_indirect_load_const = false; nir_foreach_function(function, shader) { if (function->impl) - progress |= nir_opt_constant_folding_impl(function->impl); + progress |= nir_opt_constant_folding_impl(&state, function->impl); + } + + /* This doesn't free the constant data if there are no constant loads because + * the data might still be used but the loads have been lowered to load_ubo + */ + if (state.has_load_constant && !state.has_indirect_load_const && + shader->constant_data_size) { + ralloc_free(shader->constant_data); + shader->constant_data = NULL; + shader->constant_data_size = 0; } return progress; diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_copy_prop_vars.c mesa-20.0.8/src/compiler/nir/nir_opt_copy_prop_vars.c --- mesa-19.2.8/src/compiler/nir/nir_opt_copy_prop_vars.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_copy_prop_vars.c 2020-06-12 01:21:16.000000000 +0000 @@ -155,7 +155,8 @@ nir_var_shader_temp | nir_var_function_temp | nir_var_mem_ssbo | - nir_var_mem_shared; + nir_var_mem_shared | + nir_var_mem_global; continue; } @@ -164,11 +165,18 @@ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: + case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: written->modes |= nir_var_shader_out | nir_var_mem_ssbo | - nir_var_mem_shared; + nir_var_mem_shared | + nir_var_mem_global; + break; + + case nir_intrinsic_scoped_memory_barrier: + if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE) + written->modes |= nir_intrinsic_memory_modes(intrin); break; case nir_intrinsic_emit_vertex: @@ -783,7 +791,8 @@ nir_var_shader_temp | nir_var_function_temp | nir_var_mem_ssbo | - nir_var_mem_shared); + nir_var_mem_shared | + nir_var_mem_global); if (debug) dump_copy_entries(copies); continue; } @@ -793,13 +802,40 @@ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: case nir_intrinsic_memory_barrier: if (debug) dump_instr(instr); apply_barrier_for_modes(copies, nir_var_shader_out | nir_var_mem_ssbo | - nir_var_mem_shared); + nir_var_mem_shared | + nir_var_mem_global); + break; + + case nir_intrinsic_memory_barrier_buffer: + if (debug) dump_instr(instr); + + apply_barrier_for_modes(copies, nir_var_mem_ssbo | + nir_var_mem_global); + break; + + case nir_intrinsic_memory_barrier_shared: + if (debug) dump_instr(instr); + + apply_barrier_for_modes(copies, nir_var_mem_shared); + break; + + case nir_intrinsic_memory_barrier_tcs_patch: + if (debug) dump_instr(instr); + + apply_barrier_for_modes(copies, nir_var_shader_out); + break; + + case nir_intrinsic_scoped_memory_barrier: + if (debug) dump_instr(instr); + + if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE) + apply_barrier_for_modes(copies, nir_intrinsic_memory_modes(intrin)); break; case nir_intrinsic_emit_vertex: @@ -832,6 +868,7 @@ b->cursor = nir_instr_remove(instr); nir_ssa_def *u = nir_ssa_undef(b, 1, intrin->dest.ssa.bit_size); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(u)); + state->progress = true; break; } } @@ -915,6 +952,7 @@ /* Storing to an invalid index is a no-op. */ if (vec_index >= vec_comps) { nir_instr_remove(instr); + state->progress = true; break; } } @@ -926,6 +964,7 @@ * store is redundant so remove it. */ nir_instr_remove(instr); + state->progress = true; } else { struct value value = {0}; value_set_ssa_components(&value, intrin->src[1].ssa, @@ -952,6 +991,7 @@ if (nir_compare_derefs(src, dst) & nir_derefs_equal_bit) { /* This is a no-op self-copy. Get rid of it */ nir_instr_remove(instr); + state->progress = true; continue; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_dead_write_vars.c mesa-20.0.8/src/compiler/nir/nir_opt_dead_write_vars.c --- mesa-19.2.8/src/compiler/nir/nir_opt_dead_write_vars.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_dead_write_vars.c 2020-06-12 01:21:16.000000000 +0000 @@ -122,7 +122,8 @@ nir_var_shader_temp | nir_var_function_temp | nir_var_mem_ssbo | - nir_var_mem_shared); + nir_var_mem_shared | + nir_var_mem_global); continue; } @@ -131,11 +132,34 @@ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: + case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: { clear_unused_for_modes(&unused_writes, nir_var_shader_out | nir_var_mem_ssbo | - nir_var_mem_shared); + nir_var_mem_shared | + nir_var_mem_global); + break; + } + + case nir_intrinsic_memory_barrier_buffer: + clear_unused_for_modes(&unused_writes, nir_var_mem_ssbo | + nir_var_mem_global); + break; + + case nir_intrinsic_memory_barrier_shared: + clear_unused_for_modes(&unused_writes, nir_var_mem_shared); + break; + + case nir_intrinsic_memory_barrier_tcs_patch: + clear_unused_for_modes(&unused_writes, nir_var_shader_out); + break; + + case nir_intrinsic_scoped_memory_barrier: { + if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) { + clear_unused_for_modes(&unused_writes, + nir_intrinsic_memory_modes(intrin)); + } break; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_if.c mesa-20.0.8/src/compiler/nir/nir_opt_if.c --- mesa-19.2.8/src/compiler/nir/nir_opt_if.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_if.c 2020-06-12 01:21:16.000000000 +0000 @@ -309,35 +309,29 @@ * * - At least one source of the instruction is a phi node from the header block. * - * and either this rule + * - The phi node selects a constant or undef from the block before the loop. * - * - The phi node selects undef from the block before the loop and a value - * from the continue block of the loop. - * - * or these two rules - * - * - The phi node selects a constant from the block before the loop. - * - * - The non-phi source of the ALU instruction comes from a block that + * - Any non-phi sources of the ALU instruction come from a block that * dominates the block before the loop. The most common failure mode for * this check is sources that are generated in the loop header block. * - * The split process moves the original ALU instruction to the bottom of the - * loop. The phi node source is replaced with the value from the phi node - * selected from the continue block (i.e., the non-undef value). A new phi - * node is added to the header block that selects either undef from the block - * before the loop or the result of the (moved) ALU instruction. + * The split process splits the original ALU instruction into two, one at the + * bottom of the loop and one at the block before the loop. The instruction + * before the loop computes the value on the first iteration, and the + * instruction at the bottom computes the value on the second, third, and so + * on. A new phi node is added to the header block that selects either the + * instruction before the loop or the one at the end, and uses of the original + * instruction are replaced by this phi. * * The splitting transforms a loop like: * - * vec1 32 ssa_7 = undefined * vec1 32 ssa_8 = load_const (0x00000001) * vec1 32 ssa_10 = load_const (0x00000000) * // succs: block_1 * loop { * block block_1: * // preds: block_0 block_4 - * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 + * vec1 32 ssa_11 = phi block_0: ssa_10, block_4: ssa_15 * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 * vec1 32 ssa_14 = iadd ssa_11, ssa_8 @@ -348,27 +342,22 @@ * * into: * - * vec1 32 ssa_7 = undefined * vec1 32 ssa_8 = load_const (0x00000001) * vec1 32 ssa_10 = load_const (0x00000000) + * vec1 32 ssa_22 = iadd ssa_10, ssa_8 * // succs: block_1 * loop { * block block_1: * // preds: block_0 block_4 - * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 + * vec1 32 ssa_11 = phi block_0: ssa_10, block_4: ssa_15 * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 - * vec1 32 ssa_21 = phi block_0: sss_7, block_4: ssa_20 + * vec1 32 ssa_21 = phi block_0: ssa_22, block_4: ssa_20 * vec1 32 ssa_15 = b32csel ssa_13, ssa_21, ssa_12 * ... * vec1 32 ssa_20 = iadd ssa_15, ssa_8 * // succs: block_1 * } - * - * If the phi does not select an undef, the instruction is duplicated in the - * loop continue block (as in the undef case) and in the previous block. When - * the ALU instruction is duplicated in the previous block, the correct source - * must be selected from the phi node. */ static bool opt_split_alu_of_phi(nir_builder *b, nir_loop *loop) @@ -394,19 +383,12 @@ nir_alu_instr *const alu = nir_instr_as_alu(instr); - /* Most ALU ops produce an undefined result if any source is undef. - * However, operations like bcsel only produce undefined results of the - * first operand is undef. Even in the undefined case, the result - * should be one of the other two operands, so the result of the bcsel - * should never be replaced with undef. - * - * nir_op_vec{2,3,4} and nir_op_mov are excluded because they can easily - * lead to infinite optimization loops. + /* nir_op_vec{2,3,4} and nir_op_mov are excluded because they can easily + * lead to infinite optimization loops. Splitting comparisons can lead + * to loop unrolling not recognizing loop termintators, and type + * conversions also lead to regressions. */ - if (alu->op == nir_op_bcsel || - alu->op == nir_op_b32csel || - alu->op == nir_op_fcsel || - alu->op == nir_op_vec2 || + if (alu->op == nir_op_vec2 || alu->op == nir_op_vec3 || alu->op == nir_op_vec4 || alu->op == nir_op_mov || @@ -477,26 +459,9 @@ if (has_phi_src_from_prev_block && all_non_phi_exist_in_prev_block && (is_prev_result_undef || is_prev_result_const)) { nir_block *const continue_block = find_continue_block(loop); - nir_ssa_def *prev_value; - if (!is_prev_result_undef) { - b->cursor = nir_after_block(prev_block); - prev_value = clone_alu_and_replace_src_defs(b, alu, prev_srcs); - } else { - /* Since the undef used as the source of the original ALU - * instruction may have different number of components or - * bit size than the result of that instruction, a new - * undef must be created. - */ - nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(b->shader, - alu->dest.dest.ssa.num_components, - alu->dest.dest.ssa.bit_size); - - nir_instr_insert_after_block(prev_block, &undef->instr); - - prev_value = &undef->def; - } + b->cursor = nir_after_block(prev_block); + nir_ssa_def *prev_value = clone_alu_and_replace_src_defs(b, alu, prev_srcs); /* Make a copy of the original ALU instruction. Replace the sources * of the new instruction that read a phi with an undef source from @@ -670,7 +635,7 @@ * bcsel that must come before any break. * * For more details, see - * https://gitlab.freedesktop.org/mesa/mesa/merge_requests/170#note_110305 + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/170#note_110305 */ nir_foreach_instr_safe(instr, header_block) { if (instr->type != nir_instr_type_alu) diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_large_constants.c mesa-20.0.8/src/compiler/nir/nir_opt_large_constants.c --- mesa-19.2.8/src/compiler/nir/nir_opt_large_constants.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_large_constants.c 2020-06-12 01:21:16.000000000 +0000 @@ -179,19 +179,15 @@ /* This pass can only be run once */ assert(shader->constant_data == NULL && shader->constant_data_size == 0); - /* The index parameter is unused for local variables so we'll use it for - * indexing into our array of variable metadata. - */ - unsigned num_locals = 0; - nir_foreach_variable(var, &impl->locals) - var->data.index = num_locals++; + unsigned num_locals = exec_list_length(&impl->locals); + nir_index_vars(shader, impl, nir_var_function_temp); if (num_locals == 0) return false; struct var_info *var_infos = ralloc_array(NULL, struct var_info, num_locals); nir_foreach_variable(var, &impl->locals) { - var_infos[var->data.index] = (struct var_info) { + var_infos[var->index] = (struct var_info) { .var = var, .is_constant = true, .found_read = false, @@ -225,13 +221,7 @@ break; case nir_intrinsic_copy_deref: - /* We always assume the src and therefore the dst are not - * constants here. Copy and constant propagation passes should - * have taken care of this in most cases anyway. - */ - dst_deref = nir_src_as_deref(intrin->src[0]); - src_deref = nir_src_as_deref(intrin->src[1]); - src_is_const = false; + assert(!"Lowering of copy_deref with large constants is prohibited"); break; default: @@ -242,7 +232,7 @@ nir_variable *var = nir_deref_instr_get_variable(dst_deref); assert(var->data.mode == nir_var_function_temp); - struct var_info *info = &var_infos[var->data.index]; + struct var_info *info = &var_infos[var->index]; if (!info->is_constant) continue; @@ -270,7 +260,7 @@ /* We only consider variables constant if all the reads are * dominated by the block that writes to it. */ - struct var_info *info = &var_infos[var->data.index]; + struct var_info *info = &var_infos[var->index]; if (!info->is_constant) continue; @@ -292,7 +282,7 @@ struct var_info *info = &var_infos[i]; /* Fix up indices after we sorted. */ - info->var->data.index = i; + info->var->index = i; if (!info->is_constant) continue; @@ -345,7 +335,7 @@ continue; nir_variable *var = nir_deref_instr_get_variable(deref); - struct var_info *info = &var_infos[var->data.index]; + struct var_info *info = &var_infos[var->index]; if (info->is_constant) { b.cursor = nir_after_instr(&intrin->instr); nir_ssa_def *val = build_constant_load(&b, deref, size_align); @@ -363,31 +353,14 @@ continue; nir_variable *var = nir_deref_instr_get_variable(deref); - struct var_info *info = &var_infos[var->data.index]; - if (info->is_constant) { - nir_instr_remove(&intrin->instr); - nir_deref_instr_remove_if_unused(deref); - } - break; - } - - case nir_intrinsic_copy_deref: { - nir_deref_instr *deref = nir_src_as_deref(intrin->src[1]); - if (deref->mode != nir_var_function_temp) - continue; - - nir_variable *var = nir_deref_instr_get_variable(deref); - struct var_info *info = &var_infos[var->data.index]; + struct var_info *info = &var_infos[var->index]; if (info->is_constant) { - b.cursor = nir_after_instr(&intrin->instr); - nir_ssa_def *val = build_constant_load(&b, deref, size_align); - nir_store_deref(&b, nir_src_as_deref(intrin->src[0]), val, ~0); nir_instr_remove(&intrin->instr); nir_deref_instr_remove_if_unused(deref); } break; } - + case nir_intrinsic_copy_deref: default: continue; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_load_store_vectorize.c mesa-20.0.8/src/compiler/nir/nir_opt_load_store_vectorize.c --- mesa-19.2.8/src/compiler/nir/nir_opt_load_store_vectorize.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_load_store_vectorize.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1305 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Although it's called a load/store "vectorization" pass, this also combines + * intersecting and identical loads/stores. It currently supports derefs, ubo, + * ssbo and push constant loads/stores. + * + * This doesn't handle copy_deref intrinsics and assumes that + * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU + * modifiers. It also assumes that derefs have explicitly laid out types. + * + * After vectorization, the backend may want to call nir_lower_alu_to_scalar() + * and nir_lower_pack(). Also this creates cast instructions taking derefs as a + * source and some parts of NIR may not be able to handle that well. + * + * There are a few situations where this doesn't vectorize as well as it could: + * - It won't turn four consecutive vec3 loads into 3 vec4 loads. + * - It doesn't do global vectorization. + * Handling these cases probably wouldn't provide much benefit though. +*/ + +#include "nir.h" +#include "nir_deref.h" +#include "nir_builder.h" +#include "nir_worklist.h" +#include "util/u_dynarray.h" + +#include + +struct intrinsic_info { + nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */ + nir_intrinsic_op op; + bool is_atomic; + /* Indices into nir_intrinsic::src[] or -1 if not applicable. */ + int resource_src; /* resource (e.g. from vulkan_resource_index) */ + int base_src; /* offset which it loads/stores from */ + int deref_src; /* deref which is loads/stores from */ + int value_src; /* the data it is storing */ +}; + +static const struct intrinsic_info * +get_info(nir_intrinsic_op op) { + switch (op) { +#define INFO(mode, op, atomic, res, base, deref, val) \ +case nir_intrinsic_##op: {\ + static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\ + return &op##_info;\ +} +#define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1) +#define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val) +#define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val) + LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1) + LOAD(nir_var_mem_ubo, ubo, 0, 1, -1) + LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1) + STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0) + LOAD(0, deref, -1, -1, 0) + STORE(0, deref, -1, -1, 0, 1) + LOAD(nir_var_mem_shared, shared, -1, 0, -1) + STORE(nir_var_mem_shared, shared, -1, 1, -1, 0) + ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2) + ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2) + ATOMIC(0, deref, add, -1, -1, 0, 1) + ATOMIC(0, deref, imin, -1, -1, 0, 1) + ATOMIC(0, deref, umin, -1, -1, 0, 1) + ATOMIC(0, deref, imax, -1, -1, 0, 1) + ATOMIC(0, deref, umax, -1, -1, 0, 1) + ATOMIC(0, deref, and, -1, -1, 0, 1) + ATOMIC(0, deref, or, -1, -1, 0, 1) + ATOMIC(0, deref, xor, -1, -1, 0, 1) + ATOMIC(0, deref, exchange, -1, -1, 0, 1) + ATOMIC(0, deref, comp_swap, -1, -1, 0, 1) + ATOMIC(0, deref, fadd, -1, -1, 0, 1) + ATOMIC(0, deref, fmin, -1, -1, 0, 1) + ATOMIC(0, deref, fmax, -1, -1, 0, 1) + ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1) + ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1) + ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1) + default: + break; +#undef ATOMIC +#undef STORE +#undef LOAD +#undef INFO + } + return NULL; +} + +/* + * Information used to compare memory operations. + * It canonically represents an offset as: + * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...` + * "offset_defs" is sorted in ascenting order by the ssa definition's index. + * "resource" or "var" may be NULL. + */ +struct entry_key { + nir_ssa_def *resource; + nir_variable *var; + unsigned offset_def_count; + nir_ssa_def **offset_defs; + uint64_t *offset_defs_mul; +}; + +/* Information on a single memory operation. */ +struct entry { + struct list_head head; + unsigned index; + + struct entry_key *key; + union { + uint64_t offset; /* sign-extended */ + int64_t offset_signed; + }; + uint32_t best_align; + + nir_instr *instr; + nir_intrinsic_instr *intrin; + const struct intrinsic_info *info; + enum gl_access_qualifier access; + bool is_store; + + nir_deref_instr *deref; +}; + +struct vectorize_ctx { + nir_variable_mode modes; + nir_should_vectorize_mem_func callback; + struct list_head entries[nir_num_variable_modes]; + struct hash_table *loads[nir_num_variable_modes]; + struct hash_table *stores[nir_num_variable_modes]; +}; + +static uint32_t hash_entry_key(const void *key_) +{ + /* this is careful to not include pointers in the hash calculation so that + * the order of the hash table walk is deterministic */ + struct entry_key *key = (struct entry_key*)key_; + + uint32_t hash = _mesa_fnv32_1a_offset_bias; + if (key->resource) + hash = _mesa_fnv32_1a_accumulate(hash, key->resource->index); + if (key->var) { + hash = _mesa_fnv32_1a_accumulate(hash, key->var->index); + unsigned mode = key->var->data.mode; + hash = _mesa_fnv32_1a_accumulate(hash, mode); + } + + for (unsigned i = 0; i < key->offset_def_count; i++) + hash = _mesa_fnv32_1a_accumulate(hash, key->offset_defs[i]->index); + + hash = _mesa_fnv32_1a_accumulate_block( + hash, key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t)); + + return hash; +} + +static bool entry_key_equals(const void *a_, const void *b_) +{ + struct entry_key *a = (struct entry_key*)a_; + struct entry_key *b = (struct entry_key*)b_; + + if (a->var != b->var || a->resource != b->resource) + return false; + + if (a->offset_def_count != b->offset_def_count) + return false; + + size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *); + size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t); + if (a->offset_def_count && + (memcmp(a->offset_defs, b->offset_defs, offset_def_size) || + memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size))) + return false; + + return true; +} + +static void delete_entry_dynarray(struct hash_entry *entry) +{ + struct util_dynarray *arr = (struct util_dynarray *)entry->data; + ralloc_free(arr); +} + +static int sort_entries(const void *a_, const void *b_) +{ + struct entry *a = *(struct entry*const*)a_; + struct entry *b = *(struct entry*const*)b_; + + if (a->offset_signed > b->offset_signed) + return 1; + else if (a->offset_signed < b->offset_signed) + return -1; + else + return 0; +} + +static unsigned +get_bit_size(struct entry *entry) +{ + unsigned size = entry->is_store ? + entry->intrin->src[entry->info->value_src].ssa->bit_size : + entry->intrin->dest.ssa.bit_size; + return size == 1 ? 32u : size; +} + +/* If "def" is from an alu instruction with the opcode "op" and one of it's + * sources is a constant, update "def" to be the non-constant source, fill "c" + * with the constant and return true. */ +static bool +parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c) +{ + nir_ssa_scalar scalar; + scalar.def = *def; + scalar.comp = 0; + + if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op) + return false; + + nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0); + nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1); + if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) { + *c = nir_ssa_scalar_as_uint(src0); + *def = src1.def; + } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) { + *c = nir_ssa_scalar_as_uint(src1); + *def = src0.def; + } else { + return false; + } + return true; +} + +/* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */ +static void +parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset) +{ + if ((*base)->parent_instr->type == nir_instr_type_load_const) { + *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0); + *base = NULL; + return; + } + + uint64_t mul = 1; + uint64_t add = 0; + bool progress = false; + do { + uint64_t mul2 = 1, add2 = 0; + + progress = parse_alu(base, nir_op_imul, &mul2); + mul *= mul2; + + mul2 = 0; + progress |= parse_alu(base, nir_op_ishl, &mul2); + mul <<= mul2; + + progress |= parse_alu(base, nir_op_iadd, &add2); + add += add2 * mul; + } while (progress); + + *base_mul = mul; + *offset = add; +} + +static unsigned +type_scalar_size_bytes(const struct glsl_type *type) +{ + assert(glsl_type_is_vector_or_scalar(type) || + glsl_type_is_matrix(type)); + return glsl_type_is_boolean(type) ? 4u : glsl_get_bit_size(type) / 8u; +} + +static int +get_array_stride(const struct glsl_type *type) +{ + unsigned explicit_stride = glsl_get_explicit_stride(type); + if ((glsl_type_is_matrix(type) && + glsl_matrix_type_is_row_major(type)) || + (glsl_type_is_vector(type) && explicit_stride == 0)) + return type_scalar_size_bytes(type); + return explicit_stride; +} + +static uint64_t +mask_sign_extend(uint64_t val, unsigned bit_size) +{ + return (int64_t)(val << (64 - bit_size)) >> (64 - bit_size); +} + +static unsigned +add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul, + unsigned offset_def_count, nir_ssa_def *def, uint64_t mul) +{ + mul = mask_sign_extend(mul, def->bit_size); + + for (unsigned i = 0; i <= offset_def_count; i++) { + if (i == offset_def_count || def->index > offset_defs[i]->index) { + /* insert before i */ + memmove(offset_defs + i + 1, offset_defs + i, + (offset_def_count - i) * sizeof(nir_ssa_def *)); + memmove(offset_defs_mul + i + 1, offset_defs_mul + i, + (offset_def_count - i) * sizeof(uint64_t)); + offset_defs[i] = def; + offset_defs_mul[i] = mul; + return 1; + } else if (def->index == offset_defs[i]->index) { + /* merge with offset_def at i */ + offset_defs_mul[i] += mul; + return 0; + } + } + unreachable("Unreachable."); + return 0; +} + +static struct entry_key * +create_entry_key_from_deref(void *mem_ctx, + struct vectorize_ctx *ctx, + nir_deref_path *path, + uint64_t *offset_base) +{ + unsigned path_len = 0; + while (path->path[path_len]) + path_len++; + + nir_ssa_def *offset_defs_stack[32]; + uint64_t offset_defs_mul_stack[32]; + nir_ssa_def **offset_defs = offset_defs_stack; + uint64_t *offset_defs_mul = offset_defs_mul_stack; + if (path_len > 32) { + offset_defs = malloc(path_len * sizeof(nir_ssa_def *)); + offset_defs_mul = malloc(path_len * sizeof(uint64_t)); + } + unsigned offset_def_count = 0; + + struct entry_key* key = ralloc(mem_ctx, struct entry_key); + key->resource = NULL; + key->var = NULL; + *offset_base = 0; + + for (unsigned i = 0; i < path_len; i++) { + nir_deref_instr *parent = i ? path->path[i - 1] : NULL; + nir_deref_instr *deref = path->path[i]; + + switch (deref->deref_type) { + case nir_deref_type_var: { + assert(!parent); + key->var = deref->var; + break; + } + case nir_deref_type_array: + case nir_deref_type_ptr_as_array: { + assert(parent); + nir_ssa_def *index = deref->arr.index.ssa; + uint32_t stride; + if (deref->deref_type == nir_deref_type_ptr_as_array) + stride = nir_deref_instr_ptr_as_array_stride(deref); + else + stride = get_array_stride(parent->type); + + nir_ssa_def *base = index; + uint64_t offset = 0, base_mul = 1; + parse_offset(&base, &base_mul, &offset); + offset = mask_sign_extend(offset, index->bit_size); + + *offset_base += offset * stride; + if (base) { + offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul, + offset_def_count, + base, base_mul * stride); + } + break; + } + case nir_deref_type_struct: { + assert(parent); + int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index); + *offset_base += offset; + break; + } + case nir_deref_type_cast: { + if (!parent) + key->resource = deref->parent.ssa; + break; + } + default: + unreachable("Unhandled deref type"); + } + } + + key->offset_def_count = offset_def_count; + key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count); + key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count); + memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *)); + memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t)); + + if (offset_defs != offset_defs_stack) + free(offset_defs); + if (offset_defs_mul != offset_defs_mul_stack) + free(offset_defs_mul); + + return key; +} + +static unsigned +parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left, + nir_ssa_def *base, uint64_t base_mul, uint64_t *offset) +{ + uint64_t new_mul; + uint64_t new_offset; + parse_offset(&base, &new_mul, &new_offset); + *offset += new_offset * base_mul; + + if (!base) + return 0; + + base_mul *= new_mul; + + assert(left >= 1); + + if (left >= 2) { + nir_ssa_scalar scalar; + scalar.def = base; + scalar.comp = 0; + if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) { + nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0); + nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1); + if (src0.comp == 0 && src1.comp == 0) { + unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset); + amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset); + return amount; + } + } + } + + return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul); +} + +static struct entry_key * +create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset) +{ + struct entry_key *key = ralloc(mem_ctx, struct entry_key); + key->resource = NULL; + key->var = NULL; + if (base) { + nir_ssa_def *offset_defs[32]; + uint64_t offset_defs_mul[32]; + key->offset_defs = offset_defs; + key->offset_defs_mul = offset_defs_mul; + + key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset); + + key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count); + key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count); + memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *)); + memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t)); + } else { + key->offset_def_count = 0; + key->offset_defs = NULL; + key->offset_defs_mul = NULL; + } + return key; +} + +static nir_variable_mode +get_variable_mode(struct entry *entry) +{ + if (entry->info->mode) + return entry->info->mode; + assert(entry->deref); + return entry->deref->mode; +} + +static struct entry * +create_entry(struct vectorize_ctx *ctx, + const struct intrinsic_info *info, + nir_intrinsic_instr *intrin) +{ + struct entry *entry = rzalloc(ctx, struct entry); + entry->intrin = intrin; + entry->instr = &intrin->instr; + entry->info = info; + entry->best_align = UINT32_MAX; + entry->is_store = entry->info->value_src >= 0; + + if (entry->info->deref_src >= 0) { + entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]); + nir_deref_path path; + nir_deref_path_init(&path, entry->deref, NULL); + entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset); + nir_deref_path_finish(&path); + } else { + nir_ssa_def *base = entry->info->base_src >= 0 ? + intrin->src[entry->info->base_src].ssa : NULL; + uint64_t offset = 0; + if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_BASE]) + offset += nir_intrinsic_base(intrin); + entry->key = create_entry_key_from_offset(entry, base, 1, &offset); + entry->offset = offset; + + if (base) + entry->offset = mask_sign_extend(entry->offset, base->bit_size); + } + + if (entry->info->resource_src >= 0) + entry->key->resource = intrin->src[entry->info->resource_src].ssa; + + if (nir_intrinsic_infos[intrin->intrinsic].index_map[NIR_INTRINSIC_ACCESS]) + entry->access = nir_intrinsic_access(intrin); + else if (entry->key->var) + entry->access = entry->key->var->data.access; + + uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out; + restrict_modes |= nir_var_shader_temp | nir_var_function_temp; + restrict_modes |= nir_var_uniform | nir_var_mem_push_const; + restrict_modes |= nir_var_system_value | nir_var_mem_shared; + if (get_variable_mode(entry) & restrict_modes) + entry->access |= ACCESS_RESTRICT; + + return entry; +} + +static nir_deref_instr * +cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref) +{ + if (glsl_get_components(deref->type) == num_components && + type_scalar_size_bytes(deref->type)*8u == bit_size) + return deref; + + enum glsl_base_type types[] = { + GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64}; + enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u]; + const struct glsl_type *type = glsl_vector_type(base, num_components); + + if (deref->type == type) + return deref; + + return nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, type, 0); +} + +/* Return true if the write mask "write_mask" of a store with "old_bit_size" + * bits per element can be represented for a store with "new_bit_size" bits per + * element. */ +static bool +writemask_representable(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size) +{ + while (write_mask) { + int start, count; + u_bit_scan_consecutive_range(&write_mask, &start, &count); + start *= old_bit_size; + count *= old_bit_size; + if (start % new_bit_size != 0) + return false; + if (count % new_bit_size != 0) + return false; + } + return true; +} + +static uint64_t +gcd(uint64_t a, uint64_t b) +{ + while (b) { + uint64_t old_b = b; + b = a % b; + a = old_b; + } + return a; +} + +static uint32_t +get_best_align(struct entry *entry) +{ + if (entry->best_align != UINT32_MAX) + return entry->best_align; + + uint64_t best_align = entry->offset; + for (unsigned i = 0; i < entry->key->offset_def_count; i++) { + if (!best_align) + best_align = entry->key->offset_defs_mul[i]; + else if (entry->key->offset_defs_mul[i]) + best_align = gcd(best_align, entry->key->offset_defs_mul[i]); + } + + if (nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL]) + best_align = MAX2(best_align, nir_intrinsic_align(entry->intrin)); + + /* ensure the result is a power of two that fits in a int32_t */ + entry->best_align = gcd(best_align, 1u << 30); + + return entry->best_align; +} + +/* Return true if "new_bit_size" is a usable bit size for a vectorized load/store + * of "low" and "high". */ +static bool +new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size, + struct entry *low, struct entry *high, unsigned size) +{ + if (size % new_bit_size != 0) + return false; + + unsigned new_num_components = size / new_bit_size; + if (!nir_num_components_valid(new_num_components)) + return false; + + unsigned high_offset = high->offset_signed - low->offset_signed; + + /* check nir_extract_bits limitations */ + unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high)); + common_bit_size = MIN2(common_bit_size, new_bit_size); + if (high_offset > 0) + common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1))); + if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS) + return false; + + if (!ctx->callback(get_best_align(low), new_bit_size, new_num_components, + high_offset, low->intrin, high->intrin)) + return false; + + if (low->is_store) { + unsigned low_size = low->intrin->num_components * get_bit_size(low); + unsigned high_size = high->intrin->num_components * get_bit_size(high); + + if (low_size % new_bit_size != 0) + return false; + if (high_size % new_bit_size != 0) + return false; + + unsigned write_mask = nir_intrinsic_write_mask(low->intrin); + if (!writemask_representable(write_mask, low_size, new_bit_size)) + return false; + + write_mask = nir_intrinsic_write_mask(high->intrin); + if (!writemask_representable(write_mask, high_size, new_bit_size)) + return false; + } + + return true; +} + +/* Updates a write mask, "write_mask", so that it can be used with a + * "new_bit_size"-bit store instead of a "old_bit_size"-bit store. */ +static uint32_t +update_writemask(unsigned write_mask, unsigned old_bit_size, unsigned new_bit_size) +{ + uint32_t res = 0; + while (write_mask) { + int start, count; + u_bit_scan_consecutive_range(&write_mask, &start, &count); + start = start * old_bit_size / new_bit_size; + count = count * old_bit_size / new_bit_size; + res |= ((1 << count) - 1) << start; + } + return res; +} + +static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset) +{ + /* avoid adding another deref to the path */ + if (deref->deref_type == nir_deref_type_ptr_as_array && + nir_src_is_const(deref->arr.index) && + offset % nir_deref_instr_ptr_as_array_stride(deref) == 0) { + unsigned stride = nir_deref_instr_ptr_as_array_stride(deref); + nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride, + deref->dest.ssa.bit_size); + return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index); + } + + if (deref->deref_type == nir_deref_type_array && + nir_src_is_const(deref->arr.index)) { + nir_deref_instr *parent = nir_deref_instr_parent(deref); + unsigned stride = glsl_get_explicit_stride(parent->type); + if (offset % stride == 0) + return nir_build_deref_array_imm( + b, parent, nir_src_as_int(deref->arr.index) - offset / stride); + } + + + deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->mode, + glsl_scalar_type(GLSL_TYPE_UINT8), 1); + return nir_build_deref_ptr_as_array( + b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size)); +} + +static bool update_align(struct entry *entry) +{ + bool has_align_index = + nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL]; + if (has_align_index) { + unsigned align = get_best_align(entry); + if (align != nir_intrinsic_align(entry->intrin)) { + nir_intrinsic_set_align(entry->intrin, align, 0); + return true; + } + } + return false; +} + +static void +vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx, + struct entry *low, struct entry *high, + struct entry *first, struct entry *second, + unsigned new_bit_size, unsigned new_num_components, + unsigned high_start) +{ + unsigned low_bit_size = get_bit_size(low); + unsigned high_bit_size = get_bit_size(high); + bool low_bool = low->intrin->dest.ssa.bit_size == 1; + bool high_bool = high->intrin->dest.ssa.bit_size == 1; + nir_ssa_def *data = &first->intrin->dest.ssa; + + b->cursor = nir_after_instr(first->instr); + + /* update the load's destination size and extract data for each of the original loads */ + data->num_components = new_num_components; + data->bit_size = new_bit_size; + + nir_ssa_def *low_def = nir_extract_bits( + b, &data, 1, 0, low->intrin->num_components, low_bit_size); + nir_ssa_def *high_def = nir_extract_bits( + b, &data, 1, high_start, high->intrin->num_components, high_bit_size); + + /* convert booleans */ + low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def); + high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def); + + /* update uses */ + if (first == low) { + nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, nir_src_for_ssa(low_def), + high_def->parent_instr); + nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, nir_src_for_ssa(high_def)); + } else { + nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, nir_src_for_ssa(low_def)); + nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, nir_src_for_ssa(high_def), + high_def->parent_instr); + } + + /* update the intrinsic */ + first->intrin->num_components = new_num_components; + + const struct intrinsic_info *info = first->info; + + /* update the offset */ + if (first != low && info->base_src >= 0) { + /* let nir_opt_algebraic() remove this addition. this doesn't have much + * issues with subtracting 16 from expressions like "(i + 1) * 16" because + * nir_opt_algebraic() turns them into "i * 16 + 16" */ + b->cursor = nir_before_instr(first->instr); + + nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa; + new_base = nir_iadd(b, new_base, nir_imm_int(b, -(high_start / 8u))); + + nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src], + nir_src_for_ssa(new_base)); + } + + /* update the deref */ + if (info->deref_src >= 0) { + b->cursor = nir_before_instr(first->instr); + + nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]); + if (first != low && high_start != 0) + deref = subtract_deref(b, deref, high_start / 8u); + first->deref = cast_deref(b, new_num_components, new_bit_size, deref); + + nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src], + nir_src_for_ssa(&first->deref->dest.ssa)); + } + + /* update base/align */ + bool has_base_index = + nir_intrinsic_infos[first->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE]; + + if (first != low && has_base_index) + nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin)); + + first->key = low->key; + first->offset = low->offset; + first->best_align = get_best_align(low); + + update_align(first); + + nir_instr_remove(second->instr); +} + +static void +vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx, + struct entry *low, struct entry *high, + struct entry *first, struct entry *second, + unsigned new_bit_size, unsigned new_num_components, + unsigned high_start) +{ + ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low); + assert(low_size % new_bit_size == 0); + + b->cursor = nir_before_instr(second->instr); + + /* get new writemasks */ + uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin); + uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin); + low_write_mask = update_writemask(low_write_mask, get_bit_size(low), new_bit_size); + high_write_mask = update_writemask(high_write_mask, get_bit_size(high), new_bit_size); + high_write_mask <<= high_start / new_bit_size; + + uint32_t write_mask = low_write_mask | high_write_mask; + + /* convert booleans */ + nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa; + nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa; + low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val; + high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val; + + /* combine the data */ + nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < new_num_components; i++) { + bool set_low = low_write_mask & (1 << i); + bool set_high = high_write_mask & (1 << i); + + if (set_low && (!set_high || low == second)) { + unsigned offset = i * new_bit_size; + data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size); + } else if (set_high) { + assert(!set_low || high == second); + unsigned offset = i * new_bit_size - high_start; + data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size); + } else { + data_channels[i] = nir_ssa_undef(b, 1, new_bit_size); + } + } + nir_ssa_def *data = nir_vec(b, data_channels, new_num_components); + + /* update the intrinsic */ + nir_intrinsic_set_write_mask(second->intrin, write_mask); + second->intrin->num_components = data->num_components; + + const struct intrinsic_info *info = second->info; + assert(info->value_src >= 0); + nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src], + nir_src_for_ssa(data)); + + /* update the offset */ + if (second != low && info->base_src >= 0) + nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src], + low->intrin->src[info->base_src]); + + /* update the deref */ + if (info->deref_src >= 0) { + b->cursor = nir_before_instr(second->instr); + second->deref = cast_deref(b, new_num_components, new_bit_size, + nir_src_as_deref(low->intrin->src[info->deref_src])); + nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src], + nir_src_for_ssa(&second->deref->dest.ssa)); + } + + /* update base/align */ + bool has_base_index = + nir_intrinsic_infos[second->intrin->intrinsic].index_map[NIR_INTRINSIC_BASE]; + + if (second != low && has_base_index) + nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin)); + + second->key = low->key; + second->offset = low->offset; + second->best_align = get_best_align(low); + + update_align(second); + + list_del(&first->head); + nir_instr_remove(first->instr); +} + +/* Returns true if it can prove that "a" and "b" point to different resources. */ +static bool +resources_different(nir_ssa_def *a, nir_ssa_def *b) +{ + if (!a || !b) + return false; + + if (a->parent_instr->type == nir_instr_type_load_const && + b->parent_instr->type == nir_instr_type_load_const) { + return nir_src_as_uint(nir_src_for_ssa(a)) != nir_src_as_uint(nir_src_for_ssa(b)); + } + + if (a->parent_instr->type == nir_instr_type_intrinsic && + b->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *aintrin = nir_instr_as_intrinsic(a->parent_instr); + nir_intrinsic_instr *bintrin = nir_instr_as_intrinsic(b->parent_instr); + if (aintrin->intrinsic == nir_intrinsic_vulkan_resource_index && + bintrin->intrinsic == nir_intrinsic_vulkan_resource_index) { + return nir_intrinsic_desc_set(aintrin) != nir_intrinsic_desc_set(bintrin) || + nir_intrinsic_binding(aintrin) != nir_intrinsic_binding(bintrin) || + resources_different(aintrin->src[0].ssa, bintrin->src[0].ssa); + } + } + + return false; +} + +static int64_t +compare_entries(struct entry *a, struct entry *b) +{ + if (!entry_key_equals(a->key, b->key)) + return INT64_MAX; + return b->offset_signed - a->offset_signed; +} + +static bool +may_alias(struct entry *a, struct entry *b) +{ + assert(get_variable_mode(a) == get_variable_mode(b)); + + /* if the resources/variables are definitively different and both have + * ACCESS_RESTRICT, we can assume they do not alias. */ + bool res_different = a->key->var != b->key->var || + resources_different(a->key->resource, b->key->resource); + if (res_different && (a->access & ACCESS_RESTRICT) && (b->access & ACCESS_RESTRICT)) + return false; + + /* we can't compare offsets if the resources/variables might be different */ + if (a->key->var != b->key->var || a->key->resource != b->key->resource) + return true; + + /* use adjacency information */ + /* TODO: we can look closer at the entry keys */ + int64_t diff = compare_entries(a, b); + if (diff != INT64_MAX) { + /* with atomics, intrin->num_components can be 0 */ + if (diff < 0) + return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u); + else + return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u); + } + + /* TODO: we can use deref information */ + + return true; +} + +static bool +check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second) +{ + nir_variable_mode mode = get_variable_mode(first); + if (mode & (nir_var_uniform | nir_var_system_value | + nir_var_mem_push_const | nir_var_mem_ubo)) + return false; + + unsigned mode_index = ffs(mode) - 1; + if (first->is_store) { + /* find first entry that aliases "first" */ + list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) { + if (next == first) + continue; + if (next == second) + return false; + if (may_alias(first, next)) + return true; + } + } else { + /* find previous store that aliases this load */ + list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) { + if (prev == second) + continue; + if (prev == first) + return false; + if (prev->is_store && may_alias(second, prev)) + return true; + } + } + + return false; +} + +static bool +is_strided_vector(const struct glsl_type *type) +{ + if (glsl_type_is_vector(type)) { + return glsl_get_explicit_stride(type) != + type_scalar_size_bytes(glsl_get_array_element(type)); + } else { + return false; + } +} + +static bool +try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx, + struct entry *low, struct entry *high, + struct entry *first, struct entry *second) +{ + if (check_for_aliasing(ctx, first, second)) + return false; + + /* we can only vectorize non-volatile loads/stores of the same type and with + * the same access */ + if (first->info != second->info || first->access != second->access || + (first->access & ACCESS_VOLATILE) || first->info->is_atomic) + return false; + + /* don't attempt to vectorize accesses of row-major matrix columns */ + if (first->deref) { + const struct glsl_type *first_type = first->deref->type; + const struct glsl_type *second_type = second->deref->type; + if (is_strided_vector(first_type) || is_strided_vector(second_type)) + return false; + } + + /* gather information */ + uint64_t diff = high->offset_signed - low->offset_signed; + unsigned low_bit_size = get_bit_size(low); + unsigned high_bit_size = get_bit_size(high); + unsigned low_size = low->intrin->num_components * low_bit_size; + unsigned high_size = high->intrin->num_components * high_bit_size; + unsigned new_size = MAX2(diff * 8u + high_size, low_size); + + /* find a good bit size for the new load/store */ + unsigned new_bit_size = 0; + if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) { + new_bit_size = low_bit_size; + } else if (low_bit_size != high_bit_size && + new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) { + new_bit_size = high_bit_size; + } else { + new_bit_size = 64; + for (; new_bit_size >= 8; new_bit_size /= 2) { + /* don't repeat trying out bitsizes */ + if (new_bit_size == low_bit_size || new_bit_size == high_bit_size) + continue; + if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size)) + break; + } + if (new_bit_size < 8) + return false; + } + unsigned new_num_components = new_size / new_bit_size; + + /* vectorize the loads/stores */ + nir_builder b; + nir_builder_init(&b, impl); + + if (first->is_store) + vectorize_stores(&b, ctx, low, high, first, second, + new_bit_size, new_num_components, diff * 8u); + else + vectorize_loads(&b, ctx, low, high, first, second, + new_bit_size, new_num_components, diff * 8u); + + return true; +} + +static bool +vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht) +{ + if (!ht) + return false; + + bool progress = false; + hash_table_foreach(ht, entry) { + struct util_dynarray *arr = entry->data; + if (!arr->size) + continue; + + qsort(util_dynarray_begin(arr), + util_dynarray_num_elements(arr, struct entry *), + sizeof(struct entry *), &sort_entries); + + unsigned i = 0; + for (; i < util_dynarray_num_elements(arr, struct entry*) - 1; i++) { + struct entry *low = *util_dynarray_element(arr, struct entry *, i); + struct entry *high = *util_dynarray_element(arr, struct entry *, i + 1); + + uint64_t diff = high->offset_signed - low->offset_signed; + if (diff > get_bit_size(low) / 8u * low->intrin->num_components) { + progress |= update_align(low); + continue; + } + + struct entry *first = low->index < high->index ? low : high; + struct entry *second = low->index < high->index ? high : low; + + if (try_vectorize(impl, ctx, low, high, first, second)) { + *util_dynarray_element(arr, struct entry *, i) = NULL; + *util_dynarray_element(arr, struct entry *, i + 1) = low->is_store ? second : first; + progress = true; + } else { + progress |= update_align(low); + } + } + + struct entry *last = *util_dynarray_element(arr, struct entry *, i); + progress |= update_align(last); + } + + _mesa_hash_table_clear(ht, delete_entry_dynarray); + + return progress; +} + +static bool +handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr) +{ + unsigned modes = 0; + bool acquire = true; + bool release = true; + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global; + break; + /* prevent speculative loads/stores */ + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: + modes = nir_var_all; + break; + case nir_intrinsic_memory_barrier_buffer: + modes = nir_var_mem_ssbo | nir_var_mem_global; + break; + case nir_intrinsic_memory_barrier_shared: + modes = nir_var_mem_shared; + break; + case nir_intrinsic_scoped_memory_barrier: + modes = nir_intrinsic_memory_modes(intrin); + acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE; + release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE; + switch (nir_intrinsic_memory_scope(intrin)) { + case NIR_SCOPE_INVOCATION: + case NIR_SCOPE_SUBGROUP: + /* a barier should never be required for correctness with these scopes */ + modes = 0; + break; + default: + break; + } + break; + default: + return false; + } + } else if (instr->type == nir_instr_type_call) { + modes = nir_var_all; + } else { + return false; + } + + while (modes) { + unsigned mode_index = u_bit_scan(&modes); + + if (acquire) + *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]); + if (release) + *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]); + } + + return true; +} + +static bool +process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block) +{ + bool progress = false; + + for (unsigned i = 0; i < nir_num_variable_modes; i++) { + list_inithead(&ctx->entries[i]); + if (ctx->loads[i]) + _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray); + if (ctx->stores[i]) + _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray); + } + + /* create entries */ + unsigned next_index = 0; + + nir_foreach_instr_safe(instr, block) { + if (handle_barrier(ctx, &progress, impl, instr)) + continue; + + /* gather information */ + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + const struct intrinsic_info *info = get_info(intrin->intrinsic); + if (!info) + continue; + + nir_variable_mode mode = info->mode; + if (!mode) + mode = nir_src_as_deref(intrin->src[info->deref_src])->mode; + if (!(mode & ctx->modes)) + continue; + unsigned mode_index = ffs(mode) - 1; + + /* create entry */ + struct entry *entry = create_entry(ctx, info, intrin); + entry->index = next_index++; + + list_addtail(&entry->head, &ctx->entries[mode_index]); + + /* add the entry to a hash table */ + + struct hash_table *adj_ht = NULL; + if (entry->is_store) { + if (!ctx->stores[mode_index]) + ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals); + adj_ht = ctx->stores[mode_index]; + } else { + if (!ctx->loads[mode_index]) + ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals); + adj_ht = ctx->loads[mode_index]; + } + + uint32_t key_hash = hash_entry_key(entry->key); + struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key); + struct util_dynarray *arr; + if (adj_entry && adj_entry->data) { + arr = (struct util_dynarray *)adj_entry->data; + } else { + arr = ralloc(ctx, struct util_dynarray); + util_dynarray_init(arr, arr); + _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr); + } + util_dynarray_append(arr, struct entry *, entry); + } + + /* sort and combine entries */ + for (unsigned i = 0; i < nir_num_variable_modes; i++) { + progress |= vectorize_entries(ctx, impl, ctx->loads[i]); + progress |= vectorize_entries(ctx, impl, ctx->stores[i]); + } + + return progress; +} + +bool +nir_opt_load_store_vectorize(nir_shader *shader, nir_variable_mode modes, + nir_should_vectorize_mem_func callback) +{ + bool progress = false; + + struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx); + ctx->modes = modes; + ctx->callback = callback; + + nir_index_vars(shader, NULL, modes); + + nir_foreach_function(function, shader) { + if (function->impl) { + if (modes & nir_var_function_temp) + nir_index_vars(shader, function->impl, nir_var_function_temp); + + nir_foreach_block(block, function->impl) + progress |= process_block(function->impl, ctx, block); + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs); + } + } + + ralloc_free(ctx); + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_loop_unroll.c mesa-20.0.8/src/compiler/nir/nir_opt_loop_unroll.c --- mesa-19.2.8/src/compiler/nir/nir_opt_loop_unroll.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_loop_unroll.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,7 +42,7 @@ * to keep track of and update phis along the way which gets tricky and * doesn't add much value over converting to regs. * - * The loop may have a continue instruction at the end of the loop which does + * The loop may have a jump instruction at the end of the loop which does * nothing. Once we're out of SSA, we can safely delete it so we don't have * to deal with it later. */ @@ -67,7 +67,7 @@ nir_lower_phis_to_regs_block(block_after_loop); - /* Remove continue if its the last instruction in the loop */ + /* Remove jump if it's the last instruction in the loop */ nir_instr *last_instr = nir_block_last_instr(nir_loop_last_block(loop)); if (last_instr && last_instr->type == nir_instr_type_jump) { nir_instr_remove(last_instr); @@ -514,7 +514,7 @@ static bool wrapper_unroll(nir_loop *loop) { - if (!list_empty(&loop->info->loop_terminator_list)) { + if (!list_is_empty(&loop->info->loop_terminator_list)) { /* Unrolling a loop with a large number of exits can result in a * large inrease in register pressure. For now we just skip diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_peephole_select.c mesa-20.0.8/src/compiler/nir/nir_opt_peephole_select.c --- mesa-19.2.8/src/compiler/nir/nir_opt_peephole_select.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_peephole_select.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,6 +27,7 @@ #include "nir.h" #include "nir_control_flow.h" +#include "nir_search_helpers.h" /* * Implements a small peephole optimization that looks for @@ -107,6 +108,8 @@ case nir_instr_type_alu: { nir_alu_instr *mov = nir_instr_as_alu(instr); + bool movelike = false; + switch (mov->op) { case nir_op_mov: case nir_op_fneg: @@ -116,6 +119,7 @@ case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: + movelike = true; break; case nir_op_fcos: @@ -149,14 +153,20 @@ return false; if (alu_ok) { - (*count)++; + /* If the ALU operation is an fsat or a move-like operation, do + * not count it. The expectation is that it will eventually be + * merged as a destination modifier or source modifier on some + * other instruction. + */ + if (mov->op != nir_op_fsat && !movelike) + (*count)++; } else { /* Can't handle saturate */ if (mov->dest.saturate) return false; /* It cannot have any if-uses */ - if (!list_empty(&mov->dest.dest.ssa.if_uses)) + if (!list_is_empty(&mov->dest.dest.ssa.if_uses)) return false; /* The only uses of this definition must be phis in the successor */ diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_sink.c mesa-20.0.8/src/compiler/nir/nir_opt_sink.c --- mesa-19.2.8/src/compiler/nir/nir_opt_sink.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_sink.c 2020-06-12 01:21:16.000000000 +0000 @@ -50,7 +50,8 @@ if ((options & nir_move_load_input) && (intrin->intrinsic == nir_intrinsic_load_interpolated_input || - intrin->intrinsic == nir_intrinsic_load_input)) + intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input)) return true; } @@ -58,6 +59,11 @@ return true; } + if ((options & nir_move_copies) && instr->type == nir_instr_type_alu && + nir_instr_as_alu(instr)->op == nir_op_mov) { + return true; + } + if ((options & nir_move_comparisons) && instr->type == nir_instr_type_alu && nir_alu_instr_is_comparison(nir_instr_as_alu(instr))) { return true; diff -Nru mesa-19.2.8/src/compiler/nir/nir_opt_vectorize.c mesa-20.0.8/src/compiler/nir/nir_opt_vectorize.c --- mesa-19.2.8/src/compiler/nir/nir_opt_vectorize.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_opt_vectorize.c 2020-06-12 01:21:16.000000000 +0000 @@ -248,8 +248,8 @@ nir_if_rewrite_condition(src->parent_if, nir_src_for_ssa(new_alu1)); } - assert(list_empty(&alu1->dest.dest.ssa.uses)); - assert(list_empty(&alu1->dest.dest.ssa.if_uses)); + assert(list_is_empty(&alu1->dest.dest.ssa.uses)); + assert(list_is_empty(&alu1->dest.dest.ssa.if_uses)); nir_foreach_use_safe(src, &alu2->dest.dest.ssa) { if (src->parent_instr->type == nir_instr_type_alu) { @@ -285,8 +285,8 @@ nir_if_rewrite_condition(src->parent_if, nir_src_for_ssa(new_alu2)); } - assert(list_empty(&alu2->dest.dest.ssa.uses)); - assert(list_empty(&alu2->dest.dest.ssa.if_uses)); + assert(list_is_empty(&alu2->dest.dest.ssa.uses)); + assert(list_is_empty(&alu2->dest.dest.ssa.if_uses)); nir_instr_remove(instr1); nir_instr_remove(instr2); diff -Nru mesa-19.2.8/src/compiler/nir/nir_phi_builder.c mesa-20.0.8/src/compiler/nir/nir_phi_builder.c --- mesa-19.2.8/src/compiler/nir/nir_phi_builder.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_phi_builder.c 2020-06-12 01:21:16.000000000 +0000 @@ -132,8 +132,7 @@ pb->iter_count++; - BITSET_WORD tmp; - BITSET_FOREACH_SET(i, tmp, defs, pb->num_blocks) { + BITSET_FOREACH_SET(i, defs, pb->num_blocks) { if (pb->work[i] < pb->iter_count) pb->W[w_end++] = pb->blocks[i]; pb->work[i] = pb->iter_count; diff -Nru mesa-19.2.8/src/compiler/nir/nir_print.c mesa-20.0.8/src/compiler/nir/nir_print.c --- mesa-19.2.8/src/compiler/nir/nir_print.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_print.c 2020-06-12 01:21:16.000000000 +0000 @@ -62,6 +62,8 @@ static void print_annotation(print_state *state, void *obj) { + FILE *fp = state->fp; + if (!state->annotations) return; @@ -72,7 +74,7 @@ const char *note = entry->data; _mesa_hash_table_remove(state->annotations, entry); - fprintf(stderr, "%s\n\n", note); + fprintf(fp, "%s\n\n", note); } static void @@ -169,6 +171,12 @@ print_reg_dest(&dest->reg, state); } +static const char * +comp_mask_string(unsigned num_components) +{ + return (num_components > 4) ? "abcdefghijklmnop" : "xyzw"; +} + static void print_alu_src(nir_alu_instr *instr, unsigned src, print_state *state) { @@ -204,7 +212,7 @@ if (!nir_alu_instr_channel_used(instr, src, i)) continue; - fprintf(fp, "%c", "xyzw"[instr->src[src].swizzle[i]]); + fprintf(fp, "%c", comp_mask_string(live_channels)[instr->src[src].swizzle[i]]); } } @@ -222,10 +230,11 @@ if (!dest->dest.is_ssa && dest->write_mask != (1 << dest->dest.reg.reg->num_components) - 1) { + unsigned live_channels = dest->dest.reg.reg->num_components; fprintf(fp, "."); for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) if ((dest->write_mask >> i) & 1) - fprintf(fp, "%c", "xyzw"[i]); + fprintf(fp, "%c", comp_mask_string(live_channels)[i]); } } @@ -456,7 +465,7 @@ cent, samp, patch, inv, get_variable_mode_str(var->data.mode, false), glsl_interp_mode_name(var->data.interpolation)); - enum gl_access_qualifier access = var->data.image.access; + enum gl_access_qualifier access = var->data.access; const char *const coher = (access & ACCESS_COHERENT) ? "coherent " : ""; const char *const volat = (access & ACCESS_VOLATILE) ? "volatile " : ""; const char *const restr = (access & ACCESS_RESTRICT) ? "restrict " : ""; @@ -465,52 +474,54 @@ const char *const reorder = (access & ACCESS_CAN_REORDER) ? "reorderable " : ""; fprintf(fp, "%s%s%s%s%s%s", coher, volat, restr, ronly, wonly, reorder); -#define FORMAT_CASE(x) case x: fprintf(stderr, #x " "); break - switch (var->data.image.format) { - FORMAT_CASE(GL_RGBA32F); - FORMAT_CASE(GL_RGBA32UI); - FORMAT_CASE(GL_RGBA32I); - FORMAT_CASE(GL_R32F); - FORMAT_CASE(GL_R32UI); - FORMAT_CASE(GL_R32I); - FORMAT_CASE(GL_RG32F); - FORMAT_CASE(GL_RG32UI); - FORMAT_CASE(GL_RG32I); - FORMAT_CASE(GL_R8); - FORMAT_CASE(GL_RG8); - FORMAT_CASE(GL_RGBA8); - FORMAT_CASE(GL_R8_SNORM); - FORMAT_CASE(GL_RG8_SNORM); - FORMAT_CASE(GL_RGBA8_SNORM); - FORMAT_CASE(GL_R16); - FORMAT_CASE(GL_RG16); - FORMAT_CASE(GL_RGBA16); - FORMAT_CASE(GL_R16_SNORM); - FORMAT_CASE(GL_RG16_SNORM); - FORMAT_CASE(GL_RGBA16_SNORM); - FORMAT_CASE(GL_R16F); - FORMAT_CASE(GL_RG16F); - FORMAT_CASE(GL_RGBA16F); - FORMAT_CASE(GL_R8UI); - FORMAT_CASE(GL_R8I); - FORMAT_CASE(GL_RG8UI); - FORMAT_CASE(GL_RG8I); - FORMAT_CASE(GL_RGBA8UI); - FORMAT_CASE(GL_RGBA8I); - FORMAT_CASE(GL_R16UI); - FORMAT_CASE(GL_R16I); - FORMAT_CASE(GL_RG16UI); - FORMAT_CASE(GL_RG16I); - FORMAT_CASE(GL_RGBA16UI); - FORMAT_CASE(GL_RGBA16I); - FORMAT_CASE(GL_R11F_G11F_B10F); - FORMAT_CASE(GL_RGB9_E5); - FORMAT_CASE(GL_RGB10_A2); - FORMAT_CASE(GL_RGB10_A2UI); - default: /* Including the normal GL_NONE */ - break; - } + if (glsl_get_base_type(glsl_without_array(var->type)) == GLSL_TYPE_IMAGE) { +#define FORMAT_CASE(x) case x: fprintf(fp, #x " "); break + switch (var->data.image.format) { + FORMAT_CASE(GL_RGBA32F); + FORMAT_CASE(GL_RGBA32UI); + FORMAT_CASE(GL_RGBA32I); + FORMAT_CASE(GL_R32F); + FORMAT_CASE(GL_R32UI); + FORMAT_CASE(GL_R32I); + FORMAT_CASE(GL_RG32F); + FORMAT_CASE(GL_RG32UI); + FORMAT_CASE(GL_RG32I); + FORMAT_CASE(GL_R8); + FORMAT_CASE(GL_RG8); + FORMAT_CASE(GL_RGBA8); + FORMAT_CASE(GL_R8_SNORM); + FORMAT_CASE(GL_RG8_SNORM); + FORMAT_CASE(GL_RGBA8_SNORM); + FORMAT_CASE(GL_R16); + FORMAT_CASE(GL_RG16); + FORMAT_CASE(GL_RGBA16); + FORMAT_CASE(GL_R16_SNORM); + FORMAT_CASE(GL_RG16_SNORM); + FORMAT_CASE(GL_RGBA16_SNORM); + FORMAT_CASE(GL_R16F); + FORMAT_CASE(GL_RG16F); + FORMAT_CASE(GL_RGBA16F); + FORMAT_CASE(GL_R8UI); + FORMAT_CASE(GL_R8I); + FORMAT_CASE(GL_RG8UI); + FORMAT_CASE(GL_RG8I); + FORMAT_CASE(GL_RGBA8UI); + FORMAT_CASE(GL_RGBA8I); + FORMAT_CASE(GL_R16UI); + FORMAT_CASE(GL_R16I); + FORMAT_CASE(GL_RG16UI); + FORMAT_CASE(GL_RG16I); + FORMAT_CASE(GL_RGBA16UI); + FORMAT_CASE(GL_RGBA16I); + FORMAT_CASE(GL_R11F_G11F_B10F); + FORMAT_CASE(GL_RGB9_E5); + FORMAT_CASE(GL_RGB10_A2); + FORMAT_CASE(GL_RGB10_A2UI); + default: /* Including the normal GL_NONE */ + break; + } #undef FORMAT_CASE + } fprintf(fp, "%s %s", glsl_get_type_name(var->type), get_var_name(var, state)); @@ -551,8 +562,12 @@ } if (!loc) { - snprintf(buf, sizeof(buf), "%u", var->data.location); - loc = buf; + if (var->data.location == ~0) { + loc = "~0"; + } else { + snprintf(buf, sizeof(buf), "%u", var->data.location); + loc = buf; + } } /* For shader I/O vars that have been split to components or packed, @@ -561,12 +576,12 @@ unsigned int num_components = glsl_get_components(glsl_without_array(var->type)); const char *components = NULL; - char components_local[6] = {'.' /* the rest is 0-filled */}; + char components_local[18] = {'.' /* the rest is 0-filled */}; switch (var->data.mode) { case nir_var_shader_in: case nir_var_shader_out: - if (num_components < 4 && num_components != 0) { - const char *xyzw = "xyzw"; + if (num_components < 16 && num_components != 0) { + const char *xyzw = comp_mask_string(num_components); for (int i = 0; i < num_components; i++) components_local[i + 1] = xyzw[i + var->data.location_frac]; @@ -798,6 +813,10 @@ [NIR_INTRINSIC_DESC_TYPE] = "desc_type", [NIR_INTRINSIC_TYPE] = "type", [NIR_INTRINSIC_SWIZZLE_MASK] = "swizzle_mask", + [NIR_INTRINSIC_DRIVER_LOCATION] = "driver_location", + [NIR_INTRINSIC_MEMORY_SEMANTICS] = "mem_semantics", + [NIR_INTRINSIC_MEMORY_MODES] = "mem_modes", + [NIR_INTRINSIC_MEMORY_SCOPE] = "mem_scope", }; for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) { if (!info->index_map[idx]) @@ -808,9 +827,9 @@ /* special case wrmask to show it as a writemask.. */ unsigned wrmask = nir_intrinsic_write_mask(instr); fprintf(fp, " wrmask="); - for (unsigned i = 0; i < 4; i++) + for (unsigned i = 0; i < instr->num_components; i++) if ((wrmask >> i) & 1) - fprintf(fp, "%c", "xyzw"[i]); + fprintf(fp, "%c", comp_mask_string(instr->num_components)[i]); break; } @@ -884,6 +903,42 @@ break; } + case NIR_INTRINSIC_MEMORY_SEMANTICS: { + nir_memory_semantics semantics = nir_intrinsic_memory_semantics(instr); + fprintf(fp, " mem_semantics="); + switch (semantics & (NIR_MEMORY_ACQUIRE | NIR_MEMORY_RELEASE)) { + case 0: fprintf(fp, "NONE"); break; + case NIR_MEMORY_ACQUIRE: fprintf(fp, "ACQ"); break; + case NIR_MEMORY_RELEASE: fprintf(fp, "REL"); break; + default: fprintf(fp, "ACQ|REL"); break; + } + if (semantics & (NIR_MEMORY_MAKE_AVAILABLE)) fprintf(fp, "|AVAILABLE"); + if (semantics & (NIR_MEMORY_MAKE_VISIBLE)) fprintf(fp, "|VISIBLE"); + break; + } + + case NIR_INTRINSIC_MEMORY_MODES: { + fprintf(fp, " mem_modes="); + unsigned int modes = nir_intrinsic_memory_modes(instr); + while (modes) { + nir_variable_mode m = u_bit_scan(&modes); + fprintf(fp, "%s%s", get_variable_mode_str(1 << m, true), modes ? "|" : ""); + } + break; + } + + case NIR_INTRINSIC_MEMORY_SCOPE: { + fprintf(fp, " mem_scope="); + switch (nir_intrinsic_memory_scope(instr)) { + case NIR_SCOPE_DEVICE: fprintf(fp, "DEVICE"); break; + case NIR_SCOPE_QUEUE_FAMILY: fprintf(fp, "QUEUE_FAMILY"); break; + case NIR_SCOPE_WORKGROUP: fprintf(fp, "WORKGROUP"); break; + case NIR_SCOPE_SUBGROUP: fprintf(fp, "SUBGROUP"); break; + case NIR_SCOPE_INVOCATION: fprintf(fp, "INVOCATION"); break; + } + break; + } + default: { unsigned off = info->index_map[idx] - 1; assert(index_name[idx]); /* forgot to update index_name table? */ @@ -982,6 +1037,15 @@ case nir_texop_samples_identical: fprintf(fp, "samples_identical "); break; + case nir_texop_tex_prefetch: + fprintf(fp, "tex (pre-dispatchable) "); + break; + case nir_texop_fragment_fetch: + fprintf(fp, "fragment_fetch "); + break; + case nir_texop_fragment_mask_fetch: + fprintf(fp, "fragment_mask_fetch "); + break; default: unreachable("Invalid texture operation"); break; @@ -1081,6 +1145,14 @@ fprintf(fp, ", %u (sampler)", instr->sampler_index); } } + + if (instr->texture_non_uniform) { + fprintf(fp, ", texture non-uniform"); + } + + if (instr->sampler_non_uniform) { + fprintf(fp, ", sampler non-uniform"); + } } static void @@ -1414,7 +1486,7 @@ state->fp = fp; state->shader = shader; state->ht = _mesa_pointer_hash_table_create(NULL); - state->syms = _mesa_set_create(NULL, _mesa_key_hash_string, + state->syms = _mesa_set_create(NULL, _mesa_hash_string, _mesa_key_string_equal); state->index = 0; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_range_analysis.c mesa-20.0.8/src/compiler/nir/nir_range_analysis.c --- mesa-19.2.8/src/compiler/nir/nir_range_analysis.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_range_analysis.c 2020-06-12 01:21:16.000000000 +0000 @@ -51,8 +51,41 @@ return (struct ssa_result_range){v & 0xff, (v & 0x0ff00) != 0}; } +static void * +pack_key(const struct nir_alu_instr *instr, nir_alu_type type) +{ + uintptr_t type_encoding; + uintptr_t ptr = (uintptr_t) instr; + + /* The low 2 bits have to be zero or this whole scheme falls apart. */ + assert((ptr & 0x3) == 0); + + /* NIR is typeless in the sense that sequences of bits have whatever + * meaning is attached to them by the instruction that consumes them. + * However, the number of bits must match between producer and consumer. + * As a result, the number of bits does not need to be encoded here. + */ + switch (nir_alu_type_get_base_type(type)) { + case nir_type_int: type_encoding = 0; break; + case nir_type_uint: type_encoding = 1; break; + case nir_type_bool: type_encoding = 2; break; + case nir_type_float: type_encoding = 3; break; + default: unreachable("Invalid base type."); + } + + return (void *)(ptr | type_encoding); +} + +static nir_alu_type +nir_alu_src_type(const nir_alu_instr *instr, unsigned src) +{ + return nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[src]) | + nir_src_bit_size(instr->src[src].src); +} + static struct ssa_result_range -analyze_constant(const struct nir_alu_instr *instr, unsigned src) +analyze_constant(const struct nir_alu_instr *instr, unsigned src, + nir_alu_type use_type) { uint8_t swizzle[4] = { 0, 1, 2, 3 }; @@ -69,7 +102,7 @@ struct ssa_result_range r = { unknown, false }; - switch (nir_op_infos[instr->op].input_types[src]) { + switch (nir_alu_type_get_base_type(use_type)) { case nir_type_float: { double min_value = DBL_MAX; double max_value = -DBL_MAX; @@ -179,32 +212,172 @@ } } +/** + * Short-hand name for use in the tables in analyze_expression. If this name + * becomes a problem on some compiler, we can change it to _. + */ +#define _______ unknown + + +/* MSVC doesn't have C99's _Pragma() */ +#ifdef _MSC_VER +#define _Pragma(x) +#endif + + #ifndef NDEBUG #define ASSERT_TABLE_IS_COMMUTATIVE(t) \ do { \ - for (unsigned r = 0; r < ARRAY_SIZE(t); r++) { \ - for (unsigned c = 0; c < ARRAY_SIZE(t[0]); c++) \ - assert(t[r][c] == t[c][r]); \ + static bool first = true; \ + if (first) { \ + first = false; \ + _Pragma("GCC unroll 7") \ + for (unsigned r = 0; r < ARRAY_SIZE(t); r++) { \ + _Pragma("GCC unroll 7") \ + for (unsigned c = 0; c < ARRAY_SIZE(t[0]); c++) \ + assert(t[r][c] == t[c][r]); \ + } \ } \ } while (false) #define ASSERT_TABLE_IS_DIAGONAL(t) \ do { \ - for (unsigned r = 0; r < ARRAY_SIZE(t); r++) \ - assert(t[r][r] == r); \ + static bool first = true; \ + if (first) { \ + first = false; \ + _Pragma("GCC unroll 7") \ + for (unsigned r = 0; r < ARRAY_SIZE(t); r++) \ + assert(t[r][r] == r); \ + } \ } while (false) + +static enum ssa_ranges +union_ranges(enum ssa_ranges a, enum ssa_ranges b) +{ + static const enum ssa_ranges union_table[last_range + 1][last_range + 1] = { + /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ + /* unknown */ { _______, _______, _______, _______, _______, _______, _______ }, + /* lt_zero */ { _______, lt_zero, le_zero, ne_zero, _______, ne_zero, le_zero }, + /* le_zero */ { _______, le_zero, le_zero, _______, _______, _______, le_zero }, + /* gt_zero */ { _______, ne_zero, _______, gt_zero, ge_zero, ne_zero, ge_zero }, + /* ge_zero */ { _______, _______, _______, ge_zero, ge_zero, _______, ge_zero }, + /* ne_zero */ { _______, ne_zero, _______, ne_zero, _______, ne_zero, _______ }, + /* eq_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero }, + }; + + ASSERT_TABLE_IS_COMMUTATIVE(union_table); + ASSERT_TABLE_IS_DIAGONAL(union_table); + + return union_table[a][b]; +} + +/* Verify that the 'unknown' entry in each row (or column) of the table is the + * union of all the other values in the row (or column). + */ +#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t) \ + do { \ + static bool first = true; \ + if (first) { \ + first = false; \ + _Pragma("GCC unroll 7") \ + for (unsigned i = 0; i < last_range; i++) { \ + enum ssa_ranges col_range = t[i][unknown + 1]; \ + enum ssa_ranges row_range = t[unknown + 1][i]; \ + \ + _Pragma("GCC unroll 5") \ + for (unsigned j = unknown + 2; j < last_range; j++) { \ + col_range = union_ranges(col_range, t[i][j]); \ + row_range = union_ranges(row_range, t[j][i]); \ + } \ + \ + assert(col_range == t[i][unknown]); \ + assert(row_range == t[unknown][i]); \ + } \ + } \ + } while (false) + +/* For most operations, the union of ranges for a strict inequality and + * equality should be the range of the non-strict inequality (e.g., + * union_ranges(range(op(lt_zero), range(op(eq_zero))) == range(op(le_zero)). + * + * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.). + */ +#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t) \ + do { \ + assert(union_ranges(t[lt_zero], t[eq_zero]) == t[le_zero]); \ + assert(union_ranges(t[gt_zero], t[eq_zero]) == t[ge_zero]); \ + } while (false) + +#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t) \ + do { \ + static bool first = true; \ + if (first) { \ + first = false; \ + _Pragma("GCC unroll 7") \ + for (unsigned i = 0; i < last_range; i++) { \ + assert(union_ranges(t[i][lt_zero], t[i][eq_zero]) == t[i][le_zero]); \ + assert(union_ranges(t[i][gt_zero], t[i][eq_zero]) == t[i][ge_zero]); \ + assert(union_ranges(t[lt_zero][i], t[eq_zero][i]) == t[le_zero][i]); \ + assert(union_ranges(t[gt_zero][i], t[eq_zero][i]) == t[ge_zero][i]); \ + } \ + } \ + } while (false) + +/* Several other unordered tuples span the range of "everything." Each should + * have the same value as unknown: (lt_zero, ge_zero), (le_zero, gt_zero), and + * (eq_zero, ne_zero). union_ranges is already commutative, so only one + * ordering needs to be checked. + * + * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.). + * + * In cases where this can be used, it is unnecessary to also use + * ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_*_SOURCE. For any range X, + * union_ranges(X, X) == X. The disjoint ranges cover all of the non-unknown + * possibilities, so the union of all the unions of disjoint ranges is + * equivalent to the union of "others." + */ +#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t) \ + do { \ + assert(union_ranges(t[lt_zero], t[ge_zero]) == t[unknown]); \ + assert(union_ranges(t[le_zero], t[gt_zero]) == t[unknown]); \ + assert(union_ranges(t[eq_zero], t[ne_zero]) == t[unknown]); \ + } while (false) + +#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t) \ + do { \ + static bool first = true; \ + if (first) { \ + first = false; \ + _Pragma("GCC unroll 7") \ + for (unsigned i = 0; i < last_range; i++) { \ + assert(union_ranges(t[i][lt_zero], t[i][ge_zero]) == \ + t[i][unknown]); \ + assert(union_ranges(t[i][le_zero], t[i][gt_zero]) == \ + t[i][unknown]); \ + assert(union_ranges(t[i][eq_zero], t[i][ne_zero]) == \ + t[i][unknown]); \ + \ + assert(union_ranges(t[lt_zero][i], t[ge_zero][i]) == \ + t[unknown][i]); \ + assert(union_ranges(t[le_zero][i], t[gt_zero][i]) == \ + t[unknown][i]); \ + assert(union_ranges(t[eq_zero][i], t[ne_zero][i]) == \ + t[unknown][i]); \ + } \ + } \ + } while (false) + #else #define ASSERT_TABLE_IS_COMMUTATIVE(t) #define ASSERT_TABLE_IS_DIAGONAL(t) +#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t) +#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t) +#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t) +#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t) +#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t) #endif /** - * Short-hand name for use in the tables in analyze_expression. If this name - * becomes a problem on some compiler, we can change it to _. - */ -#define _______ unknown - -/** * Analyze an expression to determine the range of its result * * The end result of this analysis is a token that communicates something @@ -215,13 +388,16 @@ */ static struct ssa_result_range analyze_expression(const nir_alu_instr *instr, unsigned src, - struct hash_table *ht) + struct hash_table *ht, nir_alu_type use_type) { + /* Ensure that the _Pragma("GCC unroll 7") above are correct. */ + STATIC_ASSERT(last_range + 1 == 7); + if (!instr->src[src].src.is_ssa) return (struct ssa_result_range){unknown, false}; if (nir_src_is_const(instr->src[src].src)) - return analyze_constant(instr, src); + return analyze_constant(instr, src, use_type); if (instr->src[src].src.ssa->parent_instr->type != nir_instr_type_alu) return (struct ssa_result_range){unknown, false}; @@ -229,8 +405,6 @@ const struct nir_alu_instr *const alu = nir_instr_as_alu(instr->src[src].src.ssa->parent_instr); - const nir_alu_type use_type = nir_op_infos[instr->op].input_types[src]; - /* Bail if the type of the instruction generating the value does not match * the type the value will be interpreted as. int/uint/bool can be * reinterpreted trivially. The most important cases are between float and @@ -249,7 +423,7 @@ } } - struct hash_entry *he = _mesa_hash_table_search(ht, alu); + struct hash_entry *he = _mesa_hash_table_search(ht, pack_key(alu, use_type)); if (he != NULL) return unpack_data(he->data); @@ -295,6 +469,8 @@ }; ASSERT_TABLE_IS_COMMUTATIVE(fadd_table); + ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fadd_table); + ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fadd_table); /* Due to flush-to-zero semanatics of floating-point numbers with very * small mangnitudes, we can never really be sure a result will be @@ -339,12 +515,17 @@ }; ASSERT_TABLE_IS_COMMUTATIVE(fmul_table); + ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fmul_table); + ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fmul_table); static const enum ssa_ranges fneg_table[last_range + 1] = { /* unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ _______, gt_zero, ge_zero, lt_zero, le_zero, ne_zero, eq_zero }; + ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(fneg_table); + ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(fneg_table); + switch (alu->op) { case nir_op_b2f32: @@ -353,21 +534,10 @@ break; case nir_op_bcsel: { - const struct ssa_result_range left = analyze_expression(alu, 1, ht); - const struct ssa_result_range right = analyze_expression(alu, 2, ht); - - /* If either source is a constant load that is not zero, punt. The type - * will always be uint regardless of the actual type. We can't even - * decide if the value is non-zero because -0.0 is 0x80000000, and that - * will (possibly incorrectly) be considered non-zero. - */ - /* FINISHME: We could do better, but it would require having the expected - * FINISHME: type passed in. - */ - if ((nir_src_is_const(alu->src[1].src) && left.range != eq_zero) || - (nir_src_is_const(alu->src[2].src) && right.range != eq_zero)) { - return (struct ssa_result_range){unknown, false}; - } + const struct ssa_result_range left = + analyze_expression(alu, 1, ht, use_type); + const struct ssa_result_range right = + analyze_expression(alu, 2, ht, use_type); r.is_integral = left.is_integral && right.is_integral; @@ -424,6 +594,7 @@ ASSERT_TABLE_IS_COMMUTATIVE(table); ASSERT_TABLE_IS_DIAGONAL(table); + ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table); r.range = table[left.range][right.range]; break; @@ -431,7 +602,7 @@ case nir_op_i2f32: case nir_op_u2f32: - r = analyze_expression(alu, 0, ht); + r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); r.is_integral = true; @@ -441,7 +612,7 @@ break; case nir_op_fabs: - r = analyze_expression(alu, 0, ht); + r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); switch (r.range) { case unknown: @@ -463,8 +634,10 @@ break; case nir_op_fadd: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); - const struct ssa_result_range right = analyze_expression(alu, 1, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range right = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); r.is_integral = left.is_integral && right.is_integral; r.range = fadd_table[left.range][right.range]; @@ -481,7 +654,10 @@ ge_zero, ge_zero, ge_zero, gt_zero, gt_zero, ge_zero, gt_zero }; - r = analyze_expression(alu, 0, ht); + r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + + ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(table); + ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(table); r.is_integral = r.is_integral && is_not_negative(r.range); r.range = table[r.range]; @@ -489,8 +665,10 @@ } case nir_op_fmax: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); - const struct ssa_result_range right = analyze_expression(alu, 1, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range right = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); r.is_integral = left.is_integral && right.is_integral; @@ -545,14 +723,17 @@ /* Treat fmax as commutative. */ ASSERT_TABLE_IS_COMMUTATIVE(table); ASSERT_TABLE_IS_DIAGONAL(table); + ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table); r.range = table[left.range][right.range]; break; } case nir_op_fmin: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); - const struct ssa_result_range right = analyze_expression(alu, 1, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range right = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); r.is_integral = left.is_integral && right.is_integral; @@ -607,14 +788,17 @@ /* Treat fmin as commutative. */ ASSERT_TABLE_IS_COMMUTATIVE(table); ASSERT_TABLE_IS_DIAGONAL(table); + ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table); r.range = table[left.range][right.range]; break; } case nir_op_fmul: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); - const struct ssa_result_range right = analyze_expression(alu, 1, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range right = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); r.is_integral = left.is_integral && right.is_integral; @@ -634,28 +818,24 @@ } case nir_op_frcp: - r = (struct ssa_result_range){analyze_expression(alu, 0, ht).range, false}; + r = (struct ssa_result_range){ + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range, + false + }; break; - case nir_op_mov: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); - - /* See commentary in nir_op_bcsel for the reasons this is necessary. */ - if (nir_src_is_const(alu->src[0].src) && left.range != eq_zero) - return (struct ssa_result_range){unknown, false}; - - r = left; + case nir_op_mov: + r = analyze_expression(alu, 0, ht, use_type); break; - } case nir_op_fneg: - r = analyze_expression(alu, 0, ht); + r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); r.range = fneg_table[r.range]; break; case nir_op_fsat: - r = analyze_expression(alu, 0, ht); + r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); switch (r.range) { case le_zero: @@ -680,7 +860,10 @@ break; case nir_op_fsign: - r = (struct ssa_result_range){analyze_expression(alu, 0, ht).range, true}; + r = (struct ssa_result_range){ + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range, + true + }; break; case nir_op_fsqrt: @@ -689,7 +872,8 @@ break; case nir_op_ffloor: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); r.is_integral = true; @@ -704,7 +888,8 @@ } case nir_op_fceil: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); r.is_integral = true; @@ -719,7 +904,8 @@ } case nir_op_ftrunc: { - const struct ssa_result_range left = analyze_expression(alu, 0, ht); + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); r.is_integral = true; @@ -749,10 +935,78 @@ r = (struct ssa_result_range){le_zero, false}; break; + case nir_op_fpow: { + /* Due to flush-to-zero semanatics of floating-point numbers with very + * small mangnitudes, we can never really be sure a result will be + * non-zero. + * + * NIR uses pow() and powf() to constant evaluate nir_op_fpow. The man + * page for that function says: + * + * If y is 0, the result is 1.0 (even if x is a NaN). + * + * gt_zero: pow(*, eq_zero) + * | pow(eq_zero, lt_zero) # 0^-y = +inf + * | pow(eq_zero, le_zero) # 0^-y = +inf or 0^0 = 1.0 + * ; + * + * eq_zero: pow(eq_zero, gt_zero) + * ; + * + * ge_zero: pow(gt_zero, gt_zero) + * | pow(gt_zero, ge_zero) + * | pow(gt_zero, lt_zero) + * | pow(gt_zero, le_zero) + * | pow(gt_zero, ne_zero) + * | pow(gt_zero, unknown) + * | pow(ge_zero, gt_zero) + * | pow(ge_zero, ge_zero) + * | pow(ge_zero, lt_zero) + * | pow(ge_zero, le_zero) + * | pow(ge_zero, ne_zero) + * | pow(ge_zero, unknown) + * | pow(eq_zero, ge_zero) # 0^0 = 1.0 or 0^+y = 0.0 + * | pow(eq_zero, ne_zero) # 0^-y = +inf or 0^+y = 0.0 + * | pow(eq_zero, unknown) # union of all other y cases + * ; + * + * All other cases are unknown. + * + * We could do better if the right operand is a constant, integral + * value. + */ + static const enum ssa_ranges table[last_range + 1][last_range + 1] = { + /* left\right unknown lt_zero le_zero gt_zero ge_zero ne_zero eq_zero */ + /* unknown */ { _______, _______, _______, _______, _______, _______, gt_zero }, + /* lt_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, + /* le_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, + /* gt_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero }, + /* ge_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero }, + /* ne_zero */ { _______, _______, _______, _______, _______, _______, gt_zero }, + /* eq_zero */ { ge_zero, gt_zero, gt_zero, eq_zero, ge_zero, ge_zero, gt_zero }, + }; + + const struct ssa_result_range left = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range right = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); + + ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(table); + ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(table); + + r.is_integral = left.is_integral && right.is_integral && + is_not_negative(right.range); + r.range = table[left.range][right.range]; + break; + } + case nir_op_ffma: { - const struct ssa_result_range first = analyze_expression(alu, 0, ht); - const struct ssa_result_range second = analyze_expression(alu, 1, ht); - const struct ssa_result_range third = analyze_expression(alu, 2, ht); + const struct ssa_result_range first = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range second = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); + const struct ssa_result_range third = + analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2)); r.is_integral = first.is_integral && second.is_integral && third.is_integral; @@ -775,9 +1029,12 @@ } case nir_op_flrp: { - const struct ssa_result_range first = analyze_expression(alu, 0, ht); - const struct ssa_result_range second = analyze_expression(alu, 1, ht); - const struct ssa_result_range third = analyze_expression(alu, 2, ht); + const struct ssa_result_range first = + analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)); + const struct ssa_result_range second = + analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1)); + const struct ssa_result_range third = + analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2)); r.is_integral = first.is_integral && second.is_integral && third.is_integral; @@ -801,20 +1058,16 @@ if (r.range == eq_zero) r.is_integral = true; - _mesa_hash_table_insert(ht, alu, pack_data(r)); + _mesa_hash_table_insert(ht, pack_key(alu, use_type), pack_data(r)); return r; } #undef _______ struct ssa_result_range -nir_analyze_range(const nir_alu_instr *instr, unsigned src) +nir_analyze_range(struct hash_table *range_ht, + const nir_alu_instr *instr, unsigned src) { - struct hash_table *ht = _mesa_pointer_hash_table_create(NULL); - - const struct ssa_result_range r = analyze_expression(instr, src, ht); - - _mesa_hash_table_destroy(ht, NULL); - - return r; + return analyze_expression(instr, src, range_ht, + nir_alu_src_type(instr, src)); } diff -Nru mesa-19.2.8/src/compiler/nir/nir_range_analysis.h mesa-20.0.8/src/compiler/nir/nir_range_analysis.h --- mesa-19.2.8/src/compiler/nir/nir_range_analysis.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_range_analysis.h 2020-06-12 01:21:16.000000000 +0000 @@ -42,6 +42,7 @@ }; extern struct ssa_result_range -nir_analyze_range(const nir_alu_instr *instr, unsigned src); +nir_analyze_range(struct hash_table *range_ht, + const nir_alu_instr *instr, unsigned src); #endif /* _NIR_RANGE_ANALYSIS_H_ */ diff -Nru mesa-19.2.8/src/compiler/nir/nir_schedule.c mesa-20.0.8/src/compiler/nir/nir_schedule.c --- mesa-19.2.8/src/compiler/nir/nir_schedule.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_schedule.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1087 @@ +/* + * Copyright © 2019 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "util/dag.h" +#include "util/u_dynarray.h" + +/** @file + * + * Implements basic-block-level prepass instruction scheduling in NIR to + * manage register pressure. + * + * This is based on the Goodman/Hsu paper (1988, cached copy at + * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We + * make up the DDG for NIR (which can be mostly done using the NIR def/use + * chains for SSA instructions, plus some edges for ordering register writes + * vs reads, and some more for ordering intrinsics). Then we pick heads off + * of the DDG using their heuristic to emit the NIR instructions back into the + * block in their new order. + * + * The hard case for prepass scheduling on GPUs seems to always be consuming + * texture/ubo results. The register pressure heuristic doesn't want to pick + * an instr that starts consuming texture results because it usually won't be + * the only usage, so that instruction will increase pressure. + * + * If you try to force consumption of tex results always, then in a case where + * single sample is used for many outputs, you'll end up picking every other + * user and expanding register pressure. The partially_evaluated_path flag + * helps tremendously, in that if you happen for whatever reason to pick a + * texture sample's output, then you'll try to finish off that sample. Future + * work may include doing some local search before locking in a choice, to try + * to more reliably find the case where just a few choices going against the + * heuristic can manage to free the whole vector. + */ + +static bool debug; + +/** + * Represents a node in the DDG for a NIR instruction. + */ +typedef struct { + struct dag_node dag; /* must be first for our u_dynarray_foreach */ + nir_instr *instr; + bool partially_evaluated_path; + + /* Approximate estimate of the delay between starting this instruction and + * its results being available. + * + * Accuracy is not too important, given that we're prepass scheduling here + * and just trying to reduce excess dependencies introduced by a register + * allocator by stretching out the live intervals of expensive + * instructions. + */ + uint32_t delay; + + /* Cost of the maximum-delay path from this node to the leaves. */ + uint32_t max_delay; + + /* scoreboard->time value when this instruction can be scheduled without + * any stalls expected. + */ + uint32_t ready_time; +} nir_schedule_node; + +typedef struct { + struct dag *dag; + + nir_shader *shader; + + /* Mapping from nir_register * or nir_ssa_def * to a struct set of + * instructions remaining to be scheduled using the register. + */ + struct hash_table *remaining_uses; + + /* Map from nir_instr to nir_schedule_node * */ + struct hash_table *instr_map; + + /* Set of nir_register * or nir_ssa_def * that have had any instruction + * scheduled on them. + */ + struct set *live_values; + + /* An abstract approximation of the number of nir_scheduler_node->delay + * units since the start of the shader. + */ + uint32_t time; + + /* Number of channels currently used by the NIR instructions that have been + * scheduled. + */ + int pressure; + + /* Number of channels that may be in use before we switch to the + * pressure-prioritizing scheduling heuristic. + */ + int threshold; +} nir_schedule_scoreboard; + +/* When walking the instructions in reverse, we use this flag to swap + * before/after in add_dep(). + */ +enum direction { F, R }; + +typedef struct { + nir_shader *shader; + + /* Map from nir_instr to nir_schedule_node * */ + struct hash_table *instr_map; + /* Map from nir_register to nir_schedule_node * */ + struct hash_table *reg_map; + + /* Scheduler nodes for last instruction involved in some class of dependency. + */ + nir_schedule_node *load_input; + nir_schedule_node *store_shared; + nir_schedule_node *unknown_intrinsic; + nir_schedule_node *discard; + nir_schedule_node *jump; + + enum direction dir; +} nir_deps_state; + +static void * +_mesa_hash_table_search_data(struct hash_table *ht, void *key) +{ + struct hash_entry *entry = _mesa_hash_table_search(ht, key); + if (!entry) + return NULL; + return entry->data; +} + +static nir_schedule_node * +nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr) +{ + return _mesa_hash_table_search_data(instr_map, instr); +} + +static struct set * +nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src) +{ + if (src->is_ssa) { + return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa); + } else { + return _mesa_hash_table_search_data(scoreboard->remaining_uses, + src->reg.reg); + } +} + +static int +nir_schedule_def_pressure(nir_ssa_def *def) +{ + return def->num_components; +} + +static int +nir_schedule_src_pressure(nir_src *src) +{ + if (src->is_ssa) + return nir_schedule_def_pressure(src->ssa); + else + return src->reg.reg->num_components; +} + +static int +nir_schedule_dest_pressure(nir_dest *dest) +{ + if (dest->is_ssa) + return nir_schedule_def_pressure(&dest->ssa); + else + return dest->reg.reg->num_components; +} + +/** + * Adds a dependency such that @after must appear in the final program after + * @before. + * + * We add @before as a child of @after, so that DAG heads are the outputs of + * the program and we make our scheduling decisions bottom to top. + */ +static void +add_dep(nir_deps_state *state, + nir_schedule_node *before, + nir_schedule_node *after) +{ + if (!before || !after) + return; + + assert(before != after); + + if (state->dir == F) + dag_add_edge(&before->dag, &after->dag, NULL); + else + dag_add_edge(&after->dag, &before->dag, NULL); +} + + +static void +add_read_dep(nir_deps_state *state, + nir_schedule_node *before, + nir_schedule_node *after) +{ + add_dep(state, before, after); +} + +static void +add_write_dep(nir_deps_state *state, + nir_schedule_node **before, + nir_schedule_node *after) +{ + add_dep(state, *before, after); + *before = after; +} + +static bool +nir_schedule_reg_src_deps(nir_src *src, void *in_state) +{ + nir_deps_state *state = in_state; + + if (src->is_ssa) + return true; + + struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, + src->reg.reg); + if (!entry) + return true; + nir_schedule_node *dst_n = entry->data; + + nir_schedule_node *src_n = nir_schedule_get_node(state->instr_map, + src->parent_instr); + + add_dep(state, dst_n, src_n); + + return true; +} + +static bool +nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state) +{ + nir_deps_state *state = in_state; + + if (dest->is_ssa) + return true; + + nir_schedule_node *dest_n = nir_schedule_get_node(state->instr_map, + dest->reg.parent_instr); + + struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, + dest->reg.reg); + if (!entry) { + _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n); + return true; + } + nir_schedule_node **before = (nir_schedule_node **)&entry->data; + + add_write_dep(state, before, dest_n); + + return true; +} + +static bool +nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state) +{ + nir_deps_state *state = in_state; + nir_schedule_node *def_n = nir_schedule_get_node(state->instr_map, def->parent_instr); + + nir_foreach_use(src, def) { + nir_schedule_node *use_n = nir_schedule_get_node(state->instr_map, + src->parent_instr); + + add_read_dep(state, def_n, use_n); + } + + return true; +} + +static void +nir_schedule_intrinsic_deps(nir_deps_state *state, + nir_intrinsic_instr *instr) +{ + nir_schedule_node *n = nir_schedule_get_node(state->instr_map, &instr->instr); + + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_front_face: + break; + + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + /* We are adding two dependencies: + * + * * A individual one that we could use to add a read_dep while handling + * nir_instr_type_tex + * + * * Include it on the unknown intrinsic set, as we want discard to be + * serialized in in the same order relative to intervening stores or + * atomic accesses to SSBOs and images + */ + add_write_dep(state, &state->discard, n); + add_write_dep(state, &state->unknown_intrinsic, n); + break; + + case nir_intrinsic_store_output: + /* For some non-FS shader stages, or for some hardware, output stores + * affect the same shared memory as input loads. + */ + if (state->shader->info.stage != MESA_SHADER_FRAGMENT) + add_write_dep(state, &state->load_input, n); + + /* Make sure that preceding discards stay before the store_output */ + add_read_dep(state, state->discard, n); + + break; + + case nir_intrinsic_load_input: + add_read_dep(state, state->load_input, n); + break; + + case nir_intrinsic_load_shared: + /* Don't move load_shared beyond a following store_shared, as it could + * change their value + */ + add_read_dep(state, state->store_shared, n); + break; + + case nir_intrinsic_store_shared: + add_write_dep(state, &state->store_shared, n); + break; + + case nir_intrinsic_control_barrier: + case nir_intrinsic_memory_barrier_shared: + add_write_dep(state, &state->store_shared, n); + + /* Serialize against ssbos/atomics/etc. */ + add_write_dep(state, &state->unknown_intrinsic, n); + break; + + default: + /* Attempt to handle other intrinsics that we haven't individually + * categorized by serializing them in the same order relative to each + * other. + */ + add_write_dep(state, &state->unknown_intrinsic, n); + break; + } +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all reads of r4 have to happen between the r4 + * writes that surround them". + */ +static void +nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n) +{ + nir_instr *instr = n->instr; + + /* For NIR SSA defs, we only need to do a single pass of making the uses + * depend on the def. + */ + if (state->dir == F) + nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state); + + /* For NIR regs, track the last writer in the scheduler state so that we + * can keep the writes in order and let reads get reordered only between + * each write. + */ + nir_foreach_src(instr, nir_schedule_reg_src_deps, state); + + nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state); + + /* Make sure any other instructions keep their positions relative to + * jumps. + */ + if (instr->type != nir_instr_type_jump) + add_read_dep(state, state->jump, n); + + switch (instr->type) { + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + break; + + case nir_instr_type_tex: + /* Don't move texture ops before a discard, as that could increase + * memory bandwidth for reading the discarded samples. + */ + add_read_dep(state, state->discard, n); + break; + + case nir_instr_type_jump: + add_write_dep(state, &state->jump, n); + break; + + case nir_instr_type_call: + unreachable("Calls should have been lowered"); + break; + + case nir_instr_type_parallel_copy: + unreachable("Parallel copies should have been lowered"); + break; + + case nir_instr_type_phi: + unreachable("nir_schedule() should be called after lowering from SSA"); + break; + + case nir_instr_type_intrinsic: + nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr)); + break; + } +} + +static void +calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + nir_deps_state state = { + .shader = scoreboard->shader, + .dir = F, + .instr_map = scoreboard->instr_map, + .reg_map = _mesa_pointer_hash_table_create(NULL), + }; + + nir_foreach_instr(instr, block) { + nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, + instr); + nir_schedule_calculate_deps(&state, node); + } + + ralloc_free(state.reg_map); +} + +static void +calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + nir_deps_state state = { + .shader = scoreboard->shader, + .dir = R, + .instr_map = scoreboard->instr_map, + .reg_map = _mesa_pointer_hash_table_create(NULL), + }; + + nir_foreach_instr_reverse(instr, block) { + nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, + instr); + nir_schedule_calculate_deps(&state, node); + } + + ralloc_free(state.reg_map); +} + +typedef struct { + nir_schedule_scoreboard *scoreboard; + int regs_freed; +} nir_schedule_regs_freed_state; + +static bool +nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + nir_schedule_scoreboard *scoreboard = state->scoreboard; + struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); + + if (remaining_uses->entries == 1 && + _mesa_set_search(remaining_uses, src->parent_instr)) { + state->regs_freed += nir_schedule_src_pressure(src); + } + + return true; +} + +static bool +nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + + state->regs_freed -= nir_schedule_def_pressure(def); + + return true; +} + +static bool +nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state) +{ + nir_schedule_regs_freed_state *state = in_state; + nir_schedule_scoreboard *scoreboard = state->scoreboard; + + if (dest->is_ssa) + return true; + + nir_register *reg = dest->reg.reg; + + /* Only the first def of a reg counts against register pressure. */ + if (!_mesa_set_search(scoreboard->live_values, reg)) + state->regs_freed -= nir_schedule_dest_pressure(dest); + + return true; +} + +static int +nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n) +{ + nir_schedule_regs_freed_state state = { + .scoreboard = scoreboard, + }; + + nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state); + + nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state); + + nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state); + + return state.regs_freed; +} + +/** + * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code + * Scheduling for Parallelism) heuristic. + * + * Picks an instruction on the critical that's ready to execute without + * stalls, if possible, otherwise picks the instruction on the critical path. + */ +static nir_schedule_node * +nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard) +{ + nir_schedule_node *chosen = NULL; + + /* Find the leader in the ready (shouldn't-stall) set with the maximum + * cost. + */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (scoreboard->time < n->ready_time) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (ready): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Otherwise, choose the leader with the maximum cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (debug) { + fprintf(stderr, "chose (leader): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; +} + +/** + * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code + * Scheduling for Register pressure) heuristic. + */ +static nir_schedule_node * +nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard) +{ + nir_schedule_node *chosen = NULL; + + /* Find a ready inst with regs freed and pick the one with max cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->ready_time > scoreboard->time) + continue; + + int regs_freed = nir_schedule_regs_freed(scoreboard, n); + + if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (freed+ready): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Find a leader with regs freed and pick the one with max cost. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + int regs_freed = nir_schedule_regs_freed(scoreboard, n); + + if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (regs freed): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Find a partially evaluated path and try to finish it off */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->partially_evaluated_path && + (!chosen || chosen->max_delay < n->max_delay)) { + chosen = n; + } + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (partial path): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Contra the paper, pick a leader with no effect on used regs. This may + * open up new opportunities, as otherwise a single-operand instr consuming + * a value will tend to block finding freeing that value. This had a + * massive effect on reducing spilling on V3D. + * + * XXX: Should this prioritize ready? + */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (nir_schedule_regs_freed(scoreboard, n) != 0) + continue; + + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (regs no-op): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; + } + + /* Pick the max delay of the remaining ready set. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (n->ready_time > scoreboard->time) + continue; + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + if (chosen) { + if (debug) { + fprintf(stderr, "chose (ready max delay): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + return chosen; + } + + /* Pick the max delay of the remaining leaders. */ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + if (!chosen || chosen->max_delay < n->max_delay) + chosen = n; + } + + if (debug) { + fprintf(stderr, "chose (max delay): "); + nir_print_instr(chosen->instr, stderr); + fprintf(stderr, "\n"); + } + + return chosen; +} + +static void +dump_state(nir_schedule_scoreboard *scoreboard) +{ + list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { + fprintf(stderr, "maxdel %5d ", n->max_delay); + nir_print_instr(n->instr, stderr); + fprintf(stderr, "\n"); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + + fprintf(stderr, " -> (%d parents) ", child->dag.parent_count); + nir_print_instr(child->instr, stderr); + fprintf(stderr, "\n"); + } + } +} + +static void +nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard, + void *reg_or_def, + nir_instr *reg_or_def_parent, + int pressure) +{ + /* Make the value live if it's the first time it's been used. */ + if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) { + _mesa_set_add(scoreboard->live_values, reg_or_def); + scoreboard->pressure += pressure; + } + + /* Make the value dead if it's the last remaining use. Be careful when one + * instruction uses a value twice to not decrement pressure twice. + */ + struct set *remaining_uses = + _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def); + struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent); + if (entry) { + _mesa_set_remove(remaining_uses, entry); + + if (remaining_uses->entries == 0) + scoreboard->pressure -= pressure; + } +} + +static bool +nir_schedule_mark_src_scheduled(nir_src *src, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); + + struct set_entry *entry = _mesa_set_search(remaining_uses, + src->parent_instr); + if (entry) { + /* Once we've used an SSA value in one instruction, bump the priority of + * the other uses so the SSA value can get fully consumed. + * + * We don't do this for registers, and it's would be a hassle and it's + * unclear if that would help or not. Also, skip it for constants, as + * they're often folded as immediates into backend instructions and have + * many unrelated instructions all referencing the same value (0). + */ + if (src->is_ssa && + src->ssa->parent_instr->type != nir_instr_type_load_const) { + nir_foreach_use(other_src, src->ssa) { + if (other_src->parent_instr == src->parent_instr) + continue; + + nir_schedule_node *n = + nir_schedule_get_node(scoreboard->instr_map, + other_src->parent_instr); + + if (n && !n->partially_evaluated_path) { + if (debug) { + fprintf(stderr, " New partially evaluated path: "); + nir_print_instr(n->instr, stderr); + fprintf(stderr, "\n"); + } + + n->partially_evaluated_path = true; + } + } + } + } + + nir_schedule_mark_use(scoreboard, + src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg, + src->parent_instr, + nir_schedule_src_pressure(src)); + + return true; +} + +static bool +nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + + nir_schedule_mark_use(scoreboard, def, def->parent_instr, + nir_schedule_def_pressure(def)); + + return true; +} + +static bool +nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + + /* SSA defs were handled in nir_schedule_mark_def_scheduled() + */ + if (dest->is_ssa) + return true; + + /* XXX: This is not actually accurate for regs -- the last use of a reg may + * have a live interval that extends across control flow. We should + * calculate the live ranges of regs, and have scheduler nodes for the CF + * nodes that also "use" the reg. + */ + nir_schedule_mark_use(scoreboard, dest->reg.reg, + dest->reg.parent_instr, + nir_schedule_dest_pressure(dest)); + + return true; +} + +static void +nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard, + nir_schedule_node *n) +{ + nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard); + nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard); + nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + + child->ready_time = MAX2(child->ready_time, + scoreboard->time + n->delay); + + if (child->dag.parent_count == 1) { + if (debug) { + fprintf(stderr, " New DAG head: "); + nir_print_instr(child->instr, stderr); + fprintf(stderr, "\n"); + } + } + } + + dag_prune_head(scoreboard->dag, &n->dag); + + scoreboard->time = MAX2(n->ready_time, scoreboard->time); + scoreboard->time++; +} + +static void +nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + while (!list_is_empty(&scoreboard->dag->heads)) { + if (debug) { + fprintf(stderr, "current list:\n"); + dump_state(scoreboard); + } + + nir_schedule_node *chosen; + if (scoreboard->pressure < scoreboard->threshold) + chosen = nir_schedule_choose_instruction_csp(scoreboard); + else + chosen = nir_schedule_choose_instruction_csr(scoreboard); + + /* Now that we've scheduled a new instruction, some of its children may + * be promoted to the list of instructions ready to be scheduled. + */ + nir_schedule_mark_node_scheduled(scoreboard, chosen); + + /* Move the instruction to the end (so our first chosen instructions are + * the start of the program). + */ + exec_node_remove(&chosen->instr->node); + exec_list_push_tail(&block->instr_list, &chosen->instr->node); + + if (debug) + fprintf(stderr, "\n"); + } +} + +static uint32_t +nir_schedule_get_delay(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + case nir_instr_type_alu: + case nir_instr_type_deref: + case nir_instr_type_jump: + case nir_instr_type_parallel_copy: + case nir_instr_type_call: + case nir_instr_type_phi: + return 1; + + case nir_instr_type_intrinsic: + /* XXX: Pick a large number for UBO/SSBO/image/shared loads */ + return 1; + + case nir_instr_type_tex: + /* Pick some large number to try to fetch textures early and sample them + * late. + */ + return 100; + } + + return 0; +} + +static void +nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state) +{ + nir_schedule_node *n = (nir_schedule_node *)node; + uint32_t max_delay = 0; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + nir_schedule_node *child = (nir_schedule_node *)edge->child; + max_delay = MAX2(child->max_delay, max_delay); + } + + n->max_delay = MAX2(n->max_delay, max_delay + n->delay); + } + +static void +nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block) +{ + void *mem_ctx = ralloc_context(NULL); + scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx); + + scoreboard->dag = dag_create(mem_ctx); + + nir_foreach_instr(instr, block) { + nir_schedule_node *n = + rzalloc(mem_ctx, nir_schedule_node); + + n->instr = instr; + n->delay = nir_schedule_get_delay(instr); + dag_init_node(scoreboard->dag, &n->dag); + + _mesa_hash_table_insert(scoreboard->instr_map, instr, n); + } + + calculate_forward_deps(scoreboard, block); + calculate_reverse_deps(scoreboard, block); + + dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL); + + nir_schedule_instructions(scoreboard, block); + + ralloc_free(mem_ctx); + scoreboard->instr_map = NULL; +} + +static bool +nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state) +{ + nir_schedule_scoreboard *scoreboard = state; + struct set *def_uses = _mesa_pointer_set_create(scoreboard); + + _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses); + + _mesa_set_add(def_uses, def->parent_instr); + + nir_foreach_use(src, def) { + _mesa_set_add(def_uses, src->parent_instr); + } + + /* XXX: Handle if uses */ + + return true; +} + +static nir_schedule_scoreboard * +nir_schedule_get_scoreboard(nir_shader *shader, int threshold) +{ + nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard); + + scoreboard->shader = shader; + scoreboard->live_values = _mesa_pointer_set_create(scoreboard); + scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard); + scoreboard->threshold = threshold; + scoreboard->pressure = 0; + + nir_foreach_function(function, shader) { + nir_foreach_register(reg, &function->impl->registers) { + struct set *register_uses = + _mesa_pointer_set_create(scoreboard); + + _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses); + + nir_foreach_use(src, reg) { + _mesa_set_add(register_uses, src->parent_instr); + } + + /* XXX: Handle if uses */ + + nir_foreach_def(dest, reg) { + _mesa_set_add(register_uses, dest->reg.parent_instr); + } + } + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard, + scoreboard); + } + + /* XXX: We're ignoring if uses, which may prioritize scheduling other + * uses of the if src even when it doesn't help. That's not many + * values, though, so meh. + */ + } + } + + return scoreboard; +} + +static void +nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard) +{ +#ifdef NDEBUG + return; +#endif + + bool any_uses = false; + + hash_table_foreach(scoreboard->remaining_uses, entry) { + struct set *remaining_uses = entry->data; + + set_foreach(remaining_uses, instr_entry) { + if (!any_uses) { + fprintf(stderr, "Tracked uses remain after scheduling. " + "Affected instructions: \n"); + any_uses = true; + } + nir_print_instr(instr_entry->key, stderr); + fprintf(stderr, "\n"); + } + } + + assert(!any_uses); +} + +/** + * Schedules the NIR instructions to try to decrease stalls (for example, + * delaying texture reads) while managing register pressure. + * + * The threshold represents "number of NIR register/SSA def channels live + * before switching the scheduling heuristic to reduce register pressure", + * since most of our GPU architectures are scalar (extending to vector with a + * flag wouldn't be hard). This number should be a bit below the number of + * registers available (counting any that may be occupied by system value + * payload values, for example), since the heuristic may not always be able to + * free a register immediately. The amount below the limit is up to you to + * tune. + */ +void +nir_schedule(nir_shader *shader, int threshold) +{ + nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader, + threshold); + + if (debug) { + fprintf(stderr, "NIR shader before scheduling:\n"); + nir_print_shader(shader, stderr); + } + + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_schedule_block(scoreboard, block); + } + } + + nir_schedule_validate_uses(scoreboard); + + ralloc_free(scoreboard); +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_search.c mesa-20.0.8/src/compiler/nir/nir_search.c --- mesa-19.2.8/src/compiler/nir/nir_search.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_search.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,7 @@ #include #include "nir_search.h" #include "nir_builder.h" +#include "nir_worklist.h" #include "util/half_float.h" /* This should be the same as nir_search_max_comm_ops in nir_algebraic.py. */ @@ -38,15 +39,30 @@ bool has_exact_alu; uint8_t comm_op_direction; unsigned variables_seen; + + /* Used for running the automaton on newly-constructed instructions. */ + struct util_dynarray *states; + const struct per_op_table *pass_op_table; + nir_alu_src variables[NIR_SEARCH_MAX_VARIABLES]; + struct hash_table *range_ht; }; static bool match_expression(const nir_search_expression *expr, nir_alu_instr *instr, unsigned num_components, const uint8_t *swizzle, struct match_state *state); +static bool +nir_algebraic_automaton(nir_instr *instr, struct util_dynarray *states, + const struct per_op_table *pass_op_table); -static const uint8_t identity_swizzle[NIR_MAX_VEC_COMPONENTS] = { 0, 1, 2, 3 }; +static const uint8_t identity_swizzle[NIR_MAX_VEC_COMPONENTS] = +{ + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, +}; /** * Check if a source produces a value of the given type. @@ -297,7 +313,8 @@ instr->src[src].src.ssa->parent_instr->type != nir_instr_type_load_const) return false; - if (var->cond && !var->cond(instr, src, num_components, new_swizzle)) + if (var->cond && !var->cond(state->range_ht, instr, + src, num_components, new_swizzle)) return false; if (var->type != nir_type_invalid && @@ -488,6 +505,11 @@ nir_builder_instr_insert(build, &alu->instr); + assert(alu->dest.dest.ssa.index == + util_dynarray_num_elements(state->states, uint16_t)); + util_dynarray_append(state->states, uint16_t, 0); + nir_algebraic_automaton(&alu->instr, state->states, state->pass_op_table); + nir_alu_src val; val.src = nir_src_for_ssa(&alu->dest.dest.ssa); val.negate = false; @@ -535,6 +557,12 @@ unreachable("Invalid alu source type"); } + assert(cval->index == + util_dynarray_num_elements(state->states, uint16_t)); + util_dynarray_append(state->states, uint16_t, 0); + nir_algebraic_automaton(cval->parent_instr, state->states, + state->pass_op_table); + nir_alu_src val; val.src = nir_src_for_ssa(cval); val.negate = false; @@ -556,16 +584,16 @@ const nir_search_constant *sconst = nir_search_value_as_constant(val); switch (sconst->type) { case nir_type_float: - printf("%f", sconst->data.d); + fprintf(stderr, "%f", sconst->data.d); break; case nir_type_int: - printf("%"PRId64, sconst->data.i); + fprintf(stderr, "%"PRId64, sconst->data.i); break; case nir_type_uint: - printf("0x%"PRIx64, sconst->data.u); + fprintf(stderr, "0x%"PRIx64, sconst->data.u); break; case nir_type_bool: - printf("%s", sconst->data.u != 0 ? "True" : "False"); + fprintf(stderr, "%s", sconst->data.u != 0 ? "True" : "False"); break; default: unreachable("bad const type"); @@ -576,19 +604,19 @@ case nir_search_value_variable: { const nir_search_variable *var = nir_search_value_as_variable(val); if (var->is_constant) - printf("#"); - printf("%c", var->variable + 'a'); + fprintf(stderr, "#"); + fprintf(stderr, "%c", var->variable + 'a'); break; } case nir_search_value_expression: { const nir_search_expression *expr = nir_search_value_as_expression(val); - printf("("); + fprintf(stderr, "("); if (expr->inexact) - printf("~"); + fprintf(stderr, "~"); switch (expr->opcode) { #define CASE(n) \ - case nir_search_op_##n: printf(#n); break; + case nir_search_op_##n: fprintf(stderr, #n); break; CASE(f2b) CASE(b2f) CASE(b2i) @@ -598,7 +626,7 @@ CASE(i2f) #undef CASE default: - printf("%s", nir_op_infos[expr->opcode].name); + fprintf(stderr, "%s", nir_op_infos[expr->opcode].name); } unsigned num_srcs = 1; @@ -606,23 +634,63 @@ num_srcs = nir_op_infos[expr->opcode].num_inputs; for (unsigned i = 0; i < num_srcs; i++) { - printf(" "); + fprintf(stderr, " "); dump_value(expr->srcs[i]); } - printf(")"); + fprintf(stderr, ")"); break; } } if (val->bit_size > 0) - printf("@%d", val->bit_size); + fprintf(stderr, "@%d", val->bit_size); +} + +static void +add_uses_to_worklist(nir_instr *instr, nir_instr_worklist *worklist) +{ + nir_ssa_def *def = nir_instr_ssa_def(instr); + + nir_foreach_use_safe(use_src, def) { + nir_instr_worklist_push_tail(worklist, use_src->parent_instr); + } +} + +static void +nir_algebraic_update_automaton(nir_instr *new_instr, + nir_instr_worklist *algebraic_worklist, + struct util_dynarray *states, + const struct per_op_table *pass_op_table) +{ + + nir_instr_worklist *automaton_worklist = nir_instr_worklist_create(); + + /* Walk through the tree of uses of our new instruction's SSA value, + * recursively updating the automaton state until it stabilizes. + */ + add_uses_to_worklist(new_instr, automaton_worklist); + + nir_instr *instr; + while ((instr = nir_instr_worklist_pop_head(automaton_worklist))) { + if (nir_algebraic_automaton(instr, states, pass_op_table)) { + nir_instr_worklist_push_tail(algebraic_worklist, instr); + + add_uses_to_worklist(instr, automaton_worklist); + } + } + + nir_instr_worklist_destroy(automaton_worklist); } nir_ssa_def * nir_replace_instr(nir_builder *build, nir_alu_instr *instr, + struct hash_table *range_ht, + struct util_dynarray *states, + const struct per_op_table *pass_op_table, const nir_search_expression *search, - const nir_search_value *replace) + const nir_search_value *replace, + nir_instr_worklist *algebraic_worklist) { uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 }; @@ -634,6 +702,8 @@ struct match_state state; state.inexact_match = false; state.has_exact_alu = false; + state.range_ht = range_ht; + state.pass_op_table = pass_op_table; STATIC_ASSERT(sizeof(state.comm_op_direction) * 8 >= NIR_SEARCH_MAX_COMM_OPS); @@ -659,33 +729,208 @@ return NULL; #if 0 - printf("matched: "); + fprintf(stderr, "matched: "); dump_value(&search->value); - printf(" -> "); + fprintf(stderr, " -> "); dump_value(replace); - printf(" ssa_%d\n", instr->dest.dest.ssa.index); + fprintf(stderr, " ssa_%d\n", instr->dest.dest.ssa.index); #endif build->cursor = nir_before_instr(&instr->instr); + state.states = states; + nir_alu_src val = construct_value(build, replace, instr->dest.dest.ssa.num_components, instr->dest.dest.ssa.bit_size, &state, &instr->instr); - /* Inserting a mov may be unnecessary. However, it's much easier to - * simply let copy propagation clean this up than to try to go through - * and rewrite swizzles ourselves. + /* Note that NIR builder will elide the MOV if it's a no-op, which may + * allow more work to be done in a single pass through algebraic. */ nir_ssa_def *ssa_val = nir_mov_alu(build, val, instr->dest.dest.ssa.num_components); + if (ssa_val->index == util_dynarray_num_elements(states, uint16_t)) { + util_dynarray_append(states, uint16_t, 0); + nir_algebraic_automaton(ssa_val->parent_instr, states, pass_op_table); + } + + /* Rewrite the uses of the old SSA value to the new one, and recurse + * through the uses updating the automaton's state. + */ nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(ssa_val)); + nir_algebraic_update_automaton(ssa_val->parent_instr, algebraic_worklist, + states, pass_op_table); - /* We know this one has no more uses because we just rewrote them all, - * so we can remove it. The rest of the matched expression, however, we - * don't know so much about. We'll just let dead code clean them up. + /* Nothing uses the instr any more, so drop it out of the program. Note + * that the instr may be in the worklist still, so we can't free it + * directly. */ nir_instr_remove(&instr->instr); return ssa_val; } + +static bool +nir_algebraic_automaton(nir_instr *instr, struct util_dynarray *states, + const struct per_op_table *pass_op_table) +{ + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + nir_op op = alu->op; + uint16_t search_op = nir_search_op_for_nir_op(op); + const struct per_op_table *tbl = &pass_op_table[search_op]; + if (tbl->num_filtered_states == 0) + return false; + + /* Calculate the index into the transition table. Note the index + * calculated must match the iteration order of Python's + * itertools.product(), which was used to emit the transition + * table. + */ + unsigned index = 0; + for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) { + index *= tbl->num_filtered_states; + index += tbl->filter[*util_dynarray_element(states, uint16_t, + alu->src[i].src.ssa->index)]; + } + + uint16_t *state = util_dynarray_element(states, uint16_t, + alu->dest.dest.ssa.index); + if (*state != tbl->table[index]) { + *state = tbl->table[index]; + return true; + } + return false; + } + + case nir_instr_type_load_const: { + nir_load_const_instr *load_const = nir_instr_as_load_const(instr); + uint16_t *state = util_dynarray_element(states, uint16_t, + load_const->def.index); + if (*state != CONST_STATE) { + *state = CONST_STATE; + return true; + } + return false; + } + + default: + return false; + } +} + +static bool +nir_algebraic_instr(nir_builder *build, nir_instr *instr, + struct hash_table *range_ht, + const bool *condition_flags, + const struct transform **transforms, + const uint16_t *transform_counts, + struct util_dynarray *states, + const struct per_op_table *pass_op_table, + nir_instr_worklist *worklist) +{ + + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + if (!alu->dest.dest.is_ssa) + return false; + + unsigned bit_size = alu->dest.dest.ssa.bit_size; + const unsigned execution_mode = + build->shader->info.float_controls_execution_mode; + const bool ignore_inexact = + nir_is_float_control_signed_zero_inf_nan_preserve(execution_mode, bit_size) || + nir_is_denorm_flush_to_zero(execution_mode, bit_size); + + int xform_idx = *util_dynarray_element(states, uint16_t, + alu->dest.dest.ssa.index); + for (uint16_t i = 0; i < transform_counts[xform_idx]; i++) { + const struct transform *xform = &transforms[xform_idx][i]; + if (condition_flags[xform->condition_offset] && + !(xform->search->inexact && ignore_inexact) && + nir_replace_instr(build, alu, range_ht, states, pass_op_table, + xform->search, xform->replace, worklist)) { + _mesa_hash_table_clear(range_ht, NULL); + return true; + } + } + + return false; +} + +bool +nir_algebraic_impl(nir_function_impl *impl, + const bool *condition_flags, + const struct transform **transforms, + const uint16_t *transform_counts, + const struct per_op_table *pass_op_table) +{ + bool progress = false; + + nir_builder build; + nir_builder_init(&build, impl); + + /* Note: it's important here that we're allocating a zeroed array, since + * state 0 is the default state, which means we don't have to visit + * anything other than constants and ALU instructions. + */ + struct util_dynarray states = {0}; + if (!util_dynarray_resize(&states, uint16_t, impl->ssa_alloc)) + return false; + memset(states.data, 0, states.size); + + struct hash_table *range_ht = _mesa_pointer_hash_table_create(NULL); + + nir_instr_worklist *worklist = nir_instr_worklist_create(); + + /* Walk top-to-bottom setting up the automaton state. */ + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + nir_algebraic_automaton(instr, &states, pass_op_table); + } + } + + /* Put our instrs in the worklist such that we're popping the last instr + * first. This will encourage us to match the biggest source patterns when + * possible. + */ + nir_foreach_block_reverse(block, impl) { + nir_foreach_instr_reverse(instr, block) { + nir_instr_worklist_push_tail(worklist, instr); + } + } + + nir_instr *instr; + while ((instr = nir_instr_worklist_pop_head(worklist))) { + /* The worklist can have an instr pushed to it multiple times if it was + * the src of multiple instrs that also got optimized, so make sure that + * we don't try to re-optimize an instr we already handled. + */ + if (exec_node_is_tail_sentinel(&instr->node)) + continue; + + progress |= nir_algebraic_instr(&build, instr, + range_ht, condition_flags, + transforms, transform_counts, &states, + pass_op_table, worklist); + } + + nir_instr_worklist_destroy(worklist); + ralloc_free(range_ht); + util_dynarray_fini(&states); + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } else { +#ifndef NDEBUG + impl->valid_metadata &= ~nir_metadata_not_properly_reset; +#endif + } + + return progress; +} diff -Nru mesa-19.2.8/src/compiler/nir/nir_search.h mesa-20.0.8/src/compiler/nir/nir_search.h --- mesa-19.2.8/src/compiler/nir/nir_search.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_search.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,6 +29,8 @@ #define _NIR_SEARCH_ #include "nir.h" +#include "nir_worklist.h" +#include "util/u_dynarray.h" #define NIR_SEARCH_MAX_VARIABLES 16 @@ -93,11 +95,11 @@ * variables to require, for example, power-of-two in order for the search * to match. */ - bool (*cond)(nir_alu_instr *instr, unsigned src, + bool (*cond)(struct hash_table *range_ht, nir_alu_instr *instr, unsigned src, unsigned num_components, const uint8_t *swizzle); - /** Swizzle (for replace only) */ - uint8_t swizzle[NIR_MAX_VEC_COMPONENTS]; + /** Swizzle (for replace only) */ + uint8_t swizzle[NIR_MAX_VEC_COMPONENTS]; } nir_search_variable; typedef struct { @@ -166,6 +168,25 @@ bool (*cond)(nir_alu_instr *instr); } nir_search_expression; +struct per_op_table { + const uint16_t *filter; + unsigned num_filtered_states; + const uint16_t *table; +}; + +struct transform { + const nir_search_expression *search; + const nir_search_value *replace; + unsigned condition_offset; +}; + +/* Note: these must match the start states created in + * TreeAutomaton._build_table() + */ + +/* WILDCARD_STATE = 0 is set by zeroing the state array */ +static const uint16_t CONST_STATE = 1; + NIR_DEFINE_CAST(nir_search_value_as_variable, nir_search_value, nir_search_variable, value, type, nir_search_value_variable) @@ -178,7 +199,17 @@ nir_ssa_def * nir_replace_instr(struct nir_builder *b, nir_alu_instr *instr, + struct hash_table *range_ht, + struct util_dynarray *states, + const struct per_op_table *pass_op_table, const nir_search_expression *search, - const nir_search_value *replace); + const nir_search_value *replace, + nir_instr_worklist *algebraic_worklist); +bool +nir_algebraic_impl(nir_function_impl *impl, + const bool *condition_flags, + const struct transform **transforms, + const uint16_t *transform_counts, + const struct per_op_table *pass_op_table); #endif /* _NIR_SEARCH_ */ diff -Nru mesa-19.2.8/src/compiler/nir/nir_search_helpers.h mesa-20.0.8/src/compiler/nir/nir_search_helpers.h --- mesa-19.2.8/src/compiler/nir/nir_search_helpers.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_search_helpers.h 2020-06-12 01:21:16.000000000 +0000 @@ -33,7 +33,8 @@ #include static inline bool -is_pos_power_of_two(nir_alu_instr *instr, unsigned src, unsigned num_components, +is_pos_power_of_two(UNUSED struct hash_table *ht, nir_alu_instr *instr, + unsigned src, unsigned num_components, const uint8_t *swizzle) { /* only constant srcs: */ @@ -41,7 +42,8 @@ return false; for (unsigned i = 0; i < num_components; i++) { - switch (nir_op_infos[instr->op].input_types[src]) { + nir_alu_type type = nir_op_infos[instr->op].input_types[src]; + switch (nir_alu_type_get_base_type(type)) { case nir_type_int: { int64_t val = nir_src_comp_as_int(instr->src[src].src, swizzle[i]); if (val <= 0 || !util_is_power_of_two_or_zero64(val)) @@ -63,7 +65,8 @@ } static inline bool -is_neg_power_of_two(nir_alu_instr *instr, unsigned src, unsigned num_components, +is_neg_power_of_two(UNUSED struct hash_table *ht, nir_alu_instr *instr, + unsigned src, unsigned num_components, const uint8_t *swizzle) { /* only constant srcs: */ @@ -71,7 +74,8 @@ return false; for (unsigned i = 0; i < num_components; i++) { - switch (nir_op_infos[instr->op].input_types[src]) { + nir_alu_type type = nir_op_infos[instr->op].input_types[src]; + switch (nir_alu_type_get_base_type(type)) { case nir_type_int: { int64_t val = nir_src_comp_as_int(instr->src[src].src, swizzle[i]); if (val >= 0 || !util_is_power_of_two_or_zero64(-val)) @@ -86,8 +90,35 @@ return true; } +#define MULTIPLE(test) \ +static inline bool \ +is_unsigned_multiple_of_ ## test(UNUSED struct hash_table *ht, nir_alu_instr *instr, \ + unsigned src, unsigned num_components, \ + const uint8_t *swizzle) \ +{ \ + /* only constant srcs: */ \ + if (!nir_src_is_const(instr->src[src].src)) \ + return false; \ + \ + for (unsigned i = 0; i < num_components; i++) { \ + uint64_t val = nir_src_comp_as_uint(instr->src[src].src, swizzle[i]); \ + if (val % test != 0) \ + return false; \ + } \ + \ + return true; \ +} + +MULTIPLE(2) +MULTIPLE(4) +MULTIPLE(8) +MULTIPLE(16) +MULTIPLE(32) +MULTIPLE(64) + static inline bool -is_zero_to_one(nir_alu_instr *instr, unsigned src, unsigned num_components, +is_zero_to_one(UNUSED struct hash_table *ht, nir_alu_instr *instr, unsigned src, + unsigned num_components, const uint8_t *swizzle) { /* only constant srcs: */ @@ -117,7 +148,8 @@ * 1 while this function tests 0 < src < 1. */ static inline bool -is_gt_0_and_lt_1(nir_alu_instr *instr, unsigned src, unsigned num_components, +is_gt_0_and_lt_1(UNUSED struct hash_table *ht, nir_alu_instr *instr, + unsigned src, unsigned num_components, const uint8_t *swizzle) { /* only constant srcs: */ @@ -141,14 +173,16 @@ } static inline bool -is_not_const_zero(nir_alu_instr *instr, unsigned src, unsigned num_components, +is_not_const_zero(UNUSED struct hash_table *ht, nir_alu_instr *instr, + unsigned src, unsigned num_components, const uint8_t *swizzle) { if (nir_src_as_const_value(instr->src[src].src) == NULL) return true; for (unsigned i = 0; i < num_components; i++) { - switch (nir_op_infos[instr->op].input_types[src]) { + nir_alu_type type = nir_op_infos[instr->op].input_types[src]; + switch (nir_alu_type_get_base_type(type)) { case nir_type_float: if (nir_src_comp_as_float(instr->src[src].src, swizzle[i]) == 0.0) return false; @@ -168,14 +202,15 @@ } static inline bool -is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, +is_not_const(UNUSED struct hash_table *ht, nir_alu_instr *instr, unsigned src, + UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { return !nir_src_is_const(instr->src[src].src); } static inline bool -is_not_fmul(nir_alu_instr *instr, unsigned src, +is_not_fmul(struct hash_table *ht, nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { nir_alu_instr *src_alu = @@ -185,7 +220,7 @@ return true; if (src_alu->op == nir_op_fneg) - return is_not_fmul(src_alu, 0, 0, NULL); + return is_not_fmul(ht, src_alu, 0, 0, NULL); return src_alu->op != nir_op_fmul; } @@ -207,18 +242,18 @@ } static inline bool -is_not_const_and_not_fsign(nir_alu_instr *instr, unsigned src, +is_not_const_and_not_fsign(struct hash_table *ht, nir_alu_instr *instr, unsigned src, unsigned num_components, const uint8_t *swizzle) { - return is_not_const(instr, src, num_components, swizzle) && + return is_not_const(ht, instr, src, num_components, swizzle) && !is_fsign(instr, src, num_components, swizzle); } static inline bool is_used_once(nir_alu_instr *instr) { - bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses); - bool zero_use = list_empty(&instr->dest.dest.ssa.uses); + bool zero_if_use = list_is_empty(&instr->dest.dest.ssa.if_uses); + bool zero_use = list_is_empty(&instr->dest.dest.ssa.uses); if (zero_if_use && zero_use) return false; @@ -239,13 +274,13 @@ static inline bool is_used_by_if(nir_alu_instr *instr) { - return !list_empty(&instr->dest.dest.ssa.if_uses); + return !list_is_empty(&instr->dest.dest.ssa.if_uses); } static inline bool is_not_used_by_if(nir_alu_instr *instr) { - return list_empty(&instr->dest.dest.ssa.if_uses); + return list_is_empty(&instr->dest.dest.ssa.if_uses); } static inline bool @@ -273,7 +308,8 @@ * of all its components is zero. */ static inline bool -is_upper_half_zero(nir_alu_instr *instr, unsigned src, +is_upper_half_zero(UNUSED struct hash_table *ht, + nir_alu_instr *instr, unsigned src, unsigned num_components, const uint8_t *swizzle) { if (nir_src_as_const_value(instr->src[src].src) == NULL) @@ -297,7 +333,8 @@ * of all its components is zero. */ static inline bool -is_lower_half_zero(nir_alu_instr *instr, unsigned src, +is_lower_half_zero(UNUSED struct hash_table *ht, + nir_alu_instr *instr, unsigned src, unsigned num_components, const uint8_t *swizzle) { if (nir_src_as_const_value(instr->src[src].src) == NULL) @@ -326,20 +363,20 @@ } static inline bool -is_integral(nir_alu_instr *instr, unsigned src, +is_integral(struct hash_table *ht, nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { - const struct ssa_result_range r = nir_analyze_range(instr, src); + const struct ssa_result_range r = nir_analyze_range(ht, instr, src); return r.is_integral; } #define RELATION(r) \ static inline bool \ -is_ ## r (nir_alu_instr *instr, unsigned src, \ +is_ ## r (struct hash_table *ht, nir_alu_instr *instr, unsigned src, \ UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) \ { \ - const struct ssa_result_range v = nir_analyze_range(instr, src); \ + const struct ssa_result_range v = nir_analyze_range(ht, instr, src); \ return v.range == r; \ } @@ -350,26 +387,26 @@ RELATION(ne_zero) static inline bool -is_not_negative(nir_alu_instr *instr, unsigned src, +is_not_negative(struct hash_table *ht, nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { - const struct ssa_result_range v = nir_analyze_range(instr, src); + const struct ssa_result_range v = nir_analyze_range(ht, instr, src); return v.range == ge_zero || v.range == gt_zero || v.range == eq_zero; } static inline bool -is_not_positive(nir_alu_instr *instr, unsigned src, +is_not_positive(struct hash_table *ht, nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { - const struct ssa_result_range v = nir_analyze_range(instr, src); + const struct ssa_result_range v = nir_analyze_range(ht, instr, src); return v.range == le_zero || v.range == lt_zero || v.range == eq_zero; } static inline bool -is_not_zero(nir_alu_instr *instr, unsigned src, +is_not_zero(struct hash_table *ht, nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, UNUSED const uint8_t *swizzle) { - const struct ssa_result_range v = nir_analyze_range(instr, src); + const struct ssa_result_range v = nir_analyze_range(ht, instr, src); return v.range == lt_zero || v.range == gt_zero || v.range == ne_zero; } diff -Nru mesa-19.2.8/src/compiler/nir/nir_serialize.c mesa-20.0.8/src/compiler/nir/nir_serialize.c --- mesa-19.2.8/src/compiler/nir/nir_serialize.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_serialize.c 2020-06-12 01:21:16.000000000 +0000 @@ -24,6 +24,10 @@ #include "nir_serialize.h" #include "nir_control_flow.h" #include "util/u_dynarray.h" +#include "util/u_math.h" + +#define NIR_SERIALIZE_FUNC_HAS_IMPL ((void *)(intptr_t)1) +#define MAX_OBJECT_IDS (1 << 20) typedef struct { size_t blob_offset; @@ -40,12 +44,24 @@ struct hash_table *remap_table; /* the next index to assign to a NIR in-memory object */ - uintptr_t next_idx; + uint32_t next_idx; /* Array of write_phi_fixup structs representing phi sources that need to * be resolved in the second pass. */ struct util_dynarray phi_fixups; + + /* The last serialized type. */ + const struct glsl_type *last_type; + const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; + + /* For skipping equal ALU headers (typical after scalarization). */ + nir_instr_type last_instr_type; + uintptr_t last_alu_header_offset; + + /* Don't write optional data such as variable names. */ + bool strip; } write_ctx; typedef struct { @@ -54,10 +70,10 @@ struct blob_reader *blob; /* the next index to assign to a NIR in-memory object */ - uintptr_t next_idx; + uint32_t next_idx; /* The length of the index -> object table */ - uintptr_t idx_table_len; + uint32_t idx_table_len; /* map from index to deserialized pointer */ void **idx_table; @@ -65,27 +81,26 @@ /* List of phi sources. */ struct list_head phi_srcs; + /* The last deserialized type. */ + const struct glsl_type *last_type; + const struct glsl_type *last_interface_type; + struct nir_variable_data last_var_data; } read_ctx; static void write_add_object(write_ctx *ctx, const void *obj) { - uintptr_t index = ctx->next_idx++; - _mesa_hash_table_insert(ctx->remap_table, obj, (void *) index); + uint32_t index = ctx->next_idx++; + assert(index != MAX_OBJECT_IDS); + _mesa_hash_table_insert(ctx->remap_table, obj, (void *)(uintptr_t) index); } -static uintptr_t +static uint32_t write_lookup_object(write_ctx *ctx, const void *obj) { struct hash_entry *entry = _mesa_hash_table_search(ctx->remap_table, obj); assert(entry); - return (uintptr_t) entry->data; -} - -static void -write_object(write_ctx *ctx, const void *obj) -{ - blob_write_intptr(ctx->blob, write_lookup_object(ctx, obj)); + return (uint32_t)(uintptr_t) entry->data; } static void @@ -96,7 +111,7 @@ } static void * -read_lookup_object(read_ctx *ctx, uintptr_t idx) +read_lookup_object(read_ctx *ctx, uint32_t idx) { assert(idx < ctx->idx_table_len); return ctx->idx_table[idx]; @@ -105,7 +120,55 @@ static void * read_object(read_ctx *ctx) { - return read_lookup_object(ctx, blob_read_intptr(ctx->blob)); + return read_lookup_object(ctx, blob_read_uint32(ctx->blob)); +} + +static uint32_t +encode_bit_size_3bits(uint8_t bit_size) +{ + /* Encode values of 0, 1, 2, 4, 8, 16, 32, 64 in 3 bits. */ + assert(bit_size <= 64 && util_is_power_of_two_or_zero(bit_size)); + if (bit_size) + return util_logbase2(bit_size) + 1; + return 0; +} + +static uint8_t +decode_bit_size_3bits(uint8_t bit_size) +{ + if (bit_size) + return 1 << (bit_size - 1); + return 0; +} + +#define NUM_COMPONENTS_IS_SEPARATE_7 7 + +static uint8_t +encode_num_components_in_3bits(uint8_t num_components) +{ + if (num_components <= 4) + return num_components; + if (num_components == 8) + return 5; + if (num_components == 16) + return 6; + + /* special value indicating that num_components is in the next uint32 */ + return NUM_COMPONENTS_IS_SEPARATE_7; +} + +static uint8_t +decode_num_components_in_3bits(uint8_t value) +{ + if (value <= 4) + return value; + if (value == 5) + return 8; + if (value == 6) + return 16; + + unreachable("invalid num_components encoding"); + return 0; } static void @@ -131,28 +194,135 @@ return c; } +enum var_data_encoding { + var_encode_full, + var_encode_shader_temp, + var_encode_function_temp, + var_encode_location_diff, +}; + +union packed_var { + uint32_t u32; + struct { + unsigned has_name:1; + unsigned has_constant_initializer:1; + unsigned has_interface_type:1; + unsigned num_state_slots:7; + unsigned data_encoding:2; + unsigned type_same_as_last:1; + unsigned interface_type_same_as_last:1; + unsigned _pad:2; + unsigned num_members:16; + } u; +}; + +union packed_var_data_diff { + uint32_t u32; + struct { + int location:13; + int location_frac:3; + int driver_location:16; + } u; +}; + static void write_variable(write_ctx *ctx, const nir_variable *var) { write_add_object(ctx, var); - encode_type_to_blob(ctx->blob, var->type); - blob_write_uint32(ctx->blob, !!(var->name)); - if (var->name) + + assert(var->num_state_slots < (1 << 7)); + assert(var->num_members < (1 << 16)); + + STATIC_ASSERT(sizeof(union packed_var) == 4); + union packed_var flags; + flags.u32 = 0; + + flags.u.has_name = !ctx->strip && var->name; + flags.u.has_constant_initializer = !!(var->constant_initializer); + flags.u.has_interface_type = !!(var->interface_type); + flags.u.type_same_as_last = var->type == ctx->last_type; + flags.u.interface_type_same_as_last = + var->interface_type && var->interface_type == ctx->last_interface_type; + flags.u.num_state_slots = var->num_state_slots; + flags.u.num_members = var->num_members; + + struct nir_variable_data data = var->data; + + /* When stripping, we expect that the location is no longer needed, + * which is typically after shaders are linked. + */ + if (ctx->strip && + data.mode != nir_var_shader_in && + data.mode != nir_var_shader_out) + data.location = 0; + + /* Temporary variables don't serialize var->data. */ + if (data.mode == nir_var_shader_temp) + flags.u.data_encoding = var_encode_shader_temp; + else if (data.mode == nir_var_function_temp) + flags.u.data_encoding = var_encode_function_temp; + else { + struct nir_variable_data tmp = data; + + tmp.location = ctx->last_var_data.location; + tmp.location_frac = ctx->last_var_data.location_frac; + tmp.driver_location = ctx->last_var_data.driver_location; + + /* See if we can encode only the difference in locations from the last + * variable. + */ + if (memcmp(&ctx->last_var_data, &tmp, sizeof(tmp)) == 0 && + abs((int)data.location - + (int)ctx->last_var_data.location) < (1 << 12) && + abs((int)data.driver_location - + (int)ctx->last_var_data.driver_location) < (1 << 15)) + flags.u.data_encoding = var_encode_location_diff; + else + flags.u.data_encoding = var_encode_full; + } + + blob_write_uint32(ctx->blob, flags.u32); + + if (!flags.u.type_same_as_last) { + encode_type_to_blob(ctx->blob, var->type); + ctx->last_type = var->type; + } + + if (var->interface_type && !flags.u.interface_type_same_as_last) { + encode_type_to_blob(ctx->blob, var->interface_type); + ctx->last_interface_type = var->interface_type; + } + + if (flags.u.has_name) blob_write_string(ctx->blob, var->name); - blob_write_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); - blob_write_uint32(ctx->blob, var->num_state_slots); + + if (flags.u.data_encoding == var_encode_full || + flags.u.data_encoding == var_encode_location_diff) { + if (flags.u.data_encoding == var_encode_full) { + blob_write_bytes(ctx->blob, &data, sizeof(data)); + } else { + /* Serialize only the difference in locations from the last variable. + */ + union packed_var_data_diff diff; + + diff.u.location = data.location - ctx->last_var_data.location; + diff.u.location_frac = data.location_frac - + ctx->last_var_data.location_frac; + diff.u.driver_location = data.driver_location - + ctx->last_var_data.driver_location; + + blob_write_uint32(ctx->blob, diff.u32); + } + + ctx->last_var_data = data; + } + for (unsigned i = 0; i < var->num_state_slots; i++) { - for (unsigned j = 0; j < STATE_LENGTH; j++) - blob_write_uint32(ctx->blob, var->state_slots[i].tokens[j]); - blob_write_uint32(ctx->blob, var->state_slots[i].swizzle); + blob_write_bytes(ctx->blob, &var->state_slots[i], + sizeof(var->state_slots[i])); } - blob_write_uint32(ctx->blob, !!(var->constant_initializer)); if (var->constant_initializer) write_constant(ctx, var->constant_initializer); - blob_write_uint32(ctx->blob, !!(var->interface_type)); - if (var->interface_type) - encode_type_to_blob(ctx->blob, var->interface_type); - blob_write_uint32(ctx->blob, var->num_members); if (var->num_members > 0) { blob_write_bytes(ctx->blob, (uint8_t *) var->members, var->num_members * sizeof(*var->members)); @@ -165,36 +335,65 @@ nir_variable *var = rzalloc(ctx->nir, nir_variable); read_add_object(ctx, var); - var->type = decode_type_from_blob(ctx->blob); - bool has_name = blob_read_uint32(ctx->blob); - if (has_name) { + union packed_var flags; + flags.u32 = blob_read_uint32(ctx->blob); + + if (flags.u.type_same_as_last) { + var->type = ctx->last_type; + } else { + var->type = decode_type_from_blob(ctx->blob); + ctx->last_type = var->type; + } + + if (flags.u.has_interface_type) { + if (flags.u.interface_type_same_as_last) { + var->interface_type = ctx->last_interface_type; + } else { + var->interface_type = decode_type_from_blob(ctx->blob); + ctx->last_interface_type = var->interface_type; + } + } + + if (flags.u.has_name) { const char *name = blob_read_string(ctx->blob); var->name = ralloc_strdup(var, name); } else { var->name = NULL; } - blob_copy_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); - var->num_state_slots = blob_read_uint32(ctx->blob); + + if (flags.u.data_encoding == var_encode_shader_temp) + var->data.mode = nir_var_shader_temp; + else if (flags.u.data_encoding == var_encode_function_temp) + var->data.mode = nir_var_function_temp; + else if (flags.u.data_encoding == var_encode_full) { + blob_copy_bytes(ctx->blob, (uint8_t *) &var->data, sizeof(var->data)); + ctx->last_var_data = var->data; + } else { /* var_encode_location_diff */ + union packed_var_data_diff diff; + diff.u32 = blob_read_uint32(ctx->blob); + + var->data = ctx->last_var_data; + var->data.location += diff.u.location; + var->data.location_frac += diff.u.location_frac; + var->data.driver_location += diff.u.driver_location; + + ctx->last_var_data = var->data; + } + + var->num_state_slots = flags.u.num_state_slots; if (var->num_state_slots != 0) { var->state_slots = ralloc_array(var, nir_state_slot, var->num_state_slots); for (unsigned i = 0; i < var->num_state_slots; i++) { - for (unsigned j = 0; j < STATE_LENGTH; j++) - var->state_slots[i].tokens[j] = blob_read_uint32(ctx->blob); - var->state_slots[i].swizzle = blob_read_uint32(ctx->blob); + blob_copy_bytes(ctx->blob, &var->state_slots[i], + sizeof(var->state_slots[i])); } } - bool has_const_initializer = blob_read_uint32(ctx->blob); - if (has_const_initializer) + if (flags.u.has_constant_initializer) var->constant_initializer = read_constant(ctx, var); else var->constant_initializer = NULL; - bool has_interface_type = blob_read_uint32(ctx->blob); - if (has_interface_type) - var->interface_type = decode_type_from_blob(ctx->blob); - else - var->interface_type = NULL; - var->num_members = blob_read_uint32(ctx->blob); + var->num_members = flags.u.num_members; if (var->num_members > 0) { var->members = ralloc_array(var, struct nir_variable_data, var->num_members); @@ -233,8 +432,8 @@ blob_write_uint32(ctx->blob, reg->bit_size); blob_write_uint32(ctx->blob, reg->num_array_elems); blob_write_uint32(ctx->blob, reg->index); - blob_write_uint32(ctx->blob, !!(reg->name)); - if (reg->name) + blob_write_uint32(ctx->blob, !ctx->strip && reg->name); + if (!ctx->strip && reg->name) blob_write_string(ctx->blob, reg->name); } @@ -281,8 +480,32 @@ } } +union packed_src { + uint32_t u32; + struct { + unsigned is_ssa:1; /* <-- Header */ + unsigned is_indirect:1; + unsigned object_idx:20; + unsigned _footer:10; /* <-- Footer */ + } any; + struct { + unsigned _header:22; /* <-- Header */ + unsigned negate:1; /* <-- Footer */ + unsigned abs:1; + unsigned swizzle_x:2; + unsigned swizzle_y:2; + unsigned swizzle_z:2; + unsigned swizzle_w:2; + } alu; + struct { + unsigned _header:22; /* <-- Header */ + unsigned src_type:5; /* <-- Footer */ + unsigned _pad:5; + } tex; +}; + static void -write_src(write_ctx *ctx, const nir_src *src) +write_src_full(write_ctx *ctx, const nir_src *src, union packed_src header) { /* Since sources are very frequent, we try to save some space when storing * them. In particular, we store whether the source is a register and @@ -290,61 +513,237 @@ * assume that the high two bits of the index are zero, since otherwise our * address space would've been exhausted allocating the remap table! */ + header.any.is_ssa = src->is_ssa; if (src->is_ssa) { - uintptr_t idx = write_lookup_object(ctx, src->ssa) << 2; - idx |= 1; - blob_write_intptr(ctx->blob, idx); + header.any.object_idx = write_lookup_object(ctx, src->ssa); + blob_write_uint32(ctx->blob, header.u32); } else { - uintptr_t idx = write_lookup_object(ctx, src->reg.reg) << 2; - if (src->reg.indirect) - idx |= 2; - blob_write_intptr(ctx->blob, idx); + header.any.object_idx = write_lookup_object(ctx, src->reg.reg); + header.any.is_indirect = !!src->reg.indirect; + blob_write_uint32(ctx->blob, header.u32); blob_write_uint32(ctx->blob, src->reg.base_offset); if (src->reg.indirect) { - write_src(ctx, src->reg.indirect); + union packed_src header = {0}; + write_src_full(ctx, src->reg.indirect, header); } } } static void +write_src(write_ctx *ctx, const nir_src *src) +{ + union packed_src header = {0}; + write_src_full(ctx, src, header); +} + +static union packed_src read_src(read_ctx *ctx, nir_src *src, void *mem_ctx) { - uintptr_t val = blob_read_intptr(ctx->blob); - uintptr_t idx = val >> 2; - src->is_ssa = val & 0x1; + STATIC_ASSERT(sizeof(union packed_src) == 4); + union packed_src header; + header.u32 = blob_read_uint32(ctx->blob); + + src->is_ssa = header.any.is_ssa; if (src->is_ssa) { - src->ssa = read_lookup_object(ctx, idx); + src->ssa = read_lookup_object(ctx, header.any.object_idx); } else { - bool is_indirect = val & 0x2; - src->reg.reg = read_lookup_object(ctx, idx); + src->reg.reg = read_lookup_object(ctx, header.any.object_idx); src->reg.base_offset = blob_read_uint32(ctx->blob); - if (is_indirect) { + if (header.any.is_indirect) { src->reg.indirect = ralloc(mem_ctx, nir_src); read_src(ctx, src->reg.indirect, mem_ctx); } else { src->reg.indirect = NULL; } } + return header; } +union packed_dest { + uint8_t u8; + struct { + uint8_t is_ssa:1; + uint8_t has_name:1; + uint8_t num_components:3; + uint8_t bit_size:3; + } ssa; + struct { + uint8_t is_ssa:1; + uint8_t is_indirect:1; + uint8_t _pad:6; + } reg; +}; + +enum intrinsic_const_indices_encoding { + /* Use the 9 bits of packed_const_indices to store 1-9 indices. + * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or + * 4 2-bit indices, or 5-9 1-bit indices. + * + * The common case for load_ubo is 0, 0, 0, which is trivially represented. + * The common cases for load_interpolated_input also fit here, e.g.: 7, 3 + */ + const_indices_9bit_all_combined, + + const_indices_8bit, /* 8 bits per element */ + const_indices_16bit, /* 16 bits per element */ + const_indices_32bit, /* 32 bits per element */ +}; + +enum load_const_packing { + /* Constants are not packed and are stored in following dwords. */ + load_const_full, + + /* packed_value contains high 19 bits, low bits are 0, + * good for floating-point decimals + */ + load_const_scalar_hi_19bits, + + /* packed_value contains low 19 bits, high bits are sign-extended */ + load_const_scalar_lo_19bits_sext, +}; + +union packed_instr { + uint32_t u32; + struct { + unsigned instr_type:4; /* always present */ + unsigned _pad:20; + unsigned dest:8; /* always last */ + } any; + struct { + unsigned instr_type:4; + unsigned exact:1; + unsigned no_signed_wrap:1; + unsigned no_unsigned_wrap:1; + unsigned saturate:1; + /* Reg: writemask; SSA: swizzles for 2 srcs */ + unsigned writemask_or_two_swizzles:4; + unsigned op:9; + unsigned packed_src_ssa_16bit:1; + /* Scalarized ALUs always have the same header. */ + unsigned num_followup_alu_sharing_header:2; + unsigned dest:8; + } alu; + struct { + unsigned instr_type:4; + unsigned deref_type:3; + unsigned cast_type_same_as_last:1; + unsigned mode:10; /* deref_var redefines this */ + unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */ + unsigned _pad:5; /* deref_var redefines this */ + unsigned dest:8; + } deref; + struct { + unsigned instr_type:4; + unsigned deref_type:3; + unsigned _pad:1; + unsigned object_idx:16; /* if 0, the object ID is a separate uint32 */ + unsigned dest:8; + } deref_var; + struct { + unsigned instr_type:4; + unsigned intrinsic:9; + unsigned const_indices_encoding:2; + unsigned packed_const_indices:9; + unsigned dest:8; + } intrinsic; + struct { + unsigned instr_type:4; + unsigned last_component:4; + unsigned bit_size:3; + unsigned packing:2; /* enum load_const_packing */ + unsigned packed_value:19; /* meaning determined by packing */ + } load_const; + struct { + unsigned instr_type:4; + unsigned last_component:4; + unsigned bit_size:3; + unsigned _pad:21; + } undef; + struct { + unsigned instr_type:4; + unsigned num_srcs:4; + unsigned op:4; + unsigned texture_array_size:12; + unsigned dest:8; + } tex; + struct { + unsigned instr_type:4; + unsigned num_srcs:20; + unsigned dest:8; + } phi; + struct { + unsigned instr_type:4; + unsigned type:2; + unsigned _pad:26; + } jump; +}; + +/* Write "lo24" as low 24 bits in the first uint32. */ static void -write_dest(write_ctx *ctx, const nir_dest *dst) +write_dest(write_ctx *ctx, const nir_dest *dst, union packed_instr header, + nir_instr_type instr_type) { - uint32_t val = dst->is_ssa; + STATIC_ASSERT(sizeof(union packed_dest) == 1); + union packed_dest dest; + dest.u8 = 0; + + dest.ssa.is_ssa = dst->is_ssa; if (dst->is_ssa) { - val |= !!(dst->ssa.name) << 1; - val |= dst->ssa.num_components << 2; - val |= dst->ssa.bit_size << 5; + dest.ssa.has_name = !ctx->strip && dst->ssa.name; + dest.ssa.num_components = + encode_num_components_in_3bits(dst->ssa.num_components); + dest.ssa.bit_size = encode_bit_size_3bits(dst->ssa.bit_size); } else { - val |= !!(dst->reg.indirect) << 1; + dest.reg.is_indirect = !!(dst->reg.indirect); } - blob_write_uint32(ctx->blob, val); + header.any.dest = dest.u8; + + /* Check if the current ALU instruction has the same header as the previous + * instruction that is also ALU. If it is, we don't have to write + * the current header. This is a typical occurence after scalarization. + */ + if (instr_type == nir_instr_type_alu) { + bool equal_header = false; + + if (ctx->last_instr_type == nir_instr_type_alu) { + assert(ctx->last_alu_header_offset); + union packed_instr *last_header = + (union packed_instr *)(ctx->blob->data + + ctx->last_alu_header_offset); + + /* Clear the field that counts ALUs with equal headers. */ + union packed_instr clean_header; + clean_header.u32 = last_header->u32; + clean_header.alu.num_followup_alu_sharing_header = 0; + + /* There can be at most 4 consecutive ALU instructions + * sharing the same header. + */ + if (last_header->alu.num_followup_alu_sharing_header < 3 && + header.u32 == clean_header.u32) { + last_header->alu.num_followup_alu_sharing_header++; + equal_header = true; + } + } + + if (!equal_header) { + ctx->last_alu_header_offset = ctx->blob->size; + blob_write_uint32(ctx->blob, header.u32); + } + } else { + blob_write_uint32(ctx->blob, header.u32); + } + + if (dest.ssa.is_ssa && + dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + blob_write_uint32(ctx->blob, dst->ssa.num_components); + if (dst->is_ssa) { write_add_object(ctx, &dst->ssa); - if (dst->ssa.name) + if (dest.ssa.has_name) blob_write_string(ctx->blob, dst->ssa.name); } else { - blob_write_intptr(ctx->blob, write_lookup_object(ctx, dst->reg.reg)); + blob_write_uint32(ctx->blob, write_lookup_object(ctx, dst->reg.reg)); blob_write_uint32(ctx->blob, dst->reg.base_offset); if (dst->reg.indirect) write_src(ctx, dst->reg.indirect); @@ -352,73 +751,217 @@ } static void -read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr) +read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr, + union packed_instr header) { - uint32_t val = blob_read_uint32(ctx->blob); - bool is_ssa = val & 0x1; - if (is_ssa) { - bool has_name = val & 0x2; - unsigned num_components = (val >> 2) & 0x7; - unsigned bit_size = val >> 5; - char *name = has_name ? blob_read_string(ctx->blob) : NULL; + union packed_dest dest; + dest.u8 = header.any.dest; + + if (dest.ssa.is_ssa) { + unsigned bit_size = decode_bit_size_3bits(dest.ssa.bit_size); + unsigned num_components; + if (dest.ssa.num_components == NUM_COMPONENTS_IS_SEPARATE_7) + num_components = blob_read_uint32(ctx->blob); + else + num_components = decode_num_components_in_3bits(dest.ssa.num_components); + char *name = dest.ssa.has_name ? blob_read_string(ctx->blob) : NULL; nir_ssa_dest_init(instr, dst, num_components, bit_size, name); read_add_object(ctx, &dst->ssa); } else { - bool is_indirect = val & 0x2; dst->reg.reg = read_object(ctx); dst->reg.base_offset = blob_read_uint32(ctx->blob); - if (is_indirect) { + if (dest.reg.is_indirect) { dst->reg.indirect = ralloc(instr, nir_src); read_src(ctx, dst->reg.indirect, instr); } } } +static bool +are_object_ids_16bit(write_ctx *ctx) +{ + /* Check the highest object ID, because they are monotonic. */ + return ctx->next_idx < (1 << 16); +} + +static bool +is_alu_src_ssa_16bit(write_ctx *ctx, const nir_alu_instr *alu) +{ + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + + for (unsigned i = 0; i < num_srcs; i++) { + if (!alu->src[i].src.is_ssa || alu->src[i].abs || alu->src[i].negate) + return false; + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) { + /* The swizzles for src0.x and src1.x are stored + * in writemask_or_two_swizzles for SSA ALUs. + */ + if (alu->dest.dest.is_ssa && i < 2 && chan == 0 && + alu->src[i].swizzle[chan] < 4) + continue; + + if (alu->src[i].swizzle[chan] != chan) + return false; + } + } + + return are_object_ids_16bit(ctx); +} + static void write_alu(write_ctx *ctx, const nir_alu_instr *alu) { - blob_write_uint32(ctx->blob, alu->op); - uint32_t flags = alu->exact; - flags |= alu->no_signed_wrap << 1; - flags |= alu->no_unsigned_wrap << 2; - flags |= alu->dest.saturate << 3; - flags |= alu->dest.write_mask << 4; - blob_write_uint32(ctx->blob, flags); + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; + unsigned dst_components = nir_dest_num_components(alu->dest.dest); - write_dest(ctx, &alu->dest.dest); - - for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { - write_src(ctx, &alu->src[i].src); - flags = alu->src[i].negate; - flags |= alu->src[i].abs << 1; - for (unsigned j = 0; j < 4; j++) - flags |= alu->src[i].swizzle[j] << (2 + 2 * j); - blob_write_uint32(ctx->blob, flags); + /* 9 bits for nir_op */ + STATIC_ASSERT(nir_num_opcodes <= 512); + union packed_instr header; + header.u32 = 0; + + header.alu.instr_type = alu->instr.type; + header.alu.exact = alu->exact; + header.alu.no_signed_wrap = alu->no_signed_wrap; + header.alu.no_unsigned_wrap = alu->no_unsigned_wrap; + header.alu.saturate = alu->dest.saturate; + header.alu.op = alu->op; + header.alu.packed_src_ssa_16bit = is_alu_src_ssa_16bit(ctx, alu); + + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + /* For packed srcs of SSA ALUs, this field stores the swizzles. */ + header.alu.writemask_or_two_swizzles = alu->src[0].swizzle[0]; + if (num_srcs > 1) + header.alu.writemask_or_two_swizzles |= alu->src[1].swizzle[0] << 2; + } else if (!alu->dest.dest.is_ssa && dst_components <= 4) { + /* For vec4 registers, this field is a writemask. */ + header.alu.writemask_or_two_swizzles = alu->dest.write_mask; + } + + write_dest(ctx, &alu->dest.dest, header, alu->instr.type); + + if (!alu->dest.dest.is_ssa && dst_components > 4) + blob_write_uint32(ctx->blob, alu->dest.write_mask); + + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + assert(alu->src[i].src.is_ssa); + unsigned idx = write_lookup_object(ctx, alu->src[i].src.ssa); + assert(idx < (1 << 16)); + blob_write_uint16(ctx->blob, idx); + } + } else { + for (unsigned i = 0; i < num_srcs; i++) { + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + union packed_src src; + bool packed = src_components <= 4 && src_channels <= 4; + src.u32 = 0; + + src.alu.negate = alu->src[i].negate; + src.alu.abs = alu->src[i].abs; + + if (packed) { + src.alu.swizzle_x = alu->src[i].swizzle[0]; + src.alu.swizzle_y = alu->src[i].swizzle[1]; + src.alu.swizzle_z = alu->src[i].swizzle[2]; + src.alu.swizzle_w = alu->src[i].swizzle[3]; + } + + write_src_full(ctx, &alu->src[i].src, src); + + /* Store swizzles for vec8 and vec16. */ + if (!packed) { + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = 0; + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + value |= (uint32_t)alu->src[i].swizzle[o + j] << + (4 * j); /* 4 bits per swizzle */ + } + + blob_write_uint32(ctx->blob, value); + } + } + } } } static nir_alu_instr * -read_alu(read_ctx *ctx) +read_alu(read_ctx *ctx, union packed_instr header) { - nir_op op = blob_read_uint32(ctx->blob); - nir_alu_instr *alu = nir_alu_instr_create(ctx->nir, op); + unsigned num_srcs = nir_op_infos[header.alu.op].num_inputs; + nir_alu_instr *alu = nir_alu_instr_create(ctx->nir, header.alu.op); - uint32_t flags = blob_read_uint32(ctx->blob); - alu->exact = flags & 1; - alu->no_signed_wrap = flags & 2; - alu->no_unsigned_wrap = flags & 4; - alu->dest.saturate = flags & 8; - alu->dest.write_mask = flags >> 4; - - read_dest(ctx, &alu->dest.dest, &alu->instr); - - for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) { - read_src(ctx, &alu->src[i].src, &alu->instr); - flags = blob_read_uint32(ctx->blob); - alu->src[i].negate = flags & 1; - alu->src[i].abs = flags & 2; - for (unsigned j = 0; j < 4; j++) - alu->src[i].swizzle[j] = (flags >> (2 * j + 2)) & 3; + alu->exact = header.alu.exact; + alu->no_signed_wrap = header.alu.no_signed_wrap; + alu->no_unsigned_wrap = header.alu.no_unsigned_wrap; + alu->dest.saturate = header.alu.saturate; + + read_dest(ctx, &alu->dest.dest, &alu->instr, header); + + unsigned dst_components = nir_dest_num_components(alu->dest.dest); + + if (alu->dest.dest.is_ssa) { + alu->dest.write_mask = u_bit_consecutive(0, dst_components); + } else if (dst_components <= 4) { + alu->dest.write_mask = header.alu.writemask_or_two_swizzles; + } else { + alu->dest.write_mask = blob_read_uint32(ctx->blob); + } + + if (header.alu.packed_src_ssa_16bit) { + for (unsigned i = 0; i < num_srcs; i++) { + nir_alu_src *src = &alu->src[i]; + src->src.is_ssa = true; + src->src.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + + memset(&src->swizzle, 0, sizeof(src->swizzle)); + + unsigned src_components = nir_ssa_alu_instr_src_components(alu, i); + + for (unsigned chan = 0; chan < src_components; chan++) + src->swizzle[chan] = chan; + } + } else { + for (unsigned i = 0; i < num_srcs; i++) { + union packed_src src = read_src(ctx, &alu->src[i].src, &alu->instr); + unsigned src_channels = nir_ssa_alu_instr_src_components(alu, i); + unsigned src_components = nir_src_num_components(alu->src[i].src); + bool packed = src_components <= 4 && src_channels <= 4; + + alu->src[i].negate = src.alu.negate; + alu->src[i].abs = src.alu.abs; + + memset(&alu->src[i].swizzle, 0, sizeof(alu->src[i].swizzle)); + + if (packed) { + alu->src[i].swizzle[0] = src.alu.swizzle_x; + alu->src[i].swizzle[1] = src.alu.swizzle_y; + alu->src[i].swizzle[2] = src.alu.swizzle_z; + alu->src[i].swizzle[3] = src.alu.swizzle_w; + } else { + /* Load swizzles for vec8 and vec16. */ + for (unsigned o = 0; o < src_channels; o += 8) { + unsigned value = blob_read_uint32(ctx->blob); + + for (unsigned j = 0; j < 8 && o + j < src_channels; j++) { + alu->src[i].swizzle[o + j] = + (value >> (4 * j)) & 0xf; /* 4 bits per swizzle */ + } + } + } + } + } + + if (header.alu.packed_src_ssa_16bit && + alu->dest.dest.is_ssa) { + alu->src[0].swizzle[0] = header.alu.writemask_or_two_swizzles & 0x3; + if (num_srcs > 1) + alu->src[1].swizzle[0] = header.alu.writemask_or_two_swizzles >> 2; } return alu; @@ -427,36 +970,71 @@ static void write_deref(write_ctx *ctx, const nir_deref_instr *deref) { - blob_write_uint32(ctx->blob, deref->deref_type); + assert(deref->deref_type < 8); + assert(deref->mode < (1 << 10)); - blob_write_uint32(ctx->blob, deref->mode); - encode_type_to_blob(ctx->blob, deref->type); + union packed_instr header; + header.u32 = 0; - write_dest(ctx, &deref->dest); + header.deref.instr_type = deref->instr.type; + header.deref.deref_type = deref->deref_type; + + if (deref->deref_type == nir_deref_type_cast) { + header.deref.mode = deref->mode; + header.deref.cast_type_same_as_last = deref->type == ctx->last_type; + } + unsigned var_idx = 0; if (deref->deref_type == nir_deref_type_var) { - write_object(ctx, deref->var); - return; + var_idx = write_lookup_object(ctx, deref->var); + if (var_idx && var_idx < (1 << 16)) + header.deref_var.object_idx = var_idx; } - write_src(ctx, &deref->parent); + if (deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_ptr_as_array) { + header.deref.packed_src_ssa_16bit = + deref->parent.is_ssa && deref->arr.index.is_ssa && + are_object_ids_16bit(ctx); + } + + write_dest(ctx, &deref->dest, header, deref->instr.type); switch (deref->deref_type) { + case nir_deref_type_var: + if (!header.deref_var.object_idx) + blob_write_uint32(ctx->blob, var_idx); + break; + case nir_deref_type_struct: + write_src(ctx, &deref->parent); blob_write_uint32(ctx->blob, deref->strct.index); break; case nir_deref_type_array: case nir_deref_type_ptr_as_array: - write_src(ctx, &deref->arr.index); + if (header.deref.packed_src_ssa_16bit) { + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->parent.ssa)); + blob_write_uint16(ctx->blob, + write_lookup_object(ctx, deref->arr.index.ssa)); + } else { + write_src(ctx, &deref->parent); + write_src(ctx, &deref->arr.index); + } break; case nir_deref_type_cast: + write_src(ctx, &deref->parent); blob_write_uint32(ctx->blob, deref->cast.ptr_stride); + if (!header.deref.cast_type_same_as_last) { + encode_type_to_blob(ctx->blob, deref->type); + ctx->last_type = deref->type; + } break; case nir_deref_type_array_wildcard: - /* Nothing to do */ + write_src(ctx, &deref->parent); break; default: @@ -465,88 +1043,206 @@ } static nir_deref_instr * -read_deref(read_ctx *ctx) +read_deref(read_ctx *ctx, union packed_instr header) { - nir_deref_type deref_type = blob_read_uint32(ctx->blob); + nir_deref_type deref_type = header.deref.deref_type; nir_deref_instr *deref = nir_deref_instr_create(ctx->nir, deref_type); - deref->mode = blob_read_uint32(ctx->blob); - deref->type = decode_type_from_blob(ctx->blob); + read_dest(ctx, &deref->dest, &deref->instr, header); - read_dest(ctx, &deref->dest, &deref->instr); + nir_deref_instr *parent; - if (deref_type == nir_deref_type_var) { - deref->var = read_object(ctx); - return deref; - } + switch (deref->deref_type) { + case nir_deref_type_var: + if (header.deref_var.object_idx) + deref->var = read_lookup_object(ctx, header.deref_var.object_idx); + else + deref->var = read_object(ctx); - read_src(ctx, &deref->parent, &deref->instr); + deref->type = deref->var->type; + break; - switch (deref->deref_type) { case nir_deref_type_struct: + read_src(ctx, &deref->parent, &deref->instr); + parent = nir_src_as_deref(deref->parent); deref->strct.index = blob_read_uint32(ctx->blob); + deref->type = glsl_get_struct_field(parent->type, deref->strct.index); break; case nir_deref_type_array: case nir_deref_type_ptr_as_array: - read_src(ctx, &deref->arr.index, &deref->instr); + if (header.deref.packed_src_ssa_16bit) { + deref->parent.is_ssa = true; + deref->parent.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + deref->arr.index.is_ssa = true; + deref->arr.index.ssa = read_lookup_object(ctx, blob_read_uint16(ctx->blob)); + } else { + read_src(ctx, &deref->parent, &deref->instr); + read_src(ctx, &deref->arr.index, &deref->instr); + } + + parent = nir_src_as_deref(deref->parent); + if (deref->deref_type == nir_deref_type_array) + deref->type = glsl_get_array_element(parent->type); + else + deref->type = parent->type; break; case nir_deref_type_cast: + read_src(ctx, &deref->parent, &deref->instr); deref->cast.ptr_stride = blob_read_uint32(ctx->blob); + if (header.deref.cast_type_same_as_last) { + deref->type = ctx->last_type; + } else { + deref->type = decode_type_from_blob(ctx->blob); + ctx->last_type = deref->type; + } break; case nir_deref_type_array_wildcard: - /* Nothing to do */ + read_src(ctx, &deref->parent, &deref->instr); + parent = nir_src_as_deref(deref->parent); + deref->type = glsl_get_array_element(parent->type); break; default: unreachable("Invalid deref type"); } + if (deref_type == nir_deref_type_var) { + deref->mode = deref->var->data.mode; + } else if (deref->deref_type == nir_deref_type_cast) { + deref->mode = header.deref.mode; + } else { + assert(deref->parent.is_ssa); + deref->mode = nir_instr_as_deref(deref->parent.ssa->parent_instr)->mode; + } + return deref; } static void write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin) { - blob_write_uint32(ctx->blob, intrin->intrinsic); - + /* 9 bits for nir_intrinsic_op */ + STATIC_ASSERT(nir_num_intrinsics <= 512); unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs; unsigned num_indices = nir_intrinsic_infos[intrin->intrinsic].num_indices; + assert(intrin->intrinsic < 512); - blob_write_uint32(ctx->blob, intrin->num_components); + union packed_instr header; + header.u32 = 0; + + header.intrinsic.instr_type = intrin->instr.type; + header.intrinsic.intrinsic = intrin->intrinsic; + + /* Analyze constant indices to decide how to encode them. */ + if (num_indices) { + unsigned max_bits = 0; + for (unsigned i = 0; i < num_indices; i++) { + unsigned max = util_last_bit(intrin->const_index[i]); + max_bits = MAX2(max_bits, max); + } + + if (max_bits * num_indices <= 9) { + header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined; + + /* Pack all const indices into 6 bits. */ + unsigned bit_size = 9 / num_indices; + for (unsigned i = 0; i < num_indices; i++) { + header.intrinsic.packed_const_indices |= + intrin->const_index[i] << (i * bit_size); + } + } else if (max_bits <= 8) + header.intrinsic.const_indices_encoding = const_indices_8bit; + else if (max_bits <= 16) + header.intrinsic.const_indices_encoding = const_indices_16bit; + else + header.intrinsic.const_indices_encoding = const_indices_32bit; + } if (nir_intrinsic_infos[intrin->intrinsic].has_dest) - write_dest(ctx, &intrin->dest); + write_dest(ctx, &intrin->dest, header, intrin->instr.type); + else + blob_write_uint32(ctx->blob, header.u32); for (unsigned i = 0; i < num_srcs; i++) write_src(ctx, &intrin->src[i]); - for (unsigned i = 0; i < num_indices; i++) - blob_write_uint32(ctx->blob, intrin->const_index[i]); + if (num_indices) { + switch (header.intrinsic.const_indices_encoding) { + case const_indices_8bit: + for (unsigned i = 0; i < num_indices; i++) + blob_write_uint8(ctx->blob, intrin->const_index[i]); + break; + case const_indices_16bit: + for (unsigned i = 0; i < num_indices; i++) + blob_write_uint16(ctx->blob, intrin->const_index[i]); + break; + case const_indices_32bit: + for (unsigned i = 0; i < num_indices; i++) + blob_write_uint32(ctx->blob, intrin->const_index[i]); + break; + } + } } static nir_intrinsic_instr * -read_intrinsic(read_ctx *ctx) +read_intrinsic(read_ctx *ctx, union packed_instr header) { - nir_intrinsic_op op = blob_read_uint32(ctx->blob); - + nir_intrinsic_op op = header.intrinsic.intrinsic; nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(ctx->nir, op); unsigned num_srcs = nir_intrinsic_infos[op].num_srcs; unsigned num_indices = nir_intrinsic_infos[op].num_indices; - intrin->num_components = blob_read_uint32(ctx->blob); - if (nir_intrinsic_infos[op].has_dest) - read_dest(ctx, &intrin->dest, &intrin->instr); + read_dest(ctx, &intrin->dest, &intrin->instr, header); for (unsigned i = 0; i < num_srcs; i++) read_src(ctx, &intrin->src[i], &intrin->instr); - for (unsigned i = 0; i < num_indices; i++) - intrin->const_index[i] = blob_read_uint32(ctx->blob); + /* Vectorized instrinsics have num_components same as dst or src that has + * 0 components in the info. Find it. + */ + if (nir_intrinsic_infos[op].has_dest && + nir_intrinsic_infos[op].dest_components == 0) { + intrin->num_components = nir_dest_num_components(intrin->dest); + } else { + for (unsigned i = 0; i < num_srcs; i++) { + if (nir_intrinsic_infos[op].src_components[i] == 0) { + intrin->num_components = nir_src_num_components(intrin->src[i]); + break; + } + } + } + + if (num_indices) { + switch (header.intrinsic.const_indices_encoding) { + case const_indices_9bit_all_combined: { + unsigned bit_size = 9 / num_indices; + unsigned bit_mask = u_bit_consecutive(0, bit_size); + for (unsigned i = 0; i < num_indices; i++) { + intrin->const_index[i] = + (header.intrinsic.packed_const_indices >> (i * bit_size)) & + bit_mask; + } + break; + } + case const_indices_8bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint8(ctx->blob); + break; + case const_indices_16bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint16(ctx->blob); + break; + case const_indices_32bit: + for (unsigned i = 0; i < num_indices; i++) + intrin->const_index[i] = blob_read_uint32(ctx->blob); + break; + } + } return intrin; } @@ -554,22 +1250,155 @@ static void write_load_const(write_ctx *ctx, const nir_load_const_instr *lc) { - uint32_t val = lc->def.num_components; - val |= lc->def.bit_size << 3; - blob_write_uint32(ctx->blob, val); - blob_write_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + assert(lc->def.num_components >= 1 && lc->def.num_components <= 16); + union packed_instr header; + header.u32 = 0; + + header.load_const.instr_type = lc->instr.type; + header.load_const.last_component = lc->def.num_components - 1; + header.load_const.bit_size = encode_bit_size_3bits(lc->def.bit_size); + header.load_const.packing = load_const_full; + + /* Try to pack 1-component constants into the 19 free bits in the header. */ + if (lc->def.num_components == 1) { + switch (lc->def.bit_size) { + case 64: + if ((lc->value[0].u64 & 0x1fffffffffffull) == 0) { + /* packed_value contains high 19 bits, low bits are 0 */ + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u64 >> 45; + } else if (((lc->value[0].i64 << 45) >> 45) == lc->value[0].i64) { + /* packed_value contains low 19 bits, high bits are sign-extended */ + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u64; + } + break; + + case 32: + if ((lc->value[0].u32 & 0x1fff) == 0) { + header.load_const.packing = load_const_scalar_hi_19bits; + header.load_const.packed_value = lc->value[0].u32 >> 13; + } else if (((lc->value[0].i32 << 13) >> 13) == lc->value[0].i32) { + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u32; + } + break; + + case 16: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u16; + break; + case 8: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].u8; + break; + case 1: + header.load_const.packing = load_const_scalar_lo_19bits_sext; + header.load_const.packed_value = lc->value[0].b; + break; + default: + unreachable("invalid bit_size"); + } + } + + blob_write_uint32(ctx->blob, header.u32); + + if (header.load_const.packing == load_const_full) { + switch (lc->def.bit_size) { + case 64: + blob_write_bytes(ctx->blob, lc->value, + sizeof(*lc->value) * lc->def.num_components); + break; + + case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint32(ctx->blob, lc->value[i].u32); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint16(ctx->blob, lc->value[i].u16); + break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + blob_write_uint8(ctx->blob, lc->value[i].u8); + break; + } + } + write_add_object(ctx, &lc->def); } static nir_load_const_instr * -read_load_const(read_ctx *ctx) +read_load_const(read_ctx *ctx, union packed_instr header) { - uint32_t val = blob_read_uint32(ctx->blob); - nir_load_const_instr *lc = - nir_load_const_instr_create(ctx->nir, val & 0x7, val >> 3); + nir_load_const_instr_create(ctx->nir, header.load_const.last_component + 1, + decode_bit_size_3bits(header.load_const.bit_size)); + + switch (header.load_const.packing) { + case load_const_scalar_hi_19bits: + switch (lc->def.bit_size) { + case 64: + lc->value[0].u64 = (uint64_t)header.load_const.packed_value << 45; + break; + case 32: + lc->value[0].u32 = (uint64_t)header.load_const.packed_value << 13; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_scalar_lo_19bits_sext: + switch (lc->def.bit_size) { + case 64: + lc->value[0].i64 = ((int64_t)header.load_const.packed_value << 45) >> 45; + break; + case 32: + lc->value[0].i32 = ((int32_t)header.load_const.packed_value << 13) >> 13; + break; + case 16: + lc->value[0].u16 = header.load_const.packed_value; + break; + case 8: + lc->value[0].u8 = header.load_const.packed_value; + break; + case 1: + lc->value[0].b = header.load_const.packed_value; + break; + default: + unreachable("invalid bit_size"); + } + break; + + case load_const_full: + switch (lc->def.bit_size) { + case 64: + blob_copy_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); + break; + + case 32: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u32 = blob_read_uint32(ctx->blob); + break; + + case 16: + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u16 = blob_read_uint16(ctx->blob); + break; + + default: + assert(lc->def.bit_size <= 8); + for (unsigned i = 0; i < lc->def.num_components; i++) + lc->value[i].u8 = blob_read_uint8(ctx->blob); + break; + } + break; + } - blob_copy_bytes(ctx->blob, lc->value, sizeof(*lc->value) * lc->def.num_components); read_add_object(ctx, &lc->def); return lc; } @@ -577,19 +1406,25 @@ static void write_ssa_undef(write_ctx *ctx, const nir_ssa_undef_instr *undef) { - uint32_t val = undef->def.num_components; - val |= undef->def.bit_size << 3; - blob_write_uint32(ctx->blob, val); + assert(undef->def.num_components >= 1 && undef->def.num_components <= 16); + + union packed_instr header; + header.u32 = 0; + + header.undef.instr_type = undef->instr.type; + header.undef.last_component = undef->def.num_components - 1; + header.undef.bit_size = encode_bit_size_3bits(undef->def.bit_size); + + blob_write_uint32(ctx->blob, header.u32); write_add_object(ctx, &undef->def); } static nir_ssa_undef_instr * -read_ssa_undef(read_ctx *ctx) +read_ssa_undef(read_ctx *ctx, union packed_instr header) { - uint32_t val = blob_read_uint32(ctx->blob); - nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(ctx->nir, val & 0x7, val >> 3); + nir_ssa_undef_instr_create(ctx->nir, header.undef.last_component + 1, + decode_bit_size_3bits(header.undef.bit_size)); read_add_object(ctx, &undef->def); return undef; @@ -605,19 +1440,33 @@ unsigned is_shadow:1; unsigned is_new_style_shadow:1; unsigned component:2; - unsigned unused:10; /* Mark unused for valgrind. */ + unsigned texture_non_uniform:1; + unsigned sampler_non_uniform:1; + unsigned unused:8; /* Mark unused for valgrind. */ } u; }; static void write_tex(write_ctx *ctx, const nir_tex_instr *tex) { - blob_write_uint32(ctx->blob, tex->num_srcs); - blob_write_uint32(ctx->blob, tex->op); + assert(tex->num_srcs < 16); + assert(tex->op < 16); + assert(tex->texture_array_size < 1024); + + union packed_instr header; + header.u32 = 0; + + header.tex.instr_type = tex->instr.type; + header.tex.num_srcs = tex->num_srcs; + header.tex.op = tex->op; + header.tex.texture_array_size = tex->texture_array_size; + + write_dest(ctx, &tex->dest, header, tex->instr.type); + blob_write_uint32(ctx->blob, tex->texture_index); - blob_write_uint32(ctx->blob, tex->texture_array_size); blob_write_uint32(ctx->blob, tex->sampler_index); - blob_write_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); + if (tex->op == nir_texop_tg4) + blob_write_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); STATIC_ASSERT(sizeof(union packed_tex_data) == sizeof(uint32_t)); union packed_tex_data packed = { @@ -628,27 +1477,32 @@ .u.is_shadow = tex->is_shadow, .u.is_new_style_shadow = tex->is_new_style_shadow, .u.component = tex->component, + .u.texture_non_uniform = tex->texture_non_uniform, + .u.sampler_non_uniform = tex->sampler_non_uniform, }; blob_write_uint32(ctx->blob, packed.u32); - write_dest(ctx, &tex->dest); for (unsigned i = 0; i < tex->num_srcs; i++) { - blob_write_uint32(ctx->blob, tex->src[i].src_type); - write_src(ctx, &tex->src[i].src); + union packed_src src; + src.u32 = 0; + src.tex.src_type = tex->src[i].src_type; + write_src_full(ctx, &tex->src[i].src, src); } } static nir_tex_instr * -read_tex(read_ctx *ctx) +read_tex(read_ctx *ctx, union packed_instr header) { - unsigned num_srcs = blob_read_uint32(ctx->blob); - nir_tex_instr *tex = nir_tex_instr_create(ctx->nir, num_srcs); + nir_tex_instr *tex = nir_tex_instr_create(ctx->nir, header.tex.num_srcs); + + read_dest(ctx, &tex->dest, &tex->instr, header); - tex->op = blob_read_uint32(ctx->blob); + tex->op = header.tex.op; tex->texture_index = blob_read_uint32(ctx->blob); - tex->texture_array_size = blob_read_uint32(ctx->blob); + tex->texture_array_size = header.tex.texture_array_size; tex->sampler_index = blob_read_uint32(ctx->blob); - blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); + if (tex->op == nir_texop_tg4) + blob_copy_bytes(ctx->blob, tex->tg4_offsets, sizeof(tex->tg4_offsets)); union packed_tex_data packed; packed.u32 = blob_read_uint32(ctx->blob); @@ -659,11 +1513,12 @@ tex->is_shadow = packed.u.is_shadow; tex->is_new_style_shadow = packed.u.is_new_style_shadow; tex->component = packed.u.component; + tex->texture_non_uniform = packed.u.texture_non_uniform; + tex->sampler_non_uniform = packed.u.sampler_non_uniform; - read_dest(ctx, &tex->dest, &tex->instr); for (unsigned i = 0; i < tex->num_srcs; i++) { - tex->src[i].src_type = blob_read_uint32(ctx->blob); - read_src(ctx, &tex->src[i].src, &tex->instr); + union packed_src src = read_src(ctx, &tex->src[i].src, &tex->instr); + tex->src[i].src_type = src.tex.src_type; } return tex; @@ -672,20 +1527,24 @@ static void write_phi(write_ctx *ctx, const nir_phi_instr *phi) { + union packed_instr header; + header.u32 = 0; + + header.phi.instr_type = phi->instr.type; + header.phi.num_srcs = exec_list_length(&phi->srcs); + /* Phi nodes are special, since they may reference SSA definitions and - * basic blocks that don't exist yet. We leave two empty uintptr_t's here, + * basic blocks that don't exist yet. We leave two empty uint32_t's here, * and then store enough information so that a later fixup pass can fill * them in correctly. */ - write_dest(ctx, &phi->dest); - - blob_write_uint32(ctx->blob, exec_list_length(&phi->srcs)); + write_dest(ctx, &phi->dest, header, phi->instr.type); nir_foreach_phi_src(src, phi) { assert(src->src.is_ssa); - size_t blob_offset = blob_reserve_intptr(ctx->blob); - ASSERTED size_t blob_offset2 = blob_reserve_intptr(ctx->blob); - assert(blob_offset + sizeof(uintptr_t) == blob_offset2); + size_t blob_offset = blob_reserve_uint32(ctx->blob); + ASSERTED size_t blob_offset2 = blob_reserve_uint32(ctx->blob); + assert(blob_offset + sizeof(uint32_t) == blob_offset2); write_phi_fixup fixup = { .blob_offset = blob_offset, .src = src->src.ssa, @@ -699,7 +1558,7 @@ write_fixup_phis(write_ctx *ctx) { util_dynarray_foreach(&ctx->phi_fixups, write_phi_fixup, fixup) { - uintptr_t *blob_ptr = (uintptr_t *)(ctx->blob->data + fixup->blob_offset); + uint32_t *blob_ptr = (uint32_t *)(ctx->blob->data + fixup->blob_offset); blob_ptr[0] = write_lookup_object(ctx, fixup->src); blob_ptr[1] = write_lookup_object(ctx, fixup->block); } @@ -708,13 +1567,11 @@ } static nir_phi_instr * -read_phi(read_ctx *ctx, nir_block *blk) +read_phi(read_ctx *ctx, nir_block *blk, union packed_instr header) { nir_phi_instr *phi = nir_phi_instr_create(ctx->nir); - read_dest(ctx, &phi->dest, &phi->instr); - - unsigned num_srcs = blob_read_uint32(ctx->blob); + read_dest(ctx, &phi->dest, &phi->instr, header); /* For similar reasons as before, we just store the index directly into the * pointer, and let a later pass resolve the phi sources. @@ -726,12 +1583,12 @@ */ nir_instr_insert_after_block(blk, &phi->instr); - for (unsigned i = 0; i < num_srcs; i++) { + for (unsigned i = 0; i < header.phi.num_srcs; i++) { nir_phi_src *src = ralloc(phi, nir_phi_src); src->src.is_ssa = true; - src->src.ssa = (nir_ssa_def *) blob_read_intptr(ctx->blob); - src->pred = (nir_block *) blob_read_intptr(ctx->blob); + src->src.ssa = (nir_ssa_def *)(uintptr_t) blob_read_uint32(ctx->blob); + src->pred = (nir_block *)(uintptr_t) blob_read_uint32(ctx->blob); /* Since we're not letting nir_insert_instr handle use/def stuff for us, * we have to set the parent_instr manually. It doesn't really matter @@ -762,27 +1619,34 @@ list_addtail(&src->src.use_link, &src->src.ssa->uses); } - assert(list_empty(&ctx->phi_srcs)); + assert(list_is_empty(&ctx->phi_srcs)); } static void write_jump(write_ctx *ctx, const nir_jump_instr *jmp) { - blob_write_uint32(ctx->blob, jmp->type); + assert(jmp->type < 4); + + union packed_instr header; + header.u32 = 0; + + header.jump.instr_type = jmp->instr.type; + header.jump.type = jmp->type; + + blob_write_uint32(ctx->blob, header.u32); } static nir_jump_instr * -read_jump(read_ctx *ctx) +read_jump(read_ctx *ctx, union packed_instr header) { - nir_jump_type type = blob_read_uint32(ctx->blob); - nir_jump_instr *jmp = nir_jump_instr_create(ctx->nir, type); + nir_jump_instr *jmp = nir_jump_instr_create(ctx->nir, header.jump.type); return jmp; } static void write_call(write_ctx *ctx, const nir_call_instr *call) { - blob_write_intptr(ctx->blob, write_lookup_object(ctx, call->callee)); + blob_write_uint32(ctx->blob, write_lookup_object(ctx, call->callee)); for (unsigned i = 0; i < call->num_params; i++) write_src(ctx, &call->params[i]); @@ -803,7 +1667,9 @@ static void write_instr(write_ctx *ctx, const nir_instr *instr) { - blob_write_uint32(ctx->blob, instr->type); + /* We have only 4 bits for the instruction type. */ + assert(instr->type < 16); + switch (instr->type) { case nir_instr_type_alu: write_alu(ctx, nir_instr_as_alu(instr)); @@ -830,6 +1696,7 @@ write_jump(ctx, nir_instr_as_jump(instr)); break; case nir_instr_type_call: + blob_write_uint32(ctx->blob, instr->type); write_call(ctx, nir_instr_as_call(instr)); break; case nir_instr_type_parallel_copy: @@ -839,29 +1706,34 @@ } } -static void +/* Return the number of instructions read. */ +static unsigned read_instr(read_ctx *ctx, nir_block *block) { - nir_instr_type type = blob_read_uint32(ctx->blob); + STATIC_ASSERT(sizeof(union packed_instr) == 4); + union packed_instr header; + header.u32 = blob_read_uint32(ctx->blob); nir_instr *instr; - switch (type) { + + switch (header.any.instr_type) { case nir_instr_type_alu: - instr = &read_alu(ctx)->instr; - break; + for (unsigned i = 0; i <= header.alu.num_followup_alu_sharing_header; i++) + nir_instr_insert_after_block(block, &read_alu(ctx, header)->instr); + return header.alu.num_followup_alu_sharing_header + 1; case nir_instr_type_deref: - instr = &read_deref(ctx)->instr; + instr = &read_deref(ctx, header)->instr; break; case nir_instr_type_intrinsic: - instr = &read_intrinsic(ctx)->instr; + instr = &read_intrinsic(ctx, header)->instr; break; case nir_instr_type_load_const: - instr = &read_load_const(ctx)->instr; + instr = &read_load_const(ctx, header)->instr; break; case nir_instr_type_ssa_undef: - instr = &read_ssa_undef(ctx)->instr; + instr = &read_ssa_undef(ctx, header)->instr; break; case nir_instr_type_tex: - instr = &read_tex(ctx)->instr; + instr = &read_tex(ctx, header)->instr; break; case nir_instr_type_phi: /* Phi instructions are a bit of a special case when reading because we @@ -869,10 +1741,10 @@ * for us. Instead, we need to wait until all the blocks/instructions * are read so that we can set their sources up. */ - read_phi(ctx, block); - return; + read_phi(ctx, block, header); + return 1; case nir_instr_type_jump: - instr = &read_jump(ctx)->instr; + instr = &read_jump(ctx, header)->instr; break; case nir_instr_type_call: instr = &read_call(ctx)->instr; @@ -884,6 +1756,7 @@ } nir_instr_insert_after_block(block, instr); + return 1; } static void @@ -891,8 +1764,14 @@ { write_add_object(ctx, block); blob_write_uint32(ctx->blob, exec_list_length(&block->instr_list)); - nir_foreach_instr(instr, block) + + ctx->last_instr_type = ~0; + ctx->last_alu_header_offset = 0; + + nir_foreach_instr(instr, block) { write_instr(ctx, instr); + ctx->last_instr_type = instr->type; + } } static void @@ -907,8 +1786,8 @@ read_add_object(ctx, block); unsigned num_instrs = blob_read_uint32(ctx->blob); - for (unsigned i = 0; i < num_instrs; i++) { - read_instr(ctx, block); + for (unsigned i = 0; i < num_instrs;) { + i += read_instr(ctx, block); } } @@ -1045,7 +1924,12 @@ static void write_function(write_ctx *ctx, const nir_function *fxn) { - blob_write_uint32(ctx->blob, !!(fxn->name)); + uint32_t flags = fxn->is_entrypoint; + if (fxn->name) + flags |= 0x2; + if (fxn->impl) + flags |= 0x4; + blob_write_uint32(ctx->blob, flags); if (fxn->name) blob_write_string(ctx->blob, fxn->name); @@ -1059,8 +1943,6 @@ blob_write_uint32(ctx->blob, val); } - blob_write_uint32(ctx->blob, fxn->is_entrypoint); - /* At first glance, it looks like we should write the function_impl here. * However, call instructions need to be able to reference at least the * function and those will get processed as we write the function_impls. @@ -1071,7 +1953,8 @@ static void read_function(read_ctx *ctx) { - bool has_name = blob_read_uint32(ctx->blob); + uint32_t flags = blob_read_uint32(ctx->blob); + bool has_name = flags & 0x2; char *name = has_name ? blob_read_string(ctx->blob) : NULL; nir_function *fxn = nir_function_create(ctx->nir, name); @@ -1086,31 +1969,40 @@ fxn->params[i].bit_size = (val >> 8) & 0xff; } - fxn->is_entrypoint = blob_read_uint32(ctx->blob); + fxn->is_entrypoint = flags & 0x1; + if (flags & 0x4) + fxn->impl = NIR_SERIALIZE_FUNC_HAS_IMPL; } +/** + * Serialize NIR into a binary blob. + * + * \param strip Don't serialize information only useful for debugging, + * such as variable names, making cache hits from similar + * shaders more likely. + */ void -nir_serialize(struct blob *blob, const nir_shader *nir) +nir_serialize(struct blob *blob, const nir_shader *nir, bool strip) { - write_ctx ctx; + write_ctx ctx = {0}; ctx.remap_table = _mesa_pointer_hash_table_create(NULL); - ctx.next_idx = 0; ctx.blob = blob; ctx.nir = nir; + ctx.strip = strip; util_dynarray_init(&ctx.phi_fixups, NULL); - size_t idx_size_offset = blob_reserve_intptr(blob); + size_t idx_size_offset = blob_reserve_uint32(blob); struct shader_info info = nir->info; uint32_t strings = 0; - if (info.name) + if (!strip && info.name) strings |= 0x1; - if (info.label) + if (!strip && info.label) strings |= 0x2; blob_write_uint32(blob, strings); - if (info.name) + if (!strip && info.name) blob_write_string(blob, info.name); - if (info.label) + if (!strip && info.label) blob_write_string(blob, info.label); info.name = info.label = NULL; blob_write_bytes(blob, (uint8_t *) &info, sizeof(info)); @@ -1134,14 +2026,15 @@ } nir_foreach_function(fxn, nir) { - write_function_impl(&ctx, fxn->impl); + if (fxn->impl) + write_function_impl(&ctx, fxn->impl); } blob_write_uint32(blob, nir->constant_data_size); if (nir->constant_data_size > 0) blob_write_bytes(blob, nir->constant_data, nir->constant_data_size); - *(uintptr_t *)(blob->data + idx_size_offset) = ctx.next_idx; + *(uint32_t *)(blob->data + idx_size_offset) = ctx.next_idx; _mesa_hash_table_destroy(ctx.remap_table, NULL); util_dynarray_fini(&ctx.phi_fixups); @@ -1152,12 +2045,11 @@ const struct nir_shader_compiler_options *options, struct blob_reader *blob) { - read_ctx ctx; + read_ctx ctx = {0}; ctx.blob = blob; list_inithead(&ctx.phi_srcs); - ctx.idx_table_len = blob_read_intptr(blob); + ctx.idx_table_len = blob_read_uint32(blob); ctx.idx_table = calloc(ctx.idx_table_len, sizeof(uintptr_t)); - ctx.next_idx = 0; uint32_t strings = blob_read_uint32(blob); char *name = (strings & 0x1) ? blob_read_string(blob) : NULL; @@ -1190,8 +2082,10 @@ for (unsigned i = 0; i < num_functions; i++) read_function(&ctx); - nir_foreach_function(fxn, ctx.nir) - fxn->impl = read_function_impl(&ctx, fxn); + nir_foreach_function(fxn, ctx.nir) { + if (fxn->impl == NIR_SERIALIZE_FUNC_HAS_IMPL) + fxn->impl = read_function_impl(&ctx, fxn); + } ctx.nir->constant_data_size = blob_read_uint32(blob); if (ctx.nir->constant_data_size > 0) { @@ -1213,7 +2107,7 @@ struct blob writer; blob_init(&writer); - nir_serialize(&writer, shader); + nir_serialize(&writer, shader, false); /* Delete all of dest's ralloc children but leave dest alone */ void *dead_ctx = ralloc_context(NULL); diff -Nru mesa-19.2.8/src/compiler/nir/nir_serialize.h mesa-20.0.8/src/compiler/nir/nir_serialize.h --- mesa-19.2.8/src/compiler/nir/nir_serialize.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_serialize.h 2020-06-12 01:21:16.000000000 +0000 @@ -25,13 +25,13 @@ #define _NIR_SERIALIZE_H #include "nir.h" -#include "compiler/blob.h" +#include "util/blob.h" #ifdef __cplusplus extern "C" { #endif -void nir_serialize(struct blob *blob, const nir_shader *nir); +void nir_serialize(struct blob *blob, const nir_shader *nir, bool strip); nir_shader *nir_deserialize(void *mem_ctx, const struct nir_shader_compiler_options *options, struct blob_reader *blob); diff -Nru mesa-19.2.8/src/compiler/nir/nir_strip.c mesa-20.0.8/src/compiler/nir/nir_strip.c --- mesa-19.2.8/src/compiler/nir/nir_strip.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_strip.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ -/* - * Copyright © 2019 Valve Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - */ - -#include "nir.h" -#include "util/debug.h" - -/* This pass removes information which is only useful for debugging, - * making cache hits from similar shaders more likely. - */ - -static void -strip_variable(nir_variable *var) -{ - var->name = NULL; - - if (var->data.mode != nir_var_shader_in && - var->data.mode != nir_var_shader_out) { - /* We assume that this is called after nir_lower_io(), at which point - * the original user-facing location is irrelevant except for inputs and - * outputs. - */ - var->data.location = 0; - } -} - -static void -strip_register(nir_register *reg) -{ - reg->name = NULL; -} - -static bool -strip_def(nir_ssa_def *def, void *_unused) -{ - (void) _unused; - def->name = NULL; - return true; -} - -static void -strip_impl(nir_function_impl *impl) -{ - nir_index_ssa_defs(impl); - - nir_foreach_variable(var, &impl->locals) - strip_variable(var); - nir_foreach_register(reg, &impl->registers) - strip_register(reg); - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - nir_foreach_ssa_def(instr, strip_def, NULL); - } - } -} - -void -nir_strip(nir_shader *shader) -{ - static int should_strip = -1; - if (should_strip < 0) - should_strip = env_var_as_boolean("NIR_STRIP", true); - if (!should_strip) - return; - - shader->info.name = NULL; - shader->info.label = NULL; - - nir_foreach_variable(var, &shader->uniforms) - strip_variable(var); - nir_foreach_variable(var, &shader->inputs) - strip_variable(var); - nir_foreach_variable(var, &shader->outputs) - strip_variable(var); - nir_foreach_variable(var, &shader->system_values) - strip_variable(var); - nir_foreach_variable(var, &shader->globals) - strip_variable(var); - - nir_foreach_function(func, shader) { - if (func->impl) - strip_impl(func->impl); - } -} diff -Nru mesa-19.2.8/src/compiler/nir/nir_validate.c mesa-20.0.8/src/compiler/nir/nir_validate.c --- mesa-19.2.8/src/compiler/nir/nir_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_validate.c 2020-06-12 01:21:16.000000000 +0000 @@ -126,6 +126,12 @@ unsigned bit_sizes, unsigned num_components); static void +validate_num_components(validate_state *state, unsigned num_components) +{ + validate_assert(state, nir_num_components_valid(num_components)); +} + +static void validate_reg_src(nir_src *src, validate_state *state, unsigned bit_sizes, unsigned num_components) { @@ -275,10 +281,7 @@ BITSET_SET(state->ssa_defs_found, def->index); validate_assert(state, def->parent_instr == state->instr); - - validate_assert(state, (def->num_components <= 4) || - (def->num_components == 8) || - (def->num_components == 16)); + validate_num_components(state, def->num_components); list_validate(&def->uses); nir_foreach_use(src, def) { @@ -498,7 +501,7 @@ * conditions expect well-formed Booleans. If you want to compare with * NULL, an explicit comparison operation should be used. */ - validate_assert(state, list_empty(&instr->dest.ssa.if_uses)); + validate_assert(state, list_is_empty(&instr->dest.ssa.if_uses)); /* Only certain modes can be used as sources for phi instructions. */ nir_foreach_use(use, &instr->dest.ssa) { @@ -569,11 +572,15 @@ break; } - unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs; + if (instr->num_components > 0) + validate_num_components(state, instr->num_components); + + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + unsigned num_srcs = info->num_srcs; for (unsigned i = 0; i < num_srcs; i++) { unsigned components_read = nir_intrinsic_src_components(instr, i); - validate_assert(state, components_read > 0); + validate_num_components(state, components_read); validate_src(&instr->src[i], state, src_bit_sizes[i], components_read); } @@ -582,8 +589,7 @@ unsigned components_written = nir_intrinsic_dest_components(instr); unsigned bit_sizes = nir_intrinsic_infos[instr->intrinsic].dest_bit_sizes; - validate_assert(state, components_written > 0); - + validate_num_components(state, components_written); if (dest_bit_size && bit_sizes) validate_assert(state, dest_bit_size & bit_sizes); else @@ -987,6 +993,7 @@ { validate_assert(state, reg->index < state->impl->reg_alloc); validate_assert(state, !BITSET_TEST(state->regs_found, reg->index)); + validate_num_components(state, reg->num_components); BITSET_SET(state->regs_found, reg->index); list_validate(®->uses); @@ -1055,14 +1062,14 @@ } static void -validate_var_decl(nir_variable *var, bool is_global, validate_state *state) +validate_var_decl(nir_variable *var, nir_variable_mode valid_modes, + validate_state *state) { state->var = var; - validate_assert(state, is_global == nir_variable_is_global(var)); - /* Must have exactly one mode set */ validate_assert(state, util_is_power_of_two_nonzero(var->data.mode)); + validate_assert(state, var->data.mode & valid_modes); if (var->data.compact) { /* The "compact" flag is only valid on arrays of scalars. */ @@ -1090,7 +1097,8 @@ */ _mesa_hash_table_insert(state->var_defs, var, - is_global ? NULL : state->impl); + valid_modes == nir_var_function_temp ? + state->impl : NULL); state->var = NULL; } @@ -1119,7 +1127,7 @@ exec_list_validate(&impl->locals); nir_foreach_variable(var, &impl->locals) { - validate_var_decl(var, false, state); + validate_var_decl(var, nir_var_function_temp, state); } state->regs_found = reralloc(state->mem_ctx, state->regs_found, @@ -1235,32 +1243,35 @@ exec_list_validate(&shader->uniforms); nir_foreach_variable(var, &shader->uniforms) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_uniform | + nir_var_mem_ubo | + nir_var_mem_ssbo, + &state); } exec_list_validate(&shader->inputs); nir_foreach_variable(var, &shader->inputs) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_shader_in, &state); } exec_list_validate(&shader->outputs); nir_foreach_variable(var, &shader->outputs) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_shader_out, &state); } exec_list_validate(&shader->shared); nir_foreach_variable(var, &shader->shared) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_mem_shared, &state); } exec_list_validate(&shader->globals); nir_foreach_variable(var, &shader->globals) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_shader_temp, &state); } exec_list_validate(&shader->system_values); nir_foreach_variable(var, &shader->system_values) { - validate_var_decl(var, true, &state); + validate_var_decl(var, nir_var_system_value, &state); } exec_list_validate(&shader->functions); diff -Nru mesa-19.2.8/src/compiler/nir/nir_worklist.h mesa-20.0.8/src/compiler/nir/nir_worklist.h --- mesa-19.2.8/src/compiler/nir/nir_worklist.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/nir_worklist.h 2020-06-12 01:21:16.000000000 +0000 @@ -124,7 +124,7 @@ } static inline bool -nir_instr_worklist_empty(nir_instr_worklist *wl) +nir_instr_worklist_is_empty(nir_instr_worklist *wl) { return nir_instr_worklist_length(wl) == 0; } diff -Nru mesa-19.2.8/src/compiler/nir/tests/builder_tests.cpp mesa-20.0.8/src/compiler/nir/tests/builder_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/builder_tests.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/builder_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,155 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "nir.h" +#include "nir_builder.h" + +namespace { + +class nir_builder_test : public ::testing::Test { +private: + const glsl_type *type_for_def(nir_ssa_def *def) + { + switch (def->bit_size) { + case 8: return glsl_type::u8vec(def->num_components); + case 16: return glsl_type::u16vec(def->num_components); + case 32: return glsl_type::uvec(def->num_components); + case 64: return glsl_type::u64vec(def->num_components); + default: unreachable("Invalid bit size"); + } + } + +protected: + nir_builder_test(); + ~nir_builder_test(); + + void store_test_val(nir_ssa_def *val) + { + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_ssbo, + type_for_def(val), NULL); + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_deref); + store->num_components = val->num_components; + store->src[0] = nir_src_for_ssa(&nir_build_deref_var(b, var)->dest.ssa); + store->src[1] = nir_src_for_ssa(val); + nir_intrinsic_set_write_mask(store, ((1 << val->num_components) - 1)); + nir_builder_instr_insert(b, &store->instr); + + stores.push_back(store); + } + + nir_ssa_def *test_val(unsigned idx) + { + return stores[idx]->src[1].ssa; + } + + std::vector stores; + + void *mem_ctx; + void *lin_ctx; + + nir_builder *b; +}; + +nir_builder_test::nir_builder_test() +{ + glsl_type_singleton_init_or_ref(); + + mem_ctx = ralloc_context(NULL); + lin_ctx = linear_alloc_parent(mem_ctx, 0); + static const nir_shader_compiler_options options = { }; + b = rzalloc(mem_ctx, nir_builder); + nir_builder_init_simple_shader(b, mem_ctx, MESA_SHADER_COMPUTE, &options); +} + +nir_builder_test::~nir_builder_test() +{ + if (HasFailure()) { + printf("\nShader from the failed test:\n\n"); + nir_print_shader(b->shader, stdout); + } + + ralloc_free(mem_ctx); + + glsl_type_singleton_decref(); +} + +/* Allow grouping the tests while still sharing the helpers. */ +class nir_extract_bits_test : public nir_builder_test {}; + +} // namespace + +// TODO: Re-enable this once we get vec8 support in NIR +TEST_F(nir_extract_bits_test, DISABLED_unaligned8) +{ + nir_ssa_def *srcs[] = { + nir_imm_int(b, 0x03020100), + nir_imm_ivec2(b, 0x07060504, 0x0b0a0908), + }; + + store_test_val(nir_extract_bits(b, srcs, 2, 24, 1, 64)); + + NIR_PASS_V(b->shader, nir_opt_constant_folding); + + nir_src val = nir_src_for_ssa(test_val(0)); + + ASSERT_EQ(nir_src_as_uint(val), 0x0a09080706050403); +} + +TEST_F(nir_extract_bits_test, unaligned16_disabled) +{ + nir_ssa_def *srcs[] = { + nir_imm_int(b, 0x03020100), + nir_imm_ivec2(b, 0x07060504, 0x0b0a0908), + }; + + store_test_val(nir_extract_bits(b, srcs, 2, 16, 1, 64)); + + NIR_PASS_V(b->shader, nir_opt_constant_folding); + + nir_src val = nir_src_for_ssa(test_val(0)); + + ASSERT_EQ(nir_src_as_uint(val), 0x0908070605040302); +} + +TEST_F(nir_extract_bits_test, mixed_bit_sizes) +{ + nir_ssa_def *srcs[] = { + nir_imm_int(b, 0x03020100), + nir_imm_intN_t(b, 0x04, 8), + nir_imm_intN_t(b, 0x08070605, 32), + nir_vec2(b, nir_imm_intN_t(b, 0x0a09, 16), + nir_imm_intN_t(b, 0x0c0b, 16)), + }; + + store_test_val(nir_extract_bits(b, srcs, 4, 24, 2, 32)); + + NIR_PASS_V(b->shader, nir_opt_constant_folding); + + nir_src val = nir_src_for_ssa(test_val(0)); + + ASSERT_EQ(nir_src_comp_as_uint(val, 0), 0x06050403); + ASSERT_EQ(nir_src_comp_as_uint(val, 1), 0x0a090807); +} diff -Nru mesa-19.2.8/src/compiler/nir/tests/comparison_pre_tests.cpp mesa-20.0.8/src/compiler/nir/tests/comparison_pre_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/comparison_pre_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/comparison_pre_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,8 @@ protected: comparison_pre_test() { + glsl_type_singleton_init_or_ref(); + static const nir_shader_compiler_options options = { }; nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options); @@ -39,6 +41,7 @@ ~comparison_pre_test() { ralloc_free(bld.shader); + glsl_type_singleton_decref(); } struct nir_builder bld; diff -Nru mesa-19.2.8/src/compiler/nir/tests/control_flow_tests.cpp mesa-20.0.8/src/compiler/nir/tests/control_flow_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/control_flow_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/control_flow_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,8 @@ nir_cf_test::nir_cf_test() { + glsl_type_singleton_init_or_ref(); + static const nir_shader_compiler_options options = { }; nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, &options); } @@ -41,6 +43,7 @@ nir_cf_test::~nir_cf_test() { ralloc_free(b.shader); + glsl_type_singleton_decref(); } TEST_F(nir_cf_test, delete_break_in_loop) diff -Nru mesa-19.2.8/src/compiler/nir/tests/load_store_vectorizer_tests.cpp mesa-20.0.8/src/compiler/nir/tests/load_store_vectorizer_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/load_store_vectorizer_tests.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/load_store_vectorizer_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1766 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "nir.h" +#include "nir_builder.h" + +namespace { + +class nir_load_store_vectorize_test : public ::testing::Test { +protected: + nir_load_store_vectorize_test(); + ~nir_load_store_vectorize_test(); + + unsigned count_intrinsics(nir_intrinsic_op intrinsic); + + nir_intrinsic_instr *get_intrinsic(nir_intrinsic_op intrinsic, + unsigned index); + + bool run_vectorizer(nir_variable_mode modes, bool cse=false); + + nir_ssa_def *get_resource(uint32_t binding, bool ssbo); + + nir_intrinsic_instr *create_indirect_load(nir_variable_mode mode, uint32_t binding, nir_ssa_def *offset, + uint32_t id, unsigned bit_size=32, unsigned components=1, + unsigned access=0); + void create_indirect_store(nir_variable_mode mode, uint32_t binding, nir_ssa_def *offset, + uint32_t id, unsigned bit_size=32, unsigned components=1, + unsigned wrmask=0xf, unsigned access=0); + + nir_intrinsic_instr *create_load(nir_variable_mode mode, uint32_t binding, uint32_t offset, + uint32_t id, unsigned bit_size=32, unsigned components=1, + unsigned access=0); + void create_store(nir_variable_mode mode, uint32_t binding, uint32_t offset, + uint32_t id, unsigned bit_size=32, unsigned components=1, unsigned wrmask=0xf, + unsigned access=0); + + void create_shared_load(nir_deref_instr *deref, uint32_t id, + unsigned bit_size=32, unsigned components=1); + void create_shared_store(nir_deref_instr *deref, uint32_t id, + unsigned bit_size=32, unsigned components=1, unsigned wrmask=0xf); + + bool test_alu(nir_instr *instr, nir_op op); + bool test_alu_def(nir_instr *instr, unsigned index, nir_ssa_def *def, unsigned swizzle=0); + + static bool mem_vectorize_callback(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high); + static void shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align); + + void *mem_ctx; + + nir_builder *b; + std::map loads; + std::map res_map; +}; + +nir_load_store_vectorize_test::nir_load_store_vectorize_test() +{ + glsl_type_singleton_init_or_ref(); + + mem_ctx = ralloc_context(NULL); + static const nir_shader_compiler_options options = { }; + b = rzalloc(mem_ctx, nir_builder); + nir_builder_init_simple_shader(b, mem_ctx, MESA_SHADER_COMPUTE, &options); +} + +nir_load_store_vectorize_test::~nir_load_store_vectorize_test() +{ + if (HasFailure()) { + printf("\nShader from the failed test:\n\n"); + nir_print_shader(b->shader, stdout); + } + + ralloc_free(mem_ctx); + + glsl_type_singleton_decref(); +} + +unsigned +nir_load_store_vectorize_test::count_intrinsics(nir_intrinsic_op intrinsic) +{ + unsigned count = 0; + nir_foreach_block(block, b->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == intrinsic) + count++; + } + } + return count; +} + +nir_intrinsic_instr * +nir_load_store_vectorize_test::get_intrinsic(nir_intrinsic_op intrinsic, + unsigned index) +{ + nir_foreach_block(block, b->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == intrinsic) { + if (index == 0) + return intrin; + index--; + } + } + } + return NULL; +} + +bool +nir_load_store_vectorize_test::run_vectorizer(nir_variable_mode modes, bool cse) +{ + if (modes & nir_var_mem_shared) + nir_lower_vars_to_explicit_types(b->shader, nir_var_mem_shared, shared_type_info); + bool progress = nir_opt_load_store_vectorize(b->shader, modes, mem_vectorize_callback); + if (progress) { + nir_validate_shader(b->shader, NULL); + if (cse) + nir_opt_cse(b->shader); + nir_copy_prop(b->shader); + nir_opt_algebraic(b->shader); + nir_opt_constant_folding(b->shader); + } + return progress; +} + +nir_ssa_def * +nir_load_store_vectorize_test::get_resource(uint32_t binding, bool ssbo) +{ + if (res_map.count(binding)) + return res_map[binding]; + + nir_intrinsic_instr *res = nir_intrinsic_instr_create( + b->shader, nir_intrinsic_vulkan_resource_index); + nir_ssa_dest_init(&res->instr, &res->dest, 1, 32, NULL); + res->num_components = 1; + res->src[0] = nir_src_for_ssa(nir_imm_zero(b, 1, 32)); + nir_intrinsic_set_desc_type( + res, ssbo ? 7/*VK_DESCRIPTOR_TYPE_STORAGE_BUFFER*/ : 6/*VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER*/); + nir_intrinsic_set_desc_set(res, 0); + nir_intrinsic_set_binding(res, binding); + nir_builder_instr_insert(b, &res->instr); + res_map[binding] = &res->dest.ssa; + return &res->dest.ssa; +} + +nir_intrinsic_instr * +nir_load_store_vectorize_test::create_indirect_load( + nir_variable_mode mode, uint32_t binding, nir_ssa_def *offset, uint32_t id, + unsigned bit_size, unsigned components, unsigned access) +{ + nir_intrinsic_op intrinsic; + nir_ssa_def *res = NULL; + switch (mode) { + case nir_var_mem_ubo: + intrinsic = nir_intrinsic_load_ubo; + res = get_resource(binding, false); + break; + case nir_var_mem_ssbo: + intrinsic = nir_intrinsic_load_ssbo; + res = get_resource(binding, true); + break; + case nir_var_mem_push_const: + intrinsic = nir_intrinsic_load_push_constant; + break; + default: + return NULL; + } + nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, intrinsic); + nir_ssa_dest_init(&load->instr, &load->dest, components, bit_size, NULL); + load->num_components = components; + if (res) { + load->src[0] = nir_src_for_ssa(res); + load->src[1] = nir_src_for_ssa(offset); + } else { + load->src[0] = nir_src_for_ssa(offset); + } + if (mode != nir_var_mem_push_const) { + nir_intrinsic_set_align(load, (bit_size == 1 ? 32 : bit_size) / 8, 0); + nir_intrinsic_set_access(load, (gl_access_qualifier)access); + } + nir_builder_instr_insert(b, &load->instr); + nir_instr *mov = nir_mov(b, &load->dest.ssa)->parent_instr; + loads[id] = &nir_instr_as_alu(mov)->src[0]; + + return load; +} + +void +nir_load_store_vectorize_test::create_indirect_store( + nir_variable_mode mode, uint32_t binding, nir_ssa_def *offset, uint32_t id, + unsigned bit_size, unsigned components, unsigned wrmask, unsigned access) +{ + nir_const_value values[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < components; i++) + values[i] = nir_const_value_for_raw_uint((id << 4) | i, bit_size); + nir_ssa_def *value = nir_build_imm(b, components, bit_size, values); + + nir_intrinsic_op intrinsic; + nir_ssa_def *res = NULL; + switch (mode) { + case nir_var_mem_ssbo: + intrinsic = nir_intrinsic_store_ssbo; + res = get_resource(binding, true); + break; + case nir_var_mem_shared: + intrinsic = nir_intrinsic_store_shared; + break; + default: + return; + } + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, intrinsic); + nir_ssa_dest_init(&store->instr, &store->dest, components, bit_size, NULL); + store->num_components = components; + if (res) { + store->src[0] = nir_src_for_ssa(value); + store->src[1] = nir_src_for_ssa(res); + store->src[2] = nir_src_for_ssa(offset); + } else { + store->src[0] = nir_src_for_ssa(value); + store->src[1] = nir_src_for_ssa(offset); + } + nir_intrinsic_set_align(store, (bit_size == 1 ? 32 : bit_size) / 8, 0); + nir_intrinsic_set_access(store, (gl_access_qualifier)access); + nir_intrinsic_set_write_mask(store, wrmask & ((1 << components) - 1)); + nir_builder_instr_insert(b, &store->instr); +} + +nir_intrinsic_instr * +nir_load_store_vectorize_test::create_load( + nir_variable_mode mode, uint32_t binding, uint32_t offset, uint32_t id, + unsigned bit_size, unsigned components, unsigned access) +{ + return create_indirect_load(mode, binding, nir_imm_int(b, offset), id, bit_size, components, access); +} + +void +nir_load_store_vectorize_test::create_store( + nir_variable_mode mode, uint32_t binding, uint32_t offset, uint32_t id, + unsigned bit_size, unsigned components, unsigned wrmask, unsigned access) +{ + create_indirect_store(mode, binding, nir_imm_int(b, offset), id, bit_size, components, wrmask, access); +} + +void nir_load_store_vectorize_test::create_shared_load( + nir_deref_instr *deref, uint32_t id, unsigned bit_size, unsigned components) +{ + nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_deref); + nir_ssa_dest_init(&load->instr, &load->dest, components, bit_size, NULL); + load->num_components = components; + load->src[0] = nir_src_for_ssa(&deref->dest.ssa); + nir_builder_instr_insert(b, &load->instr); + nir_instr *mov = nir_mov(b, &load->dest.ssa)->parent_instr; + loads[id] = &nir_instr_as_alu(mov)->src[0]; +} + +void nir_load_store_vectorize_test::create_shared_store( + nir_deref_instr *deref, uint32_t id, + unsigned bit_size, unsigned components, unsigned wrmask) +{ + nir_const_value values[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < components; i++) + values[i] = nir_const_value_for_raw_uint((id << 4) | i, bit_size); + nir_ssa_def *value = nir_build_imm(b, components, bit_size, values); + + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_deref); + nir_ssa_dest_init(&store->instr, &store->dest, components, bit_size, NULL); + store->num_components = components; + store->src[0] = nir_src_for_ssa(&deref->dest.ssa); + store->src[1] = nir_src_for_ssa(value); + nir_intrinsic_set_write_mask(store, wrmask & ((1 << components) - 1)); + nir_builder_instr_insert(b, &store->instr); +} + +bool nir_load_store_vectorize_test::test_alu(nir_instr *instr, nir_op op) +{ + return instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->op == op; +} + +bool nir_load_store_vectorize_test::test_alu_def( + nir_instr *instr, unsigned index, nir_ssa_def *def, unsigned swizzle) +{ + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + + if (index >= nir_op_infos[alu->op].num_inputs) + return false; + if (alu->src[index].src.ssa != def) + return false; + if (alu->src[index].swizzle[0] != swizzle) + return false; + + return true; +} + +bool nir_load_store_vectorize_test::mem_vectorize_callback( + unsigned align, unsigned bit_size, unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, nir_intrinsic_instr *high) +{ + return bit_size / 8; +} + +void nir_load_store_vectorize_test::shared_type_info( + const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; +} +} // namespace + +TEST_F(nir_load_store_vectorize_test, ubo_load_adjacent) +{ + create_load(nir_var_mem_ubo, 0, 0, 0x1); + create_load(nir_var_mem_ubo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ubo_load_intersecting) +{ + create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2); + create_load(nir_var_mem_ubo, 0, 4, 0x2, 32, 2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 3); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x1]->swizzle[1], 1); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); + ASSERT_EQ(loads[0x2]->swizzle[1], 2); +} + +TEST_F(nir_load_store_vectorize_test, ubo_load_identical) +{ + create_load(nir_var_mem_ubo, 0, 0, 0x1); + create_load(nir_var_mem_ubo, 0, 0, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ubo_load_large) +{ + create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2); + create_load(nir_var_mem_ubo, 0, 8, 0x2, 32, 3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo)); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_adjacent) +{ + create_load(nir_var_mem_push_const, 0, 0, 0x1); + create_load(nir_var_mem_push_const, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_push_constant, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(nir_src_as_uint(load->src[0]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_adjacent_base) +{ + create_load(nir_var_mem_push_const, 0, 0, 0x1); + nir_intrinsic_set_base(create_load(nir_var_mem_push_const, 0, 0, 0x2), 4); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_push_constant, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(nir_src_as_uint(load->src[0]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_load(nir_var_mem_ssbo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_indirect) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + create_indirect_load(nir_var_mem_ssbo, 0, index_base, 0x1); + create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, index_base, 4), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(load->src[1].ssa, index_base); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_indirect_sub) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + nir_ssa_def *index_base_prev = nir_iadd_imm(b, index_base, 0xfffffffc); + create_indirect_load(nir_var_mem_ssbo, 0, index_base_prev, 0x1); + create_indirect_load(nir_var_mem_ssbo, 0, index_base, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(load->src[1].ssa, index_base_prev); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_indirect_neg_stride) +{ + nir_ssa_def *inv = nir_load_local_invocation_index(b); + nir_ssa_def *inv_plus_one = nir_iadd_imm(b, inv, 1); + nir_ssa_def *index_base = nir_imul_imm(b, inv, 0xfffffffc); + nir_ssa_def *index_base_prev = nir_imul_imm(b, inv_plus_one, 0xfffffffc); + create_indirect_load(nir_var_mem_ssbo, 0, index_base_prev, 0x1); + create_indirect_load(nir_var_mem_ssbo, 0, index_base, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); + + /* nir_opt_algebraic optimizes the imul */ + ASSERT_TRUE(test_alu(load->src[1].ssa->parent_instr, nir_op_ineg)); + nir_ssa_def *offset = nir_instr_as_alu(load->src[1].ssa->parent_instr)->src[0].src.ssa; + ASSERT_TRUE(test_alu(offset->parent_instr, nir_op_ishl)); + nir_alu_instr *shl = nir_instr_as_alu(offset->parent_instr); + ASSERT_EQ(shl->src[0].src.ssa, inv_plus_one); + ASSERT_EQ(nir_src_as_uint(shl->src[1].src), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_identical_store_adjacent) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 0, 4, 0x2); + create_load(nir_var_mem_ssbo, 0, 0, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_identical_store_intersecting) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 4, 0x2); + create_load(nir_var_mem_ssbo, 0, 0, 0x3, 32, 2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_identical_store_identical) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + create_load(nir_var_mem_ssbo, 0, 0, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_identical_load_identical) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1); + create_load(nir_var_mem_ssbo, 0, 0, 0x2); + create_store(nir_var_mem_ssbo, 0, 0, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); +} + +/* if nir_opt_load_store_vectorize were implemented like many load/store + * optimization passes are (for example, nir_opt_combine_stores and + * nir_opt_copy_prop_vars) and stopped tracking a load when an aliasing store is + * encountered, this case wouldn't be optimized. + * A similar test for derefs is shared_load_adjacent_store_identical. */ +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_store_identical) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + create_load(nir_var_mem_ssbo, 0, 4, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x3); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 2); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x20); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_intersecting) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 4, 0x2, 32, 2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x7); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 3); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x20); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 32), 0x21); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_identical) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x1); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 1); + ASSERT_EQ(nir_src_as_uint(store->src[0]), 0x20); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_large) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 8, 0x2, 32, 3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ubo_load_adjacent_memory_barrier) +{ + create_load(nir_var_mem_ubo, 0, 0, 0x1); + nir_builder_instr_insert(b, &nir_intrinsic_instr_create(b->shader, nir_intrinsic_memory_barrier)->instr); + create_load(nir_var_mem_ubo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_memory_barrier) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + nir_builder_instr_insert(b, &nir_intrinsic_instr_create(b->shader, nir_intrinsic_memory_barrier)->instr); + create_load(nir_var_mem_ssbo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +/* nir_intrinsic_control_barrier only syncs invocations in a workgroup, it + * doesn't require that loads/stores complete. + */ +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_barrier) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + nir_builder_instr_insert(b, &nir_intrinsic_instr_create(b->shader, nir_intrinsic_control_barrier)->instr); + create_load(nir_var_mem_ssbo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_memory_barrier_shared) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + nir_builder_instr_insert(b, &nir_intrinsic_instr_create(b->shader, nir_intrinsic_memory_barrier_shared)->instr); + create_load(nir_var_mem_ssbo, 0, 4, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_8_8_16) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1, 8); + create_load(nir_var_mem_ssbo, 0, 1, 0x2, 8); + create_load(nir_var_mem_ssbo, 0, 2, 0x3, 16); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 3); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 8); + ASSERT_EQ(load->dest.ssa.num_components, 4); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); + + nir_ssa_def *val = loads[0x3]->src.ssa; + ASSERT_EQ(val->bit_size, 16); + ASSERT_EQ(val->num_components, 1); + ASSERT_TRUE(test_alu(val->parent_instr, nir_op_ior)); + nir_ssa_def *low = nir_instr_as_alu(val->parent_instr)->src[0].src.ssa; + nir_ssa_def *high = nir_instr_as_alu(val->parent_instr)->src[1].src.ssa; + ASSERT_TRUE(test_alu(high->parent_instr, nir_op_ishl)); + high = nir_instr_as_alu(high->parent_instr)->src[0].src.ssa; + ASSERT_TRUE(test_alu(low->parent_instr, nir_op_u2u16)); + ASSERT_TRUE(test_alu(high->parent_instr, nir_op_u2u16)); + ASSERT_TRUE(test_alu_def(low->parent_instr, 0, &load->dest.ssa, 2)); + ASSERT_TRUE(test_alu_def(high->parent_instr, 0, &load->dest.ssa, 3)); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_32_32_64) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_load(nir_var_mem_ssbo, 0, 8, 0x2, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 4); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x1]->swizzle[1], 1); + + nir_ssa_def *val = loads[0x2]->src.ssa; + ASSERT_EQ(val->bit_size, 64); + ASSERT_EQ(val->num_components, 1); + ASSERT_TRUE(test_alu(val->parent_instr, nir_op_pack_64_2x32)); + nir_alu_instr *pack = nir_instr_as_alu(val->parent_instr); + ASSERT_EQ(pack->src[0].src.ssa, &load->dest.ssa); + ASSERT_EQ(pack->src[0].swizzle[0], 2); + ASSERT_EQ(pack->src[0].swizzle[1], 3); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_adjacent_32_32_64_64) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_load(nir_var_mem_ssbo, 0, 8, 0x2, 64); + create_load(nir_var_mem_ssbo, 0, 16, 0x3, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 3); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, true)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 64); + ASSERT_EQ(load->dest.ssa.num_components, 3); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->swizzle[0], 2); + + /* pack_64_2x32(unpack_64_2x32()) is created because the 32-bit and first + * 64-bit loads are combined before the second 64-bit load is even considered. */ + nir_ssa_def *val = loads[0x2]->src.ssa; + ASSERT_EQ(val->bit_size, 64); + ASSERT_EQ(val->num_components, 1); + ASSERT_TRUE(test_alu(val->parent_instr, nir_op_pack_64_2x32)); + nir_alu_instr *pack = nir_instr_as_alu(val->parent_instr); + ASSERT_TRUE(test_alu(pack->src[0].src.ssa->parent_instr, nir_op_unpack_64_2x32)); + nir_alu_instr *unpack = nir_instr_as_alu(pack->src[0].src.ssa->parent_instr); + ASSERT_EQ(unpack->src[0].src.ssa, &load->dest.ssa); + ASSERT_EQ(unpack->src[0].swizzle[0], 1); + + val = loads[0x1]->src.ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 2); + ASSERT_TRUE(test_alu(val->parent_instr, nir_op_unpack_64_2x32)); + unpack = nir_instr_as_alu(val->parent_instr); + ASSERT_EQ(unpack->src[0].src.ssa, &load->dest.ssa); + ASSERT_EQ(unpack->src[0].swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_intersecting_32_32_64) +{ + create_load(nir_var_mem_ssbo, 0, 4, 0x1, 32, 2); + create_load(nir_var_mem_ssbo, 0, 8, 0x2, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 3); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 4); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x1]->swizzle[1], 1); + + nir_ssa_def *val = loads[0x2]->src.ssa; + ASSERT_EQ(val->bit_size, 64); + ASSERT_EQ(val->num_components, 1); + ASSERT_TRUE(test_alu(val->parent_instr, nir_op_pack_64_2x32)); + nir_alu_instr *pack = nir_instr_as_alu(val->parent_instr); + ASSERT_EQ(pack->src[0].src.ssa, &load->dest.ssa); + ASSERT_EQ(pack->src[0].swizzle[0], 1); + ASSERT_EQ(pack->src[0].swizzle[1], 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent_8_8_16) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 8); + create_store(nir_var_mem_ssbo, 0, 1, 0x2, 8); + create_store(nir_var_mem_ssbo, 0, 2, 0x3, 16); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 3); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0xf); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 8); + ASSERT_EQ(val->num_components, 4); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x20); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 32), 0x30); + ASSERT_EQ(nir_const_value_as_uint(cv[3], 32), 0x0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent_32_32_64) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 8, 0x2, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0xf); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 4); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x11); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 32), 0x20); + ASSERT_EQ(nir_const_value_as_uint(cv[3], 32), 0x0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent_32_32_64_64) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 8, 0x2, 64); + create_store(nir_var_mem_ssbo, 0, 16, 0x3, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 3); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x7); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 64); + ASSERT_EQ(val->num_components, 3); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 64), 0x1100000010ull); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 64), 0x20); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 64), 0x30); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_intersecting_32_32_64) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 2); + create_store(nir_var_mem_ssbo, 0, 4, 0x2, 64); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x7); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 3); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x20); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 32), 0x0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_adjacent_32_64) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32); + create_store(nir_var_mem_ssbo, 0, 4, 0x2, 64, 2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_store_identical_wrmask) +{ + create_store(nir_var_mem_ssbo, 0, 0, 0x1, 32, 4, 1 | 4); + create_store(nir_var_mem_ssbo, 0, 0, 0x2, 32, 4, 2 | 4 | 8); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_ssbo), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_ssbo, 0); + ASSERT_EQ(nir_src_as_uint(store->src[2]), 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0xf); + nir_ssa_def *val = store->src[0].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 4); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x21); + ASSERT_EQ(nir_const_value_as_uint(cv[2], 32), 0x22); + ASSERT_EQ(nir_const_value_as_uint(cv[3], 32), 0x23); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_adjacent) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x1); + create_shared_load(nir_build_deref_array_imm(b, deref, 1), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(nir_src_as_uint(deref->arr.index), 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_distant_64bit) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + nir_ssa_dest_init(&deref->instr, &deref->dest, 1, 64, NULL); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0x100000000), 0x1); + create_shared_load(nir_build_deref_array_imm(b, deref, 0x200000001), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_adjacent_indirect) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + + create_shared_load(nir_build_deref_array(b, deref, index_base), 0x1); + create_shared_load(nir_build_deref_array(b, deref, nir_iadd_imm(b, index_base, 1)), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(deref->arr.index.ssa, index_base); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_adjacent_indirect_sub) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + nir_ssa_def *index_base_prev = nir_iadd_imm(b, index_base, 0xffffffff); + + create_shared_load(nir_build_deref_array(b, deref, index_base_prev), 0x1); + create_shared_load(nir_build_deref_array(b, deref, index_base), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(deref->arr.index.ssa, index_base_prev); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_struct) +{ + glsl_struct_field fields[2] = {glsl_struct_field(glsl_uint_type(), "field0"), + glsl_struct_field(glsl_array_type(glsl_uint_type(), 4, 0), "field1")}; + + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_struct_type(fields, 2, "Struct", false), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_struct(b, deref, 0), 0x1); + create_shared_load(nir_build_deref_array_imm(b, nir_build_deref_struct(b, deref, 1), 0), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_struct); + ASSERT_EQ(deref->strct.index, 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_identical_store_adjacent) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x1); + create_shared_store(nir_build_deref_array_imm(b, deref, 1), 0x2); + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 1); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(nir_src_as_uint(deref->arr.index), 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_identical_store_identical) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x1); + create_shared_store(nir_build_deref_array_imm(b, deref, 0), 0x2); + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_adjacent_store_identical) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x1); + create_shared_store(nir_build_deref_array_imm(b, deref, 0), 0x2); + create_shared_load(nir_build_deref_array_imm(b, deref, 1), 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 1); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(nir_src_as_uint(deref->arr.index), 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_bool) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_bool_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_array_imm(b, deref, 0), 0x1, 1); + create_shared_load(nir_build_deref_array_imm(b, deref, 1), 0x2, 1); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(nir_src_as_uint(deref->arr.index), 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_TRUE(test_alu(loads[0x1]->src.ssa->parent_instr, nir_op_i2b1)); + ASSERT_TRUE(test_alu(loads[0x2]->src.ssa->parent_instr, nir_op_i2b1)); + ASSERT_TRUE(test_alu_def(loads[0x1]->src.ssa->parent_instr, 0, &load->dest.ssa, 0)); + ASSERT_TRUE(test_alu_def(loads[0x2]->src.ssa->parent_instr, 0, &load->dest.ssa, 1)); +} + +TEST_F(nir_load_store_vectorize_test, shared_load_bool_mixed) +{ + glsl_struct_field fields[2] = {glsl_struct_field(glsl_bool_type(), "field0"), + glsl_struct_field(glsl_array_type(glsl_uint_type(), 4, 0), "field1")}; + + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_struct_type(fields, 2, "Struct", false), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_load(nir_build_deref_struct(b, deref, 0), 0x1, 1); + create_shared_load(nir_build_deref_array_imm(b, nir_build_deref_struct(b, deref, 1), 0), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + + deref = nir_src_as_deref(load->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_struct); + ASSERT_EQ(deref->strct.index, 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); + + ASSERT_TRUE(test_alu(loads[0x1]->src.ssa->parent_instr, nir_op_i2b1)); + ASSERT_TRUE(test_alu_def(loads[0x1]->src.ssa->parent_instr, 0, &load->dest.ssa, 0)); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, shared_store_adjacent) +{ + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_array_type(glsl_uint_type(), 4, 0), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + create_shared_store(nir_build_deref_array_imm(b, deref, 0), 0x1); + create_shared_store(nir_build_deref_array_imm(b, deref, 1), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 1); + + nir_intrinsic_instr *store = get_intrinsic(nir_intrinsic_store_deref, 0); + ASSERT_EQ(nir_intrinsic_write_mask(store), 0x3); + nir_ssa_def *val = store->src[1].ssa; + ASSERT_EQ(val->bit_size, 32); + ASSERT_EQ(val->num_components, 2); + nir_const_value *cv = nir_instr_as_load_const(val->parent_instr)->value; + ASSERT_EQ(nir_const_value_as_uint(cv[0], 32), 0x10); + ASSERT_EQ(nir_const_value_as_uint(cv[1], 32), 0x20); + + deref = nir_src_as_deref(store->src[0]); + ASSERT_EQ(deref->deref_type, nir_deref_type_cast); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_array); + ASSERT_EQ(nir_src_as_uint(deref->arr.index), 0); + + deref = nir_deref_instr_parent(deref); + ASSERT_EQ(deref->deref_type, nir_deref_type_var); + ASSERT_EQ(deref->var, var); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_separate_base) +{ + create_load(nir_var_mem_push_const, 0, 0, 0x1); + nir_intrinsic_set_base(create_load(nir_var_mem_push_const, 0, 4, 0x2), 4); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_separate_direct_direct) +{ + create_load(nir_var_mem_push_const, 0, 0, 0x1); + create_load(nir_var_mem_push_const, 0, 8, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_separate_direct_indirect) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + create_load(nir_var_mem_push_const, 0, 0, 0x1); + create_indirect_load(nir_var_mem_push_const, 0, index_base, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_separate_indirect_indirect) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + create_indirect_load(nir_var_mem_push_const, 0, + nir_iadd(b, nir_imul(b, nir_iadd(b, index_base, nir_imm_int(b, 2)), nir_imm_int(b, 16)), nir_imm_int(b, 32)), 0x1); + create_indirect_load(nir_var_mem_push_const, 0, + nir_iadd(b, nir_imul(b, nir_iadd(b, index_base, nir_imm_int(b, 3)), nir_imm_int(b, 16)), nir_imm_int(b, 32)), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); +} + +TEST_F(nir_load_store_vectorize_test, push_const_load_adjacent_complex_indirect) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + //vec4 pc[]; pc[gl_LocalInvocationIndex].w; pc[gl_LocalInvocationIndex+1].x; + nir_ssa_def *low = nir_iadd(b, nir_imul(b, index_base, nir_imm_int(b, 16)), nir_imm_int(b, 12)); + nir_ssa_def *high = nir_imul(b, nir_iadd(b, index_base, nir_imm_int(b, 1)), nir_imm_int(b, 16)); + create_indirect_load(nir_var_mem_push_const, 0, low, 0x1); + create_indirect_load(nir_var_mem_push_const, 0, high, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_push_const)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_push_constant), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_push_constant, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 2); + ASSERT_EQ(load->src[0].ssa, low); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x2]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x2]->swizzle[0], 1); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_alias0) +{ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_indirect_store(nir_var_mem_ssbo, 0, index_base, 0x2); + create_load(nir_var_mem_ssbo, 0, 0, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_alias1) +{ + nir_ssa_def *load_base = nir_load_global_invocation_index(b, 32); + nir_ssa_def *store_base = nir_load_local_invocation_index(b); + create_indirect_load(nir_var_mem_ssbo, 0, load_base, 0x1); + create_indirect_store(nir_var_mem_ssbo, 0, store_base, 0x2); + create_indirect_load(nir_var_mem_ssbo, 0, load_base, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, DISABLED_ssbo_alias2) +{ + /* TODO: try to combine these loads */ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + nir_ssa_def *offset = nir_iadd(b, nir_imul(b, index_base, nir_imm_int(b, 16)), nir_imm_int(b, 4)); + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(load->src[1].ssa, offset); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_alias3) +{ + /* these loads can be combined if nir_alu_instr::no_unsigned_wrap is set. + * these loads can't be combined because if index_base == 268435455, then + * offset == 0 because the addition would wrap around */ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + nir_ssa_def *offset = nir_iadd(b, nir_imul(b, index_base, nir_imm_int(b, 16)), nir_imm_int(b, 16)); + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, DISABLED_ssbo_alias4) +{ + /* TODO: try to combine these loads */ + nir_ssa_def *index_base = nir_load_local_invocation_index(b); + nir_ssa_def *offset = nir_iadd(b, nir_imul(b, index_base, nir_imm_int(b, 16)), nir_imm_int(b, 16)); + nir_instr_as_alu(offset->parent_instr)->no_unsigned_wrap = true; + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1); + create_store(nir_var_mem_ssbo, 0, 0, 0x2); + create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(load->src[1].ssa, offset); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_alias5) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1); + create_store(nir_var_mem_ssbo, 1, 0, 0x2); + create_load(nir_var_mem_ssbo, 0, 0, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_alias6) +{ + create_load(nir_var_mem_ssbo, 0, 0, 0x1, 32, 1, ACCESS_RESTRICT); + create_store(nir_var_mem_ssbo, 1, 0, 0x2, 32, 1, 0xf, ACCESS_RESTRICT); + create_load(nir_var_mem_ssbo, 0, 0, 0x3, 32, 1, ACCESS_RESTRICT); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(nir_src_as_uint(load->src[1]), 0); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, DISABLED_shared_alias0) +{ + /* TODO: implement type-based alias analysis so that these loads can be + * combined. this is made a bit more difficult than simply using + * nir_compare_derefs() because the vectorizer creates loads/stores with + * casted derefs. The solution would probably be to keep multiple derefs for + * an entry (one for each load/store combined into it). */ + glsl_struct_field fields[2] = {glsl_struct_field(glsl_array_type(glsl_uint_type(), 4, 0), "field0"), + glsl_struct_field(glsl_array_type(glsl_uint_type(), 4, 0), "field1")}; + + nir_variable *var = nir_variable_create(b->shader, nir_var_mem_shared, glsl_struct_type(fields, 2, "Struct", false), "var"); + nir_deref_instr *deref = nir_build_deref_var(b, var); + + nir_ssa_def *index0 = nir_load_local_invocation_index(b); + nir_ssa_def *index1 = nir_load_global_invocation_index(b, 32); + nir_deref_instr *load_deref = nir_build_deref_array(b, nir_build_deref_struct(b, deref, 0), index0); + + create_shared_load(load_deref, 0x1); + create_shared_store(nir_build_deref_array(b, nir_build_deref_struct(b, deref, 1), index1), 0x2); + create_shared_load(load_deref, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(load->src[0].ssa, &load_deref->dest.ssa); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, shared_alias1) +{ + nir_variable *var0 = nir_variable_create(b->shader, nir_var_mem_shared, glsl_uint_type(), "var0"); + nir_variable *var1 = nir_variable_create(b->shader, nir_var_mem_shared, glsl_uint_type(), "var1"); + nir_deref_instr *load_deref = nir_build_deref_var(b, var0); + + create_shared_load(load_deref, 0x1); + create_shared_store(nir_build_deref_var(b, var1), 0x2); + create_shared_load(load_deref, 0x3); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_shared)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); + + nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(load->dest.ssa.bit_size, 32); + ASSERT_EQ(load->dest.ssa.num_components, 1); + ASSERT_EQ(load->src[0].ssa, &load_deref->dest.ssa); + ASSERT_EQ(loads[0x1]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x3]->src.ssa, &load->dest.ssa); + ASSERT_EQ(loads[0x1]->swizzle[0], 0); + ASSERT_EQ(loads[0x3]->swizzle[0], 0); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_distant_64bit) +{ + create_indirect_load(nir_var_mem_ssbo, 0, nir_imm_intN_t(b, 0x100000000, 64), 0x1); + create_indirect_load(nir_var_mem_ssbo, 0, nir_imm_intN_t(b, 0x200000004, 64), 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} + +TEST_F(nir_load_store_vectorize_test, ssbo_load_distant_indirect_64bit) +{ + nir_ssa_def *index_base = nir_u2u64(b, nir_load_local_invocation_index(b)); + nir_ssa_def *first = nir_imul_imm(b, index_base, 0x100000000); + nir_ssa_def *second = nir_imul_imm(b, index_base, 0x200000000); + create_indirect_load(nir_var_mem_ssbo, 0, first, 0x1); + create_indirect_load(nir_var_mem_ssbo, 0, second, 0x2); + + nir_validate_shader(b->shader, NULL); + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); + + EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo)); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2); +} diff -Nru mesa-19.2.8/src/compiler/nir/tests/negative_equal_tests.cpp mesa-20.0.8/src/compiler/nir/tests/negative_equal_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/negative_equal_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/negative_equal_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -35,13 +35,15 @@ protected: const_value_negative_equal_test() { + glsl_type_singleton_init_or_ref(); + memset(c1, 0, sizeof(c1)); memset(c2, 0, sizeof(c2)); } ~const_value_negative_equal_test() { - /* empty */ + glsl_type_singleton_decref(); } nir_const_value c1[NIR_MAX_VEC_COMPONENTS]; @@ -52,6 +54,8 @@ protected: alu_srcs_negative_equal_test() { + glsl_type_singleton_init_or_ref(); + static const nir_shader_compiler_options options = { }; nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options); memset(c1, 0, sizeof(c1)); @@ -61,6 +65,7 @@ ~alu_srcs_negative_equal_test() { ralloc_free(bld.shader); + glsl_type_singleton_decref(); } struct nir_builder bld; diff -Nru mesa-19.2.8/src/compiler/nir/tests/serialize_tests.cpp mesa-20.0.8/src/compiler/nir/tests/serialize_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/serialize_tests.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/serialize_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,284 @@ +/* + * Copyright © 2019 Red Hat, Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "nir.h" +#include "nir_builder.h" +#include "nir_serialize.h" + +namespace { + +class nir_serialize_test : public ::testing::TestWithParam { +protected: + nir_serialize_test(); + ~nir_serialize_test(); + + void serialize(); + nir_alu_instr *get_last_alu(nir_shader *); + void ASSERT_SWIZZLE_EQ(nir_alu_instr *, nir_alu_instr *, unsigned count, unsigned src); + + void *mem_ctx; + nir_builder *b; + nir_shader *dup; + const nir_shader_compiler_options options; +}; + +nir_serialize_test::nir_serialize_test() +: options() +{ + glsl_type_singleton_init_or_ref(); + + mem_ctx = ralloc_context(NULL); + + b = rzalloc(mem_ctx, nir_builder); + nir_builder_init_simple_shader(b, mem_ctx, MESA_SHADER_COMPUTE, &options); +} + +nir_serialize_test::~nir_serialize_test() +{ + if (HasFailure()) { + printf("\nShader from the failed test\n\n"); + printf("original Shader:\n"); + nir_print_shader(b->shader, stdout); + printf("serialized Shader:\n"); + nir_print_shader(dup, stdout); + } + + ralloc_free(mem_ctx); + + glsl_type_singleton_decref(); +} + +void +nir_serialize_test::serialize() { + struct blob blob; + struct blob_reader reader; + + blob_init(&blob); + + nir_serialize(&blob, b->shader, false); + blob_reader_init(&reader, blob.data, blob.size); + nir_shader *cloned = nir_deserialize(mem_ctx, &options, &reader); + blob_finish(&blob); + + dup = cloned; + + nir_validate_shader(b->shader, "original"); + nir_validate_shader(b->shader, "cloned"); +} + +nir_alu_instr * +nir_serialize_test::get_last_alu(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + return nir_instr_as_alu(nir_block_last_instr(nir_impl_last_block(impl))); +} + +void +nir_serialize_test::ASSERT_SWIZZLE_EQ(nir_alu_instr *a, nir_alu_instr *b, unsigned c, unsigned s) +{ + ASSERT_EQ(memcmp(a->src[s].swizzle, b->src[s].swizzle, c), 0); +} + +class nir_serialize_all_test : public nir_serialize_test {}; +class nir_serialize_all_but_one_test : public nir_serialize_test {}; + +} // namespace + +#if NIR_MAX_VEC_COMPONENTS == 16 +#define COMPONENTS 2, 3, 4, 8, 16 +#else +#define COMPONENTS 2, 3, 4 +#endif + + +INSTANTIATE_TEST_CASE_P( + nir_serialize_all_test, + nir_serialize_all_test, + ::testing::Values(1, COMPONENTS) +); + +INSTANTIATE_TEST_CASE_P( + nir_serialize_all_but_one_test, + nir_serialize_all_but_one_test, + ::testing::Values(COMPONENTS) +); + +TEST_P(nir_serialize_all_test, alu_single_value_src_swizzle) +{ + nir_ssa_def *zero = nir_imm_zero(b, GetParam(), 32); + nir_ssa_def *fmax = nir_fmax(b, zero, zero); + + nir_alu_instr *fmax_alu = nir_instr_as_alu(fmax->parent_instr); + + memset(fmax_alu->src[0].swizzle, GetParam() - 1, NIR_MAX_VEC_COMPONENTS); + memset(fmax_alu->src[1].swizzle, GetParam() - 1, NIR_MAX_VEC_COMPONENTS); + + serialize(); + + nir_alu_instr *fmax_alu_dup = get_last_alu(dup); + + ASSERT_SWIZZLE_EQ(fmax_alu, fmax_alu_dup, GetParam(), 0); + ASSERT_SWIZZLE_EQ(fmax_alu, fmax_alu_dup, GetParam(), 1); +} + +TEST_P(nir_serialize_all_test, alu_vec) +{ + nir_ssa_def *undef = nir_ssa_undef(b, GetParam(), 32); + nir_ssa_def *undefs[] = { + undef, undef, undef, undef, + undef, undef, undef, undef, + undef, undef, undef, undef, + undef, undef, undef, undef, + }; + + nir_ssa_def *vec = nir_vec(b, undefs, GetParam()); + nir_alu_instr *vec_alu = nir_instr_as_alu(vec->parent_instr); + for (int i = 0; i < GetParam(); i++) + vec_alu->src[i].swizzle[0] = (GetParam() - 1) - i; + + serialize(); + + nir_alu_instr *vec_alu_dup = get_last_alu(dup); + + ASSERT_SWIZZLE_EQ(vec_alu, vec_alu_dup, 1, 0); +} + +TEST_P(nir_serialize_all_test, alu_two_components_full_swizzle) +{ + nir_ssa_def *undef = nir_ssa_undef(b, 2, 32); + nir_ssa_def *fma = nir_ffma(b, undef, undef, undef); + nir_alu_instr *fma_alu = nir_instr_as_alu(fma->parent_instr); + + fma->num_components = GetParam(); + fma_alu->dest.write_mask = (1 << GetParam()) - 1; + + memset(fma_alu->src[0].swizzle, 1, GetParam()); + memset(fma_alu->src[1].swizzle, 1, GetParam()); + memset(fma_alu->src[2].swizzle, 1, GetParam()); + + serialize(); + + nir_alu_instr *fma_alu_dup = get_last_alu(dup); + + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, GetParam(), 0); + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, GetParam(), 1); + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, GetParam(), 2); +} + +TEST_P(nir_serialize_all_but_one_test, alu_two_components_reg_two_swizzle) +{ + nir_ssa_def *undef = nir_ssa_undef(b, 2, 32); + nir_ssa_def *fma = nir_ffma(b, undef, undef, undef); + nir_alu_instr *fma_alu = nir_instr_as_alu(fma->parent_instr); + + memset(fma_alu->src[0].swizzle, 1, GetParam()); + memset(fma_alu->src[1].swizzle, 1, GetParam()); + memset(fma_alu->src[2].swizzle, 1, GetParam()); + + ASSERT_TRUE(nir_convert_from_ssa(b->shader, false)); + + fma_alu = get_last_alu(b->shader); + ASSERT_FALSE(fma_alu->dest.dest.is_ssa); + fma_alu->dest.dest.reg.reg->num_components = GetParam(); + fma_alu->dest.write_mask = 1 | (1 << (GetParam() - 1)); + + serialize(); + + nir_alu_instr *fma_alu_dup = get_last_alu(dup); + + ASSERT_EQ(fma_alu->src[0].swizzle[0], fma_alu_dup->src[0].swizzle[0]); + ASSERT_EQ(fma_alu->src[0].swizzle[GetParam() - 1], fma_alu_dup->src[0].swizzle[GetParam() - 1]); + ASSERT_EQ(fma_alu->src[1].swizzle[0], fma_alu_dup->src[1].swizzle[0]); + ASSERT_EQ(fma_alu->src[1].swizzle[GetParam() - 1], fma_alu_dup->src[1].swizzle[GetParam() - 1]); + ASSERT_EQ(fma_alu->src[2].swizzle[0], fma_alu_dup->src[2].swizzle[0]); + ASSERT_EQ(fma_alu->src[2].swizzle[GetParam() - 1], fma_alu_dup->src[2].swizzle[GetParam() - 1]); +} + +TEST_P(nir_serialize_all_but_one_test, alu_full_width_reg_two_swizzle) +{ + nir_ssa_def *undef = nir_ssa_undef(b, GetParam(), 32); + nir_ssa_def *fma = nir_ffma(b, undef, undef, undef); + nir_alu_instr *fma_alu = nir_instr_as_alu(fma->parent_instr); + + memset(fma_alu->src[0].swizzle, GetParam() - 1, GetParam()); + memset(fma_alu->src[1].swizzle, GetParam() - 1, GetParam()); + memset(fma_alu->src[2].swizzle, GetParam() - 1, GetParam()); + + ASSERT_TRUE(nir_convert_from_ssa(b->shader, false)); + + fma_alu = get_last_alu(b->shader); + ASSERT_FALSE(fma_alu->dest.dest.is_ssa); + fma_alu->dest.write_mask = 1 | (1 << (GetParam() - 1)); + + serialize(); + + nir_alu_instr *fma_alu_dup = get_last_alu(dup); + + ASSERT_EQ(fma_alu->src[0].swizzle[0], fma_alu_dup->src[0].swizzle[0]); + ASSERT_EQ(fma_alu->src[0].swizzle[GetParam() - 1], fma_alu_dup->src[0].swizzle[GetParam() - 1]); + ASSERT_EQ(fma_alu->src[1].swizzle[0], fma_alu_dup->src[1].swizzle[0]); + ASSERT_EQ(fma_alu->src[1].swizzle[GetParam() - 1], fma_alu_dup->src[1].swizzle[GetParam() - 1]); + ASSERT_EQ(fma_alu->src[2].swizzle[0], fma_alu_dup->src[2].swizzle[0]); + ASSERT_EQ(fma_alu->src[2].swizzle[GetParam() - 1], fma_alu_dup->src[2].swizzle[GetParam() - 1]); +} + +TEST_P(nir_serialize_all_but_one_test, alu_two_component_reg_full_src) +{ + nir_ssa_def *undef = nir_ssa_undef(b, GetParam(), 32); + nir_ssa_def *fma = nir_ffma(b, undef, undef, undef); + nir_alu_instr *fma_alu = nir_instr_as_alu(fma->parent_instr); + + memset(fma_alu->src[0].swizzle, 1, GetParam()); + memset(fma_alu->src[1].swizzle, 1, GetParam()); + memset(fma_alu->src[2].swizzle, 1, GetParam()); + + ASSERT_TRUE(nir_convert_from_ssa(b->shader, false)); + + fma_alu = get_last_alu(b->shader); + ASSERT_FALSE(fma_alu->dest.dest.is_ssa); + fma_alu->dest.dest.reg.reg->num_components = 2; + fma_alu->dest.write_mask = 0x3; + + serialize(); + + nir_alu_instr *fma_alu_dup = get_last_alu(dup); + + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, 2, 0); + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, 2, 1); + ASSERT_SWIZZLE_EQ(fma_alu, fma_alu_dup, 2, 2); +} + +TEST_P(nir_serialize_all_but_one_test, single_channel) +{ + nir_ssa_def *zero = nir_ssa_undef(b, GetParam(), 32); + nir_ssa_def *vec = nir_channel(b, zero, GetParam() - 1); + nir_alu_instr *vec_alu = nir_instr_as_alu(vec->parent_instr); + + serialize(); + + nir_alu_instr *vec_alu_dup = get_last_alu(dup); + + ASSERT_SWIZZLE_EQ(vec_alu, vec_alu_dup, 1, 0); +} diff -Nru mesa-19.2.8/src/compiler/nir/tests/vars_tests.cpp mesa-20.0.8/src/compiler/nir/tests/vars_tests.cpp --- mesa-19.2.8/src/compiler/nir/tests/vars_tests.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir/tests/vars_tests.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -97,11 +97,13 @@ nir_vars_test::nir_vars_test() { + glsl_type_singleton_init_or_ref(); + mem_ctx = ralloc_context(NULL); lin_ctx = linear_alloc_parent(mem_ctx, 0); static const nir_shader_compiler_options options = { }; b = rzalloc(mem_ctx, nir_builder); - nir_builder_init_simple_shader(b, mem_ctx, MESA_SHADER_FRAGMENT, &options); + nir_builder_init_simple_shader(b, mem_ctx, MESA_SHADER_COMPUTE, &options); } nir_vars_test::~nir_vars_test() @@ -112,6 +114,8 @@ } ralloc_free(mem_ctx); + + glsl_type_singleton_decref(); } unsigned @@ -190,8 +194,38 @@ class nir_dead_write_vars_test : public nir_vars_test {}; class nir_combine_stores_test : public nir_vars_test {}; class nir_split_vars_test : public nir_vars_test {}; + +void +scoped_memory_barrier(nir_builder *b, + nir_memory_semantics semantics, + nir_variable_mode modes, + nir_scope scope = NIR_SCOPE_DEVICE) +{ + nir_intrinsic_instr *intrin = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_scoped_memory_barrier); + nir_intrinsic_set_memory_semantics(intrin, semantics); + nir_intrinsic_set_memory_modes(intrin, modes); + nir_intrinsic_set_memory_scope(intrin, scope); + nir_builder_instr_insert(b, &intrin->instr); +} + } // namespace +static nir_ssa_def * +nir_load_var_volatile(nir_builder *b, nir_variable *var) +{ + return nir_load_deref_with_access(b, nir_build_deref_var(b, var), + ACCESS_VOLATILE); +} + +static void +nir_store_var_volatile(nir_builder *b, nir_variable *var, + nir_ssa_def *value, nir_component_mask_t writemask) +{ + nir_store_deref_with_access(b, nir_build_deref_var(b, var), + value, writemask, ACCESS_VOLATILE); +} + TEST_F(nir_redundant_load_vars_test, duplicated_load) { /* Load a variable twice in the same block. One should be removed. */ @@ -214,6 +248,41 @@ ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 1); } +TEST_F(nir_redundant_load_vars_test, duplicated_load_volatile) +{ + /* Load a variable twice in the same block. One should be removed. */ + + nir_variable *in = create_int(nir_var_shader_in, "in"); + nir_variable **out = create_many_int(nir_var_shader_out, "out", 3); + + /* Volatile prevents us from eliminating a load by combining it with + * another. It shouldn't however, prevent us from combing other + * non-volatile loads. + */ + nir_store_var(b, out[0], nir_load_var(b, in), 1); + nir_store_var(b, out[1], nir_load_var_volatile(b, in), 1); + nir_store_var(b, out[2], nir_load_var(b, in), 1); + + nir_validate_shader(b->shader, NULL); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 3); + + bool progress = nir_opt_copy_prop_vars(b->shader); + EXPECT_TRUE(progress); + + nir_validate_shader(b->shader, NULL); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_load_deref), 2); + + nir_intrinsic_instr *first_store = get_intrinsic(nir_intrinsic_store_deref, 0); + ASSERT_TRUE(first_store->src[1].is_ssa); + + nir_intrinsic_instr *third_store = get_intrinsic(nir_intrinsic_store_deref, 2); + ASSERT_TRUE(third_store->src[1].is_ssa); + + EXPECT_EQ(first_store->src[1].ssa, third_store->src[1].ssa); +} + TEST_F(nir_redundant_load_vars_test, duplicated_load_in_two_blocks) { /* Load a variable twice in different blocks. One should be removed. */ @@ -337,6 +406,22 @@ EXPECT_EQ(first_copy->src[1].ssa, second_copy->src[1].ssa); } +TEST_F(nir_copy_prop_vars_test, self_copy) +{ + nir_variable *v = create_int(nir_var_mem_ssbo, "v"); + + nir_copy_var(b, v, v); + + nir_validate_shader(b->shader, NULL); + + bool progress = nir_opt_copy_prop_vars(b->shader); + EXPECT_TRUE(progress); + + nir_validate_shader(b->shader, NULL); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_copy_deref), 0); +} + TEST_F(nir_copy_prop_vars_test, simple_store_load) { nir_variable **v = create_many_ivec2(nir_var_function_temp, "v", 2); @@ -466,6 +551,77 @@ ASSERT_EQ(nir_src_comp_as_uint(store_to_v1->src[1], 1), 20); } +TEST_F(nir_copy_prop_vars_test, store_volatile) +{ + nir_variable **v = create_many_ivec2(nir_var_function_temp, "v", 2); + unsigned mask = 1 | 2; + + nir_ssa_def *first_value = nir_imm_ivec2(b, 10, 20); + nir_store_var(b, v[0], first_value, mask); + + nir_ssa_def *second_value = nir_imm_ivec2(b, 30, 40); + nir_store_var_volatile(b, v[0], second_value, mask); + + nir_ssa_def *third_value = nir_imm_ivec2(b, 50, 60); + nir_store_var(b, v[0], third_value, mask); + + nir_ssa_def *read_value = nir_load_var(b, v[0]); + nir_store_var(b, v[1], read_value, mask); + + nir_validate_shader(b->shader, NULL); + + bool progress = nir_opt_copy_prop_vars(b->shader); + EXPECT_TRUE(progress); + + nir_validate_shader(b->shader, NULL); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_store_deref), 4); + + /* Our approach here is a bit scorched-earth. We expect the volatile store + * in the middle to cause both that store and the one before it to be kept. + * Technically, volatile only prevents combining the volatile store with + * another store and one could argue that the store before the volatile and + * the one after it could be combined. However, it seems safer to just + * treat a volatile store like an atomic and prevent any combining across + * it. + */ + nir_intrinsic_instr *store_to_v1 = get_intrinsic(nir_intrinsic_store_deref, 3); + ASSERT_EQ(nir_intrinsic_get_var(store_to_v1, 0), v[1]); + ASSERT_TRUE(store_to_v1->src[1].is_ssa); + EXPECT_EQ(store_to_v1->src[1].ssa, third_value); +} + +TEST_F(nir_copy_prop_vars_test, self_copy_volatile) +{ + nir_variable *v = create_int(nir_var_mem_ssbo, "v"); + + nir_copy_var(b, v, v); + nir_copy_deref_with_access(b, nir_build_deref_var(b, v), + nir_build_deref_var(b, v), + (gl_access_qualifier)0, ACCESS_VOLATILE); + nir_copy_deref_with_access(b, nir_build_deref_var(b, v), + nir_build_deref_var(b, v), + ACCESS_VOLATILE, (gl_access_qualifier)0); + nir_copy_var(b, v, v); + + nir_validate_shader(b->shader, NULL); + + bool progress = nir_opt_copy_prop_vars(b->shader); + EXPECT_TRUE(progress); + + nir_validate_shader(b->shader, NULL); + + ASSERT_EQ(count_intrinsics(nir_intrinsic_copy_deref), 2); + + /* Store to v[1] should use second_value directly. */ + nir_intrinsic_instr *first = get_intrinsic(nir_intrinsic_copy_deref, 0); + nir_intrinsic_instr *second = get_intrinsic(nir_intrinsic_copy_deref, 1); + ASSERT_EQ(nir_intrinsic_src_access(first), ACCESS_VOLATILE); + ASSERT_EQ(nir_intrinsic_dst_access(first), (gl_access_qualifier)0); + ASSERT_EQ(nir_intrinsic_src_access(second), (gl_access_qualifier)0); + ASSERT_EQ(nir_intrinsic_dst_access(second), ACCESS_VOLATILE); +} + TEST_F(nir_copy_prop_vars_test, memory_barrier_in_two_blocks) { nir_variable **v = create_many_int(nir_var_mem_ssbo, "v", 4); @@ -491,6 +647,397 @@ ASSERT_EQ(nir_intrinsic_get_var(load, 0), v[1]); } +TEST_F(nir_redundant_load_vars_test, acquire_barrier_prevents_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 1); + + nir_load_var(b, x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_redundant_load_vars_test, acquire_barrier_prevents_same_mode_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(4, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_redundant_load_vars_test, acquire_barrier_allows_different_mode_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 2); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(6, count_intrinsics(nir_intrinsic_load_deref)); + + nir_intrinsic_instr *load; + + load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[1]); + + load = get_intrinsic(nir_intrinsic_load_deref, 2); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), y[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 3); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), y[1]); + + load = get_intrinsic(nir_intrinsic_load_deref, 4); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 5); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[1]); +} + +TEST_F(nir_redundant_load_vars_test, release_barrier_allows_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 1); + + nir_load_var(b, x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(1, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_redundant_load_vars_test, release_barrier_allows_same_mode_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_redundant_load_vars_test, release_barrier_allows_different_mode_load_removal) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 2); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(4, count_intrinsics(nir_intrinsic_load_deref)); + + nir_intrinsic_instr *load; + + load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[1]); + + load = get_intrinsic(nir_intrinsic_load_deref, 2); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), y[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 3); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), y[1]); +} + +TEST_F(nir_copy_prop_vars_test, acquire_barrier_prevents_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 1); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(1, count_intrinsics(nir_intrinsic_store_deref)); + ASSERT_EQ(1, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_copy_prop_vars_test, acquire_barrier_prevents_same_mode_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + nir_store_var(b, x[1], nir_imm_int(b, 20), 1); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_store_deref)); + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_copy_prop_vars_test, acquire_barrier_allows_different_mode_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 2); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + nir_store_var(b, x[1], nir_imm_int(b, 20), 1); + nir_store_var(b, y[0], nir_imm_int(b, 30), 1); + nir_store_var(b, y[1], nir_imm_int(b, 40), 1); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(4, count_intrinsics(nir_intrinsic_store_deref)); + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_load_deref)); + + nir_intrinsic_instr *store; + + store = get_intrinsic(nir_intrinsic_store_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), x[0]); + store = get_intrinsic(nir_intrinsic_store_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), x[1]); + + store = get_intrinsic(nir_intrinsic_store_deref, 2); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), y[0]); + store = get_intrinsic(nir_intrinsic_store_deref, 3); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), y[1]); + + nir_intrinsic_instr *load; + + load = get_intrinsic(nir_intrinsic_load_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[0]); + load = get_intrinsic(nir_intrinsic_load_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(load, 0), x[1]); +} + +TEST_F(nir_copy_prop_vars_test, release_barrier_allows_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 1); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(1, count_intrinsics(nir_intrinsic_store_deref)); +} + +TEST_F(nir_copy_prop_vars_test, release_barrier_allows_same_mode_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + nir_store_var(b, x[1], nir_imm_int(b, 20), 1); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_store_deref)); + ASSERT_EQ(0, count_intrinsics(nir_intrinsic_load_deref)); +} + +TEST_F(nir_copy_prop_vars_test, release_barrier_allows_different_mode_propagation) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 2); + + nir_store_var(b, x[0], nir_imm_int(b, 10), 1); + nir_store_var(b, x[1], nir_imm_int(b, 20), 1); + nir_store_var(b, y[0], nir_imm_int(b, 30), 1); + nir_store_var(b, y[1], nir_imm_int(b, 40), 1); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_load_var(b, x[0]); + nir_load_var(b, x[1]); + nir_load_var(b, y[0]); + nir_load_var(b, y[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(4, count_intrinsics(nir_intrinsic_store_deref)); + ASSERT_EQ(0, count_intrinsics(nir_intrinsic_load_deref)); + + nir_intrinsic_instr *store; + + store = get_intrinsic(nir_intrinsic_store_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), x[0]); + store = get_intrinsic(nir_intrinsic_store_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), x[1]); + + store = get_intrinsic(nir_intrinsic_store_deref, 2); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), y[0]); + store = get_intrinsic(nir_intrinsic_store_deref, 3); + ASSERT_EQ(nir_intrinsic_get_var(store, 0), y[1]); +} + +TEST_F(nir_copy_prop_vars_test, acquire_barrier_prevents_propagation_from_copy) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 3); + + nir_copy_var(b, x[1], x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_copy_var(b, x[2], x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_copy_deref)); + + nir_intrinsic_instr *copy; + + copy = get_intrinsic(nir_intrinsic_copy_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); + + copy = get_intrinsic(nir_intrinsic_copy_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[1]); +} + +TEST_F(nir_copy_prop_vars_test, acquire_barrier_prevents_propagation_from_copy_to_different_mode) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 1); + + nir_copy_var(b, y[0], x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo); + + nir_copy_var(b, x[1], y[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_FALSE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_copy_deref)); + + nir_intrinsic_instr *copy; + + copy = get_intrinsic(nir_intrinsic_copy_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); + + copy = get_intrinsic(nir_intrinsic_copy_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), y[0]); +} + +TEST_F(nir_copy_prop_vars_test, release_barrier_allows_propagation_from_copy) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 3); + + nir_copy_var(b, x[1], x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_copy_var(b, x[2], x[1]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_copy_deref)); + + nir_intrinsic_instr *copy; + + copy = get_intrinsic(nir_intrinsic_copy_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); + + copy = get_intrinsic(nir_intrinsic_copy_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); +} + +TEST_F(nir_copy_prop_vars_test, release_barrier_allows_propagation_from_copy_to_different_mode) +{ + nir_variable **x = create_many_int(nir_var_mem_ssbo, "x", 2); + nir_variable **y = create_many_int(nir_var_mem_shared, "y", 1); + + nir_copy_var(b, y[0], x[0]); + + scoped_memory_barrier(b, NIR_MEMORY_RELEASE, nir_var_mem_ssbo); + + nir_copy_var(b, x[1], y[0]); + + bool progress = nir_opt_copy_prop_vars(b->shader); + ASSERT_TRUE(progress); + + ASSERT_EQ(2, count_intrinsics(nir_intrinsic_copy_deref)); + + nir_intrinsic_instr *copy; + + copy = get_intrinsic(nir_intrinsic_copy_deref, 0); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); + + copy = get_intrinsic(nir_intrinsic_copy_deref, 1); + ASSERT_EQ(nir_intrinsic_get_var(copy, 1), x[0]); +} + TEST_F(nir_copy_prop_vars_test, simple_store_load_in_two_blocks) { nir_variable **v = create_many_ivec2(nir_var_function_temp, "v", 2); diff -Nru mesa-19.2.8/src/compiler/nir_types.cpp mesa-20.0.8/src/compiler/nir_types.cpp --- mesa-19.2.8/src/compiler/nir_types.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir_types.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -153,6 +153,19 @@ } unsigned +glsl_count_vec4_slots(const struct glsl_type *type, + bool is_gl_vertex_input, bool is_bindless) +{ + return type->count_vec4_slots(is_gl_vertex_input, is_bindless); +} + +unsigned +glsl_count_dword_slots(const struct glsl_type *type, bool is_bindless) +{ + return type->count_dword_slots(is_bindless); +} + +unsigned glsl_count_attribute_slots(const struct glsl_type *type, bool is_gl_vertex_input) { @@ -645,9 +658,14 @@ break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: + /* Bindless samplers and images. */ + *size = 8; + *align = 8; + break; + case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_SUBROUTINE: - case GLSL_TYPE_IMAGE: case GLSL_TYPE_VOID: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: @@ -674,6 +692,12 @@ return type->contains_atomic(); } +bool +glsl_contains_opaque(const struct glsl_type *type) +{ + return type->contains_opaque(); +} + int glsl_get_cl_size(const struct glsl_type *type) { diff -Nru mesa-19.2.8/src/compiler/nir_types.h mesa-20.0.8/src/compiler/nir_types.h --- mesa-19.2.8/src/compiler/nir_types.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/nir_types.h 2020-06-12 01:21:16.000000000 +0000 @@ -80,6 +80,9 @@ unsigned glsl_get_aoa_size(const struct glsl_type *type); +unsigned glsl_count_vec4_slots(const struct glsl_type *type, + bool is_gl_vertex_input, bool is_bindless); +unsigned glsl_count_dword_slots(const struct glsl_type *type, bool is_bindless); unsigned glsl_count_attribute_slots(const struct glsl_type *type, bool is_gl_vertex_input); unsigned glsl_get_component_slots(const struct glsl_type *type); @@ -137,6 +140,7 @@ bool glsl_sampler_type_is_shadow(const struct glsl_type *type); bool glsl_sampler_type_is_array(const struct glsl_type *type); bool glsl_contains_atomic(const struct glsl_type *type); +bool glsl_contains_opaque(const struct glsl_type *type); const struct glsl_type *glsl_void_type(void); const struct glsl_type *glsl_float_type(void); diff -Nru mesa-19.2.8/src/compiler/SConscript.glsl mesa-20.0.8/src/compiler/SConscript.glsl --- mesa-19.2.8/src/compiler/SConscript.glsl 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/SConscript.glsl 2020-06-12 01:21:16.000000000 +0000 @@ -63,6 +63,8 @@ for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'): glsl_sources += source_lists[l] +glsl_sources += env.StaticObject("glsl/glcpp/pp_standalone_scaffolding.c") + if env['msvc']: env.Prepend(CPPPATH = ['#/src/getopt']) env.PrependUnique(LIBS = [getopt]) diff -Nru mesa-19.2.8/src/compiler/shader_enums.c mesa-20.0.8/src/compiler/shader_enums.c --- mesa-19.2.8/src/compiler/shader_enums.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/shader_enums.c 2020-06-12 01:21:16.000000000 +0000 @@ -250,10 +250,16 @@ ENUM(SYSTEM_VALUE_DEVICE_INDEX), ENUM(SYSTEM_VALUE_VIEW_INDEX), ENUM(SYSTEM_VALUE_VERTEX_CNT), - ENUM(SYSTEM_VALUE_BARYCENTRIC_PIXEL), - ENUM(SYSTEM_VALUE_BARYCENTRIC_SAMPLE), - ENUM(SYSTEM_VALUE_BARYCENTRIC_CENTROID), - ENUM(SYSTEM_VALUE_BARYCENTRIC_SIZE), + ENUM(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL), + ENUM(SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE), + ENUM(SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID), + ENUM(SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE), + ENUM(SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL), + ENUM(SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID), + ENUM(SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE), + ENUM(SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL), + ENUM(SYSTEM_VALUE_GS_HEADER_IR3), + ENUM(SYSTEM_VALUE_TCS_HEADER_IR3), }; STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX); return NAME(sysval); @@ -267,6 +273,7 @@ ENUM(INTERP_MODE_SMOOTH), ENUM(INTERP_MODE_FLAT), ENUM(INTERP_MODE_NOPERSPECTIVE), + ENUM(INTERP_MODE_EXPLICIT), }; STATIC_ASSERT(ARRAY_SIZE(names) == INTERP_MODE_COUNT); return NAME(qual); diff -Nru mesa-19.2.8/src/compiler/shader_enums.h mesa-20.0.8/src/compiler/shader_enums.h --- mesa-19.2.8/src/compiler/shader_enums.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/shader_enums.h 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,9 @@ #include +/* Project-wide (GL and Vulkan) maximum. */ +#define MAX_DRAW_BUFFERS 8 + #ifdef __cplusplus extern "C" { #endif @@ -627,16 +630,28 @@ SYSTEM_VALUE_VERTEX_CNT, /** - * Driver internal varying-coords, used for varying-fetch instructions. - * Not externally visible. + * Required for AMD_shader_explicit_vertex_parameter and also used for + * varying-fetch instructions. * * The _SIZE value is "primitive size", used to scale i/j in primitive * space to pixel space. */ - SYSTEM_VALUE_BARYCENTRIC_PIXEL, - SYSTEM_VALUE_BARYCENTRIC_SAMPLE, - SYSTEM_VALUE_BARYCENTRIC_CENTROID, - SYSTEM_VALUE_BARYCENTRIC_SIZE, + SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, + SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE, + SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID, + SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, + SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL, + SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID, + SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE, + SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL, + + /** + * IR3 specific geometry shader and tesselation control shader system + * values that packs invocation id, thread id and vertex id. Having this + * as a nir level system value lets us do the unpacking in nir. + */ + SYSTEM_VALUE_GS_HEADER_IR3, + SYSTEM_VALUE_TCS_HEADER_IR3, SYSTEM_VALUE_MAX /**< Number of values */ } gl_system_value; @@ -656,6 +671,7 @@ INTERP_MODE_SMOOTH, INTERP_MODE_FLAT, INTERP_MODE_NOPERSPECTIVE, + INTERP_MODE_EXPLICIT, INTERP_MODE_COUNT /**< Number of interpolation qualifiers */ }; @@ -857,6 +873,26 @@ DERIVATIVE_GROUP_LINEAR, }; +enum float_controls +{ + FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE = 0x0000, + FLOAT_CONTROLS_DENORM_PRESERVE_FP16 = 0x0001, + FLOAT_CONTROLS_DENORM_PRESERVE_FP32 = 0x0002, + FLOAT_CONTROLS_DENORM_PRESERVE_FP64 = 0x0004, + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 = 0x0008, + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 = 0x0010, + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64 = 0x0020, + FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040, + FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080, + FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100, + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 = 0x0200, + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 = 0x0400, + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64 = 0x0800, + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 = 0x1000, + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 = 0x2000, + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 = 0x4000, +}; + #ifdef __cplusplus } /* extern "C" */ #endif diff -Nru mesa-19.2.8/src/compiler/shader_info.h mesa-20.0.8/src/compiler/shader_info.h --- mesa-19.2.8/src/compiler/shader_info.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/shader_info.h 2020-06-12 01:21:16.000000000 +0000 @@ -53,12 +53,15 @@ bool int16; bool int64; bool int64_atomics; + bool integer_functions2; bool kernel; bool min_lod; bool multiview; bool physical_storage_buffer_address; bool post_depth_coverage; bool runtime_descriptor_array; + bool float_controls; + bool shader_clock; bool shader_viewport_index_layer; bool stencil_export; bool storage_8bit; @@ -73,10 +76,15 @@ bool tessellation; bool transform_feedback; bool variable_pointers; + bool vk_memory_model; + bool vk_memory_model_device_scope; bool float16; + bool amd_fragment_mask; bool amd_gcn_shader; bool amd_shader_ballot; bool amd_trinary_minmax; + bool amd_image_read_write_lod; + bool amd_shader_explicit_vertex_parameter; }; typedef struct shader_info { @@ -86,23 +94,29 @@ const char *label; /** The shader stage, such as MESA_SHADER_VERTEX. */ - gl_shader_stage stage; + gl_shader_stage stage:8; /** The shader stage in a non SSO linked program that follows this stage, * such as MESA_SHADER_FRAGMENT. */ - gl_shader_stage next_stage; + gl_shader_stage next_stage:8; /* Number of textures used by this shader */ - unsigned num_textures; + uint8_t num_textures; /* Number of uniform buffers used by this shader */ - unsigned num_ubos; + uint8_t num_ubos; /* Number of atomic buffers used by this shader */ - unsigned num_abos; - /* Number of shader storage buffers used by this shader */ - unsigned num_ssbos; + uint8_t num_abos; + /* Number of shader storage buffers (max .driver_location + 1) used by this + * shader. In the case of nir_lower_atomics_to_ssbo being used, this will + * be the number of actual SSBOs in gl_program->info, and the lowered SSBOs + * and atomic counters in nir_shader->info. + */ + uint8_t num_ssbos; /* Number of images used by this shader */ - unsigned num_images; + uint8_t num_images; + /* Index of the last MSAA image. */ + int8_t last_msaa_image; /* Which inputs are actually read */ uint64_t inputs_read; @@ -120,38 +134,47 @@ /* Which patch outputs are read */ uint32_t patch_outputs_read; - /* Whether or not this shader ever uses textureGather() */ - bool uses_texture_gather; - /** Bitfield of which textures are used */ uint32_t textures_used; /** Bitfield of which textures are used by texelFetch() */ uint32_t textures_used_by_txf; + /* SPV_KHR_float_controls: execution mode for floating point ops */ + uint16_t float_controls_execution_mode; + + /* The size of the gl_ClipDistance[] array, if declared. */ + uint8_t clip_distance_array_size:4; + + /* The size of the gl_CullDistance[] array, if declared. */ + uint8_t cull_distance_array_size:4; + + /* Whether or not this shader ever uses textureGather() */ + bool uses_texture_gather:1; + /** * True if this shader uses the fddx/fddy opcodes. * * Note that this does not include the "fine" and "coarse" variants. */ - bool uses_fddx_fddy; + bool uses_fddx_fddy:1; /** * True if this shader uses 64-bit ALU operations */ - bool uses_64bit; + bool uses_64bit:1; - /* The size of the gl_ClipDistance[] array, if declared. */ - unsigned clip_distance_array_size; - - /* The size of the gl_CullDistance[] array, if declared. */ - unsigned cull_distance_array_size; + /* Whether the first UBO is the default uniform buffer, i.e. uniforms. */ + bool first_ubo_is_default_ubo:1; /* Whether or not separate shader objects were used */ - bool separate_shader; + bool separate_shader:1; /** Was this shader linked with any transform feedback varyings? */ - bool has_transform_feedback_varyings; + bool has_transform_feedback_varyings:1; + + /* Whether flrp has been lowered. */ + bool flrp_lowered:1; union { struct { @@ -163,37 +186,37 @@ * * Valid values: SI_VS_BLIT_SGPRS_POS_* */ - unsigned blit_sgprs_amd; + uint8_t blit_sgprs_amd:4; /* True if the shader writes position in window space coordinates pre-transform */ - bool window_space_position; + bool window_space_position:1; } vs; struct { - /** The number of vertices recieves per input primitive */ - unsigned vertices_in; - /** The output primitive type (GL enum value) */ - unsigned output_primitive; + uint16_t output_primitive; /** The input primitive type (GL enum value) */ - unsigned input_primitive; + uint16_t input_primitive; /** The maximum number of vertices the geometry shader might write. */ - unsigned vertices_out; + uint16_t vertices_out; /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */ - unsigned invocations; + uint8_t invocations; + + /** The number of vertices recieves per input primitive (max. 6) */ + uint8_t vertices_in:3; /** Whether or not this shader uses EndPrimitive */ - bool uses_end_primitive; + bool uses_end_primitive:1; /** Whether or not this shader uses non-zero streams */ - bool uses_streams; + bool uses_streams:1; } gs; struct { - bool uses_discard; + bool uses_discard:1; /** * True if this fragment shader requires helper invocations. This @@ -201,38 +224,38 @@ * instructions which do implicit derivatives, and the use of quad * subgroup operations. */ - bool needs_helper_invocations; + bool needs_helper_invocations:1; /** * Whether any inputs are declared with the "sample" qualifier. */ - bool uses_sample_qualifier; + bool uses_sample_qualifier:1; /** * Whether early fragment tests are enabled as defined by * ARB_shader_image_load_store. */ - bool early_fragment_tests; + bool early_fragment_tests:1; /** * Defined by INTEL_conservative_rasterization. */ - bool inner_coverage; + bool inner_coverage:1; - bool post_depth_coverage; + bool post_depth_coverage:1; /** * \name ARB_fragment_coord_conventions * @{ */ - bool pixel_center_integer; - bool origin_upper_left; + bool pixel_center_integer:1; + bool origin_upper_left:1; /*@}*/ - bool pixel_interlock_ordered; - bool pixel_interlock_unordered; - bool sample_interlock_ordered; - bool sample_interlock_unordered; + bool pixel_interlock_ordered:1; + bool pixel_interlock_unordered:1; + bool sample_interlock_ordered:1; + bool sample_interlock_unordered:1; /** * Flags whether NIR's base types on the FS color outputs should be @@ -251,24 +274,29 @@ * fixups are necessary to handle effectively untyped data being * output from the FS. */ - bool untyped_color_outputs; + bool untyped_color_outputs:1; /** gl_FragDepth layout for ARB_conservative_depth. */ - enum gl_frag_depth_layout depth_layout; + enum gl_frag_depth_layout depth_layout:3; } fs; struct { - unsigned local_size[3]; + uint16_t local_size[3]; - bool local_size_variable; - char user_data_components_amd; + bool local_size_variable:1; + uint8_t user_data_components_amd:3; + + /* + * Arrangement of invocations used to calculate derivatives in a compute + * shader. From NV_compute_shader_derivatives. + */ + enum gl_derivative_group derivative_group:2; /** * Size of shared variables accessed by the compute shader. */ unsigned shared_size; - /** * pointer size is: * AddressingModelLogical: 0 (default) @@ -276,24 +304,19 @@ * AddressingModelPhysical64: 64 */ unsigned ptr_size; - - /* - * Arrangement of invocations used to calculate derivatives in a compute - * shader. From NV_compute_shader_derivatives. - */ - enum gl_derivative_group derivative_group; } cs; /* Applies to both TCS and TES. */ struct { + uint16_t primitive_mode; /* GL_TRIANGLES, GL_QUADS or GL_ISOLINES */ + /** The number of vertices in the TCS output patch. */ - unsigned tcs_vertices_out; + uint8_t tcs_vertices_out; + enum gl_tess_spacing spacing:2; - uint32_t primitive_mode; /* GL_TRIANGLES, GL_QUADS or GL_ISOLINES */ - enum gl_tess_spacing spacing; /** Is the vertex order counterclockwise? */ - bool ccw; - bool point_mode; + bool ccw:1; + bool point_mode:1; } tess; }; } shader_info; diff -Nru mesa-19.2.8/src/compiler/spirv/nir_spirv.h mesa-20.0.8/src/compiler/spirv/nir_spirv.h --- mesa-19.2.8/src/compiler/spirv/nir_spirv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/nir_spirv.h 2020-06-12 01:21:16.000000000 +0000 @@ -67,6 +67,11 @@ */ bool frag_coord_is_sysval; + /* Whether to generate only scoped_memory_barrier intrinsics instead of the + * set of memory barrier intrinsics based on GLSL. + */ + bool use_scoped_memory_barrier; + struct spirv_supported_capabilities caps; /* Address format for various kinds of pointers. */ @@ -78,6 +83,12 @@ nir_address_format global_addr_format; nir_address_format temp_addr_format; + /* Whether UniformConstant memory should be treated as normal global memory. + * This is usefull for CL 2.0 implementations with fine grain system SVM + * support. + */ + bool constant_as_global; + struct { void (*func)(void *private_data, enum nir_spirv_debug_level level, diff -Nru mesa-19.2.8/src/compiler/spirv/spirv2nir.c mesa-20.0.8/src/compiler/spirv/spirv2nir.c --- mesa-19.2.8/src/compiler/spirv/spirv2nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv2nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -72,12 +72,16 @@ return 1; } - struct spirv_to_nir_options spirv_opts = {}; + glsl_type_singleton_init_or_ref(); + + struct spirv_to_nir_options spirv_opts = {0}; nir_shader *nir = spirv_to_nir(map, word_count, NULL, 0, MESA_SHADER_FRAGMENT, "main", &spirv_opts, NULL); nir_print_shader(nir, stderr); + glsl_type_singleton_decref(); + return 0; } diff -Nru mesa-19.2.8/src/compiler/spirv/spirv.core.grammar.json mesa-20.0.8/src/compiler/spirv/spirv.core.grammar.json --- mesa-19.2.8/src/compiler/spirv/spirv.core.grammar.json 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv.core.grammar.json 2020-06-12 01:21:16.000000000 +0000 @@ -26,15 +26,122 @@ ], "magic_number" : "0x07230203", "major_version" : 1, - "minor_version" : 4, + "minor_version" : 5, "revision" : 1, + "instruction_printing_class" : [ + { + "tag" : "@exclude" + }, + { + "tag" : "Miscellaneous", + "heading" : "Miscellaneous Instructions" + }, + { + "tag" : "Debug", + "heading" : "Debug Instructions" + }, + { + "tag" : "Annotation", + "heading" : "Annotation Instructions" + }, + { + "tag" : "Extension", + "heading" : "Extension Instructions" + }, + { + "tag" : "Mode-Setting", + "heading" : "Mode-Setting Instructions" + }, + { + "tag" : "Type-Declaration", + "heading" : "Type-Declaration Instructions" + }, + { + "tag" : "Constant-Creation", + "heading" : "Constant-Creation Instructions" + }, + { + "tag" : "Memory", + "heading" : "Memory Instructions" + }, + { + "tag" : "Function", + "heading" : "Function Instructions" + }, + { + "tag" : "Image", + "heading" : "Image Instructions" + }, + { + "tag" : "Conversion", + "heading" : "Conversion Instructions" + }, + { + "tag" : "Composite", + "heading" : "Composite Instructions" + }, + { + "tag" : "Arithmetic", + "heading" : "Arithmetic Instructions" + }, + { + "tag" : "Bit", + "heading" : "Bit Instructions" + }, + { + "tag" : "Relational_and_Logical", + "heading" : "Relational and Logical Instructions" + }, + { + "tag" : "Derivative", + "heading" : "Derivative Instructions" + }, + { + "tag" : "Control-Flow", + "heading" : "Control-Flow Instructions" + }, + { + "tag" : "Atomic", + "heading" : "Atomic Instructions" + }, + { + "tag" : "Primitive", + "heading" : "Primitive Instructions" + }, + { + "tag" : "Barrier", + "heading" : "Barrier Instructions" + }, + { + "tag" : "Group", + "heading" : "Group and Subgroup Instructions" + }, + { + "tag" : "Device-Side_Enqueue", + "heading" : "Device-Side Enqueue Instructions" + }, + { + "tag" : "Pipe", + "heading" : "Pipe Instructions" + }, + { + "tag" : "Non-Uniform", + "heading" : "Non-Uniform Instructions" + }, + { + "tag" : "Reserved", + "heading" : "Reserved Instructions" + } + ], "instructions" : [ { "opname" : "OpNop", + "class" : "Miscellaneous", "opcode" : 0 }, { "opname" : "OpUndef", + "class" : "Miscellaneous", "opcode" : 1, "operands" : [ { "kind" : "IdResultType" }, @@ -43,6 +150,7 @@ }, { "opname" : "OpSourceContinued", + "class" : "Debug", "opcode" : 2, "operands" : [ { "kind" : "LiteralString", "name" : "'Continued Source'" } @@ -50,6 +158,7 @@ }, { "opname" : "OpSource", + "class" : "Debug", "opcode" : 3, "operands" : [ { "kind" : "SourceLanguage" }, @@ -60,6 +169,7 @@ }, { "opname" : "OpSourceExtension", + "class" : "Debug", "opcode" : 4, "operands" : [ { "kind" : "LiteralString", "name" : "'Extension'" } @@ -67,6 +177,7 @@ }, { "opname" : "OpName", + "class" : "Debug", "opcode" : 5, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -75,6 +186,7 @@ }, { "opname" : "OpMemberName", + "class" : "Debug", "opcode" : 6, "operands" : [ { "kind" : "IdRef", "name" : "'Type'" }, @@ -84,6 +196,7 @@ }, { "opname" : "OpString", + "class" : "Debug", "opcode" : 7, "operands" : [ { "kind" : "IdResult" }, @@ -92,6 +205,7 @@ }, { "opname" : "OpLine", + "class" : "Debug", "opcode" : 8, "operands" : [ { "kind" : "IdRef", "name" : "'File'" }, @@ -101,6 +215,7 @@ }, { "opname" : "OpExtension", + "class" : "Extension", "opcode" : 10, "operands" : [ { "kind" : "LiteralString", "name" : "'Name'" } @@ -108,6 +223,7 @@ }, { "opname" : "OpExtInstImport", + "class" : "Extension", "opcode" : 11, "operands" : [ { "kind" : "IdResult" }, @@ -116,6 +232,7 @@ }, { "opname" : "OpExtInst", + "class" : "Extension", "opcode" : 12, "operands" : [ { "kind" : "IdResultType" }, @@ -127,6 +244,7 @@ }, { "opname" : "OpMemoryModel", + "class" : "Mode-Setting", "opcode" : 14, "operands" : [ { "kind" : "AddressingModel" }, @@ -135,6 +253,7 @@ }, { "opname" : "OpEntryPoint", + "class" : "Mode-Setting", "opcode" : 15, "operands" : [ { "kind" : "ExecutionModel" }, @@ -145,6 +264,7 @@ }, { "opname" : "OpExecutionMode", + "class" : "Mode-Setting", "opcode" : 16, "operands" : [ { "kind" : "IdRef", "name" : "'Entry Point'" }, @@ -153,6 +273,7 @@ }, { "opname" : "OpCapability", + "class" : "Mode-Setting", "opcode" : 17, "operands" : [ { "kind" : "Capability", "name" : "'Capability'" } @@ -160,6 +281,7 @@ }, { "opname" : "OpTypeVoid", + "class" : "Type-Declaration", "opcode" : 19, "operands" : [ { "kind" : "IdResult" } @@ -167,6 +289,7 @@ }, { "opname" : "OpTypeBool", + "class" : "Type-Declaration", "opcode" : 20, "operands" : [ { "kind" : "IdResult" } @@ -174,6 +297,7 @@ }, { "opname" : "OpTypeInt", + "class" : "Type-Declaration", "opcode" : 21, "operands" : [ { "kind" : "IdResult" }, @@ -183,6 +307,7 @@ }, { "opname" : "OpTypeFloat", + "class" : "Type-Declaration", "opcode" : 22, "operands" : [ { "kind" : "IdResult" }, @@ -191,6 +316,7 @@ }, { "opname" : "OpTypeVector", + "class" : "Type-Declaration", "opcode" : 23, "operands" : [ { "kind" : "IdResult" }, @@ -200,6 +326,7 @@ }, { "opname" : "OpTypeMatrix", + "class" : "Type-Declaration", "opcode" : 24, "operands" : [ { "kind" : "IdResult" }, @@ -210,6 +337,7 @@ }, { "opname" : "OpTypeImage", + "class" : "Type-Declaration", "opcode" : 25, "operands" : [ { "kind" : "IdResult" }, @@ -225,6 +353,7 @@ }, { "opname" : "OpTypeSampler", + "class" : "Type-Declaration", "opcode" : 26, "operands" : [ { "kind" : "IdResult" } @@ -232,6 +361,7 @@ }, { "opname" : "OpTypeSampledImage", + "class" : "Type-Declaration", "opcode" : 27, "operands" : [ { "kind" : "IdResult" }, @@ -240,6 +370,7 @@ }, { "opname" : "OpTypeArray", + "class" : "Type-Declaration", "opcode" : 28, "operands" : [ { "kind" : "IdResult" }, @@ -249,6 +380,7 @@ }, { "opname" : "OpTypeRuntimeArray", + "class" : "Type-Declaration", "opcode" : 29, "operands" : [ { "kind" : "IdResult" }, @@ -258,6 +390,7 @@ }, { "opname" : "OpTypeStruct", + "class" : "Type-Declaration", "opcode" : 30, "operands" : [ { "kind" : "IdResult" }, @@ -266,6 +399,7 @@ }, { "opname" : "OpTypeOpaque", + "class" : "Type-Declaration", "opcode" : 31, "operands" : [ { "kind" : "IdResult" }, @@ -275,6 +409,7 @@ }, { "opname" : "OpTypePointer", + "class" : "Type-Declaration", "opcode" : 32, "operands" : [ { "kind" : "IdResult" }, @@ -284,6 +419,7 @@ }, { "opname" : "OpTypeFunction", + "class" : "Type-Declaration", "opcode" : 33, "operands" : [ { "kind" : "IdResult" }, @@ -293,6 +429,7 @@ }, { "opname" : "OpTypeEvent", + "class" : "Type-Declaration", "opcode" : 34, "operands" : [ { "kind" : "IdResult" } @@ -301,6 +438,7 @@ }, { "opname" : "OpTypeDeviceEvent", + "class" : "Type-Declaration", "opcode" : 35, "operands" : [ { "kind" : "IdResult" } @@ -309,6 +447,7 @@ }, { "opname" : "OpTypeReserveId", + "class" : "Type-Declaration", "opcode" : 36, "operands" : [ { "kind" : "IdResult" } @@ -317,6 +456,7 @@ }, { "opname" : "OpTypeQueue", + "class" : "Type-Declaration", "opcode" : 37, "operands" : [ { "kind" : "IdResult" } @@ -325,6 +465,7 @@ }, { "opname" : "OpTypePipe", + "class" : "Type-Declaration", "opcode" : 38, "operands" : [ { "kind" : "IdResult" }, @@ -334,6 +475,7 @@ }, { "opname" : "OpTypeForwardPointer", + "class" : "Type-Declaration", "opcode" : 39, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer Type'" }, @@ -341,11 +483,12 @@ ], "capabilities" : [ "Addresses", - "PhysicalStorageBufferAddressesEXT" + "PhysicalStorageBufferAddresses" ] }, { "opname" : "OpConstantTrue", + "class" : "Constant-Creation", "opcode" : 41, "operands" : [ { "kind" : "IdResultType" }, @@ -354,6 +497,7 @@ }, { "opname" : "OpConstantFalse", + "class" : "Constant-Creation", "opcode" : 42, "operands" : [ { "kind" : "IdResultType" }, @@ -362,6 +506,7 @@ }, { "opname" : "OpConstant", + "class" : "Constant-Creation", "opcode" : 43, "operands" : [ { "kind" : "IdResultType" }, @@ -371,6 +516,7 @@ }, { "opname" : "OpConstantComposite", + "class" : "Constant-Creation", "opcode" : 44, "operands" : [ { "kind" : "IdResultType" }, @@ -380,6 +526,7 @@ }, { "opname" : "OpConstantSampler", + "class" : "Constant-Creation", "opcode" : 45, "operands" : [ { "kind" : "IdResultType" }, @@ -392,6 +539,7 @@ }, { "opname" : "OpConstantNull", + "class" : "Constant-Creation", "opcode" : 46, "operands" : [ { "kind" : "IdResultType" }, @@ -400,6 +548,7 @@ }, { "opname" : "OpSpecConstantTrue", + "class" : "Constant-Creation", "opcode" : 48, "operands" : [ { "kind" : "IdResultType" }, @@ -408,6 +557,7 @@ }, { "opname" : "OpSpecConstantFalse", + "class" : "Constant-Creation", "opcode" : 49, "operands" : [ { "kind" : "IdResultType" }, @@ -416,6 +566,7 @@ }, { "opname" : "OpSpecConstant", + "class" : "Constant-Creation", "opcode" : 50, "operands" : [ { "kind" : "IdResultType" }, @@ -425,6 +576,7 @@ }, { "opname" : "OpSpecConstantComposite", + "class" : "Constant-Creation", "opcode" : 51, "operands" : [ { "kind" : "IdResultType" }, @@ -434,6 +586,7 @@ }, { "opname" : "OpSpecConstantOp", + "class" : "Constant-Creation", "opcode" : 52, "operands" : [ { "kind" : "IdResultType" }, @@ -443,6 +596,7 @@ }, { "opname" : "OpFunction", + "class" : "Function", "opcode" : 54, "operands" : [ { "kind" : "IdResultType" }, @@ -453,6 +607,7 @@ }, { "opname" : "OpFunctionParameter", + "class" : "Function", "opcode" : 55, "operands" : [ { "kind" : "IdResultType" }, @@ -461,10 +616,12 @@ }, { "opname" : "OpFunctionEnd", + "class" : "Function", "opcode" : 56 }, { "opname" : "OpFunctionCall", + "class" : "Function", "opcode" : 57, "operands" : [ { "kind" : "IdResultType" }, @@ -475,6 +632,7 @@ }, { "opname" : "OpVariable", + "class" : "Memory", "opcode" : 59, "operands" : [ { "kind" : "IdResultType" }, @@ -485,6 +643,7 @@ }, { "opname" : "OpImageTexelPointer", + "class" : "Memory", "opcode" : 60, "operands" : [ { "kind" : "IdResultType" }, @@ -496,6 +655,7 @@ }, { "opname" : "OpLoad", + "class" : "Memory", "opcode" : 61, "operands" : [ { "kind" : "IdResultType" }, @@ -506,6 +666,7 @@ }, { "opname" : "OpStore", + "class" : "Memory", "opcode" : 62, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -515,6 +676,7 @@ }, { "opname" : "OpCopyMemory", + "class" : "Memory", "opcode" : 63, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -525,6 +687,7 @@ }, { "opname" : "OpCopyMemorySized", + "class" : "Memory", "opcode" : 64, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -537,6 +700,7 @@ }, { "opname" : "OpAccessChain", + "class" : "Memory", "opcode" : 65, "operands" : [ { "kind" : "IdResultType" }, @@ -547,6 +711,7 @@ }, { "opname" : "OpInBoundsAccessChain", + "class" : "Memory", "opcode" : 66, "operands" : [ { "kind" : "IdResultType" }, @@ -557,6 +722,7 @@ }, { "opname" : "OpPtrAccessChain", + "class" : "Memory", "opcode" : 67, "operands" : [ { "kind" : "IdResultType" }, @@ -569,11 +735,12 @@ "Addresses", "VariablePointers", "VariablePointersStorageBuffer", - "PhysicalStorageBufferAddressesEXT" + "PhysicalStorageBufferAddresses" ] }, { "opname" : "OpArrayLength", + "class" : "Memory", "opcode" : 68, "operands" : [ { "kind" : "IdResultType" }, @@ -585,6 +752,7 @@ }, { "opname" : "OpGenericPtrMemSemantics", + "class" : "Memory", "opcode" : 69, "operands" : [ { "kind" : "IdResultType" }, @@ -595,6 +763,7 @@ }, { "opname" : "OpInBoundsPtrAccessChain", + "class" : "Memory", "opcode" : 70, "operands" : [ { "kind" : "IdResultType" }, @@ -607,6 +776,7 @@ }, { "opname" : "OpDecorate", + "class" : "Annotation", "opcode" : 71, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -615,6 +785,7 @@ }, { "opname" : "OpMemberDecorate", + "class" : "Annotation", "opcode" : 72, "operands" : [ { "kind" : "IdRef", "name" : "'Structure Type'" }, @@ -624,6 +795,7 @@ }, { "opname" : "OpDecorationGroup", + "class" : "Annotation", "opcode" : 73, "operands" : [ { "kind" : "IdResult" } @@ -631,6 +803,7 @@ }, { "opname" : "OpGroupDecorate", + "class" : "Annotation", "opcode" : 74, "operands" : [ { "kind" : "IdRef", "name" : "'Decoration Group'" }, @@ -639,6 +812,7 @@ }, { "opname" : "OpGroupMemberDecorate", + "class" : "Annotation", "opcode" : 75, "operands" : [ { "kind" : "IdRef", "name" : "'Decoration Group'" }, @@ -647,6 +821,7 @@ }, { "opname" : "OpVectorExtractDynamic", + "class" : "Composite", "opcode" : 77, "operands" : [ { "kind" : "IdResultType" }, @@ -657,6 +832,7 @@ }, { "opname" : "OpVectorInsertDynamic", + "class" : "Composite", "opcode" : 78, "operands" : [ { "kind" : "IdResultType" }, @@ -668,6 +844,7 @@ }, { "opname" : "OpVectorShuffle", + "class" : "Composite", "opcode" : 79, "operands" : [ { "kind" : "IdResultType" }, @@ -679,6 +856,7 @@ }, { "opname" : "OpCompositeConstruct", + "class" : "Composite", "opcode" : 80, "operands" : [ { "kind" : "IdResultType" }, @@ -688,6 +866,7 @@ }, { "opname" : "OpCompositeExtract", + "class" : "Composite", "opcode" : 81, "operands" : [ { "kind" : "IdResultType" }, @@ -698,6 +877,7 @@ }, { "opname" : "OpCompositeInsert", + "class" : "Composite", "opcode" : 82, "operands" : [ { "kind" : "IdResultType" }, @@ -709,6 +889,7 @@ }, { "opname" : "OpCopyObject", + "class" : "Composite", "opcode" : 83, "operands" : [ { "kind" : "IdResultType" }, @@ -718,6 +899,7 @@ }, { "opname" : "OpTranspose", + "class" : "Composite", "opcode" : 84, "operands" : [ { "kind" : "IdResultType" }, @@ -728,6 +910,7 @@ }, { "opname" : "OpSampledImage", + "class" : "Image", "opcode" : 86, "operands" : [ { "kind" : "IdResultType" }, @@ -738,6 +921,7 @@ }, { "opname" : "OpImageSampleImplicitLod", + "class" : "Image", "opcode" : 87, "operands" : [ { "kind" : "IdResultType" }, @@ -750,6 +934,7 @@ }, { "opname" : "OpImageSampleExplicitLod", + "class" : "Image", "opcode" : 88, "operands" : [ { "kind" : "IdResultType" }, @@ -761,6 +946,7 @@ }, { "opname" : "OpImageSampleDrefImplicitLod", + "class" : "Image", "opcode" : 89, "operands" : [ { "kind" : "IdResultType" }, @@ -774,6 +960,7 @@ }, { "opname" : "OpImageSampleDrefExplicitLod", + "class" : "Image", "opcode" : 90, "operands" : [ { "kind" : "IdResultType" }, @@ -787,6 +974,7 @@ }, { "opname" : "OpImageSampleProjImplicitLod", + "class" : "Image", "opcode" : 91, "operands" : [ { "kind" : "IdResultType" }, @@ -799,6 +987,7 @@ }, { "opname" : "OpImageSampleProjExplicitLod", + "class" : "Image", "opcode" : 92, "operands" : [ { "kind" : "IdResultType" }, @@ -811,6 +1000,7 @@ }, { "opname" : "OpImageSampleProjDrefImplicitLod", + "class" : "Image", "opcode" : 93, "operands" : [ { "kind" : "IdResultType" }, @@ -824,6 +1014,7 @@ }, { "opname" : "OpImageSampleProjDrefExplicitLod", + "class" : "Image", "opcode" : 94, "operands" : [ { "kind" : "IdResultType" }, @@ -837,6 +1028,7 @@ }, { "opname" : "OpImageFetch", + "class" : "Image", "opcode" : 95, "operands" : [ { "kind" : "IdResultType" }, @@ -848,6 +1040,7 @@ }, { "opname" : "OpImageGather", + "class" : "Image", "opcode" : 96, "operands" : [ { "kind" : "IdResultType" }, @@ -861,6 +1054,7 @@ }, { "opname" : "OpImageDrefGather", + "class" : "Image", "opcode" : 97, "operands" : [ { "kind" : "IdResultType" }, @@ -874,6 +1068,7 @@ }, { "opname" : "OpImageRead", + "class" : "Image", "opcode" : 98, "operands" : [ { "kind" : "IdResultType" }, @@ -885,6 +1080,7 @@ }, { "opname" : "OpImageWrite", + "class" : "Image", "opcode" : 99, "operands" : [ { "kind" : "IdRef", "name" : "'Image'" }, @@ -895,6 +1091,7 @@ }, { "opname" : "OpImage", + "class" : "Image", "opcode" : 100, "operands" : [ { "kind" : "IdResultType" }, @@ -904,6 +1101,7 @@ }, { "opname" : "OpImageQueryFormat", + "class" : "Image", "opcode" : 101, "operands" : [ { "kind" : "IdResultType" }, @@ -914,6 +1112,7 @@ }, { "opname" : "OpImageQueryOrder", + "class" : "Image", "opcode" : 102, "operands" : [ { "kind" : "IdResultType" }, @@ -924,6 +1123,7 @@ }, { "opname" : "OpImageQuerySizeLod", + "class" : "Image", "opcode" : 103, "operands" : [ { "kind" : "IdResultType" }, @@ -935,6 +1135,7 @@ }, { "opname" : "OpImageQuerySize", + "class" : "Image", "opcode" : 104, "operands" : [ { "kind" : "IdResultType" }, @@ -945,6 +1146,7 @@ }, { "opname" : "OpImageQueryLod", + "class" : "Image", "opcode" : 105, "operands" : [ { "kind" : "IdResultType" }, @@ -956,6 +1158,7 @@ }, { "opname" : "OpImageQueryLevels", + "class" : "Image", "opcode" : 106, "operands" : [ { "kind" : "IdResultType" }, @@ -966,6 +1169,7 @@ }, { "opname" : "OpImageQuerySamples", + "class" : "Image", "opcode" : 107, "operands" : [ { "kind" : "IdResultType" }, @@ -976,6 +1180,7 @@ }, { "opname" : "OpConvertFToU", + "class" : "Conversion", "opcode" : 109, "operands" : [ { "kind" : "IdResultType" }, @@ -985,6 +1190,7 @@ }, { "opname" : "OpConvertFToS", + "class" : "Conversion", "opcode" : 110, "operands" : [ { "kind" : "IdResultType" }, @@ -994,6 +1200,7 @@ }, { "opname" : "OpConvertSToF", + "class" : "Conversion", "opcode" : 111, "operands" : [ { "kind" : "IdResultType" }, @@ -1003,6 +1210,7 @@ }, { "opname" : "OpConvertUToF", + "class" : "Conversion", "opcode" : 112, "operands" : [ { "kind" : "IdResultType" }, @@ -1012,6 +1220,7 @@ }, { "opname" : "OpUConvert", + "class" : "Conversion", "opcode" : 113, "operands" : [ { "kind" : "IdResultType" }, @@ -1021,6 +1230,7 @@ }, { "opname" : "OpSConvert", + "class" : "Conversion", "opcode" : 114, "operands" : [ { "kind" : "IdResultType" }, @@ -1030,6 +1240,7 @@ }, { "opname" : "OpFConvert", + "class" : "Conversion", "opcode" : 115, "operands" : [ { "kind" : "IdResultType" }, @@ -1039,6 +1250,7 @@ }, { "opname" : "OpQuantizeToF16", + "class" : "Conversion", "opcode" : 116, "operands" : [ { "kind" : "IdResultType" }, @@ -1048,6 +1260,7 @@ }, { "opname" : "OpConvertPtrToU", + "class" : "Conversion", "opcode" : 117, "operands" : [ { "kind" : "IdResultType" }, @@ -1056,11 +1269,12 @@ ], "capabilities" : [ "Addresses", - "PhysicalStorageBufferAddressesEXT" + "PhysicalStorageBufferAddresses" ] }, { "opname" : "OpSatConvertSToU", + "class" : "Conversion", "opcode" : 118, "operands" : [ { "kind" : "IdResultType" }, @@ -1071,6 +1285,7 @@ }, { "opname" : "OpSatConvertUToS", + "class" : "Conversion", "opcode" : 119, "operands" : [ { "kind" : "IdResultType" }, @@ -1081,6 +1296,7 @@ }, { "opname" : "OpConvertUToPtr", + "class" : "Conversion", "opcode" : 120, "operands" : [ { "kind" : "IdResultType" }, @@ -1089,11 +1305,12 @@ ], "capabilities" : [ "Addresses", - "PhysicalStorageBufferAddressesEXT" + "PhysicalStorageBufferAddresses" ] }, { "opname" : "OpPtrCastToGeneric", + "class" : "Conversion", "opcode" : 121, "operands" : [ { "kind" : "IdResultType" }, @@ -1104,6 +1321,7 @@ }, { "opname" : "OpGenericCastToPtr", + "class" : "Conversion", "opcode" : 122, "operands" : [ { "kind" : "IdResultType" }, @@ -1114,6 +1332,7 @@ }, { "opname" : "OpGenericCastToPtrExplicit", + "class" : "Conversion", "opcode" : 123, "operands" : [ { "kind" : "IdResultType" }, @@ -1125,6 +1344,7 @@ }, { "opname" : "OpBitcast", + "class" : "Conversion", "opcode" : 124, "operands" : [ { "kind" : "IdResultType" }, @@ -1134,6 +1354,7 @@ }, { "opname" : "OpSNegate", + "class" : "Arithmetic", "opcode" : 126, "operands" : [ { "kind" : "IdResultType" }, @@ -1143,6 +1364,7 @@ }, { "opname" : "OpFNegate", + "class" : "Arithmetic", "opcode" : 127, "operands" : [ { "kind" : "IdResultType" }, @@ -1152,6 +1374,7 @@ }, { "opname" : "OpIAdd", + "class" : "Arithmetic", "opcode" : 128, "operands" : [ { "kind" : "IdResultType" }, @@ -1162,6 +1385,7 @@ }, { "opname" : "OpFAdd", + "class" : "Arithmetic", "opcode" : 129, "operands" : [ { "kind" : "IdResultType" }, @@ -1172,6 +1396,7 @@ }, { "opname" : "OpISub", + "class" : "Arithmetic", "opcode" : 130, "operands" : [ { "kind" : "IdResultType" }, @@ -1182,6 +1407,7 @@ }, { "opname" : "OpFSub", + "class" : "Arithmetic", "opcode" : 131, "operands" : [ { "kind" : "IdResultType" }, @@ -1192,6 +1418,7 @@ }, { "opname" : "OpIMul", + "class" : "Arithmetic", "opcode" : 132, "operands" : [ { "kind" : "IdResultType" }, @@ -1202,6 +1429,7 @@ }, { "opname" : "OpFMul", + "class" : "Arithmetic", "opcode" : 133, "operands" : [ { "kind" : "IdResultType" }, @@ -1212,6 +1440,7 @@ }, { "opname" : "OpUDiv", + "class" : "Arithmetic", "opcode" : 134, "operands" : [ { "kind" : "IdResultType" }, @@ -1222,6 +1451,7 @@ }, { "opname" : "OpSDiv", + "class" : "Arithmetic", "opcode" : 135, "operands" : [ { "kind" : "IdResultType" }, @@ -1232,6 +1462,7 @@ }, { "opname" : "OpFDiv", + "class" : "Arithmetic", "opcode" : 136, "operands" : [ { "kind" : "IdResultType" }, @@ -1242,6 +1473,7 @@ }, { "opname" : "OpUMod", + "class" : "Arithmetic", "opcode" : 137, "operands" : [ { "kind" : "IdResultType" }, @@ -1252,6 +1484,7 @@ }, { "opname" : "OpSRem", + "class" : "Arithmetic", "opcode" : 138, "operands" : [ { "kind" : "IdResultType" }, @@ -1262,6 +1495,7 @@ }, { "opname" : "OpSMod", + "class" : "Arithmetic", "opcode" : 139, "operands" : [ { "kind" : "IdResultType" }, @@ -1272,6 +1506,7 @@ }, { "opname" : "OpFRem", + "class" : "Arithmetic", "opcode" : 140, "operands" : [ { "kind" : "IdResultType" }, @@ -1282,6 +1517,7 @@ }, { "opname" : "OpFMod", + "class" : "Arithmetic", "opcode" : 141, "operands" : [ { "kind" : "IdResultType" }, @@ -1292,6 +1528,7 @@ }, { "opname" : "OpVectorTimesScalar", + "class" : "Arithmetic", "opcode" : 142, "operands" : [ { "kind" : "IdResultType" }, @@ -1302,6 +1539,7 @@ }, { "opname" : "OpMatrixTimesScalar", + "class" : "Arithmetic", "opcode" : 143, "operands" : [ { "kind" : "IdResultType" }, @@ -1313,6 +1551,7 @@ }, { "opname" : "OpVectorTimesMatrix", + "class" : "Arithmetic", "opcode" : 144, "operands" : [ { "kind" : "IdResultType" }, @@ -1324,6 +1563,7 @@ }, { "opname" : "OpMatrixTimesVector", + "class" : "Arithmetic", "opcode" : 145, "operands" : [ { "kind" : "IdResultType" }, @@ -1335,6 +1575,7 @@ }, { "opname" : "OpMatrixTimesMatrix", + "class" : "Arithmetic", "opcode" : 146, "operands" : [ { "kind" : "IdResultType" }, @@ -1346,6 +1587,7 @@ }, { "opname" : "OpOuterProduct", + "class" : "Arithmetic", "opcode" : 147, "operands" : [ { "kind" : "IdResultType" }, @@ -1357,6 +1599,7 @@ }, { "opname" : "OpDot", + "class" : "Arithmetic", "opcode" : 148, "operands" : [ { "kind" : "IdResultType" }, @@ -1367,6 +1610,7 @@ }, { "opname" : "OpIAddCarry", + "class" : "Arithmetic", "opcode" : 149, "operands" : [ { "kind" : "IdResultType" }, @@ -1377,6 +1621,7 @@ }, { "opname" : "OpISubBorrow", + "class" : "Arithmetic", "opcode" : 150, "operands" : [ { "kind" : "IdResultType" }, @@ -1387,6 +1632,7 @@ }, { "opname" : "OpUMulExtended", + "class" : "Arithmetic", "opcode" : 151, "operands" : [ { "kind" : "IdResultType" }, @@ -1397,6 +1643,7 @@ }, { "opname" : "OpSMulExtended", + "class" : "Arithmetic", "opcode" : 152, "operands" : [ { "kind" : "IdResultType" }, @@ -1407,6 +1654,7 @@ }, { "opname" : "OpAny", + "class" : "Relational_and_Logical", "opcode" : 154, "operands" : [ { "kind" : "IdResultType" }, @@ -1416,6 +1664,7 @@ }, { "opname" : "OpAll", + "class" : "Relational_and_Logical", "opcode" : 155, "operands" : [ { "kind" : "IdResultType" }, @@ -1425,6 +1674,7 @@ }, { "opname" : "OpIsNan", + "class" : "Relational_and_Logical", "opcode" : 156, "operands" : [ { "kind" : "IdResultType" }, @@ -1434,6 +1684,7 @@ }, { "opname" : "OpIsInf", + "class" : "Relational_and_Logical", "opcode" : 157, "operands" : [ { "kind" : "IdResultType" }, @@ -1443,6 +1694,7 @@ }, { "opname" : "OpIsFinite", + "class" : "Relational_and_Logical", "opcode" : 158, "operands" : [ { "kind" : "IdResultType" }, @@ -1453,6 +1705,7 @@ }, { "opname" : "OpIsNormal", + "class" : "Relational_and_Logical", "opcode" : 159, "operands" : [ { "kind" : "IdResultType" }, @@ -1463,6 +1716,7 @@ }, { "opname" : "OpSignBitSet", + "class" : "Relational_and_Logical", "opcode" : 160, "operands" : [ { "kind" : "IdResultType" }, @@ -1473,6 +1727,7 @@ }, { "opname" : "OpLessOrGreater", + "class" : "Relational_and_Logical", "opcode" : 161, "operands" : [ { "kind" : "IdResultType" }, @@ -1484,6 +1739,7 @@ }, { "opname" : "OpOrdered", + "class" : "Relational_and_Logical", "opcode" : 162, "operands" : [ { "kind" : "IdResultType" }, @@ -1495,6 +1751,7 @@ }, { "opname" : "OpUnordered", + "class" : "Relational_and_Logical", "opcode" : 163, "operands" : [ { "kind" : "IdResultType" }, @@ -1506,6 +1763,7 @@ }, { "opname" : "OpLogicalEqual", + "class" : "Relational_and_Logical", "opcode" : 164, "operands" : [ { "kind" : "IdResultType" }, @@ -1516,6 +1774,7 @@ }, { "opname" : "OpLogicalNotEqual", + "class" : "Relational_and_Logical", "opcode" : 165, "operands" : [ { "kind" : "IdResultType" }, @@ -1526,6 +1785,7 @@ }, { "opname" : "OpLogicalOr", + "class" : "Relational_and_Logical", "opcode" : 166, "operands" : [ { "kind" : "IdResultType" }, @@ -1536,6 +1796,7 @@ }, { "opname" : "OpLogicalAnd", + "class" : "Relational_and_Logical", "opcode" : 167, "operands" : [ { "kind" : "IdResultType" }, @@ -1546,6 +1807,7 @@ }, { "opname" : "OpLogicalNot", + "class" : "Relational_and_Logical", "opcode" : 168, "operands" : [ { "kind" : "IdResultType" }, @@ -1555,6 +1817,7 @@ }, { "opname" : "OpSelect", + "class" : "Relational_and_Logical", "opcode" : 169, "operands" : [ { "kind" : "IdResultType" }, @@ -1566,6 +1829,7 @@ }, { "opname" : "OpIEqual", + "class" : "Relational_and_Logical", "opcode" : 170, "operands" : [ { "kind" : "IdResultType" }, @@ -1576,6 +1840,7 @@ }, { "opname" : "OpINotEqual", + "class" : "Relational_and_Logical", "opcode" : 171, "operands" : [ { "kind" : "IdResultType" }, @@ -1586,6 +1851,7 @@ }, { "opname" : "OpUGreaterThan", + "class" : "Relational_and_Logical", "opcode" : 172, "operands" : [ { "kind" : "IdResultType" }, @@ -1596,6 +1862,7 @@ }, { "opname" : "OpSGreaterThan", + "class" : "Relational_and_Logical", "opcode" : 173, "operands" : [ { "kind" : "IdResultType" }, @@ -1606,6 +1873,7 @@ }, { "opname" : "OpUGreaterThanEqual", + "class" : "Relational_and_Logical", "opcode" : 174, "operands" : [ { "kind" : "IdResultType" }, @@ -1616,6 +1884,7 @@ }, { "opname" : "OpSGreaterThanEqual", + "class" : "Relational_and_Logical", "opcode" : 175, "operands" : [ { "kind" : "IdResultType" }, @@ -1626,6 +1895,7 @@ }, { "opname" : "OpULessThan", + "class" : "Relational_and_Logical", "opcode" : 176, "operands" : [ { "kind" : "IdResultType" }, @@ -1636,6 +1906,7 @@ }, { "opname" : "OpSLessThan", + "class" : "Relational_and_Logical", "opcode" : 177, "operands" : [ { "kind" : "IdResultType" }, @@ -1646,6 +1917,7 @@ }, { "opname" : "OpULessThanEqual", + "class" : "Relational_and_Logical", "opcode" : 178, "operands" : [ { "kind" : "IdResultType" }, @@ -1656,6 +1928,7 @@ }, { "opname" : "OpSLessThanEqual", + "class" : "Relational_and_Logical", "opcode" : 179, "operands" : [ { "kind" : "IdResultType" }, @@ -1666,6 +1939,7 @@ }, { "opname" : "OpFOrdEqual", + "class" : "Relational_and_Logical", "opcode" : 180, "operands" : [ { "kind" : "IdResultType" }, @@ -1676,6 +1950,7 @@ }, { "opname" : "OpFUnordEqual", + "class" : "Relational_and_Logical", "opcode" : 181, "operands" : [ { "kind" : "IdResultType" }, @@ -1686,6 +1961,7 @@ }, { "opname" : "OpFOrdNotEqual", + "class" : "Relational_and_Logical", "opcode" : 182, "operands" : [ { "kind" : "IdResultType" }, @@ -1696,6 +1972,7 @@ }, { "opname" : "OpFUnordNotEqual", + "class" : "Relational_and_Logical", "opcode" : 183, "operands" : [ { "kind" : "IdResultType" }, @@ -1706,6 +1983,7 @@ }, { "opname" : "OpFOrdLessThan", + "class" : "Relational_and_Logical", "opcode" : 184, "operands" : [ { "kind" : "IdResultType" }, @@ -1716,6 +1994,7 @@ }, { "opname" : "OpFUnordLessThan", + "class" : "Relational_and_Logical", "opcode" : 185, "operands" : [ { "kind" : "IdResultType" }, @@ -1726,6 +2005,7 @@ }, { "opname" : "OpFOrdGreaterThan", + "class" : "Relational_and_Logical", "opcode" : 186, "operands" : [ { "kind" : "IdResultType" }, @@ -1736,6 +2016,7 @@ }, { "opname" : "OpFUnordGreaterThan", + "class" : "Relational_and_Logical", "opcode" : 187, "operands" : [ { "kind" : "IdResultType" }, @@ -1746,6 +2027,7 @@ }, { "opname" : "OpFOrdLessThanEqual", + "class" : "Relational_and_Logical", "opcode" : 188, "operands" : [ { "kind" : "IdResultType" }, @@ -1756,6 +2038,7 @@ }, { "opname" : "OpFUnordLessThanEqual", + "class" : "Relational_and_Logical", "opcode" : 189, "operands" : [ { "kind" : "IdResultType" }, @@ -1766,6 +2049,7 @@ }, { "opname" : "OpFOrdGreaterThanEqual", + "class" : "Relational_and_Logical", "opcode" : 190, "operands" : [ { "kind" : "IdResultType" }, @@ -1776,6 +2060,7 @@ }, { "opname" : "OpFUnordGreaterThanEqual", + "class" : "Relational_and_Logical", "opcode" : 191, "operands" : [ { "kind" : "IdResultType" }, @@ -1786,6 +2071,7 @@ }, { "opname" : "OpShiftRightLogical", + "class" : "Bit", "opcode" : 194, "operands" : [ { "kind" : "IdResultType" }, @@ -1796,6 +2082,7 @@ }, { "opname" : "OpShiftRightArithmetic", + "class" : "Bit", "opcode" : 195, "operands" : [ { "kind" : "IdResultType" }, @@ -1806,6 +2093,7 @@ }, { "opname" : "OpShiftLeftLogical", + "class" : "Bit", "opcode" : 196, "operands" : [ { "kind" : "IdResultType" }, @@ -1816,6 +2104,7 @@ }, { "opname" : "OpBitwiseOr", + "class" : "Bit", "opcode" : 197, "operands" : [ { "kind" : "IdResultType" }, @@ -1826,6 +2115,7 @@ }, { "opname" : "OpBitwiseXor", + "class" : "Bit", "opcode" : 198, "operands" : [ { "kind" : "IdResultType" }, @@ -1836,6 +2126,7 @@ }, { "opname" : "OpBitwiseAnd", + "class" : "Bit", "opcode" : 199, "operands" : [ { "kind" : "IdResultType" }, @@ -1846,6 +2137,7 @@ }, { "opname" : "OpNot", + "class" : "Bit", "opcode" : 200, "operands" : [ { "kind" : "IdResultType" }, @@ -1855,6 +2147,7 @@ }, { "opname" : "OpBitFieldInsert", + "class" : "Bit", "opcode" : 201, "operands" : [ { "kind" : "IdResultType" }, @@ -1868,6 +2161,7 @@ }, { "opname" : "OpBitFieldSExtract", + "class" : "Bit", "opcode" : 202, "operands" : [ { "kind" : "IdResultType" }, @@ -1880,6 +2174,7 @@ }, { "opname" : "OpBitFieldUExtract", + "class" : "Bit", "opcode" : 203, "operands" : [ { "kind" : "IdResultType" }, @@ -1892,6 +2187,7 @@ }, { "opname" : "OpBitReverse", + "class" : "Bit", "opcode" : 204, "operands" : [ { "kind" : "IdResultType" }, @@ -1902,6 +2198,7 @@ }, { "opname" : "OpBitCount", + "class" : "Bit", "opcode" : 205, "operands" : [ { "kind" : "IdResultType" }, @@ -1911,6 +2208,7 @@ }, { "opname" : "OpDPdx", + "class" : "Derivative", "opcode" : 207, "operands" : [ { "kind" : "IdResultType" }, @@ -1921,6 +2219,7 @@ }, { "opname" : "OpDPdy", + "class" : "Derivative", "opcode" : 208, "operands" : [ { "kind" : "IdResultType" }, @@ -1931,6 +2230,7 @@ }, { "opname" : "OpFwidth", + "class" : "Derivative", "opcode" : 209, "operands" : [ { "kind" : "IdResultType" }, @@ -1941,6 +2241,7 @@ }, { "opname" : "OpDPdxFine", + "class" : "Derivative", "opcode" : 210, "operands" : [ { "kind" : "IdResultType" }, @@ -1951,6 +2252,7 @@ }, { "opname" : "OpDPdyFine", + "class" : "Derivative", "opcode" : 211, "operands" : [ { "kind" : "IdResultType" }, @@ -1961,6 +2263,7 @@ }, { "opname" : "OpFwidthFine", + "class" : "Derivative", "opcode" : 212, "operands" : [ { "kind" : "IdResultType" }, @@ -1971,6 +2274,7 @@ }, { "opname" : "OpDPdxCoarse", + "class" : "Derivative", "opcode" : 213, "operands" : [ { "kind" : "IdResultType" }, @@ -1981,6 +2285,7 @@ }, { "opname" : "OpDPdyCoarse", + "class" : "Derivative", "opcode" : 214, "operands" : [ { "kind" : "IdResultType" }, @@ -1991,6 +2296,7 @@ }, { "opname" : "OpFwidthCoarse", + "class" : "Derivative", "opcode" : 215, "operands" : [ { "kind" : "IdResultType" }, @@ -2001,16 +2307,19 @@ }, { "opname" : "OpEmitVertex", + "class" : "Primitive", "opcode" : 218, "capabilities" : [ "Geometry" ] }, { "opname" : "OpEndPrimitive", + "class" : "Primitive", "opcode" : 219, "capabilities" : [ "Geometry" ] }, { "opname" : "OpEmitStreamVertex", + "class" : "Primitive", "opcode" : 220, "operands" : [ { "kind" : "IdRef", "name" : "'Stream'" } @@ -2019,6 +2328,7 @@ }, { "opname" : "OpEndStreamPrimitive", + "class" : "Primitive", "opcode" : 221, "operands" : [ { "kind" : "IdRef", "name" : "'Stream'" } @@ -2027,6 +2337,7 @@ }, { "opname" : "OpControlBarrier", + "class" : "Barrier", "opcode" : 224, "operands" : [ { "kind" : "IdScope", "name" : "'Execution'" }, @@ -2036,6 +2347,7 @@ }, { "opname" : "OpMemoryBarrier", + "class" : "Barrier", "opcode" : 225, "operands" : [ { "kind" : "IdScope", "name" : "'Memory'" }, @@ -2044,6 +2356,7 @@ }, { "opname" : "OpAtomicLoad", + "class" : "Atomic", "opcode" : 227, "operands" : [ { "kind" : "IdResultType" }, @@ -2055,6 +2368,7 @@ }, { "opname" : "OpAtomicStore", + "class" : "Atomic", "opcode" : 228, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -2065,6 +2379,7 @@ }, { "opname" : "OpAtomicExchange", + "class" : "Atomic", "opcode" : 229, "operands" : [ { "kind" : "IdResultType" }, @@ -2077,6 +2392,7 @@ }, { "opname" : "OpAtomicCompareExchange", + "class" : "Atomic", "opcode" : 230, "operands" : [ { "kind" : "IdResultType" }, @@ -2091,6 +2407,7 @@ }, { "opname" : "OpAtomicCompareExchangeWeak", + "class" : "Atomic", "opcode" : 231, "operands" : [ { "kind" : "IdResultType" }, @@ -2107,6 +2424,7 @@ }, { "opname" : "OpAtomicIIncrement", + "class" : "Atomic", "opcode" : 232, "operands" : [ { "kind" : "IdResultType" }, @@ -2118,6 +2436,7 @@ }, { "opname" : "OpAtomicIDecrement", + "class" : "Atomic", "opcode" : 233, "operands" : [ { "kind" : "IdResultType" }, @@ -2129,6 +2448,7 @@ }, { "opname" : "OpAtomicIAdd", + "class" : "Atomic", "opcode" : 234, "operands" : [ { "kind" : "IdResultType" }, @@ -2141,6 +2461,7 @@ }, { "opname" : "OpAtomicISub", + "class" : "Atomic", "opcode" : 235, "operands" : [ { "kind" : "IdResultType" }, @@ -2153,6 +2474,7 @@ }, { "opname" : "OpAtomicSMin", + "class" : "Atomic", "opcode" : 236, "operands" : [ { "kind" : "IdResultType" }, @@ -2165,6 +2487,7 @@ }, { "opname" : "OpAtomicUMin", + "class" : "Atomic", "opcode" : 237, "operands" : [ { "kind" : "IdResultType" }, @@ -2177,6 +2500,7 @@ }, { "opname" : "OpAtomicSMax", + "class" : "Atomic", "opcode" : 238, "operands" : [ { "kind" : "IdResultType" }, @@ -2189,6 +2513,7 @@ }, { "opname" : "OpAtomicUMax", + "class" : "Atomic", "opcode" : 239, "operands" : [ { "kind" : "IdResultType" }, @@ -2201,6 +2526,7 @@ }, { "opname" : "OpAtomicAnd", + "class" : "Atomic", "opcode" : 240, "operands" : [ { "kind" : "IdResultType" }, @@ -2213,6 +2539,7 @@ }, { "opname" : "OpAtomicOr", + "class" : "Atomic", "opcode" : 241, "operands" : [ { "kind" : "IdResultType" }, @@ -2225,6 +2552,7 @@ }, { "opname" : "OpAtomicXor", + "class" : "Atomic", "opcode" : 242, "operands" : [ { "kind" : "IdResultType" }, @@ -2237,6 +2565,7 @@ }, { "opname" : "OpPhi", + "class" : "Control-Flow", "opcode" : 245, "operands" : [ { "kind" : "IdResultType" }, @@ -2246,6 +2575,7 @@ }, { "opname" : "OpLoopMerge", + "class" : "Control-Flow", "opcode" : 246, "operands" : [ { "kind" : "IdRef", "name" : "'Merge Block'" }, @@ -2255,6 +2585,7 @@ }, { "opname" : "OpSelectionMerge", + "class" : "Control-Flow", "opcode" : 247, "operands" : [ { "kind" : "IdRef", "name" : "'Merge Block'" }, @@ -2263,6 +2594,7 @@ }, { "opname" : "OpLabel", + "class" : "Control-Flow", "opcode" : 248, "operands" : [ { "kind" : "IdResult" } @@ -2270,6 +2602,7 @@ }, { "opname" : "OpBranch", + "class" : "Control-Flow", "opcode" : 249, "operands" : [ { "kind" : "IdRef", "name" : "'Target Label'" } @@ -2277,6 +2610,7 @@ }, { "opname" : "OpBranchConditional", + "class" : "Control-Flow", "opcode" : 250, "operands" : [ { "kind" : "IdRef", "name" : "'Condition'" }, @@ -2287,6 +2621,7 @@ }, { "opname" : "OpSwitch", + "class" : "Control-Flow", "opcode" : 251, "operands" : [ { "kind" : "IdRef", "name" : "'Selector'" }, @@ -2296,15 +2631,18 @@ }, { "opname" : "OpKill", + "class" : "Control-Flow", "opcode" : 252, "capabilities" : [ "Shader" ] }, { "opname" : "OpReturn", + "class" : "Control-Flow", "opcode" : 253 }, { "opname" : "OpReturnValue", + "class" : "Control-Flow", "opcode" : 254, "operands" : [ { "kind" : "IdRef", "name" : "'Value'" } @@ -2312,10 +2650,12 @@ }, { "opname" : "OpUnreachable", + "class" : "Control-Flow", "opcode" : 255 }, { "opname" : "OpLifetimeStart", + "class" : "Control-Flow", "opcode" : 256, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -2325,6 +2665,7 @@ }, { "opname" : "OpLifetimeStop", + "class" : "Control-Flow", "opcode" : 257, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -2334,6 +2675,7 @@ }, { "opname" : "OpGroupAsyncCopy", + "class" : "Group", "opcode" : 259, "operands" : [ { "kind" : "IdResultType" }, @@ -2349,6 +2691,7 @@ }, { "opname" : "OpGroupWaitEvents", + "class" : "Group", "opcode" : 260, "operands" : [ { "kind" : "IdScope", "name" : "'Execution'" }, @@ -2359,6 +2702,7 @@ }, { "opname" : "OpGroupAll", + "class" : "Group", "opcode" : 261, "operands" : [ { "kind" : "IdResultType" }, @@ -2370,6 +2714,7 @@ }, { "opname" : "OpGroupAny", + "class" : "Group", "opcode" : 262, "operands" : [ { "kind" : "IdResultType" }, @@ -2381,6 +2726,7 @@ }, { "opname" : "OpGroupBroadcast", + "class" : "Group", "opcode" : 263, "operands" : [ { "kind" : "IdResultType" }, @@ -2393,6 +2739,7 @@ }, { "opname" : "OpGroupIAdd", + "class" : "Group", "opcode" : 264, "operands" : [ { "kind" : "IdResultType" }, @@ -2405,6 +2752,7 @@ }, { "opname" : "OpGroupFAdd", + "class" : "Group", "opcode" : 265, "operands" : [ { "kind" : "IdResultType" }, @@ -2417,6 +2765,7 @@ }, { "opname" : "OpGroupFMin", + "class" : "Group", "opcode" : 266, "operands" : [ { "kind" : "IdResultType" }, @@ -2429,6 +2778,7 @@ }, { "opname" : "OpGroupUMin", + "class" : "Group", "opcode" : 267, "operands" : [ { "kind" : "IdResultType" }, @@ -2441,6 +2791,7 @@ }, { "opname" : "OpGroupSMin", + "class" : "Group", "opcode" : 268, "operands" : [ { "kind" : "IdResultType" }, @@ -2453,6 +2804,7 @@ }, { "opname" : "OpGroupFMax", + "class" : "Group", "opcode" : 269, "operands" : [ { "kind" : "IdResultType" }, @@ -2465,6 +2817,7 @@ }, { "opname" : "OpGroupUMax", + "class" : "Group", "opcode" : 270, "operands" : [ { "kind" : "IdResultType" }, @@ -2477,6 +2830,7 @@ }, { "opname" : "OpGroupSMax", + "class" : "Group", "opcode" : 271, "operands" : [ { "kind" : "IdResultType" }, @@ -2489,6 +2843,7 @@ }, { "opname" : "OpReadPipe", + "class" : "Pipe", "opcode" : 274, "operands" : [ { "kind" : "IdResultType" }, @@ -2502,6 +2857,7 @@ }, { "opname" : "OpWritePipe", + "class" : "Pipe", "opcode" : 275, "operands" : [ { "kind" : "IdResultType" }, @@ -2515,6 +2871,7 @@ }, { "opname" : "OpReservedReadPipe", + "class" : "Pipe", "opcode" : 276, "operands" : [ { "kind" : "IdResultType" }, @@ -2530,6 +2887,7 @@ }, { "opname" : "OpReservedWritePipe", + "class" : "Pipe", "opcode" : 277, "operands" : [ { "kind" : "IdResultType" }, @@ -2545,6 +2903,7 @@ }, { "opname" : "OpReserveReadPipePackets", + "class" : "Pipe", "opcode" : 278, "operands" : [ { "kind" : "IdResultType" }, @@ -2558,6 +2917,7 @@ }, { "opname" : "OpReserveWritePipePackets", + "class" : "Pipe", "opcode" : 279, "operands" : [ { "kind" : "IdResultType" }, @@ -2571,6 +2931,7 @@ }, { "opname" : "OpCommitReadPipe", + "class" : "Pipe", "opcode" : 280, "operands" : [ { "kind" : "IdRef", "name" : "'Pipe'" }, @@ -2582,6 +2943,7 @@ }, { "opname" : "OpCommitWritePipe", + "class" : "Pipe", "opcode" : 281, "operands" : [ { "kind" : "IdRef", "name" : "'Pipe'" }, @@ -2593,6 +2955,7 @@ }, { "opname" : "OpIsValidReserveId", + "class" : "Pipe", "opcode" : 282, "operands" : [ { "kind" : "IdResultType" }, @@ -2603,6 +2966,7 @@ }, { "opname" : "OpGetNumPipePackets", + "class" : "Pipe", "opcode" : 283, "operands" : [ { "kind" : "IdResultType" }, @@ -2615,6 +2979,7 @@ }, { "opname" : "OpGetMaxPipePackets", + "class" : "Pipe", "opcode" : 284, "operands" : [ { "kind" : "IdResultType" }, @@ -2627,6 +2992,7 @@ }, { "opname" : "OpGroupReserveReadPipePackets", + "class" : "Pipe", "opcode" : 285, "operands" : [ { "kind" : "IdResultType" }, @@ -2641,6 +3007,7 @@ }, { "opname" : "OpGroupReserveWritePipePackets", + "class" : "Pipe", "opcode" : 286, "operands" : [ { "kind" : "IdResultType" }, @@ -2655,6 +3022,7 @@ }, { "opname" : "OpGroupCommitReadPipe", + "class" : "Pipe", "opcode" : 287, "operands" : [ { "kind" : "IdScope", "name" : "'Execution'" }, @@ -2667,6 +3035,7 @@ }, { "opname" : "OpGroupCommitWritePipe", + "class" : "Pipe", "opcode" : 288, "operands" : [ { "kind" : "IdScope", "name" : "'Execution'" }, @@ -2679,6 +3048,7 @@ }, { "opname" : "OpEnqueueMarker", + "class" : "Device-Side_Enqueue", "opcode" : 291, "operands" : [ { "kind" : "IdResultType" }, @@ -2692,6 +3062,7 @@ }, { "opname" : "OpEnqueueKernel", + "class" : "Device-Side_Enqueue", "opcode" : 292, "operands" : [ { "kind" : "IdResultType" }, @@ -2712,6 +3083,7 @@ }, { "opname" : "OpGetKernelNDrangeSubGroupCount", + "class" : "Device-Side_Enqueue", "opcode" : 293, "operands" : [ { "kind" : "IdResultType" }, @@ -2726,6 +3098,7 @@ }, { "opname" : "OpGetKernelNDrangeMaxSubGroupSize", + "class" : "Device-Side_Enqueue", "opcode" : 294, "operands" : [ { "kind" : "IdResultType" }, @@ -2740,6 +3113,7 @@ }, { "opname" : "OpGetKernelWorkGroupSize", + "class" : "Device-Side_Enqueue", "opcode" : 295, "operands" : [ { "kind" : "IdResultType" }, @@ -2753,6 +3127,7 @@ }, { "opname" : "OpGetKernelPreferredWorkGroupSizeMultiple", + "class" : "Device-Side_Enqueue", "opcode" : 296, "operands" : [ { "kind" : "IdResultType" }, @@ -2766,6 +3141,7 @@ }, { "opname" : "OpRetainEvent", + "class" : "Device-Side_Enqueue", "opcode" : 297, "operands" : [ { "kind" : "IdRef", "name" : "'Event'" } @@ -2774,6 +3150,7 @@ }, { "opname" : "OpReleaseEvent", + "class" : "Device-Side_Enqueue", "opcode" : 298, "operands" : [ { "kind" : "IdRef", "name" : "'Event'" } @@ -2782,6 +3159,7 @@ }, { "opname" : "OpCreateUserEvent", + "class" : "Device-Side_Enqueue", "opcode" : 299, "operands" : [ { "kind" : "IdResultType" }, @@ -2791,6 +3169,7 @@ }, { "opname" : "OpIsValidEvent", + "class" : "Device-Side_Enqueue", "opcode" : 300, "operands" : [ { "kind" : "IdResultType" }, @@ -2801,6 +3180,7 @@ }, { "opname" : "OpSetUserEventStatus", + "class" : "Device-Side_Enqueue", "opcode" : 301, "operands" : [ { "kind" : "IdRef", "name" : "'Event'" }, @@ -2810,6 +3190,7 @@ }, { "opname" : "OpCaptureEventProfilingInfo", + "class" : "Device-Side_Enqueue", "opcode" : 302, "operands" : [ { "kind" : "IdRef", "name" : "'Event'" }, @@ -2820,6 +3201,7 @@ }, { "opname" : "OpGetDefaultQueue", + "class" : "Device-Side_Enqueue", "opcode" : 303, "operands" : [ { "kind" : "IdResultType" }, @@ -2829,6 +3211,7 @@ }, { "opname" : "OpBuildNDRange", + "class" : "Device-Side_Enqueue", "opcode" : 304, "operands" : [ { "kind" : "IdResultType" }, @@ -2841,6 +3224,7 @@ }, { "opname" : "OpImageSparseSampleImplicitLod", + "class" : "Image", "opcode" : 305, "operands" : [ { "kind" : "IdResultType" }, @@ -2853,6 +3237,7 @@ }, { "opname" : "OpImageSparseSampleExplicitLod", + "class" : "Image", "opcode" : 306, "operands" : [ { "kind" : "IdResultType" }, @@ -2865,6 +3250,7 @@ }, { "opname" : "OpImageSparseSampleDrefImplicitLod", + "class" : "Image", "opcode" : 307, "operands" : [ { "kind" : "IdResultType" }, @@ -2878,6 +3264,7 @@ }, { "opname" : "OpImageSparseSampleDrefExplicitLod", + "class" : "Image", "opcode" : 308, "operands" : [ { "kind" : "IdResultType" }, @@ -2891,6 +3278,7 @@ }, { "opname" : "OpImageSparseSampleProjImplicitLod", + "class" : "Image", "opcode" : 309, "operands" : [ { "kind" : "IdResultType" }, @@ -2904,6 +3292,7 @@ }, { "opname" : "OpImageSparseSampleProjExplicitLod", + "class" : "Image", "opcode" : 310, "operands" : [ { "kind" : "IdResultType" }, @@ -2917,6 +3306,7 @@ }, { "opname" : "OpImageSparseSampleProjDrefImplicitLod", + "class" : "Image", "opcode" : 311, "operands" : [ { "kind" : "IdResultType" }, @@ -2931,6 +3321,7 @@ }, { "opname" : "OpImageSparseSampleProjDrefExplicitLod", + "class" : "Image", "opcode" : 312, "operands" : [ { "kind" : "IdResultType" }, @@ -2945,6 +3336,7 @@ }, { "opname" : "OpImageSparseFetch", + "class" : "Image", "opcode" : 313, "operands" : [ { "kind" : "IdResultType" }, @@ -2957,6 +3349,7 @@ }, { "opname" : "OpImageSparseGather", + "class" : "Image", "opcode" : 314, "operands" : [ { "kind" : "IdResultType" }, @@ -2970,6 +3363,7 @@ }, { "opname" : "OpImageSparseDrefGather", + "class" : "Image", "opcode" : 315, "operands" : [ { "kind" : "IdResultType" }, @@ -2983,6 +3377,7 @@ }, { "opname" : "OpImageSparseTexelsResident", + "class" : "Image", "opcode" : 316, "operands" : [ { "kind" : "IdResultType" }, @@ -2993,10 +3388,12 @@ }, { "opname" : "OpNoLine", + "class" : "Debug", "opcode" : 317 }, { "opname" : "OpAtomicFlagTestAndSet", + "class" : "Atomic", "opcode" : 318, "operands" : [ { "kind" : "IdResultType" }, @@ -3009,6 +3406,7 @@ }, { "opname" : "OpAtomicFlagClear", + "class" : "Atomic", "opcode" : 319, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -3019,6 +3417,7 @@ }, { "opname" : "OpImageSparseRead", + "class" : "Image", "opcode" : 320, "operands" : [ { "kind" : "IdResultType" }, @@ -3031,6 +3430,7 @@ }, { "opname" : "OpSizeOf", + "class" : "Miscellaneous", "opcode" : 321, "operands" : [ { "kind" : "IdResultType" }, @@ -3042,6 +3442,7 @@ }, { "opname" : "OpTypePipeStorage", + "class" : "Type-Declaration", "opcode" : 322, "operands" : [ { "kind" : "IdResult" } @@ -3051,6 +3452,7 @@ }, { "opname" : "OpConstantPipeStorage", + "class" : "Pipe", "opcode" : 323, "operands" : [ { "kind" : "IdResultType" }, @@ -3064,6 +3466,7 @@ }, { "opname" : "OpCreatePipeFromPipeStorage", + "class" : "Pipe", "opcode" : 324, "operands" : [ { "kind" : "IdResultType" }, @@ -3075,6 +3478,7 @@ }, { "opname" : "OpGetKernelLocalSizeForSubgroupCount", + "class" : "Device-Side_Enqueue", "opcode" : 325, "operands" : [ { "kind" : "IdResultType" }, @@ -3090,6 +3494,7 @@ }, { "opname" : "OpGetKernelMaxNumSubgroups", + "class" : "Device-Side_Enqueue", "opcode" : 326, "operands" : [ { "kind" : "IdResultType" }, @@ -3104,6 +3509,7 @@ }, { "opname" : "OpTypeNamedBarrier", + "class" : "Type-Declaration", "opcode" : 327, "operands" : [ { "kind" : "IdResult" } @@ -3113,6 +3519,7 @@ }, { "opname" : "OpNamedBarrierInitialize", + "class" : "Barrier", "opcode" : 328, "operands" : [ { "kind" : "IdResultType" }, @@ -3124,6 +3531,7 @@ }, { "opname" : "OpMemoryNamedBarrier", + "class" : "Barrier", "opcode" : 329, "operands" : [ { "kind" : "IdRef", "name" : "'Named Barrier'" }, @@ -3135,6 +3543,7 @@ }, { "opname" : "OpModuleProcessed", + "class" : "Debug", "opcode" : 330, "operands" : [ { "kind" : "LiteralString", "name" : "'Process'" } @@ -3143,6 +3552,7 @@ }, { "opname" : "OpExecutionModeId", + "class" : "Mode-Setting", "opcode" : 331, "operands" : [ { "kind" : "IdRef", "name" : "'Entry Point'" }, @@ -3152,6 +3562,7 @@ }, { "opname" : "OpDecorateId", + "class" : "Annotation", "opcode" : 332, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -3162,6 +3573,7 @@ }, { "opname" : "OpGroupNonUniformElect", + "class" : "Non-Uniform", "opcode" : 333, "operands" : [ { "kind" : "IdResultType" }, @@ -3173,6 +3585,7 @@ }, { "opname" : "OpGroupNonUniformAll", + "class" : "Non-Uniform", "opcode" : 334, "operands" : [ { "kind" : "IdResultType" }, @@ -3185,6 +3598,7 @@ }, { "opname" : "OpGroupNonUniformAny", + "class" : "Non-Uniform", "opcode" : 335, "operands" : [ { "kind" : "IdResultType" }, @@ -3197,6 +3611,7 @@ }, { "opname" : "OpGroupNonUniformAllEqual", + "class" : "Non-Uniform", "opcode" : 336, "operands" : [ { "kind" : "IdResultType" }, @@ -3209,6 +3624,7 @@ }, { "opname" : "OpGroupNonUniformBroadcast", + "class" : "Non-Uniform", "opcode" : 337, "operands" : [ { "kind" : "IdResultType" }, @@ -3222,6 +3638,7 @@ }, { "opname" : "OpGroupNonUniformBroadcastFirst", + "class" : "Non-Uniform", "opcode" : 338, "operands" : [ { "kind" : "IdResultType" }, @@ -3234,6 +3651,7 @@ }, { "opname" : "OpGroupNonUniformBallot", + "class" : "Non-Uniform", "opcode" : 339, "operands" : [ { "kind" : "IdResultType" }, @@ -3246,6 +3664,7 @@ }, { "opname" : "OpGroupNonUniformInverseBallot", + "class" : "Non-Uniform", "opcode" : 340, "operands" : [ { "kind" : "IdResultType" }, @@ -3258,6 +3677,7 @@ }, { "opname" : "OpGroupNonUniformBallotBitExtract", + "class" : "Non-Uniform", "opcode" : 341, "operands" : [ { "kind" : "IdResultType" }, @@ -3271,6 +3691,7 @@ }, { "opname" : "OpGroupNonUniformBallotBitCount", + "class" : "Non-Uniform", "opcode" : 342, "operands" : [ { "kind" : "IdResultType" }, @@ -3284,6 +3705,7 @@ }, { "opname" : "OpGroupNonUniformBallotFindLSB", + "class" : "Non-Uniform", "opcode" : 343, "operands" : [ { "kind" : "IdResultType" }, @@ -3296,6 +3718,7 @@ }, { "opname" : "OpGroupNonUniformBallotFindMSB", + "class" : "Non-Uniform", "opcode" : 344, "operands" : [ { "kind" : "IdResultType" }, @@ -3308,6 +3731,7 @@ }, { "opname" : "OpGroupNonUniformShuffle", + "class" : "Non-Uniform", "opcode" : 345, "operands" : [ { "kind" : "IdResultType" }, @@ -3321,6 +3745,7 @@ }, { "opname" : "OpGroupNonUniformShuffleXor", + "class" : "Non-Uniform", "opcode" : 346, "operands" : [ { "kind" : "IdResultType" }, @@ -3334,6 +3759,7 @@ }, { "opname" : "OpGroupNonUniformShuffleUp", + "class" : "Non-Uniform", "opcode" : 347, "operands" : [ { "kind" : "IdResultType" }, @@ -3347,6 +3773,7 @@ }, { "opname" : "OpGroupNonUniformShuffleDown", + "class" : "Non-Uniform", "opcode" : 348, "operands" : [ { "kind" : "IdResultType" }, @@ -3360,6 +3787,7 @@ }, { "opname" : "OpGroupNonUniformIAdd", + "class" : "Non-Uniform", "opcode" : 349, "operands" : [ { "kind" : "IdResultType" }, @@ -3374,6 +3802,7 @@ }, { "opname" : "OpGroupNonUniformFAdd", + "class" : "Non-Uniform", "opcode" : 350, "operands" : [ { "kind" : "IdResultType" }, @@ -3388,6 +3817,7 @@ }, { "opname" : "OpGroupNonUniformIMul", + "class" : "Non-Uniform", "opcode" : 351, "operands" : [ { "kind" : "IdResultType" }, @@ -3402,6 +3832,7 @@ }, { "opname" : "OpGroupNonUniformFMul", + "class" : "Non-Uniform", "opcode" : 352, "operands" : [ { "kind" : "IdResultType" }, @@ -3416,6 +3847,7 @@ }, { "opname" : "OpGroupNonUniformSMin", + "class" : "Non-Uniform", "opcode" : 353, "operands" : [ { "kind" : "IdResultType" }, @@ -3430,6 +3862,7 @@ }, { "opname" : "OpGroupNonUniformUMin", + "class" : "Non-Uniform", "opcode" : 354, "operands" : [ { "kind" : "IdResultType" }, @@ -3444,6 +3877,7 @@ }, { "opname" : "OpGroupNonUniformFMin", + "class" : "Non-Uniform", "opcode" : 355, "operands" : [ { "kind" : "IdResultType" }, @@ -3458,6 +3892,7 @@ }, { "opname" : "OpGroupNonUniformSMax", + "class" : "Non-Uniform", "opcode" : 356, "operands" : [ { "kind" : "IdResultType" }, @@ -3472,6 +3907,7 @@ }, { "opname" : "OpGroupNonUniformUMax", + "class" : "Non-Uniform", "opcode" : 357, "operands" : [ { "kind" : "IdResultType" }, @@ -3486,6 +3922,7 @@ }, { "opname" : "OpGroupNonUniformFMax", + "class" : "Non-Uniform", "opcode" : 358, "operands" : [ { "kind" : "IdResultType" }, @@ -3500,6 +3937,7 @@ }, { "opname" : "OpGroupNonUniformBitwiseAnd", + "class" : "Non-Uniform", "opcode" : 359, "operands" : [ { "kind" : "IdResultType" }, @@ -3514,6 +3952,7 @@ }, { "opname" : "OpGroupNonUniformBitwiseOr", + "class" : "Non-Uniform", "opcode" : 360, "operands" : [ { "kind" : "IdResultType" }, @@ -3528,6 +3967,7 @@ }, { "opname" : "OpGroupNonUniformBitwiseXor", + "class" : "Non-Uniform", "opcode" : 361, "operands" : [ { "kind" : "IdResultType" }, @@ -3542,6 +3982,7 @@ }, { "opname" : "OpGroupNonUniformLogicalAnd", + "class" : "Non-Uniform", "opcode" : 362, "operands" : [ { "kind" : "IdResultType" }, @@ -3556,6 +3997,7 @@ }, { "opname" : "OpGroupNonUniformLogicalOr", + "class" : "Non-Uniform", "opcode" : 363, "operands" : [ { "kind" : "IdResultType" }, @@ -3570,6 +4012,7 @@ }, { "opname" : "OpGroupNonUniformLogicalXor", + "class" : "Non-Uniform", "opcode" : 364, "operands" : [ { "kind" : "IdResultType" }, @@ -3584,6 +4027,7 @@ }, { "opname" : "OpGroupNonUniformQuadBroadcast", + "class" : "Non-Uniform", "opcode" : 365, "operands" : [ { "kind" : "IdResultType" }, @@ -3597,6 +4041,7 @@ }, { "opname" : "OpGroupNonUniformQuadSwap", + "class" : "Non-Uniform", "opcode" : 366, "operands" : [ { "kind" : "IdResultType" }, @@ -3610,6 +4055,7 @@ }, { "opname" : "OpCopyLogical", + "class" : "Composite", "opcode" : 400, "operands" : [ { "kind" : "IdResultType" }, @@ -3620,6 +4066,7 @@ }, { "opname" : "OpPtrEqual", + "class" : "Memory", "opcode" : 401, "operands" : [ { "kind" : "IdResultType" }, @@ -3631,6 +4078,7 @@ }, { "opname" : "OpPtrNotEqual", + "class" : "Memory", "opcode" : 402, "operands" : [ { "kind" : "IdResultType" }, @@ -3642,6 +4090,7 @@ }, { "opname" : "OpPtrDiff", + "class" : "Memory", "opcode" : 403, "operands" : [ { "kind" : "IdResultType" }, @@ -3654,6 +4103,7 @@ }, { "opname" : "OpSubgroupBallotKHR", + "class" : "Group", "opcode" : 4421, "operands" : [ { "kind" : "IdResultType" }, @@ -3666,6 +4116,7 @@ }, { "opname" : "OpSubgroupFirstInvocationKHR", + "class" : "Group", "opcode" : 4422, "operands" : [ { "kind" : "IdResultType" }, @@ -3678,6 +4129,7 @@ }, { "opname" : "OpSubgroupAllKHR", + "class" : "Group", "opcode" : 4428, "operands" : [ { "kind" : "IdResultType" }, @@ -3692,6 +4144,7 @@ }, { "opname" : "OpSubgroupAnyKHR", + "class" : "Group", "opcode" : 4429, "operands" : [ { "kind" : "IdResultType" }, @@ -3706,6 +4159,7 @@ }, { "opname" : "OpSubgroupAllEqualKHR", + "class" : "Group", "opcode" : 4430, "operands" : [ { "kind" : "IdResultType" }, @@ -3720,6 +4174,7 @@ }, { "opname" : "OpSubgroupReadInvocationKHR", + "class" : "Group", "opcode" : 4432, "operands" : [ { "kind" : "IdResultType" }, @@ -3733,6 +4188,7 @@ }, { "opname" : "OpGroupIAddNonUniformAMD", + "class" : "Group", "opcode" : 5000, "operands" : [ { "kind" : "IdResultType" }, @@ -3747,6 +4203,7 @@ }, { "opname" : "OpGroupFAddNonUniformAMD", + "class" : "Group", "opcode" : 5001, "operands" : [ { "kind" : "IdResultType" }, @@ -3761,6 +4218,7 @@ }, { "opname" : "OpGroupFMinNonUniformAMD", + "class" : "Group", "opcode" : 5002, "operands" : [ { "kind" : "IdResultType" }, @@ -3775,6 +4233,7 @@ }, { "opname" : "OpGroupUMinNonUniformAMD", + "class" : "Group", "opcode" : 5003, "operands" : [ { "kind" : "IdResultType" }, @@ -3789,6 +4248,7 @@ }, { "opname" : "OpGroupSMinNonUniformAMD", + "class" : "Group", "opcode" : 5004, "operands" : [ { "kind" : "IdResultType" }, @@ -3803,6 +4263,7 @@ }, { "opname" : "OpGroupFMaxNonUniformAMD", + "class" : "Group", "opcode" : 5005, "operands" : [ { "kind" : "IdResultType" }, @@ -3817,6 +4278,7 @@ }, { "opname" : "OpGroupUMaxNonUniformAMD", + "class" : "Group", "opcode" : 5006, "operands" : [ { "kind" : "IdResultType" }, @@ -3831,6 +4293,7 @@ }, { "opname" : "OpGroupSMaxNonUniformAMD", + "class" : "Group", "opcode" : 5007, "operands" : [ { "kind" : "IdResultType" }, @@ -3845,6 +4308,7 @@ }, { "opname" : "OpFragmentMaskFetchAMD", + "class" : "Reserved", "opcode" : 5011, "operands" : [ { "kind" : "IdResultType" }, @@ -3858,6 +4322,7 @@ }, { "opname" : "OpFragmentFetchAMD", + "class" : "Reserved", "opcode" : 5012, "operands" : [ { "kind" : "IdResultType" }, @@ -3871,7 +4336,21 @@ "version" : "None" }, { + "opname" : "OpReadClockKHR", + "class" : "Reserved", + "opcode" : 5056, + "operands" : [ + { "kind" : "IdResultType" }, + { "kind" : "IdResult" }, + { "kind" : "IdScope", "name" : "'Execution'" } + ], + "capabilities" : [ "ShaderClockKHR" ], + "extensions" : [ "SPV_KHR_shader_clock" ], + "version" : "None" + }, + { "opname" : "OpImageSampleFootprintNV", + "class" : "Image", "opcode" : 5283, "operands" : [ { "kind" : "IdResultType" }, @@ -3888,6 +4367,7 @@ }, { "opname" : "OpGroupNonUniformPartitionNV", + "class" : "Non-Uniform", "opcode" : 5296, "operands" : [ { "kind" : "IdResultType" }, @@ -3900,6 +4380,7 @@ }, { "opname" : "OpWritePackedPrimitiveIndices4x8NV", + "class" : "Reserved", "opcode" : 5299, "operands" : [ { "kind" : "IdRef", "name" : "'Index Offset'" }, @@ -3911,6 +4392,7 @@ }, { "opname" : "OpReportIntersectionNV", + "class" : "Reserved", "opcode" : 5334, "operands" : [ { "kind" : "IdResultType" }, @@ -3919,24 +4401,30 @@ { "kind" : "IdRef", "name" : "'HitKind'" } ], "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpIgnoreIntersectionNV", + "class" : "Reserved", "opcode" : 5335, "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpTerminateRayNV", + "class" : "Reserved", "opcode" : 5336, "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpTraceNV", + "class" : "Reserved", "opcode" : 5337, "operands" : [ @@ -3953,19 +4441,23 @@ { "kind" : "IdRef", "name" : "'PayloadId'" } ], "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpTypeAccelerationStructureNV", + "class" : "Reserved", "opcode" : 5341, "operands" : [ { "kind" : "IdResult" } ], "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpExecuteCallableNV", + "class" : "Reserved", "opcode" : 5344, "operands" : [ @@ -3973,10 +4465,12 @@ { "kind" : "IdRef", "name" : "'Callable DataId'" } ], "capabilities" : [ "RayTracingNV" ], - "extensions" : [ "SPV_NV_ray_tracing" ] + "extensions" : [ "SPV_NV_ray_tracing" ], + "version" : "None" }, { "opname" : "OpTypeCooperativeMatrixNV", + "class" : "Reserved", "opcode" : 5358, "operands" : [ { "kind" : "IdResult" }, @@ -3991,6 +4485,7 @@ }, { "opname" : "OpCooperativeMatrixLoadNV", + "class" : "Reserved", "opcode" : 5359, "operands" : [ { "kind" : "IdResultType" }, @@ -4006,6 +4501,7 @@ }, { "opname" : "OpCooperativeMatrixStoreNV", + "class" : "Reserved", "opcode" : 5360, "operands" : [ { "kind" : "IdRef", "name" : "'Pointer'" }, @@ -4020,6 +4516,7 @@ }, { "opname" : "OpCooperativeMatrixMulAddNV", + "class" : "Reserved", "opcode" : 5361, "operands" : [ { "kind" : "IdResultType" }, @@ -4034,6 +4531,7 @@ }, { "opname" : "OpCooperativeMatrixLengthNV", + "class" : "Reserved", "opcode" : 5362, "operands" : [ { "kind" : "IdResultType" }, @@ -4046,6 +4544,7 @@ }, { "opname" : "OpBeginInvocationInterlockEXT", + "class" : "Reserved", "opcode" : 5364, "capabilities" : [ "FragmentShaderSampleInterlockEXT", "FragmentShaderPixelInterlockEXT", "FragmentShaderShadingRateInterlockEXT" ], "extensions" : [ "SPV_EXT_fragment_shader_interlock" ], @@ -4053,6 +4552,7 @@ }, { "opname" : "OpEndInvocationInterlockEXT", + "class" : "Reserved", "opcode" : 5365, "capabilities" : [ "FragmentShaderSampleInterlockEXT", "FragmentShaderPixelInterlockEXT", "FragmentShaderShadingRateInterlockEXT" ], "extensions" : [ "SPV_EXT_fragment_shader_interlock" ], @@ -4060,6 +4560,7 @@ }, { "opname" : "OpDemoteToHelperInvocationEXT", + "class" : "Reserved", "opcode" : 5380, "capabilities" : [ "DemoteToHelperInvocationEXT" ], "extensions" : [ "SPV_EXT_demote_to_helper_invocation" ], @@ -4067,6 +4568,7 @@ }, { "opname" : "OpIsHelperInvocationEXT", + "class" : "Reserved", "opcode" : 5381, "operands" : [ { "kind" : "IdResultType" }, @@ -4078,6 +4580,7 @@ }, { "opname" : "OpSubgroupShuffleINTEL", + "class" : "Group", "opcode" : 5571, "operands" : [ { "kind" : "IdResultType" }, @@ -4090,6 +4593,7 @@ }, { "opname" : "OpSubgroupShuffleDownINTEL", + "class" : "Group", "opcode" : 5572, "operands" : [ { "kind" : "IdResultType" }, @@ -4103,6 +4607,7 @@ }, { "opname" : "OpSubgroupShuffleUpINTEL", + "class" : "Group", "opcode" : 5573, "operands" : [ { "kind" : "IdResultType" }, @@ -4116,6 +4621,7 @@ }, { "opname" : "OpSubgroupShuffleXorINTEL", + "class" : "Group", "opcode" : 5574, "operands" : [ { "kind" : "IdResultType" }, @@ -4128,6 +4634,7 @@ }, { "opname" : "OpSubgroupBlockReadINTEL", + "class" : "Group", "opcode" : 5575, "operands" : [ { "kind" : "IdResultType" }, @@ -4139,6 +4646,7 @@ }, { "opname" : "OpSubgroupBlockWriteINTEL", + "class" : "Group", "opcode" : 5576, "operands" : [ { "kind" : "IdRef", "name" : "'Ptr'" }, @@ -4149,6 +4657,7 @@ }, { "opname" : "OpSubgroupImageBlockReadINTEL", + "class" : "Group", "opcode" : 5577, "operands" : [ { "kind" : "IdResultType" }, @@ -4161,6 +4670,7 @@ }, { "opname" : "OpSubgroupImageBlockWriteINTEL", + "class" : "Group", "opcode" : 5578, "operands" : [ { "kind" : "IdRef", "name" : "'Image'" }, @@ -4172,6 +4682,7 @@ }, { "opname" : "OpSubgroupImageMediaBlockReadINTEL", + "class" : "Group", "opcode" : 5580, "operands" : [ { "kind" : "IdResultType" }, @@ -4186,6 +4697,7 @@ }, { "opname" : "OpSubgroupImageMediaBlockWriteINTEL", + "class" : "Group", "opcode" : 5581, "operands" : [ { "kind" : "IdRef", "name" : "'Image'" }, @@ -4199,6 +4711,7 @@ }, { "opname" : "OpUCountLeadingZerosINTEL", + "class" : "Reserved", "opcode" : 5585, "operands" : [ { "kind" : "IdResultType" }, @@ -4210,6 +4723,7 @@ }, { "opname" : "OpUCountTrailingZerosINTEL", + "class" : "Reserved", "opcode" : 5586, "operands" : [ { "kind" : "IdResultType" }, @@ -4221,6 +4735,7 @@ }, { "opname" : "OpAbsISubINTEL", + "class" : "Reserved", "opcode" : 5587, "operands" : [ { "kind" : "IdResultType" }, @@ -4233,6 +4748,7 @@ }, { "opname" : "OpAbsUSubINTEL", + "class" : "Reserved", "opcode" : 5588, "operands" : [ { "kind" : "IdResultType" }, @@ -4245,6 +4761,7 @@ }, { "opname" : "OpIAddSatINTEL", + "class" : "Reserved", "opcode" : 5589, "operands" : [ { "kind" : "IdResultType" }, @@ -4257,6 +4774,7 @@ }, { "opname" : "OpUAddSatINTEL", + "class" : "Reserved", "opcode" : 5590, "operands" : [ { "kind" : "IdResultType" }, @@ -4269,6 +4787,7 @@ }, { "opname" : "OpIAverageINTEL", + "class" : "Reserved", "opcode" : 5591, "operands" : [ { "kind" : "IdResultType" }, @@ -4281,6 +4800,7 @@ }, { "opname" : "OpUAverageINTEL", + "class" : "Reserved", "opcode" : 5592, "operands" : [ { "kind" : "IdResultType" }, @@ -4293,6 +4813,7 @@ }, { "opname" : "OpIAverageRoundedINTEL", + "class" : "Reserved", "opcode" : 5593, "operands" : [ { "kind" : "IdResultType" }, @@ -4305,6 +4826,7 @@ }, { "opname" : "OpUAverageRoundedINTEL", + "class" : "Reserved", "opcode" : 5594, "operands" : [ { "kind" : "IdResultType" }, @@ -4317,6 +4839,7 @@ }, { "opname" : "OpISubSatINTEL", + "class" : "Reserved", "opcode" : 5595, "operands" : [ { "kind" : "IdResultType" }, @@ -4329,6 +4852,7 @@ }, { "opname" : "OpUSubSatINTEL", + "class" : "Reserved", "opcode" : 5596, "operands" : [ { "kind" : "IdResultType" }, @@ -4341,6 +4865,7 @@ }, { "opname" : "OpIMul32x16INTEL", + "class" : "Reserved", "opcode" : 5597, "operands" : [ { "kind" : "IdResultType" }, @@ -4353,6 +4878,7 @@ }, { "opname" : "OpUMul32x16INTEL", + "class" : "Reserved", "opcode" : 5598, "operands" : [ { "kind" : "IdResultType" }, @@ -4365,6 +4891,7 @@ }, { "opname" : "OpDecorateString", + "class" : "Annotation", "opcode" : 5632, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -4375,6 +4902,7 @@ }, { "opname" : "OpDecorateStringGOOGLE", + "class" : "Annotation", "opcode" : 5632, "operands" : [ { "kind" : "IdRef", "name" : "'Target'" }, @@ -4385,6 +4913,7 @@ }, { "opname" : "OpMemberDecorateString", + "class" : "Annotation", "opcode" : 5633, "operands" : [ { "kind" : "IdRef", "name" : "'Struct Type'" }, @@ -4396,6 +4925,7 @@ }, { "opname" : "OpMemberDecorateStringGOOGLE", + "class" : "Annotation", "opcode" : 5633, "operands" : [ { "kind" : "IdRef", "name" : "'Struct Type'" }, @@ -4407,6 +4937,7 @@ }, { "opname" : "OpVmeImageINTEL", + "class" : "@exclude", "opcode" : 5699, "operands" : [ { "kind" : "IdResultType" }, @@ -4419,6 +4950,7 @@ }, { "opname" : "OpTypeVmeImageINTEL", + "class" : "@exclude", "opcode" : 5700, "operands" : [ { "kind" : "IdResult" }, @@ -4429,6 +4961,7 @@ }, { "opname" : "OpTypeAvcImePayloadINTEL", + "class" : "@exclude", "opcode" : 5701, "operands" : [ { "kind" : "IdResult" } @@ -4438,6 +4971,7 @@ }, { "opname" : "OpTypeAvcRefPayloadINTEL", + "class" : "@exclude", "opcode" : 5702, "operands" : [ { "kind" : "IdResult" } @@ -4447,6 +4981,7 @@ }, { "opname" : "OpTypeAvcSicPayloadINTEL", + "class" : "@exclude", "opcode" : 5703, "operands" : [ { "kind" : "IdResult" } @@ -4456,6 +4991,7 @@ }, { "opname" : "OpTypeAvcMcePayloadINTEL", + "class" : "@exclude", "opcode" : 5704, "operands" : [ { "kind" : "IdResult" } @@ -4465,6 +5001,7 @@ }, { "opname" : "OpTypeAvcMceResultINTEL", + "class" : "@exclude", "opcode" : 5705, "operands" : [ { "kind" : "IdResult" } @@ -4474,6 +5011,7 @@ }, { "opname" : "OpTypeAvcImeResultINTEL", + "class" : "@exclude", "opcode" : 5706, "operands" : [ { "kind" : "IdResult" } @@ -4483,6 +5021,7 @@ }, { "opname" : "OpTypeAvcImeResultSingleReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5707, "operands" : [ { "kind" : "IdResult" } @@ -4492,6 +5031,7 @@ }, { "opname" : "OpTypeAvcImeResultDualReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5708, "operands" : [ { "kind" : "IdResult" } @@ -4501,6 +5041,7 @@ }, { "opname" : "OpTypeAvcImeSingleReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5709, "operands" : [ { "kind" : "IdResult" } @@ -4510,6 +5051,7 @@ }, { "opname" : "OpTypeAvcImeDualReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5710, "operands" : [ { "kind" : "IdResult" } @@ -4519,6 +5061,7 @@ }, { "opname" : "OpTypeAvcRefResultINTEL", + "class" : "@exclude", "opcode" : 5711, "operands" : [ { "kind" : "IdResult" } @@ -4528,6 +5071,7 @@ }, { "opname" : "OpTypeAvcSicResultINTEL", + "class" : "@exclude", "opcode" : 5712, "operands" : [ { "kind" : "IdResult" } @@ -4537,6 +5081,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL", + "class" : "@exclude", "opcode" : 5713, "operands" : [ { "kind" : "IdResultType" }, @@ -4549,6 +5094,7 @@ }, { "opname" : "OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL", + "class" : "@exclude", "opcode" : 5714, "operands" : [ { "kind" : "IdResultType" }, @@ -4561,6 +5107,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL", + "class" : "@exclude", "opcode" : 5715, "operands" : [ { "kind" : "IdResultType" }, @@ -4573,6 +5120,7 @@ }, { "opname" : "OpSubgroupAvcMceSetInterShapePenaltyINTEL", + "class" : "@exclude", "opcode" : 5716, "operands" : [ { "kind" : "IdResultType" }, @@ -4585,6 +5133,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL", + "class" : "@exclude", "opcode" : 5717, "operands" : [ { "kind" : "IdResultType" }, @@ -4597,6 +5146,7 @@ }, { "opname" : "OpSubgroupAvcMceSetInterDirectionPenaltyINTEL", + "class" : "@exclude", "opcode" : 5718, "operands" : [ { "kind" : "IdResultType" }, @@ -4609,6 +5159,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL", + "class" : "@exclude", "opcode" : 5719, "operands" : [ { "kind" : "IdResultType" }, @@ -4621,6 +5172,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL", + "class" : "@exclude", "opcode" : 5720, "operands" : [ { "kind" : "IdResultType" }, @@ -4633,6 +5185,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL", + "class" : "@exclude", "opcode" : 5721, "operands" : [ { "kind" : "IdResultType" }, @@ -4643,6 +5196,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL", + "class" : "@exclude", "opcode" : 5722, "operands" : [ { "kind" : "IdResultType" }, @@ -4653,6 +5207,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL", + "class" : "@exclude", "opcode" : 5723, "operands" : [ { "kind" : "IdResultType" }, @@ -4663,6 +5218,7 @@ }, { "opname" : "OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL", + "class" : "@exclude", "opcode" : 5724, "operands" : [ { "kind" : "IdResultType" }, @@ -4677,6 +5233,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL", + "class" : "@exclude", "opcode" : 5725, "operands" : [ { "kind" : "IdResultType" }, @@ -4689,6 +5246,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL", + "class" : "@exclude", "opcode" : 5726, "operands" : [ { "kind" : "IdResultType" }, @@ -4699,6 +5257,7 @@ }, { "opname" : "OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL", + "class" : "@exclude", "opcode" : 5727, "operands" : [ { "kind" : "IdResultType" }, @@ -4709,6 +5268,7 @@ }, { "opname" : "OpSubgroupAvcMceSetAcOnlyHaarINTEL", + "class" : "@exclude", "opcode" : 5728, "operands" : [ { "kind" : "IdResultType" }, @@ -4720,6 +5280,7 @@ }, { "opname" : "OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL", + "class" : "@exclude", "opcode" : 5729, "operands" : [ { "kind" : "IdResultType" }, @@ -4732,6 +5293,7 @@ }, { "opname" : "OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL", + "class" : "@exclude", "opcode" : 5730, "operands" : [ { "kind" : "IdResultType" }, @@ -4744,6 +5306,7 @@ }, { "opname" : "OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL", + "class" : "@exclude", "opcode" : 5731, "operands" : [ { "kind" : "IdResultType" }, @@ -4757,6 +5320,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToImePayloadINTEL", + "class" : "@exclude", "opcode" : 5732, "operands" : [ { "kind" : "IdResultType" }, @@ -4768,6 +5332,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToImeResultINTEL", + "class" : "@exclude", "opcode" : 5733, "operands" : [ { "kind" : "IdResultType" }, @@ -4779,6 +5344,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToRefPayloadINTEL", + "class" : "@exclude", "opcode" : 5734, "operands" : [ { "kind" : "IdResultType" }, @@ -4790,6 +5356,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToRefResultINTEL", + "class" : "@exclude", "opcode" : 5735, "operands" : [ { "kind" : "IdResultType" }, @@ -4801,6 +5368,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToSicPayloadINTEL", + "class" : "@exclude", "opcode" : 5736, "operands" : [ { "kind" : "IdResultType" }, @@ -4812,6 +5380,7 @@ }, { "opname" : "OpSubgroupAvcMceConvertToSicResultINTEL", + "class" : "@exclude", "opcode" : 5737, "operands" : [ { "kind" : "IdResultType" }, @@ -4823,6 +5392,7 @@ }, { "opname" : "OpSubgroupAvcMceGetMotionVectorsINTEL", + "class" : "@exclude", "opcode" : 5738, "operands" : [ { "kind" : "IdResultType" }, @@ -4834,6 +5404,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterDistortionsINTEL", + "class" : "@exclude", "opcode" : 5739, "operands" : [ { "kind" : "IdResultType" }, @@ -4845,6 +5416,7 @@ }, { "opname" : "OpSubgroupAvcMceGetBestInterDistortionsINTEL", + "class" : "@exclude", "opcode" : 5740, "operands" : [ { "kind" : "IdResultType" }, @@ -4856,6 +5428,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterMajorShapeINTEL", + "class" : "@exclude", "opcode" : 5741, "operands" : [ { "kind" : "IdResultType" }, @@ -4867,6 +5440,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterMinorShapeINTEL", + "class" : "@exclude", "opcode" : 5742, "operands" : [ { "kind" : "IdResultType" }, @@ -4878,6 +5452,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterDirectionsINTEL", + "class" : "@exclude", "opcode" : 5743, "operands" : [ { "kind" : "IdResultType" }, @@ -4889,6 +5464,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterMotionVectorCountINTEL", + "class" : "@exclude", "opcode" : 5744, "operands" : [ { "kind" : "IdResultType" }, @@ -4900,6 +5476,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterReferenceIdsINTEL", + "class" : "@exclude", "opcode" : 5745, "operands" : [ { "kind" : "IdResultType" }, @@ -4911,6 +5488,7 @@ }, { "opname" : "OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL", + "class" : "@exclude", "opcode" : 5746, "operands" : [ { "kind" : "IdResultType" }, @@ -4924,6 +5502,7 @@ }, { "opname" : "OpSubgroupAvcImeInitializeINTEL", + "class" : "@exclude", "opcode" : 5747, "operands" : [ { "kind" : "IdResultType" }, @@ -4937,6 +5516,7 @@ }, { "opname" : "OpSubgroupAvcImeSetSingleReferenceINTEL", + "class" : "@exclude", "opcode" : 5748, "operands" : [ { "kind" : "IdResultType" }, @@ -4950,6 +5530,7 @@ }, { "opname" : "OpSubgroupAvcImeSetDualReferenceINTEL", + "class" : "@exclude", "opcode" : 5749, "operands" : [ { "kind" : "IdResultType" }, @@ -4964,6 +5545,7 @@ }, { "opname" : "OpSubgroupAvcImeRefWindowSizeINTEL", + "class" : "@exclude", "opcode" : 5750, "operands" : [ { "kind" : "IdResultType" }, @@ -4976,6 +5558,7 @@ }, { "opname" : "OpSubgroupAvcImeAdjustRefOffsetINTEL", + "class" : "@exclude", "opcode" : 5751, "operands" : [ { "kind" : "IdResultType" }, @@ -4990,6 +5573,7 @@ }, { "opname" : "OpSubgroupAvcImeConvertToMcePayloadINTEL", + "class" : "@exclude", "opcode" : 5752, "operands" : [ { "kind" : "IdResultType" }, @@ -5001,6 +5585,7 @@ }, { "opname" : "OpSubgroupAvcImeSetMaxMotionVectorCountINTEL", + "class" : "@exclude", "opcode" : 5753, "operands" : [ { "kind" : "IdResultType" }, @@ -5013,6 +5598,7 @@ }, { "opname" : "OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL", + "class" : "@exclude", "opcode" : 5754, "operands" : [ { "kind" : "IdResultType" }, @@ -5024,6 +5610,7 @@ }, { "opname" : "OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL", + "class" : "@exclude", "opcode" : 5755, "operands" : [ { "kind" : "IdResultType" }, @@ -5036,6 +5623,7 @@ }, { "opname" : "OpSubgroupAvcImeSetWeightedSadINTEL", + "class" : "@exclude", "opcode" : 5756, "operands" : [ { "kind" : "IdResultType" }, @@ -5048,6 +5636,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL", + "class" : "@exclude", "opcode" : 5757, "operands" : [ { "kind" : "IdResultType" }, @@ -5061,6 +5650,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithDualReferenceINTEL", + "class" : "@exclude", "opcode" : 5758, "operands" : [ { "kind" : "IdResultType" }, @@ -5075,6 +5665,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5759, "operands" : [ { "kind" : "IdResultType" }, @@ -5089,6 +5680,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5760, "operands" : [ { "kind" : "IdResultType" }, @@ -5104,6 +5696,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5761, "operands" : [ { "kind" : "IdResultType" }, @@ -5117,6 +5710,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5762, "operands" : [ { "kind" : "IdResultType" }, @@ -5131,6 +5725,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL", + "class" : "@exclude", "opcode" : 5763, "operands" : [ { "kind" : "IdResultType" }, @@ -5145,6 +5740,7 @@ }, { "opname" : "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL", + "class" : "@exclude", "opcode" : 5764, "operands" : [ { "kind" : "IdResultType" }, @@ -5160,6 +5756,7 @@ }, { "opname" : "OpSubgroupAvcImeConvertToMceResultINTEL", + "class" : "@exclude", "opcode" : 5765, "operands" : [ { "kind" : "IdResultType" }, @@ -5171,6 +5768,7 @@ }, { "opname" : "OpSubgroupAvcImeGetSingleReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5766, "operands" : [ { "kind" : "IdResultType" }, @@ -5182,6 +5780,7 @@ }, { "opname" : "OpSubgroupAvcImeGetDualReferenceStreaminINTEL", + "class" : "@exclude", "opcode" : 5767, "operands" : [ { "kind" : "IdResultType" }, @@ -5193,6 +5792,7 @@ }, { "opname" : "OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5768, "operands" : [ { "kind" : "IdResultType" }, @@ -5204,6 +5804,7 @@ }, { "opname" : "OpSubgroupAvcImeStripDualReferenceStreamoutINTEL", + "class" : "@exclude", "opcode" : 5769, "operands" : [ { "kind" : "IdResultType" }, @@ -5215,6 +5816,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL", + "class" : "@exclude", "opcode" : 5770, "operands" : [ { "kind" : "IdResultType" }, @@ -5227,6 +5829,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL", + "class" : "@exclude", "opcode" : 5771, "operands" : [ { "kind" : "IdResultType" }, @@ -5239,6 +5842,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL", + "class" : "@exclude", "opcode" : 5772, "operands" : [ { "kind" : "IdResultType" }, @@ -5251,6 +5855,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL", + "class" : "@exclude", "opcode" : 5773, "operands" : [ { "kind" : "IdResultType" }, @@ -5264,6 +5869,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL", + "class" : "@exclude", "opcode" : 5774, "operands" : [ { "kind" : "IdResultType" }, @@ -5277,6 +5883,7 @@ }, { "opname" : "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL", + "class" : "@exclude", "opcode" : 5775, "operands" : [ { "kind" : "IdResultType" }, @@ -5290,6 +5897,7 @@ }, { "opname" : "OpSubgroupAvcImeGetBorderReachedINTEL", + "class" : "@exclude", "opcode" : 5776, "operands" : [ { "kind" : "IdResultType" }, @@ -5302,6 +5910,7 @@ }, { "opname" : "OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL", + "class" : "@exclude", "opcode" : 5777, "operands" : [ { "kind" : "IdResultType" }, @@ -5313,6 +5922,7 @@ }, { "opname" : "OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL", + "class" : "@exclude", "opcode" : 5778, "operands" : [ { "kind" : "IdResultType" }, @@ -5324,6 +5934,7 @@ }, { "opname" : "OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL", + "class" : "@exclude", "opcode" : 5779, "operands" : [ { "kind" : "IdResultType" }, @@ -5335,6 +5946,7 @@ }, { "opname" : "OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL", + "class" : "@exclude", "opcode" : 5780, "operands" : [ { "kind" : "IdResultType" }, @@ -5346,6 +5958,7 @@ }, { "opname" : "OpSubgroupAvcFmeInitializeINTEL", + "class" : "@exclude", "opcode" : 5781, "operands" : [ { "kind" : "IdResultType" }, @@ -5363,6 +5976,7 @@ }, { "opname" : "OpSubgroupAvcBmeInitializeINTEL", + "class" : "@exclude", "opcode" : 5782, "operands" : [ { "kind" : "IdResultType" }, @@ -5381,6 +5995,7 @@ }, { "opname" : "OpSubgroupAvcRefConvertToMcePayloadINTEL", + "class" : "@exclude", "opcode" : 5783, "operands" : [ { "kind" : "IdResultType" }, @@ -5392,6 +6007,7 @@ }, { "opname" : "OpSubgroupAvcRefSetBidirectionalMixDisableINTEL", + "class" : "@exclude", "opcode" : 5784, "operands" : [ { "kind" : "IdResultType" }, @@ -5403,6 +6019,7 @@ }, { "opname" : "OpSubgroupAvcRefSetBilinearFilterEnableINTEL", + "class" : "@exclude", "opcode" : 5785, "operands" : [ { "kind" : "IdResultType" }, @@ -5414,6 +6031,7 @@ }, { "opname" : "OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL", + "class" : "@exclude", "opcode" : 5786, "operands" : [ { "kind" : "IdResultType" }, @@ -5427,6 +6045,7 @@ }, { "opname" : "OpSubgroupAvcRefEvaluateWithDualReferenceINTEL", + "class" : "@exclude", "opcode" : 5787, "operands" : [ { "kind" : "IdResultType" }, @@ -5441,6 +6060,7 @@ }, { "opname" : "OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL", + "class" : "@exclude", "opcode" : 5788, "operands" : [ { "kind" : "IdResultType" }, @@ -5454,6 +6074,7 @@ }, { "opname" : "OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL", + "class" : "@exclude", "opcode" : 5789, "operands" : [ { "kind" : "IdResultType" }, @@ -5468,6 +6089,7 @@ }, { "opname" : "OpSubgroupAvcRefConvertToMceResultINTEL", + "class" : "@exclude", "opcode" : 5790, "operands" : [ { "kind" : "IdResultType" }, @@ -5479,6 +6101,7 @@ }, { "opname" : "OpSubgroupAvcSicInitializeINTEL", + "class" : "@exclude", "opcode" : 5791, "operands" : [ { "kind" : "IdResultType" }, @@ -5490,6 +6113,7 @@ }, { "opname" : "OpSubgroupAvcSicConfigureSkcINTEL", + "class" : "@exclude", "opcode" : 5792, "operands" : [ { "kind" : "IdResultType" }, @@ -5506,6 +6130,7 @@ }, { "opname" : "OpSubgroupAvcSicConfigureIpeLumaINTEL", + "class" : "@exclude", "opcode" : 5793, "operands" : [ { "kind" : "IdResultType" }, @@ -5524,6 +6149,7 @@ }, { "opname" : "OpSubgroupAvcSicConfigureIpeLumaChromaINTEL", + "class" : "@exclude", "opcode" : 5794, "operands" : [ { "kind" : "IdResultType" }, @@ -5545,6 +6171,7 @@ }, { "opname" : "OpSubgroupAvcSicGetMotionVectorMaskINTEL", + "class" : "@exclude", "opcode" : 5795, "operands" : [ { "kind" : "IdResultType" }, @@ -5557,6 +6184,7 @@ }, { "opname" : "OpSubgroupAvcSicConvertToMcePayloadINTEL", + "class" : "@exclude", "opcode" : 5796, "operands" : [ { "kind" : "IdResultType" }, @@ -5568,6 +6196,7 @@ }, { "opname" : "OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL", + "class" : "@exclude", "opcode" : 5797, "operands" : [ { "kind" : "IdResultType" }, @@ -5580,6 +6209,7 @@ }, { "opname" : "OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL", + "class" : "@exclude", "opcode" : 5798, "operands" : [ { "kind" : "IdResultType" }, @@ -5594,6 +6224,7 @@ }, { "opname" : "OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL", + "class" : "@exclude", "opcode" : 5799, "operands" : [ { "kind" : "IdResultType" }, @@ -5606,6 +6237,7 @@ }, { "opname" : "OpSubgroupAvcSicSetBilinearFilterEnableINTEL", + "class" : "@exclude", "opcode" : 5800, "operands" : [ { "kind" : "IdResultType" }, @@ -5617,6 +6249,7 @@ }, { "opname" : "OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL", + "class" : "@exclude", "opcode" : 5801, "operands" : [ { "kind" : "IdResultType" }, @@ -5629,6 +6262,7 @@ }, { "opname" : "OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL", + "class" : "@exclude", "opcode" : 5802, "operands" : [ { "kind" : "IdResultType" }, @@ -5641,6 +6275,7 @@ }, { "opname" : "OpSubgroupAvcSicEvaluateIpeINTEL", + "class" : "@exclude", "opcode" : 5803, "operands" : [ { "kind" : "IdResultType" }, @@ -5653,6 +6288,7 @@ }, { "opname" : "OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL", + "class" : "@exclude", "opcode" : 5804, "operands" : [ { "kind" : "IdResultType" }, @@ -5666,6 +6302,7 @@ }, { "opname" : "OpSubgroupAvcSicEvaluateWithDualReferenceINTEL", + "class" : "@exclude", "opcode" : 5805, "operands" : [ { "kind" : "IdResultType" }, @@ -5680,6 +6317,7 @@ }, { "opname" : "OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL", + "class" : "@exclude", "opcode" : 5806, "operands" : [ { "kind" : "IdResultType" }, @@ -5693,6 +6331,7 @@ }, { "opname" : "OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL", + "class" : "@exclude", "opcode" : 5807, "operands" : [ { "kind" : "IdResultType" }, @@ -5707,6 +6346,7 @@ }, { "opname" : "OpSubgroupAvcSicConvertToMceResultINTEL", + "class" : "@exclude", "opcode" : 5808, "operands" : [ { "kind" : "IdResultType" }, @@ -5718,6 +6358,7 @@ }, { "opname" : "OpSubgroupAvcSicGetIpeLumaShapeINTEL", + "class" : "@exclude", "opcode" : 5809, "operands" : [ { "kind" : "IdResultType" }, @@ -5729,6 +6370,7 @@ }, { "opname" : "OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL", + "class" : "@exclude", "opcode" : 5810, "operands" : [ { "kind" : "IdResultType" }, @@ -5740,6 +6382,7 @@ }, { "opname" : "OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL", + "class" : "@exclude", "opcode" : 5811, "operands" : [ { "kind" : "IdResultType" }, @@ -5751,6 +6394,7 @@ }, { "opname" : "OpSubgroupAvcSicGetPackedIpeLumaModesINTEL", + "class" : "@exclude", "opcode" : 5812, "operands" : [ { "kind" : "IdResultType" }, @@ -5762,6 +6406,7 @@ }, { "opname" : "OpSubgroupAvcSicGetIpeChromaModeINTEL", + "class" : "@exclude", "opcode" : 5813, "operands" : [ { "kind" : "IdResultType" }, @@ -5773,6 +6418,7 @@ }, { "opname" : "OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL", + "class" : "@exclude", "opcode" : 5814, "operands" : [ { "kind" : "IdResultType" }, @@ -5784,6 +6430,7 @@ }, { "opname" : "OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL", + "class" : "@exclude", "opcode" : 5815, "operands" : [ { "kind" : "IdResultType" }, @@ -5795,6 +6442,7 @@ }, { "opname" : "OpSubgroupAvcSicGetInterRawSadsINTEL", + "class" : "@exclude", "opcode" : 5816, "operands" : [ { "kind" : "IdResultType" }, @@ -5876,34 +6524,68 @@ ] }, { + "enumerant" : "MakeTexelAvailable", + "value" : "0x0100", + "capabilities" : [ "VulkanMemoryModel" ], + "parameters" : [ + { "kind" : "IdScope" } + ], + "version" : "1.5" + }, + { "enumerant" : "MakeTexelAvailableKHR", "value" : "0x0100", - "capabilities" : [ "VulkanMemoryModelKHR" ], + "capabilities" : [ "VulkanMemoryModel" ], "parameters" : [ { "kind" : "IdScope" } ], - "version" : "None" + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "MakeTexelVisible", + "value" : "0x0200", + "capabilities" : [ "VulkanMemoryModel" ], + "parameters" : [ + { "kind" : "IdScope" } + ], + "version" : "1.5" }, { "enumerant" : "MakeTexelVisibleKHR", "value" : "0x0200", - "capabilities" : [ "VulkanMemoryModelKHR" ], + "capabilities" : [ "VulkanMemoryModel" ], "parameters" : [ { "kind" : "IdScope" } ], - "version" : "None" + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "NonPrivateTexel", + "value" : "0x0400", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "NonPrivateTexelKHR", "value" : "0x0400", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "VolatileTexel", + "value" : "0x0800", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "VolatileTexelKHR", "value" : "0x0800", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" }, { "enumerant" : "SignExtend", @@ -6122,28 +6804,50 @@ "value" : "0x0800" }, { + "enumerant" : "OutputMemory", + "value" : "0x1000", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" + }, + { "enumerant" : "OutputMemoryKHR", "value" : "0x1000", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "MakeAvailable", + "value" : "0x2000", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "MakeAvailableKHR", "value" : "0x2000", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "MakeVisible", + "value" : "0x4000", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "MakeVisibleKHR", "value" : "0x4000", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" }, { "enumerant" : "Volatile", "value" : "0x8000", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" } ] }, @@ -6171,13 +6875,32 @@ "value" : "0x0004" }, { + "enumerant" : "MakePointerAvailable", + "value" : "0x0008", + "parameters" : [ + { "kind" : "IdScope" } + ], + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" + }, + { "enumerant" : "MakePointerAvailableKHR", "value" : "0x0008", "parameters" : [ { "kind" : "IdScope" } ], - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "MakePointerVisible", + "value" : "0x0010", + "parameters" : [ + { "kind" : "IdScope" } + ], + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "MakePointerVisibleKHR", @@ -6185,14 +6908,22 @@ "parameters" : [ { "kind" : "IdScope" } ], - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" + }, + { + "enumerant" : "NonPrivatePointer", + "value" : "0x0020", + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" }, { "enumerant" : "NonPrivatePointerKHR", "value" : "0x0020", - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" } ] }, @@ -6349,11 +7080,18 @@ "capabilities" : [ "Addresses" ] }, { + "enumerant" : "PhysicalStorageBuffer64", + "value" : 5348, + "extensions" : [ "SPV_EXT_physical_storage_buffer" ], + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "version" : "1.5" + }, + { "enumerant" : "PhysicalStorageBuffer64EXT", "value" : 5348, "extensions" : [ "SPV_EXT_physical_storage_buffer" ], - "capabilities" : [ "PhysicalStorageBufferAddressesEXT" ], - "version" : "None" + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "version" : "1.5" } ] }, @@ -6377,10 +7115,17 @@ "capabilities" : [ "Kernel" ] }, { + "enumerant" : "Vulkan", + "value" : 3, + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" + }, + { "enumerant" : "VulkanKHR", "value" : 3, - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "extensions" : [ "SPV_KHR_vulkan_memory_model" ], + "version" : "1.5" } ] }, @@ -6876,11 +7621,18 @@ "version" : "None" }, { + "enumerant" : "PhysicalStorageBuffer", + "value" : 5349, + "extensions" : [ "SPV_EXT_physical_storage_buffer" ], + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "version" : "1.5" + }, + { "enumerant" : "PhysicalStorageBufferEXT", "value" : 5349, "extensions" : [ "SPV_EXT_physical_storage_buffer" ], - "capabilities" : [ "PhysicalStorageBufferAddressesEXT" ], - "version" : "None" + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "version" : "1.5" } ] }, @@ -7861,23 +8613,45 @@ "version" : "None" }, { + "enumerant" : "NonUniform", + "value" : 5300, + "capabilities" : [ "ShaderNonUniform" ], + "version" : "1.5" + }, + { "enumerant" : "NonUniformEXT", "value" : 5300, - "capabilities" : [ "ShaderNonUniformEXT" ] + "capabilities" : [ "ShaderNonUniform" ], + "extensions" : [ "SPV_EXT_descriptor_indexing" ], + "version" : "1.5" + }, + { + "enumerant" : "RestrictPointer", + "value" : 5355, + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "extensions" : [ "SPV_EXT_physical_storage_buffer" ], + "version" : "1.5" }, { "enumerant" : "RestrictPointerEXT", "value" : 5355, - "capabilities" : [ "PhysicalStorageBufferAddressesEXT" ], + "capabilities" : [ "PhysicalStorageBufferAddresses" ], "extensions" : [ "SPV_EXT_physical_storage_buffer" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "AliasedPointer", + "value" : 5356, + "capabilities" : [ "PhysicalStorageBufferAddresses" ], + "extensions" : [ "SPV_EXT_physical_storage_buffer" ], + "version" : "1.5" }, { "enumerant" : "AliasedPointerEXT", "value" : 5356, - "capabilities" : [ "PhysicalStorageBufferAddressesEXT" ], + "capabilities" : [ "PhysicalStorageBufferAddresses" ], "extensions" : [ "SPV_EXT_physical_storage_buffer" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "CounterBuffer", @@ -7971,12 +8745,12 @@ { "enumerant" : "Layer", "value" : 9, - "capabilities" : [ "Geometry" ] + "capabilities" : [ "Geometry", "ShaderLayer", "ShaderViewportIndexLayerEXT" ] }, { "enumerant" : "ViewportIndex", "value" : 10, - "capabilities" : [ "MultiViewport" ] + "capabilities" : [ "MultiViewport", "ShaderViewportIndex", "ShaderViewportIndexLayerEXT" ] }, { "enumerant" : "TessLevelOuter", @@ -8569,10 +9343,16 @@ "value" : 4 }, { + "enumerant" : "QueueFamily", + "value" : 5, + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" + }, + { "enumerant" : "QueueFamilyKHR", "value" : 5, - "capabilities" : [ "VulkanMemoryModelKHR" ], - "version" : "None" + "capabilities" : [ "VulkanMemoryModel" ], + "version" : "1.5" } ] }, @@ -8984,6 +9764,16 @@ "version" : "1.3" }, { + "enumerant" : "ShaderLayer", + "value" : 69, + "version" : "1.5" + }, + { + "enumerant" : "ShaderViewportIndex", + "value" : 70, + "version" : "1.5" + }, + { "enumerant" : "SubgroupBallotKHR", "value" : 4423, "extensions" : [ "SPV_KHR_shader_ballot" ], @@ -9089,20 +9879,20 @@ "enumerant" : "StorageBuffer8BitAccess", "value" : 4448, "extensions" : [ "SPV_KHR_8bit_storage" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "UniformAndStorageBuffer8BitAccess", "value" : 4449, "capabilities" : [ "StorageBuffer8BitAccess" ], "extensions" : [ "SPV_KHR_8bit_storage" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "StoragePushConstant8", "value" : 4450, "extensions" : [ "SPV_KHR_8bit_storage" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "DenormPreserve", @@ -9170,6 +9960,13 @@ "version" : "None" }, { + "enumerant" : "ShaderClockKHR", + "value" : 5055, + "capabilities" : [ "Shader" ], + "extensions" : [ "SPV_KHR_shader_clock" ], + "version" : "None" + }, + { "enumerant" : "SampleMaskOverrideCoverageNV", "value" : 5249, "capabilities" : [ "SampleRateShading" ], @@ -9271,88 +10068,160 @@ "version" : "None" }, { + "enumerant" : "ShaderNonUniform", + "value" : 5301, + "capabilities" : [ "Shader" ], + "version" : "1.5" + }, + { "enumerant" : "ShaderNonUniformEXT", "value" : 5301, "capabilities" : [ "Shader" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "RuntimeDescriptorArray", + "value" : 5302, + "capabilities" : [ "Shader" ], + "version" : "1.5" }, { "enumerant" : "RuntimeDescriptorArrayEXT", "value" : 5302, "capabilities" : [ "Shader" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "InputAttachmentArrayDynamicIndexing", + "value" : 5303, + "capabilities" : [ "InputAttachment" ], + "version" : "1.5" }, { "enumerant" : "InputAttachmentArrayDynamicIndexingEXT", "value" : 5303, "capabilities" : [ "InputAttachment" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "UniformTexelBufferArrayDynamicIndexing", + "value" : 5304, + "capabilities" : [ "SampledBuffer" ], + "version" : "1.5" }, { "enumerant" : "UniformTexelBufferArrayDynamicIndexingEXT", "value" : 5304, "capabilities" : [ "SampledBuffer" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "StorageTexelBufferArrayDynamicIndexing", + "value" : 5305, + "capabilities" : [ "ImageBuffer" ], + "version" : "1.5" }, { "enumerant" : "StorageTexelBufferArrayDynamicIndexingEXT", "value" : 5305, "capabilities" : [ "ImageBuffer" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "UniformBufferArrayNonUniformIndexing", + "value" : 5306, + "capabilities" : [ "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "UniformBufferArrayNonUniformIndexingEXT", "value" : 5306, - "capabilities" : [ "ShaderNonUniformEXT" ], + "capabilities" : [ "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "SampledImageArrayNonUniformIndexing", + "value" : 5307, + "capabilities" : [ "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "SampledImageArrayNonUniformIndexingEXT", "value" : 5307, - "capabilities" : [ "ShaderNonUniformEXT" ], + "capabilities" : [ "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "StorageBufferArrayNonUniformIndexing", + "value" : 5308, + "capabilities" : [ "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "StorageBufferArrayNonUniformIndexingEXT", "value" : 5308, - "capabilities" : [ "ShaderNonUniformEXT" ], + "capabilities" : [ "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "StorageImageArrayNonUniformIndexing", + "value" : 5309, + "capabilities" : [ "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "StorageImageArrayNonUniformIndexingEXT", "value" : 5309, - "capabilities" : [ "ShaderNonUniformEXT" ], + "capabilities" : [ "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "InputAttachmentArrayNonUniformIndexing", + "value" : 5310, + "capabilities" : [ "InputAttachment", "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "InputAttachmentArrayNonUniformIndexingEXT", "value" : 5310, - "capabilities" : [ "InputAttachment", "ShaderNonUniformEXT" ], + "capabilities" : [ "InputAttachment", "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "UniformTexelBufferArrayNonUniformIndexing", + "value" : 5311, + "capabilities" : [ "SampledBuffer", "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "UniformTexelBufferArrayNonUniformIndexingEXT", "value" : 5311, - "capabilities" : [ "SampledBuffer", "ShaderNonUniformEXT" ], + "capabilities" : [ "SampledBuffer", "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "StorageTexelBufferArrayNonUniformIndexing", + "value" : 5312, + "capabilities" : [ "ImageBuffer", "ShaderNonUniform" ], + "version" : "1.5" }, { "enumerant" : "StorageTexelBufferArrayNonUniformIndexingEXT", "value" : 5312, - "capabilities" : [ "ImageBuffer", "ShaderNonUniformEXT" ], + "capabilities" : [ "ImageBuffer", "ShaderNonUniform" ], "extensions" : [ "SPV_EXT_descriptor_indexing" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "RayTracingNV", @@ -9362,23 +10231,39 @@ "version" : "None" }, { + "enumerant" : "VulkanMemoryModel", + "value" : 5345, + "version" : "1.5" + }, + { "enumerant" : "VulkanMemoryModelKHR", "value" : 5345, "extensions" : [ "SPV_KHR_vulkan_memory_model" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "VulkanMemoryModelDeviceScope", + "value" : 5346, + "version" : "1.5" }, { "enumerant" : "VulkanMemoryModelDeviceScopeKHR", "value" : 5346, "extensions" : [ "SPV_KHR_vulkan_memory_model" ], - "version" : "None" + "version" : "1.5" + }, + { + "enumerant" : "PhysicalStorageBufferAddresses", + "value" : 5347, + "capabilities" : [ "Shader" ], + "version" : "1.5" }, { "enumerant" : "PhysicalStorageBufferAddressesEXT", "value" : 5347, "capabilities" : [ "Shader" ], "extensions" : [ "SPV_EXT_physical_storage_buffer" ], - "version" : "None" + "version" : "1.5" }, { "enumerant" : "ComputeDerivativeGroupLinearNV", diff -Nru mesa-19.2.8/src/compiler/spirv/spirv.h mesa-20.0.8/src/compiler/spirv/spirv.h --- mesa-19.2.8/src/compiler/spirv/spirv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv.h 2020-06-12 01:21:16.000000000 +0000 @@ -95,6 +95,7 @@ SpvAddressingModelLogical = 0, SpvAddressingModelPhysical32 = 1, SpvAddressingModelPhysical64 = 2, + SpvAddressingModelPhysicalStorageBuffer64 = 5348, SpvAddressingModelPhysicalStorageBuffer64EXT = 5348, SpvAddressingModelMax = 0x7fffffff, } SpvAddressingModel; @@ -103,6 +104,7 @@ SpvMemoryModelSimple = 0, SpvMemoryModelGLSL450 = 1, SpvMemoryModelOpenCL = 2, + SpvMemoryModelVulkan = 3, SpvMemoryModelVulkanKHR = 3, SpvMemoryModelMax = 0x7fffffff, } SpvMemoryModel; @@ -187,6 +189,7 @@ SpvStorageClassHitAttributeNV = 5339, SpvStorageClassIncomingRayPayloadNV = 5342, SpvStorageClassShaderRecordBufferNV = 5343, + SpvStorageClassPhysicalStorageBuffer = 5349, SpvStorageClassPhysicalStorageBufferEXT = 5349, SpvStorageClassMax = 0x7fffffff, } SpvStorageClass; @@ -315,9 +318,13 @@ SpvImageOperandsConstOffsetsShift = 5, SpvImageOperandsSampleShift = 6, SpvImageOperandsMinLodShift = 7, + SpvImageOperandsMakeTexelAvailableShift = 8, SpvImageOperandsMakeTexelAvailableKHRShift = 8, + SpvImageOperandsMakeTexelVisibleShift = 9, SpvImageOperandsMakeTexelVisibleKHRShift = 9, + SpvImageOperandsNonPrivateTexelShift = 10, SpvImageOperandsNonPrivateTexelKHRShift = 10, + SpvImageOperandsVolatileTexelShift = 11, SpvImageOperandsVolatileTexelKHRShift = 11, SpvImageOperandsSignExtendShift = 12, SpvImageOperandsZeroExtendShift = 13, @@ -334,9 +341,13 @@ SpvImageOperandsConstOffsetsMask = 0x00000020, SpvImageOperandsSampleMask = 0x00000040, SpvImageOperandsMinLodMask = 0x00000080, + SpvImageOperandsMakeTexelAvailableMask = 0x00000100, SpvImageOperandsMakeTexelAvailableKHRMask = 0x00000100, + SpvImageOperandsMakeTexelVisibleMask = 0x00000200, SpvImageOperandsMakeTexelVisibleKHRMask = 0x00000200, + SpvImageOperandsNonPrivateTexelMask = 0x00000400, SpvImageOperandsNonPrivateTexelKHRMask = 0x00000400, + SpvImageOperandsVolatileTexelMask = 0x00000800, SpvImageOperandsVolatileTexelKHRMask = 0x00000800, SpvImageOperandsSignExtendMask = 0x00001000, SpvImageOperandsZeroExtendMask = 0x00002000, @@ -452,8 +463,11 @@ SpvDecorationPerViewNV = 5272, SpvDecorationPerTaskNV = 5273, SpvDecorationPerVertexNV = 5285, + SpvDecorationNonUniform = 5300, SpvDecorationNonUniformEXT = 5300, + SpvDecorationRestrictPointer = 5355, SpvDecorationRestrictPointerEXT = 5355, + SpvDecorationAliasedPointer = 5356, SpvDecorationAliasedPointerEXT = 5356, SpvDecorationCounterBuffer = 5634, SpvDecorationHlslCounterBufferGOOGLE = 5634, @@ -634,8 +648,11 @@ SpvMemorySemanticsCrossWorkgroupMemoryShift = 9, SpvMemorySemanticsAtomicCounterMemoryShift = 10, SpvMemorySemanticsImageMemoryShift = 11, + SpvMemorySemanticsOutputMemoryShift = 12, SpvMemorySemanticsOutputMemoryKHRShift = 12, + SpvMemorySemanticsMakeAvailableShift = 13, SpvMemorySemanticsMakeAvailableKHRShift = 13, + SpvMemorySemanticsMakeVisibleShift = 14, SpvMemorySemanticsMakeVisibleKHRShift = 14, SpvMemorySemanticsVolatileShift = 15, SpvMemorySemanticsMax = 0x7fffffff, @@ -653,8 +670,11 @@ SpvMemorySemanticsCrossWorkgroupMemoryMask = 0x00000200, SpvMemorySemanticsAtomicCounterMemoryMask = 0x00000400, SpvMemorySemanticsImageMemoryMask = 0x00000800, + SpvMemorySemanticsOutputMemoryMask = 0x00001000, SpvMemorySemanticsOutputMemoryKHRMask = 0x00001000, + SpvMemorySemanticsMakeAvailableMask = 0x00002000, SpvMemorySemanticsMakeAvailableKHRMask = 0x00002000, + SpvMemorySemanticsMakeVisibleMask = 0x00004000, SpvMemorySemanticsMakeVisibleKHRMask = 0x00004000, SpvMemorySemanticsVolatileMask = 0x00008000, } SpvMemorySemanticsMask; @@ -663,8 +683,11 @@ SpvMemoryAccessVolatileShift = 0, SpvMemoryAccessAlignedShift = 1, SpvMemoryAccessNontemporalShift = 2, + SpvMemoryAccessMakePointerAvailableShift = 3, SpvMemoryAccessMakePointerAvailableKHRShift = 3, + SpvMemoryAccessMakePointerVisibleShift = 4, SpvMemoryAccessMakePointerVisibleKHRShift = 4, + SpvMemoryAccessNonPrivatePointerShift = 5, SpvMemoryAccessNonPrivatePointerKHRShift = 5, SpvMemoryAccessMax = 0x7fffffff, } SpvMemoryAccessShift; @@ -674,8 +697,11 @@ SpvMemoryAccessVolatileMask = 0x00000001, SpvMemoryAccessAlignedMask = 0x00000002, SpvMemoryAccessNontemporalMask = 0x00000004, + SpvMemoryAccessMakePointerAvailableMask = 0x00000008, SpvMemoryAccessMakePointerAvailableKHRMask = 0x00000008, + SpvMemoryAccessMakePointerVisibleMask = 0x00000010, SpvMemoryAccessMakePointerVisibleKHRMask = 0x00000010, + SpvMemoryAccessNonPrivatePointerMask = 0x00000020, SpvMemoryAccessNonPrivatePointerKHRMask = 0x00000020, } SpvMemoryAccessMask; @@ -685,6 +711,7 @@ SpvScopeWorkgroup = 2, SpvScopeSubgroup = 3, SpvScopeInvocation = 4, + SpvScopeQueueFamily = 5, SpvScopeQueueFamilyKHR = 5, SpvScopeMax = 0x7fffffff, } SpvScope; @@ -785,6 +812,8 @@ SpvCapabilityGroupNonUniformShuffleRelative = 66, SpvCapabilityGroupNonUniformClustered = 67, SpvCapabilityGroupNonUniformQuad = 68, + SpvCapabilityShaderLayer = 69, + SpvCapabilityShaderViewportIndex = 70, SpvCapabilitySubgroupBallotKHR = 4423, SpvCapabilityDrawParameters = 4427, SpvCapabilitySubgroupVoteKHR = 4431, @@ -813,6 +842,7 @@ SpvCapabilityFragmentMaskAMD = 5010, SpvCapabilityStencilExportEXT = 5013, SpvCapabilityImageReadWriteLodAMD = 5015, + SpvCapabilityShaderClockKHR = 5055, SpvCapabilitySampleMaskOverrideCoverageNV = 5249, SpvCapabilityGeometryShaderPassthroughNV = 5251, SpvCapabilityShaderViewportIndexLayerEXT = 5254, @@ -828,21 +858,36 @@ SpvCapabilityFragmentDensityEXT = 5291, SpvCapabilityShadingRateNV = 5291, SpvCapabilityGroupNonUniformPartitionedNV = 5297, + SpvCapabilityShaderNonUniform = 5301, SpvCapabilityShaderNonUniformEXT = 5301, + SpvCapabilityRuntimeDescriptorArray = 5302, SpvCapabilityRuntimeDescriptorArrayEXT = 5302, + SpvCapabilityInputAttachmentArrayDynamicIndexing = 5303, SpvCapabilityInputAttachmentArrayDynamicIndexingEXT = 5303, + SpvCapabilityUniformTexelBufferArrayDynamicIndexing = 5304, SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT = 5304, + SpvCapabilityStorageTexelBufferArrayDynamicIndexing = 5305, SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT = 5305, + SpvCapabilityUniformBufferArrayNonUniformIndexing = 5306, SpvCapabilityUniformBufferArrayNonUniformIndexingEXT = 5306, + SpvCapabilitySampledImageArrayNonUniformIndexing = 5307, SpvCapabilitySampledImageArrayNonUniformIndexingEXT = 5307, + SpvCapabilityStorageBufferArrayNonUniformIndexing = 5308, SpvCapabilityStorageBufferArrayNonUniformIndexingEXT = 5308, + SpvCapabilityStorageImageArrayNonUniformIndexing = 5309, SpvCapabilityStorageImageArrayNonUniformIndexingEXT = 5309, + SpvCapabilityInputAttachmentArrayNonUniformIndexing = 5310, SpvCapabilityInputAttachmentArrayNonUniformIndexingEXT = 5310, + SpvCapabilityUniformTexelBufferArrayNonUniformIndexing = 5311, SpvCapabilityUniformTexelBufferArrayNonUniformIndexingEXT = 5311, + SpvCapabilityStorageTexelBufferArrayNonUniformIndexing = 5312, SpvCapabilityStorageTexelBufferArrayNonUniformIndexingEXT = 5312, SpvCapabilityRayTracingNV = 5340, + SpvCapabilityVulkanMemoryModel = 5345, SpvCapabilityVulkanMemoryModelKHR = 5345, + SpvCapabilityVulkanMemoryModelDeviceScope = 5346, SpvCapabilityVulkanMemoryModelDeviceScopeKHR = 5346, + SpvCapabilityPhysicalStorageBufferAddresses = 5347, SpvCapabilityPhysicalStorageBufferAddressesEXT = 5347, SpvCapabilityComputeDerivativeGroupLinearNV = 5350, SpvCapabilityCooperativeMatrixNV = 5357, @@ -1223,6 +1268,7 @@ SpvOpGroupSMaxNonUniformAMD = 5007, SpvOpFragmentMaskFetchAMD = 5011, SpvOpFragmentFetchAMD = 5012, + SpvOpReadClockKHR = 5056, SpvOpImageSampleFootprintNV = 5283, SpvOpGroupNonUniformPartitionNV = 5296, SpvOpWritePackedPrimitiveIndices4x8NV = 5299, @@ -1755,6 +1801,7 @@ case SpvOpGroupSMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break; case SpvOpFragmentMaskFetchAMD: *hasResult = true; *hasResultType = true; break; case SpvOpFragmentFetchAMD: *hasResult = true; *hasResultType = true; break; + case SpvOpReadClockKHR: *hasResult = true; *hasResultType = true; break; case SpvOpImageSampleFootprintNV: *hasResult = true; *hasResultType = true; break; case SpvOpGroupNonUniformPartitionNV: *hasResult = true; *hasResultType = true; break; case SpvOpWritePackedPrimitiveIndices4x8NV: *hasResult = false; *hasResultType = false; break; diff -Nru mesa-19.2.8/src/compiler/spirv/spirv_info_c.py mesa-20.0.8/src/compiler/spirv/spirv_info_c.py --- mesa-19.2.8/src/compiler/spirv/spirv_info_c.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv_info_c.py 2020-06-12 01:21:16.000000000 +0000 @@ -43,7 +43,7 @@ seen.add(x["value"]) values.append(x["enumerant"]) - return (kind, values) + return (kind, values, operands["category"]) def collect_opcodes(spirv): seen = set() @@ -59,7 +59,7 @@ values.append(name[2:]) seen.add(opcode) - return ("Op", values) + return ("Op", values, None) def parse_args(): p = argparse.ArgumentParser() @@ -72,8 +72,25 @@ """ + COPYRIGHT + """\ #include "spirv_info.h" -% for kind,values in info: +% for kind,values,category in info: +% if category == "BitEnum": +const char * +spirv_${kind.lower()}_to_string(Spv${kind}Mask v) +{ + switch (v) { + % for name in values: + %if name != "None": + case Spv${kind}${name}Mask: return "Spv${kind}${name}"; + % else: + case Spv${kind}MaskNone: return "Spv${kind}${name}"; + % endif + % endfor + } + + return "unknown"; +} +% else: const char * spirv_${kind.lower()}_to_string(Spv${kind} v) { @@ -86,6 +103,7 @@ return "unknown"; } +% endif % endfor """) @@ -103,7 +121,9 @@ collect_data(spirv_info, "ExecutionMode"), collect_data(spirv_info, "ExecutionModel"), collect_data(spirv_info, "ImageFormat"), + collect_data(spirv_info, "MemoryModel"), collect_data(spirv_info, "StorageClass"), + collect_data(spirv_info, "ImageOperands"), collect_opcodes(spirv_info), ] diff -Nru mesa-19.2.8/src/compiler/spirv/spirv_info.h mesa-20.0.8/src/compiler/spirv/spirv_info.h --- mesa-19.2.8/src/compiler/spirv/spirv_info.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv_info.h 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,8 @@ const char *spirv_executionmode_to_string(SpvExecutionMode mode); const char *spirv_executionmodel_to_string(SpvExecutionModel model); const char *spirv_imageformat_to_string(SpvImageFormat format); +const char *spirv_imageoperands_to_string(SpvImageOperandsMask op); +const char *spirv_memorymodel_to_string(SpvMemoryModel cap); const char *spirv_op_to_string(SpvOp op); const char *spirv_storageclass_to_string(SpvStorageClass sc); diff -Nru mesa-19.2.8/src/compiler/spirv/spirv_to_nir.c mesa-20.0.8/src/compiler/spirv/spirv_to_nir.c --- mesa-19.2.8/src/compiler/spirv/spirv_to_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/spirv_to_nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -371,6 +371,14 @@ return w; } +static bool +vtn_handle_non_semantic_instruction(struct vtn_builder *b, SpvOp ext_opcode, + const uint32_t *w, unsigned count) +{ + /* Do nothing. */ + return true; +} + static void vtn_handle_extension(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count) @@ -390,8 +398,13 @@ } else if ((strcmp(ext, "SPV_AMD_shader_trinary_minmax") == 0) && (b->options && b->options->caps.amd_trinary_minmax)) { val->ext_handler = vtn_handle_amd_shader_trinary_minmax_instruction; + } else if ((strcmp(ext, "SPV_AMD_shader_explicit_vertex_parameter") == 0) + && (b->options && b->options->caps.amd_shader_explicit_vertex_parameter)) { + val->ext_handler = vtn_handle_amd_shader_explicit_vertex_parameter_instruction; } else if (strcmp(ext, "OpenCL.std") == 0) { val->ext_handler = vtn_handle_opencl_instruction; + } else if (strstr(ext, "NonSemantic.") == ext) { + val->ext_handler = vtn_handle_non_semantic_instruction; } else { vtn_fail("Unsupported extension: %s", ext); } @@ -736,7 +749,7 @@ static void struct_member_decoration_cb(struct vtn_builder *b, - struct vtn_value *val, int member, + UNUSED struct vtn_value *val, int member, const struct vtn_decoration *dec, void *void_ctx) { struct member_decoration_ctx *ctx = void_ctx; @@ -769,6 +782,9 @@ case SpvDecorationFlat: ctx->fields[member].interpolation = INTERP_MODE_FLAT; break; + case SpvDecorationExplicitInterpAMD: + ctx->fields[member].interpolation = INTERP_MODE_EXPLICIT; + break; case SpvDecorationCentroid: ctx->fields[member].centroid = true; break; @@ -776,8 +792,7 @@ ctx->fields[member].sample = true; break; case SpvDecorationStream: - /* Vulkan only allows one GS stream */ - vtn_assert(dec->operands[0] == 0); + /* This is handled later by var_decoration_cb in vtn_variables.c */ break; case SpvDecorationLocation: ctx->fields[member].location = dec->operands[0]; @@ -828,7 +843,7 @@ case SpvDecorationXfbBuffer: case SpvDecorationXfbStride: - vtn_warn("Vulkan does not have transform feedback"); + /* This is handled later by var_decoration_cb in vtn_variables.c */ break; case SpvDecorationCPacked: @@ -851,6 +866,7 @@ break; case SpvDecorationUserSemantic: + case SpvDecorationUserTypeGOOGLE: /* User semantic decorations can safely be ignored by the driver. */ break; @@ -879,7 +895,7 @@ */ static void struct_member_matrix_stride_cb(struct vtn_builder *b, - struct vtn_value *val, int member, + UNUSED struct vtn_value *val, int member, const struct vtn_decoration *dec, void *void_ctx) { @@ -936,7 +952,7 @@ static void type_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member, - const struct vtn_decoration *dec, void *ctx) + const struct vtn_decoration *dec, UNUSED void *ctx) { struct vtn_type *type = val->type; @@ -974,6 +990,7 @@ case SpvDecorationPatch: case SpvDecorationCentroid: case SpvDecorationSample: + case SpvDecorationExplicitInterpAMD: case SpvDecorationVolatile: case SpvDecorationCoherent: case SpvDecorationNonWritable: @@ -1031,6 +1048,10 @@ spirv_decoration_to_string(dec->decoration)); break; + case SpvDecorationUserTypeGOOGLE: + /* User semantic decorations can safely be ignored by the driver. */ + break; + default: vtn_fail_with_decoration("Unhandled decoration", dec->decoration); } @@ -1338,7 +1359,7 @@ case SpvStorageClassUniform: case SpvStorageClassPushConstant: case SpvStorageClassStorageBuffer: - case SpvStorageClassPhysicalStorageBufferEXT: + case SpvStorageClassPhysicalStorageBuffer: vtn_foreach_decoration(b, val, array_stride_decoration_cb, NULL); break; default: @@ -1351,6 +1372,7 @@ case SpvStorageClassFunction: case SpvStorageClassWorkgroup: case SpvStorageClassCrossWorkgroup: + case SpvStorageClassUniformConstant: val->type->stride = align(glsl_get_cl_size(val->type->deref->type), glsl_get_cl_alignment(val->type->deref->type)); break; @@ -1521,9 +1543,9 @@ } static void -spec_constant_decoration_cb(struct vtn_builder *b, struct vtn_value *v, - int member, const struct vtn_decoration *dec, - void *data) +spec_constant_decoration_cb(struct vtn_builder *b, UNUSED struct vtn_value *val, + ASSERTED int member, + const struct vtn_decoration *dec, void *data) { vtn_assert(member == -1); if (dec->decoration != SpvDecorationSpecId) @@ -1567,9 +1589,9 @@ static void handle_workgroup_size_decoration_cb(struct vtn_builder *b, struct vtn_value *val, - int member, + ASSERTED int member, const struct vtn_decoration *dec, - void *data) + UNUSED void *data) { vtn_assert(member == -1); if (dec->decoration != SpvDecorationBuiltIn || @@ -1896,7 +1918,9 @@ nir_const_value *srcs[3] = { src[0], src[1], src[2], }; - nir_eval_const_opcode(op, val->constant->values, num_components, bit_size, srcs); + nir_eval_const_opcode(op, val->constant->values, + num_components, bit_size, srcs, + b->shader->info.float_controls_execution_mode); break; } /* default */ } @@ -1919,6 +1943,234 @@ vtn_foreach_decoration(b, val, handle_workgroup_size_decoration_cb, NULL); } +SpvMemorySemanticsMask +vtn_storage_class_to_memory_semantics(SpvStorageClass sc) +{ + switch (sc) { + case SpvStorageClassStorageBuffer: + case SpvStorageClassPhysicalStorageBuffer: + return SpvMemorySemanticsUniformMemoryMask; + case SpvStorageClassWorkgroup: + return SpvMemorySemanticsWorkgroupMemoryMask; + default: + return SpvMemorySemanticsMaskNone; + } +} + +static void +vtn_split_barrier_semantics(struct vtn_builder *b, + SpvMemorySemanticsMask semantics, + SpvMemorySemanticsMask *before, + SpvMemorySemanticsMask *after) +{ + /* For memory semantics embedded in operations, we split them into up to + * two barriers, to be added before and after the operation. This is less + * strict than if we propagated until the final backend stage, but still + * result in correct execution. + * + * A further improvement could be pipe this information (and use!) into the + * next compiler layers, at the expense of making the handling of barriers + * more complicated. + */ + + *before = SpvMemorySemanticsMaskNone; + *after = SpvMemorySemanticsMaskNone; + + SpvMemorySemanticsMask order_semantics = + semantics & (SpvMemorySemanticsAcquireMask | + SpvMemorySemanticsReleaseMask | + SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsSequentiallyConsistentMask); + + if (util_bitcount(order_semantics) > 1) { + /* Old GLSLang versions incorrectly set all the ordering bits. This was + * fixed in c51287d744fb6e7e9ccc09f6f8451e6c64b1dad6 of glslang repo, + * and it is in GLSLang since revision "SPIRV99.1321" (from Jul-2016). + */ + vtn_warn("Multiple memory ordering semantics specified, " + "assuming AcquireRelease."); + order_semantics = SpvMemorySemanticsAcquireReleaseMask; + } + + const SpvMemorySemanticsMask av_vis_semantics = + semantics & (SpvMemorySemanticsMakeAvailableMask | + SpvMemorySemanticsMakeVisibleMask); + + const SpvMemorySemanticsMask storage_semantics = + semantics & (SpvMemorySemanticsUniformMemoryMask | + SpvMemorySemanticsSubgroupMemoryMask | + SpvMemorySemanticsWorkgroupMemoryMask | + SpvMemorySemanticsCrossWorkgroupMemoryMask | + SpvMemorySemanticsAtomicCounterMemoryMask | + SpvMemorySemanticsImageMemoryMask | + SpvMemorySemanticsOutputMemoryMask); + + const SpvMemorySemanticsMask other_semantics = + semantics & ~(order_semantics | av_vis_semantics | storage_semantics); + + if (other_semantics) + vtn_warn("Ignoring unhandled memory semantics: %u\n", other_semantics); + + /* SequentiallyConsistent is treated as AcquireRelease. */ + + /* The RELEASE barrier happens BEFORE the operation, and it is usually + * associated with a Store. All the write operations with a matching + * semantics will not be reordered after the Store. + */ + if (order_semantics & (SpvMemorySemanticsReleaseMask | + SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsSequentiallyConsistentMask)) { + *before |= SpvMemorySemanticsReleaseMask | storage_semantics; + } + + /* The ACQUIRE barrier happens AFTER the operation, and it is usually + * associated with a Load. All the operations with a matching semantics + * will not be reordered before the Load. + */ + if (order_semantics & (SpvMemorySemanticsAcquireMask | + SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsSequentiallyConsistentMask)) { + *after |= SpvMemorySemanticsAcquireMask | storage_semantics; + } + + if (av_vis_semantics & SpvMemorySemanticsMakeVisibleMask) + *before |= SpvMemorySemanticsMakeVisibleMask | storage_semantics; + + if (av_vis_semantics & SpvMemorySemanticsMakeAvailableMask) + *after |= SpvMemorySemanticsMakeAvailableMask | storage_semantics; +} + +static void +vtn_emit_scoped_memory_barrier(struct vtn_builder *b, SpvScope scope, + SpvMemorySemanticsMask semantics) +{ + nir_memory_semantics nir_semantics = 0; + + SpvMemorySemanticsMask order_semantics = + semantics & (SpvMemorySemanticsAcquireMask | + SpvMemorySemanticsReleaseMask | + SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsSequentiallyConsistentMask); + + if (util_bitcount(order_semantics) > 1) { + /* Old GLSLang versions incorrectly set all the ordering bits. This was + * fixed in c51287d744fb6e7e9ccc09f6f8451e6c64b1dad6 of glslang repo, + * and it is in GLSLang since revision "SPIRV99.1321" (from Jul-2016). + */ + vtn_warn("Multiple memory ordering semantics bits specified, " + "assuming AcquireRelease."); + order_semantics = SpvMemorySemanticsAcquireReleaseMask; + } + + switch (order_semantics) { + case 0: + /* Not an ordering barrier. */ + break; + + case SpvMemorySemanticsAcquireMask: + nir_semantics = NIR_MEMORY_ACQUIRE; + break; + + case SpvMemorySemanticsReleaseMask: + nir_semantics = NIR_MEMORY_RELEASE; + break; + + case SpvMemorySemanticsSequentiallyConsistentMask: + /* Fall through. Treated as AcquireRelease in Vulkan. */ + case SpvMemorySemanticsAcquireReleaseMask: + nir_semantics = NIR_MEMORY_ACQUIRE | NIR_MEMORY_RELEASE; + break; + + default: + unreachable("Invalid memory order semantics"); + } + + if (semantics & SpvMemorySemanticsMakeAvailableMask) { + vtn_fail_if(!b->options->caps.vk_memory_model, + "To use MakeAvailable memory semantics the VulkanMemoryModel " + "capability must be declared."); + nir_semantics |= NIR_MEMORY_MAKE_AVAILABLE; + } + + if (semantics & SpvMemorySemanticsMakeVisibleMask) { + vtn_fail_if(!b->options->caps.vk_memory_model, + "To use MakeVisible memory semantics the VulkanMemoryModel " + "capability must be declared."); + nir_semantics |= NIR_MEMORY_MAKE_VISIBLE; + } + + /* Vulkan Environment for SPIR-V says "SubgroupMemory, CrossWorkgroupMemory, + * and AtomicCounterMemory are ignored". + */ + semantics &= ~(SpvMemorySemanticsSubgroupMemoryMask | + SpvMemorySemanticsCrossWorkgroupMemoryMask | + SpvMemorySemanticsAtomicCounterMemoryMask); + + /* TODO: Consider adding nir_var_mem_image mode to NIR so it can be used + * for SpvMemorySemanticsImageMemoryMask. + */ + + nir_variable_mode modes = 0; + if (semantics & (SpvMemorySemanticsUniformMemoryMask | + SpvMemorySemanticsImageMemoryMask)) { + modes |= nir_var_uniform | + nir_var_mem_ubo | + nir_var_mem_ssbo | + nir_var_mem_global; + } + if (semantics & SpvMemorySemanticsWorkgroupMemoryMask) + modes |= nir_var_mem_shared; + if (semantics & SpvMemorySemanticsOutputMemoryMask) { + modes |= nir_var_shader_out; + } + + /* No barrier to add. */ + if (nir_semantics == 0 || modes == 0) + return; + + nir_scope nir_scope; + switch (scope) { + case SpvScopeDevice: + vtn_fail_if(b->options->caps.vk_memory_model && + !b->options->caps.vk_memory_model_device_scope, + "If the Vulkan memory model is declared and any instruction " + "uses Device scope, the VulkanMemoryModelDeviceScope " + "capability must be declared."); + nir_scope = NIR_SCOPE_DEVICE; + break; + + case SpvScopeQueueFamily: + vtn_fail_if(!b->options->caps.vk_memory_model, + "To use Queue Family scope, the VulkanMemoryModel capability " + "must be declared."); + nir_scope = NIR_SCOPE_QUEUE_FAMILY; + break; + + case SpvScopeWorkgroup: + nir_scope = NIR_SCOPE_WORKGROUP; + break; + + case SpvScopeSubgroup: + nir_scope = NIR_SCOPE_SUBGROUP; + break; + + case SpvScopeInvocation: + nir_scope = NIR_SCOPE_INVOCATION; + break; + + default: + vtn_fail("Invalid memory scope"); + } + + nir_intrinsic_instr *intrin = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_scoped_memory_barrier); + nir_intrinsic_set_memory_semantics(intrin, nir_semantics); + + nir_intrinsic_set_memory_modes(intrin, modes); + nir_intrinsic_set_memory_scope(intrin, nir_scope); + nir_builder_instr_insert(&b->nb, &intrin->instr); +} + struct vtn_ssa_value * vtn_create_ssa_value(struct vtn_builder *b, const struct glsl_type *type) { @@ -1973,6 +2225,42 @@ return src; } +static uint32_t +image_operand_arg(struct vtn_builder *b, const uint32_t *w, uint32_t count, + uint32_t mask_idx, SpvImageOperandsMask op) +{ + static const SpvImageOperandsMask ops_with_arg = + SpvImageOperandsBiasMask | + SpvImageOperandsLodMask | + SpvImageOperandsGradMask | + SpvImageOperandsConstOffsetMask | + SpvImageOperandsOffsetMask | + SpvImageOperandsConstOffsetsMask | + SpvImageOperandsSampleMask | + SpvImageOperandsMinLodMask | + SpvImageOperandsMakeTexelAvailableMask | + SpvImageOperandsMakeTexelVisibleMask; + + assert(util_bitcount(op) == 1); + assert(w[mask_idx] & op); + assert(op & ops_with_arg); + + uint32_t idx = util_bitcount(w[mask_idx] & (op - 1) & ops_with_arg) + 1; + + /* Adjust indices for operands with two arguments. */ + static const SpvImageOperandsMask ops_with_two_args = + SpvImageOperandsGradMask; + idx += util_bitcount(w[mask_idx] & (op - 1) & ops_with_two_args); + + idx += mask_idx; + + vtn_fail_if(idx + (op & ops_with_two_args ? 1 : 0) >= count, + "Image op claims to have %s but does not enough " + "following operands", spirv_imageoperands_to_string(op)); + + return idx; +} + static void vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count) @@ -1981,8 +2269,6 @@ struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_sampled_image); val->sampled_image = ralloc(b, struct vtn_sampled_image); - val->sampled_image->type = - vtn_value(b, w[1], vtn_value_type_type)->type; val->sampled_image->image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; val->sampled_image->sampler = @@ -2001,20 +2287,24 @@ struct vtn_type *ret_type = vtn_value(b, w[1], vtn_value_type_type)->type; - struct vtn_sampled_image sampled; + struct vtn_pointer *image = NULL, *sampler = NULL; struct vtn_value *sampled_val = vtn_untyped_value(b, w[3]); if (sampled_val->value_type == vtn_value_type_sampled_image) { - sampled = *sampled_val->sampled_image; + image = sampled_val->sampled_image->image; + sampler = sampled_val->sampled_image->sampler; } else { vtn_assert(sampled_val->value_type == vtn_value_type_pointer); - sampled.type = sampled_val->pointer->type; - sampled.image = NULL; - sampled.sampler = sampled_val->pointer; + image = sampled_val->pointer; } - const struct glsl_type *image_type = sampled.type->type; + nir_deref_instr *image_deref = vtn_pointer_to_deref(b, image); + nir_deref_instr *sampler_deref = + sampler ? vtn_pointer_to_deref(b, sampler) : NULL; + + const struct glsl_type *image_type = sampled_val->type->type; const enum glsl_sampler_dim sampler_dim = glsl_get_sampler_dim(image_type); const bool is_array = glsl_sampler_type_is_array(image_type); + nir_alu_type dest_type = nir_type_invalid; /* Figure out the base texture operation */ nir_texop texop; @@ -2034,7 +2324,7 @@ break; case SpvOpImageFetch: - if (glsl_get_sampler_dim(image_type) == GLSL_SAMPLER_DIM_MS) { + if (sampler_dim == GLSL_SAMPLER_DIM_MS) { texop = nir_texop_txf_ms; } else { texop = nir_texop_txf; @@ -2049,18 +2339,30 @@ case SpvOpImageQuerySizeLod: case SpvOpImageQuerySize: texop = nir_texop_txs; + dest_type = nir_type_int; break; case SpvOpImageQueryLod: texop = nir_texop_lod; + dest_type = nir_type_float; break; case SpvOpImageQueryLevels: texop = nir_texop_query_levels; + dest_type = nir_type_int; break; case SpvOpImageQuerySamples: texop = nir_texop_texture_samples; + dest_type = nir_type_int; + break; + + case SpvOpFragmentFetchAMD: + texop = nir_texop_fragment_fetch; + break; + + case SpvOpFragmentMaskFetchAMD: + texop = nir_texop_fragment_mask_fetch; break; default: @@ -2070,11 +2372,7 @@ nir_tex_src srcs[10]; /* 10 should be enough */ nir_tex_src *p = srcs; - nir_deref_instr *sampler = vtn_pointer_to_deref(b, sampled.sampler); - nir_deref_instr *texture = - sampled.image ? vtn_pointer_to_deref(b, sampled.image) : sampler; - - p->src = nir_src_for_ssa(&texture->dest.ssa); + p->src = nir_src_for_ssa(&image_deref->dest.ssa); p->src_type = nir_tex_src_texture_deref; p++; @@ -2085,8 +2383,10 @@ case nir_texop_txd: case nir_texop_tg4: case nir_texop_lod: - /* These operations require a sampler */ - p->src = nir_src_for_ssa(&sampler->dest.ssa); + vtn_fail_if(sampler == NULL, + "%s requires an image of type OpTypeSampledImage", + spirv_op_to_string(opcode)); + p->src = nir_src_for_ssa(&sampler_deref->dest.ssa); p->src_type = nir_tex_src_sampler_deref; p++; break; @@ -2096,6 +2396,8 @@ case nir_texop_query_levels: case nir_texop_texture_samples: case nir_texop_samples_identical: + case nir_texop_fragment_fetch: + case nir_texop_fragment_mask_fetch: /* These don't */ break; case nir_texop_txf_ms_fb: @@ -2103,6 +2405,8 @@ break; case nir_texop_txf_ms_mcs: vtn_fail("unexpected nir_texop_txf_ms_mcs"); + case nir_texop_tex_prefetch: + vtn_fail("unexpected nir_texop_tex_prefetch"); } unsigned idx = 4; @@ -2121,7 +2425,9 @@ case SpvOpImageFetch: case SpvOpImageGather: case SpvOpImageDrefGather: - case SpvOpImageQueryLod: { + case SpvOpImageQueryLod: + case SpvOpFragmentFetchAMD: + case SpvOpFragmentMaskFetchAMD: { /* All these types have the coordinate as their first real argument */ switch (sampler_dim) { case GLSL_SAMPLER_DIM_1D: @@ -2131,6 +2437,7 @@ case GLSL_SAMPLER_DIM_2D: case GLSL_SAMPLER_DIM_RECT: case GLSL_SAMPLER_DIM_MS: + case GLSL_SAMPLER_DIM_SUBPASS_MS: coord_components = 2; break; case GLSL_SAMPLER_DIM_3D: @@ -2199,54 +2506,82 @@ if (opcode == SpvOpImageQuerySizeLod) (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod); + /* For OpFragmentFetchAMD, we always have a multisample index */ + if (opcode == SpvOpFragmentFetchAMD) + (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ms_index); + /* Now we need to handle some number of optional arguments */ struct vtn_value *gather_offsets = NULL; if (idx < count) { - uint32_t operands = w[idx++]; + uint32_t operands = w[idx]; if (operands & SpvImageOperandsBiasMask) { vtn_assert(texop == nir_texop_tex); texop = nir_texop_txb; - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_bias); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsBiasMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_bias); } if (operands & SpvImageOperandsLodMask) { vtn_assert(texop == nir_texop_txl || texop == nir_texop_txf || texop == nir_texop_txs); - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_lod); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsLodMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_lod); } if (operands & SpvImageOperandsGradMask) { vtn_assert(texop == nir_texop_txl); texop = nir_texop_txd; - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddx); - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ddy); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsGradMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_ddx); + (*p++) = vtn_tex_src(b, w[arg + 1], nir_tex_src_ddy); + } + + vtn_fail_if(util_bitcount(operands & (SpvImageOperandsConstOffsetsMask | + SpvImageOperandsOffsetMask | + SpvImageOperandsConstOffsetMask)) > 1, + "At most one of the ConstOffset, Offset, and ConstOffsets " + "image operands can be used on a given instruction."); + + if (operands & SpvImageOperandsOffsetMask) { + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsOffsetMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_offset); + } + + if (operands & SpvImageOperandsConstOffsetMask) { + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsConstOffsetMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_offset); } - if (operands & SpvImageOperandsOffsetMask || - operands & SpvImageOperandsConstOffsetMask) - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_offset); - if (operands & SpvImageOperandsConstOffsetsMask) { vtn_assert(texop == nir_texop_tg4); - gather_offsets = vtn_value(b, w[idx++], vtn_value_type_constant); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsConstOffsetsMask); + gather_offsets = vtn_value(b, w[arg], vtn_value_type_constant); } if (operands & SpvImageOperandsSampleMask) { vtn_assert(texop == nir_texop_txf_ms); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsSampleMask); texop = nir_texop_txf_ms; - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_ms_index); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_ms_index); } if (operands & SpvImageOperandsMinLodMask) { vtn_assert(texop == nir_texop_tex || texop == nir_texop_txb || texop == nir_texop_txd); - (*p++) = vtn_tex_src(b, w[idx++], nir_tex_src_min_lod); + uint32_t arg = image_operand_arg(b, w, count, idx, + SpvImageOperandsMinLodMask); + (*p++) = vtn_tex_src(b, w[arg], nir_tex_src_min_lod); } } - /* We should have now consumed exactly all of the arguments */ - vtn_assert(idx == count); nir_tex_instr *instr = nir_tex_instr_create(b->shader, p - srcs); instr->op = texop; @@ -2261,21 +2596,26 @@ is_shadow && glsl_get_components(ret_type->type) == 1; instr->component = gather_component; - if (sampled.image && (sampled.image->access & ACCESS_NON_UNIFORM)) + if (image && (image->access & ACCESS_NON_UNIFORM)) instr->texture_non_uniform = true; - if (sampled.sampler && (sampled.sampler->access & ACCESS_NON_UNIFORM)) + if (sampler && (sampler->access & ACCESS_NON_UNIFORM)) instr->sampler_non_uniform = true; - switch (glsl_get_sampler_result_type(image_type)) { - case GLSL_TYPE_FLOAT: instr->dest_type = nir_type_float; break; - case GLSL_TYPE_INT: instr->dest_type = nir_type_int; break; - case GLSL_TYPE_UINT: instr->dest_type = nir_type_uint; break; - case GLSL_TYPE_BOOL: instr->dest_type = nir_type_bool; break; - default: - vtn_fail("Invalid base type for sampler result"); + /* for non-query ops, get dest_type from sampler type */ + if (dest_type == nir_type_invalid) { + switch (glsl_get_sampler_result_type(image_type)) { + case GLSL_TYPE_FLOAT: dest_type = nir_type_float; break; + case GLSL_TYPE_INT: dest_type = nir_type_int; break; + case GLSL_TYPE_UINT: dest_type = nir_type_uint; break; + case GLSL_TYPE_BOOL: dest_type = nir_type_bool; break; + default: + vtn_fail("Invalid base type for sampler result"); + } } + instr->dest_type = dest_type; + nir_ssa_dest_init(&instr->instr, &instr->dest, nir_tex_instr_dest_size(instr), 32, NULL); @@ -2399,10 +2739,13 @@ val->image->image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; val->image->coord = get_image_coord(b, w[4]); val->image->sample = vtn_ssa_value(b, w[5])->def; + val->image->lod = nir_imm_int(&b->nb, 0); return; } struct vtn_image_pointer image; + SpvScope scope = SpvScopeInvocation; + SpvMemorySemanticsMask semantics = 0; switch (opcode) { case SpvOpAtomicExchange: @@ -2421,43 +2764,98 @@ case SpvOpAtomicOr: case SpvOpAtomicXor: image = *vtn_value(b, w[3], vtn_value_type_image_pointer)->image; + scope = vtn_constant_uint(b, w[4]); + semantics = vtn_constant_uint(b, w[5]); break; case SpvOpAtomicStore: image = *vtn_value(b, w[1], vtn_value_type_image_pointer)->image; + scope = vtn_constant_uint(b, w[2]); + semantics = vtn_constant_uint(b, w[3]); break; case SpvOpImageQuerySize: image.image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; image.coord = NULL; image.sample = NULL; + image.lod = NULL; break; - case SpvOpImageRead: + case SpvOpImageRead: { image.image = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; image.coord = get_image_coord(b, w[4]); - if (count > 5 && (w[5] & SpvImageOperandsSampleMask)) { - vtn_assert(w[5] == SpvImageOperandsSampleMask); - image.sample = vtn_ssa_value(b, w[6])->def; + const SpvImageOperandsMask operands = + count > 5 ? w[5] : SpvImageOperandsMaskNone; + + if (operands & SpvImageOperandsSampleMask) { + uint32_t arg = image_operand_arg(b, w, count, 5, + SpvImageOperandsSampleMask); + image.sample = vtn_ssa_value(b, w[arg])->def; } else { image.sample = nir_ssa_undef(&b->nb, 1, 32); } + + if (operands & SpvImageOperandsMakeTexelVisibleMask) { + vtn_fail_if((operands & SpvImageOperandsNonPrivateTexelMask) == 0, + "MakeTexelVisible requires NonPrivateTexel to also be set."); + uint32_t arg = image_operand_arg(b, w, count, 5, + SpvImageOperandsMakeTexelVisibleMask); + semantics = SpvMemorySemanticsMakeVisibleMask; + scope = vtn_constant_uint(b, w[arg]); + } + + if (operands & SpvImageOperandsLodMask) { + uint32_t arg = image_operand_arg(b, w, count, 5, + SpvImageOperandsLodMask); + image.lod = vtn_ssa_value(b, w[arg])->def; + } else { + image.lod = nir_imm_int(&b->nb, 0); + } + + /* TODO: Volatile. */ + break; + } - case SpvOpImageWrite: + case SpvOpImageWrite: { image.image = vtn_value(b, w[1], vtn_value_type_pointer)->pointer; image.coord = get_image_coord(b, w[2]); /* texel = w[3] */ - if (count > 4 && (w[4] & SpvImageOperandsSampleMask)) { - vtn_assert(w[4] == SpvImageOperandsSampleMask); - image.sample = vtn_ssa_value(b, w[5])->def; + const SpvImageOperandsMask operands = + count > 4 ? w[4] : SpvImageOperandsMaskNone; + + if (operands & SpvImageOperandsSampleMask) { + uint32_t arg = image_operand_arg(b, w, count, 4, + SpvImageOperandsSampleMask); + image.sample = vtn_ssa_value(b, w[arg])->def; } else { image.sample = nir_ssa_undef(&b->nb, 1, 32); } + + if (operands & SpvImageOperandsMakeTexelAvailableMask) { + vtn_fail_if((operands & SpvImageOperandsNonPrivateTexelMask) == 0, + "MakeTexelAvailable requires NonPrivateTexel to also be set."); + uint32_t arg = image_operand_arg(b, w, count, 4, + SpvImageOperandsMakeTexelAvailableMask); + semantics = SpvMemorySemanticsMakeAvailableMask; + scope = vtn_constant_uint(b, w[arg]); + } + + if (operands & SpvImageOperandsLodMask) { + uint32_t arg = image_operand_arg(b, w, count, 4, + SpvImageOperandsLodMask); + image.lod = vtn_ssa_value(b, w[arg])->def; + } else { + image.lod = nir_imm_int(&b->nb, 0); + } + + /* TODO: Volatile. */ + break; + } default: vtn_fail_with_opcode("Invalid image opcode", opcode); @@ -2478,10 +2876,10 @@ OP(AtomicIDecrement, atomic_add) OP(AtomicIAdd, atomic_add) OP(AtomicISub, atomic_add) - OP(AtomicSMin, atomic_min) - OP(AtomicUMin, atomic_min) - OP(AtomicSMax, atomic_max) - OP(AtomicUMax, atomic_max) + OP(AtomicSMin, atomic_imin) + OP(AtomicUMin, atomic_umin) + OP(AtomicSMax, atomic_imax) + OP(AtomicUMax, atomic_umax) OP(AtomicAnd, atomic_and) OP(AtomicOr, atomic_or) OP(AtomicXor, atomic_xor) @@ -2510,6 +2908,14 @@ case SpvOpAtomicLoad: case SpvOpImageQuerySize: case SpvOpImageRead: + if (opcode == SpvOpImageRead || opcode == SpvOpAtomicLoad) { + /* Only OpImageRead can support a lod parameter if + * SPV_AMD_shader_image_load_store_lod is used but the current NIR + * intrinsics definition for atomics requires us to set it for + * OpAtomicLoad. + */ + intrin->src[3] = nir_src_for_ssa(image.lod); + } break; case SpvOpAtomicStore: case SpvOpImageWrite: { @@ -2519,6 +2925,12 @@ assert(op == nir_intrinsic_image_deref_store); intrin->num_components = 4; intrin->src[3] = nir_src_for_ssa(expand_to_vec4(&b->nb, value)); + /* Only OpImageWrite can support a lod parameter if + * SPV_AMD_shader_image_load_store_lod is used but the current NIR + * intrinsics definition for atomics requires us to set it for + * OpAtomicStore. + */ + intrin->src[4] = nir_src_for_ssa(image.lod); break; } @@ -2543,6 +2955,16 @@ vtn_fail_with_opcode("Invalid image opcode", opcode); } + /* Image operations implicitly have the Image storage memory semantics. */ + semantics |= SpvMemorySemanticsImageMemoryMask; + + SpvMemorySemanticsMask before_semantics; + SpvMemorySemanticsMask after_semantics; + vtn_split_barrier_semantics(b, semantics, &before_semantics, &after_semantics); + + if (before_semantics) + vtn_emit_memory_barrier(b, scope, before_semantics); + if (opcode != SpvOpImageWrite && opcode != SpvOpAtomicStore) { struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; @@ -2566,6 +2988,9 @@ } else { nir_builder_instr_insert(&b->nb, &intrin->instr); } + + if (after_semantics) + vtn_emit_memory_barrier(b, scope, after_semantics); } static nir_intrinsic_op @@ -2621,7 +3046,7 @@ * only need to support GLSL Atomic Counters that are uints and don't * allow direct storage. */ - unreachable("Invalid uniform atomic"); + vtn_fail("Invalid uniform atomic"); } } @@ -2657,11 +3082,14 @@ */ static void vtn_handle_atomics(struct vtn_builder *b, SpvOp opcode, - const uint32_t *w, unsigned count) + const uint32_t *w, UNUSED unsigned count) { struct vtn_pointer *ptr; nir_intrinsic_instr *atomic; + SpvScope scope = SpvScopeInvocation; + SpvMemorySemanticsMask semantics = 0; + switch (opcode) { case SpvOpAtomicLoad: case SpvOpAtomicExchange: @@ -2679,21 +3107,20 @@ case SpvOpAtomicOr: case SpvOpAtomicXor: ptr = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; + scope = vtn_constant_uint(b, w[4]); + semantics = vtn_constant_uint(b, w[5]); break; case SpvOpAtomicStore: ptr = vtn_value(b, w[1], vtn_value_type_pointer)->pointer; + scope = vtn_constant_uint(b, w[2]); + semantics = vtn_constant_uint(b, w[3]); break; default: vtn_fail_with_opcode("Invalid SPIR-V atomic", opcode); } - /* - SpvScope scope = w[4]; - SpvMemorySemanticsMask semantics = w[5]; - */ - /* uniform as "atomic counter uniform" */ if (ptr->mode == vtn_variable_mode_uniform) { nir_deref_instr *deref = vtn_pointer_to_deref(b, ptr); @@ -2832,6 +3259,18 @@ } } + /* Atomic ordering operations will implicitly apply to the atomic operation + * storage class, so include that too. + */ + semantics |= vtn_storage_class_to_memory_semantics(ptr->ptr_type->storage_class); + + SpvMemorySemanticsMask before_semantics; + SpvMemorySemanticsMask after_semantics; + vtn_split_barrier_semantics(b, semantics, &before_semantics, &after_semantics); + + if (before_semantics) + vtn_emit_memory_barrier(b, scope, before_semantics); + if (opcode != SpvOpAtomicStore) { struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; @@ -2846,6 +3285,9 @@ } nir_builder_instr_insert(&b->nb, &atomic->instr); + + if (after_semantics) + vtn_emit_memory_barrier(b, scope, after_semantics); } static nir_alu_instr * @@ -2893,7 +3335,10 @@ nir_ssa_def * vtn_vector_extract(struct vtn_builder *b, nir_ssa_def *src, unsigned index) { - return nir_channel(&b->nb, src, index); + if (index > src->num_components) + return nir_ssa_undef(&b->nb, src->num_components, src->bit_size); + else + return nir_channel(&b->nb, src, index); } nir_ssa_def * @@ -3151,10 +3596,15 @@ nir_builder_instr_insert(&b->nb, &intrin->instr); } -static void +void vtn_emit_memory_barrier(struct vtn_builder *b, SpvScope scope, SpvMemorySemanticsMask semantics) { + if (b->options->use_scoped_memory_barrier) { + vtn_emit_scoped_memory_barrier(b, scope, semantics); + return; + } + static const SpvMemorySemanticsMask all_memory_semantics = SpvMemorySemanticsUniformMemoryMask | SpvMemorySemanticsWorkgroupMemoryMask | @@ -3201,6 +3651,10 @@ case SpvMemorySemanticsImageMemoryMask: vtn_emit_barrier(b, nir_intrinsic_memory_barrier_image); break; + case SpvMemorySemanticsOutputMemoryMask: + if (b->nb.shader->info.stage == MESA_SHADER_TESS_CTRL) + vtn_emit_barrier(b, nir_intrinsic_memory_barrier_tcs_patch); + break; default: break;; } @@ -3209,7 +3663,7 @@ static void vtn_handle_barrier(struct vtn_builder *b, SpvOp opcode, - const uint32_t *w, unsigned count) + const uint32_t *w, UNUSED unsigned count) { switch (opcode) { case SpvOpEmitVertex: @@ -3257,13 +3711,47 @@ } case SpvOpControlBarrier: { + SpvScope execution_scope = vtn_constant_uint(b, w[1]); SpvScope memory_scope = vtn_constant_uint(b, w[2]); SpvMemorySemanticsMask memory_semantics = vtn_constant_uint(b, w[3]); + + /* GLSLang, prior to commit 8297936dd6eb3, emitted OpControlBarrier with + * memory semantics of None for GLSL barrier(). + * And before that, prior to c3f1cdfa, emitted the OpControlBarrier with + * Device instead of Workgroup for execution scope. + */ + if (b->wa_glslang_cs_barrier && + b->nb.shader->info.stage == MESA_SHADER_COMPUTE && + (execution_scope == SpvScopeWorkgroup || + execution_scope == SpvScopeDevice) && + memory_semantics == SpvMemorySemanticsMaskNone) { + execution_scope = SpvScopeWorkgroup; + memory_scope = SpvScopeWorkgroup; + memory_semantics = SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsWorkgroupMemoryMask; + } + + /* From the SPIR-V spec: + * + * "When used with the TessellationControl execution model, it also + * implicitly synchronizes the Output Storage Class: Writes to Output + * variables performed by any invocation executed prior to a + * OpControlBarrier will be visible to any other invocation after + * return from that OpControlBarrier." + */ + if (b->nb.shader->info.stage == MESA_SHADER_TESS_CTRL) { + memory_semantics &= ~(SpvMemorySemanticsAcquireMask | + SpvMemorySemanticsReleaseMask | + SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsSequentiallyConsistentMask); + memory_semantics |= SpvMemorySemanticsAcquireReleaseMask | + SpvMemorySemanticsOutputMemoryMask; + } + vtn_emit_memory_barrier(b, memory_scope, memory_semantics); - SpvScope execution_scope = vtn_constant_uint(b, w[1]); if (execution_scope == SpvScopeWorkgroup) - vtn_emit_barrier(b, nir_intrinsic_barrier); + vtn_emit_barrier(b, nir_intrinsic_control_barrier); break; } @@ -3433,10 +3921,10 @@ case SpvCapabilityInputAttachment: case SpvCapabilityImageGatherExtended: case SpvCapabilityStorageImageExtendedFormats: + case SpvCapabilityVector16: break; case SpvCapabilityLinkage: - case SpvCapabilityVector16: case SpvCapabilityFloat16Buffer: case SpvCapabilitySparseResidency: vtn_warn("Unsupported SPIR-V capability: %s", @@ -3573,6 +4061,8 @@ spv_check_supported(storage_16bit, cap); break; + case SpvCapabilityShaderLayer: + case SpvCapabilityShaderViewportIndex: case SpvCapabilityShaderViewportIndexLayerEXT: spv_check_supported(shader_viewport_index_layer, cap); break; @@ -3615,7 +4105,15 @@ spv_check_supported(post_depth_coverage, cap); break; - case SpvCapabilityPhysicalStorageBufferAddressesEXT: + case SpvCapabilityDenormFlushToZero: + case SpvCapabilityDenormPreserve: + case SpvCapabilitySignedZeroInfNanPreserve: + case SpvCapabilityRoundingModeRTE: + case SpvCapabilityRoundingModeRTZ: + spv_check_supported(float_controls, cap); + break; + + case SpvCapabilityPhysicalStorageBufferAddresses: spv_check_supported(physical_storage_buffer_address, cap); break; @@ -3640,6 +4138,30 @@ spv_check_supported(demote_to_helper_invocation, cap); break; + case SpvCapabilityShaderClockKHR: + spv_check_supported(shader_clock, cap); + break; + + case SpvCapabilityVulkanMemoryModel: + spv_check_supported(vk_memory_model, cap); + break; + + case SpvCapabilityVulkanMemoryModelDeviceScope: + spv_check_supported(vk_memory_model_device_scope, cap); + break; + + case SpvCapabilityImageReadWriteLodAMD: + spv_check_supported(amd_image_read_write_lod, cap); + break; + + case SpvCapabilityIntegerFunctions2INTEL: + spv_check_supported(integer_functions2, cap); + break; + + case SpvCapabilityFragmentMaskAMD: + spv_check_supported(amd_fragment_mask, cap); + break; + default: vtn_fail("Unhandled capability: %s (%u)", spirv_capability_to_string(cap), cap); @@ -3674,13 +4196,12 @@ case SpvAddressingModelLogical: vtn_fail_if(b->shader->info.stage >= MESA_SHADER_STAGES, "AddressingModelLogical only supported for shaders"); - b->shader->info.cs.ptr_size = 0; b->physical_ptrs = false; break; - case SpvAddressingModelPhysicalStorageBuffer64EXT: + case SpvAddressingModelPhysicalStorageBuffer64: vtn_fail_if(!b->options || !b->options->caps.physical_storage_buffer_address, - "AddressingModelPhysicalStorageBuffer64EXT not supported"); + "AddressingModelPhysicalStorageBuffer64 not supported"); break; default: vtn_fail("Unknown addressing model: %s (%u)", @@ -3688,9 +4209,20 @@ break; } - vtn_assert(w[2] == SpvMemoryModelSimple || - w[2] == SpvMemoryModelGLSL450 || - w[2] == SpvMemoryModelOpenCL); + switch (w[2]) { + case SpvMemoryModelSimple: + case SpvMemoryModelGLSL450: + case SpvMemoryModelOpenCL: + break; + case SpvMemoryModelVulkan: + vtn_fail_if(!b->options->caps.vk_memory_model, + "Vulkan memory model is unsupported by this driver"); + break; + default: + vtn_fail("Unsupported memory model: %s", + spirv_memorymodel_to_string(w[2])); + break; + } break; case SpvOpEntryPoint: @@ -3723,6 +4255,17 @@ vtn_handle_decoration(b, opcode, w, count); break; + case SpvOpExtInst: { + struct vtn_value *val = vtn_value(b, w[3], vtn_value_type_extension); + if (val->ext_handler == vtn_handle_non_semantic_instruction) { + /* NonSemantic extended instructions are acceptable in preamble. */ + vtn_handle_non_semantic_instruction(b, w[4], w, count); + return true; + } else { + return false; /* End of preamble. */ + } + } + default: return false; /* End of preamble */ } @@ -3732,7 +4275,7 @@ static void vtn_handle_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point, - const struct vtn_decoration *mode, void *data) + const struct vtn_decoration *mode, UNUSED void *data) { vtn_assert(b->entry_point == entry_point); @@ -3916,6 +4459,14 @@ b->shader->info.fs.sample_interlock_unordered = true; break; + case SpvExecutionModeDenormPreserve: + case SpvExecutionModeDenormFlushToZero: + case SpvExecutionModeSignedZeroInfNanPreserve: + case SpvExecutionModeRoundingModeRTE: + case SpvExecutionModeRoundingModeRTZ: + /* Already handled in vtn_handle_rounding_mode_in_execution_mode() */ + break; + default: vtn_fail("Unhandled execution mode: %s (%u)", spirv_executionmode_to_string(mode->exec_mode), @@ -3923,6 +4474,63 @@ } } +static void +vtn_handle_rounding_mode_in_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point, + const struct vtn_decoration *mode, void *data) +{ + vtn_assert(b->entry_point == entry_point); + + unsigned execution_mode = 0; + + switch(mode->exec_mode) { + case SpvExecutionModeDenormPreserve: + switch (mode->operands[0]) { + case 16: execution_mode = FLOAT_CONTROLS_DENORM_PRESERVE_FP16; break; + case 32: execution_mode = FLOAT_CONTROLS_DENORM_PRESERVE_FP32; break; + case 64: execution_mode = FLOAT_CONTROLS_DENORM_PRESERVE_FP64; break; + default: vtn_fail("Floating point type not supported"); + } + break; + case SpvExecutionModeDenormFlushToZero: + switch (mode->operands[0]) { + case 16: execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16; break; + case 32: execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; break; + case 64: execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64; break; + default: vtn_fail("Floating point type not supported"); + } + break; + case SpvExecutionModeSignedZeroInfNanPreserve: + switch (mode->operands[0]) { + case 16: execution_mode = FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16; break; + case 32: execution_mode = FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32; break; + case 64: execution_mode = FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64; break; + default: vtn_fail("Floating point type not supported"); + } + break; + case SpvExecutionModeRoundingModeRTE: + switch (mode->operands[0]) { + case 16: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16; break; + case 32: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32; break; + case 64: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64; break; + default: vtn_fail("Floating point type not supported"); + } + break; + case SpvExecutionModeRoundingModeRTZ: + switch (mode->operands[0]) { + case 16: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16; break; + case 32: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32; break; + case 64: execution_mode = FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64; break; + default: vtn_fail("Floating point type not supported"); + } + break; + + default: + break; + } + + b->shader->info.float_controls_execution_mode |= execution_mode; +} + static bool vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count) @@ -3996,6 +4604,14 @@ vtn_handle_variables(b, opcode, w, count); break; + case SpvOpExtInst: { + struct vtn_value *val = vtn_value(b, w[3], vtn_value_type_extension); + /* NonSemantic extended instructions are acceptable in preamble, others + * will indicate the end of preamble. + */ + return val->ext_handler == vtn_handle_non_semantic_instruction; + } + default: return false; /* End of preamble */ } @@ -4216,6 +4832,11 @@ break; } + case SpvOpFragmentMaskFetchAMD: + case SpvOpFragmentFetchAMD: + vtn_handle_texture(b, opcode, w, count); + break; + case SpvOpAtomicLoad: case SpvOpAtomicExchange: case SpvOpAtomicCompareExchange: @@ -4352,6 +4973,20 @@ case SpvOpVectorTimesMatrix: case SpvOpMatrixTimesVector: case SpvOpMatrixTimesMatrix: + case SpvOpUCountLeadingZerosINTEL: + case SpvOpUCountTrailingZerosINTEL: + case SpvOpAbsISubINTEL: + case SpvOpAbsUSubINTEL: + case SpvOpIAddSatINTEL: + case SpvOpUAddSatINTEL: + case SpvOpIAverageINTEL: + case SpvOpUAverageINTEL: + case SpvOpIAverageRoundedINTEL: + case SpvOpUAverageRoundedINTEL: + case SpvOpISubSatINTEL: + case SpvOpUSubSatINTEL: + case SpvOpIMul32x16INTEL: + case SpvOpUMul32x16INTEL: vtn_handle_alu(b, opcode, w, count); break; @@ -4477,6 +5112,41 @@ break; } + case SpvOpReadClockKHR: { + assert(vtn_constant_uint(b, w[3]) == SpvScopeSubgroup); + + /* Operation supports two result types: uvec2 and uint64_t. The NIR + * intrinsic gives uvec2, so pack the result for the other case. + */ + nir_intrinsic_instr *intrin = + nir_intrinsic_instr_create(b->nb.shader, nir_intrinsic_shader_clock); + nir_ssa_dest_init(&intrin->instr, &intrin->dest, 2, 32, NULL); + nir_builder_instr_insert(&b->nb, &intrin->instr); + + struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; + const struct glsl_type *dest_type = type->type; + nir_ssa_def *result; + + if (glsl_type_is_vector(dest_type)) { + assert(dest_type == glsl_vector_type(GLSL_TYPE_UINT, 2)); + result = &intrin->dest.ssa; + } else { + assert(glsl_type_is_scalar(dest_type)); + assert(glsl_get_base_type(dest_type) == GLSL_TYPE_UINT64); + result = nir_pack_64_2x32(&b->nb, &intrin->dest.ssa); + } + + struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); + val->type = type; + val->ssa = vtn_create_ssa_value(b, dest_type); + val->ssa->def = result; + break; + } + + case SpvOpLifetimeStart: + case SpvOpLifetimeStop: + break; + default: vtn_fail_with_opcode("Unhandled opcode", opcode); } @@ -4530,6 +5200,13 @@ */ b->wa_glslang_179 = (generator_id == 8 && generator_version == 1); + /* In GLSLang commit 8297936dd6eb3, their handling of barrier() was fixed + * to provide correct memory semantics on compute shader barrier() + * commands. Prior to that, we need to fix them up ourselves. This + * GLSLang fix caused them to bump to generator version 3. + */ + b->wa_glslang_cs_barrier = (generator_id == 8 && generator_version < 3); + /* words[2] == generator magic */ unsigned value_id_bound = words[3]; if (words[4] != 0) { @@ -4650,6 +5327,13 @@ if (stage == MESA_SHADER_GEOMETRY) b->shader->info.gs.invocations = 1; + /* Parse rounding mode execution modes. This has to happen earlier than + * other changes in the execution modes since they can affect, for example, + * the result of the floating point constants. + */ + vtn_foreach_execution_mode(b, b->entry_point, + vtn_handle_rounding_mode_in_execution_mode, NULL); + b->specializations = spec; b->num_specializations = num_spec; diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_alu.c mesa-20.0.8/src/compiler/spirv/vtn_alu.c --- mesa-19.2.8/src/compiler/spirv/vtn_alu.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_alu.c 2020-06-12 01:21:16.000000000 +0000 @@ -261,6 +261,21 @@ case SpvOpBitReverse: return nir_op_bitfield_reverse; case SpvOpBitCount: return nir_op_bit_count; + case SpvOpUCountLeadingZerosINTEL: return nir_op_uclz; + /* SpvOpUCountTrailingZerosINTEL is handled elsewhere. */ + case SpvOpAbsISubINTEL: return nir_op_uabs_isub; + case SpvOpAbsUSubINTEL: return nir_op_uabs_usub; + case SpvOpIAddSatINTEL: return nir_op_iadd_sat; + case SpvOpUAddSatINTEL: return nir_op_uadd_sat; + case SpvOpIAverageINTEL: return nir_op_ihadd; + case SpvOpUAverageINTEL: return nir_op_uhadd; + case SpvOpIAverageRoundedINTEL: return nir_op_irhadd; + case SpvOpUAverageRoundedINTEL: return nir_op_urhadd; + case SpvOpISubSatINTEL: return nir_op_isub_sat; + case SpvOpUSubSatINTEL: return nir_op_usub_sat; + case SpvOpIMul32x16INTEL: return nir_op_imul_32x16; + case SpvOpUMul32x16INTEL: return nir_op_umul_32x16; + /* The ordered / unordered operators need special implementation besides * the logical operator to use since they also need to check if operands are * ordered. @@ -640,6 +655,12 @@ break; } + case SpvOpUCountTrailingZerosINTEL: + val->ssa->def = nir_umin(&b->nb, + nir_find_lsb(&b->nb, src[0]), + nir_imm_int(&b->nb, 32u)); + break; + default: { bool swap; unsigned src_bit_size = glsl_get_bit_size(vtn_src[0]->type); diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_amd.c mesa-20.0.8/src/compiler/spirv/vtn_amd.c --- mesa-19.2.8/src/compiler/spirv/vtn_amd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_amd.c 2020-06-12 01:21:16.000000000 +0000 @@ -168,3 +168,67 @@ return true; } + +bool +vtn_handle_amd_shader_explicit_vertex_parameter_instruction(struct vtn_builder *b, SpvOp ext_opcode, + const uint32_t *w, unsigned count) +{ + const struct glsl_type *dest_type = + vtn_value(b, w[1], vtn_value_type_type)->type->type; + + struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); + val->ssa = vtn_create_ssa_value(b, dest_type); + + nir_intrinsic_op op; + switch ((enum ShaderExplicitVertexParameterAMD)ext_opcode) { + case InterpolateAtVertexAMD: + op = nir_intrinsic_interp_deref_at_vertex; + break; + default: + unreachable("unknown opcode"); + } + + nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->nb.shader, op); + + struct vtn_pointer *ptr = + vtn_value(b, w[5], vtn_value_type_pointer)->pointer; + nir_deref_instr *deref = vtn_pointer_to_deref(b, ptr); + + /* If the value we are interpolating has an index into a vector then + * interpolate the vector and index the result of that instead. This is + * necessary because the index will get generated as a series of nir_bcsel + * instructions so it would no longer be an input variable. + */ + const bool vec_array_deref = deref->deref_type == nir_deref_type_array && + glsl_type_is_vector(nir_deref_instr_parent(deref)->type); + + nir_deref_instr *vec_deref = NULL; + if (vec_array_deref) { + vec_deref = deref; + deref = nir_deref_instr_parent(deref); + } + intrin->src[0] = nir_src_for_ssa(&deref->dest.ssa); + intrin->src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def); + + intrin->num_components = glsl_get_vector_elements(deref->type); + nir_ssa_dest_init(&intrin->instr, &intrin->dest, + glsl_get_vector_elements(deref->type), + glsl_get_bit_size(deref->type), NULL); + + nir_builder_instr_insert(&b->nb, &intrin->instr); + + if (vec_array_deref) { + assert(vec_deref); + if (nir_src_is_const(vec_deref->arr.index)) { + val->ssa->def = vtn_vector_extract(b, &intrin->dest.ssa, + nir_src_as_uint(vec_deref->arr.index)); + } else { + val->ssa->def = vtn_vector_extract_dynamic(b, &intrin->dest.ssa, + vec_deref->arr.index.ssa); + } + } else { + val->ssa->def = &intrin->dest.ssa; + } + + return true; +} diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_cfg.c mesa-20.0.8/src/compiler/spirv/vtn_cfg.c --- mesa-19.2.8/src/compiler/spirv/vtn_cfg.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_cfg.c 2020-06-12 01:21:16.000000000 +0000 @@ -210,9 +210,9 @@ vtn_value(b, arg_id, vtn_value_type_sampled_image)->sampled_image; call->params[param_idx++] = - nir_src_for_ssa(&sampled_image->image->deref->dest.ssa); + nir_src_for_ssa(vtn_pointer_to_ssa(b, sampled_image->image)); call->params[param_idx++] = - nir_src_for_ssa(&sampled_image->sampler->deref->dest.ssa); + nir_src_for_ssa(vtn_pointer_to_ssa(b, sampled_image->sampler)); } else if (arg_type->base_type == vtn_base_type_pointer || arg_type->base_type == vtn_base_type_image || arg_type->base_type == vtn_base_type_sampler) { @@ -274,9 +274,12 @@ unsigned idx = 0; if (func_type->return_type->base_type != vtn_base_type_void) { + nir_address_format addr_format = + vtn_mode_to_address_format(b, vtn_variable_mode_function); /* The return value is a regular pointer */ func->params[idx++] = (nir_parameter) { - .num_components = 1, .bit_size = 32, + .num_components = nir_address_format_num_components(addr_format), + .bit_size = nir_address_format_bit_size(addr_format), }; } @@ -315,14 +318,17 @@ vtn_push_value(b, w[2], vtn_value_type_sampled_image); val->sampled_image = ralloc(b, struct vtn_sampled_image); - val->sampled_image->type = type; + + struct vtn_type *image_type = rzalloc(b, struct vtn_type); + image_type->base_type = vtn_base_type_image; + image_type->type = type->type; struct vtn_type *sampler_type = rzalloc(b, struct vtn_type); sampler_type->base_type = vtn_base_type_sampler; sampler_type->type = glsl_bare_sampler_type(); val->sampled_image->image = - vtn_load_param_pointer(b, type, b->func_param_idx++); + vtn_load_param_pointer(b, image_type, b->func_param_idx++); val->sampled_image->sampler = vtn_load_param_pointer(b, sampler_type, b->func_param_idx++); } else if (type->base_type == vtn_base_type_pointer && @@ -809,6 +815,11 @@ struct vtn_block *pred = vtn_value(b, w[i + 1], vtn_value_type_block)->block; + /* If block does not have end_nop, that is because it is an unreacheable + * block, and hence it is not worth to handle it */ + if (!pred->end_nop) + continue; + b->nb.cursor = nir_after_instr(&pred->end_nop->instr); struct vtn_ssa_value *src = vtn_ssa_value(b, w[i]); @@ -1002,7 +1013,7 @@ vtn_emit_cf_list(b, &vtn_loop->body, NULL, NULL, handler); - if (!list_empty(&vtn_loop->cont_body)) { + if (!list_is_empty(&vtn_loop->cont_body)) { /* If we have a non-trivial continue body then we need to put * it at the beginning of the loop with a flag to ensure that * it doesn't get executed in the first iteration. diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_glsl450.c mesa-20.0.8/src/compiler/spirv/vtn_glsl450.c --- mesa-19.2.8/src/compiler/spirv/vtn_glsl450.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_glsl450.c 2020-06-12 01:21:16.000000000 +0000 @@ -234,160 +234,10 @@ expr_tail))); } -/** - * Compute xs[0] + xs[1] + xs[2] + ... using fadd. - */ -static nir_ssa_def * -build_fsum(nir_builder *b, nir_ssa_def **xs, int terms) -{ - nir_ssa_def *accum = xs[0]; - - for (int i = 1; i < terms; i++) - accum = nir_fadd(b, accum, xs[i]); - - return accum; -} - -static nir_ssa_def * -build_atan(nir_builder *b, nir_ssa_def *y_over_x) -{ - const uint32_t bit_size = y_over_x->bit_size; - - nir_ssa_def *abs_y_over_x = nir_fabs(b, y_over_x); - nir_ssa_def *one = nir_imm_floatN_t(b, 1.0f, bit_size); - - /* - * range-reduction, first step: - * - * / y_over_x if |y_over_x| <= 1.0; - * x = < - * \ 1.0 / y_over_x otherwise - */ - nir_ssa_def *x = nir_fdiv(b, nir_fmin(b, abs_y_over_x, one), - nir_fmax(b, abs_y_over_x, one)); - - /* - * approximate atan by evaluating polynomial: - * - * x * 0.9999793128310355 - x^3 * 0.3326756418091246 + - * x^5 * 0.1938924977115610 - x^7 * 0.1173503194786851 + - * x^9 * 0.0536813784310406 - x^11 * 0.0121323213173444 - */ - nir_ssa_def *x_2 = nir_fmul(b, x, x); - nir_ssa_def *x_3 = nir_fmul(b, x_2, x); - nir_ssa_def *x_5 = nir_fmul(b, x_3, x_2); - nir_ssa_def *x_7 = nir_fmul(b, x_5, x_2); - nir_ssa_def *x_9 = nir_fmul(b, x_7, x_2); - nir_ssa_def *x_11 = nir_fmul(b, x_9, x_2); - - nir_ssa_def *polynomial_terms[] = { - nir_fmul_imm(b, x, 0.9999793128310355f), - nir_fmul_imm(b, x_3, -0.3326756418091246f), - nir_fmul_imm(b, x_5, 0.1938924977115610f), - nir_fmul_imm(b, x_7, -0.1173503194786851f), - nir_fmul_imm(b, x_9, 0.0536813784310406f), - nir_fmul_imm(b, x_11, -0.0121323213173444f), - }; - - nir_ssa_def *tmp = - build_fsum(b, polynomial_terms, ARRAY_SIZE(polynomial_terms)); - - /* range-reduction fixup */ - tmp = nir_fadd(b, tmp, - nir_fmul(b, nir_b2f(b, nir_flt(b, one, abs_y_over_x), bit_size), - nir_fadd_imm(b, nir_fmul_imm(b, tmp, -2.0f), M_PI_2f))); - - /* sign fixup */ - return nir_fmul(b, tmp, nir_fsign(b, y_over_x)); -} - -static nir_ssa_def * -build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x) -{ - assert(y->bit_size == x->bit_size); - const uint32_t bit_size = x->bit_size; - - nir_ssa_def *zero = nir_imm_floatN_t(b, 0, bit_size); - nir_ssa_def *one = nir_imm_floatN_t(b, 1, bit_size); - - /* If we're on the left half-plane rotate the coordinates π/2 clock-wise - * for the y=0 discontinuity to end up aligned with the vertical - * discontinuity of atan(s/t) along t=0. This also makes sure that we - * don't attempt to divide by zero along the vertical line, which may give - * unspecified results on non-GLSL 4.1-capable hardware. - */ - nir_ssa_def *flip = nir_fge(b, zero, x); - nir_ssa_def *s = nir_bcsel(b, flip, nir_fabs(b, x), y); - nir_ssa_def *t = nir_bcsel(b, flip, y, nir_fabs(b, x)); - - /* If the magnitude of the denominator exceeds some huge value, scale down - * the arguments in order to prevent the reciprocal operation from flushing - * its result to zero, which would cause precision problems, and for s - * infinite would cause us to return a NaN instead of the correct finite - * value. - * - * If fmin and fmax are respectively the smallest and largest positive - * normalized floating point values representable by the implementation, - * the constants below should be in agreement with: - * - * huge <= 1 / fmin - * scale <= 1 / fmin / fmax (for |t| >= huge) - * - * In addition scale should be a negative power of two in order to avoid - * loss of precision. The values chosen below should work for most usual - * floating point representations with at least the dynamic range of ATI's - * 24-bit representation. - */ - const double huge_val = bit_size >= 32 ? 1e18 : 16384; - nir_ssa_def *huge = nir_imm_floatN_t(b, huge_val, bit_size); - nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge), - nir_imm_floatN_t(b, 0.25, bit_size), one); - nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale)); - nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t); - - /* For |x| = |y| assume tan = 1 even if infinite (i.e. pretend momentarily - * that ∞/∞ = 1) in order to comply with the rather artificial rules - * inherited from IEEE 754-2008, namely: - * - * "atan2(±∞, −∞) is ±3π/4 - * atan2(±∞, +∞) is ±π/4" - * - * Note that this is inconsistent with the rules for the neighborhood of - * zero that are based on iterated limits: - * - * "atan2(±0, −0) is ±π - * atan2(±0, +0) is ±0" - * - * but GLSL specifically allows implementations to deviate from IEEE rules - * at (0,0), so we take that license (i.e. pretend that 0/0 = 1 here as - * well). - */ - nir_ssa_def *tan = nir_bcsel(b, nir_feq(b, nir_fabs(b, x), nir_fabs(b, y)), - one, nir_fabs(b, s_over_t)); - - /* Calculate the arctangent and fix up the result if we had flipped the - * coordinate system. - */ - nir_ssa_def *arc = - nir_fadd(b, nir_fmul_imm(b, nir_b2f(b, flip, bit_size), M_PI_2f), - build_atan(b, tan)); - - /* Rather convoluted calculation of the sign of the result. When x < 0 we - * cannot use fsign because we need to be able to distinguish between - * negative and positive zero. We don't use bitwise arithmetic tricks for - * consistency with the GLSL front-end. When x >= 0 rcp_scaled_t will - * always be non-negative so this won't be able to distinguish between - * negative and positive zero, but we don't care because atan2 is - * continuous along the whole positive y = 0 half-line, so it won't affect - * the result significantly. - */ - return nir_bcsel(b, nir_flt(b, nir_fmin(b, y, rcp_scaled_t), zero), - nir_fneg(b, arc), arc); -} - static nir_op vtn_nir_alu_op_for_spirv_glsl_opcode(struct vtn_builder *b, - enum GLSLstd450 opcode) + enum GLSLstd450 opcode, + unsigned execution_mode) { switch (opcode) { case GLSLstd450Round: return nir_op_fround_even; @@ -433,7 +283,11 @@ case GLSLstd450UnpackUnorm4x8: return nir_op_unpack_unorm_4x8; case GLSLstd450UnpackSnorm2x16: return nir_op_unpack_snorm_2x16; case GLSLstd450UnpackUnorm2x16: return nir_op_unpack_unorm_2x16; - case GLSLstd450UnpackHalf2x16: return nir_op_unpack_half_2x16; + case GLSLstd450UnpackHalf2x16: + if (execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) + return nir_op_unpack_half_2x16_flush_to_zero; + else + return nir_op_unpack_half_2x16; case GLSLstd450UnpackDouble2x32: return nir_op_unpack_64_2x32; default: @@ -604,25 +458,24 @@ return; case GLSLstd450Tanh: { - /* tanh(x) := (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x))) - * - * With a little algebra this reduces to (e^2x - 1) / (e^2x + 1) + /* tanh(x) := (e^x - e^(-x)) / (e^x + e^(-x)) * - * We clamp x to (-inf, +10] to avoid precision problems. When x > 10, - * e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the - * computation e^2x +/- 1 so it can be ignored. + * We clamp x to [-10, +10] to avoid precision problems. When x > 10, + * e^x dominates the sum, e^(-x) is lost and tanh(x) is 1.0 for 32 bit + * floating point. * - * For 16-bit precision we clamp x to (-inf, +4.2] since the maximum - * representable number is only 65,504 and e^(2*6) exceeds that. Also, - * if x > 4.2, tanh(x) will return 1.0 in fp16. + * For 16-bit precision this we clamp x to [-4.2, +4.2]. */ const uint32_t bit_size = src[0]->bit_size; const double clamped_x = bit_size > 16 ? 10.0 : 4.2; - nir_ssa_def *x = nir_fmin(nb, src[0], - nir_imm_floatN_t(nb, clamped_x, bit_size)); - nir_ssa_def *exp2x = build_exp(nb, nir_fmul_imm(nb, x, 2.0)); - val->ssa->def = nir_fdiv(nb, nir_fadd_imm(nb, exp2x, -1.0), - nir_fadd_imm(nb, exp2x, 1.0)); + nir_ssa_def *x = nir_fclamp(nb, src[0], + nir_imm_floatN_t(nb, -clamped_x, bit_size), + nir_imm_floatN_t(nb, clamped_x, bit_size)); + val->ssa->def = + nir_fdiv(nb, nir_fsub(nb, build_exp(nb, x), + build_exp(nb, nir_fneg(nb, x))), + nir_fadd(nb, build_exp(nb, x), + build_exp(nb, nir_fneg(nb, x)))); return; } @@ -657,11 +510,11 @@ return; case GLSLstd450Atan: - val->ssa->def = build_atan(nb, src[0]); + val->ssa->def = nir_atan(nb, src[0]); return; case GLSLstd450Atan2: - val->ssa->def = build_atan2(nb, src[0], src[1]); + val->ssa->def = nir_atan2(nb, src[0], src[1]); return; case GLSLstd450Frexp: { @@ -678,13 +531,16 @@ return; } - default: + default: { + unsigned execution_mode = + b->shader->info.float_controls_execution_mode; val->ssa->def = nir_build_alu(&b->nb, - vtn_nir_alu_op_for_spirv_glsl_opcode(b, entrypoint), + vtn_nir_alu_op_for_spirv_glsl_opcode(b, entrypoint, execution_mode), src[0], src[1], src[2], NULL); return; } + } } static void diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_opencl.c mesa-20.0.8/src/compiler/spirv/vtn_opencl.c --- mesa-19.2.8/src/compiler/spirv/vtn_opencl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_opencl.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,7 +25,6 @@ */ #include "math.h" - #include "nir/nir_builtin_builder.h" #include "vtn_private.h" @@ -129,6 +128,18 @@ return nir_uabs_diff(nb, srcs[0], srcs[1]); case OpenCLstd_Bitselect: return nir_bitselect(nb, srcs[0], srcs[1], srcs[2]); + case OpenCLstd_SMad_hi: + return nir_imad_hi(nb, srcs[0], srcs[1], srcs[2]); + case OpenCLstd_UMad_hi: + return nir_umad_hi(nb, srcs[0], srcs[1], srcs[2]); + case OpenCLstd_SMul24: + return nir_imul24(nb, srcs[0], srcs[1]); + case OpenCLstd_UMul24: + return nir_umul24(nb, srcs[0], srcs[1]); + case OpenCLstd_SMad24: + return nir_imad24(nb, srcs[0], srcs[1], srcs[2]); + case OpenCLstd_UMad24: + return nir_umad24(nb, srcs[0], srcs[1], srcs[2]); case OpenCLstd_FClamp: return nir_fclamp(nb, srcs[0], srcs[1], srcs[2]); case OpenCLstd_SClamp: @@ -173,6 +184,8 @@ return nir_rotate(nb, srcs[0], srcs[1]); case OpenCLstd_Smoothstep: return nir_smoothstep(nb, srcs[0], srcs[1], srcs[2]); + case OpenCLstd_Clz: + return nir_clz_u(nb, srcs[0]); case OpenCLstd_Select: return nir_select(nb, srcs[0], srcs[1], srcs[2]); case OpenCLstd_Step: @@ -199,25 +212,34 @@ const struct glsl_type *dest_type = type->type; unsigned components = glsl_get_vector_elements(dest_type); - unsigned stride = components * glsl_get_bit_size(dest_type) / 8; nir_ssa_def *offset = vtn_ssa_value(b, w[5 + a])->def; struct vtn_value *p = vtn_value(b, w[6 + a], vtn_value_type_pointer); - nir_deref_instr *deref = vtn_pointer_to_deref(b, p->pointer); + struct vtn_ssa_value *comps[NIR_MAX_VEC_COMPONENTS]; + nir_ssa_def *ncomps[NIR_MAX_VEC_COMPONENTS]; - /* 1. cast to vec type with adjusted stride */ - deref = nir_build_deref_cast(&b->nb, &deref->dest.ssa, deref->mode, - dest_type, stride); - /* 2. deref ptr_as_array */ - deref = nir_build_deref_ptr_as_array(&b->nb, deref, offset); + nir_ssa_def *moffset = nir_imul_imm(&b->nb, offset, components); + nir_deref_instr *deref = vtn_pointer_to_deref(b, p->pointer); + for (int i = 0; i < components; i++) { + nir_ssa_def *coffset = nir_iadd_imm(&b->nb, moffset, i); + nir_deref_instr *arr_deref = nir_build_deref_ptr_as_array(&b->nb, deref, coffset); + + if (load) { + comps[i] = vtn_local_load(b, arr_deref, p->type->access); + ncomps[i] = comps[i]->def; + } else { + struct vtn_ssa_value *ssa = vtn_create_ssa_value(b, glsl_scalar_type(glsl_get_base_type(dest_type))); + struct vtn_ssa_value *val = vtn_ssa_value(b, w[5]); + ssa->def = vtn_vector_extract(b, val->def, i); + vtn_local_store(b, ssa, arr_deref, p->type->access); + } + } if (load) { - struct vtn_ssa_value *val = vtn_local_load(b, deref, p->type->access); - vtn_push_ssa(b, w[2], type, val); - } else { - struct vtn_ssa_value *val = vtn_ssa_value(b, w[5]); - vtn_local_store(b, val, deref, p->type->access); + struct vtn_ssa_value *ssa = vtn_create_ssa_value(b, dest_type); + ssa->def = nir_vec(&b->nb, ncomps, components); + vtn_push_ssa(b, w[2], type, ssa); } } @@ -244,6 +266,52 @@ return nir_imm_int(&b->nb, -1); } +static nir_ssa_def * +handle_shuffle(struct vtn_builder *b, enum OpenCLstd_Entrypoints opcode, unsigned num_srcs, + nir_ssa_def **srcs, const struct glsl_type *dest_type) +{ + struct nir_ssa_def *input = srcs[0]; + struct nir_ssa_def *mask = srcs[1]; + + unsigned out_elems = glsl_get_vector_elements(dest_type); + nir_ssa_def *outres[NIR_MAX_VEC_COMPONENTS]; + unsigned in_elems = input->num_components; + if (mask->bit_size != 32) + mask = nir_u2u32(&b->nb, mask); + mask = nir_iand(&b->nb, mask, nir_imm_intN_t(&b->nb, in_elems - 1, mask->bit_size)); + for (unsigned i = 0; i < out_elems; i++) + outres[i] = nir_vector_extract(&b->nb, input, nir_channel(&b->nb, mask, i)); + + return nir_vec(&b->nb, outres, out_elems); +} + +static nir_ssa_def * +handle_shuffle2(struct vtn_builder *b, enum OpenCLstd_Entrypoints opcode, unsigned num_srcs, + nir_ssa_def **srcs, const struct glsl_type *dest_type) +{ + struct nir_ssa_def *input0 = srcs[0]; + struct nir_ssa_def *input1 = srcs[1]; + struct nir_ssa_def *mask = srcs[2]; + + unsigned out_elems = glsl_get_vector_elements(dest_type); + nir_ssa_def *outres[NIR_MAX_VEC_COMPONENTS]; + unsigned in_elems = input0->num_components; + unsigned total_mask = 2 * in_elems - 1; + unsigned half_mask = in_elems - 1; + if (mask->bit_size != 32) + mask = nir_u2u32(&b->nb, mask); + mask = nir_iand(&b->nb, mask, nir_imm_intN_t(&b->nb, total_mask, mask->bit_size)); + for (unsigned i = 0; i < out_elems; i++) { + nir_ssa_def *this_mask = nir_channel(&b->nb, mask, i); + nir_ssa_def *vmask = nir_iand(&b->nb, this_mask, nir_imm_intN_t(&b->nb, half_mask, mask->bit_size)); + nir_ssa_def *val0 = nir_vector_extract(&b->nb, input0, vmask); + nir_ssa_def *val1 = nir_vector_extract(&b->nb, input1, vmask); + nir_ssa_def *sel = nir_ilt(&b->nb, this_mask, nir_imm_intN_t(&b->nb, in_elems, mask->bit_size)); + outres[i] = nir_bcsel(&b->nb, sel, val0, val1); + } + return nir_vec(&b->nb, outres, out_elems); +} + bool vtn_handle_opencl_instruction(struct vtn_builder *b, SpvOp ext_opcode, const uint32_t *w, unsigned count) @@ -288,6 +356,12 @@ return true; case OpenCLstd_SAbs_diff: case OpenCLstd_UAbs_diff: + case OpenCLstd_SMad_hi: + case OpenCLstd_UMad_hi: + case OpenCLstd_SMad24: + case OpenCLstd_UMad24: + case OpenCLstd_SMul24: + case OpenCLstd_UMul24: case OpenCLstd_Bitselect: case OpenCLstd_FClamp: case OpenCLstd_SClamp: @@ -314,6 +388,7 @@ case OpenCLstd_Smoothstep: case OpenCLstd_S_Upsample: case OpenCLstd_U_Upsample: + case OpenCLstd_Clz: handle_instr(b, ext_opcode, w, count, handle_special); return true; case OpenCLstd_Vloadn: @@ -322,6 +397,12 @@ case OpenCLstd_Vstoren: vtn_handle_opencl_vstore(b, ext_opcode, w, count); return true; + case OpenCLstd_Shuffle: + handle_instr(b, ext_opcode, w, count, handle_shuffle); + return true; + case OpenCLstd_Shuffle2: + handle_instr(b, ext_opcode, w, count, handle_shuffle2); + return true; case OpenCLstd_Printf: handle_instr(b, ext_opcode, w, count, handle_printf); return true; diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_private.h mesa-20.0.8/src/compiler/spirv/vtn_private.h --- mesa-19.2.8/src/compiler/spirv/vtn_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_private.h 2020-06-12 01:21:16.000000000 +0000 @@ -534,10 +534,10 @@ struct vtn_pointer *image; nir_ssa_def *coord; nir_ssa_def *sample; + nir_ssa_def *lod; }; struct vtn_sampled_image { - struct vtn_type *type; struct vtn_pointer *image; /* Image or array of images */ struct vtn_pointer *sampler; /* Sampler */ }; @@ -626,6 +626,9 @@ /* True if we should watch out for GLSLang issue #179 */ bool wa_glslang_179; + /* True if we need to fix up CS OpControlBarrier */ + bool wa_glslang_cs_barrier; + gl_shader_stage entry_point_stage; const char *entry_point_name; struct vtn_value *entry_point; @@ -887,4 +890,15 @@ bool vtn_handle_amd_shader_trinary_minmax_instruction(struct vtn_builder *b, SpvOp ext_opcode, const uint32_t *words, unsigned count); + +bool vtn_handle_amd_shader_explicit_vertex_parameter_instruction(struct vtn_builder *b, + SpvOp ext_opcode, + const uint32_t *words, + unsigned count); + +SpvMemorySemanticsMask vtn_storage_class_to_memory_semantics(SpvStorageClass sc); + +void vtn_emit_memory_barrier(struct vtn_builder *b, SpvScope scope, + SpvMemorySemanticsMask semantics); + #endif /* _VTN_PRIVATE_H_ */ diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_subgroup.c mesa-20.0.8/src/compiler/spirv/vtn_subgroup.c --- mesa-19.2.8/src/compiler/spirv/vtn_subgroup.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_subgroup.c 2020-06-12 01:21:16.000000000 +0000 @@ -213,15 +213,22 @@ case SpvOpSubgroupAnyKHR: op = nir_intrinsic_vote_any; break; + case SpvOpSubgroupAllEqualKHR: + op = nir_intrinsic_vote_ieq; + break; case SpvOpGroupNonUniformAllEqual: - case SpvOpSubgroupAllEqualKHR: { - switch (glsl_get_base_type(val->type->type)) { + switch (glsl_get_base_type(vtn_ssa_value(b, w[4])->type)) { case GLSL_TYPE_FLOAT: + case GLSL_TYPE_FLOAT16: case GLSL_TYPE_DOUBLE: op = nir_intrinsic_vote_feq; break; case GLSL_TYPE_UINT: case GLSL_TYPE_INT: + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: case GLSL_TYPE_UINT64: case GLSL_TYPE_INT64: case GLSL_TYPE_BOOL: @@ -231,7 +238,6 @@ unreachable("Unhandled type"); } break; - } default: unreachable("Unhandled opcode"); } diff -Nru mesa-19.2.8/src/compiler/spirv/vtn_variables.c mesa-20.0.8/src/compiler/spirv/vtn_variables.c --- mesa-19.2.8/src/compiler/spirv/vtn_variables.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/compiler/spirv/vtn_variables.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,18 +30,49 @@ #include "nir_deref.h" #include -static void ptr_decoration_cb(struct vtn_builder *b, - struct vtn_value *val, int member, - const struct vtn_decoration *dec, - void *void_ptr); +static void +ptr_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member, + const struct vtn_decoration *dec, void *void_ptr) +{ + struct vtn_pointer *ptr = void_ptr; + + switch (dec->decoration) { + case SpvDecorationNonUniformEXT: + ptr->access |= ACCESS_NON_UNIFORM; + break; + + default: + break; + } +} + +static struct vtn_pointer* +vtn_decorate_pointer(struct vtn_builder *b, struct vtn_value *val, + struct vtn_pointer *ptr) +{ + struct vtn_pointer dummy = { .access = 0 }; + vtn_foreach_decoration(b, val, ptr_decoration_cb, &dummy); + + /* If we're adding access flags, make a copy of the pointer. We could + * probably just OR them in without doing so but this prevents us from + * leaking them any further than actually specified in the SPIR-V. + */ + if (dummy.access & ~ptr->access) { + struct vtn_pointer *copy = ralloc(b, struct vtn_pointer); + *copy = *ptr; + copy->access |= dummy.access; + return copy; + } + + return ptr; +} struct vtn_value * vtn_push_value_pointer(struct vtn_builder *b, uint32_t value_id, struct vtn_pointer *ptr) { struct vtn_value *val = vtn_push_value(b, value_id, vtn_value_type_pointer); - val->pointer = ptr; - vtn_foreach_decoration(b, val, ptr_decoration_cb, ptr); + val->pointer = vtn_decorate_pointer(b, val, ptr); return val; } @@ -1479,6 +1510,34 @@ *location = SYSTEM_VALUE_GLOBAL_GROUP_SIZE; set_mode_system_value(b, mode); break; + case SpvBuiltInBaryCoordNoPerspAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordNoPerspCentroidAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordNoPerspSampleAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordSmoothAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordSmoothCentroidAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordSmoothSampleAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE; + set_mode_system_value(b, mode); + break; + case SpvBuiltInBaryCoordPullModelAMD: + *location = SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL; + set_mode_system_value(b, mode); + break; default: vtn_fail("Unsupported builtin: %s (%u)", spirv_builtin_to_string(builtin), builtin); @@ -1499,6 +1558,9 @@ case SpvDecorationFlat: var_data->interpolation = INTERP_MODE_FLAT; break; + case SpvDecorationExplicitInterpAMD: + var_data->interpolation = INTERP_MODE_EXPLICIT; + break; case SpvDecorationCentroid: var_data->centroid = true; break; @@ -1512,20 +1574,20 @@ var_data->read_only = true; break; case SpvDecorationNonReadable: - var_data->image.access |= ACCESS_NON_READABLE; + var_data->access |= ACCESS_NON_READABLE; break; case SpvDecorationNonWritable: var_data->read_only = true; - var_data->image.access |= ACCESS_NON_WRITEABLE; + var_data->access |= ACCESS_NON_WRITEABLE; break; case SpvDecorationRestrict: - var_data->image.access |= ACCESS_RESTRICT; + var_data->access |= ACCESS_RESTRICT; break; case SpvDecorationVolatile: - var_data->image.access |= ACCESS_VOLATILE; + var_data->access |= ACCESS_VOLATILE; break; case SpvDecorationCoherent: - var_data->image.access |= ACCESS_COHERENT; + var_data->access |= ACCESS_COHERENT; break; case SpvDecorationComponent: var_data->location_frac = dec->operands[0]; @@ -1586,12 +1648,12 @@ case SpvDecorationXfbBuffer: var_data->explicit_xfb_buffer = true; - var_data->xfb_buffer = dec->operands[0]; + var_data->xfb.buffer = dec->operands[0]; var_data->always_active_io = true; break; case SpvDecorationXfbStride: var_data->explicit_xfb_stride = true; - var_data->xfb_stride = dec->operands[0]; + var_data->xfb.stride = dec->operands[0]; break; case SpvDecorationOffset: var_data->explicit_offset = true; @@ -1615,6 +1677,7 @@ break; case SpvDecorationUserSemantic: + case SpvDecorationUserTypeGOOGLE: /* User semantic decorations can safely be ignored by the driver. */ break; @@ -1753,22 +1816,6 @@ } } -static void -ptr_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member, - const struct vtn_decoration *dec, void *void_ptr) -{ - struct vtn_pointer *ptr = void_ptr; - - switch (dec->decoration) { - case SpvDecorationNonUniformEXT: - ptr->access |= ACCESS_NON_UNIFORM; - break; - - default: - break; - } -} - enum vtn_variable_mode vtn_storage_class_to_mode(struct vtn_builder *b, SpvStorageClass class, @@ -1796,13 +1843,23 @@ mode = vtn_variable_mode_ssbo; nir_mode = nir_var_mem_ssbo; break; - case SpvStorageClassPhysicalStorageBufferEXT: + case SpvStorageClassPhysicalStorageBuffer: mode = vtn_variable_mode_phys_ssbo; nir_mode = nir_var_mem_global; break; case SpvStorageClassUniformConstant: - mode = vtn_variable_mode_uniform; - nir_mode = nir_var_uniform; + if (b->shader->info.stage == MESA_SHADER_KERNEL) { + if (b->options->constant_as_global) { + mode = vtn_variable_mode_cross_workgroup; + nir_mode = nir_var_mem_global; + } else { + mode = vtn_variable_mode_ubo; + nir_mode = nir_var_mem_ubo; + } + } else { + mode = vtn_variable_mode_uniform; + nir_mode = nir_var_uniform; + } break; case SpvStorageClassPushConstant: mode = vtn_variable_mode_push_constant; @@ -1926,10 +1983,10 @@ /* In this case, we're looking for a block index and not an actual * deref. * - * For PhysicalStorageBufferEXT pointers, we don't have a block index + * For PhysicalStorageBuffer pointers, we don't have a block index * at all because we get the pointer directly from the client. This * assumes that there will never be a SSBO binding variable using the - * PhysicalStorageBufferEXT storage class. This assumption appears + * PhysicalStorageBuffer storage class. This assumption appears * to be correct according to the Vulkan spec because the table, * "Shader Resource and Storage Class Correspondence," the only the * Uniform storage class with BufferBlock or the StorageBuffer @@ -2013,10 +2070,10 @@ /* This is a pointer to something internal or a pointer inside a * block. It's just a regular cast. * - * For PhysicalStorageBufferEXT pointers, we don't have a block index + * For PhysicalStorageBuffer pointers, we don't have a block index * at all because we get the pointer directly from the client. This * assumes that there will never be a SSBO binding variable using the - * PhysicalStorageBufferEXT storage class. This assumption appears + * PhysicalStorageBuffer storage class. This assumption appears * to be correct according to the Vulkan spec because the table, * "Shader Resource and Storage Class Correspondence," the only the * Uniform storage class with BufferBlock or the StorageBuffer @@ -2150,7 +2207,7 @@ case vtn_variable_mode_phys_ssbo: vtn_fail("Cannot create a variable with the " - "PhysicalStorageBufferEXT storage class"); + "PhysicalStorageBuffer storage class"); break; default: @@ -2490,14 +2547,13 @@ struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_sampled_image); val->sampled_image = ralloc(b, struct vtn_sampled_image); - val->sampled_image->type = base_val->sampled_image->type; val->sampled_image->image = vtn_pointer_dereference(b, base_val->sampled_image->image, chain); val->sampled_image->sampler = base_val->sampled_image->sampler; - vtn_foreach_decoration(b, val, ptr_decoration_cb, - val->sampled_image->image); - vtn_foreach_decoration(b, val, ptr_decoration_cb, - val->sampled_image->sampler); + val->sampled_image->image = + vtn_decorate_pointer(b, val, val->sampled_image->image); + val->sampled_image->sampler = + vtn_decorate_pointer(b, val, val->sampled_image->sampler); } else { vtn_assert(base_val->value_type == vtn_value_type_pointer); struct vtn_pointer *ptr = @@ -2527,10 +2583,33 @@ vtn_assert_types_equal(b, opcode, res_type, src_val->type->deref); - if (glsl_type_is_image(res_type->type) || - glsl_type_is_sampler(res_type->type)) { + if (res_type->base_type == vtn_base_type_image || + res_type->base_type == vtn_base_type_sampler) { vtn_push_value_pointer(b, w[2], src); return; + } else if (res_type->base_type == vtn_base_type_sampled_image) { + struct vtn_value *val = + vtn_push_value(b, w[2], vtn_value_type_sampled_image); + val->sampled_image = ralloc(b, struct vtn_sampled_image); + val->sampled_image->image = val->sampled_image->sampler = + vtn_decorate_pointer(b, val, src); + return; + } + + if (count > 4) { + unsigned idx = 5; + SpvMemoryAccessMask access = w[4]; + if (access & SpvMemoryAccessAlignedMask) + idx++; + + if (access & SpvMemoryAccessMakePointerVisibleMask) { + SpvMemorySemanticsMask semantics = + SpvMemorySemanticsMakeVisibleMask | + vtn_storage_class_to_memory_semantics(src->ptr_type->storage_class); + + SpvScope scope = vtn_constant_uint(b, w[idx]); + vtn_emit_memory_barrier(b, scope, semantics); + } } vtn_push_ssa(b, w[2], res_type, vtn_variable_load(b, src)); @@ -2572,8 +2651,13 @@ vtn_warn("OpStore of a sampler detected. Doing on-the-fly copy " "propagation to workaround the problem."); vtn_assert(dest->var->copy_prop_sampler == NULL); - dest->var->copy_prop_sampler = - vtn_value(b, w[2], vtn_value_type_pointer)->pointer; + struct vtn_value *v = vtn_untyped_value(b, w[2]); + if (v->value_type == vtn_value_type_sampled_image) { + dest->var->copy_prop_sampler = v->sampled_image->sampler; + } else { + vtn_assert(v->value_type == vtn_value_type_pointer); + dest->var->copy_prop_sampler = v->pointer; + } } else { vtn_fail("Vulkan does not allow OpStore of a sampler or image."); } @@ -2582,6 +2666,22 @@ struct vtn_ssa_value *src = vtn_ssa_value(b, w[2]); vtn_variable_store(b, src, dest); + + if (count > 3) { + unsigned idx = 4; + SpvMemoryAccessMask access = w[3]; + + if (access & SpvMemoryAccessAlignedMask) + idx++; + + if (access & SpvMemoryAccessMakePointerAvailableMask) { + SpvMemorySemanticsMask semantics = + SpvMemorySemanticsMakeAvailableMask | + vtn_storage_class_to_memory_semantics(dest->ptr_type->storage_class); + SpvScope scope = vtn_constant_uint(b, w[idx]); + vtn_emit_memory_barrier(b, scope, semantics); + } + } break; } @@ -2652,7 +2752,7 @@ case SpvOpConvertUToPtr: { struct vtn_value *ptr_val = vtn_push_value(b, w[2], vtn_value_type_pointer); - struct vtn_value *u_val = vtn_value(b, w[3], vtn_value_type_ssa); + struct vtn_value *u_val = vtn_untyped_value(b, w[3]); vtn_fail_if(ptr_val->type->type == NULL, "OpConvertUToPtr can only be used on physical pointers"); @@ -2662,7 +2762,8 @@ "OpConvertUToPtr can only be used to cast from a vector or " "scalar type"); - nir_ssa_def *ptr_ssa = nir_sloppy_bitcast(&b->nb, u_val->ssa->def, + struct vtn_ssa_value *u_ssa = vtn_ssa_value(b, w[3]); + nir_ssa_def *ptr_ssa = nir_sloppy_bitcast(&b->nb, u_ssa->def, ptr_val->type->type); ptr_val->pointer = vtn_pointer_from_ssa(b, ptr_ssa, ptr_val->type); vtn_foreach_decoration(b, ptr_val, ptr_decoration_cb, ptr_val->pointer); diff -Nru mesa-19.2.8/src/drm-shim/drm_shim.c mesa-20.0.8/src/drm-shim/drm_shim.c --- mesa-19.2.8/src/drm-shim/drm_shim.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/drm-shim/drm_shim.c 2020-06-12 01:21:16.000000000 +0000 @@ -219,6 +219,7 @@ pipe(fds); write(fds[1], file_overrides[i].contents, strlen(file_overrides[i].contents)); + close(fds[1]); return fdopen(fds[0], "r"); } } diff -Nru mesa-19.2.8/src/egl/drivers/dri2/egl_dri2.c mesa-20.0.8/src/egl/drivers/dri2/egl_dri2.c --- mesa-19.2.8/src/egl/drivers/dri2/egl_dri2.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/egl_dri2.c 2020-06-12 01:21:16.000000000 +0000 @@ -65,6 +65,8 @@ #include "util/u_atomic.h" #include "util/u_vector.h" #include "mapi/glapi/glapi.h" +#include "util/bitscan.h" +#include "util/u_math.h" /* Additional definitions not yet in the drm_fourcc.h. */ @@ -82,6 +84,48 @@ #define NUM_ATTRIBS 12 +static const struct dri2_pbuffer_visual { + unsigned int dri_image_format; + int rgba_shifts[4]; + unsigned int rgba_sizes[4]; +} dri2_pbuffer_visuals[] = { + { + __DRI_IMAGE_FORMAT_ABGR16161616F, + { 0, 16, 32, 48 }, + { 16, 16, 16, 16 } + }, + { + __DRI_IMAGE_FORMAT_XBGR16161616F, + { 0, 16, 32, -1 }, + { 16, 16, 16, 0 } + }, + { + __DRI_IMAGE_FORMAT_ARGB2101010, + { 20, 10, 0, 30 }, + { 10, 10, 10, 2 } + }, + { + __DRI_IMAGE_FORMAT_XRGB2101010, + { 20, 10, 0, -1 }, + { 10, 10, 10, 0 } + }, + { + __DRI_IMAGE_FORMAT_ARGB8888, + { 16, 8, 0, 24 }, + { 8, 8, 8, 8 } + }, + { + __DRI_IMAGE_FORMAT_XRGB8888, + { 16, 8, 0, -1 }, + { 8, 8, 8, 0 } + }, + { + __DRI_IMAGE_FORMAT_RGB565, + { 11, 5, 0, -1 }, + { 5, 6, 5, 0 } + }, +}; + static void dri_set_background_context(void *loaderPrivate) { @@ -161,25 +205,102 @@ *h = dri2_surf->base.Height; } -/* HACK: technically we should have swrast_null, instead of these. We - * get away since only pbuffers are supported, thus the callbacks are - * unused. +static int +dri2_get_bytes_per_pixel(struct dri2_egl_surface *dri2_surf) +{ + const int depth = dri2_surf->base.Config->BufferSize; + return depth ? util_next_power_of_two(depth / 8) : 0; +} + +static void +dri2_put_image(__DRIdrawable * draw, int op, + int x, int y, int w, int h, + char *data, void *loaderPrivate) +{ + struct dri2_egl_surface *dri2_surf = loaderPrivate; + const int bpp = dri2_get_bytes_per_pixel(dri2_surf); + const int width = dri2_surf->base.Width; + const int height = dri2_surf->base.Height; + const int dst_stride = width*bpp; + const int src_stride = w*bpp; + const int x_offset = x*bpp; + int copy_width = src_stride; + + if (!dri2_surf->swrast_device_buffer) + dri2_surf->swrast_device_buffer = malloc(height*dst_stride); + + if (dri2_surf->swrast_device_buffer) { + const char *src = data; + char *dst = dri2_surf->swrast_device_buffer; + + dst += x_offset; + dst += y*dst_stride; + + /* Drivers are allowed to submit OOB PutImage requests, so clip here. */ + if (copy_width > dst_stride - x_offset) + copy_width = dst_stride - x_offset; + if (h > height - y) + h = height - y; + + for (; 0 < h; --h) { + memcpy(dst, src, copy_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void +dri2_get_image(__DRIdrawable * read, + int x, int y, int w, int h, + char *data, void *loaderPrivate) +{ + struct dri2_egl_surface *dri2_surf = loaderPrivate; + const int bpp = dri2_get_bytes_per_pixel(dri2_surf); + const int width = dri2_surf->base.Width; + const int height = dri2_surf->base.Height; + const int src_stride = width*bpp; + const int dst_stride = w*bpp; + const int x_offset = x*bpp; + int copy_width = dst_stride; + const char *src = dri2_surf->swrast_device_buffer; + char *dst = data; + + if (!src) { + memset(data, 0, copy_width * h); + return; + } + + src += x_offset; + src += y*src_stride; + + /* Drivers are allowed to submit OOB GetImage requests, so clip here. */ + if (copy_width > src_stride - x_offset) + copy_width = src_stride - x_offset; + if (h > height - y) + h = height - y; + + for (; 0 < h; --h) { + memcpy(dst, src, copy_width); + src += src_stride; + dst += dst_stride; + } + +} + +/* HACK: technically we should have swrast_null, instead of these. */ const __DRIswrastLoaderExtension swrast_pbuffer_loader_extension = { .base = { __DRI_SWRAST_LOADER, 1 }, .getDrawableInfo = dri2_get_pbuffer_drawable_info, - .putImage = NULL, - .getImage = NULL, + .putImage = dri2_put_image, + .getImage = dri2_get_image, }; static const EGLint dri2_to_egl_attribute_map[__DRI_ATTRIB_MAX] = { [__DRI_ATTRIB_BUFFER_SIZE ] = EGL_BUFFER_SIZE, [__DRI_ATTRIB_LEVEL] = EGL_LEVEL, - [__DRI_ATTRIB_RED_SIZE] = EGL_RED_SIZE, - [__DRI_ATTRIB_GREEN_SIZE] = EGL_GREEN_SIZE, - [__DRI_ATTRIB_BLUE_SIZE] = EGL_BLUE_SIZE, [__DRI_ATTRIB_LUMINANCE_SIZE] = EGL_LUMINANCE_SIZE, - [__DRI_ATTRIB_ALPHA_SIZE] = EGL_ALPHA_SIZE, [__DRI_ATTRIB_DEPTH_SIZE] = EGL_DEPTH_SIZE, [__DRI_ATTRIB_STENCIL_SIZE] = EGL_STENCIL_SIZE, [__DRI_ATTRIB_SAMPLE_BUFFERS] = EGL_SAMPLE_BUFFERS, @@ -214,10 +335,77 @@ return EGL_TRUE; } +void +dri2_get_shifts_and_sizes(const __DRIcoreExtension *core, + const __DRIconfig *config, int *shifts, + unsigned int *sizes) +{ + unsigned int mask; + + if (core->getConfigAttrib(config, __DRI_ATTRIB_RED_SHIFT, (unsigned int *)&shifts[0])) { + core->getConfigAttrib(config, __DRI_ATTRIB_GREEN_SHIFT, (unsigned int *)&shifts[1]); + core->getConfigAttrib(config, __DRI_ATTRIB_BLUE_SHIFT, (unsigned int *)&shifts[2]); + core->getConfigAttrib(config, __DRI_ATTRIB_ALPHA_SHIFT, (unsigned int *)&shifts[3]); + } else { + /* Driver isn't exposing shifts, so convert masks to shifts */ + core->getConfigAttrib(config, __DRI_ATTRIB_RED_MASK, &mask); + shifts[0] = ffs(mask) - 1; + core->getConfigAttrib(config, __DRI_ATTRIB_GREEN_MASK, &mask); + shifts[1] = ffs(mask) - 1; + core->getConfigAttrib(config, __DRI_ATTRIB_BLUE_MASK, &mask); + shifts[2] = ffs(mask) - 1; + core->getConfigAttrib(config, __DRI_ATTRIB_ALPHA_MASK, &mask); + shifts[3] = ffs(mask) - 1; + } + + core->getConfigAttrib(config, __DRI_ATTRIB_RED_SIZE, &sizes[0]); + core->getConfigAttrib(config, __DRI_ATTRIB_GREEN_SIZE, &sizes[1]); + core->getConfigAttrib(config, __DRI_ATTRIB_BLUE_SIZE, &sizes[2]); + core->getConfigAttrib(config, __DRI_ATTRIB_ALPHA_SIZE, &sizes[3]); +} + +void +dri2_get_render_type_float(const __DRIcoreExtension *core, + const __DRIconfig *config, + bool *is_float) +{ + unsigned int render_type; + + core->getConfigAttrib(config, __DRI_ATTRIB_RENDER_TYPE, &render_type); + *is_float = (render_type & __DRI_ATTRIB_FLOAT_BIT) ? true : false; +} + +unsigned int +dri2_image_format_for_pbuffer_config(struct dri2_egl_display *dri2_dpy, + const __DRIconfig *config) +{ + int shifts[4]; + unsigned int sizes[4]; + + dri2_get_shifts_and_sizes(dri2_dpy->core, config, shifts, sizes); + + for (unsigned i = 0; i < ARRAY_SIZE(dri2_pbuffer_visuals); ++i) { + const struct dri2_pbuffer_visual *visual = &dri2_pbuffer_visuals[i]; + + if (shifts[0] == visual->rgba_shifts[0] && + shifts[1] == visual->rgba_shifts[1] && + shifts[2] == visual->rgba_shifts[2] && + shifts[3] == visual->rgba_shifts[3] && + sizes[0] == visual->rgba_sizes[0] && + sizes[1] == visual->rgba_sizes[1] && + sizes[2] == visual->rgba_sizes[2] && + sizes[3] == visual->rgba_sizes[3]) { + return visual->dri_image_format; + } + } + + return __DRI_IMAGE_FORMAT_NONE; +} + struct dri2_egl_config * dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id, EGLint surface_type, const EGLint *attr_list, - const unsigned int *rgba_masks) + const int *rgba_shifts, const unsigned int *rgba_sizes) { struct dri2_egl_config *conf; struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); @@ -225,7 +413,8 @@ unsigned int attrib, value, double_buffer; bool srgb = false; EGLint key, bind_to_texture_rgb, bind_to_texture_rgba; - unsigned int dri_masks[4] = { 0, 0, 0, 0 }; + int dri_shifts[4] = { -1, -1, -1, -1 }; + unsigned int dri_sizes[4] = { 0, 0, 0, 0 }; _EGLConfig *matching_config; EGLint num_configs = 0; EGLint config_id; @@ -242,6 +431,9 @@ switch (attrib) { case __DRI_ATTRIB_RENDER_TYPE: + if (value & __DRI_ATTRIB_FLOAT_BIT) + _eglSetConfigKey(&base, EGL_COLOR_COMPONENT_TYPE_EXT, + EGL_COLOR_COMPONENT_TYPE_FLOAT_EXT); if (value & __DRI_ATTRIB_RGBA_BIT) value = EGL_RGB_BUFFER; else if (value & __DRI_ATTRIB_LUMINANCE_BIT) @@ -273,20 +465,56 @@ double_buffer = value; break; + case __DRI_ATTRIB_RED_SIZE: + dri_sizes[0] = value; + _eglSetConfigKey(&base, EGL_RED_SIZE, value); + break; + case __DRI_ATTRIB_RED_MASK: - dri_masks[0] = value; + dri_shifts[0] = ffs(value) - 1; + break; + + case __DRI_ATTRIB_RED_SHIFT: + dri_shifts[0] = value; + break; + + case __DRI_ATTRIB_GREEN_SIZE: + dri_sizes[1] = value; + _eglSetConfigKey(&base, EGL_GREEN_SIZE, value); break; case __DRI_ATTRIB_GREEN_MASK: - dri_masks[1] = value; + dri_shifts[1] = ffs(value) - 1; + break; + + case __DRI_ATTRIB_GREEN_SHIFT: + dri_shifts[1] = value; + break; + + case __DRI_ATTRIB_BLUE_SIZE: + dri_sizes[2] = value; + _eglSetConfigKey(&base, EGL_BLUE_SIZE, value); break; case __DRI_ATTRIB_BLUE_MASK: - dri_masks[2] = value; + dri_shifts[2] = ffs(value) - 1; + break; + + case __DRI_ATTRIB_BLUE_SHIFT: + dri_shifts[2] = value; + break; + + case __DRI_ATTRIB_ALPHA_SIZE: + dri_sizes[3] = value; + _eglSetConfigKey(&base, EGL_ALPHA_SIZE, value); break; case __DRI_ATTRIB_ALPHA_MASK: - dri_masks[3] = value; + dri_shifts[3] = ffs(value) - 1; + break; + + case __DRI_ATTRIB_ALPHA_SHIFT: + dri_shifts[3] = value; break; case __DRI_ATTRIB_ACCUM_RED_SIZE: @@ -328,7 +556,10 @@ for (int i = 0; attr_list[i] != EGL_NONE; i += 2) _eglSetConfigKey(&base, attr_list[i], attr_list[i+1]); - if (rgba_masks && memcmp(rgba_masks, dri_masks, sizeof(dri_masks))) + if (rgba_shifts && memcmp(rgba_shifts, dri_shifts, sizeof(dri_shifts))) + return NULL; + + if (rgba_sizes && memcmp(rgba_sizes, dri_sizes, sizeof(dri_sizes))) return NULL; base.NativeRenderable = EGL_TRUE; @@ -341,6 +572,22 @@ base.BindToTextureRGBA = bind_to_texture_rgba; } + if (double_buffer) { + surface_type &= ~EGL_PIXMAP_BIT; + } + + /* No support for pbuffer + MSAA for now. + * + * XXX TODO: pbuffer + MSAA does not work and causes crashes. + * See QT bugreport: https://bugreports.qt.io/browse/QTBUG-47509 + */ + if (base.Samples) { + surface_type &= ~EGL_PBUFFER_BIT; + } + + if (!surface_type) + return NULL; + base.RenderableType = disp->ClientAPIs; base.Conformant = disp->ClientAPIs; @@ -385,19 +632,6 @@ return NULL; } - if (double_buffer) { - surface_type &= ~EGL_PIXMAP_BIT; - } - - /* No support for pbuffer + MSAA for now. - * - * XXX TODO: pbuffer + MSAA does not work and causes crashes. - * See QT bugreport: https://bugreports.qt.io/browse/QTBUG-47509 - */ - if (base.Samples) { - surface_type &= ~EGL_PBUFFER_BIT; - } - conf->base.SurfaceType |= surface_type; return conf; @@ -552,6 +786,7 @@ if (!dri2_bind_extensions(dri2_dpy, driver_extensions, extensions, false)) { dlclose(dri2_dpy->driver); + dri2_dpy->driver = NULL; return EGL_FALSE; } dri2_dpy->driver_extensions = extensions; @@ -2343,6 +2578,8 @@ case DRM_FORMAT_ABGR2101010: case DRM_FORMAT_RGBA1010102: case DRM_FORMAT_BGRA1010102: + case DRM_FORMAT_XBGR16161616F: + case DRM_FORMAT_ABGR16161616F: case DRM_FORMAT_YUYV: case DRM_FORMAT_YVYU: case DRM_FORMAT_UYVY: diff -Nru mesa-19.2.8/src/egl/drivers/dri2/egl_dri2.h mesa-20.0.8/src/egl/drivers/dri2/egl_dri2.h --- mesa-19.2.8/src/egl/drivers/dri2/egl_dri2.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/egl_dri2.h 2020-06-12 01:21:16.000000000 +0000 @@ -83,7 +83,7 @@ #include "util/u_vector.h" #include "util/bitset.h" -#define EGL_DRI2_MAX_FORMATS 8 +#define EGL_DRI2_MAX_FORMATS 10 struct wl_buffer; @@ -338,6 +338,9 @@ int out_fence_fd; EGLBoolean enable_out_fence; + + /* swrast device */ + char *swrast_device_buffer; }; struct dri2_egl_config @@ -404,10 +407,24 @@ __DRIimage * dri2_lookup_egl_image(__DRIscreen *screen, void *image, void *data); +void +dri2_get_shifts_and_sizes(const __DRIcoreExtension *core, + const __DRIconfig *config, int *shifts, + unsigned int *sizes); + +void +dri2_get_render_type_float(const __DRIcoreExtension *core, + const __DRIconfig *config, + bool *is_float); + +unsigned int +dri2_image_format_for_pbuffer_config(struct dri2_egl_display *dri2_dpy, + const __DRIconfig *config); + struct dri2_egl_config * dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id, EGLint surface_type, const EGLint *attr_list, - const unsigned int *rgba_masks); + const int *rgba_shifts, const unsigned int *rgba_sizes); _EGLImage * dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp, diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_android.c mesa-20.0.8/src/egl/drivers/dri2/platform_android.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_android.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_android.c 2020-06-12 01:21:16.000000000 +0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include "loader.h" #include "egl_dri2.h" @@ -61,24 +62,24 @@ int chroma_step; /* Distance in bytes between subsequent chroma pixels. */ /* Result */ - int fourcc; /* __DRI_IMAGE_FOURCC_ */ + int fourcc; /* DRM_FORMAT_ */ }; /* The following table is used to look up a DRI image FourCC based * on native format and information contained in android_ycbcr struct. */ static const struct droid_yuv_format droid_yuv_formats[] = { /* Native format, YCrCb, Chroma step, DRI image FourCC */ - { HAL_PIXEL_FORMAT_YCbCr_420_888, YCbCr, 2, __DRI_IMAGE_FOURCC_NV12 }, - { HAL_PIXEL_FORMAT_YCbCr_420_888, YCbCr, 1, __DRI_IMAGE_FOURCC_YUV420 }, - { HAL_PIXEL_FORMAT_YCbCr_420_888, YCrCb, 1, __DRI_IMAGE_FOURCC_YVU420 }, - { HAL_PIXEL_FORMAT_YV12, YCrCb, 1, __DRI_IMAGE_FOURCC_YVU420 }, + { HAL_PIXEL_FORMAT_YCbCr_420_888, YCbCr, 2, DRM_FORMAT_NV12 }, + { HAL_PIXEL_FORMAT_YCbCr_420_888, YCbCr, 1, DRM_FORMAT_YUV420 }, + { HAL_PIXEL_FORMAT_YCbCr_420_888, YCrCb, 1, DRM_FORMAT_YVU420 }, + { HAL_PIXEL_FORMAT_YV12, YCrCb, 1, DRM_FORMAT_YVU420 }, /* HACK: See droid_create_image_from_prime_fds() and * https://issuetracker.google.com/32077885. */ - { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCbCr, 2, __DRI_IMAGE_FOURCC_NV12 }, - { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCbCr, 1, __DRI_IMAGE_FOURCC_YUV420 }, - { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, __DRI_IMAGE_FOURCC_YVU420 }, - { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, __DRI_IMAGE_FOURCC_AYUV }, - { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, __DRI_IMAGE_FOURCC_XYUV8888 }, + { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCbCr, 2, DRM_FORMAT_NV12 }, + { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCbCr, 1, DRM_FORMAT_YUV420 }, + { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, DRM_FORMAT_YVU420 }, + { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, DRM_FORMAT_AYUV }, + { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, YCrCb, 1, DRM_FORMAT_XYUV8888 }, }; static int @@ -109,6 +110,9 @@ int bpp; switch (native) { + case HAL_PIXEL_FORMAT_RGBA_FP16: + bpp = 8; + break; case HAL_PIXEL_FORMAT_RGBA_8888: case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED: /* @@ -117,6 +121,7 @@ */ case HAL_PIXEL_FORMAT_RGBX_8888: case HAL_PIXEL_FORMAT_BGRA_8888: + case HAL_PIXEL_FORMAT_RGBA_1010102: bpp = 4; break; case HAL_PIXEL_FORMAT_RGB_565: @@ -134,15 +139,17 @@ static int get_fourcc(int native) { switch (native) { - case HAL_PIXEL_FORMAT_RGB_565: return __DRI_IMAGE_FOURCC_RGB565; - case HAL_PIXEL_FORMAT_BGRA_8888: return __DRI_IMAGE_FOURCC_ARGB8888; - case HAL_PIXEL_FORMAT_RGBA_8888: return __DRI_IMAGE_FOURCC_ABGR8888; + case HAL_PIXEL_FORMAT_RGB_565: return DRM_FORMAT_RGB565; + case HAL_PIXEL_FORMAT_BGRA_8888: return DRM_FORMAT_ARGB8888; + case HAL_PIXEL_FORMAT_RGBA_8888: return DRM_FORMAT_ABGR8888; case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED: /* * HACK: Hardcode this to RGBX_8888 as per cros_gralloc hack. * TODO: Remove this once https://issuetracker.google.com/32077885 is fixed. */ - case HAL_PIXEL_FORMAT_RGBX_8888: return __DRI_IMAGE_FOURCC_XBGR8888; + case HAL_PIXEL_FORMAT_RGBX_8888: return DRM_FORMAT_XBGR8888; + case HAL_PIXEL_FORMAT_RGBA_FP16: return DRM_FORMAT_ABGR16161616F; + case HAL_PIXEL_FORMAT_RGBA_1010102: return DRM_FORMAT_ABGR2101010; default: _eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x", native); } @@ -161,6 +168,8 @@ * TODO: Revert this once https://issuetracker.google.com/32077885 is fixed. */ case HAL_PIXEL_FORMAT_RGBX_8888: return __DRI_IMAGE_FORMAT_XBGR8888; + case HAL_PIXEL_FORMAT_RGBA_FP16: return __DRI_IMAGE_FORMAT_ABGR16161616F; + case HAL_PIXEL_FORMAT_RGBA_1010102: return __DRI_IMAGE_FORMAT_ABGR2101010; default: _eglLog(_EGL_WARNING, "unsupported native buffer format 0x%x", format); } @@ -366,6 +375,10 @@ if (type == EGL_WINDOW_BIT) { int format; int buffer_count; + int min_buffer_count, max_buffer_count; + + /* Prefer triple buffering for performance reasons. */ + const int preferred_buffer_count = 3; if (window->common.magic != ANDROID_NATIVE_WINDOW_MAGIC) { _eglError(EGL_BAD_NATIVE_WINDOW, "droid_create_surface"); @@ -376,25 +389,41 @@ goto cleanup_surface; } - /* Query ANativeWindow for MIN_UNDEQUEUED_BUFFER, set buffer count - * and allocate color_buffers. + /* Query ANativeWindow for MIN_UNDEQUEUED_BUFFER, minimum amount + * of undequeued buffers. */ if (window->query(window, NATIVE_WINDOW_MIN_UNDEQUEUED_BUFFERS, - &buffer_count)) { + &min_buffer_count)) { + _eglError(EGL_BAD_NATIVE_WINDOW, "droid_create_surface"); + goto cleanup_surface; + } + + /* Query for maximum buffer count, application can set this + * to limit the total amount of buffers. + */ + if (window->query(window, NATIVE_WINDOW_MAX_BUFFER_COUNT, + &max_buffer_count)) { _eglError(EGL_BAD_NATIVE_WINDOW, "droid_create_surface"); goto cleanup_surface; } - if (native_window_set_buffer_count(window, buffer_count+1)) { + + /* Clamp preferred between minimum (min undequeued + 1 dequeued) + * and maximum. + */ + buffer_count = CLAMP(preferred_buffer_count, min_buffer_count + 1, + max_buffer_count); + + if (native_window_set_buffer_count(window, buffer_count)) { _eglError(EGL_BAD_NATIVE_WINDOW, "droid_create_surface"); goto cleanup_surface; } - dri2_surf->color_buffers = calloc(buffer_count+1, + dri2_surf->color_buffers = calloc(buffer_count, sizeof(*dri2_surf->color_buffers)); if (!dri2_surf->color_buffers) { _eglError(EGL_BAD_ALLOC, "droid_create_surface"); goto cleanup_surface; } - dri2_surf->color_buffers_count = buffer_count+1; + dri2_surf->color_buffers_count = buffer_count; if (format != dri2_conf->base.NativeVisualID) { _eglLog(_EGL_WARNING, "Native format mismatch: 0x%x != 0x%x", @@ -1149,12 +1178,16 @@ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); static const struct { int format; - unsigned int rgba_masks[4]; + int rgba_shifts[4]; + unsigned int rgba_sizes[4]; } visuals[] = { - { HAL_PIXEL_FORMAT_RGBA_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 } }, - { HAL_PIXEL_FORMAT_RGBX_8888, { 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 } }, - { HAL_PIXEL_FORMAT_RGB_565, { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 } }, - { HAL_PIXEL_FORMAT_BGRA_8888, { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 } }, + { HAL_PIXEL_FORMAT_RGBA_8888, { 0, 8, 16, 24 }, { 8, 8, 8, 8 } }, + { HAL_PIXEL_FORMAT_RGBX_8888, { 0, 8, 16, -1 }, { 8, 8, 8, 0 } }, + { HAL_PIXEL_FORMAT_RGB_565, { 11, 5, 0, -1 }, { 5, 6, 5, 0 } }, + /* This must be after HAL_PIXEL_FORMAT_RGBA_8888, we only keep BGRA + * visual if it turns out RGBA visual is not available. + */ + { HAL_PIXEL_FORMAT_BGRA_8888, { 16, 8, 0, 24 }, { 8, 8, 8, 8 } }, }; unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 }; @@ -1178,7 +1211,13 @@ * (chadversary) testing on Android Nougat, this was good enough to pacify * the buggy clients. */ + bool has_rgba = false; for (int i = 0; i < ARRAY_SIZE(visuals); i++) { + /* Only enable BGRA configs when RGBA is not available. BGRA configs are + * buggy on stock Android. + */ + if (visuals[i].format == HAL_PIXEL_FORMAT_BGRA_8888 && has_rgba) + continue; for (int j = 0; dri2_dpy->driver_configs[j]; j++) { const EGLint surface_type = EGL_WINDOW_BIT | EGL_PBUFFER_BIT; @@ -1193,13 +1232,15 @@ struct dri2_egl_config *dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[j], config_count + 1, surface_type, config_attrs, - visuals[i].rgba_masks); + visuals[i].rgba_shifts, visuals[i].rgba_sizes); if (dri2_conf) { if (dri2_conf->base.ConfigID == config_count + 1) config_count++; format_count[i]++; } } + if (visuals[i].format == HAL_PIXEL_FORMAT_RGBA_8888 && format_count[i]) + has_rgba = true; } for (int i = 0; i < ARRAY_SIZE(format_count); i++) { diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_device.c mesa-20.0.8/src/egl/drivers/dri2/platform_device.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_device.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_device.c 2020-06-12 01:21:16.000000000 +0000 @@ -66,6 +66,9 @@ dri2_dpy->image->destroyImage(dri2_surf->front); dri2_surf->front = NULL; } + + free(dri2_surf->swrast_device_buffer); + dri2_surf->swrast_device_buffer = NULL; } static int @@ -142,15 +145,12 @@ goto cleanup_surface; } - if (!dri2_create_drawable(dri2_dpy, config, dri2_surf, dri2_surf)) + dri2_surf->visual = dri2_image_format_for_pbuffer_config(dri2_dpy, config); + if (dri2_surf->visual == __DRI_IMAGE_FORMAT_NONE) goto cleanup_surface; - if (conf->RedSize == 5) - dri2_surf->visual = __DRI_IMAGE_FORMAT_RGB565; - else if (conf->AlphaSize == 0) - dri2_surf->visual = __DRI_IMAGE_FORMAT_XRGB8888; - else - dri2_surf->visual = __DRI_IMAGE_FORMAT_ARGB8888; + if (!dri2_create_drawable(dri2_dpy, config, dri2_surf, dri2_surf)) + goto cleanup_surface; return &dri2_surf->base; @@ -188,11 +188,14 @@ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); static const struct { const char *format_name; - unsigned int rgba_masks[4]; + int rgba_shifts[4]; + unsigned int rgba_sizes[4]; } visuals[] = { - { "ARGB8888", { 0xff0000, 0xff00, 0xff, 0xff000000 } }, - { "RGB888", { 0xff0000, 0xff00, 0xff, 0x0 } }, - { "RGB565", { 0x00f800, 0x07e0, 0x1f, 0x0 } }, + { "A2RGB10", { 20, 10, 0, 30 }, { 10, 10, 10, 2 } }, + { "X2RGB10", { 20, 10, 0, -1 }, { 10, 10, 10, 0 } }, + { "ARGB8888", { 16, 8, 0, 24 }, { 8, 8, 8, 8 } }, + { "RGB888", { 16, 8, 0, -1 }, { 8, 8, 8, 0 } }, + { "RGB565", { 11, 5, 0, -1 }, { 5, 6, 5, 0 } }, }; unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 }; unsigned int config_count = 0; @@ -203,7 +206,7 @@ dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i], config_count + 1, EGL_PBUFFER_BIT, NULL, - visuals[j].rgba_masks); + visuals[j].rgba_shifts, visuals[j].rgba_sizes); if (dri2_conf) { if (dri2_conf->base.ConfigID == config_count + 1) diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_drm.c mesa-20.0.8/src/egl/drivers/dri2/platform_drm.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_drm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_drm.c 2020-06-12 01:21:16.000000000 +0000 @@ -96,7 +96,9 @@ struct gbm_surface *surface) { const struct gbm_dri_visual *visual = NULL; - unsigned int red, green, blue, alpha; + int shifts[4]; + unsigned int sizes[4]; + bool is_float; int i; /* Check that the EGLConfig being used to render to the surface is @@ -104,10 +106,9 @@ * otherwise-compatible formats is relatively common, explicitly allow * this. */ - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_RED_MASK, &red); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_GREEN_MASK, &green); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_BLUE_MASK, &blue); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_ALPHA_MASK, &alpha); + dri2_get_shifts_and_sizes(dri2_dpy->core, config, shifts, sizes); + + dri2_get_render_type_float(dri2_dpy->core, config, &is_float); for (i = 0; i < dri2_dpy->gbm_dri->num_visuals; i++) { visual = &dri2_dpy->gbm_dri->visual_table[i]; @@ -118,10 +119,17 @@ if (i == dri2_dpy->gbm_dri->num_visuals) return false; - if (red != visual->rgba_masks.red || - green != visual->rgba_masks.green || - blue != visual->rgba_masks.blue || - (alpha && visual->rgba_masks.alpha && alpha != visual->rgba_masks.alpha)) { + if (shifts[0] != visual->rgba_shifts.red || + shifts[1] != visual->rgba_shifts.green || + shifts[2] != visual->rgba_shifts.blue || + (shifts[3] > -1 && visual->rgba_shifts.alpha > -1 && + shifts[3] != visual->rgba_shifts.alpha) || + sizes[0] != visual->rgba_sizes.red || + sizes[1] != visual->rgba_sizes.green || + sizes[2] != visual->rgba_sizes.blue || + (sizes[3] > 0 && visual->rgba_sizes.alpha > 0 && + sizes[3] != visual->rgba_sizes.alpha) || + is_float != visual->is_float) { return false; } @@ -612,24 +620,27 @@ memset(format_count, 0, num_visuals * sizeof(unsigned int)); for (unsigned i = 0; dri2_dpy->driver_configs[i]; i++) { - unsigned int red, green, blue, alpha; + const __DRIconfig *config = dri2_dpy->driver_configs[i]; + int shifts[4]; + unsigned int sizes[4]; + bool is_float; + + dri2_get_shifts_and_sizes(dri2_dpy->core, config, shifts, sizes); - dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i], - __DRI_ATTRIB_RED_MASK, &red); - dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i], - __DRI_ATTRIB_GREEN_MASK, &green); - dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i], - __DRI_ATTRIB_BLUE_MASK, &blue); - dri2_dpy->core->getConfigAttrib(dri2_dpy->driver_configs[i], - __DRI_ATTRIB_ALPHA_MASK, &alpha); + dri2_get_render_type_float(dri2_dpy->core, config, &is_float); for (unsigned j = 0; j < num_visuals; j++) { struct dri2_egl_config *dri2_conf; - if (visuals[j].rgba_masks.red != red || - visuals[j].rgba_masks.green != green || - visuals[j].rgba_masks.blue != blue || - visuals[j].rgba_masks.alpha != alpha) + if (visuals[j].rgba_shifts.red != shifts[0] || + visuals[j].rgba_shifts.green != shifts[1] || + visuals[j].rgba_shifts.blue != shifts[2] || + visuals[j].rgba_shifts.alpha != shifts[3] || + visuals[j].rgba_sizes.red != sizes[0] || + visuals[j].rgba_sizes.green != sizes[1] || + visuals[j].rgba_sizes.blue != sizes[2] || + visuals[j].rgba_sizes.alpha != sizes[3] || + visuals[j].is_float != is_float) continue; const EGLint attr_list[] = { @@ -638,7 +649,7 @@ }; dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i], - config_count + 1, EGL_WINDOW_BIT, attr_list, NULL); + config_count + 1, EGL_WINDOW_BIT, attr_list, NULL, NULL); if (dri2_conf) { if (dri2_conf->base.ConfigID == config_count + 1) config_count++; @@ -684,10 +695,6 @@ struct gbm_device *gbm; const char *err; - /* Not supported yet */ - if (disp->Options.ForceSoftware) - return EGL_FALSE; - dri2_dpy = calloc(1, sizeof *dri2_dpy); if (!dri2_dpy) return _eglError(EGL_BAD_ALLOC, "eglInitialize"); @@ -721,7 +728,7 @@ goto cleanup; } - dev = _eglAddDevice(dri2_dpy->fd, false); + dev = _eglAddDevice(dri2_dpy->fd, disp->Options.ForceSoftware); if (!dev) { err = "DRI2: failed to find EGLDevice"; goto cleanup; @@ -730,6 +737,21 @@ disp->Device = dev; dri2_dpy->driver_name = strdup(dri2_dpy->gbm_dri->driver_name); + dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER; + + /* render nodes cannot use Gem names, and thus do not support + * the __DRI_DRI2_LOADER extension */ + if (!dri2_dpy->is_render_node) { + if (!dri2_load_driver(disp)) { + err = "DRI2: failed to load driver"; + goto cleanup; + } + } else { + if (!dri2_load_driver_dri3(disp)) { + err = "DRI3: failed to load driver"; + goto cleanup; + } + } dri2_dpy->dri_screen = dri2_dpy->gbm_dri->screen; dri2_dpy->core = dri2_dpy->gbm_dri->core; diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_surfaceless.c mesa-20.0.8/src/egl/drivers/dri2/platform_surfaceless.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_surfaceless.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_surfaceless.c 2020-06-12 01:21:16.000000000 +0000 @@ -60,6 +60,9 @@ dri2_dpy->image->destroyImage(dri2_surf->front); dri2_surf->front = NULL; } + + free(dri2_surf->swrast_device_buffer); + dri2_surf->swrast_device_buffer = NULL; } static int @@ -136,15 +139,12 @@ goto cleanup_surface; } - if (!dri2_create_drawable(dri2_dpy, config, dri2_surf, dri2_surf)) + dri2_surf->visual = dri2_image_format_for_pbuffer_config(dri2_dpy, config); + if (dri2_surf->visual == __DRI_IMAGE_FORMAT_NONE) goto cleanup_surface; - if (conf->RedSize == 5) - dri2_surf->visual = __DRI_IMAGE_FORMAT_RGB565; - else if (conf->AlphaSize == 0) - dri2_surf->visual = __DRI_IMAGE_FORMAT_XRGB8888; - else - dri2_surf->visual = __DRI_IMAGE_FORMAT_ARGB8888; + if (!dri2_create_drawable(dri2_dpy, config, dri2_surf, dri2_surf)) + goto cleanup_surface; return &dri2_surf->base; @@ -182,11 +182,16 @@ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); static const struct { const char *format_name; - unsigned int rgba_masks[4]; + int rgba_shifts[4]; + unsigned int rgba_sizes[4]; } visuals[] = { - { "ARGB8888", { 0xff0000, 0xff00, 0xff, 0xff000000 } }, - { "RGB888", { 0xff0000, 0xff00, 0xff, 0x0 } }, - { "RGB565", { 0x00f800, 0x07e0, 0x1f, 0x0 } }, + { "ABGR16F", { 0, 16, 32, 48 }, { 16, 16, 16, 16 } }, + { "XBGR16F", { 0, 16, 32, -1 }, { 16, 16, 16, 0 } }, + { "A2RGB10", { 20, 10, 0, 30 }, { 10, 10, 10, 2 } }, + { "X2RGB10", { 20, 10, 0, -1 }, { 10, 10, 10, 0 } }, + { "ARGB8888", { 16, 8, 0, 24 }, { 8, 8, 8, 8 } }, + { "RGB888", { 16, 8, 0, -1 }, { 8, 8, 8, 0 } }, + { "RGB565", { 11, 5, 0, -1 }, { 5, 6, 5, 0 } }, }; unsigned int format_count[ARRAY_SIZE(visuals)] = { 0 }; unsigned int config_count = 0; @@ -197,7 +202,7 @@ dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i], config_count + 1, EGL_PBUFFER_BIT, NULL, - visuals[j].rgba_masks); + visuals[j].rgba_shifts, visuals[j].rgba_sizes); if (dri2_conf) { if (dri2_conf->base.ConfigID == config_count + 1) @@ -236,10 +241,23 @@ { } +static unsigned +surfaceless_get_capability(void *loaderPrivate, enum dri_loader_cap cap) +{ + /* Note: loaderPrivate is _EGLDisplay* */ + switch (cap) { + case DRI_LOADER_CAP_FP16: + return 1; + default: + return 0; + } +} + static const __DRIimageLoaderExtension image_loader_extension = { - .base = { __DRI_IMAGE_LOADER, 1 }, + .base = { __DRI_IMAGE_LOADER, 2 }, .getBuffers = surfaceless_image_get_buffers, .flushFrontBuffer = surfaceless_flush_front_buffer, + .getCapability = surfaceless_get_capability, }; static const __DRIextension *image_loader_extensions[] = { @@ -391,6 +409,10 @@ } dri2_setup_screen(disp); +#ifdef HAVE_WAYLAND_PLATFORM + dri2_dpy->device_name = loader_get_device_name_for_fd(dri2_dpy->fd); +#endif + dri2_set_WL_bind_wayland_display(drv, disp); if (!surfaceless_add_configs_for_visuals(drv, disp)) { err = "DRI2: failed to add configs"; diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_wayland.c mesa-20.0.8/src/egl/drivers/dri2/platform_wayland.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_wayland.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_wayland.c 2020-06-12 01:21:16.000000000 +0000 @@ -51,6 +51,13 @@ #include "wayland-drm-client-protocol.h" #include "linux-dmabuf-unstable-v1-client-protocol.h" +/* cheesy workaround until wayland 1.18 is released */ +#if WAYLAND_VERSION_MAJOR > 1 || \ + (WAYLAND_VERSION_MAJOR == 1 && WAYLAND_VERSION_MINOR < 18) +#define WL_SHM_FORMAT_ABGR16161616F 0x48344241 +#define WL_SHM_FORMAT_XBGR16161616F 0x48344258 +#endif + /* * The index of entries in this table is used as a bitmask in * dri2_dpy->formats, which tracks the formats supported by our server. @@ -69,49 +76,71 @@ */ int alt_dri_image_format; int bpp; - unsigned int rgba_masks[4]; + int rgba_shifts[4]; + unsigned int rgba_sizes[4]; } dri2_wl_visuals[] = { { - "XRGB2101010", - WL_DRM_FORMAT_XRGB2101010, WL_SHM_FORMAT_XRGB2101010, - __DRI_IMAGE_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XBGR2101010, 32, - { 0x3ff00000, 0x000ffc00, 0x000003ff, 0x00000000 } + "ABGR16F", + WL_DRM_FORMAT_ABGR16F, WL_SHM_FORMAT_ABGR16161616F, + __DRI_IMAGE_FORMAT_ABGR16161616F, 0, 64, + { 0, 16, 32, 48 }, + { 16, 16, 16, 16 }, + }, + { + "XBGR16F", + WL_DRM_FORMAT_XBGR16F, WL_SHM_FORMAT_XBGR16161616F, + __DRI_IMAGE_FORMAT_XBGR16161616F, 0, 64, + { 0, 16, 32, -1 }, + { 16, 16, 16, 0 }, }, { - "ARGB2101010", - WL_DRM_FORMAT_ARGB2101010, WL_SHM_FORMAT_ARGB2101010, - __DRI_IMAGE_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ABGR2101010, 32, - { 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 } + "XRGB2101010", + WL_DRM_FORMAT_XRGB2101010, WL_SHM_FORMAT_XRGB2101010, + __DRI_IMAGE_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XBGR2101010, 32, + { 20, 10, 0, -1 }, + { 10, 10, 10, 0 }, }, { - "XBGR2101010", - WL_DRM_FORMAT_XBGR2101010, WL_SHM_FORMAT_XBGR2101010, - __DRI_IMAGE_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XRGB2101010, 32, - { 0x000003ff, 0x000ffc00, 0x3ff00000, 0x00000000 } + "ARGB2101010", + WL_DRM_FORMAT_ARGB2101010, WL_SHM_FORMAT_ARGB2101010, + __DRI_IMAGE_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ABGR2101010, 32, + { 20, 10, 0, 30 }, + { 10, 10, 10, 2 }, }, { - "ABGR2101010", - WL_DRM_FORMAT_ABGR2101010, WL_SHM_FORMAT_ABGR2101010, - __DRI_IMAGE_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ARGB2101010, 32, - { 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 } + "XBGR2101010", + WL_DRM_FORMAT_XBGR2101010, WL_SHM_FORMAT_XBGR2101010, + __DRI_IMAGE_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XRGB2101010, 32, + { 0, 10, 20, -1 }, + { 10, 10, 10, 0 }, }, { - "XRGB8888", - WL_DRM_FORMAT_XRGB8888, WL_SHM_FORMAT_XRGB8888, - __DRI_IMAGE_FORMAT_XRGB8888, __DRI_IMAGE_FORMAT_NONE, 32, - { 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 } + "ABGR2101010", + WL_DRM_FORMAT_ABGR2101010, WL_SHM_FORMAT_ABGR2101010, + __DRI_IMAGE_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ARGB2101010, 32, + { 0, 10, 20, 30 }, + { 10, 10, 10, 2 }, }, { - "ARGB8888", - WL_DRM_FORMAT_ARGB8888, WL_SHM_FORMAT_ARGB8888, - __DRI_IMAGE_FORMAT_ARGB8888, __DRI_IMAGE_FORMAT_NONE, 32, - { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 } + "XRGB8888", + WL_DRM_FORMAT_XRGB8888, WL_SHM_FORMAT_XRGB8888, + __DRI_IMAGE_FORMAT_XRGB8888, __DRI_IMAGE_FORMAT_NONE, 32, + { 16, 8, 0, -1 }, + { 8, 8, 8, 0 }, }, { - "RGB565", - WL_DRM_FORMAT_RGB565, WL_SHM_FORMAT_RGB565, - __DRI_IMAGE_FORMAT_RGB565, __DRI_IMAGE_FORMAT_NONE, 16, - { 0xf800, 0x07e0, 0x001f, 0x0000 } + "ARGB8888", + WL_DRM_FORMAT_ARGB8888, WL_SHM_FORMAT_ARGB8888, + __DRI_IMAGE_FORMAT_ARGB8888, __DRI_IMAGE_FORMAT_NONE, 32, + { 16, 8, 0, 24 }, + { 8, 8, 8, 8 }, + }, + { + "RGB565", + WL_DRM_FORMAT_RGB565, WL_SHM_FORMAT_RGB565, + __DRI_IMAGE_FORMAT_RGB565, __DRI_IMAGE_FORMAT_NONE, 16, + { 11, 5, 0, -1 }, + { 5, 6, 5, 0 }, }, }; @@ -123,20 +152,22 @@ dri2_wl_visual_idx_from_config(struct dri2_egl_display *dri2_dpy, const __DRIconfig *config) { - unsigned int red, green, blue, alpha; + int shifts[4]; + unsigned int sizes[4]; - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_RED_MASK, &red); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_GREEN_MASK, &green); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_BLUE_MASK, &blue); - dri2_dpy->core->getConfigAttrib(config, __DRI_ATTRIB_ALPHA_MASK, &alpha); + dri2_get_shifts_and_sizes(dri2_dpy->core, config, shifts, sizes); for (unsigned int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) { const struct dri2_wl_visual *wl_visual = &dri2_wl_visuals[i]; - if (red == wl_visual->rgba_masks[0] && - green == wl_visual->rgba_masks[1] && - blue == wl_visual->rgba_masks[2] && - alpha == wl_visual->rgba_masks[3]) { + if (shifts[0] == wl_visual->rgba_shifts[0] && + shifts[1] == wl_visual->rgba_shifts[1] && + shifts[2] == wl_visual->rgba_shifts[2] && + shifts[3] == wl_visual->rgba_shifts[3] && + sizes[0] == wl_visual->rgba_sizes[0] && + sizes[1] == wl_visual->rgba_sizes[1] && + sizes[2] == wl_visual->rgba_sizes[2] && + sizes[3] == wl_visual->rgba_sizes[3]) { return i; } } @@ -491,6 +522,13 @@ modifiers = u_vector_tail(&dri2_dpy->wl_modifiers[visual_idx]); num_modifiers = u_vector_length(&dri2_dpy->wl_modifiers[visual_idx]); + if (num_modifiers == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID) { + /* For the purposes of this function, an INVALID modifier on its own + * means the modifiers aren't supported. + */ + num_modifiers = 0; + } + /* Substitute dri image format if server does not support original format */ if (!BITSET_TEST(dri2_dpy->formats, visual_idx)) linear_dri_image_format = dri2_wl_visuals[visual_idx].alt_dri_image_format; @@ -781,19 +819,32 @@ (void) loaderPrivate; } +static unsigned +dri2_wl_get_capability(void *loaderPrivate, enum dri_loader_cap cap) +{ + switch (cap) { + case DRI_LOADER_CAP_FP16: + return 1; + default: + return 0; + } +} + static const __DRIdri2LoaderExtension dri2_loader_extension = { - .base = { __DRI_DRI2_LOADER, 3 }, + .base = { __DRI_DRI2_LOADER, 4 }, .getBuffers = dri2_wl_get_buffers, .flushFrontBuffer = dri2_wl_flush_front_buffer, .getBuffersWithFormat = dri2_wl_get_buffers_with_format, + .getCapability = dri2_wl_get_capability, }; static const __DRIimageLoaderExtension image_loader_extension = { - .base = { __DRI_IMAGE_LOADER, 1 }, + .base = { __DRI_IMAGE_LOADER, 2 }, .getBuffers = image_get_buffers, .flushFrontBuffer = dri2_wl_flush_front_buffer, + .getCapability = dri2_wl_get_capability, }; static void @@ -873,7 +924,31 @@ } } - if (dri2_dpy->wl_dmabuf && modifier != DRM_FORMAT_MOD_INVALID) { + bool supported_modifier = false; + bool mod_invalid_supported = false; + int visual_idx = dri2_wl_visual_idx_from_fourcc(fourcc); + assert(visual_idx != -1); + + uint64_t *mod; + u_vector_foreach(mod, &dri2_dpy->wl_modifiers[visual_idx]) { + if (*mod == DRM_FORMAT_MOD_INVALID) { + mod_invalid_supported = true; + } + if (*mod == modifier) { + supported_modifier = true; + break; + } + } + if (!supported_modifier && mod_invalid_supported) { + /* If the server has advertised DRM_FORMAT_MOD_INVALID then we trust + * that the client has allocated the buffer with the right implicit + * modifier for the format, even though it's allocated a buffer the + * server hasn't explicitly claimed to support. */ + modifier = DRM_FORMAT_MOD_INVALID; + supported_modifier = true; + } + + if (dri2_dpy->wl_dmabuf && supported_modifier) { struct zwp_linux_buffer_params_v1 *params; int i; @@ -1246,10 +1321,6 @@ if (visual_idx == -1) return; - if (modifier_hi == (DRM_FORMAT_MOD_INVALID >> 32) && - modifier_lo == (DRM_FORMAT_MOD_INVALID & 0xffffffff)) - return; - BITSET_SET(dri2_dpy->formats, visual_idx); mod = u_vector_add(&dri2_dpy->wl_modifiers[visual_idx]); @@ -1354,7 +1425,7 @@ continue; dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i], - count + 1, EGL_WINDOW_BIT, NULL, dri2_wl_visuals[j].rgba_masks); + count + 1, EGL_WINDOW_BIT, NULL, dri2_wl_visuals[j].rgba_shifts, dri2_wl_visuals[j].rgba_sizes); if (dri2_conf) { if (dri2_conf->base.ConfigID == count + 1) count++; @@ -1387,7 +1458,8 @@ */ dri2_conf = dri2_add_config(disp, dri2_dpy->driver_configs[i], count + 1, EGL_WINDOW_BIT, NULL, - dri2_wl_visuals[c].rgba_masks); + dri2_wl_visuals[c].rgba_shifts, + dri2_wl_visuals[c].rgba_sizes); if (dri2_conf) { if (dri2_conf->base.ConfigID == count + 1) count++; diff -Nru mesa-19.2.8/src/egl/drivers/dri2/platform_x11.c mesa-20.0.8/src/egl/drivers/dri2/platform_x11.c --- mesa-19.2.8/src/egl/drivers/dri2/platform_x11.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/drivers/dri2/platform_x11.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,6 +42,7 @@ #include #include "util/debug.h" #include "util/macros.h" +#include "util/bitscan.h" #include "egl_dri2.h" #include "egl_dri2_fallbacks.h" @@ -794,16 +795,23 @@ EGL_NONE }; - unsigned int rgba_masks[4] = { - visuals[i].red_mask, - visuals[i].green_mask, - visuals[i].blue_mask, + int rgba_shifts[4] = { + ffs(visuals[i].red_mask) - 1, + ffs(visuals[i].green_mask) - 1, + ffs(visuals[i].blue_mask) - 1, + -1, + }; + + unsigned int rgba_sizes[4] = { + util_bitcount(visuals[i].red_mask), + util_bitcount(visuals[i].green_mask), + util_bitcount(visuals[i].blue_mask), 0, }; dri2_conf = dri2_add_config(disp, config, config_count + 1, surface_type, config_attrs, - rgba_masks); + rgba_shifts, rgba_sizes); if (dri2_conf) if (dri2_conf->base.ConfigID == config_count + 1) config_count++; @@ -817,11 +825,14 @@ * wants... especially on drivers that only have 32-bit RGBA * EGLConfigs! */ if (d.data->depth == 24 || d.data->depth == 30) { - rgba_masks[3] = - ~(rgba_masks[0] | rgba_masks[1] | rgba_masks[2]); + unsigned int rgba_mask = ~(visuals[i].red_mask | + visuals[i].green_mask | + visuals[i].blue_mask); + rgba_shifts[3] = ffs(rgba_mask) - 1; + rgba_sizes[3] = util_bitcount(rgba_mask); dri2_conf = dri2_add_config(disp, config, config_count + 1, surface_type, config_attrs, - rgba_masks); + rgba_shifts, rgba_sizes); if (dri2_conf) if (dri2_conf->base.ConfigID == config_count + 1) config_count++; diff -Nru mesa-19.2.8/src/egl/generate/eglFunctionList.py mesa-20.0.8/src/egl/generate/eglFunctionList.py --- mesa-19.2.8/src/egl/generate/eglFunctionList.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/generate/eglFunctionList.py 2020-06-12 01:21:16.000000000 +0000 @@ -213,5 +213,8 @@ _eglFunc("eglGetDisplayDriverName", "display"), _eglFunc("eglGetDisplayDriverConfig", "display"), + # EGL_KHR_partial_update + _eglFunc("eglSetDamageRegionKHR", "display"), + ) diff -Nru mesa-19.2.8/src/egl/generate/gen_egl_dispatch.py mesa-20.0.8/src/egl/generate/gen_egl_dispatch.py --- mesa-19.2.8/src/egl/generate/gen_egl_dispatch.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/generate/gen_egl_dispatch.py 2020-06-12 01:21:16.000000000 +0000 @@ -100,6 +100,8 @@ #include #include + #include + #include #include "glvnd/libeglabi.h" """.lstrip("\n")) @@ -131,6 +133,7 @@ text = "" text += '#include "egldispatchstubs.h"\n' text += '#include "g_egldispatchstubs.h"\n' + text += '#include \n' text += "\n" for (func, eglFunc) in functions: diff -Nru mesa-19.2.8/src/egl/main/eglapi.c mesa-20.0.8/src/egl/main/eglapi.c --- mesa-19.2.8/src/egl/main/eglapi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/main/eglapi.c 2020-06-12 01:21:16.000000000 +0000 @@ -1851,9 +1851,10 @@ (type == EGL_SYNC_FENCE_KHR || type == EGL_SYNC_NATIVE_FENCE_ANDROID)) RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_NO_SYNC_KHR); - /* return an error if the client API doesn't support GL_OES_EGL_sync */ + /* return an error if the client API doesn't support GL_[OES|MESA]_EGL_sync. */ if (ctx && (ctx->Resource.Display != disp || - ctx->ClientAPI != EGL_OPENGL_ES_API)) + (ctx->ClientAPI != EGL_OPENGL_ES_API && + ctx->ClientAPI != EGL_OPENGL_API))) RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_NO_SYNC_KHR); switch (type) { @@ -2035,8 +2036,10 @@ _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv); assert(disp->Extensions.KHR_wait_sync); - /* return an error if the client API doesn't support GL_OES_EGL_sync */ - if (ctx == EGL_NO_CONTEXT || ctx->ClientAPI != EGL_OPENGL_ES_API) + /* return an error if the client API doesn't support GL_[OES|MESA]_EGL_sync. */ + if (ctx == EGL_NO_CONTEXT || + (ctx->ClientAPI != EGL_OPENGL_ES_API && + ctx->ClientAPI != EGL_OPENGL_API)) RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_FALSE); /* the API doesn't allow any flags yet */ diff -Nru mesa-19.2.8/src/egl/main/egldisplay.c mesa-20.0.8/src/egl/main/egldisplay.c --- mesa-19.2.8/src/egl/main/egldisplay.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/main/egldisplay.c 2020-06-12 01:21:16.000000000 +0000 @@ -105,6 +105,9 @@ } } + if (plat == _EGL_INVALID_PLATFORM) + _eglLog(_EGL_WARNING, "invalid EGL_PLATFORM given"); + return plat; } @@ -147,33 +150,23 @@ _EGLPlatformType _eglGetNativePlatform(void *nativeDisplay) { - static _EGLPlatformType native_platform = _EGL_INVALID_PLATFORM; - _EGLPlatformType detected_platform = native_platform; + _EGLPlatformType detected_platform = _eglGetNativePlatformFromEnv(); + const char *detection_method = "environment"; if (detected_platform == _EGL_INVALID_PLATFORM) { - const char *detection_method; - - detected_platform = _eglGetNativePlatformFromEnv(); - detection_method = "environment overwrite"; - - if (detected_platform == _EGL_INVALID_PLATFORM) { - detected_platform = _eglNativePlatformDetectNativeDisplay(nativeDisplay); - detection_method = "autodetected"; - } - - if (detected_platform == _EGL_INVALID_PLATFORM) { - detected_platform = _EGL_NATIVE_PLATFORM; - detection_method = "build-time configuration"; - } - - _eglLog(_EGL_DEBUG, "Native platform type: %s (%s)", - egl_platforms[detected_platform].name, detection_method); + detected_platform = _eglNativePlatformDetectNativeDisplay(nativeDisplay); + detection_method = "autodetected"; + } - p_atomic_cmpxchg(&native_platform, _EGL_INVALID_PLATFORM, - detected_platform); + if (detected_platform == _EGL_INVALID_PLATFORM) { + detected_platform = _EGL_NATIVE_PLATFORM; + detection_method = "build-time configuration"; } - return native_platform; + _eglLog(_EGL_DEBUG, "Native platform type: %s (%s)", + egl_platforms[detected_platform].name, detection_method); + + return detected_platform; } diff -Nru mesa-19.2.8/src/egl/main/egldriver.c mesa-20.0.8/src/egl/main/egldriver.c --- mesa-19.2.8/src/egl/main/egldriver.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/main/egldriver.c 2020-06-12 01:21:16.000000000 +0000 @@ -92,6 +92,8 @@ /* set options */ disp->Options.ForceSoftware = env_var_as_boolean("LIBGL_ALWAYS_SOFTWARE", false); + if (disp->Options.ForceSoftware) + _eglLog(_EGL_DEBUG, "Found 'LIBGL_ALWAYS_SOFTWARE' set, will use a CPU renderer"); best_drv = _eglMatchAndInitialize(disp); if (!best_drv && !disp->Options.ForceSoftware) { diff -Nru mesa-19.2.8/src/egl/main/eglglobals.c mesa-20.0.8/src/egl/main/eglglobals.c --- mesa-19.2.8/src/egl/main/eglglobals.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/main/eglglobals.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,7 +38,6 @@ #include "egldevice.h" #include "egldisplay.h" #include "egldriver.h" -#include "egllog.h" #include "util/macros.h" @@ -161,10 +160,10 @@ EGLBoolean _eglPointerIsDereferencable(void *p) { -#ifdef HAVE_MINCORE uintptr_t addr = (uintptr_t) p; - unsigned char valid = 0; const long page_size = getpagesize(); +#ifdef HAVE_MINCORE + unsigned char valid = 0; if (p == NULL) return EGL_FALSE; @@ -173,7 +172,6 @@ addr &= ~(page_size - 1); if (mincore((void *) addr, page_size, &valid) < 0) { - _eglLog(_EGL_DEBUG, "mincore failed: %m"); return EGL_FALSE; } @@ -190,6 +188,7 @@ */ return EGL_TRUE; #else - return p != NULL; + // Without mincore(), we just assume that the first page is unmapped. + return addr >= page_size; #endif } diff -Nru mesa-19.2.8/src/egl/main/egltypedefs.h mesa-20.0.8/src/egl/main/egltypedefs.h --- mesa-19.2.8/src/egl/main/egltypedefs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/main/egltypedefs.h 2020-06-12 01:21:16.000000000 +0000 @@ -33,6 +33,8 @@ #include #include +#include +#include #ifdef __cplusplus extern "C" { diff -Nru mesa-19.2.8/src/egl/meson.build mesa-20.0.8/src/egl/meson.build --- mesa-19.2.8/src/egl/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -174,28 +174,19 @@ version : egl_lib_version, ) -# If using glvnd the pkg-config header should not point to EGL_mesa, it should -# point to EGL. glvnd is only available on unix like platforms so adding -l -# should be safe here -if not with_glvnd or not glvnd_has_headers_and_pc_files - if not glvnd_has_headers_and_pc_files - _egl = '-L${libdir} -lEGL' - else - _egl = libegl - endif - +if not with_glvnd pkg.generate( name : 'egl', description : 'Mesa EGL Library', version : meson.project_version(), - libraries : _egl, + libraries : libegl, libraries_private: gl_priv_libs, requires_private : gl_priv_reqs, extra_cflags : gl_pkgconfig_c_flags, ) endif -if with_tests and prog_nm.found() +if with_symbols_check if with_glvnd egl_symbols = files('egl-glvnd-symbols.txt') else @@ -206,7 +197,7 @@ args : [ '--lib', libegl, '--symbols-file', egl_symbols, - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['egl'], ) diff -Nru mesa-19.2.8/src/egl/wayland/wayland-drm/wayland-drm.xml mesa-20.0.8/src/egl/wayland/wayland-drm/wayland-drm.xml --- mesa-19.2.8/src/egl/wayland/wayland-drm/wayland-drm.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/egl/wayland/wayland-drm/wayland-drm.xml 2020-06-12 01:21:16.000000000 +0000 @@ -100,6 +100,8 @@ + + - - + + @@ -1618,7 +1799,7 @@ - + diff -Nru mesa-19.2.8/src/freedreno/registers/a3xx.xml mesa-20.0.8/src/freedreno/registers/a3xx.xml --- mesa-19.2.8/src/freedreno/registers/a3xx.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/a3xx.xml 2020-06-12 01:21:16.000000000 +0000 @@ -1186,7 +1186,6 @@ - diff -Nru mesa-19.2.8/src/freedreno/registers/a4xx.xml mesa-20.0.8/src/freedreno/registers/a4xx.xml --- mesa-19.2.8/src/freedreno/registers/a4xx.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/a4xx.xml 2020-06-12 01:21:16.000000000 +0000 @@ -1304,8 +1304,6 @@ - - @@ -1435,7 +1433,6 @@ - diff -Nru mesa-19.2.8/src/freedreno/registers/a5xx.xml mesa-20.0.8/src/freedreno/registers/a5xx.xml --- mesa-19.2.8/src/freedreno/registers/a5xx.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/a5xx.xml 2020-06-12 01:21:16.000000000 +0000 @@ -1385,10 +1385,6 @@ - - - - @@ -1707,7 +1703,6 @@ - diff -Nru mesa-19.2.8/src/freedreno/registers/a6xx.xml mesa-20.0.8/src/freedreno/registers/a6xx.xml --- mesa-19.2.8/src/freedreno/registers/a6xx.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/a6xx.xml 2020-06-12 01:21:16.000000000 +0000 @@ -26,10 +26,11 @@ - + + @@ -52,8 +53,8 @@ - - + + @@ -230,6 +231,9 @@ + + + @@ -944,7 +948,7 @@ same value... maybe 16b unorm is uncommon enough that it was just easier to upconvert to 32b float internally? - 8b unorm: 10 + 8b unorm: 10 (sometimes 0, is the high bit part of something else?) 16b unorm: 4 32b int: 7 @@ -961,10 +965,12 @@ + + - + @@ -1370,6 +1376,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1744,6 +1780,7 @@ + @@ -1784,10 +1821,12 @@ --> + + @@ -1823,9 +1862,20 @@ + + - - + + + + + + + + + + + @@ -1889,6 +1939,12 @@ + + + + + + @@ -1925,10 +1981,9 @@ update MAX instead of MIN value, ie. GL_GREATER/GL_GEQUAL - - + @@ -1936,12 +1991,14 @@ + + @@ -1954,19 +2011,22 @@ + + - - - + + - + + + + + + + @@ -2150,6 +2210,9 @@ --> + + + @@ -2193,6 +2256,7 @@ + @@ -2230,6 +2294,7 @@ + @@ -2268,11 +2333,13 @@ + + @@ -2307,6 +2374,7 @@ + @@ -2314,6 +2382,7 @@ + @@ -2325,13 +2394,20 @@ - + - + + + + + + + + @@ -2358,11 +2434,20 @@ + + + + + + + + + @@ -2394,12 +2479,18 @@ + + + + + + @@ -2419,7 +2510,7 @@ hw streamout (rather than stg instructions in shader) - + + + + + + + domain shader version of VPC_PACK + + + @@ -2514,6 +2623,16 @@ + + + geometry shader + + + + + + + hull shader? @@ -2536,8 +2655,22 @@ - - + + + geometry shader + + + + + + + + + size in vec4s of per-primitive storage for gs + + + + @@ -2566,6 +2699,7 @@ + @@ -2576,11 +2710,17 @@ - + + + + + + + @@ -2608,34 +2748,6 @@ - - - - - - - - - - - - - - - - - - - - - - + @@ -2673,6 +2787,34 @@ + + + + + + + + + + + + + + + + + + + + + + @@ -2720,6 +2862,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2744,6 +2911,14 @@ + + + + + + + + @@ -2790,7 +2965,28 @@ - + + + + + + + + + + + + + + + + + + + @@ -2850,14 +3046,19 @@ + + + + + @@ -2884,6 +3085,7 @@ + @@ -2896,15 +3098,7 @@ badly named or the functionality moved in a6xx. But downstream kernel calls this "a6xx_sp_ps_tp_2d_cluster" --> - - - - - - - - - + @@ -3125,7 +3319,12 @@ --> - + + @@ -3146,11 +3345,10 @@ - + + + @@ -3234,18 +3432,6 @@ - - - - - - - - - - - - diff -Nru mesa-19.2.8/src/freedreno/registers/adreno_pm4.xml mesa-20.0.8/src/freedreno/registers/adreno_pm4.xml --- mesa-19.2.8/src/freedreno/registers/adreno_pm4.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/adreno_pm4.xml 2020-06-12 01:21:16.000000000 +0000 @@ -15,6 +15,9 @@ + + + @@ -62,7 +65,39 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -122,6 +157,12 @@ + + Takes the same arguments as CP_INDIRECT_BUFFER, but jumps to + another buffer at the same level. Must be at the end of IB, and + doesn't work with draw state IB's. + + indirect buffer dispatch. same as IB, but init is pipelined wait for the IDLE state of the engine @@ -184,7 +225,7 @@ load sequencer instruction memory (code embedded in packet) load constants from a location in memory - + selective invalidation of state pointers dynamically changes shader instruction memory partition @@ -231,7 +272,7 @@ Load a buffer with pre-fetch enabled Set bin (?) - + test 2 memory locations to dword values specified @@ -275,7 +316,7 @@ for A4xx Write to register with address that does not fit into type-0 pkt - + copy from ME scratch RAM to a register @@ -378,15 +419,16 @@ - - - + + + + + + + + + - + + + @@ -784,14 +837,66 @@ + + + Like CP_SET_BIN_DATA5, but set the pointers as offsets from the + pointers stored in VSC_PIPE_{DATA,DATA2,SIZE}_ADDRESS. Useful + for Vulkan where these values aren't known when the command + stream is recorded. + + + + + + + + + + + + + + + + + + + + + + + + Modifies DST_REG using two sources that can either be registers + or immediates. If SRC1_ADD is set, then do the following: + + $dst = (($dst & $src0) rot $rotate) + $src1 + + Otherwise: + + $dst = (($dst & $src0) rot $rotate) | $src1 + + Here "rot" means rotate left. + + + + + + + + + + + + + + + + - - + + @@ -803,18 +908,69 @@ - + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using either one or two registers or scratch + registers. + - - + + + + + + + + + + + + + + + + + Like CP_REG_TO_MEM, but the memory address to write to can be + offsetted using a DWORD in memory. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -834,6 +990,10 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -881,7 +1096,10 @@ + + + @@ -907,6 +1125,71 @@ + + + Wait until a memory value is greater than or equal to the + reference, using signed comparison. + + + + + + + + + + + + + + + + + + + This uses the same internal comparison as CP_COND_WRITE, + but waits until the comparison is true instead. It busy-loops in + the CP for the given number of cycles before trying again. + + + + + + + + + + + + + + + + + + + + + + + + + + + + Waits for REG0 to not be 0 or REG1 to not equal REF + + + + + + + + + + + + @@ -1103,13 +1386,22 @@ + + + + + + + + - - - + @@ -1155,19 +1447,122 @@ - + + + + + + + + + + - + + + + + + + + + + + + + + + + + + Executes the following DWORDs of commands if the dword at ADDR0 + is not equal to 0 and the dword at ADDR1 is less than REF + (signed comparison). + + + + + + + + + + + + + + + + + + + + + + + + Used by the userspace driver to set various IB's which are + executed during context save/restore for handling + state that isn't restored by the + context switch routine itself. + + + + Executed unconditionally when switching back to the context. + + + + Executed when switching back after switching + away during execution of + a CP_SET_MARKER packet with RM6_YIELD as the + payload *and* the normal save routine was + bypassed for a shorter one. I think this is + connected to the "skipsaverestore" bit set by + the kernel when preempting. + + + + + Executed when switching away from the context, + except for context switches initiated via + CP_YIELD. + + + + + This can only be set by the RB (i.e. the kernel) + and executes with protected mode off, but + is otherwise similar to SAVE_IB. + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/freedreno/registers/gen_header.py mesa-20.0.8/src/freedreno/registers/gen_header.py --- mesa-19.2.8/src/freedreno/registers/gen_header.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/gen_header.py 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,9 @@ print("\t%s = %d," % (name, value)) print("};\n") + def dump_pack_struct(self): + pass + class Field(object): def __init__(self, name, low, high, shr, type, parser): self.name = name @@ -36,7 +39,7 @@ self.shr = shr self.type = type - builtin_types = [ None, "boolean", "uint", "hex", "int", "fixed", "ufixed", "float" ] + builtin_types = [ None, "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ] if low < 0 or low > 31: raise parser.error("low attribute out of range: %d" % low) @@ -51,37 +54,40 @@ elif not self.type in builtin_types and not self.type in parser.enums: raise parser.error("unknown type '%s'" % self.type); - def ctype(self): + def ctype(self, var_name): if self.type == None: type = "uint32_t" - val = "val" + val = var_name elif self.type == "boolean": type = "bool" - val = "val" + val = var_name elif self.type == "uint" or self.type == "hex": type = "uint32_t" - val = "val" + val = var_name elif self.type == "int": type = "int32_t" - val = "val" + val = var_name elif self.type == "fixed": type = "float" - val = "((int32_t)(val * %d.0))" % (1 << self.radix) + val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix) elif self.type == "ufixed": type = "float" - val = "((uint32_t)(val * %d.0))" % (1 << self.radix) + val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix) elif self.type == "float" and self.high - self.low == 31: type = "float" - val = "fui(val)" + val = "fui(%s)" % var_name elif self.type == "float" and self.high - self.low == 15: type = "float" - val = "util_float_to_half(val)" + val = "util_float_to_half(%s)" % var_name + elif self.type in [ "address", "waddress" ]: + type = "uint64_t" + val = var_name else: type = "enum %s" % self.type - val = "val" + val = var_name if self.shr > 0: - val = "%s >> %d" % (val, self.shr) + val = "(%s >> %d)" % (val, self.shr) return (type, val) @@ -103,6 +109,98 @@ else: self.fields = [] + def dump_pack_struct(self, prefix=None, array=None): + def field_name(prefix, name): + if f.name: + name = f.name.lower() + else: + name = prefix.lower() + + if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()): + name = "_" + name + + return name + + if not prefix: + return + if prefix == None: + prefix = self.name + + print("struct %s {" % prefix) + for f in self.fields: + if f.type in [ "address", "waddress" ]: + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + name = field_name(prefix, f.name) + + type, val = f.ctype("var") + + tab_to(" %s" % type, "%s;" % name) + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};\n") + + address = None; + for f in self.fields: + if f.type in [ "address", "waddress" ]: + address = f + if array: + print("static inline struct fd_reg_pair\npack_%s(uint32_t i, struct %s fields)\n{" % + (prefix, prefix)); + else: + print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" % + (prefix, prefix)); + + print("#ifndef NDEBUG") + known_mask = 0 + for f in self.fields: + known_mask |= mask(f.low, f.high) + if f.type in [ "boolean", "address", "waddress" ]: + continue + type, val = f.ctype("fields.%s" % field_name(prefix, f.name)) + print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low))) + print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask)) + print("#endif\n") + + print(" return (struct fd_reg_pair) {") + if array: + print(" .reg = REG_%s(i)," % prefix) + else: + print(" .reg = REG_%s," % prefix) + + print(" .value =") + for f in self.fields: + if f.type in [ "address", "waddress" ]: + continue + else: + type, val = f.ctype("fields.%s" % field_name(prefix, f.name)) + print(" (%-40s << %2d) |" % (val, f.low)) + print(" fields.unknown | fields.dword,") + + if address: + print(" .is_address = true,") + print(" .bo = fields.bo,") + if f.type == "waddress": + print(" .bo_write = true,") + print(" .bo_offset = fields.bo_offset,") + print(" .bo_shift = %d" % address.shr) + + print(" };\n}\n") + + if address: + skip = ", { .reg = 0 }" + else: + skip = "" + + if array: + print("#define %s(i, ...) pack_%s(i, (struct %s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + else: + print("#define %s(...) pack_%s((struct %s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + + def dump(self, prefix=None): if prefix == None: prefix = self.name @@ -119,12 +217,13 @@ else: tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high)) tab_to("#define %s__SHIFT" % name, "%d" % f.low) - type, val = f.ctype() + type, val = f.ctype("val") print("static inline uint32_t %s(%s val)\n{" % (name, type)) if f.shr > 0: print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1)) print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name)) + print() class Array(object): def __init__(self, attrs, domain): @@ -137,27 +236,39 @@ def dump(self): print("static inline uint32_t REG_%s_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }\n" % (self.domain, self.name, self.offset, self.stride)) + def dump_pack_struct(self): + pass + class Reg(object): - def __init__(self, attrs, domain, array): + def __init__(self, attrs, domain, array, bit_size): self.name = attrs["name"] self.domain = domain self.array = array self.offset = int(attrs["offset"], 0) self.type = None + self.bit_size = bit_size + + if self.array: + self.full_name = self.domain + "_" + self.array.name + "_" + self.name + else: + self.full_name = self.domain + "_" + self.name def dump(self): if self.array: - name = self.domain + "_" + self.array.name + "_" + self.name offset = self.array.offset + self.offset - print("static inline uint32_t REG_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (name, offset, self.array.stride)) + print("static inline uint32_t REG_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, offset, self.array.stride)) else: - name = self.domain + "_" + self.name - tab_to("#define REG_%s" % name, "0x%08x" % self.offset) + tab_to("#define REG_%s" % self.full_name, "0x%08x" % self.offset) if self.bitset.inline: - self.bitset.dump(name) + self.bitset.dump(self.full_name) print("") - + + def dump_pack_struct(self): + if self.bitset.inline: + self.bitset.dump_pack_struct(self.full_name, not self.array == None) + + def parse_variants(attrs): if not "variants" in attrs: return None @@ -235,6 +346,21 @@ self.stack = [] self.do_parse(filename) + def parse_reg(self, attrs, bit_size): + if "type" in attrs and attrs["type"] in self.bitsets: + self.current_bitset = self.bitsets[attrs["type"]] + else: + self.current_bitset = Bitset(attrs["name"], None) + self.current_bitset.inline = True + if "type" in attrs: + self.parse_field(None, attrs) + + self.current_reg = Reg(attrs, self.prefix(), self.current_array, bit_size) + self.current_reg.bitset = self.current_bitset + + if len(self.stack) == 1: + self.file.append(self.current_reg) + def start_element(self, name, attrs): if name == "import": filename = os.path.basename(attrs["file"]) @@ -259,19 +385,9 @@ self.current_enum.values.append((attrs["name"], value)) # self.current_enum_value = value + 1 elif name == "reg32": - if "type" in attrs and attrs["type"] in self.bitsets: - self.current_bitset = self.bitsets[attrs["type"]] - else: - self.current_bitset = Bitset(attrs["name"], None) - self.current_bitset.inline = True - if "type" in attrs: - self.parse_field(None, attrs) - - self.current_reg = Reg(attrs, self.prefix(), self.current_array) - self.current_reg.bitset = self.current_bitset - - if len(self.stack) == 1: - self.file.append(self.current_reg) + self.parse_reg(attrs, 32) + elif name == "reg64": + self.parse_reg(attrs, 64) elif name == "array": self.current_array = Array(attrs, self.prefix()) if len(self.stack) == 1: @@ -316,11 +432,21 @@ for e in enums + bitsets + regs: e.dump() + def dump_structs(self): + for e in self.file: + e.dump_pack_struct() + + def main(): p = Parser() xml_file = sys.argv[1] + if len(sys.argv) > 2 and sys.argv[2] == '--pack-structs': + do_structs = True + guard = str.replace(os.path.basename(xml_file), '.', '_').upper() + '_STRUCTS' + else: + do_structs = False + guard = str.replace(os.path.basename(xml_file), '.', '_').upper() - guard = str.replace(os.path.basename(xml_file), '.', '_').upper() print("#ifndef %s\n#define %s\n" % (guard, guard)) try: @@ -329,7 +455,10 @@ print(e) exit(1) - p.dump() + if do_structs: + p.dump_structs() + else: + p.dump() print("\n#endif /* %s */" % guard) diff -Nru mesa-19.2.8/src/freedreno/registers/meson.build mesa-20.0.8/src/freedreno/registers/meson.build --- mesa-19.2.8/src/freedreno/registers/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/registers/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -39,3 +39,11 @@ capture : true, ) endforeach + +freedreno_xml_header_files += custom_target( + 'a6xx-pack.xml.h', + input : ['gen_header.py', 'a6xx.xml'], + output : 'a6xx-pack.xml.h', + command : [prog_python, '@INPUT@', '--pack-structs'], + capture : true, + ) diff -Nru mesa-19.2.8/src/freedreno/vulkan/.dir-locals.el mesa-20.0.8/src/freedreno/vulkan/.dir-locals.el --- mesa-19.2.8/src/freedreno/vulkan/.dir-locals.el 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/.dir-locals.el 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,8 @@ +((prog-mode + (indent-tabs-mode . nil) + (tab-width . 8) + (c-basic-offset . 3) + (c-file-style . "k&r") + (fill-column . 78) + ) + ) diff -Nru mesa-19.2.8/src/freedreno/vulkan/.editorconfig mesa-20.0.8/src/freedreno/vulkan/.editorconfig --- mesa-19.2.8/src/freedreno/vulkan/.editorconfig 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/.editorconfig 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,5 @@ +[*.{c,h,cpp,hpp,cc,hh}] +indent_style = space +indent_size = 3 +tab_width = 8 +max_line_length = 78 diff -Nru mesa-19.2.8/src/freedreno/vulkan/meson.build mesa-20.0.8/src/freedreno/vulkan/meson.build --- mesa-19.2.8/src/freedreno/vulkan/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -39,18 +39,12 @@ ], ) -tu_format_table_c = custom_target( - 'tu_format_table.c', - input : ['vk_format_table.py', 'vk_format_layout.csv'], - output : 'vk_format_table.c', - command : [prog_python, '@INPUT@'], - depend_files : files('vk_format_parse.py'), - capture : true, -) - libtu_files = files( + 'tu_blit.c', + 'tu_blit.h', 'tu_cmd_buffer.c', 'tu_cs.c', + 'tu_cs.h', 'tu_device.c', 'tu_descriptor_set.c', 'tu_descriptor_set.h', @@ -78,15 +72,34 @@ tu_deps = [] tu_flags = [] +if with_platform_x11 + tu_deps += dep_xcb_dri3 + tu_flags += [ + '-DVK_USE_PLATFORM_XCB_KHR', + '-DVK_USE_PLATFORM_XLIB_KHR', + ] + libtu_files += files('tu_wsi_x11.c') +endif + if with_platform_wayland tu_deps += dep_wayland_client tu_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' libtu_files += files('tu_wsi_wayland.c') endif +if with_platform_drm + tu_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' + libtu_files += files('tu_wsi_display.c') +endif + +if with_xlib_lease + tu_deps += [dep_xcb_xrandr, dep_xlib_xrandr] + tu_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT' +endif + libvulkan_freedreno = shared_library( 'vulkan_freedreno', - [libtu_files, tu_entrypoints, tu_extensions_c, tu_format_table_c, freedreno_xml_header_files], + [libtu_files, tu_entrypoints, tu_extensions_c, freedreno_xml_header_files], include_directories : [ inc_common, inc_compiler, @@ -97,6 +110,7 @@ libvulkan_wsi, libfreedreno_drm, # required by ir3_shader_get_variant, which we don't use libfreedreno_ir3, + libfreedreno_layout, ], dependencies : [ dep_dl, @@ -116,14 +130,14 @@ install : true, ) -if with_tests and prog_nm.found() +if with_symbols_check test( 'tu symbols check', symbols_check, args : [ '--lib', libvulkan_freedreno, '--symbols-file', vulkan_icd_symbols, - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['freedreno'], ) diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_android.c mesa-20.0.8/src/freedreno/vulkan/tu_android.c --- mesa-19.2.8/src/freedreno/vulkan/tu_android.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_android.c 2020-06-12 01:21:16.000000000 +0000 @@ -31,6 +31,8 @@ #include #include +#include "drm-uapi/drm_fourcc.h" + static int tu_hal_open(const struct hw_module_t *mod, const char *id, @@ -120,12 +122,8 @@ struct tu_bo *bo = NULL; VkResult result; - result = tu_image_create( - device_h, - &(struct tu_image_create_info) { - .vk_info = base_info, .scanout = true, .no_metadata_planes = true }, - alloc, &image_h); - + result = tu_image_create(device_h, base_info, alloc, &image_h, + DRM_FORMAT_MOD_LINEAR); if (result != VK_SUCCESS) return result; diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_blit.c mesa-20.0.8/src/freedreno/vulkan/tu_blit.c --- mesa-19.2.8/src/freedreno/vulkan/tu_blit.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_blit.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,367 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Jonathan Marek + * + */ + +#include "tu_blit.h" + +#include "a6xx.xml.h" +#include "adreno_common.xml.h" +#include "adreno_pm4.xml.h" + +#include "vk_format.h" + +#include "tu_cs.h" + +/* TODO: + * - Avoid disabling tiling for swapped formats + * (image_to_image copy doesn't deal with it) + * - Fix d24_unorm_s8_uint support & aspects + * - UBWC + */ + +static VkFormat +blit_copy_format(VkFormat format) +{ + switch (vk_format_get_blocksizebits(format)) { + case 8: return VK_FORMAT_R8_UINT; + case 16: return VK_FORMAT_R16_UINT; + case 32: return VK_FORMAT_R32_UINT; + case 64: return VK_FORMAT_R32G32_UINT; + case 96: return VK_FORMAT_R32G32B32_UINT; + case 128:return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("unhandled format size"); + } +} + +static uint32_t +blit_image_info(const struct tu_blit_surf *img, bool src, bool stencil_read) +{ + const struct tu_native_format *fmt = tu6_get_native_format(img->fmt); + enum a6xx_color_fmt rb = fmt->rb; + enum a3xx_color_swap swap = img->tiled ? WZYX : fmt->swap; + if (rb == RB6_R10G10B10A2_UNORM && src) + rb = RB6_R10G10B10A2_FLOAT16; + if (rb == RB6_Z24_UNORM_S8_UINT) + rb = RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + + if (stencil_read) + swap = XYZW; + + return A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(rb) | + A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(img->tile_mode) | + A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(swap) | + COND(vk_format_is_srgb(img->fmt), A6XX_SP_PS_2D_SRC_INFO_SRGB) | + COND(img->ubwc_size, A6XX_SP_PS_2D_SRC_INFO_FLAGS); +} + +static void +emit_blit_step(struct tu_cmd_buffer *cmdbuf, const struct tu_blit *blt) +{ + struct tu_cs *cs = &cmdbuf->cs; + + tu_cs_reserve_space(cmdbuf->device, cs, 66); + + enum a6xx_color_fmt fmt = tu6_get_native_format(blt->dst.fmt)->rb; + if (fmt == RB6_Z24_UNORM_S8_UINT) + fmt = RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + + enum a6xx_2d_ifmt ifmt = tu6_rb_fmt_to_ifmt(fmt); + + if (vk_format_is_srgb(blt->dst.fmt)) { + assert(ifmt == R2D_UNORM8); + ifmt = R2D_UNORM8_SRGB; + } + + uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL_ROTATE(blt->rotation) | + COND(blt->type == TU_BLIT_CLEAR, A6XX_RB_2D_BLIT_CNTL_SOLID_COLOR) | + A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt) | /* not required? */ + COND(fmt == RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8, A6XX_RB_2D_BLIT_CNTL_D24S8) | + A6XX_RB_2D_BLIT_CNTL_MASK(0xf) | + A6XX_RB_2D_BLIT_CNTL_IFMT(ifmt); + + tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); + tu_cs_emit(&cmdbuf->cs, blit_cntl); + + tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); + tu_cs_emit(&cmdbuf->cs, blit_cntl); + + /* + * Emit source: + */ + if (blt->type == TU_BLIT_CLEAR) { + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + tu_cs_emit(cs, blt->clear_value[0]); + tu_cs_emit(cs, blt->clear_value[1]); + tu_cs_emit(cs, blt->clear_value[2]); + tu_cs_emit(cs, blt->clear_value[3]); + } else { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 10); + tu_cs_emit(cs, blit_image_info(&blt->src, true, blt->stencil_read) | + A6XX_SP_PS_2D_SRC_INFO_SAMPLES(tu_msaa_samples(blt->src.samples)) | + /* TODO: should disable this bit for integer formats ? */ + COND(blt->src.samples > 1, A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | + COND(blt->filter, A6XX_SP_PS_2D_SRC_INFO_FILTER) | + 0x500000); + tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(blt->src.x + blt->src.width) | + A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(blt->src.y + blt->src.height)); + tu_cs_emit_qw(cs, blt->src.va); + tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(blt->src.pitch)); + + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + + if (blt->src.ubwc_size) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 6); + tu_cs_emit_qw(cs, blt->src.ubwc_va); + tu_cs_emit(cs, A6XX_SP_PS_2D_SRC_FLAGS_PITCH_PITCH(blt->src.ubwc_pitch) | + A6XX_SP_PS_2D_SRC_FLAGS_PITCH_ARRAY_PITCH(blt->src.ubwc_size >> 2)); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + } + } + + /* + * Emit destination: + */ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 9); + tu_cs_emit(cs, blit_image_info(&blt->dst, false, false)); + tu_cs_emit_qw(cs, blt->dst.va); + tu_cs_emit(cs, A6XX_RB_2D_DST_SIZE_PITCH(blt->dst.pitch)); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + + if (blt->dst.ubwc_size) { + tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 6); + tu_cs_emit_qw(cs, blt->dst.ubwc_va); + tu_cs_emit(cs, A6XX_RB_2D_DST_FLAGS_PITCH_PITCH(blt->dst.ubwc_pitch) | + A6XX_RB_2D_DST_FLAGS_PITCH_ARRAY_PITCH(blt->dst.ubwc_size >> 2)); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + } + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4); + tu_cs_emit(cs, A6XX_GRAS_2D_SRC_TL_X_X(blt->src.x)); + tu_cs_emit(cs, A6XX_GRAS_2D_SRC_BR_X_X(blt->src.x + blt->src.width - 1)); + tu_cs_emit(cs, A6XX_GRAS_2D_SRC_TL_Y_Y(blt->src.y)); + tu_cs_emit(cs, A6XX_GRAS_2D_SRC_BR_Y_Y(blt->src.y + blt->src.height - 1)); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_DST_TL, 2); + tu_cs_emit(cs, A6XX_GRAS_2D_DST_TL_X(blt->dst.x) | + A6XX_GRAS_2D_DST_TL_Y(blt->dst.y)); + tu_cs_emit(cs, A6XX_GRAS_2D_DST_BR_X(blt->dst.x + blt->dst.width - 1) | + A6XX_GRAS_2D_DST_BR_Y(blt->dst.y + blt->dst.height - 1)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, 0x3f); + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1); + tu_cs_emit(cs, 0); + + if (fmt == RB6_R10G10B10A2_UNORM) + fmt = RB6_R16G16B16A16_FLOAT; + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_2D_SRC_FORMAT, 1); + tu_cs_emit(cs, COND(vk_format_is_sint(blt->src.fmt), A6XX_SP_2D_SRC_FORMAT_SINT) | + COND(vk_format_is_uint(blt->src.fmt), A6XX_SP_2D_SRC_FORMAT_UINT) | + A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(fmt) | + COND(ifmt == R2D_UNORM8_SRGB, A6XX_SP_2D_SRC_FORMAT_SRGB) | + A6XX_SP_2D_SRC_FORMAT_MASK(0xf)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8E04, 1); + tu_cs_emit(cs, 0x01000000); + + tu_cs_emit_pkt7(cs, CP_BLIT, 1); + tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8E04, 1); + tu_cs_emit(cs, 0); +} + +void tu_blit(struct tu_cmd_buffer *cmdbuf, struct tu_blit *blt) +{ + switch (blt->type) { + case TU_BLIT_COPY: + blt->stencil_read = + blt->dst.fmt == VK_FORMAT_R8_UNORM && + blt->src.fmt == VK_FORMAT_D24_UNORM_S8_UINT; + + assert(vk_format_get_blocksize(blt->dst.fmt) == + vk_format_get_blocksize(blt->src.fmt) || blt->stencil_read); + assert(blt->src.samples == blt->dst.samples); + + if (vk_format_is_compressed(blt->src.fmt)) { + unsigned block_width = vk_format_get_blockwidth(blt->src.fmt); + unsigned block_height = vk_format_get_blockheight(blt->src.fmt); + + blt->src.pitch /= block_width; + blt->src.x /= block_width; + blt->src.y /= block_height; + blt->src.fmt = blit_copy_format(blt->src.fmt); + + /* for image_to_image copy, width/height is on the src format */ + blt->dst.width = blt->src.width = DIV_ROUND_UP(blt->src.width, block_width); + blt->dst.height = blt->src.height = DIV_ROUND_UP(blt->src.height, block_height); + } + + if (vk_format_is_compressed(blt->dst.fmt)) { + unsigned block_width = vk_format_get_blockwidth(blt->dst.fmt); + unsigned block_height = vk_format_get_blockheight(blt->dst.fmt); + + blt->dst.pitch /= block_width; + blt->dst.x /= block_width; + blt->dst.y /= block_height; + blt->dst.fmt = blit_copy_format(blt->dst.fmt); + } + + if (blt->dst.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + blt->dst.fmt = blit_copy_format(blt->dst.fmt); + + if (blt->src.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + blt->src.fmt = blit_copy_format(blt->src.fmt); + + /* TODO: multisample image copy does not work correctly with tiling/UBWC */ + blt->src.x *= blt->src.samples; + blt->dst.x *= blt->dst.samples; + blt->src.width *= blt->src.samples; + blt->dst.width *= blt->dst.samples; + blt->src.samples = 1; + blt->dst.samples = 1; + break; + case TU_BLIT_CLEAR: + /* unsupported format cleared as UINT32 */ + if (blt->dst.fmt == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + blt->dst.fmt = VK_FORMAT_R32_UINT; + assert(blt->dst.samples == 1); /* TODO */ + blt->src = blt->dst; + break; + default: + assert(blt->dst.samples == 1); + } + + tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 18); + + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, LRZ_FLUSH, false); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, PC_CCU_INVALIDATE_COLOR, false); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, PC_CCU_INVALIDATE_DEPTH, false); + + /* buffer copy setup */ + tu_cs_emit_pkt7(&cmdbuf->cs, CP_SET_MARKER, 1); + tu_cs_emit(&cmdbuf->cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); + + for (unsigned layer = 0; layer < blt->layers; layer++) { + if (blt->buffer) { + struct tu_blit line_blt = *blt; + uint64_t dst_va = line_blt.dst.va, src_va = line_blt.src.va; + unsigned blocksize = vk_format_get_blocksize(blt->src.fmt); + uint32_t size = line_blt.src.width, tmp; + + while (size) { + line_blt.src.x = (src_va & 63) / blocksize; + line_blt.src.va = src_va & ~63; + tmp = MIN2(size, 0x4000 - line_blt.src.x); + + line_blt.dst.x = (dst_va & 63) / blocksize; + line_blt.dst.va = dst_va & ~63; + tmp = MIN2(tmp, 0x4000 - line_blt.dst.x); + + line_blt.src.width = line_blt.dst.width = tmp; + + emit_blit_step(cmdbuf, &line_blt); + + src_va += tmp * blocksize; + dst_va += tmp * blocksize; + size -= tmp; + } + } else if ((blt->src.va & 63) || (blt->src.pitch & 63)) { + /* per line copy path (buffer_to_image) */ + assert(blt->type == TU_BLIT_COPY && !blt->src.tiled); + struct tu_blit line_blt = *blt; + uint64_t src_va = line_blt.src.va + blt->src.pitch * blt->src.y; + + line_blt.src.y = 0; + line_blt.src.pitch = 0; + line_blt.src.height = 1; + line_blt.dst.height = 1; + + for (unsigned y = 0; y < blt->src.height; y++) { + line_blt.src.x = blt->src.x + (src_va & 63) / vk_format_get_blocksize(blt->src.fmt); + line_blt.src.va = src_va & ~63; + + emit_blit_step(cmdbuf, &line_blt); + + line_blt.dst.y++; + src_va += blt->src.pitch; + } + } else if ((blt->dst.va & 63) || (blt->dst.pitch & 63)) { + /* per line copy path (image_to_buffer) */ + assert(blt->type == TU_BLIT_COPY && !blt->dst.tiled); + struct tu_blit line_blt = *blt; + uint64_t dst_va = line_blt.dst.va + blt->dst.pitch * blt->dst.y; + + line_blt.dst.y = 0; + line_blt.dst.pitch = 0; + line_blt.src.height = 1; + line_blt.dst.height = 1; + + for (unsigned y = 0; y < blt->src.height; y++) { + line_blt.dst.x = blt->dst.x + (dst_va & 63) / vk_format_get_blocksize(blt->dst.fmt); + line_blt.dst.va = dst_va & ~63; + + emit_blit_step(cmdbuf, &line_blt); + + line_blt.src.y++; + dst_va += blt->dst.pitch; + } + } else { + emit_blit_step(cmdbuf, blt); + } + blt->dst.va += blt->dst.layer_size; + blt->src.va += blt->src.layer_size; + blt->dst.ubwc_va += blt->dst.ubwc_size; + blt->src.ubwc_va += blt->src.ubwc_size; + } + + tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 17); + + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_FLUSH_TS, true); + tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_INVALIDATE, false); +} diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_blit.h mesa-20.0.8/src/freedreno/vulkan/tu_blit.h --- mesa-19.2.8/src/freedreno/vulkan/tu_blit.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_blit.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,125 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Jonathan Marek + * + */ + +#ifndef TU_BLIT_H +#define TU_BLIT_H + +#include "tu_private.h" + +#include "vk_format.h" + +struct tu_blit_surf { + VkFormat fmt; + enum a6xx_tile_mode tile_mode; + bool tiled; + uint64_t va; + uint32_t pitch, layer_size; + uint32_t x, y; + uint32_t width, height; + unsigned samples; + uint64_t ubwc_va; + uint32_t ubwc_pitch; + uint32_t ubwc_size; +}; + +static inline struct tu_blit_surf +tu_blit_surf(struct tu_image *image, + VkImageSubresourceLayers subres, + const VkOffset3D *offsets) +{ + unsigned layer = subres.baseArrayLayer; + if (image->type == VK_IMAGE_TYPE_3D) { + assert(layer == 0); + layer = MIN2(offsets[0].z, offsets[1].z); + } + + return (struct tu_blit_surf) { + .fmt = image->vk_format, + .tile_mode = tu6_get_image_tile_mode(image, subres.mipLevel), + .tiled = image->layout.tile_mode != TILE6_LINEAR, + .va = tu_image_base(image, subres.mipLevel, layer), + .pitch = tu_image_stride(image, subres.mipLevel), + .layer_size = tu_layer_size(image, subres.mipLevel), + .x = MIN2(offsets[0].x, offsets[1].x), + .y = MIN2(offsets[0].y, offsets[1].y), + .width = abs(offsets[1].x - offsets[0].x), + .height = abs(offsets[1].y - offsets[0].y), + .samples = image->samples, + .ubwc_va = tu_image_ubwc_base(image, subres.mipLevel, layer), + .ubwc_pitch = tu_image_ubwc_pitch(image, subres.mipLevel), + .ubwc_size = tu_image_ubwc_size(image, subres.mipLevel), + }; +} + +static inline struct tu_blit_surf +tu_blit_surf_ext(struct tu_image *image, + VkImageSubresourceLayers subres, + VkOffset3D offset, + VkExtent3D extent) +{ + return tu_blit_surf(image, subres, (VkOffset3D[]) { + offset, {.x = offset.x + extent.width, + .y = offset.y + extent.height, + .z = offset.z} + }); +} + +static inline struct tu_blit_surf +tu_blit_surf_whole(struct tu_image *image, int level, int layer) +{ + return tu_blit_surf(image, (VkImageSubresourceLayers){ + .mipLevel = level, + .baseArrayLayer = layer, + }, (VkOffset3D[]) { + {}, { + u_minify(image->extent.width, level), + u_minify(image->extent.height, level), + } + }); +} + +enum tu_blit_type { + TU_BLIT_DEFAULT, + TU_BLIT_COPY, + TU_BLIT_CLEAR, +}; + +struct tu_blit { + struct tu_blit_surf dst; + struct tu_blit_surf src; + uint32_t layers; + bool filter; + bool stencil_read; + bool buffer; /* 1d copy/clear */ + enum a6xx_rotation rotation; + uint32_t clear_value[4]; + enum tu_blit_type type; +}; + +void tu_blit(struct tu_cmd_buffer *cmdbuf, struct tu_blit *blt); + +#endif /* TU_BLIT_H */ diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_cmd_buffer.c mesa-20.0.8/src/freedreno/vulkan/tu_cmd_buffer.c --- mesa-19.2.8/src/freedreno/vulkan/tu_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_cmd_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,11 +29,13 @@ #include "registers/adreno_pm4.xml.h" #include "registers/adreno_common.xml.h" -#include "registers/a6xx.xml.h" #include "vk_format.h" #include "tu_cs.h" +#include "tu_blit.h" + +#define OVERFLOW_FLAG_REG REG_A6XX_CP_SCRATCH_REG(0) void tu_bo_list_init(struct tu_bo_list *list) @@ -61,6 +63,8 @@ tu_bo_list_add_info(struct tu_bo_list *list, const struct drm_msm_gem_submit_bo *bo_info) { + assert(bo_info->handle != 0); + for (uint32_t i = 0; i < list->count; ++i) { if (list->bo_infos[i].handle == bo_info->handle) { assert(list->bo_infos[i].presumed == bo_info->presumed); @@ -107,28 +111,10 @@ return VK_SUCCESS; } -static VkResult -tu_tiling_config_update_gmem_layout(struct tu_tiling_config *tiling, - const struct tu_device *dev) -{ - const uint32_t gmem_size = dev->physical_device->gmem_size; - uint32_t offset = 0; - - for (uint32_t i = 0; i < tiling->buffer_count; i++) { - /* 16KB-aligned */ - offset = align(offset, 0x4000); - - tiling->gmem_offsets[i] = offset; - offset += tiling->tile0.extent.width * tiling->tile0.extent.height * - tiling->buffer_cpp[i]; - } - - return offset <= gmem_size ? VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY; -} - static void tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling, - const struct tu_device *dev) + const struct tu_device *dev, + uint32_t pixels) { const uint32_t tile_align_w = dev->physical_device->tile_align_w; const uint32_t tile_align_h = dev->physical_device->tile_align_h; @@ -164,15 +150,17 @@ } /* do not exceed gmem size */ - while (tu_tiling_config_update_gmem_layout(tiling, dev) != VK_SUCCESS) { - if (tiling->tile0.extent.width > tiling->tile0.extent.height) { + while (tiling->tile0.extent.width * tiling->tile0.extent.height > pixels) { + if (tiling->tile0.extent.width > MAX2(tile_align_w, tiling->tile0.extent.height)) { tiling->tile_count.width++; tiling->tile0.extent.width = - align(ra_width / tiling->tile_count.width, tile_align_w); + align(DIV_ROUND_UP(ra_width, tiling->tile_count.width), tile_align_w); } else { + /* if this assert fails then layout is impossible.. */ + assert(tiling->tile0.extent.height > tile_align_h); tiling->tile_count.height++; tiling->tile0.extent.height = - align(ra_height / tiling->tile_count.height, tile_align_h); + align(DIV_ROUND_UP(ra_height, tiling->tile_count.height), tile_align_h); } } } @@ -216,8 +204,8 @@ const uint32_t used_pipe_count = tiling->pipe_count.width * tiling->pipe_count.height; const VkExtent2D last_pipe = { - .width = tiling->tile_count.width % tiling->pipe0.width, - .height = tiling->tile_count.height % tiling->pipe0.height, + .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1, + .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1, }; assert(used_pipe_count <= max_pipe_count); @@ -248,37 +236,6 @@ } static void -tu_tiling_config_update(struct tu_tiling_config *tiling, - const struct tu_device *dev, - const uint32_t *buffer_cpp, - uint32_t buffer_count, - const VkRect2D *render_area) -{ - /* see if there is any real change */ - const bool ra_changed = - render_area && - memcmp(&tiling->render_area, render_area, sizeof(*render_area)); - const bool buf_changed = tiling->buffer_count != buffer_count || - memcmp(tiling->buffer_cpp, buffer_cpp, - sizeof(*buffer_cpp) * buffer_count); - if (!ra_changed && !buf_changed) - return; - - if (ra_changed) - tiling->render_area = *render_area; - - if (buf_changed) { - memcpy(tiling->buffer_cpp, buffer_cpp, - sizeof(*buffer_cpp) * buffer_count); - tiling->buffer_count = buffer_count; - } - - tu_tiling_config_update_tile_layout(tiling, dev); - tu_tiling_config_update_pipe_layout(tiling, dev); - tu_tiling_config_update_pipes(tiling, dev); -} - -static void tu_tiling_config_get_tile(const struct tu_tiling_config *tiling, const struct tu_device *dev, uint32_t tx, @@ -314,8 +271,8 @@ : tile->begin.y + tiling->tile0.extent.height; } -static enum a3xx_msaa_samples -tu6_msaa_samples(uint32_t samples) +enum a3xx_msaa_samples +tu_msaa_samples(uint32_t samples) { switch (samples) { case 1: @@ -352,18 +309,23 @@ tu_cs_emit_write_reg(cs, cmd->marker_reg, ++cmd->marker_seqno); } -void +unsigned tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum vgt_event_type event, bool need_seqno) { + unsigned seqno = 0; + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1); tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); if (need_seqno) { tu_cs_emit_qw(cs, cmd->scratch_bo.iova); - tu_cs_emit(cs, ++cmd->scratch_seqno); + seqno = ++cmd->scratch_seqno; + tu_cs_emit(cs, seqno); } + + return seqno; } static void @@ -387,62 +349,84 @@ } } +#define tu_image_view_ubwc_pitches(iview) \ + .pitch = tu_image_ubwc_pitch(iview->image, iview->base_mip), \ + .array_pitch = tu_image_ubwc_size(iview->image, iview->base_mip) >> 2 + static void -tu6_emit_zs(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_zs(struct tu_cmd_buffer *cmd, + const struct tu_subpass *subpass, + struct tu_cs *cs) { - const struct tu_subpass *subpass = cmd->state.subpass; + const struct tu_framebuffer *fb = cmd->state.framebuffer; const uint32_t a = subpass->depth_stencil_attachment.attachment; if (a == VK_ATTACHMENT_UNUSED) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */ + tu_cs_emit_regs(cs, + A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE), + A6XX_RB_DEPTH_BUFFER_PITCH(0), + A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0), + A6XX_RB_DEPTH_BUFFER_BASE(0), + A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0)); + + tu_cs_emit_regs(cs, + A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); + + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_BUFFER_BASE(0), + A6XX_GRAS_LRZ_BUFFER_PITCH(0), + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0)); - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); - tu_cs_emit(cs, - A6XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5); - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ - tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ - tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ - tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ - tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */ - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 1); - tu_cs_emit(cs, 0x00000000); /* RB_STENCIL_INFO */ + tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0)); return; } + const struct tu_image_view *iview = fb->attachments[a].attachment; + enum a6xx_depth_format fmt = tu6_pipe2depth(iview->vk_format); + + tu_cs_emit_regs(cs, + A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt), + A6XX_RB_DEPTH_BUFFER_PITCH(tu_image_stride(iview->image, iview->base_mip)), + A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(iview->image->layout.layer_size), + A6XX_RB_DEPTH_BUFFER_BASE(tu_image_view_base_ref(iview)), + A6XX_RB_DEPTH_BUFFER_BASE_GMEM(cmd->state.pass->attachments[a].gmem_offset)); + + tu_cs_emit_regs(cs, + A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); + + tu_cs_emit_regs(cs, + A6XX_RB_DEPTH_FLAG_BUFFER_BASE(tu_image_view_ubwc_base_ref(iview)), + A6XX_RB_DEPTH_FLAG_BUFFER_PITCH(tu_image_view_ubwc_pitches(iview))); + + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_BUFFER_BASE(0), + A6XX_GRAS_LRZ_BUFFER_PITCH(0), + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0)); + + tu_cs_emit_regs(cs, + A6XX_RB_STENCIL_INFO(0)); + /* enable zs? */ } static void -tu6_emit_mrt(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_mrt(struct tu_cmd_buffer *cmd, + const struct tu_subpass *subpass, + struct tu_cs *cs) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_subpass *subpass = cmd->state.subpass; - const struct tu_tiling_config *tiling = &cmd->state.tiling_config; unsigned char mrt_comp[MAX_RTS] = { 0 }; unsigned srgb_cntl = 0; - uint32_t gmem_index = 0; for (uint32_t i = 0; i < subpass->color_count; ++i) { uint32_t a = subpass->color_attachments[i].attachment; if (a == VK_ATTACHMENT_UNUSED) continue; const struct tu_image_view *iview = fb->attachments[a].attachment; - const struct tu_image_level *slice = - &iview->image->levels[iview->base_mip]; - const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; - uint32_t stride = 0; - uint32_t offset = 0; + const enum a6xx_tile_mode tile_mode = + tu6_get_image_tile_mode(iview->image, iview->base_mip); mrt_comp[i] = 0xf; @@ -453,92 +437,81 @@ tu6_get_native_format(iview->vk_format); assert(format && format->rb >= 0); - offset = slice->offset + slice->size * iview->base_layer; - stride = slice->pitch * vk_format_get_blocksize(iview->vk_format); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); - tu_cs_emit(cs, A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format->rb) | - A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | - A6XX_RB_MRT_BUF_INFO_COLOR_SWAP(format->swap)); - tu_cs_emit(cs, A6XX_RB_MRT_PITCH(stride)); - tu_cs_emit(cs, A6XX_RB_MRT_ARRAY_PITCH(slice->size)); - tu_cs_emit_qw(cs, iview->image->bo->iova + iview->image->bo_offset + - offset); /* BASE_LO/HI */ - tu_cs_emit( - cs, tiling->gmem_offsets[gmem_index++]); /* RB_MRT[i].BASE_GMEM */ - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_MRT_REG(i), 1); - tu_cs_emit(cs, A6XX_SP_FS_MRT_REG_COLOR_FORMAT(format->rb)); - -#if 0 - /* when we support UBWC, these would be the system memory - * addr/pitch/etc: - */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 4); - tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ - tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ - tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_PITCH(0)); - tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_ARRAY_PITCH(0)); -#endif - } - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_SRGB_CNTL, 1); - tu_cs_emit(cs, srgb_cntl); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_SRGB_CNTL, 1); - tu_cs_emit(cs, srgb_cntl); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_COMPONENTS, 1); - tu_cs_emit(cs, A6XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A6XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A6XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A6XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A6XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A6XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A6XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A6XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_RENDER_COMPONENTS, 1); - tu_cs_emit(cs, A6XX_SP_FS_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT7(mrt_comp[7])); -} - -static void -tu6_emit_msaa(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - const struct tu_subpass *subpass = cmd->state.subpass; - const enum a3xx_msaa_samples samples = - tu6_msaa_samples(subpass->max_sample_count); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2); - tu_cs_emit(cs, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples)); - tu_cs_emit( - cs, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) | - ((samples == MSAA_ONE) ? A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE - : 0)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2); - tu_cs_emit(cs, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples)); - tu_cs_emit( - cs, - A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) | - ((samples == MSAA_ONE) ? A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE : 0)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_RAS_MSAA_CNTL, 2); - tu_cs_emit(cs, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); - tu_cs_emit( - cs, - A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | - ((samples == MSAA_ONE) ? A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE : 0)); + tu_cs_emit_regs(cs, + A6XX_RB_MRT_BUF_INFO(i, + .color_tile_mode = tile_mode, + .color_format = format->rb, + .color_swap = format->swap), + A6XX_RB_MRT_PITCH(i, tu_image_stride(iview->image, iview->base_mip)), + A6XX_RB_MRT_ARRAY_PITCH(i, iview->image->layout.layer_size), + A6XX_RB_MRT_BASE(i, tu_image_view_base_ref(iview)), + A6XX_RB_MRT_BASE_GMEM(i, cmd->state.pass->attachments[a].gmem_offset)); + + tu_cs_emit_regs(cs, + A6XX_SP_FS_MRT_REG(i, + .color_format = format->rb, + .color_sint = vk_format_is_sint(iview->vk_format), + .color_uint = vk_format_is_uint(iview->vk_format))); + + tu_cs_emit_regs(cs, + A6XX_RB_MRT_FLAG_BUFFER_ADDR(i, tu_image_view_ubwc_base_ref(iview)), + A6XX_RB_MRT_FLAG_BUFFER_PITCH(i, tu_image_view_ubwc_pitches(iview))); + } + + tu_cs_emit_regs(cs, + A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); + + tu_cs_emit_regs(cs, + A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); + + tu_cs_emit_regs(cs, + A6XX_RB_RENDER_COMPONENTS( + .rt0 = mrt_comp[0], + .rt1 = mrt_comp[1], + .rt2 = mrt_comp[2], + .rt3 = mrt_comp[3], + .rt4 = mrt_comp[4], + .rt5 = mrt_comp[5], + .rt6 = mrt_comp[6], + .rt7 = mrt_comp[7])); + + tu_cs_emit_regs(cs, + A6XX_SP_FS_RENDER_COMPONENTS( + .rt0 = mrt_comp[0], + .rt1 = mrt_comp[1], + .rt2 = mrt_comp[2], + .rt3 = mrt_comp[3], + .rt4 = mrt_comp[4], + .rt5 = mrt_comp[5], + .rt6 = mrt_comp[6], + .rt7 = mrt_comp[7])); +} + +static void +tu6_emit_msaa(struct tu_cmd_buffer *cmd, + const struct tu_subpass *subpass, + struct tu_cs *cs) +{ + const enum a3xx_msaa_samples samples = tu_msaa_samples(subpass->samples); + bool msaa_disable = samples == MSAA_ONE; + + tu_cs_emit_regs(cs, + A6XX_SP_TP_RAS_MSAA_CNTL(samples), + A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples, + .msaa_disable = msaa_disable)); + + tu_cs_emit_regs(cs, + A6XX_GRAS_RAS_MSAA_CNTL(samples), + A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples, + .msaa_disable = msaa_disable)); + + tu_cs_emit_regs(cs, + A6XX_RB_RAS_MSAA_CNTL(samples), + A6XX_RB_DEST_MSAA_CNTL(.samples = samples, + .msaa_disable = msaa_disable)); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MSAA_CNTL, 1); - tu_cs_emit(cs, A6XX_RB_MSAA_CNTL_SAMPLES(samples)); + tu_cs_emit_regs(cs, + A6XX_RB_MSAA_CNTL(samples)); } static void @@ -548,18 +521,20 @@ const uint32_t bin_w = tiling->tile0.extent.width; const uint32_t bin_h = tiling->tile0.extent.height; - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_BIN_CONTROL, 1); - tu_cs_emit(cs, A6XX_GRAS_BIN_CONTROL_BINW(bin_w) | - A6XX_GRAS_BIN_CONTROL_BINH(bin_h) | flags); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL, 1); - tu_cs_emit(cs, A6XX_RB_BIN_CONTROL_BINW(bin_w) | - A6XX_RB_BIN_CONTROL_BINH(bin_h) | flags); + tu_cs_emit_regs(cs, + A6XX_GRAS_BIN_CONTROL(.binw = bin_w, + .binh = bin_h, + .dword = flags)); + + tu_cs_emit_regs(cs, + A6XX_RB_BIN_CONTROL(.binw = bin_w, + .binh = bin_h, + .dword = flags)); /* no flag for RB_BIN_CONTROL2... */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL2, 1); - tu_cs_emit(cs, A6XX_RB_BIN_CONTROL2_BINW(bin_w) | - A6XX_RB_BIN_CONTROL2_BINH(bin_h)); + tu_cs_emit_regs(cs, + A6XX_RB_BIN_CONTROL2(.binw = bin_w, + .binh = bin_h)); } static void @@ -579,19 +554,25 @@ } static void -tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align) { const VkRect2D *render_area = &cmd->state.tiling_config.render_area; - const uint32_t x1 = render_area->offset.x; - const uint32_t y1 = render_area->offset.y; - const uint32_t x2 = x1 + render_area->extent.width - 1; - const uint32_t y2 = y1 + render_area->extent.height - 1; - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); - tu_cs_emit(cs, - A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); - tu_cs_emit(cs, - A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); + uint32_t x1 = render_area->offset.x; + uint32_t y1 = render_area->offset.y; + uint32_t x2 = x1 + render_area->extent.width - 1; + uint32_t y2 = y1 + render_area->extent.height - 1; + + /* TODO: alignment requirement seems to be less than tile_align_w/h */ + if (align) { + x1 = x1 & ~cmd->device->physical_device->tile_align_w; + y1 = y1 & ~cmd->device->physical_device->tile_align_h; + x2 = ALIGN_POT(x2 + 1, cmd->device->physical_device->tile_align_w) - 1; + y2 = ALIGN_POT(y2 + 1, cmd->device->physical_device->tile_align_h) - 1; + } + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1), + A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2)); } static void @@ -599,78 +580,36 @@ struct tu_cs *cs, const struct tu_image_view *iview, uint32_t gmem_offset, - uint32_t blit_info) -{ - const struct tu_image_level *slice = - &iview->image->levels[iview->base_mip]; - const uint32_t offset = slice->offset + slice->size * iview->base_layer; - const uint32_t stride = - slice->pitch * vk_format_get_blocksize(iview->vk_format); - const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; - const enum a3xx_msaa_samples samples = tu6_msaa_samples(1); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); - tu_cs_emit(cs, blit_info); - - /* tile mode? */ - const struct tu_native_format *format = - tu6_get_native_format(iview->vk_format); - assert(format && format->rb >= 0); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 5); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) | - A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(format->swap)); - tu_cs_emit_qw(cs, - iview->image->bo->iova + iview->image->bo_offset + offset); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(stride)); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_ARRAY_PITCH(slice->size)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, gmem_offset); -} - -static void -tu6_emit_blit_clear(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_image_view *iview, - uint32_t gmem_offset, - const VkClearValue *clear_value) + bool resolve) { - const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; - const enum a3xx_msaa_samples samples = tu6_msaa_samples(1); + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_INFO(.unk0 = !resolve, .gmem = !resolve)); const struct tu_native_format *format = tu6_get_native_format(iview->vk_format); assert(format && format->rb >= 0); - /* must be WZYX; other values are ignored */ - const enum a3xx_color_swap swap = WZYX; - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) | - A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(swap)); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); - tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - tu_cs_emit(cs, gmem_offset); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); - tu_cs_emit(cs, 0); - - /* pack clear_value into WZYX order */ - uint32_t clear_vals[4] = { 0 }; - tu_pack_clear_value(clear_value, iview->vk_format, clear_vals); + enum a6xx_tile_mode tile_mode = + tu6_get_image_tile_mode(iview->image, iview->base_mip); + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_DST_INFO( + .tile_mode = tile_mode, + .samples = tu_msaa_samples(iview->image->samples), + .color_format = format->rb, + .color_swap = format->swap, + .flags = iview->image->layout.ubwc_size != 0), + A6XX_RB_BLIT_DST(tu_image_view_base_ref(iview)), + A6XX_RB_BLIT_DST_PITCH(tu_image_stride(iview->image, iview->base_mip)), + A6XX_RB_BLIT_DST_ARRAY_PITCH(iview->image->layout.layer_size)); + + if (iview->image->layout.ubwc_size) { + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_FLAG_DST(tu_image_view_ubwc_base_ref(iview)), + A6XX_RB_BLIT_FLAG_DST_PITCH(tu_image_view_ubwc_pitches(iview))); + } - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); - tu_cs_emit(cs, clear_vals[0]); - tu_cs_emit(cs, clear_vals[1]); - tu_cs_emit(cs, clear_vals[2]); - tu_cs_emit(cs, clear_vals[3]); + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_BASE_GMEM(gmem_offset)); } static void @@ -689,17 +628,13 @@ uint32_t x2, uint32_t y2) { - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | - A6XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); - tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | - A6XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RESOLVE_CNTL_1, 2); - tu_cs_emit( - cs, A6XX_GRAS_RESOLVE_CNTL_1_X(x1) | A6XX_GRAS_RESOLVE_CNTL_1_Y(y1)); - tu_cs_emit( - cs, A6XX_GRAS_RESOLVE_CNTL_2_X(x2) | A6XX_GRAS_RESOLVE_CNTL_2_Y(y2)); + tu_cs_emit_regs(cs, + A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), + A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); + + tu_cs_emit_regs(cs, + A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1), + A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2)); } static void @@ -708,19 +643,28 @@ uint32_t x1, uint32_t y1) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET, 1); - tu_cs_emit(cs, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1)); + tu_cs_emit_regs(cs, + A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1)); + + tu_cs_emit_regs(cs, + A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1)); + + tu_cs_emit_regs(cs, + A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1)); + + tu_cs_emit_regs(cs, + A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1)); +} + +static bool +use_hw_binning(struct tu_cmd_buffer *cmd) +{ + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET2, 1); - tu_cs_emit(cs, - A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1)); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_WINDOW_OFFSET, 1); - tu_cs_emit(cs, A6XX_SP_WINDOW_OFFSET_X(x1) | A6XX_SP_WINDOW_OFFSET_Y(y1)); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); - tu_cs_emit( - cs, A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1)); + if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN)) + return false; + + return (tiling->tile_count.width * tiling->tile_count.height) > 2; } static void @@ -743,11 +687,53 @@ tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2); tu6_emit_window_offset(cmd, cs, x1, y1); - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_OVERRIDE, 1); - tu_cs_emit(cs, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); + tu_cs_emit_regs(cs, + A6XX_VPC_SO_OVERRIDE(.so_disable = true)); + + if (use_hw_binning(cmd)) { + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); + tu_cs_emit(cs, 0x0); + + tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); + tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | + A6XX_CP_REG_TEST_0_BIT(0) | + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(11)); + + /* if (no overflow) */ { + tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7); + tu_cs_emit(cs, cmd->state.tiling_config.pipe_sizes[tile->pipe] | + CP_SET_BIN_DATA5_0_VSC_N(tile->slot)); + tu_cs_emit_qw(cs, cmd->vsc_data.iova + tile->pipe * cmd->vsc_data_pitch); + tu_cs_emit_qw(cs, cmd->vsc_data.iova + (tile->pipe * 4) + (32 * cmd->vsc_data_pitch)); + tu_cs_emit_qw(cs, cmd->vsc_data2.iova + (tile->pipe * cmd->vsc_data2_pitch)); + + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); + tu_cs_emit(cs, 0x0); + + /* use a NOP packet to skip over the 'else' side: */ + tu_cs_emit_pkt7(cs, CP_NOP, 2); + } /* else */ { + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); + tu_cs_emit(cs, 0x1); + } + + tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); + tu_cs_emit(cs, 0x0); + + tu_cs_emit_regs(cs, + A6XX_RB_UNKNOWN_8804(0)); + + tu_cs_emit_regs(cs, + A6XX_SP_TP_UNKNOWN_B304(0)); - if (false) { - /* hw binning? */ + tu_cs_emit_regs(cs, + A6XX_GRAS_UNKNOWN_80A4(0)); } else { tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); tu_cs_emit(cs, 0x1); @@ -758,49 +744,120 @@ } static void -tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_load_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_subpass *subpass = cmd->state.subpass; const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - const struct tu_attachment_state *attachments = cmd->state.attachments; - - tu6_emit_blit_scissor(cmd, cs); + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_render_pass_attachment *attachment = + &cmd->state.pass->attachments[a]; - uint32_t gmem_index = 0; - for (uint32_t i = 0; i < subpass->color_count; ++i) { - const uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; + if (attachment->gmem_offset < 0) + return; - const struct tu_image_view *iview = fb->attachments[a].attachment; - const struct tu_attachment_state *att = attachments + a; - if (att->pending_clear_aspects) { - assert(att->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); - tu6_emit_blit_clear(cmd, cs, iview, - tiling->gmem_offsets[gmem_index++], - &att->clear_value); - } else { - tu6_emit_blit_info(cmd, cs, iview, - tiling->gmem_offsets[gmem_index++], - A6XX_RB_BLIT_INFO_UNK0 | A6XX_RB_BLIT_INFO_GMEM); - } + const uint32_t x1 = tiling->render_area.offset.x; + const uint32_t y1 = tiling->render_area.offset.y; + const uint32_t x2 = x1 + tiling->render_area.extent.width; + const uint32_t y2 = y1 + tiling->render_area.extent.height; + const uint32_t tile_x2 = + tiling->tile0.offset.x + tiling->tile0.extent.width * tiling->tile_count.width; + const uint32_t tile_y2 = + tiling->tile0.offset.y + tiling->tile0.extent.height * tiling->tile_count.height; + bool need_load = + x1 != tiling->tile0.offset.x || x2 != MIN2(fb->width, tile_x2) || + y1 != tiling->tile0.offset.y || y2 != MIN2(fb->height, tile_y2); + + if (need_load) + tu_finishme("improve handling of unaligned render area"); + + if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) + need_load = true; + + if (vk_format_has_stencil(iview->vk_format) && + attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) + need_load = true; + if (need_load) { + tu6_emit_blit_info(cmd, cs, iview, attachment->gmem_offset, false); tu6_emit_blit(cmd, cs); } - - /* load/clear zs? */ } static void -tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_emit_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_render_pass_attachment *attachment = + &cmd->state.pass->attachments[a]; + unsigned clear_mask = 0; + + /* note: this means it isn't used by any subpass and shouldn't be cleared anyway */ + if (attachment->gmem_offset < 0) + return; + + if (attachment->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + clear_mask = 0xf; - if (false) { - /* hw binning? */ + if (vk_format_has_stencil(iview->vk_format)) { + clear_mask &= 0x1; + if (attachment->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) + clear_mask |= 0x2; } + if (!clear_mask) + return; + + const struct tu_native_format *format = + tu6_get_native_format(iview->vk_format); + assert(format && format->rb >= 0); + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_DST_INFO(.color_format = format->rb)); + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_INFO(.gmem = true, + .clear_mask = clear_mask)); + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + + tu_cs_emit_regs(cs, + A6XX_RB_UNKNOWN_88D0(0)); + + uint32_t clear_vals[4] = { 0 }; + tu_pack_clear_value(&info->pClearValues[a], iview->vk_format, clear_vals); + + tu_cs_emit_regs(cs, + A6XX_RB_BLIT_CLEAR_COLOR_DW0(clear_vals[0]), + A6XX_RB_BLIT_CLEAR_COLOR_DW1(clear_vals[1]), + A6XX_RB_BLIT_CLEAR_COLOR_DW2(clear_vals[2]), + A6XX_RB_BLIT_CLEAR_COLOR_DW3(clear_vals[3])); + + tu6_emit_blit(cmd, cs); +} + +static void +tu6_emit_store_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + uint32_t gmem_a) +{ + if (cmd->state.pass->attachments[a].store_op == VK_ATTACHMENT_STORE_OP_DONT_CARE) + return; + + tu6_emit_blit_info(cmd, cs, + cmd->state.framebuffer->attachments[a].attachment, + cmd->state.pass->attachments[gmem_a].gmem_offset, true); + tu6_emit_blit(cmd, cs); +} + +static void +tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const struct tu_render_pass *pass = cmd->state.pass; + const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1]; tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) | @@ -817,26 +874,28 @@ tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE) | 0x10); tu6_emit_marker(cmd, cs); - tu6_emit_blit_scissor(cmd, cs); + tu6_emit_blit_scissor(cmd, cs, true); - uint32_t gmem_index = 0; - for (uint32_t i = 0; i < cmd->state.subpass->color_count; ++i) { - uint32_t a = cmd->state.subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; + for (uint32_t a = 0; a < pass->attachment_count; ++a) { + if (pass->attachments[a].gmem_offset >= 0) + tu6_emit_store_attachment(cmd, cs, a, a); + } - const struct tu_image_view *iview = fb->attachments[a].attachment; - tu6_emit_blit_info(cmd, cs, iview, tiling->gmem_offsets[gmem_index++], - 0); - tu6_emit_blit(cmd, cs); + if (subpass->resolve_attachments) { + for (unsigned i = 0; i < subpass->color_count; i++) { + uint32_t a = subpass->resolve_attachments[i].attachment; + if (a != VK_ATTACHMENT_UNUSED) + tu6_emit_store_attachment(cmd, cs, a, + subpass->color_attachments[i].attachment); + } } } static void tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index) { - tu_cs_emit_pkt4(cs, REG_A6XX_PC_RESTART_INDEX, 1); - tu_cs_emit(cs, restart_index); + tu_cs_emit_regs(cs, + A6XX_PC_RESTART_INDEX(restart_index)); } static void @@ -873,7 +932,7 @@ tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5); - tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A009, 0x00000001); + tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010); tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f); @@ -909,8 +968,8 @@ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0); tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B07, 0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0); @@ -954,66 +1013,277 @@ tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(0), 3); - tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_LO_0 */ - tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_HI_0 */ - tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_SIZE_0 */ - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE_LO(0), 2); - tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_LO_0 */ - tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_HI_0 */ - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUF_CNTL, 1); - tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUF_CNTL */ - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(0), 1); - tu_cs_emit(cs, 0x00000000); /* UNKNOWN_E2AB */ - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(1), 3); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(1), 6); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(2), 6); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(3), 3); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, 0x00000000); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CTRL_REG0, 1); - tu_cs_emit(cs, 0x00000000); + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_BASE(0), + A6XX_VPC_SO_BUFFER_SIZE(0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_FLUSH_BASE(0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUF_CNTL(0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_OFFSET(0, 0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_BASE(1, 0), + A6XX_VPC_SO_BUFFER_SIZE(1, 0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_OFFSET(1, 0), + A6XX_VPC_SO_FLUSH_BASE(1, 0), + A6XX_VPC_SO_BUFFER_BASE(2, 0), + A6XX_VPC_SO_BUFFER_SIZE(2, 0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_OFFSET(2, 0), + A6XX_VPC_SO_FLUSH_BASE(2, 0), + A6XX_VPC_SO_BUFFER_BASE(3, 0), + A6XX_VPC_SO_BUFFER_SIZE(3, 0)); + + tu_cs_emit_regs(cs, + A6XX_VPC_SO_BUFFER_OFFSET(3, 0), + A6XX_VPC_SO_FLUSH_BASE(3, 0)); + + tu_cs_emit_regs(cs, + A6XX_SP_HS_CTRL_REG0(0)); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CTRL_REG0, 1); - tu_cs_emit(cs, 0x00000000); + tu_cs_emit_regs(cs, + A6XX_SP_GS_CTRL_REG0(0)); - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1); - tu_cs_emit(cs, 0x00000000); + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_CNTL(0)); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_LRZ_CNTL, 1); - tu_cs_emit(cs, 0x00000000); + tu_cs_emit_regs(cs, + A6XX_RB_LRZ_CNTL(0)); tu_cs_sanity_check(cs); } static void +tu6_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + unsigned seqno; + + seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_AND_INV_EVENT, true); + + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova); + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(seqno)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + + seqno = tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_GTE, 4); + tu_cs_emit(cs, CP_WAIT_MEM_GTE_0_RESERVED(0)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova); + tu_cs_emit(cs, CP_WAIT_MEM_GTE_3_REF(seqno)); +} + +static void +update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + + tu_cs_emit_regs(cs, + A6XX_VSC_BIN_SIZE(.width = tiling->tile0.extent.width, + .height = tiling->tile0.extent.height), + A6XX_VSC_SIZE_ADDRESS(.bo = &cmd->vsc_data, + .bo_offset = 32 * cmd->vsc_data_pitch)); + + tu_cs_emit_regs(cs, + A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width, + .ny = tiling->tile_count.height)); + + tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); + for (unsigned i = 0; i < 32; i++) + tu_cs_emit(cs, tiling->pipe_config[i]); + + tu_cs_emit_regs(cs, + A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = &cmd->vsc_data2), + A6XX_VSC_PIPE_DATA2_PITCH(cmd->vsc_data2_pitch), + A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(cmd->vsc_data2.size)); + + tu_cs_emit_regs(cs, + A6XX_VSC_PIPE_DATA_ADDRESS(.bo = &cmd->vsc_data), + A6XX_VSC_PIPE_DATA_PITCH(cmd->vsc_data_pitch), + A6XX_VSC_PIPE_DATA_ARRAY_PITCH(cmd->vsc_data.size)); +} + +static void +emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + const uint32_t used_pipe_count = + tiling->pipe_count.width * tiling->pipe_count.height; + + /* Clear vsc_scratch: */ + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH); + tu_cs_emit(cs, 0x0); + + /* Check for overflow, write vsc_scratch if detected: */ + for (int i = 0; i < used_pipe_count; i++) { + tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); + tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | + CP_COND_WRITE5_0_WRITE_MEMORY); + tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE_REG(i))); + tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); + tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data_pitch)); + tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH); + tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_data_pitch)); + + tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); + tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | + CP_COND_WRITE5_0_WRITE_MEMORY); + tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_SIZE2_REG(i))); + tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); + tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_data2_pitch)); + tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH); + tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_data2_pitch)); + } + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); + tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) | + CP_MEM_TO_REG_0_CNT(1 - 1)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_SCRATCH); + + /* + * This is a bit awkward, we really want a way to invert the + * CP_REG_TEST/CP_COND_REG_EXEC logic, so that we can conditionally + * execute cmds to use hwbinning when a bit is *not* set. This + * dance is to invert OVERFLOW_FLAG_REG + * + * A CP_NOP packet is used to skip executing the 'else' clause + * if (b0 set).. + */ + + /* b0 will be set if VSC_DATA or VSC_DATA2 overflow: */ + tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); + tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | + A6XX_CP_REG_TEST_0_BIT(0) | + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); + tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(7)); + + /* if (b0 set) */ { + /* + * On overflow, mirror the value to control->vsc_overflow + * which CPU is checking to detect overflow (see + * check_vsc_overflow()) + */ + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(OVERFLOW_FLAG_REG) | + CP_REG_TO_MEM_0_CNT(0)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_OVERFLOW); + + tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1); + tu_cs_emit(cs, 0x0); + + tu_cs_emit_pkt7(cs, CP_NOP, 2); /* skip 'else' when 'if' is taken */ + } /* else */ { + tu_cs_emit_pkt4(cs, OVERFLOW_FLAG_REG, 1); + tu_cs_emit(cs, 0x1); + } +} + +static void +tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + + uint32_t x1 = tiling->tile0.offset.x; + uint32_t y1 = tiling->tile0.offset.y; + uint32_t x2 = tiling->render_area.offset.x + tiling->render_area.extent.width - 1; + uint32_t y2 = tiling->render_area.offset.y + tiling->render_area.extent.height - 1; + + tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2); + + tu6_emit_marker(cmd, cs); + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING)); + tu6_emit_marker(cmd, cs); + + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); + tu_cs_emit(cs, 0x1); + + tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); + tu_cs_emit(cs, 0x1); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_regs(cs, + A6XX_VFD_MODE_CNTL(.binning_pass = true)); + + update_vsc_pipe(cmd, cs); + + tu_cs_emit_regs(cs, + A6XX_PC_UNKNOWN_9805(.unknown = 0x1)); + + tu_cs_emit_regs(cs, + A6XX_SP_UNKNOWN_A0F8(.unknown = 0x1)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, UNK_2C); + + tu_cs_emit_regs(cs, + A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0)); + + tu_cs_emit_regs(cs, + A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0)); + + /* emit IB to binning drawcmds: */ + tu_cs_emit_call(cs, &cmd->draw_cs); + + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); + tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, UNK_2D); + + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + tu6_cache_flush(cmd, cs); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + emit_vsc_overflow_test(cmd, cs); + + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); + tu_cs_emit(cs, 0x0); + + tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); + tu_cs_emit(cs, 0x0); + + tu_cs_emit_wfi(cs); + + tu_cs_emit_regs(cs, + A6XX_RB_CCU_CNTL(.unknown = 0x7c400004)); + + cmd->wait_for_idle = false; +} + +static void tu6_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 1024); if (result != VK_SUCCESS) { cmd->record_result = result; return; @@ -1030,18 +1300,29 @@ /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ tu6_emit_wfi(cmd, cs); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_CCU_CNTL, 1); - tu_cs_emit(cs, 0x7c400004); /* RB_CCU_CNTL */ + tu_cs_emit_regs(cs, + A6XX_RB_CCU_CNTL(0x7c400004)); + + if (use_hw_binning(cmd)) { + tu6_emit_bin_size(cmd, cs, A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000); + + tu6_emit_render_cntl(cmd, cs, true); + + tu6_emit_binning_pass(cmd, cs); - tu6_emit_zs(cmd, cs); - tu6_emit_mrt(cmd, cs); - tu6_emit_msaa(cmd, cs); + tu6_emit_bin_size(cmd, cs, A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000); - if (false) { - /* hw binning? */ + tu_cs_emit_regs(cs, + A6XX_VFD_MODE_CNTL(0)); + + tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = 0x1)); + + tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = 0x1)); + + tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + tu_cs_emit(cs, 0x1); } else { tu6_emit_bin_size(cmd, cs, 0x6000000); - /* no draws */ } tu6_emit_render_cntl(cmd, cs, false); @@ -1054,7 +1335,7 @@ struct tu_cs *cs, const struct tu_tile *tile) { - const uint32_t render_tile_space = 64 + tu_cs_get_call_size(&cmd->draw_cs); + const uint32_t render_tile_space = 256 + tu_cs_get_call_size(&cmd->draw_cs); VkResult result = tu_cs_reserve_space(cmd->device, cs, render_tile_space); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1067,6 +1348,22 @@ tu_cs_emit_call(cs, &cmd->draw_cs); cmd->wait_for_idle = true; + if (use_hw_binning(cmd)) { + tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); + tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | + A6XX_CP_REG_TEST_0_BIT(0) | + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); + + tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); + tu_cs_emit(cs, 0x10000000); + tu_cs_emit(cs, 2); /* conditionally execute next 2 dwords */ + + /* if (no overflow) */ { + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x5) | 0x10); + } + } + tu_cs_emit_ib(cs, &cmd->state.tile_store_ib); tu_cs_sanity_check(cs); @@ -1075,14 +1372,17 @@ static void tu6_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - VkResult result = tu_cs_reserve_space(cmd->device, cs, 16); + const uint32_t space = 16 + tu_cs_get_call_size(&cmd->draw_epilogue_cs); + VkResult result = tu_cs_reserve_space(cmd->device, cs, space); if (result != VK_SUCCESS) { cmd->record_result = result; return; } - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1); - tu_cs_emit(cs, A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_UNK3); + tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); + + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_CNTL(0)); tu6_emit_lrz_flush(cmd, cs); @@ -1110,39 +1410,53 @@ } static void -tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd) +tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd, + const VkRenderPassBeginInfo *info) { - const uint32_t tile_load_space = 16 + 32 * MAX_RTS; - const struct tu_subpass *subpass = cmd->state.subpass; - struct tu_attachment_state *attachments = cmd->state.attachments; + const uint32_t tile_load_space = + 8 + (23+19) * cmd->state.pass->attachment_count + + 21 + (13 * cmd->state.subpass->color_count + 8) + 11; + struct tu_cs sub_cs; - VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs, + VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, tile_load_space, &sub_cs); if (result != VK_SUCCESS) { cmd->record_result = result; return; } - /* emit to tile-load sub_cs */ - tu6_emit_tile_load(cmd, &sub_cs); + tu6_emit_blit_scissor(cmd, &sub_cs, true); - cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs); + for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) + tu6_emit_load_attachment(cmd, &sub_cs, i); - for (uint32_t i = 0; i < subpass->color_count; ++i) { - const uint32_t a = subpass->color_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED) - attachments[a].pending_clear_aspects = 0; - } + tu6_emit_blit_scissor(cmd, &sub_cs, false); + + for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) + tu6_emit_clear_attachment(cmd, &sub_cs, i, info); + + /* invalidate because reading input attachments will cache GMEM and + * the cache isn''t updated when GMEM is written + * TODO: is there a no-cache bit for textures? + */ + if (cmd->state.subpass->input_count) + tu6_emit_event_write(cmd, &sub_cs, CACHE_INVALIDATE, false); + + tu6_emit_zs(cmd, cmd->state.subpass, &sub_cs); + tu6_emit_mrt(cmd, cmd->state.subpass, &sub_cs); + tu6_emit_msaa(cmd, cmd->state.subpass, &sub_cs); + + cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs); } static void tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd) { - const uint32_t tile_store_space = 32 + 32 * MAX_RTS; + const uint32_t tile_store_space = 32 + 23 * cmd->state.pass->attachment_count; struct tu_cs sub_cs; - VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs, + VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, tile_store_space, &sub_cs); if (result != VK_SUCCESS) { cmd->record_result = result; @@ -1152,7 +1466,7 @@ /* emit to tile-store sub_cs */ tu6_emit_tile_store(cmd, &sub_cs); - cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs); + cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs); } static void @@ -1160,36 +1474,13 @@ const VkRect2D *render_area) { const struct tu_device *dev = cmd->device; - const struct tu_render_pass *pass = cmd->state.pass; - const struct tu_subpass *subpass = cmd->state.subpass; struct tu_tiling_config *tiling = &cmd->state.tiling_config; - uint32_t buffer_cpp[MAX_RTS + 2]; - uint32_t buffer_count = 0; - - for (uint32_t i = 0; i < subpass->color_count; ++i) { - const uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - const struct tu_render_pass_attachment *att = &pass->attachments[a]; - buffer_cpp[buffer_count++] = - vk_format_get_blocksize(att->format) * att->samples; - } - - if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { - const uint32_t a = subpass->depth_stencil_attachment.attachment; - const struct tu_render_pass_attachment *att = &pass->attachments[a]; + tiling->render_area = *render_area; - /* TODO */ - assert(att->format != VK_FORMAT_D32_SFLOAT_S8_UINT); - - buffer_cpp[buffer_count++] = - vk_format_get_blocksize(att->format) * att->samples; - } - - tu_tiling_config_update(tiling, dev, buffer_cpp, buffer_count, - render_area); + tu_tiling_config_update_tile_layout(tiling, dev, cmd->state.pass->gmem_pixels); + tu_tiling_config_update_pipe_layout(tiling, dev); + tu_tiling_config_update_pipes(tiling, dev); } const struct tu_dynamic_state default_dynamic_state = { @@ -1365,7 +1656,8 @@ tu_bo_list_init(&cmd_buffer->bo_list); tu_cs_init(&cmd_buffer->cs, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->draw_cs, TU_CS_MODE_GROW, 4096); - tu_cs_init(&cmd_buffer->tile_cs, TU_CS_MODE_SUB_STREAM, 1024); + tu_cs_init(&cmd_buffer->draw_epilogue_cs, TU_CS_MODE_GROW, 4096); + tu_cs_init(&cmd_buffer->sub_cs, TU_CS_MODE_SUB_STREAM, 2048); *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer); @@ -1376,15 +1668,40 @@ VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000); if (result != VK_SUCCESS) - return result; + goto fail_scratch_bo; + +#define VSC_DATA_SIZE(pitch) ((pitch) * 32 + 0x100) /* extra size to store VSC_SIZE */ +#define VSC_DATA2_SIZE(pitch) ((pitch) * 32) + + /* TODO: resize on overflow or compute a max size from # of vertices in renderpass?? */ + cmd_buffer->vsc_data_pitch = 0x440 * 4; + cmd_buffer->vsc_data2_pitch = 0x1040 * 4; + + result = tu_bo_init_new(device, &cmd_buffer->vsc_data, VSC_DATA_SIZE(cmd_buffer->vsc_data_pitch)); + if (result != VK_SUCCESS) + goto fail_vsc_data; + + result = tu_bo_init_new(device, &cmd_buffer->vsc_data2, VSC_DATA2_SIZE(cmd_buffer->vsc_data2_pitch)); + if (result != VK_SUCCESS) + goto fail_vsc_data2; return VK_SUCCESS; + +fail_vsc_data2: + tu_bo_finish(cmd_buffer->device, &cmd_buffer->vsc_data); +fail_vsc_data: + tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo); +fail_scratch_bo: + list_del(&cmd_buffer->pool_link); + return result; } static void tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) { tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo); + tu_bo_finish(cmd_buffer->device, &cmd_buffer->vsc_data); + tu_bo_finish(cmd_buffer->device, &cmd_buffer->vsc_data2); list_del(&cmd_buffer->pool_link); @@ -1393,7 +1710,8 @@ tu_cs_finish(cmd_buffer->device, &cmd_buffer->cs); tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_cs); - tu_cs_finish(cmd_buffer->device, &cmd_buffer->tile_cs); + tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_epilogue_cs); + tu_cs_finish(cmd_buffer->device, &cmd_buffer->sub_cs); tu_bo_list_destroy(&cmd_buffer->bo_list); vk_free(&cmd_buffer->pool->alloc, cmd_buffer); @@ -1409,10 +1727,10 @@ tu_bo_list_reset(&cmd_buffer->bo_list); tu_cs_reset(cmd_buffer->device, &cmd_buffer->cs); tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_cs); - tu_cs_reset(cmd_buffer->device, &cmd_buffer->tile_cs); + tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_epilogue_cs); + tu_cs_reset(cmd_buffer->device, &cmd_buffer->sub_cs); for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) { - cmd_buffer->descriptors[i].dirty = 0; cmd_buffer->descriptors[i].valid = 0; cmd_buffer->descriptors[i].push_dirty = false; } @@ -1422,86 +1740,20 @@ return cmd_buffer->record_result; } -static VkResult -tu_cmd_state_setup_attachments(struct tu_cmd_buffer *cmd_buffer, - const VkRenderPassBeginInfo *info) +VkResult +tu_AllocateCommandBuffers(VkDevice _device, + const VkCommandBufferAllocateInfo *pAllocateInfo, + VkCommandBuffer *pCommandBuffers) { - struct tu_cmd_state *state = &cmd_buffer->state; - const struct tu_framebuffer *fb = state->framebuffer; - const struct tu_render_pass *pass = state->pass; + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool); - for (uint32_t i = 0; i < fb->attachment_count; ++i) { - const struct tu_image_view *iview = fb->attachments[i].attachment; - tu_bo_list_add(&cmd_buffer->bo_list, iview->image->bo, - MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); - } + VkResult result = VK_SUCCESS; + uint32_t i; - if (pass->attachment_count == 0) { - state->attachments = NULL; - return VK_SUCCESS; - } + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - state->attachments = - vk_alloc(&cmd_buffer->pool->alloc, - pass->attachment_count * sizeof(state->attachments[0]), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (state->attachments == NULL) { - cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; - return cmd_buffer->record_result; - } - - for (uint32_t i = 0; i < pass->attachment_count; ++i) { - const struct tu_render_pass_attachment *att = &pass->attachments[i]; - VkImageAspectFlags att_aspects = vk_format_aspects(att->format); - VkImageAspectFlags clear_aspects = 0; - - if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { - /* color attachment */ - if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; - } - } else { - /* depthstencil attachment */ - if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && - att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; - if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && - att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) - clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && - att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - } - - state->attachments[i].pending_clear_aspects = clear_aspects; - state->attachments[i].cleared_views = 0; - if (clear_aspects && info) { - assert(info->clearValueCount > i); - state->attachments[i].clear_value = info->pClearValues[i]; - } - - state->attachments[i].current_layout = att->initial_layout; - } - - return VK_SUCCESS; -} - -VkResult -tu_AllocateCommandBuffers(VkDevice _device, - const VkCommandBufferAllocateInfo *pAllocateInfo, - VkCommandBuffer *pCommandBuffers) -{ - TU_FROM_HANDLE(tu_device, device, _device); - TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool); - - VkResult result = VK_SUCCESS; - uint32_t i; - - for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { - - if (!list_empty(&pool->free_cmd_buffers)) { + if (!list_is_empty(&pool->free_cmd_buffers)) { struct tu_cmd_buffer *cmd_buffer = list_first_entry( &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link); @@ -1589,6 +1841,8 @@ cmd_buffer->usage_flags = pBeginInfo->flags; tu_cs_begin(&cmd_buffer->cs); + tu_cs_begin(&cmd_buffer->draw_cs); + tu_cs_begin(&cmd_buffer->draw_epilogue_cs); cmd_buffer->marker_seqno = 0; cmd_buffer->scratch_seqno = 0; @@ -1602,6 +1856,11 @@ default: break; } + } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && + (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { + assert(pBeginInfo->pInheritanceInfo); + cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); + cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; } cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; @@ -1673,6 +1932,30 @@ uint32_t dynamicOffsetCount, const uint32_t *pDynamicOffsets) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout); + unsigned dyn_idx = 0; + + struct tu_descriptor_state *descriptors_state = + tu_get_descriptors_state(cmd_buffer, pipelineBindPoint); + + for (unsigned i = 0; i < descriptorSetCount; ++i) { + unsigned idx = i + firstSet; + TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]); + + descriptors_state->sets[idx] = set; + descriptors_state->valid |= (1u << idx); + + for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) { + unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start; + assert(dyn_idx < dynamicOffsetCount); + + descriptors_state->dynamic_buffers[idx] = + set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx]; + } + } + + cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS; } void @@ -1683,6 +1966,9 @@ uint32_t size, const void *pValues) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + memcpy((void*) cmd->push_constants + offset, pValues, size); + cmd->state.dirty |= TU_CMD_DIRTY_PUSH_CONSTANTS; } VkResult @@ -1695,19 +1981,31 @@ MSM_SUBMIT_BO_WRITE); } + if (cmd_buffer->use_vsc_data) { + tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_data2, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } + for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) { tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i], MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); } - for (uint32_t i = 0; i < cmd_buffer->tile_cs.bo_count; i++) { - tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->tile_cs.bos[i], + for (uint32_t i = 0; i < cmd_buffer->draw_epilogue_cs.bo_count; i++) { + tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_epilogue_cs.bos[i], MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); } - tu_cs_end(&cmd_buffer->cs); + for (uint32_t i = 0; i < cmd_buffer->sub_cs.bo_count; i++) { + tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->sub_cs.bos[i], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + } - assert(!cmd_buffer->state.attachments); + tu_cs_end(&cmd_buffer->cs); + tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->draw_epilogue_cs); cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE; @@ -1728,12 +2026,20 @@ cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE; break; case VK_PIPELINE_BIND_POINT_COMPUTE: - tu_finishme("binding compute pipeline"); + cmd->state.compute_pipeline = pipeline; + cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE; break; default: unreachable("unrecognized pipeline bind point"); break; } + + tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) { + tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + } } void @@ -1888,6 +2194,34 @@ uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + VkResult result; + + assert(commandBufferCount > 0); + + for (uint32_t i = 0; i < commandBufferCount; i++) { + TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]); + + result = tu_bo_list_merge(&cmd->bo_list, &secondary->bo_list); + if (result != VK_SUCCESS) { + cmd->record_result = result; + break; + } + + result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs); + if (result != VK_SUCCESS) { + cmd->record_result = result; + break; + } + + result = tu_cs_add_entries(&cmd->draw_epilogue_cs, + &secondary->draw_epilogue_cs); + if (result != VK_SUCCESS) { + cmd->record_result = result; + break; + } + } + cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */ } VkResult @@ -1986,32 +2320,33 @@ const VkRenderPassBeginInfo *pRenderPassBegin, VkSubpassContents contents) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass); - TU_FROM_HANDLE(tu_framebuffer, framebuffer, pRenderPassBegin->framebuffer); - VkResult result; + TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer); - cmd_buffer->state.pass = pass; - cmd_buffer->state.subpass = pass->subpasses; - cmd_buffer->state.framebuffer = framebuffer; + cmd->state.pass = pass; + cmd->state.subpass = pass->subpasses; + cmd->state.framebuffer = fb; - result = tu_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin); - if (result != VK_SUCCESS) - return; + tu_cmd_update_tiling_config(cmd, &pRenderPassBegin->renderArea); + tu_cmd_prepare_tile_load_ib(cmd, pRenderPassBegin); + tu_cmd_prepare_tile_store_ib(cmd); - tu_cmd_update_tiling_config(cmd_buffer, &pRenderPassBegin->renderArea); - tu_cmd_prepare_tile_load_ib(cmd_buffer); - tu_cmd_prepare_tile_store_ib(cmd_buffer); + /* note: use_hw_binning only checks tiling config */ + if (use_hw_binning(cmd)) + cmd->use_vsc_data = true; - /* draw_cs should contain entries only for this render pass */ - assert(!cmd_buffer->draw_cs.entry_count); - tu_cs_begin(&cmd_buffer->draw_cs); + for (uint32_t i = 0; i < fb->attachment_count; ++i) { + const struct tu_image_view *iview = fb->attachments[i].attachment; + tu_bo_list_add(&cmd->bo_list, iview->image->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } } void -tu_CmdBeginRenderPass2KHR(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBeginInfo, - const VkSubpassBeginInfoKHR *pSubpassBeginInfo) +tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo *pRenderPassBeginInfo, + const VkSubpassBeginInfoKHR *pSubpassBeginInfo) { tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo, pSubpassBeginInfo->contents); @@ -2021,20 +2356,65 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + const struct tu_render_pass *pass = cmd->state.pass; + struct tu_cs *cs = &cmd->draw_cs; - tu_cmd_render_tiles(cmd); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 1024); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - cmd->state.subpass++; + const struct tu_subpass *subpass = cmd->state.subpass++; + /* TODO: + * if msaa samples change between subpasses, + * attachment store is broken for some attachments + */ + if (subpass->resolve_attachments) { + tu6_emit_blit_scissor(cmd, cs, true); + for (unsigned i = 0; i < subpass->color_count; i++) { + uint32_t a = subpass->resolve_attachments[i].attachment; + if (a != VK_ATTACHMENT_UNUSED) { + tu6_emit_store_attachment(cmd, cs, a, + subpass->color_attachments[i].attachment); + } + } + } - tu_cmd_update_tiling_config(cmd, NULL); - tu_cmd_prepare_tile_load_ib(cmd); - tu_cmd_prepare_tile_store_ib(cmd); + /* invalidate because reading input attachments will cache GMEM and + * the cache isn''t updated when GMEM is written + * TODO: is there a no-cache bit for textures? + */ + if (cmd->state.subpass->input_count) + tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false); + + /* emit mrt/zs/msaa state for the subpass that is starting */ + tu6_emit_zs(cmd, cmd->state.subpass, cs); + tu6_emit_mrt(cmd, cmd->state.subpass, cs); + tu6_emit_msaa(cmd, cmd->state.subpass, cs); + + /* TODO: + * since we don't know how to do GMEM->GMEM resolve, + * resolve attachments are resolved to memory then loaded to GMEM again if needed + */ + if (subpass->resolve_attachments) { + for (unsigned i = 0; i < subpass->color_count; i++) { + uint32_t a = subpass->resolve_attachments[i].attachment; + const struct tu_image_view *iview = + cmd->state.framebuffer->attachments[a].attachment; + if (a != VK_ATTACHMENT_UNUSED && pass->attachments[a].gmem_offset >= 0) { + tu_finishme("missing GMEM->GMEM resolve, performance will suffer\n"); + tu6_emit_blit_info(cmd, cs, iview, pass->attachments[a].gmem_offset, false); + tu6_emit_blit(cmd, cs); + } + } + } } void -tu_CmdNextSubpass2KHR(VkCommandBuffer commandBuffer, - const VkSubpassBeginInfoKHR *pSubpassBeginInfo, - const VkSubpassEndInfoKHR *pSubpassEndInfo) +tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, + const VkSubpassBeginInfoKHR *pSubpassBeginInfo, + const VkSubpassEndInfoKHR *pSubpassEndInfo) { tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents); } @@ -2085,6 +2465,9 @@ uint64_t count_buffer_offset; }; +#define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) +#define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) + enum tu_draw_state_group_id { TU_DRAW_STATE_PROGRAM, @@ -2095,6 +2478,12 @@ TU_DRAW_STATE_RAST, TU_DRAW_STATE_DS, TU_DRAW_STATE_BLEND, + TU_DRAW_STATE_VS_CONST, + TU_DRAW_STATE_FS_CONST, + TU_DRAW_STATE_VS_TEX, + TU_DRAW_STATE_FS_TEX, + TU_DRAW_STATE_FS_IBO, + TU_DRAW_STATE_VS_PARAMS, TU_DRAW_STATE_COUNT, }; @@ -2103,10 +2492,678 @@ { enum tu_draw_state_group_id id; uint32_t enable_mask; - const struct tu_cs_entry *ib; + struct tu_cs_entry ib; }; +const static struct tu_sampler* +sampler_ptr(struct tu_descriptor_state *descriptors_state, + const struct tu_descriptor_map *map, unsigned i, + unsigned array_index) +{ + assert(descriptors_state->valid & (1 << map->set[i])); + + struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; + assert(map->binding[i] < set->layout->binding_count); + + const struct tu_descriptor_set_binding_layout *layout = + &set->layout->binding[map->binding[i]]; + + if (layout->immutable_samplers_offset) { + const struct tu_sampler *immutable_samplers = + tu_immutable_samplers(set->layout, layout); + + return &immutable_samplers[array_index]; + } + + switch (layout->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4]; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + return (struct tu_sampler*) &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS + + array_index * + (A6XX_TEX_CONST_DWORDS + + sizeof(struct tu_sampler) / 4)]; + default: + unreachable("unimplemented descriptor type"); + break; + } +} + static void +write_tex_const(struct tu_cmd_buffer *cmd, + uint32_t *dst, + struct tu_descriptor_state *descriptors_state, + const struct tu_descriptor_map *map, + unsigned i, unsigned array_index) +{ + assert(descriptors_state->valid & (1 << map->set[i])); + + struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; + assert(map->binding[i] < set->layout->binding_count); + + const struct tu_descriptor_set_binding_layout *layout = + &set->layout->binding[map->binding[i]]; + + switch (layout->type) { + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + memcpy(dst, &set->mapped_ptr[layout->offset / 4 + + array_index * A6XX_TEX_CONST_DWORDS], + A6XX_TEX_CONST_DWORDS * 4); + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + memcpy(dst, &set->mapped_ptr[layout->offset / 4 + + array_index * + (A6XX_TEX_CONST_DWORDS + + sizeof(struct tu_sampler) / 4)], + A6XX_TEX_CONST_DWORDS * 4); + break; + default: + unreachable("unimplemented descriptor type"); + break; + } + + if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) { + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + uint32_t a = cmd->state.subpass->input_attachments[map->value[i] + + array_index].attachment; + const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; + + assert(att->gmem_offset >= 0); + + dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); + dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); + dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK); + dst[2] |= + A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | + A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp); + dst[3] = 0; + dst[4] = 0x100000 + att->gmem_offset; + dst[5] = A6XX_TEX_CONST_5_DEPTH(1); + for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + dst[i] = 0; + + if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + tu_finishme("patch input attachment pitch for secondary cmd buffer"); + } +} + +static void +write_image_ibo(struct tu_cmd_buffer *cmd, + uint32_t *dst, + struct tu_descriptor_state *descriptors_state, + const struct tu_descriptor_map *map, + unsigned i, unsigned array_index) +{ + assert(descriptors_state->valid & (1 << map->set[i])); + + struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; + assert(map->binding[i] < set->layout->binding_count); + + const struct tu_descriptor_set_binding_layout *layout = + &set->layout->binding[map->binding[i]]; + + assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); + + memcpy(dst, &set->mapped_ptr[layout->offset / 4 + + (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS], + A6XX_TEX_CONST_DWORDS * 4); +} + +static uint64_t +buffer_ptr(struct tu_descriptor_state *descriptors_state, + const struct tu_descriptor_map *map, + unsigned i, unsigned array_index) +{ + assert(descriptors_state->valid & (1 << map->set[i])); + + struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]]; + assert(map->binding[i] < set->layout->binding_count); + + const struct tu_descriptor_set_binding_layout *layout = + &set->layout->binding[map->binding[i]]; + + switch (layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset + + array_index]; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 | + set->mapped_ptr[layout->offset / 4 + array_index * 2]; + default: + unreachable("unimplemented descriptor type"); + break; + } +} + +static inline uint32_t +tu6_stage2opcode(gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_GEOMETRY: + return CP_LOAD_STATE6_GEOM; + case MESA_SHADER_FRAGMENT: + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + return CP_LOAD_STATE6_FRAG; + default: + unreachable("bad shader type"); + } +} + +static inline enum a6xx_state_block +tu6_stage2shadersb(gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_VERTEX: + return SB6_VS_SHADER; + case MESA_SHADER_FRAGMENT: + return SB6_FS_SHADER; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + return SB6_CS_SHADER; + default: + unreachable("bad shader type"); + return ~0; + } +} + +static void +tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline, + struct tu_descriptor_state *descriptors_state, + gl_shader_stage type, + uint32_t *push_constants) +{ + const struct tu_program_descriptor_linkage *link = + &pipeline->program.link[type]; + const struct ir3_ubo_analysis_state *state = &link->ubo_state; + + for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + if (state->range[i].start < state->range[i].end) { + uint32_t size = state->range[i].end - state->range[i].start; + uint32_t offset = state->range[i].start; + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * link->constlen) - state->range[i].offset); + + if (size == 0) + continue; + + /* things should be aligned to vec4: */ + debug_assert((state->range[i].offset % 16) == 0); + debug_assert((size % 16) == 0); + debug_assert((offset % 16) == 0); + + if (i == 0) { + /* push constants */ + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (size / 4)); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + for (unsigned i = 0; i < size / 4; i++) + tu_cs_emit(cs, push_constants[i + offset / 4]); + continue; + } + + /* Look through the UBO map to find our UBO index, and get the VA for + * that UBO. + */ + uint64_t va = 0; + uint32_t ubo_idx = i - 1; + uint32_t ubo_map_base = 0; + for (int j = 0; j < link->ubo_map.num; j++) { + if (ubo_idx >= ubo_map_base && + ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) { + va = buffer_ptr(descriptors_state, &link->ubo_map, j, + ubo_idx - ubo_map_base); + break; + } + ubo_map_base += link->ubo_map.array_size[j]; + } + assert(va); + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); + tu_cs_emit_qw(cs, va + offset); + } + } +} + +static void +tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline, + struct tu_descriptor_state *descriptors_state, + gl_shader_stage type) +{ + const struct tu_program_descriptor_linkage *link = + &pipeline->program.link[type]; + + uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos); + uint32_t anum = align(num, 2); + + if (!num) + return; + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum)); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(anum/2)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + + unsigned emitted = 0; + for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) { + for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) { + tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j)); + emitted++; + } + } + + for (; emitted < anum; emitted++) { + tu_cs_emit(cs, 0xffffffff); + tu_cs_emit(cs, 0xffffffff); + } +} + +static struct tu_cs_entry +tu6_emit_consts(struct tu_cmd_buffer *cmd, + const struct tu_pipeline *pipeline, + struct tu_descriptor_state *descriptors_state, + gl_shader_stage type) +{ + struct tu_cs cs; + tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, 512, &cs); /* TODO: maximum size? */ + + tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants); + tu6_emit_ubos(&cs, pipeline, descriptors_state, type); + + return tu_cs_end_sub_stream(&cmd->sub_cs, &cs); +} + +static VkResult +tu6_emit_vs_params(struct tu_cmd_buffer *cmd, + const struct tu_draw_info *draw, + struct tu_cs_entry *entry) +{ + /* TODO: fill out more than just base instance */ + const struct tu_program_descriptor_linkage *link = + &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX]; + const struct ir3_const_state *const_state = &link->const_state; + struct tu_cs cs; + + if (const_state->offsets.driver_param >= link->constlen) { + *entry = (struct tu_cs_entry) {}; + return VK_SUCCESS; + } + + VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->sub_cs, 8, &cs); + if (result != VK_SUCCESS) + return result; + + tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(&cs, 0); + tu_cs_emit(&cs, 0); + + STATIC_ASSERT(IR3_DP_INSTID_BASE == 2); + + tu_cs_emit(&cs, 0); + tu_cs_emit(&cs, 0); + tu_cs_emit(&cs, draw->first_instance); + tu_cs_emit(&cs, 0); + + *entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs); + return VK_SUCCESS; +} + +static VkResult +tu6_emit_textures(struct tu_cmd_buffer *cmd, + const struct tu_pipeline *pipeline, + struct tu_descriptor_state *descriptors_state, + gl_shader_stage type, + struct tu_cs_entry *entry, + bool *needs_border) +{ + struct tu_device *device = cmd->device; + struct tu_cs *draw_state = &cmd->sub_cs; + const struct tu_program_descriptor_linkage *link = + &pipeline->program.link[type]; + VkResult result; + + if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) { + *entry = (struct tu_cs_entry) {}; + return VK_SUCCESS; + } + + /* allocate and fill texture state */ + struct ts_cs_memory tex_const; + result = tu_cs_alloc(device, draw_state, link->texture_map.num_desc, + A6XX_TEX_CONST_DWORDS, &tex_const); + if (result != VK_SUCCESS) + return result; + + int tex_index = 0; + for (unsigned i = 0; i < link->texture_map.num; i++) { + for (int j = 0; j < link->texture_map.array_size[i]; j++) { + write_tex_const(cmd, + &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++], + descriptors_state, &link->texture_map, i, j); + } + } + + /* allocate and fill sampler state */ + struct ts_cs_memory tex_samp = { 0 }; + if (link->sampler_map.num_desc) { + result = tu_cs_alloc(device, draw_state, link->sampler_map.num_desc, + A6XX_TEX_SAMP_DWORDS, &tex_samp); + if (result != VK_SUCCESS) + return result; + + int sampler_index = 0; + for (unsigned i = 0; i < link->sampler_map.num; i++) { + for (int j = 0; j < link->sampler_map.array_size[i]; j++) { + const struct tu_sampler *sampler = sampler_ptr(descriptors_state, + &link->sampler_map, + i, j); + memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++], + sampler->state, sizeof(sampler->state)); + *needs_border |= sampler->needs_border; + } + } + } + + unsigned tex_samp_reg, tex_const_reg, tex_count_reg; + enum a6xx_state_block sb; + + switch (type) { + case MESA_SHADER_VERTEX: + sb = SB6_VS_TEX; + tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT; + break; + case MESA_SHADER_FRAGMENT: + sb = SB6_FS_TEX; + tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT; + break; + case MESA_SHADER_COMPUTE: + sb = SB6_CS_TEX; + tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT; + break; + default: + unreachable("bad state block"); + } + + struct tu_cs cs; + result = tu_cs_begin_sub_stream(device, draw_state, 16, &cs); + if (result != VK_SUCCESS) + return result; + + if (link->sampler_map.num_desc) { + /* output sampler state: */ + tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc)); + tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */ + + tu_cs_emit_pkt4(&cs, tex_samp_reg, 2); + tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */ + } + + /* emit texture state: */ + tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc)); + tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */ + + tu_cs_emit_pkt4(&cs, tex_const_reg, 2); + tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */ + + tu_cs_emit_pkt4(&cs, tex_count_reg, 1); + tu_cs_emit(&cs, link->texture_map.num_desc); + + *entry = tu_cs_end_sub_stream(draw_state, &cs); + return VK_SUCCESS; +} + +static VkResult +tu6_emit_ibo(struct tu_cmd_buffer *cmd, + const struct tu_pipeline *pipeline, + struct tu_descriptor_state *descriptors_state, + gl_shader_stage type, + struct tu_cs_entry *entry) +{ + struct tu_device *device = cmd->device; + struct tu_cs *draw_state = &cmd->sub_cs; + const struct tu_program_descriptor_linkage *link = + &pipeline->program.link[type]; + VkResult result; + + unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc; + + if (num_desc == 0) { + *entry = (struct tu_cs_entry) {}; + return VK_SUCCESS; + } + + struct ts_cs_memory ibo_const; + result = tu_cs_alloc(device, draw_state, num_desc, + A6XX_TEX_CONST_DWORDS, &ibo_const); + if (result != VK_SUCCESS) + return result; + + int ssbo_index = 0; + for (unsigned i = 0; i < link->ssbo_map.num; i++) { + for (int j = 0; j < link->ssbo_map.array_size[i]; j++) { + uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index]; + + uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j); + /* We don't expose robustBufferAccess, so leave the size unlimited. */ + uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4; + + dst[0] = A6XX_IBO_0_FMT(TFMT6_32_UINT); + dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) | + A6XX_IBO_1_HEIGHT(sz >> 15); + dst[2] = A6XX_IBO_2_UNK4 | + A6XX_IBO_2_UNK31 | + A6XX_IBO_2_TYPE(A6XX_TEX_1D); + dst[3] = 0; + dst[4] = va; + dst[5] = va >> 32; + for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++) + dst[i] = 0; + + ssbo_index++; + } + } + + for (unsigned i = 0; i < link->image_map.num; i++) { + for (int j = 0; j < link->image_map.array_size[i]; j++) { + uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index]; + + write_image_ibo(cmd, dst, + descriptors_state, &link->image_map, i, j); + + ssbo_index++; + } + } + + assert(ssbo_index == num_desc); + + struct tu_cs cs; + result = tu_cs_begin_sub_stream(device, draw_state, 7, &cs); + if (result != VK_SUCCESS) + return result; + + uint32_t opcode, ibo_addr_reg; + enum a6xx_state_block sb; + enum a6xx_state_type st; + + switch (type) { + case MESA_SHADER_FRAGMENT: + opcode = CP_LOAD_STATE6; + st = ST6_SHADER; + sb = SB6_IBO; + ibo_addr_reg = REG_A6XX_SP_IBO_LO; + break; + case MESA_SHADER_COMPUTE: + opcode = CP_LOAD_STATE6_FRAG; + st = ST6_IBO; + sb = SB6_CS_SHADER; + ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO; + break; + default: + unreachable("unsupported stage for ibos"); + } + + /* emit texture state: */ + tu_cs_emit_pkt7(&cs, opcode, 3); + tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(st) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(num_desc)); + tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */ + + tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2); + tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */ + + *entry = tu_cs_end_sub_stream(draw_state, &cs); + return VK_SUCCESS; +} + +struct PACKED bcolor_entry { + uint32_t fp32[4]; + uint16_t ui16[4]; + int16_t si16[4]; + uint16_t fp16[4]; + uint16_t rgb565; + uint16_t rgb5a1; + uint16_t rgba4; + uint8_t __pad0[2]; + uint8_t ui8[4]; + int8_t si8[4]; + uint32_t rgb10a2; + uint32_t z24; /* also s8? */ + uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */ + uint8_t __pad1[56]; +} border_color[] = { + [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = {}, + [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = {}, + [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { + .fp32[3] = 0x3f800000, + .ui16[3] = 0xffff, + .si16[3] = 0x7fff, + .fp16[3] = 0x3c00, + .rgb5a1 = 0x8000, + .rgba4 = 0xf000, + .ui8[3] = 0xff, + .si8[3] = 0x7f, + .rgb10a2 = 0xc0000000, + .srgb[3] = 0x3c00, + }, + [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { + .fp32[3] = 1, + .fp16[3] = 1, + }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { + .fp32[0 ... 3] = 0x3f800000, + .ui16[0 ... 3] = 0xffff, + .si16[0 ... 3] = 0x7fff, + .fp16[0 ... 3] = 0x3c00, + .rgb565 = 0xffff, + .rgb5a1 = 0xffff, + .rgba4 = 0xffff, + .ui8[0 ... 3] = 0xff, + .si8[0 ... 3] = 0x7f, + .rgb10a2 = 0xffffffff, + .z24 = 0xffffff, + .srgb[0 ... 3] = 0x3c00, + }, + [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { + .fp32[0 ... 3] = 1, + .fp16[0 ... 3] = 1, + }, +}; + +static VkResult +tu6_emit_border_color(struct tu_cmd_buffer *cmd, + struct tu_cs *cs) +{ + STATIC_ASSERT(sizeof(struct bcolor_entry) == 128); + + const struct tu_pipeline *pipeline = cmd->state.pipeline; + struct tu_descriptor_state *descriptors_state = + &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS]; + const struct tu_descriptor_map *vs_sampler = + &pipeline->program.link[MESA_SHADER_VERTEX].sampler_map; + const struct tu_descriptor_map *fs_sampler = + &pipeline->program.link[MESA_SHADER_FRAGMENT].sampler_map; + struct ts_cs_memory ptr; + + VkResult result = tu_cs_alloc(cmd->device, &cmd->sub_cs, + vs_sampler->num_desc + fs_sampler->num_desc, + 128 / 4, + &ptr); + if (result != VK_SUCCESS) + return result; + + for (unsigned i = 0; i < vs_sampler->num; i++) { + for (unsigned j = 0; j < vs_sampler->array_size[i]; j++) { + const struct tu_sampler *sampler = sampler_ptr(descriptors_state, + vs_sampler, i, j); + memcpy(ptr.map, &border_color[sampler->border], 128); + ptr.map += 128 / 4; + } + } + + for (unsigned i = 0; i < fs_sampler->num; i++) { + for (unsigned j = 0; j < fs_sampler->array_size[i]; j++) { + const struct tu_sampler *sampler = sampler_ptr(descriptors_state, + fs_sampler, i, j); + memcpy(ptr.map, &border_color[sampler->border], 128); + ptr.map += 128 / 4; + } + } + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2); + tu_cs_emit_qw(cs, ptr.iova); + return VK_SUCCESS; +} + +static VkResult tu6_bind_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_draw_info *draw) @@ -2116,24 +3173,22 @@ struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT]; uint32_t draw_state_group_count = 0; + struct tu_descriptor_state *descriptors_state = + &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS]; + VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); - if (result != VK_SUCCESS) { - cmd->record_result = result; - return; - } + if (result != VK_SUCCESS) + return result; /* TODO lrz */ - uint32_t pc_primitive_cntl = 0; - if (pipeline->ia.primitive_restart && draw->indexed) - pc_primitive_cntl |= A6XX_PC_PRIMITIVE_CNTL_0_PRIMITIVE_RESTART; - tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0); tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0); tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0); - tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_0, 1); - tu_cs_emit(cs, pc_primitive_cntl); + tu_cs_emit_regs(cs, + A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart = + pipeline->ia.primitive_restart && draw->indexed)); if (cmd->state.dirty & (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) && @@ -2172,77 +3227,149 @@ const VkDeviceSize size = offset < buf->bo->size ? buf->bo->size - offset : 0; - tu_cs_emit_pkt4(cs, REG_A6XX_VFD_FETCH(i), 4); - tu_cs_emit_qw(cs, buf->bo->iova + offset); - tu_cs_emit(cs, size); - tu_cs_emit(cs, stride); + tu_cs_emit_regs(cs, + A6XX_VFD_FETCH_BASE(i, .bo = buf->bo, .bo_offset = offset), + A6XX_VFD_FETCH_SIZE(i, size), + A6XX_VFD_FETCH_STRIDE(i, stride)); } } - /* TODO shader consts */ - if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) { draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_PROGRAM, - .enable_mask = 0x6, - .ib = &pipeline->program.state_ib, + .enable_mask = ENABLE_DRAW, + .ib = pipeline->program.state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_PROGRAM_BINNING, - .enable_mask = 0x1, - .ib = &pipeline->program.binning_state_ib, + .enable_mask = CP_SET_DRAW_STATE__0_BINNING, + .ib = pipeline->program.binning_state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_VI, - .enable_mask = 0x6, - .ib = &pipeline->vi.state_ib, + .enable_mask = ENABLE_DRAW, + .ib = pipeline->vi.state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_VI_BINNING, - .enable_mask = 0x1, - .ib = &pipeline->vi.binning_state_ib, + .enable_mask = CP_SET_DRAW_STATE__0_BINNING, + .ib = pipeline->vi.binning_state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_VP, - .enable_mask = 0x7, - .ib = &pipeline->vp.state_ib, + .enable_mask = ENABLE_ALL, + .ib = pipeline->vp.state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_RAST, - .enable_mask = 0x7, - .ib = &pipeline->rast.state_ib, + .enable_mask = ENABLE_ALL, + .ib = pipeline->rast.state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_DS, - .enable_mask = 0x7, - .ib = &pipeline->ds.state_ib, + .enable_mask = ENABLE_ALL, + .ib = pipeline->ds.state_ib, }; draw_state_groups[draw_state_group_count++] = (struct tu_draw_state_group) { .id = TU_DRAW_STATE_BLEND, - .enable_mask = 0x7, - .ib = &pipeline->blend.state_ib, + .enable_mask = ENABLE_ALL, + .ib = pipeline->blend.state_ib, + }; + } + + if (cmd->state.dirty & + (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_PUSH_CONSTANTS)) { + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VS_CONST, + .enable_mask = ENABLE_ALL, + .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_VERTEX) + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_FS_CONST, + .enable_mask = ENABLE_DRAW, + .ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT) + }; + } + + if (cmd->state.dirty & + (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) { + bool needs_border = false; + struct tu_cs_entry vs_tex, fs_tex, fs_ibo; + + result = tu6_emit_textures(cmd, pipeline, descriptors_state, + MESA_SHADER_VERTEX, &vs_tex, &needs_border); + if (result != VK_SUCCESS) + return result; + + result = tu6_emit_textures(cmd, pipeline, descriptors_state, + MESA_SHADER_FRAGMENT, &fs_tex, &needs_border); + if (result != VK_SUCCESS) + return result; + + result = tu6_emit_ibo(cmd, pipeline, descriptors_state, + MESA_SHADER_FRAGMENT, &fs_ibo); + if (result != VK_SUCCESS) + return result; + + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VS_TEX, + .enable_mask = ENABLE_ALL, + .ib = vs_tex, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_FS_TEX, + .enable_mask = ENABLE_DRAW, + .ib = fs_tex, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_FS_IBO, + .enable_mask = ENABLE_DRAW, + .ib = fs_ibo, }; + + if (needs_border) { + result = tu6_emit_border_color(cmd, cs); + if (result != VK_SUCCESS) + return result; + } } + struct tu_cs_entry vs_params; + result = tu6_emit_vs_params(cmd, draw, &vs_params); + if (result != VK_SUCCESS) + return result; + + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VS_PARAMS, + .enable_mask = ENABLE_ALL, + .ib = vs_params, + }; + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count); for (uint32_t i = 0; i < draw_state_group_count; i++) { const struct tu_draw_state_group *group = &draw_state_groups[i]; - + debug_assert((group->enable_mask & ~ENABLE_ALL) == 0); uint32_t cp_set_draw_state = - CP_SET_DRAW_STATE__0_COUNT(group->ib->size / 4) | - CP_SET_DRAW_STATE__0_ENABLE_MASK(group->enable_mask) | + CP_SET_DRAW_STATE__0_COUNT(group->ib.size / 4) | + group->enable_mask | CP_SET_DRAW_STATE__0_GROUP_ID(group->id); uint64_t iova; - if (group->ib->size) { - iova = group->ib->bo->iova + group->ib->offset; + if (group->ib.size) { + iova = group->ib.bo->iova + group->ib.offset; } else { cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE; iova = 0; @@ -2255,14 +3382,6 @@ tu_cs_sanity_check(cs); /* track BOs */ - if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) { - tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo, - MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); - for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) { - tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i], - MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); - } - } if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) { for (uint32_t i = 0; i < MAX_VBS; i++) { const struct tu_buffer *buf = cmd->state.vb.buffers[i]; @@ -2270,8 +3389,23 @@ tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ); } } + if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) { + unsigned i; + for_each_bit(i, descriptors_state->valid) { + struct tu_descriptor_set *set = descriptors_state->sets[i]; + for (unsigned j = 0; j < set->layout->buffer_count; ++j) + if (set->descriptors[j]) { + tu_bo_list_add(&cmd->bo_list, set->descriptors[j], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } + } + } - cmd->state.dirty = 0; + /* Fragment shader state overwrites compute shader state, so flag the + * compute pipeline for re-emit. + */ + cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE; + return VK_SUCCESS; } static void @@ -2282,9 +3416,9 @@ const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype; - tu_cs_emit_pkt4(cs, REG_A6XX_VFD_INDEX_OFFSET, 2); - tu_cs_emit(cs, draw->vertex_offset); - tu_cs_emit(cs, draw->first_instance); + tu_cs_emit_regs(cs, + A6XX_VFD_INDEX_OFFSET(draw->vertex_offset), + A6XX_VFD_INSTANCE_START_OFFSET(draw->first_instance)); /* TODO hw binning */ if (draw->indexed) { @@ -2301,7 +3435,7 @@ CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) | CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000; + CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000; tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); tu_cs_emit(cs, cp_draw_indx); @@ -2314,7 +3448,7 @@ const uint32_t cp_draw_indx = CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000; + CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | 0x2000; tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); tu_cs_emit(cs, cp_draw_indx); @@ -2327,10 +3461,15 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw) { struct tu_cs *cs = &cmd->draw_cs; + VkResult result; - tu6_bind_draw_states(cmd, cs, draw); + result = tu6_bind_draw_states(cmd, cs, draw); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - VkResult result = tu_cs_reserve_space(cmd->device, cs, 32); + result = tu_cs_reserve_space(cmd->device, cs, 32); if (result != VK_SUCCESS) { cmd->record_result = result; return; @@ -2342,6 +3481,7 @@ } /* TODO tu6_emit_marker should pick different regs depending on cs */ + tu6_emit_marker(cmd, cs); tu6_emit_draw_direct(cmd, cs, draw); tu6_emit_marker(cmd, cs); @@ -2454,9 +3594,160 @@ }; static void -tu_dispatch(struct tu_cmd_buffer *cmd_buffer, +tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline, + const struct tu_dispatch_info *info) +{ + gl_shader_stage type = MESA_SHADER_COMPUTE; + const struct tu_program_descriptor_linkage *link = + &pipeline->program.link[type]; + const struct ir3_const_state *const_state = &link->const_state; + uint32_t offset = const_state->offsets.driver_param; + + if (link->constlen <= offset) + return; + + if (!info->indirect) { + uint32_t driver_params[IR3_DP_CS_COUNT] = { + [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0], + [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1], + [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2], + [IR3_DP_LOCAL_GROUP_SIZE_X] = pipeline->compute.local_size[0], + [IR3_DP_LOCAL_GROUP_SIZE_Y] = pipeline->compute.local_size[1], + [IR3_DP_LOCAL_GROUP_SIZE_Z] = pipeline->compute.local_size[2], + }; + + uint32_t num_consts = MIN2(const_state->num_driver_params, + (link->constlen - offset) * 4); + /* push constants */ + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4)); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + uint32_t i; + for (i = 0; i < num_consts; i++) + tu_cs_emit(cs, driver_params[i]); + } else { + tu_finishme("Indirect driver params"); + } +} + +static void +tu_dispatch(struct tu_cmd_buffer *cmd, const struct tu_dispatch_info *info) { + struct tu_cs *cs = &cmd->cs; + struct tu_pipeline *pipeline = cmd->state.compute_pipeline; + struct tu_descriptor_state *descriptors_state = + &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE]; + + VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE) + tu_cs_emit_ib(cs, &pipeline->program.state_ib); + + struct tu_cs_entry ib; + + ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE); + if (ib.size) + tu_cs_emit_ib(cs, &ib); + + tu_emit_compute_driver_params(cs, pipeline, info); + + bool needs_border; + result = tu6_emit_textures(cmd, pipeline, descriptors_state, + MESA_SHADER_COMPUTE, &ib, &needs_border); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + if (ib.size) + tu_cs_emit_ib(cs, &ib); + + if (needs_border) + tu_finishme("compute border color"); + + result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + if (ib.size) + tu_cs_emit_ib(cs, &ib); + + /* track BOs */ + if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) { + unsigned i; + for_each_bit(i, descriptors_state->valid) { + struct tu_descriptor_set *set = descriptors_state->sets[i]; + for (unsigned j = 0; j < set->layout->buffer_count; ++j) + if (set->descriptors[j]) { + tu_bo_list_add(&cmd->bo_list, set->descriptors[j], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } + } + } + + /* Compute shader state overwrites fragment shader state, so we flag the + * graphics pipeline for re-emit. + */ + cmd->state.dirty = TU_CMD_DIRTY_PIPELINE; + + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x8)); + + const uint32_t *local_size = pipeline->compute.local_size; + const uint32_t *num_groups = info->blocks; + tu_cs_emit_regs(cs, + A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3, + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1), + A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]), + A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0), + A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]), + A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0), + A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]), + A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0)); + + tu_cs_emit_regs(cs, + A6XX_HLSQ_CS_KERNEL_GROUP_X(1), + A6XX_HLSQ_CS_KERNEL_GROUP_Y(1), + A6XX_HLSQ_CS_KERNEL_GROUP_Z(1)); + + if (info->indirect) { + uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset; + + tu_bo_list_add(&cmd->bo_list, info->indirect->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + + tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit_qw(cs, iova); + tu_cs_emit(cs, + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + } else { + tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0])); + tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1])); + tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2])); + } + + tu_cs_emit_wfi(cs); + + tu6_emit_cache_flush(cmd, cs); } void @@ -2511,14 +3802,16 @@ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->draw_epilogue_cs); tu_cmd_render_tiles(cmd_buffer); - /* discard draw_cs entries now that the tiles are rendered */ + /* discard draw_cs and draw_epilogue_cs entries now that the tiles are + rendered */ tu_cs_discard_entries(&cmd_buffer->draw_cs); - - vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); - cmd_buffer->state.attachments = NULL; + tu_cs_begin(&cmd_buffer->draw_cs); + tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs); + tu_cs_begin(&cmd_buffer->draw_epilogue_cs); cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass = NULL; @@ -2526,8 +3819,8 @@ } void -tu_CmdEndRenderPass2KHR(VkCommandBuffer commandBuffer, - const VkSubpassEndInfoKHR *pSubpassEndInfo) +tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, + const VkSubpassEndInfoKHR *pSubpassEndInfo) { tu_CmdEndRenderPass(commandBuffer); } @@ -2576,11 +3869,23 @@ } static void -write_event(struct tu_cmd_buffer *cmd_buffer, - struct tu_event *event, - VkPipelineStageFlags stageMask, - unsigned value) +write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, unsigned value) { + struct tu_cs *cs = &cmd->cs; + + VkResult result = tu_cs_reserve_space(cmd->device, cs, 4); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_WRITE); + + /* TODO: any flush required before/after ? */ + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */ + tu_cs_emit(cs, value); } void @@ -2588,10 +3893,10 @@ VkEvent _event, VkPipelineStageFlags stageMask) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_event, event, _event); - write_event(cmd_buffer, event, stageMask, 1); + write_event(cmd, event, 1); } void @@ -2599,10 +3904,10 @@ VkEvent _event, VkPipelineStageFlags stageMask) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_event, event, _event); - write_event(cmd_buffer, event, stageMask, 0); + write_event(cmd, event, 0); } void @@ -2618,16 +3923,30 @@ uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier *pImageMemoryBarriers) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - struct tu_barrier_info info; + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + struct tu_cs *cs = &cmd->cs; - info.eventCount = eventCount; - info.pEvents = pEvents; - info.srcStageMask = 0; + VkResult result = tu_cs_reserve_space(cmd->device, cs, eventCount * 7); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, - bufferMemoryBarrierCount, pBufferMemoryBarriers, - imageMemoryBarrierCount, pImageMemoryBarriers, &info); + /* TODO: any flush required before/after? (CP_WAIT_FOR_ME?) */ + + for (uint32_t i = 0; i < eventCount; i++) { + TU_FROM_HANDLE(tu_event, event, pEvents[i]); + + tu_bo_list_add(&cmd->bo_list, &event->bo, MSM_SUBMIT_BO_READ); + + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */ + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20)); + } } void diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_cs.c mesa-20.0.8/src/freedreno/vulkan/tu_cs.c --- mesa-19.2.8/src/freedreno/vulkan/tu_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_cs.c 2020-06-12 01:21:16.000000000 +0000 @@ -159,7 +159,7 @@ * Reserve an IB entry. */ static VkResult -tu_cs_reserve_entry(struct tu_device *dev, struct tu_cs *cs) +tu_cs_reserve_entry(struct tu_cs *cs) { /* entries are only for TU_CS_MODE_GROW */ assert(cs->mode == TU_CS_MODE_GROW); @@ -210,6 +210,30 @@ } /** + * same behavior as tu_cs_emit_call but without the indirect + */ +VkResult +tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target) +{ + VkResult result; + + assert(cs->mode == TU_CS_MODE_GROW); + assert(target->mode == TU_CS_MODE_GROW); + + if (!tu_cs_is_empty(cs)) + tu_cs_add_entry(cs); + + for (unsigned i = 0; i < target->entry_count; i++) { + result = tu_cs_reserve_entry(cs); + if (result != VK_SUCCESS) + return result; + cs->entries[cs->entry_count++] = target->entries[i]; + } + + return VK_SUCCESS; +} + +/** * Begin (or continue) command packet emission. This does nothing but sanity * checks currently. \a cs must not be in TU_CS_MODE_SUB_STREAM mode. */ @@ -263,6 +287,41 @@ } /** + * Allocate count*size dwords, aligned to size dwords. + * \a cs must be in TU_CS_MODE_SUB_STREAM mode. + * + */ +VkResult +tu_cs_alloc(struct tu_device *dev, + struct tu_cs *cs, + uint32_t count, + uint32_t size, + struct ts_cs_memory *memory) +{ + assert(cs->mode == TU_CS_MODE_SUB_STREAM); + assert(size && size <= 1024); + + if (!count) + return VK_SUCCESS; + + /* TODO: smarter way to deal with alignment? */ + + VkResult result = tu_cs_reserve_space(dev, cs, count * size + (size-1)); + if (result != VK_SUCCESS) + return result; + + struct tu_bo *bo = cs->bos[cs->bo_count - 1]; + size_t offset = align(tu_cs_get_offset(cs), size); + + memory->map = bo->map + offset * sizeof(uint32_t); + memory->iova = bo->iova + offset * sizeof(uint32_t); + + cs->start = cs->cur = (uint32_t*) bo->map + offset + count * size; + + return VK_SUCCESS; +} + +/** * End command packet emission to a sub-stream. \a sub_cs becomes invalid * after this call. * @@ -332,7 +391,7 @@ if (cs->mode == TU_CS_MODE_GROW) { /* reserve an entry for the next call to this function or tu_cs_end */ - return tu_cs_reserve_entry(dev, cs); + return tu_cs_reserve_entry(cs); } return VK_SUCCESS; diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_cs.h mesa-20.0.8/src/freedreno/vulkan/tu_cs.h --- mesa-19.2.8/src/freedreno/vulkan/tu_cs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_cs.h 2020-06-12 01:21:16.000000000 +0000 @@ -48,6 +48,13 @@ uint32_t size, struct tu_cs *sub_cs); +VkResult +tu_cs_alloc(struct tu_device *dev, + struct tu_cs *cs, + uint32_t count, + uint32_t size, + struct ts_cs_memory *memory); + struct tu_cs_entry tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs); @@ -59,6 +66,9 @@ void tu_cs_reset(struct tu_device *dev, struct tu_cs *cs); +VkResult +tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target); + /** * Discard all entries. This allows \a cs to be reused while keeping the * existing BOs and command packets intact. @@ -197,4 +207,82 @@ tu_cs_emit_ib(cs, target->entries + i); } +#define fd_reg_pair tu_reg_value +#define __bo_type struct tu_bo * + +#include "a6xx.xml.h" +#include "a6xx-pack.xml.h" + +#define __assert_eq(a, b) \ + do { \ + if ((a) != (b)) { \ + fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \ + assert((a) == (b)); \ + } \ + } while (0) + +#define __ONE_REG(i, regs) \ + do { \ + if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) { \ + __assert_eq(regs[0].reg + i, regs[i].reg); \ + if (regs[i].bo) { \ + uint64_t v = regs[i].bo->iova + regs[i].bo_offset; \ + v >>= regs[i].bo_shift; \ + v |= regs[i].value; \ + \ + *p++ = v; \ + *p++ = v >> 32; \ + } else { \ + *p++ = regs[i].value; \ + if (regs[i].is_address) \ + *p++ = regs[i].value >> 32; \ + } \ + } \ + } while (0) + +/* Emits a sequence of register writes in order using a pkt4. This will check + * (at runtime on a !NDEBUG build) that the registers were actually set up in + * order in the code. + * + * Note that references to buffers aren't automatically added to the CS, + * unlike in freedreno. We are clever in various places to avoid duplicating + * the reference add work. + * + * Also, 64-bit address registers don't have a way (currently) to set a 64-bit + * address without having a reference to a BO, since the .dword field in the + * register's struct is only 32-bit wide. We should fix this in the pack + * codegen later. + */ +#define tu_cs_emit_regs(cs, ...) do { \ + const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count > 0); \ + STATIC_ASSERT(count <= 16); \ + \ + uint32_t *p = cs->cur; \ + *p++ = CP_TYPE4_PKT | count | \ + (tu_odd_parity_bit(count) << 7) | \ + ((regs[0].reg & 0x3ffff) << 8) | \ + ((tu_odd_parity_bit(regs[0].reg) << 27)); \ + \ + __ONE_REG( 0, regs); \ + __ONE_REG( 1, regs); \ + __ONE_REG( 2, regs); \ + __ONE_REG( 3, regs); \ + __ONE_REG( 4, regs); \ + __ONE_REG( 5, regs); \ + __ONE_REG( 6, regs); \ + __ONE_REG( 7, regs); \ + __ONE_REG( 8, regs); \ + __ONE_REG( 9, regs); \ + __ONE_REG(10, regs); \ + __ONE_REG(11, regs); \ + __ONE_REG(12, regs); \ + __ONE_REG(13, regs); \ + __ONE_REG(14, regs); \ + __ONE_REG(15, regs); \ + cs->cur = p; \ + } while (0) + #endif /* TU_CS_H */ diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_descriptor_set.c mesa-20.0.8/src/freedreno/vulkan/tu_descriptor_set.c --- mesa-19.2.8/src/freedreno/vulkan/tu_descriptor_set.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_descriptor_set.c 2020-06-12 01:21:16.000000000 +0000 @@ -21,6 +21,21 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ + +/** + * @file + * + * The texture and sampler descriptors are laid out in a single global space + * across all shader stages, for both simplicity of implementation and because + * that seems to be how things have to be structured for border color + * handling. + * + * Each shader stage will declare its texture/sampler count based on the last + * descriptor set it uses. At draw emit time (though it really should be + * CmdBind time), we upload the descriptor sets used by each shader stage to + * their stage. + */ + #include "tu_private.h" #include @@ -61,6 +76,36 @@ return sorted_bindings; } +static uint32_t +descriptor_size(enum VkDescriptorType type) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return 0; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + /* 64bit pointer */ + return 8; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + return A6XX_TEX_CONST_DWORDS * 4; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + /* We may need the IBO or the TEX representation, or both. */ + return A6XX_TEX_CONST_DWORDS * 4 * 2; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* texture const + tu_sampler struct (includes border color) */ + return A6XX_TEX_CONST_DWORDS * 4 + sizeof(struct tu_sampler); + case VK_DESCRIPTOR_TYPE_SAMPLER: + return sizeof(struct tu_sampler); + default: + unreachable("unknown descriptor type\n"); + return 0; + } +} + VkResult tu_CreateDescriptorSetLayout( VkDevice _device, @@ -82,15 +127,16 @@ uint32_t immutable_sampler_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding); - if (pCreateInfo->pBindings[j].pImmutableSamplers) + if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) { immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; + } } - uint32_t samplers_offset = - sizeof(struct tu_descriptor_set_layout) + + uint32_t samplers_offset = sizeof(struct tu_descriptor_set_layout) + (max_binding + 1) * sizeof(set_layout->binding[0]); - size_t size = - samplers_offset + immutable_sampler_count * 4 * sizeof(uint32_t); + uint32_t size = samplers_offset + immutable_sampler_count * sizeof(struct tu_sampler); set_layout = vk_alloc2(&device->alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -100,8 +146,7 @@ set_layout->flags = pCreateInfo->flags; /* We just allocate all the samplers at the end of the struct */ - uint32_t *samplers = (uint32_t *) &set_layout->binding[max_binding + 1]; - (void) samplers; /* TODO: Use me */ + struct tu_sampler *samplers = (void*) &set_layout->binding[max_binding + 1]; VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings( pCreateInfo->pBindings, pCreateInfo->bindingCount); @@ -125,48 +170,19 @@ for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { const VkDescriptorSetLayoutBinding *binding = bindings + j; uint32_t b = binding->binding; - uint32_t alignment; - unsigned binding_buffer_count = 0; + uint32_t alignment = 4; + unsigned binding_buffer_count = 1; switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + binding_buffer_count = 0; + break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - assert(!(pCreateInfo->flags & - VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + assert(!(pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); set_layout->binding[b].dynamic_offset_count = 1; - set_layout->dynamic_shader_stages |= binding->stageFlags; - set_layout->binding[b].size = 0; - binding_buffer_count = 1; - alignment = 1; - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - set_layout->binding[b].size = 16; - binding_buffer_count = 1; - alignment = 16; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - /* main descriptor + fmask descriptor */ - set_layout->binding[b].size = 64; - binding_buffer_count = 1; - alignment = 32; - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - /* main descriptor + fmask descriptor + sampler */ - set_layout->binding[b].size = 96; - binding_buffer_count = 1; - alignment = 32; - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - set_layout->binding[b].size = 16; - alignment = 16; break; default: - unreachable("unknown descriptor type\n"); break; } @@ -176,6 +192,7 @@ set_layout->binding[b].offset = set_layout->size; set_layout->binding[b].buffer_offset = buffer_count; set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; + set_layout->binding[b].size = descriptor_size(binding->descriptorType); if (variable_flags && binding->binding < variable_flags->bindingCount && (variable_flags->pBindingFlags[binding->binding] & @@ -187,9 +204,17 @@ set_layout->has_variable_descriptors = true; } - if (binding->pImmutableSamplers) { + if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + binding->pImmutableSamplers) { set_layout->binding[b].immutable_samplers_offset = samplers_offset; set_layout->has_immutable_samplers = true; + + for (uint32_t i = 0; i < binding->descriptorCount; i++) + samplers[i] = *tu_sampler_from_handle(binding->pImmutableSamplers[i]); + + samplers += binding->descriptorCount; + samplers_offset += sizeof(struct tu_sampler) * binding->descriptorCount; } set_layout->size += @@ -254,37 +279,8 @@ for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { const VkDescriptorSetLayoutBinding *binding = bindings + i; - uint64_t descriptor_size = 0; - uint64_t descriptor_alignment = 1; - switch (binding->descriptorType) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - descriptor_size = 16; - descriptor_alignment = 16; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - descriptor_size = 64; - descriptor_alignment = 32; - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - descriptor_size = 96; - descriptor_alignment = 32; - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - descriptor_size = 16; - descriptor_alignment = 16; - break; - default: - unreachable("unknown descriptor type\n"); - break; - } + uint64_t descriptor_sz = descriptor_size(binding->descriptorType); + uint64_t descriptor_alignment = 8; if (size && !align_u64(size, descriptor_alignment)) { supported = false; @@ -292,8 +288,8 @@ size = align_u64(size, descriptor_alignment); uint64_t max_count = UINT64_MAX; - if (descriptor_size) - max_count = (UINT64_MAX - size) / descriptor_size; + if (descriptor_sz) + max_count = (UINT64_MAX - size) / descriptor_sz; if (max_count < binding->descriptorCount) { supported = false; @@ -305,7 +301,7 @@ variable_count->maxVariableDescriptorCount = MIN2(UINT32_MAX, max_count); } - size += binding->descriptorCount * descriptor_size; + size += binding->descriptorCount * descriptor_sz; } free(bindings); @@ -394,6 +390,134 @@ #define EMPTY 1 +static VkResult +tu_descriptor_set_create(struct tu_device *device, + struct tu_descriptor_pool *pool, + const struct tu_descriptor_set_layout *layout, + const uint32_t *variable_count, + struct tu_descriptor_set **out_set) +{ + struct tu_descriptor_set *set; + uint32_t buffer_count = layout->buffer_count; + if (variable_count) { + unsigned stride = 1; + if (layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_SAMPLER || + layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + stride = 0; + buffer_count = layout->binding[layout->binding_count - 1].buffer_offset + + *variable_count * stride; + } + unsigned range_offset = sizeof(struct tu_descriptor_set) + + sizeof(struct tu_bo *) * buffer_count; + unsigned mem_size = range_offset + + sizeof(struct tu_descriptor_range) * layout->dynamic_offset_count; + + if (pool->host_memory_base) { + if (pool->host_memory_end - pool->host_memory_ptr < mem_size) + return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + + set = (struct tu_descriptor_set*)pool->host_memory_ptr; + pool->host_memory_ptr += mem_size; + } else { + set = vk_alloc2(&device->alloc, NULL, mem_size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!set) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memset(set, 0, mem_size); + + if (layout->dynamic_offset_count) { + set->dynamic_descriptors = (struct tu_descriptor_range*)((uint8_t*)set + range_offset); + } + + set->layout = layout; + uint32_t layout_size = layout->size; + if (variable_count) { + assert(layout->has_variable_descriptors); + uint32_t stride = layout->binding[layout->binding_count - 1].size; + if (layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + stride = 1; + + layout_size = layout->binding[layout->binding_count - 1].offset + + *variable_count * stride; + } + + if (layout_size) { + set->size = layout_size; + + if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) { + vk_free2(&device->alloc, NULL, set); + return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + } + + /* try to allocate linearly first, so that we don't spend + * time looking for gaps if the app only allocates & + * resets via the pool. */ + if (pool->current_offset + layout_size <= pool->size) { + set->mapped_ptr = (uint32_t*)(pool->bo.map + pool->current_offset); + set->va = pool->bo.iova + pool->current_offset; + if (!pool->host_memory_base) { + pool->entries[pool->entry_count].offset = pool->current_offset; + pool->entries[pool->entry_count].size = layout_size; + pool->entries[pool->entry_count].set = set; + pool->entry_count++; + } + pool->current_offset += layout_size; + } else if (!pool->host_memory_base) { + uint64_t offset = 0; + int index; + + for (index = 0; index < pool->entry_count; ++index) { + if (pool->entries[index].offset - offset >= layout_size) + break; + offset = pool->entries[index].offset + pool->entries[index].size; + } + + if (pool->size - offset < layout_size) { + vk_free2(&device->alloc, NULL, set); + return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + } + + set->mapped_ptr = (uint32_t*)(pool->bo.map + offset); + set->va = pool->bo.iova + offset; + memmove(&pool->entries[index + 1], &pool->entries[index], + sizeof(pool->entries[0]) * (pool->entry_count - index)); + pool->entries[index].offset = offset; + pool->entries[index].size = layout_size; + pool->entries[index].set = set; + pool->entry_count++; + } else + return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + } + + *out_set = set; + return VK_SUCCESS; +} + +static void +tu_descriptor_set_destroy(struct tu_device *device, + struct tu_descriptor_pool *pool, + struct tu_descriptor_set *set, + bool free_bo) +{ + assert(!pool->host_memory_base); + + if (free_bo && set->size && !pool->host_memory_base) { + uint32_t offset = (uint8_t*)set->mapped_ptr - (uint8_t*)pool->bo.map; + for (int i = 0; i < pool->entry_count; ++i) { + if (pool->entries[i].offset == offset) { + memmove(&pool->entries[i], &pool->entries[i+1], + sizeof(pool->entries[i]) * (pool->entry_count - i - 1)); + --pool->entry_count; + break; + } + } + } + vk_free2(&device->alloc, NULL, set); +} + VkResult tu_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pCreateInfo, @@ -401,8 +525,61 @@ VkDescriptorPool *pDescriptorPool) { TU_FROM_HANDLE(tu_device, device, _device); - tu_use_args(device); - tu_stub(); + struct tu_descriptor_pool *pool; + uint64_t size = sizeof(struct tu_descriptor_pool); + uint64_t bo_size = 0, bo_count = 0, range_count = 0; + + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER) + bo_count += pCreateInfo->pPoolSizes[i].descriptorCount; + + switch(pCreateInfo->pPoolSizes[i].type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + range_count += pCreateInfo->pPoolSizes[i].descriptorCount; + default: + break; + } + + bo_size += descriptor_size(pCreateInfo->pPoolSizes[i].type) * + pCreateInfo->pPoolSizes[i].descriptorCount; + } + + if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { + uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set); + host_size += sizeof(struct tu_bo*) * bo_count; + host_size += sizeof(struct tu_descriptor_range) * range_count; + size += host_size; + } else { + size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets; + } + + pool = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!pool) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + memset(pool, 0, sizeof(*pool)); + + if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { + pool->host_memory_base = (uint8_t*)pool + sizeof(struct tu_descriptor_pool); + pool->host_memory_ptr = pool->host_memory_base; + pool->host_memory_end = (uint8_t*)pool + size; + } + + if (bo_size) { + VkResult ret; + + ret = tu_bo_init_new(device, &pool->bo, bo_size); + assert(ret == VK_SUCCESS); + + ret = tu_bo_map(device, &pool->bo); + assert(ret == VK_SUCCESS); + } + pool->size = bo_size; + pool->max_entry_count = pCreateInfo->maxSets; + + *pDescriptorPool = tu_descriptor_pool_to_handle(pool); return VK_SUCCESS; } @@ -411,6 +588,21 @@ VkDescriptorPool _pool, const VkAllocationCallbacks *pAllocator) { + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_descriptor_pool, pool, _pool); + + if (!pool) + return; + + if (!pool->host_memory_base) { + for(int i = 0; i < pool->entry_count; ++i) { + tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false); + } + } + + if (pool->size) + tu_bo_finish(device, &pool->bo); + vk_free2(&device->alloc, pAllocator, pool); } VkResult @@ -421,8 +613,16 @@ TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool); - tu_use_args(device, pool); - tu_stub(); + if (!pool->host_memory_base) { + for(int i = 0; i < pool->entry_count; ++i) { + tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false); + } + pool->entry_count = 0; + } + + pool->current_offset = 0; + pool->host_memory_ptr = pool->host_memory_base; + return VK_SUCCESS; } @@ -434,9 +634,44 @@ TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, pAllocateInfo->descriptorPool); - tu_use_args(device, pool); - tu_stub(); - return VK_SUCCESS; + VkResult result = VK_SUCCESS; + uint32_t i; + struct tu_descriptor_set *set = NULL; + + const VkDescriptorSetVariableDescriptorCountAllocateInfoEXT *variable_counts = + vk_find_struct_const(pAllocateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO_EXT); + const uint32_t zero = 0; + + /* allocate a set of buffers for each shader to contain descriptors */ + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + TU_FROM_HANDLE(tu_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + + const uint32_t *variable_count = NULL; + if (variable_counts) { + if (i < variable_counts->descriptorSetCount) + variable_count = variable_counts->pDescriptorCounts + i; + else + variable_count = &zero; + } + + assert(!(layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + + result = tu_descriptor_set_create(device, pool, layout, variable_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = tu_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) { + tu_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool, + i, pDescriptorSets); + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + pDescriptorSets[i] = VK_NULL_HANDLE; + } + } + return result; } VkResult @@ -448,11 +683,119 @@ TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool); - tu_use_args(device, pool); - tu_stub(); + for (uint32_t i = 0; i < count; i++) { + TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]); + + if (set && !pool->host_memory_base) + tu_descriptor_set_destroy(device, pool, set, true); + } return VK_SUCCESS; } +static void write_texel_buffer_descriptor(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, + unsigned *dst, + struct tu_bo **buffer_list, + const VkBufferView buffer_view) +{ + TU_FROM_HANDLE(tu_buffer_view, view, buffer_view); + + memcpy(dst, view->descriptor, sizeof(view->descriptor)); + + if (cmd_buffer) + tu_bo_list_add(&cmd_buffer->bo_list, view->buffer->bo, MSM_SUBMIT_BO_READ); + else + *buffer_list = view->buffer->bo; +} + +static void write_buffer_descriptor(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, + unsigned *dst, + struct tu_bo **buffer_list, + const VkDescriptorBufferInfo *buffer_info) +{ + TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); + + uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset; + dst[0] = va; + dst[1] = va >> 32; + + if (cmd_buffer) + tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ); + else + *buffer_list = buffer->bo; +} + +static void write_dynamic_buffer_descriptor(struct tu_device *device, + struct tu_descriptor_range *range, + struct tu_bo **buffer_list, + const VkDescriptorBufferInfo *buffer_info) +{ + TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); + uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset; + unsigned size = buffer_info->range; + + if (buffer_info->range == VK_WHOLE_SIZE) + size = buffer->size - buffer_info->offset; + + range->va = va; + range->size = size; + + *buffer_list = buffer->bo; +} + +static void +write_image_descriptor(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, + unsigned *dst, + struct tu_bo **buffer_list, + VkDescriptorType descriptor_type, + const VkDescriptorImageInfo *image_info) +{ + TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView); + + memcpy(dst, iview->descriptor, sizeof(iview->descriptor)); + if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { + memcpy(&dst[A6XX_TEX_CONST_DWORDS], iview->storage_descriptor, + sizeof(iview->storage_descriptor)); + } + + if (cmd_buffer) + tu_bo_list_add(&cmd_buffer->bo_list, iview->image->bo, MSM_SUBMIT_BO_READ); + else + *buffer_list = iview->image->bo; +} + +static void +write_combined_image_sampler_descriptor(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, + unsigned sampler_offset, + unsigned *dst, + struct tu_bo **buffer_list, + VkDescriptorType descriptor_type, + const VkDescriptorImageInfo *image_info, + bool has_sampler) +{ + TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); + + write_image_descriptor(device, cmd_buffer, dst, buffer_list, + descriptor_type, image_info); + /* copy over sampler state */ + if (has_sampler) { + memcpy(dst + sampler_offset / sizeof(*dst), sampler, sizeof(*sampler)); + } +} + +static void +write_sampler_descriptor(struct tu_device *device, + unsigned *dst, + const VkDescriptorImageInfo *image_info) +{ + TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); + + memcpy(dst, sampler, sizeof(*sampler)); +} + void tu_update_descriptor_sets(struct tu_device *device, struct tu_cmd_buffer *cmd_buffer, @@ -462,6 +805,124 @@ uint32_t descriptorCopyCount, const VkCopyDescriptorSet *pDescriptorCopies) { + uint32_t i, j; + for (i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i]; + TU_FROM_HANDLE(tu_descriptor_set, set, + dstSetOverride ? dstSetOverride : writeset->dstSet); + const struct tu_descriptor_set_binding_layout *binding_layout = + set->layout->binding + writeset->dstBinding; + uint32_t *ptr = set->mapped_ptr; + struct tu_bo **buffer_list = set->descriptors; + + ptr += binding_layout->offset / 4; + + ptr += binding_layout->size * writeset->dstArrayElement / 4; + buffer_list += binding_layout->buffer_offset; + buffer_list += writeset->dstArrayElement; + for (j = 0; j < writeset->descriptorCount; ++j) { + switch(writeset->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + unsigned idx = writeset->dstArrayElement + j; + idx += binding_layout->dynamic_offset_offset; + assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + write_dynamic_buffer_descriptor(device, set->dynamic_descriptors + idx, + buffer_list, writeset->pBufferInfo + j); + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + write_buffer_descriptor(device, cmd_buffer, ptr, buffer_list, + writeset->pBufferInfo + j); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + write_texel_buffer_descriptor(device, cmd_buffer, ptr, buffer_list, + writeset->pTexelBufferView[j]); + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + write_image_descriptor(device, cmd_buffer, ptr, buffer_list, + writeset->descriptorType, + writeset->pImageInfo + j); + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + write_combined_image_sampler_descriptor(device, cmd_buffer, + A6XX_TEX_CONST_DWORDS * 4, + ptr, buffer_list, + writeset->descriptorType, + writeset->pImageInfo + j, + !binding_layout->immutable_samplers_offset); + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + write_sampler_descriptor(device, ptr, writeset->pImageInfo + j); + break; + default: + unreachable("unimplemented descriptor type"); + break; + } + ptr += binding_layout->size / 4; + ++buffer_list; + } + } + + for (i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copyset = &pDescriptorCopies[i]; + TU_FROM_HANDLE(tu_descriptor_set, src_set, + copyset->srcSet); + TU_FROM_HANDLE(tu_descriptor_set, dst_set, + copyset->dstSet); + const struct tu_descriptor_set_binding_layout *src_binding_layout = + src_set->layout->binding + copyset->srcBinding; + const struct tu_descriptor_set_binding_layout *dst_binding_layout = + dst_set->layout->binding + copyset->dstBinding; + uint32_t *src_ptr = src_set->mapped_ptr; + uint32_t *dst_ptr = dst_set->mapped_ptr; + struct tu_bo **src_buffer_list = src_set->descriptors; + struct tu_bo **dst_buffer_list = dst_set->descriptors; + + src_ptr += src_binding_layout->offset / 4; + dst_ptr += dst_binding_layout->offset / 4; + + src_ptr += src_binding_layout->size * copyset->srcArrayElement / 4; + dst_ptr += dst_binding_layout->size * copyset->dstArrayElement / 4; + + src_buffer_list += src_binding_layout->buffer_offset; + src_buffer_list += copyset->srcArrayElement; + + dst_buffer_list += dst_binding_layout->buffer_offset; + dst_buffer_list += copyset->dstArrayElement; + + for (j = 0; j < copyset->descriptorCount; ++j) { + switch (src_binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + unsigned src_idx = copyset->srcArrayElement + j; + unsigned dst_idx = copyset->dstArrayElement + j; + struct tu_descriptor_range *src_range, *dst_range; + src_idx += src_binding_layout->dynamic_offset_offset; + dst_idx += dst_binding_layout->dynamic_offset_offset; + + src_range = src_set->dynamic_descriptors + src_idx; + dst_range = dst_set->dynamic_descriptors + dst_idx; + *dst_range = *src_range; + break; + } + default: + memcpy(dst_ptr, src_ptr, src_binding_layout->size); + } + src_ptr += src_binding_layout->size / 4; + dst_ptr += dst_binding_layout->size / 4; + + if (src_binding_layout->type != VK_DESCRIPTOR_TYPE_SAMPLER) { + /* Sampler descriptors don't have a buffer list. */ + dst_buffer_list[j] = src_buffer_list[j]; + } + } + } } void diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_descriptor_set.h mesa-20.0.8/src/freedreno/vulkan/tu_descriptor_set.h --- mesa-19.2.8/src/freedreno/vulkan/tu_descriptor_set.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_descriptor_set.h 2020-06-12 01:21:16.000000000 +0000 @@ -92,11 +92,11 @@ unsigned char sha1[20]; }; -static inline const uint32_t * +static inline const struct tu_sampler* tu_immutable_samplers(const struct tu_descriptor_set_layout *set, const struct tu_descriptor_set_binding_layout *binding) { - return (const uint32_t *) ((const char *) set + + return (struct tu_sampler *) ((const char *) set + binding->immutable_samplers_offset); } #endif /* TU_DESCRIPTOR_SET_H */ diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_device.c mesa-20.0.8/src/freedreno/vulkan/tu_device.c --- mesa-19.2.8/src/freedreno/vulkan/tu_device.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_device.c 2020-06-12 01:21:16.000000000 +0000 @@ -257,8 +257,9 @@ switch (device->gpu_id) { case 630: - device->tile_align_w = 32; - device->tile_align_h = 32; + case 640: + device->tile_align_w = 64; + device->tile_align_h = 16; break; default: result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, @@ -353,6 +354,7 @@ { "startup", TU_DEBUG_STARTUP }, { "nir", TU_DEBUG_NIR }, { "ir3", TU_DEBUG_IR3 }, + { "nobin", TU_DEBUG_NOBIN }, { NULL, 0 } }; @@ -579,11 +581,11 @@ .largePoints = false, .alphaToOne = false, .multiViewport = false, - .samplerAnisotropy = false, - .textureCompressionETC2 = false, - .textureCompressionASTC_LDR = false, - .textureCompressionBC = false, - .occlusionQueryPrecise = false, + .samplerAnisotropy = true, + .textureCompressionETC2 = true, + .textureCompressionASTC_LDR = true, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, .pipelineStatisticsQuery = false, .vertexPipelineStoresAndAtomics = false, .fragmentStoresAndAtomics = false, @@ -700,7 +702,8 @@ VkPhysicalDeviceProperties *pProperties) { TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); - VkSampleCountFlags sample_counts = 0xf; + VkSampleCountFlags sample_counts = VK_SAMPLE_COUNT_1_BIT | + VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | VK_SAMPLE_COUNT_8_BIT; /* make sure that the entire descriptor set is addressable with a signed * 32-bit int. So the sum of all limits scaled by descriptor size has to @@ -723,7 +726,7 @@ .maxImageArrayLayers = (1 << 11), .maxTexelBufferElements = 128 * 1024 * 1024, .maxUniformBufferRange = UINT32_MAX, - .maxStorageBufferRange = UINT32_MAX, + .maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = UINT32_MAX, .maxSamplerAllocationCount = 64 * 1024, @@ -783,7 +786,7 @@ .viewportBoundsRange = { INT16_MIN, INT16_MAX }, .viewportSubPixelBits = 8, .minMemoryMapAlignment = 4096, /* A page */ - .minTexelBufferOffsetAlignment = 1, + .minTexelBufferOffsetAlignment = 64, .minUniformBufferOffsetAlignment = 4, .minStorageBufferOffsetAlignment = 4, .minTexelOffset = -32, @@ -807,7 +810,7 @@ .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, .maxSampleMaskWords = 1, - .timestampComputeAndGraphics = true, + .timestampComputeAndGraphics = false, /* FINISHME */ .timestampPeriod = 1, .maxClipDistances = 8, .maxCullDistances = 8, @@ -896,7 +899,7 @@ .queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, .queueCount = 1, - .timestampValidBits = 64, + .timestampValidBits = 0, /* FINISHME */ .minImageTransferGranularity = { 1, 1, 1 }, }; @@ -1560,7 +1563,7 @@ TU_FROM_HANDLE(tu_image, image, _image); pMemoryRequirements->memoryTypeBits = 1; - pMemoryRequirements->size = image->size; + pMemoryRequirements->size = image->layout.size; pMemoryRequirements->alignment = image->alignment; } @@ -1729,9 +1732,23 @@ if (!event) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + VkResult result = tu_bo_init_new(device, &event->bo, 0x1000); + if (result != VK_SUCCESS) + goto fail_alloc; + + result = tu_bo_map(device, &event->bo); + if (result != VK_SUCCESS) + goto fail_map; + *pEvent = tu_event_to_handle(event); return VK_SUCCESS; + +fail_map: + tu_bo_finish(device, &event->bo); +fail_alloc: + vk_free2(&device->alloc, pAllocator, event); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } void @@ -1744,6 +1761,8 @@ if (!event) return; + + tu_bo_finish(device, &event->bo); vk_free2(&device->alloc, pAllocator, event); } @@ -1752,7 +1771,7 @@ { TU_FROM_HANDLE(tu_event, event, _event); - if (*event->map == 1) + if (*(uint64_t*) event->bo.map == 1) return VK_EVENT_SET; return VK_EVENT_RESET; } @@ -1761,7 +1780,7 @@ tu_SetEvent(VkDevice _device, VkEvent _event) { TU_FROM_HANDLE(tu_event, event, _event); - *event->map = 1; + *(uint64_t*) event->bo.map = 1; return VK_SUCCESS; } @@ -1770,7 +1789,7 @@ tu_ResetEvent(VkDevice _device, VkEvent _event) { TU_FROM_HANDLE(tu_event, event, _event); - *event->map = 0; + *(uint64_t*) event->bo.map = 0; return VK_SUCCESS; } @@ -1872,11 +1891,78 @@ vk_free2(&device->alloc, pAllocator, fb); } +static enum a6xx_tex_clamp +tu6_tex_wrap(VkSamplerAddressMode address_mode, bool *needs_border) +{ + switch (address_mode) { + case VK_SAMPLER_ADDRESS_MODE_REPEAT: + return A6XX_TEX_REPEAT; + case VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT: + return A6XX_TEX_MIRROR_REPEAT; + case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE: + return A6XX_TEX_CLAMP_TO_EDGE; + case VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER: + *needs_border = true; + return A6XX_TEX_CLAMP_TO_BORDER; + case VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE: + /* only works for PoT.. need to emulate otherwise! */ + return A6XX_TEX_MIRROR_CLAMP; + default: + unreachable("illegal tex wrap mode"); + break; + } +} + +static enum a6xx_tex_filter +tu6_tex_filter(VkFilter filter, unsigned aniso) +{ + switch (filter) { + case VK_FILTER_NEAREST: + return A6XX_TEX_NEAREST; + case VK_FILTER_LINEAR: + return aniso ? A6XX_TEX_ANISO : A6XX_TEX_LINEAR; + case VK_FILTER_CUBIC_IMG: + default: + unreachable("illegal texture filter"); + break; + } +} + static void tu_init_sampler(struct tu_device *device, struct tu_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo) { + unsigned aniso = pCreateInfo->anisotropyEnable ? + util_last_bit(MIN2((uint32_t)pCreateInfo->maxAnisotropy >> 1, 8)) : 0; + bool miplinear = (pCreateInfo->mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR); + bool needs_border = false; + + sampler->state[0] = + COND(miplinear, A6XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | + A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(pCreateInfo->magFilter, aniso)) | + A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(pCreateInfo->minFilter, aniso)) | + A6XX_TEX_SAMP_0_ANISO(aniso) | + A6XX_TEX_SAMP_0_WRAP_S(tu6_tex_wrap(pCreateInfo->addressModeU, &needs_border)) | + A6XX_TEX_SAMP_0_WRAP_T(tu6_tex_wrap(pCreateInfo->addressModeV, &needs_border)) | + A6XX_TEX_SAMP_0_WRAP_R(tu6_tex_wrap(pCreateInfo->addressModeW, &needs_border)) | + A6XX_TEX_SAMP_0_LOD_BIAS(pCreateInfo->mipLodBias); + sampler->state[1] = + /* COND(!cso->seamless_cube_map, A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | */ + COND(pCreateInfo->unnormalizedCoordinates, A6XX_TEX_SAMP_1_UNNORM_COORDS) | + A6XX_TEX_SAMP_1_MIN_LOD(pCreateInfo->minLod) | + A6XX_TEX_SAMP_1_MAX_LOD(pCreateInfo->maxLod) | + COND(pCreateInfo->compareEnable, A6XX_TEX_SAMP_1_COMPARE_FUNC(pCreateInfo->compareOp)); + sampler->state[2] = 0; + sampler->state[3] = 0; + + /* TODO: + * A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR disables mipmapping, but vk has no NONE mipfilter? + * border color + */ + + sampler->needs_border = needs_border; + sampler->border = pCreateInfo->borderColor; } VkResult diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_extensions.py mesa-20.0.8/src/freedreno/vulkan/tu_extensions.py --- mesa-19.2.8/src/freedreno/vulkan/tu_extensions.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_extensions.py 2020-06-12 01:21:16.000000000 +0000 @@ -31,7 +31,7 @@ from mako.template import Template -MAX_API_VERSION = '1.1.82' +MAX_API_VERSION = '1.2.131' class Extension: def __init__(self, name, ext_version, enable): @@ -60,6 +60,7 @@ Extension('VK_KHR_maintenance1', 1, True), Extension('VK_KHR_maintenance2', 1, True), Extension('VK_KHR_maintenance3', 1, True), + Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), Extension('VK_KHR_surface', 25, 'TU_HAS_SURFACE'), Extension('VK_KHR_swapchain', 68, 'TU_HAS_SURFACE'), Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'), @@ -75,6 +76,7 @@ Extension('VK_KHR_external_memory', 1, True), Extension('VK_KHR_external_memory_fd', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), + Extension('VK_EXT_image_drm_format_modifier', 1, False), ] class VkVersion: diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_fence.c mesa-20.0.8/src/freedreno/vulkan/tu_fence.c --- mesa-19.2.8/src/freedreno/vulkan/tu_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_fence.c 2020-06-12 01:21:16.000000000 +0000 @@ -86,6 +86,7 @@ { fence->signaled = signaled; fence->fd = -1; + fence->fence_wsi = NULL; } void @@ -93,6 +94,8 @@ { if (fence->fd >= 0) close(fence->fd); + if (fence->fence_wsi) + fence->fence_wsi->destroy(fence->fence_wsi); } /** @@ -208,6 +211,10 @@ for (uint32_t i = 0; i < fence_count; i++) { TU_FROM_HANDLE(tu_fence, fence, fences[i]); + /* skip wsi fences */ + if (fence->fence_wsi) + continue; + if (fence->signaled) { if (wait_all) { /* skip signaled fences */ @@ -289,6 +296,10 @@ for (uint32_t i = 0; i < fence_count; i++) { TU_FROM_HANDLE(tu_fence, fence, fences[i]); + /* skip wsi fences */ + if (fence->fence_wsi) + continue; + /* no signaled fence in fds */ if (fence->signaled) continue; @@ -349,6 +360,18 @@ if (fds != stack_fds) vk_free(&device->alloc, fds); + if (result != VK_SUCCESS) + return result; + + for (uint32_t i = 0; i < fenceCount; ++i) { + TU_FROM_HANDLE(tu_fence, fence, pFences[i]); + if (fence->fence_wsi) { + VkResult result = fence->fence_wsi->wait(fence->fence_wsi, timeout); + if (result != VK_SUCCESS) + return result; + } + } + return result; } @@ -376,6 +399,15 @@ else if (err && errno != ETIME) return VK_ERROR_OUT_OF_HOST_MEMORY; } + if (fence->fence_wsi) { + VkResult result = fence->fence_wsi->wait(fence->fence_wsi, 0); + + if (result != VK_SUCCESS) { + if (result == VK_TIMEOUT) + return VK_NOT_READY; + return result; + } + } return fence->signaled ? VK_SUCCESS : VK_NOT_READY; } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_formats.c mesa-20.0.8/src/freedreno/vulkan/tu_formats.c --- mesa-19.2.8/src/freedreno/vulkan/tu_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_formats.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,10 +29,12 @@ #include "registers/a6xx.xml.h" #include "util/format_r11g11b10f.h" +#include "util/format_rgb9e5.h" #include "util/format_srgb.h" #include "util/u_half.h" #include "vk_format.h" #include "vk_util.h" +#include "drm-uapi/drm_fourcc.h" /** * Declare a format table. A format table is an array of tu_native_format. @@ -86,7 +88,7 @@ TU6_xTC(R4G4B4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, XYZW), /* 2 */ TU6_xTC(B4G4R4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, ZYXW), /* 3 */ TU6_xTC(R5G6B5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 4 */ - TU6_xTC(B5G6R5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 5 */ + TU6_xTC(B5G6R5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WZYX), /* 5 */ TU6_xxx(R5G5B5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 6 */ TU6_xxx(B5G5R5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 7 */ TU6_xTC(A1R5G5B5_UNORM_PACK16, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ), /* 8 */ @@ -119,12 +121,12 @@ TU6_xxx(R8G8B8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WZYX), /* 29 */ /* 24-bit BGR */ - TU6_Vxx(B8G8R8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 30 */ - TU6_Vxx(B8G8R8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WXYZ), /* 31 */ - TU6_Vxx(B8G8R8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 32 */ - TU6_Vxx(B8G8R8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 33 */ - TU6_Vxx(B8G8R8_UINT, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 34 */ - TU6_Vxx(B8G8R8_SINT, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 35 */ + TU6_xxx(B8G8R8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 30 */ + TU6_xxx(B8G8R8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WXYZ), /* 31 */ + TU6_xxx(B8G8R8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 32 */ + TU6_xxx(B8G8R8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 33 */ + TU6_xxx(B8G8R8_UINT, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 34 */ + TU6_xxx(B8G8R8_SINT, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 35 */ TU6_xxx(B8G8R8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 36 */ /* 32-bit RGBA */ @@ -178,8 +180,8 @@ /* 32-bit RG */ TU6_VTC(R16G16_UNORM, 16_16_UNORM, R16G16_UNORM, WZYX), /* 77 */ TU6_VTC(R16G16_SNORM, 16_16_SNORM, R16G16_SNORM, WZYX), /* 78 */ - TU6_VTx(R16G16_USCALED, 16_16_UINT, R16G16_UINT, WZYX), /* 79 */ - TU6_VTx(R16G16_SSCALED, 16_16_SINT, R16G16_SINT, WZYX), /* 80 */ + TU6_Vxx(R16G16_USCALED, 16_16_UINT, R16G16_UINT, WZYX), /* 79 */ + TU6_Vxx(R16G16_SSCALED, 16_16_SINT, R16G16_SINT, WZYX), /* 80 */ TU6_VTC(R16G16_UINT, 16_16_UINT, R16G16_UINT, WZYX), /* 81 */ TU6_VTC(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX), /* 82 */ TU6_VTC(R16G16_SFLOAT, 16_16_FLOAT, R16G16_FLOAT, WZYX), /* 83 */ @@ -196,8 +198,8 @@ /* 64-bit RGBA */ TU6_VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX), /* 91 */ TU6_VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX), /* 92 */ - TU6_VTx(R16G16B16A16_USCALED, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 93 */ - TU6_VTx(R16G16B16A16_SSCALED, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 94 */ + TU6_Vxx(R16G16B16A16_USCALED, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 93 */ + TU6_Vxx(R16G16B16A16_SSCALED, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 94 */ TU6_VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 95 */ TU6_VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 96 */ TU6_VTC(R16G16B16A16_SFLOAT, 16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX), /* 97 */ @@ -213,9 +215,9 @@ TU6_VTC(R32G32_SFLOAT, 32_32_FLOAT, R32G32_FLOAT, WZYX), /* 103 */ /* 96-bit RGB */ - TU6_VTx(R32G32B32_UINT, 32_32_32_UINT, R32G32B32_UINT, WZYX), /* 104 */ - TU6_VTx(R32G32B32_SINT, 32_32_32_SINT, R32G32B32_SINT, WZYX), /* 105 */ - TU6_VTx(R32G32B32_SFLOAT, 32_32_32_FLOAT, R32G32B32_FLOAT, WZYX), /* 106 */ + TU6_Vxx(R32G32B32_UINT, 32_32_32_UINT, R32G32B32_UINT, WZYX), /* 104 */ + TU6_Vxx(R32G32B32_SINT, 32_32_32_SINT, R32G32B32_SINT, WZYX), /* 105 */ + TU6_Vxx(R32G32B32_SFLOAT, 32_32_32_FLOAT, R32G32B32_FLOAT, WZYX), /* 106 */ /* 128-bit RGBA */ TU6_VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, R32G32B32A32_UINT, WZYX), /* 107 */ @@ -248,12 +250,12 @@ /* depth/stencil */ TU6_xTC(D16_UNORM, 16_UNORM, R16_UNORM, WZYX), /* 124 */ - TU6_xTC(X8_D24_UNORM_PACK32, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 125 */ + TU6_xTC(X8_D24_UNORM_PACK32, Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), /* 125 */ TU6_xTC(D32_SFLOAT, 32_FLOAT, R32_FLOAT, WZYX), /* 126 */ - TU6_xTC(S8_UINT, 8_UINT, R8_UNORM, WZYX), /* 127 */ + TU6_xTC(S8_UINT, 8_UINT, R8_UINT, WZYX), /* 127 */ TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, X8Z16_UNORM, WZYX), /* 128 */ - TU6_xTC(D24_UNORM_S8_UINT, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 129 */ - TU6_xTC(D32_SFLOAT_S8_UINT, 32_FLOAT, R32_FLOAT, WZYX), /* 130 */ + TU6_xTC(D24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), /* 129 */ + TU6_xxx(D32_SFLOAT_S8_UINT, x, x, WZYX), /* 130 */ /* compressed */ TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, DXT1, WZYX), /* 131 */ @@ -322,6 +324,14 @@ if (format >= tu6_format_table0_first && format <= tu6_format_table0_last) fmt = &tu6_format_table0[format - tu6_format_table0_first]; + if (!fmt || !fmt->present) + return NULL; + + if (vk_format_to_pipe_format(format) == PIPE_FORMAT_NONE) { + tu_finishme("vk_format %d missing matching pipe format.\n", format); + return NULL; + } + return (fmt && fmt->present) ? fmt : NULL; } @@ -335,8 +345,13 @@ case RB6_R8G8_UNORM: case RB6_R8G8_SNORM: case RB6_R8G8B8A8_UNORM: - case RB6_R8G8B8_UNORM: + case RB6_R8G8B8X8_UNORM: case RB6_R8G8B8A8_SNORM: + case RB6_R4G4B4A4_UNORM: + case RB6_R5G5B5A1_UNORM: + case RB6_R5G6B5_UNORM: + case RB6_Z24_UNORM_S8_UINT: + case RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: return R2D_UNORM8; case RB6_R32_UINT: @@ -353,6 +368,7 @@ case RB6_R16G16_SINT: case RB6_R16G16B16A16_UINT: case RB6_R16G16B16A16_SINT: + case RB6_R10G10B10A2_UINT: return R2D_INT16; case RB6_R8_UINT: @@ -377,23 +393,32 @@ case RB6_R16_FLOAT: case RB6_R16G16_FLOAT: case RB6_R16G16B16A16_FLOAT: + case RB6_R11G11B10_FLOAT: + case RB6_R10G10B10A2_UNORM: return R2D_FLOAT16; - case RB6_R4G4B4A4_UNORM: - case RB6_R5G5B5A1_UNORM: - case RB6_R5G6B5_UNORM: - case RB6_R10G10B10A2_UNORM: - case RB6_R10G10B10A2_UINT: - case RB6_R11G11B10_FLOAT: - case RB6_X8Z24_UNORM: - // ??? - return 0; default: unreachable("bad format"); return 0; } } +enum a6xx_depth_format +tu6_pipe2depth(VkFormat format) +{ + switch (format) { + case VK_FORMAT_D16_UNORM: + return DEPTH6_16; + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + return DEPTH6_24_8; + case VK_FORMAT_D32_SFLOAT: + return DEPTH6_32; + default: + return ~0; + } +} + static uint32_t tu_pack_mask(int bits) { @@ -483,32 +508,30 @@ static uint32_t tu_pack_clear_component_value(union tu_clear_component_value val, - const struct vk_format_channel_description *ch) + const struct util_format_channel_description *ch) { uint32_t packed; switch (ch->type) { - case VK_FORMAT_TYPE_UNSIGNED: + case UTIL_FORMAT_TYPE_UNSIGNED: /* normalized, scaled, or pure integer */ - assert(ch->normalized + ch->scaled + ch->pure_integer == 1); if (ch->normalized) packed = tu_pack_float32_for_unorm(val.float32, ch->size); - else if (ch->scaled) - packed = tu_pack_float32_for_uscaled(val.float32, ch->size); - else + else if (ch->pure_integer) packed = tu_pack_uint32_for_uint(val.uint32, ch->size); + else + packed = tu_pack_float32_for_uscaled(val.float32, ch->size); break; - case VK_FORMAT_TYPE_SIGNED: + case UTIL_FORMAT_TYPE_SIGNED: /* normalized, scaled, or pure integer */ - assert(ch->normalized + ch->scaled + ch->pure_integer == 1); if (ch->normalized) packed = tu_pack_float32_for_snorm(val.float32, ch->size); - else if (ch->scaled) - packed = tu_pack_float32_for_sscaled(val.float32, ch->size); - else + else if (ch->pure_integer) packed = tu_pack_int32_for_sint(val.int32, ch->size); + else + packed = tu_pack_float32_for_sscaled(val.float32, ch->size); break; - case VK_FORMAT_TYPE_FLOAT: + case UTIL_FORMAT_TYPE_FLOAT: packed = tu_pack_float32_for_sfloat(val.float32, ch->size); break; default: @@ -521,18 +544,18 @@ return packed; } -static const struct vk_format_channel_description * -tu_get_format_channel_description(const struct vk_format_description *desc, +static const struct util_format_channel_description * +tu_get_format_channel_description(const struct util_format_description *desc, int comp) { switch (desc->swizzle[comp]) { - case VK_SWIZZLE_X: + case PIPE_SWIZZLE_X: return &desc->channel[0]; - case VK_SWIZZLE_Y: + case PIPE_SWIZZLE_Y: return &desc->channel[1]; - case VK_SWIZZLE_Z: + case PIPE_SWIZZLE_Z: return &desc->channel[2]; - case VK_SWIZZLE_W: + case PIPE_SWIZZLE_W: return &desc->channel[3]; default: return NULL; @@ -540,18 +563,29 @@ } static union tu_clear_component_value -tu_get_clear_component_value(const VkClearValue *val, int comp, bool color) +tu_get_clear_component_value(const VkClearValue *val, int comp, + enum util_format_colorspace colorspace) { + assert(comp < 4); + union tu_clear_component_value tmp; - if (color) { - assert(comp < 4); - tmp.uint32 = val->color.uint32[comp]; - } else { + switch (colorspace) { + case UTIL_FORMAT_COLORSPACE_ZS: assert(comp < 2); if (comp == 0) tmp.float32 = val->depthStencil.depth; else tmp.uint32 = val->depthStencil.stencil; + break; + case UTIL_FORMAT_COLORSPACE_SRGB: + if (comp < 3) { + tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]); + break; + } + default: + assert(comp < 4); + tmp.uint32 = val->color.uint32[comp]; + break; } return tmp; @@ -564,11 +598,23 @@ * * Return the number of uint32_t's used. */ -int +void tu_pack_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) { - const struct vk_format_description *desc = vk_format_description(format); - assert(desc && desc->layout == VK_FORMAT_LAYOUT_PLAIN); + const struct util_format_description *desc = vk_format_description(format); + + switch (format) { + case VK_FORMAT_B10G11R11_UFLOAT_PACK32: + buf[0] = float3_to_r11g11b10f(val->color.float32); + return; + case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + buf[0] = float3_to_rgb9e5(val->color.float32); + return; + default: + break; + } + + assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); /* S8_UINT is special and has no depth */ const int max_components = @@ -577,15 +623,16 @@ int buf_offset = 0; int bit_shift = 0; for (int comp = 0; comp < max_components; comp++) { - const struct vk_format_channel_description *ch = + const struct util_format_channel_description *ch = tu_get_format_channel_description(desc, comp); if (!ch) { - assert(format == VK_FORMAT_S8_UINT && comp == 0); + assert((format == VK_FORMAT_S8_UINT && comp == 0) || + (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1)); continue; } union tu_clear_component_value v = tu_get_clear_component_value( - val, comp, desc->colorspace != VK_FORMAT_COLORSPACE_ZS); + val, comp, desc->colorspace); /* move to the next uint32_t when there is not enough space */ assert(ch->size <= 32); @@ -600,8 +647,76 @@ buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; bit_shift += ch->size; } +} - return buf_offset + 1; +void +tu_2d_clear_color(const VkClearColorValue *val, VkFormat format, uint32_t buf[4]) +{ + const struct util_format_description *desc = vk_format_description(format); + + /* not supported by 2D engine, cleared as U32 */ + if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { + buf[0] = float3_to_rgb9e5(val->float32); + return; + } + + enum a6xx_2d_ifmt ifmt = tu6_rb_fmt_to_ifmt(tu6_get_native_format(format)->rb); + + assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN || + format == VK_FORMAT_B10G11R11_UFLOAT_PACK32)); + + for (unsigned i = 0; i < desc->nr_channels; i++) { + const struct util_format_channel_description *ch = &desc->channel[i]; + + switch (ifmt) { + case R2D_INT32: + case R2D_INT16: + case R2D_INT8: + case R2D_FLOAT32: + buf[i] = val->uint32[i]; + break; + case R2D_FLOAT16: + buf[i] = util_float_to_half(val->float32[i]); + break; + case R2D_UNORM8: { + float linear = val->float32[i]; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3) + linear = util_format_linear_to_srgb_float(val->float32[i]); + + if (ch->type == UTIL_FORMAT_TYPE_SIGNED) + buf[i] = tu_pack_float32_for_snorm(linear, 8); + else + buf[i] = tu_pack_float32_for_unorm(linear, 8); + } break; + default: + unreachable("unexpected ifmt"); + break; + } + } +} + +void +tu_2d_clear_zs(const VkClearDepthStencilValue *val, VkFormat format, uint32_t buf[4]) +{ + switch (format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + buf[0] = tu_pack_float32_for_unorm(val->depth, 24); + buf[1] = buf[0] >> 8; + buf[2] = buf[0] >> 16; + buf[3] = val->stencil; + return; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D32_SFLOAT: + buf[0] = fui(val->depth); + return; + case VK_FORMAT_S8_UINT: + buf[0] = val->stencil; + return; + default: + unreachable("unexpected zs format"); + break; + } } static void @@ -610,37 +725,35 @@ VkFormat format, VkFormatProperties *out_properties) { - VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0; - const struct vk_format_description *desc = vk_format_description(format); + VkFormatFeatureFlags image = 0, buffer = 0; + const struct util_format_description *desc = vk_format_description(format); const struct tu_native_format *native_fmt = tu6_get_native_format(format); if (!desc || !native_fmt) { - out_properties->linearTilingFeatures = linear; - out_properties->optimalTilingFeatures = tiled; - out_properties->bufferFeatures = buffer; - return; + goto end; } - linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; - tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + if (native_fmt->vtx >= 0) { + buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; + } + + if (native_fmt->tex >= 0 || native_fmt->rb >= 0) + image |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; if (native_fmt->tex >= 0) { - linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; + image |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; } - if (native_fmt->rb >= 0) { - linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; - tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; - } + if (native_fmt->rb >= 0) + image |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - if (native_fmt->vtx >= 0) { - buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; - } + if (tu6_pipe2depth(format) != (enum a6xx_depth_format)~0) + image |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; - out_properties->linearTilingFeatures = linear; - out_properties->optimalTilingFeatures = tiled; +end: + out_properties->linearTilingFeatures = image; + out_properties->optimalTilingFeatures = image; out_properties->bufferFeatures = buffer; } @@ -665,6 +778,24 @@ tu_physical_device_get_format_properties( physical_device, format, &pFormatProperties->formatProperties); + + VkDrmFormatModifierPropertiesListEXT *list = + vk_find_struct(pFormatProperties->pNext, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT); + if (list) { + VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + vk_outarray_append(&out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; + mod_props->drmFormatModifierPlaneCount = 1; + } + + /* TODO: any cases where this should be disabled? */ + vk_outarray_append(&out, mod_props) { + mod_props->drmFormatModifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; + mod_props->drmFormatModifierPlaneCount = 1; + } + } } static VkResult @@ -683,13 +814,8 @@ tu_physical_device_get_format_properties(physical_device, info->format, &format_props); - if (info->tiling == VK_IMAGE_TILING_LINEAR) { - format_feature_flags = format_props.linearTilingFeatures; - } else if (info->tiling == VK_IMAGE_TILING_OPTIMAL) { - format_feature_flags = format_props.optimalTilingFeatures; - } else { - unreachable("bad VkImageTiling"); - } + assert(format_props.optimalTilingFeatures == format_props.linearTilingFeatures); + format_feature_flags = format_props.optimalTilingFeatures; if (format_feature_flags == 0) goto unsupported; @@ -731,8 +857,10 @@ VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) && !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) { - sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | - VK_SAMPLE_COUNT_8_BIT; + sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + /* 8x MSAA on 128bpp formats doesn't seem to work */ + if (vk_format_get_blocksize(info->format) <= 8) + sampleCounts |= VK_SAMPLE_COUNT_8_BIT; } if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) { diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_image.c mesa-20.0.8/src/freedreno/vulkan/tu_image.c --- mesa-19.2.8/src/freedreno/vulkan/tu_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_image.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,109 +29,36 @@ #include "util/debug.h" #include "util/u_atomic.h" +#include "util/format/u_format.h" #include "vk_format.h" #include "vk_util.h" +#include "drm-uapi/drm_fourcc.h" static inline bool -image_level_linear(struct tu_image *image, int level) +image_level_linear(struct tu_image *image, int level, bool ubwc) { unsigned w = u_minify(image->extent.width, level); - return w < 16; + /* all levels are tiled/compressed with UBWC */ + return ubwc ? false : (w < 16); } -/* indexed by cpp: */ -static const struct +enum a6xx_tile_mode +tu6_get_image_tile_mode(struct tu_image *image, int level) { - unsigned pitchalign; - unsigned heightalign; -} tile_alignment[] = { - [1] = { 128, 32 }, [2] = { 128, 16 }, [3] = { 128, 16 }, [4] = { 64, 16 }, - [8] = { 64, 16 }, [12] = { 64, 16 }, [16] = { 64, 16 }, -}; - -static void -setup_slices(struct tu_image *image, const VkImageCreateInfo *pCreateInfo) -{ - enum vk_format_layout layout = - vk_format_description(pCreateInfo->format)->layout; - uint32_t layer_size = 0; - uint32_t width = pCreateInfo->extent.width; - uint32_t height = pCreateInfo->extent.height; - uint32_t depth = pCreateInfo->extent.depth; - bool layer_first = pCreateInfo->imageType != VK_IMAGE_TYPE_3D; - uint32_t alignment = pCreateInfo->imageType == VK_IMAGE_TYPE_3D ? 4096 : 1; - uint32_t cpp = vk_format_get_blocksize(pCreateInfo->format); - - uint32_t heightalign = tile_alignment[cpp].heightalign; - - for (unsigned level = 0; level < pCreateInfo->mipLevels; level++) { - struct tu_image_level *slice = &image->levels[level]; - bool linear_level = image_level_linear(image, level); - uint32_t aligned_height = height; - uint32_t blocks; - uint32_t pitchalign; - - if (image->tile_mode && !linear_level) { - pitchalign = tile_alignment[cpp].pitchalign; - aligned_height = align(aligned_height, heightalign); - } else { - pitchalign = 64; - - /* The blits used for mem<->gmem work at a granularity of - * 32x32, which can cause faults due to over-fetch on the - * last level. The simple solution is to over-allocate a - * bit the last level to ensure any over-fetch is harmless. - * The pitch is already sufficiently aligned, but height - * may not be: - */ - if ((level + 1 == pCreateInfo->mipLevels)) - aligned_height = align(aligned_height, 32); - } - - if (layout == VK_FORMAT_LAYOUT_ASTC) - slice->pitch = util_align_npot( - width, - pitchalign * vk_format_get_blockwidth(pCreateInfo->format)); - else - slice->pitch = align(width, pitchalign); - - slice->offset = layer_size; - blocks = vk_format_get_block_count(pCreateInfo->format, slice->pitch, - aligned_height); - - /* 1d array and 2d array textures must all have the same layer size - * for each miplevel on a3xx. 3d textures can have different layer - * sizes for high levels, but the hw auto-sizer is buggy (or at least - * different than what this code does), so as soon as the layer size - * range gets into range, we stop reducing it. - */ - if (pCreateInfo->imageType == VK_IMAGE_TYPE_3D && - (level == 1 || - (level > 1 && image->levels[level - 1].size > 0xf000))) - slice->size = align(blocks * cpp, alignment); - else if (level == 0 || layer_first || alignment == 1) - slice->size = align(blocks * cpp, alignment); - else - slice->size = image->levels[level - 1].size; - - layer_size += slice->size * depth; - - width = u_minify(width, 1); - height = u_minify(height, 1); - depth = u_minify(depth, 1); - } - - image->layer_size = layer_size; + if (image_level_linear(image, level, !!image->layout.ubwc_size)) + return TILE6_LINEAR; + else + return image->layout.tile_mode; } VkResult tu_image_create(VkDevice _device, - const struct tu_image_create_info *create_info, + const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *alloc, - VkImage *pImage) + VkImage *pImage, + uint64_t modifier) { TU_FROM_HANDLE(tu_device, device, _device); - const VkImageCreateInfo *pCreateInfo = create_info->vk_info; struct tu_image *image = NULL; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); @@ -156,6 +83,7 @@ image->extent = pCreateInfo->extent; image->level_count = pCreateInfo->mipLevels; image->layer_count = pCreateInfo->arrayLayers; + image->samples = pCreateInfo->samples; image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE; if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) { @@ -172,15 +100,143 @@ vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO) != NULL; - image->tile_mode = pCreateInfo->tiling == VK_IMAGE_TILING_OPTIMAL ? 3 : 0; - setup_slices(image, pCreateInfo); + image->layout.tile_mode = TILE6_3; + bool ubwc_enabled = true; + + /* disable tiling when linear is requested and for compressed formats */ + if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR || + modifier == DRM_FORMAT_MOD_LINEAR || + vk_format_is_compressed(image->vk_format)) { + image->layout.tile_mode = TILE6_LINEAR; + ubwc_enabled = false; + } + + /* using UBWC with D24S8 breaks the "stencil read" copy path (why?) + * (causes any deqp tests that need to check stencil to fail) + * disable UBWC for this format until we properly support copy aspect masks + */ + if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) + ubwc_enabled = false; + + /* UBWC can't be used with E5B9G9R9 */ + if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) + ubwc_enabled = false; + + if (image->extent.depth > 1) { + tu_finishme("UBWC with 3D textures"); + ubwc_enabled = false; + } + + /* Disable UBWC for storage images. + * + * The closed GL driver skips UBWC for storage images (and additionally + * uses linear for writeonly images). We seem to have image tiling working + * in freedreno in general, so turnip matches that. freedreno also enables + * UBWC on images, but it's not really tested due to the lack of + * UBWC-enabled mipmaps in freedreno currently. Just match the closed GL + * behavior of no UBWC. + */ + if (image->usage & VK_IMAGE_USAGE_STORAGE_BIT) + ubwc_enabled = false; + + uint32_t ubwc_blockwidth, ubwc_blockheight; + fdl6_get_ubwc_blockwidth(&image->layout, + &ubwc_blockwidth, &ubwc_blockheight); + if (!ubwc_blockwidth) { + tu_finishme("UBWC for cpp=%d", image->layout.cpp); + ubwc_enabled = false; + } + + /* expect UBWC enabled if we asked for it */ + assert(modifier != DRM_FORMAT_MOD_QCOM_COMPRESSED || ubwc_enabled); + + fdl6_layout(&image->layout, vk_format_to_pipe_format(image->vk_format), + image->samples, + pCreateInfo->extent.width, + pCreateInfo->extent.height, + pCreateInfo->extent.depth, + pCreateInfo->mipLevels, + pCreateInfo->arrayLayers, + pCreateInfo->imageType == VK_IMAGE_TYPE_3D, + ubwc_enabled); - image->size = image->layer_size * pCreateInfo->arrayLayers; *pImage = tu_image_to_handle(image); return VK_SUCCESS; } +static enum a6xx_tex_fetchsize +tu6_fetchsize(VkFormat format) +{ + if (vk_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC) + return TFETCH6_16_BYTE; + + switch (vk_format_get_blocksize(format) / vk_format_get_blockwidth(format)) { + case 1: return TFETCH6_1_BYTE; + case 2: return TFETCH6_2_BYTE; + case 4: return TFETCH6_4_BYTE; + case 8: return TFETCH6_8_BYTE; + case 16: return TFETCH6_16_BYTE; + default: + unreachable("bad block size"); + } +} + +static uint32_t +tu6_texswiz(const VkComponentMapping *comps, + VkFormat format, + VkImageAspectFlagBits aspect_mask) +{ + unsigned char swiz[4] = {comps->r, comps->g, comps->b, comps->a}; + unsigned char vk_swizzle[] = { + [VK_COMPONENT_SWIZZLE_ZERO] = A6XX_TEX_ZERO, + [VK_COMPONENT_SWIZZLE_ONE] = A6XX_TEX_ONE, + [VK_COMPONENT_SWIZZLE_R] = A6XX_TEX_X, + [VK_COMPONENT_SWIZZLE_G] = A6XX_TEX_Y, + [VK_COMPONENT_SWIZZLE_B] = A6XX_TEX_Z, + [VK_COMPONENT_SWIZZLE_A] = A6XX_TEX_W, + }; + const unsigned char *fmt_swiz = vk_format_description(format)->swizzle; + + for (unsigned i = 0; i < 4; i++) { + swiz[i] = (swiz[i] == VK_COMPONENT_SWIZZLE_IDENTITY) ? i : vk_swizzle[swiz[i]]; + /* if format has 0/1 in channel, use that (needed for bc1_rgb) */ + if (swiz[i] < 4) { + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && + format == VK_FORMAT_D24_UNORM_S8_UINT) + swiz[i] = A6XX_TEX_Y; + switch (fmt_swiz[swiz[i]]) { + case PIPE_SWIZZLE_0: swiz[i] = A6XX_TEX_ZERO; break; + case PIPE_SWIZZLE_1: swiz[i] = A6XX_TEX_ONE; break; + } + } + } + + return A6XX_TEX_CONST_0_SWIZ_X(swiz[0]) | + A6XX_TEX_CONST_0_SWIZ_Y(swiz[1]) | + A6XX_TEX_CONST_0_SWIZ_Z(swiz[2]) | + A6XX_TEX_CONST_0_SWIZ_W(swiz[3]); +} + +static enum a6xx_tex_type +tu6_tex_type(VkImageViewType type) +{ + switch (type) { + default: + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + return A6XX_TEX_1D; + case VK_IMAGE_VIEW_TYPE_2D: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + return A6XX_TEX_2D; + case VK_IMAGE_VIEW_TYPE_3D: + return A6XX_TEX_3D; + case VK_IMAGE_VIEW_TYPE_CUBE: + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + return A6XX_TEX_CUBE; + } +} + void tu_image_view_init(struct tu_image_view *iview, struct tu_device *device, @@ -208,12 +264,6 @@ iview->vk_format = pCreateInfo->format; iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; - if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) { - iview->vk_format = vk_format_stencil_only(iview->vk_format); - } else if (iview->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) { - iview->vk_format = vk_format_depth_only(iview->vk_format); - } - // should we minify? iview->extent = image->extent; @@ -221,6 +271,88 @@ iview->layer_count = tu_get_layerCount(image, range); iview->base_mip = range->baseMipLevel; iview->level_count = tu_get_levelCount(image, range); + + memset(iview->descriptor, 0, sizeof(iview->descriptor)); + + const struct tu_native_format *fmt = tu6_get_native_format(iview->vk_format); + uint64_t base_addr = tu_image_base(image, iview->base_mip, iview->base_layer); + uint64_t ubwc_addr = tu_image_ubwc_base(image, iview->base_mip, iview->base_layer); + + uint32_t pitch = tu_image_stride(image, iview->base_mip) / vk_format_get_blockwidth(iview->vk_format); + enum a6xx_tile_mode tile_mode = tu6_get_image_tile_mode(image, iview->base_mip); + uint32_t width = u_minify(image->extent.width, iview->base_mip); + uint32_t height = u_minify(image->extent.height, iview->base_mip); + uint32_t depth = pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D ? + u_minify(image->extent.depth, iview->base_mip) : iview->layer_count; + + unsigned fmt_tex = fmt->tex; + if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && + iview->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) + fmt_tex = TFMT6_S8Z24_UINT; + + iview->descriptor[0] = + A6XX_TEX_CONST_0_TILE_MODE(tile_mode) | + COND(vk_format_is_srgb(iview->vk_format), A6XX_TEX_CONST_0_SRGB) | + A6XX_TEX_CONST_0_FMT(fmt_tex) | + A6XX_TEX_CONST_0_SAMPLES(tu_msaa_samples(image->samples)) | + A6XX_TEX_CONST_0_SWAP(image->layout.tile_mode ? WZYX : fmt->swap) | + tu6_texswiz(&pCreateInfo->components, iview->vk_format, iview->aspect_mask) | + A6XX_TEX_CONST_0_MIPLVLS(iview->level_count - 1); + iview->descriptor[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); + iview->descriptor[2] = + A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(iview->vk_format)) | + A6XX_TEX_CONST_2_PITCH(pitch) | + A6XX_TEX_CONST_2_TYPE(tu6_tex_type(pCreateInfo->viewType)); + iview->descriptor[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(tu_layer_size(image, iview->base_mip)); + iview->descriptor[4] = base_addr; + iview->descriptor[5] = (base_addr >> 32) | A6XX_TEX_CONST_5_DEPTH(depth); + + if (image->layout.ubwc_size) { + uint32_t block_width, block_height; + fdl6_get_ubwc_blockwidth(&image->layout, + &block_width, &block_height); + + iview->descriptor[3] |= A6XX_TEX_CONST_3_FLAG | A6XX_TEX_CONST_3_TILE_ALL; + iview->descriptor[7] = ubwc_addr; + iview->descriptor[8] = ubwc_addr >> 32; + iview->descriptor[9] |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(tu_image_ubwc_size(image, iview->base_mip) >> 2); + iview->descriptor[10] |= + A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(tu_image_ubwc_pitch(image, iview->base_mip)) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil(DIV_ROUND_UP(width, block_width))) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil(DIV_ROUND_UP(height, block_height))); + } + + if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D) { + iview->descriptor[3] |= + A6XX_TEX_CONST_3_MIN_LAYERSZ(image->layout.slices[image->level_count - 1].size0); + } + + if (image->usage & VK_IMAGE_USAGE_STORAGE_BIT) { + memset(iview->storage_descriptor, 0, sizeof(iview->storage_descriptor)); + + iview->storage_descriptor[0] = + A6XX_IBO_0_FMT(fmt->tex) | + A6XX_IBO_0_TILE_MODE(tile_mode); + iview->storage_descriptor[1] = + A6XX_IBO_1_WIDTH(width) | + A6XX_IBO_1_HEIGHT(height); + iview->storage_descriptor[2] = + A6XX_IBO_2_PITCH(pitch) | + A6XX_IBO_2_TYPE(tu6_tex_type(pCreateInfo->viewType)); + iview->storage_descriptor[3] = A6XX_IBO_3_ARRAY_PITCH(tu_layer_size(image, iview->base_mip)); + + iview->storage_descriptor[4] = base_addr; + iview->storage_descriptor[5] = (base_addr >> 32) | A6XX_IBO_5_DEPTH(depth); + + if (image->layout.ubwc_size) { + iview->storage_descriptor[3] |= A6XX_IBO_3_FLAG | A6XX_IBO_3_UNK27; + iview->storage_descriptor[7] |= ubwc_addr; + iview->storage_descriptor[8] |= ubwc_addr >> 32; + iview->storage_descriptor[9] = A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(tu_image_ubwc_size(image, iview->base_mip) >> 2); + iview->storage_descriptor[10] = + A6XX_IBO_10_FLAG_BUFFER_PITCH(tu_image_ubwc_pitch(image, iview->base_mip)); + } + } } unsigned @@ -252,12 +384,25 @@ pAllocator, pImage); #endif - return tu_image_create(device, - &(struct tu_image_create_info) { - .vk_info = pCreateInfo, - .scanout = false, - }, - pAllocator, pImage); + uint64_t modifier = DRM_FORMAT_MOD_INVALID; + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + + modifier = DRM_FORMAT_MOD_LINEAR; + for (unsigned i = 0; i < mod_info->drmFormatModifierCount; i++) { + if (mod_info->pDrmFormatModifiers[i] == DRM_FORMAT_MOD_QCOM_COMPRESSED) + modifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; + } + } else { + const struct wsi_image_create_info *wsi_info = + vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA); + if (wsi_info && wsi_info->scanout) + modifier = DRM_FORMAT_MOD_LINEAR; + } + + return tu_image_create(device, pCreateInfo, pAllocator, pImage, modifier); } void @@ -285,18 +430,48 @@ { TU_FROM_HANDLE(tu_image, image, _image); - const uint32_t layer_offset = image->layer_size * pSubresource->arrayLayer; - const struct tu_image_level *level = - image->levels + pSubresource->mipLevel; + const struct fdl_slice *slice = image->layout.slices + pSubresource->mipLevel; - pLayout->offset = layer_offset + level->offset; - pLayout->size = level->size; + pLayout->offset = fdl_surface_offset(&image->layout, + pSubresource->mipLevel, + pSubresource->arrayLayer); + pLayout->size = slice->size0; pLayout->rowPitch = - level->pitch * vk_format_get_blocksize(image->vk_format); - pLayout->arrayPitch = image->layer_size; - pLayout->depthPitch = level->size; + slice->pitch * vk_format_get_blocksize(image->vk_format); + pLayout->arrayPitch = image->layout.layer_size; + pLayout->depthPitch = slice->size0; + + if (image->layout.ubwc_size) { + /* UBWC starts at offset 0 */ + pLayout->offset = 0; + /* UBWC scanout won't match what the kernel wants if we have levels/layers */ + assert(image->level_count == 1 && image->layer_count == 1); + } } +VkResult tu_GetImageDrmFormatModifierPropertiesEXT( + VkDevice device, + VkImage _image, + VkImageDrmFormatModifierPropertiesEXT* pProperties) +{ + TU_FROM_HANDLE(tu_image, image, _image); + + assert(pProperties->sType == + VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT); + + /* TODO invent a modifier for tiled but not UBWC buffers */ + + if (!image->layout.tile_mode) + pProperties->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; + else if (image->layout.ubwc_size) + pProperties->drmFormatModifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; + else + pProperties->drmFormatModifier = DRM_FORMAT_MOD_INVALID; + + return VK_SUCCESS; +} + + VkResult tu_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, @@ -338,10 +513,45 @@ { TU_FROM_HANDLE(tu_buffer, buffer, pCreateInfo->buffer); - view->range = pCreateInfo->range == VK_WHOLE_SIZE - ? buffer->size - pCreateInfo->offset - : pCreateInfo->range; - view->vk_format = pCreateInfo->format; + view->buffer = buffer; + + enum VkFormat vfmt = pCreateInfo->format; + enum pipe_format pfmt = vk_format_to_pipe_format(vfmt); + const struct tu_native_format *fmt = tu6_get_native_format(vfmt); + + uint32_t range; + if (pCreateInfo->range == VK_WHOLE_SIZE) + range = buffer->size - pCreateInfo->offset; + else + range = pCreateInfo->range; + uint32_t elements = range / util_format_get_blocksize(pfmt); + + static const VkComponentMapping components = { + .r = VK_COMPONENT_SWIZZLE_R, + .g = VK_COMPONENT_SWIZZLE_G, + .b = VK_COMPONENT_SWIZZLE_B, + .a = VK_COMPONENT_SWIZZLE_A, + }; + + uint64_t iova = tu_buffer_iova(buffer) + pCreateInfo->offset; + + memset(&view->descriptor, 0, sizeof(view->descriptor)); + + view->descriptor[0] = + A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | + A6XX_TEX_CONST_0_SWAP(fmt->swap) | + A6XX_TEX_CONST_0_FMT(fmt->tex) | + A6XX_TEX_CONST_0_MIPLVLS(0) | + tu6_texswiz(&components, vfmt, VK_IMAGE_ASPECT_COLOR_BIT); + COND(vk_format_is_srgb(vfmt), A6XX_TEX_CONST_0_SRGB); + view->descriptor[1] = + A6XX_TEX_CONST_1_WIDTH(elements & MASK(15)) | + A6XX_TEX_CONST_1_HEIGHT(elements >> 15); + view->descriptor[2] = + A6XX_TEX_CONST_2_UNK4 | + A6XX_TEX_CONST_2_UNK31; + view->descriptor[4] = iova; + view->descriptor[5] = iova >> 32; } VkResult diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_meta_blit.c mesa-20.0.8/src/freedreno/vulkan/tu_meta_blit.c --- mesa-19.2.8/src/freedreno/vulkan/tu_meta_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_meta_blit.c 2020-06-12 01:21:16.000000000 +0000 @@ -23,7 +23,49 @@ #include "tu_private.h" -#include "nir/nir_builder.h" +#include "tu_blit.h" + +static void +tu_blit_image(struct tu_cmd_buffer *cmdbuf, + struct tu_image *src_image, + struct tu_image *dst_image, + const VkImageBlit *info, + VkFilter filter) +{ + static const enum a6xx_rotation rotate[2][2] = { + {ROTATE_0, ROTATE_HFLIP}, + {ROTATE_VFLIP, ROTATE_180}, + }; + bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) != + (info->dstOffsets[1].x < info->dstOffsets[0].x); + bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) != + (info->dstOffsets[1].y < info->dstOffsets[0].y); + bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) != + (info->dstOffsets[1].z < info->dstOffsets[0].z); + + if (mirror_z) { + tu_finishme("blit z mirror\n"); + return; + } + + if (info->srcOffsets[1].z - info->srcOffsets[0].z != + info->dstOffsets[1].z - info->dstOffsets[0].z) { + tu_finishme("blit z filter\n"); + return; + } + assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount); + + struct tu_blit blt = { + .dst = tu_blit_surf(dst_image, info->dstSubresource, info->dstOffsets), + .src = tu_blit_surf(src_image, info->srcSubresource, info->srcOffsets), + .layers = MAX2(info->srcOffsets[1].z - info->srcOffsets[0].z, + info->dstSubresource.layerCount), + .filter = filter == VK_FILTER_LINEAR, + .rotation = rotate[mirror_y][mirror_x], + }; + + tu_blit(cmdbuf, &blt); +} void tu_CmdBlitImage(VkCommandBuffer commandBuffer, @@ -36,4 +78,14 @@ VkFilter filter) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_image, src_image, srcImage); + TU_FROM_HANDLE(tu_image, dst_image, destImage); + + tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (uint32_t i = 0; i < regionCount; ++i) { + tu_blit_image(cmdbuf, src_image, dst_image, pRegions + i, filter); + } } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_meta_buffer.c mesa-20.0.8/src/freedreno/vulkan/tu_meta_buffer.c --- mesa-19.2.8/src/freedreno/vulkan/tu_meta_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_meta_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -1,4 +1,6 @@ #include "tu_private.h" +#include "tu_blit.h" +#include "tu_cs.h" void tu_CmdFillBuffer(VkCommandBuffer commandBuffer, @@ -7,6 +9,27 @@ VkDeviceSize fillSize, uint32_t data) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); + + if (fillSize == VK_WHOLE_SIZE) + fillSize = buffer->size - dstOffset; + + tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); + + tu_blit(cmd, &(struct tu_blit) { + .dst = { + .fmt = VK_FORMAT_R32_UINT, + .va = tu_buffer_iova(buffer) + dstOffset, + .width = fillSize / 4, + .height = 1, + .samples = 1, + }, + .layers = 1, + .clear_value[0] = data, + .type = TU_BLIT_CLEAR, + .buffer = true, + }); } void @@ -16,4 +39,37 @@ VkDeviceSize dataSize, const void *pData) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); + + tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); + + struct ts_cs_memory tmp; + VkResult result = tu_cs_alloc(cmd->device, &cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + memcpy(tmp.map, pData, dataSize); + + tu_blit(cmd, &(struct tu_blit) { + .dst = { + .fmt = VK_FORMAT_R32_UINT, + .va = tu_buffer_iova(buffer) + dstOffset, + .width = dataSize / 4, + .height = 1, + .samples = 1, + }, + .src = { + .fmt = VK_FORMAT_R32_UINT, + .va = tmp.iova, + .width = dataSize / 4, + .height = 1, + .samples = 1, + }, + .layers = 1, + .type = TU_BLIT_COPY, + .buffer = true, + }); } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_meta_clear.c mesa-20.0.8/src/freedreno/vulkan/tu_meta_clear.c --- mesa-19.2.8/src/freedreno/vulkan/tu_meta_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_meta_clear.c 2020-06-12 01:21:16.000000000 +0000 @@ -22,6 +22,35 @@ */ #include "tu_private.h" +#include "tu_blit.h" +#include "tu_cs.h" + +static void +clear_image(struct tu_cmd_buffer *cmdbuf, + struct tu_image *image, + uint32_t clear_value[4], + const VkImageSubresourceRange *range) +{ + uint32_t level_count = tu_get_levelCount(image, range); + uint32_t layer_count = tu_get_layerCount(image, range); + + if (image->type == VK_IMAGE_TYPE_3D) { + assert(layer_count == 1); + assert(range->baseArrayLayer == 0); + } + + for (unsigned j = 0; j < level_count; j++) { + if (image->type == VK_IMAGE_TYPE_3D) + layer_count = u_minify(image->extent.depth, range->baseMipLevel + j); + + tu_blit(cmdbuf, &(struct tu_blit) { + .dst = tu_blit_surf_whole(image, range->baseMipLevel + j, range->baseArrayLayer), + .layers = layer_count, + .clear_value = {clear_value[0], clear_value[1], clear_value[2], clear_value[3]}, + .type = TU_BLIT_CLEAR, + }); + } +} void tu_CmdClearColorImage(VkCommandBuffer commandBuffer, @@ -31,6 +60,16 @@ uint32_t rangeCount, const VkImageSubresourceRange *pRanges) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_image, image, image_h); + uint32_t clear_value[4] = {}; + + tu_2d_clear_color(pColor, image->vk_format, clear_value); + + tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < rangeCount; i++) + clear_image(cmdbuf, image, clear_value, pRanges + i); } void @@ -41,6 +80,16 @@ uint32_t rangeCount, const VkImageSubresourceRange *pRanges) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_image, image, image_h); + uint32_t clear_value[4] = {}; + + tu_2d_clear_zs(pDepthStencil, image->vk_format, clear_value); + + tu_bo_list_add(&cmdbuf->bo_list, image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < rangeCount; i++) + clear_image(cmdbuf, image, clear_value, pRanges + i); } void @@ -50,4 +99,74 @@ uint32_t rectCount, const VkClearRect *pRects) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + const struct tu_subpass *subpass = cmd->state.subpass; + struct tu_cs *cs = &cmd->draw_cs; + + VkResult result = tu_cs_reserve_space(cmd->device, cs, + rectCount * (3 + 15 * attachmentCount)); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + /* TODO: deal with layered rendering (when layered rendering is implemented) + * TODO: disable bypass rendering for subpass (when bypass is implemented) + */ + + for (unsigned i = 0; i < rectCount; i++) { + unsigned x1 = pRects[i].rect.offset.x; + unsigned y1 = pRects[i].rect.offset.y; + unsigned x2 = x1 + pRects[i].rect.extent.width - 1; + unsigned y2 = y1 + pRects[i].rect.extent.height - 1; + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); + tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); + tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); + + for (unsigned j = 0; j < attachmentCount; j++) { + uint32_t a; + unsigned clear_mask = 0; + if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + clear_mask = 0xf; + a = subpass->color_attachments[pAttachments[j].colorAttachment].attachment; + } else { + a = subpass->depth_stencil_attachment.attachment; + if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + clear_mask |= 1; + if (pAttachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + clear_mask |= 2; + } + + if (a == VK_ATTACHMENT_UNUSED) + continue; + + VkFormat fmt = cmd->state.pass->attachments[a].format; + const struct tu_native_format *format = tu6_get_native_format(fmt); + assert(format && format->rb >= 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(clear_mask)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); + tu_cs_emit(cs, 0); + + uint32_t clear_vals[4] = { 0 }; + tu_pack_clear_value(&pAttachments[j].clearValue, fmt, clear_vals); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); + tu_cs_emit(cs, clear_vals[0]); + tu_cs_emit(cs, clear_vals[1]); + tu_cs_emit(cs, clear_vals[2]); + tu_cs_emit(cs, clear_vals[3]); + + tu6_emit_event_write(cmd, cs, BLIT, false); + } + } } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_meta_copy.c mesa-20.0.8/src/freedreno/vulkan/tu_meta_copy.c --- mesa-19.2.8/src/freedreno/vulkan/tu_meta_copy.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_meta_copy.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,579 +30,115 @@ #include "vk_format.h" #include "tu_cs.h" - -/* - * TODO: - * - image -> image copies - * - 3D textures - * - compressed image formats (need to divide offset/extent) - */ - -static uint32_t -blit_control(enum a6xx_color_fmt fmt) -{ - unsigned blit_cntl = 0xf00000; - blit_cntl |= A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt); - blit_cntl |= A6XX_RB_2D_BLIT_CNTL_IFMT(tu6_rb_fmt_to_ifmt(fmt)); - return blit_cntl; -} - -static uint32_t tu6_sp_2d_src_format(VkFormat format) -{ - const struct vk_format_description *desc = vk_format_description(format); - uint32_t reg = 0xf000 | A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(tu6_get_native_format(format)->rb); - - int channel = vk_format_get_first_non_void_channel(format); - if (channel < 0) { - /* TODO special format. */ - return reg; - } - if (desc->channel[channel].normalized) { - if (desc->channel[channel].type == VK_FORMAT_TYPE_SIGNED) - reg |= A6XX_SP_2D_SRC_FORMAT_SINT; - reg |= A6XX_SP_2D_SRC_FORMAT_NORM; - } else if (desc->channel[channel].pure_integer) { - if (desc->channel[channel].type == VK_FORMAT_TYPE_SIGNED) - reg |= A6XX_SP_2D_SRC_FORMAT_SINT; - else - reg |= A6XX_SP_2D_SRC_FORMAT_UINT; - } - return reg; -} - -static void -tu_dma_prepare(struct tu_cmd_buffer *cmdbuf) -{ - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 10); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1); - tu_cs_emit(&cmdbuf->cs, PC_CCU_INVALIDATE_COLOR); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1); - tu_cs_emit(&cmdbuf->cs, LRZ_FLUSH); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(&cmdbuf->cs, 0x0); - - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_CCU_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, 0x10000000); -} - -static void -tu_copy_buffer(struct tu_cmd_buffer *cmdbuf, - struct tu_bo *src_bo, - uint64_t src_offset, - struct tu_bo *dst_bo, - uint64_t dst_offset, - uint64_t size) -{ - const unsigned max_size_per_iter = 0x4000 - 0x40; - const unsigned max_iterations = - (size + max_size_per_iter) / max_size_per_iter; - - tu_bo_list_add(&cmdbuf->bo_list, src_bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_bo, MSM_SUBMIT_BO_WRITE); - - tu_dma_prepare(cmdbuf); - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 21 + 48 * max_iterations); - - /* buffer copy setup */ - tu_cs_emit_pkt7(&cmdbuf->cs, CP_SET_MARKER, 1); - tu_cs_emit(&cmdbuf->cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - - const uint32_t blit_cntl = blit_control(RB6_R8_UNORM) | 0x20000000; - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - for (; size;) { - uint64_t src_va = src_bo->iova + src_offset; - uint64_t dst_va = dst_bo->iova + dst_offset; - - unsigned src_shift = src_va & 0x3f; - unsigned dst_shift = dst_va & 0x3f; - unsigned max_shift = MAX2(src_shift, dst_shift); - - src_va -= src_shift; - dst_va -= dst_shift; - - uint32_t size_todo = MIN2(0x4000 - max_shift, size); - unsigned pitch = (size_todo + max_shift + 63) & ~63; - - /* - * Emit source: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_PS_2D_SRC_INFO, 13); - tu_cs_emit(&cmdbuf->cs, - A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(RB6_R8_UNORM) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | 0x500000); - tu_cs_emit(&cmdbuf->cs, - A6XX_SP_PS_2D_SRC_SIZE_WIDTH(src_shift + size_todo) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(1)); /* SP_PS_2D_SRC_SIZE */ - tu_cs_emit_qw(&cmdbuf->cs, src_va); - tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(pitch)); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - /* - * Emit destination: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_DST_INFO, 9); - tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(RB6_R8_UNORM) | - A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - tu_cs_emit_qw(&cmdbuf->cs, dst_va); - - tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_SIZE_PITCH(pitch)); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - /* - * Blit command: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_X_X(src_shift)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_SRC_BR_X_X(src_shift + size_todo - 1)); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_Y_Y(0)); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_BR_Y_Y(0)); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_DST_TL, 2); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_TL_X(dst_shift) | A6XX_GRAS_2D_DST_TL_Y(0)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_BR_X(dst_shift + size_todo - 1) | - A6XX_GRAS_2D_DST_BR_Y(0)); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1); - tu_cs_emit(&cmdbuf->cs, 0x3f); - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8C01, 1); - tu_cs_emit(&cmdbuf->cs, 0); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_2D_SRC_FORMAT, 1); - tu_cs_emit(&cmdbuf->cs, 0xf180); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0x01000000); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_BLIT, 1); - tu_cs_emit(&cmdbuf->cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0); - - src_offset += size_todo; - dst_offset += size_todo; - size -= size_todo; - } - - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_FLUSH_TS, true); -} +#include "tu_blit.h" static void -tu_copy_buffer_to_image_step(struct tu_cmd_buffer *cmdbuf, - struct tu_buffer *src_buffer, - struct tu_image *dst_image, - const VkBufferImageCopy *copy_info, - VkFormat format, - uint32_t layer, - uint64_t src_va) +tu_copy_buffer(struct tu_cmd_buffer *cmd, + struct tu_buffer *src, + struct tu_buffer *dst, + const VkBufferCopy *region) +{ + tu_bo_list_add(&cmd->bo_list, src->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmd->bo_list, dst->bo, MSM_SUBMIT_BO_WRITE); + + tu_blit(cmd, &(struct tu_blit) { + .dst = { + .fmt = VK_FORMAT_R8_UNORM, + .va = tu_buffer_iova(dst) + region->dstOffset, + .width = region->size, + .height = 1, + .samples = 1, + }, + .src = { + .fmt = VK_FORMAT_R8_UNORM, + .va = tu_buffer_iova(src) + region->srcOffset, + .width = region->size, + .height = 1, + .samples = 1, + }, + .layers = 1, + .type = TU_BLIT_COPY, + .buffer = true, + }); +} + +static struct tu_blit_surf +tu_blit_buffer(struct tu_buffer *buffer, + VkFormat format, + const VkBufferImageCopy *info) { - const enum a6xx_color_fmt rb_fmt = tu6_get_native_format(format)->rb; + if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) + format = VK_FORMAT_R8_UNORM; - uint64_t dst_va = dst_image->bo->iova + dst_image->bo_offset + dst_image->layer_size * layer + dst_image->levels[copy_info->imageSubresource.mipLevel].offset; - unsigned dst_pitch = dst_image->levels[copy_info->imageSubresource.mipLevel].pitch * + unsigned pitch = (info->bufferRowLength ?: info->imageExtent.width) * vk_format_get_blocksize(format); - unsigned src_pitch; - unsigned src_offset = 0; - if (copy_info->imageExtent.height == 1) { - /* Can't find this in the spec, but not having it is sort of insane? */ - assert(src_va % vk_format_get_blocksize(format) == 0); - - src_offset = (src_va & 63) / vk_format_get_blocksize(format); - src_va &= ~63; - - src_pitch = align((src_offset + copy_info->imageExtent.width) * vk_format_get_blocksize(format), 64); - } else { - unsigned src_pixel_stride = copy_info->bufferRowLength - ? copy_info->bufferRowLength - : copy_info->imageExtent.width; - src_pitch = src_pixel_stride * vk_format_get_blocksize(format); - assert(!(src_pitch & 63)); - assert(!(src_va & 63)); - } - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 48); - - /* - * Emit source: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_PS_2D_SRC_INFO, 13); - tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(rb_fmt) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | - 0x500000); - tu_cs_emit(&cmdbuf->cs, - A6XX_SP_PS_2D_SRC_SIZE_WIDTH(src_offset + copy_info->imageExtent.width) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT( - copy_info->imageExtent.height)); /* SP_PS_2D_SRC_SIZE */ - tu_cs_emit_qw(&cmdbuf->cs, src_va); - tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(src_pitch)); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - /* - * Emit destination: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_DST_INFO, 9); - tu_cs_emit(&cmdbuf->cs, - A6XX_RB_2D_DST_INFO_COLOR_FORMAT(rb_fmt) | - A6XX_RB_2D_DST_INFO_TILE_MODE(dst_image->tile_mode) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - tu_cs_emit_qw(&cmdbuf->cs, dst_va); - tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_SIZE_PITCH(dst_pitch)); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_X_X(src_offset)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_SRC_BR_X_X(src_offset + copy_info->imageExtent.width - 1)); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_Y_Y(0)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_SRC_BR_Y_Y(copy_info->imageExtent.height - 1)); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_DST_TL, 2); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_TL_X(copy_info->imageOffset.x) | - A6XX_GRAS_2D_DST_TL_Y(copy_info->imageOffset.y)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_BR_X(copy_info->imageOffset.x + - copy_info->imageExtent.width - 1) | - A6XX_GRAS_2D_DST_BR_Y(copy_info->imageOffset.y + - copy_info->imageExtent.height - 1)); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1); - tu_cs_emit(&cmdbuf->cs, 0x3f); - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8C01, 1); - tu_cs_emit(&cmdbuf->cs, 0); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_2D_SRC_FORMAT, 1); - tu_cs_emit(&cmdbuf->cs, tu6_sp_2d_src_format(format)); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0x01000000); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_BLIT, 1); - tu_cs_emit(&cmdbuf->cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0); + return (struct tu_blit_surf) { + .fmt = format, + .tile_mode = TILE6_LINEAR, + .va = tu_buffer_iova(buffer) + info->bufferOffset, + .pitch = pitch, + .layer_size = (info->bufferImageHeight ?: info->imageExtent.height) * pitch / vk_format_get_blockwidth(format) / vk_format_get_blockheight(format), + .width = info->imageExtent.width, + .height = info->imageExtent.height, + .samples = 1, + }; } static void tu_copy_buffer_to_image(struct tu_cmd_buffer *cmdbuf, struct tu_buffer *src_buffer, struct tu_image *dst_image, - const VkBufferImageCopy *copy_info) + const VkBufferImageCopy *info) { - tu_bo_list_add(&cmdbuf->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - - /* general setup */ - tu_dma_prepare(cmdbuf); - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 6); - - /* buffer copy setup */ - tu_cs_emit_pkt7(&cmdbuf->cs, CP_SET_MARKER, 1); - tu_cs_emit(&cmdbuf->cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - - VkFormat format = dst_image->vk_format; - const enum a6xx_color_fmt rb_fmt = tu6_get_native_format(format)->rb; - - const uint32_t blit_cntl = blit_control(rb_fmt) | 0x20000000; - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - unsigned src_pixel_stride = copy_info->bufferRowLength - ? copy_info->bufferRowLength - : copy_info->imageExtent.width; - unsigned cpp = vk_format_get_blocksize(format); - unsigned src_pitch = src_pixel_stride * cpp; - - for (unsigned layer_offset = 0; layer_offset < copy_info->imageSubresource.layerCount; ++layer_offset) { - unsigned layer = copy_info->imageSubresource.baseArrayLayer + layer_offset; - uint64_t src_va = src_buffer->bo->iova + src_buffer->bo_offset + copy_info->bufferOffset + layer_offset * copy_info->bufferImageHeight * src_pitch; - - if ((src_pitch & 63) || (src_va & 63)) { - /* Do a per line copy */ - VkBufferImageCopy line_copy_info = *copy_info; - line_copy_info.imageExtent.height = 1; - for (unsigned r = 0; r < copy_info->imageExtent.height; ++r) { - /* - * if src_va is not aligned the line copy will need to adjust. Give it - * room to do so. - */ - unsigned max_width = 16384 - (src_va & 0x3f) ? 64 : 0; - line_copy_info.imageOffset.x = copy_info->imageOffset.x; - line_copy_info.imageExtent.width = copy_info->imageExtent.width; - - for (unsigned c = 0; c < copy_info->imageExtent.width; c += max_width) { - tu_copy_buffer_to_image_step(cmdbuf, src_buffer, dst_image, &line_copy_info, format, layer, src_va + c * cpp); - - line_copy_info.imageOffset.x += max_width; - line_copy_info.imageExtent.width -= max_width; - } - - line_copy_info.imageOffset.y++; - src_va += src_pitch; - } - } else { - tu_copy_buffer_to_image_step(cmdbuf, src_buffer, dst_image, copy_info, format, layer, src_va); - } + if (info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && + vk_format_get_blocksize(dst_image->vk_format) == 4) { + tu_finishme("aspect mask\n"); + return; } - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 15); - - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_FLUSH_TS, true); -} - -static void -tu_copy_image_to_buffer_step(struct tu_cmd_buffer *cmdbuf, - struct tu_image *src_image, - struct tu_buffer *dst_buffer, - const VkBufferImageCopy *copy_info, - VkFormat format, - uint32_t layer, - uint64_t dst_va) -{ - const enum a6xx_color_fmt rb_fmt = tu6_get_native_format(format)->rb; - - uint64_t src_va = src_image->bo->iova + src_image->bo_offset + src_image->layer_size * layer + src_image->levels[copy_info->imageSubresource.mipLevel].offset; - unsigned src_pitch = src_image->levels[copy_info->imageSubresource.mipLevel].pitch * - vk_format_get_blocksize(format); - - unsigned dst_pitch; - unsigned dst_offset = 0; - if (copy_info->imageExtent.height == 1) { - /* Can't find this in the spec, but not having it is sort of insane? */ - assert(dst_va % vk_format_get_blocksize(format) == 0); - - dst_offset = (dst_va & 63) / vk_format_get_blocksize(format); - dst_va &= ~63; - - dst_pitch = align((dst_offset + copy_info->imageExtent.width) * vk_format_get_blocksize(format), 64); - } else { - unsigned dst_pixel_stride = copy_info->bufferRowLength - ? copy_info->bufferRowLength - : copy_info->imageExtent.width; - dst_pitch = dst_pixel_stride * vk_format_get_blocksize(format); - assert(!(dst_pitch & 63)); - assert(!(dst_va & 63)); - } - - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 48); - - /* - * Emit source: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_PS_2D_SRC_INFO, 13); - tu_cs_emit(&cmdbuf->cs, - A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(rb_fmt) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(src_image->tile_mode) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | 0x500000); - tu_cs_emit(&cmdbuf->cs, - A6XX_SP_PS_2D_SRC_SIZE_WIDTH(src_image->extent.width) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT( - src_image->extent.height)); /* SP_PS_2D_SRC_SIZE */ - tu_cs_emit_qw(&cmdbuf->cs, src_va); - tu_cs_emit(&cmdbuf->cs, A6XX_SP_PS_2D_SRC_PITCH_PITCH(src_pitch)); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - /* - * Emit destination: - */ - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_DST_INFO, 9); - tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(rb_fmt) | - A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - tu_cs_emit_qw(&cmdbuf->cs, dst_va); - tu_cs_emit(&cmdbuf->cs, A6XX_RB_2D_DST_SIZE_PITCH(dst_pitch)); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - tu_cs_emit(&cmdbuf->cs, 0x00000000); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_X_X(copy_info->imageOffset.x)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_SRC_BR_X_X(copy_info->imageOffset.x + - copy_info->imageExtent.width - 1)); - tu_cs_emit(&cmdbuf->cs, A6XX_GRAS_2D_SRC_TL_Y_Y(copy_info->imageOffset.y)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_SRC_BR_Y_Y(copy_info->imageOffset.y + - copy_info->imageExtent.height - 1)); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_DST_TL, 2); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_TL_X(dst_offset) | A6XX_GRAS_2D_DST_TL_Y(0)); - tu_cs_emit(&cmdbuf->cs, - A6XX_GRAS_2D_DST_BR_X(dst_offset + copy_info->imageExtent.width - 1) | - A6XX_GRAS_2D_DST_BR_Y(copy_info->imageExtent.height - 1)); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_EVENT_WRITE, 1); - tu_cs_emit(&cmdbuf->cs, 0x3f); - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8C01, 1); - tu_cs_emit(&cmdbuf->cs, 0); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_SP_2D_SRC_FORMAT, 1); - tu_cs_emit(&cmdbuf->cs, tu6_sp_2d_src_format(format)); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0x01000000); - - tu_cs_emit_pkt7(&cmdbuf->cs, CP_BLIT, 1); - tu_cs_emit(&cmdbuf->cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - tu_cs_emit_wfi(&cmdbuf->cs); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_UNKNOWN_8E04, 1); - tu_cs_emit(&cmdbuf->cs, 0); + tu_blit(cmdbuf, &(struct tu_blit) { + .dst = tu_blit_surf_ext(dst_image, info->imageSubresource, info->imageOffset, info->imageExtent), + .src = tu_blit_buffer(src_buffer, dst_image->vk_format, info), + .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount), + .type = TU_BLIT_COPY, + }); } static void tu_copy_image_to_buffer(struct tu_cmd_buffer *cmdbuf, struct tu_image *src_image, struct tu_buffer *dst_buffer, - const VkBufferImageCopy *copy_info) + const VkBufferImageCopy *info) { - tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); - tu_bo_list_add(&cmdbuf->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE); - - /* general setup */ - tu_dma_prepare(cmdbuf); - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 6); - - /* buffer copy setup */ - tu_cs_emit_pkt7(&cmdbuf->cs, CP_SET_MARKER, 1); - tu_cs_emit(&cmdbuf->cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - - VkFormat format = src_image->vk_format; - const enum a6xx_color_fmt rb_fmt = tu6_get_native_format(format)->rb; - - unsigned dst_pixel_stride = copy_info->bufferRowLength - ? copy_info->bufferRowLength - : copy_info->imageExtent.width; - unsigned cpp = vk_format_get_blocksize(format); - unsigned dst_pitch = dst_pixel_stride * cpp; - - - const uint32_t blit_cntl = blit_control(rb_fmt) | 0x20000000; - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - tu_cs_emit_pkt4(&cmdbuf->cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - tu_cs_emit(&cmdbuf->cs, blit_cntl); - - for (unsigned layer_offset = 0; layer_offset < copy_info->imageSubresource.layerCount; ++layer_offset) { - unsigned layer = copy_info->imageSubresource.baseArrayLayer + layer_offset; - uint64_t dst_va = dst_buffer->bo->iova + dst_buffer->bo_offset + copy_info->bufferOffset + layer_offset * copy_info->bufferImageHeight * dst_pitch; - - if ((dst_pitch & 63) || (dst_va & 63)) { - /* Do a per line copy */ - VkBufferImageCopy line_copy_info = *copy_info; - line_copy_info.imageExtent.height = 1; - for (unsigned r = 0; r < copy_info->imageExtent.height; ++r) { - /* - * if dst_va is not aligned the line copy will need to adjust. Give it - * room to do so. - */ - unsigned max_width = 16384 - (dst_va & 0x3f) ? 64 : 0; - line_copy_info.imageOffset.x = copy_info->imageOffset.x; - line_copy_info.imageExtent.width = copy_info->imageExtent.width; - - for (unsigned c = 0; c < copy_info->imageExtent.width; c += max_width) { - tu_copy_image_to_buffer_step(cmdbuf, src_image, dst_buffer, &line_copy_info, format, layer, dst_va + c * cpp); - - line_copy_info.imageOffset.x += max_width; - line_copy_info.imageExtent.width -= max_width; - } - - line_copy_info.imageOffset.y++; - dst_va += dst_pitch; - } - } else { - tu_copy_image_to_buffer_step(cmdbuf, src_image, dst_buffer, copy_info, format, layer, dst_va); - } - } - - tu_cs_reserve_space(cmdbuf->device, &cmdbuf->cs, 15); + tu_blit(cmdbuf, &(struct tu_blit) { + .dst = tu_blit_buffer(dst_buffer, src_image->vk_format, info), + .src = tu_blit_surf_ext(src_image, info->imageSubresource, info->imageOffset, info->imageExtent), + .layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount), + .type = TU_BLIT_COPY, + }); +} - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, 0x1d, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, FACENESS_FLUSH, true); - tu6_emit_event_write(cmdbuf, &cmdbuf->cs, CACHE_FLUSH_TS, true); +static void +tu_copy_image_to_image(struct tu_cmd_buffer *cmdbuf, + struct tu_image *src_image, + struct tu_image *dst_image, + const VkImageCopy *info) +{ + if ((info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && + vk_format_get_blocksize(dst_image->vk_format) == 4) || + (info->srcSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT && + vk_format_get_blocksize(src_image->vk_format) == 4)) { + tu_finishme("aspect mask\n"); + return; + } + + tu_blit(cmdbuf, &(struct tu_blit) { + .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent), + .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent), + .layers = info->extent.depth, + .type = TU_BLIT_COPY, + }); } void @@ -616,13 +152,8 @@ TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer); - for (unsigned i = 0; i < regionCount; ++i) { - uint64_t src_offset = src_buffer->bo_offset + pRegions[i].srcOffset; - uint64_t dst_offset = dst_buffer->bo_offset + pRegions[i].dstOffset; - - tu_copy_buffer(cmdbuf, src_buffer->bo, src_offset, dst_buffer->bo, - dst_offset, pRegions[i].size); - } + for (unsigned i = 0; i < regionCount; ++i) + tu_copy_buffer(cmdbuf, src_buffer, dst_buffer, &pRegions[i]); } void @@ -633,14 +164,15 @@ uint32_t regionCount, const VkBufferImageCopy *pRegions) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - TU_FROM_HANDLE(tu_image, dest_image, destImage); + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_image, dst_image, destImage); TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer); - for (unsigned i = 0; i < regionCount; ++i) { - tu_copy_buffer_to_image(cmd_buffer, src_buffer, dest_image, - pRegions + i); - } + tu_bo_list_add(&cmdbuf->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (unsigned i = 0; i < regionCount; ++i) + tu_copy_buffer_to_image(cmdbuf, src_buffer, dst_image, pRegions + i); } void @@ -651,25 +183,15 @@ uint32_t regionCount, const VkBufferImageCopy *pRegions) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); TU_FROM_HANDLE(tu_image, src_image, srcImage); TU_FROM_HANDLE(tu_buffer, dst_buffer, destBuffer); - for (unsigned i = 0; i < regionCount; ++i) { - tu_copy_image_to_buffer(cmd_buffer, src_image, dst_buffer, - pRegions + i); - } -} + tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmdbuf->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE); -static void -meta_copy_image(struct tu_cmd_buffer *cmd_buffer, - struct tu_image *src_image, - VkImageLayout src_image_layout, - struct tu_image *dest_image, - VkImageLayout dest_image_layout, - uint32_t regionCount, - const VkImageCopy *pRegions) -{ + for (unsigned i = 0; i < regionCount; ++i) + tu_copy_image_to_buffer(cmdbuf, src_image, dst_buffer, pRegions + i); } void @@ -681,10 +203,13 @@ uint32_t regionCount, const VkImageCopy *pRegions) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); TU_FROM_HANDLE(tu_image, src_image, srcImage); - TU_FROM_HANDLE(tu_image, dest_image, destImage); + TU_FROM_HANDLE(tu_image, dst_image, destImage); + + tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); - meta_copy_image(cmd_buffer, src_image, srcImageLayout, dest_image, - destImageLayout, regionCount, pRegions); + for (uint32_t i = 0; i < regionCount; ++i) + tu_copy_image_to_image(cmdbuf, src_image, dst_image, pRegions + i); } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_meta_resolve.c mesa-20.0.8/src/freedreno/vulkan/tu_meta_resolve.c --- mesa-19.2.8/src/freedreno/vulkan/tu_meta_resolve.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_meta_resolve.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,6 +29,23 @@ #include "nir/nir_builder.h" #include "vk_format.h" +#include "tu_blit.h" + +static void +tu_resolve_image(struct tu_cmd_buffer *cmdbuf, + struct tu_image *src_image, + struct tu_image *dst_image, + const VkImageResolve *info) +{ + assert(info->dstSubresource.layerCount == info->srcSubresource.layerCount); + + tu_blit(cmdbuf, &(struct tu_blit) { + .dst = tu_blit_surf_ext(dst_image, info->dstSubresource, info->dstOffset, info->extent), + .src = tu_blit_surf_ext(src_image, info->srcSubresource, info->srcOffset, info->extent), + .layers = MAX2(info->extent.depth, info->dstSubresource.layerCount) + }); +} + void tu_CmdResolveImage(VkCommandBuffer cmd_buffer_h, VkImage src_image_h, @@ -38,4 +55,13 @@ uint32_t region_count, const VkImageResolve *regions) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, cmd_buffer_h); + TU_FROM_HANDLE(tu_image, src_image, src_image_h); + TU_FROM_HANDLE(tu_image, dst_image, dest_image_h); + + tu_bo_list_add(&cmdbuf->bo_list, src_image->bo, MSM_SUBMIT_BO_READ); + tu_bo_list_add(&cmdbuf->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE); + + for (uint32_t i = 0; i < region_count; ++i) + tu_resolve_image(cmdbuf, src_image, dst_image, regions + i); } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_pass.c mesa-20.0.8/src/freedreno/vulkan/tu_pass.c --- mesa-19.2.8/src/freedreno/vulkan/tu_pass.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_pass.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,6 +27,62 @@ #include "tu_private.h" #include "vk_util.h" +#include "vk_format.h" + +static void update_samples(struct tu_subpass *subpass, + VkSampleCountFlagBits samples) +{ + assert(subpass->samples == 0 || subpass->samples == samples); + subpass->samples = samples; +} + +#define GMEM_ALIGN 0x4000 + +static void +compute_gmem_offsets(struct tu_render_pass *pass, uint32_t gmem_size) +{ + /* calculate total bytes per pixel */ + uint32_t cpp_total = 0; + for (uint32_t i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + if (att->gmem_offset >= 0) + cpp_total += att->cpp; + } + + /* no gmem attachments */ + if (cpp_total == 0) { + /* any value non-zero value so tiling config works with no attachments */ + pass->gmem_pixels = 1024*1024; + return; + } + + /* TODO: this algorithm isn't optimal + * for example, two attachments with cpp = {1, 4} + * result: nblocks = {12, 52}, pixels = 196608 + * optimal: nblocks = {13, 51}, pixels = 208896 + */ + uint32_t gmem_blocks = gmem_size / GMEM_ALIGN; + uint32_t offset = 0, pixels = ~0u; + for (uint32_t i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + if (att->gmem_offset < 0) + continue; + + att->gmem_offset = offset; + + /* Note: divide by 16 is for GMEM_ALIGN=16k, tile align w=64/h=16 */ + uint32_t align = MAX2(1, att->cpp / 16); + uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); + + gmem_blocks -= nblocks; + cpp_total -= att->cpp; + offset += nblocks * GMEM_ALIGN; + pixels = MIN2(pixels, nblocks * GMEM_ALIGN / att->cpp); + } + + pass->gmem_pixels = pixels; + assert(pixels); +} VkResult tu_CreateRenderPass(VkDevice _device, @@ -38,7 +94,6 @@ struct tu_render_pass *pass; size_t size; size_t attachments_offset; - VkRenderPassMultiviewCreateInfo *multiview_info = NULL; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); @@ -57,29 +112,21 @@ pass->subpass_count = pCreateInfo->subpassCount; pass->attachments = (void *) pass + attachments_offset; - vk_foreach_struct(ext, pCreateInfo->pNext) - { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO: - multiview_info = (VkRenderPassMultiviewCreateInfo *) ext; - break; - default: - break; - } - } - for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; att->format = pCreateInfo->pAttachments[i].format; - att->samples = pCreateInfo->pAttachments[i].samples; + att->cpp = vk_format_get_blocksize(att->format) * + pCreateInfo->pAttachments[i].samples; att->load_op = pCreateInfo->pAttachments[i].loadOp; att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; - att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; - att->final_layout = pCreateInfo->pAttachments[i].finalLayout; - // att->store_op = pCreateInfo->pAttachments[i].storeOp; - // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; + att->store_op = pCreateInfo->pAttachments[i].storeOp; + if (pCreateInfo->pAttachments[i].stencilStoreOp == VK_ATTACHMENT_STORE_OP_STORE && + vk_format_has_stencil(att->format)) + att->store_op = VK_ATTACHMENT_STORE_OP_STORE; + att->gmem_offset = -1; } + uint32_t subpass_attachment_count = 0; struct tu_subpass_attachment *p; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { @@ -87,8 +134,7 @@ subpass_attachment_count += desc->inputAttachmentCount + desc->colorAttachmentCount + - (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) + - (desc->pDepthStencilAttachment != NULL); + (desc->pResolveAttachments ? desc->colorAttachmentCount : 0); } if (subpass_attachment_count) { @@ -106,26 +152,21 @@ p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; - uint32_t color_sample_count = 1, depth_sample_count = 1; struct tu_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; - if (multiview_info) - subpass->view_mask = multiview_info->pViewMasks[i]; + subpass->samples = 0; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; p += desc->inputAttachmentCount; for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { - subpass->input_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pInputAttachments[j].attachment, - .layout = desc->pInputAttachments[j].layout, - }; - if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED) - pass->attachments[desc->pInputAttachments[j].attachment] - .view_mask |= subpass->view_mask; + uint32_t a = desc->pInputAttachments[j].attachment; + subpass->input_attachments[j].attachment = a; + if (a != VK_ATTACHMENT_UNUSED) + pass->attachments[a].gmem_offset = 0; } } @@ -134,100 +175,55 @@ p += desc->colorAttachmentCount; for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - subpass->color_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pColorAttachments[j].attachment, - .layout = desc->pColorAttachments[j].layout, - }; - if (desc->pColorAttachments[j].attachment != - VK_ATTACHMENT_UNUSED) { - pass->attachments[desc->pColorAttachments[j].attachment] - .view_mask |= subpass->view_mask; - color_sample_count = - pCreateInfo - ->pAttachments[desc->pColorAttachments[j].attachment] - .samples; + uint32_t a = desc->pColorAttachments[j].attachment; + subpass->color_attachments[j].attachment = a; + + if (a != VK_ATTACHMENT_UNUSED) { + pass->attachments[a].gmem_offset = 0; + update_samples(subpass, pCreateInfo->pAttachments[a].samples); } } } - subpass->has_resolve = false; + subpass->resolve_attachments = desc->pResolveAttachments ? p : NULL; if (desc->pResolveAttachments) { - subpass->resolve_attachments = p; p += desc->colorAttachmentCount; - for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - uint32_t a = desc->pResolveAttachments[j].attachment; - subpass->resolve_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pResolveAttachments[j].attachment, - .layout = desc->pResolveAttachments[j].layout, - }; - if (a != VK_ATTACHMENT_UNUSED) { - subpass->has_resolve = true; - pass->attachments[desc->pResolveAttachments[j].attachment] - .view_mask |= subpass->view_mask; - } + subpass->resolve_attachments[j].attachment = + desc->pResolveAttachments[j].attachment; } } - if (desc->pDepthStencilAttachment) { - subpass->depth_stencil_attachment = (struct tu_subpass_attachment) { - .attachment = desc->pDepthStencilAttachment->attachment, - .layout = desc->pDepthStencilAttachment->layout, - }; - if (desc->pDepthStencilAttachment->attachment != - VK_ATTACHMENT_UNUSED) { - pass->attachments[desc->pDepthStencilAttachment->attachment] - .view_mask |= subpass->view_mask; - depth_sample_count = - pCreateInfo - ->pAttachments[desc->pDepthStencilAttachment->attachment] - .samples; - } - } else { - subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; + uint32_t a = desc->pDepthStencilAttachment ? + desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED; + subpass->depth_stencil_attachment.attachment = a; + if (a != VK_ATTACHMENT_UNUSED) { + pass->attachments[a].gmem_offset = 0; + update_samples(subpass, pCreateInfo->pAttachments[a].samples); } - subpass->max_sample_count = - MAX2(color_sample_count, depth_sample_count); - } - - for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { - uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass; - if (dst == VK_SUBPASS_EXTERNAL) { - pass->end_barrier.src_stage_mask = - pCreateInfo->pDependencies[i].srcStageMask; - pass->end_barrier.src_access_mask = - pCreateInfo->pDependencies[i].srcAccessMask; - pass->end_barrier.dst_access_mask = - pCreateInfo->pDependencies[i].dstAccessMask; - } else { - pass->subpasses[dst].start_barrier.src_stage_mask = - pCreateInfo->pDependencies[i].srcStageMask; - pass->subpasses[dst].start_barrier.src_access_mask = - pCreateInfo->pDependencies[i].srcAccessMask; - pass->subpasses[dst].start_barrier.dst_access_mask = - pCreateInfo->pDependencies[i].dstAccessMask; - } + subpass->samples = subpass->samples ?: 1; } *pRenderPass = tu_render_pass_to_handle(pass); + compute_gmem_offsets(pass, device->physical_device->gmem_size); + return VK_SUCCESS; } VkResult -tu_CreateRenderPass2KHR(VkDevice _device, - const VkRenderPassCreateInfo2KHR *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkRenderPass *pRenderPass) +tu_CreateRenderPass2(VkDevice _device, + const VkRenderPassCreateInfo2KHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass) { TU_FROM_HANDLE(tu_device, device, _device); struct tu_render_pass *pass; size_t size; size_t attachments_offset; - assert(pCreateInfo->sType == - VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR); size = sizeof(*pass); size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); @@ -248,13 +244,16 @@ struct tu_render_pass_attachment *att = &pass->attachments[i]; att->format = pCreateInfo->pAttachments[i].format; - att->samples = pCreateInfo->pAttachments[i].samples; + att->cpp = vk_format_get_blocksize(att->format) * + pCreateInfo->pAttachments[i].samples; att->load_op = pCreateInfo->pAttachments[i].loadOp; att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; - att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; - att->final_layout = pCreateInfo->pAttachments[i].finalLayout; - // att->store_op = pCreateInfo->pAttachments[i].storeOp; - // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; + att->store_op = pCreateInfo->pAttachments[i].storeOp; + att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; + if (pCreateInfo->pAttachments[i].stencilStoreOp == VK_ATTACHMENT_STORE_OP_STORE && + vk_format_has_stencil(att->format)) + att->store_op = VK_ATTACHMENT_STORE_OP_STORE; + att->gmem_offset = -1; } uint32_t subpass_attachment_count = 0; struct tu_subpass_attachment *p; @@ -263,8 +262,7 @@ subpass_attachment_count += desc->inputAttachmentCount + desc->colorAttachmentCount + - (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) + - (desc->pDepthStencilAttachment != NULL); + (desc->pResolveAttachments ? desc->colorAttachmentCount : 0); } if (subpass_attachment_count) { @@ -282,25 +280,21 @@ p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i]; - uint32_t color_sample_count = 1, depth_sample_count = 1; struct tu_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; - subpass->view_mask = desc->viewMask; + subpass->samples = 0; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; p += desc->inputAttachmentCount; for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { - subpass->input_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pInputAttachments[j].attachment, - .layout = desc->pInputAttachments[j].layout, - }; - if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED) - pass->attachments[desc->pInputAttachments[j].attachment] - .view_mask |= subpass->view_mask; + uint32_t a = desc->pInputAttachments[j].attachment; + subpass->input_attachments[j].attachment = a; + if (a != VK_ATTACHMENT_UNUSED) + pass->attachments[a].gmem_offset = 0; } } @@ -309,84 +303,41 @@ p += desc->colorAttachmentCount; for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - subpass->color_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pColorAttachments[j].attachment, - .layout = desc->pColorAttachments[j].layout, - }; - if (desc->pColorAttachments[j].attachment != - VK_ATTACHMENT_UNUSED) { - pass->attachments[desc->pColorAttachments[j].attachment] - .view_mask |= subpass->view_mask; - color_sample_count = - pCreateInfo - ->pAttachments[desc->pColorAttachments[j].attachment] - .samples; + uint32_t a = desc->pColorAttachments[j].attachment; + subpass->color_attachments[j].attachment = a; + + if (a != VK_ATTACHMENT_UNUSED) { + pass->attachments[a].gmem_offset = 0; + update_samples(subpass, pCreateInfo->pAttachments[a].samples); } } } - subpass->has_resolve = false; + subpass->resolve_attachments = desc->pResolveAttachments ? p : NULL; if (desc->pResolveAttachments) { - subpass->resolve_attachments = p; p += desc->colorAttachmentCount; - for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - uint32_t a = desc->pResolveAttachments[j].attachment; - subpass->resolve_attachments[j] = (struct tu_subpass_attachment) { - .attachment = desc->pResolveAttachments[j].attachment, - .layout = desc->pResolveAttachments[j].layout, - }; - if (a != VK_ATTACHMENT_UNUSED) { - subpass->has_resolve = true; - pass->attachments[desc->pResolveAttachments[j].attachment] - .view_mask |= subpass->view_mask; - } + subpass->resolve_attachments[j].attachment = + desc->pResolveAttachments[j].attachment; } } - if (desc->pDepthStencilAttachment) { - subpass->depth_stencil_attachment = (struct tu_subpass_attachment) { - .attachment = desc->pDepthStencilAttachment->attachment, - .layout = desc->pDepthStencilAttachment->layout, - }; - if (desc->pDepthStencilAttachment->attachment != - VK_ATTACHMENT_UNUSED) { - pass->attachments[desc->pDepthStencilAttachment->attachment] - .view_mask |= subpass->view_mask; - depth_sample_count = - pCreateInfo - ->pAttachments[desc->pDepthStencilAttachment->attachment] - .samples; - } - } else { - subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; - } - subpass->max_sample_count = - MAX2(color_sample_count, depth_sample_count); - } - - for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { - uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass; - if (dst == VK_SUBPASS_EXTERNAL) { - pass->end_barrier.src_stage_mask = - pCreateInfo->pDependencies[i].srcStageMask; - pass->end_barrier.src_access_mask = - pCreateInfo->pDependencies[i].srcAccessMask; - pass->end_barrier.dst_access_mask = - pCreateInfo->pDependencies[i].dstAccessMask; - } else { - pass->subpasses[dst].start_barrier.src_stage_mask = - pCreateInfo->pDependencies[i].srcStageMask; - pass->subpasses[dst].start_barrier.src_access_mask = - pCreateInfo->pDependencies[i].srcAccessMask; - pass->subpasses[dst].start_barrier.dst_access_mask = - pCreateInfo->pDependencies[i].dstAccessMask; + uint32_t a = desc->pDepthStencilAttachment ? + desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED; + subpass->depth_stencil_attachment.attachment = a; + if (a != VK_ATTACHMENT_UNUSED) { + pass->attachments[a].gmem_offset = 0; + update_samples(subpass, pCreateInfo->pAttachments[a].samples); } + + subpass->samples = subpass->samples ?: 1; } *pRenderPass = tu_render_pass_to_handle(pass); + compute_gmem_offsets(pass, device->physical_device->gmem_size); + return VK_SUCCESS; } @@ -400,6 +351,7 @@ if (!_pass) return; + vk_free2(&device->alloc, pAllocator, pass->subpass_attachments); vk_free2(&device->alloc, pAllocator, pass); } diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_pipeline.c mesa-20.0.8/src/freedreno/vulkan/tu_pipeline.c --- mesa-19.2.8/src/freedreno/vulkan/tu_pipeline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_pipeline.c 2020-06-12 01:21:16.000000000 +0000 @@ -43,6 +43,7 @@ { struct tu_device *device; struct tu_pipeline_cache *cache; + struct tu_pipeline_layout *layout; const VkAllocationCallbacks *alloc; const VkGraphicsPipelineCreateInfo *create_info; @@ -177,7 +178,7 @@ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: return DI_PT_TRILIST; case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: - return DI_PT_TRILIST; + return DI_PT_TRISTRIP; case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: return DI_PT_TRIFAN; case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: @@ -357,19 +358,31 @@ } } +static unsigned +tu_shader_nibo(const struct tu_shader *shader) +{ + /* Don't use ir3_shader_nibo(), because that would include declared but + * unused storage images and SSBOs. + */ + return shader->ssbo_map.num_desc + shader->image_map.num_desc; +} + static void -tu6_emit_vs_config(struct tu_cs *cs, const struct ir3_shader_variant *vs) +tu6_emit_vs_config(struct tu_cs *cs, struct tu_shader *shader, + const struct ir3_shader_variant *vs) { uint32_t sp_vs_ctrl = A6XX_SP_VS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | A6XX_SP_VS_CTRL_REG0_MERGEDREGS | A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack); - if (vs->num_samp) + if (vs->need_pixlod) sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_PIXLODENABLE; + if (vs->need_fine_derivatives) + sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_DIFF_FINE; - uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(vs->num_samp) | - A6XX_SP_VS_CONFIG_NSAMP(vs->num_samp); + uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(shader->texture_map.num_desc) | + A6XX_SP_VS_CONFIG_NSAMP(shader->sampler_map.num_desc); if (vs->instrlen) sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED; @@ -381,11 +394,13 @@ tu_cs_emit(cs, vs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1); - tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(vs->constlen, 4)) | 0x100); + tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(vs->constlen, 4)) | + A6XX_HLSQ_VS_CNTL_ENABLED); } static void -tu6_emit_hs_config(struct tu_cs *cs, const struct ir3_shader_variant *hs) +tu6_emit_hs_config(struct tu_cs *cs, struct tu_shader *shader, + const struct ir3_shader_variant *hs) { uint32_t sp_hs_config = 0; if (hs->instrlen) @@ -403,7 +418,8 @@ } static void -tu6_emit_ds_config(struct tu_cs *cs, const struct ir3_shader_variant *ds) +tu6_emit_ds_config(struct tu_cs *cs, struct tu_shader *shader, + const struct ir3_shader_variant *ds) { uint32_t sp_ds_config = 0; if (ds->instrlen) @@ -418,7 +434,8 @@ } static void -tu6_emit_gs_config(struct tu_cs *cs, const struct ir3_shader_variant *gs) +tu6_emit_gs_config(struct tu_cs *cs, struct tu_shader *shader, + const struct ir3_shader_variant *gs) { uint32_t sp_gs_config = 0; if (gs->instrlen) @@ -436,26 +453,27 @@ } static void -tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs) +tu6_emit_fs_config(struct tu_cs *cs, struct tu_shader *shader, + const struct ir3_shader_variant *fs) { uint32_t sp_fs_ctrl = A6XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | 0x1000000 | A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | A6XX_SP_FS_CTRL_REG0_MERGEDREGS | A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack); - if (fs->total_in > 0 || fs->frag_coord) + if (fs->total_in > 0) sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_VARYING; - if (fs->num_samp > 0) + if (fs->need_pixlod) sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_PIXLODENABLE; + if (fs->need_fine_derivatives) + sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_DIFF_FINE; - uint32_t sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(fs->num_samp) | - A6XX_SP_FS_CONFIG_NSAMP(fs->num_samp); + uint32_t sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(shader->texture_map.num_desc) | + A6XX_SP_FS_CONFIG_NSAMP(shader->sampler_map.num_desc) | + A6XX_SP_FS_CONFIG_NIBO(tu_shader_nibo(shader)); if (fs->instrlen) sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A99E, 1); - tu_cs_emit(cs, 0x7fc0); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A9A8, 1); tu_cs_emit(cs, 0); @@ -470,7 +488,58 @@ tu_cs_emit(cs, fs->instrlen); tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1); - tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) | 0x100); + tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) | + A6XX_HLSQ_FS_CNTL_ENABLED); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_IBO_COUNT, 1); + tu_cs_emit(cs, tu_shader_nibo(shader)); +} + +static void +tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, + const struct ir3_shader_variant *v) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1); + tu_cs_emit(cs, 0xff); + + unsigned constlen = align(v->constlen, 4); + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) | + A6XX_HLSQ_CS_CNTL_ENABLED); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2); + tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED | + A6XX_SP_CS_CONFIG_NIBO(tu_shader_nibo(shader)) | + A6XX_SP_CS_CONFIG_NTEX(shader->texture_map.num_desc) | + A6XX_SP_CS_CONFIG_NSAMP(shader->sampler_map.num_desc)); + tu_cs_emit(cs, v->instrlen); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1); + tu_cs_emit(cs, A6XX_SP_CS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(v->info.max_reg + 1) | + A6XX_SP_CS_CTRL_REG0_MERGEDREGS | + A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) | + COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE) | + COND(v->need_fine_derivatives, A6XX_SP_CS_CTRL_REG0_DIFF_FINE)); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); + tu_cs_emit(cs, 0x41); + + uint32_t local_invocation_id = + ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); + uint32_t work_group_id = + ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID); + + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); + tu_cs_emit(cs, + A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + tu_cs_emit(cs, 0x2fc); /* HLSQ_CS_UNKNOWN_B998 */ + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1); + tu_cs_emit(cs, tu_shader_nibo(shader)); } static void @@ -478,7 +547,7 @@ const struct ir3_shader_variant *vs) { const uint32_t vertexid_regid = - ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); const uint32_t instanceid_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); @@ -523,9 +592,11 @@ ir3_find_output_regid(vs, VARYING_SLOT_POS); const uint32_t pointsize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); - uint32_t pointsize_loc = 0xff; - if (position_regid != regid(63, 0)) + uint32_t pointsize_loc = 0xff, position_loc = 0xff; + if (position_regid != regid(63, 0)) { + position_loc = linkage.max_loc; ir3_link_add(&linkage, position_regid, 0xf, linkage.max_loc); + } if (pointsize_regid != regid(63, 0)) { pointsize_loc = linkage.max_loc; ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc); @@ -559,7 +630,7 @@ 0xff00ff00); tu_cs_emit_pkt4(cs, REG_A6XX_VPC_PACK, 1); - tu_cs_emit(cs, A6XX_VPC_PACK_NUMNONPOSVAR(fs->total_in) | + tu_cs_emit(cs, A6XX_VPC_PACK_POSITIONLOC(position_loc) | A6XX_VPC_PACK_PSIZELOC(pointsize_loc) | A6XX_VPC_PACK_STRIDE_IN_VPC(linkage.max_loc)); @@ -675,46 +746,63 @@ } static void -tu6_emit_fs_system_values(struct tu_cs *cs, - const struct ir3_shader_variant *fs) +tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) { - const uint32_t frontfacing_regid = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); - const uint32_t sampleid_regid = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); - const uint32_t samplemaskin_regid = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); - const uint32_t fragcoord_xy_regid = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); - const uint32_t fragcoord_zw_regid = (fragcoord_xy_regid != regid(63, 0)) - ? (fragcoord_xy_regid + 2) - : fragcoord_xy_regid; - const uint32_t varyingcoord_regid = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PIXEL); + uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; + uint32_t ij_pix_regid, ij_samp_regid, ij_cent_regid, ij_size_regid; + uint32_t smask_in_regid; + + bool sample_shading = fs->per_samp; /* TODO | key->sample_shading; */ + bool enable_varyings = fs->total_in > 0; + + samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); + smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); + face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); + ij_pix_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); + ij_samp_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE); + ij_cent_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID); + ij_size_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE); + + if (fs->num_sampler_prefetch > 0) { + assert(VALIDREG(ij_pix_regid)); + /* also, it seems like ij_pix is *required* to be r0.x */ + assert(ij_pix_regid == regid(0, 0)); + } + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); + tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | + A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | + 0x7000); // XXX); + for (int i = 0; i < fs->num_sampler_prefetch; i++) { + const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; + tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | + A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | + A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | + A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | + A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | + COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | + A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); + } tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); tu_cs_emit(cs, 0x7); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(frontfacing_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(sampleid_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samplemaskin_regid) | - A6XX_HLSQ_CONTROL_2_REG_SIZE(regid(63, 0))); - tu_cs_emit(cs, - A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(varyingcoord_regid) | - A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(regid(63, 0)) | - 0xfc00fc00); - tu_cs_emit(cs, - A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(fragcoord_xy_regid) | - A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(fragcoord_zw_regid) | - A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(regid(63, 0)) | - 0x0000fc00); + tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | + A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_size_regid)); + tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(ij_pix_regid) | + A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(ij_cent_regid) | + 0xfc00fc00); + tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | + A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | + A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(ij_samp_regid) | + 0x0000fc00); tu_cs_emit(cs, 0xfc); -} -static void -tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) -{ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UNKNOWN_B980, 1); - tu_cs_emit(cs, fs->total_in > 0 ? 3 : 1); + tu_cs_emit(cs, enable_varyings ? 3 : 1); tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A982, 1); tu_cs_emit(cs, 0); /* XXX */ @@ -722,33 +810,41 @@ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1); tu_cs_emit(cs, 0xff); /* XXX */ - uint32_t gras_cntl = 0; - if (fs->total_in > 0) - gras_cntl |= A6XX_GRAS_CNTL_VARYING; - if (fs->frag_coord) { - gras_cntl |= A6XX_GRAS_CNTL_SIZE | A6XX_GRAS_CNTL_XCOORD | - A6XX_GRAS_CNTL_YCOORD | A6XX_GRAS_CNTL_ZCOORD | - A6XX_GRAS_CNTL_WCOORD; - } - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); - tu_cs_emit(cs, gras_cntl); - - uint32_t rb_render_control = 0; - if (fs->total_in > 0) { - rb_render_control = - A6XX_RB_RENDER_CONTROL0_VARYING | A6XX_RB_RENDER_CONTROL0_UNK10; - } - if (fs->frag_coord) { - rb_render_control |= - A6XX_RB_RENDER_CONTROL0_SIZE | A6XX_RB_RENDER_CONTROL0_XCOORD | - A6XX_RB_RENDER_CONTROL0_YCOORD | A6XX_RB_RENDER_CONTROL0_ZCOORD | - A6XX_RB_RENDER_CONTROL0_WCOORD; - } + tu_cs_emit(cs, + CONDREG(ij_pix_regid, A6XX_GRAS_CNTL_VARYING) | + CONDREG(ij_cent_regid, A6XX_GRAS_CNTL_CENTROID) | + CONDREG(ij_samp_regid, A6XX_GRAS_CNTL_PERSAMP_VARYING) | + COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_GRAS_CNTL_SIZE) | + COND(VALIDREG(ij_size_regid) && sample_shading, A6XX_GRAS_CNTL_SIZE_PERSAMP) | + COND(fs->frag_coord, + A6XX_GRAS_CNTL_SIZE | + A6XX_GRAS_CNTL_XCOORD | + A6XX_GRAS_CNTL_YCOORD | + A6XX_GRAS_CNTL_ZCOORD | + A6XX_GRAS_CNTL_WCOORD) | + COND(fs->frag_face, A6XX_GRAS_CNTL_SIZE)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); - tu_cs_emit(cs, rb_render_control); - tu_cs_emit(cs, (fs->frag_face ? A6XX_RB_RENDER_CONTROL1_FACENESS : 0)); + tu_cs_emit(cs, + CONDREG(ij_pix_regid, A6XX_RB_RENDER_CONTROL0_VARYING) | + CONDREG(ij_cent_regid, A6XX_RB_RENDER_CONTROL0_CENTROID) | + CONDREG(ij_samp_regid, A6XX_RB_RENDER_CONTROL0_PERSAMP_VARYING) | + COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | + COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE) | + COND(VALIDREG(ij_size_regid) && sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | + COND(fs->frag_coord, + A6XX_RB_RENDER_CONTROL0_SIZE | + A6XX_RB_RENDER_CONTROL0_XCOORD | + A6XX_RB_RENDER_CONTROL0_YCOORD | + A6XX_RB_RENDER_CONTROL0_ZCOORD | + A6XX_RB_RENDER_CONTROL0_WCOORD) | + COND(fs->frag_face, A6XX_RB_RENDER_CONTROL0_SIZE)); + tu_cs_emit(cs, + CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | + CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | + CONDREG(ij_size_regid, A6XX_RB_RENDER_CONTROL1_SIZE) | + COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); } static void @@ -756,8 +852,11 @@ const struct ir3_shader_variant *fs, uint32_t mrt_count) { - const uint32_t fragdepth_regid = - ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); + uint32_t smask_regid, posz_regid; + + posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); + smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); + uint32_t fragdata_regid[8]; if (fs->color0_mrt) { fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR); @@ -769,8 +868,9 @@ } tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); - tu_cs_emit( - cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(fragdepth_regid) | 0xfcfc0000); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | + A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | + 0xfc000000); tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); @@ -782,12 +882,13 @@ } tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); - tu_cs_emit(cs, fs->writes_pos ? A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z : 0); + tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | + COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK)); tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count)); uint32_t gras_su_depth_plane_cntl = 0; uint32_t rb_depth_plane_cntl = 0; - if (fs->no_earlyz | fs->writes_pos) { + if (fs->no_earlyz || fs->writes_pos) { gras_su_depth_plane_cntl |= A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z; rb_depth_plane_cntl |= A6XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z; } @@ -886,6 +987,43 @@ } static void +tu6_emit_immediates(struct tu_cs *cs, const struct ir3_shader_variant *v, + uint32_t opcode, enum a6xx_state_block block) +{ + /* dummy variant */ + if (!v->shader) + return; + + const struct ir3_const_state *const_state = &v->shader->const_state; + uint32_t base = const_state->offsets.immediate; + int size = const_state->immediates_count; + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + if (size <= 0) + return; + + tu_cs_emit_pkt7(cs, opcode, 3 + size * 4); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(block) | + CP_LOAD_STATE6_0_NUM_UNIT(size)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + + for (unsigned i = 0; i < size; i++) { + tu_cs_emit(cs, const_state->immediates[i].val[0]); + tu_cs_emit(cs, const_state->immediates[i].val[1]); + tu_cs_emit(cs, const_state->immediates[i].val[2]); + tu_cs_emit(cs, const_state->immediates[i].val[3]); + } +} + +static void tu6_emit_program(struct tu_cs *cs, const struct tu_pipeline_builder *builder, const struct tu_bo *binary_bo, @@ -919,24 +1057,27 @@ fs = &dummy_variant; } - tu6_emit_vs_config(cs, vs); - tu6_emit_hs_config(cs, hs); - tu6_emit_ds_config(cs, ds); - tu6_emit_gs_config(cs, gs); - tu6_emit_fs_config(cs, fs); + tu6_emit_vs_config(cs, builder->shaders[MESA_SHADER_VERTEX], vs); + tu6_emit_hs_config(cs, builder->shaders[MESA_SHADER_TESS_CTRL], hs); + tu6_emit_ds_config(cs, builder->shaders[MESA_SHADER_TESS_EVAL], ds); + tu6_emit_gs_config(cs, builder->shaders[MESA_SHADER_GEOMETRY], gs); + tu6_emit_fs_config(cs, builder->shaders[MESA_SHADER_FRAGMENT], fs); tu6_emit_vs_system_values(cs, vs); tu6_emit_vpc(cs, vs, fs, binning_pass); tu6_emit_vpc_varying_modes(cs, fs, binning_pass); - tu6_emit_fs_system_values(cs, fs); tu6_emit_fs_inputs(cs, fs); tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count); tu6_emit_shader_object(cs, MESA_SHADER_VERTEX, vs, binary_bo, - builder->shader_offsets[MESA_SHADER_VERTEX]); + binning_pass ? builder->binning_vs_offset : builder->shader_offsets[MESA_SHADER_VERTEX]); tu6_emit_shader_object(cs, MESA_SHADER_FRAGMENT, fs, binary_bo, builder->shader_offsets[MESA_SHADER_FRAGMENT]); + + tu6_emit_immediates(cs, vs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER); + if (!binning_pass) + tu6_emit_immediates(cs, fs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER); } static void @@ -950,9 +1091,7 @@ { uint32_t vfd_decode_idx = 0; - /* why do we go beyond inputs_count? */ - assert(vs->inputs_count + 1 <= MAX_VERTEX_ATTRIBS); - for (uint32_t i = 0; i <= vs->inputs_count; i++) { + for (uint32_t i = 0; i < vs->inputs_count; i++) { if (vs->inputs[i].sysval || !vs->inputs[i].compmask) continue; @@ -993,6 +1132,7 @@ offsets[vfd_decode_idx] = vi_attr->offset; vfd_decode_idx++; + assert(vfd_decode_idx <= MAX_VERTEX_ATTRIBS); } tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 1); @@ -1045,12 +1185,12 @@ guardband_adj.height = tu6_guardband_adj(max.y - min.y); tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET_0, 6); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XOFFSET_0(offsets[0])); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XSCALE_0(scales[0])); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YOFFSET_0(offsets[1])); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YSCALE_0(scales[1])); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZOFFSET_0(offsets[2])); - tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZSCALE_0(scales[2])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XOFFSET_0(offsets[0]).value); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XSCALE_0(scales[0]).value); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YOFFSET_0(offsets[1]).value); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YSCALE_0(scales[1]).value); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZOFFSET_0(offsets[2]).value); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZSCALE_0(scales[2]).value); tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2); tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(min.x) | @@ -1087,7 +1227,7 @@ tu_cs_emit(cs, 0x80); tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8001, 1); tu_cs_emit(cs, 0x0); - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8004, 1); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LAYER_CNTL, 1); tu_cs_emit(cs, 0x0); } @@ -1097,7 +1237,7 @@ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POINT_MINMAX, 2); tu_cs_emit(cs, A6XX_GRAS_SU_POINT_MINMAX_MIN(1.0f / 16.0f) | A6XX_GRAS_SU_POINT_MINMAX_MAX(4092.0f)); - tu_cs_emit(cs, A6XX_GRAS_SU_POINT_SIZE(1.0f)); + tu_cs_emit(cs, A6XX_GRAS_SU_POINT_SIZE(1.0f).value); } static uint32_t @@ -1144,9 +1284,9 @@ float slope_factor) { tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor)); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor)); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp)); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value); } static void @@ -1368,13 +1508,12 @@ } static VkResult -tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder, - struct tu_pipeline **out_pipeline) +tu_pipeline_create(struct tu_device *dev, + const VkAllocationCallbacks *pAllocator, + struct tu_pipeline **out_pipeline) { - struct tu_device *dev = builder->device; - struct tu_pipeline *pipeline = - vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8, + vk_zalloc2(&dev->alloc, pAllocator, sizeof(*pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!pipeline) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1384,7 +1523,7 @@ /* reserve the space now such that tu_cs_begin_sub_stream never fails */ VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048); if (result != VK_SUCCESS) { - vk_free2(&dev->alloc, builder->alloc, pipeline); + vk_free2(&dev->alloc, pAllocator, pipeline); return result; } @@ -1417,7 +1556,8 @@ continue; struct tu_shader *shader = - tu_shader_create(builder->device, stage, stage_info, builder->alloc); + tu_shader_create(builder->device, stage, stage_info, builder->layout, + builder->alloc); if (!shader) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1495,6 +1635,21 @@ } static void +tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, + struct tu_shader *shader, + struct ir3_shader_variant *v) +{ + link->ubo_state = v->shader->ubo_state; + link->const_state = v->shader->const_state; + link->constlen = v->constlen; + link->texture_map = shader->texture_map; + link->sampler_map = shader->sampler_map; + link->ubo_map = shader->ubo_map; + link->ssbo_map = shader->ssbo_map; + link->image_map = shader->image_map; +} + +static void tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) { @@ -1507,6 +1662,15 @@ tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true); pipeline->program.binning_state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); + + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + if (!builder->shaders[i]) + continue; + + tu_pipeline_set_linkage(&pipeline->program.link[i], + builder->shaders[i], + &builder->shaders[i]->variants[0]); + } } static void @@ -1723,7 +1887,8 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, struct tu_pipeline **pipeline) { - VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline); + VkResult result = tu_pipeline_create(builder->device, builder->alloc, + pipeline); if (result != VK_SUCCESS) return result; @@ -1774,11 +1939,14 @@ const VkGraphicsPipelineCreateInfo *create_info, const VkAllocationCallbacks *alloc) { + TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout); + *builder = (struct tu_pipeline_builder) { .device = dev, .cache = cache, .create_info = create_info, .alloc = alloc, + .layout = layout, }; builder->rasterizer_discard = @@ -1797,8 +1965,9 @@ builder->use_depth_stencil_attachment = subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED; - assert(subpass->color_count == - create_info->pColorBlendState->attachmentCount); + assert(subpass->color_count == 0 || + !create_info->pColorBlendState || + subpass->color_count == create_info->pColorBlendState->attachmentCount); builder->color_attachment_count = subpass->color_count; for (uint32_t i = 0; i < subpass->color_count; i++) { const uint32_t a = subpass->color_attachments[i].attachment; @@ -1811,6 +1980,32 @@ } } +static VkResult +tu_graphics_pipeline_create(VkDevice device, + VkPipelineCache pipelineCache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + TU_FROM_HANDLE(tu_device, dev, device); + TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache); + + struct tu_pipeline_builder builder; + tu_pipeline_builder_init_graphics(&builder, dev, cache, + pCreateInfo, pAllocator); + + struct tu_pipeline *pipeline = NULL; + VkResult result = tu_pipeline_builder_build(&builder, &pipeline); + tu_pipeline_builder_finish(&builder); + + if (result == VK_SUCCESS) + *pPipeline = tu_pipeline_to_handle(pipeline); + else + *pPipeline = VK_NULL_HANDLE; + + return result; +} + VkResult tu_CreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, @@ -1819,65 +2014,145 @@ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { - TU_FROM_HANDLE(tu_device, dev, device); - TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache); + VkResult final_result = VK_SUCCESS; for (uint32_t i = 0; i < count; i++) { - struct tu_pipeline_builder builder; - tu_pipeline_builder_init_graphics(&builder, dev, cache, - &pCreateInfos[i], pAllocator); - - struct tu_pipeline *pipeline; - VkResult result = tu_pipeline_builder_build(&builder, &pipeline); - tu_pipeline_builder_finish(&builder); - - if (result != VK_SUCCESS) { - for (uint32_t j = 0; j < i; j++) { - tu_DestroyPipeline(device, pPipelines[j], pAllocator); - pPipelines[j] = VK_NULL_HANDLE; - } + VkResult result = tu_graphics_pipeline_create(device, pipelineCache, + &pCreateInfos[i], pAllocator, + &pPipelines[i]); - return result; - } - - pPipelines[i] = tu_pipeline_to_handle(pipeline); + if (result != VK_SUCCESS) + final_result = result; } + return final_result; +} + +static void +tu6_emit_compute_program(struct tu_cs *cs, + struct tu_shader *shader, + const struct tu_bo *binary_bo) +{ + const struct ir3_shader_variant *v = &shader->variants[0]; + + tu6_emit_cs_config(cs, shader, v); + + /* The compute program is the only one in the pipeline, so 0 offset. */ + tu6_emit_shader_object(cs, MESA_SHADER_COMPUTE, v, binary_bo, 0); + + tu6_emit_immediates(cs, v, CP_LOAD_STATE6_FRAG, SB6_CS_SHADER); +} + +static VkResult +tu_compute_upload_shader(VkDevice device, + struct tu_pipeline *pipeline, + struct tu_shader *shader) +{ + TU_FROM_HANDLE(tu_device, dev, device); + struct tu_bo *bo = &pipeline->program.binary_bo; + struct ir3_shader_variant *v = &shader->variants[0]; + + uint32_t shader_size = sizeof(uint32_t) * v->info.sizedwords; + VkResult result = + tu_bo_init_new(dev, bo, shader_size); + if (result != VK_SUCCESS) + return result; + + result = tu_bo_map(dev, bo); + if (result != VK_SUCCESS) + return result; + + memcpy(bo->map, shader->binary, shader_size); + return VK_SUCCESS; } + static VkResult -tu_compute_pipeline_create(VkDevice _device, +tu_compute_pipeline_create(VkDevice device, VkPipelineCache _cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) { + TU_FROM_HANDLE(tu_device, dev, device); + TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout); + const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage; + VkResult result; + + struct tu_pipeline *pipeline; + + *pPipeline = VK_NULL_HANDLE; + + result = tu_pipeline_create(dev, pAllocator, &pipeline); + if (result != VK_SUCCESS) + return result; + + pipeline->layout = layout; + + struct tu_shader_compile_options options; + tu_shader_compile_options_init(&options, NULL); + + struct tu_shader *shader = + tu_shader_create(dev, MESA_SHADER_COMPUTE, stage_info, layout, pAllocator); + if (!shader) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + result = tu_shader_compile(dev, shader, NULL, &options, pAllocator); + if (result != VK_SUCCESS) + goto fail; + + struct ir3_shader_variant *v = &shader->variants[0]; + + tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE], + shader, v); + + result = tu_compute_upload_shader(device, pipeline, shader); + if (result != VK_SUCCESS) + goto fail; + + for (int i = 0; i < 3; i++) + pipeline->compute.local_size[i] = v->shader->nir->info.cs.local_size[i]; + + struct tu_cs prog_cs; + tu_cs_begin_sub_stream(dev, &pipeline->cs, 512, &prog_cs); + tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo); + pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); + + *pPipeline = tu_pipeline_to_handle(pipeline); return VK_SUCCESS; + +fail: + if (shader) + tu_shader_destroy(dev, shader, pAllocator); + + tu_pipeline_finish(pipeline, dev, pAllocator); + vk_free2(&dev->alloc, pAllocator, pipeline); + + return result; } VkResult -tu_CreateComputePipelines(VkDevice _device, +tu_CreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { - VkResult result = VK_SUCCESS; + VkResult final_result = VK_SUCCESS; - unsigned i = 0; - for (; i < count; i++) { - VkResult r; - r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], - pAllocator, &pPipelines[i]); - if (r != VK_SUCCESS) { - result = r; - pPipelines[i] = VK_NULL_HANDLE; - } + for (uint32_t i = 0; i < count; i++) { + VkResult result = tu_compute_pipeline_create(device, pipelineCache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + if (result != VK_SUCCESS) + final_result = result; } - return result; + return final_result; } void diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_private.h mesa-20.0.8/src/freedreno/vulkan/tu_private.h --- mesa-19.2.8/src/freedreno/vulkan/tu_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_private.h 2020-06-12 01:21:16.000000000 +0000 @@ -40,11 +40,10 @@ #include #define VG(x) x #else -#define VG(x) +#define VG(x) ((void)0) #endif #include "c11/threads.h" -#include "compiler/shader_enums.h" #include "main/macros.h" #include "util/list.h" #include "util/macros.h" @@ -59,6 +58,7 @@ #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" #include "a6xx.xml.h" +#include "fdl/freedreno_layout.h" #include "tu_descriptor_set.h" #include "tu_extensions.h" @@ -94,6 +94,8 @@ #define NUM_META_FS_KEYS 13 #define TU_MAX_DRM_DEVICES 8 #define MAX_VIEWS 8 +/* The Qualcomm driver exposes 0x20000058 */ +#define MAX_STORAGE_BUFFER_RANGE 0x20000000 #define NUM_DEPTH_CLEAR_PIPELINES 3 @@ -103,6 +105,9 @@ */ #define TU_BUFFER_OPS_CS_THRESHOLD 4096 +#define A6XX_TEX_CONST_DWORDS 16 +#define A6XX_TEX_SAMP_DWORDS 4 + enum tu_mem_heap { TU_MEM_HEAP_VRAM, @@ -211,6 +216,8 @@ memcpy((dest), (src), (count) * sizeof(*(src))); \ }) +#define COND(bool, val) ((bool) ? (val) : 0) + /* Whenever we generate an error, pass it through this function. Useful for * debugging, where we can break on it. Only call at error site, not when * propagating errors. Might be useful to plug in a stack trace here. @@ -322,6 +329,7 @@ TU_DEBUG_STARTUP = 1 << 0, TU_DEBUG_NIR = 1 << 1, TU_DEBUG_IR3 = 1 << 2, + TU_DEBUG_NOBIN = 1 << 3, }; struct tu_instance @@ -415,6 +423,7 @@ struct tu_fence { + struct wsi_fence *fence_wsi; bool signaled; int fd; }; @@ -501,6 +510,11 @@ uint32_t offset; }; +struct ts_cs_memory { + uint32_t *map; + uint64_t iova; +}; + enum tu_cs_mode { @@ -581,6 +595,8 @@ uint64_t va; uint32_t *mapped_ptr; struct tu_descriptor_range *dynamic_descriptors; + + struct tu_bo *descriptors[0]; }; struct tu_push_descriptor_set @@ -598,7 +614,7 @@ struct tu_descriptor_pool { - uint8_t *mapped_ptr; + struct tu_bo bo; uint64_t current_offset; uint64_t size; @@ -656,6 +672,12 @@ VkDeviceSize bo_offset; }; +static inline uint64_t +tu_buffer_iova(struct tu_buffer *buffer) +{ + return buffer->bo->iova + buffer->bo_offset; +} + enum tu_dynamic_state_bits { TU_DYNAMIC_VIEWPORT = 1 << 0, @@ -753,27 +775,13 @@ const char * tu_get_perftest_option_name(int id); -/** - * Attachment state when recording a renderpass instance. - * - * The clear value is valid only if there exists a pending clear. - */ -struct tu_attachment_state -{ - VkImageAspectFlags pending_clear_aspects; - uint32_t cleared_views; - VkClearValue clear_value; - VkImageLayout current_layout; -}; - struct tu_descriptor_state { struct tu_descriptor_set *sets[MAX_SETS]; - uint32_t dirty; uint32_t valid; struct tu_push_descriptor_set push_set; bool push_dirty; - uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; + uint64_t dynamic_buffers[MAX_DYNAMIC_BUFFERS]; }; struct tu_tile @@ -787,16 +795,12 @@ struct tu_tiling_config { VkRect2D render_area; - uint32_t buffer_cpp[MAX_RTS + 2]; - uint32_t buffer_count; /* position and size of the first tile */ VkRect2D tile0; /* number of tiles */ VkExtent2D tile_count; - uint32_t gmem_offsets[MAX_RTS + 2]; - /* size of the first VSC pipe */ VkExtent2D pipe0; /* number of VSC pipes */ @@ -810,7 +814,10 @@ enum tu_cmd_dirty_bits { TU_CMD_DIRTY_PIPELINE = 1 << 0, - TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1, + TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1, + TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2, + TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3, + TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 4, TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, @@ -823,6 +830,7 @@ uint32_t dirty; struct tu_pipeline *pipeline; + struct tu_pipeline *compute_pipeline; /* Vertex buffers */ struct @@ -843,7 +851,6 @@ const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; - struct tu_attachment_state *attachments; struct tu_tiling_config tiling_config; @@ -915,7 +922,7 @@ struct tu_vertex_binding vertex_bindings[MAX_VBS]; uint32_t queue_family_index; - uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; + uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; VkShaderStageFlags push_constant_stages; struct tu_descriptor_set meta_push_descriptors; @@ -928,18 +935,40 @@ struct tu_bo_list bo_list; struct tu_cs cs; struct tu_cs draw_cs; - struct tu_cs tile_cs; + struct tu_cs draw_epilogue_cs; + struct tu_cs sub_cs; uint16_t marker_reg; uint32_t marker_seqno; struct tu_bo scratch_bo; uint32_t scratch_seqno; +#define VSC_OVERFLOW 0x8 +#define VSC_SCRATCH 0x10 + + struct tu_bo vsc_data; + struct tu_bo vsc_data2; + uint32_t vsc_data_pitch; + uint32_t vsc_data2_pitch; + bool use_vsc_data; bool wait_for_idle; }; -void +/* Temporary struct for tracking a register state to be written, used by + * a6xx-pack.h and tu_cs_emit_regs() + */ +struct tu_reg_value { + uint32_t reg; + uint64_t value; + bool is_address; + struct tu_bo *bo; + bool bo_write; + uint32_t bo_offset; + uint32_t bo_shift; +}; + +unsigned tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum vgt_event_type event, @@ -950,6 +979,13 @@ struct tu_device_memory *memory, int *pFD); +static inline struct tu_descriptor_state * +tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point) +{ + return &cmd_buffer->descriptors[bind_point]; +} + /* * Takes x,y,z as exact numbers of invocations, instead of blocks. * @@ -965,7 +1001,7 @@ struct tu_event { - uint64_t *map; + struct tu_bo bo; }; struct tu_shader_module; @@ -1016,10 +1052,27 @@ bool include_binning_pass; }; +struct tu_descriptor_map +{ + /* TODO: avoid fixed size array/justify the size */ + unsigned num; /* number of array entries */ + unsigned num_desc; /* Number of descriptors (sum of array_size[]) */ + int set[64]; + int binding[64]; + int value[64]; + int array_size[64]; +}; + struct tu_shader { struct ir3_shader ir3_shader; + struct tu_descriptor_map texture_map; + struct tu_descriptor_map sampler_map; + struct tu_descriptor_map ubo_map; + struct tu_descriptor_map ssbo_map; + struct tu_descriptor_map image_map; + /* This may be true for vertex shaders. When true, variants[1] is the * binning variant and binning_binary is non-NULL. */ @@ -1035,6 +1088,7 @@ tu_shader_create(struct tu_device *dev, gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, + struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc); void @@ -1054,6 +1108,20 @@ const struct tu_shader_compile_options *options, const VkAllocationCallbacks *alloc); +struct tu_program_descriptor_linkage +{ + struct ir3_ubo_analysis_state ubo_state; + struct ir3_const_state const_state; + + uint32_t constlen; + + struct tu_descriptor_map texture_map; + struct tu_descriptor_map sampler_map; + struct tu_descriptor_map ubo_map; + struct tu_descriptor_map ssbo_map; + struct tu_descriptor_map image_map; +}; + struct tu_pipeline { struct tu_cs cs; @@ -1070,6 +1138,8 @@ struct tu_bo binary_bo; struct tu_cs_entry state_ib; struct tu_cs_entry binning_state_ib; + + struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; } program; struct @@ -1114,6 +1184,11 @@ { struct tu_cs_entry state_ib; } blend; + + struct + { + uint32_t local_size[3]; + } compute; }; void @@ -1180,11 +1255,19 @@ const struct tu_native_format * tu6_get_native_format(VkFormat format); -int +void tu_pack_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]); + +void +tu_2d_clear_color(const VkClearColorValue *val, VkFormat format, uint32_t buf[4]); + +void +tu_2d_clear_zs(const VkClearDepthStencilValue *val, VkFormat format, uint32_t buf[4]); + enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt); +enum a6xx_depth_format tu6_pipe2depth(VkFormat format); struct tu_image_level { @@ -1207,14 +1290,12 @@ VkExtent3D extent; uint32_t level_count; uint32_t layer_count; + VkSampleCountFlagBits samples; + - VkDeviceSize size; uint32_t alignment; - /* memory layout */ - VkDeviceSize layer_size; - struct tu_image_level levels[15]; - unsigned tile_mode; + struct fdl_layout layout; unsigned queue_family_mask; bool exclusive; @@ -1224,7 +1305,7 @@ VkDeviceMemory owned_memory; /* Set when bound */ - const struct tu_bo *bo; + struct tu_bo *bo; VkDeviceSize bo_offset; }; @@ -1251,6 +1332,72 @@ : range->levelCount; } +static inline VkDeviceSize +tu_layer_size(struct tu_image *image, int level) +{ + return fdl_layer_stride(&image->layout, level); +} + +static inline uint32_t +tu_image_stride(struct tu_image *image, int level) +{ + return image->layout.slices[level].pitch * image->layout.cpp; +} + +static inline uint64_t +tu_image_base(struct tu_image *image, int level, int layer) +{ + return image->bo->iova + image->bo_offset + + fdl_surface_offset(&image->layout, level, layer); +} + +#define tu_image_base_ref(image, level, layer) \ + .bo = image->bo, \ + .bo_offset = (image->bo_offset + fdl_surface_offset(&image->layout, \ + level, layer)) + +#define tu_image_view_base_ref(iview) \ + tu_image_base_ref(iview->image, iview->base_mip, iview->base_layer) + +static inline VkDeviceSize +tu_image_ubwc_size(struct tu_image *image, int level) +{ + return image->layout.ubwc_size; +} + +static inline uint32_t +tu_image_ubwc_pitch(struct tu_image *image, int level) +{ + return image->layout.ubwc_slices[level].pitch; +} + +static inline uint64_t +tu_image_ubwc_surface_offset(struct tu_image *image, int level, int layer) +{ + return image->layout.ubwc_slices[level].offset + + layer * tu_image_ubwc_size(image, level); +} + +static inline uint64_t +tu_image_ubwc_base(struct tu_image *image, int level, int layer) +{ + return image->bo->iova + image->bo_offset + + tu_image_ubwc_surface_offset(image, level, layer); +} + +#define tu_image_ubwc_base_ref(image, level, layer) \ + .bo = image->bo, \ + .bo_offset = (image->bo_offset + tu_image_ubwc_surface_offset(image, \ + level, layer)) + +#define tu_image_view_ubwc_base_ref(iview) \ + tu_image_ubwc_base_ref(iview->image, iview->base_mip, iview->base_layer) + +enum a6xx_tile_mode +tu6_get_image_tile_mode(struct tu_image *image, int level); +enum a3xx_msaa_samples +tu_msaa_samples(uint32_t samples); + struct tu_image_view { struct tu_image *image; /**< VkImageViewCreateInfo::image */ @@ -1264,30 +1411,28 @@ uint32_t level_count; VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ - uint32_t descriptor[16]; + uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; /* Descriptor for use as a storage image as opposed to a sampled image. * This has a few differences for cube maps (e.g. type). */ - uint32_t storage_descriptor[16]; + uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS]; }; struct tu_sampler { -}; + uint32_t state[A6XX_TEX_SAMP_DWORDS]; -struct tu_image_create_info -{ - const VkImageCreateInfo *vk_info; - bool scanout; - bool no_metadata_planes; + bool needs_border; + VkBorderColor border; }; VkResult tu_image_create(VkDevice _device, - const struct tu_image_create_info *info, + const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *alloc, - VkImage *pImage); + VkImage *pImage, + uint64_t modifier); VkResult tu_image_from_gralloc(VkDevice device_h, @@ -1303,9 +1448,9 @@ struct tu_buffer_view { - VkFormat vk_format; - uint64_t range; /**< VkBufferViewCreateInfo::range */ - uint32_t state[4]; + uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; + + struct tu_buffer *buffer; }; void tu_buffer_view_init(struct tu_buffer_view *view, @@ -1359,21 +1504,9 @@ struct tu_attachment_info attachments[0]; }; -struct tu_subpass_barrier -{ - VkPipelineStageFlags src_stage_mask; - VkAccessFlags src_access_mask; - VkAccessFlags dst_access_mask; -}; - -void -tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, - const struct tu_subpass_barrier *barrier); - struct tu_subpass_attachment { uint32_t attachment; - VkImageLayout layout; }; struct tu_subpass @@ -1385,33 +1518,27 @@ struct tu_subpass_attachment *resolve_attachments; struct tu_subpass_attachment depth_stencil_attachment; - /** Subpass has at least one resolve attachment */ - bool has_resolve; - - struct tu_subpass_barrier start_barrier; - - uint32_t view_mask; - VkSampleCountFlagBits max_sample_count; + VkSampleCountFlagBits samples; }; struct tu_render_pass_attachment { VkFormat format; - uint32_t samples; + uint32_t cpp; VkAttachmentLoadOp load_op; VkAttachmentLoadOp stencil_load_op; - VkImageLayout initial_layout; - VkImageLayout final_layout; - uint32_t view_mask; + VkAttachmentStoreOp store_op; + VkAttachmentStoreOp stencil_store_op; + int32_t gmem_offset; }; struct tu_render_pass { uint32_t attachment_count; uint32_t subpass_count; + uint32_t gmem_pixels; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; - struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; }; @@ -1422,12 +1549,11 @@ struct tu_query_pool { + VkQueryType type; uint32_t stride; - uint32_t availability_offset; uint64_t size; - char *ptr; - VkQueryType type; - uint32_t pipeline_stats_mask; + uint32_t pipeline_statistics; + struct tu_bo bo; }; struct tu_semaphore diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_query.c mesa-20.0.8/src/freedreno/vulkan/tu_query.c --- mesa-19.2.8/src/freedreno/vulkan/tu_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_query.c 2020-06-12 01:21:16.000000000 +0000 @@ -31,7 +31,54 @@ #include #include +#include "registers/adreno_pm4.xml.h" +#include "registers/adreno_common.xml.h" +#include "registers/a6xx.xml.h" + #include "nir/nir_builder.h" +#include "util/os_time.h" + +#include "tu_cs.h" + +#define NSEC_PER_SEC 1000000000ull +#define WAIT_TIMEOUT 5 + +/* It seems like sample counts need to be copied over to 16-byte aligned + * memory. */ +struct PACKED slot_value { + uint64_t value; + uint64_t __padding; +}; + +struct PACKED occlusion_query_slot { + struct slot_value available; /* 0 when unavailable, 1 when available */ + struct slot_value begin; + struct slot_value end; + struct slot_value result; +}; + +/* Returns the IOVA of a given uint64_t field in a given slot of a query + * pool. */ +#define query_iova(type, pool, query, field) \ + pool->bo.iova + pool->stride * query + offsetof(type, field) + \ + offsetof(struct slot_value, value) + +#define occlusion_query_iova(pool, query, field) \ + query_iova(struct occlusion_query_slot, pool, query, field) + +#define query_is_available(type, slot) \ + ((type*)slot)->available.value + +#define occlusion_query_is_available(slot) \ + query_is_available(struct occlusion_query_slot, slot) + +/* + * Returns a pointer to a given slot in a query pool. + */ +static void* slot_address(struct tu_query_pool *pool, uint32_t query) +{ + return (char*)pool->bo.map + query * pool->stride; +} VkResult tu_CreateQueryPool(VkDevice _device, @@ -40,6 +87,21 @@ VkQueryPool *pQueryPool) { TU_FROM_HANDLE(tu_device, device, _device); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); + assert(pCreateInfo->queryCount > 0); + + uint32_t slot_size; + switch (pCreateInfo->queryType) { + case VK_QUERY_TYPE_OCCLUSION: + slot_size = sizeof(struct occlusion_query_slot); + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } + struct tu_query_pool *pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -47,7 +109,29 @@ if (!pool) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + VkResult result = tu_bo_init_new(device, &pool->bo, + pCreateInfo->queryCount * slot_size); + if (result != VK_SUCCESS) { + vk_free2(&device->alloc, pAllocator, pool); + return result; + } + + result = tu_bo_map(device, &pool->bo); + if (result != VK_SUCCESS) { + tu_bo_finish(device, &pool->bo); + vk_free2(&device->alloc, pAllocator, pool); + return result; + } + + /* Initialize all query statuses to unavailable */ + memset(pool->bo.map, 0, pool->bo.size); + + pool->type = pCreateInfo->queryType; + pool->stride = slot_size; + pool->size = pCreateInfo->queryCount; + pool->pipeline_statistics = pCreateInfo->pipelineStatistics; *pQueryPool = tu_query_pool_to_handle(pool); + return VK_SUCCESS; } @@ -62,9 +146,110 @@ if (!pool) return; + tu_bo_finish(device, &pool->bo); vk_free2(&device->alloc, pAllocator, pool); } +/* Wait on the the availability status of a query up until a timeout. */ +static VkResult +wait_for_available(struct tu_device *device, struct tu_query_pool *pool, + uint32_t query) +{ + /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a + * scheduler friendly way instead of busy polling once the patch has landed + * upstream. */ + struct occlusion_query_slot *slot = slot_address(pool, query); + uint64_t abs_timeout = os_time_get_absolute_timeout( + WAIT_TIMEOUT * NSEC_PER_SEC); + while(os_time_get_nano() < abs_timeout) { + if (occlusion_query_is_available(slot)) + return VK_SUCCESS; + } + return vk_error(device->instance, VK_TIMEOUT); +} + +/* Writes a query value to a buffer from the CPU. */ +static void +write_query_value_cpu(char* base, + uint32_t offset, + uint64_t value, + VkQueryResultFlags flags) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value; + } else { + *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value; + } +} + +static VkResult +get_occlusion_query_pool_results(struct tu_device *device, + struct tu_query_pool *pool, + uint32_t firstQuery, + uint32_t queryCount, + size_t dataSize, + void *pData, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + assert(dataSize >= stride * queryCount); + + char *result_base = pData; + VkResult result = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + uint32_t query = firstQuery + i; + struct occlusion_query_slot *slot = slot_address(pool, query); + bool available = occlusion_query_is_available(slot); + if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) { + VkResult wait_result = wait_for_available(device, pool, query); + if (wait_result != VK_SUCCESS) + return wait_result; + available = true; + } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) { + /* From the Vulkan 1.1.130 spec: + * + * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are + * both not set then no result values are written to pData for + * queries that are in the unavailable state at the time of the + * call, and vkGetQueryPoolResults returns VK_NOT_READY. However, + * availability state is still written to pData for those queries + * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set. + */ + result = VK_NOT_READY; + if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) { + result_base += stride; + continue; + } + } + + if (available) + write_query_value_cpu(result_base, 0, slot->result.value, flags); + else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) + /* From the Vulkan 1.1.130 spec: + * + * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT + * is not set, and the query’s status is unavailable, an + * intermediate result value between zero and the final result + * value is written to pData for that query. + * + * Just return 0 here for simplicity since it's a valid result. + */ + write_query_value_cpu(result_base, 0, 0, flags); + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + /* From the Vulkan 1.1.130 spec: + * + * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final + * integer value written for each query is non-zero if the query’s + * status was available or zero if the status was unavailable. + */ + write_query_value_cpu(result_base, 1, available, flags); + + result_base += stride; + } + return result; +} + VkResult tu_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, @@ -75,9 +260,124 @@ VkDeviceSize stride, VkQueryResultFlags flags) { + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_query_pool, pool, queryPool); + assert(firstQuery + queryCount <= pool->size); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: { + return get_occlusion_query_pool_results(device, pool, firstQuery, + queryCount, dataSize, pData, stride, flags); + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } return VK_SUCCESS; } +/* Copies a query value from one buffer to another from the GPU. */ +static void +copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf, + struct tu_cs *cs, + uint64_t src_iova, + uint64_t base_write_iova, + uint32_t offset, + VkQueryResultFlags flags) { + uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ? + sizeof(uint64_t) : sizeof(uint32_t); + uint64_t write_iova = base_write_iova + (offset * element_size); + + tu_cs_reserve_space(cmdbuf->device, cs, 6); + tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); + uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ? + CP_MEM_TO_MEM_0_DOUBLE : 0; + tu_cs_emit(cs, mem_to_mem_flags); + tu_cs_emit_qw(cs, write_iova); + tu_cs_emit_qw(cs, src_iova); +} + +static void +emit_copy_occlusion_query_pool_results(struct tu_cmd_buffer *cmdbuf, + struct tu_cs *cs, + struct tu_query_pool *pool, + uint32_t firstQuery, + uint32_t queryCount, + struct tu_buffer *buffer, + VkDeviceSize dstOffset, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + /* From the Vulkan 1.1.130 spec: + * + * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous + * uses of vkCmdResetQueryPool in the same queue, without any additional + * synchronization. + * + * To ensure that previous writes to the available bit are coherent, first + * wait for all writes to complete. + */ + tu_cs_reserve_space(cmdbuf->device, cs, 1); + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + + for (uint32_t i = 0; i < queryCount; i++) { + uint32_t query = firstQuery + i; + uint64_t available_iova = occlusion_query_iova(pool, query, available); + uint64_t result_iova = occlusion_query_iova(pool, query, result); + uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride; + /* Wait for the available bit to be set if executed with the + * VK_QUERY_RESULT_WAIT_BIT flag. */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + tu_cs_reserve_space(cmdbuf->device, cs, 7); + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + tu_cs_emit_qw(cs, available_iova); + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + } + + if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { + /* Unconditionally copying the bo->result into the buffer here is + * valid because we only set bo->result on vkCmdEndQuery. Thus, even + * if the query is unavailable, this will copy the correct partial + * value of 0. + */ + copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, + 0 /* offset */, flags); + } else { + /* Conditionally copy bo->result into the buffer based on whether the + * query is available. + * + * NOTE: For the conditional packets to be executed, CP_COND_EXEC + * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests + * that 0 < available < 2, aka available == 1. + */ + tu_cs_reserve_space(cmdbuf->device, cs, 7); + tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); + tu_cs_emit_qw(cs, available_iova); + tu_cs_emit_qw(cs, available_iova); + tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); + tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */ + + /* Start of conditional execution */ + copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, + 0 /* offset */, flags); + /* End of conditional execution */ + } + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova, + 1 /* offset */, flags); + } + } + + tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE); +} + void tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -88,6 +388,46 @@ VkDeviceSize stride, VkQueryResultFlags flags) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_query_pool, pool, queryPool); + TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); + struct tu_cs *cs = &cmdbuf->cs; + assert(firstQuery + queryCount <= pool->size); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: { + return emit_copy_occlusion_query_pool_results(cmdbuf, cs, pool, + firstQuery, queryCount, buffer, dstOffset, stride, flags); + } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } +} + +static void +emit_reset_occlusion_query_pool(struct tu_cmd_buffer *cmdbuf, + struct tu_query_pool *pool, + uint32_t firstQuery, + uint32_t queryCount) +{ + struct tu_cs *cs = &cmdbuf->cs; + + for (uint32_t i = 0; i < queryCount; i++) { + uint32_t query = firstQuery + i; + uint64_t available_iova = occlusion_query_iova(pool, query, available); + uint64_t result_iova = occlusion_query_iova(pool, query, result); + tu_cs_reserve_space(cmdbuf->device, cs, 11); + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, available_iova); + tu_cs_emit_qw(cs, 0x0); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, result_iova); + tu_cs_emit_qw(cs, 0x0); + } } void @@ -96,6 +436,54 @@ uint32_t firstQuery, uint32_t queryCount) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_query_pool, pool, queryPool); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_reset_occlusion_query_pool(cmdbuf, pool, firstQuery, queryCount); + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } + + tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE); +} + +static void +emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf, + struct tu_query_pool *pool, + uint32_t query) +{ + /* From the Vulkan 1.1.130 spec: + * + * A query must begin and end inside the same subpass of a render pass + * instance, or must both begin and end outside of a render pass + * instance. + * + * Unlike on an immediate-mode renderer, Turnip renders all tiles on + * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a + * query begins/ends inside the same subpass of a render pass, we need to + * record the packets on the secondary draw command stream. cmdbuf->draw_cs + * is then run on every tile during render, so we just need to accumulate + * sample counts in slot->result to compute the query result. + */ + struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; + + uint64_t begin_iova = occlusion_query_iova(pool, query, begin); + + tu_cs_reserve_space(cmdbuf->device, cs, 7); + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); } void @@ -104,6 +492,101 @@ uint32_t query, VkQueryControlFlags flags) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_query_pool, pool, queryPool); + assert(query < pool->size); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + /* In freedreno, there is no implementation difference between + * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly + * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here. + */ + emit_begin_occlusion_query(cmdbuf, pool, query); + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } + + tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE); +} + +static void +emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, + struct tu_query_pool *pool, + uint32_t query) +{ + /* Ending an occlusion query happens in a few steps: + * 1) Set the slot->end to UINT64_MAX. + * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to + * write the current sample count value into slot->end. + * 3) Since (2) is asynchronous, wait until slot->end is not equal to + * UINT64_MAX before continuing via CP_WAIT_REG_MEM. + * 4) Accumulate the results of the query (slot->end - slot->begin) into + * slot->result. + * 5) If vkCmdEndQuery is *not* called from within the scope of a render + * pass, set the slot's available bit since the query is now done. + * 6) If vkCmdEndQuery *is* called from within the scope of a render + * pass, we cannot mark as available yet since the commands in + * draw_cs are not run until vkCmdEndRenderPass. + */ + const struct tu_render_pass *pass = cmdbuf->state.pass; + struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs; + + uint64_t available_iova = occlusion_query_iova(pool, query, available); + uint64_t begin_iova = occlusion_query_iova(pool, query, begin); + uint64_t end_iova = occlusion_query_iova(pool, query, end); + uint64_t result_iova = occlusion_query_iova(pool, query, result); + tu_cs_reserve_space(cmdbuf->device, cs, 31); + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, end_iova); + tu_cs_emit_qw(cs, 0xffffffffffffffffull); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); + + tu_cs_emit_regs(cs, + A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova)); + + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); + tu_cs_emit(cs, ZPASS_DONE); + + tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); + tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + tu_cs_emit_qw(cs, end_iova); + tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); + tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + + /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */ + tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); + tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + tu_cs_emit_qw(cs, result_iova); + tu_cs_emit_qw(cs, result_iova); + tu_cs_emit_qw(cs, end_iova); + tu_cs_emit_qw(cs, begin_iova); + + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + + if (pass) + /* Technically, queries should be tracked per-subpass, but here we track + * at the render pass level to simply the code a bit. This is safe + * because the only commands that use the available bit are + * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which + * cannot be invoked from inside a render pass scope. + */ + cs = &cmdbuf->draw_epilogue_cs; + + tu_cs_reserve_space(cmdbuf->device, cs, 5); + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, available_iova); + tu_cs_emit_qw(cs, 0x1); } void @@ -111,6 +594,22 @@ VkQueryPool queryPool, uint32_t query) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); + TU_FROM_HANDLE(tu_query_pool, pool, queryPool); + assert(query < pool->size); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_end_occlusion_query(cmdbuf, pool, query); + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TIMESTAMP: + unreachable("Unimplemented query type"); + default: + assert(!"Invalid query type"); + } + + tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE); } void diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_shader.c mesa-20.0.8/src/freedreno/vulkan/tu_shader.c --- mesa-19.2.8/src/freedreno/vulkan/tu_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_shader.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,6 +38,7 @@ { /* TODO these are made-up */ const struct spirv_to_nir_options spirv_options = { + .frag_coord_is_sysval = true, .lower_ubo_ssbo_access_to_offsets = true, .caps = { false }, }; @@ -57,10 +58,23 @@ const void *data = spec_info->pData + entry->offset; assert(data + entry->size <= spec_info->pData + spec_info->dataSize); spec[i].id = entry->constantID; - if (entry->size == 8) - spec[i].data64 = *(const uint64_t *) data; - else - spec[i].data32 = *(const uint32_t *) data; + switch (entry->size) { + case 8: + spec[i].data64 = *(const uint64_t *)data; + break; + case 4: + spec[i].data32 = *(const uint32_t *)data; + break; + case 2: + spec[i].data32 = *(const uint16_t *)data; + break; + case 1: + spec[i].data32 = *(const uint8_t *)data; + break; + default: + assert(!"Invalid spec constant size"); + break; + } spec[i].defined_on_module = false; } @@ -79,38 +93,310 @@ return nir; } +static unsigned +map_add(struct tu_descriptor_map *map, int set, int binding, int value, + int array_size) +{ + unsigned index = 0; + for (unsigned i = 0; i < map->num; i++) { + if (set == map->set[i] && binding == map->binding[i]) { + assert(value == map->value[i]); + assert(array_size == map->array_size[i]); + return index; + } + index += map->array_size[i]; + } + + assert(index == map->num_desc); + + map->set[map->num] = set; + map->binding[map->num] = binding; + map->value[map->num] = value; + map->array_size[map->num] = array_size; + map->num++; + map->num_desc += array_size; + + return index; +} + +static void +lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, + struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + nir_ssa_def *index = NULL; + unsigned base_index = 0; + unsigned array_elements = 1; + nir_tex_src *src = &instr->src[src_idx]; + bool is_sampler = src->src_type == nir_tex_src_sampler_deref; + + /* We compute first the offsets */ + nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); + while (deref->deref_type != nir_deref_type_var) { + assert(deref->parent.is_ssa); + nir_deref_instr *parent = + nir_instr_as_deref(deref->parent.ssa->parent_instr); + + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index) && index == NULL) { + /* We're still building a direct index */ + base_index += nir_src_as_uint(deref->arr.index) * array_elements; + } else { + if (index == NULL) { + /* We used to be direct but not anymore */ + index = nir_imm_int(b, base_index); + base_index = 0; + } + + index = nir_iadd(b, index, + nir_imul(b, nir_imm_int(b, array_elements), + nir_ssa_for_src(b, deref->arr.index, 1))); + } + + array_elements *= glsl_get_length(parent->type); + + deref = parent; + } + + if (index) + index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); + + /* We have the offsets, we apply them, rewriting the source or removing + * instr if needed + */ + if (index) { + nir_instr_rewrite_src(&instr->instr, &src->src, + nir_src_for_ssa(index)); + + src->src_type = is_sampler ? + nir_tex_src_sampler_offset : + nir_tex_src_texture_offset; + + instr->texture_array_size = array_elements; + } else { + nir_tex_instr_remove_src(instr, src_idx); + } + + uint32_t set = deref->var->data.descriptor_set; + uint32_t binding = deref->var->data.binding; + struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; + struct tu_descriptor_set_binding_layout *binding_layout = + &set_layout->binding[binding]; + + int desc_index = map_add(is_sampler ? + &shader->sampler_map : &shader->texture_map, + deref->var->data.descriptor_set, + deref->var->data.binding, + deref->var->data.index, + binding_layout->array_size) + base_index; + if (is_sampler) + instr->sampler_index = desc_index; + else + instr->texture_index = desc_index; +} + +static bool +lower_sampler(nir_builder *b, nir_tex_instr *instr, struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + int texture_idx = + nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); + + if (texture_idx >= 0) + lower_tex_src_to_offset(b, instr, texture_idx, shader, layout); + + int sampler_idx = + nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); + + if (sampler_idx >= 0) + lower_tex_src_to_offset(b, instr, sampler_idx, shader, layout); + + if (texture_idx < 0 && sampler_idx < 0) + return false; + + return true; +} + +static void +lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, + struct tu_shader *shader) +{ + /* note: ir3 wants load_ubo, not load_uniform */ + assert(nir_intrinsic_base(instr) == 0); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + load->num_components = instr->num_components; + load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); + load->src[1] = instr->src[0]; + nir_ssa_dest_init(&load->instr, &load->dest, + load->num_components, instr->dest.ssa.bit_size, + instr->dest.ssa.name); + nir_builder_instr_insert(b, &load->instr); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&load->dest.ssa)); + + nir_instr_remove(&instr->instr); +} + +static void +lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, + struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + nir_const_value *const_val = nir_src_as_const_value(instr->src[0]); + + unsigned set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; + struct tu_descriptor_set_binding_layout *binding_layout = + &set_layout->binding[binding]; + unsigned index = 0; + + switch (nir_intrinsic_desc_type(instr)) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + if (!const_val) + tu_finishme("non-constant vulkan_resource_index array index"); + /* skip index 0 which is used for push constants */ + index = map_add(&shader->ubo_map, set, binding, 0, + binding_layout->array_size) + 1; + index += const_val->u32; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + if (!const_val) + tu_finishme("non-constant vulkan_resource_index array index"); + index = map_add(&shader->ssbo_map, set, binding, 0, + binding_layout->array_size); + index += const_val->u32; + break; + default: + tu_finishme("unsupported desc_type for vulkan_resource_index"); + break; + } + + nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_src_for_ssa(nir_imm_int(b, index))); + nir_instr_remove(&instr->instr); +} + static void -tu_sort_variables_by_location(struct exec_list *variables) +add_image_deref_mapping(nir_intrinsic_instr *instr, struct tu_shader *shader, + const struct tu_pipeline_layout *layout) { - struct exec_list sorted; - exec_list_make_empty(&sorted); + nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + uint32_t set = var->data.descriptor_set; + uint32_t binding = var->data.binding; + struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; + struct tu_descriptor_set_binding_layout *binding_layout = + &set_layout->binding[binding]; + + var->data.driver_location = + map_add(&shader->image_map, set, binding, var->data.index, + binding_layout->array_size); +} + +static bool +lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, + struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_layer_id: + /* TODO: remove this when layered rendering is implemented */ + nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_src_for_ssa(nir_imm_int(b, 0))); + nir_instr_remove(&instr->instr); + return true; + + case nir_intrinsic_load_push_constant: + lower_load_push_constant(b, instr, shader); + return true; + + case nir_intrinsic_vulkan_resource_index: + lower_vulkan_resource_index(b, instr, shader, layout); + return true; + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + add_image_deref_mapping(instr, shader, layout); + return true; + + default: + return false; + } +} - nir_foreach_variable_safe(var, variables) - { - exec_node_remove(&var->node); - - /* insert the variable into the sorted list */ - nir_variable *next = NULL; - nir_foreach_variable(tmp, &sorted) - { - if (var->data.location < tmp->data.location) { - next = tmp; +static bool +lower_impl(nir_function_impl *impl, struct tu_shader *shader, + const struct tu_pipeline_layout *layout) +{ + nir_builder b; + nir_builder_init(&b, impl); + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + b.cursor = nir_before_instr(instr); + switch (instr->type) { + case nir_instr_type_tex: + progress |= lower_sampler(&b, nir_instr_as_tex(instr), shader, layout); + break; + case nir_instr_type_intrinsic: + progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout); + break; + default: break; } } - if (next) - exec_node_insert_node_before(&next->node, &var->node); - else - exec_list_push_tail(&sorted, &var->node); } - exec_list_move_nodes_to(&sorted, variables); + return progress; +} + +static bool +tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader, + const struct tu_pipeline_layout *layout) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) + progress |= lower_impl(function->impl, tu_shader, layout); + } + + /* spirv_to_nir produces num_ssbos equal to the number of SSBO-containing + * variables, while ir3 wants the number of descriptors (like the gallium + * path). + */ + shader->info.num_ssbos = tu_shader->ssbo_map.num_desc; + + return progress; } struct tu_shader * tu_shader_create(struct tu_device *dev, gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, + struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc) { const struct tu_shader_module *module = @@ -140,41 +426,71 @@ nir_print_shader(nir, stderr); } - /* TODO what needs to happen? */ - - switch (stage) { - case MESA_SHADER_VERTEX: - tu_sort_variables_by_location(&nir->outputs); - break; - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: - tu_sort_variables_by_location(&nir->inputs); - tu_sort_variables_by_location(&nir->outputs); - break; - case MESA_SHADER_FRAGMENT: - tu_sort_variables_by_location(&nir->inputs); - break; - case MESA_SHADER_COMPUTE: - break; - default: - unreachable("invalid gl_shader_stage"); - break; + /* multi step inlining procedure */ + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); + foreach_list_typed_safe(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + exec_node_remove(&func->node); } + assert(exec_list_length(&nir->functions) == 1); + NIR_PASS_V(nir, nir_lower_constant_initializers, ~nir_var_function_temp); + + /* Split member structs. We do this before lower_io_to_temporaries so that + * it doesn't lower system values to temporaries by accident. + */ + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + NIR_PASS_V(nir, nir_remove_dead_variables, + nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared); + + NIR_PASS_V(nir, nir_propagate_invariant); - nir_assign_var_locations(&nir->inputs, &nir->num_inputs, - ir3_glsl_type_size); - nir_assign_var_locations(&nir->outputs, &nir->num_outputs, - ir3_glsl_type_size); - nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, - ir3_glsl_type_size); + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true); + + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + + NIR_PASS_V(nir, nir_opt_copy_prop_vars); + NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all); + + /* ir3 doesn't support indirect input/output */ + NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out); + + NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); + + nir_assign_io_var_locations(&nir->inputs, &nir->num_inputs, stage); + nir_assign_io_var_locations(&nir->outputs, &nir->num_outputs, stage); NIR_PASS_V(nir, nir_lower_system_values); NIR_PASS_V(nir, nir_lower_frexp); + + if (stage == MESA_SHADER_FRAGMENT) + NIR_PASS_V(nir, nir_lower_input_attachments, true); + + NIR_PASS_V(nir, tu_lower_io, shader, layout); + NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, 0); + if (stage == MESA_SHADER_FRAGMENT) { + /* NOTE: lower load_barycentric_at_sample first, since it + * produces load_barycentric_at_offset: + */ + NIR_PASS_V(nir, ir3_nir_lower_load_barycentric_at_sample); + NIR_PASS_V(nir, ir3_nir_lower_load_barycentric_at_offset); + + NIR_PASS_V(nir, ir3_nir_move_varying_inputs); + } + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + /* num_uniforms only used by ir3 for size of ubo 0 (push constants) */ + nir->num_uniforms = MAX_PUSH_CONSTANTS_SIZE / 16; + shader->ir3_shader.compiler = dev->compiler; shader->ir3_shader.type = stage; shader->ir3_shader.nir = nir; @@ -213,8 +529,10 @@ *options = (struct tu_shader_compile_options) { /* TODO ir3_key */ - .optimize = !(pipeline_info->flags & - VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT), + /* TODO: VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT + * some optimizations need to happen otherwise shader might not compile + */ + .optimize = true, .include_binning_pass = true, }; } @@ -222,13 +540,14 @@ static uint32_t * tu_compile_shader_variant(struct ir3_shader *shader, const struct ir3_shader_key *key, - bool binning_pass, + struct ir3_shader_variant *nonbinning, struct ir3_shader_variant *variant) { variant->shader = shader; variant->type = shader->type; variant->key = *key; - variant->binning_pass = binning_pass; + variant->binning_pass = !!nonbinning; + variant->nonbinning = nonbinning; int ret = ir3_compile_shader_nir(shader->compiler, variant); if (ret) @@ -259,7 +578,7 @@ } shader->binary = tu_compile_shader_variant( - &shader->ir3_shader, &options->key, false, &shader->variants[0]); + &shader->ir3_shader, &options->key, NULL, &shader->variants[0]); if (!shader->binary) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -267,7 +586,8 @@ if (options->include_binning_pass && shader->ir3_shader.type == MESA_SHADER_VERTEX) { shader->binning_binary = tu_compile_shader_variant( - &shader->ir3_shader, &options->key, true, &shader->variants[1]); + &shader->ir3_shader, &options->key, &shader->variants[0], + &shader->variants[1]); if (!shader->binning_binary) return VK_ERROR_OUT_OF_HOST_MEMORY; diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_wsi.c mesa-20.0.8/src/freedreno/vulkan/tu_wsi.c --- mesa-19.2.8/src/freedreno/vulkan/tu_wsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_wsi.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,6 +27,7 @@ #include "vk_util.h" #include "wsi_common.h" +#include "drm-uapi/drm_fourcc.h" static PFN_vkVoidFunction tu_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) @@ -37,10 +38,19 @@ VkResult tu_wsi_init(struct tu_physical_device *physical_device) { - return wsi_device_init(&physical_device->wsi_device, - tu_physical_device_to_handle(physical_device), - tu_wsi_proc_addr, &physical_device->instance->alloc, - physical_device->master_fd, NULL); + VkResult result; + + result = wsi_device_init(&physical_device->wsi_device, + tu_physical_device_to_handle(physical_device), + tu_wsi_proc_addr, + &physical_device->instance->alloc, + physical_device->master_fd, NULL); + if (result != VK_SUCCESS) + return result; + + physical_device->wsi_device.supports_modifiers = true; + + return VK_SUCCESS; } void diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_wsi_display.c mesa-20.0.8/src/freedreno/vulkan/tu_wsi_display.c --- mesa-19.2.8/src/freedreno/vulkan/tu_wsi_display.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_wsi_display.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,339 @@ +/* + * Copyright © 2017 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include "tu_private.h" +#include "tu_cs.h" +#include "util/disk_cache.h" +#include "util/strtod.h" +#include "vk_util.h" +#include +#include +#include "vk_format.h" +#include "util/debug.h" +#include "wsi_common_display.h" + +VkResult +tu_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device, + uint32_t *property_count, + VkDisplayPropertiesKHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_physical_device_display_properties( + physical_device, + &pdevice->wsi_device, + property_count, + properties); +} + +VkResult +tu_GetPhysicalDeviceDisplayProperties2KHR(VkPhysicalDevice physical_device, + uint32_t *property_count, + VkDisplayProperties2KHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_physical_device_display_properties2( + physical_device, + &pdevice->wsi_device, + property_count, + properties); +} + +VkResult +tu_GetPhysicalDeviceDisplayPlanePropertiesKHR( + VkPhysicalDevice physical_device, + uint32_t *property_count, + VkDisplayPlanePropertiesKHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_physical_device_display_plane_properties( + physical_device, + &pdevice->wsi_device, + property_count, + properties); +} + +VkResult +tu_GetPhysicalDeviceDisplayPlaneProperties2KHR( + VkPhysicalDevice physical_device, + uint32_t *property_count, + VkDisplayPlaneProperties2KHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_physical_device_display_plane_properties2( + physical_device, + &pdevice->wsi_device, + property_count, + properties); +} + +VkResult +tu_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device, + uint32_t plane_index, + uint32_t *display_count, + VkDisplayKHR *displays) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_display_plane_supported_displays( + physical_device, + &pdevice->wsi_device, + plane_index, + display_count, + displays); +} + + +VkResult +tu_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device, + VkDisplayKHR display, + uint32_t *property_count, + VkDisplayModePropertiesKHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_display_mode_properties(physical_device, + &pdevice->wsi_device, + display, + property_count, + properties); +} + +VkResult +tu_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device, + VkDisplayKHR display, + uint32_t *property_count, + VkDisplayModeProperties2KHR *properties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_get_display_mode_properties2(physical_device, + &pdevice->wsi_device, + display, + property_count, + properties); +} + +VkResult +tu_CreateDisplayModeKHR(VkPhysicalDevice physical_device, + VkDisplayKHR display, + const VkDisplayModeCreateInfoKHR *create_info, + const VkAllocationCallbacks *allocator, + VkDisplayModeKHR *mode) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_display_create_display_mode(physical_device, + &pdevice->wsi_device, + display, + create_info, + allocator, + mode); +} + +VkResult +tu_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device, + VkDisplayModeKHR mode_khr, + uint32_t plane_index, + VkDisplayPlaneCapabilitiesKHR *capabilities) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_get_display_plane_capabilities(physical_device, + &pdevice->wsi_device, + mode_khr, + plane_index, + capabilities); +} + +VkResult +tu_GetDisplayPlaneCapabilities2KHR(VkPhysicalDevice physical_device, + const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo, + VkDisplayPlaneCapabilities2KHR *capabilities) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_get_display_plane_capabilities2(physical_device, + &pdevice->wsi_device, + pDisplayPlaneInfo, + capabilities); +} + +VkResult +tu_CreateDisplayPlaneSurfaceKHR( + VkInstance _instance, + const VkDisplaySurfaceCreateInfoKHR *create_info, + const VkAllocationCallbacks *allocator, + VkSurfaceKHR *surface) +{ + TU_FROM_HANDLE(tu_instance, instance, _instance); + const VkAllocationCallbacks *alloc; + + if (allocator) + alloc = allocator; + else + alloc = &instance->alloc; + + return wsi_create_display_surface(_instance, alloc, + create_info, surface); +} + +VkResult +tu_ReleaseDisplayEXT(VkPhysicalDevice physical_device, + VkDisplayKHR display) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_release_display(physical_device, + &pdevice->wsi_device, + display); +} + +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT +VkResult +tu_AcquireXlibDisplayEXT(VkPhysicalDevice physical_device, + Display *dpy, + VkDisplayKHR display) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_acquire_xlib_display(physical_device, + &pdevice->wsi_device, + dpy, + display); +} + +VkResult +tu_GetRandROutputDisplayEXT(VkPhysicalDevice physical_device, + Display *dpy, + RROutput output, + VkDisplayKHR *display) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); + + return wsi_get_randr_output_display(physical_device, + &pdevice->wsi_device, + dpy, + output, + display); +} +#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */ + +/* VK_EXT_display_control */ + +VkResult +tu_DisplayPowerControlEXT(VkDevice _device, + VkDisplayKHR display, + const VkDisplayPowerInfoEXT *display_power_info) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + return wsi_display_power_control(_device, + &device->physical_device->wsi_device, + display, + display_power_info); +} + +VkResult +tu_RegisterDeviceEventEXT(VkDevice _device, + const VkDeviceEventInfoEXT *device_event_info, + const VkAllocationCallbacks *allocator, + VkFence *_fence) +{ + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_fence *fence; + VkResult ret; + + fence = vk_alloc2(&device->instance->alloc, allocator, sizeof (*fence), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!fence) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + tu_fence_init(fence, false); + + ret = wsi_register_device_event(_device, + &device->physical_device->wsi_device, + device_event_info, + allocator, + &fence->fence_wsi); + if (ret == VK_SUCCESS) + *_fence = tu_fence_to_handle(fence); + else + vk_free2(&device->instance->alloc, allocator, fence); + return ret; +} + +VkResult +tu_RegisterDisplayEventEXT(VkDevice _device, + VkDisplayKHR display, + const VkDisplayEventInfoEXT *display_event_info, + const VkAllocationCallbacks *allocator, + VkFence *_fence) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + struct tu_fence *fence; + VkResult ret; + + fence = vk_alloc2(&device->instance->alloc, allocator, sizeof (*fence), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!fence) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + tu_fence_init(fence, false); + + ret = wsi_register_display_event(_device, + &device->physical_device->wsi_device, + display, + display_event_info, + allocator, + &fence->fence_wsi); + + if (ret == VK_SUCCESS) + *_fence = tu_fence_to_handle(fence); + else + vk_free2(&device->instance->alloc, allocator, fence); + return ret; +} + +VkResult +tu_GetSwapchainCounterEXT(VkDevice _device, + VkSwapchainKHR swapchain, + VkSurfaceCounterFlagBitsEXT flag_bits, + uint64_t *value) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + return wsi_get_swapchain_counter(_device, + &device->physical_device->wsi_device, + swapchain, + flag_bits, + value); +} + diff -Nru mesa-19.2.8/src/freedreno/vulkan/tu_wsi_x11.c mesa-20.0.8/src/freedreno/vulkan/tu_wsi_x11.c --- mesa-19.2.8/src/freedreno/vulkan/tu_wsi_x11.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/tu_wsi_x11.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,100 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * based mostly on anv driver which is: + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "wsi_common_x11.h" +#include "tu_private.h" + +VkBool32 tu_GetPhysicalDeviceXcbPresentationSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + xcb_connection_t* connection, + xcb_visualid_t visual_id) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_get_physical_device_xcb_presentation_support( + &device->wsi_device, + queueFamilyIndex, + connection, visual_id); +} + +VkBool32 tu_GetPhysicalDeviceXlibPresentationSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + Display* dpy, + VisualID visualID) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_get_physical_device_xcb_presentation_support( + &device->wsi_device, + queueFamilyIndex, + XGetXCBConnection(dpy), visualID); +} + +VkResult tu_CreateXcbSurfaceKHR( + VkInstance _instance, + const VkXcbSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + TU_FROM_HANDLE(tu_instance, instance, _instance); + const VkAllocationCallbacks *alloc; + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR); + + if (pAllocator) + alloc = pAllocator; + else + alloc = &instance->alloc; + + return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface); +} + +VkResult tu_CreateXlibSurfaceKHR( + VkInstance _instance, + const VkXlibSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + TU_FROM_HANDLE(tu_instance, instance, _instance); + const VkAllocationCallbacks *alloc; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR); + + if (pAllocator) + alloc = pAllocator; + else + alloc = &instance->alloc; + + return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface); +} diff -Nru mesa-19.2.8/src/freedreno/vulkan/vk_format.h mesa-20.0.8/src/freedreno/vulkan/vk_format.h --- mesa-19.2.8/src/freedreno/vulkan/vk_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/vk_format.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,120 +29,16 @@ #include #include +#include +#include #include -enum vk_format_layout +static inline const struct util_format_description * +vk_format_description(VkFormat format) { - /** - * Formats with vk_format_block::width == vk_format_block::height == 1 - * that can be described as an ordinary data structure. - */ - VK_FORMAT_LAYOUT_PLAIN = 0, - - /** - * Formats with sub-sampled channels. - * - * This is for formats like YVYU where there is less than one sample per - * pixel. - */ - VK_FORMAT_LAYOUT_SUBSAMPLED = 3, - - /** - * S3 Texture Compression formats. - */ - VK_FORMAT_LAYOUT_S3TC = 4, - - /** - * Red-Green Texture Compression formats. - */ - VK_FORMAT_LAYOUT_RGTC = 5, - - /** - * Ericsson Texture Compression - */ - VK_FORMAT_LAYOUT_ETC = 6, - - /** - * BC6/7 Texture Compression - */ - VK_FORMAT_LAYOUT_BPTC = 7, - - /** - * ASTC - */ - VK_FORMAT_LAYOUT_ASTC = 8, - - /** - * Everything else that doesn't fit in any of the above layouts. - */ - VK_FORMAT_LAYOUT_OTHER = 9 -}; - -struct vk_format_block -{ - /** Block width in pixels */ - unsigned width; - - /** Block height in pixels */ - unsigned height; - - /** Block size in bits */ - unsigned bits; -}; - -enum vk_format_type -{ - VK_FORMAT_TYPE_VOID = 0, - VK_FORMAT_TYPE_UNSIGNED = 1, - VK_FORMAT_TYPE_SIGNED = 2, - VK_FORMAT_TYPE_FIXED = 3, - VK_FORMAT_TYPE_FLOAT = 4 -}; - -enum vk_format_colorspace -{ - VK_FORMAT_COLORSPACE_RGB = 0, - VK_FORMAT_COLORSPACE_SRGB = 1, - VK_FORMAT_COLORSPACE_YUV = 2, - VK_FORMAT_COLORSPACE_ZS = 3 -}; - -struct vk_format_channel_description -{ - unsigned type : 5; - unsigned normalized : 1; - unsigned pure_integer : 1; - unsigned scaled : 1; - unsigned size : 8; - unsigned shift : 16; -}; - -struct vk_format_description -{ - VkFormat format; - const char *name; - const char *short_name; - - struct vk_format_block block; - enum vk_format_layout layout; - - unsigned nr_channels : 3; - unsigned is_array : 1; - unsigned is_bitmask : 1; - unsigned is_mixed : 1; - - struct vk_format_channel_description channel[4]; - - unsigned char swizzle[4]; - - enum vk_format_colorspace colorspace; -}; - -extern const struct vk_format_description vk_format_description_table[]; - -const struct vk_format_description * -vk_format_description(VkFormat format); + return util_format_description(vk_format_to_pipe_format(format)); +} /** * Return total bits needed for the pixel format per block. @@ -150,14 +46,7 @@ static inline unsigned vk_format_get_blocksizebits(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return 0; - } - - return desc->block.bits; + return util_format_get_blocksizebits(vk_format_to_pipe_format(format)); } /** @@ -166,97 +55,40 @@ static inline unsigned vk_format_get_blocksize(VkFormat format) { - unsigned bits = vk_format_get_blocksizebits(format); - unsigned bytes = bits / 8; - - assert(bits % 8 == 0); - assert(bytes > 0); - if (bytes == 0) { - bytes = 1; - } - - return bytes; + return util_format_get_blocksize(vk_format_to_pipe_format(format)); } static inline unsigned vk_format_get_blockwidth(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return 1; - } - - return desc->block.width; + return util_format_get_blockwidth(vk_format_to_pipe_format(format)); } static inline unsigned vk_format_get_blockheight(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return 1; - } - - return desc->block.height; + return util_format_get_blockheight(vk_format_to_pipe_format(format)); } static inline unsigned vk_format_get_block_count_width(VkFormat format, unsigned width) { - unsigned blockwidth = vk_format_get_blockwidth(format); - return (width + blockwidth - 1) / blockwidth; + return util_format_get_nblocksx(vk_format_to_pipe_format(format), width); } static inline unsigned vk_format_get_block_count_height(VkFormat format, unsigned height) { - unsigned blockheight = vk_format_get_blockheight(format); - return (height + blockheight - 1) / blockheight; + return util_format_get_nblocksy(vk_format_to_pipe_format(format), height); } static inline unsigned vk_format_get_block_count(VkFormat format, unsigned width, unsigned height) { - return vk_format_get_block_count_width(format, width) * - vk_format_get_block_count_height(format, height); + return util_format_get_nblocks(vk_format_to_pipe_format(format), + width, height); } -/** - * Return the index of the first non-void channel - * -1 if no non-void channels - */ -static inline int -vk_format_get_first_non_void_channel(VkFormat format) -{ - const struct vk_format_description *desc = vk_format_description(format); - int i; - - for (i = 0; i < 4; i++) - if (desc->channel[i].type != VK_FORMAT_TYPE_VOID) - break; - - if (i == 4) - return -1; - - return i; -} - -enum vk_swizzle -{ - VK_SWIZZLE_X, - VK_SWIZZLE_Y, - VK_SWIZZLE_Z, - VK_SWIZZLE_W, - VK_SWIZZLE_0, - VK_SWIZZLE_1, - VK_SWIZZLE_NONE, - VK_SWIZZLE_MAX, /**< Number of enums counter (must be last) */ -}; - static inline VkImageAspectFlags vk_format_aspects(VkFormat format) { @@ -282,7 +114,7 @@ } } -static inline enum vk_swizzle +static inline enum pipe_swizzle tu_swizzle_conv(VkComponentSwizzle component, const unsigned char chan[4], VkComponentSwizzle vk_swiz) @@ -293,29 +125,29 @@ vk_swiz = component; switch (vk_swiz) { case VK_COMPONENT_SWIZZLE_ZERO: - return VK_SWIZZLE_0; + return PIPE_SWIZZLE_0; case VK_COMPONENT_SWIZZLE_ONE: - return VK_SWIZZLE_1; + return PIPE_SWIZZLE_1; case VK_COMPONENT_SWIZZLE_R: for (x = 0; x < 4; x++) if (chan[x] == 0) return x; - return VK_SWIZZLE_0; + return PIPE_SWIZZLE_0; case VK_COMPONENT_SWIZZLE_G: for (x = 0; x < 4; x++) if (chan[x] == 1) return x; - return VK_SWIZZLE_0; + return PIPE_SWIZZLE_0; case VK_COMPONENT_SWIZZLE_B: for (x = 0; x < 4; x++) if (chan[x] == 2) return x; - return VK_SWIZZLE_0; + return PIPE_SWIZZLE_0; case VK_COMPONENT_SWIZZLE_A: for (x = 0; x < 4; x++) if (chan[x] == 3) return x; - return VK_SWIZZLE_1; + return PIPE_SWIZZLE_1; default: unreachable("Illegal swizzle"); } @@ -324,7 +156,7 @@ static inline void vk_format_compose_swizzles(const VkComponentMapping *mapping, const unsigned char swz[4], - enum vk_swizzle dst[4]) + enum pipe_swizzle dst[4]) { dst[0] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_R, swz, mapping->r); dst[1] = tu_swizzle_conv(VK_COMPONENT_SWIZZLE_G, swz, mapping->g); @@ -335,77 +167,29 @@ static inline bool vk_format_is_compressed(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return false; - } - - switch (desc->layout) { - case VK_FORMAT_LAYOUT_S3TC: - case VK_FORMAT_LAYOUT_RGTC: - case VK_FORMAT_LAYOUT_ETC: - case VK_FORMAT_LAYOUT_BPTC: - case VK_FORMAT_LAYOUT_ASTC: - /* XXX add other formats in the future */ - return true; - default: - return false; - } + return util_format_is_compressed(vk_format_to_pipe_format(format)); } static inline bool -vk_format_has_depth(const struct vk_format_description *desc) +vk_format_has_depth(VkFormat format) { - return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && - desc->swizzle[0] != VK_SWIZZLE_NONE; -} + const struct util_format_description *desc = vk_format_description(format); -static inline bool -vk_format_has_stencil(const struct vk_format_description *desc) -{ - return desc->colorspace == VK_FORMAT_COLORSPACE_ZS && - desc->swizzle[1] != VK_SWIZZLE_NONE; + return util_format_has_depth(desc); } static inline bool -vk_format_is_depth_or_stencil(VkFormat format) +vk_format_has_stencil(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); + const struct util_format_description *desc = vk_format_description(format); - assert(desc); - if (!desc) { - return false; - } - - return vk_format_has_depth(desc) || vk_format_has_stencil(desc); + return util_format_has_stencil(desc); } static inline bool -vk_format_is_depth(VkFormat format) +vk_format_is_depth_or_stencil(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return false; - } - - return vk_format_has_depth(desc); -} - -static inline bool -vk_format_is_stencil(VkFormat format) -{ - const struct vk_format_description *desc = vk_format_description(format); - - assert(desc); - if (!desc) { - return false; - } - - return vk_format_has_stencil(desc); + return vk_format_has_depth(format) || vk_format_has_stencil(format); } static inline bool @@ -417,11 +201,7 @@ static inline bool vk_format_has_alpha(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - - return (desc->colorspace == VK_FORMAT_COLORSPACE_RGB || - desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[3] != VK_SWIZZLE_1; + return util_format_has_alpha(vk_format_to_pipe_format(format)); } static inline VkFormat @@ -442,17 +222,25 @@ static inline bool vk_format_is_int(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - int channel = vk_format_get_first_non_void_channel(format); + return util_format_is_pure_integer(vk_format_to_pipe_format(format)); +} + +static inline bool +vk_format_is_uint(VkFormat format) +{ + return util_format_is_pure_uint(vk_format_to_pipe_format(format)); +} - return channel >= 0 && desc->channel[channel].pure_integer; +static inline bool +vk_format_is_sint(VkFormat format) +{ + return util_format_is_pure_sint(vk_format_to_pipe_format(format)); } static inline bool vk_format_is_srgb(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - return desc->colorspace == VK_FORMAT_COLORSPACE_SRGB; + return util_format_is_srgb(vk_format_to_pipe_format(format)); } static inline VkFormat @@ -503,75 +291,17 @@ static inline unsigned vk_format_get_component_bits(VkFormat format, - enum vk_format_colorspace colorspace, + enum util_format_colorspace colorspace, unsigned component) { - const struct vk_format_description *desc = vk_format_description(format); - enum vk_format_colorspace desc_colorspace; - - assert(format); - if (!format) { - return 0; - } - - assert(component < 4); - - /* Treat RGB and SRGB as equivalent. */ - if (colorspace == VK_FORMAT_COLORSPACE_SRGB) { - colorspace = VK_FORMAT_COLORSPACE_RGB; - } - if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) { - desc_colorspace = VK_FORMAT_COLORSPACE_RGB; - } else { - desc_colorspace = desc->colorspace; - } - - if (desc_colorspace != colorspace) { - return 0; - } - - switch (desc->swizzle[component]) { - case VK_SWIZZLE_X: - return desc->channel[0].size; - case VK_SWIZZLE_Y: - return desc->channel[1].size; - case VK_SWIZZLE_Z: - return desc->channel[2].size; - case VK_SWIZZLE_W: - return desc->channel[3].size; - default: - return 0; - } -} - -static inline VkFormat -vk_to_non_srgb_format(VkFormat format) -{ - switch (format) { - case VK_FORMAT_R8_SRGB: - return VK_FORMAT_R8_UNORM; - case VK_FORMAT_R8G8_SRGB: - return VK_FORMAT_R8G8_UNORM; - case VK_FORMAT_R8G8B8_SRGB: - return VK_FORMAT_R8G8B8_UNORM; - case VK_FORMAT_B8G8R8_SRGB: - return VK_FORMAT_B8G8R8_UNORM; - case VK_FORMAT_R8G8B8A8_SRGB: - return VK_FORMAT_R8G8B8A8_UNORM; - case VK_FORMAT_B8G8R8A8_SRGB: - return VK_FORMAT_B8G8R8A8_UNORM; - case VK_FORMAT_A8B8G8R8_SRGB_PACK32: - return VK_FORMAT_A8B8G8R8_UNORM_PACK32; - default: - return format; - } + return util_format_get_component_bits(vk_format_to_pipe_format(format), + colorspace, component); } static inline unsigned vk_format_get_nr_components(VkFormat format) { - const struct vk_format_description *desc = vk_format_description(format); - return desc->nr_channels; + return util_format_get_nr_components(vk_format_to_pipe_format(format)); } #endif /* VK_FORMAT_H */ diff -Nru mesa-19.2.8/src/freedreno/vulkan/vk_format_layout.csv mesa-20.0.8/src/freedreno/vulkan/vk_format_layout.csv --- mesa-19.2.8/src/freedreno/vulkan/vk_format_layout.csv 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/vk_format_layout.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,188 +0,0 @@ -/* this is pretty much taken from the gallium one. */ - - -VK_FORMAT_UNDEFINED , plain, 1, 1, u8 , , , , x001, rgb -VK_FORMAT_R4G4_UNORM_PACK8 , plain, 1, 1, un4 , un4 , , , xy01, rgb -VK_FORMAT_R4G4B4A4_UNORM_PACK16 , plain, 1, 1, un4 , un4 , un4 , un4 , wzyx, rgb -VK_FORMAT_B4G4R4A4_UNORM_PACK16 , plain, 1, 1, un4 , un4 , un4 , un4 , wxyz, rgb -VK_FORMAT_R5G6B5_UNORM_PACK16 , plain, 1, 1, un5 , un6 , un5 , , zyx1, rgb -VK_FORMAT_B5G6R5_UNORM_PACK16 , plain, 1, 1, un5 , un6 , un5 , , xyz1, rgb -VK_FORMAT_R5G5B5A1_UNORM_PACK16 , plain, 1, 1, un1 , un5 , un5 , un5 , wzyx, rgb -VK_FORMAT_B5G5R5A1_UNORM_PACK16 , plain, 1, 1, un1 , un5 , un5 , un5 , wxyz, rgb -VK_FORMAT_A1R5G5B5_UNORM_PACK16 , plain, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb -VK_FORMAT_R8_UNORM , plain, 1, 1, un8 , , , , x001, rgb -VK_FORMAT_R8_SNORM , plain, 1, 1, sn8 , , , , x001, rgb -VK_FORMAT_R8_USCALED , plain, 1, 1, us8 , , , , x001, rgb -VK_FORMAT_R8_SSCALED , plain, 1, 1, ss8 , , , , x001, rgb -VK_FORMAT_R8_UINT , plain, 1, 1, up8 , , , , x001, rgb -VK_FORMAT_R8_SINT , plain, 1, 1, sp8 , , , , x001, rgb -VK_FORMAT_R8_SRGB , plain, 1, 1, un8 , , , , x001, srgb -VK_FORMAT_R8G8_UNORM , plain, 1, 1, un8 , un8 , , , xy01, rgb -VK_FORMAT_R8G8_SNORM , plain, 1, 1, sn8 , sn8 , , , xy01, rgb -VK_FORMAT_R8G8_USCALED , plain, 1, 1, us8 , us8 , , , xy01, rgb -VK_FORMAT_R8G8_SSCALED , plain, 1, 1, ss8 , ss8 , , , xy01, rgb -VK_FORMAT_R8G8_UINT , plain, 1, 1, up8 , up8 , , , xy01, rgb -VK_FORMAT_R8G8_SINT , plain, 1, 1, sp8 , sp8 , , , xy01, rgb -VK_FORMAT_R8G8_SRGB , plain, 1, 1, un8 , un8 , , , xy01, srgb -VK_FORMAT_R8G8B8_UNORM , plain, 1, 1, un8 , un8 , un8 , , xyz1, rgb -VK_FORMAT_R8G8B8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , , xyz1, rgb -VK_FORMAT_R8G8B8_USCALED , plain, 1, 1, us8 , us8 , us8 , , xyz1, rgb -VK_FORMAT_R8G8B8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , , xyz1, rgb -VK_FORMAT_R8G8B8_UINT , plain, 1, 1, up8 , up8 , up8 , , xyz1, rgb -VK_FORMAT_R8G8B8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , , xyz1, rgb -VK_FORMAT_R8G8B8_SRGB , plain, 1, 1, un8 , un8 , un8 , , xyz1, srgb -VK_FORMAT_B8G8R8_UNORM , plain, 1, 1, un8 , un8 , un8 , , zyx1, rgb -VK_FORMAT_B8G8R8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , , zyx1, rgb -VK_FORMAT_B8G8R8_USCALED , plain, 1, 1, us8 , us8 , us8 , , zyx1, rgb -VK_FORMAT_B8G8R8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , , zyx1, rgb -VK_FORMAT_B8G8R8_UINT , plain, 1, 1, up8 , up8 , up8 , , zyx1, rgb -VK_FORMAT_B8G8R8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , , zyx1, rgb -VK_FORMAT_B8G8R8_SRGB , plain, 1, 1, un8 , un8 , un8 , , zyx1, srgb -VK_FORMAT_R8G8B8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_USCALED , plain, 1, 1, us8 , us8 , us8 , us8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_UINT , plain, 1, 1, up8 , up8 , up8 , up8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , xyzw, rgb -VK_FORMAT_R8G8B8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb -VK_FORMAT_B8G8R8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_USCALED , plain, 1, 1, us8 , us8 , us8 , us8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_SSCALED , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_UINT , plain, 1, 1, up8 , up8 , up8 , up8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , zyxw, rgb -VK_FORMAT_B8G8R8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, srgb -VK_FORMAT_A8B8G8R8_UNORM_PACK32 , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_SNORM_PACK32 , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_USCALED_PACK32 , plain, 1, 1, us8 , us8 , us8 , us8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_SSCALED_PACK32 , plain, 1, 1, ss8 , ss8 , ss8 , ss8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_UINT_PACK32 , plain, 1, 1, up8 , up8 , up8 , up8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_SINT_PACK32 , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , xyzw, rgb -VK_FORMAT_A8B8G8R8_SRGB_PACK32 , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb -VK_FORMAT_A2R10G10B10_UNORM_PACK32 , plain, 1, 1, un10, un10, un10, un2 , zyxw, rgb -VK_FORMAT_A2R10G10B10_SNORM_PACK32 , plain, 1, 1, sn10, sn10, sn10, sn2 , zyxw, rgb -VK_FORMAT_A2R10G10B10_USCALED_PACK32 , plain, 1, 1, us10, us10, us10, us2 , zyxw, rgb -VK_FORMAT_A2R10G10B10_SSCALED_PACK32 , plain, 1, 1, ss10, ss10, ss10, ss2 , zyxw, rgb -VK_FORMAT_A2R10G10B10_UINT_PACK32 , plain, 1, 1, up10, up10, up10, up2 , zyxw, rgb -VK_FORMAT_A2R10G10B10_SINT_PACK32 , plain, 1, 1, sp10, sp10, sp10, sp2 , zyxw, rgb -VK_FORMAT_A2B10G10R10_UNORM_PACK32 , plain, 1, 1, un10, un10, un10, un2 , xyzw, rgb -VK_FORMAT_A2B10G10R10_SNORM_PACK32 , plain, 1, 1, sn10, sn10, sn10, sn2 , xyzw, rgb -VK_FORMAT_A2B10G10R10_USCALED_PACK32 , plain, 1, 1, us10, us10, us10, us2 , xyzw, rgb -VK_FORMAT_A2B10G10R10_SSCALED_PACK32 , plain, 1, 1, ss10, ss10, ss10, ss2 , xyzw, rgb -VK_FORMAT_A2B10G10R10_UINT_PACK32 , plain, 1, 1, up10, up10, up10, up2 , xyzw, rgb -VK_FORMAT_A2B10G10R10_SINT_PACK32 , plain, 1, 1, sp10, sp10, sp10, sp2 , xyzw, rgb -VK_FORMAT_R16_UNORM , plain, 1, 1, un16, , , , x001, rgb -VK_FORMAT_R16_SNORM , plain, 1, 1, sn16, , , , x001, rgb -VK_FORMAT_R16_USCALED , plain, 1, 1, us16, , , , x001, rgb -VK_FORMAT_R16_SSCALED , plain, 1, 1, ss16, , , , x001, rgb -VK_FORMAT_R16_UINT , plain, 1, 1, up16, , , , x001, rgb -VK_FORMAT_R16_SINT , plain, 1, 1, sp16, , , , x001, rgb -VK_FORMAT_R16_SFLOAT , plain, 1, 1, f16 , , , , x001, rgb -VK_FORMAT_R16G16_UNORM , plain, 1, 1, un16, un16, , , xy01, rgb -VK_FORMAT_R16G16_SNORM , plain, 1, 1, sn16, sn16, , , xy01, rgb -VK_FORMAT_R16G16_USCALED , plain, 1, 1, us16, us16, , , xy01, rgb -VK_FORMAT_R16G16_SSCALED , plain, 1, 1, ss16, ss16, , , xy01, rgb -VK_FORMAT_R16G16_UINT , plain, 1, 1, up16, up16, , , xy01, rgb -VK_FORMAT_R16G16_SINT , plain, 1, 1, sp16, sp16, , , xy01, rgb -VK_FORMAT_R16G16_SFLOAT , plain, 1, 1, f16 , f16 , , , xy01, rgb -VK_FORMAT_R16G16B16_UNORM , plain, 1, 1, un16, un16, un16, , xyz1, rgb -VK_FORMAT_R16G16B16_SNORM , plain, 1, 1, sn16, sn16, sn16, , xyz1, rgb -VK_FORMAT_R16G16B16_USCALED , plain, 1, 1, us16, us16, us16, , xyz1, rgb -VK_FORMAT_R16G16B16_SSCALED , plain, 1, 1, ss16, ss16, ss16, , xyz1, rgb -VK_FORMAT_R16G16B16_UINT , plain, 1, 1, up16, up16, up16, , xyz1, rgb -VK_FORMAT_R16G16B16_SINT , plain, 1, 1, sp16, sp16, sp16, , xyz1, rgb -VK_FORMAT_R16G16B16_SFLOAT , plain, 1, 1, f16 , f16 , f16 , , xyz1, rgb -VK_FORMAT_R16G16B16A16_UNORM , plain, 1, 1, un16, un16, un16, un16, xyzw, rgb -VK_FORMAT_R16G16B16A16_SNORM , plain, 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb -VK_FORMAT_R16G16B16A16_USCALED , plain, 1, 1, us16, us16, us16, us16, xyzw, rgb -VK_FORMAT_R16G16B16A16_SSCALED , plain, 1, 1, ss16, ss16, ss16, ss16, xyzw, rgb -VK_FORMAT_R16G16B16A16_UINT , plain, 1, 1, up16, up16, up16, up16, xyzw, rgb -VK_FORMAT_R16G16B16A16_SINT , plain, 1, 1, sp16, sp16, sp16, sp16, xyzw, rgb -VK_FORMAT_R16G16B16A16_SFLOAT , plain, 1, 1, f16 , f16 , f16 , f16 , xyzw, rgb -VK_FORMAT_R32_UINT , plain, 1, 1, up32, , , , x001, rgb -VK_FORMAT_R32_SINT , plain, 1, 1, sp32, , , , x001, rgb -VK_FORMAT_R32_SFLOAT , plain, 1, 1, f32 , , , , x001, rgb -VK_FORMAT_R32G32_UINT , plain, 1, 1, up32, up32, , , xy01, rgb -VK_FORMAT_R32G32_SINT , plain, 1, 1, sp32, sp32, , , xy01, rgb -VK_FORMAT_R32G32_SFLOAT , plain, 1, 1, f32 , f32 , , , xy01, rgb -VK_FORMAT_R32G32B32_UINT , plain, 1, 1, up32, up32, up32, , xyz1, rgb -VK_FORMAT_R32G32B32_SINT , plain, 1, 1, sp32, sp32, sp32, , xyz1, rgb -VK_FORMAT_R32G32B32_SFLOAT , plain, 1, 1, f32 , f32 , f32 , , xyz1, rgb -VK_FORMAT_R32G32B32A32_UINT , plain, 1, 1, up32, up32, up32, up32, xyzw, rgb -VK_FORMAT_R32G32B32A32_SINT , plain, 1, 1, sp32, sp32, sp32, sp32, xyzw, rgb -VK_FORMAT_R32G32B32A32_SFLOAT , plain, 1, 1, f32 , f32 , f32 , f32 , xyzw, rgb -VK_FORMAT_R64_UINT , plain, 1, 1, up64, , , , x001, rgb -VK_FORMAT_R64_SINT , plain, 1, 1, sp64, , , , x001, rgb -VK_FORMAT_R64_SFLOAT , plain, 1, 1, f64 , , , , x001, rgb -VK_FORMAT_R64G64_UINT , plain, 1, 1, up64, up64, , , xy01, rgb -VK_FORMAT_R64G64_SINT , plain, 1, 1, sp64, sp64, , , xy01, rgb -VK_FORMAT_R64G64_SFLOAT , plain, 1, 1, f64 , f64 , , , xy01, rgb -VK_FORMAT_R64G64B64_UINT , plain, 1, 1, up64, up64, up64, , xyz1, rgb -VK_FORMAT_R64G64B64_SINT , plain, 1, 1, sp64, sp64, sp64, , xyz1, rgb -VK_FORMAT_R64G64B64_SFLOAT , plain, 1, 1, f64 , f64 , f64 , , xyz1, rgb -VK_FORMAT_R64G64B64A64_UINT , plain, 1, 1, up64, up64, up64, up64, xyzw, rgb -VK_FORMAT_R64G64B64A64_SINT , plain, 1, 1, sp64, sp64, sp64, sp64, xyzw, rgb -VK_FORMAT_R64G64B64A64_SFLOAT , plain, 1, 1, f64 , f64 , f64 , f64 , xyzw, rgb -VK_FORMAT_B10G11R11_UFLOAT_PACK32 , other, 1, 1, x32 , , , , xyz1, rgb -VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 , other, 1, 1, x32 , , , , xyz1, rgb -VK_FORMAT_D16_UNORM , plain, 1, 1, un16, , , , x___, zs -VK_FORMAT_X8_D24_UNORM_PACK32 , plain, 1, 1, un24, x8 , , , x___, zs -VK_FORMAT_D32_SFLOAT , plain, 1, 1, f32 , , , , x___, zs -VK_FORMAT_S8_UINT , plain, 1, 1, up8 , , , , _x__, zs -VK_FORMAT_D16_UNORM_S8_UINT , plain, 1, 1, un16, up8 , , , xy__, zs -VK_FORMAT_D24_UNORM_S8_UINT , plain, 1, 1, un24, up8 , , , xy__, zs -VK_FORMAT_D32_SFLOAT_S8_UINT , plain, 1, 1, f32 , up8 , , , xy__, zs -VK_FORMAT_BC1_RGB_UNORM_BLOCK , s3tc, 4, 4, x64 , , , , xyz1, rgb -VK_FORMAT_BC1_RGB_SRGB_BLOCK , s3tc, 4, 4, x64 , , , , xyz1, srgb -VK_FORMAT_BC1_RGBA_UNORM_BLOCK , s3tc, 4, 4, x64 , , , , xyzw, rgb -VK_FORMAT_BC1_RGBA_SRGB_BLOCK , s3tc, 4, 4, x64 , , , , xyzw, srgb -VK_FORMAT_BC2_UNORM_BLOCK , s3tc, 4, 4, x128, , , , xyzw, rgb -VK_FORMAT_BC2_SRGB_BLOCK , s3tc, 4, 4, x128, , , , xyzw, srgb -VK_FORMAT_BC3_UNORM_BLOCK , s3tc, 4, 4, x128, , , , xyzw, rgb -VK_FORMAT_BC3_SRGB_BLOCK , s3tc, 4, 4, x128, , , , xyzw, srgb -VK_FORMAT_BC4_UNORM_BLOCK , rgtc, 4, 4, x64, , , , x001, rgb -VK_FORMAT_BC4_SNORM_BLOCK , rgtc, 4, 4, x64, , , , x001, rgb -VK_FORMAT_BC5_UNORM_BLOCK , rgtc, 4, 4, x128, , , , xy01, rgb -VK_FORMAT_BC5_SNORM_BLOCK , rgtc, 4, 4, x128, , , , xy01, rgb -VK_FORMAT_BC6H_UFLOAT_BLOCK , bptc, 4, 4, x128, , , , xyz1, rgb -VK_FORMAT_BC6H_SFLOAT_BLOCK , bptc, 4, 4, x128, , , , xyz1, rgb -VK_FORMAT_BC7_UNORM_BLOCK , bptc, 4, 4, x128, , , , xyzw, rgb -VK_FORMAT_BC7_SRGB_BLOCK , bptc, 4, 4, x128, , , , xyzw, srgb -VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK , etc, 4, 4, x64, , , , xyz1, rgb -VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK , etc, 4, 4, x64, , , , xyz1, srgb -VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK , etc, 4, 4, x64, , , , xyzw, rgb -VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK , etc, 4, 4, x64, , , , xyzw, srgb -VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK , etc, 4, 4, x128, , , , xyzw, rgb -VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK , etc, 4, 4, x128, , , , xyzw, srgb -VK_FORMAT_EAC_R11_UNORM_BLOCK , etc, 4, 4, x64, , , , x001, rgb -VK_FORMAT_EAC_R11_SNORM_BLOCK , etc, 4, 4, x64, , , , x001, rgb -VK_FORMAT_EAC_R11G11_UNORM_BLOCK , etc, 4, 4, x128, , , , xy01, rgb -VK_FORMAT_EAC_R11G11_SNORM_BLOCK , etc, 4, 4, x128, , , , xy01, rgb -VK_FORMAT_ASTC_4x4_UNORM_BLOCK, -VK_FORMAT_ASTC_4x4_SRGB_BLOCK, -VK_FORMAT_ASTC_5x4_UNORM_BLOCK, -VK_FORMAT_ASTC_5x4_SRGB_BLOCK, -VK_FORMAT_ASTC_5x5_UNORM_BLOCK, -VK_FORMAT_ASTC_5x5_SRGB_BLOCK, -VK_FORMAT_ASTC_6x5_UNORM_BLOCK, -VK_FORMAT_ASTC_6x5_SRGB_BLOCK, -VK_FORMAT_ASTC_6x6_UNORM_BLOCK, -VK_FORMAT_ASTC_6x6_SRGB_BLOCK, -VK_FORMAT_ASTC_8x5_UNORM_BLOCK, -VK_FORMAT_ASTC_8x5_SRGB_BLOCK, -VK_FORMAT_ASTC_8x6_UNORM_BLOCK, -VK_FORMAT_ASTC_8x6_SRGB_BLOCK, -VK_FORMAT_ASTC_8x8_UNORM_BLOCK, -VK_FORMAT_ASTC_8x8_SRGB_BLOCK, -VK_FORMAT_ASTC_10x5_UNORM_BLOCK, -VK_FORMAT_ASTC_10x5_SRGB_BLOCK, -VK_FORMAT_ASTC_10x6_UNORM_BLOCK, -VK_FORMAT_ASTC_10x6_SRGB_BLOCK, -VK_FORMAT_ASTC_10x8_UNORM_BLOCK, -VK_FORMAT_ASTC_10x8_SRGB_BLOCK, -VK_FORMAT_ASTC_10x10_UNORM_BLOCK, -VK_FORMAT_ASTC_10x10_SRGB_BLOCK, -VK_FORMAT_ASTC_12x10_UNORM_BLOCK, -VK_FORMAT_ASTC_12x10_SRGB_BLOCK, -VK_FORMAT_ASTC_12x12_UNORM_BLOCK, -VK_FORMAT_ASTC_12x12_SRGB_BLOCK, diff -Nru mesa-19.2.8/src/freedreno/vulkan/vk_format_table.py mesa-20.0.8/src/freedreno/vulkan/vk_format_table.py --- mesa-19.2.8/src/freedreno/vulkan/vk_format_table.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/freedreno/vulkan/vk_format_table.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ -from __future__ import print_function - -CopyRight = ''' -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ -''' - - -import sys - -from vk_format_parse import * - -def layout_map(layout): - return 'VK_FORMAT_LAYOUT_' + str(layout).upper() - - -def colorspace_map(colorspace): - return 'VK_FORMAT_COLORSPACE_' + str(colorspace).upper() - - -colorspace_channels_map = { - 'rgb': ['r', 'g', 'b', 'a'], - 'srgb': ['sr', 'sg', 'sb', 'a'], - 'zs': ['z', 's'], - 'yuv': ['y', 'u', 'v'], -} - - -type_map = { - VOID: "VK_FORMAT_TYPE_VOID", - UNSIGNED: "VK_FORMAT_TYPE_UNSIGNED", - SIGNED: "VK_FORMAT_TYPE_SIGNED", - FIXED: "VK_FORMAT_TYPE_FIXED", - FLOAT: "VK_FORMAT_TYPE_FLOAT", -} - - -def bool_map(value): - if value: - return "true" - else: - return "false" - - -swizzle_map = { - SWIZZLE_X: "VK_SWIZZLE_X", - SWIZZLE_Y: "VK_SWIZZLE_Y", - SWIZZLE_Z: "VK_SWIZZLE_Z", - SWIZZLE_W: "VK_SWIZZLE_W", - SWIZZLE_0: "VK_SWIZZLE_0", - SWIZZLE_1: "VK_SWIZZLE_1", - SWIZZLE_NONE: "VK_SWIZZLE_NONE", -} - -def print_channels(format, func): - if format.nr_channels() <= 1: - func(format.le_channels, format.le_swizzles) - else: - print('#ifdef PIPE_ARCH_BIG_ENDIAN') - func(format.be_channels, format.be_swizzles) - print('#else') - func(format.le_channels, format.le_swizzles) - print('#endif') - -def write_format_table(formats): - print('/* This file is autogenerated by vk_format_table.py from vk_format_layout.csv. Do not edit directly. */') - print() - # This will print the copyright message on the top of this file - print(CopyRight.strip()) - print() - print('#include "stdbool.h"') - print('#include "vk_format.h"') - print() - - def do_channel_array(channels, swizzles): - print(" {") - for i in range(4): - channel = channels[i] - if i < 3: - sep = "," - else: - sep = "" - if channel.size: - print(" {%s, %s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), bool_map(channel.scaled), channel.size, channel.shift, sep, "xyzw"[i], channel.name)) - else: - print(" {0, 0, 0, 0, 0}%s" % (sep,)) - print(" },") - - def do_swizzle_array(channels, swizzles): - print(" {") - for i in range(4): - swizzle = swizzles[i] - if i < 3: - sep = "," - else: - sep = "" - try: - comment = colorspace_channels_map[format.colorspace][i] - except (KeyError, IndexError): - comment = 'ignored' - print(" %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)) - print(" },") - - for format in formats: - print('static const struct vk_format_description') - print('vk_format_%s_description = {' % (format.short_name(),)) - print(" %s," % (format.name,)) - print(" \"%s\"," % (format.name,)) - print(" \"%s\"," % (format.short_name(),)) - print(" {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())) - print(" %s," % (layout_map(format.layout),)) - print(" %u,\t/* nr_channels */" % (format.nr_channels(),)) - print(" %s,\t/* is_array */" % (bool_map(format.is_array()),)) - print(" %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),)) - print(" %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)) - print_channels(format, do_channel_array) - print_channels(format, do_swizzle_array) - print(" %s," % (colorspace_map(format.colorspace),)) - print("};") - print() - - print("const struct vk_format_description *") - print("vk_format_description(VkFormat format)") - print("{") - print(" if (format > VK_FORMAT_END_RANGE) {") - print(" return NULL;") - print(" }") - print() - print(" switch (format) {") - for format in formats: - print(" case %s:" % format.name) - print(" return &vk_format_%s_description;" % (format.short_name(),)) - print(" default:") - print(" return NULL;") - print(" }") - print("}") - print() - - -def main(): - - formats = [] - for arg in sys.argv[1:]: - formats.extend(parse(arg)) - write_format_table(formats) - - -if __name__ == '__main__': - main() diff -Nru mesa-19.2.8/src/gallium/Android.common.mk mesa-20.0.8/src/gallium/Android.common.mk --- mesa-19.2.8/src/gallium/Android.common.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/Android.common.mk 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,7 @@ $(GALLIUM_TOP)/auxiliary \ $(GALLIUM_TOP)/winsys \ $(GALLIUM_TOP)/drivers \ + $(MESA_TOP)/src/etnaviv \ $(MESA_TOP)/src/freedreno \ $(MESA_TOP)/src/freedreno/ir3 \ $(MESA_TOP)/src/freedreno/registers diff -Nru mesa-19.2.8/src/gallium/Android.mk mesa-20.0.8/src/gallium/Android.mk --- mesa-19.2.8/src/gallium/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -49,6 +49,7 @@ SUBDIRS += state_trackers/dri SUBDIRS += winsys/iris/drm drivers/iris SUBDIRS += winsys/lima/drm drivers/lima +SUBDIRS += winsys/panfrost/drm drivers/panfrost # sort to eliminate any duplicates INC_DIRS := $(call all-named-subdir-makefiles,$(sort $(SUBDIRS))) diff -Nru mesa-19.2.8/src/gallium/auxiliary/Android.mk mesa-20.0.8/src/gallium/auxiliary/Android.mk --- mesa-19.2.8/src/gallium/auxiliary/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -70,10 +70,18 @@ $(intermediates)/util/u_format_srgb.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(transform-generated-source) -$(intermediates)/util/u_format_table.c: $(intermediates)/%.c: $(LOCAL_PATH)/%.py $(LOCAL_PATH)/util/u_format.csv - $(transform-generated-source) - LOCAL_GENERATED_SOURCES += $(MESA_GEN_NIR_H) include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) + +# Build libmesa_galliumvl used by radeonsi +include $(CLEAR_VARS) + +LOCAL_SRC_FILES := \ + $(VL_SOURCES) + +LOCAL_MODULE := libmesa_galliumvl + +include $(GALLIUM_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff -Nru mesa-19.2.8/src/gallium/auxiliary/cso_cache/cso_context.c mesa-20.0.8/src/gallium/auxiliary/cso_cache/cso_context.c --- mesa-19.2.8/src/gallium/auxiliary/cso_cache/cso_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/cso_cache/cso_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -64,7 +64,10 @@ struct cso_context { struct pipe_context *pipe; struct cso_cache *cache; + struct u_vbuf *vbuf; + struct u_vbuf *vbuf_current; + bool always_use_vbuf; boolean has_geometry_shader; boolean has_tessellation; @@ -287,15 +290,22 @@ static void cso_init_vbuf(struct cso_context *cso, unsigned flags) { struct u_vbuf_caps caps; + bool uses_user_vertex_buffers = !(flags & CSO_NO_USER_VERTEX_BUFFERS); + + u_vbuf_get_caps(cso->pipe->screen, &caps); - /* Install u_vbuf if there is anything unsupported. */ - if (u_vbuf_get_caps(cso->pipe->screen, &caps, flags)) { + /* Enable u_vbuf if needed. */ + if (caps.fallback_always || + (uses_user_vertex_buffers && + caps.fallback_only_for_user_vbuffers)) { cso->vbuf = u_vbuf_create(cso->pipe, &caps); + cso->vbuf_current = cso->vbuf; + cso->always_use_vbuf = caps.fallback_always; } } struct cso_context * -cso_create_context(struct pipe_context *pipe, unsigned u_vbuf_flags) +cso_create_context(struct pipe_context *pipe, unsigned flags) { struct cso_context *ctx = CALLOC_STRUCT(cso_context); if (!ctx) @@ -311,7 +321,7 @@ ctx->pipe = pipe; ctx->sample_mask = ~0; - cso_init_vbuf(ctx, u_vbuf_flags); + cso_init_vbuf(ctx, flags); /* Enable for testing: */ if (0) cso_set_maximum_cache_size( ctx->cache, 4 ); @@ -652,16 +662,6 @@ } } -void cso_delete_fragment_shader(struct cso_context *ctx, void *handle ) -{ - if (handle == ctx->fragment_shader) { - /* unbind before deleting */ - ctx->pipe->bind_fs_state(ctx->pipe, NULL); - ctx->fragment_shader = NULL; - } - ctx->pipe->delete_fs_state(ctx->pipe, handle); -} - static void cso_save_fragment_shader(struct cso_context *ctx) { @@ -688,16 +688,6 @@ } } -void cso_delete_vertex_shader(struct cso_context *ctx, void *handle ) -{ - if (handle == ctx->vertex_shader) { - /* unbind before deleting */ - ctx->pipe->bind_vs_state(ctx->pipe, NULL); - ctx->vertex_shader = NULL; - } - ctx->pipe->delete_vs_state(ctx->pipe, handle); -} - static void cso_save_vertex_shader(struct cso_context *ctx) { @@ -904,16 +894,6 @@ } } -void cso_delete_geometry_shader(struct cso_context *ctx, void *handle) -{ - if (handle == ctx->geometry_shader) { - /* unbind before deleting */ - ctx->pipe->bind_gs_state(ctx->pipe, NULL); - ctx->geometry_shader = NULL; - } - ctx->pipe->delete_gs_state(ctx->pipe, handle); -} - static void cso_save_geometry_shader(struct cso_context *ctx) { @@ -949,16 +929,6 @@ } } -void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle) -{ - if (handle == ctx->tessctrl_shader) { - /* unbind before deleting */ - ctx->pipe->bind_tcs_state(ctx->pipe, NULL); - ctx->tessctrl_shader = NULL; - } - ctx->pipe->delete_tcs_state(ctx->pipe, handle); -} - static void cso_save_tessctrl_shader(struct cso_context *ctx) { @@ -994,16 +964,6 @@ } } -void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle) -{ - if (handle == ctx->tesseval_shader) { - /* unbind before deleting */ - ctx->pipe->bind_tes_state(ctx->pipe, NULL); - ctx->tesseval_shader = NULL; - } - ctx->pipe->delete_tes_state(ctx->pipe, handle); -} - static void cso_save_tesseval_shader(struct cso_context *ctx) { @@ -1039,32 +999,16 @@ } } -void cso_delete_compute_shader(struct cso_context *ctx, void *handle) -{ - if (handle == ctx->compute_shader) { - /* unbind before deleting */ - ctx->pipe->bind_compute_state(ctx->pipe, NULL); - ctx->compute_shader = NULL; - } - ctx->pipe->delete_compute_state(ctx->pipe, handle); -} - -enum pipe_error -cso_set_vertex_elements(struct cso_context *ctx, - unsigned count, - const struct pipe_vertex_element *states) +static void +cso_set_vertex_elements_direct(struct cso_context *ctx, + unsigned count, + const struct pipe_vertex_element *states) { - struct u_vbuf *vbuf = ctx->vbuf; unsigned key_size, hash_key; struct cso_hash_iter iter; void *handle; struct cso_velems_state velems_state; - if (vbuf) { - u_vbuf_set_vertex_elements(vbuf, count, states); - return PIPE_OK; - } - /* Need to include the count into the stored state data too. * Otherwise first few count pipe_vertex_elements could be identical * even if count is different, and there's no guarantee the hash would @@ -1081,7 +1025,7 @@ if (cso_hash_iter_is_null(iter)) { struct cso_velements *cso = MALLOC(sizeof(struct cso_velements)); if (!cso) - return PIPE_ERROR_OUT_OF_MEMORY; + return; memcpy(&cso->state, &velems_state, key_size); cso->data = ctx->pipe->create_vertex_elements_state(ctx->pipe, count, @@ -1093,7 +1037,7 @@ iter = cso_insert_state(ctx->cache, hash_key, CSO_VELEMENTS, cso); if (cso_hash_iter_is_null(iter)) { FREE(cso); - return PIPE_ERROR_OUT_OF_MEMORY; + return; } handle = cso->data; @@ -1106,13 +1050,28 @@ ctx->velements = handle; ctx->pipe->bind_vertex_elements_state(ctx->pipe, handle); } +} + +enum pipe_error +cso_set_vertex_elements(struct cso_context *ctx, + unsigned count, + const struct pipe_vertex_element *states) +{ + struct u_vbuf *vbuf = ctx->vbuf_current; + + if (vbuf) { + u_vbuf_set_vertex_elements(vbuf, count, states); + return PIPE_OK; + } + + cso_set_vertex_elements_direct(ctx, count, states); return PIPE_OK; } static void cso_save_vertex_elements(struct cso_context *ctx) { - struct u_vbuf *vbuf = ctx->vbuf; + struct u_vbuf *vbuf = ctx->vbuf_current; if (vbuf) { u_vbuf_save_vertex_elements(vbuf); @@ -1126,7 +1085,7 @@ static void cso_restore_vertex_elements(struct cso_context *ctx) { - struct u_vbuf *vbuf = ctx->vbuf; + struct u_vbuf *vbuf = ctx->vbuf_current; if (vbuf) { u_vbuf_restore_vertex_elements(vbuf); @@ -1142,11 +1101,32 @@ /* vertex buffers */ +static void +cso_set_vertex_buffers_direct(struct cso_context *ctx, + unsigned start_slot, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + /* Save what's in the auxiliary slot, so that we can save and restore it + * for meta ops. + */ + if (start_slot == 0) { + if (buffers) { + pipe_vertex_buffer_reference(&ctx->vertex_buffer0_current, + buffers); + } else { + pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current); + } + } + + ctx->pipe->set_vertex_buffers(ctx->pipe, start_slot, count, buffers); +} + + void cso_set_vertex_buffers(struct cso_context *ctx, unsigned start_slot, unsigned count, const struct pipe_vertex_buffer *buffers) { - struct u_vbuf *vbuf = ctx->vbuf; + struct u_vbuf *vbuf = ctx->vbuf_current; if (!count) return; @@ -1156,24 +1136,13 @@ return; } - /* Save what's in the auxiliary slot, so that we can save and restore it - * for meta ops. */ - if (start_slot == 0) { - if (buffers) { - pipe_vertex_buffer_reference(&ctx->vertex_buffer0_current, - buffers); - } else { - pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current); - } - } - - ctx->pipe->set_vertex_buffers(ctx->pipe, start_slot, count, buffers); + cso_set_vertex_buffers_direct(ctx, start_slot, count, buffers); } static void cso_save_vertex_buffer0(struct cso_context *ctx) { - struct u_vbuf *vbuf = ctx->vbuf; + struct u_vbuf *vbuf = ctx->vbuf_current; if (vbuf) { u_vbuf_save_vertex_buffer0(vbuf); @@ -1187,7 +1156,7 @@ static void cso_restore_vertex_buffer0(struct cso_context *ctx) { - struct u_vbuf *vbuf = ctx->vbuf; + struct u_vbuf *vbuf = ctx->vbuf_current; if (vbuf) { u_vbuf_restore_vertex_buffer0(vbuf); @@ -1198,6 +1167,68 @@ pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_saved); } +/** + * Set vertex buffers and vertex elements. Skip u_vbuf if it's only needed + * for user vertex buffers and user vertex buffers are not set by this call. + * u_vbuf will be disabled. To re-enable u_vbuf, call this function again. + * + * Skipping u_vbuf decreases CPU overhead for draw calls that don't need it, + * such as VBOs, glBegin/End, and display lists. + * + * Internal operations that do "save states, draw, restore states" shouldn't + * use this, because the states are only saved in either cso_context or + * u_vbuf, not both. + */ +void +cso_set_vertex_buffers_and_elements(struct cso_context *ctx, + unsigned velem_count, + const struct pipe_vertex_element *velems, + unsigned vb_count, + unsigned unbind_trailing_vb_count, + const struct pipe_vertex_buffer *vbuffers, + bool uses_user_vertex_buffers) +{ + struct u_vbuf *vbuf = ctx->vbuf; + + if (vbuf && (ctx->always_use_vbuf || uses_user_vertex_buffers)) { + if (!ctx->vbuf_current) { + /* Unbind all buffers in cso_context, because we'll use u_vbuf. */ + unsigned unbind_vb_count = vb_count + unbind_trailing_vb_count; + if (unbind_vb_count) + cso_set_vertex_buffers_direct(ctx, 0, unbind_vb_count, NULL); + + /* Unset this to make sure the CSO is re-bound on the next use. */ + ctx->velements = NULL; + ctx->vbuf_current = vbuf; + } else if (unbind_trailing_vb_count) { + u_vbuf_set_vertex_buffers(vbuf, vb_count, unbind_trailing_vb_count, + NULL); + } + + if (vb_count) + u_vbuf_set_vertex_buffers(vbuf, 0, vb_count, vbuffers); + u_vbuf_set_vertex_elements(vbuf, velem_count, velems); + return; + } + + if (ctx->vbuf_current) { + /* Unbind all buffers in u_vbuf, because we'll use cso_context. */ + unsigned unbind_vb_count = vb_count + unbind_trailing_vb_count; + if (unbind_vb_count) + u_vbuf_set_vertex_buffers(vbuf, 0, unbind_vb_count, NULL); + + /* Unset this to make sure the CSO is re-bound on the next use. */ + u_vbuf_unset_vertex_elements(vbuf); + ctx->vbuf_current = NULL; + } else if (unbind_trailing_vb_count) { + cso_set_vertex_buffers_direct(ctx, vb_count, unbind_trailing_vb_count, + NULL); + } + + if (vb_count) + cso_set_vertex_buffers_direct(ctx, 0, vb_count, vbuffers); + cso_set_vertex_elements_direct(ctx, velem_count, velems); +} void cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage, @@ -1693,7 +1724,7 @@ cso_draw_vbo(struct cso_context *cso, const struct pipe_draw_info *info) { - struct u_vbuf *vbuf = cso->vbuf; + struct u_vbuf *vbuf = cso->vbuf_current; /* We can't have both indirect drawing and SO-vertex-count drawing */ assert(info->indirect == NULL || info->count_from_stream_output == NULL); diff -Nru mesa-19.2.8/src/gallium/auxiliary/cso_cache/cso_context.h mesa-20.0.8/src/gallium/auxiliary/cso_cache/cso_context.h --- mesa-19.2.8/src/gallium/auxiliary/cso_cache/cso_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/cso_cache/cso_context.h 2020-06-12 01:21:16.000000000 +0000 @@ -41,8 +41,10 @@ struct cso_context; struct u_vbuf; +#define CSO_NO_USER_VERTEX_BUFFERS (1 << 0) + struct cso_context *cso_create_context(struct pipe_context *pipe, - unsigned u_vbuf_flags); + unsigned flags); void cso_destroy_context( struct cso_context *cso ); struct pipe_context *cso_get_pipe_context(struct cso_context *cso); @@ -101,27 +103,11 @@ */ void cso_set_fragment_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_fragment_shader(struct cso_context *ctx, void *handle ); - - void cso_set_vertex_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_vertex_shader(struct cso_context *ctx, void *handle ); - - void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_geometry_shader(struct cso_context *ctx, void *handle); - - void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle); - - void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle); - - void cso_set_compute_shader_handle(struct cso_context *ctx, void *handle); -void cso_delete_compute_shader(struct cso_context *ctx, void *handle); void cso_set_framebuffer(struct cso_context *cso, @@ -217,6 +203,15 @@ void cso_restore_constant_buffer_slot0(struct cso_context *cso, enum pipe_shader_type shader_stage); +/* Optimized version. */ +void +cso_set_vertex_buffers_and_elements(struct cso_context *ctx, + unsigned velem_count, + const struct pipe_vertex_element *velems, + unsigned vb_count, + unsigned unbind_trailing_vb_count, + const struct pipe_vertex_buffer *vbuffers, + bool uses_user_vertex_buffers); /* drawing */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_context.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_context.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,14 +38,14 @@ #include "util/u_inlines.h" #include "util/u_helpers.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "draw_context.h" #include "draw_pipe.h" #include "draw_prim_assembler.h" #include "draw_vs.h" #include "draw_gs.h" -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_limits.h" #include "draw_llvm.h" @@ -63,6 +63,15 @@ } #endif +bool +draw_has_llvm(void) +{ +#ifdef LLVM_AVAILABLE + return draw_get_option_use_llvm(); +#else + return false; +#endif +} /** * Create new draw module context with gallivm state for LLVM JIT. @@ -78,7 +87,7 @@ /* we need correct cpu caps for disabling denorms in draw_vbo() */ util_cpu_detect(); -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE if (try_llvm && draw_get_option_use_llvm()) { draw->llvm = draw_llvm_create(draw, (LLVMContextRef)context); } @@ -112,7 +121,7 @@ } -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE struct draw_context * draw_create_with_llvm_context(struct pipe_context *pipe, void *context) @@ -220,7 +229,7 @@ draw_pt_destroy( draw ); draw_vs_destroy( draw ); draw_gs_destroy( draw ); -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->llvm) draw_llvm_destroy( draw->llvm ); #endif @@ -1027,13 +1036,34 @@ draw->num_samplers[shader_stage] = num; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->llvm) draw_llvm_set_sampler_state(draw, shader_stage); #endif } void +draw_set_images(struct draw_context *draw, + enum pipe_shader_type shader_stage, + struct pipe_image_view *views, + unsigned num) +{ + unsigned i; + + debug_assert(shader_stage < PIPE_SHADER_TYPES); + debug_assert(num <= PIPE_MAX_SHADER_IMAGES); + + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + for (i = 0; i < num; ++i) + draw->images[shader_stage][i] = &views[i]; + for (i = num; i < draw->num_sampler_views[shader_stage]; ++i) + draw->images[shader_stage][i] = NULL; + + draw->num_images[shader_stage] = num; +} + +void draw_set_mapped_texture(struct draw_context *draw, enum pipe_shader_type shader_stage, unsigned sview_idx, @@ -1044,7 +1074,7 @@ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]) { -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->llvm) draw_llvm_set_mapped_texture(draw, shader_stage, @@ -1055,6 +1085,26 @@ #endif } +void +draw_set_mapped_image(struct draw_context *draw, + enum pipe_shader_type shader_stage, + unsigned idx, + uint32_t width, uint32_t height, uint32_t depth, + const void *base_ptr, + uint32_t row_stride, + uint32_t img_stride) +{ +#ifdef LLVM_AVAILABLE + if (draw->llvm) + draw_llvm_set_mapped_image(draw, + shader_stage, + idx, + width, height, depth, + base_ptr, + row_stride, img_stride); +#endif +} + /** * XXX: Results for PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS because there are two * different ways of setting textures, and drivers typically only support one. @@ -1082,7 +1132,7 @@ draw_get_shader_param(enum pipe_shader_type shader, enum pipe_shader_cap param) { -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw_get_option_use_llvm()) { switch(shader) { case PIPE_SHADER_VERTEX: @@ -1115,6 +1165,15 @@ } /** + * Enable/disable primitives generated gathering. + */ +void draw_collect_primitives_generated(struct draw_context *draw, + bool enable) +{ + draw->collect_primgen = enable; +} + +/** * Computes clipper invocation statistics. * * Figures out how many primitives would have been diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_context.h mesa-20.0.8/src/gallium/auxiliary/draw/draw_context.h --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_context.h 2020-06-12 01:21:16.000000000 +0000 @@ -63,9 +63,11 @@ int internal_offset; }; +bool draw_has_llvm(void); + struct draw_context *draw_create( struct pipe_context *pipe ); -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE struct draw_context *draw_create_with_llvm_context(struct pipe_context *pipe, void *context); #endif @@ -177,6 +179,12 @@ unsigned num); void +draw_set_images(struct draw_context *draw, + enum pipe_shader_type shader_stage, + struct pipe_image_view *images, + unsigned num); + +void draw_set_mapped_texture(struct draw_context *draw, enum pipe_shader_type shader_stage, unsigned sview_idx, @@ -187,6 +195,14 @@ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]); +void +draw_set_mapped_image(struct draw_context *draw, + enum pipe_shader_type shader_stage, + unsigned idx, + uint32_t width, uint32_t height, uint32_t depth, + const void *base_ptr, + uint32_t row_stride, + uint32_t img_stride); /* * Vertex shader functions @@ -298,6 +314,9 @@ void draw_collect_pipeline_statistics(struct draw_context *draw, boolean enable); +void draw_collect_primitives_generated(struct draw_context *draw, + bool eanble); + /******************************************************************************* * Draw pipeline */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_fs.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_fs.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_fs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_fs.c 2020-06-12 01:21:16.000000000 +0000 @@ -32,6 +32,7 @@ #include "util/u_prim.h" #include "tgsi/tgsi_parse.h" +#include "nir/nir_to_tgsi_info.h" #include "draw_fs.h" #include "draw_private.h" @@ -47,7 +48,10 @@ dfs = CALLOC_STRUCT(draw_fragment_shader); if (dfs) { dfs->base = *shader; - tgsi_scan_shader(shader->tokens, &dfs->info); + if (shader->type == PIPE_SHADER_IR_TGSI) + tgsi_scan_shader(shader->tokens, &dfs->info); + else + nir_tgsi_scan_shader(shader->ir.nir, &dfs->info, true); } return dfs; diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_gs.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_gs.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_gs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_gs.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,19 +29,19 @@ #include "draw_private.h" #include "draw_context.h" -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE #include "draw_llvm.h" #endif #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_exec.h" - +#include "nir/nir_to_tgsi_info.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_prim.h" - +#include "util/ralloc.h" /* fixme: move it from here */ #define MAX_PRIMITIVES 64 @@ -239,7 +239,7 @@ } } -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE static void llvm_fetch_gs_input(struct draw_geometry_shader *shader, @@ -632,7 +632,7 @@ shader->input = input; shader->input_info = input_info; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (shader->draw->llvm) { shader->gs_output = output_verts[0].verts; if (max_out_prims > shader->max_out_prims) { @@ -765,14 +765,14 @@ draw_create_geometry_shader(struct draw_context *draw, const struct pipe_shader_state *state) { -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE boolean use_llvm = draw->llvm != NULL; struct llvm_geometry_shader *llvm_gs = NULL; #endif struct draw_geometry_shader *gs; unsigned i; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (use_llvm) { llvm_gs = CALLOC_STRUCT(llvm_geometry_shader); @@ -793,18 +793,22 @@ gs->draw = draw; gs->state = *state; - gs->state.tokens = tgsi_dup_tokens(state->tokens); - if (!gs->state.tokens) { - FREE(gs); - return NULL; - } - tgsi_scan_shader(state->tokens, &gs->info); + if (state->type == PIPE_SHADER_IR_TGSI) { + gs->state.tokens = tgsi_dup_tokens(state->tokens); + if (!gs->state.tokens) { + FREE(gs); + return NULL; + } + + tgsi_scan_shader(state->tokens, &gs->info); + } else + nir_tgsi_scan_shader(state->ir.nir, &gs->info, true); /* setup the defaults */ gs->max_out_prims = 0; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (use_llvm) { /* TODO: change the input array to handle the following vector length, instead of the currently hardcoded @@ -861,7 +865,7 @@ gs->num_vertex_streams = gs->state.stream_output.output[i].stream + 1; } -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (use_llvm) { int vector_size = gs->vector_length * sizeof(float); gs->gs_input = align_malloc(sizeof(struct draw_gs_inputs), 16); @@ -883,7 +887,8 @@ llvm_gs->variant_key_size = draw_gs_llvm_variant_key_size( MAX2(gs->info.file_max[TGSI_FILE_SAMPLER]+1, - gs->info.file_max[TGSI_FILE_SAMPLER_VIEW]+1)); + gs->info.file_max[TGSI_FILE_SAMPLER_VIEW]+1), + gs->info.file_max[TGSI_FILE_IMAGE]+1); } else #endif { @@ -920,7 +925,7 @@ if (!dgs) { return; } -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->llvm) { struct llvm_geometry_shader *shader = llvm_geometry_shader(dgs); struct draw_gs_llvm_variant_list_item *li; @@ -951,12 +956,15 @@ for (i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) FREE(dgs->stream[i].primitive_lengths); + + if (dgs->state.ir.nir) + ralloc_free(dgs->state.ir.nir); FREE((void*) dgs->state.tokens); FREE(dgs); } -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE void draw_gs_set_current_variant(struct draw_geometry_shader *shader, struct draw_gs_llvm_variant *variant) { diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_gs.h mesa-20.0.8/src/gallium/auxiliary/draw/draw_gs.h --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_gs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_gs.h 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@ struct draw_context; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE struct draw_gs_jit_context; struct draw_gs_llvm_variant; @@ -96,7 +96,7 @@ unsigned num_invocations; unsigned invocation_id; -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE struct draw_gs_inputs *gs_input; struct draw_gs_jit_context *jit_context; struct draw_gs_llvm_variant *current_variant; @@ -146,7 +146,7 @@ int draw_gs_max_output_vertices(struct draw_geometry_shader *shader, unsigned pipe_prim); -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE void draw_gs_set_current_variant(struct draw_geometry_shader *shader, struct draw_gs_llvm_variant *variant); #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm.c 2020-06-12 01:21:16.000000000 +0000 @@ -43,6 +43,7 @@ #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" #include "gallivm/lp_bld_tgsi.h" +#include "gallivm/lp_bld_nir.h" #include "gallivm/lp_bld_printf.h" #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_init.h" @@ -67,14 +68,14 @@ struct draw_gs_llvm_iface { - struct lp_build_tgsi_gs_iface base; + struct lp_build_gs_iface base; struct draw_gs_llvm_variant *variant; LLVMValueRef input; }; static inline const struct draw_gs_llvm_iface * -draw_gs_llvm_iface(const struct lp_build_tgsi_gs_iface *iface) +draw_gs_llvm_iface(const struct lp_build_gs_iface *iface) { return (const struct draw_gs_llvm_iface *)iface; } @@ -145,12 +146,6 @@ LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, depth, target, texture_type, DRAW_JIT_TEXTURE_DEPTH); - LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, first_level, - target, texture_type, - DRAW_JIT_TEXTURE_FIRST_LEVEL); - LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, last_level, - target, texture_type, - DRAW_JIT_TEXTURE_LAST_LEVEL); LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, base, target, texture_type, DRAW_JIT_TEXTURE_BASE); @@ -160,6 +155,12 @@ LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, img_stride, target, texture_type, DRAW_JIT_TEXTURE_IMG_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, first_level, + target, texture_type, + DRAW_JIT_TEXTURE_FIRST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, last_level, + target, texture_type, + DRAW_JIT_TEXTURE_LAST_LEVEL); LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, mip_offsets, target, texture_type, DRAW_JIT_TEXTURE_MIP_OFFSETS); @@ -208,6 +209,52 @@ return sampler_type; } +/** + * Create LLVM type for struct draw_jit_texture + */ +static LLVMTypeRef +create_jit_image_type(struct gallivm_state *gallivm, const char *struct_name) +{ + LLVMTargetDataRef target = gallivm->target; + LLVMTypeRef image_type; + LLVMTypeRef elem_types[DRAW_JIT_IMAGE_NUM_FIELDS]; + LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); + + elem_types[DRAW_JIT_IMAGE_WIDTH] = + elem_types[DRAW_JIT_IMAGE_HEIGHT] = + elem_types[DRAW_JIT_IMAGE_DEPTH] = + elem_types[DRAW_JIT_IMAGE_ROW_STRIDE] = + elem_types[DRAW_JIT_IMAGE_IMG_STRIDE] = int32_type; + elem_types[DRAW_JIT_IMAGE_BASE] = + LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + + image_type = LLVMStructTypeInContext(gallivm->context, elem_types, + ARRAY_SIZE(elem_types), 0); + + (void) target; /* silence unused var warning for non-debug build */ + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, width, + target, image_type, + DRAW_JIT_IMAGE_WIDTH); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, height, + target, image_type, + DRAW_JIT_IMAGE_HEIGHT); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, depth, + target, image_type, + DRAW_JIT_IMAGE_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, base, + target, image_type, + DRAW_JIT_IMAGE_BASE); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, row_stride, + target, image_type, + DRAW_JIT_IMAGE_ROW_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_image, img_stride, + target, image_type, + DRAW_JIT_IMAGE_IMG_STRIDE); + + LP_CHECK_STRUCT_SIZE(struct draw_jit_image, target, image_type); + + return image_type; +} /** * Create LLVM type for struct draw_jit_context @@ -215,6 +262,7 @@ static LLVMTypeRef create_jit_context_type(struct gallivm_state *gallivm, LLVMTypeRef texture_type, LLVMTypeRef sampler_type, + LLVMTypeRef image_type, const char *struct_name) { LLVMTargetDataRef target = gallivm->target; @@ -234,9 +282,11 @@ PIPE_MAX_SHADER_SAMPLER_VIEWS); /* textures */ elem_types[5] = LLVMArrayType(sampler_type, PIPE_MAX_SAMPLERS); /* samplers */ - elem_types[6] = LLVMArrayType(LLVMPointerType(int_type, 0), /* vs_ssbo */ + elem_types[6] = LLVMArrayType(image_type, + PIPE_MAX_SHADER_IMAGES); /* images */ + elem_types[7] = LLVMArrayType(LLVMPointerType(int_type, 0), /* vs_ssbo */ LP_MAX_TGSI_SHADER_BUFFERS); - elem_types[7] = LLVMArrayType(int_type, /* num_vs_ssbos */ + elem_types[8] = LLVMArrayType(int_type, /* num_vs_ssbos */ LP_MAX_TGSI_SHADER_BUFFERS); context_type = LLVMStructTypeInContext(gallivm->context, elem_types, ARRAY_SIZE(elem_types), 0); @@ -256,6 +306,8 @@ LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, samplers, target, context_type, DRAW_JIT_CTX_SAMPLERS); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, images, + target, context_type, DRAW_JIT_CTX_IMAGES); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, vs_ssbos, target, context_type, DRAW_JIT_CTX_SSBOS); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, num_vs_ssbos, @@ -274,6 +326,7 @@ create_gs_jit_context_type(struct gallivm_state *gallivm, unsigned vector_length, LLVMTypeRef texture_type, LLVMTypeRef sampler_type, + LLVMTypeRef image_type, const char *struct_name) { LLVMTargetDataRef target = gallivm->target; @@ -294,17 +347,19 @@ PIPE_MAX_SHADER_SAMPLER_VIEWS); /* textures */ elem_types[5] = LLVMArrayType(sampler_type, PIPE_MAX_SAMPLERS); /* samplers */ - - elem_types[6] = LLVMPointerType(LLVMPointerType(int_type, 0), 0); - elem_types[7] = LLVMPointerType(LLVMVectorType(int_type, - vector_length), 0); + elem_types[6] = LLVMArrayType(image_type, + PIPE_MAX_SHADER_IMAGES); /* images */ + elem_types[7] = LLVMPointerType(LLVMPointerType(int_type, 0), 0); elem_types[8] = LLVMPointerType(LLVMVectorType(int_type, vector_length), 0); + elem_types[9] = LLVMPointerType(LLVMVectorType(int_type, + vector_length), 0); - elem_types[9] = LLVMArrayType(LLVMPointerType(int_type, 0), /* ssbos */ + elem_types[10] = LLVMArrayType(LLVMPointerType(int_type, 0), /* ssbos */ LP_MAX_TGSI_SHADER_BUFFERS); - elem_types[10] = LLVMArrayType(int_type, /* num_ssbos */ + elem_types[11] = LLVMArrayType(int_type, /* num_ssbos */ LP_MAX_TGSI_SHADER_BUFFERS); + context_type = LLVMStructTypeInContext(gallivm->context, elem_types, ARRAY_SIZE(elem_types), 0); @@ -336,6 +391,8 @@ target, context_type, DRAW_GS_JIT_CTX_SSBOS); LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, num_ssbos, target, context_type, DRAW_GS_JIT_CTX_NUM_SSBOS); + LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, images, + target, context_type, DRAW_GS_JIT_CTX_IMAGES); LP_CHECK_STRUCT_SIZE(struct draw_gs_jit_context, target, context_type); @@ -449,12 +506,14 @@ { struct gallivm_state *gallivm = variant->gallivm; LLVMTypeRef texture_type, sampler_type, context_type, buffer_type, - vb_type; + vb_type, image_type; texture_type = create_jit_texture_type(gallivm, "texture"); sampler_type = create_jit_sampler_type(gallivm, "sampler"); + image_type = create_jit_image_type(gallivm, "image"); context_type = create_jit_context_type(gallivm, texture_type, sampler_type, + image_type, "draw_jit_context"); variant->context_ptr_type = LLVMPointerType(context_type, 0); @@ -588,7 +647,10 @@ memcpy(&variant->key, key, shader->variant_key_size); if (gallivm_debug & (GALLIVM_DEBUG_TGSI | GALLIVM_DEBUG_IR)) { - tgsi_dump(llvm->draw->vs.vertex_shader->state.tokens, 0); + if (llvm->draw->vs.vertex_shader->state.type == PIPE_SHADER_IR_TGSI) + tgsi_dump(llvm->draw->vs.vertex_shader->state.tokens, 0); + else + nir_print_shader(llvm->draw->vs.vertex_shader->state.ir.nir, stderr); draw_llvm_dump_variant_key(&variant->key); } @@ -623,6 +685,7 @@ const struct lp_bld_tgsi_system_values *system_values, LLVMValueRef context_ptr, const struct lp_build_sampler_soa *draw_sampler, + const struct lp_build_image_soa *draw_image, boolean clamp_vertex_color, struct lp_build_mask_context *bld_mask) { @@ -651,11 +714,19 @@ params.info = &llvm->draw->vs.vertex_shader->info; params.ssbo_ptr = ssbos_ptr; params.ssbo_sizes_ptr = num_ssbos_ptr; + params.image = draw_image; - lp_build_tgsi_soa(variant->gallivm, - tokens, - ¶ms, - outputs); + if (llvm->draw->vs.vertex_shader->state.ir.nir && + llvm->draw->vs.vertex_shader->state.type == PIPE_SHADER_IR_NIR) + lp_build_nir_soa(variant->gallivm, + llvm->draw->vs.vertex_shader->state.ir.nir, + ¶ms, + outputs); + else + lp_build_tgsi_soa(variant->gallivm, + tokens, + ¶ms, + outputs); { LLVMValueRef out; @@ -866,7 +937,7 @@ adjust_mask(struct gallivm_state *gallivm, LLVMValueRef mask) { -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN LLVMBuilderRef builder = gallivm->builder; LLVMValueRef vertex_id; LLVMValueRef clipmask; @@ -1402,8 +1473,8 @@ } static LLVMValueRef -draw_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, +draw_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, boolean is_vindex_indirect, LLVMValueRef vertex_index, boolean is_aindex_indirect, @@ -1411,15 +1482,15 @@ LLVMValueRef swizzle_index) { const struct draw_gs_llvm_iface *gs = draw_gs_llvm_iface(gs_iface); - struct gallivm_state *gallivm = bld_base->base.gallivm; + struct gallivm_state *gallivm = bld->gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef indices[3]; LLVMValueRef res; - struct lp_type type = bld_base->base.type; + struct lp_type type = bld->type; if (is_vindex_indirect || is_aindex_indirect) { int i; - res = bld_base->base.zero; + res = bld->zero; for (i = 0; i < type.length; ++i) { LLVMValueRef idx = lp_build_const_int32(gallivm, i); LLVMValueRef vert_chan_index = vertex_index; @@ -1458,16 +1529,17 @@ } static void -draw_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +draw_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec) + LLVMValueRef emitted_vertices_vec, + LLVMValueRef stream_id) { const struct draw_gs_llvm_iface *gs_iface = draw_gs_llvm_iface(gs_base); struct draw_gs_llvm_variant *variant = gs_iface->variant; struct gallivm_state *gallivm = variant->gallivm; LLVMBuilderRef builder = gallivm->builder; - struct lp_type gs_type = bld_base->base.type; + struct lp_type gs_type = bld->type; LLVMValueRef clipmask = lp_build_const_int_vec(gallivm, lp_int_type(gs_type), 0); LLVMValueRef indices[LP_MAX_VECTOR_LENGTH]; @@ -1492,10 +1564,12 @@ } static void -draw_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +draw_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, + LLVMValueRef total_emitted_vertices_vec_ptr, LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec) + LLVMValueRef emitted_prims_vec, + LLVMValueRef mask_vec) { const struct draw_gs_llvm_iface *gs_iface = draw_gs_llvm_iface(gs_base); struct draw_gs_llvm_variant *variant = gs_iface->variant; @@ -1505,7 +1579,7 @@ draw_gs_jit_prim_lengths(variant->gallivm, variant->context_ptr); unsigned i; - for (i = 0; i < bld_base->base.type.length; ++i) { + for (i = 0; i < bld->type.length; ++i) { LLVMValueRef ind = lp_build_const_int32(gallivm, i); LLVMValueRef prims_emitted = LLVMBuildExtractElement(builder, emitted_prims_vec, ind, ""); @@ -1521,8 +1595,7 @@ } static void -draw_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +draw_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, LLVMValueRef total_emitted_vertices_vec, LLVMValueRef emitted_prims_vec) { @@ -1549,7 +1622,7 @@ struct gallivm_state *gallivm = variant->gallivm; LLVMContextRef context = gallivm->context; LLVMTypeRef int32_type = LLVMInt32TypeInContext(context); - LLVMTypeRef arg_types[11]; + LLVMTypeRef arg_types[12]; unsigned num_arg_types = ARRAY_SIZE(arg_types); LLVMTypeRef func_type; LLVMValueRef context_ptr; @@ -1558,7 +1631,7 @@ char func_name[64]; struct lp_type vs_type; LLVMValueRef count, fetch_elts, start_or_maxelt; - LLVMValueRef vertex_id_offset, start_instance; + LLVMValueRef vertex_id_offset; LLVMValueRef stride, step, io_itr; LLVMValueRef ind_vec, start_vec, have_elts, fetch_max, tmp; LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr; @@ -1577,6 +1650,7 @@ const int vector_length = lp_native_vector_width / 32; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; struct lp_build_sampler_soa *sampler = 0; + struct lp_build_image_soa *image = NULL; LLVMValueRef ret, clipmask_bool_ptr; struct draw_llvm_variant_key *key = &variant->key; /* If geometry shader is present we need to skip both the viewport @@ -1599,7 +1673,7 @@ struct lp_bld_tgsi_system_values system_values; memset(&system_values, 0, sizeof(system_values)); - + memset(&outputs, 0, sizeof(outputs)); snprintf(func_name, sizeof(func_name), "draw_llvm_vs_variant%u", variant->shader->variants_cached); @@ -1615,6 +1689,7 @@ arg_types[i++] = int32_type; /* vertex_id_offset */ arg_types[i++] = int32_type; /* start_instance */ arg_types[i++] = LLVMPointerType(int32_type, 0); /* fetch_elts */ + arg_types[i++] = int32_type; /* draw_id */ func_type = LLVMFunctionType(LLVMInt8TypeInContext(context), arg_types, num_arg_types, 0); @@ -1647,8 +1722,9 @@ vb_ptr = LLVMGetParam(variant_func, 6); system_values.instance_id = LLVMGetParam(variant_func, 7); vertex_id_offset = LLVMGetParam(variant_func, 8); - start_instance = LLVMGetParam(variant_func, 9); + system_values.base_instance = LLVMGetParam(variant_func, 9); fetch_elts = LLVMGetParam(variant_func, 10); + system_values.draw_id = LLVMGetParam(variant_func, 11); lp_build_name(context_ptr, "context"); lp_build_name(io_ptr, "io"); @@ -1659,8 +1735,9 @@ lp_build_name(vb_ptr, "vb"); lp_build_name(system_values.instance_id, "instance_id"); lp_build_name(vertex_id_offset, "vertex_id_offset"); - lp_build_name(start_instance, "start_instance"); + lp_build_name(system_values.base_instance, "start_instance"); lp_build_name(fetch_elts, "fetch_elts"); + lp_build_name(system_values.draw_id, "draw_id"); /* * Function body @@ -1692,6 +1769,8 @@ /* code generated texture sampling */ sampler = draw_llvm_sampler_soa_create(draw_llvm_variant_key_samplers(key)); + image = draw_llvm_image_soa_create(draw_llvm_variant_key_images(key)); + step = lp_build_const_int32(gallivm, vector_length); ind_vec = blduivec.undef; @@ -1772,7 +1851,7 @@ lp_build_const_int32(gallivm, velem->instance_divisor), "instance_divisor"); - instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance, + instance_index[j] = lp_build_uadd_overflow(gallivm, system_values.base_instance, current_instance, &ofbit); } @@ -1920,12 +1999,13 @@ * the primitive was split (we split rendering into chunks of at * most 4095-vertices) we need to back out the original start * index out of our vertex id here. + * for ARB_shader_draw_parameters, base_vertex should be 0 for non-indexed draws. */ - system_values.basevertex = lp_build_broadcast_scalar(&blduivec, - vertex_id_offset); + LLVMValueRef base_vertex = lp_build_select(&bld, have_elts, vertex_id_offset, lp_build_const_int32(gallivm, 0));; + system_values.basevertex = lp_build_broadcast_scalar(&blduivec, base_vertex); system_values.vertex_id = true_index_array; system_values.vertex_id_nobase = LLVMBuildSub(builder, true_index_array, - system_values.basevertex, ""); + lp_build_broadcast_scalar(&blduivec, vertex_id_offset), ""); ptr_aos = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) inputs; generate_vs(variant, @@ -1936,6 +2016,7 @@ &system_values, context_ptr, sampler, + image, key->clamp_vertex_color, &mask); @@ -1982,6 +2063,7 @@ lp_build_loop_end_cond(&lp_loop, count, step, LLVMIntUGE); sampler->destroy(sampler); + image->destroy(image); /* return clipping boolean value for function */ ret = clipmask_booli8(gallivm, vs_type, clipmask_bool_ptr, @@ -1999,6 +2081,7 @@ unsigned i; struct draw_llvm_variant_key *key; struct draw_sampler_static_state *draw_sampler; + struct draw_image_static_state *draw_image; key = (struct draw_llvm_variant_key *)store; @@ -2031,6 +2114,8 @@ key->nr_sampler_views = key->nr_samplers; } + key->nr_images = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_IMAGE] + 1; + /* Presumably all variants of the shader should have the same * number of vertex elements - ie the number of shader inputs. * NOTE: we NEED to store the needed number of needed inputs @@ -2069,6 +2154,13 @@ llvm->draw->sampler_views[PIPE_SHADER_VERTEX][i]); } + draw_image = draw_llvm_variant_key_images(key); + memset(draw_image, 0, + key->nr_images * sizeof *draw_image); + for (i = 0; i < key->nr_images; i++) { + lp_sampler_static_texture_state_image(&draw_image[i].image_state, + llvm->draw->images[PIPE_SHADER_VERTEX][i]); + } return key; } @@ -2078,7 +2170,7 @@ { unsigned i; struct draw_sampler_static_state *sampler = draw_llvm_variant_key_samplers(key); - + struct draw_image_static_state *image = draw_llvm_variant_key_images(key); debug_printf("clamp_vertex_color = %u\n", key->clamp_vertex_color); debug_printf("clip_xy = %u\n", key->clip_xy); debug_printf("clip_z = %u\n", key->clip_z); @@ -2099,6 +2191,9 @@ for (i = 0 ; i < key->nr_sampler_views; i++) { debug_printf("sampler[%i].src_format = %s\n", i, util_format_name(sampler[i].texture_state.format)); } + + for (i = 0 ; i < key->nr_images; i++) + debug_printf("images[%i].format = %s\n", i, util_format_name(image[i].image_state.format)); } @@ -2146,6 +2241,42 @@ } } +void +draw_llvm_set_mapped_image(struct draw_context *draw, + enum pipe_shader_type shader_stage, + unsigned idx, + uint32_t width, uint32_t height, uint32_t depth, + const void *base_ptr, + uint32_t row_stride, + uint32_t img_stride) +{ + struct draw_jit_image *jit_image; + + assert(shader_stage == PIPE_SHADER_VERTEX || + shader_stage == PIPE_SHADER_GEOMETRY); + + if (shader_stage == PIPE_SHADER_VERTEX) { + assert(idx < ARRAY_SIZE(draw->llvm->jit_context.images)); + + jit_image = &draw->llvm->jit_context.images[idx]; + } else if (shader_stage == PIPE_SHADER_GEOMETRY) { + assert(idx < ARRAY_SIZE(draw->llvm->gs_jit_context.images)); + + jit_image = &draw->llvm->gs_jit_context.images[idx]; + } else { + assert(0); + return; + } + + jit_image->width = width; + jit_image->height = height; + jit_image->depth = depth; + jit_image->base = base_ptr; + + jit_image->row_stride = row_stride; + jit_image->img_stride = img_stride; +} + void draw_llvm_set_sampler_state(struct draw_context *draw, @@ -2210,14 +2341,16 @@ create_gs_jit_types(struct draw_gs_llvm_variant *var) { struct gallivm_state *gallivm = var->gallivm; - LLVMTypeRef texture_type, sampler_type, context_type; + LLVMTypeRef texture_type, sampler_type, image_type, context_type; texture_type = create_jit_texture_type(gallivm, "texture"); sampler_type = create_jit_sampler_type(gallivm, "sampler"); + image_type = create_jit_image_type(gallivm, "image"); context_type = create_gs_jit_context_type(gallivm, var->shader->base.vector_length, texture_type, sampler_type, + image_type, "draw_gs_jit_context"); var->context_ptr_type = LLVMPointerType(context_type, 0); @@ -2271,6 +2404,7 @@ LLVMBuilderRef builder; LLVMValueRef io_ptr, input_array, num_prims, mask_val; struct lp_build_sampler_soa *sampler = 0; + struct lp_build_image_soa *image = NULL; struct lp_build_context bld; struct lp_bld_tgsi_system_values system_values; char func_name[64]; @@ -2286,6 +2420,7 @@ unsigned vector_length = variant->shader->base.vector_length; memset(&system_values, 0, sizeof(system_values)); + memset(&outputs, 0, sizeof(outputs)); snprintf(func_name, sizeof(func_name), "draw_llvm_gs_variant%u", variant->shader->variants_cached); @@ -2367,7 +2502,7 @@ /* code generated texture sampling */ sampler = draw_llvm_sampler_soa_create(variant->key.samplers); - + image = draw_llvm_image_soa_create(draw_gs_llvm_variant_key_images(&variant->key)); mask_val = generate_mask_value(variant, gs_type); lp_build_mask_begin(&mask, gallivm, gs_type, mask_val); @@ -2376,7 +2511,10 @@ } if (gallivm_debug & (GALLIVM_DEBUG_TGSI | GALLIVM_DEBUG_IR)) { - tgsi_dump(tokens, 0); + if (llvm->draw->gs.geometry_shader->state.type == PIPE_SHADER_IR_TGSI) + tgsi_dump(tokens, 0); + else + nir_print_shader(llvm->draw->gs.geometry_shader->state.ir.nir, stderr); draw_gs_llvm_dump_variant_key(&variant->key); } @@ -2391,16 +2529,24 @@ params.context_ptr = context_ptr; params.sampler = sampler; params.info = &llvm->draw->gs.geometry_shader->info; - params.gs_iface = (const struct lp_build_tgsi_gs_iface *)&gs_iface; + params.gs_iface = (const struct lp_build_gs_iface *)&gs_iface; params.ssbo_ptr = ssbos_ptr; params.ssbo_sizes_ptr = num_ssbos_ptr; + params.image = image; - lp_build_tgsi_soa(variant->gallivm, - tokens, - ¶ms, - outputs); + if (llvm->draw->gs.geometry_shader->state.type == PIPE_SHADER_IR_TGSI) + lp_build_tgsi_soa(variant->gallivm, + tokens, + ¶ms, + outputs); + else + lp_build_nir_soa(variant->gallivm, + llvm->draw->gs.geometry_shader->state.ir.nir, + ¶ms, + outputs); sampler->destroy(sampler); + image->destroy(image); lp_build_mask_end(&mask); @@ -2485,6 +2631,7 @@ unsigned i; struct draw_gs_llvm_variant_key *key; struct draw_sampler_static_state *draw_sampler; + struct draw_image_static_state *draw_image; key = (struct draw_gs_llvm_variant_key *)store; @@ -2505,6 +2652,8 @@ key->nr_sampler_views = key->nr_samplers; } + key->nr_images = llvm->draw->gs.geometry_shader->info.file_max[TGSI_FILE_IMAGE] + 1; + draw_sampler = key->samplers; memset(draw_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *draw_sampler); @@ -2518,6 +2667,13 @@ llvm->draw->sampler_views[PIPE_SHADER_GEOMETRY][i]); } + draw_image = draw_gs_llvm_variant_key_images(key); + memset(draw_image, 0, + key->nr_images * sizeof *draw_image); + for (i = 0; i < key->nr_images; i++) { + lp_sampler_static_texture_state_image(&draw_image[i].image_state, + llvm->draw->images[PIPE_SHADER_GEOMETRY][i]); + } return key; } @@ -2526,9 +2682,13 @@ { unsigned i; struct draw_sampler_static_state *sampler = key->samplers; - + struct draw_image_static_state *image = draw_gs_llvm_variant_key_images(key); for (i = 0 ; i < key->nr_sampler_views; i++) { debug_printf("sampler[%i].src_format = %s\n", i, util_format_name(sampler[i].texture_state.format)); } + + for (i = 0 ; i < key->nr_images; i++) + debug_printf("images[%i].format = %s\n", i, util_format_name(image[i].image_state.format)); + } diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm.h mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm.h --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm.h 2020-06-12 01:21:16.000000000 +0000 @@ -49,11 +49,11 @@ uint32_t width; uint32_t height; uint32_t depth; - uint32_t first_level; - uint32_t last_level; const void *base; uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t first_level; + uint32_t last_level; uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; }; @@ -69,6 +69,11 @@ struct lp_static_texture_state texture_state; }; +struct draw_image_static_state +{ + struct lp_static_texture_state image_state; +}; + struct draw_jit_sampler { @@ -79,15 +84,25 @@ }; +struct draw_jit_image +{ + uint32_t width; + uint32_t height; + uint32_t depth; + const void *base; + uint32_t row_stride; + uint32_t img_stride; +}; + enum { DRAW_JIT_TEXTURE_WIDTH = 0, DRAW_JIT_TEXTURE_HEIGHT, DRAW_JIT_TEXTURE_DEPTH, - DRAW_JIT_TEXTURE_FIRST_LEVEL, - DRAW_JIT_TEXTURE_LAST_LEVEL, DRAW_JIT_TEXTURE_BASE, DRAW_JIT_TEXTURE_ROW_STRIDE, DRAW_JIT_TEXTURE_IMG_STRIDE, + DRAW_JIT_TEXTURE_FIRST_LEVEL, + DRAW_JIT_TEXTURE_LAST_LEVEL, DRAW_JIT_TEXTURE_MIP_OFFSETS, DRAW_JIT_TEXTURE_NUM_FIELDS /* number of fields above */ }; @@ -108,6 +123,16 @@ DRAW_JIT_VERTEX_DATA }; +enum { + DRAW_JIT_IMAGE_WIDTH = 0, + DRAW_JIT_IMAGE_HEIGHT, + DRAW_JIT_IMAGE_DEPTH, + DRAW_JIT_IMAGE_BASE, + DRAW_JIT_IMAGE_ROW_STRIDE, + DRAW_JIT_IMAGE_IMG_STRIDE, + DRAW_JIT_IMAGE_NUM_FIELDS /* number of fields above */ +}; + /** * This structure is passed directly to the generated vertex shader. * @@ -128,6 +153,7 @@ struct draw_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; struct draw_jit_sampler samplers[PIPE_MAX_SAMPLERS]; + struct draw_jit_image images[PIPE_MAX_SHADER_IMAGES]; const uint32_t *vs_ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; int num_vs_ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; @@ -140,8 +166,9 @@ DRAW_JIT_CTX_VIEWPORT = 3, DRAW_JIT_CTX_TEXTURES = 4, DRAW_JIT_CTX_SAMPLERS = 5, - DRAW_JIT_CTX_SSBOS = 6, - DRAW_JIT_CTX_NUM_SSBOS = 7, + DRAW_JIT_CTX_IMAGES = 6, + DRAW_JIT_CTX_SSBOS = 7, + DRAW_JIT_CTX_NUM_SSBOS = 8, DRAW_JIT_CTX_NUM_FIELDS }; @@ -163,6 +190,9 @@ #define draw_jit_context_samplers(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_CTX_SAMPLERS, "samplers") +#define draw_jit_context_images(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_CTX_IMAGES, "images") + #define draw_jit_context_vs_ssbos(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_CTX_SSBOS, "vs_ssbos") @@ -221,7 +251,8 @@ * DRAW_JIT_CTX_SAMPLERS positions in the struct */ struct draw_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; struct draw_jit_sampler samplers[PIPE_MAX_SAMPLERS]; - + struct draw_jit_image images[PIPE_MAX_SHADER_IMAGES]; + int **prim_lengths; int *emitted_vertices; int *emitted_prims; @@ -241,12 +272,13 @@ * VS ctx structure for sampling to work. */ DRAW_GS_JIT_CTX_TEXTURES = DRAW_JIT_CTX_TEXTURES, DRAW_GS_JIT_CTX_SAMPLERS = DRAW_JIT_CTX_SAMPLERS, - DRAW_GS_JIT_CTX_PRIM_LENGTHS = 6, - DRAW_GS_JIT_CTX_EMITTED_VERTICES = 7, - DRAW_GS_JIT_CTX_EMITTED_PRIMS = 8, - DRAW_GS_JIT_CTX_SSBOS = 9, - DRAW_GS_JIT_CTX_NUM_SSBOS = 10, - DRAW_GS_JIT_CTX_NUM_FIELDS = 11 + DRAW_GS_JIT_CTX_IMAGES = DRAW_JIT_CTX_IMAGES, + DRAW_GS_JIT_CTX_PRIM_LENGTHS = 7, + DRAW_GS_JIT_CTX_EMITTED_VERTICES = 8, + DRAW_GS_JIT_CTX_EMITTED_PRIMS = 9, + DRAW_GS_JIT_CTX_SSBOS = 10, + DRAW_GS_JIT_CTX_NUM_SSBOS = 11, + DRAW_GS_JIT_CTX_NUM_FIELDS = 12 }; #define draw_gs_jit_context_constants(_gallivm, _ptr) \ @@ -267,6 +299,9 @@ #define draw_gs_jit_context_samplers(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_GS_JIT_CTX_SAMPLERS, "samplers") +#define draw_gs_jit_context_images(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_GS_JIT_CTX_IMAGES, "images") + #define draw_gs_jit_prim_lengths(_gallivm, _ptr) \ lp_build_struct_get(_gallivm, _ptr, DRAW_GS_JIT_CTX_PRIM_LENGTHS, "prim_lengths") @@ -293,7 +328,8 @@ unsigned instance_id, unsigned vertex_id_offset, unsigned start_instance, - const unsigned *fetch_elts); + const unsigned *fetch_elts, + unsigned draw_id); typedef int @@ -310,6 +346,7 @@ unsigned nr_vertex_elements:8; unsigned nr_samplers:8; unsigned nr_sampler_views:8; + unsigned nr_images:8; unsigned clamp_vertex_color:1; unsigned clip_xy:1; unsigned clip_z:1; @@ -329,42 +366,50 @@ /* Followed by variable number of samplers: */ /* struct draw_sampler_static_state sampler; */ + /* Followed by variable number of images + */ }; struct draw_gs_llvm_variant_key { unsigned nr_samplers:8; unsigned nr_sampler_views:8; + unsigned nr_images:8; unsigned num_outputs:8; /* note padding here - must use memset */ struct draw_sampler_static_state samplers[1]; + /* Followed by variable number of images.*/ }; #define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \ (sizeof(struct draw_llvm_variant_key) + \ PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct draw_sampler_static_state) + \ + PIPE_MAX_SHADER_IMAGES * sizeof(struct draw_image_static_state) + \ (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element)) #define DRAW_GS_LLVM_MAX_VARIANT_KEY_SIZE \ (sizeof(struct draw_gs_llvm_variant_key) + \ + PIPE_MAX_SHADER_IMAGES * sizeof(struct draw_image_static_state) + \ PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct draw_sampler_static_state)) static inline size_t draw_llvm_variant_key_size(unsigned nr_vertex_elements, - unsigned nr_samplers) + unsigned nr_samplers, unsigned nr_images) { return (sizeof(struct draw_llvm_variant_key) + nr_samplers * sizeof(struct draw_sampler_static_state) + + nr_images * sizeof(struct draw_image_static_state) + (nr_vertex_elements - 1) * sizeof(struct pipe_vertex_element)); } static inline size_t -draw_gs_llvm_variant_key_size(unsigned nr_samplers) +draw_gs_llvm_variant_key_size(unsigned nr_samplers, unsigned nr_images) { return (sizeof(struct draw_gs_llvm_variant_key) + + (nr_images) * sizeof(struct draw_sampler_static_state) + (nr_samplers - 1) * sizeof(struct draw_sampler_static_state)); } @@ -376,6 +421,21 @@ &key->vertex_element[key->nr_vertex_elements]; } +static inline struct draw_image_static_state * +draw_llvm_variant_key_images(struct draw_llvm_variant_key *key) +{ + struct draw_sampler_static_state *samplers = (struct draw_sampler_static_state *) + (&key->vertex_element[key->nr_vertex_elements]); + return (struct draw_image_static_state *) + &samplers[key->nr_samplers]; +} + +static inline struct draw_image_static_state * +draw_gs_llvm_variant_key_images(struct draw_gs_llvm_variant_key *key) +{ + return (struct draw_image_static_state *) + &key->samplers[key->nr_samplers]; +} struct draw_llvm_variant_list_item { @@ -528,6 +588,9 @@ struct lp_build_sampler_soa * draw_llvm_sampler_soa_create(const struct draw_sampler_static_state *static_state); +struct lp_build_image_soa * +draw_llvm_image_soa_create(const struct draw_image_static_state *static_state); + void draw_llvm_set_sampler_state(struct draw_context *draw, enum pipe_shader_type shader_stage); @@ -543,4 +606,12 @@ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]); +void +draw_llvm_set_mapped_image(struct draw_context *draw, + enum pipe_shader_type shader_stage, + unsigned idx, + uint32_t width, uint32_t height, uint32_t depth, + const void *base_ptr, + uint32_t row_stride, + uint32_t img_stride); #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm_sample.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm_sample.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_llvm_sample.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_llvm_sample.c 2020-06-12 01:21:16.000000000 +0000 @@ -72,6 +72,19 @@ struct draw_llvm_sampler_dynamic_state dynamic_state; }; +struct draw_llvm_image_dynamic_state +{ + struct lp_sampler_dynamic_state base; + + const struct draw_image_static_state *static_state; +}; + +struct draw_llvm_image_soa +{ + struct lp_build_image_soa base; + + struct draw_llvm_image_dynamic_state dynamic_state; +}; /** * Fetch the specified member of the lp_jit_texture structure. @@ -164,6 +177,50 @@ return res; } +/** + * Fetch the specified member of the lp_jit_texture structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +draw_llvm_image_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned image_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + debug_assert(image_unit < PIPE_MAX_SHADER_IMAGES); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].textures */ + indices[1] = lp_build_const_int32(gallivm, DRAW_JIT_CTX_IMAGES); + /* context[0].textures[unit] */ + indices[2] = lp_build_const_int32(gallivm, image_unit); + /* context[0].textures[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.image%u.%s", image_unit, member_name); + + return res; +} /** * Helper macro to instantiate the functions that generate the code to @@ -214,6 +271,24 @@ DRAW_LLVM_SAMPLER_MEMBER(lod_bias, DRAW_JIT_SAMPLER_LOD_BIAS, TRUE) DRAW_LLVM_SAMPLER_MEMBER(border_color, DRAW_JIT_SAMPLER_BORDER_COLOR, FALSE) +#define DRAW_LLVM_IMAGE_MEMBER(_name, _index, _emit_load) \ + static LLVMValueRef \ + draw_llvm_image_##_name( const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned image_unit) \ + { \ + return draw_llvm_image_member(base, gallivm, context_ptr, \ + image_unit, _index, #_name, _emit_load ); \ + } + + +DRAW_LLVM_IMAGE_MEMBER(width, DRAW_JIT_IMAGE_WIDTH, TRUE) +DRAW_LLVM_IMAGE_MEMBER(height, DRAW_JIT_IMAGE_HEIGHT, TRUE) +DRAW_LLVM_IMAGE_MEMBER(depth, DRAW_JIT_IMAGE_DEPTH, TRUE) +DRAW_LLVM_IMAGE_MEMBER(base_ptr, DRAW_JIT_IMAGE_BASE, TRUE) +DRAW_LLVM_IMAGE_MEMBER(row_stride, DRAW_JIT_IMAGE_ROW_STRIDE, TRUE) +DRAW_LLVM_IMAGE_MEMBER(img_stride, DRAW_JIT_IMAGE_IMG_STRIDE, TRUE) static void draw_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) @@ -293,3 +368,64 @@ return &sampler->base; } +static void +draw_llvm_image_soa_emit_op(const struct lp_build_image_soa *base, + struct gallivm_state *gallivm, + const struct lp_img_params *params) +{ + struct draw_llvm_image_soa *image = (struct draw_llvm_image_soa *)base; + unsigned image_index = params->image_index; + assert(image_index < PIPE_MAX_SHADER_IMAGES); + + lp_build_img_op_soa(&image->dynamic_state.static_state[image_index].image_state, + &image->dynamic_state.base, + gallivm, params); +} +/** + * Fetch the texture size. + */ +static void +draw_llvm_image_soa_emit_size_query(const struct lp_build_image_soa *base, + struct gallivm_state *gallivm, + const struct lp_sampler_size_query_params *params) +{ + struct draw_llvm_image_soa *image = (struct draw_llvm_image_soa *)base; + + assert(params->texture_unit < PIPE_MAX_SHADER_IMAGES); + + lp_build_size_query_soa(gallivm, + &image->dynamic_state.static_state[params->texture_unit].image_state, + &image->dynamic_state.base, + params); +} +static void +draw_llvm_image_soa_destroy(struct lp_build_image_soa *image) +{ + FREE(image); +} + +struct lp_build_image_soa * +draw_llvm_image_soa_create(const struct draw_image_static_state *static_state) +{ + struct draw_llvm_image_soa *image; + + image = CALLOC_STRUCT(draw_llvm_image_soa); + if (!image) + return NULL; + + image->base.destroy = draw_llvm_image_soa_destroy; + image->base.emit_op = draw_llvm_image_soa_emit_op; + image->base.emit_size_query = draw_llvm_image_soa_emit_size_query; + + image->dynamic_state.base.width = draw_llvm_image_width; + image->dynamic_state.base.height = draw_llvm_image_height; + + image->dynamic_state.base.depth = draw_llvm_image_depth; + image->dynamic_state.base.base_ptr = draw_llvm_image_base_ptr; + image->dynamic_state.base.row_stride = draw_llvm_image_row_stride; + image->dynamic_state.base.img_stride = draw_llvm_image_img_stride; + + image->dynamic_state.static_state = static_state; + + return &image->base; +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_aaline.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_aaline.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_aaline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_aaline.c 2020-06-12 01:21:16.000000000 +0000 @@ -37,7 +37,7 @@ #include "pipe/p_shader_tokens.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -48,6 +48,8 @@ #include "draw_private.h" #include "draw_pipe.h" +#include "nir.h" +#include "nir/nir_draw_helpers.h" /** Approx number of new tokens for instructions in aa_transform_inst() */ #define NUM_NEW_TOKENS 53 @@ -318,6 +320,30 @@ return FALSE; } +static boolean +generate_aaline_fs_nir(struct aaline_stage *aaline) +{ +#ifdef LLVM_AVAILABLE + struct pipe_context *pipe = aaline->stage.draw->pipe; + const struct pipe_shader_state *orig_fs = &aaline->fs->state; + struct pipe_shader_state aaline_fs; + + aaline_fs = *orig_fs; /* copy to init */ + aaline_fs.ir.nir = nir_shader_clone(NULL, orig_fs->ir.nir); + if (!aaline_fs.ir.nir) + return FALSE; + + nir_lower_aaline_fs(aaline_fs.ir.nir, &aaline->fs->generic_attrib); + aaline->fs->aaline_fs = aaline->driver_create_fs_state(pipe, &aaline_fs); + if (aaline->fs->aaline_fs == NULL) + goto fail; + + return TRUE; + +fail: +#endif + return FALSE; +} /** * When we're about to draw our first AA line in a batch, this function is @@ -329,8 +355,14 @@ struct draw_context *draw = aaline->stage.draw; struct pipe_context *pipe = draw->pipe; - if (!aaline->fs->aaline_fs && !generate_aaline_fs(aaline)) - return FALSE; + if (!aaline->fs->aaline_fs) { + if (aaline->fs->state.type == PIPE_SHADER_IR_NIR) { + if (!generate_aaline_fs_nir(aaline)) + return FALSE; + } else + if (!generate_aaline_fs(aaline)) + return FALSE; + } draw->suspend_flushing = TRUE; aaline->driver_bind_fs_state(pipe, aaline->fs->aaline_fs); @@ -618,7 +650,13 @@ if (!aafs) return NULL; - aafs->state.tokens = tgsi_dup_tokens(fs->tokens); + aafs->state.type = fs->type; + if (fs->type == PIPE_SHADER_IR_TGSI) + aafs->state.tokens = tgsi_dup_tokens(fs->tokens); +#ifdef LLVM_AVAILABLE + else + aafs->state.ir.nir = nir_shader_clone(NULL, fs->ir.nir); +#endif /* pass-through */ aafs->driver_fs = aaline->driver_create_fs_state(pipe, fs); @@ -662,7 +700,10 @@ aaline->driver_delete_fs_state(pipe, aafs->aaline_fs); } - FREE((void*)aafs->state.tokens); + if (aafs->state.type == PIPE_SHADER_IR_TGSI) + FREE((void*)aafs->state.tokens); + else + ralloc_free(aafs->state.ir.nir); FREE(aafs); } @@ -681,9 +722,12 @@ return; /* allocate the extra post-transformed vertex attribute */ - aaline->coord_slot = draw_alloc_extra_vertex_attrib(draw, - TGSI_SEMANTIC_GENERIC, - aaline->fs->generic_attrib); + if (aaline->fs->aaline_fs) + aaline->coord_slot = draw_alloc_extra_vertex_attrib(draw, + TGSI_SEMANTIC_GENERIC, + aaline->fs->generic_attrib); + else + aaline->coord_slot = -1; } /** diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_aapoint.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_aapoint.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_aapoint.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_aapoint.c 2020-06-12 01:21:16.000000000 +0000 @@ -52,6 +52,8 @@ #include "draw_vs.h" #include "draw_pipe.h" +#include "nir.h" +#include "nir/nir_draw_helpers.h" /** Approx number of new tokens for instructions in aa_transform_inst() */ #define NUM_NEW_TOKENS 200 @@ -364,6 +366,8 @@ struct pipe_context *pipe = aapoint->stage.draw->pipe; aapoint_fs = *orig_fs; /* copy to init */ + + assert(aapoint_fs.type == PIPE_SHADER_IR_TGSI); aapoint_fs.tokens = tgsi_alloc_tokens(newLen); if (aapoint_fs.tokens == NULL) return FALSE; @@ -404,6 +408,30 @@ return FALSE; } +static boolean +generate_aapoint_fs_nir(struct aapoint_stage *aapoint) +{ +#ifdef LLVM_AVAILABLE + struct pipe_context *pipe = aapoint->stage.draw->pipe; + const struct pipe_shader_state *orig_fs = &aapoint->fs->state; + struct pipe_shader_state aapoint_fs; + + aapoint_fs = *orig_fs; /* copy to init */ + aapoint_fs.ir.nir = nir_shader_clone(NULL, orig_fs->ir.nir); + if (!aapoint_fs.ir.nir) + return FALSE; + + nir_lower_aapoint_fs(aapoint_fs.ir.nir, &aapoint->fs->generic_attrib); + aapoint->fs->aapoint_fs = aapoint->driver_create_fs_state(pipe, &aapoint_fs); + if (aapoint->fs->aapoint_fs == NULL) + goto fail; + + return TRUE; + +fail: +#endif + return FALSE; +} /** * When we're about to draw our first AA point in a batch, this function is @@ -415,9 +443,13 @@ struct draw_context *draw = aapoint->stage.draw; struct pipe_context *pipe = draw->pipe; - if (!aapoint->fs->aapoint_fs && - !generate_aapoint_fs(aapoint)) - return FALSE; + if (!aapoint->fs->aapoint_fs) { + if (aapoint->fs->state.type == PIPE_SHADER_IR_NIR) { + if (!generate_aapoint_fs_nir(aapoint)) + return FALSE; + } else if (!generate_aapoint_fs(aapoint)) + return FALSE; + } draw->suspend_flushing = TRUE; aapoint->driver_bind_fs_state(pipe, aapoint->fs->aapoint_fs); @@ -637,11 +669,14 @@ if (!rast->point_smooth) return; - /* allocate the extra post-transformed vertex attribute */ - aapoint->tex_slot = draw_alloc_extra_vertex_attrib(draw, - TGSI_SEMANTIC_GENERIC, - aapoint->fs->generic_attrib); - assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */ + if (aapoint->fs->aapoint_fs) { + /* allocate the extra post-transformed vertex attribute */ + aapoint->tex_slot = draw_alloc_extra_vertex_attrib(draw, + TGSI_SEMANTIC_GENERIC, + aapoint->fs->generic_attrib); + assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */ + } else + aapoint->tex_slot = -1; /* find psize slot in post-transform vertex */ aapoint->psize_slot = -1; @@ -710,8 +745,13 @@ if (!aafs) return NULL; - aafs->state.tokens = tgsi_dup_tokens(fs->tokens); - + aafs->state.type = fs->type; + if (fs->type == PIPE_SHADER_IR_TGSI) + aafs->state.tokens = tgsi_dup_tokens(fs->tokens); +#ifdef LLVM_AVAILABLE + else + aafs->state.ir.nir = nir_shader_clone(NULL, fs->ir.nir); +#endif /* pass-through */ aafs->driver_fs = aapoint->driver_create_fs_state(pipe, fs); @@ -744,7 +784,10 @@ if (aafs->aapoint_fs) aapoint->driver_delete_fs_state(pipe, aafs->aapoint_fs); - FREE((void*)aafs->state.tokens); + if (aafs->state.type == PIPE_SHADER_IR_TGSI) + FREE((void*)aafs->state.tokens); + else + ralloc_free(aafs->state.ir.nir); FREE(aafs); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_offset.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_offset.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_offset.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_offset.c 2020-06-12 01:21:16.000000000 +0000 @@ -32,7 +32,7 @@ * \author Brian Paul */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "draw_pipe.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_pstipple.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_pstipple.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pipe_pstipple.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pipe_pstipple.c 2020-06-12 01:21:16.000000000 +0000 @@ -40,7 +40,7 @@ #include "pipe/p_shader_tokens.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_pstipple.h" @@ -51,6 +51,8 @@ #include "draw_context.h" #include "draw_pipe.h" +#include "nir.h" +#include "nir/nir_draw_helpers.h" /** Approx number of new tokens for instructions in pstip_transform_inst() */ #define NUM_NEW_TOKENS 53 @@ -133,12 +135,20 @@ TGSI_FILE_SYSTEM_VALUE : TGSI_FILE_INPUT; pstip_fs = *orig_fs; /* copy to init */ - pstip_fs.tokens = util_pstipple_create_fragment_shader(orig_fs->tokens, - &pstip->fs->sampler_unit, - 0, - wincoord_file); - if (pstip_fs.tokens == NULL) - return FALSE; + if (orig_fs->type == PIPE_SHADER_IR_TGSI) { + pstip_fs.tokens = util_pstipple_create_fragment_shader(orig_fs->tokens, + &pstip->fs->sampler_unit, + 0, + wincoord_file); + if (pstip_fs.tokens == NULL) + return FALSE; + } else { +#ifdef LLVM_AVAILABLE + pstip_fs.ir.nir = nir_shader_clone(NULL, orig_fs->ir.nir); + nir_lower_pstipple_fs(pstip_fs.ir.nir, + &pstip->fs->sampler_unit, 0, wincoord_file == TGSI_FILE_SYSTEM_VALUE); +#endif + } assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS); @@ -334,7 +344,11 @@ struct pstip_fragment_shader *pstipfs = CALLOC_STRUCT(pstip_fragment_shader); if (pstipfs) { - pstipfs->state.tokens = tgsi_dup_tokens(fs->tokens); + pstipfs->state.type = fs->type; + if (fs->type == PIPE_SHADER_IR_TGSI) + pstipfs->state.tokens = tgsi_dup_tokens(fs->tokens); + else + pstipfs->state.ir.nir = nir_shader_clone(NULL, fs->ir.nir); /* pass-through */ pstipfs->driver_fs = pstip->driver_create_fs_state(pstip->pipe, fs); @@ -368,7 +382,10 @@ if (pstipfs->pstip_fs) pstip->driver_delete_fs_state(pstip->pipe, pstipfs->pstip_fs); - FREE((void*)pstipfs->state.tokens); + if (pstipfs->state.type == PIPE_SHADER_IR_TGSI) + FREE((void*)pstipfs->state.tokens); + else + ralloc_free(pstipfs->state.ir.nir); FREE(pstipfs); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_private.h mesa-20.0.8/src/gallium/auxiliary/draw/draw_private.h --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_private.h 2020-06-12 01:21:16.000000000 +0000 @@ -46,7 +46,7 @@ #include "tgsi/tgsi_scan.h" -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE struct gallivm_state; #endif @@ -196,6 +196,7 @@ int eltBias; unsigned min_index; unsigned max_index; + unsigned drawid; /** vertex arrays */ struct draw_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS]; @@ -342,9 +343,14 @@ const struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; unsigned num_samplers[PIPE_SHADER_TYPES]; + struct pipe_image_view *images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; + unsigned num_images[PIPE_SHADER_TYPES]; + struct pipe_query_data_pipeline_statistics statistics; boolean collect_statistics; + bool collect_primgen; + struct draw_assembler *ia; void *driver_private; diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt.c 2020-06-12 01:21:16.000000000 +0000 @@ -39,7 +39,7 @@ #include "tgsi/tgsi_dump.h" #include "util/u_math.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_draw.h" @@ -191,7 +191,7 @@ if (!draw->pt.middle.general) return FALSE; -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->llvm) draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw ); #endif @@ -485,6 +485,7 @@ draw->pt.user.min_index = info->min_index; draw->pt.user.max_index = info->max_index; draw->pt.user.eltSize = info->index_size ? draw->pt.user.eltSizeIB : 0; + draw->pt.user.drawid = info->drawid; if (0) debug_printf("draw_vbo(mode=%u start=%u count=%u):\n", @@ -522,7 +523,7 @@ draw->pt.vertex_element, draw->pt.nr_vertex_elements, info); -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE if (!draw->llvm) #endif { diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_fetch.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_fetch.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_fetch.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_fetch.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,7 +27,7 @@ #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c 2020-06-12 01:21:16.000000000 +0000 @@ -289,8 +289,13 @@ unsigned i; for (i = 0; i < ARRAY_SIZE(llvm->jit_context.vs_constants); ++i) { + /* + * There could be a potential issue with rounding this up, as the + * shader expects 16-byte allocations, the fix is likely to move + * to LOAD intrinsic in the future and remove the vec4 constraint. + */ int num_consts = - draw->pt.user.vs_constants_size[i] / (sizeof(float) * 4); + DIV_ROUND_UP(draw->pt.user.vs_constants_size[i], (sizeof(float) * 4)); llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i]; llvm->jit_context.num_vs_constants[i] = num_consts; if (num_consts == 0) { @@ -308,7 +313,7 @@ for (i = 0; i < ARRAY_SIZE(llvm->gs_jit_context.constants); ++i) { int num_consts = - draw->pt.user.gs_constants_size[i] / (sizeof(float) * 4); + DIV_ROUND_UP(draw->pt.user.gs_constants_size[i], (sizeof(float) * 4)); llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i]; llvm->gs_jit_context.num_constants[i] = num_consts; if (num_consts == 0) { @@ -424,7 +429,7 @@ draw->instance_id, vid_base, draw->start_instance, - elts); + elts, draw->pt.user.drawid); /* Finished with fetch and vs: */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_so_emit.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_so_emit.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_pt_so_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_pt_so_emit.c 2020-06-12 01:21:16.000000000 +0000 @@ -36,6 +36,7 @@ #include "pipe/p_state.h" #include "util/u_math.h" +#include "util/u_prim.h" #include "util/u_memory.h" struct pt_so_emit { @@ -273,8 +274,20 @@ struct vbuf_render *render = draw->render; unsigned start, i, stream; - if (!emit->has_so) + if (!emit->has_so) { + if (draw->collect_primgen) { + unsigned i; + unsigned total = 0; + for (i = 0; i < input_prims->primitive_count; i++) { + total += + u_decomposed_prims_for_vertices(input_prims->prim, + input_prims->primitive_lengths[i]); + } + render->set_stream_output_info(render, + 0, 0, total); + } return; + } if (!draw->so.num_targets) return; diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs.c 2020-06-12 01:21:16.000000000 +0000 @@ -59,7 +59,7 @@ tgsi_dump(shader->tokens, 0); } -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE if (draw->pt.middle.llvm) { vs = draw_create_vs_llvm(draw, shader); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs_exec.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs_exec.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs_exec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs_exec.c 2020-06-12 01:21:16.000000000 +0000 @@ -230,6 +230,7 @@ tgsi_scan_shader(state->tokens, &vs->base.info); + vs->base.state.type = state->type; vs->base.state.stream_output = state->stream_output; vs->base.draw = draw; vs->base.prepare = vs_exec_prepare; diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs.h mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs.h --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs.h 2020-06-12 01:21:16.000000000 +0000 @@ -164,7 +164,7 @@ struct draw_vs_variant_key; struct draw_vertex_shader; -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, const struct pipe_shader_state *state); diff -Nru mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs_llvm.c mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs_llvm.c --- mesa-19.2.8/src/gallium/auxiliary/draw/draw_vs_llvm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/draw/draw_vs_llvm.c 2020-06-12 01:21:16.000000000 +0000 @@ -37,6 +37,8 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_scan.h" +#include "nir/nir_to_tgsi_info.h" +#include "nir.h" static void vs_llvm_prepare(struct draw_vertex_shader *shader, @@ -76,6 +78,8 @@ } assert(shader->variants_cached == 0); + if (dvs->state.ir.nir) + ralloc_free(dvs->state.ir.nir); FREE((void*) dvs->state.tokens); FREE( dvs ); } @@ -90,21 +94,30 @@ if (!vs) return NULL; - /* we make a private copy of the tokens */ - vs->base.state.tokens = tgsi_dup_tokens(state->tokens); - if (!vs->base.state.tokens) { - FREE(vs); - return NULL; - } + /* due to some bugs in the feedback state tracker we have to check + for ir.nir & PIPE_SHADER_IR_NIR here. */ + if (state->ir.nir && state->type == PIPE_SHADER_IR_NIR) { + vs->base.state.ir.nir = state->ir.nir; + nir_tgsi_scan_shader(state->ir.nir, &vs->base.info, true); + } else { + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(state->tokens); + if (!vs->base.state.tokens) { + FREE(vs); + return NULL; + } - tgsi_scan_shader(state->tokens, &vs->base.info); + tgsi_scan_shader(state->tokens, &vs->base.info); + } vs->variant_key_size = draw_llvm_variant_key_size( vs->base.info.file_max[TGSI_FILE_INPUT]+1, MAX2(vs->base.info.file_max[TGSI_FILE_SAMPLER]+1, - vs->base.info.file_max[TGSI_FILE_SAMPLER_VIEW]+1)); + vs->base.info.file_max[TGSI_FILE_SAMPLER_VIEW]+1), + vs->base.info.file_max[TGSI_FILE_IMAGE]+1); + vs->base.state.type = state->type; vs->base.state.stream_output = state->stream_output; vs->base.draw = draw; vs->base.prepare = vs_llvm_prepare; diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_context.c mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_context.c --- mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -623,7 +623,7 @@ mtx_destroy(&dctx->mutex); cnd_destroy(&dctx->cond); - assert(list_empty(&dctx->records)); + assert(list_is_empty(&dctx->records)); if (pipe->set_log_context) { pipe->set_log_context(pipe, NULL); diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_draw.c mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_draw.c --- mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_draw.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include "dd_pipe.h" #include "util/u_dump.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_framebuffer.h" #include "util/u_helpers.h" #include "util/u_inlines.h" @@ -1103,7 +1103,7 @@ if (dctx->api_stalled) cnd_signal(&dctx->cond); - if (list_empty(&records)) { + if (list_is_empty(&records)) { if (dctx->kill_thread) break; @@ -1184,7 +1184,7 @@ dctx->api_stalled = false; } - if (list_empty(&dctx->records)) + if (list_is_empty(&dctx->records)) cnd_signal(&dctx->cond); list_addtail(&record->list, &dctx->records); diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_screen.c mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_screen.c --- mesa-19.2.8/src/gallium/auxiliary/driver_ddebug/dd_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_ddebug/dd_screen.c 2020-06-12 01:21:16.000000000 +0000 @@ -412,6 +412,14 @@ */ static void +dd_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize) +{ + struct pipe_screen *screen = dd_screen(_screen)->screen; + + screen->finalize_nir(screen, nir, optimize); +} + +static void dd_screen_destroy(struct pipe_screen *_screen) { struct dd_screen *dscreen = dd_screen(_screen); @@ -597,6 +605,7 @@ SCR_INIT(get_compiler_options); SCR_INIT(get_driver_uuid); SCR_INIT(get_device_uuid); + SCR_INIT(finalize_nir); #undef SCR_INIT diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_noop/noop_pipe.c mesa-20.0.8/src/gallium/auxiliary/driver_noop/noop_pipe.c --- mesa-19.2.8/src/gallium/auxiliary/driver_noop/noop_pipe.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_noop/noop_pipe.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include "pipe/p_screen.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_upload_mgr.h" #include "noop_public.h" @@ -498,6 +498,29 @@ screen->query_memory_info(screen, info); } +static struct disk_cache *noop_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen; + + return screen->get_disk_shader_cache(screen); +} + +static const void *noop_get_compiler_options(struct pipe_screen *pscreen, + enum pipe_shader_ir ir, + enum pipe_shader_type shader) +{ + struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen; + + return screen->get_compiler_options(screen, ir, shader); +} + +static void noop_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize) +{ + struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen; + + screen->finalize_nir(screen, nir, optimize); +} + struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen) { struct noop_pipe_screen *noop_screen; @@ -535,6 +558,9 @@ screen->fence_reference = noop_fence_reference; screen->fence_finish = noop_fence_finish; screen->query_memory_info = noop_query_memory_info; + screen->get_disk_shader_cache = noop_get_disk_shader_cache; + screen->get_compiler_options = noop_get_compiler_options; + screen->finalize_nir = noop_finalize_nir; return screen; } diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_context.c mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_context.c --- mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -1004,6 +1004,31 @@ mtx_unlock(&rb_pipe->call_mutex); } +static void +rbug_create_fence_fd(struct pipe_context *_pipe, + struct pipe_fence_handle **fence, int fd, + enum pipe_fd_type type) +{ + struct rbug_context *rb_pipe = rbug_context(_pipe); + struct pipe_context *pipe = rb_pipe->pipe; + + mtx_lock(&rb_pipe->call_mutex); + pipe->create_fence_fd(pipe, fence, fd, type); + mtx_unlock(&rb_pipe->call_mutex); +} + +static void +rbug_fence_server_sync(struct pipe_context *_pipe, + struct pipe_fence_handle *fence) +{ + struct rbug_context *rb_pipe = rbug_context(_pipe); + struct pipe_context *pipe = rb_pipe->pipe; + + mtx_lock(&rb_pipe->call_mutex); + pipe->fence_server_sync(pipe, fence); + mtx_unlock(&rb_pipe->call_mutex); +} + static struct pipe_sampler_view * rbug_context_create_sampler_view(struct pipe_context *_pipe, struct pipe_resource *_resource, @@ -1178,6 +1203,17 @@ mtx_unlock(&rb_pipe->call_mutex); } +static void +rbug_context_texture_barrier(struct pipe_context *_context, unsigned flags) +{ + struct rbug_context *rb_pipe = rbug_context(_context); + struct pipe_context *context = rb_pipe->pipe; + + mtx_lock(&rb_pipe->call_mutex); + context->texture_barrier(context, + flags); + mtx_unlock(&rb_pipe->call_mutex); +} struct pipe_context * rbug_context_create(struct pipe_screen *_screen, struct pipe_context *pipe) @@ -1252,11 +1288,12 @@ rb_pipe->base.set_stream_output_targets = rbug_set_stream_output_targets; rb_pipe->base.resource_copy_region = rbug_resource_copy_region; rb_pipe->base.blit = rbug_blit; - rb_pipe->base.flush_resource = rbug_flush_resource; rb_pipe->base.clear = rbug_clear; rb_pipe->base.clear_render_target = rbug_clear_render_target; rb_pipe->base.clear_depth_stencil = rbug_clear_depth_stencil; rb_pipe->base.flush = rbug_flush; + rb_pipe->base.create_fence_fd = rbug_create_fence_fd; + rb_pipe->base.fence_server_sync = rbug_fence_server_sync; rb_pipe->base.create_sampler_view = rbug_context_create_sampler_view; rb_pipe->base.sampler_view_destroy = rbug_context_sampler_view_destroy; rb_pipe->base.create_surface = rbug_context_create_surface; @@ -1266,6 +1303,8 @@ rb_pipe->base.transfer_flush_region = rbug_context_transfer_flush_region; rb_pipe->base.buffer_subdata = rbug_context_buffer_subdata; rb_pipe->base.texture_subdata = rbug_context_texture_subdata; + rb_pipe->base.texture_barrier = rbug_context_texture_barrier; + rb_pipe->base.flush_resource = rbug_flush_resource; rb_pipe->pipe = pipe; diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_core.c mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_core.c --- mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_core.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_core.c 2020-06-12 01:21:16.000000000 +0000 @@ -27,7 +27,7 @@ #include "os/os_thread.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_string.h" #include "util/u_inlines.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_screen.c mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_screen.c --- mesa-19.2.8/src/gallium/auxiliary/driver_rbug/rbug_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_rbug/rbug_screen.c 2020-06-12 01:21:16.000000000 +0000 @@ -138,6 +138,23 @@ tex_usage); } +static void +rbug_screen_query_dmabuf_modifiers(struct pipe_screen *_screen, + enum pipe_format format, int max, + uint64_t *modifiers, + unsigned int *external_only, int *count) +{ + struct rbug_screen *rb_screen = rbug_screen(_screen); + struct pipe_screen *screen = rb_screen->screen; + + screen->query_dmabuf_modifiers(screen, + format, + max, + modifiers, + external_only, + count); +} + static struct pipe_context * rbug_screen_context_create(struct pipe_screen *_screen, void *priv, unsigned flags) @@ -152,6 +169,17 @@ return NULL; } +static bool +rbug_screen_can_create_resource(struct pipe_screen *_screen, + const struct pipe_resource *templat) +{ + struct rbug_screen *rb_screen = rbug_screen(_screen); + struct pipe_screen *screen = rb_screen->screen; + + return screen->can_create_resource(screen, + templat); +} + static struct pipe_resource * rbug_screen_resource_create(struct pipe_screen *_screen, const struct pipe_resource *templat) @@ -169,6 +197,25 @@ } static struct pipe_resource * +rbug_screen_resource_create_with_modifiers(struct pipe_screen *_screen, + const struct pipe_resource *templat, + const uint64_t *modifiers, int count) +{ + struct rbug_screen *rb_screen = rbug_screen(_screen); + struct pipe_screen *screen = rb_screen->screen; + struct pipe_resource *result; + + result = screen->resource_create_with_modifiers(screen, + templat, + modifiers, + count); + + if (result) + return rbug_resource_create(rb_screen, result); + return NULL; +} + +static struct pipe_resource * rbug_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, struct winsys_handle *handle, @@ -260,8 +307,7 @@ struct pipe_screen *screen = rb_screen->screen; struct pipe_resource *resource = rb_resource->resource; - if (screen->resource_changed) - screen->resource_changed(screen, resource); + screen->resource_changed(screen, resource); } static void @@ -314,6 +360,24 @@ return screen->fence_finish(screen, ctx, fence, timeout); } +static int +rbug_screen_fence_get_fd(struct pipe_screen *_screen, + struct pipe_fence_handle *fence) +{ + struct rbug_screen *rb_screen = rbug_screen(_screen); + struct pipe_screen *screen = rb_screen->screen; + + return screen->fence_get_fd(screen, fence); +} + +static void +rbug_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize) +{ + struct pipe_screen *screen = rbug_screen(_screen)->screen; + + screen->finalize_nir(screen, nir, optimize); +} + bool rbug_enabled() { @@ -350,8 +414,11 @@ rb_screen->base.get_shader_param = rbug_screen_get_shader_param; rb_screen->base.get_paramf = rbug_screen_get_paramf; rb_screen->base.is_format_supported = rbug_screen_is_format_supported; + SCR_INIT(query_dmabuf_modifiers); rb_screen->base.context_create = rbug_screen_context_create; + SCR_INIT(can_create_resource); rb_screen->base.resource_create = rbug_screen_resource_create; + SCR_INIT(resource_create_with_modifiers); rb_screen->base.resource_from_handle = rbug_screen_resource_from_handle; SCR_INIT(check_resource_capability); rb_screen->base.resource_get_handle = rbug_screen_resource_get_handle; @@ -362,6 +429,8 @@ rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer; rb_screen->base.fence_reference = rbug_screen_fence_reference; rb_screen->base.fence_finish = rbug_screen_fence_finish; + rb_screen->base.fence_get_fd = rbug_screen_fence_get_fd; + SCR_INIT(finalize_nir); rb_screen->screen = screen; diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump.c mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump.c --- mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump.c 2020-06-12 01:21:16.000000000 +0000 @@ -50,7 +50,7 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tr_dump.h" #include "tr_screen.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump_defines.h mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump_defines.h --- mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump_defines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump_defines.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,7 @@ #define TR_DUMP_DEFINES_H_ #include "pipe/p_compiler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_dump.h" #include "tr_dump.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump_state.c mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump_state.c --- mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_dump_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_dump_state.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tgsi/tgsi_dump.h" #include "tr_dump.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_screen.c mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_screen.c --- mesa-19.2.8/src/gallium/auxiliary/driver_trace/tr_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/driver_trace/tr_screen.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/simple_list.h" @@ -644,6 +644,14 @@ } static void +trace_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize) +{ + struct pipe_screen *screen = trace_screen(_screen)->screen; + + screen->finalize_nir(screen, nir, optimize); +} + +static void trace_screen_destroy(struct pipe_screen *_screen) { struct trace_screen *tr_scr = trace_screen(_screen); @@ -722,6 +730,7 @@ tr_scr->base.get_timestamp = trace_screen_get_timestamp; SCR_INIT(get_driver_uuid); SCR_INIT(get_device_uuid); + SCR_INIT(finalize_nir); tr_scr->screen = screen; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_arit.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_arit.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_arit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_arit.c 2020-06-12 01:21:16.000000000 +0000 @@ -47,6 +47,8 @@ #include +#include + #include "util/u_memory.h" #include "util/u_debug.h" #include "util/u_math.h" @@ -142,49 +144,6 @@ intrinsic = "llvm.ppc.altivec.vminfp"; intr_size = 128; } - } else if (HAVE_LLVM < 0x0309 && - util_cpu_caps.has_avx2 && type.length > 4) { - intr_size = 256; - switch (type.width) { - case 8: - intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b"; - break; - case 16: - intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w"; - break; - case 32: - intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d"; - break; - } - } else if (HAVE_LLVM < 0x0309 && - util_cpu_caps.has_sse2 && type.length >= 2) { - intr_size = 128; - if ((type.width == 8 || type.width == 16) && - (type.width * type.length <= 64) && - (gallivm_debug & GALLIVM_DEBUG_PERF)) { - debug_printf("%s: inefficient code, bogus shuffle due to packing\n", - __FUNCTION__); - } - if (type.width == 8 && !type.sign) { - intrinsic = "llvm.x86.sse2.pminu.b"; - } - else if (type.width == 16 && type.sign) { - intrinsic = "llvm.x86.sse2.pmins.w"; - } - if (util_cpu_caps.has_sse4_1) { - if (type.width == 8 && type.sign) { - intrinsic = "llvm.x86.sse41.pminsb"; - } - if (type.width == 16 && !type.sign) { - intrinsic = "llvm.x86.sse41.pminuw"; - } - if (type.width == 32 && !type.sign) { - intrinsic = "llvm.x86.sse41.pminud"; - } - if (type.width == 32 && type.sign) { - intrinsic = "llvm.x86.sse41.pminsd"; - } - } } else if (util_cpu_caps.has_altivec) { intr_size = 128; if (type.width == 8) { @@ -285,12 +244,7 @@ LLVMTypeRef type = LLVMTypeOf(a); assert(type == LLVMTypeOf(b)); assert(type == LLVMTypeOf(c)); - if (HAVE_LLVM < 0x0304) { - /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is - * not supported, and instead it falls-back to a C function. - */ - return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, ""); - } + char intrinsic[32]; lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type); LLVMValueRef args[] = { a, b, c }; @@ -360,50 +314,6 @@ intrinsic = "llvm.ppc.altivec.vmaxfp"; intr_size = 128; } - } else if (HAVE_LLVM < 0x0309 && - util_cpu_caps.has_avx2 && type.length > 4) { - intr_size = 256; - switch (type.width) { - case 8: - intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b"; - break; - case 16: - intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w"; - break; - case 32: - intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d"; - break; - } - } else if (HAVE_LLVM < 0x0309 && - util_cpu_caps.has_sse2 && type.length >= 2) { - intr_size = 128; - if ((type.width == 8 || type.width == 16) && - (type.width * type.length <= 64) && - (gallivm_debug & GALLIVM_DEBUG_PERF)) { - debug_printf("%s: inefficient code, bogus shuffle due to packing\n", - __FUNCTION__); - } - if (type.width == 8 && !type.sign) { - intrinsic = "llvm.x86.sse2.pmaxu.b"; - intr_size = 128; - } - else if (type.width == 16 && type.sign) { - intrinsic = "llvm.x86.sse2.pmaxs.w"; - } - if (util_cpu_caps.has_sse4_1) { - if (type.width == 8 && type.sign) { - intrinsic = "llvm.x86.sse41.pmaxsb"; - } - if (type.width == 16 && !type.sign) { - intrinsic = "llvm.x86.sse41.pmaxuw"; - } - if (type.width == 32 && !type.sign) { - intrinsic = "llvm.x86.sse41.pmaxud"; - } - if (type.width == 32 && type.sign) { - intrinsic = "llvm.x86.sse41.pmaxsd"; - } - } } else if (util_cpu_caps.has_altivec) { intr_size = 128; if (type.width == 8) { @@ -555,7 +465,7 @@ return bld->one; if (!type.floating && !type.fixed) { - if (HAVE_LLVM >= 0x0800) { + if (LLVM_VERSION_MAJOR >= 8) { char intrin[32]; intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat"; lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); @@ -879,7 +789,7 @@ return bld->zero; if (!type.floating && !type.fixed) { - if (HAVE_LLVM >= 0x0800) { + if (LLVM_VERSION_MAJOR >= 8) { char intrin[32]; intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat"; lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); @@ -1161,8 +1071,13 @@ * https://llvm.org/bugs/show_bug.cgi?id=30845 * So, whip up our own code, albeit only for length 4 and 8 (which * should be good enough)... + * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern + * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle + * for signed), which the fallback code does not, without this llvm + * will likely still produce atrocious code. */ - if ((bld->type.length == 4 || bld->type.length == 8) && + if (LLVM_VERSION_MAJOR < 7 && + (bld->type.length == 4 || bld->type.length == 8) && ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || util_cpu_caps.has_sse4_1)) { const char *intrinsic = NULL; @@ -1822,23 +1737,12 @@ return a; if(type.floating) { - if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) { - /* Workaround llvm.org/PR27332 */ - LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); - unsigned long long absMask = ~(1ULL << (type.width - 1)); - LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); - a = LLVMBuildBitCast(builder, a, int_vec_type, ""); - a = LLVMBuildAnd(builder, a, mask, ""); - a = LLVMBuildBitCast(builder, a, vec_type, ""); - return a; - } else { - char intrinsic[32]; - lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); - return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); - } + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); } - if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) { + if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) { switch(type.width) { case 8: return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); @@ -1848,7 +1752,7 @@ return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); } } - else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) { + else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) { switch(type.width) { case 8: return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c 2020-06-12 01:21:16.000000000 +0000 @@ -32,7 +32,7 @@ #include "lp_bld_debug.h" #include "lp_bld_const.h" #include "lp_bld_bitarit.h" - +#include "lp_bld_intr.h" /** * Return (a | b) @@ -240,3 +240,56 @@ assert(imm < bld->type.width); return lp_build_shr(bld, a, b); } + +LLVMValueRef +lp_build_popcount(struct lp_build_context *bld, LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef result; + char intr_str[256]; + + lp_format_intrinsic(intr_str, sizeof(intr_str), "llvm.ctpop", bld->vec_type); + result = lp_build_intrinsic_unary(builder, intr_str, bld->vec_type, a); + return result; +} + +LLVMValueRef +lp_build_bitfield_reverse(struct lp_build_context *bld, LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef result; + char intr_str[256]; + + lp_format_intrinsic(intr_str, sizeof(intr_str), "llvm.bitreverse", bld->vec_type); + result = lp_build_intrinsic_unary(builder, intr_str, bld->vec_type, a); + return result; +} + +LLVMValueRef +lp_build_cttz(struct lp_build_context *bld, LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef result; + char intr_str[256]; + + lp_format_intrinsic(intr_str, sizeof(intr_str), "llvm.cttz", bld->vec_type); + + LLVMValueRef undef_val = LLVMConstNull(LLVMInt1TypeInContext(bld->gallivm->context)); + result = lp_build_intrinsic_binary(builder, intr_str, bld->vec_type, a, undef_val); + return LLVMBuildSelect(builder, LLVMBuildICmp(builder, LLVMIntEQ, a, bld->zero, ""), + lp_build_const_int_vec(bld->gallivm, bld->type, -1), result, ""); +} + +LLVMValueRef +lp_build_ctlz(struct lp_build_context *bld, LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef result; + char intr_str[256]; + + lp_format_intrinsic(intr_str, sizeof(intr_str), "llvm.ctlz", bld->vec_type); + + LLVMValueRef undef_val = LLVMConstNull(LLVMInt1TypeInContext(bld->gallivm->context)); + result = lp_build_intrinsic_binary(builder, intr_str, bld->vec_type, a, undef_val); + return result; +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h 2020-06-12 01:21:16.000000000 +0000 @@ -71,4 +71,15 @@ LLVMValueRef lp_build_not(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_popcount(struct lp_build_context *bld, LLVMValueRef a); + +LLVMValueRef +lp_build_bitfield_reverse(struct lp_build_context *bld, LLVMValueRef a); + +LLVMValueRef +lp_build_cttz(struct lp_build_context *bld, LLVMValueRef a); + +LLVMValueRef +lp_build_ctlz(struct lp_build_context *bld, LLVMValueRef a); #endif /* !LP_BLD_ARIT_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_const.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_const.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_const.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_const.h 2020-06-12 01:21:16.000000000 +0000 @@ -126,6 +126,11 @@ return LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), i, 0); } +static inline LLVMValueRef +lp_build_const_int64(struct gallivm_state *gallivm, int64_t i) +{ + return LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), i, 0); +} static inline LLVMValueRef lp_build_const_float(struct gallivm_state *gallivm, float x) diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_coro.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_coro.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_coro.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_coro.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,200 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#include +#include "lp_bld_coro.h" +#include "util/os_memory.h" +#include "lp_bld_init.h" +#include "lp_bld_const.h" +#include "lp_bld_intr.h" +#include "lp_bld_flow.h" + +#if LLVM_VERSION_MAJOR < 6 +/* not a wrapper, just lets it compile */ +static LLVMTypeRef LLVMTokenTypeInContext(LLVMContextRef C) +{ + assert(0); + return LLVMVoidTypeInContext(C); +} +#endif + +LLVMValueRef lp_build_coro_id(struct gallivm_state *gallivm) +{ + LLVMValueRef coro_id_args[4]; + coro_id_args[0] = lp_build_const_int32(gallivm, 0); + coro_id_args[1] = LLVMConstPointerNull(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); + coro_id_args[2] = coro_id_args[1]; + coro_id_args[3] = coro_id_args[1]; + LLVMValueRef coro_id = lp_build_intrinsic(gallivm->builder, + "llvm.coro.id", + LLVMTokenTypeInContext(gallivm->context), + coro_id_args, 4, 0); + return coro_id; +} + +LLVMValueRef lp_build_coro_size(struct gallivm_state *gallivm) +{ + return lp_build_intrinsic(gallivm->builder, + "llvm.coro.size.i32", + LLVMInt32TypeInContext(gallivm->context), + NULL, 0, 0); +} + +LLVMValueRef lp_build_coro_begin(struct gallivm_state *gallivm, + LLVMValueRef coro_id, LLVMValueRef mem_ptr) +{ + LLVMValueRef coro_begin_args[2]; + coro_begin_args[0] = coro_id; + coro_begin_args[1] = mem_ptr; + LLVMValueRef coro_hdl = lp_build_intrinsic(gallivm->builder, + "llvm.coro.begin", + LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), + coro_begin_args, 2, 0); + return coro_hdl; +} + +LLVMValueRef lp_build_coro_free(struct gallivm_state *gallivm, + LLVMValueRef coro_id, LLVMValueRef coro_hdl) +{ + LLVMValueRef coro_free_args[2]; + coro_free_args[0] = coro_id; + coro_free_args[1] = coro_hdl; + return lp_build_intrinsic(gallivm->builder, + "llvm.coro.free", + LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), + coro_free_args, 2, 0); +} + +void lp_build_coro_end(struct gallivm_state *gallivm, LLVMValueRef coro_hdl) +{ + LLVMValueRef coro_end_args[2]; + coro_end_args[0] = coro_hdl; + coro_end_args[1] = LLVMConstInt(LLVMInt1TypeInContext(gallivm->context), 0, 0); + lp_build_intrinsic(gallivm->builder, + "llvm.coro.end", + LLVMInt1TypeInContext(gallivm->context), + coro_end_args, 2, 0); +} + +void lp_build_coro_resume(struct gallivm_state *gallivm, LLVMValueRef coro_hdl) +{ + lp_build_intrinsic(gallivm->builder, + "llvm.coro.resume", + LLVMVoidTypeInContext(gallivm->context), + &coro_hdl, 1, 0); +} + +void lp_build_coro_destroy(struct gallivm_state *gallivm, LLVMValueRef coro_hdl) +{ + lp_build_intrinsic(gallivm->builder, + "llvm.coro.destroy", + LLVMVoidTypeInContext(gallivm->context), + &coro_hdl, 1, 0); +} + +LLVMValueRef lp_build_coro_done(struct gallivm_state *gallivm, LLVMValueRef coro_hdl) +{ + return lp_build_intrinsic(gallivm->builder, + "llvm.coro.done", + LLVMInt1TypeInContext(gallivm->context), + &coro_hdl, 1, 0); +} + +LLVMValueRef lp_build_coro_suspend(struct gallivm_state *gallivm, bool last) +{ + LLVMValueRef coro_susp_args[2]; + coro_susp_args[0] = LLVMConstNull(LLVMTokenTypeInContext(gallivm->context)); + coro_susp_args[1] = LLVMConstInt(LLVMInt1TypeInContext(gallivm->context), last, 0); + LLVMValueRef coro_suspend = lp_build_intrinsic(gallivm->builder, + "llvm.coro.suspend", + LLVMInt8TypeInContext(gallivm->context), + coro_susp_args, 2, 0); + return coro_suspend; +} + +LLVMValueRef lp_build_coro_alloc(struct gallivm_state *gallivm, LLVMValueRef id) +{ + return lp_build_intrinsic(gallivm->builder, + "llvm.coro.alloc", + LLVMInt1TypeInContext(gallivm->context), + &id, 1, 0); +} + +static char * +coro_malloc(int size) +{ + return os_malloc_aligned(size, 4096); +} + +static void +coro_free(char *ptr) +{ + os_free_aligned(ptr); +} + +LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id) +{ + LLVMValueRef do_alloc = lp_build_coro_alloc(gallivm, coro_id); + LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMValueRef alloc_mem_store = lp_build_alloca(gallivm, mem_ptr_type, "coro mem"); + struct lp_build_if_state if_state_coro; + lp_build_if(&if_state_coro, gallivm, do_alloc); + LLVMValueRef coro_size = lp_build_coro_size(gallivm); + LLVMValueRef alloc_mem; + LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); + + LLVMTypeRef malloc_type = LLVMFunctionType(mem_ptr_type, &int32_type, 1, 0); + + LLVMValueRef func_malloc = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)coro_malloc)); + func_malloc = LLVMBuildBitCast(gallivm->builder, func_malloc, LLVMPointerType(malloc_type, 0), "coro_malloc"); + alloc_mem = LLVMBuildCall(gallivm->builder, func_malloc, &coro_size, 1, ""); + + LLVMBuildStore(gallivm->builder, alloc_mem, alloc_mem_store); + lp_build_endif(&if_state_coro); + alloc_mem = LLVMBuildLoad(gallivm->builder, alloc_mem_store, ""); + LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloc_mem); + return coro_hdl; +} + +void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl) +{ + LLVMValueRef alloc_mem = lp_build_coro_free(gallivm, coro_id, coro_hdl); + LLVMTypeRef ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMTypeRef free_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), &ptr_type, 1, 0); + LLVMValueRef func_free = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)coro_free)); + func_free = LLVMBuildBitCast(gallivm->builder, func_free, LLVMPointerType(free_type, 0), "coro_free"); + alloc_mem = LLVMBuildCall(gallivm->builder, func_free, &alloc_mem, 1, ""); +} + +void lp_build_coro_suspend_switch(struct gallivm_state *gallivm, const struct lp_build_coro_suspend_info *sus_info, + LLVMBasicBlockRef resume_block, bool final_suspend) +{ + LLVMValueRef coro_suspend = lp_build_coro_suspend(gallivm, final_suspend); + LLVMValueRef myswitch = LLVMBuildSwitch(gallivm->builder, coro_suspend, + sus_info->suspend, resume_block ? 2 : 1); + LLVMAddCase(myswitch, LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 1, 0), sus_info->cleanup); + if (resume_block) + LLVMAddCase(myswitch, LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 0, 0), resume_block); +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_coro.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_coro.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_coro.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_coro.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,69 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_BLD_CORO_H +#define LP_BLD_CORO_H + +#include +#include "pipe/p_compiler.h" +#include "gallivm/lp_bld.h" + +struct gallivm_state; +LLVMValueRef lp_build_coro_id(struct gallivm_state *gallivm); + +LLVMValueRef lp_build_coro_size(struct gallivm_state *gallivm); + +LLVMValueRef lp_build_coro_begin(struct gallivm_state *gallivm, + LLVMValueRef coro_id, LLVMValueRef mem_ptr); + +LLVMValueRef lp_build_coro_free(struct gallivm_state *gallivm, + LLVMValueRef coro_id, LLVMValueRef coro_hdl); + +void lp_build_coro_end(struct gallivm_state *gallivm, + LLVMValueRef coro_hdl); + +void lp_build_coro_resume(struct gallivm_state *gallivm, LLVMValueRef coro_hdl); + +void lp_build_coro_destroy(struct gallivm_state *gallivm, LLVMValueRef coro_hdl); + +LLVMValueRef lp_build_coro_done(struct gallivm_state *gallivm, LLVMValueRef coro_hdl); + +LLVMValueRef lp_build_coro_suspend(struct gallivm_state *gallivm, bool last); + +LLVMValueRef lp_build_coro_alloc(struct gallivm_state *gallivm, LLVMValueRef id); + +LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id); +void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl); + +struct lp_build_coro_suspend_info { + LLVMBasicBlockRef suspend; + LLVMBasicBlockRef cleanup; +}; + +void lp_build_coro_suspend_switch(struct gallivm_state *gallivm, + const struct lp_build_coro_suspend_info *sus_info, + LLVMBasicBlockRef resume_block, + bool final_suspend); +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -71,20 +72,11 @@ extern "C" void lp_debug_dump_value(LLVMValueRef value) { -#if HAVE_LLVM >= 0x0304 char *str = LLVMPrintValueToString(value); if (str) { os_log_message(str); LLVMDisposeMessage(str); } -#elif defined(PIPE_OS_WINDOWS) || defined(PIPE_OS_EMBEDDED) - std::string str; - llvm::raw_string_ostream os(str); - llvm::unwrap(value)->print(os); - os_log_message(str.c_str()); -#else - LLVMDumpValue(value); -#endif } diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_flow.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_flow.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_flow.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_flow.c 2020-06-12 01:21:16.000000000 +0000 @@ -265,6 +265,20 @@ state->counter = LLVMBuildLoad(builder, state->counter_var, ""); } +void +lp_build_loop_force_set_counter(struct lp_build_loop_state *state, + LLVMValueRef end) +{ + LLVMBuilderRef builder = state->gallivm->builder; + LLVMBuildStore(builder, end, state->counter_var); +} + +void +lp_build_loop_force_reload_counter(struct lp_build_loop_state *state) +{ + LLVMBuilderRef builder = state->gallivm->builder; + state->counter = LLVMBuildLoad(builder, state->counter_var, ""); +} void lp_build_loop_end(struct lp_build_loop_state *state, diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_flow.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_flow.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_flow.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_flow.h 2020-06-12 01:21:16.000000000 +0000 @@ -128,6 +128,12 @@ LLVMValueRef step); void +lp_build_loop_force_set_counter(struct lp_build_loop_state *state, + LLVMValueRef end); + +void +lp_build_loop_force_reload_counter(struct lp_build_loop_state *state); +void lp_build_loop_end_cond(struct lp_build_loop_state *state, LLVMValueRef end, LLVMValueRef step, diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@ #include "lp_bld_gather.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "pipe/p_state.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c 2020-06-12 01:21:16.000000000 +0000 @@ -33,7 +33,7 @@ */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_pointer.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_float.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_float.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_float.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_float.c 2020-06-12 01:21:16.000000000 +0000 @@ -235,7 +235,7 @@ */ LLVMValueRef lp_build_float_to_r11g11b10(struct gallivm_state *gallivm, - LLVMValueRef *src) + const LLVMValueRef *src) { LLVMValueRef dst, rcomp, bcomp, gcomp; struct lp_build_context i32_bld; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format.h 2020-06-12 01:21:16.000000000 +0000 @@ -151,6 +151,16 @@ LLVMValueRef cache, LLVMValueRef rgba_out[4]); +void +lp_build_store_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef exec_mask, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef out_of_bounds, + const LLVMValueRef rgba_in[4]); + /* * YUV */ @@ -204,7 +214,7 @@ LLVMValueRef lp_build_float_to_r11g11b10(struct gallivm_state *gallivm, - LLVMValueRef *src); + const LLVMValueRef *src); void lp_build_r11g11b10_to_float(struct gallivm_state *gallivm, diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,7 +34,9 @@ */ -#include "util/u_format.h" +#include + +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_string.h" #include "util/u_cpu_detect.h" @@ -465,7 +467,7 @@ LLVMBuilderRef builder = gallivm->builder; assert(bld8->type.width == 8); assert(bld8->type.length == 16 || bld8->type.length == 32); - if (HAVE_LLVM < 0x0600) { + if (LLVM_VERSION_MAJOR < 6) { LLVMValueRef intrargs[2]; char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" : "llvm.x86.sse2.pavg.b"; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" #include "util/u_math.h" @@ -42,7 +42,9 @@ #include "lp_bld_format.h" #include "lp_bld_arit.h" #include "lp_bld_pack.h" - +#include "lp_bld_flow.h" +#include "lp_bld_printf.h" +#include "lp_bld_intr.h" static void convert_to_soa(struct gallivm_state *gallivm, @@ -365,7 +367,7 @@ /* Decode the input vector components */ for (chan = 0; chan < 4; ++chan) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned start = chan*8; #else unsigned start = (3-chan)*8; @@ -652,7 +654,7 @@ unsigned blockbits = type.width; unsigned vec_nr; -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; #else vec_nr = chan_desc.shift / type.width; @@ -858,3 +860,231 @@ convert_to_soa(gallivm, aos_fetch, rgba_out, type); } } + +static void +lp_build_insert_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + struct util_format_channel_description chan_desc, + LLVMValueRef *output, + LLVMValueRef rgba) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + LLVMValueRef chan; + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_UNSIGNED: + + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + if (chan_desc.normalized) + chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba); + else + chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + uint32_t mask_val = (1UL << chan_desc.size) - 1; + if (chan_desc.normalized) { + char intrin[32]; + double scale = ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one); + rgba = LLVMBuildFMul(builder, rgba, scale_val, ""); + lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type); + rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba); + } + chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, ""); + chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + chan = lp_build_float_to_half(gallivm, rgba); + chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, ""); + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + } + } else + assert(0); + break; + default: + assert(0); + *output = bld->undef; + } +} + +static void +lp_build_pack_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + const LLVMValueRef rgba_in[4], + LLVMValueRef *packed) +{ + unsigned chan; + struct lp_build_context bld; + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + assert(format_desc->block.bits <= type.width); + /* FIXME: Support more output types */ + assert(type.width == 32); + + lp_build_context_init(&bld, gallivm, type); + for (chan = 0; chan < format_desc->nr_channels; ++chan) { + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + + lp_build_insert_soa_chan(&bld, format_desc->block.bits, + chan_desc, + packed, + rgba_in[chan]); + } +} + +void +lp_build_store_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef exec_mask, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef out_of_bounds, + const LLVMValueRef rgba_in[4]) +{ + enum pipe_format format = format_desc->format; + LLVMValueRef packed[4]; + unsigned num_stores; + + memset(packed, 0, sizeof(LLVMValueRef) * 4); + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits <= type.width && + (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) + { + lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]); + + num_stores = 1; + } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. + */ + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, type); + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + unsigned store_width = util_next_power_of_two(format_desc->block.bits); + num_stores = store_width / type.width; + for (unsigned i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + + vec_nr = chan_desc.shift / type.width; + chan_desc.shift %= type.width; + + lp_build_insert_soa_chan(&bld, blockbits, + chan_desc, + &packed[vec_nr], + rgba_in[i]); + } + + assert(num_stores == 4 || num_stores == 2); + /* we can transpose and store at the same time */ + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in); + num_stores = 1; + } else + assert(0); + + assert(exec_mask); + + LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0); + LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0); + LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + + LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask"); + should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), ""); + for (unsigned i = 0; i < num_stores; i++) { + struct lp_build_loop_state loop_state; + + LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), ""); + store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, ""); + + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + struct lp_build_if_state ifthen; + LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, ""); + LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, ""); + + if (format_desc->block.bits == 8) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), ""); + } else if (format_desc->block.bits == 16) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), ""); + } else + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, ""); + LLVMBuildStore(gallivm->builder, data, this_offset); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length), + NULL, LLVMIntUGE); + } +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,7 +34,7 @@ */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_cpu_detect.h" #include "lp_bld_arit.h" @@ -104,7 +104,7 @@ #endif { LLVMValueRef shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), ""); shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 8), ""); #else @@ -114,7 +114,7 @@ *y = LLVMBuildLShr(builder, packed, shift, ""); } -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN *u = packed; *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), ""); #else @@ -187,7 +187,7 @@ #endif { LLVMValueRef shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), ""); #else shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), ""); @@ -196,7 +196,7 @@ *y = LLVMBuildLShr(builder, packed, shift, ""); } -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), ""); *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), ""); #else @@ -334,7 +334,7 @@ * Make a 4 x unorm8 vector */ -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN r = r; g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 8), ""); b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 16), ""); diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_gather.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_gather.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_gather.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_gather.c 2020-06-12 01:21:16.000000000 +0000 @@ -141,7 +141,7 @@ if (src_width < dst_width) { res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); if (vector_justify) { -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN res = LLVMBuildShl(gallivm->builder, res, LLVMConstInt(dst_elem_type, dst_width - src_width, 0), ""); #endif @@ -234,7 +234,7 @@ */ res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN if (vector_justify) { res = LLVMBuildShl(gallivm->builder, res, LLVMConstInt(dst_elem_type, @@ -553,7 +553,7 @@ if (vec_zext) { res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); if (vector_justify) { -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN unsigned sv = dst_type.width - src_width; res = LLVMBuildShl(gallivm->builder, res, lp_build_const_int_vec(gallivm, res_type, sv), ""); diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld.h 2020-06-12 01:21:16.000000000 +0000 @@ -46,24 +46,10 @@ * for a standalone example. */ -#include - - -/** Ensure HAVE_LLVM is set to avoid #ifdef HAVE_LLVM everywhere */ -#ifndef HAVE_LLVM -#error "HAVE_LLVM should be set with LLVM's version number, e.g. (0x0207 for 2.7)" -#endif -#if HAVE_LLVM < 0x303 -#error "LLVM 3.3 or newer required" -#endif +#include +#include -#if HAVE_LLVM <= 0x0303 -/* We won't actually use LLVMMCJITMemoryManagerRef, just create a dummy - * typedef to simplify things elsewhere. - */ -typedef void *LLVMMCJITMemoryManagerRef; -#endif /** @@ -95,17 +81,10 @@ #define LLVMInsertBasicBlock ILLEGAL_LLVM_FUNCTION #define LLVMCreateBuilder ILLEGAL_LLVM_FUNCTION - -/* - * Before LLVM 3.4 LLVMSetAlignment only supported GlobalValue, not - * LoadInst/StoreInst as we need. - */ -#if HAVE_LLVM < 0x0304 -# ifdef __cplusplus - extern "C" -# endif - void LLVMSetAlignmentBackport(LLVMValueRef V, unsigned Bytes); -# define LLVMSetAlignment LLVMSetAlignmentBackport +#if LLVM_VERSION_MAJOR >= 8 +#define GALLIVM_HAVE_CORO 1 +#else +#define GALLIVM_HAVE_CORO 0 #endif #endif /* LP_BLD_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_init.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_init.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_init.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_init.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,25 +38,18 @@ #include "lp_bld_misc.h" #include "lp_bld_init.h" +#include #include #include -#if HAVE_LLVM >= 0x0700 +#if LLVM_VERSION_MAJOR >= 7 #include #endif #include - - -/* Only MCJIT is available as of LLVM SVN r216982 */ -#if HAVE_LLVM >= 0x0306 -# define USE_MCJIT 1 -#elif defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) -# define USE_MCJIT 1 +#if GALLIVM_HAVE_CORO +#if LLVM_VERSION_MAJOR <= 8 && defined(PIPE_ARCH_AARCH64) +#include #endif - -#if defined(USE_MCJIT) -static const bool use_mcjit = USE_MCJIT; -#else -static bool use_mcjit = FALSE; +#include #endif unsigned gallivm_perf = 0; @@ -125,17 +118,16 @@ gallivm->passmgr = LLVMCreateFunctionPassManagerForModule(gallivm->module); if (!gallivm->passmgr) return FALSE; + +#if GALLIVM_HAVE_CORO + gallivm->cgpassmgr = LLVMCreatePassManager(); +#endif /* * TODO: some per module pass manager with IPO passes might be helpful - * the generated texture functions may benefit from inlining if they are * simple, or constant propagation into them, etc. */ -#if HAVE_LLVM < 0x0309 - // Old versions of LLVM get the DataLayout from the pass manager. - LLVMAddTargetData(gallivm->target, gallivm->passmgr); -#endif - { char *td_str; // New ones from the Module. @@ -144,6 +136,15 @@ free(td_str); } +#if GALLIVM_HAVE_CORO +#if LLVM_VERSION_MAJOR <= 8 && defined(PIPE_ARCH_AARCH64) + LLVMAddFunctionAttrsPass(gallivm->cgpassmgr); +#endif + LLVMAddCoroEarlyPass(gallivm->cgpassmgr); + LLVMAddCoroSplitPass(gallivm->cgpassmgr); + LLVMAddCoroElidePass(gallivm->cgpassmgr); +#endif + if ((gallivm_perf & GALLIVM_PERF_NO_OPT) == 0) { /* * TODO: Evaluate passes some more - keeping in mind @@ -170,6 +171,9 @@ LLVMAddConstantPropagationPass(gallivm->passmgr); LLVMAddInstructionCombiningPass(gallivm->passmgr); LLVMAddGVNPass(gallivm->passmgr); +#if GALLIVM_HAVE_CORO + LLVMAddCoroCleanupPass(gallivm->passmgr); +#endif } else { /* We need at least this pass to prevent the backends to fail in @@ -193,6 +197,12 @@ LLVMDisposePassManager(gallivm->passmgr); } +#if GALLIVM_HAVE_CORO + if (gallivm->cgpassmgr) { + LLVMDisposePassManager(gallivm->cgpassmgr); + } +#endif + if (gallivm->engine) { /* This will already destroy any associated module */ LLVMDisposeExecutionEngine(gallivm->engine); @@ -202,12 +212,8 @@ FREE(gallivm->module_name); - if (!use_mcjit) { - /* Don't free the TargetData, it's owned by the exec engine */ - } else { - if (gallivm->target) { - LLVMDisposeTargetData(gallivm->target); - } + if (gallivm->target) { + LLVMDisposeTargetData(gallivm->target); } if (gallivm->builder) @@ -219,6 +225,7 @@ gallivm->target = NULL; gallivm->module = NULL; gallivm->module_name = NULL; + gallivm->cgpassmgr = NULL; gallivm->passmgr = NULL; gallivm->context = NULL; gallivm->builder = NULL; @@ -260,7 +267,6 @@ gallivm->module, gallivm->memorymgr, (unsigned) optlevel, - use_mcjit, &error); if (ret) { _debug_printf("%s\n", error); @@ -269,31 +275,25 @@ } } - if (!use_mcjit) { - gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); - if (!gallivm->target) - goto fail; - } else { - if (0) { - /* - * Dump the data layout strings. - */ - - LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine); - char *data_layout; - char *engine_data_layout; - - data_layout = LLVMCopyStringRepOfTargetData(gallivm->target); - engine_data_layout = LLVMCopyStringRepOfTargetData(target); - - if (1) { - debug_printf("module target data = %s\n", data_layout); - debug_printf("engine target data = %s\n", engine_data_layout); - } + if (0) { + /* + * Dump the data layout strings. + */ + + LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine); + char *data_layout; + char *engine_data_layout; + + data_layout = LLVMCopyStringRepOfTargetData(gallivm->target); + engine_data_layout = LLVMCopyStringRepOfTargetData(target); + + if (1) { + debug_printf("module target data = %s\n", data_layout); + debug_printf("engine target data = %s\n", engine_data_layout); + } - free(data_layout); - free(engine_data_layout); - } + free(data_layout); + free(engine_data_layout); } return TRUE; @@ -348,44 +348,39 @@ * complete when MC-JIT is created. So defer the MC-JIT engine creation for * now. */ - if (!use_mcjit) { - if (!init_gallivm_engine(gallivm)) { - goto fail; - } - } else { - /* - * MC-JIT engine compiles the module immediately on creation, so we can't - * obtain the target data from it. Instead we create a target data layout - * from a string. - * - * The produced layout strings are not precisely the same, but should make - * no difference for the kind of optimization passes we run. - * - * For reference this is the layout string on x64: - * - * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64 - * - * See also: - * - http://llvm.org/docs/LangRef.html#datalayout - */ - { - const unsigned pointer_size = 8 * sizeof(void *); - char layout[512]; - snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u", -#ifdef PIPE_ARCH_LITTLE_ENDIAN - 'e', // little endian + /* + * MC-JIT engine compiles the module immediately on creation, so we can't + * obtain the target data from it. Instead we create a target data layout + * from a string. + * + * The produced layout strings are not precisely the same, but should make + * no difference for the kind of optimization passes we run. + * + * For reference this is the layout string on x64: + * + * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64 + * + * See also: + * - http://llvm.org/docs/LangRef.html#datalayout + */ + + { + const unsigned pointer_size = 8 * sizeof(void *); + char layout[512]; + snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u", +#if UTIL_ARCH_LITTLE_ENDIAN + 'e', // little endian #else - 'E', // big endian + 'E', // big endian #endif - pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment - pointer_size, // aggregate preferred alignment - pointer_size, pointer_size); // stack objects abi alignment, preferred alignment - - gallivm->target = LLVMCreateTargetData(layout); - if (!gallivm->target) { - return FALSE; - } + pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment + pointer_size, // aggregate preferred alignment + pointer_size, pointer_size); // stack objects abi alignment, preferred alignment + + gallivm->target = LLVMCreateTargetData(layout); + if (!gallivm->target) { + return FALSE; } } @@ -412,17 +407,7 @@ * component is linked at buildtime, which is sufficient for its static * constructors to be called at load time. */ -#if defined(USE_MCJIT) -# if USE_MCJIT - LLVMLinkInMCJIT(); -# else - LLVMLinkInJIT(); -# endif -#else - use_mcjit = debug_get_bool_option("GALLIVM_MCJIT", FALSE); - LLVMLinkInJIT(); LLVMLinkInMCJIT(); -#endif #ifdef DEBUG gallivm_debug = debug_get_option_gallivm_debug(); @@ -482,11 +467,6 @@ util_cpu_caps.has_f16c = 0; util_cpu_caps.has_fma = 0; } - if (HAVE_LLVM < 0x0304 || !use_mcjit) { - /* AVX2 support has only been tested with LLVM 3.4, and it requires - * MCJIT. */ - util_cpu_caps.has_avx2 = 0; - } #ifdef PIPE_ARCH_PPC_64 /* Set the NJ bit in VSCR to 0 so denormalized values are handled as @@ -603,13 +583,16 @@ "-sroa -early-cse -simplifycfg -reassociate " "-mem2reg -constprop -instcombine -gvn", filename, gallivm_debug & GALLIVM_PERF_NO_OPT ? 0 : 2, - (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "", + "[-mcpu=<-mcpu option>] ", "[-mattr=<-mattr option(s)>]"); } if (gallivm_debug & GALLIVM_DEBUG_PERF) time_begin = os_time_get(); +#if GALLIVM_HAVE_CORO + LLVMRunPassManager(gallivm->cgpassmgr, gallivm->module); +#endif /* Run optimization passes */ LLVMInitializeFunctionPassManager(gallivm->passmgr); func = LLVMGetFirstFunction(gallivm->module); @@ -620,9 +603,7 @@ /* Disable frame pointer omission on debug/profile builds */ /* XXX: And workaround http://llvm.org/PR21435 */ -#if HAVE_LLVM >= 0x0307 && \ - (defined(DEBUG) || defined(PROFILE) || \ - defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) +#if defined(DEBUG) || defined(PROFILE) || defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) LLVMAddTargetDependentFunctionAttr(func, "no-frame-pointer-elim", "true"); LLVMAddTargetDependentFunctionAttr(func, "no-frame-pointer-elim-non-leaf", "true"); #endif @@ -640,29 +621,27 @@ gallivm->module_name, time_msec); } - if (use_mcjit) { - /* Setting the module's DataLayout to an empty string will cause the - * ExecutionEngine to copy to the DataLayout string from its target - * machine to the module. As of LLVM 3.8 the module and the execution - * engine are required to have the same DataLayout. - * - * We must make sure we do this after running the optimization passes, - * because those passes need a correct datalayout string. For example, - * if those optimization passes see an empty datalayout, they will assume - * this is a little endian target and will do optimizations that break big - * endian machines. - * - * TODO: This is just a temporary work-around. The correct solution is - * for gallivm_init_state() to create a TargetMachine and pull the - * DataLayout from there. Currently, the TargetMachine used by llvmpipe - * is being implicitly created by the EngineBuilder in - * lp_build_create_jit_compiler_for_module() - */ - LLVMSetDataLayout(gallivm->module, ""); - assert(!gallivm->engine); - if (!init_gallivm_engine(gallivm)) { - assert(0); - } + /* Setting the module's DataLayout to an empty string will cause the + * ExecutionEngine to copy to the DataLayout string from its target machine + * to the module. As of LLVM 3.8 the module and the execution engine are + * required to have the same DataLayout. + * + * We must make sure we do this after running the optimization passes, + * because those passes need a correct datalayout string. For example, if + * those optimization passes see an empty datalayout, they will assume this + * is a little endian target and will do optimizations that break big endian + * machines. + * + * TODO: This is just a temporary work-around. The correct solution is for + * gallivm_init_state() to create a TargetMachine and pull the DataLayout + * from there. Currently, the TargetMachine used by llvmpipe is being + * implicitly created by the EngineBuilder in + * lp_build_create_jit_compiler_for_module() + */ + LLVMSetDataLayout(gallivm->module, ""); + assert(!gallivm->engine); + if (!init_gallivm_engine(gallivm)) { + assert(0); } assert(gallivm->engine); diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_init.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_init.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_init.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_init.h 2020-06-12 01:21:16.000000000 +0000 @@ -46,6 +46,7 @@ LLVMExecutionEngineRef engine; LLVMTargetDataRef target; LLVMPassManagerRef passmgr; + LLVMPassManagerRef cgpassmgr; LLVMContextRef context; LLVMBuilderRef builder; LLVMMCJITMemoryManagerRef memorymgr; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_intr.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_intr.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_intr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_intr.c 2020-06-12 01:21:16.000000000 +0000 @@ -43,6 +43,7 @@ * @author Jose Fonseca */ +#include #include "util/u_debug.h" #include "util/u_string.h" @@ -121,7 +122,7 @@ } -#if HAVE_LLVM < 0x0400 +#if LLVM_VERSION_MAJOR < 4 static LLVMAttribute lp_attr_to_llvm_attr(enum lp_func_attr attr) { switch (attr) { @@ -164,7 +165,7 @@ int attr_idx, enum lp_func_attr attr) { -#if HAVE_LLVM < 0x0400 +#if LLVM_VERSION_MAJOR < 4 LLVMAttribute llvm_attr = lp_attr_to_llvm_attr(attr); if (LLVMIsAFunction(function_or_call)) { if (attr_idx == -1) { @@ -224,7 +225,7 @@ { LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); LLVMValueRef function, call; - bool set_callsite_attrs = HAVE_LLVM >= 0x0400 && + bool set_callsite_attrs = LLVM_VERSION_MAJOR >= 4 && !(attr_mask & LP_FUNC_ATTR_LEGACY); function = LLVMGetNamedFunction(module, name); @@ -246,8 +247,9 @@ * than a call to address zero in the jited code). */ if (LLVMGetIntrinsicID(function) == 0) { - _debug_printf("llvm (version 0x%x) found no intrinsic for %s, going to crash...\n", - HAVE_LLVM, name); + _debug_printf("llvm (version " MESA_LLVM_VERSION_STRING + ") found no intrinsic for %s, going to crash...\n", + name); abort(); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_intr.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_intr.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_intr.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_intr.h 2020-06-12 01:21:16.000000000 +0000 @@ -36,6 +36,7 @@ #ifndef LP_BLD_INTR_H #define LP_BLD_INTR_H +#include #include "gallivm/lp_bld.h" #include "gallivm/lp_bld_init.h" @@ -53,9 +54,9 @@ LP_FUNC_ATTR_NOUNWIND = (1 << 4), LP_FUNC_ATTR_READNONE = (1 << 5), LP_FUNC_ATTR_READONLY = (1 << 6), - LP_FUNC_ATTR_WRITEONLY = HAVE_LLVM >= 0x0400 ? (1 << 7) : 0, - LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = HAVE_LLVM >= 0x0400 ? (1 << 8) : 0, - LP_FUNC_ATTR_CONVERGENT = HAVE_LLVM >= 0x0400 ? (1 << 9) : 0, + LP_FUNC_ATTR_WRITEONLY = LLVM_VERSION_MAJOR >= 4 ? (1 << 7) : 0, + LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = LLVM_VERSION_MAJOR >= 4 ? (1 << 8) : 0, + LP_FUNC_ATTR_CONVERGENT = LLVM_VERSION_MAJOR >= 4 ? (1 << 9) : 0, /* Legacy intrinsic that needs attributes on function declarations * and they must match the internal LLVM definition exactly, otherwise diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,466 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007-2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "lp_bld_type.h" +#include "lp_bld_init.h" +#include "lp_bld_flow.h" +#include "lp_bld_ir_common.h" +#include "lp_bld_logic.h" + +/* + * Return the context for the current function. + * (always 'main', if shader doesn't do any function calls) + */ +static inline struct function_ctx * +func_ctx(struct lp_exec_mask *mask) +{ + assert(mask->function_stack_size > 0); + assert(mask->function_stack_size <= LP_MAX_NUM_FUNCS); + return &mask->function_stack[mask->function_stack_size - 1]; +} + +/* + * Returns true if we're in a loop. + * It's global, meaning that it returns true even if there's + * no loop inside the current function, but we were inside + * a loop inside another function, from which this one was called. + */ +static inline boolean +mask_has_loop(struct lp_exec_mask *mask) +{ + int i; + for (i = mask->function_stack_size - 1; i >= 0; --i) { + const struct function_ctx *ctx = &mask->function_stack[i]; + if (ctx->loop_stack_size > 0) + return TRUE; + } + return FALSE; +} + +/* + * Returns true if we're inside a switch statement. + * It's global, meaning that it returns true even if there's + * no switch in the current function, but we were inside + * a switch inside another function, from which this one was called. + */ +static inline boolean +mask_has_switch(struct lp_exec_mask *mask) +{ + int i; + for (i = mask->function_stack_size - 1; i >= 0; --i) { + const struct function_ctx *ctx = &mask->function_stack[i]; + if (ctx->switch_stack_size > 0) + return TRUE; + } + return FALSE; +} + +/* + * Returns true if we're inside a conditional. + * It's global, meaning that it returns true even if there's + * no conditional in the current function, but we were inside + * a conditional inside another function, from which this one was called. + */ +static inline boolean +mask_has_cond(struct lp_exec_mask *mask) +{ + int i; + for (i = mask->function_stack_size - 1; i >= 0; --i) { + const struct function_ctx *ctx = &mask->function_stack[i]; + if (ctx->cond_stack_size > 0) + return TRUE; + } + return FALSE; +} + +void lp_exec_mask_update(struct lp_exec_mask *mask) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + boolean has_loop_mask = mask_has_loop(mask); + boolean has_cond_mask = mask_has_cond(mask); + boolean has_switch_mask = mask_has_switch(mask); + boolean has_ret_mask = mask->function_stack_size > 1 || + mask->ret_in_main; + + if (has_loop_mask) { + /*for loops we need to update the entire mask at runtime */ + LLVMValueRef tmp; + assert(mask->break_mask); + tmp = LLVMBuildAnd(builder, + mask->cont_mask, + mask->break_mask, + "maskcb"); + mask->exec_mask = LLVMBuildAnd(builder, + mask->cond_mask, + tmp, + "maskfull"); + } else + mask->exec_mask = mask->cond_mask; + + if (has_switch_mask) { + mask->exec_mask = LLVMBuildAnd(builder, + mask->exec_mask, + mask->switch_mask, + "switchmask"); + } + + if (has_ret_mask) { + mask->exec_mask = LLVMBuildAnd(builder, + mask->exec_mask, + mask->ret_mask, + "callmask"); + } + + mask->has_mask = (has_cond_mask || + has_loop_mask || + has_switch_mask || + has_ret_mask); +} + +/* + * Initialize a function context at the specified index. + */ +void +lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx) +{ + LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context); + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = &mask->function_stack[function_idx]; + + ctx->cond_stack_size = 0; + ctx->loop_stack_size = 0; + ctx->bgnloop_stack_size = 0; + ctx->switch_stack_size = 0; + + if (function_idx == 0) { + ctx->ret_mask = mask->ret_mask; + } + + ctx->loop_limiter = lp_build_alloca(mask->bld->gallivm, + int_type, "looplimiter"); + LLVMBuildStore( + builder, + LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false), + ctx->loop_limiter); +} + +void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) +{ + mask->bld = bld; + mask->has_mask = FALSE; + mask->ret_in_main = FALSE; + /* For the main function */ + mask->function_stack_size = 1; + + mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type); + mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = + mask->cond_mask = mask->switch_mask = + LLVMConstAllOnes(mask->int_vec_type); + + mask->function_stack = CALLOC(LP_MAX_NUM_FUNCS, + sizeof(mask->function_stack[0])); + lp_exec_mask_function_init(mask, 0); +} + +void +lp_exec_mask_fini(struct lp_exec_mask *mask) +{ + FREE(mask->function_stack); +} + +/* stores val into an address pointed to by dst_ptr. + * mask->exec_mask is used to figure out which bits of val + * should be stored into the address + * (0 means don't store this bit, 1 means do store). + */ +void lp_exec_mask_store(struct lp_exec_mask *mask, + struct lp_build_context *bld_store, + LLVMValueRef val, + LLVMValueRef dst_ptr) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL; + + assert(lp_check_value(bld_store->type, val)); + assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind); + assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) || + LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == LLVMArrayTypeKind); + + if (exec_mask) { + LLVMValueRef res, dst; + + dst = LLVMBuildLoad(builder, dst_ptr, ""); + res = lp_build_select(bld_store, exec_mask, val, dst); + LLVMBuildStore(builder, res, dst_ptr); + } else + LLVMBuildStore(builder, val, dst_ptr); +} + +void lp_exec_bgnloop_post_phi(struct lp_exec_mask *mask) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + + if (ctx->loop_stack_size != ctx->bgnloop_stack_size) { + mask->break_mask = LLVMBuildLoad(builder, ctx->break_var, ""); + lp_exec_mask_update(mask); + ctx->bgnloop_stack_size = ctx->loop_stack_size; + } +} + +void lp_exec_bgnloop(struct lp_exec_mask *mask, bool load) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + + if (ctx->loop_stack_size >= LP_MAX_TGSI_NESTING) { + ++ctx->loop_stack_size; + return; + } + + ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] = + ctx->break_type; + ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP; + + ctx->loop_stack[ctx->loop_stack_size].loop_block = ctx->loop_block; + ctx->loop_stack[ctx->loop_stack_size].cont_mask = mask->cont_mask; + ctx->loop_stack[ctx->loop_stack_size].break_mask = mask->break_mask; + ctx->loop_stack[ctx->loop_stack_size].break_var = ctx->break_var; + ++ctx->loop_stack_size; + + ctx->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, ""); + LLVMBuildStore(builder, mask->break_mask, ctx->break_var); + + ctx->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop"); + + LLVMBuildBr(builder, ctx->loop_block); + LLVMPositionBuilderAtEnd(builder, ctx->loop_block); + + if (load) { + lp_exec_bgnloop_post_phi(mask); + } +} + +void lp_exec_endloop(struct gallivm_state *gallivm, + struct lp_exec_mask *mask) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + LLVMBasicBlockRef endloop; + LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context); + LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context, + mask->bld->type.width * + mask->bld->type.length); + LLVMValueRef i1cond, i2cond, icond, limiter; + + assert(mask->break_mask); + + assert(ctx->loop_stack_size); + if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) { + --ctx->loop_stack_size; + --ctx->bgnloop_stack_size; + return; + } + + /* + * Restore the cont_mask, but don't pop + */ + mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask; + lp_exec_mask_update(mask); + + /* + * Unlike the continue mask, the break_mask must be preserved across loop + * iterations + */ + LLVMBuildStore(builder, mask->break_mask, ctx->break_var); + + /* Decrement the loop limiter */ + limiter = LLVMBuildLoad(builder, ctx->loop_limiter, ""); + + limiter = LLVMBuildSub( + builder, + limiter, + LLVMConstInt(int_type, 1, false), + ""); + + LLVMBuildStore(builder, limiter, ctx->loop_limiter); + + /* i1cond = (mask != 0) */ + i1cond = LLVMBuildICmp( + builder, + LLVMIntNE, + LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""), + LLVMConstNull(reg_type), "i1cond"); + + /* i2cond = (looplimiter > 0) */ + i2cond = LLVMBuildICmp( + builder, + LLVMIntSGT, + limiter, + LLVMConstNull(int_type), "i2cond"); + + /* if( i1cond && i2cond ) */ + icond = LLVMBuildAnd(builder, i1cond, i2cond, ""); + + endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop"); + + LLVMBuildCondBr(builder, + icond, ctx->loop_block, endloop); + + LLVMPositionBuilderAtEnd(builder, endloop); + + assert(ctx->loop_stack_size); + --ctx->loop_stack_size; + --ctx->bgnloop_stack_size; + mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask; + mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask; + ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block; + ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var; + ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size + + ctx->switch_stack_size]; + + lp_exec_mask_update(mask); +} + +void lp_exec_mask_cond_push(struct lp_exec_mask *mask, + LLVMValueRef val) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + + if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) { + ctx->cond_stack_size++; + return; + } + if (ctx->cond_stack_size == 0 && mask->function_stack_size == 1) { + assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type)); + } + ctx->cond_stack[ctx->cond_stack_size++] = mask->cond_mask; + assert(LLVMTypeOf(val) == mask->int_vec_type); + mask->cond_mask = LLVMBuildAnd(builder, + mask->cond_mask, + val, + ""); + lp_exec_mask_update(mask); +} + +void lp_exec_mask_cond_invert(struct lp_exec_mask *mask) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + LLVMValueRef prev_mask; + LLVMValueRef inv_mask; + + assert(ctx->cond_stack_size); + if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) + return; + prev_mask = ctx->cond_stack[ctx->cond_stack_size - 1]; + if (ctx->cond_stack_size == 1 && mask->function_stack_size == 1) { + assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type)); + } + + inv_mask = LLVMBuildNot(builder, mask->cond_mask, ""); + + mask->cond_mask = LLVMBuildAnd(builder, + inv_mask, + prev_mask, ""); + lp_exec_mask_update(mask); +} + +void lp_exec_mask_cond_pop(struct lp_exec_mask *mask) +{ + struct function_ctx *ctx = func_ctx(mask); + assert(ctx->cond_stack_size); + --ctx->cond_stack_size; + if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) + return; + mask->cond_mask = ctx->cond_stack[ctx->cond_stack_size]; + lp_exec_mask_update(mask); +} + + +void lp_exec_continue(struct lp_exec_mask *mask) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + LLVMValueRef exec_mask = LLVMBuildNot(builder, + mask->exec_mask, + ""); + + mask->cont_mask = LLVMBuildAnd(builder, + mask->cont_mask, + exec_mask, ""); + + lp_exec_mask_update(mask); +} + +void lp_exec_break(struct lp_exec_mask *mask, int *pc, + bool break_always) +{ + LLVMBuilderRef builder = mask->bld->gallivm->builder; + struct function_ctx *ctx = func_ctx(mask); + + if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) { + LLVMValueRef exec_mask = LLVMBuildNot(builder, + mask->exec_mask, + "break"); + + mask->break_mask = LLVMBuildAnd(builder, + mask->break_mask, + exec_mask, "break_full"); + } + else { + if (ctx->switch_in_default) { + /* + * stop default execution but only if this is an unconditional switch. + * (The condition here is not perfect since dead code after break is + * allowed but should be sufficient since false negatives are just + * unoptimized - so we don't have to pre-evaluate that). + */ + if(break_always && ctx->switch_pc) { + if (pc) + *pc = ctx->switch_pc; + return; + } + } + + if (break_always) { + mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type); + } + else { + LLVMValueRef exec_mask = LLVMBuildNot(builder, + mask->exec_mask, + "break"); + mask->switch_mask = LLVMBuildAnd(builder, + mask->switch_mask, + exec_mask, "break_switch"); + } + } + + lp_exec_mask_update(mask); +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_ir_common.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,120 @@ +/************************************************************************** + * + * Copyright 2011-2012 Advanced Micro Devices, Inc. + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_BLD_IR_COMMON_H +#define LP_BLD_IR_COMMON_H + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_limits.h" + +/* SM 4.0 says that subroutines can nest 32 deep and + * we need one more for our main function */ +#define LP_MAX_NUM_FUNCS 33 + +enum lp_exec_mask_break_type { + LP_EXEC_MASK_BREAK_TYPE_LOOP, + LP_EXEC_MASK_BREAK_TYPE_SWITCH +}; + +struct lp_exec_mask { + struct lp_build_context *bld; + + boolean has_mask; + boolean ret_in_main; + + LLVMTypeRef int_vec_type; + + LLVMValueRef exec_mask; + + LLVMValueRef ret_mask; + LLVMValueRef cond_mask; + LLVMValueRef switch_mask; /* current switch exec mask */ + LLVMValueRef cont_mask; + LLVMValueRef break_mask; + + struct function_ctx { + int pc; + LLVMValueRef ret_mask; + + LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING]; + int cond_stack_size; + + /* keep track if break belongs to switch or loop */ + enum lp_exec_mask_break_type break_type_stack[LP_MAX_TGSI_NESTING]; + enum lp_exec_mask_break_type break_type; + + struct { + LLVMValueRef switch_val; + LLVMValueRef switch_mask; + LLVMValueRef switch_mask_default; + boolean switch_in_default; + unsigned switch_pc; + } switch_stack[LP_MAX_TGSI_NESTING]; + int switch_stack_size; + LLVMValueRef switch_val; + LLVMValueRef switch_mask_default; /* reverse of switch mask used for default */ + boolean switch_in_default; /* if switch exec is currently in default */ + unsigned switch_pc; /* when used points to default or endswitch-1 */ + + LLVMValueRef loop_limiter; + LLVMBasicBlockRef loop_block; + LLVMValueRef break_var; + struct { + LLVMBasicBlockRef loop_block; + LLVMValueRef cont_mask; + LLVMValueRef break_mask; + LLVMValueRef break_var; + } loop_stack[LP_MAX_TGSI_NESTING]; + int loop_stack_size; + int bgnloop_stack_size; + + } *function_stack; + int function_stack_size; +}; + +void lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx); +void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld); +void lp_exec_mask_fini(struct lp_exec_mask *mask); +void lp_exec_mask_store(struct lp_exec_mask *mask, + struct lp_build_context *bld_store, + LLVMValueRef val, + LLVMValueRef dst_ptr); +void lp_exec_mask_update(struct lp_exec_mask *mask); +void lp_exec_bgnloop_post_phi(struct lp_exec_mask *mask); +void lp_exec_bgnloop(struct lp_exec_mask *mask, bool load_mask); +void lp_exec_endloop(struct gallivm_state *gallivm, + struct lp_exec_mask *mask); +void lp_exec_mask_cond_push(struct lp_exec_mask *mask, + LLVMValueRef val); +void lp_exec_mask_cond_invert(struct lp_exec_mask *mask); +void lp_exec_mask_cond_pop(struct lp_exec_mask *mask); +void lp_exec_continue(struct lp_exec_mask *mask); + +void lp_exec_break(struct lp_exec_mask *mask, int *pc, bool break_always); + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_limits.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_limits.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_limits.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_limits.h 2020-06-12 01:21:16.000000000 +0000 @@ -59,6 +59,8 @@ #define LP_MAX_TGSI_SHADER_BUFFER_SIZE (1 << 27) +#define LP_MAX_TGSI_SHADER_IMAGES 8 + /* * For quick access we cache registers in statically * allocated arrays. Here we define the maximum size @@ -132,7 +134,7 @@ case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: - return 1 << PIPE_SHADER_IR_TGSI; + return (1 << PIPE_SHADER_IR_TGSI) | (1 << PIPE_SHADER_IR_NIR); case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; @@ -140,18 +142,17 @@ case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return LP_MAX_TGSI_SHADER_BUFFERS; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return LP_MAX_TGSI_SHADER_IMAGES; } /* if we get here, we missed a shader cap above (and should have seen * a compiler warning.) diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_logic.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_logic.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_logic.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_logic.c 2020-06-12 01:21:16.000000000 +0000 @@ -32,6 +32,7 @@ * @author Jose Fonseca */ +#include #include "util/u_cpu_detect.h" #include "util/u_memory.h" @@ -256,6 +257,7 @@ LLVMBuilderRef builder = bld->gallivm->builder; struct lp_type type = bld->type; LLVMValueRef res; + LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); assert(lp_check_value(type, a)); assert(lp_check_value(type, b)); @@ -265,11 +267,12 @@ } if(type.floating) { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); a = LLVMBuildBitCast(builder, a, int_vec_type, ""); b = LLVMBuildBitCast(builder, b, int_vec_type, ""); } + if (type.width > 32) + mask = LLVMBuildSExt(builder, mask, int_vec_type, ""); a = LLVMBuildAnd(builder, a, mask, ""); /* This often gets translated to PANDN, but sometimes the NOT is @@ -317,16 +320,14 @@ mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), ""); res = LLVMBuildSelect(builder, mask, a, b, ""); } - else if (!(HAVE_LLVM == 0x0307) && - (LLVMIsConstant(mask) || - LLVMGetInstructionOpcode(mask) == LLVMSExt)) { + else if (LLVMIsConstant(mask) || + LLVMGetInstructionOpcode(mask) == LLVMSExt) { /* Generate a vector select. * * Using vector selects should avoid emitting intrinsics hence avoid * hindering optimization passes, but vector selects weren't properly * supported yet for a long time, and LLVM will generate poor code when * the mask is not the result of a comparison. - * Also, llvm 3.7 may miscompile them (bug 94972). * XXX: Even if the instruction was an SExt, this may still produce * terrible code. Try piglit stencil-twoside. */ @@ -360,6 +361,11 @@ LLVMTypeRef arg_type; LLVMValueRef args[3]; + LLVMTypeRef mask_type = LLVMGetElementType(LLVMTypeOf(mask)); + if (LLVMGetIntTypeWidth(mask_type) != type.width) { + LLVMTypeRef int_vec_type = LLVMVectorType(LLVMIntTypeInContext(lc, type.width), type.length); + mask = LLVMBuildSExt(builder, mask, int_vec_type, ""); + } /* * There's only float blend in AVX but can just cast i32/i64 * to float. diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp 2020-06-12 01:21:16.000000000 +0000 @@ -42,39 +42,30 @@ #include +#include + +#if LLVM_VERSION_MAJOR < 7 // Workaround http://llvm.org/PR23628 -#if HAVE_LLVM >= 0x0307 -# pragma push_macro("DEBUG") -# undef DEBUG +#pragma push_macro("DEBUG") +#undef DEBUG #endif +#include #include -#if HAVE_LLVM >= 0x0306 #include -#endif #include #include #include #include -#if HAVE_LLVM >= 0x0307 #include -#else -#include -#endif -#if HAVE_LLVM < 0x0306 -#include -#else #include -#endif #include #include #include #include -#if HAVE_LLVM >= 0x0305 #include -#endif #include #include #include @@ -84,9 +75,9 @@ #include #endif +#if LLVM_VERSION_MAJOR < 7 // Workaround http://llvm.org/PR23628 -#if HAVE_LLVM >= 0x0307 -# pragma pop_macro("DEBUG") +#pragma pop_macro("DEBUG") #endif #include "c11/threads.h" @@ -125,7 +116,7 @@ llvm::InitializeNativeTargetAsmPrinter(); llvm::InitializeNativeTargetDisassembler(); -#if DEBUG && HAVE_LLVM >= 0x0306 +#if DEBUG { char *env_llc_options = getenv("GALLIVM_LLC_OPTIONS"); if (env_llc_options) { @@ -150,15 +141,6 @@ extern "C" void lp_set_target_options(void) { -#if HAVE_LLVM < 0x0304 - /* - * By default LLVM adds a signal handler to output a pretty stack trace. - * This signal handler is never removed, causing problems when unloading the - * shared object where the gallium driver resides. - */ - llvm::DisablePrettyStackTrace = true; -#endif - /* The llvm target registry is not thread-safe, so drivers and state-trackers * that want to initialize targets should use the lp_set_target_options() * function to safely initialize targets. @@ -174,11 +156,7 @@ gallivm_create_target_library_info(const char *triple) { return reinterpret_cast( -#if HAVE_LLVM < 0x0307 - new llvm::TargetLibraryInfo( -#else new llvm::TargetLibraryInfoImpl( -#endif llvm::Triple(triple))); } @@ -187,43 +165,12 @@ gallivm_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) { delete reinterpret_cast< -#if HAVE_LLVM < 0x0307 - llvm::TargetLibraryInfo -#else llvm::TargetLibraryInfoImpl -#endif *>(library_info); } -#if HAVE_LLVM < 0x0304 - -extern "C" -void -LLVMSetAlignmentBackport(LLVMValueRef V, - unsigned Bytes) -{ - switch (LLVMGetInstructionOpcode(V)) { - case LLVMLoad: - llvm::unwrap(V)->setAlignment(Bytes); - break; - case LLVMStore: - llvm::unwrap(V)->setAlignment(Bytes); - break; - default: - assert(0); - break; - } -} - -#endif - - -#if HAVE_LLVM < 0x0306 -typedef llvm::JITMemoryManager BaseMemoryManager; -#else typedef llvm::RTDyldMemoryManager BaseMemoryManager; -#endif /* @@ -237,96 +184,9 @@ virtual BaseMemoryManager *mgr() const = 0; public: -#if HAVE_LLVM < 0x0306 - /* - * From JITMemoryManager - */ - virtual void setMemoryWritable() { - mgr()->setMemoryWritable(); - } - virtual void setMemoryExecutable() { - mgr()->setMemoryExecutable(); - } - virtual void setPoisonMemory(bool poison) { - mgr()->setPoisonMemory(poison); - } - virtual void AllocateGOT() { - mgr()->AllocateGOT(); - /* - * isManagingGOT() is not virtual in base class so we can't delegate. - * Instead we mirror the value of HasGOT in our instance. - */ - HasGOT = mgr()->isManagingGOT(); - } - virtual uint8_t *getGOTBase() const { - return mgr()->getGOTBase(); - } - virtual uint8_t *startFunctionBody(const llvm::Function *F, - uintptr_t &ActualSize) { - return mgr()->startFunctionBody(F, ActualSize); - } - virtual uint8_t *allocateStub(const llvm::GlobalValue *F, - unsigned StubSize, - unsigned Alignment) { - return mgr()->allocateStub(F, StubSize, Alignment); - } - virtual void endFunctionBody(const llvm::Function *F, - uint8_t *FunctionStart, - uint8_t *FunctionEnd) { - mgr()->endFunctionBody(F, FunctionStart, FunctionEnd); - } - virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) { - return mgr()->allocateSpace(Size, Alignment); - } - virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) { - return mgr()->allocateGlobal(Size, Alignment); - } - virtual void deallocateFunctionBody(void *Body) { - mgr()->deallocateFunctionBody(Body); - } -#if HAVE_LLVM < 0x0304 - virtual uint8_t *startExceptionTable(const llvm::Function *F, - uintptr_t &ActualSize) { - return mgr()->startExceptionTable(F, ActualSize); - } - virtual void endExceptionTable(const llvm::Function *F, - uint8_t *TableStart, - uint8_t *TableEnd, - uint8_t *FrameRegister) { - mgr()->endExceptionTable(F, TableStart, TableEnd, - FrameRegister); - } - virtual void deallocateExceptionTable(void *ET) { - mgr()->deallocateExceptionTable(ET); - } -#endif - virtual bool CheckInvariants(std::string &s) { - return mgr()->CheckInvariants(s); - } - virtual size_t GetDefaultCodeSlabSize() { - return mgr()->GetDefaultCodeSlabSize(); - } - virtual size_t GetDefaultDataSlabSize() { - return mgr()->GetDefaultDataSlabSize(); - } - virtual size_t GetDefaultStubSlabSize() { - return mgr()->GetDefaultStubSlabSize(); - } - virtual unsigned GetNumCodeSlabs() { - return mgr()->GetNumCodeSlabs(); - } - virtual unsigned GetNumDataSlabs() { - return mgr()->GetNumDataSlabs(); - } - virtual unsigned GetNumStubSlabs() { - return mgr()->GetNumStubSlabs(); - } -#endif - /* * From RTDyldMemoryManager */ -#if HAVE_LLVM >= 0x0304 virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, @@ -334,40 +194,23 @@ return mgr()->allocateCodeSection(Size, Alignment, SectionID, SectionName); } -#else - virtual uint8_t *allocateCodeSection(uintptr_t Size, - unsigned Alignment, - unsigned SectionID) { - return mgr()->allocateCodeSection(Size, Alignment, SectionID); - } -#endif virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, -#if HAVE_LLVM >= 0x0304 llvm::StringRef SectionName, -#endif bool IsReadOnly) { return mgr()->allocateDataSection(Size, Alignment, SectionID, -#if HAVE_LLVM >= 0x0304 SectionName, -#endif IsReadOnly); } -#if HAVE_LLVM >= 0x0304 virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { mgr()->registerEHFrames(Addr, LoadAddr, Size); } -#else - virtual void registerEHFrames(llvm::StringRef SectionData) { - mgr()->registerEHFrames(SectionData); - } -#endif -#if HAVE_LLVM >= 0x0500 +#if LLVM_VERSION_MAJOR >= 5 virtual void deregisterEHFrames() { mgr()->deregisterEHFrames(); } -#elif HAVE_LLVM >= 0x0304 +#else virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { mgr()->deregisterEHFrames(Addr, LoadAddr, Size); } @@ -376,15 +219,9 @@ bool AbortOnFailure=true) { return mgr()->getPointerToNamedFunction(Name, AbortOnFailure); } -#if HAVE_LLVM <= 0x0303 - virtual bool applyPermissions(std::string *ErrMsg = 0) { - return mgr()->applyPermissions(ErrMsg); - } -#else virtual bool finalizeMemory(std::string *ErrMsg = 0) { return mgr()->finalizeMemory(ErrMsg); } -#endif }; @@ -413,21 +250,6 @@ } ~GeneratedCode() { - /* - * Deallocate things as previously requested and - * free shared manager when no longer used. - */ -#if HAVE_LLVM < 0x0306 - Vec::iterator i; - - assert(TheMM); - for ( i = FunctionBody.begin(); i != FunctionBody.end(); ++i ) - TheMM->deallocateFunctionBody(*i); -#if HAVE_LLVM < 0x0304 - for ( i = ExceptionTable.begin(); i != ExceptionTable.end(); ++i ) - TheMM->deallocateExceptionTable(*i); -#endif /* HAVE_LLVM < 0x0304 */ -#endif /* HAVE_LLVM < 0x0306 */ } }; @@ -459,13 +281,6 @@ delete (GeneratedCode *) code; } -#if HAVE_LLVM < 0x0304 - virtual void deallocateExceptionTable(void *ET) { - // remember for later deallocation - code->ExceptionTable.push_back(ET); - } -#endif - virtual void deallocateFunctionBody(void *Body) { // remember for later deallocation code->FunctionBody.push_back(Body); @@ -490,17 +305,12 @@ LLVMModuleRef M, LLVMMCJITMemoryManagerRef CMM, unsigned OptLevel, - int useMCJIT, char **OutError) { using namespace llvm; std::string Error; -#if HAVE_LLVM >= 0x0306 EngineBuilder builder(std::unique_ptr(unwrap(M))); -#else - EngineBuilder builder(unwrap(M)); -#endif /** * LLVM 3.1+ haven't more "extern unsigned llvm::StackAlignmentOverride" and @@ -509,24 +319,6 @@ TargetOptions options; #if defined(PIPE_ARCH_X86) options.StackAlignmentOverride = 4; -#if HAVE_LLVM < 0x0304 - options.RealignStack = true; -#endif -#endif - -#if defined(DEBUG) && HAVE_LLVM < 0x0307 - options.JITEmitDebugInfo = true; -#endif - - /* XXX: Workaround http://llvm.org/PR21435 */ -#if defined(DEBUG) || defined(PROFILE) || \ - (HAVE_LLVM >= 0x0303 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))) -#if HAVE_LLVM < 0x0304 - options.NoFramePointerElimNonLeaf = true; -#endif -#if HAVE_LLVM < 0x0307 - options.NoFramePointerElim = true; -#endif #endif builder.setEngineKind(EngineKind::JIT) @@ -534,29 +326,24 @@ .setTargetOptions(options) .setOptLevel((CodeGenOpt::Level)OptLevel); - if (useMCJIT) { -#if HAVE_LLVM < 0x0306 - builder.setUseMCJIT(true); -#endif #ifdef _WIN32 - /* - * MCJIT works on Windows, but currently only through ELF object format. - * - * XXX: We could use `LLVM_HOST_TRIPLE "-elf"` but LLVM_HOST_TRIPLE has - * different strings for MinGW/MSVC, so better play it safe and be - * explicit. - */ + /* + * MCJIT works on Windows, but currently only through ELF object format. + * + * XXX: We could use `LLVM_HOST_TRIPLE "-elf"` but LLVM_HOST_TRIPLE has + * different strings for MinGW/MSVC, so better play it safe and be + * explicit. + */ # ifdef _WIN64 - LLVMSetTarget(M, "x86_64-pc-win32-elf"); + LLVMSetTarget(M, "x86_64-pc-win32-elf"); # else - LLVMSetTarget(M, "i686-pc-win32-elf"); + LLVMSetTarget(M, "i686-pc-win32-elf"); # endif #endif - } llvm::SmallVector MAttrs; -#if HAVE_LLVM >= 0x0400 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM)) +#if LLVM_VERSION_MAJOR >= 4 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM)) /* llvm-3.3+ implements sys::getHostCPUFeatures for Arm * and llvm-3.7+ for x86, which allows us to enable/disable * code generation based on the results of cpuid on these @@ -584,16 +371,8 @@ MAttrs.push_back(util_cpu_caps.has_sse2 ? "+sse2" : "-sse2" ); MAttrs.push_back(util_cpu_caps.has_sse3 ? "+sse3" : "-sse3" ); MAttrs.push_back(util_cpu_caps.has_ssse3 ? "+ssse3" : "-ssse3" ); -#if HAVE_LLVM >= 0x0304 MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1"); -#else - MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse41" : "-sse41" ); -#endif -#if HAVE_LLVM >= 0x0304 MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2"); -#else - MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse42" : "-sse42" ); -#endif /* * AVX feature is not automatically detected from CPUID by the X86 target * yet, because the old (yet default) JIT engine is not capable of @@ -602,29 +381,17 @@ */ MAttrs.push_back(util_cpu_caps.has_avx ? "+avx" : "-avx"); MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c"); - if (HAVE_LLVM >= 0x0304) { - MAttrs.push_back(util_cpu_caps.has_fma ? "+fma" : "-fma"); - } else { - /* - * The old JIT in LLVM 3.3 has a bug encoding llvm.fmuladd.f32 and - * llvm.fmuladd.v2f32 intrinsics when FMA is available. - */ - MAttrs.push_back("-fma"); - } + MAttrs.push_back(util_cpu_caps.has_fma ? "+fma" : "-fma"); MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2"); /* disable avx512 and all subvariants */ -#if HAVE_LLVM >= 0x0304 MAttrs.push_back("-avx512cd"); MAttrs.push_back("-avx512er"); MAttrs.push_back("-avx512f"); MAttrs.push_back("-avx512pf"); -#endif -#if HAVE_LLVM >= 0x0305 MAttrs.push_back("-avx512bw"); MAttrs.push_back("-avx512dq"); MAttrs.push_back("-avx512vl"); #endif -#endif #if defined(PIPE_ARCH_ARM) if (!util_cpu_caps.has_neon) { MAttrs.push_back("-neon"); @@ -635,8 +402,7 @@ #if defined(PIPE_ARCH_PPC) MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); -#if (HAVE_LLVM >= 0x0304) -#if (HAVE_LLVM < 0x0400) +#if (LLVM_VERSION_MAJOR < 4) /* * Make sure VSX instructions are disabled * See LLVM bugs: @@ -664,7 +430,6 @@ } #endif #endif -#endif builder.setMAttrs(MAttrs); @@ -678,7 +443,6 @@ } } -#if HAVE_LLVM >= 0x0305 StringRef MCPU = llvm::sys::getHostCPUName(); /* * The cpu bits are no longer set automatically, so need to set mcpu manually. @@ -705,7 +469,7 @@ */ builder.setCodeModel(CodeModel::Large); -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN /* * Versions of LLVM prior to 4.0 lacked a table entry for "POWER8NVL", * resulting in (big-endian) "generic" being returned on @@ -723,33 +487,14 @@ if (gallivm_debug & (GALLIVM_DEBUG_IR | GALLIVM_DEBUG_ASM | GALLIVM_DEBUG_DUMP_BC)) { debug_printf("llc -mcpu option: %s\n", MCPU.str().c_str()); } -#endif ShaderMemoryManager *MM = NULL; - if (useMCJIT) { - BaseMemoryManager* JMM = reinterpret_cast(CMM); - MM = new ShaderMemoryManager(JMM); - *OutCode = MM->getGeneratedCode(); - -#if HAVE_LLVM >= 0x0306 - builder.setMCJITMemoryManager(std::unique_ptr(MM)); - MM = NULL; // ownership taken by std::unique_ptr -#elif HAVE_LLVM > 0x0303 - builder.setMCJITMemoryManager(MM); -#else - builder.setJITMemoryManager(MM); -#endif - } else { -#if HAVE_LLVM < 0x0306 - BaseMemoryManager* JMM = reinterpret_cast(CMM); - MM = new ShaderMemoryManager(JMM); - *OutCode = MM->getGeneratedCode(); + BaseMemoryManager* JMM = reinterpret_cast(CMM); + MM = new ShaderMemoryManager(JMM); + *OutCode = MM->getGeneratedCode(); - builder.setJITMemoryManager(MM); -#else - assert(0); -#endif - } + builder.setMCJITMemoryManager(std::unique_ptr(MM)); + MM = NULL; // ownership taken by std::unique_ptr ExecutionEngine *JIT; @@ -782,11 +527,7 @@ lp_get_default_memory_manager() { BaseMemoryManager *mm; -#if HAVE_LLVM < 0x0306 - mm = llvm::JITMemoryManager::CreateDefaultMemManager(); -#else mm = new llvm::SectionMemoryManager(); -#endif return reinterpret_cast(mm); } @@ -800,53 +541,11 @@ extern "C" LLVMValueRef lp_get_called_value(LLVMValueRef call) { -#if HAVE_LLVM >= 0x0309 return LLVMGetCalledValue(call); -#elif HAVE_LLVM >= 0x0305 - return llvm::wrap(llvm::CallSite(llvm::unwrap(call)).getCalledValue()); -#else - return NULL; /* radeonsi doesn't support so old LLVM. */ -#endif } extern "C" bool lp_is_function(LLVMValueRef v) { -#if HAVE_LLVM >= 0x0309 return LLVMGetValueKind(v) == LLVMFunctionValueKind; -#else - return llvm::isa(llvm::unwrap(v)); -#endif -} - -#if HAVE_LLVM < 0x309 -static llvm::AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) { - switch (Ordering) { - case LLVMAtomicOrderingNotAtomic: return llvm::AtomicOrdering::NotAtomic; - case LLVMAtomicOrderingUnordered: return llvm::AtomicOrdering::Unordered; - case LLVMAtomicOrderingMonotonic: return llvm::AtomicOrdering::Monotonic; - case LLVMAtomicOrderingAcquire: return llvm::AtomicOrdering::Acquire; - case LLVMAtomicOrderingRelease: return llvm::AtomicOrdering::Release; - case LLVMAtomicOrderingAcquireRelease: - return llvm::AtomicOrdering::AcquireRelease; - case LLVMAtomicOrderingSequentiallyConsistent: - return llvm::AtomicOrdering::SequentiallyConsistent; - } - - llvm_unreachable("Invalid LLVMAtomicOrdering value!"); } - -LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, - LLVMValueRef Cmp, LLVMValueRef New, - LLVMAtomicOrdering SuccessOrdering, - LLVMAtomicOrdering FailureOrdering, - LLVMBool SingleThread) -{ - return llvm::wrap(llvm::unwrap(B)->CreateAtomicCmpXchg(llvm::unwrap(Ptr), llvm::unwrap(Cmp), - llvm::unwrap(New), mapFromLLVMOrdering(SuccessOrdering), -#if HAVE_LLVM >= 0x305 - mapFromLLVMOrdering(FailureOrdering), -#endif - SingleThread ? llvm::SynchronizationScope::SingleThread : llvm::SynchronizationScope::CrossThread)); -} -#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_misc.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_misc.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_misc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_misc.h 2020-06-12 01:21:16.000000000 +0000 @@ -31,6 +31,7 @@ #include "lp_bld.h" +#include #include #include @@ -58,7 +59,6 @@ LLVMModuleRef M, LLVMMCJITMemoryManagerRef MM, unsigned OptLevel, - int useMCJIT, char **OutError); extern void @@ -76,15 +76,6 @@ extern bool lp_is_function(LLVMValueRef v); -/* LLVM 3.9 introduces this, provide our own for earlier */ -#if HAVE_LLVM < 0x309 -LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, - LLVMValueRef Cmp, LLVMValueRef New, - LLVMAtomicOrdering SuccessOrdering, - LLVMAtomicOrdering FailureOrdering, - LLVMBool SingleThread); -#endif - #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1865 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#include "lp_bld_nir.h" +#include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" +#include "lp_bld_const.h" +#include "lp_bld_gather.h" +#include "lp_bld_logic.h" +#include "lp_bld_quad.h" +#include "lp_bld_flow.h" +#include "lp_bld_struct.h" +#include "lp_bld_debug.h" +#include "lp_bld_printf.h" +#include "nir_deref.h" + +static void visit_cf_list(struct lp_build_nir_context *bld_base, + struct exec_list *list); + +static LLVMValueRef cast_type(struct lp_build_nir_context *bld_base, LLVMValueRef val, + nir_alu_type alu_type, unsigned bit_size) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + switch (alu_type) { + case nir_type_float: + switch (bit_size) { + case 32: + return LLVMBuildBitCast(builder, val, bld_base->base.vec_type, ""); + case 64: + return LLVMBuildBitCast(builder, val, bld_base->dbl_bld.vec_type, ""); + default: + assert(0); + break; + } + break; + case nir_type_int: + switch (bit_size) { + case 8: + return LLVMBuildBitCast(builder, val, bld_base->int8_bld.vec_type, ""); + case 16: + return LLVMBuildBitCast(builder, val, bld_base->int16_bld.vec_type, ""); + case 32: + return LLVMBuildBitCast(builder, val, bld_base->int_bld.vec_type, ""); + case 64: + return LLVMBuildBitCast(builder, val, bld_base->int64_bld.vec_type, ""); + default: + assert(0); + break; + } + break; + case nir_type_uint: + switch (bit_size) { + case 8: + return LLVMBuildBitCast(builder, val, bld_base->uint8_bld.vec_type, ""); + case 16: + return LLVMBuildBitCast(builder, val, bld_base->uint16_bld.vec_type, ""); + case 32: + return LLVMBuildBitCast(builder, val, bld_base->uint_bld.vec_type, ""); + case 64: + return LLVMBuildBitCast(builder, val, bld_base->uint64_bld.vec_type, ""); + default: + assert(0); + break; + } + break; + case nir_type_uint32: + return LLVMBuildBitCast(builder, val, bld_base->uint_bld.vec_type, ""); + default: + return val; + } + return NULL; +} + + +static struct lp_build_context *get_flt_bld(struct lp_build_nir_context *bld_base, + unsigned op_bit_size) +{ + if (op_bit_size == 64) + return &bld_base->dbl_bld; + else + return &bld_base->base; +} + +static unsigned glsl_sampler_to_pipe(int sampler_dim, bool is_array) +{ + unsigned pipe_target = PIPE_BUFFER; + switch (sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + pipe_target = is_array ? PIPE_TEXTURE_1D_ARRAY : PIPE_TEXTURE_1D; + break; + case GLSL_SAMPLER_DIM_2D: + pipe_target = is_array ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D; + break; + case GLSL_SAMPLER_DIM_3D: + pipe_target = PIPE_TEXTURE_3D; + break; + case GLSL_SAMPLER_DIM_CUBE: + pipe_target = is_array ? PIPE_TEXTURE_CUBE_ARRAY : PIPE_TEXTURE_CUBE; + break; + case GLSL_SAMPLER_DIM_RECT: + pipe_target = PIPE_TEXTURE_RECT; + break; + case GLSL_SAMPLER_DIM_BUF: + pipe_target = PIPE_BUFFER; + break; + default: + break; + } + return pipe_target; +} + +static LLVMValueRef get_ssa_src(struct lp_build_nir_context *bld_base, nir_ssa_def *ssa) +{ + return bld_base->ssa_defs[ssa->index]; +} + +static LLVMValueRef get_src(struct lp_build_nir_context *bld_base, nir_src src); + +static LLVMValueRef get_reg_src(struct lp_build_nir_context *bld_base, nir_reg_src src) +{ + struct hash_entry *entry = _mesa_hash_table_search(bld_base->regs, src.reg); + LLVMValueRef reg_storage = (LLVMValueRef)entry->data; + struct lp_build_context *reg_bld = get_int_bld(bld_base, true, src.reg->bit_size); + LLVMValueRef indir_src = NULL; + if (src.indirect) + indir_src = get_src(bld_base, *src.indirect); + return bld_base->load_reg(bld_base, reg_bld, &src, indir_src, reg_storage); +} + +static LLVMValueRef get_src(struct lp_build_nir_context *bld_base, nir_src src) +{ + if (src.is_ssa) + return get_ssa_src(bld_base, src.ssa); + else + return get_reg_src(bld_base, src.reg); +} + +static void assign_ssa(struct lp_build_nir_context *bld_base, int idx, LLVMValueRef ptr) +{ + bld_base->ssa_defs[idx] = ptr; +} + +static void assign_ssa_dest(struct lp_build_nir_context *bld_base, const nir_ssa_def *ssa, + LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS]) +{ + assign_ssa(bld_base, ssa->index, ssa->num_components == 1 ? vals[0] : lp_nir_array_build_gather_values(bld_base->base.gallivm->builder, vals, ssa->num_components)); +} + +static void assign_reg(struct lp_build_nir_context *bld_base, const nir_reg_dest *reg, + unsigned write_mask, + LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS]) +{ + struct hash_entry *entry = _mesa_hash_table_search(bld_base->regs, reg->reg); + LLVMValueRef reg_storage = (LLVMValueRef)entry->data; + struct lp_build_context *reg_bld = get_int_bld(bld_base, true, reg->reg->bit_size); + LLVMValueRef indir_src = NULL; + if (reg->indirect) + indir_src = get_src(bld_base, *reg->indirect); + bld_base->store_reg(bld_base, reg_bld, reg, write_mask ? write_mask : 0xf, indir_src, reg_storage, vals); +} + +static void assign_dest(struct lp_build_nir_context *bld_base, const nir_dest *dest, LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS]) +{ + if (dest->is_ssa) + assign_ssa_dest(bld_base, &dest->ssa, vals); + else + assign_reg(bld_base, &dest->reg, 0, vals); +} + +static void assign_alu_dest(struct lp_build_nir_context *bld_base, const nir_alu_dest *dest, LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS]) +{ + if (dest->dest.is_ssa) + assign_ssa_dest(bld_base, &dest->dest.ssa, vals); + else + assign_reg(bld_base, &dest->dest.reg, dest->write_mask, vals); +} + +static LLVMValueRef int_to_bool32(struct lp_build_nir_context *bld_base, + uint32_t src_bit_size, + bool is_unsigned, + LLVMValueRef val) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct lp_build_context *int_bld = get_int_bld(bld_base, is_unsigned, src_bit_size); + LLVMValueRef result = lp_build_compare(bld_base->base.gallivm, int_bld->type, PIPE_FUNC_NOTEQUAL, val, int_bld->zero); + if (src_bit_size == 64) + result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + return result; +} + +static LLVMValueRef flt_to_bool32(struct lp_build_nir_context *bld_base, + uint32_t src_bit_size, + LLVMValueRef val) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct lp_build_context *flt_bld = get_flt_bld(bld_base, src_bit_size); + LLVMValueRef result = lp_build_cmp(flt_bld, PIPE_FUNC_NOTEQUAL, val, flt_bld->zero); + if (src_bit_size == 64) + result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + return result; +} + +static LLVMValueRef fcmp32(struct lp_build_nir_context *bld_base, + enum pipe_compare_func compare, + uint32_t src_bit_size, + LLVMValueRef src[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct lp_build_context *flt_bld = get_flt_bld(bld_base, src_bit_size); + LLVMValueRef result; + + if (compare != PIPE_FUNC_NOTEQUAL) + result = lp_build_cmp_ordered(flt_bld, compare, src[0], src[1]); + else + result = lp_build_cmp(flt_bld, compare, src[0], src[1]); + if (src_bit_size == 64) + result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + return result; +} + +static LLVMValueRef icmp32(struct lp_build_nir_context *bld_base, + enum pipe_compare_func compare, + bool is_unsigned, + uint32_t src_bit_size, + LLVMValueRef src[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct lp_build_context *i_bld = get_int_bld(bld_base, is_unsigned, src_bit_size); + LLVMValueRef result = lp_build_cmp(i_bld, compare, src[0], src[1]); + if (src_bit_size < 32) + result = LLVMBuildSExt(builder, result, bld_base->int_bld.vec_type, ""); + else if (src_bit_size == 64) + result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + return result; +} + +static LLVMValueRef get_alu_src(struct lp_build_nir_context *bld_base, + nir_alu_src src, + unsigned num_components) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef value = get_src(bld_base, src.src); + bool need_swizzle = false; + + assert(value); + unsigned src_components = nir_src_num_components(src.src); + for (unsigned i = 0; i < num_components; ++i) { + assert(src.swizzle[i] < src_components); + if (src.swizzle[i] != i) + need_swizzle = true; + } + + if (need_swizzle || num_components != src_components) { + if (src_components > 1 && num_components == 1) { + value = LLVMBuildExtractValue(gallivm->builder, value, + src.swizzle[0], ""); + } else if (src_components == 1 && num_components > 1) { + LLVMValueRef values[] = {value, value, value, value, value, value, value, value, value, value, value, value, value, value, value, value}; + value = lp_nir_array_build_gather_values(builder, values, num_components); + } else { + LLVMValueRef arr = LLVMGetUndef(LLVMArrayType(LLVMTypeOf(LLVMBuildExtractValue(builder, value, 0, "")), num_components)); + for (unsigned i = 0; i < num_components; i++) + arr = LLVMBuildInsertValue(builder, arr, LLVMBuildExtractValue(builder, value, src.swizzle[i], ""), i, ""); + value = arr; + } + } + assert(!src.negate); + assert(!src.abs); + return value; +} + +static LLVMValueRef emit_b2f(struct lp_build_nir_context *bld_base, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef result = LLVMBuildAnd(builder, cast_type(bld_base, src0, nir_type_int, 32), + LLVMBuildBitCast(builder, lp_build_const_vec(bld_base->base.gallivm, bld_base->base.type, + 1.0), bld_base->int_bld.vec_type, ""), + ""); + result = LLVMBuildBitCast(builder, result, bld_base->base.vec_type, ""); + switch (bitsize) { + case 32: + break; + case 64: + result = LLVMBuildFPExt(builder, result, bld_base->dbl_bld.vec_type, ""); + break; + default: + unreachable("unsupported bit size."); + } + return result; +} + +static LLVMValueRef emit_b2i(struct lp_build_nir_context *bld_base, + LLVMValueRef src0, + unsigned bitsize) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef result = LLVMBuildAnd(builder, cast_type(bld_base, src0, nir_type_int, 32), + lp_build_const_int_vec(bld_base->base.gallivm, bld_base->base.type, 1), ""); + switch (bitsize) { + case 32: + return result; + case 64: + return LLVMBuildZExt(builder, result, bld_base->int64_bld.vec_type, ""); + default: + unreachable("unsupported bit size."); + } +} + +static LLVMValueRef emit_b32csel(struct lp_build_nir_context *bld_base, + unsigned src_bit_size[NIR_MAX_VEC_COMPONENTS], + LLVMValueRef src[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef sel = cast_type(bld_base, src[0], nir_type_int, 32); + LLVMValueRef v = lp_build_compare(bld_base->base.gallivm, bld_base->int_bld.type, PIPE_FUNC_NOTEQUAL, sel, bld_base->int_bld.zero); + struct lp_build_context *bld = get_int_bld(bld_base, false, src_bit_size[1]); + return lp_build_select(bld, v, src[1], src[2]); +} + +static LLVMValueRef split_64bit(struct lp_build_nir_context *bld_base, + LLVMValueRef src, + bool hi) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32]; + int len = bld_base->base.type.length * 2; + for (unsigned i = 0; i < bld_base->base.type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i * 2); + shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); + } + + src = LLVMBuildBitCast(gallivm->builder, src, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), len), ""); + return LLVMBuildShuffleVector(gallivm->builder, src, + LLVMGetUndef(LLVMTypeOf(src)), + LLVMConstVector(hi ? shuffles2 : shuffles, + bld_base->base.type.length), + ""); +} + +static LLVMValueRef +merge_64bit(struct lp_build_nir_context *bld_base, + LLVMValueRef input, + LLVMValueRef input2) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + int i; + LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)]; + int len = bld_base->base.type.length * 2; + assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32))); + + for (i = 0; i < bld_base->base.type.length * 2; i+=2) { + shuffles[i] = lp_build_const_int32(gallivm, i / 2); + shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length); + } + return LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), ""); +} + +static LLVMValueRef +do_int_divide(struct lp_build_nir_context *bld_base, + bool is_unsigned, unsigned src_bit_size, + LLVMValueRef src, LLVMValueRef src2) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *int_bld = get_int_bld(bld_base, is_unsigned, src_bit_size); + struct lp_build_context *mask_bld = get_int_bld(bld_base, true, src_bit_size); + LLVMValueRef div_mask = lp_build_cmp(mask_bld, PIPE_FUNC_EQUAL, src2, + mask_bld->zero); + + if (!is_unsigned) { + /* INT_MIN (0x80000000) / -1 (0xffffffff) causes sigfpe, seen with blender. */ + div_mask = LLVMBuildAnd(builder, div_mask, lp_build_const_int_vec(gallivm, int_bld->type, 0x7fffffff), ""); + } + LLVMValueRef divisor = LLVMBuildOr(builder, + div_mask, + src2, ""); + LLVMValueRef result = lp_build_div(int_bld, src, divisor); + + if (!is_unsigned) { + LLVMValueRef not_div_mask = LLVMBuildNot(builder, div_mask, ""); + return LLVMBuildAnd(builder, not_div_mask, result, ""); + } else + /* udiv by zero is guaranteed to return 0xffffffff at least with d3d10 + * may as well do same for idiv */ + return LLVMBuildOr(builder, div_mask, result, ""); +} + +static LLVMValueRef +do_int_mod(struct lp_build_nir_context *bld_base, + bool is_unsigned, unsigned src_bit_size, + LLVMValueRef src, LLVMValueRef src2) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *int_bld = get_int_bld(bld_base, is_unsigned, src_bit_size); + LLVMValueRef div_mask = lp_build_cmp(int_bld, PIPE_FUNC_EQUAL, src2, + int_bld->zero); + LLVMValueRef divisor = LLVMBuildOr(builder, + div_mask, + src2, ""); + LLVMValueRef result = lp_build_mod(int_bld, src, divisor); + return LLVMBuildOr(builder, div_mask, result, ""); +} + +static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, + nir_op op, unsigned src_bit_size[NIR_MAX_VEC_COMPONENTS], LLVMValueRef src[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result; + switch (op) { + case nir_op_b2f32: + result = emit_b2f(bld_base, src[0], 32); + break; + case nir_op_b2f64: + result = emit_b2f(bld_base, src[0], 64); + break; + case nir_op_b2i32: + result = emit_b2i(bld_base, src[0], 32); + break; + case nir_op_b2i64: + result = emit_b2i(bld_base, src[0], 64); + break; + case nir_op_b32csel: + result = emit_b32csel(bld_base, src_bit_size, src); + break; + case nir_op_bit_count: + result = lp_build_popcount(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_bitfield_select: + result = lp_build_xor(&bld_base->uint_bld, src[2], lp_build_and(&bld_base->uint_bld, src[0], lp_build_xor(&bld_base->uint_bld, src[1], src[2]))); + break; + case nir_op_bitfield_reverse: + result = lp_build_bitfield_reverse(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_f2b32: + result = flt_to_bool32(bld_base, src_bit_size[0], src[0]); + break; + case nir_op_f2f32: + result = LLVMBuildFPTrunc(builder, src[0], + bld_base->base.vec_type, ""); + break; + case nir_op_f2f64: + result = LLVMBuildFPExt(builder, src[0], + bld_base->dbl_bld.vec_type, ""); + break; + case nir_op_f2i32: + result = LLVMBuildFPToSI(builder, src[0], bld_base->base.int_vec_type, ""); + break; + case nir_op_f2u32: + result = LLVMBuildFPToUI(builder, + src[0], + bld_base->base.int_vec_type, ""); + break; + case nir_op_f2i64: + result = LLVMBuildFPToSI(builder, + src[0], + bld_base->int64_bld.vec_type, ""); + break; + case nir_op_f2u64: + result = LLVMBuildFPToUI(builder, + src[0], + bld_base->uint64_bld.vec_type, ""); + break; + case nir_op_fabs: + result = lp_build_abs(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fadd: + result = lp_build_add(get_flt_bld(bld_base, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_fceil: + result = lp_build_ceil(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fcos: + result = lp_build_cos(&bld_base->base, src[0]); + break; + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + result = lp_build_ddx(&bld_base->base, src[0]); + break; + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + result = lp_build_ddy(&bld_base->base, src[0]); + break; + case nir_op_fdiv: + result = lp_build_div(get_flt_bld(bld_base, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_feq32: + result = fcmp32(bld_base, PIPE_FUNC_EQUAL, src_bit_size[0], src); + break; + case nir_op_fexp2: + result = lp_build_exp2(&bld_base->base, src[0]); + break; + case nir_op_ffloor: + result = lp_build_floor(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_ffma: + result = lp_build_fmuladd(builder, src[0], src[1], src[2]); + break; + case nir_op_ffract: { + struct lp_build_context *flt_bld = get_flt_bld(bld_base, src_bit_size[0]); + LLVMValueRef tmp = lp_build_floor(flt_bld, src[0]); + result = lp_build_sub(flt_bld, src[0], tmp); + break; + } + case nir_op_fge32: + result = fcmp32(bld_base, PIPE_FUNC_GEQUAL, src_bit_size[0], src); + break; + case nir_op_find_lsb: + result = lp_build_cttz(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_flog2: + result = lp_build_log2_safe(&bld_base->base, src[0]); + break; + case nir_op_flt32: + result = fcmp32(bld_base, PIPE_FUNC_LESS, src_bit_size[0], src); + break; + case nir_op_fmin: + result = lp_build_min(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_fmod: { + struct lp_build_context *flt_bld = get_flt_bld(bld_base, src_bit_size[0]); + result = lp_build_div(flt_bld, src[0], src[1]); + result = lp_build_floor(flt_bld, result); + result = lp_build_mul(flt_bld, src[1], result); + result = lp_build_sub(flt_bld, src[0], result); + break; + } + case nir_op_fmul: + result = lp_build_mul(get_flt_bld(bld_base, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_fmax: + result = lp_build_max(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_fne32: + result = fcmp32(bld_base, PIPE_FUNC_NOTEQUAL, src_bit_size[0], src); + break; + case nir_op_fneg: + result = lp_build_negate(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fpow: + result = lp_build_pow(&bld_base->base, src[0], src[1]); + break; + case nir_op_frcp: + result = lp_build_rcp(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fround_even: + result = lp_build_round(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_frsq: + result = lp_build_rsqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fsat: + result = lp_build_clamp_zero_one_nanzero(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fsign: + result = lp_build_sgn(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_fsin: + result = lp_build_sin(&bld_base->base, src[0]); + break; + case nir_op_fsqrt: + result = lp_build_sqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_ftrunc: + result = lp_build_trunc(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + break; + case nir_op_i2b32: + result = int_to_bool32(bld_base, src_bit_size[0], false, src[0]); + break; + case nir_op_i2f32: + result = lp_build_int_to_float(&bld_base->base, src[0]); + break; + case nir_op_i2f64: + result = lp_build_int_to_float(&bld_base->dbl_bld, src[0]); + break; + case nir_op_i2i8: + result = LLVMBuildTrunc(builder, src[0], bld_base->int8_bld.vec_type, ""); + break; + case nir_op_i2i16: + if (src_bit_size[0] < 16) + result = LLVMBuildSExt(builder, src[0], bld_base->int16_bld.vec_type, ""); + else + result = LLVMBuildTrunc(builder, src[0], bld_base->int16_bld.vec_type, ""); + break; + case nir_op_i2i32: + if (src_bit_size[0] < 32) + result = LLVMBuildSExt(builder, src[0], bld_base->int_bld.vec_type, ""); + else + result = LLVMBuildTrunc(builder, src[0], bld_base->int_bld.vec_type, ""); + break; + case nir_op_i2i64: + result = LLVMBuildSExt(builder, src[0], bld_base->int64_bld.vec_type, ""); + break; + case nir_op_iabs: + result = lp_build_abs(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_iadd: + result = lp_build_add(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_iand: + result = lp_build_and(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_idiv: + result = do_int_divide(bld_base, false, src_bit_size[0], src[0], src[1]); + break; + case nir_op_ieq32: + result = icmp32(bld_base, PIPE_FUNC_EQUAL, false, src_bit_size[0], src); + break; + case nir_op_ige32: + result = icmp32(bld_base, PIPE_FUNC_GEQUAL, false, src_bit_size[0], src); + break; + case nir_op_ilt32: + result = icmp32(bld_base, PIPE_FUNC_LESS, false, src_bit_size[0], src); + break; + case nir_op_imax: + result = lp_build_max(get_int_bld(bld_base, false, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_imin: + result = lp_build_min(get_int_bld(bld_base, false, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_imul: + case nir_op_imul24: + result = lp_build_mul(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_imul_high: { + LLVMValueRef hi_bits; + lp_build_mul_32_lohi(&bld_base->int_bld, src[0], src[1], &hi_bits); + result = hi_bits; + break; + } + case nir_op_ine32: + result = icmp32(bld_base, PIPE_FUNC_NOTEQUAL, false, src_bit_size[0], src); + break; + case nir_op_ineg: + result = lp_build_negate(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_inot: + result = lp_build_not(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_ior: + result = lp_build_or(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_irem: + result = do_int_mod(bld_base, false, src_bit_size[0], src[0], src[1]); + break; + case nir_op_ishl: { + struct lp_build_context *uint_bld = get_int_bld(bld_base, true, src_bit_size[0]); + struct lp_build_context *int_bld = get_int_bld(bld_base, false, src_bit_size[0]); + if (src_bit_size[0] == 64) + src[1] = LLVMBuildZExt(builder, src[1], uint_bld->vec_type, ""); + if (src_bit_size[0] < 32) + src[1] = LLVMBuildTrunc(builder, src[1], uint_bld->vec_type, ""); + src[1] = lp_build_and(uint_bld, src[1], lp_build_const_int_vec(gallivm, uint_bld->type, (src_bit_size[0] - 1))); + result = lp_build_shl(int_bld, src[0], src[1]); + break; + } + case nir_op_ishr: { + struct lp_build_context *uint_bld = get_int_bld(bld_base, true, src_bit_size[0]); + struct lp_build_context *int_bld = get_int_bld(bld_base, false, src_bit_size[0]); + if (src_bit_size[0] == 64) + src[1] = LLVMBuildZExt(builder, src[1], uint_bld->vec_type, ""); + if (src_bit_size[0] < 32) + src[1] = LLVMBuildTrunc(builder, src[1], uint_bld->vec_type, ""); + src[1] = lp_build_and(uint_bld, src[1], lp_build_const_int_vec(gallivm, uint_bld->type, (src_bit_size[0] - 1))); + result = lp_build_shr(int_bld, src[0], src[1]); + break; + } + case nir_op_isign: + result = lp_build_sgn(get_int_bld(bld_base, false, src_bit_size[0]), src[0]); + break; + case nir_op_isub: + result = lp_build_sub(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_ixor: + result = lp_build_xor(get_int_bld(bld_base, false, src_bit_size[0]), + src[0], src[1]); + break; + case nir_op_mov: + result = src[0]; + break; + case nir_op_unpack_64_2x32_split_x: + result = split_64bit(bld_base, src[0], false); + break; + case nir_op_unpack_64_2x32_split_y: + result = split_64bit(bld_base, src[0], true); + break; + + case nir_op_pack_64_2x32_split: { + LLVMValueRef tmp = merge_64bit(bld_base, src[0], src[1]); + result = LLVMBuildBitCast(builder, tmp, bld_base->dbl_bld.vec_type, ""); + break; + } + case nir_op_u2f32: + result = LLVMBuildUIToFP(builder, src[0], bld_base->base.vec_type, ""); + break; + case nir_op_u2f64: + result = LLVMBuildUIToFP(builder, src[0], bld_base->dbl_bld.vec_type, ""); + break; + case nir_op_u2u8: + result = LLVMBuildTrunc(builder, src[0], bld_base->uint8_bld.vec_type, ""); + break; + case nir_op_u2u16: + if (src_bit_size[0] < 16) + result = LLVMBuildZExt(builder, src[0], bld_base->uint16_bld.vec_type, ""); + else + result = LLVMBuildTrunc(builder, src[0], bld_base->uint16_bld.vec_type, ""); + break; + case nir_op_u2u32: + if (src_bit_size[0] < 32) + result = LLVMBuildZExt(builder, src[0], bld_base->uint_bld.vec_type, ""); + else + result = LLVMBuildTrunc(builder, src[0], bld_base->uint_bld.vec_type, ""); + break; + case nir_op_u2u64: + result = LLVMBuildZExt(builder, src[0], bld_base->uint64_bld.vec_type, ""); + break; + case nir_op_udiv: + result = do_int_divide(bld_base, true, src_bit_size[0], src[0], src[1]); + break; + case nir_op_ufind_msb: { + struct lp_build_context *uint_bld = get_int_bld(bld_base, true, src_bit_size[0]); + result = lp_build_ctlz(uint_bld, src[0]); + result = lp_build_sub(uint_bld, lp_build_const_int_vec(gallivm, uint_bld->type, src_bit_size[0] - 1), result); + break; + } + case nir_op_uge32: + result = icmp32(bld_base, PIPE_FUNC_GEQUAL, true, src_bit_size[0], src); + break; + case nir_op_ult32: + result = icmp32(bld_base, PIPE_FUNC_LESS, true, src_bit_size[0], src); + break; + case nir_op_umax: + result = lp_build_max(get_int_bld(bld_base, true, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_umin: + result = lp_build_min(get_int_bld(bld_base, true, src_bit_size[0]), src[0], src[1]); + break; + case nir_op_umod: + result = do_int_mod(bld_base, true, src_bit_size[0], src[0], src[1]); + break; + case nir_op_umul_high: { + LLVMValueRef hi_bits; + lp_build_mul_32_lohi(&bld_base->uint_bld, src[0], src[1], &hi_bits); + result = hi_bits; + break; + } + case nir_op_ushr: { + struct lp_build_context *uint_bld = get_int_bld(bld_base, true, src_bit_size[0]); + if (src_bit_size[0] == 64) + src[1] = LLVMBuildZExt(builder, src[1], uint_bld->vec_type, ""); + if (src_bit_size[0] < 32) + src[1] = LLVMBuildTrunc(builder, src[1], uint_bld->vec_type, ""); + src[1] = lp_build_and(uint_bld, src[1], lp_build_const_int_vec(gallivm, uint_bld->type, (src_bit_size[0] - 1))); + result = lp_build_shr(uint_bld, src[0], src[1]); + break; + } + default: + assert(0); + break; + } + return result; +} + +static void visit_alu(struct lp_build_nir_context *bld_base, const nir_alu_instr *instr) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMValueRef src[NIR_MAX_VEC_COMPONENTS]; + unsigned src_bit_size[NIR_MAX_VEC_COMPONENTS]; + unsigned num_components = nir_dest_num_components(instr->dest.dest); + unsigned src_components; + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: + src_components = 1; + break; + case nir_op_pack_half_2x16: + src_components = 2; + break; + case nir_op_unpack_half_2x16: + src_components = 1; + break; + case nir_op_cube_face_coord: + case nir_op_cube_face_index: + src_components = 3; + break; + default: + src_components = num_components; + break; + } + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + src[i] = get_alu_src(bld_base, instr->src[i], src_components); + src_bit_size[i] = nir_src_bit_size(instr->src[i].src); + } + + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]; + if (instr->op == nir_op_vec4 || instr->op == nir_op_vec3 || instr->op == nir_op_vec2 || instr->op == nir_op_vec8 || instr->op == nir_op_vec16) { + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + result[i] = cast_type(bld_base, src[i], nir_op_infos[instr->op].input_types[i], src_bit_size[i]); + } + } else { + for (unsigned c = 0; c < num_components; c++) { + LLVMValueRef src_chan[NIR_MAX_VEC_COMPONENTS]; + + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + if (num_components > 1) { + src_chan[i] = LLVMBuildExtractValue(gallivm->builder, + src[i], c, ""); + } else + src_chan[i] = src[i]; + src_chan[i] = cast_type(bld_base, src_chan[i], nir_op_infos[instr->op].input_types[i], src_bit_size[i]); + } + result[c] = do_alu_action(bld_base, instr->op, src_bit_size, src_chan); + result[c] = cast_type(bld_base, result[c], nir_op_infos[instr->op].output_type, nir_dest_bit_size(instr->dest.dest)); + } + } + assign_alu_dest(bld_base, &instr->dest, result); + } + +static void visit_load_const(struct lp_build_nir_context *bld_base, + const nir_load_const_instr *instr) +{ + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]; + struct lp_build_context *int_bld = get_int_bld(bld_base, true, instr->def.bit_size); + for (unsigned i = 0; i < instr->def.num_components; i++) + result[i] = lp_build_const_int_vec(bld_base->base.gallivm, int_bld->type, instr->value[i].u64); + assign_ssa_dest(bld_base, &instr->def, result); +} + +static void +get_deref_offset(struct lp_build_nir_context *bld_base, nir_deref_instr *instr, + bool vs_in, unsigned *vertex_index_out, + LLVMValueRef *vertex_index_ref, + unsigned *const_out, LLVMValueRef *indir_out) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + nir_variable *var = nir_deref_instr_get_variable(instr); + nir_deref_path path; + unsigned idx_lvl = 1; + + nir_deref_path_init(&path, instr, NULL); + + if (vertex_index_out != NULL || vertex_index_ref != NULL) { + if (vertex_index_ref) { + *vertex_index_ref = get_src(bld_base, path.path[idx_lvl]->arr.index); + if (vertex_index_out) + *vertex_index_out = 0; + } else { + *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index); + } + ++idx_lvl; + } + + uint32_t const_offset = 0; + LLVMValueRef offset = NULL; + + if (var->data.compact) { + assert(instr->deref_type == nir_deref_type_array); + const_offset = nir_src_as_uint(instr->arr.index); + goto out; + } + + for (; path.path[idx_lvl]; ++idx_lvl) { + const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type; + if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) { + unsigned index = path.path[idx_lvl]->strct.index; + + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + const_offset += glsl_count_attribute_slots(ft, vs_in); + } + } else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) { + unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in); + if (nir_src_is_const(path.path[idx_lvl]->arr.index)) { + const_offset += nir_src_comp_as_int(path.path[idx_lvl]->arr.index, 0) * size; + } else { + LLVMValueRef idx_src = get_src(bld_base, path.path[idx_lvl]->arr.index); + idx_src = cast_type(bld_base, idx_src, nir_type_uint, 32); + LLVMValueRef array_off = lp_build_mul(&bld_base->uint_bld, lp_build_const_int_vec(bld_base->base.gallivm, bld_base->base.type, size), + idx_src); + if (offset) + offset = lp_build_add(&bld_base->uint_bld, offset, array_off); + else + offset = array_off; + } + } else + unreachable("Uhandled deref type in get_deref_instr_offset"); + } + +out: + nir_deref_path_finish(&path); + + if (const_offset && offset) + offset = LLVMBuildAdd(builder, offset, + lp_build_const_int_vec(bld_base->base.gallivm, bld_base->uint_bld.type, const_offset), + ""); + *const_out = const_offset; + *indir_out = offset; +} + +static void visit_load_var(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + nir_variable_mode mode = deref->mode; + unsigned const_index; + LLVMValueRef indir_index; + unsigned vertex_index = 0; + unsigned nc = nir_dest_num_components(instr->dest); + unsigned bit_size = nir_dest_bit_size(instr->dest); + if (var) { + bool vs_in = bld_base->shader->info.stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in; + bool gs_in = bld_base->shader->info.stage == MESA_SHADER_GEOMETRY && + var->data.mode == nir_var_shader_in; + mode = var->data.mode; + + get_deref_offset(bld_base, deref, vs_in, gs_in ? &vertex_index : NULL, NULL, + &const_index, &indir_index); + } + bld_base->load_var(bld_base, mode, nc, bit_size, var, vertex_index, const_index, indir_index, result); +} + +static void +visit_store_var(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + nir_variable_mode mode = deref->mode; + int writemask = instr->const_index[0]; + unsigned bit_size = nir_src_bit_size(instr->src[1]); + LLVMValueRef src = get_src(bld_base, instr->src[1]); + unsigned const_index = 0; + LLVMValueRef indir_index; + if (var) + get_deref_offset(bld_base, deref, false, NULL, NULL, + &const_index, &indir_index); + bld_base->store_var(bld_base, mode, bit_size, instr->num_components, writemask, const_index, var, src); +} + +static void visit_load_ubo(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef offset = get_src(bld_base, instr->src[1]); + + bool offset_is_uniform = nir_src_is_dynamically_uniform(instr->src[1]); + idx = LLVMBuildExtractElement(builder, idx, lp_build_const_int32(gallivm, 0), ""); + bld_base->load_ubo(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), + offset_is_uniform, idx, offset, result); +} + + +static void visit_load_ssbo(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef offset = get_src(bld_base, instr->src[1]); + bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), + idx, offset, result); +} + +static void visit_store_ssbo(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + LLVMValueRef val = get_src(bld_base, instr->src[0]); + LLVMValueRef idx = get_src(bld_base, instr->src[1]); + LLVMValueRef offset = get_src(bld_base, instr->src[2]); + int writemask = instr->const_index[0]; + int nc = nir_src_num_components(instr->src[0]); + int bitsize = nir_src_bit_size(instr->src[0]); + bld_base->store_mem(bld_base, writemask, nc, bitsize, idx, offset, val); +} + +static void visit_get_buffer_size(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef idx = get_src(bld_base, instr->src[0]); + result[0] = bld_base->get_buffer_size(bld_base, idx); +} + +static void visit_ssbo_atomic(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef offset = get_src(bld_base, instr->src[1]); + LLVMValueRef val = get_src(bld_base, instr->src[2]); + LLVMValueRef val2 = NULL; + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) + val2 = get_src(bld_base, instr->src[3]); + + bld_base->atomic_mem(bld_base, instr->intrinsic, idx, offset, val, val2, &result[0]); + +} + +static void visit_load_image(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + LLVMValueRef coord_val = get_src(bld_base, instr->src[1]); + LLVMValueRef coords[5]; + struct lp_img_params params; + const struct glsl_type *type = glsl_without_array(var->type); + + memset(¶ms, 0, sizeof(params)); + params.target = glsl_sampler_to_pipe(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type)); + for (unsigned i = 0; i < 4; i++) + coords[i] = LLVMBuildExtractValue(builder, coord_val, i, ""); + if (params.target == PIPE_TEXTURE_1D_ARRAY) + coords[2] = coords[1]; + + params.coords = coords; + params.outdata = result; + params.img_op = LP_IMG_LOAD; + params.image_index = var->data.binding; + bld_base->image_op(bld_base, ¶ms); +} + +static void visit_store_image(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + LLVMValueRef coord_val = get_src(bld_base, instr->src[1]); + LLVMValueRef in_val = get_src(bld_base, instr->src[3]); + LLVMValueRef coords[5]; + struct lp_img_params params; + const struct glsl_type *type = glsl_without_array(var->type); + + memset(¶ms, 0, sizeof(params)); + params.target = glsl_sampler_to_pipe(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type)); + for (unsigned i = 0; i < 4; i++) + coords[i] = LLVMBuildExtractValue(builder, coord_val, i, ""); + if (params.target == PIPE_TEXTURE_1D_ARRAY) + coords[2] = coords[1]; + params.coords = coords; + + for (unsigned i = 0; i < 4; i++) { + params.indata[i] = LLVMBuildExtractValue(builder, in_val, i, ""); + params.indata[i] = LLVMBuildBitCast(builder, params.indata[i], bld_base->base.vec_type, ""); + } + params.img_op = LP_IMG_STORE; + params.image_index = var->data.binding; + + if (params.target == PIPE_TEXTURE_1D_ARRAY) + coords[2] = coords[1]; + bld_base->image_op(bld_base, ¶ms); +} + +static void visit_atomic_image(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + struct lp_img_params params; + LLVMValueRef coord_val = get_src(bld_base, instr->src[1]); + LLVMValueRef in_val = get_src(bld_base, instr->src[3]); + LLVMValueRef coords[5]; + const struct glsl_type *type = glsl_without_array(var->type); + + memset(¶ms, 0, sizeof(params)); + + switch (instr->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + params.op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_image_deref_atomic_exchange: + params.op = LLVMAtomicRMWBinOpXchg; + break; + case nir_intrinsic_image_deref_atomic_and: + params.op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_image_deref_atomic_or: + params.op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_image_deref_atomic_xor: + params.op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_image_deref_atomic_umin: + params.op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_image_deref_atomic_umax: + params.op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_image_deref_atomic_imin: + params.op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_image_deref_atomic_imax: + params.op = LLVMAtomicRMWBinOpMax; + break; + default: + break; + } + + params.target = glsl_sampler_to_pipe(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type)); + for (unsigned i = 0; i < 4; i++) + coords[i] = LLVMBuildExtractValue(builder, coord_val, i, ""); + if (params.target == PIPE_TEXTURE_1D_ARRAY) + coords[2] = coords[1]; + params.coords = coords; + if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) { + LLVMValueRef cas_val = get_src(bld_base, instr->src[4]); + params.indata[0] = in_val; + params.indata2[0] = cas_val; + } else + params.indata[0] = in_val; + + params.outdata = result; + params.img_op = (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) ? LP_IMG_ATOMIC_CAS : LP_IMG_ATOMIC; + params.image_index = var->data.binding; + + bld_base->image_op(bld_base, ¶ms); +} + + +static void visit_image_size(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + struct lp_sampler_size_query_params params = { 0 }; + params.texture_unit = var->data.binding; + params.target = glsl_sampler_to_pipe(glsl_get_sampler_dim(var->type), glsl_sampler_type_is_array(var->type)); + params.sizes_out = result; + + bld_base->image_size(bld_base, ¶ms); +} + +static void visit_shared_load(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef offset = get_src(bld_base, instr->src[0]); + bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), + NULL, offset, result); +} + +static void visit_shared_store(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + LLVMValueRef val = get_src(bld_base, instr->src[0]); + LLVMValueRef offset = get_src(bld_base, instr->src[1]); + int writemask = instr->const_index[1]; + int nc = nir_src_num_components(instr->src[0]); + int bitsize = nir_src_bit_size(instr->src[0]); + bld_base->store_mem(bld_base, writemask, nc, bitsize, NULL, offset, val); +} + +static void visit_shared_atomic(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef offset = get_src(bld_base, instr->src[0]); + LLVMValueRef val = get_src(bld_base, instr->src[1]); + LLVMValueRef val2 = NULL; + if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap) + val2 = get_src(bld_base, instr->src[2]); + + bld_base->atomic_mem(bld_base, instr->intrinsic, NULL, offset, val, val2, &result[0]); + +} + +static void visit_barrier(struct lp_build_nir_context *bld_base) +{ + bld_base->barrier(bld_base); +} + +static void visit_discard(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + LLVMValueRef cond = NULL; + if (instr->intrinsic == nir_intrinsic_discard_if) { + cond = get_src(bld_base, instr->src[0]); + cond = cast_type(bld_base, cond, nir_type_int, 32); + } + bld_base->discard(bld_base, cond); +} + +static void visit_load_kernel_input(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef offset = get_src(bld_base, instr->src[0]); + + bool offset_is_uniform = nir_src_is_dynamically_uniform(instr->src[0]); + bld_base->load_kernel_arg(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), + nir_src_bit_size(instr->src[0]), + offset_is_uniform, offset, result); +} + +static void visit_load_global(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef addr = get_src(bld_base, instr->src[0]); + bld_base->load_global(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), + nir_src_bit_size(instr->src[0]), + addr, result); +} + +static void visit_store_global(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + LLVMValueRef val = get_src(bld_base, instr->src[0]); + int nc = nir_src_num_components(instr->src[0]); + int bitsize = nir_src_bit_size(instr->src[0]); + LLVMValueRef addr = get_src(bld_base, instr->src[1]); + int addr_bitsize = nir_src_bit_size(instr->src[1]); + int writemask = instr->const_index[0]; + bld_base->store_global(bld_base, writemask, nc, bitsize, addr_bitsize, addr, val); +} + +static void visit_global_atomic(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + LLVMValueRef addr = get_src(bld_base, instr->src[0]); + LLVMValueRef val = get_src(bld_base, instr->src[1]); + LLVMValueRef val2 = NULL; + int addr_bitsize = nir_src_bit_size(instr->src[0]); + if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) + val2 = get_src(bld_base, instr->src[2]); + + bld_base->atomic_global(bld_base, instr->intrinsic, addr_bitsize, addr, val, val2, &result[0]); +} + +static void visit_intrinsic(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr) +{ + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS] = {0}; + switch (instr->intrinsic) { + case nir_intrinsic_load_deref: + visit_load_var(bld_base, instr, result); + break; + case nir_intrinsic_store_deref: + visit_store_var(bld_base, instr); + break; + case nir_intrinsic_load_ubo: + visit_load_ubo(bld_base, instr, result); + break; + case nir_intrinsic_load_ssbo: + visit_load_ssbo(bld_base, instr, result); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(bld_base, instr); + break; + case nir_intrinsic_get_buffer_size: + visit_get_buffer_size(bld_base, instr, result); + break; + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_primitive_id: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_num_work_groups: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_front_face: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_local_group_size: + case nir_intrinsic_load_work_dim: + bld_base->sysval_intrin(bld_base, instr, result); + break; + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: + visit_discard(bld_base, instr); + break; + case nir_intrinsic_emit_vertex: + bld_base->emit_vertex(bld_base, nir_intrinsic_stream_id(instr)); + break; + case nir_intrinsic_end_primitive: + bld_base->end_primitive(bld_base, nir_intrinsic_stream_id(instr)); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + visit_ssbo_atomic(bld_base, instr, result); + break; + case nir_intrinsic_image_deref_load: + visit_load_image(bld_base, instr, result); + break; + case nir_intrinsic_image_deref_store: + visit_store_image(bld_base, instr); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + visit_atomic_image(bld_base, instr, result); + break; + case nir_intrinsic_image_deref_size: + visit_image_size(bld_base, instr, result); + break; + case nir_intrinsic_load_shared: + visit_shared_load(bld_base, instr, result); + break; + case nir_intrinsic_store_shared: + visit_shared_store(bld_base, instr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + visit_shared_atomic(bld_base, instr, result); + break; + case nir_intrinsic_control_barrier: + visit_barrier(bld_base); + break; + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_load_kernel_input: + visit_load_kernel_input(bld_base, instr, result); + break; + case nir_intrinsic_load_global: + visit_load_global(bld_base, instr, result); + break; + case nir_intrinsic_store_global: + visit_store_global(bld_base, instr); + break; + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + visit_global_atomic(bld_base, instr, result); + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_vote_ieq: + bld_base->vote(bld_base, cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_int, 32), instr, result); + break; + default: + assert(0); + break; + } + if (result[0]) { + assign_dest(bld_base, &instr->dest, result); + } +} + +static void visit_txs(struct lp_build_nir_context *bld_base, nir_tex_instr *instr) +{ + struct lp_sampler_size_query_params params; + LLVMValueRef sizes_out[NIR_MAX_VEC_COMPONENTS]; + LLVMValueRef explicit_lod = NULL; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_lod: + explicit_lod = cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_int, 32); + break; + default: + break; + } + } + + params.target = glsl_sampler_to_pipe(instr->sampler_dim, instr->is_array); + params.texture_unit = instr->texture_index; + params.explicit_lod = explicit_lod; + params.is_sviewinfo = TRUE; + params.sizes_out = sizes_out; + + if (instr->op == nir_texop_query_levels) + params.explicit_lod = bld_base->uint_bld.zero; + bld_base->tex_size(bld_base, ¶ms); + assign_dest(bld_base, &instr->dest, &sizes_out[instr->op == nir_texop_query_levels ? 3 : 0]); +} + +static enum lp_sampler_lod_property lp_build_nir_lod_property(struct lp_build_nir_context *bld_base, + nir_src lod_src) +{ + enum lp_sampler_lod_property lod_property; + + if (nir_src_is_dynamically_uniform(lod_src)) + lod_property = LP_SAMPLER_LOD_SCALAR; + else if (bld_base->shader->info.stage == MESA_SHADER_FRAGMENT) { + if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) + lod_property = LP_SAMPLER_LOD_PER_ELEMENT; + else + lod_property = LP_SAMPLER_LOD_PER_QUAD; + } + else + lod_property = LP_SAMPLER_LOD_PER_ELEMENT; + return lod_property; +} + +static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *instr) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef coords[5]; + LLVMValueRef offsets[3] = { NULL }; + LLVMValueRef explicit_lod = NULL, projector = NULL; + struct lp_sampler_params params; + struct lp_derivatives derivs; + unsigned sample_key = 0; + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + LLVMValueRef texel[NIR_MAX_VEC_COMPONENTS]; + unsigned lod_src = 0; + LLVMValueRef coord_undef = LLVMGetUndef(bld_base->base.int_vec_type); + + memset(¶ms, 0, sizeof(params)); + enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR; + + if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) { + visit_txs(bld_base, instr); + return; + } + if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) + sample_key |= LP_SAMPLER_OP_FETCH << LP_SAMPLER_OP_TYPE_SHIFT; + else if (instr->op == nir_texop_tg4) { + sample_key |= LP_SAMPLER_OP_GATHER << LP_SAMPLER_OP_TYPE_SHIFT; + sample_key |= (instr->component << LP_SAMPLER_GATHER_COMP_SHIFT); + } else if (instr->op == nir_texop_lod) + sample_key |= LP_SAMPLER_OP_LODQ << LP_SAMPLER_OP_TYPE_SHIFT; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + LLVMValueRef coord = get_src(bld_base, instr->src[i].src); + if (instr->coord_components == 1) + coords[0] = coord; + else { + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + coords[chan] = LLVMBuildExtractValue(builder, coord, + chan, ""); + } + for (unsigned chan = instr->coord_components; chan < 5; chan++) + coords[chan] = coord_undef; + + break; + } + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_projector: + projector = lp_build_rcp(&bld_base->base, cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_float, 32)); + break; + case nir_tex_src_comparator: + sample_key |= LP_SAMPLER_SHADOW; + coords[4] = get_src(bld_base, instr->src[i].src); + coords[4] = cast_type(bld_base, coords[4], nir_type_float, 32); + break; + case nir_tex_src_bias: + sample_key |= LP_SAMPLER_LOD_BIAS << LP_SAMPLER_LOD_CONTROL_SHIFT; + lod_src = i; + explicit_lod = cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_float, 32); + break; + case nir_tex_src_lod: + sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT; + lod_src = i; + if (instr->op == nir_texop_txf) + explicit_lod = cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_int, 32); + else + explicit_lod = cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_float, 32); + break; + case nir_tex_src_ddx: { + int deriv_cnt = instr->coord_components; + if (instr->is_array) + deriv_cnt--; + LLVMValueRef deriv_val = get_src(bld_base, instr->src[i].src); + if (deriv_cnt == 1) + derivs.ddx[0] = deriv_val; + else + for (unsigned chan = 0; chan < deriv_cnt; ++chan) + derivs.ddx[chan] = LLVMBuildExtractValue(builder, deriv_val, + chan, ""); + for (unsigned chan = 0; chan < deriv_cnt; ++chan) + derivs.ddx[chan] = cast_type(bld_base, derivs.ddx[chan], nir_type_float, 32); + break; + } + case nir_tex_src_ddy: { + int deriv_cnt = instr->coord_components; + if (instr->is_array) + deriv_cnt--; + LLVMValueRef deriv_val = get_src(bld_base, instr->src[i].src); + if (deriv_cnt == 1) + derivs.ddy[0] = deriv_val; + else + for (unsigned chan = 0; chan < deriv_cnt; ++chan) + derivs.ddy[chan] = LLVMBuildExtractValue(builder, deriv_val, + chan, ""); + for (unsigned chan = 0; chan < deriv_cnt; ++chan) + derivs.ddy[chan] = cast_type(bld_base, derivs.ddy[chan], nir_type_float, 32); + break; + } + case nir_tex_src_offset: { + int offset_cnt = instr->coord_components; + if (instr->is_array) + offset_cnt--; + LLVMValueRef offset_val = get_src(bld_base, instr->src[i].src); + sample_key |= LP_SAMPLER_OFFSETS; + if (offset_cnt == 1) + offsets[0] = cast_type(bld_base, offset_val, nir_type_int, 32); + else { + for (unsigned chan = 0; chan < offset_cnt; ++chan) { + offsets[chan] = LLVMBuildExtractValue(builder, offset_val, + chan, ""); + offsets[chan] = cast_type(bld_base, offsets[chan], nir_type_int, 32); + } + } + break; + } + case nir_tex_src_ms_index: + break; + default: + assert(0); + break; + } + } + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + if (explicit_lod) + lod_property = lp_build_nir_lod_property(bld_base, instr->src[lod_src].src); + + if (instr->op == nir_texop_tex || instr->op == nir_texop_tg4 || instr->op == nir_texop_txb || + instr->op == nir_texop_txl || instr->op == nir_texop_txd || instr->op == nir_texop_lod) + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + coords[chan] = cast_type(bld_base, coords[chan], nir_type_float, 32); + else if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + coords[chan] = cast_type(bld_base, coords[chan], nir_type_int, 32); + + if (instr->is_array && instr->sampler_dim == GLSL_SAMPLER_DIM_1D) { + /* move layer coord for 1d arrays. */ + coords[2] = coords[1]; + coords[1] = coord_undef; + } + + if (projector) { + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + coords[chan] = lp_build_mul(&bld_base->base, coords[chan], projector); + if (sample_key & LP_SAMPLER_SHADOW) + coords[4] = lp_build_mul(&bld_base->base, coords[4], projector); + } + + uint32_t base_index = 0; + if (!texture_deref_instr) { + int samp_src_index = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle); + if (samp_src_index == -1) { + base_index = instr->sampler_index; + } + } + + if (instr->op == nir_texop_txd) { + sample_key |= LP_SAMPLER_LOD_DERIVATIVES << LP_SAMPLER_LOD_CONTROL_SHIFT; + params.derivs = &derivs; + if (bld_base->shader->info.stage == MESA_SHADER_FRAGMENT) { + if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) + lod_property = LP_SAMPLER_LOD_PER_ELEMENT; + else + lod_property = LP_SAMPLER_LOD_PER_QUAD; + } else + lod_property = LP_SAMPLER_LOD_PER_ELEMENT; + } + + sample_key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT; + params.sample_key = sample_key; + params.offsets = offsets; + params.texture_index = base_index; + params.sampler_index = base_index; + params.coords = coords; + params.texel = texel; + params.lod = explicit_lod; + bld_base->tex(bld_base, ¶ms); + assign_dest(bld_base, &instr->dest, texel); +} + +static void visit_ssa_undef(struct lp_build_nir_context *bld_base, + const nir_ssa_undef_instr *instr) +{ + unsigned num_components = instr->def.num_components; + LLVMValueRef undef[NIR_MAX_VEC_COMPONENTS]; + struct lp_build_context *undef_bld = get_int_bld(bld_base, true, instr->def.bit_size); + for (unsigned i = 0; i < num_components; i++) + undef[i] = LLVMGetUndef(undef_bld->vec_type); + assign_ssa_dest(bld_base, &instr->def, undef); +} + +static void visit_jump(struct lp_build_nir_context *bld_base, + const nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + bld_base->break_stmt(bld_base); + break; + case nir_jump_continue: + bld_base->continue_stmt(bld_base); + break; + default: + unreachable("Unknown jump instr\n"); + } +} + +static void visit_deref(struct lp_build_nir_context *bld_base, + nir_deref_instr *instr) +{ + if (instr->mode != nir_var_mem_shared && + instr->mode != nir_var_mem_global) + return; + LLVMValueRef result = NULL; + switch(instr->deref_type) { + case nir_deref_type_var: { + struct hash_entry *entry = _mesa_hash_table_search(bld_base->vars, instr->var); + result = entry->data; + break; + } + default: + unreachable("Unhandled deref_instr deref type"); + } + + assign_ssa(bld_base, instr->dest.ssa.index, result); +} + +static void visit_block(struct lp_build_nir_context *bld_base, nir_block *block) +{ + nir_foreach_instr(instr, block) + { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu(bld_base, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(bld_base, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(bld_base, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(bld_base, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + assert(0); + break; + case nir_instr_type_ssa_undef: + visit_ssa_undef(bld_base, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_jump: + visit_jump(bld_base, nir_instr_as_jump(instr)); + break; + case nir_instr_type_deref: + visit_deref(bld_base, nir_instr_as_deref(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } +} + +static void visit_if(struct lp_build_nir_context *bld_base, nir_if *if_stmt) +{ + LLVMValueRef cond = get_src(bld_base, if_stmt->condition); + + bld_base->if_cond(bld_base, cond); + visit_cf_list(bld_base, &if_stmt->then_list); + + if (!exec_list_is_empty(&if_stmt->else_list)) { + bld_base->else_stmt(bld_base); + visit_cf_list(bld_base, &if_stmt->else_list); + } + bld_base->endif_stmt(bld_base); +} + +static void visit_loop(struct lp_build_nir_context *bld_base, nir_loop *loop) +{ + bld_base->bgnloop(bld_base); + visit_cf_list(bld_base, &loop->body); + bld_base->endloop(bld_base); +} + +static void visit_cf_list(struct lp_build_nir_context *bld_base, + struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) + { + switch (node->type) { + case nir_cf_node_block: + visit_block(bld_base, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + visit_if(bld_base, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + visit_loop(bld_base, nir_cf_node_as_loop(node)); + break; + + default: + assert(0); + } + } +} + +static void +handle_shader_output_decl(struct lp_build_nir_context *bld_base, + struct nir_shader *nir, + struct nir_variable *variable) +{ + bld_base->emit_var_decl(bld_base, variable); +} + +/* vector registers are stored as arrays in LLVM side, + so we can use GEP on them, as to do exec mask stores + we need to operate on a single components. + arrays are: + 0.x, 1.x, 2.x, 3.x + 0.y, 1.y, 2.y, 3.y + .... +*/ +static LLVMTypeRef get_register_type(struct lp_build_nir_context *bld_base, + nir_register *reg) +{ + struct lp_build_context *int_bld = get_int_bld(bld_base, true, reg->bit_size); + + LLVMTypeRef type = int_bld->vec_type; + if (reg->num_array_elems) + type = LLVMArrayType(type, reg->num_array_elems); + if (reg->num_components > 1) + type = LLVMArrayType(type, reg->num_components); + + return type; +} + + +bool lp_build_nir_llvm( + struct lp_build_nir_context *bld_base, + struct nir_shader *nir) +{ + struct nir_function *func; + + nir_convert_from_ssa(nir, true); + nir_lower_locals_to_regs(nir); + nir_remove_dead_derefs(nir); + nir_remove_dead_variables(nir, nir_var_function_temp); + + nir_foreach_variable(variable, &nir->outputs) + handle_shader_output_decl(bld_base, nir, variable); + + bld_base->regs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + bld_base->vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + func = (struct nir_function *)exec_list_get_head(&nir->functions); + + nir_foreach_register(reg, &func->impl->registers) { + LLVMTypeRef type = get_register_type(bld_base, reg); + LLVMValueRef reg_alloc = lp_build_alloca_undef(bld_base->base.gallivm, + type, "reg"); + _mesa_hash_table_insert(bld_base->regs, reg, reg_alloc); + } + nir_index_ssa_defs(func->impl); + bld_base->ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); + visit_cf_list(bld_base, &func->impl->body); + + free(bld_base->ssa_defs); + ralloc_free(bld_base->vars); + ralloc_free(bld_base->regs); + return true; +} + +/* do some basic opts to remove some things we don't want to see. */ +void lp_build_opt_nir(struct nir_shader *nir) +{ + bool progress; + do { + progress = false; + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, nir_opt_algebraic); + NIR_PASS_V(nir, nir_lower_pack); + } while (progress); + nir_lower_bool_to_int32(nir); +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,277 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_BLD_NIR_H +#define LP_BLD_NIR_H + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_limits.h" +#include "lp_bld_type.h" + +#include "gallivm/lp_bld_tgsi.h" +#include "nir.h" + +struct nir_shader; + +void lp_build_nir_soa(struct gallivm_state *gallivm, + struct nir_shader *shader, + const struct lp_build_tgsi_params *params, + LLVMValueRef (*outputs)[4]); + +struct lp_build_nir_context +{ + struct lp_build_context base; + struct lp_build_context uint_bld; + struct lp_build_context int_bld; + struct lp_build_context uint8_bld; + struct lp_build_context int8_bld; + struct lp_build_context uint16_bld; + struct lp_build_context int16_bld; + struct lp_build_context dbl_bld; + struct lp_build_context uint64_bld; + struct lp_build_context int64_bld; + + LLVMValueRef *ssa_defs; + struct hash_table *regs; + struct hash_table *vars; + + nir_shader *shader; + + void (*load_ubo)(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + bool offset_is_uniform, + LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + + void (*load_kernel_arg)(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + unsigned offset_bit_size, + bool offset_is_uniform, + LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + + void (*load_global)(struct lp_build_nir_context *bld_base, + unsigned nc, unsigned bit_size, + unsigned offset_bit_size, + LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + + void (*store_global)(struct lp_build_nir_context *bld_base, + unsigned writemask, + unsigned nc, unsigned bit_size, + unsigned addr_bit_size, + LLVMValueRef addr, LLVMValueRef dst); + + void (*atomic_global)(struct lp_build_nir_context *bld_base, + nir_intrinsic_op op, + unsigned addr_bit_size, + LLVMValueRef addr, + LLVMValueRef val, LLVMValueRef val2, + LLVMValueRef *result); + + /* for SSBO and shared memory */ + void (*load_mem)(struct lp_build_nir_context *bld_base, + unsigned nc, unsigned bit_size, + LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + void (*store_mem)(struct lp_build_nir_context *bld_base, + unsigned writemask, unsigned nc, unsigned bit_size, + LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst); + + void (*atomic_mem)(struct lp_build_nir_context *bld_base, + nir_intrinsic_op op, + LLVMValueRef index, LLVMValueRef offset, + LLVMValueRef val, LLVMValueRef val2, + LLVMValueRef *result); + + void (*barrier)(struct lp_build_nir_context *bld_base); + + void (*image_op)(struct lp_build_nir_context *bld_base, + struct lp_img_params *params); + void (*image_size)(struct lp_build_nir_context *bld_base, + struct lp_sampler_size_query_params *params); + LLVMValueRef (*get_buffer_size)(struct lp_build_nir_context *bld_base, + LLVMValueRef index); + + void (*load_var)(struct lp_build_nir_context *bld_base, + nir_variable_mode deref_mode, + unsigned num_components, + unsigned bit_size, + nir_variable *var, + unsigned vertex_index, + unsigned const_index, + LLVMValueRef indir_index, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + void (*store_var)(struct lp_build_nir_context *bld_base, + nir_variable_mode deref_mode, + unsigned bit_size, + unsigned num_components, + unsigned writemask, + unsigned const_index, + nir_variable *var, LLVMValueRef dst); + + LLVMValueRef (*load_reg)(struct lp_build_nir_context *bld_base, + struct lp_build_context *reg_bld, + const nir_reg_src *reg, + LLVMValueRef indir_src, + LLVMValueRef reg_storage); + void (*store_reg)(struct lp_build_nir_context *bld_base, + struct lp_build_context *reg_bld, + const nir_reg_dest *reg, + unsigned writemask, + LLVMValueRef indir_src, + LLVMValueRef reg_storage, + LLVMValueRef dst[NIR_MAX_VEC_COMPONENTS]); + + void (*emit_var_decl)(struct lp_build_nir_context *bld_base, + nir_variable *var); + + void (*tex)(struct lp_build_nir_context *bld_base, + struct lp_sampler_params *params); + + void (*tex_size)(struct lp_build_nir_context *bld_base, + struct lp_sampler_size_query_params *params); + + void (*sysval_intrin)(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); + void (*discard)(struct lp_build_nir_context *bld_base, + LLVMValueRef cond); + + void (*bgnloop)(struct lp_build_nir_context *bld_base); + void (*endloop)(struct lp_build_nir_context *bld_base); + void (*if_cond)(struct lp_build_nir_context *bld_base, LLVMValueRef cond); + void (*else_stmt)(struct lp_build_nir_context *bld_base); + void (*endif_stmt)(struct lp_build_nir_context *bld_base); + void (*break_stmt)(struct lp_build_nir_context *bld_base); + void (*continue_stmt)(struct lp_build_nir_context *bld_base); + + void (*emit_vertex)(struct lp_build_nir_context *bld_base, uint32_t stream_id); + void (*end_primitive)(struct lp_build_nir_context *bld_base, uint32_t stream_id); + + void (*vote)(struct lp_build_nir_context *bld_base, LLVMValueRef src, nir_intrinsic_instr *instr, LLVMValueRef dst[4]); +// LLVMValueRef main_function +}; + +struct lp_build_nir_soa_context +{ + struct lp_build_nir_context bld_base; + + /* Builder for scalar elements of shader's data type (float) */ + struct lp_build_context elem_bld; + struct lp_build_context uint_elem_bld; + + LLVMValueRef consts_ptr; + LLVMValueRef const_sizes_ptr; + LLVMValueRef consts[LP_MAX_TGSI_CONST_BUFFERS]; + LLVMValueRef consts_sizes[LP_MAX_TGSI_CONST_BUFFERS]; + const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS]; + LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS]; + LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; + + LLVMValueRef ssbo_ptr; + LLVMValueRef ssbo_sizes_ptr; + LLVMValueRef ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; + LLVMValueRef ssbo_sizes[LP_MAX_TGSI_SHADER_BUFFERS]; + + LLVMValueRef shared_ptr; + + const struct lp_build_coro_suspend_info *coro; + + const struct lp_build_sampler_soa *sampler; + const struct lp_build_image_soa *image; + + const struct lp_build_gs_iface *gs_iface; + LLVMValueRef emitted_prims_vec_ptr; + LLVMValueRef total_emitted_vertices_vec_ptr; + LLVMValueRef emitted_vertices_vec_ptr; + LLVMValueRef max_output_vertices_vec; + struct lp_bld_tgsi_system_values system_values; + + nir_variable_mode indirects; + struct lp_build_mask_context *mask; + struct lp_exec_mask exec_mask; + + /* We allocate/use this array of inputs if (indirects & nir_var_shader_in) is + * set. The inputs[] array above is unused then. + */ + LLVMValueRef inputs_array; + + LLVMValueRef kernel_args_ptr; +}; + +bool +lp_build_nir_llvm(struct lp_build_nir_context *bld_base, + struct nir_shader *nir); + +void lp_build_opt_nir(struct nir_shader *nir); + +static inline LLVMValueRef +lp_nir_array_build_gather_values(LLVMBuilderRef builder, + LLVMValueRef * values, + unsigned value_count) +{ + LLVMTypeRef arr_type = LLVMArrayType(LLVMTypeOf(values[0]), value_count); + LLVMValueRef arr = LLVMGetUndef(arr_type); + unsigned i; + + for (i = 0; i < value_count; i++) { + arr = LLVMBuildInsertValue(builder, arr, values[i], i, ""); + } + return arr; +} + + +static inline struct lp_build_context *get_int_bld(struct lp_build_nir_context *bld_base, + bool is_unsigned, + unsigned op_bit_size) +{ + if (is_unsigned) { + switch (op_bit_size) { + case 64: + return &bld_base->uint64_bld; + case 32: + default: + return &bld_base->uint_bld; + case 16: + return &bld_base->uint16_bld; + case 8: + return &bld_base->uint8_bld; + } + } else { + switch (op_bit_size) { + case 64: + return &bld_base->int64_bld; + default: + case 32: + return &bld_base->int_bld; + case 16: + return &bld_base->int16_bld; + case 8: + return &bld_base->int8_bld; + } + } +} + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,1613 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#include "lp_bld_nir.h" +#include "lp_bld_init.h" +#include "lp_bld_flow.h" +#include "lp_bld_logic.h" +#include "lp_bld_gather.h" +#include "lp_bld_const.h" +#include "lp_bld_struct.h" +#include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" +#include "lp_bld_coro.h" +#include "lp_bld_printf.h" +#include "util/u_math.h" +/* + * combine the execution mask if there is one with the current mask. + */ +static LLVMValueRef +mask_vec(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context * bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct lp_exec_mask *exec_mask = &bld->exec_mask; + LLVMValueRef bld_mask = bld->mask ? lp_build_mask_value(bld->mask) : NULL; + if (!exec_mask->has_mask) { + return bld_mask; + } + if (!bld_mask) + return exec_mask->exec_mask; + return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask), + exec_mask->exec_mask, ""); +} + +static LLVMValueRef +emit_fetch_64bit( + struct lp_build_nir_context * bld_base, + LLVMValueRef input, + LLVMValueRef input2) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef res; + int i; + LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)]; + int len = bld_base->base.type.length * 2; + assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32))); + + for (i = 0; i < bld_base->base.type.length * 2; i+=2) { + shuffles[i] = lp_build_const_int32(gallivm, i / 2); + shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length); + } + res = LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), ""); + + return LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, ""); +} + +static void +emit_store_64bit_chan(struct lp_build_nir_context *bld_base, + LLVMValueRef chan_ptr, + LLVMValueRef chan_ptr2, + LLVMValueRef value) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *float_bld = &bld_base->base; + unsigned i; + LLVMValueRef temp, temp2; + LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32]; + int len = bld_base->base.type.length * 2; + + value = LLVMBuildBitCast(gallivm->builder, value, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), len), ""); + for (i = 0; i < bld_base->base.type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i * 2); + shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); + } + + temp = LLVMBuildShuffleVector(builder, value, + LLVMGetUndef(LLVMTypeOf(value)), + LLVMConstVector(shuffles, + bld_base->base.type.length), + ""); + temp2 = LLVMBuildShuffleVector(builder, value, + LLVMGetUndef(LLVMTypeOf(value)), + LLVMConstVector(shuffles2, + bld_base->base.type.length), + ""); + + lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr); + lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2); +} + +static LLVMValueRef +get_soa_array_offsets(struct lp_build_context *uint_bld, + LLVMValueRef indirect_index, + int num_components, + unsigned chan_index, + bool need_perelement_offset) +{ + struct gallivm_state *gallivm = uint_bld->gallivm; + LLVMValueRef chan_vec = + lp_build_const_int_vec(uint_bld->gallivm, uint_bld->type, chan_index); + LLVMValueRef length_vec = + lp_build_const_int_vec(gallivm, uint_bld->type, uint_bld->type.length); + LLVMValueRef index_vec; + + /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */ + index_vec = lp_build_mul(uint_bld, indirect_index, lp_build_const_int_vec(uint_bld->gallivm, uint_bld->type, num_components)); + index_vec = lp_build_add(uint_bld, index_vec, chan_vec); + index_vec = lp_build_mul(uint_bld, index_vec, length_vec); + + if (need_perelement_offset) { + LLVMValueRef pixel_offsets; + unsigned i; + /* build pixel offset vector: {0, 1, 2, 3, ...} */ + pixel_offsets = uint_bld->undef; + for (i = 0; i < uint_bld->type.length; i++) { + LLVMValueRef ii = lp_build_const_int32(gallivm, i); + pixel_offsets = LLVMBuildInsertElement(gallivm->builder, pixel_offsets, + ii, ii, ""); + } + index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets); + } + return index_vec; +} + +static LLVMValueRef +build_gather(struct lp_build_nir_context *bld_base, + struct lp_build_context *bld, + LLVMValueRef base_ptr, + LLVMValueRef indexes, + LLVMValueRef overflow_mask, + LLVMValueRef indexes2) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef res; + unsigned i; + + if (indexes2) + res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2)); + else + res = bld->undef; + /* + * overflow_mask is a vector telling us which channels + * in the vector overflowed. We use the overflow behavior for + * constant buffers which is defined as: + * Out of bounds access to constant buffer returns 0 in all + * components. Out of bounds behavior is always with respect + * to the size of the buffer bound at that slot. + */ + + if (overflow_mask) { + /* + * We avoid per-element control flow here (also due to llvm going crazy, + * though I suspect it's better anyway since overflow is likely rare). + * Note that since we still fetch from buffers even if num_elements was + * zero (in this case we'll fetch from index zero) the jit func callers + * MUST provide valid fake constant buffers of size 4x32 (the values do + * not matter), otherwise we'd still need (not per element though) + * control flow. + */ + indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes); + if (indexes2) + indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2); + } + + /* + * Loop over elements of index_vec, load scalar value, insert it into 'res'. + */ + for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) { + LLVMValueRef si, di; + LLVMValueRef index; + LLVMValueRef scalar_ptr, scalar; + + di = lp_build_const_int32(gallivm, i); + if (indexes2) + si = lp_build_const_int32(gallivm, i >> 1); + else + si = di; + + if (indexes2 && (i & 1)) { + index = LLVMBuildExtractElement(builder, + indexes2, si, ""); + } else { + index = LLVMBuildExtractElement(builder, + indexes, si, ""); + } + scalar_ptr = LLVMBuildGEP(builder, base_ptr, + &index, 1, "gather_ptr"); + scalar = LLVMBuildLoad(builder, scalar_ptr, ""); + + res = LLVMBuildInsertElement(builder, res, scalar, di, ""); + } + + if (overflow_mask) { + if (indexes2) { + res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, ""); + overflow_mask = LLVMBuildSExt(builder, overflow_mask, + bld_base->dbl_bld.int_vec_type, ""); + res = lp_build_select(&bld_base->dbl_bld, overflow_mask, + bld_base->dbl_bld.zero, res); + } else + res = lp_build_select(bld, overflow_mask, bld->zero, res); + } + + return res; +} + +/** + * Scatter/store vector. + */ +static void +emit_mask_scatter(struct lp_build_nir_soa_context *bld, + LLVMValueRef base_ptr, + LLVMValueRef indexes, + LLVMValueRef values, + struct lp_exec_mask *mask) +{ + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + unsigned i; + LLVMValueRef pred = mask->has_mask ? mask->exec_mask : NULL; + + /* + * Loop over elements of index_vec, store scalar value. + */ + for (i = 0; i < bld->bld_base.base.type.length; i++) { + LLVMValueRef ii = lp_build_const_int32(gallivm, i); + LLVMValueRef index = LLVMBuildExtractElement(builder, indexes, ii, ""); + LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr, &index, 1, "scatter_ptr"); + LLVMValueRef val = LLVMBuildExtractElement(builder, values, ii, "scatter_val"); + LLVMValueRef scalar_pred = pred ? + LLVMBuildExtractElement(builder, pred, ii, "scatter_pred") : NULL; + + if (0) + lp_build_printf(gallivm, "scatter %d: val %f at %d %p\n", + ii, val, index, scalar_ptr); + + if (scalar_pred) { + LLVMValueRef real_val, dst_val; + dst_val = LLVMBuildLoad(builder, scalar_ptr, ""); + real_val = lp_build_select(&bld->uint_elem_bld, scalar_pred, val, dst_val); + LLVMBuildStore(builder, real_val, scalar_ptr); + } + else { + LLVMBuildStore(builder, val, scalar_ptr); + } + } +} + +static void emit_load_var(struct lp_build_nir_context *bld_base, + nir_variable_mode deref_mode, + unsigned num_components, + unsigned bit_size, + nir_variable *var, + unsigned vertex_index, + unsigned const_index, + LLVMValueRef indir_index, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + int dmul = bit_size == 64 ? 2 : 1; + switch (deref_mode) { + case nir_var_shader_in: { + for (unsigned i = 0; i < num_components; i++) { + int idx = (i * dmul) + var->data.location_frac; + if (bld->gs_iface) { + LLVMValueRef vertex_index_val = lp_build_const_int32(gallivm, vertex_index); + LLVMValueRef attrib_index_val = lp_build_const_int32(gallivm, const_index + var->data.driver_location); + LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx); + LLVMValueRef result2; + result[i] = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base, + false, vertex_index_val, 0, attrib_index_val, swizzle_index_val); + if (bit_size == 64) { + LLVMValueRef swizzle_index_val = lp_build_const_int32(gallivm, idx + 1); + result2 = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base, + false, vertex_index_val, 0, attrib_index_val, swizzle_index_val); + result[i] = emit_fetch_64bit(bld_base, result[i], result2); + } + } else { + if (indir_index) { + LLVMValueRef attrib_index_val = lp_build_add(&bld_base->uint_bld, indir_index, lp_build_const_int_vec(gallivm, bld_base->uint_bld.type, var->data.driver_location)); + LLVMValueRef index_vec = get_soa_array_offsets(&bld_base->uint_bld, + attrib_index_val, 4, idx, + TRUE); + LLVMValueRef index_vec2 = NULL; + LLVMTypeRef fptr_type; + LLVMValueRef inputs_array; + fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0); + inputs_array = LLVMBuildBitCast(gallivm->builder, bld->inputs_array, fptr_type, ""); + + if (bit_size == 64) + index_vec2 = get_soa_array_offsets(&bld_base->uint_bld, + indir_index, 4, idx + 1, TRUE); + + /* Gather values from the input register array */ + result[i] = build_gather(bld_base, &bld_base->base, inputs_array, index_vec, NULL, index_vec2); + } else { + if (bld->indirects & nir_var_shader_in) { + LLVMValueRef lindex = lp_build_const_int32(gallivm, + var->data.driver_location * 4 + idx); + LLVMValueRef input_ptr = lp_build_pointer_get(gallivm->builder, + bld->inputs_array, lindex); + if (bit_size == 64) { + LLVMValueRef lindex2 = lp_build_const_int32(gallivm, + var->data.driver_location * 4 + (idx + 1)); + LLVMValueRef input_ptr2 = lp_build_pointer_get(gallivm->builder, + bld->inputs_array, lindex2); + result[i] = emit_fetch_64bit(bld_base, input_ptr, input_ptr2); + } else { + result[i] = input_ptr; + } + } else { + if (bit_size == 64) { + LLVMValueRef tmp[2]; + tmp[0] = bld->inputs[var->data.driver_location + const_index][idx]; + tmp[1] = bld->inputs[var->data.driver_location + const_index][idx + 1]; + result[i] = emit_fetch_64bit(bld_base, tmp[0], tmp[1]); + } else { + result[i] = bld->inputs[var->data.driver_location + const_index][idx]; + } + } + } + } + } + } + default: + break; + } +} + +static void emit_store_chan(struct lp_build_nir_context *bld_base, + nir_variable_mode deref_mode, + unsigned bit_size, + unsigned location, unsigned comp, + unsigned chan, + LLVMValueRef dst) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct lp_build_context *float_bld = &bld_base->base; + + if (bit_size == 64) { + chan *= 2; + chan += comp; + if (chan >= 4) { + chan -= 4; + location++; + } + emit_store_64bit_chan(bld_base, bld->outputs[location][chan], + bld->outputs[location][chan + 1], dst); + } else { + dst = LLVMBuildBitCast(builder, dst, float_bld->vec_type, ""); + lp_exec_mask_store(&bld->exec_mask, float_bld, dst, + bld->outputs[location][chan + comp]); + } +} + +static void emit_store_var(struct lp_build_nir_context *bld_base, + nir_variable_mode deref_mode, + unsigned bit_size, + unsigned num_components, + unsigned writemask, + unsigned const_index, + nir_variable *var, LLVMValueRef dst) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + switch (deref_mode) { + case nir_var_shader_out: { + unsigned location = var->data.driver_location + const_index; + unsigned comp = var->data.location_frac; + if (bld_base->shader->info.stage == MESA_SHADER_FRAGMENT) { + if (var->data.location == FRAG_RESULT_STENCIL) + comp = 1; + else if (var->data.location == FRAG_RESULT_DEPTH) + comp = 2; + } + for (unsigned chan = 0; chan < num_components; chan++) { + if (writemask & (1u << chan)) { + LLVMValueRef chan_val = (num_components == 1) ? dst : LLVMBuildExtractValue(builder, dst, chan, ""); + emit_store_chan(bld_base, deref_mode, bit_size, location, comp, chan, chan_val); + } + } + break; + } + default: + break; + } +} + +static LLVMValueRef emit_load_reg(struct lp_build_nir_context *bld_base, + struct lp_build_context *reg_bld, + const nir_reg_src *reg, + LLVMValueRef indir_src, + LLVMValueRef reg_storage) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + int nc = reg->reg->num_components; + LLVMValueRef vals[NIR_MAX_VEC_COMPONENTS]; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + if (reg->reg->num_array_elems) { + LLVMValueRef indirect_val = lp_build_const_int_vec(gallivm, uint_bld->type, reg->base_offset); + if (reg->indirect) { + LLVMValueRef max_index = lp_build_const_int_vec(gallivm, uint_bld->type, reg->reg->num_array_elems - 1); + indirect_val = LLVMBuildAdd(builder, indirect_val, indir_src, ""); + indirect_val = lp_build_min(uint_bld, indirect_val, max_index); + } + reg_storage = LLVMBuildBitCast(builder, reg_storage, LLVMPointerType(reg_bld->elem_type, 0), ""); + for (unsigned i = 0; i < nc; i++) { + LLVMValueRef indirect_offset = get_soa_array_offsets(uint_bld, indirect_val, nc, i, TRUE); + vals[i] = build_gather(bld_base, reg_bld, reg_storage, indirect_offset, NULL, NULL); + } + } else { + for (unsigned i = 0; i < nc; i++) { + LLVMValueRef this_storage = nc == 1 ? reg_storage : lp_build_array_get_ptr(gallivm, reg_storage, + lp_build_const_int32(gallivm, i)); + vals[i] = LLVMBuildLoad(builder, this_storage, ""); + } + } + return nc == 1 ? vals[0] : lp_nir_array_build_gather_values(builder, vals, nc); +} + +static void emit_store_reg(struct lp_build_nir_context *bld_base, + struct lp_build_context *reg_bld, + const nir_reg_dest *reg, + unsigned writemask, + LLVMValueRef indir_src, + LLVMValueRef reg_storage, + LLVMValueRef dst[NIR_MAX_VEC_COMPONENTS]) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + int nc = reg->reg->num_components; + if (reg->reg->num_array_elems > 0) { + LLVMValueRef indirect_val = lp_build_const_int_vec(gallivm, uint_bld->type, reg->base_offset); + if (reg->indirect) { + LLVMValueRef max_index = lp_build_const_int_vec(gallivm, uint_bld->type, reg->reg->num_array_elems - 1); + indirect_val = LLVMBuildAdd(builder, indirect_val, indir_src, ""); + indirect_val = lp_build_min(uint_bld, indirect_val, max_index); + } + reg_storage = LLVMBuildBitCast(builder, reg_storage, LLVMPointerType(reg_bld->elem_type, 0), ""); + for (unsigned i = 0; i < nc; i++) { + if (!(writemask & (1 << i))) + continue; + LLVMValueRef indirect_offset = get_soa_array_offsets(uint_bld, indirect_val, nc, i, TRUE); + dst[i] = LLVMBuildBitCast(builder, dst[i], reg_bld->vec_type, ""); + emit_mask_scatter(bld, reg_storage, indirect_offset, dst[i], &bld->exec_mask); + } + return; + } + + for (unsigned i = 0; i < nc; i++) { + LLVMValueRef this_storage = nc == 1 ? reg_storage : lp_build_array_get_ptr(gallivm, reg_storage, + lp_build_const_int32(gallivm, i)); + dst[i] = LLVMBuildBitCast(builder, dst[i], reg_bld->vec_type, ""); + lp_exec_mask_store(&bld->exec_mask, reg_bld, dst[i], this_storage); + } +} + +static void emit_load_kernel_arg(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + unsigned offset_bit_size, + bool offset_is_uniform, + LLVMValueRef offset, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *bld_broad = get_int_bld(bld_base, true, bit_size); + LLVMValueRef kernel_args_ptr = bld->kernel_args_ptr; + unsigned size_shift = 0; + struct lp_build_context *bld_offset = get_int_bld(bld_base, true, offset_bit_size); + if (bit_size == 16) + size_shift = 1; + else if (bit_size == 32) + size_shift = 2; + else if (bit_size == 64) + size_shift = 3; + if (size_shift) + offset = lp_build_shr(bld_offset, offset, lp_build_const_int_vec(gallivm, bld_offset->type, size_shift)); + + LLVMTypeRef ptr_type = LLVMPointerType(bld_broad->elem_type, 0); + kernel_args_ptr = LLVMBuildBitCast(builder, kernel_args_ptr, ptr_type, ""); + + if (offset_is_uniform) { + offset = LLVMBuildExtractElement(builder, offset, lp_build_const_int32(gallivm, 0), ""); + + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef this_offset = LLVMBuildAdd(builder, offset, offset_bit_size == 64 ? lp_build_const_int64(gallivm, c) : lp_build_const_int32(gallivm, c), ""); + + LLVMValueRef scalar = lp_build_pointer_get(builder, kernel_args_ptr, this_offset); + result[c] = lp_build_broadcast_scalar(bld_broad, scalar); + } + } +} + +static LLVMValueRef global_addr_to_ptr(struct gallivm_state *gallivm, LLVMValueRef addr_ptr, unsigned bit_size) +{ + LLVMBuilderRef builder = gallivm->builder; + switch (bit_size) { + case 8: + addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); + break; + case 16: + addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0), ""); + break; + case 32: + default: + addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), ""); + break; + case 64: + addr_ptr = LLVMBuildIntToPtr(builder, addr_ptr, LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0), ""); + break; + } + return addr_ptr; +} + +static void emit_load_global(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + unsigned addr_bit_size, + LLVMValueRef addr, + LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + struct lp_build_context *res_bld; + + res_bld = get_int_bld(bld_base, true, bit_size); + + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef result = lp_build_alloca(gallivm, res_bld->vec_type, ""); + + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr, + loop_state.counter, ""); + addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, bit_size); + + LLVMValueRef value_ptr = lp_build_pointer_get(builder, addr_ptr, lp_build_const_int32(gallivm, c)); + + LLVMValueRef temp_res; + temp_res = LLVMBuildLoad(builder, result, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, value_ptr, loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, result); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + outval[c] = LLVMBuildLoad(builder, result, ""); + } +} + +static void emit_store_global(struct lp_build_nir_context *bld_base, + unsigned writemask, + unsigned nc, unsigned bit_size, + unsigned addr_bit_size, + LLVMValueRef addr, + LLVMValueRef dst) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + + for (unsigned c = 0; c < nc; c++) { + if (!(writemask & (1u << c))) + continue; + LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, ""); + + LLVMValueRef exec_mask = mask_vec(bld_base); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, + loop_state.counter, ""); + + LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr, + loop_state.counter, ""); + addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, bit_size); + switch (bit_size) { + case 32: + value_ptr = LLVMBuildBitCast(builder, value_ptr, LLVMInt32TypeInContext(gallivm->context), ""); + break; + case 64: + value_ptr = LLVMBuildBitCast(builder, value_ptr, LLVMInt64TypeInContext(gallivm->context), ""); + break; + default: + break; + } + struct lp_build_if_state ifthen; + + LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + lp_build_pointer_set(builder, addr_ptr, lp_build_const_int32(gallivm, c), value_ptr); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + } +} + +static void emit_atomic_global(struct lp_build_nir_context *bld_base, + nir_intrinsic_op nir_op, + unsigned addr_bit_size, + LLVMValueRef addr, + LLVMValueRef val, LLVMValueRef val2, + LLVMValueRef *result) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMAtomicRMWBinOp op; + switch (nir_op) { + case nir_intrinsic_global_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_global_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; + case nir_intrinsic_global_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_global_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_global_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_global_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_global_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_global_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_global_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + default: + break; + } + + LLVMValueRef atom_res = lp_build_alloca(gallivm, + uint_bld->vec_type, ""); + LLVMValueRef exec_mask = mask_vec(bld_base); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, + loop_state.counter, ""); + + LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr, + loop_state.counter, ""); + addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, 32); + struct lp_build_if_state ifthen; + LLVMValueRef cond, temp_res; + LLVMValueRef scalar; + cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + if (nir_op == nir_intrinsic_global_atomic_comp_swap) { + LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, val2, + loop_state.counter, ""); + cas_src_ptr = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, uint_bld->elem_type, ""); + scalar = LLVMBuildAtomicCmpXchg(builder, addr_ptr, value_ptr, + cas_src_ptr, + LLVMAtomicOrderingSequentiallyConsistent, + LLVMAtomicOrderingSequentiallyConsistent, + false); + scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, ""); + } else { + scalar = LLVMBuildAtomicRMW(builder, op, + addr_ptr, value_ptr, + LLVMAtomicOrderingSequentiallyConsistent, + false); + } + temp_res = LLVMBuildLoad(builder, atom_res, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, atom_res); + lp_build_else(&ifthen); + temp_res = LLVMBuildLoad(builder, atom_res, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, atom_res); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + *result = LLVMBuildLoad(builder, atom_res, ""); +} + +static void emit_load_ubo(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + bool offset_is_uniform, + LLVMValueRef index, + LLVMValueRef offset, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + struct lp_build_context *bld_broad = bit_size == 64 ? &bld_base->dbl_bld : &bld_base->base; + LLVMValueRef consts_ptr = lp_build_array_get(gallivm, bld->consts_ptr, index); + unsigned size_shift = 0; + if (bit_size == 32) + size_shift = 2; + else if (bit_size == 64) + size_shift = 3; + if (size_shift) + offset = lp_build_shr(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, size_shift)); + if (bit_size == 64) { + LLVMTypeRef dptr_type = LLVMPointerType(bld_base->dbl_bld.elem_type, 0); + consts_ptr = LLVMBuildBitCast(builder, consts_ptr, dptr_type, ""); + } + + if (offset_is_uniform) { + offset = LLVMBuildExtractElement(builder, offset, lp_build_const_int32(gallivm, 0), ""); + + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef this_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), ""); + + LLVMValueRef scalar = lp_build_pointer_get(builder, consts_ptr, this_offset); + result[c] = lp_build_broadcast_scalar(bld_broad, scalar); + } + } else { + LLVMValueRef overflow_mask; + LLVMValueRef num_consts = lp_build_array_get(gallivm, bld->const_sizes_ptr, index); + + num_consts = LLVMBuildShl(gallivm->builder, num_consts, lp_build_const_int32(gallivm, 4), ""); + num_consts = lp_build_broadcast_scalar(uint_bld, num_consts); + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef this_offset = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c)); + overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL, + this_offset, num_consts); + + result[c] = build_gather(bld_base, bld_broad, consts_ptr, this_offset, overflow_mask, NULL); + } + } +} + + +static void emit_load_mem(struct lp_build_nir_context *bld_base, + unsigned nc, + unsigned bit_size, + LLVMValueRef index, + LLVMValueRef offset, + LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS]) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + LLVMValueRef ssbo_ptr = NULL; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + struct lp_build_context *uint64_bld = &bld_base->uint64_bld; + LLVMValueRef ssbo_limit = NULL; + + if (index) { + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, bit_size == 64 ? 3 : 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + + ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + } else + ssbo_ptr = bld->shared_ptr; + + offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, bit_size == 64 ? 3 : 2), ""); + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c)); + LLVMValueRef exec_mask = mask_vec(bld_base); + + if (ssbo_limit) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } + + LLVMValueRef result = lp_build_alloca(gallivm, bit_size == 64 ? uint64_bld->vec_type : uint_bld->vec_type, ""); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + struct lp_build_if_state ifthen; + LLVMValueRef cond, temp_res; + + loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index, + loop_state.counter, ""); + + cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + + lp_build_if(&ifthen, gallivm, cond); + LLVMValueRef scalar; + if (bit_size == 64) { + LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(uint64_bld->elem_type, 0), ""); + scalar = lp_build_pointer_get(builder, ssbo_ptr2, loop_index); + } else + scalar = lp_build_pointer_get(builder, ssbo_ptr, loop_index); + + temp_res = LLVMBuildLoad(builder, result, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, result); + lp_build_else(&ifthen); + temp_res = LLVMBuildLoad(builder, result, ""); + LLVMValueRef zero; + if (bit_size == 64) + zero = LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 0, 0); + else + zero = lp_build_const_int32(gallivm, 0); + temp_res = LLVMBuildInsertElement(builder, temp_res, zero, loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, result); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + outval[c] = LLVMBuildLoad(gallivm->builder, result, ""); + } +} + +static void emit_store_mem(struct lp_build_nir_context *bld_base, + unsigned writemask, + unsigned nc, + unsigned bit_size, + LLVMValueRef index, + LLVMValueRef offset, + LLVMValueRef dst) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + LLVMValueRef ssbo_ptr; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef ssbo_limit = NULL; + + if (index) { + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, bit_size == 64 ? 3 : 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + } else + ssbo_ptr = bld->shared_ptr; + + offset = lp_build_shr_imm(uint_bld, offset, bit_size == 64 ? 3 : 2); + for (unsigned c = 0; c < nc; c++) { + if (!(writemask & (1u << c))) + continue; + LLVMValueRef loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c)); + LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, ""); + + LLVMValueRef exec_mask = mask_vec(bld_base); + if (ssbo_limit) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } + + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, + loop_state.counter, ""); + if (bit_size == 64) + value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, bld_base->uint64_bld.elem_type, ""); + else + value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, ""); + struct lp_build_if_state ifthen; + LLVMValueRef cond; + + loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index, + loop_state.counter, ""); + cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + if (bit_size == 64) { + LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(bld_base->uint64_bld.elem_type, 0), ""); + lp_build_pointer_set(builder, ssbo_ptr2, loop_index, value_ptr); + } else + lp_build_pointer_set(builder, ssbo_ptr, loop_index, value_ptr); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + } +} + +static void emit_atomic_mem(struct lp_build_nir_context *bld_base, + nir_intrinsic_op nir_op, + LLVMValueRef index, LLVMValueRef offset, + LLVMValueRef val, LLVMValueRef val2, + LLVMValueRef *result) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + LLVMValueRef ssbo_ptr; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMAtomicRMWBinOp op; + LLVMValueRef ssbo_limit = NULL; + + if (index) { + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); + } else + ssbo_ptr = bld->shared_ptr; + + switch (nir_op) { + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_ssbo_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_ssbo_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_ssbo_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_ssbo_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_ssbo_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_ssbo_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_ssbo_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_shared_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_shared_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + default: + break; + } + + offset = lp_build_shr_imm(uint_bld, offset, 2); + LLVMValueRef atom_res = lp_build_alloca(gallivm, + uint_bld->vec_type, ""); + + LLVMValueRef exec_mask = mask_vec(bld_base); + if (ssbo_limit) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, offset, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } + + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, + loop_state.counter, ""); + value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, ""); + + offset = LLVMBuildExtractElement(gallivm->builder, offset, + loop_state.counter, ""); + + LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, ssbo_ptr, + &offset, 1, ""); + + struct lp_build_if_state ifthen; + LLVMValueRef cond, temp_res; + LLVMValueRef scalar; + cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + if (nir_op == nir_intrinsic_ssbo_atomic_comp_swap || nir_op == nir_intrinsic_shared_atomic_comp_swap) { + LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, val2, + loop_state.counter, ""); + cas_src_ptr = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, uint_bld->elem_type, ""); + scalar = LLVMBuildAtomicCmpXchg(builder, scalar_ptr, value_ptr, + cas_src_ptr, + LLVMAtomicOrderingSequentiallyConsistent, + LLVMAtomicOrderingSequentiallyConsistent, + false); + scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, ""); + } else { + scalar = LLVMBuildAtomicRMW(builder, op, + scalar_ptr, value_ptr, + LLVMAtomicOrderingSequentiallyConsistent, + false); + } + temp_res = LLVMBuildLoad(builder, atom_res, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, atom_res); + lp_build_else(&ifthen); + temp_res = LLVMBuildLoad(builder, atom_res, ""); + temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, ""); + LLVMBuildStore(builder, temp_res, atom_res); + lp_build_endif(&ifthen); + + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + *result = LLVMBuildLoad(builder, atom_res, ""); +} + +static void emit_barrier(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state * gallivm = bld_base->base.gallivm; + + LLVMBasicBlockRef resume = lp_build_insert_new_block(gallivm, "resume"); + + lp_build_coro_suspend_switch(gallivm, bld->coro, resume, false); + LLVMPositionBuilderAtEnd(gallivm->builder, resume); +} + +static LLVMValueRef emit_get_buffer_size(struct lp_build_nir_context *bld_base, + LLVMValueRef index) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct lp_build_context *bld_broad = &bld_base->uint_bld; + LLVMValueRef size_ptr = lp_build_array_get(bld_base->base.gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, bld_broad->zero, "")); + return lp_build_broadcast_scalar(bld_broad, size_ptr); +} + +static void emit_image_op(struct lp_build_nir_context *bld_base, + struct lp_img_params *params) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + params->type = bld_base->base.type; + params->context_ptr = bld->context_ptr; + params->thread_data_ptr = bld->thread_data_ptr; + params->exec_mask = mask_vec(bld_base); + bld->image->emit_op(bld->image, + bld->bld_base.base.gallivm, + params); + +} + +static void emit_image_size(struct lp_build_nir_context *bld_base, + struct lp_sampler_size_query_params *params) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + + params->int_type = bld_base->int_bld.type; + params->context_ptr = bld->context_ptr; + + bld->image->emit_size_query(bld->image, + bld->bld_base.base.gallivm, + params); + +} + +static void init_var_slots(struct lp_build_nir_context *bld_base, + nir_variable *var, unsigned sc) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + unsigned slots = glsl_count_attribute_slots(var->type, false) * 4; + + for (unsigned comp = sc; comp < slots + sc; comp++) { + unsigned this_loc = var->data.driver_location + (comp / 4); + unsigned this_chan = comp % 4; + + if (!bld->outputs[this_loc][this_chan]) + bld->outputs[this_loc][this_chan] = lp_build_alloca(bld_base->base.gallivm, + bld_base->base.vec_type, "output"); + } +} + +static void emit_var_decl(struct lp_build_nir_context *bld_base, + nir_variable *var) +{ + unsigned sc = var->data.location_frac; + switch (var->data.mode) { + case nir_var_shader_out: { + if (bld_base->shader->info.stage == MESA_SHADER_FRAGMENT) { + if (var->data.location == FRAG_RESULT_STENCIL) + sc = 1; + else if (var->data.location == FRAG_RESULT_DEPTH) + sc = 2; + } + init_var_slots(bld_base, var, sc); + break; + } + default: + break; + } +} + +static void emit_tex(struct lp_build_nir_context *bld_base, + struct lp_sampler_params *params) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + + params->type = bld_base->base.type; + params->context_ptr = bld->context_ptr; + params->thread_data_ptr = bld->thread_data_ptr; + + bld->sampler->emit_tex_sample(bld->sampler, + bld->bld_base.base.gallivm, + params); +} + +static void emit_tex_size(struct lp_build_nir_context *bld_base, + struct lp_sampler_size_query_params *params) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + + params->int_type = bld_base->int_bld.type; + params->context_ptr = bld->context_ptr; + + bld->sampler->emit_size_query(bld->sampler, + bld->bld_base.base.gallivm, + params); +} + +static void emit_sysval_intrin(struct lp_build_nir_context *bld_base, + nir_intrinsic_instr *instr, + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + switch (instr->intrinsic) { + case nir_intrinsic_load_instance_id: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.instance_id); + break; + case nir_intrinsic_load_base_instance: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance); + break; + case nir_intrinsic_load_base_vertex: + result[0] = bld->system_values.basevertex; + break; + case nir_intrinsic_load_vertex_id: + result[0] = bld->system_values.vertex_id; + break; + case nir_intrinsic_load_primitive_id: + result[0] = bld->system_values.prim_id; + break; + case nir_intrinsic_load_work_group_id: + for (unsigned i = 0; i < 3; i++) + result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_id, lp_build_const_int32(gallivm, i), "")); + break; + case nir_intrinsic_load_local_invocation_id: + for (unsigned i = 0; i < 3; i++) + result[i] = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, i, ""); + break; + case nir_intrinsic_load_num_work_groups: + for (unsigned i = 0; i < 3; i++) + result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.grid_size, lp_build_const_int32(gallivm, i), "")); + break; + case nir_intrinsic_load_invocation_id: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id); + break; + case nir_intrinsic_load_front_face: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing); + break; + case nir_intrinsic_load_draw_id: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.draw_id); + break; + default: + break; + case nir_intrinsic_load_local_group_size: + for (unsigned i = 0; i < 3; i++) + result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, i), "")); + break; + case nir_intrinsic_load_work_dim: + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.work_dim); + break; + } +} + +static void bgnloop(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_bgnloop(&bld->exec_mask, true); +} + +static void endloop(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask); +} + +static void if_cond(struct lp_build_nir_context *bld_base, LLVMValueRef cond) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_mask_cond_push(&bld->exec_mask, LLVMBuildBitCast(builder, cond, bld_base->base.int_vec_type, "")); +} + +static void else_stmt(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_mask_cond_invert(&bld->exec_mask); +} + +static void endif_stmt(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_mask_cond_pop(&bld->exec_mask); +} + +static void break_stmt(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + + lp_exec_break(&bld->exec_mask, NULL, false); +} + +static void continue_stmt(struct lp_build_nir_context *bld_base) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + lp_exec_continue(&bld->exec_mask); +} + +static void discard(struct lp_build_nir_context *bld_base, LLVMValueRef cond) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + LLVMValueRef mask; + + if (!cond) { + if (bld->exec_mask.has_mask) { + mask = LLVMBuildNot(builder, bld->exec_mask.exec_mask, "kilp"); + } else { + mask = LLVMConstNull(bld->bld_base.base.int_vec_type); + } + } else { + mask = LLVMBuildNot(builder, cond, ""); + if (bld->exec_mask.has_mask) { + LLVMValueRef invmask; + invmask = LLVMBuildNot(builder, bld->exec_mask.exec_mask, "kilp"); + mask = LLVMBuildOr(builder, mask, invmask, ""); + } + } + lp_build_mask_update(bld->mask, mask); +} + +static void +increment_vec_ptr_by_mask(struct lp_build_nir_context * bld_base, + LLVMValueRef ptr, + LLVMValueRef mask) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef current_vec = LLVMBuildLoad(builder, ptr, ""); + + current_vec = LLVMBuildSub(builder, current_vec, mask, ""); + + LLVMBuildStore(builder, current_vec, ptr); +} + +static void +clear_uint_vec_ptr_from_mask(struct lp_build_nir_context * bld_base, + LLVMValueRef ptr, + LLVMValueRef mask) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef current_vec = LLVMBuildLoad(builder, ptr, ""); + + current_vec = lp_build_select(&bld_base->uint_bld, + mask, + bld_base->uint_bld.zero, + current_vec); + + LLVMBuildStore(builder, current_vec, ptr); +} + +static LLVMValueRef +clamp_mask_to_max_output_vertices(struct lp_build_nir_soa_context * bld, + LLVMValueRef current_mask_vec, + LLVMValueRef total_emitted_vertices_vec) +{ + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct lp_build_context *int_bld = &bld->bld_base.int_bld; + LLVMValueRef max_mask = lp_build_cmp(int_bld, PIPE_FUNC_LESS, + total_emitted_vertices_vec, + bld->max_output_vertices_vec); + + return LLVMBuildAnd(builder, current_mask_vec, max_mask, ""); +} + +static void emit_vertex(struct lp_build_nir_context *bld_base, uint32_t stream_id) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + + assert(bld->gs_iface->emit_vertex); + LLVMValueRef total_emitted_vertices_vec = + LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, ""); + LLVMValueRef mask = mask_vec(bld_base); + mask = clamp_mask_to_max_output_vertices(bld, mask, + total_emitted_vertices_vec); + bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base, + bld->outputs, + total_emitted_vertices_vec, + lp_build_const_int_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, stream_id)); + + increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr, + mask); + increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr, + mask); +} + +static void +end_primitive_masked(struct lp_build_nir_context * bld_base, + LLVMValueRef mask) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef emitted_vertices_vec = + LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, ""); + LLVMValueRef emitted_prims_vec = + LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, ""); + LLVMValueRef total_emitted_vertices_vec = + LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, ""); + + LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, + PIPE_FUNC_NOTEQUAL, + emitted_vertices_vec, + uint_bld->zero); + mask = LLVMBuildAnd(builder, mask, emitted_mask, ""); + bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base, + total_emitted_vertices_vec, + emitted_vertices_vec, emitted_prims_vec, mask_vec(bld_base)); + increment_vec_ptr_by_mask(bld_base, bld->emitted_prims_vec_ptr, + mask); + clear_uint_vec_ptr_from_mask(bld_base, bld->emitted_vertices_vec_ptr, + mask); +} + +static void end_primitive(struct lp_build_nir_context *bld_base, uint32_t stream_id) +{ + struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; + + assert(bld->gs_iface->end_primitive); + + LLVMValueRef mask = mask_vec(bld_base); + end_primitive_masked(bld_base, mask); +} + +static void +emit_prologue(struct lp_build_nir_soa_context *bld) +{ + struct gallivm_state * gallivm = bld->bld_base.base.gallivm; + if (bld->indirects & nir_var_shader_in && !bld->gs_iface) { + uint32_t num_inputs = util_bitcount64(bld->bld_base.shader->info.inputs_read); + unsigned index, chan; + LLVMTypeRef vec_type = bld->bld_base.base.vec_type; + LLVMValueRef array_size = lp_build_const_int32(gallivm, num_inputs * 4); + bld->inputs_array = lp_build_array_alloca(gallivm, + vec_type, array_size, + "input_array"); + + for (index = 0; index < num_inputs; ++index) { + for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + LLVMValueRef lindex = + lp_build_const_int32(gallivm, index * 4 + chan); + LLVMValueRef input_ptr = + LLVMBuildGEP(gallivm->builder, bld->inputs_array, + &lindex, 1, ""); + LLVMValueRef value = bld->inputs[index][chan]; + if (value) + LLVMBuildStore(gallivm->builder, value, input_ptr); + } + } + } +} + +static void emit_vote(struct lp_build_nir_context *bld_base, LLVMValueRef src, nir_intrinsic_instr *instr, LLVMValueRef result[4]) +{ + struct gallivm_state * gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + + LLVMValueRef exec_mask = mask_vec(bld_base); + struct lp_build_loop_state loop_state; + + LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, ""); + + LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, ""); + LLVMValueRef init_val; + if (instr->intrinsic == nir_intrinsic_vote_ieq) { + /* for equal we unfortunately have to loop and find the first valid one. */ + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, ""); + + struct lp_build_if_state ifthen; + lp_build_if(&ifthen, gallivm, if_cond); + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, src, + loop_state.counter, ""); + LLVMBuildStore(builder, value_ptr, res_store); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length), + NULL, LLVMIntUGE); + lp_build_print_value(gallivm, "init_val is ", LLVMBuildLoad(builder, res_store, "")); + init_val = LLVMBuildLoad(builder, res_store, ""); + } else { + LLVMBuildStore(builder, lp_build_const_int32(gallivm, instr->intrinsic == nir_intrinsic_vote_any ? 0 : -1), res_store); + } + + LLVMValueRef res; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, src, + loop_state.counter, ""); + struct lp_build_if_state ifthen; + LLVMValueRef if_cond; + if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, ""); + + lp_build_if(&ifthen, gallivm, if_cond); + res = LLVMBuildLoad(builder, res_store, ""); + + if (instr->intrinsic == nir_intrinsic_vote_ieq) { + LLVMValueRef tmp = LLVMBuildICmp(builder, LLVMIntEQ, init_val, value_ptr, ""); + tmp = LLVMBuildSExt(builder, tmp, bld_base->uint_bld.elem_type, ""); + res = LLVMBuildOr(builder, res, tmp, ""); + } else if (instr->intrinsic == nir_intrinsic_vote_any) + res = LLVMBuildOr(builder, res, value_ptr, ""); + else + res = LLVMBuildAnd(builder, res, value_ptr, ""); + LLVMBuildStore(builder, res, res_store); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length), + NULL, LLVMIntUGE); + result[0] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildLoad(builder, res_store, "")); +} + +void lp_build_nir_soa(struct gallivm_state *gallivm, + struct nir_shader *shader, + const struct lp_build_tgsi_params *params, + LLVMValueRef (*outputs)[4]) +{ + struct lp_build_nir_soa_context bld; + struct lp_type type = params->type; + struct lp_type res_type; + + assert(type.length <= LP_MAX_VECTOR_LENGTH); + memset(&res_type, 0, sizeof res_type); + res_type.width = type.width; + res_type.length = type.length; + res_type.sign = 1; + + /* Setup build context */ + memset(&bld, 0, sizeof bld); + lp_build_context_init(&bld.bld_base.base, gallivm, type); + lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type)); + lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type)); + lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type)); + lp_build_context_init(&bld.uint_elem_bld, gallivm, lp_elem_type(lp_uint_type(type))); + { + struct lp_type dbl_type; + dbl_type = type; + dbl_type.width *= 2; + lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type); + } + { + struct lp_type uint64_type; + uint64_type = lp_uint_type(type); + uint64_type.width *= 2; + lp_build_context_init(&bld.bld_base.uint64_bld, gallivm, uint64_type); + } + { + struct lp_type int64_type; + int64_type = lp_int_type(type); + int64_type.width *= 2; + lp_build_context_init(&bld.bld_base.int64_bld, gallivm, int64_type); + } + { + struct lp_type uint16_type; + uint16_type = lp_uint_type(type); + uint16_type.width /= 2; + lp_build_context_init(&bld.bld_base.uint16_bld, gallivm, uint16_type); + } + { + struct lp_type int16_type; + int16_type = lp_int_type(type); + int16_type.width /= 2; + lp_build_context_init(&bld.bld_base.int16_bld, gallivm, int16_type); + } + { + struct lp_type uint8_type; + uint8_type = lp_uint_type(type); + uint8_type.width /= 4; + lp_build_context_init(&bld.bld_base.uint8_bld, gallivm, uint8_type); + } + { + struct lp_type int8_type; + int8_type = lp_int_type(type); + int8_type.width /= 4; + lp_build_context_init(&bld.bld_base.int8_bld, gallivm, int8_type); + } + bld.bld_base.load_var = emit_load_var; + bld.bld_base.store_var = emit_store_var; + bld.bld_base.load_reg = emit_load_reg; + bld.bld_base.store_reg = emit_store_reg; + bld.bld_base.emit_var_decl = emit_var_decl; + bld.bld_base.load_ubo = emit_load_ubo; + bld.bld_base.load_kernel_arg = emit_load_kernel_arg; + bld.bld_base.load_global = emit_load_global; + bld.bld_base.store_global = emit_store_global; + bld.bld_base.atomic_global = emit_atomic_global; + bld.bld_base.tex = emit_tex; + bld.bld_base.tex_size = emit_tex_size; + bld.bld_base.bgnloop = bgnloop; + bld.bld_base.endloop = endloop; + bld.bld_base.if_cond = if_cond; + bld.bld_base.else_stmt = else_stmt; + bld.bld_base.endif_stmt = endif_stmt; + bld.bld_base.break_stmt = break_stmt; + bld.bld_base.continue_stmt = continue_stmt; + bld.bld_base.sysval_intrin = emit_sysval_intrin; + bld.bld_base.discard = discard; + bld.bld_base.emit_vertex = emit_vertex; + bld.bld_base.end_primitive = end_primitive; + bld.bld_base.load_mem = emit_load_mem; + bld.bld_base.store_mem = emit_store_mem; + bld.bld_base.get_buffer_size = emit_get_buffer_size; + bld.bld_base.atomic_mem = emit_atomic_mem; + bld.bld_base.barrier = emit_barrier; + bld.bld_base.image_op = emit_image_op; + bld.bld_base.image_size = emit_image_size; + bld.bld_base.vote = emit_vote; + + bld.mask = params->mask; + bld.inputs = params->inputs; + bld.outputs = outputs; + bld.consts_ptr = params->consts_ptr; + bld.const_sizes_ptr = params->const_sizes_ptr; + bld.ssbo_ptr = params->ssbo_ptr; + bld.ssbo_sizes_ptr = params->ssbo_sizes_ptr; + bld.sampler = params->sampler; +// bld.bld_base.info = params->info; + + bld.context_ptr = params->context_ptr; + bld.thread_data_ptr = params->thread_data_ptr; + bld.image = params->image; + bld.shared_ptr = params->shared_ptr; + bld.coro = params->coro; + bld.kernel_args_ptr = params->kernel_args; + bld.indirects = 0; + if (params->info->indirect_files & (1 << TGSI_FILE_INPUT)) + bld.indirects |= nir_var_shader_in; + + bld.gs_iface = params->gs_iface; + if (bld.gs_iface) { + struct lp_build_context *uint_bld = &bld.bld_base.uint_bld; + + bld.max_output_vertices_vec = lp_build_const_int_vec(gallivm, bld.bld_base.int_bld.type, + shader->info.gs.vertices_out); + bld.emitted_prims_vec_ptr = + lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_prims_ptr"); + bld.emitted_vertices_vec_ptr = + lp_build_alloca(gallivm, uint_bld->vec_type, "emitted_vertices_ptr"); + bld.total_emitted_vertices_vec_ptr = + lp_build_alloca(gallivm, uint_bld->vec_type, "total_emitted_vertices_ptr"); + } + lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld); + + bld.system_values = *params->system_values; + + bld.bld_base.shader = shader; + + emit_prologue(&bld); + lp_build_nir_llvm(&bld.bld_base, shader); + + if (bld.gs_iface) { + LLVMBuilderRef builder = bld.bld_base.base.gallivm->builder; + LLVMValueRef total_emitted_vertices_vec; + LLVMValueRef emitted_prims_vec; + end_primitive_masked(&bld.bld_base, lp_build_mask_value(bld.mask)); + total_emitted_vertices_vec = + LLVMBuildLoad(builder, bld.total_emitted_vertices_vec_ptr, ""); + emitted_prims_vec = + LLVMBuildLoad(builder, bld.emitted_prims_vec_ptr, ""); + + bld.gs_iface->gs_epilogue(bld.gs_iface, + total_emitted_vertices_vec, + emitted_prims_vec); + } + lp_exec_mask_fini(&bld.exec_mask); +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_pack.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_pack.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_pack.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_pack.c 2020-06-12 01:21:16.000000000 +0000 @@ -166,7 +166,7 @@ assert(n <= LP_MAX_VECTOR_LENGTH); for(i = 0; i < n; ++i) -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN elems[i] = lp_build_const_int32(gallivm, 2*i); #else elems[i] = lp_build_const_int32(gallivm, 2*i+1); @@ -429,7 +429,7 @@ msb = lp_build_zero(gallivm, src_type); /* Interleave bits */ -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0); *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1); @@ -483,7 +483,7 @@ msb = lp_build_zero(gallivm, src_type); /* Interleave bits */ -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) { *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0); *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1); @@ -606,7 +606,7 @@ } else { intrinsic = "llvm.ppc.altivec.vpkuwus"; } -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN swap_intrinsic_operands = TRUE; #endif } @@ -617,7 +617,7 @@ intrinsic = "llvm.x86.sse2.packsswb.128"; } else if (util_cpu_caps.has_altivec) { intrinsic = "llvm.ppc.altivec.vpkshss"; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN swap_intrinsic_operands = TRUE; #endif } @@ -626,7 +626,7 @@ intrinsic = "llvm.x86.sse2.packuswb.128"; } else if (util_cpu_caps.has_altivec) { intrinsic = "llvm.ppc.altivec.vpkshus"; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN swap_intrinsic_operands = TRUE; #endif } diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_printf.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_printf.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_printf.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_printf.h 2020-06-12 01:21:16.000000000 +0000 @@ -28,6 +28,9 @@ #ifndef LP_BLD_PRINTF_H #define LP_BLD_PRINTF_H +#ifdef __cplusplus +extern "C" { +#endif #include "pipe/p_compiler.h" #include "lp_bld.h" @@ -43,5 +46,9 @@ const char *msg, LLVMValueRef value); +#ifdef __cplusplus +} +#endif + #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c 2020-06-12 01:21:16.000000000 +0000 @@ -39,7 +39,7 @@ #include "util/u_dump.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" @@ -602,7 +602,7 @@ r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, ""); for (j = 0; j < u8n.type.length; j += 4) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned subindex = 0; #else unsigned subindex = 3; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,7 +34,7 @@ #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" #include "lp_bld_arit.h" @@ -125,6 +125,41 @@ */ } +/** + * Initialize lp_sampler_static_texture_state object with the gallium + * texture/sampler_view state (this contains the parts which are + * considered static). + */ +void +lp_sampler_static_texture_state_image(struct lp_static_texture_state *state, + const struct pipe_image_view *view) +{ + const struct pipe_resource *resource; + + memset(state, 0, sizeof *state); + + if (!view || !view->resource) + return; + + resource = view->resource; + + state->format = view->format; + state->swizzle_r = PIPE_SWIZZLE_X; + state->swizzle_g = PIPE_SWIZZLE_Y; + state->swizzle_b = PIPE_SWIZZLE_Z; + state->swizzle_a = PIPE_SWIZZLE_W; + + state->target = view->resource->target; + state->pot_width = util_is_power_of_two_or_zero(resource->width0); + state->pot_height = util_is_power_of_two_or_zero(resource->height0); + state->pot_depth = util_is_power_of_two_or_zero(resource->depth0); + state->level_zero_only = 0; + + /* + * the layer / element / level parameters are all either dynamic + * state or handled transparently wrt execution. + */ +} /** * Initialize lp_sampler_static_sampler_state object with the gallium sampler diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample.h 2020-06-12 01:21:16.000000000 +0000 @@ -49,6 +49,7 @@ struct pipe_resource; struct pipe_sampler_view; struct pipe_sampler_state; +struct pipe_image_view; struct util_format_description; struct lp_type; struct lp_build_context; @@ -95,6 +96,8 @@ #define LP_SAMPLER_LOD_CONTROL_MASK (3 << 4) #define LP_SAMPLER_LOD_PROPERTY_SHIFT 6 #define LP_SAMPLER_LOD_PROPERTY_MASK (3 << 6) +#define LP_SAMPLER_GATHER_COMP_SHIFT 8 +#define LP_SAMPLER_GATHER_COMP_MASK (3 << 8) struct lp_sampler_params { @@ -122,6 +125,27 @@ LLVMValueRef explicit_lod; LLVMValueRef *sizes_out; }; + +#define LP_IMG_LOAD 0 +#define LP_IMG_STORE 1 +#define LP_IMG_ATOMIC 2 +#define LP_IMG_ATOMIC_CAS 3 + +struct lp_img_params +{ + struct lp_type type; + unsigned image_index; + unsigned img_op; + unsigned target; + LLVMAtomicRMWBinOp op; + LLVMValueRef exec_mask; + LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; + const LLVMValueRef *coords; + LLVMValueRef indata[4]; + LLVMValueRef indata2[4]; + LLVMValueRef *outdata; +}; /** * Texture static state. * @@ -323,6 +347,7 @@ /** number of lod values (valid are 1, length/4, length) */ unsigned num_lods; + unsigned gather_comp; boolean no_quad_lod; boolean no_brilinear; boolean no_rho_approx; @@ -489,6 +514,9 @@ lp_sampler_static_texture_state(struct lp_static_texture_state *state, const struct pipe_sampler_view *view); +void +lp_sampler_static_texture_state_image(struct lp_static_texture_state *state, + const struct pipe_image_view *view); void lp_build_lod_selector(struct lp_build_sample_context *bld, @@ -639,6 +667,12 @@ LLVMValueRef level, boolean lod_scalar); +void +lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state, + struct lp_sampler_dynamic_state *dynamic_state, + struct gallivm_state *gallivm, + const struct lp_img_params *params); + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 2020-06-12 01:21:16.000000000 +0000 @@ -40,7 +40,7 @@ #include "util/u_dump.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_cpu_detect.h" #include "util/format_rgb9e5.h" #include "lp_bld_debug.h" @@ -61,6 +61,7 @@ #include "lp_bld_quad.h" #include "lp_bld_pack.h" #include "lp_bld_intr.h" +#include "lp_bld_misc.h" /** @@ -1032,11 +1033,27 @@ boolean seamless_cube_filter, accurate_cube_corners; unsigned chan_swiz = bld->static_texture_state->swizzle_r; + if (is_gather) { + switch (bld->gather_comp) { + case 0: chan_swiz = bld->static_texture_state->swizzle_r; break; + case 1: chan_swiz = bld->static_texture_state->swizzle_g; break; + case 2: chan_swiz = bld->static_texture_state->swizzle_b; break; + case 3: chan_swiz = bld->static_texture_state->swizzle_a; break; + default: + break; + } + } + seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE || bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) && bld->static_sampler_state->seamless_cube_map; - accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter; + /* + * Disable accurate cube corners for integer textures, which should only + * get here in the gather path. + */ + accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter && + !util_format_is_pure_integer(bld->static_texture_state->format); lp_build_extract_image_sizes(bld, &bld->int_size_bld, @@ -2972,7 +2989,8 @@ bld.num_lods = num_quads; } - + if (op_is_gather) + bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT; bld.lodf_type = type; /* we want native vector size to be able to use our intrinsics */ if (bld.num_lods != type.length) { @@ -3947,3 +3965,193 @@ num_levels); } } + +static void +lp_build_do_atomic_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef exec_mask, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef out_of_bounds, + unsigned img_op, + LLVMAtomicRMWBinOp op, + const LLVMValueRef rgba_in[4], + const LLVMValueRef rgba2_in[4], + LLVMValueRef atomic_result[4]) +{ + enum pipe_format format = format_desc->format; + + if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT) { + atomic_result[0] = lp_build_zero(gallivm, type); + return; + } + + LLVMValueRef atom_res = lp_build_alloca(gallivm, + LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), ""); + + offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, ""); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + struct lp_build_if_state ifthen; + LLVMValueRef cond; + LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0]; + + LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask"); + assert(exec_mask); + + cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), ""); + cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, ""); + LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, ""); + cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), ""); + data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), ""); + + if (img_op == LP_IMG_ATOMIC_CAS) { + LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, ""); + LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), ""); + data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data, + cas_src, + LLVMAtomicOrderingSequentiallyConsistent, + LLVMAtomicOrderingSequentiallyConsistent, + false); + data = LLVMBuildExtractValue(gallivm->builder, data, 0, ""); + } else { + data = LLVMBuildAtomicRMW(gallivm->builder, op, + cast_base_ptr, data, + LLVMAtomicOrderingSequentiallyConsistent, + false); + } + + LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, ""); + temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, ""); + LLVMBuildStore(gallivm->builder, temp_res, atom_res); + + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length), + NULL, LLVMIntUGE); + atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, ""); +} + +void +lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state, + struct lp_sampler_dynamic_state *dynamic_state, + struct gallivm_state *gallivm, + const struct lp_img_params *params) +{ + unsigned target = params->target; + unsigned dims = texture_dims(target); + /** regular scalar int type */ + struct lp_type int_type, int_coord_type; + struct lp_build_context int_bld, int_coord_bld; + const struct util_format_description *format_desc = util_format_description(static_texture_state->format); + LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2]; + LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL; + int_type = lp_type_int(32); + int_coord_type = lp_int_type(params->type); + lp_build_context_init(&int_bld, gallivm, int_type); + lp_build_context_init(&int_coord_bld, gallivm, int_coord_type); + + LLVMValueRef offset, i, j; + + LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm, + params->context_ptr, params->image_index); + LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm, + params->context_ptr, params->image_index); + LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm, + params->context_ptr, params->image_index); + LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm, + params->context_ptr, params->image_index); + LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm, + params->context_ptr, params->image_index); + LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm, + params->context_ptr, params->image_index); + boolean layer_coord = has_layer_coord(target); + + width = lp_build_broadcast_scalar(&int_coord_bld, width); + if (dims >= 2) { + height = lp_build_broadcast_scalar(&int_coord_bld, height); + row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride); + } + if (dims >= 3 || layer_coord) { + depth = lp_build_broadcast_scalar(&int_coord_bld, depth); + img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride); + } + + LLVMValueRef out_of_bounds = int_coord_bld.zero; + LLVMValueRef out1; + out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width); + out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1); + + if (dims >= 2) { + out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height); + out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1); + } + if (dims >= 3) { + out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth); + out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1); + } + lp_build_sample_offset(&int_coord_bld, + format_desc, + x, y, z, row_stride_vec, img_stride_vec, + &offset, &i, &j); + + if (params->img_op == LP_IMG_LOAD) { + struct lp_type texel_type = params->type; + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + format_desc->channel[0].pure_integer) { + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) { + texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length); + } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) { + texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length); + } + } + + if (static_texture_state->format == PIPE_FORMAT_NONE) { + /* + * If there's nothing bound, format is NONE, and we must return + * all zero as mandated by d3d10 in this case. + */ + unsigned chan; + LLVMValueRef zero = lp_build_zero(gallivm, params->type); + for (chan = 0; chan < 4; chan++) { + params->outdata[chan] = zero; + } + return; + } + + offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds); + struct lp_build_context texel_bld; + lp_build_context_init(&texel_bld, gallivm, texel_type); + lp_build_fetch_rgba_soa(gallivm, + format_desc, + texel_type, TRUE, + base_ptr, offset, + i, j, + NULL, + params->outdata); + + for (unsigned chan = 0; chan < 4; chan++) { + params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds, + texel_bld.zero, params->outdata[chan]); + } + } else if (params->img_op == LP_IMG_STORE) { + if (static_texture_state->format == PIPE_FORMAT_NONE) + return; + lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds, + params->indata); + } else { + if (static_texture_state->format == PIPE_FORMAT_NONE) { + /* + * For atomic operation just return 0 in the unbound case to avoid a crash. + */ + LLVMValueRef zero = lp_build_zero(gallivm, params->type); + params->outdata[0] = zero; + return; + } + lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds, + params->img_op, params->op, params->indata, params->indata2, params->outdata); + } +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c 2020-06-12 01:21:16.000000000 +0000 @@ -222,7 +222,7 @@ * XX XX XX XX if shift right (shift == -1) * */ -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN shift = channel == 0 ? 1 : -1; #else shift = channel == 0 ? -1 : 1; @@ -293,7 +293,7 @@ int shift = shifts[channel][i]; /* See endianness diagram above */ -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN shift = -shift; #endif @@ -519,7 +519,7 @@ for (chan = 0; chan < 4; ++chan) { if (swizzles[chan] < 4) { /* We need to move channel swizzles[chan] into channel chan */ -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN if (swizzles[chan] - chan == -shift) { mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width); } @@ -652,7 +652,7 @@ struct lp_type double_type_lp = single_type_lp; LLVMTypeRef single_type; LLVMTypeRef double_type; - LLVMValueRef t0, t1, t2, t3; + LLVMValueRef t0 = NULL, t1 = NULL, t2 = NULL, t3 = NULL; double_type_lp.length >>= 1; double_type_lp.width <<= 1; @@ -660,17 +660,45 @@ double_type = lp_build_vec_type(gallivm, double_type_lp); single_type = lp_build_vec_type(gallivm, single_type_lp); + LLVMValueRef double_type_zero = LLVMConstNull(double_type); /* Interleave x, y, z, w -> xy and zw */ - t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0); - t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0); - t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1); - t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1); - - /* Cast to double width type for second interleave */ - t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0"); - t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1"); - t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2"); - t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3"); + if (src[0] || src[1]) { + LLVMValueRef src0 = src[0]; + LLVMValueRef src1 = src[1]; + if (!src0) + src0 = LLVMConstNull(single_type); + if (!src1) + src1 = LLVMConstNull(single_type); + t0 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 0); + t2 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 1); + + /* Cast to double width type for second interleave */ + t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0"); + t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2"); + } + if (src[2] || src[3]) { + LLVMValueRef src2 = src[2]; + LLVMValueRef src3 = src[3]; + if (!src2) + src2 = LLVMConstNull(single_type); + if (!src3) + src3 = LLVMConstNull(single_type); + t1 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 0); + t3 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 1); + + /* Cast to double width type for second interleave */ + t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1"); + t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3"); + } + + if (!t0) + t0 = double_type_zero; + if (!t1) + t1 = double_type_zero; + if (!t2) + t2 = double_type_zero; + if (!t3) + t3 = double_type_zero; /* Interleave xy, zw -> xyzw */ dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0); diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 2020-06-12 01:21:16.000000000 +0000 @@ -2398,6 +2398,134 @@ emit_data->output[emit_data->chan] = lp_build_shr(uint_bld, emit_data->args[0], masked_count); } +static void bfi_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + /* + * def bfi(base, insert, offset, bits): + * if offset < 0 or bits < 0 or offset + bits > 32: + * return undefined + * # << defined such that mask == ~0 when bits == 32, offset == 0 + * mask = ((1 << bits) - 1) << offset + * return ((insert << offset) & mask) | (base & ~mask) + */ + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef one_shl_bits_dec_one = lp_build_sub( + uint_bld, lp_build_shl(uint_bld, uint_bld->one, emit_data->args[3]), + uint_bld->one); + LLVMValueRef mask = + lp_build_shl(uint_bld, one_shl_bits_dec_one, emit_data->args[2]); + LLVMValueRef insert_shl_offset = + lp_build_shl(uint_bld, emit_data->args[1], emit_data->args[2]); + LLVMValueRef insert_shl_offset_and_mask = + lp_build_and(uint_bld, insert_shl_offset, mask); + LLVMValueRef base_and_not_mask = + lp_build_and(uint_bld, emit_data->args[0], lp_build_not(uint_bld, mask)); + + emit_data->output[emit_data->chan] = + lp_build_or(uint_bld, insert_shl_offset_and_mask, base_and_not_mask); +} + +static void lsb_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct lp_build_context *uint_bld = &bld_base->int_bld; + + LLVMValueRef result = lp_build_cttz(uint_bld, emit_data->args[0]); + LLVMValueRef cond = + lp_build_cmp(uint_bld, PIPE_FUNC_LESS, result, + lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 32)); + emit_data->output[emit_data->chan] = lp_build_select( + uint_bld, cond, result, + lp_build_const_vec(uint_bld->gallivm, uint_bld->type, -1)); +} + +static void umsb_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct lp_build_context *uint_bld = &bld_base->int_bld; + emit_data->output[emit_data->chan] = lp_build_sub( + uint_bld, lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 31), + lp_build_ctlz(uint_bld, emit_data->args[0])); +} + +static void imsb_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct lp_build_context *uint_bld = &bld_base->int_bld; + + LLVMValueRef cond = + lp_build_cmp(uint_bld, PIPE_FUNC_LESS, emit_data->args[0], + lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 0)); + emit_data->args[0] = lp_build_select( + uint_bld, cond, lp_build_not(uint_bld, emit_data->args[0]), + emit_data->args[0]); + umsb_emit_cpu(action, bld_base, emit_data); +} + +static void popc_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct lp_build_context *uint_bld = &bld_base->int_bld; + emit_data->output[emit_data->chan] = + lp_build_popcount(uint_bld, emit_data->args[0]); +} + +static void ibfe_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + /* def ibfe(value, offset, bits): + * if offset < 0 or bits < 0 or offset + bits > 32: + * return undefined + * if bits == 0: return 0 + * # Note: >> sign-extends + * return (value << (32 - offset - bits)) >> (32 - bits) + */ + struct lp_build_context *uint_bld = &bld_base->int_bld; + + LLVMValueRef r_32_sub_bits = lp_build_sub( + uint_bld, lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 32), + emit_data->args[2]); + LLVMValueRef temp1 = + lp_build_sub(uint_bld, r_32_sub_bits, emit_data->args[1]); + LLVMValueRef temp2 = lp_build_shl(uint_bld, emit_data->args[0], temp1); + LLVMValueRef cond = + lp_build_cmp(uint_bld, PIPE_FUNC_EQUAL, emit_data->args[2], + lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 0)); + emit_data->output[emit_data->chan] = lp_build_select( + uint_bld, cond, lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 0), + lp_build_shr(uint_bld, temp2, r_32_sub_bits)); +} + +static void ubfe_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + /* def ubfe(value, offset, bits): + * if offset < 0 or bits < 0 or offset + bits > 32: + * return undefined + * if bits == 0: return 0 + * # Note: >> does not sign-extend + * return (value << (32 - offset - bits)) >> (32 - bits) + */ + struct lp_build_context *uint_bld = &bld_base->uint_bld; + + LLVMValueRef r_32_sub_bits = lp_build_sub( + uint_bld, lp_build_const_vec(uint_bld->gallivm, uint_bld->type, 32), + emit_data->args[2]); + LLVMValueRef temp1 = + lp_build_sub(uint_bld, r_32_sub_bits, emit_data->args[1]); + LLVMValueRef temp2 = lp_build_shl(uint_bld, emit_data->args[0], temp1); + emit_data->output[emit_data->chan] = + lp_build_shr(uint_bld, temp2, r_32_sub_bits); +} + +static void brev_emit_cpu(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct lp_build_context *uint_bld = &bld_base->uint_bld; + emit_data->output[emit_data->chan] = + lp_build_bitfield_reverse(uint_bld, emit_data->args[0]); +} void lp_set_default_actions_cpu( @@ -2507,4 +2635,14 @@ bld_base->op_actions[TGSI_OPCODE_U64SHL].emit = u64shl_emit_cpu; bld_base->op_actions[TGSI_OPCODE_I64SHR].emit = i64shr_emit_cpu; bld_base->op_actions[TGSI_OPCODE_U64SHR].emit = u64shr_emit_cpu; + + bld_base->op_actions[TGSI_OPCODE_BFI].emit = bfi_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_POPC].emit = popc_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_LSB].emit = lsb_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_IMSB].emit = imsb_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_UMSB].emit = umsb_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_IBFE].emit = ibfe_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_UBFE].emit = ubfe_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_BREV].emit = brev_emit_cpu; + } diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 2020-06-12 01:21:16.000000000 +0000 @@ -549,6 +549,10 @@ } } + if (bld_base->emit_prologue_post_decl) { + bld_base->emit_prologue_post_decl(bld_base); + } + while (bld_base->pc != -1) { const struct tgsi_full_instruction *instr = bld_base->instructions + bld_base->pc; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 2020-06-12 01:21:16.000000000 +0000 @@ -41,6 +41,7 @@ #include "gallivm/lp_bld_tgsi_action.h" #include "gallivm/lp_bld_limits.h" #include "gallivm/lp_bld_sample.h" +#include "gallivm/lp_bld_ir_common.h" #include "lp_bld_type.h" #include "pipe/p_compiler.h" #include "pipe/p_state.h" @@ -60,14 +61,14 @@ struct tgsi_full_immediate; struct tgsi_full_instruction; struct tgsi_full_src_register; +struct tgsi_full_dst_register; struct tgsi_opcode_info; struct tgsi_token; struct tgsi_shader_info; struct lp_build_mask_context; struct gallivm_state; struct lp_derivatives; -struct lp_build_tgsi_gs_iface; - +struct lp_build_gs_iface; enum lp_build_tex_modifier { LP_BLD_TEX_MODIFIER_NONE = 0, @@ -165,11 +166,23 @@ */ struct lp_bld_tgsi_system_values { LLVMValueRef instance_id; + LLVMValueRef base_instance; LLVMValueRef vertex_id; LLVMValueRef vertex_id_nobase; LLVMValueRef prim_id; LLVMValueRef basevertex; LLVMValueRef invocation_id; + LLVMValueRef draw_id; + LLVMValueRef thread_id; + LLVMValueRef block_id; + LLVMValueRef grid_size; + LLVMValueRef front_facing; + LLVMValueRef work_dim; + LLVMValueRef block_size; + LLVMValueRef tess_coord; + LLVMValueRef tess_outer; + LLVMValueRef tess_inner; + LLVMValueRef vertices_in; }; @@ -210,6 +223,23 @@ enum lp_build_tex_modifier modifier); }; +struct lp_img_params; + +struct lp_build_image_soa +{ + void + (*destroy)( struct lp_build_image_soa *image ); + + void + (*emit_op)(const struct lp_build_image_soa *image, + struct gallivm_state *gallivm, + const struct lp_img_params *params); + + void + (*emit_size_query)( const struct lp_build_image_soa *sampler, + struct gallivm_state *gallivm, + const struct lp_sampler_size_query_params *params); +}; void lp_build_tgsi_info(const struct tgsi_token *tokens, @@ -227,9 +257,15 @@ LLVMValueRef thread_data_ptr; const struct lp_build_sampler_soa *sampler; const struct tgsi_shader_info *info; - const struct lp_build_tgsi_gs_iface *gs_iface; + const struct lp_build_gs_iface *gs_iface; + const struct lp_build_tcs_iface *tcs_iface; + const struct lp_build_tes_iface *tes_iface; LLVMValueRef ssbo_ptr; LLVMValueRef ssbo_sizes_ptr; + const struct lp_build_image_soa *image; + LLVMValueRef shared_ptr; + const struct lp_build_coro_suspend_info *coro; + LLVMValueRef kernel_args; }; void @@ -238,7 +274,6 @@ const struct lp_build_tgsi_params *params, LLVMValueRef (*outputs)[4]); - void lp_build_tgsi_aos(struct gallivm_state *gallivm, const struct tgsi_token *tokens, @@ -251,67 +286,6 @@ const struct tgsi_shader_info *info); -enum lp_exec_mask_break_type { - LP_EXEC_MASK_BREAK_TYPE_LOOP, - LP_EXEC_MASK_BREAK_TYPE_SWITCH -}; - - -struct lp_exec_mask { - struct lp_build_context *bld; - - boolean has_mask; - boolean ret_in_main; - - LLVMTypeRef int_vec_type; - - LLVMValueRef exec_mask; - - LLVMValueRef ret_mask; - LLVMValueRef cond_mask; - LLVMValueRef switch_mask; /* current switch exec mask */ - LLVMValueRef cont_mask; - LLVMValueRef break_mask; - - struct function_ctx { - int pc; - LLVMValueRef ret_mask; - - LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING]; - int cond_stack_size; - - /* keep track if break belongs to switch or loop */ - enum lp_exec_mask_break_type break_type_stack[LP_MAX_TGSI_NESTING]; - enum lp_exec_mask_break_type break_type; - - struct { - LLVMValueRef switch_val; - LLVMValueRef switch_mask; - LLVMValueRef switch_mask_default; - boolean switch_in_default; - unsigned switch_pc; - } switch_stack[LP_MAX_TGSI_NESTING]; - int switch_stack_size; - LLVMValueRef switch_val; - LLVMValueRef switch_mask_default; /* reverse of switch mask used for default */ - boolean switch_in_default; /* if switch exec is currently in default */ - unsigned switch_pc; /* when used points to default or endswitch-1 */ - - LLVMValueRef loop_limiter; - LLVMBasicBlockRef loop_block; - LLVMValueRef break_var; - struct { - LLVMBasicBlockRef loop_block; - LLVMValueRef cont_mask; - LLVMValueRef break_mask; - LLVMValueRef break_var; - } loop_stack[LP_MAX_TGSI_NESTING]; - int loop_stack_size; - - } *function_stack; - int function_stack_size; -}; - struct lp_build_tgsi_inst_list { struct tgsi_full_instruction *instructions; @@ -335,6 +309,14 @@ enum tgsi_opcode_type, unsigned); +typedef void (*lp_build_emit_store_reg_fn)(struct lp_build_tgsi_context *, + enum tgsi_opcode_type, + const struct tgsi_full_dst_register *, + unsigned, + unsigned, + LLVMValueRef, + LLVMValueRef); + struct lp_build_tgsi_context { struct lp_build_context base; @@ -364,6 +346,7 @@ const struct tgsi_shader_info *info; lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT]; + lp_build_emit_store_reg_fn emit_store_reg_funcs[TGSI_FILE_COUNT]; LLVMValueRef (*emit_swizzle)(struct lp_build_tgsi_context *, LLVMValueRef, unsigned, unsigned, unsigned, unsigned); @@ -404,6 +387,12 @@ */ void (*emit_prologue)(struct lp_build_tgsi_context*); + /** This function allows the user to insert some instructions after + * declarations section, but before any other code. + * It is optional and does not need to be implemented. + */ + void (*emit_prologue_post_decl)(struct lp_build_tgsi_context*); + /** This function allows the user to insert some instructions at the end of * the program. This callback is intended to be used for emitting * instructions to handle the export for the output registers, but it can @@ -413,29 +402,82 @@ void (*emit_epilogue)(struct lp_build_tgsi_context*); }; -struct lp_build_tgsi_gs_iface +struct lp_build_gs_iface { - LLVMValueRef (*fetch_input)(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, + LLVMValueRef (*fetch_input)(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, boolean is_vindex_indirect, LLVMValueRef vertex_index, boolean is_aindex_indirect, LLVMValueRef attrib_index, LLVMValueRef swizzle_index); - void (*emit_vertex)(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, + void (*emit_vertex)(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec); - void (*end_primitive)(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, + LLVMValueRef emitted_vertices_vec, + LLVMValueRef stream_id); + void (*end_primitive)(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, + LLVMValueRef total_emitted_vertices_vec, LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec); - void (*gs_epilogue)(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, + LLVMValueRef emitted_prims_vec, + LLVMValueRef mask_vec); + void (*gs_epilogue)(const struct lp_build_gs_iface *gs_iface, LLVMValueRef total_emitted_vertices_vec, LLVMValueRef emitted_prims_vec); }; +struct lp_build_tcs_iface +{ + void (*emit_prologue)(struct lp_build_context * bld); + void (*emit_epilogue)(struct lp_build_context * bld); + void (*emit_barrier)(struct lp_build_context *bld_base); + + void (*emit_store_output)(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + unsigned name, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + LLVMValueRef value); + + LLVMValueRef (*emit_fetch_input)(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); + + LLVMValueRef (*emit_fetch_output)(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + uint32_t name); +}; + +struct lp_build_tes_iface +{ + LLVMValueRef (*fetch_vertex_input)(const struct lp_build_tes_iface *tes_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); + + LLVMValueRef (*fetch_patch_input)(const struct lp_build_tes_iface *tes_iface, + struct lp_build_context * bld, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); +}; + struct lp_build_tgsi_soa_context { struct lp_build_tgsi_context bld_base; @@ -443,7 +485,10 @@ /* Builder for scalar elements of shader's data type (float) */ struct lp_build_context elem_bld; - const struct lp_build_tgsi_gs_iface *gs_iface; + const struct lp_build_gs_iface *gs_iface; + const struct lp_build_tcs_iface *tcs_iface; + const struct lp_build_tes_iface *tes_iface; + LLVMValueRef emitted_prims_vec_ptr; LLVMValueRef total_emitted_vertices_vec_ptr; LLVMValueRef emitted_vertices_vec_ptr; @@ -463,7 +508,12 @@ LLVMValueRef ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; LLVMValueRef ssbo_sizes[LP_MAX_TGSI_SHADER_BUFFERS]; + LLVMValueRef shared_ptr; + + const struct lp_build_coro_suspend_info *coro; + const struct lp_build_sampler_soa *sampler; + const struct lp_build_image_soa *image; struct tgsi_declaration_sampler_view sv[PIPE_MAX_SHADER_SAMPLER_VIEWS]; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -309,6 +309,10 @@ continue; } else if (dst->File == TGSI_FILE_BUFFER) { continue; + } else if (dst->File == TGSI_FILE_IMAGE) { + continue; + } else if (dst->File == TGSI_FILE_MEMORY) { + continue; } else { assert(0); continue; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 2020-06-12 01:21:16.000000000 +0000 @@ -1,9 +1,9 @@ /************************************************************************** - * + * * Copyright 2009 VMware, Inc. * Copyright 2007-2008 VMware, Inc. * All Rights Reserved. - * + * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -11,11 +11,11 @@ * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. @@ -23,7 +23,7 @@ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * + * **************************************************************************/ /** @@ -60,6 +60,7 @@ #include "lp_bld_misc.h" #include "lp_bld_swizzle.h" #include "lp_bld_flow.h" +#include "lp_bld_coro.h" #include "lp_bld_quad.h" #include "lp_bld_tgsi.h" #include "lp_bld_limits.h" @@ -68,10 +69,6 @@ #include "lp_bld_sample.h" #include "lp_bld_struct.h" -/* SM 4.0 says that subroutines can nest 32 deep and - * we need one more for our main function */ -#define LP_MAX_NUM_FUNCS 33 - #define DUMP_GS_EMITS 0 /* @@ -104,10 +101,6 @@ lp_build_print_value(gallivm, buf, value); } -/* - * Return the context for the current function. - * (always 'main', if shader doesn't do any function calls) - */ static inline struct function_ctx * func_ctx(struct lp_exec_mask *mask) { @@ -117,24 +110,6 @@ } /* - * Returns true if we're in a loop. - * It's global, meaning that it returns true even if there's - * no loop inside the current function, but we were inside - * a loop inside another function, from which this one was called. - */ -static inline boolean -mask_has_loop(struct lp_exec_mask *mask) -{ - int i; - for (i = mask->function_stack_size - 1; i >= 0; --i) { - const struct function_ctx *ctx = &mask->function_stack[i]; - if (ctx->loop_stack_size > 0) - return TRUE; - } - return FALSE; -} - -/* * combine the execution mask if there is one with the current mask. */ static LLVMValueRef @@ -153,370 +128,14 @@ exec_mask->exec_mask, ""); } -/* - * Returns true if we're inside a switch statement. - * It's global, meaning that it returns true even if there's - * no switch in the current function, but we were inside - * a switch inside another function, from which this one was called. - */ -static inline boolean -mask_has_switch(struct lp_exec_mask *mask) -{ - int i; - for (i = mask->function_stack_size - 1; i >= 0; --i) { - const struct function_ctx *ctx = &mask->function_stack[i]; - if (ctx->switch_stack_size > 0) - return TRUE; - } - return FALSE; -} - -/* - * Returns true if we're inside a conditional. - * It's global, meaning that it returns true even if there's - * no conditional in the current function, but we were inside - * a conditional inside another function, from which this one was called. - */ -static inline boolean -mask_has_cond(struct lp_exec_mask *mask) -{ - int i; - for (i = mask->function_stack_size - 1; i >= 0; --i) { - const struct function_ctx *ctx = &mask->function_stack[i]; - if (ctx->cond_stack_size > 0) - return TRUE; - } - return FALSE; -} - - -/* - * Initialize a function context at the specified index. - */ -static void -lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx) -{ - LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context); - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = &mask->function_stack[function_idx]; - - ctx->cond_stack_size = 0; - ctx->loop_stack_size = 0; - ctx->switch_stack_size = 0; - - if (function_idx == 0) { - ctx->ret_mask = mask->ret_mask; - } - - ctx->loop_limiter = lp_build_alloca(mask->bld->gallivm, - int_type, "looplimiter"); - LLVMBuildStore( - builder, - LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false), - ctx->loop_limiter); -} - -static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) -{ - mask->bld = bld; - mask->has_mask = FALSE; - mask->ret_in_main = FALSE; - /* For the main function */ - mask->function_stack_size = 1; - - mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type); - mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = - mask->cond_mask = mask->switch_mask = - LLVMConstAllOnes(mask->int_vec_type); - - mask->function_stack = CALLOC(LP_MAX_NUM_FUNCS, - sizeof(mask->function_stack[0])); - lp_exec_mask_function_init(mask, 0); -} - -static void -lp_exec_mask_fini(struct lp_exec_mask *mask) -{ - FREE(mask->function_stack); -} - -static void lp_exec_mask_update(struct lp_exec_mask *mask) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - boolean has_loop_mask = mask_has_loop(mask); - boolean has_cond_mask = mask_has_cond(mask); - boolean has_switch_mask = mask_has_switch(mask); - boolean has_ret_mask = mask->function_stack_size > 1 || - mask->ret_in_main; - - if (has_loop_mask) { - /*for loops we need to update the entire mask at runtime */ - LLVMValueRef tmp; - assert(mask->break_mask); - tmp = LLVMBuildAnd(builder, - mask->cont_mask, - mask->break_mask, - "maskcb"); - mask->exec_mask = LLVMBuildAnd(builder, - mask->cond_mask, - tmp, - "maskfull"); - } else - mask->exec_mask = mask->cond_mask; - - if (has_switch_mask) { - mask->exec_mask = LLVMBuildAnd(builder, - mask->exec_mask, - mask->switch_mask, - "switchmask"); - } - - if (has_ret_mask) { - mask->exec_mask = LLVMBuildAnd(builder, - mask->exec_mask, - mask->ret_mask, - "callmask"); - } - - mask->has_mask = (has_cond_mask || - has_loop_mask || - has_switch_mask || - has_ret_mask); -} - -static void lp_exec_mask_cond_push(struct lp_exec_mask *mask, - LLVMValueRef val) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = func_ctx(mask); - - if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) { - ctx->cond_stack_size++; - return; - } - if (ctx->cond_stack_size == 0 && mask->function_stack_size == 1) { - assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type)); - } - ctx->cond_stack[ctx->cond_stack_size++] = mask->cond_mask; - assert(LLVMTypeOf(val) == mask->int_vec_type); - mask->cond_mask = LLVMBuildAnd(builder, - mask->cond_mask, - val, - ""); - lp_exec_mask_update(mask); -} - -static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = func_ctx(mask); - LLVMValueRef prev_mask; - LLVMValueRef inv_mask; - - assert(ctx->cond_stack_size); - if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) - return; - prev_mask = ctx->cond_stack[ctx->cond_stack_size - 1]; - if (ctx->cond_stack_size == 1 && mask->function_stack_size == 1) { - assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type)); - } - - inv_mask = LLVMBuildNot(builder, mask->cond_mask, ""); - - mask->cond_mask = LLVMBuildAnd(builder, - inv_mask, - prev_mask, ""); - lp_exec_mask_update(mask); -} - -static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask) -{ - struct function_ctx *ctx = func_ctx(mask); - assert(ctx->cond_stack_size); - --ctx->cond_stack_size; - if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) - return; - mask->cond_mask = ctx->cond_stack[ctx->cond_stack_size]; - lp_exec_mask_update(mask); -} - -static void lp_exec_bgnloop(struct lp_exec_mask *mask) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = func_ctx(mask); - - if (ctx->loop_stack_size >= LP_MAX_TGSI_NESTING) { - ++ctx->loop_stack_size; - return; - } - - ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] = - ctx->break_type; - ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP; - - ctx->loop_stack[ctx->loop_stack_size].loop_block = ctx->loop_block; - ctx->loop_stack[ctx->loop_stack_size].cont_mask = mask->cont_mask; - ctx->loop_stack[ctx->loop_stack_size].break_mask = mask->break_mask; - ctx->loop_stack[ctx->loop_stack_size].break_var = ctx->break_var; - ++ctx->loop_stack_size; - - ctx->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, ""); - LLVMBuildStore(builder, mask->break_mask, ctx->break_var); - - ctx->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop"); - - LLVMBuildBr(builder, ctx->loop_block); - LLVMPositionBuilderAtEnd(builder, ctx->loop_block); - - mask->break_mask = LLVMBuildLoad(builder, ctx->break_var, ""); - - lp_exec_mask_update(mask); -} - -static void lp_exec_break(struct lp_exec_mask *mask, +static void lp_exec_tgsi_break(struct lp_exec_mask *mask, struct lp_build_tgsi_context * bld_base) { - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = func_ctx(mask); - - if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) { - LLVMValueRef exec_mask = LLVMBuildNot(builder, - mask->exec_mask, - "break"); - - mask->break_mask = LLVMBuildAnd(builder, - mask->break_mask, - exec_mask, "break_full"); - } - else { - enum tgsi_opcode opcode = - bld_base->instructions[bld_base->pc + 1].Instruction.Opcode; - boolean break_always = (opcode == TGSI_OPCODE_ENDSWITCH || - opcode == TGSI_OPCODE_CASE); - - - if (ctx->switch_in_default) { - /* - * stop default execution but only if this is an unconditional switch. - * (The condition here is not perfect since dead code after break is - * allowed but should be sufficient since false negatives are just - * unoptimized - so we don't have to pre-evaluate that). - */ - if(break_always && ctx->switch_pc) { - bld_base->pc = ctx->switch_pc; - return; - } - } - - if (break_always) { - mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type); - } - else { - LLVMValueRef exec_mask = LLVMBuildNot(builder, - mask->exec_mask, - "break"); - mask->switch_mask = LLVMBuildAnd(builder, - mask->switch_mask, - exec_mask, "break_switch"); - } - } - - lp_exec_mask_update(mask); -} - -static void lp_exec_continue(struct lp_exec_mask *mask) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - LLVMValueRef exec_mask = LLVMBuildNot(builder, - mask->exec_mask, - ""); - - mask->cont_mask = LLVMBuildAnd(builder, - mask->cont_mask, - exec_mask, ""); - - lp_exec_mask_update(mask); -} - - -static void lp_exec_endloop(struct gallivm_state *gallivm, - struct lp_exec_mask *mask) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - struct function_ctx *ctx = func_ctx(mask); - LLVMBasicBlockRef endloop; - LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context); - LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context, - mask->bld->type.width * - mask->bld->type.length); - LLVMValueRef i1cond, i2cond, icond, limiter; - - assert(mask->break_mask); - - - assert(ctx->loop_stack_size); - if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) { - --ctx->loop_stack_size; - return; - } - - /* - * Restore the cont_mask, but don't pop - */ - mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask; - lp_exec_mask_update(mask); - - /* - * Unlike the continue mask, the break_mask must be preserved across loop - * iterations - */ - LLVMBuildStore(builder, mask->break_mask, ctx->break_var); - - /* Decrement the loop limiter */ - limiter = LLVMBuildLoad(builder, ctx->loop_limiter, ""); - - limiter = LLVMBuildSub( - builder, - limiter, - LLVMConstInt(int_type, 1, false), - ""); - - LLVMBuildStore(builder, limiter, ctx->loop_limiter); - - /* i1cond = (mask != 0) */ - i1cond = LLVMBuildICmp( - builder, - LLVMIntNE, - LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""), - LLVMConstNull(reg_type), "i1cond"); - - /* i2cond = (looplimiter > 0) */ - i2cond = LLVMBuildICmp( - builder, - LLVMIntSGT, - limiter, - LLVMConstNull(int_type), "i2cond"); - - /* if( i1cond && i2cond ) */ - icond = LLVMBuildAnd(builder, i1cond, i2cond, ""); - - endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop"); - - LLVMBuildCondBr(builder, - icond, ctx->loop_block, endloop); - - LLVMPositionBuilderAtEnd(builder, endloop); - - assert(ctx->loop_stack_size); - --ctx->loop_stack_size; - mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask; - mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask; - ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block; - ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var; - ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size + - ctx->switch_stack_size]; - - lp_exec_mask_update(mask); + enum tgsi_opcode opcode = + bld_base->instructions[bld_base->pc + 1].Instruction.Opcode; + bool break_always = (opcode == TGSI_OPCODE_ENDSWITCH || + opcode == TGSI_OPCODE_CASE); + lp_exec_break(mask, &bld_base->pc, break_always); } static void lp_exec_switch(struct lp_exec_mask *mask, @@ -747,34 +366,6 @@ } -/* stores val into an address pointed to by dst_ptr. - * mask->exec_mask is used to figure out which bits of val - * should be stored into the address - * (0 means don't store this bit, 1 means do store). - */ -static void lp_exec_mask_store(struct lp_exec_mask *mask, - struct lp_build_context *bld_store, - LLVMValueRef val, - LLVMValueRef dst_ptr) -{ - LLVMBuilderRef builder = mask->bld->gallivm->builder; - LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL; - - assert(lp_check_value(bld_store->type, val)); - assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind); - assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) || - LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == LLVMArrayTypeKind); - - if (exec_mask) { - LLVMValueRef res, dst; - - dst = LLVMBuildLoad(builder, dst_ptr, ""); - res = lp_build_select(bld_store, exec_mask, val, dst); - LLVMBuildStore(builder, res, dst_ptr); - } else - LLVMBuildStore(builder, val, dst_ptr); -} - static void lp_exec_mask_call(struct lp_exec_mask *mask, int func, int *pc) @@ -1580,7 +1171,7 @@ vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index); } - res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base, + res = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base, reg->Dimension.Indirect, vertex_index, reg->Register.Indirect, @@ -1591,7 +1182,7 @@ if (tgsi_type_is_64bit(stype)) { LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16); LLVMValueRef res2; - res2 = bld->gs_iface->fetch_input(bld->gs_iface, bld_base, + res2 = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base, reg->Dimension.Indirect, vertex_index, reg->Register.Indirect, @@ -1609,6 +1200,199 @@ } static LLVMValueRef +emit_fetch_tcs_input( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_src_register * reg, + enum tgsi_opcode_type stype, + unsigned swizzle_in) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; + const struct tgsi_shader_info *info = bld->bld_base.info; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef attrib_index = NULL; + LLVMValueRef vertex_index = NULL; + unsigned swizzle = swizzle_in & 0xffff; + LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle); + LLVMValueRef res; + + if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PRIMID) { + /* This is really a system value not a regular input */ + assert(!reg->Register.Indirect); + assert(!reg->Dimension.Indirect); + res = bld->system_values.prim_id; + if (stype != TGSI_TYPE_UNSIGNED && stype != TGSI_TYPE_SIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->base.vec_type, ""); + } + return res; + } + + if (reg->Register.Indirect) { + int index_limit = info->file_max[reg->Register.File]; + attrib_index = get_indirect_index(bld, + reg->Register.File, + reg->Register.Index, + ®->Indirect, + index_limit); + } else { + attrib_index = lp_build_const_int32(gallivm, reg->Register.Index); + } + + if (reg->Dimension.Indirect) { + vertex_index = get_indirect_index(bld, + reg->Register.File, + reg->Dimension.Index, + ®->DimIndirect, + PIPE_MAX_SHADER_INPUTS); + } else { + vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index); + } + + // TCS can read from its own outputs + if (reg->Register.File == TGSI_FILE_OUTPUT) { + res = bld->tcs_iface->emit_fetch_output(bld->tcs_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index, + bld_base->info->output_semantic_name[reg->Register.Index]); + } else { + res = bld->tcs_iface->emit_fetch_input(bld->tcs_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } + + + assert(res); + if (tgsi_type_is_64bit(stype)) { + LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16); + LLVMValueRef res2; + if (reg->Register.File == TGSI_FILE_OUTPUT) { + res2 = bld->tcs_iface->emit_fetch_output(bld->tcs_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index, + bld_base->info->output_semantic_name[reg->Register.Index]); + } else { + res2 = bld->tcs_iface->emit_fetch_input(bld->tcs_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } + assert(res2); + res = emit_fetch_64bit(bld_base, stype, res, res2); + } else if (stype == TGSI_TYPE_UNSIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, ""); + } else if (stype == TGSI_TYPE_SIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, ""); + } + + return res; +} + +static LLVMValueRef +emit_fetch_tes_input( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_src_register * reg, + enum tgsi_opcode_type stype, + unsigned swizzle_in) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; + const struct tgsi_shader_info *info = bld->bld_base.info; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef attrib_index = NULL; + LLVMValueRef vertex_index = NULL; + unsigned swizzle = swizzle_in & 0xffff; + LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle); + LLVMValueRef res; + + if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PRIMID) { + /* This is really a system value not a regular input */ + assert(!reg->Register.Indirect); + assert(!reg->Dimension.Indirect); + res = bld->system_values.prim_id; + if (stype != TGSI_TYPE_UNSIGNED && stype != TGSI_TYPE_SIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->base.vec_type, ""); + } + return res; + } + + if (reg->Register.Indirect) { + int index_limit = info->file_max[reg->Register.File]; + attrib_index = get_indirect_index(bld, + reg->Register.File, + reg->Register.Index, + ®->Indirect, + index_limit); + } else { + attrib_index = lp_build_const_int32(gallivm, reg->Register.Index); + } + + if (reg->Dimension.Indirect) { + vertex_index = get_indirect_index(bld, + reg->Register.File, + reg->Dimension.Index, + ®->DimIndirect, + PIPE_MAX_SHADER_INPUTS); + } else { + vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index); + } + + if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PATCH) { + res = bld->tes_iface->fetch_patch_input(bld->tes_iface, (struct lp_build_context*)bld_base, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } else { + res = bld->tes_iface->fetch_vertex_input(bld->tes_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } + + assert(res); + if (tgsi_type_is_64bit(stype)) { + LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16); + LLVMValueRef res2; + if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PATCH) { + res2 = bld->tes_iface->fetch_patch_input(bld->tes_iface, (struct lp_build_context*)bld_base, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } + else { + res2 = bld->tes_iface->fetch_vertex_input(bld->tes_iface, (struct lp_build_context*)bld_base, + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + swizzle_index); + } + assert(res2); + res = emit_fetch_64bit(bld_base, stype, res, res2); + } else if (stype == TGSI_TYPE_UNSIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, ""); + } else if (stype == TGSI_TYPE_SIGNED) { + res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, ""); + } + + return res; +} + + + +static LLVMValueRef emit_fetch_temporary( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_src_register * reg, @@ -1690,6 +1474,7 @@ LLVMBuilderRef builder = gallivm->builder; LLVMValueRef res; enum tgsi_opcode_type atype; // Actual type of the value + unsigned swizzle = swizzle_in & 0xffff; assert(!reg->Register.Indirect); @@ -1714,13 +1499,79 @@ atype = TGSI_TYPE_UNSIGNED; break; + case TGSI_SEMANTIC_BASEINSTANCE: + res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance); + atype = TGSI_TYPE_UNSIGNED; + break; + case TGSI_SEMANTIC_PRIMID: res = bld->system_values.prim_id; atype = TGSI_TYPE_UNSIGNED; break; case TGSI_SEMANTIC_INVOCATIONID: - res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id); + if (info->processor == PIPE_SHADER_TESS_CTRL) + res = bld->system_values.invocation_id; + else + res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_HELPER_INVOCATION: + res = LLVMBuildNot(gallivm->builder, lp_build_mask_value(bld->mask), ""); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_THREAD_ID: + res = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, swizzle, ""); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_BLOCK_ID: + res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.block_id, lp_build_const_int32(gallivm, swizzle)); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_GRID_SIZE: + res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.grid_size, lp_build_const_int32(gallivm, swizzle)); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_TESSCOORD: + { + LLVMValueRef index[] = { lp_build_const_int32(gallivm, 0), lp_build_const_int32(gallivm, swizzle_in) }; + LLVMValueRef array_indexed = LLVMBuildGEP(gallivm->builder, bld->system_values.tess_coord, index, 2, "tess_coord_array_indexed"); + res = LLVMBuildLoad(builder, array_indexed, "tess_coord"); + } + atype = TGSI_TYPE_FLOAT; + break; + + case TGSI_SEMANTIC_FACE: + res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_DRAWID: + res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.draw_id); + atype = TGSI_TYPE_UNSIGNED; + break; + + case TGSI_SEMANTIC_TESSOUTER: + res = lp_build_extract_broadcast(gallivm, lp_type_float_vec(32, 128), bld_base->base.type, + bld->system_values.tess_outer, + lp_build_const_int32(gallivm, swizzle_in)); + atype = TGSI_TYPE_FLOAT; + break; + + case TGSI_SEMANTIC_TESSINNER: + res = lp_build_extract_broadcast(gallivm, lp_type_float_vec(32, 128), bld_base->base.type, + bld->system_values.tess_inner, + lp_build_const_int32(gallivm, swizzle_in)); + atype = TGSI_TYPE_FLOAT; + break; + + case TGSI_SEMANTIC_VERTICESIN: + res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.vertices_in); atype = TGSI_TYPE_UNSIGNED; break; @@ -1760,54 +1611,234 @@ /* TODO: use interpolation coeffs for inputs */ - if (ddx) - *ddx = lp_build_ddx(&bld->bld_base.base, src); + if (ddx) + *ddx = lp_build_ddx(&bld->bld_base.base, src); + + if (ddy) + *ddy = lp_build_ddy(&bld->bld_base.base, src); +} + +/** + * store an array of vec-length 64-bit into two arrays of vec_length floats + * i.e. + * value is d0, d1, d2, d3 etc. + * each 64-bit has high and low pieces x, y + * so gets stored into the separate channels as: + * chan_ptr = d0.x, d1.x, d2.x, d3.x + * chan_ptr2 = d0.y, d1.y, d2.y, d3.y + */ +static void +emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base, + LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2, + LLVMValueRef value) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *float_bld = &bld_base->base; + unsigned i; + LLVMValueRef temp, temp2; + LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32]; + + for (i = 0; i < bld_base->base.type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i * 2); + shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); + } + + temp = LLVMBuildShuffleVector(builder, value, + LLVMGetUndef(LLVMTypeOf(value)), + LLVMConstVector(shuffles, + bld_base->base.type.length), + ""); + temp2 = LLVMBuildShuffleVector(builder, value, + LLVMGetUndef(LLVMTypeOf(value)), + LLVMConstVector(shuffles2, + bld_base->base.type.length), + ""); + + lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr); + lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2); +} + +static void +emit_store_output(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type dtype, + const struct tgsi_full_dst_register *reg, + unsigned index, + unsigned chan_index, + LLVMValueRef indirect_index, + LLVMValueRef value) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *float_bld = &bld_base->base; + + /* Outputs are always stored as floats */ + value = LLVMBuildBitCast(builder, value, float_bld->vec_type, ""); + + if (reg->Register.Indirect) { + LLVMValueRef index_vec; /* indexes into the output registers */ + LLVMValueRef outputs_array; + LLVMTypeRef fptr_type; + + index_vec = get_soa_array_offsets(&bld_base->uint_bld, + indirect_index, + chan_index, + TRUE); + + fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0); + outputs_array = LLVMBuildBitCast(builder, bld->outputs_array, fptr_type, ""); + + /* Scatter store values into output registers */ + emit_mask_scatter(bld, outputs_array, index_vec, value, + &bld->exec_mask); + } + else { + assert(LLVMTypeOf(value) == float_bld->vec_type); + LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index, + chan_index); + + if (tgsi_type_is_64bit(dtype)) { + LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index, + chan_index + 1); + emit_store_64bit_chan(bld_base, out_ptr, out_ptr2, + value); + } else + lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr); + } +} + +static void +emit_store_tcs_output(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type dtype, + const struct tgsi_full_dst_register *reg, + unsigned index, + unsigned chan_index, + LLVMValueRef indirect_index, + LLVMValueRef value) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; + const struct tgsi_shader_info *info = bld->bld_base.info; + LLVMValueRef attrib_index = NULL; + LLVMValueRef vertex_index = NULL; + LLVMValueRef channel_index = NULL; + + if (reg->Register.Indirect) { + /* + * XXX: this is possibly not quite the right value, since file_max may be + * larger than the max attrib index, due to it being the max of declared + * inputs AND the max vertices per prim (which is 6 for tri adj). + * It should however be safe to use (since we always allocate + * PIPE_MAX_SHADER_INPUTS (80) for it, which is overallocated quite a bit). + */ + int index_limit = info->file_max[reg->Register.File]; + attrib_index = get_indirect_index(bld, + reg->Register.File, + reg->Register.Index, + ®->Indirect, + index_limit); + } else { + attrib_index = lp_build_const_int32(gallivm, reg->Register.Index); + } + + if (reg->Dimension.Indirect) { + vertex_index = get_indirect_index(bld, + reg->Register.File, + reg->Dimension.Index, + ®->DimIndirect, + PIPE_MAX_SHADER_OUTPUTS); + } else { + vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index); + } + + channel_index = lp_build_const_int32(gallivm, chan_index); + + assert(bld->tcs_iface->emit_store_output); + bld->tcs_iface->emit_store_output(bld->tcs_iface, (struct lp_build_context*)bld_base, + bld_base->info->output_semantic_name[reg->Register.Index], + reg->Dimension.Indirect, + vertex_index, + reg->Register.Indirect, + attrib_index, + channel_index, + value); +} + +static void +emit_store_temp(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type dtype, + const struct tgsi_full_dst_register *reg, + unsigned index, + unsigned chan_index, + LLVMValueRef indirect_index, + LLVMValueRef value) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *float_bld = &bld_base->base; + + /* Temporaries are always stored as floats */ + if (!tgsi_type_is_64bit(dtype)) + value = LLVMBuildBitCast(builder, value, float_bld->vec_type, ""); + else + value = LLVMBuildBitCast(builder, value, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), ""); + + if (reg->Register.Indirect) { + LLVMValueRef index_vec; /* indexes into the temp registers */ + LLVMValueRef temps_array; + LLVMTypeRef fptr_type; + + index_vec = get_soa_array_offsets(&bld_base->uint_bld, + indirect_index, + chan_index, + TRUE); + + fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0); + temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, ""); + + /* Scatter store values into temp registers */ + emit_mask_scatter(bld, temps_array, index_vec, value, + &bld->exec_mask); + } + else { + LLVMValueRef temp_ptr; + temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); - if (ddy) - *ddy = lp_build_ddy(&bld->bld_base.base, src); + if (tgsi_type_is_64bit(dtype)) { + LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld, + reg->Register.Index, + chan_index + 1); + emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2, + value); + } + else + lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr); + } } -/** - * store an array of vec-length 64-bit into two arrays of vec_length floats - * i.e. - * value is d0, d1, d2, d3 etc. - * each 64-bit has high and low pieces x, y - * so gets stored into the separate channels as: - * chan_ptr = d0.x, d1.x, d2.x, d3.x - * chan_ptr2 = d0.y, d1.y, d2.y, d3.y - */ static void -emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base, - LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2, - LLVMValueRef value) +emit_store_address(struct lp_build_tgsi_context *bld_base, + enum tgsi_opcode_type dtype, + const struct tgsi_full_dst_register *reg, + unsigned index, + unsigned chan_index, + LLVMValueRef indirect_index, + LLVMValueRef value) { struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; - struct lp_build_context *float_bld = &bld_base->base; - unsigned i; - LLVMValueRef temp, temp2; - LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32]; - LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32]; - - for (i = 0; i < bld_base->base.type.length; i++) { - shuffles[i] = lp_build_const_int32(gallivm, i * 2); - shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); - } - - temp = LLVMBuildShuffleVector(builder, value, - LLVMGetUndef(LLVMTypeOf(value)), - LLVMConstVector(shuffles, - bld_base->base.type.length), - ""); - temp2 = LLVMBuildShuffleVector(builder, value, - LLVMGetUndef(LLVMTypeOf(value)), - LLVMConstVector(shuffles2, - bld_base->base.type.length), - ""); + struct lp_build_context *int_bld = &bld_base->int_bld; - lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr); - lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2); + assert(dtype == TGSI_TYPE_SIGNED); + assert(LLVMTypeOf(value) == int_bld->vec_type); + value = LLVMBuildBitCast(builder, value, int_bld->vec_type, ""); + lp_exec_mask_store(&bld->exec_mask, int_bld, value, + bld->addr[reg->Register.Index][chan_index]); } /** @@ -1826,7 +1857,6 @@ LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_dst_register *reg = &inst->Dst[index]; struct lp_build_context *float_bld = &bld_base->base; - struct lp_build_context *int_bld = &bld_base->int_bld; LLVMValueRef indirect_index = NULL; enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode, index); @@ -1862,93 +1892,14 @@ emit_dump_reg(gallivm, reg->Register.File, reg->Register.Index, chan_index, value); } - switch( reg->Register.File ) { - case TGSI_FILE_OUTPUT: - /* Outputs are always stored as floats */ - value = LLVMBuildBitCast(builder, value, float_bld->vec_type, ""); - - if (reg->Register.Indirect) { - LLVMValueRef index_vec; /* indexes into the output registers */ - LLVMValueRef outputs_array; - LLVMTypeRef fptr_type; - - index_vec = get_soa_array_offsets(&bld_base->uint_bld, - indirect_index, - chan_index, - TRUE); - - fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0); - outputs_array = LLVMBuildBitCast(builder, bld->outputs_array, fptr_type, ""); - - /* Scatter store values into output registers */ - emit_mask_scatter(bld, outputs_array, index_vec, value, - &bld->exec_mask); - } - else { - LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index, - chan_index); - - if (tgsi_type_is_64bit(dtype)) { - LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index, - chan_index + 1); - emit_store_64bit_chan(bld_base, out_ptr, out_ptr2, - value); - } else - lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr); - } - break; - - case TGSI_FILE_TEMPORARY: - /* Temporaries are always stored as floats */ - if (!tgsi_type_is_64bit(dtype)) - value = LLVMBuildBitCast(builder, value, float_bld->vec_type, ""); - else - value = LLVMBuildBitCast(builder, value, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), ""); - - if (reg->Register.Indirect) { - LLVMValueRef index_vec; /* indexes into the temp registers */ - LLVMValueRef temps_array; - LLVMTypeRef fptr_type; - - index_vec = get_soa_array_offsets(&bld_base->uint_bld, - indirect_index, - chan_index, - TRUE); - - fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0); - temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, ""); - - /* Scatter store values into temp registers */ - emit_mask_scatter(bld, temps_array, index_vec, value, - &bld->exec_mask); - } - else { - LLVMValueRef temp_ptr; - temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); - - if (tgsi_type_is_64bit(dtype)) { - LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld, - reg->Register.Index, - chan_index + 1); - emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2, - value); - } - else - lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr); - } - break; - - case TGSI_FILE_ADDRESS: - assert(dtype == TGSI_TYPE_SIGNED); - assert(LLVMTypeOf(value) == int_bld->vec_type); - value = LLVMBuildBitCast(builder, value, int_bld->vec_type, ""); - lp_exec_mask_store(&bld->exec_mask, int_bld, value, - bld->addr[reg->Register.Index][chan_index]); - break; - - default: - assert( 0 ); - } + assert(bld_base->emit_store_reg_funcs[reg->Register.File]); + bld_base->emit_store_reg_funcs[reg->Register.File](bld_base, + dtype, + reg, + index, + chan_index, + indirect_index, + value); (void)dtype; } @@ -2073,7 +2024,8 @@ * constant coords maybe). * There's at least hope for sample opcodes as well as size queries. */ - if (reg->Register.File == TGSI_FILE_CONSTANT || + if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ || + reg->Register.File == TGSI_FILE_CONSTANT || reg->Register.File == TGSI_FILE_IMMEDIATE) { lod_property = LP_SAMPLER_LOD_SCALAR; } @@ -2198,8 +2150,10 @@ /* Note lod and especially projected are illegal in a LOT of cases */ if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS || modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) { - if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) { + if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ) { + lod = bld->bld_base.base.zero; + } else if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || + inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) { /* note that shadow cube array with bias/explicit lod does not exist */ lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0); } @@ -2215,6 +2169,10 @@ lod_property = lp_build_lod_property(&bld->bld_base, inst, 0); } + if (sampler_op == LP_SAMPLER_OP_GATHER) { + uint32_t comp_val = inst->Src[sampler_reg].Register.SwizzleX; + sample_key |= (comp_val << LP_SAMPLER_GATHER_COMP_SHIFT); + } if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) { oow = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3); oow = lp_build_rcp(&bld->bld_base.base, oow); @@ -2552,7 +2510,8 @@ /* always have lod except for buffers and msaa targets ? */ if (target != TGSI_TEXTURE_BUFFER && target != TGSI_TEXTURE_2D_MSAA && - target != TGSI_TEXTURE_2D_ARRAY_MSAA) { + target != TGSI_TEXTURE_2D_ARRAY_MSAA && + inst->Instruction.Opcode != TGSI_OPCODE_TXF_LZ) { sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT; explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3); lod_property = lp_build_lod_property(&bld->bld_base, inst, 0); @@ -2993,6 +2952,8 @@ } break; + case TGSI_FILE_MEMORY: + break; default: /* don't need to declare other vars */ break; @@ -3387,6 +3348,79 @@ FALSE, LP_SAMPLER_OP_LODQ, emit_data->output); } +static void target_to_dims_layer(unsigned target, + unsigned *dims, + unsigned *layer_coord) +{ + *layer_coord = 0; + switch (target) { + case TGSI_TEXTURE_1D: + case TGSI_TEXTURE_BUFFER: + *dims = 1; + break; + case TGSI_TEXTURE_1D_ARRAY: + *layer_coord = 1; + *dims = 1; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + *dims = 2; + break; + case TGSI_TEXTURE_2D_ARRAY: + *layer_coord = 2; + *dims = 2; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + case TGSI_TEXTURE_CUBE_ARRAY: + *dims = 3; + break; + default: + assert(0); + return; + } +} + +static void +img_load_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + struct lp_img_params params; + LLVMValueRef coords[5]; + LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type); + unsigned dims; + unsigned target = emit_data->inst->Memory.Texture; + unsigned layer_coord; + + target_to_dims_layer(target, &dims, &layer_coord); + + for (unsigned i = 0; i < dims; i++) { + coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i); + } + for (unsigned i = dims; i < 5; i++) { + coords[i] = coord_undef; + } + if (layer_coord) + coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord); + + memset(¶ms, 0, sizeof(params)); + + params.type = bld->bld_base.base.type; + params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; + params.coords = coords; + params.outdata = emit_data->output; + params.target = tgsi_to_pipe_tex_target(target); + params.image_index = emit_data->inst->Src[0].Register.Index; + params.img_op = LP_IMG_LOAD; + bld->image->emit_op(bld->image, + bld->bld_base.base.gallivm, + ¶ms); +} + static void load_emit( const struct lp_build_tgsi_action * action, @@ -3398,10 +3432,46 @@ LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0]; unsigned buf = bufreg->Register.Index; - assert(bufreg->Register.File == TGSI_FILE_BUFFER); + assert(bufreg->Register.File == TGSI_FILE_BUFFER || + bufreg->Register.File == TGSI_FILE_IMAGE || + bufreg->Register.File == TGSI_FILE_MEMORY || + bufreg->Register.File == TGSI_FILE_CONSTBUF); + bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY; struct lp_build_context *uint_bld = &bld_base->uint_bld; - if (0) { + if (bufreg->Register.File == TGSI_FILE_IMAGE) { + img_load_emit(action, bld_base, emit_data); + } else if (bufreg->Register.File == TGSI_FILE_CONSTBUF) { + LLVMValueRef consts_ptr = bld->consts[buf]; + LLVMValueRef num_consts = bld->consts_sizes[buf]; + + LLVMValueRef indirect_index; + LLVMValueRef overflow_mask; + + indirect_index = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 0); + indirect_index = lp_build_shr_imm(uint_bld, indirect_index, 4); + + /* All fetches are from the same constant buffer, so + * we need to propagate the size to a vector to do a + * vector comparison */ + num_consts = lp_build_broadcast_scalar(uint_bld, num_consts); + + /* Gather values from the constant buffer */ + unsigned chan_index; + TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) { + /* Construct a boolean vector telling us which channels + * overflow the bound constant buffer */ + overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL, + indirect_index, num_consts); + + /* index_vec = indirect_index * 4 */ + LLVMValueRef index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2); + index_vec = lp_build_add(uint_bld, index_vec, + lp_build_const_int_vec(gallivm, uint_bld->type, chan_index)); + + emit_data->output[chan_index] = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, NULL); + } + } else if (0) { /* for indirect support with ARB_gpu_shader5 */ } else { LLVMValueRef index; @@ -3411,19 +3481,23 @@ index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0); index = lp_build_shr_imm(uint_bld, index, 2); - scalar_ptr = bld->ssbos[buf]; + scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf]; LLVMValueRef ssbo_limit; - ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + if (!is_shared) { + ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + } TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) { LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index)); LLVMValueRef exec_mask = mask_vec(bld_base); - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + if (!is_shared) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } LLVMValueRef result = lp_build_alloca(gallivm, uint_bld->vec_type, ""); struct lp_build_loop_state loop_state; @@ -3457,6 +3531,48 @@ } static void +img_store_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + struct lp_img_params params; + LLVMValueRef coords[5]; + LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type); + unsigned dims; + unsigned target = emit_data->inst->Memory.Texture; + unsigned layer_coord; + + target_to_dims_layer(target, &dims, &layer_coord); + for (unsigned i = 0; i < dims; i++) { + coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, i); + } + for (unsigned i = dims; i < 5; i++) { + coords[i] = coord_undef; + } + if (layer_coord) + coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, layer_coord); + memset(¶ms, 0, sizeof(params)); + + params.type = bld->bld_base.base.type; + params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; + params.coords = coords; + params.outdata = NULL; + params.exec_mask = mask_vec(bld_base); + params.target = tgsi_to_pipe_tex_target(target); + params.image_index = emit_data->inst->Dst[0].Register.Index; + params.img_op = LP_IMG_STORE; + for (unsigned i = 0; i < 4; i++) + params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i); + + bld->image->emit_op(bld->image, + bld->bld_base.base.gallivm, + ¶ms); +} + +static void store_emit( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -3468,9 +3584,12 @@ struct lp_build_context *uint_bld = &bld_base->uint_bld; const struct tgsi_full_dst_register *bufreg = &emit_data->inst->Dst[0]; unsigned buf = bufreg->Register.Index; - assert(bufreg->Register.File == TGSI_FILE_BUFFER); + assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY); + bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY; - if (0) { + if (bufreg->Register.File == TGSI_FILE_IMAGE) { + img_store_emit(action, bld_base, emit_data); + } else if (0) { } else { LLVMValueRef index; /* index into the const buffer */ @@ -3481,12 +3600,14 @@ index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, 0); index = lp_build_shr_imm(uint_bld, index, 2); - scalar_ptr = bld->ssbos[buf]; + scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf]; LLVMValueRef ssbo_limit; - ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + if (!is_shared) { + ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + } TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) { LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index)); @@ -3494,8 +3615,10 @@ value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, chan_index); LLVMValueRef exec_mask = mask_vec(bld_base); - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + if (!is_shared) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } struct lp_build_loop_state loop_state; lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); @@ -3534,11 +3657,74 @@ const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0]; unsigned buf = bufreg->Register.Index; - assert(bufreg->Register.File == TGSI_FILE_BUFFER); + assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE); - LLVMValueRef num_ssbo = bld->ssbo_sizes[buf]; + if (bufreg->Register.File == TGSI_FILE_IMAGE) { + unsigned target = emit_data->inst->Memory.Texture; + struct lp_sampler_size_query_params params = { 0 }; + params.int_type = bld->bld_base.int_bld.type; + params.texture_unit = buf; + params.target = tgsi_to_pipe_tex_target(target); + params.context_ptr = bld->context_ptr; + params.sizes_out = emit_data->output; + + bld->image->emit_size_query(bld->image, + bld->bld_base.base.gallivm, + ¶ms); + } else { + LLVMValueRef num_ssbo = bld->ssbo_sizes[buf]; + + emit_data->output[emit_data->chan] = lp_build_broadcast_scalar(uint_bld, num_ssbo); + } +} + +static void +img_atomic_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data, + LLVMAtomicRMWBinOp op) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + struct lp_img_params params; + LLVMValueRef coords[5]; + LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type); + unsigned dims; + unsigned layer_coord; + unsigned target = emit_data->inst->Memory.Texture; + + target_to_dims_layer(target, &dims, &layer_coord); + + for (unsigned i = 0; i < dims; i++) { + coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i); + } + for (unsigned i = dims; i < 5; i++) { + coords[i] = coord_undef; + } + if (layer_coord) + coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord); + memset(¶ms, 0, sizeof(params)); - emit_data->output[emit_data->chan] = lp_build_broadcast_scalar(uint_bld, num_ssbo); + params.type = bld->bld_base.base.type; + params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; + params.exec_mask = mask_vec(bld_base); + params.image_index = emit_data->inst->Src[0].Register.Index; + params.coords = coords; + params.target = tgsi_to_pipe_tex_target(target); + params.op = op; + params.outdata = emit_data->output; + params.img_op = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) ? LP_IMG_ATOMIC_CAS : LP_IMG_ATOMIC; + + for (unsigned i = 0; i < 4; i++) + params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, i); + if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { + for (unsigned i = 0; i < 4; i++) + params.indata2[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 3, i); + } + bld->image->emit_op(bld->image, + bld->bld_base.base.gallivm, + ¶ms); } static void @@ -3553,8 +3739,9 @@ struct lp_build_context *uint_bld = &bld_base->uint_bld; const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0]; - assert(bufreg->Register.File == TGSI_FILE_BUFFER); + assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY); unsigned buf = bufreg->Register.Index; + bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY; LLVMAtomicRMWBinOp op; switch (emit_data->inst->Instruction.Opcode) { @@ -3592,7 +3779,9 @@ return; } - if (0) { + if (bufreg->Register.File == TGSI_FILE_IMAGE) { + img_atomic_emit(action, bld_base, emit_data, op); + } else if (0) { } else { LLVMValueRef index; /* index into the const buffer */ LLVMValueRef scalar, scalar_ptr; @@ -3602,20 +3791,28 @@ value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, 0); index = lp_build_shr_imm(uint_bld, index, 2); - index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan)); - scalar_ptr = bld->ssbos[buf]; + if (!is_shared) { + index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan)); + scalar_ptr = bld->ssbos[buf]; + } else + scalar_ptr = bld->shared_ptr; LLVMValueRef atom_res = lp_build_alloca(gallivm, uint_bld->vec_type, ""); LLVMValueRef ssbo_limit; - ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + if (!is_shared) { + ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), ""); + ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); + } LLVMValueRef exec_mask = mask_vec(bld_base); - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + + if (!is_shared) { + LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit); + exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + } struct lp_build_loop_state loop_state; lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); @@ -3670,6 +3867,31 @@ } static void +barrier_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + + LLVMBasicBlockRef resume = lp_build_insert_new_block(gallivm, "resume"); + + lp_build_coro_suspend_switch(gallivm, bld->coro, resume, false); + LLVMPositionBuilderAtEnd(gallivm->builder, resume); +} + +static void +membar_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMBuildFence(builder, LLVMAtomicOrderingSequentiallyConsistent, false, ""); +} + +static void increment_vec_ptr_by_mask(struct lp_build_tgsi_context * bld_base, LLVMValueRef ptr, LLVMValueRef mask) @@ -3722,15 +3944,21 @@ LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; if (bld->gs_iface->emit_vertex) { + uint32_t stream_reg_idx = emit_data->inst->Src[0].Register.Index; + uint32_t stream_reg_swiz = emit_data->inst->Src[0].Register.SwizzleX; + LLVMValueRef stream_id = bld->immediates[stream_reg_idx][stream_reg_swiz]; LLVMValueRef mask = mask_vec(bld_base); LLVMValueRef total_emitted_vertices_vec = LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, ""); + + stream_id = LLVMBuildBitCast(builder, stream_id, bld_base->uint_bld.vec_type, ""); mask = clamp_mask_to_max_output_vertices(bld, mask, total_emitted_vertices_vec); gather_outputs(bld); - bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base, + bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base, bld->outputs, - total_emitted_vertices_vec); + total_emitted_vertices_vec, + stream_id); increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr, mask); increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr, @@ -3760,7 +3988,8 @@ LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, ""); LLVMValueRef emitted_prims_vec = LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, ""); - + LLVMValueRef total_emitted_vertices_vec = + LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, ""); LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL, emitted_vertices_vec, uint_bld->zero); @@ -3770,9 +3999,11 @@ executes only on the paths that have unflushed vertices */ mask = LLVMBuildAnd(builder, mask, emitted_mask, ""); - bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base, + bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base, + total_emitted_vertices_vec, emitted_vertices_vec, - emitted_prims_vec); + emitted_prims_vec, + mask_vec(bld_base)); #if DUMP_GS_EMITS lp_build_print_value(bld->bld_base.base.gallivm, @@ -3815,6 +4046,20 @@ } static void +barrier_emit_tcs( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + + if (bld->tcs_iface->emit_barrier) { + bld->tcs_iface->emit_barrier((struct lp_build_context*)bld_base); + } +} + + +static void cal_emit( const struct lp_build_tgsi_action * action, struct lp_build_tgsi_context * bld_base, @@ -3845,7 +4090,7 @@ { struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); - lp_exec_break(&bld->exec_mask, bld_base); + lp_exec_tgsi_break(&bld->exec_mask, bld_base); } static void @@ -3929,7 +4174,7 @@ { struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); - lp_exec_bgnloop(&bld->exec_mask); + lp_exec_bgnloop(&bld->exec_mask, true); } static void @@ -4028,7 +4273,8 @@ /* If we have indirect addressing in inputs we need to copy them into * our alloca array to be able to iterate over them */ - if (bld->indirect_files & (1 << TGSI_FILE_INPUT) && !bld->gs_iface) { + if (bld->indirect_files & (1 << TGSI_FILE_INPUT) && + !bld->gs_iface && !bld->tes_iface && !bld->tcs_iface) { unsigned index, chan; LLVMTypeRef vec_type = bld_base->base.vec_type; LLVMValueRef array_size = lp_build_const_int32(gallivm, @@ -4085,6 +4331,15 @@ } } +static void emit_prologue_post_decl(struct lp_build_tgsi_context * bld_base) +{ + struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); + + if (bld->tcs_iface && bld->tcs_iface->emit_prologue) { + bld->tcs_iface->emit_prologue((struct lp_build_context*)bld_base); + } +} + static void emit_epilogue(struct lp_build_tgsi_context * bld_base) { struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base); @@ -4099,6 +4354,10 @@ lp_build_printf(bld_base->base.gallivm, "\n"); } + if (bld->tcs_iface && bld->tcs_iface->emit_epilogue) { + bld->tcs_iface->emit_epilogue((struct lp_build_context*)bld_base); + } + /* If we have indirect addressing in outputs we need to copy our alloca array * to the outputs slots specified by the caller */ if (bld->gs_iface) { @@ -4108,14 +4367,13 @@ vertices in the cache. Note must not call end_primitive here since the exec_mask is not valid at this point. */ end_primitive_masked(bld_base, lp_build_mask_value(bld->mask)); - + total_emitted_vertices_vec = LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, ""); emitted_prims_vec = LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, ""); bld->gs_iface->gs_epilogue(bld->gs_iface, - &bld->bld_base, total_emitted_vertices_vec, emitted_prims_vec); } else { @@ -4175,6 +4433,9 @@ bld.indirect_files = params->info->indirect_files; bld.context_ptr = params->context_ptr; bld.thread_data_ptr = params->thread_data_ptr; + bld.image = params->image; + bld.shared_ptr = params->shared_ptr; + bld.coro = params->coro; /* * If the number of temporaries is rather large then we just @@ -4203,12 +4464,17 @@ bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input; bld.bld_base.emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch_temporary; bld.bld_base.emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = emit_fetch_system_value; + bld.bld_base.emit_store = emit_store; + bld.bld_base.emit_store_reg_funcs[TGSI_FILE_OUTPUT] = emit_store_output; + bld.bld_base.emit_store_reg_funcs[TGSI_FILE_TEMPORARY] = emit_store_temp; + bld.bld_base.emit_store_reg_funcs[TGSI_FILE_ADDRESS] = emit_store_address; bld.bld_base.emit_declaration = lp_emit_declaration_soa; bld.bld_base.emit_immediate = lp_emit_immediate_soa; bld.bld_base.emit_prologue = emit_prologue; + bld.bld_base.emit_prologue_post_decl = emit_prologue_post_decl; bld.bld_base.emit_epilogue = emit_epilogue; /* Set opcode actions */ @@ -4238,9 +4504,11 @@ bld.bld_base.op_actions[TGSI_OPCODE_TXB].emit = txb_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXD].emit = txd_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit; + bld.bld_base.op_actions[TGSI_OPCODE_TEX_LZ].emit = txl_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit; + bld.bld_base.op_actions[TGSI_OPCODE_TXF_LZ].emit = txf_emit; bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit; bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit; @@ -4274,6 +4542,9 @@ bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit; bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit; + bld.bld_base.op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; + bld.bld_base.op_actions[TGSI_OPCODE_BARRIER].emit = barrier_emit; + if (params->gs_iface) { /* There's no specific value for this because it should always * be set, but apps using ext_geometry_shader4 quite often @@ -4299,6 +4570,24 @@ max_output_vertices); } + if (params->tes_iface) { + /* inputs are always indirect with tes */ + bld.indirect_files |= (1 << TGSI_FILE_INPUT); + bld.tes_iface = params->tes_iface; + bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_tes_input; + } + + if (params->tcs_iface) { + bld.tcs_iface = params->tcs_iface; + /* outputs and inputs are always indirect with tcs */ + bld.indirect_files |= (1 << TGSI_FILE_OUTPUT); + bld.bld_base.emit_store_reg_funcs[TGSI_FILE_OUTPUT] = emit_store_tcs_output; + bld.indirect_files |= (1 << TGSI_FILE_INPUT); + bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_tcs_input; + bld.bld_base.emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch_tcs_input; + bld.bld_base.op_actions[TGSI_OPCODE_BARRIER].emit = barrier_emit_tcs; + } + lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld); bld.system_values = *params->system_values; diff -Nru mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_type.h mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_type.h --- mesa-19.2.8/src/gallium/auxiliary/gallivm/lp_bld_type.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/gallivm/lp_bld_type.h 2020-06-12 01:21:16.000000000 +0000 @@ -37,7 +37,7 @@ #define LP_BLD_TYPE_H -#include "util/u_format.h" +#include "util/format/u_format.h" #include "pipe/p_compiler.h" #include "gallivm/lp_bld.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/hud/hud_context.c mesa-20.0.8/src/gallium/auxiliary/hud/hud_context.c --- mesa-19.2.8/src/gallium/auxiliary/hud/hud_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/hud/hud_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,7 +42,7 @@ #include "cso_cache/cso_context.h" #include "util/u_draw_quad.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_math.h" @@ -696,8 +696,8 @@ */ if (gr->current_value < LIST_ENTRY(struct hud_graph, next, head)->current_value) { - LIST_DEL(&gr->head); - LIST_ADD(&gr->head, &next->head); + list_del(&gr->head); + list_add(&gr->head, &next->head); } } } @@ -898,7 +898,7 @@ pane->sort_items = sort_items; pane->initial_max_value = max_value; hud_pane_set_max_value(pane, max_value); - LIST_INITHEAD(&pane->graph_list); + list_inithead(&pane->graph_list); return pane; } @@ -946,7 +946,7 @@ gr->color[1] = colors[color][1]; gr->color[2] = colors[color][2]; gr->pane = pane; - LIST_ADDTAIL(&gr->head, &pane->graph_list); + list_addtail(&gr->head, &pane->graph_list); pane->num_graphs++; pane->next_color++; } @@ -1431,7 +1431,7 @@ env += num; strip_hyphens(s); - if (added && !LIST_IS_EMPTY(&pane->graph_list)) { + if (added && !list_is_empty(&pane->graph_list)) { struct hud_graph *graph; graph = LIST_ENTRY(struct hud_graph, pane->graph_list.prev, head); strncpy(graph->name, s, sizeof(graph->name)-1); @@ -1458,7 +1458,7 @@ height = 100; if (pane && pane->num_graphs) { - LIST_ADDTAIL(&pane->head, &hud->pane_list); + list_addtail(&pane->head, &hud->pane_list); pane = NULL; } break; @@ -1471,7 +1471,7 @@ height = 100; if (pane && pane->num_graphs) { - LIST_ADDTAIL(&pane->head, &hud->pane_list); + list_addtail(&pane->head, &hud->pane_list); pane = NULL; } @@ -1494,7 +1494,7 @@ if (pane) { if (pane->num_graphs) { - LIST_ADDTAIL(&pane->head, &hud->pane_list); + list_addtail(&pane->head, &hud->pane_list); } else { FREE(pane); @@ -1686,7 +1686,7 @@ }; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; if (!tgsi_text_translate(fragment_shader_text, tokens, ARRAY_SIZE(tokens))) { assert(0); @@ -1723,7 +1723,7 @@ }; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; if (!tgsi_text_translate(vertex_shader_text, tokens, ARRAY_SIZE(tokens))) { assert(0); goto fail; @@ -1752,10 +1752,10 @@ LIST_FOR_EACH_ENTRY_SAFE(pane, pane_tmp, &hud->pane_list, head) { LIST_FOR_EACH_ENTRY_SAFE(graph, graph_tmp, &pane->graph_list, head) { - LIST_DEL(&graph->head); + list_del(&graph->head); hud_graph_destroy(graph, pipe); } - LIST_DEL(&pane->head); + list_del(&pane->head); FREE(pane); } @@ -1816,7 +1816,9 @@ #ifdef PIPE_OS_UNIX unsigned signo = debug_get_num_option("GALLIUM_HUD_TOGGLE_SIGNAL", 0); static boolean sig_handled = FALSE; - struct sigaction action = {}; + struct sigaction action; + + memset(&action, 0, sizeof(action)); #endif huds_visible = debug_get_bool_option("GALLIUM_HUD_VISIBLE", TRUE); @@ -1885,7 +1887,7 @@ hud->constbuf.buffer_size = sizeof(hud->constants); hud->constbuf.user_buffer = &hud->constants; - LIST_INITHEAD(&hud->pane_list); + list_inithead(&hud->pane_list); /* setup sig handler once for all hud contexts */ #ifdef PIPE_OS_UNIX diff -Nru mesa-19.2.8/src/gallium/auxiliary/hud/hud_cpu.c mesa-20.0.8/src/gallium/auxiliary/hud/hud_cpu.c --- mesa-19.2.8/src/gallium/auxiliary/hud/hud_cpu.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/hud/hud_cpu.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,11 +38,15 @@ #ifdef PIPE_OS_WINDOWS #include #endif -#ifdef PIPE_OS_FREEBSD +#if defined(PIPE_OS_BSD) #include #include +#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) +#include +#else #include #endif +#endif #ifdef PIPE_OS_WINDOWS @@ -91,20 +95,54 @@ return TRUE; } -#elif defined(PIPE_OS_FREEBSD) +#elif defined(PIPE_OS_BSD) static boolean get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) { +#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) + uint64_t cp_time[CPUSTATES]; +#else long cp_time[CPUSTATES]; +#endif size_t len; if (cpu_index == ALL_CPUS) { len = sizeof(cp_time); +#if defined(PIPE_OS_NETBSD) + int mib[] = { CTL_KERN, KERN_CP_TIME }; + + if (sysctl(mib, ARRAY_SIZE(mib), cp_time, &len, NULL, 0) == -1) + return FALSE; +#elif defined(PIPE_OS_OPENBSD) + int mib[] = { CTL_KERN, KERN_CPTIME }; + long sum_cp_time[CPUSTATES]; + + len = sizeof(sum_cp_time); + if (sysctl(mib, ARRAY_SIZE(mib), sum_cp_time, &len, NULL, 0) == -1) + return FALSE; + + for (int state = 0; state < CPUSTATES; state++) + cp_time[state] = sum_cp_time[state]; +#else if (sysctlbyname("kern.cp_time", cp_time, &len, NULL, 0) == -1) return FALSE; +#endif } else { +#if defined(PIPE_OS_NETBSD) + int mib[] = { CTL_KERN, KERN_CP_TIME, cpu_index }; + + len = sizeof(cp_time); + if (sysctl(mib, ARRAY_SIZE(mib), cp_time, &len, NULL, 0) == -1) + return FALSE; +#elif defined(PIPE_OS_OPENBSD) + int mib[] = { CTL_KERN, KERN_CPTIME2, cpu_index }; + + len = sizeof(cp_time); + if (sysctl(mib, ARRAY_SIZE(mib), cp_time, &len, NULL, 0) == -1) + return FALSE; +#else long *cp_times = NULL; if (sysctlbyname("kern.cp_times", NULL, &len, NULL, 0) == -1) @@ -121,6 +159,7 @@ memcpy(cp_time, cp_times + (cpu_index * CPUSTATES), sizeof(cp_time)); free(cp_times); +#endif } *busy_time = cp_time[CP_USER] + cp_time[CP_NICE] + diff -Nru mesa-19.2.8/src/gallium/auxiliary/Makefile.sources mesa-20.0.8/src/gallium/auxiliary/Makefile.sources --- mesa-19.2.8/src/gallium/auxiliary/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/Makefile.sources 2020-06-12 01:21:16.000000000 +0000 @@ -224,7 +224,6 @@ util/u_debug_flush.h \ util/u_debug_image.c \ util/u_debug_image.h \ - util/u_debug_memory.c \ util/u_debug_refcnt.c \ util/u_debug_refcnt.h \ util/u_debug_stack.c \ @@ -244,26 +243,6 @@ util/u_dump.h \ util/u_dump_state.c \ util/u_fifo.h \ - util/u_format.c \ - util/u_format.h \ - util/u_format_bptc.c \ - util/u_format_bptc.h \ - util/u_format_etc.c \ - util/u_format_etc.h \ - util/u_format_latc.c \ - util/u_format_latc.h \ - util/u_format_other.c \ - util/u_format_other.h \ - util/u_format_rgtc.c \ - util/u_format_rgtc.h \ - util/u_format_s3tc.c \ - util/u_format_s3tc.h \ - util/u_format_tests.c \ - util/u_format_tests.h \ - util/u_format_yuv.c \ - util/u_format_yuv.h \ - util/u_format_zs.c \ - util/u_format_zs.h \ util/u_framebuffer.c \ util/u_framebuffer.h \ util/u_gen_mipmap.c \ @@ -282,10 +261,10 @@ util/u_inlines.h \ util/u_linear.c \ util/u_linear.h \ + util/u_live_shader_cache.c \ + util/u_live_shader_cache.h \ util/u_log.c \ util/u_log.h \ - util/u_mm.c \ - util/u_mm.h \ util/u_network.c \ util/u_network.h \ util/u_pack_color.h \ @@ -307,6 +286,8 @@ util/u_screen.h \ util/u_simple_shaders.c \ util/u_simple_shaders.h \ + util/u_split_draw.c \ + util/u_split_draw.h \ util/u_split_prim.h \ util/u_sse.h \ util/u_suballoc.c \ @@ -337,7 +318,9 @@ NIR_SOURCES := \ nir/tgsi_to_nir.c \ - nir/tgsi_to_nir.h + nir/tgsi_to_nir.h \ + nir/nir_draw_helpers.c \ + nir/nir_draw_helpers.h VL_SOURCES := \ vl/vl_bicubic_filter.c \ @@ -396,8 +379,7 @@ GENERATED_SOURCES := \ indices/u_indices_gen.c \ - indices/u_unfilled_gen.c \ - util/u_format_table.c + indices/u_unfilled_gen.c GALLIVM_SOURCES := \ gallivm/lp_bld_arit.c \ @@ -412,6 +394,8 @@ gallivm/lp_bld_const.h \ gallivm/lp_bld_conv.c \ gallivm/lp_bld_conv.h \ + gallivm/lp_bld_coro.c \ + gallivm/lp_bld_coro.h \ gallivm/lp_bld_debug.cpp \ gallivm/lp_bld_debug.h \ gallivm/lp_bld_flow.c \ @@ -432,11 +416,16 @@ gallivm/lp_bld_init.h \ gallivm/lp_bld_intr.c \ gallivm/lp_bld_intr.h \ + gallivm/lp_bld_ir_common.c \ + gallivm/lp_bld_ir_common.h \ gallivm/lp_bld_limits.h \ gallivm/lp_bld_logic.c \ gallivm/lp_bld_logic.h \ gallivm/lp_bld_misc.cpp \ gallivm/lp_bld_misc.h \ + gallivm/lp_bld_nir.c \ + gallivm/lp_bld_nir.h \ + gallivm/lp_bld_nir_soa.c \ gallivm/lp_bld_pack.c \ gallivm/lp_bld_pack.h \ gallivm/lp_bld_printf.c \ @@ -461,6 +450,8 @@ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ gallivm/lp_bld_type.h \ + nir/nir_to_tgsi_info.c \ + nir/nir_to_tgsi_info.h \ draw/draw_llvm.c \ draw/draw_llvm.h \ draw/draw_llvm_sample.c \ diff -Nru mesa-19.2.8/src/gallium/auxiliary/meson.build mesa-20.0.8/src/gallium/auxiliary/meson.build --- mesa-19.2.8/src/gallium/auxiliary/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -244,7 +244,6 @@ 'util/u_debug_flush.h', 'util/u_debug_image.c', 'util/u_debug_image.h', - 'util/u_debug_memory.c', 'util/u_debug_refcnt.c', 'util/u_debug_refcnt.h', 'util/u_debug_stack.c', @@ -264,26 +263,6 @@ 'util/u_dump.h', 'util/u_dump_state.c', 'util/u_fifo.h', - 'util/u_format.c', - 'util/u_format.h', - 'util/u_format_bptc.c', - 'util/u_format_bptc.h', - 'util/u_format_etc.c', - 'util/u_format_etc.h', - 'util/u_format_latc.c', - 'util/u_format_latc.h', - 'util/u_format_other.c', - 'util/u_format_other.h', - 'util/u_format_rgtc.c', - 'util/u_format_rgtc.h', - 'util/u_format_s3tc.c', - 'util/u_format_s3tc.h', - 'util/u_format_tests.c', - 'util/u_format_tests.h', - 'util/u_format_yuv.c', - 'util/u_format_yuv.h', - 'util/u_format_zs.c', - 'util/u_format_zs.h', 'util/u_framebuffer.c', 'util/u_framebuffer.h', 'util/u_gen_mipmap.c', @@ -302,10 +281,10 @@ 'util/u_inlines.h', 'util/u_linear.c', 'util/u_linear.h', + 'util/u_live_shader_cache.c', + 'util/u_live_shader_cache.h', 'util/u_log.c', 'util/u_log.h', - 'util/u_mm.c', - 'util/u_mm.h', 'util/u_network.c', 'util/u_network.h', 'util/u_pack_color.h', @@ -327,6 +306,8 @@ 'util/u_screen.h', 'util/u_simple_shaders.c', 'util/u_simple_shaders.h', + 'util/u_split_draw.c', + 'util/u_split_draw.h', 'util/u_split_prim.h', 'util/u_sse.h', 'util/u_suballoc.c', @@ -356,6 +337,8 @@ 'util/u_viewport.h', 'nir/tgsi_to_nir.c', 'nir/tgsi_to_nir.h', + 'nir/nir_draw_helpers.c', + 'nir/nir_draw_helpers.h', ) if dep_libdrm.found() @@ -379,6 +362,8 @@ 'gallivm/lp_bld_const.h', 'gallivm/lp_bld_conv.c', 'gallivm/lp_bld_conv.h', + 'gallivm/lp_bld_coro.c', + 'gallivm/lp_bld_coro.h', 'gallivm/lp_bld_debug.cpp', 'gallivm/lp_bld_debug.h', 'gallivm/lp_bld_flow.c', @@ -399,11 +384,16 @@ 'gallivm/lp_bld_init.h', 'gallivm/lp_bld_intr.c', 'gallivm/lp_bld_intr.h', + 'gallivm/lp_bld_ir_common.c', + 'gallivm/lp_bld_ir_common.h', 'gallivm/lp_bld_limits.h', 'gallivm/lp_bld_logic.c', 'gallivm/lp_bld_logic.h', 'gallivm/lp_bld_misc.cpp', 'gallivm/lp_bld_misc.h', + 'gallivm/lp_bld_nir.h', + 'gallivm/lp_bld_nir.c', + 'gallivm/lp_bld_nir_soa.c', 'gallivm/lp_bld_pack.c', 'gallivm/lp_bld_pack.h', 'gallivm/lp_bld_printf.c', @@ -433,6 +423,8 @@ 'draw/draw_llvm_sample.c', 'draw/draw_pt_fetch_shade_pipeline_llvm.c', 'draw/draw_vs_llvm.c', + 'nir/nir_to_tgsi_info.c', + 'nir/nir_to_tgsi_info.h', ) endif @@ -506,18 +498,9 @@ capture : true, ) -u_format_table_c = custom_target( - 'u_format_table.c', - input : ['util/u_format_table.py', 'util/u_format.csv'], - output : 'u_format_table.c', - command : [prog_python, '@INPUT@'], - depend_files : files('util/u_format_pack.py', 'util/u_format_parse.py'), - capture : true, -) - libgallium = static_library( 'gallium', - [files_libgallium, u_indices_gen_c, u_unfilled_gen_c, u_format_table_c], + [files_libgallium, u_indices_gen_c, u_unfilled_gen_c], include_directories : [ inc_loader, inc_gallium, inc_src, inc_include, include_directories('util') ], @@ -525,12 +508,9 @@ cpp_args : [cpp_vis_args, cpp_msvc_compat_args], dependencies : [ dep_libdrm, dep_llvm, dep_unwind, dep_dl, dep_m, dep_thread, dep_lmsensors, - idep_nir_headers, + idep_nir, idep_nir_headers, idep_mesautil, ], - build_by_default : false, - link_with: [ - libglsl - ] + build_by_default : false ) libgalliumvl_stub = static_library( diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/nir_draw_helpers.c mesa-20.0.8/src/gallium/auxiliary/nir/nir_draw_helpers.c --- mesa-19.2.8/src/gallium/auxiliary/nir/nir_draw_helpers.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/nir_draw_helpers.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,372 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +/* + * NIR lowering passes to handle the draw stages for + * - pstipple + * - aaline + * - aapoint. + * + * These are all ported from the equivalent TGSI transforms. + */ + +#include "nir.h" +#include "tgsi/tgsi_from_mesa.h" +#include "nir_builder.h" + +#include "nir_draw_helpers.h" + +typedef struct { + nir_builder b; + nir_shader *shader; + bool fs_pos_is_sysval; + nir_variable *stip_tex; + nir_ssa_def *fragcoord; +} lower_pstipple; + +static nir_ssa_def * +load_frag_coord(nir_builder *b) +{ + int max_driver_loc = -1; + nir_foreach_variable(var, &b->shader->inputs) { + if (var->data.location == VARYING_SLOT_POS) + return nir_load_var(b, var); + if (max_driver_loc < (int)var->data.driver_location) + max_driver_loc = var->data.driver_location; + } + + nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in, + glsl_vec4_type(), NULL); + pos->data.location = VARYING_SLOT_POS; + pos->data.interpolation = INTERP_MODE_NOPERSPECTIVE; + pos->data.driver_location = max_driver_loc + 1; + b->shader->num_inputs++; + return nir_load_var(b, pos); +} + +static void +nir_lower_pstipple_block(nir_block *block, + lower_pstipple *state) +{ + nir_builder *b = &state->b; + nir_ssa_def *texcoord; + + b->cursor = nir_before_block(block); + + nir_ssa_def *div32 = nir_imm_vec2(b, 1.0/32.0, 1.0/32.0); + + nir_ssa_def *frag_coord = state->fs_pos_is_sysval ? nir_load_frag_coord(b) : load_frag_coord(b); + + texcoord = nir_fmul(b, frag_coord, div32); + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + tex->op = nir_texop_tex; + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + tex->coord_components = 2; + tex->dest_type = nir_type_float; + tex->texture_index = state->stip_tex->data.binding; + tex->sampler_index = state->stip_tex->data.binding; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(texcoord); + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); + + nir_builder_instr_insert(b, &tex->instr); + + nir_ssa_def *condition = nir_f2b32(b, nir_channel(b, &tex->dest.ssa, 3)); + nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); + discard->src[0] = nir_src_for_ssa(condition); + nir_builder_instr_insert(b, &discard->instr); + b->shader->info.fs.uses_discard = true; +} + +static void +nir_lower_pstipple_impl(nir_function_impl *impl, + lower_pstipple *state) +{ + nir_builder *b = &state->b; + + nir_builder_init(b, impl); + + nir_block *start = nir_start_block(impl); + nir_lower_pstipple_block(start, state); +} + +void +nir_lower_pstipple_fs(struct nir_shader *shader, + unsigned *samplerUnitOut, + unsigned fixedUnit, + bool fs_pos_is_sysval) +{ + lower_pstipple state = { + .shader = shader, + .fs_pos_is_sysval = fs_pos_is_sysval, + }; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return; + + int binding = 0; + nir_foreach_variable(var, &shader->uniforms) { + if (glsl_type_is_sampler(var->type)) { + if (var->data.binding >= binding) + binding = var->data.binding + 1; + } + } + const struct glsl_type *sampler2D = + glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, false, GLSL_TYPE_FLOAT); + + nir_variable *tex_var = nir_variable_create(shader, nir_var_uniform, sampler2D, "stipple_tex"); + tex_var->data.binding = binding; + tex_var->data.explicit_binding = true; + tex_var->data.how_declared = nir_var_hidden; + + shader->info.textures_used |= (1 << binding); + state.stip_tex = tex_var; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_lower_pstipple_impl(function->impl, &state); + } + } + *samplerUnitOut = binding; +} + +typedef struct { + nir_builder b; + nir_shader *shader; + nir_variable *line_width_input; +} lower_aaline; + +static void +nir_lower_aaline_block(nir_block *block, + lower_aaline *state) +{ + nir_builder *b = &state->b; + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_variable *var = nir_intrinsic_get_var(intrin, 0); + if (var->data.mode != nir_var_shader_out) + continue; + if (var->data.location != FRAG_RESULT_COLOR) + continue; + + nir_ssa_def *out_input = intrin->src[1].ssa; + b->cursor = nir_before_instr(instr); + nir_ssa_def *lw = nir_load_var(b, state->line_width_input); + nir_ssa_def *tmp = nir_fsat(b, nir_fadd(b, nir_channel(b, lw, 1), + nir_fneg(b, nir_fabs(b, nir_channel(b, lw, 0))))); + nir_ssa_def *tmp1 = nir_fsat(b, nir_fadd(b, nir_channel(b, lw, 3), + nir_fneg(b, nir_fabs(b, nir_channel(b, lw, 2))))); + + tmp = nir_fmul(b, tmp, tmp1); + tmp = nir_fmul(b, nir_channel(b, out_input, 3), tmp); + + nir_ssa_def *out = nir_vec4(b, nir_channel(b, out_input, 0), + nir_channel(b, out_input, 1), + nir_channel(b, out_input, 2), + tmp); + nir_instr_rewrite_src(instr, &intrin->src[1], nir_src_for_ssa(out)); + } + +} + +static void +nir_lower_aaline_impl(nir_function_impl *impl, + lower_aaline *state) +{ + nir_builder *b = &state->b; + + nir_builder_init(b, impl); + + nir_foreach_block(block, impl) { + nir_lower_aaline_block(block, state); + } +} + +void +nir_lower_aaline_fs(struct nir_shader *shader, int *varying) +{ + lower_aaline state = { + .shader = shader, + }; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return; + + int highest_location = -1, highest_drv_location = -1; + nir_foreach_variable(var, &shader->inputs) { + if ((int)var->data.location > highest_location) + highest_location = var->data.location; + if ((int)var->data.driver_location > highest_drv_location) + highest_drv_location = var->data.driver_location; + } + + nir_variable *line_width = nir_variable_create(shader, nir_var_shader_in, + glsl_vec4_type(), "aaline"); + if (highest_location == -1 || highest_location < VARYING_SLOT_VAR0) { + line_width->data.location = VARYING_SLOT_VAR0; + line_width->data.driver_location = highest_drv_location + 1; + } else { + line_width->data.location = highest_location + 1; + line_width->data.driver_location = highest_drv_location + 1; + } + shader->num_inputs++; + *varying = tgsi_get_generic_gl_varying_index(line_width->data.location, true); + state.line_width_input = line_width; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_lower_aaline_impl(function->impl, &state); + } + } +} + +typedef struct { + nir_builder b; + nir_shader *shader; + nir_variable *input; +} lower_aapoint; + +static void +nir_lower_aapoint_block(nir_block *block, + lower_aapoint *state, nir_ssa_def *sel) +{ + nir_builder *b = &state->b; + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_variable *var = nir_intrinsic_get_var(intrin, 0); + if (var->data.mode != nir_var_shader_out) + continue; + if (var->data.location != FRAG_RESULT_COLOR) + continue; + + nir_ssa_def *out_input = intrin->src[1].ssa; + b->cursor = nir_before_instr(instr); + + nir_ssa_def *tmp = nir_fmul(b, nir_channel(b, out_input, 3), sel); + nir_ssa_def *out = nir_vec4(b, nir_channel(b, out_input, 0), + nir_channel(b, out_input, 1), + nir_channel(b, out_input, 2), + tmp); + nir_instr_rewrite_src(instr, &intrin->src[1], nir_src_for_ssa(out)); + } + +} + +static void +nir_lower_aapoint_impl(nir_function_impl *impl, + lower_aapoint *state) +{ + nir_builder *b = &state->b; + + nir_builder_init(b, impl); + + nir_block *block = nir_start_block(impl); + b->cursor = nir_before_block(block); + + nir_ssa_def *aainput = nir_load_var(b, state->input); + + nir_ssa_def *dist = nir_fadd(b, nir_fmul(b, nir_channel(b, aainput, 0), nir_channel(b, aainput, 0)), + nir_fmul(b, nir_channel(b, aainput, 1), nir_channel(b, aainput, 1))); + + nir_ssa_def *k = nir_channel(b, aainput, 2); + nir_ssa_def *chan_val_one = nir_channel(b, aainput, 3); + nir_ssa_def *comp = nir_flt32(b, chan_val_one, dist); + + nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); + discard->src[0] = nir_src_for_ssa(comp); + nir_builder_instr_insert(b, &discard->instr); + b->shader->info.fs.uses_discard = true; + + /* compute coverage factor = (1-d)/(1-k) */ + /* 1 - k */ + nir_ssa_def *tmp = nir_fadd(b, chan_val_one, nir_fneg(b, k)); + /* 1.0 / (1 - k) */ + tmp = nir_frcp(b, tmp); + + /* 1 - d */ + nir_ssa_def *tmp2 = nir_fadd(b, chan_val_one, nir_fneg(b, dist)); + + /* (1 - d) / (1 - k) */ + nir_ssa_def *coverage = nir_fmul(b, tmp, tmp2); + + /* if (k >= distance) + * sel = coverage; + * else + * sel = 1.0; + */ + nir_ssa_def *sel = nir_b32csel(b, nir_fge32(b, k, dist), coverage, chan_val_one); + + nir_foreach_block(block, impl) { + nir_lower_aapoint_block(block, state, sel); + } +} + +void +nir_lower_aapoint_fs(struct nir_shader *shader, int *varying) +{ + lower_aapoint state = { + .shader = shader, + }; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return; + + int highest_location = -1, highest_drv_location = -1; + nir_foreach_variable(var, &shader->inputs) { + if ((int)var->data.location > highest_location) + highest_location = var->data.location; + if ((int)var->data.driver_location > highest_drv_location) + highest_drv_location = var->data.driver_location; + } + + nir_variable *aapoint_input = nir_variable_create(shader, nir_var_shader_in, + glsl_vec4_type(), "aapoint"); + if (highest_location == -1 || highest_location < VARYING_SLOT_VAR0) { + aapoint_input->data.location = VARYING_SLOT_VAR0; + } else { + aapoint_input->data.location = highest_location + 1; + } + aapoint_input->data.driver_location = highest_drv_location + 1; + + shader->num_inputs++; + *varying = tgsi_get_generic_gl_varying_index(aapoint_input->data.location, true); + state.input = aapoint_input; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_lower_aapoint_impl(function->impl, &state); + } + } +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/nir_draw_helpers.h mesa-20.0.8/src/gallium/auxiliary/nir/nir_draw_helpers.h --- mesa-19.2.8/src/gallium/auxiliary/nir/nir_draw_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/nir_draw_helpers.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,40 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#ifndef NIR_DRAW_HELPERS_H +#define NIR_DRAW_HELPERS_H +struct nir_shader; +void +nir_lower_pstipple_fs(struct nir_shader *shader, + unsigned *samplerUnitOut, + unsigned fixedUnit, + bool fs_pos_is_sysval); + +void +nir_lower_aaline_fs(struct nir_shader *shader, int *varying); + +void +nir_lower_aapoint_fs(struct nir_shader *shader, int *varying); +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.c mesa-20.0.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.c --- mesa-19.2.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,766 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is ported mostly out of radeonsi, if we can drop TGSI, we can likely + * make a lot this go away. + */ + +#include "nir_to_tgsi_info.h" +#include "util/u_math.h" +#include "nir.h" +#include "nir_deref.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_from_mesa.h" + +static nir_variable* tex_get_texture_var(nir_tex_instr *instr) +{ + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src)); + default: + break; + } + } + + return NULL; +} + +static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr) +{ + return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); +} + + +static void gather_usage_helper(const nir_deref_instr **deref_ptr, + unsigned location, + uint8_t mask, + uint8_t *usage_mask) +{ + for (; *deref_ptr; deref_ptr++) { + const nir_deref_instr *deref = *deref_ptr; + switch (deref->deref_type) { + case nir_deref_type_array: { + unsigned elem_size = + glsl_count_attribute_slots(deref->type, false); + if (nir_src_is_const(deref->arr.index)) { + location += elem_size * nir_src_as_uint(deref->arr.index); + } else { + unsigned array_elems = + glsl_get_length(deref_ptr[-1]->type); + for (unsigned i = 0; i < array_elems; i++) { + gather_usage_helper(deref_ptr + 1, + location + elem_size * i, + mask, usage_mask); + } + return; + } + break; + } + case nir_deref_type_struct: { + const struct glsl_type *parent_type = + deref_ptr[-1]->type; + unsigned index = deref->strct.index; + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + location += glsl_count_attribute_slots(ft, false); + } + break; + } + default: + unreachable("Unhandled deref type in gather_components_used_helper"); + } + } + + usage_mask[location] |= mask & 0xf; + if (mask & 0xf0) + usage_mask[location + 1] |= (mask >> 4) & 0xf; +} + +static void gather_usage(const nir_deref_instr *deref, + uint8_t mask, + uint8_t *usage_mask) +{ + nir_deref_path path; + nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); + + unsigned location_frac = path.path[0]->var->data.location_frac; + if (glsl_type_is_64bit(deref->type)) { + uint8_t new_mask = 0; + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + new_mask |= 0x3 << (2 * i); + } + mask = new_mask << location_frac; + } else { + mask <<= location_frac; + mask &= 0xf; + } + + gather_usage_helper((const nir_deref_instr **)&path.path[1], + path.path[0]->var->data.driver_location, + mask, usage_mask); + + nir_deref_path_finish(&path); +} + +static void gather_intrinsic_load_deref_info(const nir_shader *nir, + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + bool need_texcoord, + nir_variable *var, + struct tgsi_shader_info *info) +{ + assert(var && var->data.mode == nir_var_shader_in); + + gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), + info->input_usage_mask); + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: { + break; + } + default: { + unsigned semantic_name, semantic_index; + tgsi_get_gl_varying_semantic(var->data.location, need_texcoord, + &semantic_name, &semantic_index); + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa); + info->colors_read |= mask << (semantic_index * 4); + } + if (semantic_name == TGSI_SEMANTIC_FACE) { + info->uses_frontface = true; + } + break; + } + } +} + +static void scan_instruction(const struct nir_shader *nir, + bool need_texcoord, + struct tgsi_shader_info *info, + nir_instr *instr) +{ + if (instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(instr); + + switch (alu->op) { + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + nir_variable *texture = tex_get_texture_var(tex); + + if (!texture) { + info->samplers_declared |= + u_bit_consecutive(tex->sampler_index, 1); + } else { + if (texture->data.bindless) + info->uses_bindless_samplers = true; + } + + switch (tex->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_front_face: + info->uses_frontface = 1; + break; + case nir_intrinsic_load_instance_id: + info->uses_instanceid = 1; + break; + case nir_intrinsic_load_invocation_id: + info->uses_invocationid = true; + break; + case nir_intrinsic_load_num_work_groups: + info->uses_grid_size = true; + break; + case nir_intrinsic_load_local_group_size: + /* The block size is translated to IMM with a fixed block size. */ + if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + info->uses_block_size = true; + break; + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: { + unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa); + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (intr->intrinsic == nir_intrinsic_load_work_group_id) + info->uses_block_id[i] = true; + else + info->uses_thread_id[i] = true; + } + break; + } + case nir_intrinsic_load_vertex_id: + info->uses_vertexid = 1; + break; + case nir_intrinsic_load_vertex_id_zero_base: + info->uses_vertexid_nobase = 1; + break; + case nir_intrinsic_load_base_vertex: + info->uses_basevertex = 1; + break; + case nir_intrinsic_load_draw_id: + info->uses_drawid = 1; + break; + case nir_intrinsic_load_primitive_id: + info->uses_primid = 1; + break; + case nir_intrinsic_load_sample_mask_in: + info->reads_samplemask = true; + break; + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_tess_level_outer: + info->reads_tess_factors = true; + break; + case nir_intrinsic_bindless_image_load: + info->uses_bindless_images = true; + + if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) + info->uses_bindless_buffer_load = true; + else + info->uses_bindless_image_load = true; + break; + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_bindless_image_samples: + info->uses_bindless_images = true; + break; + case nir_intrinsic_bindless_image_store: + info->uses_bindless_images = true; + + if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) + info->uses_bindless_buffer_store = true; + else + info->uses_bindless_image_store = true; + + info->writes_memory = true; + break; + case nir_intrinsic_image_deref_store: + info->writes_memory = true; + break; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + info->uses_bindless_images = true; + + if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) + info->uses_bindless_buffer_atomic = true; + else + info->uses_bindless_image_atomic = true; + + info->writes_memory = true; + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + info->writes_memory = true; + break; + case nir_intrinsic_store_ssbo: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + info->writes_memory = true; + break; + case nir_intrinsic_load_deref: { + nir_variable *var = intrinsic_get_var(intr); + nir_variable_mode mode = var->data.mode; + nir_deref_instr *const deref = nir_src_as_deref(intr->src[0]); + enum glsl_base_type base_type = + glsl_get_base_type(glsl_without_array(var->type)); + + if (nir_deref_instr_has_indirect(deref)) { + if (mode == nir_var_shader_in) + info->indirect_files |= (1 << TGSI_FILE_INPUT); + } + if (mode == nir_var_shader_in) { + gather_intrinsic_load_deref_info(nir, intr, deref, need_texcoord, var, info); + + switch (var->data.interpolation) { + case INTERP_MODE_NONE: + if (glsl_base_type_is_integer(base_type)) + break; + + /* fall-through */ + case INTERP_MODE_SMOOTH: + if (var->data.sample) + info->uses_persp_sample = true; + else if (var->data.centroid) + info->uses_persp_centroid = true; + else + info->uses_persp_center = true; + break; + + case INTERP_MODE_NOPERSPECTIVE: + if (var->data.sample) + info->uses_linear_sample = true; + else if (var->data.centroid) + info->uses_linear_centroid = true; + else + info->uses_linear_center = true; + break; + } + } + break; + } + case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_sample: + case nir_intrinsic_interp_deref_at_offset: { + enum glsl_interp_mode interp = intrinsic_get_var(intr)->data.interpolation; + switch (interp) { + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid) + info->uses_persp_opcode_interp_centroid = true; + else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample) + info->uses_persp_opcode_interp_sample = true; + else + info->uses_persp_opcode_interp_offset = true; + break; + case INTERP_MODE_NOPERSPECTIVE: + if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid) + info->uses_linear_opcode_interp_centroid = true; + else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample) + info->uses_linear_opcode_interp_sample = true; + else + info->uses_linear_opcode_interp_offset = true; + break; + case INTERP_MODE_FLAT: + break; + default: + unreachable("Unsupported interpoation type"); + } + break; + } + default: + break; + } + } +} + +void nir_tgsi_scan_shader(const struct nir_shader *nir, + struct tgsi_shader_info *info, + bool need_texcoord) +{ + nir_function *func; + unsigned i; + + info->processor = pipe_shader_type_from_mesa(nir->info.stage); + info->num_tokens = 2; /* indicate that the shader is non-empty */ + info->num_instructions = 2; + + info->properties[TGSI_PROPERTY_NEXT_SHADER] = + pipe_shader_type_from_mesa(nir->info.next_stage); + + if (nir->info.stage == MESA_SHADER_VERTEX) { + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = + nir->info.vs.window_space_position; + } + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = + nir->info.tess.tcs_vertices_out; + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + if (nir->info.tess.primitive_mode == GL_ISOLINES) + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES; + else + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode; + + STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == + PIPE_TESS_SPACING_FRACTIONAL_ODD); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == + PIPE_TESS_SPACING_FRACTIONAL_EVEN); + + info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3; + info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw; + info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode; + } + + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive; + info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive; + info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out; + info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations; + } + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] = + nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage; + info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage; + + if (nir->info.fs.pixel_center_integer) { + info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = + TGSI_FS_COORD_PIXEL_CENTER_INTEGER; + } + + if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) { + switch (nir->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_ANY: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY; + break; + case FRAG_DEPTH_LAYOUT_GREATER: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER; + break; + case FRAG_DEPTH_LAYOUT_LESS: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS; + break; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED; + break; + default: + unreachable("Unknow depth layout"); + } + } + } + + if (gl_shader_stage_is_compute(nir->info.stage)) { + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2]; + } + + i = 0; + uint64_t processed_inputs = 0; + nir_foreach_variable(variable, &nir->inputs) { + unsigned semantic_name, semantic_index; + + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned attrib_count = glsl_count_attribute_slots(type, + nir->info.stage == MESA_SHADER_VERTEX); + + i = variable->data.driver_location; + + /* Vertex shader inputs don't have semantics. The state + * tracker has already mapped them to attributes via + * variable->data.driver_location. + */ + if (nir->info.stage == MESA_SHADER_VERTEX) { + continue; + } + + for (unsigned j = 0; j < attrib_count; j++, i++) { + + if (processed_inputs & ((uint64_t)1 << i)) + continue; + + processed_inputs |= ((uint64_t)1 << i); + + tgsi_get_gl_varying_semantic(variable->data.location + j, need_texcoord, + &semantic_name, &semantic_index); + + info->input_semantic_name[i] = semantic_name; + info->input_semantic_index[i] = semantic_index; + + if (semantic_name == TGSI_SEMANTIC_PRIMID) + info->uses_primid = true; + + enum glsl_base_type base_type = + glsl_get_base_type(glsl_without_array(variable->type)); + + switch (variable->data.interpolation) { + case INTERP_MODE_NONE: + if (glsl_base_type_is_integer(base_type)) { + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR; + break; + } + /* fall-through */ + + case INTERP_MODE_SMOOTH: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE; + break; + + case INTERP_MODE_NOPERSPECTIVE: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR; + break; + + case INTERP_MODE_FLAT: + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + } + } + + info->num_inputs = nir->num_inputs; + info->file_max[TGSI_FILE_INPUT] = nir->num_inputs - 1; + + i = 0; + uint64_t processed_outputs = 0; + unsigned num_outputs = 0; + nir_foreach_variable(variable, &nir->outputs) { + unsigned semantic_name, semantic_index; + + i = variable->data.driver_location; + + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned attrib_count = glsl_count_attribute_slots(type, false); + for (unsigned k = 0; k < attrib_count; k++, i++) { + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + tgsi_get_gl_frag_result_semantic(variable->data.location + k, + &semantic_name, &semantic_index); + + /* Adjust for dual source blending */ + if (variable->data.index > 0) { + semantic_index++; + } + } else { + tgsi_get_gl_varying_semantic(variable->data.location + k, need_texcoord, + &semantic_name, &semantic_index); + } + + unsigned num_components = 4; + unsigned vector_elements = glsl_get_vector_elements(glsl_without_array(variable->type)); + if (vector_elements) + num_components = vector_elements; + + unsigned component = variable->data.location_frac; + if (glsl_type_is_64bit(glsl_without_array(variable->type))) { + if (glsl_type_is_dual_slot(glsl_without_array(variable->type)) && k % 2) { + num_components = (num_components * 2) - 4; + component = 0; + } else { + num_components = MIN2(num_components * 2, 4); + } + } + + ubyte usagemask = 0; + for (unsigned j = component; j < num_components + component; j++) { + switch (j) { + case 0: + usagemask |= TGSI_WRITEMASK_X; + break; + case 1: + usagemask |= TGSI_WRITEMASK_Y; + break; + case 2: + usagemask |= TGSI_WRITEMASK_Z; + break; + case 3: + usagemask |= TGSI_WRITEMASK_W; + break; + default: + unreachable("error calculating component index"); + } + } + + unsigned gs_out_streams; + if (variable->data.stream & NIR_STREAM_PACKED) { + gs_out_streams = variable->data.stream & ~NIR_STREAM_PACKED; + } else { + assert(variable->data.stream < 4); + gs_out_streams = 0; + for (unsigned j = 0; j < num_components; ++j) + gs_out_streams |= variable->data.stream << (2 * (component + j)); + } + + unsigned streamx = gs_out_streams & 3; + unsigned streamy = (gs_out_streams >> 2) & 3; + unsigned streamz = (gs_out_streams >> 4) & 3; + unsigned streamw = (gs_out_streams >> 6) & 3; + + if (usagemask & TGSI_WRITEMASK_X) { + info->output_usagemask[i] |= TGSI_WRITEMASK_X; + info->output_streams[i] |= streamx; + info->num_stream_output_components[streamx]++; + } + if (usagemask & TGSI_WRITEMASK_Y) { + info->output_usagemask[i] |= TGSI_WRITEMASK_Y; + info->output_streams[i] |= streamy << 2; + info->num_stream_output_components[streamy]++; + } + if (usagemask & TGSI_WRITEMASK_Z) { + info->output_usagemask[i] |= TGSI_WRITEMASK_Z; + info->output_streams[i] |= streamz << 4; + info->num_stream_output_components[streamz]++; + } + if (usagemask & TGSI_WRITEMASK_W) { + info->output_usagemask[i] |= TGSI_WRITEMASK_W; + info->output_streams[i] |= streamw << 6; + info->num_stream_output_components[streamw]++; + } + + /* make sure we only count this location once against + * the num_outputs counter. + */ + if (processed_outputs & ((uint64_t)1 << i)) + continue; + + processed_outputs |= ((uint64_t)1 << i); + num_outputs++; + + info->output_semantic_name[i] = semantic_name; + info->output_semantic_index[i] = semantic_index; + + switch (semantic_name) { + case TGSI_SEMANTIC_PRIMID: + info->writes_primid = true; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + info->writes_viewport_index = true; + break; + case TGSI_SEMANTIC_LAYER: + info->writes_layer = true; + break; + case TGSI_SEMANTIC_PSIZE: + info->writes_psize = true; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + info->writes_clipvertex = true; + break; + case TGSI_SEMANTIC_COLOR: + info->colors_written |= 1 << semantic_index; + break; + case TGSI_SEMANTIC_STENCIL: + info->writes_stencil = true; + break; + case TGSI_SEMANTIC_SAMPLEMASK: + info->writes_samplemask = true; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->writes_edgeflag = true; + break; + case TGSI_SEMANTIC_POSITION: + if (info->processor == PIPE_SHADER_FRAGMENT) + info->writes_z = true; + else + info->writes_position = true; + break; + } + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + switch (semantic_name) { + case TGSI_SEMANTIC_PATCH: + info->reads_perpatch_outputs = true; + break; + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + info->reads_tessfactor_outputs = true; + break; + default: + info->reads_pervertex_outputs = true; + } + } + } + + unsigned loc = variable->data.location; + if (nir->info.stage == MESA_SHADER_FRAGMENT && + loc == FRAG_RESULT_COLOR && + nir->info.outputs_written & (1ull << loc)) { + assert(attrib_count == 1); + info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true; + } + } + + info->num_outputs = num_outputs; + + info->const_file_max[0] = nir->num_uniforms - 1; + info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); + if (nir->num_uniforms > 0) + info->const_buffers_declared |= 1; + info->images_declared = u_bit_consecutive(0, nir->info.num_images); + info->samplers_declared = nir->info.textures_used; + + info->file_max[TGSI_FILE_SAMPLER] = util_last_bit(info->samplers_declared) - 1; + info->file_max[TGSI_FILE_SAMPLER_VIEW] = info->file_max[TGSI_FILE_SAMPLER]; + info->file_mask[TGSI_FILE_SAMPLER] = info->file_mask[TGSI_FILE_SAMPLER_VIEW] = info->samplers_declared; + info->file_max[TGSI_FILE_IMAGE] = util_last_bit(info->images_declared) - 1; + info->file_mask[TGSI_FILE_IMAGE] = info->images_declared; + + info->num_written_clipdistance = nir->info.clip_distance_array_size; + info->num_written_culldistance = nir->info.cull_distance_array_size; + info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance); + info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance); + + if (info->processor == PIPE_SHADER_FRAGMENT) + info->uses_kill = nir->info.fs.uses_discard; + + func = (struct nir_function *)exec_list_get_head_const(&nir->functions); + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) + scan_instruction(nir, need_texcoord, info, instr); + } +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.h mesa-20.0.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.h --- mesa-19.2.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/nir_to_tgsi_info.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,42 @@ +/* + * Copyright 2019 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _NIR_TO_TGSI_INFO_H_ +#define _NIR_TO_TGSI_INFO_H_ + +#include +struct nir_shader; +struct tgsi_shader_info; + +/* only llvmpipe uses this path, so handle draw not using llvm */ +#ifdef LLVM_AVAILABLE +void nir_tgsi_scan_shader(const struct nir_shader *nir, + struct tgsi_shader_info *info, + bool need_texcoord); +#else +static inline void nir_tgsi_scan_shader(const struct nir_shader *nir, + struct tgsi_shader_info *info, + bool need_texcoord) {} +#endif + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/tgsi_to_nir.c mesa-20.0.8/src/gallium/auxiliary/nir/tgsi_to_nir.c --- mesa-19.2.8/src/gallium/auxiliary/nir/tgsi_to_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/tgsi_to_nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,8 +28,6 @@ #include "compiler/nir/nir.h" #include "compiler/nir/nir_control_flow.h" #include "compiler/nir/nir_builder.h" -#include "compiler/glsl/gl_nir.h" -#include "compiler/glsl/list.h" #include "compiler/shader_enums.h" #include "tgsi_to_nir.h" @@ -74,6 +72,10 @@ nir_variable *images[PIPE_MAX_SHADER_IMAGES]; nir_variable *ssbo[PIPE_MAX_SHADER_BUFFERS]; + unsigned num_samplers; + unsigned num_images; + unsigned num_msaa_images; + nir_variable *input_var_face; nir_variable *input_var_position; nir_variable *input_var_point; @@ -102,7 +104,6 @@ /* How many TGSI_FILE_IMMEDIATE vec4s have been parsed so far. */ unsigned next_imm; - bool cap_scalar; bool cap_face_is_sysval; bool cap_position_is_sysval; bool cap_point_is_sysval; @@ -439,6 +440,10 @@ } else { var->data.location = tgsi_varying_semantic_to_slot(semantic_name, semantic_index); + if (var->data.location == VARYING_SLOT_FOGC || + var->data.location == VARYING_SLOT_PSIZ) { + var->type = glsl_float_type(); + } } if (is_array) { @@ -657,7 +662,9 @@ unreachable("bad system value"); } - if (load->num_components == 3) + if (load->num_components == 2) + load = nir_swizzle(b, load, SWIZ(X, Y, Y, Y), 4); + else if (load->num_components == 3) load = nir_swizzle(b, load, SWIZ(X, Y, Z, Z), 4); src = nir_src_for_ssa(load); @@ -1111,7 +1118,11 @@ static void ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src) { + /* flt must be exact, because NaN shouldn't discard. (apps rely on this) */ + b->exact = true; nir_ssa_def *cmp = nir_bany(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))); + b->exact = false; + nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); discard->src[0] = nir_src_for_ssa(cmp); @@ -1307,7 +1318,8 @@ enum glsl_sampler_dim dim, bool is_shadow, bool is_array, - enum glsl_base_type base_type) + enum glsl_base_type base_type, + nir_texop op) { nir_variable *var = c->samplers[binding]; if (!var) { @@ -1317,7 +1329,17 @@ "sampler"); var->data.binding = binding; var->data.explicit_binding = true; + c->samplers[binding] = var; + c->num_samplers = MAX2(c->num_samplers, binding + 1); + + /* Record textures used */ + unsigned mask = 1 << binding; + c->build.shader->info.textures_used |= mask; + if (op == nir_texop_txf || + op == nir_texop_txf_ms || + op == nir_texop_txf_ms_mcs) + c->build.shader->info.textures_used_by_txf |= mask; } return var; @@ -1339,9 +1361,13 @@ var = nir_variable_create(c->build.shader, nir_var_uniform, type, "image"); var->data.binding = binding; var->data.explicit_binding = true; - var->data.image.access = access; + var->data.access = access; var->data.image.format = format; + c->images[binding] = var; + c->num_images = MAX2(c->num_images, binding + 1); + if (dim == GLSL_SAMPLER_DIM_MS) + c->num_msaa_images = c->num_images; } return var; @@ -1502,7 +1528,8 @@ get_sampler_var(c, sview, instr->sampler_dim, instr->is_shadow, instr->is_array, - base_type_for_alu_type(instr->dest_type)); + base_type_for_alu_type(instr->dest_type), + op); nir_deref_instr *deref = nir_build_deref_var(b, var); @@ -1666,7 +1693,8 @@ get_sampler_var(c, tex_index, txs->sampler_dim, txs->is_shadow, txs->is_array, - base_type_for_alu_type(txs->dest_type)); + base_type_for_alu_type(txs->dest_type), + nir_texop_txs); nir_deref_instr *deref = nir_build_deref_var(b, var); @@ -1902,7 +1930,7 @@ nir_deref_instr *image_deref = nir_build_deref_var(b, image); const struct glsl_type *type = image_deref->type; - nir_intrinsic_set_access(instr, image_deref->var->data.image.access); + nir_intrinsic_set_access(instr, image_deref->var->data.access); instr->src[0] = nir_src_for_ssa(&image_deref->dest.ssa); instr->src[1] = nir_src_for_ssa(src[addr_src_index]); @@ -1914,8 +1942,13 @@ instr->src[2] = nir_src_for_ssa(nir_ssa_undef(b, 1, 32)); } + if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_LOAD) { + instr->src[3] = nir_src_for_ssa(nir_imm_int(b, 0)); /* LOD */ + } + if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_STORE) { instr->src[3] = nir_src_for_ssa(nir_swizzle(b, src[1], SWIZ(X, Y, Z, W), 4)); + instr->src[4] = nir_src_for_ssa(nir_imm_int(b, 0)); /* LOD */ } instr->num_components = 4; @@ -2362,6 +2395,12 @@ store_value = nir_channel(b, store_value, 2); else if (var->data.location == FRAG_RESULT_STENCIL) store_value = nir_channel(b, store_value, 1); + } else { + /* FOGC and PSIZ are scalar values */ + if (var->data.location == VARYING_SLOT_FOGC || + var->data.location == VARYING_SLOT_PSIZ) { + store_value = nir_channel(b, store_value, 0); + } } nir_store_deref(b, nir_build_deref_var(b, var), store_value, @@ -2407,7 +2446,6 @@ ttn_read_pipe_caps(struct ttn_compile *c, struct pipe_screen *screen) { - c->cap_scalar = screen->get_shader_param(screen, c->scan->processor, PIPE_SHADER_CAP_SCALAR_ISA); c->cap_packed_uniforms = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS); c->cap_samplers_as_deref = screen->get_param(screen, PIPE_CAP_NIR_SAMPLERS_AS_DEREF); c->cap_face_is_sysval = screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL); @@ -2458,6 +2496,10 @@ s->num_inputs = scan.file_max[TGSI_FILE_INPUT] + 1; s->num_uniforms = scan.const_file_max[0] + 1; s->num_outputs = scan.file_max[TGSI_FILE_OUTPUT] + 1; + s->info.num_ssbos = util_last_bit(scan.shader_buffers_declared); + s->info.num_ubos = util_last_bit(scan.const_buffers_declared >> 1); + s->info.num_images = util_last_bit(scan.images_declared); + s->info.num_textures = util_last_bit(scan.samplers_declared); for (unsigned i = 0; i < TGSI_PROPERTY_COUNT; i++) { unsigned value = scan.properties[i]; @@ -2548,7 +2590,7 @@ } static void -ttn_optimize_nir(nir_shader *nir, bool scalar) +ttn_optimize_nir(nir_shader *nir) { bool progress; do { @@ -2556,8 +2598,8 @@ NIR_PASS_V(nir, nir_lower_vars_to_ssa); - if (scalar) { - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + if (nir->options->lower_to_scalar) { + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(nir, nir_lower_phis_to_scalar); } @@ -2599,7 +2641,7 @@ * so we have to do it here too. */ static void -ttn_finalize_nir(struct ttn_compile *c) +ttn_finalize_nir(struct ttn_compile *c, struct pipe_screen *screen) { struct nir_shader *nir = c->build.shader; @@ -2614,13 +2656,20 @@ if (c->cap_packed_uniforms) NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16); - if (c->cap_samplers_as_deref) - NIR_PASS_V(nir, gl_nir_lower_samplers_as_deref, NULL); - else - NIR_PASS_V(nir, gl_nir_lower_samplers, NULL); + if (!c->cap_samplers_as_deref) + NIR_PASS_V(nir, nir_lower_samplers); + + if (screen->finalize_nir) { + screen->finalize_nir(screen, nir, true); + } else { + ttn_optimize_nir(nir); + nir_shader_gather_info(nir, c->build.impl); + } + + nir->info.num_images = c->num_images; + nir->info.num_textures = c->num_samplers; + nir->info.last_msaa_image = c->num_msaa_images - 1; - ttn_optimize_nir(nir, c->cap_scalar); - nir_shader_gather_info(nir, c->build.impl); nir_validate_shader(nir, "TTN: after all optimizations"); } @@ -2633,7 +2682,7 @@ c = ttn_compile_init(tgsi_tokens, NULL, screen); s = c->build.shader; - ttn_finalize_nir(c); + ttn_finalize_nir(c, screen); ralloc_free(c); return s; diff -Nru mesa-19.2.8/src/gallium/auxiliary/nir/tgsi_to_nir.h mesa-20.0.8/src/gallium/auxiliary/nir/tgsi_to_nir.h --- mesa-19.2.8/src/gallium/auxiliary/nir/tgsi_to_nir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/nir/tgsi_to_nir.h 2020-06-12 01:21:16.000000000 +0000 @@ -21,6 +21,9 @@ * IN THE SOFTWARE. */ +#ifndef TGSI_TO_NIR_H +#define TGSI_TO_NIR_H + #include "compiler/nir/nir.h" #include "pipe/p_screen.h" @@ -31,3 +34,5 @@ struct nir_shader * tgsi_to_nir_noscreen(const void *tgsi_tokens, const nir_shader_compiler_options *options); + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c 2020-06-12 01:21:16.000000000 +0000 @@ -250,7 +250,7 @@ assert(!fenced_buf->fence); assert(fenced_buf->head.prev); assert(fenced_buf->head.next); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_unfenced); --fenced_mgr->num_unfenced; @@ -276,10 +276,10 @@ p_atomic_inc(&fenced_buf->base.reference.count); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_unfenced); --fenced_mgr->num_unfenced; - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->fenced); + list_addtail(&fenced_buf->head, &fenced_mgr->fenced); ++fenced_mgr->num_fenced; } @@ -305,11 +305,11 @@ assert(fenced_buf->head.prev); assert(fenced_buf->head.next); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_fenced); --fenced_mgr->num_fenced; - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + list_addtail(&fenced_buf->head, &fenced_mgr->unfenced); ++fenced_mgr->num_unfenced; if (p_atomic_dec_zero(&fenced_buf->base.reference.count)) { @@ -939,7 +939,7 @@ assert(fenced_buf->buffer || fenced_buf->data); - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + list_addtail(&fenced_buf->head, &fenced_mgr->unfenced); ++fenced_mgr->num_unfenced; mtx_unlock(&fenced_mgr->mutex); @@ -1027,10 +1027,10 @@ fenced_mgr->max_buffer_size = max_buffer_size; fenced_mgr->max_cpu_total_size = max_cpu_total_size; - LIST_INITHEAD(&fenced_mgr->fenced); + list_inithead(&fenced_mgr->fenced); fenced_mgr->num_fenced = 0; - LIST_INITHEAD(&fenced_mgr->unfenced); + list_inithead(&fenced_mgr->unfenced); fenced_mgr->num_unfenced = 0; (void) mtx_init(&fenced_mgr->mutex, mtx_plain); diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c 2020-06-12 01:21:16.000000000 +0000 @@ -236,7 +236,7 @@ pb_debug_buffer_check(buf); mtx_lock(&mgr->mutex); - LIST_DEL(&buf->head); + list_del(&buf->head); mtx_unlock(&mgr->mutex); mtx_destroy(&buf->mutex); @@ -391,7 +391,7 @@ #if 0 mtx_lock(&mgr->mutex); debug_printf("%s: failed to create buffer\n", __FUNCTION__); - if(!LIST_IS_EMPTY(&mgr->list)) + if(!list_is_empty(&mgr->list)) pb_debug_manager_dump_locked(mgr); mtx_unlock(&mgr->mutex); #endif @@ -421,7 +421,7 @@ (void) mtx_init(&buf->mutex, mtx_plain); mtx_lock(&mgr->mutex); - LIST_ADDTAIL(&buf->head, &mgr->list); + list_addtail(&buf->head, &mgr->list); mtx_unlock(&mgr->mutex); return &buf->base; @@ -444,7 +444,7 @@ struct pb_debug_manager *mgr = pb_debug_manager(_mgr); mtx_lock(&mgr->mutex); - if(!LIST_IS_EMPTY(&mgr->list)) { + if(!list_is_empty(&mgr->list)) { debug_printf("%s: unfreed buffers\n", __FUNCTION__); pb_debug_manager_dump_locked(mgr); } @@ -477,7 +477,7 @@ mgr->overflow_size = overflow_size; (void) mtx_init(&mgr->mutex, mtx_plain); - LIST_INITHEAD(&mgr->list); + list_inithead(&mgr->list); return &mgr->base; } diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c 2020-06-12 01:21:16.000000000 +0000 @@ -204,17 +204,17 @@ buf->mapCount = 0; - LIST_DEL(list); - LIST_ADDTAIL(list, &slab->freeBuffers); + list_del(list); + list_addtail(list, &slab->freeBuffers); slab->numFree++; if (slab->head.next == &slab->head) - LIST_ADDTAIL(&slab->head, &mgr->slabs); + list_addtail(&slab->head, &mgr->slabs); /* If the slab becomes totally empty, free it */ if (slab->numFree == slab->numBuffers) { list = &slab->head; - LIST_DELINIT(list); + list_delinit(list); pb_reference(&slab->bo, NULL); FREE(slab->buffers); FREE(slab); @@ -333,8 +333,8 @@ goto out_err1; } - LIST_INITHEAD(&slab->head); - LIST_INITHEAD(&slab->freeBuffers); + list_inithead(&slab->head); + list_inithead(&slab->freeBuffers); slab->numBuffers = numBuffers; slab->numFree = 0; slab->mgr = mgr; @@ -350,13 +350,13 @@ buf->start = i* mgr->bufSize; buf->mapCount = 0; cnd_init(&buf->event); - LIST_ADDTAIL(&buf->head, &slab->freeBuffers); + list_addtail(&buf->head, &slab->freeBuffers); slab->numFree++; buf++; } /* Add this slab to the list of partial slabs */ - LIST_ADDTAIL(&slab->head, &mgr->slabs); + list_addtail(&slab->head, &mgr->slabs); return PIPE_OK; @@ -412,10 +412,10 @@ /* If totally full remove from the partial slab list */ if (--slab->numFree == 0) - LIST_DELINIT(list); + list_delinit(list); list = slab->freeBuffers.next; - LIST_DELINIT(list); + list_delinit(list); mtx_unlock(&mgr->mutex); buf = LIST_ENTRY(struct pb_slab_buffer, list, head); @@ -470,7 +470,7 @@ mgr->slabSize = slabSize; mgr->desc = *desc; - LIST_INITHEAD(&mgr->slabs); + list_inithead(&mgr->slabs); (void) mtx_init(&mgr->mutex, mtx_plain); diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_cache.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_cache.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_cache.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,7 +42,7 @@ assert(!pipe_is_referenced(&buf->reference)); if (entry->head.next) { - LIST_DEL(&entry->head); + list_del(&entry->head); assert(mgr->num_buffers); --mgr->num_buffers; mgr->cache_size -= buf->size; @@ -104,7 +104,7 @@ entry->start = os_time_get(); entry->end = entry->start + mgr->usecs; - LIST_ADDTAIL(&entry->head, cache); + list_addtail(&entry->head, cache); ++mgr->num_buffers; mgr->cache_size += buf->size; mtx_unlock(&mgr->mutex); @@ -208,7 +208,7 @@ struct pb_buffer *buf = entry->buffer; mgr->cache_size -= buf->size; - LIST_DEL(&entry->head); + list_del(&entry->head); --mgr->num_buffers; mtx_unlock(&mgr->mutex); /* Increase refcount */ @@ -290,7 +290,7 @@ return; for (i = 0; i < num_heaps; i++) - LIST_INITHEAD(&mgr->buckets[i]); + list_inithead(&mgr->buckets[i]); (void) mtx_init(&mgr->mutex, mtx_plain); mgr->cache_size = 0; diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_slab.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_slab.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_slab.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_slab.c 2020-06-12 01:21:16.000000000 +0000 @@ -55,18 +55,18 @@ { struct pb_slab *slab = entry->slab; - LIST_DEL(&entry->head); /* remove from reclaim list */ - LIST_ADD(&entry->head, &slab->free); + list_del(&entry->head); /* remove from reclaim list */ + list_add(&entry->head, &slab->free); slab->num_free++; /* Add slab to the group's list if it isn't already linked. */ if (!slab->head.next) { struct pb_slab_group *group = &slabs->groups[entry->group_index]; - LIST_ADDTAIL(&slab->head, &group->slabs); + list_addtail(&slab->head, &group->slabs); } if (slab->num_free >= slab->num_entries) { - LIST_DEL(&slab->head); + list_del(&slab->head); slabs->slab_free(slabs->priv, slab); } } @@ -74,7 +74,7 @@ static void pb_slabs_reclaim_locked(struct pb_slabs *slabs) { - while (!LIST_IS_EMPTY(&slabs->reclaim)) { + while (!list_is_empty(&slabs->reclaim)) { struct pb_slab_entry *entry = LIST_ENTRY(struct pb_slab_entry, slabs->reclaim.next, head); @@ -114,20 +114,20 @@ /* If there is no candidate slab at all, or the first slab has no free * entries, try reclaiming entries. */ - if (LIST_IS_EMPTY(&group->slabs) || - LIST_IS_EMPTY(&LIST_ENTRY(struct pb_slab, group->slabs.next, head)->free)) + if (list_is_empty(&group->slabs) || + list_is_empty(&LIST_ENTRY(struct pb_slab, group->slabs.next, head)->free)) pb_slabs_reclaim_locked(slabs); /* Remove slabs without free entries. */ - while (!LIST_IS_EMPTY(&group->slabs)) { + while (!list_is_empty(&group->slabs)) { slab = LIST_ENTRY(struct pb_slab, group->slabs.next, head); - if (!LIST_IS_EMPTY(&slab->free)) + if (!list_is_empty(&slab->free)) break; - LIST_DEL(&slab->head); + list_del(&slab->head); } - if (LIST_IS_EMPTY(&group->slabs)) { + if (list_is_empty(&group->slabs)) { /* Drop the mutex temporarily to prevent a deadlock where the allocation * calls back into slab functions (most likely to happen for * pb_slab_reclaim if memory is low). @@ -141,11 +141,11 @@ return NULL; mtx_lock(&slabs->mutex); - LIST_ADD(&slab->head, &group->slabs); + list_add(&slab->head, &group->slabs); } entry = LIST_ENTRY(struct pb_slab_entry, slab->free.next, head); - LIST_DEL(&entry->head); + list_del(&entry->head); slab->num_free--; mtx_unlock(&slabs->mutex); @@ -163,7 +163,7 @@ pb_slab_free(struct pb_slabs* slabs, struct pb_slab_entry *entry) { mtx_lock(&slabs->mutex); - LIST_ADDTAIL(&entry->head, &slabs->reclaim); + list_addtail(&entry->head, &slabs->reclaim); mtx_unlock(&slabs->mutex); } @@ -212,7 +212,7 @@ slabs->slab_alloc = slab_alloc; slabs->slab_free = slab_free; - LIST_INITHEAD(&slabs->reclaim); + list_inithead(&slabs->reclaim); num_groups = slabs->num_orders * slabs->num_heaps; slabs->groups = CALLOC(num_groups, sizeof(*slabs->groups)); @@ -221,7 +221,7 @@ for (i = 0; i < num_groups; ++i) { struct pb_slab_group *group = &slabs->groups[i]; - LIST_INITHEAD(&group->slabs); + list_inithead(&group->slabs); } (void) mtx_init(&slabs->mutex, mtx_plain); @@ -241,7 +241,7 @@ /* Reclaim all slab entries (even those that are still in flight). This * implicitly calls slab_free for everything. */ - while (!LIST_IS_EMPTY(&slabs->reclaim)) { + while (!list_is_empty(&slabs->reclaim)) { struct pb_slab_entry *entry = LIST_ENTRY(struct pb_slab_entry, slabs->reclaim.next, head); pb_slab_reclaim(slabs, entry); diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_validate.c mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_validate.c --- mesa-19.2.8/src/gallium/auxiliary/pipebuffer/pb_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipebuffer/pb_validate.c 2020-06-12 01:21:16.000000000 +0000 @@ -78,7 +78,7 @@ flags &= PB_USAGE_GPU_READ_WRITE; if (ht) { - unsigned long entry_idx = (unsigned long) util_hash_table_get(ht, buf); + unsigned entry_idx = (unsigned)(uintptr_t)util_hash_table_get(ht, buf); if (entry_idx) { struct pb_validate_entry *entry = &vl->entries[entry_idx - 1]; @@ -118,7 +118,7 @@ ++vl->used; if (ht) - util_hash_table_set(ht, buf, (void *) (unsigned long) vl->used); + util_hash_table_set(ht, buf, (void *) (uintptr_t) vl->used); return PIPE_OK; } diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h mesa-20.0.8/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h --- mesa-19.2.8/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h 2020-06-12 01:21:16.000000000 +0000 @@ -38,5 +38,7 @@ DRI_CONF_SECTION_MISCELLANEOUS DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false") DRI_CONF_GLSL_ZERO_INIT("false") + DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false") DRI_CONF_ALLOW_RGB10_CONFIGS("true") + DRI_CONF_ALLOW_FP16_CONFIGS("false") DRI_CONF_SECTION_END diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipe-loader/meson.build mesa-20.0.8/src/gallium/auxiliary/pipe-loader/meson.build --- mesa-19.2.8/src/gallium/auxiliary/pipe-loader/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipe-loader/meson.build 2020-06-12 01:21:16.000000000 +0000 @@ -27,9 +27,11 @@ ) libpipe_loader_defines = [] +libpipe_loader_links = [] if dep_libdrm.found() files_pipe_loader += files('pipe_loader_drm.c') + libpipe_loader_links += libloader endif if with_dri libpipe_loader_defines += '-DHAVE_PIPE_LOADER_DRI' @@ -37,10 +39,13 @@ if with_gallium_drisw_kms libpipe_loader_defines += '-DHAVE_PIPE_LOADER_KMS' endif +if not (with_gallium_st_nine or with_gallium_opencl) + libpipe_loader_defines += '-DDROP_PIPE_LOADER_MISC' +endif libpipe_loader_static = static_library( 'pipe_loader_static', - [files_pipe_loader, xmlpool_options_h], + files_pipe_loader, include_directories : [ inc_util, inc_loader, inc_gallium, inc_include, inc_src, inc_gallium_aux, inc_gallium_winsys, @@ -53,7 +58,7 @@ libpipe_loader_dynamic = static_library( 'pipe_loader_dynamic', - [files_pipe_loader, xmlpool_options_h], + files_pipe_loader, include_directories : [ inc_util, inc_loader, inc_gallium, inc_include, inc_src, inc_gallium_aux, inc_gallium_winsys, @@ -64,7 +69,7 @@ join_paths(get_option('prefix'), get_option('libdir'), 'gallium-pipe') ) ], - link_with : libloader, + link_with : [libpipe_loader_links], dependencies : [dep_libdrm, idep_xmlconfig], build_by_default : false, ) diff -Nru mesa-19.2.8/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c mesa-20.0.8/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c --- mesa-19.2.8/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c 2020-06-12 01:21:16.000000000 +0000 @@ -135,6 +135,10 @@ .driver_name = "lima", .create_screen = pipe_lima_create_screen, }, + { + .driver_name = "zink", + .create_screen = pipe_zink_create_screen, + }, }; static const struct drm_driver_descriptor default_driver_descriptor = { @@ -208,6 +212,11 @@ plib = &ddev->lib; #endif ddev->dd = get_driver_descriptor(ddev->base.driver_name, plib); + + /* kmsro supports lots of drivers, try as a fallback */ + if (!ddev->dd) + ddev->dd = get_driver_descriptor("kmsro", plib); + if (!ddev->dd) goto fail; @@ -322,7 +331,7 @@ const struct drm_driver_descriptor *dd = get_driver_descriptor(driver_name, &lib); - if (dd && dd->driconf_xml) + if (dd && dd->driconf_xml && *dd->driconf_xml) xml = strdup(*dd->driconf_xml); if (lib) diff -Nru mesa-19.2.8/src/gallium/auxiliary/renderonly/renderonly.c mesa-20.0.8/src/gallium/auxiliary/renderonly/renderonly.c --- mesa-19.2.8/src/gallium/auxiliary/renderonly/renderonly.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/renderonly/renderonly.c 2020-06-12 01:21:16.000000000 +0000 @@ -33,7 +33,7 @@ #include "state_tracker/drm_driver.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" @@ -55,7 +55,7 @@ renderonly_scanout_destroy(struct renderonly_scanout *scanout, struct renderonly *ro) { - struct drm_mode_destroy_dumb destroy_dumb = { }; + struct drm_mode_destroy_dumb destroy_dumb = {0}; if (ro->kms_fd != -1) { destroy_dumb.handle = scanout->handle; @@ -76,7 +76,7 @@ .height = rsc->height0, .bpp = util_format_get_blocksizebits(rsc->format), }; - struct drm_mode_destroy_dumb destroy_dumb = { }; + struct drm_mode_destroy_dumb destroy_dumb = {0}; scanout = CALLOC_STRUCT(renderonly_scanout); if (!scanout) diff -Nru mesa-19.2.8/src/gallium/auxiliary/SConscript mesa-20.0.8/src/gallium/auxiliary/SConscript --- mesa-19.2.8/src/gallium/auxiliary/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/SConscript 2020-06-12 01:21:16.000000000 +0000 @@ -6,6 +6,8 @@ '#src', 'indices', 'util', + '#src/compiler/nir', + '../../compiler/nir', ]) env = env.Clone() @@ -26,18 +28,6 @@ command = python_cmd + ' $SCRIPT > $TARGET' ) -env.CodeGenerate( - target = 'util/u_format_table.c', - script = '#src/gallium/auxiliary/util/u_format_table.py', - source = ['#src/gallium/auxiliary/util/u_format.csv'], - command = python_cmd + ' $SCRIPT $SOURCE > $TARGET' -) - -env.Depends('util/u_format_table.c', [ - '#src/gallium/auxiliary/util/u_format_parse.py', - 'util/u_format_pack.py', -]) - source = env.ParseSourceList('Makefile.sources', [ 'C_SOURCES', 'VL_STUB_SOURCES', @@ -46,6 +36,7 @@ if env['llvm']: source += env.ParseSourceList('Makefile.sources', [ + 'NIR_SOURCES', 'GALLIVM_SOURCES', ]) diff -Nru mesa-19.2.8/src/gallium/auxiliary/target-helpers/drm_helper.h mesa-20.0.8/src/gallium/auxiliary/target-helpers/drm_helper.h --- mesa-19.2.8/src/gallium/auxiliary/target-helpers/drm_helper.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/target-helpers/drm_helper.h 2020-06-12 01:21:16.000000000 +0000 @@ -414,4 +414,26 @@ #endif +#ifdef GALLIUM_ZINK +#include "zink/zink_public.h" + +struct pipe_screen * +pipe_zink_create_screen(int fd, const struct pipe_screen_config *config) +{ + struct pipe_screen *screen; + screen = zink_drm_create_screen(fd); + return screen ? debug_screen_wrap(screen) : NULL; +} + +#else + +struct pipe_screen * +pipe_zink_create_screen(int fd, const struct pipe_screen_config *config) +{ + fprintf(stderr, "zink: driver missing\n"); + return NULL; +} + +#endif + #endif /* DRM_HELPER_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/target-helpers/drm_helper_public.h mesa-20.0.8/src/gallium/auxiliary/target-helpers/drm_helper_public.h --- mesa-19.2.8/src/gallium/auxiliary/target-helpers/drm_helper_public.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/target-helpers/drm_helper_public.h 2020-06-12 01:21:16.000000000 +0000 @@ -4,10 +4,10 @@ struct pipe_screen; struct pipe_screen_config; -const char *iris_driconf_xml; -const char *radeonsi_driconf_xml; -const char *v3d_driconf_xml; -const char *virgl_driconf_xml; +extern const char *iris_driconf_xml; +extern const char *radeonsi_driconf_xml; +extern const char *v3d_driconf_xml; +extern const char *virgl_driconf_xml; struct pipe_screen * pipe_i915_create_screen(int fd, const struct pipe_screen_config *config); @@ -60,4 +60,8 @@ struct pipe_screen * pipe_lima_create_screen(int fd, const struct pipe_screen_config *config); +struct pipe_screen * +pipe_zink_create_screen(int fd, const struct pipe_screen_config *config); + + #endif /* _DRM_HELPER_PUBLIC_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/target-helpers/inline_sw_helper.h mesa-20.0.8/src/gallium/auxiliary/target-helpers/inline_sw_helper.h --- mesa-19.2.8/src/gallium/auxiliary/target-helpers/inline_sw_helper.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/target-helpers/inline_sw_helper.h 2020-06-12 01:21:16.000000000 +0000 @@ -55,6 +55,11 @@ screen = swr_create_screen(winsys); #endif +#if defined(GALLIUM_ZINK) + if (screen == NULL && strcmp(driver, "zink") == 0) + screen = zink_create_screen(winsys); +#endif + return screen; } @@ -71,6 +76,8 @@ default_driver = "softpipe"; #elif defined(GALLIUM_SWR) default_driver = "swr"; +#elif defined(GALLIUM_ZINK) + default_driver = "zink"; #else default_driver = ""; #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/target-helpers/sw_helper.h mesa-20.0.8/src/gallium/auxiliary/target-helpers/sw_helper.h --- mesa-19.2.8/src/gallium/auxiliary/target-helpers/sw_helper.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/target-helpers/sw_helper.h 2020-06-12 01:21:16.000000000 +0000 @@ -12,6 +12,10 @@ * llvmpipe, softpipe, swr. */ +#ifdef GALLIUM_ZINK +#include "zink/zink_public.h" +#endif + #ifdef GALLIUM_SOFTPIPE #include "softpipe/sp_public.h" #endif @@ -57,6 +61,11 @@ screen = swr_create_screen(winsys); #endif +#if defined(GALLIUM_ZINK) + if (screen == NULL && strcmp(driver, "zink") == 0) + screen = zink_create_screen(winsys); +#endif + return screen; } @@ -73,6 +82,8 @@ default_driver = "softpipe"; #elif defined(GALLIUM_SWR) default_driver = "swr"; +#elif defined(GALLIUM_ZINK) + default_driver = "zink"; #else default_driver = ""; #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_exec.c mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_exec.c --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_exec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_exec.c 2020-06-12 01:21:16.000000000 +0000 @@ -1271,7 +1271,6 @@ tgsi_exec_machine_create(enum pipe_shader_type shader_type) { struct tgsi_exec_machine *mach; - uint i; mach = align_malloc( sizeof *mach, 16 ); if (!mach) diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_exec.h mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_exec.h --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_exec.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_exec.h 2020-06-12 01:21:16.000000000 +0000 @@ -530,8 +530,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return PIPE_MAX_SHADER_BUFFERS; case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.c 2020-06-12 01:21:16.000000000 +0000 @@ -123,6 +123,10 @@ *semantic_name = TGSI_SEMANTIC_VIEWPORT_INDEX; *semantic_index = 0; break; + case VARYING_SLOT_FACE: + *semantic_name = TGSI_SEMANTIC_FACE; + *semantic_index = 0; + break; case VARYING_SLOT_PNTC: *semantic_name = TGSI_SEMANTIC_PCOORD; *semantic_index = 0; diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_from_mesa.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,6 +32,11 @@ #include "compiler/shader_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + + void tgsi_get_gl_varying_semantic(gl_varying_slot attr, bool needs_texcoord_semantic, @@ -62,6 +67,7 @@ case MESA_SHADER_FRAGMENT: return PIPE_SHADER_FRAGMENT; case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: return PIPE_SHADER_COMPUTE; default: unreachable("bad shader stage"); @@ -83,4 +89,8 @@ } } +#ifdef __cplusplus +} +#endif + #endif /* TGSI_FROM_MESA_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_info.h mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_info.h --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_info.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_info.h 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #if defined __cplusplus extern "C" { diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,11 +29,11 @@ OPCODE(1, 1, REPL, EX2) OPCODE(1, 1, REPL, LG2) OPCODE(1, 2, REPL, POW) -OPCODE_GAP(31) /* removed */ +OPCODE(0, 0, NONE, DEMOTE) OPCODE(1, 1, COMP, U2I64) OPCODE(1, 0, OTHR, CLOCK) OPCODE(1, 1, COMP, I2I64) -OPCODE_GAP(35) /* removed */ +OPCODE(1, 0, COMP, READ_HELPER) OPCODE(1, 1, REPL, COS) OPCODE(1, 1, COMP, DDX) OPCODE(1, 1, COMP, DDY) diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_scan.c mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_scan.c --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_scan.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_scan.c 2020-06-12 01:21:16.000000000 +0000 @@ -293,6 +293,15 @@ !is_mem_query_inst(fullinst->Instruction.Opcode)) { *is_mem_inst = true; + if (src->Register.File == TGSI_FILE_IMAGE && + (fullinst->Memory.Texture == TGSI_TEXTURE_2D_MSAA || + fullinst->Memory.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) { + if (src->Register.Indirect) + info->msaa_images_declared = info->images_declared; + else + info->msaa_images_declared |= 1 << src->Register.Index; + } + if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) { info->writes_memory = TRUE; @@ -560,6 +569,14 @@ info->writes_memory = TRUE; if (dst->Register.File == TGSI_FILE_IMAGE) { + if (fullinst->Memory.Texture == TGSI_TEXTURE_2D_MSAA || + fullinst->Memory.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA) { + if (dst->Register.Indirect) + info->msaa_images_declared = info->images_declared; + else + info->msaa_images_declared |= 1 << dst->Register.Index; + } + if (dst->Register.Indirect) info->images_store = info->images_declared; else diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_scan.h mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_scan.h --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_scan.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_scan.h 2020-06-12 01:21:16.000000000 +0000 @@ -126,6 +126,7 @@ boolean uses_block_id[3]; boolean uses_block_size; boolean uses_grid_size; + boolean uses_subgroup_info; boolean writes_position; boolean writes_psize; boolean writes_clipvertex; @@ -144,6 +145,8 @@ unsigned num_written_clipdistance; unsigned images_declared; /**< bitmask of declared images */ + unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ + /** * Bitmask indicating which declared image is a buffer. */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_ureg.c mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_ureg.c --- mesa-19.2.8/src/gallium/auxiliary/tgsi/tgsi_ureg.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/tgsi/tgsi_ureg.c 2020-06-12 01:21:16.000000000 +0000 @@ -2133,7 +2133,7 @@ struct pipe_context *pipe, const struct pipe_stream_output_info *so ) { - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; pipe_shader_state_from_tgsi(&state, ureg_finalize(ureg)); if(!state.tokens) diff -Nru mesa-19.2.8/src/gallium/auxiliary/translate/translate_generic.c mesa-20.0.8/src/gallium/auxiliary/translate/translate_generic.c --- mesa-19.2.8/src/gallium/auxiliary/translate/translate_generic.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/translate/translate_generic.c 2020-06-12 01:21:16.000000000 +0000 @@ -31,7 +31,7 @@ */ #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_half.h" #include "util/u_math.h" #include "pipe/p_state.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/translate/translate_sse.c mesa-20.0.8/src/gallium/auxiliary/translate/translate_sse.c --- mesa-19.2.8/src/gallium/auxiliary/translate/translate_sse.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/translate/translate_sse.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_blit.c mesa-20.0.8/src/gallium/auxiliary/util/u_blit.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_blit.c 2020-06-12 01:21:16.000000000 +0000 @@ -42,7 +42,7 @@ #include "util/u_blit.h" #include "util/u_draw_quad.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_sampler.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_blitter.c mesa-20.0.8/src/gallium/auxiliary/util/u_blitter.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_blitter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_blitter.c 2020-06-12 01:21:16.000000000 +0000 @@ -38,7 +38,7 @@ #include "pipe/p_shader_tokens.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_blitter.h" @@ -334,8 +334,10 @@ pipe->screen->get_param(pipe->screen, PIPE_CAP_TGSI_VS_LAYER_VIEWPORT); /* set invariant vertex coordinates */ - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { + ctx->vertices[i][0][2] = 0; /*v.z*/ ctx->vertices[i][0][3] = 1; /*v.w*/ + } return &ctx->base; } @@ -791,8 +793,6 @@ int x1, int y1, int x2, int y2, float depth) { - int i; - /* set vertex positions */ ctx->vertices[0][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v0.x*/ ctx->vertices[0][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v0.y*/ @@ -806,17 +806,14 @@ ctx->vertices[3][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v3.x*/ ctx->vertices[3][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v3.y*/ - for (i = 0; i < 4; i++) - ctx->vertices[i][0][2] = depth; /*z*/ - /* viewport */ struct pipe_viewport_state viewport; viewport.scale[0] = 0.5f * ctx->dst_width; viewport.scale[1] = 0.5f * ctx->dst_height; - viewport.scale[2] = 1.0f; + viewport.scale[2] = 0.0f; viewport.translate[0] = 0.5f * ctx->dst_width; viewport.translate[1] = 0.5f * ctx->dst_height; - viewport.translate[2] = 0.0f; + viewport.translate[2] = depth; ctx->base.pipe->set_viewport_states(ctx->base.pipe, 0, 1, &viewport); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_compute.c mesa-20.0.8/src/gallium/auxiliary/util/u_compute.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_compute.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ #include "pipe/p_state.h" #include "u_bitcast.h" -#include "u_format.h" +#include "util/format/u_format.h" #include "u_sampler.h" #include "tgsi/tgsi_text.h" #include "tgsi/tgsi_ureg.h" @@ -120,12 +120,6 @@ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); - /* Initialize the sampler view. */ - u_sampler_view_default_template(&src_templ, src, src->format); - src_templ.format = util_format_linear(blit_info->src.format); - src_view = ctx->create_sampler_view(ctx, src, &src_templ); - ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, &src_view); - struct pipe_sampler_state sampler_state={0}; sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; @@ -140,6 +134,12 @@ sampler_state_p = ctx->create_sampler_state(ctx, &sampler_state); ctx->bind_sampler_states(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sampler_state_p); + /* Initialize the sampler view. */ + u_sampler_view_default_template(&src_templ, src, src->format); + src_templ.format = util_format_linear(blit_info->src.format); + src_view = ctx->create_sampler_view(ctx, src, &src_templ); + ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, &src_view); + if (!*compute_state) *compute_state = blit_compute_shader(ctx); ctx->bind_compute_state(ctx, *compute_state); diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_debug_describe.c mesa-20.0.8/src/gallium/auxiliary/util/u_debug_describe.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_debug_describe.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_debug_describe.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,7 +25,7 @@ **************************************************************************/ #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_debug_describe.h" #include "util/u_string.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_debug_flush.c mesa-20.0.8/src/gallium/auxiliary/util/u_debug_flush.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_debug_flush.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_debug_flush.c 2020-06-12 01:21:16.000000000 +0000 @@ -51,7 +51,7 @@ #include /* Future improvement: Use realloc instead? */ -#define DEBUG_FLUSH_MAP_DEPTH 16 +#define DEBUG_FLUSH_MAP_DEPTH 32 struct debug_map_item { struct debug_stack_frame *frame; @@ -111,7 +111,7 @@ static unsigned debug_flush_pointer_hash(void *key) { - return (unsigned) (unsigned long) key; + return (unsigned) (uintptr_t) key; } struct debug_flush_buf * diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_debug_gallium.c mesa-20.0.8/src/gallium/auxiliary/util/u_debug_gallium.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_debug_gallium.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_debug_gallium.c 2020-06-12 01:21:16.000000000 +0000 @@ -30,7 +30,7 @@ #include "util/u_debug.h" #include "u_debug_gallium.h" #include "u_dump.h" -#include "u_format.h" +#include "util/format/u_format.h" #ifdef DEBUG diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_debug_image.c mesa-20.0.8/src/gallium/auxiliary/util/u_debug_image.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_debug_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_debug_image.c 2020-06-12 01:21:16.000000000 +0000 @@ -25,7 +25,7 @@ #include "util/u_debug_image.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_string.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_debug_memory.c mesa-20.0.8/src/gallium/auxiliary/util/u_debug_memory.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_debug_memory.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_debug_memory.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,452 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/** - * @file - * Memory debugging. - * - * @author José Fonseca - */ - -#include "pipe/p_config.h" - -#define DEBUG_MEMORY_IMPLEMENTATION - -#include "os/os_thread.h" - -#include "util/u_debug.h" -#include "util/u_debug_gallium.h" -#include "util/u_debug_stack.h" -#include "util/list.h" -#include "util/os_memory.h" -#include "util/os_memory_debug.h" - - -#define DEBUG_MEMORY_MAGIC 0x6e34090aU -#define DEBUG_MEMORY_STACK 0 /* XXX: disabled until we have symbol lookup */ - -/** - * Set to 1 to enable checking of freed blocks of memory. - * Basically, don't really deallocate freed memory; keep it in the list - * but mark it as freed and do extra checking in debug_memory_check(). - * This can detect some cases of use-after-free. But note that since we - * never really free anything this will use a lot of memory. - */ -#define DEBUG_FREED_MEMORY 0 -#define DEBUG_FREED_BYTE 0x33 - - -struct debug_memory_header -{ - struct list_head head; - - unsigned long no; - const char *file; - unsigned line; - const char *function; -#if DEBUG_MEMORY_STACK - struct debug_stack_frame backtrace[DEBUG_MEMORY_STACK]; -#endif - size_t size; -#if DEBUG_FREED_MEMORY - boolean freed; /**< Is this a freed block? */ -#endif - - unsigned magic; - unsigned tag; -}; - -struct debug_memory_footer -{ - unsigned magic; -}; - - -static struct list_head list = { &list, &list }; - -static mtx_t list_mutex = _MTX_INITIALIZER_NP; - -static unsigned long last_no = 0; - - -static inline struct debug_memory_header * -header_from_data(void *data) -{ - if (data) - return (struct debug_memory_header *)((char *)data - sizeof(struct debug_memory_header)); - else - return NULL; -} - -static inline void * -data_from_header(struct debug_memory_header *hdr) -{ - if (hdr) - return (void *)((char *)hdr + sizeof(struct debug_memory_header)); - else - return NULL; -} - -static inline struct debug_memory_footer * -footer_from_header(struct debug_memory_header *hdr) -{ - if (hdr) - return (struct debug_memory_footer *)((char *)hdr + sizeof(struct debug_memory_header) + hdr->size); - else - return NULL; -} - - -void * -debug_malloc(const char *file, unsigned line, const char *function, - size_t size) -{ - struct debug_memory_header *hdr; - struct debug_memory_footer *ftr; - - hdr = os_malloc(sizeof(*hdr) + size + sizeof(*ftr)); - if (!hdr) { - debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", - file, line, function, - (long unsigned)size); - return NULL; - } - - hdr->no = last_no++; - hdr->file = file; - hdr->line = line; - hdr->function = function; - hdr->size = size; - hdr->magic = DEBUG_MEMORY_MAGIC; - hdr->tag = 0; -#if DEBUG_FREED_MEMORY - hdr->freed = FALSE; -#endif - -#if DEBUG_MEMORY_STACK - debug_backtrace_capture(hdr->backtrace, 0, DEBUG_MEMORY_STACK); -#endif - - ftr = footer_from_header(hdr); - ftr->magic = DEBUG_MEMORY_MAGIC; - - mtx_lock(&list_mutex); - LIST_ADDTAIL(&hdr->head, &list); - mtx_unlock(&list_mutex); - - return data_from_header(hdr); -} - -void -debug_free(const char *file, unsigned line, const char *function, - void *ptr) -{ - struct debug_memory_header *hdr; - struct debug_memory_footer *ftr; - - if (!ptr) - return; - - hdr = header_from_data(ptr); - if (hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: freeing bad or corrupted memory %p\n", - file, line, function, - ptr); - debug_assert(0); - return; - } - - ftr = footer_from_header(hdr); - if (ftr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: buffer overflow %p\n", - hdr->file, hdr->line, hdr->function, - ptr); - debug_assert(0); - } - -#if DEBUG_FREED_MEMORY - /* Check for double-free */ - assert(!hdr->freed); - /* Mark the block as freed but don't really free it */ - hdr->freed = TRUE; - /* Save file/line where freed */ - hdr->file = file; - hdr->line = line; - /* set freed memory to special value */ - memset(ptr, DEBUG_FREED_BYTE, hdr->size); -#else - mtx_lock(&list_mutex); - LIST_DEL(&hdr->head); - mtx_unlock(&list_mutex); - hdr->magic = 0; - ftr->magic = 0; - - os_free(hdr); -#endif -} - -void * -debug_calloc(const char *file, unsigned line, const char *function, - size_t count, size_t size ) -{ - void *ptr = debug_malloc( file, line, function, count * size ); - if (ptr) - memset( ptr, 0, count * size ); - return ptr; -} - -void * -debug_realloc(const char *file, unsigned line, const char *function, - void *old_ptr, size_t old_size, size_t new_size ) -{ - struct debug_memory_header *old_hdr, *new_hdr; - struct debug_memory_footer *old_ftr, *new_ftr; - void *new_ptr; - - if (!old_ptr) - return debug_malloc( file, line, function, new_size ); - - if (!new_size) { - debug_free( file, line, function, old_ptr ); - return NULL; - } - - old_hdr = header_from_data(old_ptr); - if (old_hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: reallocating bad or corrupted memory %p\n", - file, line, function, - old_ptr); - debug_assert(0); - return NULL; - } - - old_ftr = footer_from_header(old_hdr); - if (old_ftr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: buffer overflow %p\n", - old_hdr->file, old_hdr->line, old_hdr->function, - old_ptr); - debug_assert(0); - } - - /* alloc new */ - new_hdr = os_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr)); - if (!new_hdr) { - debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", - file, line, function, - (long unsigned)new_size); - return NULL; - } - new_hdr->no = old_hdr->no; - new_hdr->file = old_hdr->file; - new_hdr->line = old_hdr->line; - new_hdr->function = old_hdr->function; - new_hdr->size = new_size; - new_hdr->magic = DEBUG_MEMORY_MAGIC; - new_hdr->tag = 0; -#if DEBUG_FREED_MEMORY - new_hdr->freed = FALSE; -#endif - - new_ftr = footer_from_header(new_hdr); - new_ftr->magic = DEBUG_MEMORY_MAGIC; - - mtx_lock(&list_mutex); - LIST_REPLACE(&old_hdr->head, &new_hdr->head); - mtx_unlock(&list_mutex); - - /* copy data */ - new_ptr = data_from_header(new_hdr); - memcpy( new_ptr, old_ptr, old_size < new_size ? old_size : new_size ); - - /* free old */ - old_hdr->magic = 0; - old_ftr->magic = 0; - os_free(old_hdr); - - return new_ptr; -} - -unsigned long -debug_memory_begin(void) -{ - return last_no; -} - -void -debug_memory_end(unsigned long start_no) -{ - size_t total_size = 0; - struct list_head *entry; - - if (start_no == last_no) - return; - - entry = list.prev; - for (; entry != &list; entry = entry->prev) { - struct debug_memory_header *hdr; - void *ptr; - struct debug_memory_footer *ftr; - - hdr = LIST_ENTRY(struct debug_memory_header, entry, head); - ptr = data_from_header(hdr); - ftr = footer_from_header(hdr); - - if (hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: bad or corrupted memory %p\n", - hdr->file, hdr->line, hdr->function, - ptr); - debug_assert(0); - } - - if ((start_no <= hdr->no && hdr->no < last_no) || - (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) { - debug_printf("%s:%u:%s: %lu bytes at %p not freed\n", - hdr->file, hdr->line, hdr->function, - (unsigned long) hdr->size, ptr); -#if DEBUG_MEMORY_STACK - debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK); -#endif - total_size += hdr->size; - } - - if (ftr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: buffer overflow %p\n", - hdr->file, hdr->line, hdr->function, - ptr); - debug_assert(0); - } - } - - if (total_size) { - debug_printf("Total of %lu KB of system memory apparently leaked\n", - (unsigned long) (total_size + 1023)/1024); - } - else { - debug_printf("No memory leaks detected.\n"); - } -} - - -/** - * Put a tag (arbitrary integer) on a memory block. - * Can be useful for debugging. - */ -void -debug_memory_tag(void *ptr, unsigned tag) -{ - struct debug_memory_header *hdr; - - if (!ptr) - return; - - hdr = header_from_data(ptr); - if (hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s corrupted memory at %p\n", __FUNCTION__, ptr); - debug_assert(0); - } - - hdr->tag = tag; -} - - -/** - * Check the given block of memory for validity/corruption. - */ -void -debug_memory_check_block(void *ptr) -{ - struct debug_memory_header *hdr; - struct debug_memory_footer *ftr; - - if (!ptr) - return; - - hdr = header_from_data(ptr); - ftr = footer_from_header(hdr); - - if (hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: bad or corrupted memory %p\n", - hdr->file, hdr->line, hdr->function, ptr); - debug_assert(0); - } - - if (ftr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: buffer overflow %p\n", - hdr->file, hdr->line, hdr->function, ptr); - debug_assert(0); - } -} - - - -/** - * We can periodically call this from elsewhere to do a basic sanity - * check of the heap memory we've allocated. - */ -void -debug_memory_check(void) -{ - struct list_head *entry; - - entry = list.prev; - for (; entry != &list; entry = entry->prev) { - struct debug_memory_header *hdr; - struct debug_memory_footer *ftr; - const char *ptr; - - hdr = LIST_ENTRY(struct debug_memory_header, entry, head); - ftr = footer_from_header(hdr); - ptr = (const char *) data_from_header(hdr); - - if (hdr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: bad or corrupted memory %p\n", - hdr->file, hdr->line, hdr->function, ptr); - debug_assert(0); - } - - if (ftr->magic != DEBUG_MEMORY_MAGIC) { - debug_printf("%s:%u:%s: buffer overflow %p\n", - hdr->file, hdr->line, hdr->function, ptr); - debug_assert(0); - } - -#if DEBUG_FREED_MEMORY - /* If this block is marked as freed, check that it hasn't been touched */ - if (hdr->freed) { - int i; - for (i = 0; i < hdr->size; i++) { - if (ptr[i] != DEBUG_FREED_BYTE) { - debug_printf("Memory error: byte %d of block at %p of size %d is 0x%x\n", - i, ptr, hdr->size, ptr[i]); - debug_printf("Block was freed at %s:%d\n", hdr->file, hdr->line); - } - assert(ptr[i] == DEBUG_FREED_BYTE); - } - } -#endif - } -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_dirty_surfaces.h mesa-20.0.8/src/gallium/auxiliary/util/u_dirty_surfaces.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_dirty_surfaces.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_dirty_surfaces.h 2020-06-12 01:21:16.000000000 +0000 @@ -50,7 +50,7 @@ static inline void util_dirty_surfaces_init(struct util_dirty_surfaces *ds) { - LIST_INITHEAD(&ds->dirty_list); + list_inithead(&ds->dirty_list); } static inline void @@ -85,7 +85,7 @@ static inline void util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util_dirty_surfaces *dss, struct pipe_sampler_view *psv, struct pipe_sampler_state *pss, util_dirty_surface_flush_t flush) { - if(!LIST_IS_EMPTY(&dss->dirty_list)) + if(!list_is_empty(&dss->dirty_list)) util_dirty_surfaces_use_levels_for_sampling(pipe, dss, (unsigned)pss->min_lod + psv->u.tex.first_level, MIN2((unsigned)ceilf(pss->max_lod) + psv->u.tex.first_level, psv->u.tex.last_level), flush); } @@ -93,27 +93,27 @@ static inline void util_dirty_surface_init(struct util_dirty_surface *ds) { - LIST_INITHEAD(&ds->dirty_list); + list_inithead(&ds->dirty_list); } static inline boolean util_dirty_surface_is_dirty(struct util_dirty_surface *ds) { - return !LIST_IS_EMPTY(&ds->dirty_list); + return !list_is_empty(&ds->dirty_list); } static inline void util_dirty_surface_set_dirty(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds) { - if(LIST_IS_EMPTY(&ds->dirty_list)) - LIST_ADDTAIL(&ds->dirty_list, &dss->dirty_list); + if(list_is_empty(&ds->dirty_list)) + list_addtail(&ds->dirty_list, &dss->dirty_list); } static inline void util_dirty_surface_set_clean(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds) { - if(!LIST_IS_EMPTY(&ds->dirty_list)) - LIST_DELINIT(&ds->dirty_list); + if(!list_is_empty(&ds->dirty_list)) + list_delinit(&ds->dirty_list); } #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_draw.c mesa-20.0.8/src/gallium/auxiliary/util/u_draw.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_draw.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_debug.h" #include "util/u_inlines.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_draw.h" @@ -143,11 +143,28 @@ memcpy(&info, info_in, sizeof(info)); + uint32_t draw_count = info_in->indirect->draw_count; + + if (info_in->indirect->indirect_draw_count) { + struct pipe_transfer *dc_transfer; + uint32_t *dc_param = pipe_buffer_map_range(pipe, + info_in->indirect->indirect_draw_count, + info_in->indirect->indirect_draw_count_offset, + 4, PIPE_TRANSFER_READ, &dc_transfer); + if (!dc_transfer) { + debug_printf("%s: failed to map indirect draw count buffer\n", __FUNCTION__); + return; + } + if (dc_param[0] < draw_count) + draw_count = dc_param[0]; + pipe_buffer_unmap(pipe, dc_transfer); + } + params = (uint32_t *) pipe_buffer_map_range(pipe, info_in->indirect->buffer, info_in->indirect->offset, - num_params * sizeof(uint32_t), + (num_params * info_in->indirect->draw_count) * sizeof(uint32_t), PIPE_TRANSFER_READ, &transfer); if (!transfer) { @@ -155,14 +172,18 @@ return; } - info.count = params[0]; - info.instance_count = params[1]; - info.start = params[2]; - info.index_bias = info_in->index_size ? params[3] : 0; - info.start_instance = info_in->index_size ? params[4] : params[3]; - info.indirect = NULL; + for (unsigned i = 0; i < draw_count; i++) { + info.count = params[0]; + info.instance_count = params[1]; + info.start = params[2]; + info.index_bias = info_in->index_size ? params[3] : 0; + info.start_instance = info_in->index_size ? params[4] : params[3]; + info.drawid = i; + info.indirect = NULL; - pipe_buffer_unmap(pipe, transfer); + pipe->draw_vbo(pipe, &info); - pipe->draw_vbo(pipe, &info); + params += info_in->indirect->stride / 4; + } + pipe_buffer_unmap(pipe, transfer); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_dump_state.c mesa-20.0.8/src/gallium/auxiliary/util/u_dump_state.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_dump_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_dump_state.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_string.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tgsi/tgsi_dump.h" #include diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_fifo.h mesa-20.0.8/src/gallium/auxiliary/util/u_fifo.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_fifo.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_fifo.h 2020-06-12 01:21:16.000000000 +0000 @@ -80,7 +80,7 @@ *ptr = array[fifo->tail]; - ++fifo->num; + --fifo->num; return TRUE; } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_bptc.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_bptc.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_bptc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_bptc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,279 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. - * Copyright (c) 2008 VMware, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "u_format.h" -#include "u_format_bptc.h" -#include "util/format_srgb.h" -#include "util/u_math.h" - -#define BPTC_BLOCK_DECODE -#include "../../../mesa/main/texcompress_bptc_tmp.h" - -void -util_format_bptc_rgba_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - decompress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_rgba_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_rgba_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - uint8_t *temp_block; - temp_block = malloc(width * height * 4 * sizeof(uint8_t)); - decompress_rgba_unorm(width, height, - src_row, src_stride, - temp_block, width * 4 * sizeof(uint8_t)); - util_format_read_4f(PIPE_FORMAT_R8G8B8A8_UNORM, - dst_row, dst_stride, - temp_block, width * 4 * sizeof(uint8_t), - 0, 0, width, height); - free((void *) temp_block); -} - -void -util_format_bptc_rgba_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - uint8_t *temp_block; - temp_block = malloc(width * height * 4 * sizeof(uint8_t)); - util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT, - temp_block, width * 4 * sizeof(uint8_t), - src_row, src_stride, - 0, 0, width, height); - compress_rgba_unorm(width, height, - temp_block, width * 4 * sizeof(uint8_t), - dst_row, dst_stride); - free((void *) temp_block); -} - -void -util_format_bptc_rgba_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height) -{ - uint8_t temp_block[4]; - - fetch_rgba_unorm_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16, - temp_block, (width % 4) + (height % 4) * 4); - - util_format_read_4f(PIPE_FORMAT_R8G8B8A8_UNORM, - dst, 4 * sizeof(float), - temp_block, 4 * sizeof(uint8_t), - 0, 0, 1, 1); -} - -void -util_format_bptc_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - decompress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - uint8_t *temp_block; - temp_block = malloc(width * height * 4 * sizeof(uint8_t)); - decompress_rgba_unorm(width, height, - src_row, src_stride, - temp_block, width * 4 * sizeof(uint8_t)); - util_format_read_4f(PIPE_FORMAT_R8G8B8A8_SRGB, - dst_row, dst_stride, - temp_block, width * 4 * sizeof(uint8_t), - 0, 0, width, height); - free((void *) temp_block); -} - -void -util_format_bptc_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgb_float(width, height, - src_row, src_stride, - dst_row, dst_stride, - true); -} - -void -util_format_bptc_srgba_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height) -{ - uint8_t temp_block[4]; - - fetch_rgba_unorm_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16, - temp_block, (width % 4) + (height % 4) * 4); - util_format_read_4f(PIPE_FORMAT_R8G8B8A8_SRGB, - dst, 4 * sizeof(float), - temp_block, width * 4 * sizeof(uint8_t), - 0, 0, 1, 1); -} - -void -util_format_bptc_rgb_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - float *temp_block; - temp_block = malloc(width * height * 4 * sizeof(float)); - decompress_rgb_float(width, height, - src_row, src_stride, - temp_block, width * 4 * sizeof(float), - true); - util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT, - dst_row, dst_stride, - temp_block, width * 4 * sizeof(float), - 0, 0, width, height); - free((void *) temp_block); -} - -void -util_format_bptc_rgb_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_rgb_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - decompress_rgb_float(width, height, - src_row, src_stride, - dst_row, dst_stride, - true); -} - -void -util_format_bptc_rgb_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgb_float(width, height, - src_row, src_stride, - dst_row, dst_stride, - true); -} - -void -util_format_bptc_rgb_float_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height) -{ - fetch_rgb_float_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16, - dst, (width % 4) + (height % 4) * 4, true); -} - -void -util_format_bptc_rgb_ufloat_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - float *temp_block; - temp_block = malloc(width * height * 4 * sizeof(float)); - decompress_rgb_float(width, height, - src_row, src_stride, - temp_block, width * 4 * sizeof(float), - false); - util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT, - dst_row, dst_stride, - temp_block, width * 4 * sizeof(float), - 0, 0, width, height); - free((void *) temp_block); -} - -void -util_format_bptc_rgb_ufloat_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgba_unorm(width, height, - src_row, src_stride, - dst_row, dst_stride); -} - -void -util_format_bptc_rgb_ufloat_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - decompress_rgb_float(width, height, - src_row, src_stride, - dst_row, dst_stride, - false); -} - -void -util_format_bptc_rgb_ufloat_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - compress_rgb_float(width, height, - src_row, src_stride, - dst_row, dst_stride, - false); -} - -void -util_format_bptc_rgb_ufloat_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height) -{ - fetch_rgb_float_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16, - dst, (width % 4) + (height % 4) * 4, false); -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_bptc.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_bptc.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_bptc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_bptc.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,122 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#ifndef U_FORMAT_BPTC_H_ -#define U_FORMAT_BPTC_H_ - - -#include "pipe/p_compiler.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void -util_format_bptc_rgba_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgba_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgba_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgba_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgba_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height); - -void -util_format_bptc_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_srgba_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height); - -void -util_format_bptc_rgb_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_float_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height); - -void -util_format_bptc_rgb_ufloat_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_ufloat_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_ufloat_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_ufloat_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_bptc_rgb_ufloat_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned width, unsigned height); -#ifdef __cplusplus -} -#endif - -#endif /* U_FORMAT_BPTC_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format.c mesa-20.0.8/src/gallium/auxiliary/util/u_format.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,970 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/** - * @file - * Pixel format accessor functions. - * - * @author Jose Fonseca - */ - -#include "util/u_memory.h" -#include "u_format.h" -#include "u_format_s3tc.h" -#include "u_surface.h" -#include "util/u_math.h" - -#include "pipe/p_defines.h" - - -boolean -util_format_is_float(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - assert(desc); - if (!desc) { - return FALSE; - } - - i = util_format_get_first_non_void_channel(format); - if (i < 0) { - return FALSE; - } - - return desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT ? TRUE : FALSE; -} - - -/** Test if the format contains RGB, but not alpha */ -boolean -util_format_has_alpha(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - return (desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[3] != PIPE_SWIZZLE_1; -} - - -boolean -util_format_is_luminance(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - if ((desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[0] == PIPE_SWIZZLE_X && - desc->swizzle[1] == PIPE_SWIZZLE_X && - desc->swizzle[2] == PIPE_SWIZZLE_X && - desc->swizzle[3] == PIPE_SWIZZLE_1) { - return TRUE; - } - return FALSE; -} - -boolean -util_format_is_alpha(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - if ((desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[0] == PIPE_SWIZZLE_0 && - desc->swizzle[1] == PIPE_SWIZZLE_0 && - desc->swizzle[2] == PIPE_SWIZZLE_0 && - desc->swizzle[3] == PIPE_SWIZZLE_X) { - return TRUE; - } - return FALSE; -} - -boolean -util_format_is_pure_integer(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - /* Find the first non-void channel. */ - i = util_format_get_first_non_void_channel(format); - if (i == -1) - return FALSE; - - return desc->channel[i].pure_integer ? TRUE : FALSE; -} - -boolean -util_format_is_pure_sint(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - i = util_format_get_first_non_void_channel(format); - if (i == -1) - return FALSE; - - return (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED && desc->channel[i].pure_integer) ? TRUE : FALSE; -} - -boolean -util_format_is_pure_uint(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - i = util_format_get_first_non_void_channel(format); - if (i == -1) - return FALSE; - - return (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED && desc->channel[i].pure_integer) ? TRUE : FALSE; -} - -/** - * Returns true if the format contains normalized signed channels. - */ -boolean -util_format_is_snorm(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - return desc->is_snorm; -} - -/** - * Returns true if the format contains normalized unsigned channels. - */ -boolean -util_format_is_unorm(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - return desc->is_unorm; -} - -boolean -util_format_is_snorm8(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - if (desc->is_mixed) - return FALSE; - - i = util_format_get_first_non_void_channel(format); - if (i == -1) - return FALSE; - - return desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED && - !desc->channel[i].pure_integer && - desc->channel[i].normalized && - desc->channel[i].size == 8; -} - -boolean -util_format_is_luminance_alpha(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - if ((desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[0] == PIPE_SWIZZLE_X && - desc->swizzle[1] == PIPE_SWIZZLE_X && - desc->swizzle[2] == PIPE_SWIZZLE_X && - desc->swizzle[3] == PIPE_SWIZZLE_Y) { - return TRUE; - } - return FALSE; -} - - -boolean -util_format_is_intensity(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - if ((desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) && - desc->swizzle[0] == PIPE_SWIZZLE_X && - desc->swizzle[1] == PIPE_SWIZZLE_X && - desc->swizzle[2] == PIPE_SWIZZLE_X && - desc->swizzle[3] == PIPE_SWIZZLE_X) { - return TRUE; - } - return FALSE; -} - -boolean -util_format_is_subsampled_422(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - return desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED && - desc->block.width == 2 && - desc->block.height == 1 && - desc->block.bits == 32; -} - -/** - * Calculates the MRD for the depth format. MRD is used in depth bias - * for UNORM and unbound depth buffers. When the depth buffer is floating - * point, the depth bias calculation does not use the MRD. However, the - * default MRD will be 1.0 / ((1 << 24) - 1). - */ -double -util_get_depth_format_mrd(const struct util_format_description *desc) -{ - /* - * Depth buffer formats without a depth component OR scenarios - * without a bound depth buffer default to D24. - */ - double mrd = 1.0 / ((1 << 24) - 1); - unsigned depth_channel; - - assert(desc); - - /* - * Some depth formats do not store the depth component in the first - * channel, detect the format and adjust the depth channel. Get the - * swizzled depth component channel. - */ - depth_channel = desc->swizzle[0]; - - if (desc->channel[depth_channel].type == UTIL_FORMAT_TYPE_UNSIGNED && - desc->channel[depth_channel].normalized) { - int depth_bits; - - depth_bits = desc->channel[depth_channel].size; - mrd = 1.0 / ((1ULL << depth_bits) - 1); - } - - return mrd; -} - - -void -util_format_read_4f(enum pipe_format format, - float *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - const uint8_t *src_row; - float *dst_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8); - dst_row = dst; - - format_desc->unpack_rgba_float(dst_row, dst_stride, src_row, src_stride, w, h); -} - - -void -util_format_write_4f(enum pipe_format format, - const float *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - uint8_t *dst_row; - const float *src_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8); - src_row = src; - - format_desc->pack_rgba_float(dst_row, dst_stride, src_row, src_stride, w, h); -} - - -void -util_format_read_4ub(enum pipe_format format, uint8_t *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - const uint8_t *src_row; - uint8_t *dst_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8); - dst_row = dst; - - format_desc->unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, w, h); -} - - -void -util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - uint8_t *dst_row; - const uint8_t *src_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8); - src_row = src; - - format_desc->pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, w, h); -} - -void -util_format_read_4ui(enum pipe_format format, - unsigned *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - const uint8_t *src_row; - uint32_t *dst_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8); - dst_row = dst; - - format_desc->unpack_rgba_uint(dst_row, dst_stride, src_row, src_stride, w, h); -} - -void -util_format_write_4ui(enum pipe_format format, - const unsigned int *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - uint8_t *dst_row; - const uint32_t *src_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8); - src_row = src; - - format_desc->pack_rgba_uint(dst_row, dst_stride, src_row, src_stride, w, h); -} - -void -util_format_read_4i(enum pipe_format format, - int *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - const uint8_t *src_row; - int32_t *dst_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - src_row = (const uint8_t *)src + y*src_stride + x*(format_desc->block.bits/8); - dst_row = dst; - - format_desc->unpack_rgba_sint(dst_row, dst_stride, src_row, src_stride, w, h); -} - -void -util_format_write_4i(enum pipe_format format, - const int *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h) -{ - const struct util_format_description *format_desc; - uint8_t *dst_row; - const int32_t *src_row; - - format_desc = util_format_description(format); - - assert(x % format_desc->block.width == 0); - assert(y % format_desc->block.height == 0); - - dst_row = (uint8_t *)dst + y*dst_stride + x*(format_desc->block.bits/8); - src_row = src; - - format_desc->pack_rgba_sint(dst_row, dst_stride, src_row, src_stride, w, h); -} - -/** - * Check if we can safely memcopy from the source format to the dest format. - * This basically covers the cases of a "used" channel copied to a typeless - * channel, plus some 1-channel cases. - * Examples of compatible copy formats include: - * b8g8r8a8_unorm -> b8g8r8x8_unorm - * a8r8g8b8_unorm -> x8r8g8b8_unorm - * b5g5r5a1_unorm -> b5g5r5x1_unorm - * b4g4r4a4_unorm -> b4g4r4x4_unorm - * l8_unorm -> r8_unorm - * i8_unorm -> l8_unorm - * i8_unorm -> a8_unorm - * i8_unorm -> r8_unorm - * l16_unorm -> r16_unorm - * z24_unorm_s8_uint -> z24x8_unorm - * s8_uint_z24_unorm -> x8z24_unorm - * r8g8b8a8_unorm -> r8g8b8x8_unorm - * a8b8g8r8_srgb -> x8b8g8r8_srgb - * b8g8r8a8_srgb -> b8g8r8x8_srgb - * a8r8g8b8_srgb -> x8r8g8b8_srgb - * a8b8g8r8_unorm -> x8b8g8r8_unorm - * r10g10b10a2_uscaled -> r10g10b10x2_uscaled - * r10sg10sb10sa2u_norm -> r10g10b10x2_snorm - */ -boolean -util_is_format_compatible(const struct util_format_description *src_desc, - const struct util_format_description *dst_desc) -{ - unsigned chan; - - if (src_desc->format == dst_desc->format) { - return TRUE; - } - - if (src_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || - dst_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { - return FALSE; - } - - if (src_desc->block.bits != dst_desc->block.bits || - src_desc->nr_channels != dst_desc->nr_channels || - src_desc->colorspace != dst_desc->colorspace) { - return FALSE; - } - - for (chan = 0; chan < 4; ++chan) { - if (src_desc->channel[chan].size != - dst_desc->channel[chan].size) { - return FALSE; - } - } - - for (chan = 0; chan < 4; ++chan) { - enum pipe_swizzle swizzle = dst_desc->swizzle[chan]; - - if (swizzle < 4) { - if (src_desc->swizzle[chan] != swizzle) { - return FALSE; - } - if ((src_desc->channel[swizzle].type != - dst_desc->channel[swizzle].type) || - (src_desc->channel[swizzle].normalized != - dst_desc->channel[swizzle].normalized)) { - return FALSE; - } - } - } - - return TRUE; -} - - -boolean -util_format_fits_8unorm(const struct util_format_description *format_desc) -{ - unsigned chan; - - /* - * After linearized sRGB values require more than 8bits. - */ - - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - return FALSE; - } - - switch (format_desc->layout) { - - case UTIL_FORMAT_LAYOUT_S3TC: - /* - * These are straight forward. - */ - return TRUE; - case UTIL_FORMAT_LAYOUT_RGTC: - if (format_desc->format == PIPE_FORMAT_RGTC1_SNORM || - format_desc->format == PIPE_FORMAT_RGTC2_SNORM || - format_desc->format == PIPE_FORMAT_LATC1_SNORM || - format_desc->format == PIPE_FORMAT_LATC2_SNORM) - return FALSE; - return TRUE; - case UTIL_FORMAT_LAYOUT_BPTC: - if (format_desc->format == PIPE_FORMAT_BPTC_RGBA_UNORM) - return TRUE; - return FALSE; - - case UTIL_FORMAT_LAYOUT_ETC: - if (format_desc->format == PIPE_FORMAT_ETC1_RGB8) - return TRUE; - return FALSE; - - case UTIL_FORMAT_LAYOUT_PLAIN: - /* - * For these we can find a generic rule. - */ - - for (chan = 0; chan < format_desc->nr_channels; ++chan) { - switch (format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (!format_desc->channel[chan].normalized || - format_desc->channel[chan].size > 8) { - return FALSE; - } - break; - default: - return FALSE; - } - } - return TRUE; - - default: - /* - * Handle all others on a case by case basis. - */ - - switch (format_desc->format) { - case PIPE_FORMAT_R1_UNORM: - case PIPE_FORMAT_UYVY: - case PIPE_FORMAT_YUYV: - case PIPE_FORMAT_R8G8_B8G8_UNORM: - case PIPE_FORMAT_G8R8_G8B8_UNORM: - return TRUE; - - default: - return FALSE; - } - } -} - - -boolean -util_format_translate(enum pipe_format dst_format, - void *dst, unsigned dst_stride, - unsigned dst_x, unsigned dst_y, - enum pipe_format src_format, - const void *src, unsigned src_stride, - unsigned src_x, unsigned src_y, - unsigned width, unsigned height) -{ - const struct util_format_description *dst_format_desc; - const struct util_format_description *src_format_desc; - uint8_t *dst_row; - const uint8_t *src_row; - unsigned x_step, y_step; - unsigned dst_step; - unsigned src_step; - - dst_format_desc = util_format_description(dst_format); - src_format_desc = util_format_description(src_format); - - if (util_is_format_compatible(src_format_desc, dst_format_desc)) { - /* - * Trivial case. - */ - - util_copy_rect(dst, dst_format, dst_stride, dst_x, dst_y, - width, height, src, (int)src_stride, - src_x, src_y); - return TRUE; - } - - assert(dst_x % dst_format_desc->block.width == 0); - assert(dst_y % dst_format_desc->block.height == 0); - assert(src_x % src_format_desc->block.width == 0); - assert(src_y % src_format_desc->block.height == 0); - - dst_row = (uint8_t *)dst + dst_y*dst_stride + dst_x*(dst_format_desc->block.bits/8); - src_row = (const uint8_t *)src + src_y*src_stride + src_x*(src_format_desc->block.bits/8); - - /* - * This works because all pixel formats have pixel blocks with power of two - * sizes. - */ - - y_step = MAX2(dst_format_desc->block.height, src_format_desc->block.height); - x_step = MAX2(dst_format_desc->block.width, src_format_desc->block.width); - assert(y_step % dst_format_desc->block.height == 0); - assert(y_step % src_format_desc->block.height == 0); - - dst_step = y_step / dst_format_desc->block.height * dst_stride; - src_step = y_step / src_format_desc->block.height * src_stride; - - /* - * TODO: double formats will loose precision - * TODO: Add a special case for formats that are mere swizzles of each other - */ - - if (src_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS || - dst_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - float *tmp_z = NULL; - uint8_t *tmp_s = NULL; - - assert(x_step == 1); - assert(y_step == 1); - - if (src_format_desc->unpack_z_float && - dst_format_desc->pack_z_float) { - tmp_z = MALLOC(width * sizeof *tmp_z); - } - - if (src_format_desc->unpack_s_8uint && - dst_format_desc->pack_s_8uint) { - tmp_s = MALLOC(width * sizeof *tmp_s); - } - - while (height--) { - if (tmp_z) { - src_format_desc->unpack_z_float(tmp_z, 0, src_row, src_stride, width, 1); - dst_format_desc->pack_z_float(dst_row, dst_stride, tmp_z, 0, width, 1); - } - - if (tmp_s) { - src_format_desc->unpack_s_8uint(tmp_s, 0, src_row, src_stride, width, 1); - dst_format_desc->pack_s_8uint(dst_row, dst_stride, tmp_s, 0, width, 1); - } - - dst_row += dst_step; - src_row += src_step; - } - - FREE(tmp_s); - - FREE(tmp_z); - - return TRUE; - } - - if (util_format_fits_8unorm(src_format_desc) || - util_format_fits_8unorm(dst_format_desc)) { - unsigned tmp_stride; - uint8_t *tmp_row; - - if (!src_format_desc->unpack_rgba_8unorm || - !dst_format_desc->pack_rgba_8unorm) { - return FALSE; - } - - tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; - tmp_row = MALLOC(y_step * tmp_stride); - if (!tmp_row) - return FALSE; - - while (height >= y_step) { - src_format_desc->unpack_rgba_8unorm(tmp_row, tmp_stride, src_row, src_stride, width, y_step); - dst_format_desc->pack_rgba_8unorm(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step); - - dst_row += dst_step; - src_row += src_step; - height -= y_step; - } - - if (height) { - src_format_desc->unpack_rgba_8unorm(tmp_row, tmp_stride, src_row, src_stride, width, height); - dst_format_desc->pack_rgba_8unorm(dst_row, dst_stride, tmp_row, tmp_stride, width, height); - } - - FREE(tmp_row); - } - else if (util_format_is_pure_sint(src_format) || - util_format_is_pure_sint(dst_format)) { - unsigned tmp_stride; - int *tmp_row; - - if (!src_format_desc->unpack_rgba_sint || - !dst_format_desc->pack_rgba_sint) { - return FALSE; - } - - tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; - tmp_row = MALLOC(y_step * tmp_stride); - if (!tmp_row) - return FALSE; - - while (height >= y_step) { - src_format_desc->unpack_rgba_sint(tmp_row, tmp_stride, src_row, src_stride, width, y_step); - dst_format_desc->pack_rgba_sint(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step); - - dst_row += dst_step; - src_row += src_step; - height -= y_step; - } - - if (height) { - src_format_desc->unpack_rgba_sint(tmp_row, tmp_stride, src_row, src_stride, width, height); - dst_format_desc->pack_rgba_sint(dst_row, dst_stride, tmp_row, tmp_stride, width, height); - } - - FREE(tmp_row); - } - else if (util_format_is_pure_uint(src_format) || - util_format_is_pure_uint(dst_format)) { - unsigned tmp_stride; - unsigned int *tmp_row; - - if (!src_format_desc->unpack_rgba_uint || - !dst_format_desc->pack_rgba_uint) { - return FALSE; - } - - tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; - tmp_row = MALLOC(y_step * tmp_stride); - if (!tmp_row) - return FALSE; - - while (height >= y_step) { - src_format_desc->unpack_rgba_uint(tmp_row, tmp_stride, src_row, src_stride, width, y_step); - dst_format_desc->pack_rgba_uint(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step); - - dst_row += dst_step; - src_row += src_step; - height -= y_step; - } - - if (height) { - src_format_desc->unpack_rgba_uint(tmp_row, tmp_stride, src_row, src_stride, width, height); - dst_format_desc->pack_rgba_uint(dst_row, dst_stride, tmp_row, tmp_stride, width, height); - } - - FREE(tmp_row); - } - else { - unsigned tmp_stride; - float *tmp_row; - - if (!src_format_desc->unpack_rgba_float || - !dst_format_desc->pack_rgba_float) { - return FALSE; - } - - tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; - tmp_row = MALLOC(y_step * tmp_stride); - if (!tmp_row) - return FALSE; - - while (height >= y_step) { - src_format_desc->unpack_rgba_float(tmp_row, tmp_stride, src_row, src_stride, width, y_step); - dst_format_desc->pack_rgba_float(dst_row, dst_stride, tmp_row, tmp_stride, width, y_step); - - dst_row += dst_step; - src_row += src_step; - height -= y_step; - } - - if (height) { - src_format_desc->unpack_rgba_float(tmp_row, tmp_stride, src_row, src_stride, width, height); - dst_format_desc->pack_rgba_float(dst_row, dst_stride, tmp_row, tmp_stride, width, height); - } - - FREE(tmp_row); - } - return TRUE; -} - -boolean -util_format_translate_3d(enum pipe_format dst_format, - void *dst, unsigned dst_stride, - unsigned dst_slice_stride, - unsigned dst_x, unsigned dst_y, - unsigned dst_z, - enum pipe_format src_format, - const void *src, unsigned src_stride, - unsigned src_slice_stride, - unsigned src_x, unsigned src_y, - unsigned src_z, unsigned width, - unsigned height, unsigned depth) -{ - uint8_t *dst_layer; - const uint8_t *src_layer; - unsigned z; - dst_layer = dst; - src_layer = src; - dst_layer += dst_z * dst_slice_stride; - src_layer += src_z * src_slice_stride; - for (z = 0; z < depth; ++z) { - if (!util_format_translate(dst_format, dst_layer, dst_stride, - dst_x, dst_y, - src_format, src_layer, src_stride, - src_x, src_y, - width, height)) - return FALSE; - - dst_layer += dst_slice_stride; - src_layer += src_slice_stride; - } - return TRUE; -} - -void util_format_compose_swizzles(const unsigned char swz1[4], - const unsigned char swz2[4], - unsigned char dst[4]) -{ - unsigned i; - - for (i = 0; i < 4; i++) { - dst[i] = swz2[i] <= PIPE_SWIZZLE_W ? - swz1[swz2[i]] : swz2[i]; - } -} - -void util_format_apply_color_swizzle(union pipe_color_union *dst, - const union pipe_color_union *src, - const unsigned char swz[4], - const boolean is_integer) -{ - unsigned c; - - if (is_integer) { - for (c = 0; c < 4; ++c) { - switch (swz[c]) { - case PIPE_SWIZZLE_X: dst->ui[c] = src->ui[0]; break; - case PIPE_SWIZZLE_Y: dst->ui[c] = src->ui[1]; break; - case PIPE_SWIZZLE_Z: dst->ui[c] = src->ui[2]; break; - case PIPE_SWIZZLE_W: dst->ui[c] = src->ui[3]; break; - default: - dst->ui[c] = (swz[c] == PIPE_SWIZZLE_1) ? 1 : 0; - break; - } - } - } else { - for (c = 0; c < 4; ++c) { - switch (swz[c]) { - case PIPE_SWIZZLE_X: dst->f[c] = src->f[0]; break; - case PIPE_SWIZZLE_Y: dst->f[c] = src->f[1]; break; - case PIPE_SWIZZLE_Z: dst->f[c] = src->f[2]; break; - case PIPE_SWIZZLE_W: dst->f[c] = src->f[3]; break; - default: - dst->f[c] = (swz[c] == PIPE_SWIZZLE_1) ? 1.0f : 0.0f; - break; - } - } - } -} - -void pipe_swizzle_4f(float *dst, const float *src, - const unsigned char swz[4]) -{ - unsigned i; - - for (i = 0; i < 4; i++) { - if (swz[i] <= PIPE_SWIZZLE_W) - dst[i] = src[swz[i]]; - else if (swz[i] == PIPE_SWIZZLE_0) - dst[i] = 0; - else if (swz[i] == PIPE_SWIZZLE_1) - dst[i] = 1; - } -} - -void util_format_unswizzle_4f(float *dst, const float *src, - const unsigned char swz[4]) -{ - unsigned i; - - for (i = 0; i < 4; i++) { - switch (swz[i]) { - case PIPE_SWIZZLE_X: - dst[0] = src[i]; - break; - case PIPE_SWIZZLE_Y: - dst[1] = src[i]; - break; - case PIPE_SWIZZLE_Z: - dst[2] = src[i]; - break; - case PIPE_SWIZZLE_W: - dst[3] = src[i]; - break; - } - } -} - -enum pipe_format -util_format_snorm8_to_sint8(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_R8_SNORM: - return PIPE_FORMAT_R8_SINT; - case PIPE_FORMAT_R8G8_SNORM: - return PIPE_FORMAT_R8G8_SINT; - case PIPE_FORMAT_R8G8B8_SNORM: - return PIPE_FORMAT_R8G8B8_SINT; - case PIPE_FORMAT_R8G8B8A8_SNORM: - return PIPE_FORMAT_R8G8B8A8_SINT; - - case PIPE_FORMAT_A8_SNORM: - return PIPE_FORMAT_A8_SINT; - case PIPE_FORMAT_L8_SNORM: - return PIPE_FORMAT_L8_SINT; - case PIPE_FORMAT_L8A8_SNORM: - return PIPE_FORMAT_L8A8_SINT; - case PIPE_FORMAT_I8_SNORM: - return PIPE_FORMAT_I8_SINT; - - case PIPE_FORMAT_R8G8B8X8_SNORM: - return PIPE_FORMAT_R8G8B8X8_SINT; - case PIPE_FORMAT_R8A8_SNORM: - return PIPE_FORMAT_R8A8_SINT; - case PIPE_FORMAT_A8L8_SNORM: - return PIPE_FORMAT_A8L8_SINT; - case PIPE_FORMAT_G8R8_SNORM: - return PIPE_FORMAT_G8R8_SINT; - case PIPE_FORMAT_A8B8G8R8_SNORM: - return PIPE_FORMAT_A8B8G8R8_SINT; - case PIPE_FORMAT_X8B8G8R8_SNORM: - return PIPE_FORMAT_X8B8G8R8_SINT; - - default: - return format; - } -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format.csv mesa-20.0.8/src/gallium/auxiliary/util/u_format.csv --- mesa-19.2.8/src/gallium/auxiliary/util/u_format.csv 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,466 +0,0 @@ -########################################################################### -# -# Copyright 2009-2010 VMware, Inc. -# All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sub license, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice (including the -# next paragraph) shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR -# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -########################################################################### - -# This CSV file has the input data for u_format.h's struct -# util_format_description. It is also used as input for radeonsi's format -# mapping. -# -# Each format entry contains: -# - name, per enum pipe_format -# - layout, per enum util_format_layout, in shortened lower caps -# - pixel block's width -# - pixel block's height -# - channel encoding (only meaningful for plain layout), containing for each -# channel the following information: -# - type, one of -# - 'x': void -# - 'u': unsigned -# - 's': signed -# - 'h': fixed -# - 'f': FLOAT -# - optionally followed by 'n' if it is normalized -# - optionally followed by 'p' if it is pure -# - number of bits -# - channel swizzle -# - color space: rgb, srgb, yuv, zs -# - (optional) channel encoding for big-endian targets -# - (optional) channel swizzle for big-endian targets -# -# See also: -# - http://msdn.microsoft.com/en-us/library/bb172558.aspx (D3D9) -# - http://msdn.microsoft.com/en-us/library/bb205073.aspx#mapping_texture_formats (D3D9 -> D3D10) -# - http://msdn.microsoft.com/en-us/library/bb173059.aspx (D3D10) -# -# Note that GL doesn't really specify the layout of internal formats. See -# OpenGL 2.1 specification, Table 3.16, on the "Correspondence of sized -# internal formats to base in- ternal formats, and desired component -# resolutions for each sized internal format." - -# None -# Described as regular uint_8 bytes, i.e. PIPE_FORMAT_R8_USCALED -PIPE_FORMAT_NONE , plain, 1, 1, u8 , , , , x001, rgb - -# Typical rendertarget formats -PIPE_FORMAT_B8G8R8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb -PIPE_FORMAT_B8G8R8X8_UNORM , plain, 1, 1, un8 , un8 , un8 , x8 , zyx1, rgb -PIPE_FORMAT_A8R8G8B8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb -PIPE_FORMAT_X8R8G8B8_UNORM , plain, 1, 1, x8 , un8 , un8 , un8 , yzw1, rgb -PIPE_FORMAT_A8B8G8R8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, rgb -PIPE_FORMAT_X8B8G8R8_UNORM , plain, 1, 1, x8 , un8 , un8 , un8 , wzy1, rgb -# PIPE_FORMAT_R8G8B8A8_UNORM is below -PIPE_FORMAT_R8G8B8X8_UNORM , plain, 1, 1, un8 , un8 , un8 , x8 , xyz1, rgb -PIPE_FORMAT_B5G5R5X1_UNORM , plain, 1, 1, un5 , un5 , un5 , x1 , zyx1, rgb, x1 , un5 , un5 , un5 , yzw1 -PIPE_FORMAT_B5G5R5A1_UNORM , plain, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb, un1 , un5 , un5 , un5 , yzwx -PIPE_FORMAT_X1B5G5R5_UNORM , plain, 1, 1, x1 , un5 , un5 , un5 , wzy1, rgb, un5 , un5 , un5 , x1 , xyz1 -PIPE_FORMAT_A1B5G5R5_UNORM , plain, 1, 1, un1 , un5 , un5 , un5 , wzyx, rgb, un5 , un5 , un5 , un1 , xyzw -PIPE_FORMAT_B4G4R4A4_UNORM , plain, 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb, un4 , un4 , un4 , un4 , yzwx -PIPE_FORMAT_B4G4R4X4_UNORM , plain, 1, 1, un4 , un4 , un4 , x4 , zyx1, rgb, x4 , un4 , un4 , un4 , yzw1 -PIPE_FORMAT_A4B4G4R4_UNORM , plain, 1, 1, un4 , un4 , un4 , un4 , wzyx, rgb, un4 , un4 , un4 , un4 , xyzw -PIPE_FORMAT_B5G6R5_UNORM , plain, 1, 1, un5 , un6 , un5 , , zyx1, rgb, un5 , un6 , un5 , , xyz1 -PIPE_FORMAT_R10G10B10A2_UNORM , plain, 1, 1, un10, un10, un10, un2 , xyzw, rgb, un2 , un10, un10, un10, wzyx -PIPE_FORMAT_R10G10B10X2_UNORM , plain, 1, 1, un10, un10, un10, x2, xyz1, rgb, x2 , un10, un10, un10, wzy1 -PIPE_FORMAT_B10G10R10A2_UNORM , plain, 1, 1, un10, un10, un10, un2 , zyxw, rgb, un2 , un10, un10, un10, yzwx -PIPE_FORMAT_B2G3R3_UNORM , plain, 1, 1, un2 , un3 , un3 , , zyx1, rgb, un3 , un3 , un2 , , xyz1 - -# Luminance/Intensity/Alpha formats -PIPE_FORMAT_L8_UNORM , plain, 1, 1, un8 , , , , xxx1, rgb -PIPE_FORMAT_A8_UNORM , plain, 1, 1, un8 , , , , 000x, rgb -PIPE_FORMAT_I8_UNORM , plain, 1, 1, un8 , , , , xxxx, rgb -PIPE_FORMAT_L4A4_UNORM , plain, 1, 1, un4 , un4 , , , xxxy, rgb, un4 , un4 , , , yyyx -PIPE_FORMAT_L8A8_UNORM , plain, 1, 1, un8 , un8 , , , xxxy, rgb -PIPE_FORMAT_L16_UNORM , plain, 1, 1, un16, , , , xxx1, rgb -PIPE_FORMAT_A16_UNORM , plain, 1, 1, un16, , , , 000x, rgb -PIPE_FORMAT_I16_UNORM , plain, 1, 1, un16, , , , xxxx, rgb -PIPE_FORMAT_L16A16_UNORM , plain, 1, 1, un16, un16, , , xxxy, rgb -PIPE_FORMAT_A8_SNORM , plain, 1, 1, sn8 , , , , 000x, rgb -PIPE_FORMAT_L8_SNORM , plain, 1, 1, sn8 , , , , xxx1, rgb -PIPE_FORMAT_L8A8_SNORM , plain, 1, 1, sn8 , sn8 , , , xxxy, rgb -PIPE_FORMAT_I8_SNORM , plain, 1, 1, sn8 , , , , xxxx, rgb -PIPE_FORMAT_A16_SNORM , plain, 1, 1, sn16, , , , 000x, rgb -PIPE_FORMAT_L16_SNORM , plain, 1, 1, sn16, , , , xxx1, rgb -PIPE_FORMAT_L16A16_SNORM , plain, 1, 1, sn16, sn16, , , xxxy, rgb -PIPE_FORMAT_I16_SNORM , plain, 1, 1, sn16, , , , xxxx, rgb -PIPE_FORMAT_A16_FLOAT , plain, 1, 1, f16 , , , , 000x, rgb -PIPE_FORMAT_L16_FLOAT , plain, 1, 1, f16 , , , , xxx1, rgb -PIPE_FORMAT_L16A16_FLOAT , plain, 1, 1, f16 , f16 , , , xxxy, rgb -PIPE_FORMAT_I16_FLOAT , plain, 1, 1, f16 , , , , xxxx, rgb -PIPE_FORMAT_A32_FLOAT , plain, 1, 1, f32 , , , , 000x, rgb -PIPE_FORMAT_L32_FLOAT , plain, 1, 1, f32 , , , , xxx1, rgb -PIPE_FORMAT_L32A32_FLOAT , plain, 1, 1, f32 , f32 , , , xxxy, rgb -PIPE_FORMAT_I32_FLOAT , plain, 1, 1, f32 , , , , xxxx, rgb - -# SRGB formats -PIPE_FORMAT_L8_SRGB , plain, 1, 1, un8 , , , , xxx1, srgb -PIPE_FORMAT_R8_SRGB , plain, 1, 1, un8 , , , , x001, srgb -PIPE_FORMAT_L8A8_SRGB , plain, 1, 1, un8 , un8 , , , xxxy, srgb -PIPE_FORMAT_R8G8B8_SRGB , plain, 1, 1, un8 , un8 , un8 , , xyz1, srgb -PIPE_FORMAT_R8G8B8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb -PIPE_FORMAT_A8B8G8R8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, srgb -PIPE_FORMAT_X8B8G8R8_SRGB , plain, 1, 1, x8 , un8 , un8 , un8 , wzy1, srgb -PIPE_FORMAT_B8G8R8A8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, srgb -PIPE_FORMAT_B8G8R8X8_SRGB , plain, 1, 1, un8 , un8 , un8 , x8 , zyx1, srgb -PIPE_FORMAT_A8R8G8B8_SRGB , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, srgb -PIPE_FORMAT_X8R8G8B8_SRGB , plain, 1, 1, x8 , un8 , un8 , un8 , yzw1, srgb - -# Mixed-sign formats (typically used for bump map textures) -PIPE_FORMAT_R8SG8SB8UX8U_NORM , plain, 1, 1, sn8 , sn8 , un8 , x8 , xyz1, rgb -PIPE_FORMAT_R10SG10SB10SA2U_NORM , plain, 1, 1, sn10, sn10, sn10, un2 , xyzw, rgb, un2 , sn10, sn10, sn10, wzyx -PIPE_FORMAT_R5SG5SB6U_NORM , plain, 1, 1, sn5 , sn5 , un6 , , xyz1, rgb, un6 , sn5 , sn5 , , zyx1 - -# Depth-stencil formats -PIPE_FORMAT_S8_UINT , plain, 1, 1, up8 , , , , _x__, zs -PIPE_FORMAT_Z16_UNORM , plain, 1, 1, un16, , , , x___, zs -PIPE_FORMAT_Z32_UNORM , plain, 1, 1, un32, , , , x___, zs -PIPE_FORMAT_Z32_FLOAT , plain, 1, 1, f32 , , , , x___, zs -PIPE_FORMAT_Z24_UNORM_S8_UINT , plain, 1, 1, un24, up8 , , , xy__, zs, up8 , un24, , , yx__ -PIPE_FORMAT_S8_UINT_Z24_UNORM , plain, 1, 1, up8 , un24, , , yx__, zs, un24, up8 , , , xy__ -PIPE_FORMAT_X24S8_UINT , plain, 1, 1, x24 , up8 , , , _y__, zs, up8 , x24 , , , _x__ -PIPE_FORMAT_S8X24_UINT , plain, 1, 1, up8 , x24 , , , _x__, zs, x24 , up8 , , , _y__ -PIPE_FORMAT_Z24X8_UNORM , plain, 1, 1, un24, x8 , , , x___, zs, x8 , un24, , , y___ -PIPE_FORMAT_X8Z24_UNORM , plain, 1, 1, x8 , un24, , , y___, zs, un24, x8 , , , x___ -PIPE_FORMAT_Z32_FLOAT_S8X24_UINT , plain, 1, 1, f32 , up8 , x24, , xy__, zs, f32 , x24 , up8, , xz__ -PIPE_FORMAT_X32_S8X24_UINT , plain, 1, 1, x32 , up8 , x24, , _y__, zs, x32 , x24 , up8, , _z__ - -# Depth-stencil formats equivalent to blitting PIPE_FORMAT_Z24_UNORM_S8_UINT -# as PIPE_FORMAT_R8G8B8A8_*, in that it is an equivalent size to the z/s -# format. This is mainly for hw that has some sort of bandwidth compressed -# format where the compression for z24s8 is not equivalent to r8g8b8a8, -# and therefore some special handling is required for blits. -PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8 , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb - -# YUV formats -# http://www.fourcc.org/yuv.php#UYVY -PIPE_FORMAT_UYVY , subsampled, 2, 1, x32 , , , , xyz1, yuv -# http://www.fourcc.org/yuv.php#YUYV (a.k.a http://www.fourcc.org/yuv.php#YUY2) -PIPE_FORMAT_YUYV , subsampled, 2, 1, x32 , , , , xyz1, yuv - -PIPE_FORMAT_AYUV , other, 4, 4, un8 , , , , xyzw, yuv -PIPE_FORMAT_XYUV , other, 4, 4, un8 , , , , xyz1, yuv - -# same subsampling but with rgb channels -PIPE_FORMAT_R8G8_B8G8_UNORM , subsampled, 2, 1, x32 , , , , xyz1, rgb -PIPE_FORMAT_G8R8_G8B8_UNORM , subsampled, 2, 1, x32 , , , , xyz1, rgb -PIPE_FORMAT_G8R8_B8R8_UNORM , subsampled, 2, 1, x32 , , , , yxz1, rgb -PIPE_FORMAT_R8G8_R8B8_UNORM , subsampled, 2, 1, x32 , , , , yxz1, rgb - -# some special formats not fitting anywhere else -PIPE_FORMAT_R11G11B10_FLOAT , other, 1, 1, x32 , , , , xyz1, rgb -PIPE_FORMAT_R9G9B9E5_FLOAT , other, 1, 1, x32 , , , , xyz1, rgb -PIPE_FORMAT_R1_UNORM , other, 8, 1, x8 , , , , x001, rgb -# A.k.a. D3DFMT_CxV8U8 -PIPE_FORMAT_R8G8Bx_SNORM , other, 1, 1, sn8 , sn8 , , , xyz1, rgb - -# Compressed formats -# - http://en.wikipedia.org/wiki/S3_Texture_Compression -# - http://www.opengl.org/registry/specs/EXT/texture_compression_s3tc.txt -# - http://www.opengl.org/registry/specs/ARB/texture_compression_rgtc.txt -# - http://www.opengl.org/registry/specs/EXT/texture_compression_latc.txt -# - http://www.opengl.org/registry/specs/ARB/texture_compression_bptc.txt -# - http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt -# - http://msdn.microsoft.com/en-us/library/bb694531.aspx -PIPE_FORMAT_DXT1_RGB , s3tc, 4, 4, x64 , , , , xyz1, rgb -PIPE_FORMAT_DXT1_RGBA , s3tc, 4, 4, x64 , , , , xyzw, rgb -PIPE_FORMAT_DXT3_RGBA , s3tc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_DXT5_RGBA , s3tc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_DXT1_SRGB , s3tc, 4, 4, x64 , , , , xyz1, srgb -PIPE_FORMAT_DXT1_SRGBA , s3tc, 4, 4, x64 , , , , xyzw, srgb -PIPE_FORMAT_DXT3_SRGBA , s3tc, 4, 4, x128, , , , xyzw, srgb -PIPE_FORMAT_DXT5_SRGBA , s3tc, 4, 4, x128, , , , xyzw, srgb - -PIPE_FORMAT_RGTC1_UNORM , rgtc, 4, 4, x64, , , , x001, rgb -PIPE_FORMAT_RGTC1_SNORM , rgtc, 4, 4, x64, , , , x001, rgb -PIPE_FORMAT_RGTC2_UNORM , rgtc, 4, 4, x128, , , , xy01, rgb -PIPE_FORMAT_RGTC2_SNORM , rgtc, 4, 4, x128, , , , xy01, rgb - -PIPE_FORMAT_LATC1_UNORM , rgtc, 4, 4, x64, , , , xxx1, rgb -PIPE_FORMAT_LATC1_SNORM , rgtc, 4, 4, x64, , , , xxx1, rgb -PIPE_FORMAT_LATC2_UNORM , rgtc, 4, 4, x128, , , , xxxy, rgb -PIPE_FORMAT_LATC2_SNORM , rgtc, 4, 4, x128, , , , xxxy, rgb - -PIPE_FORMAT_ETC1_RGB8 , etc, 4, 4, x64, , , , xyz1, rgb - -PIPE_FORMAT_ETC2_RGB8 , etc, 4, 4, x64, , , , xyz1, rgb -PIPE_FORMAT_ETC2_SRGB8 , etc, 4, 4, x64, , , , xyz1, srgb -PIPE_FORMAT_ETC2_RGB8A1 , etc, 4, 4, x64, , , , xyzw, rgb -PIPE_FORMAT_ETC2_SRGB8A1 , etc, 4, 4, x64, , , , xyzw, srgb -PIPE_FORMAT_ETC2_RGBA8 , etc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_ETC2_SRGBA8 , etc, 4, 4, x128, , , , xyzw, srgb -PIPE_FORMAT_ETC2_R11_UNORM , etc, 4, 4, x64, , , , x001, rgb -PIPE_FORMAT_ETC2_R11_SNORM , etc, 4, 4, x64, , , , x001, rgb -PIPE_FORMAT_ETC2_RG11_UNORM , etc, 4, 4, x128, , , , xy01, rgb -PIPE_FORMAT_ETC2_RG11_SNORM , etc, 4, 4, x128, , , , xy01, rgb - -PIPE_FORMAT_BPTC_RGBA_UNORM , bptc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_BPTC_SRGBA , bptc, 4, 4, x128, , , , xyzw, srgb -PIPE_FORMAT_BPTC_RGB_FLOAT , bptc, 4, 4, x128, , , , xyz1, rgb -PIPE_FORMAT_BPTC_RGB_UFLOAT , bptc, 4, 4, x128, , , , xyz1, rgb - -PIPE_FORMAT_ASTC_4x4 , astc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_5x4 , astc, 5, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_5x5 , astc, 5, 5, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_6x5 , astc, 6, 5, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_6x6 , astc, 6, 6, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_8x5 , astc, 8, 5, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_8x6 , astc, 8, 6, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_8x8 , astc, 8, 8, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_10x5 , astc,10, 5, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_10x6 , astc,10, 6, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_10x8 , astc,10, 8, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_10x10 , astc,10,10, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_12x10 , astc,12,10, x128, , , , xyzw, rgb -PIPE_FORMAT_ASTC_12x12 , astc,12,12, x128, , , , xyzw, rgb - -PIPE_FORMAT_ASTC_4x4_SRGB , astc, 4, 4, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_5x4_SRGB , astc, 5, 4, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_5x5_SRGB , astc, 5, 5, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_6x5_SRGB , astc, 6, 5, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_6x6_SRGB , astc, 6, 6, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_8x5_SRGB , astc, 8, 5, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_8x6_SRGB , astc, 8, 6, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_8x8_SRGB , astc, 8, 8, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_10x5_SRGB , astc,10, 5, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_10x6_SRGB , astc,10, 6, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_10x8_SRGB , astc,10, 8, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_10x10_SRGB , astc,10,10, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_12x10_SRGB , astc,12,10, x128, , , , xyzw, srgb -PIPE_FORMAT_ASTC_12x12_SRGB , astc,12,12, x128, , , , xyzw, srgb - -PIPE_FORMAT_ATC_RGB , atc, 4, 4, x64, , , , xyz1, rgb -PIPE_FORMAT_ATC_RGBA_EXPLICIT , atc, 4, 4, x128, , , , xyzw, rgb -PIPE_FORMAT_ATC_RGBA_INTERPOLATED , atc, 4, 4, x128, , , , xyzw, rgb - -# Straightforward D3D10-like formats (also used for -# vertex buffer element description) -# -# See also: -# - src/gallium/auxiliary/translate/translate_generic.c -# - src/mesa/state_tracker/st_draw.c -PIPE_FORMAT_R64_FLOAT , plain, 1, 1, f64 , , , , x001, rgb -PIPE_FORMAT_R64G64_FLOAT , plain, 1, 1, f64 , f64 , , , xy01, rgb -PIPE_FORMAT_R64G64B64_FLOAT , plain, 1, 1, f64 , f64 , f64 , , xyz1, rgb -PIPE_FORMAT_R64G64B64A64_FLOAT , plain, 1, 1, f64 , f64 , f64 , f64 , xyzw, rgb -PIPE_FORMAT_R32_FLOAT , plain, 1, 1, f32 , , , , x001, rgb -PIPE_FORMAT_R32G32_FLOAT , plain, 1, 1, f32 , f32 , , , xy01, rgb -PIPE_FORMAT_R32G32B32_FLOAT , plain, 1, 1, f32 , f32 , f32 , , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_FLOAT , plain, 1, 1, f32 , f32 , f32 , f32 , xyzw, rgb -PIPE_FORMAT_R32_UNORM , plain, 1, 1, un32, , , , x001, rgb -PIPE_FORMAT_R32G32_UNORM , plain, 1, 1, un32, un32, , , xy01, rgb -PIPE_FORMAT_R32G32B32_UNORM , plain, 1, 1, un32, un32, un32, , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_UNORM , plain, 1, 1, un32, un32, un32, un32, xyzw, rgb -PIPE_FORMAT_R32_USCALED , plain, 1, 1, u32 , , , , x001, rgb -PIPE_FORMAT_R32G32_USCALED , plain, 1, 1, u32 , u32 , , , xy01, rgb -PIPE_FORMAT_R32G32B32_USCALED , plain, 1, 1, u32 , u32 , u32 , , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_USCALED , plain, 1, 1, u32 , u32 , u32 , u32 , xyzw, rgb -PIPE_FORMAT_R32_SNORM , plain, 1, 1, sn32, , , , x001, rgb -PIPE_FORMAT_R32G32_SNORM , plain, 1, 1, sn32, sn32, , , xy01, rgb -PIPE_FORMAT_R32G32B32_SNORM , plain, 1, 1, sn32, sn32, sn32, , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_SNORM , plain, 1, 1, sn32, sn32, sn32, sn32, xyzw, rgb -PIPE_FORMAT_R32_SSCALED , plain, 1, 1, s32 , , , , x001, rgb -PIPE_FORMAT_R32G32_SSCALED , plain, 1, 1, s32 , s32 , , , xy01, rgb -PIPE_FORMAT_R32G32B32_SSCALED , plain, 1, 1, s32 , s32 , s32 , , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_SSCALED , plain, 1, 1, s32 , s32 , s32 , s32 , xyzw, rgb -PIPE_FORMAT_R16_FLOAT , plain, 1, 1, f16 , , , , x001, rgb -PIPE_FORMAT_R16G16_FLOAT , plain, 1, 1, f16 , f16 , , , xy01, rgb -PIPE_FORMAT_R16G16B16_FLOAT , plain, 1, 1, f16 , f16 , f16 , , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_FLOAT , plain, 1, 1, f16 , f16 , f16 , f16 , xyzw, rgb -PIPE_FORMAT_R16_UNORM , plain, 1, 1, un16, , , , x001, rgb -PIPE_FORMAT_R16G16_UNORM , plain, 1, 1, un16, un16, , , xy01, rgb -PIPE_FORMAT_R16G16B16_UNORM , plain, 1, 1, un16, un16, un16, , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_UNORM , plain, 1, 1, un16, un16, un16, un16, xyzw, rgb -PIPE_FORMAT_R16_USCALED , plain, 1, 1, u16 , , , , x001, rgb -PIPE_FORMAT_R16G16_USCALED , plain, 1, 1, u16 , u16 , , , xy01, rgb -PIPE_FORMAT_R16G16B16_USCALED , plain, 1, 1, u16 , u16 , u16 , , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_USCALED , plain, 1, 1, u16 , u16 , u16 , u16 , xyzw, rgb -PIPE_FORMAT_R16_SNORM , plain, 1, 1, sn16, , , , x001, rgb -PIPE_FORMAT_R16G16_SNORM , plain, 1, 1, sn16, sn16, , , xy01, rgb -PIPE_FORMAT_R16G16B16_SNORM , plain, 1, 1, sn16, sn16, sn16, , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_SNORM , plain, 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb -PIPE_FORMAT_R16_SSCALED , plain, 1, 1, s16 , , , , x001, rgb -PIPE_FORMAT_R16G16_SSCALED , plain, 1, 1, s16 , s16 , , , xy01, rgb -PIPE_FORMAT_R16G16B16_SSCALED , plain, 1, 1, s16 , s16 , s16 , , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_SSCALED , plain, 1, 1, s16 , s16 , s16 , s16 , xyzw, rgb -PIPE_FORMAT_R8_UNORM , plain, 1, 1, un8 , , , , x001, rgb -PIPE_FORMAT_R8G8_UNORM , plain, 1, 1, un8 , un8 , , , xy01, rgb -PIPE_FORMAT_R8G8B8_UNORM , plain, 1, 1, un8 , un8 , un8 , , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_UNORM , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb -PIPE_FORMAT_R8_USCALED , plain, 1, 1, u8 , , , , x001, rgb -PIPE_FORMAT_R8G8_USCALED , plain, 1, 1, u8 , u8 , , , xy01, rgb -PIPE_FORMAT_R8G8B8_USCALED , plain, 1, 1, u8 , u8 , u8 , , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_USCALED , plain, 1, 1, u8 , u8 , u8 , u8 , xyzw, rgb -PIPE_FORMAT_R8_SNORM , plain, 1, 1, sn8 , , , , x001, rgb -PIPE_FORMAT_R8G8_SNORM , plain, 1, 1, sn8 , sn8 , , , xy01, rgb -PIPE_FORMAT_R8G8B8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb -PIPE_FORMAT_R8_SSCALED , plain, 1, 1, s8 , , , , x001, rgb -PIPE_FORMAT_R8G8_SSCALED , plain, 1, 1, s8 , s8 , , , xy01, rgb -PIPE_FORMAT_R8G8B8_SSCALED , plain, 1, 1, s8 , s8 , s8 , , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_SSCALED , plain, 1, 1, s8 , s8 , s8 , s8 , xyzw, rgb - -# GL-specific vertex buffer element formats -# A.k.a. GL_FIXED -PIPE_FORMAT_R32_FIXED , plain, 1, 1, h32 , , , , x001, rgb -PIPE_FORMAT_R32G32_FIXED , plain, 1, 1, h32 , h32 , , , xy01, rgb -PIPE_FORMAT_R32G32B32_FIXED , plain, 1, 1, h32 , h32 , h32 , , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_FIXED , plain, 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb - -# D3D9-specific vertex buffer element formats -# See also: -# - http://msdn.microsoft.com/en-us/library/bb172533.aspx -# A.k.a. D3DDECLTYPE_UDEC3 -PIPE_FORMAT_R10G10B10X2_USCALED , plain, 1, 1, u10 , u10 , u10 , x2 , xyz1, rgb, x2 , u10 , u10 , u10 , wzy1 -# A.k.a. D3DDECLTYPE_DEC3N -PIPE_FORMAT_R10G10B10X2_SNORM , plain, 1, 1, sn10, sn10, sn10 , x2 , xyz1, rgb, x2 , sn10, sn10, sn10, wzy1 - -PIPE_FORMAT_YV12 , other, 1, 1, x8 , x8 , x8 , x8 , xyzw, yuv -PIPE_FORMAT_YV16 , other, 1, 1, x8 , x8 , x8 , x8 , xyzw, yuv -PIPE_FORMAT_IYUV , other, 1, 1, x8 , x8 , x8 , x8 , xyzw, yuv -PIPE_FORMAT_NV12 , other, 1, 1, x8 , x8 , x8 , x8 , xyzw, yuv -PIPE_FORMAT_NV21 , other, 1, 1, x8 , x8 , x8 , x8 , xyzw, yuv - -PIPE_FORMAT_P016 , other, 1, 1, x16 , x16 , , , xyzw, yuv - -# Usually used to implement IA44 and AI44 formats in video decoding -PIPE_FORMAT_A4R4_UNORM , plain, 1, 1, un4 , un4 , , , y00x, rgb, un4, un4 , , , x00y -PIPE_FORMAT_R4A4_UNORM , plain, 1, 1, un4 , un4 , , , x00y, rgb, un4, un4 , , , y00x -PIPE_FORMAT_R8A8_UNORM , plain, 1, 1, un8 , un8 , , , x00y, rgb -PIPE_FORMAT_A8R8_UNORM , plain, 1, 1, un8 , un8 , , , y00x, rgb - -# ARB_vertex_type_10_10_10_2_REV -PIPE_FORMAT_R10G10B10A2_USCALED , plain, 1, 1, u10 , u10 , u10 , u2 , xyzw, rgb, u2 , u10 , u10 , u10 , wzyx -PIPE_FORMAT_R10G10B10A2_SSCALED , plain, 1, 1, s10 , s10 , s10 , s2 , xyzw, rgb, s2 , s10 , s10 , s10 , wzyx -PIPE_FORMAT_R10G10B10A2_SNORM , plain, 1, 1, sn10, sn10, sn10, sn2 , xyzw, rgb, sn2 , sn10, sn10, sn10, wzyx -PIPE_FORMAT_B10G10R10A2_USCALED , plain, 1, 1, u10 , u10 , u10 , u2 , zyxw, rgb, u2 , u10 , u10 , u10 , yzwx -PIPE_FORMAT_B10G10R10A2_SSCALED , plain, 1, 1, s10 , s10 , s10 , s2 , zyxw, rgb, s2 , s10 , s10 , s10 , yzwx -PIPE_FORMAT_B10G10R10A2_SNORM , plain, 1, 1, sn10, sn10, sn10, sn2 , zyxw, rgb, sn2 , sn10, sn10, sn10, yzwx - -PIPE_FORMAT_R8_UINT , plain, 1, 1, up8, , , , x001, rgb -PIPE_FORMAT_R8G8_UINT , plain, 1, 1, up8, up8, , , xy01, rgb -PIPE_FORMAT_R8G8B8_UINT , plain, 1, 1, up8, up8, up8, , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_UINT , plain, 1, 1, up8, up8, up8, up8, xyzw, rgb - -PIPE_FORMAT_R8_SINT , plain, 1, 1, sp8, , , , x001, rgb -PIPE_FORMAT_R8G8_SINT , plain, 1, 1, sp8, sp8, , , xy01, rgb -PIPE_FORMAT_R8G8B8_SINT , plain, 1, 1, sp8, sp8, sp8, , xyz1, rgb -PIPE_FORMAT_R8G8B8A8_SINT , plain, 1, 1, sp8, sp8, sp8, sp8, xyzw, rgb - -PIPE_FORMAT_R16_UINT , plain, 1, 1, up16, , , , x001, rgb -PIPE_FORMAT_R16G16_UINT , plain, 1, 1, up16, up16, , , xy01, rgb -PIPE_FORMAT_R16G16B16_UINT , plain, 1, 1, up16, up16, up16, , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_UINT , plain, 1, 1, up16, up16, up16, up16, xyzw, rgb - -PIPE_FORMAT_R16_SINT , plain, 1, 1, sp16, , , , x001, rgb -PIPE_FORMAT_R16G16_SINT , plain, 1, 1, sp16, sp16, , , xy01, rgb -PIPE_FORMAT_R16G16B16_SINT , plain, 1, 1, sp16, sp16, sp16, , xyz1, rgb -PIPE_FORMAT_R16G16B16A16_SINT , plain, 1, 1, sp16, sp16, sp16, sp16, xyzw, rgb - -PIPE_FORMAT_R32_UINT , plain, 1, 1, up32, , , , x001, rgb -PIPE_FORMAT_R32G32_UINT , plain, 1, 1, up32, up32, , , xy01, rgb -PIPE_FORMAT_R32G32B32_UINT , plain, 1, 1, up32, up32, up32, , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_UINT , plain, 1, 1, up32, up32, up32, up32, xyzw, rgb - -PIPE_FORMAT_R32_SINT , plain, 1, 1, sp32, , , , x001, rgb -PIPE_FORMAT_R32G32_SINT , plain, 1, 1, sp32, sp32, , , xy01, rgb -PIPE_FORMAT_R32G32B32_SINT , plain, 1, 1, sp32, sp32, sp32, , xyz1, rgb -PIPE_FORMAT_R32G32B32A32_SINT , plain, 1, 1, sp32, sp32, sp32, sp32, xyzw, rgb - -PIPE_FORMAT_A8_UINT , plain, 1, 1, up8, , , , 000x, rgb -PIPE_FORMAT_I8_UINT , plain, 1, 1, up8, , , , xxxx, rgb -PIPE_FORMAT_L8_UINT , plain, 1, 1, up8, , , , xxx1, rgb -PIPE_FORMAT_L8A8_UINT , plain, 1, 1, up8, up8, , , xxxy, rgb - -PIPE_FORMAT_A8_SINT , plain, 1, 1, sp8, , , , 000x, rgb -PIPE_FORMAT_I8_SINT , plain, 1, 1, sp8, , , , xxxx, rgb -PIPE_FORMAT_L8_SINT , plain, 1, 1, sp8, , , , xxx1, rgb -PIPE_FORMAT_L8A8_SINT , plain, 1, 1, sp8, sp8, , , xxxy, rgb - -PIPE_FORMAT_A16_UINT , plain, 1, 1, up16, , , , 000x, rgb -PIPE_FORMAT_I16_UINT , plain, 1, 1, up16, , , , xxxx, rgb -PIPE_FORMAT_L16_UINT , plain, 1, 1, up16, , , , xxx1, rgb -PIPE_FORMAT_L16A16_UINT , plain, 1, 1, up16, up16, , , xxxy, rgb - -PIPE_FORMAT_A16_SINT , plain, 1, 1, sp16, , , , 000x, rgb -PIPE_FORMAT_I16_SINT , plain, 1, 1, sp16, , , , xxxx, rgb -PIPE_FORMAT_L16_SINT , plain, 1, 1, sp16, , , , xxx1, rgb -PIPE_FORMAT_L16A16_SINT , plain, 1, 1, sp16, sp16, , , xxxy, rgb - -PIPE_FORMAT_A32_UINT , plain, 1, 1, up32, , , , 000x, rgb -PIPE_FORMAT_I32_UINT , plain, 1, 1, up32, , , , xxxx, rgb -PIPE_FORMAT_L32_UINT , plain, 1, 1, up32, , , , xxx1, rgb -PIPE_FORMAT_L32A32_UINT , plain, 1, 1, up32, up32, , , xxxy, rgb - -PIPE_FORMAT_A32_SINT , plain, 1, 1, sp32, , , , 000x, rgb -PIPE_FORMAT_I32_SINT , plain, 1, 1, sp32, , , , xxxx, rgb -PIPE_FORMAT_L32_SINT , plain, 1, 1, sp32, , , , xxx1, rgb -PIPE_FORMAT_L32A32_SINT , plain, 1, 1, sp32, sp32, , , xxxy, rgb - -PIPE_FORMAT_B10G10R10A2_UINT , plain, 1, 1, up10, up10, up10, up2, zyxw, rgb, up2 , up10, up10, up10, yzwx - -PIPE_FORMAT_R8G8B8X8_SNORM , plain, 1, 1, sn8, sn8, sn8, x8, xyz1, rgb -PIPE_FORMAT_R8G8B8X8_SRGB , plain, 1, 1, un8, un8, un8, x8, xyz1, srgb -PIPE_FORMAT_R8G8B8X8_UINT , plain, 1, 1, up8, up8, up8, x8, xyz1, rgb -PIPE_FORMAT_R8G8B8X8_SINT , plain, 1, 1, sp8, sp8, sp8, x8, xyz1, rgb -PIPE_FORMAT_B10G10R10X2_UNORM , plain, 1, 1, un10, un10, un10, x2, zyx1, rgb, x2 , un10, un10, un10, yzw1 -PIPE_FORMAT_R16G16B16X16_UNORM , plain, 1, 1, un16, un16, un16, x16, xyz1, rgb -PIPE_FORMAT_R16G16B16X16_SNORM , plain, 1, 1, sn16, sn16, sn16, x16, xyz1, rgb -PIPE_FORMAT_R16G16B16X16_FLOAT , plain, 1, 1, f16, f16, f16, x16, xyz1, rgb -PIPE_FORMAT_R16G16B16X16_UINT , plain, 1, 1, up16, up16, up16, x16, xyz1, rgb -PIPE_FORMAT_R16G16B16X16_SINT , plain, 1, 1, sp16, sp16, sp16, x16, xyz1, rgb -PIPE_FORMAT_R32G32B32X32_FLOAT , plain, 1, 1, f32, f32, f32, x32, xyz1, rgb -PIPE_FORMAT_R32G32B32X32_UINT , plain, 1, 1, up32, up32, up32, x32, xyz1, rgb -PIPE_FORMAT_R32G32B32X32_SINT , plain, 1, 1, sp32, sp32, sp32, x32, xyz1, rgb - -PIPE_FORMAT_R8A8_SNORM , plain, 1, 1, sn8 , sn8 , , , x00y, rgb -PIPE_FORMAT_R16A16_UNORM , plain, 1, 1, un16 , un16 , , , x00y, rgb -PIPE_FORMAT_R16A16_SNORM , plain, 1, 1, sn16 , sn16 , , , x00y, rgb -PIPE_FORMAT_R16A16_FLOAT , plain, 1, 1, f16 , f16 , , , x00y, rgb -PIPE_FORMAT_R32A32_FLOAT , plain, 1, 1, f32 , f32 , , , x00y, rgb -PIPE_FORMAT_R8A8_UINT , plain, 1, 1, up8 , up8 , , , x00y, rgb -PIPE_FORMAT_R8A8_SINT , plain, 1, 1, sp8 , sp8 , , , x00y, rgb -PIPE_FORMAT_R16A16_UINT , plain, 1, 1, up16 , up16 , , , x00y, rgb -PIPE_FORMAT_R16A16_SINT , plain, 1, 1, sp16 , sp16 , , , x00y, rgb -PIPE_FORMAT_R32A32_UINT , plain, 1, 1, up32 , up32 , , , x00y, rgb -PIPE_FORMAT_R32A32_SINT , plain, 1, 1, sp32 , sp32 , , , x00y, rgb -PIPE_FORMAT_R10G10B10A2_UINT , plain, 1, 1, up10 , up10 , up10, up2 , xyzw, rgb, up2 , up10, up10, up10, wzyx - -PIPE_FORMAT_B5G6R5_SRGB , plain, 1, 1, un5 , un6 , un5 , , zyx1, srgb, un5 , un6 , un5 , , xyz1 - -PIPE_FORMAT_A8L8_UNORM , plain, 1, 1, un8 , un8 , , , yyyx, rgb -PIPE_FORMAT_A8L8_SNORM , plain, 1, 1, sn8 , sn8 , , , yyyx, rgb -PIPE_FORMAT_A8L8_SINT , plain, 1, 1, sp8 , sp8 , , , yyyx, rgb -PIPE_FORMAT_A8L8_SRGB , plain, 1, 1, un8 , un8 , , , yyyx, srgb -PIPE_FORMAT_A16L16_UNORM , plain, 1, 1, un16, un16, , , yyyx, rgb - -PIPE_FORMAT_G8R8_UNORM , plain, 1, 1, un8 , un8 , , , yx01, rgb -PIPE_FORMAT_G8R8_SNORM , plain, 1, 1, sn8 , sn8 , , , yx01, rgb -PIPE_FORMAT_G8R8_SINT , plain, 1, 1, sp8 , sp8 , , , yx01, rgb -PIPE_FORMAT_G16R16_UNORM , plain, 1, 1, un16, un16, , , yx01, rgb -PIPE_FORMAT_G16R16_SNORM , plain, 1, 1, sn16, sn16, , , yx01, rgb - -PIPE_FORMAT_A8B8G8R8_SNORM , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , wzyx, rgb -PIPE_FORMAT_A8B8G8R8_SINT , plain, 1, 1, sp8 , sp8 , sp8 , sp8 , wzyx, rgb -PIPE_FORMAT_X8B8G8R8_SNORM , plain, 1, 1, x8, sn8, sn8, sn8, wzy1, rgb -PIPE_FORMAT_X8B8G8R8_SINT , plain, 1, 1, x8, sp8, sp8, sp8, wzy1, rgb diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_etc.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_etc.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_etc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_etc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,84 +0,0 @@ -#include "pipe/p_compiler.h" -#include "util/u_debug.h" -#include "util/u_math.h" -#include "u_format_etc.h" - -/* define etc1_parse_block and etc. */ -#define UINT8_TYPE uint8_t -#define TAG(x) x -#include "../../../mesa/main/texcompress_etc_tmp.h" -#undef TAG -#undef UINT8_TYPE - -void -util_format_etc1_rgb8_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - etc1_unpack_rgba8888(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_etc1_rgb8_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - assert(0); -} - -void -util_format_etc1_rgb8_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, bs = 8, comps = 4; - struct etc1_block block; - unsigned x, y, i, j; - - for (y = 0; y < height; y += bh) { - const uint8_t *src = src_row; - - for (x = 0; x < width; x+= bw) { - etc1_parse_block(&block, src); - - for (j = 0; j < bh; j++) { - float *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + x * comps; - uint8_t tmp[3]; - - for (i = 0; i < bw; i++) { - etc1_fetch_texel(&block, i, j, tmp); - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = 1.0f; - dst += comps; - } - } - - src += bs; - } - - src_row += src_stride; - } -} - -void -util_format_etc1_rgb8_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - assert(0); -} - -void -util_format_etc1_rgb8_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - struct etc1_block block; - uint8_t tmp[3]; - - assert(i < 4 && j < 4); /* check i, j against 4x4 block size */ - - etc1_parse_block(&block, src); - etc1_fetch_texel(&block, i, j, tmp); - - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = 1.0f; -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_etc.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_etc.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_etc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_etc.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ -/************************************************************************** - * - * Copyright 2011 LunarG, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - -#ifndef U_FORMAT_ETC1_H_ -#define U_FORMAT_ETC1_H_ - -void -util_format_etc1_rgb8_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_etc1_rgb8_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_etc1_rgb8_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_etc1_rgb8_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_etc1_rgb8_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -#endif /* U_FORMAT_ETC1_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format.h mesa-20.0.8/src/gallium/auxiliary/util/u_format.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,1393 +0,0 @@ -/************************************************************************** - * - * Copyright 2009-2010 Vmware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#ifndef U_FORMAT_H -#define U_FORMAT_H - - -#include "pipe/p_format.h" -#include "pipe/p_defines.h" -#include "util/u_debug.h" - -union pipe_color_union; - - -#ifdef __cplusplus -extern "C" { -#endif - - -/** - * Describe how to pack/unpack pixels into/from the prescribed format. - * - * XXX: This could be renamed to something like util_format_pack, or broke down - * in flags inside util_format_block that said exactly what we want. - */ -enum util_format_layout { - /** - * Formats with util_format_block::width == util_format_block::height == 1 - * that can be described as an ordinary data structure. - */ - UTIL_FORMAT_LAYOUT_PLAIN = 0, - - /** - * Formats with sub-sampled channels. - * - * This is for formats like YVYU where there is less than one sample per - * pixel. - */ - UTIL_FORMAT_LAYOUT_SUBSAMPLED = 3, - - /** - * S3 Texture Compression formats. - */ - UTIL_FORMAT_LAYOUT_S3TC = 4, - - /** - * Red-Green Texture Compression formats. - */ - UTIL_FORMAT_LAYOUT_RGTC = 5, - - /** - * Ericsson Texture Compression - */ - UTIL_FORMAT_LAYOUT_ETC = 6, - - /** - * BC6/7 Texture Compression - */ - UTIL_FORMAT_LAYOUT_BPTC = 7, - - UTIL_FORMAT_LAYOUT_ASTC = 8, - - UTIL_FORMAT_LAYOUT_ATC = 9, - - /** - * Everything else that doesn't fit in any of the above layouts. - */ - UTIL_FORMAT_LAYOUT_OTHER = 10 -}; - - -struct util_format_block -{ - /** Block width in pixels */ - unsigned width; - - /** Block height in pixels */ - unsigned height; - - /** Block size in bits */ - unsigned bits; -}; - - -enum util_format_type { - UTIL_FORMAT_TYPE_VOID = 0, - UTIL_FORMAT_TYPE_UNSIGNED = 1, - UTIL_FORMAT_TYPE_SIGNED = 2, - UTIL_FORMAT_TYPE_FIXED = 3, - UTIL_FORMAT_TYPE_FLOAT = 4 -}; - - -enum util_format_colorspace { - UTIL_FORMAT_COLORSPACE_RGB = 0, - UTIL_FORMAT_COLORSPACE_SRGB = 1, - UTIL_FORMAT_COLORSPACE_YUV = 2, - UTIL_FORMAT_COLORSPACE_ZS = 3 -}; - - -struct util_format_channel_description -{ - unsigned type:5; /**< UTIL_FORMAT_TYPE_x */ - unsigned normalized:1; - unsigned pure_integer:1; - unsigned size:9; /**< bits per channel */ - unsigned shift:16; /** number of bits from lsb */ -}; - - -struct util_format_description -{ - enum pipe_format format; - - const char *name; - - /** - * Short name, striped of the prefix, lower case. - */ - const char *short_name; - - /** - * Pixel block dimensions. - */ - struct util_format_block block; - - enum util_format_layout layout; - - /** - * The number of channels. - */ - unsigned nr_channels:3; - - /** - * Whether all channels have the same number of (whole) bytes and type. - */ - unsigned is_array:1; - - /** - * Whether the pixel format can be described as a bitfield structure. - * - * In particular: - * - pixel depth must be 8, 16, or 32 bits; - * - all channels must be unsigned, signed, or void - */ - unsigned is_bitmask:1; - - /** - * Whether channels have mixed types (ignoring UTIL_FORMAT_TYPE_VOID). - */ - unsigned is_mixed:1; - - /** - * Whether the format contains UNORM channels - */ - unsigned is_unorm:1; - - /** - * Whether the format contains SNORM channels - */ - unsigned is_snorm:1; - - /** - * Input channel description, in the order XYZW. - * - * Only valid for UTIL_FORMAT_LAYOUT_PLAIN formats. - * - * If each channel is accessed as an individual N-byte value, X is always - * at the lowest address in memory, Y is always next, and so on. For all - * currently-defined formats, the N-byte value has native endianness. - * - * If instead a group of channels is accessed as a single N-byte value, - * the order of the channels within that value depends on endianness. - * For big-endian targets, X is the most significant subvalue, - * otherwise it is the least significant one. - * - * For example, if X is 8 bits and Y is 24 bits, the memory order is: - * - * 0 1 2 3 - * little-endian: X Yl Ym Yu (l = lower, m = middle, u = upper) - * big-endian: X Yu Ym Yl - * - * If X is 5 bits, Y is 5 bits, Z is 5 bits and W is 1 bit, the layout is: - * - * 0 1 - * msb lsb msb lsb - * little-endian: YYYXXXXX WZZZZZYY - * big-endian: XXXXXYYY YYZZZZZW - */ - struct util_format_channel_description channel[4]; - - /** - * Output channel swizzle. - * - * The order is either: - * - RGBA - * - YUV(A) - * - ZS - * depending on the colorspace. - */ - unsigned char swizzle[4]; - - /** - * Colorspace transformation. - */ - enum util_format_colorspace colorspace; - - /** - * Unpack pixel blocks to R8G8B8A8_UNORM. - * Note: strides are in bytes. - * - * Only defined for non-depth-stencil formats. - */ - void - (*unpack_rgba_8unorm)(uint8_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Pack pixel blocks from R8G8B8A8_UNORM. - * Note: strides are in bytes. - * - * Only defined for non-depth-stencil formats. - */ - void - (*pack_rgba_8unorm)(uint8_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Fetch a single pixel (i, j) from a block. - * - * XXX: Only defined for a very few select formats. - */ - void - (*fetch_rgba_8unorm)(uint8_t *dst, - const uint8_t *src, - unsigned i, unsigned j); - - /** - * Unpack pixel blocks to R32G32B32A32_FLOAT. - * Note: strides are in bytes. - * - * Only defined for non-depth-stencil formats. - */ - void - (*unpack_rgba_float)(float *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Pack pixel blocks from R32G32B32A32_FLOAT. - * Note: strides are in bytes. - * - * Only defined for non-depth-stencil formats. - */ - void - (*pack_rgba_float)(uint8_t *dst, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Fetch a single pixel (i, j) from a block. - * - * Only defined for non-depth-stencil and non-integer formats. - */ - void - (*fetch_rgba_float)(float *dst, - const uint8_t *src, - unsigned i, unsigned j); - - /** - * Unpack pixels to Z32_UNORM. - * Note: strides are in bytes. - * - * Only defined for depth formats. - */ - void - (*unpack_z_32unorm)(uint32_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Pack pixels from Z32_FLOAT. - * Note: strides are in bytes. - * - * Only defined for depth formats. - */ - void - (*pack_z_32unorm)(uint8_t *dst, unsigned dst_stride, - const uint32_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Unpack pixels to Z32_FLOAT. - * Note: strides are in bytes. - * - * Only defined for depth formats. - */ - void - (*unpack_z_float)(float *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Pack pixels from Z32_FLOAT. - * Note: strides are in bytes. - * - * Only defined for depth formats. - */ - void - (*pack_z_float)(uint8_t *dst, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Unpack pixels to S8_UINT. - * Note: strides are in bytes. - * - * Only defined for stencil formats. - */ - void - (*unpack_s_8uint)(uint8_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Pack pixels from S8_UINT. - * Note: strides are in bytes. - * - * Only defined for stencil formats. - */ - void - (*pack_s_8uint)(uint8_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Unpack pixel blocks to R32G32B32A32_UINT. - * Note: strides are in bytes. - * - * Only defined for INT formats. - */ - void - (*unpack_rgba_uint)(uint32_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - void - (*pack_rgba_uint)(uint8_t *dst, unsigned dst_stride, - const uint32_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Unpack pixel blocks to R32G32B32A32_SINT. - * Note: strides are in bytes. - * - * Only defined for INT formats. - */ - void - (*unpack_rgba_sint)(int32_t *dst, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height); - - void - (*pack_rgba_sint)(uint8_t *dst, unsigned dst_stride, - const int32_t *src, unsigned src_stride, - unsigned width, unsigned height); - - /** - * Fetch a single pixel (i, j) from a block. - * - * Only defined for unsigned (pure) integer formats. - */ - void - (*fetch_rgba_uint)(uint32_t *dst, - const uint8_t *src, - unsigned i, unsigned j); - - /** - * Fetch a single pixel (i, j) from a block. - * - * Only defined for signed (pure) integer formats. - */ - void - (*fetch_rgba_sint)(int32_t *dst, - const uint8_t *src, - unsigned i, unsigned j); -}; - - -extern const struct util_format_description -util_format_description_table[]; - - -const struct util_format_description * -util_format_description(enum pipe_format format); - - -/* - * Format query functions. - */ - -static inline const char * -util_format_name(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return "PIPE_FORMAT_???"; - } - - return desc->name; -} - -static inline const char * -util_format_short_name(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return "???"; - } - - return desc->short_name; -} - -/** - * Whether this format is plain, see UTIL_FORMAT_LAYOUT_PLAIN for more info. - */ -static inline boolean -util_format_is_plain(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - if (!format) { - return FALSE; - } - - return desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ? TRUE : FALSE; -} - -static inline boolean -util_format_is_compressed(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - switch (desc->layout) { - case UTIL_FORMAT_LAYOUT_S3TC: - case UTIL_FORMAT_LAYOUT_RGTC: - case UTIL_FORMAT_LAYOUT_ETC: - case UTIL_FORMAT_LAYOUT_BPTC: - case UTIL_FORMAT_LAYOUT_ASTC: - case UTIL_FORMAT_LAYOUT_ATC: - /* XXX add other formats in the future */ - return TRUE; - default: - return FALSE; - } -} - -static inline boolean -util_format_is_s3tc(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - return desc->layout == UTIL_FORMAT_LAYOUT_S3TC ? TRUE : FALSE; -} - -static inline boolean -util_format_is_etc(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - return desc->layout == UTIL_FORMAT_LAYOUT_ETC ? TRUE : FALSE; -} - -static inline boolean -util_format_is_srgb(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - return desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB; -} - -static inline boolean -util_format_has_depth(const struct util_format_description *desc) -{ - return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && - desc->swizzle[0] != PIPE_SWIZZLE_NONE; -} - -static inline boolean -util_format_has_stencil(const struct util_format_description *desc) -{ - return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && - desc->swizzle[1] != PIPE_SWIZZLE_NONE; -} - -static inline boolean -util_format_is_depth_or_stencil(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - return util_format_has_depth(desc) || - util_format_has_stencil(desc); -} - -static inline boolean -util_format_is_depth_and_stencil(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - return util_format_has_depth(desc) && - util_format_has_stencil(desc); -} - -/** - * For depth-stencil formats, return the equivalent depth-only format. - */ -static inline enum pipe_format -util_format_get_depth_only(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return PIPE_FORMAT_Z24X8_UNORM; - - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return PIPE_FORMAT_X8Z24_UNORM; - - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return PIPE_FORMAT_Z32_FLOAT; - - default: - return format; - } -} - -static inline boolean -util_format_is_yuv(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return FALSE; - } - - return desc->colorspace == UTIL_FORMAT_COLORSPACE_YUV; -} - -/** - * Calculates the depth format type based upon the incoming format description. - */ -static inline unsigned -util_get_depth_format_type(const struct util_format_description *desc) -{ - unsigned depth_channel = desc->swizzle[0]; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && - depth_channel != PIPE_SWIZZLE_NONE) { - return desc->channel[depth_channel].type; - } else { - return UTIL_FORMAT_TYPE_VOID; - } -} - - -/** - * Calculates the MRD for the depth format. MRD is used in depth bias - * for UNORM and unbound depth buffers. When the depth buffer is floating - * point, the depth bias calculation does not use the MRD. However, the - * default MRD will be 1.0 / ((1 << 24) - 1). - */ -double -util_get_depth_format_mrd(const struct util_format_description *desc); - - -/** - * Return whether this is an RGBA, Z, S, or combined ZS format. - * Useful for initializing pipe_blit_info::mask. - */ -static inline unsigned -util_format_get_mask(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - if (!desc) - return 0; - - if (util_format_has_depth(desc)) { - if (util_format_has_stencil(desc)) { - return PIPE_MASK_ZS; - } else { - return PIPE_MASK_Z; - } - } else { - if (util_format_has_stencil(desc)) { - return PIPE_MASK_S; - } else { - return PIPE_MASK_RGBA; - } - } -} - -/** - * Give the RGBA colormask of the channels that can be represented in this - * format. - * - * That is, the channels whose values are preserved. - */ -static inline unsigned -util_format_colormask(const struct util_format_description *desc) -{ - unsigned colormask; - unsigned chan; - - switch (desc->colorspace) { - case UTIL_FORMAT_COLORSPACE_RGB: - case UTIL_FORMAT_COLORSPACE_SRGB: - case UTIL_FORMAT_COLORSPACE_YUV: - colormask = 0; - for (chan = 0; chan < 4; ++chan) { - if (desc->swizzle[chan] < 4) { - colormask |= (1 << chan); - } - } - return colormask; - case UTIL_FORMAT_COLORSPACE_ZS: - return 0; - default: - assert(0); - return 0; - } -} - - -/** - * Checks if color mask covers every channel for the specified format - * - * @param desc a format description to check colormask with - * @param colormask a bit mask for channels, matches format of PIPE_MASK_RGBA - */ -static inline boolean -util_format_colormask_full(const struct util_format_description *desc, unsigned colormask) -{ - return (~colormask & util_format_colormask(desc)) == 0; -} - - -boolean -util_format_is_float(enum pipe_format format); - - -boolean -util_format_has_alpha(enum pipe_format format); - - -boolean -util_format_is_luminance(enum pipe_format format); - -boolean -util_format_is_alpha(enum pipe_format format); - -boolean -util_format_is_luminance_alpha(enum pipe_format format); - - -boolean -util_format_is_intensity(enum pipe_format format); - -boolean -util_format_is_subsampled_422(enum pipe_format format); - -boolean -util_format_is_pure_integer(enum pipe_format format); - -boolean -util_format_is_pure_sint(enum pipe_format format); - -boolean -util_format_is_pure_uint(enum pipe_format format); - -boolean -util_format_is_snorm(enum pipe_format format); - -boolean -util_format_is_unorm(enum pipe_format format); - -boolean -util_format_is_snorm8(enum pipe_format format); - -/** - * Check if the src format can be blitted to the destination format with - * a simple memcpy. For example, blitting from RGBA to RGBx is OK, but not - * the reverse. - */ -boolean -util_is_format_compatible(const struct util_format_description *src_desc, - const struct util_format_description *dst_desc); - -/** - * Whether this format is a rgab8 variant. - * - * That is, any format that matches the - * - * PIPE_FORMAT_?8?8?8?8_UNORM - */ -static inline boolean -util_format_is_rgba8_variant(const struct util_format_description *desc) -{ - unsigned chan; - - if(desc->block.width != 1 || - desc->block.height != 1 || - desc->block.bits != 32) - return FALSE; - - for(chan = 0; chan < 4; ++chan) { - if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED && - desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) - return FALSE; - if(desc->channel[chan].type == UTIL_FORMAT_TYPE_UNSIGNED && - !desc->channel[chan].normalized) - return FALSE; - if(desc->channel[chan].size != 8) - return FALSE; - } - - return TRUE; -} - -/** - * Return total bits needed for the pixel format per block. - */ -static inline uint -util_format_get_blocksizebits(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return 0; - } - - return desc->block.bits; -} - -/** - * Return bytes per block (not pixel) for the given format. - */ -static inline uint -util_format_get_blocksize(enum pipe_format format) -{ - uint bits = util_format_get_blocksizebits(format); - uint bytes = bits / 8; - - assert(bits % 8 == 0); - assert(bytes > 0); - if (bytes == 0) { - bytes = 1; - } - - return bytes; -} - -static inline uint -util_format_get_blockwidth(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return 1; - } - - return desc->block.width; -} - -static inline uint -util_format_get_blockheight(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - - assert(desc); - if (!desc) { - return 1; - } - - return desc->block.height; -} - -static inline unsigned -util_format_get_nblocksx(enum pipe_format format, - unsigned x) -{ - unsigned blockwidth = util_format_get_blockwidth(format); - return (x + blockwidth - 1) / blockwidth; -} - -static inline unsigned -util_format_get_nblocksy(enum pipe_format format, - unsigned y) -{ - unsigned blockheight = util_format_get_blockheight(format); - return (y + blockheight - 1) / blockheight; -} - -static inline unsigned -util_format_get_nblocks(enum pipe_format format, - unsigned width, - unsigned height) -{ - return util_format_get_nblocksx(format, width) * util_format_get_nblocksy(format, height); -} - -static inline size_t -util_format_get_stride(enum pipe_format format, - unsigned width) -{ - return (size_t)util_format_get_nblocksx(format, width) * util_format_get_blocksize(format); -} - -static inline size_t -util_format_get_2d_size(enum pipe_format format, - size_t stride, - unsigned height) -{ - return util_format_get_nblocksy(format, height) * stride; -} - -static inline uint -util_format_get_component_bits(enum pipe_format format, - enum util_format_colorspace colorspace, - uint component) -{ - const struct util_format_description *desc = util_format_description(format); - enum util_format_colorspace desc_colorspace; - - assert(format); - if (!format) { - return 0; - } - - assert(component < 4); - - /* Treat RGB and SRGB as equivalent. */ - if (colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - colorspace = UTIL_FORMAT_COLORSPACE_RGB; - } - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - desc_colorspace = UTIL_FORMAT_COLORSPACE_RGB; - } else { - desc_colorspace = desc->colorspace; - } - - if (desc_colorspace != colorspace) { - return 0; - } - - switch (desc->swizzle[component]) { - case PIPE_SWIZZLE_X: - return desc->channel[0].size; - case PIPE_SWIZZLE_Y: - return desc->channel[1].size; - case PIPE_SWIZZLE_Z: - return desc->channel[2].size; - case PIPE_SWIZZLE_W: - return desc->channel[3].size; - default: - return 0; - } -} - -/** - * Given a linear RGB colorspace format, return the corresponding SRGB - * format, or PIPE_FORMAT_NONE if none. - */ -static inline enum pipe_format -util_format_srgb(enum pipe_format format) -{ - if (util_format_is_srgb(format)) - return format; - - switch (format) { - case PIPE_FORMAT_L8_UNORM: - return PIPE_FORMAT_L8_SRGB; - case PIPE_FORMAT_R8_UNORM: - return PIPE_FORMAT_R8_SRGB; - case PIPE_FORMAT_L8A8_UNORM: - return PIPE_FORMAT_L8A8_SRGB; - case PIPE_FORMAT_R8G8B8_UNORM: - return PIPE_FORMAT_R8G8B8_SRGB; - case PIPE_FORMAT_A8B8G8R8_UNORM: - return PIPE_FORMAT_A8B8G8R8_SRGB; - case PIPE_FORMAT_X8B8G8R8_UNORM: - return PIPE_FORMAT_X8B8G8R8_SRGB; - case PIPE_FORMAT_B8G8R8A8_UNORM: - return PIPE_FORMAT_B8G8R8A8_SRGB; - case PIPE_FORMAT_B8G8R8X8_UNORM: - return PIPE_FORMAT_B8G8R8X8_SRGB; - case PIPE_FORMAT_A8R8G8B8_UNORM: - return PIPE_FORMAT_A8R8G8B8_SRGB; - case PIPE_FORMAT_X8R8G8B8_UNORM: - return PIPE_FORMAT_X8R8G8B8_SRGB; - case PIPE_FORMAT_R8G8B8A8_UNORM: - return PIPE_FORMAT_R8G8B8A8_SRGB; - case PIPE_FORMAT_R8G8B8X8_UNORM: - return PIPE_FORMAT_R8G8B8X8_SRGB; - case PIPE_FORMAT_DXT1_RGB: - return PIPE_FORMAT_DXT1_SRGB; - case PIPE_FORMAT_DXT1_RGBA: - return PIPE_FORMAT_DXT1_SRGBA; - case PIPE_FORMAT_DXT3_RGBA: - return PIPE_FORMAT_DXT3_SRGBA; - case PIPE_FORMAT_DXT5_RGBA: - return PIPE_FORMAT_DXT5_SRGBA; - case PIPE_FORMAT_B5G6R5_UNORM: - return PIPE_FORMAT_B5G6R5_SRGB; - case PIPE_FORMAT_BPTC_RGBA_UNORM: - return PIPE_FORMAT_BPTC_SRGBA; - case PIPE_FORMAT_ASTC_4x4: - return PIPE_FORMAT_ASTC_4x4_SRGB; - case PIPE_FORMAT_ASTC_5x4: - return PIPE_FORMAT_ASTC_5x4_SRGB; - case PIPE_FORMAT_ASTC_5x5: - return PIPE_FORMAT_ASTC_5x5_SRGB; - case PIPE_FORMAT_ASTC_6x5: - return PIPE_FORMAT_ASTC_6x5_SRGB; - case PIPE_FORMAT_ASTC_6x6: - return PIPE_FORMAT_ASTC_6x6_SRGB; - case PIPE_FORMAT_ASTC_8x5: - return PIPE_FORMAT_ASTC_8x5_SRGB; - case PIPE_FORMAT_ASTC_8x6: - return PIPE_FORMAT_ASTC_8x6_SRGB; - case PIPE_FORMAT_ASTC_8x8: - return PIPE_FORMAT_ASTC_8x8_SRGB; - case PIPE_FORMAT_ASTC_10x5: - return PIPE_FORMAT_ASTC_10x5_SRGB; - case PIPE_FORMAT_ASTC_10x6: - return PIPE_FORMAT_ASTC_10x6_SRGB; - case PIPE_FORMAT_ASTC_10x8: - return PIPE_FORMAT_ASTC_10x8_SRGB; - case PIPE_FORMAT_ASTC_10x10: - return PIPE_FORMAT_ASTC_10x10_SRGB; - case PIPE_FORMAT_ASTC_12x10: - return PIPE_FORMAT_ASTC_12x10_SRGB; - case PIPE_FORMAT_ASTC_12x12: - return PIPE_FORMAT_ASTC_12x12_SRGB; - - default: - return PIPE_FORMAT_NONE; - } -} - -/** - * Given an sRGB format, return the corresponding linear colorspace format. - * For non sRGB formats, return the format unchanged. - */ -static inline enum pipe_format -util_format_linear(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_L8_SRGB: - return PIPE_FORMAT_L8_UNORM; - case PIPE_FORMAT_R8_SRGB: - return PIPE_FORMAT_R8_UNORM; - case PIPE_FORMAT_L8A8_SRGB: - return PIPE_FORMAT_L8A8_UNORM; - case PIPE_FORMAT_R8G8B8_SRGB: - return PIPE_FORMAT_R8G8B8_UNORM; - case PIPE_FORMAT_A8B8G8R8_SRGB: - return PIPE_FORMAT_A8B8G8R8_UNORM; - case PIPE_FORMAT_X8B8G8R8_SRGB: - return PIPE_FORMAT_X8B8G8R8_UNORM; - case PIPE_FORMAT_B8G8R8A8_SRGB: - return PIPE_FORMAT_B8G8R8A8_UNORM; - case PIPE_FORMAT_B8G8R8X8_SRGB: - return PIPE_FORMAT_B8G8R8X8_UNORM; - case PIPE_FORMAT_A8R8G8B8_SRGB: - return PIPE_FORMAT_A8R8G8B8_UNORM; - case PIPE_FORMAT_X8R8G8B8_SRGB: - return PIPE_FORMAT_X8R8G8B8_UNORM; - case PIPE_FORMAT_R8G8B8A8_SRGB: - return PIPE_FORMAT_R8G8B8A8_UNORM; - case PIPE_FORMAT_R8G8B8X8_SRGB: - return PIPE_FORMAT_R8G8B8X8_UNORM; - case PIPE_FORMAT_DXT1_SRGB: - return PIPE_FORMAT_DXT1_RGB; - case PIPE_FORMAT_DXT1_SRGBA: - return PIPE_FORMAT_DXT1_RGBA; - case PIPE_FORMAT_DXT3_SRGBA: - return PIPE_FORMAT_DXT3_RGBA; - case PIPE_FORMAT_DXT5_SRGBA: - return PIPE_FORMAT_DXT5_RGBA; - case PIPE_FORMAT_B5G6R5_SRGB: - return PIPE_FORMAT_B5G6R5_UNORM; - case PIPE_FORMAT_BPTC_SRGBA: - return PIPE_FORMAT_BPTC_RGBA_UNORM; - case PIPE_FORMAT_ASTC_4x4_SRGB: - return PIPE_FORMAT_ASTC_4x4; - case PIPE_FORMAT_ASTC_5x4_SRGB: - return PIPE_FORMAT_ASTC_5x4; - case PIPE_FORMAT_ASTC_5x5_SRGB: - return PIPE_FORMAT_ASTC_5x5; - case PIPE_FORMAT_ASTC_6x5_SRGB: - return PIPE_FORMAT_ASTC_6x5; - case PIPE_FORMAT_ASTC_6x6_SRGB: - return PIPE_FORMAT_ASTC_6x6; - case PIPE_FORMAT_ASTC_8x5_SRGB: - return PIPE_FORMAT_ASTC_8x5; - case PIPE_FORMAT_ASTC_8x6_SRGB: - return PIPE_FORMAT_ASTC_8x6; - case PIPE_FORMAT_ASTC_8x8_SRGB: - return PIPE_FORMAT_ASTC_8x8; - case PIPE_FORMAT_ASTC_10x5_SRGB: - return PIPE_FORMAT_ASTC_10x5; - case PIPE_FORMAT_ASTC_10x6_SRGB: - return PIPE_FORMAT_ASTC_10x6; - case PIPE_FORMAT_ASTC_10x8_SRGB: - return PIPE_FORMAT_ASTC_10x8; - case PIPE_FORMAT_ASTC_10x10_SRGB: - return PIPE_FORMAT_ASTC_10x10; - case PIPE_FORMAT_ASTC_12x10_SRGB: - return PIPE_FORMAT_ASTC_12x10; - case PIPE_FORMAT_ASTC_12x12_SRGB: - return PIPE_FORMAT_ASTC_12x12; - default: - return format; - } -} - -/** - * Given a depth-stencil format, return the corresponding stencil-only format. - * For stencil-only formats, return the format unchanged. - */ -static inline enum pipe_format -util_format_stencil_only(enum pipe_format format) -{ - switch (format) { - /* mask out the depth component */ - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return PIPE_FORMAT_X24S8_UINT; - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return PIPE_FORMAT_S8X24_UINT; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return PIPE_FORMAT_X32_S8X24_UINT; - - /* stencil only formats */ - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_S8_UINT: - return format; - - default: - assert(0); - return PIPE_FORMAT_NONE; - } -} - -/** - * Converts PIPE_FORMAT_*I* to PIPE_FORMAT_*R*. - * This is identity for non-intensity formats. - */ -static inline enum pipe_format -util_format_intensity_to_red(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_I8_UNORM: - return PIPE_FORMAT_R8_UNORM; - case PIPE_FORMAT_I8_SNORM: - return PIPE_FORMAT_R8_SNORM; - case PIPE_FORMAT_I16_UNORM: - return PIPE_FORMAT_R16_UNORM; - case PIPE_FORMAT_I16_SNORM: - return PIPE_FORMAT_R16_SNORM; - case PIPE_FORMAT_I16_FLOAT: - return PIPE_FORMAT_R16_FLOAT; - case PIPE_FORMAT_I32_FLOAT: - return PIPE_FORMAT_R32_FLOAT; - case PIPE_FORMAT_I8_UINT: - return PIPE_FORMAT_R8_UINT; - case PIPE_FORMAT_I8_SINT: - return PIPE_FORMAT_R8_SINT; - case PIPE_FORMAT_I16_UINT: - return PIPE_FORMAT_R16_UINT; - case PIPE_FORMAT_I16_SINT: - return PIPE_FORMAT_R16_SINT; - case PIPE_FORMAT_I32_UINT: - return PIPE_FORMAT_R32_UINT; - case PIPE_FORMAT_I32_SINT: - return PIPE_FORMAT_R32_SINT; - default: - assert(!util_format_is_intensity(format)); - return format; - } -} - -/** - * Converts PIPE_FORMAT_*L* to PIPE_FORMAT_*R*. - * This is identity for non-luminance formats. - */ -static inline enum pipe_format -util_format_luminance_to_red(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_L8_UNORM: - return PIPE_FORMAT_R8_UNORM; - case PIPE_FORMAT_L8_SNORM: - return PIPE_FORMAT_R8_SNORM; - case PIPE_FORMAT_L16_UNORM: - return PIPE_FORMAT_R16_UNORM; - case PIPE_FORMAT_L16_SNORM: - return PIPE_FORMAT_R16_SNORM; - case PIPE_FORMAT_L16_FLOAT: - return PIPE_FORMAT_R16_FLOAT; - case PIPE_FORMAT_L32_FLOAT: - return PIPE_FORMAT_R32_FLOAT; - case PIPE_FORMAT_L8_UINT: - return PIPE_FORMAT_R8_UINT; - case PIPE_FORMAT_L8_SINT: - return PIPE_FORMAT_R8_SINT; - case PIPE_FORMAT_L16_UINT: - return PIPE_FORMAT_R16_UINT; - case PIPE_FORMAT_L16_SINT: - return PIPE_FORMAT_R16_SINT; - case PIPE_FORMAT_L32_UINT: - return PIPE_FORMAT_R32_UINT; - case PIPE_FORMAT_L32_SINT: - return PIPE_FORMAT_R32_SINT; - - case PIPE_FORMAT_LATC1_UNORM: - return PIPE_FORMAT_RGTC1_UNORM; - case PIPE_FORMAT_LATC1_SNORM: - return PIPE_FORMAT_RGTC1_SNORM; - - case PIPE_FORMAT_L4A4_UNORM: - return PIPE_FORMAT_R4A4_UNORM; - - case PIPE_FORMAT_L8A8_UNORM: - return PIPE_FORMAT_R8A8_UNORM; - case PIPE_FORMAT_L8A8_SNORM: - return PIPE_FORMAT_R8A8_SNORM; - case PIPE_FORMAT_L16A16_UNORM: - return PIPE_FORMAT_R16A16_UNORM; - case PIPE_FORMAT_L16A16_SNORM: - return PIPE_FORMAT_R16A16_SNORM; - case PIPE_FORMAT_L16A16_FLOAT: - return PIPE_FORMAT_R16A16_FLOAT; - case PIPE_FORMAT_L32A32_FLOAT: - return PIPE_FORMAT_R32A32_FLOAT; - case PIPE_FORMAT_L8A8_UINT: - return PIPE_FORMAT_R8A8_UINT; - case PIPE_FORMAT_L8A8_SINT: - return PIPE_FORMAT_R8A8_SINT; - case PIPE_FORMAT_L16A16_UINT: - return PIPE_FORMAT_R16A16_UINT; - case PIPE_FORMAT_L16A16_SINT: - return PIPE_FORMAT_R16A16_SINT; - case PIPE_FORMAT_L32A32_UINT: - return PIPE_FORMAT_R32A32_UINT; - case PIPE_FORMAT_L32A32_SINT: - return PIPE_FORMAT_R32A32_SINT; - - /* We don't have compressed red-alpha variants for these. */ - case PIPE_FORMAT_LATC2_UNORM: - case PIPE_FORMAT_LATC2_SNORM: - return PIPE_FORMAT_NONE; - - default: - assert(!util_format_is_luminance(format) && - !util_format_is_luminance_alpha(format)); - return format; - } -} - -/** - * Return the number of components stored. - * Formats with block size != 1x1 will always have 1 component (the block). - */ -static inline unsigned -util_format_get_nr_components(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - return desc->nr_channels; -} - -/** - * Return the index of the first non-void channel - * -1 if no non-void channels - */ -static inline int -util_format_get_first_non_void_channel(enum pipe_format format) -{ - const struct util_format_description *desc = util_format_description(format); - int i; - - for (i = 0; i < 4; i++) - if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) - break; - - if (i == 4) - return -1; - - return i; -} - -/** - * Whether this format is any 8-bit UNORM variant. Looser than - * util_is_rgba8_variant (also includes alpha textures, for instance). - */ - -static inline bool -util_format_is_unorm8(const struct util_format_description *desc) -{ - int c = util_format_get_first_non_void_channel(desc->format); - - if (c == -1) - return false; - - return desc->is_unorm && desc->is_array && desc->channel[c].size == 8; -} - -/* - * Format access functions. - */ - -void -util_format_read_4f(enum pipe_format format, - float *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_write_4f(enum pipe_format format, - const float *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_read_4ub(enum pipe_format format, - uint8_t *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_write_4ub(enum pipe_format format, - const uint8_t *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_read_4ui(enum pipe_format format, - unsigned *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_write_4ui(enum pipe_format format, - const unsigned int *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_read_4i(enum pipe_format format, - int *dst, unsigned dst_stride, - const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -void -util_format_write_4i(enum pipe_format format, - const int *src, unsigned src_stride, - void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h); - -/* - * Generic format conversion; - */ - -boolean -util_format_fits_8unorm(const struct util_format_description *format_desc); - -boolean -util_format_translate(enum pipe_format dst_format, - void *dst, unsigned dst_stride, - unsigned dst_x, unsigned dst_y, - enum pipe_format src_format, - const void *src, unsigned src_stride, - unsigned src_x, unsigned src_y, - unsigned width, unsigned height); - -boolean -util_format_translate_3d(enum pipe_format dst_format, - void *dst, unsigned dst_stride, - unsigned dst_slice_stride, - unsigned dst_x, unsigned dst_y, - unsigned dst_z, - enum pipe_format src_format, - const void *src, unsigned src_stride, - unsigned src_slice_stride, - unsigned src_x, unsigned src_y, - unsigned src_z, unsigned width, - unsigned height, unsigned depth); - -/* - * Swizzle operations. - */ - -/* Compose two sets of swizzles. - * If V is a 4D vector and the function parameters represent functions that - * swizzle vector components, this holds: - * swz2(swz1(V)) = dst(V) - */ -void util_format_compose_swizzles(const unsigned char swz1[4], - const unsigned char swz2[4], - unsigned char dst[4]); - -/* Apply the swizzle provided in \param swz (which is one of PIPE_SWIZZLE_x) - * to \param src and store the result in \param dst. - * \param is_integer determines the value written for PIPE_SWIZZLE_1. - */ -void util_format_apply_color_swizzle(union pipe_color_union *dst, - const union pipe_color_union *src, - const unsigned char swz[4], - const boolean is_integer); - -void pipe_swizzle_4f(float *dst, const float *src, - const unsigned char swz[4]); - -void util_format_unswizzle_4f(float *dst, const float *src, - const unsigned char swz[4]); - -enum pipe_format -util_format_snorm8_to_sint8(enum pipe_format format); - -#ifdef __cplusplus -} // extern "C" { -#endif - -#endif /* ! U_FORMAT_H */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_latc.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_latc.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_latc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_latc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,306 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 2011 Red Hat Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include -#include "u_format.h" -#include "u_format_rgtc.h" -#include "u_format_latc.h" -#include "util/rgtc.h" -#include "util/u_math.h" - -void -util_format_latc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - /* Fix warnings here: */ - (void) util_format_unsigned_encode_rgtc_ubyte; - (void) util_format_signed_encode_rgtc_ubyte; - - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1); - dst[1] = dst[0]; - dst[2] = dst[0]; - dst[3] = 255; -} - -void -util_format_latc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc1_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, - unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc1_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 8; - - for(y = 0; y < height; y += 4) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - uint8_t tmp_r; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = - dst[1] = - dst[2] = ubyte_to_float(tmp_r); - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_latc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc1_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp_r; - - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = - dst[1] = - dst[2] = ubyte_to_float(tmp_r); - dst[3] = 1.0; -} - -void -util_format_latc1_snorm_fetch_rgba_8unorm(UNUSED uint8_t *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc1_snorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc1_snorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc1_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 8; - - for(y = 0; y < height; y += 4) { - const int8_t *src = (int8_t *)src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - int8_t tmp_r; - util_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = - dst[1] = - dst[2] = byte_to_float_tex(tmp_r); - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_latc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - int8_t tmp_r; - - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 1); - dst[0] = - dst[1] = - dst[2] = byte_to_float_tex(tmp_r); - dst[3] = 1.0; -} - - -void -util_format_latc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2); - dst[1] = dst[0]; - dst[2] = dst[0]; - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 3, 2); -} - -void -util_format_latc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc2_unorm_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rgtc2_unorm_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, width, height); -} - -void -util_format_latc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3); -} - -void -util_format_latc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 16; - - for(y = 0; y < height; y += 4) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - uint8_t tmp_r, tmp_g; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = - dst[1] = - dst[2] = ubyte_to_float(tmp_r); - dst[3] = ubyte_to_float(tmp_g); - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_latc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp_r, tmp_g; - - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = - dst[1] = - dst[2] = ubyte_to_float(tmp_r); - dst[3] = ubyte_to_float(tmp_g); -} - - -void -util_format_latc2_snorm_fetch_rgba_8unorm(UNUSED uint8_t *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc2_snorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc2_snorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_latc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 16; - - for(y = 0; y < height; y += 4) { - const int8_t *src = (int8_t *)src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - int8_t tmp_r, tmp_g; - util_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = - dst[1] = - dst[2] = byte_to_float_tex(tmp_r); - dst[3] = byte_to_float_tex(tmp_g); - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_latc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 3); -} - -void -util_format_latc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - int8_t tmp_r, tmp_g; - - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 2); - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &tmp_g, 2); - dst[0] = - dst[1] = - dst[2] = byte_to_float_tex(tmp_r); - dst[3] = byte_to_float_tex(tmp_g); -} - diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_latc.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_latc.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_latc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_latc.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,108 +0,0 @@ -/************************************************************************** - * - * Copyright 2011 Red Hat Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - -#ifndef U_FORMAT_LATC_H_ -#define U_FORMAT_LATC_H_ - -void -util_format_latc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_latc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - - -void -util_format_latc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_latc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -void -util_format_latc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_latc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -void -util_format_latc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_latc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_latc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_other.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_other.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_other.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_other.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,422 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#include "u_format_other.h" -#include "util/u_math.h" -#include "util/format_rgb9e5.h" -#include "util/format_r11g11b10f.h" - - -void -util_format_r9g9b9e5_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint8_t *src = src_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - rgb9e5_to_float3(value, dst); - dst[3] = 1; /* a */ - src += 4; - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_r9g9b9e5_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - const float *src = src_row; - uint8_t *dst = dst_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(float3_to_rgb9e5(src)); - *(uint32_t *)dst = value; - src += 4; - dst += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_r9g9b9e5_float_fetch_rgba_float(float *dst, const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - rgb9e5_to_float3(value, dst); - dst[3] = 1; /* a */ -} - - -void -util_format_r9g9b9e5_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - float p[3]; - for(y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint8_t *src = src_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - rgb9e5_to_float3(value, p); - dst[0] = float_to_ubyte(p[0]); /* r */ - dst[1] = float_to_ubyte(p[1]); /* g */ - dst[2] = float_to_ubyte(p[2]); /* b */ - dst[3] = 255; /* a */ - src += 4; - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r9g9b9e5_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - float p[3]; - for(y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint8_t *dst = dst_row; - for(x = 0; x < width; x += 1) { - uint32_t value; - p[0] = ubyte_to_float(src[0]); - p[1] = ubyte_to_float(src[1]); - p[2] = ubyte_to_float(src[2]); - value = util_cpu_to_le32(float3_to_rgb9e5(p)); - *(uint32_t *)dst = value; - src += 4; - dst += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r11g11b10_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint8_t *src = src_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - r11g11b10f_to_float3(value, dst); - dst[3] = 1; /* a */ - src += 4; - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_r11g11b10_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - const float *src = src_row; - uint8_t *dst = dst_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(float3_to_r11g11b10f(src)); - *(uint32_t *)dst = value; - src += 4; - dst += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_r11g11b10_float_fetch_rgba_float(float *dst, const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - r11g11b10f_to_float3(value, dst); - dst[3] = 1; /* a */ -} - - -void -util_format_r11g11b10_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - float p[3]; - for(y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint8_t *src = src_row; - for(x = 0; x < width; x += 1) { - uint32_t value = util_cpu_to_le32(*(const uint32_t *)src); - r11g11b10f_to_float3(value, p); - dst[0] = float_to_ubyte(p[0]); /* r */ - dst[1] = float_to_ubyte(p[1]); /* g */ - dst[2] = float_to_ubyte(p[2]); /* b */ - dst[3] = 255; /* a */ - src += 4; - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r11g11b10_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - float p[3]; - for(y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint8_t *dst = dst_row; - for(x = 0; x < width; x += 1) { - uint32_t value; - p[0] = ubyte_to_float(src[0]); - p[1] = ubyte_to_float(src[1]); - p[2] = ubyte_to_float(src[2]); - value = util_cpu_to_le32(float3_to_r11g11b10f(p)); - *(uint32_t *)dst = value; - src += 4; - dst += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r1_unorm_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - -} - - -void -util_format_r1_unorm_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - -} - - -void -util_format_r1_unorm_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - -} - - -void -util_format_r1_unorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - -} - - -void -util_format_r1_unorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ -} - - -/* - * PIPE_FORMAT_R8G8Bx_SNORM - * - * A.k.a. D3DFMT_CxV8U8 - */ - -static uint8_t -r8g8bx_derive(int16_t r, int16_t g) -{ - /* Derive blue from red and green components. - * Apparently, we must always use integers to perform calculations, - * otherwise the results won't match D3D's CxV8U8 definition. - */ - return (uint8_t)sqrtf(0x7f * 0x7f - r * r - g * g) * 0xff / 0x7f; -} - -void -util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for(y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint16_t *src = (const uint16_t *)src_row; - for(x = 0; x < width; x += 1) { - uint16_t value = util_cpu_to_le16(*src++); - int16_t r, g; - - r = ((int16_t)(value << 8)) >> 8; - g = ((int16_t)(value << 0)) >> 8; - - dst[0] = (float)(r * (1.0f/0x7f)); /* r */ - dst[1] = (float)(g * (1.0f/0x7f)); /* g */ - dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */ - dst[3] = 1.0f; /* a */ - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint16_t *src = (const uint16_t *)src_row; - for(x = 0; x < width; x += 1) { - uint16_t value = util_cpu_to_le16(*src++); - int16_t r, g; - - r = ((int16_t)(value << 8)) >> 8; - g = ((int16_t)(value << 0)) >> 8; - - dst[0] = (uint8_t)(((uint16_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */ - dst[1] = (uint8_t)(((uint16_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */ - dst[2] = r8g8bx_derive(r, g); /* b */ - dst[3] = 255; /* a */ - dst += 4; - } - src_row += src_stride; - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r8g8bx_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; y += 1) { - const float *src = src_row; - uint16_t *dst = (uint16_t *)dst_row; - for(x = 0; x < width; x += 1) { - uint16_t value = 0; - - value |= (uint16_t)(((int8_t)(CLAMP(src[0], -1, 1) * 0x7f)) & 0xff) ; - value |= (uint16_t)((((int8_t)(CLAMP(src[1], -1, 1) * 0x7f)) & 0xff) << 8) ; - - *dst++ = util_le16_to_cpu(value); - - src += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r8g8bx_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for(y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint16_t *dst = (uint16_t *)dst_row; - for(x = 0; x < width; x += 1) { - uint16_t value = 0; - - value |= src[0] >> 1; - value |= (src[1] >> 1) << 8; - - *dst++ = util_le16_to_cpu(value); - - src += 4; - } - dst_row += dst_stride; - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - uint16_t value = util_cpu_to_le16(*(const uint16_t *)src); - int16_t r, g; - - r = ((int16_t)(value << 8)) >> 8; - g = ((int16_t)(value << 0)) >> 8; - - dst[0] = r * (1.0f/0x7f); /* r */ - dst[1] = g * (1.0f/0x7f); /* g */ - dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */ - dst[3] = 1.0f; /* a */ -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_other.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_other.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_other.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_other.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,134 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#ifndef U_FORMAT_OTHER_H_ -#define U_FORMAT_OTHER_H_ - - -#include "pipe/p_compiler.h" - - -void -util_format_r9g9b9e5_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r9g9b9e5_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r9g9b9e5_float_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_r9g9b9e5_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r9g9b9e5_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - - -void -util_format_r11g11b10_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r11g11b10_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r11g11b10_float_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_r11g11b10_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r11g11b10_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - - -void -util_format_r1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_r1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8bx_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8bx_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -#endif /* U_FORMAT_OTHER_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_pack.py mesa-20.0.8/src/gallium/auxiliary/util/u_format_pack.py --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_pack.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_pack.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,763 +0,0 @@ - -''' -/************************************************************************** - * - * Copyright 2009-2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/** - * @file - * Pixel format packing and unpacking functions. - * - * @author Jose Fonseca - */ -''' - - -from __future__ import division, print_function - -import sys - -from u_format_parse import * - - -if sys.version_info < (3, 0): - integer_types = (int, long) - -else: - integer_types = (int, ) - - -def inv_swizzles(swizzles): - '''Return an array[4] of inverse swizzle terms''' - '''Only pick the first matching value to avoid l8 getting blue and i8 getting alpha''' - inv_swizzle = [None]*4 - for i in range(4): - swizzle = swizzles[i] - if swizzle < 4 and inv_swizzle[swizzle] == None: - inv_swizzle[swizzle] = i - return inv_swizzle - -def print_channels(format, func): - if format.nr_channels() <= 1: - func(format.le_channels, format.le_swizzles) - else: - print('#ifdef PIPE_ARCH_BIG_ENDIAN') - func(format.be_channels, format.be_swizzles) - print('#else') - func(format.le_channels, format.le_swizzles) - print('#endif') - -def generate_format_type(format): - '''Generate a structure that describes the format.''' - - assert format.layout == PLAIN - - def generate_bitfields(channels, swizzles): - for channel in channels: - if channel.type == VOID: - if channel.size: - print(' unsigned %s:%u;' % (channel.name, channel.size)) - elif channel.type == UNSIGNED: - print(' unsigned %s:%u;' % (channel.name, channel.size)) - elif channel.type in (SIGNED, FIXED): - print(' int %s:%u;' % (channel.name, channel.size)) - elif channel.type == FLOAT: - if channel.size == 64: - print(' double %s;' % (channel.name)) - elif channel.size == 32: - print(' float %s;' % (channel.name)) - else: - print(' unsigned %s:%u;' % (channel.name, channel.size)) - else: - assert 0 - - def generate_full_fields(channels, swizzles): - for channel in channels: - assert channel.size % 8 == 0 and is_pot(channel.size) - if channel.type == VOID: - if channel.size: - print(' uint%u_t %s;' % (channel.size, channel.name)) - elif channel.type == UNSIGNED: - print(' uint%u_t %s;' % (channel.size, channel.name)) - elif channel.type in (SIGNED, FIXED): - print(' int%u_t %s;' % (channel.size, channel.name)) - elif channel.type == FLOAT: - if channel.size == 64: - print(' double %s;' % (channel.name)) - elif channel.size == 32: - print(' float %s;' % (channel.name)) - elif channel.size == 16: - print(' uint16_t %s;' % (channel.name)) - else: - assert 0 - else: - assert 0 - - print('union util_format_%s {' % format.short_name()) - - if format.block_size() in (8, 16, 32, 64): - print(' uint%u_t value;' % (format.block_size(),)) - - use_bitfields = False - for channel in format.le_channels: - if channel.size % 8 or not is_pot(channel.size): - use_bitfields = True - - print(' struct {') - if use_bitfields: - print_channels(format, generate_bitfields) - else: - print_channels(format, generate_full_fields) - print(' } chan;') - print('};') - print() - - -def is_format_supported(format): - '''Determines whether we actually have the plumbing necessary to generate the - to read/write to/from this format.''' - - # FIXME: Ideally we would support any format combination here. - - if format.layout != PLAIN: - return False - - for i in range(4): - channel = format.le_channels[i] - if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT, FIXED): - return False - if channel.type == FLOAT and channel.size not in (16, 32, 64): - return False - - return True - -def native_type(format): - '''Get the native appropriate for a format.''' - - if format.name == 'PIPE_FORMAT_R11G11B10_FLOAT': - return 'uint32_t' - if format.name == 'PIPE_FORMAT_R9G9B9E5_FLOAT': - return 'uint32_t' - - if format.layout == PLAIN: - if not format.is_array(): - # For arithmetic pixel formats return the integer type that matches the whole pixel - return 'uint%u_t' % format.block_size() - else: - # For array pixel formats return the integer type that matches the color channel - channel = format.array_element() - if channel.type in (UNSIGNED, VOID): - return 'uint%u_t' % channel.size - elif channel.type in (SIGNED, FIXED): - return 'int%u_t' % channel.size - elif channel.type == FLOAT: - if channel.size == 16: - return 'uint16_t' - elif channel.size == 32: - return 'float' - elif channel.size == 64: - return 'double' - else: - assert False - else: - assert False - else: - assert False - - -def intermediate_native_type(bits, sign): - '''Find a native type adequate to hold intermediate results of the request bit size.''' - - bytes = 4 # don't use anything smaller than 32bits - while bytes * 8 < bits: - bytes *= 2 - bits = bytes*8 - - if sign: - return 'int%u_t' % bits - else: - return 'uint%u_t' % bits - - -def get_one_shift(type): - '''Get the number of the bit that matches unity for this type.''' - if type.type == 'FLOAT': - assert False - if not type.norm: - return 0 - if type.type == UNSIGNED: - return type.size - if type.type == SIGNED: - return type.size - 1 - if type.type == FIXED: - return type.size / 2 - assert False - - -def truncate_mantissa(x, bits): - '''Truncate an integer so it can be represented exactly with a floating - point mantissa''' - - assert isinstance(x, integer_types) - - s = 1 - if x < 0: - s = -1 - x = -x - - # We can represent integers up to mantissa + 1 bits exactly - mask = (1 << (bits + 1)) - 1 - - # Slide the mask until the MSB matches - shift = 0 - while (x >> shift) & ~mask: - shift += 1 - - x &= mask << shift - x *= s - return x - - -def value_to_native(type, value): - '''Get the value of unity for this type.''' - if type.type == FLOAT: - if type.size <= 32 \ - and isinstance(value, integer_types): - return truncate_mantissa(value, 23) - return value - if type.type == FIXED: - return int(value * (1 << (type.size // 2))) - if not type.norm: - return int(value) - if type.type == UNSIGNED: - return int(value * ((1 << type.size) - 1)) - if type.type == SIGNED: - return int(value * ((1 << (type.size - 1)) - 1)) - assert False - - -def native_to_constant(type, value): - '''Get the value of unity for this type.''' - if type.type == FLOAT: - if type.size <= 32: - return "%.1ff" % float(value) - else: - return "%.1f" % float(value) - else: - return str(int(value)) - - -def get_one(type): - '''Get the value of unity for this type.''' - return value_to_native(type, 1) - - -def clamp_expr(src_channel, dst_channel, dst_native_type, value): - '''Generate the expression to clamp the value in the source type to the - destination type range.''' - - if src_channel == dst_channel: - return value - - src_min = src_channel.min() - src_max = src_channel.max() - dst_min = dst_channel.min() - dst_max = dst_channel.max() - - # Translate the destination range to the src native value - dst_min_native = native_to_constant(src_channel, value_to_native(src_channel, dst_min)) - dst_max_native = native_to_constant(src_channel, value_to_native(src_channel, dst_max)) - - if src_min < dst_min and src_max > dst_max: - return 'CLAMP(%s, %s, %s)' % (value, dst_min_native, dst_max_native) - - if src_max > dst_max: - return 'MIN2(%s, %s)' % (value, dst_max_native) - - if src_min < dst_min: - return 'MAX2(%s, %s)' % (value, dst_min_native) - - return value - - -def conversion_expr(src_channel, - dst_channel, dst_native_type, - value, - clamp=True, - src_colorspace = RGB, - dst_colorspace = RGB): - '''Generate the expression to convert a value between two types.''' - - if src_colorspace != dst_colorspace: - if src_colorspace == SRGB: - assert src_channel.type == UNSIGNED - assert src_channel.norm - assert src_channel.size <= 8 - assert src_channel.size >= 4 - assert dst_colorspace == RGB - if src_channel.size < 8: - value = '%s << %x | %s >> %x' % (value, 8 - src_channel.size, value, 2 * src_channel.size - 8) - if dst_channel.type == FLOAT: - return 'util_format_srgb_8unorm_to_linear_float(%s)' % value - else: - assert dst_channel.type == UNSIGNED - assert dst_channel.norm - assert dst_channel.size == 8 - return 'util_format_srgb_to_linear_8unorm(%s)' % value - elif dst_colorspace == SRGB: - assert dst_channel.type == UNSIGNED - assert dst_channel.norm - assert dst_channel.size <= 8 - assert src_colorspace == RGB - if src_channel.type == FLOAT: - value = 'util_format_linear_float_to_srgb_8unorm(%s)' % value - else: - assert src_channel.type == UNSIGNED - assert src_channel.norm - assert src_channel.size == 8 - value = 'util_format_linear_to_srgb_8unorm(%s)' % value - # XXX rounding is all wrong. - if dst_channel.size < 8: - return '%s >> %x' % (value, 8 - dst_channel.size) - else: - return value - elif src_colorspace == ZS: - pass - elif dst_colorspace == ZS: - pass - else: - assert 0 - - if src_channel == dst_channel: - return value - - src_type = src_channel.type - src_size = src_channel.size - src_norm = src_channel.norm - src_pure = src_channel.pure - - # Promote half to float - if src_type == FLOAT and src_size == 16: - value = 'util_half_to_float(%s)' % value - src_size = 32 - - # Special case for float <-> ubytes for more accurate results - # Done before clamping since these functions already take care of that - if src_type == UNSIGNED and src_norm and src_size == 8 and dst_channel.type == FLOAT and dst_channel.size == 32: - return 'ubyte_to_float(%s)' % value - if src_type == FLOAT and src_size == 32 and dst_channel.type == UNSIGNED and dst_channel.norm and dst_channel.size == 8: - return 'float_to_ubyte(%s)' % value - - if clamp: - if dst_channel.type != FLOAT or src_type != FLOAT: - value = clamp_expr(src_channel, dst_channel, dst_native_type, value) - - if src_type in (SIGNED, UNSIGNED) and dst_channel.type in (SIGNED, UNSIGNED): - if not src_norm and not dst_channel.norm: - # neither is normalized -- just cast - return '(%s)%s' % (dst_native_type, value) - - src_one = get_one(src_channel) - dst_one = get_one(dst_channel) - - if src_one > dst_one and src_norm and dst_channel.norm: - # We can just bitshift - src_shift = get_one_shift(src_channel) - dst_shift = get_one_shift(dst_channel) - value = '(%s >> %s)' % (value, src_shift - dst_shift) - else: - # We need to rescale using an intermediate type big enough to hold the multiplication of both - tmp_native_type = intermediate_native_type(src_size + dst_channel.size, src_channel.sign and dst_channel.sign) - value = '((%s)%s)' % (tmp_native_type, value) - value = '(%s * 0x%x / 0x%x)' % (value, dst_one, src_one) - value = '(%s)%s' % (dst_native_type, value) - return value - - # Promote to either float or double - if src_type != FLOAT: - if src_norm or src_type == FIXED: - one = get_one(src_channel) - if src_size <= 23: - value = '(%s * (1.0f/0x%x))' % (value, one) - if dst_channel.size <= 32: - value = '(float)%s' % value - src_size = 32 - else: - # bigger than single precision mantissa, use double - value = '(%s * (1.0/0x%x))' % (value, one) - src_size = 64 - src_norm = False - else: - if src_size <= 23 or dst_channel.size <= 32: - value = '(float)%s' % value - src_size = 32 - else: - # bigger than single precision mantissa, use double - value = '(double)%s' % value - src_size = 64 - src_type = FLOAT - - # Convert double or float to non-float - if dst_channel.type != FLOAT: - if dst_channel.norm or dst_channel.type == FIXED: - dst_one = get_one(dst_channel) - if dst_channel.size <= 23: - value = 'util_iround(%s * 0x%x)' % (value, dst_one) - else: - # bigger than single precision mantissa, use double - value = '(%s * (double)0x%x)' % (value, dst_one) - value = '(%s)%s' % (dst_native_type, value) - else: - # Cast double to float when converting to either half or float - if dst_channel.size <= 32 and src_size > 32: - value = '(float)%s' % value - src_size = 32 - - if dst_channel.size == 16: - value = 'util_float_to_half(%s)' % value - elif dst_channel.size == 64 and src_size < 64: - value = '(double)%s' % value - - return value - - -def generate_unpack_kernel(format, dst_channel, dst_native_type): - - if not is_format_supported(format): - return - - assert format.layout == PLAIN - - src_native_type = native_type(format) - - def unpack_from_bitmask(channels, swizzles): - depth = format.block_size() - print(' uint%u_t value = *(const uint%u_t *)src;' % (depth, depth)) - - # Declare the intermediate variables - for i in range(format.nr_channels()): - src_channel = channels[i] - if src_channel.type == UNSIGNED: - print(' uint%u_t %s;' % (depth, src_channel.name)) - elif src_channel.type == SIGNED: - print(' int%u_t %s;' % (depth, src_channel.name)) - - # Compute the intermediate unshifted values - for i in range(format.nr_channels()): - src_channel = channels[i] - value = 'value' - shift = src_channel.shift - if src_channel.type == UNSIGNED: - if shift: - value = '%s >> %u' % (value, shift) - if shift + src_channel.size < depth: - value = '(%s) & 0x%x' % (value, (1 << src_channel.size) - 1) - elif src_channel.type == SIGNED: - if shift + src_channel.size < depth: - # Align the sign bit - lshift = depth - (shift + src_channel.size) - value = '%s << %u' % (value, lshift) - # Cast to signed - value = '(int%u_t)(%s) ' % (depth, value) - if src_channel.size < depth: - # Align the LSB bit - rshift = depth - src_channel.size - value = '(%s) >> %u' % (value, rshift) - else: - value = None - - if value is not None: - print(' %s = %s;' % (src_channel.name, value)) - - # Convert, swizzle, and store final values - for i in range(4): - swizzle = swizzles[i] - if swizzle < 4: - src_channel = channels[swizzle] - src_colorspace = format.colorspace - if src_colorspace == SRGB and i == 3: - # Alpha channel is linear - src_colorspace = RGB - value = src_channel.name - value = conversion_expr(src_channel, - dst_channel, dst_native_type, - value, - src_colorspace = src_colorspace) - elif swizzle == SWIZZLE_0: - value = '0' - elif swizzle == SWIZZLE_1: - value = get_one(dst_channel) - elif swizzle == SWIZZLE_NONE: - value = '0' - else: - assert False - print(' dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])) - - def unpack_from_union(channels, swizzles): - print(' union util_format_%s pixel;' % format.short_name()) - print(' memcpy(&pixel, src, sizeof pixel);') - - for i in range(4): - swizzle = swizzles[i] - if swizzle < 4: - src_channel = channels[swizzle] - src_colorspace = format.colorspace - if src_colorspace == SRGB and i == 3: - # Alpha channel is linear - src_colorspace = RGB - value = 'pixel.chan.%s' % src_channel.name - value = conversion_expr(src_channel, - dst_channel, dst_native_type, - value, - src_colorspace = src_colorspace) - elif swizzle == SWIZZLE_0: - value = '0' - elif swizzle == SWIZZLE_1: - value = get_one(dst_channel) - elif swizzle == SWIZZLE_NONE: - value = '0' - else: - assert False - print(' dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])) - - if format.is_bitmask(): - print_channels(format, unpack_from_bitmask) - else: - print_channels(format, unpack_from_union) - - -def generate_pack_kernel(format, src_channel, src_native_type): - - if not is_format_supported(format): - return - - dst_native_type = native_type(format) - - assert format.layout == PLAIN - - def pack_into_bitmask(channels, swizzles): - inv_swizzle = inv_swizzles(swizzles) - - depth = format.block_size() - print(' uint%u_t value = 0;' % depth) - - for i in range(4): - dst_channel = channels[i] - shift = dst_channel.shift - if inv_swizzle[i] is not None: - value ='src[%u]' % inv_swizzle[i] - dst_colorspace = format.colorspace - if dst_colorspace == SRGB and inv_swizzle[i] == 3: - # Alpha channel is linear - dst_colorspace = RGB - value = conversion_expr(src_channel, - dst_channel, dst_native_type, - value, - dst_colorspace = dst_colorspace) - if dst_channel.type in (UNSIGNED, SIGNED): - if shift + dst_channel.size < depth: - value = '(%s) & 0x%x' % (value, (1 << dst_channel.size) - 1) - if shift: - value = '(%s) << %u' % (value, shift) - if dst_channel.type == SIGNED: - # Cast to unsigned - value = '(uint%u_t)(%s) ' % (depth, value) - else: - value = None - if value is not None: - print(' value |= %s;' % (value)) - - print(' *(uint%u_t *)dst = value;' % depth) - - def pack_into_union(channels, swizzles): - inv_swizzle = inv_swizzles(swizzles) - - print(' union util_format_%s pixel;' % format.short_name()) - - for i in range(4): - dst_channel = channels[i] - width = dst_channel.size - if inv_swizzle[i] is None: - continue - dst_colorspace = format.colorspace - if dst_colorspace == SRGB and inv_swizzle[i] == 3: - # Alpha channel is linear - dst_colorspace = RGB - value ='src[%u]' % inv_swizzle[i] - value = conversion_expr(src_channel, - dst_channel, dst_native_type, - value, - dst_colorspace = dst_colorspace) - print(' pixel.chan.%s = %s;' % (dst_channel.name, value)) - - print(' memcpy(dst, &pixel, sizeof pixel);') - - if format.is_bitmask(): - print_channels(format, pack_into_bitmask) - else: - print_channels(format, pack_into_union) - - -def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix): - '''Generate the function to unpack pixels from a particular format''' - - name = format.short_name() - - print('static inline void') - print('util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type)) - print('{') - - if is_format_supported(format): - print(' unsigned x, y;') - print(' for(y = 0; y < height; y += %u) {' % (format.block_height,)) - print(' %s *dst = dst_row;' % (dst_native_type)) - print(' const uint8_t *src = src_row;') - print(' for(x = 0; x < width; x += %u) {' % (format.block_width,)) - - generate_unpack_kernel(format, dst_channel, dst_native_type) - - print(' src += %u;' % (format.block_size() / 8,)) - print(' dst += 4;') - print(' }') - print(' src_row += src_stride;') - print(' dst_row += dst_stride/sizeof(*dst_row);') - print(' }') - - print('}') - print() - - -def generate_format_pack(format, src_channel, src_native_type, src_suffix): - '''Generate the function to pack pixels to a particular format''' - - name = format.short_name() - - print('static inline void') - print('util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type)) - print('{') - - if is_format_supported(format): - print(' unsigned x, y;') - print(' for(y = 0; y < height; y += %u) {' % (format.block_height,)) - print(' const %s *src = src_row;' % (src_native_type)) - print(' uint8_t *dst = dst_row;') - print(' for(x = 0; x < width; x += %u) {' % (format.block_width,)) - - generate_pack_kernel(format, src_channel, src_native_type) - - print(' src += 4;') - print(' dst += %u;' % (format.block_size() / 8,)) - print(' }') - print(' dst_row += dst_stride;') - print(' src_row += src_stride/sizeof(*src_row);') - print(' }') - - print('}') - print() - - -def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix): - '''Generate the function to unpack pixels from a particular format''' - - name = format.short_name() - - print('static inline void') - print('util_format_%s_fetch_%s(%s *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)' % (name, dst_suffix, dst_native_type)) - print('{') - - if is_format_supported(format): - generate_unpack_kernel(format, dst_channel, dst_native_type) - - print('}') - print() - - -def is_format_hand_written(format): - return format.layout in ('s3tc', 'rgtc', 'etc', 'bptc', 'astc', 'atc', 'subsampled', 'other') or format.colorspace == ZS - - -def generate(formats): - print() - print('#include "pipe/p_compiler.h"') - print('#include "util/u_math.h"') - print('#include "u_half.h"') - print('#include "u_format.h"') - print('#include "u_format_other.h"') - print('#include "util/format_srgb.h"') - print('#include "u_format_yuv.h"') - print('#include "u_format_zs.h"') - print() - - for format in formats: - if not is_format_hand_written(format): - - if is_format_supported(format): - generate_format_type(format) - - if format.is_pure_unsigned(): - native_type = 'unsigned' - suffix = 'unsigned' - channel = Channel(UNSIGNED, False, True, 32) - - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - generate_format_fetch(format, channel, native_type, suffix) - - channel = Channel(SIGNED, False, True, 32) - native_type = 'int' - suffix = 'signed' - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - elif format.is_pure_signed(): - native_type = 'int' - suffix = 'signed' - channel = Channel(SIGNED, False, True, 32) - - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - generate_format_fetch(format, channel, native_type, suffix) - - native_type = 'unsigned' - suffix = 'unsigned' - channel = Channel(UNSIGNED, False, True, 32) - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - else: - channel = Channel(FLOAT, False, False, 32) - native_type = 'float' - suffix = 'rgba_float' - - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - generate_format_fetch(format, channel, native_type, suffix) - - channel = Channel(UNSIGNED, True, False, 8) - native_type = 'uint8_t' - suffix = 'rgba_8unorm' - - generate_format_unpack(format, channel, native_type, suffix) - generate_format_pack(format, channel, native_type, suffix) - diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_parse.py mesa-20.0.8/src/gallium/auxiliary/util/u_format_parse.py --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_parse.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_parse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,393 +0,0 @@ - -''' -/************************************************************************** - * - * Copyright 2009 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ -''' - - -from __future__ import division - - -VOID, UNSIGNED, SIGNED, FIXED, FLOAT = range(5) - -SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_0, SWIZZLE_1, SWIZZLE_NONE, = range(7) - -PLAIN = 'plain' - -RGB = 'rgb' -SRGB = 'srgb' -YUV = 'yuv' -ZS = 'zs' - - -def is_pot(x): - return (x & (x - 1)) == 0 - - -VERY_LARGE = 99999999999999999999999 - - -class Channel: - '''Describe the channel of a color channel.''' - - def __init__(self, type, norm, pure, size, name = ''): - self.type = type - self.norm = norm - self.pure = pure - self.size = size - self.sign = type in (SIGNED, FIXED, FLOAT) - self.name = name - - def __str__(self): - s = str(self.type) - if self.norm: - s += 'n' - if self.pure: - s += 'p' - s += str(self.size) - return s - - def __eq__(self, other): - if other is None: - return False - - return self.type == other.type and self.norm == other.norm and self.pure == other.pure and self.size == other.size - - def __ne__(self, other): - return not self == other - - def max(self): - '''Maximum representable number.''' - if self.type == FLOAT: - return VERY_LARGE - if self.type == FIXED: - return (1 << (self.size // 2)) - 1 - if self.norm: - return 1 - if self.type == UNSIGNED: - return (1 << self.size) - 1 - if self.type == SIGNED: - return (1 << (self.size - 1)) - 1 - assert False - - def min(self): - '''Minimum representable number.''' - if self.type == FLOAT: - return -VERY_LARGE - if self.type == FIXED: - return -(1 << (self.size // 2)) - if self.type == UNSIGNED: - return 0 - if self.norm: - return -1 - if self.type == SIGNED: - return -(1 << (self.size - 1)) - assert False - - -class Format: - '''Describe a pixel format.''' - - def __init__(self, name, layout, block_width, block_height, le_channels, le_swizzles, be_channels, be_swizzles, colorspace): - self.name = name - self.layout = layout - self.block_width = block_width - self.block_height = block_height - self.le_channels = le_channels - self.le_swizzles = le_swizzles - self.be_channels = be_channels - self.be_swizzles = be_swizzles - self.name = name - self.colorspace = colorspace - - def __str__(self): - return self.name - - def short_name(self): - '''Make up a short norm for a format, suitable to be used as suffix in - function names.''' - - name = self.name - if name.startswith('PIPE_FORMAT_'): - name = name[len('PIPE_FORMAT_'):] - name = name.lower() - return name - - def block_size(self): - size = 0 - for channel in self.le_channels: - size += channel.size - return size - - def nr_channels(self): - nr_channels = 0 - for channel in self.le_channels: - if channel.size: - nr_channels += 1 - return nr_channels - - def array_element(self): - if self.layout != PLAIN: - return None - ref_channel = self.le_channels[0] - if ref_channel.type == VOID: - ref_channel = self.le_channels[1] - for channel in self.le_channels: - if channel.size and (channel.size != ref_channel.size or channel.size % 8): - return None - if channel.type != VOID: - if channel.type != ref_channel.type: - return None - if channel.norm != ref_channel.norm: - return None - if channel.pure != ref_channel.pure: - return None - return ref_channel - - def is_array(self): - return self.array_element() != None - - def is_mixed(self): - if self.layout != PLAIN: - return False - ref_channel = self.le_channels[0] - if ref_channel.type == VOID: - ref_channel = self.le_channels[1] - for channel in self.le_channels[1:]: - if channel.type != VOID: - if channel.type != ref_channel.type: - return True - if channel.norm != ref_channel.norm: - return True - if channel.pure != ref_channel.pure: - return True - return False - - def is_compressed(self): - for channel in self.le_channels: - if channel.type != VOID: - return False - return True - - def is_unorm(self): - # Non-compressed formats all have unorm or srgb in their name. - for keyword in ['_UNORM', '_SRGB']: - if keyword in self.name: - return True - - # All the compressed formats in GLES3.2 and GL4.6 ("Table 8.14: Generic - # and specific compressed internal formats.") that aren't snorm for - # border colors are unorm, other than BPTC_*_FLOAT. - return self.is_compressed() and not ('FLOAT' in self.name or self.is_snorm()) - - def is_snorm(self): - return '_SNORM' in self.name - - def is_pot(self): - return is_pot(self.block_size()) - - def is_int(self): - if self.layout != PLAIN: - return False - for channel in self.le_channels: - if channel.type not in (VOID, UNSIGNED, SIGNED): - return False - return True - - def is_float(self): - if self.layout != PLAIN: - return False - for channel in self.le_channels: - if channel.type not in (VOID, FLOAT): - return False - return True - - def is_bitmask(self): - if self.layout != PLAIN: - return False - if self.block_size() not in (8, 16, 32): - return False - for channel in self.le_channels: - if channel.type not in (VOID, UNSIGNED, SIGNED): - return False - return True - - def is_pure_color(self): - if self.layout != PLAIN or self.colorspace == ZS: - return False - pures = [channel.pure - for channel in self.le_channels - if channel.type != VOID] - for x in pures: - assert x == pures[0] - return pures[0] - - def channel_type(self): - types = [channel.type - for channel in self.le_channels - if channel.type != VOID] - for x in types: - assert x == types[0] - return types[0] - - def is_pure_signed(self): - return self.is_pure_color() and self.channel_type() == SIGNED - - def is_pure_unsigned(self): - return self.is_pure_color() and self.channel_type() == UNSIGNED - - def has_channel(self, id): - return self.le_swizzles[id] != SWIZZLE_NONE - - def has_depth(self): - return self.colorspace == ZS and self.has_channel(0) - - def has_stencil(self): - return self.colorspace == ZS and self.has_channel(1) - - def stride(self): - return self.block_size()/8 - - -_type_parse_map = { - '': VOID, - 'x': VOID, - 'u': UNSIGNED, - 's': SIGNED, - 'h': FIXED, - 'f': FLOAT, -} - -_swizzle_parse_map = { - 'x': SWIZZLE_X, - 'y': SWIZZLE_Y, - 'z': SWIZZLE_Z, - 'w': SWIZZLE_W, - '0': SWIZZLE_0, - '1': SWIZZLE_1, - '_': SWIZZLE_NONE, -} - -def _parse_channels(fields, layout, colorspace, swizzles): - if layout == PLAIN: - names = ['']*4 - if colorspace in (RGB, SRGB): - for i in range(4): - swizzle = swizzles[i] - if swizzle < 4: - names[swizzle] += 'rgba'[i] - elif colorspace == ZS: - for i in range(4): - swizzle = swizzles[i] - if swizzle < 4: - names[swizzle] += 'zs'[i] - else: - assert False - for i in range(4): - if names[i] == '': - names[i] = 'x' - else: - names = ['x', 'y', 'z', 'w'] - - channels = [] - for i in range(0, 4): - field = fields[i] - if field: - type = _type_parse_map[field[0]] - if field[1] == 'n': - norm = True - pure = False - size = int(field[2:]) - elif field[1] == 'p': - pure = True - norm = False - size = int(field[2:]) - else: - norm = False - pure = False - size = int(field[1:]) - else: - type = VOID - norm = False - pure = False - size = 0 - channel = Channel(type, norm, pure, size, names[i]) - channels.append(channel) - - return channels - -def parse(filename): - '''Parse the format description in CSV format in terms of the - Channel and Format classes above.''' - - stream = open(filename) - formats = [] - for line in stream: - try: - comment = line.index('#') - except ValueError: - pass - else: - line = line[:comment] - line = line.strip() - if not line: - continue - - fields = [field.strip() for field in line.split(',')] - if len (fields) == 10: - fields += fields[4:9] - assert len (fields) == 15 - - name = fields[0] - layout = fields[1] - block_width, block_height = map(int, fields[2:4]) - colorspace = fields[9] - - le_swizzles = [_swizzle_parse_map[swizzle] for swizzle in fields[8]] - le_channels = _parse_channels(fields[4:8], layout, colorspace, le_swizzles) - - be_swizzles = [_swizzle_parse_map[swizzle] for swizzle in fields[14]] - be_channels = _parse_channels(fields[10:14], layout, colorspace, be_swizzles) - - le_shift = 0 - for channel in le_channels: - channel.shift = le_shift - le_shift += channel.size - - be_shift = 0 - for channel in be_channels[3::-1]: - channel.shift = be_shift - be_shift += channel.size - - assert le_shift == be_shift - for i in range(4): - assert (le_swizzles[i] != SWIZZLE_NONE) == (be_swizzles[i] != SWIZZLE_NONE) - - format = Format(name, layout, block_width, block_height, le_channels, le_swizzles, be_channels, be_swizzles, colorspace) - formats.append(format) - return formats - diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_rgtc.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_rgtc.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_rgtc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_rgtc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,448 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 2011 Red Hat Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include -#include "u_format.h" -#include "u_format_rgtc.h" -#include "util/u_math.h" -#include "util/rgtc.h" - -void -util_format_rgtc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1); - dst[1] = 0; - dst[2] = 0; - dst[3] = 255; -} - -void -util_format_rgtc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, comps = 4; - unsigned x, y, i, j; - unsigned block_size = 8; - - for(y = 0; y < height; y += bh) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += bw) { - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 1); - dst[1] = 0; - dst[2] = 0; - dst[3] = 255; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rgtc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, - unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 8; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += bw) { - uint8_t tmp[4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp[j][i] = src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]; - } - } - util_format_unsigned_encode_rgtc_ubyte(dst, tmp, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rgtc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 8; - for(y = 0; y < height; y += 4) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - uint8_t tmp_r; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = ubyte_to_float(tmp_r); - dst[1] = 0.0; - dst[2] = 0.0; - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rgtc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 8; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += bw) { - uint8_t tmp[4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]); - } - } - util_format_unsigned_encode_rgtc_ubyte(dst, tmp, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rgtc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp_r; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = ubyte_to_float(tmp_r); - dst[1] = 0.0; - dst[2] = 0.0; - dst[3] = 1.0; -} - -void -util_format_rgtc1_snorm_fetch_rgba_8unorm(UNUSED uint8_t *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc1_snorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc1_snorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 8; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - int8_t *dst = (int8_t *)dst_row; - for(x = 0; x < width; x += bw) { - int8_t tmp[4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]); - } - } - util_format_signed_encode_rgtc_ubyte(dst, tmp, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rgtc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 8; - for(y = 0; y < height; y += 4) { - const int8_t *src = (int8_t *)src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - int8_t tmp_r; - util_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 1); - dst[0] = byte_to_float_tex(tmp_r); - dst[1] = 0.0; - dst[2] = 0.0; - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rgtc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - int8_t tmp_r; - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 1); - dst[0] = byte_to_float_tex(tmp_r); - dst[1] = 0.0; - dst[2] = 0.0; - dst[3] = 1.0; -} - - -void -util_format_rgtc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2); - dst[2] = 0; - dst[3] = 255; -} - -void -util_format_rgtc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, comps = 4; - unsigned x, y, i, j; - unsigned block_size = 16; - - for(y = 0; y < height; y += bh) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += bw) { - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, dst, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, dst + 1, 2); - dst[2] = 0; - dst[3] = 255; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rgtc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 16; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += bw) { - uint8_t tmp_r[4][4]; /* [bh][bw] */ - uint8_t tmp_g[4][4]; /* [bh][bw] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp_r[j][i] = src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]; - tmp_g[j][i] = src_row[((y + j)*src_stride/sizeof(*src_row) + (x + i)*4) + 1]; - } - } - util_format_unsigned_encode_rgtc_ubyte(dst, tmp_r, 4, 4); - util_format_unsigned_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rxtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 16; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += bw) { - uint8_t tmp_r[4][4]; /* [bh][bw][comps] */ - uint8_t tmp_g[4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp_r[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]); - tmp_g[j][i] = float_to_ubyte(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]); - } - } - util_format_unsigned_encode_rgtc_ubyte(dst, tmp_r, 4, 4); - util_format_unsigned_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rgtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rxtc2_unorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1); -} - -void -util_format_rgtc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 16; - for(y = 0; y < height; y += 4) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - uint8_t tmp_r, tmp_g; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = ubyte_to_float(tmp_r); - dst[1] = ubyte_to_float(tmp_g); - dst[2] = 0.0; - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rgtc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp_r, tmp_g; - util_format_unsigned_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_unsigned_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = ubyte_to_float(tmp_r); - dst[1] = ubyte_to_float(tmp_g); - dst[2] = 0.0; - dst[3] = 1.0; -} - - -void -util_format_rgtc2_snorm_fetch_rgba_8unorm(UNUSED uint8_t *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc2_snorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc2_snorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) -{ - fprintf(stderr,"%s\n", __func__); -} - -void -util_format_rgtc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - unsigned x, y, i, j; - int block_size = 16; - for(y = 0; y < height; y += 4) { - const int8_t *src = (int8_t *)src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - int8_t tmp_r, tmp_g; - util_format_signed_fetch_texel_rgtc(0, src, i, j, &tmp_r, 2); - util_format_signed_fetch_texel_rgtc(0, src + 8, i, j, &tmp_g, 2); - dst[0] = byte_to_float_tex(tmp_r); - dst[1] = byte_to_float_tex(tmp_g); - dst[2] = 0.0; - dst[3] = 1.0; - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_rxtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off) -{ - const unsigned bw = 4, bh = 4, bytes_per_block = 16; - unsigned x, y, i, j; - - for(y = 0; y < height; y += bh) { - int8_t *dst = (int8_t *)dst_row; - for(x = 0; x < width; x += bw) { - int8_t tmp_r[4][4]; /* [bh][bw][comps] */ - int8_t tmp_g[4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - tmp_r[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4]); - tmp_g[j][i] = float_to_byte_tex(src_row[(y + j)*src_stride/sizeof(*src_row) + (x + i)*4 + chan2off]); - } - } - util_format_signed_encode_rgtc_ubyte(dst, tmp_r, 4, 4); - util_format_signed_encode_rgtc_ubyte(dst + 8, tmp_g, 4, 4); - dst += bytes_per_block; - } - dst_row += dst_stride / sizeof(*dst_row); - } -} - -void -util_format_rgtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_rxtc2_snorm_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, width, height, 1); -} - -void -util_format_rgtc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - int8_t tmp_r, tmp_g; - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src, i, j, &tmp_r, 2); - util_format_signed_fetch_texel_rgtc(0, (int8_t *)src + 8, i, j, &tmp_g, 2); - dst[0] = byte_to_float_tex(tmp_r); - dst[1] = byte_to_float_tex(tmp_g); - dst[2] = 0.0; - dst[3] = 1.0; -} - diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_rgtc.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_rgtc.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_rgtc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_rgtc.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,114 +0,0 @@ -/************************************************************************** - * - * Copyright 2011 Red Hat Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - -#ifndef U_FORMAT_RGTC_H_ -#define U_FORMAT_RGTC_H_ - -void -util_format_rgtc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_rgtc1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - - -void -util_format_rgtc1_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_rgtc1_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc1_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -void -util_format_rgtc2_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_rgtc2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rxtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off); - -void -util_format_rgtc2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -void -util_format_rgtc2_snorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_rgtc2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rxtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height, unsigned chan2off); - -void -util_format_rgtc2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_rgtc2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - - -#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_s3tc.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_s3tc.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_s3tc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_s3tc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,674 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. - * Copyright (c) 2008 VMware, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "u_dl.h" -#include "u_format.h" -#include "u_format_s3tc.h" -#include "util/format_srgb.h" -#include "util/u_math.h" -#include "../../../mesa/main/texcompress_s3tc_tmp.h" - - -util_format_dxtn_fetch_t util_format_dxt1_rgb_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgb_dxt1; -util_format_dxtn_fetch_t util_format_dxt1_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt1; -util_format_dxtn_fetch_t util_format_dxt3_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt3; -util_format_dxtn_fetch_t util_format_dxt5_rgba_fetch = (util_format_dxtn_fetch_t)fetch_2d_texel_rgba_dxt5; - -util_format_dxtn_pack_t util_format_dxtn_pack = (util_format_dxtn_pack_t)tx_compress_dxtn; - - -/* - * Pixel fetch. - */ - -void -util_format_dxt1_rgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_dxt1_rgb_fetch(0, src, i, j, dst); -} - -void -util_format_dxt1_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_dxt1_rgba_fetch(0, src, i, j, dst); -} - -void -util_format_dxt3_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_dxt3_rgba_fetch(0, src, i, j, dst); -} - -void -util_format_dxt5_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - util_format_dxt5_rgba_fetch(0, src, i, j, dst); -} - -void -util_format_dxt1_rgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgb_fetch(0, src, i, j, tmp); - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = 1.0; -} - -void -util_format_dxt1_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgba_fetch(0, src, i, j, tmp); - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - -void -util_format_dxt3_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt3_rgba_fetch(0, src, i, j, tmp); - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - -void -util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt5_rgba_fetch(0, src, i, j, tmp); - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - - -/* - * Block decompression. - */ - -static inline void -util_format_dxtn_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height, - util_format_dxtn_fetch_t fetch, - unsigned block_size, boolean srgb) -{ - const unsigned bw = 4, bh = 4, comps = 4; - unsigned x, y, i, j; - for(y = 0; y < height; y += bh) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += bw) { - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*comps; - fetch(0, src, i, j, dst); - if (srgb) { - dst[0] = util_format_srgb_to_linear_8unorm(dst[0]); - dst[1] = util_format_srgb_to_linear_8unorm(dst[1]); - dst[2] = util_format_srgb_to_linear_8unorm(dst[2]); - } - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_dxt1_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgb_fetch, - 8, FALSE); -} - -void -util_format_dxt1_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgba_fetch, - 8, FALSE); -} - -void -util_format_dxt3_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt3_rgba_fetch, - 16, FALSE); -} - -void -util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt5_rgba_fetch, - 16, FALSE); -} - -static inline void -util_format_dxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height, - util_format_dxtn_fetch_t fetch, - unsigned block_size, boolean srgb) -{ - unsigned x, y, i, j; - for(y = 0; y < height; y += 4) { - const uint8_t *src = src_row; - for(x = 0; x < width; x += 4) { - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i)*4; - uint8_t tmp[4]; - fetch(0, src, i, j, tmp); - if (srgb) { - dst[0] = util_format_srgb_8unorm_to_linear_float(tmp[0]); - dst[1] = util_format_srgb_8unorm_to_linear_float(tmp[1]); - dst[2] = util_format_srgb_8unorm_to_linear_float(tmp[2]); - } - else { - dst[0] = ubyte_to_float(tmp[0]); - dst[1] = ubyte_to_float(tmp[1]); - dst[2] = ubyte_to_float(tmp[2]); - } - dst[3] = ubyte_to_float(tmp[3]); - } - } - src += block_size; - } - src_row += src_stride; - } -} - -void -util_format_dxt1_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgb_fetch, - 8, FALSE); -} - -void -util_format_dxt1_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgba_fetch, - 8, FALSE); -} - -void -util_format_dxt3_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt3_rgba_fetch, - 16, FALSE); -} - -void -util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt5_rgba_fetch, - 16, FALSE); -} - - -/* - * Block compression. - */ - -static inline void -util_format_dxtn_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height, - enum util_format_dxtn format, - unsigned block_size, boolean srgb) -{ - const unsigned bw = 4, bh = 4, comps = 4; - unsigned x, y, i, j, k; - for(y = 0; y < height; y += bh) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += bw) { - uint8_t tmp[4][4][4]; /* [bh][bw][comps] */ - for(j = 0; j < bh; ++j) { - for(i = 0; i < bw; ++i) { - uint8_t src_tmp; - for(k = 0; k < 3; ++k) { - src_tmp = src[(y + j)*src_stride/sizeof(*src) + (x+i)*comps + k]; - if (srgb) { - tmp[j][i][k] = util_format_linear_to_srgb_8unorm(src_tmp); - } - else { - tmp[j][i][k] = src_tmp; - } - } - /* for sake of simplicity there's an unneeded 4th component for dxt1_rgb */ - tmp[j][i][3] = src[(y + j)*src_stride/sizeof(*src) + (x+i)*comps + 3]; - } - } - /* even for dxt1_rgb have 4 src comps */ - util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], format, dst, 0); - dst += block_size; - } - dst_row += dst_stride / sizeof(*dst_row); - } - -} - -void -util_format_dxt1_rgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT1_RGB, - 8, FALSE); -} - -void -util_format_dxt1_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT1_RGBA, - 8, FALSE); -} - -void -util_format_dxt3_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT3_RGBA, - 16, FALSE); -} - -void -util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT5_RGBA, - 16, FALSE); -} - -static inline void -util_format_dxtn_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height, - enum util_format_dxtn format, - unsigned block_size, boolean srgb) -{ - unsigned x, y, i, j, k; - for(y = 0; y < height; y += 4) { - uint8_t *dst = dst_row; - for(x = 0; x < width; x += 4) { - uint8_t tmp[4][4][4]; - for(j = 0; j < 4; ++j) { - for(i = 0; i < 4; ++i) { - float src_tmp; - for(k = 0; k < 3; ++k) { - src_tmp = src[(y + j)*src_stride/sizeof(*src) + (x+i)*4 + k]; - if (srgb) { - tmp[j][i][k] = util_format_linear_float_to_srgb_8unorm(src_tmp); - } - else { - tmp[j][i][k] = float_to_ubyte(src_tmp); - } - } - /* for sake of simplicity there's an unneeded 4th component for dxt1_rgb */ - src_tmp = src[(y + j)*src_stride/sizeof(*src) + (x+i)*4 + 3]; - tmp[j][i][3] = float_to_ubyte(src_tmp); - } - } - util_format_dxtn_pack(4, 4, 4, &tmp[0][0][0], format, dst, 0); - dst += block_size; - } - dst_row += 4*dst_stride/sizeof(*dst_row); - } -} - -void -util_format_dxt1_rgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT1_RGB, - 8, FALSE); -} - -void -util_format_dxt1_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT1_RGBA, - 8, FALSE); -} - -void -util_format_dxt3_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT3_RGBA, - 16, FALSE); -} - -void -util_format_dxt5_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src, src_stride, - width, height, UTIL_FORMAT_DXT5_RGBA, - 16, FALSE); -} - - -/* - * SRGB variants. - */ - -void -util_format_dxt1_srgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgb_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_to_linear_8unorm(tmp[0]); - dst[1] = util_format_srgb_to_linear_8unorm(tmp[1]); - dst[2] = util_format_srgb_to_linear_8unorm(tmp[2]); - dst[3] = 255; -} - -void -util_format_dxt1_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_to_linear_8unorm(tmp[0]); - dst[1] = util_format_srgb_to_linear_8unorm(tmp[1]); - dst[2] = util_format_srgb_to_linear_8unorm(tmp[2]); - dst[3] = tmp[3]; -} - -void -util_format_dxt3_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt3_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_to_linear_8unorm(tmp[0]); - dst[1] = util_format_srgb_to_linear_8unorm(tmp[1]); - dst[2] = util_format_srgb_to_linear_8unorm(tmp[2]); - dst[3] = tmp[3]; -} - -void -util_format_dxt5_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt5_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_to_linear_8unorm(tmp[0]); - dst[1] = util_format_srgb_to_linear_8unorm(tmp[1]); - dst[2] = util_format_srgb_to_linear_8unorm(tmp[2]); - dst[3] = tmp[3]; -} - -void -util_format_dxt1_srgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgb_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_8unorm_to_linear_float(tmp[0]); - dst[1] = util_format_srgb_8unorm_to_linear_float(tmp[1]); - dst[2] = util_format_srgb_8unorm_to_linear_float(tmp[2]); - dst[3] = 1.0f; -} - -void -util_format_dxt1_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt1_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_8unorm_to_linear_float(tmp[0]); - dst[1] = util_format_srgb_8unorm_to_linear_float(tmp[1]); - dst[2] = util_format_srgb_8unorm_to_linear_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - -void -util_format_dxt3_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt3_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_8unorm_to_linear_float(tmp[0]); - dst[1] = util_format_srgb_8unorm_to_linear_float(tmp[1]); - dst[2] = util_format_srgb_8unorm_to_linear_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - -void -util_format_dxt5_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j) -{ - uint8_t tmp[4]; - util_format_dxt5_rgba_fetch(0, src, i, j, tmp); - dst[0] = util_format_srgb_8unorm_to_linear_float(tmp[0]); - dst[1] = util_format_srgb_8unorm_to_linear_float(tmp[1]); - dst[2] = util_format_srgb_8unorm_to_linear_float(tmp[2]); - dst[3] = ubyte_to_float(tmp[3]); -} - -void -util_format_dxt1_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgb_fetch, - 8, TRUE); -} - -void -util_format_dxt1_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgba_fetch, - 8, TRUE); -} - -void -util_format_dxt3_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt3_rgba_fetch, - 16, TRUE); -} - -void -util_format_dxt5_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt5_rgba_fetch, - 16, TRUE); -} - -void -util_format_dxt1_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgb_fetch, - 8, TRUE); -} - -void -util_format_dxt1_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt1_rgba_fetch, - 8, TRUE); -} - -void -util_format_dxt3_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt3_rgba_fetch, - 16, TRUE); -} - -void -util_format_dxt5_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_rgb_unpack_rgba_float(dst_row, dst_stride, - src_row, src_stride, - width, height, - util_format_dxt5_rgba_fetch, - 16, TRUE); -} - -void -util_format_dxt1_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT1_RGB, - 8, TRUE); -} - -void -util_format_dxt1_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT1_RGBA, - 8, TRUE); -} - -void -util_format_dxt3_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT3_RGBA, - 16, TRUE); -} - -void -util_format_dxt5_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT5_RGBA, - 16, TRUE); -} - -void -util_format_dxt1_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT1_RGB, - 8, TRUE); -} - -void -util_format_dxt1_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT1_RGBA, - 8, TRUE); -} - -void -util_format_dxt3_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT3_RGBA, - 16, TRUE); -} - -void -util_format_dxt5_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_dxtn_pack_rgba_float(dst_row, dst_stride, src_row, src_stride, - width, height, UTIL_FORMAT_DXT5_RGBA, - 16, TRUE); -} - diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_s3tc.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_s3tc.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_s3tc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_s3tc.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,218 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#ifndef U_FORMAT_S3TC_H_ -#define U_FORMAT_S3TC_H_ - - -#include "pipe/p_compiler.h" - -#ifdef __cplusplus -extern "C" { -#endif - -enum util_format_dxtn { - UTIL_FORMAT_DXT1_RGB = 0x83F0, - UTIL_FORMAT_DXT1_RGBA = 0x83F1, - UTIL_FORMAT_DXT3_RGBA = 0x83F2, - UTIL_FORMAT_DXT5_RGBA = 0x83F3 -}; - - -typedef void -(*util_format_dxtn_fetch_t)( int src_stride, - const uint8_t *src, - int col, int row, - uint8_t *dst ); - -typedef void -(*util_format_dxtn_pack_t)( int src_comps, - int width, int height, - const uint8_t *src, - enum util_format_dxtn dst_format, - uint8_t *dst, - int dst_stride); - -extern util_format_dxtn_fetch_t util_format_dxt1_rgb_fetch; -extern util_format_dxtn_fetch_t util_format_dxt1_rgba_fetch; -extern util_format_dxtn_fetch_t util_format_dxt3_rgba_fetch; -extern util_format_dxtn_fetch_t util_format_dxt5_rgba_fetch; - -extern util_format_dxtn_pack_t util_format_dxtn_pack; - - -void -util_format_dxt1_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt3_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_rgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgb_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt3_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt5_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_srgba_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j); - - -void -util_format_dxt1_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt3_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_rgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgb_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt1_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt1_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt3_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt3_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -void -util_format_dxt5_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_dxt5_srgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j); - -#ifdef __cplusplus -} -#endif - -#endif /* U_FORMAT_S3TC_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_table.py mesa-20.0.8/src/gallium/auxiliary/util/u_format_table.py --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_table.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_table.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,233 +0,0 @@ -from __future__ import print_function - -CopyRight = ''' -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ -''' - - -import sys - -from u_format_parse import * -import u_format_pack - - -def layout_map(layout): - return 'UTIL_FORMAT_LAYOUT_' + str(layout).upper() - - -def colorspace_map(colorspace): - return 'UTIL_FORMAT_COLORSPACE_' + str(colorspace).upper() - - -colorspace_channels_map = { - 'rgb': ['r', 'g', 'b', 'a'], - 'srgb': ['sr', 'sg', 'sb', 'a'], - 'zs': ['z', 's'], - 'yuv': ['y', 'u', 'v'], -} - - -type_map = { - VOID: "UTIL_FORMAT_TYPE_VOID", - UNSIGNED: "UTIL_FORMAT_TYPE_UNSIGNED", - SIGNED: "UTIL_FORMAT_TYPE_SIGNED", - FIXED: "UTIL_FORMAT_TYPE_FIXED", - FLOAT: "UTIL_FORMAT_TYPE_FLOAT", -} - - -def bool_map(value): - if value: - return "TRUE" - else: - return "FALSE" - - -swizzle_map = { - SWIZZLE_X: "PIPE_SWIZZLE_X", - SWIZZLE_Y: "PIPE_SWIZZLE_Y", - SWIZZLE_Z: "PIPE_SWIZZLE_Z", - SWIZZLE_W: "PIPE_SWIZZLE_W", - SWIZZLE_0: "PIPE_SWIZZLE_0", - SWIZZLE_1: "PIPE_SWIZZLE_1", - SWIZZLE_NONE: "PIPE_SWIZZLE_NONE", -} - - -def write_format_table(formats): - print('/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */') - print() - # This will print the copyright message on the top of this file - print(CopyRight.strip()) - print() - print('#include "u_format.h"') - print('#include "u_format_bptc.h"') - print('#include "u_format_s3tc.h"') - print('#include "u_format_rgtc.h"') - print('#include "u_format_latc.h"') - print('#include "u_format_etc.h"') - print() - - u_format_pack.generate(formats) - - def do_channel_array(channels, swizzles): - print(" {") - for i in range(4): - channel = channels[i] - if i < 3: - sep = "," - else: - sep = "" - if channel.size: - print(" {%s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), channel.size, channel.shift, sep, "xyzw"[i], channel.name)) - else: - print(" {0, 0, 0, 0, 0}%s" % (sep,)) - print(" },") - - def do_swizzle_array(channels, swizzles): - print(" {") - for i in range(4): - swizzle = swizzles[i] - if i < 3: - sep = "," - else: - sep = "" - try: - comment = colorspace_channels_map[format.colorspace][i] - except (KeyError, IndexError): - comment = 'ignored' - print(" %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)) - print(" },") - - for format in formats: - print('const struct util_format_description') - print('util_format_%s_description = {' % (format.short_name(),)) - print(" %s," % (format.name,)) - print(" \"%s\"," % (format.name,)) - print(" \"%s\"," % (format.short_name(),)) - print(" {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())) - print(" %s," % (layout_map(format.layout),)) - print(" %u,\t/* nr_channels */" % (format.nr_channels(),)) - print(" %s,\t/* is_array */" % (bool_map(format.is_array()),)) - print(" %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),)) - print(" %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)) - print(" %s,\t/* is_unorm */" % (bool_map(format.is_unorm()),)) - print(" %s,\t/* is_snorm */" % (bool_map(format.is_snorm()),)) - u_format_pack.print_channels(format, do_channel_array) - u_format_pack.print_channels(format, do_swizzle_array) - print(" %s," % (colorspace_map(format.colorspace),)) - access = True - if format.layout == 'astc' or format.layout == 'atc': - access = False - if format.layout == 'etc' and format.short_name() != 'etc1_rgb8': - access = False - if format.colorspace != ZS and not format.is_pure_color() and access: - print(" &util_format_%s_unpack_rgba_8unorm," % format.short_name()) - print(" &util_format_%s_pack_rgba_8unorm," % format.short_name()) - if format.layout == 's3tc' or format.layout == 'rgtc': - print(" &util_format_%s_fetch_rgba_8unorm," % format.short_name()) - else: - print(" NULL, /* fetch_rgba_8unorm */") - print(" &util_format_%s_unpack_rgba_float," % format.short_name()) - print(" &util_format_%s_pack_rgba_float," % format.short_name()) - print(" &util_format_%s_fetch_rgba_float," % format.short_name()) - else: - print(" NULL, /* unpack_rgba_8unorm */") - print(" NULL, /* pack_rgba_8unorm */") - print(" NULL, /* fetch_rgba_8unorm */") - print(" NULL, /* unpack_rgba_float */") - print(" NULL, /* pack_rgba_float */") - print(" NULL, /* fetch_rgba_float */") - if format.has_depth(): - print(" &util_format_%s_unpack_z_32unorm," % format.short_name()) - print(" &util_format_%s_pack_z_32unorm," % format.short_name()) - print(" &util_format_%s_unpack_z_float," % format.short_name()) - print(" &util_format_%s_pack_z_float," % format.short_name()) - else: - print(" NULL, /* unpack_z_32unorm */") - print(" NULL, /* pack_z_32unorm */") - print(" NULL, /* unpack_z_float */") - print(" NULL, /* pack_z_float */") - if format.has_stencil(): - print(" &util_format_%s_unpack_s_8uint," % format.short_name()) - print(" &util_format_%s_pack_s_8uint," % format.short_name()) - else: - print(" NULL, /* unpack_s_8uint */") - print(" NULL, /* pack_s_8uint */") - if format.is_pure_unsigned(): - print(" &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name()) - print(" &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name()) - print(" &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name()) - print(" &util_format_%s_pack_signed, /* pack_rgba_sint */" % format.short_name()) - print(" &util_format_%s_fetch_unsigned, /* fetch_rgba_uint */" % format.short_name()) - print(" NULL /* fetch_rgba_sint */") - elif format.is_pure_signed(): - print(" &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name()) - print(" &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name()) - print(" &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name()) - print(" &util_format_%s_pack_signed, /* pack_rgba_sint */" % format.short_name()) - print(" NULL, /* fetch_rgba_uint */") - print(" &util_format_%s_fetch_signed /* fetch_rgba_sint */" % format.short_name()) - else: - print(" NULL, /* unpack_rgba_uint */") - print(" NULL, /* pack_rgba_uint */") - print(" NULL, /* unpack_rgba_sint */") - print(" NULL, /* pack_rgba_sint */") - print(" NULL, /* fetch_rgba_uint */") - print(" NULL /* fetch_rgba_sint */") - print("};") - print() - - print("const struct util_format_description *") - print("util_format_description(enum pipe_format format)") - print("{") - print(" if (format >= PIPE_FORMAT_COUNT) {") - print(" return NULL;") - print(" }") - print() - print(" switch (format) {") - for format in formats: - print(" case %s:" % format.name) - print(" return &util_format_%s_description;" % (format.short_name(),)) - print(" default:") - print(" return NULL;") - print(" }") - print("}") - print() - - -def main(): - - formats = [] - for arg in sys.argv[1:]: - formats.extend(parse(arg)) - write_format_table(formats) - - -if __name__ == '__main__': - main() diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_tests.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_tests.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_tests.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_tests.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1034 +0,0 @@ -/************************************************************************** - * - * Copyright 2009-2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include -#include - -#include "pipe/p_config.h" -#include "util/u_memory.h" -#include "u_format_tests.h" - - -/* - * Helper macros to create the packed bytes for longer words. - */ - -#define PACKED_1x8(x) {x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_2x8(x, y) {x, y, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_3x8(x, y, z) {x, y, z, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_4x8(x, y, z, w) {x, y, z, w, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_8x8(a, b, c, d, e, f, g, h) {a, b, c, d, e, f, g, h, 0, 0, 0, 0, 0, 0, 0, 0} - -#define PACKED_1x16(x) {(x) & 0xff, (x) >> 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_2x16(x, y) {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_3x16(x, y, z) {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8, (z) & 0xff, (z) >> 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_4x16(x, y, z, w) {(x) & 0xff, (x) >> 8, (y) & 0xff, (y) >> 8, (z) & 0xff, (z) >> 8, (w) & 0xff, (w) >> 8, 0, 0, 0, 0, 0, 0, 0, 0} - -#define PACKED_1x32(x) {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_2x32(x, y) {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24, 0, 0, 0, 0, 0, 0, 0, 0} -#define PACKED_3x32(x, y, z) {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24, (z) & 0xff, ((z) >> 8) & 0xff, ((z) >> 16) & 0xff, (z) >> 24, 0, 0, 0, 0} -#define PACKED_4x32(x, y, z, w) {(x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24, (y) & 0xff, ((y) >> 8) & 0xff, ((y) >> 16) & 0xff, (y) >> 24, (z) & 0xff, ((z) >> 8) & 0xff, ((z) >> 16) & 0xff, (z) >> 24, (w) & 0xff, ((w) >> 8) & 0xff, ((w) >> 16) & 0xff, (w) >> 24} - -#define UNPACKED_1x1(r, g, b, a) \ - {{{r, g, b, a}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}} - -#define UNPACKED_2x1(r0, g0, b0, a0, r1, g1, b1, a1) \ - {{{r0, g0, b0, a0}, {r1, g1, b1, a1}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{ 0, 0, 0, 0}, { 0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{ 0, 0, 0, 0}, { 0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, \ - {{ 0, 0, 0, 0}, { 0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}} - - -/** - * Test cases. - * - * These were manually entered. We could generate these - * - * To keep this to a we cover only the corner cases, which should produce - * good enough coverage since that pixel format transformations are afine for - * non SRGB formats. - */ -const struct util_format_test_case -util_format_test_cases[] = -{ - - /* - * 32-bit rendertarget formats - */ - - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8A8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R10G10B10X2_UNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_UNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_UNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_UNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_UNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3fffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B10G10R10A2_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - /* - * 16-bit rendertarget formats - */ - - {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x03e0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x7c00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G5R5X1_UNORM, PACKED_1x16(0x7fff), PACKED_1x16(0x7fff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x03e0), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x8000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G5R5A1_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_X1B5G5R5_UNORM, PACKED_1x16(0xfffe), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X1B5G5R5_UNORM, PACKED_1x16(0xfffe), PACKED_1x16(0x003e), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_X1B5G5R5_UNORM, PACKED_1x16(0xfffe), PACKED_1x16(0x07c0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_X1B5G5R5_UNORM, PACKED_1x16(0xfffe), PACKED_1x16(0xf800), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X1B5G5R5_UNORM, PACKED_1x16(0xfffe), PACKED_1x16(0xfffe), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x003e), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x07c0), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xf800), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0001), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_A1B5G5R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x000f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x00f0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0f00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B4G4R4X4_UNORM, PACKED_1x16(0x0fff), PACKED_1x16(0x0fff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x000f), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x00f0), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0f00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xf000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B4G4R4A4_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x001f), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x07e0), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xf800), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B5G6R5_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - /* - * Luminance/intensity/alpha formats - */ - - {PIPE_FORMAT_L8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_I8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_I8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x0f), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)}, - {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xf0), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L4A4_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x00ff), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)}, - {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xff00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L8A8_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_L16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - /* - * SRGB formats - */ - - {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xbc), UNPACKED_1x1(0.502886458033, 0.502886458033, 0.502886458033, 1.0)}, - {PIPE_FORMAT_L8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xbc), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SRGB, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x00bc), UNPACKED_1x1(0.502886458033, 0.502886458033, 0.502886458033, 0.0)}, - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0x00ff), UNPACKED_1x1(1.0, 1.0, 1.0, 0.0)}, - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xcc00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)}, - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xff00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_L8A8_SRGB, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xbc, 0x00, 0x00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xbc, 0x00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xbc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SRGB, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xbc, 0x00, 0x00, 0x00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xbc, 0x00, 0x00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xbc, 0x00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xcc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8A8_SRGB, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000bc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xcc000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8A8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000bc), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_B8G8R8X8_SRGB, PACKED_1x32(0x00ffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000cc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_A8R8G8B8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_X8R8G8B8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000cc), UNPACKED_1x1(0.0, 0.0, 0.0, 0.8)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_A8B8G8R8_SRGB, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000bc00), UNPACKED_1x1(0.0, 0.0, 0.502886458033, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x0000ff00), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00bc0000), UNPACKED_1x1(0.0, 0.502886458033, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0x00ff0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xbc000000), UNPACKED_1x1(0.502886458033, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xff000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_X8B8G8R8_SRGB, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - /* - * Mixed-signed formats - */ - - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x81, 0x00, 0x00, 0x00), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x81, 0x00, 0x00), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8SG8SB8UX8U_NORM, PACKED_4x8(0xff, 0xff, 0xff, 0x00), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000001ff), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000201), UNPACKED_1x1(-1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x0007fc00), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00080400), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x1ff00000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x20100000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R10SG10SB10SA2U_NORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xc0000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x000f), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0011), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x01e0), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0x0220), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R5SG5SB6U_NORM, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x81, 0x00), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x81), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - - /* - * Depth-stencil formats - */ - - {PIPE_FORMAT_S8_UINT, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_S8_UINT, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x3f800000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z24_UNORM_S8_UINT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z24_UNORM_S8_UINT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z24_UNORM_S8_UINT, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z24_UNORM_S8_UINT, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)}, - - {PIPE_FORMAT_S8_UINT_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_S8_UINT_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_S8_UINT_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)}, - {PIPE_FORMAT_S8_UINT_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - - {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - - {PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x3f800000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x000000ff), UNPACKED_1x1( 0.0, 255.0, 0.0, 0.0)}, - - /* - * YUV formats - */ - - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_2x1(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_2x1(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_2x1(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_B8G8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_2x1(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_2x1(1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_2x1(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_G8R8_G8B8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)}, - - /* - * TODO: Exercise the UV channels as well. - */ - {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x10, 0x80, 0x10), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0xeb, 0x80, 0x10), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_UYVY, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x10, 0x80, 0xeb), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x10, 0x80, 0x10, 0x80), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xeb, 0x80, 0x10, 0x80), UNPACKED_2x1(1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_YUYV, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x10, 0x80, 0xeb, 0x80), UNPACKED_2x1(0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0)}, - - /* - * Compressed formats - */ - - { - PIPE_FORMAT_DXT1_RGB, - PACKED_8x8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), - PACKED_8x8(0xf2, 0xd7, 0xb0, 0x20, 0xae, 0x2c, 0x6f, 0x97), - { - { - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}, - {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0}, - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}, - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0} - }, - { - {0xd6/255.0, 0xff/255.0, 0x94/255.0, 0xff/255.0}, - {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0}, - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}, - {0xd6/255.0, 0xff/255.0, 0x94/255.0, 0xff/255.0} - }, - { - {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0}, - {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0}, - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0}, - {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0} - }, - { - {0x5d/255.0, 0x62/255.0, 0x89/255.0, 0xff/255.0}, - {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0}, - {0x21/255.0, 0x14/255.0, 0x84/255.0, 0xff/255.0}, - {0x99/255.0, 0xb0/255.0, 0x8e/255.0, 0xff/255.0} - } - } - }, - { - PIPE_FORMAT_DXT1_RGBA, - PACKED_8x8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), - PACKED_8x8(0xff, 0x2f, 0xa4, 0x72, 0xeb, 0xb2, 0xbd, 0xbe), - { - { - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}, - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0} - }, - { - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}, - {0x29/255.0, 0xff/255.0, 0xff/255.0, 0xff/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0} - }, - { - {0x73/255.0, 0x55/255.0, 0x21/255.0, 0xff/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0} - }, - { - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x00/255.0, 0x00/255.0, 0x00/255.0, 0x00/255.0}, - {0x4e/255.0, 0xaa/255.0, 0x90/255.0, 0xff/255.0} - } - } - }, - { - PIPE_FORMAT_DXT3_RGBA, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xe7, 0x4a, 0x8f, 0x96, 0x5b, 0xc1, 0x1c, 0x84, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a}, - { - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x77/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xee/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xaa/255.0}, - {0x8c/255.0, 0xff/255.0, 0xb5/255.0, 0x44/255.0} - }, - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xff/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x88/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x66/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x99/255.0} - }, - { - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0xbb/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x55/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x11/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xcc/255.0} - }, - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xcc/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x11/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x44/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x88/255.0} - } - } - }, - { - PIPE_FORMAT_DXT5_RGBA, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xf8, 0x11, 0xc5, 0x0c, 0x9a, 0x73, 0xb4, 0x9c, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a}, - { - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x74/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xf8/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xb6/255.0}, - {0x8c/255.0, 0xff/255.0, 0xb5/255.0, 0x53/255.0} - }, - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xf8/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x95/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x53/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x95/255.0} - }, - { - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0xb6/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x53/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x11/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xd7/255.0} - }, - { - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0xb6/255.0}, - {0x6d/255.0, 0xc6/255.0, 0x96/255.0, 0x11/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x32/255.0}, - {0x31/255.0, 0x55/255.0, 0x5a/255.0, 0x95/255.0} - } - } - }, - - - /* - * Standard 8-bit integer formats - */ - - {PIPE_FORMAT_R8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_UNORM, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0xff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_UNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_UNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8A8_UNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(255.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0x00), UNPACKED_1x1(255.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0xff), UNPACKED_1x1( 0.0, 255.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_USCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0xff, 0xff), UNPACKED_1x1(255.0, 255.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0x00, 0x00), UNPACKED_1x1(255.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0xff, 0x00), UNPACKED_1x1( 0.0, 255.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0xff), UNPACKED_1x1( 0.0, 0.0, 255.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_USCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0xff, 0xff, 0xff), UNPACKED_1x1(255.0, 255.0, 255.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0x00, 0x00, 0x00), UNPACKED_1x1(255.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0xff, 0x00, 0x00), UNPACKED_1x1( 0.0, 255.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0xff, 0x00), UNPACKED_1x1( 0.0, 0.0, 255.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0xff), UNPACKED_1x1( 0.0, 0.0, 0.0, 255.0)}, - {PIPE_FORMAT_R8G8B8A8_USCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0xff, 0xff, 0xff, 0xff), UNPACKED_1x1(255.0, 255.0, 255.0, 255.0)}, - - {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x7f), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SNORM, PACKED_1x8(0xff), PACKED_1x8(0x81), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x81, 0x00), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x81), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x7f, 0x00, 0x00), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x81, 0x00, 0x00), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x81, 0x00), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SNORM, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x81), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x81, 0x00, 0x00, 0x00), UNPACKED_1x1(-1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x81, 0x00, 0x00), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x81, 0x00), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8A8_SNORM, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x81), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - - {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x7f), UNPACKED_1x1( 127.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8_SSCALED, PACKED_1x8(0xff), PACKED_1x8(0x80), UNPACKED_1x1(-128.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x7f, 0x00), UNPACKED_1x1( 127.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x80, 0x00), UNPACKED_1x1(-128.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x7f), UNPACKED_1x1( 0.0, 127.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8_SSCALED, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x80), UNPACKED_1x1( 0.0, -128.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x7f, 0x00, 0x00), UNPACKED_1x1( 127.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x80, 0x00, 0x00), UNPACKED_1x1(-128.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0, 127.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x80, 0x00), UNPACKED_1x1( 0.0, -128.0, 0.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0, 0.0, 127.0, 1.0)}, - {PIPE_FORMAT_R8G8B8_SSCALED, PACKED_3x8(0xff, 0xff, 0xff), PACKED_3x8(0x00, 0x00, 0x80), UNPACKED_1x1( 0.0, 0.0, -128.0, 1.0)}, - - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x00), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x7f, 0x00, 0x00, 0x00), UNPACKED_1x1( 127.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x80, 0x00, 0x00, 0x00), UNPACKED_1x1(-128.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x7f, 0x00, 0x00), UNPACKED_1x1( 0.0, 127.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x80, 0x00, 0x00), UNPACKED_1x1( 0.0, -128.0, 0.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x7f, 0x00), UNPACKED_1x1( 0.0, 0.0, 127.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x80, 0x00), UNPACKED_1x1( 0.0, 0.0, -128.0, 0.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x7f), UNPACKED_1x1( 0.0, 0.0, 0.0, 127.0)}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, PACKED_4x8(0xff, 0xff, 0xff, 0xff), PACKED_4x8(0x00, 0x00, 0x00, 0x80), UNPACKED_1x1( 0.0, 0.0, 0.0, -128.0)}, - - /* - * Standard 16-bit integer formats - */ - - {PIPE_FORMAT_R16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xffff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_UNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0x0000, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xffff, 0x0000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xffff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_UNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xffff, 0x0000, 0x0000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xffff, 0x0000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xffff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16A16_UNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R16_USCALED, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_USCALED, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(65535.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0x0000), UNPACKED_1x1(65535.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xffff), UNPACKED_1x1( 0.0, 65535.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_USCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0x0000, 0x0000), UNPACKED_1x1(65535.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xffff, 0x0000), UNPACKED_1x1( 0.0, 65535.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xffff), UNPACKED_1x1( 0.0, 0.0, 65535.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_USCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xffff, 0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0, 65535.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(65535.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xffff, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 65535.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xffff, 0x0000), UNPACKED_1x1( 0.0, 0.0, 65535.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xffff), UNPACKED_1x1( 0.0, 0.0, 0.0, 65535.0)}, - {PIPE_FORMAT_R16G16B16A16_USCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), UNPACKED_1x1(65535.0, 65535.0, 65535.0, 65535.0)}, - - {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_SNORM, PACKED_1x16(0xffff), PACKED_1x16(0x8001), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x7fff, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x8001, 0x0000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x7fff), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SNORM, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x8001), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x7fff, 0x0000, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x8001, 0x0000, 0x0000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x7fff, 0x0000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x8001, 0x0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x7fff), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SNORM, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x8001), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x7fff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x8001, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( -1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x7fff, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x8001, 0x0000, 0x0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x7fff, 0x0000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x8001, 0x0000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x7fff), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16A16_SNORM, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x8001), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - - {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1( 32767.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_SSCALED, PACKED_1x16(0xffff), PACKED_1x16(0x8000), UNPACKED_1x1(-32768.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x7fff, 0x0000), UNPACKED_1x1( 32767.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x8000, 0x0000), UNPACKED_1x1(-32768.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x7fff), UNPACKED_1x1( 0.0, 32767.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_SSCALED, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x8000), UNPACKED_1x1( 0.0, -32768.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x7fff, 0x0000, 0x0000), UNPACKED_1x1( 32767.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x8000, 0x0000, 0x0000), UNPACKED_1x1(-32768.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x7fff, 0x0000), UNPACKED_1x1( 0.0, 32767.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x8000, 0x0000), UNPACKED_1x1( 0.0, -32768.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x7fff), UNPACKED_1x1( 0.0, 0.0, 32767.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_SSCALED, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x8000), UNPACKED_1x1( 0.0, 0.0, -32768.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x7fff, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 32767.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x8000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(-32768.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x7fff, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 32767.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x8000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, -32768.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x7fff, 0x0000), UNPACKED_1x1( 0.0, 0.0, 32767.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x8000, 0x0000), UNPACKED_1x1( 0.0, 0.0, -32768.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x7fff), UNPACKED_1x1( 0.0, 0.0, 0.0, 32767.0)}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x8000), UNPACKED_1x1( 0.0, 0.0, 0.0, -32768.0)}, - - /* - * Standard 32-bit integer formats - * - * NOTE: We can't accurately represent integers larger than +/-0x1000000 - * with single precision floats, so that's as far as we test. - */ - - {PIPE_FORMAT_R32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffffffff, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_UNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffffffff, 0x00000000, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xffffffff, 0x00000000), UNPACKED_1x1(0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_UNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffffffff, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xffffffff, 0x00000000, 0x00000000), UNPACKED_1x1(0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xffffffff, 0x00000000), UNPACKED_1x1(0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xffffffff), UNPACKED_1x1(0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32A32_UNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), UNPACKED_1x1(1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R32_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x01000000), UNPACKED_1x1(16777216.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x00000000), UNPACKED_1x1(16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_USCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1(16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 0.0, 16777216.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_USCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0, 16777216.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(16777216.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 16777216.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 16777216.0)}, - {PIPE_FORMAT_R32G32B32A32_USCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x01000000, 0x01000000, 0x01000000), UNPACKED_1x1(16777216.0, 16777216.0, 16777216.0, 16777216.0)}, - - {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x7fffffff), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_SNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x80000001), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x7fffffff, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x80000001, 0x00000000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x7fffffff), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SNORM, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x80000001), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x7fffffff, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x80000001, 0x00000000, 0x00000000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x7fffffff, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x80000001, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x7fffffff), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SNORM, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x80000001), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x7fffffff, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x80000001, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( -1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x7fffffff, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x80000001, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x7fffffff, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x80000001, 0x00000000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x7fffffff), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32A32_SNORM, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x80000001), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - - {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x01000000), UNPACKED_1x1( 16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_SSCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(-16777216.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x01000000, 0x00000000), UNPACKED_1x1( 16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xff000000, 0x00000000), UNPACKED_1x1(-16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_SSCALED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xff000000), UNPACKED_1x1( 0.0, -16777216.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1( 16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xff000000, 0x00000000, 0x00000000), UNPACKED_1x1(-16777216.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xff000000, 0x00000000), UNPACKED_1x1( 0.0, -16777216.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 0.0, 16777216.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_SSCALED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xff000000), UNPACKED_1x1( 0.0, 0.0, -16777216.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x01000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 16777216.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xff000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-16777216.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x01000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 16777216.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xff000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -16777216.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x01000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 16777216.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xff000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, -16777216.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x01000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 16777216.0)}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xff000000), UNPACKED_1x1( 0.0, 0.0, 0.0, -16777216.0)}, - - /* - * Standard 32-bit float formats - */ - - {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x3f800000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0xbf800000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x3f800000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xbf800000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x3f800000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xbf800000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FLOAT, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x3f800000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xbf800000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x3f800000, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xbf800000, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x3f800000), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xbf800000), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FLOAT, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x3f800000, 0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x3f800000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xbf800000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x3f800000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xbf800000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x3f800000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xbf800000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x3f800000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xbf800000), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - /* - * Half float formats - */ - - /* Minimum positive normal */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0400), UNPACKED_1x1( 6.10352E-5, 0.0, 0.0, 1.0)}, - - /* XXX: Now that we disable denormals this test cases fails, except on - * IvyBridge processors which have intrinsics dedicated to half-float - * packing/unpacking. */ -#if 0 - /* Max denormal */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x03FF), UNPACKED_1x1( 6.09756E-5, 0.0, 0.0, 1.0)}, -#endif - - /* Minimum positive denormal */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x0001), UNPACKED_1x1( 5.96046E-8, 0.0, 0.0, 1.0)}, - - /* Min representable value */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfbff), UNPACKED_1x1( -65504.0, 0.0, 0.0, 1.0)}, - - /* Max representable value */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7bff), UNPACKED_1x1( 65504.0, 0.0, 0.0, 1.0)}, - -#if !defined(PIPE_CC_MSVC) - - /* NaNs */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c01), UNPACKED_1x1( NAN, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc01), UNPACKED_1x1( -NAN, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7fff), UNPACKED_1x1( NAN, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1( -NAN, 0.0, 0.0, 1.0)}, - - /* Inf */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x7c00), UNPACKED_1x1( INFINITY, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xfc00), UNPACKED_1x1( -INFINITY, 0.0, 0.0, 1.0)}, - -#endif - - /* Zero, ignore sign */ - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x8000), UNPACKED_1x1( -0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0x7fff), PACKED_1x16(0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0x3c00), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16_FLOAT, PACKED_1x16(0xffff), PACKED_1x16(0xbc00), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x3c00, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0xbc00, 0x0000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0x3c00), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x0000, 0xbc00), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16_FLOAT, PACKED_2x16(0xffff, 0xffff), PACKED_2x16(0x3c00, 0x3c00), UNPACKED_1x1( 1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x3c00, 0x0000, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0xbc00, 0x0000, 0x0000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x3c00, 0x0000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0xbc00, 0x0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0x3c00), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x0000, 0x0000, 0xbc00), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - {PIPE_FORMAT_R16G16B16_FLOAT, PACKED_3x16(0xffff, 0xffff, 0xffff), PACKED_3x16(0x3c00, 0x3c00, 0x3c00), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x3c00, 0x0000, 0x0000, 0x0000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0xbc00, 0x0000, 0x0000, 0x0000), UNPACKED_1x1(-1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x3c00, 0x0000, 0x0000), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0xbc00, 0x0000, 0x0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x3c00, 0x0000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0xbc00, 0x0000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0x3c00), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x0000, 0x0000, 0x0000, 0xbc00), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - {PIPE_FORMAT_R16G16B16A16_FLOAT, PACKED_4x16(0xffff, 0xffff, 0xffff, 0xffff), PACKED_4x16(0x3c00, 0x3c00, 0x3c00, 0x3c00), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - /* - * 32-bit fixed point formats - */ - - {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00010000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32_FIXED, PACKED_1x32(0xffffffff), PACKED_1x32(0xffff0000), UNPACKED_1x1( -1.0, 0.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00010000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0xffff0000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0x00010000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00000000, 0xffff0000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32_FIXED, PACKED_2x32(0xffffffff, 0xffffffff), PACKED_2x32(0x00010000, 0x00010000), UNPACKED_1x1( 1.0, 1.0, 0.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00010000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0xffff0000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00010000, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0xffff0000, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0x00010000), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00000000, 0x00000000, 0xffff0000), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - {PIPE_FORMAT_R32G32B32_FIXED, PACKED_3x32(0xffffffff, 0xffffffff, 0xffffffff), PACKED_3x32(0x00010000, 0x00010000, 0x00010000), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00010000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1( 1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0xffff0000, 0x00000000, 0x00000000, 0x00000000), UNPACKED_1x1(-1.0, 0.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00010000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, 1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0xffff0000, 0x00000000, 0x00000000), UNPACKED_1x1( 0.0, -1.0, 0.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00010000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, 1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0xffff0000, 0x00000000), UNPACKED_1x1( 0.0, 0.0, -1.0, 0.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0x00010000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xffff0000), UNPACKED_1x1( 0.0, 0.0, 0.0, -1.0)}, - {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00010000, 0x00010000, 0x00010000, 0x00010000), UNPACKED_1x1( 1.0, 1.0, 1.0, 1.0)}, - - /* - * D3D9 specific vertex formats - */ - - {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000003ff), UNPACKED_1x1(1023.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000ffc00), UNPACKED_1x1( 0.0, 1023.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3ff00000), UNPACKED_1x1( 0.0, 0.0, 1023.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_USCALED, PACKED_1x32(0x3fffffff), PACKED_1x32(0x3fffffff), UNPACKED_1x1(1023.0, 1023.0, 1023.0, 1.0)}, - - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000000), UNPACKED_1x1( 0.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x000001ff), UNPACKED_1x1( 1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00000201), UNPACKED_1x1(-1.0, 0.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x0007fc00), UNPACKED_1x1( 0.0, 1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x00080400), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x1ff00000), UNPACKED_1x1( 0.0, 0.0, 1.0, 1.0)}, - {PIPE_FORMAT_R10G10B10X2_SNORM, PACKED_1x32(0x3fffffff), PACKED_1x32(0x20100000), UNPACKED_1x1( 0.0, 0.0, -1.0, 1.0)}, - - /* - * Special formats that not fit anywhere else - */ - -}; - - -const unsigned util_format_nr_test_cases = ARRAY_SIZE(util_format_test_cases); diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_tests.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_tests.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_tests.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_tests.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#ifndef U_FORMAT_TESTS_H_ -#define U_FORMAT_TESTS_H_ - - -#include "pipe/p_compiler.h" -#include "pipe/p_format.h" - - -#define UTIL_FORMAT_MAX_PACKED_BYTES 32 // R64G64B64A64_FLOAT -#define UTIL_FORMAT_MAX_UNPACKED_WIDTH 12 // ASTC 12x12 -#define UTIL_FORMAT_MAX_UNPACKED_HEIGHT 12 - - -/** - * A (packed, unpacked) color pair. - */ -struct util_format_test_case -{ - enum pipe_format format; - - /** - * Mask of the bits that actually meaningful data. Used to mask out the - * "X" channels. - */ - uint8_t mask[UTIL_FORMAT_MAX_PACKED_BYTES]; - - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - - /** - * RGBA. - */ - double unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4]; -}; - - -extern const struct util_format_test_case -util_format_test_cases[]; - - -extern const unsigned util_format_nr_test_cases; - - -#endif /* U_FORMAT_TESTS_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_yuv.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_yuv.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_yuv.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_yuv.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1130 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -/** - * @file - * YUV and RGB subsampled formats conversion. - * - * @author Jose Fonseca - */ - - -#include "util/u_debug.h" -#include "util/u_format_yuv.h" - - -void -util_format_r8g8_b8g8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - float r, g0, g1, b; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - r = ubyte_to_float((value >> 0) & 0xff); - g0 = ubyte_to_float((value >> 8) & 0xff); - b = ubyte_to_float((value >> 16) & 0xff); - g1 = ubyte_to_float((value >> 24) & 0xff); - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - dst += 4; - - dst[0] = r; /* r */ - dst[1] = g1; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - r = ubyte_to_float((value >> 0) & 0xff); - g0 = ubyte_to_float((value >> 8) & 0xff); - b = ubyte_to_float((value >> 16) & 0xff); - g1 = ubyte_to_float((value >> 24) & 0xff); - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r8g8_b8g8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t r, g0, g1, b; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - r = (value >> 0) & 0xff; - g0 = (value >> 8) & 0xff; - b = (value >> 16) & 0xff; - g1 = (value >> 24) & 0xff; - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - dst += 4; - - dst[0] = r; /* r */ - dst[1] = g1; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - r = (value >> 0) & 0xff; - g0 = (value >> 8) & 0xff; - b = (value >> 16) & 0xff; - g1 = (value >> 24) & 0xff; - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_r8g8_b8g8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - float r, g0, g1, b; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - r = 0.5f*(src[0] + src[4]); - g0 = src[1]; - g1 = src[5]; - b = 0.5f*(src[2] + src[6]); - - value = float_to_ubyte(r); - value |= float_to_ubyte(g0) << 8; - value |= float_to_ubyte(b) << 16; - value |= float_to_ubyte(g1) << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - r = src[0]; - g0 = src[1]; - g1 = 0; - b = src[2]; - - value = float_to_ubyte(r); - value |= float_to_ubyte(g0) << 8; - value |= float_to_ubyte(b) << 16; - value |= float_to_ubyte(g1) << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r8g8_b8g8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint32_t r, g0, g1, b; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - r = (src[0] + src[4] + 1) >> 1; - g0 = src[1]; - g1 = src[5]; - b = (src[2] + src[6] + 1) >> 1; - - value = r; - value |= g0 << 8; - value |= b << 16; - value |= g1 << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - r = src[0]; - g0 = src[1]; - g1 = 0; - b = src[2]; - - value = r; - value |= g0 << 8; - value |= b << 16; - value |= g1 << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_r8g8_b8g8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, ASSERTED unsigned j) -{ - assert(i < 2); - assert(j < 1); - - dst[0] = ubyte_to_float(src[0]); /* r */ - dst[1] = ubyte_to_float(src[1 + 2*i]); /* g */ - dst[2] = ubyte_to_float(src[2]); /* b */ - dst[3] = 1.0f; /* a */ -} - - -void -util_format_g8r8_g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - float r, g0, g1, b; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - g0 = ubyte_to_float((value >> 0) & 0xff); - r = ubyte_to_float((value >> 8) & 0xff); - g1 = ubyte_to_float((value >> 16) & 0xff); - b = ubyte_to_float((value >> 24) & 0xff); - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - dst += 4; - - dst[0] = r; /* r */ - dst[1] = g1; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - g0 = ubyte_to_float((value >> 0) & 0xff); - r = ubyte_to_float((value >> 8) & 0xff); - g1 = ubyte_to_float((value >> 16) & 0xff); - b = ubyte_to_float((value >> 24) & 0xff); - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 1.0f; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_g8r8_g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t r, g0, g1, b; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - g0 = (value >> 0) & 0xff; - r = (value >> 8) & 0xff; - g1 = (value >> 16) & 0xff; - b = (value >> 24) & 0xff; - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - dst += 4; - - dst[0] = r; /* r */ - dst[1] = g1; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - g0 = (value >> 0) & 0xff; - r = (value >> 8) & 0xff; - g1 = (value >> 16) & 0xff; - b = (value >> 24) & 0xff; - - dst[0] = r; /* r */ - dst[1] = g0; /* g */ - dst[2] = b; /* b */ - dst[3] = 0xff; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_g8r8_g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - float r, g0, g1, b; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - r = 0.5f*(src[0] + src[4]); - g0 = src[1]; - g1 = src[5]; - b = 0.5f*(src[2] + src[6]); - - value = float_to_ubyte(g0); - value |= float_to_ubyte(r) << 8; - value |= float_to_ubyte(g1) << 16; - value |= float_to_ubyte(b) << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - r = src[0]; - g0 = src[1]; - g1 = 0; - b = src[2]; - - value = float_to_ubyte(g0); - value |= float_to_ubyte(r) << 8; - value |= float_to_ubyte(g1) << 16; - value |= float_to_ubyte(b) << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_g8r8_g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint32_t r, g0, g1, b; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - r = (src[0] + src[4] + 1) >> 1; - g0 = src[1]; - g1 = src[5]; - b = (src[2] + src[6] + 1) >> 1; - - value = g0; - value |= r << 8; - value |= g1 << 16; - value |= b << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - r = src[0]; - g0 = src[1]; - g1 = 0; - b = src[2]; - - value = g0; - value |= r << 8; - value |= g1 << 16; - value |= b << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_g8r8_g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, ASSERTED unsigned j) -{ - assert(i < 2); - assert(j < 1); - - dst[0] = ubyte_to_float(src[1]); /* r */ - dst[1] = ubyte_to_float(src[0 + 2*i]); /* g */ - dst[2] = ubyte_to_float(src[3]); /* b */ - dst[3] = 1.0f; /* a */ -} - - -void -util_format_uyvy_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t y0, y1, u, v; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - u = (value >> 0) & 0xff; - y0 = (value >> 8) & 0xff; - v = (value >> 16) & 0xff; - y1 = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - dst += 4; - - util_format_yuv_to_rgb_float(y1, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - u = (value >> 0) & 0xff; - y0 = (value >> 8) & 0xff; - v = (value >> 16) & 0xff; - y1 = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_uyvy_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t y0, y1, u, v; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - u = (value >> 0) & 0xff; - y0 = (value >> 8) & 0xff; - v = (value >> 16) & 0xff; - y1 = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - dst += 4; - - util_format_yuv_to_rgb_8unorm(y1, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - u = (value >> 0) & 0xff; - y0 = (value >> 8) & 0xff; - v = (value >> 16) & 0xff; - y1 = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_uyvy_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint8_t y0, y1, u, v; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - uint8_t y0, y1, u0, u1, v0, v1, u, v; - - util_format_rgb_float_to_yuv(src[0], src[1], src[2], - &y0, &u0, &v0); - util_format_rgb_float_to_yuv(src[4], src[5], src[6], - &y1, &u1, &v1); - - u = (u0 + u1 + 1) >> 1; - v = (v0 + v1 + 1) >> 1; - - value = u; - value |= y0 << 8; - value |= v << 16; - value |= y1 << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - util_format_rgb_float_to_yuv(src[0], src[1], src[2], - &y0, &u, &v); - y1 = 0; - - value = u; - value |= y0 << 8; - value |= v << 16; - value |= y1 << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_uyvy_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint8_t y0, y1, u, v; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - uint8_t y0, y1, u0, u1, v0, v1, u, v; - - util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2], - &y0, &u0, &v0); - util_format_rgb_8unorm_to_yuv(src[4], src[5], src[6], - &y1, &u1, &v1); - - u = (u0 + u1 + 1) >> 1; - v = (v0 + v1 + 1) >> 1; - - value = u; - value |= y0 << 8; - value |= v << 16; - value |= y1 << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2], - &y0, &u, &v); - y1 = 0; - - value = u; - value |= y0 << 8; - value |= v << 16; - value |= y1 << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_uyvy_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, ASSERTED unsigned j) -{ - uint8_t y, u, v; - - assert(i < 2); - assert(j < 1); - - y = src[1 + i*2]; - u = src[0]; - v = src[2]; - - util_format_yuv_to_rgb_float(y, u, v, &dst[0], &dst[1], &dst[2]); - - dst[3] = 1.0f; -} - - -void -util_format_yuyv_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t y0, y1, u, v; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - y0 = (value >> 0) & 0xff; - u = (value >> 8) & 0xff; - y1 = (value >> 16) & 0xff; - v = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - dst += 4; - - util_format_yuv_to_rgb_float(y1, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - y0 = (value >> 0) & 0xff; - u = (value >> 8) & 0xff; - y1 = (value >> 16) & 0xff; - v = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_float(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 1.0f; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_yuyv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - uint32_t value; - uint8_t y0, y1, u, v; - - for (x = 0; x + 1 < width; x += 2) { - value = util_cpu_to_le32(*src++); - - y0 = (value >> 0) & 0xff; - u = (value >> 8) & 0xff; - y1 = (value >> 16) & 0xff; - v = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - dst += 4; - - util_format_yuv_to_rgb_8unorm(y1, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - dst += 4; - } - - if (x < width) { - value = util_cpu_to_le32(*src); - - y0 = (value >> 0) & 0xff; - u = (value >> 8) & 0xff; - y1 = (value >> 16) & 0xff; - v = (value >> 24) & 0xff; - - util_format_yuv_to_rgb_8unorm(y0, u, v, &dst[0], &dst[1], &dst[2]); - dst[3] = 0xff; /* a */ - } - - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - - -void -util_format_yuyv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint8_t y0, y1, u, v; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - uint8_t y0, y1, u0, u1, v0, v1, u, v; - - util_format_rgb_float_to_yuv(src[0], src[1], src[2], - &y0, &u0, &v0); - util_format_rgb_float_to_yuv(src[4], src[5], src[6], - &y1, &u1, &v1); - - u = (u0 + u1 + 1) >> 1; - v = (v0 + v1 + 1) >> 1; - - value = y0; - value |= u << 8; - value |= y1 << 16; - value |= v << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - util_format_rgb_float_to_yuv(src[0], src[1], src[2], - &y0, &u, &v); - y1 = 0; - - value = y0; - value |= u << 8; - value |= y1 << 16; - value |= v << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_yuyv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - - for (y = 0; y < height; y += 1) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - uint8_t y0, y1, u, v; - uint32_t value; - - for (x = 0; x + 1 < width; x += 2) { - uint8_t y0, y1, u0, u1, v0, v1, u, v; - - util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2], - &y0, &u0, &v0); - util_format_rgb_8unorm_to_yuv(src[4], src[5], src[6], - &y1, &u1, &v1); - - u = (u0 + u1 + 1) >> 1; - v = (v0 + v1 + 1) >> 1; - - value = y0; - value |= u << 8; - value |= y1 << 16; - value |= v << 24; - - *dst++ = util_le32_to_cpu(value); - - src += 8; - } - - if (x < width) { - util_format_rgb_8unorm_to_yuv(src[0], src[1], src[2], - &y0, &u, &v); - y1 = 0; - - value = y0; - value |= u << 8; - value |= y1 << 16; - value |= v << 24; - - *dst = util_le32_to_cpu(value); - } - - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_yuyv_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, ASSERTED unsigned j) -{ - uint8_t y, u, v; - - assert(i < 2); - assert(j < 1); - - y = src[0 + i*2]; - u = src[1]; - v = src[3]; - - util_format_yuv_to_rgb_float(y, u, v, &dst[0], &dst[1], &dst[2]); - - dst[3] = 1.0f; -} - -/* XXX: Stubbed for now */ -void -util_format_yv12_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv12_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv12_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv12_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv12_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_yv16_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv16_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv16_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv16_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_yv16_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_iyuv_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_iyuv_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_iyuv_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_iyuv_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_iyuv_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_nv12_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv12_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv12_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv12_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv12_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_nv21_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv21_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv21_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv21_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_nv21_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_p016_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_p016_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_p016_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_p016_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} -void -util_format_p016_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} - -void -util_format_xyuv_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_xyuv_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_xyuv_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_xyuv_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_xyuv_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_ayuv_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_ayuv_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_ayuv_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_ayuv_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_ayuv_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} -void -util_format_r8g8_r8b8_unorm_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_r8g8_r8b8_unorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_r8g8_r8b8_unorm_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_r8g8_r8b8_unorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_r8g8_r8b8_unorm_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} - -void -util_format_g8r8_b8r8_unorm_unpack_rgba_float(UNUSED float *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_g8r8_b8r8_unorm_unpack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_g8r8_b8r8_unorm_pack_rgba_float(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const float *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_g8r8_b8r8_unorm_pack_rgba_8unorm(UNUSED uint8_t *dst_row, UNUSED unsigned dst_stride, - UNUSED const uint8_t *src_row, UNUSED unsigned src_stride, - UNUSED unsigned width, UNUSED unsigned height) {} - -void -util_format_g8r8_b8r8_unorm_fetch_rgba_float(UNUSED float *dst, UNUSED const uint8_t *src, - UNUSED unsigned i, UNUSED unsigned j) {} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_yuv.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_yuv.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_yuv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_yuv.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,430 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -/** - * @file - * YUV colorspace conversion. - * - * @author Brian Paul - * @author Michal Krol - * @author Jose Fonseca - * - * See also: - * - http://www.fourcc.org/fccyvrgb.php - * - http://msdn.microsoft.com/en-us/library/ms893078 - * - http://en.wikipedia.org/wiki/YUV - */ - - -#ifndef U_FORMAT_YUV_H_ -#define U_FORMAT_YUV_H_ - - -#include "pipe/p_compiler.h" -#include "util/u_math.h" - - -/* - * TODO: Ensure we use consistent and right floating formulas, with enough - * precision in the coefficients. - */ - -static inline void -util_format_rgb_float_to_yuv(float r, float g, float b, - uint8_t *y, uint8_t *u, uint8_t *v) -{ - const float _r = CLAMP(r, 0.0f, 1.0f); - const float _g = CLAMP(g, 0.0f, 1.0f); - const float _b = CLAMP(b, 0.0f, 1.0f); - - const float scale = 255.0f; - - const int _y = scale * ( (0.257f * _r) + (0.504f * _g) + (0.098f * _b)); - const int _u = scale * (-(0.148f * _r) - (0.291f * _g) + (0.439f * _b)); - const int _v = scale * ( (0.439f * _r) - (0.368f * _g) - (0.071f * _b)); - - *y = _y + 16; - *u = _u + 128; - *v = _v + 128; -} - - -static inline void -util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v, - float *r, float *g, float *b) -{ - const int _y = y - 16; - const int _u = u - 128; - const int _v = v - 128; - - const float y_factor = 255.0f / 219.0f; - - const float scale = 1.0f / 255.0f; - - *r = scale * (y_factor * _y + 1.596f * _v); - *g = scale * (y_factor * _y - 0.391f * _u - 0.813f * _v); - *b = scale * (y_factor * _y + 2.018f * _u ); -} - - -static inline void -util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b, - uint8_t *y, uint8_t *u, uint8_t *v) -{ - *y = (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; - *u = (( -38 * r - 74 * g + 112 * b + 128) >> 8) + 128; - *v = (( 112 * r - 94 * g - 18 * b + 128) >> 8) + 128; -} - - -static inline void -util_format_yuv_to_rgb_8unorm(uint8_t y, uint8_t u, uint8_t v, - uint8_t *r, uint8_t *g, uint8_t *b) -{ - const int _y = y - 16; - const int _u = u - 128; - const int _v = v - 128; - - const int _r = (298 * _y + 409 * _v + 128) >> 8; - const int _g = (298 * _y - 100 * _u - 208 * _v + 128) >> 8; - const int _b = (298 * _y + 516 * _u + 128) >> 8; - - *r = CLAMP(_r, 0, 255); - *g = CLAMP(_g, 0, 255); - *b = CLAMP(_b, 0, 255); -} - - - -void -util_format_uyvy_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_uyvy_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_uyvy_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_uyvy_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_uyvy_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_yuyv_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_yuyv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_yuyv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_yuyv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_yuyv_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -/* XXX: Stubbed for now */ -void -util_format_yv12_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv12_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv12_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv12_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv12_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_yv16_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv16_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv16_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv16_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_yv16_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_iyuv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_iyuv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_iyuv_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_iyuv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_iyuv_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_nv12_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv12_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv12_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv12_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv12_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_nv21_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv21_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv21_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv21_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_nv21_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_p016_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_p016_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_p016_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_p016_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); -void -util_format_p016_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_xyuv_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_xyuv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_xyuv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_xyuv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_xyuv_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_ayuv_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_ayuv_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_ayuv_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_ayuv_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_ayuv_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); -void -util_format_r8g8_b8g8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_b8g8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_b8g8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_b8g8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_b8g8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_g8r8_g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_r8g8_r8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_r8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_r8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_r8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_r8g8_r8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -void -util_format_g8r8_b8r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_b8r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_b8r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_b8r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height); - -void -util_format_g8r8_b8r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, - unsigned i, unsigned j); - -#endif /* U_FORMAT_YUV_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_zs.c mesa-20.0.8/src/gallium/auxiliary/util/u_format_zs.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_zs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_zs.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,896 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#include "u_format_zs.h" -#include "util/u_math.h" - - -/* - * z32_unorm conversion functions - */ - -static inline uint16_t -z32_unorm_to_z16_unorm(uint32_t z) -{ - /* z * 0xffff / 0xffffffff */ - return z >> 16; -} - -static inline uint32_t -z16_unorm_to_z32_unorm(uint16_t z) -{ - /* z * 0xffffffff / 0xffff */ - return (z << 16) | z; -} - -static inline uint32_t -z32_unorm_to_z24_unorm(uint32_t z) -{ - /* z * 0xffffff / 0xffffffff */ - return z >> 8; -} - -static inline uint32_t -z24_unorm_to_z32_unorm(uint32_t z) -{ - /* z * 0xffffffff / 0xffffff */ - return (z << 8) | (z >> 16); -} - - -/* - * z32_float conversion functions - */ - -static inline uint16_t -z32_float_to_z16_unorm(float z) -{ - const float scale = 0xffff; - return (uint16_t)(z * scale + 0.5f); -} - -static inline float -z16_unorm_to_z32_float(uint16_t z) -{ - const float scale = 1.0 / 0xffff; - return (float)(z * scale); -} - -static inline uint32_t -z32_float_to_z24_unorm(float z) -{ - const double scale = 0xffffff; - return (uint32_t)(z * scale) & 0xffffff; -} - -static inline float -z24_unorm_to_z32_float(uint32_t z) -{ - const double scale = 1.0 / 0xffffff; - return (float)(z * scale); -} - -static inline uint32_t -z32_float_to_z32_unorm(float z) -{ - const double scale = 0xffffffff; - return (uint32_t)(z * scale); -} - -static inline float -z32_unorm_to_z32_float(uint32_t z) -{ - const double scale = 1.0 / 0xffffffff; - return (float)(z * scale); -} - - -void -util_format_s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z16_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint16_t *src = (const uint16_t *)src_row; - for(x = 0; x < width; ++x) { - uint16_t value = util_cpu_to_le16(*src++); - *dst++ = z16_unorm_to_z32_float(value); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z16_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint16_t *dst = (uint16_t *)dst_row; - for(x = 0; x < width; ++x) { - uint16_t value; - value = z32_float_to_z16_unorm(*src++); - *dst++ = util_le16_to_cpu(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z16_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const uint16_t *src = (const uint16_t *)src_row; - for(x = 0; x < width; ++x) { - uint16_t value = util_cpu_to_le16(*src++); - *dst++ = z16_unorm_to_z32_unorm(value); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z16_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - uint16_t *dst = (uint16_t *)dst_row; - for(x = 0; x < width; ++x) { - uint16_t value; - value = z32_unorm_to_z16_unorm(*src++); - *dst++ = util_le16_to_cpu(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z32_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z32_unorm_to_z32_float(value); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value; - value = z32_float_to_z32_unorm(*src++); - *dst++ = util_le32_to_cpu(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z32_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width * 4); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width * 4); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width * 4); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned y; - for(y = 0; y < height; ++y) { - memcpy(dst_row, src_row, width * 4); - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const float *src = (const float *)src_row; - for(x = 0; x < width; ++x) { - *dst++ = z32_float_to_z32_unorm(*src++); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - float *dst = (float *)dst_row; - for(x = 0; x < width; ++x) { - *dst++ = z32_unorm_to_z32_float(*src++); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24_unorm_s8_uint_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_float(value & 0xffffff); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z24_unorm_s8_uint_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0xff000000; - value |= z32_float_to_z24_unorm(*src++); - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24_unorm_s8_uint_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z24_unorm_s8_uint_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0xff000000; - value |= z32_unorm_to_z24_unorm(*src++); - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24_unorm_s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = value >> 24; - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z24_unorm_s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0x00ffffff; - value |= *src++ << 24; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24_unorm_s8_uint_pack_separate(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *z_src_row, unsigned z_src_stride, - const uint8_t *s_src_row, unsigned s_src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for (y = 0; y < height; ++y) { - const uint32_t *z_src = z_src_row; - const uint8_t *s_src = s_src_row; - uint32_t *dst = (uint32_t *)dst_row; - for (x = 0; x < width; ++x) { - *dst++ = (*z_src++ & 0x00ffffff) | (*s_src++ << 24); - } - dst_row += dst_stride / sizeof(*dst_row); - z_src_row += z_src_stride / sizeof(*z_src_row); - s_src_row += s_src_stride / sizeof(*s_src_row); - } -} - -void -util_format_s8_uint_z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_float(value >> 8); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_s8_uint_z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0x000000ff; - value |= z32_float_to_z24_unorm(*src++) << 8; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_s8_uint_z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_unorm(value >> 8); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_s8_uint_z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0x000000ff; - value |= *src++ & 0xffffff00; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_s8_uint_z24_unorm_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint8_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = value & 0xff; - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_s8_uint_z24_unorm_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint8_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_le32_to_cpu(*dst); - value &= 0xffffff00; - value |= *src++; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24x8_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_float(value & 0xffffff); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z24x8_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value; - value = z32_float_to_z24_unorm(*src++); - *dst++ = util_le32_to_cpu(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z24x8_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z24x8_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value; - value = z32_unorm_to_z24_unorm(*src++); - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_x8z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const uint32_t *src = (uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_float(value >> 8); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_x8z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value; - value = z32_float_to_z24_unorm(*src++) << 8; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_x8z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const uint32_t *src = (const uint32_t *)src_row; - for(x = 0; x < width; ++x) { - uint32_t value = util_cpu_to_le32(*src++); - *dst++ = z24_unorm_to_z32_unorm(value >> 8); - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_x8z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - uint32_t *dst = (uint32_t *)dst_row; - for(x = 0; x < width; ++x) { - uint32_t value; - value = z32_unorm_to_z24_unorm(*src++) << 8; - *dst++ = util_cpu_to_le32(value); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z32_float_s8x24_uint_unpack_z_float(float *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - float *dst = dst_row; - const float *src = (const float *)src_row; - for(x = 0; x < width; ++x) { - *dst = *src; - src += 2; - dst += 1; - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_s8x24_uint_pack_z_float(uint8_t *dst_row, unsigned dst_stride, - const float *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const float *src = src_row; - float *dst = (float *)dst_row; - for(x = 0; x < width; ++x) { - *dst = *src; - src += 1; - dst += 2; - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z32_float_s8x24_uint_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint32_t *dst = dst_row; - const float *src = (const float *)src_row; - for(x = 0; x < width; ++x) { - *dst = z32_float_to_z32_unorm(*src); - src += 2; - dst += 1; - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_s8x24_uint_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, - const uint32_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint32_t *src = src_row; - float *dst = (float *)dst_row; - for(x = 0; x < width; ++x) { - *dst++ = z32_unorm_to_z32_float(*src++); - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - -void -util_format_z32_float_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - uint8_t *dst = dst_row; - const uint8_t *src = src_row + 4; - for(x = 0; x < width; ++x) { - *dst = *src; - src += 8; - dst += 1; - } - src_row += src_stride/sizeof(*src_row); - dst_row += dst_stride/sizeof(*dst_row); - } -} - -void -util_format_z32_float_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - unsigned x, y; - for(y = 0; y < height; ++y) { - const uint8_t *src = src_row; - uint32_t *dst = ((uint32_t *)dst_row) + 1; - for(x = 0; x < width; ++x) { - *dst = util_cpu_to_le32(*src); - src += 1; - dst += 2; - } - dst_row += dst_stride/sizeof(*dst_row); - src_row += src_stride/sizeof(*src_row); - } -} - - -void -util_format_x24s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_z24_unorm_s8_uint_unpack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); -} - -void -util_format_x24s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_z24_unorm_s8_uint_pack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); -} - -void -util_format_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_s8_uint_z24_unorm_unpack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); -} - -void -util_format_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) -{ - util_format_s8_uint_z24_unorm_pack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); -} - -void -util_format_x32_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_z32_float_s8x24_uint_unpack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); - -} - -void -util_format_x32_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, - const uint8_t *src_row, unsigned src_stride, - unsigned width, unsigned height) -{ - util_format_z32_float_s8x24_uint_pack_s_8uint(dst_row, dst_stride, - src_row, src_stride, - width, height); -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_format_zs.h mesa-20.0.8/src/gallium/auxiliary/util/u_format_zs.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_format_zs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_format_zs.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,214 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - **************************************************************************/ - - -#ifndef U_FORMAT_ZS_H_ -#define U_FORMAT_ZS_H_ - - -#include "pipe/p_compiler.h" - - -void -util_format_s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z16_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z16_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z16_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z16_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24_unorm_s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_z24_unorm_s8_uint_pack_separate(uint8_t *dst_row, unsigned dst_stride, const uint32_t *z_src_row, unsigned z_src_stride, const uint8_t *s_src_row, unsigned s_src_stride, unsigned width, unsigned height); - -void -util_format_s8_uint_z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_z24_unorm_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_s8_uint_z24_unorm_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24x8_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24x8_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24x8_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z24x8_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_x8z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_x8z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_x8z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_x8z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_unpack_z_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_pack_z_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride, const uint32_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - - -void -util_format_z32_float_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_x24s8_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_x24s8_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_x32_s8x24_uint_unpack_s_8uint(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); - -void -util_format_x32_s8x24_uint_pack_s_8uint(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); -#endif /* U_FORMAT_ZS_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_gen_mipmap.c mesa-20.0.8/src/gallium/auxiliary/util/u_gen_mipmap.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_gen_mipmap.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_gen_mipmap.c 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@ #include "util/u_gen_mipmap.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_half.h mesa-20.0.8/src/gallium/auxiliary/util/u_half.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_half.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_half.h 2020-06-12 01:21:16.000000000 +0000 @@ -123,7 +123,7 @@ f32.ui |= 0xff << 23; /* Sign */ - f32.ui |= (f16 & 0x8000) << 16; + f32.ui |= (uint32_t)(f16 & 0x8000) << 16; return f32.f; } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_helpers.c mesa-20.0.8/src/gallium/auxiliary/util/u_helpers.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_helpers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_helpers.c 2020-06-12 01:21:16.000000000 +0000 @@ -52,6 +52,8 @@ dst += start_slot; + *enabled_buffers &= ~u_bit_consecutive(start_slot, count); + if (src) { for (i = 0; i < count; i++) { if (src[i].buffer.resource) @@ -66,15 +68,12 @@ /* Copy over the other members of pipe_vertex_buffer. */ memcpy(dst, src, count * sizeof(struct pipe_vertex_buffer)); - *enabled_buffers &= ~(((1ull << count) - 1) << start_slot); *enabled_buffers |= bitmask << start_slot; } else { /* Unreference the buffers. */ for (i = 0; i < count; i++) pipe_vertex_buffer_unreference(&dst[i]); - - *enabled_buffers &= ~(((1ull << count) - 1) << start_slot); } } @@ -145,12 +144,12 @@ util_upload_index_buffer(struct pipe_context *pipe, const struct pipe_draw_info *info, struct pipe_resource **out_buffer, - unsigned *out_offset) + unsigned *out_offset, unsigned alignment) { unsigned start_offset = info->start * info->index_size; u_upload_data(pipe->stream_uploader, start_offset, - info->count * info->index_size, 4, + info->count * info->index_size, alignment, (char*)info->index.user + start_offset, out_offset, out_buffer); u_upload_unmap(pipe->stream_uploader); diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_helpers.h mesa-20.0.8/src/gallium/auxiliary/util/u_helpers.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_helpers.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_helpers.h 2020-06-12 01:21:16.000000000 +0000 @@ -54,7 +54,7 @@ bool util_upload_index_buffer(struct pipe_context *pipe, const struct pipe_draw_info *info, struct pipe_resource **out_buffer, - unsigned *out_offset); + unsigned *out_offset, unsigned alignment); void util_pin_driver_threads_to_random_L3(struct pipe_context *ctx, diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_inlines.h mesa-20.0.8/src/gallium/auxiliary/util/u_inlines.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_inlines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_inlines.h 2020-06-12 01:21:16.000000000 +0000 @@ -107,7 +107,8 @@ { struct pipe_surface *old_dst = *dst; - if (pipe_reference_described(&old_dst->reference, &src->reference, + if (pipe_reference_described(old_dst ? &old_dst->reference : NULL, + src ? &src->reference : NULL, (debug_reference_descriptor) debug_describe_surface)) old_dst->context->surface_destroy(old_dst->context, old_dst); @@ -138,7 +139,8 @@ { struct pipe_resource *old_dst = *dst; - if (pipe_reference_described(&old_dst->reference, &src->reference, + if (pipe_reference_described(old_dst ? &old_dst->reference : NULL, + src ? &src->reference : NULL, (debug_reference_descriptor) debug_describe_resource)) { /* Avoid recursion, which would prevent inlining this function */ @@ -147,7 +149,8 @@ old_dst->screen->resource_destroy(old_dst->screen, old_dst); old_dst = next; - } while (pipe_reference_described(&old_dst->reference, NULL, + } while (pipe_reference_described(old_dst ? &old_dst->reference : NULL, + NULL, (debug_reference_descriptor) debug_describe_resource)); } @@ -185,7 +188,8 @@ { struct pipe_sampler_view *old_dst = *dst; - if (pipe_reference_described(&old_dst->reference, &src->reference, + if (pipe_reference_described(old_dst ? &old_dst->reference : NULL, + src ? &src->reference : NULL, (debug_reference_descriptor) debug_describe_sampler_view)) old_dst->context->sampler_view_destroy(old_dst->context, old_dst); @@ -198,7 +202,8 @@ { struct pipe_stream_output_target *old_dst = *dst; - if (pipe_reference_described(&old_dst->reference, &src->reference, + if (pipe_reference_described(old_dst ? &old_dst->reference : NULL, + src ? &src->reference : NULL, (debug_reference_descriptor)debug_describe_so_target)) old_dst->context->stream_output_target_destroy(old_dst->context, old_dst); *dst = src; diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_live_shader_cache.c mesa-20.0.8/src/gallium/auxiliary/util/u_live_shader_cache.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_live_shader_cache.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_live_shader_cache.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,187 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "util/u_live_shader_cache.h" + +#include "util/u_inlines.h" +#include "tgsi/tgsi_from_mesa.h" +#include "tgsi/tgsi_parse.h" + +#include "compiler/nir/nir_serialize.h" + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/mesa-sha1.h" + +static uint32_t key_hash(const void *key) +{ + /* Take the first dword of SHA1. */ + return *(uint32_t*)key; +} + +static bool key_equals(const void *a, const void *b) +{ + /* Compare SHA1s. */ + return memcmp(a, b, 20) == 0; +} + +void +util_live_shader_cache_init(struct util_live_shader_cache *cache, + void *(*create_shader)(struct pipe_context *, + const struct pipe_shader_state *state), + void (*destroy_shader)(struct pipe_context *, void *)) +{ + simple_mtx_init(&cache->lock, mtx_plain); + cache->hashtable = _mesa_hash_table_create(NULL, key_hash, key_equals); + cache->create_shader = create_shader; + cache->destroy_shader = destroy_shader; +} + +void +util_live_shader_cache_deinit(struct util_live_shader_cache *cache) +{ + if (cache->hashtable) { + /* The hash table should be empty at this point. */ + _mesa_hash_table_destroy(cache->hashtable, NULL); + simple_mtx_destroy(&cache->lock); + } +} + +void * +util_live_shader_cache_get(struct pipe_context *ctx, + struct util_live_shader_cache *cache, + const struct pipe_shader_state *state) +{ + struct blob blob = {0}; + unsigned ir_size; + const void *ir_binary; + enum pipe_shader_type stage; + + /* Get the shader binary and shader stage. */ + if (state->type == PIPE_SHADER_IR_TGSI) { + ir_binary = state->tokens; + ir_size = tgsi_num_tokens(state->tokens) * + sizeof(struct tgsi_token); + stage = tgsi_get_processor_type(state->tokens); + } else if (state->type == PIPE_SHADER_IR_NIR) { + blob_init(&blob); + nir_serialize(&blob, state->ir.nir, true); + ir_binary = blob.data; + ir_size = blob.size; + stage = pipe_shader_type_from_mesa(((nir_shader*)state->ir.nir)->info.stage); + } else { + assert(0); + return NULL; + } + + /* Compute SHA1 of pipe_shader_state. */ + struct mesa_sha1 sha1_ctx; + unsigned char sha1[20]; + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, ir_binary, ir_size); + if ((stage == PIPE_SHADER_VERTEX || + stage == PIPE_SHADER_TESS_EVAL || + stage == PIPE_SHADER_GEOMETRY) && + state->stream_output.num_outputs) { + _mesa_sha1_update(&sha1_ctx, &state->stream_output, + sizeof(state->stream_output)); + } + _mesa_sha1_final(&sha1_ctx, sha1); + + if (ir_binary == blob.data) + blob_finish(&blob); + + /* Find the shader in the live cache. */ + simple_mtx_lock(&cache->lock); + struct hash_entry *entry = _mesa_hash_table_search(cache->hashtable, sha1); + struct util_live_shader *shader = entry ? entry->data : NULL; + + /* Increase the refcount. */ + if (shader) { + pipe_reference(NULL, &shader->reference); + cache->hits++; + } + simple_mtx_unlock(&cache->lock); + + /* Return if the shader already exists. */ + if (shader) { + if (state->type == PIPE_SHADER_IR_NIR) + ralloc_free(state->ir.nir); + return shader; + } + + /* The cache mutex is unlocked to allow multiple create_shader + * invocations to run simultaneously. + */ + shader = (struct util_live_shader*)cache->create_shader(ctx, state); + pipe_reference_init(&shader->reference, 1); + memcpy(shader->sha1, sha1, sizeof(sha1)); + + simple_mtx_lock(&cache->lock); + /* The same shader might have been created in parallel. This is rare. + * If so, keep the one already in cache. + */ + struct hash_entry *entry2 = _mesa_hash_table_search(cache->hashtable, sha1); + struct util_live_shader *shader2 = entry2 ? entry2->data : NULL; + + if (shader2) { + cache->destroy_shader(ctx, shader); + shader = shader2; + /* Increase the refcount. */ + pipe_reference(NULL, &shader->reference); + } else { + _mesa_hash_table_insert(cache->hashtable, shader->sha1, shader); + } + cache->misses++; + simple_mtx_unlock(&cache->lock); + + return shader; +} + +void +util_shader_reference(struct pipe_context *ctx, + struct util_live_shader_cache *cache, + void **dst, void *src) +{ + if (*dst == src) + return; + + struct util_live_shader *dst_shader = (struct util_live_shader*)*dst; + struct util_live_shader *src_shader = (struct util_live_shader*)src; + + simple_mtx_lock(&cache->lock); + bool destroy = pipe_reference(&dst_shader->reference, &src_shader->reference); + if (destroy) { + struct hash_entry *entry = _mesa_hash_table_search(cache->hashtable, + dst_shader->sha1); + assert(entry); + _mesa_hash_table_remove(cache->hashtable, entry); + } + simple_mtx_unlock(&cache->lock); + + if (destroy) + cache->destroy_shader(ctx, dst_shader); + + *dst = src; +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_live_shader_cache.h mesa-20.0.8/src/gallium/auxiliary/util/u_live_shader_cache.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_live_shader_cache.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_live_shader_cache.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,90 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* This deduplicates live shader CSOs, meaning that creating 2 shaders with + * the same IR will return the same CSO. + * + * How to use this: + * + * - create_xx_state should only call util_live_shader_cache_get. + * + * - delete_xx_state should only call util_shader_reference(&shader, NULL). + * This will decrease the reference count. + * + * - Driver shaders must inherit util_live_shader. They don't have to + * initialize it. + * + * - Declare struct util_live_shader_cache in your pipe_screen (no pointer) if + * you support shareable shaders. If not, declare it in your pipe_context. + * + * - Set your create_shader and destroy_shader driver callbacks with + * util_live_shader_cache_init. These are your driver versions of + * create_xx_state and delete_xx_state. There is no distinction between + * vs, tcs, tes, gs, fs. Instead, get the shader type from the IR. + * + * - Call util_live_shader_cache_deinit when you destroy your screen or context. + */ + +#ifndef U_LIVE_SHADER_CACHE_H +#define U_LIVE_SHADER_CACHE_H + +#include "util/simple_mtx.h" +#include "pipe/p_state.h" + +struct util_live_shader_cache { + simple_mtx_t lock; + struct hash_table *hashtable; + + void *(*create_shader)(struct pipe_context *, + const struct pipe_shader_state *state); + void (*destroy_shader)(struct pipe_context *, void *); + + unsigned hits, misses; +}; + +struct util_live_shader { + struct pipe_reference reference; + unsigned char sha1[20]; +}; + +void +util_live_shader_cache_init(struct util_live_shader_cache *cache, + void *(*create_shader)(struct pipe_context *, + const struct pipe_shader_state *state), + void (*destroy_shader)(struct pipe_context *, void *)); + +void +util_live_shader_cache_deinit(struct util_live_shader_cache *cache); + +void * +util_live_shader_cache_get(struct pipe_context *ctx, + struct util_live_shader_cache *cache, + const struct pipe_shader_state *state); + +void +util_shader_reference(struct pipe_context *ctx, + struct util_live_shader_cache *cache, + void **dst, void *src); + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_mm.c mesa-20.0.8/src/gallium/auxiliary/util/u_mm.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_mm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_mm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,299 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 1999 Wittawat Yamwong - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE - * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "pipe/p_compiler.h" -#include "util/u_debug.h" - -#include "util/u_memory.h" -#include "util/u_mm.h" - - -void -u_mmDumpMemInfo(const struct mem_block *heap) -{ - debug_printf("Memory heap %p:\n", (void *) heap); - if (heap == NULL) { - debug_printf(" heap == 0\n"); - } - else { - const struct mem_block *p; - int total_used = 0, total_free = 0; - - for (p = heap->next; p != heap; p = p->next) { - debug_printf(" Offset:%08x, Size:%08x, %c%c\n", p->ofs, p->size, - p->free ? 'F':'.', - p->reserved ? 'R':'.'); - if (p->free) - total_free += p->size; - else - total_used += p->size; - } - - debug_printf("'\nMemory stats: total = %d, used = %d, free = %d\n", - total_used + total_free, total_used, total_free); - debug_printf("\nFree list:\n"); - - for (p = heap->next_free; p != heap; p = p->next_free) { - debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n", p->ofs, p->size, - p->free ? 'F':'.', - p->reserved ? 'R':'.'); - } - - } - debug_printf("End of memory blocks\n"); -} - - -struct mem_block * -u_mmInit(int ofs, int size) -{ - struct mem_block *heap, *block; - - if (size <= 0) - return NULL; - - heap = CALLOC_STRUCT(mem_block); - if (!heap) - return NULL; - - block = CALLOC_STRUCT(mem_block); - if (!block) { - FREE(heap); - return NULL; - } - - heap->next = block; - heap->prev = block; - heap->next_free = block; - heap->prev_free = block; - - block->heap = heap; - block->next = heap; - block->prev = heap; - block->next_free = heap; - block->prev_free = heap; - - block->ofs = ofs; - block->size = size; - block->free = 1; - - return heap; -} - - -static struct mem_block * -SliceBlock(struct mem_block *p, - int startofs, int size, - int reserved, UNUSED int alignment) -{ - struct mem_block *newblock; - - /* break left [p, newblock, p->next], then p = newblock */ - if (startofs > p->ofs) { - newblock = CALLOC_STRUCT(mem_block); - if (!newblock) - return NULL; - newblock->ofs = startofs; - newblock->size = p->size - (startofs - p->ofs); - newblock->free = 1; - newblock->heap = p->heap; - - newblock->next = p->next; - newblock->prev = p; - p->next->prev = newblock; - p->next = newblock; - - newblock->next_free = p->next_free; - newblock->prev_free = p; - p->next_free->prev_free = newblock; - p->next_free = newblock; - - p->size -= newblock->size; - p = newblock; - } - - /* break right, also [p, newblock, p->next] */ - if (size < p->size) { - newblock = CALLOC_STRUCT(mem_block); - if (!newblock) - return NULL; - newblock->ofs = startofs + size; - newblock->size = p->size - size; - newblock->free = 1; - newblock->heap = p->heap; - - newblock->next = p->next; - newblock->prev = p; - p->next->prev = newblock; - p->next = newblock; - - newblock->next_free = p->next_free; - newblock->prev_free = p; - p->next_free->prev_free = newblock; - p->next_free = newblock; - - p->size = size; - } - - /* p = middle block */ - p->free = 0; - - /* Remove p from the free list: - */ - p->next_free->prev_free = p->prev_free; - p->prev_free->next_free = p->next_free; - - p->next_free = 0; - p->prev_free = 0; - - p->reserved = reserved; - return p; -} - - -struct mem_block * -u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch) -{ - struct mem_block *p; - const int mask = (1 << align2)-1; - int startofs = 0; - int endofs; - - assert(size >= 0); - assert(align2 >= 0); - /* Make sure that a byte alignment isn't getting passed for our - * power-of-two alignment arg. - */ - assert(align2 < 32); - - if (!heap || align2 < 0 || size <= 0) - return NULL; - - for (p = heap->next_free; p != heap; p = p->next_free) { - assert(p->free); - - startofs = (p->ofs + mask) & ~mask; - if ( startofs < startSearch ) { - startofs = startSearch; - } - endofs = startofs+size; - if (endofs <= (p->ofs+p->size)) - break; - } - - if (p == heap) - return NULL; - - assert(p->free); - p = SliceBlock(p,startofs,size,0,mask+1); - - return p; -} - - -struct mem_block * -u_mmFindBlock(struct mem_block *heap, int start) -{ - struct mem_block *p; - - for (p = heap->next; p != heap; p = p->next) { - if (p->ofs == start) - return p; - } - - return NULL; -} - - -static inline int -Join2Blocks(struct mem_block *p) -{ - /* XXX there should be some assertions here */ - - /* NOTE: heap->free == 0 */ - - if (p->free && p->next->free) { - struct mem_block *q = p->next; - - assert(p->ofs + p->size == q->ofs); - p->size += q->size; - - p->next = q->next; - q->next->prev = p; - - q->next_free->prev_free = q->prev_free; - q->prev_free->next_free = q->next_free; - - FREE(q); - return 1; - } - return 0; -} - -int -u_mmFreeMem(struct mem_block *b) -{ - if (!b) - return 0; - - if (b->free) { - debug_printf("block already free\n"); - return -1; - } - if (b->reserved) { - debug_printf("block is reserved\n"); - return -1; - } - - b->free = 1; - b->next_free = b->heap->next_free; - b->prev_free = b->heap; - b->next_free->prev_free = b; - b->prev_free->next_free = b; - - Join2Blocks(b); - if (b->prev != b->heap) - Join2Blocks(b->prev); - - return 0; -} - - -void -u_mmDestroy(struct mem_block *heap) -{ - struct mem_block *p; - - if (!heap) - return; - - for (p = heap->next; p != heap; ) { - struct mem_block *next = p->next; - FREE(p); - p = next; - } - - FREE(heap); -} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_mm.h mesa-20.0.8/src/gallium/auxiliary/util/u_mm.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_mm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_mm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -/************************************************************************** - * - * Copyright (C) 1999 Wittawat Yamwong - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE - * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * @file - * Memory manager code. Primarily used by device drivers to manage texture - * heaps, etc. - */ - - -#ifndef _U_MM_H_ -#define _U_MM_H_ - - -struct mem_block { - struct mem_block *next, *prev; - struct mem_block *next_free, *prev_free; - struct mem_block *heap; - int ofs,size; - unsigned int free:1; - unsigned int reserved:1; -}; - - - -/** - * input: total size in bytes - * return: a heap pointer if OK, NULL if error - */ -extern struct mem_block *u_mmInit(int ofs, int size); - -/** - * Allocate 'size' bytes with 2^align2 bytes alignment, - * restrict the search to free memory after 'startSearch' - * depth and back buffers should be in different 4mb banks - * to get better page hits if possible - * input: size = size of block - * align2 = 2^align2 bytes alignment - * startSearch = linear offset from start of heap to begin search - * return: pointer to the allocated block, 0 if error - */ -extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, - int startSearch); - -/** - * Free block starts at offset - * input: pointer to a block - * return: 0 if OK, -1 if error - */ -extern int u_mmFreeMem(struct mem_block *b); - -/** - * Free block starts at offset - * input: pointer to a heap, start offset - * return: pointer to a block - */ -extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start); - -/** - * destroy MM - */ -extern void u_mmDestroy(struct mem_block *mmInit); - -/** - * For debugging purposes. - */ -extern void u_mmDumpMemInfo(const struct mem_block *mmInit); - -#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_network.c mesa-20.0.8/src/gallium/auxiliary/util/u_network.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_network.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_network.c 2020-06-12 01:21:16.000000000 +0000 @@ -167,7 +167,7 @@ return -1; } - listen(s, 0); + listen(s, 1); return s; #else diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_pack_color.h mesa-20.0.8/src/gallium/auxiliary/util/u_pack_color.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_pack_color.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_pack_color.h 2020-06-12 01:21:16.000000000 +0000 @@ -38,7 +38,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "util/u_debug_gallium.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_pstipple.c mesa-20.0.8/src/gallium/auxiliary/util/u_pstipple.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_pstipple.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_pstipple.c 2020-06-12 01:21:16.000000000 +0000 @@ -45,7 +45,7 @@ #include "pipe/p_shader_tokens.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_pstipple.h" #include "util/u_sampler.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_pwr8.h mesa-20.0.8/src/gallium/auxiliary/util/u_pwr8.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_pwr8.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_pwr8.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,7 +32,7 @@ #ifndef U_PWR8_H_ #define U_PWR8_H_ -#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN #define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16))) @@ -53,7 +53,7 @@ { __m128i_union vdst; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN vdst.i[0] = i0; vdst.i[1] = i1; vdst.i[2] = i2; @@ -78,7 +78,7 @@ vec_unpacklo_epi32 (__m128i even, __m128i odd) { static const __m128i perm_mask = -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; #else {24, 25, 26, 27, 8, 9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15}; @@ -91,7 +91,7 @@ vec_unpackhi_epi32 (__m128i even, __m128i odd) { static const __m128i perm_mask = -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; #else {16, 17, 18, 19, 0, 1, 2, 3, 20, 21, 22, 23, 4, 5, 6, 7}; @@ -104,7 +104,7 @@ vec_unpacklo_epi64 (__m128i even, __m128i odd) { static const __m128i perm_mask = -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; #else {24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15}; @@ -117,7 +117,7 @@ vec_unpackhi_epi64 (__m128i even, __m128i odd) { static const __m128i perm_mask = -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; #else {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7}; @@ -236,7 +236,7 @@ { __m128i_union vsrc; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN vsrc.m128ui = *((vector unsigned int *) src); @@ -280,7 +280,7 @@ vtemp.m128i = vec_vgbbd(vsrc); -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN result = vtemp.ub[15] << 8 | vtemp.ub[7]; #else result = vtemp.ub[0] << 8 | vtemp.ub[8]; @@ -292,7 +292,7 @@ static inline __m128i vec_packs_epi16 (__m128i a, __m128i b) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN return (__m128i) vec_packs ((vector signed short) a, (vector signed short) b); #else @@ -304,13 +304,13 @@ static inline __m128i vec_packs_epi32 (__m128i a, __m128i b) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b); #else return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a); #endif } -#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ +#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */ #endif /* U_PWR8_H_ */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_range.h mesa-20.0.8/src/gallium/auxiliary/util/u_range.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_range.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_range.h 2020-06-12 01:21:16.000000000 +0000 @@ -35,15 +35,16 @@ #define U_RANGE_H #include "os/os_thread.h" - +#include "pipe/p_state.h" #include "util/u_math.h" +#include "util/simple_mtx.h" struct util_range { unsigned start; /* inclusive */ unsigned end; /* exclusive */ /* for the range to be consistent with multiple contexts: */ - mtx_t write_mutex; + simple_mtx_t write_mutex; }; @@ -56,13 +57,19 @@ /* This is like a union of two sets. */ static inline void -util_range_add(struct util_range *range, unsigned start, unsigned end) +util_range_add(struct pipe_resource *resource, struct util_range *range, + unsigned start, unsigned end) { if (start < range->start || end > range->end) { - mtx_lock(&range->write_mutex); - range->start = MIN2(start, range->start); - range->end = MAX2(end, range->end); - mtx_unlock(&range->write_mutex); + if (resource->flags & PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE) { + range->start = MIN2(start, range->start); + range->end = MAX2(end, range->end); + } else { + simple_mtx_lock(&range->write_mutex); + range->start = MIN2(start, range->start); + range->end = MAX2(end, range->end); + simple_mtx_unlock(&range->write_mutex); + } } } @@ -79,14 +86,14 @@ static inline void util_range_init(struct util_range *range) { - (void) mtx_init(&range->write_mutex, mtx_plain); + (void) simple_mtx_init(&range->write_mutex, mtx_plain); util_range_set_empty(range); } static inline void util_range_destroy(struct util_range *range) { - mtx_destroy(&range->write_mutex); + simple_mtx_destroy(&range->write_mutex); } #endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_resource.c mesa-20.0.8/src/gallium/auxiliary/util/u_resource.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_resource.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,7 +26,7 @@ #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_resource.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_sampler.c mesa-20.0.8/src/gallium/auxiliary/util/u_sampler.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_sampler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_sampler.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,7 +26,7 @@ **************************************************************************/ -#include "u_format.h" +#include "util/format/u_format.h" #include "u_sampler.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_screen.c mesa-20.0.8/src/gallium/auxiliary/util/u_screen.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_screen.c 2020-06-12 01:21:16.000000000 +0000 @@ -296,6 +296,10 @@ */ return 1; + case PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF: + /* Don't unset this unless your driver can do better */ + return 1; + case PIPE_CAP_POST_DEPTH_COVERAGE: case PIPE_CAP_BINDLESS_TEXTURE: case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: @@ -337,6 +341,7 @@ case PIPE_CAP_ATOMIC_FLOAT_MINMAX: case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: case PIPE_CAP_TGSI_ATOMINC_WRAP: + case PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: @@ -366,12 +371,19 @@ case PIPE_CAP_COMPUTE_SHADER_DERIVATIVES: return 0; - case PIPE_CAP_MAX_FRAMES_IN_FLIGHT: + case PIPE_CAP_THROTTLE: return 1; case PIPE_CAP_TEXTURE_SHADOW_LOD: return 0; + case PIPE_CAP_GL_SPIRV: + case PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS: + return 0; + + case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION: + return 0; + case PIPE_CAP_DMABUF: #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) return 1; @@ -379,6 +391,23 @@ return 0; #endif + case PIPE_CAP_TEXTURE_SHADOW_MAP: /* Enables ARB_shadow */ + return 1; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_ALPHA_TEST: + case PIPE_CAP_POINT_SIZE_FIXED: + case PIPE_CAP_TWO_SIDED_COLOR: + case PIPE_CAP_CLIP_PLANES: + return 1; + + case PIPE_CAP_MAX_VERTEX_BUFFERS: + return 16; + + case PIPE_CAP_OPENCL_INTEGER_FUNCTIONS: + case PIPE_CAP_INTEGER_MULTIPLY_32X16: + return 0; + default: unreachable("bad PIPE_CAP_*"); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_simple_shaders.c mesa-20.0.8/src/gallium/auxiliary/util/u_simple_shaders.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_simple_shaders.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_simple_shaders.c 2020-06-12 01:21:16.000000000 +0000 @@ -145,7 +145,7 @@ "MOV OUT[2].x, SV[0].xxxx\n" "END\n"; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { assert(0); @@ -189,7 +189,7 @@ "EMIT IMM[0].xxxx\n" "END\n"; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { assert(0); @@ -458,7 +458,7 @@ char text[sizeof(shader_templ)+100]; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; sprintf(text, shader_templ, write_all_cbufs ? "PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1\n" : "", @@ -551,7 +551,7 @@ const char *type = tgsi_texture_names[tgsi_tex]; char text[sizeof(shader_templ)+100]; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA || tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA); @@ -669,7 +669,7 @@ const char *type = tgsi_texture_names[tgsi_tex]; char text[sizeof(shader_templ)+100]; struct tgsi_token tokens[1000]; - struct pipe_shader_state state; + struct pipe_shader_state state = {0}; assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA || tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA); diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_split_draw.c mesa-20.0.8/src/gallium/auxiliary/util/u_split_draw.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_split_draw.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_split_draw.c 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,67 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "pipe/p_defines.h" +#include "util/u_debug.h" +#include "util/u_split_draw.h" + +bool +u_split_draw(const struct pipe_draw_info *info, uint32_t max_verts, + uint32_t *count, uint32_t *step) +{ + if (*count <= max_verts) { + *step = *count; + return false; + } + + switch (info->mode) { + case PIPE_PRIM_POINTS: + *count = *step = max_verts; + break; + case PIPE_PRIM_LINES: + *count = *step = max_verts - (max_verts % 2); + break; + case PIPE_PRIM_LINE_STRIP: + *count = max_verts; + *step = max_verts - 1; + break; + case PIPE_PRIM_LINE_LOOP: + *count = max_verts; + *step = max_verts - 1; + debug_warn_once("unhandled line loop " + "looping behavior with " + ">max vert count\n"); + break; + case PIPE_PRIM_TRIANGLES: + *count = *step = max_verts - (max_verts % 3); + break; + case PIPE_PRIM_TRIANGLE_STRIP: + *count = max_verts; + *step = max_verts - 2; + break; + default: + debug_warn_once("unhandled primitive " + "max vert count, truncating\n"); + *count = *step = max_verts; + } + + return true; +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_split_draw.h mesa-20.0.8/src/gallium/auxiliary/util/u_split_draw.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_split_draw.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_split_draw.h 2020-06-12 01:21:16.000000000 +0000 @@ -0,0 +1,48 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef U_SPLIT_DRAW_H +#define U_SPLIT_DRAW_H + +#include "pipe/p_state.h" + +/** + * For non-indexed drawing, this function helps work around hardware + * limits on the number of verts in a single draw. + * + * For the given mode of primitive from info, calculate the count and + * step in the buffer so the draw can be split into multiple draws. + * + * \param info pointer to the original pipe_draw_info from draw_vbo + * \param max_verts max number of vertices that can be handled by the hardware + * \param count number of vertices remaining in the draw call. It is also + * used as a return parameter, containing how many vertices + * should be sent in the next job to the hardware. + * \param step return parameter, will contain how many vertices should be + * skipped from the original count on the next call to this + * function (may differ from count if the primitive mode + * requires the last vertices to be reused in the next draw) + */ +bool +u_split_draw(const struct pipe_draw_info *info, uint32_t max_verts, + uint32_t *count, uint32_t *step); + +#endif diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_surface.c mesa-20.0.8/src/gallium/auxiliary/util/u_surface.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_surface.c 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@ #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_rect.h" #include "util/u_surface.h" @@ -57,59 +57,6 @@ } -/** - * Copy 2D rect from one place to another. - * Position and sizes are in pixels. - * src_stride may be negative to do vertical flip of pixels from source. - */ -void -util_copy_rect(ubyte * dst, - enum pipe_format format, - unsigned dst_stride, - unsigned dst_x, - unsigned dst_y, - unsigned width, - unsigned height, - const ubyte * src, - int src_stride, - unsigned src_x, - unsigned src_y) -{ - unsigned i; - int src_stride_pos = src_stride < 0 ? -src_stride : src_stride; - int blocksize = util_format_get_blocksize(format); - int blockwidth = util_format_get_blockwidth(format); - int blockheight = util_format_get_blockheight(format); - - assert(blocksize > 0); - assert(blockwidth > 0); - assert(blockheight > 0); - - dst_x /= blockwidth; - dst_y /= blockheight; - width = (width + blockwidth - 1)/blockwidth; - height = (height + blockheight - 1)/blockheight; - src_x /= blockwidth; - src_y /= blockheight; - - dst += dst_x * blocksize; - src += src_x * blocksize; - dst += dst_y * dst_stride; - src += src_y * src_stride_pos; - width *= blocksize; - - if (width == dst_stride && width == (unsigned)src_stride) - memcpy(dst, src, height * width); - else { - for (i = 0; i < height; i++) { - memcpy(dst, src, width); - dst += dst_stride; - src += src_stride; - } - } -} - - /** * Copy 3D box from one place to another. * Position and sizes are in pixels. diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_surface.h mesa-20.0.8/src/gallium/auxiliary/util/u_surface.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_surface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_surface.h 2020-06-12 01:21:16.000000000 +0000 @@ -45,12 +45,6 @@ const struct pipe_resource *texture); extern void -util_copy_rect(ubyte * dst, enum pipe_format format, - unsigned dst_stride, unsigned dst_x, unsigned dst_y, - unsigned width, unsigned height, const ubyte * src, - int src_stride, unsigned src_x, unsigned src_y); - -extern void util_copy_box(ubyte * dst, enum pipe_format format, unsigned dst_stride, unsigned dst_slice_stride, diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_tests.c mesa-20.0.8/src/gallium/auxiliary/util/u_tests.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_tests.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_tests.c 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_tests.h" #include "util/u_draw_quad.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_simple_shaders.h" @@ -38,6 +38,7 @@ #include "tgsi/tgsi_strings.h" #include "tgsi/tgsi_text.h" #include "cso_cache/cso_context.h" +#include "state_tracker/winsys_handle.h" #include #define TOLERANCE 0.01 @@ -858,6 +859,162 @@ util_report_result(pass); } +#define NV12_WIDTH 2560 +#define NV12_HEIGHT 1440 + +static bool +nv12_validate_resource_fields(struct pipe_resource *tex) +{ + return tex->format == util_format_get_plane_format(PIPE_FORMAT_NV12, 0) && + tex->width0 == NV12_WIDTH && + tex->height0 == NV12_HEIGHT && + tex->last_level == 0 && + tex->usage == PIPE_USAGE_DEFAULT && + tex->next && + tex->next->format == util_format_get_plane_format(PIPE_FORMAT_NV12, 1) && + tex->next->width0 == tex->width0 / 2 && + tex->next->height0 == tex->height0 / 2 && + tex->next->usage == tex->usage; +} + +/* This test enforces the behavior of NV12 allocation and exports. */ +static void +test_nv12(struct pipe_screen *screen) +{ + struct pipe_resource *tex = util_create_texture2d(screen, NV12_WIDTH, NV12_HEIGHT, + PIPE_FORMAT_NV12, 1); + + if (!tex) { + printf("resource_create failed\n"); + util_report_result(false); + return; + } + + if (!nv12_validate_resource_fields(tex)) { + printf("incorrect pipe_resource fields\n"); + util_report_result(false); + return; + } + + /* resource_get_param */ + if (screen->resource_get_param) { + struct { + uint64_t handle, dmabuf, offset, stride, planes; + } handle[3]; + + /* Export */ + for (unsigned i = 0; i < 3; i++) { + struct pipe_resource *res = i == 2 ? tex->next : tex; + unsigned plane = i == 2 ? 0 : i; + + if (!screen->resource_get_param(screen, NULL, res, plane, 0, + PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS, + 0, &handle[i].handle)) { + printf("resource_get_param failed\n"); + util_report_result(false); + goto cleanup; + } + + if (!screen->resource_get_param(screen, NULL, res, plane, 0, + PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD, + 0, &handle[i].dmabuf)) { + printf("resource_get_param failed\n"); + util_report_result(false); + goto cleanup; + } + + if (!screen->resource_get_param(screen, NULL, res, plane, 0, + PIPE_RESOURCE_PARAM_OFFSET, + 0, &handle[i].offset)) { + printf("resource_get_param failed\n"); + util_report_result(false); + goto cleanup; + } + + if (!screen->resource_get_param(screen, NULL, res, plane, 0, + PIPE_RESOURCE_PARAM_STRIDE, + 0, &handle[i].stride)) { + printf("resource_get_param failed\n"); + util_report_result(false); + goto cleanup; + } + + if (!screen->resource_get_param(screen, NULL, res, plane, 0, + PIPE_RESOURCE_PARAM_NPLANES, + 0, &handle[i].planes)) { + printf("resource_get_param failed\n"); + util_report_result(false); + goto cleanup; + } + } + + /* Validate export. */ + bool get_param_pass = /* Sanity checking */ + handle[0].handle && handle[1].handle && handle[2].handle && + handle[0].dmabuf && handle[1].dmabuf && handle[2].dmabuf && + handle[0].stride && handle[1].stride && handle[2].stride && + handle[0].planes == 2 && + handle[1].planes == 2 && + handle[2].planes == 2 && + /* Different planes */ + handle[0].handle == handle[1].handle && + handle[0].offset != handle[1].offset && + /* Same planes. */ + handle[1].handle == handle[2].handle && + handle[1].stride == handle[2].stride && + handle[1].offset == handle[2].offset; + + if (!get_param_pass) { + printf("resource_get_param returned incorrect values\n"); + util_report_result(false); + goto cleanup; + } + } + + /* resource_get_handle */ + struct winsys_handle handle[4] = {{0}}; + + /* Export */ + for (unsigned i = 0; i < 4; i++) { + handle[i].type = i < 2 ? WINSYS_HANDLE_TYPE_KMS : WINSYS_HANDLE_TYPE_FD; + handle[i].plane = i % 2; + + if (!screen->resource_get_handle(screen, NULL, tex, &handle[i], 0)) { + printf("resource_get_handle failed\n"); + util_report_result(false); + goto cleanup; + } + } + + /* Validate export. */ + bool get_handle_pass = /* Sanity checking */ + handle[0].handle && handle[1].handle && + handle[0].stride && handle[1].stride && + handle[2].handle && handle[3].handle && + handle[2].stride && handle[3].stride && + /* KMS - different planes */ + handle[0].handle == handle[1].handle && + handle[0].offset != handle[1].offset && + /* DMABUF - different planes */ + handle[2].offset != handle[3].offset && + /* KMS and DMABUF equivalence */ + handle[0].offset == handle[2].offset && + handle[1].offset == handle[3].offset && + handle[0].stride == handle[2].stride && + handle[1].stride == handle[3].stride; + + if (!get_handle_pass) { + printf("resource_get_handle returned incorrect values\n"); + util_report_result(false); + goto cleanup; + } + + util_report_result(true); + +cleanup: + pipe_resource_reference(&tex, NULL); +} + /** * Run all tests. This should be run with a clean context after * context_create. @@ -884,6 +1041,8 @@ test_compute_clear_image(ctx); ctx->destroy(ctx); + test_nv12(screen); + puts("Done. Exiting.."); exit(0); } diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_threaded_context.c mesa-20.0.8/src/gallium/auxiliary/util/u_threaded_context.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_threaded_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_threaded_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,7 +26,7 @@ #include "util/u_threaded_context.h" #include "util/u_cpu_detect.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" @@ -116,7 +116,7 @@ } util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute, - NULL); + NULL, 0); tc->last = tc->next; tc->next = (tc->next + 1) % TC_MAX_BATCHES; } @@ -352,7 +352,7 @@ struct threaded_query *tq = threaded_query(payload->query); if (tq->head_unflushed.next) - LIST_DEL(&tq->head_unflushed); + list_del(&tq->head_unflushed); pipe->destroy_query(pipe, payload->query); } @@ -393,7 +393,7 @@ struct threaded_query *tq = threaded_query(p->query); if (!tq->head_unflushed.next) - LIST_ADD(&tq->head_unflushed, &p->tc->unflushed_queries); + list_add(&tq->head_unflushed, &p->tc->unflushed_queries); pipe->end_query(pipe, p->query); } @@ -432,7 +432,7 @@ tq->flushed = true; if (tq->head_unflushed.next) { /* This is safe because it can only happen after we sync'd. */ - LIST_DEL(&tq->head_unflushed); + list_del(&tq->head_unflushed); } } return success; @@ -879,7 +879,8 @@ struct threaded_resource *tres = threaded_resource(images[i].resource); - util_range_add(&tres->valid_buffer_range, images[i].u.buf.offset, + util_range_add(&tres->b, &tres->valid_buffer_range, + images[i].u.buf.offset, images[i].u.buf.offset + images[i].u.buf.size); } } @@ -945,7 +946,8 @@ if (src->buffer) { struct threaded_resource *tres = threaded_resource(src->buffer); - util_range_add(&tres->valid_buffer_range, src->buffer_offset, + util_range_add(&tres->b, &tres->valid_buffer_range, + src->buffer_offset, src->buffer_offset + src->buffer_size); } } @@ -1135,7 +1137,7 @@ struct pipe_stream_output_target *view; tc_sync(threaded_context(_pipe)); - util_range_add(&tres->valid_buffer_range, buffer_offset, + util_range_add(&tres->b, &tres->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); view = pipe->create_stream_output_target(pipe, res, buffer_offset, @@ -1538,7 +1540,8 @@ ttrans->staging, 0, &src_box); } - util_range_add(tres->base_valid_buffer_range, box->x, box->x + box->width); + util_range_add(&tres->b, tres->base_valid_buffer_range, + box->x, box->x + box->width); } static void @@ -1658,7 +1661,7 @@ return; } - util_range_add(&tres->valid_buffer_range, offset, offset + size); + util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size); /* The upload is small. Enqueue it. */ struct tc_buffer_subdata *p = @@ -1962,7 +1965,7 @@ { struct threaded_query *tq, *tmp; LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) { - LIST_DEL(&tq->head_unflushed); + list_del(&tq->head_unflushed); /* Memory release semantics: due to a possible race with * tc_get_query_result, we must ensure that the linked list changes @@ -2185,7 +2188,8 @@ p->src_box = *src_box; if (dst->target == PIPE_BUFFER) - util_range_add(&tdst->valid_buffer_range, dstx, dstx + src_box->width); + util_range_add(&tdst->b, &tdst->valid_buffer_range, + dstx, dstx + src_box->width); } static void @@ -2401,7 +2405,7 @@ memcpy(p->clear_value, clear_value, clear_value_size); p->clear_value_size = clear_value_size; - util_range_add(&tres->valid_buffer_range, offset, offset + size); + util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size); } struct tc_clear_texture { @@ -2623,7 +2627,7 @@ util_queue_fence_init(&tc->batch_slots[i].fence); } - LIST_INITHEAD(&tc->unflushed_queries); + list_inithead(&tc->unflushed_queries); slab_create_child(&tc->pool_transfers, parent_transfer_pool); diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_tile.c mesa-20.0.8/src/gallium/auxiliary/util/u_tile.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_tile.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_tile.c 2020-06-12 01:21:16.000000000 +0000 @@ -34,8 +34,8 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" -#include "util/u_format_bptc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_bptc.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_surface.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_transfer_helper.c mesa-20.0.8/src/gallium/auxiliary/util/u_transfer_helper.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_transfer_helper.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_transfer_helper.c 2020-06-12 01:21:16.000000000 +0000 @@ -24,9 +24,9 @@ #include "pipe/p_screen.h" #include "util/u_box.h" -#include "util/u_format.h" -#include "util/u_format_rgtc.h" -#include "util/u_format_zs.h" +#include "util/format/u_format.h" +#include "util/format/u_format_rgtc.h" +#include "util/format/u_format_zs.h" #include "util/u_inlines.h" #include "util/u_transfer_helper.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_upload_mgr.c mesa-20.0.8/src/gallium/auxiliary/util/u_upload_mgr.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_upload_mgr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_upload_mgr.c 2020-06-12 01:21:16.000000000 +0000 @@ -202,7 +202,7 @@ buffer.format = PIPE_FORMAT_R8_UNORM; /* want TYPELESS or similar */ buffer.bind = upload->bind; buffer.usage = upload->usage; - buffer.flags = upload->flags; + buffer.flags = upload->flags | PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE; buffer.width0 = size; buffer.height0 = 1; buffer.depth0 = 1; diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_vbuf.c mesa-20.0.8/src/gallium/auxiliary/util/u_vbuf.c --- mesa-19.2.8/src/gallium/auxiliary/util/u_vbuf.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_vbuf.c 2020-06-12 01:21:16.000000000 +0000 @@ -88,9 +88,10 @@ #include "util/u_vbuf.h" #include "util/u_dump.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" +#include "util/u_screen.h" #include "util/u_upload_mgr.h" #include "translate/translate.h" #include "translate/translate_cache.h" @@ -180,6 +181,8 @@ uint32_t incompatible_vb_mask; /* each bit describes a corresp. buffer */ /* Which buffer has a non-zero stride. */ uint32_t nonzero_stride_vb_mask; /* each bit describes a corresp. buffer */ + /* Which buffers are allowed (supported by hardware). */ + uint32_t allowed_vb_mask; }; static void * @@ -252,11 +255,11 @@ { PIPE_FORMAT_R8G8B8A8_SSCALED, PIPE_FORMAT_R32G32B32A32_FLOAT }, }; -boolean u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps, - unsigned flags) +void u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps) { unsigned i; - boolean fallback = FALSE; + + memset(caps, 0, sizeof(*caps)); /* I'd rather have a bitfield of which formats are supported and a static * table of the translations indexed by format, but since we don't have C99 @@ -272,7 +275,7 @@ if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0, 0, PIPE_BIND_VERTEX_BUFFER)) { caps->format_translation[format] = vbuf_format_fallbacks[i].to; - fallback = TRUE; + caps->fallback_always = true; } } @@ -287,15 +290,20 @@ PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY); caps->user_vertex_buffers = screen->get_param(screen, PIPE_CAP_USER_VERTEX_BUFFERS); + caps->max_vertex_buffers = + screen->get_param(screen, PIPE_CAP_MAX_VERTEX_BUFFERS); + + /* OpenGL 2.0 requires a minimum of 16 vertex buffers */ + if (caps->max_vertex_buffers < 16) + caps->fallback_always = true; if (!caps->buffer_offset_unaligned || !caps->buffer_stride_unaligned || - !caps->velem_src_offset_unaligned || - (!(flags & U_VBUF_FLAG_NO_USER_VBOS) && !caps->user_vertex_buffers)) { - fallback = TRUE; - } + !caps->velem_src_offset_unaligned) + caps->fallback_always = true; - return fallback; + if (!caps->fallback_always && !caps->user_vertex_buffers) + caps->fallback_only_for_user_vbuffers = true; } struct u_vbuf * @@ -308,6 +316,7 @@ mgr->cso_cache = cso_cache_create(); mgr->translate_cache = translate_cache_create(); memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs)); + mgr->allowed_vb_mask = u_bit_consecutive(0, mgr->caps.max_vertex_buffers); mgr->has_signed_vb_offset = pipe->screen->get_param(pipe->screen, @@ -364,6 +373,11 @@ mgr->ve = u_vbuf_set_vertex_elements_internal(mgr, count, states); } +void u_vbuf_unset_vertex_elements(struct u_vbuf *mgr) +{ + mgr->ve = NULL; +} + void u_vbuf_destroy(struct u_vbuf *mgr) { struct pipe_screen *screen = mgr->pipe->screen; @@ -536,16 +550,24 @@ uint32_t unused_vb_mask = mgr->ve->incompatible_vb_mask_all | mgr->incompatible_vb_mask | ~mgr->enabled_vb_mask; + uint32_t unused_vb_mask_orig; + boolean insufficient_buffers = false; + + /* No vertex buffers available at all */ + if (!unused_vb_mask) + return FALSE; memset(fallback_vbs, ~0, sizeof(fallback_vbs)); /* Find free slots for each type if needed. */ + unused_vb_mask_orig = unused_vb_mask; for (type = 0; type < VB_NUM; type++) { if (mask[type]) { uint32_t index; if (!unused_vb_mask) { - return FALSE; + insufficient_buffers = true; + break; } index = ffs(unused_vb_mask) - 1; @@ -555,6 +577,17 @@ } } + if (insufficient_buffers) { + /* not enough vbs for all types supported by the hardware, they will have to share one + * buffer */ + uint32_t index = ffs(unused_vb_mask_orig) - 1; + /* When sharing one vertex buffer use per-vertex frequency for everything. */ + fallback_vbs[VB_VERTEX] = index; + mask[VB_VERTEX] = mask[VB_VERTEX] | mask[VB_CONST] | mask[VB_INSTANCE]; + mask[VB_CONST] = 0; + mask[VB_INSTANCE] = 0; + } + for (type = 0; type < VB_NUM; type++) { if (mask[type]) { mgr->dirty_real_vb_mask |= 1 << fallback_vbs[type]; @@ -783,6 +816,17 @@ } } + if (used_buffers & ~mgr->allowed_vb_mask) { + /* More vertex buffers are used than the hardware supports. In + * principle, we only need to make sure that less vertex buffers are + * used, and mark some of the latter vertex buffers as incompatible. + * For now, mark all vertex buffers as incompatible. + */ + ve->incompatible_vb_mask_any = used_buffers; + ve->compatible_vb_mask_any = 0; + ve->incompatible_elem_mask = u_bit_consecutive(0, count); + } + ve->used_vb_mask = used_buffers; ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers; ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers; @@ -795,8 +839,12 @@ } } - ve->driver_cso = - pipe->create_vertex_elements_state(pipe, count, driver_attribs); + /* Only create driver CSO if no incompatible elements */ + if (!ve->incompatible_elem_mask) { + ve->driver_cso = + pipe->create_vertex_elements_state(pipe, count, driver_attribs); + } + return ve; } @@ -805,7 +853,8 @@ struct pipe_context *pipe = mgr->pipe; struct u_vbuf_elements *ve = cso; - pipe->delete_vertex_elements_state(pipe, ve->driver_cso); + if (ve->driver_cso) + pipe->delete_vertex_elements_state(pipe, ve->driver_cso); FREE(ve); } @@ -1030,12 +1079,17 @@ const void *indices, unsigned *out_min_index, unsigned *out_max_index) { - unsigned max = 0; - unsigned min = ~0u; + if (!info->count) { + *out_min_index = 0; + *out_max_index = 0; + return; + } switch (info->index_size) { case 4: { const unsigned *ui_indices = (const unsigned*)indices; + unsigned max = 0; + unsigned min = ~0u; if (info->primitive_restart) { for (unsigned i = 0; i < info->count; i++) { if (ui_indices[i] != info->restart_index) { @@ -1050,10 +1104,14 @@ if (ui_indices[i] < min) min = ui_indices[i]; } } + *out_min_index = min; + *out_max_index = max; break; } case 2: { const unsigned short *us_indices = (const unsigned short*)indices; + unsigned short max = 0; + unsigned short min = ~((unsigned short)0); if (info->primitive_restart) { for (unsigned i = 0; i < info->count; i++) { if (us_indices[i] != info->restart_index) { @@ -1068,10 +1126,14 @@ if (us_indices[i] < min) min = us_indices[i]; } } + *out_min_index = min; + *out_max_index = max; break; } case 1: { const unsigned char *ub_indices = (const unsigned char*)indices; + unsigned char max = 0; + unsigned char min = ~((unsigned char)0); if (info->primitive_restart) { for (unsigned i = 0; i < info->count; i++) { if (ub_indices[i] != info->restart_index) { @@ -1086,14 +1148,13 @@ if (ub_indices[i] < min) min = ub_indices[i]; } } + *out_min_index = min; + *out_max_index = max; break; } default: assert(0); } - - *out_min_index = min; - *out_max_index = max; } void u_vbuf_get_minmax_index(struct pipe_context *pipe, diff -Nru mesa-19.2.8/src/gallium/auxiliary/util/u_vbuf.h mesa-20.0.8/src/gallium/auxiliary/util/u_vbuf.h --- mesa-19.2.8/src/gallium/auxiliary/util/u_vbuf.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/util/u_vbuf.h 2020-06-12 01:21:16.000000000 +0000 @@ -40,8 +40,6 @@ struct cso_context; struct u_vbuf; -#define U_VBUF_FLAG_NO_USER_VBOS (1 << 0) - /* Hardware vertex fetcher limitations can be described by this structure. */ struct u_vbuf_caps { enum pipe_format format_translation[PIPE_FORMAT_COUNT]; @@ -54,11 +52,16 @@ /* Whether the driver supports user vertex buffers. */ unsigned user_vertex_buffers:1; + + /* Maximum number of vertex buffers */ + unsigned max_vertex_buffers:6; + + bool fallback_always; + bool fallback_only_for_user_vbuffers; }; -boolean u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps, - unsigned flags); +void u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps); struct u_vbuf * u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps); @@ -68,6 +71,7 @@ /* State and draw functions. */ void u_vbuf_set_vertex_elements(struct u_vbuf *mgr, unsigned count, const struct pipe_vertex_element *states); +void u_vbuf_unset_vertex_elements(struct u_vbuf *mgr); void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr, unsigned start_slot, unsigned count, const struct pipe_vertex_buffer *bufs); diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_bicubic_filter.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_bicubic_filter.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_bicubic_filter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_bicubic_filter.c 2020-06-12 01:21:16.000000000 +0000 @@ -423,7 +423,7 @@ } viewport.scale[2] = 1; - struct pipe_constant_buffer cb = {}; + struct pipe_constant_buffer cb = {0}; float *ptr = NULL; u_upload_alloc(filter->pipe->const_uploader, 0, 2 * sizeof(float), 256, diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_compositor_cs.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_compositor_cs.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_compositor_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_compositor_cs.c 2020-06-12 01:21:16.000000000 +0000 @@ -514,7 +514,9 @@ /* Scale */ "DIV TEMP[2], TEMP[2], CONST[3].zwzw\n" + "DIV TEMP[2], TEMP[2], IMM[1].xyxy\n" "DIV TEMP[3], TEMP[3], CONST[3].zwzw\n" + "DIV TEMP[3], TEMP[3], IMM[1].xyxy\n" /* Fetch texels */ "TEX_LZ TEMP[4].x, TEMP[2], SAMP[0], RECT\n" @@ -564,7 +566,9 @@ /* Scale */ "DIV TEMP[2], TEMP[2], CONST[3].zwzw\n" + "DIV TEMP[2], TEMP[2], IMM[1].xyxy\n" "DIV TEMP[3], TEMP[3], CONST[3].zwzw\n" + "DIV TEMP[3], TEMP[3], IMM[1].xyxy\n" /* Fetch texels */ "TEX_LZ TEMP[4].x, TEMP[2], SAMP[0], RECT\n" @@ -588,7 +592,7 @@ struct pipe_context *ctx = c->pipe; /* Bind the image */ - struct pipe_image_view image = {}; + struct pipe_image_view image = {0}; image.resource = c->fb_state.cbufs[0]->texture; image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ_WRITE; image.format = c->fb_state.cbufs[0]->texture->format; @@ -599,7 +603,7 @@ ctx->bind_compute_state(ctx, cs); /* Dispatch compute */ - struct pipe_grid_info info = {}; + struct pipe_grid_info info = {0}; info.block[0] = 8; info.block[1] = 8; info.block[2] = 1; @@ -741,7 +745,7 @@ return NULL; } - struct pipe_compute_state state = {}; + struct pipe_compute_state state = {0}; state.ir_type = PIPE_SHADER_IR_TGSI; state.prog = tokens; diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_deint_filter.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_deint_filter.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_deint_filter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_deint_filter.c 2020-06-12 01:21:16.000000000 +0000 @@ -47,7 +47,7 @@ #include "util/u_draw.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "vl_types.h" #include "vl_video_buffer.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_stubs.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_stubs.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_stubs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_stubs.c 2020-06-12 01:21:16.000000000 +0000 @@ -39,11 +39,11 @@ /* * vl_video_buffer stubs */ -const enum pipe_format * -vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format) +void +vl_get_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format, + enum pipe_format out_format[VL_NUM_COMPONENTS]) { assert(0); - return NULL; } bool diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_vertex_buffers.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_vertex_buffers.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_vertex_buffers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_vertex_buffers.c 2020-06-12 01:21:16.000000000 +0000 @@ -26,7 +26,7 @@ **************************************************************************/ #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "vl_vertex_buffers.h" #include "vl_types.h" diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_video_buffer.c mesa-20.0.8/src/gallium/auxiliary/vl/vl_video_buffer.c --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_video_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_video_buffer.c 2020-06-12 01:21:16.000000000 +0000 @@ -31,67 +31,13 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_sampler.h" #include "util/u_memory.h" #include "vl_video_buffer.h" -const enum pipe_format const_resource_formats_YV12[3] = { - PIPE_FORMAT_R8_UNORM, - PIPE_FORMAT_R8_UNORM, - PIPE_FORMAT_R8_UNORM -}; - -const enum pipe_format const_resource_formats_NV12[3] = { - PIPE_FORMAT_R8_UNORM, - PIPE_FORMAT_R8G8_UNORM, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_YUVA[3] = { - PIPE_FORMAT_R8G8B8A8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_VUYA[3] = { - PIPE_FORMAT_B8G8R8A8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_YUVX[3] = { - PIPE_FORMAT_R8G8B8X8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_VUYX[3] = { - PIPE_FORMAT_B8G8R8X8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_YUYV[3] = { - PIPE_FORMAT_R8G8_R8B8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_UYVY[3] = { - PIPE_FORMAT_G8R8_B8R8_UNORM, - PIPE_FORMAT_NONE, - PIPE_FORMAT_NONE -}; - -const enum pipe_format const_resource_formats_P016[3] = { - PIPE_FORMAT_R16_UNORM, - PIPE_FORMAT_R16G16_UNORM, - PIPE_FORMAT_NONE -}; - const unsigned const_resource_plane_order_YUV[3] = { 0, 1, @@ -104,40 +50,22 @@ 1 }; -const enum pipe_format * -vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format) +void +vl_get_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format, + enum pipe_format out_format[VL_NUM_COMPONENTS]) { - switch(format) { - case PIPE_FORMAT_YV12: - return const_resource_formats_YV12; - - case PIPE_FORMAT_NV12: - return const_resource_formats_NV12; - - case PIPE_FORMAT_R8G8B8A8_UNORM: - return const_resource_formats_YUVA; - - case PIPE_FORMAT_B8G8R8A8_UNORM: - return const_resource_formats_VUYA; - - case PIPE_FORMAT_R8G8B8X8_UNORM: - return const_resource_formats_YUVX; - - case PIPE_FORMAT_B8G8R8X8_UNORM: - return const_resource_formats_VUYX; - - case PIPE_FORMAT_YUYV: - return const_resource_formats_YUYV; - - case PIPE_FORMAT_UYVY: - return const_resource_formats_UYVY; - - case PIPE_FORMAT_P016: - return const_resource_formats_P016; + unsigned num_planes = util_format_get_num_planes(format); + unsigned i; - default: - return NULL; - } + for (i = 0; i < num_planes; i++) + out_format[i] = util_format_get_plane_format(format, i); + for (; i < VL_NUM_COMPONENTS; i++) + out_format[i] = PIPE_FORMAT_NONE; + + if (format == PIPE_FORMAT_YUYV) + out_format[0] = PIPE_FORMAT_R8G8_R8B8_UNORM; + else if (format == PIPE_FORMAT_UYVY) + out_format[0] = PIPE_FORMAT_G8R8_B8R8_UNORM; } const unsigned * @@ -152,6 +80,7 @@ case PIPE_FORMAT_B8G8R8A8_UNORM: case PIPE_FORMAT_YUYV: case PIPE_FORMAT_UYVY: + case PIPE_FORMAT_P010: case PIPE_FORMAT_P016: return const_resource_plane_order_YUV; @@ -178,12 +107,10 @@ enum pipe_video_profile profile, enum pipe_video_entrypoint entrypoint) { - const enum pipe_format *resource_formats; + enum pipe_format resource_formats[VL_NUM_COMPONENTS]; unsigned i; - resource_formats = vl_video_buffer_formats(screen, format); - if (!resource_formats) - return false; + vl_get_video_buffer_formats(screen, format, resource_formats); for (i = 0; i < VL_NUM_COMPONENTS; ++i) { enum pipe_format format = resource_formats[i]; @@ -328,7 +255,7 @@ struct vl_video_buffer *buf = (struct vl_video_buffer *)buffer; struct pipe_sampler_view sv_templ; struct pipe_context *pipe; - const enum pipe_format *sampler_format; + enum pipe_format sampler_format[VL_NUM_COMPONENTS]; const unsigned *plane_order; unsigned i, j, component; @@ -336,7 +263,7 @@ pipe = buf->base.context; - sampler_format = vl_video_buffer_formats(pipe->screen, buf->base.buffer_format); + vl_get_video_buffer_formats(pipe->screen, buf->base.buffer_format, sampler_format); plane_order = vl_video_buffer_plane_order(buf->base.buffer_format); for (component = 0, i = 0; i < buf->num_planes; ++i ) { @@ -416,7 +343,7 @@ vl_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl) { - const enum pipe_format *resource_formats; + enum pipe_format resource_formats[VL_NUM_COMPONENTS]; struct pipe_video_buffer templat, *result; bool pot_buffers; @@ -431,9 +358,7 @@ PIPE_VIDEO_CAP_NPOT_TEXTURES ); - resource_formats = vl_video_buffer_formats(pipe->screen, tmpl->buffer_format); - if (!resource_formats) - return NULL; + vl_get_video_buffer_formats(pipe->screen, tmpl->buffer_format, resource_formats); templat = *tmpl; templat.width = pot_buffers ? util_next_power_of_two(tmpl->width) @@ -531,3 +456,43 @@ return &buffer->base; } + +/* Create pipe_video_buffer by using resource_create with planar formats. */ +struct pipe_video_buffer * +vl_video_buffer_create_as_resource(struct pipe_context *pipe, + const struct pipe_video_buffer *tmpl) +{ + struct pipe_resource templ, *resources[VL_NUM_COMPONENTS] = {0}; + unsigned array_size = tmpl->interlaced ? 2 : 1; + + memset(&templ, 0, sizeof(templ)); + templ.target = array_size > 1 ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D; + templ.width0 = align(tmpl->width, VL_MACROBLOCK_WIDTH); + templ.height0 = align(tmpl->height / array_size, VL_MACROBLOCK_HEIGHT); + templ.depth0 = 1; + templ.array_size = array_size; + templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET | tmpl->bind; + templ.usage = PIPE_USAGE_DEFAULT; + + if (tmpl->buffer_format == PIPE_FORMAT_YUYV) + templ.format = PIPE_FORMAT_R8G8_R8B8_UNORM; + else if (tmpl->buffer_format == PIPE_FORMAT_UYVY) + templ.format = PIPE_FORMAT_G8R8_B8R8_UNORM; + else + templ.format = tmpl->buffer_format; + + resources[0] = pipe->screen->resource_create(pipe->screen, &templ); + if (!resources[0]) + return NULL; + + if (resources[0]->next) { + pipe_resource_reference(&resources[1], resources[0]->next); + if (resources[1]->next) + pipe_resource_reference(&resources[2], resources[1]->next); + } + + struct pipe_video_buffer vidtemplate = *tmpl; + vidtemplate.width = templ.width0; + vidtemplate.height = templ.height0 * array_size; + return vl_video_buffer_create_ex2(pipe, &vidtemplate, resources); +} diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_video_buffer.h mesa-20.0.8/src/gallium/auxiliary/vl/vl_video_buffer.h --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_video_buffer.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_video_buffer.h 2020-06-12 01:21:16.000000000 +0000 @@ -69,8 +69,9 @@ /** * get subformats for each plane */ -const enum pipe_format * -vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format); +void +vl_get_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format, + enum pipe_format out_format[VL_NUM_COMPONENTS]); /** * get YUV plane order @@ -144,4 +145,9 @@ const struct pipe_video_buffer *templat, struct pipe_resource *resources[VL_NUM_COMPONENTS]); +/* Create pipe_video_buffer by using resource_create with planar formats. */ +struct pipe_video_buffer * +vl_video_buffer_create_as_resource(struct pipe_context *pipe, + const struct pipe_video_buffer *tmpl); + #endif /* vl_video_buffer_h */ diff -Nru mesa-19.2.8/src/gallium/auxiliary/vl/vl_vlc.h mesa-20.0.8/src/gallium/auxiliary/vl/vl_vlc.h --- mesa-19.2.8/src/gallium/auxiliary/vl/vl_vlc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/auxiliary/vl/vl_vlc.h 2020-06-12 01:21:16.000000000 +0000 @@ -151,7 +151,7 @@ /* enough bytes in buffer, read in a whole dword */ uint64_t value = *(const uint32_t*)vlc->data; -#ifndef PIPE_ARCH_BIG_ENDIAN +#if !UTIL_ARCH_BIG_ENDIAN value = util_bswap32(value); #endif diff -Nru mesa-19.2.8/src/gallium/docs/source/screen.rst mesa-20.0.8/src/gallium/docs/source/screen.rst --- mesa-19.2.8/src/gallium/docs/source/screen.rst 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/docs/source/screen.rst 2020-06-12 01:21:16.000000000 +0000 @@ -34,6 +34,9 @@ bound. * ``PIPE_CAP_OCCLUSION_QUERY``: Whether occlusion queries are available. * ``PIPE_CAP_QUERY_TIME_ELAPSED``: Whether PIPE_QUERY_TIME_ELAPSED queries are available. +* ``PIPE_CAP_TEXTURE_SHADOW_MAP``: indicates whether the fragment shader hardware + can do the depth texture / Z comparison operation in TEX instructions + for shadow testing. * ``PIPE_CAP_TEXTURE_SWIZZLE``: Whether swizzling through sampler views is supported. * ``PIPE_CAP_MAX_TEXTURE_2D_SIZE``: The maximum size of 2D (and 1D) textures. @@ -523,8 +526,7 @@ A driver might rely on the input mapping that was defined with the original GLSL code. * ``PIPE_CAP_IMAGE_LOAD_FORMATTED``: True if a format for image loads does not need to be specified in the shader IR -* ``PIPE_CAP_MAX_FRAMES_IN_FLIGHT``: Maximum number of frames that state - trackers should allow to be in flight before throttling pipe_context +* ``PIPE_CAP_THROTTLE``: Whether or not state trackers should throttle pipe_context execution. 0 = throttling is disabled. * ``PIPE_CAP_DMABUF``: Whether Linux DMABUF handles are supported by resource_from_handle and resource_get_handle. @@ -548,6 +550,23 @@ types with texture functions having interaction with LOD of texture lookup. * ``PIPE_CAP_SHADER_SAMPLES_IDENTICAL``: True if the driver supports a shader query to tell whether all samples of a multisampled surface are definitely identical. * ``PIPE_CAP_TGSI_ATOMINC_WRAP``: Atomic increment/decrement + wrap around are supported. +* ``PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF``: True if the state tracker should + turn arrays whose contents can be deduced at compile time into constant + buffer loads, or false if the driver can handle such arrays itself in a more + efficient manner. +* ``PIPE_CAP_GL_SPIRV``: True if the driver supports ARB_gl_spirv extension. +* ``PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS``: True if the driver supports Variable Pointers in SPIR-V shaders. +* ``PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION``: True if driver supports demote keyword in GLSL programs. +* ``PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE``: True if driver wants the TG4 component encoded in sampler swizzle rather than as a separate source. +* ``PIPE_CAP_FLATSHADE``: Driver supports pipe_rasterizer_state::flatshade. +* ``PIPE_CAP_ALPHA_TEST``: Driver supports alpha-testing. +* ``PIPE_CAP_POINT_SIZE_FIXED``: Driver supports point-sizes that are fixed, + as opposed to writing gl_PointSize for every point. +* ``PIPE_CAP_TWO_SIDED_COLOR``: Driver supports two-sided coloring. +* ``PIPE_CAP_CLIP_PLANES``: Driver supports user-defined clip-planes. +* ``PIPE_CAP_MAX_VERTEX_BUFFERS``: Number of supported vertex buffers. +* ``PIPE_CAP_OPENCL_INTEGER_FUNCTIONS``: Driver supports extended OpenCL-style integer functions. This includes averge, saturating additiong, saturating subtraction, absolute difference, count leading zeros, and count trailing zeros. +* ``PIPE_CAP_INTEGER_MULTIPLY_32X16``: Driver supports integer multiplication between a 32-bit integer and a 16-bit integer. If the second operand is 32-bits, the upper 16-bits are ignored, and the low 16-bits are possibly sign extended as necessary. .. _pipe_capf: @@ -655,7 +674,6 @@ how many HW counters are available for this stage. (0 uses SSBO atomics). * ``PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS``: If atomic counters are separate, how many atomic counter buffers are available for this stage. -* ``PIPE_SHADER_CAP_SCALAR_ISA``: Whether the ISA is a scalar one. .. _pipe_compute_cap: diff -Nru mesa-19.2.8/src/gallium/docs/source/tgsi.rst mesa-20.0.8/src/gallium/docs/source/tgsi.rst --- mesa-19.2.8/src/gallium/docs/source/tgsi.rst 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/docs/source/tgsi.rst 2020-06-12 01:21:16.000000000 +0000 @@ -681,6 +681,27 @@ Unconditional discard. Allowed in fragment shaders only. +.. opcode:: DEMOTE - Demote Invocation to a Helper + + This demotes the current invocation to a helper, but continues + execution (while KILL may or may not terminate the + invocation). After this runs, all the usual helper invocation rules + apply about discarding buffer and render target writes. This is + useful for having accurate derivatives in the other invocations + which have not been demoted. + + Allowed in fragment shaders only. + + +.. opcode:: READ_HELPER - Reads Invocation Helper Status + + This is identical to ``TGSI_SEMANTIC_HELPER_INVOCATION``, except + this will read the current value, which might change as a result of + a ``DEMOTE`` instruction. + + Allowed in fragment shaders only. + + .. opcode:: TXB - Texture Lookup With Bias for cube map array textures and shadow cube maps, the bias value @@ -941,14 +962,22 @@ require another CAP is hw can do it natively. For now we lower that before TGSI. + PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE changes the encoding so that component + is stored in the sampler source swizzle x. + .. math:: coord = src0 + (without TGSI_TG4_COMPONENT_IN_SWIZZLE) component = src1 dst = texture\_gather4 (unit, coord, component) + (with TGSI_TG4_COMPONENT_IN_SWIZZLE) + dst = texture\_gather4 (unit, coord) + component is encoded in sampler swizzle. + (with SM5 - cube array shadow) .. math:: diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/Android.mk mesa-20.0.8/src/gallium/drivers/etnaviv/Android.mk --- mesa-19.2.8/src/gallium/drivers/etnaviv/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/Android.mk 2020-06-12 01:21:16.000000000 +0000 @@ -28,7 +28,10 @@ LOCAL_SRC_FILES := \ $(C_SOURCES) -LOCAL_SHARED_LIBRARIES := libdrm_etnaviv +LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) + +LOCAL_SHARED_LIBRARIES := libdrm +LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm LOCAL_MODULE := libmesa_pipe_etnaviv include $(GALLIUM_COMMON_MK) diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_asm.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_asm.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_asm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_asm.h 2020-06-12 01:21:16.000000000 +0000 @@ -29,6 +29,7 @@ #include #include +#include "util/u_math.h" #include "hw/isa.xml.h" /* Size of an instruction in 32-bit words */ @@ -145,6 +146,21 @@ }; } +static inline struct etna_inst_src +etna_immediate_float(float x) +{ + uint32_t bits = fui(x); + assert((bits & 0xfff) == 0); /* 12 lsb cut off */ + return etna_immediate_src(0, bits >> 12); +} + +static inline struct etna_inst_src +etna_immediate_int(int x) +{ + assert(x >= -0x80000 && x < 0x80000); /* 20-bit signed int */ + return etna_immediate_src(1, x); +} + /** * Build vivante instruction from structure with * opcode, cond, sat, dst_use, dst_amode, diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_blend.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_blend.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_blend.c 2020-06-12 01:21:16.000000000 +0000 @@ -123,7 +123,7 @@ uint32_t colormask; if (pfb->cbufs[0] && - translate_rs_format_rb_swap(pfb->cbufs[0]->format)) { + translate_pe_format_rb_swap(pfb->cbufs[0]->format)) { colormask = rt0->colormask & (PIPE_MASK_A | PIPE_MASK_G); if (rt0->colormask & PIPE_MASK_R) colormask |= PIPE_MASK_B; @@ -164,7 +164,7 @@ { struct pipe_framebuffer_state *pfb = &ctx->framebuffer_s; struct compiled_blend_color *cs = &ctx->blend_color; - bool rb_swap = (pfb->cbufs[0] && translate_rs_format_rb_swap(pfb->cbufs[0]->format)); + bool rb_swap = (pfb->cbufs[0] && translate_pe_format_rb_swap(pfb->cbufs[0]->format)); cs->PE_ALPHA_BLEND_COLOR = VIVS_PE_ALPHA_BLEND_COLOR_R(etna_cfloat_to_uint8(cs->color[rb_swap ? 2 : 0])) | diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_blt.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_blt.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_blt.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_blt.c 2020-06-12 01:21:16.000000000 +0000 @@ -49,8 +49,21 @@ #include -/* Currently, used BLT formats overlap 100% with RS formats */ -#define translate_blt_format translate_rs_format +static uint32_t +etna_compatible_blt_format(enum pipe_format fmt) +{ + /* YUYV and UYVY are blocksize 4, but 2 bytes per pixel */ + if (fmt == PIPE_FORMAT_YUYV || fmt == PIPE_FORMAT_UYVY) + return BLT_FORMAT_R8G8; + + switch (util_format_get_blocksize(fmt)) { + case 1: return BLT_FORMAT_R8; + case 2: return BLT_FORMAT_R8G8; + case 4: return BLT_FORMAT_A8R8G8B8; + case 8: return BLT_FORMAT_A16R16G16B16; + default: return ETNA_NO_MATCH; + } +} static inline uint32_t blt_compute_stride_bits(const struct blt_imginfo *img) @@ -202,7 +215,7 @@ { struct etna_context *ctx = etna_context(pctx); struct etna_surface *surf = etna_surface(dst); - uint32_t new_clear_value = etna_clear_blit_pack_rgba(surf->base.format, color->f); + uint64_t new_clear_value = etna_clear_blit_pack_rgba(surf->base.format, color); struct etna_resource *res = etna_resource(surf->base.texture); struct blt_clear_op clr = {}; @@ -219,13 +232,13 @@ clr.dest.ts_addr.offset = 0; clr.dest.ts_addr.flags = ETNA_RELOC_WRITE; clr.dest.ts_clear_value[0] = new_clear_value; - clr.dest.ts_clear_value[1] = new_clear_value; + clr.dest.ts_clear_value[1] = new_clear_value >> 32; clr.dest.ts_mode = surf->level->ts_mode; clr.dest.ts_compress_fmt = surf->level->ts_compress_fmt; } clr.clear_value[0] = new_clear_value; - clr.clear_value[1] = new_clear_value; + clr.clear_value[1] = new_clear_value >> 32; clr.clear_bits[0] = 0xffffffff; /* TODO: Might want to clear only specific channels? */ clr.clear_bits[1] = 0xffffffff; clr.rect_x = 0; /* What about scissors? */ @@ -238,7 +251,9 @@ /* This made the TS valid */ if (surf->surf.ts_size) { ctx->framebuffer.TS_COLOR_CLEAR_VALUE = new_clear_value; + ctx->framebuffer.TS_COLOR_CLEAR_VALUE_EXT = new_clear_value >> 32; surf->level->ts_valid = true; + ctx->dirty |= ETNA_DIRTY_TS | ETNA_DIRTY_DERIVE_TS; } surf->level->clear_value = new_clear_value; @@ -276,6 +291,10 @@ if (buffers & PIPE_CLEAR_STENCIL) new_clear_bits |= clear_bits_stencil; + /* if all bits are cleared, update TS clear value */ + if (new_clear_bits == 0xffffffff) + surf->level->clear_value = new_clear_value; + /* TODO unduplicate this */ struct etna_resource *res = etna_resource(surf->base.texture); struct blt_clear_op clr = {}; @@ -291,8 +310,8 @@ clr.dest.ts_addr.bo = res->ts_bo; clr.dest.ts_addr.offset = 0; clr.dest.ts_addr.flags = ETNA_RELOC_WRITE; - clr.dest.ts_clear_value[0] = new_clear_value; - clr.dest.ts_clear_value[1] = new_clear_value; + clr.dest.ts_clear_value[0] = surf->level->clear_value; + clr.dest.ts_clear_value[1] = surf->level->clear_value; clr.dest.ts_mode = surf->level->ts_mode; clr.dest.ts_compress_fmt = surf->level->ts_compress_fmt; } @@ -310,11 +329,11 @@ /* This made the TS valid */ if (surf->surf.ts_size) { - ctx->framebuffer.TS_DEPTH_CLEAR_VALUE = new_clear_value; + ctx->framebuffer.TS_DEPTH_CLEAR_VALUE = surf->level->clear_value; surf->level->ts_valid = true; + ctx->dirty |= ETNA_DIRTY_TS | ETNA_DIRTY_DERIVE_TS; } - surf->level->clear_value = new_clear_value; resource_written(ctx, surf->base.texture); etna_resource(surf->base.texture)->seqno++; } @@ -324,6 +343,7 @@ const union pipe_color_union *color, double depth, unsigned stencil) { struct etna_context *ctx = etna_context(pctx); + mtx_lock(&ctx->lock); etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE, 0x00000c23); etna_set_state(ctx->stream, VIVS_TS_FLUSH_CACHE, VIVS_TS_FLUSH_CACHE_FLUSH); @@ -344,9 +364,9 @@ etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE, 0x00000c23); else etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE, 0x00000002); + mtx_unlock(&ctx->lock); } - static bool etna_try_blt_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) @@ -360,7 +380,7 @@ assert(blit_info->src.level <= src->base.last_level); assert(blit_info->dst.level <= dst->base.last_level); - if (!translate_samples_to_xyscale(src->base.nr_samples, &msaa_xscale, &msaa_yscale, NULL)) + if (!translate_samples_to_xyscale(src->base.nr_samples, &msaa_xscale, &msaa_yscale)) return false; /* The width/height are in pixels; they do not change as a result of @@ -383,21 +403,20 @@ return false; } - /* TODO: 1 byte per pixel formats aren't handled by etna_compatible_rs_format nor - * translate_rs_format. + /* Only support same format (used tiling/detiling) blits for now. + * TODO: figure out which different-format blits are possible and test them + * - need to use correct swizzle + * - set sRGB bits correctly + * - avoid trying to convert between float/int formats? */ - unsigned src_format = blit_info->src.format; - unsigned dst_format = blit_info->dst.format; + if (blit_info->src.format != blit_info->dst.format) + return false; - /* for a copy with same dst/src format, we can use a different format */ - if (translate_blt_format(src_format) == ETNA_NO_MATCH && - src_format == dst_format) { - src_format = dst_format = etna_compatible_rs_format(src_format); - } + uint32_t format = etna_compatible_blt_format(blit_info->dst.format); + if (format == ETNA_NO_MATCH) + return false; - if (translate_blt_format(src_format) == ETNA_NO_MATCH || - translate_blt_format(dst_format) == ETNA_NO_MATCH || - blit_info->scissor_enable || + if (blit_info->scissor_enable || blit_info->dst.box.depth != blit_info->src.box.depth || blit_info->dst.box.depth != 1) { return false; @@ -416,6 +435,7 @@ return true; } + mtx_lock(&ctx->lock); /* Kick off BLT here */ if (src == dst && src_lev->ts_compress_fmt < 0) { /* Resolve-in-place */ @@ -443,13 +463,11 @@ op.src.addr.bo = src->bo; op.src.addr.offset = src_lev->offset + blit_info->src.box.z * src_lev->layer_stride; op.src.addr.flags = ETNA_RELOC_READ; - op.src.format = translate_blt_format(src_format); + op.src.format = format; op.src.stride = src_lev->stride; op.src.tiling = src->layout; - const struct util_format_description *src_format_desc = - util_format_description(src_format); for (unsigned x=0; x<4; ++x) - op.src.swizzle[x] = src_format_desc->swizzle[x]; + op.src.swizzle[x] = x; if (src_lev->ts_size && src_lev->ts_valid) { op.src.use_ts = 1; @@ -465,13 +483,11 @@ op.dest.addr.bo = dst->bo; op.dest.addr.offset = dst_lev->offset + blit_info->dst.box.z * dst_lev->layer_stride; op.dest.addr.flags = ETNA_RELOC_WRITE; - op.dest.format = translate_blt_format(dst_format); + op.dest.format = format; op.dest.stride = dst_lev->stride; op.dest.tiling = dst->layout; - const struct util_format_description *dst_format_desc = - util_format_description(dst_format); for (unsigned x=0; x<4; ++x) - op.dest.swizzle[x] = dst_format_desc->swizzle[x]; + op.dest.swizzle[x] = x; op.dest_x = blit_info->dst.box.x; op.dest_y = blit_info->dst.box.y; @@ -510,6 +526,7 @@ dst->seqno++; dst_lev->ts_valid = false; + mtx_unlock(&ctx->lock); return true; } diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.c 2020-06-12 01:21:16.000000000 +0000 @@ -66,15 +66,29 @@ ctx->num_fragment_sampler_views, ctx->sampler_view); } -uint32_t -etna_clear_blit_pack_rgba(enum pipe_format format, const float *rgba) +uint64_t +etna_clear_blit_pack_rgba(enum pipe_format format, const union pipe_color_union *color) { union util_color uc; - util_pack_color(rgba, format, &uc); - if (util_format_get_blocksize(format) == 2) - return uc.ui[0] << 16 | (uc.ui[0] & 0xffff); - else - return uc.ui[0]; + + if (util_format_is_pure_uint(format)) { + util_format_write_4ui(format, color->ui, 0, &uc, 0, 0, 0, 1, 1); + } else if (util_format_is_pure_sint(format)) { + util_format_write_4i(format, color->i, 0, &uc, 0, 0, 0, 1, 1); + } else { + util_pack_color(color->f, format, &uc); + } + + switch (util_format_get_blocksize(format)) { + case 1: + uc.ui[0] = uc.ui[0] << 8 | (uc.ui[0] & 0xff); + case 2: + uc.ui[0] = uc.ui[0] << 16 | (uc.ui[0] & 0xffff); + case 4: + uc.ui[1] = uc.ui[0]; + default: + return (uint64_t) uc.ui[1] << 32 | uc.ui[0]; + } } static void @@ -114,9 +128,6 @@ { struct etna_context *ctx = etna_context(pctx); - /* The resource must be of the same format. */ - assert(src->format == dst->format); - /* XXX we can use the RS as a literal copy engine here * the only complexity is tiling; the size of the boxes needs to be aligned * to the tile size @@ -141,10 +152,10 @@ { struct etna_resource *rsc = etna_resource(prsc); - if (rsc->external) { - if (etna_resource_older(etna_resource(rsc->external), rsc)) { - etna_copy_resource(pctx, rsc->external, prsc, 0, 0); - etna_resource(rsc->external)->seqno = rsc->seqno; + if (rsc->render) { + if (etna_resource_older(rsc, etna_resource(rsc->render))) { + etna_copy_resource(pctx, prsc, rsc->render, 0, 0); + rsc->seqno = etna_resource(rsc->render)->seqno; } } else if (etna_resource_needs_flush(rsc)) { etna_copy_resource(pctx, prsc, prsc, 0, 0); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_clear_blit.h 2020-06-12 01:21:16.000000000 +0000 @@ -36,7 +36,7 @@ void etna_rs_gen_clear_surface(struct etna_context *ctx, struct etna_surface *surf, - uint32_t clear_value); + uint64_t clear_value); void etna_copy_resource(struct pipe_context *pctx, struct pipe_resource *dst, @@ -50,8 +50,8 @@ void etna_blit_save_state(struct etna_context *ctx); -uint32_t -etna_clear_blit_pack_rgba(enum pipe_format format, const float *rgba); +uint64_t +etna_clear_blit_pack_rgba(enum pipe_format format, const union pipe_color_union *color); void etna_clear_blit_init(struct pipe_context *pctx); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c 2020-06-12 01:21:16.000000000 +0000 @@ -49,9 +49,6 @@ const struct etna_specs *specs; struct etna_shader_variant *variant; - /* register assigned to each output, indexed by driver_location */ - unsigned output_reg[ETNA_NUM_INPUTS]; - /* block # to instr index */ unsigned *block_ptr; @@ -59,34 +56,19 @@ int inst_ptr; /* current instruction pointer */ struct etna_inst code[ETNA_MAX_INSTRUCTIONS * ETNA_INST_SIZE]; + /* constants */ + uint64_t consts[ETNA_MAX_IMM]; + /* There was an error during compilation */ bool error; }; -#define compile_error(ctx, args...) ({ \ - printf(args); \ - ctx->error = true; \ - assert(0); \ -}) - /* io related lowering * run after lower_int_to_float because it adds i2f/f2i ops */ static void etna_lower_io(nir_shader *shader, struct etna_shader_variant *v) { - bool rb_swap = shader->info.stage == MESA_SHADER_FRAGMENT && v->key.frag_rb_swap; - - unsigned color_location = 0; - nir_foreach_variable(var, &shader->outputs) { - switch (var->data.location) { - case FRAG_RESULT_COLOR: - case FRAG_RESULT_DATA0: - color_location = var->data.driver_location; - break; - } - } - nir_foreach_function(function, shader) { nir_builder b; nir_builder_init(&b, function->impl); @@ -98,39 +80,77 @@ switch (intr->intrinsic) { case nir_intrinsic_load_front_face: { - /* front face inverted (run after int_to_float, so invert as float) */ + /* HW front_face is 0.0/1.0, not 0/~0u for bool + * lower with a comparison with 0 + */ + intr->dest.ssa.bit_size = 32; + b.cursor = nir_after_instr(instr); - nir_ssa_def *ssa = nir_seq(&b, &intr->dest.ssa, nir_imm_float(&b, 0.0)); + nir_ssa_def *ssa = nir_ine(&b, &intr->dest.ssa, nir_imm_int(&b, 0)); + if (v->key.front_ccw) + nir_instr_as_alu(ssa->parent_instr)->op = nir_op_ieq; + nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, nir_src_for_ssa(ssa), ssa->parent_instr); } break; - case nir_intrinsic_store_output: { - if (!rb_swap || nir_intrinsic_base(intr) != color_location) + case nir_intrinsic_store_deref: { + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + if (shader->info.stage != MESA_SHADER_FRAGMENT || !v->key.frag_rb_swap) break; + + assert(deref->deref_type == nir_deref_type_var); + + if (deref->var->data.location != FRAG_RESULT_COLOR && + deref->var->data.location != FRAG_RESULT_DATA0) + break; + b.cursor = nir_before_instr(instr); - nir_ssa_def *ssa = nir_mov(&b, intr->src[0].ssa); + nir_ssa_def *ssa = nir_mov(&b, intr->src[1].ssa); nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr); alu->src[0].swizzle[0] = 2; alu->src[0].swizzle[2] = 0; - nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(ssa)); - } break; - case nir_intrinsic_load_instance_id: { - b.cursor = nir_after_instr(instr); - nir_ssa_def *ssa = nir_i2f32(&b, &intr->dest.ssa); - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, - nir_src_for_ssa(ssa), - ssa->parent_instr); + nir_instr_rewrite_src(instr, &intr->src[1], nir_src_for_ssa(ssa)); } break; case nir_intrinsic_load_uniform: { - /* multiply by 16 and convert to int */ + /* convert indirect load_uniform to load_ubo when possible + * this is required on HALTI5+ because address register is not implemented + * address register loads also arent done optimally + */ + if (v->shader->specs->halti < 2 || nir_src_is_const(intr->src[0])) + break; + + nir_intrinsic_instr *load_ubo = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo); + load_ubo->num_components = intr->num_components; + nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest, + load_ubo->num_components, 32, NULL); + b.cursor = nir_before_instr(instr); - nir_ssa_def *ssa = nir_f2u32(&b, nir_fmul(&b, intr->src[0].ssa, - nir_imm_float(&b, 16.0f))); - nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(ssa)); + load_ubo->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + load_ubo->src[1] = nir_src_for_ssa(nir_iadd(&b, + nir_imul(&b, intr->src[0].ssa, nir_imm_int(&b, 16)), + nir_imm_int(&b, nir_intrinsic_base(intr) * 16))); + nir_builder_instr_insert(&b, &load_ubo->instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, + nir_src_for_ssa(&load_ubo->dest.ssa)); + nir_instr_remove(&intr->instr); } break; + case nir_intrinsic_load_ubo: { + nir_const_value *idx = nir_src_as_const_value(intr->src[0]); + assert(idx); + /* offset index by 1, index 0 is used for converted load_uniform */ + b.cursor = nir_before_instr(instr); + nir_instr_rewrite_src(instr, &intr->src[0], + nir_src_for_ssa(nir_imm_int(&b, idx[0].u32 + 1))); + } break; + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_instance_id: + /* detect use of vertex_id/instance_id */ + v->vs_id_in_reg = v->infile.num_reg; + break; default: break; } @@ -157,6 +177,8 @@ lod_bias = &tex->src[i].src; lod_bias_idx = i; break; + case nir_tex_src_comparator: + break; default: assert(0); break; @@ -208,25 +230,49 @@ } } -static void -etna_lower_alu_to_scalar(nir_shader *shader, const struct etna_specs *specs) +static bool +etna_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data) { - BITSET_DECLARE(scalar_ops, nir_num_opcodes); - BITSET_ZERO(scalar_ops); + const struct etna_specs *specs = data; - BITSET_SET(scalar_ops, nir_op_frsq); - BITSET_SET(scalar_ops, nir_op_frcp); - BITSET_SET(scalar_ops, nir_op_flog2); - BITSET_SET(scalar_ops, nir_op_fexp2); - BITSET_SET(scalar_ops, nir_op_fsqrt); - BITSET_SET(scalar_ops, nir_op_fcos); - BITSET_SET(scalar_ops, nir_op_fsin); - BITSET_SET(scalar_ops, nir_op_fdiv); + if (instr->type != nir_instr_type_alu) + return false; - if (!specs->has_halti2_instructions) - BITSET_SET(scalar_ops, nir_op_fdot2); + nir_alu_instr *alu = nir_instr_as_alu(instr); + switch (alu->op) { + case nir_op_frsq: + case nir_op_frcp: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsqrt: + case nir_op_fcos: + case nir_op_fsin: + case nir_op_fdiv: + case nir_op_imul: + return true; + /* TODO: can do better than alu_to_scalar for vector compares */ + case nir_op_b32all_fequal2: + case nir_op_b32all_fequal3: + case nir_op_b32all_fequal4: + case nir_op_b32any_fnequal2: + case nir_op_b32any_fnequal3: + case nir_op_b32any_fnequal4: + case nir_op_b32all_iequal2: + case nir_op_b32all_iequal3: + case nir_op_b32all_iequal4: + case nir_op_b32any_inequal2: + case nir_op_b32any_inequal3: + case nir_op_b32any_inequal4: + return true; + case nir_op_fdot2: + if (!specs->has_halti2_instructions) + return true; + break; + default: + break; + } - nir_lower_alu_to_scalar(shader, scalar_ops); + return false; } static void @@ -332,7 +378,11 @@ INST_TYPE_##type \ } #define OPC(nir, op, src, cond) OPCT(nir, op, src, cond, F32) +#define IOPC(nir, op, src, cond) OPCT(nir, op, src, cond, S32) +#define UOPC(nir, op, src, cond) OPCT(nir, op, src, cond, U32) #define OP(nir, op, src) OPC(nir, op, src, TRUE) +#define IOP(nir, op, src) IOPC(nir, op, src, TRUE) +#define UOP(nir, op, src) UOPC(nir, op, src, TRUE) OP(mov, MOV, X_X_0), OP(fneg, MOV, X_X_0), OP(fabs, MOV, X_X_0), OP(fsat, MOV, X_X_0), OP(fmul, MUL, 0_1_X), OP(fadd, ADD, 0_X_1), OP(ffma, MAD, 0_1_2), OP(fdot2, DP2, 0_1_X), OP(fdot3, DP3, 0_1_X), OP(fdot4, DP4, 0_1_X), @@ -346,9 +396,51 @@ OP(fdiv, DIV, 0_1_X), OP(fddx, DSX, 0_X_0), OP(fddy, DSY, 0_X_0), - /* integer opcodes */ - OPCT(i2f32, I2F, 0_X_X, TRUE, S32), - OPCT(f2u32, F2I, 0_X_X, TRUE, U32), + /* type convert */ + IOP(i2f32, I2F, 0_X_X), + UOP(u2f32, I2F, 0_X_X), + IOP(f2i32, F2I, 0_X_X), + UOP(f2u32, F2I, 0_X_X), + UOP(b2f32, AND, 0_X_X), /* AND with fui(1.0f) */ + UOP(b2i32, AND, 0_X_X), /* AND with 1 */ + OPC(f2b32, CMP, 0_X_X, NE), /* != 0.0 */ + UOPC(i2b32, CMP, 0_X_X, NE), /* != 0 */ + + /* arithmetic */ + IOP(iadd, ADD, 0_X_1), + IOP(imul, IMULLO0, 0_1_X), + /* IOP(imad, IMADLO0, 0_1_2), */ + IOP(ineg, ADD, X_X_0), /* ADD 0, -x */ + IOP(iabs, IABS, X_X_0), + IOP(isign, SIGN, X_X_0), + IOPC(imin, SELECT, 0_1_0, GT), + IOPC(imax, SELECT, 0_1_0, LT), + UOPC(umin, SELECT, 0_1_0, GT), + UOPC(umax, SELECT, 0_1_0, LT), + + /* select */ + UOPC(b32csel, SELECT, 0_1_2, NZ), + + /* compare with int result */ + OPC(feq32, CMP, 0_1_X, EQ), + OPC(fne32, CMP, 0_1_X, NE), + OPC(fge32, CMP, 0_1_X, GE), + OPC(flt32, CMP, 0_1_X, LT), + IOPC(ieq32, CMP, 0_1_X, EQ), + IOPC(ine32, CMP, 0_1_X, NE), + IOPC(ige32, CMP, 0_1_X, GE), + IOPC(ilt32, CMP, 0_1_X, LT), + UOPC(uge32, CMP, 0_1_X, GE), + UOPC(ult32, CMP, 0_1_X, LT), + + /* bit ops */ + IOP(ior, OR, 0_X_1), + IOP(iand, AND, 0_X_1), + IOP(ixor, XOR, 0_X_1), + IOP(inot, NOT, X_X_0), + IOP(ishl, LSHIFT, 0_X_1), + IOP(ishr, RSHIFT, 0_X_1), + UOP(ushr, RSHIFT, 0_X_1), }; static void @@ -362,6 +454,7 @@ struct etna_inst_src src[3], bool saturate) { struct etna_op_info ei = etna_ops[op]; + unsigned swiz_scalar = INST_SWIZ_BROADCAST(ffs(dst.write_mask) - 1); assert(ei.opcode != 0xff); @@ -385,17 +478,36 @@ case nir_op_frcp: case nir_op_fexp2: case nir_op_fsqrt: - case nir_op_i2f32: - case nir_op_f2u32: - /* for these instructions we want src to be in x component - * note: on HALTI2+ i2f/f2u are not scalar but we only use them this way currently - */ - src[0].swiz = inst_swiz_compose(src[0].swiz, - INST_SWIZ_BROADCAST(ffs(inst.dst.write_mask)-1)); + case nir_op_imul: + /* scalar instructions we want src to be in x component */ + src[0].swiz = inst_swiz_compose(src[0].swiz, swiz_scalar); + src[1].swiz = inst_swiz_compose(src[1].swiz, swiz_scalar); + break; + /* deal with instructions which don't have 1:1 mapping */ + case nir_op_b2f32: + inst.src[2] = etna_immediate_float(1.0f); + break; + case nir_op_b2i32: + inst.src[2] = etna_immediate_int(1); + break; + case nir_op_f2b32: + inst.src[1] = etna_immediate_float(0.0f); + break; + case nir_op_i2b32: + inst.src[1] = etna_immediate_int(0); + break; + case nir_op_ineg: + inst.src[0] = etna_immediate_int(0); + src[0].neg = 1; + break; default: break; } + /* set the "true" value for CMP instructions */ + if (inst.opcode == INST_OPCODE_CMP) + inst.src[2] = etna_immediate_int(-1); + for (unsigned j = 0; j < 3; j++) { unsigned i = ((ei.src >> j*2) & 3); if (i < 3) @@ -408,7 +520,7 @@ static void etna_emit_tex(struct etna_compile *c, nir_texop op, unsigned texid, unsigned dst_swiz, struct etna_inst_dst dst, struct etna_inst_src coord, - struct etna_inst_src lod_bias) + struct etna_inst_src lod_bias, struct etna_inst_src compare) { struct etna_inst inst = { .dst = dst, @@ -420,6 +532,9 @@ if (lod_bias.use) inst.src[1] = lod_bias; + if (compare.use) + inst.src[2] = compare; + switch (op) { case nir_texop_tex: inst.opcode = INST_OPCODE_TEXLD; break; case nir_texop_txb: inst.opcode = INST_OPCODE_TEXLDB; break; @@ -460,7 +575,8 @@ struct etna_inst inst = { .opcode = INST_OPCODE_TEXKILL, - .cond = INST_CONDITION_GZ, + .cond = INST_CONDITION_NZ, + .type = (c->specs->halti < 2) ? INST_TYPE_F32 : INST_TYPE_U32, .src[0] = condition, }; inst.src[0].swiz = INST_SWIZ_BROADCAST(inst.src[0].swiz & 3); @@ -468,22 +584,39 @@ } static void -etna_emit_output(struct etna_compile *c, unsigned index, struct etna_inst_src src) +etna_emit_output(struct etna_compile *c, nir_variable *var, struct etna_inst_src src) { - c->output_reg[index] = src.reg; -} + struct etna_shader_io_file *sf = &c->variant->outfile; -static void -etna_emit_load_ubo(struct etna_compile *c, struct etna_inst_dst dst, - struct etna_inst_src src, struct etna_inst_src base) -{ - emit_inst(c, &(struct etna_inst) { - .opcode = INST_OPCODE_LOAD, - .type = INST_TYPE_U32, - .dst = dst, - .src[0] = src, - .src[1] = base, - }); + if (is_fs(c)) { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */ + c->variant->ps_color_out_reg = src.reg; + break; + case FRAG_RESULT_DEPTH: + c->variant->ps_depth_out_reg = src.reg; + break; + default: + unreachable("Unsupported fs output"); + } + return; + } + + switch (var->data.location) { + case VARYING_SLOT_POS: + c->variant->vs_pos_out_reg = src.reg; + break; + case VARYING_SLOT_PSIZ: + c->variant->vs_pointsize_out_reg = src.reg; + break; + default: + sf->reg[sf->num_reg].reg = src.reg; + sf->reg[sf->num_reg].slot = var->data.location; + sf->reg[sf->num_reg].num_components = glsl_get_components(var->type); + sf->num_reg++; + break; + } } #define OPT(nir, pass, ...) ({ \ @@ -584,7 +717,7 @@ unsigned idx = var->data.driver_location; sf->reg[idx].reg = idx; sf->reg[idx].slot = var->data.location; - sf->reg[idx].num_components = 4; /* TODO */ + sf->reg[idx].num_components = glsl_get_components(var->type); sf->num_reg = MAX2(sf->num_reg, idx+1); } } else { @@ -593,47 +726,56 @@ unsigned idx = var->data.driver_location; sf->reg[idx].reg = idx + 1; sf->reg[idx].slot = var->data.location; - sf->reg[idx].num_components = 4; /* TODO */ + sf->reg[idx].num_components = glsl_get_components(var->type); sf->num_reg = MAX2(sf->num_reg, idx+1); count++; } assert(sf->num_reg == count); } - NIR_PASS_V(s, nir_lower_io, nir_var_all, etna_glsl_type_size, + NIR_PASS_V(s, nir_lower_io, ~nir_var_shader_out, etna_glsl_type_size, (nir_lower_io_options)0); OPT_V(s, nir_lower_regs_to_ssa); OPT_V(s, nir_lower_vars_to_ssa); OPT_V(s, nir_lower_indirect_derefs, nir_var_all); OPT_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u }); - OPT_V(s, etna_lower_alu_to_scalar, specs); + OPT_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs); etna_optimize_loop(s); - /* use opt_algebraic between int_to_float and boot_to_float because - * int_to_float emits ftrunc, and ftrunc lowering generates bool ops - */ - OPT_V(s, nir_lower_int_to_float); - OPT_V(s, nir_opt_algebraic); - OPT_V(s, nir_lower_bool_to_float); - - /* after int to float because insert i2f for instance_id */ OPT_V(s, etna_lower_io, v); + if (v->shader->specs->vs_need_z_div) + NIR_PASS_V(s, nir_lower_clip_halfz); + + /* lower pre-halti2 to float (halti0 has integers, but only scalar..) */ + if (c->specs->halti < 2) { + /* use opt_algebraic between int_to_float and boot_to_float because + * int_to_float emits ftrunc, and ftrunc lowering generates bool ops + */ + OPT_V(s, nir_lower_int_to_float); + OPT_V(s, nir_opt_algebraic); + OPT_V(s, nir_lower_bool_to_float); + } else { + OPT_V(s, nir_lower_idiv, nir_lower_idiv_fast); + OPT_V(s, nir_lower_bool_to_int32); + } + etna_optimize_loop(s); if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) nir_print_shader(s, stdout); while( OPT(s, nir_opt_vectorize) ); - OPT_V(s, etna_lower_alu_to_scalar, specs); + OPT_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs); NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp); NIR_PASS_V(s, nir_opt_algebraic_late); NIR_PASS_V(s, nir_move_vec_src_uses_to_dest); NIR_PASS_V(s, nir_copy_prop); + /* only HW supported integer source mod is ineg for iadd instruction (?) */ NIR_PASS_V(s, nir_lower_to_source_mods, ~nir_lower_int_source_mods); /* need copy prop after uses_to_dest, and before src mods: see * dEQP-GLES2.functional.shaders.random.all_features.fragment.95 @@ -646,22 +788,11 @@ if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) nir_print_shader(s, stdout); - uint64_t consts[ETNA_MAX_IMM] = {}; - unsigned block_ptr[nir_shader_get_entrypoint(s)->num_blocks]; c->block_ptr = block_ptr; - struct emit_options options = { - .max_temps = ETNA_MAX_TEMPS, - .max_consts = ETNA_MAX_IMM / 4, - .id_reg = sf->num_reg, - .single_const_src = c->specs->halti < 5, - .etna_new_transcendentals = c->specs->has_new_transcendentals, - .user = c, - .consts = consts, - }; unsigned num_consts; - ASSERTED bool ok = emit_shader(c->nir, &options, &v->num_temps, &num_consts); + ASSERTED bool ok = emit_shader(c, &v->num_temps, &num_consts); assert(ok); /* empty shader, emit NOP */ @@ -669,7 +800,7 @@ emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_NOP }); /* assemble instructions, fixing up labels */ - uint32_t *code = MALLOC(c->inst_ptr * 16 + 1024); + uint32_t *code = MALLOC(c->inst_ptr * 16); for (unsigned i = 0; i < c->inst_ptr; i++) { struct etna_inst *inst = &c->code[i]; if (inst->opcode == INST_OPCODE_BRANCH) @@ -683,27 +814,11 @@ v->code = code; v->needs_icache = c->inst_ptr > specs->max_instructions; - copy_uniform_state_to_shader(v, consts, num_consts); + copy_uniform_state_to_shader(v, c->consts, num_consts); if (s->info.stage == MESA_SHADER_FRAGMENT) { v->input_count_unk8 = 31; /* XXX what is this */ - - nir_foreach_variable(var, &s->outputs) { - unsigned reg = c->output_reg[var->data.driver_location]; - switch (var->data.location) { - case FRAG_RESULT_COLOR: - case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */ - v->ps_color_out_reg = reg; - break; - case FRAG_RESULT_DEPTH: - v->ps_depth_out_reg = reg; - break; - default: - compile_error(c, "Unsupported fs output %s\n", gl_frag_result_name(var->data.location)); - } - } assert(v->ps_depth_out_reg <= 0); - v->outfile.num_reg = 0; ralloc_free(c->nir); FREE(c); return true; @@ -711,27 +826,6 @@ v->input_count_unk8 = DIV_ROUND_UP(v->infile.num_reg + 4, 16); /* XXX what is this */ - sf = &v->outfile; - sf->num_reg = 0; - nir_foreach_variable(var, &s->outputs) { - unsigned native = c->output_reg[var->data.driver_location]; - - if (var->data.location == VARYING_SLOT_POS) { - v->vs_pos_out_reg = native; - continue; - } - - if (var->data.location == VARYING_SLOT_PSIZ) { - v->vs_pointsize_out_reg = native; - continue; - } - - sf->reg[sf->num_reg].reg = native; - sf->reg[sf->num_reg].slot = var->data.location; - sf->reg[sf->num_reg].num_components = 4; /* TODO */ - sf->num_reg++; - } - /* fill in "mystery meat" load balancing value. This value determines how * work is scheduled between VS and PS * in the unified shader architecture. More precisely, it is determined from diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir_emit.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir_emit.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_nir_emit.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,23 +32,12 @@ #include "compiler/nir/nir_worklist.h" #include "util/register_allocate.h" -struct emit_options { - unsigned max_temps; /* max # of vec4 registers */ - unsigned max_consts; /* max # of vec4 consts */ - unsigned id_reg; /* register with vertex/instance id */ - bool single_const_src : 1; /* limited to 1 vec4 const src */ - bool etna_new_transcendentals : 1; - void *user; - uint64_t *consts; -}; - #define ALU_SWIZ(s) INST_SWIZ((s)->swizzle[0], (s)->swizzle[1], (s)->swizzle[2], (s)->swizzle[3]) #define SRC_DISABLE ((hw_src){}) #define SRC_CONST(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_UNIFORM_0, .reg=idx, .swiz=s}) #define SRC_REG(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_TEMP, .reg=idx, .swiz=s}) -#define option(name) (state->options->name) -#define emit(type, args...) etna_emit_##type(state->options->user, args) +#define emit(type, args...) etna_emit_##type(state->c, args) typedef struct etna_inst_dst hw_dst; typedef struct etna_inst_src hw_src; @@ -59,7 +48,8 @@ }; struct state { - const struct emit_options *options; + struct etna_compile *c; + unsigned const_count; nir_shader *shader; @@ -72,10 +62,18 @@ unsigned num_nodes; }; +#define compile_error(ctx, args...) ({ \ + printf(args); \ + ctx->error = true; \ + assert(0); \ +}) + static inline hw_src src_swizzle(hw_src src, unsigned swizzle) { - src.swiz = inst_swiz_compose(src.swiz, swizzle); + if (src.rgroup != INST_RGROUP_IMMEDIATE) + src.swiz = inst_swiz_compose(src.swiz, swizzle); + return src; } @@ -96,7 +94,6 @@ #define CONST_VAL(a, b) (nir_const_value) {.u64 = (uint64_t)(a) << 32 | (uint64_t)(b)} #define CONST(x) CONST_VAL(ETNA_IMMEDIATE_CONSTANT, x) #define UNIFORM(x) CONST_VAL(ETNA_IMMEDIATE_UNIFORM, x) -#define UNIFORM_BASE(x) CONST_VAL(ETNA_IMMEDIATE_UBO0_ADDR, x) #define TEXSCALE(x, i) CONST_VAL(ETNA_IMMEDIATE_TEXRECT_SCALE_X + (i), x) static int @@ -114,10 +111,28 @@ static hw_src const_src(struct state *state, nir_const_value *value, unsigned num_components) { + /* use inline immediates if possible */ + if (state->c->specs->halti >= 2 && num_components == 1 && + value[0].u64 >> 32 == ETNA_IMMEDIATE_CONSTANT) { + uint32_t bits = value[0].u32; + + /* "float" - shifted by 12 */ + if ((bits & 0xfff) == 0) + return etna_immediate_src(0, bits >> 12); + + /* "unsigned" - raw 20 bit value */ + if (bits < (1 << 20)) + return etna_immediate_src(2, bits); + + /* "signed" - sign extended 20-bit (sign included) value */ + if (bits >= 0xfff80000) + return etna_immediate_src(1, bits); + } + unsigned i; int swiz = -1; for (i = 0; swiz < 0; i++) { - uint64_t *a = &option(consts)[i*4]; + uint64_t *a = &state->c->consts[i*4]; uint64_t save[4]; memcpy(save, a, sizeof(save)); swiz = 0; @@ -132,7 +147,7 @@ } } - assert(i <= option(max_consts)); + assert(i <= ETNA_MAX_IMM / 4); state->const_count = MAX2(state->const_count, i); return SRC_CONST(i - 1, swiz); @@ -157,6 +172,9 @@ REG_CLASS_VEC4, /* special vec2 class for fast transcendentals, limited to XY or ZW */ REG_CLASS_VIRT_VEC2T, + /* special classes for LOAD - contiguous components */ + REG_CLASS_VIRT_VEC2C, + REG_CLASS_VIRT_VEC3C, NUM_REG_CLASSES, } reg_class; @@ -178,6 +196,11 @@ REG_TYPE_VIRT_SCALAR_W, REG_TYPE_VIRT_VEC2T_XY, REG_TYPE_VIRT_VEC2T_ZW, + REG_TYPE_VIRT_VEC2C_XY, + REG_TYPE_VIRT_VEC2C_YZ, + REG_TYPE_VIRT_VEC2C_ZW, + REG_TYPE_VIRT_VEC3C_XYZ, + REG_TYPE_VIRT_VEC3C_YZW, NUM_REG_TYPES, } reg_type; @@ -189,18 +212,23 @@ [REG_TYPE_VIRT_SCALAR_Y] = 0x2, [REG_TYPE_VIRT_VEC2_XY] = 0x3, [REG_TYPE_VIRT_VEC2T_XY] = 0x3, + [REG_TYPE_VIRT_VEC2C_XY] = 0x3, [REG_TYPE_VIRT_SCALAR_Z] = 0x4, [REG_TYPE_VIRT_VEC2_XZ] = 0x5, [REG_TYPE_VIRT_VEC2_YZ] = 0x6, + [REG_TYPE_VIRT_VEC2C_YZ] = 0x6, [REG_TYPE_VIRT_VEC3_XYZ] = 0x7, + [REG_TYPE_VIRT_VEC3C_XYZ] = 0x7, [REG_TYPE_VIRT_SCALAR_W] = 0x8, [REG_TYPE_VIRT_VEC2_XW] = 0x9, [REG_TYPE_VIRT_VEC2_YW] = 0xa, [REG_TYPE_VIRT_VEC3_XYW] = 0xb, [REG_TYPE_VIRT_VEC2_ZW] = 0xc, [REG_TYPE_VIRT_VEC2T_ZW] = 0xc, + [REG_TYPE_VIRT_VEC2C_ZW] = 0xc, [REG_TYPE_VIRT_VEC3_XZW] = 0xd, [REG_TYPE_VIRT_VEC3_YZW] = 0xe, + [REG_TYPE_VIRT_VEC3C_YZW] = 0xe, }; /* how to swizzle when used as a src */ @@ -211,18 +239,23 @@ [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(Y, Y, Y, Y), [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY, + [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(Z, Z, Z, Z), [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, Z, X, Z), [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(Y, Z, Y, Z), + [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(Y, Z, Y, Z), [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY, + [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(W, W, W, W), [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, W, X, W), [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(Y, W, Y, W), [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, W, X), [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(Z, W, Z, W), [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(Z, W, Z, W), + [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(Z, W, Z, W), [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Z, W, X), [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(Y, Z, W, X), + [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(Y, Z, W, X), }; /* how to swizzle when used as a dest */ @@ -233,18 +266,23 @@ [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(X, X, X, X), [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY, + [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(X, X, X, X), [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, X, Y, Y), [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(X, X, Y, Y), + [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(X, X, Y, Y), [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY, + [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY, [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(X, X, X, X), [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, X, Y, Y), [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(X, X, Y, Y), [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, Z, Z), [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(X, X, X, Y), [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(X, X, X, Y), + [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(X, X, X, Y), [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Y, Y, Z), [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(X, X, Y, Z), + [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(X, X, Y, Z), }; static inline int reg_get_type(int virt_reg) @@ -256,10 +294,15 @@ { /* offset by 1 to avoid reserved position register */ if (state->shader->info.stage == MESA_SHADER_FRAGMENT) - return virt_reg / NUM_REG_TYPES + 1; + return (virt_reg / NUM_REG_TYPES + 1) % ETNA_MAX_TEMPS; return virt_reg / NUM_REG_TYPES; } +/* use "r63.z" for depth reg, it will wrap around to r0.z by reg_get_base + * (fs registers are offset by 1 to avoid reserving r0) + */ +#define REG_FRAG_DEPTH ((ETNA_MAX_TEMPS - 1) * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Z) + static inline int reg_get_class(int virt_reg) { switch (reg_get_type(virt_reg)) { @@ -285,6 +328,13 @@ case REG_TYPE_VIRT_VEC2T_XY: case REG_TYPE_VIRT_VEC2T_ZW: return REG_CLASS_VIRT_VEC2T; + case REG_TYPE_VIRT_VEC2C_XY: + case REG_TYPE_VIRT_VEC2C_YZ: + case REG_TYPE_VIRT_VEC2C_ZW: + return REG_CLASS_VIRT_VEC2C; + case REG_TYPE_VIRT_VEC3C_XYZ: + case REG_TYPE_VIRT_VEC3C_YZW: + return REG_CLASS_VIRT_VEC3C; } assert(false); @@ -337,13 +387,15 @@ case nir_intrinsic_load_input: case nir_intrinsic_load_instance_id: case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: return ra_src(state, src); case nir_intrinsic_load_front_face: return (hw_src) { .use = 1, .rgroup = INST_RGROUP_INTERNAL }; case nir_intrinsic_load_frag_coord: return SRC_REG(0, INST_SWIZ_IDENTITY); default: - assert(0); + compile_error(state->c, "Unhandled NIR intrinsic type: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); break; } } break; @@ -356,7 +408,7 @@ return src_swizzle(const_src(state, &value, 1), SWIZZLE(X,X,X,X)); } default: - assert(0); + compile_error(state->c, "Unhandled NIR instruction type: %d\n", instr->type); break; } @@ -529,15 +581,18 @@ dest = &nir_instr_as_alu(instr)->dest.dest; break; case nir_instr_type_tex: - dest =&nir_instr_as_tex(instr)->dest; + dest = &nir_instr_as_tex(instr)->dest; break; case nir_instr_type_intrinsic: { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_load_uniform || + intr->intrinsic == nir_intrinsic_load_ubo || intr->intrinsic == nir_intrinsic_load_input || intr->intrinsic == nir_intrinsic_load_instance_id) dest = &intr->dest; - } + } break; + case nir_instr_type_deref: + return NULL; default: break; } @@ -598,7 +653,7 @@ if (src->is_ssa) { nir_instr *instr = src->ssa->parent_instr; - if (is_sysval(instr)) + if (is_sysval(instr) || instr->type == nir_instr_type_deref) return true; switch (instr->type) { @@ -733,7 +788,7 @@ /* output live till the end */ if (instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic == nir_intrinsic_store_output) + if (intr->intrinsic == nir_intrinsic_store_deref) state.index = ~0u; } @@ -760,13 +815,12 @@ /* apply live_in/live_out to ranges */ nir_foreach_block(block, impl) { - BITSET_WORD tmp; int i; - BITSET_FOREACH_SET(i, tmp, block->live_in, state.num_defs) + BITSET_FOREACH_SET(i, block->live_in, state.num_defs) range_include(&state.defs[i], block_live_index[block->index]); - BITSET_FOREACH_SET(i, tmp, block->live_out, state.num_defs) + BITSET_FOREACH_SET(i, block->live_out, state.num_defs) range_include(&state.defs[i], block_live_index[block->index + 1]); } @@ -775,17 +829,19 @@ /* precomputed by register_allocate */ static unsigned int *q_values[] = { - (unsigned int[]) { 1, 2, 3, 4, 2 }, - (unsigned int[]) { 3, 5, 6, 6, 5 }, - (unsigned int[]) { 3, 4, 4, 4, 4 }, - (unsigned int[]) { 1, 1, 1, 1, 1 }, - (unsigned int[]) { 1, 2, 2, 2, 1 }, + (unsigned int[]) {1, 2, 3, 4, 2, 2, 3, }, + (unsigned int[]) {3, 5, 6, 6, 5, 5, 6, }, + (unsigned int[]) {3, 4, 4, 4, 4, 4, 4, }, + (unsigned int[]) {1, 1, 1, 1, 1, 1, 1, }, + (unsigned int[]) {1, 2, 2, 2, 1, 2, 2, }, + (unsigned int[]) {2, 3, 3, 3, 2, 3, 3, }, + (unsigned int[]) {2, 2, 2, 2, 2, 2, 2, }, }; static void ra_assign(struct state *state, nir_shader *shader) { - struct ra_regs *regs = ra_alloc_reg_set(NULL, option(max_temps) * + struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS * NUM_REG_TYPES, false); /* classes always be created from index 0, so equal to the class enum @@ -794,10 +850,10 @@ for (int c = 0; c < NUM_REG_CLASSES; c++) ra_alloc_reg_class(regs); /* add each register of each class */ - for (int r = 0; r < NUM_REG_TYPES * option(max_temps); r++) + for (int r = 0; r < NUM_REG_TYPES * ETNA_MAX_TEMPS; r++) ra_class_add_reg(regs, reg_get_class(r), r); /* set conflicts */ - for (int r = 0; r < option(max_temps); r++) { + for (int r = 0; r < ETNA_MAX_TEMPS; r++) { for (int i = 0; i < NUM_REG_TYPES; i++) { for (int j = 0; j < i; j++) { if (reg_writemask[i] & reg_writemask[j]) { @@ -835,21 +891,35 @@ for (unsigned i = 0; i < num_nodes; i++) { nir_instr *instr = defs[i].instr; nir_dest *dest = defs[i].dest; + unsigned c = nir_dest_num_components(*dest) - 1; - ra_set_node_class(g, i, nir_dest_num_components(*dest) - 1); - - if (instr->type == nir_instr_type_alu && option(etna_new_transcendentals)) { + if (instr->type == nir_instr_type_alu && + state->c->specs->has_new_transcendentals) { switch (nir_instr_as_alu(instr)->op) { case nir_op_fdiv: case nir_op_flog2: case nir_op_fsin: case nir_op_fcos: assert(dest->is_ssa); - ra_set_node_class(g, i, REG_CLASS_VIRT_VEC2T); + c = REG_CLASS_VIRT_VEC2T; default: break; } } + + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + /* can't have dst swizzle or sparse writemask on UBO loads */ + if (intr->intrinsic == nir_intrinsic_load_ubo) { + assert(dest == &intr->dest); + if (dest->ssa.num_components == 2) + c = REG_CLASS_VIRT_VEC2C; + if (dest->ssa.num_components == 3) + c = REG_CLASS_VIRT_VEC3C; + } + } + + ra_set_node_class(g, i, c); } nir_foreach_block(block, impl) { @@ -862,11 +932,20 @@ unsigned reg; switch (intr->intrinsic) { - case nir_intrinsic_store_output: { - /* don't want output to be swizzled + case nir_intrinsic_store_deref: { + /* don't want outputs to be swizzled * TODO: better would be to set the type to X/XY/XYZ/XYZW + * TODO: what if fragcoord.z is read after writing fragdepth? */ - ra_set_node_class(g, live_map[src_index(impl, &intr->src[0])], REG_CLASS_VEC4); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + unsigned index = live_map[src_index(impl, &intr->src[1])]; + + if (shader->info.stage == MESA_SHADER_FRAGMENT && + deref->var->data.location == FRAG_RESULT_DEPTH) { + ra_set_node_reg(g, index, REG_FRAG_DEPTH); + } else { + ra_set_node_class(g, index, REG_CLASS_VEC4); + } } continue; case nir_intrinsic_load_input: reg = nir_intrinsic_base(intr) * NUM_REG_TYPES + (unsigned[]) { @@ -877,7 +956,7 @@ }[nir_dest_num_components(*dest) - 1]; break; case nir_intrinsic_load_instance_id: - reg = option(id_reg) * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Y; + reg = state->c->variant->infile.num_reg * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Y; break; default: continue; @@ -982,8 +1061,7 @@ { unsigned dst_swiz; hw_dst dst = ra_dest(state, &tex->dest, &dst_swiz); - nir_src *coord = NULL; - nir_src *lod_bias = NULL; + nir_src *coord = NULL, *lod_bias = NULL, *compare = NULL; for (unsigned i = 0; i < tex->num_srcs; i++) { switch (tex->src[i].src_type) { @@ -995,34 +1073,67 @@ assert(!lod_bias); lod_bias = &tex->src[i].src; break; + case nir_tex_src_comparator: + compare = &tex->src[i].src; + break; default: - assert(0); + compile_error(state->c, "Unhandled NIR tex src type: %d\n", + tex->src[i].src_type); break; } } emit(tex, tex->op, tex->sampler_index, dst_swiz, dst, get_src(state, coord), - lod_bias ? get_src(state, lod_bias) : SRC_DISABLE); + lod_bias ? get_src(state, lod_bias) : SRC_DISABLE, + compare ? get_src(state, compare) : SRC_DISABLE); } static void emit_intrinsic(struct state *state, nir_intrinsic_instr * intr) { switch (intr->intrinsic) { - case nir_intrinsic_store_output: - emit(output, nir_intrinsic_base(intr), get_src(state, &intr->src[0])); + case nir_intrinsic_store_deref: + emit(output, nir_src_as_deref(intr->src[0])->var, get_src(state, &intr->src[1])); break; case nir_intrinsic_discard_if: emit(discard, get_src(state, &intr->src[0])); - break; + break; case nir_intrinsic_discard: emit(discard, SRC_DISABLE); break; case nir_intrinsic_load_uniform: { unsigned dst_swiz; - hw_dst dst = ra_dest(state, &intr->dest, &dst_swiz); - /* TODO: might have a problem with dst_swiz .. */ - emit(load_ubo, dst, get_src(state, &intr->src[0]), const_src(state, &UNIFORM_BASE(nir_intrinsic_base(intr) * 16), 1)); + struct etna_inst_dst dst = ra_dest(state, &intr->dest, &dst_swiz); + + /* TODO: rework so extra MOV isn't required, load up to 4 addresses at once */ + emit_inst(state->c, &(struct etna_inst) { + .opcode = INST_OPCODE_MOVAR, + .dst.write_mask = 0x1, + .src[2] = get_src(state, &intr->src[0]), + }); + emit_inst(state->c, &(struct etna_inst) { + .opcode = INST_OPCODE_MOV, + .dst = dst, + .src[2] = { + .use = 1, + .rgroup = INST_RGROUP_UNIFORM_0, + .reg = nir_intrinsic_base(intr), + .swiz = dst_swiz, + .amode = INST_AMODE_ADD_A_X, + }, + }); + } break; + case nir_intrinsic_load_ubo: { + /* TODO: if offset is of the form (x + C) then add C to the base instead */ + unsigned idx = nir_src_as_const_value(intr->src[0])[0].u32; + unsigned dst_swiz; + emit_inst(state->c, &(struct etna_inst) { + .opcode = INST_OPCODE_LOAD, + .type = INST_TYPE_U32, + .dst = ra_dest(state, &intr->dest, &dst_swiz), + .src[0] = get_src(state, &intr->src[1]), + .src[1] = const_src(state, &CONST_VAL(ETNA_IMMEDIATE_UBO0_ADDR + idx, 0), 1), + }); } break; case nir_intrinsic_load_front_face: case nir_intrinsic_load_frag_coord: @@ -1032,7 +1143,8 @@ case nir_intrinsic_load_instance_id: break; default: - assert(0); + compile_error(state->c, "Unhandled NIR intrinsic type: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); } } @@ -1053,9 +1165,10 @@ assert(nir_instr_is_last(instr)); case nir_instr_type_load_const: case nir_instr_type_ssa_undef: + case nir_instr_type_deref: break; default: - assert(0); + compile_error(state->c, "Unhandled NIR instruction type: %d\n", instr->type); break; } } @@ -1108,7 +1221,7 @@ emit_cf_list(state, &nir_cf_node_as_loop(node)->body); break; default: - assert(0); + compile_error(state->c, "Unknown NIR node type\n"); break; } } @@ -1180,41 +1293,13 @@ switch (alu->op) { case nir_op_vec2: case nir_op_vec3: - case nir_op_vec4: { - nir_const_value value[4]; - unsigned num_components = 0; - - for (unsigned i = 0; i < info->num_inputs; i++) { - nir_const_value *cv = nir_src_as_const_value(alu->src[i].src); - if (cv) - value[num_components++] = cv[alu->src[i].swizzle[0]]; - } - - if (num_components <= 1) /* nothing to do */ - break; - - nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value); - - if (num_components == info->num_inputs) { - nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(def)); - nir_instr_remove(&alu->instr); - return; - } - - for (unsigned i = 0, j = 0; i < info->num_inputs; i++) { - nir_const_value *cv = nir_src_as_const_value(alu->src[i].src); - if (!cv) - continue; - - nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def)); - alu->src[i].swizzle[0] = j++; - } - } break; - default: { - if (!option(single_const_src)) + case nir_op_vec4: + break; + default: + /* pre-GC7000L can only have 1 uniform src per instruction */ + if (state->c->specs->halti >= 5) return; - /* pre-GC7000L can only have 1 uniform src per instruction */ nir_const_value value[4] = {}; uint8_t swizzle[4][4] = {}; unsigned swiz_max = 0, num_const = 0; @@ -1268,7 +1353,39 @@ nir_ssa_def *mov = nir_mov(&b, alu->src[i].src.ssa); nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(mov)); } - } return; + return; + } + + nir_const_value value[4]; + unsigned num_components = 0; + + for (unsigned i = 0; i < info->num_inputs; i++) { + nir_const_value *cv = nir_src_as_const_value(alu->src[i].src); + if (cv) + value[num_components++] = cv[alu->src[i].swizzle[0]]; + } + + /* if there is more than one constant source to the vecN, combine them + * into a single load_const (removing the vecN completely if all components + * are constant) + */ + if (num_components > 1) { + nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value); + + if (num_components == info->num_inputs) { + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(def)); + nir_instr_remove(&alu->instr); + return; + } + + for (unsigned i = 0, j = 0; i < info->num_inputs; i++) { + nir_const_value *cv = nir_src_as_const_value(alu->src[i].src); + if (!cv) + continue; + + nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def)); + alu->src[i].swizzle[0] = j++; + } } unsigned finished_write_mask = 0; @@ -1305,14 +1422,17 @@ } static bool -emit_shader(nir_shader *shader, const struct emit_options *options, - unsigned *num_temps, unsigned *num_consts) +emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts) { + nir_shader *shader = c->nir; + struct state state = { - .options = options, + .c = c, .shader = shader, .impl = nir_shader_get_entrypoint(shader), }; + bool have_indirect_uniform = false; + unsigned indirect_max = 0; nir_builder b; nir_builder_init(&b, state.impl); @@ -1332,13 +1452,25 @@ } break; case nir_instr_type_intrinsic: { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + /* TODO: load_ubo can also become a constant in some cases + * (at the moment it can end up emitting a LOAD with two + * uniform sources, which could be a problem on HALTI2) + */ if (intr->intrinsic != nir_intrinsic_load_uniform) break; nir_const_value *off = nir_src_as_const_value(intr->src[0]); - if (!off || off[0].u64 >> 32 != ETNA_IMMEDIATE_CONSTANT) + if (!off || off[0].u64 >> 32 != ETNA_IMMEDIATE_CONSTANT) { + have_indirect_uniform = true; + indirect_max = nir_intrinsic_base(intr) + nir_intrinsic_range(intr); break; + } - unsigned base = nir_intrinsic_base(intr) + off[0].u32 / 16; + unsigned base = nir_intrinsic_base(intr); + /* pre halti2 uniform offset will be float */ + if (c->specs->halti < 2) + base += (unsigned) off[0].f32; + else + base += off[0].u32; nir_const_value value[4]; for (unsigned i = 0; i < intr->dest.ssa.num_components; i++) { @@ -1360,6 +1492,13 @@ } } + /* TODO: only emit required indirect uniform ranges */ + if (have_indirect_uniform) { + for (unsigned i = 0; i < indirect_max * 4; i++) + c->consts[i] = UNIFORM(i).u64; + state.const_count = indirect_max; + } + /* add mov for any store output using sysval/const */ nir_foreach_block(block, state.impl) { nir_foreach_instr_safe(instr, block) { @@ -1369,8 +1508,8 @@ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { - case nir_intrinsic_store_output: { - nir_src *src = &intr->src[0]; + case nir_intrinsic_store_deref: { + nir_src *src = &intr->src[1]; if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) { b.cursor = nir_before_instr(instr); nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa))); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_tgsi.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_tgsi.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_compiler_tgsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_compiler_tgsi.c 2020-06-12 01:21:16.000000000 +0000 @@ -222,6 +222,16 @@ }; } +static struct etna_native_reg +etna_native_internal(unsigned reg) +{ + return (struct etna_native_reg) { + .valid = 1, + .rgroup = INST_RGROUP_INTERNAL, + .id = reg + }; +} + /** Register allocation **/ enum reg_sort_order { FIRST_USE_ASC, @@ -622,12 +632,19 @@ /* never assign t0 as it is the position output, start assigning at t1 */ c->next_free_native = 1; - /* hardwire TGSI_SEMANTIC_POSITION (input and output) to t0 */ for (int idx = 0; idx < c->total_decls; ++idx) { struct etna_reg_desc *reg = &c->decl[idx]; - if (reg->active && reg->semantic.Name == TGSI_SEMANTIC_POSITION) + if (!reg->active) + continue; + + /* hardwire TGSI_SEMANTIC_POSITION (input and output) to t0 */ + if (reg->semantic.Name == TGSI_SEMANTIC_POSITION) reg->native = etna_native_temp(0); + + /* hardwire TGSI_SEMANTIC_FACE to i0 */ + if (reg->semantic.Name == TGSI_SEMANTIC_FACE) + reg->native = etna_native_internal(0); } } } @@ -824,7 +841,34 @@ unsigned uni_reg = -1; for (int src = 0; src < ETNA_NUM_SRC; ++src) { - if (etna_rgroup_is_uniform(inst->src[src].rgroup)) { + if (inst->src[src].rgroup == INST_RGROUP_INTERNAL && + c->info.processor == PIPE_SHADER_FRAGMENT && + c->key->front_ccw) { + struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c); + + /* + * Set temporary register to 0.0 or 1.0 based on the gl_FrontFacing + * configuration (CW or CCW). + */ + etna_assemble(&c->code[c->inst_ptr * 4], &(struct etna_inst) { + .opcode = INST_OPCODE_SET, + .cond = INST_CONDITION_NE, + .dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y | + INST_COMPS_Z | INST_COMPS_W), + .src[0] = inst->src[src], + .src[1] = alloc_imm_f32(c, 1.0f) + }); + c->inst_ptr++; + + /* Modify instruction to use temp register instead of uniform */ + inst->src[src].use = 1; + inst->src[src].rgroup = INST_RGROUP_TEMP; + inst->src[src].reg = inner_temp.id; + inst->src[src].swiz = INST_SWIZ_IDENTITY; /* swizzling happens on MOV */ + inst->src[src].neg = 0; /* negation happens on MOV */ + inst->src[src].abs = 0; /* abs happens on MOV */ + inst->src[src].amode = 0; /* amode effects happen on MOV */ + } else if (etna_rgroup_is_uniform(inst->src[src].rgroup)) { if (uni_reg == -1) { /* first unique uniform used */ uni_rgroup = inst->src[src].rgroup; uni_reg = inst->src[src].reg; @@ -2037,8 +2081,9 @@ permute_ps_inputs(struct etna_compile *c) { /* Special inputs: - * gl_FragCoord VARYING_SLOT_POS TGSI_SEMANTIC_POSITION - * gl_PointCoord VARYING_SLOT_PNTC TGSI_SEMANTIC_PCOORD + * gl_FragCoord VARYING_SLOT_POS TGSI_SEMANTIC_POSITION + * gl_FrontFacing VARYING_SLOT_FACE TGSI_SEMANTIC_FACE + * gl_PointCoord VARYING_SLOT_PNTC TGSI_SEMANTIC_PCOORD */ uint native_idx = 1; @@ -2047,7 +2092,9 @@ uint input_id; assert(reg->has_semantic); - if (!reg->active || reg->semantic.Name == TGSI_SEMANTIC_POSITION) + if (!reg->active || + reg->semantic.Name == TGSI_SEMANTIC_POSITION || + reg->semantic.Name == TGSI_SEMANTIC_FACE) continue; input_id = native_idx++; @@ -2347,10 +2394,6 @@ /* optimize outputs */ etna_compile_pass_optimize_outputs(c); - /* XXX assign special inputs: gl_FrontFacing (VARYING_SLOT_FACE) - * this is part of RGROUP_INTERNAL - */ - /* assign inputs: last usage of input should be <= first usage of temp */ /* potential optimization case: * if single MOV TEMP[y], IN[x] before which temp y is not used, and diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_context.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_context.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_context.c 2020-06-12 01:21:16.000000000 +0000 @@ -48,6 +48,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" +#include "util/hash_table.h" #include "util/u_blitter.h" #include "util/u_helpers.h" #include "util/u_memory.h" @@ -61,6 +62,39 @@ { struct etna_context *ctx = etna_context(pctx); + mtx_lock(&ctx->lock); + if (ctx->used_resources_read) { + + /* + * There should be no resources tracked in the context when it's being + * destroyed. Be sure there are none to avoid memory leaks on buggy + * programs. + */ + set_foreach(ctx->used_resources_read, entry) { + struct etna_resource *rsc = (struct etna_resource *)entry->key; + + _mesa_set_remove_key(rsc->pending_ctx, ctx); + } + _mesa_set_destroy(ctx->used_resources_read, NULL); + + } + if (ctx->used_resources_write) { + + /* + * There should be no resources tracked in the context when it's being + * destroyed. Be sure there are none to avoid memory leaks on buggy + * programs. + */ + set_foreach(ctx->used_resources_write, entry) { + struct etna_resource *rsc = (struct etna_resource *)entry->key; + + _mesa_set_remove_key(rsc->pending_ctx, ctx); + } + _mesa_set_destroy(ctx->used_resources_write, NULL); + + } + mtx_unlock(&ctx->lock); + if (ctx->dummy_rt) etna_bo_del(ctx->dummy_rt); @@ -83,6 +117,8 @@ if (ctx->in_fence_fd != -1) close(ctx->in_fence_fd); + mtx_destroy(&ctx->lock); + FREE(pctx); } @@ -190,7 +226,7 @@ if (info->index_size) { indexbuf = info->has_user_indices ? NULL : info->index.resource; if (info->has_user_indices && - !util_upload_index_buffer(pctx, info, &indexbuf, &index_offset)) { + !util_upload_index_buffer(pctx, info, &indexbuf, &index_offset, 4)) { BUG("Index buffer upload failed."); return; } @@ -214,9 +250,12 @@ } ctx->dirty |= ETNA_DIRTY_INDEX_BUFFER; - struct etna_shader_key key = {}; + struct etna_shader_key key = { + .front_ccw = ctx->rasterizer->front_ccw, + }; + if (pfb->cbufs[0]) - key.frag_rb_swap = !!translate_rs_format_rb_swap(pfb->cbufs[0]->format); + key.frag_rb_swap = !!translate_pe_format_rb_swap(pfb->cbufs[0]->format); if (!etna_get_vs(ctx, key) || !etna_get_fs(ctx, key)) { BUG("compiled shaders are not okay"); @@ -227,6 +266,8 @@ if (!etna_state_update(ctx)) return; + mtx_lock(&ctx->lock); + /* * Figure out the buffers/features we need: */ @@ -247,11 +288,13 @@ } /* Mark constant buffers as being read */ - resource_read(ctx, ctx->constant_buffer[PIPE_SHADER_VERTEX].buffer); - resource_read(ctx, ctx->constant_buffer[PIPE_SHADER_FRAGMENT].buffer); + for (unsigned i = 0; i < ETNA_MAX_CONST_BUF; i++) { + resource_read(ctx, ctx->constant_buffer[PIPE_SHADER_VERTEX][i].buffer); + resource_read(ctx, ctx->constant_buffer[PIPE_SHADER_FRAGMENT][i].buffer); + } /* Mark VBOs as being read */ - for (i = 0; i < ctx->vertex_buffer.count; i++) { + foreach_bit(i, ctx->vertex_buffer.enabled_mask) { assert(!ctx->vertex_buffer.vb[i].is_user_buffer); resource_read(ctx, ctx->vertex_buffer.vb[i].buffer.resource); } @@ -260,9 +303,17 @@ resource_read(ctx, indexbuf); /* Mark textures as being read */ - for (i = 0; i < PIPE_MAX_SAMPLERS; i++) - if (ctx->sampler_view[i]) - resource_read(ctx, ctx->sampler_view[i]->texture); + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + if (ctx->sampler_view[i]) { + resource_read(ctx, ctx->sampler_view[i]->texture); + + /* if texture was modified since the last update, + * we need to clear the texture cache and possibly + * resolve/update ts + */ + etna_update_sampler_source(ctx->sampler_view[i], i); + } + } list_for_each_entry(struct etna_hw_query, hq, &ctx->active_hw_queries, node) resource_written(ctx, hq->prsc); @@ -278,7 +329,7 @@ if (ctx->specs.halti >= 2) { /* On HALTI2+ (GC3000 and higher) only use instanced drawing commands, as the blob does */ - etna_draw_instanced(ctx->stream, info->index_size, draw_mode, 1, + etna_draw_instanced(ctx->stream, info->index_size, draw_mode, info->instance_count, info->count, info->index_size ? info->index_bias : info->start); } else { if (info->index_size) @@ -293,6 +344,7 @@ * draw op has caused the hang. */ etna_stall(ctx->stream, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE); } + mtx_unlock(&ctx->lock); if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL)) pctx->flush(pctx, NULL, 0); @@ -306,36 +358,9 @@ } static void -etna_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, - enum pipe_flush_flags flags) -{ - struct etna_context *ctx = etna_context(pctx); - struct etna_screen *screen = ctx->screen; - int out_fence_fd = -1; - - mtx_lock(&screen->lock); - - list_for_each_entry(struct etna_hw_query, hq, &ctx->active_hw_queries, node) - etna_hw_query_suspend(hq, ctx); - - etna_cmd_stream_flush2(ctx->stream, ctx->in_fence_fd, - (flags & PIPE_FLUSH_FENCE_FD) ? &out_fence_fd : - NULL); - - list_for_each_entry(struct etna_hw_query, hq, &ctx->active_hw_queries, node) - etna_hw_query_resume(hq, ctx); - - if (fence) - *fence = etna_fence_create(pctx, out_fence_fd); - - mtx_unlock(&screen->lock); -} - -static void -etna_cmd_stream_reset_notify(struct etna_cmd_stream *stream, void *priv) +etna_reset_gpu_state(struct etna_context *ctx) { - struct etna_context *ctx = priv; - struct etna_screen *screen = ctx->screen; + struct etna_cmd_stream *stream = ctx->stream; etna_set_state(stream, VIVS_GL_API_MODE, VIVS_GL_API_MODE_OPENGL); etna_set_state(stream, VIVS_GL_VERTEX_ELEMENT_CONFIG, 0x00000001); @@ -348,7 +373,6 @@ etna_set_state(stream, VIVS_PA_VIEWPORT_UNK00A84, fui(8192.0)); etna_set_state(stream, VIVS_PA_ZFARCLIPPING, 0x00000000); etna_set_state(stream, VIVS_RA_HDEPTH_CONTROL, 0x00007000); - etna_set_state(stream, VIVS_PE_STENCIL_CONFIG_EXT2, 0x00000000); etna_set_state(stream, VIVS_PS_CONTROL_EXT, 0x00000000); /* There is no HALTI0 specific state */ @@ -384,21 +408,92 @@ etna_set_state(stream, VIVS_RS_SINGLE_BUFFER, COND(ctx->specs.single_buffer, VIVS_RS_SINGLE_BUFFER_ENABLE)); } + if (ctx->specs.halti >= 5) { + /* TXDESC cache flush - do this once at the beginning, as texture + * descriptors are only written by the CPU once, then patched by the kernel + * before command stream submission. It does not need flushing if the + * referenced image data changes. + */ + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_FLUSH, 0); + etna_set_state(stream, VIVS_GL_FLUSH_CACHE, + VIVS_GL_FLUSH_CACHE_DESCRIPTOR_UNK12 | + VIVS_GL_FLUSH_CACHE_DESCRIPTOR_UNK13); + + /* Icache invalidate (should do this on shader change?) */ + etna_set_state(stream, VIVS_VS_ICACHE_INVALIDATE, + VIVS_VS_ICACHE_INVALIDATE_UNK0 | VIVS_VS_ICACHE_INVALIDATE_UNK1 | + VIVS_VS_ICACHE_INVALIDATE_UNK2 | VIVS_VS_ICACHE_INVALIDATE_UNK3 | + VIVS_VS_ICACHE_INVALIDATE_UNK4); + } + ctx->dirty = ~0L; ctx->dirty_sampler_views = ~0L; +} + +static void +etna_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, + enum pipe_flush_flags flags) +{ + struct etna_context *ctx = etna_context(pctx); + int out_fence_fd = -1; + + mtx_lock(&ctx->lock); + + list_for_each_entry(struct etna_hw_query, hq, &ctx->active_hw_queries, node) + etna_hw_query_suspend(hq, ctx); + + etna_cmd_stream_flush(ctx->stream, ctx->in_fence_fd, + (flags & PIPE_FLUSH_FENCE_FD) ? &out_fence_fd : NULL); + + list_for_each_entry(struct etna_hw_query, hq, &ctx->active_hw_queries, node) + etna_hw_query_resume(hq, ctx); + + if (fence) + *fence = etna_fence_create(pctx, out_fence_fd); /* - * Go through all _resources_ associated with this _screen_, pending - * in this _context_ and mark them as not pending in this _context_ - * anymore, since they were just flushed. - */ - mtx_lock(&screen->lock); - set_foreach(screen->used_resources, entry) { + * Go through all _resources_ pending in this _context_ and mark them as + * not pending in this _context_ anymore, since they were just flushed. + */ + set_foreach(ctx->used_resources_read, entry) { + struct etna_resource *rsc = (struct etna_resource *)entry->key; + struct pipe_resource *referenced = &rsc->base; + + _mesa_set_remove_key(rsc->pending_ctx, ctx); + + /* if resource has no pending ctx's reset its status */ + if (_mesa_set_next_entry(rsc->pending_ctx, NULL) == NULL) + rsc->status &= ~ETNA_PENDING_READ; + + pipe_resource_reference(&referenced, NULL); + } + _mesa_set_clear(ctx->used_resources_read, NULL); + + set_foreach(ctx->used_resources_write, entry) { struct etna_resource *rsc = (struct etna_resource *)entry->key; + struct pipe_resource *referenced = &rsc->base; _mesa_set_remove_key(rsc->pending_ctx, ctx); + + /* if resource has no pending ctx's reset its status */ + if (_mesa_set_next_entry(rsc->pending_ctx, NULL) == NULL) + rsc->status &= ~ETNA_PENDING_WRITE; + + pipe_resource_reference(&referenced, NULL); } - mtx_unlock(&screen->lock); + _mesa_set_clear(ctx->used_resources_write, NULL); + + etna_reset_gpu_state(ctx); + mtx_unlock(&ctx->lock); +} + +static void +etna_context_force_flush(struct etna_cmd_stream *stream, void *priv) +{ + struct pipe_context *pctx = priv; + + pctx->flush(pctx, NULL, 0); + } static void @@ -432,10 +527,23 @@ pctx->const_uploader = pctx->stream_uploader; screen = etna_screen(pscreen); - ctx->stream = etna_cmd_stream_new(screen->pipe, 0x2000, &etna_cmd_stream_reset_notify, ctx); + ctx->stream = etna_cmd_stream_new(screen->pipe, 0x2000, + &etna_context_force_flush, pctx); if (ctx->stream == NULL) goto fail; + ctx->used_resources_read = _mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + if (!ctx->used_resources_read) + goto fail; + + ctx->used_resources_write = _mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + if (!ctx->used_resources_write) + goto fail; + + mtx_init(&ctx->lock, mtx_recursive); + /* context ctxate setup */ ctx->specs = screen->specs; ctx->screen = screen; @@ -443,7 +551,7 @@ ctx->sample_mask = 0xffff; /* Set sensible defaults for state */ - etna_cmd_stream_reset_notify(ctx->stream, ctx); + etna_reset_gpu_state(ctx); ctx->in_fence_fd = -1; @@ -476,9 +584,15 @@ 1 << PIPE_PRIM_LINES | 1 << PIPE_PRIM_LINE_STRIP | 1 << PIPE_PRIM_TRIANGLES | - 1 << PIPE_PRIM_TRIANGLE_STRIP | 1 << PIPE_PRIM_TRIANGLE_FAN; + /* TODO: The bug relates only to indexed draws, but here we signal + * that there is no support for triangle strips at all. This should + * be refined. + */ + if (VIV_FEATURE(ctx->screen, chipMinorFeatures2, BUG_FIXES8)) + ctx->prim_hwsupport |= 1 << PIPE_PRIM_TRIANGLE_STRIP; + if (VIV_FEATURE(ctx->screen, chipMinorFeatures2, LINE_LOOP)) ctx->prim_hwsupport |= 1 << PIPE_PRIM_LINE_LOOP; @@ -499,6 +613,20 @@ ctx->dummy_rt_reloc.offset = 0; ctx->dummy_rt_reloc.flags = ETNA_RELOC_READ | ETNA_RELOC_WRITE; + if (screen->specs.halti >= 5) { + /* Create an empty dummy texture descriptor */ + ctx->dummy_desc_bo = etna_bo_new(ctx->screen->dev, 0x100, DRM_ETNA_GEM_CACHE_WC); + if (!ctx->dummy_desc_bo) + goto fail; + uint32_t *buf = etna_bo_map(ctx->dummy_desc_bo); + etna_bo_cpu_prep(ctx->dummy_desc_bo, DRM_ETNA_PREP_WRITE); + memset(buf, 0, 0x100); + etna_bo_cpu_fini(ctx->dummy_desc_bo); + ctx->DUMMY_DESC_ADDR.bo = ctx->dummy_desc_bo; + ctx->DUMMY_DESC_ADDR.offset = 0; + ctx->DUMMY_DESC_ADDR.flags = ETNA_RELOC_READ; + } + return pctx; fail: diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_context.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_context.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_context.h 2020-06-12 01:21:16.000000000 +0000 @@ -92,7 +92,7 @@ ETNA_IMMEDIATE_TEXRECT_SCALE_X, ETNA_IMMEDIATE_TEXRECT_SCALE_Y, ETNA_IMMEDIATE_UBO0_ADDR, - ETNA_IMMEDIATE_UBOMAX_ADDR = ETNA_IMMEDIATE_UBO0_ADDR + 255, + ETNA_IMMEDIATE_UBOMAX_ADDR = ETNA_IMMEDIATE_UBO0_ADDR + ETNA_MAX_CONST_BUF - 1, }; struct etna_shader_uniform_info { @@ -164,7 +164,7 @@ uint32_t active_sampler_views; uint32_t dirty_sampler_views; struct pipe_sampler_view *sampler_view[PIPE_MAX_SAMPLERS]; - struct pipe_constant_buffer constant_buffer[PIPE_SHADER_TYPES]; + struct pipe_constant_buffer constant_buffer[PIPE_SHADER_TYPES][ETNA_MAX_CONST_BUF]; struct etna_vertexbuf_state vertex_buffer; struct etna_index_buffer index_buffer; struct etna_shader_state shader; @@ -190,6 +190,16 @@ struct etna_bo *dummy_rt; struct etna_reloc dummy_rt_reloc; + + /* Dummy texture descriptor (if needed) */ + struct etna_bo *dummy_desc_bo; + struct etna_reloc DUMMY_DESC_ADDR; + + /* set of resources used by currently-unsubmitted renders */ + struct set *used_resources_read; + struct set *used_resources_write; + + mtx_t lock; }; static inline struct etna_context * diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_debug.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_debug.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_debug.h 2020-06-12 01:21:16.000000000 +0000 @@ -54,6 +54,7 @@ #define ETNA_DBG_SHADERDB 0x800000 /* dump program compile information */ #define ETNA_DBG_NO_SINGLEBUF 0x1000000 /* disable single buffer feature */ #define ETNA_DBG_NIR 0x2000000 /* use new NIR compiler */ +#define ETNA_DBG_DEQP 0x4000000 /* Hacks to run dEQP GLES3 tests */ extern int etna_mesa_debug; /* set in etna_screen.c from ETNA_DEBUG */ diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_disasm.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_disasm.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_disasm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_disasm.c 2020-06-12 01:21:16.000000000 +0000 @@ -508,6 +508,8 @@ OPC(STORE), OPC(IMULLO0), OPC(IMULHI0), + OPC(IMADLO0), + OPC(IMADHI0), OPC(LEADZERO), OPC(LSHIFT), OPC(RSHIFT), @@ -518,6 +520,7 @@ OPC(NOT), OPC(DP2), OPC(DIV), + OPC(IABS), }; static void diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_emit.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_emit.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_emit.c 2020-06-12 01:21:16.000000000 +0000 @@ -132,6 +132,7 @@ etna_coalesce_start(stream, &coalesce); if (unlikely(dirty & (ETNA_DIRTY_SHADER))) { /* Magic states (load balancing, inter-unit sync, buffers) */ + /*007C4*/ EMIT_STATE(FE_HALTI5_ID_CONFIG, ctx->shader_state.FE_HALTI5_ID_CONFIG); /*00870*/ EMIT_STATE(VS_HALTI5_OUTPUT_COUNT, vs_output_count | ((vs_output_count * 0x10) << 8)); /*008A0*/ EMIT_STATE(VS_HALTI5_UNK008A0, 0x0001000e | ((0x110/vs_output_count) << 20)); for (int x = 0; x < 4; ++x) { @@ -214,6 +215,8 @@ etna_emit_state(struct etna_context *ctx) { struct etna_cmd_stream *stream = ctx->stream; + unsigned ccw = ctx->rasterizer->front_ccw; + /* Pre-reserve the command buffer space which we are likely to need. * This must cover all the state emitted below, and the following @@ -325,11 +328,6 @@ /*14640*/ EMIT_STATE(NFE_VERTEX_STREAMS_CONTROL(x), ctx->vertex_buffer.cvb[x].FE_VERTEX_STREAM_CONTROL); } } - for (int x = 0; x < ctx->vertex_buffer.count; ++x) { - if (ctx->vertex_buffer.cvb[x].FE_VERTEX_STREAM_BASE_ADDR.bo) { - /*14680*/ EMIT_STATE(NFE_VERTEX_STREAMS_VERTEX_DIVISOR(x), ctx->vertex_buffer.cvb[x].FE_VERTEX_STREAM_UNK14680); - } - } } else if(ctx->specs.stream_count > 1) { /* hw w/ multiple vertex streams */ for (int x = 0; x < ctx->vertex_buffer.count; ++x) { /*00680*/ EMIT_STATE_RELOC(FE_VERTEX_STREAMS_BASE_ADDR(x), &ctx->vertex_buffer.cvb[x].FE_VERTEX_STREAM_BASE_ADDR); @@ -344,6 +342,13 @@ /*00650*/ EMIT_STATE(FE_VERTEX_STREAM_CONTROL, ctx->vertex_buffer.cvb[0].FE_VERTEX_STREAM_CONTROL); } } + /* gallium has instance divisor as part of elements state */ + if ((dirty & (ETNA_DIRTY_VERTEX_ELEMENTS)) && ctx->specs.halti >= 2) { + for (int x = 0; x < ctx->vertex_elements->num_buffers; ++x) { + /*14680*/ EMIT_STATE(NFE_VERTEX_STREAMS_VERTEX_DIVISOR(x), ctx->vertex_elements->NFE_VERTEX_STREAMS_VERTEX_DIVISOR[x]); + } + } + if (unlikely(dirty & (ETNA_DIRTY_SHADER | ETNA_DIRTY_RASTERIZER))) { /*00804*/ EMIT_STATE(VS_OUTPUT_COUNT, vs_output_count); @@ -447,11 +452,13 @@ ctx->framebuffer.msaa_mode ? ctx->shader_state.PS_TEMP_REGISTER_CONTROL_MSAA : ctx->shader_state.PS_TEMP_REGISTER_CONTROL); - /*01010*/ EMIT_STATE(PS_CONTROL, ctx->shader_state.PS_CONTROL); + /*01010*/ EMIT_STATE(PS_CONTROL, ctx->framebuffer.PS_CONTROL); + /*01030*/ EMIT_STATE(PS_CONTROL_EXT, ctx->framebuffer.PS_CONTROL_EXT); } - if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_FRAMEBUFFER))) { - uint32_t val = etna_zsa_state(ctx->zsa)->PE_DEPTH_CONFIG; - /*01400*/ EMIT_STATE(PE_DEPTH_CONFIG, val | ctx->framebuffer.PE_DEPTH_CONFIG); + if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_SHADER))) { + /*01400*/ EMIT_STATE(PE_DEPTH_CONFIG, (etna_zsa_state(ctx->zsa)->PE_DEPTH_CONFIG | + ctx->framebuffer.PE_DEPTH_CONFIG) & + ctx->shader_state.PE_DEPTH_CONFIG); } if (unlikely(dirty & (ETNA_DIRTY_VIEWPORT))) { /*01404*/ EMIT_STATE(PE_DEPTH_NEAR, ctx->viewport.PE_DEPTH_NEAR); @@ -466,13 +473,14 @@ /*01414*/ EMIT_STATE(PE_DEPTH_STRIDE, ctx->framebuffer.PE_DEPTH_STRIDE); } - if (unlikely(dirty & (ETNA_DIRTY_ZSA))) { - uint32_t val = etna_zsa_state(ctx->zsa)->PE_STENCIL_OP; + + if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_RASTERIZER))) { + uint32_t val = etna_zsa_state(ctx->zsa)->PE_STENCIL_OP[ccw]; /*01418*/ EMIT_STATE(PE_STENCIL_OP, val); } - if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_STENCIL_REF))) { - uint32_t val = etna_zsa_state(ctx->zsa)->PE_STENCIL_CONFIG; - /*0141C*/ EMIT_STATE(PE_STENCIL_CONFIG, val | ctx->stencil_ref.PE_STENCIL_CONFIG); + if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_STENCIL_REF | ETNA_DIRTY_RASTERIZER))) { + uint32_t val = etna_zsa_state(ctx->zsa)->PE_STENCIL_CONFIG[ccw]; + /*0141C*/ EMIT_STATE(PE_STENCIL_CONFIG, val | ctx->stencil_ref.PE_STENCIL_CONFIG[ccw]); } if (unlikely(dirty & (ETNA_DIRTY_ZSA))) { uint32_t val = etna_zsa_state(ctx->zsa)->PE_ALPHA_OP; @@ -511,8 +519,8 @@ abort(); } } - if (unlikely(dirty & (ETNA_DIRTY_STENCIL_REF))) { - /*014A0*/ EMIT_STATE(PE_STENCIL_CONFIG_EXT, ctx->stencil_ref.PE_STENCIL_CONFIG_EXT); + if (unlikely(dirty & (ETNA_DIRTY_STENCIL_REF | ETNA_DIRTY_RASTERIZER))) { + /*014A0*/ EMIT_STATE(PE_STENCIL_CONFIG_EXT, ctx->stencil_ref.PE_STENCIL_CONFIG_EXT[ccw]); } if (unlikely(dirty & (ETNA_DIRTY_BLEND | ETNA_DIRTY_FRAMEBUFFER))) { struct etna_blend_state *blend = etna_blend_state(ctx->blend); @@ -524,10 +532,14 @@ /*014A8*/ EMIT_STATE(PE_DITHER(x), blend->PE_DITHER[x]); } } - if (unlikely(dirty & (ETNA_DIRTY_BLEND_COLOR))) { + if (unlikely(dirty & (ETNA_DIRTY_BLEND_COLOR)) && + VIV_FEATURE(ctx->screen, chipMinorFeatures1, HALF_FLOAT)) { /*014B0*/ EMIT_STATE(PE_ALPHA_COLOR_EXT0, ctx->blend_color.PE_ALPHA_COLOR_EXT0); /*014B4*/ EMIT_STATE(PE_ALPHA_COLOR_EXT1, ctx->blend_color.PE_ALPHA_COLOR_EXT1); } + if (unlikely(dirty & (ETNA_DIRTY_ZSA | ETNA_DIRTY_RASTERIZER))) { + /*014B8*/ EMIT_STATE(PE_STENCIL_CONFIG_EXT2, etna_zsa_state(ctx->zsa)->PE_STENCIL_CONFIG_EXT2[ccw]); + } if (unlikely(dirty & (ETNA_DIRTY_FRAMEBUFFER)) && ctx->specs.halti >= 3) /*014BC*/ EMIT_STATE(PE_MEM_CONFIG, ctx->framebuffer.PE_MEM_CONFIG); if (unlikely(dirty & (ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_TS))) { @@ -538,6 +550,7 @@ /*01664*/ EMIT_STATE_RELOC(TS_DEPTH_STATUS_BASE, &ctx->framebuffer.TS_DEPTH_STATUS_BASE); /*01668*/ EMIT_STATE_RELOC(TS_DEPTH_SURFACE_BASE, &ctx->framebuffer.TS_DEPTH_SURFACE_BASE); /*0166C*/ EMIT_STATE(TS_DEPTH_CLEAR_VALUE, ctx->framebuffer.TS_DEPTH_CLEAR_VALUE); + /*016BC*/ EMIT_STATE(TS_COLOR_CLEAR_VALUE_EXT, ctx->framebuffer.TS_COLOR_CLEAR_VALUE_EXT); } if (unlikely(dirty & (ETNA_DIRTY_SHADER))) { /*0381C*/ EMIT_STATE(GL_VARYING_TOTAL_COMPONENTS, ctx->shader_state.GL_VARYING_TOTAL_COMPONENTS); @@ -656,12 +669,12 @@ if (do_uniform_flush) etna_set_state(stream, VIVS_VS_UNIFORM_CACHE, VIVS_VS_UNIFORM_CACHE_FLUSH); - etna_uniforms_write(ctx, ctx->shader.vs, &ctx->constant_buffer[PIPE_SHADER_VERTEX]); + etna_uniforms_write(ctx, ctx->shader.vs, ctx->constant_buffer[PIPE_SHADER_VERTEX]); if (do_uniform_flush) etna_set_state(stream, VIVS_VS_UNIFORM_CACHE, VIVS_VS_UNIFORM_CACHE_FLUSH | VIVS_VS_UNIFORM_CACHE_PS); - etna_uniforms_write(ctx, ctx->shader.fs, &ctx->constant_buffer[PIPE_SHADER_FRAGMENT]); + etna_uniforms_write(ctx, ctx->shader.fs, ctx->constant_buffer[PIPE_SHADER_FRAGMENT]); if (ctx->specs.halti >= 5) { /* HALTI5 needs to be prompted to pre-fetch shaders */ @@ -675,14 +688,14 @@ etna_set_state(stream, VIVS_VS_UNIFORM_CACHE, VIVS_VS_UNIFORM_CACHE_FLUSH); if (dirty & (uniform_dirty_bits | ctx->shader.vs->uniforms_dirty_bits)) - etna_uniforms_write(ctx, ctx->shader.vs, &ctx->constant_buffer[PIPE_SHADER_VERTEX]); + etna_uniforms_write(ctx, ctx->shader.vs, ctx->constant_buffer[PIPE_SHADER_VERTEX]); /* ideally this cache would only be flushed if there are PS uniform changes */ if (do_uniform_flush) etna_set_state(stream, VIVS_VS_UNIFORM_CACHE, VIVS_VS_UNIFORM_CACHE_FLUSH | VIVS_VS_UNIFORM_CACHE_PS); if (dirty & (uniform_dirty_bits | ctx->shader.fs->uniforms_dirty_bits)) - etna_uniforms_write(ctx, ctx->shader.fs, &ctx->constant_buffer[PIPE_SHADER_FRAGMENT]); + etna_uniforms_write(ctx, ctx->shader.fs, ctx->constant_buffer[PIPE_SHADER_FRAGMENT]); } /**** End of state update ****/ #undef EMIT_STATE diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_emit.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_emit.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_emit.h 2020-06-12 01:21:16.000000000 +0000 @@ -32,7 +32,6 @@ #include "hw/cmdstream.xml.h" struct etna_context; -struct compiled_rs_state; struct etna_coalesce { uint32_t start; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_etc2.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_etc2.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_etc2.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_etc2.c 2020-06-12 01:21:16.000000000 +0000 @@ -29,7 +29,7 @@ #include "etnaviv_resource.h" #include "etnaviv_screen.h" #include "hw/common.xml.h" -#include "util/u_format.h" +#include "util/format/u_format.h" bool etna_etc2_needs_patching(const struct pipe_resource *prsc) diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_fence.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_fence.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_fence.c 2020-06-12 01:21:16.000000000 +0000 @@ -88,7 +88,8 @@ { struct etna_context *ctx = etna_context(pctx); - sync_accumulate("etnaviv", &ctx->in_fence_fd, pfence->fence_fd); + if (pfence->fence_fd != -1) + sync_accumulate("etnaviv", &ctx->in_fence_fd, pfence->fence_fd); } static int diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_format.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_format.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_format.c 2020-06-12 01:21:16.000000000 +0000 @@ -39,47 +39,37 @@ struct etna_format { unsigned vtx; unsigned tex; - unsigned rs; + unsigned pe; bool present; - const unsigned char tex_swiz[4]; }; -#define RS_FORMAT_NONE ~0 +#define PE_FORMAT_NONE ~0 -#define RS_FORMAT_MASK 0xf -#define RS_FORMAT(x) ((x) & RS_FORMAT_MASK) -#define RS_FORMAT_RB_SWAP 0x10 +#define PE_FORMAT_MASK 0x7f +#define PE_FORMAT(x) ((x) & PE_FORMAT_MASK) +#define PE_FORMAT_RB_SWAP 0x80 -#define RS_FORMAT_X8B8G8R8 (RS_FORMAT_X8R8G8B8 | RS_FORMAT_RB_SWAP) -#define RS_FORMAT_A8B8G8R8 (RS_FORMAT_A8R8G8B8 | RS_FORMAT_RB_SWAP) +#define PE_FORMAT_X8B8G8R8 (PE_FORMAT_X8R8G8B8 | PE_FORMAT_RB_SWAP) +#define PE_FORMAT_A8B8G8R8 (PE_FORMAT_A8R8G8B8 | PE_FORMAT_RB_SWAP) #define TS_SAMPLER_FORMAT_NONE ETNA_NO_MATCH -#define SWIZ(x,y,z,w) { \ - PIPE_SWIZZLE_##x, \ - PIPE_SWIZZLE_##y, \ - PIPE_SWIZZLE_##z, \ - PIPE_SWIZZLE_##w \ -} - /* vertex + texture */ -#define VT(pipe, vtxfmt, texfmt, texswiz, rsfmt) \ +#define VT(pipe, vtxfmt, texfmt, rsfmt) \ [PIPE_FORMAT_##pipe] = { \ .vtx = FE_DATA_TYPE_##vtxfmt, \ .tex = TEXTURE_FORMAT_##texfmt, \ - .rs = RS_FORMAT_##rsfmt, \ + .pe = PE_FORMAT_##rsfmt, \ .present = 1, \ - .tex_swiz = texswiz, \ } /* texture-only */ -#define _T(pipe, fmt, swiz, rsfmt) \ +#define _T(pipe, fmt, rsfmt) \ [PIPE_FORMAT_##pipe] = { \ .vtx = ETNA_NO_MATCH, \ .tex = TEXTURE_FORMAT_##fmt, \ - .rs = RS_FORMAT_##rsfmt, \ + .pe = PE_FORMAT_##rsfmt, \ .present = 1, \ - .tex_swiz = swiz, \ } /* vertex-only */ @@ -87,103 +77,107 @@ [PIPE_FORMAT_##pipe] = { \ .vtx = FE_DATA_TYPE_##fmt, \ .tex = ETNA_NO_MATCH, \ - .rs = RS_FORMAT_##rsfmt, \ + .pe = PE_FORMAT_##rsfmt, \ .present = 1, \ } static struct etna_format formats[PIPE_FORMAT_COUNT] = { /* 8-bit */ - VT(R8_UNORM, UNSIGNED_BYTE, L8, SWIZ(X, 0, 0, 1), NONE), - V_(R8_SNORM, BYTE, NONE), - V_(R8_UINT, UNSIGNED_BYTE, NONE), - V_(R8_SINT, BYTE, NONE), + VT(R8_UNORM, UNSIGNED_BYTE, L8, R8), + VT(R8_SNORM, BYTE, EXT_R8_SNORM | EXT_FORMAT, NONE), + VT(R8_UINT, BYTE_I, EXT_R8I | EXT_FORMAT, R8I), + VT(R8_SINT, BYTE_I, EXT_R8I | EXT_FORMAT, R8I), V_(R8_USCALED, UNSIGNED_BYTE, NONE), V_(R8_SSCALED, BYTE, NONE), - _T(A8_UNORM, A8, SWIZ(X, Y, Z, W), NONE), - _T(L8_UNORM, L8, SWIZ(X, Y, Z, W), NONE), - _T(I8_UNORM, I8, SWIZ(X, Y, Z, W), NONE), + _T(A8_UNORM, A8, NONE), + _T(L8_UNORM, L8, NONE), + _T(I8_UNORM, I8, NONE), /* 16-bit */ V_(R16_UNORM, UNSIGNED_SHORT, NONE), V_(R16_SNORM, SHORT, NONE), - V_(R16_UINT, UNSIGNED_SHORT, NONE), - V_(R16_SINT, SHORT, NONE), + VT(R16_UINT, SHORT_I, EXT_R16I | EXT_FORMAT, R16I), + VT(R16_SINT, SHORT_I, EXT_R16I | EXT_FORMAT, R16I), V_(R16_USCALED, UNSIGNED_SHORT, NONE), V_(R16_SSCALED, SHORT, NONE), - V_(R16_FLOAT, HALF_FLOAT, NONE), + VT(R16_FLOAT, HALF_FLOAT, EXT_R16F | EXT_FORMAT, R16F), - _T(B4G4R4A4_UNORM, A4R4G4B4, SWIZ(X, Y, Z, W), A4R4G4B4), - _T(B4G4R4X4_UNORM, X4R4G4B4, SWIZ(X, Y, Z, W), X4R4G4B4), + _T(B4G4R4A4_UNORM, A4R4G4B4, A4R4G4B4), + _T(B4G4R4X4_UNORM, X4R4G4B4, X4R4G4B4), - _T(L8A8_UNORM, A8L8, SWIZ(X, Y, Z, W), NONE), + _T(L8A8_UNORM, A8L8, NONE), - _T(Z16_UNORM, D16, SWIZ(X, Y, Z, W), A4R4G4B4), - _T(B5G6R5_UNORM, R5G6B5, SWIZ(X, Y, Z, W), R5G6B5), - _T(B5G5R5A1_UNORM, A1R5G5B5, SWIZ(X, Y, Z, W), A1R5G5B5), - _T(B5G5R5X1_UNORM, X1R5G5B5, SWIZ(X, Y, Z, W), X1R5G5B5), - - VT(R8G8_UNORM, UNSIGNED_BYTE, EXT_G8R8 | EXT_FORMAT, SWIZ(X, Y, 0, 1), NONE), - V_(R8G8_SNORM, BYTE, NONE), - V_(R8G8_UINT, UNSIGNED_BYTE, NONE), - V_(R8G8_SINT, BYTE, NONE), + _T(Z16_UNORM, D16, NONE), + _T(B5G6R5_UNORM, R5G6B5, R5G6B5), + _T(B5G5R5A1_UNORM, A1R5G5B5, A1R5G5B5), + _T(B5G5R5X1_UNORM, X1R5G5B5, X1R5G5B5), + + VT(R8G8_UNORM, UNSIGNED_BYTE, EXT_G8R8 | EXT_FORMAT, G8R8), + VT(R8G8_SNORM, BYTE, EXT_G8R8_SNORM | EXT_FORMAT, NONE), + VT(R8G8_UINT, BYTE_I, EXT_G8R8I | EXT_FORMAT, G8R8I), + VT(R8G8_SINT, BYTE_I, EXT_G8R8I | EXT_FORMAT, G8R8I), V_(R8G8_USCALED, UNSIGNED_BYTE, NONE), V_(R8G8_SSCALED, BYTE, NONE), /* 24-bit */ V_(R8G8B8_UNORM, UNSIGNED_BYTE, NONE), V_(R8G8B8_SNORM, BYTE, NONE), - V_(R8G8B8_UINT, UNSIGNED_BYTE, NONE), - V_(R8G8B8_SINT, BYTE, NONE), + V_(R8G8B8_UINT, BYTE_I, NONE), + V_(R8G8B8_SINT, BYTE_I, NONE), V_(R8G8B8_USCALED, UNSIGNED_BYTE, NONE), V_(R8G8B8_SSCALED, BYTE, NONE), /* 32-bit */ V_(R32_UNORM, UNSIGNED_INT, NONE), V_(R32_SNORM, INT, NONE), - V_(R32_SINT, INT, NONE), - V_(R32_UINT, UNSIGNED_INT, NONE), + VT(R32_SINT, FLOAT, EXT_R32F | EXT_FORMAT, R32F), + VT(R32_UINT, FLOAT, EXT_R32F | EXT_FORMAT, R32F), V_(R32_USCALED, UNSIGNED_INT, NONE), V_(R32_SSCALED, INT, NONE), - V_(R32_FLOAT, FLOAT, NONE), + VT(R32_FLOAT, FLOAT, EXT_R32F | EXT_FORMAT, R32F), V_(R32_FIXED, FIXED, NONE), V_(R16G16_UNORM, UNSIGNED_SHORT, NONE), V_(R16G16_SNORM, SHORT, NONE), - V_(R16G16_UINT, UNSIGNED_SHORT, NONE), - V_(R16G16_SINT, SHORT, NONE), + VT(R16G16_UINT, SHORT_I, EXT_G16R16I | EXT_FORMAT, G16R16I), + VT(R16G16_SINT, SHORT_I, EXT_G16R16I | EXT_FORMAT, G16R16I), V_(R16G16_USCALED, UNSIGNED_SHORT, NONE), V_(R16G16_SSCALED, SHORT, NONE), - V_(R16G16_FLOAT, HALF_FLOAT, NONE), + VT(R16G16_FLOAT, HALF_FLOAT, EXT_G16R16F | EXT_FORMAT, G16R16F), V_(A8B8G8R8_UNORM, UNSIGNED_BYTE, NONE), - VT(R8G8B8A8_UNORM, UNSIGNED_BYTE, A8B8G8R8, SWIZ(X, Y, Z, W), A8B8G8R8), - V_(R8G8B8A8_SNORM, BYTE, A8B8G8R8), - _T(R8G8B8X8_UNORM, X8B8G8R8, SWIZ(X, Y, Z, W), X8B8G8R8), - V_(R8G8B8A8_UINT, UNSIGNED_BYTE, A8B8G8R8), - V_(R8G8B8A8_SINT, BYTE, A8B8G8R8), + VT(R8G8B8A8_UNORM, UNSIGNED_BYTE, A8B8G8R8, A8B8G8R8), + VT(R8G8B8A8_SNORM, BYTE, EXT_A8B8G8R8_SNORM | EXT_FORMAT, NONE), + _T(R8G8B8X8_UNORM, X8B8G8R8, X8B8G8R8), + _T(R8G8B8X8_SNORM, EXT_X8B8G8R8_SNORM | EXT_FORMAT, NONE), + VT(R8G8B8A8_UINT, BYTE_I, EXT_A8B8G8R8I | EXT_FORMAT, A8B8G8R8I), + VT(R8G8B8A8_SINT, BYTE_I, EXT_A8B8G8R8I | EXT_FORMAT, A8B8G8R8I), V_(R8G8B8A8_USCALED, UNSIGNED_BYTE, A8B8G8R8), V_(R8G8B8A8_SSCALED, BYTE, A8B8G8R8), - _T(B8G8R8A8_UNORM, A8R8G8B8, SWIZ(X, Y, Z, W), A8R8G8B8), - _T(B8G8R8X8_UNORM, X8R8G8B8, SWIZ(X, Y, Z, W), X8R8G8B8), - _T(B8G8R8A8_SRGB, A8R8G8B8, SWIZ(X, Y, Z, W), A8R8G8B8), - _T(B8G8R8X8_SRGB, X8R8G8B8, SWIZ(X, Y, Z, W), X8R8G8B8), - - V_(R10G10B10A2_UNORM, UNSIGNED_INT_10_10_10_2, NONE), - V_(R10G10B10A2_SNORM, INT_10_10_10_2, NONE), - V_(R10G10B10A2_USCALED, UNSIGNED_INT_10_10_10_2, NONE), - V_(R10G10B10A2_SSCALED, INT_10_10_10_2, NONE), + _T(B8G8R8A8_UNORM, A8R8G8B8, A8R8G8B8), + _T(B8G8R8X8_UNORM, X8R8G8B8, X8R8G8B8), + + VT(R10G10B10A2_UNORM, UNSIGNED_INT_2_10_10_10_REV, EXT_A2B10G10R10 | EXT_FORMAT, A2B10G10R10), + _T(R10G10B10X2_UNORM, EXT_A2B10G10R10 | EXT_FORMAT, A2B10G10R10), + V_(R10G10B10A2_SNORM, INT_2_10_10_10_REV, NONE), + _T(R10G10B10A2_UINT, EXT_A2B10G10R10UI | EXT_FORMAT, A2B10G10R10UI), + V_(R10G10B10A2_USCALED, UNSIGNED_INT_2_10_10_10_REV, NONE), + V_(R10G10B10A2_SSCALED, INT_2_10_10_10_REV, NONE), + + _T(X8Z24_UNORM, D24X8, NONE), + _T(S8_UINT_Z24_UNORM, D24X8, NONE), - _T(X8Z24_UNORM, D24X8, SWIZ(X, Y, Z, W), A8R8G8B8), - _T(S8_UINT_Z24_UNORM, D24X8, SWIZ(X, Y, Z, W), A8R8G8B8), + _T(R9G9B9E5_FLOAT, E5B9G9R9, NONE), + _T(R11G11B10_FLOAT, EXT_B10G11R11F | EXT_FORMAT, B10G11R11F), /* 48-bit */ V_(R16G16B16_UNORM, UNSIGNED_SHORT, NONE), V_(R16G16B16_SNORM, SHORT, NONE), - V_(R16G16B16_UINT, UNSIGNED_SHORT, NONE), - V_(R16G16B16_SINT, SHORT, NONE), + V_(R16G16B16_UINT, SHORT_I, NONE), + V_(R16G16B16_SINT, SHORT_I, NONE), V_(R16G16B16_USCALED, UNSIGNED_SHORT, NONE), V_(R16G16B16_SSCALED, SHORT, NONE), V_(R16G16B16_FLOAT, HALF_FLOAT, NONE), @@ -191,26 +185,26 @@ /* 64-bit */ V_(R16G16B16A16_UNORM, UNSIGNED_SHORT, NONE), V_(R16G16B16A16_SNORM, SHORT, NONE), - V_(R16G16B16A16_UINT, UNSIGNED_SHORT, NONE), - V_(R16G16B16A16_SINT, SHORT, NONE), + VT(R16G16B16A16_UINT, SHORT_I, EXT_A16B16G16R16I | EXT_FORMAT, A16B16G16R16I), + VT(R16G16B16A16_SINT, SHORT_I, EXT_A16B16G16R16I | EXT_FORMAT, A16B16G16R16I), V_(R16G16B16A16_USCALED, UNSIGNED_SHORT, NONE), V_(R16G16B16A16_SSCALED, SHORT, NONE), - V_(R16G16B16A16_FLOAT, HALF_FLOAT, NONE), + VT(R16G16B16A16_FLOAT, HALF_FLOAT, EXT_A16B16G16R16F | EXT_FORMAT, A16B16G16R16F), V_(R32G32_UNORM, UNSIGNED_INT, NONE), V_(R32G32_SNORM, INT, NONE), - V_(R32G32_UINT, UNSIGNED_INT, NONE), - V_(R32G32_SINT, INT, NONE), + VT(R32G32_UINT, FLOAT, EXT_G32R32F | EXT_FORMAT, G32R32F), + VT(R32G32_SINT, FLOAT, EXT_G32R32F | EXT_FORMAT, G32R32F), V_(R32G32_USCALED, UNSIGNED_INT, NONE), V_(R32G32_SSCALED, INT, NONE), - V_(R32G32_FLOAT, FLOAT, NONE), + VT(R32G32_FLOAT, FLOAT, EXT_G32R32F | EXT_FORMAT, G32R32F), V_(R32G32_FIXED, FIXED, NONE), /* 96-bit */ V_(R32G32B32_UNORM, UNSIGNED_INT, NONE), V_(R32G32B32_SNORM, INT, NONE), - V_(R32G32B32_UINT, UNSIGNED_INT, NONE), - V_(R32G32B32_SINT, INT, NONE), + V_(R32G32B32_UINT, FLOAT, NONE), + V_(R32G32B32_SINT, FLOAT, NONE), V_(R32G32B32_USCALED, UNSIGNED_INT, NONE), V_(R32G32B32_SSCALED, INT, NONE), V_(R32G32B32_FLOAT, FLOAT, NONE), @@ -219,73 +213,54 @@ /* 128-bit */ V_(R32G32B32A32_UNORM, UNSIGNED_INT, NONE), V_(R32G32B32A32_SNORM, INT, NONE), - V_(R32G32B32A32_UINT, UNSIGNED_INT, NONE), - V_(R32G32B32A32_SINT, INT, NONE), + V_(R32G32B32A32_UINT, FLOAT, NONE), + V_(R32G32B32A32_SINT, FLOAT, NONE), V_(R32G32B32A32_USCALED, UNSIGNED_INT, NONE), V_(R32G32B32A32_SSCALED, INT, NONE), V_(R32G32B32A32_FLOAT, FLOAT, NONE), V_(R32G32B32A32_FIXED, FIXED, NONE), /* compressed */ - _T(ETC1_RGB8, ETC1, SWIZ(X, Y, Z, W), NONE), + _T(ETC1_RGB8, ETC1, NONE), - _T(DXT1_RGB, DXT1, SWIZ(X, Y, Z, W), NONE), - _T(DXT1_SRGBA,DXT1, SWIZ(X, Y, Z, W), NONE), - _T(DXT1_RGBA, DXT1, SWIZ(X, Y, Z, W), NONE), - _T(DXT3_SRGBA,DXT2_DXT3, SWIZ(X, Y, Z, W), NONE), - _T(DXT3_RGBA, DXT2_DXT3, SWIZ(X, Y, Z, W), NONE), - _T(DXT5_SRGBA,DXT4_DXT5, SWIZ(X, Y, Z, W), NONE), - _T(DXT5_RGBA, DXT4_DXT5, SWIZ(X, Y, Z, W), NONE), - - _T(ETC2_RGB8, EXT_NONE | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), /* Extd. format NONE doubles as ETC2_RGB8 */ - _T(ETC2_SRGB8, EXT_NONE | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_RGB8A1, EXT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_SRGB8A1, EXT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_RGBA8, EXT_RGBA8_ETC2_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_SRGBA8, EXT_RGBA8_ETC2_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_R11_UNORM, EXT_R11_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_R11_SNORM, EXT_SIGNED_R11_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_RG11_UNORM, EXT_RG11_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ETC2_RG11_SNORM, EXT_SIGNED_RG11_EAC | EXT_FORMAT, SWIZ(X, Y, Z, W), NONE), - - _T(ASTC_4x4, ASTC_RGBA_4x4 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_5x4, ASTC_RGBA_5x4 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_5x5, ASTC_RGBA_5x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_6x5, ASTC_RGBA_6x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_6x6, ASTC_RGBA_6x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x5, ASTC_RGBA_8x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x6, ASTC_RGBA_8x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x8, ASTC_RGBA_8x8 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x5, ASTC_RGBA_10x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x6, ASTC_RGBA_10x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x8, ASTC_RGBA_10x8 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x10, ASTC_RGBA_10x10 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_12x10, ASTC_RGBA_12x10 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_12x12, ASTC_RGBA_12x12 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - - _T(ASTC_4x4_SRGB, ASTC_RGBA_4x4 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_5x4_SRGB, ASTC_RGBA_5x4 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_5x5_SRGB, ASTC_RGBA_5x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_6x5_SRGB, ASTC_RGBA_6x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_6x6_SRGB, ASTC_RGBA_6x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x5_SRGB, ASTC_RGBA_8x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x6_SRGB, ASTC_RGBA_8x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_8x8_SRGB, ASTC_RGBA_8x8 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x5_SRGB, ASTC_RGBA_10x5 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x6_SRGB, ASTC_RGBA_10x6 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x8_SRGB, ASTC_RGBA_10x8 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_10x10_SRGB, ASTC_RGBA_10x10 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_12x10_SRGB, ASTC_RGBA_12x10 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), - _T(ASTC_12x12_SRGB, ASTC_RGBA_12x12 | ASTC_FORMAT, SWIZ(X, Y, Z, W), NONE), + _T(DXT1_RGB, DXT1, NONE), + _T(DXT1_RGBA, DXT1, NONE), + _T(DXT3_RGBA, DXT2_DXT3, NONE), + _T(DXT5_RGBA, DXT4_DXT5, NONE), + + _T(ETC2_RGB8, EXT_NONE | EXT_FORMAT, NONE), /* Extd. format NONE doubles as ETC2_RGB8 */ + _T(ETC2_RGB8A1, EXT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2 | EXT_FORMAT, NONE), + _T(ETC2_RGBA8, EXT_RGBA8_ETC2_EAC | EXT_FORMAT, NONE), + _T(ETC2_R11_UNORM, EXT_R11_EAC | EXT_FORMAT, NONE), + _T(ETC2_R11_SNORM, EXT_SIGNED_R11_EAC | EXT_FORMAT, NONE), + _T(ETC2_RG11_UNORM, EXT_RG11_EAC | EXT_FORMAT, NONE), + _T(ETC2_RG11_SNORM, EXT_SIGNED_RG11_EAC | EXT_FORMAT, NONE), + + _T(ASTC_4x4, ASTC_RGBA_4x4 | ASTC_FORMAT, NONE), + _T(ASTC_5x4, ASTC_RGBA_5x4 | ASTC_FORMAT, NONE), + _T(ASTC_5x5, ASTC_RGBA_5x5 | ASTC_FORMAT, NONE), + _T(ASTC_6x5, ASTC_RGBA_6x5 | ASTC_FORMAT, NONE), + _T(ASTC_6x6, ASTC_RGBA_6x6 | ASTC_FORMAT, NONE), + _T(ASTC_8x5, ASTC_RGBA_8x5 | ASTC_FORMAT, NONE), + _T(ASTC_8x6, ASTC_RGBA_8x6 | ASTC_FORMAT, NONE), + _T(ASTC_8x8, ASTC_RGBA_8x8 | ASTC_FORMAT, NONE), + _T(ASTC_10x5, ASTC_RGBA_10x5 | ASTC_FORMAT, NONE), + _T(ASTC_10x6, ASTC_RGBA_10x6 | ASTC_FORMAT, NONE), + _T(ASTC_10x8, ASTC_RGBA_10x8 | ASTC_FORMAT, NONE), + _T(ASTC_10x10, ASTC_RGBA_10x10 | ASTC_FORMAT, NONE), + _T(ASTC_12x10, ASTC_RGBA_12x10 | ASTC_FORMAT, NONE), + _T(ASTC_12x12, ASTC_RGBA_12x12 | ASTC_FORMAT, NONE), /* YUV */ - _T(YUYV, YUY2, SWIZ(X, Y, Z, W), YUY2), - _T(UYVY, UYVY, SWIZ(X, Y, Z, W), NONE), + _T(YUYV, YUY2, YUY2), + _T(UYVY, UYVY, NONE), }; uint32_t translate_texture_format(enum pipe_format fmt) { + fmt = util_format_linear(fmt); + if (!formats[fmt].present) return ETNA_NO_MATCH; @@ -293,15 +268,43 @@ } bool -texture_format_needs_swiz(enum pipe_format fmt) +texture_use_int_filter(const struct pipe_sampler_view *so, bool tex_desc) { - static const unsigned char def[4] = SWIZ(X, Y, Z, W); - bool swiz = false; + switch (so->target) { + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + if (tex_desc) + break; + case PIPE_TEXTURE_3D: + return false; + default: + break; + } - if (formats[fmt].present) - swiz = !!memcmp(def, formats[fmt].tex_swiz, sizeof(formats[fmt].tex_swiz)); + /* only unorm formats can use int filter */ + if (!util_format_is_unorm(so->format)) + return false; + + if (util_format_is_srgb(so->format)) + return false; + + switch (so->format) { + /* apparently D16 can't use int filter but D24 can */ + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_R10G10B10X2_UNORM: + case PIPE_FORMAT_ETC2_R11_UNORM: + case PIPE_FORMAT_ETC2_RG11_UNORM: + return false; + default: + return true; + } +} - return swiz; +bool +texture_format_needs_swiz(enum pipe_format fmt) +{ + return util_format_linear(fmt) == PIPE_FORMAT_R8_UNORM; } uint32_t @@ -310,10 +313,15 @@ { unsigned char swiz[4] = { swizzle_r, swizzle_g, swizzle_b, swizzle_a, - }, rswiz[4]; + }; - assert(formats[fmt].present); - util_format_compose_swizzles(formats[fmt].tex_swiz, swiz, rswiz); + if (util_format_linear(fmt) == PIPE_FORMAT_R8_UNORM) { + /* R8 is emulated with L8, needs yz channels set to zero */ + for (unsigned i = 0; i < 4; i++) { + if (swiz[i] == PIPE_SWIZZLE_Y || swiz[i] == PIPE_SWIZZLE_Z) + swiz[i] = PIPE_SWIZZLE_0; + } + } /* PIPE_SWIZZLE_ maps 1:1 to TEXTURE_SWIZZLE_ */ STATIC_ASSERT(PIPE_SWIZZLE_X == TEXTURE_SWIZZLE_RED); @@ -323,30 +331,33 @@ STATIC_ASSERT(PIPE_SWIZZLE_0 == TEXTURE_SWIZZLE_ZERO); STATIC_ASSERT(PIPE_SWIZZLE_1 == TEXTURE_SWIZZLE_ONE); - return VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_R(rswiz[0]) | - VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_G(rswiz[1]) | - VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_B(rswiz[2]) | - VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_A(rswiz[3]); + return VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_R(swiz[0]) | + VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_G(swiz[1]) | + VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_B(swiz[2]) | + VIVS_TE_SAMPLER_CONFIG1_SWIZZLE_A(swiz[3]); } uint32_t -translate_rs_format(enum pipe_format fmt) +translate_pe_format(enum pipe_format fmt) { + fmt = util_format_linear(fmt); + if (!formats[fmt].present) return ETNA_NO_MATCH; - if (formats[fmt].rs == ETNA_NO_MATCH) + if (formats[fmt].pe == ETNA_NO_MATCH) return ETNA_NO_MATCH; - return RS_FORMAT(formats[fmt].rs); + return PE_FORMAT(formats[fmt].pe); } int -translate_rs_format_rb_swap(enum pipe_format fmt) +translate_pe_format_rb_swap(enum pipe_format fmt) { + fmt = util_format_linear(fmt); assert(formats[fmt].present); - return formats[fmt].rs & RS_FORMAT_RB_SWAP; + return formats[fmt].pe & PE_FORMAT_RB_SWAP; } /* Return type flags for vertex element format */ diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_format.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_format.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_format.h 2020-06-12 01:21:16.000000000 +0000 @@ -27,7 +27,8 @@ #ifndef ETNAVIV_FORMAT_H_ #define ETNAVIV_FORMAT_H_ -#include "util/u_format.h" +#include "util/format/u_format.h" +#include "pipe/p_state.h" #include #define ETNA_NO_MATCH (~0) @@ -38,6 +39,9 @@ translate_texture_format(enum pipe_format fmt); bool +texture_use_int_filter(const struct pipe_sampler_view *so, bool tex_desc); + +bool texture_format_needs_swiz(enum pipe_format fmt); uint32_t @@ -45,10 +49,10 @@ unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); uint32_t -translate_rs_format(enum pipe_format fmt); +translate_pe_format(enum pipe_format fmt); int -translate_rs_format_rb_swap(enum pipe_format fmt); +translate_pe_format_rb_swap(enum pipe_format fmt); uint32_t translate_vertex_format_type(enum pipe_format fmt); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_internal.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_internal.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_internal.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_internal.h 2020-06-12 01:21:16.000000000 +0000 @@ -38,6 +38,7 @@ #define ETNA_NUM_LOD (14) #define ETNA_NUM_LAYERS (6) #define ETNA_MAX_UNIFORMS (256) +#define ETNA_MAX_CONST_BUF 16 #define ETNA_MAX_PIXELPIPES 2 /* All RS operations must have width%16 = 0 */ @@ -90,6 +91,8 @@ unsigned use_blt : 1; /* can use any kind of wrapping mode on npot textures */ unsigned npot_tex_any_wrap : 1; + /* supports seamless cube map */ + unsigned seamless_cube_map : 1; /* number of bits per TS tile */ unsigned bits_per_tile; /* clear value for TS (dependent on bits_per_tile) */ @@ -152,8 +155,8 @@ /* Compiled pipe_stencil_ref */ struct compiled_stencil_ref { - uint32_t PE_STENCIL_CONFIG; - uint32_t PE_STENCIL_CONFIG_EXT; + uint32_t PE_STENCIL_CONFIG[2]; + uint32_t PE_STENCIL_CONFIG_EXT[2]; }; /* Compiled pipe_scissor_state */ @@ -212,9 +215,12 @@ struct etna_reloc TS_DEPTH_STATUS_BASE; struct etna_reloc TS_DEPTH_SURFACE_BASE; uint32_t TS_COLOR_CLEAR_VALUE; + uint32_t TS_COLOR_CLEAR_VALUE_EXT; struct etna_reloc TS_COLOR_STATUS_BASE; struct etna_reloc TS_COLOR_SURFACE_BASE; uint32_t PE_LOGIC_OP; + uint32_t PS_CONTROL; + uint32_t PS_CONTROL_EXT; bool msaa_mode; /* adds input (and possible temp) to PS */ }; @@ -225,12 +231,13 @@ uint32_t NFE_GENERIC_ATTRIB_CONFIG0[VIVS_NFE_GENERIC_ATTRIB__LEN]; uint32_t NFE_GENERIC_ATTRIB_SCALE[VIVS_NFE_GENERIC_ATTRIB__LEN]; uint32_t NFE_GENERIC_ATTRIB_CONFIG1[VIVS_NFE_GENERIC_ATTRIB__LEN]; + unsigned num_buffers; + uint32_t NFE_VERTEX_STREAMS_VERTEX_DIVISOR[VIVS_NFE_VERTEX_STREAMS__LEN]; }; /* Compiled context->set_vertex_buffer result */ struct compiled_set_vertex_buffer { uint32_t FE_VERTEX_STREAM_CONTROL; - uint32_t FE_VERTEX_STREAM_UNK14680; struct etna_reloc FE_VERTEX_STREAM_BASE_ADDR; }; @@ -255,12 +262,13 @@ uint32_t PS_INPUT_COUNT_MSAA; /* Adds an input */ uint32_t PS_TEMP_REGISTER_CONTROL; uint32_t PS_TEMP_REGISTER_CONTROL_MSAA; /* Adds a temporary if needed to make space for extra input */ - uint32_t PS_CONTROL; uint32_t PS_START_PC; + uint32_t PE_DEPTH_CONFIG; uint32_t GL_VARYING_TOTAL_COMPONENTS; uint32_t GL_VARYING_NUM_COMPONENTS; uint32_t GL_VARYING_COMPONENT_USE[2]; uint32_t GL_HALTI5_SH_SPECIALS; + uint32_t FE_HALTI5_ID_CONFIG; unsigned vs_inst_mem_size; unsigned ps_inst_mem_size; uint32_t *VS_INST_MEM; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_query_hw.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_query_hw.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_query_hw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_query_hw.c 2020-06-12 01:21:16.000000000 +0000 @@ -150,7 +150,7 @@ p->start(hq, ctx); /* add to active list */ - assert(list_empty(&hq->node)); + assert(list_is_empty(&hq->node)); list_addtail(&hq->node, &ctx->active_hw_queries); return true; @@ -176,7 +176,7 @@ struct etna_resource *rsc = etna_resource(hq->prsc); const struct etna_hw_sample_provider *p = hq->provider; - assert(LIST_IS_EMPTY(&hq->node)); + assert(list_is_empty(&hq->node)); if (!wait) { int ret; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_resource.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_resource.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_resource.c 2020-06-12 01:21:16.000000000 +0000 @@ -145,7 +145,7 @@ const struct pipe_resource *templat) { struct etna_screen *screen = etna_screen(pscreen); - if (!translate_samples_to_xyscale(templat->nr_samples, NULL, NULL, NULL)) + if (!translate_samples_to_xyscale(templat->nr_samples, NULL, NULL)) return false; /* templat->bind is not set here, so we must use the minimum sizes */ @@ -204,8 +204,7 @@ /* Create a new resource object, using the given template info */ struct pipe_resource * etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout, - enum etna_resource_addressing_mode mode, uint64_t modifier, - const struct pipe_resource *templat) + uint64_t modifier, const struct pipe_resource *templat) { struct etna_screen *screen = etna_screen(pscreen); struct etna_resource *rsc; @@ -230,7 +229,7 @@ } int msaa_xscale = 1, msaa_yscale = 1; - if (!translate_samples_to_xyscale(nr_samples, &msaa_xscale, &msaa_yscale, NULL)) { + if (!translate_samples_to_xyscale(nr_samples, &msaa_xscale, &msaa_yscale)) { /* Number of samples not supported */ return NULL; } @@ -299,9 +298,9 @@ rsc->base.nr_samples = nr_samples; rsc->layout = layout; rsc->halign = halign; - rsc->addressing_mode = mode; pipe_reference_init(&rsc->base.reference, 1); + util_range_init(&rsc->valid_buffer_range); size = setup_miptree(rsc, paddingX, paddingY, msaa_xscale, msaa_yscale); @@ -339,58 +338,38 @@ const struct pipe_resource *templat) { struct etna_screen *screen = etna_screen(pscreen); + unsigned layout = ETNA_LAYOUT_TILED; - /* Figure out what tiling and address mode to use -- for now, assume that - * texture cannot be linear. there is a capability LINEAR_TEXTURE_SUPPORT - * (supported on gc880 and gc2000 at least), but not sure how it works. - * Buffers always have LINEAR layout. + /* At this point we don't know if the resource will be used as a texture, + * render target, or both, because gallium sets the bits whenever possible + * This matters because on some GPUs (GC2000) there is no tiling that is + * compatible with both TE and PE. + * + * We expect that depth/stencil buffers will always be used by PE (rendering), + * and any other non-scanout resource will be used as a texture at some point, + * So allocate a render-compatible base buffer for scanout/depthstencil buffers, + * and a texture-compatible base buffer in other cases + * */ - unsigned layout = ETNA_LAYOUT_LINEAR; - enum etna_resource_addressing_mode mode = ETNA_ADDRESSING_MODE_TILED; - - if (etna_resource_sampler_only(templat)) { - /* The buffer is only used for texturing, so create something - * directly compatible with the sampler. Such a buffer can - * never be rendered to. */ - layout = ETNA_LAYOUT_TILED; - - if (util_format_is_compressed(templat->format)) - layout = ETNA_LAYOUT_LINEAR; - } else if (templat->target != PIPE_BUFFER) { - bool want_multitiled = false; - bool want_supertiled = screen->specs.can_supertile; - - /* When this GPU supports single-buffer rendering, don't ever enable - * multi-tiling. This replicates the blob behavior on GC3000. - */ - if (!screen->specs.single_buffer) - want_multitiled = screen->specs.pixel_pipes > 1; - - /* Keep single byte blocksized resources as tiled, since we - * are unable to use the RS blit to de-tile them. However, - * if they're used as a render target or depth/stencil, they - * must be multi-tiled for GPUs with multiple pixel pipes. - * Ignore depth/stencil here, but it is an error for a render - * target. - */ - if (util_format_get_blocksize(templat->format) == 1 && - !(templat->bind & PIPE_BIND_DEPTH_STENCIL)) { - assert(!(templat->bind & PIPE_BIND_RENDER_TARGET && want_multitiled)); - want_multitiled = want_supertiled = false; - } - - layout = ETNA_LAYOUT_BIT_TILE; - if (want_multitiled) + if (templat->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DEPTH_STENCIL)) { + if (screen->specs.pixel_pipes > 1 && !screen->specs.single_buffer) layout |= ETNA_LAYOUT_BIT_MULTI; - if (want_supertiled) + if (screen->specs.can_supertile) layout |= ETNA_LAYOUT_BIT_SUPER; + } else if (VIV_FEATURE(screen, chipMinorFeatures2, SUPERTILED_TEXTURE) && + etna_resource_hw_tileable(screen->specs.use_blt, templat)) { + layout |= ETNA_LAYOUT_BIT_SUPER; } - if (templat->target == PIPE_TEXTURE_3D) + if ((templat->bind & PIPE_BIND_LINEAR) || /* linear base requested */ + templat->target == PIPE_BUFFER || /* buffer always linear */ + /* compressed textures don't use tiling, they have their own "tiles" */ + util_format_is_compressed(templat->format)) { layout = ETNA_LAYOUT_LINEAR; + } /* modifier is only used for scanout surfaces, so safe to use LINEAR here */ - return etna_resource_alloc(pscreen, layout, mode, DRM_FORMAT_MOD_LINEAR, templat); + return etna_resource_alloc(pscreen, layout, DRM_FORMAT_MOD_LINEAR, templat); } enum modifier_priority { @@ -470,31 +449,22 @@ */ tmpl.bind |= PIPE_BIND_SCANOUT; - return etna_resource_alloc(pscreen, modifier_to_layout(modifier), - ETNA_ADDRESSING_MODE_TILED, modifier, &tmpl); + return etna_resource_alloc(pscreen, modifier_to_layout(modifier), modifier, &tmpl); } static void etna_resource_changed(struct pipe_screen *pscreen, struct pipe_resource *prsc) { - struct etna_resource *res = etna_resource(prsc); - - if (res->external) - etna_resource(res->external)->seqno++; - else - res->seqno++; + etna_resource(prsc)->seqno++; } static void etna_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) { - struct etna_screen *screen = etna_screen(pscreen); struct etna_resource *rsc = etna_resource(prsc); - mtx_lock(&screen->lock); - _mesa_set_remove_key(screen->used_resources, rsc); + assert(!_mesa_set_next_entry(rsc->pending_ctx, NULL)); _mesa_set_destroy(rsc->pending_ctx, NULL); - mtx_unlock(&screen->lock); if (rsc->bo) etna_bo_del(rsc->bo); @@ -505,8 +475,10 @@ if (rsc->scanout) renderonly_scanout_destroy(rsc->scanout, etna_screen(pscreen)->ro); + util_range_destroy(&rsc->valid_buffer_range); + pipe_resource_reference(&rsc->texture, NULL); - pipe_resource_reference(&rsc->external, NULL); + pipe_resource_reference(&rsc->render, NULL); for (unsigned i = 0; i < ETNA_NUM_LOD; i++) FREE(rsc->levels[i].patch_offsets); @@ -523,7 +495,6 @@ struct etna_resource *rsc; struct etna_resource_level *level; struct pipe_resource *prsc; - struct pipe_resource *ptiled = NULL; DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, " "nr_samples=%u, usage=%u, bind=%x, flags=%x", @@ -541,6 +512,7 @@ *prsc = *tmpl; pipe_reference_init(&prsc->reference, 1); + util_range_init(&rsc->valid_buffer_range); prsc->screen = pscreen; rsc->bo = etna_screen_bo_from_handle(pscreen, handle, &level->stride); @@ -550,8 +522,6 @@ rsc->seqno = 1; rsc->layout = modifier_to_layout(handle->modifier); rsc->halign = TEXTURE_HALIGN_FOUR; - rsc->addressing_mode = ETNA_ADDRESSING_MODE_TILED; - level->width = tmpl->width0; level->height = tmpl->height0; @@ -594,35 +564,10 @@ if (!rsc->pending_ctx) goto fail; - if (rsc->layout == ETNA_LAYOUT_LINEAR) { - /* - * Both sampler and pixel pipes can't handle linear, create a compatible - * base resource, where we can attach the imported buffer as an external - * resource. - */ - struct pipe_resource tiled_templat = *tmpl; - - /* - * Remove BIND_SCANOUT to avoid recursion, as etna_resource_create uses - * this function to import the scanout buffer and get a tiled resource. - */ - tiled_templat.bind &= ~PIPE_BIND_SCANOUT; - - ptiled = etna_resource_create(pscreen, &tiled_templat); - if (!ptiled) - goto fail; - - etna_resource(ptiled)->external = prsc; - - return ptiled; - } - return prsc; fail: etna_resource_destroy(pscreen, prsc); - if (ptiled) - etna_resource_destroy(pscreen, ptiled); return NULL; } @@ -637,13 +582,6 @@ /* Scanout is always attached to the base resource */ struct renderonly_scanout *scanout = rsc->scanout; - /* - * External resources are preferred, so a import->export chain of - * render/sampler incompatible buffers yield the same handle. - */ - if (rsc->external) - rsc = etna_resource(rsc->external); - handle->stride = rsc->levels[0].stride; handle->offset = rsc->levels[0].offset; handle->modifier = layout_to_modifier(rsc->layout); @@ -665,51 +603,80 @@ } } +enum etna_resource_status +etna_resource_get_status(struct etna_context *ctx, struct etna_resource *rsc) +{ + enum etna_resource_status newstatus = 0; + + set_foreach(rsc->pending_ctx, entry) { + struct etna_context *extctx = (struct etna_context *)entry->key; + + set_foreach(extctx->used_resources_read, entry2) { + struct etna_resource *rsc2 = (struct etna_resource *)entry2->key; + if (ctx == extctx || rsc2 != rsc) + continue; + + newstatus |= ETNA_PENDING_READ; + } + + set_foreach(extctx->used_resources_write, entry2) { + struct etna_resource *rsc2 = (struct etna_resource *)entry2->key; + if (ctx == extctx || rsc2 != rsc) + continue; + + newstatus |= ETNA_PENDING_WRITE; + } + } + + return newstatus; +} + void etna_resource_used(struct etna_context *ctx, struct pipe_resource *prsc, enum etna_resource_status status) { - struct etna_screen *screen = ctx->screen; + struct pipe_resource *referenced = NULL; struct etna_resource *rsc; if (!prsc) return; + mtx_lock(&ctx->lock); + rsc = etna_resource(prsc); - mtx_lock(&screen->lock); + set_foreach(rsc->pending_ctx, entry) { + struct etna_context *extctx = (struct etna_context *)entry->key; + struct pipe_context *pctx = &extctx->base; + + set_foreach(extctx->used_resources_read, entry2) { + struct etna_resource *rsc2 = (struct etna_resource *)entry2->key; + if (ctx == extctx || rsc2 != rsc) + continue; - /* - * if we are pending read or write by any other context or - * if reading a resource pending a write, then - * flush all the contexts to maintain coherency - */ - if (((status & ETNA_PENDING_WRITE) && rsc->status) || - ((status & ETNA_PENDING_READ) && (rsc->status & ETNA_PENDING_WRITE))) { - set_foreach(rsc->pending_ctx, entry) { - struct etna_context *extctx = (struct etna_context *)entry->key; - struct pipe_context *pctx = &extctx->base; + if (status & ETNA_PENDING_WRITE) + pctx->flush(pctx, NULL, 0); + } - if (extctx == ctx) + set_foreach(extctx->used_resources_write, entry2) { + struct etna_resource *rsc2 = (struct etna_resource *)entry2->key; + if (ctx == extctx || rsc2 != rsc) continue; pctx->flush(pctx, NULL, 0); - /* It's safe to clear the status here. If we need to flush it means - * either another context had the resource in exclusive (write) use, - * or we transition the resource to exclusive use in our context. - * In both cases the new status accurately reflects the resource use - * after the flush. - */ - rsc->status = 0; } } - rsc->status |= status; + rsc->status = status; - _mesa_set_add(screen->used_resources, rsc); - _mesa_set_add(rsc->pending_ctx, ctx); + if (!_mesa_set_search(rsc->pending_ctx, ctx)) { + pipe_resource_reference(&referenced, prsc); + _mesa_set_add((status & ETNA_PENDING_READ) ? + ctx->used_resources_read : ctx->used_resources_write, rsc); + _mesa_set_add(rsc->pending_ctx, ctx); + } - mtx_unlock(&screen->lock); + mtx_unlock(&ctx->lock); } bool diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_resource.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_resource.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_resource.h 2020-06-12 01:21:16.000000000 +0000 @@ -30,9 +30,11 @@ #include "etnaviv_internal.h" #include "etnaviv_tiling.h" #include "pipe/p_state.h" +#include "util/format/u_format.h" #include "util/list.h" #include "util/set.h" #include "util/u_helpers.h" +#include "util/u_range.h" struct etna_context; struct pipe_screen; @@ -50,7 +52,7 @@ uint32_t ts_offset; uint32_t ts_layer_stride; uint32_t ts_size; - uint32_t clear_value; /* clear value of resource level (mainly for TS) */ + uint64_t clear_value; /* clear value of resource level (mainly for TS) */ bool ts_valid; uint8_t ts_mode; int8_t ts_compress_fmt; /* COLOR_COMPRESSION_FORMAT_* (-1 = disable) */ @@ -60,11 +62,6 @@ struct util_dynarray *patch_offsets; }; -enum etna_resource_addressing_mode { - ETNA_ADDRESSING_MODE_TILED = 0, - ETNA_ADDRESSING_MODE_LINEAR, -}; - /* status of queued up but not flushed reads and write operations. * In _transfer_map() we need to know if queued up rendering needs * to be flushed to preserve the order of cpu and gpu access. */ @@ -82,7 +79,6 @@ /* only lod 0 used for non-texture buffers */ /* Layout for surface (tiled, multitiled, split tiled, ...) */ enum etna_surface_layout layout; - enum etna_resource_addressing_mode addressing_mode; /* Horizontal alignment for texture unit (TEXTURE_HALIGN_*) */ unsigned halign; struct etna_bo *bo; /* Surface video memory */ @@ -90,13 +86,13 @@ struct etna_resource_level levels[ETNA_NUM_LOD]; - /* When we are rendering to a texture, we need a differently tiled resource */ + /* buffer range that has been initialized */ + struct util_range valid_buffer_range; + + /* for when TE doesn't support the base layout */ struct pipe_resource *texture; - /* - * If imported resources have an render/sampler incompatible tiling, we keep - * them as an external resource, which is blitted as needed. - */ - struct pipe_resource *external; + /* for when PE doesn't support the base layout */ + struct pipe_resource *render; enum etna_resource_status status; @@ -137,12 +133,26 @@ PIPE_BIND_SAMPLER_VIEW; } +static inline bool +etna_resource_hw_tileable(bool use_blt, const struct pipe_resource *pres) +{ + if (use_blt) + return true; + + /* RS can only tile 16bpp or 32bpp formats */ + return util_format_get_blocksize(pres->format) == 2 || + util_format_get_blocksize(pres->format) == 4; +} + static inline struct etna_resource * etna_resource(struct pipe_resource *p) { return (struct etna_resource *)p; } +enum etna_resource_status +etna_resource_get_status(struct etna_context *ctx, struct etna_resource *rsc); + void etna_resource_used(struct etna_context *ctx, struct pipe_resource *prsc, enum etna_resource_status status); @@ -169,8 +179,7 @@ struct pipe_resource * etna_resource_alloc(struct pipe_screen *pscreen, unsigned layout, - enum etna_resource_addressing_mode mode, uint64_t modifier, - const struct pipe_resource *templat); + uint64_t modifier, const struct pipe_resource *templat); void etna_resource_screen_init(struct pipe_screen *pscreen); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_rs.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_rs.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_rs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_rs.c 2020-06-12 01:21:16.000000000 +0000 @@ -50,6 +50,21 @@ #include +/* return a RS "compatible" format for use when copying */ +static uint32_t +etna_compatible_rs_format(enum pipe_format fmt) +{ + /* YUYV and UYVY are blocksize 4, but 2 bytes per pixel */ + if (fmt == PIPE_FORMAT_YUYV || fmt == PIPE_FORMAT_UYVY) + return RS_FORMAT_A4R4G4B4; + + switch (util_format_get_blocksize(fmt)) { + case 2: return RS_FORMAT_A4R4G4B4; + case 4: return RS_FORMAT_A8R8G8B8; + default: return ETNA_NO_MATCH; + } +} + void etna_compile_rs_state(struct etna_context *ctx, struct compiled_rs_state *cs, const struct rs_state *rs) @@ -250,7 +265,7 @@ /* Generate clear command for a surface (non-fast clear case) */ void etna_rs_gen_clear_surface(struct etna_context *ctx, struct etna_surface *surf, - uint32_t clear_value) + uint64_t clear_value) { struct etna_resource *dst = etna_resource(surf->base.texture); uint32_t format; @@ -286,7 +301,7 @@ .dither = {0xffffffff, 0xffffffff}, .width = surf->surf.padded_width, /* These must be padded to 16x4 if !LINEAR, otherwise RS will hang */ .height = surf->surf.padded_height, - .clear_value = {clear_value}, + .clear_value = {clear_value, clear_value >> 32, clear_value, clear_value >> 32}, .clear_mode = VIVS_RS_CLEAR_CONTROL_MODE_ENABLED1, .clear_bits = 0xffff }); @@ -298,10 +313,11 @@ { struct etna_context *ctx = etna_context(pctx); struct etna_surface *surf = etna_surface(dst); - uint32_t new_clear_value = etna_clear_blit_pack_rgba(surf->base.format, color->f); + uint64_t new_clear_value = etna_clear_blit_pack_rgba(surf->base.format, color); if (surf->surf.ts_size) { /* TS: use precompiled clear command */ ctx->framebuffer.TS_COLOR_CLEAR_VALUE = new_clear_value; + ctx->framebuffer.TS_COLOR_CLEAR_VALUE_EXT = new_clear_value >> 32; if (VIV_FEATURE(ctx->screen, chipMinorFeatures1, AUTO_DISABLE)) { /* Set number of color tiles to be filled */ @@ -392,6 +408,7 @@ const union pipe_color_union *color, double depth, unsigned stencil) { struct etna_context *ctx = etna_context(pctx); + mtx_lock(&ctx->lock); /* Flush color and depth cache before clearing anything. * This is especially important when coming from another surface, as @@ -437,6 +454,7 @@ etna_blit_clear_zs_rs(pctx, ctx->framebuffer_s.zsbuf, buffers, depth, stencil); etna_stall(ctx->stream, SYNC_RECIPIENT_RA, SYNC_RECIPIENT_PE); + mtx_unlock(&ctx->lock); } static bool @@ -533,6 +551,30 @@ *height_mask = h_align -1; } +static bool msaa_config(const struct pipe_resource *src, + const struct pipe_resource *dst, + int *msaa_xscale, + int *msaa_yscale) +{ + int src_xscale = 1, src_yscale = 1; + int dst_xscale = 1, dst_yscale = 1; + + assert(src->nr_samples <= 4); + assert(dst->nr_samples <= 4); + + translate_samples_to_xyscale(src->nr_samples, &src_xscale, &src_yscale); + translate_samples_to_xyscale(dst->nr_samples, &dst_xscale, &dst_yscale); + + /* RS does not support upscaling */ + if ((src_xscale < dst_xscale) || (src_yscale < dst_yscale)) + return false; + + *msaa_xscale = src_xscale - dst_xscale + 1; + *msaa_yscale = src_yscale - dst_yscale + 1; + + return true; +} + static bool etna_try_rs_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) @@ -547,8 +589,10 @@ assert(blit_info->src.level <= src->base.last_level); assert(blit_info->dst.level <= dst->base.last_level); - if (!translate_samples_to_xyscale(src->base.nr_samples, &msaa_xscale, &msaa_yscale, NULL)) + if (!msaa_config(&src->base, &dst->base, &msaa_xscale, &msaa_yscale)) { + DBG("upsampling not supported"); return false; + } /* The width/height are in pixels; they do not change as a result of * multi-sampling. So, when blitting from a 4x multisampled surface @@ -569,18 +613,19 @@ return false; } - unsigned src_format = blit_info->src.format; - unsigned dst_format = blit_info->dst.format; + /* Only support same format (used tiling/detiling) blits for now. + * TODO: figure out which different-format blits are possible and test them + * - fail if swizzle needed + * - avoid trying to convert between float/int formats? + */ + if (blit_info->src.format != blit_info->dst.format) + return false; - /* for a copy with same dst/src format, we can use a different format */ - if (translate_rs_format(src_format) == ETNA_NO_MATCH && - src_format == dst_format) { - src_format = dst_format = etna_compatible_rs_format(src_format); - } + uint32_t format = etna_compatible_rs_format(blit_info->dst.format); + if (format == ETNA_NO_MATCH) + return false; - if (translate_rs_format(src_format) == ETNA_NO_MATCH || - translate_rs_format(dst_format) == ETNA_NO_MATCH || - blit_info->scissor_enable || + if (blit_info->scissor_enable || blit_info->dst.box.depth != blit_info->src.box.depth || blit_info->dst.box.depth != 1) { return false; @@ -647,6 +692,8 @@ width & (w_align - 1) || height & (h_align - 1)) goto manual; + mtx_lock(&ctx->lock); + /* Always flush color and depth cache together before resolving. This works * around artifacts that appear in some cases when scanning out a texture * directly after it has been rendered to, such as rendering an animated web @@ -696,6 +743,7 @@ etna_set_state_reloc(ctx->stream, VIVS_TS_COLOR_SURFACE_BASE, &reloc); etna_set_state(ctx->stream, VIVS_TS_COLOR_CLEAR_VALUE, src_lev->clear_value); + etna_set_state(ctx->stream, VIVS_TS_COLOR_CLEAR_VALUE_EXT, src_lev->clear_value >> 32); source_ts_valid = true; } else { @@ -705,7 +753,7 @@ /* Kick off RS here */ etna_compile_rs_state(ctx, ©_to_screen, &(struct rs_state) { - .source_format = translate_rs_format(src_format), + .source_format = format, .source_tiling = src->layout, .source = src->bo, .source_offset = src_offset, @@ -714,7 +762,7 @@ .source_padded_height = src_lev->padded_height, .source_ts_valid = source_ts_valid, .source_ts_compressed = src_lev->ts_compress_fmt >= 0, - .dest_format = translate_rs_format(dst_format), + .dest_format = format, .dest_tiling = dst->layout, .dest = dst->bo, .dest_offset = dst_offset, @@ -736,6 +784,7 @@ dst->seqno++; dst_lev->ts_valid = false; ctx->dirty |= ETNA_DIRTY_DERIVE_TS; + mtx_unlock(&ctx->lock); return true; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_screen.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_screen.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -73,6 +73,7 @@ {"shaderdb", ETNA_DBG_SHADERDB, "Enable shaderdb output"}, {"no_singlebuffer",ETNA_DBG_NO_SINGLEBUF, "Disable single buffer feature"}, {"nir", ETNA_DBG_NIR, "use new NIR compiler"}, + {"deqp", ETNA_DBG_DEQP, "Hacks to run dEQP GLES3 tests"}, /* needs MESA_GLES_VERSION_OVERRIDE=3.0 */ DEBUG_NAMED_VALUE_END }; @@ -84,9 +85,6 @@ { struct etna_screen *screen = etna_screen(pscreen); - _mesa_set_destroy(screen->used_resources, NULL); - mtx_destroy(&screen->lock); - if (screen->perfmon) etna_perfmon_del(screen->perfmon); @@ -152,6 +150,7 @@ case PIPE_CAP_TGSI_TEXCOORD: case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: return 1; case PIPE_CAP_NATIVE_FENCE_FD: return screen->drm_version >= ETNA_DRM_VERSION_FENCE_FD; @@ -182,6 +181,8 @@ return 0; /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return DBG_ENABLED(ETNA_DBG_DEQP) ? 4 : 0; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: return 0; @@ -190,8 +191,15 @@ return 128; case PIPE_CAP_MAX_VERTEX_ELEMENT_SRC_OFFSET: return 255; + case PIPE_CAP_MAX_VERTEX_BUFFERS: + return screen->specs.stream_count; + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + return VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); + /* Texturing. */ + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return DBG_ENABLED(ETNA_DBG_NIR) && screen->specs.halti >= 2; case PIPE_CAP_MAX_TEXTURE_2D_SIZE: case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: /* TODO: verify */ return screen->specs.max_texture_size; @@ -210,7 +218,7 @@ case PIPE_CAP_MAX_TEXEL_OFFSET: return 7; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - return VIV_FEATURE(screen, chipMinorFeatures2, SEAMLESS_CUBE_MAP); + return screen->specs.seamless_cube_map; /* Timer queries. */ case PIPE_CAP_OCCLUSION_QUERY: @@ -272,6 +280,10 @@ enum pipe_shader_cap param) { struct etna_screen *screen = etna_screen(pscreen); + bool ubo_enable = screen->specs.halti >= 2 && DBG_ENABLED(ETNA_DBG_NIR); + + if (DBG_ENABLED(ETNA_DBG_DEQP)) + ubo_enable = true; switch (shader) { case PIPE_SHADER_FRAGMENT: @@ -307,7 +319,7 @@ case PIPE_SHADER_CAP_MAX_TEMPS: return 64; /* Max native temporaries. */ case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return 1; + return ubo_enable ? ETNA_MAX_CONST_BUF : 1; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: @@ -319,10 +331,11 @@ return 0; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: return VIV_FEATURE(screen, chipMinorFeatures0, HAS_SQRT_TRIG); - case PIPE_SHADER_CAP_INTEGERS: case PIPE_SHADER_CAP_INT64_ATOMICS: case PIPE_SHADER_CAP_FP16: return 0; + case PIPE_SHADER_CAP_INTEGERS: + return DBG_ENABLED(ETNA_DBG_NIR) && screen->specs.halti >= 2; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return shader == PIPE_SHADER_FRAGMENT @@ -331,6 +344,8 @@ case PIPE_SHADER_CAP_PREFERRED_IR: return DBG_ENABLED(ETNA_DBG_NIR) ? PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + if (ubo_enable) + return 16384; /* 16384 so state tracker enables UBOs */ return shader == PIPE_SHADER_FRAGMENT ? screen->specs.max_ps_uniforms * sizeof(float[4]) : screen->specs.max_vs_uniforms * sizeof(float[4]); @@ -350,7 +365,6 @@ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - case PIPE_SHADER_CAP_SCALAR_ISA: return 0; } @@ -403,6 +417,14 @@ supported = screen->specs.tex_astc; } + if (util_format_is_snorm(format)) + supported = VIV_FEATURE(screen, chipMinorFeatures2, HALTI1); + + if (format != PIPE_FORMAT_S8_UINT_Z24_UNORM && + (util_format_is_pure_integer(format) || util_format_is_float(format))) + supported = VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); + + if (!supported) return false; @@ -413,6 +435,61 @@ } static bool +gpu_supports_render_format(struct etna_screen *screen, enum pipe_format format, + unsigned sample_count) +{ + const uint32_t fmt = translate_pe_format(format); + + if (fmt == ETNA_NO_MATCH) + return false; + + /* Validate MSAA; number of samples must be allowed, and render target + * must have MSAA'able format. */ + if (sample_count > 1) { + if (!VIV_FEATURE(screen, chipFeatures, MSAA)) + return false; + if (!translate_samples_to_xyscale(sample_count, NULL, NULL)) + return false; + if (translate_ts_format(format) == ETNA_NO_MATCH) + return false; + } + + if (format == PIPE_FORMAT_R8_UNORM) + return VIV_FEATURE(screen, chipMinorFeatures5, HALTI5); + + /* figure out 8bpp RS clear to enable these formats */ + if (format == PIPE_FORMAT_R8_SINT || format == PIPE_FORMAT_R8_UINT) + return VIV_FEATURE(screen, chipMinorFeatures5, HALTI5); + + if (util_format_is_srgb(format)) + return VIV_FEATURE(screen, chipMinorFeatures5, HALTI3); + + if (util_format_is_pure_integer(format) || util_format_is_float(format)) + return VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); + + if (format == PIPE_FORMAT_R8G8_UNORM) + return VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); + + /* any other extended format is HALTI0 (only R10G10B10A2?) */ + if (fmt >= PE_FORMAT_R16F) + return VIV_FEATURE(screen, chipMinorFeatures1, HALTI0); + + return true; +} + +static bool +gpu_supports_vertex_format(struct etna_screen *screen, enum pipe_format format) +{ + if (translate_vertex_format_type(format) == ETNA_NO_MATCH) + return false; + + if (util_format_is_pure_integer(format)) + return VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); + + return true; +} + +static bool etna_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, @@ -430,19 +507,8 @@ return false; if (usage & PIPE_BIND_RENDER_TARGET) { - /* if render target, must be RS-supported format */ - if (translate_rs_format(format) != ETNA_NO_MATCH) { - /* Validate MSAA; number of samples must be allowed, and render target - * must have MSAA'able format. */ - if (sample_count > 1) { - if (translate_samples_to_xyscale(sample_count, NULL, NULL, NULL) && - translate_ts_format(format) != ETNA_NO_MATCH) { - allowed |= PIPE_BIND_RENDER_TARGET; - } - } else { - allowed |= PIPE_BIND_RENDER_TARGET; - } - } + if (gpu_supports_render_format(screen, format, sample_count)) + allowed |= PIPE_BIND_RENDER_TARGET; } if (usage & PIPE_BIND_DEPTH_STENCIL) { @@ -461,7 +527,7 @@ } if (usage & PIPE_BIND_VERTEX_BUFFER) { - if (translate_vertex_format_type(format) != ETNA_NO_MATCH) + if (gpu_supports_vertex_format(screen, format)) allowed |= PIPE_BIND_VERTEX_BUFFER; } @@ -653,6 +719,10 @@ screen->specs.vertex_sampler_offset = 8; screen->specs.fragment_sampler_count = 8; screen->specs.vertex_sampler_count = 4; + + if (screen->model == 0x400) + screen->specs.vertex_sampler_count = 0; + screen->specs.vs_need_z_div = screen->model < 0x1000 && screen->model != 0x880; screen->specs.has_sin_cos_sqrt = @@ -669,6 +739,9 @@ VIV_FEATURE(screen, chipMinorFeatures4, HALTI2); screen->specs.v4_compression = VIV_FEATURE(screen, chipMinorFeatures6, V4_COMPRESSION); + screen->specs.seamless_cube_map = + (screen->model != 0x880) && /* Seamless cubemap is broken on GC880? */ + VIV_FEATURE(screen, chipMinorFeatures2, SEAMLESS_CUBE_MAP); if (screen->specs.halti >= 5) { /* GC7000 - this core must load shaders from memory. */ @@ -752,7 +825,8 @@ if (screen->specs.single_buffer) DBG("etnaviv: Single buffer mode enabled with %d pixel pipes", screen->specs.pixel_pipes); - screen->specs.tex_astc = VIV_FEATURE(screen, chipMinorFeatures4, TEXTURE_ASTC); + screen->specs.tex_astc = VIV_FEATURE(screen, chipMinorFeatures4, TEXTURE_ASTC) && + !VIV_FEATURE(screen, chipMinorFeatures6, NO_ASTC); screen->specs.use_blt = VIV_FEATURE(screen, chipMinorFeatures5, BLT_ENGINE); @@ -897,6 +971,11 @@ if (!etna_get_specs(screen)) goto fail; + if (screen->specs.halti >= 5 && !etnaviv_device_softpin_capable(dev)) { + DBG("halti5 requires softpin"); + goto fail; + } + screen->options = (nir_shader_compiler_options) { .lower_fpow = true, .lower_sub = true, @@ -954,16 +1033,8 @@ if (screen->drm_version >= ETNA_DRM_VERSION_PERFMON) etna_pm_query_setup(screen); - mtx_init(&screen->lock, mtx_recursive); - screen->used_resources = _mesa_set_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - if (!screen->used_resources) - goto fail2; - return pscreen; -fail2: - mtx_destroy(&screen->lock); fail: etna_screen_destroy(pscreen); return NULL; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_screen.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_screen.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -85,10 +85,6 @@ uint32_t drm_version; - /* set of resources used by currently-unsubmitted renders */ - mtx_t lock; - struct set *used_resources; - nir_shader_compiler_options options; }; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_shader.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_shader.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_shader.c 2020-06-12 01:21:17.000000000 +0000 @@ -151,6 +151,10 @@ cs->VS_OUTPUT_COUNT_PSIZE = cs->VS_OUTPUT_COUNT; } + /* if fragment shader doesn't read pointcoord, disable it */ + if (link.pcoord_varying_comp_ofs == -1) + cs->PA_CONFIG &= ~VIVS_PA_CONFIG_POINT_SPRITE_ENABLE; + cs->VS_LOAD_BALANCING = vs->vs_load_balancing; cs->VS_START_PC = 0; @@ -161,7 +165,6 @@ VIVS_PS_INPUT_COUNT_UNK8(fs->input_count_unk8); cs->PS_TEMP_REGISTER_CONTROL = VIVS_PS_TEMP_REGISTER_CONTROL_NUM_TEMPS(MAX2(fs->num_temps, link.num_varyings + 1)); - cs->PS_CONTROL = VIVS_PS_CONTROL_SATURATE_RT0; /* XXX when can we set BYPASS? */ cs->PS_START_PC = 0; /* Precompute PS_INPUT_COUNT and TEMP_REGISTER_CONTROL in the case of MSAA @@ -199,6 +202,9 @@ VIVS_GL_HALTI5_SH_SPECIALS_PS_PCOORD_IN((link.pcoord_varying_comp_ofs != -1) ? link.pcoord_varying_comp_ofs : 0x7f); + /* mask out early Z bit when frag depth is written */ + cs->PE_DEPTH_CONFIG = ~COND(fs->ps_depth_out_reg >= 0, VIVS_PE_DEPTH_CONFIG_EARLY_Z); + /* reference instruction memory */ cs->vs_inst_mem_size = vs->code_size; cs->VS_INST_MEM = vs->code; @@ -206,7 +212,7 @@ cs->ps_inst_mem_size = fs->code_size; cs->PS_INST_MEM = fs->code; - if (vs->needs_icache | fs->needs_icache) { + if (vs->needs_icache || fs->needs_icache) { /* If either of the shaders needs ICACHE, we use it for both. It is * either switched on or off for the entire shader processor. */ @@ -278,6 +284,20 @@ etna_bitarray_set(vs_input, 8, idx, cur_temp++); } + if (vs->vs_id_in_reg >= 0) { + cs->VS_INPUT_COUNT = VIVS_VS_INPUT_COUNT_COUNT(num_vs_inputs + 1) | + VIVS_VS_INPUT_COUNT_UNK8(vs->input_count_unk8) | + VIVS_VS_INPUT_COUNT_ID_ENABLE; + + etna_bitarray_set(vs_input, 8, num_vs_inputs, vs->vs_id_in_reg); + + cs->FE_HALTI5_ID_CONFIG = + VIVS_FE_HALTI5_ID_CONFIG_VERTEX_ID_ENABLE | + VIVS_FE_HALTI5_ID_CONFIG_INSTANCE_ID_ENABLE | + VIVS_FE_HALTI5_ID_CONFIG_VERTEX_ID_REG(vs->vs_id_in_reg * 4) | + VIVS_FE_HALTI5_ID_CONFIG_INSTANCE_ID_REG(vs->vs_id_in_reg * 4 + 1); + } + for (int idx = 0; idx < ARRAY_SIZE(cs->VS_INPUT); ++idx) cs->VS_INPUT[idx] = vs_input[idx]; @@ -303,15 +323,12 @@ if (!unlikely(etna_mesa_debug & ETNA_DBG_SHADERDB)) return; - pipe_debug_message(debug, SHADER_INFO, "\n" - "SHADER-DB: %s prog %d/%d: %u instructions %u temps\n" - "SHADER-DB: %s prog %d/%d: %u immediates %u loops\n", + pipe_debug_message(debug, SHADER_INFO, + "%s shader: %u instructions, %u temps, " + "%u immediates, %u loops", etna_shader_stage(v), - v->shader->id, v->id, v->code_size, v->num_temps, - etna_shader_stage(v), - v->shader->id, v->id, v->uniforms.imm_count, v->num_loops); } diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_shader.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_shader.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_shader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_shader.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,6 +43,8 @@ /* do we need to swap rb in frag color? */ unsigned frag_rb_swap : 1; + /* do we need to invert front facing value? */ + unsigned front_ccw : 1; }; uint32_t global; }; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_state.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_state.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -52,10 +52,12 @@ ctx->stencil_ref_s = *sr; - cs->PE_STENCIL_CONFIG = VIVS_PE_STENCIL_CONFIG_REF_FRONT(sr->ref_value[0]); - /* rest of bits weaved in from depth_stencil_alpha */ - cs->PE_STENCIL_CONFIG_EXT = - VIVS_PE_STENCIL_CONFIG_EXT_REF_BACK(sr->ref_value[0]); + for (unsigned i = 0; i < 2; i++) { + cs->PE_STENCIL_CONFIG[i] = + VIVS_PE_STENCIL_CONFIG_REF_FRONT(sr->ref_value[i]); + cs->PE_STENCIL_CONFIG_EXT[i] = + VIVS_PE_STENCIL_CONFIG_EXT_REF_BACK(sr->ref_value[!i]); + } ctx->dirty |= ETNA_DIRTY_STENCIL_REF; } @@ -81,24 +83,19 @@ { struct etna_context *ctx = etna_context(pctx); - if (unlikely(index > 0)) { - DBG("Unhandled buffer index %i", index); - return; - } + assert(index < ETNA_MAX_CONST_BUF); - - util_copy_constant_buffer(&ctx->constant_buffer[shader], cb); + util_copy_constant_buffer(&ctx->constant_buffer[shader][index], cb); /* Note that the state tracker can unbind constant buffers by * passing NULL here. */ if (unlikely(!cb || (!cb->buffer && !cb->user_buffer))) return; - /* there is no support for ARB_uniform_buffer_object */ - assert(cb->buffer == NULL && cb->user_buffer != NULL); + assert(index != 0 || cb->user_buffer != NULL); if (!cb->buffer) { - struct pipe_constant_buffer *cb = &ctx->constant_buffer[shader]; + struct pipe_constant_buffer *cb = &ctx->constant_buffer[shader][index]; u_upload_data(pctx->const_uploader, 0, cb->buffer_size, 16, cb->user_buffer, &cb->buffer_offset, &cb->buffer); } @@ -106,20 +103,25 @@ } static void -etna_update_render_resource(struct pipe_context *pctx, struct pipe_resource *pres) +etna_update_render_resource(struct pipe_context *pctx, struct etna_resource *base) { - struct etna_resource *res = etna_resource(pres); + struct etna_resource *to = base, *from = base; + + if (base->texture && etna_resource_newer(etna_resource(base->texture), base)) + from = etna_resource(base->texture); - if (res->texture && etna_resource_older(res, etna_resource(res->texture))) { - /* The render buffer is older than the texture buffer. Copy it over. */ - etna_copy_resource(pctx, pres, res->texture, 0, pres->last_level); - res->seqno = etna_resource(res->texture)->seqno; + if (base->render) + to = etna_resource(base->render); + + if ((to != from) && etna_resource_older(to, from)) { + etna_copy_resource(pctx, &to->base, &from->base, 0, base->base.last_level); + to->seqno = from->seqno; } } static void etna_set_framebuffer_state(struct pipe_context *pctx, - const struct pipe_framebuffer_state *sv) + const struct pipe_framebuffer_state *fb) { struct etna_context *ctx = etna_context(pctx); struct compiled_framebuffer_state *cs = &ctx->framebuffer; @@ -129,17 +131,24 @@ /* Set up TS as well. Warning: this state is used by both the RS and PE */ uint32_t ts_mem_config = 0; uint32_t pe_mem_config = 0; + uint32_t pe_logic_op = 0; - if (sv->nr_cbufs > 0) { /* at least one color buffer? */ - struct etna_surface *cbuf = etna_surface(sv->cbufs[0]); + if (fb->nr_cbufs > 0) { /* at least one color buffer? */ + struct etna_surface *cbuf = etna_surface(fb->cbufs[0]); struct etna_resource *res = etna_resource(cbuf->base.texture); bool color_supertiled = (res->layout & ETNA_LAYOUT_BIT_SUPER) != 0; + uint32_t fmt = translate_pe_format(cbuf->base.format); assert(res->layout & ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */ - etna_update_render_resource(pctx, cbuf->base.texture); + etna_update_render_resource(pctx, etna_resource(cbuf->prsc)); + + if (fmt >= PE_FORMAT_R16F) + cs->PE_COLOR_FORMAT = VIVS_PE_COLOR_FORMAT_FORMAT_EXT(fmt) | + VIVS_PE_COLOR_FORMAT_FORMAT_MASK; + else + cs->PE_COLOR_FORMAT = VIVS_PE_COLOR_FORMAT_FORMAT(fmt); - cs->PE_COLOR_FORMAT = - VIVS_PE_COLOR_FORMAT_FORMAT(translate_rs_format(cbuf->base.format)) | + cs->PE_COLOR_FORMAT |= VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK | VIVS_PE_COLOR_FORMAT_OVERWRITE | COND(color_supertiled, VIVS_PE_COLOR_FORMAT_SUPER_TILED) | @@ -174,6 +183,7 @@ if (cbuf->surf.ts_size) { cs->TS_COLOR_CLEAR_VALUE = cbuf->level->clear_value; + cs->TS_COLOR_CLEAR_VALUE_EXT = cbuf->level->clear_value >> 32; cs->TS_COLOR_STATUS_BASE = cbuf->ts_reloc; cs->TS_COLOR_STATUS_BASE.flags = ETNA_RELOC_READ | ETNA_RELOC_WRITE; @@ -195,6 +205,13 @@ } nr_samples_color = cbuf->base.texture->nr_samples; + + if (util_format_is_srgb(cbuf->base.format)) + pe_logic_op |= VIVS_PE_LOGIC_OP_SRGB; + + cs->PS_CONTROL = COND(util_format_is_unorm(cbuf->base.format), VIVS_PS_CONTROL_SATURATE_RT0); + cs->PS_CONTROL_EXT = + VIVS_PS_CONTROL_EXT_OUTPUT_MODE0(translate_output_mode(cbuf->base.format, ctx->specs.halti >= 5)); } else { /* Clearing VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK and * VIVS_PE_COLOR_FORMAT_OVERWRITE prevents us from overwriting the @@ -209,11 +226,11 @@ cs->PE_PIPE_COLOR_ADDR[i] = ctx->dummy_rt_reloc; } - if (sv->zsbuf != NULL) { - struct etna_surface *zsbuf = etna_surface(sv->zsbuf); + if (fb->zsbuf != NULL) { + struct etna_surface *zsbuf = etna_surface(fb->zsbuf); struct etna_resource *res = etna_resource(zsbuf->base.texture); - etna_update_render_resource(pctx, zsbuf->base.texture); + etna_update_render_resource(pctx, etna_resource(zsbuf->prsc)); assert(res->layout &ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */ @@ -328,10 +345,10 @@ /* Scissor setup */ cs->SE_SCISSOR_LEFT = 0; /* affected by rasterizer and scissor state as well */ cs->SE_SCISSOR_TOP = 0; - cs->SE_SCISSOR_RIGHT = (sv->width << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT; - cs->SE_SCISSOR_BOTTOM = (sv->height << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM; - cs->SE_CLIP_RIGHT = (sv->width << 16) + ETNA_SE_CLIP_MARGIN_RIGHT; - cs->SE_CLIP_BOTTOM = (sv->height << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM; + cs->SE_SCISSOR_RIGHT = (fb->width << 16) + ETNA_SE_SCISSOR_MARGIN_RIGHT; + cs->SE_SCISSOR_BOTTOM = (fb->height << 16) + ETNA_SE_SCISSOR_MARGIN_BOTTOM; + cs->SE_CLIP_RIGHT = (fb->width << 16) + ETNA_SE_CLIP_MARGIN_RIGHT; + cs->SE_CLIP_BOTTOM = (fb->height << 16) + ETNA_SE_CLIP_MARGIN_BOTTOM; cs->TS_MEM_CONFIG = ts_mem_config; cs->PE_MEM_CONFIG = pe_mem_config; @@ -339,11 +356,13 @@ /* Single buffer setup. There is only one switch for this, not a separate * one per color buffer / depth buffer. To keep the logic simple always use * single buffer when this feature is available. + * note: the blob will use 2 in some situations, figure out why? */ - cs->PE_LOGIC_OP = VIVS_PE_LOGIC_OP_SINGLE_BUFFER(ctx->specs.single_buffer ? 3 : 0); + pe_logic_op |= VIVS_PE_LOGIC_OP_SINGLE_BUFFER(ctx->specs.single_buffer ? 3 : 0); + cs->PE_LOGIC_OP = pe_logic_op; /* keep copy of original structure */ - util_copy_framebuffer_state(&ctx->framebuffer_s, sv); + util_copy_framebuffer_state(&ctx->framebuffer_s, fb); ctx->dirty |= ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_DERIVE_TS; } @@ -519,29 +538,14 @@ /* XXX could minimize number of consecutive stretches here by sorting, and * permuting the inputs in shader or does Mesa do this already? */ - /* Check that vertex element binding is compatible with hardware; thus - * elements[idx].vertex_buffer_index are < stream_count. If not, the binding - * uses more streams than is supported, and u_vbuf should have done some - * reorganization for compatibility. */ - - /* TODO: does mesa this for us? */ - bool incompatible = false; - for (unsigned idx = 0; idx < num_elements; ++idx) { - if (elements[idx].vertex_buffer_index >= ctx->specs.stream_count || elements[idx].instance_divisor > 0) - incompatible = true; - } - cs->num_elements = num_elements; - if (incompatible || num_elements == 0) { - DBG("Error: zero vertex elements, or more vertex buffers used than supported"); - FREE(cs); - return NULL; - } unsigned start_offset = 0; /* start of current consecutive stretch */ bool nonconsecutive = true; /* previous value of nonconsecutive */ + uint32_t buffer_mask = 0; /* mask of buffer_idx already seen */ for (unsigned idx = 0; idx < num_elements; ++idx) { + unsigned buffer_idx = elements[idx].vertex_buffer_index; unsigned element_size = util_format_get_blocksize(elements[idx].src_format); unsigned end_offset = elements[idx].src_offset + element_size; uint32_t format_type, normalize; @@ -549,12 +553,15 @@ if (nonconsecutive) start_offset = elements[idx].src_offset; + /* guaranteed by PIPE_CAP_MAX_VERTEX_BUFFERS */ + assert(buffer_idx < ctx->specs.stream_count); + /* maximum vertex size is 256 bytes */ - assert(element_size != 0 && end_offset <= 256); + assert(element_size != 0 && (end_offset - start_offset) < 256); /* check whether next element is consecutive to this one */ nonconsecutive = (idx == (num_elements - 1)) || - elements[idx + 1].vertex_buffer_index != elements[idx].vertex_buffer_index || + elements[idx + 1].vertex_buffer_index != buffer_idx || end_offset != elements[idx + 1].src_offset; format_type = translate_vertex_format_type(elements[idx].src_format); @@ -569,7 +576,7 @@ format_type | VIVS_FE_VERTEX_ELEMENT_CONFIG_NUM(util_format_get_nr_components(elements[idx].src_format)) | normalize | VIVS_FE_VERTEX_ELEMENT_CONFIG_ENDIAN(ENDIAN_MODE_NO_SWAP) | - VIVS_FE_VERTEX_ELEMENT_CONFIG_STREAM(elements[idx].vertex_buffer_index) | + VIVS_FE_VERTEX_ELEMENT_CONFIG_STREAM(buffer_idx) | VIVS_FE_VERTEX_ELEMENT_CONFIG_START(elements[idx].src_offset) | VIVS_FE_VERTEX_ELEMENT_CONFIG_END(end_offset - start_offset); } else { /* HALTI5 spread vertex attrib config over two registers */ @@ -577,13 +584,26 @@ format_type | VIVS_NFE_GENERIC_ATTRIB_CONFIG0_NUM(util_format_get_nr_components(elements[idx].src_format)) | normalize | VIVS_NFE_GENERIC_ATTRIB_CONFIG0_ENDIAN(ENDIAN_MODE_NO_SWAP) | - VIVS_NFE_GENERIC_ATTRIB_CONFIG0_STREAM(elements[idx].vertex_buffer_index) | + VIVS_NFE_GENERIC_ATTRIB_CONFIG0_STREAM(buffer_idx) | VIVS_NFE_GENERIC_ATTRIB_CONFIG0_START(elements[idx].src_offset); cs->NFE_GENERIC_ATTRIB_CONFIG1[idx] = COND(nonconsecutive, VIVS_NFE_GENERIC_ATTRIB_CONFIG1_NONCONSECUTIVE) | VIVS_NFE_GENERIC_ATTRIB_CONFIG1_END(end_offset - start_offset); } - cs->NFE_GENERIC_ATTRIB_SCALE[idx] = 0x3f800000; /* 1 for integer, 1.0 for float */ + + if (util_format_is_pure_integer(elements[idx].src_format)) + cs->NFE_GENERIC_ATTRIB_SCALE[idx] = 1; + else + cs->NFE_GENERIC_ATTRIB_SCALE[idx] = fui(1.0f); + + /* instance_divisor is part of elements state but should be the same for all buffers */ + if (buffer_mask & 1 << buffer_idx) + assert(cs->NFE_VERTEX_STREAMS_VERTEX_DIVISOR[buffer_idx] == elements[idx].instance_divisor); + else + cs->NFE_VERTEX_STREAMS_VERTEX_DIVISOR[buffer_idx] = elements[idx].instance_divisor; + + buffer_mask |= 1 << buffer_idx; + cs->num_buffers = MAX2(cs->num_buffers, buffer_idx + 1); } return cs; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_surface.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_surface.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,12 +37,47 @@ #include "hw/common.xml.h" +#include "drm-uapi/drm_fourcc.h" + +static struct etna_resource * +etna_render_handle_incompatible(struct pipe_context *pctx, struct pipe_resource *prsc) +{ + struct etna_context *ctx = etna_context(pctx); + struct etna_resource *res = etna_resource(prsc); + bool need_multitiled = ctx->specs.pixel_pipes > 1 && !ctx->specs.single_buffer; + bool want_supertiled = ctx->specs.can_supertile; + + /* Resource is compatible if it is tiled and has multi tiling when required + * TODO: LINEAR_PE feature means render to linear is possible ? + */ + if (res->layout != ETNA_LAYOUT_LINEAR && + (!need_multitiled || (res->layout & ETNA_LAYOUT_BIT_MULTI))) + return res; + + if (!res->render) { + struct pipe_resource templat = *prsc; + unsigned layout = ETNA_LAYOUT_TILED; + if (need_multitiled) + layout |= ETNA_LAYOUT_BIT_MULTI; + if (want_supertiled) + layout |= ETNA_LAYOUT_BIT_SUPER; + + templat.bind &= (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET | + PIPE_BIND_BLENDABLE); + res->render = + etna_resource_alloc(pctx->screen, layout, + DRM_FORMAT_MOD_LINEAR, &templat); + assert(res->render); + } + return etna_resource(res->render); +} + static struct pipe_surface * etna_create_surface(struct pipe_context *pctx, struct pipe_resource *prsc, const struct pipe_surface *templat) { struct etna_context *ctx = etna_context(pctx); - struct etna_resource *rsc = etna_resource(prsc); + struct etna_resource *rsc = etna_render_handle_incompatible(pctx, prsc); struct etna_surface *surf = CALLOC_STRUCT(etna_surface); if (!surf) @@ -57,6 +92,7 @@ pipe_reference_init(&surf->base.reference, 1); pipe_resource_reference(&surf->base.texture, &rsc->base); + pipe_resource_reference(&surf->prsc, prsc); /* Allocate a TS for the resource if there isn't one yet, * and it is allowed by the hw (width is a multiple of 16). @@ -67,13 +103,15 @@ if (VIV_FEATURE(ctx->screen, chipFeatures, FAST_CLEAR) && VIV_FEATURE(ctx->screen, chipMinorFeatures0, MC20) && !rsc->ts_bo && + /* needs to be RS/BLT compatible for transfer_map/unmap */ (rsc->levels[level].padded_width & ETNA_RS_WIDTH_MASK) == 0 && - (rsc->levels[level].padded_height & ETNA_RS_HEIGHT_MASK) == 0) { + (rsc->levels[level].padded_height & ETNA_RS_HEIGHT_MASK) == 0 && + etna_resource_hw_tileable(ctx->specs.use_blt, prsc)) { etna_screen_resource_alloc_ts(pctx->screen, rsc); } surf->base.texture = &rsc->base; - surf->base.format = rsc->base.format; + surf->base.format = templat->format; surf->base.width = rsc->levels[level].width; surf->base.height = rsc->levels[level].height; surf->base.writable = templat->writable; /* what is this for anyway */ @@ -148,6 +186,7 @@ etna_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf) { pipe_resource_reference(&psurf->texture, NULL); + pipe_resource_reference(&etna_surface(psurf)->prsc, NULL); FREE(psurf); } diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_surface.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_surface.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_surface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_surface.h 2020-06-12 01:21:17.000000000 +0000 @@ -41,6 +41,8 @@ struct etna_resource_level *level; struct etna_reloc reloc[ETNA_MAX_PIXELPIPES]; struct etna_reloc ts_reloc; + /* keep pointer to original resource (for when a render compatible resource is used) */ + struct pipe_resource *prsc; }; static inline struct etna_surface * diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,7 @@ #include "etnaviv_context.h" #include "etnaviv_emit.h" #include "etnaviv_format.h" +#include "etnaviv_texture_desc.h" #include "etnaviv_texture_state.h" #include "etnaviv_translate.h" #include "util/u_inlines.h" @@ -73,31 +74,40 @@ ctx->dirty |= ETNA_DIRTY_SAMPLERS; } -static void +static bool etna_configure_sampler_ts(struct etna_sampler_ts *sts, struct pipe_sampler_view *pview, bool enable) { + bool dirty = (sts->enable != enable); + assert(sts); sts->enable = enable; - if (enable) { - struct etna_resource *rsc = etna_resource(pview->texture); - struct etna_resource_level *lev = &rsc->levels[0]; - assert(rsc->ts_bo && lev->ts_valid); - - sts->mode = lev->ts_mode; - sts->TS_SAMPLER_CONFIG = - VIVS_TS_SAMPLER_CONFIG_ENABLE | - COND(lev->ts_compress_fmt >= 0, VIVS_TS_SAMPLER_CONFIG_COMPRESSION) | - VIVS_TS_SAMPLER_CONFIG_COMPRESSION_FORMAT(lev->ts_compress_fmt); - sts->TS_SAMPLER_CLEAR_VALUE = lev->clear_value; - sts->TS_SAMPLER_CLEAR_VALUE2 = lev->clear_value; /* To handle 64-bit formats this needs a different value */ - sts->TS_SAMPLER_STATUS_BASE.bo = rsc->ts_bo; - sts->TS_SAMPLER_STATUS_BASE.offset = lev->ts_offset; - sts->TS_SAMPLER_STATUS_BASE.flags = ETNA_RELOC_READ; - } else { + + if (!enable) { sts->TS_SAMPLER_CONFIG = 0; sts->TS_SAMPLER_STATUS_BASE.bo = NULL; + return dirty; } - /* n.b.: relies on caller to mark ETNA_DIRTY_SAMPLER_VIEWS */ + + struct etna_resource *rsc = etna_resource(pview->texture); + struct etna_resource_level *lev = &rsc->levels[0]; + + if (lev->clear_value != sts->TS_SAMPLER_CLEAR_VALUE) + dirty = true; + + assert(rsc->ts_bo && lev->ts_valid); + + sts->mode = lev->ts_mode; + sts->TS_SAMPLER_CONFIG = + VIVS_TS_SAMPLER_CONFIG_ENABLE | + COND(lev->ts_compress_fmt >= 0, VIVS_TS_SAMPLER_CONFIG_COMPRESSION) | + VIVS_TS_SAMPLER_CONFIG_COMPRESSION_FORMAT(lev->ts_compress_fmt); + sts->TS_SAMPLER_CLEAR_VALUE = lev->clear_value; + sts->TS_SAMPLER_CLEAR_VALUE2 = lev->clear_value >> 32; + sts->TS_SAMPLER_STATUS_BASE.bo = rsc->ts_bo; + sts->TS_SAMPLER_STATUS_BASE.offset = lev->ts_offset; + sts->TS_SAMPLER_STATUS_BASE.flags = ETNA_RELOC_READ; + + return dirty; } /* Return true if the GPU can use sampler TS with this sampler view. @@ -121,6 +131,7 @@ */ struct etna_resource *rsc = etna_resource(view->texture); struct etna_screen *screen = etna_screen(rsc->base.screen); + return VIV_FEATURE(screen, chipMinorFeatures2, TEXTURE_TILED_READ) && num < VIVS_TS_SAMPLER__LEN && rsc->base.target != PIPE_BUFFER && @@ -129,7 +140,7 @@ rsc->levels[0].ts_valid; } -static void +void etna_update_sampler_source(struct pipe_sampler_view *view, int num) { struct etna_resource *base = etna_resource(view->texture); @@ -137,8 +148,8 @@ struct etna_context *ctx = etna_context(view->context); bool enable_sampler_ts = false; - if (base->external && etna_resource_newer(etna_resource(base->external), base)) - from = etna_resource(base->external); + if (base->render && etna_resource_newer(etna_resource(base->render), base)) + from = etna_resource(base->render); if (base->texture) to = etna_resource(base->texture); @@ -147,6 +158,7 @@ etna_copy_resource(view->context, &to->base, &from->base, 0, view->texture->last_level); to->seqno = from->seqno; + ctx->dirty |= ETNA_DIRTY_TEXTURE_CACHES; } else if ((to == from) && etna_resource_needs_flush(to)) { if (ctx->ts_for_sampler_view && etna_can_use_sampler_ts(view, num)) { enable_sampler_ts = true; @@ -156,10 +168,16 @@ etna_copy_resource(view->context, &to->base, &from->base, 0, view->texture->last_level); to->flush_seqno = from->seqno; + ctx->dirty |= ETNA_DIRTY_TEXTURE_CACHES; } + } else if ((to == from) && (to->flush_seqno < from->seqno)) { + to->flush_seqno = from->seqno; + ctx->dirty |= ETNA_DIRTY_TEXTURE_CACHES; } - if (ctx->ts_for_sampler_view) { - etna_configure_sampler_ts(ctx->ts_for_sampler_view(view), view, enable_sampler_ts); + if (ctx->ts_for_sampler_view && + etna_configure_sampler_ts(ctx->ts_for_sampler_view(view), view, enable_sampler_ts)) { + ctx->dirty |= ETNA_DIRTY_SAMPLER_VIEWS | ETNA_DIRTY_TEXTURE_CACHES; + ctx->dirty_sampler_views |= (1 << num); } } @@ -207,7 +225,6 @@ PIPE_BIND_BLENDABLE); res->texture = etna_resource_alloc(pctx->screen, ETNA_LAYOUT_TILED, - ETNA_ADDRESSING_MODE_TILED, DRM_FORMAT_MOD_LINEAR, &templat); } @@ -277,11 +294,6 @@ ctx->dirty |= ETNA_DIRTY_SAMPLER_VIEWS | ETNA_DIRTY_TEXTURE_CACHES; - for (unsigned idx = 0; idx < num_views; ++idx) { - if (views[idx]) - etna_update_sampler_source(views[idx], idx); - } - switch (shader) { case PIPE_SHADER_FRAGMENT: etna_fragtex_set_sampler_views(ctx, num_views, views); @@ -299,7 +311,9 @@ struct etna_context *ctx = etna_context(pctx); /* clear color and texture cache to make sure that texture unit reads * what has been written */ + mtx_lock(&ctx->lock); etna_set_state(ctx->stream, VIVS_GL_FLUSH_CACHE, VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_TEXTURE); + mtx_unlock(&ctx->lock); } uint32_t @@ -311,8 +325,14 @@ void etna_texture_init(struct pipe_context *pctx) { + struct etna_context *ctx = etna_context(pctx); + pctx->bind_sampler_states = etna_bind_sampler_states; pctx->set_sampler_views = etna_set_sampler_views; pctx->texture_barrier = etna_texture_barrier; - etna_texture_state_init(pctx); + + if (ctx->specs.halti >= 5) + etna_texture_desc_init(pctx); + else + etna_texture_state_init(pctx); } diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2017 Etnaviv Project + * Copyright (C) 2017 Zodiac Inflight Innovations + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Wladimir J. van der Laan + */ + +#include "etnaviv_texture_desc.h" + +#include "hw/common.xml.h" +#include "hw/texdesc_3d.xml.h" + +#include "etnaviv_clear_blit.h" +#include "etnaviv_context.h" +#include "etnaviv_emit.h" +#include "etnaviv_format.h" +#include "etnaviv_translate.h" +#include "etnaviv_texture.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#include + +struct etna_sampler_state_desc { + struct pipe_sampler_state base; + uint32_t SAMP_CTRL0; + uint32_t SAMP_CTRL1; + uint32_t SAMP_LOD_MINMAX; + uint32_t SAMP_LOD_BIAS; +}; + +static inline struct etna_sampler_state_desc * +etna_sampler_state_desc(struct pipe_sampler_state *samp) +{ + return (struct etna_sampler_state_desc *)samp; +} + +struct etna_sampler_view_desc { + struct pipe_sampler_view base; + /* format-dependent merged with sampler state */ + uint32_t SAMP_CTRL0; + uint32_t SAMP_CTRL1; + + struct etna_bo *bo; + struct etna_reloc DESC_ADDR; + struct etna_sampler_ts ts; +}; + +static inline struct etna_sampler_view_desc * +etna_sampler_view_desc(struct pipe_sampler_view *view) +{ + return (struct etna_sampler_view_desc *)view; +} + +static void * +etna_create_sampler_state_desc(struct pipe_context *pipe, + const struct pipe_sampler_state *ss) +{ + struct etna_sampler_state_desc *cs = CALLOC_STRUCT(etna_sampler_state_desc); + + if (!cs) + return NULL; + + cs->SAMP_CTRL0 = + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_UWRAP(translate_texture_wrapmode(ss->wrap_s)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_VWRAP(translate_texture_wrapmode(ss->wrap_t)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_WWRAP(translate_texture_wrapmode(ss->wrap_r)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_MIN(translate_texture_filter(ss->min_img_filter)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_MIP(translate_texture_mipfilter(ss->min_mip_filter)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_MAG(translate_texture_filter(ss->mag_img_filter)) | + VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_UNK21; + /* no ROUND_UV bit? */ + cs->SAMP_CTRL1 = VIVS_NTE_DESCRIPTOR_SAMP_CTRL1_UNK1; + uint32_t min_lod_fp8 = MIN2(etna_float_to_fixp88(ss->min_lod), 0xfff); + uint32_t max_lod_fp8 = MIN2(etna_float_to_fixp88(ss->max_lod), 0xfff); + uint32_t max_lod_min = ss->min_img_filter != ss->mag_img_filter ? 4 : 0; + + if (ss->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + cs->SAMP_LOD_MINMAX = + VIVS_NTE_DESCRIPTOR_SAMP_LOD_MINMAX_MAX(MAX2(max_lod_fp8, max_lod_min)) | + VIVS_NTE_DESCRIPTOR_SAMP_LOD_MINMAX_MIN(min_lod_fp8); + } else { + cs->SAMP_LOD_MINMAX = + VIVS_NTE_DESCRIPTOR_SAMP_LOD_MINMAX_MAX(MAX2(max_lod_fp8, max_lod_min)) | + VIVS_NTE_DESCRIPTOR_SAMP_LOD_MINMAX_MIN(min_lod_fp8); + } + cs->SAMP_LOD_BIAS = + VIVS_NTE_DESCRIPTOR_SAMP_LOD_BIAS_BIAS(etna_float_to_fixp88(ss->lod_bias)) | + COND(ss->lod_bias != 0.0, VIVS_NTE_DESCRIPTOR_SAMP_LOD_BIAS_ENABLE); + + return cs; +} + +static void +etna_delete_sampler_state_desc(struct pipe_context *pctx, void *ss) +{ + FREE(ss); +} + +static struct pipe_sampler_view * +etna_create_sampler_view_desc(struct pipe_context *pctx, struct pipe_resource *prsc, + const struct pipe_sampler_view *so) +{ + const struct util_format_description *desc = util_format_description(so->format); + struct etna_sampler_view_desc *sv = CALLOC_STRUCT(etna_sampler_view_desc); + struct etna_context *ctx = etna_context(pctx); + const uint32_t format = translate_texture_format(so->format); + const bool ext = !!(format & EXT_FORMAT); + const bool astc = !!(format & ASTC_FORMAT); + const uint32_t swiz = get_texture_swiz(so->format, so->swizzle_r, + so->swizzle_g, so->swizzle_b, + so->swizzle_a); + + if (!sv) + return NULL; + + struct etna_resource *res = etna_texture_handle_incompatible(pctx, prsc); + if (!res) { + free(sv); + return NULL; + } + + sv->base = *so; + pipe_reference_init(&sv->base.reference, 1); + sv->base.texture = NULL; + pipe_resource_reference(&sv->base.texture, prsc); + sv->base.context = pctx; + + /* Determine whether target supported */ + uint32_t target_hw = translate_texture_target(sv->base.target); + if (target_hw == ETNA_NO_MATCH) { + BUG("Unhandled texture target"); + free(sv); + return NULL; + } + + /* Texture descriptor sampler bits */ + if (util_format_is_srgb(so->format)) + sv->SAMP_CTRL1 |= VIVS_NTE_DESCRIPTOR_SAMP_CTRL1_SRGB; + + if (texture_use_int_filter(so, true)) + sv->SAMP_CTRL0 |= VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_INT_FILTER; + + /* Create texture descriptor */ + sv->bo = etna_bo_new(ctx->screen->dev, 0x100, DRM_ETNA_GEM_CACHE_WC); + if (!sv->bo) + goto error; + + uint32_t *buf = etna_bo_map(sv->bo); + etna_bo_cpu_prep(sv->bo, DRM_ETNA_PREP_WRITE); + memset(buf, 0, 0x100); + + /** GC7000 needs the size of the BASELOD level */ + uint32_t base_width = u_minify(res->base.width0, sv->base.u.tex.first_level); + uint32_t base_height = u_minify(res->base.height0, sv->base.u.tex.first_level); + uint32_t base_depth = u_minify(res->base.depth0, sv->base.u.tex.first_level); + bool is_array = false; + bool sint = util_format_is_pure_sint(so->format); + + if (sv->base.target == PIPE_TEXTURE_1D_ARRAY) { + is_array = true; + base_height = res->base.array_size; + } else if (sv->base.target == PIPE_TEXTURE_2D_ARRAY) { + is_array = true; + base_depth = res->base.array_size; + } + +#define DESC_SET(x, y) buf[(TEXDESC_##x)>>2] = (y) + DESC_SET(CONFIG0, COND(!ext && !astc, VIVS_TE_SAMPLER_CONFIG0_FORMAT(format)) + | VIVS_TE_SAMPLER_CONFIG0_TYPE(target_hw) | + COND(res->layout == ETNA_LAYOUT_LINEAR && !util_format_is_compressed(so->format), + VIVS_TE_SAMPLER_CONFIG0_ADDRESSING_MODE(TEXTURE_ADDRESSING_MODE_LINEAR))); + DESC_SET(CONFIG1, COND(ext, VIVS_TE_SAMPLER_CONFIG1_FORMAT_EXT(format)) | + COND(astc, VIVS_TE_SAMPLER_CONFIG1_FORMAT_EXT(TEXTURE_FORMAT_EXT_ASTC)) | + COND(is_array, VIVS_TE_SAMPLER_CONFIG1_TEXTURE_ARRAY) | + VIVS_TE_SAMPLER_CONFIG1_HALIGN(res->halign) | swiz); + DESC_SET(CONFIG2, 0x00030000 | + COND(sint && desc->channel[0].size == 8, TE_SAMPLER_CONFIG2_SIGNED_INT8) | + COND(sint && desc->channel[0].size == 16, TE_SAMPLER_CONFIG2_SIGNED_INT16)); + DESC_SET(LINEAR_STRIDE, res->levels[0].stride); + DESC_SET(VOLUME, etna_log2_fixp88(base_depth)); + DESC_SET(SLICE, res->levels[0].layer_stride); + DESC_SET(3D_CONFIG, VIVS_TE_SAMPLER_3D_CONFIG_DEPTH(base_depth)); + DESC_SET(ASTC0, COND(astc, VIVS_NTE_SAMPLER_ASTC0_ASTC_FORMAT(format)) | + VIVS_NTE_SAMPLER_ASTC0_UNK8(0xc) | + VIVS_NTE_SAMPLER_ASTC0_UNK16(0xc) | + VIVS_NTE_SAMPLER_ASTC0_UNK24(0xc)); + DESC_SET(BASELOD, TEXDESC_BASELOD_BASELOD(sv->base.u.tex.first_level) | + TEXDESC_BASELOD_MAXLOD(MIN2(sv->base.u.tex.last_level, res->base.last_level))); + DESC_SET(LOG_SIZE_EXT, TEXDESC_LOG_SIZE_EXT_WIDTH(etna_log2_fixp88(base_width)) | + TEXDESC_LOG_SIZE_EXT_HEIGHT(etna_log2_fixp88(base_height))); + DESC_SET(SIZE, VIVS_TE_SAMPLER_SIZE_WIDTH(base_width) | + VIVS_TE_SAMPLER_SIZE_HEIGHT(base_height)); + for (int lod = 0; lod <= res->base.last_level; ++lod) + DESC_SET(LOD_ADDR(lod), etna_bo_gpu_va(res->bo) + res->levels[lod].offset); +#undef DESC_SET + + etna_bo_cpu_fini(sv->bo); + + sv->DESC_ADDR.bo = sv->bo; + sv->DESC_ADDR.offset = 0; + sv->DESC_ADDR.flags = ETNA_RELOC_READ; + + return &sv->base; +error: + free(sv); + return NULL; +} + +static void +etna_sampler_view_update_descriptor(struct etna_context *ctx, + struct etna_cmd_stream *stream, + struct etna_sampler_view_desc *sv) +{ + /* TODO: this should instruct the kernel to update the descriptor when the + * bo is submitted. For now, just prevent the bo from being freed + * while it is in use indirectly. + */ + struct etna_resource *res = etna_resource(sv->base.texture); + if (res->texture) { + res = etna_resource(res->texture); + } + /* No need to ref LOD levels individually as they'll always come from the same bo */ + etna_cmd_stream_ref_bo(stream, res->bo, ETNA_RELOC_READ); +} + +static void +etna_sampler_view_desc_destroy(struct pipe_context *pctx, + struct pipe_sampler_view *so) +{ + struct etna_sampler_view_desc *sv = etna_sampler_view_desc(so); + pipe_resource_reference(&sv->base.texture, NULL); + etna_bo_del(sv->bo); + FREE(sv); +} + +static void +etna_emit_texture_desc(struct etna_context *ctx) +{ + struct etna_cmd_stream *stream = ctx->stream; + uint32_t active_samplers = active_samplers_bits(ctx); + uint32_t dirty = ctx->dirty; + + if (unlikely(dirty & ETNA_DIRTY_SAMPLER_VIEWS)) { + for (int x = 0; x < VIVS_TS_SAMPLER__LEN; ++x) { + if ((1 << x) & active_samplers) { + struct etna_sampler_view_desc *sv = etna_sampler_view_desc(ctx->sampler_view[x]); + struct etna_resource *res = etna_resource(sv->base.texture); + struct etna_reloc LOD_ADDR_0; + + if (!sv->ts.enable) + continue; + + etna_set_state(stream, VIVS_TS_SAMPLER_CONFIG(x), sv->ts.TS_SAMPLER_CONFIG); + etna_set_state_reloc(stream, VIVS_TS_SAMPLER_STATUS_BASE(x), &sv->ts.TS_SAMPLER_STATUS_BASE); + etna_set_state(stream, VIVS_TS_SAMPLER_CLEAR_VALUE(x), sv->ts.TS_SAMPLER_CLEAR_VALUE); + etna_set_state(stream, VIVS_TS_SAMPLER_CLEAR_VALUE2(x), sv->ts.TS_SAMPLER_CLEAR_VALUE2); + + LOD_ADDR_0.bo = res->bo; + LOD_ADDR_0.offset = res->levels[0].offset; + LOD_ADDR_0.flags = ETNA_RELOC_READ; + + etna_set_state_reloc(stream, VIVS_TS_SAMPLER_SURFACE_BASE(x), &LOD_ADDR_0); + } + } + } + + if (unlikely(dirty & (ETNA_DIRTY_SAMPLERS | ETNA_DIRTY_SAMPLER_VIEWS))) { + for (int x = 0; x < PIPE_MAX_SAMPLERS; ++x) { + if ((1 << x) & active_samplers) { + struct etna_sampler_state_desc *ss = etna_sampler_state_desc(ctx->sampler[x]); + struct etna_sampler_view_desc *sv = etna_sampler_view_desc(ctx->sampler_view[x]); + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_TX_CTRL(x), + COND(sv->ts.enable, VIVS_NTE_DESCRIPTOR_TX_CTRL_TS_ENABLE) | + VIVS_NTE_DESCRIPTOR_TX_CTRL_TS_MODE(sv->ts.mode) | + VIVS_NTE_DESCRIPTOR_TX_CTRL_TS_INDEX(x)); + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_SAMP_CTRL0(x), ss->SAMP_CTRL0 | sv->SAMP_CTRL0); + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_SAMP_CTRL1(x), ss->SAMP_CTRL1 | sv->SAMP_CTRL1); + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_SAMP_LOD_MINMAX(x), ss->SAMP_LOD_MINMAX); + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_SAMP_LOD_BIAS(x), ss->SAMP_LOD_BIAS); + } + } + } + + if (unlikely(dirty & ETNA_DIRTY_SAMPLER_VIEWS)) { + /* Set texture descriptors */ + for (int x = 0; x < PIPE_MAX_SAMPLERS; ++x) { + if ((1 << x) & ctx->dirty_sampler_views) { + if ((1 << x) & active_samplers) { + struct etna_sampler_view_desc *sv = etna_sampler_view_desc(ctx->sampler_view[x]); + etna_sampler_view_update_descriptor(ctx, stream, sv); + etna_set_state_reloc(stream, VIVS_NTE_DESCRIPTOR_ADDR(x), &sv->DESC_ADDR); + } else { + /* dummy texture descriptors for unused samplers */ + etna_set_state_reloc(stream, VIVS_NTE_DESCRIPTOR_ADDR(x), &ctx->DUMMY_DESC_ADDR); + } + } + } + } + + if (unlikely(dirty & ETNA_DIRTY_SAMPLER_VIEWS)) { + /* Invalidate all dirty sampler views. + */ + for (int x = 0; x < PIPE_MAX_SAMPLERS; ++x) { + if ((1 << x) & ctx->dirty_sampler_views) { + etna_set_state(stream, VIVS_NTE_DESCRIPTOR_INVALIDATE, + VIVS_NTE_DESCRIPTOR_INVALIDATE_UNK29 | + VIVS_NTE_DESCRIPTOR_INVALIDATE_IDX(x)); + } + } + } +} + +static struct etna_sampler_ts* +etna_ts_for_sampler_view_state(struct pipe_sampler_view *pview) +{ + struct etna_sampler_view_desc *sv = etna_sampler_view_desc(pview); + return &sv->ts; +} + +void +etna_texture_desc_init(struct pipe_context *pctx) +{ + struct etna_context *ctx = etna_context(pctx); + DBG("etnaviv: Using descriptor-based texturing\n"); + ctx->base.create_sampler_state = etna_create_sampler_state_desc; + ctx->base.delete_sampler_state = etna_delete_sampler_state_desc; + ctx->base.create_sampler_view = etna_create_sampler_view_desc; + ctx->base.sampler_view_destroy = etna_sampler_view_desc_destroy; + ctx->emit_texture_state = etna_emit_texture_desc; + ctx->ts_for_sampler_view = etna_ts_for_sampler_view_state; +} + diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_desc.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 Etnaviv Project + * Copyright (C) 2017 Zodiac Inflight Innovations + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Wladimir J. van der Laan + */ + +#ifndef H_ETNAVIV_TEXTURE_DESC +#define H_ETNAVIV_TEXTURE_DESC + +#include "etnaviv_texture.h" + +/* Initialize context for descriptor-based texture views and descriptors */ +void +etna_texture_desc_init(struct pipe_context *pctx); + +#endif + diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture.h 2020-06-12 01:21:17.000000000 +0000 @@ -61,4 +61,8 @@ uint32_t active_samplers_bits(struct etna_context *ctx); +/* update TS / cache for a sampler if required */ +void +etna_update_sampler_source(struct pipe_sampler_view *view, int num); + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -39,11 +39,56 @@ #include "drm-uapi/drm_fourcc.h" +struct etna_sampler_state { + struct pipe_sampler_state base; + + /* sampler offset +4*sampler, interleave when committing state */ + uint32_t TE_SAMPLER_CONFIG0; + uint32_t TE_SAMPLER_CONFIG1; + uint32_t TE_SAMPLER_LOD_CONFIG; + uint32_t TE_SAMPLER_3D_CONFIG; + uint32_t NTE_SAMPLER_BASELOD; + unsigned min_lod, max_lod, max_lod_min; +}; + +static inline struct etna_sampler_state * +etna_sampler_state(struct pipe_sampler_state *samp) +{ + return (struct etna_sampler_state *)samp; +} + +struct etna_sampler_view { + struct pipe_sampler_view base; + + /* sampler offset +4*sampler, interleave when committing state */ + uint32_t TE_SAMPLER_CONFIG0; + uint32_t TE_SAMPLER_CONFIG0_MASK; + uint32_t TE_SAMPLER_CONFIG1; + uint32_t TE_SAMPLER_3D_CONFIG; + uint32_t TE_SAMPLER_SIZE; + uint32_t TE_SAMPLER_LOG_SIZE; + uint32_t TE_SAMPLER_ASTC0; + uint32_t TE_SAMPLER_LINEAR_STRIDE[VIVS_TE_SAMPLER_LINEAR_STRIDE__LEN]; + struct etna_reloc TE_SAMPLER_LOD_ADDR[VIVS_TE_SAMPLER_LOD_ADDR__LEN]; + unsigned min_lod, max_lod; /* 5.5 fixp */ + + struct etna_sampler_ts ts; +}; + +static inline struct etna_sampler_view * +etna_sampler_view(struct pipe_sampler_view *view) +{ + return (struct etna_sampler_view *)view; +} + static void * etna_create_sampler_state_state(struct pipe_context *pipe, const struct pipe_sampler_state *ss) { struct etna_sampler_state *cs = CALLOC_STRUCT(etna_sampler_state); + struct etna_context *ctx = etna_context(pipe); + struct etna_screen *screen = ctx->screen; + const bool ansio = ss->max_anisotropy > 1; if (!cs) return NULL; @@ -61,8 +106,8 @@ cs->TE_SAMPLER_CONFIG0 |= VIVS_TE_SAMPLER_CONFIG0_ROUND_UV; } - cs->TE_SAMPLER_CONFIG1 = - COND(ss->seamless_cube_map, VIVS_TE_SAMPLER_CONFIG1_SEAMLESS_CUBE_MAP); + cs->TE_SAMPLER_CONFIG1 = screen->specs.seamless_cube_map ? + COND(ss->seamless_cube_map, VIVS_TE_SAMPLER_CONFIG1_SEAMLESS_CUBE_MAP) : 0; cs->TE_SAMPLER_LOD_CONFIG = COND(ss->lod_bias != 0.0, VIVS_TE_SAMPLER_LOD_CONFIG_BIAS_ENABLE) | @@ -86,6 +131,10 @@ */ cs->max_lod_min = (ss->min_img_filter != ss->mag_img_filter) ? 1 : 0; + cs->NTE_SAMPLER_BASELOD = + COND(ss->compare_mode, VIVS_NTE_SAMPLER_BASELOD_COMPARE_ENABLE) | + VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC(translate_texture_compare(ss->compare_func)); + return cs; } @@ -157,7 +206,7 @@ break; } - if (res->addressing_mode == ETNA_ADDRESSING_MODE_LINEAR) { + if (res->layout == ETNA_LAYOUT_LINEAR && !util_format_is_compressed(so->format)) { sv->TE_SAMPLER_CONFIG0 |= VIVS_TE_SAMPLER_CONFIG0_ADDRESSING_MODE(TEXTURE_ADDRESSING_MODE_LINEAR); for (int lod = 0; lod <= res->base.last_level; ++lod) @@ -183,7 +232,8 @@ VIVS_TE_SAMPLER_LOG_SIZE_WIDTH(etna_log2_fixp55(res->base.width0)) | VIVS_TE_SAMPLER_LOG_SIZE_HEIGHT(etna_log2_fixp55(base_height)) | COND(util_format_is_srgb(so->format) && !astc, VIVS_TE_SAMPLER_LOG_SIZE_SRGB) | - COND(astc, VIVS_TE_SAMPLER_LOG_SIZE_ASTC); + COND(astc, VIVS_TE_SAMPLER_LOG_SIZE_ASTC) | + COND(texture_use_int_filter(so, false), VIVS_TE_SAMPLER_LOG_SIZE_INT_FILTER); sv->TE_SAMPLER_3D_CONFIG = VIVS_TE_SAMPLER_3D_CONFIG_DEPTH(base_depth) | VIVS_TE_SAMPLER_3D_CONFIG_LOG_DEPTH(etna_log2_fixp55(base_depth)); @@ -365,6 +415,14 @@ } } } + if (unlikely(ctx->specs.halti >= 1 && (dirty & (ETNA_DIRTY_SAMPLER_VIEWS)))) { + for (int x = 0; x < VIVS_TE_SAMPLER__LEN; ++x) { + if ((1 << x) & active_samplers) { + struct etna_sampler_state *ss = etna_sampler_state(ctx->sampler[x]); + /*10700*/ EMIT_STATE(NTE_SAMPLER_BASELOD(x), ss->NTE_SAMPLER_BASELOD); + } + } + } etna_coalesce_end(stream, &coalesce); } diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_texture_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,56 +27,8 @@ #ifndef H_ETNAVIV_TEXTURE_PLAIN #define H_ETNAVIV_TEXTURE_PLAIN -#include "drm/etnaviv_drmif.h" - #include "etnaviv_texture.h" -#include "pipe/p_context.h" -#include "pipe/p_state.h" - -#include "hw/state_3d.xml.h" - -struct etna_sampler_state { - struct pipe_sampler_state base; - - /* sampler offset +4*sampler, interleave when committing state */ - uint32_t TE_SAMPLER_CONFIG0; - uint32_t TE_SAMPLER_CONFIG1; - uint32_t TE_SAMPLER_LOD_CONFIG; - uint32_t TE_SAMPLER_3D_CONFIG; - unsigned min_lod, max_lod, max_lod_min; -}; - -static inline struct etna_sampler_state * -etna_sampler_state(struct pipe_sampler_state *samp) -{ - return (struct etna_sampler_state *)samp; -} - -struct etna_sampler_view { - struct pipe_sampler_view base; - - /* sampler offset +4*sampler, interleave when committing state */ - uint32_t TE_SAMPLER_CONFIG0; - uint32_t TE_SAMPLER_CONFIG0_MASK; - uint32_t TE_SAMPLER_CONFIG1; - uint32_t TE_SAMPLER_3D_CONFIG; - uint32_t TE_SAMPLER_SIZE; - uint32_t TE_SAMPLER_LOG_SIZE; - uint32_t TE_SAMPLER_ASTC0; - uint32_t TE_SAMPLER_LINEAR_STRIDE[VIVS_TE_SAMPLER_LINEAR_STRIDE__LEN]; - struct etna_reloc TE_SAMPLER_LOD_ADDR[VIVS_TE_SAMPLER_LOD_ADDR__LEN]; - unsigned min_lod, max_lod; /* 5.5 fixp */ - - struct etna_sampler_ts ts; -}; - -static inline struct etna_sampler_view * -etna_sampler_view(struct pipe_sampler_view *view) -{ - return (struct etna_sampler_view *)view; -} - /* Initialize context for "plain" (non-descriptor, state-based) texture views * and descriptors */ void diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_tiling.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_tiling.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_tiling.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_tiling.c 2020-06-12 01:21:17.000000000 +0000 @@ -68,7 +68,9 @@ unsigned dst_stride, unsigned width, unsigned height, unsigned src_stride, unsigned elmtsize) { - if (elmtsize == 4) { + if (elmtsize == 8) { + DO_TILE(uint64_t) + } else if (elmtsize == 4) { DO_TILE(uint32_t) } else if (elmtsize == 2) { DO_TILE(uint16_t) @@ -84,7 +86,9 @@ unsigned src_stride, unsigned width, unsigned height, unsigned dst_stride, unsigned elmtsize) { - if (elmtsize == 4) { + if (elmtsize == 8) { + DO_UNTILE(uint64_t) + } else if (elmtsize == 4) { DO_UNTILE(uint32_t); } else if (elmtsize == 2) { DO_UNTILE(uint16_t); diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_transfer.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_transfer.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_transfer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_transfer.c 2020-06-12 01:21:17.000000000 +0000 @@ -35,7 +35,7 @@ #include "pipe/p_format.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_surface.h" @@ -174,6 +174,14 @@ if (!trans->rsc && !(ptrans->usage & PIPE_TRANSFER_UNSYNCHRONIZED)) etna_bo_cpu_fini(rsc->bo); + if ((ptrans->resource->target == PIPE_BUFFER) && + (ptrans->usage & PIPE_TRANSFER_WRITE)) { + util_range_add(&rsc->base, + &rsc->valid_buffer_range, + ptrans->box.x, + ptrans->box.x + ptrans->box.width); + } + pipe_resource_reference(&trans->rsc, NULL); pipe_resource_reference(&ptrans->resource, NULL); slab_free(&ctx->transfer_pool, trans); @@ -199,13 +207,16 @@ /* slab_alloc() doesn't zero */ memset(trans, 0, sizeof(*trans)); - ptrans = &trans->base; - pipe_resource_reference(&ptrans->resource, prsc); - ptrans->level = level; - ptrans->usage = usage; - ptrans->box = *box; - - assert(level <= prsc->last_level); + /* + * Upgrade to UNSYNCHRONIZED if target is PIPE_BUFFER and range is uninitialized. + */ + if ((usage & PIPE_TRANSFER_WRITE) && + (prsc->target == PIPE_BUFFER) && + !util_ranges_intersect(&rsc->valid_buffer_range, + box->x, + box->x + box->width)) { + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is * being mapped. If we add buffer reallocation to avoid CPU/GPU sync this @@ -221,6 +232,25 @@ usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; } + ptrans = &trans->base; + pipe_resource_reference(&ptrans->resource, prsc); + ptrans->level = level; + ptrans->usage = usage; + ptrans->box = *box; + + assert(level <= prsc->last_level); + + /* This one is a little tricky: if we have a separate render resource, which + * is newer than the base resource we want the transfer to target this one, + * to get the most up-to-date content, but only if we don't have a texture + * target of the same age, as transfering in/out of the texture target is + * generally preferred for the reasons listed below */ + if (rsc->render && etna_resource_newer(etna_resource(rsc->render), rsc) && + (!rsc->texture || etna_resource_newer(etna_resource(rsc->render), + etna_resource(rsc->texture)))) { + rsc = etna_resource(rsc->render); + } + if (rsc->texture && !etna_resource_newer(rsc, etna_resource(rsc->texture))) { /* We have a texture resource which is the same age or newer than the * render resource. Use the texture resource, which avoids bouncing @@ -228,7 +258,7 @@ rsc = etna_resource(rsc->texture); } else if (rsc->ts_bo || (rsc->layout != ETNA_LAYOUT_LINEAR && - util_format_get_blocksize(format) > 1 && + etna_resource_hw_tileable(ctx->specs.use_blt, prsc) && /* HALIGN 4 resources are incompatible with the resolve engine, * so fall back to using software to detile this resource. */ rsc->halign != TEXTURE_HALIGN_FOUR)) { @@ -254,8 +284,7 @@ templ.bind = PIPE_BIND_RENDER_TARGET; trans->rsc = etna_resource_alloc(pctx->screen, ETNA_LAYOUT_LINEAR, - ETNA_ADDRESSING_MODE_TILED, DRM_FORMAT_MOD_LINEAR, - &templ); + DRM_FORMAT_MOD_LINEAR, &templ); if (!trans->rsc) { slab_free(&ctx->transfer_pool, trans); return NULL; @@ -284,7 +313,7 @@ } if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) - etna_copy_resource_box(pctx, trans->rsc, prsc, level, &ptrans->box); + etna_copy_resource_box(pctx, trans->rsc, &rsc->base, level, &ptrans->box); /* Switch to using the temporary resource instead */ rsc = etna_resource(trans->rsc); @@ -347,7 +376,6 @@ * transfers without a temporary resource. */ if (trans->rsc || !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { - struct etna_screen *screen = ctx->screen; uint32_t prep_flags = 0; /* @@ -356,7 +384,7 @@ * current GPU usage (reads must wait for GPU writes, writes must have * exclusive access to the buffer). */ - mtx_lock(&screen->lock); + mtx_lock(&ctx->lock); if ((trans->rsc && (etna_resource(trans->rsc)->status & ETNA_PENDING_WRITE)) || (!trans->rsc && @@ -370,7 +398,7 @@ } } - mtx_unlock(&screen->lock); + mtx_unlock(&ctx->lock); if (usage & PIPE_TRANSFER_READ) prep_flags |= DRM_ETNA_PREP_READ; @@ -462,10 +490,16 @@ static void etna_transfer_flush_region(struct pipe_context *pctx, - struct pipe_transfer *transfer, + struct pipe_transfer *ptrans, const struct pipe_box *box) { - /* NOOP for now */ + struct etna_resource *rsc = etna_resource(ptrans->resource); + + if (ptrans->resource->target == PIPE_BUFFER) + util_range_add(&rsc->base, + &rsc->valid_buffer_range, + ptrans->box.x + box->x, + ptrans->box.x + box->x + box->width); } void diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_translate.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_translate.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_translate.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_translate.h 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,7 @@ #include "hw/state.xml.h" #include "hw/state_3d.xml.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" /* Returned when there is no match of pipe value to etna value */ @@ -234,28 +234,10 @@ } } -/* return a RS "compatible" format for use when copying */ -static inline enum pipe_format -etna_compatible_rs_format(enum pipe_format fmt) -{ - /* YUYV and UYVY are blocksize 4, but 2 bytes per pixel */ - if (fmt == PIPE_FORMAT_YUYV || fmt == PIPE_FORMAT_UYVY) - return PIPE_FORMAT_B4G4R4A4_UNORM; - - switch (util_format_get_blocksize(fmt)) { - case 2: - return PIPE_FORMAT_B4G4R4A4_UNORM; - case 4: - return PIPE_FORMAT_B8G8R8A8_UNORM; - default: - return fmt; - } -} - static inline int translate_rb_src_dst_swap(enum pipe_format src, enum pipe_format dst) { - return translate_rs_format_rb_swap(src) ^ translate_rs_format_rb_swap(dst); + return translate_pe_format_rb_swap(src) ^ translate_pe_format_rb_swap(dst); } static inline uint32_t @@ -320,11 +302,33 @@ /* assumes that normalization of channel 0 holds for all channels; * this holds for all vertex formats that we support */ return desc->channel[0].normalized - ? VIVS_FE_VERTEX_ELEMENT_CONFIG_NORMALIZE_ON + ? VIVS_FE_VERTEX_ELEMENT_CONFIG_NORMALIZE_SIGN_EXTEND : VIVS_FE_VERTEX_ELEMENT_CONFIG_NORMALIZE_OFF; } static inline uint32_t +translate_output_mode(enum pipe_format fmt, bool halti5) +{ + const unsigned bits = + util_format_get_component_bits(fmt, UTIL_FORMAT_COLORSPACE_RGB, 0); + + if (bits == 32) + return COLOR_OUTPUT_MODE_UIF32; + + if (!util_format_is_pure_integer(fmt)) + return COLOR_OUTPUT_MODE_NORMAL; + + /* generic integer output mode pre-halti5 (?) */ + if (bits == 10 || !halti5) + return COLOR_OUTPUT_MODE_A2B10G10R10UI; + + if (util_format_is_pure_sint(fmt)) + return bits == 8 ? COLOR_OUTPUT_MODE_I8 : COLOR_OUTPUT_MODE_I16; + + return bits == 8 ? COLOR_OUTPUT_MODE_U8 : COLOR_OUTPUT_MODE_U16; +} + +static inline uint32_t translate_index_size(unsigned index_size) { switch (index_size) { @@ -435,32 +439,26 @@ return clear_value; } -/* Convert MSAA number of samples to x and y scaling factor and - * VIVS_GL_MULTI_SAMPLE_CONFIG value. +/* Convert MSAA number of samples to x and y scaling factor. * Return true if supported and false otherwise. */ static inline bool -translate_samples_to_xyscale(int num_samples, int *xscale_out, int *yscale_out, - uint32_t *config_out) +translate_samples_to_xyscale(int num_samples, int *xscale_out, int *yscale_out) { int xscale, yscale; - uint32_t config; switch (num_samples) { case 0: case 1: xscale = 1; yscale = 1; - config = VIVS_GL_MULTI_SAMPLE_CONFIG_MSAA_SAMPLES_NONE; break; case 2: xscale = 2; yscale = 1; - config = VIVS_GL_MULTI_SAMPLE_CONFIG_MSAA_SAMPLES_2X; break; case 4: xscale = 2; yscale = 2; - config = VIVS_GL_MULTI_SAMPLE_CONFIG_MSAA_SAMPLES_4X; break; default: return false; @@ -470,8 +468,6 @@ *xscale_out = xscale; if (yscale_out) *yscale_out = yscale; - if (config_out) - *config_out = config; return true; } @@ -497,4 +493,29 @@ } } +static inline uint32_t +translate_texture_compare(enum pipe_compare_func compare_func) +{ + switch (compare_func) { + case PIPE_FUNC_NEVER: + return TEXTURE_COMPARE_FUNC_NEVER; + case PIPE_FUNC_LESS: + return TEXTURE_COMPARE_FUNC_LESS; + case PIPE_FUNC_EQUAL: + return TEXTURE_COMPARE_FUNC_EQUAL; + case PIPE_FUNC_LEQUAL: + return TEXTURE_COMPARE_FUNC_LEQUAL; + case PIPE_FUNC_GREATER: + return TEXTURE_COMPARE_FUNC_GREATER; + case PIPE_FUNC_NOTEQUAL: + return TEXTURE_COMPARE_FUNC_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return TEXTURE_COMPARE_FUNC_GEQUAL; + case PIPE_FUNC_ALWAYS: + return TEXTURE_COMPARE_FUNC_ALWAYS; + default: + unreachable("Invalid compare func"); + } +} + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_uniforms.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_uniforms.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_uniforms.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_uniforms.c 2020-06-12 01:21:17.000000000 +0000 @@ -67,6 +67,7 @@ const struct etna_shader_uniform_info *uinfo = &sobj->uniforms; bool frag = (sobj == ctx->shader.fs); uint32_t base = frag ? ctx->specs.ps_uniforms_offset : ctx->specs.vs_uniforms_offset; + unsigned idx; if (!uinfo->imm_count) return; @@ -94,11 +95,11 @@ break; case ETNA_IMMEDIATE_UBO0_ADDR ... ETNA_IMMEDIATE_UBOMAX_ADDR: - assert(uinfo->imm_contents[i] == ETNA_IMMEDIATE_UBO0_ADDR); + idx = uinfo->imm_contents[i] - ETNA_IMMEDIATE_UBO0_ADDR; etna_cmd_stream_reloc(stream, &(struct etna_reloc) { - .bo = etna_resource(cb->buffer)->bo, + .bo = etna_resource(cb[idx].buffer)->bo, .flags = ETNA_RELOC_READ, - .offset = cb->buffer_offset + val, + .offset = cb[idx].buffer_offset + val, }); break; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_util.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_util.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -30,6 +30,9 @@ /* for conditionally setting boolean flag(s): */ #define COND(bool, val) ((bool) ? (val) : 0) +#define foreach_bit(b, mask) \ + for (uint32_t _m = (mask); _m && ({(b) = u_bit_scan(&_m); 1;});) + /* align to a value divisable by granularity >= value, works only for powers of two */ static inline uint32_t etna_align_up(uint32_t value, uint32_t granularity) @@ -37,12 +40,6 @@ return (value + (granularity - 1)) & (~(granularity - 1)); } -static inline uint32_t -etna_bits_ones(unsigned num) -{ - return (1 << num) - 1; -} - /* clamped float [0.0 .. 1.0] -> [0 .. 255] */ static inline uint8_t etna_cfloat_to_uint8(float f) @@ -85,6 +82,19 @@ return (int32_t)(f * 32.0f + 0.5f); } +/* float to fixp 8.8 */ +static inline uint32_t +etna_float_to_fixp88(float f) +{ + if (f >= (32767.0 - 1.0f) / 256.0f) + return 32767; + + if (f < -16.0f) + return 32768; + + return (int32_t)(f * 256.0f + 0.5f); +} + /* texture size to log2 in fixp 5.5 format */ static inline uint32_t etna_log2_fixp55(unsigned width) @@ -92,6 +102,13 @@ return etna_float_to_fixp55(logf((float)width) * RCPLOG2); } +/* texture size to log2 in fixp 8.8 format */ +static inline uint32_t +etna_log2_fixp88(unsigned width) +{ + return etna_float_to_fixp88(logf((float)width) * RCPLOG2); +} + /* float to fixp 16.16 */ static inline uint32_t etna_f32_to_fixp16(float f) diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_zsa.c mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_zsa.c --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_zsa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_zsa.c 2020-06-12 01:21:17.000000000 +0000 @@ -104,21 +104,27 @@ COND(so->alpha.enabled, VIVS_PE_ALPHA_OP_ALPHA_TEST) | VIVS_PE_ALPHA_OP_ALPHA_FUNC(so->alpha.func) | VIVS_PE_ALPHA_OP_ALPHA_REF(etna_cfloat_to_uint8(so->alpha.ref_value)); - cs->PE_STENCIL_OP = - VIVS_PE_STENCIL_OP_FUNC_FRONT(so->stencil[0].func) | - VIVS_PE_STENCIL_OP_FUNC_BACK(so->stencil[1].func) | - VIVS_PE_STENCIL_OP_FAIL_FRONT(translate_stencil_op(so->stencil[0].fail_op)) | - VIVS_PE_STENCIL_OP_FAIL_BACK(translate_stencil_op(so->stencil[1].fail_op)) | - VIVS_PE_STENCIL_OP_DEPTH_FAIL_FRONT(translate_stencil_op(so->stencil[0].zfail_op)) | - VIVS_PE_STENCIL_OP_DEPTH_FAIL_BACK(translate_stencil_op(so->stencil[1].zfail_op)) | - VIVS_PE_STENCIL_OP_PASS_FRONT(translate_stencil_op(so->stencil[0].zpass_op)) | - VIVS_PE_STENCIL_OP_PASS_BACK(translate_stencil_op(so->stencil[1].zpass_op)); - cs->PE_STENCIL_CONFIG = - translate_stencil_mode(so->stencil[0].enabled, so->stencil[1].enabled) | - VIVS_PE_STENCIL_CONFIG_MASK_FRONT(so->stencil[0].valuemask) | - VIVS_PE_STENCIL_CONFIG_WRITE_MASK_FRONT(so->stencil[0].writemask); - /* XXX back masks in VIVS_PE_DEPTH_CONFIG_EXT? */ - /* XXX VIVS_PE_STENCIL_CONFIG_REF_FRONT comes from pipe_stencil_ref */ + + for (unsigned i = 0; i < 2; i++) { + const struct pipe_stencil_state *stencil_front = so->stencil[1].enabled ? &so->stencil[i] : &so->stencil[0]; + const struct pipe_stencil_state *stencil_back = so->stencil[1].enabled ? &so->stencil[!i] : &so->stencil[0]; + cs->PE_STENCIL_OP[i] = + VIVS_PE_STENCIL_OP_FUNC_FRONT(stencil_front->func) | + VIVS_PE_STENCIL_OP_FUNC_BACK(stencil_back->func) | + VIVS_PE_STENCIL_OP_FAIL_FRONT(translate_stencil_op(stencil_front->fail_op)) | + VIVS_PE_STENCIL_OP_FAIL_BACK(translate_stencil_op(stencil_back->fail_op)) | + VIVS_PE_STENCIL_OP_DEPTH_FAIL_FRONT(translate_stencil_op(stencil_front->zfail_op)) | + VIVS_PE_STENCIL_OP_DEPTH_FAIL_BACK(translate_stencil_op(stencil_back->zfail_op)) | + VIVS_PE_STENCIL_OP_PASS_FRONT(translate_stencil_op(stencil_front->zpass_op)) | + VIVS_PE_STENCIL_OP_PASS_BACK(translate_stencil_op(stencil_back->zpass_op)); + cs->PE_STENCIL_CONFIG[i] = + translate_stencil_mode(so->stencil[0].enabled, so->stencil[0].enabled) | + VIVS_PE_STENCIL_CONFIG_MASK_FRONT(stencil_front->valuemask) | + VIVS_PE_STENCIL_CONFIG_WRITE_MASK_FRONT(stencil_front->writemask); + cs->PE_STENCIL_CONFIG_EXT2[i] = + VIVS_PE_STENCIL_CONFIG_EXT2_MASK_BACK(stencil_back->valuemask) | + VIVS_PE_STENCIL_CONFIG_EXT2_WRITE_MASK_BACK(stencil_back->writemask); + } /* XXX does alpha/stencil test affect PE_COLOR_FORMAT_OVERWRITE? */ return cs; diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_zsa.h mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_zsa.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/etnaviv_zsa.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/etnaviv_zsa.h 2020-06-12 01:21:17.000000000 +0000 @@ -35,8 +35,10 @@ uint32_t PE_DEPTH_CONFIG; uint32_t PE_ALPHA_OP; - uint32_t PE_STENCIL_OP; - uint32_t PE_STENCIL_CONFIG; + uint32_t PE_STENCIL_OP[2]; + uint32_t PE_STENCIL_CONFIG[2]; + uint32_t PE_STENCIL_CONFIG_EXT2[2]; + }; static inline struct etna_zsa_state * diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/hw/common_3d.xml.h mesa-20.0.8/src/gallium/drivers/etnaviv/hw/common_3d.xml.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/hw/common_3d.xml.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/hw/common_3d.xml.h 2020-06-12 01:21:17.000000000 +0000 @@ -8,10 +8,10 @@ git clone git://0x04.net/rules-ng-ng The rules-ng-ng source files this header was generated from are: -- texdesc_3d.xml ( 3183 bytes, from 2019-01-07 09:52:31) -- copyright.xml ( 1597 bytes, from 2019-01-07 09:52:31) -- common.xml ( 35468 bytes, from 2019-01-07 09:52:31) -- common_3d.xml ( 14322 bytes, from 2019-08-19 14:35:07) +- texdesc_3d.xml ( 3183 bytes, from 2019-08-09 17:33:50) +- copyright.xml ( 1597 bytes, from 2019-08-09 17:34:08) +- common.xml ( 35468 bytes, from 2019-08-09 17:16:20) +- common_3d.xml ( 14991 bytes, from 2019-09-12 20:32:47) Copyright (C) 2012-2019 by the following authors: - Wladimir J. van der Laan @@ -129,6 +129,14 @@ #define TS_MODE_256B 0x00000001 #define TEXTURE_ADDRESSING_MODE_TILED 0x00000000 #define TEXTURE_ADDRESSING_MODE_LINEAR 0x00000003 +#define TEXTURE_COMPARE_FUNC_LEQUAL 0x00000000 +#define TEXTURE_COMPARE_FUNC_GEQUAL 0x00000001 +#define TEXTURE_COMPARE_FUNC_LESS 0x00000002 +#define TEXTURE_COMPARE_FUNC_GREATER 0x00000003 +#define TEXTURE_COMPARE_FUNC_EQUAL 0x00000004 +#define TEXTURE_COMPARE_FUNC_NOTEQUAL 0x00000005 +#define TEXTURE_COMPARE_FUNC_ALWAYS 0x00000006 +#define TEXTURE_COMPARE_FUNC_NEVER 0x00000007 #define COMPRESSION_FORMAT_A4R4G4B4 0x00000000 #define COMPRESSION_FORMAT_A1R5G5B5 0x00000001 #define COMPRESSION_FORMAT_R5G6B5 0x00000002 diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/hw/state_3d.xml.h mesa-20.0.8/src/gallium/drivers/etnaviv/hw/state_3d.xml.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/hw/state_3d.xml.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/hw/state_3d.xml.h 2020-06-12 01:21:17.000000000 +0000 @@ -8,17 +8,17 @@ git clone git://0x04.net/rules-ng-ng The rules-ng-ng source files this header was generated from are: -- state.xml ( 26666 bytes, from 2019-08-19 14:35:07) -- common.xml ( 35468 bytes, from 2019-01-07 09:52:31) -- common_3d.xml ( 14322 bytes, from 2019-08-19 14:35:07) -- state_hi.xml ( 30232 bytes, from 2019-01-07 09:52:31) -- copyright.xml ( 1597 bytes, from 2019-01-07 09:52:31) -- state_2d.xml ( 51552 bytes, from 2019-01-07 09:52:31) -- state_3d.xml ( 83505 bytes, from 2019-08-19 14:46:17) -- state_blt.xml ( 14252 bytes, from 2019-08-19 14:35:07) -- state_vg.xml ( 5975 bytes, from 2019-01-07 09:52:31) +- state.xml ( 26666 bytes, from 2019-08-12 13:32:55) +- common.xml ( 35468 bytes, from 2019-08-09 17:16:20) +- common_3d.xml ( 15058 bytes, from 2019-09-12 20:37:35) +- state_hi.xml ( 30552 bytes, from 2020-01-06 02:44:00) +- copyright.xml ( 1597 bytes, from 2019-08-09 17:34:08) +- state_2d.xml ( 51552 bytes, from 2019-08-09 17:34:00) +- state_3d.xml ( 83644 bytes, from 2020-01-06 02:44:06) +- state_blt.xml ( 14252 bytes, from 2019-09-12 20:21:39) +- state_vg.xml ( 5975 bytes, from 2019-08-09 17:33:52) -Copyright (C) 2012-2019 by the following authors: +Copyright (C) 2012-2020 by the following authors: - Wladimir J. van der Laan - Christian Gmeiner - Lucas Stach @@ -167,6 +167,7 @@ #define VIVS_VS_INPUT_COUNT_UNK8__MASK 0x00001f00 #define VIVS_VS_INPUT_COUNT_UNK8__SHIFT 8 #define VIVS_VS_INPUT_COUNT_UNK8(x) (((x) << VIVS_VS_INPUT_COUNT_UNK8__SHIFT) & VIVS_VS_INPUT_COUNT_UNK8__MASK) +#define VIVS_VS_INPUT_COUNT_ID_ENABLE 0x80000000 #define VIVS_VS_TEMP_REGISTER_CONTROL 0x0000080c #define VIVS_VS_TEMP_REGISTER_CONTROL_NUM_TEMPS__MASK 0x0000003f @@ -1233,6 +1234,7 @@ #define VIVS_RS_SOURCE_STRIDE_STRIDE__MASK 0x0003ffff #define VIVS_RS_SOURCE_STRIDE_STRIDE__SHIFT 0 #define VIVS_RS_SOURCE_STRIDE_STRIDE(x) (((x) << VIVS_RS_SOURCE_STRIDE_STRIDE__SHIFT) & VIVS_RS_SOURCE_STRIDE_STRIDE__MASK) +#define VIVS_RS_SOURCE_STRIDE_UNK29 0x20000000 #define VIVS_RS_SOURCE_STRIDE_MULTI 0x40000000 #define VIVS_RS_SOURCE_STRIDE_TILING 0x80000000 @@ -1476,7 +1478,7 @@ #define VIVS_TE_SAMPLER_LOG_SIZE_HEIGHT__SHIFT 10 #define VIVS_TE_SAMPLER_LOG_SIZE_HEIGHT(x) (((x) << VIVS_TE_SAMPLER_LOG_SIZE_HEIGHT__SHIFT) & VIVS_TE_SAMPLER_LOG_SIZE_HEIGHT__MASK) #define VIVS_TE_SAMPLER_LOG_SIZE_ASTC 0x10000000 -#define VIVS_TE_SAMPLER_LOG_SIZE_RGB 0x20000000 +#define VIVS_TE_SAMPLER_LOG_SIZE_INT_FILTER 0x20000000 #define VIVS_TE_SAMPLER_LOG_SIZE_SRGB 0x80000000 #define VIVS_TE_SAMPLER_LOD_CONFIG(i0) (0x000020c0 + 0x4*(i0)) @@ -1536,6 +1538,27 @@ #define VIVS_TE_SAMPLER_UNK02240(i0) (0x00002240 + 0x4*(i0)) +#define VIVS_TE_SAMPLER_ASTC0(i0) (0x00002280 + 0x4*(i0)) +#define VIVS_TE_SAMPLER_ASTC0_ASTC_FORMAT__MASK 0x0000000f +#define VIVS_TE_SAMPLER_ASTC0_ASTC_FORMAT__SHIFT 0 +#define VIVS_TE_SAMPLER_ASTC0_ASTC_FORMAT(x) (((x) << VIVS_TE_SAMPLER_ASTC0_ASTC_FORMAT__SHIFT) & VIVS_TE_SAMPLER_ASTC0_ASTC_FORMAT__MASK) +#define VIVS_TE_SAMPLER_ASTC0_ASTC_SRGB 0x00000010 +#define VIVS_TE_SAMPLER_ASTC0_UNK8__MASK 0x0000ff00 +#define VIVS_TE_SAMPLER_ASTC0_UNK8__SHIFT 8 +#define VIVS_TE_SAMPLER_ASTC0_UNK8(x) (((x) << VIVS_TE_SAMPLER_ASTC0_UNK8__SHIFT) & VIVS_TE_SAMPLER_ASTC0_UNK8__MASK) +#define VIVS_TE_SAMPLER_ASTC0_UNK16__MASK 0x00ff0000 +#define VIVS_TE_SAMPLER_ASTC0_UNK16__SHIFT 16 +#define VIVS_TE_SAMPLER_ASTC0_UNK16(x) (((x) << VIVS_TE_SAMPLER_ASTC0_UNK16__SHIFT) & VIVS_TE_SAMPLER_ASTC0_UNK16__MASK) +#define VIVS_TE_SAMPLER_ASTC0_UNK24__MASK 0xff000000 +#define VIVS_TE_SAMPLER_ASTC0_UNK24__SHIFT 24 +#define VIVS_TE_SAMPLER_ASTC0_UNK24(x) (((x) << VIVS_TE_SAMPLER_ASTC0_UNK24__SHIFT) & VIVS_TE_SAMPLER_ASTC0_UNK24__MASK) + +#define VIVS_TE_SAMPLER_ASTC1(i0) (0x00002300 + 0x4*(i0)) + +#define VIVS_TE_SAMPLER_ASTC2(i0) (0x00002380 + 0x4*(i0)) + +#define VIVS_TE_SAMPLER_ASTC3(i0) (0x00002340 + 0x4*(i0)) + #define VIVS_TE_SAMPLER_LOD_ADDR(i0, i1) (0x00002400 + 0x4*(i0) + 0x40*(i1)) #define VIVS_TE_SAMPLER_LOD_ADDR__ESIZE 0x00000040 #define VIVS_TE_SAMPLER_LOD_ADDR__LEN 0x0000000e @@ -1599,7 +1622,7 @@ #define VIVS_NTE_SAMPLER_LOG_SIZE_HEIGHT__SHIFT 10 #define VIVS_NTE_SAMPLER_LOG_SIZE_HEIGHT(x) (((x) << VIVS_NTE_SAMPLER_LOG_SIZE_HEIGHT__SHIFT) & VIVS_NTE_SAMPLER_LOG_SIZE_HEIGHT__MASK) #define VIVS_NTE_SAMPLER_LOG_SIZE_ASTC 0x10000000 -#define VIVS_NTE_SAMPLER_LOG_SIZE_RGB 0x20000000 +#define VIVS_NTE_SAMPLER_LOG_SIZE_INT_FILTER 0x20000000 #define VIVS_NTE_SAMPLER_LOG_SIZE_SRGB 0x80000000 #define VIVS_NTE_SAMPLER_LOD_CONFIG(i0) (0x00010180 + 0x4*(i0)) @@ -1616,7 +1639,9 @@ #define VIVS_NTE_SAMPLER_UNK10200(i0) (0x00010200 + 0x4*(i0)) -#define VIVS_NTE_SAMPLER_UNK10280(i0) (0x00010280 + 0x4*(i0)) +#define VIVS_NTE_SAMPLER_LINEAR_STRIDE(i0, i1) (0x00010280 + 0x4*(i0) + 0x4*(i1)) +#define VIVS_NTE_SAMPLER_LINEAR_STRIDE__ESIZE 0x00000004 +#define VIVS_NTE_SAMPLER_LINEAR_STRIDE__LEN 0x00000020 #define VIVS_NTE_SAMPLER_3D_CONFIG(i0) (0x00010300 + 0x4*(i0)) #define VIVS_NTE_SAMPLER_3D_CONFIG_DEPTH__MASK 0x00003fff @@ -1678,16 +1703,20 @@ #define VIVS_NTE_SAMPLER_ASTC2(i0) (0x00010600 + 0x4*(i0)) -#define VIVS_NTE_SAMPLER_ASTC3(i0) (0x00010600 + 0x4*(i0)) +#define VIVS_NTE_SAMPLER_ASTC3(i0) (0x00010680 + 0x4*(i0)) #define VIVS_NTE_SAMPLER_BASELOD(i0) (0x00010700 + 0x4*(i0)) -#define VIVS_NTE_SAMPLER_BASELOD_UNK23 0x00800000 #define VIVS_NTE_SAMPLER_BASELOD_BASELOD__MASK 0x0000000f #define VIVS_NTE_SAMPLER_BASELOD_BASELOD__SHIFT 0 #define VIVS_NTE_SAMPLER_BASELOD_BASELOD(x) (((x) << VIVS_NTE_SAMPLER_BASELOD_BASELOD__SHIFT) & VIVS_NTE_SAMPLER_BASELOD_BASELOD__MASK) #define VIVS_NTE_SAMPLER_BASELOD_MAXLOD__MASK 0x00000f00 #define VIVS_NTE_SAMPLER_BASELOD_MAXLOD__SHIFT 8 #define VIVS_NTE_SAMPLER_BASELOD_MAXLOD(x) (((x) << VIVS_NTE_SAMPLER_BASELOD_MAXLOD__SHIFT) & VIVS_NTE_SAMPLER_BASELOD_MAXLOD__MASK) +#define VIVS_NTE_SAMPLER_BASELOD_COMPARE_ENABLE 0x00010000 +#define VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC__MASK 0x00700000 +#define VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC__SHIFT 20 +#define VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC(x) (((x) << VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC__SHIFT) & VIVS_NTE_SAMPLER_BASELOD_COMPARE_FUNC__MASK) +#define VIVS_NTE_SAMPLER_BASELOD_BASELOD_ENABLE 0x00800000 #define VIVS_NTE_SAMPLER_UNK10780(i0) (0x00010780 + 0x4*(i0)) @@ -1790,17 +1819,10 @@ #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_ENABLE 0x00020000 #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC__MASK 0x001c0000 #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC__SHIFT 18 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_LE 0x00000000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_GE 0x00040000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_LT 0x00080000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_GT 0x000c0000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_EQ 0x00100000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_NE 0x00140000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_ALWAYS 0x00180000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC_NEVER 0x001c0000 +#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC(x) (((x) << VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC__SHIFT) & VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_COMPARE_FUNC__MASK) #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_UNK21 0x00200000 #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_UNK22 0x00400000 -#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_RGB 0x00800000 +#define VIVS_NTE_DESCRIPTOR_SAMP_CTRL0_INT_FILTER 0x00800000 #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL1(i0) (0x00016e00 + 0x4*(i0)) #define VIVS_NTE_DESCRIPTOR_SAMP_CTRL1_UNK1 0x00000002 diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/hw/texdesc_3d.xml.h mesa-20.0.8/src/gallium/drivers/etnaviv/hw/texdesc_3d.xml.h --- mesa-19.2.8/src/gallium/drivers/etnaviv/hw/texdesc_3d.xml.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/hw/texdesc_3d.xml.h 2020-06-12 01:21:17.000000000 +0000 @@ -8,10 +8,10 @@ git clone git://0x04.net/rules-ng-ng The rules-ng-ng source files this header was generated from are: -- texdesc_3d.xml ( 3183 bytes, from 2019-01-07 09:52:31) -- copyright.xml ( 1597 bytes, from 2019-01-07 09:52:31) -- common.xml ( 35468 bytes, from 2019-01-07 09:52:31) -- common_3d.xml ( 14322 bytes, from 2019-08-19 14:35:07) +- texdesc_3d.xml ( 3183 bytes, from 2019-08-09 17:33:50) +- copyright.xml ( 1597 bytes, from 2019-08-09 17:34:08) +- common.xml ( 35468 bytes, from 2019-08-09 17:16:20) +- common_3d.xml ( 14991 bytes, from 2019-09-12 20:32:47) Copyright (C) 2012-2019 by the following authors: - Wladimir J. van der Laan @@ -139,13 +139,17 @@ #define TEXDESC_ASTC3 0x00000064 #define TEXDESC_BASELOD 0x00000068 -#define TEXDESC_BASELOD_UNK23 0x00800000 #define TEXDESC_BASELOD_BASELOD__MASK 0x0000000f #define TEXDESC_BASELOD_BASELOD__SHIFT 0 #define TEXDESC_BASELOD_BASELOD(x) (((x) << TEXDESC_BASELOD_BASELOD__SHIFT) & TEXDESC_BASELOD_BASELOD__MASK) #define TEXDESC_BASELOD_MAXLOD__MASK 0x00000f00 #define TEXDESC_BASELOD_MAXLOD__SHIFT 8 #define TEXDESC_BASELOD_MAXLOD(x) (((x) << TEXDESC_BASELOD_MAXLOD__SHIFT) & TEXDESC_BASELOD_MAXLOD__MASK) +#define TEXDESC_BASELOD_COMPARE_ENABLE 0x00010000 +#define TEXDESC_BASELOD_COMPARE_FUNC__MASK 0x00700000 +#define TEXDESC_BASELOD_COMPARE_FUNC__SHIFT 20 +#define TEXDESC_BASELOD_COMPARE_FUNC(x) (((x) << TEXDESC_BASELOD_COMPARE_FUNC__SHIFT) & TEXDESC_BASELOD_COMPARE_FUNC__MASK) +#define TEXDESC_BASELOD_BASELOD_ENABLE 0x00800000 #define TEXDESC_CONFIG2 0x0000006c @@ -184,7 +188,7 @@ #define TEXDESC_LOG_SIZE_HEIGHT__SHIFT 10 #define TEXDESC_LOG_SIZE_HEIGHT(x) (((x) << TEXDESC_LOG_SIZE_HEIGHT__SHIFT) & TEXDESC_LOG_SIZE_HEIGHT__MASK) #define TEXDESC_LOG_SIZE_ASTC 0x10000000 -#define TEXDESC_LOG_SIZE_RGB 0x20000000 +#define TEXDESC_LOG_SIZE_INT_FILTER 0x20000000 #define TEXDESC_LOG_SIZE_SRGB 0x80000000 #define TEXDESC_BORDER_COLOR_R 0x0000008c diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/Makefile.sources mesa-20.0.8/src/gallium/drivers/etnaviv/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/etnaviv/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/Makefile.sources 2020-06-12 01:21:16.000000000 +0000 @@ -58,6 +58,8 @@ etnaviv_surface.h \ etnaviv_texture.c \ etnaviv_texture.h \ + etnaviv_texture_desc.c \ + etnaviv_texture_desc.h \ etnaviv_texture_state.c \ etnaviv_texture_state.h \ etnaviv_tiling.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/etnaviv/meson.build mesa-20.0.8/src/gallium/drivers/etnaviv/meson.build --- mesa-19.2.8/src/gallium/drivers/etnaviv/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/etnaviv/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -77,6 +77,8 @@ 'etnaviv_surface.h', 'etnaviv_texture.c', 'etnaviv_texture.h', + 'etnaviv_texture_desc.c', + 'etnaviv_texture_desc.h', 'etnaviv_texture_state.c', 'etnaviv_texture_state.h', 'etnaviv_tiling.c', diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -78,21 +78,21 @@ so->rb_colorcontrol = A2XX_RB_COLORCONTROL_ROP_CODE(rop); - so->rb_blendcontrol_rgb = + so->rb_blendcontrol = A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND(fd_blend_factor(rt->rgb_src_factor)) | A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(blend_func(rt->rgb_func)) | A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND(fd_blend_factor(rt->rgb_dst_factor)); - so->rb_blendcontrol_alpha = - A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(fd_blend_factor(rt->alpha_src_factor)) | + /* hardware doesn't support SRC_ALPHA_SATURATE for alpha, but it is equivalent to ONE */ + unsigned alpha_src_factor = rt->alpha_src_factor; + if (alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) + alpha_src_factor = PIPE_BLENDFACTOR_ONE; + + so->rb_blendcontrol |= + A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(fd_blend_factor(alpha_src_factor)) | A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN(blend_func(rt->alpha_func)) | A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND(fd_blend_factor(rt->alpha_dst_factor)); - so->rb_blendcontrol_no_alpha_rgb = - A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) | - A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(blend_func(rt->rgb_func)) | - A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor))); - if (rt->colormask & PIPE_MASK_R) so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_RED; if (rt->colormask & PIPE_MASK_G) diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.h mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,9 +32,7 @@ struct fd2_blend_stateobj { struct pipe_blend_state base; - uint32_t rb_blendcontrol_rgb; - uint32_t rb_blendcontrol_alpha; - uint32_t rb_blendcontrol_no_alpha_rgb; + uint32_t rb_blendcontrol; uint32_t rb_colorcontrol; /* must be OR'd w/ zsa->rb_colorcontrol */ uint32_t rb_colormask; }; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -154,7 +154,7 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, unsigned index_offset) { - if (!ctx->prog.fp || !ctx->prog.vp) + if (!ctx->prog.fs || !ctx->prog.vs) return false; if (ctx->dirty & FD_DIRTY_VTXBUF) diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_emit.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_emit.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -199,7 +199,7 @@ if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { emit_constants(ring, VS_CONST_BASE * 4, &ctx->constbuf[PIPE_SHADER_VERTEX], - (dirty & FD_DIRTY_PROG) ? ctx->prog.vp : NULL); + (dirty & FD_DIRTY_PROG) ? ctx->prog.vs : NULL); } if (dirty & FD_DIRTY_VIEWPORT) { @@ -217,15 +217,9 @@ /* not sure why this is needed */ if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { - enum pipe_format format = - pipe_surface_format(ctx->batch->framebuffer.cbufs[0]); - bool has_alpha = util_format_has_alpha(format); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, blend->rb_blendcontrol_alpha | - COND(has_alpha, blend->rb_blendcontrol_rgb) | - COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb)); + OUT_RING(ring, blend->rb_blendcontrol); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); @@ -242,7 +236,7 @@ { struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa); - struct fd2_shader_stateobj *fp = ctx->prog.fp; + struct fd2_shader_stateobj *fs = ctx->prog.fs; struct fd_ringbuffer *ring = ctx->batch->draw; /* NOTE: we probably want to eventually refactor this so each state @@ -262,7 +256,7 @@ struct pipe_stencil_ref *sr = &ctx->stencil_ref; uint32_t val = zsa->rb_depthcontrol; - if (fp->has_kill) + if (fs->has_kill) val &= ~A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; OUT_PKT3(ring, CP_SET_CONSTANT, 2); @@ -301,6 +295,18 @@ OUT_RING(ring, fui(1.0)); /* PA_CL_GB_VERT_DISC_ADJ */ OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_CLIP_ADJ */ OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_DISC_ADJ */ + + if (rasterizer->base.offset_tri) { + /* TODO: why multiply scale by 2 ? without it deqp test fails + * deqp/piglit tests aren't very precise + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POLY_OFFSET_FRONT_SCALE)); + OUT_RING(ring, fui(rasterizer->base.offset_scale * 2.0f)); /* FRONT_SCALE */ + OUT_RING(ring, fui(rasterizer->base.offset_units)); /* FRONT_OFFSET */ + OUT_RING(ring, fui(rasterizer->base.offset_scale * 2.0f)); /* BACK_SCALE */ + OUT_RING(ring, fui(rasterizer->base.offset_units)); /* BACK_OFFSET */ + } } /* NOTE: scissor enabled bit is part of rasterizer state: */ @@ -351,10 +357,10 @@ if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { emit_constants(ring, VS_CONST_BASE * 4, &ctx->constbuf[PIPE_SHADER_VERTEX], - (dirty & FD_DIRTY_PROG) ? ctx->prog.vp : NULL); + (dirty & FD_DIRTY_PROG) ? ctx->prog.vs : NULL); emit_constants(ring, PS_CONST_BASE * 4, &ctx->constbuf[PIPE_SHADER_FRAGMENT], - (dirty & FD_DIRTY_PROG) ? ctx->prog.fp : NULL); + (dirty & FD_DIRTY_PROG) ? ctx->prog.fs : NULL); } if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) { @@ -364,15 +370,9 @@ } if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { - enum pipe_format format = - pipe_surface_format(ctx->batch->framebuffer.cbufs[0]); - bool has_alpha = util_format_has_alpha(format); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, blend->rb_blendcontrol_alpha | - COND(has_alpha, blend->rb_blendcontrol_rgb) | - COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb)); + OUT_RING(ring, blend->rb_blendcontrol); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -52,7 +52,7 @@ case PIPE_FORMAT_B5G5R5X1_UNORM: case PIPE_FORMAT_B4G4R4A4_UNORM: case PIPE_FORMAT_B4G4R4X4_UNORM: - /* TODO probably some more.. */ + case PIPE_FORMAT_B2G3R3_UNORM: return 1; default: return 0; @@ -62,7 +62,7 @@ static bool use_hw_binning(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; /* we hardcoded a limit of 8 "pipes", we can increase this limit * at the cost of a slightly larger command stream @@ -89,11 +89,10 @@ { struct fd_ringbuffer *ring = batch->tile_fini; struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t swap = fmt2swap(psurf->format); - struct fd_resource_slice *slice = - fd_resource_slice(rsc, psurf->u.tex.level); + struct fdl_slice *slice = fd_resource_slice(rsc, psurf->u.tex.level); uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + enum pipe_format format = fd_gmem_restore_format(psurf->format); assert((slice->pitch & 31) == 0); assert((offset & 0xfff) == 0); @@ -103,9 +102,8 @@ OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(swap) | - A2XX_RB_COLOR_INFO_BASE(base) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(psurf->format))); + OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); OUT_PKT3(ring, CP_SET_CONSTANT, 5); OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); @@ -113,9 +111,8 @@ OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ OUT_RING(ring, slice->pitch >> 5); /* RB_COPY_DEST_PITCH */ OUT_RING(ring, /* RB_COPY_DEST_INFO */ - A2XX_RB_COPY_DEST_INFO_FORMAT(fd2_pipe2color(psurf->format)) | - COND(!rsc->tile_mode, A2XX_RB_COPY_DEST_INFO_LINEAR) | - A2XX_RB_COPY_DEST_INFO_SWAP(swap) | + A2XX_RB_COPY_DEST_INFO_FORMAT(fd2_pipe2color(format)) | + COND(!rsc->layout.tile_mode, A2XX_RB_COPY_DEST_INFO_LINEAR) | A2XX_RB_COPY_DEST_INFO_WRITE_RED | A2XX_RB_COPY_DEST_INFO_WRITE_GREEN | A2XX_RB_COPY_DEST_INFO_WRITE_BLUE | @@ -139,7 +136,7 @@ { struct fd_context *ctx = batch->ctx; struct fd2_context *fd2_ctx = fd2_context(ctx); - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd_ringbuffer *ring; @@ -219,7 +216,7 @@ } static void -fd2_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) +fd2_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { fd2_emit_ib(batch->gmem, batch->tile_fini); } @@ -232,20 +229,15 @@ { struct fd_ringbuffer *ring = batch->gmem; struct fd_resource *rsc = fd_resource(psurf->texture); - struct fd_resource_slice *slice = - fd_resource_slice(rsc, psurf->u.tex.level); + struct fdl_slice *slice = fd_resource_slice(rsc, psurf->u.tex.level); uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - uint32_t swiz; + enum pipe_format format = fd_gmem_restore_format(psurf->format); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(fmt2swap(psurf->format)) | - A2XX_RB_COLOR_INFO_BASE(base) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(psurf->format))); - - swiz = fd2_tex_swiz(psurf->format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W); + OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); /* emit fb as a texture: */ OUT_PKT3(ring, CP_SET_CONSTANT, 7); @@ -255,12 +247,15 @@ A2XX_SQ_TEX_0_CLAMP_Z(SQ_TEX_WRAP) | A2XX_SQ_TEX_0_PITCH(slice->pitch)); OUT_RELOC(ring, rsc->bo, offset, - fd2_pipe2surface(psurf->format) | + A2XX_SQ_TEX_1_FORMAT(fd2_pipe2surface(format).format) | A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL), 0); OUT_RING(ring, A2XX_SQ_TEX_2_WIDTH(psurf->width - 1) | A2XX_SQ_TEX_2_HEIGHT(psurf->height - 1)); OUT_RING(ring, A2XX_SQ_TEX_3_MIP_FILTER(SQ_TEX_FILTER_BASEMAP) | - swiz | + A2XX_SQ_TEX_3_SWIZ_X(0) | + A2XX_SQ_TEX_3_SWIZ_Y(1) | + A2XX_SQ_TEX_3_SWIZ_Z(2) | + A2XX_SQ_TEX_3_SWIZ_W(3) | A2XX_SQ_TEX_3_XY_MAG_FILTER(SQ_TEX_FILTER_POINT) | A2XX_SQ_TEX_3_XY_MIN_FILTER(SQ_TEX_FILTER_POINT)); OUT_RING(ring, 0x00000000); @@ -278,11 +273,11 @@ } static void -fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) +fd2_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; struct fd2_context *fd2_ctx = fd2_context(ctx); - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; unsigned bin_w = tile->bin_w; @@ -440,8 +435,7 @@ return; struct fd_resource *rsc = fd_resource(psurf->texture); - struct fd_resource_slice *slice = - fd_resource_slice(rsc, psurf->u.tex.level); + struct fdl_slice *slice = fd_resource_slice(rsc, psurf->u.tex.level); uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); @@ -457,7 +451,7 @@ OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); OUT_RELOCW(ring, rsc->bo, offset, - COND(!rsc->tile_mode, A2XX_RB_COLOR_INFO_LINEAR) | + COND(!rsc->layout.tile_mode, A2XX_RB_COLOR_INFO_LINEAR) | A2XX_RB_COLOR_INFO_SWAP(fmt2swap(psurf->format)) | A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(psurf->format)), 0); @@ -484,7 +478,7 @@ struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); uint32_t reg; @@ -589,16 +583,14 @@ OUT_RING(ring, 0x0000000C); for (int i = 0; i < gmem->num_vsc_pipes; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - /* allocate in 64k increments to avoid reallocs */ uint32_t bo_size = align(batch->num_vertices, 0x10000); - if (!pipe->bo || fd_bo_size(pipe->bo) < bo_size) { - if (pipe->bo) - fd_bo_del(pipe->bo); - pipe->bo = fd_bo_new(ctx->dev, bo_size, + if (!ctx->vsc_pipe_bo[i] || fd_bo_size(ctx->vsc_pipe_bo[i]) < bo_size) { + if (ctx->vsc_pipe_bo[i]) + fd_bo_del(ctx->vsc_pipe_bo[i]); + ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, bo_size, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); - assert(pipe->bo); + assert(ctx->vsc_pipe_bo[i]); } /* memory export address (export32): @@ -607,7 +599,7 @@ * .z: 0x4B00D000 (?) * .w: 0x4B000000 (?) | max_index (?) */ - OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2); + OUT_RELOCW(ring, ctx->vsc_pipe_bo[i], 0, 0x40000000, -2); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x4B00D000); OUT_RING(ring, 0x4B000000 | bo_size); @@ -617,7 +609,7 @@ OUT_RING(ring, 0x0000018C); for (int i = 0; i < gmem->num_vsc_pipes; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; float off_x, off_y, mul_x, mul_y; /* const to tranform from [-1,1] to bin coordinates for this pipe @@ -663,7 +655,7 @@ /* before mem2gmem */ static void -fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) +fd2_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; @@ -685,7 +677,7 @@ /* before IB to rendering cmds: */ static void -fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) +fd2_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; struct fd2_context *fd2_ctx = fd2_context(ctx); @@ -729,7 +721,7 @@ } if (use_hw_binning(batch)) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); @@ -741,7 +733,7 @@ /* TODO only emit this when tile->p changes */ OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); - OUT_RELOC(ring, pipe->bo, 0, 0, 0); + OUT_RELOC(ring, pipe_bo, 0, 0, 0); } } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_perfcntr.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_perfcntr.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_perfcntr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_perfcntr.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,813 +0,0 @@ -/* - * Copyright (C) 2018 Jonathan Marek - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Jonathan Marek - * Rob Clark - */ - -#include "freedreno_perfcntr.h" -#include "freedreno_util.h" -#include "a2xx.xml.h" - -#define REG(_x) REG_A2XX_ ## _x - -#define COUNTER(_sel, _lo, _hi) { \ - .select_reg = REG(_sel), \ - .counter_reg_lo = REG(_lo), \ - .counter_reg_hi = REG(_hi), \ -} - -#define COUNTABLE(_selector, _query_type, _result_type) { \ - .name = #_selector, \ - .selector = _selector, \ - .query_type = PIPE_DRIVER_QUERY_TYPE_ ## _query_type, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type, \ -} - -#define GROUP(_name, _counters, _countables) { \ - .name = _name, \ - .num_counters = ARRAY_SIZE(_counters), \ - .counters = _counters, \ - .num_countables = ARRAY_SIZE(_countables), \ - .countables = _countables, \ -} - -static const struct fd_perfcntr_countable pa_su_countables[] = { - COUNTABLE(PERF_PAPC_PASX_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_FIRST_VECTOR, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_SECOND_VECTOR, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_FIRST_DEAD, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_SECOND_DEAD, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_VTX_KILL_DISCARD, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_VTX_NAN_DISCARD, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PA_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PA_INPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PA_INPUT_EVENT_FLAG, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PA_INPUT_FIRST_PRIM_SLOT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PA_INPUT_END_OF_PACKET, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_VV_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_VTX_KILL_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_VTX_NAN_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CULL_TO_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_VV_CLIP_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_POINT_CLIP_CANDIDATE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_1, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_2, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_3, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_4, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_5, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_CNT_6, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_NEAR, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_FAR, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_LEFT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_RIGHT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_TOP, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPR_CLIP_PLANE_BOTTOM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_TOTALLY_VISIBLE_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_CLIP_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_CULL_TO_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_1, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_2, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_3, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_4, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_5, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_OUT_PRIM_CNT_6_7, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLSM_NON_TRIVIAL_CULL, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_INPUT_CLIP_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_INPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_ZERO_AREA_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_BACK_FACE_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_FRONT_FACE_CULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_POLYMODE_FACE_CULL, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_POLYMODE_BACK_CULL, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_POLYMODE_FRONT_CULL, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_POLYMODE_INVALID_FILL, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_CLIP_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_EVENT_FLAG, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_FIRST_PRIM_SLOT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_END_OF_PACKET, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_POLYMODE_FACE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_POLYMODE_BACK, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUTPUT_POLYMODE_FRONT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUT_CLIP_POLYMODE_FACE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUT_CLIP_POLYMODE_BACK, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_OUT_CLIP_POLYMODE_FRONT, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REQ_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REQ_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REQ_STALLED, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_STARVED_SX, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_STALLED, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_STALLED_POS_MEM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_PASX_REC_STALLED_CCGSM_IN, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CCGSM_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CCGSM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CCGSM_STALLED, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPRIM_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPRIM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPRIM_STALLED, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLPRIM_STARVED_CCGSM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_WAIT_CLIP_VERT_ENGH, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_WAIT_HIGH_PRI_SEQ, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_WAIT_CLIPGA, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_WAIT_AVAIL_VTE_CLIP, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPSM_WAIT_CLIP_OUTSM, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPGA_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPGA_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPGA_STARVED_VTE_CLIP, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIPGA_STALLED, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIP_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_CLIP_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_STARVED_CLIP, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_STALLED_SC, UINT64, AVERAGE), - COUNTABLE(PERF_PAPC_SU_FACENESS_CULL, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable pa_sc_countables[] = { - COUNTABLE(SC_SR_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_CW_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_QM_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_FW_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_EZ_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_IT_WINDOW_VALID, UINT64, AVERAGE), - COUNTABLE(SC_STARVED_BY_PA, UINT64, AVERAGE), - COUNTABLE(SC_STALLED_BY_RB_TILE, UINT64, AVERAGE), - COUNTABLE(SC_STALLED_BY_RB_SAMP, UINT64, AVERAGE), - COUNTABLE(SC_STARVED_BY_RB_EZ, UINT64, AVERAGE), - COUNTABLE(SC_STALLED_BY_SAMPLE_FF, UINT64, AVERAGE), - COUNTABLE(SC_STALLED_BY_SQ, UINT64, AVERAGE), - COUNTABLE(SC_STALLED_BY_SP, UINT64, AVERAGE), - COUNTABLE(SC_TOTAL_NO_PRIMS, UINT64, AVERAGE), - COUNTABLE(SC_NON_EMPTY_PRIMS, UINT64, AVERAGE), - COUNTABLE(SC_NO_TILES_PASSING_QM, UINT64, AVERAGE), - COUNTABLE(SC_NO_PIXELS_PRE_EZ, UINT64, AVERAGE), - COUNTABLE(SC_NO_PIXELS_POST_EZ, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable vgt_countables[] = { - COUNTABLE(VGT_SQ_EVENT_WINDOW_ACTIVE, UINT64, AVERAGE), - COUNTABLE(VGT_SQ_SEND, UINT64, AVERAGE), - COUNTABLE(VGT_SQ_STALLED, UINT64, AVERAGE), - COUNTABLE(VGT_SQ_STARVED_BUSY, UINT64, AVERAGE), - COUNTABLE(VGT_SQ_STARVED_IDLE, UINT64, AVERAGE), - COUNTABLE(VGT_SQ_STATIC, UINT64, AVERAGE), - COUNTABLE(VGT_PA_EVENT_WINDOW_ACTIVE, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_V_SEND, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_V_STALLED, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_V_STARVED_BUSY, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_V_STARVED_IDLE, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_V_STATIC, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_P_SEND, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_P_STALLED, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_P_STARVED_BUSY, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_P_STARVED_IDLE, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_P_STATIC, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_S_SEND, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_S_STALLED, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_S_STARVED_BUSY, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_S_STARVED_IDLE, UINT64, AVERAGE), - COUNTABLE(VGT_PA_CLIP_S_STATIC, UINT64, AVERAGE), - COUNTABLE(RBIU_FIFOS_EVENT_WINDOW_ACTIVE, UINT64, AVERAGE), - COUNTABLE(RBIU_IMMED_DATA_FIFO_STARVED, UINT64, AVERAGE), - COUNTABLE(RBIU_IMMED_DATA_FIFO_STALLED, UINT64, AVERAGE), - COUNTABLE(RBIU_DMA_REQUEST_FIFO_STARVED, UINT64, AVERAGE), - COUNTABLE(RBIU_DMA_REQUEST_FIFO_STALLED, UINT64, AVERAGE), - COUNTABLE(RBIU_DRAW_INITIATOR_FIFO_STARVED, UINT64, AVERAGE), - COUNTABLE(RBIU_DRAW_INITIATOR_FIFO_STALLED, UINT64, AVERAGE), - COUNTABLE(BIN_PRIM_NEAR_CULL, UINT64, AVERAGE), - COUNTABLE(BIN_PRIM_ZERO_CULL, UINT64, AVERAGE), - COUNTABLE(BIN_PRIM_FAR_CULL, UINT64, AVERAGE), - COUNTABLE(BIN_PRIM_BIN_CULL, UINT64, AVERAGE), - COUNTABLE(BIN_PRIM_FACE_CULL, UINT64, AVERAGE), - COUNTABLE(SPARE34, UINT64, AVERAGE), - COUNTABLE(SPARE35, UINT64, AVERAGE), - COUNTABLE(SPARE36, UINT64, AVERAGE), - COUNTABLE(SPARE37, UINT64, AVERAGE), - COUNTABLE(SPARE38, UINT64, AVERAGE), - COUNTABLE(SPARE39, UINT64, AVERAGE), - COUNTABLE(TE_SU_IN_VALID, UINT64, AVERAGE), - COUNTABLE(TE_SU_IN_READ, UINT64, AVERAGE), - COUNTABLE(TE_SU_IN_PRIM, UINT64, AVERAGE), - COUNTABLE(TE_SU_IN_EOP, UINT64, AVERAGE), - COUNTABLE(TE_SU_IN_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(TE_WK_IN_VALID, UINT64, AVERAGE), - COUNTABLE(TE_WK_IN_READ, UINT64, AVERAGE), - COUNTABLE(TE_OUT_PRIM_VALID, UINT64, AVERAGE), - COUNTABLE(TE_OUT_PRIM_READ, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable tcr_countables[] = { - COUNTABLE(DGMMPD_IPMUX0_STALL, UINT64, AVERAGE), - COUNTABLE(DGMMPD_IPMUX_ALL_STALL, UINT64, AVERAGE), - COUNTABLE(OPMUX0_L2_WRITES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable tp0_countables[] = { - COUNTABLE(POINT_QUADS, UINT64, AVERAGE), - COUNTABLE(BILIN_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_QUADS, UINT64, AVERAGE), - COUNTABLE(MIP_QUADS, UINT64, AVERAGE), - COUNTABLE(VOL_QUADS, UINT64, AVERAGE), - COUNTABLE(MIP_VOL_QUADS, UINT64, AVERAGE), - COUNTABLE(MIP_ANISO_QUADS, UINT64, AVERAGE), - COUNTABLE(VOL_ANISO_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_2_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_4_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_6_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_8_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_10_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_12_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_14_1_QUADS, UINT64, AVERAGE), - COUNTABLE(ANISO_16_1_QUADS, UINT64, AVERAGE), - COUNTABLE(MIP_VOL_ANISO_QUADS, UINT64, AVERAGE), - COUNTABLE(ALIGN_2_QUADS, UINT64, AVERAGE), - COUNTABLE(ALIGN_4_QUADS, UINT64, AVERAGE), - COUNTABLE(PIX_0_QUAD, UINT64, AVERAGE), - COUNTABLE(PIX_1_QUAD, UINT64, AVERAGE), - COUNTABLE(PIX_2_QUAD, UINT64, AVERAGE), - COUNTABLE(PIX_3_QUAD, UINT64, AVERAGE), - COUNTABLE(PIX_4_QUAD, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD0, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD1, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD2, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD3, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD4, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD5, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD6, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD7, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD8, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD9, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD10, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD11, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD12, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD13, UINT64, AVERAGE), - COUNTABLE(TP_MIPMAP_LOD14, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable tcm_countables[] = { - COUNTABLE(QUAD0_RD_LAT_FIFO_EMPTY, UINT64, AVERAGE), - COUNTABLE(QUAD0_RD_LAT_FIFO_4TH_FULL, UINT64, AVERAGE), - COUNTABLE(QUAD0_RD_LAT_FIFO_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(QUAD0_RD_LAT_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(QUAD0_RD_LAT_FIFO_LT_4TH_FULL, UINT64, AVERAGE), - COUNTABLE(READ_STARVED_QUAD0, UINT64, AVERAGE), - COUNTABLE(READ_STARVED, UINT64, AVERAGE), - COUNTABLE(READ_STALLED_QUAD0, UINT64, AVERAGE), - COUNTABLE(READ_STALLED, UINT64, AVERAGE), - COUNTABLE(VALID_READ_QUAD0, UINT64, AVERAGE), - COUNTABLE(TC_TP_STARVED_QUAD0, UINT64, AVERAGE), - COUNTABLE(TC_TP_STARVED, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable tcf_countables[] = { - COUNTABLE(VALID_CYCLES, UINT64, AVERAGE), - COUNTABLE(SINGLE_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_PHASES, UINT64, AVERAGE), - COUNTABLE(MIP_PHASES, UINT64, AVERAGE), - COUNTABLE(VOL_PHASES, UINT64, AVERAGE), - COUNTABLE(MIP_VOL_PHASES, UINT64, AVERAGE), - COUNTABLE(MIP_ANISO_PHASES, UINT64, AVERAGE), - COUNTABLE(VOL_ANISO_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_2_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_4_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_6_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_8_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_10_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_12_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_14_1_PHASES, UINT64, AVERAGE), - COUNTABLE(ANISO_16_1_PHASES, UINT64, AVERAGE), - COUNTABLE(MIP_VOL_ANISO_PHASES, UINT64, AVERAGE), - COUNTABLE(ALIGN_2_PHASES, UINT64, AVERAGE), - COUNTABLE(ALIGN_4_PHASES, UINT64, AVERAGE), - COUNTABLE(TPC_BUSY, UINT64, AVERAGE), - COUNTABLE(TPC_STALLED, UINT64, AVERAGE), - COUNTABLE(TPC_STARVED, UINT64, AVERAGE), - COUNTABLE(TPC_WORKING, UINT64, AVERAGE), - COUNTABLE(TPC_WALKER_BUSY, UINT64, AVERAGE), - COUNTABLE(TPC_WALKER_STALLED, UINT64, AVERAGE), - COUNTABLE(TPC_WALKER_WORKING, UINT64, AVERAGE), - COUNTABLE(TPC_ALIGNER_BUSY, UINT64, AVERAGE), - COUNTABLE(TPC_ALIGNER_STALLED, UINT64, AVERAGE), - COUNTABLE(TPC_ALIGNER_STALLED_BY_BLEND, UINT64, AVERAGE), - COUNTABLE(TPC_ALIGNER_STALLED_BY_CACHE, UINT64, AVERAGE), - COUNTABLE(TPC_ALIGNER_WORKING, UINT64, AVERAGE), - COUNTABLE(TPC_BLEND_BUSY, UINT64, AVERAGE), - COUNTABLE(TPC_BLEND_SYNC, UINT64, AVERAGE), - COUNTABLE(TPC_BLEND_STARVED, UINT64, AVERAGE), - COUNTABLE(TPC_BLEND_WORKING, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x00, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x01, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x04, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x10, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x11, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x12, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x13, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x18, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x19, UINT64, AVERAGE), - COUNTABLE(OPCODE_0x1A, UINT64, AVERAGE), - COUNTABLE(OPCODE_OTHER, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_0_EMPTY, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_0_LT_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_0_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_0_FULL, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_TPC_EMPTY, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_TPC_LT_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_TPC_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(IN_FIFO_TPC_FULL, UINT64, AVERAGE), - COUNTABLE(TPC_TC_XFC, UINT64, AVERAGE), - COUNTABLE(TPC_TC_STATE, UINT64, AVERAGE), - COUNTABLE(TC_STALL, UINT64, AVERAGE), - COUNTABLE(QUAD0_TAPS, UINT64, AVERAGE), - COUNTABLE(QUADS, UINT64, AVERAGE), - COUNTABLE(TCA_SYNC_STALL, UINT64, AVERAGE), - COUNTABLE(TAG_STALL, UINT64, AVERAGE), - COUNTABLE(TCB_SYNC_STALL, UINT64, AVERAGE), - COUNTABLE(TCA_VALID, UINT64, AVERAGE), - COUNTABLE(PROBES_VALID, UINT64, AVERAGE), - COUNTABLE(MISS_STALL, UINT64, AVERAGE), - COUNTABLE(FETCH_FIFO_STALL, UINT64, AVERAGE), - COUNTABLE(TCO_STALL, UINT64, AVERAGE), - COUNTABLE(ANY_STALL, UINT64, AVERAGE), - COUNTABLE(TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(TAG_HITS, UINT64, AVERAGE), - COUNTABLE(SUB_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET0_INVALIDATES, UINT64, AVERAGE), - COUNTABLE(SET1_INVALIDATES, UINT64, AVERAGE), - COUNTABLE(SET2_INVALIDATES, UINT64, AVERAGE), - COUNTABLE(SET3_INVALIDATES, UINT64, AVERAGE), - COUNTABLE(SET0_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET1_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET2_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET3_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET0_TAG_HITS, UINT64, AVERAGE), - COUNTABLE(SET1_TAG_HITS, UINT64, AVERAGE), - COUNTABLE(SET2_TAG_HITS, UINT64, AVERAGE), - COUNTABLE(SET3_TAG_HITS, UINT64, AVERAGE), - COUNTABLE(SET0_SUB_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET1_SUB_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET2_SUB_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET3_SUB_TAG_MISSES, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT1, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT2, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT3, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT4, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT5, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT6, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT7, UINT64, AVERAGE), - COUNTABLE(SET0_EVICT8, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT1, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT2, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT3, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT4, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT5, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT6, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT7, UINT64, AVERAGE), - COUNTABLE(SET1_EVICT8, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT1, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT2, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT3, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT4, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT5, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT6, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT7, UINT64, AVERAGE), - COUNTABLE(SET2_EVICT8, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT1, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT2, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT3, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT4, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT5, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT6, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT7, UINT64, AVERAGE), - COUNTABLE(SET3_EVICT8, UINT64, AVERAGE), - COUNTABLE(FF_EMPTY, UINT64, AVERAGE), - COUNTABLE(FF_LT_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(FF_HALF_FULL, UINT64, AVERAGE), - COUNTABLE(FF_FULL, UINT64, AVERAGE), - COUNTABLE(FF_XFC, UINT64, AVERAGE), - COUNTABLE(FF_STALLED, UINT64, AVERAGE), - COUNTABLE(FG_MASKS, UINT64, AVERAGE), - COUNTABLE(FG_LEFT_MASKS, UINT64, AVERAGE), - COUNTABLE(FG_LEFT_MASK_STALLED, UINT64, AVERAGE), - COUNTABLE(FG_LEFT_NOT_DONE_STALL, UINT64, AVERAGE), - COUNTABLE(FG_LEFT_FG_STALL, UINT64, AVERAGE), - COUNTABLE(FG_LEFT_SECTORS, UINT64, AVERAGE), - COUNTABLE(FG0_REQUESTS, UINT64, AVERAGE), - COUNTABLE(FG0_STALLED, UINT64, AVERAGE), - COUNTABLE(MEM_REQ512, UINT64, AVERAGE), - COUNTABLE(MEM_REQ_SENT, UINT64, AVERAGE), - COUNTABLE(MEM_LOCAL_READ_REQ, UINT64, AVERAGE), - COUNTABLE(TC0_MH_STALLED, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable sq_countables[] = { - COUNTABLE(SQ_PIXEL_VECTORS_SUB, UINT64, AVERAGE), - COUNTABLE(SQ_VERTEX_VECTORS_SUB, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_VTX_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_VTX_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_PIX_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_PIX_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_VTX_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_VTX_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_PIX_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_PIX_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_EXPORT_CYCLES, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_CST_WRITTEN, UINT64, AVERAGE), - COUNTABLE(SQ_TEX_CST_WRITTEN, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_CST_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_TEX_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_INST_WRITTEN, UINT64, AVERAGE), - COUNTABLE(SQ_BOOLEAN_WRITTEN, UINT64, AVERAGE), - COUNTABLE(SQ_LOOPS_WRITTEN, UINT64, AVERAGE), - COUNTABLE(SQ_PIXEL_SWAP_IN, UINT64, AVERAGE), - COUNTABLE(SQ_PIXEL_SWAP_OUT, UINT64, AVERAGE), - COUNTABLE(SQ_VERTEX_SWAP_IN, UINT64, AVERAGE), - COUNTABLE(SQ_VERTEX_SWAP_OUT, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_VTX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_TEX_VTX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_VC_VTX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_CF_VTX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_PIX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_TEX_PIX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_VC_PIX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_CF_PIX_INST_ISSUED, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_EMPTY_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_EMPTY_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_EMPTY_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_EMPTY_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU_NOPS, UINT64, AVERAGE), - COUNTABLE(SQ_PRED_SKIP, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD0_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD1_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_TEX_STALL_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_VC_STALL_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_USED_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_SENT_SP_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_GPR_STALL_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_GPR_STALL_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_RS_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_PIX_RS_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_SX_PC_FULL, UINT64, AVERAGE), - COUNTABLE(SQ_SX_EXP_BUFF_FULL, UINT64, AVERAGE), - COUNTABLE(SQ_SX_POS_BUFF_FULL, UINT64, AVERAGE), - COUNTABLE(SQ_INTERP_QUADS, UINT64, AVERAGE), - COUNTABLE(SQ_INTERP_ACTIVE, UINT64, AVERAGE), - COUNTABLE(SQ_IN_PIXEL_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_IN_VTX_STALL, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_VECTOR2, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_VECTOR3, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_VECTOR4, UINT64, AVERAGE), - COUNTABLE(SQ_PIXEL_VECTOR1, UINT64, AVERAGE), - COUNTABLE(SQ_PIXEL_VECTOR23, UINT64, AVERAGE), - COUNTABLE(SQ_PIXEL_VECTOR4, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_USED_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_SENT_SP_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_SX_MEM_EXP_FULL, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_VTX_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_VTX_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_PIX_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ACTIVE_PIX_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_VTX_SIMD3, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_VTX_QUAL_TP_DONE, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ACTIVE_PIX_SIMD3, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_PIX_QUAL_TP_DONE, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_EMPTY_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_EMPTY_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_EMPTY_SIMD3, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_EMPTY_SIMD3, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD2_VTX, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_VTX_POP_THREAD, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD0_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD1_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_ALU_STALL_SIMD2_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_PIX_POP_THREAD, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_TEX_STALL_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_SYNC_VC_STALL_PIX, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_USED_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_CONSTANTS_SENT_SP_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_VTX_DEALLOC_ACK, UINT64, AVERAGE), - COUNTABLE(SQ_PERFCOUNT_PIX_DEALLOC_ACK, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_FULL_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_FULL_SIMD0, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_FULL_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_FULL_SIMD1, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_FULL_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_FULL_SIMD2, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_FIFO_FULL_SIMD3, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_FIFO_FULL_SIMD3, UINT64, AVERAGE), - COUNTABLE(VC_PERF_STATIC, UINT64, AVERAGE), - COUNTABLE(VC_PERF_STALLED, UINT64, AVERAGE), - COUNTABLE(VC_PERF_STARVED, UINT64, AVERAGE), - COUNTABLE(VC_PERF_SEND, UINT64, AVERAGE), - COUNTABLE(VC_PERF_ACTUAL_STARVED, UINT64, AVERAGE), - COUNTABLE(PIXEL_THREAD_0_ACTIVE, UINT64, AVERAGE), - COUNTABLE(VERTEX_THREAD_0_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PIXEL_THREAD_0_NUMBER, UINT64, AVERAGE), - COUNTABLE(VERTEX_THREAD_0_NUMBER, UINT64, AVERAGE), - COUNTABLE(VERTEX_EVENT_NUMBER, UINT64, AVERAGE), - COUNTABLE(PIXEL_EVENT_NUMBER, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_PUSH, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_EVENT, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_NEW_VTX, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_DEALLOC, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_PVECTOR, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_PVECTOR_X, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_EF_POP_PVECTOR_VNZ, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_PB_DEALLOC, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_PI_STATE_PPB_POP, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_PI_RTR, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_PI_READ_EN, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_PI_BUFF_SWAP, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_SQ_FREE_BUFF, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_SQ_DEC, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_SC_VALID_CNTL_EVENT, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_SC_VALID_IJ_XFER, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_SC_NEW_VECTOR_1_Q, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_QUAL_NEW_VECTOR, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_QUAL_EVENT, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_END_BUFFER, UINT64, AVERAGE), - COUNTABLE(PTRBUFF_FILL_QUAD, UINT64, AVERAGE), - COUNTABLE(VERTS_WRITTEN_SPI, UINT64, AVERAGE), - COUNTABLE(TP_FETCH_INSTR_EXEC, UINT64, AVERAGE), - COUNTABLE(TP_FETCH_INSTR_REQ, UINT64, AVERAGE), - COUNTABLE(TP_DATA_RETURN, UINT64, AVERAGE), - COUNTABLE(SPI_WRITE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(SPI_WRITES_SP, UINT64, AVERAGE), - COUNTABLE(SP_ALU_INSTR_EXEC, UINT64, AVERAGE), - COUNTABLE(SP_CONST_ADDR_TO_SQ, UINT64, AVERAGE), - COUNTABLE(SP_PRED_KILLS_TO_SQ, UINT64, AVERAGE), - COUNTABLE(SP_EXPORT_CYCLES_TO_SX, UINT64, AVERAGE), - COUNTABLE(SP_EXPORTS_TO_SX, UINT64, AVERAGE), - COUNTABLE(SQ_CYCLES_ELAPSED, UINT64, AVERAGE), - COUNTABLE(SQ_TCFS_OPT_ALLOC_EXEC, UINT64, AVERAGE), - COUNTABLE(SQ_TCFS_NO_OPT_ALLOC, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_NO_OPT_ALLOC, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_NO_OPT_ALLOC, UINT64, AVERAGE), - COUNTABLE(SQ_TCFS_ARB_XFC_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_ARB_XFC_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_ARB_XFC_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_TCFS_CFS_UPDATE_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_ALU0_CFS_UPDATE_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_ALU1_CFS_UPDATE_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_PUSH_THREAD_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_VTX_POP_THREAD_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_PIX_PUSH_THREAD_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_PIX_POP_THREAD_CNT, UINT64, AVERAGE), - COUNTABLE(SQ_PIX_TOTAL, UINT64, AVERAGE), - COUNTABLE(SQ_PIX_KILLED, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable sx_countables[] = { - COUNTABLE(SX_EXPORT_VECTORS, UINT64, AVERAGE), - COUNTABLE(SX_DUMMY_QUADS, UINT64, AVERAGE), - COUNTABLE(SX_ALPHA_FAIL, UINT64, AVERAGE), - COUNTABLE(SX_RB_QUAD_BUSY, UINT64, AVERAGE), - COUNTABLE(SX_RB_COLOR_BUSY, UINT64, AVERAGE), - COUNTABLE(SX_RB_QUAD_STALL, UINT64, AVERAGE), - COUNTABLE(SX_RB_COLOR_STALL, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable rb_countables[] = { - COUNTABLE(RBPERF_CNTX_BUSY, UINT64, AVERAGE), - COUNTABLE(RBPERF_CNTX_BUSY_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_QUAD_STARVED, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_QUAD_STARVED_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_GA_GC_CH0_SYS_REQ, UINT64, AVERAGE), - COUNTABLE(RBPERF_GA_GC_CH0_SYS_REQ_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_GA_GC_CH1_SYS_REQ, UINT64, AVERAGE), - COUNTABLE(RBPERF_GA_GC_CH1_SYS_REQ_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_MH_STARVED, UINT64, AVERAGE), - COUNTABLE(RBPERF_MH_STARVED_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_AZ_BC_COLOR_BUSY, UINT64, AVERAGE), - COUNTABLE(RBPERF_AZ_BC_COLOR_BUSY_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_AZ_BC_Z_BUSY, UINT64, AVERAGE), - COUNTABLE(RBPERF_AZ_BC_Z_BUSY_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_TILE_RTR_N, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_TILE_RTR_N_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_SAMP_RTR_N, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_SAMP_RTR_N_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SX_QUAD_RTR_N, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SX_QUAD_RTR_N_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SX_COLOR_RTR_N, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SX_COLOR_RTR_N_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_SAMP_LZ_BUSY, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_SAMP_LZ_BUSY_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_ZXP_STALL, UINT64, AVERAGE), - COUNTABLE(RBPERF_ZXP_STALL_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_EVENT_PENDING, UINT64, AVERAGE), - COUNTABLE(RBPERF_EVENT_PENDING_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_MH_VALID, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_MH_VALID_MAX, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_RB_QUAD_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_RB_COLOR_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_SC_RB_TILE_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_SC_RB_SAMPLE_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_RB_MEM_EXPORT, UINT64, AVERAGE), - COUNTABLE(RBPERF_SX_RB_QUAD_EVENT, UINT64, AVERAGE), - COUNTABLE(RBPERF_SC_RB_TILE_EVENT_FILTERED, UINT64, AVERAGE), - COUNTABLE(RBPERF_SC_RB_TILE_EVENT_ALL, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SC_EZ_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_SX_INDEX_SEND, UINT64, AVERAGE), - COUNTABLE(RBPERF_GMEM_INTFO_RD, UINT64, AVERAGE), - COUNTABLE(RBPERF_GMEM_INTF1_RD, UINT64, AVERAGE), - COUNTABLE(RBPERF_GMEM_INTFO_WR, UINT64, AVERAGE), - COUNTABLE(RBPERF_GMEM_INTF1_WR, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_CP_CONTEXT_DONE, UINT64, AVERAGE), - COUNTABLE(RBPERF_RB_CP_CACHE_FLUSH, UINT64, AVERAGE), - COUNTABLE(RBPERF_ZPASS_DONE, UINT64, AVERAGE), - COUNTABLE(RBPERF_ZCMD_VALID, UINT64, AVERAGE), - COUNTABLE(RBPERF_CCMD_VALID, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_GRANT, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_C0_GRANT, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_C1_GRANT, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_FULL_BE_WR, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_REQUEST_NO_GRANT, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_TIMEOUT_PULSE, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_LIN_TIMEOUT_PULSE, UINT64, AVERAGE), - COUNTABLE(RBPERF_ACCUM_CAM_HIT_FLUSHING, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter pa_su_counters[] = { - COUNTER(PA_SU_PERFCOUNTER0_SELECT, PA_SU_PERFCOUNTER0_LOW, PA_SU_PERFCOUNTER0_HI), - COUNTER(PA_SU_PERFCOUNTER1_SELECT, PA_SU_PERFCOUNTER1_LOW, PA_SU_PERFCOUNTER1_HI), - COUNTER(PA_SU_PERFCOUNTER2_SELECT, PA_SU_PERFCOUNTER2_LOW, PA_SU_PERFCOUNTER2_HI), - COUNTER(PA_SU_PERFCOUNTER3_SELECT, PA_SU_PERFCOUNTER3_LOW, PA_SU_PERFCOUNTER3_HI), -}; - -static const struct fd_perfcntr_counter pa_sc_counters[] = { - COUNTER(PA_SC_PERFCOUNTER0_SELECT, PA_SC_PERFCOUNTER0_LOW, PA_SC_PERFCOUNTER0_HI), -}; - -static const struct fd_perfcntr_counter vgt_counters[] = { - COUNTER(VGT_PERFCOUNTER0_SELECT, VGT_PERFCOUNTER0_LOW, VGT_PERFCOUNTER0_HI), - COUNTER(VGT_PERFCOUNTER1_SELECT, VGT_PERFCOUNTER1_LOW, VGT_PERFCOUNTER1_HI), - COUNTER(VGT_PERFCOUNTER2_SELECT, VGT_PERFCOUNTER2_LOW, VGT_PERFCOUNTER2_HI), - COUNTER(VGT_PERFCOUNTER3_SELECT, VGT_PERFCOUNTER3_LOW, VGT_PERFCOUNTER3_HI), -}; - -static const struct fd_perfcntr_counter tcr_counters[] = { - COUNTER(TCR_PERFCOUNTER0_SELECT, TCR_PERFCOUNTER0_LOW, TCR_PERFCOUNTER0_HI), - COUNTER(TCR_PERFCOUNTER1_SELECT, TCR_PERFCOUNTER1_LOW, TCR_PERFCOUNTER1_HI), -}; - -static const struct fd_perfcntr_counter tp0_counters[] = { - COUNTER(TP0_PERFCOUNTER0_SELECT, TP0_PERFCOUNTER0_LOW, TP0_PERFCOUNTER0_HI), - COUNTER(TP0_PERFCOUNTER1_SELECT, TP0_PERFCOUNTER1_LOW, TP0_PERFCOUNTER1_HI), -}; - -static const struct fd_perfcntr_counter tcm_counters[] = { - COUNTER(TCM_PERFCOUNTER0_SELECT, TCM_PERFCOUNTER0_LOW, TCM_PERFCOUNTER0_HI), - COUNTER(TCM_PERFCOUNTER1_SELECT, TCM_PERFCOUNTER1_LOW, TCM_PERFCOUNTER1_HI), -}; - -static const struct fd_perfcntr_counter tcf_counters[] = { - COUNTER(TCF_PERFCOUNTER0_SELECT, TCF_PERFCOUNTER0_LOW, TCF_PERFCOUNTER0_HI), - COUNTER(TCF_PERFCOUNTER1_SELECT, TCF_PERFCOUNTER1_LOW, TCF_PERFCOUNTER1_HI), - COUNTER(TCF_PERFCOUNTER2_SELECT, TCF_PERFCOUNTER2_LOW, TCF_PERFCOUNTER2_HI), - COUNTER(TCF_PERFCOUNTER3_SELECT, TCF_PERFCOUNTER3_LOW, TCF_PERFCOUNTER3_HI), - COUNTER(TCF_PERFCOUNTER4_SELECT, TCF_PERFCOUNTER4_LOW, TCF_PERFCOUNTER4_HI), - COUNTER(TCF_PERFCOUNTER5_SELECT, TCF_PERFCOUNTER5_LOW, TCF_PERFCOUNTER5_HI), - COUNTER(TCF_PERFCOUNTER6_SELECT, TCF_PERFCOUNTER6_LOW, TCF_PERFCOUNTER6_HI), - COUNTER(TCF_PERFCOUNTER7_SELECT, TCF_PERFCOUNTER7_LOW, TCF_PERFCOUNTER7_HI), - COUNTER(TCF_PERFCOUNTER8_SELECT, TCF_PERFCOUNTER8_LOW, TCF_PERFCOUNTER8_HI), - COUNTER(TCF_PERFCOUNTER9_SELECT, TCF_PERFCOUNTER9_LOW, TCF_PERFCOUNTER9_HI), - COUNTER(TCF_PERFCOUNTER10_SELECT, TCF_PERFCOUNTER10_LOW, TCF_PERFCOUNTER10_HI), - COUNTER(TCF_PERFCOUNTER11_SELECT, TCF_PERFCOUNTER11_LOW, TCF_PERFCOUNTER11_HI), -}; - -static const struct fd_perfcntr_counter sq_counters[] = { - COUNTER(SQ_PERFCOUNTER0_SELECT, SQ_PERFCOUNTER0_LOW, SQ_PERFCOUNTER0_HI), - COUNTER(SQ_PERFCOUNTER1_SELECT, SQ_PERFCOUNTER1_LOW, SQ_PERFCOUNTER1_HI), - COUNTER(SQ_PERFCOUNTER2_SELECT, SQ_PERFCOUNTER2_LOW, SQ_PERFCOUNTER2_HI), - COUNTER(SQ_PERFCOUNTER3_SELECT, SQ_PERFCOUNTER3_LOW, SQ_PERFCOUNTER3_HI), -}; - -static const struct fd_perfcntr_countable rbbm_countables[] = { - COUNTABLE(RBBM1_COUNT, UINT64, AVERAGE), - COUNTABLE(RBBM1_NRT_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_RB_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_SQ_CNTX0_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_SQ_CNTX17_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_VGT_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_VGT_NODMA_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_PA_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_SC_CNTX_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_TPC_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_TC_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_SX_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_CP_COHER_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_CP_NRT_BUSY, UINT64, AVERAGE), - COUNTABLE(RBBM1_GFX_IDLE_STALL, UINT64, AVERAGE), - COUNTABLE(RBBM1_INTERRUPT, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_countable cp_countables[] = { - COUNTABLE(ALWAYS_COUNT, UINT64, AVERAGE), - COUNTABLE(TRANS_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(TRANS_FIFO_AF, UINT64, AVERAGE), - COUNTABLE(RCIU_PFPTRANS_WAIT, UINT64, AVERAGE), - COUNTABLE(RCIU_NRTTRANS_WAIT, UINT64, AVERAGE), - COUNTABLE(CSF_NRT_READ_WAIT, UINT64, AVERAGE), - COUNTABLE(CSF_I1_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_I2_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_ST_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_RING_ROQ_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_I1_ROQ_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_I2_ROQ_FULL, UINT64, AVERAGE), - COUNTABLE(CSF_ST_ROQ_FULL, UINT64, AVERAGE), - COUNTABLE(MIU_TAG_MEM_FULL, UINT64, AVERAGE), - COUNTABLE(MIU_WRITECLEAN, UINT64, AVERAGE), - COUNTABLE(MIU_NRT_WRITE_STALLED, UINT64, AVERAGE), - COUNTABLE(MIU_NRT_READ_STALLED, UINT64, AVERAGE), - COUNTABLE(ME_WRITE_CONFIRM_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_VS_DEALLOC_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_PS_DEALLOC_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_REGS_VS_EVENT_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_REGS_PS_EVENT_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_REGS_CF_EVENT_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(ME_MICRO_RB_STARVED, UINT64, AVERAGE), - COUNTABLE(ME_MICRO_I1_STARVED, UINT64, AVERAGE), - COUNTABLE(ME_MICRO_I2_STARVED, UINT64, AVERAGE), - COUNTABLE(ME_MICRO_ST_STARVED, UINT64, AVERAGE), - COUNTABLE(RCIU_RBBM_DWORD_SENT, UINT64, AVERAGE), - COUNTABLE(ME_BUSY_CLOCKS, UINT64, AVERAGE), - COUNTABLE(ME_WAIT_CONTEXT_AVAIL, UINT64, AVERAGE), - COUNTABLE(PFP_TYPE0_PACKET, UINT64, AVERAGE), - COUNTABLE(PFP_TYPE3_PACKET, UINT64, AVERAGE), - COUNTABLE(CSF_RB_WPTR_NEQ_RPTR, UINT64, AVERAGE), - COUNTABLE(CSF_I1_SIZE_NEQ_ZERO, UINT64, AVERAGE), - COUNTABLE(CSF_I2_SIZE_NEQ_ZERO, UINT64, AVERAGE), - COUNTABLE(CSF_RBI1I2_FETCHING, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter sx_counters[] = { - COUNTER(SX_PERFCOUNTER0_SELECT, SX_PERFCOUNTER0_LOW, SX_PERFCOUNTER0_HI), -}; - -// We don't have the enums for MH perfcntrs -#if 0 -static const struct fd_perfcntr_counter mh_counters[] = { - COUNTER(MH_PERFCOUNTER0_SELECT, MH_PERFCOUNTER0_LOW, MH_PERFCOUNTER0_HI), - COUNTER(MH_PERFCOUNTER1_SELECT, MH_PERFCOUNTER1_LOW, MH_PERFCOUNTER1_HI), -}; -#endif - -static const struct fd_perfcntr_counter rbbm_counters[] = { - COUNTER(RBBM_PERFCOUNTER1_SELECT, RBBM_PERFCOUNTER1_LO, RBBM_PERFCOUNTER1_HI), -}; - -static const struct fd_perfcntr_counter cp_counters[] = { - COUNTER(CP_PERFCOUNTER_SELECT, CP_PERFCOUNTER_LO, CP_PERFCOUNTER_HI), -}; - -static const struct fd_perfcntr_counter rb_counters[] = { - COUNTER(RB_PERFCOUNTER0_SELECT, RB_PERFCOUNTER0_LOW, RB_PERFCOUNTER0_HI), -}; - -const struct fd_perfcntr_group a2xx_perfcntr_groups[] = { - GROUP("PA_SU", pa_su_counters, pa_su_countables), - GROUP("PA_SC", pa_sc_counters, pa_sc_countables), - GROUP("VGT", vgt_counters, vgt_countables), - GROUP("TCR", tcr_counters, tcr_countables), - GROUP("TP0", tp0_counters, tp0_countables), - GROUP("TCM", tcm_counters, tcm_countables), - GROUP("TCF", tcf_counters, tcf_countables), - GROUP("SQ", sq_counters, sq_countables), - GROUP("SX", sx_counters, sx_countables), -// GROUP("MH", mh_counters, mh_countables), - GROUP("RBBM", rbbm_counters, rbbm_countables), - GROUP("CP", cp_counters, cp_countables), - GROUP("RB", rb_counters, rb_countables), -}; - -const unsigned a2xx_num_perfcntr_groups = ARRAY_SIZE(a2xx_perfcntr_groups); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_program.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" #include "nir/tgsi_to_nir.h" @@ -166,30 +166,15 @@ patch_vtx_fetch(struct fd_context *ctx, struct pipe_vertex_element *elem, instr_fetch_vtx_t *instr, uint16_t dst_swiz) { - struct pipe_vertex_buffer *vb = - &ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index]; - enum pipe_format format = elem->src_format; - const struct util_format_description *desc = - util_format_description(format); - unsigned j; - - /* Find the first non-VOID channel. */ - for (j = 0; j < 4; j++) - if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID) - break; - - instr->format = fd2_pipe2surface(format); - instr->num_format_all = !desc->channel[j].normalized; - instr->format_comp_all = desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED; - instr->stride = vb->stride; - instr->offset = elem->src_offset; + struct surface_format fmt = fd2_pipe2surface(elem->src_format); - unsigned swiz = 0; - for (int i = 0; i < 4; i++) { - unsigned s = dst_swiz >> i*3 & 7; - swiz |= (s >= 4 ? s : desc->swizzle[s]) << i*3; - } - instr->dst_swiz = swiz; + instr->dst_swiz = fd2_vtx_swiz(elem->src_format, dst_swiz); + instr->format_comp_all = fmt.sign == SQ_TEX_SIGN_SIGNED; + instr->num_format_all = fmt.num_format; + instr->format = fmt.format; + instr->exp_adjust_all = fmt.exp_adjust; + instr->stride = ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index].stride; + instr->offset = elem->src_offset; } static void @@ -225,11 +210,11 @@ bool binning = (ctx->batch && ring == ctx->batch->binning); unsigned variant = 0; - vp = prog->vp; + vp = prog->vs; /* find variant matching the linked fragment shader */ if (!binning) { - fp = prog->fp; + fp = prog->fs; for (variant = 1; variant < ARRAY_SIZE(vp->variant); variant++) { /* if checked all variants, compile a new variant */ if (!vp->variant[variant].info.sizedwords) { @@ -311,8 +296,8 @@ /* XXX maybe its possible to reuse patch_vtx_fetch somehow? */ prog = &ctx->solid_prog; - so = prog->vp; - ir2_compile(prog->vp, 1, prog->fp); + so = prog->vs; + ir2_compile(prog->vs, 1, prog->fs); #define IR2_FETCH_SWIZ_XY01 0xb08 #define IR2_FETCH_SWIZ_XYZ1 0xa88 @@ -329,8 +314,8 @@ instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; prog = &ctx->blit_prog[0]; - so = prog->vp; - ir2_compile(prog->vp, 1, prog->fp); + so = prog->vs; + ir2_compile(prog->vs, 1, prog->fs); info = &so->variant[1].info; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_query.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_query.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -106,7 +106,7 @@ const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, counter->counter_reg_lo | CP_MEM_TO_REG_0_ACCUMULATE); + OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); OUT_RELOCW(ring, query_sample_idx(aq, i, start)); } } @@ -133,7 +133,7 @@ const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, counter->counter_reg_lo | CP_MEM_TO_REG_0_ACCUMULATE); + OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); OUT_RELOCW(ring, query_sample_idx(aq, i, stop)); } } @@ -218,7 +218,7 @@ counters_per_group[entry->gid]++; } - q = fd_acc_create_query2(ctx, 0, &perfcntr); + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); aq = fd_acc_query(q); /* sample buffer size is based on # of queries: */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_resource.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_resource.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ uint32_t depth = prsc->depth0; for (level = 0; level <= prsc->last_level; level++) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); + struct fdl_slice *slice = fd_resource_slice(rsc, level); uint32_t blocks; /* 32 * 32 block alignment */ @@ -67,7 +67,7 @@ blocks = util_format_get_nblocks(format, width, height); /* 4k aligned size */ - slice->size0 = align(blocks * rsc->cpp, 4096); + slice->size0 = align(blocks * rsc->layout.cpp, 4096); size += slice->size0 * depth * prsc->array_size; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd2_screen.h" #include "fd2_context.h" @@ -53,27 +53,20 @@ if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) return false; - /* TODO figure out how to render to other formats.. */ if ((usage & PIPE_BIND_RENDER_TARGET) && - ((format != PIPE_FORMAT_B5G6R5_UNORM) && - (format != PIPE_FORMAT_B5G5R5A1_UNORM) && - (format != PIPE_FORMAT_B5G5R5X1_UNORM) && - (format != PIPE_FORMAT_B4G4R4A4_UNORM) && - (format != PIPE_FORMAT_B4G4R4X4_UNORM) && - (format != PIPE_FORMAT_B8G8R8A8_UNORM) && - (format != PIPE_FORMAT_B8G8R8X8_UNORM) && - (format != PIPE_FORMAT_R8G8B8A8_UNORM) && - (format != PIPE_FORMAT_R8G8B8X8_UNORM))) { - DBG("not supported render target: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; + fd2_pipe2color(format) != (enum a2xx_colorformatx)~0) { + retval |= PIPE_BIND_RENDER_TARGET; } - if ((usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_VERTEX_BUFFER)) && - (fd2_pipe2surface(format) != (enum a2xx_sq_surfaceformat)~0)) { - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_VERTEX_BUFFER); + if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) && + !util_format_is_srgb(format) && + !util_format_is_pure_integer(format) && + fd2_pipe2surface(format).format != FMT_INVALID) { + retval |= usage & PIPE_BIND_VERTEX_BUFFER; + /* the only npot blocksize supported texture format is R32G32B32_FLOAT */ + if (util_is_power_of_two_or_zero(util_format_get_blocksize(format)) || + format == PIPE_FORMAT_R32G32B32_FLOAT) + retval |= usage & PIPE_BIND_SAMPLER_VIEW; } if ((usage & (PIPE_BIND_RENDER_TARGET | @@ -106,9 +99,6 @@ return retval == usage; } -extern const struct fd_perfcntr_group a2xx_perfcntr_groups[]; -extern const unsigned a2xx_num_perfcntr_groups; - void fd2_screen_init(struct pipe_screen *pscreen) { @@ -122,10 +112,5 @@ if (fd_mesa_debug & FD_DBG_TTILE) screen->tile_mode = fd2_tile_mode; - if (fd_mesa_debug & FD_DBG_PERFC) { - screen->perfcntr_groups = a2xx_perfcntr_groups; - screen->num_perfcntr_groups = a2xx_num_perfcntr_groups; - } - fd2_emit_init_screen(pscreen); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_texture.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_texture.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -170,6 +170,7 @@ { struct fd2_pipe_sampler_view *so = CALLOC_STRUCT(fd2_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); + struct surface_format fmt = fd2_pipe2surface(cso->format); if (!so) return NULL; @@ -180,17 +181,25 @@ so->base.reference.count = 1; so->base.context = pctx; + struct fdl_slice *slice0 = fd_resource_slice(rsc, 0); so->tex0 = - A2XX_SQ_TEX_0_PITCH(rsc->slices[0].pitch) | - COND(rsc->tile_mode, A2XX_SQ_TEX_0_TILED); + A2XX_SQ_TEX_0_SIGN_X(fmt.sign) | + A2XX_SQ_TEX_0_SIGN_Y(fmt.sign) | + A2XX_SQ_TEX_0_SIGN_Z(fmt.sign) | + A2XX_SQ_TEX_0_SIGN_W(fmt.sign) | + A2XX_SQ_TEX_0_PITCH(slice0->pitch) | + COND(rsc->layout.tile_mode, A2XX_SQ_TEX_0_TILED); so->tex1 = - A2XX_SQ_TEX_1_FORMAT(fd2_pipe2surface(cso->format)) | + A2XX_SQ_TEX_1_FORMAT(fmt.format) | A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL); so->tex2 = A2XX_SQ_TEX_2_HEIGHT(prsc->height0 - 1) | A2XX_SQ_TEX_2_WIDTH(prsc->width0 - 1); - so->tex3 = fd2_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a); + so->tex3 = + A2XX_SQ_TEX_3_NUM_FORMAT(fmt.num_format) | + fd2_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, cso->swizzle_a) | + A2XX_SQ_TEX_3_EXP_ADJUST(fmt.exp_adjust); so->tex4 = A2XX_SQ_TEX_4_MIP_MIN_LEVEL(fd_sampler_first_level(cso)) | diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_util.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_util.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_util.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,188 +25,110 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd2_util.h" -enum a2xx_sq_surfaceformat -fd2_pipe2surface(enum pipe_format format) +static enum a2xx_sq_surfaceformat +pipe2surface(enum pipe_format format, struct surface_format *fmt) { - switch (format) { - /* 8-bit buffers. */ - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_A8_SNORM: - case PIPE_FORMAT_A8_UINT: - case PIPE_FORMAT_A8_SINT: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_I8_SNORM: - case PIPE_FORMAT_I8_UINT: - case PIPE_FORMAT_I8_SINT: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_L8_SNORM: - case PIPE_FORMAT_L8_UINT: - case PIPE_FORMAT_L8_SINT: - case PIPE_FORMAT_L8_SRGB: - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R8_SNORM: - case PIPE_FORMAT_R8_UINT: - case PIPE_FORMAT_R8_SINT: - return FMT_8; + const struct util_format_description *desc = util_format_description(format); - /* 16-bit buffers. */ - case PIPE_FORMAT_B5G6R5_UNORM: - return FMT_5_6_5; - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B5G5R5X1_UNORM: - return FMT_1_5_5_5; - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - return FMT_4_4_4_4; - case PIPE_FORMAT_Z16_UNORM: - return FMT_16; - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_L8A8_SNORM: - case PIPE_FORMAT_L8A8_UINT: - case PIPE_FORMAT_L8A8_SINT: - case PIPE_FORMAT_L8A8_SRGB: - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8_SNORM: - case PIPE_FORMAT_R8G8_UINT: - case PIPE_FORMAT_R8G8_SINT: - return FMT_8_8; - case PIPE_FORMAT_R16_UNORM: - case PIPE_FORMAT_R16_SNORM: - case PIPE_FORMAT_R16_UINT: - case PIPE_FORMAT_R16_SINT: - case PIPE_FORMAT_A16_UNORM: - case PIPE_FORMAT_A16_SNORM: - case PIPE_FORMAT_A16_UINT: - case PIPE_FORMAT_A16_SINT: - case PIPE_FORMAT_L16_UNORM: - case PIPE_FORMAT_L16_SNORM: - case PIPE_FORMAT_L16_UINT: - case PIPE_FORMAT_L16_SINT: - case PIPE_FORMAT_I16_UNORM: - case PIPE_FORMAT_I16_SNORM: - case PIPE_FORMAT_I16_UINT: - case PIPE_FORMAT_I16_SINT: - return FMT_16; - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_A16_FLOAT: - case PIPE_FORMAT_L16_FLOAT: - case PIPE_FORMAT_I16_FLOAT: - return FMT_16_FLOAT; + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { + switch (format) { + /* Compressed textures. */ + case PIPE_FORMAT_ETC1_RGB8: + return FMT_ETC1_RGB; + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + return FMT_DXT1; + case PIPE_FORMAT_DXT3_RGBA: + return FMT_DXT2_3; + case PIPE_FORMAT_DXT5_RGBA: + return FMT_DXT4_5; + case PIPE_FORMAT_ATC_RGB: + return FMT_ATI_TC_555_565_RGB; + case PIPE_FORMAT_ATC_RGBA_EXPLICIT: + return FMT_ATI_TC_555_565_RGBA; + case PIPE_FORMAT_ATC_RGBA_INTERPOLATED: + return FMT_ATI_TC_555_565_RGBA_INTERP; + /* YUV buffers. */ + case PIPE_FORMAT_UYVY: + return FMT_Y1_Cr_Y0_Cb; + case PIPE_FORMAT_YUYV: + return FMT_Cr_Y1_Cb_Y0; + default: + return ~0; + } + } - /* 32-bit buffers. */ - case PIPE_FORMAT_A8B8G8R8_SRGB: - case PIPE_FORMAT_A8B8G8R8_UNORM: - case PIPE_FORMAT_A8R8G8B8_UNORM: - case PIPE_FORMAT_B8G8R8A8_SRGB: - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_R8G8B8A8_SNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R8G8B8X8_UNORM: - case PIPE_FORMAT_R8SG8SB8UX8U_NORM: - case PIPE_FORMAT_X8B8G8R8_UNORM: - case PIPE_FORMAT_X8R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_SINT: - case PIPE_FORMAT_R8G8B8A8_UINT: - return FMT_8_8_8_8; - case PIPE_FORMAT_R10G10B10A2_UNORM: - case PIPE_FORMAT_R10G10B10X2_SNORM: - case PIPE_FORMAT_B10G10R10A2_UNORM: - case PIPE_FORMAT_B10G10R10A2_UINT: - case PIPE_FORMAT_R10SG10SB10SA2U_NORM: - return FMT_2_10_10_10; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return FMT_24_8; - case PIPE_FORMAT_R32_UINT: - case PIPE_FORMAT_R32_SINT: - case PIPE_FORMAT_A32_UINT: - case PIPE_FORMAT_A32_SINT: - case PIPE_FORMAT_L32_UINT: - case PIPE_FORMAT_L32_SINT: - case PIPE_FORMAT_I32_UINT: - case PIPE_FORMAT_I32_SINT: - return FMT_32; - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_A32_FLOAT: - case PIPE_FORMAT_L32_FLOAT: - case PIPE_FORMAT_I32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT: - return FMT_32_FLOAT; - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_L16A16_FLOAT: - return FMT_16_16_FLOAT; - case PIPE_FORMAT_R16G16_UNORM: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_UINT: - case PIPE_FORMAT_R16G16_SINT: - case PIPE_FORMAT_L16A16_UNORM: - case PIPE_FORMAT_L16A16_SNORM: - case PIPE_FORMAT_L16A16_UINT: - case PIPE_FORMAT_L16A16_SINT: - return FMT_16_16; - - /* 64-bit buffers. */ - case PIPE_FORMAT_R16G16B16A16_UINT: - case PIPE_FORMAT_R16G16B16A16_SINT: - case PIPE_FORMAT_R16G16B16A16_UNORM: - case PIPE_FORMAT_R16G16B16A16_SNORM: - return FMT_16_16_16_16; - case PIPE_FORMAT_R16G16B16A16_FLOAT: - return FMT_16_16_16_16_FLOAT; - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_L32A32_FLOAT: - return FMT_32_32_FLOAT; - case PIPE_FORMAT_R32G32_SINT: - case PIPE_FORMAT_R32G32_UINT: - case PIPE_FORMAT_L32A32_UINT: - case PIPE_FORMAT_L32A32_SINT: - return FMT_32_32; - - /* 96-bit buffers. */ - case PIPE_FORMAT_R32G32B32_FLOAT: - return FMT_32_32_32_FLOAT; - - /* 128-bit buffers. */ - case PIPE_FORMAT_R32G32B32A32_SNORM: - case PIPE_FORMAT_R32G32B32A32_UNORM: - case PIPE_FORMAT_R32G32B32A32_SINT: - case PIPE_FORMAT_R32G32B32A32_UINT: - return FMT_32_32_32_32; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return FMT_32_32_32_32_FLOAT; + uint32_t channel_size = 0; + for (unsigned i = 0; i < 4; i++) + channel_size |= desc->channel[i].size << i*8; + + unsigned i = util_format_get_first_non_void_channel(format); + if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED || + desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) + fmt->sign = SQ_TEX_SIGN_SIGNED; + if (!desc->channel[i].normalized) + fmt->num_format = SQ_TEX_NUM_FORMAT_INT; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) + fmt->exp_adjust = -16; + + /* Note: the 3 channel 24bpp/48bpp/96bpp formats are only for vertex fetch + * we can use the 4 channel format and ignore the 4th component just isn't used + * XXX: is it possible for the extra loaded component to cause a MMU fault? + */ + +#define CASE(r, g, b, a) case (r | g << 8 | b << 16 | a << 24) + if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) { + switch (channel_size) { + CASE(16, 0, 0, 0): return FMT_16_FLOAT; + CASE(16, 16, 0, 0): return FMT_16_16_FLOAT; + CASE(16, 16, 16, 0): return FMT_16_16_16_16_FLOAT; /* Note: only for vertex */ + CASE(16, 16, 16, 16): return FMT_16_16_16_16_FLOAT; + CASE(32, 0, 0, 0): return FMT_32_FLOAT; + CASE(32, 32, 0, 0): return FMT_32_32_FLOAT; + CASE(32, 32, 32, 0): return FMT_32_32_32_FLOAT; + CASE(32, 32, 32, 32): return FMT_32_32_32_32_FLOAT; + } + } else { + switch (channel_size) { + CASE( 8, 0, 0, 0): return FMT_8; + CASE( 8, 8, 0, 0): return FMT_8_8; + CASE( 8, 8, 8, 0): return FMT_8_8_8_8; /* Note: only for vertex */ + CASE( 8, 8, 8, 8): return FMT_8_8_8_8; + CASE(16, 0, 0, 0): return FMT_16; + CASE(16, 16, 0, 0): return FMT_16_16; + CASE(16, 16, 16, 0): return FMT_16_16_16_16; /* Note: only for vertex */ + CASE(16, 16, 16, 16): return FMT_16_16_16_16; + CASE(32, 0, 0, 0): return FMT_32; + CASE(32, 32, 0, 0): return FMT_32_32; + CASE(32, 32, 32, 0): return FMT_32_32_32_32; /* Note: only for vertex */ + CASE(32, 32, 32, 32): return FMT_32_32_32_32; + CASE( 4, 4, 4, 4): return FMT_4_4_4_4; + CASE( 5, 5, 5, 1): return FMT_1_5_5_5; + CASE( 5, 6, 5, 0): return FMT_5_6_5; + CASE(10, 10, 10, 2): return FMT_2_10_10_10; + CASE( 8, 24, 0, 0): return FMT_24_8; + CASE( 2, 3, 3, 0): return FMT_2_3_3; /* Note: R/B swapped */ + } + } +#undef CASE - /* Compressed textures. */ - case PIPE_FORMAT_ETC1_RGB8: - return FMT_ETC1_RGB; - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - return FMT_DXT1; - case PIPE_FORMAT_DXT3_RGBA: - return FMT_DXT2_3; - case PIPE_FORMAT_DXT5_RGBA: - return FMT_DXT4_5; - case PIPE_FORMAT_ATC_RGB: - return FMT_ATI_TC_555_565_RGB; - case PIPE_FORMAT_ATC_RGBA_EXPLICIT: - return FMT_ATI_TC_555_565_RGBA; - case PIPE_FORMAT_ATC_RGBA_INTERPOLATED: - return FMT_ATI_TC_555_565_RGBA_INTERP; - - /* YUV buffers. */ - case PIPE_FORMAT_UYVY: - return FMT_Cr_Y1_Cb_Y0; - case PIPE_FORMAT_YUYV: - return FMT_Y1_Cr_Y0_Cb; + return ~0; +} - default: - return ~0; - } +struct surface_format +fd2_pipe2surface(enum pipe_format format) +{ + struct surface_format fmt = { + .sign = SQ_TEX_SIGN_UNSIGNED, + .num_format = SQ_TEX_NUM_FORMAT_FRAC, + .exp_adjust = 0, + }; + fmt.format = pipe2surface(format, &fmt); + return fmt; } enum a2xx_colorformatx @@ -214,24 +136,10 @@ { switch (format) { /* 8-bit buffers. */ - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_A8_SNORM: - case PIPE_FORMAT_A8_UINT: - case PIPE_FORMAT_A8_SINT: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_I8_SNORM: - case PIPE_FORMAT_I8_UINT: - case PIPE_FORMAT_I8_SINT: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_L8_SNORM: - case PIPE_FORMAT_L8_UINT: - case PIPE_FORMAT_L8_SINT: - case PIPE_FORMAT_L8_SRGB: case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R8_SNORM: - case PIPE_FORMAT_R8_UINT: - case PIPE_FORMAT_R8_SINT: return COLORX_8; + case PIPE_FORMAT_B2G3R3_UNORM: + return COLORX_2_3_3; /* note: untested */ /* 16-bit buffers. */ case PIPE_FORMAT_B5G6R5_UNORM: @@ -242,60 +150,31 @@ case PIPE_FORMAT_B4G4R4A4_UNORM: case PIPE_FORMAT_B4G4R4X4_UNORM: return COLORX_4_4_4_4; - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_L8A8_SNORM: - case PIPE_FORMAT_L8A8_UINT: - case PIPE_FORMAT_L8A8_SINT: - case PIPE_FORMAT_L8A8_SRGB: case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8_SNORM: - case PIPE_FORMAT_R8G8_UINT: - case PIPE_FORMAT_R8G8_SINT: - case PIPE_FORMAT_Z16_UNORM: return COLORX_8_8; - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_A16_FLOAT: - case PIPE_FORMAT_L16_FLOAT: - case PIPE_FORMAT_I16_FLOAT: - return COLORX_16_FLOAT; /* 32-bit buffers. */ - case PIPE_FORMAT_A8B8G8R8_SRGB: - case PIPE_FORMAT_A8B8G8R8_UNORM: - case PIPE_FORMAT_A8R8G8B8_UNORM: - case PIPE_FORMAT_B8G8R8A8_SRGB: case PIPE_FORMAT_B8G8R8A8_UNORM: case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_R8G8B8A8_SNORM: case PIPE_FORMAT_R8G8B8A8_UNORM: case PIPE_FORMAT_R8G8B8X8_UNORM: - case PIPE_FORMAT_R8SG8SB8UX8U_NORM: - case PIPE_FORMAT_X8B8G8R8_UNORM: - case PIPE_FORMAT_X8R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_SINT: - case PIPE_FORMAT_R8G8B8A8_UINT: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: return COLORX_8_8_8_8; - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_A32_FLOAT: - case PIPE_FORMAT_L32_FLOAT: - case PIPE_FORMAT_I32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT: - return COLORX_32_FLOAT; + /* Note: snorm untested */ + case PIPE_FORMAT_R8G8B8A8_SNORM: + case PIPE_FORMAT_R8G8B8X8_SNORM: + return COLORX_S8_8_8_8; + + /* float buffers */ + case PIPE_FORMAT_R16_FLOAT: + return COLORX_16_FLOAT; case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_L16A16_FLOAT: return COLORX_16_16_FLOAT; - - /* 64-bit buffers. */ case PIPE_FORMAT_R16G16B16A16_FLOAT: return COLORX_16_16_16_16_FLOAT; + case PIPE_FORMAT_R32_FLOAT: + return COLORX_32_FLOAT; case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_L32A32_FLOAT: return COLORX_32_32_FLOAT; - - /* 128-bit buffers. */ case PIPE_FORMAT_R32G32B32A32_FLOAT: return COLORX_32_32_32_32_FLOAT; @@ -335,3 +214,18 @@ A2XX_SQ_TEX_3_SWIZ_Z(tex_swiz(rswiz[2])) | A2XX_SQ_TEX_3_SWIZ_W(tex_swiz(rswiz[3])); } + +uint32_t +fd2_vtx_swiz(enum pipe_format format, unsigned swizzle) +{ + const struct util_format_description *desc = + util_format_description(format); + unsigned char swiz[4], rswiz[4]; + + for (unsigned i = 0; i < 4; i++) + swiz[i] = (swizzle >> i * 3) & 7; + + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + + return rswiz[0] | rswiz[1] << 3 | rswiz[2] << 6 | rswiz[3] << 9; +} diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_util.h mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_util.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/fd2_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/fd2_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,10 +31,19 @@ #include "a2xx.xml.h" -enum a2xx_sq_surfaceformat fd2_pipe2surface(enum pipe_format format); +struct surface_format { +#define FMT_INVALID 0x7f + enum a2xx_sq_surfaceformat format : 7; + enum sq_tex_sign sign : 2; + enum sq_tex_num_format num_format : 1; + int exp_adjust : 6; +}; + +struct surface_format fd2_pipe2surface(enum pipe_format format); enum a2xx_colorformatx fd2_pipe2color(enum pipe_format format); uint32_t fd2_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); +uint32_t fd2_vtx_swiz(enum pipe_format format, unsigned swizzle); /* convert x,y to dword */ static inline uint32_t xy2d(uint16_t x, uint16_t y) diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h 2020-06-12 01:21:17.000000000 +0000 @@ -372,8 +372,8 @@ uint8_t signed_rf_mode_all : 1; uint8_t reserved1 : 1; instr_surf_fmt_t format : 6; - uint8_t reserved2 : 1; - uint8_t exp_adjust_all : 7; + uint8_t reserved2 : 2; + uint8_t exp_adjust_all : 6; uint8_t reserved3 : 1; uint8_t pred_select : 1; /* dword2: */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2.c 2020-06-12 01:21:17.000000000 +0000 @@ -133,6 +133,12 @@ sched = s; } *comp = ffs(mask) - 1; + + if (sched) { + for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++) + s->reg_state[reg_idx/8] |= 1 << (*comp+reg_idx%8*4); + } + return sched; } @@ -258,6 +264,22 @@ is_ok &= !ctx->instr[src->num].need_emit; } } + /* don't reorder non-ssa write before read */ + if (!instr->is_ssa) { + ir2_foreach_instr(p, ctx) { + if (!p->need_emit || p->idx >= instr->idx) + continue; + + ir2_foreach_src(src, p) { + if (get_reg_src(ctx, src) == instr->reg) + is_ok = false; + } + } + } + /* don't reorder across predicates */ + if (avail_count && instr->pred != avail[0]->pred) + is_ok = false; + if (!is_ok) continue; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2_cp.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2_cp.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2_cp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2_cp.c 2020-06-12 01:21:17.000000000 +0000 @@ -68,6 +68,9 @@ if (!is_mov(p)) break; + if (p->alu.saturate) + break; + /* cant apply abs to const src, const src only for alu */ if (p->src[0].type == IR2_SRC_CONST && (src->abs || instr->type != IR2_ALU)) @@ -200,8 +203,9 @@ p->is_ssa = true; p->ssa.ncomp = 0; memset(p->ssa.comp, 0, sizeof(p->ssa.comp)); + p->alu.saturate |= instr->alu.saturate; - switch (instr->alu.vector_opc) { + switch (p->alu.vector_opc) { case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv: case DOT2ADDv: case DOT3v: diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2_nir.c mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2_nir.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a2xx/ir2_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a2xx/ir2_nir.c 2020-06-12 01:21:17.000000000 +0000 @@ -42,6 +42,7 @@ .lower_bitops = true, .lower_rotate = true, .lower_vector_cmp = true, + .lower_fdph = true, }; const nir_shader_compiler_options * @@ -503,14 +504,6 @@ assert(slot >= 0); switch (slot) { - case VARYING_SLOT_PNTC: - /* need to extract with abs and invert y */ - instr = instr_create_alu_dest(ctx, nir_op_ffma, dst); - instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); - instr->src[0].abs = true; - instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2); - instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2); - break; case VARYING_SLOT_POS: /* need to extract xy with abs and add tile offset on a20x * zw from fragcoord input (w inverted in fragment shader) @@ -639,6 +632,13 @@ instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); instr->src[1] = ir2_zero(ctx); break; + case nir_intrinsic_load_point_coord: + /* param.zw (note: abs might be needed like fragcoord in param.xy?) */ + ctx->so->need_param = true; + + instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); + instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); + break; default: compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic); break; @@ -760,11 +760,6 @@ if (ctx->so->type != MESA_SHADER_FRAGMENT) compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); - if (slot == VARYING_SLOT_PNTC) { - so->need_param = true; - return; - } - n = ctx->f->inputs_count++; /* half of fragcoord from param reg, half from a varying */ @@ -1062,6 +1057,29 @@ ir2_optimize_nir(ctx->nir, false); } +static bool +ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + switch (alu->op) { + case nir_op_frsq: + case nir_op_frcp: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsqrt: + case nir_op_fcos: + case nir_op_fsin: + return true; + default: + break; + } + + return false; +} + void ir2_nir_compile(struct ir2_context *ctx, bool binning) { @@ -1084,17 +1102,7 @@ OPT_V(ctx->nir, nir_opt_algebraic_late); OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods); - /* TODO: static bitset ? */ - BITSET_DECLARE(scalar_ops, nir_num_opcodes); - BITSET_ZERO(scalar_ops); - BITSET_SET(scalar_ops, nir_op_frsq); - BITSET_SET(scalar_ops, nir_op_frcp); - BITSET_SET(scalar_ops, nir_op_flog2); - BITSET_SET(scalar_ops, nir_op_fexp2); - BITSET_SET(scalar_ops, nir_op_fsqrt); - BITSET_SET(scalar_ops, nir_op_fcos); - BITSET_SET(scalar_ops, nir_op_fsin); - OPT_V(ctx->nir, nir_lower_alu_to_scalar, scalar_ops); + OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL); OPT_V(ctx->nir, nir_lower_locals_to_regs); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -98,21 +98,14 @@ else rt = &cso->rt[0]; - so->rb_mrt[i].blend_control_rgb = + so->rb_mrt[i].blend_control = A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)); - - so->rb_mrt[i].blend_control_alpha = + A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | A3XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | A3XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | A3XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - so->rb_mrt[i].blend_control_no_alpha_rgb = - A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor))); - so->rb_mrt[i].control = A3XX_RB_MRT_CONTROL_ROP_CODE(rop) | A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.h mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -36,12 +36,7 @@ struct pipe_blend_state base; uint32_t rb_render_control; struct { - /* Blend control bits for color if there is an alpha channel */ - uint32_t blend_control_rgb; - /* Blend control bits for color if there is no alpha channel */ - uint32_t blend_control_no_alpha_rgb; - /* Blend control bits for alpha channel */ - uint32_t blend_control_alpha; + uint32_t blend_control; uint32_t control; } rb_mrt[A3XX_MAX_RENDER_TARGETS]; }; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_state.h" #include "freedreno_resource.h" @@ -141,7 +141,7 @@ .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, }; - if (fd3_needs_manual_clipping(ctx->prog.vp, ctx->rasterizer)) + if (fd3_needs_manual_clipping(ctx->prog.vs, ctx->rasterizer)) emit.key.ucp_enables = ctx->rasterizer->clip_plane_enable; fixup_shader_state(ctx, &emit.key); @@ -165,8 +165,8 @@ /* and now binning pass: */ emit.binning_pass = true; emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vp = NULL; /* we changed key so need to refetch vp */ - emit.fp = NULL; + emit.vs = NULL; /* we changed key so need to refetch vs */ + emit.fs = NULL; draw_impl(ctx, ctx->batch->binning, &emit, index_offset); fd_context_all_clean(ctx); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_helpers.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_viewport.h" #include "freedreno_resource.h" @@ -215,8 +215,7 @@ unsigned end = fd_sampler_last_level(&view->base); for (j = 0; j < (end - start + 1); j++) { - struct fd_resource_slice *slice = - fd_resource_slice(rsc, j + start); + struct fdl_slice *slice = fd_resource_slice(rsc, j + start); OUT_RELOC(ring, rsc->bo, slice->offset, 0, 0); } } @@ -311,18 +310,19 @@ /* note: PIPE_BUFFER disallowed for surfaces */ unsigned lvl = psurf[i]->u.tex.level; - struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); + struct fdl_slice *slice = fd_resource_slice(rsc, lvl); debug_assert(psurf[i]->u.tex.first_layer == psurf[i]->u.tex.last_layer); - OUT_RING(ring, A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(format)) | + OUT_RING(ring, A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | + A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(format)) | A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) | fd3_tex_swiz(format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); OUT_RING(ring, A3XX_TEX_CONST_1_FETCHSIZE(TFETCH_DISABLE) | A3XX_TEX_CONST_1_WIDTH(psurf[i]->width) | A3XX_TEX_CONST_1_HEIGHT(psurf[i]->height)); - OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp) | + OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(slice->pitch * rsc->layout.cpp) | A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); OUT_RING(ring, 0x00000000); } @@ -372,9 +372,6 @@ continue; if (vp->inputs[i].sysval) { switch(vp->inputs[i].slot) { - case SYSTEM_VALUE_FIRST_VERTEX: - /* handled elsewhere */ - break; case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: vertex_regid = vp->inputs[i].regid; break; @@ -440,7 +437,7 @@ COND(isint, A3XX_VFD_DECODE_INSTR_INT) | COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); - total_in += vp->inputs[i].ncomp; + total_in += util_bitcount(vp->inputs[i].compmask); j++; } } @@ -738,7 +735,6 @@ bool is_int = util_format_is_pure_integer(format); bool has_alpha = util_format_has_alpha(format); uint32_t control = blend->rb_mrt[i].control; - uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha; if (is_int) { control &= (A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK | @@ -749,10 +745,7 @@ if (format == PIPE_FORMAT_NONE) control &= ~A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; - if (has_alpha) { - blend_control |= blend->rb_mrt[i].blend_control_rgb; - } else { - blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb; + if (!has_alpha) { control &= ~A3XX_RB_MRT_CONTROL_BLEND2; } @@ -772,7 +765,7 @@ OUT_RING(ring, control); OUT_PKT0(ring, REG_A3XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend_control | + OUT_RING(ring, blend->rb_mrt[i].blend_control | COND(!is_float, A3XX_RB_MRT_BLEND_CONTROL_CLAMP_ENABLE)); } } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.h mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_emit.h 2020-06-12 01:21:17.000000000 +0000 @@ -54,35 +54,35 @@ bool rasterflat; /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vs, *fs; }; static inline const struct ir3_shader_variant * fd3_emit_get_vp(struct fd3_emit *emit) { - if (!emit->vp) { - struct ir3_shader *shader = emit->prog->vp; - emit->vp = ir3_shader_variant(shader, emit->key, + if (!emit->vs) { + struct ir3_shader *shader = emit->prog->vs; + emit->vs = ir3_shader_variant(shader, emit->key, emit->binning_pass, emit->debug); } - return emit->vp; + return emit->vs; } static inline const struct ir3_shader_variant * fd3_emit_get_fp(struct fd3_emit *emit) { - if (!emit->fp) { + if (!emit->fs) { if (emit->binning_pass) { /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - emit->fp = &binning_fp; + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; } else { - struct ir3_shader *shader = emit->prog->fp; - emit->fp = ir3_shader_variant(shader, emit->key, + struct ir3_shader *shader = emit->prog->fs; + emit->fs = ir3_shader_variant(shader, emit->key, false, emit->debug); } } - return emit->fp; + return emit->fs; } void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_format.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_format.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd3_format.h" @@ -78,7 +78,7 @@ VT(R8_UINT, 8_UINT, R8_UINT, WZYX), VT(R8_SINT, 8_SINT, R8_SINT, WZYX), V_(R8_USCALED, 8_UINT, NONE, WZYX), - V_(R8_SSCALED, 8_UINT, NONE, WZYX), + V_(R8_SSCALED, 8_SINT, NONE, WZYX), _T(A8_UNORM, 8_UNORM, A8_UNORM, WZYX), _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), @@ -99,7 +99,7 @@ VT(R16_UINT, 16_UINT, R16_UINT, WZYX), VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), - V_(R16_SSCALED, 16_UINT, NONE, WZYX), + V_(R16_SSCALED, 16_SINT, NONE, WZYX), VT(R16_FLOAT, 16_FLOAT, R16_FLOAT,WZYX), _T(A16_UINT, 16_UINT, NONE, WZYX), @@ -137,7 +137,7 @@ VT(R32_UINT, 32_UINT, R32_UINT, WZYX), VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), - V_(R32_SSCALED, 32_UINT, NONE, WZYX), + V_(R32_SSCALED, 32_SINT, NONE, WZYX), VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_format.h mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_format.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_format.h 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #ifndef FD3_FORMAT_H_ #define FD3_FORMAT_H_ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_util.h" #include "a3xx.xml.h" diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_draw.h" #include "freedreno_state.h" @@ -43,7 +43,7 @@ static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w, + struct pipe_surface **bufs, const uint32_t *bases, uint32_t bin_w, bool decode_srgb) { enum a3xx_tile_mode tile_mode; @@ -55,7 +55,7 @@ enum a3xx_color_swap swap = WZYX; bool srgb = false; struct fd_resource *rsc = NULL; - struct fd_resource_slice *slice = NULL; + struct fdl_slice *slice = NULL; uint32_t stride = 0; uint32_t base = 0; uint32_t offset = 0; @@ -91,17 +91,17 @@ offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - swap = rsc->tile_mode ? WZYX : fd3_pipe2swap(pformat); + swap = rsc->layout.tile_mode ? WZYX : fd3_pipe2swap(pformat); if (bin_w) { - stride = bin_w * rsc->cpp; + stride = bin_w * rsc->layout.cpp; if (bases) { base = bases[i]; } } else { - stride = slice->pitch * rsc->cpp; - tile_mode = rsc->tile_mode; + stride = slice->pitch * rsc->layout.cpp; + tile_mode = rsc->layout.tile_mode; } } else if (i < nr_bufs && bases) { base = bases[i]; @@ -129,7 +129,7 @@ static bool use_hw_binning(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; /* workaround: combining scissor optimization and hw binning * seems problematic. Seems like we end up with a mismatch @@ -163,7 +163,7 @@ emit_binning_workaround(struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; struct fd3_emit emit = { .debug = &ctx->debug, @@ -330,7 +330,7 @@ format = rsc->base.format; } - struct fd_resource_slice *slice = fd_resource_slice(rsc, psurf->u.tex.level); + struct fdl_slice *slice = fd_resource_slice(rsc, psurf->u.tex.level); uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); @@ -345,8 +345,8 @@ A3XX_RB_COPY_CONTROL_DEPTH32_RESOLVE)); OUT_RELOCW(ring, rsc->bo, offset, 0, -1); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp)); - OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | + OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->layout.cpp)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(rsc->layout.tile_mode) | A3XX_RB_COPY_DEST_INFO_FORMAT(fd3_pipe2color(format)) | A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | @@ -357,10 +357,11 @@ } static void -fd3_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) +fd3_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd3_emit emit = { .debug = &ctx->debug, @@ -419,7 +420,7 @@ OUT_RING(ring, A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(ctx->gmem.bin_w)); + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(batch->gmem_state->bin_w)); OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | @@ -451,10 +452,10 @@ struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); if (!rsc->stencil || batch->resolve & FD_BUFFER_DEPTH) emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, false, - ctx->gmem.zsbuf_base[0], pfb->zsbuf); + gmem->zsbuf_base[0], pfb->zsbuf); if (rsc->stencil && batch->resolve & FD_BUFFER_STENCIL) emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, true, - ctx->gmem.zsbuf_base[1], pfb->zsbuf); + gmem->zsbuf_base[1], pfb->zsbuf); } if (batch->resolve & FD_BUFFER_COLOR) { @@ -464,7 +465,7 @@ if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) continue; emit_gmem2mem_surf(batch, RB_COPY_RESOLVE, false, - ctx->gmem.cbuf_base[i], pfb->cbufs[i]); + gmem->cbuf_base[i], pfb->cbufs[i]); } } @@ -482,7 +483,7 @@ /* transfer from system memory to gmem */ static void -emit_mem2gmem_surf(struct fd_batch *batch, uint32_t bases[], +emit_mem2gmem_surf(struct fd_batch *batch, const uint32_t bases[], struct pipe_surface **psurf, uint32_t bufs, uint32_t bin_w) { struct fd_ringbuffer *ring = batch->gmem; @@ -512,7 +513,7 @@ OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); OUT_RING(ring, A3XX_RB_DEPTH_INFO_DEPTH_BASE(bases[0]) | A3XX_RB_DEPTH_INFO_DEPTH_FORMAT(DEPTHX_32)); - OUT_RING(ring, A3XX_RB_DEPTH_PITCH(4 * batch->ctx->gmem.bin_w)); + OUT_RING(ring, A3XX_RB_DEPTH_PITCH(4 * batch->gmem_state->bin_w)); if (psurf[0]->format == PIPE_FORMAT_Z32_FLOAT) { OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(0), 1); @@ -538,10 +539,10 @@ } static void -fd3_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) +fd3_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd3_emit emit = { @@ -667,7 +668,7 @@ if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1]; - emit.fp = NULL; /* frag shader changed so clear cache */ + emit.fs = NULL; /* frag shader changed so clear cache */ fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); } @@ -688,7 +689,7 @@ emit.prog = &ctx->blit_zs; emit.key.half_precision = false; } - emit.fp = NULL; /* frag shader changed so clear cache */ + emit.fs = NULL; /* frag shader changed so clear cache */ fd3_program_emit(ring, &emit, 1, &pfb->zsbuf); emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); } @@ -738,7 +739,10 @@ struct pipe_surface *psurf = pfb->cbufs[i]; if (!psurf) continue; - pitch = fd_resource(psurf->texture)->slices[psurf->u.tex.level].pitch; + struct fdl_slice *slice = + fd_resource_slice(fd_resource(psurf->texture), + psurf->u.tex.level); + pitch = slice->pitch; } fd3_emit_restore(batch, ring); @@ -774,6 +778,7 @@ update_vsc_pipe(struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = batch->gmem; int i; @@ -782,10 +787,10 @@ OUT_RELOCW(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ for (i = 0; i < 8; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - if (!pipe->bo) { - pipe->bo = fd_bo_new(ctx->dev, 0x40000, + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x40000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); } @@ -794,8 +799,8 @@ A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | A3XX_VSC_PIPE_CONFIG_W(pipe->w) | A3XX_VSC_PIPE_CONFIG_H(pipe->h)); - OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE[i].DATA_ADDRESS */ - OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE[i].DATA_LENGTH */ + OUT_RELOCW(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE[i].DATA_ADDRESS */ + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE[i].DATA_LENGTH */ } } @@ -803,7 +808,7 @@ emit_binning_pass(struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd_ringbuffer *ring = batch->gmem; int i; @@ -932,7 +937,7 @@ { struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; uint32_t rb_render_control; fd3_emit_restore(batch, ring); @@ -968,7 +973,7 @@ /* before mem2gmem */ static void -fd3_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) +fd3_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; @@ -981,12 +986,12 @@ /* before IB to rendering cmds: */ static void -fd3_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) +fd3_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; uint32_t x1 = tile->xoff; @@ -1004,18 +1009,19 @@ OUT_RING(ring, reg); if (pfb->zsbuf) { struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - OUT_RING(ring, A3XX_RB_DEPTH_PITCH(rsc->cpp * gmem->bin_w)); + OUT_RING(ring, A3XX_RB_DEPTH_PITCH(rsc->layout.cpp * gmem->bin_w)); if (rsc->stencil) { OUT_PKT0(ring, REG_A3XX_RB_STENCIL_INFO, 2); OUT_RING(ring, A3XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); - OUT_RING(ring, A3XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w)); + OUT_RING(ring, A3XX_RB_STENCIL_PITCH(rsc->stencil->layout.cpp * gmem->bin_w)); } } else { OUT_RING(ring, 0x00000000); } if (use_hw_binning(batch)) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; assert(pipe->w && pipe->h); @@ -1028,7 +1034,7 @@ OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, pipe_bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ OUT_RELOCW(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ (tile->p * 4), 0, 0); } else { diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_program.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_program.h" @@ -38,43 +38,6 @@ #include "fd3_texture.h" #include "fd3_format.h" -static struct ir3_shader * -create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, - gl_shader_stage type) -{ - struct fd_context *ctx = fd_context(pctx); - struct ir3_compiler *compiler = ctx->screen->compiler; - return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen); -} - -static void * -fd3_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT); -} - -static void -fd3_fp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - -static void * -fd3_vp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX); -} - -static void -fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - bool fd3_needs_manual_clipping(const struct ir3_shader *shader, const struct pipe_rasterizer_state *rast) @@ -211,7 +174,7 @@ face_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRONT_FACE); coord_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD); zwcoord_regid = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2); - vcoord_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PIXEL); + vcoord_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); /* adjust regids for alpha output formats. there is no alpha render * format, so it's just treated like red @@ -358,7 +321,7 @@ OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4); for (i = 0; i < 4; i++) { uint32_t mrt_reg = A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) | - COND(fp->key.half_precision, A3XX_SP_FS_MRT_REG_HALF_PRECISION); + COND(color_regid[i] & HALF_REG_ID, A3XX_SP_FS_MRT_REG_HALF_PRECISION); if (i < nr) { enum pipe_format fmt = pipe_surface_format(bufs[i]); @@ -485,11 +448,6 @@ void fd3_prog_init(struct pipe_context *pctx) { - pctx->create_fs_state = fd3_fp_state_create; - pctx->delete_fs_state = fd3_fp_state_delete; - - pctx->create_vs_state = fd3_vp_state_create; - pctx->delete_vs_state = fd3_vp_state_delete; - + ir3_prog_init(pctx); fd_prog_init(pctx); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_resource.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_resource.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,20 +37,34 @@ uint32_t depth = prsc->depth0; for (level = 0; level <= prsc->last_level; level++) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); + struct fdl_slice *slice = fd_resource_slice(rsc, level); uint32_t blocks; - if (rsc->tile_mode) { - width = util_next_power_of_two(width); - height = util_next_power_of_two(height); - uint32_t tpitch = width * rsc->cpp; - slice->pitch = (tpitch > 32) ? tpitch : 32; + if (rsc->layout.tile_mode) { + if (prsc->target != PIPE_TEXTURE_CUBE) { + if (level == 0) { + width = util_next_power_of_two(width); + height = util_next_power_of_two(height); + } + width = MAX2(width, 8); + height = MAX2(height, 4); + // Multiplying by 4 is the result of the 4x4 tiling pattern. + slice->pitch = width * 4; + blocks = util_format_get_nblocks(format, width, height); + } else { + uint32_t twidth, theight; + twidth = align(width, 8); + theight = align(height, 4); + // Multiplying by 4 is the result of the 4x4 tiling pattern. + slice->pitch = twidth * 4; + blocks = util_format_get_nblocks(format, twidth, theight); + } } else { slice->pitch = width = align(width, pitchalign); + blocks = util_format_get_nblocks(format, slice->pitch, height); } slice->offset = size; - blocks = util_format_get_nblocks(format, slice->pitch, height); /* 1d array and 2d array textures must all have the same layer size * for each miplevel on a3xx. 3d textures can have different layer * sizes for high levels, but the hw auto-sizer is buggy (or at least @@ -59,12 +73,12 @@ */ if (prsc->target == PIPE_TEXTURE_3D && ( level == 1 || - (level > 1 && rsc->slices[level - 1].size0 > 0xf000))) - slice->size0 = align(blocks * rsc->cpp, alignment); + (level > 1 && fd_resource_slice(rsc, level - 1)->size0 > 0xf000))) + slice->size0 = align(blocks * rsc->layout.cpp, alignment); else if (level == 0 || alignment == 1) - slice->size0 = align(blocks * rsc->cpp, alignment); + slice->size0 = align(blocks * rsc->layout.cpp, alignment); else - slice->size0 = rsc->slices[level - 1].size0; + slice->size0 = fd_resource_slice(rsc, level - 1)->size0; size += slice->size0 * depth * prsc->array_size; @@ -96,16 +110,13 @@ } static bool -ok_format(enum pipe_format pfmt, const struct pipe_resource * tmpl) +ok_format(enum pipe_format pfmt) { enum a3xx_color_fmt fmt = fd3_pipe2color(pfmt); if (fmt == ~0) return false; - if (tmpl->target == PIPE_TEXTURE_CUBE) - return false; - switch (pfmt) { case PIPE_FORMAT_R8_UINT: case PIPE_FORMAT_R8_SINT: @@ -121,7 +132,7 @@ unsigned fd3_tile_mode(const struct pipe_resource *tmpl) { - if (ok_format(tmpl->format, tmpl)) + if (ok_format(tmpl->format)) return TILE_4X4; return LINEAR; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd3_screen.h" #include "fd3_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_texture.c mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_texture.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a3xx/fd3_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a3xx/fd3_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd3_texture.h" #include "fd3_format.h" @@ -127,13 +127,20 @@ if (cso->compare_mode) so->texsamp0 |= A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ + so->texsamp1 = A3XX_TEX_SAMP_1_LOD_BIAS(cso->lod_bias); + if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { - so->texsamp1 = - A3XX_TEX_SAMP_1_LOD_BIAS(cso->lod_bias) | - A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A3XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); + so->texsamp1 |= + A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A3XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); } else { - so->texsamp1 = 0x00000000; + /* If we're not doing mipmap filtering, we still need a slightly > 0 + * LOD clamp so the HW can decide between min and mag filtering of + * level 0. + */ + so->texsamp1 |= + A3XX_TEX_SAMP_1_MIN_LOD(MIN2(cso->min_lod, 0.125)) | + A3XX_TEX_SAMP_1_MAX_LOD(MIN2(cso->max_lod, 0.125)); } return so; @@ -215,7 +222,6 @@ struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); unsigned lvl; - uint32_t sz2 = 0; if (!so) return NULL; @@ -227,7 +233,7 @@ so->base.context = pctx; so->texconst0 = - A3XX_TEX_CONST_0_TILE_MODE(rsc->tile_mode) | + A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | A3XX_TEX_CONST_0_TYPE(tex_type(prsc->target)) | A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(cso->format)) | fd3_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, @@ -257,22 +263,22 @@ A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); } /* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */ + struct fdl_slice *slice = fd_resource_slice(rsc, lvl); so->texconst2 = - A3XX_TEX_CONST_2_PITCH(fd3_pipe2nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp); + A3XX_TEX_CONST_2_PITCH(fd3_pipe2nblocksx(cso->format, slice->pitch) * rsc->layout.cpp); switch (prsc->target) { case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: so->texconst3 = A3XX_TEX_CONST_3_DEPTH(prsc->array_size - 1) | - A3XX_TEX_CONST_3_LAYERSZ1(rsc->slices[0].size0); + A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); break; case PIPE_TEXTURE_3D: so->texconst3 = A3XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | - A3XX_TEX_CONST_3_LAYERSZ1(rsc->slices[lvl].size0); - while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0) - sz2 = rsc->slices[++lvl].size0; - so->texconst3 |= A3XX_TEX_CONST_3_LAYERSZ2(sz2); + A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); + so->texconst3 |= A3XX_TEX_CONST_3_LAYERSZ2( + fd_resource_slice(rsc, prsc->last_level)->size0); break; default: so->texconst3 = 0x00000000; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -97,22 +97,14 @@ else rt = &cso->rt[0]; - so->rb_mrt[i].blend_control_rgb = + so->rb_mrt[i].blend_control = A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)); - - so->rb_mrt[i].blend_control_alpha = + A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - so->rb_mrt[i].blend_control_no_alpha_rgb = - A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor))); - - so->rb_mrt[i].control = A4XX_RB_MRT_CONTROL_ROP_CODE(rop) | COND(cso->logicop_enable, A4XX_RB_MRT_CONTROL_ROP_ENABLE) | diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.h mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -37,12 +37,7 @@ struct { uint32_t control; uint32_t buf_info; - /* Blend control bits for color if there is an alpha channel */ - uint32_t blend_control_rgb; - /* Blend control bits for color if there is no alpha channel */ - uint32_t blend_control_no_alpha_rgb; - /* Blend control bits for alpha channel */ - uint32_t blend_control_alpha; + uint32_t blend_control; } rb_mrt[A4XX_MAX_RENDER_TARGETS]; uint32_t rb_fs_output; }; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -170,8 +170,8 @@ /* and now binning pass: */ emit.binning_pass = true; emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vp = NULL; /* we changed key so need to refetch vp */ - emit.fp = NULL; + emit.vs = NULL; /* we changed key so need to refetch vs */ + emit.fs = NULL; draw_impl(ctx, ctx->batch->binning, &emit, index_offset); fd_context_all_clean(ctx); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_helpers.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_viewport.h" #include "freedreno_resource.h" @@ -301,7 +301,7 @@ /* note: PIPE_BUFFER disallowed for surfaces */ unsigned lvl = bufs[i]->u.tex.level; - struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl); + struct fdl_slice *slice = fd_resource_slice(rsc, lvl); unsigned offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer); /* z32 restore is accomplished using depth write. If there is @@ -323,7 +323,7 @@ PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) | A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height)); - OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp) | + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->layout.cpp) | A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(format))); OUT_RING(ring, 0x00000000); OUT_RELOC(ring, rsc->bo, offset, 0, 0); @@ -376,9 +376,6 @@ continue; if (vp->inputs[i].sysval) { switch(vp->inputs[i].slot) { - case SYSTEM_VALUE_FIRST_VERTEX: - /* handled elsewhere */ - break; case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: vertex_regid = vp->inputs[i].regid; break; @@ -443,7 +440,7 @@ COND(isint, A4XX_VFD_DECODE_INSTR_INT) | COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT)); - total_in += vp->inputs[i].ncomp; + total_in += util_bitcount(vp->inputs[i].compmask); j++; } } @@ -700,17 +697,13 @@ bool is_int = util_format_is_pure_integer(format); bool has_alpha = util_format_has_alpha(format); uint32_t control = blend->rb_mrt[i].control; - uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha; if (is_int) { control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); } - if (has_alpha) { - blend_control |= blend->rb_mrt[i].blend_control_rgb; - } else { - blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb; + if (!has_alpha) { control &= ~A4XX_RB_MRT_CONTROL_BLEND2; } @@ -718,7 +711,7 @@ OUT_RING(ring, control); OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend_control); + OUT_RING(ring, blend->rb_mrt[i].blend_control); } OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.h mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_emit.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,7 +55,7 @@ bool no_decode_srgb; /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vs, *fs; /* TODO: other shader stages.. */ }; @@ -69,29 +69,29 @@ static inline const struct ir3_shader_variant * fd4_emit_get_vp(struct fd4_emit *emit) { - if (!emit->vp) { - struct ir3_shader *shader = emit->prog->vp; - emit->vp = ir3_shader_variant(shader, emit->key, + if (!emit->vs) { + struct ir3_shader *shader = emit->prog->vs; + emit->vs = ir3_shader_variant(shader, emit->key, emit->binning_pass, emit->debug); } - return emit->vp; + return emit->vs; } static inline const struct ir3_shader_variant * fd4_emit_get_fp(struct fd4_emit *emit) { - if (!emit->fp) { + if (!emit->fs) { if (emit->binning_pass) { /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - emit->fp = &binning_fp; + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; } else { - struct ir3_shader *shader = emit->prog->fp; - emit->fp = ir3_shader_variant(shader, emit->key, + struct ir3_shader *shader = emit->prog->fs; + emit->fs = ir3_shader_variant(shader, emit->key, false, emit->debug); } } - return emit->fp; + return emit->fs; } void fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_format.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_format.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd4_format.h" @@ -81,7 +81,7 @@ VT(R8_UINT, 8_UINT, R8_UINT, WZYX), VT(R8_SINT, 8_SINT, R8_SINT, WZYX), V_(R8_USCALED, 8_UINT, NONE, WZYX), - V_(R8_SSCALED, 8_UINT, NONE, WZYX), + V_(R8_SSCALED, 8_SINT, NONE, WZYX), _T(A8_UNORM, 8_UNORM, A8_UNORM, WZYX), _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), @@ -102,7 +102,7 @@ VT(R16_UINT, 16_UINT, R16_UINT, WZYX), VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), - V_(R16_SSCALED, 16_UINT, NONE, WZYX), + V_(R16_SSCALED, 16_SINT, NONE, WZYX), VT(R16_FLOAT, 16_FLOAT, R16_FLOAT, WZYX), _T(A16_UNORM, 16_UNORM, NONE, WZYX), @@ -145,7 +145,7 @@ VT(R32_UINT, 32_UINT, R32_UINT, WZYX), VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), - V_(R32_SSCALED, 32_UINT, NONE, WZYX), + V_(R32_SSCALED, 32_SINT, NONE, WZYX), VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_draw.h" #include "freedreno_state.h" @@ -44,7 +44,7 @@ static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, uint32_t *bases, + struct pipe_surface **bufs, const uint32_t *bases, uint32_t bin_w, bool decode_srgb) { enum a4xx_tile_mode tile_mode; @@ -61,7 +61,7 @@ enum a3xx_color_swap swap = WZYX; bool srgb = false; struct fd_resource *rsc = NULL; - struct fd_resource_slice *slice = NULL; + struct fdl_slice *slice = NULL; uint32_t stride = 0; uint32_t base = 0; uint32_t offset = 0; @@ -97,13 +97,13 @@ psurf->u.tex.first_layer); if (bin_w) { - stride = bin_w * rsc->cpp; + stride = bin_w * rsc->layout.cpp; if (bases) { base = bases[i]; } } else { - stride = slice->pitch * rsc->cpp; + stride = slice->pitch * rsc->layout.cpp; } } else if ((i < nr_bufs) && bases) { base = bases[i]; @@ -132,7 +132,7 @@ static bool use_hw_binning(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; if ((gmem->maxpw * gmem->maxph) > 32) return false; @@ -152,7 +152,7 @@ struct fd_ringbuffer *ring = batch->gmem; struct fd_resource *rsc = fd_resource(psurf->texture); enum pipe_format pformat = psurf->format; - struct fd_resource_slice *slice; + struct fdl_slice *slice; uint32_t offset; if (!rsc->valid) @@ -164,7 +164,7 @@ pformat = rsc->base.format; } - slice = &rsc->slices[psurf->u.tex.level]; + slice = fd_resource_slice(rsc, psurf->u.tex.level); offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); @@ -175,7 +175,7 @@ A4XX_RB_COPY_CONTROL_MODE(RB_COPY_RESOLVE) | A4XX_RB_COPY_CONTROL_GMEM_BASE(base)); OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp)); + OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->layout.cpp)); OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) | A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) | A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | @@ -187,10 +187,10 @@ } static void -fd4_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) +fd4_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd4_emit emit = { @@ -274,9 +274,9 @@ if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) - emit_gmem2mem_surf(batch, false, ctx->gmem.zsbuf_base[0], pfb->zsbuf); + emit_gmem2mem_surf(batch, false, gmem->zsbuf_base[0], pfb->zsbuf); if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) - emit_gmem2mem_surf(batch, true, ctx->gmem.zsbuf_base[1], pfb->zsbuf); + emit_gmem2mem_surf(batch, true, gmem->zsbuf_base[1], pfb->zsbuf); } if (batch->resolve & FD_BUFFER_COLOR) { @@ -300,7 +300,7 @@ /* transfer from system memory to gmem */ static void -emit_mem2gmem_surf(struct fd_batch *batch, uint32_t *bases, +emit_mem2gmem_surf(struct fd_batch *batch, const uint32_t *bases, struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w) { struct fd_ringbuffer *ring = batch->gmem; @@ -325,10 +325,10 @@ } static void -fd4_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) +fd4_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd4_emit emit = { @@ -459,7 +459,7 @@ if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1]; - emit.fp = NULL; /* frag shader changed so clear cache */ + emit.fs = NULL; /* frag shader changed so clear cache */ fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); } @@ -493,7 +493,7 @@ emit.key.half_precision = true; break; } - emit.fp = NULL; /* frag shader changed so clear cache */ + emit.fs = NULL; /* frag shader changed so clear cache */ fd4_program_emit(ring, &emit, 1, &pfb->zsbuf); emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); } @@ -561,6 +561,7 @@ update_vsc_pipe(struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd4_context *fd4_ctx = fd4_context(ctx); struct fd_ringbuffer *ring = batch->gmem; int i; @@ -570,7 +571,7 @@ OUT_PKT0(ring, REG_A4XX_VSC_PIPE_CONFIG_REG(0), 8); for (i = 0; i < 8; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; OUT_RING(ring, A4XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | A4XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | A4XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | @@ -579,26 +580,23 @@ OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_ADDRESS_REG(0), 8); for (i = 0; i < 8; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - if (!pipe->bo) { - pipe->bo = fd_bo_new(ctx->dev, 0x40000, + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x40000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); } - OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i] */ + OUT_RELOCW(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i] */ } OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_LENGTH_REG(0), 8); for (i = 0; i < 8; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ } } static void emit_binning_pass(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd_ringbuffer *ring = batch->gmem; int i; @@ -665,7 +663,7 @@ { struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; fd4_emit_restore(batch, ring); @@ -706,16 +704,15 @@ /* before mem2gmem */ static void -fd4_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) +fd4_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; if (pfb->zsbuf) { struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - uint32_t cpp = rsc->cpp; + uint32_t cpp = rsc->layout.cpp; OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) | @@ -727,7 +724,7 @@ if (rsc->stencil) { OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL | A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); - OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w)); + OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->layout.cpp * gmem->bin_w)); } else { OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -754,12 +751,12 @@ /* before IB to rendering cmds: */ static void -fd4_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) +fd4_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; struct fd4_context *fd4_ctx = fd4_context(ctx); struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; uint32_t x1 = tile->xoff; @@ -768,7 +765,8 @@ uint32_t y2 = tile->yoff + tile->bin_h - 1; if (use_hw_binning(batch)) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; assert(pipe->w && pipe->h); @@ -780,7 +778,7 @@ A4XX_PC_VSTREAM_CONTROL_N(tile->n)); OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, pipe_bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ OUT_RELOCW(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ (tile->p * 4), 0, 0); } else { diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_program.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_program.h" @@ -37,43 +37,6 @@ #include "fd4_texture.h" #include "fd4_format.h" -static struct ir3_shader * -create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, - gl_shader_stage type) -{ - struct fd_context *ctx = fd_context(pctx); - struct ir3_compiler *compiler = ctx->screen->compiler; - return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen); -} - -static void * -fd4_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT); -} - -static void -fd4_fp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - -static void * -fd4_vp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX); -} - -static void -fd4_vp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - static void emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { @@ -245,7 +208,7 @@ face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); zwcoord_regid = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2); - vcoord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PIXEL); + vcoord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. @@ -448,7 +411,7 @@ OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) | - COND(emit->key.half_precision, + COND(color_regid[i] & HALF_REG_ID, A4XX_SP_FS_MRT_REG_HALF_PRECISION)); } @@ -569,11 +532,6 @@ void fd4_prog_init(struct pipe_context *pctx) { - pctx->create_fs_state = fd4_fp_state_create; - pctx->delete_fs_state = fd4_fp_state_delete; - - pctx->create_vs_state = fd4_vp_state_create; - pctx->delete_vs_state = fd4_vp_state_delete; - + ir3_prog_init(pctx); fd_prog_init(pctx); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_query.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_query.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -166,7 +166,7 @@ OUT_PKT3(ring, CP_REG_TO_MEM, 2); OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */ + CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0); /* ok... here we really *would* like to use the CP_SET_CONSTANT @@ -188,7 +188,7 @@ OUT_PKT3(ring, CP_REG_TO_MEM, 2); OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | CP_REG_TO_MEM_0_ACCUMULATE | - CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */ + CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); /* now copy that back to CP_ME_NRT_ADDR: */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd4_screen.h" #include "fd4_context.h" @@ -60,9 +60,9 @@ } if ((usage & PIPE_BIND_SAMPLER_VIEW) && + (fd4_pipe2tex(format) != (enum a4xx_tex_fmt)~0) && (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12) && - (fd4_pipe2tex(format) != (enum a4xx_tex_fmt)~0)) { + util_format_get_blocksize(format) != 12)) { retval |= PIPE_BIND_SAMPLER_VIEW; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_texture.c mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_texture.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a4xx/fd4_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a4xx/fd4_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd4_texture.h" #include "fd4_format.h" @@ -222,9 +222,9 @@ { struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); + struct fdl_slice *slice = NULL; enum pipe_format format = cso->format; unsigned lvl, layers = 0; - uint32_t sz2 = 0; if (!so) return NULL; @@ -261,12 +261,13 @@ A4XX_TEX_CONST_1_HEIGHT(1); so->texconst2 = A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(format)) | - A4XX_TEX_CONST_2_PITCH(elements * rsc->cpp); + A4XX_TEX_CONST_2_PITCH(elements * rsc->layout.cpp); so->offset = cso->u.buf.offset; } else { unsigned miplevels; lvl = fd_sampler_first_level(cso); + slice = fd_resource_slice(rsc, lvl); miplevels = fd_sampler_last_level(cso) - lvl; layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; @@ -277,8 +278,7 @@ so->texconst2 = A4XX_TEX_CONST_2_FETCHSIZE(fd4_pipe2fetchsize(format)) | A4XX_TEX_CONST_2_PITCH( - util_format_get_nblocksx( - format, rsc->slices[lvl].pitch) * rsc->cpp); + util_format_get_nblocksx(format, slice->pitch) * rsc->layout.cpp); so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); } @@ -299,21 +299,20 @@ case PIPE_TEXTURE_2D_ARRAY: so->texconst3 = A4XX_TEX_CONST_3_DEPTH(layers) | - A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size); + A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); break; case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY: so->texconst3 = A4XX_TEX_CONST_3_DEPTH(layers / 6) | - A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size); + A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); break; case PIPE_TEXTURE_3D: so->texconst3 = A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | - A4XX_TEX_CONST_3_LAYERSZ(rsc->slices[lvl].size0); - while (lvl < cso->u.tex.last_level && sz2 != rsc->slices[lvl+1].size0) - sz2 = rsc->slices[++lvl].size0; - so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ(sz2); + A4XX_TEX_CONST_3_LAYERSZ(slice->size0); + so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ( + fd_resource_slice(rsc, prsc->last_level)->size0); break; default: so->texconst3 = 0x00000000; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -100,22 +100,14 @@ else rt = &cso->rt[0]; - so->rb_mrt[i].blend_control_rgb = + so->rb_mrt[i].blend_control = A5XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | A5XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A5XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)); - - so->rb_mrt[i].blend_control_alpha = + A5XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | A5XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | A5XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | A5XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - so->rb_mrt[i].blend_control_no_alpha_rgb = - A5XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) | - A5XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A5XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor))); - - so->rb_mrt[i].control = A5XX_RB_MRT_CONTROL_ROP_CODE(rop) | COND(cso->logicop_enable, A5XX_RB_MRT_CONTROL_ROP_ENABLE) | diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.h mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -38,12 +38,7 @@ struct { uint32_t control; uint32_t buf_info; - /* Blend control bits for color if there is an alpha channel */ - uint32_t blend_control_rgb; - /* Blend control bits for color if there is no alpha channel */ - uint32_t blend_control_no_alpha_rgb; - /* Blend control bits for alpha channel */ - uint32_t blend_control_alpha; + uint32_t blend_control; } rb_mrt[A5XX_MAX_RENDER_TARGETS]; uint32_t rb_blend_cntl; uint32_t sp_blend_cntl; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c 2020-06-12 01:21:17.000000000 +0000 @@ -98,8 +98,8 @@ * untiling by setting both src and dst COLOR_SWAP=WZYX, but that * means the formats must match: */ - if ((fd_resource(info->dst.resource)->tile_mode || - fd_resource(info->src.resource)->tile_mode) && + if ((fd_resource(info->dst.resource)->layout.tile_mode || + fd_resource(info->src.resource)->layout.tile_mode) && info->dst.format != info->src.format) return false; @@ -215,8 +215,8 @@ src = fd_resource(info->src.resource); dst = fd_resource(info->dst.resource); - debug_assert(src->cpp == 1); - debug_assert(dst->cpp == 1); + debug_assert(src->layout.cpp == 1); + debug_assert(dst->layout.cpp == 1); debug_assert(info->src.resource->format == info->dst.resource->format); debug_assert((sbox->y == 0) && (sbox->height == 1)); debug_assert((dbox->y == 0) && (dbox->height == 1)); @@ -325,7 +325,7 @@ const struct pipe_box *sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; struct fd_resource *src, *dst; - struct fd_resource_slice *sslice, *dslice; + struct fdl_slice *sslice, *dslice; enum a5xx_color_fmt sfmt, dfmt; enum a5xx_tile_mode stile, dtile; enum a3xx_color_swap sswap, dswap; @@ -342,16 +342,14 @@ sfmt = fd5_pipe2color(info->src.format); dfmt = fd5_pipe2color(info->dst.format); - stile = fd_resource_level_linear(info->src.resource, info->src.level) ? - TILE5_LINEAR : src->tile_mode; - dtile = fd_resource_level_linear(info->dst.resource, info->dst.level) ? - TILE5_LINEAR : dst->tile_mode; + stile = fd_resource_tile_mode(info->src.resource, info->src.level); + dtile = fd_resource_tile_mode(info->dst.resource, info->dst.level); sswap = fd5_pipe2swap(info->src.format); dswap = fd5_pipe2swap(info->dst.format); - spitch = sslice->pitch * src->cpp; - dpitch = dslice->pitch * dst->cpp; + spitch = sslice->pitch * src->layout.cpp; + dpitch = dslice->pitch * dst->layout.cpp; /* if dtile, then dswap ignored by hw, and likewise if stile then sswap * ignored by hw.. but in this case we have already rejected the blit @@ -376,12 +374,12 @@ if (info->src.resource->target == PIPE_TEXTURE_3D) ssize = sslice->size0; else - ssize = src->layer_size; + ssize = src->layout.layer_size; if (info->dst.resource->target == PIPE_TEXTURE_3D) dsize = dslice->size0; else - dsize = dst->layer_size; + dsize = dst->layout.layer_size; for (unsigned i = 0; i < info->dst.box.depth; i++) { unsigned soff = fd_resource_offset(src, info->src.level, sbox->z + i); @@ -468,8 +466,8 @@ if ((info->src.resource->target == PIPE_BUFFER) && (info->dst.resource->target == PIPE_BUFFER)) { - assert(fd_resource(info->src.resource)->tile_mode == TILE5_LINEAR); - assert(fd_resource(info->dst.resource)->tile_mode == TILE5_LINEAR); + assert(fd_resource(info->src.resource)->layout.tile_mode == TILE5_LINEAR); + assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE5_LINEAR); emit_blit_buffer(batch->draw, info); } else { /* I don't *think* we need to handle blits between buffer <-> !buffer */ @@ -481,7 +479,7 @@ fd_resource(info->dst.resource)->valid = true; batch->needs_flush = true; - fd_batch_flush(batch, false); + fd_batch_flush(batch); fd_batch_reference(&batch, NULL); return true; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -152,8 +152,8 @@ /* and now binning pass: */ emit.binning_pass = true; emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vp = NULL; /* we changed key so need to refetch vp */ - emit.fp = NULL; + emit.vs = NULL; /* we changed key so need to refetch vp */ + emit.fs = NULL; draw_impl(ctx, ctx->batch->binning, &emit, index_offset); if (emit.streamout_mask) { diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_helpers.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_viewport.h" #include "freedreno_resource.h" @@ -365,7 +365,7 @@ enum a5xx_tile_mode tile_mode = TILE5_LINEAR; if (view->base.texture) - tile_mode = fd_resource(view->base.texture)->tile_mode; + tile_mode = fd_resource(view->base.texture)->layout.tile_mode; OUT_RING(ring, view->texconst0 | A5XX_TEX_CONST_0_TILE_MODE(tile_mode)); @@ -400,13 +400,10 @@ const struct ir3_shader_variant *v) { unsigned count = util_last_bit(so->enabled_mask); - const struct ir3_ibo_mapping *m = &v->image_mapping; for (unsigned i = 0; i < count; i++) { - unsigned slot = m->ssbo_to_ibo[i]; - OUT_PKT7(ring, CP_LOAD_STATE4, 5); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | CP_LOAD_STATE4_0_STATE_BLOCK(sb) | CP_LOAD_STATE4_0_NUM_UNIT(1)); @@ -424,7 +421,7 @@ OUT_RING(ring, A5XX_SSBO_1_1_HEIGHT(sz >> 16)); OUT_PKT7(ring, CP_LOAD_STATE4, 5); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | CP_LOAD_STATE4_0_STATE_BLOCK(sb) | CP_LOAD_STATE4_0_NUM_UNIT(1)); @@ -728,17 +725,13 @@ bool is_int = util_format_is_pure_integer(format); bool has_alpha = util_format_has_alpha(format); uint32_t control = blend->rb_mrt[i].control; - uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha; if (is_int) { control &= A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; control |= A5XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); } - if (has_alpha) { - blend_control |= blend->rb_mrt[i].blend_control_rgb; - } else { - blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb; + if (!has_alpha) { control &= ~A5XX_RB_MRT_CONTROL_BLEND2; } @@ -746,7 +739,7 @@ OUT_RING(ring, control); OUT_PKT4(ring, REG_A5XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend_control); + OUT_RING(ring, blend->rb_mrt[i].blend_control); } OUT_PKT4(ring, REG_A5XX_SP_BLEND_CNTL, 1); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.h mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_emit.h 2020-06-12 01:21:17.000000000 +0000 @@ -60,7 +60,7 @@ bool no_lrz_write; /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vs, *fs; /* TODO: other shader stages.. */ unsigned streamout_mask; @@ -76,29 +76,29 @@ static inline const struct ir3_shader_variant * fd5_emit_get_vp(struct fd5_emit *emit) { - if (!emit->vp) { - struct ir3_shader *shader = emit->prog->vp; - emit->vp = ir3_shader_variant(shader, emit->key, + if (!emit->vs) { + struct ir3_shader *shader = emit->prog->vs; + emit->vs = ir3_shader_variant(shader, emit->key, emit->binning_pass, emit->debug); } - return emit->vp; + return emit->vs; } static inline const struct ir3_shader_variant * fd5_emit_get_fp(struct fd5_emit *emit) { - if (!emit->fp) { + if (!emit->fs) { if (emit->binning_pass) { /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - emit->fp = &binning_fp; + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; } else { - struct ir3_shader *shader = emit->prog->fp; - emit->fp = ir3_shader_variant(shader, emit->key, + struct ir3_shader *shader = emit->prog->fs; + emit->fs = ir3_shader_variant(shader, emit->key, false, emit->debug); } } - return emit->fp; + return emit->fs; } static inline void diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_format.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_format.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd5_format.h" @@ -84,7 +84,7 @@ VT(R8_UINT, 8_UINT, R8_UINT, WZYX), VT(R8_SINT, 8_SINT, R8_SINT, WZYX), V_(R8_USCALED, 8_UINT, NONE, WZYX), - V_(R8_SSCALED, 8_UINT, NONE, WZYX), + V_(R8_SSCALED, 8_SINT, NONE, WZYX), _T(A8_UNORM, 8_UNORM, A8_UNORM, WZYX), _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), @@ -105,7 +105,7 @@ VT(R16_UINT, 16_UINT, R16_UINT, WZYX), VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), - V_(R16_SSCALED, 16_UINT, NONE, WZYX), + V_(R16_SSCALED, 16_SINT, NONE, WZYX), VT(R16_FLOAT, 16_FLOAT, R16_FLOAT, WZYX), _T(Z16_UNORM, 16_UNORM, R16_UNORM, WZYX), @@ -149,7 +149,7 @@ VT(R32_UINT, 32_UINT, R32_UINT, WZYX), VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), - V_(R32_SSCALED, 32_UINT, NONE, WZYX), + V_(R32_SSCALED, 32_SINT, NONE, WZYX), VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_draw.h" #include "freedreno_state.h" @@ -44,7 +44,7 @@ static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, struct fd_gmem_stateobj *gmem) + struct pipe_surface **bufs, const struct fd_gmem_stateobj *gmem) { enum a5xx_tile_mode tile_mode; unsigned i; @@ -54,7 +54,7 @@ enum a3xx_color_swap swap = WZYX; bool srgb = false, sint = false, uint = false; struct fd_resource *rsc = NULL; - struct fd_resource_slice *slice = NULL; + struct fdl_slice *slice = NULL; uint32_t stride = 0; uint32_t size = 0; uint32_t base = 0; @@ -89,11 +89,10 @@ size = stride * gmem->bin_h; base = gmem->cbuf_base[i]; } else { - stride = slice->pitch * rsc->cpp; + stride = slice->pitch * rsc->layout.cpp; size = slice->size0; - if (!fd_resource_level_linear(psurf->texture, psurf->u.tex.level)) - tile_mode = rsc->tile_mode; + tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); } } @@ -132,12 +131,12 @@ static void emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf, - struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { if (zsbuf) { struct fd_resource *rsc = fd_resource(zsbuf->texture); enum a5xx_depth_format fmt = fd5_pipe2depth(zsbuf->format); - uint32_t cpp = rsc->cpp; + uint32_t cpp = rsc->layout.cpp; uint32_t stride = 0; uint32_t size = 0; @@ -145,8 +144,8 @@ stride = cpp * gmem->bin_w; size = stride * gmem->bin_h; } else { - struct fd_resource_slice *slice = fd_resource_slice(rsc, 0); - stride = slice->pitch * rsc->cpp; + struct fdl_slice *slice = fd_resource_slice(rsc, 0); + stride = slice->pitch * rsc->layout.cpp; size = slice->size0; } @@ -192,8 +191,8 @@ stride = 1 * gmem->bin_w; size = stride * gmem->bin_h; } else { - struct fd_resource_slice *slice = fd_resource_slice(rsc->stencil, 0); - stride = slice->pitch * rsc->cpp; + struct fdl_slice *slice = fd_resource_slice(rsc->stencil, 0); + stride = slice->pitch * rsc->layout.cpp; size = slice->size0; } @@ -235,7 +234,7 @@ static bool use_hw_binning(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; if ((gmem->maxpw * gmem->maxph) > 32) return false; @@ -263,7 +262,7 @@ { struct fd_context *ctx = batch->ctx; struct fd5_context *fd5_ctx = fd5_context(ctx); - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; int i; @@ -278,7 +277,7 @@ OUT_PKT4(ring, REG_A5XX_VSC_PIPE_CONFIG_REG(0), 16); for (i = 0; i < 16; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; OUT_RING(ring, A5XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | A5XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | A5XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | @@ -287,18 +286,16 @@ OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32); for (i = 0; i < 16; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - if (!pipe->bo) { - pipe->bo = fd_bo_new(ctx->dev, 0x20000, + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x20000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); } - OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */ + OUT_RELOCW(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */ } OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(0), 16); for (i = 0; i < 16; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ } } @@ -307,7 +304,7 @@ { struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; uint32_t x1 = gmem->minx; uint32_t y1 = gmem->miny; @@ -369,7 +366,6 @@ static void fd5_emit_tile_init(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; @@ -397,8 +393,8 @@ OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); OUT_RING(ring, 0x7c13c080); /* RB_CCU_CNTL */ - emit_zs(ring, pfb->zsbuf, &ctx->gmem); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, &ctx->gmem); + emit_zs(ring, pfb->zsbuf, batch->gmem_state); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, batch->gmem_state); if (use_hw_binning(batch)) { emit_binning_pass(batch); @@ -413,9 +409,10 @@ /* before mem2gmem */ static void -fd5_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) +fd5_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd5_context *fd5_ctx = fd5_context(ctx); struct fd_ringbuffer *ring = batch->gmem; @@ -437,7 +434,8 @@ A5XX_RB_RESOLVE_CNTL_2_Y(y2)); if (use_hw_binning(batch)) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); @@ -447,7 +445,7 @@ OUT_PKT7(ring, CP_SET_BIN_DATA5, 5); OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | CP_SET_BIN_DATA5_0_VSC_N(tile->n)); - OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, pipe_bo, 0, 0, 0); /* VSC_PIPE[p].DATA_ADDRESS */ OUT_RELOC(ring, fd5_ctx->vsc_size_mem, /* VSC_SIZE_ADDRESS + (p * 4) */ (tile->p * 4), 0, 0); } else { @@ -470,7 +468,7 @@ struct pipe_surface *psurf, enum a5xx_blit_buf buf) { struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_resource *rsc = fd_resource(psurf->texture); uint32_t stride, size; @@ -485,22 +483,22 @@ // possibly we want to flip this around gmem2mem and keep depth // tiled in sysmem (and fixup sampler state to assume tiled).. this // might be required for doing depth/stencil in bypass mode? - struct fd_resource_slice *slice = fd_resource_slice(rsc, 0); + struct fdl_slice *slice = fd_resource_slice(rsc, 0); enum a5xx_color_fmt format = fd5_pipe2color(fd_gmem_restore_format(rsc->base.format)); OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5); OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(rsc->tile_mode) | + A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(rsc->layout.tile_mode) | A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX)); - OUT_RING(ring, A5XX_RB_MRT_PITCH(slice->pitch * rsc->cpp)); + OUT_RING(ring, A5XX_RB_MRT_PITCH(slice->pitch * rsc->layout.cpp)); OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(slice->size0)); OUT_RELOC(ring, rsc->bo, 0, 0, 0); /* BASE_LO/HI */ buf = BLIT_MRT0; } - stride = gmem->bin_w * rsc->cpp; + stride = gmem->bin_w * rsc->layout.cpp; size = stride * gmem->bin_h; OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4); @@ -523,11 +521,10 @@ } static void -fd5_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) +fd5_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_ringbuffer *ring = batch->gmem; - struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; /* @@ -567,10 +564,10 @@ /* before IB to rendering cmds: */ static void -fd5_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) +fd5_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); @@ -610,7 +607,7 @@ { struct fd_ringbuffer *ring = batch->gmem; struct fd_resource *rsc = fd_resource(psurf->texture); - struct fd_resource_slice *slice; + struct fdl_slice *slice; bool tiled; uint32_t offset; @@ -632,14 +629,13 @@ OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_PITCH */ OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_ARRAY_PITCH */ - tiled = rsc->tile_mode && - !fd_resource_level_linear(psurf->texture, psurf->u.tex.level); + tiled = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5); OUT_RING(ring, 0x00000004 | /* XXX RB_RESOLVE_CNTL_3 */ COND(tiled, A5XX_RB_RESOLVE_CNTL_3_TILED)); OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_BLIT_DST_LO/HI */ - OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(slice->pitch * rsc->cpp)); + OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(slice->pitch * rsc->layout.cpp)); OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(slice->size0)); OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); @@ -654,10 +650,9 @@ } static void -fd5_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) +fd5_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_image.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_image.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_image.c 2020-06-12 01:21:17.000000000 +0000 @@ -62,6 +62,7 @@ enum pipe_format format = pimg->format; struct pipe_resource *prsc = pimg->resource; struct fd_resource *rsc = fd_resource(prsc); + struct fdl_slice *slice = NULL; unsigned lvl; if (!pimg->resource) { @@ -74,7 +75,7 @@ img->fetchsize = fd5_pipe2fetchsize(format); img->type = fd5_tex_type(prsc->target); img->srgb = util_format_is_srgb(format); - img->cpp = rsc->cpp; + img->cpp = rsc->layout.cpp; img->bo = rsc->bo; if (prsc->target == PIPE_BUFFER) { @@ -83,8 +84,9 @@ img->pitch = pimg->u.buf.size; } else { lvl = pimg->u.tex.level; + slice = fd_resource_slice(rsc, lvl); img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); - img->pitch = rsc->slices[lvl].pitch * rsc->cpp; + img->pitch = slice->pitch * rsc->layout.cpp; } img->width = u_minify(prsc->width0, lvl); @@ -96,21 +98,21 @@ case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: - img->array_pitch = rsc->layer_size; + img->array_pitch = rsc->layout.layer_size; img->depth = 1; break; case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: - img->array_pitch = rsc->layer_size; + img->array_pitch = rsc->layout.layer_size; img->depth = layers; break; case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY: - img->array_pitch = rsc->layer_size; + img->array_pitch = rsc->layout.layer_size; img->depth = layers; break; case PIPE_TEXTURE_3D: - img->array_pitch = rsc->slices[lvl].size0; + img->array_pitch = slice->size0; img->depth = u_minify(prsc->depth0, lvl); break; default: @@ -208,6 +210,6 @@ translate_image(&img, &so->si[index]); emit_image_tex(ring, m->image_to_tex[index] + m->tex_base, &img, shader); - emit_image_ssbo(ring, m->image_to_ibo[index], &img, shader); + emit_image_ssbo(ring, v->shader->nir->info.num_ssbos + index, &img, shader); } } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,766 +0,0 @@ -/* - * Copyright (C) 2018 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#ifndef FD5_PERFCNTR_H_ -#define FD5_PERFCNTR_H_ - -#include "freedreno_perfcntr.h" -#include "fd5_format.h" - -#define REG(_x) REG_A5XX_ ## _x - -#define COUNTER(_sel, _lo, _hi) { \ - .select_reg = REG(_sel), \ - .counter_reg_lo = REG(_lo), \ - .counter_reg_hi = REG(_hi), \ -} - -#define COUNTER2(_sel, _lo, _hi, _en, _clr) { \ - .select_reg = REG(_sel), \ - .counter_reg_lo = REG(_lo), \ - .counter_reg_hi = REG(_hi), \ - .enable = REG(_en), \ - .clear = REG(_clr), \ -} - -#define COUNTABLE(_selector, _query_type, _result_type) { \ - .name = #_selector, \ - .selector = _selector, \ - .query_type = PIPE_DRIVER_QUERY_TYPE_ ## _query_type, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type, \ -} - -#define GROUP(_name, _counters, _countables) { \ - .name = _name, \ - .num_counters = ARRAY_SIZE(_counters), \ - .counters = _counters, \ - .num_countables = ARRAY_SIZE(_countables), \ - .countables = _countables, \ -} - -static const struct fd_perfcntr_counter cp_counters[] = { -//RESERVED: for kernel -// COUNTER(CP_PERFCTR_CP_SEL_0, RBBM_PERFCTR_CP_0_LO, RBBM_PERFCTR_CP_0_HI), - COUNTER(CP_PERFCTR_CP_SEL_1, RBBM_PERFCTR_CP_1_LO, RBBM_PERFCTR_CP_1_HI), - COUNTER(CP_PERFCTR_CP_SEL_2, RBBM_PERFCTR_CP_2_LO, RBBM_PERFCTR_CP_2_HI), - COUNTER(CP_PERFCTR_CP_SEL_3, RBBM_PERFCTR_CP_3_LO, RBBM_PERFCTR_CP_3_HI), - COUNTER(CP_PERFCTR_CP_SEL_4, RBBM_PERFCTR_CP_4_LO, RBBM_PERFCTR_CP_4_HI), - COUNTER(CP_PERFCTR_CP_SEL_5, RBBM_PERFCTR_CP_5_LO, RBBM_PERFCTR_CP_5_HI), - COUNTER(CP_PERFCTR_CP_SEL_6, RBBM_PERFCTR_CP_6_LO, RBBM_PERFCTR_CP_6_HI), - COUNTER(CP_PERFCTR_CP_SEL_7, RBBM_PERFCTR_CP_7_LO, RBBM_PERFCTR_CP_7_HI), -}; - -static const struct fd_perfcntr_countable cp_countables[] = { - COUNTABLE(PERF_CP_ALWAYS_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CP_BUSY_GFX_CORE_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_BUSY_WORKING, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_STALL_CYCLES_ANY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_STARVE_CYCLES_ANY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_ICACHE_MISS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_ICACHE_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PFP_MATCH_PM4_PKT_PROFILE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_BUSY_WORKING, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_STARVE_CYCLES_ANY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_NON_WORKING, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_STALL_CYCLES_ANY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_ICACHE_MISS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ME_ICACHE_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CP_NUM_PREEMPTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_REACTION_DELAY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_SWITCH_OUT_TIME, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_SWITCH_IN_TIME, UINT64, AVERAGE), - COUNTABLE(PERF_CP_DEAD_DRAWS_IN_BIN_RENDER, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREDICATED_DRAWS_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_CP_MODE_SWITCH, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ZPASS_DONE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CONTEXT_DONE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CACHE_FLUSH, UINT64, AVERAGE), - COUNTABLE(PERF_CP_LONG_PREEMPTIONS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter ccu_counters[] = { - COUNTER(RB_PERFCTR_CCU_SEL_0, RBBM_PERFCTR_CCU_0_LO, RBBM_PERFCTR_CCU_0_HI), - COUNTER(RB_PERFCTR_CCU_SEL_1, RBBM_PERFCTR_CCU_1_LO, RBBM_PERFCTR_CCU_1_HI), - COUNTER(RB_PERFCTR_CCU_SEL_2, RBBM_PERFCTR_CCU_2_LO, RBBM_PERFCTR_CCU_2_HI), - COUNTER(RB_PERFCTR_CCU_SEL_3, RBBM_PERFCTR_CCU_3_LO, RBBM_PERFCTR_CCU_3_HI), -}; - -static const struct fd_perfcntr_countable ccu_countables[] = { - COUNTABLE(PERF_CCU_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STALL_CYCLES_RB_DEPTH_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STALL_CYCLES_RB_COLOR_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STARVE_CYCLES_FLAG_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_BLOCKS, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_BLOCKS, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_BLOCK_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_BLOCK_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_PARTIAL_BLOCK_READ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_GMEM_READ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_GMEM_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG0_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG1_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG2_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG3_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG4_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG0_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG1_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG2_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG3_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG4_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_RD_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_WR_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_REORDER_STARVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_PIXELS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter tse_counters[] = { - COUNTER(GRAS_PERFCTR_TSE_SEL_0, RBBM_PERFCTR_TSE_0_LO, RBBM_PERFCTR_TSE_0_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_1, RBBM_PERFCTR_TSE_1_LO, RBBM_PERFCTR_TSE_1_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_2, RBBM_PERFCTR_TSE_2_LO, RBBM_PERFCTR_TSE_2_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_3, RBBM_PERFCTR_TSE_3_LO, RBBM_PERFCTR_TSE_3_HI), -}; - -static const struct fd_perfcntr_countable tse_countables[] = { - COUNTABLE(PERF_TSE_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CLIPPING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_RAS, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_BARYPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_ZPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STARVE_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_INPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_TRIVAL_REJ_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CLIPPED_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_ZERO_AREA_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_FACENESS_CULLED_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_ZERO_PIXEL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_OUTPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_OUTPUT_VISIBLE_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CINVOCATION, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CPRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_2D_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_2D_ALIVE_CLCLES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter ras_counters[] = { - COUNTER(GRAS_PERFCTR_RAS_SEL_0, RBBM_PERFCTR_RAS_0_LO, RBBM_PERFCTR_RAS_0_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_1, RBBM_PERFCTR_RAS_1_LO, RBBM_PERFCTR_RAS_1_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_2, RBBM_PERFCTR_RAS_2_LO, RBBM_PERFCTR_RAS_2_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_3, RBBM_PERFCTR_RAS_3_LO, RBBM_PERFCTR_RAS_3_HI), -}; - -static const struct fd_perfcntr_countable ras_countables[] = { - COUNTABLE(PERF_RAS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_SUPERTILE_ACTIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_STALL_CYCLES_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_STARVE_CYCLES_TSE, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_SUPER_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_8X4_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_MASKGEN_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_FULLY_COVERED_SUPER_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_FULLY_COVERED_8X4_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_PRIM_KILLED_INVISILBE, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter lrz_counters[] = { - COUNTER(GRAS_PERFCTR_LRZ_SEL_0, RBBM_PERFCTR_LRZ_0_LO, RBBM_PERFCTR_LRZ_0_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_1, RBBM_PERFCTR_LRZ_1_LO, RBBM_PERFCTR_LRZ_1_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_2, RBBM_PERFCTR_LRZ_2_LO, RBBM_PERFCTR_LRZ_2_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_3, RBBM_PERFCTR_LRZ_3_LO, RBBM_PERFCTR_LRZ_3_HI), -}; - -static const struct fd_perfcntr_countable lrz_countables[] = { - COUNTABLE(PERF_LRZ_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STARVE_CYCLES_RAS, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_RB, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_VSC, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_FLAG_PREFETCH, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_LRZ_READ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_LRZ_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_READ_LATENCY, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_MERGE_CACHE_UPDATING, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_MASKGEN, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FULL_8X8_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PARTIAL_8X8_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_TILE_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_TOTAL_PIXEL, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter hlsq_counters[] = { - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_0, RBBM_PERFCTR_HLSQ_0_LO, RBBM_PERFCTR_HLSQ_0_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_1, RBBM_PERFCTR_HLSQ_1_LO, RBBM_PERFCTR_HLSQ_1_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_2, RBBM_PERFCTR_HLSQ_2_LO, RBBM_PERFCTR_HLSQ_2_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_3, RBBM_PERFCTR_HLSQ_3_LO, RBBM_PERFCTR_HLSQ_3_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_4, RBBM_PERFCTR_HLSQ_4_LO, RBBM_PERFCTR_HLSQ_4_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_5, RBBM_PERFCTR_HLSQ_5_LO, RBBM_PERFCTR_HLSQ_5_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_6, RBBM_PERFCTR_HLSQ_6_LO, RBBM_PERFCTR_HLSQ_6_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_7, RBBM_PERFCTR_HLSQ_7_LO, RBBM_PERFCTR_HLSQ_7_HI), -}; - -static const struct fd_perfcntr_countable hlsq_countables[] = { - COUNTABLE(PERF_HLSQ_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_STATE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_UCHE_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_UCHE_LATENCY_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_STAGE_32_WAVES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_STAGE_64_WAVES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_QUADS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_CS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_COMPUTE_DRAWCALLS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter pc_counters[] = { - COUNTER(PC_PERFCTR_PC_SEL_0, RBBM_PERFCTR_PC_0_LO, RBBM_PERFCTR_PC_0_HI), - COUNTER(PC_PERFCTR_PC_SEL_1, RBBM_PERFCTR_PC_1_LO, RBBM_PERFCTR_PC_1_HI), - COUNTER(PC_PERFCTR_PC_SEL_2, RBBM_PERFCTR_PC_2_LO, RBBM_PERFCTR_PC_2_HI), - COUNTER(PC_PERFCTR_PC_SEL_3, RBBM_PERFCTR_PC_3_LO, RBBM_PERFCTR_PC_3_HI), - COUNTER(PC_PERFCTR_PC_SEL_4, RBBM_PERFCTR_PC_4_LO, RBBM_PERFCTR_PC_4_HI), - COUNTER(PC_PERFCTR_PC_SEL_5, RBBM_PERFCTR_PC_5_LO, RBBM_PERFCTR_PC_5_HI), - COUNTER(PC_PERFCTR_PC_SEL_6, RBBM_PERFCTR_PC_6_LO, RBBM_PERFCTR_PC_6_HI), - COUNTER(PC_PERFCTR_PC_SEL_7, RBBM_PERFCTR_PC_7_LO, RBBM_PERFCTR_PC_7_HI), -}; - -static const struct fd_perfcntr_countable pc_countables[] = { - COUNTABLE(PERF_PC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TSE, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TESS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TSE_ONLY, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VPC_ONLY, UINT64, AVERAGE), - COUNTABLE(PERF_PC_PASS1_TF_STALL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_INDEX, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_TESS_FACTOR, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_VIZ_STREAM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_POSITION, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_DI, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VIS_STREAMS_LOADED, UINT64, AVERAGE), - COUNTABLE(PERF_PC_INSTANCES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VPC_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DEAD_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_LIVE_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VERTEX_HITS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_IA_VERTICES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_IA_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_GS_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_HS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_GS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DS_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VPC_POS_DATA_TRANSACTION, UINT64, AVERAGE), - COUNTABLE(PERF_PC_3D_DRAWCALLS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_2D_DRAWCALLS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_NON_DRAWCALL_GLOBAL_EVENTS, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_STALL_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_STARVE_CYCLES_PC, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter rb_counters[] = { - COUNTER(RB_PERFCTR_RB_SEL_0, RBBM_PERFCTR_RB_0_LO, RBBM_PERFCTR_RB_0_HI), - COUNTER(RB_PERFCTR_RB_SEL_1, RBBM_PERFCTR_RB_1_LO, RBBM_PERFCTR_RB_1_HI), - COUNTER(RB_PERFCTR_RB_SEL_2, RBBM_PERFCTR_RB_2_LO, RBBM_PERFCTR_RB_2_HI), - COUNTER(RB_PERFCTR_RB_SEL_3, RBBM_PERFCTR_RB_3_LO, RBBM_PERFCTR_RB_3_HI), - COUNTER(RB_PERFCTR_RB_SEL_4, RBBM_PERFCTR_RB_4_LO, RBBM_PERFCTR_RB_4_HI), - COUNTER(RB_PERFCTR_RB_SEL_5, RBBM_PERFCTR_RB_5_LO, RBBM_PERFCTR_RB_5_HI), - COUNTER(RB_PERFCTR_RB_SEL_6, RBBM_PERFCTR_RB_6_LO, RBBM_PERFCTR_RB_6_HI), - COUNTER(RB_PERFCTR_RB_SEL_7, RBBM_PERFCTR_RB_7_LO, RBBM_PERFCTR_RB_7_HI), -}; - -static const struct fd_perfcntr_countable rb_countables[] = { - COUNTABLE(PERF_RB_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO0_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO1_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO2_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_LRZ_TILE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_CCU, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_Z_PLANE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_BARY_PLANE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_WORKLOAD, UINT64, AVERAGE), - COUNTABLE(PERF_RB_HLSQ_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_C_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_C_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_TOTAL_PASS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_PASS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_FAIL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_S_FAIL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDED_FXP_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDED_FP16_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(RB_RESERVED, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_ALIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STALL_CYCLES_A2D, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SRC, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_DST, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_VALID_PIXELS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter rbbm_counters[] = { -//RESERVED: for kernel -// COUNTER(RBBM_PERFCTR_RBBM_SEL_0, RBBM_PERFCTR_RBBM_0_LO, RBBM_PERFCTR_RBBM_0_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_1, RBBM_PERFCTR_RBBM_1_LO, RBBM_PERFCTR_RBBM_1_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_2, RBBM_PERFCTR_RBBM_2_LO, RBBM_PERFCTR_RBBM_2_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_3, RBBM_PERFCTR_RBBM_3_LO, RBBM_PERFCTR_RBBM_3_HI), -}; - -static const struct fd_perfcntr_countable rbbm_countables[] = { - COUNTABLE(PERF_RBBM_ALWAYS_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_ALWAYS_ON, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_TSE_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_RAS_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_PC_DCALL_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_PC_VSD_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_STATUS_MASKED, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_COM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_DCOM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_VBIF_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_VSC_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_TESS_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_UCHE_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_HLSQ_BUSY, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter sp_counters[] = { -//RESERVED: for kernel -// COUNTER(SP_PERFCTR_SP_SEL_0, RBBM_PERFCTR_SP_0_LO, RBBM_PERFCTR_SP_0_HI), - COUNTER(SP_PERFCTR_SP_SEL_1, RBBM_PERFCTR_SP_1_LO, RBBM_PERFCTR_SP_1_HI), - COUNTER(SP_PERFCTR_SP_SEL_2, RBBM_PERFCTR_SP_2_LO, RBBM_PERFCTR_SP_2_HI), - COUNTER(SP_PERFCTR_SP_SEL_3, RBBM_PERFCTR_SP_3_LO, RBBM_PERFCTR_SP_3_HI), - COUNTER(SP_PERFCTR_SP_SEL_4, RBBM_PERFCTR_SP_4_LO, RBBM_PERFCTR_SP_4_HI), - COUNTER(SP_PERFCTR_SP_SEL_5, RBBM_PERFCTR_SP_5_LO, RBBM_PERFCTR_SP_5_HI), - COUNTER(SP_PERFCTR_SP_SEL_6, RBBM_PERFCTR_SP_6_LO, RBBM_PERFCTR_SP_6_HI), - COUNTER(SP_PERFCTR_SP_SEL_7, RBBM_PERFCTR_SP_7_LO, RBBM_PERFCTR_SP_7_HI), - COUNTER(SP_PERFCTR_SP_SEL_8, RBBM_PERFCTR_SP_8_LO, RBBM_PERFCTR_SP_8_HI), - COUNTER(SP_PERFCTR_SP_SEL_9, RBBM_PERFCTR_SP_9_LO, RBBM_PERFCTR_SP_9_HI), - COUNTER(SP_PERFCTR_SP_SEL_10, RBBM_PERFCTR_SP_10_LO, RBBM_PERFCTR_SP_10_HI), - COUNTER(SP_PERFCTR_SP_SEL_11, RBBM_PERFCTR_SP_11_LO, RBBM_PERFCTR_SP_11_HI), -}; - -static const struct fd_perfcntr_countable sp_countables[] = { - COUNTABLE(PERF_SP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ALU_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EFU_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_TP, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_RB, UINT64, AVERAGE), - COUNTABLE(PERF_SP_SCHEDULER_NON_WORKING, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CONTEXTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CONTEXT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_WAVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_WAVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_DURATION_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_DURATION_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CTRL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_LOAD_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_EMIT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_NOP_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_WAIT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_FETCH_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_IDLE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_END_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_LONG_SYNC_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_SHORT_SYNC_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_JOIN_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_LOAD_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_STORE_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_ATOMICS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_LOAD_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_STORE_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_ATOMICS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_BARY_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ADDR_LOCK_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_SP_UCHE_READ_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_UCHE_WRITE_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EXPORT_VPC_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EXPORT_RB_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_PIXELS_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL1_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL1_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL0_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL0_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_HS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_DS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_CS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_READ, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_CH0_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_CH1_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_BANK_CONFLICTS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter tp_counters[] = { - COUNTER(TPL1_PERFCTR_TP_SEL_0, RBBM_PERFCTR_TP_0_LO, RBBM_PERFCTR_TP_0_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_1, RBBM_PERFCTR_TP_1_LO, RBBM_PERFCTR_TP_1_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_2, RBBM_PERFCTR_TP_2_LO, RBBM_PERFCTR_TP_2_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_3, RBBM_PERFCTR_TP_3_LO, RBBM_PERFCTR_TP_3_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_4, RBBM_PERFCTR_TP_4_LO, RBBM_PERFCTR_TP_4_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_5, RBBM_PERFCTR_TP_5_LO, RBBM_PERFCTR_TP_5_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_6, RBBM_PERFCTR_TP_6_LO, RBBM_PERFCTR_TP_6_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_7, RBBM_PERFCTR_TP_7_LO, RBBM_PERFCTR_TP_7_HI), -}; - -static const struct fd_perfcntr_countable tp_countables[] = { - COUNTABLE(PERF_TP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_LATENCY_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_LATENCY, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_CACHELINE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_CACHELINE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_SP_TP_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_TP_SP_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_RECEIVED, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_OFFSET, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_SHADOW, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_ARRAY, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_GRADIENT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_1D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_2D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_BUFFER, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_3D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_CUBE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STATE_CACHE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STATE_CACHE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_DIVERGENT_QUADS_RECEIVED, UINT64, AVERAGE), - COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_PRT_NON_RESIDENT_EVENTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_POINT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_MIP, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_ANISO, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_L2_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_POINT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter uche_counters[] = { - COUNTER(UCHE_PERFCTR_UCHE_SEL_0, RBBM_PERFCTR_UCHE_0_LO, RBBM_PERFCTR_UCHE_0_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_1, RBBM_PERFCTR_UCHE_1_LO, RBBM_PERFCTR_UCHE_1_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_2, RBBM_PERFCTR_UCHE_2_LO, RBBM_PERFCTR_UCHE_2_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_3, RBBM_PERFCTR_UCHE_3_LO, RBBM_PERFCTR_UCHE_3_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_4, RBBM_PERFCTR_UCHE_4_LO, RBBM_PERFCTR_UCHE_4_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_5, RBBM_PERFCTR_UCHE_5_LO, RBBM_PERFCTR_UCHE_5_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_6, RBBM_PERFCTR_UCHE_6_LO, RBBM_PERFCTR_UCHE_6_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_7, RBBM_PERFCTR_UCHE_7_LO, RBBM_PERFCTR_UCHE_7_HI), -}; - -static const struct fd_perfcntr_countable uche_countables[] = { - COUNTABLE(PERF_UCHE_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_STALL_CYCLES_VBIF, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_LATENCY_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_TP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_TP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VSC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_EVICTS, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ0, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ1, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ2, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ3, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ4, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ5, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ6, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ7, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH0, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH1, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_GMEM_READ_BEATS, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_FLAG_COUNT, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vfd_counters[] = { - COUNTER(VFD_PERFCTR_VFD_SEL_0, RBBM_PERFCTR_VFD_0_LO, RBBM_PERFCTR_VFD_0_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_1, RBBM_PERFCTR_VFD_1_LO, RBBM_PERFCTR_VFD_1_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_2, RBBM_PERFCTR_VFD_2_LO, RBBM_PERFCTR_VFD_2_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_3, RBBM_PERFCTR_VFD_3_LO, RBBM_PERFCTR_VFD_3_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_4, RBBM_PERFCTR_VFD_4_LO, RBBM_PERFCTR_VFD_4_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_5, RBBM_PERFCTR_VFD_5_LO, RBBM_PERFCTR_VFD_5_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_6, RBBM_PERFCTR_VFD_6_LO, RBBM_PERFCTR_VFD_6_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_7, RBBM_PERFCTR_VFD_7_LO, RBBM_PERFCTR_VFD_7_HI), -}; - -static const struct fd_perfcntr_countable vfd_countables[] = { - COUNTABLE(PERF_VFD_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_VPC_ALLOC, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_VB, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_Q, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_SP_INFO, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_SP_ATTR, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_VB, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_Q, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_DECODER_PACKER_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STARVE_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_RBUFFER_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_ATTR_INFO_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_DECODED_ATTRIBUTE_BYTES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_NUM_ATTRIBUTES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_UPPER_SHADER_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_LOWER_SHADER_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_0_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_1_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_2_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_3_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_4_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_TOTAL_VERTICES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_NUM_ATTR_MISS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_1_BURST_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_INDEX, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_PROG, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STARVE_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_VS_STAGE_32_WAVES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vpc_counters[] = { - COUNTER(VPC_PERFCTR_VPC_SEL_0, RBBM_PERFCTR_VPC_0_LO, RBBM_PERFCTR_VPC_0_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_1, RBBM_PERFCTR_VPC_1_LO, RBBM_PERFCTR_VPC_1_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_2, RBBM_PERFCTR_VPC_2_LO, RBBM_PERFCTR_VPC_2_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_3, RBBM_PERFCTR_VPC_3_LO, RBBM_PERFCTR_VPC_3_HI), -}; - -static const struct fd_perfcntr_countable vpc_countables[] = { - COUNTABLE(PERF_VPC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_VFD_WACK, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_HLSQ_PRIM_ALLOC, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_SP_LM, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_POS_EXPORT_STALL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STARVE_CYCLES_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_PC_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_SP_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_SP_LM_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_SP_LM_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_SP_LM_DWORDS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STREAMOUT_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_GRANT_PHASES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vsc_counters[] = { - COUNTER(VSC_PERFCTR_VSC_SEL_0, RBBM_PERFCTR_VSC_0_LO, RBBM_PERFCTR_VSC_0_HI), - COUNTER(VSC_PERFCTR_VSC_SEL_1, RBBM_PERFCTR_VSC_1_LO, RBBM_PERFCTR_VSC_1_HI), -}; - -static const struct fd_perfcntr_countable vsc_countables[] = { - COUNTABLE(PERF_VSC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_EOT_NUM, UINT64, AVERAGE), -}; - -/* VBIF counters probably not too userful for userspace, and they make - * frameretrace take many more passes to collect all the metrics, so - * for now let's hide them. - */ -#if 0 -/* VBIF counters break the pattern a bit, with enable and clear regs: */ -static const struct fd_perfcntr_counter vbif_counters[] = { - COUNTER2(VBIF_PERF_CNT_SEL0, VBIF_PERF_CNT_LOW0, VBIF_PERF_CNT_HIGH0, VBIF_PERF_CNT_EN0, VBIF_PERF_CNT_CLR0), - COUNTER2(VBIF_PERF_CNT_SEL1, VBIF_PERF_CNT_LOW1, VBIF_PERF_CNT_HIGH1, VBIF_PERF_CNT_EN1, VBIF_PERF_CNT_CLR1), - COUNTER2(VBIF_PERF_CNT_SEL2, VBIF_PERF_CNT_LOW2, VBIF_PERF_CNT_HIGH2, VBIF_PERF_CNT_EN2, VBIF_PERF_CNT_CLR2), - COUNTER2(VBIF_PERF_CNT_SEL3, VBIF_PERF_CNT_LOW3, VBIF_PERF_CNT_HIGH3, VBIF_PERF_CNT_EN3, VBIF_PERF_CNT_CLR3), -}; - -static const struct fd_perfcntr_countable vbif_countables[] = { - COUNTABLE(AXI_READ_REQUESTS_ID_0, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_1, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_2, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_3, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_4, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_5, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_6, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_7, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_8, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_9, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_10, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_11, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_12, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_13, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_14, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_ID_15, UINT64, AVERAGE), - COUNTABLE(AXI0_READ_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI1_READ_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI2_READ_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI3_READ_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_READ_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_0, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_1, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_2, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_3, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_4, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_5, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_6, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_7, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_8, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_9, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_10, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_11, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_12, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_13, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_14, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_ID_15, UINT64, AVERAGE), - COUNTABLE(AXI0_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI1_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI2_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI3_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_TOTAL_REQUESTS, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_0, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_1, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_2, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_3, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_4, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_5, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_6, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_7, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_8, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_9, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_10, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_11, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_12, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_13, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_14, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_ID_15, UINT64, AVERAGE), - COUNTABLE(AXI0_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI1_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI2_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI3_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_0, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_1, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_2, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_3, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_4, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_5, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_6, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_7, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_8, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_9, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_10, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_11, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_12, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_13, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_14, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_ID_15, UINT64, AVERAGE), - COUNTABLE(AXI0_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI1_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI2_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI3_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE), - COUNTABLE(AXI_DATA_BEATS_TOTAL, UINT64, AVERAGE), -}; -#endif - -const struct fd_perfcntr_group a5xx_perfcntr_groups[] = { - GROUP("CP", cp_counters, cp_countables), - GROUP("CCU", ccu_counters, ccu_countables), - GROUP("TSE", tse_counters, tse_countables), - GROUP("RAS", ras_counters, ras_countables), - GROUP("LRZ", lrz_counters, lrz_countables), - GROUP("HLSQ", hlsq_counters, hlsq_countables), - GROUP("PC", pc_counters, pc_countables), - GROUP("RB", rb_counters, rb_countables), - GROUP("RBBM", rbbm_counters, rbbm_countables), - GROUP("SP", sp_counters, sp_countables), - GROUP("TP", tp_counters, tp_countables), - GROUP("UCHE", uche_counters, uche_countables), - GROUP("VFD", vfd_counters, vfd_countables), - GROUP("VPC", vpc_counters, vpc_countables), - GROUP("VSC", vsc_counters, vsc_countables), -// GROUP("VBIF", vbif_counters, vbif_countables), -}; - -const unsigned a5xx_num_perfcntr_groups = ARRAY_SIZE(a5xx_perfcntr_groups); - -#endif /* FD5_PERFCNTR_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_program.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/bitset.h" #include "freedreno_program.h" @@ -40,43 +40,6 @@ #include "ir3_cache.h" -static struct ir3_shader * -create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, - gl_shader_stage type) -{ - struct fd_context *ctx = fd_context(pctx); - struct ir3_compiler *compiler = ctx->screen->compiler; - return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen); -} - -static void * -fd5_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT); -} - -static void -fd5_fp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - -static void * -fd5_vp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX); -} - -static void -fd5_vp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - ir3_shader_destroy(so); -} - void fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { @@ -357,7 +320,7 @@ face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); zwcoord_regid = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2); - vcoord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PIXEL); + vcoord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. @@ -722,11 +685,6 @@ void fd5_prog_init(struct pipe_context *pctx) { - pctx->create_fs_state = fd5_fp_state_create; - pctx->delete_fs_state = fd5_fp_state_delete; - - pctx->create_vs_state = fd5_vp_state_create; - pctx->delete_vs_state = fd5_vp_state_delete; - + ir3_prog_init(pctx); fd_prog_init(pctx); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_query.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_query.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -433,7 +433,7 @@ counters_per_group[entry->gid]++; } - q = fd_acc_create_query2(ctx, 0, &perfcntr); + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); aq = fd_acc_query(q); /* sample buffer size is based on # of queries: */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_resource.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_resource.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -58,18 +58,17 @@ /* in layer_first layout, the level (slice) contains just one * layer (since in fact the layer contains the slices) */ - uint32_t layers_in_level = rsc->layer_first ? 1 : prsc->array_size; + uint32_t layers_in_level = rsc->layout.layer_first ? 1 : prsc->array_size; - heightalign = tile_alignment[rsc->cpp].heightalign; + heightalign = tile_alignment[rsc->layout.cpp].heightalign; for (level = 0; level <= prsc->last_level; level++) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); - bool linear_level = fd_resource_level_linear(prsc, level); + struct fdl_slice *slice = fd_resource_slice(rsc, level); uint32_t aligned_height = height; uint32_t blocks; - if (rsc->tile_mode && !linear_level) { - pitchalign = tile_alignment[rsc->cpp].pitchalign; + if (fd_resource_tile_mode(prsc, level)) { + pitchalign = tile_alignment[rsc->layout.cpp].pitchalign; aligned_height = align(aligned_height, heightalign); } else { pitchalign = 64; @@ -102,18 +101,18 @@ */ if (prsc->target == PIPE_TEXTURE_3D && ( level == 1 || - (level > 1 && rsc->slices[level - 1].size0 > 0xf000))) - slice->size0 = align(blocks * rsc->cpp, alignment); - else if (level == 0 || rsc->layer_first || alignment == 1) - slice->size0 = align(blocks * rsc->cpp, alignment); + (level > 1 && fd_resource_slice(rsc, level - 1)->size0 > 0xf000))) + slice->size0 = align(blocks * rsc->layout.cpp, alignment); + else if (level == 0 || rsc->layout.layer_first || alignment == 1) + slice->size0 = align(blocks * rsc->layout.cpp, alignment); else - slice->size0 = rsc->slices[level - 1].size0; + slice->size0 = fd_resource_slice(rsc, level - 1)->size0; #if 0 debug_printf("%s: %ux%ux%u@%u: %2u: stride=%4u, size=%7u, aligned_height=%3u\n", util_format_name(prsc->format), - prsc->width0, prsc->height0, prsc->depth0, rsc->cpp, - level, slice->pitch * rsc->cpp, + prsc->width0, prsc->height0, prsc->depth0, rsc->layout.cpp, + level, slice->pitch * rsc->layout.cpp, slice->size0 * depth * layers_in_level, aligned_height); #endif @@ -128,18 +127,49 @@ return size; } +static void +setup_lrz(struct fd_resource *rsc) +{ + struct fd_screen *screen = fd_screen(rsc->base.screen); + const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | + DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ + unsigned lrz_pitch = align(DIV_ROUND_UP(rsc->base.width0, 8), 64); + unsigned lrz_height = DIV_ROUND_UP(rsc->base.height0, 8); + + /* LRZ buffer is super-sampled: */ + switch (rsc->base.nr_samples) { + case 4: + lrz_pitch *= 2; + /* fallthrough */ + case 2: + lrz_height *= 2; + } + + unsigned size = lrz_pitch * lrz_height * 2; + + size += 0x1000; /* for GRAS_LRZ_FAST_CLEAR_BUFFER */ + + rsc->lrz_height = lrz_height; + rsc->lrz_width = lrz_pitch; + rsc->lrz_pitch = lrz_pitch; + rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); +} + uint32_t fd5_setup_slices(struct fd_resource *rsc) { uint32_t alignment; + if ((fd_mesa_debug & FD_DBG_LRZ) && has_depth(rsc->base.format)) + setup_lrz(rsc); + switch (rsc->base.target) { case PIPE_TEXTURE_3D: - rsc->layer_first = false; + rsc->layout.layer_first = false; alignment = 4096; break; default: - rsc->layer_first = true; + rsc->layout.layer_first = true; alignment = 1; break; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd5_screen.h" #include "fd5_blitter.h" @@ -76,9 +76,9 @@ } if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && + (fd5_pipe2tex(format) != (enum a5xx_tex_fmt)~0) && (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12) && - (fd5_pipe2tex(format) != (enum a5xx_tex_fmt)~0)) { + util_format_get_blocksize(format) != 12)) { retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); } @@ -121,9 +121,6 @@ return retval == usage; } -extern const struct fd_perfcntr_group a5xx_perfcntr_groups[]; -extern const unsigned a5xx_num_perfcntr_groups; - void fd5_screen_init(struct pipe_screen *pscreen) { @@ -137,10 +134,5 @@ if (fd_mesa_debug & FD_DBG_TTILE) screen->tile_mode = fd5_tile_mode; - if (fd_mesa_debug & FD_DBG_PERFC) { - screen->perfcntr_groups = a5xx_perfcntr_groups; - screen->num_perfcntr_groups = a5xx_num_perfcntr_groups; - } - fd5_emit_init_screen(pscreen); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_texture.c mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_texture.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a5xx/fd5_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a5xx/fd5_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd5_texture.h" #include "fd5_format.h" @@ -198,6 +198,7 @@ { struct fd5_pipe_sampler_view *so = CALLOC_STRUCT(fd5_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); + struct fdl_slice *slice = NULL; enum pipe_format format = cso->format; unsigned lvl, layers = 0; @@ -249,12 +250,13 @@ A5XX_TEX_CONST_1_HEIGHT(1); so->texconst2 = A5XX_TEX_CONST_2_FETCHSIZE(fd5_pipe2fetchsize(format)) | - A5XX_TEX_CONST_2_PITCH(elements * rsc->cpp); + A5XX_TEX_CONST_2_PITCH(elements * rsc->layout.cpp); so->offset = cso->u.buf.offset; } else { unsigned miplevels; lvl = fd_sampler_first_level(cso); + slice = fd_resource_slice(rsc, lvl); miplevels = fd_sampler_last_level(cso) - lvl; layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; @@ -265,8 +267,7 @@ so->texconst2 = A5XX_TEX_CONST_2_FETCHSIZE(fd5_pipe2fetchsize(format)) | A5XX_TEX_CONST_2_PITCH( - util_format_get_nblocksx( - format, rsc->slices[lvl].pitch) * rsc->cpp); + util_format_get_nblocksx(format, slice->pitch) * rsc->layout.cpp); so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); } @@ -277,27 +278,27 @@ case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A5XX_TEX_CONST_5_DEPTH(1); break; case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A5XX_TEX_CONST_5_DEPTH(layers); break; case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY: so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A5XX_TEX_CONST_5_DEPTH(layers / 6); break; case PIPE_TEXTURE_3D: so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->slices[lvl].size0); + A5XX_TEX_CONST_3_ARRAY_PITCH(slice->size0); so->texconst5 = A5XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); break; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,6 +33,7 @@ #include "fd6_blend.h" #include "fd6_context.h" #include "fd6_format.h" +#include "fd6_pack.h" // XXX move somewhere common.. same across a3xx/a4xx/a5xx.. static enum a3xx_rb_blend_opcode @@ -59,6 +60,7 @@ fd6_blend_state_create(struct pipe_context *pctx, const struct pipe_blend_state *cso) { + struct fd_context *ctx = fd_context(pctx); struct fd6_blend_stateobj *so; enum a3xx_rop_code rop = ROP_COPY; bool reads_dest = false; @@ -90,8 +92,13 @@ return NULL; so->base = *cso; + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, + ((A6XX_MAX_RENDER_TARGETS * 4) + 4) * 4); + so->stateobj = ring; - for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { + so->lrz_write = true; /* unless blend enabled for any MRT */ + + for (i = 0; i < A6XX_MAX_RENDER_TARGETS; i++) { const struct pipe_rt_blend_state *rt; if (cso->independent_blend_enable) @@ -99,58 +106,64 @@ else rt = &cso->rt[0]; - so->rb_mrt[i].blend_control_rgb = - A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | - A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)); - - so->rb_mrt[i].blend_control_alpha = - A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | - A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | - A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - - so->rb_mrt[i].blend_control_no_alpha_rgb = - A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_src_factor))) | - A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(util_blend_dst_alpha_to_one(rt->rgb_dst_factor))); - - - so->rb_mrt[i].control = - A6XX_RB_MRT_CONTROL_ROP_CODE(rop) | - COND(cso->logicop_enable, A6XX_RB_MRT_CONTROL_ROP_ENABLE) | - A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); + OUT_REG(ring, A6XX_RB_MRT_BLEND_CONTROL(i, + .rgb_src_factor = fd_blend_factor(rt->rgb_src_factor), + .rgb_blend_opcode = blend_func(rt->rgb_func), + .rgb_dest_factor = fd_blend_factor(rt->rgb_dst_factor), + .alpha_src_factor = fd_blend_factor(rt->alpha_src_factor), + .alpha_blend_opcode = blend_func(rt->alpha_func), + .alpha_dest_factor = fd_blend_factor(rt->alpha_dst_factor), + )); + + OUT_REG(ring, A6XX_RB_MRT_CONTROL(i, + .rop_code = rop, + .rop_enable = cso->logicop_enable, + .component_enable = rt->colormask, + .blend = rt->blend_enable, + .blend2 = rt->blend_enable, + )); if (rt->blend_enable) { - so->rb_mrt[i].control |= -// A6XX_RB_MRT_CONTROL_READ_DEST_ENABLE | - A6XX_RB_MRT_CONTROL_BLEND | - A6XX_RB_MRT_CONTROL_BLEND2; mrt_blend |= (1 << i); + so->lrz_write = false; } if (reads_dest) { -// so->rb_mrt[i].control |= A6XX_RB_MRT_CONTROL_READ_DEST_ENABLE; mrt_blend |= (1 << i); + so->lrz_write = false; } } - if (cso->dither) { - so->rb_dither_cntl = A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT0(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT1(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT2(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT3(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT4(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT5(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT6(DITHER_ALWAYS) | - A6XX_RB_DITHER_CNTL_DITHER_MODE_MRT7(DITHER_ALWAYS); - } + OUT_REG(ring, A6XX_RB_DITHER_CNTL( + .dither_mode_mrt0 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt1 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt2 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt3 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt4 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt5 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt6 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt7 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + )); so->rb_blend_cntl = A6XX_RB_BLEND_CNTL_ENABLE_BLEND(mrt_blend) | COND(cso->alpha_to_coverage, A6XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE) | COND(cso->independent_blend_enable, A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND); - so->sp_blend_cntl = A6XX_SP_BLEND_CNTL_UNK8 | - COND(cso->alpha_to_coverage, A6XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE) | - COND(mrt_blend, A6XX_SP_BLEND_CNTL_ENABLED); + + OUT_REG(ring, A6XX_SP_BLEND_CNTL( + .unk8 = true, + .alpha_to_coverage = cso->alpha_to_coverage, + .enabled = !!mrt_blend, + )); return so; } + +void +fd6_blend_state_delete(struct pipe_context *pctx, void *hwcso) +{ + struct fd6_blend_stateobj *so = hwcso; + + fd_ringbuffer_del(so->stateobj); + + FREE(hwcso); +} diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,24 +31,16 @@ #include "pipe/p_state.h" #include "pipe/p_context.h" +#include "freedreno_context.h" #include "freedreno_util.h" struct fd6_blend_stateobj { struct pipe_blend_state base; - struct { - uint32_t control; - uint32_t buf_info; - /* Blend control bits for color if there is an alpha channel */ - uint32_t blend_control_rgb; - /* Blend control bits for color if there is no alpha channel */ - uint32_t blend_control_no_alpha_rgb; - /* Blend control bits for alpha channel */ - uint32_t blend_control_alpha; - } rb_mrt[A6XX_MAX_RENDER_TARGETS]; uint32_t rb_blend_cntl; - uint32_t rb_dither_cntl; - uint32_t sp_blend_cntl; + + bool lrz_write; + struct fd_ringbuffer *stateobj; }; static inline struct fd6_blend_stateobj * @@ -59,5 +51,6 @@ void * fd6_blend_state_create(struct pipe_context *pctx, const struct pipe_blend_state *cso); +void fd6_blend_state_delete(struct pipe_context *, void *hwcso); #endif /* FD6_BLEND_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,6 +26,7 @@ */ #include "util/u_dump.h" +#include "util/half_float.h" #include "freedreno_blitter.h" #include "freedreno_fence.h" @@ -34,6 +35,8 @@ #include "fd6_blitter.h" #include "fd6_format.h" #include "fd6_emit.h" +#include "fd6_resource.h" +#include "fd6_pack.h" /* Make sure none of the requested dimensions extend beyond the size of the * resource. Not entirely sure why this happens, but sometimes it does, and @@ -79,7 +82,9 @@ return true; } +#define DEBUG_BLIT 0 #define DEBUG_BLIT_FALLBACK 0 + #define fail_if(cond) \ do { \ if (cond) { \ @@ -108,13 +113,8 @@ fail_if(!ok_format(info->src.format)); fail_if(!ok_format(info->dst.format)); - /* We can blit if both or neither formats are compressed formats... */ - fail_if(util_format_is_compressed(info->src.format) != - util_format_is_compressed(info->src.format)); - - /* ... but only if they're the same compression format. */ - fail_if(util_format_is_compressed(info->src.format) && - info->src.format != info->dst.format); + debug_assert(!util_format_is_compressed(info->src.format)); + debug_assert(!util_format_is_compressed(info->dst.format)); fail_if(!ok_dims(info->src.resource, &info->src.box, info->src.level)); @@ -124,34 +124,10 @@ debug_assert(info->dst.box.height >= 0); debug_assert(info->dst.box.depth >= 0); - /* We could probably blit between resources with equal sample count.. */ fail_if(info->dst.resource->nr_samples > 1); - /* CP_BLIT supports resolving, but seems to pick one only of the samples - * (no blending). This doesn't work for RGBA resolves, so we fall back in - * that case. However, GL/GLES spec says: - * - * "If the source formats are integer types or stencil values, a single - * sample’s value is selected for each pixel. If the source formats are - * floating-point or normalized types, the sample values for each pixel - * are resolved in an implementationdependent manner. If the source - * formats are depth values, sample values are resolved in an - * implementation-dependent manner where the result will be between the - * minimum and maximum depth values in the pixel." - * - * so do those with CP_BLIT. - * - * TODO since we re-write z/s blits to RGBA, we'll fail this check in some - * cases where we don't need to. - */ - fail_if((info->mask & PIPE_MASK_RGBA) && - info->src.resource->nr_samples > 1); - fail_if(info->window_rectangle_include); - fail_if(util_format_is_srgb(info->src.format)); - fail_if(util_format_is_srgb(info->dst.format)); - const struct util_format_description *src_desc = util_format_description(info->src.format); const struct util_format_description *dst_desc = @@ -183,26 +159,33 @@ } static uint32_t -blit_control(enum a6xx_color_fmt fmt) +blit_control(enum a6xx_color_fmt fmt, bool is_srgb) { - unsigned blit_cntl = 0xf00000; - blit_cntl |= A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt); - blit_cntl |= A6XX_RB_2D_BLIT_CNTL_IFMT(fd6_ifmt(fmt)); - return blit_cntl; + enum a6xx_2d_ifmt ifmt = fd6_ifmt(fmt); + + if (is_srgb) { + assert(ifmt == R2D_UNORM8); + ifmt = R2D_UNORM8_SRGB; + } + + return A6XX_RB_2D_BLIT_CNTL_MASK(0xf) | + A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt) | + A6XX_RB_2D_BLIT_CNTL_IFMT(ifmt); } /* buffers need to be handled specially since x/width can exceed the bounds * supported by hw.. if necessary decompose into (potentially) two 2D blits */ static void -emit_blit_buffer(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) +emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring, + const struct pipe_blit_info *info) { const struct pipe_box *sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; struct fd_resource *src, *dst; unsigned sshift, dshift; - if (DEBUG_BLIT_FALLBACK) { + if (DEBUG_BLIT) { fprintf(stderr, "buffer blit: "); util_dump_blit_info(stderr, info); fprintf(stderr, "\ndst resource: "); @@ -215,8 +198,8 @@ src = fd_resource(info->src.resource); dst = fd_resource(info->dst.resource); - debug_assert(src->cpp == 1); - debug_assert(dst->cpp == 1); + debug_assert(src->layout.cpp == 1); + debug_assert(dst->layout.cpp == 1); debug_assert(info->src.resource->format == info->dst.resource->format); debug_assert((sbox->y == 0) && (sbox->height == 1)); debug_assert((dbox->y == 0) && (dbox->height == 1)); @@ -250,7 +233,7 @@ OUT_PKT7(ring, CP_SET_MARKER, 1); OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - uint32_t blit_cntl = blit_control(RB6_R8_UNORM) | 0x20000000; + uint32_t blit_cntl = blit_control(RB6_R8_UNORM, false) | 0x20000000; OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1); OUT_RING(ring, blit_cntl); @@ -327,7 +310,7 @@ OUT_RING(ring, 0xf180); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0x01000000); + OUT_RING(ring, fd6_context(ctx)->magic.RB_UNKNOWN_8E04_blit); OUT_PKT7(ring, CP_BLIT, 1); OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); @@ -335,17 +318,18 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ } } static void -emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) +emit_blit_or_clear_texture(struct fd_context *ctx, struct fd_ringbuffer *ring, + const struct pipe_blit_info *info, union pipe_color_union *color) { const struct pipe_box *sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; struct fd_resource *src, *dst; - struct fd_resource_slice *sslice, *dslice; + struct fdl_slice *sslice, *dslice; enum a6xx_color_fmt sfmt, dfmt; enum a6xx_tile_mode stile, dtile; enum a3xx_color_swap sswap, dswap; @@ -353,7 +337,7 @@ int sx1, sy1, sx2, sy2; int dx1, dy1, dx2, dy2; - if (DEBUG_BLIT_FALLBACK) { + if (DEBUG_BLIT) { fprintf(stderr, "texture blit: "); util_dump_blit_info(stderr, info); fprintf(stderr, "\ndst resource: "); @@ -372,48 +356,101 @@ sfmt = fd6_pipe2color(info->src.format); dfmt = fd6_pipe2color(info->dst.format); - int blocksize = util_format_get_blocksize(info->src.format); - int blockwidth = util_format_get_blockwidth(info->src.format); - int blockheight = util_format_get_blockheight(info->src.format); - int nelements; - - stile = fd_resource_level_linear(info->src.resource, info->src.level) ? - TILE6_LINEAR : src->tile_mode; - dtile = fd_resource_level_linear(info->dst.resource, info->dst.level) ? - TILE6_LINEAR : dst->tile_mode; - - sswap = stile ? WZYX : fd6_pipe2swap(info->src.format); - dswap = dtile ? WZYX : fd6_pipe2swap(info->dst.format); - - if (util_format_is_compressed(info->src.format)) { - debug_assert(info->src.format == info->dst.format); - sfmt = dfmt = RB6_R8_UNORM; - nelements = blocksize; - } else { - debug_assert(!util_format_is_compressed(info->dst.format)); - nelements = 1; - } + stile = fd_resource_tile_mode(info->src.resource, info->src.level); + dtile = fd_resource_tile_mode(info->dst.resource, info->dst.level); - spitch = DIV_ROUND_UP(sslice->pitch, blockwidth) * src->cpp; - dpitch = DIV_ROUND_UP(dslice->pitch, blockwidth) * dst->cpp; + /* Linear levels of a tiled resource are always WZYX, so look at + * rsc->tile_mode to determine the swap. + */ + sswap = fd6_resource_swap(src, info->src.format); + dswap = fd6_resource_swap(dst, info->dst.format); - sx1 = sbox->x / blockwidth * nelements; - sy1 = sbox->y / blockheight; - sx2 = DIV_ROUND_UP(sbox->x + sbox->width, blockwidth) * nelements - 1; - sy2 = DIV_ROUND_UP(sbox->y + sbox->height, blockheight) - 1; + /* Use the underlying resource format so that we get the right block width + * for compressed textures. + */ + spitch = util_format_get_nblocksx(src->base.format, sslice->pitch) * src->layout.cpp; + dpitch = util_format_get_nblocksx(dst->base.format, dslice->pitch) * dst->layout.cpp; - dx1 = dbox->x / blockwidth * nelements; - dy1 = dbox->y / blockheight; - dx2 = DIV_ROUND_UP(dbox->x + dbox->width, blockwidth) * nelements - 1; - dy2 = DIV_ROUND_UP(dbox->y + dbox->height, blockheight) - 1; + uint32_t nr_samples = fd_resource_nr_samples(&dst->base); + sx1 = sbox->x * nr_samples; + sy1 = sbox->y; + sx2 = (sbox->x + sbox->width) * nr_samples - 1; + sy2 = sbox->y + sbox->height - 1; + + dx1 = dbox->x * nr_samples; + dy1 = dbox->y; + dx2 = (dbox->x + dbox->width) * nr_samples - 1; + dy2 = dbox->y + dbox->height - 1; - uint32_t width = DIV_ROUND_UP(u_minify(src->base.width0, info->src.level), blockwidth) * nelements; - uint32_t height = DIV_ROUND_UP(u_minify(src->base.height0, info->src.level), blockheight); + uint32_t width = u_minify(src->base.width0, info->src.level) * nr_samples; + uint32_t height = u_minify(src->base.height0, info->src.level); OUT_PKT7(ring, CP_SET_MARKER, 1); OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - uint32_t blit_cntl = blit_control(dfmt); + uint32_t blit_cntl = blit_control(dfmt, util_format_is_srgb(info->dst.format)); + + if (color) { + blit_cntl |= A6XX_RB_2D_BLIT_CNTL_SOLID_COLOR; + + switch (info->dst.format) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X24S8_UINT: { + uint32_t depth_unorm24 = color->f[0] * ((1u << 24) - 1); + uint8_t stencil = color->ui[1]; + color->ui[0] = depth_unorm24 & 0xff; + color->ui[1] = (depth_unorm24 >> 8) & 0xff; + color->ui[2] = (depth_unorm24 >> 16) & 0xff; + color->ui[3] = stencil; + + dfmt = RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + break; + } + case PIPE_FORMAT_B5G6R5_UNORM: + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_B5G5R5X1_UNORM: + case PIPE_FORMAT_B4G4R4A4_UNORM: + color->ui[0] = float_to_ubyte(color->f[0]); + color->ui[1] = float_to_ubyte(color->f[1]); + color->ui[2] = float_to_ubyte(color->f[2]); + color->ui[3] = float_to_ubyte(color->f[3]); + break; + default: + break; + } + + OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + + switch (fd6_ifmt(dfmt)) { + case R2D_UNORM8: + case R2D_UNORM8_SRGB: + OUT_RING(ring, float_to_ubyte(color->f[0])); + OUT_RING(ring, float_to_ubyte(color->f[1])); + OUT_RING(ring, float_to_ubyte(color->f[2])); + OUT_RING(ring, float_to_ubyte(color->f[3])); + break; + case R2D_FLOAT16: + OUT_RING(ring, _mesa_float_to_half(color->f[0])); + OUT_RING(ring, _mesa_float_to_half(color->f[1])); + OUT_RING(ring, _mesa_float_to_half(color->f[2])); + OUT_RING(ring, _mesa_float_to_half(color->f[3])); + sfmt = RB6_R16G16B16A16_FLOAT; + break; + + case R2D_FLOAT32: + case R2D_INT32: + case R2D_INT16: + case R2D_INT8: + case R2D_RAW: + default: + OUT_RING(ring, color->ui[0]); + OUT_RING(ring, color->ui[1]); + OUT_RING(ring, color->ui[2]); + OUT_RING(ring, color->ui[3]); + break; + } + } if (dtile != stile) blit_cntl |= 0x20000000; @@ -436,8 +473,6 @@ for (unsigned i = 0; i < info->dst.box.depth; i++) { unsigned soff = fd_resource_offset(src, info->src.level, sbox->z + i); unsigned doff = fd_resource_offset(dst, info->dst.level, dbox->z + i); - unsigned subwcoff = fd_resource_ubwc_offset(src, info->src.level, sbox->z + i); - unsigned dubwcoff = fd_resource_ubwc_offset(dst, info->dst.level, dbox->z + i); bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level); bool dubwc_enabled = fd_resource_ubwc_enabled(dst, info->dst.level); @@ -450,13 +485,19 @@ enum a3xx_msaa_samples samples = fd_msaa_samples(src->base.nr_samples); + if (sfmt == RB6_R10G10B10A2_UNORM) + sfmt = RB6_R10G10B10A2_FLOAT16; + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(stile) | A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(sswap) | - A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | - COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) | - 0x500000 | filter); + A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | + COND(samples > MSAA_ONE && (info->mask & PIPE_MASK_RGBA), + A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | + COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) | + COND(util_format_is_srgb(info->src.format), A6XX_SP_PS_2D_SRC_INFO_SRGB) | + 0x500000 | filter); OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) | A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height)); /* SP_PS_2D_SRC_SIZE */ OUT_RELOC(ring, src->bo, soff, 0, 0); /* SP_PS_2D_SRC_LO/HI */ @@ -470,9 +511,7 @@ if (subwc_enabled) { OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 6); - OUT_RELOC(ring, src->bo, subwcoff, 0, 0); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_FLAGS_PITCH_PITCH(src->ubwc_pitch) | - A6XX_SP_PS_2D_SRC_FLAGS_PITCH_ARRAY_PITCH(src->ubwc_size)); + fd6_emit_flag_reference(ring, src, info->src.level, sbox->z + i); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -485,6 +524,7 @@ OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) | A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap) | + COND(util_format_is_srgb(info->dst.format), A6XX_RB_2D_DST_INFO_SRGB) | COND(dubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS)); OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch)); @@ -496,9 +536,7 @@ if (dubwc_enabled) { OUT_PKT4(ring, REG_A6XX_RB_2D_DST_FLAGS_LO, 6); - OUT_RELOCW(ring, dst->bo, dubwcoff, 0, 0); - OUT_RING(ring, A6XX_RB_2D_DST_FLAGS_PITCH_PITCH(dst->ubwc_pitch) | - A6XX_RB_2D_DST_FLAGS_PITCH_ARRAY_PITCH(dst->ubwc_size)); + fd6_emit_flag_reference(ring, dst, info->dst.level, dbox->z + i); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -524,6 +562,13 @@ OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8C01, 1); OUT_RING(ring, 0); + if (dfmt == RB6_R10G10B10A2_UNORM) + sfmt = RB6_R16G16B16A16_FLOAT; + + /* This register is probably badly named... it seems that it's + * controlling the internal/accumulator format or something like + * that. It's certainly not tied to only the src format. + */ OUT_PKT4(ring, REG_A6XX_SP_2D_SRC_FORMAT, 1); OUT_RING(ring, A6XX_SP_2D_SRC_FORMAT_COLOR_FORMAT(sfmt) | COND(util_format_is_pure_sint(info->src.format), @@ -537,10 +582,11 @@ // TODO sometimes blob uses UINT+NORM but dEQP seems unhappy about that // A6XX_SP_2D_SRC_FORMAT_UINT | A6XX_SP_2D_SRC_FORMAT_NORM) | - 0xf000); + COND(util_format_is_srgb(info->dst.format), A6XX_SP_2D_SRC_FORMAT_SRGB) | + A6XX_SP_2D_SRC_FORMAT_MASK(0xf)); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0x01000000); + OUT_RING(ring, fd6_context(ctx)->magic.RB_UNKNOWN_8E04_blit); OUT_PKT7(ring, CP_BLIT, 1); OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); @@ -548,11 +594,89 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ } } -static bool handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info); +void +fd6_clear_surface(struct fd_context *ctx, + struct fd_ringbuffer *ring, struct pipe_surface *psurf, + uint32_t width, uint32_t height, union pipe_color_union *color) +{ + struct pipe_blit_info info = {}; + + info.dst.resource = psurf->texture; + info.dst.level = psurf->u.tex.level; + info.dst.box.x = 0; + info.dst.box.y = 0; + info.dst.box.z = psurf->u.tex.first_layer; + info.dst.box.width = width; + info.dst.box.height = height; + info.dst.box.depth = psurf->u.tex.last_layer + 1 - psurf->u.tex.first_layer; + info.dst.format = psurf->format; + info.src = info.dst; + info.mask = util_format_get_mask(psurf->format); + info.filter = PIPE_TEX_FILTER_NEAREST; + info.scissor_enable = 0; + + emit_blit_or_clear_texture(ctx, ring, &info, color); +} + +static bool +handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info) +{ + struct fd_batch *batch; + + debug_assert(!(info->mask & PIPE_MASK_ZS)); + + if (!can_do_blit(info)) + return false; + + batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); + + fd6_emit_restore(batch, batch->draw); + fd6_emit_lrz_flush(batch->draw); + + mtx_lock(&ctx->screen->lock); + + fd_batch_resource_used(batch, fd_resource(info->src.resource), false); + fd_batch_resource_used(batch, fd_resource(info->dst.resource), true); + + mtx_unlock(&ctx->screen->lock); + + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_write()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); + + emit_setup(batch); + + if ((info->src.resource->target == PIPE_BUFFER) && + (info->dst.resource->target == PIPE_BUFFER)) { + assert(fd_resource(info->src.resource)->layout.tile_mode == TILE6_LINEAR); + assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE6_LINEAR); + emit_blit_buffer(ctx, batch->draw, info); + } else { + /* I don't *think* we need to handle blits between buffer <-> !buffer */ + debug_assert(info->src.resource->target != PIPE_BUFFER); + debug_assert(info->dst.resource->target != PIPE_BUFFER); + emit_blit_or_clear_texture(ctx, batch->draw, info, NULL); + } + + fd6_event_write(batch, batch->draw, 0x1d, true); + fd6_event_write(batch, batch->draw, FACENESS_FLUSH, true); + fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); + fd6_cache_inv(batch, batch->draw); + + fd_resource(info->dst.resource)->valid = true; + batch->needs_flush = true; + + fd_batch_flush(batch); + fd_batch_reference(&batch, NULL); + + return true; +} /** * Re-written z/s blits can still fail for various reasons (for example MSAA). @@ -579,7 +703,7 @@ { struct pipe_blit_info blit = *info; - if (DEBUG_BLIT_FALLBACK) { + if (DEBUG_BLIT) { fprintf(stderr, "---- handle_zs_blit: "); util_dump_blit_info(stderr, info); fprintf(stderr, "\ndst resource: "); @@ -647,55 +771,44 @@ } static bool -handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info) +handle_compressed_blit(struct fd_context *ctx, const struct pipe_blit_info *info) { - struct fd_batch *batch; - - debug_assert(!(info->mask & PIPE_MASK_ZS)); - - if (!can_do_blit(info)) - return false; - - fd_fence_ref(&ctx->last_fence, NULL); - - batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); - - fd6_emit_restore(batch, batch->draw); - fd6_emit_lrz_flush(batch->draw); - - mtx_lock(&ctx->screen->lock); - - fd_batch_resource_used(batch, fd_resource(info->src.resource), false); - fd_batch_resource_used(batch, fd_resource(info->dst.resource), true); + struct pipe_blit_info blit = *info; - mtx_unlock(&ctx->screen->lock); + if (DEBUG_BLIT) { + fprintf(stderr, "---- handle_compressed_blit: "); + util_dump_blit_info(stderr, info); + fprintf(stderr, "\ndst resource: "); + util_dump_resource(stderr, info->dst.resource); + fprintf(stderr, "\nsrc resource: "); + util_dump_resource(stderr, info->src.resource); + fprintf(stderr, "\n"); + } - emit_setup(batch); + if (info->src.format != info->dst.format) + return fd_blitter_blit(ctx, info); - if ((info->src.resource->target == PIPE_BUFFER) && - (info->dst.resource->target == PIPE_BUFFER)) { - assert(fd_resource(info->src.resource)->tile_mode == TILE6_LINEAR); - assert(fd_resource(info->dst.resource)->tile_mode == TILE6_LINEAR); - emit_blit_buffer(batch->draw, info); + if (util_format_get_blocksize(info->src.format) == 8) { + blit.src.format = blit.dst.format = PIPE_FORMAT_R16G16B16A16_UINT; } else { - /* I don't *think* we need to handle blits between buffer <-> !buffer */ - debug_assert(info->src.resource->target != PIPE_BUFFER); - debug_assert(info->dst.resource->target != PIPE_BUFFER); - emit_blit_texture(batch->draw, info); + debug_assert(util_format_get_blocksize(info->src.format) == 16); + blit.src.format = blit.dst.format = PIPE_FORMAT_R32G32B32A32_UINT; } - fd6_event_write(batch, batch->draw, 0x1d, true); - fd6_event_write(batch, batch->draw, FACENESS_FLUSH, true); - fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); - fd6_cache_inv(batch, batch->draw); + int bw = util_format_get_blockwidth(info->src.format); + int bh = util_format_get_blockheight(info->src.format); - fd_resource(info->dst.resource)->valid = true; - batch->needs_flush = true; + blit.src.box.x /= bw; + blit.src.box.y /= bh; + blit.src.box.width /= bw; + blit.src.box.height /= bh; - fd_batch_flush(batch, false); - fd_batch_reference(&batch, NULL); + blit.dst.box.x /= bw; + blit.dst.box.y /= bh; + blit.dst.box.width /= bw; + blit.dst.box.height /= bh; - return true; + return do_rewritten_blit(ctx, &blit); } static bool @@ -703,6 +816,10 @@ { if (info->mask & PIPE_MASK_ZS) return handle_zs_blit(ctx, info); + if (util_format_is_compressed(info->src.format) || + util_format_is_compressed(info->dst.format)) + return handle_compressed_blit(ctx, info); + return handle_rgba_blit(ctx, info); } @@ -718,6 +835,12 @@ unsigned fd6_tile_mode(const struct pipe_resource *tmpl) { + /* if the mipmap level 0 is still too small to be tiled, then don't + * bother pretending: + */ + if (fd_resource_level_linear(tmpl, 0)) + return TILE6_LINEAR; + /* basically just has to be a format we can blit, so uploads/downloads * via linear staging buffer works: */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h 2020-06-12 01:21:17.000000000 +0000 @@ -35,4 +35,9 @@ void fd6_blitter_init(struct pipe_context *pctx); unsigned fd6_tile_mode(const struct pipe_resource *tmpl); +void +fd6_clear_surface(struct fd_context *ctx, + struct fd_ringbuffer *ring, struct pipe_surface *psurf, + uint32_t width, uint32_t height, union pipe_color_union *color); + #endif /* FD6_BLIT_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_compute.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_compute.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -86,7 +86,8 @@ OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2); OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED | - A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo) | + A6XX_SP_CS_CONFIG_NIBO(v->shader->nir->info.num_ssbos + + v->shader->nir->info.num_images) | A6XX_SP_CS_CONFIG_NTEX(v->num_samp) | A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */ OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_context.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_context.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -66,14 +66,19 @@ } static const uint8_t primtypes[] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, - [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, + [PIPE_PRIM_LINES_ADJACENCY] = DI_PT_LINE_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = DI_PT_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = DI_PT_TRI_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = DI_PT_PATCHES0, + [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ }; struct pipe_context * @@ -86,6 +91,50 @@ if (!fd6_ctx) return NULL; + + switch (screen->gpu_id) { + case 618: +/* +GRAS_BIN_CONTROL: +RB_BIN_CONTROL: + - a618 doesn't appear to set .USE_VIZ; also bin size diffs + +RB_CCU_CNTL: + - 0x3c400004 -> 0x3e400004 + - 0x10000000 -> 0x08000000 + +RB_UNKNOWN_8E04: <-- see stencil-0000.rd.gz + - 0x01000000 -> 0x00100000 + +SP_UNKNOWN_A0F8: +PC_UNKNOWN_9805: + - 0x1 -> 0 + */ + fd6_ctx->magic.RB_UNKNOWN_8E04_blit = 0x00100000; + fd6_ctx->magic.RB_CCU_CNTL_gmem = 0x3e400004; + fd6_ctx->magic.RB_CCU_CNTL_bypass = 0x08000000; + fd6_ctx->magic.PC_UNKNOWN_9805 = 0x0; + fd6_ctx->magic.SP_UNKNOWN_A0F8 = 0x0; + break; + case 630: + fd6_ctx->magic.RB_UNKNOWN_8E04_blit = 0x01000000; + // NOTE: newer blob using 0x3c400004, need to revisit: + fd6_ctx->magic.RB_CCU_CNTL_gmem = 0x7c400004; + fd6_ctx->magic.RB_CCU_CNTL_bypass = 0x10000000; + fd6_ctx->magic.PC_UNKNOWN_9805 = 0x1; + fd6_ctx->magic.SP_UNKNOWN_A0F8 = 0x1; + break; + case 640: + fd6_ctx->magic.RB_UNKNOWN_8E04_blit = 0x00100000; + fd6_ctx->magic.RB_CCU_CNTL_gmem = 0x7c400000; + fd6_ctx->magic.RB_CCU_CNTL_bypass = 0x10000000; + fd6_ctx->magic.PC_UNKNOWN_9805 = 0x1; + fd6_ctx->magic.SP_UNKNOWN_A0F8 = 0x1; + break; + default: + unreachable("missing magic config"); + } + pctx = &fd6_ctx->base.base; pctx->screen = pscreen; @@ -116,6 +165,7 @@ /* fd_context_init overwrites delete_rasterizer_state, so set this * here. */ pctx->delete_rasterizer_state = fd6_rasterizer_state_delete; + pctx->delete_blend_state = fd6_blend_state_delete; pctx->delete_depth_stencil_alpha_state = fd6_depth_stencil_alpha_state_delete; /* initial sizes for VSC buffers (or rather the per-pipe sizes diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_context.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_context.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -91,6 +91,17 @@ uint16_t tex_seqno; struct hash_table *tex_cache; + + /* collection of magic register values which differ between + * various different a6xx + */ + struct { + uint32_t RB_UNKNOWN_8E04_blit; /* value for CP_BLIT's */ + uint32_t RB_CCU_CNTL_bypass; /* for sysmem rendering */ + uint32_t RB_CCU_CNTL_gmem; /* for GMEM rendering */ + uint32_t PC_UNKNOWN_9805; + uint32_t SP_UNKNOWN_A0F8; + } magic; }; static inline struct fd6_context * @@ -107,13 +118,19 @@ struct fd6_control { uint32_t seqno; /* seqno for async CP_EVENT_WRITE, etc */ uint32_t _pad0; - uint32_t flush_base; /* dummy address for VPC_SO[i].FLUSH_BASE_LO/HI */ + volatile uint32_t vsc_overflow; uint32_t _pad1; /* flag set from cmdstream when VSC overflow detected: */ - volatile uint32_t vsc_overflow; - uint32_t _pad2; uint32_t vsc_scratch; + uint32_t _pad2; uint32_t _pad3; + uint32_t _pad4; + + /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ + struct { + uint32_t offset; + uint32_t pad[7]; + } flush_base[4]; }; #define control_ptr(fd6_ctx, member) \ @@ -125,8 +142,16 @@ { extern unsigned marker_cnt; unsigned reg = REG_A6XX_CP_SCRATCH_REG(scratch_idx); - OUT_PKT4(ring, reg, 1); - OUT_RING(ring, ++marker_cnt); +#ifdef DEBUG +# define __EMIT_MARKER 1 +#else +# define __EMIT_MARKER 0 +#endif + if (__EMIT_MARKER) { + OUT_WFI5(ring); + OUT_PKT4(ring, reg, 1); + OUT_RING(ring, ++marker_cnt); + } } #endif /* FD6_CONTEXT_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -118,6 +118,19 @@ } } +static void +fixup_draw_state(struct fd_context *ctx, struct fd6_emit *emit) +{ + if (ctx->last.dirty || + (ctx->last.primitive_restart != emit->primitive_restart)) { + /* rasterizer state is effected by primitive-restart: */ + ctx->dirty |= FD_DIRTY_RASTERIZER; + ctx->last.primitive_restart = emit->primitive_restart; + } + + ctx->last.dirty = false; +} + static bool fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, unsigned index_offset) @@ -128,8 +141,9 @@ .vtx = &ctx->vtx, .info = info, .key = { - .vs = ctx->prog.vp, - .fs = ctx->prog.fp, + .vs = ctx->prog.vs, + .gs = ctx->prog.gs, + .fs = ctx->prog.fs, .key = { .color_two_side = ctx->rasterizer->light_twoside, .vclamp_color = ctx->rasterizer->clamp_vertex_color, @@ -152,8 +166,32 @@ .rasterflat = ctx->rasterizer->flatshade, .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, + .primitive_restart = info->primitive_restart && info->index_size, }; + if (info->mode == PIPE_PRIM_PATCHES) { + emit.key.hs = ctx->prog.hs; + emit.key.ds = ctx->prog.ds; + + shader_info *ds_info = &emit.key.ds->nir->info; + switch (ds_info->tess.primitive_mode) { + case GL_ISOLINES: + emit.key.key.tessellation = IR3_TESS_ISOLINES; + break; + case GL_TRIANGLES: + emit.key.key.tessellation = IR3_TESS_TRIANGLES; + break; + case GL_QUADS: + emit.key.key.tessellation = IR3_TESS_QUADS; + break; + default: + unreachable("bad tessmode"); + } + } + + if (emit.key.gs) + emit.key.key.has_gs = true; + fixup_shader_state(ctx, &emit.key.key); if (!(ctx->dirty & FD_DIRTY_PROG)) { @@ -169,31 +207,95 @@ emit.dirty = ctx->dirty; /* *after* fixup_shader_state() */ emit.bs = fd6_emit_get_prog(&emit)->bs; emit.vs = fd6_emit_get_prog(&emit)->vs; + emit.hs = fd6_emit_get_prog(&emit)->hs; + emit.ds = fd6_emit_get_prog(&emit)->ds; + emit.gs = fd6_emit_get_prog(&emit)->gs; emit.fs = fd6_emit_get_prog(&emit)->fs; - const struct ir3_shader_variant *vp = emit.vs; - const struct ir3_shader_variant *fp = emit.fs; - - ctx->stats.vs_regs += ir3_shader_halfregs(vp); - ctx->stats.fs_regs += ir3_shader_halfregs(fp); + ctx->stats.vs_regs += ir3_shader_halfregs(emit.vs); + ctx->stats.hs_regs += COND(emit.hs, ir3_shader_halfregs(emit.hs)); + ctx->stats.ds_regs += COND(emit.ds, ir3_shader_halfregs(emit.ds)); + ctx->stats.gs_regs += COND(emit.gs, ir3_shader_halfregs(emit.gs)); + ctx->stats.fs_regs += ir3_shader_halfregs(emit.fs); /* figure out whether we need to disable LRZ write for binning - * pass using draw pass's fp: + * pass using draw pass's fs: */ - emit.no_lrz_write = fp->writes_pos || fp->no_earlyz; + emit.no_lrz_write = emit.fs->writes_pos || emit.fs->no_earlyz; struct fd_ringbuffer *ring = ctx->batch->draw; enum pc_di_primtype primtype = ctx->primtypes[info->mode]; - fd6_emit_state(ring, &emit); + uint32_t tess_draw0 = 0; + if (info->mode == PIPE_PRIM_PATCHES) { + shader_info *ds_info = &emit.ds->shader->nir->info; + uint32_t factor_stride; + uint32_t patch_type; + + switch (ds_info->tess.primitive_mode) { + case GL_ISOLINES: + patch_type = TESS_ISOLINES; + factor_stride = 12; + break; + case GL_TRIANGLES: + patch_type = TESS_TRIANGLES; + factor_stride = 20; + break; + case GL_QUADS: + patch_type = TESS_QUADS; + factor_stride = 28; + break; + default: + unreachable("bad tessmode"); + } + + primtype = DI_PT_PATCHES0 + info->vertices_per_patch; + tess_draw0 |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(patch_type) | + CP_DRAW_INDX_OFFSET_0_TESS_ENABLE; + + ctx->batch->tessellation = true; + ctx->batch->tessparam_size = MAX2(ctx->batch->tessparam_size, + emit.hs->shader->output_size * 4 * info->count); + ctx->batch->tessfactor_size = MAX2(ctx->batch->tessfactor_size, + factor_stride * info->count); + + if (!ctx->batch->tess_addrs_constobj) { + /* Reserve space for the bo address - we'll write them later in + * setup_tess_buffers(). We need 2 bo address, but indirect + * constant upload needs at least 4 vec4s. + */ + unsigned size = 4 * 16; + + ctx->batch->tess_addrs_constobj = fd_submit_new_ringbuffer( + ctx->batch->submit, size, FD_RINGBUFFER_STREAMING); - OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 2); - OUT_RING(ring, info->index_size ? info->index_bias : info->start); /* VFD_INDEX_OFFSET */ - OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ - - OUT_PKT4(ring, REG_A6XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ - info->restart_index : 0xffffffff); + ctx->batch->tess_addrs_constobj->cur += size; + } + } + + uint32_t index_start = info->index_size ? info->index_bias : info->start; + if (ctx->last.dirty || (ctx->last.index_start != index_start)) { + OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 1); + OUT_RING(ring, index_start); /* VFD_INDEX_OFFSET */ + ctx->last.index_start = index_start; + } + + if (ctx->last.dirty || (ctx->last.instance_start != info->start_instance)) { + OUT_PKT4(ring, REG_A6XX_VFD_INSTANCE_START_OFFSET, 1); + OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ + ctx->last.instance_start = info->start_instance; + } + + uint32_t restart_index = info->primitive_restart ? info->restart_index : 0xffffffff; + if (ctx->last.dirty || (ctx->last.restart_index != restart_index)) { + OUT_PKT4(ring, REG_A6XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, restart_index); /* PC_RESTART_INDEX */ + ctx->last.restart_index = restart_index; + } + + fixup_draw_state(ctx, &emit); + + fd6_emit_state(ring, &emit); /* for debug after a lock up, write a unique counter value * to scratch7 for each draw, to make it easier to match up @@ -203,13 +305,11 @@ */ emit_marker6(ring, 7); - /* leave vis mode blank for now, it will be patched up when - * we know if we are binning or not - */ uint32_t draw0 = - CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY) | - 0x2000; + CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | + tess_draw0 | + COND(emit.key.gs, CP_DRAW_INDX_OFFSET_0_GS_ENABLE); if (info->index_size) { draw0 |= @@ -248,6 +348,7 @@ fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) { struct fd_ringbuffer *ring; + struct fd6_context *fd6_ctx = fd6_context(batch->ctx); // TODO mid-frame clears (ie. app doing crazy stuff)?? Maybe worth // splitting both clear and lrz clear out into their own rb's. And @@ -269,7 +370,7 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x10000000); + OUT_RING(ring, fd6_ctx->magic.RB_CCU_CNTL_bypass); OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1); OUT_RING(ring, 0x7ffff); @@ -346,7 +447,7 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0x1000000); + OUT_RING(ring, fd6_ctx->magic.RB_UNKNOWN_8E04_blit); OUT_PKT7(ring, CP_BLIT, 1); OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); @@ -354,7 +455,7 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0x0); + OUT_RING(ring, 0x0); /* RB_UNKNOWN_8E04 */ fd6_event_write(batch, ring, UNK_1D, true); fd6_event_write(batch, ring, FACENESS_FLUSH, true); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_helpers.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_viewport.h" #include "freedreno_resource.h" @@ -344,7 +344,7 @@ OUT_RINGP(state, A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | A6XX_TEX_CONST_2_FETCHSIZE(TFETCH6_2_BYTE), &ctx->batch->fb_read_patches); - OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size)); + OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size)); OUT_RING(state, A6XX_TEX_CONST_4_BASE_LO(ctx->screen->gmem_base)); OUT_RING(state, A6XX_TEX_CONST_5_BASE_HI(ctx->screen->gmem_base >> 32) | @@ -380,6 +380,27 @@ tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO; tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT; break; + case PIPE_SHADER_TESS_CTRL: + sb = SB6_HS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_HS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT; + break; + case PIPE_SHADER_TESS_EVAL: + sb = SB6_DS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_DS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT; + break; + case PIPE_SHADER_GEOMETRY: + sb = SB6_GS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP_LO; + tex_const_reg = REG_A6XX_SP_GS_TEX_CONST_LO; + tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT; + break; case PIPE_SHADER_FRAGMENT: sb = SB6_FS_TEX; opcode = CP_LOAD_STATE6_FRAG; @@ -553,8 +574,11 @@ enum fd6_state_id state_id; unsigned enable_mask; } s[PIPE_SHADER_TYPES] = { - [PIPE_SHADER_VERTEX] = { FD6_GROUP_VS_TEX, 0x7 }, - [PIPE_SHADER_FRAGMENT] = { FD6_GROUP_FS_TEX, 0x6 }, + [PIPE_SHADER_VERTEX] = { FD6_GROUP_VS_TEX, ENABLE_ALL }, + [PIPE_SHADER_TESS_CTRL] = { FD6_GROUP_HS_TEX, ENABLE_ALL }, + [PIPE_SHADER_TESS_EVAL] = { FD6_GROUP_DS_TEX, ENABLE_ALL }, + [PIPE_SHADER_GEOMETRY] = { FD6_GROUP_GS_TEX, ENABLE_ALL }, + [PIPE_SHADER_FRAGMENT] = { FD6_GROUP_FS_TEX, ENABLE_DRAW }, }; debug_assert(s[type].state_id); @@ -672,6 +696,7 @@ static struct fd_ringbuffer * build_lrz(struct fd6_emit *emit, bool binning_pass) { + struct fd6_blend_stateobj *blend = fd6_blend_stateobj(emit->ctx->blend); struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(emit->ctx->zsa); struct pipe_framebuffer_state *pfb = &emit->ctx->batch->framebuffer; struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); @@ -681,10 +706,14 @@ struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit, 16, FD_RINGBUFFER_STREAMING); - if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) { + if (zsa->invalidate_lrz) { + rsc->lrz_valid = false; gras_lrz_cntl = 0; rb_lrz_cntl = 0; - } else if (binning_pass && zsa->lrz_write) { + } else if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) { + gras_lrz_cntl = 0; + rb_lrz_cntl = 0; + } else if (binning_pass && blend->lrz_write && zsa->lrz_write) { gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_LRZ_WRITE; } @@ -712,21 +741,27 @@ if (!target) continue; - unsigned offset = (so->offsets[i] * info->stride[i] * 4) + - target->buffer_offset; - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE_LO(i), 3); /* VPC_SO[i].BUFFER_BASE_LO: */ - OUT_RELOCW(ring, fd_resource(target->buffer)->bo, 0, 0, 0); - OUT_RING(ring, target->buffer_size + offset); + OUT_RELOCW(ring, fd_resource(target->buffer)->bo, target->buffer_offset, 0, 0); + OUT_RING(ring, target->buffer_size - target->buffer_offset); + + if (so->reset & (1 << i)) { + unsigned offset = (so->offsets[i] * info->stride[i] * 4); + OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1); + OUT_RING(ring, offset); + } else { + OUT_PKT7(ring, CP_MEM_TO_REG, 3); + OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | + CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | + CP_MEM_TO_REG_0_CNT(0)); + OUT_RELOC(ring, control_ptr(fd6_context(ctx), flush_base[i].offset)); + } + + OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE_LO(i), 2); + OUT_RELOCW(ring, control_ptr(fd6_context(ctx), flush_base[i])); - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 3); - OUT_RING(ring, offset); - /* VPC_SO[i].FLUSH_BASE_LO/HI: */ - // TODO just give hw a dummy addr for now.. we should - // be using this an then CP_MEM_TO_REG to set the - // VPC_SO[i].BUFFER_OFFSET for the next draw.. - OUT_RELOCW(ring, control_ptr(fd6_context(ctx), flush_base)); + so->reset &= ~(1 << i); emit->streamout_mask |= (1 << i); } @@ -751,20 +786,127 @@ OUT_RING(ring, REG_A6XX_VPC_SO_PROG); OUT_RING(ring, tf->prog[i]); } - - OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, 0x0); } else { OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 4); OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); OUT_RING(ring, 0); OUT_RING(ring, REG_A6XX_VPC_SO_BUF_CNTL); OUT_RING(ring, 0); + } +} - OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); +static void +emit_tess_bos(struct fd_ringbuffer *ring, struct fd6_emit *emit, struct ir3_shader_variant *s) +{ + struct fd_context *ctx = emit->ctx; + const unsigned regid = s->shader->const_state.offsets.primitive_param * 4 + 4; + uint32_t dwords = 16; + + OUT_PKT7(ring, fd6_stage2opcode(s->type), 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS)| + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) | + CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4)); + OUT_RB(ring, ctx->batch->tess_addrs_constobj); +} + +static void +emit_stage_tess_consts(struct fd_ringbuffer *ring, struct ir3_shader_variant *v, + uint32_t *params, int num_params) +{ + const unsigned regid = v->shader->const_state.offsets.primitive_param; + int size = MIN2(1 + regid, v->constlen) - regid; + if (size > 0) + fd6_emit_const(ring, v->type, regid * 4, 0, num_params, params, NULL); +} + +static void +fd6_emit_tess_const(struct fd6_emit *emit) +{ + struct fd_context *ctx = emit->ctx; + + struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( + ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + + /* VS sizes are in bytes since that's what STLW/LDLW use, while the HS + * size is dwords, since that's what LDG/STG use. + */ + unsigned num_vertices = + emit->hs ? + emit->info->vertices_per_patch : + emit->gs->shader->nir->info.gs.vertices_in; + + uint32_t vs_params[4] = { + emit->vs->shader->output_size * num_vertices * 4, /* vs primitive stride */ + emit->vs->shader->output_size * 4, /* vs vertex stride */ + 0, + 0 + }; + + emit_stage_tess_consts(constobj, emit->vs, vs_params, ARRAY_SIZE(vs_params)); + + if (emit->hs) { + uint32_t hs_params[4] = { + emit->vs->shader->output_size * num_vertices * 4, /* vs primitive stride */ + emit->vs->shader->output_size * 4, /* vs vertex stride */ + emit->hs->shader->output_size, + emit->info->vertices_per_patch + }; + + emit_stage_tess_consts(constobj, emit->hs, hs_params, ARRAY_SIZE(hs_params)); + emit_tess_bos(constobj, emit, emit->hs); + + if (emit->gs) + num_vertices = emit->gs->shader->nir->info.gs.vertices_in; + + uint32_t ds_params[4] = { + emit->ds->shader->output_size * num_vertices * 4, /* ds primitive stride */ + emit->ds->shader->output_size * 4, /* ds vertex stride */ + emit->hs->shader->output_size, /* hs vertex stride (dwords) */ + emit->hs->shader->nir->info.tess.tcs_vertices_out + }; + + emit_stage_tess_consts(constobj, emit->ds, ds_params, ARRAY_SIZE(ds_params)); + emit_tess_bos(constobj, emit, emit->ds); + } + + if (emit->gs) { + struct ir3_shader_variant *prev; + if (emit->ds) + prev = emit->ds; + else + prev = emit->vs; + + uint32_t gs_params[4] = { + prev->shader->output_size * num_vertices * 4, /* ds primitive stride */ + prev->shader->output_size * 4, /* ds vertex stride */ + 0, + 0, + }; + + num_vertices = emit->gs->shader->nir->info.gs.vertices_in; + emit_stage_tess_consts(constobj, emit->gs, gs_params, ARRAY_SIZE(gs_params)); } + fd6_emit_take_group(emit, constobj, FD6_GROUP_PRIMITIVE_PARAMS, ENABLE_ALL); +} + +static void +fd6_emit_consts(struct fd6_emit *emit, const struct ir3_shader_variant *v, + enum pipe_shader_type type, enum fd6_state_id id, unsigned enable_mask) +{ + struct fd_context *ctx = emit->ctx; + + if (v && ctx->dirty_shader[type] & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) { + struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( + ctx->batch->submit, v->shader->ubo_state.cmdstream_size, + FD_RINGBUFFER_STREAMING); + + ir3_emit_user_consts(ctx->screen, v, constobj, &ctx->constbuf[type]); + ir3_emit_ubos(ctx->screen, v, constobj, &ctx->constbuf[type]); + fd6_emit_take_group(emit, constobj, id, enable_mask); + } } void @@ -773,8 +915,11 @@ struct fd_context *ctx = emit->ctx; struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; const struct fd6_program_state *prog = fd6_emit_get_prog(emit); - const struct ir3_shader_variant *vp = emit->vs; - const struct ir3_shader_variant *fp = emit->fs; + const struct ir3_shader_variant *vs = emit->vs; + const struct ir3_shader_variant *hs = emit->hs; + const struct ir3_shader_variant *ds = emit->ds; + const struct ir3_shader_variant *gs = emit->gs; + const struct ir3_shader_variant *fs = emit->fs; const enum fd_dirty_3d_state dirty = emit->dirty; bool needs_border = false; @@ -784,33 +929,34 @@ * we might at some point decide to do sysmem in some cases when * blend is enabled: */ - if (fp->fb_read) + if (fs->fb_read) ctx->batch->gmem_reason |= FD_GMEM_FB_READ; if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) { struct fd_ringbuffer *state; state = build_vbo_state(emit, emit->vs); - fd6_emit_take_group(emit, state, FD6_GROUP_VBO, 0x7); + fd6_emit_take_group(emit, state, FD6_GROUP_VBO, ENABLE_ALL); } if (dirty & FD_DIRTY_ZSA) { struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa); if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0]))) - fd6_emit_add_group(emit, zsa->stateobj_no_alpha, FD6_GROUP_ZSA, 0x7); + fd6_emit_add_group(emit, zsa->stateobj_no_alpha, FD6_GROUP_ZSA, ENABLE_ALL); else - fd6_emit_add_group(emit, zsa->stateobj, FD6_GROUP_ZSA, 0x7); + fd6_emit_add_group(emit, zsa->stateobj, FD6_GROUP_ZSA, ENABLE_ALL); } - if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && pfb->zsbuf) { + if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG)) && pfb->zsbuf) { struct fd_ringbuffer *state; state = build_lrz(emit, false); - fd6_emit_take_group(emit, state, FD6_GROUP_LRZ, 0x6); + fd6_emit_take_group(emit, state, FD6_GROUP_LRZ, ENABLE_DRAW); state = build_lrz(emit, true); - fd6_emit_take_group(emit, state, FD6_GROUP_LRZ_BINNING, 0x1); + fd6_emit_take_group(emit, state, + FD6_GROUP_LRZ_BINNING, CP_SET_DRAW_STATE__0_BINNING); } if (dirty & FD_DIRTY_STENCIL_REF) { @@ -863,136 +1009,73 @@ } if (dirty & FD_DIRTY_PROG) { - fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG, 0x7); - fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, 0x6); + fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG, ENABLE_ALL); + fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW); fd6_emit_add_group(emit, prog->binning_stateobj, - FD6_GROUP_PROG_BINNING, 0x1); + FD6_GROUP_PROG_BINNING, CP_SET_DRAW_STATE__0_BINNING); - /* emit remaining non-stateobj program state, ie. what depends - * on other emit state, so cannot be pre-baked. This could - * be moved to a separate stateobj which is dynamically - * created. + /* emit remaining streaming program state, ie. what depends on + * other emit state, so cannot be pre-baked. */ - fd6_program_emit(ring, emit); - } + struct fd_ringbuffer *streaming = fd6_program_interp_state(emit); - if (dirty & FD_DIRTY_RASTERIZER) { - struct fd6_rasterizer_stateobj *rasterizer = - fd6_rasterizer_stateobj(ctx->rasterizer); - fd6_emit_add_group(emit, rasterizer->stateobj, - FD6_GROUP_RASTERIZER, 0x7); + fd6_emit_take_group(emit, streaming, FD6_GROUP_PROG_INTERP, ENABLE_DRAW); } - /* Since the primitive restart state is not part of a tracked object, we - * re-emit this register every time. - */ - if (emit->info && ctx->rasterizer) { - struct fd6_rasterizer_stateobj *rasterizer = - fd6_rasterizer_stateobj(ctx->rasterizer); - OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9806, 1); - OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9990, 1); - OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_VFD_UNKNOWN_A008, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_0, 1); - OUT_RING(ring, rasterizer->pc_primitive_cntl | - COND(emit->info->primitive_restart && emit->info->index_size, - A6XX_PC_PRIMITIVE_CNTL_0_PRIMITIVE_RESTART)); + if (dirty & FD_DIRTY_RASTERIZER) { + struct fd_ringbuffer *stateobj = + fd6_rasterizer_state(ctx, emit->primitive_restart); + fd6_emit_add_group(emit, stateobj, + FD6_GROUP_RASTERIZER, ENABLE_ALL); } if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING); + unsigned nr = pfb->nr_cbufs; if (ctx->rasterizer->rasterizer_discard) nr = 0; OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); - OUT_RING(ring, COND(fp->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | - COND(fp->writes_smask && pfb->samples > 1, + OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | + COND(fs->writes_smask && pfb->samples > 1, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK)); OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr)); OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1); OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr)); - } - -#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST) - - if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) { - struct fd_ringbuffer *vsconstobj = fd_submit_new_ringbuffer( - ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - ir3_emit_user_consts(ctx->screen, vp, vsconstobj, - &ctx->constbuf[PIPE_SHADER_VERTEX]); - ir3_emit_ubos(ctx->screen, vp, vsconstobj, - &ctx->constbuf[PIPE_SHADER_VERTEX]); - - fd6_emit_take_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7); + fd6_emit_take_group(emit, ring, FD6_GROUP_PROG_FB_RAST, ENABLE_DRAW); } - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) { - struct fd_ringbuffer *fsconstobj = fd_submit_new_ringbuffer( - ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - - ir3_emit_user_consts(ctx->screen, fp, fsconstobj, - &ctx->constbuf[PIPE_SHADER_FRAGMENT]); - ir3_emit_ubos(ctx->screen, fp, fsconstobj, - &ctx->constbuf[PIPE_SHADER_FRAGMENT]); + fd6_emit_consts(emit, vs, PIPE_SHADER_VERTEX, FD6_GROUP_VS_CONST, ENABLE_ALL); + fd6_emit_consts(emit, hs, PIPE_SHADER_TESS_CTRL, FD6_GROUP_HS_CONST, ENABLE_ALL); + fd6_emit_consts(emit, ds, PIPE_SHADER_TESS_EVAL, FD6_GROUP_DS_CONST, ENABLE_ALL); + fd6_emit_consts(emit, gs, PIPE_SHADER_GEOMETRY, FD6_GROUP_GS_CONST, ENABLE_ALL); + fd6_emit_consts(emit, fs, PIPE_SHADER_FRAGMENT, FD6_GROUP_FS_CONST, ENABLE_DRAW); - fd6_emit_take_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x6); - } + if (emit->key.key.has_gs || emit->key.key.tessellation) + fd6_emit_tess_const(emit); /* if driver-params are needed, emit each time: */ - if (ir3_needs_vs_driver_params(vp)) { + if (ir3_needs_vs_driver_params(vs)) { struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer( ctx->batch->submit, IR3_DP_VS_COUNT * 4, FD_RINGBUFFER_STREAMING); - ir3_emit_vs_driver_params(vp, dpconstobj, ctx, emit->info); - fd6_emit_take_group(emit, dpconstobj, FD6_GROUP_VS_DRIVER_PARAMS, 0x7); + ir3_emit_vs_driver_params(vs, dpconstobj, ctx, emit->info); + fd6_emit_take_group(emit, dpconstobj, FD6_GROUP_VS_DRIVER_PARAMS, ENABLE_ALL); } else { - fd6_emit_take_group(emit, NULL, FD6_GROUP_VS_DRIVER_PARAMS, 0x7); + fd6_emit_take_group(emit, NULL, FD6_GROUP_VS_DRIVER_PARAMS, ENABLE_ALL); } - struct ir3_stream_output_info *info = &vp->shader->stream_output; + struct ir3_stream_output_info *info = &fd6_last_shader(prog)->shader->stream_output; if (info->num_outputs) fd6_emit_streamout(ring, emit, info); if (dirty & FD_DIRTY_BLEND) { struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend); - uint32_t i; - - for (i = 0; i < pfb->nr_cbufs; i++) { - enum pipe_format format = pipe_surface_format(pfb->cbufs[i]); - bool is_int = util_format_is_pure_integer(format); - bool has_alpha = util_format_has_alpha(format); - uint32_t control = blend->rb_mrt[i].control; - uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha; - - if (is_int) { - control &= A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; - control |= A6XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); - } - - if (has_alpha) { - blend_control |= blend->rb_mrt[i].blend_control_rgb; - } else { - blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb; - control &= ~A6XX_RB_MRT_CONTROL_BLEND2; - } - - OUT_PKT4(ring, REG_A6XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, control); - - OUT_PKT4(ring, REG_A6XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend_control); - } - - OUT_PKT4(ring, REG_A6XX_RB_DITHER_CNTL, 1); - OUT_RING(ring, blend->rb_dither_cntl); - - OUT_PKT4(ring, REG_A6XX_SP_BLEND_CNTL, 1); - OUT_RING(ring, blend->sp_blend_cntl); + fd6_emit_add_group(emit, blend->stateobj, FD6_GROUP_BLEND, ENABLE_DRAW); } if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK)) { @@ -1013,27 +1096,41 @@ OUT_RING(ring, A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); } - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vp); - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fp); + needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs); + if (hs) { + needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_TESS_CTRL, hs); + needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_TESS_EVAL, ds); + } + if (gs) { + needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs); + } + needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs); if (needs_border) emit_border_color(ctx, ring); + if (hs) { + debug_assert(ir3_shader_nibo(hs) == 0); + debug_assert(ir3_shader_nibo(ds) == 0); + } + if (gs) { + debug_assert(ir3_shader_nibo(gs) == 0); + } + #define DIRTY_IBO (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE | \ FD_DIRTY_SHADER_PROG) if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_IBO) { struct fd_ringbuffer *state = - fd6_build_ibo_state(ctx, fp, PIPE_SHADER_FRAGMENT); + fd6_build_ibo_state(ctx, fs, PIPE_SHADER_FRAGMENT); struct fd_ringbuffer *obj = fd_submit_new_ringbuffer( ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING); - const struct ir3_ibo_mapping *mapping = &fp->image_mapping; OUT_PKT7(obj, CP_LOAD_STATE6, 3); OUT_RING(obj, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) | - CP_LOAD_STATE6_0_NUM_UNIT(mapping->num_ibo)); + CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(fs))); OUT_RB(obj, state); OUT_PKT4(obj, REG_A6XX_SP_IBO_LO, 2); @@ -1043,14 +1140,14 @@ * de-duplicate this from program->config_stateobj */ OUT_PKT4(obj, REG_A6XX_SP_IBO_COUNT, 1); - OUT_RING(obj, mapping->num_ibo); + OUT_RING(obj, ir3_shader_nibo(fs)); - ir3_emit_ssbo_sizes(ctx->screen, fp, obj, + ir3_emit_ssbo_sizes(ctx->screen, fs, obj, &ctx->shaderbuf[PIPE_SHADER_FRAGMENT]); - ir3_emit_image_dims(ctx->screen, fp, obj, + ir3_emit_image_dims(ctx->screen, fs, obj, &ctx->shaderimg[PIPE_SHADER_FRAGMENT]); - fd6_emit_take_group(emit, obj, FD6_GROUP_IBO, 0x6); + fd6_emit_take_group(emit, obj, FD6_GROUP_IBO, ENABLE_DRAW); fd_ringbuffer_del(state); } @@ -1061,16 +1158,18 @@ unsigned n = g->stateobj ? fd_ringbuffer_size(g->stateobj) / 4 : 0; + debug_assert((g->enable_mask & ~ENABLE_ALL) == 0); + if (n == 0) { OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | CP_SET_DRAW_STATE__0_DISABLE | - CP_SET_DRAW_STATE__0_ENABLE_MASK(g->enable_mask) | + g->enable_mask | CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); } else { OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | - CP_SET_DRAW_STATE__0_ENABLE_MASK(g->enable_mask) | + g->enable_mask | CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); OUT_RB(ring, g->stateobj); } @@ -1118,21 +1217,20 @@ if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) { struct fd_ringbuffer *state = fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE); - const struct ir3_ibo_mapping *mapping = &cp->image_mapping; OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3); OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) | CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) | - CP_LOAD_STATE6_0_NUM_UNIT(mapping->num_ibo)); + CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp))); OUT_RB(ring, state); OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_LO, 2); OUT_RB(ring, state); OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1); - OUT_RING(ring, mapping->num_ibo); + OUT_RING(ring, ir3_shader_nibo(cp)); fd_ringbuffer_del(state); } @@ -1152,20 +1250,9 @@ OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1); OUT_RING(ring, 0xfffff); -/* -t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) -0000000500024048: 70d08003 00000000 001c5000 00000005 -t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) -0000000500024058: 70d08003 00000010 001c7000 00000005 - -t7 opcode: CP_WAIT_FOR_IDLE (26) (1 dwords) -0000000500024068: 70268000 -*/ - OUT_WFI5(ring); - WRITE(REG_A6XX_RB_CCU_CNTL, 0x7c400004); - WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x00100000); + WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0); WRITE(REG_A6XX_SP_UNKNOWN_AE04, 0x8); WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0); WRITE(REG_A6XX_SP_UNKNOWN_AE0F, 0x3f); @@ -1185,7 +1272,7 @@ WRITE(REG_A6XX_UCHE_CLIENT_PF, 4); WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1); WRITE(REG_A6XX_SP_UNKNOWN_AB00, 0x5); - WRITE(REG_A6XX_VFD_UNKNOWN_A009, 0x00000001); + WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010); WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f); @@ -1196,10 +1283,6 @@ WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0); WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2); - WRITE(REG_A6XX_RB_RENDER_CONTROL0, 0x401); - WRITE(REG_A6XX_RB_RENDER_CONTROL1, 0); - WRITE(REG_A6XX_RB_FS_OUTPUT_CNTL0, 0); - WRITE(REG_A6XX_RB_SAMPLE_CNTL, 0); WRITE(REG_A6XX_RB_UNKNOWN_8818, 0); WRITE(REG_A6XX_RB_UNKNOWN_8819, 0); WRITE(REG_A6XX_RB_UNKNOWN_881A, 0); @@ -1209,21 +1292,17 @@ WRITE(REG_A6XX_RB_UNKNOWN_881E, 0); WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9101, 0xffff00); - WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9236, A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0)); WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0); WRITE(REG_A6XX_VPC_SO_OVERRIDE, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); - WRITE(REG_A6XX_PC_UNKNOWN_9801, 0); WRITE(REG_A6XX_PC_UNKNOWN_9806, 0); + WRITE(REG_A6XX_PC_UNKNOWN_9990, 0); WRITE(REG_A6XX_PC_UNKNOWN_9980, 0); - WRITE(REG_A6XX_PC_UNKNOWN_9B06, 0); - WRITE(REG_A6XX_PC_UNKNOWN_9B06, 0); + WRITE(REG_A6XX_PC_UNKNOWN_9B07, 0); WRITE(REG_A6XX_SP_UNKNOWN_A81B, 0); @@ -1275,12 +1354,6 @@ OUT_PKT4(ring, REG_A6XX_VPC_SO_BUF_CNTL, 1); OUT_RING(ring, 0x00000000); /* VPC_SO_BUF_CNTL */ - OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); - OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); OUT_RING(ring, 0x00000000); @@ -1324,11 +1397,12 @@ seqno = fd6_event_write(batch, ring, CACHE_FLUSH_AND_INV_EVENT, true); OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, 0x00000013); + OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, seqno); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0x00000010); + OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); + OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); fd6_event_write(batch, ring, UNK_1D, true); fd6_event_write(batch, ring, UNK_1C, true); @@ -1337,10 +1411,10 @@ fd6_event_write(batch, ring, 0x31, false); - OUT_PKT7(ring, CP_UNK_A6XX_14, 4); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); + OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, seqno); + OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); } void diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_emit.h 2020-06-12 01:21:17.000000000 +0000 @@ -46,26 +46,39 @@ FD6_GROUP_PROG_CONFIG, FD6_GROUP_PROG, FD6_GROUP_PROG_BINNING, + FD6_GROUP_PROG_INTERP, + FD6_GROUP_PROG_FB_RAST, FD6_GROUP_LRZ, FD6_GROUP_LRZ_BINNING, FD6_GROUP_VBO, FD6_GROUP_VS_CONST, + FD6_GROUP_HS_CONST, + FD6_GROUP_DS_CONST, + FD6_GROUP_GS_CONST, FD6_GROUP_FS_CONST, FD6_GROUP_VS_DRIVER_PARAMS, + FD6_GROUP_PRIMITIVE_PARAMS, FD6_GROUP_VS_TEX, + FD6_GROUP_HS_TEX, + FD6_GROUP_DS_TEX, + FD6_GROUP_GS_TEX, FD6_GROUP_FS_TEX, FD6_GROUP_IBO, FD6_GROUP_RASTERIZER, FD6_GROUP_ZSA, + FD6_GROUP_BLEND, }; +#define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) +#define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) + struct fd6_state_group { struct fd_ringbuffer *stateobj; enum fd6_state_id group_id; /* enable_mask controls which states the stateobj is evaluated in, * b0 is binning pass b1 and/or b2 is draw pass */ - uint8_t enable_mask; + uint32_t enable_mask; }; /* grouped together emit-state for prog/vertex/state emit: */ @@ -80,6 +93,7 @@ bool sprite_coord_mode; bool rasterflat; bool no_decode_srgb; + bool primitive_restart; /* in binning pass, we don't have real frag shader, so we * don't know if real draw disqualifies lrz write. So just @@ -92,6 +106,9 @@ struct ir3_shader_variant *bs; struct ir3_shader_variant *vs; + struct ir3_shader_variant *hs; + struct ir3_shader_variant *ds; + struct ir3_shader_variant *gs; struct ir3_shader_variant *fs; unsigned streamout_mask; @@ -165,18 +182,19 @@ seqno = fd6_event_write(batch, ring, CACHE_FLUSH_AND_INV_EVENT, true); OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, 0x00000013); + OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, seqno); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0x00000010); + OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); + OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - OUT_PKT7(ring, CP_UNK_A6XX_14, 4); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); + OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, seqno); + OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); } static inline void @@ -218,6 +236,12 @@ switch (type) { case MESA_SHADER_VERTEX: return SB6_VS_SHADER; + case MESA_SHADER_TESS_CTRL: + return SB6_HS_SHADER; + case MESA_SHADER_TESS_EVAL: + return SB6_DS_SHADER; + case MESA_SHADER_GEOMETRY: + return SB6_GS_SHADER; case MESA_SHADER_FRAGMENT: return SB6_FS_SHADER; case MESA_SHADER_COMPUTE: @@ -229,6 +253,22 @@ } } +static inline enum a6xx_tess_spacing +fd6_gl2spacing(enum gl_tess_spacing spacing) +{ + switch (spacing) { + case TESS_SPACING_EQUAL: + return TESS_EQUAL; + case TESS_SPACING_FRACTIONAL_ODD: + return TESS_FRACTIONAL_ODD; + case TESS_SPACING_FRACTIONAL_EVEN: + return TESS_FRACTIONAL_EVEN; + case TESS_SPACING_UNSPECIFIED: + default: + unreachable("spacing must be specified"); + } +} + bool fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring, enum pipe_shader_type type, struct fd_texture_stateobj *tex, unsigned bcolor_offset, diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_format.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_format.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd6_format.h" #include "freedreno_resource.h" @@ -86,7 +86,7 @@ VT(R8_UINT, 8_UINT, R8_UINT, WZYX), VT(R8_SINT, 8_SINT, R8_SINT, WZYX), V_(R8_USCALED, 8_UINT, NONE, WZYX), - V_(R8_SSCALED, 8_UINT, NONE, WZYX), + V_(R8_SSCALED, 8_SINT, NONE, WZYX), _T(A8_UNORM, 8_UNORM, A8_UNORM, WZYX), _T(L8_UNORM, 8_UNORM, R8_UNORM, WZYX), @@ -107,7 +107,7 @@ VT(R16_UINT, 16_UINT, R16_UINT, WZYX), VT(R16_SINT, 16_SINT, R16_SINT, WZYX), V_(R16_USCALED, 16_UINT, NONE, WZYX), - V_(R16_SSCALED, 16_UINT, NONE, WZYX), + V_(R16_SSCALED, 16_SINT, NONE, WZYX), VT(R16_FLOAT, 16_FLOAT, R16_FLOAT, WZYX), _T(Z16_UNORM, 16_UNORM, R16_UNORM, WZYX), @@ -151,7 +151,7 @@ VT(R32_UINT, 32_UINT, R32_UINT, WZYX), VT(R32_SINT, 32_SINT, R32_SINT, WZYX), V_(R32_USCALED, 32_UINT, NONE, WZYX), - V_(R32_SSCALED, 32_UINT, NONE, WZYX), + V_(R32_SSCALED, 32_SINT, NONE, WZYX), VT(R32_FLOAT, 32_FLOAT, R32_FLOAT,WZYX), V_(R32_FIXED, 32_FIXED, NONE, WZYX), @@ -176,9 +176,9 @@ _T(L16A16_SINT, 16_16_SINT, NONE, WZYX), VT(R8G8B8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), - _T(R8G8B8X8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), + _T(R8G8B8X8_UNORM, 8_8_8_8_UNORM, R8G8B8X8_UNORM, WZYX), _T(R8G8B8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), - _T(R8G8B8X8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), + _T(R8G8B8X8_SRGB, 8_8_8_8_UNORM, R8G8B8X8_UNORM, WZYX), VT(R8G8B8A8_SNORM, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX), VT(R8G8B8A8_UINT, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), VT(R8G8B8A8_SINT, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), @@ -186,19 +186,19 @@ V_(R8G8B8A8_SSCALED, 8_8_8_8_SINT, NONE, WZYX), VT(B8G8R8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), - _T(B8G8R8X8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), + _T(B8G8R8X8_UNORM, 8_8_8_8_UNORM, R8G8B8X8_UNORM, WXYZ), VT(B8G8R8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), - _T(B8G8R8X8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), + _T(B8G8R8X8_SRGB, 8_8_8_8_UNORM, R8G8B8X8_UNORM, WXYZ), VT(A8B8G8R8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, XYZW), - _T(X8B8G8R8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, XYZW), + _T(X8B8G8R8_UNORM, 8_8_8_8_UNORM, R8G8B8X8_UNORM, XYZW), _T(A8B8G8R8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, XYZW), - _T(X8B8G8R8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, XYZW), + _T(X8B8G8R8_SRGB, 8_8_8_8_UNORM, R8G8B8X8_UNORM, XYZW), VT(A8R8G8B8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW), - _T(X8R8G8B8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW), + _T(X8R8G8B8_UNORM, 8_8_8_8_UNORM, R8G8B8X8_UNORM, ZYXW), _T(A8R8G8B8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW), - _T(X8R8G8B8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, ZYXW), + _T(X8R8G8B8_SRGB, 8_8_8_8_UNORM, R8G8B8X8_UNORM, ZYXW), VT(R10G10B10A2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WZYX), VT(B10G10R10A2_UNORM, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ), @@ -215,15 +215,15 @@ VT(R11G11B10_FLOAT, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX), _T(R9G9B9E5_FLOAT, 9_9_9_E5_FLOAT, NONE, WZYX), - _T(Z24X8_UNORM, X8Z24_UNORM, X8Z24_UNORM, WZYX), - _T(X24S8_UINT, 8_8_8_8_UINT, X8Z24_UNORM, WZYX), - _T(Z24_UNORM_S8_UINT, X8Z24_UNORM, X8Z24_UNORM, WZYX), - _T(Z32_FLOAT, 32_FLOAT, R32_FLOAT, WZYX), - _T(Z32_FLOAT_S8X24_UINT, 32_FLOAT, R32_FLOAT, WZYX), - _T(X32_S8X24_UINT, 8_UINT, R8_UINT, WZYX), + _T(Z24X8_UNORM, X8Z24_UNORM, Z24_UNORM_S8_UINT, WZYX), + _T(X24S8_UINT, 8_8_8_8_UINT, Z24_UNORM_S8_UINT, WZYX), + _T(Z24_UNORM_S8_UINT, X8Z24_UNORM, Z24_UNORM_S8_UINT, WZYX), + _T(Z32_FLOAT, 32_FLOAT, R32_FLOAT, WZYX), + _T(Z32_FLOAT_S8X24_UINT, 32_FLOAT, R32_FLOAT, WZYX), + _T(X32_S8X24_UINT, 8_UINT, R8_UINT, WZYX), /* special format for blits: */ - _T(Z24_UNORM_S8_UINT_AS_R8G8B8A8, Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), + _T(Z24_UNORM_S8_UINT_AS_R8G8B8A8, Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT_AS_R8G8B8A8, WZYX), /* 48-bit */ V_(R16G16B16_UNORM, 16_16_16_UNORM, NONE, WZYX), @@ -478,32 +478,20 @@ unsigned swizzle_b, unsigned swizzle_a) { struct fd_resource *rsc = fd_resource(prsc); - uint32_t swap, texconst0 = 0; unsigned char swiz[4]; - if (util_format_is_srgb(format)) { - texconst0 |= A6XX_TEX_CONST_0_SRGB; - } - - if (rsc->tile_mode && !fd_resource_level_linear(prsc, level)) { - texconst0 |= A6XX_TEX_CONST_0_TILE_MODE(rsc->tile_mode); - swap = WZYX; - } else { - swap = fd6_pipe2swap(format); - } - fd6_tex_swiz(format, swiz, swizzle_r, swizzle_g, swizzle_b, swizzle_a); - texconst0 |= + return A6XX_TEX_CONST_0_FMT(fd6_pipe2tex(format)) | A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | - A6XX_TEX_CONST_0_SWAP(swap) | + A6XX_TEX_CONST_0_SWAP(fd6_resource_swap(rsc, format)) | + A6XX_TEX_CONST_0_TILE_MODE(fd_resource_tile_mode(prsc, level)) | + COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | A6XX_TEX_CONST_0_SWIZ_X(fd6_pipe2swiz(swiz[0])) | A6XX_TEX_CONST_0_SWIZ_Y(fd6_pipe2swiz(swiz[1])) | A6XX_TEX_CONST_0_SWIZ_Z(fd6_pipe2swiz(swiz[2])) | A6XX_TEX_CONST_0_SWIZ_W(fd6_pipe2swiz(swiz[3])); - - return texconst0; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_format.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_format.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_format.h 2020-06-12 01:21:17.000000000 +0000 @@ -60,7 +60,7 @@ case RB6_R8G8_UNORM: case RB6_R8G8_SNORM: case RB6_R8G8B8A8_UNORM: - case RB6_R8G8B8_UNORM: + case RB6_R8G8B8X8_UNORM: case RB6_R8G8B8A8_SNORM: return R2D_UNORM8; @@ -102,21 +102,27 @@ case RB6_R16_FLOAT: case RB6_R16G16_FLOAT: case RB6_R16G16B16A16_FLOAT: + case RB6_R11G11B10_FLOAT: return R2D_FLOAT16; + case RB6_R10G10B10A2_UNORM: case RB6_R4G4B4A4_UNORM: case RB6_R5G5B5A1_UNORM: case RB6_R5G6B5_UNORM: - case RB6_R10G10B10A2_UNORM: case RB6_R10G10B10A2_UINT: - case RB6_R11G11B10_FLOAT: - case RB6_X8Z24_UNORM: - // ??? - return 0; + case RB6_Z24_UNORM_S8_UINT: + case RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: + return R2D_RAW; default: unreachable("bad format"); return 0; } } +static inline uint32_t +fd6_resource_swap(struct fd_resource *rsc, enum pipe_format format) +{ + return rsc->layout.tile_mode ? WZYX : fd6_pipe2swap(format); +} + #endif /* FD6_UTIL_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,41 +31,63 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_draw.h" #include "freedreno_state.h" #include "freedreno_resource.h" +#include "fd6_blitter.h" #include "fd6_gmem.h" #include "fd6_context.h" #include "fd6_draw.h" #include "fd6_emit.h" #include "fd6_program.h" #include "fd6_format.h" +#include "fd6_resource.h" #include "fd6_zsa.h" +#include "fd6_pack.h" -/* some bits in common w/ a4xx: */ -#include "a4xx/fd4_draw.h" +/** + * Emits the flags registers, suitable for RB_MRT_FLAG_BUFFER, + * RB_DEPTH_FLAG_BUFFER, SP_PS_2D_SRC_FLAGS, and RB_BLIT_FLAG_DST. + */ +void +fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc, + int level, int layer) +{ + if (fd_resource_ubwc_enabled(rsc, level)) { + OUT_RELOCW(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, 0); + OUT_RING(ring, + A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(rsc->layout.ubwc_slices[level].pitch) | + A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(rsc->layout.ubwc_size)); + } else { + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ + OUT_RING(ring, 0x00000000); + } +} static void emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, - struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { unsigned char mrt_comp[A6XX_MAX_RENDER_TARGETS] = {0}; unsigned srgb_cntl = 0; unsigned i; + bool layered = false; + unsigned type = 0; + for (i = 0; i < pfb->nr_cbufs; i++) { enum a6xx_color_fmt format = 0; enum a3xx_color_swap swap = WZYX; bool sint = false, uint = false; struct fd_resource *rsc = NULL; - struct fd_resource_slice *slice = NULL; + struct fdl_slice *slice = NULL; uint32_t stride = 0; - uint32_t offset, ubwc_offset; + uint32_t offset; uint32_t tile_mode; - bool ubwc_enabled; if (!pfb->cbufs[i]) continue; @@ -89,122 +111,105 @@ offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - ubwc_offset = fd_resource_ubwc_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level); - stride = slice->pitch * rsc->cpp * pfb->samples; - swap = rsc->tile_mode ? WZYX : fd6_pipe2swap(pformat); + stride = slice->pitch * rsc->layout.cpp; + swap = fd6_resource_swap(rsc, pformat); - if (rsc->tile_mode && - fd_resource_level_linear(psurf->texture, psurf->u.tex.level)) - tile_mode = TILE6_LINEAR; - else - tile_mode = rsc->tile_mode; + tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); + + if (psurf->u.tex.first_layer < psurf->u.tex.last_layer) { + layered = true; + if (psurf->texture->target == PIPE_TEXTURE_2D_ARRAY && psurf->texture->nr_samples > 0) + type = LAYER_MULTISAMPLE_ARRAY; + else if (psurf->texture->target == PIPE_TEXTURE_2D_ARRAY) + type = LAYER_2D_ARRAY; + else if (psurf->texture->target == PIPE_TEXTURE_CUBE) + type = LAYER_CUBEMAP; + else if (psurf->texture->target == PIPE_TEXTURE_3D) + type = LAYER_3D; + } - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); debug_assert((offset + slice->size0) <= fd_bo_size(rsc->bo)); - OUT_PKT4(ring, REG_A6XX_RB_MRT_BUF_INFO(i), 6); - OUT_RING(ring, A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | - A6XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap)); - OUT_RING(ring, A6XX_RB_MRT_PITCH(stride)); - OUT_RING(ring, A6XX_RB_MRT_ARRAY_PITCH(slice->size0)); - OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* BASE_LO/HI */ - OUT_RING(ring, base); /* RB_MRT[i].BASE_GMEM */ - OUT_PKT4(ring, REG_A6XX_SP_FS_MRT_REG(i), 1); - OUT_RING(ring, A6XX_SP_FS_MRT_REG_COLOR_FORMAT(format) | - COND(sint, A6XX_SP_FS_MRT_REG_COLOR_SINT) | - COND(uint, A6XX_SP_FS_MRT_REG_COLOR_UINT)); + OUT_REG(ring, + A6XX_RB_MRT_BUF_INFO(i, + .color_format = format, + .color_tile_mode = tile_mode, + .color_swap = swap), + A6XX_RB_MRT_PITCH(i, .a6xx_rb_mrt_pitch = stride), + A6XX_RB_MRT_ARRAY_PITCH(i, .a6xx_rb_mrt_array_pitch = slice->size0), + A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset), + A6XX_RB_MRT_BASE_GMEM(i, .unknown = base)); + + OUT_REG(ring, + A6XX_SP_FS_MRT_REG(i, .color_format = format, + .color_sint = sint, .color_uint = uint)); OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3); - if (ubwc_enabled) { - OUT_RELOCW(ring, rsc->bo, ubwc_offset, 0, 0); /* BASE_LO/HI */ - OUT_RING(ring, A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(rsc->ubwc_pitch) | - A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(rsc->ubwc_size)); - } else { - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ - OUT_RING(ring, 0x00000000); - } + fd6_emit_flag_reference(ring, rsc, + psurf->u.tex.level, psurf->u.tex.first_layer); } - OUT_PKT4(ring, REG_A6XX_RB_SRGB_CNTL, 1); - OUT_RING(ring, srgb_cntl); - - OUT_PKT4(ring, REG_A6XX_SP_SRGB_CNTL, 1); - OUT_RING(ring, srgb_cntl); + OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); + OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); - OUT_PKT4(ring, REG_A6XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A6XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A6XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A6XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A6XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A6XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A6XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A6XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A6XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS( + .rt0 = mrt_comp[0], + .rt1 = mrt_comp[1], + .rt2 = mrt_comp[2], + .rt3 = mrt_comp[3], + .rt4 = mrt_comp[4], + .rt5 = mrt_comp[5], + .rt6 = mrt_comp[6], + .rt7 = mrt_comp[7])); + + OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS( + .rt0 = mrt_comp[0], + .rt1 = mrt_comp[1], + .rt2 = mrt_comp[2], + .rt3 = mrt_comp[3], + .rt4 = mrt_comp[4], + .rt5 = mrt_comp[5], + .rt6 = mrt_comp[6], + .rt7 = mrt_comp[7])); - OUT_PKT4(ring, REG_A6XX_SP_FS_RENDER_COMPONENTS, 1); - OUT_RING(ring, - A6XX_SP_FS_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A6XX_SP_FS_RENDER_COMPONENTS_RT7(mrt_comp[7])); + OUT_REG(ring, A6XX_GRAS_LAYER_CNTL(.layered = layered, .type = type)); } static void emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf, - struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { if (zsbuf) { struct fd_resource *rsc = fd_resource(zsbuf->texture); enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format); - struct fd_resource_slice *slice = fd_resource_slice(rsc, 0); - uint32_t stride = slice->pitch * rsc->cpp; + struct fdl_slice *slice = fd_resource_slice(rsc, 0); + uint32_t stride = slice->pitch * rsc->layout.cpp; uint32_t size = slice->size0; uint32_t base = gmem ? gmem->zsbuf_base[0] : 0; uint32_t offset = fd_resource_offset(rsc, zsbuf->u.tex.level, zsbuf->u.tex.first_layer); - uint32_t ubwc_offset = fd_resource_ubwc_offset(rsc, zsbuf->u.tex.level, - zsbuf->u.tex.first_layer); - - bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, zsbuf->u.tex.level); - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); - OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_PITCH(stride)); - OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(size)); - OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_DEPTH_BUFFER_BASE_LO/HI */ - OUT_RING(ring, base); /* RB_DEPTH_BUFFER_BASE_GMEM */ + OUT_REG(ring, + A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt), + A6XX_RB_DEPTH_BUFFER_PITCH(.a6xx_rb_depth_buffer_pitch = stride), + A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(.a6xx_rb_depth_buffer_array_pitch = size), + A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset), + A6XX_RB_DEPTH_BUFFER_BASE_GMEM(.dword = base)); - OUT_PKT4(ring, REG_A6XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); - OUT_RING(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); + OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3); - if (ubwc_enabled) { - OUT_RELOCW(ring, rsc->bo, ubwc_offset, 0, 0); /* BASE_LO/HI */ - OUT_RING(ring, A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_PITCH(rsc->ubwc_pitch) | - A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_ARRAY_PITCH(rsc->ubwc_size)); - } else { - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_PITCH */ - } + fd6_emit_flag_reference(ring, rsc, + zsbuf->u.tex.level, zsbuf->u.tex.first_layer); if (rsc->lrz) { - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5); - OUT_RELOCW(ring, rsc->lrz, 0, 0, 0); - OUT_RING(ring, A6XX_GRAS_LRZ_BUFFER_PITCH_PITCH(rsc->lrz_pitch)); - //OUT_RELOCW(ring, rsc->lrz, 0, 0, 0); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO/HI */ - // XXX a6xx seems to use a different buffer here.. not sure what for.. - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_REG(ring, + A6XX_GRAS_LRZ_BUFFER_BASE(.bo = rsc->lrz), + A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = rsc->lrz_pitch), + // XXX a6xx seems to use a different buffer here.. not sure what for.. + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO(0), + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI(0)); } else { OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5); OUT_RING(ring, 0x00000000); @@ -221,32 +226,30 @@ OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(UNK_25)); if (rsc->stencil) { - struct fd_resource_slice *slice = fd_resource_slice(rsc->stencil, 0); - stride = slice->pitch * rsc->stencil->cpp; + struct fdl_slice *slice = fd_resource_slice(rsc->stencil, 0); + stride = slice->pitch * rsc->stencil->layout.cpp; size = slice->size0; uint32_t base = gmem ? gmem->zsbuf_base[1] : 0; - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_INFO, 6); - OUT_RING(ring, A6XX_RB_STENCIL_INFO_SEPARATE_STENCIL); - OUT_RING(ring, A6XX_RB_STENCIL_BUFFER_PITCH(stride)); - OUT_RING(ring, A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(size)); - OUT_RELOCW(ring, rsc->stencil->bo, 0, 0, 0); /* RB_STENCIL_BASE_LO/HI */ - OUT_RING(ring, base); /* RB_STENCIL_BASE_LO */ + OUT_REG(ring, + A6XX_RB_STENCIL_INFO(.separate_stencil = true), + A6XX_RB_STENCIL_BUFFER_PITCH(.a6xx_rb_stencil_buffer_pitch = stride), + A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(.a6xx_rb_stencil_buffer_array_pitch = size), + A6XX_RB_STENCIL_BUFFER_BASE(.bo = rsc->stencil->bo), + A6XX_RB_STENCIL_BUFFER_BASE_GMEM(.dword = base)); } else { - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_INFO, 1); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ + OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); } } else { - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); + OUT_PKT4(ring, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); + OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */ - OUT_PKT4(ring, REG_A6XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); - OUT_RING(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); + OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5); OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ @@ -255,26 +258,25 @@ OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */ - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_INFO, 1); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ + OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); } } static bool use_hw_binning(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; // TODO figure out hw limits for binning - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2) && + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) >= 2) && (batch->num_draws > 0); } static void patch_fb_read(struct fd_batch *batch) { - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; for (unsigned i = 0; i < fd_patch_num_elements(&batch->fb_read_patches); i++) { struct fd_cs_patch *patch = fd_patch_element(&batch->fb_read_patches, i); @@ -330,7 +332,7 @@ { struct fd_context *ctx = batch->ctx; struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; int i; @@ -347,34 +349,31 @@ DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_data2"); } - OUT_PKT4(ring, REG_A6XX_VSC_BIN_SIZE, 3); - OUT_RING(ring, A6XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | - A6XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); - OUT_RELOCW(ring, fd6_ctx->vsc_data, - 32 * fd6_ctx->vsc_data_pitch, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */ - - OUT_PKT4(ring, REG_A6XX_VSC_BIN_COUNT, 1); - OUT_RING(ring, A6XX_VSC_BIN_COUNT_NX(gmem->nbins_x) | - A6XX_VSC_BIN_COUNT_NY(gmem->nbins_y)); + OUT_REG(ring, + A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), + A6XX_VSC_SIZE_ADDRESS(.bo = fd6_ctx->vsc_data, .bo_offset = 32 * fd6_ctx->vsc_data_pitch)); + + OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, + .ny = gmem->nbins_y)); OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); for (i = 0; i < 32; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); } - OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA2_ADDRESS_LO, 4); - OUT_RELOCW(ring, fd6_ctx->vsc_data2, 0, 0, 0); - OUT_RING(ring, fd6_ctx->vsc_data2_pitch); - OUT_RING(ring, fd_bo_size(fd6_ctx->vsc_data2)); - - OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA_ADDRESS_LO, 4); - OUT_RELOCW(ring, fd6_ctx->vsc_data, 0, 0, 0); - OUT_RING(ring, fd6_ctx->vsc_data_pitch); - OUT_RING(ring, fd_bo_size(fd6_ctx->vsc_data)); + OUT_REG(ring, + A6XX_VSC_PIPE_DATA2_ADDRESS(.bo = fd6_ctx->vsc_data2), + A6XX_VSC_PIPE_DATA2_PITCH(.dword = fd6_ctx->vsc_data2_pitch), + A6XX_VSC_PIPE_DATA2_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_data2))); + + OUT_REG(ring, + A6XX_VSC_PIPE_DATA_ADDRESS(.bo = fd6_ctx->vsc_data), + A6XX_VSC_PIPE_DATA_PITCH(.dword = fd6_ctx->vsc_data_pitch), + A6XX_VSC_PIPE_DATA_ARRAY_PITCH(.dword = fd_bo_size(fd6_ctx->vsc_data))); } /* TODO we probably have more than 8 scratch regs.. although the first @@ -401,7 +400,7 @@ emit_vsc_overflow_test(struct fd_batch *batch) { struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd6_context *fd6_ctx = fd6_context(batch->ctx); debug_assert((fd6_ctx->vsc_data_pitch & 0x3) == 0); @@ -441,7 +440,7 @@ OUT_PKT7(ring, CP_MEM_TO_REG, 3); OUT_RING(ring, CP_MEM_TO_REG_0_REG(OVERFLOW_FLAG_REG) | - CP_MEM_TO_REG_0_CNT(1 - 1)); + CP_MEM_TO_REG_0_CNT(0)); OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_scratch)); /* SRC_LO/HI */ /* @@ -460,11 +459,11 @@ OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, 0x10000000); - OUT_RING(ring, 7); /* conditionally execute next 7 dwords */ + OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(7)); /* if (b0 set) */ { /* @@ -550,7 +549,7 @@ * is skipped for tiles that have no visible geometry. */ static void -emit_conditional_ib(struct fd_batch *batch, struct fd_tile *tile, +emit_conditional_ib(struct fd_batch *batch, const struct fd_tile *tile, struct fd_ringbuffer *target) { struct fd_ringbuffer *ring = batch->gmem; @@ -567,11 +566,11 @@ OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) | A6XX_CP_REG_TEST_0_BIT(tile->n) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, 0x10000000); - OUT_RING(ring, 4 * count); /* conditionally execute next 4*count dwords */ + OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(4 * count)); for (unsigned i = 0; i < count; i++) { uint32_t dwords; @@ -587,47 +586,38 @@ static void set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2) { - OUT_PKT4(ring, REG_A6XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | - A6XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | - A6XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); - - OUT_PKT4(ring, REG_A6XX_GRAS_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A6XX_GRAS_RESOLVE_CNTL_1_X(x1) | - A6XX_GRAS_RESOLVE_CNTL_1_Y(y1)); - OUT_RING(ring, A6XX_GRAS_RESOLVE_CNTL_2_X(x2) | - A6XX_GRAS_RESOLVE_CNTL_2_Y(y2)); + OUT_REG(ring, + A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), + A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); + + OUT_REG(ring, + A6XX_GRAS_RESOLVE_CNTL_1(.x = x1, .y = y1), + A6XX_GRAS_RESOLVE_CNTL_2(.x = x2, .y = y2)); } static void set_bin_size(struct fd_ringbuffer *ring, uint32_t w, uint32_t h, uint32_t flag) { - OUT_PKT4(ring, REG_A6XX_GRAS_BIN_CONTROL, 1); - OUT_RING(ring, A6XX_GRAS_BIN_CONTROL_BINW(w) | - A6XX_GRAS_BIN_CONTROL_BINH(h) | flag); - - OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL, 1); - OUT_RING(ring, A6XX_RB_BIN_CONTROL_BINW(w) | - A6XX_RB_BIN_CONTROL_BINH(h) | flag); - + OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); + OUT_REG(ring, A6XX_RB_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); /* no flag for RB_BIN_CONTROL2... */ - OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL2, 1); - OUT_RING(ring, A6XX_RB_BIN_CONTROL2_BINW(w) | - A6XX_RB_BIN_CONTROL2_BINH(h)); + OUT_REG(ring, A6XX_RB_BIN_CONTROL2(.binw = w, .binh = h)); } static void emit_binning_pass(struct fd_batch *batch) { struct fd_ringbuffer *ring = batch->gmem; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd6_context *fd6_ctx = fd6_context(batch->ctx); uint32_t x1 = gmem->minx; uint32_t y1 = gmem->miny; uint32_t x2 = gmem->minx + gmem->width - 1; uint32_t y2 = gmem->miny + gmem->height - 1; + debug_assert(!batch->tessellation); + set_scissor(ring, x1, y1, x2, y2); emit_marker6(ring, 7); @@ -643,16 +633,15 @@ OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1); - OUT_RING(ring, A6XX_VFD_MODE_CNTL_BINNING_PASS); + OUT_REG(ring, A6XX_VFD_MODE_CNTL(.binning_pass = true)); update_vsc_pipe(batch); OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); - OUT_RING(ring, 0x1); + OUT_RING(ring, fd6_ctx->magic.PC_UNKNOWN_9805); OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); - OUT_RING(ring, 0x1); + OUT_RING(ring, fd6_ctx->magic.SP_UNKNOWN_A0F8); OUT_PKT7(ring, CP_EVENT_WRITE, 1); OUT_RING(ring, UNK_2C); @@ -697,7 +686,7 @@ OUT_WFI5(ring); OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x7c400004); /* RB_CCU_CNTL */ + OUT_RING(ring, fd6_ctx->magic.RB_CCU_CNTL_gmem); } static void @@ -734,7 +723,7 @@ struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; fd6_emit_restore(batch, ring); @@ -751,28 +740,38 @@ OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); OUT_RING(ring, 0x0); - /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ fd_wfi(batch, ring); OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x7c400004); /* RB_CCU_CNTL */ + OUT_RING(ring, fd6_context(ctx)->magic.RB_CCU_CNTL_gmem); - emit_zs(ring, pfb->zsbuf, &ctx->gmem); - emit_mrt(ring, pfb, &ctx->gmem); + emit_zs(ring, pfb->zsbuf, batch->gmem_state); + emit_mrt(ring, pfb, batch->gmem_state); emit_msaa(ring, pfb->samples); patch_fb_read(batch); if (use_hw_binning(batch)) { + /* enable stream-out during binning pass: */ + OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, 0); + set_bin_size(ring, gmem->bin_w, gmem->bin_h, A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000); update_render_cntl(batch, pfb, true); emit_binning_pass(batch); + /* and disable stream-out for draw pass: */ + OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); + /* * NOTE: even if we detect VSC overflow and disable use of * visibility stream in draw pass, it is still safe to execute * the reset of these cmds: */ +// NOTE a618 not setting .USE_VIZ .. from a quick check on a630, it +// does not appear that this bit changes much (ie. it isn't actually +// .USE_VIZ like previous gens) set_bin_size(ring, gmem->bin_w, gmem->bin_h, A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000); @@ -780,14 +779,18 @@ OUT_RING(ring, 0x0); OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); - OUT_RING(ring, 0x1); + OUT_RING(ring, fd6_context(ctx)->magic.PC_UNKNOWN_9805); OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); - OUT_RING(ring, 0x1); + OUT_RING(ring, fd6_context(ctx)->magic.SP_UNKNOWN_A0F8); OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); OUT_RING(ring, 0x1); } else { + /* no binning pass, so enable stream-out for draw pass:: */ + OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, 0); + set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); } @@ -816,9 +819,10 @@ /* before mem2gmem */ static void -fd6_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) +fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd6_context *fd6_ctx = fd6_context(ctx); struct fd_ringbuffer *ring = batch->gmem; @@ -834,11 +838,8 @@ set_scissor(ring, x1, y1, x2, y2); - OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); - if (use_hw_binning(batch)) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); @@ -854,11 +855,11 @@ OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, 0x10000000); - OUT_RING(ring, 11); /* conditionally execute next 11 dwords */ + OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(11)); /* if (no overflow) */ { OUT_PKT7(ring, CP_SET_BIN_DATA5, 7); @@ -883,12 +884,9 @@ set_window_offset(ring, x1, y1); - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); - OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); - OUT_PKT7(ring, CP_SET_MODE, 1); OUT_RING(ring, 0x0); @@ -938,12 +936,14 @@ struct pipe_surface *psurf, bool stencil) { - struct fd_resource_slice *slice; + struct fdl_slice *slice; struct fd_resource *rsc = fd_resource(psurf->texture); enum pipe_format pfmt = psurf->format; - uint32_t offset, ubwc_offset; + uint32_t offset; bool ubwc_enabled; + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + /* separate stencil case: */ if (stencil) { rsc = rsc->stencil; @@ -954,44 +954,30 @@ offset = fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level); - ubwc_offset = fd_resource_ubwc_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); enum a6xx_color_fmt format = fd6_pipe2color(pfmt); - uint32_t stride = slice->pitch * rsc->cpp; + uint32_t stride = slice->pitch * rsc->layout.cpp; uint32_t size = slice->size0; - enum a3xx_color_swap swap = rsc->tile_mode ? WZYX : fd6_pipe2swap(pfmt); + enum a3xx_color_swap swap = fd6_resource_swap(rsc, pfmt); enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->base.nr_samples); - uint32_t tile_mode; + uint32_t tile_mode = fd_resource_tile_mode(&rsc->base, psurf->u.tex.level); - if (rsc->tile_mode && - fd_resource_level_linear(&rsc->base, psurf->u.tex.level)) - tile_mode = TILE6_LINEAR; - else - tile_mode = rsc->tile_mode; - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 5); - OUT_RING(ring, - A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format) | - A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(swap) | - COND(ubwc_enabled, A6XX_RB_BLIT_DST_INFO_FLAGS)); - OUT_RELOCW(ring, rsc->bo, offset, 0, 0); /* RB_BLIT_DST_LO/HI */ - OUT_RING(ring, A6XX_RB_BLIT_DST_PITCH(stride)); - OUT_RING(ring, A6XX_RB_BLIT_DST_ARRAY_PITCH(size)); + OUT_REG(ring, + A6XX_RB_BLIT_DST_INFO(.tile_mode = tile_mode, .samples = samples, + .color_format = format, .color_swap = swap, .flags = ubwc_enabled), + A6XX_RB_BLIT_DST(.bo = rsc->bo, .bo_offset = offset), + A6XX_RB_BLIT_DST_PITCH(.a6xx_rb_blit_dst_pitch = stride), + A6XX_RB_BLIT_DST_ARRAY_PITCH(.a6xx_rb_blit_dst_array_pitch = size)); - OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - OUT_RING(ring, base); + OUT_REG(ring, A6XX_RB_BLIT_BASE_GMEM(.dword = base)); if (ubwc_enabled) { OUT_PKT4(ring, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3); - OUT_RELOCW(ring, rsc->bo, ubwc_offset, 0, 0); - OUT_RING(ring, A6XX_RB_BLIT_FLAG_DST_PITCH_PITCH(rsc->ubwc_pitch) | - A6XX_RB_BLIT_FLAG_DST_PITCH_ARRAY_PITCH(rsc->ubwc_size)); + fd6_emit_flag_reference(ring, rsc, + psurf->u.tex.level, psurf->u.tex.first_layer); } fd6_emit_blit(batch, ring); @@ -1004,27 +990,12 @@ struct pipe_surface *psurf, unsigned buffer) { - uint32_t info = 0; - bool stencil = false; + bool stencil = (buffer == FD_BUFFER_STENCIL); - switch (buffer) { - case FD_BUFFER_COLOR: - info |= A6XX_RB_BLIT_INFO_UNK0; - break; - case FD_BUFFER_STENCIL: - info |= A6XX_RB_BLIT_INFO_UNK0; - stencil = true; - break; - case FD_BUFFER_DEPTH: - info |= A6XX_RB_BLIT_INFO_DEPTH | A6XX_RB_BLIT_INFO_UNK0; - break; - } - - if (util_format_is_pure_integer(psurf->format)) - info |= A6XX_RB_BLIT_INFO_INTEGER; - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); - OUT_RING(ring, info | A6XX_RB_BLIT_INFO_GMEM); + OUT_REG(ring, A6XX_RB_BLIT_INFO( + .gmem = true, .unk0 = true, + .depth = (buffer == FD_BUFFER_DEPTH), + .integer = util_format_is_pure_integer(psurf->format))); emit_blit(batch, ring, base, psurf, stencil); } @@ -1033,7 +1004,7 @@ emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) { struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); uint32_t buffers = batch->fast_cleared; @@ -1196,8 +1167,7 @@ static void emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; if (batch->restore & FD_BUFFER_COLOR) { @@ -1242,13 +1212,13 @@ * transfer from system memory to gmem */ static void -fd6_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) +fd6_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { } /* before IB to rendering cmds: */ static void -fd6_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) +fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { if (batch->fast_cleared || !use_hw_binning(batch)) { fd6_emit_ib(batch->gmem, batch->tile_setup); @@ -1298,8 +1268,7 @@ static void prepare_tile_fini_ib(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; struct fd_ringbuffer *ring; @@ -1338,7 +1307,7 @@ } static void -fd6_emit_tile(struct fd_batch *batch, struct fd_tile *tile) +fd6_emit_tile(struct fd_batch *batch, const struct fd_tile *tile) { if (!use_hw_binning(batch)) { fd6_emit_ib(batch->gmem, batch->draw); @@ -1348,7 +1317,7 @@ } static void -fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) +fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { struct fd_ringbuffer *ring = batch->gmem; @@ -1360,11 +1329,11 @@ OUT_PKT7(ring, CP_REG_TEST, 1); OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(OVERFLOW_FLAG_REG) | A6XX_CP_REG_TEST_0_BIT(0) | - A6XX_CP_REG_TEST_0_UNK25); + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, 0x10000000); - OUT_RING(ring, 2); /* conditionally execute next 2 dwords */ + OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(2)); /* if (no overflow) */ { OUT_PKT7(ring, CP_SET_MARKER, 1); @@ -1415,6 +1384,79 @@ } static void +emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) +{ + struct fd_context *ctx = batch->ctx; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + uint32_t buffers = batch->fast_cleared; + + if (buffers & PIPE_CLEAR_COLOR) { + for (int i = 0; i < pfb->nr_cbufs; i++) { + union pipe_color_union *color = &batch->clear_color[i]; + + if (!pfb->cbufs[i]) + continue; + + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; + + fd6_clear_surface(ctx, ring, + pfb->cbufs[i], pfb->width, pfb->height, color); + } + } + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + union pipe_color_union value = {}; + + const bool has_depth = pfb->zsbuf; + struct pipe_resource *separate_stencil = + has_depth && fd_resource(pfb->zsbuf->texture)->stencil ? + &fd_resource(pfb->zsbuf->texture)->stencil->base : NULL; + + if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) || + (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { + value.f[0] = batch->clear_depth; + value.ui[1] = batch->clear_stencil; + fd6_clear_surface(ctx, ring, + pfb->zsbuf, pfb->width, pfb->height, &value); + } + + if (separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { + value.ui[0] = batch->clear_stencil; + + struct pipe_surface stencil_surf = *pfb->zsbuf; + stencil_surf.texture = separate_stencil; + + fd6_clear_surface(ctx, ring, + &stencil_surf, pfb->width, pfb->height, &value); + } + } + + fd6_event_write(batch, ring, UNK_1D, true); +} + +static void +setup_tess_buffers(struct fd_batch *batch, struct fd_ringbuffer *ring) +{ + struct fd_context *ctx = batch->ctx; + + batch->tessfactor_bo = fd_bo_new(ctx->screen->dev, + batch->tessfactor_size, + DRM_FREEDRENO_GEM_TYPE_KMEM, "tessfactor"); + + batch->tessparam_bo = fd_bo_new(ctx->screen->dev, + batch->tessparam_size, + DRM_FREEDRENO_GEM_TYPE_KMEM, "tessparam"); + + OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR_LO, 2); + OUT_RELOCW(ring, batch->tessfactor_bo, 0, 0, 0); + + batch->tess_addrs_constobj->cur = batch->tess_addrs_constobj->start; + OUT_RELOCW(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0); + OUT_RELOCW(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0); +} + +static void fd6_emit_sysmem_prep(struct fd_batch *batch) { struct pipe_framebuffer_state *pfb = &batch->framebuffer; @@ -1422,6 +1464,17 @@ fd6_emit_restore(batch, ring); + if (pfb->width > 0 && pfb->height > 0) + set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1); + else + set_scissor(ring, 0, 0, 0, 0); + + set_window_offset(ring, 0, 0); + + set_bin_size(ring, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */ + + emit_sysmem_clears(batch, ring); + fd6_emit_lrz_flush(ring); emit_marker6(ring, 7); @@ -1429,32 +1482,22 @@ OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS) | 0x10); /* | 0x10 ? */ emit_marker6(ring, 7); + if (batch->tessellation) + setup_tess_buffers(batch, ring); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); OUT_RING(ring, 0x0); fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); fd6_cache_inv(batch, ring); -#if 0 - OUT_PKT4(ring, REG_A6XX_PC_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */ -#endif - -#if 0 - OUT_PKT4(ring, REG_A6XX_VFD_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */ -#endif - - /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ fd_wfi(batch, ring); OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x10000000); /* RB_CCU_CNTL */ + OUT_RING(ring, fd6_context(batch->ctx)->magic.RB_CCU_CNTL_bypass); - set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1); - - set_window_offset(ring, 0, 0); - - set_bin_size(ring, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */ + /* enable stream-out, with sysmem there is only one pass: */ + OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, 0); OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); OUT_RING(ring, 0x1); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_image.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_image.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_image.c 2020-06-12 01:21:17.000000000 +0000 @@ -72,7 +72,7 @@ img->fetchsize = fd6_pipe2fetchsize(format); img->type = fd6_tex_type(prsc->target); img->srgb = util_format_is_srgb(format); - img->cpp = rsc->cpp; + img->cpp = rsc->layout.cpp; img->bo = rsc->bo; /* Treat cube textures as 2d-array: */ @@ -97,30 +97,31 @@ img->buffer = false; unsigned lvl = pimg->u.tex.level; + struct fdl_slice *slice = fd_resource_slice(rsc, lvl); unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1; img->ubwc_offset = fd_resource_ubwc_offset(rsc, lvl, pimg->u.tex.first_layer); img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); - img->pitch = rsc->slices[lvl].pitch * rsc->cpp; + img->pitch = slice->pitch * rsc->layout.cpp; switch (prsc->target) { case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: - img->array_pitch = rsc->layer_size; + img->array_pitch = rsc->layout.layer_size; img->depth = 1; break; case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY: - img->array_pitch = rsc->layer_size; + img->array_pitch = rsc->layout.layer_size; // TODO the CUBE/CUBE_ARRAY might need to be layers/6 for tex state, // but empirically for ibo state it shouldn't be divided. img->depth = layers; break; case PIPE_TEXTURE_3D: - img->array_pitch = rsc->slices[lvl].size0; + img->array_pitch = slice->size0; img->depth = u_minify(prsc->depth0, lvl); break; default: @@ -150,7 +151,7 @@ img->fetchsize = fd6_pipe2fetchsize(format); img->type = fd6_tex_type(prsc->target); img->srgb = util_format_is_srgb(format); - img->cpp = rsc->cpp; + img->cpp = rsc->layout.cpp; img->bo = rsc->bo; img->buffer = true; @@ -183,7 +184,7 @@ A6XX_TEX_CONST_2_TYPE(img->type) | A6XX_TEX_CONST_2_PITCH(img->pitch)); OUT_RING(ring, A6XX_TEX_CONST_3_ARRAY_PITCH(img->array_pitch) | - COND(ubwc_enabled, A6XX_TEX_CONST_3_FLAG | A6XX_TEX_CONST_3_UNK27)); + COND(ubwc_enabled, A6XX_TEX_CONST_3_FLAG | A6XX_TEX_CONST_3_TILE_ALL)); if (img->bo) { OUT_RELOC(ring, img->bo, img->offset, (uint64_t)A6XX_TEX_CONST_5_DEPTH(img->depth) << 32, 0); @@ -195,9 +196,10 @@ OUT_RING(ring, 0x00000000); /* texconst6 */ if (ubwc_enabled) { + struct fdl_slice *ubwc_slice = &rsc->layout.ubwc_slices[img->level]; OUT_RELOC(ring, rsc->bo, img->ubwc_offset, 0, 0); - OUT_RING(ring, A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->ubwc_size)); - OUT_RING(ring, A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(rsc->ubwc_pitch)); + OUT_RING(ring, A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_size)); + OUT_RING(ring, A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(ubwc_slice->pitch)); } else { OUT_RING(ring, 0x00000000); /* texconst7 */ OUT_RING(ring, 0x00000000); /* texconst8 */ @@ -230,14 +232,19 @@ static void emit_image_ssbo(struct fd_ringbuffer *ring, struct fd6_image *img) { + /* If the SSBO isn't present (becasue gallium doesn't pack atomic + * counters), zero-fill the slot. + */ + if (!img->prsc) { + for (int i = 0; i < 16; i++) + OUT_RING(ring, 0); + return; + } + struct fd_resource *rsc = fd_resource(img->prsc); - enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + enum a6xx_tile_mode tile_mode = fd_resource_tile_mode(img->prsc, img->level); bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); - if (rsc->tile_mode && !fd_resource_level_linear(img->prsc, img->level)) { - tile_mode = rsc->tile_mode; - } - OUT_RING(ring, A6XX_IBO_0_FMT(img->fmt) | A6XX_IBO_0_TILE_MODE(tile_mode)); OUT_RING(ring, A6XX_IBO_1_WIDTH(img->width) | @@ -257,9 +264,10 @@ OUT_RING(ring, 0x00000000); if (ubwc_enabled) { + struct fdl_slice *ubwc_slice = &rsc->layout.ubwc_slices[img->level]; OUT_RELOCW(ring, rsc->bo, img->ubwc_offset, 0, 0); - OUT_RING(ring, A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(rsc->ubwc_size)); - OUT_RING(ring, A6XX_IBO_10_FLAG_BUFFER_PITCH(rsc->ubwc_pitch)); + OUT_RING(ring, A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_size)); + OUT_RING(ring, A6XX_IBO_10_FLAG_BUFFER_PITCH(ubwc_slice->pitch)); } else { OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); @@ -281,24 +289,24 @@ { struct fd_shaderbuf_stateobj *bufso = &ctx->shaderbuf[shader]; struct fd_shaderimg_stateobj *imgso = &ctx->shaderimg[shader]; - const struct ir3_ibo_mapping *mapping = &v->image_mapping; struct fd_ringbuffer *state = fd_submit_new_ringbuffer(ctx->batch->submit, - mapping->num_ibo * 16 * 4, FD_RINGBUFFER_STREAMING); + (v->shader->nir->info.num_ssbos + + v->shader->nir->info.num_images) * 16 * 4, + FD_RINGBUFFER_STREAMING); assert(shader == PIPE_SHADER_COMPUTE || shader == PIPE_SHADER_FRAGMENT); - for (unsigned i = 0; i < mapping->num_ibo; i++) { + for (unsigned i = 0; i < v->shader->nir->info.num_ssbos; i++) { struct fd6_image img; - unsigned idx = mapping->ibo_to_image[i]; - - if (idx & IBO_SSBO) { - translate_buf(&img, &bufso->sb[idx & ~IBO_SSBO]); - } else { - translate_image(&img, &imgso->si[idx]); - } + translate_buf(&img, &bufso->sb[i]); + emit_image_ssbo(state, &img); + } + for (unsigned i = 0; i < v->shader->nir->info.num_images; i++) { + struct fd6_image img; + translate_image(&img, &imgso->si[i]); emit_image_ssbo(state, &img); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_pack.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_pack.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_pack.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_pack.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,112 @@ +/* + * Copyright © 2019 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FD6_PACK_H +#define FD6_PACK_H + +#include "a6xx.xml.h" + +struct fd_reg_pair { + uint32_t reg; + uint64_t value; + struct fd_bo *bo; + bool is_address; + bool bo_write; + uint32_t bo_offset; + uint32_t bo_shift; +}; + +#define __bo_type struct fd_bo * + +#include "a6xx-pack.xml.h" + +#define __assert_eq(a, b) \ + do { \ + if ((a) != (b)) { \ + fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \ + assert((a) == (b)); \ + } \ + } while (0) + +#define __ONE_REG(i, ...) \ + do { \ + const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ + if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) { \ + __assert_eq(regs[0].reg + i, regs[i].reg); \ + if (regs[i].bo) { \ + struct fd_reloc reloc = { \ + .bo = regs[i].bo, \ + .flags = FD_RELOC_READ | \ + (regs[i].bo_write ? FD_RELOC_WRITE : 0), \ + \ + .offset = regs[i].bo_offset, \ + .or = regs[i].value, \ + .shift = regs[i].bo_shift, \ + .orhi = regs[i].value >> 32 \ + }; \ + ring->cur = p; \ + p += 2; \ + fd_ringbuffer_reloc(ring, &reloc); \ + } else { \ + *p++ = regs[i].value; \ + if (regs[i].is_address) \ + *p++ = regs[i].value >> 32; \ + } \ + } \ + } while (0) + +#define OUT_REG(ring, ...) \ + do { \ + const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count > 0); \ + STATIC_ASSERT(count <= 16); \ + \ + BEGIN_RING(ring, count + 1); \ + uint32_t *p = ring->cur; \ + *p++ = CP_TYPE4_PKT | count | \ + (_odd_parity_bit(count) << 7) | \ + ((regs[0].reg & 0x3ffff) << 8) | \ + ((_odd_parity_bit(regs[0].reg) << 27)); \ + \ + __ONE_REG( 0, __VA_ARGS__); \ + __ONE_REG( 1, __VA_ARGS__); \ + __ONE_REG( 2, __VA_ARGS__); \ + __ONE_REG( 3, __VA_ARGS__); \ + __ONE_REG( 4, __VA_ARGS__); \ + __ONE_REG( 5, __VA_ARGS__); \ + __ONE_REG( 6, __VA_ARGS__); \ + __ONE_REG( 7, __VA_ARGS__); \ + __ONE_REG( 8, __VA_ARGS__); \ + __ONE_REG( 9, __VA_ARGS__); \ + __ONE_REG(10, __VA_ARGS__); \ + __ONE_REG(11, __VA_ARGS__); \ + __ONE_REG(12, __VA_ARGS__); \ + __ONE_REG(13, __VA_ARGS__); \ + __ONE_REG(14, __VA_ARGS__); \ + __ONE_REG(15, __VA_ARGS__); \ + ring->cur = p; \ + } while (0) + +#endif /* FD6_PACK_H */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_perfcntr.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_perfcntr.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_perfcntr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_perfcntr.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,805 +0,0 @@ -/* - * Copyright (C) 2019 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#ifndef FD6_PERFCNTR_H_ -#define FD6_PERFCNTR_H_ - -#include "freedreno_perfcntr.h" -#include "fd6_format.h" - -#define REG(_x) REG_A6XX_ ## _x - -#define COUNTER(_sel, _lo, _hi) { \ - .select_reg = REG(_sel), \ - .counter_reg_lo = REG(_lo), \ - .counter_reg_hi = REG(_hi), \ -} - -#define COUNTER2(_sel, _lo, _hi, _en, _clr) { \ - .select_reg = REG(_sel), \ - .counter_reg_lo = REG(_lo), \ - .counter_reg_hi = REG(_hi), \ - .enable = REG(_en), \ - .clear = REG(_clr), \ -} - -#define COUNTABLE(_selector, _query_type, _result_type) { \ - .name = #_selector, \ - .selector = _selector, \ - .query_type = PIPE_DRIVER_QUERY_TYPE_ ## _query_type, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type, \ -} - -#define GROUP(_name, _counters, _countables) { \ - .name = _name, \ - .num_counters = ARRAY_SIZE(_counters), \ - .counters = _counters, \ - .num_countables = ARRAY_SIZE(_countables), \ - .countables = _countables, \ -} - -static const struct fd_perfcntr_counter cp_counters[] = { -//RESERVED: for kernel -// COUNTER(CP_PERFCTR_CP_SEL_0, RBBM_PERFCTR_CP_0_LO, RBBM_PERFCTR_CP_0_HI), - COUNTER(CP_PERFCTR_CP_SEL_1, RBBM_PERFCTR_CP_1_LO, RBBM_PERFCTR_CP_1_HI), - COUNTER(CP_PERFCTR_CP_SEL_2, RBBM_PERFCTR_CP_2_LO, RBBM_PERFCTR_CP_2_HI), - COUNTER(CP_PERFCTR_CP_SEL_3, RBBM_PERFCTR_CP_3_LO, RBBM_PERFCTR_CP_3_HI), - COUNTER(CP_PERFCTR_CP_SEL_4, RBBM_PERFCTR_CP_4_LO, RBBM_PERFCTR_CP_4_HI), - COUNTER(CP_PERFCTR_CP_SEL_5, RBBM_PERFCTR_CP_5_LO, RBBM_PERFCTR_CP_5_HI), - COUNTER(CP_PERFCTR_CP_SEL_6, RBBM_PERFCTR_CP_6_LO, RBBM_PERFCTR_CP_6_HI), - COUNTER(CP_PERFCTR_CP_SEL_7, RBBM_PERFCTR_CP_7_LO, RBBM_PERFCTR_CP_7_HI), - COUNTER(CP_PERFCTR_CP_SEL_8, RBBM_PERFCTR_CP_8_LO, RBBM_PERFCTR_CP_8_HI), - COUNTER(CP_PERFCTR_CP_SEL_9, RBBM_PERFCTR_CP_9_LO, RBBM_PERFCTR_CP_9_HI), - COUNTER(CP_PERFCTR_CP_SEL_10, RBBM_PERFCTR_CP_10_LO, RBBM_PERFCTR_CP_10_HI), - COUNTER(CP_PERFCTR_CP_SEL_11, RBBM_PERFCTR_CP_11_LO, RBBM_PERFCTR_CP_11_HI), - COUNTER(CP_PERFCTR_CP_SEL_12, RBBM_PERFCTR_CP_12_LO, RBBM_PERFCTR_CP_12_HI), - COUNTER(CP_PERFCTR_CP_SEL_13, RBBM_PERFCTR_CP_13_LO, RBBM_PERFCTR_CP_13_HI), -}; - -static const struct fd_perfcntr_countable cp_countables[] = { - COUNTABLE(PERF_CP_ALWAYS_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CP_BUSY_GFX_CORE_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CP_NUM_PREEMPTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_REACTION_DELAY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_SWITCH_OUT_TIME, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREEMPTION_SWITCH_IN_TIME, UINT64, AVERAGE), - COUNTABLE(PERF_CP_DEAD_DRAWS_IN_BIN_RENDER, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PREDICATED_DRAWS_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_CP_MODE_SWITCH, UINT64, AVERAGE), - COUNTABLE(PERF_CP_ZPASS_DONE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CONTEXT_DONE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CACHE_FLUSH, UINT64, AVERAGE), - COUNTABLE(PERF_CP_LONG_PREEMPTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_I_CACHE_STARVE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_IDLE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_PM4_STARVE_RB_IB, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_PM4_STARVE_SDS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_MRB_STARVE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_RRB_STARVE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_VSD_STARVE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_VSD_DECODE_STARVE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_PIPE_OUT_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_SYNC_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_PM4_WFI_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_SYS_WFI_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_T4_EXEC, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_LOAD_STATE_EXEC, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_SAVE_SDS_STATE, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_DRAW_EXEC, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_CTXT_REG_BUNCH_EXEC, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_EXEC_PROFILED, UINT64, AVERAGE), - COUNTABLE(PERF_CP_MEMORY_POOL_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_MEMORY_POOL_SYNC_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_CP_MEMORY_POOL_ABOVE_THRESH, UINT64, AVERAGE), - COUNTABLE(PERF_CP_AHB_WR_STALL_PRE_DRAWS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_AHB_STALL_SQE_GMU, UINT64, AVERAGE), - COUNTABLE(PERF_CP_AHB_STALL_SQE_WR_OTHER, UINT64, AVERAGE), - COUNTABLE(PERF_CP_AHB_STALL_SQE_RD_OTHER, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER0_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER1_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER2_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER3_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER4_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_CLUSTER5_EMPTY, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PM4_DATA, UINT64, AVERAGE), - COUNTABLE(PERF_CP_PM4_HEADERS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_VBIF_READ_BEATS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_VBIF_WRITE_BEATS, UINT64, AVERAGE), - COUNTABLE(PERF_CP_SQE_INSTR_COUNTER, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter ccu_counters[] = { - COUNTER(RB_PERFCTR_CCU_SEL_0, RBBM_PERFCTR_CCU_0_LO, RBBM_PERFCTR_CCU_0_HI), - COUNTER(RB_PERFCTR_CCU_SEL_1, RBBM_PERFCTR_CCU_1_LO, RBBM_PERFCTR_CCU_1_HI), - COUNTER(RB_PERFCTR_CCU_SEL_2, RBBM_PERFCTR_CCU_2_LO, RBBM_PERFCTR_CCU_2_HI), - COUNTER(RB_PERFCTR_CCU_SEL_3, RBBM_PERFCTR_CCU_3_LO, RBBM_PERFCTR_CCU_3_HI), - COUNTER(RB_PERFCTR_CCU_SEL_4, RBBM_PERFCTR_CCU_4_LO, RBBM_PERFCTR_CCU_4_HI), -}; - -static const struct fd_perfcntr_countable ccu_countables[] = { - COUNTABLE(PERF_CCU_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STALL_CYCLES_RB_DEPTH_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STALL_CYCLES_RB_COLOR_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_STARVE_CYCLES_FLAG_RETURN, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_BLOCKS, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_BLOCKS, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_BLOCK_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_BLOCK_HIT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_PARTIAL_BLOCK_READ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_GMEM_READ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_GMEM_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG0_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG1_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG2_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG3_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG4_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG5_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG6_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_DEPTH_READ_FLAG8_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG0_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG1_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG2_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG3_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG4_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG5_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG6_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_COLOR_READ_FLAG8_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_RD_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_CCU_2D_WR_REQ, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter tse_counters[] = { - COUNTER(GRAS_PERFCTR_TSE_SEL_0, RBBM_PERFCTR_TSE_0_LO, RBBM_PERFCTR_TSE_0_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_1, RBBM_PERFCTR_TSE_1_LO, RBBM_PERFCTR_TSE_1_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_2, RBBM_PERFCTR_TSE_2_LO, RBBM_PERFCTR_TSE_2_HI), - COUNTER(GRAS_PERFCTR_TSE_SEL_3, RBBM_PERFCTR_TSE_3_LO, RBBM_PERFCTR_TSE_3_HI), -}; - -static const struct fd_perfcntr_countable tse_countables[] = { - COUNTABLE(PERF_TSE_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CLIPPING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_RAS, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_BARYPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_ZPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_STARVE_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_INPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_TRIVAL_REJ_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CLIPPED_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_ZERO_AREA_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_FACENESS_CULLED_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_ZERO_PIXEL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_OUTPUT_NULL_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_OUTPUT_VISIBLE_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CINVOCATION, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CPRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_2D_INPUT_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_2D_ALIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TSE_CLIP_PLANES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter ras_counters[] = { - COUNTER(GRAS_PERFCTR_RAS_SEL_0, RBBM_PERFCTR_RAS_0_LO, RBBM_PERFCTR_RAS_0_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_1, RBBM_PERFCTR_RAS_1_LO, RBBM_PERFCTR_RAS_1_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_2, RBBM_PERFCTR_RAS_2_LO, RBBM_PERFCTR_RAS_2_HI), - COUNTER(GRAS_PERFCTR_RAS_SEL_3, RBBM_PERFCTR_RAS_3_LO, RBBM_PERFCTR_RAS_3_HI), -}; - -static const struct fd_perfcntr_countable ras_countables[] = { - COUNTABLE(PERF_RAS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_SUPERTILE_ACTIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_STALL_CYCLES_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_STARVE_CYCLES_TSE, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_SUPER_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_8X4_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_MASKGEN_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_FULLY_COVERED_SUPER_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_FULLY_COVERED_8X4_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_PRIM_KILLED_INVISILBE, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_SUPERTILE_GEN_ACTIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_LRZ_INTF_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RAS_BLOCKS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter lrz_counters[] = { - COUNTER(GRAS_PERFCTR_LRZ_SEL_0, RBBM_PERFCTR_LRZ_0_LO, RBBM_PERFCTR_LRZ_0_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_1, RBBM_PERFCTR_LRZ_1_LO, RBBM_PERFCTR_LRZ_1_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_2, RBBM_PERFCTR_LRZ_2_LO, RBBM_PERFCTR_LRZ_2_HI), - COUNTER(GRAS_PERFCTR_LRZ_SEL_3, RBBM_PERFCTR_LRZ_3_LO, RBBM_PERFCTR_LRZ_3_HI), -}; - -static const struct fd_perfcntr_countable lrz_countables[] = { - COUNTABLE(PERF_LRZ_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STARVE_CYCLES_RAS, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_RB, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_VSC, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_FLAG_PREFETCH, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_LRZ_READ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_LRZ_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_READ_LATENCY, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_MERGE_CACHE_UPDATING, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_MASKGEN, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FULL_8X8_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PARTIAL_8X8_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_TILE_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_TOTAL_PIXEL, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FULLY_COVERED_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_PARTIAL_COVERED_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FEEDBACK_ACCEPT, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FEEDBACK_DISCARD, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_FEEDBACK_STALL, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_RB_ZPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_RB_BPLANE, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_STALL_CYCLES_VC, UINT64, AVERAGE), - COUNTABLE(PERF_LRZ_RAS_MASK_TRANS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter hlsq_counters[] = { - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_0, RBBM_PERFCTR_HLSQ_0_LO, RBBM_PERFCTR_HLSQ_0_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_1, RBBM_PERFCTR_HLSQ_1_LO, RBBM_PERFCTR_HLSQ_1_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_2, RBBM_PERFCTR_HLSQ_2_LO, RBBM_PERFCTR_HLSQ_2_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_3, RBBM_PERFCTR_HLSQ_3_LO, RBBM_PERFCTR_HLSQ_3_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_4, RBBM_PERFCTR_HLSQ_4_LO, RBBM_PERFCTR_HLSQ_4_HI), - COUNTER(HLSQ_PERFCTR_HLSQ_SEL_5, RBBM_PERFCTR_HLSQ_5_LO, RBBM_PERFCTR_HLSQ_5_HI), -// TODO did we loose some HLSQ counters or are they just missing from xml -// COUNTER(HLSQ_PERFCTR_HLSQ_SEL_6, RBBM_PERFCTR_HLSQ_6_LO, RBBM_PERFCTR_HLSQ_6_HI), -// COUNTER(HLSQ_PERFCTR_HLSQ_SEL_7, RBBM_PERFCTR_HLSQ_7_LO, RBBM_PERFCTR_HLSQ_7_HI), -}; - -static const struct fd_perfcntr_countable hlsq_countables[] = { - COUNTABLE(PERF_HLSQ_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_STATE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_UCHE_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_UCHE_LATENCY_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_STAGE_1X_WAVES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_STAGE_2X_WAVES, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_QUADS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_CS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_COMPUTE_DRAWCALLS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_DATA_WAIT_PROGRAMMING, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_DUAL_FS_PROG_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_DUAL_VS_PROG_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_FS_BATCH_COUNT_ZERO, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_VS_BATCH_COUNT_ZERO, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_WAVE_PENDING_NO_QUAD, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_WAVE_PENDING_NO_PRIM_BASE, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_HLSQ_DRAW_MODE_SWITCH_VSFS_SYNC, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter pc_counters[] = { - COUNTER(PC_PERFCTR_PC_SEL_0, RBBM_PERFCTR_PC_0_LO, RBBM_PERFCTR_PC_0_HI), - COUNTER(PC_PERFCTR_PC_SEL_1, RBBM_PERFCTR_PC_1_LO, RBBM_PERFCTR_PC_1_HI), - COUNTER(PC_PERFCTR_PC_SEL_2, RBBM_PERFCTR_PC_2_LO, RBBM_PERFCTR_PC_2_HI), - COUNTER(PC_PERFCTR_PC_SEL_3, RBBM_PERFCTR_PC_3_LO, RBBM_PERFCTR_PC_3_HI), - COUNTER(PC_PERFCTR_PC_SEL_4, RBBM_PERFCTR_PC_4_LO, RBBM_PERFCTR_PC_4_HI), - COUNTER(PC_PERFCTR_PC_SEL_5, RBBM_PERFCTR_PC_5_LO, RBBM_PERFCTR_PC_5_HI), - COUNTER(PC_PERFCTR_PC_SEL_6, RBBM_PERFCTR_PC_6_LO, RBBM_PERFCTR_PC_6_HI), - COUNTER(PC_PERFCTR_PC_SEL_7, RBBM_PERFCTR_PC_7_LO, RBBM_PERFCTR_PC_7_HI), -}; - -static const struct fd_perfcntr_countable pc_countables[] = { - COUNTABLE(PERF_PC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TSE, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TESS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_TSE_ONLY, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STALL_CYCLES_VPC_ONLY, UINT64, AVERAGE), - COUNTABLE(PERF_PC_PASS1_TF_STALL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_INDEX, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_TESS_FACTOR, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_VIZ_STREAM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_POSITION, UINT64, AVERAGE), - COUNTABLE(PERF_PC_STARVE_CYCLES_DI, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VIS_STREAMS_LOADED, UINT64, AVERAGE), - COUNTABLE(PERF_PC_INSTANCES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VPC_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DEAD_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_LIVE_PRIM, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VERTEX_HITS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_IA_VERTICES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_IA_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_GS_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_HS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_GS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_DS_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_VPC_POS_DATA_TRANSACTION, UINT64, AVERAGE), - COUNTABLE(PERF_PC_3D_DRAWCALLS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_2D_DRAWCALLS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_NON_DRAWCALL_GLOBAL_EVENTS, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_STALL_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_TESS_STARVE_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_PC_TSE_TRANSACTION, UINT64, AVERAGE), - COUNTABLE(PERF_PC_TSE_VERTEX, UINT64, AVERAGE), - COUNTABLE(PERF_PC_TESS_PC_UV_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_PC_TESS_PC_UV_PATCHES, UINT64, AVERAGE), - COUNTABLE(PERF_PC_TESS_FACTOR_TRANS, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter rb_counters[] = { - COUNTER(RB_PERFCTR_RB_SEL_0, RBBM_PERFCTR_RB_0_LO, RBBM_PERFCTR_RB_0_HI), - COUNTER(RB_PERFCTR_RB_SEL_1, RBBM_PERFCTR_RB_1_LO, RBBM_PERFCTR_RB_1_HI), - COUNTER(RB_PERFCTR_RB_SEL_2, RBBM_PERFCTR_RB_2_LO, RBBM_PERFCTR_RB_2_HI), - COUNTER(RB_PERFCTR_RB_SEL_3, RBBM_PERFCTR_RB_3_LO, RBBM_PERFCTR_RB_3_HI), - COUNTER(RB_PERFCTR_RB_SEL_4, RBBM_PERFCTR_RB_4_LO, RBBM_PERFCTR_RB_4_HI), - COUNTER(RB_PERFCTR_RB_SEL_5, RBBM_PERFCTR_RB_5_LO, RBBM_PERFCTR_RB_5_HI), - COUNTER(RB_PERFCTR_RB_SEL_6, RBBM_PERFCTR_RB_6_LO, RBBM_PERFCTR_RB_6_HI), - COUNTER(RB_PERFCTR_RB_SEL_7, RBBM_PERFCTR_RB_7_LO, RBBM_PERFCTR_RB_7_HI), -}; - -static const struct fd_perfcntr_countable rb_countables[] = { - COUNTABLE(PERF_RB_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO0_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO1_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_FIFO2_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_LRZ_TILE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_CCU, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_Z_PLANE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STARVE_CYCLES_BARY_PLANE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_WORKLOAD, UINT64, AVERAGE), - COUNTABLE(PERF_RB_HLSQ_ACTIVE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_C_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_C_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_TOTAL_PASS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_PASS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_Z_FAIL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_S_FAIL, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDED_FXP_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDED_FP16_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_PS_INVOCATIONS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_ALIVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STALL_CYCLES_A2D, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SRC, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_STARVE_CYCLES_DST, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_VALID_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_3D_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDER_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_ZPROC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_CPROC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_SAMPLER_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU_COLOR_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU_COLOR_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU_DEPTH_READ, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU_DEPTH_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_INPUT_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_OUTPUT_RB_DST_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_2D_OUTPUT_RB_SRC_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_BLENDED_FP32_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_RB_COLOR_PIX_TILES, UINT64, AVERAGE), - COUNTABLE(PERF_RB_STALL_CYCLES_CCU, UINT64, AVERAGE), - COUNTABLE(PERF_RB_EARLY_Z_ARB3_GRANT, UINT64, AVERAGE), - COUNTABLE(PERF_RB_LATE_Z_ARB3_GRANT, UINT64, AVERAGE), - COUNTABLE(PERF_RB_EARLY_Z_SKIP_GRANT, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter rbbm_counters[] = { -//RESERVED: for kernel -// COUNTER(RBBM_PERFCTR_RBBM_SEL_0, RBBM_PERFCTR_RBBM_0_LO, RBBM_PERFCTR_RBBM_0_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_1, RBBM_PERFCTR_RBBM_1_LO, RBBM_PERFCTR_RBBM_1_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_2, RBBM_PERFCTR_RBBM_2_LO, RBBM_PERFCTR_RBBM_2_HI), - COUNTER(RBBM_PERFCTR_RBBM_SEL_3, RBBM_PERFCTR_RBBM_3_LO, RBBM_PERFCTR_RBBM_3_HI), -}; - -static const struct fd_perfcntr_countable rbbm_countables[] = { - COUNTABLE(PERF_RBBM_ALWAYS_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_ALWAYS_ON, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_TSE_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_RAS_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_PC_DCALL_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_PC_VSD_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_STATUS_MASKED, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_COM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_DCOM_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_VBIF_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_VSC_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_TESS_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_UCHE_BUSY, UINT64, AVERAGE), - COUNTABLE(PERF_RBBM_HLSQ_BUSY, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter sp_counters[] = { -//RESERVED: for kernel -// COUNTER(SP_PERFCTR_SP_SEL_0, RBBM_PERFCTR_SP_0_LO, RBBM_PERFCTR_SP_0_HI), - COUNTER(SP_PERFCTR_SP_SEL_1, RBBM_PERFCTR_SP_1_LO, RBBM_PERFCTR_SP_1_HI), - COUNTER(SP_PERFCTR_SP_SEL_2, RBBM_PERFCTR_SP_2_LO, RBBM_PERFCTR_SP_2_HI), - COUNTER(SP_PERFCTR_SP_SEL_3, RBBM_PERFCTR_SP_3_LO, RBBM_PERFCTR_SP_3_HI), - COUNTER(SP_PERFCTR_SP_SEL_4, RBBM_PERFCTR_SP_4_LO, RBBM_PERFCTR_SP_4_HI), - COUNTER(SP_PERFCTR_SP_SEL_5, RBBM_PERFCTR_SP_5_LO, RBBM_PERFCTR_SP_5_HI), - COUNTER(SP_PERFCTR_SP_SEL_6, RBBM_PERFCTR_SP_6_LO, RBBM_PERFCTR_SP_6_HI), - COUNTER(SP_PERFCTR_SP_SEL_7, RBBM_PERFCTR_SP_7_LO, RBBM_PERFCTR_SP_7_HI), - COUNTER(SP_PERFCTR_SP_SEL_8, RBBM_PERFCTR_SP_8_LO, RBBM_PERFCTR_SP_8_HI), - COUNTER(SP_PERFCTR_SP_SEL_9, RBBM_PERFCTR_SP_9_LO, RBBM_PERFCTR_SP_9_HI), - COUNTER(SP_PERFCTR_SP_SEL_10, RBBM_PERFCTR_SP_10_LO, RBBM_PERFCTR_SP_10_HI), - COUNTER(SP_PERFCTR_SP_SEL_11, RBBM_PERFCTR_SP_11_LO, RBBM_PERFCTR_SP_11_HI), - COUNTER(SP_PERFCTR_SP_SEL_12, RBBM_PERFCTR_SP_12_LO, RBBM_PERFCTR_SP_12_HI), - COUNTER(SP_PERFCTR_SP_SEL_13, RBBM_PERFCTR_SP_13_LO, RBBM_PERFCTR_SP_13_HI), - COUNTER(SP_PERFCTR_SP_SEL_14, RBBM_PERFCTR_SP_14_LO, RBBM_PERFCTR_SP_14_HI), - COUNTER(SP_PERFCTR_SP_SEL_15, RBBM_PERFCTR_SP_15_LO, RBBM_PERFCTR_SP_15_HI), - COUNTER(SP_PERFCTR_SP_SEL_16, RBBM_PERFCTR_SP_16_LO, RBBM_PERFCTR_SP_16_HI), - COUNTER(SP_PERFCTR_SP_SEL_17, RBBM_PERFCTR_SP_17_LO, RBBM_PERFCTR_SP_17_HI), - COUNTER(SP_PERFCTR_SP_SEL_18, RBBM_PERFCTR_SP_18_LO, RBBM_PERFCTR_SP_18_HI), - COUNTER(SP_PERFCTR_SP_SEL_19, RBBM_PERFCTR_SP_19_LO, RBBM_PERFCTR_SP_19_HI), - COUNTER(SP_PERFCTR_SP_SEL_20, RBBM_PERFCTR_SP_20_LO, RBBM_PERFCTR_SP_20_HI), - COUNTER(SP_PERFCTR_SP_SEL_21, RBBM_PERFCTR_SP_21_LO, RBBM_PERFCTR_SP_21_HI), - COUNTER(SP_PERFCTR_SP_SEL_22, RBBM_PERFCTR_SP_22_LO, RBBM_PERFCTR_SP_22_HI), - COUNTER(SP_PERFCTR_SP_SEL_23, RBBM_PERFCTR_SP_23_LO, RBBM_PERFCTR_SP_23_HI), -}; - -static const struct fd_perfcntr_countable sp_countables[] = { - COUNTABLE(PERF_SP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ALU_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EFU_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_TP, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STALL_CYCLES_RB, UINT64, AVERAGE), - COUNTABLE(PERF_SP_NON_EXECUTION_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CONTEXTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CONTEXT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_WAVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_WAVE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_DURATION_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_DURATION_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_CTRL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_LOAD_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_EMIT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_NOP_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_WAIT_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_FETCH_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_IDLE_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_END_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_LONG_SYNC_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_SHORT_SYNC_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WAVE_JOIN_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_LOAD_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_STORE_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_ATOMICS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_LOAD_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_STORE_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_ATOMICS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_BARY_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ADDR_LOCK_COUNT, UINT64, AVERAGE), - COUNTABLE(PERF_SP_UCHE_READ_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_UCHE_WRITE_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EXPORT_VPC_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EXPORT_RB_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_PIXELS_KILLED, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL1_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ICL1_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_HS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_DS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_CS_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_READ, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FS_STAGE_HALF_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_VS_STAGE_HALF_EFU_INSTRUCTIONS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_BANK_CONFLICTS, UINT64, AVERAGE), - COUNTABLE(PERF_SP_TEX_CONTROL_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LOAD_CONTROL_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_FLOW_CONTROL_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LM_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_DISPATCHER_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_SEQUENCER_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_LOW_EFFICIENCY_STARVED_BY_TP, UINT64, AVERAGE), - COUNTABLE(PERF_SP_STARVE_CYCLES_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_SP_NON_EXECUTION_LS_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WORKING_EU, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ANY_EU_WORKING, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WORKING_EU_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ANY_EU_WORKING_FS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WORKING_EU_VS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ANY_EU_WORKING_VS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_WORKING_EU_CS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_ANY_EU_WORKING_CS_STAGE, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_READ_PREFETCH, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_READ_CONFLICT, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GPR_WRITE_CONFLICT, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_LOAD_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_GM_LOAD_LATENCY_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_SP_EXECUTABLE_WAVES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter tp_counters[] = { - COUNTER(TPL1_PERFCTR_TP_SEL_0, RBBM_PERFCTR_TP_0_LO, RBBM_PERFCTR_TP_0_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_1, RBBM_PERFCTR_TP_1_LO, RBBM_PERFCTR_TP_1_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_2, RBBM_PERFCTR_TP_2_LO, RBBM_PERFCTR_TP_2_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_3, RBBM_PERFCTR_TP_3_LO, RBBM_PERFCTR_TP_3_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_4, RBBM_PERFCTR_TP_4_LO, RBBM_PERFCTR_TP_4_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_5, RBBM_PERFCTR_TP_5_LO, RBBM_PERFCTR_TP_5_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_6, RBBM_PERFCTR_TP_6_LO, RBBM_PERFCTR_TP_6_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_7, RBBM_PERFCTR_TP_7_LO, RBBM_PERFCTR_TP_7_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_8, RBBM_PERFCTR_TP_8_LO, RBBM_PERFCTR_TP_8_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_9, RBBM_PERFCTR_TP_9_LO, RBBM_PERFCTR_TP_9_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_10, RBBM_PERFCTR_TP_10_LO, RBBM_PERFCTR_TP_10_HI), - COUNTER(TPL1_PERFCTR_TP_SEL_11, RBBM_PERFCTR_TP_11_LO, RBBM_PERFCTR_TP_11_HI), -}; - -static const struct fd_perfcntr_countable tp_countables[] = { - COUNTABLE(PERF_TP_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_LATENCY_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_LATENCY, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_CACHELINE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_CACHELINE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_SP_TP_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_TP_SP_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_RECEIVED, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_OFFSET, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_SHADOW, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_ARRAY, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_GRADIENT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_1D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_2D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_BUFFER, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_3D, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_CUBE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_DIVERGENT_QUADS_RECEIVED, UINT64, AVERAGE), - COUNTABLE(PERF_TP_PRT_NON_RESIDENT_EVENTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_POINT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_MIP, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_ANISO, UINT64, AVERAGE), - COUNTABLE(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_MISSES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_L2_REQUESTS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_POINT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_TPA2TPC_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_MISSES_ASTC_1TILE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_MISSES_ASTC_2TILE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_MISSES_ASTC_4TILE, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_L2_COMPRESS_REQS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_L2_COMPRESS_MISS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_BANK_CONFLICT, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_MISS_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_MISS_LATENCY_TRANS, UINT64, AVERAGE), - COUNTABLE(PERF_TP_QUADS_CONSTANT_MULTIPLIED, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FRONTEND_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_TAG_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_DATA_WRITE_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_PRE_L1_DECOM_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_BACKEND_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_FLAG_CACHE_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_L1_5_CACHE_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_TP_STARVE_CYCLES_UCHE, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter uche_counters[] = { - COUNTER(UCHE_PERFCTR_UCHE_SEL_0, RBBM_PERFCTR_UCHE_0_LO, RBBM_PERFCTR_UCHE_0_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_1, RBBM_PERFCTR_UCHE_1_LO, RBBM_PERFCTR_UCHE_1_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_2, RBBM_PERFCTR_UCHE_2_LO, RBBM_PERFCTR_UCHE_2_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_3, RBBM_PERFCTR_UCHE_3_LO, RBBM_PERFCTR_UCHE_3_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_4, RBBM_PERFCTR_UCHE_4_LO, RBBM_PERFCTR_UCHE_4_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_5, RBBM_PERFCTR_UCHE_5_LO, RBBM_PERFCTR_UCHE_5_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_6, RBBM_PERFCTR_UCHE_6_LO, RBBM_PERFCTR_UCHE_6_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_7, RBBM_PERFCTR_UCHE_7_LO, RBBM_PERFCTR_UCHE_7_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_8, RBBM_PERFCTR_UCHE_8_LO, RBBM_PERFCTR_UCHE_8_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_9, RBBM_PERFCTR_UCHE_9_LO, RBBM_PERFCTR_UCHE_9_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_10, RBBM_PERFCTR_UCHE_10_LO, RBBM_PERFCTR_UCHE_10_HI), - COUNTER(UCHE_PERFCTR_UCHE_SEL_11, RBBM_PERFCTR_UCHE_11_LO, RBBM_PERFCTR_UCHE_11_HI), -}; - -static const struct fd_perfcntr_countable uche_countables[] = { - COUNTABLE(PERF_UCHE_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_STALL_CYCLES_ARBITER, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_LATENCY_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_TP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_TP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_HLSQ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_SP, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VPC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VSC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_EVICTS, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ0, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ1, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ2, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ3, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ4, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ5, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ6, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_BANK_REQ7, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH0, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH1, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_GMEM_READ_BEATS, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_TPH_REF_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_TPH_VICTIM_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_TPH_EXT_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_STALL_WRITE_DATA, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_DCMP_LATENCY_SAMPLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_DCMP_LATENCY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_PC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_READ_REQUESTS_PC, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_RAM_READ_REQ, UINT64, AVERAGE), - COUNTABLE(PERF_UCHE_RAM_WRITE_REQ, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vfd_counters[] = { - COUNTER(VFD_PERFCTR_VFD_SEL_0, RBBM_PERFCTR_VFD_0_LO, RBBM_PERFCTR_VFD_0_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_1, RBBM_PERFCTR_VFD_1_LO, RBBM_PERFCTR_VFD_1_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_2, RBBM_PERFCTR_VFD_2_LO, RBBM_PERFCTR_VFD_2_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_3, RBBM_PERFCTR_VFD_3_LO, RBBM_PERFCTR_VFD_3_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_4, RBBM_PERFCTR_VFD_4_LO, RBBM_PERFCTR_VFD_4_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_5, RBBM_PERFCTR_VFD_5_LO, RBBM_PERFCTR_VFD_5_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_6, RBBM_PERFCTR_VFD_6_LO, RBBM_PERFCTR_VFD_6_HI), - COUNTER(VFD_PERFCTR_VFD_SEL_7, RBBM_PERFCTR_VFD_7_LO, RBBM_PERFCTR_VFD_7_HI), -}; - -static const struct fd_perfcntr_countable vfd_countables[] = { - COUNTABLE(PERF_VFD_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_VPC_ALLOC, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_SP_INFO, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STALL_CYCLES_SP_ATTR, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_STARVE_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_RBUFFER_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_ATTR_INFO_FIFO_FULL, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_DECODED_ATTRIBUTE_BYTES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_NUM_ATTRIBUTES, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_UPPER_SHADER_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_LOWER_SHADER_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_0_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_1_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_2_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_3_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_MODE_4_FIBERS, UINT64, AVERAGE), - COUNTABLE(PERF_VFD_TOTAL_VERTICES, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_INDEX, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_PROG, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_STARVE_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_VFDP_VS_STAGE_WAVES, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vpc_counters[] = { - COUNTER(VPC_PERFCTR_VPC_SEL_0, RBBM_PERFCTR_VPC_0_LO, RBBM_PERFCTR_VPC_0_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_1, RBBM_PERFCTR_VPC_1_LO, RBBM_PERFCTR_VPC_1_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_2, RBBM_PERFCTR_VPC_2_LO, RBBM_PERFCTR_VPC_2_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_3, RBBM_PERFCTR_VPC_3_LO, RBBM_PERFCTR_VPC_3_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_4, RBBM_PERFCTR_VPC_4_LO, RBBM_PERFCTR_VPC_4_HI), - COUNTER(VPC_PERFCTR_VPC_SEL_5, RBBM_PERFCTR_VPC_5_LO, RBBM_PERFCTR_VPC_5_HI), -}; - -static const struct fd_perfcntr_countable vpc_countables[] = { - COUNTABLE(PERF_VPC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_VFD_WACK, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_HLSQ_PRIM_ALLOC, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_PC, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_SP_LM, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STARVE_CYCLES_SP, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STARVE_CYCLES_LRZ, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_PC_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_SP_COMPONENTS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STALL_CYCLES_VPCRAM_POS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_LRZ_ASSIGN_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_RB_VISIBLE_PRIMITIVES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_LM_TRANSACTION, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STREAMOUT_TRANSACTION, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_VS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_PS_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_VS_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_PS_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_STARVE_CYCLES_RB, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_NUM_VPCRAM_READ_POS, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_WIT_FULL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_VPCRAM_FULL_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_LM_FULL_WAIT_FOR_INTP_END, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_NUM_VPCRAM_WRITE, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_NUM_VPCRAM_READ_SO, UINT64, AVERAGE), - COUNTABLE(PERF_VPC_NUM_ATTR_REQ_LM, UINT64, AVERAGE), -}; - -static const struct fd_perfcntr_counter vsc_counters[] = { - COUNTER(VSC_PERFCTR_VSC_SEL_0, RBBM_PERFCTR_VSC_0_LO, RBBM_PERFCTR_VSC_0_HI), - COUNTER(VSC_PERFCTR_VSC_SEL_1, RBBM_PERFCTR_VSC_1_LO, RBBM_PERFCTR_VSC_1_HI), -}; - -static const struct fd_perfcntr_countable vsc_countables[] = { - COUNTABLE(PERF_VSC_BUSY_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_WORKING_CYCLES, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_STALL_CYCLES_UCHE, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_EOT_NUM, UINT64, AVERAGE), - COUNTABLE(PERF_VSC_INPUT_TILES, UINT64, AVERAGE), -}; - -const struct fd_perfcntr_group a6xx_perfcntr_groups[] = { - GROUP("CP", cp_counters, cp_countables), - GROUP("CCU", ccu_counters, ccu_countables), - GROUP("TSE", tse_counters, tse_countables), - GROUP("RAS", ras_counters, ras_countables), - GROUP("LRZ", lrz_counters, lrz_countables), - GROUP("HLSQ", hlsq_counters, hlsq_countables), - GROUP("PC", pc_counters, pc_countables), - GROUP("RB", rb_counters, rb_countables), - GROUP("RBBM", rbbm_counters, rbbm_countables), - GROUP("SP", sp_counters, sp_countables), - GROUP("TP", tp_counters, tp_countables), - GROUP("UCHE", uche_counters, uche_countables), - GROUP("VFD", vfd_counters, vfd_countables), - GROUP("VPC", vpc_counters, vpc_countables), - GROUP("VSC", vsc_counters, vsc_countables), -// GROUP("VBIF", vbif_counters, vbif_countables), -}; - -const unsigned a6xx_num_perfcntr_groups = ARRAY_SIZE(a6xx_perfcntr_groups); - -#endif /* FD5_PERFCNTR_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_program.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/bitset.h" #include "freedreno_program.h" @@ -39,51 +39,49 @@ #include "fd6_texture.h" #include "fd6_format.h" -static struct ir3_shader * -create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, - gl_shader_stage type) -{ - struct fd_context *ctx = fd_context(pctx); - struct ir3_compiler *compiler = ctx->screen->compiler; - return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen); -} - -static void * -fd6_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) +void +fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { - return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT); -} + enum a6xx_state_block sb = fd6_stage2shadersb(so->type); -static void -fd6_fp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - struct fd_context *ctx = fd_context(pctx); - ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso); - ir3_shader_destroy(so); -} + uint32_t obj_start; + uint32_t instrlen; -static void * -fd6_vp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX); -} + switch (so->type) { + case MESA_SHADER_VERTEX: + obj_start = REG_A6XX_SP_VS_OBJ_START_LO; + instrlen = REG_A6XX_SP_VS_INSTRLEN; + break; + case MESA_SHADER_TESS_CTRL: + obj_start = REG_A6XX_SP_HS_OBJ_START_LO; + instrlen = REG_A6XX_SP_HS_INSTRLEN; + break; + case MESA_SHADER_TESS_EVAL: + obj_start = REG_A6XX_SP_DS_OBJ_START_LO; + instrlen = REG_A6XX_SP_DS_INSTRLEN; + break; + case MESA_SHADER_GEOMETRY: + obj_start = REG_A6XX_SP_GS_OBJ_START_LO; + instrlen = REG_A6XX_SP_GS_INSTRLEN; + break; + case MESA_SHADER_FRAGMENT: + obj_start = REG_A6XX_SP_FS_OBJ_START_LO; + instrlen = REG_A6XX_SP_FS_INSTRLEN; + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + obj_start = REG_A6XX_SP_CS_OBJ_START_LO; + instrlen = REG_A6XX_SP_CS_INSTRLEN; + break; + case MESA_SHADER_NONE: + unreachable(""); + } -static void -fd6_vp_state_delete(struct pipe_context *pctx, void *hwcso) -{ - struct ir3_shader *so = hwcso; - struct fd_context *ctx = fd_context(pctx); - ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso); - ir3_shader_destroy(so); -} + OUT_PKT4(ring, instrlen, 1); + OUT_RING(ring, so->instrlen); -void -fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) -{ - enum a6xx_state_block sb = fd6_stage2shadersb(so->type); + OUT_PKT4(ring, obj_start, 2); + OUT_RELOC(ring, so->bo, 0, 0, 0); OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | @@ -200,47 +198,64 @@ OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1); OUT_RING(ring, 0xff); /* XXX */ - debug_assert(state->vs->constlen >= state->bs->constlen); + if (state->ds) + debug_assert(state->ds->constlen >= state->bs->constlen); + else + debug_assert(state->vs->constlen >= state->bs->constlen); OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(state->vs->constlen, 4)) | A6XX_HLSQ_VS_CNTL_ENABLED); - OUT_RING(ring, A6XX_HLSQ_HS_CNTL_CONSTLEN(0)); - OUT_RING(ring, A6XX_HLSQ_DS_CNTL_CONSTLEN(0)); - OUT_RING(ring, A6XX_HLSQ_GS_CNTL_CONSTLEN(0)); - + OUT_RING(ring, COND(state->hs, + A6XX_HLSQ_HS_CNTL_ENABLED | + A6XX_HLSQ_HS_CNTL_CONSTLEN(align(state->hs->constlen, 4)))); + OUT_RING(ring, COND(state->ds, + A6XX_HLSQ_DS_CNTL_ENABLED | + A6XX_HLSQ_DS_CNTL_CONSTLEN(align(state->ds->constlen, 4)))); + OUT_RING(ring, COND(state->gs, + A6XX_HLSQ_GS_CNTL_ENABLED | + A6XX_HLSQ_GS_CNTL_CONSTLEN(align(state->gs->constlen, 4)))); OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(state->fs->constlen, 4)) | A6XX_HLSQ_FS_CNTL_ENABLED); OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) | - A6XX_SP_VS_CONFIG_NIBO(state->vs->image_mapping.num_ibo) | + A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); - OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); - OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | - A6XX_SP_FS_CONFIG_NIBO(state->fs->image_mapping.num_ibo) | - A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | - A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); - OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); - OUT_RING(ring, COND(false, A6XX_SP_HS_CONFIG_ENABLED)); + OUT_RING(ring, COND(state->hs, + A6XX_SP_HS_CONFIG_ENABLED | + A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | + A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | + A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); - OUT_RING(ring, COND(false, A6XX_SP_DS_CONFIG_ENABLED)); + OUT_RING(ring, COND(state->ds, + A6XX_SP_DS_CONFIG_ENABLED | + A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | + A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | + A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); - OUT_RING(ring, COND(false, A6XX_SP_GS_CONFIG_ENABLED)); + OUT_RING(ring, COND(state->gs, + A6XX_SP_GS_CONFIG_ENABLED | + A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | + A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | + A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); + + OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); + OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | + A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | + A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | + A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); - OUT_RING(ring, state->fs->image_mapping.num_ibo); + OUT_RING(ring, ir3_shader_nibo(state->fs)); } -#define VALIDREG(r) ((r) != regid(63,0)) -#define CONDREG(r, val) COND(VALIDREG(r), (val)) - static inline uint32_t next_regid(uint32_t reg, uint32_t increment) { @@ -258,16 +273,28 @@ uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; uint32_t smask_in_regid, smask_regid; - uint32_t vertex_regid, instance_regid; + uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid; + uint32_t hs_invocation_regid; + uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_patch_regid, ds_patch_regid; uint32_t ij_pix_regid, ij_samp_regid, ij_cent_regid, ij_size_regid; + uint32_t gs_header_regid; enum a3xx_threadsize fssz; - uint8_t psize_loc = ~0; + uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0; int i, j; static const struct ir3_shader_variant dummy_fs = {0}; const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs; + const struct ir3_shader_variant *hs = state->hs; + const struct ir3_shader_variant *ds = state->ds; + const struct ir3_shader_variant *gs = state->gs; const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs; + /* binning VS is wrong when GS is present, so use nonbinning VS + * TODO: compile both binning VS/GS variants correctly + */ + if (binning_pass && state->gs) + vs = state->vs; + bool sample_shading = fs->per_samp | key->sample_shading; fssz = FOUR_QUADS; @@ -277,6 +304,35 @@ vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); + if (hs) { + tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); + tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); + hs_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); + ds_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); + hs_invocation_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); + + pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); + } else { + tess_coord_x_regid = regid(63, 0); + tess_coord_y_regid = regid(63, 0); + hs_patch_regid = regid(63, 0); + ds_patch_regid = regid(63, 0); + hs_invocation_regid = regid(63, 0); + } + + if (gs) { + gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); + primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); + pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); + layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER); + } else { + gs_header_regid = regid(63, 0); + primitive_regid = regid(63, 0); + layer_regid = regid(63, 0); + } + if (fs->color0_mrt) { color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = @@ -297,13 +353,22 @@ face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); zwcoord_regid = next_regid(coord_regid, 2); - ij_pix_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PIXEL); - ij_samp_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_SAMPLE); - ij_cent_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_CENTROID); - ij_size_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_SIZE); + ij_pix_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); + ij_samp_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE); + ij_cent_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID); + ij_size_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE); posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); + /* If we have pre-dispatch texture fetches, then ij_pix should not + * be DCE'd, even if not actually used in the shader itself: + */ + if (fs->num_sampler_prefetch > 0) { + assert(VALIDREG(ij_pix_regid)); + /* also, it seems like ij_pix is *required* to be r0.x */ + assert(ij_pix_regid == regid(0, 0)); + } + /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we * end up masking the single sample!! */ @@ -314,56 +379,58 @@ * emitted if frag-prog is dirty vs if vert-prog is dirty.. */ - OUT_PKT4(ring, REG_A6XX_SP_VS_INSTRLEN, 1); - OUT_RING(ring, vs->instrlen); /* SP_VS_INSTRLEN */ - - OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_SP_HS_INSTRLEN, 1); - OUT_RING(ring, 0); /* SP_HS_INSTRLEN */ - - OUT_PKT4(ring, REG_A6XX_SP_DS_INSTRLEN, 1); - OUT_RING(ring, 0); /* SP_DS_INSTRLEN */ - - OUT_PKT4(ring, REG_A6XX_SP_GS_UNKNOWN_A871, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_SP_GS_INSTRLEN, 1); - OUT_RING(ring, 0); /* SP_GS_INSTRLEN */ - - /* I believe this is related to pre-dispatch texture fetch.. we probably - * should't turn it on by accident: - */ - OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A99E, 1); + OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A833, 1); OUT_RING(ring, 0x0); + OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); + OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | + A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | + 0x7000); // XXX + for (int i = 0; i < fs->num_sampler_prefetch; i++) { + const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; + OUT_RING(ring, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | + A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | + A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | + A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | + A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | + COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | + A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); + } + OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); OUT_RING(ring, 0); OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_AB00, 1); OUT_RING(ring, 0x5); - OUT_PKT4(ring, REG_A6XX_SP_FS_INSTRLEN, 1); - OUT_RING(ring, fs->instrlen); /* SP_FS_INSTRLEN */ - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 0xfc000000); + enum a3xx_threadsize vssz; + uint32_t vsregs; + if (ds || hs) { + vssz = TWO_QUADS; + vsregs = 0; + } else { + vssz = FOUR_QUADS; + vsregs = A6XX_SP_VS_CTRL_REG0_MERGEDREGS; + } + OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); - OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_THREADSIZE(fssz) | + OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_THREADSIZE(vssz) | A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | - A6XX_SP_VS_CTRL_REG0_MERGEDREGS | + vsregs | A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack) | COND(vs->need_pixlod, A6XX_SP_VS_CTRL_REG0_PIXLODENABLE)); - struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, vs, fs); + fd6_emit_shader(ring, vs); + ir3_emit_immediates(screen, vs, ring); - if ((vs->shader->stream_output.num_outputs > 0) && !binning_pass) - link_stream_out(&l, vs); + struct ir3_shader_linkage l = {0}; + const struct ir3_shader_variant *last_shader = fd6_last_shader(state); + ir3_link_shaders(&l, last_shader, fs); BITSET_DECLARE(varbs, 128) = {0}; uint32_t *varmask = (uint32_t *)varbs; @@ -378,23 +445,39 @@ OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */ OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */ - /* a6xx appends pos/psize to end of the linkage map: */ - if (VALIDREG(pos_regid)) + /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ + if (last_shader->shader->stream_output.num_outputs > 0) + link_stream_out(&l, last_shader); + + if (VALIDREG(layer_regid)) { + layer_loc = l.max_loc; + ir3_link_add(&l, layer_regid, 0x1, l.max_loc); + } + + if (VALIDREG(pos_regid)) { + pos_loc = l.max_loc; ir3_link_add(&l, pos_regid, 0xf, l.max_loc); + } if (VALIDREG(psize_regid)) { psize_loc = l.max_loc; ir3_link_add(&l, psize_regid, 0x1, l.max_loc); } - if ((vs->shader->stream_output.num_outputs > 0) && !binning_pass) { - setup_stream_out(state, vs, &l); + if (last_shader->shader->stream_output.num_outputs > 0) { + setup_stream_out(state, last_shader, &l); } - for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { - uint32_t reg = 0; + debug_assert(l.cnt < 32); + if (gs) + OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + else if (ds) + OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + else + OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(i), 1); + for (j = 0; j < l.cnt; ) { + uint32_t reg = 0; reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); @@ -407,10 +490,15 @@ OUT_RING(ring, reg); } - for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { - uint32_t reg = 0; + if (gs) + OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + else if (ds) + OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + else + OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(i), 1); + for (j = 0; j < l.cnt; ) { + uint32_t reg = 0; reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); @@ -420,12 +508,87 @@ OUT_RING(ring, reg); } - OUT_PKT4(ring, REG_A6XX_SP_VS_OBJ_START_LO, 2); - OUT_RELOC(ring, vs->bo, 0, 0, 0); /* SP_VS_OBJ_START_LO/HI */ - - if (vs->instrlen) - fd6_emit_shader(ring, vs); + if (hs) { + OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); + OUT_RING(ring, A6XX_SP_HS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) | + A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(hs->branchstack) | + COND(hs->need_pixlod, A6XX_SP_HS_CTRL_REG0_PIXLODENABLE)); + + fd6_emit_shader(ring, hs); + ir3_emit_immediates(screen, hs, ring); + ir3_emit_link_map(screen, vs, hs, ring); + + OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1); + OUT_RING(ring, A6XX_SP_DS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) | + A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ds->branchstack) | + COND(ds->need_pixlod, A6XX_SP_DS_CTRL_REG0_PIXLODENABLE)); + + fd6_emit_shader(ring, ds); + ir3_emit_immediates(screen, ds, ring); + ir3_emit_link_map(screen, hs, ds, ring); + + shader_info *hs_info = &hs->shader->nir->info; + OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); + OUT_RING(ring, hs_info->tess.tcs_vertices_out); + + /* Total attribute slots in HS incoming patch. */ + OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9801, 1); + OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->shader->output_size / 4); + + OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + OUT_RING(ring, vs->shader->output_size); + + shader_info *ds_info = &ds->shader->nir->info; + OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); + uint32_t output; + if (ds_info->tess.point_mode) + output = TESS_POINTS; + else if (ds_info->tess.primitive_mode == GL_ISOLINES) + output = TESS_LINES; + else if (ds_info->tess.ccw) + output = TESS_CCW_TRIS; + else + output = TESS_CW_TRIS; + + OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(fd6_gl2spacing(ds_info->tess.spacing)) | + A6XX_PC_TESS_CNTL_OUTPUT(output)); + + /* xxx: Misc tess unknowns: */ + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9103, 1); + OUT_RING(ring, 0x00ffff00); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9106, 1); + OUT_RING(ring, 0x0000ffff); + + OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_809D, 1); + OUT_RING(ring, 0x0); + + OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8002, 1); + OUT_RING(ring, 0x0); + + OUT_PKT4(ring, REG_A6XX_VPC_PACK, 1); + OUT_RING(ring, A6XX_VPC_PACK_POSITIONLOC(pos_loc) | + A6XX_VPC_PACK_PSIZELOC(255) | + A6XX_VPC_PACK_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_VPC_PACK_3, 1); + OUT_RING(ring, A6XX_VPC_PACK_3_POSITIONLOC(pos_loc) | + A6XX_VPC_PACK_3_PSIZELOC(psize_loc) | + A6XX_VPC_PACK_3_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); + OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_DSOUT(l.cnt)); + + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_4, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_4_STRIDE_IN_VPC(l.max_loc) | + CONDREG(psize_regid, 0x100)); + } else { + OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + OUT_RING(ring, 0); + } OUT_PKT4(ring, REG_A6XX_SP_PRIMITIVE_CNTL, 1); OUT_RING(ring, A6XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt)); @@ -439,16 +602,10 @@ OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1); OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, 0x100)); + CONDREG(psize_regid, A6XX_PC_PRIMITIVE_CNTL_1_PSIZE)); - if (binning_pass) { - OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2); - OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_LO */ - OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_HI */ - } else { - OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2); - OUT_RELOC(ring, fs->bo, 0, 0, 0); /* SP_FS_OBJ_START_LO/HI */ - } + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1); + OUT_RING(ring, 0); OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); OUT_RING(ring, 0x7); /* XXX */ @@ -537,46 +694,120 @@ } OUT_PKT4(ring, REG_A6XX_VPC_PACK, 1); - OUT_RING(ring, A6XX_VPC_PACK_NUMNONPOSVAR(fs->total_in) | + OUT_RING(ring, A6XX_VPC_PACK_POSITIONLOC(pos_loc) | A6XX_VPC_PACK_PSIZELOC(psize_loc) | A6XX_VPC_PACK_STRIDE_IN_VPC(l.max_loc)); - if (!binning_pass) { - /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ - for (j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) { - /* NOTE: varyings are packed, so if compmask is 0xb - * then first, third, and fourth component occupy - * three consecutive varying slots: - */ - unsigned compmask = fs->inputs[j].compmask; - - uint32_t inloc = fs->inputs[j].inloc; + if (gs) { + OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); + OUT_RING(ring, A6XX_SP_GS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | + A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack) | + COND(gs->need_pixlod, A6XX_SP_GS_CTRL_REG0_PIXLODENABLE)); + + fd6_emit_shader(ring, gs); + ir3_emit_immediates(screen, gs, ring); + if (ds) + ir3_emit_link_map(screen, ds, gs, ring); + else + ir3_emit_link_map(screen, vs, gs, ring); + + OUT_PKT4(ring, REG_A6XX_VPC_PACK_GS, 1); + OUT_RING(ring, A6XX_VPC_PACK_GS_POSITIONLOC(pos_loc) | + A6XX_VPC_PACK_GS_PSIZELOC(psize_loc) | + A6XX_VPC_PACK_GS_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9105, 1); + OUT_RING(ring, A6XX_VPC_UNKNOWN_9105_LAYERLOC(layer_loc) | 0xff00); + + OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_809C, 1); + OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_UNKNOWN_809C_GS_WRITES_LAYER)); + + uint32_t flags_regid = ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); + + OUT_PKT4(ring, REG_A6XX_SP_PRIMITIVE_CNTL_GS, 1); + OUT_RING(ring, A6XX_SP_PRIMITIVE_CNTL_GS_GSOUT(l.cnt) | + A6XX_SP_PRIMITIVE_CNTL_GS_FLAGS_REGID(flags_regid)); + + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_2, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_2_STRIDE_IN_VPC(l.max_loc) | + CONDREG(psize_regid, A6XX_PC_PRIMITIVE_CNTL_2_PSIZE) | + CONDREG(layer_regid, A6XX_PC_PRIMITIVE_CNTL_2_LAYER) | + CONDREG(primitive_regid, A6XX_PC_PRIMITIVE_CNTL_2_PRIMITIVE_ID)); + + uint32_t output; + switch (gs->shader->nir->info.gs.output_primitive) { + case GL_POINTS: + output = TESS_POINTS; + break; + case GL_LINE_STRIP: + output = TESS_LINES; + break; + case GL_TRIANGLE_STRIP: + output = TESS_CW_TRIS; + break; + default: + unreachable(""); + } + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); + OUT_RING(ring, + A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(gs->shader->nir->info.gs.vertices_out - 1) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(gs->shader->nir->info.gs.invocations - 1)); + + OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8003, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1); + OUT_RING(ring, 0xff); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9102, 1); + OUT_RING(ring, 0xffff00); + + const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; + + /* Size of per-primitive alloction in ldlw memory in vec4s. */ + uint32_t vec4_size = + gs->shader->nir->info.gs.vertices_in * + DIV_ROUND_UP(prev->shader->output_size, 4); + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); - if (fs->inputs[j].interpolate == INTERP_MODE_FLAT) { - uint32_t loc = inloc; + OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9B07, 1); + OUT_RING(ring, 0); - for (i = 0; i < 4; i++) { - if (compmask & (1 << i)) { - state->vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - loc++; - } - } - } - } + OUT_PKT4(ring, REG_A6XX_SP_GS_UNKNOWN_A871, 1); + OUT_RING(ring, prev->shader->output_size); + } else { + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_GS_UNKNOWN_A871, 1); + OUT_RING(ring, 0); } - if (!binning_pass) - if (fs->instrlen) - fd6_emit_shader(ring, fs); + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9101, 1); + OUT_RING(ring, 0xffff00); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); + OUT_RING(ring, 0); + + if (fs->instrlen) + fd6_emit_shader(ring, fs); OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | - 0xfcfc0000); - OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_2 */ - OUT_RING(ring, 0xfcfcfcfc); /* VFD_CONTROL_3 */ + A6XX_VFD_CONTROL_1_REGID4PRIMID(primitive_regid) | + 0xfc000000); + OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSPATCHID(hs_patch_regid) | + A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); + OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSPATCHID(ds_patch_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | + 0xfc); OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ - OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_5 */ + OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | + 0xfc00); /* VFD_CONTROL_5 */ OUT_RING(ring, 0x00000000); /* VFD_CONTROL_6 */ bool fragz = fs->no_earlyz | fs->writes_pos; @@ -587,29 +818,66 @@ OUT_PKT4(ring, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); OUT_RING(ring, COND(fragz, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z)); - ir3_emit_immediates(screen, vs, ring); if (!binning_pass) ir3_emit_immediates(screen, fs, ring); } -/* emits the program state which is not part of the stateobj because of - * dependency on other gl state (rasterflat or sprite-coord-replacement) +static struct fd_ringbuffer * +create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state) +{ + const struct ir3_shader_variant *fs = state->fs; + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); + uint32_t vinterp[8] = {0}; + + /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ + for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) { + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = fs->inputs[j].compmask; + + uint32_t inloc = fs->inputs[j].inloc; + + if (fs->inputs[j].interpolate == INTERP_MODE_FLAT) { + uint32_t loc = inloc; + + for (int i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= 1 << ((loc % 16) * 2); + loc++; + } + } + } + } + + OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); + for (int i = 0; i < 8; i++) + OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ + + OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); + for (int i = 0; i < 8; i++) + OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[i] */ + + return ring; +} + +/* build the program streaming state which is not part of the pre- + * baked stateobj because of dependency on other gl state (rasterflat + * or sprite-coord-replacement) */ -void -fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit) +struct fd_ringbuffer * +fd6_program_interp_state(struct fd6_emit *emit) { const struct fd6_program_state *state = fd6_emit_get_prog(emit); if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) { /* fastpath: */ - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); - for (int i = 0; i < 8; i++) - OUT_RING(ring, state->vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ - - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); - for (int i = 0; i < 8; i++) - OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[i] */ + return fd_ringbuffer_ref(state->interp_stateobj); } else { + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING); + /* slow-path: */ struct ir3_shader_variant *fs = state->fs; uint32_t vinterp[8], vpsrepl[8]; @@ -684,36 +952,51 @@ OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); for (int i = 0; i < 8; i++) OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + + return ring; } } static struct ir3_program_state * fd6_program_create(void *data, struct ir3_shader_variant *bs, struct ir3_shader_variant *vs, + struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, + struct ir3_shader_variant *gs, struct ir3_shader_variant *fs, const struct ir3_shader_key *key) { struct fd_context *ctx = data; struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state); - state->bs = bs; + /* if we have streamout, use full VS in binning pass, as the + * binning pass VS will have outputs on other than position/psize + * stripped out: + */ + state->bs = vs->shader->stream_output.num_outputs ? vs : bs; state->vs = vs; + state->hs = hs; + state->ds = ds; + state->gs = gs; state->fs = fs; state->config_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); #ifdef DEBUG - for (unsigned i = 0; i < bs->inputs_count; i++) { - if (vs->inputs[i].sysval) - continue; - debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); + if (!ds) { + for (unsigned i = 0; i < bs->inputs_count; i++) { + if (vs->inputs[i].sysval) + continue; + debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); + } } #endif setup_config_stateobj(state->config_stateobj, state); setup_stateobj(state->binning_stateobj, ctx->screen, state, key, true); setup_stateobj(state->stateobj, ctx->screen, state, key, false); + state->interp_stateobj = create_interp_stateobj(ctx, state); return &state->base; } @@ -725,6 +1008,7 @@ fd_ringbuffer_del(so->stateobj); fd_ringbuffer_del(so->binning_stateobj); fd_ringbuffer_del(so->config_stateobj); + fd_ringbuffer_del(so->interp_stateobj); free(so); } @@ -733,6 +1017,37 @@ .destroy_state = fd6_program_destroy, }; +static void * +fd6_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) +{ + struct fd_context *ctx = fd_context(pctx); + struct ir3_compiler *compiler = ctx->screen->compiler; + struct ir3_shader *shader = + ir3_shader_create(compiler, cso, &ctx->debug, pctx->screen); + unsigned packets, size; + + /* pre-calculate size required for userconst stateobj: */ + ir3_user_consts_size(&shader->ubo_state, &packets, &size); + + /* also account for UBO addresses: */ + packets += 1; + size += 2 * align(shader->const_state.num_ubos, 2); + + unsigned sizedwords = (4 * packets) + size; + shader->ubo_state.cmdstream_size = sizedwords * 4; + + return shader; +} + +static void +fd6_shader_state_delete(struct pipe_context *pctx, void *hwcso) +{ + struct ir3_shader *so = hwcso; + struct fd_context *ctx = fd_context(pctx); + ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso); + ir3_shader_destroy(so); +} + void fd6_prog_init(struct pipe_context *pctx) { @@ -740,11 +1055,23 @@ fd6_context(ctx)->shader_cache = ir3_cache_create(&cache_funcs, ctx); - pctx->create_fs_state = fd6_fp_state_create; - pctx->delete_fs_state = fd6_fp_state_delete; + pctx->create_vs_state = fd6_shader_state_create; + pctx->delete_vs_state = fd6_shader_state_delete; + + pctx->create_tcs_state = fd6_shader_state_create; + pctx->delete_tcs_state = fd6_shader_state_delete; + + pctx->create_tes_state = fd6_shader_state_create; + pctx->delete_tes_state = fd6_shader_state_delete; + + pctx->create_gs_state = fd6_shader_state_create; + pctx->delete_gs_state = fd6_shader_state_delete; + + pctx->create_gs_state = fd6_shader_state_create; + pctx->delete_gs_state = fd6_shader_state_delete; - pctx->create_vs_state = fd6_vp_state_create; - pctx->delete_vs_state = fd6_vp_state_delete; + pctx->create_fs_state = fd6_shader_state_create; + pctx->delete_fs_state = fd6_shader_state_delete; fd_prog_init(pctx); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_program.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_program.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_program.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_program.h 2020-06-12 01:21:17.000000000 +0000 @@ -47,15 +47,17 @@ struct ir3_program_state base; struct ir3_shader_variant *bs; /* binning pass vs */ struct ir3_shader_variant *vs; + struct ir3_shader_variant *hs; + struct ir3_shader_variant *ds; + struct ir3_shader_variant *gs; struct ir3_shader_variant *fs; struct fd_ringbuffer *config_stateobj; + struct fd_ringbuffer *interp_stateobj; struct fd_ringbuffer *binning_stateobj; struct fd_ringbuffer *stateobj; /* cached state about current emitted shader program (3d): */ struct fd6_streamout_state tf; - - uint32_t vinterp[8]; }; static inline struct fd6_program_state * @@ -64,9 +66,20 @@ return (struct fd6_program_state *)state; } +static inline const struct ir3_shader_variant * +fd6_last_shader(const struct fd6_program_state *state) +{ + if (state->gs) + return state->gs; + else if (state->ds) + return state->ds; + else + return state->vs; +} + void fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so); -void fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit); +struct fd_ringbuffer * fd6_program_interp_state(struct fd6_emit *emit); void fd6_prog_init(struct pipe_context *pctx); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_query.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_query.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -252,6 +252,187 @@ .result = timestamp_accumulate_result, }; +struct PACKED fd6_primitives_sample { + struct { + uint64_t emitted, generated; + } start[4], stop[4], result; + + uint64_t prim_start[16], prim_stop[16], prim_emitted; +}; + + +#define primitives_relocw(ring, aq, field) \ + OUT_RELOCW(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0); +#define primitives_reloc(ring, aq, field) \ + OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0); + +#ifdef DEBUG_COUNTERS +static const unsigned counter_count = 10; +static const unsigned counter_base = REG_A6XX_RBBM_PRIMCTR_0_LO; + +static void +log_counters(struct fd6_primitives_sample *ps) +{ + const char *labels[] = { + "vs_vertices_in", + "vs_primitives_out", + "hs_vertices_in", + "hs_patches_out", + "ds_vertices_in", + "ds_primitives_out", + "gs_primitives_in", + "gs_primitives_out", + "ras_primitives_in", + "x", + }; + + printf(" counter\t\tstart\t\t\tstop\t\t\tdiff\n"); + for (int i = 0; i < counter_count; i++) { + printf(" RBBM_PRIMCTR_%d\t0x%016llx\t0x%016llx\t%lld\t%s\n", + i + (counter_base - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2, + ps->prim_start[i], ps->prim_stop[i], ps->prim_stop[i] - ps->prim_start[i], labels[i]); + } + + printf(" so counts\n"); + for (int i = 0; i < ARRAY_SIZE(ps->start); i++) { + printf(" CHANNEL %d emitted\t0x%016llx\t0x%016llx\t%lld\n", + i, ps->start[i].generated, ps->stop[i].generated, ps->stop[i].generated - ps->start[i].generated); + printf(" CHANNEL %d generated\t0x%016llx\t0x%016llx\t%lld\n", + i, ps->start[i].emitted, ps->stop[i].emitted, ps->stop[i].emitted - ps->start[i].emitted); + } + + printf("generated %lld, emitted %lld\n", ps->result.generated, ps->result.emitted); +} + +#else + +static const unsigned counter_count = 1; +static const unsigned counter_base = REG_A6XX_RBBM_PRIMCTR_8_LO; + +static void +log_counters(struct fd6_primitives_sample *ps) +{ +} + +#endif + +static void +primitives_generated_resume(struct fd_acc_query *aq, struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->draw; + + fd_wfi(batch, ring); + + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_CNT(counter_count) | + CP_REG_TO_MEM_0_REG(counter_base)); + primitives_relocw(ring, aq, prim_start); + + fd6_event_write(batch, ring, START_PRIMITIVE_CTRS, false); +} + +static void +primitives_generated_pause(struct fd_acc_query *aq, struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->draw; + + fd_wfi(batch, ring); + + /* snapshot the end values: */ + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_CNT(counter_count) | + CP_REG_TO_MEM_0_REG(counter_base)); + primitives_relocw(ring, aq, prim_stop); + + fd6_event_write(batch, ring, STOP_PRIMITIVE_CTRS, false); + + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | + CP_MEM_TO_MEM_0_NEG_C | 0x40000000); + primitives_relocw(ring, aq, result.generated); + primitives_reloc(ring, aq, prim_emitted); + primitives_reloc(ring, aq, prim_stop[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]) + primitives_reloc(ring, aq, prim_start[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]); +} + +static void +primitives_generated_result(struct fd_acc_query *aq, void *buf, + union pipe_query_result *result) +{ + struct fd6_primitives_sample *ps = buf; + + log_counters(ps); + + result->u64 = ps->result.generated; +} + +static const struct fd_acc_sample_provider primitives_generated = { + .query_type = PIPE_QUERY_PRIMITIVES_GENERATED, + .active = FD_STAGE_DRAW, + .size = sizeof(struct fd6_primitives_sample), + .resume = primitives_generated_resume, + .pause = primitives_generated_pause, + .result = primitives_generated_result, +}; + +static void +primitives_emitted_resume(struct fd_acc_query *aq, struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->draw; + + fd_wfi(batch, ring); + OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS_LO, 2); + primitives_relocw(ring, aq, start[0]); + + fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); +} + +static void +primitives_emitted_pause(struct fd_acc_query *aq, struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->draw; + + fd_wfi(batch, ring); + + OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS_LO, 2); + primitives_relocw(ring, aq, stop[0]); + fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); + + fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); + + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | + CP_MEM_TO_MEM_0_NEG_C | 0x80000000); + primitives_relocw(ring, aq, result.emitted); + primitives_reloc(ring, aq, result.emitted); + primitives_reloc(ring, aq, stop[aq->base.index].emitted); + primitives_reloc(ring, aq, start[aq->base.index].emitted); +} + +static void +primitives_emitted_result(struct fd_acc_query *aq, void *buf, + union pipe_query_result *result) +{ + struct fd6_primitives_sample *ps = buf; + + log_counters(ps); + + result->u64 = ps->result.emitted; +} + +static const struct fd_acc_sample_provider primitives_emitted = { + .query_type = PIPE_QUERY_PRIMITIVES_EMITTED, + .active = FD_STAGE_DRAW, + .size = sizeof(struct fd6_primitives_sample), + .resume = primitives_emitted_resume, + .pause = primitives_emitted_pause, + .result = primitives_emitted_result, +}; + /* * Performance Counter (batch) queries: * @@ -433,7 +614,7 @@ counters_per_group[entry->gid]++; } - q = fd_acc_create_query2(ctx, 0, &perfcntr); + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); aq = fd_acc_query(q); /* sample buffer size is based on # of queries: */ @@ -463,4 +644,7 @@ fd_acc_query_register_provider(pctx, &time_elapsed); fd_acc_query_register_provider(pctx, ×tamp); + + fd_acc_query_register_provider(pctx, &primitives_generated); + fd_acc_query_register_provider(pctx, &primitives_emitted); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,21 +33,15 @@ #include "fd6_rasterizer.h" #include "fd6_context.h" #include "fd6_format.h" +#include "fd6_pack.h" -void * -fd6_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) +struct fd_ringbuffer * +__fd6_setup_rasterizer_stateobj(struct fd_context *ctx, + const struct pipe_rasterizer_state *cso, bool primitive_restart) { - struct fd_context *ctx = fd_context(pctx); - struct fd6_rasterizer_stateobj *so; + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 14 * 4); float psize_min, psize_max; - so = CALLOC_STRUCT(fd6_rasterizer_stateobj); - if (!so) - return NULL; - - so->base = *cso; - if (cso->point_size_per_vertex) { psize_min = util_get_min_point_size(cso); psize_max = 4092; @@ -57,82 +51,60 @@ psize_max = cso->point_size; } - so->gras_su_point_minmax = - A6XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | - A6XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); - so->gras_su_point_size = A6XX_GRAS_SU_POINT_SIZE(cso->point_size); - so->gras_su_poly_offset_scale = - A6XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); - so->gras_su_poly_offset_offset = - A6XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units); - so->gras_su_poly_offset_clamp = - A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp); - - so->gras_su_cntl = - A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(cso->line_width/2.0) | - COND(cso->multisample, A6XX_GRAS_SU_CNTL_MSAA_ENABLE); - -#if 0 - so->pc_raster_cntl = - A6XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | - A6XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); -#endif - -#if 0 - if (cso->fill_front != PIPE_POLYGON_MODE_FILL || - cso->fill_back != PIPE_POLYGON_MODE_FILL) - so->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_POLYMODE_ENABLE; -#endif - - if (cso->cull_face & PIPE_FACE_FRONT) - so->gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; - if (cso->cull_face & PIPE_FACE_BACK) - so->gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; - if (!cso->front_ccw) - so->gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; - if (cso->offset_tri) - so->gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; - - if (!cso->flatshade_first) - so->pc_primitive_cntl |= A6XX_PC_PRIMITIVE_CNTL_0_PROVOKING_VTX_LAST; - -// if (!cso->depth_clip) -// so->gras_cl_clip_cntl |= A6XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE | -// A6XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE; -#if 0 - if (cso->clip_halfz) - so->gras_cl_clip_cntl |= A6XX_GRAS_CL_CNTL_ZERO_GB_SCALE_Z; -#endif - - so->stateobj = fd_ringbuffer_new_object(ctx->pipe, 15 * 4); - struct fd_ringbuffer *ring = so->stateobj; - - OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8000, 1); - OUT_RING(ring, 0x80); - OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8001, 1); - OUT_RING(ring, 0x0); - OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8004, 1); - OUT_RING(ring, 0x0); - - OUT_PKT4(ring, REG_A6XX_GRAS_SU_CNTL, 1); - OUT_RING(ring, so->gras_su_cntl); - - OUT_PKT4(ring, REG_A6XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, so->gras_su_point_minmax); - OUT_RING(ring, so->gras_su_point_size); - - OUT_PKT4(ring, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); - OUT_RING(ring, so->gras_su_poly_offset_scale); - OUT_RING(ring, so->gras_su_poly_offset_offset); - OUT_RING(ring, so->gras_su_poly_offset_clamp); - -#if 0 - OUT_PKT4(ring, REG_A6XX_PC_RASTER_CNTL, 1); - OUT_RING(ring, so->pc_raster_cntl); - - OUT_PKT4(ring, REG_A6XX_GRAS_CL_CNTL, 1); - OUT_RING(ring, so->gras_cl_clip_cntl); -#endif + OUT_REG(ring, + A6XX_GRAS_UNKNOWN_8000(.unknown = 0x80), + A6XX_GRAS_UNKNOWN_8001()); + + OUT_REG(ring, + A6XX_GRAS_SU_CNTL( + .linehalfwidth = cso->line_width / 2.0, + .poly_offset = cso->offset_tri, + .msaa_enable = cso->multisample, + .cull_front = cso->cull_face & PIPE_FACE_FRONT, + .cull_back = cso->cull_face & PIPE_FACE_BACK, + .front_cw = !cso->front_ccw, + )); + + OUT_REG(ring, + A6XX_GRAS_SU_POINT_MINMAX( + .min = psize_min, + .max = psize_max, + ), + A6XX_GRAS_SU_POINT_SIZE( + cso->point_size + )); + + OUT_REG(ring, + A6XX_GRAS_SU_POLY_OFFSET_SCALE( + cso->offset_scale + ), + A6XX_GRAS_SU_POLY_OFFSET_OFFSET( + cso->offset_units + ), + A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP( + cso->offset_clamp + )); + + OUT_REG(ring, + A6XX_PC_PRIMITIVE_CNTL_0( + .provoking_vtx_last = !cso->flatshade_first, + .primitive_restart = primitive_restart, + )); + + return ring; +} + +void * +fd6_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso) +{ + struct fd6_rasterizer_stateobj *so; + + so = CALLOC_STRUCT(fd6_rasterizer_stateobj); + if (!so) + return NULL; + + so->base = *cso; return so; } @@ -142,7 +114,10 @@ { struct fd6_rasterizer_stateobj *so = hwcso; - fd_ringbuffer_del(so->stateobj); + for (unsigned i = 0; i < ARRAY_SIZE(so->stateobjs); i++) + if (so->stateobjs[i]) + fd_ringbuffer_del(so->stateobjs[i]); + FREE(hwcso); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,21 +31,12 @@ #include "pipe/p_state.h" #include "pipe/p_context.h" +#include "freedreno_context.h" + struct fd6_rasterizer_stateobj { struct pipe_rasterizer_state base; - uint32_t gras_su_point_minmax; - uint32_t gras_su_point_size; - uint32_t gras_su_poly_offset_scale; - uint32_t gras_su_poly_offset_offset; - uint32_t gras_su_poly_offset_clamp; - - uint32_t gras_su_cntl; - uint32_t gras_cl_clip_cntl; - uint32_t pc_primitive_cntl; - uint32_t pc_raster_cntl; - - struct fd_ringbuffer *stateobj; + struct fd_ringbuffer *stateobjs[2]; }; static inline struct fd6_rasterizer_stateobj * @@ -58,4 +49,21 @@ const struct pipe_rasterizer_state *cso); void fd6_rasterizer_state_delete(struct pipe_context *, void *hwcso); +struct fd_ringbuffer * __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, + const struct pipe_rasterizer_state *cso, bool primitive_restart); + +static inline struct fd_ringbuffer * +fd6_rasterizer_state(struct fd_context *ctx, bool primitive_restart) +{ + struct fd6_rasterizer_stateobj *rasterizer = fd6_rasterizer_stateobj(ctx->rasterizer); + unsigned variant = primitive_restart; + + if (unlikely(!rasterizer->stateobjs[variant])) { + rasterizer->stateobjs[variant] = + __fd6_setup_rasterizer_stateobj(ctx, ctx->rasterizer, primitive_restart); + } + + return rasterizer->stateobjs[variant]; +} + #endif /* FD6_RASTERIZER_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,138 +30,6 @@ #include "a6xx.xml.h" -/* indexed by cpp, including msaa 2x and 4x: */ -static const struct { - unsigned pitchalign; - unsigned heightalign; -} tile_alignment[] = { - [1] = { 128, 32 }, - [2] = { 64, 32 }, - [3] = { 64, 32 }, - [4] = { 64, 16 }, - [6] = { 64, 16 }, - [8] = { 64, 16 }, - [12] = { 64, 16 }, - [16] = { 64, 16 }, - [24] = { 64, 16 }, - [32] = { 64, 16 }, - [48] = { 64, 16 }, - [64] = { 64, 16 }, - - /* special cases for r16: */ - [0] = { 128, 16 }, -}; - -/* NOTE: good way to test this is: (for example) - * piglit/bin/texelFetch fs sampler3D 100x100x8 - */ -static uint32_t -setup_slices(struct fd_resource *rsc, uint32_t alignment, enum pipe_format format) -{ - struct pipe_resource *prsc = &rsc->base; - struct fd_screen *screen = fd_screen(prsc->screen); - enum util_format_layout layout = util_format_description(format)->layout; - uint32_t pitchalign = screen->gmem_alignw; - uint32_t level, size = 0; - uint32_t depth = prsc->depth0; - /* linear dimensions: */ - uint32_t lwidth = prsc->width0; - uint32_t lheight = prsc->height0; - /* tile_mode dimensions: */ - uint32_t twidth = util_next_power_of_two(lwidth); - uint32_t theight = util_next_power_of_two(lheight); - /* in layer_first layout, the level (slice) contains just one - * layer (since in fact the layer contains the slices) - */ - uint32_t layers_in_level = rsc->layer_first ? 1 : prsc->array_size; - int ta = rsc->cpp; - - /* The z16/r16 formats seem to not play by the normal tiling rules: */ - if ((rsc->cpp == 2) && (util_format_get_nr_components(format) == 1)) - ta = 0; - - debug_assert(ta < ARRAY_SIZE(tile_alignment)); - debug_assert(tile_alignment[ta].pitchalign); - - for (level = 0; level <= prsc->last_level; level++) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); - bool linear_level = fd_resource_level_linear(prsc, level); - uint32_t width, height; - - /* tiled levels of 3D textures are rounded up to PoT dimensions: */ - if ((prsc->target == PIPE_TEXTURE_3D) && rsc->tile_mode && !linear_level) { - width = twidth; - height = theight; - } else { - width = lwidth; - height = lheight; - } - uint32_t aligned_height = height; - uint32_t blocks; - - if (rsc->tile_mode && !linear_level) { - pitchalign = tile_alignment[ta].pitchalign; - aligned_height = align(aligned_height, - tile_alignment[ta].heightalign); - } else { - pitchalign = 64; - } - - /* The blits used for mem<->gmem work at a granularity of - * 32x32, which can cause faults due to over-fetch on the - * last level. The simple solution is to over-allocate a - * bit the last level to ensure any over-fetch is harmless. - * The pitch is already sufficiently aligned, but height - * may not be: - */ - if ((level == prsc->last_level) && (prsc->target != PIPE_BUFFER)) - aligned_height = align(aligned_height, 32); - - if (layout == UTIL_FORMAT_LAYOUT_ASTC) - slice->pitch = - util_align_npot(width, pitchalign * util_format_get_blockwidth(format)); - else - slice->pitch = align(width, pitchalign); - - slice->offset = size; - blocks = util_format_get_nblocks(format, slice->pitch, aligned_height); - - /* 1d array and 2d array textures must all have the same layer size - * for each miplevel on a6xx. 3d textures can have different layer - * sizes for high levels, but the hw auto-sizer is buggy (or at least - * different than what this code does), so as soon as the layer size - * range gets into range, we stop reducing it. - */ - if (prsc->target == PIPE_TEXTURE_3D) { - if (level <= 1 || (rsc->slices[level - 1].size0 > 0xf000)) { - slice->size0 = align(blocks * rsc->cpp, alignment); - } else { - slice->size0 = rsc->slices[level - 1].size0; - } - } else { - slice->size0 = align(blocks * rsc->cpp, alignment); - } - - size += slice->size0 * depth * layers_in_level; - -#if 0 - debug_printf("%s: %ux%ux%u@%u:\t%2u: stride=%4u, size=%6u,%7u, aligned_height=%3u, blocks=%u\n", - util_format_name(prsc->format), - width, height, depth, rsc->cpp, - level, slice->pitch * rsc->cpp, - slice->size0, size, aligned_height, blocks); -#endif - - depth = u_minify(depth, 1); - lwidth = u_minify(lwidth, 1); - lheight = u_minify(lheight, 1); - twidth = u_minify(twidth, 1); - theight = u_minify(theight, 1); - } - - return size; -} - /* A subset of the valid tiled formats can be compressed. We do * already require tiled in order to be compressed, but just because * it can be tiled doesn't mean it can be compressed. @@ -201,12 +69,12 @@ case RB6_R8G8B8A8_SINT: case RB6_R8G8B8A8_UINT: case RB6_R8G8B8A8_UNORM: - case RB6_R8G8B8_UNORM: + case RB6_R8G8B8X8_UNORM: case RB6_R8G8_SINT: case RB6_R8G8_UINT: case RB6_R8G8_UNORM: - case RB6_X8Z24_UNORM: case RB6_Z24_UNORM_S8_UINT: + case RB6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: return true; default: return false; @@ -232,7 +100,7 @@ return 0; uint32_t block_width, block_height; - switch (rsc->cpp) { + switch (rsc->layout.cpp) { case 2: case 4: block_width = 16; @@ -261,11 +129,15 @@ * because it is what the kernel expects for scanout. For non-2D we * could just use a separate UBWC buffer.. */ - rsc->ubwc_offset = 0; - rsc->offset = meta_size; - rsc->ubwc_pitch = meta_stride; - rsc->ubwc_size = meta_size >> 2; /* in dwords??? */ - rsc->tile_mode = TILE6_3; + for (int level = 0; level <= prsc->last_level; level++) { + struct fdl_slice *slice = fd_resource_slice(rsc, level); + slice->offset += meta_size; + } + + rsc->layout.ubwc_slices[0].offset = 0; + rsc->layout.ubwc_slices[0].pitch = meta_stride; + rsc->layout.ubwc_size = meta_size >> 2; /* in dwords??? */ + rsc->layout.tile_mode = TILE6_3; return meta_size; } @@ -279,7 +151,7 @@ fd6_validate_format(struct fd_context *ctx, struct fd_resource *rsc, enum pipe_format format) { - if (!rsc->ubwc_size) + if (!rsc->layout.ubwc_size) return; if (ok_ubwc_format(format)) @@ -288,21 +160,51 @@ fd_resource_uncompress(ctx, rsc); } +static void +setup_lrz(struct fd_resource *rsc) +{ + struct fd_screen *screen = fd_screen(rsc->base.screen); + const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | + DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ + unsigned width0 = rsc->base.width0; + unsigned height0 = rsc->base.height0; + + /* LRZ buffer is super-sampled: */ + switch (rsc->base.nr_samples) { + case 4: + width0 *= 2; + /* fallthru */ + case 2: + height0 *= 2; + } + + unsigned lrz_pitch = align(DIV_ROUND_UP(width0, 8), 32); + unsigned lrz_height = align(DIV_ROUND_UP(height0, 8), 16); + + unsigned size = lrz_pitch * lrz_height * 2; + + rsc->lrz_height = lrz_height; + rsc->lrz_width = lrz_pitch; + rsc->lrz_pitch = lrz_pitch; + rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); +} + uint32_t fd6_setup_slices(struct fd_resource *rsc) { - uint32_t alignment; + struct pipe_resource *prsc = &rsc->base; - switch (rsc->base.target) { - case PIPE_TEXTURE_3D: - rsc->layer_first = false; - alignment = 4096; - break; - default: - rsc->layer_first = true; - alignment = 1; - break; - } + if (!(fd_mesa_debug & FD_DBG_NOLRZ) && has_depth(rsc->base.format)) + setup_lrz(rsc); - return setup_slices(rsc, alignment, rsc->base.format); + fdl6_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), + prsc->width0, prsc->height0, prsc->depth0, + prsc->last_level + 1, prsc->array_size, + prsc->target == PIPE_TEXTURE_3D, false); + + /* The caller does this bit of layout setup again. */ + if (rsc->layout.layer_first) + return rsc->layout.size / prsc->array_size; + else + return rsc->layout.size; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -34,5 +34,7 @@ void fd6_validate_format(struct fd_context *ctx, struct fd_resource *rsc, enum pipe_format format); uint32_t fd6_setup_slices(struct fd_resource *rsc); +void fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc, + int level, int layer); #endif /* FD6_RESOURCE_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ #include "drm-uapi/drm_fourcc.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "fd6_screen.h" #include "fd6_blitter.h" @@ -82,9 +82,9 @@ } if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && + (fd6_pipe2tex(format) != (enum a6xx_tex_fmt)~0) && (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12) && - (fd6_pipe2tex(format) != (enum a6xx_tex_fmt)~0)) { + util_format_get_blocksize(format) != 12)) { retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); } @@ -127,9 +127,6 @@ return retval == usage; } -extern const struct fd_perfcntr_group a6xx_perfcntr_groups[]; -extern const unsigned a6xx_num_perfcntr_groups; - void fd6_screen_init(struct pipe_screen *pscreen) { @@ -151,10 +148,5 @@ screen->supported_modifiers = supported_modifiers; screen->num_supported_modifiers = ARRAY_SIZE(supported_modifiers); - if (fd_mesa_debug & FD_DBG_PERFC) { - screen->perfcntr_groups = a6xx_perfcntr_groups; - screen->num_perfcntr_groups = a6xx_num_perfcntr_groups; - } - fd6_emit_init_screen(pscreen); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_texture.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_texture.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/hash_table.h" #include "fd6_texture.h" @@ -130,15 +130,15 @@ A6XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, clamp_to_edge, &so->needs_border)); so->texsamp1 = + COND(cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE, + A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) | COND(!cso->seamless_cube_map, A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | COND(!cso->normalized_coords, A6XX_TEX_SAMP_1_UNNORM_COORDS); - if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { - so->texsamp0 |= A6XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); - so->texsamp1 |= - A6XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A6XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); - } + so->texsamp0 |= A6XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); + so->texsamp1 |= + A6XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A6XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); if (cso->compare_mode) so->texsamp1 |= A6XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ @@ -220,6 +220,7 @@ { struct fd6_pipe_sampler_view *so = CALLOC_STRUCT(fd6_pipe_sampler_view); struct fd_resource *rsc = fd_resource(prsc); + struct fdl_slice *slice = NULL; enum pipe_format format = cso->format; unsigned lvl, layers = 0; @@ -255,6 +256,7 @@ unsigned miplevels; lvl = fd_sampler_first_level(cso); + slice = fd_resource_slice(rsc, lvl); miplevels = fd_sampler_last_level(cso) - lvl; layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; @@ -265,8 +267,7 @@ so->texconst2 = A6XX_TEX_CONST_2_FETCHSIZE(fd6_pipe2fetchsize(format)) | A6XX_TEX_CONST_2_PITCH( - util_format_get_nblocksx( - format, rsc->slices[lvl].pitch) * rsc->cpp); + util_format_get_nblocksx(format, slice->pitch) * rsc->layout.cpp); so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); so->ubwc_offset = fd_resource_ubwc_offset(rsc, lvl, cso->u.tex.first_layer); so->ubwc_enabled = fd_resource_ubwc_enabled(rsc, lvl); @@ -277,8 +278,8 @@ cso->swizzle_b, cso->swizzle_a); if (so->ubwc_enabled) { - so->texconst9 |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->ubwc_size); - so->texconst10 |= A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(rsc->ubwc_pitch); + so->texconst9 |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_size); + so->texconst10 |= A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(rsc->layout.ubwc_slices[lvl].pitch); } so->texconst2 |= A6XX_TEX_CONST_2_TYPE(fd6_tex_type(cso->target)); @@ -288,28 +289,29 @@ case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A6XX_TEX_CONST_5_DEPTH(1); break; case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A6XX_TEX_CONST_5_DEPTH(layers); break; case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY: so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size); + A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); so->texconst5 = A6XX_TEX_CONST_5_DEPTH(layers / 6); break; case PIPE_TEXTURE_3D: so->texconst3 = - A6XX_TEX_CONST_3_MIN_LAYERSZ(rsc->slices[prsc->last_level].size0) | - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->slices[lvl].size0); + A6XX_TEX_CONST_3_MIN_LAYERSZ( + fd_resource_slice(rsc, prsc->last_level)->size0) | + A6XX_TEX_CONST_3_ARRAY_PITCH(slice->size0); so->texconst5 = A6XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); break; @@ -318,7 +320,7 @@ } if (so->ubwc_enabled) { - so->texconst3 |= A6XX_TEX_CONST_3_FLAG | A6XX_TEX_CONST_3_UNK27; + so->texconst3 |= A6XX_TEX_CONST_3_FLAG | A6XX_TEX_CONST_3_TILE_ALL; } return &so->base; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c 2020-06-12 01:21:17.000000000 +0000 @@ -47,45 +47,61 @@ so->base = *cso; - switch (cso->depth.func) { - case PIPE_FUNC_LESS: - case PIPE_FUNC_LEQUAL: - so->gras_lrz_cntl = A6XX_GRAS_LRZ_CNTL_ENABLE; - so->rb_lrz_cntl = A6XX_RB_LRZ_CNTL_ENABLE; - break; - - case PIPE_FUNC_GREATER: - case PIPE_FUNC_GEQUAL: - so->gras_lrz_cntl = A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_GREATER; - so->rb_lrz_cntl = A6XX_RB_LRZ_CNTL_ENABLE; - break; - - default: - /* LRZ not enabled */ - so->gras_lrz_cntl = 0; - break; - } - - if (cso->depth.writemask) { - if (cso->depth.enabled) - so->gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_UNK4; - so->lrz_write = true; - } - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_ZFUNC(cso->depth.func); /* maps 1:1 */ - if (cso->depth.enabled) + if (cso->depth.enabled) { so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_ENABLE | A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + so->gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_Z_TEST_ENABLE; + + if (cso->depth.writemask) { + so->lrz_write = true; + } + + switch (cso->depth.func) { + case PIPE_FUNC_LESS: + case PIPE_FUNC_LEQUAL: + so->gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_ENABLE; + so->rb_lrz_cntl |= A6XX_RB_LRZ_CNTL_ENABLE; + break; + + case PIPE_FUNC_GREATER: + case PIPE_FUNC_GEQUAL: + so->gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_GREATER; + so->rb_lrz_cntl |= A6XX_RB_LRZ_CNTL_ENABLE; + break; + + case PIPE_FUNC_NEVER: + so->gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_ENABLE; + so->rb_lrz_cntl |= A6XX_RB_LRZ_CNTL_ENABLE; + so->lrz_write = false; + break; + + case PIPE_FUNC_EQUAL: + case PIPE_FUNC_NOTEQUAL: + case PIPE_FUNC_ALWAYS: + so->lrz_write = false; + so->invalidate_lrz = true; + break; + } + } + if (cso->depth.writemask) so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; if (cso->stencil[0].enabled) { const struct pipe_stencil_state *s = &cso->stencil[0]; + /* stencil test happens before depth test, so without performing + * stencil test we don't really know what the updates to the + * depth buffer will be. + */ + so->lrz_write = false; + so->invalidate_lrz = true; + so->rb_stencil_control |= A6XX_RB_STENCIL_CONTROL_STENCIL_READ | A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h --- mesa-19.2.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h 2020-06-12 01:21:17.000000000 +0000 @@ -45,6 +45,7 @@ uint32_t gras_lrz_cntl; uint32_t rb_lrz_cntl; bool lrz_write; + bool invalidate_lrz; struct fd_ringbuffer *stateobj; struct fd_ringbuffer *stateobj_no_alpha; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/Android.mk mesa-20.0.8/src/gallium/drivers/freedreno/Android.mk --- mesa-19.2.8/src/gallium/drivers/freedreno/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -44,7 +44,7 @@ LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H) LOCAL_SHARED_LIBRARIES := libdrm -LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_registers +LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir libfreedreno_drm libfreedreno_ir3 libfreedreno_perfcntrs libfreedreno_registers LOCAL_MODULE := libmesa_pipe_freedreno include $(LOCAL_PATH)/Android.gen.mk diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch.c 2020-06-12 01:21:17.000000000 +0000 @@ -41,9 +41,6 @@ struct fd_context *ctx = batch->ctx; unsigned size = 0; - if (ctx->screen->reorder) - util_queue_fence_init(&batch->flush_fence); - /* if kernel is too old to support unlimited # of cmd buffers, we * have no option but to allocate large worst-case sizes so that * we don't need to grow the ringbuffer. Performance is likely to @@ -166,6 +163,12 @@ batch->tile_fini = NULL; } + if (batch->tessellation) { + fd_bo_del(batch->tessfactor_bo); + fd_bo_del(batch->tessparam_bo); + fd_ringbuffer_del(batch->tess_addrs_constobj); + } + fd_submit_del(batch->submit); util_dynarray_fini(&batch->draw_patches); @@ -185,9 +188,6 @@ fd_hw_sample_reference(batch->ctx, &samp, NULL); } util_dynarray_fini(&batch->samples); - - if (batch->ctx->screen->reorder) - util_queue_fence_destroy(&batch->flush_fence); } static void @@ -198,7 +198,7 @@ foreach_batch(dep, cache, batch->dependents_mask) { if (flush) - fd_batch_flush(dep, false); + fd_batch_flush(dep); fd_batch_reference(&dep, NULL); } @@ -233,8 +233,6 @@ { DBG("%p", batch); - fd_batch_sync(batch); - batch_flush_reset_dependencies(batch, false); batch_reset_resources(batch); @@ -280,32 +278,6 @@ sprintf(buf, "fd_batch<%u>", batch->seqno); } -void -fd_batch_sync(struct fd_batch *batch) -{ - if (!batch->ctx->screen->reorder) - return; - util_queue_fence_wait(&batch->flush_fence); -} - -static void -batch_flush_func(void *job, int id) -{ - struct fd_batch *batch = job; - - DBG("%p", batch); - - fd_gmem_render_tiles(batch); - batch_reset_resources(batch); -} - -static void -batch_cleanup_func(void *job, int id) -{ - struct fd_batch *batch = job; - fd_batch_reference(&batch, NULL); -} - static void batch_flush(struct fd_batch *batch) { @@ -327,20 +299,8 @@ fd_fence_ref(&batch->ctx->last_fence, batch->fence); - if (batch->ctx->screen->reorder) { - struct fd_batch *tmp = NULL; - fd_batch_reference(&tmp, batch); - - if (!util_queue_is_initialized(&batch->ctx->flush_queue)) - util_queue_init(&batch->ctx->flush_queue, "flush_queue", 16, 1, 0); - - util_queue_add_job(&batch->ctx->flush_queue, - batch, &batch->flush_fence, - batch_flush_func, batch_cleanup_func); - } else { - fd_gmem_render_tiles(batch); - batch_reset_resources(batch); - } + fd_gmem_render_tiles(batch); + batch_reset_resources(batch); debug_assert(batch->reference.count > 0); @@ -358,10 +318,9 @@ * a fence to sync on */ void -fd_batch_flush(struct fd_batch *batch, bool sync) +fd_batch_flush(struct fd_batch *batch) { struct fd_batch *tmp = NULL; - bool newbatch = false; /* NOTE: we need to hold an extra ref across the body of flush, * since the last ref to this batch could be dropped when cleaning @@ -369,35 +328,12 @@ */ fd_batch_reference(&tmp, batch); - if (batch == batch->ctx->batch) { - batch->ctx->batch = NULL; - newbatch = true; - } - batch_flush(tmp); - if (newbatch) { - struct fd_context *ctx = batch->ctx; - struct fd_batch *new_batch; - - if (ctx->screen->reorder) { - /* defer allocating new batch until one is needed for rendering - * to avoid unused batches for apps that create many contexts - */ - new_batch = NULL; - } else { - new_batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, false); - util_copy_framebuffer_state(&new_batch->framebuffer, &batch->framebuffer); - } - - fd_batch_reference(&batch, NULL); - ctx->batch = new_batch; - fd_context_all_dirty(ctx); + if (batch == batch->ctx->batch) { + fd_batch_reference(&batch->ctx->batch, NULL); } - if (sync) - fd_batch_sync(tmp); - fd_batch_reference(&tmp, NULL); } @@ -439,7 +375,7 @@ fd_batch_reference_locked(&b, rsc->write_batch); mtx_unlock(&b->ctx->screen->lock); - fd_batch_flush(b, true); + fd_batch_flush(b); mtx_lock(&b->ctx->screen->lock); fd_bc_invalidate_batch(b, false); @@ -513,7 +449,7 @@ debug_assert(!batch->flushed); if (unlikely(fd_mesa_debug & FD_DBG_FLUSH)) { - fd_batch_flush(batch, true); + fd_batch_flush(batch); return; } @@ -522,7 +458,7 @@ struct fd_ringbuffer *ring = batch->draw; if ((ring->cur - ring->start) > (ring->size/4 - 0x1000)) - fd_batch_flush(batch, true); + fd_batch_flush(batch); } /* emit a WAIT_FOR_IDLE only if needed, ie. if there has not already diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch_cache.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch_cache.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -159,7 +159,7 @@ fd_context_unlock(ctx); for (unsigned i = 0; i < n; i++) { - fd_batch_flush(batches[i], false); + fd_batch_flush(batches[i]); } } @@ -295,9 +295,6 @@ */ struct fd_batch *flush_batch = NULL; for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { - if ((cache->batches[i] == ctx->batch) || - !cache->batches[i]->needs_flush) - continue; if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno)) fd_batch_reference_locked(&flush_batch, cache->batches[i]); } @@ -307,7 +304,7 @@ */ mtx_unlock(&ctx->screen->lock); DBG("%p: too many batches! flush forced!", flush_batch); - fd_batch_flush(flush_batch, true); + fd_batch_flush(flush_batch); mtx_lock(&ctx->screen->lock); /* While the resources get cleaned up automatically, the flush_batch diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_batch.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_batch.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,7 +55,7 @@ FD_STAGE_ALL = 0xff, }; -#define MAX_HW_SAMPLE_PROVIDERS 5 +#define MAX_HW_SAMPLE_PROVIDERS 7 struct fd_hw_sample_provider; struct fd_hw_sample; @@ -74,8 +74,6 @@ struct fd_context *ctx; - struct util_queue_fence flush_fence; - /* do we need to mem2gmem before rendering. We don't, if for example, * there was a glClear() that invalidated the entire previous buffer * contents. Keep track of which buffer(s) are cleared, or needs @@ -103,6 +101,7 @@ bool flushed : 1; bool blit : 1; bool back_blit : 1; /* only blit so far is resource shadowing back-blit */ + bool tessellation : 1; /* tessellation used in batch */ /* Keep track if WAIT_FOR_IDLE is needed for registers we need * to update via RMW: @@ -124,6 +123,12 @@ FD_GMEM_LOGICOP_ENABLED = 0x20, FD_GMEM_FB_READ = 0x40, } gmem_reason; + + /* At submit time, once we've decided that this batch will use GMEM + * rendering, the appropriate gmem state is looked up: + */ + const struct fd_gmem_stateobj *gmem_state; + unsigned num_draws; /* number of draws in current batch */ unsigned num_vertices; /* number of vertices in current batch */ @@ -223,13 +228,24 @@ /** set of dependent batches.. holds refs to dependent batches: */ uint32_t dependents_mask; + + /* Buffer for tessellation engine input + */ + struct fd_bo *tessfactor_bo; + uint32_t tessfactor_size; + + /* Buffer for passing parameters between TCS and TES + */ + struct fd_bo *tessparam_bo; + uint32_t tessparam_size; + + struct fd_ringbuffer *tess_addrs_constobj; }; struct fd_batch * fd_batch_create(struct fd_context *ctx, bool nondraw); void fd_batch_reset(struct fd_batch *batch); -void fd_batch_sync(struct fd_batch *batch); -void fd_batch_flush(struct fd_batch *batch, bool sync); +void fd_batch_flush(struct fd_batch *batch); void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep); void fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, bool write); void fd_batch_check_size(struct fd_batch *batch); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_blitter.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_blitter.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_blitter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_blitter.c 2020-06-12 01:21:17.000000000 +0000 @@ -86,13 +86,16 @@ ctx->constbuf[PIPE_SHADER_FRAGMENT].cb); util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); - util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp); + util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vs); + util_blitter_save_tessctrl_shader(ctx->blitter, ctx->prog.hs); + util_blitter_save_tesseval_shader(ctx->blitter, ctx->prog.ds); + util_blitter_save_geometry_shader(ctx->blitter, ctx->prog.gs); util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, ctx->streamout.targets); util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); util_blitter_save_viewport(ctx->blitter, &ctx->viewport); util_blitter_save_scissor(ctx->blitter, &ctx->scissor); - util_blitter_save_fragment_shader(ctx->blitter, ctx->prog.fp); + util_blitter_save_fragment_shader(ctx->blitter, ctx->prog.fs); util_blitter_save_blend(ctx->blitter, ctx->blend); util_blitter_save_depth_stencil_alpha(ctx->blitter, ctx->zsa); util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); @@ -216,8 +219,8 @@ pctx->set_vertex_buffers(pctx, blitter->vb_slot, 1, &ctx->solid_vbuf_state.vertexbuf.vb[0]); pctx->set_stream_output_targets(pctx, 0, NULL, NULL); - pctx->bind_vs_state(pctx, ctx->solid_prog.vp); - pctx->bind_fs_state(pctx, ctx->solid_prog.fp); + pctx->bind_vs_state(pctx, ctx->solid_prog.vs); + pctx->bind_fs_state(pctx, ctx->solid_prog.fs); struct pipe_draw_info info = { .mode = PIPE_PRIM_MAX, /* maps to DI_PT_RECTLIST */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_context.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_context.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -49,6 +49,14 @@ DBG("%p: flush: flags=%x\n", ctx->batch, flags); + /* In some sequence of events, we can end up with a last_fence that is + * not an "fd" fence, which results in eglDupNativeFenceFDANDROID() + * errors. + * + */ + if (flags & PIPE_FLUSH_FENCE_FD) + fd_fence_ref(&ctx->last_fence, NULL); + /* if no rendering since last flush, ie. app just decided it needed * a fence, re-use the last one: */ @@ -63,13 +71,11 @@ /* Take a ref to the batch's fence (batch can be unref'd when flushed: */ fd_fence_ref(&fence, batch->fence); - /* TODO is it worth trying to figure out if app is using fence-fd's, to - * avoid requesting one every batch? - */ - batch->needs_out_fence_fd = true; + if (flags & PIPE_FLUSH_FENCE_FD) + batch->needs_out_fence_fd = true; if (!ctx->screen->reorder) { - fd_batch_flush(batch, true); + fd_batch_flush(batch); } else if (flags & PIPE_FLUSH_DEFERRED) { fd_bc_flush_deferred(&ctx->screen->batch_cache, ctx); } else { @@ -164,9 +170,6 @@ fd_fence_ref(&ctx->last_fence, NULL); - if (ctx->screen->reorder && util_queue_is_initialized(&ctx->flush_queue)) - util_queue_destroy(&ctx->flush_queue); - util_copy_framebuffer_state(&ctx->framebuffer, NULL); fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ fd_bc_invalidate_context(ctx); @@ -187,16 +190,17 @@ slab_destroy_child(&ctx->transfer_pool); - for (i = 0; i < ARRAY_SIZE(ctx->vsc_pipe); i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; - if (!pipe->bo) + for (i = 0; i < ARRAY_SIZE(ctx->vsc_pipe_bo); i++) { + if (!ctx->vsc_pipe_bo[i]) break; - fd_bo_del(pipe->bo); + fd_bo_del(ctx->vsc_pipe_bo[i]); } fd_device_del(ctx->dev); fd_pipe_del(ctx->pipe); + mtx_destroy(&ctx->gmem_lock); + if (fd_mesa_debug & (FD_DBG_BSTAT | FD_DBG_MSGS)) { printf("batch_total=%u, batch_sysmem=%u, batch_gmem=%u, batch_nondraw=%u, batch_restore=%u\n", (uint32_t)ctx->stats.batch_total, (uint32_t)ctx->stats.batch_sysmem, @@ -357,6 +361,8 @@ if (primtypes[i]) ctx->primtype_mask |= (1 << i); + (void) mtx_init(&ctx->gmem_lock, mtx_plain); + /* need some sane default in case state tracker doesn't * set some state: */ @@ -379,9 +385,6 @@ goto fail; pctx->const_uploader = pctx->stream_uploader; - if (!ctx->screen->reorder) - ctx->batch = fd_bc_alloc_batch(&screen->batch_cache, ctx, false); - slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); fd_draw_init(pctx); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_context.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_context.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,7 +55,7 @@ }; struct fd_program_stateobj { - void *vp, *fp; + void *vs, *hs, *ds, *gs, *fs; }; struct fd_constbuf_stateobj { @@ -86,6 +86,9 @@ struct fd_streamout_stateobj { struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + /* Bitmask of stream that should be reset. */ + unsigned reset; + unsigned num_targets; /* Track offset from vtxcnt for streamout data. This counter * is just incremented by # of vertices on each draw until @@ -156,12 +159,20 @@ struct fd_context { struct pipe_context base; + /* We currently need to serialize emitting GMEM batches, because of + * VSC state access in the context. + * + * In practice this lock should not be contended, since pipe_context + * use should be single threaded. But it is needed to protect the + * case, with batch reordering where a ctxB batch triggers flushing + * a ctxA batch + */ + mtx_t gmem_lock; + struct fd_device *dev; struct fd_screen *screen; struct fd_pipe *pipe; - struct util_queue flush_queue; - struct blitter_context *blitter; void *clear_rs_state; struct primconvert_context *primconvert; @@ -212,7 +223,7 @@ uint64_t draw_calls; uint64_t batch_total, batch_sysmem, batch_gmem, batch_nondraw, batch_restore; uint64_t staging_uploads, shadow_uploads; - uint64_t vs_regs, fs_regs; + uint64_t vs_regs, hs_regs, ds_regs, gs_regs, fs_regs; } stats; /* Current batch.. the rule here is that you can deref ctx->batch @@ -256,15 +267,8 @@ */ struct pipe_scissor_state disabled_scissor; - /* Current gmem/tiling configuration.. gets updated on render_tiles() - * if out of date with current maximal-scissor/cpp: - * - * (NOTE: this is kind of related to the batch, but moving it there - * means we'd always have to recalc tiles ever batch) - */ - struct fd_gmem_stateobj gmem; - struct fd_vsc_pipe vsc_pipe[32]; - struct fd_tile tile[512]; + /* Per vsc pipe bo's (a2xx-a5xx): */ + struct fd_bo *vsc_pipe_bo[32]; /* which state objects need to be re-emit'd: */ enum fd_dirty_3d_state dirty; @@ -307,11 +311,11 @@ /* GMEM/tile handling fxns: */ void (*emit_tile_init)(struct fd_batch *batch); - void (*emit_tile_prep)(struct fd_batch *batch, struct fd_tile *tile); - void (*emit_tile_mem2gmem)(struct fd_batch *batch, struct fd_tile *tile); - void (*emit_tile_renderprep)(struct fd_batch *batch, struct fd_tile *tile); - void (*emit_tile)(struct fd_batch *batch, struct fd_tile *tile); - void (*emit_tile_gmem2mem)(struct fd_batch *batch, struct fd_tile *tile); + void (*emit_tile_prep)(struct fd_batch *batch, const struct fd_tile *tile); + void (*emit_tile_mem2gmem)(struct fd_batch *batch, const struct fd_tile *tile); + void (*emit_tile_renderprep)(struct fd_batch *batch, const struct fd_tile *tile); + void (*emit_tile)(struct fd_batch *batch, const struct fd_tile *tile); + void (*emit_tile_gmem2mem)(struct fd_batch *batch, const struct fd_tile *tile); void (*emit_tile_fini)(struct fd_batch *batch); /* optional */ /* optional, for GMEM bypass: */ @@ -328,7 +332,7 @@ void (*launch_grid)(struct fd_context *ctx, const struct pipe_grid_info *info); /* query: */ - struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type); + struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type, unsigned index); void (*query_prepare)(struct fd_batch *batch, uint32_t num_tiles); void (*query_prepare_tile)(struct fd_batch *batch, uint32_t n, struct fd_ringbuffer *ring); @@ -360,6 +364,20 @@ * - solid_vbuf / 12 / R32G32B32_FLOAT */ struct fd_vertex_state blit_vbuf_state; + + /* + * Info about state of previous draw, for state that comes from + * pipe_draw_info (ie. not part of a CSO). This allows us to + * skip some register emit when the state doesn't change from + * draw-to-draw + */ + struct { + bool dirty; /* last draw state unknown */ + bool primitive_restart; + uint32_t index_start; + uint32_t instance_start; + uint32_t restart_index; + } last; }; static inline struct fd_context * @@ -390,6 +408,7 @@ static inline void fd_context_all_dirty(struct fd_context *ctx) { + ctx->last.dirty = true; ctx->dirty = ~0; for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) ctx->dirty_shader[i] = ~0; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_draw.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_draw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_helpers.h" #include "freedreno_blitter.h" @@ -93,15 +93,13 @@ return; } - fd_fence_ref(&ctx->last_fence, NULL); - /* Upload a user index buffer. */ struct pipe_resource *indexbuf = NULL; unsigned index_offset = 0; struct pipe_draw_info new_info; if (info->index_size) { if (info->has_user_indices) { - if (!util_upload_index_buffer(pctx, info, &indexbuf, &index_offset)) + if (!util_upload_index_buffer(pctx, info, &indexbuf, &index_offset, 4)) return; new_info = *info; new_info.index.resource = indexbuf; @@ -256,7 +254,14 @@ batch->num_draws++; - prims = u_reduced_prims_for_vertices(info->mode, info->count); + /* Counting prims in sw doesn't work for GS and tesselation. For older + * gens we don't have those stages and don't have the hw counters enabled, + * so keep the count accurate for non-patch geometry. + */ + if (info->mode != PIPE_PRIM_PATCHES) + prims = u_reduced_prims_for_vertices(info->mode, info->count); + else + prims = 0; ctx->stats.draw_calls++; @@ -275,6 +280,12 @@ /* and any buffers used, need to be resolved: */ batch->resolve |= buffers; + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_written()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); + DBG("%p: %x %ux%u num_draws=%u (%s/%s)", batch, buffers, pfb->width, pfb->height, batch->num_draws, util_format_short_name(pipe_surface_format(pfb->cbufs[0])), @@ -311,8 +322,6 @@ if (!fd_render_condition_check(pctx)) return; - fd_fence_ref(&ctx->last_fence, NULL); - if (ctx->in_blit) { fd_batch_reset(batch); fd_context_all_dirty(ctx); @@ -359,6 +368,12 @@ mtx_unlock(&ctx->screen->lock); + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_written()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); + DBG("%p: %x %ux%u depth=%f, stencil=%u (%s/%s)", batch, buffers, pfb->width, pfb->height, depth, stencil, util_format_short_name(pipe_surface_format(pfb->cbufs[0])), @@ -457,7 +472,7 @@ batch->needs_flush = true; ctx->launch_grid(ctx, info); - fd_batch_flush(batch, false); + fd_batch_flush(batch); fd_batch_reference(&ctx->batch, save_batch); fd_context_all_dirty(ctx); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_fence.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_fence.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -48,7 +48,7 @@ static void fence_flush(struct pipe_fence_handle *fence) { if (fence->batch) - fd_batch_flush(fence->batch, true); + fd_batch_flush(fence->batch); debug_assert(!fence->batch); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_gmem.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_gmem.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_gmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_gmem.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,10 +25,11 @@ */ #include "pipe/p_state.h" +#include "util/hash_table.h" #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_gmem.h" #include "freedreno_context.h" @@ -68,6 +69,44 @@ #define BIN_DEBUG 0 +/* + * GMEM Cache: + * + * Caches GMEM state based on a given framebuffer state. The key is + * meant to be the minimal set of data that results in a unique gmem + * configuration, avoiding multiple keys arriving at the same gmem + * state. For example, the render target format is not part of the + * key, only the size per pixel. And the max_scissor bounds is not + * part of they key, only the minx/miny (after clamping to tile + * alignment) and width/height. This ensures that slightly different + * max_scissor which would result in the same gmem state, do not + * become different keys that map to the same state. + */ + +struct gmem_key { + uint16_t minx, miny; + uint16_t width, height; + uint8_t gmem_page_align; /* alignment in multiples of 0x1000 to reduce key size */ + uint8_t nr_cbufs; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; + uint8_t zsbuf_cpp[2]; +}; + +static uint32_t +gmem_key_hash(const void *_key) +{ + const struct gmem_key *key = _key; + return _mesa_hash_data(key, sizeof(*key)); +} + +static bool +gmem_key_equals(const void *_a, const void *_b) +{ + const struct gmem_key *a = _a; + const struct gmem_key *b = _b; + return memcmp(a, b, sizeof(*a)) == 0; +} + static uint32_t bin_width(struct fd_screen *screen) { if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) @@ -78,148 +117,97 @@ } static uint32_t -total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2], - uint32_t bin_w, uint32_t bin_h, uint32_t gmem_align, - struct fd_gmem_stateobj *gmem) +total_size(struct gmem_key *key, uint32_t bin_w, uint32_t bin_h, + struct fd_gmem_stateobj *gmem) { + uint32_t gmem_align = key->gmem_page_align * 0x1000; uint32_t total = 0, i; for (i = 0; i < MAX_RENDER_TARGETS; i++) { - if (cbuf_cpp[i]) { + if (key->cbuf_cpp[i]) { gmem->cbuf_base[i] = align(total, gmem_align); - total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h; + total = gmem->cbuf_base[i] + key->cbuf_cpp[i] * bin_w * bin_h; } } - if (zsbuf_cpp[0]) { + if (key->zsbuf_cpp[0]) { gmem->zsbuf_base[0] = align(total, gmem_align); - total = gmem->zsbuf_base[0] + zsbuf_cpp[0] * bin_w * bin_h; + total = gmem->zsbuf_base[0] + key->zsbuf_cpp[0] * bin_w * bin_h; } - if (zsbuf_cpp[1]) { + if (key->zsbuf_cpp[1]) { gmem->zsbuf_base[1] = align(total, gmem_align); - total = gmem->zsbuf_base[1] + zsbuf_cpp[1] * bin_w * bin_h; + total = gmem->zsbuf_base[1] + key->zsbuf_cpp[1] * bin_w * bin_h; } return total; } -static void -calculate_tiles(struct fd_batch *batch) +static struct fd_gmem_stateobj * +gmem_stateobj_init(struct fd_screen *screen, struct gmem_key *key) { - struct fd_context *ctx = batch->ctx; - struct fd_screen *screen = ctx->screen; - struct fd_gmem_stateobj *gmem = &ctx->gmem; - struct pipe_scissor_state *scissor = &batch->max_scissor; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_gmem_stateobj *gmem = + rzalloc(screen->gmem_cache.ht, struct fd_gmem_stateobj); + pipe_reference_init(&gmem->reference, 1); + gmem->screen = screen; + gmem->key = key; + list_inithead(&gmem->node); + const uint32_t gmem_alignw = screen->gmem_alignw; const uint32_t gmem_alignh = screen->gmem_alignh; const unsigned npipes = screen->num_vsc_pipes; const uint32_t gmem_size = screen->gmemsize_bytes; - uint32_t minx, miny, width, height; uint32_t nbins_x = 1, nbins_y = 1; uint32_t bin_w, bin_h; - uint32_t gmem_align = 0x4000; uint32_t max_width = bin_width(screen); - uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0}; uint32_t i, j, t, xoff, yoff; uint32_t tpp_x, tpp_y; - bool has_zs = !!(batch->gmem_reason & (FD_GMEM_DEPTH_ENABLED | - FD_GMEM_STENCIL_ENABLED | FD_GMEM_CLEARS_DEPTH_STENCIL)); int tile_n[npipes]; - if (has_zs) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - zsbuf_cpp[0] = rsc->cpp; - if (rsc->stencil) - zsbuf_cpp[1] = rsc->stencil->cpp; - } else { - /* we might have a zsbuf, but it isn't used */ - batch->restore &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - } - for (i = 0; i < pfb->nr_cbufs; i++) { - if (pfb->cbufs[i]) - cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format); - else - cbuf_cpp[i] = 4; - /* if MSAA, color buffers are super-sampled in GMEM: */ - cbuf_cpp[i] *= pfb->samples; - } - - if (!memcmp(gmem->zsbuf_cpp, zsbuf_cpp, sizeof(zsbuf_cpp)) && - !memcmp(gmem->cbuf_cpp, cbuf_cpp, sizeof(cbuf_cpp)) && - !memcmp(&gmem->scissor, scissor, sizeof(gmem->scissor))) { - /* everything is up-to-date */ - return; - } - - if (fd_mesa_debug & FD_DBG_NOSCIS) { - minx = 0; - miny = 0; - width = pfb->width; - height = pfb->height; - } else { - /* round down to multiple of alignment: */ - minx = scissor->minx & ~(gmem_alignw - 1); - miny = scissor->miny & ~(gmem_alignh - 1); - width = scissor->maxx - minx; - height = scissor->maxy - miny; - } - - bin_w = align(width, gmem_alignw); - bin_h = align(height, gmem_alignh); + bin_w = align(key->width, gmem_alignw); + bin_h = align(key->height, gmem_alignh); /* first, find a bin width that satisfies the maximum width * restrictions: */ while (bin_w > max_width) { nbins_x++; - bin_w = align(width / nbins_x, gmem_alignw); + bin_w = align(key->width / nbins_x, gmem_alignw); } if (fd_mesa_debug & FD_DBG_MSGS) { debug_printf("binning input: cbuf cpp:"); - for (i = 0; i < pfb->nr_cbufs; i++) - debug_printf(" %d", cbuf_cpp[i]); + for (i = 0; i < key->nr_cbufs; i++) + debug_printf(" %d", key->cbuf_cpp[i]); debug_printf(", zsbuf cpp: %d; %dx%d\n", - zsbuf_cpp[0], width, height); - } - - if (is_a20x(screen) && batch->cleared) { - /* under normal circumstances the requirement would be 4K - * but the fast clear path requires an alignment of 32K - */ - gmem_align = 0x8000; + key->zsbuf_cpp[0], key->width, key->height); } /* then find a bin width/height that satisfies the memory * constraints: */ - while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem_align, gmem) > - gmem_size) { + while (total_size(key, bin_w, bin_h, gmem) > gmem_size) { if (bin_w > bin_h) { nbins_x++; - bin_w = align(width / nbins_x, gmem_alignw); + bin_w = align(key->width / nbins_x, gmem_alignw); } else { nbins_y++; - bin_h = align(height / nbins_y, gmem_alignh); + bin_h = align(key->height / nbins_y, gmem_alignh); } } DBG("using %d bins of size %dx%d", nbins_x*nbins_y, bin_w, bin_h); - gmem->scissor = *scissor; - memcpy(gmem->cbuf_cpp, cbuf_cpp, sizeof(cbuf_cpp)); - memcpy(gmem->zsbuf_cpp, zsbuf_cpp, sizeof(zsbuf_cpp)); + memcpy(gmem->cbuf_cpp, key->cbuf_cpp, sizeof(key->cbuf_cpp)); + memcpy(gmem->zsbuf_cpp, key->zsbuf_cpp, sizeof(key->zsbuf_cpp)); gmem->bin_h = bin_h; gmem->bin_w = bin_w; gmem->nbins_x = nbins_x; gmem->nbins_y = nbins_y; - gmem->minx = minx; - gmem->miny = miny; - gmem->width = width; - gmem->height = height; + gmem->minx = key->minx; + gmem->miny = key->miny; + gmem->width = key->width; + gmem->height = key->height; /* * Assign tiles and pipes: @@ -231,7 +219,7 @@ #define div_round_up(v, a) (((v) + (a) - 1) / (a)) /* figure out number of tiles per pipe: */ - if (is_a20x(ctx->screen)) { + if (is_a20x(screen)) { /* for a20x we want to minimize the number of "pipes" * binning data has 3 bits for x/y (8x8) but the edges are used to * cull off-screen vertices with hw binning, so we have 6x6 pipes @@ -253,7 +241,7 @@ /* configure pipes: */ xoff = yoff = 0; for (i = 0; i < npipes; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; if (xoff >= nbins_x) { xoff = 0; @@ -276,14 +264,14 @@ gmem->num_vsc_pipes = MAX2(1, i); for (; i < npipes; i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; pipe->x = pipe->y = pipe->w = pipe->h = 0; } if (BIN_DEBUG) { printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y); - for (i = 0; i < ARRAY_SIZE(ctx->vsc_pipe); i++) { - struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + for (i = 0; i < ARRAY_SIZE(gmem->vsc_pipe); i++) { + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; printf("pipe[%d]: %ux%u @ %u,%u\n", i, pipe->w, pipe->h, pipe->x, pipe->y); } @@ -291,29 +279,29 @@ /* configure tiles: */ t = 0; - yoff = miny; + yoff = key->miny; memset(tile_n, 0, sizeof(tile_n)); for (i = 0; i < nbins_y; i++) { uint32_t bw, bh; - xoff = minx; + xoff = key->minx; /* clip bin height: */ - bh = MIN2(bin_h, miny + height - yoff); + bh = MIN2(bin_h, key->miny + key->height - yoff); for (j = 0; j < nbins_x; j++) { - struct fd_tile *tile = &ctx->tile[t]; + struct fd_tile *tile = &gmem->tile[t]; uint32_t p; - assert(t < ARRAY_SIZE(ctx->tile)); + assert(t < ARRAY_SIZE(gmem->tile)); /* pipe number: */ p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x); assert(p < gmem->num_vsc_pipes); /* clip bin width: */ - bw = MIN2(bin_w, minx + width - xoff); - tile->n = !is_a20x(ctx->screen) ? tile_n[p]++ : + bw = MIN2(bin_w, key->minx + key->width - xoff); + tile->n = !is_a20x(screen) ? tile_n[p]++ : ((i % tpp_y + 1) << 3 | (j % tpp_x + 1)); tile->p = p; tile->bin_w = bw; @@ -338,28 +326,149 @@ t = 0; for (i = 0; i < nbins_y; i++) { for (j = 0; j < nbins_x; j++) { - struct fd_tile *tile = &ctx->tile[t++]; + struct fd_tile *tile = &gmem->tile[t++]; printf("|p:%u n:%u|", tile->p, tile->n); } printf("\n"); } } + + return gmem; +} + +void +__fd_gmem_destroy(struct fd_gmem_stateobj *gmem) +{ + struct fd_gmem_cache *cache = &gmem->screen->gmem_cache; + + pipe_mutex_assert_locked(gmem->screen->lock); + + _mesa_hash_table_remove_key(cache->ht, gmem->key); + list_del(&gmem->node); + + ralloc_free(gmem->key); + ralloc_free(gmem); } +static struct gmem_key * +key_init(struct fd_batch *batch) +{ + struct fd_screen *screen = batch->ctx->screen; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + bool has_zs = pfb->zsbuf && !!(batch->gmem_reason & (FD_GMEM_DEPTH_ENABLED | + FD_GMEM_STENCIL_ENABLED | FD_GMEM_CLEARS_DEPTH_STENCIL)); + struct gmem_key *key = rzalloc(screen->gmem_cache.ht, struct gmem_key); + + if (has_zs) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + key->zsbuf_cpp[0] = rsc->layout.cpp; + if (rsc->stencil) + key->zsbuf_cpp[1] = rsc->stencil->layout.cpp; + } else { + /* we might have a zsbuf, but it isn't used */ + batch->restore &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + } + + key->nr_cbufs = pfb->nr_cbufs; + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (pfb->cbufs[i]) + key->cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format); + else + key->cbuf_cpp[i] = 4; + /* if MSAA, color buffers are super-sampled in GMEM: */ + key->cbuf_cpp[i] *= pfb->samples; + } + + if (fd_mesa_debug & FD_DBG_NOSCIS) { + key->minx = 0; + key->miny = 0; + key->width = pfb->width; + key->height = pfb->height; + } else { + struct pipe_scissor_state *scissor = &batch->max_scissor; + + /* round down to multiple of alignment: */ + key->minx = scissor->minx & ~(screen->gmem_alignw - 1); + key->miny = scissor->miny & ~(screen->gmem_alignh - 1); + key->width = scissor->maxx - key->minx; + key->height = scissor->maxy - key->miny; + } + + if (is_a20x(screen) && batch->cleared) { + /* under normal circumstances the requirement would be 4K + * but the fast clear path requires an alignment of 32K + */ + key->gmem_page_align = 8; + } else { + // TODO re-check this across gens.. maybe it should only + // be a single page in some cases: + key->gmem_page_align = 4; + } + + return key; +} + +static struct fd_gmem_stateobj * +lookup_gmem_state(struct fd_batch *batch) +{ + struct fd_screen *screen = batch->ctx->screen; + struct fd_gmem_cache *cache = &screen->gmem_cache; + struct fd_gmem_stateobj *gmem = NULL; + struct gmem_key *key = key_init(batch); + uint32_t hash = gmem_key_hash(key); + + mtx_lock(&screen->lock); + + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); + if (entry) { + ralloc_free(key); + goto found; + } + + /* limit the # of cached gmem states, discarding the least + * recently used state if needed: + */ + if (cache->ht->entries >= 20) { + struct fd_gmem_stateobj *last = + list_last_entry(&cache->lru, struct fd_gmem_stateobj, node); + fd_gmem_reference(&last, NULL); + } + + entry = _mesa_hash_table_insert_pre_hashed(cache->ht, + hash, key, gmem_stateobj_init(screen, key)); + +found: + fd_gmem_reference(&gmem, entry->data); + /* Move to the head of the LRU: */ + list_delinit(&gmem->node); + list_add(&gmem->node, &cache->lru); + + mtx_unlock(&screen->lock); + + return gmem; +} + +/* + * GMEM render pass + */ + static void -render_tiles(struct fd_batch *batch) +render_tiles(struct fd_batch *batch, struct fd_gmem_stateobj *gmem) { struct fd_context *ctx = batch->ctx; - struct fd_gmem_stateobj *gmem = &ctx->gmem; int i; + mtx_lock(&ctx->gmem_lock); + ctx->emit_tile_init(batch); if (batch->restore) ctx->stats.batch_restore++; for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) { - struct fd_tile *tile = &ctx->tile[i]; + struct fd_tile *tile = &gmem->tile[i]; DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d", tile->bin_h, tile->yoff, tile->bin_w, tile->xoff); @@ -389,6 +498,8 @@ if (ctx->emit_tile_fini) ctx->emit_tile_fini(batch); + + mtx_unlock(&ctx->gmem_lock); } static void @@ -446,6 +557,26 @@ } } + if (fd_mesa_debug & FD_DBG_NOGMEM) + sysmem = true; + + /* Layered rendering always needs bypass. */ + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + struct pipe_surface *psurf = pfb->cbufs[i]; + if (!psurf) + continue; + if (psurf->u.tex.first_layer < psurf->u.tex.last_layer) + sysmem = true; + } + + /* Tessellation doesn't seem to support tiled rendering so fall back to + * bypass. + */ + if (batch->tessellation) { + debug_assert(ctx->emit_sysmem_prep); + sysmem = true; + } + fd_reset_wfi(batch); ctx->stats.batch_total++; @@ -464,15 +595,21 @@ render_sysmem(batch); ctx->stats.batch_sysmem++; } else { - struct fd_gmem_stateobj *gmem = &ctx->gmem; - calculate_tiles(batch); + struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch); + batch->gmem_state = gmem; DBG("%p: rendering %dx%d tiles %ux%u (%s/%s)", batch, pfb->width, pfb->height, gmem->nbins_x, gmem->nbins_y, util_format_short_name(pipe_surface_format(pfb->cbufs[0])), util_format_short_name(pipe_surface_format(pfb->zsbuf))); if (ctx->query_prepare) ctx->query_prepare(batch, gmem->nbins_x * gmem->nbins_y); - render_tiles(batch); + render_tiles(batch, gmem); + batch->gmem_state = NULL; + + mtx_lock(&ctx->screen->lock); + fd_gmem_reference(&gmem, NULL); + mtx_unlock(&ctx->screen->lock); + ctx->stats.batch_gmem++; } @@ -485,7 +622,7 @@ * case would be a single clear. */ bool -fd_gmem_needs_restore(struct fd_batch *batch, struct fd_tile *tile, +fd_gmem_needs_restore(struct fd_batch *batch, const struct fd_tile *tile, uint32_t buffers) { if (!(batch->restore & buffers)) @@ -493,3 +630,20 @@ return true; } + +void +fd_gmem_screen_init(struct pipe_screen *pscreen) +{ + struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; + + cache->ht = _mesa_hash_table_create(NULL, gmem_key_hash, gmem_key_equals); + list_inithead(&cache->lru); +} + +void +fd_gmem_screen_fini(struct pipe_screen *pscreen) +{ + struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; + + _mesa_hash_table_destroy(cache->ht, NULL); +} diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_gmem.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_gmem.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_gmem.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_gmem.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,14 +27,13 @@ #ifndef FREEDRENO_GMEM_H_ #define FREEDRENO_GMEM_H_ -#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/list.h" #include "freedreno_util.h" /* per-pipe configuration for hw binning: */ struct fd_vsc_pipe { - // TODO a3xx/a4xx/a5xx could probably move to single bo for vsc stream, like a6xx does - struct fd_bo *bo; uint8_t x, y, w, h; /* VSC_PIPE[p].CONFIG */ }; @@ -47,7 +46,10 @@ }; struct fd_gmem_stateobj { - struct pipe_scissor_state scissor; + struct pipe_reference reference; + struct fd_screen *screen; + void *key; + uint32_t cbuf_base[MAX_RENDER_TARGETS]; uint32_t zsbuf_base[2]; uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; @@ -58,13 +60,40 @@ uint16_t width, height; uint16_t maxpw, maxph; /* maximum pipe width/height */ uint8_t num_vsc_pipes; /* number of pipes for a20x */ + + struct fd_vsc_pipe vsc_pipe[32]; + struct fd_tile tile[512]; + + struct list_head node; +}; + +void __fd_gmem_destroy(struct fd_gmem_stateobj *gmem); + +static inline void +fd_gmem_reference(struct fd_gmem_stateobj **ptr, struct fd_gmem_stateobj *gmem) +{ + struct fd_gmem_stateobj *old_gmem = *ptr; + + if (pipe_reference(&(*ptr)->reference, &gmem->reference)) + __fd_gmem_destroy(old_gmem); + + *ptr = gmem; +} + +struct fd_gmem_cache { + struct hash_table *ht; + struct list_head lru; }; struct fd_batch; void fd_gmem_render_tiles(struct fd_batch *batch); -bool fd_gmem_needs_restore(struct fd_batch *batch, struct fd_tile *tile, +bool fd_gmem_needs_restore(struct fd_batch *batch, const struct fd_tile *tile, uint32_t buffers); +struct pipe_screen; +void fd_gmem_screen_init(struct pipe_screen *pscreen); +void fd_gmem_screen_fini(struct pipe_screen *pscreen); + #endif /* FREEDRENO_GMEM_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_perfcntr.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_perfcntr.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_perfcntr.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_perfcntr.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2018 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#ifndef FREEDRENO_PERFCNTR_H_ -#define FREEDRENO_PERFCNTR_H_ - -#include "pipe/p_defines.h" - -/* - * Mapping very closely to the AMD_performance_monitor extension, adreno has - * groups of performance counters where each group has N counters, which can - * select from M different countables (things that can be counted), where - * generally M > N. - */ - -/* Describes a single counter: */ -struct fd_perfcntr_counter { - /* offset of the select register to choose what to count: */ - unsigned select_reg; - /* offset of the lo/hi 32b to read current counter value: */ - unsigned counter_reg_lo; - unsigned counter_reg_hi; - /* Optional, most counters don't have enable/clear registers: */ - unsigned enable; - unsigned clear; -}; - -/* Describes a single countable: */ -struct fd_perfcntr_countable { - const char *name; - /* selector register enum value to select this countable: */ - unsigned selector; - - /* description of the countable: */ - enum pipe_driver_query_type query_type; - enum pipe_driver_query_result_type result_type; -}; - -/* Describes an entire counter group: */ -struct fd_perfcntr_group { - const char *name; - unsigned num_counters; - const struct fd_perfcntr_counter *counters; - unsigned num_countables; - const struct fd_perfcntr_countable *countables; -}; - - -#endif /* FREEDRENO_PERFCNTR_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_program.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_program.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,24 +31,51 @@ #include "freedreno_context.h" static void -fd_fp_state_bind(struct pipe_context *pctx, void *hwcso) +fd_vs_state_bind(struct pipe_context *pctx, void *hwcso) { struct fd_context *ctx = fd_context(pctx); - ctx->prog.fp = hwcso; - ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG; + ctx->prog.vs = hwcso; + ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG; ctx->dirty |= FD_DIRTY_PROG; } static void -fd_vp_state_bind(struct pipe_context *pctx, void *hwcso) +fd_tcs_state_bind(struct pipe_context *pctx, void *hwcso) { struct fd_context *ctx = fd_context(pctx); - ctx->prog.vp = hwcso; - ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG; + ctx->prog.hs = hwcso; + ctx->dirty_shader[PIPE_SHADER_TESS_CTRL] |= FD_DIRTY_SHADER_PROG; + ctx->dirty |= FD_DIRTY_PROG; +} + +static void +fd_tes_state_bind(struct pipe_context *pctx, void *hwcso) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->prog.ds = hwcso; + ctx->dirty_shader[PIPE_SHADER_TESS_EVAL] |= FD_DIRTY_SHADER_PROG; ctx->dirty |= FD_DIRTY_PROG; } -static const char *solid_fp = +static void +fd_gs_state_bind(struct pipe_context *pctx, void *hwcso) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->prog.gs = hwcso; + ctx->dirty_shader[PIPE_SHADER_GEOMETRY] |= FD_DIRTY_SHADER_PROG; + ctx->dirty |= FD_DIRTY_PROG; +} + +static void +fd_fs_state_bind(struct pipe_context *pctx, void *hwcso) +{ + struct fd_context *ctx = fd_context(pctx); + ctx->prog.fs = hwcso; + ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG; + ctx->dirty |= FD_DIRTY_PROG; +} + +static const char *solid_fs = "FRAG \n" "PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1 \n" "DCL CONST[0] \n" @@ -56,14 +83,14 @@ " 0: MOV OUT[0], CONST[0] \n" " 1: END \n"; -static const char *solid_vp = +static const char *solid_vs = "VERT \n" "DCL IN[0] \n" "DCL OUT[0], POSITION \n" " 0: MOV OUT[0], IN[0] \n" " 1: END \n"; -static const char *blit_vp = +static const char *blit_vs = "VERT \n" "DCL IN[0] \n" "DCL IN[1] \n" @@ -126,26 +153,29 @@ struct fd_context *ctx = fd_context(pctx); int i; - pctx->bind_fs_state = fd_fp_state_bind; - pctx->bind_vs_state = fd_vp_state_bind; - - ctx->solid_prog.fp = assemble_tgsi(pctx, solid_fp, true); - ctx->solid_prog.vp = assemble_tgsi(pctx, solid_vp, false); - ctx->blit_prog[0].vp = assemble_tgsi(pctx, blit_vp, false); - ctx->blit_prog[0].fp = fd_prog_blit(pctx, 1, false); + pctx->bind_vs_state = fd_vs_state_bind; + pctx->bind_tcs_state = fd_tcs_state_bind; + pctx->bind_tes_state = fd_tes_state_bind; + pctx->bind_gs_state = fd_gs_state_bind; + pctx->bind_fs_state = fd_fs_state_bind; + + ctx->solid_prog.fs = assemble_tgsi(pctx, solid_fs, true); + ctx->solid_prog.vs = assemble_tgsi(pctx, solid_vs, false); + ctx->blit_prog[0].vs = assemble_tgsi(pctx, blit_vs, false); + ctx->blit_prog[0].fs = fd_prog_blit(pctx, 1, false); if (ctx->screen->gpu_id < 300) return; for (i = 1; i < ctx->screen->max_rts; i++) { - ctx->blit_prog[i].vp = ctx->blit_prog[0].vp; - ctx->blit_prog[i].fp = fd_prog_blit(pctx, i + 1, false); + ctx->blit_prog[i].vs = ctx->blit_prog[0].vs; + ctx->blit_prog[i].fs = fd_prog_blit(pctx, i + 1, false); } - ctx->blit_z.vp = ctx->blit_prog[0].vp; - ctx->blit_z.fp = fd_prog_blit(pctx, 0, true); - ctx->blit_zs.vp = ctx->blit_prog[0].vp; - ctx->blit_zs.fp = fd_prog_blit(pctx, 1, true); + ctx->blit_z.vs = ctx->blit_prog[0].vs; + ctx->blit_z.fs = fd_prog_blit(pctx, 0, true); + ctx->blit_zs.vs = ctx->blit_prog[0].vs; + ctx->blit_zs.fs = fd_prog_blit(pctx, 1, true); } void fd_prog_fini(struct pipe_context *pctx) @@ -153,11 +183,11 @@ struct fd_context *ctx = fd_context(pctx); int i; - pctx->delete_vs_state(pctx, ctx->solid_prog.vp); - pctx->delete_fs_state(pctx, ctx->solid_prog.fp); - pctx->delete_vs_state(pctx, ctx->blit_prog[0].vp); + pctx->delete_vs_state(pctx, ctx->solid_prog.vs); + pctx->delete_fs_state(pctx, ctx->solid_prog.fs); + pctx->delete_vs_state(pctx, ctx->blit_prog[0].vs); for (i = 0; i < ctx->screen->max_rts; i++) - pctx->delete_fs_state(pctx, ctx->blit_prog[i].fp); - pctx->delete_fs_state(pctx, ctx->blit_z.fp); - pctx->delete_fs_state(pctx, ctx->blit_zs.fp); + pctx->delete_fs_state(pctx, ctx->blit_prog[i].fs); + pctx->delete_fs_state(pctx, ctx->blit_z.fs); + pctx->delete_fs_state(pctx, ctx->blit_zs.fs); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_acc.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_acc.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_acc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_acc.c 2020-06-12 01:21:17.000000000 +0000 @@ -91,7 +91,7 @@ p->resume(aq, batch); /* add to active list: */ - assert(list_empty(&aq->node)); + assert(list_is_empty(&aq->node)); list_addtail(&aq->node, &ctx->acc_active_queries); return true; @@ -123,7 +123,7 @@ DBG("%p: wait=%d, active=%d", q, wait, q->active); - assert(LIST_IS_EMPTY(&aq->node)); + assert(list_is_empty(&aq->node)); /* if !wait, then check the last sample (the one most likely to * not be ready yet) and bail if it is not ready: @@ -139,7 +139,7 @@ * spin forever: */ if (aq->no_wait_cnt++ > 5) - fd_batch_flush(rsc->write_batch, false); + fd_batch_flush(rsc->write_batch); return false; } @@ -152,7 +152,7 @@ } if (rsc->write_batch) - fd_batch_flush(rsc->write_batch, true); + fd_batch_flush(rsc->write_batch); /* get the result: */ fd_bo_cpu_prep(rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_READ); @@ -173,7 +173,7 @@ struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type, - const struct fd_acc_sample_provider *provider) + unsigned index, const struct fd_acc_sample_provider *provider) { struct fd_acc_query *aq; struct fd_query *q; @@ -192,19 +192,21 @@ q = &aq->base; q->funcs = &acc_query_funcs; q->type = query_type; + q->index = index; return q; } struct fd_query * -fd_acc_create_query(struct fd_context *ctx, unsigned query_type) +fd_acc_create_query(struct fd_context *ctx, unsigned query_type, + unsigned index) { int idx = pidx(query_type); if ((idx < 0) || !ctx->acc_sample_providers[idx]) return NULL; - return fd_acc_create_query2(ctx, query_type, + return fd_acc_create_query2(ctx, query_type, index, ctx->acc_sample_providers[idx]); } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_acc.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_acc.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_acc.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_acc.h 2020-06-12 01:21:17.000000000 +0000 @@ -97,9 +97,10 @@ return (struct fd_acc_query *)q; } -struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type); +struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type, + unsigned index); struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type, - const struct fd_acc_sample_provider *provider); + unsigned index, const struct fd_acc_sample_provider *provider); void fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage); void fd_acc_query_register_provider(struct pipe_context *pctx, const struct fd_acc_sample_provider *provider); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -41,11 +41,12 @@ fd_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index) { struct fd_context *ctx = fd_context(pctx); - struct fd_query *q; + struct fd_query *q = NULL; - q = fd_sw_create_query(ctx, query_type); - if (!q && ctx->create_query) - q = ctx->create_query(ctx, query_type); + if (ctx->create_query) + q = ctx->create_query(ctx, query_type, index); + if (!q) + q = fd_sw_create_query(ctx, query_type, index); return (struct pipe_query *) q; } @@ -191,6 +192,41 @@ { } +static enum pipe_driver_query_type +query_type(enum fd_perfcntr_type type) +{ +#define ENUM(t) case FD_PERFCNTR_ ## t: return PIPE_DRIVER_QUERY_ ## t + switch (type) { + ENUM(TYPE_UINT64); + ENUM(TYPE_UINT); + ENUM(TYPE_FLOAT); + ENUM(TYPE_PERCENTAGE); + ENUM(TYPE_BYTES); + ENUM(TYPE_MICROSECONDS); + ENUM(TYPE_HZ); + ENUM(TYPE_DBM); + ENUM(TYPE_TEMPERATURE); + ENUM(TYPE_VOLTS); + ENUM(TYPE_AMPS); + ENUM(TYPE_WATTS); + default: + unreachable("bad type"); + return 0; + } +} + +static enum pipe_driver_query_result_type +query_result_type(enum fd_perfcntr_result_type type) +{ + switch (type) { + ENUM(RESULT_TYPE_AVERAGE); + ENUM(RESULT_TYPE_CUMULATIVE); + default: + unreachable("bad type"); + return 0; + } +} + static void setup_perfcntr_query_info(struct fd_screen *screen) { @@ -214,8 +250,8 @@ info->name = c->name; info->query_type = FD_QUERY_FIRST_PERFCNTR + idx; - info->type = c->query_type; - info->result_type = c->result_type; + info->type = query_type(c->query_type); + info->result_type = query_result_type(c->result_type); info->group_id = i; info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query.h 2020-06-12 01:21:17.000000000 +0000 @@ -46,6 +46,7 @@ const struct fd_query_funcs *funcs; bool active; int type; + unsigned index; }; static inline struct fd_query * @@ -102,6 +103,12 @@ return 3; case PIPE_QUERY_TIMESTAMP: return 4; + + case PIPE_QUERY_PRIMITIVES_GENERATED: + return 5; + case PIPE_QUERY_PRIMITIVES_EMITTED: + return 6; + default: return -1; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_hw.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_hw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_hw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_hw.c 2020-06-12 01:21:17.000000000 +0000 @@ -147,7 +147,7 @@ resume_query(batch, hq, batch->draw); /* add to active list: */ - assert(list_empty(&hq->list)); + assert(list_is_empty(&hq->list)); list_addtail(&hq->list, &ctx->hw_active_queries); return true; @@ -184,10 +184,10 @@ DBG("%p: wait=%d, active=%d", q, wait, q->active); - if (LIST_IS_EMPTY(&hq->periods)) + if (list_is_empty(&hq->periods)) return true; - assert(LIST_IS_EMPTY(&hq->list)); + assert(list_is_empty(&hq->list)); assert(!hq->period); /* if !wait, then check the last sample (the one most likely to @@ -209,7 +209,7 @@ * spin forever: */ if (hq->no_wait_cnt++ > 5) - fd_batch_flush(rsc->write_batch, false); + fd_batch_flush(rsc->write_batch); return false; } @@ -237,7 +237,7 @@ struct fd_resource *rsc = fd_resource(start->prsc); if (rsc->write_batch) - fd_batch_flush(rsc->write_batch, true); + fd_batch_flush(rsc->write_batch); /* some piglit tests at least do query with no draws, I guess: */ if (!rsc->bo) @@ -266,7 +266,7 @@ }; struct fd_query * -fd_hw_create_query(struct fd_context *ctx, unsigned query_type) +fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index) { struct fd_hw_query *hq; struct fd_query *q; @@ -289,6 +289,7 @@ q = &hq->base; q->funcs = &hw_query_funcs; q->type = query_type; + q->index = index; return q; } diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_hw.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_hw.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_hw.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_hw.h 2020-06-12 01:21:17.000000000 +0000 @@ -136,7 +136,7 @@ return (struct fd_hw_query *)q; } -struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type); +struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index); /* helper for sample providers: */ struct fd_hw_sample * fd_hw_sample_init(struct fd_batch *batch, uint32_t size); /* don't call directly, use fd_hw_sample_reference() */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_sw.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_sw.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_sw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_sw.c 2020-06-12 01:21:17.000000000 +0000 @@ -162,7 +162,7 @@ }; struct fd_query * -fd_sw_create_query(struct fd_context *ctx, unsigned query_type) +fd_sw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index) { struct fd_sw_query *sq; struct fd_query *q; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_sw.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_sw.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_query_sw.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_query_sw.h 2020-06-12 01:21:17.000000000 +0000 @@ -48,6 +48,6 @@ } struct fd_query * fd_sw_create_query(struct fd_context *ctx, - unsigned query_type); + unsigned query_type, unsigned index); #endif /* FREEDRENO_QUERY_SW_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_resource.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_resource.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,9 +24,9 @@ * Rob Clark */ -#include "util/u_format.h" -#include "util/u_format_rgtc.h" -#include "util/u_format_zs.h" +#include "util/format/u_format.h" +#include "util/format/u_format_rgtc.h" +#include "util/format/u_format_zs.h" #include "util/u_inlines.h" #include "util/u_transfer.h" #include "util/u_string.h" @@ -132,7 +132,7 @@ fd_bo_del(rsc->bo); rsc->bo = fd_bo_new(screen->dev, size, flags, "%ux%ux%u@%u:%x", - prsc->width0, prsc->height0, prsc->depth0, rsc->cpp, prsc->bind); + prsc->width0, prsc->height0, prsc->depth0, rsc->layout.cpp, prsc->bind); rsc->seqno = p_atomic_inc_return(&screen->rsc_seqno); util_range_set_empty(&rsc->valid_buffer_range); fd_bc_invalidate_resource(rsc, true); @@ -223,10 +223,11 @@ /* TODO valid_buffer_range?? */ swap(rsc->bo, shadow->bo); swap(rsc->write_batch, shadow->write_batch); - swap(rsc->offset, shadow->offset); - swap(rsc->ubwc_offset, shadow->ubwc_offset); - swap(rsc->ubwc_pitch, shadow->ubwc_pitch); - swap(rsc->ubwc_size, shadow->ubwc_size); + for (int level = 0; level <= prsc->last_level; level++) { + swap(rsc->layout.slices[level], shadow->layout.slices[level]); + swap(rsc->layout.ubwc_slices[level], shadow->layout.ubwc_slices[level]); + } + swap(rsc->layout.ubwc_size, shadow->layout.ubwc_size); rsc->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); /* at this point, the newly created shadow buffer is not referenced @@ -419,7 +420,7 @@ struct fd_resource *rsc = fd_resource(ptrans->resource); if (ptrans->resource->target == PIPE_BUFFER) - util_range_add(&rsc->valid_buffer_range, + util_range_add(&rsc->base, &rsc->valid_buffer_range, ptrans->box.x + box->x, ptrans->box.x + box->x + box->width); } @@ -449,15 +450,14 @@ mtx_unlock(&ctx->screen->lock); foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) - fd_batch_flush(batch, false); + fd_batch_flush(batch); foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) { - fd_batch_sync(batch); fd_batch_reference(&batches[batch->idx], NULL); } assert(rsc->batch_mask == 0); } else if (write_batch) { - fd_batch_flush(write_batch, true); + fd_batch_flush(write_batch); } fd_batch_reference(&write_batch, NULL); @@ -489,7 +489,7 @@ fd_bo_cpu_fini(rsc->bo); } - util_range_add(&rsc->valid_buffer_range, + util_range_add(&rsc->base, &rsc->valid_buffer_range, ptrans->box.x, ptrans->box.x + ptrans->box.width); @@ -506,7 +506,7 @@ { struct fd_context *ctx = fd_context(pctx); struct fd_resource *rsc = fd_resource(prsc); - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); + struct fdl_slice *slice = fd_resource_slice(rsc, level); struct fd_transfer *trans; struct pipe_transfer *ptrans; enum pipe_format format = prsc->format; @@ -530,8 +530,8 @@ ptrans->level = level; ptrans->usage = usage; ptrans->box = *box; - ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->cpp; - ptrans->layer_stride = rsc->layer_first ? rsc->layer_size : slice->size0; + ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->layout.cpp; + ptrans->layer_stride = fd_resource_layer_stride(rsc, level); /* we always need a staging texture for tiled buffers: * @@ -539,17 +539,18 @@ * splitting a batch.. for ex, mid-frame texture uploads to a tiled * texture. */ - if (rsc->tile_mode) { + if (rsc->layout.tile_mode) { struct fd_resource *staging_rsc; staging_rsc = fd_alloc_staging(ctx, rsc, level, box); if (staging_rsc) { + struct fdl_slice *staging_slice = + fd_resource_slice(staging_rsc, 0); // TODO for PIPE_TRANSFER_READ, need to do untiling blit.. trans->staging_prsc = &staging_rsc->base; trans->base.stride = util_format_get_nblocksx(format, - staging_rsc->slices[0].pitch) * staging_rsc->cpp; - trans->base.layer_stride = staging_rsc->layer_first ? - staging_rsc->layer_size : staging_rsc->slices[0].size0; + staging_slice->pitch) * staging_rsc->layout.cpp; + trans->base.layer_stride = fd_resource_layer_stride(staging_rsc, 0); trans->staging_box = *box; trans->staging_box.x = 0; trans->staging_box.y = 0; @@ -558,21 +559,6 @@ if (usage & PIPE_TRANSFER_READ) { fd_blit_to_staging(ctx, trans); - struct fd_batch *batch = NULL; - - fd_context_lock(ctx); - fd_batch_reference_locked(&batch, staging_rsc->write_batch); - fd_context_unlock(ctx); - - /* we can't fd_bo_cpu_prep() until the blit to staging - * is submitted to kernel.. in that case write_batch - * wouldn't be NULL yet: - */ - if (batch) { - fd_batch_sync(batch); - fd_batch_reference(&batch, NULL); - } - fd_bo_cpu_prep(staging_rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_READ); } @@ -660,11 +646,13 @@ */ staging_rsc = fd_alloc_staging(ctx, rsc, level, box); if (staging_rsc) { + struct fdl_slice *staging_slice = + fd_resource_slice(staging_rsc, 0); trans->staging_prsc = &staging_rsc->base; trans->base.stride = util_format_get_nblocksx(format, - staging_rsc->slices[0].pitch) * staging_rsc->cpp; - trans->base.layer_stride = staging_rsc->layer_first ? - staging_rsc->layer_size : staging_rsc->slices[0].size0; + staging_slice->pitch) * staging_rsc->layout.cpp; + trans->base.layer_stride = + fd_resource_layer_stride(staging_rsc, 0); trans->staging_box = *box; trans->staging_box.x = 0; trans->staging_box.y = 0; @@ -704,7 +692,7 @@ buf = fd_bo_map(rsc->bo); offset = box->y / util_format_get_blockheight(format) * ptrans->stride + - box->x / util_format_get_blockwidth(format) * rsc->cpp + + box->x / util_format_get_blockwidth(format) * rsc->layout.cpp + fd_resource_offset(rsc, level, box->z); if (usage & PIPE_TRANSFER_WRITE) @@ -737,10 +725,10 @@ static uint64_t fd_resource_modifier(struct fd_resource *rsc) { - if (!rsc->tile_mode) + if (!rsc->layout.tile_mode) return DRM_FORMAT_MOD_LINEAR; - if (rsc->ubwc_size) + if (rsc->layout.ubwc_size) return DRM_FORMAT_MOD_QCOM_COMPRESSED; /* TODO invent a modifier for tiled but not UBWC buffers: */ @@ -759,7 +747,7 @@ handle->modifier = fd_resource_modifier(rsc); return fd_screen_bo_get_handle(pscreen, rsc->bo, rsc->scanout, - rsc->slices[0].pitch * rsc->cpp, handle); + fd_resource_slice(rsc, 0)->pitch * rsc->layout.cpp, handle); } static uint32_t @@ -776,10 +764,10 @@ /* in layer_first layout, the level (slice) contains just one * layer (since in fact the layer contains the slices) */ - uint32_t layers_in_level = rsc->layer_first ? 1 : prsc->array_size; + uint32_t layers_in_level = rsc->layout.layer_first ? 1 : prsc->array_size; for (level = 0; level <= prsc->last_level; level++) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); + struct fdl_slice *slice = fd_resource_slice(rsc, level); uint32_t blocks; if (layout == UTIL_FORMAT_LAYOUT_ASTC) @@ -797,12 +785,12 @@ */ if (prsc->target == PIPE_TEXTURE_3D && ( level == 1 || - (level > 1 && rsc->slices[level - 1].size0 > 0xf000))) - slice->size0 = align(blocks * rsc->cpp, alignment); - else if (level == 0 || rsc->layer_first || alignment == 1) - slice->size0 = align(blocks * rsc->cpp, alignment); + (level > 1 && fd_resource_slice(rsc, level - 1)->size0 > 0xf000))) + slice->size0 = align(blocks * rsc->layout.cpp, alignment); + else if (level == 0 || rsc->layout.layer_first || alignment == 1) + slice->size0 = align(blocks * rsc->layout.cpp, alignment); else - slice->size0 = rsc->slices[level - 1].size0; + slice->size0 = fd_resource_slice(rsc, level - 1)->size0; size += slice->size0 * depth * layers_in_level; @@ -848,10 +836,10 @@ if (is_a4xx(screen)) { switch (rsc->base.target) { case PIPE_TEXTURE_3D: - rsc->layer_first = false; + rsc->layout.layer_first = false; break; default: - rsc->layer_first = true; + rsc->layout.layer_first = true; alignment = 1; break; } @@ -874,23 +862,18 @@ realloc_bo(rsc, fd_screen(prsc->screen)->setup_slices(rsc)); } -// TODO common helper? -static bool -has_depth(enum pipe_format format) +static void +fd_resource_layout_init(struct pipe_resource *prsc) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_Z32_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - return true; - default: - return false; - } + struct fd_resource *rsc = fd_resource(prsc); + struct fdl_layout *layout = &rsc->layout; + + layout->width0 = prsc->width0; + layout->height0 = prsc->height0; + layout->depth0 = prsc->depth0; + + layout->cpp = util_format_get_blocksize(prsc->format); + layout->cpp *= fd_resource_nr_samples(prsc); } /** @@ -953,6 +936,7 @@ return NULL; *prsc = *tmpl; + fd_resource_layout_init(prsc); #define LINEAR \ (PIPE_BIND_SCANOUT | \ @@ -963,6 +947,9 @@ if (tmpl->bind & LINEAR) linear = true; + if (fd_mesa_debug & FD_DBG_NOTILE) + linear = true; + /* Normally, for non-shared buffers, allow buffer compression if * not shared, otherwise only allow if QCOM_COMPRESSED modifier * is requested: @@ -977,53 +964,29 @@ allow_ubwc &= !(fd_mesa_debug & FD_DBG_NOUBWC); + pipe_reference_init(&prsc->reference, 1); + + prsc->screen = pscreen; + if (screen->tile_mode && (tmpl->target != PIPE_BUFFER) && !linear) { - rsc->tile_mode = screen->tile_mode(tmpl); + rsc->layout.tile_mode = screen->tile_mode(prsc); } - pipe_reference_init(&prsc->reference, 1); - - prsc->screen = pscreen; - util_range_init(&rsc->valid_buffer_range); rsc->internal_format = format; - rsc->cpp = util_format_get_blocksize(format); - rsc->cpp *= fd_resource_nr_samples(prsc); - assert(rsc->cpp); - - // XXX probably need some extra work if we hit rsc shadowing path w/ lrz.. - if ((is_a5xx(screen) || is_a6xx(screen)) && - (fd_mesa_debug & FD_DBG_LRZ) && has_depth(format)) { - const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ - unsigned lrz_pitch = align(DIV_ROUND_UP(tmpl->width0, 8), 64); - unsigned lrz_height = DIV_ROUND_UP(tmpl->height0, 8); - - /* LRZ buffer is super-sampled: */ - switch (prsc->nr_samples) { - case 4: - lrz_pitch *= 2; - case 2: - lrz_height *= 2; - } - - unsigned size = lrz_pitch * lrz_height * 2; - - size += 0x1000; /* for GRAS_LRZ_FAST_CLEAR_BUFFER */ - - rsc->lrz_height = lrz_height; - rsc->lrz_width = lrz_pitch; - rsc->lrz_pitch = lrz_pitch; - rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); + if (prsc->target == PIPE_BUFFER) { + assert(prsc->format == PIPE_FORMAT_R8_UNORM); + size = prsc->width0; + fdl_layout_buffer(&rsc->layout, size); + } else { + size = screen->setup_slices(rsc); } - size = screen->setup_slices(rsc); - - if (allow_ubwc && screen->fill_ubwc_buffer_sizes && rsc->tile_mode) + if (allow_ubwc && screen->fill_ubwc_buffer_sizes && rsc->layout.tile_mode) size += screen->fill_ubwc_buffer_sizes(rsc); /* special case for hw-query buffer, which we need to allocate before we @@ -1035,9 +998,9 @@ return prsc; } - if (rsc->layer_first) { - rsc->layer_size = align(size, 4096); - size = rsc->layer_size * prsc->array_size; + if (rsc->layout.layer_first) { + rsc->layout.layer_size = align(size, 4096); + size = rsc->layout.layer_size * prsc->array_size; } realloc_bo(rsc, size); @@ -1090,7 +1053,7 @@ { struct fd_screen *screen = fd_screen(pscreen); struct fd_resource *rsc = CALLOC_STRUCT(fd_resource); - struct fd_resource_slice *slice = &rsc->slices[0]; + struct fdl_slice *slice = fd_resource_slice(rsc, 0); struct pipe_resource *prsc = &rsc->base; uint32_t pitchalign = fd_screen(pscreen)->gmem_alignw; @@ -1105,6 +1068,7 @@ return NULL; *prsc = *tmpl; + fd_resource_layout_init(prsc); pipe_reference_init(&prsc->reference, 1); @@ -1117,9 +1081,7 @@ goto fail; rsc->internal_format = tmpl->format; - rsc->cpp = util_format_get_blocksize(tmpl->format); - rsc->cpp *= fd_resource_nr_samples(prsc); - slice->pitch = handle->stride / rsc->cpp; + slice->pitch = handle->stride / rsc->layout.cpp; slice->offset = handle->offset; slice->size0 = handle->stride * prsc->height0; @@ -1140,7 +1102,7 @@ goto fail; } - assert(rsc->cpp); + assert(rsc->layout.cpp); if (screen->ro) { rsc->scanout = diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_resource.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_resource.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,40 +33,14 @@ #include "freedreno_batch.h" #include "freedreno_util.h" - -/* Texture Layout on a3xx: - * - * Each mipmap-level contains all of it's layers (ie. all cubmap - * faces, all 1d/2d array elements, etc). The texture sampler is - * programmed with the start address of each mipmap level, and hw - * derives the layer offset within the level. - * - * Texture Layout on a4xx+: - * - * For cubemap and 2d array, each layer contains all of it's mipmap - * levels (layer_first layout). - * - * 3d textures are layed out as on a3xx, but unknown about 3d-array - * textures. - * - * In either case, the slice represents the per-miplevel information, - * but in layer_first layout it only includes the first layer, and - * an additional offset of (rsc->layer_size * layer) must be added. - */ -struct fd_resource_slice { - uint32_t offset; /* offset of first layer in slice */ - uint32_t pitch; - uint32_t size0; /* size of first layer in slice */ -}; +#include "freedreno/fdl/freedreno_layout.h" struct fd_resource { struct pipe_resource base; struct fd_bo *bo; - uint32_t cpp; enum pipe_format internal_format; - bool layer_first; /* see above description */ - uint32_t layer_size; - struct fd_resource_slice slices[MAX_MIP_LEVELS]; + struct fdl_layout layout; + /* buffer range that has been initialized */ struct util_range valid_buffer_range; bool valid; @@ -76,11 +50,6 @@ /* TODO rename to secondary or auxiliary? */ struct fd_resource *stencil; - uint32_t offset; - uint32_t ubwc_offset; - uint32_t ubwc_pitch; - uint32_t ubwc_size; - /* bitmask of in-flight batches which reference this resource. Note * that the batch doesn't hold reference to resources (but instead * the fd_ringbuffer holds refs to the underlying fd_bo), but in case @@ -102,10 +71,11 @@ /* Sequence # incremented each time bo changes: */ uint16_t seqno; - unsigned tile_mode : 2; - /* * LRZ + * + * TODO lrz width/height/pitch should probably also move to + * fdl_layout */ bool lrz_valid : 1; uint16_t lrz_width; // for lrz clear, does this differ from lrz_pitch? @@ -120,6 +90,12 @@ return (struct fd_resource *)ptex; } +static inline const struct fd_resource * +fd_resource_const(const struct pipe_resource *ptex) +{ + return (const struct fd_resource *)ptex; +} + static inline bool pending(struct fd_resource *rsc, bool write) { @@ -137,6 +113,14 @@ return false; } +static inline bool +has_depth(enum pipe_format format) +{ + const struct util_format_description *desc = + util_format_description(format); + return util_format_has_depth(desc); +} + struct fd_transfer { struct pipe_transfer base; struct pipe_resource *staging_prsc; @@ -149,59 +133,54 @@ return (struct fd_transfer *)ptrans; } -static inline struct fd_resource_slice * +static inline struct fdl_slice * fd_resource_slice(struct fd_resource *rsc, unsigned level) { assert(level <= rsc->base.last_level); - return &rsc->slices[level]; + return &rsc->layout.slices[level]; +} + +static inline uint32_t +fd_resource_layer_stride(struct fd_resource *rsc, unsigned level) +{ + return fdl_layer_stride(&rsc->layout, level); } /* get offset for specified mipmap level and texture/array layer */ static inline uint32_t fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { - struct fd_resource_slice *slice = fd_resource_slice(rsc, level); - unsigned offset; - if (rsc->layer_first) { - offset = slice->offset + (rsc->layer_size * layer); - } else { - offset = slice->offset + (slice->size0 * layer); - } + uint32_t offset = fdl_surface_offset(&rsc->layout, level, layer); debug_assert(offset < fd_bo_size(rsc->bo)); - return offset + rsc->offset; + return offset; } static inline uint32_t fd_resource_ubwc_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { - /* for now this doesn't do anything clever, but when UBWC is enabled - * for multi layer/level images, it will. - */ - if (rsc->ubwc_size) { - debug_assert(level == 0); - debug_assert(layer == 0); - } - return rsc->ubwc_offset; + return fdl_ubwc_offset(&rsc->layout, level, layer); } /* This might be a5xx specific, but higher mipmap levels are always linear: */ static inline bool -fd_resource_level_linear(struct pipe_resource *prsc, int level) +fd_resource_level_linear(const struct pipe_resource *prsc, int level) { struct fd_screen *screen = fd_screen(prsc->screen); debug_assert(!is_a3xx(screen)); - unsigned w = u_minify(prsc->width0, level); - if (w < 16) - return true; - return false; + return fdl_level_linear(&fd_resource_const(prsc)->layout, level); +} + +static inline uint32_t +fd_resource_tile_mode(struct pipe_resource *prsc, int level) +{ + return fdl_tile_mode(&fd_resource(prsc)->layout, level); } static inline bool fd_resource_ubwc_enabled(struct fd_resource *rsc, int level) { - return rsc->ubwc_size && rsc->tile_mode && - !fd_resource_level_linear(&rsc->base, level); + return fdl_ubwc_enabled(&rsc->layout, level); } /* access # of samples, with 0 normalized to 1 (which is what we care about diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_screen.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_screen.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,8 +31,8 @@ #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "util/u_string.h" #include "util/u_debug.h" @@ -74,6 +74,7 @@ {"nobypass", FD_DBG_NOBYPASS, "Disable GMEM bypass"}, {"fraghalf", FD_DBG_FRAGHALF, "Use half-precision in fragment shader"}, {"nobin", FD_DBG_NOBIN, "Disable hw binning"}, + {"nogmem", FD_DBG_NOGMEM, "Disable GMEM rendering (bypass only)"}, {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, {"shaderdb", FD_DBG_SHADERDB, "Enable shaderdb output"}, {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, @@ -81,13 +82,15 @@ {"inorder", FD_DBG_INORDER,"Disable reordering for draws/blits"}, {"bstat", FD_DBG_BSTAT, "Print batch stats at context destroy"}, {"nogrow", FD_DBG_NOGROW, "Disable \"growable\" cmdstream buffers, even if kernel supports it"}, - {"lrz", FD_DBG_LRZ, "Enable experimental LRZ support (a5xx+)"}, + {"lrz", FD_DBG_LRZ, "Enable experimental LRZ support (a5xx)"}, {"noindirect",FD_DBG_NOINDR, "Disable hw indirect draws (emulate on CPU)"}, {"noblit", FD_DBG_NOBLIT, "Disable blitter (fallback to generic blit path)"}, {"hiprio", FD_DBG_HIPRIO, "Force high-priority context"}, {"ttile", FD_DBG_TTILE, "Enable texture tiling (a2xx/a3xx/a5xx)"}, {"perfcntrs", FD_DBG_PERFC, "Expose performance counters"}, {"noubwc", FD_DBG_NOUBWC, "Disable UBWC for all internal buffers"}, + {"nolrz", FD_DBG_NOLRZ, "Disable LRZ (a6xx)"}, + {"notile", FD_DBG_NOTILE, "Disable tiling for all internal buffers"}, DEBUG_NAMED_VALUE_END }; @@ -151,6 +154,7 @@ FREE(screen->ro); fd_bc_fini(&screen->batch_cache); + fd_gmem_screen_fini(pscreen); slab_destroy_parent(&screen->transfer_pool); @@ -194,6 +198,7 @@ case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: case PIPE_CAP_TEXTURE_BARRIER: case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: return 1; case PIPE_CAP_PACKED_UNIFORMS: @@ -317,9 +322,6 @@ if (is_a6xx(screen)) return 1; return 0; - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: - return 0; - case PIPE_CAP_CONTEXT_PRIORITY_MASK: return screen->priority_mask; @@ -345,6 +347,16 @@ case PIPE_CAP_MAX_VARYINGS: return 16; + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + /* We don't really have a limit on this, it all goes into the main + * memory buffer. Needs to be at least 120 / 4 (minimum requirement + * for GL_MAX_TESS_PATCH_COMPONENTS). + */ + return 128; + + case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + return 64 * 1024 * 1024; + case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: /* manage the variants for these ourself, to avoid breaking precompile: */ @@ -354,6 +366,14 @@ return 1; return 0; + /* Geometry shaders.. */ + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + return 512; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 2048; + case PIPE_CAP_MAX_GS_INVOCATIONS: + return 32; + /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: if (is_ir3(screen)) @@ -367,6 +387,8 @@ return 0; case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: return 1; + case PIPE_CAP_TGSI_FS_POINT_IS_SYSVAL: + return is_a2xx(screen); case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: if (is_ir3(screen)) @@ -460,13 +482,16 @@ case PIPE_SHADER_FRAGMENT: case PIPE_SHADER_VERTEX: break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + case PIPE_SHADER_GEOMETRY: + if (is_a6xx(screen)) + break; + return 0; case PIPE_SHADER_COMPUTE: if (has_compute(screen)) break; return 0; - case PIPE_SHADER_GEOMETRY: - /* maye we could emulate.. */ - return 0; default: DBG("unknown shader type %d", shader); return 0; @@ -502,8 +527,11 @@ * everything is just normal registers. This is just temporary * hack until load_input/store_output handle arrays in a similar * way as load_var/store_var.. + * + * For tessellation stages, inputs are loaded using ldlw or ldg, both + * of which support indirection. */ - return 0; + return shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: /* a2xx compiler doesn't handle indirect: */ @@ -538,8 +566,6 @@ return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; - case PIPE_SHADER_CAP_SCALAR_ISA: - return is_ir3(screen) ? 1 : 0; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: if (is_a5xx(screen) || is_a6xx(screen)) { @@ -890,11 +916,14 @@ case 430: fd4_screen_init(pscreen); break; + case 510: case 530: case 540: fd5_screen_init(pscreen); break; + case 618: case 630: + case 640: fd6_screen_init(pscreen); break; default: @@ -916,6 +945,11 @@ screen->num_vsc_pipes = 8; } + if (fd_mesa_debug & FD_DBG_PERFC) { + screen->perfcntr_groups = fd_perfcntrs(screen->gpu_id, + &screen->num_perfcntr_groups); + } + /* NOTE: don't enable if we have too old of a kernel to support * growable cmdstream buffers, since memory requirement for cmdstream * buffers would be too much otherwise. @@ -936,6 +970,7 @@ fd_resource_screen_init(pscreen); fd_query_screen_init(pscreen); + fd_gmem_screen_init(pscreen); pscreen->get_name = fd_screen_get_name; pscreen->get_vendor = fd_screen_get_vendor; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_screen.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_screen.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,6 +29,7 @@ #include "drm/freedreno_drmif.h" #include "drm/freedreno_ringbuffer.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "pipe/p_screen.h" #include "util/u_memory.h" @@ -37,7 +38,7 @@ #include "renderonly/renderonly.h" #include "freedreno_batch_cache.h" -#include "freedreno_perfcntr.h" +#include "freedreno_gmem.h" #include "freedreno_util.h" struct fd_bo; @@ -113,6 +114,7 @@ int64_t cpu_gpu_time_delta; struct fd_batch_cache batch_cache; + struct fd_gmem_cache gmem_cache; bool reorder; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_state.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_state.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -248,15 +248,14 @@ * multiple times to the same surface), so we might as * well go ahead and flush this one: */ - fd_batch_flush(old_batch, false); + fd_batch_flush(old_batch); } fd_batch_reference(&old_batch, NULL); - } else { + } else if (ctx->batch) { DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, framebuffer->cbufs[0], framebuffer->zsbuf); - fd_batch_flush(ctx->batch, false); - util_copy_framebuffer_state(&ctx->batch->framebuffer, cso); + fd_batch_flush(ctx->batch); } ctx->dirty |= FD_DIRTY_FRAMEBUFFER; @@ -472,7 +471,7 @@ target->buffer_size = buffer_size; assert(rsc->base.target == PIPE_BUFFER); - util_range_add(&rsc->valid_buffer_range, + util_range_add(&rsc->base, &rsc->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); return target; @@ -499,12 +498,14 @@ for (i = 0; i < num_targets; i++) { boolean changed = targets[i] != so->targets[i]; - boolean append = (offsets[i] == (unsigned)-1); + boolean reset = (offsets[i] != (unsigned)-1); - if (!changed && append) + so->reset |= (reset << i); + + if (!changed && !reset) continue; - if (!append) + if (reset) so->offsets[i] = offsets[i]; pipe_so_target_reference(&so->targets[i], targets[i]); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_surface.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_surface.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -59,7 +59,6 @@ psurf->u.buf.first_element = surf_tmpl->u.buf.first_element; psurf->u.buf.last_element = surf_tmpl->u.buf.last_element; } else { - debug_assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); psurf->u.tex.level = level; psurf->u.tex.first_layer = surf_tmpl->u.tex.first_layer; psurf->u.tex.last_layer = surf_tmpl->u.tex.last_layer; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_util.c mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_util.c --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_util.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "freedreno_util.h" diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_util.h mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_util.h --- mesa-19.2.8/src/gallium/drivers/freedreno/freedreno_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/freedreno_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -61,30 +61,35 @@ #define MAX_RENDER_TARGETS A6XX_MAX_RENDER_TARGETS -#define FD_DBG_MSGS 0x0001 -#define FD_DBG_DISASM 0x0002 -#define FD_DBG_DCLEAR 0x0004 -#define FD_DBG_DDRAW 0x0008 -#define FD_DBG_NOSCIS 0x0010 -#define FD_DBG_DIRECT 0x0020 -#define FD_DBG_NOBYPASS 0x0040 -#define FD_DBG_FRAGHALF 0x0080 -#define FD_DBG_NOBIN 0x0100 -/* unused 0x0200 */ -#define FD_DBG_GLSL120 0x0400 -#define FD_DBG_SHADERDB 0x0800 -#define FD_DBG_FLUSH 0x1000 -#define FD_DBG_DEQP 0x2000 -#define FD_DBG_INORDER 0x4000 -#define FD_DBG_BSTAT 0x8000 -#define FD_DBG_NOGROW 0x10000 -#define FD_DBG_LRZ 0x20000 -#define FD_DBG_NOINDR 0x40000 -#define FD_DBG_NOBLIT 0x80000 -#define FD_DBG_HIPRIO 0x100000 -#define FD_DBG_TTILE 0x200000 -#define FD_DBG_PERFC 0x400000 -#define FD_DBG_NOUBWC 0x800000 +enum fd_debug_flag { + FD_DBG_MSGS = BITFIELD_BIT(0), + FD_DBG_DISASM = BITFIELD_BIT(1), + FD_DBG_DCLEAR = BITFIELD_BIT(2), + FD_DBG_DDRAW = BITFIELD_BIT(3), + FD_DBG_NOSCIS = BITFIELD_BIT(4), + FD_DBG_DIRECT = BITFIELD_BIT(5), + FD_DBG_NOBYPASS = BITFIELD_BIT(6), + FD_DBG_FRAGHALF = BITFIELD_BIT(7), + FD_DBG_NOBIN = BITFIELD_BIT(8), + FD_DBG_NOGMEM = BITFIELD_BIT(9), + FD_DBG_GLSL120 = BITFIELD_BIT(10), + FD_DBG_SHADERDB = BITFIELD_BIT(11), + FD_DBG_FLUSH = BITFIELD_BIT(12), + FD_DBG_DEQP = BITFIELD_BIT(13), + FD_DBG_INORDER = BITFIELD_BIT(14), + FD_DBG_BSTAT = BITFIELD_BIT(15), + FD_DBG_NOGROW = BITFIELD_BIT(16), + FD_DBG_LRZ = BITFIELD_BIT(17), + FD_DBG_NOINDR = BITFIELD_BIT(18), + FD_DBG_NOBLIT = BITFIELD_BIT(19), + FD_DBG_HIPRIO = BITFIELD_BIT(20), + FD_DBG_TTILE = BITFIELD_BIT(21), + FD_DBG_PERFC = BITFIELD_BIT(22), + FD_DBG_NOUBWC = BITFIELD_BIT(23), + FD_DBG_NOLRZ = BITFIELD_BIT(24), + FD_DBG_NOTILE = BITFIELD_BIT(25), +}; + extern int fd_mesa_debug; extern bool fd_binning_enabled; diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cache.c mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cache.c --- mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -93,16 +93,41 @@ return entry->data; } - struct ir3_shader_variant *bs = ir3_shader_variant(key->vs, key->key, true, debug); struct ir3_shader_variant *vs = ir3_shader_variant(key->vs, key->key, false, debug); - struct ir3_shader_variant *fs = ir3_shader_variant(key->fs, key->key, false, debug); + if (!vs) + return NULL; + + struct ir3_shader_variant *hs = NULL, *ds = NULL; + if (key->hs) { + debug_assert(key->ds); + hs = ir3_shader_variant(key->hs, key->key, false, debug); + ds = ir3_shader_variant(key->ds, key->key, false, debug); + if (!hs || ! ds) + return NULL; + } - if (!bs || !vs || !fs) { + /* For tessellation, the binning shader is derived from the DS. */ + struct ir3_shader_variant *bs; + if (key->ds) + bs = ir3_shader_variant(key->ds, key->key, true, debug); + else + bs = ir3_shader_variant(key->vs, key->key, true, debug); + if (!bs) return NULL; + + struct ir3_shader_variant *gs = NULL; + if (key->gs) { + gs = ir3_shader_variant(key->gs, key->key, false, debug); + if (!gs) + return NULL; } + struct ir3_shader_variant *fs = ir3_shader_variant(key->fs, key->key, false, debug); + if (!fs) + return NULL; + struct ir3_program_state *state = - cache->funcs->create_state(cache->data, bs, vs, fs, &key->key); + cache->funcs->create_state(cache->data, bs, vs, hs, ds, gs, fs, &key->key); state->key = *key; /* NOTE: uses copy of key in state obj, because pointer passed by caller diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cache.h mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cache.h --- mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cache.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cache.h 2020-06-12 01:21:17.000000000 +0000 @@ -37,8 +37,8 @@ /* key into program state cache */ struct ir3_cache_key { - struct ir3_shader *vs, *fs; // 4 dwords - struct ir3_shader_key key; // 7 dwords + struct ir3_shader *vs, *hs, *ds, *gs, *fs; // 5 pointers + struct ir3_shader_key key; // 7 dwords }; /* per-gen backend program state object should subclass this for it's @@ -53,6 +53,9 @@ struct ir3_program_state *(*create_state)(void *data, struct ir3_shader_variant *bs, /* binning pass vs */ struct ir3_shader_variant *vs, + struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, + struct ir3_shader_variant *gs, struct ir3_shader_variant *fs, const struct ir3_shader_key *key); void (*destroy_state)(void *data, struct ir3_program_state *state); diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c --- mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c 2020-06-12 01:21:17.000000000 +0000 @@ -57,7 +57,7 @@ static void dump_info(struct ir3_shader_variant *so, const char *str) { uint32_t *bin; - const char *type = ir3_shader_stage(so->shader); + const char *type = ir3_shader_stage(so); bin = ir3_shader_assemble(so, so->shader->compiler->gpu_id); debug_printf("; %s: %s\n", type, str); ir3_shader_disasm(so, bin, stdout); @@ -142,7 +142,7 @@ NIR_PASS_V(nir, nir_lower_var_copies); nir_print_shader(nir, stdout); NIR_PASS_V(nir, gl_nir_lower_atomics, prog, true); - NIR_PASS_V(nir, nir_lower_atomics_to_ssbo, 8); + NIR_PASS_V(nir, nir_lower_atomics_to_ssbo); nir_print_shader(nir, stdout); switch (stage) { diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.c mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.c --- mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_string.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" @@ -51,13 +51,15 @@ return; pipe_debug_message(debug, SHADER_INFO, - "%s%s shader: %u inst, %u dwords, " - "%u half, %u full, %u constlen, " + "%s shader: %u inst, %u nops, %u non-nops, %u dwords, " + "%u last-baryf, %u half, %u full, %u constlen, " "%u (ss), %u (sy), %d max_sun, %d loops\n", - binning_pass ? "B" : "", - ir3_shader_stage(v->shader), + ir3_shader_stage(v), v->info.instrs_count, + v->info.nops_count, + v->info.instrs_count - v->info.nops_count, v->info.sizedwords, + v->info.last_baryf, v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen, @@ -110,7 +112,7 @@ struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler, - const struct pipe_shader_state *cso, gl_shader_stage type, + const struct pipe_shader_state *cso, struct pipe_debug_callback *debug, struct pipe_screen *screen) { @@ -211,6 +213,34 @@ offset, size, user_buffer, buffer); } +/** + * Indirectly calculates size of cmdstream needed for ir3_emit_user_consts(). + * Returns number of packets, and total size of all the payload. + * + * The value can be a worst-case, ie. some shader variants may not read all + * consts, etc. + * + * Returns size in dwords. + */ +void +ir3_user_consts_size(struct ir3_ubo_analysis_state *state, + unsigned *packets, unsigned *size) +{ + *packets = *size = 0; + + for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + if (state->range[i].start < state->range[i].end) { + *size += state->range[i].end - state->range[i].start; + (*packets)++; + } + } +} + +/** + * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access + * outside of these ranges will be done using full UBO accesses in the + * shader). + */ void ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) @@ -218,31 +248,28 @@ struct ir3_ubo_analysis_state *state; state = &v->shader->ubo_state; - for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + uint32_t i; + foreach_bit(i, state->enabled & constbuf->enabled_mask) { struct pipe_constant_buffer *cb = &constbuf->cb[i]; - if (state->range[i].start < state->range[i].end && - constbuf->enabled_mask & (1 << i)) { + uint32_t size = state->range[i].end - state->range[i].start; + uint32_t offset = cb->buffer_offset + state->range[i].start; - uint32_t size = state->range[i].end - state->range[i].start; - uint32_t offset = cb->buffer_offset + state->range[i].start; - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, (16 * v->constlen) - state->range[i].offset); + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * v->constlen) - state->range[i].offset); - if (size == 0) - continue; + if (size == 0) + continue; - /* things should be aligned to vec4: */ - debug_assert((state->range[i].offset % 16) == 0); - debug_assert((size % 16) == 0); - debug_assert((offset % 16) == 0); + /* things should be aligned to vec4: */ + debug_assert((state->range[i].offset % 16) == 0); + debug_assert((size % 16) == 0); + debug_assert((offset % 16) == 0); - emit_const(screen, ring, v, state->range[i].offset / 4, - offset, size / 4, cb->user_buffer, cb->buffer); - } + emit_const(screen, ring, v, state->range[i].offset / 4, + offset, size / 4, cb->user_buffer, cb->buffer); } } @@ -319,18 +346,19 @@ dims[off + 0] = util_format_get_blocksize(img->format); if (img->resource->target != PIPE_BUFFER) { - unsigned lvl = img->u.tex.level; + struct fdl_slice *slice = + fd_resource_slice(rsc, img->u.tex.level); /* note for 2d/cube/etc images, even if re-interpreted * as a different color format, the pixel size should * be the same, so use original dimensions for y and z * stride: */ - dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp; + dims[off + 1] = slice->pitch * rsc->layout.cpp; /* see corresponding logic in fd_resource_offset(): */ - if (rsc->layer_first) { - dims[off + 2] = rsc->layer_size; + if (rsc->layout.layer_first) { + dims[off + 2] = rsc->layout.layer_size; } else { - dims[off + 2] = rsc->slices[lvl].size0; + dims[off + 2] = slice->size0; } } else { /* For buffer-backed images, the log2 of the format's @@ -373,6 +401,68 @@ } } +static uint32_t +link_geometry_stages(const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *consumer, + uint32_t *locs) +{ + uint32_t num_loc = 0, factor; + + switch (consumer->type) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_GEOMETRY: + /* These stages load with ldlw, which expects byte offsets. */ + factor = 4; + break; + case MESA_SHADER_TESS_EVAL: + /* The tess eval shader uses ldg, which takes dword offsets. */ + factor = 1; + break; + default: + unreachable("bad shader stage"); + } + + nir_foreach_variable(in_var, &consumer->shader->nir->inputs) { + nir_foreach_variable(out_var, &producer->shader->nir->outputs) { + if (in_var->data.location == out_var->data.location) { + locs[in_var->data.driver_location] = + producer->shader->output_loc[out_var->data.driver_location] * factor; + + debug_assert(num_loc <= in_var->data.driver_location + 1); + num_loc = in_var->data.driver_location + 1; + } + } + } + + return num_loc; +} + +void +ir3_emit_link_map(struct fd_screen *screen, + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + const struct ir3_const_state *const_state = &v->shader->const_state; + uint32_t base = const_state->offsets.primitive_map; + uint32_t patch_locs[MAX_VARYING] = { }, num_loc; + + num_loc = link_geometry_stages(producer, v, patch_locs); + + int size = DIV_ROUND_UP(num_loc, 4); + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + /* convert out of vec4: */ + base *= 4; + size *= 4; + + if (size > 0) + emit_const(screen, ring, v, base, 0, size, patch_locs, NULL); +} + /* emit stream-out buffers: */ static void emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v, @@ -666,3 +756,37 @@ } } } + +static void * +ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) +{ + struct fd_context *ctx = fd_context(pctx); + struct ir3_compiler *compiler = ctx->screen->compiler; + return ir3_shader_create(compiler, cso, &ctx->debug, pctx->screen); +} + +static void +ir3_shader_state_delete(struct pipe_context *pctx, void *hwcso) +{ + struct ir3_shader *so = hwcso; + ir3_shader_destroy(so); +} + +void +ir3_prog_init(struct pipe_context *pctx) +{ + pctx->create_vs_state = ir3_shader_state_create; + pctx->delete_vs_state = ir3_shader_state_delete; + + pctx->create_tcs_state = ir3_shader_state_create; + pctx->delete_tcs_state = ir3_shader_state_delete; + + pctx->create_tes_state = ir3_shader_state_create; + pctx->delete_tes_state = ir3_shader_state_delete; + + pctx->create_gs_state = ir3_shader_state_create; + pctx->delete_gs_state = ir3_shader_state_delete; + + pctx->create_fs_state = ir3_shader_state_create; + pctx->delete_fs_state = ir3_shader_state_delete; +} diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.h mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.h --- mesa-19.2.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/ir3/ir3_gallium.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ #include "ir3/ir3_shader.h" struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler, - const struct pipe_shader_state *cso, gl_shader_stage type, + const struct pipe_shader_state *cso, struct pipe_debug_callback *debug, struct pipe_screen *screen); struct ir3_shader * @@ -51,6 +51,8 @@ struct fd_shaderbuf_stateobj; struct fd_shaderimg_stateobj; +void ir3_user_consts_size(struct ir3_ubo_analysis_state *state, + unsigned *packets, unsigned *size); void ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf); void ir3_emit_ubos(struct fd_screen *screen, const struct ir3_shader_variant *v, @@ -61,6 +63,9 @@ struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si); void ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring); +void ir3_emit_link_map(struct fd_screen *screen, + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *v, struct fd_ringbuffer *ring); static inline bool ir3_needs_vs_driver_params(const struct ir3_shader_variant *v) @@ -81,4 +86,6 @@ void ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_context *ctx, const struct pipe_grid_info *info); +void ir3_prog_init(struct pipe_context *pctx); + #endif /* IR3_GALLIUM_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/Makefile.sources mesa-20.0.8/src/gallium/drivers/freedreno/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/freedreno/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -14,7 +14,6 @@ freedreno_fence.h \ freedreno_gmem.c \ freedreno_gmem.h \ - freedreno_perfcntr.h \ freedreno_program.c \ freedreno_program.h \ freedreno_query.c \ @@ -50,7 +49,6 @@ a2xx/fd2_emit.h \ a2xx/fd2_gmem.c \ a2xx/fd2_gmem.h \ - a2xx/fd2_perfcntr.c \ a2xx/fd2_program.c \ a2xx/fd2_program.h \ a2xx/fd2_query.c \ @@ -149,7 +147,6 @@ a5xx/fd5_gmem.h \ a5xx/fd5_image.c \ a5xx/fd5_image.h \ - a5xx/fd5_perfcntr.c \ a5xx/fd5_program.c \ a5xx/fd5_program.h \ a5xx/fd5_query.c \ @@ -184,7 +181,6 @@ a6xx/fd6_gmem.h \ a6xx/fd6_image.c \ a6xx/fd6_image.h \ - a6xx/fd6_perfcntr.c \ a6xx/fd6_program.c \ a6xx/fd6_program.h \ a6xx/fd6_query.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/freedreno/meson.build mesa-20.0.8/src/gallium/drivers/freedreno/meson.build --- mesa-19.2.8/src/gallium/drivers/freedreno/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/freedreno/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,6 @@ 'freedreno_fence.h', 'freedreno_gmem.c', 'freedreno_gmem.h', - 'freedreno_perfcntr.h', 'freedreno_program.c', 'freedreno_program.h', 'freedreno_query.c', @@ -68,7 +67,6 @@ 'a2xx/fd2_emit.h', 'a2xx/fd2_gmem.c', 'a2xx/fd2_gmem.h', - 'a2xx/fd2_perfcntr.c', 'a2xx/fd2_program.c', 'a2xx/fd2_program.h', 'a2xx/fd2_query.c', @@ -161,7 +159,6 @@ 'a5xx/fd5_gmem.h', 'a5xx/fd5_image.c', 'a5xx/fd5_image.h', - 'a5xx/fd5_perfcntr.c', 'a5xx/fd5_program.c', 'a5xx/fd5_program.h', 'a5xx/fd5_query.c', @@ -194,7 +191,6 @@ 'a6xx/fd6_gmem.h', 'a6xx/fd6_image.c', 'a6xx/fd6_image.h', - 'a6xx/fd6_perfcntr.c', 'a6xx/fd6_program.c', 'a6xx/fd6_program.h', 'a6xx/fd6_query.c', @@ -247,6 +243,8 @@ libfreedreno, libfreedreno_drm, libfreedreno_ir3, + libfreedreno_layout, + libfreedreno_perfcntrs ], dependencies : idep_nir, ) @@ -263,6 +261,7 @@ libfreedreno, libfreedreno_drm, libfreedreno_ir3, + libfreedreno_layout, libgallium, libglsl_standalone, ], diff -Nru mesa-19.2.8/src/gallium/drivers/i915/i915_clear.c mesa-20.0.8/src/gallium/drivers/i915/i915_clear.c --- mesa-19.2.8/src/gallium/drivers/i915/i915_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/i915_clear.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_pack_color.h" #include "i915_context.h" #include "i915_screen.h" diff -Nru mesa-19.2.8/src/gallium/drivers/i915/i915_resource_texture.c mesa-20.0.8/src/gallium/drivers/i915/i915_resource_texture.c --- mesa-19.2.8/src/gallium/drivers/i915/i915_resource_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/i915_resource_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_rect.h" diff -Nru mesa-19.2.8/src/gallium/drivers/i915/i915_screen.c mesa-20.0.8/src/gallium/drivers/i915/i915_screen.c --- mesa-19.2.8/src/gallium/drivers/i915/i915_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/i915_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,8 +28,8 @@ #include "draw/draw_context.h" #include "util/os_misc.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_screen.h" diff -Nru mesa-19.2.8/src/gallium/drivers/i915/i915_state_emit.c mesa-20.0.8/src/gallium/drivers/i915/i915_state_emit.c --- mesa-19.2.8/src/gallium/drivers/i915/i915_state_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/i915_state_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ #include "pipe/p_defines.h" #include "pipe/p_format.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/i915/i915_surface.c mesa-20.0.8/src/gallium/drivers/i915/i915_surface.c --- mesa-19.2.8/src/gallium/drivers/i915/i915_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/i915_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_pack_color.h" #include "util/u_surface.h" diff -Nru mesa-19.2.8/src/gallium/drivers/i915/TODO mesa-20.0.8/src/gallium/drivers/i915/TODO --- mesa-19.2.8/src/gallium/drivers/i915/TODO 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/i915/TODO 2020-06-12 01:21:17.000000000 +0000 @@ -35,4 +35,4 @@ - Fix fragment discard Other bugs can be found here: -https://gitlab.freedesktop.org/mesa/mesa/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=i915g +https://gitlab.freedesktop.org/mesa/mesa/-/issues?scope=all&utf8=%E2%9C%93&state=opened&label_name[]=i915g diff -Nru mesa-19.2.8/src/gallium/drivers/iris/Android.mk mesa-20.0.8/src/gallium/drivers/iris/Android.mk --- mesa-19.2.8/src/gallium/drivers/iris/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -121,6 +121,25 @@ include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) +# +# libiris for gen12 +# + +include $(CLEAR_VARS) +LOCAL_MODULE := libmesa_iris_gen12 +LOCAL_MODULE_CLASS := STATIC_LIBRARIES + +LOCAL_SRC_FILES := $(LIBIRIS_SRC_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=120 + +LOCAL_C_INCLUDES := $(IRIS_COMMON_INCLUDES) + +LOCAL_STATIC_LIBRARIES := $(LIBIRIS_STATIC_LIBS) + +LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) ########################################################### include $(CLEAR_VARS) @@ -169,7 +188,8 @@ libmesa_iris_gen8 \ libmesa_iris_gen9 \ libmesa_iris_gen10 \ - libmesa_iris_gen11 + libmesa_iris_gen11 \ + libmesa_iris_gen12 include $(GALLIUM_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff -Nru mesa-19.2.8/src/gallium/drivers/iris/driinfo_iris.h mesa-20.0.8/src/gallium/drivers/iris/driinfo_iris.h --- mesa-19.2.8/src/gallium/drivers/iris/driinfo_iris.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/driinfo_iris.h 2020-06-12 01:21:17.000000000 +0000 @@ -2,4 +2,16 @@ DRI_CONF_SECTION_DEBUG DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION("false") + DRI_CONF_DISABLE_THROTTLING("false") + DRI_CONF_ALWAYS_FLUSH_CACHE("false") +DRI_CONF_SECTION_END + +DRI_CONF_SECTION_PERFORMANCE + +//= BEGIN VERBATIM + DRI_CONF_OPT_BEGIN_V(bo_reuse, enum, 1, "0:1") + DRI_CONF_DESC(en, "Buffer object reuse") + DRI_CONF_OPT_END +//= END VERBATIM + DRI_CONF_SECTION_END diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_batch.c mesa-20.0.8/src/gallium/drivers/iris/iris_batch.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_batch.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_batch.c 2020-06-12 01:21:17.000000000 +0000 @@ -44,6 +44,7 @@ #include "drm-uapi/i915_drm.h" +#include "common/gen_aux_map.h" #include "intel/common/gen_gem.h" #include "util/hash_table.h" #include "util/set.h" @@ -149,20 +150,13 @@ } static unsigned -decode_get_state_size(void *v_batch, uint32_t offset_from_base) +decode_get_state_size(void *v_batch, + uint64_t address, + UNUSED uint64_t base_address) { struct iris_batch *batch = v_batch; - - /* The decoder gives us offsets from a base address, which is not great. - * Binding tables are relative to surface state base address, and other - * state is relative to dynamic state base address. These could alias, - * but in practice it's unlikely because surface offsets are always in - * the [0, 64K) range, and we assign dynamic state addresses starting at - * the top of the 4GB range. We should fix this but it's likely good - * enough for now. - */ unsigned size = (uintptr_t) - _mesa_hash_table_u64_search(batch->state_sizes, offset_from_base); + _mesa_hash_table_u64_search(batch->state_sizes, address); return size; } @@ -187,7 +181,6 @@ struct hash_table_u64 *state_sizes, struct iris_batch *all_batches, enum iris_batch_name name, - uint8_t engine, int priority) { batch->screen = screen; @@ -197,11 +190,6 @@ batch->state_sizes = state_sizes; batch->name = name; - /* engine should be one of I915_EXEC_RENDER, I915_EXEC_BLT, etc. */ - assert((engine & ~I915_EXEC_RING_MASK) == 0); - assert(util_bitcount(engine) == 1); - batch->engine = engine; - batch->hw_ctx_id = iris_create_hw_context(screen->bufmgr); assert(batch->hw_ctx_id); @@ -264,6 +252,20 @@ return NULL; } +static void +ensure_exec_obj_space(struct iris_batch *batch, uint32_t count) +{ + while (batch->exec_count + count > batch->exec_array_size) { + batch->exec_array_size *= 2; + batch->exec_bos = + realloc(batch->exec_bos, + batch->exec_array_size * sizeof(batch->exec_bos[0])); + batch->validation_list = + realloc(batch->validation_list, + batch->exec_array_size * sizeof(batch->validation_list[0])); + } +} + /** * Add a buffer to the current batch's validation list. * @@ -330,15 +332,7 @@ /* Now, take a reference and add it to the validation list. */ iris_bo_reference(bo); - if (batch->exec_count == batch->exec_array_size) { - batch->exec_array_size *= 2; - batch->exec_bos = - realloc(batch->exec_bos, - batch->exec_array_size * sizeof(batch->exec_bos[0])); - batch->validation_list = - realloc(batch->validation_list, - batch->exec_array_size * sizeof(batch->validation_list[0])); - } + ensure_exec_obj_space(batch, 1); batch->validation_list[batch->exec_count] = (struct drm_i915_gem_exec_object2) { @@ -376,6 +370,7 @@ iris_bo_unreference(batch->bo); batch->primary_batch_size = 0; + batch->total_chained_batch_size = 0; batch->contains_draw = false; batch->decoder.surface_base = batch->last_surface_base_address; @@ -436,27 +431,60 @@ } } +static void +record_batch_sizes(struct iris_batch *batch) +{ + unsigned batch_size = iris_batch_bytes_used(batch); + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->map, batch_size)); + + if (batch->bo == batch->exec_bos[0]) + batch->primary_batch_size = batch_size; + + batch->total_chained_batch_size += batch_size; +} + void iris_chain_to_new_batch(struct iris_batch *batch) { - /* We only support chaining a single time. */ - assert(batch->bo == batch->exec_bos[0]); - - VG(void *map = batch->map); uint32_t *cmd = batch->map_next; uint64_t *addr = batch->map_next + 4; batch->map_next += 12; + record_batch_sizes(batch); + /* No longer held by batch->bo, still held by validation list */ iris_bo_unreference(batch->bo); - batch->primary_batch_size = iris_batch_bytes_used(batch); create_batch(batch); /* Emit MI_BATCH_BUFFER_START to chain to another batch. */ *cmd = (0x31 << 23) | (1 << 8) | (3 - 2); *addr = batch->bo->gtt_offset; +} + +static void +add_aux_map_bos_to_batch(struct iris_batch *batch) +{ + void *aux_map_ctx = iris_bufmgr_get_aux_map_context(batch->screen->bufmgr); + if (!aux_map_ctx) + return; - VG(VALGRIND_CHECK_MEM_IS_DEFINED(map, batch->primary_batch_size)); + uint32_t count = gen_aux_map_get_num_buffers(aux_map_ctx); + ensure_exec_obj_space(batch, count); + gen_aux_map_fill_bos(aux_map_ctx, + (void**)&batch->exec_bos[batch->exec_count], count); + for (uint32_t i = 0; i < count; i++) { + struct iris_bo *bo = batch->exec_bos[batch->exec_count]; + iris_bo_reference(bo); + batch->validation_list[batch->exec_count] = + (struct drm_i915_gem_exec_object2) { + .handle = bo->gem_handle, + .offset = bo->gtt_offset, + .flags = bo->kflags, + }; + batch->aperture_space += bo->size; + batch->exec_count++; + } } /** @@ -465,16 +493,16 @@ static void iris_finish_batch(struct iris_batch *batch) { + add_aux_map_bos_to_batch(batch); + /* Emit MI_BATCH_BUFFER_END to finish our batch. */ uint32_t *map = batch->map_next; map[0] = (0xA << 23); batch->map_next += 4; - VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->map, iris_batch_bytes_used(batch))); - if (batch->bo == batch->exec_bos[0]) - batch->primary_batch_size = iris_batch_bytes_used(batch); + record_batch_sizes(batch); } /** @@ -559,7 +587,7 @@ .batch_start_offset = 0, /* This must be QWord aligned. */ .batch_len = ALIGN(batch->primary_batch_size, 8), - .flags = batch->engine | + .flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC | I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT, @@ -622,17 +650,11 @@ if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) { - int bytes_for_commands = iris_batch_bytes_used(batch); - int second_bytes = 0; - if (batch->bo != batch->exec_bos[0]) { - second_bytes = bytes_for_commands; - bytes_for_commands += batch->primary_batch_size; - } - fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) " + fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5db (%0.1f%%) " "(cmds), %4d BOs (%0.1fMb aperture)\n", file, line, batch_name_to_string(batch->name), batch->hw_ctx_id, - batch->primary_batch_size, second_bytes, - 100.0f * bytes_for_commands / BATCH_SZ, + batch->total_chained_batch_size, + 100.0f * batch->total_chained_batch_size / BATCH_SZ, batch->exec_count, (float) batch->aperture_space / (1024 * 1024)); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_batch.h mesa-20.0.8/src/gallium/drivers/iris/iris_batch.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_batch.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_batch.h 2020-06-12 01:21:17.000000000 +0000 @@ -67,17 +67,18 @@ struct iris_bo *bo; void *map; void *map_next; - /** Size of the primary batch if we've moved on to a secondary. */ + + /** Size of the primary batch being submitted to execbuf (in bytes). */ unsigned primary_batch_size; + /** Total size of all chained batches (in bytes). */ + unsigned total_chained_batch_size; + /** Last Surface State Base Address set in this hardware context. */ uint64_t last_surface_base_address; uint32_t hw_ctx_id; - /** Which engine this batch targets - a I915_EXEC_RING_MASK value */ - uint8_t engine; - /** The validation list */ struct drm_i915_gem_exec_object2 *validation_list; struct iris_bo **exec_bos; @@ -126,6 +127,8 @@ /** Have we emitted any draw calls to this batch? */ bool contains_draw; + + uint32_t last_aux_map_state; }; void iris_init_batch(struct iris_batch *batch, @@ -136,7 +139,6 @@ struct hash_table_u64 *state_sizes, struct iris_batch *all_batches, enum iris_batch_name name, - uint8_t ring, int priority); void iris_chain_to_new_batch(struct iris_batch *batch); void iris_batch_free(struct iris_batch *batch); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_blit.c mesa-20.0.8/src/gallium/drivers/iris/iris_blit.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_state.h" #include "pipe/p_context.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/ralloc.h" #include "intel/blorp/blorp.h" @@ -230,6 +230,7 @@ void iris_blorp_surf_for_resource(struct iris_vtable *vtbl, + struct isl_device *isl_dev, struct blorp_surf *surf, struct pipe_resource *p_res, enum isl_aux_usage aux_usage, @@ -240,7 +241,7 @@ assert(!iris_resource_unfinished_aux_import(res)); - if (aux_usage == ISL_AUX_USAGE_HIZ && + if (isl_aux_usage_has_hiz(aux_usage) && !iris_resource_level_has_hiz(res, level)) aux_usage = ISL_AUX_USAGE_NONE; @@ -250,7 +251,7 @@ .buffer = res->bo, .offset = res->offset, .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, - .mocs = vtbl->mocs(res->bo), + .mocs = iris_mocs(res->bo, isl_dev), }, .aux_usage = aux_usage, }; @@ -261,7 +262,7 @@ .buffer = res->aux.bo, .offset = res->aux.offset, .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, - .mocs = vtbl->mocs(res->bo), + .mocs = iris_mocs(res->bo, isl_dev), }; surf->clear_color = iris_resource_get_clear_color(res, NULL, NULL); @@ -269,28 +270,47 @@ .buffer = res->aux.clear_color_bo, .offset = res->aux.clear_color_offset, .reloc_flags = 0, - .mocs = vtbl->mocs(res->aux.clear_color_bo), + .mocs = iris_mocs(res->aux.clear_color_bo, isl_dev), }; } +} - // XXX: ASTC +static bool +is_astc(enum isl_format format) +{ + return format != ISL_FORMAT_UNSUPPORTED && + isl_format_get_layout(format)->txc == ISL_TXC_ASTC; } static void -tex_cache_flush_hack(struct iris_batch *batch) +tex_cache_flush_hack(struct iris_batch *batch, + enum isl_format view_format, + enum isl_format surf_format) { - /* The hardware seems to have issues with having a two different - * format views of the same texture in the sampler cache at the - * same time. It's unclear exactly what the issue is but it hurts - * blits and copies particularly badly because they often reinterpret - * formats. We badly need better understanding of the sampler issue - * and a better fix but this works for now and fixes CTS tests. + const struct gen_device_info *devinfo = &batch->screen->devinfo; + + /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says: + * + * "Currently Sampler assumes that a surface would not have two + * different format associate with it. It will not properly cache + * the different views in the MT cache, causing a data corruption." + * + * We may need to handle this for texture views in general someday, but + * for now we handle it here, as it hurts copies and blits particularly + * badly because they ofter reinterpret formats. * * If the BO hasn't been referenced yet this batch, we assume that the * texture cache doesn't contain any relevant data nor need flushing. * - * TODO: Remove this hack! + * Icelake (Gen11+) claims to fix this issue, but seems to still have + * issues with ASTC formats. */ + bool need_flush = devinfo->gen >= 11 ? + is_astc(surf_format) != is_astc(view_format) : + view_format != surf_format; + if (!need_flush) + return; + const char *reason = "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads"; @@ -328,13 +348,18 @@ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; } + if (iris_resource_unfinished_aux_import(src_res)) + iris_resource_finish_aux_import(ctx->screen, src_res); + if (iris_resource_unfinished_aux_import(dst_res)) + iris_resource_finish_aux_import(ctx->screen, dst_res); + struct iris_format_info src_fmt = iris_format_for_usage(devinfo, info->src.format, ISL_SURF_USAGE_TEXTURE_BIT); enum isl_aux_usage src_aux_usage = - iris_resource_texture_aux_usage(ice, src_res, src_fmt.fmt, 0); + iris_resource_texture_aux_usage(ice, src_res, src_fmt.fmt); - if (src_aux_usage == ISL_AUX_USAGE_HIZ) + if (iris_resource_level_has_hiz(src_res, info->src.level)) src_aux_usage = ISL_AUX_USAGE_NONE; bool src_clear_supported = src_aux_usage != ISL_AUX_USAGE_NONE && @@ -352,10 +377,12 @@ bool dst_clear_supported = dst_aux_usage != ISL_AUX_USAGE_NONE; struct blorp_surf src_surf, dst_surf; - iris_blorp_surf_for_resource(&ice->vtbl, &src_surf, info->src.resource, - src_aux_usage, info->src.level, false); - iris_blorp_surf_for_resource(&ice->vtbl, &dst_surf, info->dst.resource, - dst_aux_usage, info->dst.level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &src_surf, + info->src.resource, src_aux_usage, + info->src.level, false); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &dst_surf, + info->dst.resource, dst_aux_usage, + info->dst.level, true); iris_resource_prepare_access(ice, batch, dst_res, info->dst.level, 1, info->dst.box.z, info->dst.box.depth, @@ -428,13 +455,11 @@ filter = BLORP_FILTER_NEAREST; } - bool format_mismatch = src_fmt.fmt != src_res->surf.format; - - if (format_mismatch && iris_batch_references(batch, src_res->bo)) - tex_cache_flush_hack(batch); + if (iris_batch_references(batch, src_res->bo)) + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); if (dst_res->base.target == PIPE_BUFFER) - util_range_add(&dst_res->valid_buffer_range, dst_x0, dst_x1); + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dst_x0, dst_x1); struct blorp_batch blorp_batch; blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); @@ -460,16 +485,56 @@ } } + struct iris_resource *stc_dst = NULL; + enum isl_aux_usage stc_src_aux_usage, stc_dst_aux_usage; if ((info->mask & PIPE_MASK_S) && util_format_has_stencil(util_format_description(info->dst.format)) && util_format_has_stencil(util_format_description(info->src.format))) { - struct iris_resource *src_res, *dst_res, *junk; + struct iris_resource *src_res, *junk; + struct blorp_surf src_surf, dst_surf; iris_get_depth_stencil_resources(info->src.resource, &junk, &src_res); - iris_get_depth_stencil_resources(info->dst.resource, &junk, &dst_res); - iris_blorp_surf_for_resource(&ice->vtbl, &src_surf, &src_res->base, - ISL_AUX_USAGE_NONE, info->src.level, false); - iris_blorp_surf_for_resource(&ice->vtbl, &dst_surf, &dst_res->base, - ISL_AUX_USAGE_NONE, info->dst.level, true); + iris_get_depth_stencil_resources(info->dst.resource, &junk, &stc_dst); + + struct iris_format_info src_fmt = + iris_format_for_usage(devinfo, src_res->base.format, + ISL_SURF_USAGE_TEXTURE_BIT); + stc_src_aux_usage = + iris_resource_texture_aux_usage(ice, src_res, src_fmt.fmt); + + struct iris_format_info dst_fmt = + iris_format_for_usage(devinfo, stc_dst->base.format, + ISL_SURF_USAGE_RENDER_TARGET_BIT); + stc_dst_aux_usage = + iris_resource_render_aux_usage(ice, stc_dst, dst_fmt.fmt, false, false); + + /* Resolve destination surface before blit because : + * 1. when we try to blit from the same surface, we can't read and + * write to the same surfaces at the same time when we have + * compression enabled so it's safe to resolve surface first and then + * do blit. + * 2. While bliting from one surface to another surface, we might be + * mixing compression formats, Our experiments shows that if after + * blit if we set DepthStencilResource flag to 0, blit passes but + * clear fails. + * + * XXX: In second case by destructing the compression, we might lose + * some performance. + */ + if (devinfo->gen >= 12) + stc_dst_aux_usage = ISL_AUX_USAGE_NONE; + + iris_resource_prepare_access(ice, batch, src_res, info->src.level, 1, + info->src.box.z, info->src.box.depth, + stc_src_aux_usage, false); + iris_resource_prepare_access(ice, batch, stc_dst, info->dst.level, 1, + info->dst.box.z, info->dst.box.depth, + stc_dst_aux_usage, false); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &src_surf, + &src_res->base, stc_src_aux_usage, + info->src.level, false); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &dst_surf, + &stc_dst->base, stc_dst_aux_usage, + info->dst.level, true); for (int slice = 0; slice < info->dst.box.depth; slice++) { iris_batch_maybe_flush(batch, 1500); @@ -487,11 +552,17 @@ blorp_batch_finish(&blorp_batch); - if (format_mismatch) - tex_cache_flush_hack(batch); + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); - iris_resource_finish_write(ice, dst_res, info->dst.level, info->dst.box.z, - info->dst.box.depth, dst_aux_usage); + if (info->mask & main_mask) { + iris_resource_finish_write(ice, dst_res, info->dst.level, info->dst.box.z, + info->dst.box.depth, dst_aux_usage); + } + + if (stc_dst) { + iris_resource_finish_write(ice, stc_dst, info->dst.level, info->dst.box.z, + info->dst.box.depth, stc_dst_aux_usage); + } iris_flush_and_dirty_for_history(ice, batch, (struct iris_resource *) info->dst.resource, @@ -503,18 +574,38 @@ get_copy_region_aux_settings(const struct gen_device_info *devinfo, struct iris_resource *res, enum isl_aux_usage *out_aux_usage, - bool *out_clear_supported) + bool *out_clear_supported, + bool is_render_target) { switch (res->aux.usage) { + case ISL_AUX_USAGE_HIZ: + if (!is_render_target && iris_sample_with_depth_aux(devinfo, res)) { + *out_aux_usage = ISL_AUX_USAGE_HIZ; + *out_clear_supported = true; + } else { + *out_aux_usage = ISL_AUX_USAGE_NONE; + *out_clear_supported = false; + } + break; case ISL_AUX_USAGE_MCS: + case ISL_AUX_USAGE_MCS_CCS: case ISL_AUX_USAGE_CCS_E: - *out_aux_usage = res->aux.usage; - /* Prior to Gen9, fast-clear only supported 0/1 clear colors. Since - * we're going to re-interpret the format as an integer format possibly - * with a different number of components, we can't handle clear colors - * until Gen9. + /* A stencil resolve operation must be performed prior to doing resource + * copies or used by CPU. + * (see HSD 1209978162) */ - *out_clear_supported = devinfo->gen >= 9; + if (is_render_target && isl_surf_usage_is_stencil(res->surf.usage)) { + *out_aux_usage = ISL_AUX_USAGE_NONE; + *out_clear_supported = false; + } else { + *out_aux_usage = res->aux.usage; + /* Prior to Gen9, fast-clear only supported 0/1 clear colors. Since + * we're going to re-interpret the format as an integer format possibly + * with a different number of components, we can't handle clear colors + * until Gen9. + */ + *out_clear_supported = devinfo->gen >= 9; + } break; default: *out_aux_usage = ISL_AUX_USAGE_NONE; @@ -551,15 +642,15 @@ enum isl_aux_usage src_aux_usage, dst_aux_usage; bool src_clear_supported, dst_clear_supported; get_copy_region_aux_settings(devinfo, src_res, &src_aux_usage, - &src_clear_supported); + &src_clear_supported, false); get_copy_region_aux_settings(devinfo, dst_res, &dst_aux_usage, - &dst_clear_supported); + &dst_clear_supported, true); if (iris_batch_references(batch, src_res->bo)) - tex_cache_flush_hack(batch); + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); if (dst->target == PIPE_BUFFER) - util_range_add(&dst_res->valid_buffer_range, dstx, dstx + src_box->width); + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width); if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { struct blorp_address src_addr = { @@ -579,10 +670,10 @@ // XXX: what about one surface being a buffer and not the other? struct blorp_surf src_surf, dst_surf; - iris_blorp_surf_for_resource(&ice->vtbl, &src_surf, src, src_aux_usage, - src_level, false); - iris_blorp_surf_for_resource(&ice->vtbl, &dst_surf, dst, dst_aux_usage, - dst_level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &src_surf, + src, src_aux_usage, src_level, false); + iris_blorp_surf_for_resource(&ice->vtbl, &screen->isl_dev, &dst_surf, + dst, dst_aux_usage, dst_level, true); iris_resource_prepare_access(ice, batch, src_res, src_level, 1, src_box->z, src_box->depth, @@ -607,7 +698,7 @@ src_box->depth, dst_aux_usage); } - tex_cache_flush_hack(batch); + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); } static struct iris_batch * @@ -632,44 +723,51 @@ */ static void iris_resource_copy_region(struct pipe_context *ctx, - struct pipe_resource *dst, + struct pipe_resource *p_dst, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, + struct pipe_resource *p_src, unsigned src_level, const struct pipe_box *src_box) { struct iris_context *ice = (void *) ctx; struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + struct iris_resource *src = (void *) p_src; + struct iris_resource *dst = (void *) p_dst; + + if (iris_resource_unfinished_aux_import(src)) + iris_resource_finish_aux_import(ctx->screen, src); + if (iris_resource_unfinished_aux_import(dst)) + iris_resource_finish_aux_import(ctx->screen, dst); /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. */ - if (src->target == PIPE_BUFFER && dst->target == PIPE_BUFFER && + if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER && (src_box->width % 4 == 0) && src_box->width <= 16) { - struct iris_bo *dst_bo = iris_resource_bo(dst); + struct iris_bo *dst_bo = iris_resource_bo(p_dst); batch = get_preferred_batch(ice, dst_bo); iris_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4)); iris_emit_pipe_control_flush(batch, "stall for MI_COPY_MEM_MEM copy_region", PIPE_CONTROL_CS_STALL); - ice->vtbl.copy_mem_mem(batch, dst_bo, dstx, iris_resource_bo(src), + ice->vtbl.copy_mem_mem(batch, dst_bo, dstx, iris_resource_bo(p_src), src_box->x, src_box->width); return; } - iris_copy_region(&ice->blorp, batch, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); + iris_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz, + p_src, src_level, src_box); - if (util_format_is_depth_and_stencil(dst->format) && - util_format_has_stencil(util_format_description(src->format))) { + if (util_format_is_depth_and_stencil(p_dst->format) && + util_format_has_stencil(util_format_description(p_src->format))) { struct iris_resource *junk, *s_src_res, *s_dst_res; - iris_get_depth_stencil_resources(src, &junk, &s_src_res); - iris_get_depth_stencil_resources(dst, &junk, &s_dst_res); + iris_get_depth_stencil_resources(p_src, &junk, &s_src_res); + iris_get_depth_stencil_resources(p_dst, &junk, &s_dst_res); iris_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx, dsty, dstz, &s_src_res->base, src_level, src_box); } - iris_flush_and_dirty_for_history(ice, batch, (struct iris_resource *) dst, + iris_flush_and_dirty_for_history(ice, batch, dst, PIPE_CONTROL_RENDER_TARGET_FLUSH, "cache history: post copy_region"); } diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_blorp.c mesa-20.0.8/src/gallium/drivers/iris/iris_blorp.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_blorp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_blorp.c 2020-06-12 01:21:17.000000000 +0000 @@ -46,12 +46,6 @@ #define BLORP_USE_SOFTPIN #include "blorp/blorp_genX_exec.h" -#if GEN_GEN == 8 -#define MOCS_WB 0x78 -#else -#define MOCS_WB (2 << 1) -#endif - static uint32_t * stream_state(struct iris_batch *batch, struct u_upload_mgr *uploader, @@ -189,7 +183,7 @@ *addr = (struct blorp_address) { .buffer = bo, .offset = offset, - .mocs = MOCS_WB, + .mocs = iris_mocs(bo, &batch->screen->isl_dev), }; return map; @@ -202,8 +196,10 @@ static void blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch, const struct blorp_address *addrs, + UNUSED uint32_t *sizes, unsigned num_vbs) { +#if GEN_GEN < 11 struct iris_context *ice = blorp_batch->blorp->driver_ctx; struct iris_batch *batch = blorp_batch->driver_batch; bool need_invalidate = false; @@ -224,6 +220,7 @@ PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL); } +#endif } static struct blorp_address @@ -244,24 +241,11 @@ */ } -static void -blorp_emit_urb_config(struct blorp_batch *blorp_batch, - unsigned vs_entry_size, - UNUSED unsigned sf_entry_size) +static const struct gen_l3_config * +blorp_get_l3_config(struct blorp_batch *blorp_batch) { - struct iris_context *ice = blorp_batch->blorp->driver_ctx; struct iris_batch *batch = blorp_batch->driver_batch; - - unsigned size[4] = { vs_entry_size, 1, 1, 1 }; - - /* If last VS URB size is good enough for what the BLORP operation needed, - * then we can skip reconfiguration - */ - if (ice->shaders.last_vs_entry_size >= vs_entry_size) - return; - - genX(emit_urb_setup)(ice, batch, size, false, false); - ice->state.dirty |= IRIS_DIRTY_URB; + return batch->screen->l3_config_3d; } static void @@ -307,14 +291,26 @@ iris_require_command_space(batch, 1400); +#if GEN_GEN == 8 + genX(update_pma_fix)(ice, batch, false); +#endif + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; if (ice->state.current_hash_scale != scale) { genX(emit_hashing_mode)(ice, batch, params->x1 - params->x0, params->y1 - params->y0, scale); } +#if GEN_GEN >= 12 + genX(invalidate_aux_map_state)(batch); +#endif + + iris_handle_always_flush_cache(batch); + blorp_exec(blorp_batch, params); + iris_handle_always_flush_cache(batch); + /* We've smashed all state compared to what the normal 3D pipeline * rendering tracks for GL. */ @@ -331,13 +327,29 @@ IRIS_DIRTY_UNCOMPILED_GS | IRIS_DIRTY_UNCOMPILED_FS | IRIS_DIRTY_VF | - IRIS_DIRTY_URB | IRIS_DIRTY_SF_CL_VIEWPORT | IRIS_DIRTY_SAMPLER_STATES_VS | IRIS_DIRTY_SAMPLER_STATES_TCS | IRIS_DIRTY_SAMPLER_STATES_TES | IRIS_DIRTY_SAMPLER_STATES_GS); + if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) { + /* BLORP disabled tessellation, that's fine for the next draw */ + skip_bits |= IRIS_DIRTY_TCS | + IRIS_DIRTY_TES | + IRIS_DIRTY_CONSTANTS_TCS | + IRIS_DIRTY_CONSTANTS_TES | + IRIS_DIRTY_BINDINGS_TCS | + IRIS_DIRTY_BINDINGS_TES; + } + + if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) { + /* BLORP disabled geometry shaders, that's fine for the next draw */ + skip_bits |= IRIS_DIRTY_GS | + IRIS_DIRTY_CONSTANTS_GS | + IRIS_DIRTY_BINDINGS_GS; + } + /* we can skip flagging IRIS_DIRTY_DEPTH_BUFFER, if * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set. */ diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_bufmgr.c mesa-20.0.8/src/gallium/drivers/iris/iris_bufmgr.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_bufmgr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_bufmgr.c 2020-06-12 01:21:17.000000000 +0000 @@ -49,17 +49,21 @@ #include #include #include +#include #include "errno.h" +#include "common/gen_aux_map.h" #include "common/gen_clflush.h" #include "dev/gen_debug.h" #include "common/gen_gem.h" #include "dev/gen_device_info.h" #include "main/macros.h" +#include "os/os_mman.h" #include "util/debug.h" #include "util/macros.h" #include "util/hash_table.h" #include "util/list.h" +#include "util/os_file.h" #include "util/u_dynarray.h" #include "util/vma.h" #include "iris_bufmgr.h" @@ -88,6 +92,17 @@ #define PAGE_SIZE 4096 +#define WARN_ONCE(cond, fmt...) do { \ + if (unlikely(cond)) { \ + static bool _warned = false; \ + if (!_warned) { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, fmt); \ + _warned = true; \ + } \ + } \ +} while (0) + #define FILE_DEBUG_FLAG DEBUG_BUFMGR static inline int @@ -123,7 +138,24 @@ uint64_t size; }; +struct bo_export { + /** File descriptor associated with a handle export. */ + int drm_fd; + + /** GEM handle in drm_fd */ + uint32_t gem_handle; + + struct list_head link; +}; + struct iris_bufmgr { + /** + * List into the list of bufmgr. + */ + struct list_head link; + + uint32_t refcount; + int fd; mtx_t lock; @@ -146,6 +178,14 @@ bool has_llc:1; bool bo_reuse:1; + + struct gen_aux_map_context *aux_map_ctx; +}; + +static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP; +static struct list_head global_bufmgr_list = { + .next = &global_bufmgr_list, + .prev = &global_bufmgr_list, }; static int bo_set_tiling_internal(struct iris_bo *bo, uint32_t tiling_mode, @@ -157,18 +197,6 @@ enum iris_memory_zone memzone, uint64_t size, uint64_t alignment); -static uint32_t -key_hash_uint(const void *key) -{ - return _mesa_hash_data(key, 4); -} - -static bool -key_uint_equal(const void *a, const void *b) -{ - return *((unsigned *) a) == *((unsigned *) b); -} - static struct iris_bo * find_and_ref_external_bo(struct hash_table *ht, unsigned int key) { @@ -346,9 +374,13 @@ bo_calloc(void) { struct iris_bo *bo = calloc(1, sizeof(*bo)); - if (bo) { - bo->hash = _mesa_hash_pointer(bo); - } + if (!bo) + return NULL; + + list_inithead(&bo->exports); + + bo->hash = _mesa_hash_pointer(bo); + return bo; } @@ -392,6 +424,20 @@ if (!bo) return NULL; + if (bo->aux_map_address) { + /* This buffer was associated with an aux-buffer range. We make sure + * that buffers are not reused from the cache while the buffer is (busy) + * being used by an executing batch. Since we are here, the buffer is no + * longer being used by a batch and the buffer was deleted (in order to + * end up in the cache). Therefore its old aux-buffer range can be + * removed from the aux-map. + */ + if (bo->bufmgr->aux_map_ctx) + gen_aux_map_unmap_range(bo->bufmgr->aux_map_ctx, bo->gtt_offset, + bo->size); + bo->aux_map_address = 0; + } + /* If the cached BO isn't in the right memory zone, or the alignment * isn't sufficient, free the old memory and assign it a new address. */ @@ -572,6 +618,7 @@ void *ptr, size_t size, enum iris_memory_zone memzone) { + struct drm_gem_close close = { 0, }; struct iris_bo *bo; bo = bo_calloc(); @@ -617,7 +664,8 @@ return bo; err_close: - gen_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &bo->gem_handle); + close.handle = bo->gem_handle; + gen_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close); err_free: free(bo); return NULL; @@ -689,6 +737,7 @@ bo->tiling_mode = get_tiling.tiling_mode; bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name); @@ -717,6 +766,16 @@ entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); _mesa_hash_table_remove(bufmgr->handle_table, entry); + + list_for_each_entry_safe(struct bo_export, export, &bo->exports, link) { + struct drm_gem_close close = { .handle = export->gem_handle }; + gen_ioctl(export->drm_fd, DRM_IOCTL_GEM_CLOSE, &close); + + list_del(&export->link); + free(export); + } + } else { + assert(list_is_empty(&bo->exports)); } /* Close this object */ @@ -727,6 +786,11 @@ bo->gem_handle, bo->name, strerror(errno)); } + if (bo->aux_map_address && bo->bufmgr->aux_map_ctx) { + gen_aux_map_unmap_range(bo->bufmgr->aux_map_ctx, bo->gtt_offset, + bo->size); + } + /* Return the VMA for reuse */ vma_free(bo->bufmgr, bo->gtt_offset, bo->size); @@ -740,15 +804,15 @@ if (bo->map_cpu && !bo->userptr) { VG_NOACCESS(bo->map_cpu, bo->size); - munmap(bo->map_cpu, bo->size); + os_munmap(bo->map_cpu, bo->size); } if (bo->map_wc) { VG_NOACCESS(bo->map_wc, bo->size); - munmap(bo->map_wc, bo->size); + os_munmap(bo->map_wc, bo->size); } if (bo->map_gtt) { VG_NOACCESS(bo->map_gtt, bo->size); - munmap(bo->map_gtt, bo->size); + os_munmap(bo->map_gtt, bo->size); } if (bo->idle) { @@ -911,7 +975,7 @@ if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) { VG_NOACCESS(map, bo->size); - munmap(map, bo->size); + os_munmap(map, bo->size); } } assert(bo->map_cpu); @@ -973,7 +1037,7 @@ if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) { VG_NOACCESS(map, bo->size); - munmap(map, bo->size); + os_munmap(map, bo->size); } } assert(bo->map_wc); @@ -1031,8 +1095,8 @@ } /* and mmap it. */ - void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, - MAP_SHARED, bufmgr->fd, mmap_arg.offset); + void *map = os_mmap(0, bo->size, PROT_READ | PROT_WRITE, + MAP_SHARED, bufmgr->fd, mmap_arg.offset); if (map == MAP_FAILED) { DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); @@ -1048,7 +1112,7 @@ if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) { VG_NOACCESS(map, bo->size); - munmap(map, bo->size); + os_munmap(map, bo->size); } } assert(bo->map_gtt); @@ -1190,9 +1254,15 @@ return ret; } -void +static void iris_bufmgr_destroy(struct iris_bufmgr *bufmgr) { + /* Free aux-map buffers */ + gen_aux_map_finish(bufmgr->aux_map_ctx); + + /* bufmgr will no longer try to free VMA entries in the aux-map */ + bufmgr->aux_map_ctx = NULL; + mtx_destroy(&bufmgr->lock); /* Free any cached buffer objects we were going to reuse */ @@ -1220,6 +1290,8 @@ util_vma_heap_finish(&bufmgr->vma_allocator[z]); } + close(bufmgr->fd); + free(bufmgr); } @@ -1266,7 +1338,8 @@ } struct iris_bo * -iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd) +iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd, + uint32_t tiling, uint32_t stride) { uint32_t handle; struct iris_bo *bo; @@ -1305,23 +1378,27 @@ bo->size = ret; bo->bufmgr = bufmgr; - - bo->gem_handle = handle; - _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); - bo->name = "prime"; bo->reusable = false; bo->external = true; bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; bo->gtt_offset = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1); + bo->gem_handle = handle; + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle }; if (gen_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) goto err; - bo->tiling_mode = get_tiling.tiling_mode; - bo->swizzle_mode = get_tiling.swizzle_mode; - /* XXX stride is unknown */ + if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) { + bo->tiling_mode = get_tiling.tiling_mode; + bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ + } else { + if (bo_set_tiling_internal(bo, tiling, stride)) { + goto err; + } + } out: mtx_unlock(&bufmgr->lock); @@ -1338,17 +1415,25 @@ { if (!bo->external) { _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo); + /* If a BO is going to be used externally, it could be sent to the + * display HW. So make sure our CPU mappings don't assume cache + * coherency since display is outside that cache. + */ + bo->cache_coherent = false; bo->external = true; + bo->reusable = false; } } -static void +void iris_bo_make_external(struct iris_bo *bo) { struct iris_bufmgr *bufmgr = bo->bufmgr; - if (bo->external) + if (bo->external) { + assert(!bo->reusable); return; + } mtx_lock(&bufmgr->lock); iris_bo_make_external_locked(bo); @@ -1366,8 +1451,6 @@ DRM_CLOEXEC, prime_fd) != 0) return -errno; - bo->reusable = false; - return 0; } @@ -1397,14 +1480,75 @@ _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); } mtx_unlock(&bufmgr->lock); - - bo->reusable = false; } *name = bo->global_name; return 0; } +int +iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd, + uint32_t *out_handle) +{ + /* Only add the new GEM handle to the list of export if it belongs to a + * different GEM device. Otherwise we might close the same buffer multiple + * times. + */ + struct iris_bufmgr *bufmgr = bo->bufmgr; + int ret = os_same_file_description(drm_fd, bufmgr->fd); + WARN_ONCE(ret < 0, + "Kernel has no file descriptor comparison support: %s\n", + strerror(errno)); + if (ret == 0) { + *out_handle = iris_bo_export_gem_handle(bo); + return 0; + } + + struct bo_export *export = calloc(1, sizeof(*export)); + if (!export) + return -ENOMEM; + + export->drm_fd = drm_fd; + + int dmabuf_fd = -1; + int err = iris_bo_export_dmabuf(bo, &dmabuf_fd); + if (err) { + free(export); + return err; + } + + mtx_lock(&bufmgr->lock); + err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle); + close(dmabuf_fd); + if (err) { + mtx_unlock(&bufmgr->lock); + free(export); + return err; + } + + bool found = false; + list_for_each_entry(struct bo_export, iter, &bo->exports, link) { + if (iter->drm_fd != drm_fd) + continue; + /* Here we assume that for a given DRM fd, we'll always get back the + * same GEM handle for a given buffer. + */ + assert(iter->gem_handle == export->gem_handle); + free(export); + export = iter; + found = true; + break; + } + if (!found) + list_addtail(&export->link, &bo->exports); + + mtx_unlock(&bufmgr->lock); + + *out_handle = export->gem_handle; + + return 0; +} + static void add_bucket(struct iris_bufmgr *bufmgr, int size) { @@ -1563,14 +1707,46 @@ return 0; } +static struct gen_buffer * +gen_aux_map_buffer_alloc(void *driver_ctx, uint32_t size) +{ + struct gen_buffer *buf = malloc(sizeof(struct gen_buffer)); + if (!buf) + return NULL; + + struct iris_bufmgr *bufmgr = (struct iris_bufmgr *)driver_ctx; + + struct iris_bo *bo = + iris_bo_alloc_tiled(bufmgr, "aux-map", size, 64 * 1024, + IRIS_MEMZONE_OTHER, I915_TILING_NONE, 0, 0); + + buf->driver_bo = bo; + buf->gpu = bo->gtt_offset; + buf->gpu_end = buf->gpu + bo->size; + buf->map = iris_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + return buf; +} + +static void +gen_aux_map_buffer_free(void *driver_ctx, struct gen_buffer *buffer) +{ + iris_bo_unreference((struct iris_bo*)buffer->driver_bo); + free(buffer); +} + +static struct gen_mapped_pinned_buffer_alloc aux_map_allocator = { + .alloc = gen_aux_map_buffer_alloc, + .free = gen_aux_map_buffer_free, +}; + /** * Initializes the GEM buffer manager, which uses the kernel to allocate, map, * and manage map buffer objections. * * \param fd File descriptor of the opened DRM device. */ -struct iris_bufmgr * -iris_bufmgr_init(struct gen_device_info *devinfo, int fd) +static struct iris_bufmgr * +iris_bufmgr_create(struct gen_device_info *devinfo, int fd, bool bo_reuse) { uint64_t gtt_size = iris_gtt_size(fd); if (gtt_size <= IRIS_MEMZONE_OTHER_START) @@ -1589,9 +1765,12 @@ * Don't do this! Ensure that each library/bufmgr has its own device * fd so that its namespace does not clash with another. */ - bufmgr->fd = fd; + bufmgr->fd = dup(fd); + + p_atomic_set(&bufmgr->refcount, 1); if (mtx_init(&bufmgr->lock, mtx_plain) != 0) { + close(bufmgr->fd); free(bufmgr); return NULL; } @@ -1599,9 +1778,11 @@ list_inithead(&bufmgr->zombie_list); bufmgr->has_llc = devinfo->has_llc; + bufmgr->bo_reuse = bo_reuse; STATIC_ASSERT(IRIS_MEMZONE_SHADER_START == 0ull); const uint64_t _4GB = 1ull << 32; + const uint64_t _2GB = 1ul << 31; /* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */ const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE; @@ -1611,9 +1792,16 @@ util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE], IRIS_MEMZONE_SURFACE_START, _4GB_minus_1 - IRIS_MAX_BINDERS * IRIS_BINDER_SIZE); + /* TODO: Why does limiting to 2GB help some state items on gen12? + * - CC Viewport Pointer + * - Blend State Pointer + * - Color Calc State Pointer + */ + const uint64_t dynamic_pool_size = + (devinfo->gen >= 12 ? _2GB : _4GB_minus_1) - IRIS_BORDER_COLOR_POOL_SIZE; util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC], IRIS_MEMZONE_DYNAMIC_START + IRIS_BORDER_COLOR_POOL_SIZE, - _4GB_minus_1 - IRIS_BORDER_COLOR_POOL_SIZE); + dynamic_pool_size); /* Leave the last 4GB out of the high vma range, so that no state * base address + size can overflow 48 bits. @@ -1622,15 +1810,85 @@ IRIS_MEMZONE_OTHER_START, (gtt_size - _4GB) - IRIS_MEMZONE_OTHER_START); - // XXX: driconf - bufmgr->bo_reuse = env_var_as_boolean("bo_reuse", true); - init_cache_buckets(bufmgr); bufmgr->name_table = - _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); bufmgr->handle_table = - _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); + + if (devinfo->gen >= 12) { + bufmgr->aux_map_ctx = gen_aux_map_init(bufmgr, &aux_map_allocator, + devinfo); + assert(bufmgr->aux_map_ctx); + } return bufmgr; } + +static struct iris_bufmgr * +iris_bufmgr_ref(struct iris_bufmgr *bufmgr) +{ + p_atomic_inc(&bufmgr->refcount); + return bufmgr; +} + +void +iris_bufmgr_unref(struct iris_bufmgr *bufmgr) +{ + mtx_lock(&global_bufmgr_list_mutex); + if (p_atomic_dec_zero(&bufmgr->refcount)) { + list_del(&bufmgr->link); + iris_bufmgr_destroy(bufmgr); + } + mtx_unlock(&global_bufmgr_list_mutex); +} + +/** + * Gets an already existing GEM buffer manager or create a new one. + * + * \param fd File descriptor of the opened DRM device. + */ +struct iris_bufmgr * +iris_bufmgr_get_for_fd(struct gen_device_info *devinfo, int fd, bool bo_reuse) +{ + struct stat st; + + if (fstat(fd, &st)) + return NULL; + + struct iris_bufmgr *bufmgr = NULL; + + mtx_lock(&global_bufmgr_list_mutex); + list_for_each_entry(struct iris_bufmgr, iter_bufmgr, &global_bufmgr_list, link) { + struct stat iter_st; + if (fstat(iter_bufmgr->fd, &iter_st)) + continue; + + if (st.st_rdev == iter_st.st_rdev) { + assert(iter_bufmgr->bo_reuse == bo_reuse); + bufmgr = iris_bufmgr_ref(iter_bufmgr); + goto unlock; + } + } + + bufmgr = iris_bufmgr_create(devinfo, fd, bo_reuse); + list_addtail(&bufmgr->link, &global_bufmgr_list); + + unlock: + mtx_unlock(&global_bufmgr_list_mutex); + + return bufmgr; +} + +int +iris_bufmgr_get_fd(struct iris_bufmgr *bufmgr) +{ + return bufmgr->fd; +} + +void* +iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr) +{ + return bufmgr->aux_map_ctx; +} diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_bufmgr.h mesa-20.0.8/src/gallium/drivers/iris/iris_bufmgr.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_bufmgr.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_bufmgr.h 2020-06-12 01:21:17.000000000 +0000 @@ -28,11 +28,13 @@ #include #include #include +#include "c11/threads.h" #include "util/macros.h" #include "util/u_atomic.h" #include "util/list.h" #include "pipe/p_defines.h" +struct iris_batch; struct gen_device_info; struct pipe_debug_callback; @@ -113,6 +115,11 @@ uint64_t gtt_offset; /** + * If non-zero, then this bo has an aux-map translation to this address. + */ + uint64_t aux_map_address; + + /** * The validation list index for this buffer, or -1 when not in a batch. * Note that a single buffer may be in multiple batches (contexts), and * this is a global field, which refers to the last batch using the BO. @@ -166,6 +173,9 @@ /** BO cache list */ struct list_head head; + /** List of GEM handle exports of this buffer (bo_export) */ + struct list_head exports; + /** * Boolean of whether this buffer can be re-used */ @@ -279,10 +289,11 @@ */ void iris_bo_wait_rendering(struct iris_bo *bo); + /** - * Tears down the buffer manager instance. + * Unref a buffer manager instance. */ -void iris_bufmgr_destroy(struct iris_bufmgr *bufmgr); +void iris_bufmgr_unref(struct iris_bufmgr *bufmgr); /** * Get the current tiling (and resulting swizzling) mode for the bo. @@ -303,6 +314,13 @@ int iris_bo_flink(struct iris_bo *bo, uint32_t *name); /** + * Make a BO externally accessible. + * + * \param bo Buffer to make external + */ +void iris_bo_make_external(struct iris_bo *bo); + +/** * Returns 1 if mapping the buffer for write could cause the process * to block, due to the object being active in the GPU. */ @@ -323,11 +341,15 @@ int iris_bo_madvise(struct iris_bo *bo, int madv); /* drm_bacon_bufmgr_gem.c */ -struct iris_bufmgr *iris_bufmgr_init(struct gen_device_info *devinfo, int fd); +struct iris_bufmgr *iris_bufmgr_get_for_fd(struct gen_device_info *devinfo, int fd, + bool bo_reuse); +int iris_bufmgr_get_fd(struct iris_bufmgr *bufmgr); + struct iris_bo *iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr, const char *name, unsigned handle); -void iris_bufmgr_enable_reuse(struct iris_bufmgr *bufmgr); + +void* iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr); int iris_bo_wait(struct iris_bo *bo, int64_t timeout_ns); @@ -344,7 +366,20 @@ void iris_destroy_hw_context(struct iris_bufmgr *bufmgr, uint32_t ctx_id); int iris_bo_export_dmabuf(struct iris_bo *bo, int *prime_fd); -struct iris_bo *iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd); +struct iris_bo *iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd, + uint32_t tiling, uint32_t stride); + +/** + * Exports a bo as a GEM handle into a given DRM file descriptor + * \param bo Buffer to export + * \param drm_fd File descriptor where the new handle is created + * \param out_handle Pointer to store the new handle + * + * Returns 0 if the buffer was successfully exported, a non zero error code + * otherwise. + */ +int iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd, + uint32_t *out_handle); uint32_t iris_bo_export_gem_handle(struct iris_bo *bo); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_clear.c mesa-20.0.8/src/gallium/drivers/iris/iris_clear.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_clear.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ #include "pipe/p_context.h" #include "pipe/p_screen.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_upload_mgr.h" #include "util/ralloc.h" #include "iris_context.h" @@ -75,6 +75,9 @@ { struct iris_resource *res = (void *) p_res; + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + if (res->aux.usage == ISL_AUX_USAGE_NONE) return false; @@ -302,8 +305,8 @@ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); struct blorp_surf surf; - iris_blorp_surf_for_resource(&ice->vtbl, &surf, p_res, res->aux.usage, - level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, &surf, + p_res, res->aux.usage, level, true); /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of * the clear color during the fast clear, if the surface format is of sRGB @@ -312,6 +315,7 @@ * conversion in convert_fast_clear_color(). */ blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format), + ISL_SWIZZLE_IDENTITY, level, box->z, box->depth, box->x, box->y, box->x + box->width, box->y + box->height); @@ -351,7 +355,7 @@ } if (p_res->target == PIPE_BUFFER) - util_range_add(&res->valid_buffer_range, box->x, box->x + box->width); + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); iris_batch_maybe_flush(batch, 1500); @@ -372,8 +376,8 @@ box->z, box->depth, aux_usage); struct blorp_surf surf; - iris_blorp_surf_for_resource(&ice->vtbl, &surf, p_res, aux_usage, level, - true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, &surf, + p_res, aux_usage, level, true); struct blorp_batch blorp_batch; blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); @@ -404,6 +408,12 @@ float depth) { struct pipe_resource *p_res = (void *) res; + struct pipe_context *ctx = (void *) ice; + struct iris_screen *screen = (void *) ctx->screen; + const struct gen_device_info *devinfo = &screen->devinfo; + + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; /* Check for partial clears */ if (box->x > 0 || box->y > 0 || @@ -415,7 +425,10 @@ if (!(res->aux.has_hiz & (1 << level))) return false; - return true; + return blorp_can_hiz_clear_depth(devinfo, &res->surf, res->aux.usage, + level, box->z, box->x, box->y, + box->x + box->width, + box->y + box->height); } static void @@ -509,7 +522,11 @@ for (unsigned l = 0; l < box->depth; l++) { enum isl_aux_state aux_state = iris_resource_get_aux_state(res, level, box->z + l); - if (aux_state != ISL_AUX_STATE_CLEAR) { + if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) { + if (aux_state == ISL_AUX_STATE_CLEAR) { + perf_debug(&ice->dbg, "Performing HiZ clear just to update the " + "depth clear value\n"); + } iris_hiz_exec(ice, batch, res, level, box->z + l, 1, ISL_AUX_OP_FAST_CLEAR, update_clear_depth); @@ -569,19 +586,23 @@ return; } - if (z_res) { + if (clear_depth && z_res) { iris_resource_prepare_depth(ice, batch, z_res, level, box->z, box->depth); - iris_blorp_surf_for_resource(&ice->vtbl, &z_surf, &z_res->base, - z_res->aux.usage, level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, + &z_surf, &z_res->base, z_res->aux.usage, + level, true); } struct blorp_batch blorp_batch; blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); - if (stencil_res) { - iris_blorp_surf_for_resource(&ice->vtbl, &stencil_surf, - &stencil_res->base, stencil_res->aux.usage, - level, true); + uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0; + if (stencil_mask) { + iris_resource_prepare_access(ice, batch, stencil_res, level, 1, box->z, + box->depth, stencil_res->aux.usage, false); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, + &stencil_surf, &stencil_res->base, + stencil_res->aux.usage, level, true); } blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf, @@ -590,16 +611,21 @@ box->x + box->width, box->y + box->height, clear_depth && z_res, depth, - clear_stencil && stencil_res ? 0xff : 0, stencil); + stencil_mask, stencil); blorp_batch_finish(&blorp_batch); iris_flush_and_dirty_for_history(ice, batch, res, 0, "cache history: post slow ZS clear"); - if (z_res) { + if (clear_depth && z_res) { iris_resource_finish_depth(ice, z_res, level, box->z, box->depth, true); } + + if (stencil_mask) { + iris_resource_finish_write(ice, stencil_res, level, box->z, box->depth, + stencil_res->aux.usage); + } } /** @@ -671,8 +697,12 @@ { struct iris_context *ice = (void *) ctx; struct iris_screen *screen = (void *) ctx->screen; + struct iris_resource *res = (void *) p_res; const struct gen_device_info *devinfo = &screen->devinfo; + if (iris_resource_unfinished_aux_import(res)) + iris_resource_finish_aux_import(ctx->screen, res); + if (util_format_is_depth_or_stencil(p_res->format)) { const struct util_format_description *fmt_desc = util_format_description(p_res->format); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_context.c mesa-20.0.8/src/gallium/drivers/iris/iris_context.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "pipe/p_state.h" #include "util/ralloc.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_upload_mgr.h" #include "drm-uapi/i915_drm.h" #include "iris_context.h" @@ -79,20 +79,17 @@ * batch is one of our context's, so hackily claw our way back. */ struct iris_context *ice = NULL; - struct iris_screen *screen; if (batch->name == IRIS_BATCH_RENDER) { ice = container_of(batch, ice, batches[IRIS_BATCH_RENDER]); assert(&ice->batches[IRIS_BATCH_RENDER] == batch); - screen = (void *) ice->ctx.screen; - ice->vtbl.init_render_context(screen, batch, &ice->vtbl, &ice->dbg); + ice->vtbl.init_render_context(batch); } else if (batch->name == IRIS_BATCH_COMPUTE) { ice = container_of(batch, ice, batches[IRIS_BATCH_COMPUTE]); assert(&ice->batches[IRIS_BATCH_COMPUTE] == batch); - screen = (void *) ice->ctx.screen; - ice->vtbl.init_compute_context(screen, batch, &ice->vtbl, &ice->dbg); + ice->vtbl.init_compute_context(batch); } else { unreachable("unhandled batch reset"); } @@ -101,6 +98,7 @@ ice->state.current_hash_scale = 0; memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); batch->last_surface_base_address = ~0ull; + batch->last_aux_map_state = 0; ice->vtbl.lost_genx_state(ice, batch); } @@ -214,6 +212,9 @@ #define genX_call(devinfo, func, ...) \ switch (devinfo->gen) { \ + case 12: \ + gen12_##func(__VA_ARGS__); \ + break; \ case 11: \ gen11_##func(__VA_ARGS__); \ break; \ @@ -263,14 +264,13 @@ ctx->get_device_reset_status = iris_get_device_reset_status; ctx->get_sample_position = iris_get_sample_position; - ice->shaders.urb_size = devinfo->urb.size; - iris_init_context_fence_functions(ctx); iris_init_blit_functions(ctx); iris_init_clear_functions(ctx); iris_init_program_functions(ctx); iris_init_resource_functions(ctx); iris_init_flush_functions(ctx); + iris_init_perfquery_functions(ctx); iris_init_program_cache(ice); iris_init_border_color_pool(ice); @@ -305,14 +305,11 @@ for (int i = 0; i < IRIS_BATCH_COUNT; i++) { iris_init_batch(&ice->batches[i], screen, &ice->vtbl, &ice->dbg, &ice->reset, ice->state.sizes, - ice->batches, (enum iris_batch_name) i, - I915_EXEC_RENDER, priority); + ice->batches, (enum iris_batch_name) i, priority); } - ice->vtbl.init_render_context(screen, &ice->batches[IRIS_BATCH_RENDER], - &ice->vtbl, &ice->dbg); - ice->vtbl.init_compute_context(screen, &ice->batches[IRIS_BATCH_COMPUTE], - &ice->vtbl, &ice->dbg); + ice->vtbl.init_render_context(&ice->batches[IRIS_BATCH_RENDER]); + ice->vtbl.init_compute_context(&ice->batches[IRIS_BATCH_COMPUTE]); return ctx; } diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_context.h mesa-20.0.8/src/gallium/drivers/iris/iris_context.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -28,6 +28,7 @@ #include "util/u_debug.h" #include "intel/blorp/blorp.h" #include "intel/dev/gen_debug.h" +#include "intel/common/gen_l3_config.h" #include "intel/compiler/brw_compiler.h" #include "iris_batch.h" #include "iris_binder.h" @@ -53,6 +54,11 @@ BRW_PARAM_DOMAIN_IMAGE, }; +enum { + DRI_CONF_BO_REUSE_DISABLED, + DRI_CONF_BO_REUSE_ALL +}; + #define BRW_PARAM(domain, val) (BRW_PARAM_DOMAIN_##domain << 24 | (val)) #define BRW_PARAM_DOMAIN(param) ((uint32_t)(param) >> 24) #define BRW_PARAM_VALUE(param) ((uint32_t)(param) & 0x00ffffff) @@ -105,6 +111,7 @@ #define IRIS_DIRTY_FS (1ull << 32) #define IRIS_DIRTY_CS (1ull << 33) #define IRIS_DIRTY_URB (1ull << 34) +#define IRIS_SHIFT_FOR_DIRTY_CONSTANTS 35 #define IRIS_DIRTY_CONSTANTS_VS (1ull << 35) #define IRIS_DIRTY_CONSTANTS_TCS (1ull << 36) #define IRIS_DIRTY_CONSTANTS_TES (1ull << 37) @@ -128,6 +135,9 @@ #define IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 55) #define IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 56) #define IRIS_DIRTY_VF_STATISTICS (1ull << 57) +#define IRIS_DIRTY_PMA_FIX (1ull << 58) +#define IRIS_DIRTY_DEPTH_BOUNDS (1ull << 59) +#define IRIS_DIRTY_RENDER_BUFFER (1ull << 60) #define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_CS | \ IRIS_DIRTY_SAMPLER_STATES_CS | \ @@ -143,7 +153,8 @@ IRIS_DIRTY_BINDINGS_TES | \ IRIS_DIRTY_BINDINGS_GS | \ IRIS_DIRTY_BINDINGS_FS | \ - IRIS_DIRTY_BINDINGS_CS) + IRIS_DIRTY_BINDINGS_CS | \ + IRIS_DIRTY_RENDER_BUFFER) /** * Non-orthogonal state (NOS) dependency flags. @@ -164,6 +175,78 @@ IRIS_NOS_COUNT, }; +/** @{ + * + * Program cache keys for state based recompiles. + */ + +struct iris_base_prog_key { + unsigned program_string_id; +}; + +struct iris_vue_prog_key { + struct iris_base_prog_key base; + + unsigned nr_userclip_plane_consts:4; +}; + +struct iris_vs_prog_key { + struct iris_vue_prog_key vue; +}; + +struct iris_tcs_prog_key { + struct iris_vue_prog_key vue; + + uint16_t tes_primitive_mode; + + uint8_t input_vertices; + + bool quads_workaround; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + /** A bitfield of per-vertex outputs written. */ + uint64_t outputs_written; +}; + +struct iris_tes_prog_key { + struct iris_vue_prog_key vue; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; +}; + +struct iris_gs_prog_key { + struct iris_vue_prog_key vue; +}; + +struct iris_fs_prog_key { + struct iris_base_prog_key base; + + unsigned nr_color_regions:5; + bool flat_shade:1; + bool alpha_test_replicate_alpha:1; + bool alpha_to_coverage:1; + bool clamp_fragment_color:1; + bool persample_interp:1; + bool multisample_fbo:1; + bool force_dual_color_blend:1; + bool coherent_fb_fetch:1; + + uint8_t color_outputs_valid; + uint64_t input_slots_valid; +}; + +struct iris_cs_prog_key { + struct iris_base_prog_key base; +}; + +/** @} */ + struct iris_depth_stencil_alpha_state; /** @@ -214,6 +297,8 @@ PIPE_CONTROL_STATE_CACHE_INVALIDATE = (1 << 22), PIPE_CONTROL_STALL_AT_SCOREBOARD = (1 << 23), PIPE_CONTROL_DEPTH_CACHE_FLUSH = (1 << 24), + PIPE_CONTROL_TILE_CACHE_FLUSH = (1 << 25), + PIPE_CONTROL_FLUSH_HDC = (1 << 26), }; #define PIPE_CONTROL_CACHE_FLUSH_BITS \ @@ -274,6 +359,8 @@ /** Should we use ALT mode for math? Useful for ARB programs. */ bool use_alt_mode; + bool needs_edge_flag; + /** Constant data scraped from the shader by nir_opt_large_constants */ struct pipe_resource *const_data; @@ -408,14 +495,8 @@ */ struct iris_vtable { void (*destroy_state)(struct iris_context *ice); - void (*init_render_context)(struct iris_screen *screen, - struct iris_batch *batch, - struct iris_vtable *vtbl, - struct pipe_debug_callback *dbg); - void (*init_compute_context)(struct iris_screen *screen, - struct iris_batch *batch, - struct iris_vtable *vtbl, - struct pipe_debug_callback *dbg); + void (*init_render_context)(struct iris_batch *batch); + void (*init_compute_context)(struct iris_batch *batch); void (*upload_render_state)(struct iris_context *ice, struct iris_batch *batch, const struct pipe_draw_info *draw); @@ -425,8 +506,7 @@ struct iris_batch *batch, const struct pipe_grid_info *grid); void (*rebind_buffer)(struct iris_context *ice, - struct iris_resource *res, - uint64_t old_address); + struct iris_resource *res); void (*resolve_conditional_render)(struct iris_context *ice); void (*load_register_reg32)(struct iris_batch *batch, uint32_t dst, uint32_t src); @@ -475,23 +555,22 @@ void (*populate_vs_key)(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_vs_prog_key *key); + struct iris_vs_prog_key *key); void (*populate_tcs_key)(const struct iris_context *ice, - struct brw_tcs_prog_key *key); + struct iris_tcs_prog_key *key); void (*populate_tes_key)(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_tes_prog_key *key); + struct iris_tes_prog_key *key); void (*populate_gs_key)(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_gs_prog_key *key); + struct iris_gs_prog_key *key); void (*populate_fs_key)(const struct iris_context *ice, const struct shader_info *info, - struct brw_wm_prog_key *key); + struct iris_fs_prog_key *key); void (*populate_cs_key)(const struct iris_context *ice, - struct brw_cs_prog_key *key); - uint32_t (*mocs)(const struct iris_bo *bo); + struct iris_cs_prog_key *key); void (*lost_genx_state)(struct iris_context *ice, struct iris_batch *batch); }; @@ -545,12 +624,18 @@ } params; /** + * Are the above values the ones stored in the draw_params buffer? + * If so, we can compare them against new values to see if anything + * changed. If not, we need to assume they changed. + */ + bool params_valid; + + /** * Resource and offset that stores draw_parameters from the indirect * buffer or to the buffer that stures the previous values for non * indirect draws. */ - struct pipe_resource *draw_params_res; - uint32_t draw_params_offset; + struct iris_state_ref draw_params; struct { /** @@ -571,10 +656,7 @@ * contains parameters that are not present in the indirect buffer as * drawid and is_indexed_draw. They will go in their own vertex element. */ - struct pipe_resource *derived_draw_params_res; - uint32_t derived_draw_params_offset; - - bool is_indirect; + struct iris_state_ref derived_draw_params; } draw; struct { @@ -585,14 +667,9 @@ struct u_upload_mgr *uploader; struct hash_table *cache; - unsigned urb_size; - /** Is a GS or TES outputting points or lines? */ bool output_topology_is_points_or_lines; - /* Track last VS URB entry size */ - unsigned last_vs_entry_size; - /** * Scratch buffers for various sizes and stages. * @@ -654,6 +731,8 @@ */ enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS]; + enum gen_urb_deref_block_size urb_deref_block_size; + /** Bitfield of whether color blending is enabled for RT[i] */ uint8_t blend_enables; @@ -758,6 +837,7 @@ void iris_init_clear_functions(struct pipe_context *ctx); void iris_init_program_functions(struct pipe_context *ctx); void iris_init_resource_functions(struct pipe_context *ctx); +void iris_init_perfquery_functions(struct pipe_context *ctx); void iris_update_compiled_shaders(struct iris_context *ice); void iris_update_compiled_compute_shader(struct iris_context *ice); void iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, @@ -766,6 +846,7 @@ /* iris_blit.c */ void iris_blorp_surf_for_resource(struct iris_vtable *vtbl, + struct isl_device *isl_dev, struct blorp_surf *surf, struct pipe_resource *p_res, enum isl_aux_usage aux_usage, @@ -795,6 +876,11 @@ uint64_t imm); void iris_emit_end_of_pipe_sync(struct iris_batch *batch, const char *reason, uint32_t flags); +void iris_flush_all_caches(struct iris_batch *batch); + +#define iris_handle_always_flush_cache(batch) \ + if (unlikely(batch->screen->driconf.always_flush_cache)) \ + iris_flush_all_caches(batch); void iris_init_flush_functions(struct pipe_context *ctx); @@ -938,6 +1024,9 @@ # define genX(x) gen11_##x # include "iris_genx_protos.h" # undef genX +# define genX(x) gen12_##x +# include "iris_genx_protos.h" +# undef genX #endif #endif diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_defines.h mesa-20.0.8/src/gallium/drivers/iris/iris_defines.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_defines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_defines.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,4 +55,14 @@ /* The number of bits in our TIMESTAMP queries. */ #define TIMESTAMP_BITS 36 +/* For gen12 we set the streamout buffers using 4 separate commands + * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout + * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of + * 3DSTATE_SO_BUFFER apart from the SOBufferIndex field, so for now we use the + * 3DSTATE_SO_BUFFER command, but change the 3DCommandSubOpcode. + * SO_BUFFER_INDEX_0_CMD is actually the 3DCommandSubOpcode for + * 3DSTATE_SO_BUFFER_INDEX_0. + */ +#define SO_BUFFER_INDEX_0_CMD 0x60 + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_disk_cache.c mesa-20.0.8/src/gallium/drivers/iris/iris_disk_cache.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_disk_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_disk_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,8 +31,8 @@ #include #include -#include "compiler/blob.h" #include "compiler/nir/nir.h" +#include "util/blob.h" #include "util/build_id.h" #include "util/disk_cache.h" #include "util/mesa-sha1.h" diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_draw.c mesa-20.0.8/src/gallium/drivers/iris/iris_draw.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -113,35 +113,56 @@ iris_update_draw_parameters(struct iris_context *ice, const struct pipe_draw_info *info) { - if (info->indirect) { - pipe_resource_reference(&ice->draw.draw_params_res, - info->indirect->buffer); - ice->draw.draw_params_offset = info->indirect->offset + - (info->index_size ? 12 : 8); - ice->draw.params.firstvertex = 0; - ice->draw.params.baseinstance = 0; - ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | - IRIS_DIRTY_VERTEX_ELEMENTS | - IRIS_DIRTY_VF_SGVS; - } else if (ice->draw.is_indirect || - ice->draw.params.firstvertex != - (info->index_size ? info->index_bias : info->start) || - (ice->draw.params.baseinstance != info->start_instance)) { - pipe_resource_reference(&ice->draw.draw_params_res, NULL); - ice->draw.draw_params_offset = 0; - ice->draw.params.firstvertex = - info->index_size ? info->index_bias : info->start; - ice->draw.params.baseinstance = info->start_instance; - ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | - IRIS_DIRTY_VERTEX_ELEMENTS | - IRIS_DIRTY_VF_SGVS; + bool changed = false; + + if (ice->state.vs_uses_draw_params) { + struct iris_state_ref *draw_params = &ice->draw.draw_params; + + if (info->indirect) { + pipe_resource_reference(&draw_params->res, info->indirect->buffer); + draw_params->offset = + info->indirect->offset + (info->index_size ? 12 : 8); + + changed = true; + ice->draw.params_valid = false; + } else { + int firstvertex = info->index_size ? info->index_bias : info->start; + + if (!ice->draw.params_valid || + ice->draw.params.firstvertex != firstvertex || + ice->draw.params.baseinstance != info->start_instance) { + + changed = true; + ice->draw.params.firstvertex = firstvertex; + ice->draw.params.baseinstance = info->start_instance; + ice->draw.params_valid = true; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.params), 4, &ice->draw.params, + &draw_params->offset, &draw_params->res); + } + } + } + + if (ice->state.vs_uses_derived_draw_params) { + struct iris_state_ref *derived_params = &ice->draw.derived_draw_params; + int is_indexed_draw = info->index_size ? -1 : 0; + + if (ice->draw.derived_params.drawid != info->drawid || + ice->draw.derived_params.is_indexed_draw != is_indexed_draw) { + + changed = true; + ice->draw.derived_params.drawid = info->drawid; + ice->draw.derived_params.is_indexed_draw = is_indexed_draw; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.derived_params), 4, + &ice->draw.derived_params, + &derived_params->offset, &derived_params->res); + } } - ice->draw.is_indirect = info->indirect; - if (ice->draw.derived_params.drawid != info->drawid || - ice->draw.derived_params.is_indexed_draw != (info->index_size ? ~0 : 0)) { - ice->draw.derived_params.drawid = info->drawid; - ice->draw.derived_params.is_indexed_draw = info->index_size ? ~0 : 0; + if (changed) { ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | IRIS_DIRTY_VERTEX_ELEMENTS | IRIS_DIRTY_VF_SGVS; @@ -241,11 +262,15 @@ ice->vtbl.update_surface_base_address(batch, &ice->state.binder); + iris_handle_always_flush_cache(batch); + if (info->indirect) iris_indirect_draw_vbo(ice, info); else iris_simple_draw_vbo(ice, info); + iris_handle_always_flush_cache(batch); + iris_postdraw_update_resolve_tracking(ice, batch); ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_RENDER; @@ -301,7 +326,7 @@ .size_B = sizeof(grid->grid), .format = ISL_FORMAT_RAW, .stride_B = 1, - .mocs = ice->vtbl.mocs(grid_bo)); + .mocs = iris_mocs(grid_bo, isl_dev)); ice->state.dirty |= IRIS_DIRTY_BINDINGS_CS; } @@ -328,8 +353,7 @@ iris_batch_maybe_flush(batch, 1500); - if (ice->state.dirty & IRIS_DIRTY_UNCOMPILED_CS) - iris_update_compiled_compute_shader(ice); + iris_update_compiled_compute_shader(ice); iris_update_grid_size_resource(ice, grid); @@ -342,8 +366,12 @@ ice->state.compute_predicate = NULL; } + iris_handle_always_flush_cache(batch); + ice->vtbl.upload_compute_state(ice, batch, grid); + iris_handle_always_flush_cache(batch); + ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_COMPUTE; /* Note: since compute shaders can't access the framebuffer, there's diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_fence.c mesa-20.0.8/src/gallium/drivers/iris/iris_fence.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -184,6 +184,8 @@ iris_syncpt_reference(screen, &fence->syncpt[fence->count++], ice->batches[b].last_syncpt); } + + iris_fence_reference(ctx->screen, out_fence, NULL); *out_fence = fence; } @@ -298,6 +300,22 @@ struct iris_screen *screen = (struct iris_screen *)p_screen; int fd = -1; + if (fence->count == 0) { + /* Our fence has no syncobj's recorded. This means that all of the + * batches had already completed, their syncobj's had been signalled, + * and so we didn't bother to record them. But we're being asked to + * export such a fence. So export a dummy already-signalled syncobj. + */ + struct drm_syncobj_handle args = { + .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE, .fd = -1, + }; + + args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED); + gen_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args); + gem_syncobj_destroy(screen->fd, args.handle); + return args.fd; + } + for (unsigned i = 0; i < fence->count; i++) { struct drm_syncobj_handle args = { .handle = fence->syncpt[i]->handle, @@ -322,10 +340,17 @@ struct iris_screen *screen = (struct iris_screen *)ctx->screen; struct drm_syncobj_handle args = { + .handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED), .flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE, .fd = fd, }; - gen_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args); + if (gen_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) { + fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n", + strerror(errno)); + gem_syncobj_destroy(screen->fd, args.handle); + *out = NULL; + return; + } struct iris_syncpt *syncpt = malloc(sizeof(*syncpt)); syncpt->handle = args.handle; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_formats.c mesa-20.0.8/src/gallium/drivers/iris/iris_formats.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_formats.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/bitscan.h" #include "util/macros.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "iris_resource.h" #include "iris_screen.h" @@ -283,6 +283,8 @@ [PIPE_FORMAT_ETC2_RG11_UNORM] = ISL_FORMAT_EAC_RG11, [PIPE_FORMAT_ETC2_RG11_SNORM] = ISL_FORMAT_EAC_SIGNED_RG11, + [PIPE_FORMAT_FXT1_RGB] = ISL_FORMAT_FXT1, + [PIPE_FORMAT_FXT1_RGBA] = ISL_FORMAT_FXT1, [PIPE_FORMAT_ASTC_4x4] = ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16, [PIPE_FORMAT_ASTC_5x4] = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16, @@ -336,9 +338,13 @@ isl_surf_usage_flags_t usage) { enum isl_format format = iris_isl_format_for_pipe_format(pformat); - const struct isl_format_layout *fmtl = isl_format_get_layout(format); struct isl_swizzle swizzle = ISL_SWIZZLE_IDENTITY; + if (format == ISL_FORMAT_UNSUPPORTED) + return (struct iris_format_info) { .fmt = format, .swizzle = swizzle }; + + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + if (!util_format_is_srgb(pformat)) { if (util_format_is_intensity(pformat)) { swizzle = ISL_SWIZZLE(RED, RED, RED, RED); @@ -479,15 +485,17 @@ if (!is_integer) supported &= isl_format_supports_filtering(devinfo, format); - /* Don't advertise 3-component RGB formats. This ensures that they - * are renderable from an API perspective since the state tracker will - * fall back to RGBA or RGBX, which are renderable. We want to render - * internally for copies and blits, even if the application doesn't. + /* Don't advertise 3-component RGB formats for non-buffer textures. + * This ensures that they are renderable from an API perspective since + * the state tracker will fall back to RGBA or RGBX, which are + * renderable. We want to render internally for copies and blits, + * even if the application doesn't. * - * We do need to advertise 32-bit RGB for texture buffers though. + * Buffer textures don't need to be renderable, so we support real RGB. + * This is useful for PBO upload, and 32-bit RGB support is mandatory. */ - supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && - (fmtl->bpb != 96 || target == PIPE_BUFFER); + if (target != PIPE_BUFFER) + supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96; } if (usage & PIPE_BIND_VERTEX_BUFFER) @@ -499,6 +507,14 @@ format == ISL_FORMAT_R32_UINT; } + /* TODO: Support ASTC 5x5 on Gen9 properly. This means implementing + * a complex sampler workaround (see i965's gen9_apply_astc5x5_wa_flush). + * Without it, st/mesa will emulate ASTC 5x5 via uncompressed textures. + */ + if (devinfo->gen == 9 && (format == ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16 || + format == ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB)) + return false; + return supported; } diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_genx_macros.h mesa-20.0.8/src/gallium/drivers/iris/iris_genx_macros.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_genx_macros.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_genx_macros.h 2020-06-12 01:21:17.000000000 +0000 @@ -88,12 +88,15 @@ #define iris_pack_command(cmd, dst, name) \ _iris_pack_command(NULL, cmd, dst, name) -#define iris_pack_state(cmd, dst, name) \ +#define _iris_pack_state(batch, cmd, dst, name) \ for (struct cmd name = {}, \ *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \ - __genxml_cmd_pack(cmd)(NULL, (void *)_dst, &name), \ + __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name), \ _dst = NULL) +#define iris_pack_state(cmd, dst, name) \ + _iris_pack_state(NULL, cmd, dst, name) + #define iris_emit_cmd(batch, cmd, name) \ _iris_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name) diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_genx_protos.h mesa-20.0.8/src/gallium/drivers/iris/iris_genx_protos.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_genx_protos.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_genx_protos.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,14 +29,15 @@ /* iris_state.c */ void genX(init_state)(struct iris_context *ice); -void genX(emit_urb_setup)(struct iris_context *ice, - struct iris_batch *batch, - const unsigned size[4], - bool tess_present, bool gs_present); void genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch, unsigned width, unsigned height, unsigned scale); +void genX(update_pma_fix)(struct iris_context *ice, + struct iris_batch *batch, + bool enable); + +void genX(invalidate_aux_map_state)(struct iris_batch *batch); /* iris_blorp.c */ void genX(init_blorp)(struct iris_context *ice); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_monitor.c mesa-20.0.8/src/gallium/drivers/iris/iris_monitor.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_monitor.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_monitor.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,8 +26,7 @@ #include "iris_screen.h" #include "iris_context.h" - -#include "perf/gen_perf.h" +#include "iris_perf.h" struct iris_monitor_object { int num_active_counters; @@ -94,78 +93,6 @@ return 1; } -typedef void (*bo_unreference_t)(void *); -typedef void *(*bo_map_t)(void *, void *, unsigned flags); -typedef void (*bo_unmap_t)(void *); -typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t); -typedef void (*emit_mi_flush_t)(void *); -typedef void (*capture_frequency_stat_register_t)(void *, void *, - uint32_t ); -typedef void (*store_register_mem64_t)(void *ctx, void *bo, - uint32_t reg, uint32_t offset); -typedef bool (*batch_references_t)(void *batch, void *bo); -typedef void (*bo_wait_rendering_t)(void *bo); -typedef int (*bo_busy_t)(void *bo); - -static void * -iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size) -{ - return iris_bo_alloc(bufmgr, name, size, IRIS_MEMZONE_OTHER); -} - -static void -iris_monitor_emit_mi_flush(struct iris_context *ice) -{ - const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH | - PIPE_CONTROL_INSTRUCTION_INVALIDATE | - PIPE_CONTROL_CONST_CACHE_INVALIDATE | - PIPE_CONTROL_DATA_CACHE_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_VF_CACHE_INVALIDATE | - PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | - PIPE_CONTROL_CS_STALL; - iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER], - "OA metrics", flags); -} - -static void -iris_monitor_emit_mi_report_perf_count(void *c, - void *bo, - uint32_t offset_in_bytes, - uint32_t report_id) -{ - struct iris_context *ice = c; - struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; - ice->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id); -} - -static void -iris_monitor_batchbuffer_flush(void *c, const char *file, int line) -{ - struct iris_context *ice = c; - _iris_batch_flush(&ice->batches[IRIS_BATCH_RENDER], __FILE__, __LINE__); -} - -static void -iris_monitor_capture_frequency_stat_register(void *ctx, - void *bo, - uint32_t bo_offset) -{ - struct iris_context *ice = ctx; - struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; - ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false); -} - -static void -iris_monitor_store_register_mem64(void *ctx, void *bo, - uint32_t reg, uint32_t offset) -{ - struct iris_context *ice = ctx; - struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; - ice->vtbl.store_register_mem64(batch, reg, bo, offset, false); -} - - static bool iris_monitor_init_metrics(struct iris_screen *screen) { @@ -180,23 +107,7 @@ monitor_cfg->perf_cfg = perf_cfg; - perf_cfg->vtbl.bo_alloc = iris_oa_bo_alloc; - perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference; - perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map; - perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap; - perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)iris_monitor_emit_mi_flush; - - perf_cfg->vtbl.emit_mi_report_perf_count = - (emit_mi_report_t)iris_monitor_emit_mi_report_perf_count; - perf_cfg->vtbl.batchbuffer_flush = iris_monitor_batchbuffer_flush; - perf_cfg->vtbl.capture_frequency_stat_register = - (capture_frequency_stat_register_t) iris_monitor_capture_frequency_stat_register; - perf_cfg->vtbl.store_register_mem64 = - (store_register_mem64_t) iris_monitor_store_register_mem64; - perf_cfg->vtbl.batch_references = (batch_references_t)iris_batch_references; - perf_cfg->vtbl.bo_wait_rendering = - (bo_wait_rendering_t)iris_bo_wait_rendering; - perf_cfg->vtbl.bo_busy = (bo_busy_t)iris_bo_busy; + iris_perf_init_vtbl(perf_cfg); gen_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd); screen->monitor_cfg = monitor_cfg; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_perf.c mesa-20.0.8/src/gallium/drivers/iris/iris_perf.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_perf.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_perf.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,104 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "iris_perf.h" +#include "iris_context.h" + +static void * +iris_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size) +{ + return iris_bo_alloc(bufmgr, name, size, IRIS_MEMZONE_OTHER); +} + +static void +iris_perf_emit_stall_at_pixel_scoreboard(struct iris_context *ice) +{ + iris_emit_end_of_pipe_sync(&ice->batches[IRIS_BATCH_RENDER], + "OA metrics", + PIPE_CONTROL_STALL_AT_SCOREBOARD); +} + +static void +iris_perf_emit_mi_report_perf_count(void *c, + void *bo, + uint32_t offset_in_bytes, + uint32_t report_id) +{ + struct iris_context *ice = c; + struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + ice->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id); +} + +static void +iris_perf_batchbuffer_flush(void *c, const char *file, int line) +{ + struct iris_context *ice = c; + _iris_batch_flush(&ice->batches[IRIS_BATCH_RENDER], __FILE__, __LINE__); +} + +static void +iris_perf_store_register_mem(void *ctx, void *bo, + uint32_t reg, uint32_t reg_size, + uint32_t offset) +{ + struct iris_context *ice = ctx; + struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + if (reg_size == 8) { + ice->vtbl.store_register_mem64(batch, reg, bo, offset, false); + } else { + assert(reg_size == 4); + ice->vtbl.store_register_mem32(batch, reg, bo, offset, false); + } +} + +typedef void (*bo_unreference_t)(void *); +typedef void *(*bo_map_t)(void *, void *, unsigned flags); +typedef void (*bo_unmap_t)(void *); +typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t); +typedef void (*emit_mi_flush_t)(void *); +typedef void (*store_register_mem_t)(void *ctx, void *bo, + uint32_t reg, uint32_t reg_size, + uint32_t offset); +typedef bool (*batch_references_t)(void *batch, void *bo); +typedef void (*bo_wait_rendering_t)(void *bo); +typedef int (*bo_busy_t)(void *bo); + +void +iris_perf_init_vtbl(struct gen_perf_config *perf_cfg) +{ + perf_cfg->vtbl.bo_alloc = iris_oa_bo_alloc; + perf_cfg->vtbl.bo_unreference = (bo_unreference_t)iris_bo_unreference; + perf_cfg->vtbl.bo_map = (bo_map_t)iris_bo_map; + perf_cfg->vtbl.bo_unmap = (bo_unmap_t)iris_bo_unmap; + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard = + (emit_mi_flush_t)iris_perf_emit_stall_at_pixel_scoreboard; + + perf_cfg->vtbl.emit_mi_report_perf_count = + (emit_mi_report_t)iris_perf_emit_mi_report_perf_count; + perf_cfg->vtbl.batchbuffer_flush = iris_perf_batchbuffer_flush; + perf_cfg->vtbl.store_register_mem = + (store_register_mem_t) iris_perf_store_register_mem; + perf_cfg->vtbl.batch_references = (batch_references_t)iris_batch_references; + perf_cfg->vtbl.bo_wait_rendering = + (bo_wait_rendering_t)iris_bo_wait_rendering; + perf_cfg->vtbl.bo_busy = (bo_busy_t)iris_bo_busy; +} diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_perf.h mesa-20.0.8/src/gallium/drivers/iris/iris_perf.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_perf.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_perf.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,31 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef IRIS_PERF_H +#define IRIS_PERF_H + +#include "perf/gen_perf.h" +#include "perf/gen_perf_query.h" + +void iris_perf_init_vtbl(struct gen_perf_config *cfg); + +#endif /* IRIS_PERF_H */ diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_performance_query.c mesa-20.0.8/src/gallium/drivers/iris/iris_performance_query.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_performance_query.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_performance_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,233 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "iris_context.h" +#include "iris_perf.h" + +struct iris_perf_query { + struct gl_perf_query_object base; + struct gen_perf_query_object *query; +}; + +static unsigned +iris_init_perf_query_info(struct pipe_context *pipe) +{ + struct iris_context *ice = (void *) pipe; + struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen; + struct gen_perf_config *perf_cfg = NULL; + + /* make sure pipe perf counter type/data-type enums are matched with gen_perf's */ + STATIC_ASSERT(PIPE_PERF_COUNTER_TYPE_EVENT == (enum pipe_perf_counter_type)GEN_PERF_COUNTER_TYPE_EVENT); + STATIC_ASSERT(PIPE_PERF_COUNTER_TYPE_DURATION_NORM == (enum pipe_perf_counter_type)GEN_PERF_COUNTER_TYPE_DURATION_NORM); + STATIC_ASSERT(PIPE_PERF_COUNTER_TYPE_DURATION_RAW == (enum pipe_perf_counter_type)GEN_PERF_COUNTER_TYPE_DURATION_RAW); + STATIC_ASSERT(PIPE_PERF_COUNTER_TYPE_THROUGHPUT == (enum pipe_perf_counter_type)GEN_PERF_COUNTER_TYPE_THROUGHPUT); + STATIC_ASSERT(PIPE_PERF_COUNTER_TYPE_RAW == (enum pipe_perf_counter_type)GEN_PERF_COUNTER_TYPE_RAW); + + STATIC_ASSERT(PIPE_PERF_COUNTER_DATA_TYPE_BOOL32 == (enum pipe_perf_counter_data_type)GEN_PERF_COUNTER_DATA_TYPE_BOOL32); + STATIC_ASSERT(PIPE_PERF_COUNTER_DATA_TYPE_UINT32 == (enum pipe_perf_counter_data_type)GEN_PERF_COUNTER_DATA_TYPE_UINT32); + STATIC_ASSERT(PIPE_PERF_COUNTER_DATA_TYPE_UINT64 == (enum pipe_perf_counter_data_type)GEN_PERF_COUNTER_DATA_TYPE_UINT64); + STATIC_ASSERT(PIPE_PERF_COUNTER_DATA_TYPE_FLOAT == (enum pipe_perf_counter_data_type)GEN_PERF_COUNTER_DATA_TYPE_FLOAT); + STATIC_ASSERT(PIPE_PERF_COUNTER_DATA_TYPE_DOUBLE == (enum pipe_perf_counter_data_type)GEN_PERF_COUNTER_DATA_TYPE_DOUBLE); + + if (!ice->perf_ctx) + ice->perf_ctx = gen_perf_new_context(ice); + + if (unlikely(!ice->perf_ctx)) + return 0; + + perf_cfg = gen_perf_config(ice->perf_ctx); + + if (perf_cfg) + return perf_cfg->n_queries; + + perf_cfg = gen_perf_new(ice->perf_ctx); + + iris_perf_init_vtbl(perf_cfg); + + gen_perf_init_context(ice->perf_ctx, + perf_cfg, + ice, + screen->bufmgr, + &screen->devinfo, + ice->batches[IRIS_BATCH_RENDER].hw_ctx_id, + screen->fd); + + gen_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd); + + return perf_cfg->n_queries; +} + +static struct pipe_query * +iris_new_perf_query_obj(struct pipe_context *pipe, unsigned query_index) +{ + struct iris_context *ice = (void *) pipe; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + struct gen_perf_query_object * obj = gen_perf_new_query(perf_ctx, query_index); + if (unlikely(!obj)) + return NULL; + + struct iris_perf_query *q = calloc(1, sizeof(struct iris_perf_query)); + if (unlikely(!q)) { + gen_perf_delete_query(perf_ctx, obj); + return NULL; + } + + q->query = obj; + return (struct pipe_query *)&q->base; +} + +static void +iris_begin_perf_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query= (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + gen_perf_begin_query(perf_ctx, obj); +} + +static void +iris_end_perf_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query = (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + gen_perf_end_query(perf_ctx, obj); +} + +static void +iris_delete_perf_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query = (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + gen_perf_delete_query(perf_ctx, obj); + free(q); +} + +static void +iris_get_perf_query_info(struct pipe_context *pipe, + unsigned query_index, + const char **name, + uint32_t *data_size, + uint32_t *n_counters, + uint32_t *n_active) +{ + struct iris_context *ice = (void *) pipe; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + struct gen_perf_config *perf_cfg = gen_perf_config(perf_ctx); + const struct gen_perf_query_info *info = &perf_cfg->queries[query_index]; + + *name = info->name; + *data_size = info->data_size; + *n_counters = info->n_counters; + *n_active = gen_perf_active_queries(perf_ctx, info); +} + +static void +iris_get_perf_counter_info(struct pipe_context *pipe, + unsigned query_index, + unsigned counter_index, + const char **name, + const char **desc, + uint32_t *offset, + uint32_t *data_size, + uint32_t *type_enum, + uint32_t *data_type_enum, + uint64_t *raw_max) +{ + struct iris_context *ice = (void *) pipe; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + struct gen_perf_config *perf_cfg = gen_perf_config(perf_ctx); + const struct gen_perf_query_info *info = &perf_cfg->queries[query_index]; + const struct gen_perf_query_counter *counter = &info->counters[counter_index]; + + *name = counter->name; + *desc = counter->desc; + *offset = counter->offset; + *data_size = gen_perf_query_counter_get_size(counter); + *type_enum = counter->type; + *data_type_enum = counter->data_type; + *raw_max = counter->raw_max; +} + +static void +iris_wait_perf_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query = (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + gen_perf_wait_query(perf_ctx, obj, &ice->batches[IRIS_BATCH_RENDER]); +} + +static bool +iris_is_perf_query_ready(struct pipe_context *pipe, struct pipe_query *q) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query = (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + if (perf_query->base.Ready) + return true; + + return gen_perf_is_query_ready(perf_ctx, obj, &ice->batches[IRIS_BATCH_RENDER]); +} + +static void +iris_get_perf_query_data(struct pipe_context *pipe, + struct pipe_query *q, + size_t data_size, + uint32_t *data, + uint32_t *bytes_written) +{ + struct iris_context *ice = (void *) pipe; + struct iris_perf_query *perf_query = (struct iris_perf_query *) q; + struct gen_perf_query_object *obj = perf_query->query; + struct gen_perf_context *perf_ctx = ice->perf_ctx; + + gen_perf_get_query_data(perf_ctx, obj, data_size, data, bytes_written); +} + +void +iris_init_perfquery_functions(struct pipe_context *ctx) +{ + ctx->init_intel_perf_query_info = iris_init_perf_query_info; + ctx->get_intel_perf_query_info = iris_get_perf_query_info; + ctx->get_intel_perf_query_counter_info = iris_get_perf_counter_info; + ctx->new_intel_perf_query_obj = iris_new_perf_query_obj; + ctx->begin_intel_perf_query = iris_begin_perf_query; + ctx->end_intel_perf_query = iris_end_perf_query; + ctx->delete_intel_perf_query = iris_delete_perf_query; + ctx->wait_intel_perf_query = iris_wait_perf_query; + ctx->is_intel_perf_query_ready = iris_is_perf_query_ready; + ctx->get_intel_perf_query_data = iris_get_perf_query_data; +} diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_pipe_control.c mesa-20.0.8/src/gallium/drivers/iris/iris_pipe_control.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_pipe_control.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_pipe_control.c 2020-06-12 01:21:17.000000000 +0000 @@ -151,6 +151,24 @@ batch->screen->workaround_bo, 0, 0); } +/** + * Flush and invalidate all caches (for debugging purposes). + */ +void +iris_flush_all_caches(struct iris_batch *batch) +{ + iris_emit_pipe_control_flush(batch, "debug: flush all caches", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); +} + static void iris_texture_barrier(struct pipe_context *ctx, unsigned flags) { diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_program.c mesa-20.0.8/src/gallium/drivers/iris/iris_program.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -46,12 +46,13 @@ #include "iris_context.h" #include "nir/tgsi_to_nir.h" -#define KEY_INIT_NO_ID(gen) \ +#define KEY_ID(prefix) .prefix.program_string_id = ish->program_id +#define BRW_KEY_INIT(gen, prog_id) \ + .base.program_string_id = prog_id, \ .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \ .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688, \ .base.tex.compressed_multisample_layout_mask = ~0, \ .base.tex.msaa_16 = (gen >= 9 ? ~0 : 0) -#define KEY_INIT(gen) .base.program_string_id = ish->program_id, KEY_INIT_NO_ID(gen) static unsigned get_new_program_id(struct iris_screen *screen) @@ -59,6 +60,83 @@ return p_atomic_inc_return(&screen->program_id); } +static struct brw_vs_prog_key +iris_to_brw_vs_key(const struct gen_device_info *devinfo, + const struct iris_vs_prog_key *key) +{ + return (struct brw_vs_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->vue.base.program_string_id), + + /* Don't tell the backend about our clip plane constants, we've + * already lowered them in NIR and don't want it doing it again. + */ + .nr_userclip_plane_consts = 0, + }; +} + +static struct brw_tcs_prog_key +iris_to_brw_tcs_key(const struct gen_device_info *devinfo, + const struct iris_tcs_prog_key *key) +{ + return (struct brw_tcs_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->vue.base.program_string_id), + .tes_primitive_mode = key->tes_primitive_mode, + .input_vertices = key->input_vertices, + .patch_outputs_written = key->patch_outputs_written, + .outputs_written = key->outputs_written, + .quads_workaround = key->quads_workaround, + }; +} + +static struct brw_tes_prog_key +iris_to_brw_tes_key(const struct gen_device_info *devinfo, + const struct iris_tes_prog_key *key) +{ + return (struct brw_tes_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->vue.base.program_string_id), + .patch_inputs_read = key->patch_inputs_read, + .inputs_read = key->inputs_read, + }; +} + +static struct brw_gs_prog_key +iris_to_brw_gs_key(const struct gen_device_info *devinfo, + const struct iris_gs_prog_key *key) +{ + return (struct brw_gs_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->vue.base.program_string_id), + }; +} + +static struct brw_wm_prog_key +iris_to_brw_fs_key(const struct gen_device_info *devinfo, + const struct iris_fs_prog_key *key) +{ + return (struct brw_wm_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->base.program_string_id), + .nr_color_regions = key->nr_color_regions, + .flat_shade = key->flat_shade, + .alpha_test_replicate_alpha = key->alpha_test_replicate_alpha, + .alpha_to_coverage = key->alpha_to_coverage, + .clamp_fragment_color = key->clamp_fragment_color, + .persample_interp = key->persample_interp, + .multisample_fbo = key->multisample_fbo, + .force_dual_color_blend = key->force_dual_color_blend, + .coherent_fb_fetch = key->coherent_fb_fetch, + .color_outputs_valid = key->color_outputs_valid, + .input_slots_valid = key->input_slots_valid, + }; +} + +static struct brw_cs_prog_key +iris_to_brw_cs_key(const struct gen_device_info *devinfo, + const struct iris_cs_prog_key *key) +{ + return (struct brw_cs_prog_key) { + BRW_KEY_INIT(devinfo->gen, key->base.program_string_id), + }; +} + static void * upload_state(struct u_upload_mgr *uploader, struct iris_state_ref *ref, @@ -99,7 +177,7 @@ : ISL_FORMAT_R32G32B32A32_FLOAT, .swizzle = ISL_SWIZZLE_IDENTITY, .stride_B = 1, - .mocs = ice->vtbl.mocs(res->bo)); + .mocs = iris_mocs(res->bo, &screen->isl_dev)); } static nir_ssa_def * @@ -152,8 +230,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -181,7 +261,44 @@ } } -// XXX: need unify_interfaces() at link time... +/** + * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag. + */ +static bool +iris_fix_edge_flags(nir_shader *nir) +{ + if (nir->info.stage != MESA_SHADER_VERTEX) + return false; + + nir_variable *var = NULL; + nir_foreach_variable(v, &nir->outputs) { + if (v->data.location == VARYING_SLOT_EDGE) { + var = v; + break; + } + } + + if (!var) + return false; + + exec_node_remove(&var->node); + var->data.mode = nir_var_shader_temp; + exec_list_push_tail(&nir->globals, &var->node); + nir->info.outputs_written &= ~VARYING_BIT_EDGE; + nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG; + nir_fixup_deref_modes(nir); + + nir_foreach_function(f, nir) { + if (f->impl) { + nir_metadata_preserve(f->impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } + } + + return true; +} /** * Fix an uncompiled shader's stream output info. @@ -267,13 +384,6 @@ { UNUSED const struct gen_device_info *devinfo = compiler->devinfo; - /* The intel compiler assumes that num_uniforms is in bytes. For - * scalar that means 4 bytes per uniform slot. - * - * Ref: brw_nir_lower_uniforms, type_size_scalar_bytes. - */ - nir->num_uniforms *= 4; - const unsigned IRIS_MAX_SYSTEM_VALUES = PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE; enum brw_param_builtin *system_values = @@ -472,12 +582,12 @@ assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS); nir_validate_shader(nir, "after remap"); - /* We don't use params[], but fs_visitor::nir_setup_uniforms() asserts - * about it for compute shaders, so go ahead and make some fake ones - * which the backend will dead code eliminate. + /* We don't use params[] but gallium leaves num_uniforms set. We use this + * to detect when cbuf0 exists but we don't need it anymore when we get + * here. Instead, zero it out so that the back-end doesn't get confused + * when nr_params * 4 != num_uniforms != nr_params * 4. */ - prog_data->nr_params = nir->num_uniforms / 4; - prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + nir->num_uniforms = 0; /* Constant loads (if any) need to go at the end of the constant buffers so * we need to know num_cbufs before we can lower to them. @@ -689,10 +799,7 @@ */ bt->sizes[IRIS_SURFACE_GROUP_UBO] = num_cbufs + 1; - /* The first IRIS_MAX_ABOs indices in the SSBO group are for atomics, real - * SSBOs start after that. Compaction will remove unused ABOs. - */ - bt->sizes[IRIS_SURFACE_GROUP_SSBO] = IRIS_MAX_ABOS + info->num_ssbos; + bt->sizes[IRIS_SURFACE_GROUP_SSBO] = info->num_ssbos; for (int i = 0; i < IRIS_SURFACE_GROUP_COUNT; i++) assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS); @@ -723,8 +830,10 @@ case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -816,8 +925,10 @@ case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -878,6 +989,7 @@ const struct brw_base_prog_key *key) { struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen; + const struct gen_device_info *devinfo = &screen->devinfo; const struct brw_compiler *c = screen->compiler; if (!info) @@ -888,10 +1000,35 @@ info->name ? info->name : "(no identifier)", info->label ? info->label : ""); - const void *old_key = + const void *old_iris_key = iris_find_previous_compile(ice, info->stage, key->program_string_id); - brw_debug_key_recompile(c, &ice->dbg, info->stage, old_key, key); + union brw_any_prog_key old_key; + + switch (info->stage) { + case MESA_SHADER_VERTEX: + old_key.vs = iris_to_brw_vs_key(devinfo, old_iris_key); + break; + case MESA_SHADER_TESS_CTRL: + old_key.tcs = iris_to_brw_tcs_key(devinfo, old_iris_key); + break; + case MESA_SHADER_TESS_EVAL: + old_key.tes = iris_to_brw_tes_key(devinfo, old_iris_key); + break; + case MESA_SHADER_GEOMETRY: + old_key.gs = iris_to_brw_gs_key(devinfo, old_iris_key); + break; + case MESA_SHADER_FRAGMENT: + old_key.wm = iris_to_brw_fs_key(devinfo, old_iris_key); + break; + case MESA_SHADER_COMPUTE: + old_key.cs = iris_to_brw_cs_key(devinfo, old_iris_key); + break; + default: + unreachable("invalid shader stage"); + } + + brw_debug_key_recompile(c, &ice->dbg, info->stage, &old_key.base, key); } /** @@ -917,7 +1054,7 @@ static struct iris_compiled_shader * iris_compile_vs(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_vs_prog_key *key) + const struct iris_vs_prog_key *key) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; const struct brw_compiler *compiler = screen->compiler; @@ -933,9 +1070,10 @@ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); - if (key->nr_userclip_plane_consts) { + if (key->vue.nr_userclip_plane_consts) { nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true); + nir_lower_clip_vs(nir, (1 << key->vue.nr_userclip_plane_consts) - 1, + true, false, NULL); nir_lower_io_to_temporaries(nir, impl, true, false); nir_lower_global_vars_to_local(nir); nir_lower_vars_to_ssa(nir); @@ -957,15 +1095,11 @@ &vue_prog_data->vue_map, nir->info.outputs_written, nir->info.separate_shader); - /* Don't tell the backend about our clip plane constants, we've already - * lowered them in NIR and we don't want it doing it again. - */ - struct brw_vs_prog_key key_no_ucp = *key; - key_no_ucp.nr_userclip_plane_consts = 0; + struct brw_vs_prog_key brw_key = iris_to_brw_vs_key(devinfo, key); char *error_str = NULL; const unsigned *program = - brw_compile_vs(compiler, &ice->dbg, mem_ctx, &key_no_ucp, vs_prog_data, + brw_compile_vs(compiler, &ice->dbg, mem_ctx, &brw_key, vs_prog_data, nir, -1, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile vertex shader: %s\n", error_str); @@ -974,7 +1108,7 @@ } if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1005,10 +1139,8 @@ struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX]; struct iris_uncompiled_shader *ish = ice->shaders.uncompiled[MESA_SHADER_VERTEX]; - struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_vs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_vs_prog_key key = { KEY_ID(vue.base) }; ice->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); struct iris_compiled_shader *old = ice->shaders.prog[IRIS_CACHE_VS]; @@ -1038,22 +1170,17 @@ const bool needs_sgvs_element = uses_draw_params || vs_prog_data->uses_instanceid || vs_prog_data->uses_vertexid; - bool needs_edge_flag = false; - nir_foreach_variable(var, &ish->nir->inputs) { - if (var->data.location == VERT_ATTRIB_EDGEFLAG) - needs_edge_flag = true; - } if (ice->state.vs_uses_draw_params != uses_draw_params || ice->state.vs_uses_derived_draw_params != uses_derived_draw_params || - ice->state.vs_needs_edge_flag != needs_edge_flag) { + ice->state.vs_needs_edge_flag != ish->needs_edge_flag) { ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | IRIS_DIRTY_VERTEX_ELEMENTS; } ice->state.vs_uses_draw_params = uses_draw_params; ice->state.vs_uses_derived_draw_params = uses_derived_draw_params; ice->state.vs_needs_sgvs_element = needs_sgvs_element; - ice->state.vs_needs_edge_flag = needs_edge_flag; + ice->state.vs_needs_edge_flag = ish->needs_edge_flag; } } @@ -1109,7 +1236,7 @@ static struct iris_compiled_shader * iris_compile_tcs(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_tcs_prog_key *key) + const struct iris_tcs_prog_key *key) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; const struct brw_compiler *compiler = screen->compiler; @@ -1129,6 +1256,8 @@ struct iris_binding_table bt; + struct brw_tcs_prog_key brw_key = iris_to_brw_tcs_key(devinfo, key); + if (ish) { nir = nir_shader_clone(mem_ctx, ish->nir); @@ -1138,7 +1267,8 @@ num_system_values, num_cbufs); brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); } else { - nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key); + nir = + brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, &brw_key); /* Reserve space for passing the default tess levels as constants. */ num_cbufs = 1; @@ -1176,8 +1306,8 @@ char *error_str = NULL; const unsigned *program = - brw_compile_tcs(compiler, &ice->dbg, mem_ctx, key, tcs_prog_data, nir, - -1, NULL, &error_str); + brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &brw_key, tcs_prog_data, + nir, -1, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile control shader: %s\n", error_str); ralloc_free(mem_ctx); @@ -1186,7 +1316,7 @@ if (ish) { if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1221,9 +1351,8 @@ const struct shader_info *tes_info = iris_get_shader_info(ice, MESA_SHADER_TESS_EVAL); - struct brw_tcs_prog_key key = { - KEY_INIT_NO_ID(devinfo->gen), - .base.program_string_id = tcs ? tcs->program_id : 0, + struct iris_tcs_prog_key key = { + .vue.base.program_string_id = tcs ? tcs->program_id : 0, .tes_primitive_mode = tes_info->tess.primitive_mode, .input_vertices = !tcs || compiler->use_tcs_8_patch ? ice->state.vertices_per_patch : 0, @@ -1260,7 +1389,7 @@ static struct iris_compiled_shader * iris_compile_tes(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_tes_prog_key *key) + const struct iris_tes_prog_key *key) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; const struct brw_compiler *compiler = screen->compiler; @@ -1276,9 +1405,10 @@ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); - if (key->nr_userclip_plane_consts) { + if (key->vue.nr_userclip_plane_consts) { nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true); + nir_lower_clip_vs(nir, (1 << key->vue.nr_userclip_plane_consts) - 1, + true, false, NULL); nir_lower_io_to_temporaries(nir, impl, true, false); nir_lower_global_vars_to_local(nir); nir_lower_vars_to_ssa(nir); @@ -1298,10 +1428,12 @@ brw_compute_tess_vue_map(&input_vue_map, key->inputs_read, key->patch_inputs_read); + struct brw_tes_prog_key brw_key = iris_to_brw_tes_key(devinfo, key); + char *error_str = NULL; const unsigned *program = - brw_compile_tes(compiler, &ice->dbg, mem_ctx, key, &input_vue_map, - tes_prog_data, nir, NULL, -1, NULL, &error_str); + brw_compile_tes(compiler, &ice->dbg, mem_ctx, &brw_key, &input_vue_map, + tes_prog_data, nir, -1, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile evaluation shader: %s\n", error_str); ralloc_free(mem_ctx); @@ -1309,7 +1441,7 @@ } if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1341,10 +1473,8 @@ struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL]; struct iris_uncompiled_shader *ish = ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]; - struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_tes_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_tes_prog_key key = { KEY_ID(vue.base) }; get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read); ice->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key); @@ -1380,7 +1510,7 @@ static struct iris_compiled_shader * iris_compile_gs(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_gs_prog_key *key) + const struct iris_gs_prog_key *key) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; const struct brw_compiler *compiler = screen->compiler; @@ -1396,9 +1526,10 @@ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); - if (key->nr_userclip_plane_consts) { + if (key->vue.nr_userclip_plane_consts) { nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1); + nir_lower_clip_gs(nir, (1 << key->vue.nr_userclip_plane_consts) - 1, + false, NULL); nir_lower_io_to_temporaries(nir, impl, true, false); nir_lower_global_vars_to_local(nir); nir_lower_vars_to_ssa(nir); @@ -1418,10 +1549,12 @@ &vue_prog_data->vue_map, nir->info.outputs_written, nir->info.separate_shader); + struct brw_gs_prog_key brw_key = iris_to_brw_gs_key(devinfo, key); + char *error_str = NULL; const unsigned *program = - brw_compile_gs(compiler, &ice->dbg, mem_ctx, key, gs_prog_data, nir, - NULL, -1, NULL, &error_str); + brw_compile_gs(compiler, &ice->dbg, mem_ctx, &brw_key, gs_prog_data, + nir, NULL, -1, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile geometry shader: %s\n", error_str); ralloc_free(mem_ctx); @@ -1429,7 +1562,7 @@ } if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1464,9 +1597,7 @@ struct iris_compiled_shader *shader = NULL; if (ish) { - struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_gs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_gs_prog_key key = { KEY_ID(vue.base) }; ice->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); shader = @@ -1494,7 +1625,7 @@ static struct iris_compiled_shader * iris_compile_fs(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_wm_prog_key *key, + const struct iris_fs_prog_key *key, struct brw_vue_map *vue_map) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; @@ -1522,16 +1653,25 @@ */ brw_nir_lower_fs_outputs(nir); + /* On Gen11+, shader RT write messages have a "Null Render Target" bit + * and do not need a binding table entry with a null surface. Earlier + * generations need an entry for a null surface. + */ + int null_rts = devinfo->gen < 11 ? 1 : 0; + struct iris_binding_table bt; - iris_setup_binding_table(devinfo, nir, &bt, MAX2(key->nr_color_regions, 1), + iris_setup_binding_table(devinfo, nir, &bt, + MAX2(key->nr_color_regions, null_rts), num_system_values, num_cbufs); brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + struct brw_wm_prog_key brw_key = iris_to_brw_fs_key(devinfo, key); + char *error_str = NULL; const unsigned *program = - brw_compile_fs(compiler, &ice->dbg, mem_ctx, key, fs_prog_data, - nir, NULL, -1, -1, -1, true, false, vue_map, + brw_compile_fs(compiler, &ice->dbg, mem_ctx, &brw_key, fs_prog_data, + nir, -1, -1, -1, true, false, vue_map, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile fragment shader: %s\n", error_str); @@ -1540,7 +1680,7 @@ } if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1567,9 +1707,7 @@ struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT]; struct iris_uncompiled_shader *ish = ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; - struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_wm_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_fs_prog_key key = { KEY_ID(base) }; ice->vtbl.populate_fs_key(ice, &ish->nir->info, &key); if (ish->nos & (1ull << IRIS_NOS_LAST_VUE_MAP)) @@ -1633,6 +1771,35 @@ ice->shaders.last_vue_map = &vue_prog_data->vue_map; } +static void +iris_update_pull_constant_descriptors(struct iris_context *ice, + gl_shader_stage stage) +{ + struct iris_compiled_shader *shader = ice->shaders.prog[stage]; + + if (!shader || !shader->prog_data->has_ubo_pull) + return; + + struct iris_shader_state *shs = &ice->state.shaders[stage]; + bool any_new_descriptors = + shader->num_system_values > 0 && shs->sysvals_need_upload; + + unsigned bound_cbufs = shs->bound_cbufs; + + while (bound_cbufs) { + const int i = u_bit_scan(&bound_cbufs); + struct pipe_shader_buffer *cbuf = &shs->constbuf[i]; + struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i]; + if (!surf_state->res && cbuf->buffer) { + iris_upload_ubo_ssbo_surf_state(ice, cbuf, surf_state, false); + any_new_descriptors = true; + } + } + + if (any_new_descriptors) + ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << stage; +} + /** * Get the prog_data for a given stage, or NULL if the stage is disabled. */ @@ -1747,12 +1914,17 @@ } } } + + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) { + if (ice->state.dirty & (IRIS_DIRTY_CONSTANTS_VS << i)) + iris_update_pull_constant_descriptors(ice, i); + } } static struct iris_compiled_shader * iris_compile_cs(struct iris_context *ice, struct iris_uncompiled_shader *ish, - const struct brw_cs_prog_key *key) + const struct iris_cs_prog_key *key) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; const struct brw_compiler *compiler = screen->compiler; @@ -1774,9 +1946,11 @@ iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, num_system_values, num_cbufs); + struct brw_cs_prog_key brw_key = iris_to_brw_cs_key(devinfo, key); + char *error_str = NULL; const unsigned *program = - brw_compile_cs(compiler, &ice->dbg, mem_ctx, key, cs_prog_data, + brw_compile_cs(compiler, &ice->dbg, mem_ctx, &brw_key, cs_prog_data, nir, -1, NULL, &error_str); if (program == NULL) { dbg_printf("Failed to compile compute shader: %s\n", error_str); @@ -1785,7 +1959,7 @@ } if (ish->compiled_once) { - iris_debug_recompile(ice, &nir->info, &key->base); + iris_debug_recompile(ice, &nir->info, &brw_key.base); } else { ish->compiled_once = true; } @@ -1801,16 +1975,14 @@ return shader; } -void -iris_update_compiled_compute_shader(struct iris_context *ice) +static void +iris_update_compiled_cs(struct iris_context *ice) { struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE]; struct iris_uncompiled_shader *ish = ice->shaders.uncompiled[MESA_SHADER_COMPUTE]; - struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_cs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_cs_prog_key key = { KEY_ID(base) }; ice->vtbl.populate_cs_key(ice, &key); struct iris_compiled_shader *old = ice->shaders.prog[IRIS_CACHE_CS]; @@ -1833,6 +2005,16 @@ } void +iris_update_compiled_compute_shader(struct iris_context *ice) +{ + if (ice->state.dirty & IRIS_DIRTY_UNCOMPILED_CS) + iris_update_compiled_cs(ice); + + if (ice->state.dirty & IRIS_DIRTY_CONSTANTS_CS) + iris_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE); +} + +void iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, uint32_t *dst) { @@ -1871,16 +2053,26 @@ * as well. This is not currently documented at all. * * This hack is no longer necessary on Gen11+. + * + * For, Gen11+, scratch space allocation is based on the number of threads + * in the base configuration. */ unsigned subslice_total = screen->subslice_total; - if (devinfo->gen < 11) + if (devinfo->gen >= 12) + subslice_total = devinfo->num_subslices[0]; + else if (devinfo->gen == 11) + subslice_total = 8; + else if (devinfo->gen < 11) subslice_total = 4 * devinfo->num_slices; assert(subslice_total >= screen->subslice_total); if (!*bop) { unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; - if (devinfo->gen >= 11) { + if (devinfo->gen >= 12) { + /* Same as ICL below, but with 16 EUs. */ + scratch_ids_per_subslice = 16 * 8; + } else if (devinfo->gen == 11) { /* The MEDIA_VFE_STATE docs say: * * "Starting with this configuration, the Maximum Number of @@ -1935,6 +2127,8 @@ if (!ish) return NULL; + NIR_PASS(ish->needs_edge_flag, nir, iris_fix_edge_flags); + brw_preprocess_nir(screen->compiler, nir, NULL); NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo); @@ -1968,22 +2162,15 @@ if (screen->disk_cache) { /* Serialize the NIR to a binary blob that we can hash for the disk - * cache. First, drop unnecessary information (like variable names) + * cache. Drop unnecessary information (like variable names) * so the serialized NIR is smaller, and also to let us detect more - * isomorphic shaders when hashing, increasing cache hits. We clone - * the NIR before stripping away this info because it can be useful - * when inspecting and debugging shaders. + * isomorphic shaders when hashing, increasing cache hits. */ - nir_shader *clone = nir_shader_clone(NULL, nir); - nir_strip(clone); - struct blob blob; blob_init(&blob); - nir_serialize(&blob, clone); + nir_serialize(&blob, nir, true); _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1); blob_finish(&blob); - - ralloc_free(clone); } return ish; @@ -2016,8 +2203,7 @@ ish->nos |= (1ull << IRIS_NOS_RASTERIZER); if (screen->precompile) { - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_vs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_vs_prog_key key = { KEY_ID(vue.base) }; if (!iris_disk_cache_retrieve(ice, ish, &key, sizeof(key))) iris_compile_vs(ice, ish, &key); @@ -2038,9 +2224,8 @@ if (screen->precompile) { const unsigned _GL_TRIANGLES = 0x0004; - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_tcs_prog_key key = { - KEY_INIT(devinfo->gen), + struct iris_tcs_prog_key key = { + KEY_ID(vue.base), // XXX: make sure the linker fills this out from the TES... .tes_primitive_mode = info->tess.primitive_mode ? info->tess.primitive_mode @@ -2078,9 +2263,8 @@ ish->nos |= (1ull << IRIS_NOS_RASTERIZER); if (screen->precompile) { - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_tes_prog_key key = { - KEY_INIT(devinfo->gen), + struct iris_tes_prog_key key = { + KEY_ID(vue.base), // XXX: not ideal, need TCS output/TES input unification .inputs_read = info->inputs_read, .patch_inputs_read = info->patch_inputs_read, @@ -2106,8 +2290,7 @@ ish->nos |= (1ull << IRIS_NOS_RASTERIZER); if (screen->precompile) { - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_gs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_gs_prog_key key = { KEY_ID(vue.base) }; if (!iris_disk_cache_retrieve(ice, ish, &key, sizeof(key))) iris_compile_gs(ice, ish, &key); @@ -2146,8 +2329,8 @@ util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK) <= 16; const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_wm_prog_key key = { - KEY_INIT(devinfo->gen), + struct iris_fs_prog_key key = { + KEY_ID(base), .nr_color_regions = util_bitcount(color_outputs), .coherent_fb_fetch = devinfo->gen >= 9, .input_slots_valid = @@ -2175,8 +2358,7 @@ // XXX: disallow more than 64KB of shared variables if (screen->precompile) { - const struct gen_device_info *devinfo = &screen->devinfo; - struct brw_cs_prog_key key = { KEY_INIT(devinfo->gen) }; + struct iris_cs_prog_key key = { KEY_ID(base) }; if (!iris_disk_cache_retrieve(ice, ish, &key, sizeof(key))) iris_compile_cs(ice, ish, &key); @@ -2336,6 +2518,8 @@ iris_bind_fs_state(struct pipe_context *ctx, void *state) { struct iris_context *ice = (struct iris_context *) ctx; + struct iris_screen *screen = (struct iris_screen *) ctx->screen; + const struct gen_device_info *devinfo = &screen->devinfo; struct iris_uncompiled_shader *old_ish = ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; struct iris_uncompiled_shader *new_ish = state; @@ -2350,6 +2534,9 @@ (new_ish->nir->info.outputs_written & color_bits)) ice->state.dirty |= IRIS_DIRTY_PS_BLEND; + if (devinfo->gen == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; + bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT); } diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_program_cache.c mesa-20.0.8/src/gallium/drivers/iris/iris_program_cache.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_program_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_program_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -89,8 +89,7 @@ uint32_t key_size, const void *key) { - struct keybox *keybox = - make_keybox(ice->shaders.cache, cache_id, key, key_size); + struct keybox *keybox = make_keybox(NULL, cache_id, key, key_size); struct hash_entry *entry = _mesa_hash_table_search(ice->shaders.cache, keybox); @@ -191,7 +190,7 @@ /* Store the 3DSTATE shader packets and other derived state. */ ice->vtbl.store_derived_program_state(ice, cache_id, shader); - struct keybox *keybox = make_keybox(cache, cache_id, key, key_size); + struct keybox *keybox = make_keybox(shader, cache_id, key, key_size); _mesa_hash_table_insert(ice->shaders.cache, keybox, shader); return shader; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_query.c mesa-20.0.8/src/gallium/drivers/iris/iris_query.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,6 @@ #include #include -#include "perf/gen_perf.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "pipe/p_context.h" @@ -69,6 +68,9 @@ int batch_idx; struct iris_monitor_object *monitor; + + /* Fence for PIPE_QUERY_GPU_FINISHED. */ + struct pipe_fence_handle *fence; }; struct iris_query_snapshots { @@ -480,6 +482,7 @@ query->monitor = NULL; } else { iris_syncpt_reference(screen, &query->syncpt, NULL); + screen->base.fence_reference(ctx->screen, &query->fence, NULL); } free(query); } @@ -543,6 +546,11 @@ if (q->monitor) return iris_end_monitor(ctx, q->monitor); + if (q->type == PIPE_QUERY_GPU_FINISHED) { + ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED); + return true; + } + struct iris_batch *batch = &ice->batches[q->batch_idx]; if (q->type == PIPE_QUERY_TIMESTAMP) { @@ -606,6 +614,14 @@ return true; } + if (q->type == PIPE_QUERY_GPU_FINISHED) { + struct pipe_screen *screen = ctx->screen; + + result->b = screen->fence_finish(screen, ctx, q->fence, + wait ? PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + if (!q->ready) { struct iris_batch *batch = &ice->batches[q->batch_idx]; if (q->syncpt == iris_batch_get_signal_syncpt(batch)) diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_resolve.c mesa-20.0.8/src/gallium/drivers/iris/iris_resolve.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_resolve.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_resolve.c 2020-06-12 01:21:17.000000000 +0000 @@ -90,8 +90,6 @@ { uint32_t views = info ? (shs->bound_sampler_views & info->textures_used) : 0; - unsigned astc5x5_wa_bits = 0; // XXX: actual tracking - while (views) { const int i = u_bit_scan(&views); struct iris_sampler_view *isv = shs->textures[i]; @@ -107,8 +105,7 @@ iris_resource_prepare_texture(ice, batch, res, isv->view.format, isv->view.base_level, isv->view.levels, isv->view.base_array_layer, - isv->view.array_len, - astc5x5_wa_bits); + isv->view.array_len); } iris_cache_flush_for_read(batch, res->bo); @@ -127,15 +124,24 @@ while (views) { const int i = u_bit_scan(&views); - struct iris_resource *res = (void *) shs->image[i].base.resource; + struct pipe_image_view *pview = &shs->image[i].base; + struct iris_resource *res = (void *) pview->resource; if (res->base.target != PIPE_BUFFER) { if (consider_framebuffer) { disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, - res, 0, ~0, "as a shader image"); + res, pview->u.tex.level, 1, + "as a shader image"); } - iris_resource_prepare_image(ice, batch, res); + unsigned num_layers = + pview->u.tex.last_layer - pview->u.tex.first_layer + 1; + + /* The data port doesn't understand any compression */ + iris_resource_prepare_access(ice, batch, res, + pview->u.tex.level, 1, + pview->u.tex.first_layer, num_layers, + ISL_AUX_USAGE_NONE, false); } iris_cache_flush_for_read(batch, res->bo); @@ -168,8 +174,6 @@ resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled, consider_framebuffer); } - - // XXX: ASTC hacks } void @@ -216,8 +220,7 @@ iris_resource_prepare_texture(ice, batch, res, surf->view.format, surf->view.base_level, 1, surf->view.base_array_layer, - surf->view.array_len, - 0); + surf->view.array_len); } } } @@ -295,10 +298,10 @@ } if (s_res) { - if (may_have_resolved_depth) { + if (may_have_resolved_depth && ice->state.stencil_writes_enabled) { iris_resource_finish_write(ice, s_res, zs_surf->u.tex.level, zs_surf->u.tex.first_layer, num_layers, - ISL_AUX_USAGE_NONE); + s_res->aux.usage); } if (ice->state.stencil_writes_enabled) @@ -471,8 +474,8 @@ //DBG("%s to mt %p level %u layer %u\n", __FUNCTION__, mt, level, layer); struct blorp_surf surf; - iris_blorp_surf_for_resource(&ice->vtbl, &surf, &res->base, res->aux.usage, - level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); iris_batch_maybe_flush(batch, 1500); @@ -493,9 +496,17 @@ struct blorp_batch blorp_batch; blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); - blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1, - isl_format_srgb_to_linear(res->surf.format), - resolve_op); + /* On Gen >= 12, Stencil buffer with lossless compression needs to be + * resolve with WM_HZ_OP packet. + */ + if (isl_surf_usage_is_stencil(res->surf.usage)) { + blorp_hiz_stencil_op(&blorp_batch, &surf, level, layer, + 1, resolve_op); + } else { + blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1, + isl_format_srgb_to_linear(res->surf.format), + resolve_op); + } blorp_batch_finish(&blorp_batch); /* See comment above */ @@ -513,11 +524,11 @@ //DBG("%s to mt %p layers %u-%u\n", __FUNCTION__, mt, //start_layer, start_layer + num_layers - 1); - assert(res->aux.usage == ISL_AUX_USAGE_MCS); + assert(isl_aux_usage_has_mcs(res->aux.usage)); struct blorp_surf surf; - iris_blorp_surf_for_resource(&ice->vtbl, &surf, &res->base, res->aux.usage, - 0, true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, 0, true); struct blorp_batch blorp_batch; blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); @@ -548,15 +559,23 @@ return isl_formats_are_ccs_e_compatible(devinfo, isl_format, access_format); } -static bool -sample_with_hiz(const struct gen_device_info *devinfo, - const struct iris_resource *res) +bool +iris_sample_with_depth_aux(const struct gen_device_info *devinfo, + const struct iris_resource *res) { - if (!devinfo->has_sample_with_hiz) + switch (res->aux.usage) { + case ISL_AUX_USAGE_HIZ: + if (devinfo->has_sample_with_hiz) + break; return false; - - if (res->aux.usage != ISL_AUX_USAGE_HIZ) + case ISL_AUX_USAGE_HIZ_CCS: + /* Write through mode must have been enabled for prior writes. */ + if (isl_surf_supports_hiz_ccs_wt(devinfo, &res->surf, res->aux.usage)) + break; + return false; + default: return false; + } /* It seems the hardware won't fallback to the depth buffer if some of the * mipmap levels aren't available in the HiZ buffer. So we need all levels @@ -654,13 +673,13 @@ iris_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)", PIPE_CONTROL_DEPTH_STALL); - assert(res->aux.usage == ISL_AUX_USAGE_HIZ && res->aux.bo); + assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo); iris_batch_maybe_flush(batch, 1500); struct blorp_surf surf; - iris_blorp_surf_for_resource(&ice->vtbl, &surf, &res->base, - ISL_AUX_USAGE_HIZ, level, true); + iris_blorp_surf_for_resource(&ice->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); struct blorp_batch blorp_batch; enum blorp_batch_flags flags = 0; @@ -972,7 +991,7 @@ enum isl_aux_usage aux_usage, bool fast_clear_supported) { - assert(aux_usage == ISL_AUX_USAGE_MCS); + assert(isl_aux_usage_has_mcs(aux_usage)); switch (iris_resource_get_aux_state(res, 0, layer)) { case ISL_AUX_STATE_CLEAR: @@ -1001,7 +1020,7 @@ uint32_t layer, enum isl_aux_usage aux_usage) { - assert(aux_usage == ISL_AUX_USAGE_MCS); + assert(isl_aux_usage_has_mcs(aux_usage)); switch (iris_resource_get_aux_state(res, 0, layer)) { case ISL_AUX_STATE_CLEAR: @@ -1029,18 +1048,21 @@ enum isl_aux_usage aux_usage, bool fast_clear_supported) { - assert(aux_usage == ISL_AUX_USAGE_NONE || aux_usage == ISL_AUX_USAGE_HIZ); + assert(aux_usage == ISL_AUX_USAGE_NONE || + aux_usage == ISL_AUX_USAGE_HIZ || + aux_usage == ISL_AUX_USAGE_HIZ_CCS || + aux_usage == ISL_AUX_USAGE_CCS_E); enum isl_aux_op hiz_op = ISL_AUX_OP_NONE; switch (iris_resource_get_aux_state(res, level, layer)) { case ISL_AUX_STATE_CLEAR: case ISL_AUX_STATE_COMPRESSED_CLEAR: - if (aux_usage != ISL_AUX_USAGE_HIZ || !fast_clear_supported) + if (aux_usage == ISL_AUX_USAGE_NONE || !fast_clear_supported) hiz_op = ISL_AUX_OP_FULL_RESOLVE; break; case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: - if (aux_usage != ISL_AUX_USAGE_HIZ) + if (aux_usage == ISL_AUX_USAGE_NONE) hiz_op = ISL_AUX_OP_FULL_RESOLVE; break; @@ -1049,7 +1071,7 @@ break; case ISL_AUX_STATE_AUX_INVALID: - if (aux_usage == ISL_AUX_USAGE_HIZ) + if (aux_usage != ISL_AUX_USAGE_NONE) hiz_op = ISL_AUX_OP_AMBIGUATE; break; @@ -1084,22 +1106,23 @@ uint32_t level, uint32_t layer, enum isl_aux_usage aux_usage) { - assert(aux_usage == ISL_AUX_USAGE_NONE || aux_usage == ISL_AUX_USAGE_HIZ); + assert(aux_usage == ISL_AUX_USAGE_NONE || + isl_aux_usage_has_hiz(aux_usage)); switch (iris_resource_get_aux_state(res, level, layer)) { case ISL_AUX_STATE_CLEAR: - assert(aux_usage == ISL_AUX_USAGE_HIZ); + assert(isl_aux_usage_has_hiz(aux_usage)); iris_resource_set_aux_state(ice, res, level, layer, 1, ISL_AUX_STATE_COMPRESSED_CLEAR); break; case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: case ISL_AUX_STATE_COMPRESSED_CLEAR: - assert(aux_usage == ISL_AUX_USAGE_HIZ); + assert(isl_aux_usage_has_hiz(aux_usage)); break; /* Nothing to do */ case ISL_AUX_STATE_RESOLVED: - if (aux_usage == ISL_AUX_USAGE_HIZ) { + if (isl_aux_usage_has_hiz(aux_usage)) { iris_resource_set_aux_state(ice, res, level, layer, 1, ISL_AUX_STATE_COMPRESSED_NO_CLEAR); } else { @@ -1109,14 +1132,14 @@ break; case ISL_AUX_STATE_PASS_THROUGH: - if (aux_usage == ISL_AUX_USAGE_HIZ) { + if (isl_aux_usage_has_hiz(aux_usage)) { iris_resource_set_aux_state(ice, res, level, layer, 1, ISL_AUX_STATE_COMPRESSED_NO_CLEAR); } break; case ISL_AUX_STATE_AUX_INVALID: - assert(aux_usage != ISL_AUX_USAGE_HIZ); + assert(!isl_aux_usage_has_hiz(aux_usage)); break; case ISL_AUX_STATE_PARTIAL_CLEAR: @@ -1141,6 +1164,7 @@ break; case ISL_AUX_USAGE_MCS: + case ISL_AUX_USAGE_MCS_CCS: assert(start_level == 0 && num_levels == 1); const uint32_t level_layers = miptree_layer_range_length(res, 0, start_layer, num_layers); @@ -1165,6 +1189,7 @@ break; case ISL_AUX_USAGE_HIZ: + case ISL_AUX_USAGE_HIZ_CCS: for (uint32_t l = 0; l < num_levels; l++) { const uint32_t level = start_level + l; if (!iris_resource_level_has_hiz(res, level)) @@ -1198,6 +1223,7 @@ break; case ISL_AUX_USAGE_MCS: + case ISL_AUX_USAGE_MCS_CCS: for (uint32_t a = 0; a < num_layers; a++) { iris_resource_finish_mcs_write(ice, res, start_layer + a, aux_usage); @@ -1213,6 +1239,7 @@ break; case ISL_AUX_USAGE_HIZ: + case ISL_AUX_USAGE_HIZ_CCS: if (!iris_resource_level_has_hiz(res, level)) return; @@ -1235,8 +1262,6 @@ if (res->surf.usage & ISL_SURF_USAGE_DEPTH_BIT) { assert(iris_resource_level_has_hiz(res, level)); - } else if (res->surf.usage & ISL_SURF_USAGE_STENCIL_BIT) { - unreachable("Cannot get aux state for stencil"); } else { assert(res->surf.samples == 1 || res->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY); @@ -1255,8 +1280,6 @@ if (res->surf.usage & ISL_SURF_USAGE_DEPTH_BIT) { assert(iris_resource_level_has_hiz(res, level)); - } else if (res->surf.usage & ISL_SURF_USAGE_STENCIL_BIT) { - unreachable("Cannot set aux state for stencil"); } else { assert(res->surf.samples == 1 || res->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY); @@ -1309,30 +1332,25 @@ enum isl_aux_usage iris_resource_texture_aux_usage(struct iris_context *ice, const struct iris_resource *res, - enum isl_format view_format, - enum gen9_astc5x5_wa_tex_type astc5x5_wa_bits) + enum isl_format view_format) { struct iris_screen *screen = (void *) ice->ctx.screen; struct gen_device_info *devinfo = &screen->devinfo; - assert(devinfo->gen == 9 || astc5x5_wa_bits == 0); - - /* On gen9, ASTC 5x5 textures cannot live in the sampler cache along side - * CCS or HiZ compressed textures. See gen9_apply_astc5x5_wa_flush() for - * details. - */ - if ((astc5x5_wa_bits & GEN9_ASTC5X5_WA_TEX_TYPE_ASTC5x5) && - res->aux.usage != ISL_AUX_USAGE_MCS) - return ISL_AUX_USAGE_NONE; - switch (res->aux.usage) { case ISL_AUX_USAGE_HIZ: - if (sample_with_hiz(devinfo, res)) + if (iris_sample_with_depth_aux(devinfo, res)) return ISL_AUX_USAGE_HIZ; break; + case ISL_AUX_USAGE_HIZ_CCS: + if (iris_sample_with_depth_aux(devinfo, res)) + return ISL_AUX_USAGE_CCS_E; + break; + case ISL_AUX_USAGE_MCS: - return ISL_AUX_USAGE_MCS; + case ISL_AUX_USAGE_MCS_CCS: + return res->aux.usage; case ISL_AUX_USAGE_CCS_D: case ISL_AUX_USAGE_CCS_E: @@ -1378,11 +1396,10 @@ struct iris_resource *res, enum isl_format view_format, uint32_t start_level, uint32_t num_levels, - uint32_t start_layer, uint32_t num_layers, - enum gen9_astc5x5_wa_tex_type astc5x5_wa_bits) + uint32_t start_layer, uint32_t num_layers) { enum isl_aux_usage aux_usage = - iris_resource_texture_aux_usage(ice, res, view_format, astc5x5_wa_bits); + iris_resource_texture_aux_usage(ice, res, view_format); bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE; @@ -1398,17 +1415,6 @@ aux_usage, clear_supported); } -void -iris_resource_prepare_image(struct iris_context *ice, - struct iris_batch *batch, - struct iris_resource *res) -{ - /* The data port doesn't understand any compression */ - iris_resource_prepare_access(ice, batch, res, 0, INTEL_REMAINING_LEVELS, - 0, INTEL_REMAINING_LAYERS, - ISL_AUX_USAGE_NONE, false); -} - enum isl_aux_usage iris_resource_render_aux_usage(struct iris_context *ice, struct iris_resource *res, @@ -1424,7 +1430,8 @@ switch (res->aux.usage) { case ISL_AUX_USAGE_MCS: - return ISL_AUX_USAGE_MCS; + case ISL_AUX_USAGE_MCS_CCS: + return res->aux.usage; case ISL_AUX_USAGE_CCS_D: case ISL_AUX_USAGE_CCS_E: @@ -1441,8 +1448,9 @@ format_ccs_e_compat_with_resource(devinfo, res, render_format)) return ISL_AUX_USAGE_CCS_E; - /* Otherwise, we have to fall back to CCS_D */ - return ISL_AUX_USAGE_CCS_D; + /* Otherwise, we try to fall back to CCS_D */ + if (isl_format_supports_ccs_d(devinfo, render_format)) + return ISL_AUX_USAGE_CCS_D; default: return ISL_AUX_USAGE_NONE; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_resource.c mesa-20.0.8/src/gallium/drivers/iris/iris_resource.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ #include "util/os_memory.h" #include "util/u_cpu_detect.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_threaded_context.h" #include "util/u_transfer.h" #include "util/u_transfer_helper.h" @@ -47,6 +47,7 @@ #include "iris_context.h" #include "iris_resource.h" #include "iris_screen.h" +#include "intel/common/gen_aux_map.h" #include "intel/dev/gen_debug.h" #include "isl/isl.h" #include "drm-uapi/drm_fourcc.h" @@ -84,10 +85,11 @@ enum isl_format linear_format = isl_format_srgb_to_linear(rt_format); - if (!isl_format_supports_ccs_e(devinfo, linear_format)) + if (linear_format == ISL_FORMAT_UNSUPPORTED || + !isl_format_supports_ccs_e(devinfo, linear_format)) return false; - return true; + return devinfo->gen >= 9 && devinfo->gen <= 11; } case I915_FORMAT_MOD_Y_TILED: case I915_FORMAT_MOD_X_TILED: @@ -295,6 +297,7 @@ res->aux.has_hiz = 0; res->aux.surf.size_B = 0; res->aux.bo = NULL; + res->aux.extra_aux.surf.size_B = 0; res->aux.clear_color_bo = NULL; res->aux.state = NULL; } @@ -311,6 +314,8 @@ iris_resource_disable_aux(res); iris_bo_unreference(res->bo); + iris_pscreen_unref(res->base.screen); + free(res); } @@ -323,7 +328,7 @@ return NULL; res->base = *templ; - res->base.screen = pscreen; + res->base.screen = iris_pscreen_ref(pscreen); pipe_reference_init(&res->base.reference, 1); res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE; @@ -347,6 +352,8 @@ static enum isl_aux_state ** create_aux_state_map(struct iris_resource *res, enum isl_aux_state initial) { + assert(res->aux.state == NULL); + uint32_t total_slices = 0; for (uint32_t level = 0; level < res->surf.levels; level++) total_slices += iris_get_num_logical_layers(res, level); @@ -385,10 +392,50 @@ return devinfo->gen >= 10 ? screen->isl_dev.ss.clear_color_state_size : 0; } +static void +map_aux_addresses(struct iris_screen *screen, struct iris_resource *res) +{ + const struct gen_device_info *devinfo = &screen->devinfo; + if (devinfo->gen >= 12 && isl_aux_usage_has_ccs(res->aux.usage)) { + void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr); + assert(aux_map_ctx); + const unsigned aux_offset = res->aux.extra_aux.surf.size_B > 0 ? + res->aux.extra_aux.offset : res->aux.offset; + gen_aux_map_add_image(aux_map_ctx, &res->surf, res->bo->gtt_offset, + res->aux.bo->gtt_offset + aux_offset); + res->bo->aux_map_address = res->aux.bo->gtt_offset; + } +} + +static bool +want_ccs_e_for_format(const struct gen_device_info *devinfo, + enum isl_format format) +{ + if (!isl_format_supports_ccs_e(devinfo, format)) + return false; + + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + + /* CCS_E seems to significantly hurt performance with 32-bit floating + * point formats. For example, Paraview's "Wavelet Volume" case uses + * both R32_FLOAT and R32G32B32A32_FLOAT, and enabling CCS_E for those + * formats causes a 62% FPS drop. + * + * However, many benchmarks seem to use 16-bit float with no issues. + */ + if (fmtl->channels.r.bits == 32 && fmtl->channels.r.type == ISL_SFLOAT) + return false; + + return true; +} + /** * Configure aux for the resource, but don't allocate it. For images which * might be shared with modifiers, we must allocate the image and aux data in * a single bo. + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). */ static bool iris_resource_configure_aux(struct iris_screen *screen, @@ -396,24 +443,86 @@ uint64_t *aux_size_B, uint32_t *alloc_flags) { - struct isl_device *isl_dev = &screen->isl_dev; - enum isl_aux_state initial_state; - UNUSED bool ok = false; + const struct gen_device_info *devinfo = &screen->devinfo; + /* Try to create the auxiliary surfaces allowed by the modifier or by + * the user if no modifier is specified. + */ + assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE || + res->mod_info->aux_usage == ISL_AUX_USAGE_CCS_E); + + const bool has_mcs = !res->mod_info && + isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_hiz = !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_HIZ) && + isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_ccs = + ((!res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) || + (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) && + isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf, + &res->aux.extra_aux.surf, 0); + + /* Having both HIZ and MCS is impossible. */ + assert(!has_mcs || !has_hiz); + + /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. */ + if (has_ccs && (has_mcs || has_hiz)) { + assert(res->aux.extra_aux.surf.size_B > 0 && + res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT); + assert(res->aux.surf.size_B > 0 && + res->aux.surf.usage & + (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT)); + } + + if (res->mod_info && has_ccs) { + /* Only allow a CCS modifier if the aux was created successfully. */ + res->aux.possible_usages |= 1 << res->mod_info->aux_usage; + } else if (has_mcs) { + res->aux.possible_usages |= + 1 << (has_ccs ? ISL_AUX_USAGE_MCS_CCS : ISL_AUX_USAGE_MCS); + } else if (has_hiz) { + res->aux.possible_usages |= + 1 << (has_ccs ? ISL_AUX_USAGE_HIZ_CCS : ISL_AUX_USAGE_HIZ); + } else if (has_ccs) { + if (want_ccs_e_for_format(devinfo, res->surf.format)) + res->aux.possible_usages |= 1 << ISL_AUX_USAGE_CCS_E; + + if (isl_format_supports_ccs_d(devinfo, res->surf.format)) + res->aux.possible_usages |= 1 << ISL_AUX_USAGE_CCS_D; + } + + res->aux.usage = util_last_bit(res->aux.possible_usages) - 1; + + res->aux.sampler_usages = res->aux.possible_usages; + + /* We don't always support sampling with hiz. But when we do, it must be + * single sampled. + */ + if (!devinfo->has_sample_with_hiz || res->surf.samples > 1) + res->aux.sampler_usages &= ~(1 << ISL_AUX_USAGE_HIZ); + + /* We don't always support sampling with HIZ_CCS. But when we do, treat it + * as CCS_E.*/ + res->aux.sampler_usages &= ~(1 << ISL_AUX_USAGE_HIZ_CCS); + if (isl_surf_supports_hiz_ccs_wt(devinfo, &res->surf, res->aux.usage)) + res->aux.sampler_usages |= 1 << ISL_AUX_USAGE_CCS_E; + + enum isl_aux_state initial_state; *aux_size_B = 0; *alloc_flags = 0; assert(!res->aux.bo); switch (res->aux.usage) { case ISL_AUX_USAGE_NONE: - res->aux.surf.size_B = 0; - ok = true; - break; + /* Having no aux buffer is only okay if there's no modifier with aux. */ + return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE; case ISL_AUX_USAGE_HIZ: + case ISL_AUX_USAGE_HIZ_CCS: initial_state = ISL_AUX_STATE_AUX_INVALID; - ok = isl_surf_get_hiz_surf(isl_dev, &res->surf, &res->aux.surf); break; case ISL_AUX_USAGE_MCS: + case ISL_AUX_USAGE_MCS_CCS: /* The Ivybridge PRM, Vol 2 Part 1 p326 says: * * "When MCS buffer is enabled and bound to MSRT, it is required @@ -424,7 +533,6 @@ * 1's, so we simply memset it to 0xff. */ initial_state = ISL_AUX_STATE_CLEAR; - ok = isl_surf_get_mcs_surf(isl_dev, &res->surf, &res->aux.surf); break; case ISL_AUX_USAGE_CCS_D: case ISL_AUX_USAGE_CCS_E: @@ -447,38 +555,46 @@ else initial_state = ISL_AUX_STATE_PASS_THROUGH; *alloc_flags |= BO_ALLOC_ZEROED; - ok = isl_surf_get_ccs_surf(isl_dev, &res->surf, &res->aux.surf, 0); break; + case ISL_AUX_USAGE_MC: + unreachable("Unsupported aux mode"); } - /* We should have a valid aux_surf. */ - if (!ok) + /* Create the aux_state for the auxiliary buffer. */ + res->aux.state = create_aux_state_map(res, initial_state); + if (!res->aux.state) return false; - /* No work is needed for a zero-sized auxiliary buffer. */ - if (res->aux.surf.size_B == 0) - return true; + /* Increase the aux offset if the main and aux surfaces will share a BO. */ + res->aux.offset = + !res->mod_info || res->mod_info->aux_usage == res->aux.usage ? + ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0; + uint64_t size = res->aux.surf.size_B; - if (!res->aux.state) { - /* Create the aux_state for the auxiliary buffer. */ - res->aux.state = create_aux_state_map(res, initial_state); - if (!res->aux.state) - return false; + /* Allocate space in the buffer for storing the CCS. */ + if (res->aux.extra_aux.surf.size_B > 0) { + const uint64_t padded_aux_size = + ALIGN(size, res->aux.extra_aux.surf.alignment_B); + res->aux.extra_aux.offset = res->aux.offset + padded_aux_size; + size = padded_aux_size + res->aux.extra_aux.surf.size_B; } - uint64_t size = res->aux.surf.size_B; - /* Allocate space in the buffer for storing the clear color. On modern * platforms (gen > 9), we can read it directly from such buffer. * * On gen <= 9, we are going to store the clear color on the buffer * anyways, and copy it back to the surface state during state emission. + * + * Also add some padding to make sure the fast clear color state buffer + * starts at a 4K alignment. We believe that 256B might be enough, but due + * to lack of testing we will leave this as 4K for now. */ - res->aux.clear_color_offset = size; + size = ALIGN(size, 4096); + res->aux.clear_color_offset = res->aux.offset + size; size += iris_get_aux_clear_color_state_size(screen); *aux_size_B = size; - if (res->aux.usage == ISL_AUX_USAGE_HIZ) { + if (isl_aux_usage_has_hiz(res->aux.usage)) { for (unsigned level = 0; level < res->surf.levels; ++level) { uint32_t width = u_minify(res->surf.phys_level0_sa.width, level); uint32_t height = u_minify(res->surf.phys_level0_sa.height, level); @@ -496,6 +612,8 @@ /** * Initialize the aux buffer contents. + * + * Returns false on unexpected error (e.g. mapping a BO failed). */ static bool iris_resource_init_aux_buf(struct iris_resource *res, uint32_t alloc_flags, @@ -504,17 +622,30 @@ if (!(alloc_flags & BO_ALLOC_ZEROED)) { void *map = iris_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW); - if (!map) { - iris_resource_disable_aux(res); + if (!map) return false; - } if (iris_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID) { - uint8_t memset_value = res->aux.usage == ISL_AUX_USAGE_MCS ? 0xFF : 0; + uint8_t memset_value = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0; memset((char*)map + res->aux.offset, memset_value, res->aux.surf.size_B); } + /* Bspec section titled : MCS/CCS Buffers for Render Target(s) states: + * - If Software wants to enable Color Compression without Fast clear, + * Software needs to initialize MCS with zeros. + * - Lossless compression and CCS initialized to all F (using HW Fast + * Clear or SW direct Clear) + * + * We think, the first bullet point above is referring to CCS aux + * surface. Since we initialize the MCS in the clear state, we also + * initialize the CCS in the clear state (via SW direct clear) to keep + * the two in sync. + */ + memset((char*)map + res->aux.extra_aux.offset, + isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0, + res->aux.extra_aux.surf.size_B); + /* Zero the indirect clear color to match ::fast_clear_color. */ memset((char *)map + res->aux.clear_color_offset, 0, clear_color_state_size); @@ -532,6 +663,9 @@ /** * Allocate the initial aux surface for a resource based on aux.usage + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). */ static bool iris_resource_alloc_separate_aux(struct iris_screen *screen, @@ -551,7 +685,8 @@ * block sizes. */ res->aux.bo = iris_bo_alloc_tiled(screen->bufmgr, "aux buffer", size, 4096, - IRIS_MEMZONE_OTHER, I915_TILING_Y, + IRIS_MEMZONE_OTHER, + isl_tiling_to_i915_tiling(res->aux.surf.tiling), res->aux.surf.row_pitch_B, alloc_flags); if (!res->aux.bo) { return false; @@ -561,6 +696,8 @@ iris_get_aux_clear_color_state_size(screen))) return false; + map_aux_addresses(screen, res); + return true; } @@ -602,55 +739,6 @@ res->base.next = NULL; } -static bool -supports_mcs(const struct isl_surf *surf) -{ - /* MCS compression only applies to multisampled resources. */ - if (surf->samples <= 1) - return false; - - /* Depth and stencil buffers use the IMS (interleaved) layout. */ - if (isl_surf_usage_is_depth_or_stencil(surf->usage)) - return false; - - return true; -} - -static bool -supports_ccs(const struct gen_device_info *devinfo, - const struct isl_surf *surf) -{ - /* CCS only supports singlesampled resources. */ - if (surf->samples > 1) - return false; - - /* Note: still need to check the format! */ - - return true; -} - -static bool -want_ccs_e_for_format(const struct gen_device_info *devinfo, - enum isl_format format) -{ - if (!isl_format_supports_ccs_e(devinfo, format)) - return false; - - const struct isl_format_layout *fmtl = isl_format_get_layout(format); - - /* CCS_E seems to significantly hurt performance with 32-bit floating - * point formats. For example, Paraview's "Wavelet Volume" case uses - * both R32_FLOAT and R32G32B32A32_FLOAT, and enabling CCS_E for those - * formats causes a 62% FPS drop. - * - * However, many benchmarks seem to use 16-bit float with no issues. - */ - if (fmtl->channels.r.bits == 32 && fmtl->channels.r.type == ISL_SFLOAT) - return false; - - return true; -} - static struct pipe_resource * iris_resource_create_for_buffer(struct pipe_screen *pscreen, const struct pipe_resource *templ) @@ -686,6 +774,9 @@ return NULL; } + if (templ->bind & PIPE_BIND_SHARED) + iris_bo_make_external(res->bo); + return &res->base; } @@ -720,21 +811,6 @@ goto fail; } - /* No modifiers - we can select our own tiling. */ - - if (has_depth) { - /* Depth must be Y-tiled */ - tiling_flags = ISL_TILING_Y0_BIT; - } else if (templ->format == PIPE_FORMAT_S8_UINT) { - /* Stencil must be W-tiled */ - tiling_flags = ISL_TILING_W_BIT; - } else if (templ->target == PIPE_BUFFER || - templ->target == PIPE_TEXTURE_1D || - templ->target == PIPE_TEXTURE_1D_ARRAY) { - /* Use linear for buffers and 1D textures */ - tiling_flags = ISL_TILING_LINEAR_BIT; - } - /* Use linear for staging buffers */ if (templ->usage == PIPE_USAGE_STAGING || templ->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR) ) @@ -781,33 +857,6 @@ .tiling_flags = tiling_flags); assert(isl_surf_created_successfully); - if (res->mod_info) { - res->aux.possible_usages |= 1 << res->mod_info->aux_usage; - } else if (supports_mcs(&res->surf)) { - res->aux.possible_usages |= 1 << ISL_AUX_USAGE_MCS; - } else if (has_depth) { - if (likely(!(INTEL_DEBUG & DEBUG_NO_HIZ))) - res->aux.possible_usages |= 1 << ISL_AUX_USAGE_HIZ; - } else if (likely(!(INTEL_DEBUG & DEBUG_NO_RBC)) && - supports_ccs(devinfo, &res->surf)) { - if (want_ccs_e_for_format(devinfo, res->surf.format)) - res->aux.possible_usages |= 1 << ISL_AUX_USAGE_CCS_E; - - if (isl_format_supports_ccs_d(devinfo, res->surf.format)) - res->aux.possible_usages |= 1 << ISL_AUX_USAGE_CCS_D; - } - - res->aux.usage = util_last_bit(res->aux.possible_usages) - 1; - - res->aux.sampler_usages = res->aux.possible_usages; - - /* We don't always support sampling with hiz. But when we do, it must be - * single sampled. - */ - if (!devinfo->has_sample_with_hiz || res->surf.samples > 1) { - res->aux.sampler_usages &= ~(1 << ISL_AUX_USAGE_HIZ); - } - const char *name = "miptree"; enum iris_memory_zone memzone = IRIS_MEMZONE_OTHER; @@ -822,56 +871,38 @@ uint32_t aux_preferred_alloc_flags; uint64_t aux_size = 0; - bool aux_enabled = - iris_resource_configure_aux(screen, res, false, &aux_size, - &aux_preferred_alloc_flags); - aux_enabled = aux_enabled && res->aux.surf.size_B > 0; - const bool separate_aux = aux_enabled && !res->mod_info; - uint64_t aux_offset; - uint64_t bo_size; - - if (aux_enabled && !separate_aux) { - /* Allocate aux data with main surface. This is required for modifiers - * with aux data (ccs). - */ - aux_offset = ALIGN(res->surf.size_B, res->aux.surf.alignment_B); - bo_size = aux_offset + aux_size; - } else { - aux_offset = 0; - bo_size = res->surf.size_B; + if (!iris_resource_configure_aux(screen, res, false, &aux_size, + &aux_preferred_alloc_flags)) { + goto fail; } - res->bo = iris_bo_alloc_tiled(screen->bufmgr, name, bo_size, 4096, memzone, + /* Modifiers require the aux data to be in the same buffer as the main + * surface, but we combine them even when a modifiers is not being used. + */ + const uint64_t bo_size = + MAX2(res->surf.size_B, res->aux.offset + aux_size); + uint32_t alignment = MAX2(4096, res->surf.alignment_B); + res->bo = iris_bo_alloc_tiled(screen->bufmgr, name, bo_size, alignment, + memzone, isl_tiling_to_i915_tiling(res->surf.tiling), res->surf.row_pitch_B, flags); if (!res->bo) goto fail; - if (aux_enabled) { - if (separate_aux) { - if (!iris_resource_alloc_separate_aux(screen, res)) - aux_enabled = false; - } else { - res->aux.bo = res->bo; - iris_bo_reference(res->aux.bo); - res->aux.offset += aux_offset; - unsigned clear_color_state_size = - iris_get_aux_clear_color_state_size(screen); - if (clear_color_state_size > 0) - res->aux.clear_color_offset += aux_offset; - if (!iris_resource_init_aux_buf(res, flags, clear_color_state_size)) - aux_enabled = false; - } - } - - if (!aux_enabled) { - if (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE) + if (aux_size > 0) { + res->aux.bo = res->bo; + iris_bo_reference(res->aux.bo); + unsigned clear_color_state_size = + iris_get_aux_clear_color_state_size(screen); + if (!iris_resource_init_aux_buf(res, flags, clear_color_state_size)) goto fail; - else - iris_resource_disable_aux(res); + map_aux_addresses(screen, res); } + if (templ->bind & PIPE_BIND_SHARED) + iris_bo_make_external(res->bo); + return &res->base; fail: @@ -923,11 +954,11 @@ user_memory, templ->width0, IRIS_MEMZONE_OTHER); if (!res->bo) { - free(res); + iris_resource_destroy(pscreen, &res->base); return NULL; } - util_range_add(&res->valid_buffer_range, 0, templ->width0); + util_range_add(&res->base, &res->valid_buffer_range, 0, templ->width0); return &res->base; } @@ -942,12 +973,21 @@ struct gen_device_info *devinfo = &screen->devinfo; struct iris_bufmgr *bufmgr = screen->bufmgr; struct iris_resource *res = iris_alloc_resource(pscreen, templ); + const struct isl_drm_modifier_info *mod_inf = + isl_drm_modifier_get_info(whandle->modifier); + uint32_t tiling; + if (!res) return NULL; switch (whandle->type) { case WINSYS_HANDLE_TYPE_FD: - res->bo = iris_bo_import_dmabuf(bufmgr, whandle->handle); + if (mod_inf) + tiling = isl_tiling_to_i915_tiling(mod_inf->tiling); + else + tiling = I915_TILING_LAST + 1; + res->bo = iris_bo_import_dmabuf(bufmgr, whandle->handle, + tiling, whandle->stride); break; case WINSYS_HANDLE_TYPE_SHARED: res->bo = iris_bo_gem_create_from_name(bufmgr, "winsys image", @@ -957,16 +997,18 @@ unreachable("invalid winsys handle type"); } if (!res->bo) - return NULL; + goto fail; res->offset = whandle->offset; - uint64_t modifier = whandle->modifier; - if (modifier == DRM_FORMAT_MOD_INVALID) { - modifier = tiling_to_modifier(res->bo->tiling_mode); + if (mod_inf == NULL) { + mod_inf = + isl_drm_modifier_get_info(tiling_to_modifier(res->bo->tiling_mode)); } - res->mod_info = isl_drm_modifier_get_info(modifier); - assert(res->mod_info); + assert(mod_inf); + + res->external_format = whandle->format; + res->mod_info = mod_inf; isl_surf_usage_flags_t isl_usage = pipe_bind_to_isl_usage(templ->bind); @@ -977,7 +1019,8 @@ if (templ->target == PIPE_BUFFER) { res->surf.tiling = ISL_TILING_LINEAR; } else { - if (whandle->modifier == DRM_FORMAT_MOD_INVALID || whandle->plane == 0) { + /* Create a surface for each plane specified by the external format. */ + if (whandle->plane < util_format_get_num_planes(whandle->format)) { UNUSED const bool isl_surf_created_successfully = isl_surf_init(&screen->isl_dev, &res->surf, .dim = target_to_isl_surf_dim(templ->target), @@ -1004,9 +1047,6 @@ if (res->mod_info->aux_usage != ISL_AUX_USAGE_NONE) { uint32_t alloc_flags; uint64_t size; - res->aux.usage = res->mod_info->aux_usage; - res->aux.possible_usages = 1 << res->mod_info->aux_usage; - res->aux.sampler_usages = res->aux.possible_usages; bool ok = iris_resource_configure_aux(screen, res, true, &size, &alloc_flags); assert(ok); @@ -1072,7 +1112,7 @@ } static bool -iris_resource_get_param(struct pipe_screen *screen, +iris_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *context, struct pipe_resource *resource, unsigned plane, @@ -1081,6 +1121,7 @@ unsigned handle_usage, uint64_t *value) { + struct iris_screen *screen = (struct iris_screen *)pscreen; struct iris_resource *res = (struct iris_resource *)resource; bool mod_with_aux = res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; @@ -1089,7 +1130,7 @@ unsigned handle; if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(screen, res); + iris_resource_finish_aux_import(pscreen, res); struct iris_bo *bo = wants_aux ? res->aux.bo : res->bo; @@ -1121,9 +1162,19 @@ if (result) *value = handle; return result; - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: - *value = iris_bo_export_gem_handle(bo); + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: { + /* Because we share the same drm file across multiple iris_screen, when + * we export a GEM handle we must make sure it is valid in the DRM file + * descriptor the caller is using (this is the FD given at screen + * creation). + */ + uint32_t handle; + if (iris_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle)) + return false; + *value = handle; return true; + } + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: result = iris_bo_export_dmabuf(bo, (int *) &handle) == 0; if (result) @@ -1141,6 +1192,7 @@ struct winsys_handle *whandle, unsigned usage) { + struct iris_screen *screen = (struct iris_screen *) pscreen; struct iris_resource *res = (struct iris_resource *)resource; bool mod_with_aux = res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; @@ -1158,6 +1210,8 @@ whandle->stride = res->surf.row_pitch_B; bo = res->bo; } + + whandle->format = res->external_format; whandle->modifier = res->mod_info ? res->mod_info->modifier : tiling_to_modifier(res->bo->tiling_mode); @@ -1176,9 +1230,18 @@ switch (whandle->type) { case WINSYS_HANDLE_TYPE_SHARED: return iris_bo_flink(bo, &whandle->handle) == 0; - case WINSYS_HANDLE_TYPE_KMS: - whandle->handle = iris_bo_export_gem_handle(bo); + case WINSYS_HANDLE_TYPE_KMS: { + /* Because we share the same drm file across multiple iris_screen, when + * we export a GEM handle we must make sure it is valid in the DRM file + * descriptor the caller is using (this is the FD given at screen + * creation). + */ + uint32_t handle; + if (iris_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle)) + return false; + whandle->handle = handle; return true; + } case WINSYS_HANDLE_TYPE_FD: return iris_bo_export_dmabuf(bo, (int *) &whandle->handle) == 0; } @@ -1209,6 +1272,10 @@ if (resource->target != PIPE_BUFFER) return; + /* If it's already invalidated, don't bother doing anything. */ + if (res->valid_buffer_range.start > res->valid_buffer_range.end) + return; + if (!resource_is_busy(ice, res)) { /* The resource is idle, so just mark that it contains no data and * keep using the same underlying buffer object. @@ -1240,7 +1307,7 @@ /* Rebind the buffer, replacing any state referring to the old BO's * address, and marking state dirty so it's reemitted. */ - ice->vtbl.rebind_buffer(ice, res, old_bo->gtt_offset); + ice->vtbl.rebind_buffer(ice, res); util_range_set_empty(&res->valid_buffer_range); @@ -1474,7 +1541,7 @@ * mesa: Fix return type of _mesa_get_format_bytes() (#37351) */ static intptr_t -s8_offset(uint32_t stride, uint32_t x, uint32_t y, bool swizzled) +s8_offset(uint32_t stride, uint32_t x, uint32_t y) { uint32_t tile_size = 4096; uint32_t tile_width = 64; @@ -1499,17 +1566,6 @@ + 2 * (byte_y % 2) + 1 * (byte_x % 2); - if (swizzled) { - /* adjust for bit6 swizzling */ - if (((byte_x / 8) % 2) == 1) { - if (((byte_y / 8) % 2) == 0) { - u += 64; - } else { - u -= 64; - } - } - } - return u; } @@ -1520,7 +1576,6 @@ const struct pipe_box *box = &xfer->box; struct iris_resource *res = (struct iris_resource *) xfer->resource; struct isl_surf *surf = &res->surf; - const bool has_swizzling = false; if (xfer->usage & PIPE_TRANSFER_WRITE) { uint8_t *untiled_s8_map = map->ptr; @@ -1535,8 +1590,7 @@ for (uint32_t x = 0; x < box->width; x++) { ptrdiff_t offset = s8_offset(surf->row_pitch_B, x0_el + box->x + x, - y0_el + box->y + y, - has_swizzling); + y0_el + box->y + y); tiled_s8_map[offset] = untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x]; } @@ -1565,8 +1619,6 @@ map->buffer = map->ptr = malloc(xfer->layer_stride * box->depth); assert(map->buffer); - const bool has_swizzling = false; - /* One of either READ_BIT or WRITE_BIT or both is set. READ_BIT implies no * INVALIDATE_RANGE_BIT. WRITE_BIT needs the original values read in unless * invalidate is set, since we'll be writing the whole rectangle from our @@ -1585,8 +1637,7 @@ for (uint32_t x = 0; x < box->width; x++) { ptrdiff_t offset = s8_offset(surf->row_pitch_B, x0_el + box->x + x, - y0_el + box->y + y, - has_swizzling); + y0_el + box->y + y); untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x] = tiled_s8_map[offset]; } @@ -1753,6 +1804,9 @@ struct iris_resource *res = (struct iris_resource *)resource; struct isl_surf *surf = &res->surf; + if (iris_resource_unfinished_aux_import(res)) + iris_resource_finish_aux_import(ctx->screen, res); + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { /* Replace the backing storage with a fresh buffer for non-async maps */ if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | @@ -1811,8 +1865,12 @@ xfer->box = *box; *ptransfer = xfer; + map->dest_had_defined_contents = + util_ranges_intersect(&res->valid_buffer_range, box->x, + box->x + box->width); + if (usage & PIPE_TRANSFER_WRITE) - util_range_add(&res->valid_buffer_range, box->x, box->x + box->width); + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); /* Avoid using GPU copies for persistent/coherent buffers, as the idea * there is to access them simultaneously on the CPU & GPU. This also @@ -1891,8 +1949,13 @@ uint32_t history_flush = 0; if (res->base.target == PIPE_BUFFER) { - history_flush |= iris_flush_bits_for_history(res) | - (map->staging ? PIPE_CONTROL_RENDER_TARGET_FLUSH : 0); + if (map->staging) + history_flush |= PIPE_CONTROL_RENDER_TARGET_FLUSH; + + if (map->dest_had_defined_contents) + history_flush |= iris_flush_bits_for_history(res); + + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); } if (history_flush & ~PIPE_CONTROL_CS_STALL) { @@ -1919,7 +1982,8 @@ struct iris_context *ice = (struct iris_context *)ctx; struct iris_transfer *map = (void *) xfer; - if (!(xfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) { + if (!(xfer->usage & (PIPE_TRANSFER_FLUSH_EXPLICIT | + PIPE_TRANSFER_COHERENT))) { struct pipe_box flush_box = { .x = 0, .y = 0, .z = 0, .width = xfer->box.width, @@ -1946,13 +2010,7 @@ uint64_t dirty = 0ull; if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - dirty |= IRIS_DIRTY_CONSTANTS_VS | - IRIS_DIRTY_CONSTANTS_TCS | - IRIS_DIRTY_CONSTANTS_TES | - IRIS_DIRTY_CONSTANTS_GS | - IRIS_DIRTY_CONSTANTS_FS | - IRIS_DIRTY_CONSTANTS_CS | - IRIS_ALL_DIRTY_BINDINGS; + dirty |= ((uint64_t)res->bind_stages) << IRIS_SHIFT_FOR_DIRTY_CONSTANTS; } ice->state.dirty |= dirty; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_resource.h mesa-20.0.8/src/gallium/drivers/iris/iris_resource.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,7 @@ #include "util/u_inlines.h" #include "util/u_range.h" #include "intel/isl/isl.h" +#include "iris_bufmgr.h" struct iris_batch; struct iris_context; @@ -42,11 +43,6 @@ #define IRIS_RESOURCE_FLAG_SURFACE_MEMZONE (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) -enum gen9_astc5x5_wa_tex_type { - GEN9_ASTC5X5_WA_TEX_TYPE_ASTC5x5 = 1 << 0, - GEN9_ASTC5X5_WA_TEX_TYPE_AUX = 1 << 1, -}; - /** * Resources represent a GPU buffer object or image (mipmap tree). * @@ -78,6 +74,12 @@ unsigned bind_history; /** + * A bitfield of MESA_SHADER_* stages indicating where this resource + * was bound. + */ + unsigned bind_stages; + + /** * For PIPE_BUFFER resources, a range which may contain valid data. * * This is a conservative estimate of what part of the buffer contains @@ -100,6 +102,13 @@ /** Offset into 'bo' where the auxiliary surface starts. */ uint32_t offset; + struct { + struct isl_surf surf; + + /** Offset into 'bo' where the auxiliary surface starts. */ + uint32_t offset; + } extra_aux; + /** * Fast clear color for this surface. For depth surfaces, the clear * value is stored as a float32 in the red component. @@ -150,6 +159,13 @@ } aux; /** + * For external surfaces, this is format that was used to create or import + * the surface. For internal surfaces, this will always be + * PIPE_FORMAT_NONE. + */ + enum pipe_format external_format; + + /** * For external surfaces, this is DRM format modifier that was used to * create or import the surface. For internal surfaces, this will always * be DRM_FORMAT_MOD_INVALID. @@ -167,6 +183,33 @@ }; /** + * The SURFACE_STATE descriptors for a resource. + */ +struct iris_surface_state { + /** + * CPU-side copy of the packed SURFACE_STATE structures, already + * aligned so they can be uploaded as a contiguous pile of bytes. + * + * This can be updated and re-uploaded if (e.g.) addresses need to change. + */ + uint32_t *cpu; + + /** + * How many states are there? (Each aux mode has its own state.) + */ + unsigned num_states; + + /** + * Address of the resource (res->bo->gtt_offset). Note that "Surface + * Base Address" may be offset from this value. + */ + uint64_t bo_address; + + /** A reference to the GPU buffer holding our uploaded SURFACE_STATE */ + struct iris_state_ref ref; +}; + +/** * Gallium CSO for sampler views (texture views). * * In addition to the normal pipe_resource, this adds an ISL view @@ -187,7 +230,7 @@ struct iris_resource *res; /** The resource (BO) holding our SURFACE_STATE. */ - struct iris_state_ref surface_state; + struct iris_surface_state surface_state; }; /** @@ -197,7 +240,7 @@ struct pipe_image_view base; /** The resource (BO) holding our SURFACE_STATE. */ - struct iris_state_ref surface_state; + struct iris_surface_state surface_state; }; /** @@ -213,9 +256,9 @@ union isl_color_value clear_color; /** The resource (BO) holding our SURFACE_STATE. */ - struct iris_state_ref surface_state; + struct iris_surface_state surface_state; /** The resource (BO) holding our SURFACE_STATE for read. */ - struct iris_state_ref surface_state_read; + struct iris_surface_state surface_state_read; }; /** @@ -232,6 +275,8 @@ struct blorp_context *blorp; struct iris_batch *batch; + bool dest_had_defined_contents; + void (*unmap)(struct iris_transfer *); }; @@ -245,6 +290,12 @@ return res->bo; } +static inline uint32_t +iris_mocs(const struct iris_bo *bo, const struct isl_device *dev) +{ + return bo && bo->external ? dev->mocs.external : dev->mocs.internal; +} + struct iris_format_info iris_format_for_usage(const struct gen_device_info *, enum pipe_format pf, isl_surf_usage_flags_t usage); @@ -404,18 +455,13 @@ uint32_t *tile_x, uint32_t *tile_y); enum isl_aux_usage iris_resource_texture_aux_usage(struct iris_context *ice, const struct iris_resource *res, - enum isl_format view_fmt, - enum gen9_astc5x5_wa_tex_type); + enum isl_format view_fmt); void iris_resource_prepare_texture(struct iris_context *ice, struct iris_batch *batch, struct iris_resource *res, enum isl_format view_format, uint32_t start_level, uint32_t num_levels, - uint32_t start_layer, uint32_t num_layers, - enum gen9_astc5x5_wa_tex_type); -void iris_resource_prepare_image(struct iris_context *ice, - struct iris_batch *batch, - struct iris_resource *res); + uint32_t start_layer, uint32_t num_layers); static inline bool iris_resource_unfinished_aux_import(struct iris_resource *res) @@ -436,6 +482,10 @@ bool iris_resource_level_has_hiz(const struct iris_resource *res, uint32_t level); + +bool iris_sample_with_depth_aux(const struct gen_device_info *devinfo, + const struct iris_resource *res); + bool iris_has_color_unresolved(const struct iris_resource *res, unsigned start_level, unsigned num_levels, unsigned start_layer, unsigned num_layers); diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_screen.c mesa-20.0.8/src/gallium/drivers/iris/iris_screen.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -39,7 +39,7 @@ #include "pipe/p_screen.h" #include "util/debug.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_transfer_helper.h" #include "util/u_upload_mgr.h" #include "util/ralloc.h" @@ -53,6 +53,7 @@ #include "iris_screen.h" #include "intel/compiler/brw_compiler.h" #include "intel/common/gen_gem.h" +#include "intel/common/gen_l3_config.h" #include "iris_monitor.h" static void @@ -80,18 +81,12 @@ { struct iris_screen *screen = (struct iris_screen *)pscreen; static char buf[128]; - const char *chipset; + const char *name = gen_get_device_name(screen->pci_id); - switch (screen->pci_id) { -#undef CHIPSET -#define CHIPSET(id, symbol, str) case id: chipset = str; break; -#include "pci_ids/i965_pci_ids.h" - default: - chipset = "Unknown Intel Chipset"; - break; - } + if (!name) + name = "Intel Unknown"; - snprintf(buf, sizeof(buf), "Mesa %s", chipset); + snprintf(buf, sizeof(buf), "Mesa %s", name); return buf; } @@ -200,6 +195,10 @@ case PIPE_CAP_CS_DERIVED_SYSTEM_VALUES_SUPPORTED: case PIPE_CAP_TEXTURE_SHADOW_LOD: case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: + case PIPE_CAP_GL_SPIRV: + case PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS: + case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION: + case PIPE_CAP_NATIVE_FENCE_FD: return true; case PIPE_CAP_FBFETCH: return BRW_MAX_DRAW_BUFFERS; @@ -308,6 +307,8 @@ * illegal snoop <-> snoop transfers. */ return devinfo->has_llc; + case PIPE_CAP_THROTTLE: + return screen->driconf.disable_throttling ? 0 : 1; case PIPE_CAP_CONTEXT_PRIORITY_MASK: return PIPE_CONTEXT_PRIORITY_LOW | @@ -324,6 +325,10 @@ case PIPE_CAP_PCI_FUNCTION: return 0; + case PIPE_CAP_OPENCL_INTEGER_FUNCTIONS: + case PIPE_CAP_INTEGER_MULTIPLY_32X16: + return true; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -398,7 +403,6 @@ case PIPE_SHADER_CAP_SUBROUTINES: return 0; case PIPE_SHADER_CAP_INTEGERS: - case PIPE_SHADER_CAP_SCALAR_ISA: return 1; case PIPE_SHADER_CAP_INT64_ATOMICS: case PIPE_SHADER_CAP_FP16: @@ -513,19 +517,24 @@ return result; } -static void -iris_destroy_screen(struct pipe_screen *pscreen) +void +iris_screen_destroy(struct iris_screen *screen) { - struct iris_screen *screen = (struct iris_screen *) pscreen; iris_bo_unreference(screen->workaround_bo); - u_transfer_helper_destroy(pscreen->transfer_helper); - iris_bufmgr_destroy(screen->bufmgr); + u_transfer_helper_destroy(screen->base.transfer_helper); + iris_bufmgr_unref(screen->bufmgr); disk_cache_destroy(screen->disk_cache); - close(screen->fd); + close(screen->winsys_fd); ralloc_free(screen); } static void +iris_screen_unref(struct pipe_screen *pscreen) +{ + iris_pscreen_unref(pscreen); +} + +static void iris_query_memory_info(struct pipe_screen *pscreen, struct pipe_memory_info *info) { @@ -551,27 +560,38 @@ } static int -iris_getparam(struct iris_screen *screen, int param, int *value) +iris_getparam(int fd, int param, int *value) { struct drm_i915_getparam gp = { .param = param, .value = value }; - if (ioctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp) == -1) + if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == -1) return -errno; return 0; } static int -iris_getparam_integer(struct iris_screen *screen, int param) +iris_getparam_integer(int fd, int param) { int value = -1; - if (iris_getparam(screen, param, &value) == 0) + if (iris_getparam(fd, param, &value) == 0) return value; return -1; } +static const struct gen_l3_config * +iris_get_default_l3_config(const struct gen_device_info *devinfo, + bool compute) +{ + bool wants_dc_cache = true; + bool has_slm = compute; + const struct gen_l3_weights w = + gen_get_default_l3_weights(devinfo, wants_dc_cache, has_slm); + return gen_get_l3_config(devinfo, w); +} + static void iris_shader_debug_log(void *data, const char *fmt, ...) { @@ -612,29 +632,56 @@ struct pipe_screen * iris_screen_create(int fd, const struct pipe_screen_config *config) { + /* Here are the i915 features we need for Iris (in chronoligical order) : + * - I915_PARAM_HAS_EXEC_NO_RELOC (3.10) + * - I915_PARAM_HAS_EXEC_HANDLE_LUT (3.10) + * - I915_PARAM_HAS_EXEC_BATCH_FIRST (4.13) + * - I915_PARAM_HAS_EXEC_FENCE_ARRAY (4.14) + * - I915_PARAM_HAS_CONTEXT_ISOLATION (4.16) + * + * Checking the last feature availability will include all previous ones. + */ + if (!iris_getparam_integer(fd, I915_PARAM_HAS_CONTEXT_ISOLATION)) { + debug_error("Kernel is too old for Iris. Consider upgrading to kernel v4.16.\n"); + return NULL; + } + struct iris_screen *screen = rzalloc(NULL, struct iris_screen); if (!screen) return NULL; - screen->fd = fd; - if (!gen_get_device_info_from_fd(fd, &screen->devinfo)) return NULL; screen->pci_id = screen->devinfo.chipset_id; screen->no_hw = screen->devinfo.no_hw; + p_atomic_set(&screen->refcount, 1); + if (screen->devinfo.gen < 8 || screen->devinfo.is_cherryview) return NULL; + bool bo_reuse = false; + int bo_reuse_mode = driQueryOptioni(config->options, "bo_reuse"); + switch (bo_reuse_mode) { + case DRI_CONF_BO_REUSE_DISABLED: + break; + case DRI_CONF_BO_REUSE_ALL: + bo_reuse = true; + break; + } + + screen->bufmgr = iris_bufmgr_get_for_fd(&screen->devinfo, fd, bo_reuse); + if (!screen->bufmgr) + return NULL; + + screen->fd = iris_bufmgr_get_fd(screen->bufmgr); + screen->winsys_fd = fd; + screen->aperture_bytes = get_aperture_size(fd); if (getenv("INTEL_NO_HW") != NULL) screen->no_hw = true; - screen->bufmgr = iris_bufmgr_init(&screen->devinfo, fd); - if (!screen->bufmgr) - return NULL; - screen->workaround_bo = iris_bo_alloc(screen->bufmgr, "workaround", 4096, IRIS_MEMZONE_OTHER); if (!screen->workaround_bo) @@ -644,6 +691,10 @@ screen->driconf.dual_color_blend_by_location = driQueryOptionb(config->options, "dual_color_blend_by_location"); + screen->driconf.disable_throttling = + driQueryOptionb(config->options, "disable_throttling"); + screen->driconf.always_flush_cache = + driQueryOptionb(config->options, "always_flush_cache"); screen->precompile = env_var_as_boolean("shader_precompile", true); @@ -654,6 +705,10 @@ screen->compiler->shader_perf_log = iris_shader_perf_log; screen->compiler->supports_pull_constants = false; screen->compiler->supports_shader_constants = true; + screen->compiler->compact_params = false; + + screen->l3_config_3d = iris_get_default_l3_config(&screen->devinfo, false); + screen->l3_config_cs = iris_get_default_l3_config(&screen->devinfo, true); iris_disk_cache_init(screen); @@ -661,7 +716,7 @@ sizeof(struct iris_transfer), 64); screen->subslice_total = - iris_getparam_integer(screen, I915_PARAM_SUBSLICE_TOTAL); + iris_getparam_integer(screen->fd, I915_PARAM_SUBSLICE_TOTAL); assert(screen->subslice_total >= 1); struct pipe_screen *pscreen = &screen->base; @@ -669,7 +724,7 @@ iris_init_screen_fence_functions(pscreen); iris_init_screen_resource_functions(pscreen); - pscreen->destroy = iris_destroy_screen; + pscreen->destroy = iris_screen_unref; pscreen->get_name = iris_get_name; pscreen->get_vendor = iris_get_vendor; pscreen->get_device_vendor = iris_get_device_vendor; diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_screen.h mesa-20.0.8/src/gallium/drivers/iris/iris_screen.h --- mesa-19.2.8/src/gallium/drivers/iris/iris_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -34,6 +34,7 @@ struct iris_bo; struct iris_monitor_config; +struct gen_l3_config; #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) #define WRITE_ONCE(x, v) *(volatile __typeof__(x) *)&(x) = (v) @@ -45,12 +46,20 @@ struct iris_screen { struct pipe_screen base; + uint32_t refcount; + /** Global slab allocator for iris_transfer_map objects */ struct slab_parent_pool transfer_pool; - /** drm device file descriptor */ + /** drm device file descriptor, shared with bufmgr, do not close. */ int fd; + /** + * drm device file descriptor to used for window system integration, owned + * by iris_screen, can be a different DRM instance than fd. + */ + int winsys_fd; + /** PCI ID for our GPU device */ int pci_id; @@ -66,6 +75,8 @@ struct { /** Dual color blend by location instead of index (for broken apps) */ bool dual_color_blend_by_location; + bool disable_throttling; + bool always_flush_cache; } driconf; unsigned subslice_total; @@ -78,6 +89,9 @@ struct brw_compiler *compiler; struct iris_monitor_config *monitor_cfg; + const struct gen_l3_config *l3_config_3d; + const struct gen_l3_config *l3_config_cs; + /** * A buffer containing nothing useful, for hardware workarounds that * require scratch writes or reads from some unimportant memory. @@ -90,6 +104,26 @@ struct pipe_screen * iris_screen_create(int fd, const struct pipe_screen_config *config); +void iris_screen_destroy(struct iris_screen *screen); + +UNUSED static inline struct pipe_screen * +iris_pscreen_ref(struct pipe_screen *pscreen) +{ + struct iris_screen *screen = (struct iris_screen *) pscreen; + + p_atomic_inc(&screen->refcount); + return pscreen; +} + +UNUSED static inline void +iris_pscreen_unref(struct pipe_screen *pscreen) +{ + struct iris_screen *screen = (struct iris_screen *) pscreen; + + if (p_atomic_dec_zero(&screen->refcount)) + iris_screen_destroy(screen); +} + bool iris_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, diff -Nru mesa-19.2.8/src/gallium/drivers/iris/iris_state.c mesa-20.0.8/src/gallium/drivers/iris/iris_state.c --- mesa-19.2.8/src/gallium/drivers/iris/iris_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/iris_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -90,7 +90,7 @@ #include "pipe/p_screen.h" #include "util/u_dual_blend.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_framebuffer.h" #include "util/u_transfer.h" #include "util/u_upload_mgr.h" @@ -98,6 +98,7 @@ #include "drm-uapi/i915_drm.h" #include "nir.h" #include "intel/compiler/brw_compiler.h" +#include "intel/common/gen_aux_map.h" #include "intel/common/gen_l3_config.h" #include "intel/common/gen_sample_positions.h" #include "iris_batch.h" @@ -109,20 +110,6 @@ #include "iris_genx_macros.h" #include "intel/common/gen_guardband.h" -#if GEN_GEN == 8 -#define MOCS_PTE 0x18 -#define MOCS_WB 0x78 -#else -#define MOCS_PTE (1 << 1) -#define MOCS_WB (2 << 1) -#endif - -static uint32_t -mocs(const struct iris_bo *bo) -{ - return bo && bo->external ? MOCS_PTE : MOCS_WB; -} - /** * Statically assert that PIPE_* enums match the hardware packets. * (As long as they match, we don't need to translate them.) @@ -353,9 +340,10 @@ struct iris_bo *bo = iris_resource_bo(*out_res); iris_use_pinned_bo(batch, bo, false); - *out_offset += iris_bo_offset_from_base_address(bo); + iris_record_state_size(batch->state_sizes, + bo->gtt_offset + *out_offset, size); - iris_record_state_size(batch->state_sizes, *out_offset, size); + *out_offset += iris_bo_offset_from_base_address(bo); return ptr; } @@ -391,8 +379,10 @@ (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0) static void -flush_for_state_base_change(struct iris_batch *batch) +flush_before_state_base_change(struct iris_batch *batch) { + const struct gen_device_info *devinfo = &batch->screen->devinfo; + /* Flush before emitting STATE_BASE_ADDRESS. * * This isn't documented anywhere in the PRM. However, it seems to be @@ -415,10 +405,68 @@ * rendering. It's a bit of a big hammer but it appears to work. */ iris_emit_end_of_pipe_sync(batch, - "change STATE_BASE_ADDRESS", + "change STATE_BASE_ADDRESS (flushes)", PIPE_CONTROL_RENDER_TARGET_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_DATA_CACHE_FLUSH); + PIPE_CONTROL_DATA_CACHE_FLUSH | + /* GEN:BUG:1606662791: + * + * Software must program PIPE_CONTROL command + * with "HDC Pipeline Flush" prior to + * programming of the below two non-pipeline + * state : + * * STATE_BASE_ADDRESS + * * 3DSTATE_BINDING_TABLE_POOL_ALLOC + */ + ((GEN_GEN == 12 && devinfo->revision == 0 /* A0 */ ? + PIPE_CONTROL_FLUSH_HDC : 0))); +} + +static void +flush_after_state_base_change(struct iris_batch *batch) +{ + /* After re-setting the surface state base address, we have to do some + * cache flusing so that the sampler engine will pick up the new + * SURFACE_STATE objects and binding tables. From the Broadwell PRM, + * Shared Function > 3D Sampler > State > State Caching (page 96): + * + * Coherency with system memory in the state cache, like the texture + * cache is handled partially by software. It is expected that the + * command stream or shader will issue Cache Flush operation or + * Cache_Flush sampler message to ensure that the L1 cache remains + * coherent with system memory. + * + * [...] + * + * Whenever the value of the Dynamic_State_Base_Addr, + * Surface_State_Base_Addr are altered, the L1 state cache must be + * invalidated to ensure the new surface or sampler state is fetched + * from system memory. + * + * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit + * which, according the PIPE_CONTROL instruction documentation in the + * Broadwell PRM: + * + * Setting this bit is independent of any other bit in this packet. + * This bit controls the invalidation of the L1 and L2 state caches + * at the top of the pipe i.e. at the parsing time. + * + * Unfortunately, experimentation seems to indicate that state cache + * invalidation through a PIPE_CONTROL does nothing whatsoever in + * regards to surface state and binding tables. In stead, it seems that + * invalidating the texture cache is what is actually needed. + * + * XXX: As far as we have been able to determine through + * experimentation, shows that flush the texture cache appears to be + * sufficient. The theory here is that all of the sampling/rendering + * units cache the binding table in the texture cache. However, we have + * yet to be able to actually confirm this. + */ + iris_emit_end_of_pipe_sync(batch, + "change STATE_BASE_ADDRESS (invalidates)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); } static void @@ -441,6 +489,128 @@ } static void +iris_load_register_reg32(struct iris_batch *batch, uint32_t dst, + uint32_t src) +{ + _iris_emit_lrr(batch, dst, src); +} + +static void +iris_load_register_reg64(struct iris_batch *batch, uint32_t dst, + uint32_t src) +{ + _iris_emit_lrr(batch, dst, src); + _iris_emit_lrr(batch, dst + 4, src + 4); +} + +static void +iris_load_register_imm32(struct iris_batch *batch, uint32_t reg, + uint32_t val) +{ + _iris_emit_lri(batch, reg, val); +} + +static void +iris_load_register_imm64(struct iris_batch *batch, uint32_t reg, + uint64_t val) +{ + _iris_emit_lri(batch, reg + 0, val & 0xffffffff); + _iris_emit_lri(batch, reg + 4, val >> 32); +} + +/** + * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer. + */ +static void +iris_load_register_mem32(struct iris_batch *batch, uint32_t reg, + struct iris_bo *bo, uint32_t offset) +{ + iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = reg; + lrm.MemoryAddress = ro_bo(bo, offset); + } +} + +/** + * Load a 64-bit value from a buffer into a MMIO register via + * two MI_LOAD_REGISTER_MEM commands. + */ +static void +iris_load_register_mem64(struct iris_batch *batch, uint32_t reg, + struct iris_bo *bo, uint32_t offset) +{ + iris_load_register_mem32(batch, reg + 0, bo, offset + 0); + iris_load_register_mem32(batch, reg + 4, bo, offset + 4); +} + +static void +iris_store_register_mem32(struct iris_batch *batch, uint32_t reg, + struct iris_bo *bo, uint32_t offset, + bool predicated) +{ + iris_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.RegisterAddress = reg; + srm.MemoryAddress = rw_bo(bo, offset); + srm.PredicateEnable = predicated; + } +} + +static void +iris_store_register_mem64(struct iris_batch *batch, uint32_t reg, + struct iris_bo *bo, uint32_t offset, + bool predicated) +{ + iris_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated); + iris_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated); +} + +static void +iris_store_data_imm32(struct iris_batch *batch, + struct iris_bo *bo, uint32_t offset, + uint32_t imm) +{ + iris_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = rw_bo(bo, offset); + sdi.ImmediateData = imm; + } +} + +static void +iris_store_data_imm64(struct iris_batch *batch, + struct iris_bo *bo, uint32_t offset, + uint64_t imm) +{ + /* Can't use iris_emit_cmd because MI_STORE_DATA_IMM has a length of + * 2 in genxml but it's actually variable length and we need 5 DWords. + */ + void *map = iris_get_command_space(batch, 4 * 5); + _iris_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) { + sdi.DWordLength = 5 - 2; + sdi.Address = rw_bo(bo, offset); + sdi.ImmediateData = imm; + } +} + +static void +iris_copy_mem_mem(struct iris_batch *batch, + struct iris_bo *dst_bo, uint32_t dst_offset, + struct iris_bo *src_bo, uint32_t src_offset, + unsigned bytes) +{ + /* MI_COPY_MEM_MEM operates on DWords. */ + assert(bytes % 4 == 0); + assert(dst_offset % 4 == 0); + assert(src_offset % 4 == 0); + + for (unsigned i = 0; i < bytes; i += 4) { + iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) { + cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i); + cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i); + } + } +} + +static void emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline) { #if GEN_GEN >= 8 && GEN_GEN < 10 @@ -513,7 +683,8 @@ static void init_state_base_address(struct iris_batch *batch) { - flush_for_state_base_change(batch); + uint32_t mocs = batch->screen->isl_dev.mocs.internal; + flush_before_state_base_change(batch); /* We program most base addresses once at context initialization time. * Each base address points at a 4GB memory zone, and never needs to @@ -523,11 +694,12 @@ * updated occasionally. See iris_binder.c for the details there. */ iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { - sba.GeneralStateMOCS = MOCS_WB; - sba.StatelessDataPortAccessMOCS = MOCS_WB; - sba.DynamicStateMOCS = MOCS_WB; - sba.IndirectObjectMOCS = MOCS_WB; - sba.InstructionMOCS = MOCS_WB; + sba.GeneralStateMOCS = mocs; + sba.StatelessDataPortAccessMOCS = mocs; + sba.DynamicStateMOCS = mocs; + sba.IndirectObjectMOCS = mocs; + sba.InstructionMOCS = mocs; + sba.SurfaceStateMOCS = mocs; sba.GeneralStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddressModifyEnable = true; @@ -537,7 +709,7 @@ sba.DynamicStateBufferSizeModifyEnable = true; #if (GEN_GEN >= 9) sba.BindlessSurfaceStateBaseAddressModifyEnable = true; - sba.BindlessSurfaceStateMOCS = MOCS_WB; + sba.BindlessSurfaceStateMOCS = mocs; #endif sba.IndirectObjectBufferSizeModifyEnable = true; sba.InstructionBuffersizeModifyEnable = true; @@ -550,15 +722,28 @@ sba.InstructionBufferSize = 0xfffff; sba.DynamicStateBufferSize = 0xfffff; } + + flush_after_state_base_change(batch); } static void -iris_emit_l3_config(struct iris_batch *batch, const struct gen_l3_config *cfg, - bool has_slm, bool wants_dc_cache) +iris_emit_l3_config(struct iris_batch *batch, + const struct gen_l3_config *cfg) { uint32_t reg_val; - iris_pack_state(GENX(L3CNTLREG), ®_val, reg) { - reg.SLMEnable = has_slm; + +#if GEN_GEN >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + + iris_pack_state(L3_ALLOCATION_REG, ®_val, reg) { +#if GEN_GEN < 11 + reg.SLMEnable = cfg->n[GEN_L3P_SLM] > 0; +#endif #if GEN_GEN == 11 /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set * in L3CNTLREG register. The default setting of the bit is not the @@ -572,23 +757,10 @@ reg.DCAllocation = cfg->n[GEN_L3P_DC]; reg.AllAllocation = cfg->n[GEN_L3P_ALL]; } - iris_emit_lri(batch, L3CNTLREG, reg_val); -} - -static void -iris_emit_default_l3_config(struct iris_batch *batch, - const struct gen_device_info *devinfo, - bool compute) -{ - bool wants_dc_cache = true; - bool has_slm = compute; - const struct gen_l3_weights w = - gen_get_default_l3_weights(devinfo, wants_dc_cache, has_slm); - const struct gen_l3_config *cfg = gen_get_l3_config(devinfo, w); - iris_emit_l3_config(batch, cfg, has_slm, wants_dc_cache); + _iris_emit_lri(batch, L3_ALLOCATION_REG_num, reg_val); } -#if GEN_GEN == 9 || GEN_GEN == 10 +#if GEN_GEN == 9 static void iris_enable_obj_preemption(struct iris_batch *batch, bool enable) { @@ -687,6 +859,32 @@ } #endif +static void +iris_alloc_push_constants(struct iris_batch *batch) +{ + /* For now, we set a static partitioning of the push constant area, + * assuming that all stages could be in use. + * + * TODO: Try lazily allocating the HS/DS/GS sections as needed, and + * see if that improves performance by offering more space to + * the VS/FS when those aren't in use. Also, try dynamically + * enabling/disabling it like i965 does. This would be more + * stalls and may not actually help; we don't know yet. + */ + for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) { + iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { + alloc._3DCommandSubOpcode = 18 + i; + alloc.ConstantBufferOffset = 6 * i; + alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 8 : 6; + } + } +} + +#if GEN_GEN >= 12 +static void +init_aux_map_state(struct iris_batch *batch); +#endif + /** * Upload the initial GPU state for a render context. * @@ -694,17 +892,14 @@ * way, but we never actually change. */ static void -iris_init_render_context(struct iris_screen *screen, - struct iris_batch *batch, - struct iris_vtable *vtbl, - struct pipe_debug_callback *dbg) +iris_init_render_context(struct iris_batch *batch) { - UNUSED const struct gen_device_info *devinfo = &screen->devinfo; + UNUSED const struct gen_device_info *devinfo = &batch->screen->devinfo; uint32_t reg_val; emit_pipeline_select(batch, _3D); - iris_emit_default_l3_config(batch, devinfo, false); + iris_emit_l3_config(batch, batch->screen->l3_config_3d); init_state_base_address(batch); @@ -736,37 +931,39 @@ #endif #if GEN_GEN == 11 - iris_pack_state(GENX(SAMPLER_MODE), ®_val, reg) { - reg.HeaderlessMessageforPreemptableContexts = 1; - reg.HeaderlessMessageforPreemptableContextsMask = 1; - } - iris_emit_lri(batch, SAMPLER_MODE, reg_val); + iris_pack_state(GENX(TCCNTLREG), ®_val, reg) { + reg.L3DataPartialWriteMergingEnable = true; + reg.ColorZPartialWriteMergingEnable = true; + reg.URBPartialWriteMergingEnable = true; + reg.TCDisable = true; + } + iris_emit_lri(batch, TCCNTLREG, reg_val); - /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */ - iris_pack_state(GENX(HALF_SLICE_CHICKEN7), ®_val, reg) { - reg.EnabledTexelOffsetPrecisionFix = 1; - reg.EnabledTexelOffsetPrecisionFixMask = 1; - } - iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val); + iris_pack_state(GENX(SAMPLER_MODE), ®_val, reg) { + reg.HeaderlessMessageforPreemptableContexts = 1; + reg.HeaderlessMessageforPreemptableContextsMask = 1; + } + iris_emit_lri(batch, SAMPLER_MODE, reg_val); - iris_pack_state(GENX(SLICE_COMMON_ECO_CHICKEN1), ®_val, reg) { - reg.StateCacheRedirectToCSSectionEnable = true; - reg.StateCacheRedirectToCSSectionEnableMask = true; - } - iris_emit_lri(batch, SLICE_COMMON_ECO_CHICKEN1, reg_val); + /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */ + iris_pack_state(GENX(HALF_SLICE_CHICKEN7), ®_val, reg) { + reg.EnabledTexelOffsetPrecisionFix = 1; + reg.EnabledTexelOffsetPrecisionFixMask = 1; + } + iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val); - /* Hardware specification recommends disabling repacking for the - * compatibility with decompression mechanism in display controller. - */ - if (devinfo->disable_ccs_repack) { - iris_pack_state(GENX(CACHE_MODE_0), ®_val, reg) { - reg.DisableRepackingforCompression = true; - reg.DisableRepackingforCompressionMask = true; - } - iris_emit_lri(batch, CACHE_MODE_0, reg_val); + /* Hardware specification recommends disabling repacking for the + * compatibility with decompression mechanism in display controller. + */ + if (devinfo->disable_ccs_repack) { + iris_pack_state(GENX(CACHE_MODE_0), ®_val, reg) { + reg.DisableRepackingforCompression = true; + reg.DisableRepackingforCompressionMask = true; } + iris_emit_lri(batch, CACHE_MODE_0, reg_val); + } - iris_upload_slice_hashing_state(batch); + iris_upload_slice_hashing_state(batch); #endif /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid @@ -803,40 +1000,45 @@ /* TODO: may need to set an offset for origin-UL framebuffers */ iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo); - /* Set a static partitioning of the push constant area. */ - /* TODO: this may be a bad idea...could starve the push ringbuffers... */ - for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) { - iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { - alloc._3DCommandSubOpcode = 18 + i; - alloc.ConstantBufferOffset = 6 * i; - alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 8 : 6; - } - } + iris_alloc_push_constants(batch); -#if GEN_GEN == 10 - /* Gen11+ is enabled for us by the kernel. */ - iris_enable_obj_preemption(batch, true); +#if GEN_GEN >= 12 + init_aux_map_state(batch); #endif } static void -iris_init_compute_context(struct iris_screen *screen, - struct iris_batch *batch, - struct iris_vtable *vtbl, - struct pipe_debug_callback *dbg) +iris_init_compute_context(struct iris_batch *batch) { - UNUSED const struct gen_device_info *devinfo = &screen->devinfo; + UNUSED const struct gen_device_info *devinfo = &batch->screen->devinfo; + /* GEN:BUG:1607854226: + * + * Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS. + */ +#if GEN_GEN == 12 + emit_pipeline_select(batch, _3D); +#else emit_pipeline_select(batch, GPGPU); +#endif - iris_emit_default_l3_config(batch, devinfo, true); + iris_emit_l3_config(batch, batch->screen->l3_config_cs); init_state_base_address(batch); +#if GEN_GEN == 12 + emit_pipeline_select(batch, GPGPU); +#endif + #if GEN_GEN == 9 if (devinfo->is_geminilake) init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU); #endif + +#if GEN_GEN >= 12 + init_aux_map_state(batch); +#endif + } struct iris_vertex_buffer_state { @@ -871,6 +1073,10 @@ uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)]; +#if GEN_GEN == 8 + bool pma_fix_enabled; +#endif + #if GEN_GEN == 9 /* Is object level preemption enabled? */ bool object_preemption; @@ -1058,6 +1264,9 @@ ice->state.dirty |= IRIS_DIRTY_BLEND_STATE; ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_BLEND]; + + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; } /** @@ -1086,12 +1295,19 @@ /** Partial 3DSTATE_WM_DEPTH_STENCIL. */ uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)]; +#if GEN_GEN >= 12 + uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)]; +#endif + /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */ struct pipe_alpha_state alpha; /** Outbound to resolve and cache set tracking. */ bool depth_writes_enabled; bool stencil_writes_enabled; + + /** Outbound to Gen8-9 PMA stall equations */ + bool depth_test_enabled; }; /** @@ -1111,6 +1327,7 @@ cso->alpha = state->alpha; cso->depth_writes_enabled = state->depth.writemask; + cso->depth_test_enabled = state->depth.enabled; cso->stencil_writes_enabled = state->stencil[0].writemask != 0 || (two_sided_stencil && state->stencil[1].writemask != 0); @@ -1144,6 +1361,16 @@ /* wmds.[Backface]StencilReferenceValue are merged later */ } +#if GEN_GEN >= 12 + iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) { + depth_bounds.DepthBoundsTestValueModifyDisable = false; + depth_bounds.DepthBoundsTestEnableModifyDisable = false; + depth_bounds.DepthBoundsTestEnable = state->depth.bounds_test; + depth_bounds.DepthBoundsTestMinValue = state->depth.bounds_min; + depth_bounds.DepthBoundsTestMaxValue = state->depth.bounds_max; + } +#endif + return cso; } @@ -1174,12 +1401,192 @@ ice->state.depth_writes_enabled = new_cso->depth_writes_enabled; ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled; + +#if GEN_GEN >= 12 + if (cso_changed(depth_bounds)) + ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS; +#endif } ice->state.cso_zsa = new_cso; ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT; ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL; ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA]; + + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; +} + +#if GEN_GEN == 8 +static bool +want_pma_fix(struct iris_context *ice) +{ + UNUSED struct iris_screen *screen = (void *) ice->ctx.screen; + UNUSED const struct gen_device_info *devinfo = &screen->devinfo; + const struct brw_wm_prog_data *wm_prog_data = (void *) + ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data; + const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa; + const struct iris_blend_state *cso_blend = ice->state.cso_blend; + + /* In very specific combinations of state, we can instruct Gen8-9 hardware + * to avoid stalling at the pixel mask array. The state equations are + * documented in these places: + * + * - Gen8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE + * - Gen9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable + * + * Both equations share some common elements: + * + * no_hiz_op = + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * + * killpixels = + * 3DSTATE_WM::ForceKillPix != ForceOff && + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + * + * (Technically the stencil PMA treats ForceKillPix differently, + * but I think this is a documentation oversight, and we don't + * ever use it in this way, so it doesn't matter). + * + * common_pma_fix = + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 && + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS && + * 3DSTATE_PS_EXTRA::PixelShaderValid && + * no_hiz_op + * + * These are always true: + * + * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 + * 3DSTATE_PS_EXTRA::PixelShaderValid + * + * Also, we never use the normal drawing path for HiZ ops; these are true: + * + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + * + * This happens sometimes: + * + * 3DSTATE_WM::ForceThreadDispatch != 1 + * + * However, we choose to ignore it as it either agrees with the signal + * (dispatch was already enabled, so nothing out of the ordinary), or + * there are no framebuffer attachments (so no depth or HiZ anyway, + * meaning the PMA signal will already be disabled). + */ + + if (!cso_fb->zsbuf) + return false; + + struct iris_resource *zres, *sres; + iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres); + + /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + */ + if (!zres || !iris_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level)) + return false; + + /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */ + if (wm_prog_data->early_fragment_tests) + return false; + + /* 3DSTATE_WM::ForceKillPix != ForceOff && + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + */ + bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask || + cso_blend->alpha_to_coverage || cso_zsa->alpha.enabled; + + /* The Gen8 depth PMA equation becomes: + * + * depth_writes = + * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE + * + * stencil_writes = + * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE + * + * Z_PMA_OPT = + * common_pma_fix && + * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable && + * ((killpixels && (depth_writes || stencil_writes)) || + * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF) + * + */ + if (!cso_zsa->depth_test_enabled) + return false; + + return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF || + (killpixels && (cso_zsa->depth_writes_enabled || + (sres && cso_zsa->stencil_writes_enabled))); +} +#endif + +void +genX(update_pma_fix)(struct iris_context *ice, + struct iris_batch *batch, + bool enable) +{ +#if GEN_GEN == 8 + struct iris_genx_state *genx = ice->state.genx; + + if (genx->pma_fix_enabled == enable) + return; + + genx->pma_fix_enabled = enable; + + /* According to the Broadwell PIPE_CONTROL documentation, software should + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set + * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary. + * + * The Gen9 docs say to use a depth stall rather than a command streamer + * stall. However, the hardware seems to violently disagree. A full + * command streamer stall seems to be needed in both cases. + */ + iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + uint32_t reg_val; + iris_pack_state(GENX(CACHE_MODE_1), ®_val, reg) { + reg.NPPMAFixEnable = enable; + reg.NPEarlyZFailsDisable = enable; + reg.NPPMAFixEnableMask = true; + reg.NPEarlyZFailsDisableMask = true; + } + iris_emit_lri(batch, CACHE_MODE_1, reg_val); + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + * + * Again, the Gen9 docs give a different set of flushes but the Broadwell + * flushes seem to work just as well. + */ + iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)", + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH); +#endif } /** @@ -1366,9 +1773,11 @@ const unsigned line_stipple_factor = state->line_stipple_factor + 1; iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) { - line.LineStipplePattern = state->line_stipple_pattern; - line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor; - line.LineStippleRepeatCount = line_stipple_factor; + if (state->line_stipple_enable) { + line.LineStipplePattern = state->line_stipple_pattern; + line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor; + line.LineStippleRepeatCount = line_stipple_factor; + } } return cso; @@ -1551,11 +1960,17 @@ assert(start + count <= IRIS_MAX_TEXTURE_SAMPLERS); + bool dirty = false; + for (int i = 0; i < count; i++) { - shs->samplers[start + i] = states[i]; + if (shs->samplers[start + i] != states[i]) { + shs->samplers[start + i] = states[i]; + dirty = true; + } } - ice->state.dirty |= IRIS_DIRTY_SAMPLER_STATES_VS << stage; + if (dirty) + ice->state.dirty |= IRIS_DIRTY_SAMPLER_STATES_VS << stage; } /** @@ -1589,10 +2004,12 @@ return; struct pipe_resource *res = shs->sampler_table.res; - shs->sampler_table.offset += - iris_bo_offset_from_base_address(iris_resource_bo(res)); + struct iris_bo *bo = iris_resource_bo(res); - iris_record_state_size(ice->state.sizes, shs->sampler_table.offset, size); + iris_record_state_size(ice->state.sizes, + bo->gtt_offset + shs->sampler_table.offset, size); + + shs->sampler_table.offset += iris_bo_offset_from_base_address(bo); /* Make sure all land in the same BO */ iris_border_color_pool_reserve(ice, IRIS_MAX_TEXTURE_SAMPLERS); @@ -1707,18 +2124,18 @@ .format = format, .swizzle = swizzle, .stride_B = cpp, - .mocs = mocs(res->bo)); + .mocs = iris_mocs(res->bo, isl_dev)); } #define SURFACE_STATE_ALIGNMENT 64 /** * Allocate several contiguous SURFACE_STATE structures, one for each - * supported auxiliary surface mode. + * supported auxiliary surface mode. This only allocates the CPU-side + * copy, they will need to be uploaded later after they're filled in. */ -static void * -alloc_surface_states(struct u_upload_mgr *mgr, - struct iris_state_ref *ref, +static void +alloc_surface_states(struct iris_surface_state *surf_state, unsigned aux_usages) { const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length); @@ -1728,13 +2145,68 @@ assert(aux_usages != 0); + /* In case we're re-allocating them... */ + free(surf_state->cpu); + + surf_state->num_states = util_bitcount(aux_usages); + surf_state->cpu = calloc(surf_state->num_states, surf_size); + surf_state->ref.offset = 0; + pipe_resource_reference(&surf_state->ref.res, NULL); + + assert(surf_state->cpu); +} + +/** + * Upload the CPU side SURFACE_STATEs into a GPU buffer. + */ +static void +upload_surface_states(struct u_upload_mgr *mgr, + struct iris_surface_state *surf_state) +{ + const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length); + const unsigned bytes = surf_state->num_states * surf_size; + void *map = - upload_state(mgr, ref, util_bitcount(aux_usages) * surf_size, - SURFACE_STATE_ALIGNMENT); + upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT); - ref->offset += iris_bo_offset_from_base_address(iris_resource_bo(ref->res)); + surf_state->ref.offset += + iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res)); - return map; + if (map) + memcpy(map, surf_state->cpu, bytes); +} + +/** + * Update resource addresses in a set of SURFACE_STATE descriptors, + * and re-upload them if necessary. + */ +static bool +update_surface_state_addrs(struct u_upload_mgr *mgr, + struct iris_surface_state *surf_state, + struct iris_bo *bo) +{ + if (surf_state->bo_address == bo->gtt_offset) + return false; + + STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0); + STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64); + + uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32]; + + /* First, update the CPU copies. We assume no other fields exist in + * the QWord containing Surface Base Address. + */ + for (unsigned i = 0; i < surf_state->num_states; i++) { + *ss_addr = *ss_addr - surf_state->bo_address + bo->gtt_offset; + ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT; + } + + /* Next, upload the updated copies to a GPU buffer. */ + upload_surface_states(mgr, surf_state); + + surf_state->bo_address = bo->gtt_offset; + + return true; } #if GEN_GEN == 8 @@ -1750,11 +2222,11 @@ struct iris_resource *res, enum pipe_texture_target target, struct isl_view *view, + uint32_t *offset_to_tile, uint32_t *tile_x_sa, uint32_t *tile_y_sa, struct isl_surf *surf) { - *surf = res->surf; const enum isl_dim_layout dim_layout = @@ -1777,9 +2249,9 @@ assert(view->levels == 1 && view->array_len == 1); assert(*tile_x_sa == 0 && *tile_y_sa == 0); - res->offset += iris_resource_get_tile_offsets(res, view->base_level, - view->base_array_layer, - tile_x_sa, tile_y_sa); + *offset_to_tile = iris_resource_get_tile_offsets(res, view->base_level, + view->base_array_layer, + tile_x_sa, tile_y_sa); const unsigned l = view->base_level; surf->logical_level0_px.width = minify(surf->logical_level0_px.width, l); @@ -1804,14 +2276,15 @@ struct isl_surf *surf, struct isl_view *view, unsigned aux_usage, + uint32_t extra_main_offset, uint32_t tile_x_sa, uint32_t tile_y_sa) { struct isl_surf_fill_state_info f = { .surf = surf, .view = view, - .mocs = mocs(res->bo), - .address = res->bo->gtt_offset + res->offset, + .mocs = iris_mocs(res->bo, isl_dev), + .address = res->bo->gtt_offset + res->offset + extra_main_offset, .x_offset_sa = tile_x_sa, .y_offset_sa = tile_y_sa, }; @@ -1871,11 +2344,9 @@ isv->res = (struct iris_resource *) tex; - void *map = alloc_surface_states(ice->state.surface_uploader, - &isv->surface_state, - isv->res->aux.sampler_usages); - if (!unlikely(map)) - return NULL; + alloc_surface_states(&isv->surface_state, isv->res->aux.sampler_usages); + + isv->surface_state.bo_address = isv->res->bo->gtt_offset; isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT; @@ -1899,6 +2370,8 @@ .usage = usage, }; + void *map = isv->surface_state.cpu; + /* Fill out SURFACE_STATE for this view. */ if (tmpl->target != PIPE_BUFFER) { isv->view.base_level = tmpl->u.tex.first_level; @@ -1919,7 +2392,7 @@ * surface state with HiZ. */ fill_surface_state(&screen->isl_dev, map, isv->res, &isv->res->surf, - &isv->view, aux_usage, 0, 0); + &isv->view, aux_usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } @@ -1929,6 +2402,8 @@ tmpl->u.buf.offset, tmpl->u.buf.size); } + upload_surface_states(ice->state.surface_uploader, &isv->surface_state); + return &isv->base; } @@ -1938,7 +2413,8 @@ { struct iris_sampler_view *isv = (void *) state; pipe_resource_reference(&state->texture, NULL); - pipe_resource_reference(&isv->surface_state.res, NULL); + pipe_resource_reference(&isv->surface_state.ref.res, NULL); + free(isv->surface_state.cpu); free(isv); } @@ -2034,56 +2510,49 @@ return psurf; - void *map = alloc_surface_states(ice->state.surface_uploader, - &surf->surface_state, - res->aux.possible_usages); - if (!unlikely(map)) { - pipe_resource_reference(&surf->surface_state.res, NULL); - return NULL; - } + alloc_surface_states(&surf->surface_state, res->aux.possible_usages); + surf->surface_state.bo_address = res->bo->gtt_offset; #if GEN_GEN == 8 - void *map_read = alloc_surface_states(ice->state.surface_uploader, - &surf->surface_state_read, - res->aux.possible_usages); - if (!unlikely(map_read)) { - pipe_resource_reference(&surf->surface_state_read.res, NULL); - return NULL; - } + alloc_surface_states(&surf->surface_state_read, res->aux.possible_usages); + surf->surface_state_read.bo_address = res->bo->gtt_offset; #endif if (!isl_format_is_compressed(res->surf.format)) { if (iris_resource_unfinished_aux_import(res)) iris_resource_finish_aux_import(&screen->base, res); + void *map = surf->surface_state.cpu; + UNUSED void *map_read = surf->surface_state_read.cpu; + /* This is a normal surface. Fill out a SURFACE_STATE for each possible * auxiliary surface mode and return the pipe_surface. */ unsigned aux_modes = res->aux.possible_usages; while (aux_modes) { -#if GEN_GEN == 8 - uint32_t offset = res->offset; -#endif enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); fill_surface_state(&screen->isl_dev, map, res, &res->surf, - view, aux_usage, 0, 0); + view, aux_usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; #if GEN_GEN == 8 struct isl_surf surf; - uint32_t tile_x_sa = 0, tile_y_sa = 0; + uint32_t offset_to_tile = 0, tile_x_sa = 0, tile_y_sa = 0; get_rt_read_isl_surf(devinfo, res, target, read_view, - &tile_x_sa, &tile_y_sa, &surf); + &offset_to_tile, &tile_x_sa, &tile_y_sa, &surf); fill_surface_state(&screen->isl_dev, map_read, res, &surf, read_view, - aux_usage, tile_x_sa, tile_y_sa); - /* Restore offset because we change offset in case of handling - * non_coherent fb fetch - */ - res->offset = offset; + aux_usage, offset_to_tile, tile_x_sa, tile_y_sa); map_read += SURFACE_STATE_ALIGNMENT; #endif } + upload_surface_states(ice->state.surface_uploader, &surf->surface_state); + +#if GEN_GEN == 8 + upload_surface_states(ice->state.surface_uploader, + &surf->surface_state_read); +#endif + return psurf; } @@ -2156,13 +2625,16 @@ struct isl_surf_fill_state_info f = { .surf = &isl_surf, .view = view, - .mocs = mocs(res->bo), + .mocs = iris_mocs(res->bo, &screen->isl_dev), .address = res->bo->gtt_offset + offset_B, .x_offset_sa = tile_x_sa, .y_offset_sa = tile_y_sa, }; - isl_surf_fill_state_s(&screen->isl_dev, map, &f); + isl_surf_fill_state_s(&screen->isl_dev, surf->surface_state.cpu, &f); + + upload_surface_states(ice->state.surface_uploader, &surf->surface_state); + return psurf; } @@ -2224,17 +2696,12 @@ const struct pipe_image_view *img = &p_images[i]; struct iris_resource *res = (void *) img->resource; - void *map = - alloc_surface_states(ice->state.surface_uploader, - &iv->surface_state, 1 << ISL_AUX_USAGE_NONE); - if (!unlikely(map)) - return; - util_copy_image_view(&iv->base, img); shs->bound_image_views |= 1 << (start_slot + i); res->bind_history |= PIPE_BIND_SHADER_IMAGE; + res->bind_stages |= 1 << stage; isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT; enum isl_format isl_fmt = @@ -2256,6 +2723,11 @@ isl_fmt = isl_lower_storage_image_format(devinfo, isl_fmt); } + alloc_surface_states(&iv->surface_state, 1 << ISL_AUX_USAGE_NONE); + iv->surface_state.bo_address = res->bo->gtt_offset; + + void *map = iv->surface_state.cpu; + if (res->base.target != PIPE_BUFFER) { struct isl_view view = { .format = isl_fmt, @@ -2278,7 +2750,7 @@ enum isl_aux_usage usage = u_bit_scan(&aux_modes); fill_surface_state(&screen->isl_dev, map, res, &res->surf, - &view, usage, 0, 0); + &view, usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } @@ -2288,7 +2760,7 @@ &image_params[start_slot + i], &res->surf, &view); } else { - util_range_add(&res->valid_buffer_range, img->u.buf.offset, + util_range_add(&res->base, &res->valid_buffer_range, img->u.buf.offset, img->u.buf.offset + img->u.buf.size); fill_buffer_surface_state(&screen->isl_dev, res, map, @@ -2297,9 +2769,11 @@ fill_buffer_image_param(&image_params[start_slot + i], img->format, img->u.buf.size); } + + upload_surface_states(ice->state.surface_uploader, &iv->surface_state); } else { pipe_resource_reference(&iv->base.resource, NULL); - pipe_resource_reference(&iv->surface_state.res, NULL); + pipe_resource_reference(&iv->surface_state.ref.res, NULL); fill_default_image_param(&image_params[start_slot + i]); } } @@ -2339,7 +2813,12 @@ struct iris_sampler_view *view = (void *) pview; if (view) { view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW; + view->res->bind_stages |= 1 << stage; + shs->bound_sampler_views |= 1 << (start + i); + + update_surface_state_addrs(ice->state.surface_uploader, + &view->surface_state, view->res->bo); } } @@ -2372,8 +2851,9 @@ { struct iris_surface *surf = (void *) p_surf; pipe_resource_reference(&p_surf->texture, NULL); - pipe_resource_reference(&surf->surface_state.res, NULL); - pipe_resource_reference(&surf->surface_state_read.res, NULL); + pipe_resource_reference(&surf->surface_state.ref.res, NULL); + pipe_resource_reference(&surf->surface_state_read.ref.res, NULL); + free(surf->surface_state.cpu); free(surf); } @@ -2579,12 +3059,12 @@ info.depth_surf = &zres->surf; info.depth_address = zres->bo->gtt_offset + zres->offset; - info.mocs = mocs(zres->bo); + info.mocs = iris_mocs(zres->bo, isl_dev); view.format = zres->surf.format; if (iris_resource_level_has_hiz(zres, view.base_level)) { - info.hiz_usage = ISL_AUX_USAGE_HIZ; + info.hiz_usage = zres->aux.usage; info.hiz_surf = &zres->aux.surf; info.hiz_address = zres->aux.bo->gtt_offset + zres->aux.offset; } @@ -2592,11 +3072,12 @@ if (stencil_res) { view.usage |= ISL_SURF_USAGE_STENCIL_BIT; + info.stencil_aux_usage = stencil_res->aux.usage; info.stencil_surf = &stencil_res->surf; info.stencil_address = stencil_res->bo->gtt_offset + stencil_res->offset; if (!zres) { view.format = stencil_res->surf.format; - info.mocs = mocs(stencil_res->bo); + info.mocs = iris_mocs(stencil_res->bo, isl_dev); } } } @@ -2617,28 +3098,14 @@ /* Render target change */ ice->state.dirty |= IRIS_DIRTY_BINDINGS_FS; + ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER; + ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER]; -#if GEN_GEN == 11 - // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?) - // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6 - - /* The PIPE_CONTROL command description says: - * - * "Whenever a Binding Table Index (BTI) used by a Render Target Message - * points to a different RENDER_SURFACE_STATE, SW must issue a Render - * Target Cache Flush by enabling this bit. When render target flush - * is set due to new association of BTI, PS Scoreboard Stall bit must - * be set in this packet." - */ - // XXX: does this need to happen at 3DSTATE_BTP_PS time? - iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER], - "workaround: RT BTI change [draw]", - PIPE_CONTROL_RENDER_TARGET_FLUSH | - PIPE_CONTROL_STALL_AT_SCOREBOARD); -#endif + if (GEN_GEN == 8) + ice->state.dirty |= IRIS_DIRTY_PMA_FIX; } /** @@ -2657,6 +3124,9 @@ struct iris_shader_state *shs = &ice->state.shaders[stage]; struct pipe_shader_buffer *cbuf = &shs->constbuf[index]; + /* TODO: Only do this if the buffer changes? */ + pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL); + if (input && input->buffer_size && (input->buffer || input->user_buffer)) { shs->bound_cbufs |= 1u << index; @@ -2686,21 +3156,13 @@ struct iris_resource *res = (void *) cbuf->buffer; res->bind_history |= PIPE_BIND_CONSTANT_BUFFER; - - iris_upload_ubo_ssbo_surf_state(ice, cbuf, - &shs->constbuf_surf_state[index], - false); + res->bind_stages |= 1 << stage; } else { shs->bound_cbufs &= ~(1u << index); pipe_resource_reference(&cbuf->buffer, NULL); - pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL); } ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << stage; - // XXX: maybe not necessary all the time...? - // XXX: we need 3DS_BTP to commit these changes, and if we fell back to - // XXX: pull model we may need actual new bindings... - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << stage; } static void @@ -2818,8 +3280,9 @@ iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, true); res->bind_history |= PIPE_BIND_SHADER_BUFFER; + res->bind_stages |= 1 << stage; - util_range_add(&res->valid_buffer_range, ssbo->buffer_offset, + util_range_add(&res->base, &res->valid_buffer_range, ssbo->buffer_offset, ssbo->buffer_offset + ssbo->buffer_size); } else { pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL); @@ -2848,6 +3311,7 @@ const struct pipe_vertex_buffer *buffers) { struct iris_context *ice = (struct iris_context *) ctx; + struct iris_screen *screen = (struct iris_screen *)ctx->screen; struct iris_genx_state *genx = ice->state.genx; ice->state.bound_vertex_buffers &= ~u_bit_consecutive64(start_slot, count); @@ -2880,10 +3344,10 @@ vb.AddressModifyEnable = true; vb.BufferPitch = buffer->stride; if (res) { - vb.BufferSize = res->bo->size - (int) buffer->buffer_offset; + vb.BufferSize = res->base.width0 - (int) buffer->buffer_offset; vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + (int) buffer->buffer_offset); - vb.MOCS = mocs(res->bo); + vb.MOCS = iris_mocs(res->bo, &screen->isl_dev); } else { vb.NullVertexBuffer = true; } @@ -3063,7 +3527,7 @@ cso->base.buffer_size = buffer_size; cso->base.context = ctx; - util_range_add(&res->valid_buffer_range, buffer_offset, + util_range_add(&res->base, &res->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); upload_state(ctx->stream_uploader, &cso->offset, sizeof(uint32_t), 4); @@ -3099,6 +3563,7 @@ struct iris_context *ice = (struct iris_context *) ctx; struct iris_genx_state *genx = ice->state.genx; uint32_t *so_buffers = genx->so_buffers; + struct iris_screen *screen = (struct iris_screen *)ctx->screen; const bool active = num_targets > 0; if (ice->state.streamout_active != active) { @@ -3145,8 +3610,14 @@ unsigned offset = offsets[i]; if (!tgt) { - iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) + iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) { +#if GEN_GEN < 12 sob.SOBufferIndex = i; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i; +#endif + } continue; } @@ -3167,16 +3638,20 @@ offset = 0; iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) { +#if GEN_GEN < 12 + sob.SOBufferIndex = i; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i; +#endif sob.SurfaceBaseAddress = rw_bo(NULL, res->bo->gtt_offset + tgt->base.buffer_offset); sob.SOBufferEnable = true; sob.StreamOffsetWriteEnable = true; sob.StreamOutputBufferOffsetAddressEnable = true; - sob.MOCS = mocs(res->bo); + sob.MOCS = iris_mocs(res->bo, &screen->isl_dev); sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1; - - sob.SOBufferIndex = i; sob.StreamOffset = offset; sob.StreamOutputBufferOffsetAddress = rw_bo(NULL, iris_resource_bo(tgt->offset.res)->gtt_offset + @@ -3548,14 +4023,14 @@ iris_populate_vs_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_vs_prog_key *key) + struct iris_vs_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_VERTEX) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3563,7 +4038,7 @@ */ static void iris_populate_tcs_key(const struct iris_context *ice, - struct brw_tcs_prog_key *key) + struct iris_tcs_prog_key *key) { } @@ -3574,14 +4049,14 @@ iris_populate_tes_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_tes_prog_key *key) + struct iris_tes_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_TESS_EVAL) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3591,14 +4066,14 @@ iris_populate_gs_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_gs_prog_key *key) + struct iris_gs_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_GEOMETRY) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3607,7 +4082,7 @@ static void iris_populate_fs_key(const struct iris_context *ice, const struct shader_info *info, - struct brw_wm_prog_key *key) + struct iris_fs_prog_key *key) { struct iris_screen *screen = (void *) ice->ctx.screen; const struct pipe_framebuffer_state *fb = &ice->state.framebuffer; @@ -3640,7 +4115,7 @@ static void iris_populate_cs_key(const struct iris_context *ice, - struct brw_cs_prog_key *key) + struct iris_cs_prog_key *key) { } @@ -3651,17 +4126,9 @@ return iris_bo_offset_from_base_address(res->bo) + shader->assembly.offset; } -/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable - * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit - * this WA on C0 stepping. - * - * TODO: Fill out SamplerCount for prefetching? - */ - #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ pkt.KernelStartPointer = KSP(shader); \ - pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 : \ - shader->bt.size_bytes / 4; \ + pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ pkt.FloatingPointMode = prog_data->use_alt_mode; \ \ pkt.DispatchGRFStartRegisterForURBData = \ @@ -3715,6 +4182,18 @@ iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) { INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL); +#if GEN_GEN >= 12 + /* GEN:BUG:1604578095: + * + * Hang occurs when the number of max threads is less than 2 times + * the number of instance count. The number of max threads must be + * more than 2 times the number of instance count. + */ + assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); + hs.DispatchGRFStartRegisterForURBData = prog_data->dispatch_grf_start_reg & 0x1f; + hs.DispatchGRFStartRegisterForURBData5 = prog_data->dispatch_grf_start_reg >> 5; +#endif + hs.InstanceCount = tcs_prog_data->instances - 1; hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; hs.IncludeVertexHandles = true; @@ -3828,9 +4307,7 @@ iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) { ps.VectorMaskEnable = true; - // XXX: WABTPPrefetchDisable, see above, drop at C0 - ps.BindingTableEntryCount = GEN_GEN == 11 ? 0 : - shader->bt.size_bytes / 4; + ps.BindingTableEntryCount = shader->bt.size_bytes / 4; ps.FloatingPointMode = prog_data->use_alt_mode; ps.MaximumNumberofThreadsPerPSD = 64 - (GEN_GEN == 8 ? 2 : 1); @@ -3902,6 +4379,18 @@ desc.BarrierEnable = cs_prog_data->uses_barrier; desc.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs; +#if GEN_GEN >= 12 + /* TODO: Check if we are missing workarounds and enable mid-thread + * preemption. + * + * We still have issues with mid-thread preemption (it was already + * disabled by the kernel on gen11, due to missing workarounds). It's + * possible that we are just missing some workarounds, and could enable + * it later, but for now let's disable it to fix a GPU in compute in Car + * Chase (and possibly more). + */ + desc.ThreadPreemptionDisable = true; +#endif } } @@ -4007,6 +4496,7 @@ util_bitcount(aux_modes & ((1 << aux_usage) - 1)); } +#if GEN_GEN == 9 static void surf_state_update_clear_value(struct iris_batch *batch, struct iris_resource *res, @@ -4047,12 +4537,13 @@ PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_STATE_CACHE_INVALIDATE); } +#endif static void update_clear_value(struct iris_context *ice, struct iris_batch *batch, struct iris_resource *res, - struct iris_state_ref *state, + struct iris_surface_state *surf_state, unsigned all_aux_modes, struct isl_view *view) { @@ -4069,19 +4560,23 @@ while (aux_modes) { enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); - surf_state_update_clear_value(batch, res, state, all_aux_modes, - aux_usage); + surf_state_update_clear_value(batch, res, &surf_state->ref, + all_aux_modes, aux_usage); } #elif GEN_GEN == 8 - pipe_resource_reference(&state->res, NULL); + /* TODO: Could update rather than re-filling */ + alloc_surface_states(surf_state, all_aux_modes); + + void *map = surf_state->cpu; - void *map = alloc_surface_states(ice->state.surface_uploader, - state, all_aux_modes); while (aux_modes) { enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); - fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage, 0, 0); + fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage, + 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } + + upload_surface_states(ice->state.surface_uploader, surf_state); #endif } @@ -4105,9 +4600,9 @@ iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture), writeable); if (GEN_GEN == 8 && is_read_surface) { - iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.ref.res), false); } else { - iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.ref.res), false); } if (res->aux.bo) { @@ -4127,8 +4622,9 @@ } } - offset = (GEN_GEN == 8 && is_read_surface) ? surf->surface_state_read.offset - : surf->surface_state.offset; + offset = (GEN_GEN == 8 && is_read_surface) + ? surf->surface_state_read.ref.offset + : surf->surface_state.ref.offset; return offset + surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage); @@ -4139,12 +4635,11 @@ struct iris_batch *batch, struct iris_sampler_view *isv) { - // XXX: ASTC hacks enum isl_aux_usage aux_usage = - iris_resource_texture_aux_usage(ice, isv->res, isv->view.format, 0); + iris_resource_texture_aux_usage(ice, isv->res, isv->view.format); iris_use_pinned_bo(batch, isv->res->bo, false); - iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.ref.res), false); if (isv->res->aux.bo) { iris_use_pinned_bo(batch, isv->res->aux.bo, false); @@ -4158,7 +4653,7 @@ } } - return isv->surface_state.offset + + return isv->surface_state.ref.offset + surf_state_offset_for_aux(isv->res, isv->res->aux.sampler_usages, aux_usage); } @@ -4170,7 +4665,7 @@ struct iris_state_ref *surf_state, bool writable) { - if (!buf->buffer) + if (!buf->buffer || !surf_state->res) return use_null_surface(batch, ice); iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable); @@ -4192,12 +4687,12 @@ bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE; iris_use_pinned_bo(batch, res->bo, write); - iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.ref.res), false); if (res->aux.bo) iris_use_pinned_bo(batch, res->aux.bo, write); - return iv->surface_state.offset; + return iv->surface_state.ref.offset; } #define push_bt_entry(addr) \ @@ -4267,7 +4762,7 @@ } push_bt_entry(addr); } - } else { + } else if (GEN_GEN < 11) { uint32_t addr = use_null_fb_surface(batch, ice); push_bt_entry(addr); } @@ -4549,49 +5044,265 @@ iris_resource_bo(ice->state.last_res.cs_thread_ids); iris_use_pinned_bo(batch, curbe_bo, false); - struct brw_stage_prog_data *prog_data = shader->prog_data; + struct brw_stage_prog_data *prog_data = shader->prog_data; + + if (prog_data->total_scratch > 0) { + struct iris_bo *bo = + iris_get_scratch_space(ice, prog_data->total_scratch, stage); + iris_use_pinned_bo(batch, bo, true); + } + } + } +} + +/** + * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address. + */ +static void +iris_update_surface_base_address(struct iris_batch *batch, + struct iris_binder *binder) +{ + if (batch->last_surface_base_address == binder->bo->gtt_offset) + return; + + uint32_t mocs = batch->screen->isl_dev.mocs.internal; + + flush_before_state_base_change(batch); + +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode.. + */ + if (batch->name == IRIS_BATCH_COMPUTE) + emit_pipeline_select(batch, _3D); +#endif + + iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { + sba.SurfaceStateBaseAddressModifyEnable = true; + sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0); + + /* The hardware appears to pay attention to the MOCS fields even + * if you don't set the "Address Modify Enable" bit for the base. + */ + sba.GeneralStateMOCS = mocs; + sba.StatelessDataPortAccessMOCS = mocs; + sba.DynamicStateMOCS = mocs; + sba.IndirectObjectMOCS = mocs; + sba.InstructionMOCS = mocs; + sba.SurfaceStateMOCS = mocs; +#if GEN_GEN >= 9 + sba.BindlessSurfaceStateMOCS = mocs; +#endif + } + +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Put the pipeline back into compute mode. + */ + if (batch->name == IRIS_BATCH_COMPUTE) + emit_pipeline_select(batch, GPGPU); +#endif + + flush_after_state_base_change(batch); + + batch->last_surface_base_address = binder->bo->gtt_offset; +} + +static inline void +iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, + bool window_space_position, float *zmin, float *zmax) +{ + if (window_space_position) { + *zmin = 0.f; + *zmax = 1.f; + return; + } + util_viewport_zmin_zmax(vp, halfz, zmin, zmax); +} + +#if GEN_GEN >= 12 +void +genX(invalidate_aux_map_state)(struct iris_batch *batch) +{ + struct iris_screen *screen = batch->screen; + void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr); + if (!aux_map_ctx) + return; + uint32_t aux_map_state_num = gen_aux_map_get_state_num(aux_map_ctx); + if (batch->last_aux_map_state != aux_map_state_num) { + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." + * + * An end of pipe sync is needed here, otherwise we see GPU hangs in + * dEQP-GLES31.functional.copy_image.* tests. + */ + iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table", + PIPE_CONTROL_CS_STALL); + + /* If the aux-map state number increased, then we need to rewrite the + * register. Rewriting the register is used to both set the aux-map + * translation table address, and also to invalidate any previously + * cached translations. + */ + iris_load_register_imm32(batch, GENX(GFX_CCS_AUX_INV_num), 1); + batch->last_aux_map_state = aux_map_state_num; + } +} + +static void +init_aux_map_state(struct iris_batch *batch) +{ + struct iris_screen *screen = batch->screen; + void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr); + if (!aux_map_ctx) + return; + + uint64_t base_addr = gen_aux_map_get_base(aux_map_ctx); + assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr); + iris_load_register_imm64(batch, GENX(GFX_AUX_TABLE_BASE_ADDR_num), + base_addr); +} +#endif + +struct push_bos { + struct { + struct iris_address addr; + uint32_t length; + } buffers[4]; + int buffer_count; + uint32_t max_length; +}; + +static void +setup_constant_buffers(struct iris_context *ice, + struct iris_batch *batch, + int stage, + struct push_bos *push_bos) +{ + struct iris_shader_state *shs = &ice->state.shaders[stage]; + struct iris_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; + + uint32_t push_range_sum = 0; + + int n = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (range->length == 0) + continue; + + push_range_sum += range->length; + + if (range->length > push_bos->max_length) + push_bos->max_length = range->length; + + /* Range block is a binding table index, map back to UBO index. */ + unsigned block_index = iris_bti_to_group_index( + &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block); + assert(block_index != IRIS_SURFACE_NOT_USED); + + struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index]; + struct iris_resource *res = (void *) cbuf->buffer; + + assert(cbuf->buffer_offset % 32 == 0); + + push_bos->buffers[n].length = range->length; + push_bos->buffers[n].addr = + res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset) + : ro_bo(batch->screen->workaround_bo, 0); + n++; + } + + /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes: + * + * "The sum of all four read length fields must be less than or + * equal to the size of 64." + */ + assert(push_range_sum <= 64); + + push_bos->buffer_count = n; +} + +static void +emit_push_constant_packets(struct iris_context *ice, + struct iris_batch *batch, + int stage, + const struct push_bos *push_bos) +{ + struct iris_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; - if (prog_data->total_scratch > 0) { - struct iris_bo *bo = - iris_get_scratch_space(ice, prog_data->total_scratch, stage); - iris_use_pinned_bo(batch, bo, true); + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) { + pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; + if (prog_data) { + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + int n = push_bos->buffer_count; + assert(n <= 4); + const unsigned shift = 4 - n; + for (int i = 0; i < n; i++) { + pkt.ConstantBody.ReadLength[i + shift] = + push_bos->buffers[i].length; + pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr; } } } } -/** - * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address. - */ +#if GEN_GEN >= 12 static void -iris_update_surface_base_address(struct iris_batch *batch, - struct iris_binder *binder) +emit_push_constant_packet_all(struct iris_context *ice, + struct iris_batch *batch, + uint32_t shader_mask, + const struct push_bos *push_bos) { - if (batch->last_surface_base_address == binder->bo->gtt_offset) + if (!push_bos) { + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) { + pc.ShaderUpdateEnable = shader_mask; + } return; - - flush_for_state_base_change(batch); - - iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { - sba.SurfaceStateMOCS = MOCS_WB; - sba.SurfaceStateBaseAddressModifyEnable = true; - sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0); } - batch->last_surface_base_address = binder->bo->gtt_offset; -} - -static inline void -iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, - bool window_space_position, float *zmin, float *zmax) -{ - if (window_space_position) { - *zmin = 0.f; - *zmax = 1.f; - return; + const uint32_t n = push_bos->buffer_count; + const uint32_t max_pointers = 4; + const uint32_t num_dwords = 2 + 2 * n; + uint32_t const_all[2 + 2 * max_pointers]; + uint32_t *dw = &const_all[0]; + + assert(n <= max_pointers); + iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) { + all.DWordLength = num_dwords - 2; + all.ShaderUpdateEnable = shader_mask; + all.PointerBufferMask = (1 << n) - 1; + } + dw += 2; + + for (int i = 0; i < n; i++) { + _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), + dw + i * 2, data) { + data.PointerToConstantBuffer = push_bos->buffers[i].addr; + data.ConstantBufferReadLength = push_bos->buffers[i].length; + } } - util_viewport_zmin_zmax(vp, halfz, zmin, zmax); + iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords); } +#endif static void iris_upload_dirty_render_state(struct iris_context *ice, @@ -4703,9 +5414,22 @@ assert(size[i] != 0); } - genX(emit_urb_setup)(ice, batch, size, - ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL, - ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL); + unsigned entries[4], start[4]; + gen_get_urb_config(&batch->screen->devinfo, + batch->screen->l3_config_3d, + ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL, + ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL, + size, entries, start, + &ice->state.urb_deref_block_size); + + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } } if (dirty & IRIS_DIRTY_BLEND_STATE) { @@ -4771,8 +5495,23 @@ } } + /* GEN:BUG:1604061319 + * + * 3DSTATE_CONSTANT_* needs to be programmed before BTP_* + * + * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if + * any stage has a dirty binding table. + */ + const bool emit_const_wa = GEN_GEN >= 11 && + (dirty & IRIS_ALL_DIRTY_BINDINGS) != 0; + +#if GEN_GEN >= 12 + uint32_t nobuffer_stages = 0; +#endif + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(dirty & (IRIS_DIRTY_CONSTANTS_VS << stage))) + if (!(dirty & (IRIS_DIRTY_CONSTANTS_VS << stage)) && + !emit_const_wa) continue; struct iris_shader_state *shs = &ice->state.shaders[stage]; @@ -4784,52 +5523,42 @@ if (shs->sysvals_need_upload) upload_sysvals(ice, stage); - struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; - - iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) { - pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; - if (prog_data) { - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. - */ - int n = 3; - - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + struct push_bos push_bos = {}; + setup_constant_buffers(ice, batch, stage, &push_bos); - if (range->length == 0) - continue; - - /* Range block is a binding table index, map back to UBO index. */ - unsigned block_index = iris_bti_to_group_index( - &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block); - assert(block_index != IRIS_SURFACE_NOT_USED); - - struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index]; - struct iris_resource *res = (void *) cbuf->buffer; - - assert(cbuf->buffer_offset % 32 == 0); +#if GEN_GEN >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet with all the other stages. + */ + if (push_bos.buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } - pkt.ConstantBody.ReadLength[n] = range->length; - pkt.ConstantBody.Buffer[n] = - res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset) - : ro_bo(batch->screen->workaround_bo, 0); - n--; - } - } + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. + */ + if (push_bos.max_length < 32) { + emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos); + continue; } +#endif + emit_push_constant_packets(ice, batch, stage, &push_bos); } +#if GEN_GEN >= 12 + if (nobuffer_stages) + emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL); +#endif + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) { + /* Gen9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted + * in order to commit constants. TODO: Investigate "Disable Gather + * at Set Shader" to go back to legacy mode... + */ + if (dirty & ((IRIS_DIRTY_BINDINGS_VS | + (GEN_GEN == 9 ? IRIS_DIRTY_CONSTANTS_VS : 0)) << stage)) { iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) { ptr._3DCommandSubOpcode = 38 + stage; ptr.PointertoVSBindingTable = binder->bt_offset[stage]; @@ -4837,6 +5566,24 @@ } } + if (GEN_GEN >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) { + // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?) + // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6 + + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target + * Message points to a different RENDER_SURFACE_STATE, SW must issue a + * Render Target Cache Flush by enabling this bit. When render target + * flush is set due to new association of BTI, PS Scoreboard Stall bit + * must be set in this packet." + */ + // XXX: does this need to happen at 3DSTATE_BTP_PS time? + iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + } + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) { iris_populate_binding_table(ice, batch, stage, false); @@ -5055,13 +5802,17 @@ ARRAY_SIZE(cso_rast->clip)); } - if (dirty & IRIS_DIRTY_RASTER) { + if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) { struct iris_rasterizer_state *cso = ice->state.cso_rast; iris_batch_emit(batch, cso->raster, sizeof(cso->raster)); uint32_t dynamic_sf[GENX(3DSTATE_SF_length)]; iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) { sf.ViewportTransformEnable = !ice->state.window_space_position; + +#if GEN_GEN >= 12 + sf.DerefBlockSize = ice->state.urb_deref_block_size; +#endif } iris_emit_merge(batch, cso->sf, dynamic_sf, ARRAY_SIZE(dynamic_sf)); @@ -5130,6 +5881,10 @@ #else iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds)); #endif + +#if GEN_GEN >= 12 + iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds)); +#endif } if (dirty & IRIS_DIRTY_SCISSOR_RECT) { @@ -5154,6 +5909,18 @@ uint32_t clear_length = GENX(3DSTATE_CLEAR_PARAMS_length) * 4; uint32_t cso_z_size = sizeof(cso_z->packets) - clear_length; iris_batch_emit(batch, cso_z->packets, cso_z_size); + if (GEN_GEN >= 12) { + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + */ + iris_emit_pipe_control_write(batch, "WA for stencil state", + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->screen->workaround_bo, 0, 0); + } union isl_color_value clear_value = { .f32 = { 0, } }; @@ -5205,61 +5972,59 @@ int dynamic_bound = ice->state.bound_vertex_buffers; if (ice->state.vs_uses_draw_params) { - if (ice->draw.draw_params_offset == 0) { - u_upload_data(ice->ctx.stream_uploader, 0, sizeof(ice->draw.params), - 4, &ice->draw.params, &ice->draw.draw_params_offset, - &ice->draw.draw_params_res); - } - assert(ice->draw.draw_params_res); + assert(ice->draw.draw_params.res); struct iris_vertex_buffer_state *state = &(ice->state.genx->vertex_buffers[count]); - pipe_resource_reference(&state->resource, ice->draw.draw_params_res); + pipe_resource_reference(&state->resource, ice->draw.draw_params.res); struct iris_resource *res = (void *) state->resource; iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) { vb.VertexBufferIndex = count; vb.AddressModifyEnable = true; vb.BufferPitch = 0; - vb.BufferSize = res->bo->size - ice->draw.draw_params_offset; + vb.BufferSize = res->bo->size - ice->draw.draw_params.offset; vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + - (int) ice->draw.draw_params_offset); - vb.MOCS = mocs(res->bo); + (int) ice->draw.draw_params.offset); + vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev); } dynamic_bound |= 1ull << count; count++; } if (ice->state.vs_uses_derived_draw_params) { - u_upload_data(ice->ctx.stream_uploader, 0, - sizeof(ice->draw.derived_params), 4, - &ice->draw.derived_params, - &ice->draw.derived_draw_params_offset, - &ice->draw.derived_draw_params_res); - struct iris_vertex_buffer_state *state = &(ice->state.genx->vertex_buffers[count]); pipe_resource_reference(&state->resource, - ice->draw.derived_draw_params_res); - struct iris_resource *res = (void *) ice->draw.derived_draw_params_res; + ice->draw.derived_draw_params.res); + struct iris_resource *res = (void *) ice->draw.derived_draw_params.res; iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) { vb.VertexBufferIndex = count; vb.AddressModifyEnable = true; vb.BufferPitch = 0; vb.BufferSize = - res->bo->size - ice->draw.derived_draw_params_offset; + res->bo->size - ice->draw.derived_draw_params.offset; vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + - (int) ice->draw.derived_draw_params_offset); - vb.MOCS = mocs(res->bo); + (int) ice->draw.derived_draw_params.offset); + vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev); } dynamic_bound |= 1ull << count; count++; } if (count) { +#if GEN_GEN >= 11 + /* Gen11+ doesn't need the cache workaround below */ + uint64_t bound = dynamic_bound; + while (bound) { + const int i = u_bit_scan64(&bound); + iris_use_optional_res(batch, genx->vertex_buffers[i].resource, + false); + } +#else /* The VF cache designers cut corners, and made the cache key's * tuple only consider the bottom * 32 bits of the address. If you have two vertex buffers which get @@ -5295,6 +6060,7 @@ "workaround: VF cache 32-bit key [VB]", flush_flags); } +#endif const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length); @@ -5441,10 +6207,19 @@ } } +#if GEN_GEN == 8 + if (dirty & IRIS_DIRTY_PMA_FIX) { + bool enable = want_pma_fix(ice); + genX(update_pma_fix)(ice, batch, enable); + } +#endif + if (ice->state.current_hash_scale != 1) genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1); - /* TODO: Gen8 PMA fix */ +#if GEN_GEN >= 12 + genX(invalidate_aux_map_state)(batch); +#endif } static void @@ -5490,7 +6265,7 @@ uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)]; iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) { ib.IndexFormat = draw->index_size >> 1; - ib.MOCS = mocs(bo); + ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev); ib.BufferSize = bo->size - offset; ib.BufferStartingAddress = ro_bo(NULL, bo->gtt_offset + offset); } @@ -5501,6 +6276,7 @@ iris_use_pinned_bo(batch, bo, false); } +#if GEN_GEN < 11 /* The VF cache key only uses 32-bits, see vertex buffer comment above */ uint16_t high_bits = bo->gtt_offset >> 32ull; if (high_bits != ice->state.last_index_bo_high_bits) { @@ -5510,6 +6286,7 @@ PIPE_CONTROL_CS_STALL); ice->state.last_index_bo_high_bits = high_bits; } +#endif } #define _3DPRIM_END_OFFSET 0x2420 @@ -5550,15 +6327,14 @@ uint32_t mi_predicate; /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */ - ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, - draw->drawid); + iris_load_register_imm64(batch, MI_PREDICATE_SRC1, draw->drawid); /* Upload the current draw count from the draw parameters buffer * to MI_PREDICATE_SRC0. */ - ice->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0, - draw_count_bo, draw_count_offset); + iris_load_register_mem32(batch, MI_PREDICATE_SRC0, + draw_count_bo, draw_count_offset); /* Zero the top 32-bits of MI_PREDICATE_SRC0 */ - ice->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0); + iris_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0); if (draw->drawid == 0) { mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | @@ -5698,6 +6474,10 @@ if (ice->state.need_border_colors) iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false); +#if GEN_GEN >= 12 + genX(invalidate_aux_map_state)(batch); +#endif + if (dirty & IRIS_DIRTY_CS) { /* The MEDIA_VFE_STATE documentation for Gen8+ says: * @@ -5842,8 +6622,8 @@ { struct iris_genx_state *genx = ice->state.genx; - pipe_resource_reference(&ice->draw.draw_params_res, NULL); - pipe_resource_reference(&ice->draw.derived_draw_params_res, NULL); + pipe_resource_reference(&ice->draw.draw_params.res, NULL); + pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL); /* Loop over all VBOs, including ones for draw parameters */ for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) { @@ -5870,7 +6650,8 @@ } for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) { pipe_resource_reference(&shs->image[i].base.resource, NULL); - pipe_resource_reference(&shs->image[i].surface_state.res, NULL); + pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL); + free(shs->image[i].surface_state.cpu); } for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) { pipe_resource_reference(&shs->ssbo[i].buffer, NULL); @@ -5902,11 +6683,9 @@ static void iris_rebind_buffer(struct iris_context *ice, - struct iris_resource *res, - uint64_t old_address) + struct iris_resource *res) { struct pipe_context *ctx = &ice->ctx; - struct iris_screen *screen = (void *) ctx->screen; struct iris_genx_state *genx = ice->state.genx; assert(res->base.target == PIPE_BUFFER); @@ -5932,9 +6711,10 @@ STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32); STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64); uint64_t *addr = (uint64_t *) &state->state[1]; + struct iris_bo *bo = iris_resource_bo(state->resource); - if (*addr == old_address + state->offset) { - *addr = res->bo->gtt_offset + state->offset; + if (*addr != bo->gtt_offset + state->offset) { + *addr = bo->gtt_offset + state->offset; ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS; } } @@ -5957,6 +6737,9 @@ struct iris_shader_state *shs = &ice->state.shaders[s]; enum pipe_shader_type p_stage = stage_to_pipe(s); + if (!(res->bind_stages & (1 << s))) + continue; + if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { /* Skip constant buffer 0, it's for regular uniforms, not UBOs */ uint32_t bound_cbufs = shs->bound_cbufs & ~1u; @@ -5966,7 +6749,7 @@ struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i]; if (res->bo == iris_resource_bo(cbuf->buffer)) { - iris_upload_ubo_ssbo_surf_state(ice, cbuf, surf_state, false); + pipe_resource_reference(&surf_state->res, NULL); ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << s; } } @@ -5995,16 +6778,10 @@ while (bound_sampler_views) { const int i = u_bit_scan(&bound_sampler_views); struct iris_sampler_view *isv = shs->textures[i]; + struct iris_bo *bo = isv->res->bo; - if (res->bo == iris_resource_bo(isv->base.texture)) { - void *map = alloc_surface_states(ice->state.surface_uploader, - &isv->surface_state, - isv->res->aux.sampler_usages); - assert(map); - fill_buffer_surface_state(&screen->isl_dev, isv->res, map, - isv->view.format, isv->view.swizzle, - isv->base.u.buf.offset, - isv->base.u.buf.size); + if (update_surface_state_addrs(ice->state.surface_uploader, + &isv->surface_state, bo)) { ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << s; } } @@ -6015,9 +6792,11 @@ while (bound_image_views) { const int i = u_bit_scan(&bound_image_views); struct iris_image_view *iv = &shs->image[i]; + struct iris_bo *bo = iris_resource_bo(iv->base.resource); - if (res->bo == iris_resource_bo(iv->base.resource)) { - iris_set_shader_images(ctx, p_stage, i, 1, &iv->base); + if (update_surface_state_addrs(ice->state.surface_uploader, + &iv->surface_state, bo)) { + ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << s; } } } @@ -6026,130 +6805,6 @@ /* ------------------------------------------------------------------- */ -static void -iris_load_register_reg32(struct iris_batch *batch, uint32_t dst, - uint32_t src) -{ - _iris_emit_lrr(batch, dst, src); -} - -static void -iris_load_register_reg64(struct iris_batch *batch, uint32_t dst, - uint32_t src) -{ - _iris_emit_lrr(batch, dst, src); - _iris_emit_lrr(batch, dst + 4, src + 4); -} - -static void -iris_load_register_imm32(struct iris_batch *batch, uint32_t reg, - uint32_t val) -{ - _iris_emit_lri(batch, reg, val); -} - -static void -iris_load_register_imm64(struct iris_batch *batch, uint32_t reg, - uint64_t val) -{ - _iris_emit_lri(batch, reg + 0, val & 0xffffffff); - _iris_emit_lri(batch, reg + 4, val >> 32); -} - -/** - * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer. - */ -static void -iris_load_register_mem32(struct iris_batch *batch, uint32_t reg, - struct iris_bo *bo, uint32_t offset) -{ - iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = reg; - lrm.MemoryAddress = ro_bo(bo, offset); - } -} - -/** - * Load a 64-bit value from a buffer into a MMIO register via - * two MI_LOAD_REGISTER_MEM commands. - */ -static void -iris_load_register_mem64(struct iris_batch *batch, uint32_t reg, - struct iris_bo *bo, uint32_t offset) -{ - iris_load_register_mem32(batch, reg + 0, bo, offset + 0); - iris_load_register_mem32(batch, reg + 4, bo, offset + 4); -} - -static void -iris_store_register_mem32(struct iris_batch *batch, uint32_t reg, - struct iris_bo *bo, uint32_t offset, - bool predicated) -{ - iris_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) { - srm.RegisterAddress = reg; - srm.MemoryAddress = rw_bo(bo, offset); - srm.PredicateEnable = predicated; - } -} - -static void -iris_store_register_mem64(struct iris_batch *batch, uint32_t reg, - struct iris_bo *bo, uint32_t offset, - bool predicated) -{ - iris_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated); - iris_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated); -} - -static void -iris_store_data_imm32(struct iris_batch *batch, - struct iris_bo *bo, uint32_t offset, - uint32_t imm) -{ - iris_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = rw_bo(bo, offset); - sdi.ImmediateData = imm; - } -} - -static void -iris_store_data_imm64(struct iris_batch *batch, - struct iris_bo *bo, uint32_t offset, - uint64_t imm) -{ - /* Can't use iris_emit_cmd because MI_STORE_DATA_IMM has a length of - * 2 in genxml but it's actually variable length and we need 5 DWords. - */ - void *map = iris_get_command_space(batch, 4 * 5); - _iris_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) { - sdi.DWordLength = 5 - 2; - sdi.Address = rw_bo(bo, offset); - sdi.ImmediateData = imm; - } -} - -static void -iris_copy_mem_mem(struct iris_batch *batch, - struct iris_bo *dst_bo, uint32_t dst_offset, - struct iris_bo *src_bo, uint32_t src_offset, - unsigned bytes) -{ - /* MI_COPY_MEM_MEM operates on DWords. */ - assert(bytes % 4 == 0); - assert(dst_offset % 4 == 0); - assert(src_offset % 4 == 0); - - for (unsigned i = 0; i < bytes; i += 4) { - iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) { - cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i); - cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i); - } - } -} - -/* ------------------------------------------------------------------- */ - static unsigned flags_to_post_sync_op(uint32_t flags) { @@ -6234,6 +6889,18 @@ 0, NULL, 0, 0); } + /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which + * invalidates the instruction cache + */ + if (GEN_GEN == 12 && (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE)) { + iris_emit_raw_pipe_control(batch, + "workaround: CS stall before instruction " + "cache invalidate", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD, bo, offset, + imm); + } + if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) { /* Project: SKL / Argument: LRI Post Sync Operation [23] * @@ -6249,17 +6916,6 @@ PIPE_CONTROL_CS_STALL, bo, offset, imm); } - if (GEN_GEN == 10 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) { - /* Cannonlake: - * "Before sending a PIPE_CONTROL command with bit 12 set, SW must issue - * another PIPE_CONTROL with Render Target Cache Flush Enable (bit 12) - * = 0 and Pipe Control Flush Enable (bit 7) = 1" - */ - iris_emit_raw_pipe_control(batch, - "workaround: PC flush before RT flush", - PIPE_CONTROL_FLUSH_ENABLE, bo, offset, imm); - } - /* "Flush Types" workarounds --------------------------------------------- * We do these now because they may add post-sync operations or CS stalls. */ @@ -6278,25 +6934,6 @@ } } - /* #1130 from Gen10 workarounds page: - * - * "Enable Depth Stall on every Post Sync Op if Render target Cache - * Flush is not enabled in same PIPE CONTROL and Enable Pixel score - * board stall if Render target cache flush is enabled." - * - * Applicable to CNL B0 and C0 steppings only. - * - * The wording here is unclear, and this workaround doesn't look anything - * like the internal bug report recommendations, but leave it be for now... - */ - if (GEN_GEN == 10) { - if (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) { - flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; - } else if (flags & non_lri_post_sync_flags) { - flags |= PIPE_CONTROL_DEPTH_STALL; - } - } - if (flags & PIPE_CONTROL_DEPTH_STALL) { /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable): * @@ -6444,6 +7081,23 @@ flags |= PIPE_CONTROL_CS_STALL; } + if (GEN_GEN >= 12 && ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) || + (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))) { + /* From the PIPE_CONTROL instruction table, bit 28 (Tile Cache Flush + * Enable): + * + * Unified Cache (Tile Cache Disabled): + * + * When the Color and Depth (Z) streams are enabled to be cached in + * the DC space of L2, Software must use "Render Target Cache Flush + * Enable" and "Depth Cache Flush Enable" along with "Tile Cache + * Flush" for getting the color and depth (Z) write data to be + * globally observable. In this mode of operation it is not required + * to set "CS Stall" upon setting "Tile Cache Flush" bit. + */ + flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; + } + if (GEN_GEN == 9 && devinfo->gt == 4) { /* TODO: The big Skylake GT4 post sync op workaround */ } @@ -6535,11 +7189,20 @@ flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; } + if (GEN_GEN >= 12 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) { + /* GEN:BUG:1409600907: + * + * "PIPE_CONTROL with Depth Stall Enable bit must be set + * with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + flags |= PIPE_CONTROL_DEPTH_STALL; + } + /* Emit --------------------------------------------------------------- */ if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) { fprintf(stderr, - " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", + " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "", (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "", (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "", @@ -6562,10 +7225,17 @@ (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "", (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "", (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "", + (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "", imm, reason); } iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) { +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH; +#endif +#if GEN_GEN >= 11 + pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC; +#endif pc.LRIPostSyncOperation = NoLRIOperation; pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE; pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH; @@ -6598,34 +7268,6 @@ } } -void -genX(emit_urb_setup)(struct iris_context *ice, - struct iris_batch *batch, - const unsigned size[4], - bool tess_present, bool gs_present) -{ - const struct gen_device_info *devinfo = &batch->screen->devinfo; - const unsigned push_size_kB = 32; - unsigned entries[4]; - unsigned start[4]; - - ice->shaders.last_vs_entry_size = size[MESA_SHADER_VERTEX]; - - gen_get_urb_config(devinfo, 1024 * push_size_kB, - 1024 * ice->shaders.urb_size, - tess_present, gs_present, - size, entries, start); - - for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) { - urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; - } - } -} - #if GEN_GEN == 9 /** * Preemption on Gen9 has to be enabled or disabled in various cases. @@ -6881,7 +7523,6 @@ ice->vtbl.populate_gs_key = iris_populate_gs_key; ice->vtbl.populate_fs_key = iris_populate_fs_key; ice->vtbl.populate_cs_key = iris_populate_cs_key; - ice->vtbl.mocs = mocs; ice->vtbl.lost_genx_state = iris_lost_genx_state; ice->state.dirty = ~0ull; @@ -6892,6 +7533,7 @@ ice->state.num_viewports = 1; ice->state.prim_mode = PIPE_PRIM_MAX; ice->state.genx = calloc(1, sizeof(struct iris_genx_state)); + ice->draw.derived_params.drawid = -1; /* Make a 1x1x1 null surface for unbound textures */ void *null_surf_map = diff -Nru mesa-19.2.8/src/gallium/drivers/iris/Makefile.sources mesa-20.0.8/src/gallium/drivers/iris/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/iris/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -45,6 +45,8 @@ iris_genx_macros.h \ iris_genx_protos.h \ iris_monitor.c \ + iris_performance_query.c \ + iris_perf.c \ iris_pipe.h \ iris_pipe_control.c \ iris_program.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/iris/meson.build mesa-20.0.8/src/gallium/drivers/iris/meson.build --- mesa-19.2.8/src/gallium/drivers/iris/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/iris/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -38,6 +38,9 @@ 'iris_genx_macros.h', 'iris_genx_protos.h', 'iris_monitor.c', + 'iris_perf.h', + 'iris_perf.c', + 'iris_performance_query.c', 'iris_pipe.h', 'iris_pipe_control.c', 'iris_program.c', @@ -62,9 +65,9 @@ ) iris_gen_libs = [] -foreach v : ['80', '90', '100', '110'] +foreach v : ['80', '90', '100', '110', '120'] iris_gen_libs += static_library( - 'libiris_gen@0@'.format(v), + 'iris_gen@0@'.format(v), ['iris_blorp.c', 'iris_query.c', 'iris_state.c', gen_xml_pack], include_directories : [inc_common, inc_intel], c_args : [ @@ -77,17 +80,16 @@ libiris = static_library( 'iris', - [files_libiris, gen_xml_pack, nir_opcodes_h, nir_builder_opcodes_h, - iris_driinfo_h], + [files_libiris, gen_xml_pack, iris_driinfo_h], include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_intel, inc_nir, + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_intel, inc_gallium_drivers, # these should not be necessary, but main/macros.h... inc_mesa, inc_mapi ], c_args : [c_vis_args, c_sse2_args], cpp_args : [cpp_vis_args, c_sse2_args], - dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_libintel_common], + dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_libintel_common, idep_nir_headers], link_with : [ iris_gen_libs, libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf diff -Nru mesa-19.2.8/src/gallium/drivers/kmsro/Android.mk mesa-20.0.8/src/gallium/drivers/kmsro/Android.mk --- mesa-19.2.8/src/gallium/drivers/kmsro/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/kmsro/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -40,7 +40,9 @@ GALLIUM_TARGET_DRIVERS += ili9225 GALLIUM_TARGET_DRIVERS += ili9341 GALLIUM_TARGET_DRIVERS += imx +GALLIUM_TARGET_DRIVERS += ingenic-drm GALLIUM_TARGET_DRIVERS += stm +GALLIUM_TARGET_DRIVERS += mcde GALLIUM_TARGET_DRIVERS += mi0283qt GALLIUM_TARGET_DRIVERS += mxsfb-drm GALLIUM_TARGET_DRIVERS += pl111 diff -Nru mesa-19.2.8/src/gallium/drivers/lima/Android.mk mesa-20.0.8/src/gallium/drivers/lima/Android.mk --- mesa-19.2.8/src/gallium/drivers/lima/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -36,6 +36,7 @@ ir/gp/scheduler.c \ ir/lima_ir.h \ ir/lima_nir_lower_uniform_to_scalar.c \ + ir/lima_nir_split_load_input.c \ ir/pp/codegen.c \ ir/pp/codegen.h \ ir/pp/disasm.c \ @@ -46,6 +47,7 @@ ir/pp/node_to_instr.c \ ir/pp/ppir.h \ ir/pp/regalloc.c \ + ir/pp/liveness.c \ ir/pp/scheduler.c \ lima_bo.c \ lima_bo.h \ @@ -54,6 +56,8 @@ lima_draw.c \ lima_fence.c \ lima_fence.h \ + lima_parser.c \ + lima_parser.h \ lima_program.c \ lima_program.h \ lima_query.c \ @@ -67,7 +71,9 @@ lima_texture.c \ lima_texture.h \ lima_util.c \ - lima_util.h + lima_util.h \ + lima_format.c \ + lima_format.h LOCAL_MODULE := libmesa_pipe_lima diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/codegen.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/codegen.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/codegen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/codegen.c 2020-06-12 01:21:17.000000000 +0000 @@ -45,8 +45,6 @@ gpir_codegen_src_unused, gpir_codegen_src_p1_complex, gpir_codegen_src_unused }, [GPIR_INSTR_SLOT_PASS] = { gpir_codegen_src_unused, gpir_codegen_src_p1_pass, gpir_codegen_src_p2_pass }, - [GPIR_INSTR_SLOT_BRANCH] = { - gpir_codegen_src_unused, gpir_codegen_src_unused, gpir_codegen_src_unused }, [GPIR_INSTR_SLOT_REG0_LOAD0] = { gpir_codegen_src_attrib_x, gpir_codegen_src_p1_attrib_x, gpir_codegen_src_unused }, @@ -418,6 +416,22 @@ return; } + if (node->op == gpir_op_branch_cond) { + gpir_branch_node *branch = gpir_node_to_branch(node); + + code->pass_op = gpir_codegen_pass_op_pass; + code->pass_src = gpir_get_alu_input(node, branch->cond); + + /* Fill out branch information */ + unsigned offset = branch->dest->instr_offset; + assert(offset < 0x200); + code->branch = true; + code->branch_target = offset & 0xff; + code->branch_target_lo = !(offset >> 8); + code->unknown_1 = 13; + return; + } + gpir_alu_node *alu = gpir_node_to_alu(node); code->pass_src = gpir_get_alu_input(node, alu->children[0]); @@ -434,16 +448,7 @@ default: assert(0); } -} - -static void gpir_codegen_branch_slot(gpir_codegen_instr *code, gpir_instr *instr) -{ - gpir_node *node = instr->slots[GPIR_INSTR_SLOT_BRANCH]; - - if (!node) - return; - assert(0); } static void gpir_codegen_reg0_slot(gpir_codegen_instr *code, gpir_instr *instr) @@ -483,7 +488,7 @@ [GPIR_INSTR_SLOT_ADD1] = gpir_codegen_store_src_acc_1, [GPIR_INSTR_SLOT_COMPLEX] = gpir_codegen_store_src_complex, [GPIR_INSTR_SLOT_PASS] = gpir_codegen_store_src_pass, - [GPIR_INSTR_SLOT_BRANCH...GPIR_INSTR_SLOT_STORE3] = gpir_codegen_store_src_none, + [GPIR_INSTR_SLOT_REG0_LOAD0...GPIR_INSTR_SLOT_STORE3] = gpir_codegen_store_src_none, }; gpir_store_node *store = gpir_node_to_store(node); @@ -546,7 +551,6 @@ gpir_codegen_complex_slot(code, instr); gpir_codegen_pass_slot(code, instr); - gpir_codegen_branch_slot(code, instr); gpir_codegen_reg0_slot(code, instr); gpir_codegen_reg1_slot(code, instr); @@ -574,6 +578,7 @@ { int num_instr = 0; list_for_each_entry(gpir_block, block, &comp->block_list, list) { + block->instr_offset = num_instr; num_instr += list_length(&block->instr_list); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/disasm.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/disasm.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/disasm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/disasm.c 2020-06-12 01:21:17.000000000 +0000 @@ -238,7 +238,7 @@ case gpir_codegen_src_p1_attrib_w: printf("%c%d.%c", prev_instr->register0_attribute ? 'a' : '$', prev_instr->register0_addr, - "xyzw"[src - gpir_codegen_src_attrib_x]); + "xyzw"[src - gpir_codegen_src_p1_attrib_x]); break; } } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/gpir.h mesa-20.0.8/src/gallium/drivers/lima/ir/gp/gpir.h --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/gpir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/gpir.h 2020-06-12 01:21:17.000000000 +0000 @@ -211,11 +211,6 @@ typedef struct { int index; struct list_head list; - - struct list_head defs_list; - struct list_head uses_list; - - int start, end; } gpir_reg; typedef struct { @@ -236,7 +231,6 @@ gpir_node *child; gpir_reg *reg; - struct list_head reg_link; } gpir_store_node; enum gpir_instr_slot { @@ -246,7 +240,6 @@ GPIR_INSTR_SLOT_ADD1, GPIR_INSTR_SLOT_PASS, GPIR_INSTR_SLOT_COMPLEX, - GPIR_INSTR_SLOT_BRANCH, GPIR_INSTR_SLOT_REG0_LOAD0, GPIR_INSTR_SLOT_REG0_LOAD1, GPIR_INSTR_SLOT_REG0_LOAD2, @@ -347,6 +340,33 @@ struct list_head instr_list; struct gpir_compiler *comp; + struct gpir_block *successors[2]; + struct list_head predecessors; + struct list_head predecessors_node; + + /* for regalloc */ + + /* The set of live registers, i.e. registers whose value may be used + * eventually, at the beginning of the block. + */ + BITSET_WORD *live_in; + + /* Set of live registers at the end of the block. */ + BITSET_WORD *live_out; + + /* Set of registers that may have a value defined at the end of the + * block. + */ + BITSET_WORD *def_out; + + /* After register allocation, the set of live physical registers at the end + * of the block. Needed for scheduling. + */ + uint64_t live_out_phys; + + /* For codegen, the offset in the final program. */ + unsigned instr_offset; + /* for scheduler */ union { struct { @@ -361,6 +381,7 @@ typedef struct { gpir_node node; gpir_block *dest; + gpir_node *cond; } gpir_branch_node; struct lima_vs_shader_state; @@ -373,8 +394,20 @@ struct list_head block_list; int cur_index; - /* array for searching ssa node */ - gpir_node **var_nodes; + /* Find the gpir node for a given NIR SSA def. */ + gpir_node **node_for_ssa; + + /* Find the gpir node for a given NIR register. */ + gpir_node **node_for_reg; + + /* Find the gpir register for a given NIR SSA def. */ + gpir_reg **reg_for_ssa; + + /* Find the gpir register for a given NIR register. */ + gpir_reg **reg_for_reg; + + /* gpir block for NIR block. */ + gpir_block **blocks; /* for physical reg */ struct list_head reg_list; @@ -421,18 +454,19 @@ static inline bool gpir_node_is_root(gpir_node *node) { - return list_empty(&node->succ_list); + return list_is_empty(&node->succ_list); } static inline bool gpir_node_is_leaf(gpir_node *node) { - return list_empty(&node->pred_list); + return list_is_empty(&node->pred_list); } #define gpir_node_to_alu(node) ((gpir_alu_node *)(node)) #define gpir_node_to_const(node) ((gpir_const_node *)(node)) #define gpir_node_to_load(node) ((gpir_load_node *)(node)) #define gpir_node_to_store(node) ((gpir_store_node *)(node)) +#define gpir_node_to_branch(node) ((gpir_branch_node *)(node)) gpir_instr *gpir_instr_create(gpir_block *block); bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node); @@ -442,7 +476,6 @@ bool gpir_codegen_acc_same_op(gpir_op op1, gpir_op op2); bool gpir_pre_rsched_lower_prog(gpir_compiler *comp); -bool gpir_post_rsched_lower_prog(gpir_compiler *comp); bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp); bool gpir_regalloc_prog(gpir_compiler *comp); bool gpir_schedule_prog(gpir_compiler *comp); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/instr.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/instr.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/instr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/instr.c 2020-06-12 01:21:17.000000000 +0000 @@ -535,7 +535,6 @@ [GPIR_INSTR_SLOT_REG0_LOAD3] = { 15, "load0" }, [GPIR_INSTR_SLOT_REG1_LOAD3] = { 15, "load1" }, [GPIR_INSTR_SLOT_MEM_LOAD3] = { 15, "load2" }, - [GPIR_INSTR_SLOT_BRANCH] = { 4, "bnch" }, [GPIR_INSTR_SLOT_STORE3] = { 15, "store" }, [GPIR_INSTR_SLOT_COMPLEX] = { 4, "cmpl" }, [GPIR_INSTR_SLOT_PASS] = { 4, "pass" }, diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/lower.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/lower.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/lower.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/lower.c 2020-06-12 01:21:17.000000000 +0000 @@ -109,10 +109,7 @@ gpir_load_node *nload = gpir_node_to_load(new); nload->index = load->index; nload->component = load->component; - if (load->reg) { - nload->reg = load->reg; - list_addtail(&nload->reg_link, &load->reg->uses_list); - } + nload->reg = load->reg; gpir_node_replace_pred(dep, new); gpir_node_replace_child(succ, node, new); @@ -413,12 +410,29 @@ return true; } +/* There is no unconditional branch instruction, so we have to lower it to a + * conditional branch with a condition of 1.0. + */ + +static bool gpir_lower_branch_uncond(gpir_block *block, gpir_node *node) +{ + gpir_branch_node *branch = gpir_node_to_branch(node); + + gpir_node *node_const = gpir_node_create(block, gpir_op_const); + gpir_const_node *c = gpir_node_to_const(node_const); + + list_addtail(&c->node.list, &node->list); + c->value.f = 1.0f; + gpir_node_add_dep(&branch->node, &c->node, GPIR_DEP_INPUT); + + branch->node.op = gpir_op_branch_cond; + branch->cond = node_const; + + return true; +} static bool (*gpir_pre_rsched_lower_funcs[gpir_op_num])(gpir_block *, gpir_node *) = { [gpir_op_not] = gpir_lower_not, -}; - -static bool (*gpir_post_rsched_lower_funcs[gpir_op_num])(gpir_block *, gpir_node *) = { [gpir_op_neg] = gpir_lower_neg, [gpir_op_rcp] = gpir_lower_complex, [gpir_op_rsqrt] = gpir_lower_complex, @@ -427,6 +441,7 @@ [gpir_op_eq] = gpir_lower_eq_ne, [gpir_op_ne] = gpir_lower_eq_ne, [gpir_op_abs] = gpir_lower_abs, + [gpir_op_branch_uncond] = gpir_lower_branch_uncond, }; bool gpir_pre_rsched_lower_prog(gpir_compiler *comp) @@ -445,25 +460,11 @@ if (!gpir_lower_load(comp)) return false; - gpir_debug("pre rsched lower prog\n"); - gpir_node_print_prog_seq(comp); - return true; -} - -bool gpir_post_rsched_lower_prog(gpir_compiler *comp) -{ - list_for_each_entry(gpir_block, block, &comp->block_list, list) { - list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { - if (gpir_post_rsched_lower_funcs[node->op] && - !gpir_post_rsched_lower_funcs[node->op](block, node)) - return false; - } - } - if (!gpir_lower_node_may_consume_two_slots(comp)) return false; - gpir_debug("post rsched lower prog\n"); + gpir_debug("pre rsched lower prog\n"); gpir_node_print_prog_seq(comp); return true; } + diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/nir.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/nir.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/nir.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,82 +30,119 @@ #include "gpir.h" #include "lima_context.h" +gpir_reg *gpir_create_reg(gpir_compiler *comp) +{ + gpir_reg *reg = ralloc(comp, gpir_reg); + reg->index = comp->cur_reg++; + list_addtail(®->list, &comp->reg_list); + return reg; +} + +static gpir_reg *reg_for_nir_reg(gpir_compiler *comp, nir_register *nir_reg) +{ + unsigned index = nir_reg->index; + gpir_reg *reg = comp->reg_for_reg[index]; + if (reg) + return reg; + reg = gpir_create_reg(comp); + comp->reg_for_reg[index] = reg; + return reg; +} -static inline void *gpir_node_create_ssa(gpir_block *block, gpir_op op, nir_ssa_def *ssa) +static void register_node_ssa(gpir_block *block, gpir_node *node, nir_ssa_def *ssa) { - int index = ssa->index; - gpir_node *node = gpir_node_create(block, op); + block->comp->node_for_ssa[ssa->index] = node; + snprintf(node->name, sizeof(node->name), "ssa%d", ssa->index); - block->comp->var_nodes[index] = node; - snprintf(node->name, sizeof(node->name), "ssa%d", index); - list_addtail(&node->list, &block->node_list); - return node; + /* If any uses are outside the current block, we'll need to create a + * register and store to it. + */ + bool needs_register = false; + nir_foreach_use(use, ssa) { + if (use->parent_instr->block != ssa->parent_instr->block) { + needs_register = true; + break; + } + } + + if (!needs_register) { + nir_foreach_if_use(use, ssa) { + if (nir_cf_node_prev(&use->parent_if->cf_node) != + &ssa->parent_instr->block->cf_node) { + needs_register = true; + break; + } + } + } + + if (needs_register) { + gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg); + store->child = node; + store->reg = gpir_create_reg(block->comp); + gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + list_addtail(&store->node.list, &block->node_list); + block->comp->reg_for_ssa[ssa->index] = store->reg; + } } -static inline void *gpir_node_create_reg(gpir_block *block, gpir_op op, nir_reg_dest *reg) +static void register_node_reg(gpir_block *block, gpir_node *node, nir_reg_dest *nir_reg) { - int index = reg->reg->index; - gpir_node *node = gpir_node_create(block, op); + block->comp->node_for_reg[nir_reg->reg->index] = node; gpir_store_node *store = gpir_node_create(block, gpir_op_store_reg); - snprintf(node->name, sizeof(node->name), "reg%d", index); + snprintf(node->name, sizeof(node->name), "reg%d", nir_reg->reg->index); store->child = node; + store->reg = reg_for_nir_reg(block->comp, nir_reg->reg); gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); - list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) { - if (reg->index == index) { - store->reg = reg; - list_addtail(&store->reg_link, ®->defs_list); - break; - } - } - - list_addtail(&node->list, &block->node_list); list_addtail(&store->node.list, &block->node_list); - return node; } -static void *gpir_node_create_dest(gpir_block *block, gpir_op op, nir_dest *dest) +/* Register the given gpir_node as providing the given NIR destination, so + * that gpir_node_find() will return it. Also insert any stores necessary if + * the destination will be used after the end of this basic block. The node + * must already be inserted. + */ +static void register_node(gpir_block *block, gpir_node *node, nir_dest *dest) { if (dest->is_ssa) - return gpir_node_create_ssa(block, op, &dest->ssa); + register_node_ssa(block, node, &dest->ssa); else - return gpir_node_create_reg(block, op, &dest->reg); + register_node_reg(block, node, &dest->reg); } -static gpir_node *gpir_node_find(gpir_block *block, gpir_node *succ, nir_src *src, +static gpir_node *gpir_node_find(gpir_block *block, nir_src *src, int channel) { + gpir_reg *reg = NULL; gpir_node *pred = NULL; - if (src->is_ssa) { if (src->ssa->num_components > 1) { for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) { if (block->comp->vector_ssa[i].ssa == src->ssa->index) { - pred = block->comp->vector_ssa[i].nodes[channel]; - break; + return block->comp->vector_ssa[i].nodes[channel]; } } - } else - pred = block->comp->var_nodes[src->ssa->index]; - - assert(pred); - } - else { - pred = gpir_node_create(block, gpir_op_load_reg); - list_addtail(&pred->list, &succ->list); - - gpir_load_node *load = gpir_node_to_load(pred); - list_for_each_entry(gpir_reg, reg, &block->comp->reg_list, list) { - if (reg->index == src->reg.reg->index) { - load->reg = reg; - list_addtail(&load->reg_link, ®->uses_list); - break; - } + } else { + gpir_node *pred = block->comp->node_for_ssa[src->ssa->index]; + if (pred->block == block) + return pred; + reg = block->comp->reg_for_ssa[src->ssa->index]; } + } else { + pred = block->comp->node_for_reg[src->reg.reg->index]; + if (pred && pred->block == block) + return pred; + reg = reg_for_nir_reg(block->comp, src->reg.reg); } + assert(reg); + pred = gpir_node_create(block, gpir_op_load_reg); + gpir_load_node *load = gpir_node_to_load(pred); + load->reg = reg; + list_addtail(&pred->list, &block->node_list); + return pred; } @@ -130,12 +167,25 @@ [nir_op_seq] = gpir_op_eq, [nir_op_sne] = gpir_op_ne, [nir_op_fabs] = gpir_op_abs, - [nir_op_mov] = gpir_op_mov, }; static bool gpir_emit_alu(gpir_block *block, nir_instr *ni) { nir_alu_instr *instr = nir_instr_as_alu(ni); + + /* gpir_op_mov is useless before the final scheduler, and the scheduler + * currently doesn't expect us to emit it. Just register the destination of + * this instruction with its source. This will also emit any necessary + * register loads/stores for things like "r0 = mov ssa_0" or + * "ssa_0 = mov r0". + */ + if (instr->op == nir_op_mov) { + gpir_node *child = gpir_node_find(block, &instr->src[0].src, + instr->src[0].swizzle[0]); + register_node(block, child, &instr->dest.dest); + return true; + } + int op = nir_to_gpir_opcodes[instr->op]; if (op < 0) { @@ -143,7 +193,7 @@ return false; } - gpir_alu_node *node = gpir_node_create_dest(block, op, &instr->dest.dest); + gpir_alu_node *node = gpir_node_create(block, op); if (unlikely(!node)) return false; @@ -155,24 +205,29 @@ nir_alu_src *src = instr->src + i; node->children_negate[i] = src->negate; - gpir_node *child = gpir_node_find(block, &node->node, &src->src, src->swizzle[0]); + gpir_node *child = gpir_node_find(block, &src->src, src->swizzle[0]); node->children[i] = child; gpir_node_add_dep(&node->node, child, GPIR_DEP_INPUT); } + list_addtail(&node->node.list, &block->node_list); + register_node(block, &node->node, &instr->dest.dest); + return true; } static gpir_node *gpir_create_load(gpir_block *block, nir_dest *dest, int op, int index, int component) { - gpir_load_node *load = gpir_node_create_dest(block, op, dest); + gpir_load_node *load = gpir_node_create(block, op); if (unlikely(!load)) return NULL; load->index = index; load->component = component; + list_addtail(&load->node.list, &block->node_list); + register_node(block, &load->node, dest); return &load->node; } @@ -224,14 +279,13 @@ gpir_store_node *store = gpir_node_create(block, gpir_op_store_varying); if (unlikely(!store)) return false; - list_addtail(&store->node.list, &block->node_list); - + gpir_node *child = gpir_node_find(block, instr->src, 0); + store->child = child; store->index = nir_intrinsic_base(instr); store->component = nir_intrinsic_component(instr); - gpir_node *child = gpir_node_find(block, &store->node, instr->src, 0); - store->child = child; gpir_node_add_dep(&store->node, child, GPIR_DEP_INPUT); + list_addtail(&store->node.list, &block->node_list); return true; } @@ -245,8 +299,7 @@ static bool gpir_emit_load_const(gpir_block *block, nir_instr *ni) { nir_load_const_instr *instr = nir_instr_as_load_const(ni); - gpir_const_node *node = - gpir_node_create_ssa(block, gpir_op_const, &instr->def); + gpir_const_node *node = gpir_node_create(block, gpir_op_const); if (unlikely(!node)) return false; @@ -255,6 +308,8 @@ node->value.i = instr->value[0].i32; + list_addtail(&node->node.list, &block->node_list); + register_node_ssa(block, &node->node, &instr->def); return true; } @@ -272,8 +327,8 @@ static bool gpir_emit_jump(gpir_block *block, nir_instr *ni) { - gpir_error("nir_jump_instr not support\n"); - return false; + /* Jumps are emitted at the end of the basic block, so do nothing. */ + return true; } static bool (*gpir_emit_instr[nir_instr_type_phi])(gpir_block *, nir_instr *) = { @@ -285,94 +340,67 @@ [nir_instr_type_jump] = gpir_emit_jump, }; -static gpir_block *gpir_block_create(gpir_compiler *comp) +static bool gpir_emit_function(gpir_compiler *comp, nir_function_impl *impl) { - gpir_block *block = ralloc(comp, gpir_block); - if (!block) - return NULL; - - list_inithead(&block->node_list); - list_inithead(&block->instr_list); - - return block; -} - -static bool gpir_emit_block(gpir_compiler *comp, nir_block *nblock) -{ - gpir_block *block = gpir_block_create(comp); - if (!block) - return false; - - list_addtail(&block->list, &comp->block_list); - block->comp = comp; + nir_index_blocks(impl); + comp->blocks = ralloc_array(comp, gpir_block *, impl->num_blocks); - nir_foreach_instr(instr, nblock) { - assert(instr->type < nir_instr_type_phi); - if (!gpir_emit_instr[instr->type](block, instr)) + nir_foreach_block(block_nir, impl) { + gpir_block *block = ralloc(comp, gpir_block); + if (!block) return false; - } - return true; -} + list_inithead(&block->node_list); + list_inithead(&block->instr_list); -static bool gpir_emit_if(gpir_compiler *comp, nir_if *nif) -{ - gpir_error("if nir_cf_node not support\n"); - return false; -} + list_addtail(&block->list, &comp->block_list); + block->comp = comp; + comp->blocks[block_nir->index] = block; + } -static bool gpir_emit_loop(gpir_compiler *comp, nir_loop *nloop) -{ - gpir_error("loop nir_cf_node not support\n"); - return false; -} + nir_foreach_block(block_nir, impl) { + gpir_block *block = comp->blocks[block_nir->index]; + nir_foreach_instr(instr, block_nir) { + assert(instr->type < nir_instr_type_phi); + if (!gpir_emit_instr[instr->type](block, instr)) + return false; + } -static bool gpir_emit_function(gpir_compiler *comp, nir_function_impl *nfunc) -{ - gpir_error("function nir_cf_node not support\n"); - return false; -} + if (block_nir->successors[0] == impl->end_block) + block->successors[0] = NULL; + else + block->successors[0] = comp->blocks[block_nir->successors[0]->index]; + block->successors[1] = NULL; + + if (block_nir->successors[1] != NULL) { + nir_if *nif = nir_cf_node_as_if(nir_cf_node_next(&block_nir->cf_node)); + gpir_alu_node *cond = gpir_node_create(block, gpir_op_not); + cond->children[0] = gpir_node_find(block, &nif->condition, 0); + + gpir_node_add_dep(&cond->node, cond->children[0], GPIR_DEP_INPUT); + list_addtail(&cond->node.list, &block->node_list); + + gpir_branch_node *branch = gpir_node_create(block, gpir_op_branch_cond); + list_addtail(&branch->node.list, &block->node_list); + + branch->dest = comp->blocks[block_nir->successors[1]->index]; + block->successors[1] = branch->dest; + + branch->cond = &cond->node; + gpir_node_add_dep(&branch->node, &cond->node, GPIR_DEP_INPUT); + + assert(block_nir->successors[0]->index == block_nir->index + 1); + } else if (block_nir->successors[0]->index != block_nir->index + 1) { + gpir_branch_node *branch = gpir_node_create(block, gpir_op_branch_uncond); + list_addtail(&branch->node.list, &block->node_list); -static bool gpir_emit_cf_list(gpir_compiler *comp, struct exec_list *list) -{ - foreach_list_typed(nir_cf_node, node, node, list) { - bool ret; - - switch (node->type) { - case nir_cf_node_block: - ret = gpir_emit_block(comp, nir_cf_node_as_block(node)); - break; - case nir_cf_node_if: - ret = gpir_emit_if(comp, nir_cf_node_as_if(node)); - break; - case nir_cf_node_loop: - ret = gpir_emit_loop(comp, nir_cf_node_as_loop(node)); - break; - case nir_cf_node_function: - ret = gpir_emit_function(comp, nir_cf_node_as_function(node)); - break; - default: - gpir_error("unknown NIR node type %d\n", node->type); - return false; + branch->dest = comp->blocks[block_nir->successors[0]->index]; } - - if (!ret) - return false; } return true; } -gpir_reg *gpir_create_reg(gpir_compiler *comp) -{ - gpir_reg *reg = ralloc(comp, gpir_reg); - reg->index = comp->cur_reg++; - list_addtail(®->list, &comp->reg_list); - list_inithead(®->defs_list); - list_inithead(®->uses_list); - return reg; -} - static gpir_compiler *gpir_compiler_create(void *prog, unsigned num_reg, unsigned num_ssa) { gpir_compiler *comp = rzalloc(prog, gpir_compiler); @@ -380,13 +408,13 @@ list_inithead(&comp->block_list); list_inithead(&comp->reg_list); - for (int i = 0; i < num_reg; i++) - gpir_create_reg(comp); - for (int i = 0; i < GPIR_VECTOR_SSA_NUM; i++) comp->vector_ssa[i].ssa = -1; - comp->var_nodes = rzalloc_array(comp, gpir_node *, num_ssa); + comp->node_for_ssa = rzalloc_array(comp, gpir_node *, num_ssa); + comp->node_for_reg = rzalloc_array(comp, gpir_node *, num_reg); + comp->reg_for_ssa = rzalloc_array(comp, gpir_reg *, num_ssa); + comp->reg_for_reg = rzalloc_array(comp, gpir_reg *, num_reg); comp->prog = prog; return comp; } @@ -429,8 +457,10 @@ comp->constant_base = nir->num_uniforms; prog->uniform_pending_offset = nir->num_uniforms * 16; + prog->gl_pos_idx = 0; + prog->point_size_idx = -1; - if (!gpir_emit_cf_list(comp, &func->body)) + if (!gpir_emit_function(comp, func)) goto err_out0; gpir_node_print_prog_seq(comp); @@ -445,9 +475,6 @@ if (!gpir_reduce_reg_pressure_schedule_prog(comp)) goto err_out0; - if (!gpir_post_rsched_lower_prog(comp)) - goto err_out0; - if (!gpir_regalloc_prog(comp)) goto err_out0; @@ -458,13 +485,24 @@ goto err_out0; nir_foreach_variable(var, &nir->outputs) { - if (var->data.location == VARYING_SLOT_POS) - assert(var->data.driver_location == 0); + bool varying = true; + switch (var->data.location) { + case VARYING_SLOT_POS: + prog->gl_pos_idx = var->data.driver_location; + varying = false; + break; + case VARYING_SLOT_PSIZ: + prog->point_size_idx = var->data.driver_location; + varying = false; + break; + } struct lima_varying_info *v = prog->varying + var->data.driver_location; if (!v->components) { v->component_size = gpir_glsl_type_size(glsl_get_base_type(var->type)); - prog->num_varying++; + prog->num_outputs++; + if (varying) + prog->num_varyings++; } v->components += glsl_get_components(var->type); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/node.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/node.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/node.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/node.c 2020-06-12 01:21:17.000000000 +0000 @@ -246,6 +246,8 @@ [gpir_op_branch_cond] = { .name = "branch_cond", .type = gpir_node_type_branch, + .schedule_first = true, + .slots = (int []) { GPIR_INSTR_SLOT_PASS, GPIR_INSTR_SLOT_END }, }, [gpir_op_const] = { .name = "const", @@ -380,6 +382,10 @@ gpir_store_node *store = gpir_node_to_store(parent); if (store->child == old_child) store->child = new_child; + } else if (parent->type == gpir_node_type_branch) { + gpir_branch_node *branch = gpir_node_to_branch(parent); + if (branch->cond == old_child) + branch->cond = new_child; } } @@ -427,17 +433,6 @@ ralloc_free(dep); } - if (node->type == gpir_node_type_store) { - gpir_store_node *store = gpir_node_to_store(node); - if (store->reg) - list_del(&store->reg_link); - } - else if (node->type == gpir_node_type_load) { - gpir_load_node *load = gpir_node_to_load(node); - if (load->reg) - list_del(&load->reg_link); - } - list_del(&node->list); ralloc_free(node); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/reduce_scheduler.c 2020-06-12 01:21:17.000000000 +0000 @@ -107,7 +107,12 @@ struct list_head *insert_pos = ready_list; list_for_each_entry(gpir_node, node, ready_list, list) { - if (insert_node->rsched.parent_index < node->rsched.parent_index || + if (gpir_op_infos[node->op].schedule_first) { + continue; + } + + if (gpir_op_infos[insert_node->op].schedule_first || + insert_node->rsched.parent_index < node->rsched.parent_index || (insert_node->rsched.parent_index == node->rsched.parent_index && (insert_node->rsched.reg_pressure < node->rsched.reg_pressure || (insert_node->rsched.reg_pressure == node->rsched.reg_pressure && @@ -123,7 +128,7 @@ static void schedule_ready_list(gpir_block *block, struct list_head *ready_list) { - if (list_empty(ready_list)) + if (list_is_empty(ready_list)) return; gpir_node *node = list_first_entry(ready_list, gpir_node, list); @@ -185,21 +190,47 @@ schedule_ready_list(block, &ready_list); } -bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp) +/* Due to how we translate from NIR, we never read a register written in the + * same block (we just pass the node through instead), so we don't have to + * worry about read-after-write dependencies. We do have to worry about + * write-after-read though, so we add those dependencies now. For example in a + * loop like this we need a dependency between the write and the read of i: + * + * i = ... + * while (...) { + * ... = i; + * i = i + 1; + * } + */ + +static void add_false_dependencies(gpir_compiler *comp) { - /* No need to build physical reg load/store dependency here, - * because we just exit SSA form, there should be at most - * one load and one store pair for a physical reg within a - * block, and the store must be after load with the output - * of load as input after some calculation. So we don't need to - * insert extra write-after-read or read-after-write dependecy - * for load/store nodes to maintain the right sequence before - * scheduling. - * - * Also no need to handle SSA def/use in difference block, - * because we'll load/store SSA to a physical reg if def/use - * are not in the same block. + /* Make sure we allocate this only once, in case there are many values and + * many blocks. */ + gpir_node **last_written = calloc(comp->cur_reg, sizeof(gpir_node *)); + + list_for_each_entry(gpir_block, block, &comp->block_list, list) { + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + gpir_node *store = last_written[load->reg->index]; + if (store && store->block == block) { + gpir_node_add_dep(store, node, GPIR_DEP_WRITE_AFTER_READ); + } + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + last_written[store->reg->index] = node; + } + } + } + + free(last_written); +} + +bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp) +{ + add_false_dependencies(comp); list_for_each_entry(gpir_block, block, &comp->block_list, list) { block->rsched.node_index = 0; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/regalloc.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/regalloc.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/regalloc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/regalloc.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,69 +23,432 @@ */ #include "gpir.h" +#include "util/u_dynarray.h" -/* Register allocation - * - * TODO: This needs to be rewritten when we support multiple basic blocks. We - * need to do proper liveness analysis, combined with either linear scan, - * graph coloring, or SSA-based allocation. We should also support spilling to - * temporaries. - * - * For now, this only assigns fake registers to values, used to build the fake - * dependencies that the scheduler relies on. In the future we should also be - * assigning actual physreg numbers to load_reg/store_reg nodes. - */ +/* Per-register information */ + +struct reg_info { + BITSET_WORD *conflicts; + struct util_dynarray conflict_list; + + /* Number of conflicts that must be allocated to physical registers. + */ + unsigned phys_conflicts; + + unsigned node_conflicts; + + /* Number of conflicts that can be allocated to either. */ + unsigned total_conflicts; + + int assigned_color; + + bool visited; +}; + +struct regalloc_ctx { + unsigned bitset_words, num_nodes_and_regs; + struct reg_info *registers; + + /* Reusable scratch liveness array */ + BITSET_WORD *live; -static void regalloc_block(gpir_block *block) + unsigned *worklist; + unsigned worklist_start, worklist_end; + + unsigned *stack; + unsigned stack_size; + + gpir_compiler *comp; + void *mem_ctx; +}; + +/* Liveness analysis */ + +static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live, + gpir_compiler *comp) { - /* build each node sequence index in the block node list */ - int index = 0; - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->vreg.index = index++; + /* KILL */ + if (node->type == gpir_node_type_store) { + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_CLEAR(live, store->reg->index); + } } - /* find the last successor of each node by the sequence index */ - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->vreg.last = NULL; - gpir_node_foreach_succ(node, dep) { - gpir_node *succ = dep->succ; - if (!node->vreg.last || node->vreg.last->vreg.index < succ->vreg.index) - node->vreg.last = succ; + /* GEN */ + if (node->type == gpir_node_type_load) { + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + BITSET_SET(live, load->reg->index); } } +} + +static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx) +{ + for (unsigned i = 0; i < 2; i++) { + if (block->successors[i]) { + for (unsigned j = 0; j < ctx->bitset_words; j++) + block->live_out[j] |= block->successors[i]->live_in[j]; + } + } + + memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD)); + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + propagate_liveness_instr(node, ctx->live, block->comp); + } - /* do linear scan regalloc */ - int reg_search_start = 0; - gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; + bool changed = false; + for (unsigned i = 0; i < ctx->bitset_words; i++) { + changed |= (block->live_in[i] != ctx->live[i]); + block->live_in[i] = ctx->live[i]; + } + return changed; +} + +static void calc_def_block(gpir_block *block) +{ list_for_each_entry(gpir_node, node, &block->node_list, list) { - /* if some reg is expired */ - gpir_node_foreach_pred(node, dep) { - gpir_node *pred = dep->pred; - if (pred->vreg.last == node) - active[pred->value_reg] = NULL; - } - - /* no need to alloc value reg for root node */ - if (gpir_node_is_root(node)) { - node->value_reg = -1; - continue; - } - - /* find a free reg for this node */ - int i; - for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) { - /* round robin reg select to reduce false dep when schedule */ - int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); - if (!active[reg]) { - active[reg] = node; - node->value_reg = reg; - reg_search_start++; + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_SET(block->def_out, store->reg->index); + } + } +} + +static void calc_liveness(struct regalloc_ctx *ctx) +{ + bool changed = true; + while (changed) { + changed = false; + list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) { + changed |= propagate_liveness_block(block, ctx); + } + } + + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + calc_def_block(block); + } + + changed = true; + while (changed) { + changed = false; + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + for (unsigned i = 0; i < 2; i++) { + gpir_block *succ = block->successors[i]; + if (!succ) + continue; + + for (unsigned j = 0; j < ctx->bitset_words; j++) { + BITSET_WORD new = block->def_out[j] & ~succ->def_out[j]; + changed |= (new != 0); + succ->def_out[j] |= block->def_out[j]; + } + } + } + } +} + +/* Interference calculation */ + +static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j) +{ + if (i == j) + return; + + struct reg_info *a = &ctx->registers[i]; + struct reg_info *b = &ctx->registers[j]; + + if (BITSET_TEST(a->conflicts, j)) + return; + + BITSET_SET(a->conflicts, j); + BITSET_SET(b->conflicts, i); + + a->total_conflicts++; + b->total_conflicts++; + if (j < ctx->comp->cur_reg) + a->phys_conflicts++; + else + a->node_conflicts++; + + if (i < ctx->comp->cur_reg) + b->phys_conflicts++; + else + b->node_conflicts++; + + util_dynarray_append(&a->conflict_list, unsigned, j); + util_dynarray_append(&b->conflict_list, unsigned, i); +} + +/* Make the register or node "i" intefere with all the other live registers + * and nodes. + */ +static void add_all_interferences(struct regalloc_ctx *ctx, + unsigned i, + BITSET_WORD *live_nodes, + BITSET_WORD *live_regs) +{ + int live_node; + BITSET_FOREACH_SET(live_node, live_nodes, ctx->comp->cur_index) { + add_interference(ctx, i, + live_node + ctx->comp->cur_reg); + } + + int live_reg; + BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_index) { + add_interference(ctx, i, live_reg); + } + +} + +static void print_liveness(struct regalloc_ctx *ctx, + BITSET_WORD *live_reg, BITSET_WORD *live_val) +{ + if (!(lima_debug & LIMA_DEBUG_GP)) + return; + + int live_idx; + BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) { + printf("reg%d ", live_idx); + } + BITSET_FOREACH_SET(live_idx, live_val, ctx->comp->cur_index) { + printf("%d ", live_idx); + } + printf("\n"); +} + +static void calc_interference(struct regalloc_ctx *ctx) +{ + BITSET_WORD *live_nodes = + rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index); + + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + /* Initialize liveness at the end of the block, but exclude values that + * definitely aren't defined by the end. This helps out with + * partially-defined registers, like: + * + * if (condition) { + * foo = ...; + * } + * if (condition) { + * ... = foo; + * } + * + * If we naively propagated liveness backwards, foo would be live from + * the beginning of the program, but if we're not inside a loop then + * its value is undefined before the first if and we don't have to + * consider it live. Mask out registers like foo here. + */ + for (unsigned i = 0; i < ctx->bitset_words; i++) { + ctx->live[i] = block->live_out[i] & block->def_out[i]; + } + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + gpir_debug("processing node %d\n", node->index); + print_liveness(ctx, ctx->live, live_nodes); + if (node->type != gpir_node_type_store && + node->type != gpir_node_type_branch) { + add_all_interferences(ctx, node->index + ctx->comp->cur_reg, + live_nodes, ctx->live); + + /* KILL */ + BITSET_CLEAR(live_nodes, node->index); + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + add_all_interferences(ctx, store->reg->index, + live_nodes, ctx->live); + + /* KILL */ + BITSET_CLEAR(ctx->live, store->reg->index); + } + + /* GEN */ + if (node->type == gpir_node_type_store) { + gpir_store_node *store = gpir_node_to_store(node); + BITSET_SET(live_nodes, store->child->index); + } else if (node->type == gpir_node_type_alu) { + gpir_alu_node *alu = gpir_node_to_alu(node); + for (int i = 0; i < alu->num_child; i++) + BITSET_SET(live_nodes, alu->children[i]->index); + } else if (node->type == gpir_node_type_branch) { + gpir_branch_node *branch = gpir_node_to_branch(node); + BITSET_SET(live_nodes, branch->cond->index); + } else if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + BITSET_SET(ctx->live, load->reg->index); + } + } + } +} + +/* Register allocation */ + +static bool can_simplify(struct regalloc_ctx *ctx, unsigned i) +{ + struct reg_info *info = &ctx->registers[i]; + if (i < ctx->comp->cur_reg) { + /* Physical regs. */ + return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM; + } else { + /* Nodes: if we manage to allocate all of its conflicting physical + * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so + * we can ignore any more than that. + */ + return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) + + info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; + } +} + +static void push_stack(struct regalloc_ctx *ctx, unsigned i) +{ + ctx->stack[ctx->stack_size++] = i; + if (i < ctx->comp->cur_reg) + gpir_debug("pushing reg%u\n", i); + else + gpir_debug("pushing %d\n", i - ctx->comp->cur_reg); + + struct reg_info *info = &ctx->registers[i]; + assert(info->visited); + + util_dynarray_foreach(&info->conflict_list, unsigned, conflict) { + struct reg_info *conflict_info = &ctx->registers[*conflict]; + if (i < ctx->comp->cur_reg) { + assert(conflict_info->phys_conflicts > 0); + conflict_info->phys_conflicts--; + } else { + assert(conflict_info->node_conflicts > 0); + conflict_info->node_conflicts--; + } + if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) { + ctx->worklist[ctx->worklist_end++] = *conflict; + ctx->registers[*conflict].visited = true; + } + } +} + +static bool do_regalloc(struct regalloc_ctx *ctx) +{ + ctx->worklist_start = 0; + ctx->worklist_end = 0; + ctx->stack_size = 0; + + /* Step 1: find the initially simplifiable registers */ + for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) { + if (can_simplify(ctx, i)) { + ctx->worklist[ctx->worklist_end++] = i; + ctx->registers[i].visited = true; + } + } + + while (true) { + /* Step 2: push onto the stack whatever we can */ + while (ctx->worklist_start != ctx->worklist_end) { + push_stack(ctx, ctx->worklist[ctx->worklist_start++]); + } + + if (ctx->stack_size < ctx->num_nodes_and_regs) { + /* If there are still unsimplifiable nodes left, we need to + * optimistically push a node onto the stack. Choose the one with + * the smallest number of current neighbors, since that's the most + * likely to succeed. + */ + unsigned min_conflicts = UINT_MAX; + unsigned best_reg = 0; + for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) { + struct reg_info *info = &ctx->registers[reg]; + if (info->visited) + continue; + if (info->phys_conflicts + info->node_conflicts < min_conflicts) { + best_reg = reg; + min_conflicts = info->phys_conflicts + info->node_conflicts; + } + } + gpir_debug("optimistic triggered\n"); + ctx->registers[best_reg].visited = true; + push_stack(ctx, best_reg); + } else { + break; + } + } + + /* Step 4: pop off the stack and assign colors */ + for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) { + unsigned idx = ctx->stack[i]; + struct reg_info *reg = &ctx->registers[idx]; + + unsigned num_available_regs; + if (idx < ctx->comp->cur_reg) { + num_available_regs = GPIR_PHYSICAL_REG_NUM; + } else { + num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; + } + + bool found = false; + unsigned start = i % num_available_regs; + for (unsigned j = 0; j < num_available_regs; j++) { + unsigned candidate = (j + start) % num_available_regs; + bool available = true; + util_dynarray_foreach(®->conflict_list, unsigned, conflict_idx) { + struct reg_info *conflict = &ctx->registers[*conflict_idx]; + if (conflict->assigned_color >= 0 && + conflict->assigned_color == (int) candidate) { + available = false; + break; + } + } + + if (available) { + reg->assigned_color = candidate; + found = true; break; } } - /* TODO: spill */ - assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); + /* TODO: spilling */ + if (!found) { + gpir_error("Failed to allocate registers\n"); + return false; + } + } + + return true; +} + +static void assign_regs(struct regalloc_ctx *ctx) +{ + list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { + list_for_each_entry(gpir_node, node, &block->node_list, list) { + if (node->index >= 0) { + node->value_reg = + ctx->registers[ctx->comp->cur_reg + node->index].assigned_color; + } + + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + unsigned color = ctx->registers[load->reg->index].assigned_color; + load->index = color / 4; + load->component = color % 4; + } + + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + unsigned color = ctx->registers[store->reg->index].assigned_color; + store->index = color / 4; + store->component = color % 4; + node->value_reg = color; + } + } + + block->live_out_phys = 0; + + int reg_idx; + BITSET_FOREACH_SET(reg_idx, block->live_out, ctx->comp->cur_reg) { + if (BITSET_TEST(block->def_out, reg_idx)) { + block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color); + } + } } } @@ -104,6 +467,14 @@ gpir_node *pred = dep->pred; printf(" %d/%d", pred->index, pred->value_reg); } + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + printf(" -/%d", 4 * load->index + load->component); + printf(" (%d)", load->reg->index); + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + printf(" (%d)", store->reg->index); + } printf("\n"); } printf("----------------------------\n"); @@ -112,10 +483,38 @@ bool gpir_regalloc_prog(gpir_compiler *comp) { + struct regalloc_ctx ctx; + + ctx.mem_ctx = ralloc_context(NULL); + ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index; + ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs); + ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.comp = comp; + + ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs); + for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) { + ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD, + ctx.bitset_words); + util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx); + } + list_for_each_entry(gpir_block, block, &comp->block_list, list) { - regalloc_block(block); + block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); + } + + calc_liveness(&ctx); + calc_interference(&ctx); + if (!do_regalloc(&ctx)) { + ralloc_free(ctx.mem_ctx); + return false; } + assign_regs(&ctx); regalloc_print_result(comp); + ralloc_free(ctx.mem_ctx); return true; } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/gp/scheduler.c mesa-20.0.8/src/gallium/drivers/lima/ir/gp/scheduler.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/gp/scheduler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/gp/scheduler.c 2020-06-12 01:21:17.000000000 +0000 @@ -215,6 +215,14 @@ * schedule the instruction. */ int total_spill_needed; + + /* For each physical register, a linked list of loads associated with it in + * this block. When we spill a value to a given register, and there are + * existing loads associated with it that haven't been scheduled yet, we + * have to make sure that the corresponding unspill happens after the last + * original use has happened, i.e. is scheduled before. + */ + struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM]; } sched_ctx; static int gpir_min_dist_alu(gpir_dep *dep) @@ -441,8 +449,9 @@ struct list_head *insert_pos = &ctx->ready_list; list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { - if (insert_node->sched.dist > node->sched.dist || - gpir_op_infos[insert_node->op].schedule_first) { + if ((insert_node->sched.dist > node->sched.dist || + gpir_op_infos[insert_node->op].schedule_first) && + !gpir_op_infos[node->op].schedule_first) { insert_pos = &node->list; break; } @@ -534,6 +543,19 @@ } } + if (node->op == gpir_op_store_reg) { + /* This register may be loaded in the next basic block, in which case + * there still needs to be a 2 instruction gap. We do what the blob + * seems to do and simply disable stores in the last two instructions of + * the basic block. + * + * TODO: We may be able to do better than this, but we have to check + * first if storing a register works across branches. + */ + if (instr->index < 2) + return false; + } + node->sched.instr = instr; int max_node_spill_needed = INT_MAX; @@ -705,7 +727,7 @@ int score = 0; gpir_node_foreach_pred(node, dep) { - if (!gpir_is_input_node(dep->pred)) + if (dep->type != GPIR_DEP_INPUT) continue; int pred_score = INT_MIN; @@ -839,12 +861,12 @@ if (instr->reg0_use_count == 0) use_available = ~0ull; else if (!instr->reg0_is_attr) - use_available = 0xf << (4 * instr->reg0_index); + use_available = 0xfull << (4 * instr->reg0_index); if (instr->reg1_use_count == 0) use_available = ~0ull; else - use_available |= 0xf << (4 * instr->reg1_index); + use_available |= 0xfull << (4 * instr->reg1_index); available &= use_available; } @@ -1008,10 +1030,6 @@ ctx->live_physregs |= (1ull << physreg); - /* TODO: when we support multiple basic blocks, there may be register - * loads/stores to this register other than this one that haven't been - * scheduled yet so we may need to insert write-after-read dependencies. - */ gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg); store->index = physreg / 4; store->component = physreg % 4; @@ -1029,6 +1047,16 @@ } node->sched.physreg_store = store; gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + + list_for_each_entry(gpir_load_node, load, + &ctx->physreg_reads[physreg], reg_link) { + gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ); + if (load->node.sched.ready) { + list_del(&load->node.list); + load->node.sched.ready = false; + } + } + node->sched.ready = false; schedule_insert_ready_list(ctx, &store->node); } @@ -1154,7 +1182,8 @@ continue; gpir_node *succ = dep->succ; - if (succ->type != gpir_node_type_alu) + if (succ->type != gpir_node_type_alu || + !succ->sched.instr) continue; /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1} @@ -1313,6 +1342,17 @@ static void place_move(sched_ctx *ctx, gpir_node *node) { + /* For complex1 that is consumed by a postlog2, we cannot allow any moves + * in between. Convert the postlog2 to a move and insert a new postlog2, + * and try to schedule it again in try_node(). + */ + gpir_node *postlog2 = consuming_postlog2(node); + if (postlog2) { + postlog2->op = gpir_op_mov; + create_postlog2(ctx, node); + return; + } + gpir_node *move = create_move(ctx, node); gpir_node_foreach_succ_safe(move, dep) { gpir_node *succ = dep->succ; @@ -1330,10 +1370,14 @@ /* For next-max nodes, not every node can be offloaded to a move in the * complex slot. If we run out of non-complex slots, then such nodes cannot * have moves placed for them. There should always be sufficient - * complex-capable nodes so that this isn't a problem. + * complex-capable nodes so that this isn't a problem. We also disallow moves + * for schedule_first nodes here. */ static bool can_place_move(sched_ctx *ctx, gpir_node *node) { + if (gpir_op_infos[node->op].schedule_first) + return false; + if (!node->sched.next_max_node) return true; @@ -1347,17 +1391,7 @@ { list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { if (node->sched.max_node) { - /* For complex1 that is consumed by a postlog2, we cannot allow any - * moves in between. Convert the postlog2 to a move and insert a new - * postlog2, and try to schedule it again in try_node(). - */ - gpir_node *postlog2 = consuming_postlog2(node); - if (postlog2) { - postlog2->op = gpir_op_mov; - create_postlog2(ctx, node); - } else { - place_move(ctx, node); - } + place_move(ctx, node); return true; } } @@ -1550,19 +1584,27 @@ list_inithead(&ctx.ready_list); ctx.block = block; ctx.ready_list_slots = 0; - /* TODO initialize with block live out once we have proper liveness - * tracking - */ - ctx.live_physregs = 0; + ctx.live_physregs = block->live_out_phys; + + for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { + list_inithead(&ctx.physreg_reads[i]); + } /* construct the ready list from root nodes */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { + /* Add to physreg_reads */ + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + unsigned index = 4 * load->index + load->component; + list_addtail(&load->reg_link, &ctx.physreg_reads[index]); + } + if (gpir_node_is_root(node)) schedule_insert_ready_list(&ctx, node); } list_inithead(&block->node_list); - while (!list_empty(&ctx.ready_list)) { + while (!list_is_empty(&ctx.ready_list)) { if (!schedule_one_instr(&ctx)) return false; } @@ -1570,6 +1612,29 @@ return true; } +static void add_fake_dep(gpir_node *node, gpir_node *dep_node, + gpir_node *last_written[]) +{ + gpir_node_foreach_pred(node, dep) { + if (dep->type == GPIR_DEP_INPUT) { + int index = dep->pred->value_reg; + if (index >= 0 && last_written[index]) { + gpir_node_add_dep(last_written[index], dep_node, + GPIR_DEP_WRITE_AFTER_READ); + } + if (gpir_op_infos[dep->pred->op].schedule_first) { + /* Insert fake dependencies for any schedule_first children on + * this node as well. This guarantees that as soon as + * "dep_node" is ready to schedule, all of its schedule_first + * children, grandchildren, etc. are ready so that they can be + * scheduled as soon as possible. + */ + add_fake_dep(dep->pred, dep_node, last_written); + } + } + } +} + static void schedule_build_dependency(gpir_block *block) { gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; @@ -1594,22 +1659,6 @@ } } - /* Forward dependencies. We only need to add these for register loads, - * since value registers already have an input dependency. - */ - list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - unsigned index = 4 * load->index + load->component; - if (last_written[index]) { - gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE); - } - } - - if (node->value_reg >= 0) - last_written[node->value_reg] = node; - } - memset(last_written, 0, sizeof(last_written)); /* False dependencies. For value registers, these exist only to make sure @@ -1622,16 +1671,12 @@ if (last_written[index]) { gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ); } + } else if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + unsigned index = 4 * store->index + store->component; + last_written[index] = node; } else { - gpir_node_foreach_pred(node, dep) { - if (dep->type == GPIR_DEP_INPUT) { - int index = dep->pred->value_reg; - if (index >= 0 && last_written[index]) { - gpir_node_add_dep(last_written[index], node, - GPIR_DEP_WRITE_AFTER_READ); - } - } - } + add_fake_dep(node, node, last_written); } if (node->value_reg >= 0) diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/lima_ir.h mesa-20.0.8/src/gallium/drivers/lima/ir/lima_ir.h --- mesa-19.2.8/src/gallium/drivers/lima/ir/lima_ir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/lima_ir.h 2020-06-12 01:21:17.000000000 +0000 @@ -65,5 +65,6 @@ void lima_nir_lower_uniform_to_scalar(nir_shader *shader); bool lima_nir_scale_trig(nir_shader *shader); +bool lima_nir_split_load_input(nir_shader *shader); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/lima_nir_lower_uniform_to_scalar.c mesa-20.0.8/src/gallium/drivers/lima/ir/lima_nir_lower_uniform_to_scalar.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/lima_nir_lower_uniform_to_scalar.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/lima_nir_lower_uniform_to_scalar.c 2020-06-12 01:21:17.000000000 +0000 @@ -44,7 +44,7 @@ nir_intrinsic_set_type(chan_intr, nir_intrinsic_type(intr)); chan_intr->src[0] = - nir_src_for_ssa(nir_fmul_imm(b, intr->src[0].ssa, 4)); + nir_src_for_ssa(nir_imul_imm(b, intr->src[0].ssa, 4)); nir_builder_instr_insert(b, &chan_intr->instr); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c mesa-20.0.8/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,123 @@ +/* + * Copyright © 2019 Vasily Khoruzhick + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" + +#include "lima_ir.h" + +static bool +lima_nir_split_load_input_block(nir_block *block, nir_builder *b) +{ + bool progress = false; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_alu) + continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + if (alu->op != nir_op_mov) + continue; + + if (!alu->dest.dest.is_ssa) + continue; + + if (!alu->src[0].src.is_ssa) + continue; + + nir_ssa_def *ssa = alu->src[0].src.ssa; + if (ssa->parent_instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(ssa->parent_instr); + if (intrin->intrinsic != nir_intrinsic_load_input) + continue; + + uint8_t swizzle = alu->src[0].swizzle[0]; + int i; + + for (i = 1; i < nir_dest_num_components(alu->dest.dest); i++) + if (alu->src[0].swizzle[i] != (swizzle + i)) + break; + + if (i != nir_dest_num_components(alu->dest.dest)) + continue; + + b->cursor = nir_before_instr(&intrin->instr); + nir_intrinsic_instr *new_intrin = nir_intrinsic_instr_create( + b->shader, + intrin->intrinsic); + nir_ssa_dest_init(&new_intrin->instr, &new_intrin->dest, + nir_dest_num_components(alu->dest.dest), + ssa->bit_size, + NULL); + new_intrin->num_components = nir_dest_num_components(alu->dest.dest); + nir_intrinsic_set_base(new_intrin, nir_intrinsic_base(intrin)); + nir_intrinsic_set_component(new_intrin, nir_intrinsic_component(intrin) + swizzle); + nir_intrinsic_set_type(new_intrin, nir_intrinsic_type(intrin)); + + /* offset */ + nir_src_copy(&new_intrin->src[0], &intrin->src[0], new_intrin); + + nir_builder_instr_insert(b, &new_intrin->instr); + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, + nir_src_for_ssa(&new_intrin->dest.ssa)); + nir_instr_remove(&alu->instr); + progress = true; + } + + return progress; +} + +static bool +lima_nir_split_load_input_impl(nir_function_impl *impl) +{ + bool progress = false; + nir_builder builder; + nir_builder_init(&builder, impl); + + nir_foreach_block(block, impl) { + progress |= lima_nir_split_load_input_block(block, &builder); + } + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + return progress; +} + +/* Replaces a single load of several packed varyings and number of movs with + * a number of loads of smaller size + */ +bool +lima_nir_split_load_input(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) + progress |= lima_nir_split_load_input_impl(function->impl); + } + + return progress; +} + diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/codegen.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/codegen.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/codegen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/codegen.c 2020-06-12 01:21:17.000000000 +0000 @@ -53,7 +53,7 @@ int index = ppir_target_get_dest_reg_index(dest); int num_components = load->num_components; - if (num_components) { + if (node->op != ppir_op_load_coords_reg) { assert(node->op == ppir_op_load_varying || node->op == ppir_op_load_coords || node->op == ppir_op_load_fragcoord || @@ -65,7 +65,13 @@ int alignment = num_components == 3 ? 3 : num_components - 1; f->imm.alignment = alignment; - f->imm.offset_vector = 0xf; + + if (load->num_src) { + index = ppir_target_get_src_reg_index(&load->src); + f->imm.offset_vector = index >> 2; + f->imm.offset_scalar = index & 0x3; + } else + f->imm.offset_vector = 0xf; if (alignment == 3) f->imm.index = load->index >> 2; @@ -84,24 +90,33 @@ f->imm.source_type = 3; f->imm.perspective = 1; break; + case ppir_op_load_coords: + /* num_components == 3 implies cubemap as we don't support 3D textures */ + f->imm.source_type = num_components == 3 ? 2 : 0; + break; default: break; } } - else { - assert(node->op == ppir_op_load_coords); - + else { /* node->op == ppir_op_load_coords_reg */ f->reg.dest = index >> 2; f->reg.mask = dest->write_mask << (index & 0x3); - f->reg.source_type = 1; - - ppir_src *src = &load->src; - index = ppir_target_get_src_reg_index(src); - f->reg.source = index >> 2; - f->reg.negate = src->negate; - f->reg.absolute = src->absolute; - f->reg.swizzle = encode_swizzle(src->swizzle, index & 0x3, 0); + if (load->num_src) { + /* num_components == 3 implies cubemap as we don't support 3D textures */ + if (num_components == 3) { + f->reg.source_type = 2; + f->reg.perspective = 1; + } else { + f->reg.source_type = 1; + } + ppir_src *src = &load->src; + index = ppir_target_get_src_reg_index(src); + f->reg.source = index >> 2; + f->reg.negate = src->negate; + f->reg.absolute = src->absolute; + f->reg.swizzle = encode_swizzle(src->swizzle, index & 0x3, 0); + } } } @@ -111,8 +126,25 @@ ppir_load_texture_node *ldtex = ppir_node_to_load_texture(node); f->index = ldtex->sampler; - f->lod_bias_en = 0; - f->type = ppir_codegen_sampler_type_2d; + + f->lod_bias_en = ldtex->lod_bias_en; + f->explicit_lod = ldtex->explicit_lod; + if (ldtex->lod_bias_en) + ppir_target_get_src_reg_index(&ldtex->src[1]); + + switch (ldtex->sampler_dim) { + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + f->type = ppir_codegen_sampler_type_2d; + break; + case GLSL_SAMPLER_DIM_CUBE: + f->type = ppir_codegen_sampler_type_cube; + break; + default: + break; + } + f->offset_en = 0; f->unknown_2 = 0x39001; } @@ -133,13 +165,14 @@ assert(0); } - int num_components = load->num_components; - int alignment = num_components == 4 ? 2 : num_components - 1; - - f->alignment = alignment; - - /* TODO: uniform can be also combined like varying */ - f->index = load->index << (2 - alignment); + /* Uniforms are always aligned to vec4 boundary */ + f->alignment = 2; + f->index = load->index; + + if (load->num_src) { + f->offset_en = 1; + f->offset_reg = ppir_target_get_src_reg_index(&load->src); + } } static unsigned shift_to_op(int shift) @@ -168,6 +201,7 @@ f->op = shift_to_op(alu->shift); break; case ppir_op_mov: + case ppir_op_store_color: f->op = ppir_codegen_vec4_mul_op_mov; break; case ppir_op_max: @@ -310,6 +344,7 @@ f->op = ppir_codegen_vec4_acc_op_add; break; case ppir_op_mov: + case ppir_op_store_color: f->op = ppir_codegen_vec4_acc_op_mov; break; case ppir_op_sum3: @@ -556,6 +591,7 @@ ppir_codegen_field_branch *b = code; ppir_branch_node *branch; ppir_instr *target_instr; + ppir_block *target; if (node->op == ppir_op_discard) { ppir_codegen_encode_discard(node, code); return; @@ -565,14 +601,35 @@ branch = ppir_node_to_branch(node); b->branch.unknown_0 = 0x0; - b->branch.arg0_source = get_scl_reg_index(&branch->src[0], 0); - b->branch.arg1_source = get_scl_reg_index(&branch->src[1], 0); - b->branch.cond_gt = branch->cond_gt; - b->branch.cond_eq = branch->cond_eq; - b->branch.cond_lt = branch->cond_lt; b->branch.unknown_1 = 0x0; - target_instr = list_first_entry(&branch->target->instr_list, ppir_instr, list); + if (branch->num_src == 2) { + b->branch.arg0_source = get_scl_reg_index(&branch->src[0], 0); + b->branch.arg1_source = get_scl_reg_index(&branch->src[1], 0); + b->branch.cond_gt = branch->cond_gt; + b->branch.cond_eq = branch->cond_eq; + b->branch.cond_lt = branch->cond_lt; + } else if (branch->num_src == 0) { + /* Unconditional branch */ + b->branch.arg0_source = 0; + b->branch.arg1_source = 0; + b->branch.cond_gt = true; + b->branch.cond_eq = true; + b->branch.cond_lt = true; + } else { + assert(false); + } + + target = branch->target; + while (list_is_empty(&target->instr_list)) { + if (!target->list.next) + break; + target = LIST_ENTRY(ppir_block, target->list.next, list); + } + + assert(!list_is_empty(&target->instr_list)); + + target_instr = list_first_entry(&target->instr_list, ppir_instr, list); b->branch.target = target_instr->offset - node->instr->offset; b->branch.next_count = target_instr->encode_size; } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/codegen.h mesa-20.0.8/src/gallium/drivers/lima/ir/pp/codegen.h --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/codegen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/codegen.h 2020-06-12 01:21:17.000000000 +0000 @@ -111,7 +111,8 @@ typedef struct __attribute__((__packed__)) { unsigned lod_bias : 6; unsigned index_offset : 6; - unsigned unknown_0 : 6; /* = 000000 */ + unsigned unknown_0 : 5; /* = 00000 */ + bool explicit_lod : 1; bool lod_bias_en : 1; unsigned unknown_1 : 5; /* = 00000 */ ppir_codegen_sampler_type type : 5; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/disasm.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/disasm.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/disasm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/disasm.c 2020-06-12 01:21:17.000000000 +0000 @@ -118,6 +118,32 @@ } static void +print_varying_source(ppir_codegen_field_varying *varying) +{ + switch (varying->imm.alignment) { + case 0: + printf("%u.%c", varying->imm.index >> 2, + "xyzw"[varying->imm.index & 3]); + break; + case 1: { + const char *c[2] = {"xy", "zw"}; + printf("%u.%s", varying->imm.index >> 1, c[varying->imm.index & 1]); + break; + } + default: + printf("%u", varying->imm.index); + break; + } + + if (varying->imm.offset_vector != 15) { + unsigned reg = (varying->imm.offset_vector << 2) + + varying->imm.offset_scalar; + printf("+"); + print_source_scalar(reg, NULL, false, false); + } +} + +static void print_outmod(ppir_codegen_outmod modifier) { switch (modifier) @@ -213,7 +239,28 @@ varying->reg.absolute, varying->reg.negate); break; case 2: - printf("gl_FragCoord"); + switch (varying->imm.perspective) { + case 0: + printf("cube("); + print_varying_source(varying); + printf(")"); + break; + case 1: + printf("cube("); + print_vector_source(varying->reg.source, NULL, varying->reg.swizzle, + varying->reg.absolute, varying->reg.negate); + printf(")"); + break; + case 2: + printf("normalize("); + print_vector_source(varying->reg.source, NULL, varying->reg.swizzle, + varying->reg.absolute, varying->reg.negate); + printf(")"); + break; + default: + printf("gl_FragCoord"); + break; + } break; case 3: if (varying->imm.perspective) @@ -222,27 +269,7 @@ printf("gl_PointCoord"); break; default: - switch (varying->imm.alignment) { - case 0: - printf("%u.%c", varying->imm.index >> 2, - "xyzw"[varying->imm.index & 3]); - break; - case 1: { - const char *c[2] = {"xy", "zw"}; - printf("%u.%s", varying->imm.index >> 1, c[varying->imm.index & 1]); - break; - } - default: - printf("%u", varying->imm.index); - break; - } - - if (varying->imm.offset_vector != 15) { - unsigned reg = (varying->imm.offset_vector << 2) + - varying->imm.offset_scalar; - printf("+"); - print_source_scalar(reg, NULL, false, false); - } + print_varying_source(varying); break; } } @@ -318,7 +345,7 @@ } if (uniform->offset_en) { - printf(" "); + printf("+"); print_source_scalar(uniform->offset_reg, NULL, false, false); } } @@ -620,7 +647,7 @@ print_dest_scalar(combine->scalar.dest); } printf(" "); - + print_source_scalar(combine->scalar.arg0_src, NULL, combine->scalar.arg0_absolute, combine->scalar.arg0_negate); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/instr.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/instr.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/instr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/instr.c 2020-06-12 01:21:17.000000000 +0000 @@ -186,9 +186,18 @@ uint8_t swizzle[4] = {0}; if (ppir_instr_insert_const(&ic, nc, swizzle)) { + ppir_node *succ = ppir_node_first_succ(node); + ppir_src *src = NULL; + for (int s = 0; s < ppir_node_get_src_num(succ); s++) { + src = ppir_node_get_src(succ, s); + if (src->node == node) + break; + } + assert(src->node == node); + instr->constant[i] = ic; - ppir_instr_update_src_pipeline( - instr, ppir_pipeline_reg_const0 + i, &c->dest, swizzle); + ppir_update_src_pipeline(ppir_pipeline_reg_const0 + i, src, + &c->dest, swizzle); break; } } @@ -264,6 +273,7 @@ printf("const0|1\n"); list_for_each_entry(ppir_block, block, &comp->block_list, list) { + printf("-------block %3d-------\n", block->index); list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { printf("%c%03d: ", instr->is_end ? '*' : ' ', instr->index); for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { @@ -282,8 +292,8 @@ } printf("\n"); } - printf("------------------------\n"); } + printf("===========================\n"); } static void ppir_instr_print_sub(ppir_instr *instr) @@ -316,12 +326,13 @@ printf("======ppir instr depend======\n"); list_for_each_entry(ppir_block, block, &comp->block_list, list) { + printf("-------block %3d-------\n", block->index); list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { if (ppir_instr_is_root(instr)) { ppir_instr_print_sub(instr); printf("\n"); } } - printf("------------------------\n"); } + printf("=============================\n"); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/liveness.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/liveness.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/liveness.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/liveness.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2019 Lima Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#include "ppir.h" + +/* Propagates liveness from a liveness set to another by performing the + * union between sets. */ +static void +ppir_liveness_propagate(ppir_compiler *comp, + struct ppir_liveness *dest, struct ppir_liveness *src, + struct set *dest_set, struct set *src_set) +{ + set_foreach(src_set, entry_src) { + const struct ppir_liveness *s = entry_src->key; + assert(s); + + unsigned int regalloc_index = s->reg->regalloc_index; + + dest[regalloc_index].reg = src[regalloc_index].reg; + dest[regalloc_index].mask |= src[regalloc_index].mask; + _mesa_set_add(dest_set, &dest[regalloc_index]); + } +} + +/* Clone a liveness set (without propagation) */ +static void +ppir_liveness_set_clone(ppir_compiler *comp, + struct ppir_liveness *dest, struct ppir_liveness *src, + struct set *dest_set, struct set *src_set) +{ + _mesa_set_clear(dest_set, NULL); + memset(dest, 0, list_length(&comp->reg_list) * sizeof(struct ppir_liveness)); + memcpy(dest, src, + list_length(&comp->reg_list) * sizeof(struct ppir_liveness)); + + set_foreach(src_set, entry_src) { + const struct ppir_liveness *s = entry_src->key; + assert(s); + + unsigned int regalloc_index = s->reg->regalloc_index; + dest[regalloc_index].reg = src[regalloc_index].reg; + dest[regalloc_index].mask = src[regalloc_index].mask; + _mesa_set_add(dest_set, &dest[regalloc_index]); + } +} + +/* Check whether two liveness sets are equal. */ +static bool +ppir_liveness_set_equal(ppir_compiler *comp, + struct ppir_liveness *l1, struct ppir_liveness *l2, + struct set *set1, struct set *set2) +{ + set_foreach(set1, entry1) { + const struct ppir_liveness *k1 = entry1->key; + unsigned int regalloc_index = k1->reg->regalloc_index; + + struct set_entry *entry2 = _mesa_set_search(set2, &l2[regalloc_index]); + if (!entry2) + return false; + + const struct ppir_liveness *k2 = entry2->key; + + if (k1->mask != k2->mask) + return false; + } + set_foreach(set2, entry2) { + const struct ppir_liveness *k2 = entry2->key; + unsigned int regalloc_index = k2->reg->regalloc_index; + + struct set_entry *entry1 = _mesa_set_search(set1, &l1[regalloc_index]); + if (!entry1) + return false; + + const struct ppir_liveness *k1 = entry1->key; + + if (k2->mask != k1->mask) + return false; + } + return true; +} + +/* Update the liveness information of the instruction by adding its srcs + * as live registers to the live_in set. */ +static void +ppir_liveness_instr_srcs(ppir_compiler *comp, ppir_instr *instr) +{ + for (int i = PPIR_INSTR_SLOT_NUM-1; i >= 0; i--) { + ppir_node *node = instr->slots[i]; + if (!node) + continue; + + switch(node->op) { + case ppir_op_const: + case ppir_op_undef: + continue; + default: + break; + } + + for (int i = 0; i < ppir_node_get_src_num(node); i++) { + ppir_src *src = ppir_node_get_src(node, i); + if (!src || src->type == ppir_target_pipeline) + continue; + + ppir_reg *reg = ppir_src_get_reg(src); + if (!reg || reg->undef) + continue; + + /* if some other op on this same instruction is writing, + * we just need to reserve a register for this particular + * instruction. Add the register to live_out to make that + * interference happen without propagating its liveness. */ + if (src->node && src->node->instr == instr) { + instr->live_out[reg->regalloc_index].reg = reg; + _mesa_set_add(instr->live_out_set, &instr->live_out[reg->regalloc_index]); + continue; + } + + struct set_entry *live = _mesa_set_search(instr->live_in_set, + &instr->live_in[reg->regalloc_index]); + if (src->type == ppir_target_ssa) { + /* reg is read, needs to be live before instr */ + if (live) + continue; + + instr->live_in[reg->regalloc_index].reg = reg; + _mesa_set_add(instr->live_in_set, &instr->live_in[reg->regalloc_index]); + } + else { + unsigned int mask = ppir_src_get_mask(src); + + /* read reg is type register, need to check if this sets + * any additional bits in the current mask */ + if (live && (instr->live_in[reg->regalloc_index].mask == + (instr->live_in[reg->regalloc_index].mask | mask))) + continue; + + /* some new components */ + instr->live_in[reg->regalloc_index].reg = reg; + instr->live_in[reg->regalloc_index].mask |= mask; + _mesa_set_add(instr->live_in_set, &instr->live_in[reg->regalloc_index]); + } + } + } +} + + +/* Update the liveness information of the instruction by removing its + * dests from the live_in set. */ +static void +ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr) +{ + for (int i = PPIR_INSTR_SLOT_NUM-1; i >= 0; i--) { + ppir_node *node = instr->slots[i]; + if (!node) + continue; + + switch(node->op) { + case ppir_op_const: + case ppir_op_undef: + case ppir_op_store_color: /* never clear dest if its store output */ + continue; + default: + break; + } + + ppir_dest *dest = ppir_node_get_dest(node); + if (!dest || dest->type == ppir_target_pipeline) + continue; + ppir_reg *reg = ppir_dest_get_reg(dest); + if (!reg || reg->undef) + continue; + + struct set_entry *live = _mesa_set_search(instr->live_in_set, + &instr->live_in[reg->regalloc_index]); + + /* If a register is written but wasn't read in a later instruction, it is + * either dead code or a bug. For now, assign an interference to it to + * ensure it doesn't get assigned a live register and overwrites it. */ + if (!live) { + instr->live_out[reg->regalloc_index].reg = reg; + _mesa_set_add(instr->live_out_set, &instr->live_out[reg->regalloc_index]); + continue; + } + + if (dest->type == ppir_target_ssa) { + /* reg is written and ssa, is not live before instr */ + _mesa_set_remove_key(instr->live_in_set, &instr->live_in[reg->regalloc_index]); + } + else { + unsigned int mask = dest->write_mask; + /* written reg is type register, need to check if this clears + * the remaining mask to remove it from the live set */ + if (instr->live_in[reg->regalloc_index].mask == + (instr->live_in[reg->regalloc_index].mask & ~mask)) + continue; + + instr->live_in[reg->regalloc_index].mask &= ~mask; + /* unset reg if all remaining bits were cleared */ + if (!instr->live_in[reg->regalloc_index].mask) { + _mesa_set_remove_key(instr->live_in_set, &instr->live_in[reg->regalloc_index]); + } + } + } +} + +/* Main loop, iterate blocks/instructions/ops backwards, propagate + * livenss and update liveness of each instruction. */ +static bool +ppir_liveness_compute_live_sets(ppir_compiler *comp) +{ + bool cont = false; + list_for_each_entry_rev(ppir_block, block, &comp->block_list, list) { + ppir_instr *first = list_first_entry(&block->instr_list, ppir_instr, list); + ppir_instr *last = list_last_entry(&block->instr_list, ppir_instr, list); + + /* inherit live_out from the other blocks live_in */ + for (int i = 0; i < 2; i++) { + ppir_block *succ = block->successors[i]; + if (!succ) + continue; + + ppir_liveness_propagate(comp, block->live_out, succ->live_in, + block->live_out_set, succ->live_in_set); + } + + list_for_each_entry_rev(ppir_instr, instr, &block->instr_list, list) { + /* inherit (or-) live variables from next instr or block */ + if (instr == last) { + ppir_liveness_set_clone(comp, + instr->live_out, block->live_out, + instr->live_out_set, block->live_out_set); + } + else { + ppir_instr *next_instr = LIST_ENTRY(ppir_instr, instr->list.next, list); + ppir_liveness_set_clone(comp, + instr->live_out, next_instr->live_in, + instr->live_out_set, next_instr->live_in_set); + } + /* initial copy to check for changes */ + struct set *temp_live_in_set = _mesa_set_create(comp, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + struct ppir_liveness temp_live_in[list_length(&comp->reg_list)]; + ppir_liveness_set_clone(comp, + temp_live_in, instr->live_in, + temp_live_in_set, instr->live_in_set); + + /* initialize live_in for potential changes */ + ppir_liveness_propagate(comp, instr->live_in, instr->live_out, + instr->live_in_set, instr->live_out_set); + + ppir_liveness_instr_dest(comp, instr); + ppir_liveness_instr_srcs(comp, instr); + + cont |= !ppir_liveness_set_equal(comp, temp_live_in, instr->live_in, + temp_live_in_set, instr->live_in_set); + } + + /* inherit live_in from the first instruction in the block, + * or live_out if it is empty */ + if (!list_is_empty(&block->instr_list) && first && first->scheduled) + ppir_liveness_set_clone(comp, block->live_in, first->live_in, + block->live_in_set, first->live_in_set); + else + ppir_liveness_set_clone(comp, block->live_in, block->live_out, + block->live_in_set, block->live_out_set); + } + + return cont; +} + +/* + * Liveness analysis is based on https://en.wikipedia.org/wiki/Live_variable_analysis + * This implementation calculates liveness before/after each + * instruction. Aggregated block liveness information is stored + * before/after blocks for conveniency (handle e.g. empty blocks). + * Blocks/instructions/ops are iterated backwards so register reads are + * propagated up to the instruction that writes it. + * + * 1) Before computing liveness for each instruction, propagate live_out + * from the next instruction. If it is the last instruction in a + * block, propagate liveness from all possible next instructions + * (in this case, this information comes from the live_out of the + * block itself). + * 2) Calculate live_in for the each instruction. The initial live_in is + * a copy of its live_out so registers who aren't touched by this + * instruction are kept intact. + * - If a register is written by this instruction, it no longer needs + * to be live before the instruction, so it is removed from live_in. + * - If a register is read by this instruction, it needs to be live + * before its execution, so add it to live_in. + * - Non-ssa registers are a special case. For this, the algorithm + * keeps and updates the mask of live components following the same + * logic as above. The register is only removed from the live set + * when no live components are left. + * - If a non-ssa register is written and read in the same + * instruction, it stays in live_in. + * - Another special case is a ssa register that is written by an + * early op in the instruction, and read by a later op. In this case, + * the algorithm adds it to the live_out set so that the register + * allocator properly assigns an interference for it. + * 3) The algorithm must run over the entire program until it converges, + * i.e. a full run happens without changes. This is because blocks + * are updated sequentially and updates in a block may need to be + * propagated to parent blocks that were already calculated in the + * current run. + */ +void +ppir_liveness_analysis(ppir_compiler *comp) +{ + while (ppir_liveness_compute_live_sets(comp)) + ; +} diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/lower.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/lower.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/lower.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/lower.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,40 +34,41 @@ return true; } - ppir_node *move = NULL; - ppir_dest *dest = ppir_node_get_dest(node); + assert(ppir_node_has_single_succ(node)); - /* const (register) can only be used in alu node, create a move - * node for other types of node */ - ppir_node_foreach_succ_safe(node, dep) { - ppir_node *succ = dep->succ; - - if (succ->type != ppir_node_type_alu) { - if (!move) { - move = ppir_node_create(block, ppir_op_mov, -1, 0); - if (unlikely(!move)) - return false; - - ppir_debug("lower const create move %d for %d\n", - move->index, node->index); - - ppir_alu_node *alu = ppir_node_to_alu(move); - alu->dest = *dest; - alu->num_src = 1; - ppir_node_target_assign(alu->src, node); - for (int i = 0; i < 4; i++) - alu->src->swizzle[i] = i; - } + ppir_node *succ = ppir_node_first_succ(node); + ppir_src *src = ppir_node_get_src_for_pred(succ, node); + ppir_dest *dest = ppir_node_get_dest(node); + assert(src != NULL); - ppir_node_replace_pred(dep, move); - ppir_node_replace_child(succ, node, move); - } + switch (succ->type) { + case ppir_node_type_alu: + case ppir_node_type_branch: + /* ALU and branch can consume consts directly */ + dest->type = src->type = ppir_target_pipeline; + /* Reg will be updated in node_to_instr later */ + dest->pipeline = src->pipeline = ppir_pipeline_reg_const0; + return true; + default: + /* Create a move for everyone else */ + break; } - if (move) { - ppir_node_add_dep(move, node); - list_addtail(&move->list, &node->list); - } + ppir_node *move = ppir_node_insert_mov(node); + if (unlikely(!move)) + return false; + + ppir_debug("lower const create move %d for %d\n", + move->index, node->index); + + /* Need to be careful with changing src/dst type here: + * it has to be done *after* successors have their children + * replaced, otherwise ppir_node_replace_child() won't find + * matching src/dst and as result won't work + */ + ppir_src *mov_src = ppir_node_get_src(move, 0); + mov_src->type = dest->type = ppir_target_pipeline; + mov_src->pipeline = dest->pipeline = ppir_pipeline_reg_const0; return true; } @@ -89,27 +90,40 @@ static bool ppir_lower_load(ppir_block *block, ppir_node *node) { - ppir_node *move = ppir_node_create(block, ppir_op_mov, -1 , 0); - if (unlikely(!move)) - return false; - - ppir_alu_node *alu = ppir_node_to_alu(move); - ppir_dest *dest = ppir_node_get_dest(node); - alu->dest = *dest; - - ppir_node_replace_all_succ(move, node); + if (ppir_node_is_root(node) && dest->type == ppir_target_ssa) { + ppir_node_delete(node); + return true; + } - dest->type = ppir_target_pipeline; - dest->pipeline = ppir_pipeline_reg_uniform; + /* load can have multiple successors in case if we duplicated load node + * that has load node in source + */ + if ((ppir_node_has_single_src_succ(node) || ppir_node_is_root(node)) && + dest->type != ppir_target_register) { + ppir_node *succ = ppir_node_first_succ(node); + switch (succ->type) { + case ppir_node_type_alu: + case ppir_node_type_branch: { + ppir_src *src = ppir_node_get_src_for_pred(succ, node); + /* Can consume uniforms directly */ + src->type = dest->type = ppir_target_pipeline; + src->pipeline = dest->pipeline = ppir_pipeline_reg_uniform; + return true; + } + default: + /* Create mov for everyone else */ + break; + } + } - alu->num_src = 1; - ppir_node_target_assign(&alu->src[0], node); - for (int i = 0; i < 4; i++) - alu->src->swizzle[i] = i; + ppir_node *move = ppir_node_insert_mov(node); + if (unlikely(!move)) + return false; - ppir_node_add_dep(move, node); - list_addtail(&move->list, &node->list); + ppir_src *mov_src = ppir_node_get_src(move, 0); + mov_src->type = dest->type = ppir_target_pipeline; + mov_src->pipeline = dest->pipeline = ppir_pipeline_reg_uniform; return true; } @@ -135,51 +149,53 @@ static bool ppir_lower_texture(ppir_block *block, ppir_node *node) { ppir_load_texture_node *load_tex = ppir_node_to_load_texture(node); + ppir_dest *dest = ppir_node_get_dest(node); + ppir_node *src_coords = ppir_node_get_src(node, 0)->node; + ppir_load_node *load = NULL; - /* Create load_coords node */ - ppir_load_node *load = ppir_node_create(block, ppir_op_load_coords, -1, 0); - if (!load) - return false; - list_addtail(&load->node.list, &node->list); - - ppir_debug("%s create load_coords node %d for %d\n", - __FUNCTION__, load->node.index, node->index); - - load->dest.type = ppir_target_pipeline; - load->dest.pipeline = ppir_pipeline_reg_discard; - - load->src = load_tex->src_coords; - - ppir_node_foreach_pred_safe(node, dep) { - ppir_node *pred = dep->pred; - ppir_node_remove_dep(dep); - ppir_node_add_dep(&load->node, pred); + if (src_coords && ppir_node_has_single_src_succ(src_coords) && + (src_coords->op == ppir_op_load_coords)) + load = ppir_node_to_load(src_coords); + else { + /* Create load_coords node */ + load = ppir_node_create(block, ppir_op_load_coords_reg, -1, 0); + if (!load) + return false; + list_addtail(&load->node.list, &node->list); + + load->src = load_tex->src[0]; + load->num_src = 1; + if (load_tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + load->num_components = 3; + else + load->num_components = 2; + + ppir_debug("%s create load_coords node %d for %d\n", + __FUNCTION__, load->node.index, node->index); + + ppir_node_foreach_pred_safe(node, dep) { + ppir_node *pred = dep->pred; + ppir_node_remove_dep(dep); + ppir_node_add_dep(&load->node, pred, ppir_dep_src); + } + ppir_node_add_dep(node, &load->node, ppir_dep_src); } - ppir_node_add_dep(node, &load->node); + assert(load); + load_tex->src[0].type = load->dest.type = ppir_target_pipeline; + load_tex->src[0].pipeline = load->dest.pipeline = ppir_pipeline_reg_discard; - /* Create move node */ - ppir_node *move = ppir_node_create(block, ppir_op_mov, -1 , 0); + /* Always create move node since there can be successors in other blocks */ + ppir_node *move = ppir_node_insert_mov_all_blocks(node); if (unlikely(!move)) return false; - ppir_alu_node *alu = ppir_node_to_alu(move); + ppir_debug("lower texture create move %d for %d\n", + move->index, node->index); - ppir_dest *dest = ppir_node_get_dest(node); - alu->dest = *dest; - - ppir_node_replace_all_succ(move, node); - - dest->type = ppir_target_pipeline; - dest->pipeline = ppir_pipeline_reg_sampler; - - alu->num_src = 1; - ppir_node_target_assign(&alu->src[0], node); - for (int i = 0; i < 4; i++) - alu->src->swizzle[i] = i; - - ppir_node_add_dep(move, node); - list_addtail(&move->list, &node->list); + ppir_src *mov_src= ppir_node_get_src(move, 0); + mov_src->type = dest->type = ppir_target_pipeline; + mov_src->pipeline = dest->pipeline = ppir_pipeline_reg_sampler; return true; } @@ -208,22 +224,20 @@ move_dest->pipeline = ppir_pipeline_reg_fmul; move_dest->write_mask = 1; - ppir_node_foreach_pred(node, dep) { - ppir_node *pred = dep->pred; - ppir_dest *dest = ppir_node_get_dest(pred); - if (ppir_node_target_equal(alu->src, dest)) { - ppir_node_replace_pred(dep, move); - ppir_node_add_dep(move, pred); - } - } + ppir_node *pred = alu->src[0].node; + ppir_dep *dep = ppir_dep_for_pred(node, pred); + if (dep) + ppir_node_replace_pred(dep, move); + else + ppir_node_add_dep(node, move, ppir_dep_src); - /* move must be the first pred of select node which make sure - * the float mul slot is free when node to instr - */ - assert(ppir_node_first_pred(node) == move); + /* pred can be a register */ + if (pred) + ppir_node_add_dep(move, pred, ppir_dep_src); src->swizzle[0] = 0; ppir_node_target_assign(alu->src, move); + return true; } @@ -282,19 +296,21 @@ static bool ppir_lower_branch(ppir_block *block, ppir_node *node) { ppir_branch_node *branch = ppir_node_to_branch(node); + + /* Unconditional branch */ + if (branch->num_src == 0) + return true; + ppir_const_node *zero = ppir_node_create(block, ppir_op_const, -1, 0); if (!zero) return false; - list_addtail(&zero->node.list, &node->list); - zero->constant.value[0].f = 0; zero->constant.num = 1; - zero->dest.type = ppir_target_ssa; + zero->dest.type = ppir_target_pipeline; + zero->dest.pipeline = ppir_pipeline_reg_const0; zero->dest.ssa.num_components = 1; - zero->dest.ssa.live_in = INT_MAX; - zero->dest.ssa.live_out = 0; zero->dest.write_mask = 0x01; /* For now we're just comparing branch condition with 0, @@ -302,13 +318,19 @@ * comparision node into branch itself and use current * way as a fallback for complex conditions. */ - branch->src[1].type = ppir_target_ssa; - branch->src[1].ssa = &zero->dest.ssa; + ppir_node_target_assign(&branch->src[1], &zero->node); - branch->cond_gt = true; - branch->cond_lt = true; + if (branch->negate) + branch->cond_eq = true; + else { + branch->cond_gt = true; + branch->cond_lt = true; + } + + branch->num_src = 2; - ppir_node_add_dep(&branch->node, &zero->node); + ppir_node_add_dep(&branch->node, &zero->node, ppir_dep_src); + list_addtail(&zero->node.list, &node->list); return true; } @@ -340,6 +362,5 @@ } } - ppir_node_print_prog(comp); return true; } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/nir.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/nir.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/nir.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,6 +24,7 @@ #include +#include "util/hash_table.h" #include "util/ralloc.h" #include "util/bitscan.h" #include "compiler/nir/nir.h" @@ -41,8 +42,6 @@ ppir_dest *dest = ppir_node_get_dest(node); dest->type = ppir_target_ssa; dest->ssa.num_components = ssa->num_components; - dest->ssa.live_in = INT_MAX; - dest->ssa.live_out = 0; dest->write_mask = u_bit_consecutive(0, ssa->num_components); if (node->type == ppir_node_type_load || @@ -53,16 +52,16 @@ } static void *ppir_node_create_reg(ppir_block *block, ppir_op op, - nir_reg_dest *reg, unsigned mask) + nir_register *reg, unsigned mask) { - ppir_node *node = ppir_node_create(block, op, reg->reg->index, mask); + ppir_node *node = ppir_node_create(block, op, reg->index, mask); if (!node) return NULL; ppir_dest *dest = ppir_node_get_dest(node); list_for_each_entry(ppir_reg, r, &block->comp->reg_list, list) { - if (r->index == reg->reg->index) { + if (r->index == reg->index) { dest->reg = r; break; } @@ -87,7 +86,7 @@ if (dest->is_ssa) return ppir_node_create_ssa(block, op, &dest->ssa); else - return ppir_node_create_reg(block, op, &dest->reg, mask); + return ppir_node_create_reg(block, op, dest->reg.reg, mask); } return ppir_node_create(block, op, index, 0); @@ -100,14 +99,70 @@ if (ns->is_ssa) { child = comp->var_nodes[ns->ssa->index]; - ppir_node_add_dep(node, child); + /* Clone consts for each successor */ + switch (child->op) { + case ppir_op_const: + child = ppir_node_clone(node->block, child); + break; + case ppir_op_load_varying: { + bool is_load_coords = false; + if (node->op == ppir_op_load_texture) { + nir_tex_src *nts = (nir_tex_src *)ns; + if (nts->src_type == nir_tex_src_coord) + is_load_coords = true; + } + + if (!is_load_coords) { + /* Clone varying loads for each block */ + if (child->block != node->block) { + ppir_node *new = ppir_node_clone(node->block, child); + /* If we clone it for every block and there is no user of + * the original load left, delete the original one. */ + ppir_delete_if_orphan(node->block, child); + child = new; + comp->var_nodes[ns->ssa->index] = child; + } + break; + } + /* At least one successor is load_texture, promote it to load_coords + * to ensure that is has exactly one successor */ + child->op = ppir_op_load_coords; + } + /* Fallthrough */ + case ppir_op_load_uniform: + case ppir_op_load_coords: + case ppir_op_load_coords_reg: + /* Clone uniform and texture coord loads for each block. + * Also ensure that each load has a single successor. + * Let's do a fetch each time and hope for a cache hit instead + * of increasing reg pressure. + */ + if (child->block != node->block || !ppir_node_is_root(child)) { + child = ppir_node_clone(node->block, child); + comp->var_nodes[ns->ssa->index] = child; + } + break; + default: + break; + } + + if (child->op != ppir_op_undef) + ppir_node_add_dep(node, child, ppir_dep_src); } else { nir_register *reg = ns->reg.reg; while (mask) { int swizzle = ps->swizzle[u_bit_scan(&mask)]; child = comp->var_nodes[(reg->index << 2) + comp->reg_base + swizzle]; - ppir_node_add_dep(node, child); + /* Reg is read before it was written, create a dummy node for it */ + if (!child) { + child = ppir_node_create_reg(node->block, ppir_op_dummy, reg, + u_bit_consecutive(0, 4)); + comp->var_nodes[(reg->index << 2) + comp->reg_base + swizzle] = child; + } + /* Don't add dummies or recursive deps for ops like r1 = r1 + ssa1 */ + if (child && node != child && child->op != ppir_op_dummy) + ppir_node_add_dep(node, child, ppir_dep_src); } } @@ -138,13 +193,9 @@ [nir_op_fceil] = ppir_op_ceil, [nir_op_ffract] = ppir_op_fract, [nir_op_sge] = ppir_op_ge, - [nir_op_fge] = ppir_op_ge, [nir_op_slt] = ppir_op_lt, - [nir_op_flt] = ppir_op_lt, [nir_op_seq] = ppir_op_eq, - [nir_op_feq] = ppir_op_eq, [nir_op_sne] = ppir_op_ne, - [nir_op_fne] = ppir_op_ne, [nir_op_fcsel] = ppir_op_select, [nir_op_inot] = ppir_op_not, [nir_op_ftrunc] = ppir_op_trunc, @@ -241,6 +292,7 @@ /* second src and condition will be updated during lowering */ ppir_node_add_src(block->comp, node, &branch->src[0], &instr->src[0], u_bit_consecutive(0, instr->num_components)); + branch->num_src = 1; branch->target = comp->discard_block; return node; @@ -258,7 +310,7 @@ nir_intrinsic_instr *instr = nir_instr_as_intrinsic(ni); unsigned mask = 0; ppir_load_node *lnode; - ppir_store_node *snode; + ppir_alu_node *alu_node; switch (instr->intrinsic) { case nir_intrinsic_load_input: @@ -271,6 +323,12 @@ lnode->num_components = instr->num_components; lnode->index = nir_intrinsic_base(instr) * 4 + nir_intrinsic_component(instr); + if (nir_src_is_const(instr->src[0])) + lnode->index += (uint32_t)(nir_src_as_float(instr->src[0]) * 4); + else { + lnode->num_src = 1; + ppir_node_add_src(block->comp, &lnode->node, &lnode->src, instr->src, 1); + } return &lnode->node; case nir_intrinsic_load_frag_coord: @@ -312,24 +370,36 @@ lnode->num_components = instr->num_components; lnode->index = nir_intrinsic_base(instr); - lnode->index += (uint32_t)nir_src_as_float(instr->src[0]); + if (nir_src_is_const(instr->src[0])) + lnode->index += (uint32_t)nir_src_as_float(instr->src[0]); + else { + lnode->num_src = 1; + ppir_node_add_src(block->comp, &lnode->node, &lnode->src, instr->src, 1); + } return &lnode->node; - case nir_intrinsic_store_output: - snode = ppir_node_create_dest(block, ppir_op_store_color, NULL, 0); - if (!snode) + case nir_intrinsic_store_output: { + alu_node = ppir_node_create_dest(block, ppir_op_store_color, NULL, 0); + if (!alu_node) return NULL; - snode->index = nir_intrinsic_base(instr); + ppir_dest *dest = ppir_node_get_dest(&alu_node->node); + dest->type = ppir_target_ssa; + dest->ssa.num_components = instr->num_components; + dest->ssa.index = 0; + dest->write_mask = u_bit_consecutive(0, instr->num_components); + + alu_node->num_src = 1; for (int i = 0; i < instr->num_components; i++) - snode->src.swizzle[i] = i; + alu_node->src[0].swizzle[i] = i; - ppir_node_add_src(block->comp, &snode->node, &snode->src, instr->src, + ppir_node_add_src(block->comp, &alu_node->node, alu_node->src, instr->src, u_bit_consecutive(0, instr->num_components)); - return &snode->node; + return &alu_node->node; + } case nir_intrinsic_discard: return ppir_emit_discard(block, ni); @@ -362,8 +432,16 @@ static ppir_node *ppir_emit_ssa_undef(ppir_block *block, nir_instr *ni) { - ppir_error("nir_ssa_undef_instr not support\n"); - return NULL; + nir_ssa_undef_instr *undef = nir_instr_as_ssa_undef(ni); + ppir_node *node = ppir_node_create_ssa(block, ppir_op_undef, &undef->def); + if (!node) + return NULL; + ppir_alu_node *alu = ppir_node_to_alu(node); + + ppir_dest *dest = &alu->dest; + dest->ssa.undef = true; + + return node; } static ppir_node *ppir_emit_tex(ppir_block *block, nir_instr *ni) @@ -371,12 +449,21 @@ nir_tex_instr *instr = nir_instr_as_tex(ni); ppir_load_texture_node *node; - if (instr->op != nir_texop_tex) { + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_txl: + break; + default: ppir_error("unsupported texop %d\n", instr->op); return NULL; } - node = ppir_node_create_dest(block, ppir_op_load_texture, &instr->dest, 0); + unsigned mask = 0; + if (!instr->dest.is_ssa) + mask = u_bit_consecutive(0, nir_tex_instr_dest_size(instr)); + + node = ppir_node_create_dest(block, ppir_op_load_texture, &instr->dest, mask); if (!node) return NULL; @@ -384,6 +471,7 @@ switch (instr->sampler_dim) { case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_CUBE: case GLSL_SAMPLER_DIM_RECT: case GLSL_SAMPLER_DIM_EXTERNAL: break; @@ -395,17 +483,24 @@ node->sampler_dim = instr->sampler_dim; for (int i = 0; i < instr->coord_components; i++) - node->src_coords.swizzle[i] = i; + node->src[0].swizzle[i] = i; for (int i = 0; i < instr->num_srcs; i++) { switch (instr->src[i].src_type) { case nir_tex_src_coord: - ppir_node_add_src(block->comp, &node->node, &node->src_coords, &instr->src[i].src, + ppir_node_add_src(block->comp, &node->node, &node->src[0], &instr->src[i].src, u_bit_consecutive(0, instr->coord_components)); + node->num_src++; + break; + case nir_tex_src_bias: + case nir_tex_src_lod: + node->lod_bias_en = true; + node->explicit_lod = (instr->src[i].src_type == nir_tex_src_lod); + ppir_node_add_src(block->comp, &node->node, &node->src[1], &instr->src[i].src, 1); + node->num_src++; break; default: ppir_error("unsupported texture source type\n"); - assert(0); return NULL; } } @@ -413,10 +508,48 @@ return &node->node; } +static ppir_block *ppir_get_block(ppir_compiler *comp, nir_block *nblock) +{ + ppir_block *block = _mesa_hash_table_u64_search(comp->blocks, (uint64_t)nblock); + + return block; +} + static ppir_node *ppir_emit_jump(ppir_block *block, nir_instr *ni) { - ppir_error("nir_jump_instr not support\n"); - return NULL; + ppir_node *node; + ppir_compiler *comp = block->comp; + ppir_branch_node *branch; + ppir_block *jump_block; + nir_jump_instr *jump = nir_instr_as_jump(ni); + + switch (jump->type) { + case nir_jump_break: { + assert(comp->current_block->successors[0]); + assert(!comp->current_block->successors[1]); + jump_block = comp->current_block->successors[0]; + } + break; + case nir_jump_continue: + jump_block = comp->loop_cont_block; + break; + default: + ppir_error("nir_jump_instr not support\n"); + return NULL; + } + + assert(jump_block != NULL); + + node = ppir_node_create(block, ppir_op_branch, -1, 0); + if (!node) + return NULL; + branch = ppir_node_to_branch(node); + + /* Unconditional */ + branch->num_src = 0; + branch->target = jump_block; + + return node; } static ppir_node *(*ppir_emit_instr[nir_instr_type_phi])(ppir_block *, nir_instr *) = { @@ -437,17 +570,18 @@ list_inithead(&block->node_list); list_inithead(&block->instr_list); + block->comp = comp; + return block; } static bool ppir_emit_block(ppir_compiler *comp, nir_block *nblock) { - ppir_block *block = ppir_block_create(comp); - if (!block) - return false; + ppir_block *block = ppir_get_block(comp, nblock); + + comp->current_block = block; list_addtail(&block->list, &comp->block_list); - block->comp = comp; nir_foreach_instr(instr, nblock) { assert(instr->type < nir_instr_type_phi); @@ -461,16 +595,99 @@ return true; } -static bool ppir_emit_if(ppir_compiler *comp, nir_if *nif) +static bool ppir_emit_cf_list(ppir_compiler *comp, struct exec_list *list); + +static bool ppir_emit_if(ppir_compiler *comp, nir_if *if_stmt) { - ppir_error("if nir_cf_node not support\n"); - return false; + ppir_node *node; + ppir_branch_node *else_branch, *after_branch; + nir_block *nir_else_block = nir_if_first_else_block(if_stmt); + bool empty_else_block = + (nir_else_block == nir_if_last_else_block(if_stmt) && + exec_list_is_empty(&nir_else_block->instr_list)); + ppir_block *block = comp->current_block; + + node = ppir_node_create(block, ppir_op_branch, -1, 0); + if (!node) + return false; + else_branch = ppir_node_to_branch(node); + ppir_node_add_src(block->comp, node, &else_branch->src[0], + &if_stmt->condition, 1); + else_branch->num_src = 1; + /* Negate condition to minimize branching. We're generating following: + * current_block: { ...; if (!statement) branch else_block; } + * then_block: { ...; branch after_block; } + * else_block: { ... } + * after_block: { ... } + * + * or if else list is empty: + * block: { if (!statement) branch else_block; } + * then_block: { ... } + * else_block: after_block: { ... } + */ + else_branch->negate = true; + list_addtail(&else_branch->node.list, &block->node_list); + + ppir_emit_cf_list(comp, &if_stmt->then_list); + if (empty_else_block) { + nir_block *nblock = nir_if_last_else_block(if_stmt); + assert(nblock->successors[0]); + assert(!nblock->successors[1]); + else_branch->target = ppir_get_block(comp, nblock->successors[0]); + /* Add empty else block to the list */ + list_addtail(&block->successors[1]->list, &comp->block_list); + return true; + } + + else_branch->target = ppir_get_block(comp, nir_if_first_else_block(if_stmt)); + + nir_block *last_then_block = nir_if_last_then_block(if_stmt); + assert(last_then_block->successors[0]); + assert(!last_then_block->successors[1]); + block = ppir_get_block(comp, last_then_block); + node = ppir_node_create(block, ppir_op_branch, -1, 0); + if (!node) + return false; + after_branch = ppir_node_to_branch(node); + /* Unconditional */ + after_branch->num_src = 0; + after_branch->target = ppir_get_block(comp, last_then_block->successors[0]); + /* Target should be after_block, will fixup later */ + list_addtail(&after_branch->node.list, &block->node_list); + + ppir_emit_cf_list(comp, &if_stmt->else_list); + + return true; } static bool ppir_emit_loop(ppir_compiler *comp, nir_loop *nloop) { - ppir_error("loop nir_cf_node not support\n"); - return false; + ppir_block *save_loop_cont_block = comp->loop_cont_block; + ppir_block *block; + ppir_branch_node *loop_branch; + nir_block *loop_last_block; + ppir_node *node; + + comp->loop_cont_block = ppir_get_block(comp, nir_loop_first_block(nloop)); + + ppir_emit_cf_list(comp, &nloop->body); + + loop_last_block = nir_loop_last_block(nloop); + block = ppir_get_block(comp, loop_last_block); + node = ppir_node_create(block, ppir_op_branch, -1, 0); + if (!node) + return false; + loop_branch = ppir_node_to_branch(node); + /* Unconditional */ + loop_branch->num_src = 0; + loop_branch->target = comp->loop_cont_block; + list_addtail(&loop_branch->node.list, &block->node_list); + + comp->loop_cont_block = save_loop_cont_block; + + comp->num_loops++; + + return true; } static bool ppir_emit_function(ppir_compiler *comp, nir_function_impl *nfunc) @@ -518,6 +735,7 @@ list_inithead(&comp->block_list); list_inithead(&comp->reg_list); + comp->blocks = _mesa_hash_table_u64_create(prog); comp->var_nodes = (ppir_node **)(comp + 1); comp->reg_base = num_ssa; @@ -553,12 +771,14 @@ */ list_for_each_entry(ppir_block, block, &comp->block_list, list) { ppir_node *prev_node = NULL; - list_for_each_entry(ppir_node, node, &block->node_list, list) { - if (node->type == ppir_node_type_discard || - node->type == ppir_node_type_store || - node->type == ppir_node_type_branch) { - if (prev_node) - ppir_node_add_dep(node, prev_node); + list_for_each_entry_rev(ppir_node, node, &block->node_list, list) { + if (prev_node && ppir_node_is_root(node) && node->op != ppir_op_const) { + ppir_node_add_dep(prev_node, node, ppir_dep_sequence); + } + if (node->op == ppir_op_discard || + node->op == ppir_op_store_color || + node->op == ppir_op_store_temp || + node->op == ppir_op_branch) { prev_node = node; } } @@ -586,6 +806,30 @@ free(shaderdb); } +static void ppir_add_write_after_read_deps(ppir_compiler *comp) +{ + list_for_each_entry(ppir_block, block, &comp->block_list, list) { + list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { + ppir_node *write = NULL; + list_for_each_entry_rev(ppir_node, node, &block->node_list, list) { + for (int i = 0; i < ppir_node_get_src_num(node); i++) { + ppir_src *src = ppir_node_get_src(node, i); + if (src && src->type == ppir_target_register && + src->reg == reg && + write) { + ppir_debug("Adding dep %d for write %d\n", node->index, write->index); + ppir_node_add_dep(write, node, ppir_dep_write_after_read); + } + } + ppir_dest *dest = ppir_node_get_dest(node); + if (dest && dest->type == ppir_target_register && + dest->reg == reg) + write = node; + } + } + } +} + bool ppir_compile_nir(struct lima_fs_shader_state *prog, struct nir_shader *nir, struct ra_regs *ra, struct pipe_debug_callback *debug) @@ -597,6 +841,49 @@ comp->ra = ra; + /* 1st pass: create ppir blocks */ + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(nblock, function->impl) { + ppir_block *block = ppir_block_create(comp); + if (!block) + return false; + block->index = nblock->index; + _mesa_hash_table_u64_insert(comp->blocks, (uint64_t)nblock, block); + } + } + + /* 2nd pass: populate successors */ + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(nblock, function->impl) { + ppir_block *block = ppir_get_block(comp, nblock); + assert(block); + + for (int i = 0; i < 2; i++) { + if (nblock->successors[i]) + block->successors[i] = ppir_get_block(comp, nblock->successors[i]); + } + } + } + + /* Validate outputs, we support only gl_FragColor */ + nir_foreach_variable(var, &nir->outputs) { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + case FRAG_RESULT_DATA0: + break; + default: + ppir_error("unsupported output type\n"); + goto err_out0; + break; + } + } + foreach_list_typed(nir_register, reg, node, &func->registers) { ppir_reg *r = rzalloc(comp, ppir_reg); if (!r) @@ -604,8 +891,6 @@ r->index = reg->index; r->num_components = reg->num_components; - r->live_in = INT_MAX; - r->live_out = 0; r->is_head = false; list_addtail(&r->list, &comp->reg_list); } @@ -617,13 +902,16 @@ if (comp->discard_block) list_addtail(&comp->discard_block->list, &comp->block_list); - ppir_add_ordering_deps(comp); - ppir_node_print_prog(comp); if (!ppir_lower_prog(comp)) goto err_out0; + ppir_add_ordering_deps(comp); + ppir_add_write_after_read_deps(comp); + + ppir_node_print_prog(comp); + if (!ppir_node_to_instr(comp)) goto err_out0; @@ -638,10 +926,12 @@ ppir_print_shader_db(nir, comp, debug); + _mesa_hash_table_u64_destroy(comp->blocks, NULL); ralloc_free(comp); return true; err_out0: + _mesa_hash_table_u64_destroy(comp->blocks, NULL); ralloc_free(comp); return false; } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/node.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/node.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/node.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/node.c 2020-06-12 01:21:17.000000000 +0000 @@ -260,6 +260,13 @@ PPIR_INSTR_SLOT_VARYING, PPIR_INSTR_SLOT_END }, }, + [ppir_op_load_coords_reg] = { + .name = "ld_coords_reg", + .type = ppir_node_type_load, + .slots = (int []) { + PPIR_INSTR_SLOT_VARYING, PPIR_INSTR_SLOT_END + }, + }, [ppir_op_load_fragcoord] = { .name = "ld_fragcoord", .type = ppir_node_type_load, @@ -308,7 +315,11 @@ }, [ppir_op_store_color] = { .name = "st_col", - .type = ppir_node_type_store, + .type = ppir_node_type_alu, + .slots = (int []) { + PPIR_INSTR_SLOT_ALU_VEC_ADD, PPIR_INSTR_SLOT_ALU_VEC_MUL, + PPIR_INSTR_SLOT_END + }, }, [ppir_op_store_temp] = { .name = "st_temp", @@ -331,6 +342,18 @@ PPIR_INSTR_SLOT_BRANCH, PPIR_INSTR_SLOT_END }, }, + [ppir_op_undef] = { + .name = "undef", + .type = ppir_node_type_alu, + .slots = (int []) { + }, + }, + [ppir_op_dummy] = { + .name = "dummy", + .type = ppir_node_type_alu, + .slots = (int []) { + }, + }, }; void *ppir_node_create(ppir_block *block, ppir_op op, int index, unsigned mask) @@ -377,7 +400,8 @@ return node; } -void ppir_node_add_dep(ppir_node *succ, ppir_node *pred) +void ppir_node_add_dep(ppir_node *succ, ppir_node *pred, + ppir_dep_type type) { /* don't add dep for two nodes from different block */ if (succ->block != pred->block) @@ -392,6 +416,7 @@ ppir_dep *dep = ralloc(succ, ppir_dep); dep->pred = pred; dep->succ = succ; + dep->type = type; list_addtail(&dep->pred_link, &succ->pred_list); list_addtail(&dep->succ_link, &pred->succ_list); } @@ -437,7 +462,8 @@ case ppir_node_type_load_texture: { ppir_load_texture_node *load_texture = ppir_node_to_load_texture(parent); - _ppir_node_replace_child(&load_texture->src_coords, old_child, new_child); + for (int i = 0; i < load_texture->num_src; i++) + _ppir_node_replace_child(ppir_node_get_src(parent, i), old_child, new_child); break; } case ppir_node_type_store: @@ -459,6 +485,21 @@ list_addtail(&dep->succ_link, &new_pred->succ_list); } +ppir_dep *ppir_dep_for_pred(ppir_node *node, ppir_node *pred) +{ + if (!pred) + return NULL; + + if (node->block != pred->block) + return NULL; + + ppir_node_foreach_pred(node, dep) { + if (dep->pred == pred) + return dep; + } + return NULL; +} + void ppir_node_replace_all_succ(ppir_node *dst, ppir_node *src) { ppir_node_foreach_succ_safe(src, dep) { @@ -563,7 +604,7 @@ printf("========prog========\n"); list_for_each_entry(ppir_block, block, &comp->block_list, list) { - printf("-------block------\n"); + printf("-------block %3d-------\n", block->index); list_for_each_entry(ppir_node, node, &block->node_list, list) { if (ppir_node_is_root(node)) ppir_node_print_node(node, 0); @@ -571,3 +612,160 @@ } printf("====================\n"); } + +static ppir_node *ppir_node_clone_const(ppir_block *block, ppir_node *node) +{ + ppir_const_node *cnode = ppir_node_to_const(node); + ppir_const_node *new_cnode = ppir_node_create(block, ppir_op_const, -1, 0); + + if (!new_cnode) + return NULL; + + list_addtail(&new_cnode->node.list, &block->node_list); + + new_cnode->constant.num = cnode->constant.num; + for (int i = 0; i < cnode->constant.num; i++) { + new_cnode->constant.value[i] = cnode->constant.value[i]; + } + new_cnode->dest.type = ppir_target_ssa; + new_cnode->dest.ssa.num_components = cnode->dest.ssa.num_components; + new_cnode->dest.write_mask = cnode->dest.write_mask; + + return &new_cnode->node; +} + +static ppir_node * +ppir_node_clone_load(ppir_block *block, ppir_node *node) +{ + ppir_load_node *load_node = ppir_node_to_load(node); + ppir_load_node *new_lnode = ppir_node_create(block, node->op, -1, 0); + + if (!new_lnode) + return NULL; + + list_addtail(&new_lnode->node.list, &block->node_list); + + new_lnode->num_components = load_node->num_components; + new_lnode->index = load_node->index; + + ppir_dest *dest = ppir_node_get_dest(node); + new_lnode->dest = *dest; + + ppir_src *src = ppir_node_get_src(node, 0); + if (src) { + new_lnode->num_src = 1; + switch (src->type) { + case ppir_target_ssa: + ppir_node_target_assign(&new_lnode->src, src->node); + ppir_node_add_dep(&new_lnode->node, src->node, ppir_dep_src); + break; + case ppir_target_register: + new_lnode->src.type = src->type; + new_lnode->src.reg = src->reg; + new_lnode->src.node = NULL; + break; + default: + /* Load nodes can't consume pipeline registers */ + assert(0); + } + } + + return &new_lnode->node; +} + +void +ppir_delete_if_orphan(ppir_block *block, ppir_node *node) +{ + ppir_dest *dest = ppir_node_get_dest(node); + if (!dest) + return; + + ppir_node_foreach_succ_safe(node, dep) { + ppir_node *succ = dep->succ; + for (int i = 0; i < ppir_node_get_src_num(succ); i++) { + ppir_src *src = ppir_node_get_src(succ, i); + if (!src) + continue; + if (ppir_node_target_equal(src, dest)) + return; + } + } + + ppir_node_delete(node); +} + +ppir_node *ppir_node_clone(ppir_block *block, ppir_node *node) +{ + switch (node->op) { + case ppir_op_const: + return ppir_node_clone_const(block, node); + case ppir_op_load_uniform: + case ppir_op_load_varying: + case ppir_op_load_temp: + case ppir_op_load_coords: + case ppir_op_load_coords_reg: + return ppir_node_clone_load(block, node); + default: + return NULL; + } +} + +ppir_node *ppir_node_insert_mov(ppir_node *node) +{ + ppir_node *move = ppir_node_create(node->block, ppir_op_mov, -1, 0); + if (unlikely(!move)) + return NULL; + + ppir_dest *dest = ppir_node_get_dest(node); + ppir_alu_node *alu = ppir_node_to_alu(move); + alu->dest = *dest; + alu->num_src = 1; + ppir_node_target_assign(alu->src, node); + + for (int s = 0; s < 4; s++) + alu->src->swizzle[s] = s; + + ppir_node_replace_all_succ(move, node); + ppir_node_add_dep(move, node, ppir_dep_src); + list_addtail(&move->list, &node->list); + + return move; +} + +ppir_node *ppir_node_insert_mov_all_blocks(ppir_node *old) +{ + ppir_node *move = ppir_node_insert_mov(old); + ppir_compiler *comp = old->block->comp; + + list_for_each_entry(ppir_block, block, &comp->block_list, list) { + if (old->block == block) + continue; + list_for_each_entry_safe(ppir_node, node, &block->node_list, list) { + for (int i = 0; i < ppir_node_get_src_num(node); i++){ + ppir_src *src = ppir_node_get_src(node, i); + if (!src) + continue; + if (src->node == old) + ppir_node_target_assign(src, move); + } + } + } + + return move; +} +bool ppir_node_has_single_src_succ(ppir_node *node) +{ + if (list_is_singular(&node->succ_list) && + list_first_entry(&node->succ_list, + ppir_dep, succ_link)->type == ppir_dep_src) + return true; + + int cnt = 0; + ppir_node_foreach_succ(node, dep) { + if (dep->type != ppir_dep_src) + continue; + cnt++; + } + + return cnt == 1; +} diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/node_to_instr.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/node_to_instr.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/node_to_instr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/node_to_instr.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,97 +37,6 @@ return true; } -static bool insert_to_each_succ_instr(ppir_block *block, ppir_node *node) -{ - ppir_dest *dest = ppir_node_get_dest(node); - assert(dest->type == ppir_target_ssa); - - ppir_node *move = NULL; - - ppir_node_foreach_succ_safe(node, dep) { - ppir_node *succ = dep->succ; - assert(succ->type == ppir_node_type_alu || - succ->type == ppir_node_type_branch); - - if (!ppir_instr_insert_node(succ->instr, node)) { - /* create a move node to insert for failed node */ - if (!move) { - move = ppir_node_create(block, ppir_op_mov, -1, 0); - if (unlikely(!move)) - return false; - - ppir_debug("node_to_instr create move %d for %d\n", - move->index, node->index); - - ppir_alu_node *alu = ppir_node_to_alu(move); - alu->dest = *dest; - alu->num_src = 1; - ppir_node_target_assign(alu->src, node); - for (int i = 0; i < 4; i++) - alu->src->swizzle[i] = i; - } - - ppir_node_replace_pred(dep, move); - ppir_node_replace_child(succ, node, move); - } - } - - if (move) { - if (!create_new_instr(block, move)) - return false; - - ASSERTED bool insert_result = - ppir_instr_insert_node(move->instr, node); - assert(insert_result); - - ppir_node_add_dep(move, node); - list_addtail(&move->list, &node->list); - } - - /* dupliacte node for each successor */ - - bool first = true; - struct list_head dup_list; - list_inithead(&dup_list); - - ppir_node_foreach_succ_safe(node, dep) { - ppir_node *succ = dep->succ; - - if (first) { - first = false; - node->instr = succ->instr; - continue; - } - - if (succ->instr == node->instr) - continue; - - list_for_each_entry(ppir_node, dup, &dup_list, list) { - if (succ->instr == dup->instr) { - ppir_node_replace_pred(dep, dup); - continue; - } - } - - ppir_node *dup = ppir_node_create(block, node->op, -1, 0); - if (unlikely(!dup)) - return false; - list_addtail(&dup->list, &dup_list); - - ppir_debug("node_to_instr duplicate %s %d from %d\n", - ppir_op_infos[dup->op].name, dup->index, node->index); - - ppir_instr *instr = succ->instr; - dup->instr = instr; - dup->instr_pos = node->instr_pos; - ppir_node_replace_pred(dep, dup); - } - - list_splicetail(&dup_list, &node->list); - - return true; -} - /* * If a node has a pipeline dest, schedule it in the same instruction as its * successor. @@ -141,7 +50,7 @@ if (!dest || dest->type != ppir_target_pipeline) return false; - assert(ppir_node_has_single_succ(node)); + assert(ppir_node_has_single_src_succ(node)); ppir_node *succ = ppir_node_first_succ(node); assert(succ); assert(succ->instr); @@ -157,11 +66,15 @@ switch (node->type) { case ppir_node_type_alu: { + /* don't create an instr for undef node */ + if (node->op == ppir_op_undef) + break; + /* merge pred mul and succ add in the same instr can save a reg * by using pipeline reg ^vmul/^fmul */ ppir_alu_node *alu = ppir_node_to_alu(node); if (alu->dest.type == ppir_target_ssa && - ppir_node_has_single_succ(node)) { + ppir_node_has_single_src_succ(node)) { ppir_node *succ = ppir_node_first_succ(node); if (succ->instr_pos == PPIR_INSTR_SLOT_ALU_VEC_ADD) { node->instr_pos = PPIR_INSTR_SLOT_ALU_VEC_MUL; @@ -178,90 +91,71 @@ if (!node->instr && !create_new_instr(block, node)) return false; + if (node->op == ppir_op_store_color) + node->instr->is_end = true; + break; } case ppir_node_type_load: - if (node->op == ppir_op_load_varying || - node->op == ppir_op_load_fragcoord || - node->op == ppir_op_load_pointcoord || - node->op == ppir_op_load_frontface) { - if (!create_new_instr(block, node)) - return false; - } - else { - /* not supported yet */ - assert(0); - return false; - } - break; case ppir_node_type_load_texture: + { if (!create_new_instr(block, node)) return false; - break; - case ppir_node_type_const: - if (!insert_to_each_succ_instr(block, node)) - return false; - break; - case ppir_node_type_store: - { - if (node->op == ppir_op_store_temp) { - if (!create_new_instr(block, node)) - return false; + + /* load varying output can be a register, it doesn't need a mov */ + switch (node->op) { + case ppir_op_load_varying: + case ppir_op_load_coords: + case ppir_op_load_coords_reg: + case ppir_op_load_fragcoord: + case ppir_op_load_pointcoord: + case ppir_op_load_frontface: + return true; + default: break; } - /* Only the store color node should appear here. - * Currently we always insert a move node as the end instr. - * But it should only be done when: - * 1. store a const node - * 2. store a load node - * 3. store a reg assigned in another block like loop/if - */ - - assert(node->op == ppir_op_store_color); + /* Load cannot be pipelined, likely slot is already taken. Create a mov */ + assert(ppir_node_has_single_src_succ(node)); + ppir_dest *dest = ppir_node_get_dest(node); + assert(dest->type == ppir_target_pipeline); + ppir_pipeline pipeline_reg = dest->pipeline; + + /* Turn dest back to SSA, so we can update predecessors */ + ppir_node *succ = ppir_node_first_succ(node); + ppir_src *succ_src = ppir_node_get_src_for_pred(succ, node); + dest->type = ppir_target_ssa; + dest->ssa.index = -1; + ppir_node_target_assign(succ_src, node); - ppir_node *move = ppir_node_create(block, ppir_op_mov, -1, 0); + ppir_node *move = ppir_node_insert_mov(node); if (unlikely(!move)) return false; - ppir_debug("node_to_instr create move %d from store %d\n", - move->index, node->index); - - ppir_node_foreach_pred_safe(node, dep) { - ppir_node *pred = dep->pred; - /* we can't do this in this function except here as this - * store is the root of this recursion */ - ppir_node_remove_dep(dep); - ppir_node_add_dep(move, pred); - } - - ppir_node_add_dep(node, move); - list_addtail(&move->list, &node->list); - - ppir_alu_node *alu = ppir_node_to_alu(move); - ppir_store_node *store = ppir_node_to_store(node); - alu->src[0] = store->src; - alu->num_src = 1; - - alu->dest.type = ppir_target_ssa; - alu->dest.ssa.num_components = 4; - alu->dest.ssa.live_in = INT_MAX; - alu->dest.ssa.live_out = 0; - alu->dest.write_mask = 0xf; + ppir_src *mov_src = ppir_node_get_src(move, 0); + mov_src->type = dest->type = ppir_target_pipeline; + mov_src->pipeline = dest->pipeline = pipeline_reg; - store->src.type = ppir_target_ssa; - store->src.ssa = &alu->dest.ssa; + ppir_debug("node_to_instr create move %d for load %d\n", + move->index, node->index); - if (!create_new_instr(block, move)) + if (!ppir_instr_insert_node(node->instr, move)) return false; - move->instr->is_end = true; - node->instr = move->instr; - - /* use move for the following recursion */ - *next = move; break; } + case ppir_node_type_const: + /* Const nodes are supposed to go through do_node_to_instr_pipeline() */ + assert(false); + break; + case ppir_node_type_store: + { + if (node->op == ppir_op_store_temp) { + if (!create_new_instr(block, node)) + return false; + break; + } + } case ppir_node_type_discard: if (!create_new_instr(block, node)) return false; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/ppir.h mesa-20.0.8/src/gallium/drivers/lima/ir/pp/ppir.h --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/ppir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/ppir.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,7 @@ #include "util/u_math.h" #include "util/list.h" +#include "util/set.h" #include "ir/lima_ir.h" @@ -99,6 +100,7 @@ ppir_op_load_uniform, ppir_op_load_varying, ppir_op_load_coords, + ppir_op_load_coords_reg, ppir_op_load_fragcoord, ppir_op_load_pointcoord, ppir_op_load_frontface, @@ -113,6 +115,9 @@ ppir_op_discard, ppir_op_branch, + ppir_op_undef, + ppir_op_dummy, + ppir_op_num, } ppir_op; @@ -134,8 +139,15 @@ extern const ppir_op_info ppir_op_infos[]; +typedef enum { + ppir_dep_src, + ppir_dep_write_after_read, + ppir_dep_sequence, +} ppir_dep_type; + typedef struct { void *pred, *succ; + ppir_dep_type type; struct list_head pred_link; struct list_head succ_link; } ppir_dep; @@ -169,15 +181,15 @@ typedef struct ppir_reg { struct list_head list; int index; + int regalloc_index; int num_components; + /* whether this reg has to start from the x component * of a full physical reg, this is true for reg used - * in load/store instr which has no swizzle field - */ + * in load/store instr which has no swizzle field */ bool is_head; - /* instr live range */ - int live_in, live_out; bool spilled; + bool undef; } ppir_reg; typedef enum { @@ -245,6 +257,7 @@ int num_components; ppir_dest dest; ppir_src src; + int num_src; } ppir_load_node; typedef struct { @@ -257,9 +270,13 @@ typedef struct { ppir_node node; ppir_dest dest; - ppir_src src_coords; /* not to be used after lowering */ + ppir_src src[2]; /* src[0] temporarily stores src_coords, + not to be used after lowering */ + int num_src; int sampler; int sampler_dim; + bool lod_bias_en; + bool explicit_lod; } ppir_load_texture_node; typedef struct { @@ -283,6 +300,11 @@ PPIR_INSTR_SLOT_ALU_END = PPIR_INSTR_SLOT_ALU_COMBINE, }; +struct ppir_liveness { + ppir_reg *reg; + unsigned mask : 4; +}; + typedef struct ppir_instr { struct list_head list; int index; @@ -302,25 +324,43 @@ bool scheduled; int offset; int encode_size; + + /* for liveness analysis */ + struct ppir_liveness *live_in; + struct ppir_liveness *live_out; + struct set *live_in_set; + struct set *live_out_set; } ppir_instr; typedef struct ppir_block { struct list_head list; struct list_head node_list; struct list_head instr_list; + + struct ppir_block *successors[2]; + struct ppir_compiler *comp; /* for scheduler */ int sched_instr_index; int sched_instr_base; + int index; + + /* for liveness analysis */ + struct ppir_liveness *live_in; + struct ppir_liveness *live_out; + struct set *live_in_set; + struct set *live_out_set; } ppir_block; typedef struct { ppir_node node; ppir_src src[2]; + int num_src; bool cond_gt; bool cond_eq; bool cond_lt; + bool negate; ppir_block *target; } ppir_branch_node; @@ -329,6 +369,7 @@ typedef struct ppir_compiler { struct list_head block_list; + struct hash_table_u64 *blocks; int cur_index; int cur_instr_index; @@ -353,25 +394,34 @@ int num_fills; ppir_block *discard_block; + ppir_block *current_block; + ppir_block *loop_break_block; + ppir_block *loop_cont_block; } ppir_compiler; void *ppir_node_create(ppir_block *block, ppir_op op, int index, unsigned mask); -void ppir_node_add_dep(ppir_node *succ, ppir_node *pred); +void ppir_node_add_dep(ppir_node *succ, ppir_node *pred, ppir_dep_type type); void ppir_node_remove_dep(ppir_dep *dep); void ppir_node_delete(ppir_node *node); void ppir_node_print_prog(ppir_compiler *comp); void ppir_node_replace_child(ppir_node *parent, ppir_node *old_child, ppir_node *new_child); void ppir_node_replace_all_succ(ppir_node *dst, ppir_node *src); void ppir_node_replace_pred(ppir_dep *dep, ppir_node *new_pred); +void ppir_delete_if_orphan(ppir_block *block, ppir_node *node); +ppir_dep *ppir_dep_for_pred(ppir_node *node, ppir_node *pred); +ppir_node *ppir_node_clone(ppir_block *block, ppir_node *node); +/* Assumes that node successors are in the same block */ +ppir_node *ppir_node_insert_mov(ppir_node *node); +ppir_node *ppir_node_insert_mov_all_blocks(ppir_node *node); static inline bool ppir_node_is_root(ppir_node *node) { - return list_empty(&node->succ_list); + return list_is_empty(&node->succ_list); } static inline bool ppir_node_is_leaf(ppir_node *node) { - return list_empty(&node->pred_list); + return list_is_empty(&node->pred_list); } static inline bool ppir_node_has_single_succ(ppir_node *node) @@ -379,6 +429,8 @@ return list_is_singular(&node->succ_list); } +bool ppir_node_has_single_src_succ(ppir_node *node); + static inline ppir_node *ppir_node_first_succ(ppir_node *node) { return list_first_entry(&node->succ_list, ppir_dep, succ_link)->succ; @@ -433,9 +485,11 @@ case ppir_node_type_alu: return ppir_node_to_alu(node)->num_src; case ppir_node_type_branch: - return 2; - case ppir_node_type_load_texture: + return ppir_node_to_branch(node)->num_src; case ppir_node_type_load: + return ppir_node_to_load(node)->num_src; + case ppir_node_type_load_texture: + return ppir_node_to_load_texture(node)->num_src; case ppir_node_type_store: return 1; default: @@ -456,7 +510,7 @@ case ppir_node_type_branch: return &ppir_node_to_branch(node)->src[idx]; case ppir_node_type_load_texture: - return &ppir_node_to_load_texture(node)->src_coords; + return &ppir_node_to_load_texture(node)->src[idx]; case ppir_node_type_load: return &ppir_node_to_load(node)->src; case ppir_node_type_store: @@ -468,6 +522,41 @@ return NULL; } +static inline ppir_reg *ppir_src_get_reg(ppir_src *src) +{ + switch (src->type) { + case ppir_target_ssa: + return src->ssa; + case ppir_target_register: + return src->reg; + default: + return NULL; + } +} + +static inline ppir_reg *ppir_dest_get_reg(ppir_dest *dest) +{ + switch (dest->type) { + case ppir_target_ssa: + return &dest->ssa; + case ppir_target_register: + return dest->reg; + default: + return NULL; + } +} + +static inline ppir_src *ppir_node_get_src_for_pred(ppir_node *node, ppir_node *pred) +{ + for (int i = 0; i < ppir_node_get_src_num(node); i++) { + ppir_src *src = ppir_node_get_src(node, i); + if (src && src->node == pred) + return src; + } + + return NULL; +} + static inline void ppir_node_target_assign(ppir_src *src, ppir_node *node) { ppir_dest *dest = ppir_node_get_dest(node); @@ -538,6 +627,17 @@ return -1; } +static inline int ppir_src_get_mask(ppir_src *src) +{ + ppir_reg *reg = ppir_src_get_reg(src); + int mask = 0; + + for (int i = 0; i < reg->num_components; i++) + mask |= (1 << src->swizzle[i]); + + return mask; +} + static inline bool ppir_target_is_scaler(ppir_dest *dest) { switch (dest->type) { @@ -582,12 +682,12 @@ static inline bool ppir_instr_is_root(ppir_instr *instr) { - return list_empty(&instr->succ_list); + return list_is_empty(&instr->succ_list); } static inline bool ppir_instr_is_leaf(ppir_instr *instr) { - return list_empty(&instr->pred_list); + return list_is_empty(&instr->pred_list); } bool ppir_lower_prog(ppir_compiler *comp); @@ -595,5 +695,6 @@ bool ppir_schedule_prog(ppir_compiler *comp); bool ppir_regalloc_prog(ppir_compiler *comp); bool ppir_codegen_prog(ppir_compiler *comp); +void ppir_liveness_analysis(ppir_compiler *comp); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/regalloc.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/regalloc.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/regalloc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/regalloc.c 2020-06-12 01:21:17.000000000 +0000 @@ -134,18 +134,6 @@ return ret; } -static ppir_reg *get_src_reg(ppir_src *src) -{ - switch (src->type) { - case ppir_target_ssa: - return src->ssa; - case ppir_target_register: - return src->reg; - default: - return NULL; - } -} - static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp) { list_for_each_entry(ppir_block, block, &comp->block_list, list) { @@ -169,53 +157,6 @@ } } -static ppir_reg *ppir_regalloc_build_liveness_info(ppir_compiler *comp) -{ - ppir_reg *ret = NULL; - - list_for_each_entry(ppir_block, block, &comp->block_list, list) { - list_for_each_entry(ppir_node, node, &block->node_list, list) { - if (node->op == ppir_op_store_color) { - ppir_store_node *store = ppir_node_to_store(node); - if (store->src.type == ppir_target_ssa) - ret = store->src.ssa; - else - ret = store->src.reg; - ret->live_out = INT_MAX; - continue; - } - - if (!node->instr || node->op == ppir_op_const) - continue; - - /* update reg live_in from node dest (write) */ - ppir_dest *dest = ppir_node_get_dest(node); - if (dest) { - ppir_reg *reg = NULL; - - if (dest->type == ppir_target_ssa) { - reg = &dest->ssa; - } - else if (dest->type == ppir_target_register) - reg = dest->reg; - - if (reg && node->instr->seq < reg->live_in) - reg->live_in = node->instr->seq; - } - - /* update reg live_out from node src (read) */ - for (int i = 0; i < ppir_node_get_src_num(node); i++) - { - ppir_reg *reg = get_src_reg(ppir_node_get_src(node, i)); - if (reg && node->instr->seq > reg->live_out) - reg->live_out = node->instr->seq; - } - } - } - - return ret; -} - static int get_phy_reg_index(int reg) { int i; @@ -308,35 +249,49 @@ return true; } -static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp, - ppir_block *block, - ppir_node *node, ppir_src *src, - ppir_alu_node *move_alu) +static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block, + ppir_node *node, ppir_src *src, + ppir_node **fill_node) { - /* alu nodes may have multiple references to the same value. - * try to avoid unnecessary loads for the same alu node by + /* nodes might have multiple references to the same value. + * avoid creating unnecessary loads for the same fill by * saving the node resulting from the temporary load */ - if (move_alu) + if (*fill_node) goto update_src; + int num_components = src->reg->num_components; + /* alloc new node to load value */ ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0); if (!load_node) - return NULL; + return false; list_addtail(&load_node->list, &node->list); comp->num_fills++; ppir_load_node *load = ppir_node_to_load(load_node); load->index = -comp->prog->stack_size; /* index sizes are negative */ - load->num_components = 4; + load->num_components = num_components; ppir_dest *ld_dest = &load->dest; ld_dest->type = ppir_target_pipeline; ld_dest->pipeline = ppir_pipeline_reg_uniform; - ld_dest->write_mask = 0xf; + ld_dest->write_mask = u_bit_consecutive(0, num_components); + + /* If the uniform slot is empty, we can insert the load_temp + * there and use it directly. Exceptionally, if the node is in the + * varying or texld slot, this doesn't work. */ + if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] && + node->instr_pos != PPIR_INSTR_SLOT_VARYING && + node->instr_pos != PPIR_INSTR_SLOT_TEXLD) { + ppir_node_target_assign(src, load_node); + *fill_node = load_node; + return ppir_instr_insert_node(node->instr, load_node); + } - create_new_instr_before(block, node->instr, load_node); + /* Uniform slot was taken, so fall back to a new instruction with a mov */ + if (!create_new_instr_before(block, node->instr, load_node)) + return false; /* Create move node */ ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0); @@ -344,7 +299,7 @@ return false; list_addtail(&move_node->list, &node->list); - move_alu = ppir_node_to_alu(move_node); + ppir_alu_node *move_alu = ppir_node_to_alu(move_node); move_alu->num_src = 1; move_alu->src->type = ppir_target_pipeline; @@ -354,10 +309,9 @@ ppir_dest *alu_dest = &move_alu->dest; alu_dest->type = ppir_target_ssa; - alu_dest->ssa.num_components = 4; - alu_dest->ssa.live_in = INT_MAX; - alu_dest->ssa.live_out = 0; - alu_dest->write_mask = 0xf; + alu_dest->ssa.num_components = num_components; + alu_dest->ssa.spilled = true; + alu_dest->write_mask = u_bit_consecutive(0, num_components); list_addtail(&alu_dest->ssa.list, &comp->reg_list); @@ -368,49 +322,28 @@ ppir_node_foreach_pred_safe(node, dep) { ppir_node *pred = dep->pred; ppir_node_remove_dep(dep); - ppir_node_add_dep(load_node, pred); + ppir_node_add_dep(load_node, pred, ppir_dep_src); } - ppir_node_add_dep(node, move_node); - ppir_node_add_dep(move_node, load_node); - -update_src: - /* switch node src to use the new ssa instead */ - src->type = ppir_target_ssa; - src->ssa = &move_alu->dest.ssa; + ppir_node_add_dep(node, move_node, ppir_dep_src); + ppir_node_add_dep(move_node, load_node, ppir_dep_src); - return move_alu; -} + *fill_node = move_node; -static ppir_reg *create_reg(ppir_compiler *comp, int num_components) -{ - ppir_reg *r = rzalloc(comp, ppir_reg); - if (!r) - return NULL; - - r->num_components = num_components; - r->live_in = INT_MAX; - r->live_out = 0; - r->is_head = false; - list_addtail(&r->list, &comp->reg_list); +update_src: + /* switch node src to use the fill node dest */ + ppir_node_target_assign(src, *fill_node); - return r; + return true; } -static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block, - ppir_node *node, ppir_dest *dest) +static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block, + ppir_node *node) { + ppir_dest *dest = ppir_node_get_dest(node); assert(dest != NULL); - ppir_reg *reg = NULL; - if (dest->type == ppir_target_register) { - reg = dest->reg; - reg->num_components = 4; - reg->spilled = true; - } - else { - reg = create_reg(comp, 4); - reg->spilled = true; - list_del(&dest->ssa.list); - } + assert(dest->type == ppir_target_register); + ppir_reg *reg = dest->reg; + int num_components = reg->num_components; /* alloc new node to load value */ ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0); @@ -422,13 +355,16 @@ ppir_load_node *load = ppir_node_to_load(load_node); load->index = -comp->prog->stack_size; /* index sizes are negative */ - load->num_components = 4; + load->num_components = num_components; load->dest.type = ppir_target_pipeline; load->dest.pipeline = ppir_pipeline_reg_uniform; - load->dest.write_mask = 0xf; + load->dest.write_mask = u_bit_consecutive(0, num_components); - create_new_instr_before(block, node->instr, load_node); + /* New instruction is needed since we're updating a dest register + * and we can't write to the uniform pipeline reg */ + if (!create_new_instr_before(block, node->instr, load_node)) + return false; /* Create move node */ ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0); @@ -446,7 +382,7 @@ move_alu->dest.type = ppir_target_register; move_alu->dest.reg = reg; - move_alu->dest.write_mask = 0x0f; + move_alu->dest.write_mask = u_bit_consecutive(0, num_components); if (!ppir_instr_insert_node(load_node->instr, move_node)) return false; @@ -454,13 +390,20 @@ ppir_node_foreach_pred_safe(node, dep) { ppir_node *pred = dep->pred; ppir_node_remove_dep(dep); - ppir_node_add_dep(load_node, pred); + ppir_node_add_dep(load_node, pred, ppir_dep_src); } - ppir_node_add_dep(node, move_node); - ppir_node_add_dep(move_node, load_node); + ppir_node_add_dep(node, move_node, ppir_dep_src); + ppir_node_add_dep(move_node, load_node, ppir_dep_src); - dest->type = ppir_target_register; - dest->reg = reg; + return true; +} + +static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block, + ppir_node *node) +{ + ppir_dest *dest = ppir_node_get_dest(node); + assert(dest != NULL); + ppir_reg *reg = ppir_dest_get_reg(dest); /* alloc new node to store value */ ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0); @@ -472,22 +415,27 @@ ppir_store_node *store = ppir_node_to_store(store_node); store->index = -comp->prog->stack_size; /* index sizes are negative */ - store->num_components = 4; - store->src.type = ppir_target_register; - store->src.reg = dest->reg; + ppir_node_target_assign(&store->src, node); + store->num_components = reg->num_components; /* insert the new node as successor */ ppir_node_foreach_succ_safe(node, dep) { ppir_node *succ = dep->succ; ppir_node_remove_dep(dep); - ppir_node_add_dep(succ, store_node); + ppir_node_add_dep(succ, store_node, ppir_dep_src); } - ppir_node_add_dep(store_node, node); + ppir_node_add_dep(store_node, node, ppir_dep_src); - create_new_instr_after(block, node->instr, store_node); + /* If the store temp slot is empty, we can insert the store_temp + * there and use it directly. Exceptionally, if the node is in the + * combine slot, this doesn't work. */ + if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] && + node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE) + return ppir_instr_insert_node(node->instr, store_node); - return true; + /* Not possible to merge store, so fall back to a new instruction */ + return create_new_instr_after(block, node->instr, store_node); } static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen) @@ -496,45 +444,28 @@ list_for_each_entry(ppir_node, node, &block->node_list, list) { ppir_dest *dest = ppir_node_get_dest(node); - ppir_reg *reg = NULL; - if (dest) { - if (dest->type == ppir_target_ssa) - reg = &dest->ssa; - else if (dest->type == ppir_target_register) - reg = dest->reg; - - if (reg == chosen) - ppir_update_spilled_dest(comp, block, node, dest); - } - - switch (node->type) { - case ppir_node_type_alu: - { - /* alu nodes may have multiple references to the same value. - * try to avoid unnecessary loads for the same alu node by - * saving the node resulting from the temporary load */ - ppir_alu_node *move_alu = NULL; - ppir_alu_node *alu = ppir_node_to_alu(node); - for (int i = 0; i < alu->num_src; i++) { - reg = get_src_reg(alu->src + i); - if (reg == chosen) { - move_alu = ppir_update_spilled_src(comp, block, node, - alu->src + i, move_alu); - } + if (dest && ppir_dest_get_reg(dest) == chosen) { + /* If dest is a register, it might be updating only some its + * components, so need to load the existing value first */ + if (dest->type == ppir_target_register) { + if (!ppir_update_spilled_dest_load(comp, block, node)) + return false; } - break; + if (!ppir_update_spilled_dest(comp, block, node)) + return false; } - default: - { - for (int i = 0; i < ppir_node_get_src_num(node); i++) { - ppir_src *src = ppir_node_get_src(node, i); - reg = get_src_reg(src); - if (reg == chosen) { - ppir_update_spilled_src(comp, block, node, src, NULL); - } + + ppir_node *fill_node = NULL; + /* nodes might have multiple references to the same value. + * avoid creating unnecessary loads for the same fill by + * saving the node resulting from the temporary load */ + for (int i = 0; i < ppir_node_get_src_num(node); i++) { + ppir_src *src = ppir_node_get_src(node, i); + ppir_reg *reg = ppir_src_get_reg(src); + if (reg == chosen) { + if (!ppir_update_spilled_src(comp, block, node, src, &fill_node)) + return false; } - break; - } } } } @@ -545,27 +476,71 @@ static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp, struct ra_graph *g) { - int i = 0; - ppir_reg *chosen = NULL; + float spill_costs[list_length(&comp->reg_list)]; + /* experimentally determined, it seems to be worth scaling cost of + * regs in instructions that have used uniform/store_temp slots, + * but not too much as to offset the num_components base cost. */ + const float slot_scale = 1.1f; list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { - if (reg->spilled || reg->live_out == INT_MAX) { + if (reg->spilled) { /* not considered for spilling */ - ra_set_node_spill_cost(g, i++, 0.0f); + spill_costs[reg->regalloc_index] = 0.0f; continue; } /* It is beneficial to spill registers with higher component number, * so increase the cost of spilling registers with few components */ float spill_cost = 4.0f / (float)reg->num_components; - ra_set_node_spill_cost(g, i++, spill_cost); + spill_costs[reg->regalloc_index] = spill_cost; + } + + list_for_each_entry(ppir_block, block, &comp->block_list, list) { + list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { + if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) { + for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { + ppir_node *node = instr->slots[i]; + if (!node) + continue; + for (int j = 0; j < ppir_node_get_src_num(node); j++) { + ppir_src *src = ppir_node_get_src(node, j); + if (!src) + continue; + ppir_reg *reg = ppir_src_get_reg(src); + if (!reg) + continue; + + spill_costs[reg->regalloc_index] *= slot_scale; + } + } + } + if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) { + for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { + ppir_node *node = instr->slots[i]; + if (!node) + continue; + ppir_dest *dest = ppir_node_get_dest(node); + if (!dest) + continue; + ppir_reg *reg = ppir_dest_get_reg(dest); + if (!reg) + continue; + + spill_costs[reg->regalloc_index] *= slot_scale; + } + } + } } + for (int i = 0; i < list_length(&comp->reg_list); i++) + ra_set_node_spill_cost(g, i, spill_costs[i]); + int r = ra_get_best_spill_node(g); if (r == -1) return NULL; - i = 0; + ppir_reg *chosen = NULL; + int i = 0; list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { if (i++ == r) { chosen = reg; @@ -574,15 +549,81 @@ } assert(chosen); chosen->spilled = true; + chosen->is_head = true; /* store_temp unable to do swizzle */ return chosen; } static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp) { + int idx = 0; + list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { - reg->live_in = INT_MAX; - reg->live_out = 0; + reg->regalloc_index = idx++; + } + + list_for_each_entry(ppir_block, block, &comp->block_list, list) { + + if (block->live_in) + ralloc_free(block->live_in); + block->live_in = rzalloc_array(comp, + struct ppir_liveness, list_length(&comp->reg_list)); + + if (block->live_in_set) + _mesa_set_destroy(block->live_in_set, NULL); + block->live_in_set = _mesa_set_create(comp, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + if (block->live_out) + ralloc_free(block->live_out); + block->live_out = rzalloc_array(comp, + struct ppir_liveness, list_length(&comp->reg_list)); + + if (block->live_out_set) + _mesa_set_destroy(block->live_out_set, NULL); + block->live_out_set = _mesa_set_create(comp, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { + + if (instr->live_in) + ralloc_free(instr->live_in); + instr->live_in = rzalloc_array(comp, + struct ppir_liveness, list_length(&comp->reg_list)); + + if (instr->live_in_set) + _mesa_set_destroy(instr->live_in_set, NULL); + instr->live_in_set = _mesa_set_create(comp, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + if (instr->live_out) + ralloc_free(instr->live_out); + instr->live_out = rzalloc_array(comp, + struct ppir_liveness, list_length(&comp->reg_list)); + + if (instr->live_out_set) + _mesa_set_destroy(instr->live_out_set, NULL); + instr->live_out_set = _mesa_set_create(comp, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + } + } +} + +static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g, + struct set *liveness) +{ + set_foreach(liveness, entry1) { + set_foreach(liveness, entry2) { + const struct ppir_liveness *r1 = entry1->key; + const struct ppir_liveness *r2 = entry2->key; + ra_add_node_interference(g, r1->reg->regalloc_index, + r2->reg->regalloc_index); + } + _mesa_set_remove(liveness, entry1); } } @@ -590,51 +631,28 @@ static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled) { - ppir_reg *end_reg; - ppir_regalloc_reset_liveness_info(comp); - end_reg = ppir_regalloc_build_liveness_info(comp); struct ra_graph *g = ra_alloc_interference_graph( comp->ra, list_length(&comp->reg_list)); - int n = 0, end_reg_index = 0; + int n = 0; list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1); if (reg->is_head) c += 4; - if (reg == end_reg) - end_reg_index = n; ra_set_node_class(g, n++, c); } - int n1 = 0; - list_for_each_entry(ppir_reg, reg1, &comp->reg_list, list) { - int n2 = n1 + 1; - list_for_each_entry_from(ppir_reg, reg2, reg1->list.next, - &comp->reg_list, list) { - bool interference = false; - if (reg1->live_in < reg2->live_in) { - if (reg1->live_out > reg2->live_in) - interference = true; - } - else if (reg1->live_in > reg2->live_in) { - if (reg2->live_out > reg1->live_in) - interference = true; - } - else - interference = true; - - if (interference) - ra_add_node_interference(g, n1, n2); + ppir_liveness_analysis(comp); - n2++; + list_for_each_entry(ppir_block, block, &comp->block_list, list) { + list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { + ppir_all_interference(comp, g, instr->live_in_set); + ppir_all_interference(comp, g, instr->live_out_set); } - n1++; } - ra_set_node_reg(g, end_reg_index, ppir_ra_reg_base[ppir_ra_reg_class_vec4]); - *spilled = false; bool ok = ra_allocate(g); if (!ok || (comp->force_spilling-- > 0)) { @@ -644,11 +662,14 @@ * It is also be used in the spilling code, as negative indices * starting from -1, to create stack addresses. */ comp->prog->stack_size++; - ppir_regalloc_spill_reg(comp, chosen); + if (!ppir_regalloc_spill_reg(comp, chosen)) + goto err_out; /* Ask the outer loop to call back in. */ *spilled = true; - ppir_debug("spilled register\n"); + ppir_debug("spilled register %d/%d, num_components: %d\n", + chosen->regalloc_index, list_length(&comp->reg_list), + chosen->num_components); goto err_out; } @@ -686,7 +707,7 @@ ppir_regalloc_update_reglist_ssa(comp); /* No registers? Probably shader consists of discard instruction */ - if (list_empty(&comp->reg_list)) + if (list_is_empty(&comp->reg_list)) return true; /* this will most likely succeed in the first diff -Nru mesa-19.2.8/src/gallium/drivers/lima/ir/pp/scheduler.c mesa-20.0.8/src/gallium/drivers/lima/ir/pp/scheduler.c --- mesa-19.2.8/src/gallium/drivers/lima/ir/pp/scheduler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/ir/pp/scheduler.c 2020-06-12 01:21:17.000000000 +0000 @@ -118,7 +118,7 @@ static void ppir_schedule_ready_list(ppir_block *block, struct list_head *ready_list) { - if (list_empty(ready_list)) + if (list_is_empty(ready_list)) return; ppir_instr *instr = list_first_entry(ready_list, ppir_instr, list); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_bo.c mesa-20.0.8/src/gallium/drivers/lima/lima_bo.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_bo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_bo.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,6 +30,7 @@ #include "drm-uapi/lima_drm.h" #include "util/u_hash_table.h" +#include "util/u_math.h" #include "util/os_time.h" #include "os/os_mman.h" @@ -37,6 +38,7 @@ #include "lima_screen.h" #include "lima_bo.h" +#include "lima_util.h" #define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x))) @@ -68,6 +70,16 @@ return false; } +bool lima_bo_cache_init(struct lima_screen *screen) +{ + mtx_init(&screen->bo_cache_lock, mtx_plain); + list_inithead(&screen->bo_cache_time); + for (int i = 0; i < NR_BO_CACHE_BUCKETS; i++) + list_inithead(&screen->bo_cache_buckets[i]); + + return true; +} + void lima_bo_table_fini(struct lima_screen *screen) { mtx_destroy(&screen->bo_table_lock); @@ -75,6 +87,13 @@ util_hash_table_destroy(screen->bo_flink_names); } +static void +lima_bo_cache_remove(struct lima_bo *bo) +{ + list_del(&bo->size_list); + list_del(&bo->time_list); +} + static void lima_close_kms_handle(struct lima_screen *screen, uint32_t handle) { struct drm_gem_close args = { @@ -84,6 +103,41 @@ drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &args); } +static void +lima_bo_free(struct lima_bo *bo) +{ + struct lima_screen *screen = bo->screen; + + if (lima_debug & LIMA_DEBUG_BO_CACHE) + fprintf(stderr, "%s: %p (size=%d)\n", __func__, + bo, bo->size); + + mtx_lock(&screen->bo_table_lock); + util_hash_table_remove(screen->bo_handles, + (void *)(uintptr_t)bo->handle); + if (bo->flink_name) + util_hash_table_remove(screen->bo_flink_names, + (void *)(uintptr_t)bo->flink_name); + mtx_unlock(&screen->bo_table_lock); + + if (bo->map) + lima_bo_unmap(bo); + + lima_close_kms_handle(screen, bo->handle); + free(bo); +} + +void lima_bo_cache_fini(struct lima_screen *screen) +{ + mtx_destroy(&screen->bo_cache_lock); + + list_for_each_entry_safe(struct lima_bo, entry, + &screen->bo_cache_time, time_list) { + lima_bo_cache_remove(entry); + lima_bo_free(entry); + } +} + static bool lima_bo_get_info(struct lima_bo *bo) { struct drm_lima_gem_info req = { @@ -98,10 +152,154 @@ return true; } +static unsigned +lima_bucket_index(unsigned size) +{ + /* Round down to POT to compute a bucket index */ + + unsigned bucket_index = util_logbase2(size); + + /* Clamp the bucket index; all huge allocations will be + * sorted into the largest bucket */ + bucket_index = CLAMP(bucket_index, MIN_BO_CACHE_BUCKET, + MAX_BO_CACHE_BUCKET); + + /* Reindex from 0 */ + return (bucket_index - MIN_BO_CACHE_BUCKET); +} + +static struct list_head * +lima_bo_cache_get_bucket(struct lima_screen *screen, unsigned size) +{ + return &screen->bo_cache_buckets[lima_bucket_index(size)]; +} + +static void +lima_bo_cache_free_stale_bos(struct lima_screen *screen, time_t time) +{ + unsigned cnt = 0; + list_for_each_entry_safe(struct lima_bo, entry, + &screen->bo_cache_time, time_list) { + /* Free BOs that are sitting idle for longer than 5 seconds */ + if (time - entry->free_time > 6) { + lima_bo_cache_remove(entry); + lima_bo_free(entry); + cnt++; + } else + break; + } + if ((lima_debug & LIMA_DEBUG_BO_CACHE) && cnt) + fprintf(stderr, "%s: freed %d stale BOs\n", __func__, cnt); +} + +static void +lima_bo_cache_print_stats(struct lima_screen *screen) +{ + fprintf(stderr, "===============\n"); + fprintf(stderr, "BO cache stats:\n"); + unsigned total_size = 0; + for (int i = 0; i < NR_BO_CACHE_BUCKETS; i++) { + struct list_head *bucket = &screen->bo_cache_buckets[i]; + unsigned bucket_size = 0; + list_for_each_entry(struct lima_bo, entry, bucket, size_list) { + bucket_size += entry->size; + total_size += entry->size; + } + fprintf(stderr, "Bucket #%d, BOs: %d, size: %u\n", i, + list_length(bucket), + bucket_size); + } + fprintf(stderr, "Total size: %u\n", total_size); +} + +static bool +lima_bo_cache_put(struct lima_bo *bo) +{ + if (!bo->cacheable) + return false; + + struct lima_screen *screen = bo->screen; + + mtx_lock(&screen->bo_cache_lock); + struct list_head *bucket = lima_bo_cache_get_bucket(screen, bo->size); + + if (!bucket) { + mtx_unlock(&screen->bo_cache_lock); + return false; + } + + struct timespec time; + clock_gettime(CLOCK_MONOTONIC, &time); + bo->free_time = time.tv_sec; + list_addtail(&bo->size_list, bucket); + list_addtail(&bo->time_list, &screen->bo_cache_time); + lima_bo_cache_free_stale_bos(screen, time.tv_sec); + if (lima_debug & LIMA_DEBUG_BO_CACHE) { + fprintf(stderr, "%s: put BO: %p (size=%d)\n", __func__, bo, bo->size); + lima_bo_cache_print_stats(screen); + } + mtx_unlock(&screen->bo_cache_lock); + + return true; +} + +static struct lima_bo * +lima_bo_cache_get(struct lima_screen *screen, uint32_t size, uint32_t flags) +{ + /* we won't cache heap buffer */ + if (flags & LIMA_BO_FLAG_HEAP) + return NULL; + + struct lima_bo *bo = NULL; + mtx_lock(&screen->bo_cache_lock); + struct list_head *bucket = lima_bo_cache_get_bucket(screen, size); + + if (!bucket) { + mtx_unlock(&screen->bo_cache_lock); + return false; + } + + list_for_each_entry_safe(struct lima_bo, entry, bucket, size_list) { + if (entry->size >= size) { + /* Check if BO is idle. If it's not it's better to allocate new one */ + if (!lima_bo_wait(entry, LIMA_GEM_WAIT_WRITE, 0)) { + if (lima_debug & LIMA_DEBUG_BO_CACHE) { + fprintf(stderr, "%s: found BO %p but it's busy\n", __func__, + entry); + } + break; + } + + lima_bo_cache_remove(entry); + p_atomic_set(&entry->refcnt, 1); + entry->flags = flags; + bo = entry; + if (lima_debug & LIMA_DEBUG_BO_CACHE) { + fprintf(stderr, "%s: got BO: %p (size=%d), requested size %d\n", + __func__, bo, bo->size, size); + lima_bo_cache_print_stats(screen); + } + break; + } + } + + mtx_unlock(&screen->bo_cache_lock); + + return bo; +} + struct lima_bo *lima_bo_create(struct lima_screen *screen, uint32_t size, uint32_t flags) { struct lima_bo *bo; + + size = align(size, LIMA_PAGE_SIZE); + + /* Try to get bo from cache first */ + bo = lima_bo_cache_get(screen, size, flags); + if (bo) + return bo; + struct drm_lima_gem_create req = { .size = size, .flags = flags, @@ -110,17 +308,27 @@ if (!(bo = calloc(1, sizeof(*bo)))) return NULL; + list_inithead(&bo->time_list); + list_inithead(&bo->size_list); + if (drmIoctl(screen->fd, DRM_IOCTL_LIMA_GEM_CREATE, &req)) goto err_out0; bo->screen = screen; bo->size = req.size; + bo->flags = req.flags; bo->handle = req.handle; + bo->cacheable = !(lima_debug & LIMA_DEBUG_NO_BO_CACHE || + flags & LIMA_BO_FLAG_HEAP); p_atomic_set(&bo->refcnt, 1); if (!lima_bo_get_info(bo)) goto err_out1; + if (lima_debug & LIMA_DEBUG_BO_CACHE) + fprintf(stderr, "%s: %p (size=%d)\n", __func__, + bo, bo->size); + return bo; err_out1: @@ -130,25 +338,16 @@ return NULL; } -void lima_bo_free(struct lima_bo *bo) +void lima_bo_unreference(struct lima_bo *bo) { if (!p_atomic_dec_zero(&bo->refcnt)) return; - struct lima_screen *screen = bo->screen; - mtx_lock(&screen->bo_table_lock); - util_hash_table_remove(screen->bo_handles, - (void *)(uintptr_t)bo->handle); - if (bo->flink_name) - util_hash_table_remove(screen->bo_flink_names, - (void *)(uintptr_t)bo->flink_name); - mtx_unlock(&screen->bo_table_lock); - - if (bo->map) - lima_bo_unmap(bo); + /* Try to put it into cache */ + if (lima_bo_cache_put(bo)) + return; - lima_close_kms_handle(screen, bo->handle); - free(bo); + lima_bo_free(bo); } void *lima_bo_map(struct lima_bo *bo) @@ -175,6 +374,9 @@ { struct lima_screen *screen = bo->screen; + /* Don't cache exported BOs */ + bo->cacheable = false; + switch (handle->type) { case WINSYS_HANDLE_TYPE_SHARED: if (!bo->flink_name) { @@ -271,6 +473,8 @@ if (bo) { p_atomic_inc(&bo->refcnt); + /* Don't cache imported BOs */ + bo->cacheable = false; mtx_unlock(&screen->bo_table_lock); return bo; } @@ -282,6 +486,10 @@ return NULL; } + /* Don't cache imported BOs */ + bo->cacheable = false; + list_inithead(&bo->time_list); + list_inithead(&bo->size_list); bo->screen = screen; p_atomic_set(&bo->refcnt, 1); @@ -326,7 +534,13 @@ bool lima_bo_wait(struct lima_bo *bo, uint32_t op, uint64_t timeout_ns) { - int64_t abs_timeout = os_time_get_absolute_timeout(timeout_ns); + int64_t abs_timeout; + + if (timeout_ns == 0) + abs_timeout = 0; + else + abs_timeout = os_time_get_absolute_timeout(timeout_ns); + if (abs_timeout == OS_TIMEOUT_INFINITE) abs_timeout = INT64_MAX; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_bo.h mesa-20.0.8/src/gallium/drivers/lima/lima_bo.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_bo.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_bo.h 2020-06-12 01:21:17.000000000 +0000 @@ -28,12 +28,18 @@ #include #include "util/u_atomic.h" +#include "util/list.h" struct lima_bo { struct lima_screen *screen; + struct list_head time_list; + struct list_head size_list; int refcnt; + bool cacheable; + time_t free_time; uint32_t size; + uint32_t flags; uint32_t handle; uint64_t offset; uint32_t flink_name; @@ -44,10 +50,12 @@ bool lima_bo_table_init(struct lima_screen *screen); void lima_bo_table_fini(struct lima_screen *screen); +bool lima_bo_cache_init(struct lima_screen *screen); +void lima_bo_cache_fini(struct lima_screen *screen); struct lima_bo *lima_bo_create(struct lima_screen *screen, uint32_t size, uint32_t flags); -void lima_bo_free(struct lima_bo *bo); +void lima_bo_unreference(struct lima_bo *bo); static inline void lima_bo_reference(struct lima_bo *bo) { diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_context.c mesa-20.0.8/src/gallium/drivers/lima/lima_context.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,6 @@ #include "util/u_debug.h" #include "util/ralloc.h" #include "util/u_inlines.h" -#include "util/u_suballoc.h" #include "util/hash_table.h" #include "lima_screen.h" @@ -70,19 +69,15 @@ void * lima_ctx_buff_alloc(struct lima_context *ctx, enum lima_ctx_buff buff, - unsigned size, bool uploader) + unsigned size) { struct lima_ctx_buff_state *cbs = ctx->buffer_state + buff; void *ret = NULL; cbs->size = align(size, 0x40); - if (uploader) - u_upload_alloc(ctx->uploader, 0, cbs->size, 0x40, &cbs->offset, - &cbs->res, &ret); - else - u_suballocator_alloc(ctx->suballocator, cbs->size, 0x10, - &cbs->offset, &cbs->res); + u_upload_alloc(ctx->uploader, 0, cbs->size, 0x40, &cbs->offset, + &cbs->res, &ret); return ret; } @@ -110,6 +105,19 @@ } static void +lima_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc) +{ + struct lima_context *ctx = lima_context(pctx); + + if (ctx->framebuffer.base.zsbuf && (ctx->framebuffer.base.zsbuf->texture == prsc)) + ctx->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL); + + if (ctx->framebuffer.base.nr_cbufs && + (ctx->framebuffer.base.cbufs[0]->texture == prsc)) + ctx->resolve &= ~PIPE_CLEAR_COLOR0; +} + +static void lima_context_destroy(struct pipe_context *pctx) { struct lima_context *ctx = lima_context(pctx); @@ -128,9 +136,6 @@ if (ctx->blitter) util_blitter_destroy(ctx->blitter); - if (ctx->suballocator) - u_suballocator_destroy(ctx->suballocator); - if (ctx->uploader) u_upload_destroy(ctx->uploader); @@ -138,13 +143,16 @@ for (int i = 0; i < LIMA_CTX_PLB_MAX_NUM; i++) { if (ctx->plb[i]) - lima_bo_free(ctx->plb[i]); + lima_bo_unreference(ctx->plb[i]); if (ctx->gp_tile_heap[i]) - lima_bo_free(ctx->gp_tile_heap[i]); + lima_bo_unreference(ctx->gp_tile_heap[i]); } if (ctx->plb_gp_stream) - lima_bo_free(ctx->plb_gp_stream); + lima_bo_unreference(ctx->plb_gp_stream); + + if (ctx->gp_output) + lima_bo_unreference(ctx->gp_output); if (ctx->plb_pp_stream) assert(!_mesa_hash_table_num_entries(ctx->plb_pp_stream)); @@ -197,6 +205,7 @@ ctx->base.screen = pscreen; ctx->base.destroy = lima_context_destroy; ctx->base.set_debug_callback = lima_set_debug_callback; + ctx->base.invalidate_resource = lima_invalidate_resource; lima_resource_context_init(ctx); lima_fence_context_init(ctx); @@ -217,12 +226,8 @@ ctx->base.stream_uploader = ctx->uploader; ctx->base.const_uploader = ctx->uploader; - /* for varying output which need not mmap */ - ctx->suballocator = - u_suballocator_create(&ctx->base, 1024 * 1024, 0, - PIPE_USAGE_STREAM, 0, false); - if (!ctx->suballocator) - goto err_out; + ctx->damage_rect.minx = ctx->damage_rect.miny = 0xffff; + ctx->damage_rect.maxx = ctx->damage_rect.maxy = 0; util_dynarray_init(&ctx->vs_cmd_array, ctx); util_dynarray_init(&ctx->plbu_cmd_array, ctx); @@ -230,11 +235,25 @@ ctx->plb_size = screen->plb_max_blk * LIMA_CTX_PLB_BLK_SIZE; ctx->plb_gp_size = screen->plb_max_blk * 4; + uint32_t heap_flags; + if (screen->has_growable_heap_buffer) { + /* growable size buffer, initially will allocate 32K (by default) + * backup memory in kernel driver, and will allocate more when GP + * get out of memory interrupt. Max to 16M set here. + */ + ctx->gp_tile_heap_size = 0x1000000; + heap_flags = LIMA_BO_FLAG_HEAP; + } else { + /* fix size buffer */ + ctx->gp_tile_heap_size = 0x100000; + heap_flags = 0; + } + for (int i = 0; i < lima_ctx_num_plb; i++) { ctx->plb[i] = lima_bo_create(screen, ctx->plb_size, 0); if (!ctx->plb[i]) goto err_out; - ctx->gp_tile_heap[i] = lima_bo_create(screen, gp_tile_heap_size, 0); + ctx->gp_tile_heap[i] = lima_bo_create(screen, ctx->gp_tile_heap_size, heap_flags); if (!ctx->gp_tile_heap[i]) goto err_out; } @@ -282,15 +301,3 @@ return lima_submit_has_bo(ctx->gp_submit, bo, write) || lima_submit_has_bo(ctx->pp_submit, bo, write); } - -bool -lima_is_scanout(struct lima_context *ctx) -{ - /* If there is no color buffer, it's an FBO */ - if (!ctx->framebuffer.base.nr_cbufs) - return false; - - return ctx->framebuffer.base.cbufs[0]->texture->bind & PIPE_BIND_DISPLAY_TARGET || - ctx->framebuffer.base.cbufs[0]->texture->bind & PIPE_BIND_SCANOUT || - ctx->framebuffer.base.cbufs[0]->texture->bind & PIPE_BIND_SHARED; -} diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_context.h mesa-20.0.8/src/gallium/drivers/lima/lima_context.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,6 +55,7 @@ void *shader; int shader_size; int stack_size; + bool uses_discard; struct lima_bo *bo; }; @@ -81,7 +82,10 @@ struct lima_varying_info varying[LIMA_MAX_VARYING_NUM]; int varying_stride; - int num_varying; + int num_outputs; + int num_varyings; + int gl_pos_idx; + int point_size_idx; struct lima_bo *bo; }; @@ -107,7 +111,7 @@ struct lima_context_viewport_state { struct pipe_viewport_state transform; - float x, y, width, height; + float left, right, bottom, top; float near, far; }; @@ -118,8 +122,6 @@ }; enum lima_ctx_buff { - lima_ctx_buff_sh_varying, - lima_ctx_buff_sh_gl_pos, lima_ctx_buff_gp_varying_info, lima_ctx_buff_gp_attribute_info, lima_ctx_buff_gp_uniform, @@ -129,6 +131,7 @@ lima_ctx_buff_pp_uniform_array, lima_ctx_buff_pp_uniform, lima_ctx_buff_pp_tex_desc, + lima_ctx_buff_pp_stack, lima_ctx_buff_num, }; @@ -158,12 +161,6 @@ uint32_t offset[4]; }; -struct lima_damage_state { - struct pipe_scissor_state *region; - unsigned num_region; - bool aligned; -}; - struct lima_pp_stream_state { struct lima_bo *bo; uint32_t bo_offset; @@ -191,8 +188,9 @@ LIMA_CONTEXT_DIRTY_TEXTURES = (1 << 14), } dirty; + unsigned resolve; + struct u_upload_mgr *uploader; - struct u_suballocator *suballocator; struct blitter_context *blitter; struct slab_child_pool transfer_pool; @@ -200,6 +198,7 @@ struct lima_context_framebuffer framebuffer; struct lima_context_viewport_state viewport; struct pipe_scissor_state scissor; + struct pipe_scissor_state damage_rect; struct lima_context_clear clear; struct lima_vs_shader_state *vs; struct lima_fs_shader_state *fs; @@ -212,7 +211,6 @@ struct pipe_stencil_ref stencil_ref; struct lima_context_constant_buffer const_buffer[PIPE_SHADER_TYPES]; struct lima_texture_stateobj tex_stateobj; - struct lima_damage_state damage; struct lima_pp_stream_state pp_stream; unsigned min_index; @@ -227,8 +225,11 @@ struct lima_bo *plb[LIMA_CTX_PLB_MAX_NUM]; struct lima_bo *gp_tile_heap[LIMA_CTX_PLB_MAX_NUM]; - #define gp_tile_heap_size 0x100000 + uint32_t gp_tile_heap_size; struct lima_bo *plb_gp_stream; + struct lima_bo *gp_output; + uint32_t gp_output_varyings_offt; + uint32_t gp_output_point_size_offt; struct hash_table *plb_pp_stream; uint32_t plb_index; @@ -244,6 +245,11 @@ int id; struct pipe_debug_callback debug; + + int pp_max_stack_size; + + unsigned index_offset; + struct lima_resource *index_res; }; static inline struct lima_context * @@ -279,7 +285,7 @@ unsigned submit); void *lima_ctx_buff_map(struct lima_context *ctx, enum lima_ctx_buff buff); void *lima_ctx_buff_alloc(struct lima_context *ctx, enum lima_ctx_buff buff, - unsigned size, bool uploader); + unsigned size); void lima_state_init(struct lima_context *ctx); void lima_state_fini(struct lima_context *ctx); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_draw.c mesa-20.0.8/src/gallium/drivers/lima/lima_draw.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,13 +24,14 @@ */ #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_debug.h" #include "util/u_half.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_pack_color.h" #include "util/hash_table.h" +#include "util/u_split_draw.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" #include "util/u_vbuf.h" @@ -44,6 +45,7 @@ #include "lima_texture.h" #include "lima_util.h" #include "lima_fence.h" +#include "lima_format.h" #include @@ -116,9 +118,6 @@ uint32_t varyings_address; }; -#define LIMA_PIXEL_FORMAT_B8G8R8A8 0x03 -#define LIMA_PIXEL_FORMAT_Z16 0x0e -#define LIMA_PIXEL_FORMAT_Z24S8 0x0f /* plbu commands */ #define PLBU_CMD_BEGIN(max) { \ @@ -143,14 +142,15 @@ #define PLBU_CMD_BLOCK_STRIDE(block_w) PLBU_CMD((block_w) & 0xff, 0x30000000) #define PLBU_CMD_ARRAY_ADDRESS(gp_stream, block_num) \ PLBU_CMD(gp_stream, 0x28000000 | ((block_num) - 1) | 1) -#define PLBU_CMD_VIEWPORT_X(v) PLBU_CMD(v, 0x10000107) -#define PLBU_CMD_VIEWPORT_W(v) PLBU_CMD(v, 0x10000108) -#define PLBU_CMD_VIEWPORT_Y(v) PLBU_CMD(v, 0x10000105) -#define PLBU_CMD_VIEWPORT_H(v) PLBU_CMD(v, 0x10000106) +#define PLBU_CMD_VIEWPORT_LEFT(v) PLBU_CMD(v, 0x10000107) +#define PLBU_CMD_VIEWPORT_RIGHT(v) PLBU_CMD(v, 0x10000108) +#define PLBU_CMD_VIEWPORT_BOTTOM(v) PLBU_CMD(v, 0x10000105) +#define PLBU_CMD_VIEWPORT_TOP(v) PLBU_CMD(v, 0x10000106) #define PLBU_CMD_ARRAYS_SEMAPHORE_BEGIN() PLBU_CMD(0x00010002, 0x60000000) #define PLBU_CMD_ARRAYS_SEMAPHORE_END() PLBU_CMD(0x00010001, 0x60000000) -#define PLBU_CMD_PRIMITIVE_SETUP(low_prim, cull, index_size) \ - PLBU_CMD(((low_prim) ? 0x00003200 : 0x00002200) | (cull) | ((index_size) << 9), 0x1000010B) +#define PLBU_CMD_PRIMITIVE_SETUP(force_point_size, cull, index_size) \ + PLBU_CMD(0x2200 | ((force_point_size) ? 0x1000 : 0) | \ + (cull) | ((index_size) << 9), 0x1000010B) #define PLBU_CMD_RSW_VERTEX_ARRAY(rsw, gl_pos) \ PLBU_CMD(rsw, 0x80000000 | ((gl_pos) >> 4)) #define PLBU_CMD_SCISSORS(minx, maxx, miny, maxy) \ @@ -162,6 +162,7 @@ #define PLBU_CMD_DEPTH_RANGE_NEAR(v) PLBU_CMD(v, 0x1000010E) #define PLBU_CMD_DEPTH_RANGE_FAR(v) PLBU_CMD(v, 0x1000010F) #define PLBU_CMD_INDEXED_DEST(gl_pos) PLBU_CMD(gl_pos, 0x10000100) +#define PLBU_CMD_INDEXED_PT_SIZE(pt_size) PLBU_CMD(pt_size, 0x10000102) #define PLBU_CMD_INDICES(va) PLBU_CMD(va, 0x10000101) #define PLBU_CMD_DRAW_ARRAYS(mode, start, count) \ PLBU_CMD(((count) << 24) | (start), (((mode) & 0x1F) << 16) | ((count) >> 8)) @@ -212,23 +213,36 @@ return ctx->plbu_cmd_array.size; } +static inline struct lima_damage_region * +lima_ctx_get_damage(struct lima_context *ctx) +{ + if (!ctx->framebuffer.base.nr_cbufs) + return NULL; + + struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); + struct lima_resource *res = lima_resource(surf->base.texture); + return &res->damage; +} + static bool lima_fb_need_reload(struct lima_context *ctx) { /* Depth buffer is always discarded */ if (!ctx->framebuffer.base.nr_cbufs) return false; - if (ctx->damage.region) { - /* for EGL_KHR_partial_update we just want to reload the - * region not aligned to tile boundary */ - if (!ctx->damage.aligned) - return true; + + struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); + struct lima_resource *res = lima_resource(surf->base.texture); + if (res->damage.region) { + /* for EGL_KHR_partial_update, when EGL_EXT_buffer_age is enabled, + * we need to reload damage region, otherwise just want to reload + * the region not aligned to tile boundary */ + //if (!res->damage.aligned) + // return true; + return true; } - else { - struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); - if (surf->reload) + else if (surf->reload) return true; - } return false; } @@ -278,8 +292,8 @@ lima_tex_desc *td = cpu + lima_reload_tex_desc_offset; memset(td, 0, lima_min_tex_desc_size); lima_texture_desc_set_res(ctx, td, fb->base.cbufs[0]->texture, 0, 0); - td->unknown_1_1 = 0x80; - td->texture_2d = 1; + td->unnorm_coords = 1; + td->texture_type = LIMA_TEXTURE_TYPE_2D; td->min_img_filter_nearest = 1; td->mag_img_filter_nearest = 1; td->wrap_s_clamp_to_edge = 1; @@ -309,10 +323,10 @@ PLBU_CMD_BEGIN(20); - PLBU_CMD_VIEWPORT_X(0); - PLBU_CMD_VIEWPORT_W(fui(fb->base.width)); - PLBU_CMD_VIEWPORT_Y(0); - PLBU_CMD_VIEWPORT_H(fui(fb->base.height)); + PLBU_CMD_VIEWPORT_LEFT(0); + PLBU_CMD_VIEWPORT_RIGHT(fui(fb->base.width)); + PLBU_CMD_VIEWPORT_BOTTOM(0); + PLBU_CMD_VIEWPORT_TOP(fui(fb->base.height)); PLBU_CMD_RSW_VERTEX_ARRAY( va + lima_reload_render_state_offset, @@ -329,74 +343,6 @@ } static void -lima_pack_clear_plbu_cmd(struct lima_context *ctx) -{ - #define lima_clear_render_state_offset 0x0000 - #define lima_clear_shader_offset 0x0040 - #define lima_clear_buffer_size 0x0080 - - void *cpu; - unsigned offset; - struct pipe_resource *pres = NULL; - u_upload_alloc(ctx->uploader, 0, lima_clear_buffer_size, - 0x40, &offset, &pres, &cpu); - - struct lima_resource *res = lima_resource(pres); - uint32_t va = res->bo->va + offset; - - struct lima_screen *screen = lima_screen(ctx->base.screen); - uint32_t gl_pos_va = screen->pp_buffer->va + pp_clear_gl_pos_offset; - - /* const0 clear_color, mov.v1 $0 ^const0.xxxx, stop */ - uint32_t clear_shader[] = { - 0x00021025, 0x0000000c, - (ctx->clear.color_16pc << 12) | 0x000007cf, - ctx->clear.color_16pc >> 12, - ctx->clear.color_16pc >> 44, - }; - memcpy(cpu + lima_clear_shader_offset, &clear_shader, - sizeof(clear_shader)); - - uint32_t clear_shader_va = va + lima_clear_shader_offset; - uint32_t clear_shader_first_instr_size = clear_shader[0] & 0x1f; - - struct lima_render_state clear_render_state = { - .blend_color_bg = 0x00800080, - .blend_color_ra = 0x00ff0080, - .alpha_blend = 0xfc321892, - .depth_test = 0x0000003e, - .depth_range = 0xffff0000, - .stencil_front = 0x00000007, - .stencil_back = 0x00000007, - .multi_sample = 0x0000f007, - .shader_address = clear_shader_va | clear_shader_first_instr_size, - }; - memcpy(cpu + lima_clear_render_state_offset, &clear_render_state, - sizeof(clear_render_state)); - - PLBU_CMD_BEGIN(22); - - PLBU_CMD_VIEWPORT_X(0); - PLBU_CMD_VIEWPORT_W(0x45800000); - PLBU_CMD_VIEWPORT_Y(0); - PLBU_CMD_VIEWPORT_H(0x45800000); - - struct pipe_scissor_state *scissor = &ctx->scissor; - PLBU_CMD_SCISSORS(scissor->minx, scissor->maxx, scissor->miny, scissor->maxy); - - PLBU_CMD_RSW_VERTEX_ARRAY(va + lima_clear_render_state_offset, gl_pos_va); - - PLBU_CMD_UNKNOWN2(); - PLBU_CMD_UNKNOWN1(); - - PLBU_CMD_INDICES(screen->pp_buffer->va + pp_shared_index_offset); - PLBU_CMD_INDEXED_DEST(gl_pos_va); - PLBU_CMD_DRAW_ELEMENTS(0xf, 0, 3); - - PLBU_CMD_END(); -} - -static void lima_pack_head_plbu_cmd(struct lima_context *ctx) { /* first draw need create a PLBU command header */ @@ -434,19 +380,6 @@ && scissor->miny == scissor->maxy; } -static bool -lima_is_scissor_full_fb(struct lima_context *ctx) -{ - if (!ctx->rasterizer || !ctx->rasterizer->base.scissor) - return true; - - struct pipe_scissor_state *scissor = &ctx->scissor; - struct lima_context_framebuffer *fb = &ctx->framebuffer; - return - scissor->minx == 0 && scissor->maxx == fb->base.width && - scissor->miny == 0 && scissor->maxy == fb->base.height; -} - static void hilbert_rotate(int n, int *x, int *y, int rx, int ry) { @@ -493,7 +426,7 @@ * extra size should be added to the preceeding stream * 2. alignment: each stream address should be 0x20 aligned */ - int delta = tiled_w * tiled_h / num_pp * 16 + 8; + int delta = tiled_w * tiled_h / num_pp * 16 + 16; int remain = tiled_w * tiled_h % num_pp; int offset = 0; @@ -512,9 +445,9 @@ } static bool -inside_damage_region(int x, int y, struct lima_damage_state *ds) +inside_damage_region(int x, int y, struct lima_damage_region *ds) { - if (!ds->region) + if (!ds || !ds->region) return true; for (int i = 0; i < ds->num_region; i++) { @@ -528,11 +461,12 @@ } static void -lima_update_pp_stream(struct lima_context *ctx, int off_x, int off_y, +lima_generate_pp_stream(struct lima_context *ctx, int off_x, int off_y, int tiled_w, int tiled_h) { struct lima_pp_stream_state *ps = &ctx->pp_stream; struct lima_context_framebuffer *fb = &ctx->framebuffer; + struct lima_damage_region *damage = lima_ctx_get_damage(ctx); struct lima_screen *screen = lima_screen(ctx->base.screen); int i, num_pp = screen->num_pp; @@ -542,11 +476,19 @@ * close enough which should result close workload */ int max = MAX2(tiled_w, tiled_h); - int dim = util_logbase2_ceil(max); - int count = 1 << (dim + dim); int index = 0; uint32_t *stream[4]; int si[4] = {0}; + int dim = 0; + int count = 0; + + /* Don't update count if we get zero rect. We'll just generate + * PP stream with just terminators in it. + */ + if ((tiled_w * tiled_h) != 0) { + dim = util_logbase2_ceil(max); + count = 1 << (dim + dim); + } for (i = 0; i < num_pp; i++) stream[i] = ps->bo->map + ps->bo_offset + ps->offset[i]; @@ -558,7 +500,7 @@ x += off_x; y += off_y; - if (!inside_damage_region(x, y, &ctx->damage)) + if (!inside_damage_region(x, y, damage)) continue; int pp = index % num_pp; @@ -578,6 +520,8 @@ for (i = 0; i < num_pp; i++) { stream[i][si[i]++] = 0; stream[i][si[i]++] = 0xBC000000; + stream[i][si[i]++] = 0; + stream[i][si[i]++] = 0; lima_dump_command_stream_print( stream[i], si[i] * 4, false, "pp plb stream %d at va %x\n", @@ -588,20 +532,32 @@ static void lima_update_damage_pp_stream(struct lima_context *ctx) { - struct lima_damage_state *ds = &ctx->damage; - struct pipe_scissor_state max = ds->region[0]; + struct lima_damage_region *ds = lima_ctx_get_damage(ctx); + struct lima_context_framebuffer *fb = &ctx->framebuffer; + struct pipe_scissor_state bound; - /* find a max region to cover all the damage region */ - for (int i = 1; i < ds->num_region; i++) { - struct pipe_scissor_state *ss = ds->region + i; - max.minx = MIN2(max.minx, ss->minx); - max.miny = MIN2(max.miny, ss->miny); - max.maxx = MAX2(max.maxx, ss->maxx); - max.maxy = MAX2(max.maxy, ss->maxy); + if (ds && ds->region) { + struct pipe_scissor_state *dbound = &ds->bound; + bound.minx = MAX2(dbound->minx, ctx->damage_rect.minx >> 4); + bound.miny = MAX2(dbound->miny, ctx->damage_rect.miny >> 4); + bound.maxx = MIN2(dbound->maxx, (ctx->damage_rect.maxx + 0xf) >> 4); + bound.maxy = MIN2(dbound->maxy, (ctx->damage_rect.maxy + 0xf) >> 4); + } else { + bound.minx = ctx->damage_rect.minx >> 4; + bound.miny = ctx->damage_rect.miny >> 4; + bound.maxx = (ctx->damage_rect.maxx + 0xf) >> 4; + bound.maxy = (ctx->damage_rect.maxy + 0xf) >> 4; } - int tiled_w = max.maxx - max.minx; - int tiled_h = max.maxy - max.miny; + /* Clamp to FB size */ + bound.minx = MIN2(bound.minx, fb->tiled_w); + bound.miny = MIN2(bound.miny, fb->tiled_h); + bound.maxx = MIN2(bound.maxx, fb->tiled_w); + bound.maxy = MIN2(bound.maxy, fb->tiled_h); + + int tiled_w = bound.maxx - bound.minx; + int tiled_h = bound.maxy - bound.miny; + struct lima_screen *screen = lima_screen(ctx->base.screen); int size = lima_get_pp_stream_size( screen->num_pp, tiled_w, tiled_h, ctx->pp_stream.offset); @@ -615,7 +571,7 @@ ctx->pp_stream.bo = res->bo; ctx->pp_stream.bo_offset = offset; - lima_update_pp_stream(ctx, max.minx, max.miny, tiled_w, tiled_h); + lima_generate_pp_stream(ctx, bound.minx, bound.miny, tiled_w, tiled_h); lima_submit_add_bo(ctx->pp_submit, res->bo, LIMA_SUBMIT_BO_READ); pipe_resource_reference(&pres, NULL); @@ -651,12 +607,33 @@ ctx->pp_stream.bo_offset = 0; memcpy(ctx->pp_stream.offset, s->offset, sizeof(s->offset)); - lima_update_pp_stream(ctx, 0, 0, fb->tiled_w, fb->tiled_h); + lima_generate_pp_stream(ctx, 0, 0, fb->tiled_w, fb->tiled_h); } lima_submit_add_bo(ctx->pp_submit, s->bo, LIMA_SUBMIT_BO_READ); } +static bool +lima_damage_fullscreen(struct lima_context *ctx) +{ + return ctx->damage_rect.minx == 0 && + ctx->damage_rect.miny == 0 && + ctx->damage_rect.maxx == ctx->framebuffer.base.width && + ctx->damage_rect.maxy == ctx->framebuffer.base.height; +} + +static void +lima_update_pp_stream(struct lima_context *ctx) +{ + struct lima_damage_region *damage = lima_ctx_get_damage(ctx); + if ((damage && damage->region) || !lima_damage_fullscreen(ctx)) + lima_update_damage_pp_stream(ctx); + else if (ctx->plb_pp_stream) + lima_update_full_pp_stream(ctx); + else + ctx->pp_stream.bo = NULL; +} + static void lima_update_submit_bo(struct lima_context *ctx) { @@ -673,13 +650,6 @@ ctx->plb_gp_size, false, "gp plb stream at va %x\n", ctx->plb_gp_stream->va + ctx->plb_index * ctx->plb_gp_size); - if (ctx->damage.region) - lima_update_damage_pp_stream(ctx); - else if (ctx->plb_pp_stream) - lima_update_full_pp_stream(ctx); - else - ctx->pp_stream.bo = NULL; - if (ctx->framebuffer.base.nr_cbufs) { struct lima_resource *res = lima_resource(ctx->framebuffer.base.cbufs[0]->texture); lima_submit_add_bo(ctx->pp_submit, res->bo, LIMA_SUBMIT_BO_WRITE); @@ -694,20 +664,28 @@ } static void +lima_damage_rect_union(struct lima_context *ctx, unsigned minx, unsigned maxx, unsigned miny, unsigned maxy) +{ + ctx->damage_rect.minx = MIN2(ctx->damage_rect.minx, minx); + ctx->damage_rect.miny = MIN2(ctx->damage_rect.miny, miny); + ctx->damage_rect.maxx = MAX2(ctx->damage_rect.maxx, maxx); + ctx->damage_rect.maxy = MAX2(ctx->damage_rect.maxy, maxy); +} + +static void lima_clear(struct pipe_context *pctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) { struct lima_context *ctx = lima_context(pctx); - bool full_fb_clear = lima_is_scissor_full_fb(ctx); - if (full_fb_clear) { - lima_flush(ctx); + lima_flush(ctx); - /* no need to reload if cleared */ - if (ctx->framebuffer.base.nr_cbufs && (buffers & PIPE_CLEAR_COLOR0)) { - struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); - surf->reload = false; - } + ctx->resolve |= buffers; + + /* no need to reload if cleared */ + if (ctx->framebuffer.base.nr_cbufs && (buffers & PIPE_CLEAR_COLOR0)) { + struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); + surf->reload = false; } struct lima_context_clear *clear = &ctx->clear; @@ -737,11 +715,10 @@ lima_pack_head_plbu_cmd(ctx); - /* partial clear */ - if (!full_fb_clear) - lima_pack_clear_plbu_cmd(ctx); - ctx->dirty |= LIMA_CONTEXT_DIRTY_CLEAR; + + lima_damage_rect_union(ctx, 0, ctx->framebuffer.base.width, + 0, ctx->framebuffer.base.height); } enum lima_attrib_type { @@ -823,19 +800,19 @@ VS_CMD_SHADER_ADDRESS(ctx->vs->bo->va, ctx->vs->shader_size); VS_CMD_SHADER_INFO(ctx->vs->prefetch, ctx->vs->shader_size); - int num_varryings = ctx->vs->num_varying; + int num_outputs = ctx->vs->num_outputs; int num_attributes = ctx->vertex_elements->num_elements; - VS_CMD_VARYING_ATTRIBUTE_COUNT(num_varryings, num_attributes); + VS_CMD_VARYING_ATTRIBUTE_COUNT(num_outputs, MAX2(1, num_attributes)); VS_CMD_UNKNOWN1(); VS_CMD_ATTRIBUTES_ADDRESS( lima_ctx_buff_va(ctx, lima_ctx_buff_gp_attribute_info, LIMA_CTX_BUFF_SUBMIT_GP), - num_attributes); + MAX2(1, num_attributes)); VS_CMD_VARYINGS_ADDRESS( lima_ctx_buff_va(ctx, lima_ctx_buff_gp_varying_info, LIMA_CTX_BUFF_SUBMIT_GP), - num_varryings); + num_outputs); unsigned num = info->index_size ? (ctx->max_index - ctx->min_index + 1) : info->count; VS_CMD_DRAW(num, info->index_size); @@ -850,40 +827,51 @@ static void lima_pack_plbu_cmd(struct lima_context *ctx, const struct pipe_draw_info *info) { + struct lima_context_framebuffer *fb = &ctx->framebuffer; + struct lima_vs_shader_state *vs = ctx->vs; + unsigned minx, maxx, miny, maxy; + lima_pack_head_plbu_cmd(ctx); /* If it's zero scissor, we skip adding all other commands */ if (lima_is_scissor_zero(ctx)) return; - PLBU_CMD_BEGIN(30); + PLBU_CMD_BEGIN(32); - PLBU_CMD_VIEWPORT_X(fui(ctx->viewport.x)); - PLBU_CMD_VIEWPORT_W(fui(ctx->viewport.width)); - PLBU_CMD_VIEWPORT_Y(fui(ctx->viewport.y)); - PLBU_CMD_VIEWPORT_H(fui(ctx->viewport.height)); + PLBU_CMD_VIEWPORT_LEFT(fui(ctx->viewport.left)); + PLBU_CMD_VIEWPORT_RIGHT(fui(ctx->viewport.right)); + PLBU_CMD_VIEWPORT_BOTTOM(fui(ctx->viewport.bottom)); + PLBU_CMD_VIEWPORT_TOP(fui(ctx->viewport.top)); if (!info->index_size) PLBU_CMD_ARRAYS_SEMAPHORE_BEGIN(); - bool low_prim = info->mode < PIPE_PRIM_TRIANGLES; int cf = ctx->rasterizer->base.cull_face; int ccw = ctx->rasterizer->base.front_ccw; uint32_t cull = 0; + bool force_point_size = false; + if (cf != PIPE_FACE_NONE) { if (cf & PIPE_FACE_FRONT) cull |= ccw ? 0x00040000 : 0x00020000; if (cf & PIPE_FACE_BACK) cull |= ccw ? 0x00020000 : 0x00040000; } - PLBU_CMD_PRIMITIVE_SETUP(low_prim, cull, info->index_size); - uint32_t gl_position_va = - lima_ctx_buff_va(ctx, lima_ctx_buff_sh_gl_pos, - LIMA_CTX_BUFF_SUBMIT_GP | LIMA_CTX_BUFF_SUBMIT_PP); + /* Specify point size with PLBU command if shader doesn't write */ + if (info->mode == PIPE_PRIM_POINTS && ctx->vs->point_size_idx == -1) + force_point_size = true; + + /* Specify line width with PLBU command for lines */ + if (info->mode > PIPE_PRIM_POINTS && info->mode < PIPE_PRIM_TRIANGLES) + force_point_size = true; + + PLBU_CMD_PRIMITIVE_SETUP(force_point_size, cull, info->index_size); + PLBU_CMD_RSW_VERTEX_ARRAY( lima_ctx_buff_va(ctx, lima_ctx_buff_pp_plb_rsw, LIMA_CTX_BUFF_SUBMIT_PP), - gl_position_va); + ctx->gp_output->va); /* TODO * - we should set it only for the first draw that enabled the scissor and for @@ -891,38 +879,44 @@ */ if (ctx->rasterizer->base.scissor) { struct pipe_scissor_state *scissor = &ctx->scissor; - PLBU_CMD_SCISSORS(scissor->minx, scissor->maxx, scissor->miny, scissor->maxy); + minx = scissor->minx; + maxx = scissor->maxx; + miny = scissor->miny; + maxy = scissor->maxy; + } else { + minx = 0; + maxx = fb->base.width; + miny = 0; + maxy = fb->base.height; } + minx = MAX2(minx, ctx->viewport.left); + maxx = MIN2(maxx, ctx->viewport.right); + miny = MAX2(miny, ctx->viewport.bottom); + maxy = MIN2(maxy, ctx->viewport.top); + + PLBU_CMD_SCISSORS(minx, maxx, miny, maxy); + lima_damage_rect_union(ctx, minx, maxx, miny, maxy); + PLBU_CMD_UNKNOWN1(); PLBU_CMD_DEPTH_RANGE_NEAR(fui(ctx->viewport.near)); PLBU_CMD_DEPTH_RANGE_FAR(fui(ctx->viewport.far)); - if (low_prim) { + if ((info->mode == PIPE_PRIM_POINTS && ctx->vs->point_size_idx == -1) || + ((info->mode >= PIPE_PRIM_LINES) && (info->mode < PIPE_PRIM_TRIANGLES))) + { uint32_t v = info->mode == PIPE_PRIM_POINTS ? fui(ctx->rasterizer->base.point_size) : fui(ctx->rasterizer->base.line_width); PLBU_CMD_LOW_PRIM_SIZE(v); } if (info->index_size) { - PLBU_CMD_INDEXED_DEST(gl_position_va); - - struct pipe_resource *indexbuf = NULL; - unsigned index_offset = 0; - struct lima_resource *res; - if (info->has_user_indices) { - util_upload_index_buffer(&ctx->base, info, &indexbuf, &index_offset); - res = lima_resource(indexbuf); - } - else - res = lima_resource(info->index.resource); + PLBU_CMD_INDEXED_DEST(ctx->gp_output->va); + if (vs->point_size_idx != -1) + PLBU_CMD_INDEXED_PT_SIZE(ctx->gp_output->va + ctx->gp_output_point_size_offt); - lima_submit_add_bo(ctx->gp_submit, res->bo, LIMA_SUBMIT_BO_READ); - PLBU_CMD_INDICES(res->bo->va + info->start * info->index_size + index_offset); - - if (indexbuf) - pipe_resource_reference(&indexbuf, NULL); + PLBU_CMD_INDICES(ctx->index_res->bo->va + info->start * info->index_size + ctx->index_offset); } else { /* can this make the attribute info static? */ @@ -956,44 +950,106 @@ } static int -lima_blend_factor(enum pipe_blendfactor pipe) +lima_blend_factor_has_alpha(enum pipe_blendfactor pipe) { + /* Bit 4 is set if the blendfactor uses alpha */ switch (pipe) { - case PIPE_BLENDFACTOR_ONE: - return 11; - case PIPE_BLENDFACTOR_SRC_COLOR: - return 0; case PIPE_BLENDFACTOR_SRC_ALPHA: - return 16; case PIPE_BLENDFACTOR_DST_ALPHA: - return 17; - case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return 1; + + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_INV_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + case PIPE_BLENDFACTOR_ZERO: + case PIPE_BLENDFACTOR_ONE: case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return 7; + return 0; + + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return -1; /* not supported */ + } + return -1; +} + +static int +lima_blend_factor_is_inv(enum pipe_blendfactor pipe) +{ + /* Bit 3 is set if the blendfactor type is inverted */ + switch (pipe) { + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + case PIPE_BLENDFACTOR_INV_DST_COLOR: + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + case PIPE_BLENDFACTOR_ONE: + return 1; + + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_SRC_ALPHA: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_DST_ALPHA: case PIPE_BLENDFACTOR_CONST_COLOR: - return 2; case PIPE_BLENDFACTOR_CONST_ALPHA: - return 18; case PIPE_BLENDFACTOR_ZERO: - return 3; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return 0; + + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return -1; /* not supported */ + } + return -1; +} + +static int +lima_blend_factor(enum pipe_blendfactor pipe) +{ + /* Bits 0-2 indicate the blendfactor type */ + switch (pipe) { + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_SRC_ALPHA: case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return 8; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return 24; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return 25; + return 0; + + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_DST_ALPHA: case PIPE_BLENDFACTOR_INV_DST_COLOR: - return 9; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return 1; + + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_CONST_ALPHA: case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return 10; case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return 26; + return 2; + + case PIPE_BLENDFACTOR_ZERO: + case PIPE_BLENDFACTOR_ONE: + return 3; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return 4; + case PIPE_BLENDFACTOR_SRC1_COLOR: case PIPE_BLENDFACTOR_SRC1_ALPHA: case PIPE_BLENDFACTOR_INV_SRC1_COLOR: case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return -1; /* not support */ + return -1; /* not supported */ } return -1; } @@ -1003,16 +1059,31 @@ enum pipe_blendfactor rgb_src_factor, enum pipe_blendfactor rgb_dst_factor, enum pipe_blendfactor alpha_src_factor, enum pipe_blendfactor alpha_dst_factor) { + /* PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE has to be changed to PIPE_BLENDFACTOR_ONE + * if it is set for alpha_src. + */ + if (alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) + alpha_src_factor = PIPE_BLENDFACTOR_ONE; + return lima_blend_func(rgb_func) | (lima_blend_func(alpha_func) << 3) | + (lima_blend_factor(rgb_src_factor) << 6) | + (lima_blend_factor_is_inv(rgb_src_factor) << 9) | + (lima_blend_factor_has_alpha(rgb_src_factor) << 10) | + (lima_blend_factor(rgb_dst_factor) << 11) | - ((lima_blend_factor(alpha_src_factor) & 0xF) << 16) | - ((lima_blend_factor(alpha_dst_factor) & 0xF) << 20) | - 0x0C000000; /* need check if this GLESv1 glAlphaFunc */ + (lima_blend_factor_is_inv(rgb_dst_factor) << 14) | + (lima_blend_factor_has_alpha(rgb_dst_factor) << 15) | + + (lima_blend_factor(alpha_src_factor) << 16) | + (lima_blend_factor_is_inv(alpha_src_factor) << 19) | + + (lima_blend_factor(alpha_dst_factor) << 20) | + (lima_blend_factor_is_inv(alpha_dst_factor) << 23) | + 0x0C000000; /* need to check if this is GLESv1 glAlphaFunc */ } -#if 0 static int lima_stencil_op(enum pipe_stencil_op pipe) { @@ -1036,40 +1107,35 @@ } return -1; } -#endif -static int +static unsigned lima_calculate_depth_test(struct pipe_depth_state *depth, struct pipe_rasterizer_state *rst) { + int offset_scale = 0, offset_units = 0; enum pipe_compare_func func = (depth->enabled ? depth->func : PIPE_FUNC_ALWAYS); - int offset_scale = 0; - - //TODO: implement polygon offset -#if 0 - if (rst->offset_scale < -32) - offset_scale = -32; - else if (rst->offset_scale > 31) - offset_scale = 31; - else - offset_scale = rst->offset_scale * 4; - + offset_scale = CLAMP(rst->offset_scale * 4, -128, 127); if (offset_scale < 0) - offset_scale = 0x100 + offset_scale; -#endif + offset_scale += 0x100; + + offset_units = CLAMP(rst->offset_units * 2, -128, 127); + if (offset_units < 0) + offset_units += 0x100; return (depth->enabled && depth->writemask) | ((int)func << 1) | (offset_scale << 16) | + (offset_units << 24) | 0x30; /* find out what is this */ } static void lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *info) { + struct lima_fs_shader_state *fs = ctx->fs; struct lima_render_state *render = lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_plb_rsw, - sizeof(*render), true); + sizeof(*render)); /* do hw support RGBA independ blend? * PIPE_CAP_INDEP_BLEND_ENABLE @@ -1105,34 +1171,55 @@ struct pipe_depth_state *depth = &ctx->zsa->base.depth; render->depth_test = lima_calculate_depth_test(depth, rst); + ushort far, near; + + near = float_to_ushort(ctx->viewport.near); + far = float_to_ushort(ctx->viewport.far); + + /* Subtract epsilon from 'near' if far == near. Make sure we don't get overflow */ + if ((far == near) && (near != 0)) + near--; + /* overlap with plbu? any place can remove one? */ - render->depth_range = float_to_ushort(ctx->viewport.near) | - (float_to_ushort(ctx->viewport.far) << 16); + render->depth_range = near | (far << 16); -#if 0 struct pipe_stencil_state *stencil = ctx->zsa->base.stencil; struct pipe_stencil_ref *ref = &ctx->stencil_ref; - render->stencil_front = stencil[0].func | - (lima_stencil_op(stencil[0].fail_op) << 3) | - (lima_stencil_op(stencil[0].zfail_op) << 6) | - (lima_stencil_op(stencil[0].zpass_op) << 9) | - (ref->ref_value[0] << 16) | - (stencil[0].valuemask << 24); - render->stencil_back = stencil[1].func | - (lima_stencil_op(stencil[1].fail_op) << 3) | - (lima_stencil_op(stencil[1].zfail_op) << 6) | - (lima_stencil_op(stencil[1].zpass_op) << 9) | - (ref->ref_value[1] << 16) | - (stencil[1].valuemask << 24); -#else - render->stencil_front = 0xff000007; - render->stencil_back = 0xff000007; -#endif - - /* seems not correct? */ - //struct pipe_alpha_state *alpha = &ctx->zsa->base.alpha; - render->stencil_test = 0; - //(stencil->enabled ? 0xFF : 0x00) | (float_to_ubyte(alpha->ref_value) << 16) + + if (stencil[0].enabled) { /* stencil is enabled */ + render->stencil_front = stencil[0].func | + (lima_stencil_op(stencil[0].fail_op) << 3) | + (lima_stencil_op(stencil[0].zfail_op) << 6) | + (lima_stencil_op(stencil[0].zpass_op) << 9) | + (ref->ref_value[0] << 16) | + (stencil[0].valuemask << 24); + render->stencil_back = render->stencil_front; + render->stencil_test = (stencil[0].writemask & 0xff) | (stencil[0].writemask & 0xff) << 8; + if (stencil[1].enabled) { /* two-side is enabled */ + render->stencil_back = stencil[1].func | + (lima_stencil_op(stencil[1].fail_op) << 3) | + (lima_stencil_op(stencil[1].zfail_op) << 6) | + (lima_stencil_op(stencil[1].zpass_op) << 9) | + (ref->ref_value[1] << 16) | + (stencil[1].valuemask << 24); + render->stencil_test = (stencil[0].writemask & 0xff) | (stencil[1].writemask & 0xff) << 8; + } + /* TODO: Find out, what (render->stecil_test & 0xffff0000) is. + * 0x00ff0000 is probably (float_to_ubyte(alpha->ref_value) << 16) + * (render->multi_sample & 0x00000007 is probably the compare function + * of glAlphaFunc then. + */ + } + else { + /* Default values, when stencil is disabled: + * stencil[0|1].valuemask = 0xff + * stencil[0|1].func = PIPE_FUNC_ALWAYS + * stencil[0|1].writemask = 0xff + */ + render->stencil_front = 0xff000007; + render->stencil_back = 0xff000007; + render->stencil_test = 0x0000ffff; + } /* need more investigation */ if (info->mode == PIPE_PRIM_POINTS) @@ -1153,8 +1240,14 @@ render->textures_address = 0x00000000; /* more investigation */ - render->aux0 = 0x00000300 | (ctx->vs->varying_stride >> 3); - render->aux1 = 0x00003000; + render->aux0 = 0x00000100 | (ctx->vs->varying_stride >> 3); + render->aux1 = 0x00001000; + if (ctx->blend->base.dither) + render->aux1 |= 0x00002000; + + /* Enable Early-Z if shader doesn't have discard */ + if (!fs->uses_discard) + render->aux0 |= 0x200; if (ctx->tex_stateobj.num_samplers) { render->textures_address = @@ -1166,25 +1259,35 @@ if (ctx->const_buffer[PIPE_SHADER_FRAGMENT].buffer) { render->uniforms_address = lima_ctx_buff_va(ctx, lima_ctx_buff_pp_uniform_array, LIMA_CTX_BUFF_SUBMIT_PP); - render->uniforms_address |= ((ctx->buffer_state[lima_ctx_buff_pp_uniform].size) / 4 - 1); + uint32_t size = ctx->buffer_state[lima_ctx_buff_pp_uniform].size; + uint32_t bits = 0; + if (size >= 8) { + bits = util_last_bit(size >> 3) - 1; + bits += size & u_bit_consecutive(0, bits + 3) ? 1 : 0; + } + render->uniforms_address |= bits > 0xf ? 0xf : bits; + render->aux0 |= 0x80; render->aux1 |= 0x10000; } - if (ctx->vs->num_varying > 1) { + if (ctx->vs->num_varyings) { render->varying_types = 0x00000000; - render->varyings_address = - lima_ctx_buff_va(ctx, lima_ctx_buff_sh_varying, LIMA_CTX_BUFF_SUBMIT_PP); - for (int i = 1; i < ctx->vs->num_varying; i++) { + render->varyings_address = ctx->gp_output->va + + ctx->gp_output_varyings_offt; + for (int i = 0, index = 0; i < ctx->vs->num_outputs; i++) { int val; + if (i == ctx->vs->gl_pos_idx || + i == ctx->vs->point_size_idx) + continue; + struct lima_varying_info *v = ctx->vs->varying + i; if (v->component_size == 4) val = v->components > 2 ? 0 : 1; else val = v->components > 2 ? 2 : 3; - int index = i - 1; if (index < 10) render->varying_types |= val << (3 * index); else if (index == 10) { @@ -1193,6 +1296,8 @@ } else if (index == 11) render->varyings_address |= val << 1; + + index++; } } else { @@ -1203,6 +1308,10 @@ lima_dump_command_stream_print( render, sizeof(*render), false, "add render state at va %x\n", lima_ctx_buff_va(ctx, lima_ctx_buff_pp_plb_rsw, 0)); + + lima_dump_rsw_command_stream_print(render, sizeof(*render), + lima_ctx_buff_va(ctx, lima_ctx_buff_pp_plb_rsw, 0)); + } static void @@ -1213,7 +1322,7 @@ uint32_t *attribute = lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_attribute_info, - ve->num_elements * 8, true); + MAX2(1, ve->num_elements) * 8); int n = 0; for (int i = 0; i < ve->num_elements; i++) { @@ -1227,7 +1336,7 @@ lima_submit_add_bo(ctx->gp_submit, res->bo, LIMA_SUBMIT_BO_READ); - unsigned start = info->index_size ? ctx->min_index : info->start; + unsigned start = info->index_size ? (ctx->min_index + info->index_bias) : info->start; attribute[n++] = res->bo->va + pvb->buffer_offset + pve->src_offset + start * pvb->stride; attribute[n++] = (pvb->stride << 11) | @@ -1249,7 +1358,7 @@ int size = vs->uniform_pending_offset + vs->constant_size + 32; void *vs_const_buff = - lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_uniform, size, true); + lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_uniform, size); if (ccb->buffer) memcpy(vs_const_buff, ccb->buffer, ccb->size); @@ -1282,10 +1391,10 @@ uint16_t *fp16_const_buff = lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_uniform, - const_buff_size * sizeof(uint16_t), true); + const_buff_size * sizeof(uint16_t)); uint32_t *array = - lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_uniform_array, 4, true); + lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_uniform_array, 4); for (int i = 0; i < const_buff_size; i++) fp16_const_buff[i] = util_float_to_half(const_buff[i]); @@ -1303,27 +1412,25 @@ static void lima_update_varying(struct lima_context *ctx, const struct pipe_draw_info *info) { + struct lima_screen *screen = lima_screen(ctx->base.screen); struct lima_vs_shader_state *vs = ctx->vs; + uint32_t gp_output_size; + unsigned num = info->index_size ? (ctx->max_index - ctx->min_index + 1) : info->count; uint32_t *varying = lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_varying_info, - vs->num_varying * 8, true); + vs->num_outputs * 8); int n = 0; - /* should be LIMA_SUBMIT_BO_WRITE for GP, but each draw will use - * different part of this bo, so no need to set exclusive constraint */ - lima_ctx_buff_alloc(ctx, lima_ctx_buff_sh_gl_pos, - 4 * 4 * info->count, false); - - /* for gl_Position */ - varying[n++] = - lima_ctx_buff_va(ctx, lima_ctx_buff_sh_gl_pos, - LIMA_CTX_BUFF_SUBMIT_GP | LIMA_CTX_BUFF_SUBMIT_PP); - varying[n++] = 0x8020; - int offset = 0; - for (int i = 1; i < vs->num_varying; i++) { + + for (int i = 0; i < vs->num_outputs; i++) { struct lima_varying_info *v = vs->varying + i; + + if (i == vs->gl_pos_idx || + i == vs->point_size_idx) + continue; + int size = v->component_size * 4; /* does component_size == 2 need to be 16 aligned? */ @@ -1333,19 +1440,50 @@ v->offset = offset; offset += size; } + vs->varying_stride = align(offset, 16); - if (vs->num_varying > 1) - lima_ctx_buff_alloc(ctx, lima_ctx_buff_sh_varying, - vs->varying_stride * info->count, false); + /* gl_Position is always present, allocate space for it */ + gp_output_size = align(4 * 4 * num, 0x40); - for (int i = 1; i < vs->num_varying; i++) { + /* Allocate space for varyings if there're any */ + if (vs->num_varyings) { + ctx->gp_output_varyings_offt = gp_output_size; + gp_output_size += align(vs->varying_stride * num, 0x40); + } + + /* Allocate space for gl_PointSize if it's there */ + if (vs->point_size_idx != -1) { + ctx->gp_output_point_size_offt = gp_output_size; + gp_output_size += 4 * num; + } + + /* gp_output can be too large for the suballocator, so create a + * separate bo for it. The bo cache should prevent performance hit. + */ + ctx->gp_output = lima_bo_create(screen, gp_output_size, 0); + assert(ctx->gp_output); + lima_submit_add_bo(ctx->gp_submit, ctx->gp_output, LIMA_SUBMIT_BO_WRITE); + lima_submit_add_bo(ctx->pp_submit, ctx->gp_output, LIMA_SUBMIT_BO_READ); + + for (int i = 0; i < vs->num_outputs; i++) { struct lima_varying_info *v = vs->varying + i; - varying[n++] = - lima_ctx_buff_va(ctx, lima_ctx_buff_sh_varying, LIMA_CTX_BUFF_SUBMIT_GP) + - v->offset; - varying[n++] = (vs->varying_stride << 11) | (v->components - 1) | - (v->component_size == 2 ? 0x0C : 0); + + if (i == vs->gl_pos_idx) { + /* gl_Position */ + varying[n++] = ctx->gp_output->va; + varying[n++] = 0x8020; + } else if (i == vs->point_size_idx) { + /* gl_PointSize */ + varying[n++] = ctx->gp_output->va + ctx->gp_output_point_size_offt; + varying[n++] = 0x2021; + } else { + /* Varying */ + varying[n++] = ctx->gp_output->va + ctx->gp_output_varyings_offt + + v->offset; + varying[n++] = (vs->varying_stride << 11) | (v->components - 1) | + (v->component_size == 2 ? 0x0C : 0); + } } lima_dump_command_stream_print( @@ -1354,47 +1492,13 @@ } static void -lima_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) +lima_draw_vbo_update(struct pipe_context *pctx, + const struct pipe_draw_info *info) { - /* check if draw mode and vertex/index count match, - * otherwise gp will hang */ - if (!u_trim_pipe_prim(info->mode, (unsigned*)&info->count)) { - debug_printf("draw mode and vertex/index count mismatch\n"); - return; - } - struct lima_context *ctx = lima_context(pctx); - if (!ctx->vs || !ctx->fs) { - debug_warn_once("no shader, skip draw\n"); - return; - } - - if (!lima_update_vs_state(ctx) || !lima_update_fs_state(ctx)) - return; - - lima_dump_command_stream_print( - ctx->vs->bo->map, ctx->vs->shader_size, false, - "add vs at va %x\n", ctx->vs->bo->va); - - lima_dump_command_stream_print( - ctx->fs->bo->map, ctx->fs->shader_size, false, - "add fs at va %x\n", ctx->fs->bo->va); - - lima_submit_add_bo(ctx->gp_submit, ctx->vs->bo, LIMA_SUBMIT_BO_READ); - lima_submit_add_bo(ctx->pp_submit, ctx->fs->bo, LIMA_SUBMIT_BO_READ); - lima_update_submit_bo(ctx); - /* Mali Utgard GPU always need min/max index info for index draw, - * compute it if upper layer does not do for us */ - if (info->index_size && info->max_index == ~0u) - u_vbuf_get_minmax_index(pctx, info, &ctx->min_index, &ctx->max_index); - else { - ctx->min_index = info->min_index; - ctx->max_index = info->max_index; - } - lima_update_gp_attribute_info(ctx, info); if ((ctx->dirty & LIMA_CONTEXT_DIRTY_CONST_BUFF && @@ -1423,10 +1527,123 @@ lima_pack_render_state(ctx, info); lima_pack_plbu_cmd(ctx, info); + if (ctx->gp_output) { + lima_bo_unreference(ctx->gp_output); /* held by submit */ + ctx->gp_output = NULL; + } + + if (ctx->framebuffer.base.zsbuf) { + if (ctx->zsa->base.depth.enabled) + ctx->resolve |= PIPE_CLEAR_DEPTH; + if (ctx->zsa->base.stencil[0].enabled || + ctx->zsa->base.stencil[1].enabled) + ctx->resolve |= PIPE_CLEAR_STENCIL; + } + + if (ctx->framebuffer.base.nr_cbufs) + ctx->resolve |= PIPE_CLEAR_COLOR0; + ctx->dirty = 0; } static void +lima_draw_vbo_indexed(struct pipe_context *pctx, + const struct pipe_draw_info *info) +{ + struct lima_context *ctx = lima_context(pctx); + struct pipe_resource *indexbuf = NULL; + + /* Mali Utgard GPU always need min/max index info for index draw, + * compute it if upper layer does not do for us */ + if (info->max_index == ~0u) + u_vbuf_get_minmax_index(pctx, info, &ctx->min_index, &ctx->max_index); + else { + ctx->min_index = info->min_index; + ctx->max_index = info->max_index; + } + + if (info->has_user_indices) { + util_upload_index_buffer(&ctx->base, info, &indexbuf, &ctx->index_offset, 0x40); + ctx->index_res = lima_resource(indexbuf); + } + else { + ctx->index_res = lima_resource(info->index.resource); + ctx->index_offset = 0; + } + + lima_submit_add_bo(ctx->gp_submit, ctx->index_res->bo, LIMA_SUBMIT_BO_READ); + lima_submit_add_bo(ctx->pp_submit, ctx->index_res->bo, LIMA_SUBMIT_BO_READ); + lima_draw_vbo_update(pctx, info); + + if (indexbuf) + pipe_resource_reference(&indexbuf, NULL); +} + +static void +lima_draw_vbo_count(struct pipe_context *pctx, + const struct pipe_draw_info *info) +{ + static const uint32_t max_verts = 65535; + + struct pipe_draw_info local_info = *info; + unsigned start = info->start; + unsigned count = info->count; + + while (count) { + unsigned this_count = count; + unsigned step; + + u_split_draw(info, max_verts, &this_count, &step); + + local_info.start = start; + local_info.count = this_count; + + lima_draw_vbo_update(pctx, &local_info); + + count -= step; + start += step; + } +} + +static void +lima_draw_vbo(struct pipe_context *pctx, + const struct pipe_draw_info *info) +{ + /* check if draw mode and vertex/index count match, + * otherwise gp will hang */ + if (!u_trim_pipe_prim(info->mode, (unsigned*)&info->count)) { + debug_printf("draw mode and vertex/index count mismatch\n"); + return; + } + + struct lima_context *ctx = lima_context(pctx); + + if (!ctx->vs || !ctx->fs) { + debug_warn_once("no shader, skip draw\n"); + return; + } + + if (!lima_update_vs_state(ctx) || !lima_update_fs_state(ctx)) + return; + + lima_dump_command_stream_print( + ctx->vs->bo->map, ctx->vs->shader_size, false, + "add vs at va %x\n", ctx->vs->bo->va); + + lima_dump_command_stream_print( + ctx->fs->bo->map, ctx->fs->shader_size, false, + "add fs at va %x\n", ctx->fs->bo->va); + + lima_submit_add_bo(ctx->gp_submit, ctx->vs->bo, LIMA_SUBMIT_BO_READ); + lima_submit_add_bo(ctx->pp_submit, ctx->fs->bo, LIMA_SUBMIT_BO_READ); + + if (info->index_size) + lima_draw_vbo_indexed(pctx, info); + else + lima_draw_vbo_count(pctx, info); +} + +static void lima_finish_plbu_cmd(struct lima_context *ctx) { int i = 0; @@ -1444,20 +1661,7 @@ struct lima_context_framebuffer *fb = &ctx->framebuffer; struct lima_resource *res = lima_resource(fb->base.zsbuf->texture); int level = fb->base.zsbuf->u.tex.level; - - uint32_t format; - - switch (fb->base.zsbuf->format) { - case PIPE_FORMAT_Z16_UNORM: - format = LIMA_PIXEL_FORMAT_Z16; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - default: - /* Assume Z24S8 */ - format = LIMA_PIXEL_FORMAT_Z24S8; - break; - } + uint32_t format = lima_format_get_pixel(fb->base.zsbuf->format); struct lima_pp_wb_reg *wb = (void *)wb_reg; wb[wb_idx].type = 0x01; /* 1 for depth, stencil */ @@ -1479,21 +1683,14 @@ struct lima_context_framebuffer *fb = &ctx->framebuffer; struct lima_resource *res = lima_resource(fb->base.cbufs[0]->texture); int level = fb->base.cbufs[0]->u.tex.level; - - bool swap_channels = false; - switch (fb->base.cbufs[0]->format) { - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R8G8B8X8_UNORM: - swap_channels = true; - break; - default: - break; - } + unsigned layer = fb->base.cbufs[0]->u.tex.first_layer; + uint32_t format = lima_format_get_pixel(fb->base.cbufs[0]->format); + bool swap_channels = lima_format_get_swap_rb(fb->base.cbufs[0]->format); struct lima_pp_wb_reg *wb = (void *)wb_reg; wb[wb_idx].type = 0x02; /* 2 for color buffer */ - wb[wb_idx].address = res->bo->va + res->levels[level].offset; - wb[wb_idx].pixel_format = LIMA_PIXEL_FORMAT_B8G8R8A8; + wb[wb_idx].address = res->bo->va + res->levels[level].offset + layer * res->levels[level].layer_stride; + wb[wb_idx].pixel_format = format; if (res->tiled) { wb[wb_idx].pixel_layout = 0x2; wb[wb_idx].pitch = fb->tiled_w; @@ -1532,8 +1729,7 @@ /* These are "stack size" and "stack offset" shifted, * here they are assumed to be always the same. */ - uint32_t fs_stack_size = ctx->fs ? ctx->fs->stack_size : 0; - frame->fragment_stack_size = fs_stack_size << 16 | fs_stack_size; + frame->fragment_stack_size = ctx->pp_max_stack_size << 16 | ctx->pp_max_stack_size; /* related with MSAA and different value when r4p0/r7p0 */ frame->supersampled_height = fb->base.height * 2 - 1; @@ -1544,20 +1740,19 @@ frame->blocking = (fb->shift_min << 28) | (fb->shift_h << 16) | fb->shift_w; frame->foureight = 0x8888; - if (fb->base.nr_cbufs) + if (fb->base.nr_cbufs && (ctx->resolve & PIPE_CLEAR_COLOR0)) lima_pack_wb_cbuf_reg(ctx, wb_reg, wb_idx++); - /* Mali4x0 can use on-tile buffer for depth/stencil, so to save some - * memory bandwidth don't write depth/stencil back to memory if we're - * rendering to scanout - */ - if (!lima_is_scanout(ctx) && fb->base.zsbuf) + if (fb->base.zsbuf && + (ctx->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) lima_pack_wb_zsbuf_reg(ctx, wb_reg, wb_idx++); } static void _lima_flush(struct lima_context *ctx, bool end_of_frame) { + #define pp_stack_pp_size 0x400 + lima_finish_plbu_cmd(ctx); int vs_cmd_size = ctx->vs_cmd_array.size; @@ -1567,7 +1762,7 @@ if (vs_cmd_size) { void *vs_cmd = - lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_vs_cmd, vs_cmd_size, true); + lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_vs_cmd, vs_cmd_size); memcpy(vs_cmd, util_dynarray_begin(&ctx->vs_cmd_array), vs_cmd_size); util_dynarray_clear(&ctx->vs_cmd_array); vs_cmd_va = lima_ctx_buff_va(ctx, lima_ctx_buff_gp_vs_cmd, @@ -1575,10 +1770,11 @@ lima_dump_command_stream_print( vs_cmd, vs_cmd_size, false, "flush vs cmd at va %x\n", vs_cmd_va); + lima_dump_vs_command_stream_print(vs_cmd, vs_cmd_size, vs_cmd_va); } void *plbu_cmd = - lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_plbu_cmd, plbu_cmd_size, true); + lima_ctx_buff_alloc(ctx, lima_ctx_buff_gp_plbu_cmd, plbu_cmd_size); memcpy(plbu_cmd, util_dynarray_begin(&ctx->plbu_cmd_array), plbu_cmd_size); util_dynarray_clear(&ctx->plbu_cmd_array); plbu_cmd_va = lima_ctx_buff_va(ctx, lima_ctx_buff_gp_plbu_cmd, @@ -1586,6 +1782,7 @@ lima_dump_command_stream_print( plbu_cmd, plbu_cmd_size, false, "flush plbu cmd at va %x\n", plbu_cmd_va); + lima_dump_plbu_command_stream_print(plbu_cmd, plbu_cmd_size, plbu_cmd_va); struct lima_screen *screen = lima_screen(ctx->base.screen); struct drm_lima_gp_frame gp_frame; @@ -1595,7 +1792,7 @@ gp_frame_reg->plbu_cmd_start = plbu_cmd_va; gp_frame_reg->plbu_cmd_end = plbu_cmd_va + plbu_cmd_size; gp_frame_reg->tile_heap_start = ctx->gp_tile_heap[ctx->plb_index]->va; - gp_frame_reg->tile_heap_end = ctx->gp_tile_heap[ctx->plb_index]->va + gp_tile_heap_size; + gp_frame_reg->tile_heap_end = ctx->gp_tile_heap[ctx->plb_index]->va + ctx->gp_tile_heap_size; lima_dump_command_stream_print( &gp_frame, sizeof(gp_frame), false, "add gp frame\n"); @@ -1605,11 +1802,11 @@ if (lima_dump_command_stream) { if (lima_submit_wait(ctx->gp_submit, PIPE_TIMEOUT_INFINITE)) { - if (ctx->buffer_state[lima_ctx_buff_sh_gl_pos].res) { - float *pos = lima_ctx_buff_map(ctx, lima_ctx_buff_sh_gl_pos); + if (ctx->gp_output) { + float *pos = lima_bo_map(ctx->gp_output); lima_dump_command_stream_print( pos, 4 * 4 * 16, true, "gl_pos dump at va %x\n", - lima_ctx_buff_va(ctx, lima_ctx_buff_sh_gl_pos, 0)); + ctx->gp_output->va); } uint32_t *plb = lima_bo_map(ctx->plb[ctx->plb_index]); @@ -1623,6 +1820,16 @@ } } + uint32_t pp_stack_va = 0; + if (ctx->pp_max_stack_size) { + lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_stack, screen->num_pp * + ctx->pp_max_stack_size * pp_stack_pp_size); + pp_stack_va = lima_ctx_buff_va(ctx, lima_ctx_buff_pp_stack, + LIMA_CTX_BUFF_SUBMIT_PP); + } + + lima_update_pp_stream(ctx); + struct lima_pp_stream_state *ps = &ctx->pp_stream; if (screen->gpu_type == DRM_LIMA_PARAM_GPU_ID_MALI400) { struct drm_lima_m400_pp_frame pp_frame = {0}; @@ -1631,8 +1838,9 @@ for (int i = 0; i < screen->num_pp; i++) { pp_frame.plbu_array_address[i] = ps->bo->va + ps->bo_offset + ps->offset[i]; - pp_frame.fragment_stack_address[i] = screen->pp_buffer->va + - pp_stack_offset + pp_stack_pp_size * i; + if (ctx->pp_max_stack_size) + pp_frame.fragment_stack_address[i] = pp_stack_va + + ctx->pp_max_stack_size * pp_stack_pp_size * i; } lima_dump_command_stream_print( @@ -1646,9 +1854,10 @@ lima_pack_pp_frame_reg(ctx, pp_frame.frame, pp_frame.wb); pp_frame.num_pp = screen->num_pp; - for (int i = 0; i < screen->num_pp; i++) - pp_frame.fragment_stack_address[i] = screen->pp_buffer->va + - pp_stack_offset + pp_stack_pp_size * i; + if (ctx->pp_max_stack_size) + for (int i = 0; i < screen->num_pp; i++) + pp_frame.fragment_stack_address[i] = pp_stack_va + + ctx->pp_max_stack_size * pp_stack_pp_size * i; if (ps->bo) { for (int i = 0; i < screen->num_pp; i++) @@ -1686,6 +1895,15 @@ struct lima_surface *surf = lima_surface(ctx->framebuffer.base.cbufs[0]); surf->reload = true; } + + ctx->pp_max_stack_size = 0; + + ctx->damage_rect.minx = ctx->damage_rect.miny = 0xffff; + ctx->damage_rect.maxx = ctx->damage_rect.maxy = 0; + + ctx->resolve = 0; + + lima_dump_file_next(); } void @@ -1702,10 +1920,8 @@ unsigned flags) { struct lima_context *ctx = lima_context(pctx); - if (!lima_ctx_dirty(ctx)) - return; - - _lima_flush(ctx, flags & PIPE_FLUSH_END_OF_FRAME); + if (lima_ctx_dirty(ctx)) + _lima_flush(ctx, flags & PIPE_FLUSH_END_OF_FRAME); if (fence) { int fd; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_format.c mesa-20.0.8/src/gallium/drivers/lima/lima_format.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_format.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2011-2013 Luc Verhaegen + * Copyright (c) 2018-2019 Lima Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#include + +#include + +#include "lima_format.h" + +#define LIMA_TEXEL_FORMAT_L8 0x09 +#define LIMA_TEXEL_FORMAT_A8 0x0a +#define LIMA_TEXEL_FORMAT_I8 0x0b +#define LIMA_TEXEL_FORMAT_BGR_565 0x0e +#define LIMA_TEXEL_FORMAT_L8A8 0x11 +#define LIMA_TEXEL_FORMAT_L16 0x12 +#define LIMA_TEXEL_FORMAT_A16 0x13 +#define LIMA_TEXEL_FORMAT_I16 0x14 +#define LIMA_TEXEL_FORMAT_RGB_888 0x15 +#define LIMA_TEXEL_FORMAT_RGBA_8888 0x16 +#define LIMA_TEXEL_FORMAT_RGBX_8888 0x17 +#define LIMA_TEXEL_FORMAT_Z24S8 0x2c +#define LIMA_TEXEL_FORMAT_NONE -1 + +#define LIMA_PIXEL_FORMAT_B5G6R5 0x00 +#define LIMA_PIXEL_FORMAT_B8G8R8A8 0x03 +#define LIMA_PIXEL_FORMAT_Z16 0x0e +#define LIMA_PIXEL_FORMAT_Z24S8 0x0f +#define LIMA_PIXEL_FORMAT_NONE -1 + +struct lima_format { + bool present; + int texel; + int pixel; + bool swap_r_b; +}; + +#define LIMA_FORMAT(pipe, tex, pix, swap) \ + [PIPE_FORMAT_##pipe] = { \ + .present = true, .texel = LIMA_TEXEL_FORMAT_##tex, \ + .pixel = LIMA_PIXEL_FORMAT_##pix, .swap_r_b = swap, \ + } + +static const struct lima_format lima_format_table[] = { + LIMA_FORMAT(R8G8B8A8_UNORM, RGBA_8888, B8G8R8A8, true), + LIMA_FORMAT(B8G8R8A8_UNORM, RGBA_8888, B8G8R8A8, false), + LIMA_FORMAT(R8G8B8A8_SRGB, RGBA_8888, B8G8R8A8, true), + LIMA_FORMAT(B8G8R8A8_SRGB, RGBA_8888, B8G8R8A8, false), + LIMA_FORMAT(R8G8B8X8_UNORM, RGBX_8888, B8G8R8A8, true), + LIMA_FORMAT(B8G8R8X8_UNORM, RGBX_8888, B8G8R8A8, false), + LIMA_FORMAT(B5G6R5_UNORM, BGR_565, B5G6R5, false), + LIMA_FORMAT(Z24_UNORM_S8_UINT, Z24S8, Z24S8, false), + LIMA_FORMAT(Z24X8_UNORM, Z24S8, Z24S8, false), + /* Blob uses L16 for Z16 */ + LIMA_FORMAT(Z16_UNORM, L16, Z16, false), + LIMA_FORMAT(L16_UNORM, L16, NONE, false), + LIMA_FORMAT(L8_UNORM, L8, NONE, false), + LIMA_FORMAT(A16_UNORM, A16, NONE, false), + LIMA_FORMAT(A8_UNORM, A8, NONE, false), + LIMA_FORMAT(I16_UNORM, I16, NONE, false), + LIMA_FORMAT(I8_UNORM, I8, NONE, false), + LIMA_FORMAT(L8A8_UNORM, L8A8, NONE, false), +}; + +static const struct lima_format * +get_format(enum pipe_format f) +{ + if (f >= ARRAY_SIZE(lima_format_table) || + !lima_format_table[f].present) + return NULL; + + return lima_format_table + f; +} + +bool +lima_format_texel_supported(enum pipe_format f) +{ + const struct lima_format *lf = get_format(f); + + if (!lf) + return false; + + return lf->texel != LIMA_TEXEL_FORMAT_NONE; +} + +bool +lima_format_pixel_supported(enum pipe_format f) +{ + const struct lima_format *lf = get_format(f); + + if (!lf) + return false; + + return lf->pixel != LIMA_PIXEL_FORMAT_NONE; +} + +int +lima_format_get_texel(enum pipe_format f) +{ + return lima_format_table[f].texel; +} + +int +lima_format_get_pixel(enum pipe_format f) +{ + return lima_format_table[f].pixel; +} + +bool +lima_format_get_swap_rb(enum pipe_format f) +{ + return lima_format_table[f].swap_r_b; +} diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_format.h mesa-20.0.8/src/gallium/drivers/lima/lima_format.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_format.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_format.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018-2019 Lima Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#ifndef H_LIMA_FORMAT +#define H_LIMA_FORMAT + +#include + +#include + +bool lima_format_texel_supported(enum pipe_format f); +bool lima_format_pixel_supported(enum pipe_format f); +int lima_format_get_texel(enum pipe_format f); +int lima_format_get_pixel(enum pipe_format f); +bool lima_format_get_swap_rb(enum pipe_format f); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_parser.c mesa-20.0.8/src/gallium/drivers/lima/lima_parser.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_parser.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_parser.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2019 Andreas Baierl + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#include "util/u_math.h" + +#include +#include +#include + +#include "lima_context.h" +#include "lima_parser.h" +#include "lima_texture.h" + +typedef struct { + char *info; +} render_state_info; + +static render_state_info render_state_infos[] = { + { .info = "BLEND_COLOR_BG", }, + { .info = "BLEND_COLOR_RA", }, + { .info = "ALPHA_BLEND", }, + { .info = "DEPTH_TEST", }, + { .info = "DEPTH_RANGE", }, + { .info = "STENCIL_FRONT", }, + { .info = "STENCIL_BACK", }, + { .info = "STENCIL_TEST", }, + { .info = "MULTI_SAMPLE", }, + { .info = "SHADER_ADDRESS (FS)", }, + { .info = "VARYING_TYPES", }, + { .info = "UNIFORMS_ADDRESS (PP)", }, + { .info = "TEXTURES_ADDRESS", }, + { .info = "AUX0", }, + { .info = "AUX1", }, + { .info = "VARYINGS_ADDRESS", }, +}; + +/* VS CMD stream parser functions */ + +static void +parse_vs_draw(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + if ((*value1 == 0x00000000) && (*value2 == 0x00000000)) + fprintf(fp, "\t/* ---EMPTY CMD */\n"); + else + fprintf(fp, "\t/* DRAW: num: %d, index_draw: %s */\n", + (*value1 & 0xff000000) >> 24 | (*value2 & 0x000000ff) << 8, + (*value1 & 0x00000001) ? "true" : "false"); +} + +static void +parse_vs_shader_info(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* SHADER_INFO: prefetch: %d, size: %d */\n", + (*value1 & 0xfff00000) >> 20, + (((*value1 & 0x000fffff) >> 10) + 1) << 4); +} + +static void +parse_vs_unknown1(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* UNKNOWN_1 */\n"); +} + +static void +parse_vs_varying_attribute_count(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VARYING_ATTRIBUTE_COUNT: nr_vary: %d, nr_attr: %d */\n", + ((*value1 & 0x00ffffff) >> 8) + 1, (*value1 >> 24) + 1); +} + +static void +parse_vs_attributes_address(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* ATTRIBUTES_ADDRESS: address: 0x%08x, size: %d */\n", + *value1, (*value2 & 0x0fffffff) >> 17); +} + +static void +parse_vs_varyings_address(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VARYINGS_ADDRESS: varying info @ 0x%08x, size: %d */\n", + *value1, (*value2 & 0x0fffffff) >> 17); +} + +static void +parse_vs_uniforms_address(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* UNIFORMS_ADDRESS (GP): address: 0x%08x, size: %d */\n", + *value1, (*value2 & 0x0fffffff) >> 12); +} + +static void +parse_vs_shader_address(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* SHADER_ADDRESS (VS): address: 0x%08x, size: %d */\n", + *value1, (*value2 & 0x0fffffff) >> 12); +} + +static void +parse_vs_semaphore(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + if (*value1 == 0x00028000) + fprintf(fp, "\t/* SEMAPHORE_BEGIN_1 */\n"); + else if (*value1 == 0x00000001) + fprintf(fp, "\t/* SEMAPHORE_BEGIN_2 */\n"); + else if (*value1 == 0x00000000) + fprintf(fp, "\t/* SEMAPHORE_END: index_draw disabled */\n"); + else if (*value1 == 0x00018000) + fprintf(fp, "\t/* SEMAPHORE_END: index_draw enabled */\n"); + else + fprintf(fp, "\t/* SEMAPHORE - cmd unknown! */\n"); +} + +static void +parse_vs_unknown2(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* UNKNOWN_2 */\n"); +} + +static void +parse_vs_continue(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* CONTINUE: at 0x%08x */\n", *value1); +} + +void +lima_parse_vs(FILE *fp, uint32_t *data, int size, uint32_t start) +{ + uint32_t *value1; + uint32_t *value2; + + fprintf(fp, "\n"); + fprintf(fp, "/* ============ VS CMD STREAM BEGIN ============= */\n"); + for (int i = 0; i * 4 < size; i += 2) { + value1 = &data[i]; + value2 = &data[i + 1]; + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x 0x%08x", + start + i * 4, i * 4, *value1, *value2); + + if ((*value2 & 0xffff0000) == 0x00000000) + parse_vs_draw(fp, value1, value2); + else if ((*value2 & 0xff0000ff) == 0x10000040) + parse_vs_shader_info(fp, value1, value2); + else if ((*value2 & 0xff0000ff) == 0x10000041) + parse_vs_unknown1(fp, value1, value2); + else if ((*value2 & 0xff0000ff) == 0x10000042) + parse_vs_varying_attribute_count(fp, value1, value2); + else if ((*value2 & 0xff0000ff) == 0x20000000) + parse_vs_attributes_address(fp, value1, value2); + else if ((*value2 & 0xff0000ff) == 0x20000008) + parse_vs_varyings_address(fp, value1, value2); + else if ((*value2 & 0xff000000) == 0x30000000) + parse_vs_uniforms_address(fp, value1, value2); + else if ((*value2 & 0xff000000) == 0x40000000) + parse_vs_shader_address(fp, value1, value2); + else if ((*value2 & 0xff000000)== 0x50000000) + parse_vs_semaphore(fp, value1, value2); + else if ((*value2 & 0xff000000) == 0x60000000) + parse_vs_unknown2(fp, value1, value2); + else if ((*value2 & 0xff000000) == 0xf0000000) + parse_vs_continue(fp, value1, value2); + else + fprintf(fp, "\t/* --- unknown cmd --- */\n"); + } + fprintf(fp, "/* ============ VS CMD STREAM END =============== */\n"); + fprintf(fp, "\n"); +} + +/* PLBU CMD stream parser functions */ + +static void +parse_plbu_block_step(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* BLOCK_STEP: shift_min: %d, shift_h: %d, shift_w: %d */\n", + (*value1 & 0xf0000000) >> 28, + (*value1 & 0x0fff0000) >> 16, + *value1 & 0x0000ffff); +} + +static void +parse_plbu_tiled_dimensions(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* TILED_DIMENSIONS: tiled_w: %d, tiled_h: %d */\n", + ((*value1 & 0xff000000) >> 24) + 1, + ((*value1 & 0x00ffff00) >> 8) + 1); +} + +static void +parse_plbu_block_stride(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* BLOCK_STRIDE: block_w: %d */\n", *value1 & 0x000000ff); +} + +static void +parse_plbu_array_address(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* ARRAY_ADDRESS: gp_stream: 0x%08x, block_num (block_w * block_h): %d */\n", + *value1, (*value2 & 0x00ffffff) + 1); +} + +static void +parse_plbu_viewport_left(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VIEWPORT_LEFT: viewport_left: %f */\n", *value1); +} + +static void +parse_plbu_viewport_right(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VIEWPORT_RIGHT: viewport_right: %f */\n", *value1); +} + +static void +parse_plbu_viewport_bottom(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VIEWPORT_BOTTOM: viewport_bottom: %f */\n", *value1); +} + +static void +parse_plbu_viewport_top(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* VIEWPORT_TOP: viewport_top: %f */\n", *value1); +} + +static void +parse_plbu_semaphore(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + if (*value1 == 0x00010002) + fprintf(fp, "\t/* ARRAYS_SEMAPHORE_BEGIN */\n"); + else if (*value1 == 0x00010001) + fprintf(fp, "\t/* ARRAYS_SEMAPHORE_END */\n"); + else + fprintf(fp, "\t/* SEMAPHORE - cmd unknown! */\n"); +} + +static void +parse_plbu_primitive_setup(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + if (*value1 == 0x00000200) + fprintf(fp, "\t/* UNKNOWN_2 (PRIMITIVE_SETUP INIT?) */\n"); + else + fprintf(fp, "\t/* PRIMITIVE_SETUP: %scull: %d (0x%x), index_size: %d */\n", + (*value1 & 0x1000) ? "force point size, " : "", + (*value1 & 0x000f0000) >> 16, (*value1 & 0x000f0000) >> 16, + (*value1 & 0x00000e00) >> 9); +} + +static void +parse_plbu_rsw_vertex_array(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* RSW_VERTEX_ARRAY: rsw: 0x%08x, gl_pos: 0x%08x */\n", + *value1, + (*value2 & 0x0fffffff) << 4); +} + +static void +parse_plbu_scissors(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + float minx = (*value1 & 0xc0000000) >> 30 | (*value2 & 0x00001fff) << 2; + float maxx = ((*value2 & 0x0fffe000) >> 13) + 1; + float miny = *value1 & 0x00003fff; + float maxy = ((*value1 & 0x3fff8000) >> 15) + 1; + + fprintf(fp, "\t/* SCISSORS: minx: %f, maxx: %f, miny: %f, maxy: %f */\n", + minx, maxx, miny, maxy); +} + +static void +parse_plbu_unknown_1(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* UNKNOWN_1 */\n"); +} + +static void +parse_plbu_low_prim_size(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* LOW_PRIM_SIZE: size: %f */\n", *value1); +} + +static void +parse_plbu_depth_range_near(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* DEPTH_RANG_NEAR: depth_range: %f */\n", *value1); +} + +static void +parse_plbu_depth_range_far(FILE *fp, float *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* DEPTH_RANGE_FAR: depth_range: %f */\n", *value1); +} + +static void +parse_plbu_indexed_dest(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* INDEXED_DEST: gl_pos: 0x%08x */\n", *value1); +} + +static void +parse_plbu_indexed_pt_size(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* INDEXED_PT_SIZE: pt_size: 0x%08x */\n", *value1); +} + +static void +parse_plbu_indices(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* INDICES: indices: 0x%08x */\n", *value1); +} + +static void +parse_plbu_draw_arrays(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + if ((*value1 == 0x00000000) && (*value2 == 0x00000000)) { + fprintf(fp, "\t/* ---EMPTY CMD */\n"); + return; + } + + uint32_t count = (*value1 & 0xff000000) >> 24 | (*value2 & 0x000000ff) << 8; + uint32_t start = *value1 & 0x00ffffff; + uint32_t mode = (*value2 & 0x001f0000) >> 16; + + fprintf(fp, "\t/* DRAW_ARRAYS: count: %d, start: %d, mode: %d (0x%x) */\n", + count, start, mode, mode); +} + +static void +parse_plbu_draw_elements(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + uint32_t count = (*value1 & 0xff000000) >> 24 | (*value2 & 0x000000ff) << 8; + uint32_t start = *value1 & 0x00ffffff; + uint32_t mode = (*value2 & 0x001f0000) >> 16; + + fprintf(fp, "\t/* DRAW_ELEMENTS: count: %d, start: %d, mode: %d (0x%x) */\n", + count, start, mode, mode); +} + +static void +parse_plbu_continue(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* CONTINUE: continue at 0x%08x */\n", *value1); +} + +static void +parse_plbu_end(FILE *fp, uint32_t *value1, uint32_t *value2) +{ + fprintf(fp, "\t/* END (FINISH/FLUSH) */\n"); +} + +void +lima_parse_plbu(FILE *fp, uint32_t *data, int size, uint32_t start) +{ + uint32_t *value1; + uint32_t *value2; + + fprintf(fp, "/* ============ PLBU CMD STREAM BEGIN ============= */\n"); + for (int i = 0; i * 4 < size; i += 2) { + value1 = &data[i]; + value2 = &data[i + 1]; + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x 0x%08x", + start + i * 4, i * 4, *value1, *value2); + + if ((*value2 & 0xffe00000) == 0x00000000) + parse_plbu_draw_arrays(fp, value1, value2); + else if ((*value2 & 0xffe00000) == 0x00200000) + parse_plbu_draw_elements(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000100) + parse_plbu_indexed_dest(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000101) + parse_plbu_indices(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000102) + parse_plbu_indexed_pt_size(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000105) + parse_plbu_viewport_bottom(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000106) + parse_plbu_viewport_top(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000107) + parse_plbu_viewport_left(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000108) + parse_plbu_viewport_right(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x10000109) + parse_plbu_tiled_dimensions(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010a) + parse_plbu_unknown_1(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010b) /* also unknown_2 */ + parse_plbu_primitive_setup(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010c) + parse_plbu_block_step(fp, value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010d) + parse_plbu_low_prim_size(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010e) + parse_plbu_depth_range_near(fp, (float *)value1, value2); + else if ((*value2 & 0xff000fff) == 0x1000010f) + parse_plbu_depth_range_far(fp, (float *)value1, value2); + else if ((*value2 & 0xff000000) == 0x28000000) + parse_plbu_array_address(fp, value1, value2); + else if ((*value2 & 0xf0000000) == 0x30000000) + parse_plbu_block_stride(fp, value1, value2); + else if (*value2 == 0x50000000) + parse_plbu_end(fp, value1, value2); + else if ((*value2 & 0xf0000000)== 0x60000000) + parse_plbu_semaphore(fp, value1, value2); + else if ((*value2 & 0xf0000000)== 0x70000000) + parse_plbu_scissors(fp, value1, value2); + else if ((*value2 & 0xf0000000)== 0x80000000) + parse_plbu_rsw_vertex_array(fp, value1, value2); + else if ((*value2 & 0xf0000000)== 0xf0000000) + parse_plbu_continue(fp, value1, value2); + else + fprintf(fp, "\t/* --- unknown cmd --- */\n"); + } + fprintf(fp, "/* ============ PLBU CMD STREAM END =============== */\n"); + fprintf(fp, "\n"); +} + +static void +parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) +{ + fprintf(fp, "\t/* %s", render_state_infos[i].info); + + switch (i) { + case 0: /* BLEND COLOR BG */ + fprintf(fp, ": blend_color.color[1] = %f, blend_color.color[2] = %f */\n", + (float)(ubyte_to_float((*value & 0xffff0000) >> 16)), + (float)(ubyte_to_float(*value & 0x0000ffff))); + break; + case 1: /* BLEND COLOR RA */ + fprintf(fp, ": blend_color.color[3] = %f, blend_color.color[0] = %f */\n", + (float)(ubyte_to_float((*value & 0xffff0000) >> 16)), + (float)(ubyte_to_float(*value & 0x0000ffff))); + break; + case 2: /* ALPHA BLEND */ + fprintf(fp, "(1): colormask 0x%02x, rgb_func %d, alpha_func %d */\n", + (*value & 0xf0000000) >> 28, /* colormask */ + (*value & 0x00000007), /* rgb_func */ + (*value & 0x00000038) >> 3); /* alpha_func */ + /* add a few tabs for alignment */ + fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info); + fprintf(fp, ": rgb_src_factor %d, rbg_dst_factor %d */\n", + (*value & 0x000007c0) >> 6, /* rgb_src_factor */ + (*value & 0x0000f800) >> 11); /* rgb_dst_factor */ + fprintf(fp, "\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info); + fprintf(fp, ": alpha_src_factor %d, alpha_dst_factor %d, bits 24-27 0x%02x */\n", + (*value & 0x000f0000) >> 16, /* alpha_src_factor */ + (*value & 0x00f00000) >> 20, /* alpha_dst_factor */ + (*value & 0x0f000000) >> 24); /* bits 24-27 */ + break; + case 3: /* DEPTH TEST */ + if ((*value & 0x00000001) == 0x00000001) + fprintf(fp, ": depth test enabled && writes allowed"); + else + fprintf(fp, ": depth test disabled || writes not allowed"); + + fprintf(fp, ", PIPE_FUNC_%d", *value & 0x0000000e); + fprintf(fp, ", offset_scale: %d", *value & 0xffff0000); + fprintf(fp, ", unknown bits 4-15: 0x%08x */\n", *value & 0x0000fff0); + break; + case 4: /* DEPTH RANGE */ + fprintf(fp, ": viewport.far = %f, viewport.near = %f */\n", + (float)(ushort_to_float((*value & 0xffff0000) >> 16)), + (float)(ushort_to_float(*value & 0x0000ffff))); + break; + case 5: /* STENCIL FRONT */ + fprintf(fp, "(1): valuemask 0x%02x, ref value %d (0x%02x), stencil_func %d */\n", + (*value & 0xff000000) >> 24, /* valuemask */ + (*value & 0x00ff0000) >> 16, (*value & 0x00ff0000) >> 16, /* ref value */ + (*value & 0x00000007)); /* stencil_func */ + /* add a few tabs for alignment */ + fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info); + fprintf(fp, ": fail_op %d, zfail_op %d, zpass_op %d, unknown (12-15) 0x%02x */\n", + (*value & 0x00000038) >> 3, /* fail_op */ + (*value & 0x000001c0) >> 6, /* zfail_op */ + (*value & 0x00000e00) >> 9, /* zpass_op */ + (*value & 0x0000f000) >> 12); /* unknown */ + break; + case 6: /* STENCIL BACK */ + fprintf(fp, "(1): valuemask 0x%02x, ref value %d (0x%02x), stencil_func %d */\n", + (*value & 0xff000000) >> 24, /* valuemask */ + (*value & 0x00ff0000) >> 16, (*value & 0x00ff0000) >> 16, /* ref value */ + (*value & 0x00000007)); /* stencil_func */ + /* add a few tabs for alignment */ + fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info); + fprintf(fp, ": fail_op %d, zfail_op %d, zpass_op %d, unknown (12-15) 0x%02x */\n", + (*value & 0x00000038) >> 3, /* fail_op */ + (*value & 0x000001c0) >> 6, /* zfail_op */ + (*value & 0x00000e00) >> 9, /* zpass_op */ + (*value & 0x0000f000) >> 12); /* unknown */ + break; + case 7: /* STENCIL TEST */ + fprintf(fp, "(1): stencil_front writemask 0x%02x, stencil_back writemask 0x%02x */\n", + (*value & 0x000000ff), /* front writemask */ + (*value & 0x0000ff00) >> 8); /* back writemask */ + /* add a few tabs for alignment */ + fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info); + fprintf(fp, ": unknown (bits 16-31) 0x%04x */\n", + (*value & 0xffff0000) >> 16); /* unknown, alpha ref_value? */ + break; + case 8: /* MULTI SAMPLE */ + if ((*value & 0x00000f00) == 0x00000000) + fprintf(fp, ": points"); + else if ((*value & 0x00000f00) == 0x00000400) + fprintf(fp, ": lines"); + else if ((*value & 0x00000f00) == 0x00000800) + fprintf(fp, ": triangles"); + else + fprintf(fp, ": unknown"); + + if ((*value & 0x00000078) == 0x00000068) + fprintf(fp, ", fb_samples */\n"); + else if ((*value & 0x00000078) == 0x00000000) + fprintf(fp, " */\n"); + else + fprintf(fp, ", UNKNOWN\n"); + break; + case 9: /* SHADER ADDRESS */ + fprintf(fp, ": fs shader @ 0x%08x, ((uint32_t *)ctx->fs->bo->map)[0] & 0x1f: 0x%08x */\n", + *value & 0xffffffe0, *value & 0x0000001f); + break; + case 10: /* VARYING TYPES */ + fprintf(fp, "(1): "); + int val, j; + /* 0 - 5 */ + for (j = 0; j < 6; j++) { + val = *value & (0x07 << (j * 3)); + fprintf(fp, "val %d-%d, ", j, val); + } + /* 6 - 9 */ + /* add a few tabs for alignment */ + fprintf(fp, "\n\t\t\t\t\t\t/* %s(2): ", render_state_infos[i].info); + for (j = 6; j < 10; j++) { + val = *value & (0x07 << (j * 3)); + fprintf(fp, "val %d-%d, ", j, val); + } + /* 10 */ + val = ((*value & 0x0c000000) >> 30) | ((*helper & 0x00000001) << 2); + fprintf(fp, "val %d-%d, ", j, val); + j++; + /* 11 */ + val = (*helper & 0x0000000e) >> 1; + fprintf(fp, "val %d-%d */\n", j, val); + break; + case 11: /* UNIFORMS ADDRESS */ + fprintf(fp, ": pp uniform info @ 0x%08x, bits: 0x%01x */\n", + *value & 0xfffffff0, *value & 0x0000000f); + break; + case 12: /* TEXTURES ADDRESS */ + fprintf(fp, ": address: 0x%08x */\n", *value); + break; + case 13: /* AUX0 */ + fprintf(fp, ": varying_stride: %d, tex_stateobj.num_samplers: %d */\n", + *value & 0x0000001f, (*value & 0xffffc000) >> 14); + break; + case 14: /* AUX1 */ + fprintf(fp, ": "); + if ((*value & 0x00002000) == 0x00002000) + fprintf(fp, "blend->base.dither true, "); + if ((*value & 0x00010000) == 0x00010000) + fprintf(fp, "ctx->const_buffer[PIPE_SHADER_FRAGMENT].buffer true "); + fprintf(fp, "*/\n"); + break; + case 15: /* VARYINGS ADDRESS */ + fprintf(fp, ": varyings @ 0x%08x */\n", *value & 0xfffffff0); + break; + default: /* should never be executed! */ + fprintf(fp, ": something went wrong!!! */\n"); + break; + } +} + +void +lima_parse_render_state(FILE *fp, uint32_t *data, int size, uint32_t start) +{ + uint32_t *value; + + fprintf(fp, "/* ============ RSW BEGIN ========================= */\n"); + for (int i = 0; i * 4 < size; i++) { + value = &data[i]; + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x", + start + i * 4, i * 4, *value); + if (i == 10) + parse_rsw(fp, value, i, &data[15]); + else + parse_rsw(fp, value, i, NULL); + } + fprintf(fp, "/* ============ RSW END =========================== */\n"); +} + +static void +parse_texture(FILE *fp, uint32_t *data, uint32_t start, uint32_t offset) +{ + uint32_t i = 0; + offset /= 4; + lima_tex_desc *desc = (lima_tex_desc *)&data[offset]; + + /* Word 0 */ + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x\n", + start + i * 4, i * 4, *(&data[i + offset])); + i++; + fprintf(fp, "\t format: 0x%x (%d)\n", desc->format, desc->format); + fprintf(fp, "\t flag1: 0x%x (%d)\n", desc->flag1, desc->flag1); + fprintf(fp, "\t swap_r_b: 0x%x (%d)\n", desc->swap_r_b, desc->swap_r_b); + fprintf(fp, "\t unknown_0_1: 0x%x (%d)\n", desc->unknown_0_1, desc->unknown_0_1); + fprintf(fp, "\t stride: 0x%x (%d)\n", desc->stride, desc->stride); + fprintf(fp, "\t unknown_0_2: 0x%x (%d)\n", desc->unknown_0_2, desc->unknown_0_2); + + /* Word 1 - 3 */ + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x 0x%08x 0x%08x\n", + start + i * 4, i * 4, *(&data[i + offset]), *(&data[i + 1 + offset]), *(&data[i + 2 + offset])); + i += 3; + fprintf(fp, "\t unknown_1_1: 0x%x (%d)\n", desc->unknown_1_1, desc->unknown_1_1); + fprintf(fp, "\t unnorm_coords: 0x%x (%d)\n", desc->unnorm_coords, desc->unnorm_coords); + fprintf(fp, "\t unknown_1_2: 0x%x (%d)\n", desc->unknown_1_2, desc->unknown_1_2); + fprintf(fp, "\t texture_type: 0x%x (%d)\n", desc->texture_type, desc->texture_type); + fprintf(fp, "\t min_lod: 0x%x (%d) (%f)\n", desc->min_lod, desc->min_lod, lima_fixed8_to_float(desc->min_lod)); + fprintf(fp, "\t max_lod: 0x%x (%d) (%f)\n", desc->max_lod, desc->max_lod, lima_fixed8_to_float(desc->max_lod)); + fprintf(fp, "\t lod_bias: 0x%x (%d) (%f)\n", desc->lod_bias, desc->lod_bias, lima_fixed8_to_float(desc->lod_bias)); + fprintf(fp, "\t unknown_2_1: 0x%x (%d)\n", desc->unknown_2_1, desc->unknown_2_1); + fprintf(fp, "\t has_stride: 0x%x (%d)\n", desc->has_stride, desc->has_stride); + fprintf(fp, "\t min_mipfilter_2: 0x%x (%d)\n", desc->min_mipfilter_2, desc->min_mipfilter_2); + fprintf(fp, "\t min_img_filter_nearest: 0x%x (%d)\n", desc->min_img_filter_nearest, desc->min_img_filter_nearest); + fprintf(fp, "\t mag_img_filter_nearest: 0x%x (%d)\n", desc->mag_img_filter_nearest, desc->mag_img_filter_nearest); + fprintf(fp, "\t wrap_s_clamp_to_edge: 0x%x (%d)\n", desc->wrap_s_clamp_to_edge, desc->wrap_s_clamp_to_edge); + fprintf(fp, "\t wrap_s_clamp: 0x%x (%d)\n", desc->wrap_s_clamp, desc->wrap_s_clamp); + fprintf(fp, "\t wrap_s_mirror_repeat: 0x%x (%d)\n", desc->wrap_s_mirror_repeat, desc->wrap_s_mirror_repeat); + fprintf(fp, "\t wrap_t_clamp_to_edge: 0x%x (%d)\n", desc->wrap_t_clamp_to_edge, desc->wrap_t_clamp_to_edge); + fprintf(fp, "\t wrap_t_clamp: 0x%x (%d)\n", desc->wrap_t_clamp, desc->wrap_t_clamp); + fprintf(fp, "\t wrap_t_mirror_repeat: 0x%x (%d)\n", desc->wrap_t_mirror_repeat, desc->wrap_t_mirror_repeat); + fprintf(fp, "\t unknown_2_2: 0x%x (%d)\n", desc->unknown_2_2, desc->unknown_2_2); + fprintf(fp, "\t width: 0x%x (%d)\n", desc->width, desc->width); + fprintf(fp, "\t height: 0x%x (%d)\n", desc->height, desc->height); + fprintf(fp, "\t unknown_3_1: 0x%x (%d)\n", desc->unknown_3_1, desc->unknown_3_1); + fprintf(fp, "\t unknown_3_2: 0x%x (%d)\n", desc->unknown_3_2, desc->unknown_3_2); + + /* Word 4 */ + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x\n", + start + i * 4, i * 4, *(&data[i + offset])); + i++; + fprintf(fp, "\t unknown_4: 0x%x (%d)\n", desc->unknown_4, desc->unknown_4); + + /* Word 5 */ + fprintf(fp, "/* 0x%08x (0x%08x) */\t0x%08x\n", + start + i * 4, i * 4, *(&data[i + offset])); + i++; + fprintf(fp, "\t unknown_5: 0x%x (%d)\n", desc->unknown_5, desc->unknown_5); + + /* Word 6 - */ + fprintf(fp, "/* 0x%08x (0x%08x) */", + start + i * 4, i * 4); + fprintf(fp, "\t"); + + int miplevels = (int)lima_fixed8_to_float(desc->max_lod); + for (int k = 0; k < ((((miplevels + 1) * 26) + 64) / 32); k++) + fprintf(fp, "0x%08x ", *(&data[i + offset + k])); + fprintf(fp, "\n"); + + i++; + fprintf(fp, "\t unknown_6_1: 0x%x (%d)\n", desc->va_s.unknown_6_1, desc->va_s.unknown_6_1); + fprintf(fp, "\t layout: 0x%x (%d)\n", desc->va_s.layout, desc->va_s.layout); + fprintf(fp, "\t unknown_6_2: 0x%x (%d)\n", desc->va_s.unknown_6_2, desc->va_s.unknown_6_2); + fprintf(fp, "\t unknown_6_3: 0x%x (%d)\n", desc->va_s.unknown_6_3, desc->va_s.unknown_6_3); + + /* first level */ + fprintf(fp, "\t va_0: 0x%x \n", desc->va_s.va_0 << 6); + + /* second level up to desc->miplevels */ + int j; + unsigned va_bit_idx; + unsigned va_idx; + uint32_t va; + uint32_t va_1; + uint32_t va_2; + for (j = 1; j <= miplevels; j++) { + va = 0; + va_1 = 0; + va_2 = 0; + + va_bit_idx = VA_BIT_OFFSET + (VA_BIT_SIZE * j); + va_idx = va_bit_idx / 32; + va_bit_idx %= 32; + + /* the first (32 - va_bit_idx) bits */ + va_1 |= (*(&data[i + offset + va_idx - 1]) >> va_bit_idx); + + /* do we need some bits from the following word? */ + if (va_bit_idx > 6) { + /* shift left and right again to erase the unneeded bits, keep space for va1 */ + va_2 |= (*(&data[i + offset + va_idx]) << (2 * 32 - VA_BIT_SIZE - va_bit_idx)); + va_2 >>= ((2 * 32 - VA_BIT_SIZE - va_bit_idx) - (32 - va_bit_idx)); + va |= va_2; + } + va |= va_1; + va <<= 6; + fprintf(fp, "\t va_%d: 0x%x \n", j, va); + } +} + +void +lima_parse_texture_descriptor(FILE *fp, uint32_t *data, int size, uint32_t start, uint32_t offset) +{ + fprintf(fp, "/* ============ TEXTURE BEGIN ===================== */\n"); + parse_texture(fp, data, start, offset); + fprintf(fp, "/* ============ TEXTURE END ======================= */\n"); +} diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_parser.h mesa-20.0.8/src/gallium/drivers/lima/lima_parser.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_parser.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_parser.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2018-2019 Lima Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef H_LIMA_PARSER +#define H_LIMA_PARSER + +void lima_parse_vs(FILE *fp, uint32_t *data, int size, uint32_t start); +void lima_parse_plbu(FILE *fp, uint32_t *data, int size, uint32_t start); +void lima_parse_render_state(FILE *fp, uint32_t *data, int size, uint32_t start); +void lima_parse_texture_descriptor(FILE *fp, uint32_t *data, int size, uint32_t start, uint32_t offset); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_program.c mesa-20.0.8/src/gallium/drivers/lima/lima_program.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -54,6 +54,7 @@ .lower_bitops = true, .lower_rotate = true, .lower_sincos = true, + .lower_fceil = true, }; static const nir_shader_compiler_options fs_nir_options = { @@ -67,6 +68,7 @@ .lower_fsign = true, .lower_rotate = true, .lower_fdot = true, + .lower_fdph = true, .lower_bitops = true, .lower_vector_cmp = true, }; @@ -100,6 +102,7 @@ bool progress; NIR_PASS_V(s, nir_lower_viewport_transform); + NIR_PASS_V(s, nir_lower_point_size, 1.0f, 100.0f); NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size, 0); NIR_PASS_V(s, nir_lower_load_const_to_scalar); NIR_PASS_V(s, lima_nir_lower_uniform_to_scalar); @@ -110,7 +113,7 @@ progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); @@ -128,15 +131,16 @@ } while (progress); NIR_PASS_V(s, nir_lower_int_to_float); - NIR_PASS_V(s, nir_lower_bool_to_float); - - /* Some ops must be lowered after being converted from int ops, - * so re-run nir_opt_algebraic after int lowering. */ + /* Run opt_algebraic between int_to_float and bool_to_float because + * int_to_float emits ftrunc, and ftrunc lowering generates bool ops + */ do { progress = false; NIR_PASS(progress, s, nir_opt_algebraic); } while (progress); + NIR_PASS_V(s, nir_lower_bool_to_float); + NIR_PASS_V(s, nir_copy_prop); NIR_PASS_V(s, nir_opt_dce); NIR_PASS_V(s, nir_lower_locals_to_regs); @@ -145,20 +149,55 @@ nir_sweep(s); } +static bool +lima_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + switch (alu->op) { + case nir_op_frcp: + case nir_op_frsq: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsqrt: + case nir_op_fsin: + case nir_op_fcos: + return true; + default: + break; + } + + /* nir vec4 fcsel assumes that each component of the condition will be + * used to select the same component from the two options, but Utgard PP + * has only 1 component condition. If all condition components are not the + * same we need to lower it to scalar. + */ + switch (alu->op) { + case nir_op_bcsel: + case nir_op_fcsel: + break; + default: + return false; + } + + int num_components = nir_dest_num_components(alu->dest.dest); + + uint8_t swizzle = alu->src[0].swizzle[0]; + + for (int i = 1; i < num_components; i++) + if (alu->src[0].swizzle[i] != swizzle) + return true; + + return false; +} + void lima_program_optimize_fs_nir(struct nir_shader *s) { - BITSET_DECLARE(alu_lower, nir_num_opcodes) = {0}; bool progress; - BITSET_SET(alu_lower, nir_op_frcp); - BITSET_SET(alu_lower, nir_op_frsq); - BITSET_SET(alu_lower, nir_op_flog2); - BITSET_SET(alu_lower, nir_op_fexp2); - BITSET_SET(alu_lower, nir_op_fsqrt); - BITSET_SET(alu_lower, nir_op_fsin); - BITSET_SET(alu_lower, nir_op_fcos); - NIR_PASS_V(s, nir_lower_fragcoord_wtrans); NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size, 0); NIR_PASS_V(s, nir_lower_regs_to_ssa); @@ -166,10 +205,14 @@ do { progress = false; + NIR_PASS(progress, s, nir_opt_vectorize); + } while (progress); + + do { + progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, alu_lower); - NIR_PASS(progress, s, nir_lower_phis_to_scalar); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, lima_alu_to_scalar_filter_cb, NULL); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); NIR_PASS(progress, s, nir_opt_dce); @@ -183,6 +226,7 @@ nir_var_shader_in | nir_var_shader_out | nir_var_function_temp); + NIR_PASS(progress, s, lima_nir_split_load_input); } while (progress); NIR_PASS_V(s, nir_lower_int_to_float); @@ -243,6 +287,8 @@ return NULL; } + so->uses_discard = nir->info.fs.uses_discard; + return so; } @@ -261,7 +307,7 @@ struct lima_fs_shader_state *so = hwcso; if (so->bo) - lima_bo_free(so->bo); + lima_bo_unreference(so->bo); ralloc_free(so); } @@ -303,6 +349,8 @@ fs->shader = NULL; } + ctx->pp_max_stack_size = MAX2(ctx->pp_max_stack_size, ctx->fs->stack_size); + return true; } @@ -335,6 +383,8 @@ return NULL; } + ralloc_free(nir); + return so; } @@ -353,7 +403,7 @@ struct lima_vs_shader_state *so = hwcso; if (so->bo) - lima_bo_free(so->bo); + lima_bo_unreference(so->bo); ralloc_free(so); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_resource.c mesa-20.0.8/src/gallium/drivers/lima/lima_resource.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ #include "util/u_memory.h" #include "util/u_blitter.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_debug.h" @@ -116,6 +116,7 @@ res->levels[level].width = aligned_width; res->levels[level].stride = stride; res->levels[level].offset = size; + res->levels[level].layer_stride = util_format_get_stride(pres->format, align(width, 16)) * align(height, 16); /* The start address of each level <= 10 must be 64-aligned * in order to be able to pass the addresses @@ -177,9 +178,13 @@ int count) { struct lima_screen *screen = lima_screen(pscreen); - bool should_tile = false; + bool should_tile = lima_debug & LIMA_DEBUG_NO_TILING ? false : true; unsigned width, height; bool should_align_dimensions; + bool has_user_modifiers = true; + + if (count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID) + has_user_modifiers = false; /* VBOs/PBOs are untiled (and 1 height). */ if (templat->target == PIPE_BUFFER) @@ -188,9 +193,17 @@ if (templat->bind & (PIPE_BIND_LINEAR | PIPE_BIND_SCANOUT)) should_tile = false; - /* if linear buffer is not allowed, alloc fail */ - if (!should_tile && !drm_find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count)) - return NULL; + /* If there's no user modifiers and buffer is shared we use linear */ + if (!has_user_modifiers && (templat->bind & PIPE_BIND_SHARED)) + should_tile = false; + + if (drm_find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count)) + should_tile = false; + + if (has_user_modifiers && + !drm_find_modifier(DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, + modifiers, count)) + should_tile = false; if (should_tile || (templat->bind & PIPE_BIND_RENDER_TARGET) || (templat->bind & PIPE_BIND_DEPTH_STENCIL)) { @@ -227,10 +240,9 @@ lima_resource_create(struct pipe_screen *pscreen, const struct pipe_resource *templat) { - static const uint64_t modifiers[] = { - DRM_FORMAT_MOD_LINEAR, - }; - return _lima_resource_create_with_modifiers(pscreen, templat, modifiers, ARRAY_SIZE(modifiers)); + const uint64_t mod = DRM_FORMAT_MOD_INVALID; + + return _lima_resource_create_with_modifiers(pscreen, templat, &mod, 1); } static struct pipe_resource * @@ -259,11 +271,14 @@ struct lima_resource *res = lima_resource(pres); if (res->bo) - lima_bo_free(res->bo); + lima_bo_unreference(res->bo); if (res->scanout) renderonly_scanout_destroy(res->scanout, screen->ro); + if (res->damage.region) + FREE(res->damage.region); + FREE(res); } @@ -311,8 +326,24 @@ else res->levels[0].width = pres->width0; - handle->modifier = DRM_FORMAT_MOD_LINEAR; - res->tiled = false; + switch (handle->modifier) { + case DRM_FORMAT_MOD_LINEAR: + res->tiled = false; + break; + case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: + res->tiled = true; + break; + case DRM_FORMAT_MOD_INVALID: + /* Modifier wasn't specified and it's shared buffer. We create these + * as linear, so disable tiling. + */ + res->tiled = false; + break; + default: + fprintf(stderr, "Attempted to import unsupported modifier 0x%llx\n", + (long long)handle->modifier); + goto err_out; + } return pres; @@ -330,7 +361,10 @@ struct lima_screen *screen = lima_screen(pscreen); struct lima_resource *res = lima_resource(pres); - handle->modifier = DRM_FORMAT_MOD_LINEAR; + if (res->tiled) + handle->modifier = DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED; + else + handle->modifier = DRM_FORMAT_MOD_LINEAR; if (handle->type == WINSYS_HANDLE_TYPE_KMS && screen->ro && renderonly_get_handle(res->scanout, handle)) @@ -343,6 +377,93 @@ return true; } +static void +get_scissor_from_box(struct pipe_scissor_state *s, + const struct pipe_box *b, int h) +{ + int y = h - (b->y + b->height); + /* region in tile unit */ + s->minx = b->x >> 4; + s->miny = y >> 4; + s->maxx = (b->x + b->width + 0xf) >> 4; + s->maxy = (y + b->height + 0xf) >> 4; +} + +static void +get_damage_bound_box(struct pipe_resource *pres, + const struct pipe_box *rects, + unsigned int nrects, + struct pipe_scissor_state *bound) +{ + struct pipe_box b = rects[0]; + + for (int i = 1; i < nrects; i++) + u_box_union_2d(&b, &b, rects + i); + + int ret = u_box_clip_2d(&b, &b, pres->width0, pres->height0); + if (ret < 0) + memset(bound, 0, sizeof(*bound)); + else + get_scissor_from_box(bound, &b, pres->height0); +} + +static void +lima_resource_set_damage_region(struct pipe_screen *pscreen, + struct pipe_resource *pres, + unsigned int nrects, + const struct pipe_box *rects) +{ + struct lima_resource *res = lima_resource(pres); + struct lima_damage_region *damage = &res->damage; + int i; + + if (damage->region) { + FREE(damage->region); + damage->region = NULL; + damage->num_region = 0; + } + + if (!nrects) + return; + + /* check full damage + * + * TODO: currently only check if there is any single damage + * region that can cover the full render target; there may + * be some accurate way, but a single window size damage + * region is most of the case from weston + */ + for (i = 0; i < nrects; i++) { + if (rects[i].x <= 0 && rects[i].y <= 0 && + rects[i].x + rects[i].width >= pres->width0 && + rects[i].y + rects[i].height >= pres->height0) + return; + } + + struct pipe_scissor_state *bound = &damage->bound; + get_damage_bound_box(pres, rects, nrects, bound); + + damage->region = CALLOC(nrects, sizeof(*damage->region)); + if (!damage->region) + return; + + for (i = 0; i < nrects; i++) + get_scissor_from_box(damage->region + i, rects + i, + pres->height0); + + /* is region aligned to tiles? */ + damage->aligned = true; + for (i = 0; i < nrects; i++) { + if (rects[i].x & 0xf || rects[i].y & 0xf || + rects[i].width & 0xf || rects[i].height & 0xf) { + damage->aligned = false; + break; + } + } + + damage->num_region = nrects; +} + void lima_resource_screen_init(struct lima_screen *screen) { @@ -351,6 +472,7 @@ screen->base.resource_from_handle = lima_resource_from_handle; screen->base.resource_destroy = lima_resource_destroy; screen->base.resource_get_handle = lima_resource_get_handle; + screen->base.set_damage_region = lima_resource_set_damage_region; } static struct pipe_surface * @@ -437,7 +559,7 @@ struct lima_ctx_plb_pp_stream *s = entry->data; if (--s->refcnt == 0) { if (s->bo) - lima_bo_free(s->bo); + lima_bo_unreference(s->bo); _mesa_hash_table_remove(ctx->plb_pp_stream, entry); ralloc_free(s); } @@ -504,20 +626,26 @@ trans->staging = malloc(ptrans->stride * ptrans->box.height * ptrans->box.depth); - if (usage & PIPE_TRANSFER_READ) - panfrost_load_tiled_image(trans->staging, bo->map + res->levels[level].offset, - &ptrans->box, - ptrans->stride, - res->levels[level].stride, - util_format_get_blocksize(pres->format)); + if (usage & PIPE_TRANSFER_READ) { + unsigned i; + for (i = 0; i < ptrans->box.depth; i++) + panfrost_load_tiled_image( + trans->staging + i * ptrans->stride * ptrans->box.height, + bo->map + res->levels[level].offset + (i + box->z) * res->levels[level].layer_stride, + ptrans->box.x, ptrans->box.y, + ptrans->box.width, ptrans->box.height, + ptrans->stride, + res->levels[level].stride, + pres->format); + } return trans->staging; } else { ptrans->stride = res->levels[level].stride; - ptrans->layer_stride = ptrans->stride * box->height; + ptrans->layer_stride = res->levels[level].layer_stride; return bo->map + res->levels[level].offset + - box->z * ptrans->layer_stride + + box->z * res->levels[level].layer_stride + box->y / util_format_get_blockheight(pres->format) * ptrans->stride + box->x / util_format_get_blockwidth(pres->format) * util_format_get_blocksize(pres->format); @@ -544,12 +672,18 @@ if (trans->staging) { pres = &res->base; - if (ptrans->usage & PIPE_TRANSFER_WRITE) - panfrost_store_tiled_image(bo->map + res->levels[ptrans->level].offset, trans->staging, - &ptrans->box, - res->levels[ptrans->level].stride, - ptrans->stride, - util_format_get_blocksize(pres->format)); + if (ptrans->usage & PIPE_TRANSFER_WRITE) { + unsigned i; + for (i = 0; i < ptrans->box.depth; i++) + panfrost_store_tiled_image( + bo->map + res->levels[ptrans->level].offset + (i + ptrans->box.z) * res->levels[ptrans->level].layer_stride, + trans->staging + i * ptrans->stride * ptrans->box.height, + ptrans->box.x, ptrans->box.y, + ptrans->box.width, ptrans->box.height, + res->levels[ptrans->level].stride, + ptrans->stride, + pres->format); + } free(trans->staging); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_resource.h mesa-20.0.8/src/gallium/drivers/lima/lima_resource.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -36,11 +36,20 @@ uint32_t width; uint32_t stride; uint32_t offset; + uint32_t layer_stride; +}; + +struct lima_damage_region { + struct pipe_scissor_state *region; + struct pipe_scissor_state bound; + unsigned num_region; + bool aligned; }; struct lima_resource { struct pipe_resource base; + struct lima_damage_region damage; struct renderonly_scanout *scanout; struct lima_bo *bo; bool tiled; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_screen.c mesa-20.0.8/src/gallium/drivers/lima/lima_screen.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,8 @@ #include "lima_program.h" #include "lima_bo.h" #include "lima_fence.h" -#include "lima_texture.h" +#include "lima_format.h" +#include "lima_util.h" #include "ir/lima_ir.h" #include "xf86drm.h" @@ -50,10 +51,7 @@ { struct lima_screen *screen = lima_screen(pscreen); - if (lima_dump_command_stream) { - fclose(lima_dump_command_stream); - lima_dump_command_stream = NULL; - } + lima_dump_file_close(); slab_destroy_parent(&screen->transfer_pool); @@ -61,8 +59,9 @@ free(screen->ro); if (screen->pp_buffer) - lima_bo_free(screen->pp_buffer); + lima_bo_unreference(screen->pp_buffer); + lima_bo_cache_fini(screen); lima_bo_table_fini(screen); ralloc_free(screen); } @@ -103,6 +102,7 @@ case PIPE_CAP_ACCELERATED: case PIPE_CAP_UMA: case PIPE_CAP_NATIVE_FENCE_FD: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: return 1; /* Unimplemented, but for exporting OpenGL 2.0 */ @@ -143,6 +143,12 @@ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; + case PIPE_CAP_ALPHA_TEST: + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: + case PIPE_CAP_CLIP_PLANES: + return 0; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -156,11 +162,11 @@ case PIPE_CAPF_MAX_LINE_WIDTH_AA: case PIPE_CAPF_MAX_POINT_WIDTH: case PIPE_CAPF_MAX_POINT_WIDTH_AA: - return 255.0f; + return 100.0f; case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: return 16.0f; case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0f; + return 15.0f; default: return 0.0f; @@ -178,6 +184,9 @@ case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: return 16384; /* need investigate */ + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 1024; + case PIPE_SHADER_CAP_MAX_INPUTS: return 16; /* attributes */ @@ -185,7 +194,8 @@ return LIMA_MAX_VARYING_NUM; /* varying */ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return 4096; /* need investigate */ + return 16 * 1024 * sizeof(float); + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: return 1; @@ -195,6 +205,9 @@ case PIPE_SHADER_CAP_MAX_TEMPS: return 256; /* need investigate */ + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; + default: return 0; } @@ -214,8 +227,12 @@ case PIPE_SHADER_CAP_MAX_INPUTS: return LIMA_MAX_VARYING_NUM - 1; /* varying, minus gl_Position */ + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 1024; + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return 4096; /* need investigate */ + return 16 * 1024 * sizeof(float); + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: return 1; @@ -228,6 +245,17 @@ case PIPE_SHADER_CAP_MAX_TEMPS: return 256; /* need investigate */ + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + return 1; + + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + return 0; + + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; + default: return 0; } @@ -263,6 +291,8 @@ case PIPE_BUFFER: case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_CUBE: break; default: return false; @@ -275,20 +305,9 @@ if (sample_count > 1 && sample_count != 4) return false; - if (usage & PIPE_BIND_RENDER_TARGET) { - switch (format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R8G8B8X8_UNORM: - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - break; - default: - return false; - } - } + if (usage & PIPE_BIND_RENDER_TARGET && + !lima_format_pixel_supported(format)) + return false; if (usage & PIPE_BIND_DEPTH_STENCIL) { switch (format) { @@ -322,7 +341,7 @@ } if (usage & PIPE_BIND_SAMPLER_VIEW) - return lima_texel_format_supported(format); + return lima_format_texel_supported(format); return true; } @@ -369,6 +388,18 @@ static bool lima_screen_query_info(struct lima_screen *screen) { + drmVersionPtr version = drmGetVersion(screen->fd); + if (!version) + return false; + + if (version->version_major > 1 || version->version_minor > 0) + screen->has_growable_heap_buffer = true; + + drmFreeVersion(version); + + if (lima_debug & LIMA_DEBUG_NO_GROW_HEAP) + screen->has_growable_heap_buffer = false; + struct drm_lima_get_param param; memset(¶m, 0, sizeof(param)); @@ -405,6 +436,7 @@ int *count) { uint64_t available_modifiers[] = { + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, DRM_FORMAT_MOD_LINEAR, }; @@ -429,6 +461,14 @@ "dump GPU command stream to $PWD/lima.dump" }, { "shaderdb", LIMA_DEBUG_SHADERDB, "print shader information for shaderdb" }, + { "nobocache", LIMA_DEBUG_NO_BO_CACHE, + "disable BO cache" }, + { "bocache", LIMA_DEBUG_BO_CACHE, + "print debug info for BO cache" }, + { "notiling", LIMA_DEBUG_NO_TILING, + "don't use tiled buffers" }, + { "nogrowheap", LIMA_DEBUG_NO_GROW_HEAP, + "disable growable heap buffer" }, { NULL } }; @@ -440,14 +480,8 @@ { lima_debug = debug_get_option_lima_debug(); - if (lima_debug & LIMA_DEBUG_DUMP) { - const char *dump_command = "lima.dump"; - printf("lima: dump command stream to file %s\n", dump_command); - lima_dump_command_stream = fopen(dump_command, "w"); - if (!lima_dump_command_stream) - fprintf(stderr, "lima: fail to open command stream log file %s\n", - dump_command); - } + if (lima_debug & LIMA_DEBUG_DUMP) + lima_dump_file_open(); lima_ctx_num_plb = debug_get_num_option("LIMA_CTX_NUM_PLB", LIMA_CTX_PLB_DEF_NUM); if (lima_ctx_num_plb > LIMA_CTX_PLB_MAX_NUM || @@ -489,16 +523,20 @@ if (!lima_screen_query_info(screen)) goto err_out0; - if (!lima_bo_table_init(screen)) + if (!lima_bo_cache_init(screen)) goto err_out0; + if (!lima_bo_table_init(screen)) + goto err_out1; + screen->pp_ra = ppir_regalloc_init(screen); if (!screen->pp_ra) - goto err_out1; + goto err_out2; screen->pp_buffer = lima_bo_create(screen, pp_buffer_size, 0); if (!screen->pp_buffer) - goto err_out1; + goto err_out2; + screen->pp_buffer->cacheable = false; /* fs program for clear buffer? * const0 1 0 0 -1.67773, mov.v0 $0 ^const0.xxxx, stop @@ -545,7 +583,7 @@ screen->ro = renderonly_dup(ro); if (!screen->ro) { fprintf(stderr, "Failed to dup renderonly object\n"); - goto err_out2; + goto err_out3; } } @@ -570,10 +608,12 @@ return &screen->base; +err_out3: + lima_bo_unreference(screen->pp_buffer); err_out2: - lima_bo_free(screen->pp_buffer); -err_out1: lima_bo_table_fini(screen); +err_out1: + lima_bo_cache_fini(screen); err_out0: ralloc_free(screen); return NULL; diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_screen.h mesa-20.0.8/src/gallium/drivers/lima/lima_screen.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,19 +33,27 @@ #include "pipe/p_screen.h" -#define LIMA_DEBUG_GP (1 << 0) -#define LIMA_DEBUG_PP (1 << 1) -#define LIMA_DEBUG_DUMP (1 << 2) -#define LIMA_DEBUG_SHADERDB (1 << 3) +#define LIMA_DEBUG_GP (1 << 0) +#define LIMA_DEBUG_PP (1 << 1) +#define LIMA_DEBUG_DUMP (1 << 2) +#define LIMA_DEBUG_SHADERDB (1 << 3) +#define LIMA_DEBUG_NO_BO_CACHE (1 << 4) +#define LIMA_DEBUG_BO_CACHE (1 << 5) +#define LIMA_DEBUG_NO_TILING (1 << 6) +#define LIMA_DEBUG_NO_GROW_HEAP (1 << 7) extern uint32_t lima_debug; -extern FILE *lima_dump_command_stream; extern int lima_ctx_num_plb; extern int lima_plb_max_blk; extern int lima_ppir_force_spilling; struct ra_regs; +#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */ +#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */ + +#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1) + struct lima_screen { struct pipe_screen base; struct renderonly *ro; @@ -60,8 +68,11 @@ /* bo table */ mtx_t bo_table_lock; + mtx_t bo_cache_lock; struct util_hash_table *bo_handles; struct util_hash_table *bo_flink_names; + struct list_head bo_cache_buckets[NR_BO_CACHE_BUCKETS]; + struct list_head bo_cache_time; struct slab_parent_pool transfer_pool; @@ -73,11 +84,9 @@ #define pp_reload_program_offset 0x0080 #define pp_shared_index_offset 0x00c0 #define pp_clear_gl_pos_offset 0x0100 - #define pp_stack_offset 0x1000 - #define pp_stack_pp_size 0x400 /* per pp, up to 8 pp */ - #define pp_stack_offset_end 0x3000 - #define pp_buffer_size 0x3000 + #define pp_buffer_size 0x1000 + bool has_growable_heap_buffer; }; static inline struct lima_screen * diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_state.c mesa-20.0.8/src/gallium/drivers/lima/lima_state.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -226,7 +226,7 @@ struct lima_context *ctx = lima_context(pctx); struct lima_context_vertex_buffer *so = &ctx->vertex_buffers; - util_set_vertex_buffers_mask(so->vb + start_slot, &so->enabled_mask, + util_set_vertex_buffers_mask(so->vb, &so->enabled_mask, vb, start_slot, count); so->count = util_last_bit(so->enabled_mask); @@ -242,14 +242,18 @@ struct lima_context *ctx = lima_context(pctx); /* reverse calculate the parameter of glViewport */ - ctx->viewport.x = viewport->translate[0] - viewport->scale[0]; - ctx->viewport.y = fabsf(viewport->translate[1] - fabsf(viewport->scale[1])); - ctx->viewport.width = viewport->scale[0] * 2; - ctx->viewport.height = fabsf(viewport->scale[1] * 2); + ctx->viewport.left = viewport->translate[0] - fabsf(viewport->scale[0]); + ctx->viewport.right = viewport->translate[0] + fabsf(viewport->scale[0]); + ctx->viewport.bottom = viewport->translate[1] - fabsf(viewport->scale[1]); + ctx->viewport.top = viewport->translate[1] + fabsf(viewport->scale[1]); /* reverse calculate the parameter of glDepthRange */ - ctx->viewport.near = viewport->translate[2] - viewport->scale[2]; - ctx->viewport.far = viewport->translate[2] + viewport->scale[2]; + float near, far; + near = viewport->translate[2] - viewport->scale[2]; + far = viewport->translate[2] + viewport->scale[2]; + + ctx->viewport.near = MIN2(near, far); + ctx->viewport.far = MAX2(near, far); ctx->viewport.transform = *viewport; ctx->dirty |= LIMA_CONTEXT_DIRTY_VIEWPORT; @@ -414,50 +418,6 @@ ctx->dirty |= LIMA_CONTEXT_DIRTY_TEXTURES; } -UNUSED static bool -lima_set_damage_region(struct pipe_context *pctx, unsigned num_rects, int *rects) -{ - struct lima_context *ctx = lima_context(pctx); - struct lima_damage_state *damage = &ctx->damage; - int i; - - if (damage->region) - ralloc_free(damage->region); - - if (!num_rects) { - damage->region = NULL; - damage->num_region = 0; - return true; - } - - damage->region = ralloc_size(ctx, sizeof(*damage->region) * num_rects); - if (!damage->region) { - damage->num_region = 0; - return false; - } - - for (i = 0; i < num_rects; i++) { - struct pipe_scissor_state *r = damage->region + i; - /* region in tile unit */ - r->minx = rects[i * 4] >> 4; - r->miny = rects[i * 4 + 1] >> 4; - r->maxx = (rects[i * 4] + rects[i * 4 + 2] + 0xf) >> 4; - r->maxy = (rects[i * 4 + 1] + rects[i * 4 + 3] + 0xf) >> 4; - } - - /* is region aligned to tiles? */ - damage->aligned = true; - for (i = 0; i < num_rects * 4; i++) { - if (rects[i] & 0xf) { - damage->aligned = false; - break; - } - } - - damage->num_region = num_rects; - return true; -} - static void lima_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask) diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_submit.c mesa-20.0.8/src/gallium/drivers/lima/lima_submit.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_submit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_submit.c 2020-06-12 01:21:17.000000000 +0000 @@ -78,6 +78,7 @@ goto err_out1; util_dynarray_init(&s->gem_bos, s); + util_dynarray_init(&s->bos, s); return s; @@ -128,6 +129,7 @@ .bos = VOID2U64(util_dynarray_begin(&submit->gem_bos)), .frame = VOID2U64(frame), .frame_size = size, + .out_sync = submit->out_sync, }; if (submit->in_sync_fd >= 0) { @@ -144,7 +146,7 @@ bool ret = drmIoctl(submit->screen->fd, DRM_IOCTL_LIMA_GEM_SUBMIT, &req) == 0; util_dynarray_foreach(&submit->bos, struct lima_bo *, bo) { - lima_bo_free(*bo); + lima_bo_unreference(*bo); } util_dynarray_clear(&submit->gem_bos); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_texture.c mesa-20.0.8/src/gallium/drivers/lima/lima_texture.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,66 +36,15 @@ #include "lima_resource.h" #include "lima_submit.h" #include "lima_util.h" +#include "lima_format.h" #include -#define LIMA_TEXEL_FORMAT_L8 0x09 -#define LIMA_TEXEL_FORMAT_A8 0x0a -#define LIMA_TEXEL_FORMAT_I8 0x0b -#define LIMA_TEXEL_FORMAT_BGR_565 0x0e -#define LIMA_TEXEL_FORMAT_L8A8 0x11 -#define LIMA_TEXEL_FORMAT_L16 0x12 -#define LIMA_TEXEL_FORMAT_A16 0x13 -#define LIMA_TEXEL_FORMAT_I16 0x14 -#define LIMA_TEXEL_FORMAT_RGB_888 0x15 -#define LIMA_TEXEL_FORMAT_RGBA_8888 0x16 -#define LIMA_TEXEL_FORMAT_RGBX_8888 0x17 -#define LIMA_TEXEL_FORMAT_Z24S8 0x2c #define lima_tex_list_size 64 -typedef struct { - bool present; - uint32_t lima_format; - bool swap_r_b; -} lima_format; - -#define LIMA_FORMAT(pipe, lima, swap) \ - [PIPE_FORMAT_##pipe] = { .present = true, .lima_format = lima, \ - .swap_r_b = swap } - -static const lima_format lima_format_table[] = { - LIMA_FORMAT(R8G8B8A8_UNORM, LIMA_TEXEL_FORMAT_RGBA_8888, true), - LIMA_FORMAT(B8G8R8A8_UNORM, LIMA_TEXEL_FORMAT_RGBA_8888, false), - LIMA_FORMAT(R8G8B8A8_SRGB, LIMA_TEXEL_FORMAT_RGBA_8888, true), - LIMA_FORMAT(B8G8R8A8_SRGB, LIMA_TEXEL_FORMAT_RGBA_8888, false), - LIMA_FORMAT(R8G8B8X8_UNORM, LIMA_TEXEL_FORMAT_RGBX_8888, true), - LIMA_FORMAT(B8G8R8X8_UNORM, LIMA_TEXEL_FORMAT_RGBX_8888, false), - LIMA_FORMAT(R8G8B8_UNORM, LIMA_TEXEL_FORMAT_RGB_888, true), - LIMA_FORMAT(B5G6R5_UNORM, LIMA_TEXEL_FORMAT_BGR_565, false), - LIMA_FORMAT(Z24_UNORM_S8_UINT, LIMA_TEXEL_FORMAT_Z24S8, false), - LIMA_FORMAT(Z24X8_UNORM, LIMA_TEXEL_FORMAT_Z24S8, false), - /* Blob uses L16 for Z16 */ - LIMA_FORMAT(Z16_UNORM, LIMA_TEXEL_FORMAT_L16, false), - LIMA_FORMAT(L16_UNORM, LIMA_TEXEL_FORMAT_L16, false), - LIMA_FORMAT(L8_UNORM, LIMA_TEXEL_FORMAT_L8, false), - LIMA_FORMAT(A16_UNORM, LIMA_TEXEL_FORMAT_A16, false), - LIMA_FORMAT(A8_UNORM, LIMA_TEXEL_FORMAT_A8, false), - LIMA_FORMAT(I16_UNORM, LIMA_TEXEL_FORMAT_I16, false), - LIMA_FORMAT(I8_UNORM, LIMA_TEXEL_FORMAT_I8, false), - LIMA_FORMAT(L8A8_UNORM, LIMA_TEXEL_FORMAT_L8A8, false), -}; - static_assert(offsetof(lima_tex_desc, va) == 24, "lima_tex_desc->va offset isn't 24"); -bool -lima_texel_format_supported(enum pipe_format pformat) -{ - if (pformat >= ARRAY_SIZE(lima_format_table)) - return false; - - return lima_format_table[pformat].present; -} static void lima_texture_desc_set_va(lima_tex_desc *desc, @@ -129,11 +78,8 @@ height = u_minify(height, first_level); } - assert(prsc->format < ARRAY_SIZE(lima_format_table)); - assert(lima_format_table[prsc->format].present); - - desc->format = lima_format_table[prsc->format].lima_format; - desc->swap_r_b = lima_format_table[prsc->format].swap_r_b; + desc->format = lima_format_get_texel(prsc->format); + desc->swap_r_b = lima_format_get_swap_rb(prsc->format); desc->width = width; desc->height = height; desc->unknown_3_1 = 1; @@ -143,7 +89,7 @@ else { /* for padded linear texture */ if (lima_res->levels[first_level].width != width) { - desc->stride = lima_res->levels[first_level].width; + desc->stride = lima_res->levels[first_level].stride; desc->has_stride = 1; } layout = 0; @@ -172,40 +118,58 @@ struct lima_sampler_view *texture, void *pdesc, unsigned desc_size) { + /* unit is 1/16 since lod_bias is in fixed format */ + int lod_bias_delta = 0; lima_tex_desc *desc = pdesc; unsigned first_level; unsigned last_level; - bool mipmapping; + float max_lod; memset(desc, 0, desc_size); - /* 2D texture */ - desc->texture_2d = 1; + switch (texture->base.target) { + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + desc->texture_type = LIMA_TEXTURE_TYPE_2D; + break; + case PIPE_TEXTURE_CUBE: + desc->texture_type = LIMA_TEXTURE_TYPE_CUBE; + break; + default: + break; + } + + if (!sampler->base.normalized_coords) + desc->unnorm_coords = 1; first_level = texture->base.u.tex.first_level; last_level = texture->base.u.tex.last_level; if (last_level - first_level >= LIMA_MAX_MIP_LEVELS) last_level = first_level + LIMA_MAX_MIP_LEVELS - 1; + desc->min_lod = lima_float_to_fixed8(sampler->base.min_lod); + max_lod = MIN2(sampler->base.max_lod, sampler->base.min_lod + + (last_level - first_level)); + desc->max_lod = lima_float_to_fixed8(max_lod); + desc->lod_bias = lima_float_to_fixed8(sampler->base.lod_bias); + switch (sampler->base.min_mip_filter) { case PIPE_TEX_MIPFILTER_LINEAR: - desc->min_mipfilter = 3; + desc->min_mipfilter_2 = 3; + break; case PIPE_TEX_MIPFILTER_NEAREST: - mipmapping = true; - desc->miplevels = (last_level - first_level); + desc->min_mipfilter_2 = 0; break; case PIPE_TEX_MIPFILTER_NONE: + desc->max_lod = desc->min_lod; + break; default: - mipmapping = false; break; } switch (sampler->base.mag_img_filter) { case PIPE_TEX_FILTER_LINEAR: desc->mag_img_filter_nearest = 0; - /* no mipmap, filter_mag = linear */ - if (!mipmapping) - desc->disable_mipmap = 1; break; case PIPE_TEX_FILTER_NEAREST: default: @@ -220,6 +184,7 @@ break; case PIPE_TEX_FILTER_NEAREST: default: + lod_bias_delta = 8; desc->min_img_filter_nearest = 1; break; } @@ -258,6 +223,13 @@ break; } + if (desc->min_img_filter_nearest && desc->mag_img_filter_nearest && + desc->min_mipfilter_2 == 0 && + (desc->min_lod != desc->max_lod)) + lod_bias_delta = -1; + + desc->lod_bias += lod_bias_delta; + lima_texture_desc_set_res(ctx, desc, texture->base.texture, first_level, last_level); } @@ -298,7 +270,7 @@ } uint32_t *descs = - lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_tex_desc, size, true); + lima_ctx_buff_alloc(ctx, lima_ctx_buff_pp_tex_desc, size); off_t offset = lima_tex_list_size; for (int i = 0; i < lima_tex->num_samplers; i++) { @@ -315,4 +287,9 @@ lima_dump_command_stream_print( descs, size, false, "add textures_desc at va %x\n", lima_ctx_buff_va(ctx, lima_ctx_buff_pp_tex_desc, 0)); + + lima_dump_texture_descriptor( + descs, size, + lima_ctx_buff_va(ctx, lima_ctx_buff_pp_tex_desc, 0) + lima_tex_list_size, + lima_tex_list_size); } diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_texture.h mesa-20.0.8/src/gallium/drivers/lima/lima_texture.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_texture.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_texture.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,27 +27,29 @@ #define lima_min_tex_desc_size 64 +#define LIMA_TEXTURE_TYPE_2D 2 +#define LIMA_TEXTURE_TYPE_CUBE 5 + typedef struct __attribute__((__packed__)) { /* Word 0 */ uint32_t format : 6; uint32_t flag1: 1; uint32_t swap_r_b: 1; - uint32_t unknown_0_1: 10; - uint32_t stride: 13; + uint32_t unknown_0_1: 8; + uint32_t stride: 15; uint32_t unknown_0_2: 1; - /* Word 1*/ - uint32_t unknown_1_1: 10; - uint32_t texture_2d: 1; - uint32_t unknown_1_2: 13; - uint32_t miplevels: 4; - uint32_t unknown_1_3: 3; - uint32_t disable_mipmap: 1; - - /* Word 2-3 */ - uint32_t unknown_2_1: 8; + /* Word 1-3 */ + uint32_t unknown_1_1: 7; + uint32_t unnorm_coords: 1; + uint32_t unknown_1_2: 1; + uint32_t texture_type: 3; + uint32_t min_lod: 8; /* Fixed point, 4.4, unsigned */ + uint32_t max_lod: 8; /* Fixed point, 4.4, unsigned */ + uint32_t lod_bias: 9; /* Fixed point, signed, 1.4.4 */ + uint32_t unknown_2_1: 3; uint32_t has_stride: 1; - uint32_t min_mipfilter: 2; /* 0x3 for linear, 0x0 for neares */ + uint32_t min_mipfilter_2: 2; /* 0x3 for linear, 0x0 for nearest */ uint32_t min_img_filter_nearest: 1; uint32_t mag_img_filter_nearest: 1; uint32_t wrap_s_clamp_to_edge: 1; @@ -75,9 +77,9 @@ union { uint32_t va[0]; struct __attribute__((__packed__)) { - uint32_t unknown_6_1: 12; + uint32_t unknown_6_1: 13; uint32_t layout: 2; - uint32_t unknown_6_2: 10; + uint32_t unknown_6_2: 9; uint32_t unknown_6_3: 6; #define VA_BIT_OFFSET 30 #define VA_BIT_SIZE 26 @@ -92,6 +94,23 @@ struct pipe_resource *prsc, unsigned first_level, unsigned last_level); void lima_update_textures(struct lima_context *ctx); -bool lima_texel_format_supported(enum pipe_format pformat); + + +static inline int16_t lima_float_to_fixed8(float f) +{ + return (int)(f * 16.0); +} + +static inline float lima_fixed8_to_float(int16_t i) +{ + float sign = 1.0; + + if (i > 0xff) { + i = 0x200 - i; + sign = -1; + } + + return sign * (float)(i / 16.0); +} #endif diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_util.c mesa-20.0.8/src/gallium/drivers/lima/lima_util.c --- mesa-19.2.8/src/gallium/drivers/lima/lima_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_util.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,9 +27,13 @@ #include +#include "util/u_debug.h" + #include "lima_util.h" +#include "lima_parser.h" FILE *lima_dump_command_stream = NULL; +int lima_dump_frame_count = 0; bool lima_get_absolute_timeout(uint64_t *timeout) { @@ -51,18 +55,86 @@ void lima_dump_blob(FILE *fp, void *data, int size, bool is_float) { + fprintf(fp, "{\n"); for (int i = 0; i * 4 < size; i++) { - if (i % 4 == 0) { - if (i) fprintf(fp, "\n"); - fprintf(fp, "%04x:", i * 4); - } + if (i % 4 == 0) + fprintf(fp, "\t"); if (is_float) - fprintf(fp, " %f", ((float *)data)[i]); + fprintf(fp, "%f, ", ((float *)data)[i]); else - fprintf(fp, " 0x%08x", ((uint32_t *)data)[i]); + fprintf(fp, "0x%08x, ", ((uint32_t *)data)[i]); + + if ((i % 4 == 3) || (i == size / 4 - 1)) { + fprintf(fp, "/* 0x%08x */", MAX2((i - 3) * 4, 0)); + if (i) fprintf(fp, "\n"); + } + } + fprintf(fp, "}\n"); +} + +void +lima_dump_vs_command_stream_print(void *data, int size, uint32_t start) +{ + if (lima_dump_command_stream) + lima_parse_vs(lima_dump_command_stream, (uint32_t *)data, size, start); +} + +void +lima_dump_plbu_command_stream_print(void *data, int size, uint32_t start) +{ + if (lima_dump_command_stream) + lima_parse_plbu(lima_dump_command_stream, (uint32_t *)data, size, start); +} + +void +lima_dump_rsw_command_stream_print(void *data, int size, uint32_t start) +{ + if (lima_dump_command_stream) + lima_parse_render_state(lima_dump_command_stream, (uint32_t *)data, size, start); +} + +void +lima_dump_texture_descriptor(void *data, int size, uint32_t start, uint32_t offset) +{ + if (lima_dump_command_stream) + lima_parse_texture_descriptor(lima_dump_command_stream, (uint32_t *)data, size, start, offset); +} + +void +lima_dump_file_open(void) +{ + if (lima_dump_command_stream) + return; + + char buffer[1024]; + const char *dump_command = debug_get_option("LIMA_DUMP_FILE", "lima.dump"); + snprintf(buffer, sizeof(buffer), "%s.%04d", dump_command, lima_dump_frame_count); + + printf("lima: dump command stream to file %s\n", buffer); + lima_dump_command_stream = fopen(buffer, "w"); + if (!lima_dump_command_stream) + fprintf(stderr, "lima: failed to open command stream log file %s\n", + buffer); +} + +void +lima_dump_file_close(void) +{ + if (lima_dump_command_stream) { + fclose(lima_dump_command_stream); + lima_dump_command_stream = NULL; + } +} + +void +lima_dump_file_next(void) +{ + if (lima_dump_command_stream) { + lima_dump_file_close(); + lima_dump_frame_count++; + lima_dump_file_open(); } - fprintf(fp, "\n"); } void diff -Nru mesa-19.2.8/src/gallium/drivers/lima/lima_util.h mesa-20.0.8/src/gallium/drivers/lima/lima_util.h --- mesa-19.2.8/src/gallium/drivers/lima/lima_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/lima_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,8 +29,17 @@ #define LIMA_PAGE_SIZE 4096 +extern FILE *lima_dump_command_stream; + bool lima_get_absolute_timeout(uint64_t *timeout); +void lima_dump_file_open(void); +void lima_dump_file_next(void); +void lima_dump_file_close(void); void lima_dump_blob(FILE *fp, void *data, int size, bool is_float); +void lima_dump_vs_command_stream_print(void *data, int size, uint32_t start); +void lima_dump_plbu_command_stream_print(void *data, int size, uint32_t start); +void lima_dump_rsw_command_stream_print(void *data, int size, uint32_t start); +void lima_dump_texture_descriptor(void *data, int size, uint32_t start, uint32_t offset); void lima_dump_command_stream_print(void *data, int size, bool is_float, const char *fmt, ...); diff -Nru mesa-19.2.8/src/gallium/drivers/lima/meson.build mesa-20.0.8/src/gallium/drivers/lima/meson.build --- mesa-19.2.8/src/gallium/drivers/lima/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -39,12 +39,14 @@ 'ir/pp/scheduler.c', 'ir/pp/instr.c', 'ir/pp/regalloc.c', + 'ir/pp/liveness.c', 'ir/pp/codegen.h', 'ir/pp/codegen.c', 'ir/pp/node_to_instr.c', 'ir/pp/disasm.c', 'ir/lima_nir_lower_uniform_to_scalar.c', + 'ir/lima_nir_split_load_input.c', 'ir/lima_ir.h', @@ -62,12 +64,16 @@ 'lima_bo.h', 'lima_submit.c', 'lima_submit.h', + 'lima_parser.c', + 'lima_parser.h', 'lima_util.c', 'lima_util.h', 'lima_texture.c', 'lima_texture.h', 'lima_fence.c', 'lima_fence.h', + 'lima_format.h', + 'lima_format.c', ) lima_nir_algebraic_c = custom_target( @@ -122,3 +128,21 @@ install : with_tools.contains('lima'), ) +lima_disasm = executable( + 'lima_disasm', + files( + 'standalone/lima_disasm.c', + ), + include_directories : [ + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_gallium_drivers, inc_mesa, inc_mapi, inc_compiler, + ], + dependencies : [ + idep_mesautil, + ], + link_with : [ + liblima, + libpanfrost_shared, + ], + build_by_default : with_tools.contains('lima'), + install : with_tools.contains('lima'), +) diff -Nru mesa-19.2.8/src/gallium/drivers/lima/standalone/lima_compiler_cmdline.c mesa-20.0.8/src/gallium/drivers/lima/standalone/lima_compiler_cmdline.c --- mesa-19.2.8/src/gallium/drivers/lima/standalone/lima_compiler_cmdline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/standalone/lima_compiler_cmdline.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,8 +33,8 @@ #include "compiler/glsl/gl_nir.h" #include "compiler/nir_types.h" -#include "lima_program.h" #include "lima_context.h" +#include "lima_program.h" #include "ir/lima_ir.h" #include "standalone/glsl.h" @@ -135,7 +135,7 @@ NIR_PASS_V(nir, nir_lower_var_copies); nir_print_shader(nir, stdout); NIR_PASS_V(nir, gl_nir_lower_atomics, prog, true); - NIR_PASS_V(nir, nir_lower_atomics_to_ssbo, 8); + NIR_PASS_V(nir, nir_lower_atomics_to_ssbo); nir_print_shader(nir, stdout); switch (stage) { diff -Nru mesa-19.2.8/src/gallium/drivers/lima/standalone/lima_disasm.c mesa-20.0.8/src/gallium/drivers/lima/standalone/lima_disasm.c --- mesa-19.2.8/src/gallium/drivers/lima/standalone/lima_disasm.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/lima/standalone/lima_disasm.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2019 Vasily Khoruzhick + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#include "util/ralloc.h" + +#include +#include +#include +#include + +#include "ir/pp/codegen.h" +#include "ir/gp/codegen.h" + +static void +print_usage(void) +{ + printf("Usage: lima_disasm [OPTIONS]... FILE\n"); + printf(" --help - show this message\n"); +} + +typedef struct __attribute__((__packed__)) { + char name[4]; + uint32_t size; +} mbs_chunk; + +/* Parses an MBS1 file. MBS1 is used for Mali-400 and earlier which only support + * GLES2, as opposed to MBS2 which is used by later Mali gens, and contains + * the entire inferface between the compiler and the (blob) driver. It's + * produced by the offline compiler as well as glGetProgramBinary(). The + * format is documented at + * https://web.archive.org/web/20171026141029/http://limadriver.org/MBS+File+Format/ + * and consists of a bunch of nested "chunks" where each chunk has a + * 4-character tag followed by a 32-bit size, then the contents of the chunk. + * The chunks are nested as follows: + * + * - MBS1 + * - optional CFRA (fragment shader) + * - core version (uint32_t, Mali-200 vs Mali-400) + * - FSTA (Fragment STAck information) + * - FDIS (if Fragment shader contains a DIScard instruction) + * - FBUU (information on color/depth reads/writes) + * - SUNI (uniform symbol table) + * - SVAR (varying symbol table) + * - DBIN (the actual code) + * - optional CVER (vertex shader) + * - core version (uint32_t, GP2 vs Mali-400) + * - FINS (# of instruction and attrib_prefetch) + * - SUNI (uniform table) + * - SATT (attribute table) + * - SVAR (varying table) + * - DBIN (the actual code) + * + * This routine just finds the DBIN chunk and returns the binary assuming + * there's only the fragment or vertex shader. We don't bother to parse the + * other stuff yet. + */ +static uint32_t * +extract_shader_binary(char *filename, uint32_t *size, bool *is_frag) +{ + mbs_chunk chunk; + + if (!filename || !size || !is_frag) + return NULL; + + FILE *in = fopen(filename, "rb"); + if (!in) + return NULL; + + if (!fread(&chunk, sizeof(chunk), 1, in)) { + printf("Failed to read MBS1 segment\n"); + return NULL; + } + + if (strncmp(chunk.name, "MBS1", 4)) { + printf("File is not MBS\n"); + return NULL; + } + + if (!fread(&chunk, sizeof(chunk), 1, in)) { + printf("Failed to read shader segment\n"); + return NULL; + } + + if (!strncmp(chunk.name, "CFRA", 4)) { + *is_frag = true; + } else if (!strncmp(chunk.name, "CVER", 4)) { + *is_frag = false; + } else { + printf("Unsupported shader type\n"); + return NULL; + } + + /* Skip version */ + fseek(in, 4, SEEK_CUR); + + /* Skip the other chunks and find the DBIN chunk. */ + do { + if (!fread(&chunk, sizeof(chunk), 1, in)) { + printf("Failed to read segment\n"); + return NULL; + } + if (!strncmp(chunk.name, "DBIN", 4)) + break; + fseek(in, chunk.size, SEEK_CUR); + } while (!feof(in)); + + if (feof(in)) { + printf("CBIN segment not found!\n"); + return NULL; + } + + *size = chunk.size; + + uint32_t *bin = ralloc_size(NULL, chunk.size); + if (!bin) { + printf("Failed to allocate shader binary\n"); + return NULL; + } + + if (!fread(bin, chunk.size, 1, in)) { + printf("Failed to read shader binary\n"); + ralloc_free(bin); + bin = NULL; + } + + return bin; +} + +int +main(int argc, char **argv) +{ + int n; + bool is_frag = true; + + if (argc < 2) { + print_usage(); + return 1; + } + + for (n = 1; n < argc; n++) { + if (!strcmp(argv[n], "--help")) { + print_usage(); + return 1; + } + + break; + } + + char *filename = NULL; + filename = argv[n]; + + uint32_t size = 0; + uint32_t *prog = extract_shader_binary(filename, &size, &is_frag); + if (!prog) { + printf("Failed to parse mbs!\n"); + return -1; + } + + if (is_frag) { + assert((size & 0x3) == 0); + size >>= 2; + uint32_t *bin = prog; + uint32_t offset = 0; + do { + ppir_codegen_ctrl *ctrl = (ppir_codegen_ctrl *)bin; + printf("@%6d: ", offset); + ppir_disassemble_instr(bin, offset); + bin += ctrl->count; + offset += ctrl->count; + size -= ctrl->count; + } while (size); + } else { + gpir_disassemble_program((gpir_codegen_instr *)prog, size / (sizeof(gpir_codegen_instr))); + } + + ralloc_free(prog); + + return 0; +} + diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_alpha.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_alpha.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_alpha.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_alpha.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ */ #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_const.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,7 @@ #include "pipe/p_state.h" #include "util/u_debug.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_const.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_depth.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_depth.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_bld_depth.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_bld_depth.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,7 +50,7 @@ */ #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_cpu_detect.h" #include "gallivm/lp_bld_type.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_context.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_context.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -59,6 +59,9 @@ lp_print_counters(); + if (llvmpipe->csctx) { + lp_csctx_destroy(llvmpipe->csctx); + } if (llvmpipe->blitter) { util_blitter_destroy(llvmpipe->blitter); } @@ -149,6 +152,7 @@ make_empty_list(&llvmpipe->setup_variants_list); + make_empty_list(&llvmpipe->cs_variants_list); llvmpipe->pipe.screen = screen; llvmpipe->pipe.priv = priv; @@ -164,6 +168,7 @@ llvmpipe_init_blend_funcs(llvmpipe); llvmpipe_init_clip_funcs(llvmpipe); llvmpipe_init_draw_funcs(llvmpipe); + llvmpipe_init_compute_funcs(llvmpipe); llvmpipe_init_sampler_funcs(llvmpipe); llvmpipe_init_query_funcs( llvmpipe ); llvmpipe_init_vertex_funcs(llvmpipe); @@ -199,6 +204,9 @@ if (!llvmpipe->setup) goto fail; + llvmpipe->csctx = lp_csctx_create( &llvmpipe->pipe ); + if (!llvmpipe->csctx) + goto fail; llvmpipe->pipe.stream_uploader = u_upload_create_default(&llvmpipe->pipe); if (!llvmpipe->pipe.stream_uploader) goto fail; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_context.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_context.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -40,6 +40,7 @@ #include "lp_jit.h" #include "lp_setup.h" #include "lp_state_fs.h" +#include "lp_state_cs.h" #include "lp_state_setup.h" @@ -48,6 +49,7 @@ struct draw_stage; struct draw_vertex_shader; struct lp_fragment_shader; +struct lp_compute_shader; struct lp_blend_state; struct lp_setup_context; struct lp_setup_variant; @@ -65,6 +67,7 @@ struct lp_fragment_shader *fs; struct draw_vertex_shader *vs; const struct lp_geometry_shader *gs; + struct lp_compute_shader *cs; const struct lp_velems_state *velems; const struct lp_so_state *so; @@ -83,9 +86,11 @@ struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_shader_buffer ssbos[PIPE_SHADER_TYPES][LP_MAX_TGSI_SHADER_BUFFERS]; + struct pipe_image_view images[PIPE_SHADER_TYPES][LP_MAX_TGSI_SHADER_IMAGES]; unsigned num_samplers[PIPE_SHADER_TYPES]; unsigned num_sampler_views[PIPE_SHADER_TYPES]; + unsigned num_images[PIPE_SHADER_TYPES]; unsigned num_vertex_buffers; @@ -98,8 +103,12 @@ unsigned active_occlusion_queries; - unsigned dirty; /**< Mask of LP_NEW_x flags */ + unsigned active_primgen_queries; + + bool queries_disabled; + unsigned dirty; /**< Mask of LP_NEW_x flags */ + unsigned cs_dirty; /**< Mask of LP_CSNEW_x flags */ /** Mapped vertex buffers */ ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS]; @@ -147,6 +156,12 @@ struct lp_setup_variant_list_item setup_variants_list; unsigned nr_setup_variants; + /** List of all compute shader variants */ + struct lp_cs_variant_list_item cs_variants_list; + unsigned nr_cs_variants; + unsigned nr_cs_instrs; + struct lp_cs_context *csctx; + /** Conditional query object and mode */ struct pipe_query *render_cond_query; enum pipe_render_cond_flag render_cond_mode; @@ -154,6 +169,10 @@ /** The LLVMContext to use for LLVM related work */ LLVMContextRef context; + + int max_global_buffers; + struct pipe_resource **global_buffers; + }; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,162 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +/** + * compute shader thread pool. + * based on threadpool.c but modified heavily to be compute shader tuned. + */ + +#include "util/u_thread.h" +#include "util/u_memory.h" +#include "lp_cs_tpool.h" + +static int +lp_cs_tpool_worker(void *data) +{ + struct lp_cs_tpool *pool = data; + struct lp_cs_local_mem lmem; + + memset(&lmem, 0, sizeof(lmem)); + mtx_lock(&pool->m); + + while (!pool->shutdown) { + struct lp_cs_tpool_task *task; + + while (list_is_empty(&pool->workqueue) && !pool->shutdown) + cnd_wait(&pool->new_work, &pool->m); + + if (pool->shutdown) + break; + + task = list_first_entry(&pool->workqueue, struct lp_cs_tpool_task, + list); + unsigned this_iter = task->iter_start++; + + if (task->iter_start == task->iter_total) + list_del(&task->list); + + mtx_unlock(&pool->m); + task->work(task->data, this_iter, &lmem); + mtx_lock(&pool->m); + task->iter_finished++; + if (task->iter_finished == task->iter_total) + cnd_broadcast(&task->finish); + } + mtx_unlock(&pool->m); + FREE(lmem.local_mem_ptr); + return 0; +} + +struct lp_cs_tpool * +lp_cs_tpool_create(unsigned num_threads) +{ + struct lp_cs_tpool *pool = CALLOC_STRUCT(lp_cs_tpool); + + if (!pool) + return NULL; + + (void) mtx_init(&pool->m, mtx_plain); + cnd_init(&pool->new_work); + + list_inithead(&pool->workqueue); + assert (num_threads <= LP_MAX_THREADS); + pool->num_threads = num_threads; + for (unsigned i = 0; i < num_threads; i++) + pool->threads[i] = u_thread_create(lp_cs_tpool_worker, pool); + return pool; +} + +void +lp_cs_tpool_destroy(struct lp_cs_tpool *pool) +{ + if (!pool) + return; + + mtx_lock(&pool->m); + pool->shutdown = true; + cnd_broadcast(&pool->new_work); + mtx_unlock(&pool->m); + + for (unsigned i = 0; i < pool->num_threads; i++) { + thrd_join(pool->threads[i], NULL); + } + + cnd_destroy(&pool->new_work); + mtx_destroy(&pool->m); + FREE(pool); +} + +struct lp_cs_tpool_task * +lp_cs_tpool_queue_task(struct lp_cs_tpool *pool, + lp_cs_tpool_task_func work, void *data, int num_iters) +{ + struct lp_cs_tpool_task *task; + + if (pool->num_threads == 0) { + struct lp_cs_local_mem lmem; + + memset(&lmem, 0, sizeof(lmem)); + for (unsigned t = 0; t < num_iters; t++) { + work(data, t, &lmem); + } + return NULL; + } + task = CALLOC_STRUCT(lp_cs_tpool_task); + if (!task) { + return NULL; + } + + task->work = work; + task->data = data; + task->iter_total = num_iters; + cnd_init(&task->finish); + + mtx_lock(&pool->m); + + list_addtail(&task->list, &pool->workqueue); + + cnd_broadcast(&pool->new_work); + mtx_unlock(&pool->m); + return task; +} + +void +lp_cs_tpool_wait_for_task(struct lp_cs_tpool *pool, + struct lp_cs_tpool_task **task_handle) +{ + struct lp_cs_tpool_task *task = *task_handle; + + if (!pool || !task) + return; + + mtx_lock(&pool->m); + while (task->iter_finished < task->iter_total) + cnd_wait(&task->finish, &pool->m); + mtx_unlock(&pool->m); + + cnd_destroy(&task->finish); + FREE(task); + *task_handle = NULL; +} diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_cs_tpool.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,81 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +/* This is a compute shader specific thread pool. + * It allows the queuing of a number of tasks per work item. + * The item is added to the work queue once, but it must execute + * number of iterations times. This saves storing a bunch of queue + * structs with just unique indexes in them. + * It also supports a local memory support struct to be passed from + * outside the thread exec function. + */ +#ifndef LP_CS_QUEUE +#define LP_CS_QUEUE + +#include "pipe/p_compiler.h" + +#include "util/u_thread.h" +#include "util/list.h" + +#include "lp_limits.h" + +struct lp_cs_tpool { + mtx_t m; + cnd_t new_work; + + thrd_t threads[LP_MAX_THREADS]; + unsigned num_threads; + struct list_head workqueue; + bool shutdown; +}; + +struct lp_cs_local_mem { + unsigned local_size; + void *local_mem_ptr; +}; + +typedef void (*lp_cs_tpool_task_func)(void *data, int iter_idx, struct lp_cs_local_mem *lmem); + +struct lp_cs_tpool_task { + lp_cs_tpool_task_func work; + void *data; + struct list_head list; + cnd_t finish; + unsigned iter_total; + unsigned iter_start; + unsigned iter_finished; +}; + +struct lp_cs_tpool *lp_cs_tpool_create(unsigned num_threads); +void lp_cs_tpool_destroy(struct lp_cs_tpool *); + +struct lp_cs_tpool_task *lp_cs_tpool_queue_task(struct lp_cs_tpool *, + lp_cs_tpool_task_func func, + void *data, int num_iters); + +void lp_cs_tpool_wait_for_task(struct lp_cs_tpool *pool, + struct lp_cs_tpool_task **task); + +#endif /* LP_BIN_QUEUE */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_debug.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_debug.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_debug.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,10 +32,6 @@ #include "pipe/p_compiler.h" #include "util/u_debug.h" -extern void -st_print_current(void); - - #define DEBUG_PIPE 0x1 #define DEBUG_TGSI 0x2 #define DEBUG_TEX 0x4 @@ -48,6 +44,9 @@ #define DEBUG_FENCE 0x2000 #define DEBUG_MEM 0x4000 #define DEBUG_FS 0x8000 +#define DEBUG_CS 0x10000 +#define DEBUG_TGSI_IR 0x20000 +#define DEBUG_CL 0x40000 /* Performance flags. These are active even on release builds. */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_draw_arrays.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_draw_arrays.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_draw_arrays.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_draw_arrays.c 2020-06-12 01:21:17.000000000 +0000 @@ -105,6 +105,13 @@ llvmpipe_prepare_geometry_sampling(lp, lp->num_sampler_views[PIPE_SHADER_GEOMETRY], lp->sampler_views[PIPE_SHADER_GEOMETRY]); + + llvmpipe_prepare_vertex_images(lp, + lp->num_images[PIPE_SHADER_VERTEX], + lp->images[PIPE_SHADER_VERTEX]); + llvmpipe_prepare_geometry_images(lp, + lp->num_images[PIPE_SHADER_GEOMETRY], + lp->images[PIPE_SHADER_GEOMETRY]); if (lp->gs && lp->gs->no_tokens) { /* we have an empty geometry shader with stream output, so attach the stream output info to the current vertex shader */ @@ -115,6 +122,10 @@ draw_collect_pipeline_statistics(draw, lp->active_statistics_queries > 0); + draw_collect_primitives_generated(draw, + lp->active_primgen_queries && + !lp->queries_disabled); + /* draw! */ draw_vbo(draw, info); diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_jit.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_jit.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_jit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_jit.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,7 @@ * @author Jose Fonseca */ +#include #include "util/u_memory.h" #include "gallivm/lp_bld_init.h" @@ -40,13 +41,134 @@ #include "lp_context.h" #include "lp_jit.h" +static LLVMTypeRef +create_jit_texture_type(struct gallivm_state *gallivm) +{ + LLVMContextRef lc = gallivm->context; + LLVMTypeRef texture_type; + LLVMTypeRef elem_types[LP_JIT_TEXTURE_NUM_FIELDS]; + + /* struct lp_jit_texture */ + elem_types[LP_JIT_TEXTURE_WIDTH] = + elem_types[LP_JIT_TEXTURE_HEIGHT] = + elem_types[LP_JIT_TEXTURE_DEPTH] = + elem_types[LP_JIT_TEXTURE_FIRST_LEVEL] = + elem_types[LP_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32TypeInContext(lc); + elem_types[LP_JIT_TEXTURE_BASE] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_TEXTURE_ROW_STRIDE] = + elem_types[LP_JIT_TEXTURE_IMG_STRIDE] = + elem_types[LP_JIT_TEXTURE_MIP_OFFSETS] = + LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TEXTURE_LEVELS); + + texture_type = LLVMStructTypeInContext(lc, elem_types, + ARRAY_SIZE(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width, + gallivm->target, texture_type, + LP_JIT_TEXTURE_WIDTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height, + gallivm->target, texture_type, + LP_JIT_TEXTURE_HEIGHT); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, depth, + gallivm->target, texture_type, + LP_JIT_TEXTURE_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, base, + gallivm->target, texture_type, + LP_JIT_TEXTURE_BASE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, row_stride, + gallivm->target, texture_type, + LP_JIT_TEXTURE_ROW_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, img_stride, + gallivm->target, texture_type, + LP_JIT_TEXTURE_IMG_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, first_level, + gallivm->target, texture_type, + LP_JIT_TEXTURE_FIRST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, last_level, + gallivm->target, texture_type, + LP_JIT_TEXTURE_LAST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, mip_offsets, + gallivm->target, texture_type, + LP_JIT_TEXTURE_MIP_OFFSETS); + LP_CHECK_STRUCT_SIZE(struct lp_jit_texture, + gallivm->target, texture_type); + return texture_type; +} + +static LLVMTypeRef +create_jit_sampler_type(struct gallivm_state *gallivm) +{ + LLVMContextRef lc = gallivm->context; + LLVMTypeRef sampler_type; + LLVMTypeRef elem_types[LP_JIT_SAMPLER_NUM_FIELDS]; + elem_types[LP_JIT_SAMPLER_MIN_LOD] = + elem_types[LP_JIT_SAMPLER_MAX_LOD] = + elem_types[LP_JIT_SAMPLER_LOD_BIAS] = LLVMFloatTypeInContext(lc); + elem_types[LP_JIT_SAMPLER_BORDER_COLOR] = + LLVMArrayType(LLVMFloatTypeInContext(lc), 4); + + sampler_type = LLVMStructTypeInContext(lc, elem_types, + ARRAY_SIZE(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, min_lod, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_MIN_LOD); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, max_lod, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_MAX_LOD); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, lod_bias, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_LOD_BIAS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, border_color, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_BORDER_COLOR); + LP_CHECK_STRUCT_SIZE(struct lp_jit_sampler, + gallivm->target, sampler_type); + return sampler_type; +} + +static LLVMTypeRef +create_jit_image_type(struct gallivm_state *gallivm) +{ + LLVMContextRef lc = gallivm->context; + LLVMTypeRef image_type; + LLVMTypeRef elem_types[LP_JIT_IMAGE_NUM_FIELDS]; + elem_types[LP_JIT_IMAGE_WIDTH] = + elem_types[LP_JIT_IMAGE_HEIGHT] = + elem_types[LP_JIT_IMAGE_DEPTH] = LLVMInt32TypeInContext(lc); + elem_types[LP_JIT_IMAGE_BASE] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_IMAGE_ROW_STRIDE] = + elem_types[LP_JIT_IMAGE_IMG_STRIDE] = LLVMInt32TypeInContext(lc); + + image_type = LLVMStructTypeInContext(lc, elem_types, + ARRAY_SIZE(elem_types), 0); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, width, + gallivm->target, image_type, + LP_JIT_IMAGE_WIDTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, height, + gallivm->target, image_type, + LP_JIT_IMAGE_HEIGHT); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, depth, + gallivm->target, image_type, + LP_JIT_IMAGE_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, base, + gallivm->target, image_type, + LP_JIT_IMAGE_BASE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, row_stride, + gallivm->target, image_type, + LP_JIT_IMAGE_ROW_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_image, img_stride, + gallivm->target, image_type, + LP_JIT_IMAGE_IMG_STRIDE); + return image_type; +} static void lp_jit_create_types(struct lp_fragment_shader_variant *lp) { struct gallivm_state *gallivm = lp->gallivm; LLVMContextRef lc = gallivm->context; - LLVMTypeRef viewport_type, texture_type, sampler_type; + LLVMTypeRef viewport_type, texture_type, sampler_type, image_type; /* struct lp_jit_viewport */ { @@ -68,82 +190,9 @@ gallivm->target, viewport_type); } - /* struct lp_jit_texture */ - { - LLVMTypeRef elem_types[LP_JIT_TEXTURE_NUM_FIELDS]; - - elem_types[LP_JIT_TEXTURE_WIDTH] = - elem_types[LP_JIT_TEXTURE_HEIGHT] = - elem_types[LP_JIT_TEXTURE_DEPTH] = - elem_types[LP_JIT_TEXTURE_FIRST_LEVEL] = - elem_types[LP_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32TypeInContext(lc); - elem_types[LP_JIT_TEXTURE_BASE] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); - elem_types[LP_JIT_TEXTURE_ROW_STRIDE] = - elem_types[LP_JIT_TEXTURE_IMG_STRIDE] = - elem_types[LP_JIT_TEXTURE_MIP_OFFSETS] = - LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TEXTURE_LEVELS); - - texture_type = LLVMStructTypeInContext(lc, elem_types, - ARRAY_SIZE(elem_types), 0); - - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width, - gallivm->target, texture_type, - LP_JIT_TEXTURE_WIDTH); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height, - gallivm->target, texture_type, - LP_JIT_TEXTURE_HEIGHT); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, depth, - gallivm->target, texture_type, - LP_JIT_TEXTURE_DEPTH); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, first_level, - gallivm->target, texture_type, - LP_JIT_TEXTURE_FIRST_LEVEL); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, last_level, - gallivm->target, texture_type, - LP_JIT_TEXTURE_LAST_LEVEL); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, base, - gallivm->target, texture_type, - LP_JIT_TEXTURE_BASE); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, row_stride, - gallivm->target, texture_type, - LP_JIT_TEXTURE_ROW_STRIDE); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, img_stride, - gallivm->target, texture_type, - LP_JIT_TEXTURE_IMG_STRIDE); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, mip_offsets, - gallivm->target, texture_type, - LP_JIT_TEXTURE_MIP_OFFSETS); - LP_CHECK_STRUCT_SIZE(struct lp_jit_texture, - gallivm->target, texture_type); - } - - /* struct lp_jit_sampler */ - { - LLVMTypeRef elem_types[LP_JIT_SAMPLER_NUM_FIELDS]; - elem_types[LP_JIT_SAMPLER_MIN_LOD] = - elem_types[LP_JIT_SAMPLER_MAX_LOD] = - elem_types[LP_JIT_SAMPLER_LOD_BIAS] = LLVMFloatTypeInContext(lc); - elem_types[LP_JIT_SAMPLER_BORDER_COLOR] = - LLVMArrayType(LLVMFloatTypeInContext(lc), 4); - - sampler_type = LLVMStructTypeInContext(lc, elem_types, - ARRAY_SIZE(elem_types), 0); - - LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, min_lod, - gallivm->target, sampler_type, - LP_JIT_SAMPLER_MIN_LOD); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, max_lod, - gallivm->target, sampler_type, - LP_JIT_SAMPLER_MAX_LOD); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, lod_bias, - gallivm->target, sampler_type, - LP_JIT_SAMPLER_LOD_BIAS); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, border_color, - gallivm->target, sampler_type, - LP_JIT_SAMPLER_BORDER_COLOR); - LP_CHECK_STRUCT_SIZE(struct lp_jit_sampler, - gallivm->target, sampler_type); - } + texture_type = create_jit_texture_type(gallivm); + sampler_type = create_jit_sampler_type(gallivm); + image_type = create_jit_image_type(gallivm); /* struct lp_jit_context */ { @@ -154,16 +203,18 @@ LLVMArrayType(LLVMPointerType(LLVMFloatTypeInContext(lc), 0), LP_MAX_TGSI_CONST_BUFFERS); elem_types[LP_JIT_CTX_NUM_CONSTANTS] = LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TGSI_CONST_BUFFERS); + elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, + PIPE_MAX_SHADER_SAMPLER_VIEWS); + elem_types[LP_JIT_CTX_SAMPLERS] = LLVMArrayType(sampler_type, + PIPE_MAX_SAMPLERS); + elem_types[LP_JIT_CTX_IMAGES] = LLVMArrayType(image_type, + PIPE_MAX_SHADER_IMAGES); elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatTypeInContext(lc); elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32TypeInContext(lc); elem_types[LP_JIT_CTX_U8_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); elem_types[LP_JIT_CTX_F_BLEND_COLOR] = LLVMPointerType(LLVMFloatTypeInContext(lc), 0); elem_types[LP_JIT_CTX_VIEWPORTS] = LLVMPointerType(viewport_type, 0); - elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, - PIPE_MAX_SHADER_SAMPLER_VIEWS); - elem_types[LP_JIT_CTX_SAMPLERS] = LLVMArrayType(sampler_type, - PIPE_MAX_SAMPLERS); elem_types[LP_JIT_CTX_SSBOS] = LLVMArrayType(LLVMPointerType(LLVMInt32TypeInContext(lc), 0), LP_MAX_TGSI_SHADER_BUFFERS); elem_types[LP_JIT_CTX_NUM_SSBOS] = @@ -177,6 +228,15 @@ LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, num_constants, gallivm->target, context_type, LP_JIT_CTX_NUM_CONSTANTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures, + gallivm->target, context_type, + LP_JIT_CTX_TEXTURES); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers, + gallivm->target, context_type, + LP_JIT_CTX_SAMPLERS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, images, + gallivm->target, context_type, + LP_JIT_CTX_IMAGES); LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value, gallivm->target, context_type, LP_JIT_CTX_ALPHA_REF); @@ -195,12 +255,6 @@ LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, viewports, gallivm->target, context_type, LP_JIT_CTX_VIEWPORTS); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures, - gallivm->target, context_type, - LP_JIT_CTX_TEXTURES); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers, - gallivm->target, context_type, - LP_JIT_CTX_SAMPLERS); LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, ssbos, gallivm->target, context_type, LP_JIT_CTX_SSBOS); @@ -232,13 +286,9 @@ } if (gallivm_debug & GALLIVM_DEBUG_IR) { -#if HAVE_LLVM >= 0x304 char *str = LLVMPrintModuleToString(gallivm->module); fprintf(stderr, "%s", str); LLVMDisposeMessage(str); -#else - LLVMDumpModule(gallivm->module); -#endif } } @@ -263,3 +313,103 @@ if (!lp->jit_context_ptr_type) lp_jit_create_types(lp); } + +static void +lp_jit_create_cs_types(struct lp_compute_shader_variant *lp) +{ + struct gallivm_state *gallivm = lp->gallivm; + LLVMContextRef lc = gallivm->context; + LLVMTypeRef texture_type, sampler_type, image_type; + + texture_type = create_jit_texture_type(gallivm); + sampler_type = create_jit_sampler_type(gallivm); + image_type = create_jit_image_type(gallivm); + + /* struct lp_jit_cs_thread_data */ + { + LLVMTypeRef elem_types[LP_JIT_CS_THREAD_DATA_COUNT]; + LLVMTypeRef thread_data_type; + + elem_types[LP_JIT_CS_THREAD_DATA_CACHE] = + LLVMPointerType(lp_build_format_cache_type(gallivm), 0); + + elem_types[LP_JIT_CS_THREAD_DATA_SHARED] = LLVMPointerType(LLVMInt32TypeInContext(lc), 0); + thread_data_type = LLVMStructTypeInContext(lc, elem_types, + ARRAY_SIZE(elem_types), 0); + + lp->jit_cs_thread_data_ptr_type = LLVMPointerType(thread_data_type, 0); + } + + /* struct lp_jit_cs_context */ + { + LLVMTypeRef elem_types[LP_JIT_CS_CTX_COUNT]; + LLVMTypeRef cs_context_type; + + elem_types[LP_JIT_CS_CTX_CONSTANTS] = + LLVMArrayType(LLVMPointerType(LLVMFloatTypeInContext(lc), 0), LP_MAX_TGSI_CONST_BUFFERS); + elem_types[LP_JIT_CS_CTX_NUM_CONSTANTS] = + LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TGSI_CONST_BUFFERS); + elem_types[LP_JIT_CS_CTX_TEXTURES] = LLVMArrayType(texture_type, + PIPE_MAX_SHADER_SAMPLER_VIEWS); + elem_types[LP_JIT_CS_CTX_SAMPLERS] = LLVMArrayType(sampler_type, + PIPE_MAX_SAMPLERS); + elem_types[LP_JIT_CS_CTX_IMAGES] = LLVMArrayType(image_type, + PIPE_MAX_SHADER_IMAGES); + elem_types[LP_JIT_CS_CTX_SSBOS] = + LLVMArrayType(LLVMPointerType(LLVMInt32TypeInContext(lc), 0), LP_MAX_TGSI_SHADER_BUFFERS); + elem_types[LP_JIT_CS_CTX_NUM_SSBOS] = + LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TGSI_SHADER_BUFFERS); + + elem_types[LP_JIT_CS_CTX_SHARED_SIZE] = LLVMInt32TypeInContext(lc); + + elem_types[LP_JIT_CS_CTX_KERNEL_ARGS] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + + cs_context_type = LLVMStructTypeInContext(lc, elem_types, + ARRAY_SIZE(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, constants, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_CONSTANTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, num_constants, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_NUM_CONSTANTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, textures, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_TEXTURES); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, samplers, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_SAMPLERS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, images, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_IMAGES); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, ssbos, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_SSBOS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, num_ssbos, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_NUM_SSBOS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, shared_size, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_SHARED_SIZE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_cs_context, kernel_args, + gallivm->target, cs_context_type, + LP_JIT_CS_CTX_KERNEL_ARGS); + LP_CHECK_STRUCT_SIZE(struct lp_jit_cs_context, + gallivm->target, cs_context_type); + + lp->jit_cs_context_ptr_type = LLVMPointerType(cs_context_type, 0); + } + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + char *str = LLVMPrintModuleToString(gallivm->module); + fprintf(stderr, "%s", str); + LLVMDisposeMessage(str); + } +} + +void +lp_jit_init_cs_types(struct lp_compute_shader_variant *lp) +{ + if (!lp->jit_cs_context_ptr_type) + lp_jit_create_cs_types(lp); +} diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_jit.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_jit.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_jit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_jit.h 2020-06-12 01:21:17.000000000 +0000 @@ -45,6 +45,7 @@ struct lp_build_format_cache; struct lp_fragment_shader_variant; +struct lp_compute_shader_variant; struct llvmpipe_screen; @@ -53,11 +54,11 @@ uint32_t width; /* same as number of elements */ uint32_t height; uint32_t depth; /* doubles as array size */ - uint32_t first_level; - uint32_t last_level; const void *base; uint32_t row_stride[LP_MAX_TEXTURE_LEVELS]; uint32_t img_stride[LP_MAX_TEXTURE_LEVELS]; + uint32_t first_level; + uint32_t last_level; uint32_t mip_offsets[LP_MAX_TEXTURE_LEVELS]; }; @@ -78,15 +79,25 @@ }; +struct lp_jit_image +{ + uint32_t width; /* same as number of elements */ + uint32_t height; + uint32_t depth; + const void *base; + uint32_t row_stride; + uint32_t img_stride; +}; + enum { LP_JIT_TEXTURE_WIDTH = 0, LP_JIT_TEXTURE_HEIGHT, LP_JIT_TEXTURE_DEPTH, - LP_JIT_TEXTURE_FIRST_LEVEL, - LP_JIT_TEXTURE_LAST_LEVEL, LP_JIT_TEXTURE_BASE, LP_JIT_TEXTURE_ROW_STRIDE, LP_JIT_TEXTURE_IMG_STRIDE, + LP_JIT_TEXTURE_FIRST_LEVEL, + LP_JIT_TEXTURE_LAST_LEVEL, LP_JIT_TEXTURE_MIP_OFFSETS, LP_JIT_TEXTURE_NUM_FIELDS /* number of fields above */ }; @@ -107,7 +118,15 @@ LP_JIT_VIEWPORT_NUM_FIELDS /* number of fields above */ }; - +enum { + LP_JIT_IMAGE_WIDTH = 0, + LP_JIT_IMAGE_HEIGHT, + LP_JIT_IMAGE_DEPTH, + LP_JIT_IMAGE_BASE, + LP_JIT_IMAGE_ROW_STRIDE, + LP_JIT_IMAGE_IMG_STRIDE, + LP_JIT_IMAGE_NUM_FIELDS /* number of fields above */ +}; /** * This structure is passed directly to the generated fragment shader. * @@ -124,6 +143,10 @@ const float *constants[LP_MAX_TGSI_CONST_BUFFERS]; int num_constants[LP_MAX_TGSI_CONST_BUFFERS]; + struct lp_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct lp_jit_sampler samplers[PIPE_MAX_SAMPLERS]; + struct lp_jit_image images[PIPE_MAX_SHADER_IMAGES]; + float alpha_ref_value; uint32_t stencil_ref_front, stencil_ref_back; @@ -133,9 +156,6 @@ struct lp_jit_viewport *viewports; - struct lp_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - struct lp_jit_sampler samplers[PIPE_MAX_SAMPLERS]; - const uint32_t *ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; int num_ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; }; @@ -148,14 +168,15 @@ enum { LP_JIT_CTX_CONSTANTS = 0, LP_JIT_CTX_NUM_CONSTANTS, + LP_JIT_CTX_TEXTURES, + LP_JIT_CTX_SAMPLERS, + LP_JIT_CTX_IMAGES, LP_JIT_CTX_ALPHA_REF, LP_JIT_CTX_STENCIL_REF_FRONT, LP_JIT_CTX_STENCIL_REF_BACK, LP_JIT_CTX_U8_BLEND_COLOR, LP_JIT_CTX_F_BLEND_COLOR, LP_JIT_CTX_VIEWPORTS, - LP_JIT_CTX_TEXTURES, - LP_JIT_CTX_SAMPLERS, LP_JIT_CTX_SSBOS, LP_JIT_CTX_NUM_SSBOS, LP_JIT_CTX_COUNT @@ -168,6 +189,15 @@ #define lp_jit_context_num_constants(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_NUM_CONSTANTS, "num_constants") +#define lp_jit_context_textures(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures") + +#define lp_jit_context_samplers(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_SAMPLERS, "samplers") + +#define lp_jit_context_images(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_IMAGES, "images") + #define lp_jit_context_alpha_ref_value(_gallivm, _ptr) \ lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_ALPHA_REF, "alpha_ref_value") @@ -186,12 +216,6 @@ #define lp_jit_context_viewports(_gallivm, _ptr) \ lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_VIEWPORTS, "viewports") -#define lp_jit_context_textures(_gallivm, _ptr) \ - lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures") - -#define lp_jit_context_samplers(_gallivm, _ptr) \ - lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_SAMPLERS, "samplers") - #define lp_jit_context_ssbos(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_SSBOS, "ssbos") @@ -269,6 +293,101 @@ unsigned depth_stride); +struct lp_jit_cs_thread_data +{ + struct lp_build_format_cache *cache; + void *shared; +}; + +enum { + LP_JIT_CS_THREAD_DATA_CACHE = 0, + LP_JIT_CS_THREAD_DATA_SHARED = 1, + LP_JIT_CS_THREAD_DATA_COUNT +}; + + +#define lp_jit_cs_thread_data_cache(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CS_THREAD_DATA_CACHE, "cache") + +#define lp_jit_cs_thread_data_shared(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CS_THREAD_DATA_SHARED, "shared") + +struct lp_jit_cs_context +{ + const float *constants[LP_MAX_TGSI_CONST_BUFFERS]; + int num_constants[LP_MAX_TGSI_CONST_BUFFERS]; + + struct lp_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct lp_jit_sampler samplers[PIPE_MAX_SAMPLERS]; + struct lp_jit_image images[PIPE_MAX_SHADER_IMAGES]; + + const uint32_t *ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; + int num_ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; + + void *kernel_args; + + uint32_t shared_size; +}; + +/** + * These enum values must match the position of the fields in the + * lp_jit_context struct above. + */ +enum { + LP_JIT_CS_CTX_CONSTANTS = 0, + LP_JIT_CS_CTX_NUM_CONSTANTS, + LP_JIT_CS_CTX_TEXTURES, /* must match the LP_JIT_CTX_TEXTURES */ + LP_JIT_CS_CTX_SAMPLERS, + LP_JIT_CS_CTX_IMAGES, + LP_JIT_CS_CTX_SSBOS, + LP_JIT_CS_CTX_NUM_SSBOS, + LP_JIT_CS_CTX_KERNEL_ARGS, + LP_JIT_CS_CTX_SHARED_SIZE, + LP_JIT_CS_CTX_COUNT +}; + +#define lp_jit_cs_context_constants(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_CONSTANTS, "constants") + +#define lp_jit_cs_context_num_constants(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_NUM_CONSTANTS, "num_constants") + +#define lp_jit_cs_context_textures(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_TEXTURES, "textures") + +#define lp_jit_cs_context_samplers(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_SAMPLERS, "samplers") + +#define lp_jit_cs_context_images(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_IMAGES, "images") + +#define lp_jit_cs_context_ssbos(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_SSBOS, "ssbos") + +#define lp_jit_cs_context_num_ssbos(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_NUM_SSBOS, "num_ssbos") + +#define lp_jit_cs_context_shared_size(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CS_CTX_SHARED_SIZE, "shared_size") + +#define lp_jit_cs_context_kernel_args(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CS_CTX_KERNEL_ARGS, "kernel_args") + + +typedef void +(*lp_jit_cs_func)(const struct lp_jit_cs_context *context, + uint32_t x, + uint32_t y, + uint32_t z, + uint32_t grid_x, + uint32_t grid_y, + uint32_t grid_z, + uint32_t grid_size_x, + uint32_t grid_size_y, + uint32_t grid_size_z, + uint32_t work_dim, + struct lp_jit_cs_thread_data *thread_data); + void lp_jit_screen_cleanup(struct llvmpipe_screen *screen); @@ -280,5 +399,6 @@ void lp_jit_init_types(struct lp_fragment_shader_variant *lp); - +void +lp_jit_init_cs_types(struct lp_compute_shader_variant *lp); #endif /* LP_JIT_H */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_limits.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_limits.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_limits.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_limits.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,7 +43,11 @@ /** * Max texture sizes */ -#define LP_MAX_TEXTURE_SIZE (1 * 1024 * 1024 * 1024ULL) /* 1GB for now */ +/** + * 2GB is the actual max currently (we always use 32bit offsets, and both + * llvm GEP as well as avx2 gather use signed offsets). + */ +#define LP_MAX_TEXTURE_SIZE (2 * 1024 * 1024 * 1024ULL) #define LP_MAX_TEXTURE_2D_LEVELS 14 /* 8K x 8K for now */ #define LP_MAX_TEXTURE_3D_LEVELS 12 /* 2K x 2K x 2K for now */ #define LP_MAX_TEXTURE_CUBE_LEVELS 14 /* 8K x 8K for now */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_perf.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_perf.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_perf.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_perf.h 2020-06-12 01:21:17.000000000 +0000 @@ -74,7 +74,7 @@ #define LP_COUNT_ADD(counter, incr) lp_count.counter += (incr) #define LP_COUNT_GET(counter) (lp_count.counter) #else -#define LP_COUNT(counter) +#define LP_COUNT(counter) do {} while (0) #define LP_COUNT_ADD(counter, incr) (void)(incr) #define LP_COUNT_GET(counter) 0 #endif diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_query.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_query.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -185,6 +185,156 @@ return true; } +static void +llvmpipe_get_query_result_resource(struct pipe_context *pipe, + struct pipe_query *q, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen); + unsigned num_threads = MAX2(1, screen->num_threads); + struct llvmpipe_query *pq = llvmpipe_query(q); + struct llvmpipe_resource *lpr = llvmpipe_resource(resource); + bool unflushed = false; + bool unsignalled = false; + if (pq->fence) { + /* only have a fence if there was a scene */ + if (!lp_fence_signalled(pq->fence)) { + unsignalled = true; + if (!lp_fence_issued(pq->fence)) + unflushed = true; + } + } + + + uint64_t value = 0; + if (index == -1) + if (unsignalled) + value = 0; + else + value = 1; + else { + unsigned i; + + if (unflushed) { + llvmpipe_flush(pipe, NULL, __FUNCTION__); + + if (!wait) + return; + + lp_fence_wait(pq->fence); + } + + switch (pq->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + for (i = 0; i < num_threads; i++) { + value += pq->end[i]; + } + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + for (i = 0; i < num_threads; i++) { + /* safer (still not guaranteed) when there's an overflow */ + value = value || pq->end[i]; + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + value = pq->num_primitives_generated; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + value = pq->num_primitives_written; + break; + case PIPE_QUERY_TIMESTAMP: + for (i = 0; i < num_threads; i++) { + if (pq->end[i] > value) { + value = pq->end[i]; + } + } + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + value = !!(pq->num_primitives_generated > pq->num_primitives_written); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + switch ((enum pipe_statistics_query_index)index) { + case PIPE_STAT_QUERY_IA_VERTICES: + value = pq->stats.ia_vertices; + break; + case PIPE_STAT_QUERY_IA_PRIMITIVES: + value = pq->stats.ia_primitives; + break; + case PIPE_STAT_QUERY_VS_INVOCATIONS: + value = pq->stats.vs_invocations; + break; + case PIPE_STAT_QUERY_GS_INVOCATIONS: + value = pq->stats.gs_invocations; + break; + case PIPE_STAT_QUERY_GS_PRIMITIVES: + value = pq->stats.gs_primitives; + break; + case PIPE_STAT_QUERY_C_INVOCATIONS: + value = pq->stats.c_invocations; + break; + case PIPE_STAT_QUERY_C_PRIMITIVES: + value = pq->stats.c_primitives; + break; + case PIPE_STAT_QUERY_PS_INVOCATIONS: + value = 0; + for (i = 0; i < num_threads; i++) { + value += pq->end[i]; + } + value *= LP_RASTER_BLOCK_SIZE * LP_RASTER_BLOCK_SIZE; + break; + case PIPE_STAT_QUERY_HS_INVOCATIONS: + value = pq->stats.hs_invocations; + break; + case PIPE_STAT_QUERY_DS_INVOCATIONS: + value = pq->stats.ds_invocations; + break; + case PIPE_STAT_QUERY_CS_INVOCATIONS: + value = pq->stats.cs_invocations; + break; + } + break; + default: + fprintf(stderr, "Unknown query type %d\n", pq->type); + break; + } + } + + void *dst = (uint8_t *)lpr->data + offset; + switch (result_type) { + case PIPE_QUERY_TYPE_I32: { + int32_t *iptr = (int32_t *)dst; + if (value > 0x7fffffff) + *iptr = 0x7fffffff; + else + *iptr = (int32_t)value; + break; + } + case PIPE_QUERY_TYPE_U32: { + uint32_t *uptr = (uint32_t *)dst; + if (value > 0xffffffff) + *uptr = 0xffffffff; + else + *uptr = (uint32_t)value; + break; + } + case PIPE_QUERY_TYPE_I64: { + int64_t *iptr = (int64_t *)dst; + *iptr = (int64_t)value; + break; + } + case PIPE_QUERY_TYPE_U64: { + uint64_t *uptr = (uint64_t *)dst; + *uptr = (uint64_t)value; + break; + } + } +} static bool llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q) @@ -211,6 +361,7 @@ break; case PIPE_QUERY_PRIMITIVES_GENERATED: pq->num_primitives_generated = llvmpipe->so_stats.primitives_storage_needed; + llvmpipe->active_primgen_queries++; break; case PIPE_QUERY_SO_STATISTICS: pq->num_primitives_written = llvmpipe->so_stats.num_primitives_written; @@ -258,6 +409,8 @@ llvmpipe->so_stats.num_primitives_written - pq->num_primitives_written; break; case PIPE_QUERY_PRIMITIVES_GENERATED: + assert(llvmpipe->active_primgen_queries); + llvmpipe->active_primgen_queries--; pq->num_primitives_generated = llvmpipe->so_stats.primitives_storage_needed - pq->num_primitives_generated; break; @@ -291,7 +444,8 @@ llvmpipe->pipeline_statistics.c_primitives - pq->stats.c_primitives; pq->stats.ps_invocations = llvmpipe->pipeline_statistics.ps_invocations - pq->stats.ps_invocations; - + pq->stats.cs_invocations = + llvmpipe->pipeline_statistics.cs_invocations - pq->stats.cs_invocations; llvmpipe->active_statistics_queries--; break; case PIPE_QUERY_OCCLUSION_COUNTER: @@ -331,6 +485,11 @@ static void llvmpipe_set_active_query_state(struct pipe_context *pipe, bool enable) { + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + llvmpipe->queries_disabled = !enable; + /* for OQs we need to regenerate the fragment shader */ + llvmpipe->dirty |= LP_NEW_OCCLUSION_QUERY; } void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe ) @@ -340,6 +499,7 @@ llvmpipe->pipe.begin_query = llvmpipe_begin_query; llvmpipe->pipe.end_query = llvmpipe_end_query; llvmpipe->pipe.get_query_result = llvmpipe_get_query_result; + llvmpipe->pipe.get_query_result_resource = llvmpipe_get_query_result_resource; llvmpipe->pipe.set_active_query_state = llvmpipe_set_active_query_state; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast.c 2020-06-12 01:21:17.000000000 +0000 @@ -628,7 +628,7 @@ lp_rast_tile_end(task); - +#ifdef DEBUG /* Debug/Perf flags: */ if (bin->head->count == 1) { @@ -637,6 +637,7 @@ else if (bin->head->cmd[0] == LP_RAST_OP_SHADE_TILE) LP_COUNT(nr_pure_shade_64); } +#endif } @@ -866,6 +867,10 @@ pipe_semaphore_init(&rast->tasks[i].work_done, 0); rast->threads[i] = u_thread_create(thread_function, (void *) &rast->tasks[i]); + if (!rast->threads[i]) { + rast->num_threads = i; /* previous thread is max */ + break; + } } } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast_priv.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast_priv.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast_priv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast_priv.h 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #ifndef LP_RAST_PRIV_H #define LP_RAST_PRIV_H -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_thread.h" #include "gallivm/lp_bld_debug.h" #include "lp_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast_tri.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast_tri.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_rast_tri.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_rast_tri.c 2020-06-12 01:21:17.000000000 +0000 @@ -436,7 +436,7 @@ #else -#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN #include #include "util/u_pwr8.h" @@ -556,7 +556,7 @@ __m128i vshuf_mask1; __m128i vshuf_mask2; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100); vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); @@ -662,7 +662,7 @@ lp_rast_triangle_32_3(task, arg2); } -#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ +#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */ void lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, @@ -687,7 +687,7 @@ #if defined PIPE_ARCH_SSE #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy) -#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)) +#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN) #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy) #else diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_scene.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_scene.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_scene.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_scene.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/simple_list.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "lp_scene.h" #include "lp_fence.h" #include "lp_debug.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_screen.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_screen.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,14 +29,15 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_screen.h" #include "util/u_string.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format_s3tc.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "draw/draw_context.h" #include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_nir.h" #include "util/os_misc.h" #include "util/os_time.h" @@ -49,9 +50,12 @@ #include "lp_public.h" #include "lp_limits.h" #include "lp_rast.h" +#include "lp_cs_tpool.h" #include "state_tracker/sw_winsys.h" +#include "nir.h" + #ifdef DEBUG int LP_DEBUG = 0; @@ -68,6 +72,9 @@ { "fence", DEBUG_FENCE, NULL }, { "mem", DEBUG_MEM, NULL }, { "fs", DEBUG_FS, NULL }, + { "cs", DEBUG_CS, NULL }, + { "tgsi_ir", DEBUG_TGSI_IR, NULL }, + { "cl", DEBUG_CL, NULL }, DEBUG_NAMED_VALUE_END }; #endif @@ -97,8 +104,7 @@ llvmpipe_get_name(struct pipe_screen *screen) { static char buf[100]; - snprintf(buf, sizeof(buf), "llvmpipe (LLVM %u.%u, %u bits)", - HAVE_LLVM >> 8, HAVE_LLVM & 0xff, + snprintf(buf, sizeof(buf), "llvmpipe (LLVM " MESA_LLVM_VERSION_STRING ", %u bits)", lp_native_vector_width ); return buf; } @@ -216,14 +222,14 @@ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: return 0; case PIPE_CAP_COMPUTE: - return 0; + return GALLIVM_HAVE_CORO; case PIPE_CAP_USER_VERTEX_BUFFERS: return 1; case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_TGSI_TEXCOORD: return 0; + case PIPE_CAP_TGSI_TEXCOORD: case PIPE_CAP_DRAW_INDIRECT: return 1; @@ -240,7 +246,7 @@ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: return 65536; case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - return 1; + return 16; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -260,8 +266,8 @@ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: return 1; case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + return 1; case PIPE_CAP_TGSI_TEX_TXF_LZ: - return 0; case PIPE_CAP_SAMPLER_VIEW_TARGET: return 1; case PIPE_CAP_FAKE_SW_MSAA: @@ -317,6 +323,13 @@ return 32; case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: return 1; + case PIPE_CAP_QUERY_BUFFER_OBJECT: + return 1; + case PIPE_CAP_DRAW_PARAMETERS: + return 1; + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + return 1; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: @@ -325,27 +338,20 @@ case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DRAW_PARAMETERS: case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_STRING_MARKER: case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_VOTE: case PIPE_CAP_MAX_WINDOW_RECTANGLES: case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: @@ -365,7 +371,6 @@ case PIPE_CAP_BINDLESS_TEXTURE: case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: case PIPE_CAP_TILE_RASTER_ORDER: case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES: @@ -374,7 +379,6 @@ case PIPE_CAP_CONTEXT_PRIORITY_MASK: case PIPE_CAP_FENCE_SIGNAL: case PIPE_CAP_CONSTBUF0_FLAGS: - case PIPE_CAP_PACKED_UNIFORMS: case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES: case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES: case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: @@ -387,6 +391,16 @@ return 32; case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: return LP_MAX_TGSI_SHADER_BUFFER_SIZE; + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + return 1; + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_LOAD_CONSTBUF: + case PIPE_CAP_PACKED_UNIFORMS: { + struct llvmpipe_screen *lscreen = llvmpipe_screen(screen); + return !lscreen->use_tgsi; + } default: return u_pipe_screen_get_param_defaults(screen, param); } @@ -399,13 +413,31 @@ { switch(shader) { + case PIPE_SHADER_COMPUTE: + if ((LP_DEBUG & DEBUG_CL) && param == PIPE_SHADER_CAP_SUPPORTED_IRS) + return (1 << PIPE_SHADER_IR_TGSI) | (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_NIR_SERIALIZED); case PIPE_SHADER_FRAGMENT: + if (param == PIPE_SHADER_CAP_PREFERRED_IR) { + struct llvmpipe_screen *lscreen = llvmpipe_screen(screen); + if (lscreen->use_tgsi) + return PIPE_SHADER_IR_TGSI; + else + return PIPE_SHADER_IR_NIR; + } switch (param) { default: return gallivm_get_shader_param(param); } case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: + if (param == PIPE_SHADER_CAP_PREFERRED_IR) { + struct llvmpipe_screen *lscreen = llvmpipe_screen(screen); + if (lscreen->use_tgsi) + return PIPE_SHADER_IR_TGSI; + else + return PIPE_SHADER_IR_NIR; + } + switch (param) { case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: /* At this time, the draw module and llvmpipe driver only @@ -457,6 +489,156 @@ return 0.0; } +static int +llvmpipe_get_compute_param(struct pipe_screen *_screen, + enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, + void *ret) +{ + switch (param) { + case PIPE_COMPUTE_CAP_IR_TARGET: + return 0; + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + if (ret) { + uint64_t *grid_size = ret; + grid_size[0] = 65535; + grid_size[1] = 65535; + grid_size[2] = 65535; + } + return 3 * sizeof(uint64_t) ; + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + if (ret) { + uint64_t *block_size = ret; + block_size[0] = 1024; + block_size[1] = 1024; + block_size[2] = 1024; + } + return 3 * sizeof(uint64_t); + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_threads_per_block = ret; + *max_threads_per_block = 1024; + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + if (ret) { + uint64_t *max_local_size = ret; + *max_local_size = 32768; + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + if (ret) { + uint32_t *grid_dim = ret; + *grid_dim = 3; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + if (ret) { + uint64_t *max_global_size = ret; + *max_global_size = (1ULL << 31); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + if (ret) { + uint64_t *max_mem_alloc_size = ret; + *max_mem_alloc_size = (1ULL << 31); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + if (ret) { + uint64_t *max_private = ret; + *max_private = (1UL << 31); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + if (ret) { + uint64_t *max_input = ret; + *max_input = 4096; + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + if (ret) { + uint32_t *images = ret; + *images = 0; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + return 0; + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = 32; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + if (ret) { + uint32_t *max_compute_units = ret; + *max_compute_units = 8; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + if (ret) { + uint32_t *max_clock_freq = ret; + *max_clock_freq = 300; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + *address_bits = 64; + } + return sizeof(uint32_t); + } + return 0; +} + +static const struct nir_shader_compiler_options gallivm_nir_options = { + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fsat = true, + .lower_bitfield_insert_to_shifts = true, + .lower_bitfield_extract_to_shifts = true, + .lower_sub = true, + .lower_ffma = true, + .lower_fmod = true, + .lower_hadd = true, + .lower_add_sat = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + .lower_ifind_msb = true, + .optimize_sample_mask_in = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, + .lower_to_scalar = true, +}; + +static void +llvmpipe_finalize_nir(struct pipe_screen *screen, + void *nirptr, + bool optimize) +{ + struct nir_shader *nir = (struct nir_shader *)nirptr; + lp_build_opt_nir(nir); +} + +static inline const void * +llvmpipe_get_compiler_options(struct pipe_screen *screen, + enum pipe_shader_ir ir, + enum pipe_shader_type shader) +{ + assert(ir == PIPE_SHADER_IR_NIR); + return &gallivm_nir_options; +} /** * Query format support for creating a texture, drawing surface, etc. @@ -553,7 +735,8 @@ } if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC || - format_desc->layout == UTIL_FORMAT_LAYOUT_ATC) { + format_desc->layout == UTIL_FORMAT_LAYOUT_ATC || + format_desc->layout == UTIL_FORMAT_LAYOUT_FXT1) { /* Software decoding is not hooked up. */ return false; } @@ -595,6 +778,9 @@ struct llvmpipe_screen *screen = llvmpipe_screen(_screen); struct sw_winsys *winsys = screen->winsys; + if (screen->cs_tpool) + lp_cs_tpool_destroy(screen->cs_tpool); + if (screen->rast) lp_rast_destroy(screen->rast); @@ -603,8 +789,10 @@ if(winsys->destroy) winsys->destroy(winsys); - mtx_destroy(&screen->rast_mutex); + glsl_type_singleton_decref(); + mtx_destroy(&screen->rast_mutex); + mtx_destroy(&screen->cs_mutex); FREE(screen); } @@ -666,6 +854,8 @@ util_cpu_detect(); + glsl_type_singleton_init_or_ref(); + #ifdef DEBUG LP_DEBUG = debug_get_flags_option("LP_DEBUG", lp_debug_flags, 0 ); #endif @@ -690,7 +880,9 @@ screen->base.get_device_vendor = llvmpipe_get_vendor; // TODO should be the CPU vendor screen->base.get_param = llvmpipe_get_param; screen->base.get_shader_param = llvmpipe_get_shader_param; + screen->base.get_compute_param = llvmpipe_get_compute_param; screen->base.get_paramf = llvmpipe_get_paramf; + screen->base.get_compiler_options = llvmpipe_get_compiler_options; screen->base.is_format_supported = llvmpipe_is_format_supported; screen->base.context_create = llvmpipe_create_context; @@ -700,8 +892,10 @@ screen->base.get_timestamp = llvmpipe_get_timestamp; + screen->base.finalize_nir = llvmpipe_finalize_nir; llvmpipe_init_screen_resource_funcs(&screen->base); + screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR); screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0; #ifdef EMBEDDED_DEVICE screen->num_threads = 0; @@ -717,5 +911,14 @@ } (void) mtx_init(&screen->rast_mutex, mtx_plain); + screen->cs_tpool = lp_cs_tpool_create(screen->num_threads); + if (!screen->cs_tpool) { + lp_rast_destroy(screen->rast); + lp_jit_screen_cleanup(screen); + FREE(screen); + return NULL; + } + (void) mtx_init(&screen->cs_mutex, mtx_plain); + return &screen->base; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_screen.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_screen.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -41,7 +41,7 @@ struct sw_winsys; - +struct lp_cs_tpool; struct llvmpipe_screen { @@ -57,6 +57,11 @@ struct lp_rasterizer *rast; mtx_t rast_mutex; + + struct lp_cs_tpool *cs_tpool; + mtx_t cs_mutex; + + bool use_tgsi; }; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup.c 2020-06-12 01:21:17.000000000 +0000 @@ -684,6 +684,77 @@ setup->dirty |= LP_SETUP_NEW_SSBOS; } +void +lp_setup_set_fs_images(struct lp_setup_context *setup, + unsigned num, + struct pipe_image_view *images) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) images); + + assert(num <= ARRAY_SIZE(setup->images)); + + for (i = 0; i < num; ++i) { + struct pipe_image_view *image = &images[i]; + util_copy_image_view(&setup->images[i].current, &images[i]); + + struct pipe_resource *res = image->resource; + struct llvmpipe_resource *lp_res = llvmpipe_resource(res); + struct lp_jit_image *jit_image; + + jit_image = &setup->fs.current.jit_context.images[i]; + if (!lp_res) + continue; + if (!lp_res->dt) { + /* regular texture - setup array of mipmap level offsets */ + if (llvmpipe_resource_is_texture(res)) { + jit_image->base = lp_res->tex_data; + } else + jit_image->base = lp_res->data; + + jit_image->width = res->width0; + jit_image->height = res->height0; + jit_image->depth = res->depth0; + + if (llvmpipe_resource_is_texture(res)) { + uint32_t mip_offset = lp_res->mip_offsets[image->u.tex.level]; + + jit_image->width = u_minify(jit_image->width, image->u.tex.level); + jit_image->height = u_minify(jit_image->height, image->u.tex.level); + + if (res->target == PIPE_TEXTURE_1D_ARRAY || + res->target == PIPE_TEXTURE_2D_ARRAY || + res->target == PIPE_TEXTURE_3D || + res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY) { + /* + * For array textures, we don't have first_layer, instead + * adjust last_layer (stored as depth) plus the mip level offsets + * (as we have mip-first layout can't just adjust base ptr). + * XXX For mip levels, could do something similar. + */ + jit_image->depth = image->u.tex.last_layer - image->u.tex.first_layer + 1; + mip_offset += image->u.tex.first_layer * lp_res->img_stride[image->u.tex.level]; + } else + jit_image->depth = u_minify(jit_image->depth, image->u.tex.level); + + jit_image->row_stride = lp_res->row_stride[image->u.tex.level]; + jit_image->img_stride = lp_res->img_stride[image->u.tex.level]; + jit_image->base = (uint8_t *)jit_image->base + mip_offset; + } + else { + unsigned view_blocksize = util_format_get_blocksize(image->format); + jit_image->width = image->u.buf.size / view_blocksize; + jit_image->base = (uint8_t *)jit_image->base + image->u.buf.offset; + } + } + } + for (; i < ARRAY_SIZE(setup->images); i++) { + util_copy_image_view(&setup->images[i].current, NULL); + } + setup->dirty |= LP_SETUP_NEW_IMAGES; +} void lp_setup_set_alpha_ref_value( struct lp_setup_context *setup, @@ -1017,6 +1088,11 @@ return LP_REFERENCED_FOR_READ | LP_REFERENCED_FOR_WRITE; } + for (i = 0; i < ARRAY_SIZE(setup->images); i++) { + if (setup->images[i].current.resource == texture) + return LP_REFERENCED_FOR_READ | LP_REFERENCED_FOR_WRITE; + } + return LP_UNREFERENCED; } @@ -1154,7 +1230,7 @@ } num_constants = - setup->constants[i].stored_size / (sizeof(float) * 4); + DIV_ROUND_UP(setup->constants[i].stored_size, (sizeof(float) * 4)); setup->fs.current.jit_context.num_constants[i] = num_constants; setup->dirty |= LP_SETUP_NEW_FS; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_context.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_context.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -50,6 +50,7 @@ #define LP_SETUP_NEW_SCISSOR 0x08 #define LP_SETUP_NEW_VIEWPORTS 0x10 #define LP_SETUP_NEW_SSBOS 0x20 +#define LP_SETUP_NEW_IMAGES 0x40 struct lp_setup_variant; @@ -149,6 +150,10 @@ } ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; struct { + struct pipe_image_view current; + } images[LP_MAX_TGSI_SHADER_IMAGES]; + + struct { struct pipe_blend_color current; uint8_t *stored; } blend_color; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup.h 2020-06-12 01:21:17.000000000 +0000 @@ -110,6 +110,11 @@ struct pipe_shader_buffer *buffers); void +lp_setup_set_fs_images(struct lp_setup_context *setup, + unsigned num, + struct pipe_image_view *images); + +void lp_setup_set_alpha_ref_value( struct lp_setup_context *setup, float alpha_ref_value ); diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_line.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_line.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_line.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_line.c 2020-06-12 01:21:17.000000000 +0000 @@ -724,7 +724,7 @@ struct lp_rast_plane *plane_s = &plane[4]; if (s_planes[0]) { - plane_s->dcdx = -1 << 8; + plane_s->dcdx = ~0U << 8; plane_s->dcdy = 0; plane_s->c = (1-scissor->x0) << 8; plane_s->eo = 1 << 8; @@ -746,7 +746,7 @@ } if (s_planes[3]) { plane_s->dcdx = 0; - plane_s->dcdy = -1 << 8; + plane_s->dcdy = ~0U << 8; plane_s->c = (scissor->y1+1) << 8; plane_s->eo = 0; plane_s++; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_point.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_point.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_point.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_point.c 2020-06-12 01:21:17.000000000 +0000 @@ -224,10 +224,10 @@ boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE); unsigned i; - if (perspective & usage_mask) { + if (perspective && usage_mask) { fragcoord_usage_mask |= TGSI_WRITEMASK_W; } - + switch (interp) { case LP_INTERP_POSITION: /* @@ -241,27 +241,35 @@ case LP_INTERP_LINEAR: /* Sprite tex coords may use linear interpolation someday */ /* fall-through */ - case LP_INTERP_PERSPECTIVE: + case LP_INTERP_PERSPECTIVE: { /* check if the sprite coord flag is set for this attribute. * If so, set it up so it up so x and y vary from 0 to 1. */ - if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) { + bool do_texcoord_coef = false; + if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_PCOORD) { + do_texcoord_coef = true; + } + else if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_TEXCOORD) { unsigned semantic_index = shader->info.base.input_semantic_index[slot]; /* Note that sprite_coord enable is a bitfield of * PIPE_MAX_SHADER_OUTPUTS bits. */ if (semantic_index < PIPE_MAX_SHADER_OUTPUTS && (setup->sprite_coord_enable & (1u << semantic_index))) { - for (i = 0; i < NUM_CHANNELS; i++) { - if (usage_mask & (1 << i)) { - texcoord_coef(setup, info, slot + 1, i, - setup->sprite_coord_origin, - perspective); - } + do_texcoord_coef = true; + } + } + if (do_texcoord_coef) { + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + texcoord_coef(setup, info, slot + 1, i, + setup->sprite_coord_origin, + perspective); } - break; } + break; } + } /* fall-through */ case LP_INTERP_CONSTANT: for (i = 0; i < NUM_CHANNELS; i++) { @@ -491,7 +499,7 @@ { struct lp_rast_plane *plane = GET_PLANES(point); - plane[0].dcdx = -1 << 8; + plane[0].dcdx = ~0U << 8; plane[0].dcdy = 0; plane[0].c = (1-bbox.x0) << 8; plane[0].eo = 1 << 8; @@ -507,7 +515,7 @@ plane[2].eo = 1 << 8; plane[3].dcdx = 0; - plane[3].dcdy = -1 << 8; + plane[3].dcdy = ~0U << 8; plane[3].c = (bbox.y1+1) << 8; plane[3].eo = 0; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_tri.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_tri.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_setup_tri.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_setup_tri.c 2020-06-12 01:21:17.000000000 +0000 @@ -46,7 +46,7 @@ #if defined(PIPE_ARCH_SSE) #include -#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN #include #include "util/u_pwr8.h" #endif @@ -489,7 +489,7 @@ eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2)); plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo); } else -#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN /* * XXX this code is effectively disabled for all practical purposes, * as the allowed fb size is tiny if FIXED_ORDER is 8. @@ -513,7 +513,7 @@ __m128i zero = vec_splats((unsigned char) 0); PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN vshuf_mask.i[0] = 0x07060504; vshuf_mask.i[1] = 0x0B0A0908; vshuf_mask.i[2] = 0x03020100; @@ -687,7 +687,7 @@ struct lp_rast_plane *plane_s = &plane[3]; if (s_planes[0]) { - plane_s->dcdx = -1 << 8; + plane_s->dcdx = ~0U << 8; plane_s->dcdy = 0; plane_s->c = (1-scissor->x0) << 8; plane_s->eo = 1 << 8; @@ -709,7 +709,7 @@ } if (s_planes[3]) { plane_s->dcdx = 0; - plane_s->dcdy = -1 << 8; + plane_s->dcdy = ~0U << 8; plane_s->c = (scissor->y1+1) << 8; plane_s->eo = 0; plane_s++; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_cs.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_cs.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_cs.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_cs.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1356 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ +#include "util/u_memory.h" +#include "util/simple_list.h" +#include "util/os_time.h" +#include "util/u_dump.h" +#include "util/u_string.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_gather.h" +#include "gallivm/lp_bld_coro.h" +#include "gallivm/lp_bld_nir.h" +#include "lp_state_cs.h" +#include "lp_context.h" +#include "lp_debug.h" +#include "lp_state.h" +#include "lp_perf.h" +#include "lp_screen.h" +#include "lp_memory.h" +#include "lp_cs_tpool.h" +#include "state_tracker/sw_winsys.h" +#include "nir/nir_to_tgsi_info.h" +#include "nir_serialize.h" +struct lp_cs_job_info { + unsigned grid_size[3]; + unsigned block_size[3]; + unsigned req_local_mem; + unsigned work_dim; + struct lp_cs_exec *current; +}; + +static void +generate_compute(struct llvmpipe_context *lp, + struct lp_compute_shader *shader, + struct lp_compute_shader_variant *variant) +{ + struct gallivm_state *gallivm = variant->gallivm; + const struct lp_compute_shader_variant_key *key = &variant->key; + char func_name[64], func_name_coro[64]; + LLVMTypeRef arg_types[17]; + LLVMTypeRef func_type, coro_func_type; + LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef context_ptr; + LLVMValueRef x_size_arg, y_size_arg, z_size_arg; + LLVMValueRef grid_x_arg, grid_y_arg, grid_z_arg; + LLVMValueRef grid_size_x_arg, grid_size_y_arg, grid_size_z_arg; + LLVMValueRef work_dim_arg, thread_data_ptr; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + struct lp_build_sampler_soa *sampler; + struct lp_build_image_soa *image; + LLVMValueRef function, coro; + struct lp_type cs_type; + unsigned i; + + /* + * This function has two parts + * a) setup the coroutine execution environment loop. + * b) build the compute shader llvm for use inside the coroutine. + */ + assert(lp_native_vector_width / 32 >= 4); + + memset(&cs_type, 0, sizeof cs_type); + cs_type.floating = TRUE; /* floating point values */ + cs_type.sign = TRUE; /* values are signed */ + cs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ + cs_type.width = 32; /* 32-bit float */ + cs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */ + snprintf(func_name, sizeof(func_name), "cs%u_variant%u", + shader->no, variant->no); + + snprintf(func_name_coro, sizeof(func_name), "cs_co_%u_variant%u", + shader->no, variant->no); + + arg_types[0] = variant->jit_cs_context_ptr_type; /* context */ + arg_types[1] = int32_type; /* block_x_size */ + arg_types[2] = int32_type; /* block_y_size */ + arg_types[3] = int32_type; /* block_z_size */ + arg_types[4] = int32_type; /* grid_x */ + arg_types[5] = int32_type; /* grid_y */ + arg_types[6] = int32_type; /* grid_z */ + arg_types[7] = int32_type; /* grid_size_x */ + arg_types[8] = int32_type; /* grid_size_y */ + arg_types[9] = int32_type; /* grid_size_z */ + arg_types[10] = int32_type; /* work dim */ + arg_types[11] = variant->jit_cs_thread_data_ptr_type; /* per thread data */ + arg_types[12] = int32_type; /* coro only - num X loops */ + arg_types[13] = int32_type; /* coro only - partials */ + arg_types[14] = int32_type; /* coro block_x_size */ + arg_types[15] = int32_type; /* coro block_y_size */ + arg_types[16] = int32_type; /* coro block_z_size */ + func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), + arg_types, ARRAY_SIZE(arg_types) - 5, 0); + + coro_func_type = LLVMFunctionType(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), + arg_types, ARRAY_SIZE(arg_types), 0); + + function = LLVMAddFunction(gallivm->module, func_name, func_type); + LLVMSetFunctionCallConv(function, LLVMCCallConv); + + coro = LLVMAddFunction(gallivm->module, func_name_coro, coro_func_type); + LLVMSetFunctionCallConv(coro, LLVMCCallConv); + + variant->function = function; + + for(i = 0; i < ARRAY_SIZE(arg_types); ++i) { + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) { + lp_add_function_attr(coro, i + 1, LP_FUNC_ATTR_NOALIAS); + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); + } + } + + context_ptr = LLVMGetParam(function, 0); + x_size_arg = LLVMGetParam(function, 1); + y_size_arg = LLVMGetParam(function, 2); + z_size_arg = LLVMGetParam(function, 3); + grid_x_arg = LLVMGetParam(function, 4); + grid_y_arg = LLVMGetParam(function, 5); + grid_z_arg = LLVMGetParam(function, 6); + grid_size_x_arg = LLVMGetParam(function, 7); + grid_size_y_arg = LLVMGetParam(function, 8); + grid_size_z_arg = LLVMGetParam(function, 9); + work_dim_arg = LLVMGetParam(function, 10); + thread_data_ptr = LLVMGetParam(function, 11); + + lp_build_name(context_ptr, "context"); + lp_build_name(x_size_arg, "x_size"); + lp_build_name(y_size_arg, "y_size"); + lp_build_name(z_size_arg, "z_size"); + lp_build_name(grid_x_arg, "grid_x"); + lp_build_name(grid_y_arg, "grid_y"); + lp_build_name(grid_z_arg, "grid_z"); + lp_build_name(grid_size_x_arg, "grid_size_x"); + lp_build_name(grid_size_y_arg, "grid_size_y"); + lp_build_name(grid_size_z_arg, "grid_size_z"); + lp_build_name(work_dim_arg, "work_dim"); + lp_build_name(thread_data_ptr, "thread_data"); + + block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry"); + builder = gallivm->builder; + assert(builder); + LLVMPositionBuilderAtEnd(builder, block); + sampler = lp_llvm_sampler_soa_create(key->state); + image = lp_llvm_image_soa_create(key->image_state); + + struct lp_build_loop_state loop_state[4]; + LLVMValueRef num_x_loop; + LLVMValueRef vec_length = lp_build_const_int32(gallivm, cs_type.length); + num_x_loop = LLVMBuildAdd(gallivm->builder, x_size_arg, vec_length, ""); + num_x_loop = LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), ""); + num_x_loop = LLVMBuildUDiv(gallivm->builder, num_x_loop, vec_length, ""); + LLVMValueRef partials = LLVMBuildURem(gallivm->builder, x_size_arg, vec_length, ""); + + LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, y_size_arg, ""); + coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, z_size_arg, ""); + + LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, coro_num_hdls, "coro_hdls"); + + unsigned end_coroutine = INT_MAX; + + /* + * This is the main coroutine execution loop. It iterates over the dimensions + * and calls the coroutine main entrypoint on the first pass, but in subsequent + * passes it checks if the coroutine has completed and resumes it if not. + */ + /* take x_width - round up to type.length width */ + lp_build_loop_begin(&loop_state[3], gallivm, + lp_build_const_int32(gallivm, 0)); /* coroutine reentry loop */ + lp_build_loop_begin(&loop_state[2], gallivm, + lp_build_const_int32(gallivm, 0)); /* z loop */ + lp_build_loop_begin(&loop_state[1], gallivm, + lp_build_const_int32(gallivm, 0)); /* y loop */ + lp_build_loop_begin(&loop_state[0], gallivm, + lp_build_const_int32(gallivm, 0)); /* x loop */ + { + LLVMValueRef args[17]; + args[0] = context_ptr; + args[1] = loop_state[0].counter; + args[2] = loop_state[1].counter; + args[3] = loop_state[2].counter; + args[4] = grid_x_arg; + args[5] = grid_y_arg; + args[6] = grid_z_arg; + args[7] = grid_size_x_arg; + args[8] = grid_size_y_arg; + args[9] = grid_size_z_arg; + args[10] = work_dim_arg; + args[11] = thread_data_ptr; + args[12] = num_x_loop; + args[13] = partials; + args[14] = x_size_arg; + args[15] = y_size_arg; + args[16] = z_size_arg; + + /* idx = (z * (size_x * size_y) + y * size_x + x */ + LLVMValueRef coro_hdl_idx = LLVMBuildMul(gallivm->builder, loop_state[2].counter, + LLVMBuildMul(gallivm->builder, num_x_loop, y_size_arg, ""), ""); + coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx, + LLVMBuildMul(gallivm->builder, loop_state[1].counter, + num_x_loop, ""), ""); + coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx, + loop_state[0].counter, ""); + + LLVMValueRef coro_entry = LLVMBuildGEP(gallivm->builder, coro_hdls, &coro_hdl_idx, 1, ""); + + LLVMValueRef coro_hdl = LLVMBuildLoad(gallivm->builder, coro_entry, "coro_hdl"); + + struct lp_build_if_state ifstate; + LLVMValueRef cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, loop_state[3].counter, + lp_build_const_int32(gallivm, 0), ""); + /* first time here - call the coroutine function entry point */ + lp_build_if(&ifstate, gallivm, cmp); + LLVMValueRef coro_ret = LLVMBuildCall(gallivm->builder, coro, args, 17, ""); + LLVMBuildStore(gallivm->builder, coro_ret, coro_entry); + lp_build_else(&ifstate); + /* subsequent calls for this invocation - check if done. */ + LLVMValueRef coro_done = lp_build_coro_done(gallivm, coro_hdl); + struct lp_build_if_state ifstate2; + lp_build_if(&ifstate2, gallivm, coro_done); + /* if done destroy and force loop exit */ + lp_build_coro_destroy(gallivm, coro_hdl); + lp_build_loop_force_set_counter(&loop_state[3], lp_build_const_int32(gallivm, end_coroutine - 1)); + lp_build_else(&ifstate2); + /* otherwise resume the coroutine */ + lp_build_coro_resume(gallivm, coro_hdl); + lp_build_endif(&ifstate2); + lp_build_endif(&ifstate); + lp_build_loop_force_reload_counter(&loop_state[3]); + } + lp_build_loop_end_cond(&loop_state[0], + num_x_loop, + NULL, LLVMIntUGE); + lp_build_loop_end_cond(&loop_state[1], + y_size_arg, + NULL, LLVMIntUGE); + lp_build_loop_end_cond(&loop_state[2], + z_size_arg, + NULL, LLVMIntUGE); + lp_build_loop_end_cond(&loop_state[3], + lp_build_const_int32(gallivm, end_coroutine), + NULL, LLVMIntEQ); + LLVMBuildRetVoid(builder); + + /* This is stage (b) - generate the compute shader code inside the coroutine. */ + LLVMValueRef block_x_size_arg, block_y_size_arg, block_z_size_arg; + context_ptr = LLVMGetParam(coro, 0); + x_size_arg = LLVMGetParam(coro, 1); + y_size_arg = LLVMGetParam(coro, 2); + z_size_arg = LLVMGetParam(coro, 3); + grid_x_arg = LLVMGetParam(coro, 4); + grid_y_arg = LLVMGetParam(coro, 5); + grid_z_arg = LLVMGetParam(coro, 6); + grid_size_x_arg = LLVMGetParam(coro, 7); + grid_size_y_arg = LLVMGetParam(coro, 8); + grid_size_z_arg = LLVMGetParam(coro, 9); + work_dim_arg = LLVMGetParam(coro, 10); + thread_data_ptr = LLVMGetParam(coro, 11); + num_x_loop = LLVMGetParam(coro, 12); + partials = LLVMGetParam(coro, 13); + block_x_size_arg = LLVMGetParam(coro, 14); + block_y_size_arg = LLVMGetParam(coro, 15); + block_z_size_arg = LLVMGetParam(coro, 16); + block = LLVMAppendBasicBlockInContext(gallivm->context, coro, "entry"); + LLVMPositionBuilderAtEnd(builder, block); + { + LLVMValueRef consts_ptr, num_consts_ptr; + LLVMValueRef ssbo_ptr, num_ssbo_ptr; + LLVMValueRef shared_ptr; + LLVMValueRef kernel_args_ptr; + struct lp_build_mask_context mask; + struct lp_bld_tgsi_system_values system_values; + + memset(&system_values, 0, sizeof(system_values)); + consts_ptr = lp_jit_cs_context_constants(gallivm, context_ptr); + num_consts_ptr = lp_jit_cs_context_num_constants(gallivm, context_ptr); + ssbo_ptr = lp_jit_cs_context_ssbos(gallivm, context_ptr); + num_ssbo_ptr = lp_jit_cs_context_num_ssbos(gallivm, context_ptr); + kernel_args_ptr = lp_jit_cs_context_kernel_args(gallivm, context_ptr); + + shared_ptr = lp_jit_cs_thread_data_shared(gallivm, thread_data_ptr); + + /* these are coroutine entrypoint necessities */ + LLVMValueRef coro_id = lp_build_coro_id(gallivm); + LLVMValueRef coro_hdl = lp_build_coro_begin_alloc_mem(gallivm, coro_id); + + LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), ""); + LLVMValueRef tid_vals[3]; + LLVMValueRef tids_x[LP_MAX_VECTOR_LENGTH], tids_y[LP_MAX_VECTOR_LENGTH], tids_z[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef base_val = LLVMBuildMul(gallivm->builder, x_size_arg, vec_length, ""); + for (i = 0; i < cs_type.length; i++) { + tids_x[i] = LLVMBuildAdd(gallivm->builder, base_val, lp_build_const_int32(gallivm, i), ""); + tids_y[i] = y_size_arg; + tids_z[i] = z_size_arg; + } + tid_vals[0] = lp_build_gather_values(gallivm, tids_x, cs_type.length); + tid_vals[1] = lp_build_gather_values(gallivm, tids_y, cs_type.length); + tid_vals[2] = lp_build_gather_values(gallivm, tids_z, cs_type.length); + system_values.thread_id = LLVMGetUndef(LLVMArrayType(LLVMVectorType(int32_type, cs_type.length), 3)); + for (i = 0; i < 3; i++) + system_values.thread_id = LLVMBuildInsertValue(builder, system_values.thread_id, tid_vals[i], i, ""); + + LLVMValueRef gtids[3] = { grid_x_arg, grid_y_arg, grid_z_arg }; + system_values.block_id = LLVMGetUndef(LLVMVectorType(int32_type, 3)); + for (i = 0; i < 3; i++) + system_values.block_id = LLVMBuildInsertElement(builder, system_values.block_id, gtids[i], lp_build_const_int32(gallivm, i), ""); + + LLVMValueRef gstids[3] = { grid_size_x_arg, grid_size_y_arg, grid_size_z_arg }; + system_values.grid_size = LLVMGetUndef(LLVMVectorType(int32_type, 3)); + for (i = 0; i < 3; i++) + system_values.grid_size = LLVMBuildInsertElement(builder, system_values.grid_size, gstids[i], lp_build_const_int32(gallivm, i), ""); + + system_values.work_dim = work_dim_arg; + + LLVMValueRef bsize[3] = { block_x_size_arg, block_y_size_arg, block_z_size_arg }; + system_values.block_size = LLVMGetUndef(LLVMVectorType(int32_type, 3)); + for (i = 0; i < 3; i++) + system_values.block_size = LLVMBuildInsertElement(builder, system_values.block_size, bsize[i], lp_build_const_int32(gallivm, i), ""); + + LLVMValueRef last_x_loop = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, x_size_arg, LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), ""), ""); + LLVMValueRef use_partial_mask = LLVMBuildAnd(gallivm->builder, last_x_loop, has_partials, ""); + struct lp_build_if_state if_state; + LLVMValueRef mask_val = lp_build_alloca(gallivm, LLVMVectorType(int32_type, cs_type.length), "mask"); + LLVMValueRef full_mask_val = lp_build_const_int_vec(gallivm, cs_type, ~0); + LLVMBuildStore(gallivm->builder, full_mask_val, mask_val); + + lp_build_if(&if_state, gallivm, use_partial_mask); + struct lp_build_loop_state mask_loop_state; + lp_build_loop_begin(&mask_loop_state, gallivm, partials); + LLVMValueRef tmask_val = LLVMBuildLoad(gallivm->builder, mask_val, ""); + tmask_val = LLVMBuildInsertElement(gallivm->builder, tmask_val, lp_build_const_int32(gallivm, 0), mask_loop_state.counter, ""); + LLVMBuildStore(gallivm->builder, tmask_val, mask_val); + lp_build_loop_end_cond(&mask_loop_state, vec_length, NULL, LLVMIntUGE); + lp_build_endif(&if_state); + + mask_val = LLVMBuildLoad(gallivm->builder, mask_val, ""); + lp_build_mask_begin(&mask, gallivm, cs_type, mask_val); + + struct lp_build_coro_suspend_info coro_info; + + LLVMBasicBlockRef sus_block = LLVMAppendBasicBlockInContext(gallivm->context, coro, "suspend"); + LLVMBasicBlockRef clean_block = LLVMAppendBasicBlockInContext(gallivm->context, coro, "cleanup"); + + coro_info.suspend = sus_block; + coro_info.cleanup = clean_block; + + struct lp_build_tgsi_params params; + memset(¶ms, 0, sizeof(params)); + + params.type = cs_type; + params.mask = &mask; + params.consts_ptr = consts_ptr; + params.const_sizes_ptr = num_consts_ptr; + params.system_values = &system_values; + params.context_ptr = context_ptr; + params.sampler = sampler; + params.info = &shader->info.base; + params.ssbo_ptr = ssbo_ptr; + params.ssbo_sizes_ptr = num_ssbo_ptr; + params.image = image; + params.shared_ptr = shared_ptr; + params.coro = &coro_info; + params.kernel_args = kernel_args_ptr; + + if (shader->base.type == PIPE_SHADER_IR_TGSI) + lp_build_tgsi_soa(gallivm, shader->base.tokens, ¶ms, NULL); + else + lp_build_nir_soa(gallivm, shader->base.ir.nir, ¶ms, + NULL); + + mask_val = lp_build_mask_end(&mask); + + lp_build_coro_suspend_switch(gallivm, &coro_info, NULL, true); + LLVMPositionBuilderAtEnd(builder, clean_block); + + lp_build_coro_free_mem(gallivm, coro_id, coro_hdl); + + LLVMBuildBr(builder, sus_block); + LLVMPositionBuilderAtEnd(builder, sus_block); + + lp_build_coro_end(gallivm, coro_hdl); + LLVMBuildRet(builder, coro_hdl); + } + + sampler->destroy(sampler); + image->destroy(image); + + gallivm_verify_function(gallivm, coro); + gallivm_verify_function(gallivm, function); +} + +static void * +llvmpipe_create_compute_state(struct pipe_context *pipe, + const struct pipe_compute_state *templ) +{ + struct lp_compute_shader *shader; + int nr_samplers, nr_sampler_views; + shader = CALLOC_STRUCT(lp_compute_shader); + if (!shader) + return NULL; + + shader->base.type = templ->ir_type; + if (templ->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) { + struct blob_reader reader; + const struct pipe_binary_program_header *hdr = templ->prog; + + blob_reader_init(&reader, hdr->blob, hdr->num_bytes); + shader->base.ir.nir = nir_deserialize(NULL, pipe->screen->get_compiler_options(pipe->screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE), &reader); + shader->base.type = PIPE_SHADER_IR_NIR; + + pipe->screen->finalize_nir(pipe->screen, shader->base.ir.nir, false); + } else if (templ->ir_type == PIPE_SHADER_IR_NIR) + shader->base.ir.nir = (struct nir_shader *)templ->prog; + + if (shader->base.type == PIPE_SHADER_IR_TGSI) { + /* get/save the summary info for this shader */ + lp_build_tgsi_info(templ->prog, &shader->info); + + /* we need to keep a local copy of the tokens */ + shader->base.tokens = tgsi_dup_tokens(templ->prog); + } else { + nir_tgsi_scan_shader(shader->base.ir.nir, &shader->info.base, false); + } + + shader->req_local_mem = templ->req_local_mem; + make_empty_list(&shader->variants); + + nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + shader->variant_key_size = Offset(struct lp_compute_shader_variant_key, + state[MAX2(nr_samplers, nr_sampler_views)]); + return shader; +} + +static void +llvmpipe_bind_compute_state(struct pipe_context *pipe, + void *cs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (llvmpipe->cs == cs) + return; + + llvmpipe->cs = (struct lp_compute_shader *)cs; + llvmpipe->cs_dirty |= LP_CSNEW_CS; +} + +/** + * Remove shader variant from two lists: the shader's variant list + * and the context's variant list. + */ +static void +llvmpipe_remove_cs_shader_variant(struct llvmpipe_context *lp, + struct lp_compute_shader_variant *variant) +{ + if ((LP_DEBUG & DEBUG_CS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { + debug_printf("llvmpipe: del cs #%u var %u v created %u v cached %u " + "v total cached %u inst %u total inst %u\n", + variant->shader->no, variant->no, + variant->shader->variants_created, + variant->shader->variants_cached, + lp->nr_cs_variants, variant->nr_instrs, lp->nr_cs_instrs); + } + + gallivm_destroy(variant->gallivm); + + /* remove from shader's list */ + remove_from_list(&variant->list_item_local); + variant->shader->variants_cached--; + + /* remove from context's list */ + remove_from_list(&variant->list_item_global); + lp->nr_fs_variants--; + lp->nr_fs_instrs -= variant->nr_instrs; + + FREE(variant); +} + +static void +llvmpipe_delete_compute_state(struct pipe_context *pipe, + void *cs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct lp_compute_shader *shader = cs; + struct lp_cs_variant_list_item *li; + + if (llvmpipe->cs == cs) + llvmpipe->cs = NULL; + for (unsigned i = 0; i < shader->max_global_buffers; i++) + pipe_resource_reference(&shader->global_buffers[i], NULL); + FREE(shader->global_buffers); + + /* Delete all the variants */ + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + struct lp_cs_variant_list_item *next = next_elem(li); + llvmpipe_remove_cs_shader_variant(llvmpipe, li->base); + li = next; + } + if (shader->base.ir.nir) + ralloc_free(shader->base.ir.nir); + tgsi_free_tokens(shader->base.tokens); + FREE(shader); +} + +static void +make_variant_key(struct llvmpipe_context *lp, + struct lp_compute_shader *shader, + struct lp_compute_shader_variant_key *key) +{ + int i; + + memset(key, 0, shader->variant_key_size); + + /* This value will be the same for all the variants of a given shader: + */ + key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + + for(i = 0; i < key->nr_samplers; ++i) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_sampler_state(&key->state[i].sampler_state, + lp->samplers[PIPE_SHADER_COMPUTE][i]); + } + } + + /* + * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes + * are dx10-style? Can't really have mixed opcodes, at least not + * if we want to skip the holes here (without rescanning tgsi). + */ + if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { + key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + for(i = 0; i < key->nr_sampler_views; ++i) { + /* + * Note sview may exceed what's representable by file_mask. + * This will still work, the only downside is that not actually + * used views may be included in the shader key. + */ + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) { + lp_sampler_static_texture_state(&key->state[i].texture_state, + lp->sampler_views[PIPE_SHADER_COMPUTE][i]); + } + } + } + else { + key->nr_sampler_views = key->nr_samplers; + for(i = 0; i < key->nr_sampler_views; ++i) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_texture_state(&key->state[i].texture_state, + lp->sampler_views[PIPE_SHADER_COMPUTE][i]); + } + } + } + + key->nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1; + for (i = 0; i < key->nr_images; ++i) { + if (shader->info.base.file_mask[TGSI_FILE_IMAGE] & (1 << i)) { + lp_sampler_static_texture_state_image(&key->image_state[i].image_state, + &lp->images[PIPE_SHADER_COMPUTE][i]); + } + } +} + +static void +dump_cs_variant_key(const struct lp_compute_shader_variant_key *key) +{ + int i; + debug_printf("cs variant %p:\n", (void *) key); + + for (i = 0; i < key->nr_samplers; ++i) { + const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state; + debug_printf("sampler[%u] = \n", i); + debug_printf(" .wrap = %s %s %s\n", + util_str_tex_wrap(sampler->wrap_s, TRUE), + util_str_tex_wrap(sampler->wrap_t, TRUE), + util_str_tex_wrap(sampler->wrap_r, TRUE)); + debug_printf(" .min_img_filter = %s\n", + util_str_tex_filter(sampler->min_img_filter, TRUE)); + debug_printf(" .min_mip_filter = %s\n", + util_str_tex_mipfilter(sampler->min_mip_filter, TRUE)); + debug_printf(" .mag_img_filter = %s\n", + util_str_tex_filter(sampler->mag_img_filter, TRUE)); + if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) + debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE)); + debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords); + debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal); + debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero); + debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod); + debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod); + } + for (i = 0; i < key->nr_sampler_views; ++i) { + const struct lp_static_texture_state *texture = &key->state[i].texture_state; + debug_printf("texture[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(texture->format)); + debug_printf(" .target = %s\n", + util_str_tex_target(texture->target, TRUE)); + debug_printf(" .level_zero_only = %u\n", + texture->level_zero_only); + debug_printf(" .pot = %u %u %u\n", + texture->pot_width, + texture->pot_height, + texture->pot_depth); + } + for (i = 0; i < key->nr_images; ++i) { + const struct lp_static_texture_state *image = &key->image_state[i].image_state; + debug_printf("image[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(image->format)); + debug_printf(" .target = %s\n", + util_str_tex_target(image->target, TRUE)); + debug_printf(" .level_zero_only = %u\n", + image->level_zero_only); + debug_printf(" .pot = %u %u %u\n", + image->pot_width, + image->pot_height, + image->pot_depth); + } +} + +static void +lp_debug_cs_variant(const struct lp_compute_shader_variant *variant) +{ + debug_printf("llvmpipe: Compute shader #%u variant #%u:\n", + variant->shader->no, variant->no); + if (variant->shader->base.type == PIPE_SHADER_IR_TGSI) + tgsi_dump(variant->shader->base.tokens, 0); + else + nir_print_shader(variant->shader->base.ir.nir, stderr); + dump_cs_variant_key(&variant->key); + debug_printf("\n"); +} + +static struct lp_compute_shader_variant * +generate_variant(struct llvmpipe_context *lp, + struct lp_compute_shader *shader, + const struct lp_compute_shader_variant_key *key) +{ + struct lp_compute_shader_variant *variant; + char module_name[64]; + + variant = CALLOC_STRUCT(lp_compute_shader_variant); + if (!variant) + return NULL; + + snprintf(module_name, sizeof(module_name), "cs%u_variant%u", + shader->no, shader->variants_created); + + variant->gallivm = gallivm_create(module_name, lp->context); + if (!variant->gallivm) { + FREE(variant); + return NULL; + } + + variant->shader = shader; + variant->list_item_global.base = variant; + variant->list_item_local.base = variant; + variant->no = shader->variants_created++; + + memcpy(&variant->key, key, shader->variant_key_size); + + if ((LP_DEBUG & DEBUG_CS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { + lp_debug_cs_variant(variant); + } + + lp_jit_init_cs_types(variant); + + generate_compute(lp, shader, variant); + + gallivm_compile_module(variant->gallivm); + + variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module); + + variant->jit_function = (lp_jit_cs_func)gallivm_jit_function(variant->gallivm, variant->function); + + gallivm_free_ir(variant->gallivm); + return variant; +} + +static void +lp_cs_ctx_set_cs_variant( struct lp_cs_context *csctx, + struct lp_compute_shader_variant *variant) +{ + csctx->cs.current.variant = variant; +} + +static void +llvmpipe_update_cs(struct llvmpipe_context *lp) +{ + struct lp_compute_shader *shader = lp->cs; + + struct lp_compute_shader_variant_key key; + struct lp_compute_shader_variant *variant = NULL; + struct lp_cs_variant_list_item *li; + + make_variant_key(lp, shader, &key); + + /* Search the variants for one which matches the key */ + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) { + variant = li->base; + break; + } + li = next_elem(li); + } + + if (variant) { + /* Move this variant to the head of the list to implement LRU + * deletion of shader's when we have too many. + */ + move_to_head(&lp->cs_variants_list, &variant->list_item_global); + } + else { + /* variant not found, create it now */ + int64_t t0, t1, dt; + unsigned i; + unsigned variants_to_cull; + + if (LP_DEBUG & DEBUG_CS) { + debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n", + lp->nr_cs_variants, + lp->nr_cs_instrs, + lp->nr_cs_variants ? lp->nr_cs_instrs / lp->nr_cs_variants : 0); + } + + /* First, check if we've exceeded the max number of shader variants. + * If so, free 6.25% of them (the least recently used ones). + */ + variants_to_cull = lp->nr_cs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0; + + if (variants_to_cull || + lp->nr_cs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) { + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("Evicting CS: %u cs variants,\t%u total variants," + "\t%u instrs,\t%u instrs/variant\n", + shader->variants_cached, + lp->nr_cs_variants, lp->nr_cs_instrs, + lp->nr_cs_instrs / lp->nr_cs_variants); + } + + /* + * We need to re-check lp->nr_cs_variants because an arbitrarliy large + * number of shader variants (potentially all of them) could be + * pending for destruction on flush. + */ + + for (i = 0; i < variants_to_cull || lp->nr_cs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) { + struct lp_cs_variant_list_item *item; + if (is_empty_list(&lp->cs_variants_list)) { + break; + } + item = last_elem(&lp->cs_variants_list); + assert(item); + assert(item->base); + llvmpipe_remove_cs_shader_variant(lp, item->base); + } + } + /* + * Generate the new variant. + */ + t0 = os_time_get(); + variant = generate_variant(lp, shader, &key); + t1 = os_time_get(); + dt = t1 - t0; + LP_COUNT_ADD(llvm_compile_time, dt); + LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. omit in/out test */ + + /* Put the new variant into the list */ + if (variant) { + insert_at_head(&shader->variants, &variant->list_item_local); + insert_at_head(&lp->cs_variants_list, &variant->list_item_global); + lp->nr_cs_variants++; + lp->nr_cs_instrs += variant->nr_instrs; + shader->variants_cached++; + } + } + /* Bind this variant */ + lp_cs_ctx_set_cs_variant(lp->csctx, variant); +} + +/** + * Called during state validation when LP_CSNEW_SAMPLER_VIEW is set. + */ +static void +lp_csctx_set_sampler_views(struct lp_cs_context *csctx, + unsigned num, + struct pipe_sampler_view **views) +{ + unsigned i, max_tex_num; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + + max_tex_num = MAX2(num, csctx->cs.current_tex_num); + + for (i = 0; i < max_tex_num; i++) { + struct pipe_sampler_view *view = i < num ? views[i] : NULL; + + if (view) { + struct pipe_resource *res = view->texture; + struct llvmpipe_resource *lp_tex = llvmpipe_resource(res); + struct lp_jit_texture *jit_tex; + jit_tex = &csctx->cs.current.jit_context.textures[i]; + + /* We're referencing the texture's internal data, so save a + * reference to it. + */ + pipe_resource_reference(&csctx->cs.current_tex[i], res); + + if (!lp_tex->dt) { + /* regular texture - csctx array of mipmap level offsets */ + int j; + unsigned first_level = 0; + unsigned last_level = 0; + + if (llvmpipe_resource_is_texture(res)) { + first_level = view->u.tex.first_level; + last_level = view->u.tex.last_level; + assert(first_level <= last_level); + assert(last_level <= res->last_level); + jit_tex->base = lp_tex->tex_data; + } + else { + jit_tex->base = lp_tex->data; + } + if (LP_PERF & PERF_TEX_MEM) { + /* use dummy tile memory */ + jit_tex->base = lp_dummy_tile; + jit_tex->width = TILE_SIZE/8; + jit_tex->height = TILE_SIZE/8; + jit_tex->depth = 1; + jit_tex->first_level = 0; + jit_tex->last_level = 0; + jit_tex->mip_offsets[0] = 0; + jit_tex->row_stride[0] = 0; + jit_tex->img_stride[0] = 0; + } + else { + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = first_level; + jit_tex->last_level = last_level; + + if (llvmpipe_resource_is_texture(res)) { + for (j = first_level; j <= last_level; j++) { + jit_tex->mip_offsets[j] = lp_tex->mip_offsets[j]; + jit_tex->row_stride[j] = lp_tex->row_stride[j]; + jit_tex->img_stride[j] = lp_tex->img_stride[j]; + } + + if (res->target == PIPE_TEXTURE_1D_ARRAY || + res->target == PIPE_TEXTURE_2D_ARRAY || + res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY) { + /* + * For array textures, we don't have first_layer, instead + * adjust last_layer (stored as depth) plus the mip level offsets + * (as we have mip-first layout can't just adjust base ptr). + * XXX For mip levels, could do something similar. + */ + jit_tex->depth = view->u.tex.last_layer - view->u.tex.first_layer + 1; + for (j = first_level; j <= last_level; j++) { + jit_tex->mip_offsets[j] += view->u.tex.first_layer * + lp_tex->img_stride[j]; + } + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { + assert(jit_tex->depth % 6 == 0); + } + assert(view->u.tex.first_layer <= view->u.tex.last_layer); + assert(view->u.tex.last_layer < res->array_size); + } + } + else { + /* + * For buffers, we don't have "offset", instead adjust + * the size (stored as width) plus the base pointer. + */ + unsigned view_blocksize = util_format_get_blocksize(view->format); + /* probably don't really need to fill that out */ + jit_tex->mip_offsets[0] = 0; + jit_tex->row_stride[0] = 0; + jit_tex->img_stride[0] = 0; + + /* everything specified in number of elements here. */ + jit_tex->width = view->u.buf.size / view_blocksize; + jit_tex->base = (uint8_t *)jit_tex->base + view->u.buf.offset; + /* XXX Unsure if we need to sanitize parameters? */ + assert(view->u.buf.offset + view->u.buf.size <= res->width0); + } + } + } + else { + /* display target texture/surface */ + /* + * XXX: Where should this be unmapped? + */ + struct llvmpipe_screen *screen = llvmpipe_screen(res->screen); + struct sw_winsys *winsys = screen->winsys; + jit_tex->base = winsys->displaytarget_map(winsys, lp_tex->dt, + PIPE_TRANSFER_READ); + jit_tex->row_stride[0] = lp_tex->row_stride[0]; + jit_tex->img_stride[0] = lp_tex->img_stride[0]; + jit_tex->mip_offsets[0] = 0; + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = jit_tex->last_level = 0; + assert(jit_tex->base); + } + } + else { + pipe_resource_reference(&csctx->cs.current_tex[i], NULL); + } + } + csctx->cs.current_tex_num = num; +} + + +/** + * Called during state validation when LP_NEW_SAMPLER is set. + */ +static void +lp_csctx_set_sampler_state(struct lp_cs_context *csctx, + unsigned num, + struct pipe_sampler_state **samplers) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(num <= PIPE_MAX_SAMPLERS); + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + const struct pipe_sampler_state *sampler = i < num ? samplers[i] : NULL; + + if (sampler) { + struct lp_jit_sampler *jit_sam; + jit_sam = &csctx->cs.current.jit_context.samplers[i]; + + jit_sam->min_lod = sampler->min_lod; + jit_sam->max_lod = sampler->max_lod; + jit_sam->lod_bias = sampler->lod_bias; + COPY_4V(jit_sam->border_color, sampler->border_color.f); + } + } +} + +static void +lp_csctx_set_cs_constants(struct lp_cs_context *csctx, + unsigned num, + struct pipe_constant_buffer *buffers) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) buffers); + + assert(num <= ARRAY_SIZE(csctx->constants)); + + for (i = 0; i < num; ++i) { + util_copy_constant_buffer(&csctx->constants[i].current, &buffers[i]); + } + for (; i < ARRAY_SIZE(csctx->constants); i++) { + util_copy_constant_buffer(&csctx->constants[i].current, NULL); + } +} + +static void +lp_csctx_set_cs_ssbos(struct lp_cs_context *csctx, + unsigned num, + struct pipe_shader_buffer *buffers) +{ + int i; + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *)buffers); + + assert (num <= ARRAY_SIZE(csctx->ssbos)); + + for (i = 0; i < num; ++i) { + util_copy_shader_buffer(&csctx->ssbos[i].current, &buffers[i]); + } + for (; i < ARRAY_SIZE(csctx->ssbos); i++) { + util_copy_shader_buffer(&csctx->ssbos[i].current, NULL); + } +} + +static void +lp_csctx_set_cs_images(struct lp_cs_context *csctx, + unsigned num, + struct pipe_image_view *images) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) images); + + assert(num <= ARRAY_SIZE(csctx->images)); + + for (i = 0; i < num; ++i) { + struct pipe_image_view *image = &images[i]; + util_copy_image_view(&csctx->images[i].current, &images[i]); + + struct pipe_resource *res = image->resource; + struct llvmpipe_resource *lp_res = llvmpipe_resource(res); + struct lp_jit_image *jit_image; + + jit_image = &csctx->cs.current.jit_context.images[i]; + if (!lp_res) + continue; + if (!lp_res->dt) { + /* regular texture - csctx array of mipmap level offsets */ + if (llvmpipe_resource_is_texture(res)) { + jit_image->base = lp_res->tex_data; + } else + jit_image->base = lp_res->data; + + jit_image->width = res->width0; + jit_image->height = res->height0; + jit_image->depth = res->depth0; + + if (llvmpipe_resource_is_texture(res)) { + uint32_t mip_offset = lp_res->mip_offsets[image->u.tex.level]; + + jit_image->width = u_minify(jit_image->width, image->u.tex.level); + jit_image->height = u_minify(jit_image->height, image->u.tex.level); + + if (res->target == PIPE_TEXTURE_1D_ARRAY || + res->target == PIPE_TEXTURE_2D_ARRAY || + res->target == PIPE_TEXTURE_3D || + res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY) { + /* + * For array textures, we don't have first_layer, instead + * adjust last_layer (stored as depth) plus the mip level offsets + * (as we have mip-first layout can't just adjust base ptr). + * XXX For mip levels, could do something similar. + */ + jit_image->depth = image->u.tex.last_layer - image->u.tex.first_layer + 1; + mip_offset += image->u.tex.first_layer * lp_res->img_stride[image->u.tex.level]; + } else + jit_image->depth = u_minify(jit_image->depth, image->u.tex.level); + + jit_image->row_stride = lp_res->row_stride[image->u.tex.level]; + jit_image->img_stride = lp_res->img_stride[image->u.tex.level]; + jit_image->base = (uint8_t *)jit_image->base + mip_offset; + } else { + unsigned view_blocksize = util_format_get_blocksize(image->format); + jit_image->width = image->u.buf.size / view_blocksize; + jit_image->base = (uint8_t *)jit_image->base + image->u.buf.offset; + } + } + } + for (; i < ARRAY_SIZE(csctx->images); i++) { + util_copy_image_view(&csctx->images[i].current, NULL); + } +} + +static void +update_csctx_consts(struct llvmpipe_context *llvmpipe) +{ + struct lp_cs_context *csctx = llvmpipe->csctx; + int i; + + for (i = 0; i < ARRAY_SIZE(csctx->constants); ++i) { + struct pipe_resource *buffer = csctx->constants[i].current.buffer; + const ubyte *current_data = NULL; + + if (buffer) { + /* resource buffer */ + current_data = (ubyte *) llvmpipe_resource_data(buffer); + } + else if (csctx->constants[i].current.user_buffer) { + /* user-space buffer */ + current_data = (ubyte *) csctx->constants[i].current.user_buffer; + } + + if (current_data) { + current_data += csctx->constants[i].current.buffer_offset; + + csctx->cs.current.jit_context.constants[i] = (const float *)current_data; + csctx->cs.current.jit_context.num_constants[i] = csctx->constants[i].current.buffer_size; + } else { + csctx->cs.current.jit_context.constants[i] = NULL; + csctx->cs.current.jit_context.num_constants[i] = 0; + } + } +} + +static void +update_csctx_ssbo(struct llvmpipe_context *llvmpipe) +{ + struct lp_cs_context *csctx = llvmpipe->csctx; + int i; + for (i = 0; i < ARRAY_SIZE(csctx->ssbos); ++i) { + struct pipe_resource *buffer = csctx->ssbos[i].current.buffer; + const ubyte *current_data = NULL; + + if (!buffer) + continue; + /* resource buffer */ + current_data = (ubyte *) llvmpipe_resource_data(buffer); + if (current_data) { + current_data += csctx->ssbos[i].current.buffer_offset; + + csctx->cs.current.jit_context.ssbos[i] = (const uint32_t *)current_data; + csctx->cs.current.jit_context.num_ssbos[i] = csctx->ssbos[i].current.buffer_size; + } else { + csctx->cs.current.jit_context.ssbos[i] = NULL; + csctx->cs.current.jit_context.num_ssbos[i] = 0; + } + } +} + +static void +llvmpipe_cs_update_derived(struct llvmpipe_context *llvmpipe, void *input) +{ + if (llvmpipe->cs_dirty & (LP_CSNEW_CS)) + llvmpipe_update_cs(llvmpipe); + + if (llvmpipe->cs_dirty & LP_CSNEW_CONSTANTS) { + lp_csctx_set_cs_constants(llvmpipe->csctx, + ARRAY_SIZE(llvmpipe->constants[PIPE_SHADER_COMPUTE]), + llvmpipe->constants[PIPE_SHADER_COMPUTE]); + update_csctx_consts(llvmpipe); + } + + if (llvmpipe->cs_dirty & LP_CSNEW_SSBOS) { + lp_csctx_set_cs_ssbos(llvmpipe->csctx, + ARRAY_SIZE(llvmpipe->ssbos[PIPE_SHADER_COMPUTE]), + llvmpipe->ssbos[PIPE_SHADER_COMPUTE]); + update_csctx_ssbo(llvmpipe); + } + + if (llvmpipe->cs_dirty & LP_CSNEW_SAMPLER_VIEW) + lp_csctx_set_sampler_views(llvmpipe->csctx, + llvmpipe->num_sampler_views[PIPE_SHADER_COMPUTE], + llvmpipe->sampler_views[PIPE_SHADER_COMPUTE]); + + if (llvmpipe->cs_dirty & LP_CSNEW_SAMPLER) + lp_csctx_set_sampler_state(llvmpipe->csctx, + llvmpipe->num_samplers[PIPE_SHADER_COMPUTE], + llvmpipe->samplers[PIPE_SHADER_COMPUTE]); + + if (llvmpipe->cs_dirty & LP_CSNEW_IMAGES) + lp_csctx_set_cs_images(llvmpipe->csctx, + ARRAY_SIZE(llvmpipe->images[PIPE_SHADER_COMPUTE]), + llvmpipe->images[PIPE_SHADER_COMPUTE]); + + if (input) { + struct lp_cs_context *csctx = llvmpipe->csctx; + csctx->input = input; + csctx->cs.current.jit_context.kernel_args = input; + } + + llvmpipe->cs_dirty = 0; +} + +static void +cs_exec_fn(void *init_data, int iter_idx, struct lp_cs_local_mem *lmem) +{ + struct lp_cs_job_info *job_info = init_data; + struct lp_jit_cs_thread_data thread_data; + + memset(&thread_data, 0, sizeof(thread_data)); + + if (lmem->local_size < job_info->req_local_mem) { + lmem->local_mem_ptr = REALLOC(lmem->local_mem_ptr, lmem->local_size, + job_info->req_local_mem); + lmem->local_size = job_info->req_local_mem; + } + thread_data.shared = lmem->local_mem_ptr; + + unsigned grid_z = iter_idx / (job_info->grid_size[0] * job_info->grid_size[1]); + unsigned grid_y = (iter_idx - (grid_z * (job_info->grid_size[0] * job_info->grid_size[1]))) / job_info->grid_size[0]; + unsigned grid_x = (iter_idx - (grid_z * (job_info->grid_size[0] * job_info->grid_size[1])) - (grid_y * job_info->grid_size[0])); + struct lp_compute_shader_variant *variant = job_info->current->variant; + variant->jit_function(&job_info->current->jit_context, + job_info->block_size[0], job_info->block_size[1], job_info->block_size[2], + grid_x, grid_y, grid_z, + job_info->grid_size[0], job_info->grid_size[1], job_info->grid_size[2], job_info->work_dim, + &thread_data); +} + +static void +fill_grid_size(struct pipe_context *pipe, + const struct pipe_grid_info *info, + uint32_t grid_size[3]) +{ + struct pipe_transfer *transfer; + uint32_t *params; + if (!info->indirect) { + grid_size[0] = info->grid[0]; + grid_size[1] = info->grid[1]; + grid_size[2] = info->grid[2]; + return; + } + params = pipe_buffer_map_range(pipe, info->indirect, + info->indirect_offset, + 3 * sizeof(uint32_t), + PIPE_TRANSFER_READ, + &transfer); + + if (!transfer) + return; + + grid_size[0] = params[0]; + grid_size[1] = params[1]; + grid_size[2] = params[2]; + pipe_buffer_unmap(pipe, transfer); +} + +static void llvmpipe_launch_grid(struct pipe_context *pipe, + const struct pipe_grid_info *info) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen); + struct lp_cs_job_info job_info; + + memset(&job_info, 0, sizeof(job_info)); + + llvmpipe_cs_update_derived(llvmpipe, info->input); + + fill_grid_size(pipe, info, job_info.grid_size); + + job_info.block_size[0] = info->block[0]; + job_info.block_size[1] = info->block[1]; + job_info.block_size[2] = info->block[2]; + job_info.work_dim = info->work_dim; + job_info.req_local_mem = llvmpipe->cs->req_local_mem; + job_info.current = &llvmpipe->csctx->cs.current; + + int num_tasks = job_info.grid_size[2] * job_info.grid_size[1] * job_info.grid_size[0]; + if (num_tasks) { + struct lp_cs_tpool_task *task; + mtx_lock(&screen->cs_mutex); + task = lp_cs_tpool_queue_task(screen->cs_tpool, cs_exec_fn, &job_info, num_tasks); + + lp_cs_tpool_wait_for_task(screen->cs_tpool, &task); + mtx_unlock(&screen->cs_mutex); + } + llvmpipe->pipeline_statistics.cs_invocations += num_tasks * info->block[0] * info->block[1] * info->block[2]; +} + +static void +llvmpipe_set_compute_resources(struct pipe_context *pipe, + unsigned start, unsigned count, + struct pipe_surface **resources) +{ + + +} + +static void +llvmpipe_set_global_binding(struct pipe_context *pipe, + unsigned first, unsigned count, + struct pipe_resource **resources, + uint32_t **handles) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct lp_compute_shader *cs = llvmpipe->cs; + unsigned i; + + if (first + count > cs->max_global_buffers) { + unsigned old_max = cs->max_global_buffers; + cs->max_global_buffers = first + count; + cs->global_buffers = realloc(cs->global_buffers, + cs->max_global_buffers * sizeof(cs->global_buffers[0])); + if (!cs->global_buffers) { + return; + } + + memset(&cs->global_buffers[old_max], 0, (cs->max_global_buffers - old_max) * sizeof(cs->global_buffers[0])); + } + + if (!resources) { + for (i = 0; i < count; i++) + pipe_resource_reference(&cs->global_buffers[first + i], NULL); + return; + } + + for (i = 0; i < count; i++) { + uintptr_t va; + uint32_t offset; + pipe_resource_reference(&cs->global_buffers[first + i], resources[i]); + struct llvmpipe_resource *lp_res = llvmpipe_resource(resources[i]); + offset = *handles[i]; + va = (uintptr_t)((char *)lp_res->data + offset); + memcpy(handles[i], &va, sizeof(va)); + } +} + +void +llvmpipe_init_compute_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_compute_state = llvmpipe_create_compute_state; + llvmpipe->pipe.bind_compute_state = llvmpipe_bind_compute_state; + llvmpipe->pipe.delete_compute_state = llvmpipe_delete_compute_state; + llvmpipe->pipe.set_compute_resources = llvmpipe_set_compute_resources; + llvmpipe->pipe.set_global_binding = llvmpipe_set_global_binding; + llvmpipe->pipe.launch_grid = llvmpipe_launch_grid; +} + +void +lp_csctx_destroy(struct lp_cs_context *csctx) +{ + unsigned i; + for (i = 0; i < ARRAY_SIZE(csctx->cs.current_tex); i++) { + pipe_resource_reference(&csctx->cs.current_tex[i], NULL); + } + for (i = 0; i < ARRAY_SIZE(csctx->constants); i++) { + pipe_resource_reference(&csctx->constants[i].current.buffer, NULL); + } + for (i = 0; i < ARRAY_SIZE(csctx->ssbos); i++) { + pipe_resource_reference(&csctx->ssbos[i].current.buffer, NULL); + } + FREE(csctx); +} + +struct lp_cs_context *lp_csctx_create(struct pipe_context *pipe) +{ + struct lp_cs_context *csctx; + + csctx = CALLOC_STRUCT(lp_cs_context); + if (!csctx) + return NULL; + + csctx->pipe = pipe; + return csctx; +} diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_cs.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_cs.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_cs.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_cs.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,133 @@ +/************************************************************************** + * + * Copyright 2019 Red Hat. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_STATE_CS_H +#define LP_STATE_CS_H + +#include "os/os_thread.h" +#include "util/u_thread.h" +#include "pipe/p_state.h" + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */ +#include "lp_jit.h" +#include "lp_state_fs.h" + +struct lp_compute_shader_variant; + +struct lp_compute_shader_variant_key +{ + unsigned nr_samplers:8; + unsigned nr_sampler_views:8; + unsigned nr_images:8; + struct lp_image_static_state image_state[PIPE_MAX_SHADER_IMAGES]; + struct lp_sampler_static_state state[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + +struct lp_cs_variant_list_item +{ + struct lp_compute_shader_variant *base; + struct lp_cs_variant_list_item *next, *prev; +}; + +struct lp_compute_shader_variant +{ + struct lp_compute_shader_variant_key key; + + struct gallivm_state *gallivm; + + LLVMTypeRef jit_cs_context_ptr_type; + LLVMTypeRef jit_cs_thread_data_ptr_type; + + LLVMValueRef function; + lp_jit_cs_func jit_function; + + /* Total number of LLVM instructions generated */ + unsigned nr_instrs; + + struct lp_cs_variant_list_item list_item_global, list_item_local; + + struct lp_compute_shader *shader; + + /* For debugging/profiling purposes */ + unsigned no; +}; + +struct lp_compute_shader { + struct pipe_shader_state base; + + struct lp_cs_variant_list_item variants; + + struct lp_tgsi_info info; + + uint32_t req_local_mem; + + /* For debugging/profiling purposes */ + unsigned variant_key_size; + unsigned no; + unsigned variants_created; + unsigned variants_cached; + + int max_global_buffers; + struct pipe_resource **global_buffers; +}; + +struct lp_cs_exec { + struct lp_jit_cs_context jit_context; + struct lp_compute_shader_variant *variant; +}; + +struct lp_cs_context { + struct pipe_context *pipe; + + struct { + struct lp_cs_exec current; + struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + unsigned current_tex_num; + } cs; + + /** compute shader constants */ + struct { + struct pipe_constant_buffer current; + unsigned stored_size; + const void *stored_data; + } constants[LP_MAX_TGSI_CONST_BUFFERS]; + + /** compute shader buffers */ + struct { + struct pipe_shader_buffer current; + } ssbos[LP_MAX_TGSI_SHADER_BUFFERS]; + + struct { + struct pipe_image_view current; + } images[LP_MAX_TGSI_SHADER_IMAGES]; + + void *input; +}; + +struct lp_cs_context *lp_csctx_create(struct pipe_context *pipe); +void lp_csctx_destroy(struct lp_cs_context *csctx); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_derived.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_derived.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_derived.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_derived.c 2020-06-12 01:21:17.000000000 +0000 @@ -260,6 +260,11 @@ ARRAY_SIZE(llvmpipe->ssbos[PIPE_SHADER_FRAGMENT]), llvmpipe->ssbos[PIPE_SHADER_FRAGMENT]); + if (llvmpipe->dirty & LP_NEW_FS_IMAGES) + lp_setup_set_fs_images(llvmpipe->setup, + ARRAY_SIZE(llvmpipe->images[PIPE_SHADER_FRAGMENT]), + llvmpipe->images[PIPE_SHADER_FRAGMENT]); + if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW)) lp_setup_set_fragment_sampler_views(llvmpipe->setup, llvmpipe->num_sampler_views[PIPE_SHADER_FRAGMENT], diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_fs.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_fs.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_fs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_fs.c 2020-06-12 01:21:17.000000000 +0000 @@ -62,7 +62,7 @@ #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_pointer.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_dump.h" #include "util/u_string.h" #include "util/simple_list.h" @@ -80,6 +80,7 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_logic.h" #include "gallivm/lp_bld_tgsi.h" +#include "gallivm/lp_bld_nir.h" #include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" @@ -102,7 +103,7 @@ #include "lp_flush.h" #include "lp_state_fs.h" #include "lp_rast.h" - +#include "nir/nir_to_tgsi_info.h" /** Fragment shader number (for debugging) */ static unsigned fs_no = 0; @@ -300,6 +301,7 @@ LLVMValueRef num_loop, struct lp_build_interp_soa_context *interp, const struct lp_build_sampler_soa *sampler, + const struct lp_build_image_soa *image, LLVMValueRef mask_store, LLVMValueRef (*out_color)[4], LLVMValueRef depth_ptr, @@ -340,14 +342,22 @@ memset(&system_values, 0, sizeof(system_values)); + /* truncate then sign extend. */ + system_values.front_facing = LLVMBuildTrunc(gallivm->builder, facing, LLVMInt1TypeInContext(gallivm->context), ""); + system_values.front_facing = LLVMBuildSExt(gallivm->builder, system_values.front_facing, LLVMInt32TypeInContext(gallivm->context), ""); + if (key->depth.enabled || key->stencil[0].enabled) { zs_format_desc = util_format_description(key->zsbuf_format); assert(zs_format_desc); - if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) { - if (key->alpha.enabled || + if (shader->info.base.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) + depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE; + else if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) { + if (shader->info.base.writes_memory) + depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE; + else if (key->alpha.enabled || key->blend.alpha_to_coverage || shader->info.base.uses_kill || shader->info.base.writes_samplemask) { @@ -493,10 +503,15 @@ params.info = &shader->info.base; params.ssbo_ptr = ssbo_ptr; params.ssbo_sizes_ptr = num_ssbo_ptr; + params.image = image; /* Build the actual shader */ - lp_build_tgsi_soa(gallivm, tokens, ¶ms, - outputs); + if (shader->base.type == PIPE_SHADER_IR_TGSI) + lp_build_tgsi_soa(gallivm, tokens, ¶ms, + outputs); + else + lp_build_nir_soa(gallivm, shader->base.ir.nir, ¶ms, + outputs); /* Alpha test */ if (key->alpha.enabled) { @@ -1393,7 +1408,7 @@ for (j = 0; j < src_fmt->nr_channels; ++j) { unsigned mask = 0; unsigned sa = src_fmt->channel[j].shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned from_lsb = j; #else unsigned from_lsb = src_fmt->nr_channels - j - 1; @@ -1575,7 +1590,7 @@ for (j = 0; j < src_fmt->nr_channels; ++j) { unsigned mask = 0; unsigned sa = src_fmt->channel[j].shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned from_lsb = j; #else unsigned from_lsb = src_fmt->nr_channels - j - 1; @@ -2420,7 +2435,7 @@ unsigned partial_mask) { struct gallivm_state *gallivm = variant->gallivm; - const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_fragment_shader_variant_key *key = &variant->key; struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; char func_name[64]; struct lp_type fs_type; @@ -2446,6 +2461,7 @@ LLVMBasicBlockRef block; LLVMBuilderRef builder; struct lp_build_sampler_soa *sampler; + struct lp_build_image_soa *image; struct lp_build_interp_soa_context interp; LLVMValueRef fs_mask[16 / 4]; LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4]; @@ -2591,7 +2607,8 @@ } /* code generated texture sampling */ - sampler = lp_llvm_sampler_soa_create(key->state); + sampler = lp_llvm_sampler_soa_create(key->samplers); + image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key)); num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ /* for 1d resources only run "upper half" of stamp */ @@ -2646,6 +2663,7 @@ num_loop, &interp, sampler, + image, mask_store, /* output */ color_store, depth_ptr, @@ -2680,7 +2698,7 @@ } sampler->destroy(sampler); - + image->destroy(image); /* Loop over color outputs / color buffers to do blending. */ for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { @@ -2720,7 +2738,7 @@ static void -dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) +dump_fs_variant_key(struct lp_fragment_shader_variant_key *key) { unsigned i; @@ -2775,7 +2793,7 @@ debug_printf("blend.alpha_to_coverage is enabled\n"); } for (i = 0; i < key->nr_samplers; ++i) { - const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state; + const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state; debug_printf("sampler[%u] = \n", i); debug_printf(" .wrap = %s %s %s\n", util_str_tex_wrap(sampler->wrap_s, TRUE), @@ -2796,7 +2814,7 @@ debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod); } for (i = 0; i < key->nr_sampler_views; ++i) { - const struct lp_static_texture_state *texture = &key->state[i].texture_state; + const struct lp_static_texture_state *texture = &key->samplers[i].texture_state; debug_printf("texture[%u] = \n", i); debug_printf(" .format = %s\n", util_format_name(texture->format)); @@ -2809,15 +2827,33 @@ texture->pot_height, texture->pot_depth); } + struct lp_image_static_state *images = lp_fs_variant_key_images(key); + for (i = 0; i < key->nr_images; ++i) { + const struct lp_static_texture_state *image = &images[i].image_state; + debug_printf("image[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(image->format)); + debug_printf(" .target = %s\n", + util_str_tex_target(image->target, TRUE)); + debug_printf(" .level_zero_only = %u\n", + image->level_zero_only); + debug_printf(" .pot = %u %u %u\n", + image->pot_width, + image->pot_height, + image->pot_depth); + } } void -lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) +lp_debug_fs_variant(struct lp_fragment_shader_variant *variant) { - debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", + debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", variant->shader->no, variant->no); - tgsi_dump(variant->shader->base.tokens, 0); + if (variant->shader->base.type == PIPE_SHADER_IR_TGSI) + tgsi_dump(variant->shader->base.tokens, 0); + else + nir_print_shader(variant->shader->base.ir.nir, stderr); dump_fs_variant_key(&variant->key); debug_printf("variant->opaque = %u\n", variant->opaque); debug_printf("\n"); @@ -2838,10 +2874,11 @@ boolean fullcolormask; char module_name[64]; - variant = CALLOC_STRUCT(lp_fragment_shader_variant); + variant = MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key); if (!variant) return NULL; + memset(variant, 0, sizeof(*variant)); snprintf(module_name, sizeof(module_name), "fs%u_variant%u", shader->no, shader->variants_created); @@ -2931,6 +2968,7 @@ struct lp_fragment_shader *shader; int nr_samplers; int nr_sampler_views; + int nr_images; int i; shader = CALLOC_STRUCT(lp_fragment_shader); @@ -2940,11 +2978,17 @@ shader->no = fs_no++; make_empty_list(&shader->variants); - /* get/save the summary info for this shader */ - lp_build_tgsi_info(templ->tokens, &shader->info); + shader->base.type = templ->type; + if (templ->type == PIPE_SHADER_IR_TGSI) { + /* get/save the summary info for this shader */ + lp_build_tgsi_info(templ->tokens, &shader->info); - /* we need to keep a local copy of the tokens */ - shader->base.tokens = tgsi_dup_tokens(templ->tokens); + /* we need to keep a local copy of the tokens */ + shader->base.tokens = tgsi_dup_tokens(templ->tokens); + } else { + shader->base.ir.nir = templ->ir.nir; + nir_tgsi_scan_shader(templ->ir.nir, &shader->info.base, true); + } shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ); if (shader->draw_data == NULL) { @@ -2955,9 +2999,8 @@ nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; - - shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, - state[MAX2(nr_samplers, nr_sampler_views)]); + nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1; + shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers, nr_sampler_views), nr_images); for (i = 0; i < shader->info.base.num_inputs; i++) { shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; @@ -3095,6 +3138,8 @@ /* Delete draw module's data */ draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data); + if (shader->base.ir.nir) + ralloc_free(shader->base.ir.nir); assert(shader->variants_cached == 0); FREE((void *) shader->base.tokens); FREE(shader); @@ -3145,9 +3190,10 @@ draw_set_mapped_constant_buffer(llvmpipe->draw, shader, index, data, size); } - else { + else if (shader == PIPE_SHADER_COMPUTE) + llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS; + else llvmpipe->dirty |= LP_NEW_FS_CONSTANTS; - } if (cb && cb->user_buffer) { pipe_resource_reference(&constants, NULL); @@ -3177,12 +3223,42 @@ data += buffer->buffer_offset; draw_set_mapped_shader_buffer(llvmpipe->draw, shader, i, data, size); + } else if (shader == PIPE_SHADER_COMPUTE) { + llvmpipe->cs_dirty |= LP_CSNEW_SSBOS; } else if (shader == PIPE_SHADER_FRAGMENT) { llvmpipe->dirty |= LP_NEW_FS_SSBOS; } } } +static void +llvmpipe_set_shader_images(struct pipe_context *pipe, + enum pipe_shader_type shader, unsigned start_slot, + unsigned count, const struct pipe_image_view *images) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + unsigned i, idx; + + draw_flush(llvmpipe->draw); + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) { + const struct pipe_image_view *image = images ? &images[idx] : NULL; + + util_copy_image_view(&llvmpipe->images[shader][i], image); + } + + llvmpipe->num_images[shader] = start_slot + count; + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_GEOMETRY) { + draw_set_images(llvmpipe->draw, + shader, + llvmpipe->images[shader], + start_slot + count); + } else if (shader == PIPE_SHADER_COMPUTE) + llvmpipe->cs_dirty |= LP_CSNEW_IMAGES; + else + llvmpipe->dirty |= LP_NEW_FS_IMAGES; +} + /** * Return the blend factor equivalent to a destination alpha of one. */ @@ -3212,14 +3288,17 @@ * TODO: there is actually no reason to tie this to context state -- the * generated code could be cached globally in the screen. */ -static void +static struct lp_fragment_shader_variant_key * make_variant_key(struct llvmpipe_context *lp, struct lp_fragment_shader *shader, - struct lp_fragment_shader_variant_key *key) + char *store) { unsigned i; + struct lp_fragment_shader_variant_key *key; + + key = (struct lp_fragment_shader_variant_key *)store; - memset(key, 0, shader->variant_key_size); + memset(key, 0, offsetof(struct lp_fragment_shader_variant_key, samplers[1])); if (lp->framebuffer.zsbuf) { enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format; @@ -3271,7 +3350,7 @@ /* alpha.ref_value is passed in jit_context */ key->flatshade = lp->rasterizer->flatshade; - if (lp->active_occlusion_queries) { + if (lp->active_occlusion_queries && !lp->queries_disabled) { key->occlusion_count = TRUE; } @@ -3365,9 +3444,15 @@ */ key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + struct lp_sampler_static_state *fs_sampler; + + fs_sampler = key->samplers; + + memset(fs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler); + for(i = 0; i < key->nr_samplers; ++i) { if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - lp_sampler_static_sampler_state(&key->state[i].sampler_state, + lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state, lp->samplers[PIPE_SHADER_FRAGMENT][i]); } } @@ -3386,7 +3471,7 @@ * used views may be included in the shader key. */ if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) { - lp_sampler_static_texture_state(&key->state[i].texture_state, + lp_sampler_static_texture_state(&fs_sampler[i].texture_state, lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); } } @@ -3395,11 +3480,22 @@ key->nr_sampler_views = key->nr_samplers; for(i = 0; i < key->nr_sampler_views; ++i) { if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - lp_sampler_static_texture_state(&key->state[i].texture_state, + lp_sampler_static_texture_state(&fs_sampler[i].texture_state, lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); } } } + + struct lp_image_static_state *lp_image; + lp_image = lp_fs_variant_key_images(key); + key->nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1; + for (i = 0; i < key->nr_images; ++i) { + if (shader->info.base.file_mask[TGSI_FILE_IMAGE] & (1 << i)) { + lp_sampler_static_texture_state_image(&lp_image[i].image_state, + &lp->images[PIPE_SHADER_FRAGMENT][i]); + } + } + return key; } @@ -3412,16 +3508,17 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) { struct lp_fragment_shader *shader = lp->fs; - struct lp_fragment_shader_variant_key key; + struct lp_fragment_shader_variant_key *key; struct lp_fragment_shader_variant *variant = NULL; struct lp_fs_variant_list_item *li; + char store[LP_FS_MAX_VARIANT_KEY_SIZE]; - make_variant_key(lp, shader, &key); + key = make_variant_key(lp, shader, store); /* Search the variants for one which matches the key */ li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { - if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) { + if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) { variant = li->base; break; } @@ -3493,7 +3590,7 @@ * Generate the new variant. */ t0 = os_time_get(); - variant = generate_variant(lp, shader, &key); + variant = generate_variant(lp, shader, key); t1 = os_time_get(); dt = t1 - t0; LP_COUNT_ADD(llvm_compile_time, dt); @@ -3527,6 +3624,7 @@ llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer; llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers; + llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_fs.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_fs.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_fs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_fs.h 2020-06-12 01:21:17.000000000 +0000 @@ -59,6 +59,11 @@ }; +struct lp_image_static_state +{ + struct lp_static_texture_state image_state; +}; + struct lp_fragment_shader_variant_key { struct pipe_depth_state depth; @@ -73,6 +78,7 @@ unsigned nr_cbufs:8; unsigned nr_samplers:8; /* actually derivable from just the shader */ unsigned nr_sampler_views:8; /* actually derivable from just the shader */ + unsigned nr_images:8; /* actually derivable from just the shader */ unsigned flatshade:1; unsigned occlusion_count:1; unsigned resource_1d:1; @@ -81,9 +87,30 @@ enum pipe_format zsbuf_format; enum pipe_format cbuf_format[PIPE_MAX_COLOR_BUFS]; - struct lp_sampler_static_state state[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct lp_sampler_static_state samplers[1]; + /* followed by variable number of images */ }; +#define LP_FS_MAX_VARIANT_KEY_SIZE \ + (sizeof(struct lp_fragment_shader_variant_key) + \ + PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct lp_sampler_static_state) +\ + PIPE_MAX_SHADER_IMAGES * sizeof(struct lp_image_static_state)) + +static inline size_t +lp_fs_variant_key_size(unsigned nr_samplers, unsigned nr_images) +{ + unsigned samplers = nr_samplers > 1 ? (nr_samplers - 1) : 0; + return (sizeof(struct lp_fragment_shader_variant_key) + + samplers * sizeof(struct lp_sampler_static_state) + + nr_images * sizeof(struct lp_image_static_state)); +} + +static inline struct lp_image_static_state * +lp_fs_variant_key_images(struct lp_fragment_shader_variant_key *key) +{ + return (struct lp_image_static_state *) + &key->samplers[key->nr_samplers]; +} /** doubly-linked list item */ struct lp_fs_variant_list_item @@ -95,7 +122,6 @@ struct lp_fragment_shader_variant { - struct lp_fragment_shader_variant_key key; boolean opaque; @@ -117,6 +143,9 @@ /* For debugging/profiling purposes */ unsigned no; + + /* key is variable-sized, must be last */ + struct lp_fragment_shader_variant_key key; }; @@ -143,6 +172,6 @@ void -lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant); +lp_debug_fs_variant(struct lp_fragment_shader_variant *variant); #endif /* LP_STATE_FS_H_ */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_gs.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_gs.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_gs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_gs.c 2020-06-12 01:21:17.000000000 +0000 @@ -60,7 +60,7 @@ state->no_tokens = !templ->tokens; memcpy(&state->stream_output, &templ->stream_output, sizeof state->stream_output); - if (templ->tokens) { + if (templ->tokens || templ->type == PIPE_SHADER_IR_NIR) { state->dgs = draw_create_geometry_shader(llvmpipe->draw, templ); if (state->dgs == NULL) { goto no_dgs; diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -57,8 +57,14 @@ #define LP_NEW_SO 0x20000 #define LP_NEW_SO_BUFFERS 0x40000 #define LP_NEW_FS_SSBOS 0x80000 +#define LP_NEW_FS_IMAGES 0x100000 - +#define LP_CSNEW_CS 0x1 +#define LP_CSNEW_CONSTANTS 0x2 +#define LP_CSNEW_SAMPLER 0x4 +#define LP_CSNEW_SAMPLER_VIEW 0x8 +#define LP_CSNEW_SSBOS 0x10 +#define LP_CSNEW_IMAGES 0x20 struct vertex_info; struct pipe_context; @@ -110,6 +116,9 @@ llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe); void +llvmpipe_init_compute_funcs(struct llvmpipe_context *llvmpipe); + +void llvmpipe_init_clip_funcs(struct llvmpipe_context *llvmpipe); void @@ -137,4 +146,13 @@ unsigned num, struct pipe_sampler_view **views); +void +llvmpipe_prepare_vertex_images(struct llvmpipe_context *lp, + unsigned num, + struct pipe_image_view *views); + +void +llvmpipe_prepare_geometry_images(struct llvmpipe_context *lp, + unsigned num, + struct pipe_image_view *views); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_sampler.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_sampler.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_sampler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_sampler.c 2020-06-12 01:21:17.000000000 +0000 @@ -98,7 +98,9 @@ llvmpipe->samplers[shader], llvmpipe->num_samplers[shader]); } - else { + else if (shader == PIPE_SHADER_COMPUTE) { + llvmpipe->cs_dirty |= LP_CSNEW_SAMPLER; + } else { llvmpipe->dirty |= LP_NEW_SAMPLER; } } @@ -150,7 +152,9 @@ llvmpipe->sampler_views[shader], llvmpipe->num_sampler_views[shader]); } - else { + else if (shader == PIPE_SHADER_COMPUTE) { + llvmpipe->cs_dirty |= LP_CSNEW_SAMPLER_VIEW; + } else { llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; } } @@ -356,6 +360,117 @@ prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY); } +static void +prepare_shader_images( + struct llvmpipe_context *lp, + unsigned num, + struct pipe_image_view *views, + enum pipe_shader_type shader_type) +{ + + unsigned i; + uint32_t row_stride; + uint32_t img_stride; + const void *addr; + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + if (!num) + return; + + for (i = 0; i < num; i++) { + struct pipe_image_view *view = i < num ? &views[i] : NULL; + + if (view) { + struct pipe_resource *img = view->resource; + struct llvmpipe_resource *lp_img = llvmpipe_resource(img); + if (!img) + continue; + + unsigned width = u_minify(img->width0, view->u.tex.level); + unsigned height = u_minify(img->height0, view->u.tex.level); + unsigned num_layers = img->depth0; + + if (!lp_img->dt) { + /* regular texture - setup array of mipmap level offsets */ + struct pipe_resource *res = view->resource; + + if (llvmpipe_resource_is_texture(res)) { + uint32_t mip_offset = lp_img->mip_offsets[view->u.tex.level]; + addr = lp_img->tex_data; + + if (img->target == PIPE_TEXTURE_1D_ARRAY || + img->target == PIPE_TEXTURE_2D_ARRAY || + img->target == PIPE_TEXTURE_3D || + img->target == PIPE_TEXTURE_CUBE || + img->target == PIPE_TEXTURE_CUBE_ARRAY) { + num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1; + assert(view->u.tex.first_layer <= view->u.tex.last_layer); + mip_offset += view->u.tex.first_layer * lp_img->img_stride[view->u.tex.level]; + } + + row_stride = lp_img->row_stride[view->u.tex.level]; + img_stride = lp_img->img_stride[view->u.tex.level]; + addr = (uint8_t *)addr + mip_offset; + } + else { + unsigned view_blocksize = util_format_get_blocksize(view->format); + addr = lp_img->data; + /* probably don't really need to fill that out */ + row_stride = 0; + img_stride = 0; + + /* everything specified in number of elements here. */ + width = view->u.buf.size / view_blocksize; + addr = (uint8_t *)addr + view->u.buf.offset; + assert(view->u.buf.offset + view->u.buf.size <= res->width0); + } + } + else { + /* display target texture/surface */ + /* + * XXX: Where should this be unmapped? + */ + struct llvmpipe_screen *screen = llvmpipe_screen(img->screen); + struct sw_winsys *winsys = screen->winsys; + addr = winsys->displaytarget_map(winsys, lp_img->dt, + PIPE_TRANSFER_READ); + row_stride = lp_img->row_stride[0]; + img_stride = lp_img->img_stride[0]; + assert(addr); + } + draw_set_mapped_image(lp->draw, + shader_type, + i, + width, height, num_layers, + addr, + row_stride, img_stride); + } + } +} + + +/** + * Called whenever we're about to draw (no dirty flag, FIXME?). + */ +void +llvmpipe_prepare_vertex_images(struct llvmpipe_context *lp, + unsigned num, + struct pipe_image_view *views) +{ + prepare_shader_images(lp, num, views, PIPE_SHADER_VERTEX); +} + + +/** + * Called whenever we're about to draw (no dirty flag, FIXME?). + */ +void +llvmpipe_prepare_geometry_images(struct llvmpipe_context *lp, + unsigned num, + struct pipe_image_view *views) +{ + prepare_shader_images(lp, num, views, PIPE_SHADER_GEOMETRY); +} void llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe) diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_surface.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_surface.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_state_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_state_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -39,7 +39,7 @@ #include "draw/draw_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /** diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_test_format.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_test_format.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_test_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_test_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,9 +33,9 @@ #include "util/u_memory.h" #include "util/u_pointer.h" #include "util/u_string.h" -#include "util/u_format.h" -#include "util/u_format_tests.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_tests.h" +#include "util/format/u_format_s3tc.h" #include "gallivm/lp_bld.h" #include "gallivm/lp_bld_debug.h" @@ -392,7 +392,8 @@ /* missing fetch funcs */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC || - format_desc->layout == UTIL_FORMAT_LAYOUT_ATC) { + format_desc->layout == UTIL_FORMAT_LAYOUT_ATC || + format_desc->layout == UTIL_FORMAT_LAYOUT_FXT1) { continue; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_test.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_test.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_test.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_test.h 2020-06-12 01:21:17.000000000 +0000 @@ -44,7 +44,7 @@ #include "gallivm/lp_bld.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_dump.h" diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_tex_sample.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_tex_sample.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_tex_sample.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_tex_sample.c 2020-06-12 01:21:17.000000000 +0000 @@ -78,6 +78,23 @@ struct llvmpipe_sampler_dynamic_state dynamic_state; }; +struct llvmpipe_image_dynamic_state +{ + struct lp_sampler_dynamic_state base; + + const struct lp_image_static_state *static_state; +}; + +/** + * This is the bridge between our sampler and the TGSI translator. + */ +struct lp_llvm_image_soa +{ + struct lp_build_image_soa base; + + struct llvmpipe_image_dynamic_state dynamic_state; +}; + /** * Fetch the specified member of the lp_jit_texture structure. @@ -221,6 +238,80 @@ LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE) +/** + * Fetch the specified member of the lp_jit_image structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +lp_llvm_image_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned image_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(image_unit < PIPE_MAX_SHADER_IMAGES); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].images */ + indices[1] = lp_build_const_int32(gallivm, LP_JIT_CTX_IMAGES); + /* context[0].images[unit] */ + indices[2] = lp_build_const_int32(gallivm, image_unit); + /* context[0].images[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, ARRAY_SIZE(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.image%u.%s", image_unit, member_name); + + return res; +} + + +/** + * Helper macro to instantiate the functions that generate the code to + * fetch the members of lp_jit_image to fulfill the sampler code + * generator requests. + * + * This complexity is the price we have to pay to keep the image + * sampler code generator a reusable module without dependencies to + * llvmpipe internals. + */ +#define LP_LLVM_IMAGE_MEMBER(_name, _index, _emit_load) \ + static LLVMValueRef \ + lp_llvm_image_##_name( const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned image_unit) \ + { \ + return lp_llvm_image_member(base, gallivm, context_ptr, \ + image_unit, _index, #_name, _emit_load ); \ + } + + +LP_LLVM_IMAGE_MEMBER(width, LP_JIT_IMAGE_WIDTH, TRUE) +LP_LLVM_IMAGE_MEMBER(height, LP_JIT_IMAGE_HEIGHT, TRUE) +LP_LLVM_IMAGE_MEMBER(depth, LP_JIT_IMAGE_DEPTH, TRUE) +LP_LLVM_IMAGE_MEMBER(base_ptr, LP_JIT_IMAGE_BASE, TRUE) +LP_LLVM_IMAGE_MEMBER(row_stride, LP_JIT_IMAGE_ROW_STRIDE, TRUE) +LP_LLVM_IMAGE_MEMBER(img_stride, LP_JIT_IMAGE_IMG_STRIDE, TRUE) + #if LP_USE_TEXTURE_CACHE static LLVMValueRef lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base, @@ -324,3 +415,66 @@ return &sampler->base; } +static void +lp_llvm_image_soa_destroy(struct lp_build_image_soa *image) +{ + FREE(image); +} + +static void +lp_llvm_image_soa_emit_op(const struct lp_build_image_soa *base, + struct gallivm_state *gallivm, + const struct lp_img_params *params) +{ + struct lp_llvm_image_soa *image = (struct lp_llvm_image_soa *)base; + unsigned image_index = params->image_index; + assert(image_index < PIPE_MAX_SHADER_IMAGES); + + lp_build_img_op_soa(&image->dynamic_state.static_state[image_index].image_state, + &image->dynamic_state.base, + gallivm, params); +} + +/** + * Fetch the texture size. + */ +static void +lp_llvm_image_soa_emit_size_query(const struct lp_build_image_soa *base, + struct gallivm_state *gallivm, + const struct lp_sampler_size_query_params *params) +{ + struct lp_llvm_image_soa *image = (struct lp_llvm_image_soa *)base; + + assert(params->texture_unit < PIPE_MAX_SHADER_IMAGES); + + lp_build_size_query_soa(gallivm, + &image->dynamic_state.static_state[params->texture_unit].image_state, + &image->dynamic_state.base, + params); +} + +struct lp_build_image_soa * +lp_llvm_image_soa_create(const struct lp_image_static_state *static_state) +{ + struct lp_llvm_image_soa *image; + + image = CALLOC_STRUCT(lp_llvm_image_soa); + if (!image) + return NULL; + + image->base.destroy = lp_llvm_image_soa_destroy; + image->base.emit_op = lp_llvm_image_soa_emit_op; + image->base.emit_size_query = lp_llvm_image_soa_emit_size_query; + + image->dynamic_state.base.width = lp_llvm_image_width; + image->dynamic_state.base.height = lp_llvm_image_height; + + image->dynamic_state.base.depth = lp_llvm_image_depth; + image->dynamic_state.base.base_ptr = lp_llvm_image_base_ptr; + image->dynamic_state.base.row_stride = lp_llvm_image_row_stride; + image->dynamic_state.base.img_stride = lp_llvm_image_img_stride; + + image->dynamic_state.static_state = static_state; + + return &image->base; +} diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_tex_sample.h mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_tex_sample.h --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_tex_sample.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_tex_sample.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,6 +33,7 @@ struct lp_sampler_static_state; +struct lp_image_static_state; /** * Whether texture cache is used for s3tc textures. @@ -46,4 +47,7 @@ struct lp_build_sampler_soa * lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key); +struct lp_build_image_soa * +lp_llvm_image_soa_create(const struct lp_image_static_state *key); + #endif /* LP_TEX_SAMPLE_H */ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_texture.c mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_texture.c --- mesa-19.2.8/src/gallium/drivers/llvmpipe/lp_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/lp_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ #include "util/u_inlines.h" #include "util/u_cpu_detect.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/simple_list.h" @@ -647,7 +647,8 @@ if (!(presource->bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_BUFFER))) + PIPE_BIND_SHADER_BUFFER | + PIPE_BIND_SHADER_IMAGE))) return LP_UNREFERENCED; return lp_setup_is_resource_referenced(llvmpipe->setup, presource); @@ -767,6 +768,13 @@ return size; } +static void +llvmpipe_memory_barrier(struct pipe_context *pipe, + unsigned flags) +{ + /* this may be an overly large hammer for this nut. */ + llvmpipe_finish(pipe, "barrier"); +} #ifdef DEBUG void @@ -823,4 +831,6 @@ pipe->transfer_flush_region = u_default_transfer_flush_region; pipe->buffer_subdata = u_default_buffer_subdata; pipe->texture_subdata = u_default_texture_subdata; + + pipe->memory_barrier = llvmpipe_memory_barrier; } diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/Makefile.sources mesa-20.0.8/src/gallium/drivers/llvmpipe/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/llvmpipe/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -13,6 +13,8 @@ lp_clear.h \ lp_context.c \ lp_context.h \ + lp_cs_tpool.c \ + lp_cs_tpool.h \ lp_debug.h \ lp_draw_arrays.c \ lp_fence.c \ @@ -51,6 +53,8 @@ lp_state_blend.c \ lp_state_clip.c \ lp_state_derived.c \ + lp_state_cs.c \ + lp_state_cs.h \ lp_state_fs.c \ lp_state_fs.h \ lp_state_gs.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/meson.build mesa-20.0.8/src/gallium/drivers/llvmpipe/meson.build --- mesa-19.2.8/src/gallium/drivers/llvmpipe/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -33,6 +33,8 @@ 'lp_clear.h', 'lp_context.c', 'lp_context.h', + 'lp_cs_tpool.h', + 'lp_cs_tpool.c', 'lp_debug.h', 'lp_draw_arrays.c', 'lp_fence.c', @@ -71,6 +73,8 @@ 'lp_state_blend.c', 'lp_state_clip.c', 'lp_state_derived.c', + 'lp_state_cs.c', + 'lp_state_cs.h', 'lp_state_fs.c', 'lp_state_fs.h', 'lp_state_gs.c', @@ -97,7 +101,7 @@ c_args : [c_vis_args, c_msvc_compat_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], include_directories : [inc_gallium, inc_gallium_aux, inc_include, inc_src], - dependencies : dep_llvm, + dependencies : [ dep_llvm, idep_nir_headers, ], ) # This overwrites the softpipe driver dependency, but itself depends on the @@ -105,7 +109,7 @@ driver_swrast = declare_dependency( compile_args : '-DGALLIUM_LLVMPIPE', link_with : libllvmpipe, - dependencies : driver_swrast, + dependencies : [driver_swrast, dep_llvm], ) if with_tests and with_gallium_softpipe and with_llvm diff -Nru mesa-19.2.8/src/gallium/drivers/llvmpipe/SConscript mesa-20.0.8/src/gallium/drivers/llvmpipe/SConscript --- mesa-19.2.8/src/gallium/drivers/llvmpipe/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/llvmpipe/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -18,6 +18,9 @@ env.Alias('llvmpipe', llvmpipe) +env.Append(CPPPATH = [ + '../../../compiler/nir', +]) if not env['embedded']: env = env.Clone() diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -536,9 +536,6 @@ case Graph::Edge::BACK: fprintf(out, "\t%i -> %i;\n", idA, idB); break; - case Graph::Edge::DUMMY: - fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB); - break; default: assert(0); break; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -274,6 +274,8 @@ if (defs.size() > 1) return false; Instruction *insn = getInsn(); + if (!insn) + return false; // let's not try too hard here for now ... return !insn->srcExists(1) && insn->getSrc(0)->isUniform(); } diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -122,6 +122,8 @@ void emitSAM(); void emitRAM(); + void emitPSETP(); + void emitMOV(); void emitS2R(); void emitCS2R(); @@ -690,6 +692,31 @@ * predicate/cc ******************************************************************************/ +void +CodeEmitterGM107::emitPSETP() +{ + + emitInsn(0x50900000); + + switch (insn->op) { + case OP_AND: emitField(0x18, 3, 0); break; + case OP_OR: emitField(0x18, 3, 1); break; + case OP_XOR: emitField(0x18, 3, 2); break; + default: + assert(!"unexpected operation"); + break; + } + + // emitINV (0x2a); + emitPRED(0x27); // TODO: support 3-arg + emitINV (0x20, insn->src(1)); + emitPRED(0x1d, insn->src(1)); + emitINV (0x0f, insn->src(0)); + emitPRED(0x0c, insn->src(0)); + emitPRED(0x03, insn->def(0)); + emitPRED(0x00); +} + /******************************************************************************* * movement / conversion ******************************************************************************/ @@ -3557,7 +3584,12 @@ case OP_AND: case OP_OR: case OP_XOR: - emitLOP(); + switch (insn->def(0).getFile()) { + case FILE_GPR: emitLOP(); break; + case FILE_PREDICATE: emitPSETP(); break; + default: + assert(!"invalid bool op"); + } break; case OP_NOT: emitNOT(); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -451,9 +451,6 @@ return OP_SIN; case nir_op_fsqrt: return OP_SQRT; - case nir_op_fsub: - case nir_op_isub: - return OP_SUB; case nir_op_ftrunc: return OP_TRUNC; case nir_op_ixor: @@ -515,12 +512,18 @@ case nir_intrinsic_bindless_image_atomic_exchange: case nir_intrinsic_image_atomic_exchange: case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_image_atomic_max: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_deref_atomic_or: @@ -579,52 +582,68 @@ { switch (op) { case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_global_atomic_add: case nir_intrinsic_image_atomic_add: case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_shared_atomic_add: case nir_intrinsic_ssbo_atomic_add: return NV50_IR_SUBOP_ATOM_ADD; case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_global_atomic_and: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_shared_atomic_and: case nir_intrinsic_ssbo_atomic_and: return NV50_IR_SUBOP_ATOM_AND; case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_global_atomic_comp_swap: case nir_intrinsic_image_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_shared_atomic_comp_swap: case nir_intrinsic_ssbo_atomic_comp_swap: return NV50_IR_SUBOP_ATOM_CAS; case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_global_atomic_exchange: case nir_intrinsic_image_atomic_exchange: case nir_intrinsic_image_deref_atomic_exchange: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_ssbo_atomic_exchange: return NV50_IR_SUBOP_ATOM_EXCH; case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_global_atomic_or: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_shared_atomic_or: case nir_intrinsic_ssbo_atomic_or: return NV50_IR_SUBOP_ATOM_OR; - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_image_atomic_max: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_shared_atomic_imax: case nir_intrinsic_shared_atomic_umax: case nir_intrinsic_ssbo_atomic_imax: case nir_intrinsic_ssbo_atomic_umax: return NV50_IR_SUBOP_ATOM_MAX; - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_shared_atomic_imin: case nir_intrinsic_shared_atomic_umin: case nir_intrinsic_ssbo_atomic_imin: case nir_intrinsic_ssbo_atomic_umin: return NV50_IR_SUBOP_ATOM_MIN; case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_global_atomic_xor: case nir_intrinsic_image_atomic_xor: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_shared_atomic_xor: @@ -633,7 +652,6 @@ case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: return NV50_IR_SUBOP_MEMBAR(M, GL); @@ -1945,7 +1963,7 @@ } case Program::TYPE_GEOMETRY: case Program::TYPE_VERTEX: { - if (info->io.genUserClip > 0 && idx == clipVertexOutput) { + if (info->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) { mkMov(clipVtx[i], src); src = clipVtx[i]; } @@ -2370,12 +2388,38 @@ info->io.globalAccess |= 0x2; break; } + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_xor: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *address; + uint32_t offset = getIndirect(&insn->src[0], 0, address); + + Symbol *sym = mkSymbol(FILE_MEMORY_GLOBAL, 0, dType, offset); + Instruction *atom = + mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0)); + atom->setIndirect(0, 0, address); + atom->subOp = getSubOp(op); + + info->io.globalAccess |= 0x2; + break; + } case nir_intrinsic_bindless_image_atomic_add: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_comp_swap: case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_bindless_image_atomic_min: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: case nir_intrinsic_bindless_image_load: @@ -2405,8 +2449,10 @@ case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_comp_swap: case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_max: - case nir_intrinsic_bindless_image_atomic_min: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: ty = getDType(insn); @@ -2472,8 +2518,10 @@ case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_load: @@ -2507,8 +2555,10 @@ case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_max: - case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: ty = getDType(insn); @@ -2593,7 +2643,7 @@ break; } - case nir_intrinsic_barrier: { + case nir_intrinsic_control_barrier: { // TODO: add flag to shader_info info->numBarriers = 1; Instruction *bar = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0)); @@ -2603,7 +2653,6 @@ } case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: { @@ -2612,6 +2661,8 @@ bar->subOp = getSubOp(op); break; } + case nir_intrinsic_memory_barrier_tcs_patch: + break; case nir_intrinsic_shader_clock: { const DataType dType = getDType(insn); LValues &newDefs = convert(&insn->dest); @@ -2797,8 +2848,6 @@ case nir_op_ushr: case nir_op_fsin: case nir_op_fsqrt: - case nir_op_fsub: - case nir_op_isub: case nir_op_ftrunc: case nir_op_ishl: case nir_op_ixor: { @@ -2937,7 +2986,9 @@ break; case nir_op_vec2: case nir_op_vec3: - case nir_op_vec4: { + case nir_op_vec4: + case nir_op_vec8: + case nir_op_vec16: { LValues &newDefs = convert(&insn->dest); for (LValues::size_type c = 0u; c < newDefs.size(); ++c) { mkMov(newDefs[c], getSrc(&insn->src[c]), dType); @@ -3262,7 +3313,7 @@ CacheMode Converter::getCacheModeFromVar(const nir_variable *var) { - return convert(var->data.image.access); + return convert(var->data.access); } bool @@ -3480,7 +3531,7 @@ NIR_PASS_V(nir, nir_lower_regs_to_ssa); NIR_PASS_V(nir, nir_lower_load_const_to_scalar); NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(nir, nir_lower_phis_to_scalar); do { diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -821,6 +821,7 @@ NV50_IR_OPCODE_CASE(DDY, DFDY); NV50_IR_OPCODE_CASE(DDY_FINE, DFDY); NV50_IR_OPCODE_CASE(KILL, DISCARD); + NV50_IR_OPCODE_CASE(DEMOTE, DISCARD); NV50_IR_OPCODE_CASE(SEQ, SET); NV50_IR_OPCODE_CASE(SGT, SET); @@ -1581,6 +1582,9 @@ if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE) info->prop.fp.readsSampleLocations = true; + if (insn.getOpcode() == TGSI_OPCODE_DEMOTE) + info->prop.fp.usesDiscard = true; + if (insn.dstCount()) { Instruction::DstRegister dst = insn.getDst(0); @@ -3469,6 +3473,11 @@ if (!tgsi.getDst(0).isMasked(1)) mkOp1(OP_RDSV, TYPE_U32, dst0[1], mkSysVal(SV_CLOCK, 0))->fixed = 1; break; + case TGSI_OPCODE_READ_HELPER: + if (!tgsi.getDst(0).isMasked(0)) + mkOp1(OP_RDSV, TYPE_U32, dst0[0], mkSysVal(SV_THREAD_KILL, 0)) + ->fixed = 1; + break; case TGSI_OPCODE_KILL_IF: val0 = new_LValue(func, FILE_PREDICATE); mask = 0; @@ -3482,6 +3491,9 @@ } break; case TGSI_OPCODE_KILL: + case TGSI_OPCODE_DEMOTE: + // TODO: Should we make KILL exit that invocation? Some old shaders + // don't like that. mkOp(OP_DISCARD, TYPE_NONE, NULL); break; case TGSI_OPCODE_TEX: diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -77,7 +77,6 @@ case FORWARD: return "forward"; case BACK: return "back"; case CROSS: return "cross"; - case DUMMY: return "dummy"; case UNKNOWN: default: return "unk"; @@ -184,7 +183,7 @@ continue; for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) { - if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY) + if (ei.getType() == Edge::BACK) continue; if (ei.getNode()->visit(seq)) stack.push(ei.getNode()); @@ -301,7 +300,6 @@ switch (ei.getType()) { case Graph::Edge::TREE: case Graph::Edge::FORWARD: - case Graph::Edge::DUMMY: if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd()) bb.push(ei.getNode()); break; @@ -371,8 +369,6 @@ for (edge = curr->out; edge; edge = edge->next[0]) { node = edge->target; - if (edge->type == Edge::DUMMY) - continue; if (node->getSequence() == 0) { edge->type = Edge::TREE; @@ -387,8 +383,6 @@ for (edge = curr->in; edge; edge = edge->next[1]) { node = edge->origin; - if (edge->type == Edge::DUMMY) - continue; if (node->getSequence() == 0) { edge->type = Edge::TREE; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h 2020-06-12 01:21:17.000000000 +0000 @@ -47,7 +47,6 @@ FORWARD, BACK, CROSS, // e.g. loop break - DUMMY }; Edge(Node *dst, Node *src, Type kind); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -1645,6 +1645,8 @@ else if (targ->getChipset() < NVISA_GM107_CHIPSET) handleSharedATOMNVE4(atom); return true; + case FILE_MEMORY_GLOBAL: + return true; default: assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER); base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16); @@ -1802,6 +1804,9 @@ { uint32_t base = slot * NVC0_SU_INFO__STRIDE; + // We don't upload surface info for bindless for GM107+ + assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET); + if (ptr) { ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot)); if (bindless) @@ -2204,7 +2209,7 @@ } void -NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su) +NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded) { const TexInstruction::ImgFormatDesc *format = su->tex.format; int width = format->bits[0] + format->bits[1] + @@ -2223,21 +2228,38 @@ if (width < 32) untypedDst[0] = bld.getSSA(); - for (int i = 0; i < 4; i++) { - typedDst[i] = su->getDef(i); + if (loaded && loaded[0]) { + for (int i = 0; i < 4; i++) { + if (loaded[i]) + typedDst[i] = loaded[i]->getDef(0); + } + } else { + for (int i = 0; i < 4; i++) { + typedDst[i] = su->getDef(i); + } } // Set the untyped dsts as the su's destinations - for (int i = 0; i < 4; i++) - su->setDef(i, untypedDst[i]); + if (loaded && loaded[0]) { + for (int i = 0; i < 4; i++) + if (loaded[i]) + loaded[i]->setDef(0, untypedDst[i]); + } else { + for (int i = 0; i < 4; i++) + su->setDef(i, untypedDst[i]); - bld.setPosition(su, true); + bld.setPosition(su, true); + } // Unpack each component into the typed dsts int bits = 0; for (int i = 0; i < 4; bits += format->bits[i], i++) { if (!typedDst[i]) continue; + + if (loaded && loaded[0]) + bld.setPosition(loaded[i], true); + if (i >= format->components) { if (format->type == FLOAT || format->type == UNORM || @@ -2308,7 +2330,7 @@ processSurfaceCoordsNVE4(su); if (su->op == OP_SULDP) { - convertSurfaceFormat(su); + convertSurfaceFormat(su, NULL); insertOOBSurfaceOpResult(su); } @@ -2421,7 +2443,7 @@ processSurfaceCoordsNVC0(su); if (su->op == OP_SULDP) { - convertSurfaceFormat(su); + convertSurfaceFormat(su, NULL); insertOOBSurfaceOpResult(su); } @@ -2463,14 +2485,16 @@ } } -void -NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su) +TexInstruction * +NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4]) { const int slot = su->tex.r; const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); + const bool array = su->tex.target.isArray() || su->tex.target.isCube(); + const int arg = dim + array; Value *ind = su->getIndirectR(); Value *handle; + Instruction *pred = NULL, *pred2d = NULL; int pos = 0; bld.setPosition(su, false); @@ -2489,67 +2513,153 @@ assert(pos == 0); break; } + + if (dim == 2 && !array) { + // This might be a 2d slice of a 3d texture, try to load the z + // coordinate in. + Value *v; + if (!su->tex.bindless) + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + else + v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11)); + Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1)); + pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), + TYPE_U32, bld.mkImm(0), is_3d); + + bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16)); + su->moveSources(dim, 1); + su->setSrc(dim, v); + su->tex.target = nv50_ir::TEX_TARGET_3D; + pos++; + } + if (su->tex.bindless) - handle = ind; + handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047)); else handle = loadTexHandle(ind, slot + 32); + su->setSrc(arg + pos, handle); // The address check doesn't make sense here. The format check could make // sense but it's a bit of a pain. - if (su->tex.bindless) - return; + if (!su->tex.bindless) { + // prevent read fault when the image is not actually bound + pred = + bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), + TYPE_U32, bld.mkImm(0), + loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); + if (su->op != OP_SUSTP && su->tex.format) { + const TexInstruction::ImgFormatDesc *format = su->tex.format; + int blockwidth = format->bits[0] + format->bits[1] + + format->bits[2] + format->bits[3]; + + assert(format->components != 0); + // make sure that the format doesn't mismatch when it's not FMT_NONE + bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), + TYPE_U32, bld.loadImm(NULL, blockwidth / 8), + loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), + pred->getDef(0)); + } + } - // prevent read fault when the image is not actually bound - CmpInstruction *pred = - bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), - TYPE_U32, bld.mkImm(0), - loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); - if (su->op != OP_SUSTP && su->tex.format) { - const TexInstruction::ImgFormatDesc *format = su->tex.format; - int blockwidth = format->bits[0] + format->bits[1] + - format->bits[2] + format->bits[3]; + // Now we have "pred" which (optionally) contains whether to do the surface + // op at all, and a "pred2d" which indicates that, in case of doing the + // surface op, we have to create a 2d and 3d version, conditioned on pred2d. + TexInstruction *su2d = NULL; + if (pred2d) { + su2d = cloneForward(func, su)->asTex(); + for (unsigned i = 0; su->defExists(i); ++i) + su2d->setDef(i, bld.getSSA()); + su2d->moveSources(dim + 1, -1); + su2d->tex.target = nv50_ir::TEX_TARGET_2D; + } + if (pred2d && pred) { + Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8, + bld.getSSA(1, FILE_PREDICATE), + pred->getDef(0), pred2d->getDef(0)); + pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT); + pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT); + su->setPredicate(CC_P, pred3d->getDef(0)); + pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE), + pred->getDef(0), pred2d->getDef(0)); + pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT); + } else if (pred) { + su->setPredicate(CC_NOT_P, pred->getDef(0)); + } else if (pred2d) { + su->setPredicate(CC_NOT_P, pred2d->getDef(0)); + } + if (su2d) { + su2d->setPredicate(CC_P, pred2d->getDef(0)); + bld.insert(su2d); - assert(format->components != 0); - // make sure that the format doesn't mismatch when it's not FMT_NONE - bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), - TYPE_U32, bld.loadImm(NULL, blockwidth / 8), - loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), - pred->getDef(0)); + // Create a UNION so that RA assigns the same registers + bld.setPosition(su, true); + for (unsigned i = 0; su->defExists(i); ++i) { + assert(i < 4); + + ValueDef &def = su->def(i); + ValueDef &def2 = su2d->def(i); + Instruction *mov = NULL; + + if (pred) { + mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); + mov->setPredicate(CC_P, pred->getDef(0)); + } + + Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, + bld.getSSA(), + NULL, def2.get()); + def.replace(uni->getDef(0), false); + uni->setSrc(0, def.get()); + if (mov) + uni->setSrc(2, mov->getDef(0)); + } + } else if (pred) { + // Create a UNION so that RA assigns the same registers + bld.setPosition(su, true); + for (unsigned i = 0; su->defExists(i); ++i) { + assert(i < 4); + + ValueDef &def = su->def(i); + + Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); + mov->setPredicate(CC_P, pred->getDef(0)); + + Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, + bld.getSSA(), + NULL, mov->getDef(0)); + def.replace(uni->getDef(0), false); + uni->setSrc(0, def.get()); + } } - su->setPredicate(CC_NOT_P, pred->getDef(0)); + + return su2d; } void NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su) { - processSurfaceCoordsGM107(su); + // processSurfaceCoords also takes care of fixing up the outputs and + // union'ing them with 0 as necessary. Additionally it may create a second + // surface which needs some of the similar fixups. + + Instruction *loaded[4] = {}; + TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded); if (su->op == OP_SULDP) { - convertSurfaceFormat(su); - insertOOBSurfaceOpResult(su); + convertSurfaceFormat(su, loaded); } if (su->op == OP_SUREDP) { - Value *def = su->getDef(0); - su->op = OP_SUREDB; + } - // There may not be a predicate in the bindless case. - if (su->getPredicate()) { - su->setDef(0, bld.getSSA()); - - bld.setPosition(su, true); - - // make sure to initialize dst value when the atomic operation is not - // performed - Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); - - assert(su->cc == CC_NOT_P); - mov->setPredicate(CC_P, su->getPredicate()); - - bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0)); - } + // If we fixed up the type of the regular surface load instruction, we also + // have to fix up the copy. + if (su2) { + su2->op = su->op; + su2->dType = su->dType; + su2->sType = su->sType; } } diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h 2020-06-12 01:21:17.000000000 +0000 @@ -171,10 +171,10 @@ Value *loadMsInfo32(Value *ptr, uint32_t off); void adjustCoordinatesMS(TexInstruction *); - void processSurfaceCoordsGM107(TexInstruction *); + TexInstruction *processSurfaceCoordsGM107(TexInstruction *, Instruction *[4]); void processSurfaceCoordsNVE4(TexInstruction *); void processSurfaceCoordsNVC0(TexInstruction *); - void convertSurfaceFormat(TexInstruction *); + void convertSurfaceFormat(TexInstruction *, Instruction **); void insertOOBSurfaceOpResult(TexInstruction *); Value *calculateSampleOffset(Value *sampleID); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -2804,6 +2804,16 @@ if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0]) return false; + // There's really no great place to put this in a generic manner. Seemingly + // wide stores at 0x60 don't work in GS shaders on SM50+. Don't combine + // those. + if (prog->getTarget()->getChipset() >= NVISA_GM107_CHIPSET && + prog->getType() == Program::TYPE_GEOMETRY && + st->getSrc(0)->reg.file == FILE_SHADER_OUTPUT && + rec->rel[0] == NULL && + MIN2(offRc, offSt) == 0x60) + return false; + // remove any existing load/store records for the store being merged into // the existing record. purgeRecords(st, DATA_FILE_COUNT); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -624,8 +624,6 @@ // trickery to save a loop of OR'ing liveSets // aliasing works fine with BitSet::setOr for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { - if (ei.getType() == Graph::Edge::DUMMY) - continue; if (bbA) { bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet); bbA = bb; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h --- mesa-19.2.8/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -145,7 +145,7 @@ #define DLLIST_EMPTY(__list) ((__list)->next == (__list)) #define DLLIST_FOR_EACH(list, it) \ - for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) + for (DLList::Iterator it = (list)->iterator(); !(it).end(); (it).next()) class DLList { diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/meson.build mesa-20.0.8/src/gallium/drivers/nouveau/meson.build --- mesa-19.2.8/src/gallium/drivers/nouveau/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -223,13 +223,14 @@ 'nouveau_compiler', 'nouveau_compiler.c', include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux], - dependencies : [dep_libdrm, dep_libdrm_nouveau, idep_mesautil], - link_with : [libnouveau, libgallium, libnir], + dependencies : [dep_libdrm, dep_libdrm_nouveau, idep_mesautil, idep_nir], + link_with : [libnouveau, libgallium], build_by_default : with_tools.contains('nouveau'), install : with_tools.contains('nouveau'), ) driver_nouveau = declare_dependency( compile_args : '-DGALLIUM_NOUVEAU', - link_with : [libnouveauwinsys, libnouveau, libnir], + dependencies : idep_nir, + link_with : [libnouveauwinsys, libnouveau], ) diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_buffer.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_buffer.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -515,7 +515,7 @@ if (tx->map) nouveau_transfer_write(nouveau_context(pipe), tx, box->x, box->width); - util_range_add(&buf->valid_buffer_range, + util_range_add(&buf->base, &buf->valid_buffer_range, tx->base.box.x + box->x, tx->base.box.x + box->x + box->width); } @@ -539,7 +539,7 @@ if (tx->map) nouveau_transfer_write(nv, tx, 0, tx->base.box.width); - util_range_add(&buf->valid_buffer_range, + util_range_add(&buf->base, &buf->valid_buffer_range, tx->base.box.x, tx->base.box.x + tx->base.box.width); } @@ -590,7 +590,7 @@ &src->base, 0, &src_box); } - util_range_add(&dst->valid_buffer_range, dstx, dstx + size); + util_range_add(&dst->base, &dst->valid_buffer_range, dstx, dstx + size); } @@ -725,7 +725,7 @@ buffer->status = NOUVEAU_BUFFER_STATUS_USER_MEMORY; util_range_init(&buffer->valid_buffer_range); - util_range_add(&buffer->valid_buffer_range, 0, bytes); + util_range_add(&buffer->base, &buffer->valid_buffer_range, 0, bytes); return &buffer->base; } @@ -850,11 +850,6 @@ if (unlikely(buf->base.bind & PIPE_BIND_SHARED)) return; - /* We can't touch persistent/coherent buffers */ - if (buf->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | - PIPE_RESOURCE_FLAG_MAP_COHERENT)) - return; - /* If the buffer is sub-allocated and not currently being written, just * wipe the valid buffer range. Otherwise we have to create fresh * storage. (We don't keep track of fences for non-sub-allocated BO's.) diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_fence.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_fence.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,7 @@ (*fence)->screen = screen; (*fence)->ref = 1; - LIST_INITHEAD(&(*fence)->work); + list_inithead(&(*fence)->work); return true; } @@ -50,7 +50,7 @@ LIST_FOR_EACH_ENTRY_SAFE(work, tmp, &fence->work, list) { work->func(work->data); - LIST_DEL(&work->list); + list_del(&work->list); FREE(work); } } @@ -100,7 +100,7 @@ } } - if (!LIST_IS_EMPTY(&fence->work)) { + if (!list_is_empty(&fence->work)) { debug_printf("WARNING: deleting fence with work still pending !\n"); nouveau_fence_trigger_work(fence); } @@ -265,7 +265,7 @@ return false; work->func = func; work->data = data; - LIST_ADD(&work->list, &fence->work); + list_add(&work->list, &fence->work); p_atomic_inc(&fence->work_count); if (fence->work_count > 64) nouveau_fence_kick(fence); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_mm.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_mm.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_mm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_mm.c 2020-06-12 01:21:17.000000000 +0000 @@ -142,13 +142,13 @@ return PIPE_ERROR_OUT_OF_MEMORY; } - LIST_INITHEAD(&slab->head); + list_inithead(&slab->head); slab->cache = cache; slab->order = chunk_order; slab->count = slab->free = size >> chunk_order; - LIST_ADD(&slab->head, &mm_bucket_by_order(cache, chunk_order)->free); + list_add(&slab->head, &mm_bucket_by_order(cache, chunk_order)->free); cache->allocated += size; @@ -181,16 +181,16 @@ return NULL; } - if (!LIST_IS_EMPTY(&bucket->used)) { + if (!list_is_empty(&bucket->used)) { slab = LIST_ENTRY(struct mm_slab, bucket->used.next, head); } else { - if (LIST_IS_EMPTY(&bucket->free)) { + if (list_is_empty(&bucket->free)) { mm_slab_new(cache, MAX2(mm_get_order(size), MM_MIN_ORDER)); } slab = LIST_ENTRY(struct mm_slab, bucket->free.next, head); - LIST_DEL(&slab->head); - LIST_ADD(&slab->head, &bucket->used); + list_del(&slab->head); + list_add(&slab->head, &bucket->used); } *offset = mm_slab_alloc(slab) << slab->order; @@ -202,8 +202,8 @@ nouveau_bo_ref(slab->bo, bo); if (slab->free == 0) { - LIST_DEL(&slab->head); - LIST_ADD(&slab->head, &bucket->full); + list_del(&slab->head); + list_add(&slab->head, &bucket->full); } alloc->next = NULL; @@ -222,12 +222,12 @@ mm_slab_free(slab, alloc->offset >> slab->order); if (slab->free == slab->count) { - LIST_DEL(&slab->head); - LIST_ADDTAIL(&slab->head, &bucket->free); + list_del(&slab->head); + list_addtail(&slab->head, &bucket->free); } else if (slab->free == 1) { - LIST_DEL(&slab->head); - LIST_ADDTAIL(&slab->head, &bucket->used); + list_del(&slab->head); + list_addtail(&slab->head, &bucket->used); } FREE(alloc); @@ -255,9 +255,9 @@ cache->allocated = 0; for (i = 0; i < MM_NUM_BUCKETS; ++i) { - LIST_INITHEAD(&cache->bucket[i].free); - LIST_INITHEAD(&cache->bucket[i].used); - LIST_INITHEAD(&cache->bucket[i].full); + list_inithead(&cache->bucket[i].free); + list_inithead(&cache->bucket[i].used); + list_inithead(&cache->bucket[i].full); } return cache; @@ -269,7 +269,7 @@ struct mm_slab *slab, *next; LIST_FOR_EACH_ENTRY_SAFE(slab, next, head, head) { - LIST_DEL(&slab->head); + list_del(&slab->head); nouveau_bo_ref(NULL, &slab->bo); FREE(slab); } @@ -284,8 +284,8 @@ return; for (i = 0; i < MM_NUM_BUCKETS; ++i) { - if (!LIST_IS_EMPTY(&cache->bucket[i].used) || - !LIST_IS_EMPTY(&cache->bucket[i].full)) + if (!list_is_empty(&cache->bucket[i].used) || + !list_is_empty(&cache->bucket[i].full)) debug_printf("WARNING: destroying GPU memory cache " "with some buffers still in use\n"); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_screen.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_screen.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -4,8 +4,8 @@ #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_string.h" #include "util/os_time.h" @@ -23,6 +23,8 @@ #include "nouveau_mm.h" #include "nouveau_buffer.h" +#include + /* XXX this should go away */ #include "state_tracker/drm_driver.h" @@ -187,6 +189,9 @@ nouveau_mesa_debug = atoi(nv_dbg); screen->prefer_nir = debug_get_bool_option("NV50_PROG_USE_NIR", false); + screen->force_enable_cl = debug_get_bool_option("NOUVEAU_ENABLE_CL", false); + if (screen->force_enable_cl) + glsl_type_singleton_init_or_ref(); /* These must be set before any failure is possible, as the cleanup * paths assume they're responsible for deleting them. @@ -279,6 +284,9 @@ { int fd = screen->drm->fd; + if (screen->force_enable_cl) + glsl_type_singleton_decref(); + nouveau_mm_destroy(screen->mm_GART); nouveau_mm_destroy(screen->mm_VRAM); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_screen.h mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_screen.h --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -69,6 +69,7 @@ struct disk_cache *disk_shader_cache; bool prefer_nir; + bool force_enable_cl; #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS union { diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_video.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_video.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "nouveau_buffer.h" #include "util/u_video.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_sampler.h" static int diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_vp3_video.c mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_vp3_video.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nouveau_vp3_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nouveau_vp3_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ #include "nouveau_vp3_video.h" #include "util/u_video.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_sampler.h" static struct pipe_sampler_view ** diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c 2020-06-12 01:21:17.000000000 +0000 @@ -41,7 +41,7 @@ if (unlikely(!fp->buffer)) fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4); -#ifndef PIPE_ARCH_BIG_ENDIAN +#if !UTIL_ARCH_BIG_ENDIAN pipe_buffer_write(pipe, fp->buffer, 0, fp->insn_len * 4, fp->insn); #else { diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_miptree.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_miptree.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_miptree.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_miptree.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ * */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_push.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_push.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_push.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_push.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #include "nv_object.xml.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_query.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_query.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -58,7 +58,7 @@ while (ntfy[3] & 0xff000000) { } nouveau_heap_free(&qo->hw); - LIST_DEL(&qo->list); + list_del(&qo->list); FREE(qo); } } @@ -80,7 +80,7 @@ nv30_query_object_del(screen, &oq); } - LIST_ADDTAIL(&qo->list, &screen->queries); + list_addtail(&qo->list, &screen->queries); ntfy = nv30_ntfy(screen, qo); ntfy[0] = 0x00000000; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_resource.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_resource.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ * */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "nv30/nv30_screen.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_screen.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_screen.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,8 +25,8 @@ #include #include -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "nv_object.xml.h" @@ -361,7 +361,6 @@ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - case PIPE_SHADER_CAP_SCALAR_ISA: return 0; default: debug_printf("unknown vertex shader param %d\n", param); @@ -414,7 +413,6 @@ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - case PIPE_SHADER_CAP_SCALAR_ISA: return 0; default: debug_printf("unknown fragment shader param %d\n", param); @@ -657,7 +655,7 @@ if (ret) FAIL_SCREEN_INIT("error creating query heap: %d\n", ret); - LIST_INITHEAD(&screen->queries); + list_inithead(&screen->queries); /* Vertex program resources (code/data), currently 6 of the constant * slots are reserved to implement user clipping planes diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_state.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_state.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ * */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_helpers.h" #include "util/u_inlines.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ * */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_half.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_texture.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_texture.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ */ #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nv_object.xml.h" #include "nv30/nv30-40_3d.xml.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_vbo.c mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_vbo.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv30/nv30_vbo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv30/nv30_vbo.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ * */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_prim.h" #include "translate/translate.h" @@ -192,7 +192,7 @@ if (!nv30->vertex || nv30->draw_flags) return; -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN if (1) { /* Figure out where the buffers are getting messed up */ #else if (unlikely(vertex->need_conversion)) { diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_blit.h mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_blit.h --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_blit.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_blit.h 2020-06-12 01:21:17.000000000 +0000 @@ -3,7 +3,7 @@ #define __NV50_BLIT_H__ #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" void * nv50_blitter_make_fp(struct pipe_context *, diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_miptree.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_miptree.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_miptree.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_miptree.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nv50/nv50_context.h" #include "nv50/nv50_resource.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_push.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_push.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_push.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_push.c 2020-06-12 01:21:17.000000000 +0000 @@ -2,7 +2,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #include "nv50/nv50_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_resource.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_resource.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,7 +1,7 @@ #include "pipe/p_context.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nouveau_screen.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_screen.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_screen.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,8 +23,8 @@ #include #include #include -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "pipe/p_screen.h" #include "compiler/nir/nir.h" @@ -330,6 +330,7 @@ case PIPE_CAP_FBFETCH_COHERENT: case PIPE_CAP_TGSI_SKIP_SHRINK_IO_ARRAYS: case PIPE_CAP_TGSI_ATOMINC_WRAP: + case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION: return 0; case PIPE_CAP_VENDOR_ID: @@ -355,7 +356,7 @@ /* caps where we want the default value */ case PIPE_CAP_DMABUF: case PIPE_CAP_ESSL_FEATURE_LEVEL: - case PIPE_CAP_MAX_FRAMES_IN_FLIGHT: + case PIPE_CAP_THROTTLE: return u_pipe_screen_get_param_defaults(pscreen, param); } } @@ -435,8 +436,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; default: NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param); return 0; @@ -901,6 +900,7 @@ .lower_fpow = false, .lower_uadd_carry = true, .lower_usub_borrow = true, + .lower_sub = true, .lower_ffract = true, .lower_pack_half_2x16 = true, .lower_pack_unorm_2x16 = true, @@ -917,6 +917,7 @@ .lower_all_io_to_temps = false, .lower_cs_local_index_from_id = true, .lower_rotate = true, + .lower_to_scalar = true, .use_interpolated_input_intrinsics = true, .max_unroll_iterations = 32, }; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_state.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_state.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -1148,7 +1148,7 @@ pipe_reference_init(&targ->pipe.reference, 1); assert(buf->base.target == PIPE_BUFFER); - util_range_add(&buf->valid_buffer_range, offset, offset + size); + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + size); return &targ->pipe; } diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_surface.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_surface.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "util/u_inlines.h" #include "util/u_pack_color.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_surface.h" @@ -724,7 +724,7 @@ return; } - util_range_add(&buf->valid_buffer_range, offset, offset + size); + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + size); assert(size % data_size == 0); @@ -1356,7 +1356,6 @@ float x0, x1, y0, y1, z; float dz; float x_range, y_range; - float tri_x, tri_y; blit->mode = nv50_blit_select_mode(info); blit->color_mask = nv50_blit_derive_color_mask(info); @@ -1377,14 +1376,11 @@ x_range = (float)info->src.box.width / (float)info->dst.box.width; y_range = (float)info->src.box.height / (float)info->dst.box.height; - tri_x = 16384 << nv50_miptree(dst)->ms_x; - tri_y = 16384 << nv50_miptree(dst)->ms_y; - x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x; y0 = (float)info->src.box.y - y_range * (float)info->dst.box.y; - x1 = x0 + tri_x * x_range; - y1 = y0 + tri_y * y_range; + x1 = x0 + 16384.0f * x_range; + y1 = y0 + 16384.0f * y_range; x0 *= (float)(1 << nv50_miptree(src)->ms_x); x1 *= (float)(1 << nv50_miptree(src)->ms_x); @@ -1457,7 +1453,7 @@ PUSH_DATAf(push, y0); PUSH_DATAf(push, z); BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(0)), 2); - PUSH_DATAf(push, tri_x); + PUSH_DATAf(push, 16384.0f); PUSH_DATAf(push, 0.0f); BEGIN_NV04(push, NV50_3D(VTX_ATTR_3F_X(1)), 3); PUSH_DATAf(push, x0); @@ -1465,7 +1461,7 @@ PUSH_DATAf(push, z); BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(0)), 2); PUSH_DATAf(push, 0.0f); - PUSH_DATAf(push, tri_y); + PUSH_DATAf(push, 16384.0f); BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1); PUSH_DATA (push, 0); } diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_tex.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_tex.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_tex.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_tex.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "nv50/g80_texture.xml.h" #include "nv50/g80_defs.xml.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static inline uint32_t nv50_tic_swizzle(const struct nv50_format *fmt, unsigned swz, bool tex_int) @@ -315,7 +315,7 @@ struct nv50_tic_entry *tic = nv50_tic_entry(nv50->textures[s][i]); struct nv50_miptree *res; - if (!tic) { + if (!tic || tic->pipe.target == PIPE_BUFFER) { PUSH_DATA (push, 0); PUSH_DATA (push, 0); continue; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_transfer.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_transfer.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_transfer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_transfer.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,5 @@ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nv50/nv50_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_vbo.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_vbo.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv50_vbo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv50_vbo.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #include "nv50/nv50_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv84_video.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv84_video.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv84_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv84_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_sampler.h" #include "vl/vl_zscan.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv98_video.c mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv98_video.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nv50/nv98_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nv50/nv98_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "nv50/nv98_video.h" #include "util/u_sampler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -285,7 +285,7 @@ PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); PUSH_DATA (push, 0); BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); - util_range_add(&res->valid_buffer_range, + util_range_add(&res->base, &res->valid_buffer_range, nvc0->buffers[s][i].buffer_offset, nvc0->buffers[s][i].buffer_offset + nvc0->buffers[s][i].buffer_size); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_context.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_context.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -134,6 +134,12 @@ } } +static enum pipe_reset_status +nvc0_get_device_reset_status(struct pipe_context *pipe) +{ + return PIPE_NO_RESET; +} + static void nvc0_context_unreference_resources(struct nvc0_context *nvc0) { @@ -407,6 +413,7 @@ pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; pipe->emit_string_marker = nvc0_emit_string_marker; + pipe->get_device_reset_status = nvc0_get_device_reset_status; nouveau_context_init(&nvc0->base); nvc0_init_query_functions(nvc0); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "pipe/p_defines.h" #include "state_tracker/drm_driver.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_resource.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_program.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_program.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -440,7 +440,7 @@ if (info->prop.fp.usesDiscard) fp->hdr[0] |= 0x8000; - if (info->prop.fp.numColourResults > 1) + if (!info->prop.fp.separateFragData) fp->hdr[0] |= 0x4000; if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) fp->hdr[19] |= 0x1; diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c 2020-06-12 01:21:17.000000000 +0000 @@ -409,7 +409,7 @@ result_type >= PIPE_QUERY_TYPE_I64 ? 2 : 1, ready); - util_range_add(&buf->valid_buffer_range, offset, + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + (result_type >= PIPE_QUERY_TYPE_I64 ? 8 : 4)); nvc0_resource_validate(buf, NOUVEAU_BO_WR); @@ -508,7 +508,7 @@ PUSH_DATAh(push, buf->address + offset); PUSH_DATA (push, buf->address + offset); - util_range_add(&buf->valid_buffer_range, offset, + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + (result_type >= PIPE_QUERY_TYPE_I64 ? 8 : 4)); nvc0_resource_validate(buf, NOUVEAU_BO_WR); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,7 @@ const struct pipe_resource *tmp, const uint64_t *modifiers, unsigned int count); -const struct u_resource_vtbl nvc0_miptree_vtbl; +extern const struct u_resource_vtbl nvc0_miptree_vtbl; struct pipe_surface * nvc0_miptree_surface_new(struct pipe_context *, diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,8 +23,8 @@ #include #include #include -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "pipe/p_screen.h" #include "compiler/nir/nir.h" @@ -281,6 +281,8 @@ case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: case PIPE_CAP_TGSI_DIV: case PIPE_CAP_TGSI_ATOMINC_WRAP: + case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: return 1; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; @@ -322,7 +324,6 @@ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: @@ -390,7 +391,7 @@ /* caps where we want the default value */ case PIPE_CAP_DMABUF: case PIPE_CAP_ESSL_FEATURE_LEVEL: - case PIPE_CAP_MAX_FRAMES_IN_FLIGHT: + case PIPE_CAP_THROTTLE: return u_pipe_screen_get_param_defaults(pscreen, param); } } @@ -418,9 +419,13 @@ switch (param) { case PIPE_SHADER_CAP_PREFERRED_IR: return screen->prefer_nir ? PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI; - case PIPE_SHADER_CAP_SUPPORTED_IRS: - return 1 << PIPE_SHADER_IR_TGSI | - 1 << PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_SUPPORTED_IRS: { + uint32_t irs = 1 << PIPE_SHADER_IR_TGSI | + 1 << PIPE_SHADER_IR_NIR; + if (screen->force_enable_cl) + irs |= 1 << PIPE_SHADER_IR_NIR_SERIALIZED; + return irs; + } case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: @@ -467,8 +472,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return NVC0_MAX_BUFFERS; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: @@ -935,7 +938,7 @@ .lower_usub_borrow = true, // TODO .lower_mul_high = false, .lower_negate = false, - .lower_sub = false, // TODO + .lower_sub = true, .lower_scmp = true, // TODO: not implemented yet .lower_idiv = true, .lower_isign = false, // TODO @@ -969,8 +972,9 @@ .use_interpolated_input_intrinsics = true, .lower_mul_2x32_64 = true, // TODO .max_unroll_iterations = 32, - .lower_int64_options = nir_lower_divmod64, // TODO + .lower_int64_options = nir_lower_ufind_msb64|nir_lower_divmod64, // TODO .lower_doubles_options = nir_lower_dmod, // TODO + .lower_to_scalar = true, }; static const void * diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -315,14 +315,14 @@ struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]); struct nv04_resource *buf; - if (!targ) { + if (targ && tfb) + targ->stride = tfb->stride[b]; + + if (!targ || !targ->stride) { IMMED_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 0); continue; } - if (tfb) - targ->stride = tfb->stride[b]; - buf = nv04_resource(targ->pipe.buffer); BCTX_REFN(nvc0->bufctx_3d, 3D_TFB, buf, WR); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_state.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_state.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,6 +28,7 @@ #include "tgsi/tgsi_parse.h" #include "compiler/nir/nir.h" +#include "compiler/nir/nir_serialize.h" #include "nvc0/nvc0_stateobj.h" #include "nvc0/nvc0_context.h" @@ -234,10 +235,7 @@ SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample); SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth); - /* On GM20x+, LINE_WIDTH_SMOOTH controls both aliased and smooth - * rendering and LINE_WIDTH_ALIASED seems to be ignored - */ - if (cso->line_smooth || cso->multisample || class_3d >= GM200_3D_CLASS) + if (cso->line_smooth || cso->multisample) SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1); else SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1); @@ -740,6 +738,15 @@ case PIPE_SHADER_IR_NIR: prog->pipe.ir.nir = (nir_shader *)cso->prog; break; + case PIPE_SHADER_IR_NIR_SERIALIZED: { + struct blob_reader reader; + const struct pipe_binary_program_header *hdr = cso->prog; + + blob_reader_init(&reader, hdr->blob, hdr->num_bytes); + prog->pipe.ir.nir = nir_deserialize(NULL, pipe->screen->get_compiler_options(pipe->screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE), &reader); + prog->pipe.type = PIPE_SHADER_IR_NIR; + break; + } default: assert(!"unsupported IR!"); free(prog); @@ -1055,7 +1062,7 @@ pipe_reference_init(&targ->pipe.reference, 1); assert(buf->base.target == PIPE_BUFFER); - util_range_add(&buf->valid_buffer_range, offset, offset + size); + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + size); return &targ->pipe; } diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,4 +1,4 @@ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_framebuffer.h" #include "util/u_math.h" #include "util/u_viewport.h" @@ -644,7 +644,7 @@ PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); PUSH_DATA (push, 0); BCTX_REFN(nvc0->bufctx_3d, 3D_BUF, res, RDWR); - util_range_add(&res->valid_buffer_range, + util_range_add(&res->base, &res->valid_buffer_range, nvc0->buffers[s][i].buffer_offset, nvc0->buffers[s][i].buffer_offset + nvc0->buffers[s][i].buffer_size); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "util/u_inlines.h" #include "util/u_pack_color.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "os/os_thread.h" @@ -534,7 +534,7 @@ return; } - util_range_add(&buf->valid_buffer_range, offset, offset + size); + util_range_add(&buf->base, &buf->valid_buffer_range, offset, offset + size); assert(size % data_size == 0); @@ -1276,6 +1276,18 @@ * render target, with scissors defining the destination region. * The vertex is supplied with non-normalized texture coordinates * arranged in a way to yield the desired offset and scale. + * + * Note that while the source texture is presented to the sampler as + * non-MSAA (even if it is), the destination texture is treated as MSAA for + * rendering. This means that + * - destination coordinates shouldn't be scaled + * - without per-sample rendering, the target will be a solid-fill for all + * of the samples + * + * The last point implies that this process is very bad for 1:1 blits, as + * well as scaled blits between MSAA surfaces. This works fine for + * upscaling and downscaling though. The 1:1 blits should ideally be + * handled by the 2d engine, which can do it perfectly. */ minx = info->dst.box.x; @@ -1364,14 +1376,14 @@ *(vbuf++) = fui(y0); *(vbuf++) = fui(z); - *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_x); + *(vbuf++) = fui(32768.0f); *(vbuf++) = fui(0.0f); *(vbuf++) = fui(x1); *(vbuf++) = fui(y0); *(vbuf++) = fui(z); *(vbuf++) = fui(0.0f); - *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_y); + *(vbuf++) = fui(32768.0f); *(vbuf++) = fui(x0); *(vbuf++) = fui(y1); *(vbuf++) = fui(z); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ #include "nv50/g80_texture.xml.h" #include "nv50/g80_defs.xml.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #define NVE4_TIC_ENTRY_INVALID 0x000fffff #define NVE4_TSC_ENTRY_INVALID 0xfff00000 @@ -948,7 +948,7 @@ assert(view->resource->target == PIPE_BUFFER); - util_range_add(&res->valid_buffer_range, + util_range_add(&res->base, &res->valid_buffer_range, view->u.buf.offset, view->u.buf.offset + view->u.buf.size); } @@ -1433,7 +1433,15 @@ nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); - return 0x100000000ULL | tic->id; + // Compute handle. This will include the TIC as well as some additional + // info regarding the bound 3d surface layer, if applicable. + uint64_t handle = 0x100000000ULL | tic->id; + struct nv04_resource *res = nv04_resource(view->resource); + if (res->base.target == PIPE_TEXTURE_3D) { + handle |= 1 << 11; + handle |= view->u.tex.first_layer << (11 + 16); + } + return handle; fail: FREE(tic); @@ -1472,7 +1480,7 @@ res->flags = (access & 3) << 8; if (res->buf->base.target == PIPE_BUFFER && access & PIPE_IMAGE_ACCESS_WRITE) - util_range_add(&res->buf->valid_buffer_range, + util_range_add(&res->buf->base, &res->buf->valid_buffer_range, tic->pipe.u.buf.offset, tic->pipe.u.buf.offset + tic->pipe.u.buf.size); list_add(&res->list, &nvc0->img_head); diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,5 @@ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nvc0/nvc0_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #include "nvc0/nvc0_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c 2020-06-12 01:21:17.000000000 +0000 @@ -2,7 +2,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #include "nvc0/nvc0_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_video.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_video.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nvc0_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nvc0_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "nvc0/nvc0_video.h" #include "util/u_sampler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static void nvc0_decoder_begin_frame(struct pipe_video_codec *decoder, diff -Nru mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nve4_compute.c mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nve4_compute.c --- mesa-19.2.8/src/gallium/drivers/nouveau/nvc0/nve4_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/nouveau/nvc0/nve4_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -448,7 +448,7 @@ PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); PUSH_DATA (push, 0); BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); - util_range_add(&res->valid_buffer_range, + util_range_add(&res->base, &res->valid_buffer_range, nvc0->buffers[s][i].buffer_offset, nvc0->buffers[s][i].buffer_offset + nvc0->buffers[s][i].buffer_size); diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/Android.mk mesa-20.0.8/src/gallium/drivers/panfrost/Android.mk --- mesa-19.2.8/src/gallium/drivers/panfrost/Android.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,59 @@ +# Copyright © 2019 Collabora Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +LOCAL_PATH := $(call my-dir) + +# get C_SOURCES +include $(LOCAL_PATH)/Makefile.sources + +include $(CLEAR_VARS) + +LOCAL_SRC_FILES := \ + $(C_SOURCES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/gallium/auxiliary/ \ + $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/panfrost/include/ \ + $(MESA_TOP)/src/panfrost/ + +LOCAL_MODULE := libmesa_pipe_panfrost + +LOCAL_SHARED_LIBRARIES := libdrm + +LOCAL_STATIC_LIBRARIES := \ + libmesa_nir \ + libmesa_winsys_panfrost \ + libpanfrost_bifrost \ + libpanfrost_decode \ + libpanfrost_encoder \ + libpanfrost_midgard \ + libpanfrost_shared + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES + +include $(GALLIUM_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + +ifneq ($(HAVE_GALLIUM_PANFROST),) +GALLIUM_TARGET_DRIVERS += panfrost +$(eval GALLIUM_LIBS += $(LOCAL_MODULE) libmesa_winsys_panfrost) +$(eval GALLIUM_SHARED_LIBS += $(LOCAL_SHARED_LIBRARIES)) +endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/arm64.config mesa-20.0.8/src/gallium/drivers/panfrost/ci/arm64.config --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/arm64.config 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/arm64.config 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ -CONFIG_LOCALVERSION="ccu" - -CONFIG_DEBUG_KERNEL=y - -CONFIG_DEVFREQ_GOV_PERFORMANCE=y -CONFIG_DEVFREQ_GOV_POWERSAVE=y -CONFIG_DEVFREQ_GOV_USERSPACE=y -CONFIG_DEVFREQ_GOV_PASSIVE=y - -CONFIG_DRM=y -CONFIG_DRM_ROCKCHIP=y -CONFIG_DRM_PANFROST=y -CONFIG_DRM_PANEL_SIMPLE=y -CONFIG_PWM_CROS_EC=y -CONFIG_BACKLIGHT_PWM=y - -CONFIG_ROCKCHIP_CDN_DP=n - -CONFIG_SPI_ROCKCHIP=y -CONFIG_PWM_ROCKCHIP=y -CONFIG_PHY_ROCKCHIP_DP=y -CONFIG_DWMAC_ROCKCHIP=y -CONFIG_STMMAC_ETH=y -CONFIG_TYPEC_FUSB302=y -CONFIG_TYPEC=y -CONFIG_TYPEC_TCPM=y - -CONFIG_ARCH_SUNXI=n -CONFIG_ARCH_ALPINE=n -CONFIG_ARCH_BCM2835=n -CONFIG_ARCH_BCM_IPROC=n -CONFIG_ARCH_BERLIN=n -CONFIG_ARCH_BRCMSTB=n -CONFIG_ARCH_EXYNOS=n -CONFIG_ARCH_K3=n -CONFIG_ARCH_LAYERSCAPE=n -CONFIG_ARCH_LG1K=n -CONFIG_ARCH_HISI=n -CONFIG_ARCH_MEDIATEK=n -CONFIG_ARCH_MESON=n -CONFIG_ARCH_MVEBU=n -CONFIG_ARCH_QCOM=n -CONFIG_ARCH_SEATTLE=n -CONFIG_ARCH_SYNQUACER=n -CONFIG_ARCH_RENESAS=n -CONFIG_ARCH_R8A774A1=n -CONFIG_ARCH_R8A774C0=n -CONFIG_ARCH_R8A7795=n -CONFIG_ARCH_R8A7796=n -CONFIG_ARCH_R8A77965=n -CONFIG_ARCH_R8A77970=n -CONFIG_ARCH_R8A77980=n -CONFIG_ARCH_R8A77990=n -CONFIG_ARCH_R8A77995=n -CONFIG_ARCH_STRATIX10=n -CONFIG_ARCH_TEGRA=n -CONFIG_ARCH_SPRD=n -CONFIG_ARCH_THUNDER=n -CONFIG_ARCH_THUNDER2=n -CONFIG_ARCH_UNIPHIER=n -CONFIG_ARCH_VEXPRESS=n -CONFIG_ARCH_XGENE=n -CONFIG_ARCH_ZX=n -CONFIG_ARCH_ZYNQMP=n - -CONFIG_ACPI=n - -CONFIG_REGULATOR_FAN53555=y -CONFIG_REGULATOR=y - -CONFIG_REGULATOR_VCTRL=y - -CONFIG_KASAN=n -CONFIG_KASAN_INLINE=n -CONFIG_STACKTRACE=n - -CONFIG_TMPFS=y - -CONFIG_PROVE_LOCKING=n -CONFIG_DEBUG_LOCKDEP=n -CONFIG_SOFTLOCKUP_DETECTOR=n -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n \ No newline at end of file diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/arm.config mesa-20.0.8/src/gallium/drivers/panfrost/ci/arm.config --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/arm.config 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/arm.config 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ -CONFIG_LOCALVERSION="ccu" - -CONFIG_DEBUG_KERNEL=y - -CONFIG_DEVFREQ_GOV_PERFORMANCE=y -CONFIG_DEVFREQ_GOV_POWERSAVE=y -CONFIG_DEVFREQ_GOV_USERSPACE=y -CONFIG_DEVFREQ_GOV_PASSIVE=y -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y - -CONFIG_DRM=y -CONFIG_DRM_ROCKCHIP=y -CONFIG_DRM_PANFROST=y -CONFIG_DRM_PANEL_SIMPLE=y -CONFIG_PWM_CROS_EC=y -CONFIG_BACKLIGHT_PWM=y - -CONFIG_ROCKCHIP_CDN_DP=n - -CONFIG_SPI_ROCKCHIP=y -CONFIG_PWM_ROCKCHIP=y -CONFIG_PHY_ROCKCHIP_DP=y -CONFIG_DWMAC_ROCKCHIP=y - -CONFIG_MFD_RK808=y -CONFIG_REGULATOR_RK808=y -CONFIG_RTC_DRV_RK808=y -CONFIG_COMMON_CLK_RK808=y - -CONFIG_REGULATOR_FAN53555=y -CONFIG_REGULATOR=y - -CONFIG_REGULATOR_VCTRL=y - -CONFIG_KASAN=n -CONFIG_KASAN_INLINE=n -CONFIG_STACKTRACE=n - -CONFIG_TMPFS=y - -CONFIG_PROVE_LOCKING=n -CONFIG_DEBUG_LOCKDEP=n -CONFIG_SOFTLOCKUP_DETECTOR=n -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/create-rootfs.sh mesa-20.0.8/src/gallium/drivers/panfrost/ci/create-rootfs.sh --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/create-rootfs.sh 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/create-rootfs.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,185 +0,0 @@ -#!/bin/sh - -set -ex - -apt-get -y install --no-install-recommends initramfs-tools libpng16-16 weston strace libsensors5 -passwd root -d -chsh -s /bin/sh -ln -s /bin/sh /init - -####################################################################### -# Strip the image to a small minimal system without removing the debian -# toolchain. - -# Copy timezone file and remove tzdata package -rm -rf /etc/localtime -cp /usr/share/zoneinfo/Etc/UTC /etc/localtime - - -UNNEEDED_PACKAGES=" libfdisk1"\ -" tzdata"\ - -export DEBIAN_FRONTEND=noninteractive - -# Removing unused packages -for PACKAGE in ${UNNEEDED_PACKAGES} -do - echo ${PACKAGE} - if ! apt-get remove --purge --yes "${PACKAGE}" - then - echo "WARNING: ${PACKAGE} isn't installed" - fi -done - -apt-get autoremove --yes || true - -# Dropping logs -rm -rf /var/log/* - -# Dropping documentation, localization, i18n files, etc -rm -rf /usr/share/doc/* -rm -rf /usr/share/locale/* -rm -rf /usr/share/man -rm -rf /usr/share/i18n/* -rm -rf /usr/share/info/* -rm -rf /usr/share/lintian/* -rm -rf /usr/share/common-licenses/* -rm -rf /usr/share/mime/* - -# Dropping reportbug scripts -rm -rf /usr/share/bug - -# Drop udev hwdb not required on a stripped system -rm -rf /lib/udev/hwdb.bin /lib/udev/hwdb.d/* - -# Drop all gconv conversions && binaries -rm -rf usr/bin/iconv -rm -rf usr/sbin/iconvconfig -rm -rf usr/lib/*/gconv/ - -# Remove libusb database -rm -rf usr/sbin/update-usbids -rm -rf var/lib/usbutils/usb.ids -rm -rf usr/share/misc/usb.ids - -####################################################################### -# Crush into a minimal production image to be deployed via some type of image -# updating system. -# IMPORTANT: The Debian system is not longer functional at this point, -# for example, apt and dpkg will stop working - -UNNEEDED_PACKAGES="apt libapt-pkg5.0 "\ -"ncurses-bin ncurses-base libncursesw5 libncurses5 "\ -"perl-base "\ -"debconf libdebconfclient0 "\ -"e2fsprogs e2fslibs libfdisk1 "\ -"insserv "\ -"udev "\ -"init-system-helpers "\ -"bash "\ -"cpio "\ -"passwd "\ -"libsemanage1 libsemanage-common "\ -"libsepol1 "\ -"gzip "\ -"gnupg "\ -"gpgv "\ -"hostname "\ -"adduser "\ -"debian-archive-keyring "\ -"libgl1 libgl1-mesa-dri libglapi-mesa libglvnd0 libglx-mesa0 libegl-mesa0 libgles2 "\ -"libllvm7 "\ -"libx11-data libthai-data "\ -"systemd dbus "\ - -# Removing unneeded packages -for PACKAGE in ${UNNEEDED_PACKAGES} -do - echo "Forcing removal of ${PACKAGE}" - if ! dpkg --purge --force-remove-essential --force-depends "${PACKAGE}" - then - echo "WARNING: ${PACKAGE} isn't installed" - fi -done - -# Show what's left package-wise before dropping dpkg itself -COLUMNS=300 dpkg-query -W --showformat='${Installed-Size;10}\t${Package}\n' | sort -k1,1n - -# Drop dpkg -dpkg --purge --force-remove-essential --force-depends dpkg - -# No apt or dpkg, no need for its configuration archives -rm -rf etc/apt -rm -rf etc/dpkg - -# Drop directories not part of ostree -# Note that /var needs to exist as ostree bind mounts the deployment /var over -# it -rm -rf var/* opt srv share - -# ca-certificates are in /etc drop the source -rm -rf usr/share/ca-certificates - -# No bash, no need for completions -rm -rf usr/share/bash-completion - -# No zsh, no need for comletions -rm -rf usr/share/zsh/vendor-completions - -# drop gcc-6 python helpers -rm -rf usr/share/gcc-6 - -# Drop sysvinit leftovers -rm -rf etc/init.d -rm -rf etc/rc[0-6S].d - -# Drop upstart helpers -rm -rf etc/init - -# Various xtables helpers -rm -rf usr/lib/xtables - -# Drop all locales -# TODO: only remaining locale is actually "C". Should we really remove it? -rm -rf usr/lib/locale/* - -# partition helpers -rm usr/sbin/*fdisk - -# local compiler -rm usr/bin/localedef - -# Systemd dns resolver -find usr etc -name '*systemd-resolve*' -prune -exec rm -r {} \; - -# Systemd network configuration -find usr etc -name '*networkd*' -prune -exec rm -r {} \; - -# systemd ntp client -find usr etc -name '*timesyncd*' -prune -exec rm -r {} \; - -# systemd hw database manager -find usr etc -name '*systemd-hwdb*' -prune -exec rm -r {} \; - -# No need for fuse -find usr etc -name '*fuse*' -prune -exec rm -r {} \; - -# lsb init function leftovers -rm -rf usr/lib/lsb - -# Only needed when adding libraries -rm usr/sbin/ldconfig* - -# Games, unused -rmdir usr/games - -# Remove pam module to authenticate against a DB -# plus libdb-5.3.so that is only used by this pam module -rm usr/lib/*/security/pam_userdb.so -rm usr/lib/*/libdb-5.3.so - -# remove NSS support for nis, nisplus and hesiod -rm usr/lib/*/libnss_hesiod* -rm usr/lib/*/libnss_nis* - -rm usr/bin/tar \ No newline at end of file diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/debian-install.sh mesa-20.0.8/src/gallium/drivers/panfrost/ci/debian-install.sh --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/debian-install.sh 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/debian-install.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,125 +0,0 @@ -#!/bin/bash - -set -e -set -o xtrace - -PANFROST_CI_DIR=/tmp/clone/src/gallium/drivers/panfrost/ci - -############### Install packages for building -dpkg --add-architecture ${DEBIAN_ARCH} -echo 'deb-src https://deb.debian.org/debian testing main' > /etc/apt/sources.list.d/deb-src.list -apt-get update -apt-get -y install ca-certificates -apt-get -y install --no-install-recommends \ - crossbuild-essential-${DEBIAN_ARCH} \ - meson \ - g++ \ - git \ - ccache \ - pkg-config \ - python3-mako \ - python-numpy \ - python-six \ - python-mako \ - python3-pip \ - python3-setuptools \ - python3-six \ - python3-wheel \ - python3-jinja2 \ - bison \ - flex \ - libwayland-dev \ - gettext \ - cmake \ - bc \ - libssl-dev \ - lavacli \ - csvkit \ - curl \ - unzip \ - wget \ - debootstrap \ - procps \ - qemu-user-static \ - cpio \ - \ - libdrm-dev:${DEBIAN_ARCH} \ - libx11-dev:${DEBIAN_ARCH} \ - libxxf86vm-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - libsensors-dev:${DEBIAN_ARCH} \ - libxfixes-dev:${DEBIAN_ARCH} \ - libxdamage-dev:${DEBIAN_ARCH} \ - libxext-dev:${DEBIAN_ARCH} \ - x11proto-dev:${DEBIAN_ARCH} \ - libx11-xcb-dev:${DEBIAN_ARCH} \ - libxcb-dri2-0-dev:${DEBIAN_ARCH} \ - libxcb-glx0-dev:${DEBIAN_ARCH} \ - libxcb-xfixes0-dev:${DEBIAN_ARCH} \ - libxcb-dri3-dev:${DEBIAN_ARCH} \ - libxcb-present-dev:${DEBIAN_ARCH} \ - libxcb-randr0-dev:${DEBIAN_ARCH} \ - libxcb-sync-dev:${DEBIAN_ARCH} \ - libxrandr-dev:${DEBIAN_ARCH} \ - libxshmfence-dev:${DEBIAN_ARCH} \ - libelf-dev:${DEBIAN_ARCH} \ - libwayland-dev:${DEBIAN_ARCH} \ - libwayland-egl-backend-dev:${DEBIAN_ARCH} \ - libclang-7-dev:${DEBIAN_ARCH} \ - zlib1g-dev:${DEBIAN_ARCH} \ - libglvnd-core-dev:${DEBIAN_ARCH} \ - wayland-protocols:${DEBIAN_ARCH} \ - libpng-dev:${DEBIAN_ARCH} - -############### Cross-build dEQP -mkdir -p /artifacts/rootfs/deqp - -wget https://github.com/KhronosGroup/VK-GL-CTS/archive/opengl-es-cts-3.2.5.0.zip -unzip -q opengl-es-cts-3.2.5.0.zip -d / -rm opengl-es-cts-3.2.5.0.zip - -cd /VK-GL-CTS-opengl-es-cts-3.2.5.0 -python3 external/fetch_sources.py - -cd /artifacts/rootfs/deqp -cmake -DDEQP_TARGET=wayland \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=${GCC_ARCH}-gcc \ - -DCMAKE_CXX_COMPILER=${GCC_ARCH}-g++ \ - /VK-GL-CTS-opengl-es-cts-3.2.5.0 -make -j$(nproc) -rm -rf /artifacts/rootfs/deqp/external -rm -rf /artifacts/rootfs/deqp/modules/gles3 -rm -rf /artifacts/rootfs/deqp/modules/gles31 -rm -rf /artifacts/rootfs/deqp/modules/internal -rm -rf /artifacts/rootfs/deqp/executor -rm -rf /artifacts/rootfs/deqp/execserver -rm -rf /artifacts/rootfs/deqp/modules/egl -rm -rf /artifacts/rootfs/deqp/framework -find . -name CMakeFiles | xargs rm -rf -find . -name lib\*.a | xargs rm -rf -du -sh * -rm -rf /VK-GL-CTS-opengl-es-cts-3.2.5.0 - - -############### Cross-build kernel -KERNEL_URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-5.2.tar.gz" -export ARCH=${KERNEL_ARCH} -export CROSS_COMPILE="${GCC_ARCH}-" - -mkdir -p /kernel -wget -qO- ${KERNEL_URL} | tar -xz --strip-components=1 -C /kernel -cd /kernel -./scripts/kconfig/merge_config.sh ${DEFCONFIG} ${PANFROST_CI_DIR}/${KERNEL_ARCH}.config -make -j12 ${KERNEL_IMAGE_NAME} dtbs -cp arch/${KERNEL_ARCH}/boot/${KERNEL_IMAGE_NAME} /artifacts/. -cp ${DEVICE_TREES} /artifacts/. -rm -rf /kernel - - -############### Create rootfs -cp ${PANFROST_CI_DIR}/create-rootfs.sh /artifacts/rootfs/. -debootstrap --variant=minbase --arch=${DEBIAN_ARCH} testing /artifacts/rootfs/ http://deb.debian.org/debian -chroot /artifacts/rootfs sh /create-rootfs.sh -rm /artifacts/rootfs/create-rootfs.sh - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/deqp-runner.sh mesa-20.0.8/src/gallium/drivers/panfrost/ci/deqp-runner.sh --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/deqp-runner.sh 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/deqp-runner.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -#!/bin/sh - -set -x - -DEQP_OPTIONS="--deqp-surface-width=256 --deqp-surface-height=256" -DEQP_OPTIONS="$DEQP_OPTIONS --deqp-visibility=hidden" -DEQP_OPTIONS="$DEQP_OPTIONS --deqp-log-images=disable" -DEQP_OPTIONS="$DEQP_OPTIONS --deqp-watchdog=enable" -DEQP_OPTIONS="$DEQP_OPTIONS --deqp-crashhandler=enable" - -export XDG_RUNTIME_DIR=/tmp -export LIBGL_DRIVERS_PATH=/mesa/lib/dri/ -export LD_LIBRARY_PATH=/mesa/lib/ -export XDG_CONFIG_HOME=$(pwd) -export MESA_GLES_VERSION_OVERRIDE=3.0 - -echo "[core]\nidle-time=0\nrequire-input=false\n[shell]\nlocking=false" > weston.ini - -cd /deqp/modules/gles2 - -# Generate test case list file -weston --tty=7 & -sleep 1 # Give some time for Weston to start up -./deqp-gles2 $DEQP_OPTIONS --deqp-runmode=stdout-caselist | grep "TEST: dEQP-GLES2" | cut -d ' ' -f 2 > /tmp/case-list.txt - -# Disable for now tests that are very slow, either by just using lots of CPU or by crashing -sed -i '/dEQP-GLES2.performance/d' /tmp/case-list.txt -sed -i '/dEQP-GLES2.stress/d' /tmp/case-list.txt -sed -i '/dEQP-GLES2.functional.fbo.render.depth./d' /tmp/case-list.txt -sed -i '/dEQP-GLES2.functional.flush_finish./d' /tmp/case-list.txt - -# Cannot use tee because dash doesn't have pipefail -touch /tmp/result.txt -tail -f /tmp/result.txt & - -while [ -s /tmp/case-list.txt ]; do - ./deqp-gles2 $DEQP_OPTIONS --deqp-log-filename=/dev/null --deqp-caselist-file=/tmp/case-list.txt >> /tmp/result.txt - if [ $? -ne 0 ]; then - # Continue from the subtest after the failing one - crashed_test=$(grep "Test case" /tmp/result.txt | tail -1 | sed "s/Test case '\(.*\)'\.\./\1/") - sed -i "0,/^$crashed_test$/d" /tmp/case-list.txt - - # So LAVA knows what happened - echo "Test case '$crashed_test'.. - Crash" - else - break - fi -done diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/expected-failures.txt mesa-20.0.8/src/gallium/drivers/panfrost/ci/expected-failures.txt --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/expected-failures.txt 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/expected-failures.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,237 +0,0 @@ -dEQP-GLES2.functional.buffer.write.use.index_array.array -dEQP-GLES2.functional.buffer.write.use.index_array.element_array -dEQP-GLES2.functional.color_clear.masked_rgb -dEQP-GLES2.functional.color_clear.masked_rgba -dEQP-GLES2.functional.color_clear.masked_scissored_rgb -dEQP-GLES2.functional.color_clear.masked_scissored_rgba -dEQP-GLES2.functional.color_clear.scissored_rgb -dEQP-GLES2.functional.color_clear.scissored_rgba -dEQP-GLES2.functional.color_clear.short_scissored_rgb -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb565_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb565_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb5_a1_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb5_a1_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgb_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgb_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgba_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgba_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb565_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb565_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb5_a1_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb5_a1_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgba4_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgba4_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgb_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgb_stencil_index8 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgba_depth_component16 -dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgba_stencil_index8 -dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgb565_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_colorbuffer.tex2d_rgb_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_colorbuffer.tex2d_rgba_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_depthbuffer.rbo_rgb565_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_depthbuffer.tex2d_rgb_depth_component16 -dEQP-GLES2.functional.fbo.render.shared_depthbuffer.tex2d_rgba_depth_component16 -dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgb -dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba -dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgb -dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.0 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.1 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.10 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.11 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.12 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.13 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.15 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.16 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.18 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.19 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.20 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.21 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.22 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.23 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.24 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.25 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.26 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.29 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.3 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.30 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.31 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.32 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.33 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.34 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.35 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.36 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.37 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.38 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.39 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.40 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.41 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.42 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.43 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.44 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.46 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.47 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.48 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.49 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.5 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.50 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.51 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.52 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.53 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.54 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.55 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.56 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.57 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.58 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.59 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.6 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.60 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.61 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.62 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.63 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.64 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.65 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.66 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.67 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.68 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.69 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.7 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.70 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.71 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.72 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.73 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.74 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.75 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.76 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.78 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.79 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.8 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.80 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.81 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.82 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.83 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.84 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.85 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.86 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.87 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.88 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.89 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.9 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.90 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.91 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.92 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.93 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.94 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.95 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.96 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.97 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.98 -dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.99 -dEQP-GLES2.functional.fragment_ops.random.0 -dEQP-GLES2.functional.fragment_ops.random.1 -dEQP-GLES2.functional.fragment_ops.random.10 -dEQP-GLES2.functional.fragment_ops.random.11 -dEQP-GLES2.functional.fragment_ops.random.12 -dEQP-GLES2.functional.fragment_ops.random.13 -dEQP-GLES2.functional.fragment_ops.random.14 -dEQP-GLES2.functional.fragment_ops.random.15 -dEQP-GLES2.functional.fragment_ops.random.16 -dEQP-GLES2.functional.fragment_ops.random.17 -dEQP-GLES2.functional.fragment_ops.random.18 -dEQP-GLES2.functional.fragment_ops.random.19 -dEQP-GLES2.functional.fragment_ops.random.2 -dEQP-GLES2.functional.fragment_ops.random.20 -dEQP-GLES2.functional.fragment_ops.random.21 -dEQP-GLES2.functional.fragment_ops.random.22 -dEQP-GLES2.functional.fragment_ops.random.23 -dEQP-GLES2.functional.fragment_ops.random.24 -dEQP-GLES2.functional.fragment_ops.random.25 -dEQP-GLES2.functional.fragment_ops.random.26 -dEQP-GLES2.functional.fragment_ops.random.27 -dEQP-GLES2.functional.fragment_ops.random.28 -dEQP-GLES2.functional.fragment_ops.random.29 -dEQP-GLES2.functional.fragment_ops.random.3 -dEQP-GLES2.functional.fragment_ops.random.30 -dEQP-GLES2.functional.fragment_ops.random.31 -dEQP-GLES2.functional.fragment_ops.random.32 -dEQP-GLES2.functional.fragment_ops.random.33 -dEQP-GLES2.functional.fragment_ops.random.34 -dEQP-GLES2.functional.fragment_ops.random.35 -dEQP-GLES2.functional.fragment_ops.random.36 -dEQP-GLES2.functional.fragment_ops.random.37 -dEQP-GLES2.functional.fragment_ops.random.38 -dEQP-GLES2.functional.fragment_ops.random.39 -dEQP-GLES2.functional.fragment_ops.random.4 -dEQP-GLES2.functional.fragment_ops.random.40 -dEQP-GLES2.functional.fragment_ops.random.41 -dEQP-GLES2.functional.fragment_ops.random.42 -dEQP-GLES2.functional.fragment_ops.random.43 -dEQP-GLES2.functional.fragment_ops.random.44 -dEQP-GLES2.functional.fragment_ops.random.45 -dEQP-GLES2.functional.fragment_ops.random.46 -dEQP-GLES2.functional.fragment_ops.random.47 -dEQP-GLES2.functional.fragment_ops.random.48 -dEQP-GLES2.functional.fragment_ops.random.49 -dEQP-GLES2.functional.fragment_ops.random.5 -dEQP-GLES2.functional.fragment_ops.random.50 -dEQP-GLES2.functional.fragment_ops.random.51 -dEQP-GLES2.functional.fragment_ops.random.52 -dEQP-GLES2.functional.fragment_ops.random.53 -dEQP-GLES2.functional.fragment_ops.random.54 -dEQP-GLES2.functional.fragment_ops.random.55 -dEQP-GLES2.functional.fragment_ops.random.56 -dEQP-GLES2.functional.fragment_ops.random.57 -dEQP-GLES2.functional.fragment_ops.random.58 -dEQP-GLES2.functional.fragment_ops.random.59 -dEQP-GLES2.functional.fragment_ops.random.6 -dEQP-GLES2.functional.fragment_ops.random.60 -dEQP-GLES2.functional.fragment_ops.random.61 -dEQP-GLES2.functional.fragment_ops.random.62 -dEQP-GLES2.functional.fragment_ops.random.63 -dEQP-GLES2.functional.fragment_ops.random.64 -dEQP-GLES2.functional.fragment_ops.random.65 -dEQP-GLES2.functional.fragment_ops.random.66 -dEQP-GLES2.functional.fragment_ops.random.67 -dEQP-GLES2.functional.fragment_ops.random.68 -dEQP-GLES2.functional.fragment_ops.random.69 -dEQP-GLES2.functional.fragment_ops.random.7 -dEQP-GLES2.functional.fragment_ops.random.70 -dEQP-GLES2.functional.fragment_ops.random.71 -dEQP-GLES2.functional.fragment_ops.random.72 -dEQP-GLES2.functional.fragment_ops.random.73 -dEQP-GLES2.functional.fragment_ops.random.74 -dEQP-GLES2.functional.fragment_ops.random.75 -dEQP-GLES2.functional.fragment_ops.random.76 -dEQP-GLES2.functional.fragment_ops.random.77 -dEQP-GLES2.functional.fragment_ops.random.78 -dEQP-GLES2.functional.fragment_ops.random.79 -dEQP-GLES2.functional.fragment_ops.random.8 -dEQP-GLES2.functional.fragment_ops.random.80 -dEQP-GLES2.functional.fragment_ops.random.81 -dEQP-GLES2.functional.fragment_ops.random.82 -dEQP-GLES2.functional.fragment_ops.random.83 -dEQP-GLES2.functional.fragment_ops.random.84 -dEQP-GLES2.functional.fragment_ops.random.85 -dEQP-GLES2.functional.fragment_ops.random.86 -dEQP-GLES2.functional.fragment_ops.random.87 -dEQP-GLES2.functional.fragment_ops.random.88 -dEQP-GLES2.functional.fragment_ops.random.89 -dEQP-GLES2.functional.fragment_ops.random.9 -dEQP-GLES2.functional.fragment_ops.random.90 -dEQP-GLES2.functional.fragment_ops.random.91 -dEQP-GLES2.functional.fragment_ops.random.92 -dEQP-GLES2.functional.fragment_ops.random.93 -dEQP-GLES2.functional.fragment_ops.random.94 -dEQP-GLES2.functional.fragment_ops.random.95 -dEQP-GLES2.functional.fragment_ops.random.96 -dEQP-GLES2.functional.fragment_ops.random.97 -dEQP-GLES2.functional.fragment_ops.random.98 -dEQP-GLES2.functional.fragment_ops.random.99 -dEQP-GLES2.functional.polygon_offset.fixed16_render_with_units -dEQP-GLES2.functional.polygon_offset.fixed16_factor_1_slope -dEQP-GLES2.functional.shaders.builtin_variable.fragcoord_w -dEQP-GLES2.functional.shaders.scoping.valid.local_variable_hides_function_parameter_fragment -dEQP-GLES2.functional.shaders.scoping.valid.local_variable_hides_function_parameter_vertex diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/generate_lava.py mesa-20.0.8/src/gallium/drivers/panfrost/ci/generate_lava.py --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/generate_lava.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/generate_lava.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -from jinja2 import Environment, FileSystemLoader -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--template") -parser.add_argument("--base-artifacts-url") -parser.add_argument("--arch") -parser.add_argument("--device-type") -parser.add_argument("--kernel-image-name") -args = parser.parse_args() - -env = Environment(loader = FileSystemLoader('.'), trim_blocks=True, lstrip_blocks=True) -template = env.get_template(args.template) - -values = {} -values['base_artifacts_url'] = args.base_artifacts_url -values['arch'] = args.arch -values['device_type'] = args.device_type -values['kernel_image_name'] = args.kernel_image_name - -print(template.render(values)) diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/gitlab-ci.yml mesa-20.0.8/src/gallium/drivers/panfrost/ci/gitlab-ci.yml --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/gitlab-ci.yml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/gitlab-ci.yml 1970-01-01 00:00:00.000000000 +0000 @@ -1,264 +0,0 @@ -# IMAGE_TAG is the tag of the docker image used for the build jobs. If the -# image doesn't exist yet, the docker-image stage generates it. -# -# In order to generate a new image, one should generally change the tag. -# While removing the image from the registry would also work, that's not -# recommended except for ephemeral images during development: Replacing an -# image after a significant amount of time might pull in newer versions of -# gcc/clang or other packages, which might break the build with older commits -# using the same tag. -# -# After merging a change resulting in generating a new image to the main -# repository, it's recommended to remove the image from the source repository's -# container registry, so that the image from the main repository's registry -# will be used there as well. You can manage your images on your fork of: -# https://gitlab.freedesktop.org/xorg/xserver/container_registry -variables: - UPSTREAM_REPO: mesa/mesa - DEBIAN_VERSION: testing-slim - IMAGE_TAG: "2019-07-25-1" - -include: - - project: 'wayland/ci-templates' - ref: c73dae8b84697ef18e2dbbf4fed7386d9652b0cd - file: '/templates/debian.yml' - -stages: - - containers - - build - - test - -# Retry jobs after runner system failures -.retry: &retry - retry: - max: 2 - when: - - runner_system_failure - -# Build Docker image with deqp, the rootfs and the build deps for Mesa -.container: - extends: .debian@container-ifnot-exists - stage: containers - <<: *retry - variables: - GIT_STRATEGY: none # no need to pull the whole tree for rebuilding the image - DEBIAN_TAG: '${DEBIAN_ARCH}-${IMAGE_TAG}' - DEBIAN_EXEC: 'DEBIAN_ARCH=${DEBIAN_ARCH} - GCC_ARCH=${GCC_ARCH} - KERNEL_ARCH=${KERNEL_ARCH} - DEFCONFIG=${DEFCONFIG} - DEVICE_TREES=${DEVICE_TREES} - KERNEL_IMAGE_NAME=${KERNEL_IMAGE_NAME} - bash src/gallium/drivers/panfrost/ci/debian-install.sh' - -container:armhf: - extends: .container - variables: - DEBIAN_ARCH: "armhf" - GCC_ARCH: "arm-linux-gnueabihf" - KERNEL_ARCH: "arm" - DEFCONFIG: "arch/arm/configs/multi_v7_defconfig" - DEVICE_TREES: "arch/arm/boot/dts/rk3288-veyron-jaq.dtb" - KERNEL_IMAGE_NAME: "zImage" - -container:arm64: - extends: .container - variables: - DEBIAN_ARCH: "arm64" - GCC_ARCH: "aarch64-linux-gnu" - KERNEL_ARCH: "arm64" - DEFCONFIG: "arch/arm64/configs/defconfig" - DEVICE_TREES: "arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dtb" - KERNEL_IMAGE_NAME: "Image" - -.build: - stage: build - image: $CI_REGISTRY_IMAGE/debian/$DEBIAN_VERSION:${DEBIAN_ARCH}-${IMAGE_TAG} - cache: - paths: - - ccache - before_script: - - mkdir -p results mesa-build - - mkdir -p ccache - script: - - export CCACHE_BASEDIR=$CI_PROJECT_DIR - - export CCACHE_DIR=$CI_PROJECT_DIR/ccache - - export PATH="/usr/lib/ccache:$PATH" - - ccache -s - - # Build Mesa - - /usr/share/meson/debcrossgen --arch ${DEBIAN_ARCH} -o /tmp/cross_file.txt - - meson . mesa-build - --cross-file /tmp/cross_file.txt - --libdir /artifacts/rootfs/mesa/lib/ - --buildtype release - -Dgallium-drivers=kmsro,panfrost - -Ddri-drivers= - -Dprefix=/artifacts/rootfs/mesa - - ninja -C mesa-build install - - du -sh /artifacts/rootfs/mesa/* - - rm -rf /artifacts/rootfs/mesa/include - - # Pack rootfs - - cp src/gallium/drivers/panfrost/ci/deqp-runner.sh /artifacts/rootfs/deqp/. - - du -sh /artifacts/rootfs/deqp/* - - find /artifacts/rootfs/ -type f -printf "%s\t%p\n" | sort -n - - cd /artifacts/rootfs/ ; find -H | cpio -H newc -v -o | gzip -c - > $CI_PROJECT_DIR/results/panfrost-rootfs-${DEBIAN_ARCH}.cpio.gz - - # Copy kernel and DT - - cp /artifacts/${KERNEL_IMAGE_NAME} /artifacts/*.dtb $CI_PROJECT_DIR/results/. - - # Generate LAVA job - - cd $CI_PROJECT_DIR - - src/gallium/drivers/panfrost/ci/generate_lava.py - --template src/gallium/drivers/panfrost/ci/lava-deqp.yml.jinja2 - --arch ${DEBIAN_ARCH} - --base-artifacts-url $CI_PROJECT_URL/-/jobs/$CI_JOB_ID/artifacts/raw/results - --device-type ${DEVICE_TYPE} - --kernel-image-name ${KERNEL_IMAGE_NAME} - > results/lava-deqp.yml - - cp src/gallium/drivers/panfrost/ci/expected-failures.txt results/. - artifacts: - when: always - paths: - - results/ - -build:armhf: - extends: .build - variables: - DEBIAN_ARCH: "armhf" - GCC_ARCH: "arm-linux-gnueabihf" - DEVICE_TYPE: "rk3288-veyron-jaq" - KERNEL_IMAGE_NAME: "zImage" - -build:arm64: - extends: .build - variables: - DEBIAN_ARCH: "arm64" - GCC_ARCH: "aarch64-linux-gnu" - DEVICE_TYPE: "rk3399-gru-kevin" - KERNEL_IMAGE_NAME: "Image" - -.test: - stage: test - tags: - - idle-jobs - image: $CI_REGISTRY_IMAGE/debian/$DEBIAN_VERSION:arm64-${IMAGE_TAG} # Any of the images will be fine - variables: - GIT_STRATEGY: none # no need to pull the whole tree for submitting the job - script: - - mkdir -p ~/.config/ - - | - echo "default: - uri: https://lava.collabora.co.uk/RPC2 - timeout: 120 - username: jenkins-fdo - token: $LAVA_TOKEN - " > ~/.config/lavacli.yaml - - lava_job_id=`lavacli jobs submit $CI_PROJECT_DIR/results/lava-deqp.yml` || echo $lava_job_id - - lavacli jobs logs $lava_job_id | grep -a -v "{'case':" | tee results/lava-deqp-$lava_job_id.log - - lavacli jobs show $lava_job_id - - curl "https://lava.collabora.co.uk/results/$lava_job_id/csv?user=jenkins-fdo&token=$LAVA_TOKEN" > raw_results.csv - - cat raw_results.csv | csvcut -c 12,3 | grep dEQP-GLES2 | sort > results/results-$lava_job_id.csv - - # FIXME: Remove flip-flops from comparison files - - | - FLIP_FLOPS=" - dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_neg_x_neg_z_and_pos_x_pos_z_and_neg_x_neg_y_pos_z - dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_y_pos_z_and_neg_x_neg_y_pos_z_and_pos_x_pos_y_neg_z - dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgb5_a1 - dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgb5_a1_depth_component16 - dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgba4 - dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgba4_depth_component16 - dEQP-GLES2.functional.fbo.render.color.blend_npot_rbo_rgb5_a1 - dEQP-GLES2.functional.fbo.render.color.blend_npot_rbo_rgb5_a1_depth_component16 - dEQP-GLES2.functional.fbo.render.color.blend_npot_rbo_rgba4 - dEQP-GLES2.functional.fbo.render.color.blend_npot_rbo_rgba4_depth_component16 - dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgb5_a1 - dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgb5_a1_depth_component16 - dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgb5_a1_stencil_index8 - dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_depth_component16 - dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_stencil_index8 - dEQP-GLES2.functional.fbo.render.recreate_depthbuffer. - dEQP-GLES2.functional.fbo.render.recreate_stencilbuffer. - dEQP-GLES2.functional.fbo.render.shared_colorbuffer_clear.rbo_rgb5_a1 - dEQP-GLES2.functional.fbo.render.shared_colorbuffer_clear.rbo_rgba4 - dEQP-GLES2.functional.fbo.render.shared_colorbuffer_clear.tex2d_rgb - dEQP-GLES2.functional.fbo.render.shared_colorbuffer_clear.tex2d_rgba - dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgb5_a1 - dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgba4 - dEQP-GLES2.functional.fbo.render.shared_depthbuffer.rbo_rgb5_a1_depth_component16 - dEQP-GLES2.functional.fbo.render.shared_depthbuffer.rbo_rgba4_depth_component16 - dEQP-GLES2.functional.fbo.render.stencil_clear.rbo_rgb5_a1_stencil_index8 - dEQP-GLES2.functional.fbo.render.stencil.npot_rbo_rgb5_a1_stencil_index8 - dEQP-GLES2.functional.fbo.render.stencil.npot_rbo_rgba4_stencil_index8 - dEQP-GLES2.functional.fbo.render.stencil.rbo_rgb5_a1_stencil_index8 - dEQP-GLES2.functional.fbo.render.stencil.rbo_rgba4_stencil_index8 - dEQP-GLES2.functional.lifetime.attach.deleted_input.renderbuffer_framebuffer - dEQP-GLES2.functional.lifetime.attach.deleted_output.renderbuffer_framebuffer - dEQP-GLES2.functional.polygon_offset.fixed16_factor_0_slope - dEQP-GLES2.functional.polygon_offset.fixed16_factor_1_slope - dEQP-GLES2.functional.shaders.invariance.highp.loop_4 - dEQP-GLES2.functional.shaders.matrix.mul.dynamic_highp_mat4_vec4_vertex - dEQP-GLES2.functional.shaders.matrix.mul.dynamic_highp_vec4_mat4_fragment - dEQP-GLES2.functional.shaders.operator.common_functions.smoothstep.mediump_vec3_vertex - dEQP-GLES2.functional.shaders.random.all_features.fragment.12 - dEQP-GLES2.functional.shaders.random.all_features.fragment.37 - dEQP-GLES2.functional.texture.units.2_units.mixed.1 - dEQP-GLES2.functional.texture.units.2_units.mixed.3 - dEQP-GLES2.functional.texture.units.2_units.only_2d.2 - dEQP-GLES2.functional.texture.units.4_units.mixed.5 - dEQP-GLES2.functional.texture.units.4_units.only_2d.0 - dEQP-GLES2.functional.texture.units.8_units.only_cube.2 - dEQP-GLES2.functional.texture.units.all_units.mixed.6 - dEQP-GLES2.functional.texture.units.all_units.only_cube.4 - dEQP-GLES2.functional.texture.units.all_units.only_cube.7 - dEQP-GLES2.functional.texture.units.all_units.only_cube.8 - " - - # FIXME: These tests fail in RK3288 but pass on RK3399 - - | - FLIP_FLOPS="$FLIP_FLOPS - dEQP-GLES2.functional.fragment_ops.blend.* - dEQP-GLES2.functional.shaders.builtin_variable.max_draw_buffers_vertex - dEQP-GLES2.functional.shaders.matrix.div.const_lowp_mat2_mat2_vertex - dEQP-GLES2.functional.shaders.operator.unary_operator.pre_increment_effect.highp_ivec4_vertex - dEQP-GLES2.functional.shaders.texture_functions.vertex.texture2dprojlod_vec3 - dEQP-GLES2.functional.shaders.swizzles.vector_swizzles.mediump_ivec3_stts_fragment - dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_nearest_linear_repeat_rgba8888 - dEQP-GLES2.functional.shaders.swizzles.vector_swizzles.mediump_ivec3_stts_fragment - dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.only_continue_vertex - dEQP-GLES2.functional.fbo.render.resize.tex2d_rgb_depth_component16 - dEQP-GLES2.functional.fbo.render.resize.tex2d_rgba_depth_component16 - dEQP-GLES2.functional.texture.filtering.2d.linear_mipmap_nearest_nearest_clamp_rgba8888 - " - - - for test in $FLIP_FLOPS; do sed -i "/$test/d" results/expected-failures.txt results/results-$lava_job_id.csv; done - - - PASSED=$(grep pass$ results/results-$lava_job_id.csv | wc -l) - - FAILED=$(grep fail$ results/results-$lava_job_id.csv | wc -l) - - TOTAL=$(wc -l < results/results-$lava_job_id.csv) - - 'echo "Passed: $PASSED ($(expr $PASSED \* 100 / $TOTAL)%)"' - - 'echo "Failed: $FAILED ($(expr $FAILED \* 100 / $TOTAL)%)"' - - 'echo "Total: $TOTAL"' - - 'if [ $TOTAL != 16374 ]; then echo "WARNING: Unexpected count of results. Incomplete run?"; fi' - - - sed '/,pass/d' results/results-$lava_job_id.csv | sed 's/,fail//' > results/failures-$lava_job_id.txt - - # Don't error out on RK3288 - - diff -u results/expected-failures.txt results/failures-$lava_job_id.txt || [ -f results/rk3288-veyron-jaq.dtb ] - artifacts: - when: always - paths: - - results/ - -test:armhf: - extends: .test - dependencies: - - build:armhf - -test:arm64: - extends: .test - dependencies: - - build:arm64 - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/ci/lava-deqp.yml.jinja2 mesa-20.0.8/src/gallium/drivers/panfrost/ci/lava-deqp.yml.jinja2 --- mesa-19.2.8/src/gallium/drivers/panfrost/ci/lava-deqp.yml.jinja2 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/ci/lava-deqp.yml.jinja2 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ -job_name: panfrost-deqp -device_type: {{ device_type }} -timeouts: - job: - minutes: 40 - action: - minutes: 10 - actions: - power-off: - seconds: 30 -priority: medium -visibility: public -actions: -- deploy: - timeout: - minutes: 2 - to: tftp - kernel: - url: {{ base_artifacts_url }}/{{ kernel_image_name }} - ramdisk: - url: {{ base_artifacts_url }}/panfrost-rootfs-{{ arch }}.cpio.gz - compression: gz - dtb: - url: {{ base_artifacts_url }}/{{ device_type }}.dtb - os: oe -- boot: - timeout: - minutes: 5 - method: depthcharge - commands: ramdisk - prompts: - - '#' -- test: - timeout: - minutes: 40 - definitions: - - repository: - metadata: - format: Lava-Test Test Definition 1.0 - name: igt - description: "IGT test plan" - os: - - oe - scope: - - functional - run: - steps: - - mount -t proc none /proc - - mount -t sysfs none /sys - - mount -t devtmpfs none /dev - - mkdir -p /dev/pts - - mount -t devpts devpts /dev/pts - - echo 1 > /proc/sys/kernel/printk -# - echo performance > /sys/devices/platform/ff9a0000.gpu/devfreq/devfreq0/governor - - sh /deqp/deqp-runner.sh - parse: - pattern: 'Test case ''(?P\S*)''..\s+(?P(Pass|NotSupported|QualityWarning|CompatibilityWarning|Fail|ResourceError|Crash|Timeout|InternalError))' - fixupdict: - Pass: pass - NotSupported: pass - QualityWarning: pass - CompatibilityWarning: pass - Fail: fail - ResourceError: fail - Crash: fail - Timeout: fail - InternalError: fail - from: inline - name: deqp - path: inline/lava-deqp.yaml diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/Makefile.sources mesa-20.0.8/src/gallium/drivers/panfrost/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/panfrost/Makefile.sources 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,37 @@ +C_SOURCES := \ + nir/nir_lower_blend.c \ + nir/nir_lower_blend.h \ + nir/nir_lower_framebuffer.c \ + \ + pan_afbc.c \ + pan_allocate.c \ + pan_allocate.h \ + pan_assemble.c \ + pan_blend_cso.c \ + pan_blend.h \ + pan_blending.c \ + pan_blending.h \ + pan_blend_shaders.c \ + pan_blend_shaders.h \ + pan_blit.c \ + pan_bo.c \ + pan_bo.h \ + pan_compute.c \ + pan_context.c \ + pan_context.h \ + pan_format.c \ + pan_format.h \ + pan_fragment.c \ + pan_attributes.c \ + pan_job.c \ + pan_job.h \ + pan_mfbd.c \ + pan_public.h \ + pan_resource.c \ + pan_resource.h \ + pan_scoreboard.c \ + pan_screen.c \ + pan_screen.h \ + pan_sfbd.c \ + pan_util.h \ + pan_varyings.c diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/meson.build mesa-20.0.8/src/gallium/drivers/panfrost/meson.build --- mesa-19.2.8/src/gallium/drivers/panfrost/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -26,17 +26,14 @@ 'pan_resource.c', 'pan_resource.h', - 'nir/nir_undef_to_zero.c', 'nir/nir_lower_blend.c', 'nir/nir_lower_framebuffer.c', - 'nir/nir_clamp_psiz.c', 'pan_context.c', 'pan_afbc.c', - 'pan_bo_cache.c', + 'pan_bo.c', 'pan_blit.c', 'pan_job.c', - 'pan_drm.c', 'pan_allocate.c', 'pan_assemble.c', 'pan_format.c', @@ -45,12 +42,10 @@ 'pan_blend_cso.c', 'pan_compute.c', 'pan_fragment.c', - 'pan_invocation.c', - 'pan_instancing.c', + 'pan_attributes.c', 'pan_scoreboard.c', 'pan_sfbd.c', 'pan_mfbd.c', - 'pan_tiler.c', 'pan_varyings.c', ) @@ -82,5 +77,5 @@ driver_panfrost = declare_dependency( compile_args : compile_args_panfrost, - link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode], + link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode, libpanfrost_encoder], ) diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_clamp_psiz.c mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_clamp_psiz.c --- mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_clamp_psiz.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_clamp_psiz.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,75 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/** - * @file - * - * Clamps writes to VARYING_SLOT_PSIZ to a given limit. - */ - -#include "compiler/nir/nir.h" -#include "compiler/nir/nir_builder.h" - -void -nir_clamp_psiz(nir_shader *shader, float min_size, float max_size); - -void -nir_clamp_psiz(nir_shader *shader, float min_size, float max_size) -{ - nir_foreach_function(func, shader) { - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_deref) - continue; - - nir_variable *var = nir_intrinsic_get_var(intr, 0); - if (var->data.location != VARYING_SLOT_PSIZ) - continue; - - nir_builder b; - nir_builder_init(&b, func->impl); - b.cursor = nir_before_instr(instr); - - nir_ssa_def *in_size = nir_ssa_for_src(&b, intr->src[1], - intr->num_components); - - nir_ssa_def *clamped = - nir_fmin(&b, - nir_fmax(&b, in_size, nir_imm_float(&b, min_size)), - nir_imm_float(&b, max_size)); - - nir_instr_rewrite_src(instr, &intr->src[1], - nir_src_for_ssa(clamped)); - - } - } - - nir_metadata_preserve(func->impl, nir_metadata_block_index | - nir_metadata_dominance); - } -} - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.c mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.c --- mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -82,7 +82,7 @@ { nir_ssa_def *Asrc = nir_channel(b, src, 3); nir_ssa_def *Adst = nir_channel(b, dst, 3); - nir_ssa_def *one = nir_imm_float16(b, 1.0); + nir_ssa_def *one = nir_imm_float(b, 1.0); nir_ssa_def *Adsti = nir_fsub(b, one, Adst); return (chan < 3) ? nir_fmin(b, Asrc, Adsti) : one; @@ -99,7 +99,7 @@ { switch (factor) { case BLEND_FACTOR_ZERO: - return nir_imm_float16(b, 0.0); + return nir_imm_float(b, 0.0); case BLEND_FACTOR_SRC_COLOR: return nir_channel(b, src, chan); case BLEND_FACTOR_DST_COLOR: @@ -132,7 +132,7 @@ nir_blend_factor_value(b, src, dst, bconst, chan, factor); if (inverted) - f = nir_fsub(b, nir_imm_float16(b, 1.0), f); + f = nir_fsub(b, nir_imm_float(b, 1.0), f); return nir_fmul(b, raw_scalar, f); } @@ -167,7 +167,7 @@ nir_ssa_def *src, nir_ssa_def *dst) { /* Grab the blend constant ahead of time */ - nir_ssa_def *bconst = nir_f2f16(b, nir_load_blend_const_color_rgba(b)); + nir_ssa_def *bconst = nir_load_blend_const_color_rgba(b); /* We blend per channel and recombine later */ nir_ssa_def *channels[4]; @@ -175,7 +175,7 @@ for (unsigned c = 0; c < 4; ++c) { /* Decide properties based on channel */ nir_lower_blend_channel chan = - (c < 3) ? options.rt[0].rgb : options.rt[0].alpha; + (c < 3) ? options.rgb : options.alpha; nir_ssa_def *psrc = nir_channel(b, src, c); nir_ssa_def *pdst = nir_channel(b, dst, c); @@ -197,7 +197,7 @@ /* Then just recombine with an applied colormask */ nir_ssa_def *blended = nir_vec(b, channels, 4); - return nir_color_mask(b, options.rt[0].colormask, blended, dst); + return nir_color_mask(b, options.colormask, blended, dst); } static bool @@ -214,8 +214,8 @@ nir_is_blend_replace(nir_lower_blend_options options) { return - nir_is_blend_channel_replace(options.rt[0].rgb) && - nir_is_blend_channel_replace(options.rt[0].alpha); + nir_is_blend_channel_replace(options.rgb) && + nir_is_blend_channel_replace(options.alpha); } void @@ -249,13 +249,13 @@ b.cursor = nir_before_instr(instr); /* Grab the input color */ - nir_ssa_def *src = nir_f2f16(&b, nir_ssa_for_src(&b, intr->src[1], 4)); + nir_ssa_def *src = nir_ssa_for_src(&b, intr->src[1], 4); /* Grab the tilebuffer color - io lowered to load_output */ - nir_ssa_def *dst = nir_f2f16(&b, nir_load_var(&b, var)); + nir_ssa_def *dst = nir_load_var(&b, var); /* Blend the two colors per the passed options */ - nir_ssa_def *blended = nir_f2f32(&b, nir_blend(&b, options, src, dst)); + nir_ssa_def *blended = nir_blend(&b, options, src, dst); /* Write out the final color instead of the input */ nir_instr_rewrite_src(instr, &intr->src[1], diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.h mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.h --- mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,18 +43,17 @@ } nir_lower_blend_channel; typedef struct { - struct { - nir_lower_blend_channel rgb; - nir_lower_blend_channel alpha; + nir_lower_blend_channel rgb; + nir_lower_blend_channel alpha; - /* 4-bit colormask. 0x0 for none, 0xF for RGBA, 0x1 for R */ - unsigned colormask; - } rt[8]; + /* 4-bit colormask. 0x0 for none, 0xF for RGBA, 0x1 for R */ + unsigned colormask; } nir_lower_blend_options; void nir_lower_blend(nir_shader *shader, nir_lower_blend_options options); void -nir_lower_framebuffer(nir_shader *shader, enum pipe_format format); +nir_lower_framebuffer(nir_shader *shader, enum pipe_format format, + unsigned gpu_id); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_framebuffer.c mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_framebuffer.c --- mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_lower_framebuffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_lower_framebuffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -41,7 +41,30 @@ #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" #include "nir_lower_blend.h" -#include "util/u_format.h" +#include "util/format/u_format.h" + +/* Determines the best NIR intrinsic to load a tile buffer of a given type, + * using native format conversion where possible. RGBA8 UNORM has a fast path + * (on some chips). Otherwise, we default to raw reads. */ + +static nir_intrinsic_op +nir_best_load_for_format( + const struct util_format_description *desc, + unsigned *special_bitsize, + unsigned *special_components, + unsigned gpu_id) +{ + if (util_format_is_unorm8(desc) && gpu_id != 0x750) { + *special_bitsize = 16; + return nir_intrinsic_load_output_u8_as_fp16_pan; + } else if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { + *special_bitsize = 32; + *special_components = 1; + return nir_intrinsic_load_raw_output_pan; + } else + return nir_intrinsic_load_raw_output_pan; +} + /* Converters for UNORM8 formats, e.g. R8G8B8A8_UNORM */ @@ -49,14 +72,14 @@ nir_float_to_unorm8(nir_builder *b, nir_ssa_def *c_float) { /* First, we degrade quality to fp16; we don't need the extra bits */ - nir_ssa_def *degraded = nir_f2f16(b, c_float); + nir_ssa_def *degraded = /*nir_f2f16(b, c_float)*/c_float; /* Scale from [0, 1] to [0, 255.0] */ nir_ssa_def *scaled = nir_fmul_imm(b, nir_fsat(b, degraded), 255.0); /* Next, we type convert */ nir_ssa_def *converted = nir_u2u8(b, nir_f2u16(b, - nir_fround_even(b, scaled))); + nir_fround_even(b, nir_f2f16(b, scaled)))); return converted; } @@ -65,7 +88,7 @@ nir_unorm8_to_float(nir_builder *b, nir_ssa_def *c_native) { /* First, we convert up from u8 to f16 */ - nir_ssa_def *converted = nir_u2f16(b, nir_u2u16(b, c_native)); + nir_ssa_def *converted = nir_f2f32(b, nir_u2f16(b, nir_u2u16(b, c_native))); /* Next, we scale down from [0, 255.0] to [0, 1] */ nir_ssa_def *scaled = nir_fsat(b, nir_fmul_imm(b, converted, 1.0/255.0)); @@ -204,6 +227,7 @@ static nir_ssa_def * nir_native_to_shader(nir_builder *b, nir_ssa_def *c_native, + nir_intrinsic_op op, const struct util_format_description *desc, unsigned bits, bool homogenous_bits) @@ -212,18 +236,45 @@ util_format_is_float(desc->format) || util_format_is_pure_integer(desc->format); + /* Handle preconverted formats */ + if (op == nir_intrinsic_load_output_u8_as_fp16_pan) { + assert(util_format_is_unorm8(desc)); + return nir_f2f32(b, c_native); + } + + /* Otherwise, we're raw */ + assert(op == nir_intrinsic_load_raw_output_pan); + if (util_format_is_unorm8(desc)) return nir_unorm8_to_float(b, c_native); else if (homogenous_bits && float_or_pure_int) return c_native; /* type is already correct */ - else { + + /* Special formats */ + switch (desc->format) { + case PIPE_FORMAT_R11G11B10_FLOAT: { + nir_ssa_def *unpacked = nir_format_unpack_11f11f10f(b, c_native); + + /* Extend to vec4 with alpha */ + nir_ssa_def *components[4] = { + nir_channel(b, unpacked, 0), + nir_channel(b, unpacked, 1), + nir_channel(b, unpacked, 2), + nir_imm_float(b, 1.0) + }; + + return nir_vec(b, components, 4); + } + + default: printf("%s\n", desc->name); unreachable("Unknown format name"); } } void -nir_lower_framebuffer(nir_shader *shader, enum pipe_format format) +nir_lower_framebuffer(nir_shader *shader, enum pipe_format format, + unsigned gpu_id) { /* Blend shaders are represented as special fragment shaders */ assert(shader->info.stage == MESA_SHADER_FRAGMENT); @@ -287,6 +338,22 @@ /* Grab the input color */ nir_ssa_def *c_nir = nir_ssa_for_src(&b, intr->src[1], 4); + /* Apply sRGB transform */ + + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + nir_ssa_def *rgb = nir_channels(&b, c_nir, 0x7); + nir_ssa_def *trans = nir_format_linear_to_srgb(&b, rgb); + + nir_ssa_def *comp[4] = { + nir_channel(&b, trans, 0), + nir_channel(&b, trans, 1), + nir_channel(&b, trans, 2), + nir_channel(&b, c_nir, 3), + }; + + c_nir = nir_vec(&b, comp, 4); + } + /* Format convert */ nir_ssa_def *converted = nir_shader_to_native(&b, c_nir, format_desc, bits, homogenous_bits); @@ -314,20 +381,29 @@ /* For loads, add conversion after */ b.cursor = nir_after_instr(instr); - /* Rewrite to use a native load by creating a new intrinsic */ - - nir_intrinsic_instr *new = - nir_intrinsic_instr_create(shader, nir_intrinsic_load_raw_output_pan); + /* Determine the best op for the format/hardware */ + unsigned bitsize = raw_bitsize_in; + unsigned components = 4; + nir_intrinsic_op op = nir_best_load_for_format(format_desc, + &bitsize, + &components, + gpu_id); - new->num_components = 4; + /* Rewrite to use a native load by creating a new intrinsic */ + nir_intrinsic_instr *new = nir_intrinsic_instr_create(shader, op); + new->num_components = components; - unsigned bitsize = raw_bitsize_in; - nir_ssa_dest_init(&new->instr, &new->dest, 4, bitsize, NULL); + nir_ssa_dest_init(&new->instr, &new->dest, components, bitsize, NULL); nir_builder_instr_insert(&b, &new->instr); /* Convert the raw value */ nir_ssa_def *raw = &new->dest.ssa; - nir_ssa_def *converted = nir_native_to_shader(&b, raw, format_desc, bits, homogenous_bits); + nir_ssa_def *converted = nir_native_to_shader(&b, raw, op, format_desc, bits, homogenous_bits); + + if (util_format_is_float(format)) + converted = nir_f2f32(&b, converted); + else + converted = nir_i2i32(&b, converted); /* Rewrite to use the converted value */ nir_src rewritten = nir_src_for_ssa(converted); diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_undef_to_zero.c mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_undef_to_zero.c --- mesa-19.2.8/src/gallium/drivers/panfrost/nir/nir_undef_to_zero.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/nir/nir_undef_to_zero.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig - */ - -/** - * @file - * - * Flushes undefined SSA values to a zero vector fo the appropriate component - * count, to avoid undefined behaviour in the resulting shader. Not required - * for conformance as use of uninitialized variables is explicitly left - * undefined by the spec. Works around buggy apps, however. - * - * Call immediately after nir_opt_undef. If called before, larger optimization - * opportunities from the former pass will be missed. If called outside of an - * optimization loop, constant propagation and algebraic optimizations won't be - * able to kick in to reduce stuff consuming the zero. - */ - -#include "compiler/nir/nir.h" -#include "compiler/nir/nir_builder.h" - -bool nir_undef_to_zero(nir_shader *shader); - -bool -nir_undef_to_zero(nir_shader *shader) -{ - bool progress = false; - - nir_foreach_function(function, shader) { - if (!function->impl) continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_ssa_undef) continue; - - nir_ssa_undef_instr *und = nir_instr_as_ssa_undef(instr); - - /* Get the required size */ - unsigned c = und->def.num_components; - unsigned s = und->def.bit_size; - - nir_const_value v[NIR_MAX_VEC_COMPONENTS]; - memset(v, 0, sizeof(v)); - - b.cursor = nir_before_instr(instr); - nir_ssa_def *zero = nir_build_imm(&b, c, s, v); - nir_src zerosrc = nir_src_for_ssa(zero); - - nir_ssa_def_rewrite_uses(&und->def, zerosrc); - - progress |= true; - } - } - - nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); - - } - - return progress; -} - - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_afbc.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_afbc.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_afbc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_afbc.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "pan_resource.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /* Arm FrameBuffer Compression (AFBC) is a lossless compression scheme natively * implemented in Mali GPUs (as well as many display controllers paired with diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_allocate.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_allocate.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_allocate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_allocate.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,42 +29,19 @@ #include #include #include +#include "pan_bo.h" #include "pan_context.h" /* TODO: What does this actually have to be? */ #define ALIGNMENT 128 -/* Allocate a new transient slab */ - -static struct panfrost_bo * -panfrost_create_slab(struct panfrost_screen *screen, unsigned *index) -{ - /* Allocate a new slab on the screen */ - - struct panfrost_bo **new = - util_dynarray_grow(&screen->transient_bo, - struct panfrost_bo *, 1); - - struct panfrost_bo *alloc = panfrost_drm_create_bo(screen, TRANSIENT_SLAB_SIZE, 0); - - *new = alloc; - - /* Return the BO as well as the index we just added */ - - *index = util_dynarray_num_elements(&screen->transient_bo, void *) - 1; - return alloc; -} - /* Transient command stream pooling: command stream uploads try to simply copy * into whereever we left off. If there isn't space, we allocate a new entry * into the pool and copy there */ struct panfrost_transfer -panfrost_allocate_transient(struct panfrost_context *ctx, size_t sz) +panfrost_allocate_transient(struct panfrost_batch *batch, size_t sz) { - struct panfrost_screen *screen = pan_screen(ctx->base.screen); - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); - /* Pad the size */ sz = ALIGN_POT(sz, ALIGNMENT); @@ -72,56 +49,37 @@ struct panfrost_bo *bo = NULL; unsigned offset = 0; - bool update_offset = false; - bool has_current = batch->transient_indices.size; bool fits_in_current = (batch->transient_offset + sz) < TRANSIENT_SLAB_SIZE; - if (likely(has_current && fits_in_current)) { - /* We can reuse the topmost BO, so get it */ - unsigned idx = util_dynarray_top(&batch->transient_indices, unsigned); - bo = pan_bo_for_index(screen, idx); + if (likely(batch->transient_bo && fits_in_current)) { + /* We can reuse the current BO, so get it */ + bo = batch->transient_bo; /* Use the specified offset */ offset = batch->transient_offset; - update_offset = true; - } else if (sz < TRANSIENT_SLAB_SIZE) { - /* We can't reuse the topmost BO, but we can get a new one. - * First, look for a free slot */ - - unsigned count = util_dynarray_num_elements(&screen->transient_bo, void *); - unsigned index = 0; - - unsigned free = __bitset_ffs( - screen->free_transient, - count / BITSET_WORDBITS); - - if (likely(free)) { - /* Use this one */ - index = free - 1; - - /* It's ours, so no longer free */ - BITSET_CLEAR(screen->free_transient, index); - - /* Grab the BO */ - bo = pan_bo_for_index(screen, index); - } else { - /* Otherwise, create a new BO */ - bo = panfrost_create_slab(screen, &index); - } - - /* Remember we created this */ - util_dynarray_append(&batch->transient_indices, unsigned, index); - - update_offset = true; + batch->transient_offset = offset + sz; } else { - /* Create a new BO and reference it */ - bo = panfrost_drm_create_bo(screen, ALIGN_POT(sz, 4096), 0); - panfrost_job_add_bo(batch, bo); - - /* Creating a BO adds a reference, and then the job adds a - * second one. So we need to pop back one reference */ - panfrost_bo_unreference(&screen->base, bo); + size_t bo_sz = sz < TRANSIENT_SLAB_SIZE ? + TRANSIENT_SLAB_SIZE : ALIGN_POT(sz, 4096); + + /* We can't reuse the current BO, but we can create a new one. + * We don't know what the BO will be used for, so let's flag it + * RW and attach it to both the fragment and vertex/tiler jobs. + * TODO: if we want fine grained BO assignment we should pass + * flags to this function and keep the read/write, + * fragment/vertex+tiler pools separate. + */ + bo = panfrost_batch_create_bo(batch, bo_sz, 0, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); + + if (sz < TRANSIENT_SLAB_SIZE) { + batch->transient_bo = bo; + batch->transient_offset = offset + sz; + } } struct panfrost_transfer ret = { @@ -129,40 +87,15 @@ .gpu = bo->gpu + offset, }; - if (update_offset) - batch->transient_offset = offset + sz; - return ret; } mali_ptr -panfrost_upload_transient(struct panfrost_context *ctx, const void *data, size_t sz) +panfrost_upload_transient(struct panfrost_batch *batch, const void *data, + size_t sz) { - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sz); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sz); memcpy(transfer.cpu, data, sz); return transfer.gpu; } - -/* The code below is exclusively for the use of shader memory and is subject to - * be rewritten soon enough since it never frees the memory it allocates. Here - * be dragons, etc. */ - -mali_ptr -panfrost_upload(struct panfrost_memory *mem, const void *data, size_t sz) -{ - size_t aligned_sz = ALIGN_POT(sz, ALIGNMENT); - - /* Bounds check */ - if ((mem->stack_bottom + aligned_sz) >= mem->bo->size) { - printf("Out of memory, tried to upload %zd but only %zd available\n", - sz, mem->bo->size - mem->stack_bottom); - assert(0); - } - - memcpy((uint8_t *) mem->bo->cpu + mem->stack_bottom, data, sz); - mali_ptr gpu = mem->bo->gpu + mem->stack_bottom; - - mem->stack_bottom += aligned_sz; - return gpu; -} diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_allocate.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_allocate.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_allocate.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_allocate.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ #include "util/list.h" -struct panfrost_context; +struct panfrost_batch; /* Represents a fat pointer for GPU-mapped memory, returned from the transient * allocator and not used for much else */ @@ -43,46 +43,11 @@ mali_ptr gpu; }; -struct panfrost_bo { - /* Must be first for casting */ - struct list_head link; - - struct pipe_reference reference; - - /* Mapping for the entire object (all levels) */ - uint8_t *cpu; - - /* GPU address for the object */ - mali_ptr gpu; - - /* Size of all entire trees */ - size_t size; - - int gem_handle; - - uint32_t flags; -}; - -struct panfrost_memory { - /* Backing for the slab in memory */ - struct panfrost_bo *bo; - int stack_bottom; -}; - -/* Functions for the actual Galliumish driver */ -mali_ptr panfrost_upload(struct panfrost_memory *mem, const void *data, size_t sz); - struct panfrost_transfer -panfrost_allocate_transient(struct panfrost_context *ctx, size_t sz); +panfrost_allocate_transient(struct panfrost_batch *batch, size_t sz); mali_ptr -panfrost_upload_transient(struct panfrost_context *ctx, const void *data, size_t sz); - -static inline mali_ptr -panfrost_reserve(struct panfrost_memory *mem, size_t sz) -{ - mem->stack_bottom += sz; - return mem->bo->gpu + (mem->stack_bottom - sz); -} +panfrost_upload_transient(struct panfrost_batch *batch, const void *data, + size_t sz); #endif /* __PAN_ALLOCATE_H__ */ diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_assemble.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_assemble.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_assemble.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_assemble.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,9 @@ #include #include #include +#include "pan_bo.h" #include "pan_context.h" +#include "pan_util.h" #include "compiler/nir/nir.h" #include "nir/tgsi_to_nir.h" @@ -58,21 +60,14 @@ s->info.stage = stage; - if (stage == MESA_SHADER_FRAGMENT) { - /* Inject the alpha test now if we need to */ - - if (state->alpha_state.enabled) { - NIR_PASS_V(s, nir_lower_alpha_test, state->alpha_state.func, false); - } - } - /* Call out to Midgard compiler given the above NIR */ midgard_program program = { .alpha_ref = state->alpha_state.ref_value }; - midgard_compile_shader_nir(&ctx->compiler, s, &program, false); + midgard_compile_shader_nir(s, &program, false, 0, screen->gpu_id, + pan_debug & PAN_DBG_PRECOMPILE); /* Prepare the compiled binary for upload */ int size = program.compiled.size; @@ -82,9 +77,14 @@ * I bet someone just thought that would be a cute pun. At least, * that's how I'd do it. */ - state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE); - memcpy(state->bo->cpu, dst, size); - meta->shader = state->bo->gpu | program.first_tag; + if (size) { + state->bo = panfrost_bo_create(screen, size, PAN_BO_EXECUTE); + memcpy(state->bo->cpu, dst, size); + meta->shader = state->bo->gpu | program.first_tag; + } else { + /* no shader */ + meta->shader = 0x0; + } util_dynarray_fini(&program.compiled); @@ -96,10 +96,20 @@ meta->midgard1.uniform_count = MIN2(program.uniform_count, program.uniform_cutoff); meta->midgard1.work_count = program.work_register_count; + bool vertex_id = s->info.system_values_read & (1 << SYSTEM_VALUE_VERTEX_ID); + bool instance_id = s->info.system_values_read & (1 << SYSTEM_VALUE_INSTANCE_ID); + switch (stage) { case MESA_SHADER_VERTEX: meta->attribute_count = util_bitcount64(s->info.inputs_read); meta->varying_count = util_bitcount64(s->info.outputs_written); + + if (vertex_id) + meta->attribute_count = MAX2(meta->attribute_count, PAN_VERTEX_ID + 1); + + if (instance_id) + meta->attribute_count = MAX2(meta->attribute_count, PAN_INSTANCE_ID + 1); + break; case MESA_SHADER_FRAGMENT: meta->attribute_count = 0; @@ -118,6 +128,7 @@ state->writes_point_size = program.writes_point_size; state->reads_point_coord = false; state->helper_invocations = s->info.fs.needs_helper_invocations; + state->stack_size = program.tls_size; if (outputs_written) *outputs_written = s->info.outputs_written; @@ -137,7 +148,7 @@ /* Default to a vec4 varying */ struct mali_attr_meta v = { - .format = MALI_RGBA32F, + .format = program.varying_type[i], .swizzle = default_vec4_swizzle, .unknown1 = 0x2, }; @@ -145,7 +156,10 @@ /* Check for special cases, otherwise assume general varying */ if (location == VARYING_SLOT_POS) { - v.format = MALI_VARYING_POS; + if (stage == MESA_SHADER_FRAGMENT) + state->reads_frag_coord = true; + else + v.format = MALI_VARYING_POS; } else if (location == VARYING_SLOT_PSIZ) { v.format = MALI_R16F; v.swizzle = default_vec1_swizzle; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_attributes.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_attributes.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_attributes.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_attributes.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "pan_bo.h" +#include "pan_context.h" + +void +panfrost_emit_vertex_data(struct panfrost_batch *batch) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_vertex_state *so = ctx->vertex; + + /* Staged mali_attr, and index into them. i =/= k, depending on the + * vertex buffer mask and instancing. Twice as much room is allocated, + * for a worst case of NPOT_DIVIDEs which take up extra slot */ + union mali_attr attrs[PIPE_MAX_ATTRIBS * 2]; + unsigned k = 0; + + for (unsigned i = 0; i < so->num_elements; ++i) { + /* We map a mali_attr to be 1:1 with the mali_attr_meta, which + * means duplicating some vertex buffers (who cares? aside from + * maybe some caching implications but I somehow doubt that + * matters) */ + + struct pipe_vertex_element *elem = &so->pipe[i]; + unsigned vbi = elem->vertex_buffer_index; + + /* The exception to 1:1 mapping is that we can have multiple + * entries (NPOT divisors), so we fixup anyways */ + + so->hw[i].index = k; + + if (!(ctx->vb_mask & (1 << vbi))) continue; + + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; + struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); + + if (!rsrc) continue; + + /* Align to 64 bytes by masking off the lower bits. This + * will be adjusted back when we fixup the src_offset in + * mali_attr_meta */ + + mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset; + mali_ptr addr = raw_addr & ~63; + unsigned chopped_addr = raw_addr - addr; + + /* Add a dependency of the batch on the vertex buffer */ + panfrost_batch_add_bo(batch, rsrc->bo, + PAN_BO_ACCESS_SHARED | + PAN_BO_ACCESS_READ | + PAN_BO_ACCESS_VERTEX_TILER); + + /* Set common fields */ + attrs[k].elements = addr; + attrs[k].stride = buf->stride; + + /* Since we advanced the base pointer, we shrink the buffer + * size */ + attrs[k].size = rsrc->base.width0 - buf->buffer_offset; + + /* We need to add the extra size we masked off (for + * correctness) so the data doesn't get clamped away */ + attrs[k].size += chopped_addr; + + /* For non-instancing make sure we initialize */ + attrs[k].shift = attrs[k].extra_flags = 0; + + /* Instancing uses a dramatically different code path than + * linear, so dispatch for the actual emission now that the + * common code is finished */ + + unsigned divisor = elem->instance_divisor; + + if (divisor && ctx->instance_count == 1) { + /* Silly corner case where there's a divisor(=1) but + * there's no legitimate instancing. So we want *every* + * attribute to be the same. So set stride to zero so + * we don't go anywhere. */ + + attrs[k].size = attrs[k].stride + chopped_addr; + attrs[k].stride = 0; + attrs[k++].elements |= MALI_ATTR_LINEAR; + } else if (ctx->instance_count <= 1) { + /* Normal, non-instanced attributes */ + attrs[k++].elements |= MALI_ATTR_LINEAR; + } else { + unsigned instance_shift = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift; + unsigned instance_odd = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd; + + k += panfrost_vertex_instanced(batch->ctx->padded_count, + instance_shift, instance_odd, divisor, &attrs[k]); + } + } + + /* Add special gl_VertexID/gl_InstanceID buffers */ + + panfrost_vertex_id(ctx->padded_count, &attrs[k]); + so->hw[PAN_VERTEX_ID].index = k++; + + panfrost_instance_id(ctx->padded_count, &attrs[k]); + so->hw[PAN_INSTANCE_ID].index = k++; + + /* Upload whatever we emitted and go */ + + ctx->payloads[PIPE_SHADER_VERTEX].postfix.attributes = + panfrost_upload_transient(batch, attrs, k * sizeof(union mali_attr)); +} + + diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_cso.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_cso.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_cso.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_cso.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,6 +29,8 @@ #include "util/u_memory.h" #include "pan_blend_shaders.h" #include "pan_blending.h" +#include "pan_bo.h" +#include "panfrost-quirks.h" /* A given Gallium blend state can be encoded to the hardware in numerous, * dramatically divergent ways due to the interactions of blending with @@ -73,11 +75,12 @@ /* Prevent NULL collision issues.. */ assert(fmt != 0); - /* Check the cache */ + /* Check the cache. Key by the RT and format */ struct hash_table_u64 *shaders = blend->rt[rt].shaders; + unsigned key = (fmt << 3) | rt; struct panfrost_blend_shader *shader = - _mesa_hash_table_u64_search(shaders, fmt); + _mesa_hash_table_u64_search(shaders, key); if (shader) return shader; @@ -85,10 +88,10 @@ /* Cache miss. Build one instead, cache it, and go */ struct panfrost_blend_shader generated = - panfrost_compile_blend_shader(ctx, &blend->base, fmt); + panfrost_compile_blend_shader(ctx, &blend->base, fmt, rt); shader = mem_dup(&generated, sizeof(generated)); - _mesa_hash_table_u64_insert(shaders, fmt, shader); + _mesa_hash_table_u64_insert(shaders, key, shader); return shader; } @@ -148,7 +151,7 @@ if (!blend) return; - if (screen->require_sfbd) { + if (screen->quirks & MIDGARD_SFBD) { SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_DITHER, !blend->dither); } @@ -224,10 +227,9 @@ /* Create a final blend given the context */ struct panfrost_blend_final -panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti) +panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset) { - struct panfrost_screen *screen = pan_screen(ctx->base.screen); - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); /* Grab the format, falling back gracefully if called invalidly (which * has to happen for no-color-attachment FBOs, for instance) */ @@ -271,22 +273,32 @@ final.shader.work_count = shader->work_count; final.shader.first_tag = shader->first_tag; - /* Upload the shader */ - final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE); - memcpy(final.shader.bo->cpu, shader->buffer, shader->size); - - /* Pass BO ownership to job */ - panfrost_job_add_bo(job, final.shader.bo); - panfrost_bo_unreference(ctx->base.screen, final.shader.bo); + /* Upload the shader, sharing a BO */ + if (!(*bo)) { + *bo = panfrost_batch_create_bo(batch, 4096, + PAN_BO_EXECUTE, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_READ | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); + } + + /* Size check */ + assert((*shader_offset + shader->size) < 4096); + + memcpy((*bo)->cpu + *shader_offset, shader->buffer, shader->size); + final.shader.gpu = (*bo)->gpu + *shader_offset; if (shader->patch_index) { /* We have to specialize the blend shader to use constants, so * patch in the current constants */ - float *patch = (float *) (final.shader.bo->cpu + shader->patch_index); + float *patch = (float *) ((*bo)->cpu + *shader_offset + shader->patch_index); memcpy(patch, ctx->blend_color.color, sizeof(float) * 4); } + *shader_offset += shader->size; + return final; } diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend.h 2020-06-12 01:21:17.000000000 +0000 @@ -55,8 +55,8 @@ /* A blend shader descriptor ready for actual use */ struct panfrost_blend_shader_final { - /* The compiled shader in GPU memory, possibly patched */ - struct panfrost_bo *bo; + /* GPU address where we're compiled to */ + uint64_t gpu; /* First instruction tag (for tagging the pointer) */ unsigned first_tag; @@ -113,6 +113,6 @@ panfrost_blend_context_init(struct pipe_context *pipe); struct panfrost_blend_final -panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rt); +panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rt, struct panfrost_bo **bo, unsigned *shader_offset); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blending.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_blending.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blending.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blending.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "pan_blending.h" #include "pan_context.h" #include "gallium/auxiliary/util/u_blend.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /* * Implements fixed-function blending on Midgard. diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_shaders.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_shaders.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_shaders.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_shaders.c 2020-06-12 01:21:17.000000000 +0000 @@ -85,42 +85,40 @@ */ static nir_lower_blend_options -nir_make_options(const struct pipe_blend_state *blend, unsigned nr_cbufs) +nir_make_options(const struct pipe_blend_state *blend, unsigned i) { nir_lower_blend_options options; - for (unsigned i = 0; i < nr_cbufs; ++i) { - /* If blend is disabled, we just use replace mode */ + /* If blend is disabled, we just use replace mode */ - nir_lower_blend_channel rgb = { - .func = BLEND_FUNC_ADD, - .src_factor = BLEND_FACTOR_ZERO, - .invert_src_factor = true, - .dst_factor = BLEND_FACTOR_ZERO, - .invert_dst_factor = false - }; - - nir_lower_blend_channel alpha = rgb; - - if (blend->rt[i].blend_enable) { - rgb.func = util_blend_func_to_shader(blend->rt[i].rgb_func); - rgb.src_factor = util_blend_factor_to_shader(blend->rt[i].rgb_src_factor); - rgb.dst_factor = util_blend_factor_to_shader(blend->rt[i].rgb_dst_factor); - rgb.invert_src_factor = util_blend_factor_is_inverted(blend->rt[i].rgb_src_factor); - rgb.invert_dst_factor = util_blend_factor_is_inverted(blend->rt[i].rgb_dst_factor); - - alpha.func = util_blend_func_to_shader(blend->rt[i].alpha_func); - alpha.src_factor = util_blend_factor_to_shader(blend->rt[i].alpha_src_factor); - alpha.dst_factor = util_blend_factor_to_shader(blend->rt[i].alpha_dst_factor); - alpha.invert_src_factor = util_blend_factor_is_inverted(blend->rt[i].alpha_src_factor); - alpha.invert_dst_factor = util_blend_factor_is_inverted(blend->rt[i].alpha_dst_factor); - } + nir_lower_blend_channel rgb = { + .func = BLEND_FUNC_ADD, + .src_factor = BLEND_FACTOR_ZERO, + .invert_src_factor = true, + .dst_factor = BLEND_FACTOR_ZERO, + .invert_dst_factor = false + }; + + nir_lower_blend_channel alpha = rgb; + + if (blend->rt[i].blend_enable) { + rgb.func = util_blend_func_to_shader(blend->rt[i].rgb_func); + rgb.src_factor = util_blend_factor_to_shader(blend->rt[i].rgb_src_factor); + rgb.dst_factor = util_blend_factor_to_shader(blend->rt[i].rgb_dst_factor); + rgb.invert_src_factor = util_blend_factor_is_inverted(blend->rt[i].rgb_src_factor); + rgb.invert_dst_factor = util_blend_factor_is_inverted(blend->rt[i].rgb_dst_factor); + + alpha.func = util_blend_func_to_shader(blend->rt[i].alpha_func); + alpha.src_factor = util_blend_factor_to_shader(blend->rt[i].alpha_src_factor); + alpha.dst_factor = util_blend_factor_to_shader(blend->rt[i].alpha_dst_factor); + alpha.invert_src_factor = util_blend_factor_is_inverted(blend->rt[i].alpha_src_factor); + alpha.invert_dst_factor = util_blend_factor_is_inverted(blend->rt[i].alpha_dst_factor); + } - options.rt[i].rgb = rgb; - options.rt[i].alpha = alpha; + options.rgb = rgb; + options.alpha = alpha; - options.rt[i].colormask = blend->rt[i].colormask; - } + options.colormask = blend->rt[i].colormask; return options; } @@ -129,8 +127,10 @@ panfrost_compile_blend_shader( struct panfrost_context *ctx, struct pipe_blend_state *cso, - enum pipe_format format) + enum pipe_format format, + unsigned rt) { + struct panfrost_screen *screen = pan_screen(ctx->base.screen); struct panfrost_blend_shader res; res.ctx = ctx; @@ -164,18 +164,15 @@ nir_store_var(b, c_out, s_src, 0xFF); nir_lower_blend_options options = - nir_make_options(cso, 1); + nir_make_options(cso, rt); NIR_PASS_V(shader, nir_lower_blend, options); - NIR_PASS_V(shader, nir_lower_framebuffer, format); + NIR_PASS_V(shader, nir_lower_framebuffer, format, screen->gpu_id); /* Compile the built shader */ midgard_program program; - midgard_compile_shader_nir(&ctx->compiler, shader, &program, true); - - /* At least two work registers are needed due to an encoding quirk */ - res.work_count = MAX2(program.work_register_count, 2); + midgard_compile_shader_nir(shader, &program, true, rt, screen->gpu_id, false); /* Allow us to patch later */ res.patch_index = program.blend_patch_offset; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_shaders.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_shaders.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blend_shaders.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blend_shaders.h 2020-06-12 01:21:17.000000000 +0000 @@ -35,6 +35,7 @@ panfrost_compile_blend_shader( struct panfrost_context *ctx, struct pipe_blend_state *cso, - enum pipe_format format); + enum pipe_format format, + unsigned rt); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_blit.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_blit.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ */ #include "pan_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static void panfrost_blitter_save( @@ -105,16 +105,17 @@ void panfrost_blit_wallpaper(struct panfrost_context *ctx, struct pipe_box *box) { - struct pipe_blit_info binfo = { }; + struct panfrost_batch *batch = ctx->wallpaper_batch; + struct pipe_blit_info binfo = {0}; panfrost_blitter_save(ctx, ctx->blitter_wallpaper); - struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0]; + struct pipe_surface *surf = batch->key.cbufs[0]; unsigned level = surf->u.tex.level; unsigned layer = surf->u.tex.first_layer; assert(surf->u.tex.last_layer == layer); - binfo.src.resource = binfo.dst.resource = ctx->pipe_framebuffer.cbufs[0]->texture; + binfo.src.resource = binfo.dst.resource = batch->key.cbufs[0]->texture; binfo.src.level = binfo.dst.level = level; binfo.src.box.x = binfo.dst.box.x = box->x; binfo.src.box.y = binfo.dst.box.y = box->y; @@ -123,9 +124,9 @@ binfo.src.box.height = binfo.dst.box.height = box->height; binfo.src.box.depth = binfo.dst.box.depth = 1; - binfo.src.format = binfo.dst.format = ctx->pipe_framebuffer.cbufs[0]->format; + binfo.src.format = binfo.dst.format = batch->key.cbufs[0]->format; - assert(ctx->pipe_framebuffer.nr_cbufs == 1); + assert(batch->key.nr_cbufs == 1); binfo.mask = PIPE_MASK_RGBA; binfo.filter = PIPE_TEX_FILTER_LINEAR; binfo.scissor_enable = FALSE; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,529 @@ +/* + * Copyright 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ +#include +#include +#include +#include +#include +#include "drm-uapi/panfrost_drm.h" + +#include "pan_bo.h" +#include "pan_screen.h" +#include "pan_util.h" +#include "pandecode/decode.h" + +#include "os/os_mman.h" + +#include "util/u_inlines.h" +#include "util/u_math.h" + +/* This file implements a userspace BO cache. Allocating and freeing + * GPU-visible buffers is very expensive, and even the extra kernel roundtrips + * adds more work than we would like at this point. So caching BOs in userspace + * solves both of these problems and does not require kernel updates. + * + * Cached BOs are sorted into a bucket based on rounding their size down to the + * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo + * objects. Putting a BO into the cache is accomplished by adding it to the + * corresponding bucket. Getting a BO from the cache consists of finding the + * appropriate bucket and sorting. A cache eviction is a kernel-level free of a + * BO and removing it from the bucket. We special case evicting all BOs from + * the cache, since that's what helpful in practice and avoids extra logic + * around the linked list. + */ + +static struct panfrost_bo * +panfrost_bo_alloc(struct panfrost_screen *screen, size_t size, + uint32_t flags) +{ + struct drm_panfrost_create_bo create_bo = { .size = size }; + struct panfrost_bo *bo; + int ret; + + if (screen->kernel_version->version_major > 1 || + screen->kernel_version->version_minor >= 1) { + if (flags & PAN_BO_GROWABLE) + create_bo.flags |= PANFROST_BO_HEAP; + if (!(flags & PAN_BO_EXECUTE)) + create_bo.flags |= PANFROST_BO_NOEXEC; + } + + ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; + } + + bo = rzalloc(screen, struct panfrost_bo); + assert(bo); + bo->size = create_bo.size; + bo->gpu = create_bo.offset; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->screen = screen; + return bo; +} + +static void +panfrost_bo_free(struct panfrost_bo *bo) +{ + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + + ret = drmIoctl(bo->screen->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + + ralloc_free(bo); +} + +/* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. + * Say you want to make sure all writers are done writing, you should pass + * PAN_BO_ACCESS_WRITE. + * If you want to wait for all users, you should pass PAN_BO_ACCESS_RW. + * PAN_BO_ACCESS_READ would work too as waiting for readers implies + * waiting for writers as well, but we want to make things explicit and waiting + * only for readers is impossible. + */ +bool +panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, + uint32_t access_type) +{ + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + + assert(access_type == PAN_BO_ACCESS_WRITE || + access_type == PAN_BO_ACCESS_RW); + + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ + if (!(bo->flags & (PAN_BO_IMPORTED | PAN_BO_EXPORTED))) { + /* If ->gpu_access is 0, the BO is idle, no need to wait. */ + if (!bo->gpu_access) + return true; + + /* If the caller only wants to wait for writers and no + * writes are pending, we don't have to wait. + */ + if (access_type == PAN_BO_ACCESS_WRITE && + !(bo->gpu_access & PAN_BO_ACCESS_WRITE)) + return true; + } + + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ + ret = drmIoctl(bo->screen->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. + */ + bo->gpu_access = 0; + return true; + } + + /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed + * is invalid, which shouldn't happen here. + */ + assert(errno == ETIMEDOUT || errno == EBUSY); + return false; +} + +/* Helper to calculate the bucket index of a BO */ + +static unsigned +pan_bucket_index(unsigned size) +{ + /* Round down to POT to compute a bucket index */ + + unsigned bucket_index = util_logbase2(size); + + /* Clamp the bucket index; all huge allocations will be + * sorted into the largest bucket */ + + bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET); + + /* The minimum bucket size must equal the minimum allocation + * size; the maximum we clamped */ + + assert(bucket_index >= MIN_BO_CACHE_BUCKET); + assert(bucket_index <= MAX_BO_CACHE_BUCKET); + + /* Reindex from 0 */ + return (bucket_index - MIN_BO_CACHE_BUCKET); +} + +static struct list_head * +pan_bucket(struct panfrost_screen *screen, unsigned size) +{ + return &screen->bo_cache.buckets[pan_bucket_index(size)]; +} + +/* Tries to fetch a BO of sufficient size with the appropriate flags from the + * BO cache. If it succeeds, it returns that BO and removes the BO from the + * cache. If it fails, it returns NULL signaling the caller to allocate a new + * BO. */ + +static struct panfrost_bo * +panfrost_bo_cache_fetch(struct panfrost_screen *screen, + size_t size, uint32_t flags, bool dontwait) +{ + pthread_mutex_lock(&screen->bo_cache.lock); + struct list_head *bucket = pan_bucket(screen, size); + struct panfrost_bo *bo = NULL; + + /* Iterate the bucket looking for something suitable */ + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, + bucket_link) { + if (entry->size < size || entry->flags != flags) + continue; + + if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, + PAN_BO_ACCESS_RW)) + continue; + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; + int ret; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + + ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; + } + /* Let's go! */ + bo = entry; + break; + } + pthread_mutex_unlock(&screen->bo_cache.lock); + + return bo; +} + +static void +panfrost_bo_cache_evict_stale_bos(struct panfrost_screen *screen) +{ + struct timespec time; + + clock_gettime(CLOCK_MONOTONIC, &time); + list_for_each_entry_safe(struct panfrost_bo, entry, + &screen->bo_cache.lru, lru_link) { + /* We want all entries that have been used more than 1 sec + * ago to be dropped, others can be kept. + * Note the <= 2 check and not <= 1. It's here to account for + * the fact that we're only testing ->tv_sec, not ->tv_nsec. + * That means we might keep entries that are between 1 and 2 + * seconds old, but we don't really care, as long as unused BOs + * are dropped at some point. + */ + if (time.tv_sec - entry->last_used <= 2) + break; + + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } +} + +/* Tries to add a BO to the cache. Returns if it was + * successful */ + +static bool +panfrost_bo_cache_put(struct panfrost_bo *bo) +{ + struct panfrost_screen *screen = bo->screen; + + if (bo->flags & PAN_BO_DONT_REUSE) + return false; + + pthread_mutex_lock(&screen->bo_cache.lock); + struct list_head *bucket = pan_bucket(screen, bo->size); + struct drm_panfrost_madvise madv; + struct timespec time; + + madv.handle = bo->gem_handle; + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + + drmIoctl(screen->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); + + /* Add us to the LRU list and update the last_used field. */ + list_addtail(&bo->lru_link, &screen->bo_cache.lru); + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ + panfrost_bo_cache_evict_stale_bos(screen); + pthread_mutex_unlock(&screen->bo_cache.lock); + + return true; +} + +/* Evicts all BOs from the cache. Called during context + * destroy or during low-memory situations (to free up + * memory that may be unused by us just sitting in our + * cache, but still reserved from the perspective of the + * OS) */ + +void +panfrost_bo_cache_evict_all( + struct panfrost_screen *screen) +{ + pthread_mutex_lock(&screen->bo_cache.lock); + for (unsigned i = 0; i < ARRAY_SIZE(screen->bo_cache.buckets); ++i) { + struct list_head *bucket = &screen->bo_cache.buckets[i]; + + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, + bucket_link) { + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } + } + pthread_mutex_unlock(&screen->bo_cache.lock); +} + +void +panfrost_bo_mmap(struct panfrost_bo *bo) +{ + struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle }; + int ret; + + if (bo->cpu) + return; + + ret = drmIoctl(bo->screen->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); + assert(0); + } + + bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bo->screen->fd, mmap_bo.offset); + if (bo->cpu == MAP_FAILED) { + fprintf(stderr, "mmap failed: %p %m\n", bo->cpu); + assert(0); + } + + /* Record the mmap if we're tracing */ + if (pan_debug & PAN_DBG_TRACE) + pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL); +} + +static void +panfrost_bo_munmap(struct panfrost_bo *bo) +{ + if (!bo->cpu) + return; + + if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) { + perror("munmap"); + abort(); + } + + bo->cpu = NULL; +} + +struct panfrost_bo * +panfrost_bo_create(struct panfrost_screen *screen, size_t size, + uint32_t flags) +{ + struct panfrost_bo *bo; + + /* Kernel will fail (confusingly) with EPERM otherwise */ + assert(size > 0); + + /* To maximize BO cache usage, don't allocate tiny BOs */ + size = MAX2(size, 4096); + + /* GROWABLE BOs cannot be mmapped */ + if (flags & PAN_BO_GROWABLE) + assert(flags & PAN_BO_INVISIBLE); + + /* Before creating a BO, we first want to check the cache but without + * waiting for BO readiness (BOs in the cache can still be referenced + * by jobs that are not finished yet). + * If the cached allocation fails we fall back on fresh BO allocation, + * and if that fails too, we try one more time to allocate from the + * cache, but this time we accept to wait. + */ + bo = panfrost_bo_cache_fetch(screen, size, flags, true); + if (!bo) + bo = panfrost_bo_alloc(screen, size, flags); + if (!bo) + bo = panfrost_bo_cache_fetch(screen, size, flags, false); + + if (!bo) + fprintf(stderr, "BO creation failed\n"); + + assert(bo); + + /* Only mmap now if we know we need to. For CPU-invisible buffers, we + * never map since we don't care about their contents; they're purely + * for GPU-internal use. But we do trace them anyway. */ + + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + else if (flags & PAN_BO_INVISIBLE) { + if (pan_debug & PAN_DBG_TRACE) + pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL); + } + + pipe_reference_init(&bo->reference, 1); + + pthread_mutex_lock(&screen->active_bos_lock); + _mesa_set_add(bo->screen->active_bos, bo); + pthread_mutex_unlock(&screen->active_bos_lock); + + return bo; +} + +void +panfrost_bo_reference(struct panfrost_bo *bo) +{ + if (bo) + pipe_reference(NULL, &bo->reference); +} + +void +panfrost_bo_unreference(struct panfrost_bo *bo) +{ + if (!bo) + return; + + if (!pipe_reference(&bo->reference, NULL)) + return; + + struct panfrost_screen *screen = bo->screen; + + pthread_mutex_lock(&screen->active_bos_lock); + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ + if (!pipe_is_referenced(&bo->reference)) { + _mesa_set_remove_key(bo->screen->active_bos, bo); + + /* When the reference count goes to zero, we need to cleanup */ + panfrost_bo_munmap(bo); + + /* Rather than freeing the BO now, we'll cache the BO for later + * allocations if we're allowed to. + */ + if (!panfrost_bo_cache_put(bo)) + panfrost_bo_free(bo); + } + pthread_mutex_unlock(&screen->active_bos_lock); +} + +struct panfrost_bo * +panfrost_bo_import(struct panfrost_screen *screen, int fd) +{ + struct panfrost_bo *bo, *newbo = rzalloc(screen, struct panfrost_bo); + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + struct set_entry *entry; + ASSERTED int ret; + unsigned gem_handle; + + newbo->screen = screen; + + ret = drmPrimeFDToHandle(screen->fd, fd, &gem_handle); + assert(!ret); + + newbo->gem_handle = gem_handle; + + pthread_mutex_lock(&screen->active_bos_lock); + entry = _mesa_set_search_or_add(screen->active_bos, newbo); + assert(entry); + bo = (struct panfrost_bo *)entry->key; + if (newbo == bo) { + get_bo_offset.handle = gem_handle; + ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); + assert(!ret); + + newbo->gpu = (mali_ptr) get_bo_offset.offset; + newbo->size = lseek(fd, 0, SEEK_END); + newbo->flags |= PAN_BO_DONT_REUSE | PAN_BO_IMPORTED; + assert(newbo->size > 0); + pipe_reference_init(&newbo->reference, 1); + // TODO map and unmap on demand? + panfrost_bo_mmap(newbo); + } else { + ralloc_free(newbo); + /* !pipe_is_referenced(&bo->reference) can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). In that case, refcnt + * is 0 and we can't use panfrost_bo_reference() directly, we + * have to re-initialize it with pipe_reference_init(). + * Note that panfrost_bo_unreference() checks + * pipe_is_referenced() value just after acquiring the lock to + * make sure the object is not freed if panfrost_bo_import() + * acquired it in the meantime. + */ + if (!pipe_is_referenced(&bo->reference)) + pipe_reference_init(&newbo->reference, 1); + else + panfrost_bo_reference(bo); + assert(bo->cpu); + } + pthread_mutex_unlock(&screen->active_bos_lock); + + return bo; +} + +int +panfrost_bo_export(struct panfrost_bo *bo) +{ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, + }; + + int ret = drmIoctl(bo->screen->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); + if (ret == -1) + return -1; + + bo->flags |= PAN_BO_DONT_REUSE | PAN_BO_EXPORTED; + return args.fd; +} + diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo_cache.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo_cache.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo_cache.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,161 +0,0 @@ -/* - * Copyright 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig - */ -#include -#include "drm-uapi/panfrost_drm.h" - -#include "pan_screen.h" -#include "util/u_math.h" - -/* This file implements a userspace BO cache. Allocating and freeing - * GPU-visible buffers is very expensive, and even the extra kernel roundtrips - * adds more work than we would like at this point. So caching BOs in userspace - * solves both of these problems and does not require kernel updates. - * - * Cached BOs are sorted into a bucket based on rounding their size down to the - * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo - * objects. Putting a BO into the cache is accomplished by adding it to the - * corresponding bucket. Getting a BO from the cache consists of finding the - * appropriate bucket and sorting. A cache eviction is a kernel-level free of a - * BO and removing it from the bucket. We special case evicting all BOs from - * the cache, since that's what helpful in practice and avoids extra logic - * around the linked list. - */ - -/* Helper to calculate the bucket index of a BO */ - -static unsigned -pan_bucket_index(unsigned size) -{ - /* Round down to POT to compute a bucket index */ - - unsigned bucket_index = util_logbase2(size); - - /* Clamp the bucket index; all huge allocations will be - * sorted into the largest bucket */ - - bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET); - - /* The minimum bucket size must equal the minimum allocation - * size; the maximum we clamped */ - - assert(bucket_index >= MIN_BO_CACHE_BUCKET); - assert(bucket_index <= MAX_BO_CACHE_BUCKET); - - /* Reindex from 0 */ - return (bucket_index - MIN_BO_CACHE_BUCKET); -} - -static struct list_head * -pan_bucket(struct panfrost_screen *screen, unsigned size) -{ - return &screen->bo_cache[pan_bucket_index(size)]; -} - -/* Tries to fetch a BO of sufficient size with the appropriate flags from the - * BO cache. If it succeeds, it returns that BO and removes the BO from the - * cache. If it fails, it returns NULL signaling the caller to allocate a new - * BO. */ - -struct panfrost_bo * -panfrost_bo_cache_fetch( - struct panfrost_screen *screen, - size_t size, uint32_t flags) -{ - struct list_head *bucket = pan_bucket(screen, size); - - /* Iterate the bucket looking for something suitable */ - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) { - if (entry->size >= size && - entry->flags == flags) { - int ret; - struct drm_panfrost_madvise madv; - - /* This one works, splice it out of the cache */ - list_del(&entry->link); - - madv.handle = entry->gem_handle; - madv.madv = PANFROST_MADV_WILLNEED; - madv.retained = 0; - - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - if (!ret && !madv.retained) { - panfrost_drm_release_bo(screen, entry, false); - continue; - } - /* Let's go! */ - return entry; - } - } - - /* We didn't find anything */ - return NULL; -} - -/* Tries to add a BO to the cache. Returns if it was - * successful */ - -bool -panfrost_bo_cache_put( - struct panfrost_screen *screen, - struct panfrost_bo *bo) -{ - struct list_head *bucket = pan_bucket(screen, bo->size); - struct drm_panfrost_madvise madv; - - madv.handle = bo->gem_handle; - madv.madv = PANFROST_MADV_DONTNEED; - madv.retained = 0; - - drmIoctl(screen->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - - /* Add us to the bucket */ - list_addtail(&bo->link, bucket); - - return true; -} - -/* Evicts all BOs from the cache. Called during context - * destroy or during low-memory situations (to free up - * memory that may be unused by us just sitting in our - * cache, but still reserved from the perspective of the - * OS) */ - -void -panfrost_bo_cache_evict_all( - struct panfrost_screen *screen) -{ - for (unsigned i = 0; i < ARRAY_SIZE(screen->bo_cache); ++i) { - struct list_head *bucket = &screen->bo_cache[i]; - - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) { - list_del(&entry->link); - panfrost_drm_release_bo(screen, entry, false); - } - } - - return; -} - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_bo.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_bo.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,155 @@ +/* + * © Copyright 2019 Alyssa Rosenzweig + * © Copyright 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PAN_BO_H__ +#define __PAN_BO_H__ + +#include +#include "pipe/p_state.h" +#include "util/list.h" + +struct panfrost_screen; + +/* Flags for allocated memory */ + +/* This memory region is executable */ +#define PAN_BO_EXECUTE (1 << 0) + +/* This memory region should be lazily allocated and grow-on-page-fault. Must + * be used in conjunction with INVISIBLE */ +#define PAN_BO_GROWABLE (1 << 1) + +/* This memory region should not be mapped to the CPU */ +#define PAN_BO_INVISIBLE (1 << 2) + +/* This memory region will be used for varyings and needs to have the cache + * bits twiddled accordingly */ +#define PAN_BO_COHERENT_LOCAL (1 << 3) + +/* This region may not be used immediately and will not mmap on allocate + * (semantically distinct from INVISIBLE, which cannot never be mmaped) */ +#define PAN_BO_DELAY_MMAP (1 << 4) + +/* Some BOs shouldn't be returned back to the reuse BO cache, use this flag to + * let the BO logic know about this contraint. */ +#define PAN_BO_DONT_REUSE (1 << 5) + +/* BO has been imported */ +#define PAN_BO_IMPORTED (1 << 6) + +/* BO has been exported */ +#define PAN_BO_EXPORTED (1 << 7) + +/* GPU access flags */ + +/* BO is either shared (can be accessed by more than one GPU batch) or private + * (reserved by a specific GPU job). */ +#define PAN_BO_ACCESS_PRIVATE (0 << 0) +#define PAN_BO_ACCESS_SHARED (1 << 0) + +/* BO is being read/written by the GPU */ +#define PAN_BO_ACCESS_READ (1 << 1) +#define PAN_BO_ACCESS_WRITE (1 << 2) +#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE) + +/* BO is accessed by the vertex/tiler job. */ +#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3) + +/* BO is accessed by the fragment job. */ +#define PAN_BO_ACCESS_FRAGMENT (1 << 4) + +struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; + + /* Used to link the BO to the BO cache LRU list. */ + struct list_head lru_link; + + /* Store the time this BO was use last, so the BO cache logic can evict + * stale BOs. + */ + time_t last_used; + + struct pipe_reference reference; + + struct panfrost_screen *screen; + + /* Mapping for the entire object (all levels) */ + uint8_t *cpu; + + /* GPU address for the object */ + mali_ptr gpu; + + /* Size of all entire trees */ + size_t size; + + int gem_handle; + + uint32_t flags; + + /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending + * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl + * when the BO is idle. + */ + uint32_t gpu_access; +}; + +/* If a BO is accessed for a particular shader stage, will it be in the primary + * batch (vertex/tiler) or the secondary batch (fragment)? Anything but + * fragment will be primary, e.g. compute jobs will be considered + * "vertex/tiler" by analogy */ + +static inline uint32_t +panfrost_bo_access_for_stage(enum pipe_shader_type stage) +{ + assert(stage == PIPE_SHADER_FRAGMENT || + stage == PIPE_SHADER_VERTEX || + stage == PIPE_SHADER_COMPUTE); + + return stage == PIPE_SHADER_FRAGMENT ? + PAN_BO_ACCESS_FRAGMENT : + PAN_BO_ACCESS_VERTEX_TILER; +} + +bool +panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, + uint32_t access_type); +void +panfrost_bo_reference(struct panfrost_bo *bo); +void +panfrost_bo_unreference(struct panfrost_bo *bo); +struct panfrost_bo * +panfrost_bo_create(struct panfrost_screen *screen, size_t size, + uint32_t flags); +void +panfrost_bo_mmap(struct panfrost_bo *bo); +struct panfrost_bo * +panfrost_bo_import(struct panfrost_screen *screen, int fd); +int +panfrost_bo_export(struct panfrost_bo *bo); +void +panfrost_bo_cache_evict_all(struct panfrost_screen *screen); + +#endif /* __PAN_BO_H__ */ diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_compute.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_compute.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,6 @@ /* * Copyright (C) 2019 Collabora, Ltd. + * Copyright (C) 2019 Red Hat Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,6 +28,7 @@ #include "pan_context.h" #include "util/u_memory.h" +#include "nir_serialize.h" /* Compute CSOs are tracked like graphics shader CSOs, but are * considerably simpler. We do not implement multiple @@ -44,19 +46,27 @@ so->cbase = *cso; so->is_compute = true; - struct panfrost_shader_state *v = &so->variants[0]; + struct panfrost_shader_state *v = calloc(1, sizeof(*v)); + so->variants = v; so->variant_count = 1; so->active_variant = 0; v->tripipe = malloc(sizeof(struct mali_shader_meta)); + if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) { + struct blob_reader reader; + const struct pipe_binary_program_header *hdr = cso->prog; + + blob_reader_init(&reader, hdr->blob, hdr->num_bytes); + so->cbase.prog = nir_deserialize(NULL, &midgard_nir_options, &reader); + so->cbase.ir_type = PIPE_SHADER_IR_NIR; + } + panfrost_shader_compile(ctx, v->tripipe, - cso->ir_type, cso->prog, + so->cbase.ir_type, so->cbase.prog, MESA_SHADER_COMPUTE, v, NULL); - - return so; } @@ -87,6 +97,9 @@ { struct panfrost_context *ctx = pan_context(pipe); + /* TODO: Do we want a special compute-only batch? */ + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + ctx->compute_grid = info; struct mali_job_descriptor_header job = { @@ -98,6 +111,19 @@ /* TODO: Stub */ struct midgard_payload_vertex_tiler *payload = &ctx->payloads[PIPE_SHADER_COMPUTE]; + /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so + * reuse the graphics path for this by lowering to Gallium */ + + struct pipe_constant_buffer ubuf = { + .buffer = NULL, + .buffer_offset = 0, + .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem, + .user_buffer = info->input + }; + + if (info->input) + pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, &ubuf); + panfrost_emit_for_draw(ctx, false); /* Compute jobs have a "compute FBD". It's not a real framebuffer @@ -113,27 +139,47 @@ }; payload->postfix.framebuffer = - panfrost_upload_transient(ctx, &compute_fbd, sizeof(compute_fbd)); + panfrost_upload_transient(batch, &compute_fbd, sizeof(compute_fbd)); /* Invoke according to the grid info */ panfrost_pack_work_groups_compute(&payload->prefix, info->grid[0], info->grid[1], info->grid[2], - info->block[0], info->block[1], info->block[2]); + info->block[0], info->block[1], info->block[2], false); /* Upload the payload */ - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload)); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload)); memcpy(transfer.cpu, &job, sizeof(job)); memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload)); - /* TODO: Do we want a special compute-only batch? */ - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); - /* Queue the job */ panfrost_scoreboard_queue_compute_job(batch, transfer); - panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME); + panfrost_flush_all_batches(ctx, true); +} + +static void +panfrost_set_compute_resources(struct pipe_context *pctx, + unsigned start, unsigned count, + struct pipe_surface **resources) +{ + /* TODO */ +} + +static void +panfrost_set_global_binding(struct pipe_context *pctx, + unsigned first, unsigned count, + struct pipe_resource **resources, + uint32_t **handles) +{ + /* TODO */ +} + +static void +panfrost_memory_barrier(struct pipe_context *pctx, unsigned flags) +{ + /* TODO */ } void @@ -144,6 +190,9 @@ pctx->delete_compute_state = panfrost_delete_compute_state; pctx->launch_grid = panfrost_launch_grid; -} + pctx->set_compute_resources = panfrost_set_compute_resources; + pctx->set_global_binding = panfrost_set_global_binding; + pctx->memory_barrier = panfrost_memory_barrier; +} diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_context.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_context.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,18 +27,20 @@ #include #include +#include "pan_bo.h" #include "pan_context.h" #include "pan_format.h" +#include "panfrost-quirks.h" #include "util/macros.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_upload_mgr.h" #include "util/u_memory.h" #include "util/u_vbuf.h" #include "util/half_float.h" #include "util/u_helpers.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_prim.h" #include "util/u_prim_restart.h" #include "indices/u_primconvert.h" @@ -50,130 +52,74 @@ #include "pan_blending.h" #include "pan_blend_shaders.h" #include "pan_util.h" -#include "pan_tiler.h" +#include "pandecode/decode.h" -/* Do not actually send anything to the GPU; merely generate the cmdstream as fast as possible. Disables framebuffer writes */ -//#define DRY_RUN - -/* Framebuffer descriptor */ - -static struct midgard_tiler_descriptor -panfrost_emit_midg_tiler( - struct panfrost_context *ctx, - unsigned width, - unsigned height, - unsigned vertex_count) +struct midgard_tiler_descriptor +panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count) { - struct midgard_tiler_descriptor t = {}; - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen); + bool hierarchy = !(screen->quirks & MIDGARD_NO_HIER_TILING); + struct midgard_tiler_descriptor t = {0}; + unsigned height = batch->key.height; + unsigned width = batch->key.width; t.hierarchy_mask = - panfrost_choose_hierarchy_mask(width, height, vertex_count); + panfrost_choose_hierarchy_mask(width, height, vertex_count, hierarchy); /* Compute the polygon header size and use that to offset the body */ unsigned header_size = panfrost_tiler_header_size( - width, height, t.hierarchy_mask); + width, height, t.hierarchy_mask, hierarchy); - unsigned body_size = panfrost_tiler_body_size( - width, height, t.hierarchy_mask); + t.polygon_list_size = panfrost_tiler_full_size( + width, height, t.hierarchy_mask, hierarchy); /* Sanity check */ - if (t.hierarchy_mask) { - t.polygon_list = panfrost_job_get_polygon_list(batch, header_size + body_size); + if (vertex_count) { + struct panfrost_bo *tiler_heap; + + tiler_heap = panfrost_batch_get_tiler_heap(batch); + t.polygon_list = panfrost_batch_get_polygon_list(batch, + header_size + + t.polygon_list_size); + /* Allow the entire tiler heap */ - t.heap_start = ctx->tiler_heap.bo->gpu; - t.heap_end = - ctx->tiler_heap.bo->gpu + ctx->tiler_heap.bo->size; + t.heap_start = tiler_heap->gpu; + t.heap_end = tiler_heap->gpu + tiler_heap->size; } else { + struct panfrost_bo *tiler_dummy; + + tiler_dummy = panfrost_batch_get_tiler_dummy(batch); + header_size = MALI_TILER_MINIMUM_HEADER_SIZE; + /* The tiler is disabled, so don't allow the tiler heap */ - t.heap_start = ctx->tiler_heap.bo->gpu; + t.heap_start = tiler_dummy->gpu; t.heap_end = t.heap_start; /* Use a dummy polygon list */ - t.polygon_list = ctx->tiler_dummy.bo->gpu; + t.polygon_list = tiler_dummy->gpu; - /* Also, set a "tiler disabled?" flag? */ - t.hierarchy_mask |= 0x1000; + /* Disable the tiler */ + if (hierarchy) + t.hierarchy_mask |= MALI_TILER_DISABLED; + else { + t.hierarchy_mask = MALI_TILER_USER; + t.polygon_list_size = MALI_TILER_MINIMUM_HEADER_SIZE + 4; + + /* We don't have a WRITE_VALUE job, so write the polygon list manually */ + uint32_t *polygon_list_body = (uint32_t *) (tiler_dummy->cpu + header_size); + polygon_list_body[0] = 0xa0000000; /* TODO: Just that? */ + } } t.polygon_list_body = t.polygon_list + header_size; - t.polygon_list_size = - header_size + body_size; - return t; } -struct mali_single_framebuffer -panfrost_emit_sfbd(struct panfrost_context *ctx, unsigned vertex_count) -{ - unsigned width = ctx->pipe_framebuffer.width; - unsigned height = ctx->pipe_framebuffer.height; - - struct mali_single_framebuffer framebuffer = { - .width = MALI_POSITIVE(width), - .height = MALI_POSITIVE(height), - .unknown2 = 0x1f, - .format = 0x30000000, - .clear_flags = 0x1000, - .unknown_address_0 = ctx->scratchpad.bo->gpu, - .tiler = panfrost_emit_midg_tiler(ctx, - width, height, vertex_count), - }; - - return framebuffer; -} - -struct bifrost_framebuffer -panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count) -{ - unsigned width = ctx->pipe_framebuffer.width; - unsigned height = ctx->pipe_framebuffer.height; - - struct bifrost_framebuffer framebuffer = { - .unk0 = 0x1e5, /* 1e4 if no spill */ - .width1 = MALI_POSITIVE(width), - .height1 = MALI_POSITIVE(height), - .width2 = MALI_POSITIVE(width), - .height2 = MALI_POSITIVE(height), - - .unk1 = 0x1080, - - .rt_count_1 = MALI_POSITIVE(ctx->pipe_framebuffer.nr_cbufs), - .rt_count_2 = 4, - - .unknown2 = 0x1f, - - .scratchpad = ctx->scratchpad.bo->gpu, - .tiler = panfrost_emit_midg_tiler(ctx, - width, height, vertex_count) - }; - - return framebuffer; -} - -/* Are we currently rendering to the screen (rather than an FBO)? */ - -bool -panfrost_is_scanout(struct panfrost_context *ctx) -{ - /* If there is no color buffer, it's an FBO */ - if (ctx->pipe_framebuffer.nr_cbufs != 1) - return false; - - /* If we're too early that no framebuffer was sent, it's scanout */ - if (!ctx->pipe_framebuffer.cbufs[0]) - return true; - - return ctx->pipe_framebuffer.cbufs[0]->texture->bind & PIPE_BIND_DISPLAY_TARGET || - ctx->pipe_framebuffer.cbufs[0]->texture->bind & PIPE_BIND_SCANOUT || - ctx->pipe_framebuffer.cbufs[0]->texture->bind & PIPE_BIND_SHARED; -} - static void panfrost_clear( struct pipe_context *pipe, @@ -182,50 +128,47 @@ double depth, unsigned stencil) { struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - - panfrost_job_clear(ctx, job, buffers, color, depth, stencil); -} - -static mali_ptr -panfrost_attach_vt_mfbd(struct panfrost_context *ctx) -{ - struct bifrost_framebuffer mfbd = panfrost_emit_mfbd(ctx, ~0); - - return panfrost_upload_transient(ctx, &mfbd, sizeof(mfbd)) | MALI_MFBD; -} -static mali_ptr -panfrost_attach_vt_sfbd(struct panfrost_context *ctx) -{ - struct mali_single_framebuffer sfbd = panfrost_emit_sfbd(ctx, ~0); + /* TODO: panfrost_get_fresh_batch_for_fbo() instantiates a new batch if + * the existing batch targeting this FBO has draws. We could probably + * avoid that by replacing plain clears by quad-draws with a specific + * color/depth/stencil value, thus avoiding the generation of extra + * fragment jobs. + */ + struct panfrost_batch *batch = panfrost_get_fresh_batch_for_fbo(ctx); - return panfrost_upload_transient(ctx, &sfbd, sizeof(sfbd)) | MALI_SFBD; + panfrost_batch_add_fbo_bos(batch); + panfrost_batch_clear(batch, buffers, color, depth, stencil); } static void panfrost_attach_vt_framebuffer(struct panfrost_context *ctx) { - /* Skip the attach if we can */ + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - if (ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer) { - assert(ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.framebuffer); - return; - } + /* If we haven't, reserve space for the framebuffer */ - struct panfrost_screen *screen = pan_screen(ctx->base.screen); - mali_ptr framebuffer = screen->require_sfbd ? - panfrost_attach_vt_sfbd(ctx) : - panfrost_attach_vt_mfbd(ctx); + if (!batch->framebuffer.gpu) { + unsigned size = (screen->quirks & MIDGARD_SFBD) ? + sizeof(struct mali_single_framebuffer) : + sizeof(struct bifrost_framebuffer); + + batch->framebuffer = panfrost_allocate_transient(batch, size); + + /* Tag the pointer */ + if (!(screen->quirks & MIDGARD_SFBD)) + batch->framebuffer.gpu |= MALI_MFBD; + } for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) - ctx->payloads[i].postfix.framebuffer = framebuffer; + ctx->payloads[i].postfix.framebuffer = batch->framebuffer.gpu; } /* Reset per-frame context, called on context initialisation as well as after * flushing a frame */ -static void +void panfrost_invalidate_frame(struct panfrost_context *ctx) { for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) @@ -260,18 +203,6 @@ memcpy(&ctx->payloads[PIPE_SHADER_COMPUTE], &payload, sizeof(payload)); } -static void -panfrost_emit_tiler_payload(struct panfrost_context *ctx) -{ - struct midgard_payload_vertex_tiler payload = { - .prefix = { - .zero1 = 0xffff, /* Why is this only seen on test-quad-textured? */ - }, - }; - - memcpy(&ctx->payloads[PIPE_SHADER_FRAGMENT], &payload, sizeof(payload)); -} - static unsigned translate_tex_wrap(enum pipe_tex_wrap w) { @@ -279,6 +210,9 @@ case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_REPEAT; + case PIPE_TEX_WRAP_CLAMP: + return MALI_WRAP_CLAMP; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_CLAMP_TO_EDGE; @@ -288,6 +222,15 @@ case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MIRRORED_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return MALI_WRAP_MIRRORED_CLAMP; + + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE; + + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER; + default: unreachable("Invalid wrap"); } @@ -327,39 +270,6 @@ } static unsigned -panfrost_translate_alt_compare_func(enum pipe_compare_func in) -{ - switch (in) { - case PIPE_FUNC_NEVER: - return MALI_ALT_FUNC_NEVER; - - case PIPE_FUNC_LESS: - return MALI_ALT_FUNC_LESS; - - case PIPE_FUNC_EQUAL: - return MALI_ALT_FUNC_EQUAL; - - case PIPE_FUNC_LEQUAL: - return MALI_ALT_FUNC_LEQUAL; - - case PIPE_FUNC_GREATER: - return MALI_ALT_FUNC_GREATER; - - case PIPE_FUNC_NOTEQUAL: - return MALI_ALT_FUNC_NOTEQUAL; - - case PIPE_FUNC_GEQUAL: - return MALI_ALT_FUNC_GEQUAL; - - case PIPE_FUNC_ALWAYS: - return MALI_ALT_FUNC_ALWAYS; - - default: - unreachable("Invalid alt func"); - } -} - -static unsigned panfrost_translate_stencil_op(enum pipe_stencil_op in) { switch (in) { @@ -407,6 +317,7 @@ static void panfrost_default_shader_backend(struct panfrost_context *ctx) { + struct panfrost_screen *screen = pan_screen(ctx->base.screen); struct mali_shader_meta shader = { .alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000), @@ -414,15 +325,14 @@ .unknown2_4 = MALI_NO_MSAA | 0x4e0, }; - /* unknown2_4 has 0x10 bit set on T6XX. We don't know why this is + /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this is * required (independent of 32-bit/64-bit descriptors), or why it's not * used on later GPU revisions. Otherwise, all shader jobs fault on * these earlier chips (perhaps this is a chicken bit of some kind). * More investigation is needed. */ - if (ctx->is_t6xx) { + if (screen->quirks & MIDGARD_SFBD) shader.unknown2_4 |= 0x10; - } struct pipe_stencil_state default_stencil = { .enabled = 0, @@ -455,6 +365,7 @@ struct panfrost_transfer panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler) { + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct mali_job_descriptor_header job = { .job_type = is_tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX, .job_descriptor_size = 1, @@ -462,7 +373,7 @@ struct midgard_payload_vertex_tiler *payload = is_tiler ? &ctx->payloads[PIPE_SHADER_FRAGMENT] : &ctx->payloads[PIPE_SHADER_VERTEX]; - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(*payload)); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload)); memcpy(transfer.cpu, &job, sizeof(job)); memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload)); return transfer; @@ -492,10 +403,11 @@ static void panfrost_stage_attributes(struct panfrost_context *ctx) { + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_vertex_state *so = ctx->vertex; - size_t sz = sizeof(struct mali_attr_meta) * so->num_elements; - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sz); + size_t sz = sizeof(struct mali_attr_meta) * PAN_MAX_ATTRIBUTE; + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sz); struct mali_attr_meta *target = (struct mali_attr_meta *) transfer.cpu; /* Copy as-is for the first pass */ @@ -535,12 +447,17 @@ /* Also, somewhat obscurely per-instance data needs to be * offset in response to a delayed start in an indexed draw */ - if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start) { + if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start) target[i].src_offset -= buf->stride * start; - } + } + /* Let's also include vertex builtins */ - } + target[PAN_VERTEX_ID].format = MALI_R32UI; + target[PAN_VERTEX_ID].swizzle = panfrost_get_default_swizzle(1); + + target[PAN_INSTANCE_ID].format = MALI_R32UI; + target[PAN_INSTANCE_ID].swizzle = panfrost_get_default_swizzle(1); ctx->payloads[PIPE_SHADER_VERTEX].postfix.attribute_meta = transfer.gpu; } @@ -548,16 +465,17 @@ static void panfrost_upload_sampler_descriptors(struct panfrost_context *ctx) { + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); size_t desc_size = sizeof(struct mali_sampler_descriptor); for (int t = 0; t <= PIPE_SHADER_FRAGMENT; ++t) { mali_ptr upload = 0; - if (ctx->sampler_count[t] && ctx->sampler_view_count[t]) { + if (ctx->sampler_count[t]) { size_t transfer_size = desc_size * ctx->sampler_count[t]; struct panfrost_transfer transfer = - panfrost_allocate_transient(ctx, transfer_size); + panfrost_allocate_transient(batch, transfer_size); struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *) transfer.cpu; @@ -572,38 +490,25 @@ } } -static unsigned -panfrost_layout_for_texture(struct panfrost_resource *rsrc, bool manual_stride) +static enum mali_texture_layout +panfrost_layout_for_texture(struct panfrost_resource *rsrc) { - /* TODO: other linear depth textures */ - bool is_depth = rsrc->base.format == PIPE_FORMAT_Z32_UNORM; - - unsigned usage2_layout = 0x10; - switch (rsrc->layout) { case PAN_AFBC: - usage2_layout |= 0x8 | 0x4; - break; + return MALI_TEXTURE_AFBC; case PAN_TILED: - usage2_layout |= 0x1; - break; + return MALI_TEXTURE_TILED; case PAN_LINEAR: - usage2_layout |= is_depth ? 0x1 : 0x2; - break; + return MALI_TEXTURE_LINEAR; default: - assert(0); - break; + unreachable("Invalid texture layout"); } - - if (manual_stride) - usage2_layout |= MALI_TEX_MANUAL_STRIDE; - - return usage2_layout; } static mali_ptr panfrost_upload_tex( struct panfrost_context *ctx, + enum pipe_shader_type st, struct panfrost_sampler_view *view) { if (!view) @@ -611,6 +516,8 @@ struct pipe_sampler_view *pview = &view->base; struct panfrost_resource *rsrc = pan_resource(pview->texture); + mali_ptr descriptor_gpu; + void *descriptor; /* Do we interleave an explicit stride with every element? */ @@ -618,11 +525,30 @@ /* For easy access */ - assert(pview->target != PIPE_BUFFER); - unsigned first_level = pview->u.tex.first_level; - unsigned last_level = pview->u.tex.last_level; - unsigned first_layer = pview->u.tex.first_layer; - unsigned last_layer = pview->u.tex.last_layer; + bool is_buffer = pview->target == PIPE_BUFFER; + unsigned first_level = is_buffer ? 0 : pview->u.tex.first_level; + unsigned last_level = is_buffer ? 0 : pview->u.tex.last_level; + unsigned first_layer = is_buffer ? 0 : pview->u.tex.first_layer; + unsigned last_layer = is_buffer ? 0 : pview->u.tex.last_layer; + unsigned first_face = 0; + unsigned last_face = 0; + unsigned face_mult = 1; + + /* Cubemaps have 6 faces as layers in between each actual layer. + * There's a bit of an impedence mismatch between Gallium and the + * hardware, let's fixup for it */ + + if (pview->target == PIPE_TEXTURE_CUBE || pview->target == PIPE_TEXTURE_CUBE_ARRAY) { + /* TODO: logic wrong in the asserted out cases ... can they happen? */ + + first_face = first_layer % 6; + last_face = last_layer % 6; + first_layer /= 6; + last_layer /= 6; + + assert((first_layer == last_layer) || (first_face == 0 && last_face == 5)); + face_mult = 6; + } /* Lower-bit is set when sampling from colour AFBC */ bool is_afbc = rsrc->layout == PAN_AFBC; @@ -630,39 +556,63 @@ unsigned afbc_bit = (is_afbc && !is_zs) ? 1 : 0; /* Add the BO to the job so it's retained until the job is done. */ - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - panfrost_job_add_bo(job, rsrc->bo); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + panfrost_batch_add_bo(batch, rsrc->bo, + PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ | + panfrost_bo_access_for_stage(st)); /* Add the usage flags in, since they can change across the CSO * lifetime due to layout switches */ - view->hw.format.usage2 = panfrost_layout_for_texture(rsrc, has_manual_stride); + view->hw.format.layout = panfrost_layout_for_texture(rsrc); + view->hw.format.manual_stride = has_manual_stride; - /* Inject the addresses in, interleaving mip levels, cube faces, and - * strides in that order */ + /* Inject the addresses in, interleaving array indices, mip levels, + * cube faces, and strides in that order */ unsigned idx = 0; + unsigned levels = 1 + last_level - first_level; + unsigned layers = 1 + last_layer - first_layer; + unsigned faces = 1 + last_face - first_face; + unsigned num_elements = levels * layers * faces; + if (has_manual_stride) + num_elements *= 2; + + descriptor = malloc(sizeof(struct mali_texture_descriptor) + + sizeof(mali_ptr) * num_elements); + memcpy(descriptor, &view->hw, sizeof(struct mali_texture_descriptor)); - for (unsigned l = first_level; l <= last_level; ++l) { - for (unsigned f = first_layer; f <= last_layer; ++f) { - - view->hw.payload[idx++] = - panfrost_get_texture_address(rsrc, l, f) + afbc_bit; + mali_ptr *pointers_and_strides = descriptor + + sizeof(struct mali_texture_descriptor); - if (has_manual_stride) { - view->hw.payload[idx++] = - rsrc->slices[l].stride; + for (unsigned w = first_layer; w <= last_layer; ++w) { + for (unsigned l = first_level; l <= last_level; ++l) { + for (unsigned f = first_face; f <= last_face; ++f) { + pointers_and_strides[idx++] = + panfrost_get_texture_address(rsrc, l, w*face_mult + f) + + afbc_bit + view->astc_stretch; + + if (has_manual_stride) { + pointers_and_strides[idx++] = + rsrc->slices[l].stride; + } } } } - return panfrost_upload_transient(ctx, &view->hw, - sizeof(struct mali_texture_descriptor)); + descriptor_gpu = panfrost_upload_transient(batch, descriptor, + sizeof(struct mali_texture_descriptor) + + num_elements * sizeof(*pointers_and_strides)); + free(descriptor); + + return descriptor_gpu; } static void panfrost_upload_texture_descriptors(struct panfrost_context *ctx) { + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + for (int t = 0; t <= PIPE_SHADER_FRAGMENT; ++t) { mali_ptr trampoline = 0; @@ -671,9 +621,9 @@ for (int i = 0; i < ctx->sampler_view_count[t]; ++i) trampolines[i] = - panfrost_upload_tex(ctx, ctx->sampler_views[t][i]); + panfrost_upload_tex(ctx, t, ctx->sampler_views[t][i]); - trampoline = panfrost_upload_transient(ctx, trampolines, sizeof(uint64_t) * ctx->sampler_view_count[t]); + trampoline = panfrost_upload_transient(batch, trampolines, sizeof(uint64_t) * ctx->sampler_view_count[t]); } ctx->payloads[t].postfix.texture_trampoline = trampoline; @@ -744,16 +694,41 @@ struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id]; /* Compute address */ - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_bo *bo = pan_resource(sb.buffer)->bo; - panfrost_job_add_bo(batch, bo); + panfrost_batch_add_bo(batch, bo, + PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW | + panfrost_bo_access_for_stage(st)); /* Upload address and size as sysval */ uniform->du[0] = bo->gpu + sb.buffer_offset; uniform->u[2] = sb.buffer_size; } +static void +panfrost_upload_sampler_sysval( + struct panfrost_context *ctx, + enum pipe_shader_type st, + unsigned sampler_index, + struct sysval_uniform *uniform) +{ + struct pipe_sampler_state *sampl = + &ctx->samplers[st][sampler_index]->base; + + uniform->f[0] = sampl->min_lod; + uniform->f[1] = sampl->max_lod; + uniform->f[2] = sampl->lod_bias; + + /* Even without any errata, Midgard represents "no mipmapping" as + * fixing the LOD with the clamps; keep behaviour consistent. c.f. + * panfrost_create_sampler_state which also explains our choice of + * epsilon value (again to keep behaviour consistent) */ + + if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) + uniform->f[1] = uniform->f[0] + (1.0/256.0); +} + static void panfrost_upload_num_work_groups_sysval(struct panfrost_context *ctx, struct sysval_uniform *uniform) { @@ -789,7 +764,10 @@ case PAN_SYSVAL_NUM_WORK_GROUPS: panfrost_upload_num_work_groups_sysval(ctx, &uniforms[i]); break; - + case PAN_SYSVAL_SAMPLER: + panfrost_upload_sampler_sysval(ctx, st, PAN_SYSVAL_ID(sysval), + &uniforms[i]); + break; default: assert(0); } @@ -813,18 +791,27 @@ static mali_ptr panfrost_map_constant_buffer_gpu( struct panfrost_context *ctx, + enum pipe_shader_type st, struct panfrost_constant_buffer *buf, unsigned index) { struct pipe_constant_buffer *cb = &buf->cb[index]; struct panfrost_resource *rsrc = pan_resource(cb->buffer); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - if (rsrc) - return rsrc->bo->gpu; - else if (cb->user_buffer) - return panfrost_upload_transient(ctx, cb->user_buffer, cb->buffer_size); - else + if (rsrc) { + panfrost_batch_add_bo(batch, rsrc->bo, + PAN_BO_ACCESS_SHARED | + PAN_BO_ACCESS_READ | + panfrost_bo_access_for_stage(st)); + + /* Alignment gauranteed by PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */ + return rsrc->bo->gpu + cb->buffer_offset; + } else if (cb->user_buffer) { + return panfrost_upload_transient(batch, cb->user_buffer + cb->buffer_offset, cb->buffer_size); + } else { unreachable("No constant buffer"); + } } /* Compute number of UBOs active (more specifically, compute the highest UBO @@ -839,16 +826,21 @@ return 32 - __builtin_clz(mask); } -/* Fixes up a shader state with current state, returning a GPU address to the - * patched shader */ +/* Fixes up a shader state with current state */ -static mali_ptr -panfrost_patch_shader_state( - struct panfrost_context *ctx, - struct panfrost_shader_state *ss, - enum pipe_shader_type stage, - bool should_upload) +static void +panfrost_patch_shader_state(struct panfrost_context *ctx, + enum pipe_shader_type stage) { + struct panfrost_shader_variants *all = ctx->shader[stage]; + + if (!all) { + ctx->payloads[stage].postfix.shader = 0; + return; + } + + struct panfrost_shader_state *ss = &all->variants[all->active_variant]; + ss->tripipe->texture_count = ctx->sampler_view_count[stage]; ss->tripipe->sampler_count = ctx->sampler_count[stage]; @@ -857,37 +849,17 @@ unsigned ubo_count = panfrost_ubo_count(ctx, stage); ss->tripipe->midgard1.uniform_buffer_count = ubo_count; - /* We can't reuse over frames; that's not safe. The descriptor must be - * transient uploaded */ - - if (should_upload) { - return panfrost_upload_transient(ctx, - ss->tripipe, - sizeof(struct mali_shader_meta)); - } - - /* If we don't need an upload, don't bother */ - return 0; - -} - -static void -panfrost_patch_shader_state_compute( - struct panfrost_context *ctx, - enum pipe_shader_type stage, - bool should_upload) -{ - struct panfrost_shader_variants *all = ctx->shader[stage]; - - if (!all) { - ctx->payloads[stage].postfix._shader_upper = 0; - return; - } - - struct panfrost_shader_state *s = &all->variants[all->active_variant]; + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - ctx->payloads[stage].postfix._shader_upper = - panfrost_patch_shader_state(ctx, s, stage, should_upload) >> 4; + /* Add the shader BO to the batch. */ + panfrost_batch_add_bo(batch, ss->bo, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_READ | + panfrost_bo_access_for_stage(stage)); + + ctx->payloads[stage].postfix.shader = panfrost_upload_transient(batch, + ss->tripipe, + sizeof(struct mali_shader_meta)); } /* Go through dirty flags and actualise them in the cmdstream. */ @@ -895,13 +867,14 @@ void panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) { - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_screen *screen = pan_screen(ctx->base.screen); + panfrost_batch_add_fbo_bos(batch); panfrost_attach_vt_framebuffer(ctx); if (with_vertex_data) { - panfrost_emit_vertex_data(job); + panfrost_emit_vertex_data(batch); /* Varyings emitted for -all- geometry */ unsigned total_count = ctx->padded_count * ctx->instance_count; @@ -918,15 +891,15 @@ SET_BIT(ctx->fragment_shader_core.unknown2_4, MALI_NO_MSAA, !msaa); } - panfrost_job_set_requirements(ctx, job); + panfrost_batch_set_requirements(batch); if (ctx->occlusion_query) { - ctx->payloads[PIPE_SHADER_FRAGMENT].gl_enables |= MALI_OCCLUSION_QUERY | MALI_OCCLUSION_PRECISE; - ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.occlusion_counter = ctx->occlusion_query->transfer.gpu; + ctx->payloads[PIPE_SHADER_FRAGMENT].gl_enables |= MALI_OCCLUSION_QUERY; + ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.occlusion_counter = ctx->occlusion_query->bo->gpu; } - panfrost_patch_shader_state_compute(ctx, PIPE_SHADER_VERTEX, true); - panfrost_patch_shader_state_compute(ctx, PIPE_SHADER_COMPUTE, true); + panfrost_patch_shader_state(ctx, PIPE_SHADER_VERTEX); + panfrost_patch_shader_state(ctx, PIPE_SHADER_COMPUTE); if (ctx->dirty & (PAN_DIRTY_RASTERIZER | PAN_DIRTY_VS)) { /* Check if we need to link the gl_PointSize varying */ @@ -935,7 +908,11 @@ * don't touch primitive_size (since we would clobber * the pointer there) */ - ctx->payloads[PIPE_SHADER_FRAGMENT].primitive_size.constant = ctx->rasterizer->base.line_width; + bool points = ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.draw_mode == MALI_POINTS; + + ctx->payloads[PIPE_SHADER_FRAGMENT].primitive_size.constant = points ? + ctx->rasterizer->base.point_size : + ctx->rasterizer->base.line_width; } } @@ -947,9 +924,7 @@ assert(ctx->shader[PIPE_SHADER_FRAGMENT]); struct panfrost_shader_state *variant = &ctx->shader[PIPE_SHADER_FRAGMENT]->variants[ctx->shader[PIPE_SHADER_FRAGMENT]->active_variant]; - panfrost_patch_shader_state(ctx, variant, PIPE_SHADER_FRAGMENT, false); - - panfrost_job_add_bo(job, variant->bo); + panfrost_patch_shader_state(ctx, PIPE_SHADER_FRAGMENT); #define COPY(name) ctx->fragment_shader_core.name = variant->tripipe->name @@ -958,7 +933,6 @@ COPY(varying_count); COPY(texture_count); COPY(sampler_count); - COPY(sampler_count); COPY(midgard1.uniform_count); COPY(midgard1.uniform_buffer_count); COPY(midgard1.work_count); @@ -971,9 +945,12 @@ unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS]; + unsigned shader_offset = 0; + struct panfrost_bo *shader_bo = NULL; - for (unsigned c = 0; c < rt_count; ++c) - blend[c] = panfrost_get_blend_for_context(ctx, c); + for (unsigned c = 0; c < rt_count; ++c) { + blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo, &shader_offset); + } /* If there is a blend shader, work registers are shared. XXX: opt */ @@ -982,22 +959,15 @@ ctx->fragment_shader_core.midgard1.work_count = 16; } - /* Set late due to depending on render state */ - unsigned flags = ctx->fragment_shader_core.midgard1.flags; - /* Depending on whether it's legal to in the given shader, we * try to enable early-z testing (or forward-pixel kill?) */ - if (!variant->can_discard) - flags |= MALI_EARLY_Z; + SET_BIT(ctx->fragment_shader_core.midgard1.flags, MALI_EARLY_Z, !variant->can_discard); /* Any time texturing is used, derivatives are implicitly * calculated, so we need to enable helper invocations */ - if (variant->helper_invocations) - flags |= MALI_HELPER_INVOCATIONS; - - ctx->fragment_shader_core.midgard1.flags = flags; + SET_BIT(ctx->fragment_shader_core.midgard1.flags, MALI_HELPER_INVOCATIONS, variant->helper_invocations); /* Assign the stencil refs late */ @@ -1015,29 +985,33 @@ * thing?" by Peter Harris */ - if (variant->can_discard) { - ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD; - ctx->fragment_shader_core.midgard1.flags |= 0x400; - } + SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_CAN_DISCARD, variant->can_discard); + SET_BIT(ctx->fragment_shader_core.midgard1.flags, 0x400, variant->can_discard); /* Even on MFBD, the shader descriptor gets blend shaders. It's * *also* copied to the blend_meta appended (by convention), * but this is the field actually read by the hardware. (Or - * maybe both are read...?) */ + * maybe both are read...?). Specify the last RTi with a blend + * shader. */ - if (blend[0].is_shader) { - ctx->fragment_shader_core.blend.shader = - blend[0].shader.bo->gpu | blend[0].shader.first_tag; - } else { - ctx->fragment_shader_core.blend.shader = 0; + ctx->fragment_shader_core.blend.shader = 0; + + for (signed rt = (rt_count - 1); rt >= 0; --rt) { + if (blend[rt].is_shader) { + ctx->fragment_shader_core.blend.shader = + blend[rt].shader.gpu | blend[rt].shader.first_tag; + break; + } } - if (screen->require_sfbd) { + if (screen->quirks & MIDGARD_SFBD) { /* When only a single render target platform is used, the blend * information is inside the shader meta itself. We * additionally need to signal CAN_DISCARD for nontrivial blend * modes (so we're able to read back the destination buffer) */ + SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_HAS_BLEND_SHADER, blend[0].is_shader); + if (!blend[0].is_shader) { ctx->fragment_shader_core.blend.equation = *blend[0].equation.equation; @@ -1045,64 +1019,35 @@ blend[0].equation.constant; } - if (!blend[0].no_blending) { - ctx->fragment_shader_core.unknown2_3 |= MALI_CAN_DISCARD; - } + SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_CAN_DISCARD, !blend[0].no_blending); } size_t size = sizeof(struct mali_shader_meta) + (sizeof(struct midgard_blend_rt) * rt_count); - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, size); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, size); memcpy(transfer.cpu, &ctx->fragment_shader_core, sizeof(struct mali_shader_meta)); - ctx->payloads[PIPE_SHADER_FRAGMENT].postfix._shader_upper = (transfer.gpu) >> 4; + ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.shader = transfer.gpu; - if (!screen->require_sfbd) { + if (!(screen->quirks & MIDGARD_SFBD)) { /* Additional blend descriptor tacked on for jobs using MFBD */ struct midgard_blend_rt rts[4]; for (unsigned i = 0; i < rt_count; ++i) { - unsigned blend_count = 0x200; - - if (blend[i].is_shader) { - /* For a blend shader, the bottom nibble corresponds to - * the number of work registers used, which signals the - * -existence- of a blend shader */ - - assert(blend[i].shader.work_count >= 2); - blend_count |= MIN2(blend[i].shader.work_count, 3); - } else { - /* Otherwise, the bottom bit simply specifies if - * blending (anything other than REPLACE) is enabled */ - - if (!blend[i].no_blending) - blend_count |= 0x1; - } - + rts[i].flags = 0x200; bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) && (ctx->pipe_framebuffer.cbufs[i]) && util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format); - rts[i].flags = blend_count; - - if (is_srgb) - rts[i].flags |= MALI_BLEND_SRGB; - - if (!ctx->blend->base.dither) - rts[i].flags |= MALI_BLEND_NO_DITHER; - - /* TODO: sRGB in blend shaders is currently - * unimplemented. Contact me (Alyssa) if you're - * interested in working on this. We have - * native Midgard ops for helping here, but - * they're not well-understood yet. */ - - assert(!(is_srgb && blend[i].is_shader)); + SET_BIT(rts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader); + SET_BIT(rts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending); + SET_BIT(rts[i].flags, MALI_BLEND_SRGB, is_srgb); + SET_BIT(rts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither); if (blend[i].is_shader) { - rts[i].blend.shader = blend[i].shader.bo->gpu | blend[i].shader.first_tag; + rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag; } else { rts[i].blend.equation = *blend[i].equation.equation; rts[i].blend.constant = blend[i].equation.constant; @@ -1142,13 +1087,13 @@ size_t sys_size = sizeof(float) * 4 * ss->sysval_count; size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0; size_t size = sys_size + uniform_size; - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, size); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, size); /* Upload sysvals requested by the shader */ panfrost_upload_sysvals(ctx, transfer.cpu, ss, i); /* Upload uniforms */ - if (has_uniforms) { + if (has_uniforms && uniform_size) { const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0); memcpy(transfer.cpu + sys_size, cpu, uniform_size); } @@ -1175,10 +1120,10 @@ /* The rest are honest-to-goodness UBOs */ for (unsigned ubo = 1; ubo < ubo_count; ++ubo) { - size_t sz = buf->cb[ubo].buffer_size; + size_t usz = buf->cb[ubo].buffer_size; bool enabled = buf->enabled_mask & (1 << ubo); - bool empty = sz == 0; + bool empty = usz == 0; if (!enabled || empty) { /* Stub out disabled UBOs to catch accesses */ @@ -1188,17 +1133,17 @@ continue; } - mali_ptr gpu = panfrost_map_constant_buffer_gpu(ctx, buf, ubo); + mali_ptr gpu = panfrost_map_constant_buffer_gpu(ctx, i, buf, ubo); unsigned bytes_per_field = 16; - unsigned aligned = ALIGN_POT(sz, bytes_per_field); + unsigned aligned = ALIGN_POT(usz, bytes_per_field); unsigned fields = aligned / bytes_per_field; ubos[ubo].size = MALI_POSITIVE(fields); ubos[ubo].ptr = gpu >> 2; } - mali_ptr ubufs = panfrost_upload_transient(ctx, ubos, sz); + mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz); postfix->uniforms = transfer.gpu; postfix->uniform_buffers = ubufs; @@ -1221,9 +1166,6 @@ .clip_miny = -INFINITY, .clip_maxx = INFINITY, .clip_maxy = INFINITY, - - .clip_minz = 0.0, - .clip_maxz = 1.0, }; /* Always scissor to the viewport by default. */ @@ -1233,6 +1175,9 @@ float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1])); float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1])); + float minz = (vp->translate[2] - fabsf(vp->scale[2])); + float maxz = (vp->translate[2] + fabsf(vp->scale[2])); + /* Apply the scissor test */ unsigned minx, miny, maxx, maxy; @@ -1254,23 +1199,22 @@ * handle the negatives if we don't */ if (miny > maxy) { - int temp = miny; + unsigned temp = miny; miny = maxy; maxy = temp; } if (minx > maxx) { - int temp = minx; + unsigned temp = minx; minx = maxx; maxx = temp; } - /* Clamp everything positive, just in case */ - - maxx = MAX2(0, maxx); - maxy = MAX2(0, maxy); - minx = MAX2(0, minx); - miny = MAX2(0, miny); + if (minz > maxz) { + float temp = minz; + minz = maxz; + maxz = temp; + } /* Clamp to the framebuffer size as a last check */ @@ -1285,7 +1229,7 @@ * just... be faster :) */ if (!ctx->wallpaper_batch) - panfrost_job_union_scissor(job, minx, miny, maxx, maxy); + panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy); /* Upload */ @@ -1295,8 +1239,11 @@ view.viewport0[1] = miny; view.viewport1[1] = MALI_POSITIVE(maxy); + view.clip_minz = minz; + view.clip_maxz = maxz; + ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.viewport = - panfrost_upload_transient(ctx, + panfrost_upload_transient(batch, &view, sizeof(struct mali_viewport)); @@ -1322,148 +1269,28 @@ if (!rasterizer_discard) tiler = panfrost_vertex_tiler_job(ctx, true); - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); if (rasterizer_discard) panfrost_scoreboard_queue_vertex_job(batch, vertex, FALSE); - else if (ctx->wallpaper_batch) + else if (ctx->wallpaper_batch && batch->first_tiler.gpu) panfrost_scoreboard_queue_fused_job_prepend(batch, vertex, tiler); else panfrost_scoreboard_queue_fused_job(batch, vertex, tiler); -} - -/* The entire frame is in memory -- send it off to the kernel! */ - -static void -panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate, - struct pipe_fence_handle **fence, - struct panfrost_job *job) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - -#ifndef DRY_RUN - - panfrost_job_submit(ctx, job); - - /* If visual, we can stall a frame */ - - if (!flush_immediate) - panfrost_drm_force_flush_fragment(ctx, fence); - - screen->last_fragment_flushed = false; - screen->last_job = job; - - /* If readback, flush now (hurts the pipelined performance) */ - if (flush_immediate) - panfrost_drm_force_flush_fragment(ctx, fence); -#endif -} - -static void -panfrost_draw_wallpaper(struct pipe_context *pipe) -{ - struct panfrost_context *ctx = pan_context(pipe); - - /* Nothing to reload? TODO: MRT wallpapers */ - if (ctx->pipe_framebuffer.cbufs[0] == NULL) - return; - - /* Check if the buffer has any content on it worth preserving */ - - struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0]; - struct panfrost_resource *rsrc = pan_resource(surf->texture); - unsigned level = surf->u.tex.level; - - if (!rsrc->slices[level].initialized) - return; - /* Save the batch */ - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); - - ctx->wallpaper_batch = batch; - - /* Clamp the rendering area to the damage extent. The - * KHR_partial_update() spec states that trying to render outside of - * the damage region is "undefined behavior", so we should be safe. - */ - unsigned damage_width = (rsrc->damage.extent.maxx - rsrc->damage.extent.minx); - unsigned damage_height = (rsrc->damage.extent.maxy - rsrc->damage.extent.miny); - - if (damage_width && damage_height) { - panfrost_job_intersection_scissor(batch, rsrc->damage.extent.minx, - rsrc->damage.extent.miny, - rsrc->damage.extent.maxx, - rsrc->damage.extent.maxy); - } + for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) { + struct panfrost_shader_variants *all = ctx->shader[i]; - /* FIXME: Looks like aligning on a tile is not enough, but - * aligning on twice the tile size seems to works. We don't - * know exactly what happens here but this deserves extra - * investigation to figure it out. - */ - batch->minx = batch->minx & ~((MALI_TILE_LENGTH * 2) - 1); - batch->miny = batch->miny & ~((MALI_TILE_LENGTH * 2) - 1); - batch->maxx = MIN2(ALIGN_POT(batch->maxx, MALI_TILE_LENGTH * 2), - rsrc->base.width0); - batch->maxy = MIN2(ALIGN_POT(batch->maxy, MALI_TILE_LENGTH * 2), - rsrc->base.height0); - - struct pipe_scissor_state damage; - struct pipe_box rects[4]; - - /* Clamp the damage box to the rendering area. */ - damage.minx = MAX2(batch->minx, rsrc->damage.biggest_rect.x); - damage.miny = MAX2(batch->miny, rsrc->damage.biggest_rect.y); - damage.maxx = MIN2(batch->maxx, - rsrc->damage.biggest_rect.x + - rsrc->damage.biggest_rect.width); - damage.maxy = MIN2(batch->maxy, - rsrc->damage.biggest_rect.y + - rsrc->damage.biggest_rect.height); - - /* One damage rectangle means we can end up with at most 4 reload - * regions: - * 1: left region, only exists if damage.x > 0 - * 2: right region, only exists if damage.x + damage.width < fb->width - * 3: top region, only exists if damage.y > 0. The intersection with - * the left and right regions are dropped - * 4: bottom region, only exists if damage.y + damage.height < fb->height. - * The intersection with the left and right regions are dropped - * - * ____________________________ - * | | 3 | | - * | |___________| | - * | | damage | | - * | 1 | rect | 2 | - * | |___________| | - * | | 4 | | - * |_______|___________|______| - */ - u_box_2d(batch->minx, batch->miny, damage.minx - batch->minx, - batch->maxy - batch->miny, &rects[0]); - u_box_2d(damage.maxx, batch->miny, batch->maxx - damage.maxx, - batch->maxy - batch->miny, &rects[1]); - u_box_2d(damage.minx, batch->miny, damage.maxx - damage.minx, - damage.miny - batch->miny, &rects[2]); - u_box_2d(damage.minx, damage.maxy, damage.maxx - damage.minx, - batch->maxy - damage.maxy, &rects[3]); - - for (unsigned i = 0; i < 4; i++) { - /* Width and height are always >= 0 even if width is declared as a - * signed integer: u_box_2d() helper takes unsigned args and - * panfrost_set_damage_region() is taking care of clamping - * negative values. - */ - if (!rects[i].width || !rects[i].height) + if (!all) continue; - /* Blit the wallpaper in */ - panfrost_blit_wallpaper(ctx, &rects[i]); + struct panfrost_shader_state *ss = &all->variants[all->active_variant]; + batch->stack_size = MAX2(batch->stack_size, ss->stack_size); } - ctx->wallpaper_batch = NULL; } +/* The entire frame is in memory -- send it off to the kernel! */ + void panfrost_flush( struct pipe_context *pipe, @@ -1471,24 +1298,39 @@ unsigned flags) { struct panfrost_context *ctx = pan_context(pipe); - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); + struct util_dynarray fences; - /* Nothing to do! */ - if (!job->last_job.gpu && !job->clear) return; + /* We must collect the fences before the flush is done, otherwise we'll + * lose track of them. + */ + if (fence) { + util_dynarray_init(&fences, NULL); + hash_table_foreach(ctx->batches, hentry) { + struct panfrost_batch *batch = hentry->data; - if (!job->clear && job->last_tiler.gpu) - panfrost_draw_wallpaper(&ctx->base); + panfrost_batch_fence_reference(batch->out_sync); + util_dynarray_append(&fences, + struct panfrost_batch_fence *, + batch->out_sync); + } + } - /* Whether to stall the pipeline for immediately correct results. Since - * pipelined rendering is quite broken right now (to be fixed by the - * panfrost_job refactor, just take the perf hit for correctness) */ - bool flush_immediate = /*flags & PIPE_FLUSH_END_OF_FRAME*/true; + /* Submit all pending jobs */ + panfrost_flush_all_batches(ctx, false); - /* Submit the frame itself */ - panfrost_submit_frame(ctx, flush_immediate, fence, job); + if (fence) { + struct panfrost_fence *f = panfrost_fence_create(ctx, &fences); + pipe->screen->fence_reference(pipe->screen, fence, NULL); + *fence = (struct pipe_fence_handle *)f; - /* Prepare for the next frame */ - panfrost_invalidate_frame(ctx); + util_dynarray_foreach(&fences, struct panfrost_batch_fence *, fence) + panfrost_batch_fence_unreference(*fence); + + util_dynarray_fini(&fences); + } + + if (pan_debug & PAN_DBG_TRACE) + pandecode_next_frame(); } #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_##c; @@ -1542,16 +1384,19 @@ struct panfrost_resource *rsrc = (struct panfrost_resource *) (info->index.resource); off_t offset = info->start * info->index_size; - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); if (!info->has_user_indices) { /* Only resources can be directly mapped */ - panfrost_job_add_bo(batch, rsrc->bo); + panfrost_batch_add_bo(batch, rsrc->bo, + PAN_BO_ACCESS_SHARED | + PAN_BO_ACCESS_READ | + PAN_BO_ACCESS_VERTEX_TILER); return rsrc->bo->gpu + offset; } else { /* Otherwise, we need to upload to transient memory */ const uint8_t *ibuf8 = (const uint8_t *) info->index.user; - return panfrost_upload_transient(ctx, ibuf8 + offset, info->count * info->index_size); + return panfrost_upload_transient(batch, ibuf8 + offset, info->count * info->index_size); } } @@ -1582,7 +1427,7 @@ uint32_t prims = u_prims_for_vertices(info->mode, info->count); ctx->prims_generated += prims; - if (ctx->streamout.num_targets <= 0) + if (!ctx->streamout.num_targets) return; ctx->tf_prims_generated += prims; @@ -1602,9 +1447,6 @@ if (panfrost_scissor_culls_everything(ctx)) return; - ctx->payloads[PIPE_SHADER_VERTEX].offset_start = info->start; - ctx->payloads[PIPE_SHADER_FRAGMENT].offset_start = info->start; - int mode = info->mode; /* Fallback unsupported restart index */ @@ -1618,8 +1460,10 @@ /* Fallback for unsupported modes */ + assert(ctx->rasterizer != NULL); + if (!(ctx->draw_modes & (1 << mode))) { - if (mode == PIPE_PRIM_QUADS && info->count == 4 && ctx->rasterizer && !ctx->rasterizer->base.flatshade) { + if (mode == PIPE_PRIM_QUADS && info->count == 4 && !ctx->rasterizer->base.flatshade) { mode = PIPE_PRIM_TRIANGLE_FAN; } else { if (info->count < 4) { @@ -1633,10 +1477,13 @@ } } + ctx->payloads[PIPE_SHADER_VERTEX].offset_start = info->start; + ctx->payloads[PIPE_SHADER_FRAGMENT].offset_start = info->start; + /* Now that we have a guaranteed terminating path, find the job. * Assignment commented out to prevent unused warning */ - /* struct panfrost_job *job = */ panfrost_get_job_for_fbo(ctx); + /* struct panfrost_batch *batch = */ panfrost_get_batch_for_fbo(ctx); ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.draw_mode = g2m_draw_mode(mode); @@ -1658,22 +1505,12 @@ if (info->primitive_restart) draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX; - /* For higher amounts of vertices (greater than what fits in a 16-bit - * short), the other value is needed, otherwise there will be bizarre - * rendering artefacts. It's not clear what these values mean yet. This - * change is also needed for instancing and sometimes points (perhaps - * related to dynamically setting gl_PointSize) */ - - bool is_points = mode == PIPE_PRIM_POINTS; - bool many_verts = ctx->vertex_count > 0xFFFF; - bool instanced = ctx->instance_count > 1; - - draw_flags |= (is_points || many_verts || instanced) ? 0x3000 : 0x18000; - - /* This doesn't make much sense */ - if (mode == PIPE_PRIM_LINE_STRIP) { - draw_flags |= 0x800; - } + /* These doesn't make much sense */ + + draw_flags |= 0x3000; + + if (ctx->rasterizer && ctx->rasterizer->base.flatshade_first) + draw_flags |= MALI_DRAW_FLATSHADE_FIRST; panfrost_statistics_record(ctx, info); @@ -1711,7 +1548,7 @@ ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.index_count = MALI_POSITIVE(ctx->vertex_count); /* Reverse index state */ - ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = (u64) NULL; + ctx->payloads[PIPE_SHADER_FRAGMENT].prefix.indices = (mali_ptr) 0; } /* Dispatch "compute jobs" for the vertex/tiler pair as (1, @@ -1728,26 +1565,18 @@ /* Encode the padded vertex count */ if (info->instance_count > 1) { - /* Triangles have non-even vertex counts so they change how - * padding works internally */ - - bool is_triangle = - mode == PIPE_PRIM_TRIANGLES || - mode == PIPE_PRIM_TRIANGLE_STRIP || - mode == PIPE_PRIM_TRIANGLE_FAN; - - struct pan_shift_odd so = - panfrost_padded_vertex_count(vertex_count, !is_triangle); + ctx->padded_count = panfrost_padded_vertex_count(vertex_count); - ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = so.shift; - ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift = so.shift; + unsigned shift = __builtin_ctz(ctx->padded_count); + unsigned k = ctx->padded_count >> (shift + 1); - ctx->payloads[PIPE_SHADER_VERTEX].instance_odd = so.odd; - ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd = so.odd; + ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = shift; + ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift = shift; - ctx->padded_count = pan_expand_shift_odd(so); + ctx->payloads[PIPE_SHADER_VERTEX].instance_odd = k; + ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd = k; } else { - ctx->padded_count = ctx->vertex_count; + ctx->padded_count = vertex_count; /* Reset instancing state */ ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = 0; @@ -1815,7 +1644,7 @@ ctx->rasterizer = hwcso; ctx->dirty |= PAN_DIRTY_RASTERIZER; - ctx->fragment_shader_core.depth_units = ctx->rasterizer->base.offset_units; + ctx->fragment_shader_core.depth_units = ctx->rasterizer->base.offset_units * 2.0f; ctx->fragment_shader_core.depth_factor = ctx->rasterizer->base.offset_scale; /* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */ @@ -1877,7 +1706,8 @@ static void * panfrost_create_shader_state( struct pipe_context *pctx, - const struct pipe_shader_state *cso) + const struct pipe_shader_state *cso, + enum pipe_shader_type stage) { struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants); so->base = *cso; @@ -1887,6 +1717,21 @@ if (cso->type == PIPE_SHADER_IR_TGSI) so->base.tokens = tgsi_dup_tokens(so->base.tokens); + /* Precompile for shader-db if we need to */ + if (unlikely((pan_debug & PAN_DBG_PRECOMPILE) && cso->type == PIPE_SHADER_IR_NIR)) { + struct panfrost_context *ctx = pan_context(pctx); + + struct mali_shader_meta meta; + struct panfrost_shader_state state; + uint64_t outputs_written; + + panfrost_shader_compile(ctx, &meta, + PIPE_SHADER_IR_NIR, + so->base.ir.nir, + tgsi_processor_to_shader_stage(stage), &state, + &outputs_written); + } + return so; } @@ -1903,9 +1748,10 @@ for (unsigned i = 0; i < cso->variant_count; ++i) { struct panfrost_shader_state *shader_state = &cso->variants[i]; - panfrost_bo_unreference(pctx->screen, shader_state->bo); + panfrost_bo_unreference(shader_state->bo); shader_state->bo = NULL; } + free(cso->variants); free(so); } @@ -1935,15 +1781,18 @@ .wrap_s = translate_tex_wrap(cso->wrap_s), .wrap_t = translate_tex_wrap(cso->wrap_t), .wrap_r = translate_tex_wrap(cso->wrap_r), - .compare_func = panfrost_translate_alt_compare_func(cso->compare_func), + .compare_func = panfrost_flip_compare_func( + panfrost_translate_compare_func( + cso->compare_func)), .border_color = { cso->border_color.f[0], cso->border_color.f[1], cso->border_color.f[2], cso->border_color.f[3] }, - .min_lod = FIXED_16(cso->min_lod), - .max_lod = FIXED_16(cso->max_lod), + .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */ + .max_lod = FIXED_16(cso->max_lod, false), + .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */ .seamless_cube_map = cso->seamless_cube_map, }; @@ -2052,7 +1901,7 @@ uint64_t outputs_written) { uint64_t so_outputs = 0; - uint8_t reverse_map[64] = {}; + uint8_t reverse_map[64] = {0}; unsigned slot = 0; while (outputs_written) @@ -2102,7 +1951,25 @@ if (variant == -1) { /* No variant matched, so create a new one */ variant = variants->variant_count++; - assert(variants->variant_count < MAX_SHADER_VARIANTS); + + if (variants->variant_count > variants->variant_space) { + unsigned old_space = variants->variant_space; + + variants->variant_space *= 2; + if (variants->variant_space == 0) + variants->variant_space = 1; + + /* Arbitrary limit to stop runaway programs from + * creating an unbounded number of shader variants. */ + assert(variants->variant_space < 1024); + + unsigned msize = sizeof(struct panfrost_shader_state); + variants->variants = realloc(variants->variants, + variants->variant_space * msize); + + memset(&variants->variants[old_space], 0, + (variants->variant_space - old_space) * msize); + } struct panfrost_shader_state *v = &variants->variants[variant]; @@ -2152,6 +2019,18 @@ } } +static void * +panfrost_create_vs_state(struct pipe_context *pctx, const struct pipe_shader_state *hwcso) +{ + return panfrost_create_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX); +} + +static void * +panfrost_create_fs_state(struct pipe_context *pctx, const struct pipe_shader_state *hwcso) +{ + return panfrost_create_shader_state(pctx, hwcso, PIPE_SHADER_FRAGMENT); +} + static void panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso) { @@ -2216,9 +2095,9 @@ switch (t) { case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return MALI_TEX_1D; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return MALI_TEX_1D; case PIPE_TEXTURE_2D: case PIPE_TEXTURE_2D_ARRAY: @@ -2237,6 +2116,21 @@ } } +static uint8_t +panfrost_compute_astc_stretch( + const struct util_format_description *desc) +{ + unsigned width = desc->block.width; + unsigned height = desc->block.height; + assert(width >= 4 && width <= 12); + assert(height >= 4 && height <= 12); + if (width == 12) + width = 11; + if (height == 12) + height = 11; + return ((height - 4) * 8) + (width - 4); +} + static struct pipe_sampler_view * panfrost_create_sampler_view( struct pipe_context *pctx, @@ -2260,7 +2154,6 @@ * (data) itself. So, we serialise the descriptor here and cache it for * later. */ - /* TODO: Detect from format better */ const struct util_format_description *desc = util_format_description(prsrc->base.format); unsigned char user_swizzle[4] = { @@ -2272,6 +2165,9 @@ enum mali_format format = panfrost_find_format(desc); + if (format == MALI_ASTC_HDR_SUPP || format == MALI_ASTC_SRGB_SUPP) + so->astc_stretch = panfrost_compute_astc_stretch(desc); + /* Check if we need to set a custom stride by computing the "expected" * stride and comparing it to what the BO actually wants. Only applies * to linear textures, since tiled/compressed textures have strict @@ -2310,19 +2206,18 @@ .depth = MALI_POSITIVE(u_minify(texture->depth0, first_level)), .array_size = MALI_POSITIVE(array_size), - /* TODO: Decode */ .format = { .swizzle = panfrost_translate_swizzle_4(desc->swizzle), .format = format, - .srgb = desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB, .type = panfrost_translate_texture_type(template->target), + .unknown2 = 0x1, }, .swizzle = panfrost_translate_swizzle_4(user_swizzle) }; - texture_descriptor.nr_mipmap_levels = last_level - first_level; + texture_descriptor.levels = last_level - first_level; so->hw = texture_descriptor; @@ -2337,17 +2232,23 @@ struct pipe_sampler_view **views) { struct panfrost_context *ctx = pan_context(pctx); + unsigned new_nr = 0; + unsigned i; assert(start_slot == 0); - unsigned new_nr = 0; - for (unsigned i = 0; i < num_views; ++i) { + for (i = 0; i < num_views; ++i) { if (views[i]) new_nr = i + 1; + pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i], + views[i]); } + for (; i < ctx->sampler_view_count[shader]; i++) { + pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i], + NULL); + } ctx->sampler_view_count[shader] = new_nr; - memcpy(ctx->sampler_views[shader], views, num_views * sizeof (void *)); ctx->dirty |= PAN_DIRTY_TEXTURES; } @@ -2407,50 +2308,10 @@ { struct panfrost_context *ctx = pan_context(pctx); - /* Flush when switching framebuffers, but not if the framebuffer - * state is being restored by u_blitter - */ - - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - bool is_scanout = panfrost_is_scanout(ctx); - bool has_draws = job->last_job.gpu; - - /* Bail out early when the current and new states are the same. */ - if (util_framebuffer_state_equal(&ctx->pipe_framebuffer, fb)) - return; - - /* The wallpaper logic sets a new FB state before doing the blit and - * restore the old one when it's done. Those FB states are reported to - * be different because the surface they are pointing to are different, - * but those surfaces actually point to the same cbufs/zbufs. In that - * case we definitely don't want new FB descs to be emitted/attached - * since the job is expected to be flushed just after the blit is done, - * so let's just copy the new state and return here. - */ - if (ctx->wallpaper_batch) { - util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb); - return; - } - - if (!is_scanout || has_draws) - panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME); - else - assert(!ctx->payloads[PIPE_SHADER_VERTEX].postfix.framebuffer && - !ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.framebuffer); - - /* Invalidate the FBO job cache since we've just been assigned a new - * FB state. - */ - ctx->job = NULL; - + panfrost_hint_afbc(pan_screen(pctx->screen), fb); util_copy_framebuffer_state(&ctx->pipe_framebuffer, fb); - - /* Given that we're rendering, we'd love to have compression */ - struct panfrost_screen *screen = pan_screen(ctx->base.screen); - - panfrost_hint_afbc(screen, &ctx->pipe_framebuffer); - for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) - ctx->payloads[i].postfix.framebuffer = 0; + ctx->batch = NULL; + panfrost_invalidate_frame(ctx); } static void * @@ -2493,7 +2354,8 @@ ctx->fragment_shader_core.stencil_mask_back = depth_stencil->stencil[back_index].writemask; /* Depth state (TODO: Refactor) */ - SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_DEPTH_TEST, depth_stencil->depth.enabled); + SET_BIT(ctx->fragment_shader_core.unknown2_3, MALI_DEPTH_WRITEMASK, + depth_stencil->depth.writemask); int func = depth_stencil->depth.enabled ? depth_stencil->depth.func : PIPE_FUNC_ALWAYS; @@ -2572,7 +2434,6 @@ panfrost_destroy(struct pipe_context *pipe) { struct panfrost_context *panfrost = pan_context(pipe); - struct panfrost_screen *screen = pan_screen(pipe->screen); if (panfrost->blitter) util_blitter_destroy(panfrost->blitter); @@ -2580,9 +2441,8 @@ if (panfrost->blitter_wallpaper) util_blitter_destroy(panfrost->blitter_wallpaper); - panfrost_drm_free_slab(screen, &panfrost->scratchpad); - panfrost_drm_free_slab(screen, &panfrost->tiler_heap); - panfrost_drm_free_slab(screen, &panfrost->tiler_dummy); + util_unreference_framebuffer_state(&panfrost->pipe_framebuffer); + u_upload_destroy(pipe->stream_uploader); ralloc_free(pipe); } @@ -2603,6 +2463,13 @@ static void panfrost_destroy_query(struct pipe_context *pipe, struct pipe_query *q) { + struct panfrost_query *query = (struct panfrost_query *) q; + + if (query->bo) { + panfrost_bo_unreference(query->bo); + query->bo = NULL; + } + ralloc_free(q); } @@ -2616,8 +2483,15 @@ case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - /* Allocate a word for the query results to be stored */ - query->transfer = panfrost_allocate_transient(ctx, sizeof(unsigned)); + /* Allocate a bo for the query results to be stored */ + if (!query->bo) { + query->bo = panfrost_bo_create( + pan_screen(ctx->base.screen), + sizeof(unsigned), 0); + } + + unsigned *result = (unsigned *)query->bo->cpu; + *result = 0; /* Default to 0 if nothing at all drawn. */ ctx->occlusion_query = query; break; @@ -2632,7 +2506,7 @@ break; default: - fprintf(stderr, "Skipping query %d\n", query->type); + fprintf(stderr, "Skipping query %u\n", query->type); break; } @@ -2669,6 +2543,7 @@ union pipe_query_result *vresult) { struct panfrost_query *query = (struct panfrost_query *) q; + struct panfrost_context *ctx = pan_context(pipe); switch (query->type) { @@ -2676,10 +2551,10 @@ case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: /* Flush first */ - panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME); + panfrost_flush_all_batches(ctx, true); /* Read back the query results */ - unsigned *result = (unsigned *) query->transfer.cpu; + unsigned *result = (unsigned *) query->bo->cpu; unsigned passed = *result; if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) { @@ -2692,12 +2567,12 @@ case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_PRIMITIVES_EMITTED: - panfrost_flush(pipe, NULL, PIPE_FLUSH_END_OF_FRAME); + panfrost_flush_all_batches(ctx, true); vresult->u64 = query->end - query->start; break; default: - DBG("Skipped query get %d\n", query->type); + DBG("Skipped query get %u\n", query->type); break; } @@ -2759,30 +2634,12 @@ so->num_targets = num_targets; } -static void -panfrost_setup_hardware(struct panfrost_context *ctx) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - - panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0); - panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128); - panfrost_drm_allocate_slab(screen, &ctx->tiler_dummy, 1, false, PAN_ALLOCATE_INVISIBLE, 0, 0); -} - -/* New context creation, which also does hardware initialisation since I don't - * know the better way to structure this :smirk: */ - struct pipe_context * panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) { struct panfrost_context *ctx = rzalloc(screen, struct panfrost_context); - struct panfrost_screen *pscreen = pan_screen(screen); - memset(ctx, 0, sizeof(*ctx)); struct pipe_context *gallium = (struct pipe_context *) ctx; - ctx->is_t6xx = pscreen->gpu_id < 0x0700; /* Literally, "earlier than T700" */ - gallium->screen = screen; gallium->destroy = panfrost_destroy; @@ -2811,11 +2668,11 @@ gallium->bind_vertex_elements_state = panfrost_bind_vertex_elements_state; gallium->delete_vertex_elements_state = panfrost_generic_cso_delete; - gallium->create_fs_state = panfrost_create_shader_state; + gallium->create_fs_state = panfrost_create_fs_state; gallium->delete_fs_state = panfrost_delete_shader_state; gallium->bind_fs_state = panfrost_bind_fs_state; - gallium->create_vs_state = panfrost_create_shader_state; + gallium->create_vs_state = panfrost_create_vs_state; gallium->delete_vs_state = panfrost_delete_shader_state; gallium->bind_vs_state = panfrost_bind_vs_state; @@ -2849,10 +2706,6 @@ panfrost_blend_context_init(gallium); panfrost_compute_context_init(gallium); - panfrost_drm_init_context(ctx); - - panfrost_setup_hardware(ctx); - /* XXX: leaks */ gallium->stream_uploader = u_upload_create_default(gallium); gallium->const_uploader = gallium->stream_uploader; @@ -2871,9 +2724,8 @@ /* Prepare for render! */ - panfrost_job_init(ctx); + panfrost_batch_init(ctx); panfrost_emit_vertex_payload(ctx); - panfrost_emit_tiler_payload(ctx); panfrost_invalidate_frame(ctx); panfrost_default_shader_backend(ctx); diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_context.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_context.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,7 @@ #include "pan_resource.h" #include "pan_job.h" #include "pan_blend.h" +#include "pan_encoder.h" #include "pipe/p_compiler.h" #include "pipe/p_config.h" @@ -79,21 +80,19 @@ unsigned type; unsigned index; - union { - /* For computed queries. 64-bit to prevent overflow */ - struct { - uint64_t start; - uint64_t end; - }; - - /* Memory for the GPU to writeback the value of the query */ - struct panfrost_transfer transfer; + /* For computed queries. 64-bit to prevent overflow */ + struct { + uint64_t start; + uint64_t end; }; + + /* Memory for the GPU to writeback the value of the query */ + struct panfrost_bo *bo; }; struct panfrost_fence { struct pipe_reference reference; - int fd; + struct util_dynarray syncfds; }; struct panfrost_streamout { @@ -106,15 +105,12 @@ /* Gallium context */ struct pipe_context base; - /* Compiler context */ - struct midgard_screen compiler; - - /* Bound job and map of panfrost_job_key to jobs */ - struct panfrost_job *job; - struct hash_table *jobs; + /* Bound job batch and map of panfrost_batch_key to job batches */ + struct panfrost_batch *batch; + struct hash_table *batches; - /* panfrost_resource -> panfrost_job */ - struct hash_table *write_jobs; + /* panfrost_bo -> panfrost_bo_access */ + struct hash_table *accessed_bos; /* Within a launch_grid call.. */ const struct pipe_grid_info *compute_grid; @@ -125,12 +121,6 @@ struct pipe_framebuffer_state pipe_framebuffer; struct panfrost_streamout streamout; - struct panfrost_memory cmdstream_persistent; - struct panfrost_memory scratchpad; - struct panfrost_memory tiler_heap; - struct panfrost_memory tiler_dummy; - struct panfrost_memory depth_stencil_buffer; - bool active_queries; uint64_t prims_generated; uint64_t tf_prims_generated; @@ -156,8 +146,6 @@ * it is disabled, just equal to plain vertex count */ unsigned padded_count; - union mali_attr attributes[PIPE_MAX_ATTRIBS]; - /* TODO: Multiple uniform buffers (index =/= 0), finer updates? */ struct panfrost_constant_buffer constant_buffer[PIPE_SHADER_TYPES]; @@ -188,7 +176,7 @@ * errors due to unsupported reucrsion */ struct blitter_context *blitter_wallpaper; - struct panfrost_job *wallpaper_batch; + struct panfrost_batch *wallpaper_batch; struct panfrost_blend_state *blend; @@ -197,11 +185,6 @@ struct pipe_blend_color blend_color; struct pipe_depth_stencil_alpha_state *depth_stencil; struct pipe_stencil_ref stencil_ref; - - /* True for t6XX, false for t8xx. */ - bool is_t6xx; - - uint32_t out_sync; }; /* Corresponds to the CSO */ @@ -216,7 +199,6 @@ /* Variants bundle together to form the backing CSO, bundling multiple * shaders with varying emulated features baked in (alpha test * parameters, etc) */ -#define MAX_SHADER_VARIANTS 8 /* A shader state corresponds to the actual, current variant of the shader */ struct panfrost_shader_state { @@ -230,6 +212,8 @@ bool writes_point_size; bool reads_point_coord; bool reads_face; + bool reads_frag_coord; + unsigned stack_size; struct mali_attr_meta varyings[PIPE_MAX_ATTRIBS]; gl_varying_slot varyings_loc[PIPE_MAX_ATTRIBS]; @@ -263,7 +247,9 @@ struct pipe_compute_state cbase; }; - struct panfrost_shader_state variants[MAX_SHADER_VARIANTS]; + struct panfrost_shader_state *variants; + unsigned variant_space; + unsigned variant_count; /* The current active variant */ @@ -287,6 +273,7 @@ struct panfrost_sampler_view { struct pipe_sampler_view base; struct mali_texture_descriptor hw; + uint8_t astc_stretch; bool manual_stride; }; @@ -300,6 +287,9 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags); void +panfrost_invalidate_frame(struct panfrost_context *ctx); + +void panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data); struct panfrost_transfer @@ -314,20 +304,21 @@ struct pipe_fence_handle **fence, unsigned flags); -bool -panfrost_is_scanout(struct panfrost_context *ctx); +mali_ptr panfrost_sfbd_fragment(struct panfrost_batch *batch, bool has_draws); +mali_ptr panfrost_mfbd_fragment(struct panfrost_batch *batch, bool has_draws); -mali_ptr panfrost_sfbd_fragment(struct panfrost_context *ctx, bool has_draws); -mali_ptr panfrost_mfbd_fragment(struct panfrost_context *ctx, bool has_draws); +void +panfrost_attach_mfbd(struct panfrost_batch *batch, unsigned vertex_count); -struct bifrost_framebuffer -panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count); +void +panfrost_attach_sfbd(struct panfrost_batch *batch, unsigned vertex_count); -struct mali_single_framebuffer -panfrost_emit_sfbd(struct panfrost_context *ctx, unsigned vertex_count); +struct midgard_tiler_descriptor +panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count); mali_ptr -panfrost_fragment_job(struct panfrost_context *ctx, bool has_draws); +panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws, + struct mali_job_descriptor_header **header_cpu); void panfrost_shader_compile( @@ -339,48 +330,13 @@ struct panfrost_shader_state *state, uint64_t *outputs_written); -void -panfrost_pack_work_groups_compute( - struct mali_vertex_tiler_prefix *out, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z); - -void -panfrost_pack_work_groups_fused( - struct mali_vertex_tiler_prefix *vertex, - struct mali_vertex_tiler_prefix *tiler, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z); - /* Instancing */ mali_ptr panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i); void -panfrost_emit_vertex_data(struct panfrost_job *batch); - -struct pan_shift_odd { - unsigned shift; - unsigned odd; -}; - -struct pan_shift_odd -panfrost_padded_vertex_count( - unsigned vertex_count, - bool primitive_pot); - - -unsigned -pan_expand_shift_odd(struct pan_shift_odd o); +panfrost_emit_vertex_data(struct panfrost_batch *batch); /* Compute */ diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_drm.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_drm.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_drm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_drm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,438 +0,0 @@ -/* - * © Copyright 2019 Collabora, Ltd. - * Copyright 2019 Alyssa Rosenzweig - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include - -#include "drm-uapi/panfrost_drm.h" - -#include "util/u_memory.h" -#include "util/os_time.h" -#include "os/os_mman.h" - -#include "pan_screen.h" -#include "pan_resource.h" -#include "pan_context.h" -#include "pan_util.h" -#include "pandecode/decode.h" - -void -panfrost_drm_mmap_bo(struct panfrost_screen *screen, struct panfrost_bo *bo) -{ - struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle }; - int ret; - - if (bo->cpu) - return; - - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); - assert(0); - } - - bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, - screen->fd, mmap_bo.offset); - if (bo->cpu == MAP_FAILED) { - fprintf(stderr, "mmap failed: %p %m\n", bo->cpu); - assert(0); - } - - /* Record the mmap if we're tracing */ - if (pan_debug & PAN_DBG_TRACE) - pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL); -} - -static void -panfrost_drm_munmap_bo(struct panfrost_screen *screen, struct panfrost_bo *bo) -{ - if (!bo->cpu) - return; - - if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) { - perror("munmap"); - abort(); - } - - bo->cpu = NULL; -} - -struct panfrost_bo * -panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size, - uint32_t flags) -{ - struct panfrost_bo *bo; - - /* Kernel will fail (confusingly) with EPERM otherwise */ - assert(size > 0); - - /* To maximize BO cache usage, don't allocate tiny BOs */ - size = MAX2(size, 4096); - - /* GROWABLE BOs cannot be mmapped */ - if (flags & PAN_ALLOCATE_GROWABLE) - assert(flags & PAN_ALLOCATE_INVISIBLE); - - unsigned translated_flags = 0; - - if (screen->kernel_version->version_major > 1 || - screen->kernel_version->version_minor >= 1) { - if (flags & PAN_ALLOCATE_GROWABLE) - translated_flags |= PANFROST_BO_HEAP; - if (!(flags & PAN_ALLOCATE_EXECUTE)) - translated_flags |= PANFROST_BO_NOEXEC; - } - - struct drm_panfrost_create_bo create_bo = { - .size = size, - .flags = translated_flags, - }; - - /* Before creating a BO, we first want to check the cache */ - - bo = panfrost_bo_cache_fetch(screen, size, flags); - - if (bo == NULL) { - /* Otherwise, the cache misses and we need to allocate a BO fresh from - * the kernel */ - - int ret; - - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); - assert(0); - } - - /* We have a BO allocated from the kernel; fill in the userspace - * version */ - - bo = rzalloc(screen, struct panfrost_bo); - bo->size = create_bo.size; - bo->gpu = create_bo.offset; - bo->gem_handle = create_bo.handle; - bo->flags = flags; - } - - /* Only mmap now if we know we need to. For CPU-invisible buffers, we - * never map since we don't care about their contents; they're purely - * for GPU-internal use. But we do trace them anyway. */ - - if (!(flags & (PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_DELAY_MMAP))) - panfrost_drm_mmap_bo(screen, bo); - else if (flags & PAN_ALLOCATE_INVISIBLE) { - if (pan_debug & PAN_DBG_TRACE) - pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL); - } - - pipe_reference_init(&bo->reference, 1); - return bo; -} - -void -panfrost_drm_release_bo(struct panfrost_screen *screen, struct panfrost_bo *bo, bool cacheable) -{ - struct drm_gem_close gem_close = { .handle = bo->gem_handle }; - int ret; - - if (!bo) - return; - - /* Rather than freeing the BO now, we'll cache the BO for later - * allocations if we're allowed to */ - - panfrost_drm_munmap_bo(screen, bo); - - if (cacheable) { - bool cached = panfrost_bo_cache_put(screen, bo); - - if (cached) - return; - } - - /* Otherwise, if the BO wasn't cached, we'll legitimately free the BO */ - - ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); - if (ret) { - fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); - assert(0); - } - - ralloc_free(bo); -} - -void -panfrost_drm_allocate_slab(struct panfrost_screen *screen, - struct panfrost_memory *mem, - size_t pages, - bool same_va, - int extra_flags, - int commit_count, - int extent) -{ - // TODO cache allocations - // TODO properly handle errors - // TODO take into account extra_flags - mem->bo = panfrost_drm_create_bo(screen, pages * 4096, extra_flags); - mem->stack_bottom = 0; -} - -void -panfrost_drm_free_slab(struct panfrost_screen *screen, struct panfrost_memory *mem) -{ - panfrost_bo_unreference(&screen->base, mem->bo); - mem->bo = NULL; -} - -struct panfrost_bo * -panfrost_drm_import_bo(struct panfrost_screen *screen, int fd) -{ - struct panfrost_bo *bo = rzalloc(screen, struct panfrost_bo); - struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; - ASSERTED int ret; - unsigned gem_handle; - - ret = drmPrimeFDToHandle(screen->fd, fd, &gem_handle); - assert(!ret); - - get_bo_offset.handle = gem_handle; - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); - assert(!ret); - - bo->gem_handle = gem_handle; - bo->gpu = (mali_ptr) get_bo_offset.offset; - bo->size = lseek(fd, 0, SEEK_END); - assert(bo->size > 0); - pipe_reference_init(&bo->reference, 1); - - // TODO map and unmap on demand? - panfrost_drm_mmap_bo(screen, bo); - return bo; -} - -int -panfrost_drm_export_bo(struct panfrost_screen *screen, const struct panfrost_bo *bo) -{ - struct drm_prime_handle args = { - .handle = bo->gem_handle, - .flags = DRM_CLOEXEC, - }; - - int ret = drmIoctl(screen->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); - if (ret == -1) - return -1; - - return args.fd; -} - -static int -panfrost_drm_submit_job(struct panfrost_context *ctx, u64 job_desc, int reqs) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - struct drm_panfrost_submit submit = {0,}; - int *bo_handles, ret; - - submit.in_syncs = (u64) (uintptr_t) &ctx->out_sync; - submit.in_sync_count = 1; - - submit.out_sync = ctx->out_sync; - - submit.jc = job_desc; - submit.requirements = reqs; - - bo_handles = calloc(job->bos->entries, sizeof(*bo_handles)); - assert(bo_handles); - - set_foreach(job->bos, entry) { - struct panfrost_bo *bo = (struct panfrost_bo *)entry->key; - assert(bo->gem_handle > 0); - bo_handles[submit.bo_handle_count++] = bo->gem_handle; - } - - submit.bo_handles = (u64) (uintptr_t) bo_handles; - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); - free(bo_handles); - if (ret) { - fprintf(stderr, "Error submitting: %m\n"); - return errno; - } - - /* Trace the job if we're doing that */ - if (pan_debug & PAN_DBG_TRACE) { - /* Wait so we can get errors reported back */ - drmSyncobjWait(screen->fd, &ctx->out_sync, 1, INT64_MAX, 0, NULL); - pandecode_jc(submit.jc, FALSE); - } - - return 0; -} - -int -panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool is_scanout) -{ - int ret = 0; - - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - - /* TODO: Add here the transient pools */ - panfrost_job_add_bo(job, ctx->scratchpad.bo); - panfrost_job_add_bo(job, ctx->tiler_heap.bo); - panfrost_job_add_bo(job, job->polygon_list); - - if (job->first_job.gpu) { - ret = panfrost_drm_submit_job(ctx, job->first_job.gpu, 0); - assert(!ret); - } - - if (job->first_tiler.gpu || job->clear) { - ret = panfrost_drm_submit_job(ctx, panfrost_fragment_job(ctx, has_draws), PANFROST_JD_REQ_FS); - assert(!ret); - } - - return ret; -} - -static struct panfrost_fence * -panfrost_fence_create(struct panfrost_context *ctx) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - struct panfrost_fence *f = calloc(1, sizeof(*f)); - if (!f) - return NULL; - - /* Snapshot the last Panfrost's rendering's out fence. We'd rather have - * another syncobj instead of a sync file, but this is all we get. - * (HandleToFD/FDToHandle just gives you another syncobj ID for the - * same syncobj). - */ - drmSyncobjExportSyncFile(screen->fd, ctx->out_sync, &f->fd); - if (f->fd == -1) { - fprintf(stderr, "export failed: %m\n"); - free(f); - return NULL; - } - - pipe_reference_init(&f->reference, 1); - - return f; -} - -void -panfrost_drm_force_flush_fragment(struct panfrost_context *ctx, - struct pipe_fence_handle **fence) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - - if (!screen->last_fragment_flushed) { - drmSyncobjWait(screen->fd, &ctx->out_sync, 1, INT64_MAX, 0, NULL); - screen->last_fragment_flushed = true; - - /* The job finished up, so we're safe to clean it up now */ - panfrost_free_job(ctx, screen->last_job); - } - - if (fence) { - struct panfrost_fence *f = panfrost_fence_create(ctx); - gallium->screen->fence_reference(gallium->screen, fence, NULL); - *fence = (struct pipe_fence_handle *)f; - } -} - -unsigned -panfrost_drm_query_gpu_version(struct panfrost_screen *screen) -{ - struct drm_panfrost_get_param get_param = {0,}; - ASSERTED int ret; - - get_param.param = DRM_PANFROST_PARAM_GPU_PROD_ID; - ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); - assert(!ret); - - return get_param.value; -} - -int -panfrost_drm_init_context(struct panfrost_context *ctx) -{ - struct pipe_context *gallium = (struct pipe_context *) ctx; - struct panfrost_screen *screen = pan_screen(gallium->screen); - - return drmSyncobjCreate(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED, - &ctx->out_sync); -} - -void -panfrost_drm_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) -{ - struct panfrost_fence **p = (struct panfrost_fence **)ptr; - struct panfrost_fence *f = (struct panfrost_fence *)fence; - struct panfrost_fence *old = *p; - - if (pipe_reference(&(*p)->reference, &f->reference)) { - close(old->fd); - free(old); - } - *p = f; -} - -boolean -panfrost_drm_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout) -{ - struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_fence *f = (struct panfrost_fence *)fence; - int ret; - - unsigned syncobj; - ret = drmSyncobjCreate(screen->fd, 0, &syncobj); - if (ret) { - fprintf(stderr, "Failed to create syncobj to wait on: %m\n"); - return false; - } - - drmSyncobjImportSyncFile(screen->fd, syncobj, f->fd); - if (ret) { - fprintf(stderr, "Failed to import fence to syncobj: %m\n"); - return false; - } - - uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); - if (abs_timeout == OS_TIMEOUT_INFINITE) - abs_timeout = INT64_MAX; - - ret = drmSyncobjWait(screen->fd, &syncobj, 1, abs_timeout, 0, NULL); - - drmSyncobjDestroy(screen->fd, syncobj); - - return ret >= 0; -} diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_format.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_format.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -174,6 +174,8 @@ case PIPE_FORMAT_R10G10B10A2_UINT: case PIPE_FORMAT_B10G10R10A2_UINT: + case PIPE_FORMAT_R10G10B10A2_USCALED: + case PIPE_FORMAT_B10G10R10A2_USCALED: return MALI_RGB10_A2UI; case PIPE_FORMAT_R10G10B10A2_SSCALED: @@ -182,8 +184,13 @@ case PIPE_FORMAT_Z32_UNORM: case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: return MALI_Z32_UNORM; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Z32F = R32F to the hardware */ + return MALI_R32F; + case PIPE_FORMAT_B5G6R5_UNORM: return MALI_RGB565; @@ -210,11 +217,41 @@ case PIPE_FORMAT_R9G9B9E5_FLOAT: return MALI_R9F_G9F_B9F_E5F; + case PIPE_FORMAT_ETC1_RGB8: + case PIPE_FORMAT_ETC2_RGB8: + case PIPE_FORMAT_ETC2_SRGB8: + return MALI_ETC2_RGB8; + + case PIPE_FORMAT_ETC2_RGB8A1: + case PIPE_FORMAT_ETC2_SRGB8A1: + return MALI_ETC2_RGB8A1; + + case PIPE_FORMAT_ETC2_RGBA8: + case PIPE_FORMAT_ETC2_SRGBA8: + return MALI_ETC2_RGBA8; + + case PIPE_FORMAT_ETC2_R11_UNORM: + return MALI_ETC2_R11_UNORM; + case PIPE_FORMAT_ETC2_R11_SNORM: + return MALI_ETC2_R11_SNORM; + + case PIPE_FORMAT_ETC2_RG11_UNORM: + return MALI_ETC2_RG11_UNORM; + case PIPE_FORMAT_ETC2_RG11_SNORM: + return MALI_ETC2_RG11_SNORM; + default: /* Fallthrough to default */ break; } + if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + return MALI_ASTC_SRGB_SUPP; + else + return MALI_ASTC_HDR_SUPP; + } + /* Formats must match in channel count */ assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); unsigned format = MALI_NR_CHANNELS(desc->nr_channels); @@ -247,4 +284,39 @@ return (enum mali_format) format; } +void +panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) +{ + /* First, default to all zeroes to prevent uninitialized junk */ + + for (unsigned c = 0; c < 4; ++c) + out[c] = PIPE_SWIZZLE_0; + + /* Now "do" what the swizzle says */ + + for (unsigned c = 0; c < 4; ++c) { + unsigned char i = in[c]; + + /* Who cares? */ + assert(PIPE_SWIZZLE_X == 0); + if (i > PIPE_SWIZZLE_W) + continue; + + /* Invert */ + unsigned idx = i - PIPE_SWIZZLE_X; + out[idx] = PIPE_SWIZZLE_X + c; + } +} +/* Is a format encoded like Z24S8 and therefore compatible for render? */ +bool +panfrost_is_z24s8_variant(enum pipe_format fmt) +{ + switch (fmt) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + return true; + default: + return false; + } +} diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_format.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_format.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_format.h 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #define __PAN_FORMAT_H__ #include "pan_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" unsigned panfrost_translate_swizzle_4(const unsigned char swizzle[4]); @@ -37,6 +37,12 @@ enum mali_format panfrost_find_format(const struct util_format_description *desc); +void +panfrost_invert_swizzle(const unsigned char *in, unsigned char *out); + +bool +panfrost_is_z24s8_variant(enum pipe_format fmt); + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_fragment.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_fragment.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_fragment.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_fragment.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,14 +25,15 @@ #include "pan_context.h" #include "pan_util.h" #include "pan_format.h" +#include "panfrost-quirks.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /* Mark a surface as written */ static void panfrost_initialize_surface( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct pipe_surface *surf) { if (!surf) @@ -42,28 +43,25 @@ struct panfrost_resource *rsrc = pan_resource(surf->texture); rsrc->slices[level].initialized = true; - - assert(rsrc->bo); - panfrost_job_add_bo(batch, rsrc->bo); } /* Generate a fragment job. This should be called once per frame. (According to * presentations, this is supposed to correspond to eglSwapBuffers) */ mali_ptr -panfrost_fragment_job(struct panfrost_context *ctx, bool has_draws) +panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws, + struct mali_job_descriptor_header **header_cpu) { - struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen); - mali_ptr framebuffer = screen->require_sfbd ? - panfrost_sfbd_fragment(ctx, has_draws) : - panfrost_mfbd_fragment(ctx, has_draws); + mali_ptr framebuffer = (screen->quirks & MIDGARD_SFBD) ? + panfrost_sfbd_fragment(batch, has_draws) : + panfrost_mfbd_fragment(batch, has_draws); /* Mark the affected buffers as initialized, since we're writing to it. * Also, add the surfaces we're writing to to the batch */ - struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct pipe_framebuffer_state *fb = &batch->key; for (unsigned i = 0; i < fb->nr_cbufs; ++i) { panfrost_initialize_surface(batch, fb->cbufs[i]); @@ -78,8 +76,6 @@ .job_descriptor_size = 1 }; - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - /* The passed tile coords can be out of range in some cases, so we need * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT. * Theoretically we also need to clamp the coordinates positive, but we @@ -91,27 +87,24 @@ * But that can't happen if any actual drawing occurs (beyond a * wallpaper reload), so this is again irrelevant in practice. */ - job->maxx = MIN2(job->maxx, fb->width); - job->maxy = MIN2(job->maxy, fb->height); + batch->maxx = MIN2(batch->maxx, fb->width); + batch->maxy = MIN2(batch->maxy, fb->height); /* Rendering region must be at least 1x1; otherwise, there is nothing * to do and the whole job chain should have been discarded. */ - assert(job->maxx > job->minx); - assert(job->maxy > job->miny); + assert(batch->maxx > batch->minx); + assert(batch->maxy > batch->miny); struct mali_payload_fragment payload = { - .min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(job->minx, job->miny), - .max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(job->maxx, job->maxy), + .min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(batch->minx, batch->miny), + .max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(batch->maxx, batch->maxy), .framebuffer = framebuffer, }; - /* Normally, there should be no padding. However, fragment jobs are - * shared with 64-bit Bifrost systems, and accordingly there is 4-bytes - * of zero padding in between. */ - - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(header) + sizeof(payload)); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(header) + sizeof(payload)); memcpy(transfer.cpu, &header, sizeof(header)); memcpy(transfer.cpu + sizeof(header), &payload, sizeof(payload)); + *header_cpu = (struct mali_job_descriptor_header *)transfer.cpu; return transfer.gpu; } diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_instancing.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_instancing.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_instancing.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_instancing.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,344 +0,0 @@ -/* - * Copyright (C) 2018-2019 Alyssa Rosenzweig - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include "pan_context.h" - -/* See mali_job for notes on how this works. But basically, for small vertex - * counts, we have a lookup table, and for large vertex counts, we look at the - * high bits as a heuristic. This has to match exactly how the hardware - * calculates this (which is why the algorithm is so weird) or else instancing - * will break. */ - -/* Given an odd number (of the form 2k + 1), compute k */ -#define ODD(odd) ((odd - 1) >> 1) - -/* Given the shift/odd pair, recover the original padded integer */ - -unsigned -pan_expand_shift_odd(struct pan_shift_odd o) -{ - unsigned odd = 2*o.odd + 1; - unsigned shift = 1 << o.shift; - return odd * shift; -} - -static inline struct pan_shift_odd -pan_factored(unsigned pot, unsigned odd) -{ - struct pan_shift_odd out; - - assert(util_is_power_of_two_or_zero(pot)); - assert(odd & 1); - - /* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1. - * - * So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1) - * = k | 0 = k */ - - out.odd = (odd >> 1); - - /* POT is the form (1 << shift) */ - out.shift = __builtin_ctz(pot); - - return out; -} - - -/* For small vertices. Second argument is whether the primitive takes a - * power-of-two argument, which determines how rounding works. True for POINTS - * and LINES, false for TRIANGLES. Presumably true for QUADS but you'd be crazy - * to try instanced quads on ES class hardware <3 */ - -static struct { - unsigned pot; - unsigned odd; -} small_lut[] = { - { 0, 1 }, - { 1, 1 }, - { 2, 1 }, - { 1, 3 }, - { 4, 1 }, - { 1, 5 }, - { 2, 3 }, - { 1, 7 }, - { 8, 1 }, - { 1, 9 }, - { 2, 5 }, - { 4, 3 }, /* 11 */ - { 4, 3 }, - { 2, 7 }, /* 13 */ - { 2, 7 }, - { 16, 1 }, /* 15 */ - { 16, 1 }, - { 2, 9 }, - { 4, 5 }, /* 20 */ - { 4, 5 } -}; - -static struct pan_shift_odd -panfrost_small_padded_vertex_count(unsigned idx) -{ - return pan_factored( - small_lut[idx].pot, - small_lut[idx].odd); -} - -static struct pan_shift_odd -panfrost_large_padded_vertex_count(uint32_t vertex_count) -{ - struct pan_shift_odd out = { 0 }; - - /* First, we have to find the highest set one */ - unsigned highest = 32 - __builtin_clz(vertex_count); - - /* Using that, we mask out the highest 4-bits */ - unsigned n = highest - 4; - unsigned nibble = (vertex_count >> n) & 0xF; - - /* Great, we have the nibble. Now we can just try possibilities. Note - * that we don't care about the bottom most bit in most cases, and we - * know the top bit must be 1 */ - - unsigned middle_two = (nibble >> 1) & 0x3; - - switch (middle_two) { - case 0b00: - if (nibble & 1) - return pan_factored(1 << n, 9); - else - return pan_factored(1 << (n + 1), 5); - case 0b01: - return pan_factored(1 << (n + 2), 3); - case 0b10: - return pan_factored(1 << (n + 1), 7); - case 0b11: - return pan_factored(1 << (n + 4), 1); - default: - unreachable("Invalid two bits"); - } - - return out; -} - -struct pan_shift_odd -panfrost_padded_vertex_count( - unsigned vertex_count, - bool pot) -{ - assert(vertex_count > 0); - - if (vertex_count < 20) { - /* Add an off-by-one if it won't align naturally (quirk of the hardware) */ - //if (!pot) - // vertex_count++; - - return panfrost_small_padded_vertex_count(vertex_count); - } else - return panfrost_large_padded_vertex_count(vertex_count); -} - -/* The much, much more irritating case -- instancing is enabled. See - * panfrost_job.h for notes on how this works */ - -static unsigned -panfrost_vertex_instanced( - struct panfrost_job *batch, - struct panfrost_resource *rsrc, - unsigned divisor, - union mali_attr *attrs, - mali_ptr addr, - unsigned vertex_count, - unsigned instance_count) -{ - /* First, grab the padded vertex count */ - - struct pan_shift_odd o = { - .shift = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift, - .odd = batch->ctx->payloads[PIPE_SHADER_FRAGMENT].instance_odd, - }; - - unsigned padded_count = batch->ctx->padded_count; - - /* Depending if there is an instance divisor or not, packing varies. - * When there is a divisor, the hardware-level divisor is actually the - * product of the instance divisor and the padded count */ - - unsigned hw_divisor = padded_count * divisor; - - if (divisor == 0) { - /* Per-vertex attributes use the MODULO mode. First, compute - * the modulus */ - - attrs->elements |= MALI_ATTR_MODULO; - attrs->shift = o.shift; - attrs->extra_flags = o.odd; - - return 1; - } else if (util_is_power_of_two_or_zero(hw_divisor)) { - /* If there is a divisor but the hardware divisor works out to - * a power of two (not terribly exceptional), we can use an - * easy path (just shifting) */ - - attrs->elements |= MALI_ATTR_POT_DIVIDE; - attrs->shift = __builtin_ctz(hw_divisor); - - return 1; - } else { - /* We have a NPOT divisor. Here's the fun one (multipling by - * the inverse and shifting) */ - - /* floor(log2(d)) */ - unsigned shift = util_logbase2(hw_divisor); - - /* m = ceil(2^(32 + shift) / d) */ - uint64_t shift_hi = 32 + shift; - uint64_t t = 1ll << shift_hi; - double t_f = t; - double hw_divisor_d = hw_divisor; - double m_f = ceil(t_f / hw_divisor_d); - unsigned m = m_f; - - /* Default case */ - unsigned magic_divisor = m, extra_flags = 0; - - /* e = 2^(shift + 32) % d */ - uint64_t e = t % hw_divisor; - - /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob - * seems to use a different condition */ - if (e <= (1 << shift)) { - magic_divisor = m - 1; - extra_flags = 1; - } - - /* Top flag implicitly set */ - assert(magic_divisor & (1 << 31)); - magic_divisor &= ~(1 << 31); - - /* Upload to two different slots */ - - attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE; - attrs[0].shift = shift; - attrs[0].extra_flags = extra_flags; - - attrs[1].unk = 0x20; - attrs[1].magic_divisor = magic_divisor; - attrs[1].zero = 0; - attrs[1].divisor = divisor; - - return 2; - } -} - -void -panfrost_emit_vertex_data(struct panfrost_job *batch) -{ - struct panfrost_context *ctx = batch->ctx; - struct panfrost_vertex_state *so = ctx->vertex; - - /* Staged mali_attr, and index into them. i =/= k, depending on the - * vertex buffer mask and instancing. Twice as much room is allocated, - * for a worst case of NPOT_DIVIDEs which take up extra slot */ - union mali_attr attrs[PIPE_MAX_ATTRIBS * 2]; - unsigned k = 0; - - unsigned vertex_count = ctx->vertex_count; - unsigned instanced_count = ctx->instance_count; - - for (unsigned i = 0; i < so->num_elements; ++i) { - /* We map a mali_attr to be 1:1 with the mali_attr_meta, which - * means duplicating some vertex buffers (who cares? aside from - * maybe some caching implications but I somehow doubt that - * matters) */ - - struct pipe_vertex_element *elem = &so->pipe[i]; - unsigned vbi = elem->vertex_buffer_index; - - /* The exception to 1:1 mapping is that we can have multiple - * entries (NPOT divisors), so we fixup anyways */ - - so->hw[i].index = k; - - if (!(ctx->vb_mask & (1 << vbi))) continue; - - struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; - struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); - - if (!rsrc) continue; - - /* Align to 64 bytes by masking off the lower bits. This - * will be adjusted back when we fixup the src_offset in - * mali_attr_meta */ - - mali_ptr raw_addr = panfrost_vertex_buffer_address(ctx, vbi); - mali_ptr addr = raw_addr & ~63; - unsigned chopped_addr = raw_addr - addr; - - /* Add a dependency of the batch on the vertex buffer */ - panfrost_job_add_bo(batch, rsrc->bo); - - /* Set common fields */ - attrs[k].elements = addr; - attrs[k].stride = buf->stride; - attrs[k].size = rsrc->base.width0; - - /* We need to add the extra size we masked off (for - * correctness) so the data doesn't get clamped away */ - attrs[k].size += chopped_addr; - - /* For non-instancing make sure we initialize */ - attrs[k].shift = attrs[k].extra_flags = 0; - - /* Instancing uses a dramatically different code path than - * linear, so dispatch for the actual emission now that the - * common code is finished */ - - unsigned divisor = elem->instance_divisor; - - if (divisor && instanced_count == 1) { - /* Silly corner case where there's a divisor(=1) but - * there's no legitimate instancing. So we want *every* - * attribute to be the same. So set stride to zero so - * we don't go anywhere. */ - - attrs[k].size = attrs[k].stride + chopped_addr; - attrs[k].stride = 0; - attrs[k++].elements |= MALI_ATTR_LINEAR; - } else if (instanced_count <= 1) { - /* Normal, non-instanced attributes */ - attrs[k++].elements |= MALI_ATTR_LINEAR; - } else { - k += panfrost_vertex_instanced( - batch, rsrc, divisor, &attrs[k], addr, vertex_count, instanced_count); - } - } - - /* Upload whatever we emitted and go */ - - ctx->payloads[PIPE_SHADER_VERTEX].postfix.attributes = - panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr)); -} - - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_invocation.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_invocation.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_invocation.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_invocation.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,131 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig - * - */ - -#include "pan_context.h" - -/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex - * shaders, it turns out, are invoked with the same mechanism, with the triplet - * (1, vertex_count, instance_count). - * - * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet. - * - * Unfortunately, the packing for these triplet into the - * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The - * routines here exist to pack this */ - -void -panfrost_pack_work_groups_compute( - struct mali_vertex_tiler_prefix *out, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z) -{ - /* First of all, all 6 values are off-by-one (strictly positive). - * Account for that, first by ensuring all values are strictly positive - * and then by offsetting */ - - assert(num_x > 0); - assert(num_y > 0); - assert(num_z > 0); - - assert(size_x > 0); - assert(size_y > 0); - assert(size_z > 0); - - num_x = MALI_POSITIVE(num_x); - num_y = MALI_POSITIVE(num_y); - num_z = MALI_POSITIVE(num_z); - - size_x = MALI_POSITIVE(size_x); - size_y = MALI_POSITIVE(size_y); - size_z = MALI_POSITIVE(size_z); - - /* Next up is to pack in order */ - - uint32_t packed = 0; - - /* The values needing packing, in order, and the corresponding shifts. - * Indicies into shift are off-by-one to make the logic easier */ - - unsigned shifts[7] = { 0 }; - unsigned values[6] = { size_x, size_y, size_z, num_x, num_y, num_z }; - - for (unsigned i = 0; i < 6; ++i) { - /* OR it in, shifting as required */ - packed |= (values[i] << shifts[i]); - - /* How many bits did we use? */ - unsigned bit_count = util_logbase2_ceil(values[i] + 1); - - /* Set the next shift accordingly */ - shifts[i + 1] = shifts[i] + bit_count; - } - - /* We're packed, so upload everything */ - out->invocation_count = packed; - out->size_y_shift = shifts[1]; - out->size_z_shift = shifts[2]; - out->workgroups_x_shift = shifts[3]; - out->workgroups_y_shift = shifts[4]; - out->workgroups_z_shift = shifts[5]; - - /* Special fields */ - out->workgroups_x_shift_2 = MAX2(out->workgroups_x_shift, 2); - out->workgroups_x_shift_3 = out->workgroups_x_shift_2; -} - -/* Packs vertex/tiler descriptors simultaneously */ -void -panfrost_pack_work_groups_fused( - struct mali_vertex_tiler_prefix *vertex, - struct mali_vertex_tiler_prefix *tiler, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z) -{ - panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z); - - /* Copy results over */ - tiler->invocation_count = vertex->invocation_count; - tiler->size_y_shift = vertex->size_y_shift; - tiler->size_z_shift = vertex->size_z_shift; - tiler->workgroups_x_shift = vertex->workgroups_x_shift; - tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2; - tiler->workgroups_y_shift = vertex->workgroups_y_shift; - tiler->workgroups_z_shift = vertex->workgroups_z_shift; - - /* Set special fields for each */ - vertex->workgroups_x_shift_3 = 5; - tiler->workgroups_x_shift_3 = 6; -} - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_job.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_job.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_job.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_job.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,101 +25,252 @@ #include +#include "drm-uapi/panfrost_drm.h" + +#include "pan_bo.h" #include "pan_context.h" #include "util/hash_table.h" #include "util/ralloc.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_pack_color.h" +#include "pan_util.h" +#include "pandecode/decode.h" +#include "panfrost-quirks.h" + +/* panfrost_bo_access is here to help us keep track of batch accesses to BOs + * and build a proper dependency graph such that batches can be pipelined for + * better GPU utilization. + * + * Each accessed BO has a corresponding entry in the ->accessed_bos hash table. + * A BO is either being written or read at any time, that's what the type field + * encodes. + * When the last access is a write, the batch writing the BO might have read + * dependencies (readers that have not been executed yet and want to read the + * previous BO content), and when the last access is a read, all readers might + * depend on another batch to push its results to memory. That's what the + * readers/writers keep track off. + * There can only be one writer at any given time, if a new batch wants to + * write to the same BO, a dependency will be added between the new writer and + * the old writer (at the batch level), and panfrost_bo_access->writer will be + * updated to point to the new writer. + */ +struct panfrost_bo_access { + uint32_t type; + struct util_dynarray readers; + struct panfrost_batch_fence *writer; +}; -struct panfrost_job * -panfrost_create_job(struct panfrost_context *ctx) +static struct panfrost_batch_fence * +panfrost_create_batch_fence(struct panfrost_batch *batch) { - struct panfrost_job *job = rzalloc(ctx, struct panfrost_job); - - job->ctx = ctx; + struct panfrost_batch_fence *fence; + ASSERTED int ret; - job->bos = _mesa_set_create(job, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + fence = rzalloc(NULL, struct panfrost_batch_fence); + assert(fence); + pipe_reference_init(&fence->reference, 1); + fence->ctx = batch->ctx; + fence->batch = batch; + ret = drmSyncobjCreate(pan_screen(batch->ctx->base.screen)->fd, 0, + &fence->syncobj); + assert(!ret); - job->minx = job->miny = ~0; - job->maxx = job->maxy = 0; - job->transient_offset = 0; + return fence; +} - util_dynarray_init(&job->headers, job); - util_dynarray_init(&job->gpu_headers, job); - util_dynarray_init(&job->transient_indices, job); +static void +panfrost_free_batch_fence(struct panfrost_batch_fence *fence) +{ + drmSyncobjDestroy(pan_screen(fence->ctx->base.screen)->fd, + fence->syncobj); + ralloc_free(fence); +} - return job; +void +panfrost_batch_fence_unreference(struct panfrost_batch_fence *fence) +{ + if (pipe_reference(&fence->reference, NULL)) + panfrost_free_batch_fence(fence); } void -panfrost_free_job(struct panfrost_context *ctx, struct panfrost_job *job) +panfrost_batch_fence_reference(struct panfrost_batch_fence *fence) { - if (!job) - return; + pipe_reference(NULL, &fence->reference); +} - set_foreach(job->bos, entry) { - struct panfrost_bo *bo = (struct panfrost_bo *)entry->key; - panfrost_bo_unreference(ctx->base.screen, bo); +static struct panfrost_batch * +panfrost_create_batch(struct panfrost_context *ctx, + const struct pipe_framebuffer_state *key) +{ + struct panfrost_batch *batch = rzalloc(ctx, struct panfrost_batch); + + batch->ctx = ctx; + + batch->bos = _mesa_hash_table_create(batch, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + batch->minx = batch->miny = ~0; + batch->maxx = batch->maxy = 0; + batch->transient_offset = 0; + + util_dynarray_init(&batch->headers, batch); + util_dynarray_init(&batch->gpu_headers, batch); + util_dynarray_init(&batch->dependencies, batch); + batch->out_sync = panfrost_create_batch_fence(batch); + util_copy_framebuffer_state(&batch->key, key); + + return batch; +} + +static void +panfrost_freeze_batch(struct panfrost_batch *batch) +{ + struct panfrost_context *ctx = batch->ctx; + struct hash_entry *entry; + + /* Remove the entry in the FBO -> batch hash table if the batch + * matches. This way, next draws/clears targeting this FBO will trigger + * the creation of a new batch. + */ + entry = _mesa_hash_table_search(ctx->batches, &batch->key); + if (entry && entry->data == batch) + _mesa_hash_table_remove(ctx->batches, entry); + + /* If this is the bound batch, the panfrost_context parameters are + * relevant so submitting it invalidates those parameters, but if it's + * not bound, the context parameters are for some other batch so we + * can't invalidate them. + */ + if (ctx->batch == batch) { + panfrost_invalidate_frame(ctx); + ctx->batch = NULL; } +} + +#ifndef NDEBUG +static bool panfrost_batch_is_frozen(struct panfrost_batch *batch) +{ + struct panfrost_context *ctx = batch->ctx; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(ctx->batches, &batch->key); + if (entry && entry->data == batch) + return false; + + if (ctx->batch == batch) + return false; + + return true; +} +#endif + +static void +panfrost_free_batch(struct panfrost_batch *batch) +{ + if (!batch) + return; + + assert(panfrost_batch_is_frozen(batch)); - /* Free up the transient BOs we're sitting on */ - struct panfrost_screen *screen = pan_screen(ctx->base.screen); + hash_table_foreach(batch->bos, entry) + panfrost_bo_unreference((struct panfrost_bo *)entry->key); - util_dynarray_foreach(&job->transient_indices, unsigned, index) { - /* Mark it free */ - BITSET_SET(screen->free_transient, *index); + util_dynarray_foreach(&batch->dependencies, + struct panfrost_batch_fence *, dep) { + panfrost_batch_fence_unreference(*dep); } - /* Unreference the polygon list */ - panfrost_bo_unreference(ctx->base.screen, job->polygon_list); + util_dynarray_fini(&batch->headers); + util_dynarray_fini(&batch->gpu_headers); - _mesa_hash_table_remove_key(ctx->jobs, &job->key); + /* The out_sync fence lifetime is different from the the batch one + * since other batches might want to wait on a fence of already + * submitted/signaled batch. All we need to do here is make sure the + * fence does not point to an invalid batch, which the core will + * interpret as 'batch is already submitted'. + */ + batch->out_sync->batch = NULL; + panfrost_batch_fence_unreference(batch->out_sync); + + util_unreference_framebuffer_state(&batch->key); + ralloc_free(batch); +} - if (ctx->job == job) - ctx->job = NULL; +#ifndef NDEBUG +static bool +panfrost_dep_graph_contains_batch(struct panfrost_batch *root, + struct panfrost_batch *batch) +{ + if (!root) + return false; - ralloc_free(job); + util_dynarray_foreach(&root->dependencies, + struct panfrost_batch_fence *, dep) { + if ((*dep)->batch == batch || + panfrost_dep_graph_contains_batch((*dep)->batch, batch)) + return true; + } + + return false; } +#endif -struct panfrost_job * -panfrost_get_job(struct panfrost_context *ctx, - struct pipe_surface **cbufs, struct pipe_surface *zsbuf) +static void +panfrost_batch_add_dep(struct panfrost_batch *batch, + struct panfrost_batch_fence *newdep) { - /* Lookup the job first */ + if (batch == newdep->batch) + return; + + /* We might want to turn ->dependencies into a set if the number of + * deps turns out to be big enough to make this 'is dep already there' + * search inefficient. + */ + util_dynarray_foreach(&batch->dependencies, + struct panfrost_batch_fence *, dep) { + if (*dep == newdep) + return; + } - struct panfrost_job_key key = { - .cbufs = { - cbufs[0], - cbufs[1], - cbufs[2], - cbufs[3], - }, - .zsbuf = zsbuf - }; + /* Make sure the dependency graph is acyclic. */ + assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch)); + + panfrost_batch_fence_reference(newdep); + util_dynarray_append(&batch->dependencies, + struct panfrost_batch_fence *, newdep); + + /* We now have a batch depending on us, let's make sure new draw/clear + * calls targeting the same FBO use a new batch object. + */ + if (newdep->batch) + panfrost_freeze_batch(newdep->batch); +} - struct hash_entry *entry = _mesa_hash_table_search(ctx->jobs, &key); +static struct panfrost_batch * +panfrost_get_batch(struct panfrost_context *ctx, + const struct pipe_framebuffer_state *key) +{ + /* Lookup the job first */ + struct hash_entry *entry = _mesa_hash_table_search(ctx->batches, key); if (entry) return entry->data; /* Otherwise, let's create a job */ - struct panfrost_job *job = panfrost_create_job(ctx); + struct panfrost_batch *batch = panfrost_create_batch(ctx, key); /* Save the created job */ + _mesa_hash_table_insert(ctx->batches, &batch->key, batch); - memcpy(&job->key, &key, sizeof(key)); - _mesa_hash_table_insert(ctx->jobs, &job->key, job); - - return job; + return batch; } /* Get the job corresponding to the FBO we're currently rendering into */ -struct panfrost_job * -panfrost_get_job_for_fbo(struct panfrost_context *ctx) +struct panfrost_batch * +panfrost_get_batch_for_fbo(struct panfrost_context *ctx) { /* If we're wallpapering, we special case to workaround * u_blitter abuse */ @@ -129,38 +280,346 @@ /* If we already began rendering, use that */ - if (ctx->job) { - assert(ctx->job->key.zsbuf == ctx->pipe_framebuffer.zsbuf && - !memcmp(ctx->job->key.cbufs, - ctx->pipe_framebuffer.cbufs, - sizeof(ctx->job->key.cbufs))); - return ctx->job; + if (ctx->batch) { + assert(util_framebuffer_state_equal(&ctx->batch->key, + &ctx->pipe_framebuffer)); + return ctx->batch; } /* If not, look up the job */ - - struct pipe_surface **cbufs = ctx->pipe_framebuffer.cbufs; - struct pipe_surface *zsbuf = ctx->pipe_framebuffer.zsbuf; - struct panfrost_job *job = panfrost_get_job(ctx, cbufs, zsbuf); + struct panfrost_batch *batch = panfrost_get_batch(ctx, + &ctx->pipe_framebuffer); /* Set this job as the current FBO job. Will be reset when updating the * FB state and when submitting or releasing a job. */ - ctx->job = job; - return job; + ctx->batch = batch; + return batch; +} + +struct panfrost_batch * +panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx) +{ + struct panfrost_batch *batch; + + batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); + + /* The batch has no draw/clear queued, let's return it directly. + * Note that it's perfectly fine to re-use a batch with an + * existing clear, we'll just update it with the new clear request. + */ + if (!batch->last_job.gpu) + return batch; + + /* Otherwise, we need to freeze the existing one and instantiate a new + * one. + */ + panfrost_freeze_batch(batch); + return panfrost_get_batch(ctx, &ctx->pipe_framebuffer); +} + +static bool +panfrost_batch_fence_is_signaled(struct panfrost_batch_fence *fence) +{ + if (fence->signaled) + return true; + + /* Batch has not been submitted yet. */ + if (fence->batch) + return false; + + int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd, + &fence->syncobj, 1, 0, 0, NULL); + + /* Cache whether the fence was signaled */ + fence->signaled = ret >= 0; + return fence->signaled; +} + +static void +panfrost_bo_access_gc_fences(struct panfrost_context *ctx, + struct panfrost_bo_access *access, + const struct panfrost_bo *bo) +{ + if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) { + panfrost_batch_fence_unreference(access->writer); + access->writer = NULL; + } + + struct panfrost_batch_fence **readers_array = util_dynarray_begin(&access->readers); + struct panfrost_batch_fence **new_readers = readers_array; + + util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *, + reader) { + if (!(*reader)) + continue; + + if (panfrost_batch_fence_is_signaled(*reader)) { + panfrost_batch_fence_unreference(*reader); + *reader = NULL; + } else { + /* Build a new array of only unsignaled fences in-place */ + *(new_readers++) = *reader; + } + } + + if (!util_dynarray_resize(&access->readers, struct panfrost_batch_fence *, + new_readers - readers_array) && + new_readers != readers_array) + unreachable("Invalid dynarray access->readers"); +} + +/* Collect signaled fences to keep the kernel-side syncobj-map small. The + * idea is to collect those signaled fences at the end of each flush_all + * call. This function is likely to collect only fences from previous + * batch flushes not the one that have just have just been submitted and + * are probably still in flight when we trigger the garbage collection. + * Anyway, we need to do this garbage collection at some point if we don't + * want the BO access map to keep invalid entries around and retain + * syncobjs forever. + */ +static void +panfrost_gc_fences(struct panfrost_context *ctx) +{ + hash_table_foreach(ctx->accessed_bos, entry) { + struct panfrost_bo_access *access = entry->data; + + assert(access); + panfrost_bo_access_gc_fences(ctx, access, entry->key); + if (!util_dynarray_num_elements(&access->readers, + struct panfrost_batch_fence *) && + !access->writer) { + ralloc_free(access); + _mesa_hash_table_remove(ctx->accessed_bos, entry); + } + } +} + +#ifndef NDEBUG +static bool +panfrost_batch_in_readers(struct panfrost_batch *batch, + struct panfrost_bo_access *access) +{ + util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *, + reader) { + if (*reader && (*reader)->batch == batch) + return true; + } + + return false; +} +#endif + +static void +panfrost_batch_update_bo_access(struct panfrost_batch *batch, + struct panfrost_bo *bo, uint32_t access_type, + bool already_accessed) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_bo_access *access; + uint32_t old_access_type; + struct hash_entry *entry; + + assert(access_type == PAN_BO_ACCESS_WRITE || + access_type == PAN_BO_ACCESS_READ); + + entry = _mesa_hash_table_search(ctx->accessed_bos, bo); + access = entry ? entry->data : NULL; + if (access) { + old_access_type = access->type; + } else { + access = rzalloc(ctx, struct panfrost_bo_access); + util_dynarray_init(&access->readers, access); + _mesa_hash_table_insert(ctx->accessed_bos, bo, access); + /* We are the first to access this BO, let's initialize + * old_access_type to our own access type in that case. + */ + old_access_type = access_type; + access->type = access_type; + } + + assert(access); + + if (access_type == PAN_BO_ACCESS_WRITE && + old_access_type == PAN_BO_ACCESS_READ) { + /* Previous access was a read and we want to write this BO. + * We first need to add explicit deps between our batch and + * the previous readers. + */ + util_dynarray_foreach(&access->readers, + struct panfrost_batch_fence *, reader) { + /* We were already reading the BO, no need to add a dep + * on ourself (the acyclic check would complain about + * that). + */ + if (!(*reader) || (*reader)->batch == batch) + continue; + + panfrost_batch_add_dep(batch, *reader); + } + panfrost_batch_fence_reference(batch->out_sync); + + /* We now are the new writer. */ + access->writer = batch->out_sync; + access->type = access_type; + + /* Release the previous readers and reset the readers array. */ + util_dynarray_foreach(&access->readers, + struct panfrost_batch_fence *, + reader) { + if (!*reader) + continue; + panfrost_batch_fence_unreference(*reader); + } + + util_dynarray_clear(&access->readers); + } else if (access_type == PAN_BO_ACCESS_WRITE && + old_access_type == PAN_BO_ACCESS_WRITE) { + /* Previous access was a write and we want to write this BO. + * First check if we were the previous writer, in that case + * there's nothing to do. Otherwise we need to add a + * dependency between the new writer and the old one. + */ + if (access->writer != batch->out_sync) { + if (access->writer) { + panfrost_batch_add_dep(batch, access->writer); + panfrost_batch_fence_unreference(access->writer); + } + panfrost_batch_fence_reference(batch->out_sync); + access->writer = batch->out_sync; + } + } else if (access_type == PAN_BO_ACCESS_READ && + old_access_type == PAN_BO_ACCESS_WRITE) { + /* Previous access was a write and we want to read this BO. + * First check if we were the previous writer, in that case + * we want to keep the access type unchanged, as a write is + * more constraining than a read. + */ + if (access->writer != batch->out_sync) { + /* Add a dependency on the previous writer. */ + panfrost_batch_add_dep(batch, access->writer); + + /* The previous access was a write, there's no reason + * to have entries in the readers array. + */ + assert(!util_dynarray_num_elements(&access->readers, + struct panfrost_batch_fence *)); + + /* Add ourselves to the readers array. */ + panfrost_batch_fence_reference(batch->out_sync); + util_dynarray_append(&access->readers, + struct panfrost_batch_fence *, + batch->out_sync); + access->type = PAN_BO_ACCESS_READ; + } + } else { + /* We already accessed this BO before, so we should already be + * in the reader array. + */ + if (already_accessed) { + assert(panfrost_batch_in_readers(batch, access)); + return; + } + + /* Previous access was a read and we want to read this BO. + * Add ourselves to the readers array and add a dependency on + * the previous writer if any. + */ + panfrost_batch_fence_reference(batch->out_sync); + util_dynarray_append(&access->readers, + struct panfrost_batch_fence *, + batch->out_sync); + + if (access->writer) + panfrost_batch_add_dep(batch, access->writer); + } } void -panfrost_job_add_bo(struct panfrost_job *job, struct panfrost_bo *bo) +panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo, + uint32_t flags) { if (!bo) return; - if (_mesa_set_search(job->bos, bo)) + struct hash_entry *entry; + uint32_t old_flags = 0; + + entry = _mesa_hash_table_search(batch->bos, bo); + if (!entry) { + entry = _mesa_hash_table_insert(batch->bos, bo, + (void *)(uintptr_t)flags); + panfrost_bo_reference(bo); + } else { + old_flags = (uintptr_t)entry->data; + + /* All batches have to agree on the shared flag. */ + assert((old_flags & PAN_BO_ACCESS_SHARED) == + (flags & PAN_BO_ACCESS_SHARED)); + } + + assert(entry); + + if (old_flags == flags) + return; + + flags |= old_flags; + entry->data = (void *)(uintptr_t)flags; + + /* If this is not a shared BO, we don't really care about dependency + * tracking. + */ + if (!(flags & PAN_BO_ACCESS_SHARED)) + return; + + /* All dependencies should have been flushed before we execute the + * wallpaper draw, so it should be harmless to skip the + * update_bo_access() call. + */ + if (batch == batch->ctx->wallpaper_batch) return; - panfrost_bo_reference(bo); - _mesa_set_add(job->bos, bo); + /* Only pass R/W flags to the dep tracking logic. */ + assert(flags & PAN_BO_ACCESS_RW); + flags = (flags & PAN_BO_ACCESS_WRITE) ? + PAN_BO_ACCESS_WRITE : PAN_BO_ACCESS_READ; + panfrost_batch_update_bo_access(batch, bo, flags, old_flags != 0); +} + +void panfrost_batch_add_fbo_bos(struct panfrost_batch *batch) +{ + uint32_t flags = PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_WRITE | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT; + + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + struct panfrost_resource *rsrc = pan_resource(batch->key.cbufs[i]->texture); + panfrost_batch_add_bo(batch, rsrc->bo, flags); + } + + if (batch->key.zsbuf) { + struct panfrost_resource *rsrc = pan_resource(batch->key.zsbuf->texture); + panfrost_batch_add_bo(batch, rsrc->bo, flags); + } +} + +struct panfrost_bo * +panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, + uint32_t create_flags, uint32_t access_flags) +{ + struct panfrost_bo *bo; + + bo = panfrost_bo_create(pan_screen(batch->ctx->base.screen), size, + create_flags); + panfrost_batch_add_bo(batch, bo, access_flags); + + /* panfrost_batch_add_bo() has retained a reference and + * panfrost_bo_create() initialize the refcnt to 1, so let's + * unreference the BO here so it gets released when the batch is + * destroyed (unless it's retained by someone else in the meantime). + */ + panfrost_bo_unreference(bo); + return bo; } /* Returns the polygon list's GPU address if available, or otherwise allocates @@ -168,79 +627,513 @@ * since we'll hit the BO cache and this is one-per-batch anyway. */ mali_ptr -panfrost_job_get_polygon_list(struct panfrost_job *batch, unsigned size) +panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size) { if (batch->polygon_list) { assert(batch->polygon_list->size >= size); } else { - struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen); - /* Create the BO as invisible, as there's no reason to map */ + size = util_next_power_of_two(size); - batch->polygon_list = panfrost_drm_create_bo(screen, - size, PAN_ALLOCATE_INVISIBLE); + batch->polygon_list = panfrost_batch_create_bo(batch, size, + PAN_BO_INVISIBLE, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); } return batch->polygon_list->gpu; } -void -panfrost_flush_jobs_writing_resource(struct panfrost_context *panfrost, - struct pipe_resource *prsc) +struct panfrost_bo * +panfrost_batch_get_scratchpad(struct panfrost_batch *batch, + unsigned shift, + unsigned thread_tls_alloc, + unsigned core_count) { -#if 0 - struct hash_entry *entry = _mesa_hash_table_search(panfrost->write_jobs, - prsc); - if (entry) { - struct panfrost_job *job = entry->data; - panfrost_job_submit(panfrost, job); + unsigned size = panfrost_get_total_stack_size(shift, + thread_tls_alloc, + core_count); + + if (batch->scratchpad) { + assert(batch->scratchpad->size >= size); + } else { + batch->scratchpad = panfrost_batch_create_bo(batch, size, + PAN_BO_INVISIBLE, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); } -#endif - /* TODO stub */ + + return batch->scratchpad; } -void -panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job) +struct panfrost_bo * +panfrost_batch_get_tiler_heap(struct panfrost_batch *batch) { - int ret; + if (batch->tiler_heap) + return batch->tiler_heap; - panfrost_scoreboard_link_batch(job); + batch->tiler_heap = panfrost_batch_create_bo(batch, 4096 * 4096, + PAN_BO_INVISIBLE | + PAN_BO_GROWABLE, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); + assert(batch->tiler_heap); + return batch->tiler_heap; +} + +struct panfrost_bo * +panfrost_batch_get_tiler_dummy(struct panfrost_batch *batch) +{ + struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen); - bool has_draws = job->last_job.gpu; - bool is_scanout = panfrost_is_scanout(ctx); + uint32_t create_flags = 0; - if (!job) + if (batch->tiler_dummy) + return batch->tiler_dummy; + + if (!(screen->quirks & MIDGARD_NO_HIER_TILING)) + create_flags = PAN_BO_INVISIBLE; + + batch->tiler_dummy = panfrost_batch_create_bo(batch, 4096, + create_flags, + PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); + assert(batch->tiler_dummy); + return batch->tiler_dummy; +} + +static void +panfrost_batch_draw_wallpaper(struct panfrost_batch *batch) +{ + /* Color 0 is cleared, no need to draw the wallpaper. + * TODO: MRT wallpapers. + */ + if (batch->clear & PIPE_CLEAR_COLOR0) + return; + + /* Nothing to reload? TODO: MRT wallpapers */ + if (batch->key.cbufs[0] == NULL) + return; + + /* No draw calls, and no clear on the depth/stencil bufs. + * Drawing the wallpaper would be useless. + */ + if (!batch->last_tiler.gpu && + !(batch->clear & PIPE_CLEAR_DEPTHSTENCIL)) return; - ret = panfrost_drm_submit_vs_fs_job(ctx, has_draws, is_scanout); + /* Check if the buffer has any content on it worth preserving */ + + struct pipe_surface *surf = batch->key.cbufs[0]; + struct panfrost_resource *rsrc = pan_resource(surf->texture); + unsigned level = surf->u.tex.level; + + if (!rsrc->slices[level].initialized) + return; + + batch->ctx->wallpaper_batch = batch; + + /* Clamp the rendering area to the damage extent. The + * KHR_partial_update() spec states that trying to render outside of + * the damage region is "undefined behavior", so we should be safe. + */ + unsigned damage_width = (rsrc->damage.extent.maxx - rsrc->damage.extent.minx); + unsigned damage_height = (rsrc->damage.extent.maxy - rsrc->damage.extent.miny); + + if (damage_width && damage_height) { + panfrost_batch_intersection_scissor(batch, + rsrc->damage.extent.minx, + rsrc->damage.extent.miny, + rsrc->damage.extent.maxx, + rsrc->damage.extent.maxy); + } + + /* FIXME: Looks like aligning on a tile is not enough, but + * aligning on twice the tile size seems to works. We don't + * know exactly what happens here but this deserves extra + * investigation to figure it out. + */ + batch->minx = batch->minx & ~((MALI_TILE_LENGTH * 2) - 1); + batch->miny = batch->miny & ~((MALI_TILE_LENGTH * 2) - 1); + batch->maxx = MIN2(ALIGN_POT(batch->maxx, MALI_TILE_LENGTH * 2), + rsrc->base.width0); + batch->maxy = MIN2(ALIGN_POT(batch->maxy, MALI_TILE_LENGTH * 2), + rsrc->base.height0); + + struct pipe_scissor_state damage; + struct pipe_box rects[4]; + + /* Clamp the damage box to the rendering area. */ + damage.minx = MAX2(batch->minx, rsrc->damage.biggest_rect.x); + damage.miny = MAX2(batch->miny, rsrc->damage.biggest_rect.y); + damage.maxx = MIN2(batch->maxx, + rsrc->damage.biggest_rect.x + + rsrc->damage.biggest_rect.width); + damage.maxx = MAX2(damage.maxx, damage.minx); + damage.maxy = MIN2(batch->maxy, + rsrc->damage.biggest_rect.y + + rsrc->damage.biggest_rect.height); + damage.maxy = MAX2(damage.maxy, damage.miny); + + /* One damage rectangle means we can end up with at most 4 reload + * regions: + * 1: left region, only exists if damage.x > 0 + * 2: right region, only exists if damage.x + damage.width < fb->width + * 3: top region, only exists if damage.y > 0. The intersection with + * the left and right regions are dropped + * 4: bottom region, only exists if damage.y + damage.height < fb->height. + * The intersection with the left and right regions are dropped + * + * ____________________________ + * | | 3 | | + * | |___________| | + * | | damage | | + * | 1 | rect | 2 | + * | |___________| | + * | | 4 | | + * |_______|___________|______| + */ + u_box_2d(batch->minx, batch->miny, damage.minx - batch->minx, + batch->maxy - batch->miny, &rects[0]); + u_box_2d(damage.maxx, batch->miny, batch->maxx - damage.maxx, + batch->maxy - batch->miny, &rects[1]); + u_box_2d(damage.minx, batch->miny, damage.maxx - damage.minx, + damage.miny - batch->miny, &rects[2]); + u_box_2d(damage.minx, damage.maxy, damage.maxx - damage.minx, + batch->maxy - damage.maxy, &rects[3]); + + for (unsigned i = 0; i < 4; i++) { + /* Width and height are always >= 0 even if width is declared as a + * signed integer: u_box_2d() helper takes unsigned args and + * panfrost_set_damage_region() is taking care of clamping + * negative values. + */ + if (!rects[i].width || !rects[i].height) + continue; + + /* Blit the wallpaper in */ + panfrost_blit_wallpaper(batch->ctx, &rects[i]); + } + batch->ctx->wallpaper_batch = NULL; +} + +static int +panfrost_batch_submit_ioctl(struct panfrost_batch *batch, + mali_ptr first_job_desc, + uint32_t reqs, + struct mali_job_descriptor_header *header) +{ + struct panfrost_context *ctx = batch->ctx; + struct pipe_context *gallium = (struct pipe_context *) ctx; + struct panfrost_screen *screen = pan_screen(gallium->screen); + struct drm_panfrost_submit submit = {0,}; + uint32_t *bo_handles, *in_syncs = NULL; + bool is_fragment_shader; + int ret; + + is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job.gpu; + if (is_fragment_shader) + submit.in_sync_count = 1; + else + submit.in_sync_count = util_dynarray_num_elements(&batch->dependencies, + struct panfrost_batch_fence *); + + if (submit.in_sync_count) { + in_syncs = calloc(submit.in_sync_count, sizeof(*in_syncs)); + assert(in_syncs); + } + + /* The fragment job always depends on the vertex/tiler job if there's + * one + */ + if (is_fragment_shader) { + in_syncs[0] = batch->out_sync->syncobj; + } else { + unsigned int i = 0; + + util_dynarray_foreach(&batch->dependencies, + struct panfrost_batch_fence *, dep) + in_syncs[i++] = (*dep)->syncobj; + } + + submit.in_syncs = (uintptr_t)in_syncs; + submit.out_sync = batch->out_sync->syncobj; + submit.jc = first_job_desc; + submit.requirements = reqs; + + bo_handles = calloc(batch->bos->entries, sizeof(*bo_handles)); + assert(bo_handles); + + hash_table_foreach(batch->bos, entry) { + struct panfrost_bo *bo = (struct panfrost_bo *)entry->key; + uint32_t flags = (uintptr_t)entry->data; + + assert(bo->gem_handle > 0); + bo_handles[submit.bo_handle_count++] = bo->gem_handle; + + /* Update the BO access flags so that panfrost_bo_wait() knows + * about all pending accesses. + * We only keep the READ/WRITE info since this is all the BO + * wait logic cares about. + * We also preserve existing flags as this batch might not + * be the first one to access the BO. + */ + bo->gpu_access |= flags & (PAN_BO_ACCESS_RW); + } + + submit.bo_handles = (u64) (uintptr_t) bo_handles; + ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit); + free(bo_handles); + free(in_syncs); + + if (ret) { + fprintf(stderr, "Error submitting: %m\n"); + return errno; + } + + if (pan_debug & PAN_DBG_SYNC) { + u32 status; + + /* Wait so we can get errors reported back */ + drmSyncobjWait(screen->fd, &batch->out_sync->syncobj, 1, + INT64_MAX, 0, NULL); + + status = header->exception_status; + + if (status && status != 0x1) { + fprintf(stderr, "Job %" PRIx64 " failed: source ID: 0x%x access: %s exception: 0x%x (exception_status 0x%x) fault_pointer 0x%" PRIx64 " \n", + first_job_desc, + (status >> 16) & 0xFFFF, + pandecode_exception_access((status >> 8) & 0x3), + status & 0xFF, + status, + header->fault_pointer); + } + } + + /* Trace the job if we're doing that */ + if (pan_debug & PAN_DBG_TRACE) { + /* Wait so we can get errors reported back */ + drmSyncobjWait(screen->fd, &batch->out_sync->syncobj, 1, + INT64_MAX, 0, NULL); + pandecode_jc(submit.jc, FALSE, screen->gpu_id); + } + + return 0; +} + +static int +panfrost_batch_submit_jobs(struct panfrost_batch *batch) +{ + bool has_draws = batch->first_job.gpu; + struct mali_job_descriptor_header *header; + int ret = 0; + + if (has_draws) { + header = (struct mali_job_descriptor_header *)batch->first_job.cpu; + ret = panfrost_batch_submit_ioctl(batch, batch->first_job.gpu, 0, header); + assert(!ret); + } + + if (batch->first_tiler.gpu || batch->clear) { + mali_ptr fragjob = panfrost_fragment_job(batch, has_draws, &header); + + ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS, header); + assert(!ret); + } + + return ret; +} + +static void +panfrost_batch_submit(struct panfrost_batch *batch) +{ + assert(batch); + + /* Submit the dependencies first. */ + util_dynarray_foreach(&batch->dependencies, + struct panfrost_batch_fence *, dep) { + if ((*dep)->batch) + panfrost_batch_submit((*dep)->batch); + } + + int ret; + + /* Nothing to do! */ + if (!batch->last_job.gpu && !batch->clear) { + /* Mark the fence as signaled so the fence logic does not try + * to wait on it. + */ + batch->out_sync->signaled = true; + goto out; + } + + panfrost_batch_draw_wallpaper(batch); + + /* Now that all draws are in, we can finally prepare the + * FBD for the batch */ + + if (batch->framebuffer.gpu && batch->first_job.gpu) { + struct panfrost_context *ctx = batch->ctx; + struct pipe_context *gallium = (struct pipe_context *) ctx; + struct panfrost_screen *screen = pan_screen(gallium->screen); + + if (screen->quirks & MIDGARD_SFBD) + panfrost_attach_sfbd(batch, ~0); + else + panfrost_attach_mfbd(batch, ~0); + } + + panfrost_scoreboard_link_batch(batch); + + ret = panfrost_batch_submit_jobs(batch); if (ret) - fprintf(stderr, "panfrost_job_submit failed: %d\n", ret); + fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret); - /* The job has been submitted, let's invalidate the current FBO job - * cache. - */ - assert(!ctx->job || job == ctx->job); - ctx->job = NULL; - - /* Remove the job from the ctx->jobs set so that future - * panfrost_get_job() calls don't see it. - * We must reset the job key to avoid removing another valid entry when - * the job is freed. + /* We must reset the damage info of our render targets here even + * though a damage reset normally happens when the DRI layer swaps + * buffers. That's because there can be implicit flushes the GL + * app is not aware of, and those might impact the damage region: if + * part of the damaged portion is drawn during those implicit flushes, + * you have to reload those areas before next draws are pushed, and + * since the driver can't easily know what's been modified by the draws + * it flushed, the easiest solution is to reload everything. */ - _mesa_hash_table_remove_key(ctx->jobs, &job->key); - memset(&job->key, 0, sizeof(job->key)); + for (unsigned i = 0; i < batch->key.nr_cbufs; i++) { + struct panfrost_resource *res; + + if (!batch->key.cbufs[i]) + continue; + + res = pan_resource(batch->key.cbufs[i]->texture); + panfrost_resource_reset_damage(res); + } + +out: + panfrost_freeze_batch(batch); + panfrost_free_batch(batch); +} + +void +panfrost_flush_all_batches(struct panfrost_context *ctx, bool wait) +{ + struct util_dynarray fences, syncobjs; + + if (wait) { + util_dynarray_init(&fences, NULL); + util_dynarray_init(&syncobjs, NULL); + } + + hash_table_foreach(ctx->batches, hentry) { + struct panfrost_batch *batch = hentry->data; + + assert(batch); + + if (wait) { + panfrost_batch_fence_reference(batch->out_sync); + util_dynarray_append(&fences, struct panfrost_batch_fence *, + batch->out_sync); + util_dynarray_append(&syncobjs, uint32_t, + batch->out_sync->syncobj); + } + + panfrost_batch_submit(batch); + } + + assert(!ctx->batches->entries); + + /* Collect batch fences before returning */ + panfrost_gc_fences(ctx); + + if (!wait) + return; + + drmSyncobjWait(pan_screen(ctx->base.screen)->fd, + util_dynarray_begin(&syncobjs), + util_dynarray_num_elements(&syncobjs, uint32_t), + INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL); + + util_dynarray_foreach(&fences, struct panfrost_batch_fence *, fence) + panfrost_batch_fence_unreference(*fence); + + util_dynarray_fini(&fences); + util_dynarray_fini(&syncobjs); +} + +bool +panfrost_pending_batches_access_bo(struct panfrost_context *ctx, + const struct panfrost_bo *bo) +{ + struct panfrost_bo_access *access; + struct hash_entry *hentry; + + hentry = _mesa_hash_table_search(ctx->accessed_bos, bo); + access = hentry ? hentry->data : NULL; + if (!access) + return false; + + if (access->writer && access->writer->batch) + return true; + + util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *, + reader) { + if (*reader && (*reader)->batch) + return true; + } + + return false; +} + +void +panfrost_flush_batches_accessing_bo(struct panfrost_context *ctx, + struct panfrost_bo *bo, + uint32_t access_type) +{ + struct panfrost_bo_access *access; + struct hash_entry *hentry; + + /* It doesn't make any to flush only the readers. */ + assert(access_type == PAN_BO_ACCESS_WRITE || + access_type == PAN_BO_ACCESS_RW); + + hentry = _mesa_hash_table_search(ctx->accessed_bos, bo); + access = hentry ? hentry->data : NULL; + if (!access) + return; + + if (access_type & PAN_BO_ACCESS_WRITE && access->writer && + access->writer->batch) + panfrost_batch_submit(access->writer->batch); + + if (!(access_type & PAN_BO_ACCESS_READ)) + return; + + util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *, + reader) { + if (*reader && (*reader)->batch) + panfrost_batch_submit((*reader)->batch); + } } void -panfrost_job_set_requirements(struct panfrost_context *ctx, - struct panfrost_job *job) +panfrost_batch_set_requirements(struct panfrost_batch *batch) { + struct panfrost_context *ctx = batch->ctx; + if (ctx->rasterizer && ctx->rasterizer->base.multisample) - job->requirements |= PAN_REQ_MSAA; + batch->requirements |= PAN_REQ_MSAA; if (ctx->depth_stencil && ctx->depth_stencil->depth.writemask) - job->requirements |= PAN_REQ_DEPTH_WRITE; + batch->requirements |= PAN_REQ_DEPTH_WRITE; } /* Helper to smear a 32-bit color across 128-bit components */ @@ -276,10 +1169,10 @@ if (util_format_is_rgba8_variant(desc)) { pan_pack_color_32(packed, - (float_to_ubyte(clear_alpha) << 24) | - (float_to_ubyte(color->f[2]) << 16) | - (float_to_ubyte(color->f[1]) << 8) | - (float_to_ubyte(color->f[0]) << 0)); + ((uint32_t) float_to_ubyte(clear_alpha) << 24) | + ((uint32_t) float_to_ubyte(color->f[2]) << 16) | + ((uint32_t) float_to_ubyte(color->f[1]) << 8) | + ((uint32_t) float_to_ubyte(color->f[0]) << 0)); } else if (format == PIPE_FORMAT_B5G6R5_UNORM) { /* First, we convert the components to R5, G6, B5 separately */ unsigned r5 = CLAMP(color->f[0], 0.0, 1.0) * 31.0; @@ -325,8 +1218,10 @@ pan_pack_color_32(packed, s | (s << 16)); } else if (size == 2) pan_pack_color_32(packed, out.ui[0] | (out.ui[0] << 16)); - else if (size == 4) + else if (size == 3 || size == 4) pan_pack_color_32(packed, out.ui[0]); + else if (size == 6) + pan_pack_color_64(packed, out.ui[0], out.ui[1] | (out.ui[1] << 16)); /* RGB16F -- RGBB */ else if (size == 8) pan_pack_color_64(packed, out.ui[0], out.ui[1]); else if (size == 16) @@ -337,107 +1232,104 @@ } void -panfrost_job_clear(struct panfrost_context *ctx, - struct panfrost_job *job, - unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil) - +panfrost_batch_clear(struct panfrost_batch *batch, + unsigned buffers, + const union pipe_color_union *color, + double depth, unsigned stencil) { + struct panfrost_context *ctx = batch->ctx; + if (buffers & PIPE_CLEAR_COLOR) { for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) { if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) continue; enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format; - pan_pack_color(job->clear_color[i], color, format); + pan_pack_color(batch->clear_color[i], color, format); } } if (buffers & PIPE_CLEAR_DEPTH) { - job->clear_depth = depth; + batch->clear_depth = depth; } if (buffers & PIPE_CLEAR_STENCIL) { - job->clear_stencil = stencil; + batch->clear_stencil = stencil; } - job->clear |= buffers; + batch->clear |= buffers; /* Clearing affects the entire framebuffer (by definition -- this is * the Gallium clear callback, which clears the whole framebuffer. If * the scissor test were enabled from the GL side, the state tracker * would emit a quad instead and we wouldn't go down this code path) */ - panfrost_job_union_scissor(job, 0, 0, - ctx->pipe_framebuffer.width, - ctx->pipe_framebuffer.height); -} - -void -panfrost_flush_jobs_reading_resource(struct panfrost_context *panfrost, - struct pipe_resource *prsc) -{ - struct panfrost_resource *rsc = pan_resource(prsc); - - panfrost_flush_jobs_writing_resource(panfrost, prsc); - - hash_table_foreach(panfrost->jobs, entry) { - struct panfrost_job *job = entry->data; - - if (_mesa_set_search(job->bos, rsc->bo)) { - printf("TODO: submit job for flush\n"); - //panfrost_job_submit(panfrost, job); - continue; - } - } + panfrost_batch_union_scissor(batch, 0, 0, + ctx->pipe_framebuffer.width, + ctx->pipe_framebuffer.height); } static bool -panfrost_job_compare(const void *a, const void *b) +panfrost_batch_compare(const void *a, const void *b) { - return memcmp(a, b, sizeof(struct panfrost_job_key)) == 0; + return util_framebuffer_state_equal(a, b); } static uint32_t -panfrost_job_hash(const void *key) +panfrost_batch_hash(const void *key) { - return _mesa_hash_data(key, sizeof(struct panfrost_job_key)); + return _mesa_hash_data(key, sizeof(struct pipe_framebuffer_state)); } /* Given a new bounding rectangle (scissor), let the job cover the union of the * new and old bounding rectangles */ void -panfrost_job_union_scissor(struct panfrost_job *job, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy) -{ - job->minx = MIN2(job->minx, minx); - job->miny = MIN2(job->miny, miny); - job->maxx = MAX2(job->maxx, maxx); - job->maxy = MAX2(job->maxy, maxy); +panfrost_batch_union_scissor(struct panfrost_batch *batch, + unsigned minx, unsigned miny, + unsigned maxx, unsigned maxy) +{ + batch->minx = MIN2(batch->minx, minx); + batch->miny = MIN2(batch->miny, miny); + batch->maxx = MAX2(batch->maxx, maxx); + batch->maxy = MAX2(batch->maxy, maxy); } void -panfrost_job_intersection_scissor(struct panfrost_job *job, +panfrost_batch_intersection_scissor(struct panfrost_batch *batch, unsigned minx, unsigned miny, unsigned maxx, unsigned maxy) { - job->minx = MAX2(job->minx, minx); - job->miny = MAX2(job->miny, miny); - job->maxx = MIN2(job->maxx, maxx); - job->maxy = MIN2(job->maxy, maxy); + batch->minx = MAX2(batch->minx, minx); + batch->miny = MAX2(batch->miny, miny); + batch->maxx = MIN2(batch->maxx, maxx); + batch->maxy = MIN2(batch->maxy, maxy); +} + +/* Are we currently rendering to the screen (rather than an FBO)? */ + +bool +panfrost_batch_is_scanout(struct panfrost_batch *batch) +{ + /* If there is no color buffer, it's an FBO */ + if (batch->key.nr_cbufs != 1) + return false; + + /* If we're too early that no framebuffer was sent, it's scanout */ + if (!batch->key.cbufs[0]) + return true; + + return batch->key.cbufs[0]->texture->bind & PIPE_BIND_DISPLAY_TARGET || + batch->key.cbufs[0]->texture->bind & PIPE_BIND_SCANOUT || + batch->key.cbufs[0]->texture->bind & PIPE_BIND_SHARED; } void -panfrost_job_init(struct panfrost_context *ctx) +panfrost_batch_init(struct panfrost_context *ctx) { - ctx->jobs = _mesa_hash_table_create(ctx, - panfrost_job_hash, - panfrost_job_compare); - - ctx->write_jobs = _mesa_hash_table_create(ctx, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + ctx->batches = _mesa_hash_table_create(ctx, + panfrost_batch_hash, + panfrost_batch_compare); + ctx->accessed_bos = _mesa_hash_table_create(ctx, _mesa_hash_pointer, + _mesa_key_pointer_equal); } diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_job.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_job.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_job.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_job.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,22 +31,45 @@ #include "pan_allocate.h" #include "pan_resource.h" -/* Used as a hash table key */ +/* panfrost_batch_fence is the out fence of a batch that users or other batches + * might want to wait on. The batch fence lifetime is different from the batch + * one as want will certainly want to wait upon the fence after the batch has + * been submitted (which is when panfrost_batch objects are freed). + */ +struct panfrost_batch_fence { + /* Refcounting object for the fence. */ + struct pipe_reference reference; + + /* Batch that created this fence object. Will become NULL at batch + * submission time. This field is mainly here to know whether the + * batch has been flushed or not. + */ + struct panfrost_batch *batch; + + /* Context this fence is attached to. We need both ctx and batch, as + * the batch will go away after it's been submitted, but the fence + * will stay a bit longer. + */ + struct panfrost_context *ctx; + + /* Sync object backing this fence. */ + uint32_t syncobj; -struct panfrost_job_key { - struct pipe_surface *cbufs[4]; - struct pipe_surface *zsbuf; + /* Cached value of the signaled state to avoid calling WAIT_SYNCOBJs + * when we know the fence has already been signaled. + */ + bool signaled; }; #define PAN_REQ_MSAA (1 << 0) #define PAN_REQ_DEPTH_WRITE (1 << 1) -/* A panfrost_job corresponds to a bound FBO we're rendering to, +/* A panfrost_batch corresponds to a bound FBO we're rendering to, * collecting over multiple draws. */ -struct panfrost_job { +struct panfrost_batch { struct panfrost_context *ctx; - struct panfrost_job_key key; + struct pipe_framebuffer_state key; /* Buffers cleared (PIPE_CLEAR_* bitmask) */ unsigned clear; @@ -60,6 +83,9 @@ float clear_depth; unsigned clear_stencil; + /* Amount of thread local storage required per thread */ + unsigned stack_size; + /* Whether this job uses the corresponding requirement (PAN_REQ_* * bitmask) */ unsigned requirements; @@ -77,7 +103,7 @@ * These arrays contain the headers for the "primary batch", our jargon * referring to the part of the panfrost_job that actually contains * meaningful work. In an OpenGL ES setting, that means the - * SET_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT + * WRITE_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT * job, which is sent on as a secondary batch containing only a single * hardware job. Since there's one and only one FRAGMENT job issued per * panfrost_job, there is no need to do any scoreboarding / management; @@ -105,104 +131,138 @@ unsigned job_index; /* BOs referenced -- will be used for flushing logic */ - struct set *bos; + struct hash_table *bos; - /* Indices of transient BOs referenced */ - struct util_dynarray transient_indices; + /* Current transient BO */ + struct panfrost_bo *transient_bo; /* Within the topmost transient BO, how much has been used? */ unsigned transient_offset; /* Polygon list bound to the batch, or NULL if none bound yet */ struct panfrost_bo *polygon_list; + + /* Scratchpath BO bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *scratchpad; + + /* Tiler heap BO bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *tiler_heap; + + /* Dummy tiler BO bound to the batch, or NULL if none bound yet */ + struct panfrost_bo *tiler_dummy; + + /* Framebuffer descriptor. */ + struct panfrost_transfer framebuffer; + + /* Output sync object. Only valid when submitted is true. */ + struct panfrost_batch_fence *out_sync; + + /* Batch dependencies */ + struct util_dynarray dependencies; }; /* Functions for managing the above */ -struct panfrost_job * -panfrost_create_job(struct panfrost_context *ctx); +void +panfrost_batch_fence_unreference(struct panfrost_batch_fence *fence); void -panfrost_free_job(struct panfrost_context *ctx, struct panfrost_job *job); +panfrost_batch_fence_reference(struct panfrost_batch_fence *batch); -struct panfrost_job * -panfrost_get_job(struct panfrost_context *ctx, - struct pipe_surface **cbufs, struct pipe_surface *zsbuf); +struct panfrost_batch * +panfrost_get_batch_for_fbo(struct panfrost_context *ctx); -struct panfrost_job * -panfrost_get_job_for_fbo(struct panfrost_context *ctx); +struct panfrost_batch * +panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx); void -panfrost_job_init(struct panfrost_context *ctx); +panfrost_batch_init(struct panfrost_context *ctx); void -panfrost_job_add_bo(struct panfrost_job *job, struct panfrost_bo *bo); +panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo, + uint32_t flags); -void -panfrost_flush_jobs_writing_resource(struct panfrost_context *panfrost, - struct pipe_resource *prsc); +void panfrost_batch_add_fbo_bos(struct panfrost_batch *batch); + +struct panfrost_bo * +panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, + uint32_t create_flags, uint32_t access_flags); void -panfrost_flush_jobs_reading_resource(struct panfrost_context *panfrost, - struct pipe_resource *prsc); +panfrost_flush_all_batches(struct panfrost_context *ctx, bool wait); + +bool +panfrost_pending_batches_access_bo(struct panfrost_context *ctx, + const struct panfrost_bo *bo); void -panfrost_job_submit(struct panfrost_context *ctx, struct panfrost_job *job); +panfrost_flush_batches_accessing_bo(struct panfrost_context *ctx, + struct panfrost_bo *bo, uint32_t flags); void -panfrost_job_set_requirements(struct panfrost_context *ctx, - struct panfrost_job *job); +panfrost_batch_set_requirements(struct panfrost_batch *batch); + +struct panfrost_bo * +panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned shift, unsigned thread_tls_alloc, unsigned core_count); mali_ptr -panfrost_job_get_polygon_list(struct panfrost_job *batch, unsigned size); +panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size); + +struct panfrost_bo * +panfrost_batch_get_tiler_heap(struct panfrost_batch *batch); + +struct panfrost_bo * +panfrost_batch_get_tiler_dummy(struct panfrost_batch *batch); void -panfrost_job_clear(struct panfrost_context *ctx, - struct panfrost_job *job, - unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil); +panfrost_batch_clear(struct panfrost_batch *batch, + unsigned buffers, + const union pipe_color_union *color, + double depth, unsigned stencil); void -panfrost_job_union_scissor(struct panfrost_job *job, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy); +panfrost_batch_union_scissor(struct panfrost_batch *batch, + unsigned minx, unsigned miny, + unsigned maxx, unsigned maxy); void -panfrost_job_intersection_scissor(struct panfrost_job *job, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy); +panfrost_batch_intersection_scissor(struct panfrost_batch *batch, + unsigned minx, unsigned miny, + unsigned maxx, unsigned maxy); /* Scoreboarding */ void panfrost_scoreboard_queue_compute_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer job); void panfrost_scoreboard_queue_vertex_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, bool requires_tiling); void panfrost_scoreboard_queue_tiler_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer tiler); void panfrost_scoreboard_queue_fused_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, struct panfrost_transfer tiler); void panfrost_scoreboard_queue_fused_job_prepend( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, struct panfrost_transfer tiler); void -panfrost_scoreboard_link_batch(struct panfrost_job *batch); +panfrost_scoreboard_link_batch(struct panfrost_batch *batch); + +bool +panfrost_batch_is_scanout(struct panfrost_batch *batch); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_mfbd.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_mfbd.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_mfbd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_mfbd.c 2020-06-12 01:21:17.000000000 +0000 @@ -22,34 +22,12 @@ * */ +#include "pan_bo.h" #include "pan_context.h" #include "pan_util.h" #include "pan_format.h" -#include "util/u_format.h" - -static void -panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) -{ - /* First, default to all zeroes to prevent uninitialized junk */ - - for (unsigned c = 0; c < 4; ++c) - out[c] = PIPE_SWIZZLE_0; - - /* Now "do" what the swizzle says */ - - for (unsigned c = 0; c < 4; ++c) { - unsigned char i = in[c]; - - /* Who cares? */ - if (i < PIPE_SWIZZLE_X || i > PIPE_SWIZZLE_W) - continue; - - /* Invert */ - unsigned idx = i - PIPE_SWIZZLE_X; - out[idx] = PIPE_SWIZZLE_X + c; - } -} +#include "util/format/u_format.h" static struct mali_rt_format panfrost_mfbd_format(struct pipe_surface *surf) @@ -178,28 +156,28 @@ static void panfrost_mfbd_clear( - struct panfrost_job *job, + struct panfrost_batch *batch, struct bifrost_framebuffer *fb, struct bifrost_fb_extra *fbx, struct bifrost_render_target *rts, unsigned rt_count) { for (unsigned i = 0; i < rt_count; ++i) { - if (!(job->clear & (PIPE_CLEAR_COLOR0 << i))) + if (!(batch->clear & (PIPE_CLEAR_COLOR0 << i))) continue; - rts[i].clear_color_1 = job->clear_color[i][0]; - rts[i].clear_color_2 = job->clear_color[i][1]; - rts[i].clear_color_3 = job->clear_color[i][2]; - rts[i].clear_color_4 = job->clear_color[i][3]; + rts[i].clear_color_1 = batch->clear_color[i][0]; + rts[i].clear_color_2 = batch->clear_color[i][1]; + rts[i].clear_color_3 = batch->clear_color[i][2]; + rts[i].clear_color_4 = batch->clear_color[i][3]; } - if (job->clear & PIPE_CLEAR_DEPTH) { - fb->clear_depth = job->clear_depth; + if (batch->clear & PIPE_CLEAR_DEPTH) { + fb->clear_depth = batch->clear_depth; } - if (job->clear & PIPE_CLEAR_STENCIL) { - fb->clear_stencil = job->clear_stencil; + if (batch->clear & PIPE_CLEAR_STENCIL) { + fb->clear_stencil = batch->clear_stencil; } } @@ -222,15 +200,15 @@ /* Now, we set the layout specific pieces */ if (rsrc->layout == PAN_LINEAR) { - rt->format.block = MALI_MFBD_BLOCK_LINEAR; + rt->format.block = MALI_BLOCK_LINEAR; rt->framebuffer = base; rt->framebuffer_stride = stride / 16; } else if (rsrc->layout == PAN_TILED) { - rt->format.block = MALI_MFBD_BLOCK_TILED; + rt->format.block = MALI_BLOCK_TILED; rt->framebuffer = base; rt->framebuffer_stride = stride; } else if (rsrc->layout == PAN_AFBC) { - rt->format.block = MALI_MFBD_BLOCK_AFBC; + rt->format.block = MALI_BLOCK_AFBC; unsigned header_size = rsrc->slices[level].header_size; @@ -248,20 +226,6 @@ } } -/* Is a format encoded like Z24S8 and therefore compatible for render? */ - -static bool -panfrost_is_z24s8_variant(enum pipe_format fmt) -{ - switch (fmt) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - return true; - default: - return false; - } -} - static void panfrost_mfbd_set_zsbuf( struct bifrost_framebuffer *fb, @@ -271,26 +235,23 @@ struct panfrost_resource *rsrc = pan_resource(surf->texture); unsigned level = surf->u.tex.level; - assert(surf->u.tex.first_layer == 0); + unsigned first_layer = surf->u.tex.first_layer; + assert(surf->u.tex.last_layer == first_layer); - unsigned offset = rsrc->slices[level].offset; + mali_ptr base = panfrost_get_texture_address(rsrc, level, first_layer); if (rsrc->layout == PAN_AFBC) { /* The only Z/S format we can compress is Z24S8 or variants * thereof (handled by the state tracker) */ assert(panfrost_is_z24s8_variant(surf->format)); - mali_ptr base = rsrc->bo->gpu + offset; unsigned header_size = rsrc->slices[level].header_size; fb->mfbd_flags |= MALI_MFBD_EXTRA; - fbx->flags = - MALI_EXTRA_PRESENT | - MALI_EXTRA_AFBC | - MALI_EXTRA_AFBC_ZS | - MALI_EXTRA_ZS | - 0x1; /* unknown */ + fbx->flags_hi |= MALI_EXTRA_PRESENT; + fbx->flags_lo |= MALI_EXTRA_ZS | 0x1; /* unknown */ + fbx->zs_block = MALI_BLOCK_AFBC; fbx->ds_afbc.depth_stencil = base + header_size; fbx->ds_afbc.depth_stencil_afbc_metadata = base; @@ -298,34 +259,43 @@ fbx->ds_afbc.zero1 = 0x10009; fbx->ds_afbc.padding = 0x1000; - } else if (rsrc->layout == PAN_LINEAR) { + } else if (rsrc->layout == PAN_LINEAR || rsrc->layout == PAN_TILED) { /* TODO: Z32F(S8) support, which is always linear */ int stride = rsrc->slices[level].stride; fb->mfbd_flags |= MALI_MFBD_EXTRA; - fbx->flags |= MALI_EXTRA_PRESENT | MALI_EXTRA_ZS; + fbx->flags_hi |= MALI_EXTRA_PRESENT; + fbx->flags_lo |= MALI_EXTRA_ZS; - fbx->ds_linear.depth = rsrc->bo->gpu + offset; - fbx->ds_linear.depth_stride = stride; + fbx->ds_linear.depth = base; + + if (rsrc->layout == PAN_LINEAR) { + fbx->zs_block = MALI_BLOCK_LINEAR; + fbx->ds_linear.depth_stride = stride / 16; + } else { + fbx->zs_block = MALI_BLOCK_TILED; + fbx->ds_linear.depth_stride = stride; + } if (panfrost_is_z24s8_variant(surf->format)) { - fbx->flags |= 0x1; + fbx->flags_lo |= 0x1; } else if (surf->format == PIPE_FORMAT_Z32_UNORM) { /* default flags (0 in bottom place) */ } else if (surf->format == PIPE_FORMAT_Z32_FLOAT) { - fbx->flags |= 0xA; + fbx->flags_lo |= 0xA; fb->mfbd_flags ^= 0x100; fb->mfbd_flags |= 0x200; } else if (surf->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - fbx->flags |= 0x1000A; + fbx->flags_hi |= 0x400; + fbx->flags_lo |= 0xA; fb->mfbd_flags ^= 0x100; fb->mfbd_flags |= 0x201; struct panfrost_resource *stencil = rsrc->separate_stencil; struct panfrost_slice stencil_slice = stencil->slices[level]; - fbx->ds_linear.stencil = stencil->bo->gpu + stencil_slice.offset; + fbx->ds_linear.stencil = panfrost_get_texture_address(stencil, level, first_layer); fbx->ds_linear.stencil_stride = stencil_slice.stride; } @@ -344,8 +314,7 @@ } static mali_ptr -panfrost_mfbd_upload( - struct panfrost_context *ctx, +panfrost_mfbd_upload(struct panfrost_batch *batch, struct bifrost_framebuffer *fb, struct bifrost_fb_extra *fbx, struct bifrost_render_target *rts, @@ -364,7 +333,7 @@ sizeof(struct bifrost_render_target) * 4; struct panfrost_transfer m_f_trans = - panfrost_allocate_transient(ctx, total_sz); + panfrost_allocate_transient(batch, total_sz); /* Do the transfer */ @@ -380,7 +349,7 @@ /* Return pointer suitable for the fragment section */ unsigned tag = MALI_MFBD | - (has_extra ? 0x2 : 0x0) | + (has_extra ? MALI_MFBD_TAG_EXTRA : 0) | (MALI_POSITIVE(rt_count) << 2); return m_f_trans.gpu | tag; @@ -388,34 +357,74 @@ #undef UPLOAD +static struct bifrost_framebuffer +panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count) +{ + struct panfrost_context *ctx = batch->ctx; + struct pipe_context *gallium = (struct pipe_context *) ctx; + struct panfrost_screen *screen = pan_screen(gallium->screen); + + unsigned width = batch->key.width; + unsigned height = batch->key.height; + + unsigned shift = panfrost_get_stack_shift(batch->stack_size); + + struct bifrost_framebuffer framebuffer = { + .width1 = MALI_POSITIVE(width), + .height1 = MALI_POSITIVE(height), + .width2 = MALI_POSITIVE(width), + .height2 = MALI_POSITIVE(height), + + .unk1 = 0x1080, + + .rt_count_1 = MALI_POSITIVE(batch->key.nr_cbufs), + .rt_count_2 = 4, + + .unknown2 = 0x1f, + .tiler = panfrost_emit_midg_tiler(batch, vertex_count), + + .stack_shift = shift, + .unk0 = 0x1e, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu + }; + + return framebuffer; +} + +void +panfrost_attach_mfbd(struct panfrost_batch *batch, unsigned vertex_count) +{ + struct bifrost_framebuffer mfbd = + panfrost_emit_mfbd(batch, vertex_count); + + memcpy(batch->framebuffer.cpu, &mfbd, sizeof(mfbd)); +} + /* Creates an MFBD for the FRAGMENT section of the bound framebuffer */ mali_ptr -panfrost_mfbd_fragment(struct panfrost_context *ctx, bool has_draws) +panfrost_mfbd_fragment(struct panfrost_batch *batch, bool has_draws) { - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - - struct bifrost_framebuffer fb = panfrost_emit_mfbd(ctx, has_draws); - struct bifrost_fb_extra fbx = {}; - struct bifrost_render_target rts[4] = {}; + struct bifrost_framebuffer fb = panfrost_emit_mfbd(batch, has_draws); + struct bifrost_fb_extra fbx = {0}; + struct bifrost_render_target rts[4] = {0}; /* We always upload at least one dummy GL_NONE render target */ - unsigned rt_descriptors = - MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); + unsigned rt_descriptors = MAX2(batch->key.nr_cbufs, 1); fb.rt_count_1 = MALI_POSITIVE(rt_descriptors); fb.rt_count_2 = rt_descriptors; fb.mfbd_flags = 0x100; /* TODO: MRT clear */ - panfrost_mfbd_clear(job, &fb, &fbx, rts, fb.rt_count_2); + panfrost_mfbd_clear(batch, &fb, &fbx, rts, fb.rt_count_2); /* Upload either the render target or a dummy GL_NONE target */ for (int cb = 0; cb < rt_descriptors; ++cb) { - struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[cb]; + struct pipe_surface *surf = batch->key.cbufs[cb]; if (surf) { panfrost_mfbd_set_cbuf(&rts[cb], surf); @@ -441,8 +450,8 @@ rts[cb].format.unk1 |= (cb * 0x400); } - if (ctx->pipe_framebuffer.zsbuf) { - panfrost_mfbd_set_zsbuf(&fb, &fbx, ctx->pipe_framebuffer.zsbuf); + if (batch->key.zsbuf) { + panfrost_mfbd_set_zsbuf(&fb, &fbx, batch->key.zsbuf); } /* When scanning out, the depth buffer is immediately invalidated, so @@ -453,13 +462,12 @@ * The exception is ReadPixels, but this is not supported on GLES so we * can safely ignore it. */ - if (panfrost_is_scanout(ctx)) { - job->requirements &= ~PAN_REQ_DEPTH_WRITE; - } + if (panfrost_batch_is_scanout(batch)) + batch->requirements &= ~PAN_REQ_DEPTH_WRITE; /* Actualize the requirements */ - if (job->requirements & PAN_REQ_MSAA) { + if (batch->requirements & PAN_REQ_MSAA) { rts[0].format.flags |= MALI_MFBD_FORMAT_MSAA; /* XXX */ @@ -467,13 +475,13 @@ fb.rt_count_2 = 4; } - if (job->requirements & PAN_REQ_DEPTH_WRITE) + if (batch->requirements & PAN_REQ_DEPTH_WRITE) fb.mfbd_flags |= MALI_MFBD_DEPTH_WRITE; /* Checksumming only works with a single render target */ - if (ctx->pipe_framebuffer.nr_cbufs == 1) { - struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0]; + if (batch->key.nr_cbufs == 1) { + struct pipe_surface *surf = batch->key.cbufs[0]; struct panfrost_resource *rsrc = pan_resource(surf->texture); struct panfrost_bo *bo = rsrc->bo; @@ -482,11 +490,11 @@ struct panfrost_slice *slice = &rsrc->slices[level]; fb.mfbd_flags |= MALI_MFBD_EXTRA; - fbx.flags |= MALI_EXTRA_PRESENT; + fbx.flags_lo |= MALI_EXTRA_PRESENT; fbx.checksum_stride = slice->checksum_stride; fbx.checksum = bo->gpu + slice->checksum_offset; } } - return panfrost_mfbd_upload(ctx, &fb, &fbx, rts, rt_descriptors); + return panfrost_mfbd_upload(batch, &fb, &fbx, rts, rt_descriptors); } diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_resource.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_resource.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,18 +34,31 @@ #include "drm-uapi/drm_fourcc.h" #include "state_tracker/winsys_handle.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_surface.h" #include "util/u_transfer.h" #include "util/u_transfer_helper.h" #include "util/u_gen_mipmap.h" +#include "pan_bo.h" #include "pan_context.h" #include "pan_screen.h" #include "pan_resource.h" #include "pan_util.h" #include "pan_tiling.h" +#include "panfrost-quirks.h" + +void +panfrost_resource_reset_damage(struct panfrost_resource *pres) +{ + /* We set the damage extent to the full resource size but keep the + * damage box empty so that the FB content is reloaded by default. + */ + memset(&pres->damage, 0, sizeof(pres->damage)); + pres->damage.extent.maxx = pres->base.width0; + pres->damage.extent.maxy = pres->base.height0; +} static struct pipe_resource * panfrost_resource_from_handle(struct pipe_screen *pscreen, @@ -70,9 +83,12 @@ pipe_reference_init(&prsc->reference, 1); prsc->screen = pscreen; - rsc->bo = panfrost_drm_import_bo(screen, whandle->handle); + rsc->bo = panfrost_bo_import(screen, whandle->handle); + rsc->internal_format = templat->format; rsc->slices[0].stride = whandle->stride; + rsc->slices[0].offset = whandle->offset; rsc->slices[0].initialized = true; + panfrost_resource_reset_damage(rsc); if (screen->ro) { rsc->scanout = @@ -104,6 +120,7 @@ handle->handle = rsrc->bo->gem_handle; handle->stride = rsrc->slices[0].stride; + handle->offset = rsrc->slices[0].offset; return TRUE; } else if (handle->type == WINSYS_HANDLE_TYPE_FD) { if (scanout) { @@ -121,13 +138,14 @@ return true; } else { - int fd = panfrost_drm_export_bo(screen, rsrc->bo); + int fd = panfrost_bo_export(rsrc->bo); if (fd < 0) return false; handle->handle = fd; handle->stride = rsrc->slices[0].stride; + handle->offset = rsrc->slices[0].offset; return true; } } @@ -297,6 +315,9 @@ /* Compute the would-be stride */ unsigned stride = bytes_per_pixel * effective_width; + if (util_format_is_compressed(res->format)) + stride /= 4; + /* ..but cache-line align it for performance */ if (can_align_stride && pres->layout == PAN_LINEAR) stride = ALIGN_POT(stride, 64); @@ -306,6 +327,8 @@ unsigned slice_one_size = slice->stride * effective_height; unsigned slice_full_size = slice_one_size * effective_depth; + slice->size0 = slice_one_size; + /* Report 2D size for 3D texturing */ if (l == 0) @@ -370,29 +393,34 @@ * AFBC: Compressed and renderable (so always desirable for non-scanout * rendertargets). Cheap to sample from. The format is black box, so we * can't read/write from software. - */ - - /* Tiling textures is almost always faster, unless we only use it once */ - - bool is_texture = (res->bind & PIPE_BIND_SAMPLER_VIEW); - bool is_2d = res->depth0 == 1 && res->array_size == 1; - bool is_streaming = (res->usage != PIPE_USAGE_STREAM); - - /* TODO: Reenable tiling on SFBD systems when we support rendering to - * tiled formats with SFBD */ - bool should_tile = is_streaming && is_texture && is_2d && !screen->require_sfbd; - - /* Depth/stencil can't be tiled, only linear or AFBC */ - should_tile &= !(res->bind & PIPE_BIND_DEPTH_STENCIL); + * + * Tiling textures is almost always faster, unless we only use it once. + * Only a few types of resources can be tiled, ensure the bind is only + * (a combination of) one of the following */ + + const unsigned valid_binding = + PIPE_BIND_DEPTH_STENCIL | + PIPE_BIND_RENDER_TARGET | + PIPE_BIND_BLENDABLE | + PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_DISPLAY_TARGET; + + unsigned bpp = util_format_get_blocksizebits(res->format); + bool is_2d = (res->target == PIPE_TEXTURE_2D); + bool is_sane_bpp = bpp == 8 || bpp == 16 || bpp == 32 || bpp == 64 || bpp == 128; + bool should_tile = (res->usage != PIPE_USAGE_STREAM); + bool must_tile = (res->bind & PIPE_BIND_DEPTH_STENCIL) && (screen->quirks & MIDGARD_SFBD); + bool can_tile = is_2d && is_sane_bpp && ((res->bind & ~valid_binding) == 0); /* FBOs we would like to checksum, if at all possible */ - bool can_checksum = !(res->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)); + bool can_checksum = !(res->bind & ~valid_binding); bool should_checksum = res->bind & PIPE_BIND_RENDER_TARGET; pres->checksummed = can_checksum && should_checksum; /* Set the layout appropriately */ - pres->layout = should_tile ? PAN_TILED : PAN_LINEAR; + assert(!(must_tile && !can_tile)); /* must_tile => can_tile */ + pres->layout = ((can_tile && should_tile) || must_tile) ? PAN_TILED : PAN_LINEAR; size_t bo_size; @@ -400,18 +428,7 @@ /* We create a BO immediately but don't bother mapping, since we don't * care to map e.g. FBOs which the CPU probably won't touch */ - pres->bo = panfrost_drm_create_bo(screen, bo_size, PAN_ALLOCATE_DELAY_MMAP); -} - -static void -panfrost_resource_reset_damage(struct panfrost_resource *pres) -{ - /* We set the damage extent to the full resource size but keep the - * damage box empty so that the FB content is reloaded by default. - */ - memset(&pres->damage, 0, sizeof(pres->damage)); - pres->damage.extent.maxx = pres->base.width0; - pres->damage.extent.maxy = pres->base.height0; + pres->bo = panfrost_bo_create(screen, bo_size, PAN_BO_DELAY_MMAP); } void @@ -509,6 +526,7 @@ so->base = *template; so->base.screen = screen; + so->internal_format = template->format; pipe_reference_init(&so->base.reference, 1); @@ -520,25 +538,6 @@ return (struct pipe_resource *)so; } -void -panfrost_bo_reference(struct panfrost_bo *bo) -{ - if (bo) - pipe_reference(NULL, &bo->reference); -} - -void -panfrost_bo_unreference(struct pipe_screen *screen, struct panfrost_bo *bo) -{ - if (!bo) - return; - - /* When the reference count goes to zero, we need to cleanup */ - - if (pipe_reference(&bo->reference, NULL)) - panfrost_drm_release_bo(pan_screen(screen), bo, true); -} - static void panfrost_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pt) @@ -550,7 +549,7 @@ renderonly_scanout_destroy(rsrc->scanout, pscreen->ro); if (rsrc->bo) - panfrost_bo_unreference(screen, rsrc->bo); + panfrost_bo_unreference(rsrc->bo); util_range_destroy(&rsrc->valid_buffer_range); ralloc_free(rsrc); @@ -578,11 +577,7 @@ *out_transfer = &transfer->base; /* If we haven't already mmaped, now's the time */ - - if (!bo->cpu) { - struct panfrost_screen *screen = pan_screen(pctx->screen); - panfrost_drm_mmap_bo(screen, bo); - } + panfrost_bo_mmap(bo); /* Check if we're bound for rendering and this is a read pixels. If so, * we need to flush */ @@ -599,27 +594,58 @@ is_bound |= fb->cbufs[c]->texture == resource; } - if (is_bound && (usage & PIPE_TRANSFER_READ)) { - assert(level == 0); - panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME); - } + if (is_bound && (usage & PIPE_TRANSFER_READ)) + assert(level == 0); /* TODO: Respect usage flags */ if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { - /* TODO: reallocate */ - //printf("debug: Missed reallocate\n"); + /* If the BO is used by one of the pending batches or if it's + * not ready yet (still accessed by one of the already flushed + * batches), we try to allocate a new one to avoid waiting. + */ + if (panfrost_pending_batches_access_bo(ctx, bo) || + !panfrost_bo_wait(bo, 0, PAN_BO_ACCESS_RW)) { + struct panfrost_screen *screen = pan_screen(pctx->screen); + /* We want the BO to be MMAPed. */ + uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; + struct panfrost_bo *newbo = NULL; + + /* When the BO has been imported/exported, we can't + * replace it by another one, otherwise the + * importer/exporter wouldn't see the change we're + * doing to it. + */ + if (!(bo->flags & (PAN_BO_IMPORTED | PAN_BO_EXPORTED))) + newbo = panfrost_bo_create(screen, bo->size, + flags); + + if (newbo) { + panfrost_bo_unreference(bo); + rsrc->bo = newbo; + bo = newbo; + } else { + uint32_t access = PAN_BO_ACCESS_RW; + + /* Allocation failed or was impossible, let's + * fall back on a flush+wait. + */ + panfrost_flush_batches_accessing_bo(ctx, bo, + access); + panfrost_bo_wait(bo, INT64_MAX, access); + } + } } else if ((usage & PIPE_TRANSFER_WRITE) && resource->target == PIPE_BUFFER && !util_ranges_intersect(&rsrc->valid_buffer_range, box->x, box->x + box->width)) { /* No flush for writes to uninitialized */ } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (usage & PIPE_TRANSFER_WRITE) { - /* STUB: flush reading */ - //printf("debug: missed reading flush %d\n", resource->target); + panfrost_flush_batches_accessing_bo(ctx, bo, PAN_BO_ACCESS_RW); + panfrost_bo_wait(bo, INT64_MAX, PAN_BO_ACCESS_RW); } else if (usage & PIPE_TRANSFER_READ) { - /* STUB: flush writing */ - //printf("debug: missed writing flush %d (%d-%d)\n", resource->target, box->x, box->x + box->width); + panfrost_flush_batches_accessing_bo(ctx, bo, PAN_BO_ACCESS_WRITE); + panfrost_bo_wait(bo, INT64_MAX, PAN_BO_ACCESS_WRITE); } else { /* Why are you even mapping?! */ } @@ -643,17 +669,20 @@ panfrost_load_tiled_image( transfer->map, bo->cpu + rsrc->slices[level].offset, - box, + box->x, box->y, box->width, box->height, transfer->base.stride, rsrc->slices[level].stride, - util_format_get_blocksize(resource->format)); + resource->format); } } return transfer->map; } else { transfer->base.stride = rsrc->slices[level].stride; - transfer->base.layer_stride = rsrc->cubemap_stride; + if (resource->target == PIPE_TEXTURE_3D) + transfer->base.layer_stride = rsrc->slices[level].size0; + else + transfer->base.layer_stride = rsrc->cubemap_stride; /* By mapping direct-write, we're implicitly already * initialized (maybe), so be conservative */ @@ -663,7 +692,7 @@ return bo->cpu + rsrc->slices[level].offset - + transfer->base.box.z * rsrc->cubemap_stride + + transfer->base.box.z * transfer->base.layer_stride + transfer->base.box.y * rsrc->slices[level].stride + transfer->base.box.x * bytes_per_pixel; } @@ -694,16 +723,17 @@ panfrost_store_tiled_image( bo->cpu + prsrc->slices[transfer->level].offset, trans->map, - &transfer->box, + transfer->box.x, transfer->box.y, + transfer->box.width, transfer->box.height, prsrc->slices[transfer->level].stride, transfer->stride, - util_format_get_blocksize(prsrc->base.format)); + prsrc->base.format); } } } - util_range_add(&prsrc->valid_buffer_range, + util_range_add(&prsrc->base, &prsrc->valid_buffer_range, transfer->box.x, transfer->box.x + transfer->box.width); @@ -722,7 +752,7 @@ struct panfrost_resource *rsc = pan_resource(transfer->resource); if (transfer->resource->target == PIPE_BUFFER) { - util_range_add(&rsc->valid_buffer_range, + util_range_add(&rsc->base, &rsc->valid_buffer_range, transfer->box.x + box->x, transfer->box.x + box->x + box->width); } else { @@ -738,8 +768,9 @@ } static enum pipe_format -panfrost_resource_get_internal_format(struct pipe_resource *prsrc) { - return prsrc->format; +panfrost_resource_get_internal_format(struct pipe_resource *rsrc) { + struct panfrost_resource *prsrc = (struct panfrost_resource *) rsrc; + return prsrc->internal_format; } static bool @@ -769,11 +800,8 @@ * reorder-type optimizations in place. But for now prioritize * correctness. */ - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - bool has_draws = job->last_job.gpu; - - if (has_draws) - panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME); + panfrost_flush_batches_accessing_bo(ctx, rsrc->bo, PAN_BO_ACCESS_RW); + panfrost_bo_wait(rsrc->bo, INT64_MAX, PAN_BO_ACCESS_RW); /* We've flushed the original buffer if needed, now trigger a blit */ @@ -786,8 +814,10 @@ /* If the blit was successful, flush once more. If it wasn't, well, let * the state tracker deal with it. */ - if (blit_res) - panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME); + if (blit_res) { + panfrost_flush_batches_accessing_bo(ctx, rsrc->bo, PAN_BO_ACCESS_WRITE); + panfrost_bo_wait(rsrc->bo, INT64_MAX, PAN_BO_ACCESS_WRITE); + } return blit_res; } @@ -860,8 +890,8 @@ /* If we grew in size, reallocate the BO */ if (new_size > rsrc->bo->size) { - panfrost_drm_release_bo(screen, rsrc->bo, true); - rsrc->bo = panfrost_drm_create_bo(screen, new_size, PAN_ALLOCATE_DELAY_MMAP); + panfrost_bo_unreference(rsrc->bo); + rsrc->bo = panfrost_bo_create(screen, new_size, PAN_BO_DELAY_MMAP); } } @@ -907,9 +937,7 @@ panfrost_resource_context_init(struct pipe_context *pctx) { pctx->transfer_map = u_transfer_helper_transfer_map; - pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; pctx->transfer_unmap = u_transfer_helper_transfer_unmap; - pctx->buffer_subdata = u_default_buffer_subdata; pctx->create_surface = panfrost_create_surface; pctx->surface_destroy = panfrost_surface_destroy; pctx->resource_copy_region = util_resource_copy_region; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_resource.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_resource.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,6 +43,7 @@ struct panfrost_slice { unsigned offset; unsigned stride; + unsigned size0; /* If there is a header preceding each slice, how big is * that header? Used for AFBC */ @@ -57,12 +58,6 @@ bool initialized; }; -void -panfrost_bo_reference(struct panfrost_bo *bo); - -void -panfrost_bo_unreference(struct pipe_screen *screen, struct panfrost_bo *bo); - struct panfrost_resource { struct pipe_resource base; struct { @@ -88,6 +83,8 @@ /* Is transaciton elimination enabled? */ bool checksummed; + + enum pipe_format internal_format; }; static inline struct panfrost_resource * @@ -142,6 +139,9 @@ struct pipe_box *box); void +panfrost_resource_reset_damage(struct panfrost_resource *pres); + +void panfrost_resource_set_damage_region(struct pipe_screen *screen, struct pipe_resource *res, unsigned int nrects, diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_scoreboard.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_scoreboard.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_scoreboard.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_scoreboard.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ /* * Within a batch (panfrost_job), there are various types of Mali jobs: * - * - SET_VALUE: initializes tiler + * - WRITE_VALUE: generic write primitive, used to zero tiler field * - VERTEX: runs a vertex shader * - TILER: runs tiling and sets up a fragment shader * - FRAGMENT: runs fragment shaders and writes out @@ -100,17 +100,6 @@ * */ -/* Accessor to set the next job field */ - -static void -panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next) -{ - if (first->job_descriptor_size) - first->next_job_64 = (u64) (uintptr_t) next; - else - first->next_job_32 = (u32) (uintptr_t) next; -} - /* Coerce a panfrost_transfer to a header */ static inline struct mali_job_descriptor_header * @@ -121,11 +110,11 @@ static void panfrost_assign_index( - struct panfrost_job *job, + struct panfrost_batch *batch, struct panfrost_transfer transfer) { /* Assign the index */ - unsigned index = ++job->job_index; + unsigned index = ++batch->job_index; job_descriptor_header(transfer)->job_index = index; } @@ -157,7 +146,7 @@ static void panfrost_scoreboard_queue_job_internal( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer job) { panfrost_assign_index(batch, job); @@ -174,7 +163,7 @@ void panfrost_scoreboard_queue_compute_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer job) { panfrost_scoreboard_queue_job_internal(batch, job); @@ -192,7 +181,7 @@ void panfrost_scoreboard_queue_vertex_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, bool requires_tiling) { @@ -207,7 +196,7 @@ void panfrost_scoreboard_queue_tiler_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer tiler) { panfrost_scoreboard_queue_compute_job(batch, tiler); @@ -226,7 +215,7 @@ void panfrost_scoreboard_queue_fused_job( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, struct panfrost_transfer tiler) { @@ -240,7 +229,7 @@ void panfrost_scoreboard_queue_fused_job_prepend( - struct panfrost_job *batch, + struct panfrost_batch *batch, struct panfrost_transfer vertex, struct panfrost_transfer tiler) { @@ -267,33 +256,34 @@ batch->first_tiler = tiler; } -/* Generates a set value job, used below as part of TILER job scheduling. */ +/* Generates a write value job, used to initialize the tiler structures. */ static struct panfrost_transfer -panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list) +panfrost_write_value_job(struct panfrost_batch *batch, mali_ptr polygon_list) { struct mali_job_descriptor_header job = { - .job_type = JOB_TYPE_SET_VALUE, + .job_type = JOB_TYPE_WRITE_VALUE, .job_descriptor_size = 1, }; - struct mali_payload_set_value payload = { - .out = polygon_list, - .unknown = 0x3, + struct mali_payload_write_value payload = { + .address = polygon_list, + .value_descriptor = MALI_WRITE_VALUE_ZERO, }; - struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload)); + struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(payload)); memcpy(transfer.cpu, &job, sizeof(job)); memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload)); return transfer; } -/* If there are any tiler jobs, there needs to be a corresponding set value job - * linked to the first vertex job feeding into tiling. */ +/* If there are any tiler jobs, we need to initialize the tiler by writing + * zeroes to a magic tiler structure. We do so via a WRITE_VALUE job linked to + * the first vertex job feeding into tiling. */ static void -panfrost_scoreboard_set_value(struct panfrost_job *batch) +panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch) { /* Check if we even need tiling */ if (!batch->last_tiler.gpu) @@ -302,11 +292,11 @@ /* Okay, we do. Let's generate it. We'll need the job's polygon list * regardless of size. */ - struct panfrost_context *ctx = batch->ctx; - mali_ptr polygon_list = panfrost_job_get_polygon_list(batch, 0); + mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch, + MALI_TILER_MINIMUM_HEADER_SIZE); struct panfrost_transfer job = - panfrost_set_value_job(ctx, polygon_list); + panfrost_write_value_job(batch, polygon_list); /* Queue it */ panfrost_scoreboard_queue_compute_job(batch, job); @@ -346,10 +336,10 @@ mali_ptr, count)) void -panfrost_scoreboard_link_batch(struct panfrost_job *batch) +panfrost_scoreboard_link_batch(struct panfrost_batch *batch) { /* Finalize the batch */ - panfrost_scoreboard_set_value(batch); + panfrost_scoreboard_initialize_tiler(batch); /* Let no_incoming represent the set S described. */ @@ -372,7 +362,7 @@ * Proposition: Given a node N of type T, no more than one other node * depends on N. * - * If type is SET_VALUE: The only dependency added against us is from + * If type is WRITE_VALUE: The only dependency added against us is from * the first tiler job, so there is 1 dependent. * * If type is VERTEX: If there is a tiler node, that tiler node depends @@ -414,12 +404,12 @@ if (dep_1) { assert(!dependents[dep_1 - 1]); - dependents[dep_1 - 1] = i; + dependents[dep_1 - 1] = i + 1; } if (dep_2) { assert(!dependents[dep_2 - 1]); - dependents[dep_2 - 1] = i; + dependents[dep_2 - 1] = i + 1; } } @@ -451,7 +441,7 @@ if (tail) { /* Link us to the last node */ - panfrost_set_job_next(tail, addr); + tail->next_job = addr; } else { /* We are the first/last node */ batch->first_job.cpu = (uint8_t *) n; @@ -461,9 +451,11 @@ tail = n; /* Grab the dependent, if there is one */ - unsigned node_m = dependents[node_n]; + unsigned node_m_1 = dependents[node_n]; + + if (node_m_1) { + unsigned node_m = node_m_1 - 1; - if (node_m) { struct mali_job_descriptor_header *m = DESCRIPTOR_FOR_NODE(node_m); diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_screen.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_screen.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,8 +28,8 @@ #include "util/u_debug.h" #include "util/u_memory.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_video.h" #include "util/u_screen.h" #include "util/os_time.h" @@ -41,7 +41,9 @@ #include #include "drm-uapi/drm_fourcc.h" +#include "drm-uapi/panfrost_drm.h" +#include "pan_bo.h" #include "pan_screen.h" #include "pan_resource.h" #include "pan_public.h" @@ -50,12 +52,15 @@ #include "pan_context.h" #include "midgard/midgard_compile.h" +#include "panfrost-quirks.h" static const struct debug_named_value debug_options[] = { {"msgs", PAN_DBG_MSGS, "Print debug messages"}, {"trace", PAN_DBG_TRACE, "Trace the command stream"}, {"deqp", PAN_DBG_DEQP, "Hacks for dEQP"}, {"afbc", PAN_DBG_AFBC, "Enable non-conformant AFBC impl"}, + {"sync", PAN_DBG_SYNC, "Wait for each job's completion and check for any GPU fault"}, + {"precompile", PAN_DBG_PRECOMPILE, "Precompile shaders for shader-db"}, DEBUG_NAMED_VALUE_END }; @@ -66,13 +71,13 @@ static const char * panfrost_get_name(struct pipe_screen *screen) { - return "panfrost"; + return panfrost_model_name(pan_screen(screen)->gpu_id); } static const char * panfrost_get_vendor(struct pipe_screen *screen) { - return "panfrost"; + return "Panfrost"; } static const char * @@ -99,6 +104,9 @@ case PIPE_CAP_MAX_RENDER_TARGETS: return is_deqp ? 4 : 1; + /* Throttling frames breaks pipelining */ + case PIPE_CAP_THROTTLE: + return 0; case PIPE_CAP_OCCLUSION_QUERY: return 1; @@ -108,13 +116,16 @@ case PIPE_CAP_QUERY_SO_OVERFLOW: return 0; - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_TEXTURE_SWIZZLE: return 1; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + return 1; + case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - return is_deqp ? 1 : 0; + return 1; case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: return is_deqp ? 4 : 0; @@ -125,7 +136,7 @@ return 1; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return is_deqp ? 256 : 0; /* for GL3 */ + return 256; case PIPE_CAP_GLSL_FEATURE_LEVEL: case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: @@ -134,7 +145,7 @@ return is_deqp ? 300 : 120; case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return is_deqp ? 16 : 0; + return 16; case PIPE_CAP_CUBE_MAP_ARRAY: return is_deqp; @@ -240,6 +251,12 @@ case PIPE_CAP_MAX_VARYINGS: return 16; + case PIPE_CAP_ALPHA_TEST: + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_TWO_SIDED_COLOR: + case PIPE_CAP_CLIP_PLANES: + return 0; + default: return u_pipe_screen_get_param_defaults(screen, param); } @@ -259,9 +276,6 @@ /* this is probably not totally correct.. but it's a start: */ switch (param) { - case PIPE_SHADER_CAP_SCALAR_ISA: - return 0; - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: @@ -275,7 +289,7 @@ return 16; case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 4 : 8; + return shader == PIPE_SHADER_FRAGMENT ? 4 : 16; case PIPE_SHADER_CAP_MAX_TEMPS: return 256; /* GL_MAX_PROGRAM_TEMPORARIES_ARB */ @@ -326,7 +340,7 @@ return PIPE_SHADER_IR_NIR; case PIPE_SHADER_CAP_SUPPORTED_IRS: - return (1 << PIPE_SHADER_IR_NIR); + return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_NIR_SERIALIZED); case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; @@ -413,7 +427,16 @@ if (!format_desc) return false; - if (sample_count > 1) + /* MSAA 4x supported, but no more. Technically some revisions of the + * hardware can go up to 16x but we don't support higher modes yet. */ + + if (sample_count > 1 && !(pan_debug & PAN_DBG_DEQP)) + return false; + + if (sample_count > 4) + return false; + + if (MAX2(sample_count, 1) != MAX2(storage_sample_count, 1)) return false; /* Format wishlist */ @@ -436,10 +459,15 @@ if (scanout && renderable && !util_format_is_rgba8_variant(format_desc)) return false; - if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN && - format_desc->layout != UTIL_FORMAT_LAYOUT_OTHER) { - /* Compressed formats not yet hooked up. */ - return false; + switch (format_desc->layout) { + case UTIL_FORMAT_LAYOUT_PLAIN: + case UTIL_FORMAT_LAYOUT_OTHER: + break; + case UTIL_FORMAT_LAYOUT_ETC: + case UTIL_FORMAT_LAYOUT_ASTC: + return true; + default: + return false; } /* Internally, formats that are depth/stencil renderable are limited. @@ -486,8 +514,7 @@ switch (param) { case PIPE_COMPUTE_CAP_ADDRESS_BITS: - /* TODO: We'll want 64-bit pointers soon */ - RET((uint32_t []){ 32 }); + RET((uint32_t []){ 64 }); case PIPE_COMPUTE_CAP_IR_TARGET: if (ret) @@ -543,6 +570,8 @@ { struct panfrost_screen *screen = pan_screen(pscreen); panfrost_bo_cache_evict_all(screen); + pthread_mutex_destroy(&screen->bo_cache.lock); + pthread_mutex_destroy(&screen->active_bos_lock); drmFreeVersion(screen->kernel_version); ralloc_free(screen); } @@ -568,7 +597,17 @@ struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence) { - panfrost_drm_fence_reference(pscreen, ptr, fence); + struct panfrost_fence **p = (struct panfrost_fence **)ptr; + struct panfrost_fence *f = (struct panfrost_fence *)fence; + struct panfrost_fence *old = *p; + + if (pipe_reference(&(*p)->reference, &f->reference)) { + util_dynarray_foreach(&old->syncfds, int, fd) + close(*fd); + util_dynarray_fini(&old->syncfds); + free(old); + } + *p = f; } static bool @@ -577,7 +616,72 @@ struct pipe_fence_handle *fence, uint64_t timeout) { - return panfrost_drm_fence_finish(pscreen, ctx, fence, timeout); + struct panfrost_screen *screen = pan_screen(pscreen); + struct panfrost_fence *f = (struct panfrost_fence *)fence; + struct util_dynarray syncobjs; + int ret; + + /* All fences were already signaled */ + if (!util_dynarray_num_elements(&f->syncfds, int)) + return true; + + util_dynarray_init(&syncobjs, NULL); + util_dynarray_foreach(&f->syncfds, int, fd) { + uint32_t syncobj; + + ret = drmSyncobjCreate(screen->fd, 0, &syncobj); + assert(!ret); + + ret = drmSyncobjImportSyncFile(screen->fd, syncobj, *fd); + assert(!ret); + util_dynarray_append(&syncobjs, uint32_t, syncobj); + } + + uint64_t abs_timeout = os_time_get_absolute_timeout(timeout); + if (abs_timeout == OS_TIMEOUT_INFINITE) + abs_timeout = INT64_MAX; + + ret = drmSyncobjWait(screen->fd, util_dynarray_begin(&syncobjs), + util_dynarray_num_elements(&syncobjs, uint32_t), + abs_timeout, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + NULL); + + util_dynarray_foreach(&syncobjs, uint32_t, syncobj) + drmSyncobjDestroy(screen->fd, *syncobj); + + return ret >= 0; +} + +struct panfrost_fence * +panfrost_fence_create(struct panfrost_context *ctx, + struct util_dynarray *fences) +{ + struct panfrost_screen *screen = pan_screen(ctx->base.screen); + struct panfrost_fence *f = calloc(1, sizeof(*f)); + if (!f) + return NULL; + + util_dynarray_init(&f->syncfds, NULL); + + /* Export fences from all pending batches. */ + util_dynarray_foreach(fences, struct panfrost_batch_fence *, fence) { + int fd = -1; + + /* The fence is already signaled, no need to export it. */ + if ((*fence)->signaled) + continue; + + drmSyncobjExportSyncFile(screen->fd, (*fence)->syncobj, &fd); + if (fd == -1) + fprintf(stderr, "export failed: %m\n"); + + assert(fd != -1); + util_dynarray_append(&f->syncfds, int, fd); + } + + pipe_reference_init(&f->reference, 1); + + return f; } static const void * @@ -588,6 +692,22 @@ return &midgard_nir_options; } +static uint32_t +panfrost_active_bos_hash(const void *key) +{ + const struct panfrost_bo *bo = key; + + return _mesa_hash_data(&bo->gem_handle, sizeof(bo->gem_handle)); +} + +static bool +panfrost_active_bos_cmp(const void *keya, const void *keyb) +{ + const struct panfrost_bo *a = keya, *b = keyb; + + return a->gem_handle == b->gem_handle; +} + struct pipe_screen * panfrost_create_screen(int fd, struct renderonly *ro) { @@ -622,28 +742,34 @@ screen->fd = fd; - screen->gpu_id = panfrost_drm_query_gpu_version(screen); - screen->require_sfbd = screen->gpu_id < 0x0750; /* T760 is the first to support MFBD */ + screen->gpu_id = panfrost_query_gpu_version(screen->fd); + screen->core_count = panfrost_query_core_count(screen->fd); + screen->thread_tls_alloc = panfrost_query_thread_tls_alloc(screen->fd); + screen->quirks = panfrost_get_quirks(screen->gpu_id); screen->kernel_version = drmGetVersion(fd); /* Check if we're loading against a supported GPU model. */ switch (screen->gpu_id) { + case 0x720: /* T720 */ case 0x750: /* T760 */ case 0x820: /* T820 */ case 0x860: /* T860 */ break; default: /* Fail to load against untested models */ - debug_printf("panfrost: Unsupported model %X", - screen->gpu_id); + debug_printf("panfrost: Unsupported model %X", screen->gpu_id); return NULL; } - util_dynarray_init(&screen->transient_bo, screen); - - for (unsigned i = 0; i < ARRAY_SIZE(screen->bo_cache); ++i) - list_inithead(&screen->bo_cache[i]); + pthread_mutex_init(&screen->active_bos_lock, NULL); + screen->active_bos = _mesa_set_create(screen, panfrost_active_bos_hash, + panfrost_active_bos_cmp); + + pthread_mutex_init(&screen->bo_cache.lock, NULL); + list_inithead(&screen->bo_cache.lru); + for (unsigned i = 0; i < ARRAY_SIZE(screen->bo_cache.buckets); ++i) + list_inithead(&screen->bo_cache.buckets[i]); if (pan_debug & PAN_DBG_TRACE) pandecode_initialize(); @@ -666,9 +792,6 @@ screen->base.fence_finish = panfrost_fence_finish; screen->base.set_damage_region = panfrost_resource_set_damage_region; - screen->last_fragment_flushed = true; - screen->last_job = NULL; - panfrost_resource_screen_init(screen); return &screen->base; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_screen.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_screen.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -35,10 +35,12 @@ #include "renderonly/renderonly.h" #include "util/u_dynarray.h" #include "util/bitset.h" +#include "util/set.h" #include #include "pan_allocate.h" +struct panfrost_batch; struct panfrost_context; struct panfrost_resource; struct panfrost_screen; @@ -46,26 +48,6 @@ /* Driver limits */ #define PAN_MAX_CONST_BUFFERS 16 -/* Flags for allocated memory */ - -/* This memory region is executable */ -#define PAN_ALLOCATE_EXECUTE (1 << 0) - -/* This memory region should be lazily allocated and grow-on-page-fault. Must - * be used in conjunction with INVISIBLE */ -#define PAN_ALLOCATE_GROWABLE (1 << 1) - -/* This memory region should not be mapped to the CPU */ -#define PAN_ALLOCATE_INVISIBLE (1 << 2) - -/* This memory region will be used for varyings and needs to have the cache - * bits twiddled accordingly */ -#define PAN_ALLOCATE_COHERENT_LOCAL (1 << 3) - -/* This region may not be used immediately and will not mmap on allocate - * (semantically distinct from INVISIBLE, which cannot never be mmaped) */ -#define PAN_ALLOCATE_DELAY_MMAP (1 << 4) - /* Transient slab size. This is a balance between fragmentation against cache * locality and ease of bookkeeping */ @@ -98,32 +80,33 @@ /* Properties of the GPU in use */ unsigned gpu_id; - bool require_sfbd; + unsigned core_count; + unsigned thread_tls_alloc; + unsigned quirks; drmVersionPtr kernel_version; struct renderonly *ro; - /* Transient memory management is based on borrowing fixed-size slabs - * off the screen (loaning them out to the batch). Dynamic array - * container of panfrost_bo */ - - struct util_dynarray transient_bo; - - /* Set of free transient BOs */ - BITSET_DECLARE(free_transient, MAX_TRANSIENT_SLABS); - - /* The BO cache is a set of buckets with power-of-two sizes ranging - * from 2^12 (4096, the page size) to 2^(12 + MAX_BO_CACHE_BUCKETS). - * Each bucket is a linked list of free panfrost_bo objects. */ - - struct list_head bo_cache[NR_BO_CACHE_BUCKETS]; - - /* While we're busy building up the job for frame N, the GPU is - * still busy executing frame N-1. So hold a reference to - * yesterjob */ - int last_fragment_flushed; - struct panfrost_job *last_job; + pthread_mutex_t active_bos_lock; + struct set *active_bos; + + struct { + pthread_mutex_t lock; + + /* List containing all cached BOs sorted in LRU (Least + * Recently Used) order. This allows us to quickly evict BOs + * that are more than 1 second old. + */ + struct list_head lru; + + /* The BO cache is a set of buckets with power-of-two sizes + * ranging from 2^12 (4096, the page size) to + * 2^(12 + MAX_BO_CACHE_BUCKETS). + * Each bucket is a linked list of free panfrost_bo objects. */ + + struct list_head buckets[NR_BO_CACHE_BUCKETS]; + } bo_cache; }; static inline struct panfrost_screen * @@ -132,69 +115,8 @@ return (struct panfrost_screen *)p; } -/* Get a transient BO off the screen given a - * particular index */ - -static inline struct panfrost_bo * -pan_bo_for_index(struct panfrost_screen *screen, unsigned index) -{ - return *(util_dynarray_element(&screen->transient_bo, - struct panfrost_bo *, index)); -} - -void -panfrost_drm_allocate_slab(struct panfrost_screen *screen, - struct panfrost_memory *mem, - size_t pages, - bool same_va, - int extra_flags, - int commit_count, - int extent); -void -panfrost_drm_free_slab(struct panfrost_screen *screen, - struct panfrost_memory *mem); -struct panfrost_bo * -panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size, - uint32_t flags); -void -panfrost_drm_mmap_bo(struct panfrost_screen *screen, struct panfrost_bo *bo); -void -panfrost_drm_release_bo(struct panfrost_screen *screen, struct panfrost_bo *bo, bool cacheable); -struct panfrost_bo * -panfrost_drm_import_bo(struct panfrost_screen *screen, int fd); -int -panfrost_drm_export_bo(struct panfrost_screen *screen, const struct panfrost_bo *bo); -int -panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, - bool is_scanout); -void -panfrost_drm_force_flush_fragment(struct panfrost_context *ctx, - struct pipe_fence_handle **fence); -unsigned -panfrost_drm_query_gpu_version(struct panfrost_screen *screen); -int -panfrost_drm_init_context(struct panfrost_context *ctx); -void -panfrost_drm_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence); -boolean -panfrost_drm_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout); -struct panfrost_bo * -panfrost_bo_cache_fetch( - struct panfrost_screen *screen, - size_t size, uint32_t flags); - -bool -panfrost_bo_cache_put( - struct panfrost_screen *screen, - struct panfrost_bo *bo); - -void -panfrost_bo_cache_evict_all( - struct panfrost_screen *screen); +struct panfrost_fence * +panfrost_fence_create(struct panfrost_context *ctx, + struct util_dynarray *fences); #endif /* PAN_SCREEN_H */ diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_sfbd.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_sfbd.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_sfbd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_sfbd.c 2020-06-12 01:21:17.000000000 +0000 @@ -22,54 +22,100 @@ * */ +#include "pan_bo.h" #include "pan_context.h" #include "pan_util.h" #include "pan_format.h" -#include "util/u_format.h" +#include "util/format/u_format.h" -static unsigned +static struct mali_sfbd_format panfrost_sfbd_format(struct pipe_surface *surf) { - /* TODO */ - return 0xb84e0281; /* RGB32, no MSAA */ + /* Explode details on the format */ + + const struct util_format_description *desc = + util_format_description(surf->format); + + /* The swizzle for rendering is inverted from texturing */ + + unsigned char swizzle[4]; + panfrost_invert_swizzle(desc->swizzle, swizzle); + + struct mali_sfbd_format fmt = { + .unk1 = 0x1, + .swizzle = panfrost_translate_swizzle_4(swizzle), + .nr_channels = MALI_POSITIVE(desc->nr_channels), + .unk2 = 0x4, + .unk3 = 0xb, + }; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + fmt.unk2 |= MALI_SFBD_FORMAT_SRGB; + + /* sRGB handled as a dedicated flag */ + enum pipe_format linearized = util_format_linear(surf->format); + + /* If RGB, we're good to go */ + if (util_format_is_unorm8(desc)) + return fmt; + + switch (linearized) { + case PIPE_FORMAT_B5G6R5_UNORM: + fmt.unk1 = 0x5; + fmt.nr_channels = MALI_POSITIVE(2); + fmt.unk2 = 0x5; + break; + + case PIPE_FORMAT_A4B4G4R4_UNORM: + case PIPE_FORMAT_B4G4R4A4_UNORM: + fmt.unk1 = 0x4; + fmt.nr_channels = MALI_POSITIVE(1); + fmt.unk2 = 0x5; + break; + + default: + unreachable("Invalid format rendering"); + } + + return fmt; } static void panfrost_sfbd_clear( - struct panfrost_job *job, + struct panfrost_batch *batch, struct mali_single_framebuffer *sfbd) { - if (job->clear & PIPE_CLEAR_COLOR) { - sfbd->clear_color_1 = job->clear_color[0][0]; - sfbd->clear_color_2 = job->clear_color[0][1]; - sfbd->clear_color_3 = job->clear_color[0][2]; - sfbd->clear_color_4 = job->clear_color[0][3]; - } - - if (job->clear & PIPE_CLEAR_DEPTH) { - sfbd->clear_depth_1 = job->clear_depth; - sfbd->clear_depth_2 = job->clear_depth; - sfbd->clear_depth_3 = job->clear_depth; - sfbd->clear_depth_4 = job->clear_depth; + if (batch->clear & PIPE_CLEAR_COLOR) { + sfbd->clear_color_1 = batch->clear_color[0][0]; + sfbd->clear_color_2 = batch->clear_color[0][1]; + sfbd->clear_color_3 = batch->clear_color[0][2]; + sfbd->clear_color_4 = batch->clear_color[0][3]; + } + + if (batch->clear & PIPE_CLEAR_DEPTH) { + sfbd->clear_depth_1 = batch->clear_depth; + sfbd->clear_depth_2 = batch->clear_depth; + sfbd->clear_depth_3 = batch->clear_depth; + sfbd->clear_depth_4 = batch->clear_depth; } - if (job->clear & PIPE_CLEAR_STENCIL) { - sfbd->clear_stencil = job->clear_stencil; + if (batch->clear & PIPE_CLEAR_STENCIL) { + sfbd->clear_stencil = batch->clear_stencil; } /* Set flags based on what has been cleared, for the SFBD case */ /* XXX: What do these flags mean? */ int clear_flags = 0x101100; - if (!(job->clear & ~(PIPE_CLEAR_COLOR | PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { + if (!(batch->clear & ~(PIPE_CLEAR_COLOR | PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { /* On a tiler like this, it's fastest to clear all three buffers at once */ clear_flags |= MALI_CLEAR_FAST; } else { clear_flags |= MALI_CLEAR_SLOW; - if (job->clear & PIPE_CLEAR_STENCIL) + if (batch->clear & PIPE_CLEAR_STENCIL) clear_flags |= MALI_CLEAR_SLOW_STENCIL; } @@ -84,16 +130,22 @@ struct panfrost_resource *rsrc = pan_resource(surf->texture); unsigned level = surf->u.tex.level; - assert(surf->u.tex.first_layer == 0); + unsigned first_layer = surf->u.tex.first_layer; + assert(surf->u.tex.last_layer == first_layer); + signed stride = rsrc->slices[level].stride; + + mali_ptr base = panfrost_get_texture_address(rsrc, level, first_layer); fb->format = panfrost_sfbd_format(surf); - unsigned offset = rsrc->slices[level].offset; - signed stride = rsrc->slices[level].stride; + fb->framebuffer = base; + fb->stride = stride; - if (rsrc->layout == PAN_LINEAR) { - fb->framebuffer = rsrc->bo->gpu + offset; - fb->stride = stride; + if (rsrc->layout == PAN_LINEAR) + fb->format.block = MALI_BLOCK_LINEAR; + else if (rsrc->layout == PAN_TILED) { + fb->format.block = MALI_BLOCK_TILED; + fb->stride *= 16; } else { fprintf(stderr, "Invalid render layout\n"); assert(0); @@ -106,46 +158,116 @@ struct pipe_surface *surf) { struct panfrost_resource *rsrc = pan_resource(surf->texture); + struct panfrost_context *ctx = pan_context(surf->context); unsigned level = surf->u.tex.level; assert(surf->u.tex.first_layer == 0); - unsigned offset = rsrc->slices[level].offset; + if (rsrc->layout != PAN_TILED) + unreachable("Invalid render layout."); - if (rsrc->layout == PAN_LINEAR) { - /* TODO: What about format selection? */ - /* TODO: Z/S stride selection? */ + fb->depth_buffer = rsrc->bo->gpu + rsrc->slices[level].offset; + fb->depth_stride = rsrc->slices[level].stride; - fb->depth_buffer = rsrc->bo->gpu + offset; - fb->depth_buffer_enable = MALI_DEPTH_STENCIL_ENABLE; + /* No stencil? Job done. */ + if (!ctx->depth_stencil || !ctx->depth_stencil->stencil[0].enabled) + return; + + if (panfrost_is_z24s8_variant(surf->format)) { + + /* Stencil data is interleaved with depth */ + fb->stencil_buffer = fb->depth_buffer; + fb->stencil_stride = fb->depth_stride; + } else if (surf->format == PIPE_FORMAT_Z32_UNORM || + surf->format == PIPE_FORMAT_Z32_FLOAT) { + + /* No stencil, nothing to do */ + } else if (surf->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + + /* Stencil data in separate buffer */ + struct panfrost_resource *stencil = rsrc->separate_stencil; + struct panfrost_slice stencil_slice = stencil->slices[level]; + + fb->stencil_buffer = stencil->bo->gpu + stencil_slice.offset; + fb->stencil_stride = stencil_slice.stride; + } else + unreachable("Unsupported depth/stencil format."); +} - fb->stencil_buffer = rsrc->bo->gpu + offset; - fb->stencil_buffer_enable = MALI_DEPTH_STENCIL_ENABLE; - } else { - fprintf(stderr, "Invalid render layout\n"); - assert(0); - } + +static struct mali_single_framebuffer +panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count) +{ + struct panfrost_context *ctx = batch->ctx; + struct pipe_context *gallium = (struct pipe_context *) ctx; + struct panfrost_screen *screen = pan_screen(gallium->screen); + + unsigned width = batch->key.width; + unsigned height = batch->key.height; + + /* TODO: Why do we need to make the stack bigger than other platforms? */ + unsigned shift = panfrost_get_stack_shift(MAX2(batch->stack_size, 512)); + + /* TODO: where do we specify the shift? */ + + struct mali_single_framebuffer framebuffer = { + .width = MALI_POSITIVE(width), + .height = MALI_POSITIVE(height), + .unknown2 = 0x1f, + .format = { + .unk3 = 0x3, + }, + .clear_flags = 0x1000, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, + .tiler = panfrost_emit_midg_tiler(batch, vertex_count), + }; + + return framebuffer; +} + +void +panfrost_attach_sfbd(struct panfrost_batch *batch, unsigned vertex_count) +{ + struct mali_single_framebuffer sfbd = + panfrost_emit_sfbd(batch, vertex_count); + + memcpy(batch->framebuffer.cpu, &sfbd, sizeof(sfbd)); } /* Creates an SFBD for the FRAGMENT section of the bound framebuffer */ mali_ptr -panfrost_sfbd_fragment(struct panfrost_context *ctx, bool has_draws) +panfrost_sfbd_fragment(struct panfrost_batch *batch, bool has_draws) { - struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); - struct mali_single_framebuffer fb = panfrost_emit_sfbd(ctx, has_draws); + struct mali_single_framebuffer fb = panfrost_emit_sfbd(batch, has_draws); - panfrost_sfbd_clear(job, &fb); + panfrost_sfbd_clear(batch, &fb); /* SFBD does not support MRT natively; sanity check */ - assert(ctx->pipe_framebuffer.nr_cbufs == 1); - panfrost_sfbd_set_cbuf(&fb, ctx->pipe_framebuffer.cbufs[0]); + assert(batch->key.nr_cbufs <= 1); + if (batch->key.nr_cbufs) { + struct pipe_surface *surf = batch->key.cbufs[0]; + struct panfrost_resource *rsrc = pan_resource(surf->texture); + struct panfrost_bo *bo = rsrc->bo; + + panfrost_sfbd_set_cbuf(&fb, surf); - if (ctx->pipe_framebuffer.zsbuf) - panfrost_sfbd_set_zsbuf(&fb, ctx->pipe_framebuffer.zsbuf); + if (rsrc->checksummed) { + unsigned level = surf->u.tex.level; + struct panfrost_slice *slice = &rsrc->slices[level]; - if (job->requirements & PAN_REQ_MSAA) - fb.format |= MALI_FRAMEBUFFER_MSAA_A | MALI_FRAMEBUFFER_MSAA_B; + fb.checksum_stride = slice->checksum_stride; + fb.checksum = bo->gpu + slice->checksum_offset; + } + } + + if (batch->key.zsbuf) + panfrost_sfbd_set_zsbuf(&fb, batch->key.zsbuf); + + if (batch->requirements & PAN_REQ_MSAA) { + fb.format.unk1 |= MALI_SFBD_FORMAT_MSAA_A; + fb.format.unk2 |= MALI_SFBD_FORMAT_MSAA_B; + } - return panfrost_upload_transient(ctx, &fb, sizeof(fb)) | MALI_SFBD; + return panfrost_upload_transient(batch, &fb, sizeof(fb)); } diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_tiler.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_tiler.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_tiler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_tiler.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,295 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - */ - -#include "util/u_math.h" -#include "util/macros.h" -#include "pan_tiler.h" - -/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. - * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. - * Then, a fixed-function hardware block (the tiler) consumes the gl_Position - * results. For each triangle specified, it marks each containing tile as - * containing that triangle. This set of "triangles per tile" form the "polygon - * list". Finally, the rasterization unit consumes the polygon list to invoke - * the fragment shader. - * - * In practice, it's a bit more complicated than this. 16x16 is the logical - * tile size, but Midgard features "hierarchical tiling", where power-of-two - * multiples of the base tile size can be used: hierarchy level 0 (16x16), - * level 1 (32x32), level 2 (64x64), per public information about Midgard's - * tiling. In fact, tiling goes up to 2048x2048 (!), although in practice - * 128x128 is the largest usually used (though higher modes are enabled). The - * idea behind hierarchical tiling is to use low tiling levels for small - * triangles and high levels for large triangles, to minimize memory bandwidth - * and repeated fragment shader invocations (the former issue inherent to - * immediate-mode rendering and the latter common in traditional tilers). - * - * The tiler itself works by reading varyings in and writing a polygon list - * out. Unfortunately (for us), both of these buffers are managed in main - * memory; although they ideally will be cached, it is the drivers' - * responsibility to allocate these buffers. Varying buffer allocation is - * handled elsewhere, as it is not tiler specific; the real issue is allocating - * the polygon list. - * - * This is hard, because from the driver's perspective, we have no information - * about what geometry will actually look like on screen; that information is - * only gained from running the vertex shader. (Theoretically, we could run the - * vertex shaders in software as a prepass, or in hardware with transform - * feedback as a prepass, but either idea is ludicrous on so many levels). - * - * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list - * into three distinct pieces. First, the driver statically determines which - * tile hierarchy levels to use (more on that later). At this point, we know the - * framebuffer dimensions and all the possible tilings of the framebuffer, so - * we know exactly how many tiles exist across all hierarchy levels. The first - * piece of the polygon list is the header, which is exactly 8 bytes per tile, - * plus padding and a small 64-byte prologue. (If that doesn't remind you of - * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is - * the polygon list body, which seems to contain 512 bytes per tile, again - * across every level of the hierarchy. These two parts form the polygon list - * buffer. This buffer has a statically determinable size, approximately equal - * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus - * alignment / minimum restrictions / etc. - * - * The third piece is the easy one (for us): the tiler heap. In essence, the - * tiler heap is a gigantic slab that's as big as could possibly be necessary - * in the worst case imaginable. Just... a gigantic allocation that we give a - * start and end pointer to. What's the catch? The tiler heap is lazily - * allocated; that is, a huge amount of memory is _reserved_, but only a tiny - * bit is actually allocated upfront. The GPU just keeps using the - * unallocated-but-reserved portions as it goes along, generating page faults - * if it goes beyond the allocation, and then the kernel is instructed to - * expand the allocation on page fault (known in the vendor kernel as growable - * memory). This is quite a bit of bookkeeping of its own, but that task is - * pushed to kernel space and we can mostly ignore it here, just remembering to - * set the GROWABLE flag so the kernel actually uses this path rather than - * allocating a gigantic amount up front and burning a hole in RAM. - * - * As far as determining which hierarchy levels to use, the simple answer is - * that right now, we don't. In the tiler configuration fields (consistent from - * the earliest Midgard's SFBD through the latest Bifrost traces we have), - * there is a hierarchy_mask field, controlling which levels (tile sizes) are - * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to - * big tiles and small polygons to small tiles -- would be realized here as - * well. As long as there are polygons at all needing tiling, we always have to - * have big tiles available, in case there are big polygons. But we don't - * necessarily need small tiles available. Ideally, when there are small - * polygons, small tiles are enabled (to avoid waste from putting small - * triangles in the big tiles); when there are not, small tiles are disabled to - * avoid enabling more levels than necessary, which potentially costs in memory - * bandwidth / power / tiler performance. - * - * Of course, the driver has to figure this out statically. When tile - * hiearchies are actually established, this occurs by the tiler in - * fixed-function hardware, after the vertex shaders have run and there is - * sufficient information to figure out the size of triangles. The driver has - * no such luxury, again barring insane hacks like additionally running the - * vertex shaders in software or in hardware via transform feedback. Thus, for - * the driver, we need a heuristic approach. - * - * There are lots of heuristics to guess triangle size statically you could - * imagine, but one approach shines as particularly simple-stupid: assume all - * on-screen triangles are equal size and spread equidistantly throughout the - * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with - * it, then we see: - * - * Triangle Area = (Screen Area / # of triangles) - * = (Width * Height) / (# of triangles) - * - * Or if you prefer, we can also make a third CRAZY assumption that we only draw - * right triangles with edges parallel/perpendicular to the sides of the screen - * with no overdraw, forming a triangle grid across the screen: - * - * |--w--| - * _____ | - * | /| /| | - * |/_|/_| h - * | /| /| | - * |/_|/_| | - * - * Then you can use some middle school geometry and algebra to work out the - * triangle dimensions. I started working on this, but realised I didn't need - * to to make my point, but couldn't bare to erase that ASCII art. Anyway. - * - * POINT IS, by considering the ratio of screen area and triangle count, we can - * estimate the triangle size. For a small size, use small bins; for a large - * size, use large bins. Intuitively, this metric makes sense: when there are - * few triangles on a large screen, you're probably compositing a UI and - * therefore the triangles are large; when there are a lot of triangles on a - * small screen, you're probably rendering a 3D mesh and therefore the - * triangles are tiny. (Or better said -- there will be tiny triangles, even if - * there are also large triangles. There have to be unless you expect crazy - * overdraw. Generally, it's better to allow more small bin sizes than - * necessary than not allow enough.) - * - * From this heuristic (or whatever), we determine the minimum allowable tile - * size, and we use that to decide the hierarchy masking, selecting from the - * minimum "ideal" tile size to the maximum tile size (2048x2048). - * - * Once we have that mask and the framebuffer dimensions, we can compute the - * size of the statically-sized polygon list structures, allocate them, and go! - * - */ - -/* Hierarchical tiling spans from 16x16 to 2048x2048 tiles */ - -#define MIN_TILE_SIZE 16 -#define MAX_TILE_SIZE 2048 - -/* Constants as shifts for easier power-of-two iteration */ - -#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE) -#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE) - -/* The hierarchy has a 64-byte prologue */ -#define PROLOGUE_SIZE 0x40 - -/* For each tile (across all hierarchy levels), there is 8 bytes of header */ -#define HEADER_BYTES_PER_TILE 0x8 - -/* Absent any geometry, the minimum size of the header */ -#define MINIMUM_HEADER_SIZE 0x200 - -/* If the width-x-height framebuffer is divided into tile_size-x-tile_size - * tiles, how many tiles are there? Rounding up in each direction. For the - * special case of tile_size=16, this aligns with the usual Midgard count. - * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum, - * because those care about the stride (not just the overall count) and only at - * a a fixed-tile size (not any of a number of power-of-twos) */ - -static unsigned -pan_tile_count(unsigned width, unsigned height, unsigned tile_size) -{ - unsigned aligned_width = ALIGN_POT(width, tile_size); - unsigned aligned_height = ALIGN_POT(height, tile_size); - - unsigned tile_count_x = aligned_width / tile_size; - unsigned tile_count_y = aligned_height / tile_size; - - return tile_count_x * tile_count_y; -} - -/* For `masked_count` of the smallest tile sizes masked out, computes how the - * size of the polygon list header. We iterate the tile sizes (16x16 through - * 2048x2048, if nothing is masked; (16*2^masked_count)x(16*2^masked_count) - * through 2048x2048 more generally. For each tile size, we figure out how many - * tiles there are at this hierarchy level and therefore many bytes this level - * is, leaving us with a byte count for each level. We then just sum up the - * byte counts across the levels to find a byte count for all levels. */ - -static unsigned -panfrost_raw_header_size(unsigned width, unsigned height, unsigned masked_count) -{ - unsigned size = PROLOGUE_SIZE; - - /* Normally we start at 16x16 tiles (MIN_TILE_SHIFT), but we add more - * if anything is masked off */ - - unsigned start_level = MIN_TILE_SHIFT + masked_count; - - /* Iterate hierarchy levels / tile sizes */ - - for (unsigned i = start_level; i < MAX_TILE_SHIFT; ++i) { - /* Shift from a level to a tile size */ - unsigned tile_size = (1 << i); - - unsigned tile_count = pan_tile_count(width, height, tile_size); - unsigned header_bytes = HEADER_BYTES_PER_TILE * tile_count; - - size += header_bytes; - } - - /* This size will be used as an offset, so ensure it's aligned */ - return ALIGN_POT(size, 512); -} - -/* Given a hierarchy mask and a framebuffer size, compute the header size */ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask) -{ - /* If no hierarchy levels are enabled, that means there is no geometry - * for the tiler to process, so use a minimum size. Used for clears */ - - if (mask == 0x00) - return MINIMUM_HEADER_SIZE; - - /* Some levels are enabled. Ensure that only smaller levels are - * disabled and there are no gaps. Theoretically the hardware is more - * flexible, but there's no known reason to use other configurations - * and this keeps the code simple. Since we know the 0x80 bit is set, - * ctz(mask) will return the number of masked off levels. */ - - unsigned masked_count = __builtin_ctz(mask); - - assert(mask & 0x80); - assert(((mask >> masked_count) & ((mask >> masked_count) + 1)) == 0); - - /* Everything looks good. Use the number of trailing zeroes we found to - * figure out how many smaller levels are disabled to compute the - * actual header size */ - - return panfrost_raw_header_size(width, height, masked_count); -} - -/* The body seems to be about 512 bytes per tile. Noting that the header is - * about 8 bytes per tile, we can be a little sloppy and estimate the body size - * to be equal to the header size * (512/8). Given the header size is a - * considerable overestimate, this is fine. Eventually, we should maybe figure - * out how to actually implement this. */ - -unsigned -panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask) -{ - /* No levels means no body */ - if (!mask) - return 0x00; - - unsigned header_size = panfrost_tiler_header_size(width, height, mask); - return ALIGN_POT(header_size * 512 / 8, 512); -} - - -/* In the future, a heuristic to choose a tiler hierarchy mask would go here. - * At the moment, we just default to 0xFF, which enables all possible hierarchy - * levels. Overall this yields good performance but presumably incurs a cost in - * memory bandwidth / power consumption / etc, at least on smaller scenes that - * don't really need all the smaller levels enabled */ - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count) -{ - /* If there is no geometry, we don't bother enabling anything */ - - if (!vertex_count) - return 0x00; - - /* Otherwise, default everything on. TODO: Proper tests */ - - return 0xFF; -} diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_tiler.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_tiler.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_tiler.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_tiler.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - * - */ - -#ifndef __PAN_TILER_H__ -#define __PAN_TILER_H__ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, uint8_t mask); - -unsigned -panfrost_tiler_body_size(unsigned width, unsigned height, uint8_t mask); - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count); - -#endif - - diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_util.h mesa-20.0.8/src/gallium/drivers/panfrost/pan_util.h --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,8 @@ #define PAN_DBG_TRACE 0x0002 #define PAN_DBG_DEQP 0x0004 #define PAN_DBG_AFBC 0x0008 +#define PAN_DBG_SYNC 0x0010 +#define PAN_DBG_PRECOMPILE 0x0020 extern int pan_debug; diff -Nru mesa-19.2.8/src/gallium/drivers/panfrost/pan_varyings.c mesa-20.0.8/src/gallium/drivers/panfrost/pan_varyings.c --- mesa-19.2.8/src/gallium/drivers/panfrost/pan_varyings.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/panfrost/pan_varyings.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,6 +23,7 @@ * */ +#include "pan_bo.h" #include "pan_context.h" #include "util/u_prim.h" @@ -38,8 +39,9 @@ slot->size = stride * count; slot->shift = slot->extra_flags = 0; + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_transfer transfer = - panfrost_allocate_transient(ctx, slot->size); + panfrost_allocate_transient(batch, slot->size); slot->elements = transfer.gpu | MALI_ATTR_LINEAR; @@ -65,27 +67,22 @@ slot->size = MIN2(max_size, expected_size); /* Grab the BO and bind it to the batch */ - struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_bo *bo = pan_resource(target->buffer)->bo; - panfrost_job_add_bo(batch, bo); + + /* Varyings are WRITE from the perspective of the VERTEX but READ from + * the perspective of the TILER and FRAGMENT. + */ + panfrost_batch_add_bo(batch, bo, + PAN_BO_ACCESS_SHARED | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT); mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride); slot->elements = addr; } -static void -panfrost_emit_point_coord(union mali_attr *slot) -{ - slot->elements = MALI_VARYING_POINT_COORD | MALI_ATTR_LINEAR; - slot->stride = slot->size = slot->shift = slot->extra_flags = 0; -} - -static void -panfrost_emit_front_face(union mali_attr *slot) -{ - slot->elements = MALI_VARYING_FRONT_FACING | MALI_ATTR_INTERNAL; -} - /* Given a shader and buffer indices, link varying metadata together */ static bool @@ -153,11 +150,11 @@ * accordingly. Compute the src_offset for a given captured varying */ static struct pipe_stream_output -pan_get_so(struct pipe_stream_output_info info, gl_varying_slot loc) +pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc) { - for (unsigned i = 0; i < info.num_outputs; ++i) { - if (info.output[i].register_index == loc) - return info.output[i]; + for (unsigned i = 0; i < info->num_outputs; ++i) { + if (info->output[i].register_index == loc) + return info->output[i]; } unreachable("Varying not captured"); @@ -192,7 +189,8 @@ size_t vs_size = sizeof(struct mali_attr_meta) * vs->tripipe->varying_count; size_t fs_size = sizeof(struct mali_attr_meta) * fs->tripipe->varying_count; - struct panfrost_transfer trans = panfrost_allocate_transient(ctx, + struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); + struct panfrost_transfer trans = panfrost_allocate_transient(batch, vs_size + fs_size); struct pipe_stream_output_info so = vs->stream_output; @@ -209,7 +207,7 @@ bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); if (captured) { - struct pipe_stream_output o = pan_get_so(so, loc); + struct pipe_stream_output o = pan_get_so(&so, loc); unsigned dst_offset = o.dst_offset * 4; /* dwords */ vs->varyings[i].src_offset = dst_offset; @@ -243,13 +241,14 @@ fs->varyings[i].src_offset = 16 * (num_gen_varyings++); if (has_point_coord(fs->point_sprite_mask, loc)) - reads_point_coord |= true; + reads_point_coord = true; } memcpy(trans.cpu, vs->varyings, vs_size); memcpy(trans.cpu + vs_size, fs->varyings, fs_size); union mali_attr varyings[PIPE_MAX_ATTRIBS]; + memset(varyings, 0, sizeof(varyings)); /* Figure out how many streamout buffers could be bound */ unsigned so_count = ctx->streamout.num_targets; @@ -259,7 +258,7 @@ bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); if (!captured) continue; - struct pipe_stream_output o = pan_get_so(so, loc); + struct pipe_stream_output o = pan_get_so(&so, loc); so_count = MAX2(so_count, o.output_buffer + 1); } @@ -269,6 +268,7 @@ signed gl_PointSize = vs->writes_point_size ? (idx++) : -1; signed gl_PointCoord = reads_point_coord ? (idx++) : -1; signed gl_FrontFacing = fs->reads_face ? (idx++) : -1; + signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1; /* Emit the stream out buffers */ @@ -305,20 +305,25 @@ 2, vertex_count); if (reads_point_coord) - panfrost_emit_point_coord(&varyings[gl_PointCoord]); + varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD; if (fs->reads_face) - panfrost_emit_front_face(&varyings[gl_FrontFacing]); + varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING; + + if (fs->reads_frag_coord) + varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD; /* Let's go ahead and link varying meta to the buffer in question, now - * that that information is available */ + * that that information is available. VARYING_SLOT_POS is mapped to + * gl_FragCoord for fragment shaders but gl_Positionf or vertex shaders + * */ panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position, gl_PointSize, gl_PointCoord, gl_FrontFacing); panfrost_emit_varying_meta(trans.cpu + vs_size, fs, - general, gl_Position, gl_PointSize, + general, gl_FragCoord, gl_PointSize, gl_PointCoord, gl_FrontFacing); /* Replace streamout */ @@ -332,7 +337,7 @@ bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); if (!captured) continue; - struct pipe_stream_output o = pan_get_so(so, loc); + struct pipe_stream_output o = pan_get_so(&so, loc); ovs[i].index = o.output_buffer; /* Set the type appropriately. TODO: Integer varyings XXX */ @@ -376,6 +381,9 @@ /* Fix up unaligned addresses */ for (unsigned i = 0; i < so_count; ++i) { + if (varyings[i].elements < MALI_RECORD_SPECIAL) + continue; + unsigned align = (varyings[i].elements & 63); /* While we're at it, the SO buffers are linear */ @@ -401,7 +409,7 @@ } } - mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr)); + mali_ptr varyings_p = panfrost_upload_transient(batch, &varyings, idx * sizeof(union mali_attr)); ctx->payloads[PIPE_SHADER_VERTEX].postfix.varyings = varyings_p; ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.varyings = varyings_p; diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_blit.c mesa-20.0.8/src/gallium/drivers/r300/r300_blit.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "r300_texture.h" #include "r300_reg.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_half.h" #include "util/u_pack_color.h" #include "util/u_surface.h" diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_emit.c mesa-20.0.8/src/gallium/drivers/r300/r300_emit.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ /* r300_emit: Functions for emitting state. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "r300_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_fs.c mesa-20.0.8/src/gallium/drivers/r300/r300_fs.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_fs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_fs.c 2020-06-12 01:21:17.000000000 +0000 @@ -22,7 +22,7 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_hyperz.c mesa-20.0.8/src/gallium/drivers/r300/r300_hyperz.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_hyperz.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_hyperz.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "r300_reg.h" #include "r300_fs.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /* HiZ rules - taken from various docs diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_render.c mesa-20.0.8/src/gallium/drivers/r300/r300_render.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_render.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_render.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_screen.c mesa-20.0.8/src/gallium/drivers/r300/r300_screen.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,8 +21,8 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "util/u_memory.h" #include "util/os_time.h" @@ -173,130 +173,7 @@ case PIPE_CAP_VERTEX_SHADER_SATURATE: return is_r500 ? 1 : 0; - /* Unsupported features. */ - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_MIN_TEXEL_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_TEXTURE_MULTISAMPLE: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_FAKE_SW_MSAA: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_NATIVE_FENCE_FD: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_TGSI_MUL_ZERO_WINS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_POST_DEPTH_COVERAGE: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - case PIPE_CAP_TILE_RASTER_ORDER: - case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES: - case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - case PIPE_CAP_CONTEXT_PRIORITY_MASK: - case PIPE_CAP_FENCE_SIGNAL: - case PIPE_CAP_CONSTBUF0_FLAGS: - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE: - case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: - case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: - case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 0; case PIPE_CAP_MAX_GS_INVOCATIONS: @@ -315,8 +192,6 @@ case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: return r300screen->caps.has_tcl; - case PIPE_CAP_TGSI_TEXCOORD: - return 0; /* Texturing. */ case PIPE_CAP_MAX_TEXTURE_2D_SIZE: @@ -434,8 +309,6 @@ return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 0; } break; case PIPE_SHADER_VERTEX: @@ -502,8 +375,6 @@ return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 0; } break; default: diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_state.c mesa-20.0.8/src/gallium/drivers/r300/r300_state.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -1157,7 +1157,7 @@ rs->rs_draw.offset_tri = 0; rs->rs_draw.offset_clamp = 0; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN vap_control_status = R300_VC_NO_SWAP; #else vap_control_status = R300_VC_32BIT_SWAP; diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_state_derived.c mesa-20.0.8/src/gallium/drivers/r300/r300_state_derived.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_state_derived.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_state_derived.c 2020-06-12 01:21:17.000000000 +0000 @@ -376,7 +376,7 @@ if (fs_inputs->color[i] != ATTR_UNUSED) { fp_offset++; - DBG(r300, DBG_RS, "r300: FS input color %i unassigned%s.\n", + DBG(r300, DBG_RS, "r300: FS input color %i unassigned.\n", i); } } @@ -474,7 +474,7 @@ if (fs_inputs->generic[i] != ATTR_UNUSED) { fp_offset++; - DBG(r300, DBG_RS, "r300: FS input generic %i unassigned%s.\n", i); + DBG(r300, DBG_RS, "r300: FS input generic %i unassigned.\n", i); } } } diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_state_inlines.h mesa-20.0.8/src/gallium/drivers/r300/r300_state_inlines.h --- mesa-19.2.8/src/gallium/drivers/r300/r300_state_inlines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_state_inlines.h 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "draw/draw_vertex.h" #include "pipe/p_format.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "r300_reg.h" #include diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_texture.c mesa-20.0.8/src/gallium/drivers/r300/r300_texture.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,12 +30,13 @@ #include "r300_transfer.h" #include "r300_screen.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_math.h" #include "util/u_memory.h" #include "pipe/p_screen.h" +#include "state_tracker/winsys_handle.h" /* These formats are supported by swapping their bytes. * The swizzles must be set exactly like their non-swapped counterparts, @@ -1048,8 +1049,10 @@ return false; } - return rws->buffer_get_handle(rws, tex->buf, tex->tex.stride_in_bytes[0], - 0, 0, whandle); + whandle->stride = tex->tex.stride_in_bytes[0]; + whandle->offset = 0; + + return rws->buffer_get_handle(rws, tex->buf, whandle); } static const struct u_resource_vtbl r300_texture_vtbl = @@ -1179,7 +1182,6 @@ struct r300_screen *rscreen = r300_screen(screen); struct radeon_winsys *rws = rscreen->rws; struct pb_buffer *buffer; - unsigned stride; struct radeon_bo_metadata tiling = {}; /* Support only 2D textures without mipmaps */ @@ -1190,7 +1192,7 @@ return NULL; } - buffer = rws->buffer_from_handle(rws, whandle, 0, &stride, NULL); + buffer = rws->buffer_from_handle(rws, whandle, 0); if (!buffer) return NULL; @@ -1212,7 +1214,7 @@ return (struct pipe_resource*) r300_texture_create_object(rscreen, base, tiling.u.legacy.microtile, tiling.u.legacy.macrotile, - stride, buffer); + whandle->stride, buffer); } /* Not required to implement u_resource_vtbl, consider moving to another file: diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_texture_desc.c mesa-20.0.8/src/gallium/drivers/r300/r300_texture_desc.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_texture_desc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_texture_desc.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ #include "r300_texture_desc.h" #include "r300_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include /* Returns the number of pixels that the texture should be aligned to diff -Nru mesa-19.2.8/src/gallium/drivers/r300/r300_transfer.c mesa-20.0.8/src/gallium/drivers/r300/r300_transfer.c --- mesa-19.2.8/src/gallium/drivers/r300/r300_transfer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r300/r300_transfer.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "r300_screen_buffer.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_box.h" struct r300_transfer { diff -Nru mesa-19.2.8/src/gallium/drivers/r600/Android.mk mesa-20.0.8/src/gallium/drivers/r600/Android.mk --- mesa-19.2.8/src/gallium/drivers/r600/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,9 @@ LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES) -LOCAL_C_INCLUDES += $(MESA_TOP)/src/amd/common +LOCAL_C_INCLUDES += \ + $(MESA_TOP)/src/amd/common \ + $(MESA_TOP)/src/amd/llvm LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_r600 diff -Nru mesa-19.2.8/src/gallium/drivers/r600/evergreen_compute.c mesa-20.0.8/src/gallium/drivers/r600/evergreen_compute.c --- mesa-19.2.8/src/gallium/drivers/r600/evergreen_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/evergreen_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -429,8 +429,7 @@ struct r600_context *rctx = (struct r600_context *)ctx; struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); #ifdef HAVE_OPENCL - const struct pipe_llvm_program_header *header; - const char *code; + const struct pipe_binary_program_header *header; void *p; boolean use_kill; #endif @@ -449,9 +448,8 @@ #ifdef HAVE_OPENCL COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n"); header = cso->prog; - code = cso->prog + sizeof(struct pipe_llvm_program_header); radeon_shader_binary_init(&shader->binary); - r600_elf_read(code, header->num_bytes, &shader->binary); + r600_elf_read(header->blob, header->num_bytes, &shader->binary); r600_create_shader(&shader->bc, &shader->binary, &use_kill); /* Upload code + ROdata */ diff -Nru mesa-19.2.8/src/gallium/drivers/r600/evergreen_hw_context.c mesa-20.0.8/src/gallium/drivers/r600/evergreen_hw_context.c --- mesa-19.2.8/src/gallium/drivers/r600/evergreen_hw_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/evergreen_hw_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -43,7 +43,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&rdst->valid_buffer_range, dst_offset, + util_range_add(&rdst->b.b, &rdst->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += rdst->gpu_address; @@ -93,7 +93,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, offset, + util_range_add(dst, &r600_resource(dst)->valid_buffer_range, offset, offset + size); offset += r600_resource(dst)->gpu_address; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/evergreen_state.c mesa-20.0.8/src/gallium/drivers/r600/evergreen_state.c --- mesa-19.2.8/src/gallium/drivers/r600/evergreen_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/evergreen_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -698,7 +698,7 @@ view->tex_resource = &tmp->resource; if (tmp->resource.gpu_address) - LIST_ADDTAIL(&view->list, &rctx->texture_buffers); + list_addtail(&view->list, &rctx->texture_buffers); return &view->base; } @@ -1308,7 +1308,7 @@ surf->cb_color_view = 0; /* Set the buffer range the GPU will have access to: */ - util_range_add(&r600_resource(pipe_buffer)->valid_buffer_range, + util_range_add(pipe_buffer, &r600_resource(pipe_buffer)->valid_buffer_range, 0, pipe_buffer->width0); } @@ -3364,6 +3364,12 @@ spi_baryc_cntl |= spi_baryc_enable_bit[k]; have_perspective |= k < 3; have_linear |= !(k < 3); + if (rshader->input[i].uses_interpolate_at_centroid) { + k = eg_get_interpolator_index( + rshader->input[i].interpolate, + TGSI_INTERPOLATE_LOC_CENTROID); + spi_baryc_cntl |= spi_baryc_enable_bit[k]; + } } } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/Makefile.sources mesa-20.0.8/src/gallium/drivers/r600/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/r600/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -16,6 +16,8 @@ r600_asm.h \ r600_blit.c \ r600d.h \ + r600_dump.c \ + r600_dump.h \ r600_formats.h \ r600_hw_context.c \ r600_isa.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/r600/meson.build mesa-20.0.8/src/gallium/drivers/r600/meson.build --- mesa-19.2.8/src/gallium/drivers/r600/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -35,6 +35,8 @@ 'r600_asm.h', 'r600_blit.c', 'r600d.h', + 'r600_dump.c', + 'r600_dump.h', 'r600_formats.h', 'r600_hw_context.c', 'r600_isa.c', diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_asm.c mesa-20.0.8/src/gallium/drivers/r600/r600_asm.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_asm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_asm.c 2020-06-12 01:21:17.000000000 +0000 @@ -54,11 +54,11 @@ if (!cf) return NULL; - LIST_INITHEAD(&cf->list); - LIST_INITHEAD(&cf->alu); - LIST_INITHEAD(&cf->vtx); - LIST_INITHEAD(&cf->tex); - LIST_INITHEAD(&cf->gds); + list_inithead(&cf->list); + list_inithead(&cf->alu); + list_inithead(&cf->vtx); + list_inithead(&cf->tex); + list_inithead(&cf->gds); return cf; } @@ -68,7 +68,7 @@ if (!alu) return NULL; - LIST_INITHEAD(&alu->list); + list_inithead(&alu->list); return alu; } @@ -78,7 +78,7 @@ if (!vtx) return NULL; - LIST_INITHEAD(&vtx->list); + list_inithead(&vtx->list); return vtx; } @@ -88,7 +88,7 @@ if (!tex) return NULL; - LIST_INITHEAD(&tex->list); + list_inithead(&tex->list); return tex; } @@ -98,7 +98,7 @@ if (gds == NULL) return NULL; - LIST_INITHEAD(&gds->list); + list_inithead(&gds->list); return gds; } @@ -154,7 +154,7 @@ bc->r6xx_nop_after_rel_dst = 0; } - LIST_INITHEAD(&bc->cf); + list_inithead(&bc->cf); bc->chip_class = chip_class; bc->family = family; bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing; @@ -167,7 +167,7 @@ if (!cf) return -ENOMEM; - LIST_ADDTAIL(&cf->list, &bc->cf); + list_addtail(&cf->list, &bc->cf); if (bc->cf_last) { cf->id = bc->cf_last->id + 2; if (bc->cf_last->eg_alu_extended) { @@ -928,9 +928,9 @@ for (i = 0; i < max_slots; ++i) { slots[i] = result[i]; if (result[i]) { - LIST_DEL(&result[i]->list); + list_del(&result[i]->list); result[i]->last = 0; - LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu); + list_addtail(&result[i]->list, &bc->cf_last->alu); } } @@ -1266,7 +1266,7 @@ if (nalu->dst.sel >= bc->ngpr) { bc->ngpr = nalu->dst.sel + 1; } - LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu); + list_addtail(&nalu->list, &bc->cf_last->alu); /* each alu use 2 dwords */ bc->cf_last->ndw += 2; bc->ndw += 2; @@ -1407,7 +1407,7 @@ return -EINVAL; } } - LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx); + list_addtail(&nvtx->list, &bc->cf_last->vtx); /* each fetch use 4 dwords */ bc->cf_last->ndw += 4; bc->ndw += 4; @@ -1477,7 +1477,7 @@ if (ntex->dst_gpr >= bc->ngpr) { bc->ngpr = ntex->dst_gpr + 1; } - LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex); + list_addtail(&ntex->list, &bc->cf_last->tex); /* each texture fetch use 4 dwords */ bc->cf_last->ndw += 4; bc->ndw += 4; @@ -1511,7 +1511,7 @@ bc->cf_last->op = CF_OP_GDS; } - LIST_ADDTAIL(&ngds->list, &bc->cf_last->gds); + list_addtail(&ngds->list, &bc->cf_last->gds); bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */ if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) bc->force_add_cf = 1; @@ -1867,30 +1867,30 @@ free(alu); } - LIST_INITHEAD(&cf->alu); + list_inithead(&cf->alu); LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) { free(tex); } - LIST_INITHEAD(&cf->tex); + list_inithead(&cf->tex); LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) { free(vtx); } - LIST_INITHEAD(&cf->vtx); + list_inithead(&cf->vtx); LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) { free(gds); } - LIST_INITHEAD(&cf->gds); + list_inithead(&cf->gds); free(cf); } - LIST_INITHEAD(&cf->list); + list_inithead(&cf->list); } static int print_swizzle(unsigned swz) diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_asm.h mesa-20.0.8/src/gallium/drivers/r600/r600_asm.h --- mesa-19.2.8/src/gallium/drivers/r600/r600_asm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_asm.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,10 @@ #include "r600_isa.h" #include "tgsi/tgsi_exec.h" +#ifdef __cplusplus +extern "C" { +#endif + struct r600_bytecode_alu_src { unsigned sel; unsigned chan; @@ -358,4 +362,9 @@ } return 0; } + +#ifdef __cplusplus +} +#endif + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_blit.c mesa-20.0.8/src/gallium/drivers/r600/r600_blit.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ #include "compute_memory_pool.h" #include "evergreen_compute.h" #include "util/u_surface.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "evergreend.h" enum r600_blitter_op /* bitmask */ diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_buffer_common.c mesa-20.0.8/src/gallium/drivers/r600/r600_buffer_common.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_buffer_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_buffer_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -498,7 +498,7 @@ ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box); } - util_range_add(&rbuffer->valid_buffer_range, box->x, + util_range_add(&rbuffer->b.b, &rbuffer->valid_buffer_range, box->x, box->x + box->width); } @@ -643,8 +643,8 @@ rbuffer->domains = RADEON_DOMAIN_GTT; rbuffer->flags = 0; rbuffer->b.is_user_ptr = true; - util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0); - util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0); + util_range_add(&rbuffer->b.b, &rbuffer->valid_buffer_range, 0, templ->width0); + util_range_add(&rbuffer->b.b, &rbuffer->b.valid_buffer_range, 0, templ->width0); /* Convert a user pointer to a buffer. */ rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_dump.c mesa-20.0.8/src/gallium/drivers/r600/r600_dump.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_dump.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_dump.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,168 @@ +/* -*- mesa-c++ -*- + * + * Copyright (c) 2018 Collabora LTD + * + * Author: Gert Wollny + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "r600_dump.h" +#include "r600_shader.h" + +void print_shader_info(FILE *f , int id, struct r600_shader *shader) +{ + +#define PRINT_INT_MEMBER(NAME) \ + if (shader-> NAME) fprintf(f, " shader->" #NAME "=%d;\n", shader-> NAME) +#define PRINT_UINT_MEMBER(NAME) \ + if (shader-> NAME) fprintf(f, " shader->" #NAME "=%u;\n", (unsigned)shader-> NAME) + +#define PRINT_INT_ARRAY_ELM(NAME, ELM) \ + if (shader->NAME[i].ELM) fprintf(f, " shader->" #NAME "[%d]." #ELM "=%d;\n", i, (int)shader->NAME[i].ELM) +#define PRINT_UINT_ARRAY_ELM(NAME, ELM) \ + if (shader->NAME[i].ELM) fprintf(f, " shader->" #NAME "[%d]." #ELM" =%u;\n", i, (unsigned)shader->NAME[i].ELM) + + fprintf(f, "#include \"gallium/drivers/r600/r600_shader.h\"\n"); + fprintf(f, "void shader_%d_fill_data(struct r600_shader *shader)\n{\n", id); + fprintf(f, " memset(shader, 0, sizeof(struct r600_shader));\n"); + + PRINT_UINT_MEMBER(processor_type); + PRINT_UINT_MEMBER(ninput); + PRINT_UINT_MEMBER(noutput); + PRINT_UINT_MEMBER(nhwatomic); + PRINT_UINT_MEMBER(nlds); + PRINT_UINT_MEMBER(nsys_inputs); + + for (unsigned i = 0; i < shader->ninput; ++i) { + PRINT_UINT_ARRAY_ELM(input, name); + PRINT_UINT_ARRAY_ELM(input, gpr); + PRINT_UINT_ARRAY_ELM(input, done); + PRINT_INT_ARRAY_ELM(input, sid); + PRINT_INT_ARRAY_ELM(input, spi_sid); + PRINT_UINT_ARRAY_ELM(input, interpolate); + PRINT_UINT_ARRAY_ELM(input, ij_index); + PRINT_UINT_ARRAY_ELM(input, interpolate_location); // TGSI_INTERPOLATE_LOC_CENTER, CENTROID, SAMPLE + PRINT_UINT_ARRAY_ELM(input, lds_pos); /* for evergreen */ + PRINT_UINT_ARRAY_ELM(input, back_color_input); + PRINT_UINT_ARRAY_ELM(input, write_mask); + PRINT_INT_ARRAY_ELM(input, ring_offset); + } + + for (unsigned i = 0; i < shader->noutput; ++i) { + PRINT_UINT_ARRAY_ELM(output, name); + PRINT_UINT_ARRAY_ELM(output, gpr); + PRINT_UINT_ARRAY_ELM(output, done); + PRINT_INT_ARRAY_ELM(output, sid); + PRINT_INT_ARRAY_ELM(output, spi_sid); + PRINT_UINT_ARRAY_ELM(output, interpolate); + PRINT_UINT_ARRAY_ELM(output, ij_index); + PRINT_UINT_ARRAY_ELM(output, interpolate_location); // TGSI_INTERPOLATE_LOC_CENTER, CENTROID, SAMPLE + PRINT_UINT_ARRAY_ELM(output, lds_pos); /* for evergreen */ + PRINT_UINT_ARRAY_ELM(output, back_color_input); + PRINT_UINT_ARRAY_ELM(output, write_mask); + PRINT_INT_ARRAY_ELM(output, ring_offset); + } + + for (unsigned i = 0; i < shader->nhwatomic; ++i) { + PRINT_UINT_ARRAY_ELM(atomics, start); + PRINT_UINT_ARRAY_ELM(atomics, end); + PRINT_UINT_ARRAY_ELM(atomics, buffer_id); + PRINT_UINT_ARRAY_ELM(atomics, hw_idx); + PRINT_UINT_ARRAY_ELM(atomics, array_id); + } + + PRINT_UINT_MEMBER(nhwatomic_ranges); + PRINT_UINT_MEMBER(uses_kill); + PRINT_UINT_MEMBER(fs_write_all); + PRINT_UINT_MEMBER(two_side); + PRINT_UINT_MEMBER(needs_scratch_space); + /* Number of color outputs in the TGSI shader, + * sometimes it could be higher than nr_cbufs (bug?). + * Also with writes_all property on eg+ it will be set to max CB number */ + PRINT_UINT_MEMBER(nr_ps_max_color_exports); + /* Real number of ps color exports compiled in the bytecode */ + PRINT_UINT_MEMBER(nr_ps_color_exports); + PRINT_UINT_MEMBER(ps_color_export_mask); + PRINT_UINT_MEMBER(ps_export_highest); + /* bit n is set if the shader writes gl_ClipDistance[n] */ + PRINT_UINT_MEMBER(cc_dist_mask); + PRINT_UINT_MEMBER(clip_dist_write); + PRINT_UINT_MEMBER(cull_dist_write); + PRINT_UINT_MEMBER(vs_position_window_space); + /* flag is set if the shader writes VS_OUT_MISC_VEC (e.g. for PSIZE) */ + PRINT_UINT_MEMBER(vs_out_misc_write); + PRINT_UINT_MEMBER(vs_out_point_size); + PRINT_UINT_MEMBER(vs_out_layer); + PRINT_UINT_MEMBER(vs_out_viewport); + PRINT_UINT_MEMBER(vs_out_edgeflag); + PRINT_UINT_MEMBER(has_txq_cube_array_z_comp); + PRINT_UINT_MEMBER(uses_tex_buffers); + PRINT_UINT_MEMBER(gs_prim_id_input); + PRINT_UINT_MEMBER(gs_tri_strip_adj_fix); + PRINT_UINT_MEMBER(ps_conservative_z); + + /* Size in bytes of a data item in the ring(s) (single vertex data). + Stages with only one ring items 123 will be set to 0. */ + + PRINT_UINT_MEMBER(ring_item_sizes[0]); + PRINT_UINT_MEMBER(ring_item_sizes[1]); + PRINT_UINT_MEMBER(ring_item_sizes[2]); + PRINT_UINT_MEMBER(ring_item_sizes[3]); + + PRINT_UINT_MEMBER(indirect_files); + PRINT_UINT_MEMBER(max_arrays); + PRINT_UINT_MEMBER(num_arrays); + PRINT_UINT_MEMBER(vs_as_es); + PRINT_UINT_MEMBER(vs_as_ls); + PRINT_UINT_MEMBER(vs_as_gs_a); + PRINT_UINT_MEMBER(tes_as_es); + PRINT_UINT_MEMBER(tcs_prim_mode); + PRINT_UINT_MEMBER(ps_prim_id_input); + + if (shader->num_arrays > 0) { + fprintf(stderr, " shader->arrays = new r600_shader_array[%d];\n", shader->num_arrays); + for (unsigned i = 0; i < shader->num_arrays; ++i) { + PRINT_UINT_ARRAY_ELM(arrays, gpr_start); + PRINT_UINT_ARRAY_ELM(arrays, gpr_count); + PRINT_UINT_ARRAY_ELM(arrays, comp_mask); + } + } + + PRINT_UINT_MEMBER(uses_doubles); + PRINT_UINT_MEMBER(uses_atomics); + PRINT_UINT_MEMBER(uses_images); + PRINT_UINT_MEMBER(uses_helper_invocation); + PRINT_UINT_MEMBER(atomic_base); + PRINT_UINT_MEMBER(rat_base); + PRINT_UINT_MEMBER(image_size_const_offset); + + fprintf(f, "}\n"); +} + +void print_pipe_info(FILE *f, struct tgsi_shader_info *shader) +{ + PRINT_UINT_MEMBER(shader_buffers_load); + PRINT_UINT_MEMBER(shader_buffers_store); + PRINT_UINT_MEMBER(shader_buffers_atomic); + PRINT_UINT_MEMBER(writes_memory); + PRINT_UINT_MEMBER(file_mask[TGSI_FILE_HW_ATOMIC]); + PRINT_UINT_MEMBER(file_count[TGSI_FILE_HW_ATOMIC]); +} diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_dump.h mesa-20.0.8/src/gallium/drivers/r600/r600_dump.h --- mesa-19.2.8/src/gallium/drivers/r600/r600_dump.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_dump.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,39 @@ +/* -*- mesa-c++ -*- + * + * Copyright (c) 2018 Collabora LTD + * + * Author: Gert Wollny + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef R600_DUMP_H +#define R600_DUMP_H + +#include + +struct r600_shader; +struct tgsi_shader_info; + +void print_shader_info(FILE *f , int id, struct r600_shader *shader); + +void print_pipe_info(FILE *f, struct tgsi_shader_info *shader); + +#endif // R600_DUMP_H diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_formats.h mesa-20.0.8/src/gallium/drivers/r600/r600_formats.h --- mesa-19.2.8/src/gallium/drivers/r600/r600_formats.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_formats.h 2020-06-12 01:21:17.000000000 +0000 @@ -1,7 +1,7 @@ #ifndef R600_FORMATS_H #define R600_FORMATS_H -#include "util/u_format.h" +#include "util/format/u_format.h" #include "r600_pipe.h" /* list of formats from R700 ISA document - apply across GPUs in different registers */ @@ -115,6 +115,10 @@ desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)) return false; + /* No 8 bit 3 channel formats */ + if (desc->channel[i].size == 8 && desc->nr_channels == 3) + return false; + return true; } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_hw_context.c mesa-20.0.8/src/gallium/drivers/r600/r600_hw_context.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_hw_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_hw_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -510,7 +510,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset, + util_range_add(dst, &r600_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); dst_offset += r600_resource(dst)->gpu_address; @@ -592,7 +592,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&rdst->valid_buffer_range, dst_offset, + util_range_add(&rdst->b.b, &rdst->valid_buffer_range, dst_offset, dst_offset + size); size >>= 2; /* convert to dwords */ diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_pipe.c mesa-20.0.8/src/gallium/drivers/r600/r600_pipe.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_pipe.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_pipe.c 2020-06-12 01:21:17.000000000 +0000 @@ -155,7 +155,7 @@ goto fail; rctx->screen = rscreen; - LIST_INITHEAD(&rctx->texture_buffers); + list_inithead(&rctx->texture_buffers); r600_init_blit_functions(rctx); @@ -400,60 +400,7 @@ case PIPE_CAP_MAX_COMBINED_SHADER_BUFFERS: return 8; - /* Unsupported features. */ - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_NATIVE_FENCE_FD: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_POST_DEPTH_COVERAGE: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - case PIPE_CAP_TILE_RASTER_ORDER: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - case PIPE_CAP_CONTEXT_PRIORITY_MASK: - case PIPE_CAP_FENCE_SIGNAL: - case PIPE_CAP_CONSTBUF0_FLAGS: - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE: - case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: - case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: return 0; case PIPE_CAP_DOUBLES: @@ -698,8 +645,6 @@ return EG_MAX_ATOMIC_BUFFERS; } return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: /* due to a bug in the shader compiler, some loops hang * if they are not unrolled, see: diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_pipe_common.c mesa-20.0.8/src/gallium/drivers/r600/r600_pipe_common.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_pipe_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_pipe_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "util/list.h" #include "util/u_draw_quad.h" #include "util/u_memory.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format_s3tc.h" #include "util/u_upload_mgr.h" #include "util/os_time.h" #include "vl/vl_decoder.h" @@ -39,11 +39,7 @@ #include #include -#ifndef HAVE_LLVM -#define HAVE_LLVM 0 -#endif - -#if HAVE_LLVM +#ifdef LLVM_AVAILABLE #include #endif @@ -302,7 +298,7 @@ void r600_preflush_suspend_features(struct r600_common_context *ctx) { /* suspend queries */ - if (!LIST_IS_EMPTY(&ctx->active_queries)) + if (!list_is_empty(&ctx->active_queries)) r600_suspend_queries(ctx); ctx->streamout.suspended = false; @@ -320,7 +316,7 @@ } /* resume queries */ - if (!LIST_IS_EMPTY(&ctx->active_queries)) + if (!list_is_empty(&ctx->active_queries)) r600_resume_queries(ctx); } @@ -639,7 +635,7 @@ if (!rctx->ctx) return false; - if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { + if (rscreen->info.num_rings[RING_DMA] && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, r600_flush_dma_ring, rctx, false); @@ -821,10 +817,7 @@ case PIPE_CAPF_MAX_LINE_WIDTH_AA: case PIPE_CAPF_MAX_POINT_WIDTH: case PIPE_CAPF_MAX_POINT_WIDTH_AA: - if (rscreen->family >= CHIP_CEDAR) - return 16384.0f; - else - return 8192.0f; + return 8191.0f; case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: return 16.0f; case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: @@ -1202,7 +1195,7 @@ snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), "%s (%sDRM %i.%i.%i%s" -#if HAVE_LLVM > 0 +#ifdef LLVM_AVAILABLE ", LLVM " MESA_LLVM_VERSION_STRING #endif ")", @@ -1272,8 +1265,8 @@ printf("r600_has_virtual_memory = %i\n", rscreen->info.r600_has_virtual_memory); printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2); printf("has_hw_decode = %u\n", rscreen->info.has_hw_decode); - printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings); - printf("num_compute_rings = %u\n", rscreen->info.num_compute_rings); + printf("num_rings[RING_DMA] = %i\n", rscreen->info.num_rings[RING_DMA]); + printf("num_rings[RING_COMPUTE] = %u\n", rscreen->info.num_rings[RING_COMPUTE]); printf("uvd_fw_version = %u\n", rscreen->info.uvd_fw_version); printf("vce_fw_version = %u\n", rscreen->info.vce_fw_version); printf("me_fw_version = %i\n", rscreen->info.me_fw_version); diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_pipe_common.h mesa-20.0.8/src/gallium/drivers/r600/r600_pipe_common.h --- mesa-19.2.8/src/gallium/drivers/r600/r600_pipe_common.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_pipe_common.h 2020-06-12 01:21:17.000000000 +0000 @@ -118,7 +118,7 @@ R600_COHERENCY_CB_META, }; -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN #define R600_BIG_ENDIAN 1 #else #define R600_BIG_ENDIAN 0 diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_query.c mesa-20.0.8/src/gallium/drivers/r600/r600_query.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -1047,7 +1047,7 @@ if (!query->buffer.buf) return false; - LIST_ADDTAIL(&query->list, &rctx->active_queries); + list_addtail(&query->list, &rctx->active_queries); return true; } @@ -1070,7 +1070,7 @@ r600_query_hw_emit_stop(rctx, query); if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) - LIST_DELINIT(&query->list); + list_delinit(&query->list); if (!query->buffer.buf) return false; @@ -2125,7 +2125,7 @@ if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0) rctx->b.render_condition = r600_render_condition; - LIST_INITHEAD(&rctx->active_queries); + list_inithead(&rctx->active_queries); } void r600_init_screen_query_functions(struct r600_common_screen *rscreen) diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_shader.c mesa-20.0.8/src/gallium/drivers/r600/r600_shader.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_shader.c 2020-06-12 01:21:17.000000000 +0000 @@ -6973,6 +6973,7 @@ } else { location = TGSI_INTERPOLATE_LOC_CENTROID; + ctx->shader->input[input].uses_interpolate_at_centroid = 1; } k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location); diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_shader.h mesa-20.0.8/src/gallium/drivers/r600/r600_shader.h --- mesa-19.2.8/src/gallium/drivers/r600/r600_shader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_shader.h 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,7 @@ unsigned name; unsigned gpr; unsigned done; - int sid; + unsigned sid; int spi_sid; unsigned interpolate; unsigned ij_index; @@ -54,6 +54,7 @@ unsigned back_color_input; unsigned write_mask; int ring_offset; + unsigned uses_interpolate_at_centroid; }; struct r600_shader_atomic { diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_state_common.c mesa-20.0.8/src/gallium/drivers/r600/r600_state_common.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_state_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_state_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "r600_shader.h" #include "r600d.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format_s3tc.h" #include "util/u_index_modify.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" @@ -419,7 +419,7 @@ if (view->tex_resource->gpu_address && view->tex_resource->b.b.target == PIPE_BUFFER) - LIST_DELINIT(&view->list); + list_delinit(&view->list); pipe_resource_reference(&state->texture, NULL); FREE(view); @@ -546,7 +546,8 @@ static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state) { struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state; - r600_resource_reference(&shader->buffer, NULL); + if (shader) + r600_resource_reference(&shader->buffer, NULL); FREE(shader); } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_streamout.c mesa-20.0.8/src/gallium/drivers/r600/r600_streamout.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_streamout.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_streamout.c 2020-06-12 01:21:17.000000000 +0000 @@ -65,7 +65,7 @@ t->b.buffer_offset = buffer_offset; t->b.buffer_size = buffer_size; - util_range_add(&rbuffer->valid_buffer_range, buffer_offset, + util_range_add(buffer, &rbuffer->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); return &t->b; } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_texture.c mesa-20.0.8/src/gallium/drivers/r600/r600_texture.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,12 +27,13 @@ #include "r600_pipe_common.h" #include "r600_cs.h" #include "r600_query.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_pack_color.h" #include "util/u_surface.h" #include "util/os_time.h" +#include "state_tracker/winsys_handle.h" #include #include @@ -569,8 +570,10 @@ res->external_usage = usage; } - return rscreen->ws->buffer_get_handle(rscreen->ws, res->buf, stride, - offset, slice_size, whandle); + whandle->stride = stride; + whandle->offset = offset + slice_size * whandle->layer; + + return rscreen->ws->buffer_get_handle(rscreen->ws, res->buf, whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -1115,7 +1118,6 @@ { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; - unsigned stride = 0, offset = 0; enum radeon_surf_mode array_mode; struct radeon_surf surface = {}; int r; @@ -1129,8 +1131,7 @@ return NULL; buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, - rscreen->info.max_alignment, - &stride, &offset); + rscreen->info.max_alignment); if (!buf) return NULL; @@ -1138,8 +1139,9 @@ r600_surface_import_metadata(rscreen, &surface, &metadata, &array_mode, &is_scanout); - r = r600_init_surface(rscreen, &surface, templ, array_mode, stride, - offset, true, is_scanout, false); + r = r600_init_surface(rscreen, &surface, templ, array_mode, + whandle->stride, whandle->offset, + true, is_scanout, false); if (r) { return NULL; } @@ -1769,7 +1771,7 @@ int i; /* This function is broken in BE, so just disable this path for now */ -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN return; #endif @@ -1865,14 +1867,12 @@ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct r600_memory_object *memobj = CALLOC_STRUCT(r600_memory_object); struct pb_buffer *buf = NULL; - uint32_t stride, offset; if (!memobj) return NULL; buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, - rscreen->info.max_alignment, - &stride, &offset); + rscreen->info.max_alignment); if (!buf) { free(memobj); return NULL; @@ -1880,8 +1880,8 @@ memobj->b.dedicated = dedicated; memobj->buf = buf; - memobj->stride = stride; - memobj->offset = offset; + memobj->stride = whandle->stride; + memobj->offset = whandle->offset; return (struct pipe_memory_object *)memobj; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/r600_uvd.c mesa-20.0.8/src/gallium/drivers/r600/r600_uvd.c --- mesa-19.2.8/src/gallium/drivers/r600/r600_uvd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/r600_uvd.c 2020-06-12 01:21:17.000000000 +0000 @@ -62,7 +62,7 @@ struct r600_texture *resources[VL_NUM_COMPONENTS] = {}; struct radeon_surf* surfaces[VL_NUM_COMPONENTS] = {}; struct pb_buffer **pbs[VL_NUM_COMPONENTS] = {}; - const enum pipe_format *resource_formats; + enum pipe_format resource_formats[3]; struct pipe_video_buffer template; struct pipe_resource templ; unsigned i, array_size; @@ -70,9 +70,7 @@ assert(pipe); /* first create the needed resources as "normal" textures */ - resource_formats = vl_video_buffer_formats(pipe->screen, tmpl->buffer_format); - if (!resource_formats) - return NULL; + vl_get_video_buffer_formats(pipe->screen, tmpl->buffer_format, resource_formats); array_size = tmpl->interlaced ? 2 : 1; template = *tmpl; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/radeon_uvd.c mesa-20.0.8/src/gallium/drivers/r600/radeon_uvd.c --- mesa-19.2.8/src/gallium/drivers/r600/radeon_uvd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/radeon_uvd.c 2020-06-12 01:21:17.000000000 +0000 @@ -216,9 +216,6 @@ case PIPE_VIDEO_FORMAT_MPEG4: return RUVD_CODEC_MPEG4; - case PIPE_VIDEO_FORMAT_HEVC: - return RUVD_CODEC_H265; - case PIPE_VIDEO_FORMAT_JPEG: return RUVD_CODEC_MJPEG; @@ -360,20 +357,6 @@ break; } - case PIPE_VIDEO_FORMAT_HEVC: - if (dec->base.width * dec->base.height >= 4096*2000) - max_references = MAX2(max_references, 8); - else - max_references = MAX2(max_references, 17); - - width = align (width, 16); - height = align (height, 16); - if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 9) / 4, 256) * max_references; - else - dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 3) / 2, 256) * max_references; - break; - case PIPE_VIDEO_FORMAT_VC1: // the firmware seems to allways assume a minimum of ref frames max_references = MAX2(NUM_VC1_REFS, max_references); @@ -665,18 +648,6 @@ result.direct_reflist[i][j] = pic->RefPicList[i][j]; } - if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) { - if (target->buffer_format == PIPE_FORMAT_P016) { - result.p010_mode = 1; - result.msb_mode = 1; - } else { - result.luma_10to8 = 5; - result.chroma_10to8 = 5; - result.sclr_luma10to8 = 4; - result.sclr_chroma10to8 = 4; - } - } - /* TODO result.highestTid; result.isNonRef; @@ -1196,24 +1167,6 @@ dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture); break; - case PIPE_VIDEO_FORMAT_HEVC: - dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); - if (dec->ctx.res == NULL) { - unsigned ctx_size; - if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); - else - ctx_size = calc_ctx_size_h265_main(dec); - if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { - RVID_ERR("Can't allocated context buffer.\n"); - } - rvid_clear_buffer(decoder->context, &dec->ctx); - } - - if (dec->ctx.res) - dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; - break; - case PIPE_VIDEO_FORMAT_VC1: dec->msg->body.decode.codec.vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture); break; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/radeon_vce.c mesa-20.0.8/src/gallium/drivers/r600/radeon_vce.c --- mesa-19.2.8/src/gallium/drivers/r600/radeon_vce.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/radeon_vce.c 2020-06-12 01:21:17.000000000 +0000 @@ -54,6 +54,10 @@ #define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) #define FW_53 (53 << 24) +/* version specific function for getting parameters */ +static void (*get_pic_param)(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic) = NULL; + /** * flush commands to the hardware */ @@ -97,14 +101,14 @@ { unsigned i; - LIST_INITHEAD(&enc->cpb_slots); + list_inithead(&enc->cpb_slots); for (i = 0; i < enc->cpb_num; ++i) { struct rvce_cpb_slot *slot = &enc->cpb_array[i]; slot->index = i; slot->picture_type = PIPE_H264_ENC_PICTURE_TYPE_SKIP; slot->frame_num = 0; slot->pic_order_cnt = 0; - LIST_ADDTAIL(&slot->list, &enc->cpb_slots); + list_addtail(&slot->list, &enc->cpb_slots); } } @@ -131,13 +135,13 @@ } if (l1) { - LIST_DEL(&l1->list); - LIST_ADD(&l1->list, &enc->cpb_slots); + list_del(&l1->list); + list_add(&l1->list, &enc->cpb_slots); } if (l0) { - LIST_DEL(&l0->list); - LIST_ADD(&l0->list, &enc->cpb_slots); + list_del(&l0->list); + list_add(&l0->list, &enc->cpb_slots); } } @@ -341,8 +345,8 @@ slot->frame_num = enc->pic.frame_num; slot->pic_order_cnt = enc->pic.pic_order_cnt; if (!enc->pic.not_referenced) { - LIST_DEL(&slot->list); - LIST_ADD(&slot->list, &enc->cpb_slots); + list_del(&slot->list); + list_add(&slot->list, &enc->cpb_slots); } } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/radeon_vce.h mesa-20.0.8/src/gallium/drivers/r600/radeon_vce.h --- mesa-19.2.8/src/gallium/drivers/r600/radeon_vce.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/radeon_vce.h 2020-06-12 01:21:17.000000000 +0000 @@ -443,10 +443,6 @@ /* init vce fw 52 specific callbacks */ void radeon_vce_52_init(struct rvce_encoder *enc); -/* version specific function for getting parameters */ -void (*get_pic_param)(struct rvce_encoder *enc, - struct pipe_h264_enc_picture_desc *pic); - /* get parameters for vce 40.2.2 */ void radeon_vce_40_2_2_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic); diff -Nru mesa-19.2.8/src/gallium/drivers/r600/radeon_video.c mesa-20.0.8/src/gallium/drivers/r600/radeon_video.c --- mesa-19.2.8/src/gallium/drivers/r600/radeon_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/radeon_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -278,10 +278,7 @@ case PIPE_VIDEO_CAP_MAX_HEIGHT: return 1152; case PIPE_VIDEO_CAP_PREFERED_FORMAT: - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return PIPE_FORMAT_P016; - else - return PIPE_FORMAT_NV12; + return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: @@ -293,9 +290,7 @@ } else { enum pipe_video_format format = u_reduce_video_profile(profile); - if (format == PIPE_VIDEO_FORMAT_HEVC) - return false; //The firmware doesn't support interlaced HEVC. - else if (format == PIPE_VIDEO_FORMAT_JPEG) + if (format == PIPE_VIDEO_FORMAT_JPEG) return false; return true; } @@ -322,9 +317,6 @@ case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: return 41; - case PIPE_VIDEO_PROFILE_HEVC_MAIN: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: - return 186; default: return 0; } @@ -338,11 +330,6 @@ enum pipe_video_profile profile, enum pipe_video_entrypoint entrypoint) { - /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */ - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return (format == PIPE_FORMAT_NV12) || - (format == PIPE_FORMAT_P016); - /* we can only handle this one with UVD */ if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) return format == PIPE_FORMAT_NV12; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/sb/sb_bc_dump.cpp mesa-20.0.8/src/gallium/drivers/r600/sb/sb_bc_dump.cpp --- mesa-19.2.8/src/gallium/drivers/r600/sb/sb_bc_dump.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/sb/sb_bc_dump.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -157,6 +157,8 @@ s << " ES:" << n.bc.elem_size; + s << " OP:" << n.bc.rat_inst; + if (n.bc.mark) s << " MARK"; diff -Nru mesa-19.2.8/src/gallium/drivers/r600/sb/sb_bc_parser.cpp mesa-20.0.8/src/gallium/drivers/r600/sb/sb_bc_parser.cpp --- mesa-19.2.8/src/gallium/drivers/r600/sb/sb_bc_parser.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/sb/sb_bc_parser.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -171,8 +171,13 @@ sh->add_input(in.gpr, preloaded, /*in.write_mask*/ 0x0F); if (ps_interp && in.spi_sid) { int k = eg_get_interpolator_index(in.interpolate, in.interpolate_location); - if (k >= 0) + if (k >= 0) { ij_interpolators[k] |= true; + if (in.uses_interpolate_at_centroid) { + k = eg_get_interpolator_index(in.interpolate, TGSI_INTERPOLATE_LOC_CENTROID); + ij_interpolators[k] |= true; + } + } } } diff -Nru mesa-19.2.8/src/gallium/drivers/r600/sb/sb_dump.cpp mesa-20.0.8/src/gallium/drivers/r600/sb/sb_dump.cpp --- mesa-19.2.8/src/gallium/drivers/r600/sb/sb_dump.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/r600/sb/sb_dump.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -367,7 +367,12 @@ sblog << ", "; } - dump_vec(n.src); + if (n.subtype == NST_FETCH_INST) { + fetch_node *f = static_cast(&n); + if (f->bc.indexed) + dump_vec(n.src); + } else + dump_vec(n.src); } void dump::dump_set(shader &sh, val_set& v) { diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_uvd.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_uvd.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_uvd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_uvd.c 2020-06-12 01:21:17.000000000 +0000 @@ -741,7 +741,8 @@ } if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) { - if (target->buffer_format == PIPE_FORMAT_P016) { + if (target->buffer_format == PIPE_FORMAT_P010 || + target->buffer_format == PIPE_FORMAT_P016) { result.p010_mode = 1; result.msb_mode = 1; } else { diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_vce.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_vce.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_vce.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_vce.c 2020-06-12 01:21:17.000000000 +0000 @@ -48,6 +48,10 @@ #define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) #define FW_53 (53 << 24) +/* version specific function for getting parameters */ +static void (*si_get_pic_param)(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic) = NULL; + /** * flush commands to the hardware */ @@ -91,14 +95,14 @@ { unsigned i; - LIST_INITHEAD(&enc->cpb_slots); + list_inithead(&enc->cpb_slots); for (i = 0; i < enc->cpb_num; ++i) { struct rvce_cpb_slot *slot = &enc->cpb_array[i]; slot->index = i; slot->picture_type = PIPE_H264_ENC_PICTURE_TYPE_SKIP; slot->frame_num = 0; slot->pic_order_cnt = 0; - LIST_ADDTAIL(&slot->list, &enc->cpb_slots); + list_addtail(&slot->list, &enc->cpb_slots); } } @@ -125,13 +129,13 @@ } if (l1) { - LIST_DEL(&l1->list); - LIST_ADD(&l1->list, &enc->cpb_slots); + list_del(&l1->list); + list_add(&l1->list, &enc->cpb_slots); } if (l0) { - LIST_DEL(&l0->list); - LIST_ADD(&l0->list, &enc->cpb_slots); + list_del(&l0->list); + list_add(&l0->list, &enc->cpb_slots); } } @@ -340,8 +344,8 @@ slot->frame_num = enc->pic.frame_num; slot->pic_order_cnt = enc->pic.pic_order_cnt; if (!enc->pic.not_referenced) { - LIST_DEL(&slot->list); - LIST_ADD(&slot->list, &enc->cpb_slots); + list_del(&slot->list); + list_add(&slot->list, &enc->cpb_slots); } } diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_vce.h mesa-20.0.8/src/gallium/drivers/radeon/radeon_vce.h --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_vce.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_vce.h 2020-06-12 01:21:17.000000000 +0000 @@ -437,10 +437,6 @@ /* init vce fw 52 specific callbacks */ void si_vce_52_init(struct rvce_encoder *enc); -/* version specific function for getting parameters */ -void (*si_get_pic_param)(struct rvce_encoder *enc, - struct pipe_h264_enc_picture_desc *pic); - /* get parameters for vce 40.2.2 */ void si_vce_40_2_2_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic); diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_dec.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_dec.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_dec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_dec.c 2020-06-12 01:21:17.000000000 +0000 @@ -329,7 +329,8 @@ } if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) { - if (target->buffer_format == PIPE_FORMAT_P016) { + if (target->buffer_format == PIPE_FORMAT_P010 || + target->buffer_format == PIPE_FORMAT_P016) { result.p010_mode = 1; result.msb_mode = 1; } else { @@ -530,7 +531,8 @@ result.ref_frame_sign_bias[2] = pic->picture_parameter.pic_fields.alt_ref_frame_sign_bias; if (pic->base.profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) { - if (target->buffer_format == PIPE_FORMAT_P016) { + if (target->buffer_format == PIPE_FORMAT_P010 || + target->buffer_format == PIPE_FORMAT_P016) { result.p010_mode = 1; result.msb_mode = 1; } else { @@ -840,7 +842,7 @@ decode->sc_coeff_size = 0; decode->sw_ctxt_size = RDECODE_SESSION_CONTEXT_SIZE; - decode->db_pitch = (((struct si_screen*)dec->screen)->info.family >= CHIP_ARCTURUS && + decode->db_pitch = (((struct si_screen*)dec->screen)->info.family >= CHIP_RENOIR && dec->base.width > 32 && dec->stream_type == RDECODE_CODEC_VP9) ? align(dec->base.width, 64) : align(dec->base.width, 32) ; @@ -938,13 +940,13 @@ /* default probability + probability data */ ctx_size = 2304 * 5; - if (((struct si_screen*)dec->screen)->info.family >= CHIP_ARCTURUS) { + if (((struct si_screen*)dec->screen)->info.family >= CHIP_RENOIR) { /* SRE collocated context data */ ctx_size += 32 * 2 * 128 * 68; /* SMP collocated context data */ ctx_size += 9 * 64 * 2 * 128 * 68; /* SDB left tile pixel */ - ctx_size += 8 * 2 * 8192; + ctx_size += 8 * 2 * 2 * 8192; } else { ctx_size += 32 * 2 * 64 * 64; ctx_size += 9 * 64 * 2 * 64 * 64; @@ -1263,7 +1265,7 @@ case PIPE_VIDEO_FORMAT_VP9: max_references = MAX2(max_references, 9); - dpb_size = (((struct si_screen*)dec->screen)->info.family >= CHIP_ARCTURUS) ? + dpb_size = (((struct si_screen*)dec->screen)->info.family >= CHIP_RENOIR) ? (8192 * 4320 * 3 / 2) * max_references : (4096 * 3000 * 3 / 2) * max_references; @@ -1607,7 +1609,8 @@ dec->reg.data1 = RDECODE_VCN2_5_GPCOM_VCPU_DATA1; dec->reg.cmd = RDECODE_VCN2_5_GPCOM_VCPU_CMD; dec->reg.cntl = RDECODE_VCN2_5_ENGINE_CNTL; - } else if (sctx->family >= CHIP_NAVI10) { + dec->jpg.direct_reg = true; + } else if (sctx->family >= CHIP_NAVI10 || sctx->family == CHIP_RENOIR) { dec->reg.data0 = RDECODE_VCN2_GPCOM_VCPU_DATA0; dec->reg.data1 = RDECODE_VCN2_GPCOM_VCPU_DATA1; dec->reg.cmd = RDECODE_VCN2_GPCOM_VCPU_CMD; diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_enc_2_0.c 2020-06-12 01:21:17.000000000 +0000 @@ -196,7 +196,13 @@ radeon_enc_code_se(enc, 0x0); radeon_enc_code_fixed_bits(enc, enc->enc_pic.hevc_spec_misc.constrained_intra_pred_flag, 1); radeon_enc_code_fixed_bits(enc, 0x0, 1); - radeon_enc_code_fixed_bits(enc, 0x0, 1); + if (enc->enc_pic.rc_session_init.rate_control_method == + RENCODE_RATE_CONTROL_METHOD_NONE) + radeon_enc_code_fixed_bits(enc, 0x0, 1); + else { + radeon_enc_code_fixed_bits(enc, 0x1, 1); + radeon_enc_code_ue(enc, 0x0); + } radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cb_qp_offset); radeon_enc_code_se(enc, enc->enc_pic.hevc_deblock.cr_qp_offset); radeon_enc_code_fixed_bits(enc, 0x0, 1); diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_enc.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_enc.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_vcn_enc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_vcn_enc.c 2020-06-12 01:21:17.000000000 +0000 @@ -52,10 +52,17 @@ enc->enc_pic.ref_idx_l1 = pic->ref_idx_l1; enc->enc_pic.not_referenced = pic->not_referenced; enc->enc_pic.is_idr = (pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); - enc->enc_pic.crop_left = 0; - enc->enc_pic.crop_right = (align(enc->base.width, 16) - enc->base.width) / 2; - enc->enc_pic.crop_top = 0; - enc->enc_pic.crop_bottom = (align(enc->base.height, 16) - enc->base.height) / 2; + if (pic->pic_ctrl.enc_frame_cropping_flag) { + enc->enc_pic.crop_left = pic->pic_ctrl.enc_frame_crop_left_offset; + enc->enc_pic.crop_right = pic->pic_ctrl.enc_frame_crop_right_offset; + enc->enc_pic.crop_top = pic->pic_ctrl.enc_frame_crop_top_offset; + enc->enc_pic.crop_bottom = pic->pic_ctrl.enc_frame_crop_bottom_offset; + } else { + enc->enc_pic.crop_left = 0; + enc->enc_pic.crop_right = (align(enc->base.width, 16) - enc->base.width) / 2; + enc->enc_pic.crop_top = 0; + enc->enc_pic.crop_bottom = (align(enc->base.height, 16) - enc->base.height) / 2; + } enc->enc_pic.rc_layer_init.target_bit_rate = pic->rate_ctrl.target_bitrate; enc->enc_pic.rc_layer_init.peak_bit_rate = pic->rate_ctrl.peak_bitrate; enc->enc_pic.rc_layer_init.frame_rate_num = pic->rate_ctrl.frame_rate_num; @@ -411,7 +418,7 @@ goto error; } - if (sscreen->info.family <= CHIP_RAVEN) + if (sscreen->info.family <= CHIP_RAVEN2) radeon_enc_1_2_init(enc); else radeon_enc_2_0_init(enc); diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_video.c mesa-20.0.8/src/gallium/drivers/radeon/radeon_video.c --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_video.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_video.c 2020-06-12 01:21:17.000000000 +0000 @@ -125,89 +125,3 @@ si_sdma_clear_buffer(sctx, &buffer->res->b.b, 0, buffer->res->b.b.width0, 0); context->flush(context, NULL, 0); } - -/** - * join surfaces into the same buffer with identical tiling params - * sumup their sizes and replace the backend buffers with a single bo - */ -void si_vid_join_surfaces(struct si_context *sctx, - struct pb_buffer** buffers[VL_NUM_COMPONENTS], - struct radeon_surf *surfaces[VL_NUM_COMPONENTS]) -{ - struct radeon_winsys *ws = sctx->ws;; - unsigned best_tiling, best_wh, off; - unsigned size, alignment; - struct pb_buffer *pb; - unsigned i, j; - - for (i = 0, best_tiling = 0, best_wh = ~0; i < VL_NUM_COMPONENTS; ++i) { - unsigned wh; - - if (!surfaces[i]) - continue; - - if (sctx->chip_class < GFX9) { - /* choose the smallest bank w/h for now */ - wh = surfaces[i]->u.legacy.bankw * surfaces[i]->u.legacy.bankh; - if (wh < best_wh) { - best_wh = wh; - best_tiling = i; - } - } - } - - for (i = 0, off = 0; i < VL_NUM_COMPONENTS; ++i) { - if (!surfaces[i]) - continue; - - /* adjust the texture layer offsets */ - off = align(off, surfaces[i]->surf_alignment); - - if (sctx->chip_class < GFX9) { - /* copy the tiling parameters */ - surfaces[i]->u.legacy.bankw = surfaces[best_tiling]->u.legacy.bankw; - surfaces[i]->u.legacy.bankh = surfaces[best_tiling]->u.legacy.bankh; - surfaces[i]->u.legacy.mtilea = surfaces[best_tiling]->u.legacy.mtilea; - surfaces[i]->u.legacy.tile_split = surfaces[best_tiling]->u.legacy.tile_split; - - for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j) - surfaces[i]->u.legacy.level[j].offset += off; - } else { - surfaces[i]->u.gfx9.surf_offset += off; - for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.gfx9.offset); ++j) - surfaces[i]->u.gfx9.offset[j] += off; - } - - surfaces[i]->flags |= RADEON_SURF_IMPORTED; - off += surfaces[i]->surf_size; - } - - for (i = 0, size = 0, alignment = 0; i < VL_NUM_COMPONENTS; ++i) { - if (!buffers[i] || !*buffers[i]) - continue; - - size = align(size, (*buffers[i])->alignment); - size += (*buffers[i])->size; - alignment = MAX2(alignment, (*buffers[i])->alignment * 1); - } - - if (!size) - return; - - /* TODO: 2D tiling workaround */ - alignment *= 2; - - pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, - RADEON_FLAG_GTT_WC); - if (!pb) - return; - - for (i = 0; i < VL_NUM_COMPONENTS; ++i) { - if (!buffers[i] || !*buffers[i]) - continue; - - pb_reference(buffers[i], pb); - } - - pb_reference(&pb, NULL); -} diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_video.h mesa-20.0.8/src/gallium/drivers/radeon/radeon_video.h --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_video.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_video.h 2020-06-12 01:21:17.000000000 +0000 @@ -60,10 +60,4 @@ /* clear the buffer with zeros */ void si_vid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer); -/* join surfaces into the same buffer with identical tiling params - sumup their sizes and replace the backend buffers with a single bo */ -void si_vid_join_surfaces(struct si_context *sctx, - struct pb_buffer** buffers[VL_NUM_COMPONENTS], - struct radeon_surf *surfaces[VL_NUM_COMPONENTS]); - #endif // RADEON_VIDEO_H diff -Nru mesa-19.2.8/src/gallium/drivers/radeon/radeon_winsys.h mesa-20.0.8/src/gallium/drivers/radeon/radeon_winsys.h --- mesa-19.2.8/src/gallium/drivers/radeon/radeon_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeon/radeon_winsys.h 2020-06-12 01:21:17.000000000 +0000 @@ -99,19 +99,6 @@ #define RADEON_SPARSE_PAGE_SIZE (64 * 1024) -enum ring_type { - RING_GFX = 0, - RING_COMPUTE, - RING_DMA, - RING_UVD, - RING_VCE, - RING_UVD_ENC, - RING_VCN_DEC, - RING_VCN_ENC, - RING_VCN_JPEG, - RING_LAST, -}; - enum radeon_value_id { RADEON_REQUESTED_VRAM_MEMORY, RADEON_REQUESTED_GTT_MEMORY, @@ -233,6 +220,8 @@ unsigned dcc_offset_256B:24; unsigned dcc_pitch_max:14; /* (mip chain pitch - 1) for DCN */ unsigned dcc_independent_64B:1; + + bool scanout; } gfx9; } u; @@ -378,12 +367,10 @@ * \param ws The winsys this function is called from. * \param whandle A winsys handle pointer as was received from a state * tracker. - * \param stride The returned buffer stride in bytes. */ struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws, struct winsys_handle *whandle, - unsigned vm_alignment, - unsigned *stride, unsigned *offset); + unsigned vm_alignment); /** * Get a winsys buffer from a user pointer. The resulting buffer can't @@ -414,13 +401,10 @@ * \param ws The winsys instance for which the handle is to be valid * \param buf A winsys buffer object to get the handle from. * \param whandle A winsys handle pointer. - * \param stride A stride of the buffer in bytes, for texturing. * \return true on success. */ bool (*buffer_get_handle)(struct radeon_winsys *ws, struct pb_buffer *buf, - unsigned stride, unsigned offset, - unsigned slice_size, struct winsys_handle *whandle); /** @@ -465,6 +449,14 @@ */ enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf); + /** + * Query the flags used for creation of this buffer. + * + * Note that for imported buffer this may be lossy since not all flags + * are passed 1:1. + */ + enum radeon_bo_flag (*buffer_get_flags)(struct pb_buffer *buf); + /************************************************************************** * Command submission. * diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/Android.mk mesa-20.0.8/src/gallium/drivers/radeonsi/Android.mk --- mesa-19.2.8/src/gallium/drivers/radeonsi/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -36,10 +36,13 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/amd/common \ + $(MESA_TOP)/src/amd/llvm \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,)/common \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir -LOCAL_STATIC_LIBRARIES := libmesa_amd_common +LOCAL_STATIC_LIBRARIES := \ + libmesa_amd_common \ + libmesa_galliumvl LOCAL_SHARED_LIBRARIES := libdrm_radeon LOCAL_MODULE := libmesa_pipe_radeonsi @@ -62,7 +65,7 @@ $(hide) $(MESA_PYTHON2) $(MERGE_DRIINFO) $(GEN_DRIINFO_INPUTS) > $@ || ($(RM) $@; false) GEN10_FORMAT_TABLE_INPUTS := \ - $(MESA_TOP)/src/gallium/auxiliary/util/u_format.csv \ + $(MESA_TOP)/src/util/format/u_format.csv \ $(MESA_TOP)/src/amd/registers/gfx10-rsrc.json GEN10_FORMAT_TABLE_DEP := \ diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/cik_sdma.c mesa-20.0.8/src/gallium/drivers/radeonsi/cik_sdma.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/cik_sdma.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/cik_sdma.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,47 +26,6 @@ #include "sid.h" #include "si_pipe.h" -static void cik_sdma_copy_buffer(struct si_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src, - uint64_t dst_offset, - uint64_t src_offset, - uint64_t size) -{ - struct radeon_cmdbuf *cs = ctx->dma_cs; - unsigned i, ncopy, csize; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&sdst->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += sdst->gpu_address; - src_offset += ssrc->gpu_address; - - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - si_need_dma_space(ctx, ncopy * 7, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR, - 0)); - radeon_emit(cs, ctx->chip_class >= GFX9 ? csize - 1 : csize); - radeon_emit(cs, 0); /* src/dst endian swap */ - radeon_emit(cs, src_offset); - radeon_emit(cs, src_offset >> 32); - radeon_emit(cs, dst_offset); - radeon_emit(cs, dst_offset >> 32); - dst_offset += csize; - src_offset += csize; - size -= csize; - } -} - static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) { width = u_minify(width, level); @@ -95,6 +54,186 @@ (G_009910_PIPE_CONFIG(tile_mode) << 26); } + +static bool si_sdma_v4_copy_texture(struct si_context *sctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct si_texture *ssrc = (struct si_texture*)src; + struct si_texture *sdst = (struct si_texture*)dst; + + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + + sdst->surface.u.gfx9.surf_offset; + uint64_t src_address = ssrc->buffer.gpu_address + + ssrc->surface.u.gfx9.surf_offset; + unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; + unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch; + uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp; + uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp; + unsigned srcx = src_box->x / ssrc->surface.blk_w; + unsigned srcy = src_box->y / ssrc->surface.blk_h; + unsigned srcz = src_box->z; + unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); + unsigned copy_depth = src_box->depth; + unsigned xalign = MAX2(1, 4 / bpp); + + assert(src_level <= src->last_level); + assert(dst_level <= dst->last_level); + assert(sdst->surface.u.gfx9.surf_offset + + dst_slice_pitch * bpp * (dstz + src_box->depth) <= + sdst->buffer.buf->size); + assert(ssrc->surface.u.gfx9.surf_offset + + src_slice_pitch * bpp * (srcz + src_box->depth) <= + ssrc->buffer.buf->size); + + if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, + dstz, ssrc, src_level, src_box)) + return false; + + dstx /= sdst->surface.blk_w; + dsty /= sdst->surface.blk_h; + + if (srcx >= (1 << 14) || + srcy >= (1 << 14) || + srcz >= (1 << 11) || + dstx >= (1 << 14) || + dsty >= (1 << 14) || + dstz >= (1 << 11)) + return false; + + /* Linear -> linear sub-window copy. */ + if (ssrc->surface.is_linear && + sdst->surface.is_linear) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + /* Check if everything fits into the bitfields */ + if (!(src_pitch <= (1 << 19) && + dst_pitch <= (1 << 19) && + src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && + copy_width <= (1 << 14) && + copy_height <= (1 << 14) && + copy_depth <= (1 << 11))) + return false; + + si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); + + src_address += ssrc->surface.u.gfx9.offset[src_level]; + dst_address += sdst->surface.u.gfx9.offset[dst_level]; + + /* Check alignments */ + if ((src_address % 4) != 0 || + (dst_address % 4) != 0 || + (src_pitch % xalign) != 0) + return false; + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | ((src_pitch - 1) << 13)); + radeon_emit(cs, src_slice_pitch - 1); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | ((dst_pitch - 1) << 13)); + radeon_emit(cs, dst_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + /* Linear <-> Tiled sub-window copy */ + if (ssrc->surface.is_linear != sdst->surface.is_linear) { + struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_level = tiled == ssrc ? src_level : dst_level; + unsigned linear_level = linear == ssrc ? src_level : dst_level; + unsigned tiled_x = tiled == ssrc ? srcx : dstx; + unsigned linear_x = linear == ssrc ? srcx : dstx; + unsigned tiled_y = tiled == ssrc ? srcy : dsty; + unsigned linear_y = linear == ssrc ? srcy : dsty; + unsigned tiled_z = tiled == ssrc ? srcz : dstz; + unsigned linear_z = linear == ssrc ? srcz : dstz; + unsigned tiled_width = tiled == ssrc ? + DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) : + DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w); + unsigned tiled_height = tiled == ssrc ? + DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) : + DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h); + unsigned tiled_depth = tiled == ssrc ? + ssrc->buffer.b.b.depth0 : + sdst->buffer.b.b.depth0; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? src_address : dst_address; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + linear_address += linear->surface.u.gfx9.offset[linear_level]; + + /* Check if everything fits into the bitfields */ + if (!(tiled_x <= (1 << 14) && + tiled_y <= (1 << 14) && + tiled_z <= (1 << 11) && + tiled_width <= (1 << 14) && + tiled_height <= (1 << 14) && + tiled_depth <= (1 << 11) && + tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && + linear_x <= (1 << 14) && + linear_y <= (1 << 14) && + linear_z <= (1 << 11) && + linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && + copy_width <= (1 << 14) && + copy_height <= (1 << 14) && + copy_depth <= (1 << 11))) + return false; + + /* Check alignments */ + if ((tiled_address % 256 != 0) || + (linear_address % 4 != 0) || + (linear_pitch % xalign != 0) || + (linear_slice_pitch % xalign != 0)) + return false; + + si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + tiled->buffer.b.b.last_level << 20 | + tiled_level << 24 | + (linear == sdst ? 1u : 0) << 31); + radeon_emit(cs, (uint32_t) tiled_address); + radeon_emit(cs, (uint32_t) (tiled_address >> 32)); + radeon_emit(cs, tiled_x | (tiled_y << 16)); + radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16)); + radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16); + radeon_emit(cs, util_logbase2(bpp) | + tiled->surface.u.gfx9.surf.swizzle_mode << 3 | + tiled->surface.u.gfx9.resource_type << 9 | + tiled->surface.u.gfx9.surf.epitch << 16); + radeon_emit(cs, (uint32_t) linear_address); + radeon_emit(cs, (uint32_t) (linear_address >> 32)); + radeon_emit(cs, linear_x | (linear_y << 16)); + radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); + radeon_emit(cs, linear_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + return false; +} + static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, @@ -190,7 +329,7 @@ sctx->family != CHIP_KAVERI) || (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); @@ -351,7 +490,7 @@ copy_width_aligned <= (1 << 14) && copy_height <= (1 << 14) && copy_depth <= (1 << 11)) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; uint32_t direction = linear == sdst ? 1u << 31 : 0; si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); @@ -445,7 +584,7 @@ (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) && dstx + copy_width != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer); @@ -489,29 +628,31 @@ { struct si_context *sctx = (struct si_context *)ctx; - if (!sctx->dma_cs || + assert(src->target != PIPE_BUFFER); + + if (!sctx->sdma_cs || src->flags & PIPE_RESOURCE_FLAG_SPARSE || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) goto fallback; - /* If src is a buffer and dst is a texture, we are uploading metadata. */ - if (src->target == PIPE_BUFFER) { - cik_sdma_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width); - return; - } - /* SDMA causes corruption. See: * https://bugs.freedesktop.org/show_bug.cgi?id=110575 * https://bugs.freedesktop.org/show_bug.cgi?id=110635 * * Keep SDMA enabled on APUs. */ - if ((sctx->screen->debug_flags & DBG(FORCE_DMA) || - !sctx->screen->info.has_dedicated_vram) && - (sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && - cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) || + (!sctx->screen->info.has_dedicated_vram && + !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) { + if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && + cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box)) + return; + else if (sctx->chip_class == GFX9 && + si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box)) + return; + } fallback: si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_format_table.py mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_format_table.py --- mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_format_table.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_format_table.py 2020-06-12 01:21:17.000000000 +0000 @@ -34,8 +34,8 @@ import sys AMD_REGISTERS = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../../../amd/registers")) -GALLIUM_UTIL = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../../auxiliary/util")) -sys.path.extend([AMD_REGISTERS, GALLIUM_UTIL]) +UTIL_FORMAT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../../../util/format")) +sys.path.extend([AMD_REGISTERS, UTIL_FORMAT]) from regdb import Object, RegisterDatabase from u_format_parse import * diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_query.c mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_query.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -79,7 +79,7 @@ static void emit_shader_query(struct si_context *sctx) { - assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers)); + assert(!list_is_empty(&sctx->shader_query_buffers)); struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); @@ -106,7 +106,7 @@ if (qbuf->list.prev == &sctx->shader_query_buffers) continue; /* keep the oldest buffer for recycling */ - LIST_DEL(&qbuf->list); + list_del(&qbuf->list); si_resource_reference(&qbuf->buf, NULL); FREE(qbuf); } @@ -119,7 +119,7 @@ struct gfx10_sh_query_buffer *qbuf = NULL; - if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + if (!list_is_empty(&sctx->shader_query_buffers)) { qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) @@ -131,7 +131,7 @@ !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { /* Can immediately re-use the oldest buffer */ - LIST_DEL(&qbuf->list); + list_del(&qbuf->list); } else { qbuf = NULL; } @@ -170,7 +170,7 @@ results[32 * i + 16] = 0; } - LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers); + list_addtail(&qbuf->list, &sctx->shader_query_buffers); qbuf->head = 0; qbuf->refcount = sctx->num_active_shader_queries; @@ -504,17 +504,17 @@ void gfx10_init_query(struct si_context *sctx) { - LIST_INITHEAD(&sctx->shader_query_buffers); + list_inithead(&sctx->shader_query_buffers); sctx->atoms.s.shader_query.emit = emit_shader_query; } void gfx10_destroy_query(struct si_context *sctx) { - while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + while (!list_is_empty(&sctx->shader_query_buffers)) { struct gfx10_sh_query_buffer *qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - LIST_DEL(&qbuf->list); + list_del(&qbuf->list); assert(!qbuf->refcount); si_resource_reference(&qbuf->buf, NULL); diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,15 +28,16 @@ #include "util/u_memory.h" #include "util/u_prim.h" +#include "ac_llvm_cull.h" static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); } static LLVMValueRef get_tgsize(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->param_merged_wave_info, 28, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4); } static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx) @@ -50,98 +51,167 @@ static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 12, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); + return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9); } static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 22, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); + return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9); } static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - ctx->i32_0, - LLVMConstInt(ctx->ac.i32, 11, false), - false); + return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12); } static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) { - LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_rw_buffers); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, - LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false)); + LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } -/* Send GS Alloc Req message from the first wave of the group to SPI. - * Message payload is: - * - bits 0..10: vertices in group - * - bits 12..22: primitives in group +static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) +{ + if (ctx->type == PIPE_SHADER_VERTEX) { + LLVMValueRef tmp; + tmp = LLVMBuildLShr(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); + return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); + } + return ctx->ac.i1false; +} + +/** + * Return the number of vertices as a constant in \p num_vertices, + * and return a more precise value as LLVMValueRef from the function. */ -static void build_sendmsg_gs_alloc_req(struct si_shader_context *ctx, - LLVMValueRef vtx_cnt, - LLVMValueRef prim_cnt) +static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, + unsigned *num_vertices) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; + const struct si_shader_info *info = &ctx->shader->selector->info; - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5020); + if (ctx->type == PIPE_SHADER_VERTEX) { + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + /* Blits always use axis-aligned rectangles with 3 vertices. */ + *num_vertices = 3; + return LLVMConstInt(ctx->ac.i32, 3, 0); + } else { + /* We always build up all three indices for the prim export + * independent of the primitive type. The additional garbage + * data shouldn't hurt. This number doesn't matter with + * NGG passthrough. + */ + *num_vertices = 3; + + /* Extract OUTPRIM field. */ + LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2); + return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, ""); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); + if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) + *num_vertices = 1; + else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + *num_vertices = 2; + else + *num_vertices = 3; - ac_build_endif(&ctx->ac, 5020); + return LLVMConstInt(ctx->ac.i32, *num_vertices, false); + } } -struct ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; -}; +bool gfx10_ngg_export_prim_early(struct si_shader *shader) +{ + struct si_shader_selector *sel = shader->selector; + + assert(shader->key.as_ngg && !shader->key.as_es); + + return sel->type != PIPE_SHADER_GEOMETRY && + !sel->info.writes_edgeflag; +} -static void build_export_prim(struct si_shader_context *ctx, - const struct ngg_prim *prim) +void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx) +{ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ngg_get_vtx_cnt(ctx), + ngg_get_prim_cnt(ctx)); +} + +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, + LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough) { LLVMBuilderRef builder = ctx->ac.builder; - struct ac_export_args args; - LLVMValueRef tmp; - tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); - args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); + if (gfx10_is_ngg_passthrough(ctx->shader) || + ctx->shader->key.opt.ngg_culling) { + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildShl(builder, prim->index[i], - LLVMConstInt(ctx->ac.i32, 10 * i, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); - } - - args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); - args.out[1] = LLVMGetUndef(ctx->ac.f32); - args.out[2] = LLVMGetUndef(ctx->ac.f32); - args.out[3] = LLVMGetUndef(ctx->ac.f32); - - args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; + if (prim_passthrough) + prim.passthrough = prim_passthrough; + else + prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + + /* This is only used with NGG culling, which returns the NGG + * passthrough prim export encoding. + */ + if (ctx->shader->selector->info.writes_edgeflag) { + unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; + LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); + + unsigned num_vertices; + ngg_get_vertices_per_prim(ctx, &num_vertices); + + for (unsigned i = 0; i < num_vertices; i++) { + unsigned shift = 9 + i*10; + LLVMValueRef edge; + + edge = LLVMBuildLoad(builder, user_edgeflags[i], ""); + edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), ""); + edgeflags = LLVMBuildOr(builder, edgeflags, edge, ""); + } + prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, ""); + } - ac_build_export(&ctx->ac, &args); + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); + return; + } + + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; + + ngg_get_vertices_per_prim(ctx, &prim.num_vertices); + + prim.isnull = ctx->ac.i1false; + prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + + for (unsigned i = 0; i < prim.num_vertices; ++i) { + prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + + if (ctx->shader->selector->info.writes_edgeflag) { + LLVMValueRef edge; + + edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); + edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); + prim.edgeflag[i] = edge; + } + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); } static void build_streamout_vertex(struct si_shader_context *ctx, @@ -149,7 +219,7 @@ unsigned stream, LLVMValueRef offset_vtx, LLVMValueRef vertexptr) { - struct tgsi_shader_info *info = &ctx->shader->selector->info; + struct si_shader_info *info = &ctx->shader->selector->info; struct pipe_stream_output_info *so = &ctx->shader->selector->so; LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef offset[4] = {}; @@ -160,9 +230,9 @@ continue; tmp = LLVMBuildMul(builder, offset_vtx, - LLVMConstInt(ctx->i32, so->stride[buffer], false), ""); + LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), ""); tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); - offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 2, false), ""); + offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); } for (unsigned i = 0; i < so->num_outputs; ++i) { @@ -176,13 +246,13 @@ for (unsigned comp = 0; comp < 4; comp++) { tmp = ac_build_gep0(&ctx->ac, vertexptr, - LLVMConstInt(ctx->i32, 4 * reg + comp, false)); + LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false)); out.values[comp] = LLVMBuildLoad(builder, tmp, ""); out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3; } - si_emit_streamout_output(ctx, so_buffer, offset, &so->output[i], &out); + si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); } } @@ -209,25 +279,25 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso) { - struct tgsi_shader_info *info = &ctx->shader->selector->info; + struct si_shader_info *info = &ctx->shader->selector->info; struct pipe_stream_output_info *so = &ctx->shader->selector->so; LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef tmp, tmp2; - LLVMValueRef i32_2 = LLVMConstInt(ctx->i32, 2, false); - LLVMValueRef i32_4 = LLVMConstInt(ctx->i32, 4, false); - LLVMValueRef i32_8 = LLVMConstInt(ctx->i32, 8, false); + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); LLVMValueRef so_buffer[4] = {}; unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0); LLVMValueRef prim_stride_dw[4] = {}; - LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->i32); + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); int stream_for_buffer[4] = { -1, -1, -1, -1 }; unsigned bufmask_for_stream[4] = {}; bool isgs = ctx->type == PIPE_SHADER_GEOMETRY; unsigned scratch_emit_base = isgs ? 4 : 0; - LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->i32_0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; unsigned scratch_offset_base = isgs ? 8 : 4; LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; @@ -248,26 +318,26 @@ assert(so->stride[buffer]); - tmp = LLVMConstInt(ctx->i32, so->stride[buffer], false); + tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false); prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, ""); prim_stride_dw_vgpr = ac_build_writelane( &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], - LLVMConstInt(ctx->i32, buffer, false)); + LLVMConstInt(ctx->ac.i32, buffer, false)); so_buffer[buffer] = ac_build_load_to_sgpr( &ctx->ac, buf_ptr, - LLVMConstInt(ctx->i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); + LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); } - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->i32_0, ""); + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); ac_build_ifcc(&ctx->ac, tmp, 5200); { - LLVMTypeRef gdsptr = LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS); - LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->i32_0, gdsptr, ""); + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); /* Advance the streamout offsets in GDS. */ - LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); - LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); ac_build_ifcc(&ctx->ac, tmp, 5210); @@ -276,8 +346,8 @@ tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); tmp = LLVMBuildLoad(builder, tmp, ""); } else { - tmp = ac_build_writelane(&ctx->ac, ctx->i32_0, - ngg_get_prim_cnt(ctx), ctx->i32_0); + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, + ngg_get_prim_cnt(ctx), ctx->ac.i32_0); } LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); @@ -305,15 +375,15 @@ LLVMValueRef args[] = { LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), tmp, - ctx->i32_0, // ordering - ctx->i32_0, // scope + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope ctx->ac.i1false, // isVolatile - LLVMConstInt(ctx->i32, 4 << 24, false), // OA index + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index ctx->ac.i1true, // wave release ctx->ac.i1true, // wave done }; tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", - ctx->i32, args, ARRAY_SIZE(args), 0); + ctx->ac.i32, args, ARRAY_SIZE(args), 0); /* Keep offsets in a VGPR for quick retrieval via readlane by * the first wave for bounds checking, and also store in LDS @@ -344,13 +414,13 @@ tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); LLVMValueRef offset_dw = ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->i32, buffer, false)); + LLVMConstInt(ctx->ac.i32, buffer, false)); tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], ""); tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); - max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->i32_0, tmp, ""); + max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, ""); } /* Determine the number of emitted primitives per stream and fixup the @@ -359,7 +429,7 @@ * This is complicated by the fact that a single stream can emit to * multiple buffers (but luckily not vice versa). */ - LLVMValueRef emit_vgpr = ctx->i32_0; + LLVMValueRef emit_vgpr = ctx->ac.i32_0; for (unsigned stream = 0; stream < 4; ++stream) { if (!info->num_stream_output_components[stream]) @@ -368,7 +438,7 @@ tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); LLVMValueRef generated = ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->i32, stream, false)); + LLVMConstInt(ctx->ac.i32, stream, false)); LLVMValueRef emit = generated; for (unsigned buffer = 0; buffer < 4; ++buffer) { @@ -377,15 +447,15 @@ } emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, - LLVMConstInt(ctx->i32, stream, false)); + LLVMConstInt(ctx->ac.i32, stream, false)); /* Fixup the offset using a plain GDS atomic if we overflowed. */ tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */ tmp = LLVMBuildLShr(builder, - LLVMConstInt(ctx->i32, bufmask_for_stream[stream], false), + LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), ac_get_thread_id(&ctx->ac), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); ac_build_ifcc(&ctx->ac, tmp, 5222); { tmp = LLVMBuildSub(builder, generated, emit, ""); @@ -424,7 +494,7 @@ primemit_scan[stream].src = nggso->prim_enable[stream]; primemit_scan[stream].scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->i32, 12 + 8 * stream, false)); + LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); primemit_scan[stream].numwaves = get_tgsize(ctx); primemit_scan[stream].maxwaves = 8; @@ -447,7 +517,7 @@ if (stream_for_buffer[buffer] >= 0) { wgoffset_dw[buffer] = ac_build_readlane( &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->i32, scratch_offset_base + buffer, false)); + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); } } @@ -455,7 +525,7 @@ if (info->num_stream_output_components[stream]) { nggso->emit[stream] = ac_build_readlane( &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->i32, scratch_emit_base + stream, false)); + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); } } } @@ -483,19 +553,64 @@ for (unsigned i = 0; i < max_num_vertices; ++i) { tmp = LLVMBuildICmp(builder, LLVMIntULT, - LLVMConstInt(ctx->i32, i, false), + LLVMConstInt(ctx->ac.i32, i, false), nggso->num_vertices, ""); ac_build_ifcc(&ctx->ac, tmp, 5241); build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx, nggso->vertices[i]); ac_build_endif(&ctx->ac, 5241); - offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->i32_1, ""); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); } } ac_build_endif(&ctx->ac, 5240); } } +/* LDS layout of ES vertex data for NGG culling. */ +enum { + /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old + * ES thread ID. After vertex compaction, compacted ES threads + * store the old thread ID here to copy input VGPRs from uncompacted + * ES threads. + * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value. + * Byte 2: TES rel patch ID + * Byte 3: Unused + */ + lds_byte0_accept_flag = 0, + lds_byte0_old_thread_id = 0, + lds_byte1_new_thread_id, + lds_byte2_tes_rel_patch_id, + lds_byte3_unused, + + lds_packed_data = 0, /* lds_byteN_... */ + + lds_pos_x, + lds_pos_y, + lds_pos_z, + lds_pos_w, + lds_pos_x_div_w, + lds_pos_y_div_w, + /* If VS: */ + lds_vertex_id, + lds_instance_id, /* optional */ + /* If TES: */ + lds_tes_u = lds_vertex_id, + lds_tes_v = lds_instance_id, + lds_tes_patch_id, /* optional */ +}; + +static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, + LLVMValueRef ptr, unsigned byte_index) +{ + assert(byte_index < 4); + LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0); + + return LLVMBuildGEP(ctx->ac.builder, + LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), + &index, 1, ""); +} + static unsigned ngg_nogs_vertex_size(struct si_shader *shader) { unsigned lds_vertex_size = 0; @@ -507,6 +622,33 @@ if (shader->selector->info.writes_edgeflag) lds_vertex_size = MAX2(lds_vertex_size, 1); + /* LDS size for passing data from GS to ES. + * GS stores Primitive IDs into LDS at the address corresponding + * to the ES thread of the provoking vertex. All ES threads + * load and export PrimitiveID for their thread. + */ + if (shader->selector->type == PIPE_SHADER_VERTEX && + shader->key.mono.u.vs_export_prim_id) + lds_vertex_size = MAX2(lds_vertex_size, 1); + + if (shader->key.opt.ngg_culling) { + if (shader->selector->type == PIPE_SHADER_VERTEX) { + STATIC_ASSERT(lds_instance_id + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } else { + assert(shader->selector->type == PIPE_SHADER_TESS_EVAL); + + if (shader->selector->info.uses_primid || + shader->key.mono.u.vs_export_prim_id) { + STATIC_ASSERT(lds_tes_patch_id + 2 == 11); + lds_vertex_size = MAX2(lds_vertex_size, 11); + } else { + STATIC_ASSERT(lds_tes_v + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } + } + } + return lds_vertex_size; } @@ -519,12 +661,579 @@ { /* The extra dword is used to avoid LDS bank conflicts. */ unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader); - LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, vertex_size); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); } +static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, + LLVMValueRef ret, struct ac_arg param, + unsigned return_index) +{ + LLVMValueRef v = ac_get_arg(&ctx->ac, param); + + for (unsigned i = 0; i < 4; i++) { + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_llvm_extract_elem(&ctx->ac, v, i), + return_index + i, ""); + } + return ret; +} + +static void load_bitmasks_2x64(struct si_shader_context *ctx, + LLVMValueRef lds_ptr, unsigned dw_offset, + LLVMValueRef mask[2], LLVMValueRef *total_bitcount) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr, + LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), + AC_ADDR_SPACE_LDS), ""); + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0); + mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), ""); + } + + /* We get better code if we don't use the 128-bit bitcount. */ + *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]), + ac_build_bit_count(&ctx->ac, mask[1]), ""); +} + +/** + * Given a total thread count, update total and per-wave thread counts in input SGPRs + * and return the per-wave thread count. + * + * \param new_num_threads Total thread count on the input, per-wave thread count on the output. + * \param tg_info tg_info SGPR value + * \param tg_info_num_bits the bit size of thread count field in tg_info + * \param tg_info_shift the bit offset of the thread count field in tg_info + * \param wave_info merged_wave_info SGPR value + * \param wave_info_num_bits the bit size of thread count field in merged_wave_info + * \param wave_info_shift the bit offset of the thread count field in merged_wave_info + */ +static void update_thread_counts(struct si_shader_context *ctx, + LLVMValueRef *new_num_threads, + LLVMValueRef *tg_info, + unsigned tg_info_num_bits, + unsigned tg_info_shift, + LLVMValueRef *wave_info, + unsigned wave_info_num_bits, + unsigned wave_info_shift) +{ + LLVMBuilderRef builder = ctx->ac.builder; + + /* Update the total thread count. */ + unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift); + *tg_info = LLVMBuildAnd(builder, *tg_info, + LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), ""); + *tg_info = LLVMBuildOr(builder, *tg_info, + LLVMBuildShl(builder, *new_num_threads, + LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), ""); + + /* Update the per-wave thread count. */ + LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""); + *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, ""); + *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0); + *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0)); + unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift); + *wave_info = LLVMBuildAnd(builder, *wave_info, + LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), ""); + *wave_info = LLVMBuildOr(builder, *wave_info, + LLVMBuildShl(builder, *new_num_threads, + LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), ""); +} + +/** + * Cull primitives for NGG VS or TES, then compact vertices, which happens + * before the VS or TES main function. Return values for the main function. + * Also return the position, which is passed to the shader as an input, + * so that we don't compute it twice. + */ +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_selector *sel = shader->selector; + struct si_shader_info *info = &sel->info; + LLVMBuilderRef builder = ctx->ac.builder; + + assert(shader->key.opt.ngg_culling); + assert(shader->key.as_ngg); + assert(sel->type == PIPE_SHADER_VERTEX || + (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es)); + + LLVMValueRef position[4] = {}; + for (unsigned i = 0; i < info->num_outputs; i++) { + switch (info->output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + for (unsigned j = 0; j < 4; j++) { + position[j] = LLVMBuildLoad(ctx->ac.builder, + addrs[4 * i + j], ""); + } + break; + } + } + assert(position[0]); + + /* Store Position.XYZW into LDS. */ + LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + for (unsigned chan = 0; chan < 4; chan++) { + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]), + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); + } + /* Store Position.XY / W into LDS. */ + for (unsigned chan = 0; chan < 2; chan++) { + LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val), + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); + } + + /* Store VertexID and InstanceID. ES threads will have to load them + * from LDS after vertex compaction and use them instead of their own + * system values. + */ + bool uses_instance_id = false; + bool uses_tes_prim_id = false; + LLVMValueRef packed_data = ctx->ac.i32_0; + + if (ctx->type == PIPE_SHADER_VERTEX) { + uses_instance_id = sel->info.uses_instanceid || + shader->key.part.vs.prolog.instance_divisor_is_one || + shader->key.part.vs.prolog.instance_divisor_is_fetched; + + LLVMBuildStore(builder, ctx->abi.vertex_id, + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0))); + if (uses_instance_id) { + LLVMBuildStore(builder, ctx->abi.instance_id, + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_instance_id, 0))); + } + } else { + uses_tes_prim_id = sel->info.uses_primid || + shader->key.mono.u.vs_export_prim_id; + + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)), + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_u, 0))); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)), + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_v, 0))); + packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), + LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), ""); + if (uses_tes_prim_id) { + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id), + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0))); + } + } + /* Initialize the packed data. */ + LLVMBuildStore(builder, packed_data, + ac_build_gep0(&ctx->ac, es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less + * than 4 waves, but we always read all 4 values. This is where the thread + * bitmasks of unculled threads will be stored. + * + * gs_ngg_scratch layout: esmask[0..3] + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101); + { + LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, ctx->ac.i32_0, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); + } + ac_build_endif(&ctx->ac, 16101); + ac_build_s_barrier(&ctx->ac); + + /* The hardware requires that there are no holes between unculled vertices, + * which means we have to pack ES threads, i.e. reduce the ES thread count + * and move ES input VGPRs to lower threads. The upside is that varyings + * are only fetched and computed for unculled vertices. + * + * Vertex compaction in GS threads: + * + * Part 1: Compute the surviving vertex mask in GS threads: + * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves) + * - In GS, notify ES threads whether the vertex survived. + * - Barrier + * - ES threads will create the mask and store it in LDS. + * - Barrier + * - Each GS thread loads the vertex masks from LDS. + * + * Part 2: Compact ES threads in GS threads: + * - Compute the prefix sum for all 3 vertices from the masks. These are the new + * thread IDs for each vertex within the primitive. + * - Write the value of the old thread ID into the LDS address of the new thread ID. + * The ES thread will load the old thread ID and use it to load the position, VertexID, + * and InstanceID. + * - Update vertex indices and null flag in the GS input VGPRs. + * - Barrier + * + * Part 3: Update inputs GPRs + * - For all waves, update per-wave thread counts in input SGPRs. + * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). + */ + + LLVMValueRef vtxindex[3]; + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { + /* For the GS fast launch, the VS prologs simply puts the Vertex IDs + * into these VGPRs. + */ + vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); + vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + }; + LLVMValueRef gs_vtxptr[] = { + ngg_nogs_vertex_ptr(ctx, vtxindex[0]), + ngg_nogs_vertex_ptr(ctx, vtxindex[1]), + ngg_nogs_vertex_ptr(ctx, vtxindex[2]), + }; + es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + + /* Do culling in GS threads. */ + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002); + { + /* Load positions. */ + LLVMValueRef pos[3][4] = {}; + for (unsigned vtx = 0; vtx < 3; vtx++) { + for (unsigned chan = 0; chan < 4; chan++) { + unsigned index; + if (chan == 0 || chan == 1) + index = lds_pos_x_div_w + chan; + else if (chan == 3) + index = lds_pos_w; + else + continue; + + LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], + LLVMConstInt(ctx->ac.i32, index, 0)); + pos[vtx][chan] = LLVMBuildLoad(builder, addr, ""); + pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]); + } + } + + /* Load the viewport state for small prim culling. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), + ctx->ac.i32_0); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Get the small prim filter precision. */ + LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); + small_prim_precision = LLVMBuildOr(builder, small_prim_precision, + LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); + small_prim_precision = LLVMBuildShl(builder, small_prim_precision, + LLVMConstInt(ctx->ac.i32, 23, 0), ""); + small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); + + /* Execute culling code. */ + struct ac_cull_options options = {}; + options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; + options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; + options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; + options.cull_small_prims = options.cull_view_xy; + options.cull_zero_area = options.cull_front || options.cull_back; + options.cull_w = true; + + /* Tell ES threads whether their vertex survived. */ + ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, + vp_scale, vp_translate, + small_prim_precision, &options), 16003); + { + LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted); + for (unsigned vtx = 0; vtx < 3; vtx++) { + LLVMBuildStore(builder, ctx->ac.i8_1, + si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); + } + } + ac_build_endif(&ctx->ac, 16003); + } + ac_build_endif(&ctx->ac, 16002); + ac_build_s_barrier(&ctx->ac); + + gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); + + LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, ""); + + /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */ + ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007); + { + LLVMValueRef es_accepted_flag = + LLVMBuildLoad(builder, + si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), ""); + + LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE, + es_accepted_flag, ctx->ac.i8_0, ""); + LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool); + + LLVMBuildStore(builder, es_accepted_bool, es_accepted); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, + tid, ctx->ac.i32_0, ""), 16008); + { + LLVMBuildStore(builder, es_mask, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + get_wave_id_in_tg(ctx))); + } + ac_build_endif(&ctx->ac, 16008); + } + ac_build_endif(&ctx->ac, 16007); + ac_build_s_barrier(&ctx->ac); + + /* Load the vertex masks and compute the new ES thread count. */ + LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; + load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); + new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); + + /* ES threads compute their prefix sum, which is the new ES thread ID. + * Then they write the value of the old thread ID into the LDS address + * of the new thread ID. It will be used it to load input VGPRs from + * the old thread's LDS location. + */ + ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009); + { + LLVMValueRef old_id = get_thread_id_in_tg(ctx); + LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); + + LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), + lds_byte0_old_thread_id)); + LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); + } + ac_build_endif(&ctx->ac, 16009); + + /* Kill waves that have inactive threads. */ + kill_wave = LLVMBuildICmp(builder, LLVMIntULE, + ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)), + LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), ""); + ac_build_ifcc(&ctx->ac, kill_wave, 19202); + { + /* If we are killing wave 0, send that there are no primitives + * in this threadgroup. + */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ctx->ac.i32_0, ctx->ac.i32_0); + ac_build_s_endpgm(&ctx->ac); + } + ac_build_endif(&ctx->ac, 19202); + ac_build_s_barrier(&ctx->ac); + + /* Send the final vertex and primitive counts. */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + new_num_es_threads, ngg_get_prim_cnt(ctx)); + + /* Update thread counts in SGPRs. */ + LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info); + LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info); + + /* This also converts the thread count from the total count to the per-wave count. */ + update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, + &new_merged_wave_info, 8, 0); + + /* Update vertex indices in VGPR0 (same format as NGG passthrough). */ + LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + /* Set the null flag at the beginning (culled), and then + * overwrite it for accepted primitives. + */ + LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0); + + /* Get vertex indices after vertex compaction. */ + ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); + { + struct ac_ngg_prim prim = {}; + prim.num_vertices = 3; + prim.isnull = ctx->ac.i1false; + + for (unsigned vtx = 0; vtx < 3; vtx++) { + prim.index[vtx] = + LLVMBuildLoad(builder, + si_build_gep_i8(ctx, gs_vtxptr[vtx], + lds_byte1_new_thread_id), ""); + prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); + prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); + } + + /* Set the new GS input VGPR. */ + LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0); + } + ac_build_endif(&ctx->ac, 16011); + + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, "")); + + /* Set the new ES input VGPRs. */ + LLVMValueRef es_data[4]; + LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + for (unsigned i = 0; i < 4; i++) + es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, + new_num_es_threads, ""), 16012); + { + LLVMValueRef old_id, old_es_vtxptr, tmp; + + /* Load ES input VGPRs from the ES thread before compaction. */ + old_id = LLVMBuildLoad(builder, + si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), ""); + old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, ""); + + LLVMBuildStore(builder, old_id, old_thread_id); + old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id); + + for (unsigned i = 0; i < 2; i++) { + tmp = LLVMBuildLoad(builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), ""); + LLVMBuildStore(builder, tmp, es_data[i]); + } + + if (ctx->type == PIPE_SHADER_TESS_EVAL) { + tmp = LLVMBuildLoad(builder, + si_build_gep_i8(ctx, old_es_vtxptr, + lds_byte2_tes_rel_patch_id), ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + LLVMBuildStore(builder, tmp, es_data[2]); + + if (uses_tes_prim_id) { + tmp = LLVMBuildLoad(builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), ""); + LLVMBuildStore(builder, tmp, es_data[3]); + } + } + } + ac_build_endif(&ctx->ac, 16012); + + /* Return values for the main function. */ + LLVMValueRef ret = ctx->return_value; + LLVMValueRef val; + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, + ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, + ctx->const_and_shader_buffers, + 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, + ctx->samplers_and_images, + 8 + SI_SGPR_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, + 8 + SI_SGPR_VS_STATE_BITS); + + if (ctx->type == PIPE_SHADER_VERTEX) { + ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, + 8 + SI_SGPR_BASE_VERTEX); + ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, + 8 + SI_SGPR_START_INSTANCE); + ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, + 8 + SI_SGPR_DRAWID); + ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, + 8 + SI_VS_NUM_USER_SGPR); + + for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { + ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], + 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, + 8 + SI_SGPR_TES_OFFCHIP_LAYOUT); + ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, + 8 + SI_SGPR_TES_OFFCHIP_ADDR); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) { + if (shader->selector->num_vbos_in_user_sgprs) { + vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + + shader->selector->num_vbos_in_user_sgprs * 4; + } else { + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; + } + } else { + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + } + + val = LLVMBuildLoad(builder, new_vgpr0, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), + vgpr++, ""); + vgpr++; /* gs_vtx23_offset */ + + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + vgpr++; /* gs_vtx45_offset */ + + if (ctx->type == PIPE_SHADER_VERTEX) { + val = LLVMBuildLoad(builder, es_data[0], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), + vgpr++, ""); /* VGPR5 - VertexID */ + vgpr += 2; + if (uses_instance_id) { + val = LLVMBuildLoad(builder, es_data[1], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), + vgpr++, ""); /* VGPR8 - InstanceID */ + } else { + vgpr++; + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + unsigned num_vgprs = uses_tes_prim_id ? 4 : 3; + for (unsigned i = 0; i < num_vgprs; i++) { + val = LLVMBuildLoad(builder, es_data[i], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), + vgpr++, ""); + } + if (num_vgprs == 3) + vgpr++; + } + /* Return the old thread ID. */ + val = LLVMBuildLoad(builder, old_thread_id, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); + + /* These two also use LDS. */ + if (sel->info.writes_edgeflag || + (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + ac_build_s_barrier(&ctx->ac); + + ctx->return_value = ret; +} + /** * Emit the epilogue of an API VS or TES shader compiled as ESGS shader. */ @@ -534,7 +1243,7 @@ { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader_selector *sel = ctx->shader->selector; - struct tgsi_shader_info *info = &sel->info; + struct si_shader_info *info = &sel->info; struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp, tmp2; @@ -560,7 +1269,7 @@ */ if (sel->so.num_outputs) { tmp = ac_build_gep0(&ctx->ac, vertex_ptr, - LLVMConstInt(ctx->i32, 4 * i + j, false)); + LLVMConstInt(ctx->ac.i32, 4 * i + j, false)); tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], ""); tmp2 = ac_to_integer(&ctx->ac, tmp2); LLVMBuildStore(builder, tmp2, tmp); @@ -572,63 +1281,50 @@ sel->info.writes_edgeflag) { LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); /* The output is a float, but the hw expects a 1-bit integer. */ - edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->i32, ""); - edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->i32_1); + edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); + edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1); - tmp = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); LLVMBuildStore(builder, edgeflag, tmp); } } - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - LLVMValueRef prims_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); - LLVMValueRef vtx_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8); - LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), prims_in_wave, ""); - LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); - LLVMValueRef vtxindex[] = { - si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 0, 16), - si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 16, 16), - si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 0, 16), - }; + bool unterminated_es_if_block = + !sel->so.num_outputs && + !sel->info.writes_edgeflag && + !ctx->screen->use_ngg_streamout && /* no query buffer */ + (ctx->type != PIPE_SHADER_VERTEX || + !ctx->shader->key.mono.u.vs_export_prim_id); + + if (!unterminated_es_if_block) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef is_gs_thread = si_is_gs_thread(ctx); + LLVMValueRef is_es_thread = si_is_es_thread(ctx); + LLVMValueRef vtxindex[3]; + + if (ctx->shader->key.opt.ngg_culling) { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + } /* Determine the number of vertices per primitive. */ unsigned num_vertices; - LLVMValueRef num_vertices_val; - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - /* Blits always use axis-aligned rectangles with 3 vertices. */ - num_vertices = 3; - num_vertices_val = LLVMConstInt(ctx->i32, 3, 0); - } else { - /* Extract OUTPRIM field. */ - tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 2, 2); - num_vertices_val = LLVMBuildAdd(builder, tmp, ctx->i32_1, ""); - num_vertices = 3; /* TODO: optimize for points & lines */ - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - - if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) - num_vertices = 1; - else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - num_vertices = 2; - else - num_vertices = 3; - - num_vertices_val = LLVMConstInt(ctx->i32, num_vertices, false); - } + LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices); /* Streamout */ LLVMValueRef emitted_prims = NULL; if (sel->so.num_outputs) { - struct ngg_streamout nggso = {}; + assert(!unterminated_es_if_block); + struct ngg_streamout nggso = {}; nggso.num_vertices = num_vertices_val; nggso.prim_enable[0] = is_gs_thread; @@ -642,6 +1338,8 @@ LLVMValueRef user_edgeflags[3] = {}; if (sel->info.writes_edgeflag) { + assert(!unterminated_es_if_block); + /* Streamout already inserted the barrier, so don't insert it again. */ if (!sel->so.num_outputs) ac_build_s_barrier(&ctx->ac); @@ -650,12 +1348,12 @@ /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ for (unsigned i = 0; i < num_vertices; i++) { tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - tmp2 = LLVMConstInt(ctx->i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); tmp = LLVMBuildLoad(builder, tmp, ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); + user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); LLVMBuildStore(builder, tmp, user_edgeflags[i]); } ac_build_endif(&ctx->ac, 5400); @@ -666,6 +1364,8 @@ */ if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) { + assert(!unterminated_es_if_block); + /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ if (sel->so.num_outputs || sel->info.writes_edgeflag) ac_build_s_barrier(&ctx->ac); @@ -673,25 +1373,26 @@ ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. */ LLVMValueRef provoking_vtx_in_prim = - si_unpack_param(ctx, ctx->param_vs_state_bits, 4, 2); + si_unpack_param(ctx, ctx->vs_state_bits, 4, 2); /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); LLVMValueRef provoking_vtx_index = LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); + LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index); - LLVMBuildStore(builder, ctx->abi.gs_prim_id, - ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id), + ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0)); ac_build_endif(&ctx->ac, 5400); } - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); - /* Update query buffer */ - /* TODO: this won't catch 96-bit clear_buffer via transform feedback. */ - if (!info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); + if (ctx->screen->use_ngg_streamout && + !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + assert(!unterminated_es_if_block); + + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); ac_build_ifcc(&ctx->ac, tmp, 5030); @@ -702,83 +1403,61 @@ LLVMValueRef args[] = { ngg_get_prim_cnt(ctx), ngg_get_query_buf(ctx), - LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */ - ctx->i32_0, /* soffset */ - ctx->i32_0, /* cachepolicy */ + LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */ + ctx->ac.i32_0, /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ }; if (sel->so.num_outputs) { - args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->i32_1); + args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1); args[2] = ac_build_writelane(&ctx->ac, args[2], - LLVMConstInt(ctx->i32, 24, false), ctx->i32_1); + LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1); } /* TODO: should this be 64-bit atomics? */ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->i32, args, 5, 0); + ctx->ac.i32, args, 5, 0); } ac_build_endif(&ctx->ac, 5031); ac_build_endif(&ctx->ac, 5030); ac_build_endif(&ctx->ac, 5029); } - /* Export primitive data to the index buffer. Format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) - * - * For the first version, we will always build up all three indices - * independent of the primitive type. The additional garbage data - * shouldn't hurt. - * - * TODO: culling depends on the primitive type, so can have some - * interaction here. - */ - ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); - { - struct ngg_prim prim = {}; - - prim.num_vertices = num_vertices; - prim.isnull = ctx->ac.i1false; - memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - if (ctx->type != PIPE_SHADER_VERTEX) { - prim.edgeflag[i] = ctx->i1false; - continue; - } - - tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - - if (sel->info.writes_edgeflag) { - tmp2 = LLVMBuildLoad(builder, user_edgeflags[i], ""); - prim.edgeflag[i] = LLVMBuildAnd(builder, prim.edgeflag[i], - tmp2, ""); - } - } - - build_export_prim(ctx, &prim); + /* Build the primitive export. */ + if (!gfx10_ngg_export_prim_early(ctx->shader)) { + assert(!unterminated_es_if_block); + gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL); } - ac_build_endif(&ctx->ac, 6001); /* Export per-vertex data (positions and parameters). */ - ac_build_ifcc(&ctx->ac, is_es_thread, 6002); + if (!unterminated_es_if_block) + ac_build_ifcc(&ctx->ac, is_es_thread, 6002); { unsigned i; /* Unconditionally (re-)load the values for proper SSA form. */ for (i = 0; i < info->num_outputs; i++) { - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(builder, - addrs[4 * i + j], - ""); + /* If the NGG cull shader part computed the position, don't + * use the position from the current shader part. Instead, + * load it from LDS. + */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION && + ctx->shader->key.opt.ngg_culling) { + vertex_ptr = ngg_nogs_vertex_ptr(ctx, + ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); + + for (unsigned j = 0; j < 4; j++) { + tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + tmp = LLVMBuildLoad(builder, tmp, ""); + outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); + } + } else { + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = + LLVMBuildLoad(builder, + addrs[4 * i + j], ""); + } } } @@ -790,8 +1469,8 @@ /* Wait for GS stores to finish. */ ac_build_s_barrier(&ctx->ac); - tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, - get_thread_id_in_tg(ctx)); + tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); outputs[i].values[0] = LLVMBuildLoad(builder, tmp, ""); } else { assert(ctx->type == PIPE_SHADER_TESS_EVAL); @@ -800,14 +1479,14 @@ outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]); for (unsigned j = 1; j < 4; j++) - outputs[i].values[j] = LLVMGetUndef(ctx->f32); + outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32); memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream)); i++; } - si_llvm_export_vs(ctx, outputs, i); + si_llvm_build_vs_exports(ctx, outputs, i); } ac_build_endif(&ctx->ac, 6002); } @@ -816,7 +1495,7 @@ ngg_gs_get_vertex_storage(struct si_shader_context *ctx) { const struct si_shader_selector *sel = ctx->shader->selector; - const struct tgsi_shader_info *info = &sel->info; + const struct si_shader_info *info = &sel->info; LLVMTypeRef elements[2] = { LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs), @@ -894,12 +1573,36 @@ return ngg_gs_vertex_ptr(ctx, vertexidx); } +static LLVMValueRef +ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, + unsigned out_idx) +{ + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); +} + +static LLVMValueRef +ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, + unsigned stream) +{ + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); +} + void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs) { const struct si_shader_selector *sel = ctx->shader->selector; - const struct tgsi_shader_info *info = &sel->info; + const struct si_shader_info *info = &sel->info; LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; const LLVMValueRef vertexidx = @@ -911,7 +1614,7 @@ */ const LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->i32, sel->gs_max_out_vertices, false), ""); + LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); @@ -929,15 +1632,9 @@ continue; LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], ""); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first entry of struct */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - out_val = ac_to_integer(&ctx->ac, out_val); - LLVMBuildStore(builder, out_val, ptr); + LLVMBuildStore(builder, out_val, + ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } assert(out_idx * 4 == sel->gsvs_vertex_size); @@ -949,19 +1646,29 @@ const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - const LLVMValueRef primflagptr = - LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildOr(builder, tmp, + LLVMBuildShl(builder, + LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), + ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); @@ -980,11 +1687,11 @@ LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef tmp; - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), ""); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); ac_build_ifcc(&ctx->ac, tmp, 5090); { LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); - LLVMBuildStore(builder, ctx->i32_0, ptr); + LLVMBuildStore(builder, ctx->ac.i32_0, ptr); } ac_build_endif(&ctx->ac, 5090); @@ -994,7 +1701,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) { const struct si_shader_selector *sel = ctx->shader->selector; - const struct tgsi_shader_info *info = &sel->info; + const struct si_shader_info *info = &sel->info; const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); @@ -1027,13 +1734,7 @@ LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second entry of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - LLVMBuildStore(builder, i8_0, tmp); + LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); ac_build_endloop(&ctx->ac, 5100); } @@ -1047,12 +1748,12 @@ LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); - tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, ""); + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); ac_build_ifcc(&ctx->ac, tmp, 5105); { LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->i32, stream, false)), + LLVMConstInt(ctx->ac.i32, stream, false)), numprims, LLVMAtomicOrderingMonotonic, false); } ac_build_endif(&ctx->ac, 5105); @@ -1069,70 +1770,64 @@ if (sel->so.num_outputs) { struct ngg_streamout nggso = {}; - nggso.num_vertices = LLVMConstInt(ctx->i32, verts_per_prim, false); + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); for (unsigned stream = 0; stream < 4; ++stream) { if (!info->num_stream_output_components[stream]) continue; - LLVMValueRef gep_idx[3] = { - ctx->i32_0, /* implicit C-style array */ - ctx->i32_1, /* second value of struct */ - LLVMConstInt(ctx->i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); } for (unsigned i = 0; i < verts_per_prim; ++i) { tmp = LLVMBuildSub(builder, tid, - LLVMConstInt(ctx->i32, verts_per_prim - i - 1, false), ""); + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); tmp = ngg_gs_vertex_ptr(ctx, tmp); - nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->i32_0); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); } build_streamout(ctx, &nggso); } /* Write shader query data. */ - tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ - unsigned num_query_comps = sel->so.num_outputs ? 8 : 4; - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, - LLVMConstInt(ctx->i32, num_query_comps, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5110); - { - LLVMValueRef offset; - tmp = tid; - if (sel->so.num_outputs) - tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), ""); - offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), ""); - if (sel->so.num_outputs) { - tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), ""); - tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), ""); - offset = LLVMBuildAdd(builder, offset, tmp, ""); - } - - tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); - LLVMValueRef args[] = { - tmp, - ngg_get_query_buf(ctx), - offset, - LLVMConstInt(ctx->i32, 16, false), /* soffset */ - ctx->i32_0, /* cachepolicy */ - }; - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->i32, args, 5, 0); - } - ac_build_endif(&ctx->ac, 5110); - ac_build_endif(&ctx->ac, 5109); + if (ctx->screen->use_ngg_streamout) { + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ + unsigned num_query_comps = sel->so.num_outputs ? 8 : 4; + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, num_query_comps, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + LLVMValueRef offset; + tmp = tid; + if (sel->so.num_outputs) + tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), ""); + offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), ""); + if (sel->so.num_outputs) { + tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), ""); + tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), ""); + offset = LLVMBuildAdd(builder, offset, tmp, ""); + } - /* TODO: culling */ + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + LLVMValueRef args[] = { + tmp, + ngg_get_query_buf(ctx), + offset, + LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", + ctx->ac.i32, args, 5, 0); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + } /* Determine vertex liveness. */ LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); @@ -1152,13 +1847,7 @@ /* Load primitive liveness */ tmp = ngg_gs_vertex_ptr(ctx, primidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* stream 0 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -1179,7 +1868,7 @@ vertlive_scan.enable_reduce = true; vertlive_scan.enable_exclusive = true; vertlive_scan.src = vertlive; - vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->i32_0); + vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); vertlive_scan.waveidx = get_wave_id_in_tg(ctx); vertlive_scan.numwaves = get_tgsize(ctx); vertlive_scan.maxwaves = 8; @@ -1204,7 +1893,8 @@ * there are 4 or more contiguous null primitives in the export * (in the common case of single-dword prim exports). */ - build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + vertlive_scan.result_reduce, num_emit_threads); /* Setup the reverse vertex compaction permutation. We re-use stream 1 * of the primitive liveness flags, relying on the fact that each @@ -1212,14 +1902,8 @@ ac_build_ifcc(&ctx->ac, vertlive, 5130); { tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, tmp); + LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); } ac_build_endif(&ctx->ac, 5130); @@ -1229,19 +1913,13 @@ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5140); { - struct ngg_prim prim = {}; + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; prim.num_vertices = verts_per_prim; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* primflag */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, - LLVMConstInt(ctx->ac.i8, 0, false), ""); + flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, @@ -1249,7 +1927,21 @@ prim.edgeflag[i] = ctx->ac.i1false; } - build_export_prim(ctx, &prim); + /* Geometry shaders output triangle strips, but NGG expects triangles. */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = + LLVMBuildICmp(builder, LLVMIntEQ, + si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), + ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, + flatshade_first, + prim.index); + } + + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 5140); @@ -1260,25 +1952,17 @@ struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1: source data index */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); unsigned out_idx = 0; - gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < info->num_outputs; i++) { outputs[i].semantic_name = info->output_semantic_name[i]; outputs[i].semantic_index = info->output_semantic_index[i]; for (unsigned j = 0; j < 4; j++, out_idx++) { - gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); tmp = LLVMBuildLoad(builder, tmp, ""); outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); outputs[i].vertex_stream[j] = @@ -1286,7 +1970,7 @@ } } - si_llvm_export_vs(ctx, outputs, info->num_outputs); + si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); } ac_build_endif(&ctx->ac, 5145); } @@ -1335,8 +2019,16 @@ /* All these are per subgroup: */ bool max_vert_out_per_gs_instance = false; - unsigned max_esverts_base = 128; unsigned max_gsprims_base = 128; /* default prim group size clamp */ + unsigned max_esverts_base = 128; + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + max_gsprims_base = 128 / 3; + max_esverts_base = max_gsprims_base * 3; + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + max_gsprims_base = 126; + max_esverts_base = 128; + } /* Hardware has the following non-natural restrictions on the value * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of @@ -1372,15 +2064,6 @@ /* VS and TES. */ /* LDS size for passing data from ES to GS. */ esvert_lds_size = ngg_nogs_vertex_size(shader); - - /* LDS size for passing data from GS to ES. - * GS stores Primitive IDs into LDS at the address corresponding - * to the ES thread of the provoking vertex. All ES threads - * load and export PrimitiveID for their thread. - */ - if (gs_sel->type == PIPE_SHADER_VERTEX && - shader->key.mono.u.vs_export_prim_id) - esvert_lds_size = MAX2(esvert_lds_size, 1); } unsigned max_gsprims = max_gsprims_base; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/Makefile.sources mesa-20.0.8/src/gallium/drivers/radeonsi/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/radeonsi/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -19,7 +19,6 @@ si_cp_dma.c \ si_debug.c \ si_descriptors.c \ - si_dma.c \ si_dma_cs.c \ si_fence.c \ si_get.c \ @@ -36,10 +35,13 @@ si_shader.c \ si_shader.h \ si_shader_internal.h \ + si_shader_llvm.c \ + si_shader_llvm_gs.c \ + si_shader_llvm_ps.c \ + si_shader_llvm_resources.c \ + si_shader_llvm_tess.c \ + si_shader_llvm_vs.c \ si_shader_nir.c \ - si_shader_tgsi_alu.c \ - si_shader_tgsi_mem.c \ - si_shader_tgsi_setup.c \ si_shaderlib_tgsi.c \ si_state.c \ si_state_binning.c \ diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/meson.build mesa-20.0.8/src/gallium/drivers/radeonsi/meson.build --- mesa-19.2.8/src/gallium/drivers/radeonsi/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,6 @@ 'si_cp_dma.c', 'si_debug.c', 'si_descriptors.c', - 'si_dma.c', 'si_dma_cs.c', 'si_fence.c', 'si_get.c', @@ -51,10 +50,13 @@ 'si_shader.c', 'si_shader.h', 'si_shader_internal.h', + 'si_shader_llvm.c', + 'si_shader_llvm_gs.c', + 'si_shader_llvm_ps.c', + 'si_shader_llvm_resources.c', + 'si_shader_llvm_tess.c', + 'si_shader_llvm_vs.c', 'si_shader_nir.c', - 'si_shader_tgsi_alu.c', - 'si_shader_tgsi_mem.c', - 'si_shader_tgsi_setup.c', 'si_shaderlib_tgsi.c', 'si_state.c', 'si_state.h', @@ -105,7 +107,7 @@ 'gfx10_format_table.h', input : files( 'gfx10_format_table.py', - '../../auxiliary/util/u_format.csv', '../../../amd/registers/gfx10-rsrc.json' + '../../../util/format/u_format.csv', '../../../amd/registers/gfx10-rsrc.json' ), output : 'gfx10_format_table.h', command : [prog_python, '@INPUT@'], @@ -117,7 +119,7 @@ 'radeonsi', [files_libradeonsi, si_driinfo_h, sid_tables_h, gfx10_format_table_h], include_directories : [ - inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common, + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common, inc_amd_common_llvm, inc_gallium_drivers, ], c_args : ['-Wstrict-overflow=0', c_vis_args], @@ -129,7 +131,7 @@ compile_args : '-DGALLIUM_RADEONSI', sources : si_driinfo_h, link_with : [ - libradeonsi, libradeonwinsys, libamdgpuwinsys, libamd_common, + libradeonsi, libradeonwinsys, libamdgpuwinsys, libamd_common, libamd_common_llvm, libgalliumvl ], dependencies : idep_nir, ) diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_blit.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_blit.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "si_pipe.h" #include "si_compute.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_log.h" #include "util/u_surface.h" @@ -99,6 +99,7 @@ * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } @@ -432,7 +433,8 @@ struct si_texture *tex, unsigned first_level, unsigned last_level, unsigned first_layer, unsigned last_layer, - bool need_dcc_decompress) + bool need_dcc_decompress, + bool need_fmask_expand) { void* custom_blend; unsigned layer, checked_last_layer, max_layer; @@ -442,7 +444,7 @@ if (!need_dcc_decompress) level_mask &= tex->dirty_level_mask; if (!level_mask) - return; + goto expand_fmask; if (unlikely(sctx->log)) u_log_printf(sctx->log, @@ -453,7 +455,7 @@ if (need_dcc_decompress) { custom_blend = sctx->custom_blend_dcc_decompress; - assert(tex->dcc_offset); + assert(tex->surface.dcc_offset); /* disable levels without DCC */ for (int i = first_level; i <= last_level; i++) { @@ -512,19 +514,26 @@ si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, vi_dcc_enabled(tex, first_level), tex->surface.u.gfx9.dcc.pipe_aligned); + +expand_fmask: + if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) { + si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b); + tex->fmask_is_identity = true; + } } static void si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex, - unsigned first_level, unsigned last_level) + unsigned first_level, unsigned last_level, + bool need_fmask_expand) { /* CMASK or DCC can be discarded and we can still end up here. */ - if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->dcc_offset) + if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset) return; si_blit_decompress_color(sctx, tex, first_level, last_level, 0, util_max_layer(&tex->buffer.b.b, first_level), - false); + false, need_fmask_expand); } static void @@ -546,7 +555,7 @@ tex = (struct si_texture *)view->texture; si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level); + view->u.tex.last_level, false); } } @@ -569,7 +578,8 @@ tex = (struct si_texture *)view->resource; si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level); + view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); } } @@ -582,7 +592,7 @@ { bool render_feedback = false; - if (!tex->dcc_offset) + if (!tex->surface.dcc_offset) return; for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) { @@ -729,7 +739,7 @@ struct si_texture *tex = (struct si_texture *)view->texture; si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level); + view->u.tex.last_level, false); } util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress, @@ -753,7 +763,8 @@ struct si_texture *tex = (struct si_texture *)view->resource; si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level); + view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); } } @@ -798,7 +809,7 @@ si_decompress_color_texture(sctx, (struct si_texture*)cb0->texture, cb0->u.tex.first_layer, - cb0->u.tex.last_layer); + cb0->u.tex.last_layer, false); } si_check_render_feedback(sctx); @@ -814,10 +825,10 @@ * blitting if any decompression is needed. * The driver doesn't decompress resources automatically while u_blitter is * rendering. */ -static void si_decompress_subresource(struct pipe_context *ctx, - struct pipe_resource *tex, - unsigned planes, unsigned level, - unsigned first_layer, unsigned last_layer) +void si_decompress_subresource(struct pipe_context *ctx, + struct pipe_resource *tex, + unsigned planes, unsigned level, + unsigned first_layer, unsigned last_layer) { struct si_context *sctx = (struct si_context *)ctx; struct si_texture *stex = (struct si_texture*)tex; @@ -840,7 +851,7 @@ si_decompress_depth(sctx, stex, planes, level, level, first_layer, last_layer); - } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->dcc_offset) { + } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) { /* If we've rendered into the framebuffer and it's a blitting * source, make sure the decompression pass is invoked * by dirtying the framebuffer. @@ -855,7 +866,7 @@ } si_blit_decompress_color(sctx, stex, level, level, - first_layer, last_layer, false); + first_layer, last_layer, false, false); } } @@ -896,7 +907,7 @@ !util_format_is_compressed(dst->format) && !util_format_is_depth_or_stencil(src->format) && src->nr_samples <= 1 && - !sdst->dcc_offset && + !sdst->surface.dcc_offset && !(dst->target != src->target && (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); @@ -1153,10 +1164,13 @@ templ.array_size = 1; templ.usage = PIPE_USAGE_DEFAULT; templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | + SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE | + SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) | SI_RESOURCE_FLAG_DISABLE_DCC; /* The src and dst microtile modes must be the same. */ - if (src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) + if (sctx->chip_class <= GFX8 && + src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) templ.bind = PIPE_BIND_SCANOUT; else templ.bind = 0; @@ -1203,7 +1217,6 @@ * on failure (recursion). */ if (dst->surface.is_linear && - sctx->dma_copy && util_can_blit_via_copy_region(info, false)) { sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y, @@ -1228,7 +1241,7 @@ info->src.box.z, info->src.box.z + info->src.box.depth - 1); - if (sctx->screen->debug_flags & DBG(FORCE_DMA) && + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info)) return; @@ -1288,13 +1301,15 @@ if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) return; - if (!tex->is_depth && (tex->cmask_buffer || tex->dcc_offset)) { + if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) { si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0), - tex->dcc_separate_buffer != NULL); + tex->dcc_separate_buffer != NULL, false); - if (tex->display_dcc_offset) + if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { si_retile_dcc(sctx, tex); + tex->displayable_dcc_dirty = false; + } } /* Always do the analysis even if DCC is disabled at the moment. */ @@ -1333,12 +1348,12 @@ /* If graphics is disabled, we can't decompress DCC, but it shouldn't * be compressed either. The caller should simply discard it. */ - if (!tex->dcc_offset || !sctx->has_graphics) + if (!tex->surface.dcc_offset || !sctx->has_graphics) return; si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, util_max_layer(&tex->buffer.b.b, 0), - true); + true, false); } void si_init_blit_functions(struct si_context *sctx) diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_buffer.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_buffer.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,8 +36,8 @@ if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) { return true; } - if (radeon_emitted(sctx->dma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->dma_cs, buf, usage)) { + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) { return true; } return false; @@ -72,8 +72,8 @@ busy = true; } } - if (radeon_emitted(sctx->dma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->dma_cs, + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, resource->buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL); @@ -91,8 +91,8 @@ /* We will be wait for the GPU. Wait for any offloaded * CS flush to complete to avoid busy-waiting in the winsys. */ sctx->ws->cs_sync_flush(sctx->gfx_cs); - if (sctx->dma_cs) - sctx->ws->cs_sync_flush(sctx->dma_cs); + if (sctx->sdma_cs) + sctx->ws->cs_sync_flush(sctx->sdma_cs); } } @@ -503,9 +503,9 @@ box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT))); if (staging) { /* Copy the VRAM buffer to the staging buffer. */ - sctx->dma_copy(ctx, &staging->b.b, 0, - box->x % SI_MAP_BUFFER_ALIGNMENT, - 0, 0, resource, 0, box); + si_sdma_copy_buffer(sctx, &staging->b.b, resource, + box->x % SI_MAP_BUFFER_ALIGNMENT, + box->x, box->width); data = si_buffer_map_sync_with_rings(sctx, staging, usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); @@ -590,7 +590,7 @@ box->x, src_offset, box->width); } - util_range_add(&buf->valid_buffer_range, box->x, + util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, box->x + box->width); } @@ -744,8 +744,8 @@ buf->domains = RADEON_DOMAIN_GTT; buf->flags = 0; buf->b.is_user_ptr = true; - util_range_add(&buf->valid_buffer_range, 0, templ->width0); - util_range_add(&buf->b.valid_buffer_range, 0, templ->width0); + util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); + util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); /* Convert a user pointer to a buffer. */ buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); @@ -791,13 +791,14 @@ res->buf, RADEON_USAGE_READWRITE)) { si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } - if (radeon_emitted(ctx->dma_cs, 0) && - ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, + if (radeon_emitted(ctx->sdma_cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) { si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); } - ctx->ws->cs_sync_flush(ctx->dma_cs); + if (ctx->sdma_cs) + ctx->ws->cs_sync_flush(ctx->sdma_cs); ctx->ws->cs_sync_flush(ctx->gfx_cs); assert(resource->target == PIPE_BUFFER); diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_clear.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_clear.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_clear.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "si_pipe.h" #include "sid.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_pack_color.h" #include "util/u_surface.h" @@ -245,7 +245,7 @@ dcc_offset = 0; } else { dcc_buffer = &tex->buffer.b.b; - dcc_offset = tex->dcc_offset; + dcc_offset = tex->surface.dcc_offset; } if (sctx->chip_class >= GFX9) { @@ -400,7 +400,7 @@ int i; /* This function is broken in BE, so just disable this path for now */ -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN return; #endif @@ -510,12 +510,13 @@ continue; tex->separate_dcc_dirty = true; + tex->displayable_dcc_dirty = true; /* DCC fast clear with MSAA should clear CMASK to 0xC. */ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { uint32_t clear_value = 0xCCCCCCCC; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->cmask_offset, tex->surface.cmask_size, + tex->surface.cmask_offset, tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); fmask_decompress_needed = true; } @@ -540,7 +541,7 @@ /* Do the fast clear. */ uint32_t clear_value = 0; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->cmask_offset, tex->surface.cmask_size, + tex->surface.cmask_offset, tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); eliminate_needed = true; } @@ -559,7 +560,7 @@ /* Chips with DCC constant encoding don't need to set the clear * color registers for DCC clear values 0 and 1. */ - if (sctx->screen->has_dcc_constant_encode && !eliminate_needed) + if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed) continue; if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) { @@ -692,7 +693,7 @@ struct si_context *sctx = (struct si_context *)ctx; struct si_texture *sdst = (struct si_texture*)dst->texture; - if (dst->texture->nr_samples <= 1 && !sdst->dcc_offset) { + if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) { si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height, render_condition_enabled); return; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute_blit.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute_blit.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ */ #include "si_pipe.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/format_srgb.h" /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst @@ -59,18 +59,93 @@ } } -static void si_compute_internal_begin(struct si_context *sctx) +static void si_launch_grid_internal(struct si_context *sctx, + struct pipe_grid_info *info) { + /* Set settings for driver-internal compute dispatches. */ sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; sctx->render_cond_force_off = true; -} + /* Skip decompression to prevent infinite recursion. */ + sctx->blitter->running = true; -static void si_compute_internal_end(struct si_context *sctx) -{ + /* Dispatch compute. */ + sctx->b.launch_grid(&sctx->b, info); + + /* Restore default settings. */ sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; sctx->render_cond_force_off = false; + sctx->blitter->running = false; +} + +static void si_compute_clear_12bytes_buffer(struct si_context *sctx, + struct pipe_resource *dst, + unsigned dst_offset, + unsigned size, + const uint32_t *clear_value, + enum si_coherency coher) +{ + struct pipe_context *ctx = &sctx->b; + + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + unsigned size_12 = DIV_ROUND_UP(size, 12); + + unsigned data[4] = {0}; + memcpy(data, clear_value, 12); + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + + struct pipe_shader_buffer saved_sb = {0}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); + + unsigned saved_writable_mask = 0; + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(0))) + saved_writable_mask = 1; + + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + void *saved_cs = sctx->cs_shader_state.program; + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + + struct pipe_shader_buffer sb = {0}; + sb.buffer = dst; + sb.buffer_offset = dst_offset; + sb.buffer_size = size; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); + + struct pipe_grid_info info = {0}; + + if (!sctx->cs_clear_12bytes_buffer) + sctx->cs_clear_12bytes_buffer = + si_clear_12bytes_buffer_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer); + info.block[0] = 64; + info.last_block[0] = size_12 % 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(size_12, 64); + info.grid[1] = 1; + info.grid[2] = 1; + + si_launch_grid_internal(sctx, &info); + + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + pipe_resource_reference(&saved_sb.buffer, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } static void si_compute_do_clear_or_copy(struct si_context *sctx, @@ -92,7 +167,6 @@ assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); assert(!src || src_offset + size <= src->width0); - si_compute_internal_begin(sctx); sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); @@ -169,7 +243,7 @@ ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); } - ctx->launch_grid(ctx, &info); + si_launch_grid_internal(sctx, &info); enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | @@ -182,7 +256,8 @@ ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); - si_compute_internal_end(sctx); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_sb[i].buffer, NULL); } void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, @@ -231,17 +306,8 @@ clear_value_size = 4; } - /* Use transform feedback for 12-byte clears. */ - /* TODO: Use compute. */ if (clear_value_size == 12) { - union pipe_color_union streamout_clear_value; - - memcpy(&streamout_clear_value, clear_value, clear_value_size); - si_blitter_begin(sctx, SI_DISABLE_RENDER_COND); - util_blitter_clear_buffer(sctx->blitter, dst, offset, - size, clear_value_size / 4, - &streamout_clear_value); - si_blitter_end(sctx); + si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher); return; } @@ -255,7 +321,7 @@ (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 && - (size > 32*1024 || sctx->chip_class <= GFX8))) { + (size > 32*1024 || sctx->chip_class <= GFX9))) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, clear_value_size, coher); @@ -326,16 +392,36 @@ unsigned width = src_box->width; unsigned height = src_box->height; unsigned depth = src_box->depth; + enum pipe_format src_format = util_format_linear(src->format); + enum pipe_format dst_format = util_format_linear(dst->format); + + assert(util_format_is_subsampled_422(src_format) == + util_format_is_subsampled_422(dst_format)); - unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0}; + if (util_format_is_subsampled_422(src_format)) + src_format = dst_format = PIPE_FORMAT_R32_UINT; + + unsigned x_div = util_format_get_blockwidth(src->format) / + util_format_get_blockwidth(src_format); + assert(src_box->x % x_div == 0); + assert(width % x_div == 0); + + unsigned data[] = {src_box->x / x_div, src_box->y, src_box->z, 0, + dstx / x_div, dsty, dstz, 0}; + width /= x_div; if (width == 0 || height == 0) return; - si_compute_internal_begin(sctx); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + /* The driver doesn't decompress resources automatically here. */ + si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, + dstz, dstz + src_box->depth - 1); + si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, + src_box->z, src_box->z + src_box->depth - 1); + /* src and dst have the same number of samples. */ si_make_CB_shader_coherent(sctx, src->nr_samples, true, /* Only src can have DCC.*/ @@ -359,7 +445,7 @@ struct pipe_image_view image[2] = {0}; image[0].resource = src; image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; - image[0].format = util_format_linear(src->format); + image[0].format = src_format; image[0].u.tex.level = src_level; image[0].u.tex.first_layer = 0; image[0].u.tex.last_layer = @@ -367,7 +453,7 @@ : (unsigned)(src->array_size - 1); image[1].resource = dst; image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; - image[1].format = util_format_linear(dst->format); + image[1].format = dst_format; image[1].u.tex.level = dst_level; image[1].u.tex.first_layer = 0; image[1].u.tex.last_layer = @@ -416,7 +502,7 @@ info.grid[2] = depth; } - ctx->launch_grid(ctx, &info); + si_launch_grid_internal(sctx, &info); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | @@ -424,7 +510,9 @@ ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - si_compute_internal_end(sctx); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_image[i].resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) @@ -451,9 +539,9 @@ unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; struct pipe_image_view img[3]; - assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX); - assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX); - assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX); + assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX); + assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX); + assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); for (unsigned i = 0; i < 3; i++) { img[i].resource = &tex->buffer.b.b; @@ -463,15 +551,15 @@ img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT; - img[0].u.buf.offset = tex->dcc_retile_map_offset; + img[0].u.buf.offset = tex->surface.dcc_retile_map_offset; img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4); img[1].format = PIPE_FORMAT_R8_UINT; - img[1].u.buf.offset = tex->dcc_offset; + img[1].u.buf.offset = tex->surface.dcc_offset; img[1].u.buf.size = tex->surface.dcc_size; img[2].format = PIPE_FORMAT_R8_UINT; - img[2].u.buf.offset = tex->display_dcc_offset; + img[2].u.buf.offset = tex->surface.display_dcc_offset; img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size; ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img); @@ -494,7 +582,7 @@ info.grid[2] = 1; info.last_block[0] = num_threads % 64; - ctx->launch_grid(ctx, &info); + si_launch_grid_internal(sctx, &info); /* Don't flush caches or wait. The driver will wait at the end of this IB, * and L2 will be flushed by the kernel fence. @@ -503,6 +591,93 @@ /* Restore states. */ ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); + + for (unsigned i = 0; i < 3; i++) { + pipe_resource_reference(&saved_img[i].resource, NULL); + } +} + +/* Expand FMASK to make it identity, so that image stores can ignore it. */ +void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex) +{ + struct si_context *sctx = (struct si_context *)ctx; + bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; + unsigned log_fragments = util_logbase2(tex->nr_storage_samples); + unsigned log_samples = util_logbase2(tex->nr_samples); + assert(tex->nr_samples >= 2); + + /* EQAA FMASK expansion is unimplemented. */ + if (tex->nr_samples != tex->nr_storage_samples) + return; + + /* Flush caches and sync engines. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_make_CB_shader_coherent(sctx, tex->nr_samples, true, + true /* DCC is not possible with image stores */); + + /* Save states. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_image_view saved_image = {0}; + util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); + + /* Bind the image. */ + struct pipe_image_view image = {0}; + image.resource = tex; + /* Don't set WRITE so as not to trigger FMASK expansion, causing + * an infinite loop. */ + image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; + image.format = util_format_linear(tex->format); + if (is_array) + image.u.tex.last_layer = tex->array_size - 1; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); + + /* Bind the shader. */ + void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; + if (!*shader) + *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); + ctx->bind_compute_state(ctx, *shader); + + /* Dispatch compute. */ + struct pipe_grid_info info = {0}; + info.block[0] = 8; + info.last_block[0] = tex->width0 % 8; + info.block[1] = 8; + info.last_block[1] = tex->height0 % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(tex->width0, 8); + info.grid[1] = DIV_ROUND_UP(tex->height0, 8); + info.grid[2] = is_array ? tex->array_size : 1; + + si_launch_grid_internal(sctx, &info); + + /* Flush caches and sync engines. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | + (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + + /* Restore previous states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); + pipe_resource_reference(&saved_image.resource, NULL); + + /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ +#define INVALID 0 /* never used */ + static const uint64_t fmask_expand_values[][4] = { + /* samples */ + /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ + {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ + {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ + {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ + {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ + }; + + /* Clear FMASK to identity. */ + struct si_texture *stex = (struct si_texture*)tex; + si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, + (uint32_t*)&fmask_expand_values[log_fragments][log_samples - 1], + 4, SI_COHERENCY_SHADER, false); } void si_init_compute_blit_functions(struct si_context *sctx) @@ -525,6 +700,11 @@ if (width == 0 || height == 0) return; + /* The driver doesn't decompress resources automatically here. */ + si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, + dstsurf->u.tex.level, dstsurf->u.tex.first_layer, + dstsurf->u.tex.last_layer); + if (util_format_is_srgb(dstsurf->format)) { union pipe_color_union color_srgb; for (int i = 0; i < 3; i++) @@ -535,7 +715,6 @@ memcpy(data + 4, color->ui, sizeof(color->ui)); } - si_compute_internal_begin(sctx); sctx->render_cond_force_off = !render_condition_enabled; sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | @@ -595,7 +774,7 @@ info.grid[2] = 1; } - ctx->launch_grid(ctx, &info); + si_launch_grid_internal(sctx, &info); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | @@ -603,5 +782,6 @@ ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - si_compute_internal_end(sctx); + pipe_resource_reference(&saved_image.resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,6 @@ */ #include "nir/tgsi_to_nir.h" -#include "tgsi/tgsi_parse.h" #include "util/u_async_debug.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" @@ -121,17 +120,13 @@ assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - tgsi_scan_shader(sel->tokens, &sel->info); - } else { - assert(program->ir_type == PIPE_SHADER_IR_NIR); + if (!compiler->passes) + si_init_compiler(sscreen, compiler); - si_nir_opts(sel->nir); - si_nir_scan_shader(sel->nir, &sel->info); - si_lower_nir(sel); - } + assert(program->ir_type == PIPE_SHADER_IR_NIR); + si_nir_scan_shader(sel->nir, &sel->info); - /* Store the declared LDS size into tgsi_shader_info for the shader + /* Store the declared LDS size into si_shader_info for the shader * cache to include it. */ sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size; @@ -147,14 +142,14 @@ program->num_cs_user_data_dwords = sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - void *ir_binary = si_get_ir_binary(sel, false, false); + unsigned char ir_sha1_cache_key[20]; + si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); /* Try to load the shader from the shader cache. */ - mtx_lock(&sscreen->shader_cache_mutex); + simple_mtx_lock(&sscreen->shader_cache_mutex); - if (ir_binary && - si_shader_cache_load_shader(sscreen, ir_binary, shader)) { - mtx_unlock(&sscreen->shader_cache_mutex); + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); si_shader_dump_stats_for_shader_db(sscreen, shader, debug); si_shader_dump(sscreen, shader, debug, stderr, true); @@ -162,13 +157,10 @@ if (!si_shader_binary_upload(sscreen, shader, 0)) program->shader.compilation_failed = true; } else { - mtx_unlock(&sscreen->shader_cache_mutex); + simple_mtx_unlock(&sscreen->shader_cache_mutex); - if (!si_shader_create(sscreen, compiler, &program->shader, debug)) { + if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) { program->shader.compilation_failed = true; - - if (program->ir_type == PIPE_SHADER_IR_TGSI) - FREE(sel->tokens); return; } @@ -197,20 +189,19 @@ S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) | S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) | S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) | + S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) | S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] ? 2 : sel->info.uses_thread_id[1] ? 1 : 0) | S_00B84C_LDS_SIZE(shader->config.lds_size); - if (ir_binary) { - mtx_lock(&sscreen->shader_cache_mutex); - if (!si_shader_cache_insert_shader(sscreen, ir_binary, shader, true)) - FREE(ir_binary); - mtx_unlock(&sscreen->shader_cache_mutex); - } + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, + shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); } - if (program->ir_type == PIPE_SHADER_IR_TGSI) - FREE(sel->tokens); + ralloc_free(sel->nir); + sel->nir = NULL; } static void *si_create_compute_state( @@ -222,7 +213,7 @@ struct si_compute *program = CALLOC_STRUCT(si_compute); struct si_shader_selector *sel = &program->sel; - pipe_reference_init(&sel->reference, 1); + pipe_reference_init(&sel->base.reference, 1); sel->type = PIPE_SHADER_COMPUTE; sel->screen = sscreen; program->shader.selector = &program->sel; @@ -232,16 +223,9 @@ program->input_size = cso->req_input_mem; if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { - if (sscreen->options.enable_nir && - cso->ir_type == PIPE_SHADER_IR_TGSI) { + if (cso->ir_type == PIPE_SHADER_IR_TGSI) { program->ir_type = PIPE_SHADER_IR_NIR; sel->nir = tgsi_to_nir(cso->prog, ctx->screen); - } else if (cso->ir_type == PIPE_SHADER_IR_TGSI) { - sel->tokens = tgsi_dup_tokens(cso->prog); - if (!sel->tokens) { - FREE(program); - return NULL; - } } else { assert(cso->ir_type == PIPE_SHADER_IR_NIR); sel->nir = (struct nir_shader *) cso->prog; @@ -256,10 +240,8 @@ &sel->compiler_ctx_state, program, si_create_compute_state_async); } else { - const struct pipe_llvm_program_header *header; - const char *code; + const struct pipe_binary_program_header *header; header = cso->prog; - code = cso->prog + sizeof(struct pipe_llvm_program_header); program->shader.binary.elf_size = header->num_bytes; program->shader.binary.elf_buffer = malloc(header->num_bytes); @@ -267,7 +249,7 @@ FREE(program); return NULL; } - memcpy((void *)program->shader.binary.elf_buffer, code, header->num_bytes); + memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes); const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0); @@ -422,7 +404,8 @@ si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - scratch_needed, 256); + scratch_needed, + sctx->screen->info.pte_fragment_size); if (!sctx->compute_scratch_buffer) return false; @@ -718,8 +701,8 @@ return true; } -static void si_setup_tgsi_user_data(struct si_context *sctx, - const struct pipe_grid_info *info) +static void si_setup_nir_user_data(struct si_context *sctx, + const struct pipe_grid_info *info) { struct si_compute *program = sctx->cs_shader_state.program; struct si_shader_selector *sel = &program->sel; @@ -943,7 +926,7 @@ } if (program->ir_type != PIPE_SHADER_IR_NATIVE) - si_setup_tgsi_user_data(sctx, info); + si_setup_nir_user_data(sctx, info); si_emit_dispatch_packets(sctx, info); diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute.h 2020-06-12 01:21:17.000000000 +0000 @@ -50,7 +50,7 @@ static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src) { - if (pipe_reference(&(*dst)->sel.reference, &src->sel.reference)) + if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference)) si_destroy_compute(*dst); *dst = src; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute_prim_discard.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute_prim_discard.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_compute_prim_discard.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_compute_prim_discard.c 2020-06-12 01:21:17.000000000 +0000 @@ -187,36 +187,38 @@ /* For emulating the rewind packet on CI. */ #define FORCE_REWIND_EMULATION 0 -void si_initialize_prim_discard_tunables(struct si_context *sctx) +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, + bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib) { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - if (sctx->chip_class == GFX6 || /* SI support is not implemented */ - !sctx->screen->info.has_gds_ordered_append || - sctx->screen->debug_flags & DBG(NO_PD) || - /* If aux_context == NULL, we are initializing aux_context right now. */ - !sctx->screen->aux_context) + if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ + !sscreen->info.has_gds_ordered_append || + sscreen->debug_flags & DBG(NO_PD) || + is_aux_context) return; /* TODO: enable this after the GDS kernel memory management is fixed */ bool enable_on_pro_graphics_by_default = false; - if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || - sctx->screen->debug_flags & DBG(PD) || + if (sscreen->debug_flags & DBG(ALWAYS_PD) || + sscreen->debug_flags & DBG(PD) || (enable_on_pro_graphics_by_default && - sctx->screen->info.is_pro_graphics && - (sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_HAWAII || - sctx->family == CHIP_TONGA || - sctx->family == CHIP_FIJI || - sctx->family == CHIP_POLARIS10 || - sctx->family == CHIP_POLARIS11 || - sctx->family == CHIP_VEGA10 || - sctx->family == CHIP_VEGA20))) { - sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + sscreen->info.is_pro_graphics && + (sscreen->info.family == CHIP_BONAIRE || + sscreen->info.family == CHIP_HAWAII || + sscreen->info.family == CHIP_TONGA || + sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || + sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_VEGA20))) { + *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) - sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ + if (sscreen->debug_flags & DBG(ALWAYS_PD)) + *prim_discard_vertex_count_threshold = 0; /* always enable */ const uint32_t MB = 1024 * 1024; const uint64_t GB = 1024 * 1024 * 1024; @@ -224,12 +226,12 @@ /* The total size is double this per context. * Greater numbers allow bigger gfx IBs. */ - if (sctx->screen->info.vram_size <= 2 * GB) - sctx->index_ring_size_per_ib = 64 * MB; - else if (sctx->screen->info.vram_size <= 4 * GB) - sctx->index_ring_size_per_ib = 128 * MB; + if (sscreen->info.vram_size <= 2 * GB) + *index_ring_size_per_ib = 64 * MB; + else if (sscreen->info.vram_size <= 4 * GB) + *index_ring_size_per_ib = 128 * MB; else - sctx->index_ring_size_per_ib = 256 * MB; + *index_ring_size_per_ib = 256 * MB; } } @@ -241,28 +243,28 @@ { LLVMValueRef args[] = { LLVMBuildIntToPtr(ctx->ac.builder, m0, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), value, - LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->i32_0, /* scope */ - ctx->i1false, /* volatile */ - LLVMConstInt(ctx->i32, ordered_count_index, 0), - LLVMConstInt(ctx->i1, release, 0), - LLVMConstInt(ctx->i1, done, 0), + LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->ac.i32_0, /* scope */ + ctx->ac.i1false, /* volatile */ + LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), + LLVMConstInt(ctx->ac.i1, release, 0), + LLVMConstInt(ctx->ac.i1, done, 0), }; char intrinsic[64]; snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); } static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) { uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); } struct si_thread0_section { @@ -277,7 +279,7 @@ LLVMValueRef thread_id) { section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); /* This IF has 4 instructions: * v_and_b32_e32 v, 63, v ; get the thread ID @@ -289,7 +291,7 @@ */ ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, - ctx->i32_0, ""), 12601); + ctx->ac.i32_0, ""), 12601); } /* Exit a section that only executes on thread 0 and broadcast the result @@ -318,50 +320,51 @@ ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); LLVMSetLinkage(vs, LLVMPrivateLinkage); - LLVMTypeRef const_desc_type; + enum ac_arg_type const_desc_type; if (ctx->shader->selector->info.const_buffers_declared == 1 && ctx->shader->selector->info.shader_buffers_declared == 0) - const_desc_type = ctx->f32; + const_desc_type = AC_ARG_CONST_FLOAT_PTR; else - const_desc_type = ctx->v4i32; + const_desc_type = AC_ARG_CONST_DESC_PTR; - struct si_function_info fninfo; - si_init_function_info(&fninfo); + memset(&ctx->args, 0, sizeof(ctx->args)); - LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; - LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; - LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; - LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; - LLVMValueRef last_wave_prim_id, vertex_count_addr; - - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &index_buffers_and_constants); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), - &vb_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), - &const_desc); - add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), - &sampler_desc); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); - add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); + struct ac_arg param_index_buffers_and_constants, param_vertex_counter; + struct ac_arg param_vb_desc, param_const_desc; + struct ac_arg param_base_vertex, param_start_instance; + struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; + struct ac_arg param_restart_index, param_smallprim_precision; + struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; + struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + ¶m_index_buffers_and_constants); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + ¶m_vb_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, + ¶m_const_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + ¶m_sampler_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); /* Block ID and thread ID inputs. */ - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); if (VERTEX_COUNTER_GDS_MODE == 2) - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); /* Create the compute shader function. */ unsigned old_type = ctx->type; ctx->type = PIPE_SHADER_COMPUTE; - si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); + si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); ctx->type = old_type; if (VERTEX_COUNTER_GDS_MODE == 1) { @@ -376,19 +379,19 @@ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = const_desc; - vs_params[num_vs_params++] = sampler_desc; - vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); + vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = base_vertex; - vs_params[num_vs_params++] = start_instance; - vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ - vs_params[num_vs_params++] = vb_desc; + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); + vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->i32_0; /* unused */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ assert(num_vs_params <= ARRAY_SIZE(vs_params)); assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); @@ -396,9 +399,10 @@ /* Load descriptors. (load 8 dwords at once) */ LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants); tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); + ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); for (unsigned i = 0; i < 8; i++) desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); @@ -408,17 +412,22 @@ /* Compute PrimID and InstanceID. */ LLVMValueRef global_thread_id = - ac_build_imad(&ctx->ac, block_id, - LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); + ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), + LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), + ac_get_arg(&ctx->ac, param_local_id)); LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->i32_0; + LLVMValueRef instance_id = ctx->ac.i32_0; if (key->opt.cs_instancing) { + LLVMValueRef num_prims_udiv_terms = + ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); + LLVMValueRef num_prims_udiv_multiplier = + ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); /* Unpack num_prims_udiv_terms. */ LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 0x1f, 0), ""); + LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->i32, 5, 0), ""); + LLVMConstInt(ctx->ac.i32, 5, 0), ""); /* Divide the total prim_id by the number of prims per instance. */ instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, @@ -430,21 +439,21 @@ } /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; unsigned vertices_per_prim = 3; switch (key->opt.cs_prim_type) { case PIPE_PRIM_TRIANGLES: for (unsigned i = 0; i < 3; i++) { index[i] = ac_build_imad(&ctx->ac, prim_id, - LLVMConstInt(ctx->i32, 3, 0), - LLVMConstInt(ctx->i32, i, 0)); + LLVMConstInt(ctx->ac.i32, 3, 0), + LLVMConstInt(ctx->ac.i32, i, 0)); } break; case PIPE_PRIM_TRIANGLE_STRIP: for (unsigned i = 0; i < 3; i++) { index[i] = LLVMBuildAdd(builder, prim_id, - LLVMConstInt(ctx->i32, i, 0), ""); + LLVMConstInt(ctx->ac.i32, i, 0), ""); } break; case PIPE_PRIM_TRIANGLE_FAN: @@ -454,13 +463,13 @@ * gl_VertexID is preserved, because it's equal to the index. */ if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); - index[2] = ctx->i32_0; + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + index[2] = ctx->ac.i32_0; } else { - index[0] = ctx->i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + index[0] = ctx->ac.i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); } break; default: @@ -471,35 +480,39 @@ if (key->opt.cs_indexed) { for (unsigned i = 0; i < 3; i++) { index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, - index[i], ctx->i32_0, 1, + index[i], ctx->ac.i32_0, 1, 0, true); index[i] = ac_to_integer(&ctx->ac, index[i]); } } + LLVMValueRef ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); + /* Extract the ordered wave ID. */ if (VERTEX_COUNTER_GDS_MODE == 2) { ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 6, 0), ""); + LLVMConstInt(ctx->ac.i32, 6, 0), ""); ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, - LLVMConstInt(ctx->i32, 0xfff, 0), ""); + LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); } LLVMValueRef thread_id = - LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); + LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), + LLVMConstInt(ctx->ac.i32, 63, 0), ""); /* Every other triangle in a strip has a reversed vertex order, so we * need to swap vertices of odd primitives to get the correct primitive * orientation when converting triangle strips to triangles. Primitive * restart complicates it, because a strip can start anywhere. */ - LLVMValueRef prim_restart_accepted = ctx->i1true; + LLVMValueRef prim_restart_accepted = ctx->ac.i1true; + LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { /* Without primitive restart, odd primitives have reversed orientation. * Only primitive restart can flip it with respect to the first vertex * of the draw call. */ - LLVMValueRef first_is_odd = ctx->i1false; + LLVMValueRef first_is_odd = ctx->ac.i1false; /* Handle primitive restart. */ if (key->opt.cs_primitive_restart) { @@ -510,17 +523,18 @@ */ LLVMValueRef gds_prim_restart_continue = LLVMBuildLShr(builder, vertex_counter, - LLVMConstInt(ctx->i32, 31, 0), ""); + LLVMConstInt(ctx->ac.i32, 31, 0), ""); gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); vertex_counter = LLVMBuildAnd(builder, vertex_counter, - LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); + LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); LLVMValueRef index0_is_reset; for (unsigned i = 0; i < 3; i++) { LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - restart_index, ""); + ac_get_arg(&ctx->ac, param_restart_index), + ""); if (i == 0) index0_is_reset = LLVMBuildNot(builder, not_reset, ""); prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, @@ -540,7 +554,7 @@ LLVMValueRef preceding_threads_mask = LLVMBuildSub(builder, LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""), + LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), ctx->ac.i64_1, ""); LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); @@ -548,10 +562,10 @@ LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, ""); + strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); /* This flips the orientatino based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, ""); + first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; LLVMValueRef is_first_wave, current_wave_resets_index; @@ -565,7 +579,7 @@ * be 64. */ last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, ""); + last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); struct si_thread0_section section; si_enter_thread0_section(ctx, §ion, thread_id); @@ -577,14 +591,14 @@ * NOTE: This will need to be different if we wanna support * instancing with primitive restart. */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, ""); + is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); is_first_wave = LLVMBuildAnd(builder, is_first_wave, LLVMBuildNot(builder, gds_prim_restart_continue, ""), ""); current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, - last_strip_start, ctx->i32_0, ""); + last_strip_start, ctx->ac.i32_0, ""); - ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state"); + ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); /* Save the last strip start primitive index in GDS and read * the value that previous waves stored. @@ -610,7 +624,7 @@ { /* Just read the value from GDS. */ tmp = si_build_ds_ordered_op(ctx, "add", - ordered_wave_id, ctx->i32_0, + ordered_wave_id, ctx->ac.i32_0, 1, true, false); LLVMBuildStore(builder, tmp, ret); } @@ -619,9 +633,9 @@ prev_wave_state = LLVMBuildLoad(builder, ret, ""); /* Ignore the return value if this is the first wave. */ prev_wave_state = LLVMBuildSelect(builder, is_first_wave, - ctx->i32_0, prev_wave_state, ""); + ctx->ac.i32_0, prev_wave_state, ""); si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, ""); + prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); /* If the strip start appears to be on thread 0 for the current primitive * (meaning the reset index is not present in this wave and might have @@ -632,7 +646,7 @@ * the value from the current wave to determine primitive orientation. */ LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, - strip_start, ctx->i32_0, ""); + strip_start, ctx->ac.i32_0, ""); first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, ""); } @@ -640,23 +654,12 @@ /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ LLVMValueRef prim_is_odd = LLVMBuildXor(builder, first_is_odd, - LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); + LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - /* Determine the primitive orientation. - * Only swap the vertices that are not the provoking vertex. We need to keep - * the provoking vertex in place. - */ - if (key->opt.cs_provoking_vertex_first) { - LLVMValueRef index1 = index[1]; - LLVMValueRef index2 = index[2]; - index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); - index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); - } else { - LLVMValueRef index0 = index[0]; - LLVMValueRef index1 = index[1]; - index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); - index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); - } + /* Convert triangle strip indices to triangle indices. */ + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd, + LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), + index); } /* Execute the vertex shader for each vertex to get vertex positions. */ @@ -678,8 +681,9 @@ /* Load the viewport state. */ LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); + LLVMConstInt(ctx->ac.i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); @@ -699,16 +703,18 @@ LLVMValueRef accepted = ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, - vp_scale, vp_translate, smallprim_precision, + vp_scale, vp_translate, + ac_get_arg(&ctx->ac, param_smallprim_precision), &options); + ac_build_optimization_barrier(&ctx->ac, &accepted); LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); /* Count the number of active threads by doing bitcount(accepted). */ LLVMValueRef num_prims_accepted = - ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, + ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); LLVMValueRef start; @@ -718,21 +724,21 @@ { if (VERTEX_COUNTER_GDS_MODE == 0) { LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, LLVMAtomicOrderingMonotonic, false); } else if (VERTEX_COUNTER_GDS_MODE == 1) { LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, - LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""); + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, LLVMAtomicOrderingMonotonic, false); } else if (VERTEX_COUNTER_GDS_MODE == 2) { - LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); + LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); /* If the draw call was split into multiple subdraws, each using * a separate draw packet, we need to start counting from 0 for @@ -763,7 +769,7 @@ /* The GDS address is always 0 with ordered append. */ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true); - LLVMBuildStore(builder, ctx->i32_0, tmp_store); + LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); } ac_build_else(&ctx->ac, 12605); { @@ -788,29 +794,32 @@ if (VERTEX_COUNTER_GDS_MODE == 2) { ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - last_wave_prim_id, ""), 12606); + ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), + 12606); LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); count = LLVMBuildMul(builder, count, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); /* GFX8 needs to disable caching, so that the CP can see the stored value. * MTYPE=3 bypasses TC L2. */ if (ctx->screen->info.chip_class <= GFX8) { LLVMValueRef desc[] = { - vertex_count_addr, - LLVMConstInt(ctx->i32, + ac_get_arg(&ctx->ac, param_vertex_count_addr), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->i32, 4, 0), - LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */), 0), }; LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, - ctx->i32_0, 0, ac_glc | ac_slc, false); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, + ctx->ac.i32_0, 0, ac_glc | ac_slc); } else { LLVMBuildStore(builder, count, - si_expand_32bit_pointer(ctx, vertex_count_addr)); + si_expand_32bit_pointer(ctx, + ac_get_arg(&ctx->ac, + param_vertex_count_addr))); } ac_build_endif(&ctx->ac, 12606); } else { @@ -818,7 +827,7 @@ * primitive count, convert it into the primitive index. */ start = LLVMBuildUDiv(builder, start, - LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); } /* Now we need to store the indices of accepted primitives into @@ -832,7 +841,7 @@ /* We have lowered instancing. Pack the instance ID into vertex ID. */ if (key->opt.cs_instancing) { instance_id = LLVMBuildShl(builder, instance_id, - LLVMConstInt(ctx->i32, 16, 0), ""); + LLVMConstInt(ctx->ac.i32, 16, 0), ""); for (unsigned i = 0; i < vertices_per_prim; i++) index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); @@ -858,7 +867,7 @@ vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, - vindex, ctx->i32_0, 3, + vindex, ctx->ac.i32_0, 3, ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); } ac_build_endif(&ctx->ac, 16607); @@ -922,6 +931,9 @@ sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; sctx->cs_prim_discard_state.current = NULL; + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + struct si_compiler_ctx_state compiler_state; compiler_state.compiler = &sctx->compiler; compiler_state.debug = sctx->debug; @@ -977,7 +989,7 @@ SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, sctx->index_ring_size_per_ib * 2, - 2 * 1024 * 1024); + sctx->screen->info.pte_fragment_size); if (!sctx->index_ring) return false; } @@ -1293,50 +1305,18 @@ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(output_indexbuf_format); - /* Viewport state. - * This is needed by the small primitive culling, because it's done - * in screen space. - */ - float scale[2], translate[2]; - - scale[0] = sctx->viewports.states[0].scale[0]; - scale[1] = sctx->viewports.states[0].scale[1]; - translate[0] = sctx->viewports.states[0].translate[0]; - translate[1] = sctx->viewports.states[0].translate[1]; - - /* The viewport shouldn't flip the X axis for the small prim culling to work. */ - assert(-scale[0] + translate[0] <= scale[0] + translate[0]); - - /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. - * This is because the viewport transformation inverts the clip space - * bounding box, so min becomes max, which breaks small primitive - * culling. - */ - if (sctx->viewports.y_inverted) { - scale[1] = -scale[1]; - translate[1] = -translate[1]; - } - - /* Scale the framebuffer up, so that samples become pixels and small - * primitive culling is the same for all sample counts. - * This only works with the standard DX sample positions, because - * the samples are evenly spaced on both X and Y axes. - */ - unsigned num_samples = sctx->framebuffer.nr_samples; - assert(num_samples >= 1); - - for (unsigned i = 0; i < 2; i++) { - scale[i] *= num_samples; - translate[i] *= num_samples; - } - - desc[8] = fui(scale[0]); - desc[9] = fui(scale[1]); - desc[10] = fui(translate[0]); - desc[11] = fui(translate[1]); + /* Viewport state. */ + struct si_small_prim_cull_info cull_info; + si_get_small_prim_cull_info(sctx, &cull_info); + + desc[8] = fui(cull_info.scale[0]); + desc[9] = fui(cull_info.scale[1]); + desc[10] = fui(cull_info.translate[0]); + desc[11] = fui(cull_info.translate[1]); /* Better subpixel precision increases the efficiency of small * primitive culling. */ + unsigned num_samples = sctx->framebuffer.nr_samples; unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; float small_prim_cull_precision; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_cp_dma.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_cp_dma.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_cp_dma.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_cp_dma.c 2020-06-12 01:21:17.000000000 +0000 @@ -222,7 +222,7 @@ * so that transfer_map knows it should wait for the GPU when mapping * that range. */ if (sdst) - util_range_add(&sdst->valid_buffer_range, offset, offset + size); + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); /* Flush the caches. */ if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { @@ -325,7 +325,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&si_resource(dst)->valid_buffer_range, dst_offset, + util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); } @@ -433,12 +433,12 @@ static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { - if (!sctx->vertex_elements || !sctx->vertex_elements->desc_list_byte_size) + if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size) return; cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, - sctx->vertex_elements->desc_list_byte_size); + sctx->vertex_elements->vb_desc_list_alloc_size); } /** diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_debug.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_debug.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_debug.c 2020-06-12 01:21:17.000000000 +0000 @@ -830,7 +830,7 @@ static void si_dump_descriptors(struct si_context *sctx, enum pipe_shader_type processor, - const struct tgsi_shader_info *info, + const struct si_shader_info *info, struct u_log_context *log) { struct si_descriptors *descs = @@ -868,7 +868,7 @@ desc.list = sctx->vb_descriptors_gpu_list; desc.gpu_list = sctx->vb_descriptors_gpu_list; desc.element_dw_size = 4; - desc.num_active_slots = sctx->vertex_elements->desc_list_byte_size / 16; + desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs, diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_debug_options.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_debug_options.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_debug_options.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_debug_options.h 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,4 @@ OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth clear") -OPT_BOOL(enable_nir, false, "Enable NIR") OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context") OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)") OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps") diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_descriptors.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_descriptors.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_descriptors.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_descriptors.c 2020-06-12 01:21:17.000000000 +0000 @@ -58,7 +58,7 @@ #include "util/hash_table.h" #include "util/u_idalloc.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" @@ -75,134 +75,134 @@ * This is the only reason why the buffer descriptor must be in words [4:7]. */ static uint32_t null_texture_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, + 0, + 0, + S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | + S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint32_t null_image_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, + 0, + 0, + S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint64_t si_desc_extract_buffer_address(const uint32_t *desc) { - uint64_t va = desc[0] | - ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); + uint64_t va = desc[0] | + ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); - /* Sign-extend the 48-bit address. */ - va <<= 16; - va = (int64_t)va >> 16; - return va; + /* Sign-extend the 48-bit address. */ + va <<= 16; + va = (int64_t)va >> 16; + return va; } static void si_init_descriptor_list(uint32_t *desc_list, - unsigned element_dw_size, - unsigned num_elements, - const uint32_t *null_descriptor) -{ - int i; - - /* Initialize the array to NULL descriptors if the element size is 8. */ - if (null_descriptor) { - assert(element_dw_size % 8 == 0); - for (i = 0; i < num_elements * element_dw_size / 8; i++) - memcpy(desc_list + i * 8, null_descriptor, 8 * 4); - } + unsigned element_dw_size, + unsigned num_elements, + const uint32_t *null_descriptor) +{ + int i; + + /* Initialize the array to NULL descriptors if the element size is 8. */ + if (null_descriptor) { + assert(element_dw_size % 8 == 0); + for (i = 0; i < num_elements * element_dw_size / 8; i++) + memcpy(desc_list + i * 8, null_descriptor, 8 * 4); + } } static void si_init_descriptors(struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned element_dw_size, - unsigned num_elements) -{ - desc->list = CALLOC(num_elements, element_dw_size * 4); - desc->element_dw_size = element_dw_size; - desc->num_elements = num_elements; - desc->shader_userdata_offset = shader_userdata_rel_index * 4; - desc->slot_index_to_bind_directly = -1; + short shader_userdata_rel_index, + unsigned element_dw_size, + unsigned num_elements) +{ + desc->list = CALLOC(num_elements, element_dw_size * 4); + desc->element_dw_size = element_dw_size; + desc->num_elements = num_elements; + desc->shader_userdata_offset = shader_userdata_rel_index * 4; + desc->slot_index_to_bind_directly = -1; } static void si_release_descriptors(struct si_descriptors *desc) { - si_resource_reference(&desc->buffer, NULL); - FREE(desc->list); + si_resource_reference(&desc->buffer, NULL); + FREE(desc->list); } static bool si_upload_descriptors(struct si_context *sctx, - struct si_descriptors *desc) + struct si_descriptors *desc) { - unsigned slot_size = desc->element_dw_size * 4; - unsigned first_slot_offset = desc->first_active_slot * slot_size; - unsigned upload_size = desc->num_active_slots * slot_size; - - /* Skip the upload if no shader is using the descriptors. dirty_mask - * will stay dirty and the descriptors will be uploaded when there is - * a shader using them. - */ - if (!upload_size) - return true; - - /* If there is just one active descriptor, bind it directly. */ - if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && - desc->num_active_slots == 1) { - uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * - desc->element_dw_size]; - - /* The buffer is already in the buffer list. */ - si_resource_reference(&desc->buffer, NULL); - desc->gpu_list = NULL; - desc->gpu_address = si_desc_extract_buffer_address(descriptor); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; - } - - uint32_t *ptr; - unsigned buffer_offset; - u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, - si_optimal_tcc_alignment(sctx, upload_size), - &buffer_offset, (struct pipe_resource**)&desc->buffer, - (void**)&ptr); - if (!desc->buffer) { - desc->gpu_address = 0; - return false; /* skip the draw call */ - } - - util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, - upload_size); - desc->gpu_list = ptr - first_slot_offset / 4; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; + + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. + */ + if (!upload_size) + return true; + + /* If there is just one active descriptor, bind it directly. */ + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && + desc->num_active_slots == 1) { + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * + desc->element_dw_size]; + + /* The buffer is already in the buffer list. */ + si_resource_reference(&desc->buffer, NULL); + desc->gpu_list = NULL; + desc->gpu_address = si_desc_extract_buffer_address(descriptor); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; + } + + uint32_t *ptr; + unsigned buffer_offset; + u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + &buffer_offset, (struct pipe_resource**)&desc->buffer, + (void**)&ptr); + if (!desc->buffer) { + desc->gpu_address = 0; + return false; /* skip the draw call */ + } + + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); - /* The shader pointer should point to slot 0. */ - buffer_offset -= first_slot_offset; - desc->gpu_address = desc->buffer->gpu_address + buffer_offset; - - assert(desc->buffer->flags & RADEON_FLAG_32BIT); - assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); - assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); + /* The shader pointer should point to slot 0. */ + buffer_offset -= first_slot_offset; + desc->gpu_address = desc->buffer->gpu_address + buffer_offset; + + assert(desc->buffer->flags & RADEON_FLAG_32BIT); + assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); + assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; } static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) { - if (!desc->buffer) - return; + if (!desc->buffer) + return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } /* SAMPLER VIEWS */ @@ -210,429 +210,429 @@ static inline enum radeon_bo_priority si_get_sampler_view_priority(struct si_resource *res) { - if (res->b.b.target == PIPE_BUFFER) - return RADEON_PRIO_SAMPLER_BUFFER; + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; - if (res->b.b.nr_samples > 1) - return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; - return RADEON_PRIO_SAMPLER_TEXTURE; + return RADEON_PRIO_SAMPLER_TEXTURE; } static struct si_descriptors * si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader) { - return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; + return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; } static void si_release_sampler_views(struct si_samplers *samplers) { - int i; + int i; - for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { - pipe_sampler_view_reference(&samplers->views[i], NULL); - } + for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { + pipe_sampler_view_reference(&samplers->views[i], NULL); + } } static void si_sampler_view_add_buffer(struct si_context *sctx, - struct pipe_resource *resource, - enum radeon_bo_usage usage, - bool is_stencil_sampler, - bool check_mem) -{ - struct si_texture *tex = (struct si_texture*)resource; - enum radeon_bo_priority priority; - - if (!resource) - return; - - /* Use the flushed depth texture if direct sampling is unsupported. */ - if (resource->target != PIPE_BUFFER && - tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler)) - tex = tex->flushed_depth_texture; - - priority = si_get_sampler_view_priority(&tex->buffer); - radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, - check_mem); - - if (resource->target == PIPE_BUFFER) - return; - - /* Add separate DCC. */ - if (tex->dcc_separate_buffer) { - radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, - usage, RADEON_PRIO_SEPARATE_META, check_mem); - } + struct pipe_resource *resource, + enum radeon_bo_usage usage, + bool is_stencil_sampler, + bool check_mem) +{ + struct si_texture *tex = (struct si_texture*)resource; + enum radeon_bo_priority priority; + + if (!resource) + return; + + /* Use the flushed depth texture if direct sampling is unsupported. */ + if (resource->target != PIPE_BUFFER && + tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler)) + tex = tex->flushed_depth_texture; + + priority = si_get_sampler_view_priority(&tex->buffer); + radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, + check_mem); + + if (resource->target == PIPE_BUFFER) + return; + + /* Add separate DCC. */ + if (tex->dcc_separate_buffer) { + radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, + usage, RADEON_PRIO_SEPARATE_META, check_mem); + } } static void si_sampler_views_begin_new_cs(struct si_context *sctx, - struct si_samplers *samplers) + struct si_samplers *samplers) { - unsigned mask = samplers->enabled_mask; + unsigned mask = samplers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); - struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; - - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; + + si_sampler_view_add_buffer(sctx, sview->base.texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } } /* Set buffer descriptor fields that can be changed by reallocations. */ static void si_set_buf_desc_address(struct si_resource *buf, - uint64_t offset, uint32_t *state) + uint64_t offset, uint32_t *state) { - uint64_t va = buf->gpu_address + offset; + uint64_t va = buf->gpu_address + offset; - state[0] = va; - state[1] &= C_008F04_BASE_ADDRESS_HI; - state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); + state[0] = va; + state[1] &= C_008F04_BASE_ADDRESS_HI; + state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); } /* Set texture descriptor fields that can be changed by reallocations. * - * \param tex texture - * \param base_level_info information of the level of BASE_ADDRESS - * \param base_level the level of BASE_ADDRESS - * \param first_level pipe_sampler_view.u.tex.first_level - * \param block_width util_format_get_blockwidth() - * \param is_stencil select between separate Z & Stencil - * \param state descriptor to update + * \param tex texture + * \param base_level_info information of the level of BASE_ADDRESS + * \param base_level the level of BASE_ADDRESS + * \param first_level pipe_sampler_view.u.tex.first_level + * \param block_width util_format_get_blockwidth() + * \param is_stencil select between separate Z & Stencil + * \param state descriptor to update */ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, - struct si_texture *tex, - const struct legacy_surf_level *base_level_info, - unsigned base_level, unsigned first_level, - unsigned block_width, bool is_stencil, - uint32_t *state) -{ - uint64_t va, meta_va = 0; - - if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { - tex = tex->flushed_depth_texture; - is_stencil = false; - } - - va = tex->buffer.gpu_address; - - if (sscreen->info.chip_class >= GFX9) { - /* Only stencil_offset needs to be added here. */ - if (is_stencil) - va += tex->surface.u.gfx9.stencil_offset; - else - va += tex->surface.u.gfx9.surf_offset; - } else { - va += base_level_info->offset; - } - - state[0] = va >> 8; - state[1] &= C_008F14_BASE_ADDRESS_HI; - state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); - - /* Only macrotiled modes can set tile swizzle. - * GFX9 doesn't use (legacy) base_level_info. - */ - if (sscreen->info.chip_class >= GFX9 || - base_level_info->mode == RADEON_SURF_MODE_2D) - state[0] |= tex->surface.tile_swizzle; - - if (sscreen->info.chip_class >= GFX8) { - state[6] &= C_008F28_COMPRESSION_EN; - - if (vi_dcc_enabled(tex, first_level)) { - meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->dcc_offset; - - if (sscreen->info.chip_class == GFX8) { - meta_va += base_level_info->dcc_offset; - assert(base_level_info->mode == RADEON_SURF_MODE_2D); - } - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; - dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; - meta_va |= dcc_tile_swizzle; - } else if (vi_tc_compat_htile_enabled(tex, first_level, - is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { - meta_va = tex->buffer.gpu_address + tex->htile_offset; - } - - if (meta_va) - state[6] |= S_008F28_COMPRESSION_EN(1); - } - - if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) - state[7] = meta_va >> 8; - - if (sscreen->info.chip_class >= GFX10) { - state[3] &= C_00A00C_SW_MODE; - - if (is_stencil) { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - } else { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - } - - state[6] &= C_00A018_META_DATA_ADDRESS_LO & - C_00A018_META_PIPE_ALIGNED; - - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); - } - - state[7] = meta_va >> 16; - } else if (sscreen->info.chip_class == GFX9) { - state[3] &= C_008F1C_SW_MODE; - state[4] &= C_008F20_PITCH; - - if (is_stencil) { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); - } else { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); - } - - state[5] &= C_008F24_META_DATA_ADDRESS & - C_008F24_META_PIPE_ALIGNED & - C_008F24_META_RB_ALIGNED; - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | - S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_008F24_META_RB_ALIGNED(meta.rb_aligned); - } - } else { - /* GFX6-GFX8 */ - unsigned pitch = base_level_info->nblk_x * block_width; - unsigned index = si_tile_mode_index(tex, base_level, is_stencil); - - state[3] &= C_008F1C_TILING_INDEX; - state[3] |= S_008F1C_TILING_INDEX(index); - state[4] &= C_008F20_PITCH; - state[4] |= S_008F20_PITCH(pitch - 1); - } + struct si_texture *tex, + const struct legacy_surf_level *base_level_info, + unsigned base_level, unsigned first_level, + unsigned block_width, bool is_stencil, + uint32_t *state) +{ + uint64_t va, meta_va = 0; + + if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { + tex = tex->flushed_depth_texture; + is_stencil = false; + } + + va = tex->buffer.gpu_address; + + if (sscreen->info.chip_class >= GFX9) { + /* Only stencil_offset needs to be added here. */ + if (is_stencil) + va += tex->surface.u.gfx9.stencil_offset; + else + va += tex->surface.u.gfx9.surf_offset; + } else { + va += base_level_info->offset; + } + + state[0] = va >> 8; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + + /* Only macrotiled modes can set tile swizzle. + * GFX9 doesn't use (legacy) base_level_info. + */ + if (sscreen->info.chip_class >= GFX9 || + base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= tex->surface.tile_swizzle; + + if (sscreen->info.chip_class >= GFX8) { + state[6] &= C_008F28_COMPRESSION_EN; + + if (vi_dcc_enabled(tex, first_level)) { + meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + + tex->surface.dcc_offset; + + if (sscreen->info.chip_class == GFX8) { + meta_va += base_level_info->dcc_offset; + assert(base_level_info->mode == RADEON_SURF_MODE_2D); + } + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; + dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; + meta_va |= dcc_tile_swizzle; + } else if (vi_tc_compat_htile_enabled(tex, first_level, + is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { + meta_va = tex->buffer.gpu_address + tex->surface.htile_offset; + } + + if (meta_va) + state[6] |= S_008F28_COMPRESSION_EN(1); + } + + if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) + state[7] = meta_va >> 8; + + if (sscreen->info.chip_class >= GFX10) { + state[3] &= C_00A00C_SW_MODE; + + if (is_stencil) { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + } else { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + } + + state[6] &= C_00A018_META_DATA_ADDRESS_LO & + C_00A018_META_PIPE_ALIGNED; + + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); + } + + state[7] = meta_va >> 16; + } else if (sscreen->info.chip_class == GFX9) { + state[3] &= C_008F1C_SW_MODE; + state[4] &= C_008F20_PITCH; + + if (is_stencil) { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); + } else { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); + } + + state[5] &= C_008F24_META_DATA_ADDRESS & + C_008F24_META_PIPE_ALIGNED & + C_008F24_META_RB_ALIGNED; + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | + S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_008F24_META_RB_ALIGNED(meta.rb_aligned); + } + } else { + /* GFX6-GFX8 */ + unsigned pitch = base_level_info->nblk_x * block_width; + unsigned index = si_tile_mode_index(tex, base_level, is_stencil); + + state[3] &= C_008F1C_TILING_INDEX; + state[3] |= S_008F1C_TILING_INDEX(index); + state[4] &= C_008F20_PITCH; + state[4] |= S_008F20_PITCH(pitch - 1); + } } static void si_set_sampler_state_desc(struct si_sampler_state *sstate, - struct si_sampler_view *sview, - struct si_texture *tex, - uint32_t *desc) -{ - if (sview && sview->is_integer) - memcpy(desc, sstate->integer_val, 4*4); - else if (tex && tex->upgraded_depth && - (!sview || !sview->is_stencil_sampler)) - memcpy(desc, sstate->upgraded_depth_val, 4*4); - else - memcpy(desc, sstate->val, 4*4); + struct si_sampler_view *sview, + struct si_texture *tex, + uint32_t *desc) +{ + if (sview && sview->is_integer) + memcpy(desc, sstate->integer_val, 4*4); + else if (tex && tex->upgraded_depth && + (!sview || !sview->is_stencil_sampler)) + memcpy(desc, sstate->upgraded_depth_val, 4*4); + else + memcpy(desc, sstate->val, 4*4); } static void si_set_sampler_view_desc(struct si_context *sctx, - struct si_sampler_view *sview, - struct si_sampler_state *sstate, - uint32_t *desc) -{ - struct pipe_sampler_view *view = &sview->base; - struct si_texture *tex = (struct si_texture *)view->texture; - bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; - - if (unlikely(!is_buffer && sview->dcc_incompatible)) { - if (vi_dcc_enabled(tex, view->u.tex.first_level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - sview->dcc_incompatible = false; - } - - assert(tex); /* views with texture == NULL aren't supported */ - memcpy(desc, sview->state, 8*4); - - if (is_buffer) { - si_set_buf_desc_address(&tex->buffer, - sview->base.u.buf.offset, - desc + 4); - } else { - bool is_separate_stencil = tex->db_compatible && - sview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, tex, - sview->base_level_info, - sview->base_level, - sview->base.u.tex.first_level, - sview->block_width, - is_separate_stencil, - desc); - } - - if (!is_buffer && tex->surface.fmask_size) { - memcpy(desc + 8, sview->fmask_state, 8*4); - } else { - /* Disable FMASK and bind sampler state in [12:15]. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - - if (sstate) - si_set_sampler_state_desc(sstate, sview, - is_buffer ? NULL : tex, - desc + 12); - } + struct si_sampler_view *sview, + struct si_sampler_state *sstate, + uint32_t *desc) +{ + struct pipe_sampler_view *view = &sview->base; + struct si_texture *tex = (struct si_texture *)view->texture; + bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; + + if (unlikely(!is_buffer && sview->dcc_incompatible)) { + if (vi_dcc_enabled(tex, view->u.tex.first_level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + sview->dcc_incompatible = false; + } + + assert(tex); /* views with texture == NULL aren't supported */ + memcpy(desc, sview->state, 8*4); + + if (is_buffer) { + si_set_buf_desc_address(&tex->buffer, + sview->base.u.buf.offset, + desc + 4); + } else { + bool is_separate_stencil = tex->db_compatible && + sview->is_stencil_sampler; + + si_set_mutable_tex_desc_fields(sctx->screen, tex, + sview->base_level_info, + sview->base_level, + sview->base.u.tex.first_level, + sview->block_width, + is_separate_stencil, + desc); + } + + if (!is_buffer && tex->surface.fmask_size) { + memcpy(desc + 8, sview->fmask_state, 8*4); + } else { + /* Disable FMASK and bind sampler state in [12:15]. */ + memcpy(desc + 8, null_texture_descriptor, 4*4); + + if (sstate) + si_set_sampler_state_desc(sstate, sview, + is_buffer ? NULL : tex, + desc + 12); + } } static bool color_needs_decompression(struct si_texture *tex) { - return tex->surface.fmask_size || - (tex->dirty_level_mask && - (tex->cmask_buffer || tex->dcc_offset)); + return tex->surface.fmask_size || + (tex->dirty_level_mask && + (tex->cmask_buffer || tex->surface.dcc_offset)); } static bool depth_needs_decompression(struct si_texture *tex) { - /* If the depth/stencil texture is TC-compatible, no decompression - * will be done. The decompression function will only flush DB caches - * to make it coherent with shaders. That's necessary because the driver - * doesn't flush DB caches in any other case. - */ - return tex->db_compatible; + /* If the depth/stencil texture is TC-compatible, no decompression + * will be done. The decompression function will only flush DB caches + * to make it coherent with shaders. That's necessary because the driver + * doesn't flush DB caches in any other case. + */ + return tex->db_compatible; } static void si_set_sampler_view(struct si_context *sctx, - unsigned shader, - unsigned slot, struct pipe_sampler_view *view, - bool disallow_early_out) -{ - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_sampler_view *sview = (struct si_sampler_view*)view; - struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); - unsigned desc_slot = si_get_sampler_slot(slot); - uint32_t *desc = descs->list + desc_slot * 16; - - if (samplers->views[slot] == view && !disallow_early_out) - return; - - if (view) { - struct si_texture *tex = (struct si_texture *)view->texture; - - si_set_sampler_view_desc(sctx, sview, - samplers->sampler_states[slot], desc); - - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } else { - if (depth_needs_decompression(tex)) { - samplers->needs_depth_decompress_mask |= 1u << slot; - } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); - } - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - if (tex->dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - } - - pipe_sampler_view_reference(&samplers->views[slot], view); - samplers->enabled_mask |= 1u << slot; - - /* Since this can flush, it must be done after enabled_mask is - * updated. */ - si_sampler_view_add_buffer(sctx, view->texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, true); - } else { - pipe_sampler_view_reference(&samplers->views[slot], NULL); - memcpy(desc, null_texture_descriptor, 8*4); - /* Only clear the lower dwords of FMASK. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - /* Re-set the sampler state if we are transitioning from FMASK. */ - if (samplers->sampler_states[slot]) - si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, - desc + 12); - - samplers->enabled_mask &= ~(1u << slot); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } + unsigned shader, + unsigned slot, struct pipe_sampler_view *view, + bool disallow_early_out) +{ + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_sampler_view *sview = (struct si_sampler_view*)view; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t *desc = descs->list + desc_slot * 16; + + if (samplers->views[slot] == view && !disallow_early_out) + return; + + if (view) { + struct si_texture *tex = (struct si_texture *)view->texture; + + si_set_sampler_view_desc(sctx, sview, + samplers->sampler_states[slot], desc); + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (depth_needs_decompression(tex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + if (tex->surface.dcc_offset && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } + + pipe_sampler_view_reference(&samplers->views[slot], view); + samplers->enabled_mask |= 1u << slot; + + /* Since this can flush, it must be done after enabled_mask is + * updated. */ + si_sampler_view_add_buffer(sctx, view->texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, true); + } else { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + memcpy(desc, null_texture_descriptor, 8*4); + /* Only clear the lower dwords of FMASK. */ + memcpy(desc + 8, null_texture_descriptor, 4*4); + /* Re-set the sampler state if we are transitioning from FMASK. */ + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, + desc + 12); + + samplers->enabled_mask &= ~(1u << slot); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } static void si_update_shader_needs_decompress_mask(struct si_context *sctx, - unsigned shader) + unsigned shader) { - struct si_samplers *samplers = &sctx->samplers[shader]; - unsigned shader_bit = 1 << shader; + struct si_samplers *samplers = &sctx->samplers[shader]; + unsigned shader_bit = 1 << shader; - if (samplers->needs_depth_decompress_mask || - samplers->needs_color_decompress_mask || - sctx->images[shader].needs_color_decompress_mask) - sctx->shader_needs_decompress_mask |= shader_bit; - else - sctx->shader_needs_decompress_mask &= ~shader_bit; + if (samplers->needs_depth_decompress_mask || + samplers->needs_color_decompress_mask || + sctx->images[shader].needs_color_decompress_mask) + sctx->shader_needs_decompress_mask |= shader_bit; + else + sctx->shader_needs_decompress_mask &= ~shader_bit; } static void si_set_sampler_views(struct pipe_context *ctx, - enum pipe_shader_type shader, unsigned start, + enum pipe_shader_type shader, unsigned start, unsigned count, - struct pipe_sampler_view **views) + struct pipe_sampler_view **views) { - struct si_context *sctx = (struct si_context *)ctx; - int i; + struct si_context *sctx = (struct si_context *)ctx; + int i; - if (!count || shader >= SI_NUM_SHADERS) - return; + if (!count || shader >= SI_NUM_SHADERS) + return; - if (views) { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, views[i], false); - } else { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - } + if (views) { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, views[i], false); + } else { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, NULL, false); + } - si_update_shader_needs_decompress_mask(sctx, shader); + si_update_shader_needs_decompress_mask(sctx, shader); } static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) { - unsigned mask = samplers->enabled_mask; + unsigned mask = samplers->enabled_mask; - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = samplers->views[i]->texture; - - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << i; - } else { - samplers->needs_color_decompress_mask &= ~(1u << i); - } - } - } + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = samplers->views[i]->texture; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << i; + } else { + samplers->needs_color_decompress_mask &= ~(1u << i); + } + } + } } /* IMAGE VIEWS */ @@ -640,321 +640,311 @@ static void si_release_image_views(struct si_images *images) { - unsigned i; + unsigned i; - for (i = 0; i < SI_NUM_IMAGES; ++i) { - struct pipe_image_view *view = &images->views[i]; + for (i = 0; i < SI_NUM_IMAGES; ++i) { + struct pipe_image_view *view = &images->views[i]; - pipe_resource_reference(&view->resource, NULL); - } + pipe_resource_reference(&view->resource, NULL); + } } static void si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) { - uint mask = images->enabled_mask; + uint mask = images->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; - - assert(view->resource); - - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, false, false); - } + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; + + assert(view->resource); + + si_sampler_view_add_buffer(sctx, view->resource, + RADEON_USAGE_READWRITE, false, false); + } } static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) { - struct si_images *images = &ctx->images[shader]; + struct si_images *images = &ctx->images[shader]; - if (images->enabled_mask & (1u << slot)) { - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - unsigned desc_slot = si_get_image_slot(slot); - - pipe_resource_reference(&images->views[slot].resource, NULL); - images->needs_color_decompress_mask &= ~(1 << slot); - - memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); - images->enabled_mask &= ~(1u << slot); - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } + if (images->enabled_mask & (1u << slot)) { + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + unsigned desc_slot = si_get_image_slot(slot); + + pipe_resource_reference(&images->views[slot].resource, NULL); + images->needs_color_decompress_mask &= ~(1 << slot); + + memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); + images->enabled_mask &= ~(1u << slot); + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } } static void si_mark_image_range_valid(const struct pipe_image_view *view) { - struct si_resource *res = si_resource(view->resource); + struct si_resource *res = si_resource(view->resource); - if (res->b.b.target != PIPE_BUFFER) - return; + if (res->b.b.target != PIPE_BUFFER) + return; - util_range_add(&res->valid_buffer_range, - view->u.buf.offset, - view->u.buf.offset + view->u.buf.size); + util_range_add(&res->b.b, &res->valid_buffer_range, + view->u.buf.offset, + view->u.buf.offset + view->u.buf.size); } static void si_set_shader_image_desc(struct si_context *ctx, - const struct pipe_image_view *view, - bool skip_decompress, - uint32_t *desc, uint32_t *fmask_desc) -{ - struct si_screen *screen = ctx->screen; - struct si_resource *res; - - res = si_resource(view->resource); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_make_buffer_descriptor(screen, res, - view->format, - view->u.buf.offset, - view->u.buf.size, desc); - si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); - } else { - static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - unsigned width, height, depth, hw_level; - bool uses_dcc = vi_dcc_enabled(tex, level); - unsigned access = view->access; - - /* Clear the write flag when writes can't occur. - * Note that DCC_DECOMPRESS for MSAA doesn't work in some cases, - * so we don't wanna trigger it. - */ - if (tex->is_depth || - (!fmask_desc && tex->surface.fmask_size != 0)) { - assert(!"Z/S and MSAA image stores are not supported"); - access &= ~PIPE_IMAGE_ACCESS_WRITE; - } - - assert(!tex->is_depth); - assert(fmask_desc || tex->surface.fmask_size == 0); - - if (uses_dcc && !skip_decompress && - (view->access & PIPE_IMAGE_ACCESS_WRITE || - !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { - /* If DCC can't be disabled, at least decompress it. - * The decompression is relatively cheap if the surface - * has been decompressed already. - */ - if (!si_texture_disable_dcc(ctx, tex)) - si_decompress_dcc(ctx, tex); - } - - if (ctx->chip_class >= GFX9) { - /* Always set the base address. The swizzle modes don't - * allow setting mipmap level offsets as the base. - */ - width = res->b.b.width0; - height = res->b.b.height0; - depth = res->b.b.depth0; - hw_level = level; - } else { - /* Always force the base level to the selected level. - * - * This is required for 3D textures, where otherwise - * selecting a single slice for non-layered bindings - * fails. It doesn't hurt the other targets. - */ - width = u_minify(res->b.b.width0, level); - height = u_minify(res->b.b.height0, level); - depth = u_minify(res->b.b.depth0, level); - hw_level = 0; - } - - screen->make_texture_descriptor(screen, tex, - false, res->b.b.target, - view->format, swizzle, - hw_level, hw_level, - view->u.tex.first_layer, - view->u.tex.last_layer, - width, height, depth, - desc, fmask_desc); - si_set_mutable_tex_desc_fields(screen, tex, - &tex->surface.u.legacy.level[level], - level, level, - util_format_get_blockwidth(view->format), - false, desc); - } + const struct pipe_image_view *view, + bool skip_decompress, + uint32_t *desc, uint32_t *fmask_desc) +{ + struct si_screen *screen = ctx->screen; + struct si_resource *res; + + res = si_resource(view->resource); + + if (res->b.b.target == PIPE_BUFFER || + view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_make_buffer_descriptor(screen, res, + view->format, + view->u.buf.offset, + view->u.buf.size, desc); + si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); + } else { + static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + unsigned width, height, depth, hw_level; + bool uses_dcc = vi_dcc_enabled(tex, level); + unsigned access = view->access; + + assert(!tex->is_depth); + assert(fmask_desc || tex->surface.fmask_offset == 0); + + if (uses_dcc && !skip_decompress && + (access & PIPE_IMAGE_ACCESS_WRITE || + !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { + /* If DCC can't be disabled, at least decompress it. + * The decompression is relatively cheap if the surface + * has been decompressed already. + */ + if (!si_texture_disable_dcc(ctx, tex)) + si_decompress_dcc(ctx, tex); + } + + if (ctx->chip_class >= GFX9) { + /* Always set the base address. The swizzle modes don't + * allow setting mipmap level offsets as the base. + */ + width = res->b.b.width0; + height = res->b.b.height0; + depth = res->b.b.depth0; + hw_level = level; + } else { + /* Always force the base level to the selected level. + * + * This is required for 3D textures, where otherwise + * selecting a single slice for non-layered bindings + * fails. It doesn't hurt the other targets. + */ + width = u_minify(res->b.b.width0, level); + height = u_minify(res->b.b.height0, level); + depth = u_minify(res->b.b.depth0, level); + hw_level = 0; + } + + screen->make_texture_descriptor(screen, tex, + false, res->b.b.target, + view->format, swizzle, + hw_level, hw_level, + view->u.tex.first_layer, + view->u.tex.last_layer, + width, height, depth, + desc, fmask_desc); + si_set_mutable_tex_desc_fields(screen, tex, + &tex->surface.u.legacy.level[level], + level, level, + util_format_get_blockwidth(view->format), + false, desc); + } } static void si_set_shader_image(struct si_context *ctx, - unsigned shader, - unsigned slot, const struct pipe_image_view *view, - bool skip_decompress) -{ - struct si_images *images = &ctx->images[shader]; - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - struct si_resource *res; - unsigned desc_slot = si_get_image_slot(slot); - uint32_t *desc = descs->list + desc_slot * 8; - - if (!view || !view->resource) { - si_disable_shader_image(ctx, shader, slot); - return; - } - - res = si_resource(view->resource); - - if (&images->views[slot] != view) - util_copy_image_view(&images->views[slot], view); - - si_set_shader_image_desc(ctx, view, skip_decompress, desc, NULL); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - images->needs_color_decompress_mask &= ~(1 << slot); - res->bind_history |= PIPE_BIND_SHADER_IMAGE; - } else { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << slot; - } else { - images->needs_color_decompress_mask &= ~(1 << slot); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - ctx->need_check_render_feedback = true; - } - - images->enabled_mask |= 1u << slot; - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - - /* Since this can flush, it must be done after enabled_mask is updated. */ - si_sampler_view_add_buffer(ctx, &res->b.b, - (view->access & PIPE_IMAGE_ACCESS_WRITE) ? - RADEON_USAGE_READWRITE : RADEON_USAGE_READ, - false, true); + unsigned shader, + unsigned slot, const struct pipe_image_view *view, + bool skip_decompress) +{ + struct si_images *images = &ctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + struct si_resource *res; + + if (!view || !view->resource) { + si_disable_shader_image(ctx, shader, slot); + return; + } + + res = si_resource(view->resource); + + if (&images->views[slot] != view) + util_copy_image_view(&images->views[slot], view); + + si_set_shader_image_desc(ctx, view, skip_decompress, + descs->list + si_get_image_slot(slot) * 8, + descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); + + if (res->b.b.target == PIPE_BUFFER || + view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + images->needs_color_decompress_mask &= ~(1 << slot); + res->bind_history |= PIPE_BIND_SHADER_IMAGE; + } else { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << slot; + } else { + images->needs_color_decompress_mask &= ~(1 << slot); + } + + if (vi_dcc_enabled(tex, level) && + p_atomic_read(&tex->framebuffers_bound)) + ctx->need_check_render_feedback = true; + } + + images->enabled_mask |= 1u << slot; + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + /* Since this can flush, it must be done after enabled_mask is updated. */ + si_sampler_view_add_buffer(ctx, &res->b.b, + (view->access & PIPE_IMAGE_ACCESS_WRITE) ? + RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + false, true); } static void si_set_shader_images(struct pipe_context *pipe, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_image_view *views) + enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_image_view *views) { - struct si_context *ctx = (struct si_context *)pipe; - unsigned i, slot; + struct si_context *ctx = (struct si_context *)pipe; + unsigned i, slot; - assert(shader < SI_NUM_SHADERS); + assert(shader < SI_NUM_SHADERS); - if (!count) - return; + if (!count) + return; - assert(start_slot + count <= SI_NUM_IMAGES); + assert(start_slot + count <= SI_NUM_IMAGES); - if (views) { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, &views[i], false); - } else { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, NULL, false); - } + if (views) { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, &views[i], false); + } else { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, NULL, false); + } - si_update_shader_needs_decompress_mask(ctx, shader); + si_update_shader_needs_decompress_mask(ctx, shader); } static void si_images_update_needs_color_decompress_mask(struct si_images *images) { - unsigned mask = images->enabled_mask; + unsigned mask = images->enabled_mask; - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = images->views[i].resource; - - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << i; - } else { - images->needs_color_decompress_mask &= ~(1 << i); - } - } - } + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = images->views[i].resource; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << i; + } else { + images->needs_color_decompress_mask &= ~(1 << i); + } + } + } } void si_update_ps_colorbuf0_slot(struct si_context *sctx) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - unsigned slot = SI_PS_IMAGE_COLORBUF0; - struct pipe_surface *surf = NULL; - - /* si_texture_disable_dcc can get us here again. */ - if (sctx->blitter->running) - return; - - /* See whether FBFETCH is used and color buffer 0 is set. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.uses_fbfetch && - sctx->framebuffer.state.nr_cbufs && - sctx->framebuffer.state.cbufs[0]) - surf = sctx->framebuffer.state.cbufs[0]; - - /* Return if FBFETCH transitions from disabled to disabled. */ - if (!buffers->buffers[slot] && !surf) - return; - - sctx->ps_uses_fbfetch = surf != NULL; - si_update_ps_iter_samples(sctx); - - if (surf) { - struct si_texture *tex = (struct si_texture*)surf->texture; - struct pipe_image_view view; - - assert(tex); - assert(!tex->is_depth); - - /* Disable DCC, because the texture is used as both a sampler - * and color buffer. - */ - si_texture_disable_dcc(sctx, tex); - - if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { - /* Disable CMASK. */ - assert(tex->cmask_buffer != &tex->buffer); - si_eliminate_fast_color_clear(sctx, tex); - si_texture_discard_cmask(sctx->screen, tex); - } - - view.resource = surf->texture; - view.format = surf->format; - view.access = PIPE_IMAGE_ACCESS_READ; - view.u.tex.first_layer = surf->u.tex.first_layer; - view.u.tex.last_layer = surf->u.tex.last_layer; - view.u.tex.level = surf->u.tex.level; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - memset(desc, 0, 16 * 4); - si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); - - pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_RW_IMAGE); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. */ - memset(descs->list + slot*4, 0, 8*4); - pipe_resource_reference(&buffers->buffers[slot], NULL); - buffers->enabled_mask &= ~(1u << slot); - } + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + unsigned slot = SI_PS_IMAGE_COLORBUF0; + struct pipe_surface *surf = NULL; + + /* si_texture_disable_dcc can get us here again. */ + if (sctx->blitter->running) + return; + + /* See whether FBFETCH is used and color buffer 0 is set. */ + if (sctx->ps_shader.cso && + sctx->ps_shader.cso->info.uses_fbfetch && + sctx->framebuffer.state.nr_cbufs && + sctx->framebuffer.state.cbufs[0]) + surf = sctx->framebuffer.state.cbufs[0]; + + /* Return if FBFETCH transitions from disabled to disabled. */ + if (!buffers->buffers[slot] && !surf) + return; + + sctx->ps_uses_fbfetch = surf != NULL; + si_update_ps_iter_samples(sctx); + + if (surf) { + struct si_texture *tex = (struct si_texture*)surf->texture; + struct pipe_image_view view = {0}; + + assert(tex); + assert(!tex->is_depth); + + /* Disable DCC, because the texture is used as both a sampler + * and color buffer. + */ + si_texture_disable_dcc(sctx, tex); + + if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { + /* Disable CMASK. */ + assert(tex->cmask_buffer != &tex->buffer); + si_eliminate_fast_color_clear(sctx, tex, NULL); + si_texture_discard_cmask(sctx->screen, tex); + } + + view.resource = surf->texture; + view.format = surf->format; + view.access = PIPE_IMAGE_ACCESS_READ; + view.u.tex.first_layer = surf->u.tex.first_layer; + view.u.tex.last_layer = surf->u.tex.last_layer; + view.u.tex.level = surf->u.tex.level; + + /* Set the descriptor. */ + uint32_t *desc = descs->list + slot*4; + memset(desc, 0, 16 * 4); + si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); + + pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + &tex->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_IMAGE); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot*4, 0, 8*4); + pipe_resource_reference(&buffers->buffers[slot], NULL); + buffers->enabled_mask &= ~(1u << slot); + } - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* SAMPLER STATES */ @@ -963,251 +953,255 @@ enum pipe_shader_type shader, unsigned start, unsigned count, void **states) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); - struct si_sampler_state **sstates = (struct si_sampler_state**)states; - int i; - - if (!count || shader >= SI_NUM_SHADERS || !sstates) - return; - - for (i = 0; i < count; i++) { - unsigned slot = start + i; - unsigned desc_slot = si_get_sampler_slot(slot); - - if (!sstates[i] || - sstates[i] == samplers->sampler_states[slot]) - continue; + struct si_context *sctx = (struct si_context *)ctx; + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); + struct si_sampler_state **sstates = (struct si_sampler_state**)states; + int i; + + if (!count || shader >= SI_NUM_SHADERS || !sstates) + return; + + for (i = 0; i < count; i++) { + unsigned slot = start + i; + unsigned desc_slot = si_get_sampler_slot(slot); + + if (!sstates[i] || + sstates[i] == samplers->sampler_states[slot]) + continue; #ifndef NDEBUG - assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); + assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); #endif - samplers->sampler_states[slot] = sstates[i]; + samplers->sampler_states[slot] = sstates[i]; - /* If FMASK is bound, don't overwrite it. - * The sampler state will be set after FMASK is unbound. - */ - struct si_sampler_view *sview = - (struct si_sampler_view *)samplers->views[slot]; + /* If FMASK is bound, don't overwrite it. + * The sampler state will be set after FMASK is unbound. + */ + struct si_sampler_view *sview = + (struct si_sampler_view *)samplers->views[slot]; - struct si_texture *tex = NULL; + struct si_texture *tex = NULL; - if (sview && sview->base.texture && - sview->base.texture->target != PIPE_BUFFER) - tex = (struct si_texture *)sview->base.texture; + if (sview && sview->base.texture && + sview->base.texture->target != PIPE_BUFFER) + tex = (struct si_texture *)sview->base.texture; - if (tex && tex->surface.fmask_size) - continue; + if (tex && tex->surface.fmask_size) + continue; - si_set_sampler_state_desc(sstates[i], sview, tex, - desc->list + desc_slot * 16 + 12); + si_set_sampler_state_desc(sstates[i], sview, tex, + desc->list + desc_slot * 16 + 12); - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } } /* BUFFER RESOURCES */ static void si_init_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned num_buffers, - short shader_userdata_rel_index, - enum radeon_bo_priority priority, - enum radeon_bo_priority priority_constbuf) -{ - buffers->priority = priority; - buffers->priority_constbuf = priority_constbuf; - buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); + struct si_descriptors *descs, + unsigned num_buffers, + short shader_userdata_rel_index, + enum radeon_bo_priority priority, + enum radeon_bo_priority priority_constbuf) +{ + buffers->priority = priority; + buffers->priority_constbuf = priority_constbuf; + buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); + buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); - si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); + si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs) + struct si_descriptors *descs) { - int i; + int i; - for (i = 0; i < descs->num_elements; i++) { - pipe_resource_reference(&buffers->buffers[i], NULL); - } + for (i = 0; i < descs->num_elements; i++) { + pipe_resource_reference(&buffers->buffers[i], NULL); + } - FREE(buffers->buffers); - FREE(buffers->offsets); + FREE(buffers->buffers); + FREE(buffers->offsets); } static void si_buffer_resources_begin_new_cs(struct si_context *sctx, - struct si_buffer_resources *buffers) + struct si_buffer_resources *buffers) { - unsigned mask = buffers->enabled_mask; + unsigned mask = buffers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffers->buffers[i]), - buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - i < SI_NUM_SHADER_BUFFERS ? buffers->priority : - buffers->priority_constbuf); - } + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(buffers->buffers[i]), + buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : + RADEON_USAGE_READ, + i < SI_NUM_SHADER_BUFFERS ? buffers->priority : + buffers->priority_constbuf); + } } static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned idx, struct pipe_resource **buf, - unsigned *offset, unsigned *size) -{ - pipe_resource_reference(buf, buffers->buffers[idx]); - if (*buf) { - struct si_resource *res = si_resource(*buf); - const uint32_t *desc = descs->list + idx * 4; - uint64_t va; - - *size = desc[2]; - - assert(G_008F04_STRIDE(desc[1]) == 0); - va = si_desc_extract_buffer_address(desc); - - assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); - *offset = va - res->gpu_address; - } + struct si_descriptors *descs, + unsigned idx, struct pipe_resource **buf, + unsigned *offset, unsigned *size) +{ + pipe_resource_reference(buf, buffers->buffers[idx]); + if (*buf) { + struct si_resource *res = si_resource(*buf); + const uint32_t *desc = descs->list + idx * 4; + uint64_t va; + + *size = desc[2]; + + assert(G_008F04_STRIDE(desc[1]) == 0); + va = si_desc_extract_buffer_address(desc); + + assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); + *offset = va - res->gpu_address; + } } /* VERTEX BUFFERS */ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) { - int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0; - int i; + int count = sctx->num_vertex_elements; + int i; - for (i = 0; i < count; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; + for (i = 0; i < count; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(sctx->vertex_buffer[vb].buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } - - if (!sctx->vb_descriptors_buffer) - return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(sctx->vertex_buffer[vb].buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + + if (!sctx->vb_descriptors_buffer) + return; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned i, count; - unsigned desc_list_byte_size; - unsigned first_vb_use_mask; - uint32_t *ptr; - - if (!sctx->vertex_buffers_dirty || !velems) - return true; - - count = velems->count; - - if (!count) - return true; - - desc_list_byte_size = velems->desc_list_byte_size; - first_vb_use_mask = velems->first_vb_use_mask; - - /* Vertex buffer descriptors are the only ones which are uploaded - * directly through a staging buffer and don't go through - * the fine-grained upload path. - */ - u_upload_alloc(sctx->b.const_uploader, 0, - desc_list_byte_size, - si_optimal_tcc_alignment(sctx, desc_list_byte_size), - &sctx->vb_descriptors_offset, - (struct pipe_resource**)&sctx->vb_descriptors_buffer, - (void**)&ptr); - if (!sctx->vb_descriptors_buffer) { - sctx->vb_descriptors_offset = 0; - sctx->vb_descriptors_gpu_list = NULL; - return false; - } - - sctx->vb_descriptors_gpu_list = ptr; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - assert(count <= SI_MAX_ATTRIBS); - - for (i = 0; i < count; i++) { - struct pipe_vertex_buffer *vb; - struct si_resource *buf; - unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = &ptr[i*4]; - - vb = &sctx->vertex_buffer[vbo_index]; - buf = si_resource(vb->buffer.resource); - if (!buf) { - memset(desc, 0, 16); - continue; - } - - int64_t offset = (int64_t)((int)vb->buffer_offset) + - velems->src_offset[i]; - - if (offset >= buf->b.b.width0) { - assert(offset < buf->b.b.width0); - memset(desc, 0, 16); - continue; - } - - uint64_t va = buf->gpu_address + offset; - - int64_t num_records = (int64_t)buf->b.b.width0 - offset; - if (sctx->chip_class != GFX8 && vb->stride) { - /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[i]) / - vb->stride + 1; - } - assert(num_records >= 0 && num_records <= UINT_MAX); - - uint32_t rsrc_word3 = velems->rsrc_word3[i]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (sctx->chip_class >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? 1 : 3); - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(vb->stride); - desc[2] = num_records; - desc[3] = rsrc_word3; - - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(vb->buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } - } - - /* Don't flush the const cache. It would have a very negative effect - * on performance (confirmed by testing). New descriptors are always - * uploaded to a fresh new buffer, so I don't think flushing the const - * cache is needed. */ - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->vertex_buffers_dirty = false; - sctx->vertex_buffer_pointer_dirty = true; - sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - return true; + unsigned i, count = sctx->num_vertex_elements; + uint32_t *ptr; + + if (!sctx->vertex_buffers_dirty || !count) + return true; + + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned alloc_size = velems->vb_desc_list_alloc_size; + + if (alloc_size) { + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. + */ + u_upload_alloc(sctx->b.const_uploader, 0, + alloc_size, + si_optimal_tcc_alignment(sctx, alloc_size), + &sctx->vb_descriptors_offset, + (struct pipe_resource**)&sctx->vb_descriptors_buffer, + (void**)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; + return false; + } + + sctx->vb_descriptors_gpu_list = ptr; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + } else { + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vertex_buffer_pointer_dirty = false; + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; + } + + assert(count <= SI_MAX_ATTRIBS); + + unsigned first_vb_use_mask = velems->first_vb_use_mask; + unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + + for (i = 0; i < count; i++) { + struct pipe_vertex_buffer *vb; + struct si_resource *buf; + unsigned vbo_index = velems->vertex_buffer_index[i]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? + &sctx->vb_descriptor_user_sgprs[i * 4] : + &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + vb = &sctx->vertex_buffer[vbo_index]; + buf = si_resource(vb->buffer.resource); + if (!buf) { + memset(desc, 0, 16); + continue; + } + + int64_t offset = (int64_t)((int)vb->buffer_offset) + + velems->src_offset[i]; + + if (offset >= buf->b.b.width0) { + assert(offset < buf->b.b.width0); + memset(desc, 0, 16); + continue; + } + + uint64_t va = buf->gpu_address + offset; + + int64_t num_records = (int64_t)buf->b.b.width0 - offset; + if (sctx->chip_class != GFX8 && vb->stride) { + /* Round up by rounding down and adding 1 */ + num_records = (num_records - velems->format_size[i]) / + vb->stride + 1; + } + assert(num_records >= 0 && num_records <= UINT_MAX); + + uint32_t rsrc_word3 = velems->rsrc_word3[i]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + if (sctx->chip_class >= GFX10) + rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(vb->stride); + desc[2] = num_records; + desc[3] = rsrc_word3; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + /* Don't flush the const cache. It would have a very negative effect + * on performance (confirmed by testing). New descriptors are always + * uploaded to a fresh new buffer, so I don't think flushing the const + * cache is needed. */ + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + sctx->vertex_buffers_dirty = false; + return true; } @@ -1216,366 +1210,366 @@ static struct si_descriptors * si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader) { - return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; + return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; } void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, - const uint8_t *ptr, unsigned size, uint32_t *const_offset) + const uint8_t *ptr, unsigned size, uint32_t *const_offset) { - void *tmp; + void *tmp; - u_upload_alloc(sctx->b.const_uploader, 0, size, - si_optimal_tcc_alignment(sctx, size), - const_offset, - (struct pipe_resource**)buf, &tmp); - if (*buf) - util_memcpy_cpu_to_le32(tmp, ptr, size); + u_upload_alloc(sctx->b.const_uploader, 0, size, + si_optimal_tcc_alignment(sctx, size), + const_offset, + (struct pipe_resource**)buf, &tmp); + if (*buf) + util_memcpy_cpu_to_le32(tmp, ptr, size); } static void si_set_constant_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_constant_buffer *input) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy - * with a NULL buffer). We need to use a dummy buffer instead. */ - if (sctx->chip_class == GFX7 && - (!input || (!input->buffer && !input->user_buffer))) - input = &sctx->null_const_buf; - - if (input && (input->buffer || input->user_buffer)) { - struct pipe_resource *buffer = NULL; - uint64_t va; - unsigned buffer_offset; - - /* Upload the user buffer if needed. */ - if (input->user_buffer) { - si_upload_const_buffer(sctx, - (struct si_resource**)&buffer, input->user_buffer, - input->buffer_size, &buffer_offset); - if (!buffer) { - /* Just unbind on failure. */ - si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); - return; - } - } else { - pipe_resource_reference(&buffer, input->buffer); - buffer_offset = input->buffer_offset; - } - - va = si_resource(buffer)->gpu_address + buffer_offset; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = input->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - buffers->buffers[slot] = buffer; - buffers->offsets[slot] = buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_READ, - buffers->priority_constbuf, true); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. */ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } + struct si_buffer_resources *buffers, + unsigned descriptors_idx, + uint slot, const struct pipe_constant_buffer *input) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy + * with a NULL buffer). We need to use a dummy buffer instead. */ + if (sctx->chip_class == GFX7 && + (!input || (!input->buffer && !input->user_buffer))) + input = &sctx->null_const_buf; + + if (input && (input->buffer || input->user_buffer)) { + struct pipe_resource *buffer = NULL; + uint64_t va; + unsigned buffer_offset; + + /* Upload the user buffer if needed. */ + if (input->user_buffer) { + si_upload_const_buffer(sctx, + (struct si_resource**)&buffer, input->user_buffer, + input->buffer_size, &buffer_offset); + if (!buffer) { + /* Just unbind on failure. */ + si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); + return; + } + } else { + pipe_resource_reference(&buffer, input->buffer); + buffer_offset = input->buffer_offset; + } + + va = si_resource(buffer)->gpu_address + buffer_offset; + + /* Set the descriptor. */ + uint32_t *desc = descs->list + slot*4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(0); + desc[2] = input->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + buffers->buffers[slot] = buffer; + buffers->offsets[slot] = buffer_offset; + radeon_add_to_gfx_buffer_list_check_mem(sctx, + si_resource(buffer), + RADEON_USAGE_READ, + buffers->priority_constbuf, true); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } - sctx->descriptors_dirty |= 1u << descriptors_idx; + sctx->descriptors_dirty |= 1u << descriptors_idx; } static void si_pipe_set_constant_buffer(struct pipe_context *ctx, - enum pipe_shader_type shader, uint slot, - const struct pipe_constant_buffer *input) + enum pipe_shader_type shader, uint slot, + const struct pipe_constant_buffer *input) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (shader >= SI_NUM_SHADERS) - return; + if (shader >= SI_NUM_SHADERS) + return; - if (slot == 0 && input && input->buffer && - !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { - assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); - return; - } - - if (input && input->buffer) - si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; - - slot = si_get_constbuf_slot(slot); - si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - slot, input); + if (slot == 0 && input && input->buffer && + !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { + assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); + return; + } + + if (input && input->buffer) + si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; + + slot = si_get_constbuf_slot(slot); + si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + slot, input); } void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, - uint slot, struct pipe_constant_buffer *cbuf) + uint slot, struct pipe_constant_buffer *cbuf) { - cbuf->user_buffer = NULL; - si_get_buffer_from_descriptors( - &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors(sctx, shader), - si_get_constbuf_slot(slot), - &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); + cbuf->user_buffer = NULL; + si_get_buffer_from_descriptors( + &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors(sctx, shader), + si_get_constbuf_slot(slot), + &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); } /* SHADER BUFFERS */ static void si_set_shader_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_shader_buffer *sbuffer, - bool writable, enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - uint32_t *desc = descs->list + slot * 4; - - if (!sbuffer || !sbuffer->buffer) { - pipe_resource_reference(&buffers->buffers[slot], NULL); - memset(desc, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - buffers->writable_mask &= ~(1u << slot); - sctx->descriptors_dirty |= 1u << descriptors_idx; - return; - } - - struct si_resource *buf = si_resource(sbuffer->buffer); - uint64_t va = buf->gpu_address + sbuffer->buffer_offset; - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = sbuffer->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); - buffers->offsets[slot] = sbuffer->buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, buf, - writable ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - if (writable) - buffers->writable_mask |= 1u << slot; - else - buffers->writable_mask &= ~(1u << slot); + struct si_buffer_resources *buffers, + unsigned descriptors_idx, + uint slot, const struct pipe_shader_buffer *sbuffer, + bool writable, enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + uint32_t *desc = descs->list + slot * 4; + + if (!sbuffer || !sbuffer->buffer) { + pipe_resource_reference(&buffers->buffers[slot], NULL); + memset(desc, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + buffers->writable_mask &= ~(1u << slot); + sctx->descriptors_dirty |= 1u << descriptors_idx; + return; + } + + struct si_resource *buf = si_resource(sbuffer->buffer); + uint64_t va = buf->gpu_address + sbuffer->buffer_offset; + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(0); + desc[2] = sbuffer->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); + buffers->offsets[slot] = sbuffer->buffer_offset; + radeon_add_to_gfx_buffer_list_check_mem(sctx, buf, + writable ? RADEON_USAGE_READWRITE : + RADEON_USAGE_READ, + priority, true); + if (writable) + buffers->writable_mask |= 1u << slot; + else + buffers->writable_mask &= ~(1u << slot); - buffers->enabled_mask |= 1u << slot; - sctx->descriptors_dirty |= 1u << descriptors_idx; + buffers->enabled_mask |= 1u << slot; + sctx->descriptors_dirty |= 1u << descriptors_idx; - util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset, - sbuffer->buffer_offset + sbuffer->buffer_size); + util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset, + sbuffer->buffer_offset + sbuffer->buffer_size); } static void si_set_shader_buffers(struct pipe_context *ctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_shader_buffer *sbuffers, - unsigned writable_bitmask) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); - unsigned i; - - assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); - - for (i = 0; i < count; ++i) { - const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; - unsigned slot = si_get_shaderbuf_slot(start_slot + i); - - if (sbuffer && sbuffer->buffer) - si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; - - si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, - !!(writable_bitmask & (1u << i)), - buffers->priority); - } + enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); + unsigned i; + + assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); + + for (i = 0; i < count; ++i) { + const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; + unsigned slot = si_get_shaderbuf_slot(start_slot + i); + + if (sbuffer && sbuffer->buffer) + si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; + + si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, + !!(writable_bitmask & (1u << i)), + buffers->priority); + } } void si_get_shader_buffers(struct si_context *sctx, - enum pipe_shader_type shader, - uint start_slot, uint count, - struct pipe_shader_buffer *sbuf) -{ - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); - - for (unsigned i = 0; i < count; ++i) { - si_get_buffer_from_descriptors( - buffers, descs, - si_get_shaderbuf_slot(start_slot + i), - &sbuf[i].buffer, &sbuf[i].buffer_offset, - &sbuf[i].buffer_size); - } + enum pipe_shader_type shader, + uint start_slot, uint count, + struct pipe_shader_buffer *sbuf) +{ + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); + + for (unsigned i = 0; i < count; ++i) { + si_get_buffer_from_descriptors( + buffers, descs, + si_get_shaderbuf_slot(start_slot + i), + &sbuf[i].buffer, &sbuf[i].buffer_offset, + &sbuf[i].buffer_size); + } } /* RING BUFFERS */ void si_set_rw_buffer(struct si_context *sctx, - uint slot, const struct pipe_constant_buffer *input) + uint slot, const struct pipe_constant_buffer *input) { - si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, input); + si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, + slot, input); } void si_set_rw_shader_buffer(struct si_context *sctx, uint slot, - const struct pipe_shader_buffer *sbuffer) + const struct pipe_shader_buffer *sbuffer) { - si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER); + si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, + slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER); } void si_set_ring_buffer(struct si_context *sctx, uint slot, - struct pipe_resource *buffer, - unsigned stride, unsigned num_records, - bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride, uint64_t offset) -{ - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - - /* The stride field in the resource descriptor has 14 bits */ - assert(stride < (1 << 14)); - - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - if (buffer) { - uint64_t va; - - va = si_resource(buffer)->gpu_address + offset; - - switch (element_size) { - default: - assert(!"Unsupported ring buffer element size"); - case 0: - case 2: - element_size = 0; - break; - case 4: - element_size = 1; - break; - case 8: - element_size = 2; - break; - case 16: - element_size = 3; - break; - } - - switch (index_stride) { - default: - assert(!"Unsupported ring buffer index stride"); - case 0: - case 8: - index_stride = 0; - break; - case 16: - index_stride = 1; - break; - case 32: - index_stride = 2; - break; - case 64: - index_stride = 3; - break; - } - - if (sctx->chip_class >= GFX8 && stride) - num_records *= stride; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(swizzle); - desc[2] = num_records; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(index_stride) | - S_008F0C_ADD_TID_ENABLE(add_tid); - - if (sctx->chip_class >= GFX9) - assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ - else - desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], buffer); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffer), - RADEON_USAGE_READWRITE, buffers->priority); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. */ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } + struct pipe_resource *buffer, + unsigned stride, unsigned num_records, + bool add_tid, bool swizzle, + unsigned element_size, unsigned index_stride, uint64_t offset) +{ + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + + /* The stride field in the resource descriptor has 14 bits */ + assert(stride < (1 << 14)); + + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + if (buffer) { + uint64_t va; + + va = si_resource(buffer)->gpu_address + offset; + + switch (element_size) { + default: + assert(!"Unsupported ring buffer element size"); + case 0: + case 2: + element_size = 0; + break; + case 4: + element_size = 1; + break; + case 8: + element_size = 2; + break; + case 16: + element_size = 3; + break; + } + + switch (index_stride) { + default: + assert(!"Unsupported ring buffer index stride"); + case 0: + case 8: + index_stride = 0; + break; + case 16: + index_stride = 1; + break; + case 32: + index_stride = 2; + break; + case 64: + index_stride = 3; + break; + } + + if (sctx->chip_class >= GFX8 && stride) + num_records *= stride; + + /* Set the descriptor. */ + uint32_t *desc = descs->list + slot*4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(swizzle); + desc[2] = num_records; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(index_stride) | + S_008F0C_ADD_TID_ENABLE(add_tid); + + if (sctx->chip_class >= GFX9) + assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ + else + desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], buffer); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(buffer), + RADEON_USAGE_READWRITE, buffers->priority); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* INTERNAL CONST BUFFERS */ static void si_set_polygon_stipple(struct pipe_context *ctx, - const struct pipe_poly_stipple *state) + const struct pipe_poly_stipple *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb = {}; - unsigned stipple[32]; - int i; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb = {}; + unsigned stipple[32]; + int i; - for (i = 0; i < 32; i++) - stipple[i] = util_bitreverse(state->stipple[i]); + for (i = 0; i < 32; i++) + stipple[i] = util_bitreverse(state->stipple[i]); - cb.user_buffer = stipple; - cb.buffer_size = sizeof(stipple); + cb.user_buffer = stipple; + cb.buffer_size = sizeof(stipple); - si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); + si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); } /* TEXTURE METADATA ENABLE/DISABLE */ @@ -1583,41 +1577,41 @@ static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx) { - util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); - util_dynarray_clear(&sctx->resident_img_needs_color_decompress); + util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); + util_dynarray_clear(&sctx->resident_img_needs_color_decompress); - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_resource *res = (*tex_handle)->view->texture; - struct si_texture *tex; - - if (!res || res->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; - - util_dynarray_append(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, *tex_handle); - } - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - struct pipe_resource *res = view->resource; - struct si_texture *tex; - - if (!res || res->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; - - util_dynarray_append(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, *img_handle); - } + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct pipe_resource *res = (*tex_handle)->view->texture; + struct si_texture *tex; + + if (!res || res->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; + + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, *tex_handle); + } + + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct pipe_resource *res = view->resource; + struct si_texture *tex; + + if (!res || res->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; + + util_dynarray_append(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, *img_handle); + } } /* CMASK can be enabled (for fast clear) and disabled (for texture export) @@ -1626,13 +1620,13 @@ */ void si_update_needs_color_decompress_masks(struct si_context *sctx) { - for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); - si_images_update_needs_color_decompress_mask(&sctx->images[i]); - si_update_shader_needs_decompress_mask(sctx, i); - } + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); + si_images_update_needs_color_decompress_mask(&sctx->images[i]); + si_update_shader_needs_decompress_mask(sctx, i); + } - si_resident_handles_update_needs_color_decompress(sctx); + si_resident_handles_update_needs_color_decompress(sctx); } /* BUFFER DISCARD/INVALIDATION */ @@ -1641,32 +1635,32 @@ * If buf == NULL, reset all descriptors. */ static void si_reset_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - unsigned slot_mask, - struct pipe_resource *buf, - enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - unsigned mask = buffers->enabled_mask & slot_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = buffers->buffers[i]; - - if (buffer && (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << descriptors_idx; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - buffers->writable_mask & (1u << i) ? - RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - } - } + struct si_buffer_resources *buffers, + unsigned descriptors_idx, + unsigned slot_mask, + struct pipe_resource *buf, + enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + unsigned mask = buffers->enabled_mask & slot_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = buffers->buffers[i]; + + if (buffer && (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], + descs->list + i*4); + sctx->descriptors_dirty |= 1u << descriptors_idx; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, + si_resource(buffer), + buffers->writable_mask & (1u << i) ? + RADEON_USAGE_READWRITE : + RADEON_USAGE_READ, + priority, true); + } + } } /* Update all buffer bindings where the buffer is bound, including @@ -1677,431 +1671,436 @@ */ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) { - struct si_resource *buffer = si_resource(buf); - unsigned i, shader; - unsigned num_elems = sctx->vertex_elements ? - sctx->vertex_elements->count : 0; - - /* We changed the buffer, now we need to bind it where the old one - * was bound. This consists of 2 things: - * 1) Updating the resource descriptor and dirtying it. - * 2) Adding a relocation to the CS, so that it's usable. - */ - - /* Vertex buffers. */ - if (!buffer) { - if (num_elems) - sctx->vertex_buffers_dirty = true; - } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { - for (i = 0; i < num_elems; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; - - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; - - if (sctx->vertex_buffer[vb].buffer.resource == buf) { - sctx->vertex_buffers_dirty = true; - break; - } - } - } - - /* Streamout buffers. (other internal buffers can't be invalidated) */ - if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { - for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = - &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - struct pipe_resource *buffer = buffers->buffers[i]; - - if (!buffer || (buf && buffer != buf)) - continue; - - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_WRITE, - RADEON_PRIO_SHADER_RW_BUFFER, - true); - - /* Update the streamout state. */ - if (sctx->streamout.begin_emitted) - si_emit_streamout_end(sctx); - sctx->streamout.append_bitmask = - sctx->streamout.enabled_mask; - si_streamout_buffers_dirty(sctx); - } - } - - /* Constant and shader buffers. */ - if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority_constbuf); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { - /* Texture buffers - update bindings. */ - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = samplers->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = samplers->views[i]->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_sampler_slot(i); - - si_set_buf_desc_address(si_resource(buffer), - samplers->views[i]->u.buf.offset, - descs->list + desc_slot * 16 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Shader images */ - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { - for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { - struct si_images *images = &sctx->images[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = images->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = images->views[i].resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_image_slot(i); - - if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(&images->views[i]); - - si_set_buf_desc_address(si_resource(buffer), - images->views[i].u.buf.offset, - descs->list + desc_slot * 8 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Bindless texture handles */ - if (!buffer || buffer->texture_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - unsigned desc_slot = (*tex_handle)->desc_slot; - struct pipe_resource *buffer = view->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*tex_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - /* Bindless image handles */ - if (!buffer || buffer->image_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - unsigned desc_slot = (*img_handle)->desc_slot; - struct pipe_resource *buffer = view->resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*img_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - if (buffer) { - /* Do the same for other contexts. They will invoke this function - * with buffer == NULL. - */ - unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); - - /* Skip the update for the current context, because we have already updated - * the buffer bindings. - */ - if (new_counter == sctx->last_dirty_buf_counter + 1) - sctx->last_dirty_buf_counter = new_counter; - } + struct si_resource *buffer = si_resource(buf); + unsigned i, shader; + unsigned num_elems = sctx->num_vertex_elements; + + /* We changed the buffer, now we need to bind it where the old one + * was bound. This consists of 2 things: + * 1) Updating the resource descriptor and dirtying it. + * 2) Adding a relocation to the CS, so that it's usable. + */ + + /* Vertex buffers. */ + if (!buffer) { + if (num_elems) + sctx->vertex_buffers_dirty = true; + } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { + for (i = 0; i < num_elems; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; + + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; + + if (sctx->vertex_buffer[vb].buffer.resource == buf) { + sctx->vertex_buffers_dirty = true; + break; + } + } + } + + /* Streamout buffers. (other internal buffers can't be invalidated) */ + if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { + for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = + &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + struct pipe_resource *buffer = buffers->buffers[i]; + + if (!buffer || (buf && buffer != buf)) + continue; + + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], + descs->list + i*4); + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, + si_resource(buffer), + RADEON_USAGE_WRITE, + RADEON_PRIO_SHADER_RW_BUFFER, + true); + + /* Update the streamout state. */ + if (sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + sctx->streamout.append_bitmask = + sctx->streamout.enabled_mask; + si_streamout_buffers_dirty(sctx); + } + } + + /* Constant and shader buffers. */ + if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), + buf, + sctx->const_and_shader_buffers[shader].priority_constbuf); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), + buf, + sctx->const_and_shader_buffers[shader].priority); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { + /* Texture buffers - update bindings. */ + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *descs = + si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = samplers->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = samplers->views[i]->texture; + + if (buffer && buffer->target == PIPE_BUFFER && + (!buf || buffer == buf)) { + unsigned desc_slot = si_get_sampler_slot(i); + + si_set_buf_desc_address(si_resource(buffer), + samplers->views[i]->u.buf.offset, + descs->list + desc_slot * 16 + 4); + sctx->descriptors_dirty |= + 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Shader images */ + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { + for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { + struct si_images *images = &sctx->images[shader]; + struct si_descriptors *descs = + si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = images->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = images->views[i].resource; + + if (buffer && buffer->target == PIPE_BUFFER && + (!buf || buffer == buf)) { + unsigned desc_slot = si_get_image_slot(i); + + if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(&images->views[i]); + + si_set_buf_desc_address(si_resource(buffer), + images->views[i].u.buf.offset, + descs->list + desc_slot * 8 + 4); + sctx->descriptors_dirty |= + 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Bindless texture handles */ + if (!buffer || buffer->texture_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + unsigned desc_slot = (*tex_handle)->desc_slot; + struct pipe_resource *buffer = view->texture; + + if (buffer && buffer->target == PIPE_BUFFER && + (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), + view->u.buf.offset, + descs->list + + desc_slot * 16 + 4); + + (*tex_handle)->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + /* Bindless image handles */ + if (!buffer || buffer->image_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + unsigned desc_slot = (*img_handle)->desc_slot; + struct pipe_resource *buffer = view->resource; + + if (buffer && buffer->target == PIPE_BUFFER && + (!buf || buffer == buf)) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_set_buf_desc_address(si_resource(buffer), + view->u.buf.offset, + descs->list + + desc_slot * 16 + 4); + + (*img_handle)->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + if (buffer) { + /* Do the same for other contexts. They will invoke this function + * with buffer == NULL. + */ + unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); + + /* Skip the update for the current context, because we have already updated + * the buffer bindings. + */ + if (new_counter == sctx->last_dirty_buf_counter + 1) + sctx->last_dirty_buf_counter = new_counter; + } } static void si_upload_bindless_descriptor(struct si_context *sctx, - unsigned desc_slot, - unsigned num_dwords) + unsigned desc_slot, + unsigned num_dwords) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *data; - uint64_t va; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *data; + uint64_t va; - data = desc->list + desc_slot_offset; - va = desc->gpu_address + desc_slot_offset * 4; + data = desc->list + desc_slot_offset; + va = desc->gpu_address + desc_slot_offset * 4; - si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, - num_dwords * 4, V_370_TC_L2, V_370_ME, data); + si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, + num_dwords * 4, V_370_TC_L2, V_370_ME, data); } static void si_upload_bindless_descriptors(struct si_context *sctx) { - if (!sctx->bindless_descriptors_dirty) - return; + if (!sctx->bindless_descriptors_dirty) + return; - /* Wait for graphics/compute to be idle before updating the resident - * descriptors directly in memory, in case the GPU is using them. - */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx); - - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - unsigned desc_slot = (*tex_handle)->desc_slot; - - if (!(*tex_handle)->desc_dirty) - continue; - - si_upload_bindless_descriptor(sctx, desc_slot, 16); - (*tex_handle)->desc_dirty = false; - } - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - unsigned desc_slot = (*img_handle)->desc_slot; - - if (!(*img_handle)->desc_dirty) - continue; - - si_upload_bindless_descriptor(sctx, desc_slot, 8); - (*img_handle)->desc_dirty = false; - } - - /* Invalidate L1 because it doesn't know that L2 changed. */ - sctx->flags |= SI_CONTEXT_INV_SCACHE; - sctx->emit_cache_flush(sctx); + /* Wait for graphics/compute to be idle before updating the resident + * descriptors directly in memory, in case the GPU is using them. + */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->emit_cache_flush(sctx); + + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + unsigned desc_slot = (*tex_handle)->desc_slot; + + if (!(*tex_handle)->desc_dirty) + continue; + + si_upload_bindless_descriptor(sctx, desc_slot, 16); + (*tex_handle)->desc_dirty = false; + } + + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + unsigned desc_slot = (*img_handle)->desc_slot; + + if (!(*img_handle)->desc_dirty) + continue; + + si_upload_bindless_descriptor(sctx, desc_slot, 8); + (*img_handle)->desc_dirty = false; + } + + /* Invalidate L1 because it doesn't know that L2 changed. */ + sctx->flags |= SI_CONTEXT_INV_SCACHE; + sctx->emit_cache_flush(sctx); - sctx->bindless_descriptors_dirty = false; + sctx->bindless_descriptors_dirty = false; } /* Update mutable image descriptor fields of all resident textures. */ static void si_update_bindless_texture_descriptor(struct si_context *sctx, - struct si_texture_handle *tex_handle) + struct si_texture_handle *tex_handle) { - struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = tex_handle->desc_slot * 16; - uint32_t desc_list[16]; - - if (sview->base.texture->target == PIPE_BUFFER) - return; - - memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); - si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, - desc->list + desc_slot_offset); - - if (memcmp(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list))) { - tex_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = tex_handle->desc_slot * 16; + uint32_t desc_list[16]; + + if (sview->base.texture->target == PIPE_BUFFER) + return; + + memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); + si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, + desc->list + desc_slot_offset); + + if (memcmp(desc_list, desc->list + desc_slot_offset, + sizeof(desc_list))) { + tex_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_bindless_image_descriptor(struct si_context *sctx, - struct si_image_handle *img_handle) + struct si_image_handle *img_handle) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = img_handle->desc_slot * 16; - struct pipe_image_view *view = &img_handle->view; - uint32_t desc_list[8]; - - if (view->resource->target == PIPE_BUFFER) - return; - - memcpy(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list)); - si_set_shader_image_desc(sctx, view, true, - desc->list + desc_slot_offset, NULL); - - if (memcmp(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list))) { - img_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = img_handle->desc_slot * 16; + struct pipe_image_view *view = &img_handle->view; + struct pipe_resource *res = view->resource; + uint32_t image_desc[16]; + unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4; + + if (res->target == PIPE_BUFFER) + return; + + memcpy(image_desc, desc->list + desc_slot_offset, desc_size); + si_set_shader_image_desc(sctx, view, true, + desc->list + desc_slot_offset, + desc->list + desc_slot_offset + 8); + + if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { + img_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_all_resident_texture_descriptors(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - si_update_bindless_texture_descriptor(sctx, *tex_handle); - } - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - si_update_bindless_image_descriptor(sctx, *img_handle); - } + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + si_update_bindless_texture_descriptor(sctx, *tex_handle); + } + + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + si_update_bindless_image_descriptor(sctx, *img_handle); + } - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); } /* Update mutable image descriptor fields of all bound textures. */ void si_update_all_texture_descriptors(struct si_context *sctx) { - unsigned shader; + unsigned shader; - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_images *images = &sctx->images[shader]; - unsigned mask; - - /* Images. */ - mask = images->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; - - if (!view->resource || - view->resource->target == PIPE_BUFFER) - continue; - - si_set_shader_image(sctx, shader, i, view, true); - } - - /* Sampler views. */ - mask = samplers->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_sampler_view *view = samplers->views[i]; - - if (!view || - !view->texture || - view->texture->target == PIPE_BUFFER) - continue; - - si_set_sampler_view(sctx, shader, i, - samplers->views[i], true); - } + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_images *images = &sctx->images[shader]; + unsigned mask; + + /* Images. */ + mask = images->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; + + if (!view->resource || + view->resource->target == PIPE_BUFFER) + continue; + + si_set_shader_image(sctx, shader, i, view, true); + } + + /* Sampler views. */ + mask = samplers->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_sampler_view *view = samplers->views[i]; + + if (!view || + !view->texture || + view->texture->target == PIPE_BUFFER) + continue; + + si_set_sampler_view(sctx, shader, i, + samplers->views[i], true); + } - si_update_shader_needs_decompress_mask(sctx, shader); - } + si_update_shader_needs_decompress_mask(sctx, shader); + } - si_update_all_resident_texture_descriptors(sctx); - si_update_ps_colorbuf0_slot(sctx); + si_update_all_resident_texture_descriptors(sctx); + si_update_ps_colorbuf0_slot(sctx); } /* SHADER USER DATA */ static void si_mark_shader_pointers_dirty(struct si_context *sctx, - unsigned shader) + unsigned shader) { - sctx->shader_pointers_dirty |= - u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, - SI_NUM_SHADER_DESCS); - - if (shader == PIPE_SHADER_VERTEX) - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->shader_pointers_dirty |= + u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, + SI_NUM_SHADER_DESCS); + + if (shader == PIPE_SHADER_VERTEX) { + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && + sctx->screen->num_vbos_in_user_sgprs; + } - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } static void si_shader_pointers_begin_new_cs(struct si_context *sctx) { - sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; - sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && + sctx->screen->num_vbos_in_user_sgprs; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; } /* Set a base register address for user data constants in the given shader. * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*. */ static void si_set_user_data_base(struct si_context *sctx, - unsigned shader, uint32_t new_base) + unsigned shader, uint32_t new_base) { - uint32_t *base = &sctx->shader_pointers.sh_base[shader]; + uint32_t *base = &sctx->shader_pointers.sh_base[shader]; - if (*base != new_base) { - *base = new_base; + if (*base != new_base) { + *base = new_base; - if (new_base) - si_mark_shader_pointers_dirty(sctx, shader); - - /* Any change in enabled shader stages requires re-emitting - * the VS state SGPR, because it contains the clamp_vertex_color - * state, which can be done in VS, TES, and GS. - */ - sctx->last_vs_state = ~0; - } + if (new_base) + si_mark_shader_pointers_dirty(sctx, shader); + + /* Any change in enabled shader stages requires re-emitting + * the VS state SGPR, because it contains the clamp_vertex_color + * state, which can be done in VS, TES, and GS. + */ + sctx->last_vs_state = ~0; + } } /* This must be called when these are changed between enabled and disabled @@ -2111,911 +2110,922 @@ */ void si_shader_change_notify(struct si_context *sctx) { - /* VS can be bound as VS, ES, or LS. */ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - } else if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B530_SPI_SHADER_USER_DATA_LS_0); - } - } else if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - /* TES can be bound as ES, VS, or not bound. */ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); - } + /* VS can be bound as VS, ES, or LS. */ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B430_SPI_SHADER_USER_DATA_HS_0); + } else if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B430_SPI_SHADER_USER_DATA_LS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B530_SPI_SHADER_USER_DATA_LS_0); + } + } else if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + /* TES can be bound as ES, VS, or not bound. */ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); + } } static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, - unsigned sh_offset, - unsigned pointer_count) + unsigned sh_offset, + unsigned pointer_count) { - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); - radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); + radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); } static void si_emit_shader_pointer_body(struct si_screen *sscreen, - struct radeon_cmdbuf *cs, - uint64_t va) + struct radeon_cmdbuf *cs, + uint64_t va) { - radeon_emit(cs, va); + radeon_emit(cs, va); - assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); + assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); } static void si_emit_shader_pointer(struct si_context *sctx, - struct si_descriptors *desc, - unsigned sh_base) + struct si_descriptors *desc, + unsigned sh_base) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned sh_offset = sh_base + desc->shader_userdata_offset; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned sh_offset = sh_base + desc->shader_userdata_offset; - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); } static void si_emit_consecutive_shader_pointers(struct si_context *sctx, - unsigned pointer_mask, - unsigned sh_base) + unsigned pointer_mask, + unsigned sh_base) { - if (!sh_base) - return; + if (!sh_base) + return; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->shader_pointers_dirty & pointer_mask; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->shader_pointers_dirty & pointer_mask; - while (mask) { - int start, count; - u_bit_scan_consecutive_range(&mask, &start, &count); - - struct si_descriptors *descs = &sctx->descriptors[start]; - unsigned sh_offset = sh_base + descs->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, count); - for (int i = 0; i < count; i++) - si_emit_shader_pointer_body(sctx->screen, cs, - descs[i].gpu_address); - } + while (mask) { + int start, count; + u_bit_scan_consecutive_range(&mask, &start, &count); + + struct si_descriptors *descs = &sctx->descriptors[start]; + unsigned sh_offset = sh_base + descs->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, count); + for (int i = 0; i < count; i++) + si_emit_shader_pointer_body(sctx->screen, cs, + descs[i].gpu_address); + } } static void si_emit_global_shader_pointers(struct si_context *sctx, - struct si_descriptors *descs) + struct si_descriptors *descs) { - if (sctx->chip_class >= GFX10) { - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - /* HW VS stage only used in non-NGG mode. */ - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - return; - } else if (sctx->chip_class == GFX9) { - /* Broadcast it to all shader stages. */ - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_COMMON_0); - return; - } - - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_LS_0); + if (sctx->chip_class >= GFX10) { + si_emit_shader_pointer(sctx, descs, + R_00B030_SPI_SHADER_USER_DATA_PS_0); + /* HW VS stage only used in non-NGG mode. */ + si_emit_shader_pointer(sctx, descs, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, + R_00B430_SPI_SHADER_USER_DATA_HS_0); + return; + } else if (sctx->chip_class == GFX9) { + /* Broadcast it to all shader stages. */ + si_emit_shader_pointer(sctx, descs, + R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + return; + } + + si_emit_shader_pointer(sctx, descs, + R_00B030_SPI_SHADER_USER_DATA_PS_0); + si_emit_shader_pointer(sctx, descs, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + si_emit_shader_pointer(sctx, descs, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, + R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_emit_shader_pointer(sctx, descs, + R_00B530_SPI_SHADER_USER_DATA_LS_0); } void si_emit_graphics_shader_pointers(struct si_context *sctx) { - uint32_t *sh_base = sctx->shader_pointers.sh_base; + uint32_t *sh_base = sctx->shader_pointers.sh_base; - if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { - si_emit_global_shader_pointers(sctx, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - } - - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), - sh_base[PIPE_SHADER_VERTEX]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), - sh_base[PIPE_SHADER_TESS_EVAL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), - sh_base[PIPE_SHADER_FRAGMENT]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), - sh_base[PIPE_SHADER_TESS_CTRL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), - sh_base[PIPE_SHADER_GEOMETRY]); - - sctx->shader_pointers_dirty &= - ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); - - if (sctx->vertex_buffer_pointer_dirty) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* Find the location of the VB descriptor pointer. */ - /* TODO: In the future, the pointer will be packed in unused - * bits of the first 2 VB descriptors. */ - unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; - if (sctx->chip_class >= GFX9) { - if (sctx->tes_shader.cso) - sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; - else if (sctx->gs_shader.cso) - sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; - } - - unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset); - sctx->vertex_buffer_pointer_dirty = false; - } - - if (sctx->graphics_bindless_pointer_dirty) { - si_emit_global_shader_pointers(sctx, - &sctx->bindless_descriptors); - sctx->graphics_bindless_pointer_dirty = false; - } + if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { + si_emit_global_shader_pointers(sctx, + &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + } + + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), + sh_base[PIPE_SHADER_VERTEX]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), + sh_base[PIPE_SHADER_TESS_EVAL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), + sh_base[PIPE_SHADER_FRAGMENT]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY]); + + sctx->shader_pointers_dirty &= + ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); + + if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Find the location of the VB descriptor pointer. */ + unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; + if (sctx->chip_class >= GFX9) { + if (sctx->tes_shader.cso) + sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; + else if (sctx->gs_shader.cso) + sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; + } + + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset); + sctx->vertex_buffer_pointer_dirty = false; + } + + if (sctx->vertex_buffer_user_sgprs_dirty && + sctx->num_vertex_elements && + sctx->screen->num_vbos_in_user_sgprs) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_desc = MIN2(sctx->num_vertex_elements, + sctx->screen->num_vbos_in_user_sgprs); + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; + + si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); + radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (sctx->graphics_bindless_pointer_dirty) { + si_emit_global_shader_pointers(sctx, + &sctx->bindless_descriptors); + sctx->graphics_bindless_pointer_dirty = false; + } } void si_emit_compute_shader_pointers(struct si_context *sctx) { - unsigned base = R_00B900_COMPUTE_USER_DATA_0; + unsigned base = R_00B900_COMPUTE_USER_DATA_0; - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), - R_00B900_COMPUTE_USER_DATA_0); - sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); - - if (sctx->compute_bindless_pointer_dirty) { - si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); - sctx->compute_bindless_pointer_dirty = false; - } + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), + R_00B900_COMPUTE_USER_DATA_0); + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); + + if (sctx->compute_bindless_pointer_dirty) { + si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); + sctx->compute_bindless_pointer_dirty = false; + } } /* BINDLESS */ static void si_init_bindless_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned num_elements) -{ - ASSERTED unsigned desc_slot; - - si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); - sctx->bindless_descriptors.num_active_slots = num_elements; - - /* The first bindless descriptor is stored at slot 1, because 0 is not - * considered to be a valid handle. - */ - sctx->num_bindless_descriptors = 1; - - /* Track which bindless slots are used (or not). */ - util_idalloc_init(&sctx->bindless_used_slots); - util_idalloc_resize(&sctx->bindless_used_slots, num_elements); - - /* Reserve slot 0 because it's an invalid handle for bindless. */ - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - assert(desc_slot == 0); + struct si_descriptors *desc, + short shader_userdata_rel_index, + unsigned num_elements) +{ + ASSERTED unsigned desc_slot; + + si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); + sctx->bindless_descriptors.num_active_slots = num_elements; + + /* The first bindless descriptor is stored at slot 1, because 0 is not + * considered to be a valid handle. + */ + sctx->num_bindless_descriptors = 1; + + /* Track which bindless slots are used (or not). */ + util_idalloc_init(&sctx->bindless_used_slots); + util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + + /* Reserve slot 0 because it's an invalid handle for bindless. */ + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + assert(desc_slot == 0); } static void si_release_bindless_descriptors(struct si_context *sctx) { - si_release_descriptors(&sctx->bindless_descriptors); - util_idalloc_fini(&sctx->bindless_used_slots); + si_release_descriptors(&sctx->bindless_descriptors); + util_idalloc_fini(&sctx->bindless_used_slots); } static unsigned si_get_first_free_bindless_slot(struct si_context *sctx) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot; - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - if (desc_slot >= desc->num_elements) { - /* The array of bindless descriptors is full, resize it. */ - unsigned slot_size = desc->element_dw_size * 4; - unsigned new_num_elements = desc->num_elements * 2; - - desc->list = REALLOC(desc->list, desc->num_elements * slot_size, - new_num_elements * slot_size); - desc->num_elements = new_num_elements; - desc->num_active_slots = new_num_elements; - } + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + if (desc_slot >= desc->num_elements) { + /* The array of bindless descriptors is full, resize it. */ + unsigned slot_size = desc->element_dw_size * 4; + unsigned new_num_elements = desc->num_elements * 2; + + desc->list = REALLOC(desc->list, desc->num_elements * slot_size, + new_num_elements * slot_size); + desc->num_elements = new_num_elements; + desc->num_active_slots = new_num_elements; + } - assert(desc_slot); - return desc_slot; + assert(desc_slot); + return desc_slot; } static unsigned si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, - unsigned size) + unsigned size) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot, desc_slot_offset; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot, desc_slot_offset; - /* Find a free slot. */ - desc_slot = si_get_first_free_bindless_slot(sctx); + /* Find a free slot. */ + desc_slot = si_get_first_free_bindless_slot(sctx); - /* For simplicity, sampler and image bindless descriptors use fixed - * 16-dword slots for now. Image descriptors only need 8-dword but this - * doesn't really matter because no real apps use image handles. - */ - desc_slot_offset = desc_slot * 16; + /* For simplicity, sampler and image bindless descriptors use fixed + * 16-dword slots for now. Image descriptors only need 8-dword but this + * doesn't really matter because no real apps use image handles. + */ + desc_slot_offset = desc_slot * 16; - /* Copy the descriptor into the array. */ - memcpy(desc->list + desc_slot_offset, desc_list, size); + /* Copy the descriptor into the array. */ + memcpy(desc->list + desc_slot_offset, desc_list, size); - /* Re-upload the whole array of bindless descriptors into a new buffer. - */ - if (!si_upload_descriptors(sctx, desc)) - return 0; + /* Re-upload the whole array of bindless descriptors into a new buffer. + */ + if (!si_upload_descriptors(sctx, desc)) + return 0; - /* Make sure to re-emit the shader pointers for all stages. */ - sctx->graphics_bindless_pointer_dirty = true; - sctx->compute_bindless_pointer_dirty = true; + /* Make sure to re-emit the shader pointers for all stages. */ + sctx->graphics_bindless_pointer_dirty = true; + sctx->compute_bindless_pointer_dirty = true; - return desc_slot; + return desc_slot; } static void si_update_bindless_buffer_descriptor(struct si_context *sctx, - unsigned desc_slot, - struct pipe_resource *resource, - uint64_t offset, - bool *desc_dirty) -{ - struct si_descriptors *desc = &sctx->bindless_descriptors; - struct si_resource *buf = si_resource(resource); - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *desc_list = desc->list + desc_slot_offset + 4; - uint64_t old_desc_va; - - assert(resource->target == PIPE_BUFFER); - - /* Retrieve the old buffer addr from the descriptor. */ - old_desc_va = si_desc_extract_buffer_address(desc_list); - - if (old_desc_va != buf->gpu_address + offset) { - /* The buffer has been invalidated when the handle wasn't - * resident, update the descriptor and the dirty flag. - */ - si_set_buf_desc_address(buf, offset, &desc_list[0]); + unsigned desc_slot, + struct pipe_resource *resource, + uint64_t offset, + bool *desc_dirty) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + struct si_resource *buf = si_resource(resource); + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *desc_list = desc->list + desc_slot_offset + 4; + uint64_t old_desc_va; + + assert(resource->target == PIPE_BUFFER); + + /* Retrieve the old buffer addr from the descriptor. */ + old_desc_va = si_desc_extract_buffer_address(desc_list); + + if (old_desc_va != buf->gpu_address + offset) { + /* The buffer has been invalidated when the handle wasn't + * resident, update the descriptor and the dirty flag. + */ + si_set_buf_desc_address(buf, offset, &desc_list[0]); - *desc_dirty = true; - } + *desc_dirty = true; + } } static uint64_t si_create_texture_handle(struct pipe_context *ctx, - struct pipe_sampler_view *view, - const struct pipe_sampler_state *state) + struct pipe_sampler_view *view, + const struct pipe_sampler_state *state) { - struct si_sampler_view *sview = (struct si_sampler_view *)view; - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_state *sstate; - uint32_t desc_list[16]; - uint64_t handle; - - tex_handle = CALLOC_STRUCT(si_texture_handle); - if (!tex_handle) - return 0; - - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); - - sstate = ctx->create_sampler_state(ctx, state); - if (!sstate) { - FREE(tex_handle); - return 0; - } - - si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); - memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); - ctx->delete_sampler_state(ctx, sstate); - - tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!tex_handle->desc_slot) { - FREE(tex_handle); - return 0; - } - - handle = tex_handle->desc_slot; - - if (!_mesa_hash_table_insert(sctx->tex_handles, - (void *)(uintptr_t)handle, - tex_handle)) { - FREE(tex_handle); - return 0; - } + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_state *sstate; + uint32_t desc_list[16]; + uint64_t handle; + + tex_handle = CALLOC_STRUCT(si_texture_handle); + if (!tex_handle) + return 0; + + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); + + sstate = ctx->create_sampler_state(ctx, state); + if (!sstate) { + FREE(tex_handle); + return 0; + } + + si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); + memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); + ctx->delete_sampler_state(ctx, sstate); + + tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, + sizeof(desc_list)); + if (!tex_handle->desc_slot) { + FREE(tex_handle); + return 0; + } + + handle = tex_handle->desc_slot; + + if (!_mesa_hash_table_insert(sctx->tex_handles, + (void *)(uintptr_t)handle, + tex_handle)) { + FREE(tex_handle); + return 0; + } - pipe_sampler_view_reference(&tex_handle->view, view); + pipe_sampler_view_reference(&tex_handle->view, view); - si_resource(sview->base.texture)->texture_handle_allocated = true; + si_resource(sview->base.texture)->texture_handle_allocated = true; - return handle; + return handle; } static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - - /* Allow this descriptor slot to be re-used. */ - util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); - - pipe_sampler_view_reference(&tex_handle->view, NULL); - _mesa_hash_table_remove(sctx->tex_handles, entry); - FREE(tex_handle); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + + /* Allow this descriptor slot to be re-used. */ + util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); + + pipe_sampler_view_reference(&tex_handle->view, NULL); + _mesa_hash_table_remove(sctx->tex_handles, entry); + FREE(tex_handle); } static void si_make_texture_handle_resident(struct pipe_context *ctx, - uint64_t handle, bool resident) + uint64_t handle, bool resident) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_view *sview; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - sview = (struct si_sampler_view *)tex_handle->view; - - if (resident) { - if (sview->base.texture->target != PIPE_BUFFER) { - struct si_texture *tex = - (struct si_texture *)sview->base.texture; - - if (depth_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (tex->dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_texture_descriptor(sctx, tex_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - tex_handle->desc_slot, - sview->base.texture, - sview->base.u.buf.offset, - &tex_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated while it - * wasn't resident. - */ - if (tex_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the texture handle to the per-context list. */ - util_dynarray_append(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } else { - /* Remove the texture handle from the per-context list. */ - util_dynarray_delete_unordered(&sctx->resident_tex_handles, - struct si_texture_handle *, - tex_handle); - - if (sview->base.texture->target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle); - - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_view *sview; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + sview = (struct si_sampler_view *)tex_handle->view; + + if (resident) { + if (sview->base.texture->target != PIPE_BUFFER) { + struct si_texture *tex = + (struct si_texture *)sview->base.texture; + + if (depth_needs_decompression(tex)) { + util_dynarray_append( + &sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, + tex_handle); + } + + if (color_needs_decompression(tex)) { + util_dynarray_append( + &sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, + tex_handle); + } + + if (tex->surface.dcc_offset && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_texture_descriptor(sctx, tex_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, + tex_handle->desc_slot, + sview->base.texture, + sview->base.u.buf.offset, + &tex_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (tex_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the texture handle to the per-context list. */ + util_dynarray_append(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, sview->base.texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } else { + /* Remove the texture handle from the per-context list. */ + util_dynarray_delete_unordered(&sctx->resident_tex_handles, + struct si_texture_handle *, + tex_handle); + + if (sview->base.texture->target != PIPE_BUFFER) { + util_dynarray_delete_unordered( + &sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + + util_dynarray_delete_unordered( + &sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + } } static uint64_t si_create_image_handle(struct pipe_context *ctx, - const struct pipe_image_view *view) + const struct pipe_image_view *view) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - uint32_t desc_list[8]; - uint64_t handle; + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + uint32_t desc_list[16]; + uint64_t handle; - if (!view || !view->resource) - return 0; + if (!view || !view->resource) + return 0; - img_handle = CALLOC_STRUCT(si_image_handle); - if (!img_handle) - return 0; + img_handle = CALLOC_STRUCT(si_image_handle); + if (!img_handle) + return 0; - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 8, 1, null_image_descriptor); + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); - si_set_shader_image_desc(sctx, view, false, &desc_list[0], NULL); + si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); - img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!img_handle->desc_slot) { - FREE(img_handle); - return 0; - } + img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, + sizeof(desc_list)); + if (!img_handle->desc_slot) { + FREE(img_handle); + return 0; + } - handle = img_handle->desc_slot; + handle = img_handle->desc_slot; - if (!_mesa_hash_table_insert(sctx->img_handles, - (void *)(uintptr_t)handle, - img_handle)) { - FREE(img_handle); - return 0; - } + if (!_mesa_hash_table_insert(sctx->img_handles, + (void *)(uintptr_t)handle, + img_handle)) { + FREE(img_handle); + return 0; + } - util_copy_image_view(&img_handle->view, view); + util_copy_image_view(&img_handle->view, view); - si_resource(view->resource)->image_handle_allocated = true; + si_resource(view->resource)->image_handle_allocated = true; - return handle; + return handle; } static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - - util_copy_image_view(&img_handle->view, NULL); - _mesa_hash_table_remove(sctx->img_handles, entry); - FREE(img_handle); + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + + util_copy_image_view(&img_handle->view, NULL); + _mesa_hash_table_remove(sctx->img_handles, entry); + FREE(img_handle); } static void si_make_image_handle_resident(struct pipe_context *ctx, - uint64_t handle, unsigned access, - bool resident) + uint64_t handle, unsigned access, + bool resident) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct pipe_image_view *view; - struct si_resource *res; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - view = &img_handle->view; - res = si_resource(view->resource); - - if (resident) { - if (res->b.b.target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_image_descriptor(sctx, img_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - img_handle->desc_slot, - view->resource, - view->u.buf.offset, - &img_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated while it - * wasn't resident. - */ - if (img_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the image handle to the per-context list. */ - util_dynarray_append(&sctx->resident_img_handles, - struct si_image_handle *, img_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, view->resource, - (access & PIPE_IMAGE_ACCESS_WRITE) ? - RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, false, false); - } else { - /* Remove the image handle from the per-context list. */ - util_dynarray_delete_unordered(&sctx->resident_img_handles, - struct si_image_handle *, - img_handle); - - if (res->b.b.target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct pipe_image_view *view; + struct si_resource *res; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + view = &img_handle->view; + res = si_resource(view->resource); + + if (resident) { + if (res->b.b.target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + util_dynarray_append( + &sctx->resident_img_needs_color_decompress, + struct si_image_handle *, + img_handle); + } + + if (vi_dcc_enabled(tex, level) && + p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_image_descriptor(sctx, img_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, + img_handle->desc_slot, + view->resource, + view->u.buf.offset, + &img_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (img_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the image handle to the per-context list. */ + util_dynarray_append(&sctx->resident_img_handles, + struct si_image_handle *, img_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, view->resource, + (access & PIPE_IMAGE_ACCESS_WRITE) ? + RADEON_USAGE_READWRITE : + RADEON_USAGE_READ, false, false); + } else { + /* Remove the image handle from the per-context list. */ + util_dynarray_delete_unordered(&sctx->resident_img_handles, + struct si_image_handle *, + img_handle); + + if (res->b.b.target != PIPE_BUFFER) { + util_dynarray_delete_unordered( + &sctx->resident_img_needs_color_decompress, + struct si_image_handle *, + img_handle); + } + } } static void si_resident_buffers_add_all_to_bo_list(struct si_context *sctx) { - unsigned num_resident_tex_handles, num_resident_img_handles; + unsigned num_resident_tex_handles, num_resident_img_handles; - num_resident_tex_handles = sctx->resident_tex_handles.size / - sizeof(struct si_texture_handle *); - num_resident_img_handles = sctx->resident_img_handles.size / - sizeof(struct si_image_handle *); - - /* Add all resident texture handles. */ - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct si_sampler_view *sview = - (struct si_sampler_view *)(*tex_handle)->view; - - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } - - /* Add all resident image handles. */ - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, - false, false); - } - - sctx->num_resident_handles += num_resident_tex_handles + - num_resident_img_handles; - assert(sctx->bo_list_add_all_resident_resources); - sctx->bo_list_add_all_resident_resources = false; + num_resident_tex_handles = sctx->resident_tex_handles.size / + sizeof(struct si_texture_handle *); + num_resident_img_handles = sctx->resident_img_handles.size / + sizeof(struct si_image_handle *); + + /* Add all resident texture handles. */ + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct si_sampler_view *sview = + (struct si_sampler_view *)(*tex_handle)->view; + + si_sampler_view_add_buffer(sctx, sview->base.texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } + + /* Add all resident image handles. */ + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + + si_sampler_view_add_buffer(sctx, view->resource, + RADEON_USAGE_READWRITE, + false, false); + } + + sctx->num_resident_handles += num_resident_tex_handles + + num_resident_img_handles; + assert(sctx->bo_list_add_all_resident_resources); + sctx->bo_list_add_all_resident_resources = false; } /* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { - int i; - unsigned first_shader = - sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; - - for (i = first_shader; i < SI_NUM_SHADERS; i++) { - bool is_2nd = sctx->chip_class >= GFX9 && - (i == PIPE_SHADER_TESS_CTRL || - i == PIPE_SHADER_GEOMETRY); - unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; - unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; - int rel_dw_offset; - struct si_descriptors *desc; - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; - } - desc = si_const_and_shader_buffer_descriptors(sctx, i); - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, - num_buffer_slots, rel_dw_offset, - RADEON_PRIO_SHADER_RW_BUFFER, - RADEON_PRIO_CONST_BUFFER); - desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; - } - - desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); - - int j; - for (j = 0; j < SI_NUM_IMAGES; j++) - memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); - for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++) - memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); - } - - si_init_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, - /* The second priority is used by - * const buffers in RW buffer slots. */ - RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); - sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - - /* Initialize an array of 1024 bindless descriptors, when the limit is - * reached, just make it larger and re-upload the whole array. - */ - si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, - SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, - 1024); - - sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - - /* Set pipe_context functions. */ - sctx->b.bind_sampler_states = si_bind_sampler_states; - sctx->b.set_shader_images = si_set_shader_images; - sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; - sctx->b.set_shader_buffers = si_set_shader_buffers; - sctx->b.set_sampler_views = si_set_sampler_views; - sctx->b.create_texture_handle = si_create_texture_handle; - sctx->b.delete_texture_handle = si_delete_texture_handle; - sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; - sctx->b.create_image_handle = si_create_image_handle; - sctx->b.delete_image_handle = si_delete_image_handle; - sctx->b.make_image_handle_resident = si_make_image_handle_resident; - - if (!sctx->has_graphics) - return; - - sctx->b.set_polygon_stipple = si_set_polygon_stipple; - - /* Shader user data. */ - sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; - - /* Set default and immutable mappings. */ - if (sctx->ngg) { - assert(sctx->chip_class >= GFX10); - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } - si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); + int i; + unsigned first_shader = + sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; + + for (i = first_shader; i < SI_NUM_SHADERS; i++) { + bool is_2nd = sctx->chip_class >= GFX9 && + (i == PIPE_SHADER_TESS_CTRL || + i == PIPE_SHADER_GEOMETRY); + unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; + unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; + int rel_dw_offset; + struct si_descriptors *desc; + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - + R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - + R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - + R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; + } + desc = si_const_and_shader_buffer_descriptors(sctx, i); + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, + num_buffer_slots, rel_dw_offset, + RADEON_PRIO_SHADER_RW_BUFFER, + RADEON_PRIO_CONST_BUFFER); + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - + R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - + R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - + R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; + } + + desc = si_sampler_and_image_descriptors(sctx, i); + si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); + + int j; + for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) + memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); + for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) + memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); + } + + si_init_buffer_resources(&sctx->rw_buffers, + &sctx->descriptors[SI_DESCS_RW_BUFFERS], + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, + /* The second priority is used by + * const buffers in RW buffer slots. */ + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + + /* Initialize an array of 1024 bindless descriptors, when the limit is + * reached, just make it larger and re-upload the whole array. + */ + si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, + 1024); + + sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + + /* Set pipe_context functions. */ + sctx->b.bind_sampler_states = si_bind_sampler_states; + sctx->b.set_shader_images = si_set_shader_images; + sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; + sctx->b.set_shader_buffers = si_set_shader_buffers; + sctx->b.set_sampler_views = si_set_sampler_views; + sctx->b.create_texture_handle = si_create_texture_handle; + sctx->b.delete_texture_handle = si_delete_texture_handle; + sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; + sctx->b.create_image_handle = si_create_image_handle; + sctx->b.delete_image_handle = si_delete_image_handle; + sctx->b.make_image_handle_resident = si_make_image_handle_resident; + + if (!sctx->has_graphics) + return; + + sctx->b.set_polygon_stipple = si_set_polygon_stipple; + + /* Shader user data. */ + sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; + + /* Set default and immutable mappings. */ + if (sctx->ngg) { + assert(sctx->chip_class >= GFX10); + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, + R_00B430_SPI_SHADER_USER_DATA_LS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, + R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + } + si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); } static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask) { - unsigned dirty = sctx->descriptors_dirty & mask; + unsigned dirty = sctx->descriptors_dirty & mask; - /* Assume nothing will go wrong: */ - sctx->shader_pointers_dirty |= dirty; + /* Assume nothing will go wrong: */ + sctx->shader_pointers_dirty |= dirty; - while (dirty) { - unsigned i = u_bit_scan(&dirty); + while (dirty) { + unsigned i = u_bit_scan(&dirty); - if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) - return false; - } + if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) + return false; + } - sctx->descriptors_dirty &= ~mask; + sctx->descriptors_dirty &= ~mask; - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); - return true; + return true; } bool si_upload_graphics_shader_descriptors(struct si_context *sctx) { - const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } bool si_upload_compute_shader_descriptors(struct si_context *sctx) { - /* Does not update rw_buffers as that is not needed for compute shaders - * and the input buffer is using the same SGPR's anyway. - */ - const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, - SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + /* Does not update rw_buffers as that is not needed for compute shaders + * and the input buffer is using the same SGPR's anyway. + */ + const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, + SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } void si_release_all_descriptors(struct si_context *sctx) { - int i; + int i; - for (i = 0; i < SI_NUM_SHADERS; i++) { - si_release_buffer_resources(&sctx->const_and_shader_buffers[i], - si_const_and_shader_buffer_descriptors(sctx, i)); - si_release_sampler_views(&sctx->samplers[i]); - si_release_image_views(&sctx->images[i]); - } - si_release_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) - pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_release_buffer_resources(&sctx->const_and_shader_buffers[i], + si_const_and_shader_buffer_descriptors(sctx, i)); + si_release_sampler_views(&sctx->samplers[i]); + si_release_image_views(&sctx->images[i]); + } + si_release_buffer_resources(&sctx->rw_buffers, + &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) + pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); - for (i = 0; i < SI_NUM_DESCS; ++i) - si_release_descriptors(&sctx->descriptors[i]); + for (i = 0; i < SI_NUM_DESCS; ++i) + si_release_descriptors(&sctx->descriptors[i]); - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ - si_release_bindless_descriptors(sctx); + si_release_bindless_descriptors(sctx); } void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); - si_image_views_begin_new_cs(sctx, &sctx->images[i]); - } - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - si_vertex_buffers_begin_new_cs(sctx); + for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); + si_image_views_begin_new_cs(sctx, &sctx->images[i]); + } + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_vertex_buffers_begin_new_cs(sctx); - if (sctx->bo_list_add_all_resident_resources) - si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_gfx_resources); - sctx->bo_list_add_all_gfx_resources = false; + assert(sctx->bo_list_add_all_gfx_resources); + sctx->bo_list_add_all_gfx_resources = false; } void si_compute_resources_add_all_to_bo_list(struct si_context *sctx) { - unsigned sh = PIPE_SHADER_COMPUTE; + unsigned sh = PIPE_SHADER_COMPUTE; - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); - si_image_views_begin_new_cs(sctx, &sctx->images[sh]); - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); + si_image_views_begin_new_cs(sctx, &sctx->images[sh]); + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - if (sctx->bo_list_add_all_resident_resources) - si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_compute_resources); - sctx->bo_list_add_all_compute_resources = false; + assert(sctx->bo_list_add_all_compute_resources); + sctx->bo_list_add_all_compute_resources = false; } void si_all_descriptors_begin_new_cs(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_DESCS; ++i) - si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); - si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); - - si_shader_pointers_begin_new_cs(sctx); - - sctx->bo_list_add_all_resident_resources = true; - sctx->bo_list_add_all_gfx_resources = true; - sctx->bo_list_add_all_compute_resources = true; + for (unsigned i = 0; i < SI_NUM_DESCS; ++i) + si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); + si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); + + si_shader_pointers_begin_new_cs(sctx); + + sctx->bo_list_add_all_resident_resources = true; + sctx->bo_list_add_all_gfx_resources = true; + sctx->bo_list_add_all_compute_resources = true; } void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, - uint64_t new_active_mask) + uint64_t new_active_mask) { - struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; - /* Ignore no-op updates and updates that disable all slots. */ - if (!new_active_mask || - new_active_mask == u_bit_consecutive64(desc->first_active_slot, - desc->num_active_slots)) - return; - - int first, count; - u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); - assert(new_active_mask == 0); - - /* Upload/dump descriptors if slots are being enabled. */ - if (first < desc->first_active_slot || - first + count > desc->first_active_slot + desc->num_active_slots) - sctx->descriptors_dirty |= 1u << desc_idx; + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, + desc->num_active_slots)) + return; + + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); + + /* Upload/dump descriptors if slots are being enabled. */ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; - desc->first_active_slot = first; - desc->num_active_slots = count; + desc->first_active_slot = first; + desc->num_active_slots = count; } void si_set_active_descriptors_for_shader(struct si_context *sctx, - struct si_shader_selector *sel) + struct si_shader_selector *sel) { - if (!sel) - return; + if (!sel) + return; - si_set_active_descriptors(sctx, - si_const_and_shader_buffer_descriptors_idx(sel->type), - sel->active_const_and_shader_buffers); - si_set_active_descriptors(sctx, - si_sampler_and_image_descriptors_idx(sel->type), - sel->active_samplers_and_images); + si_set_active_descriptors(sctx, + si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_dma.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_dma.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_dma.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_dma.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,289 +0,0 @@ -/* - * Copyright 2010 Jerome Glisse - * Copyright 2018 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "sid.h" -#include "si_pipe.h" - -#include "util/u_format.h" - -static void si_dma_copy_buffer(struct si_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src, - uint64_t dst_offset, - uint64_t src_offset, - uint64_t size) -{ - struct radeon_cmdbuf *cs = ctx->dma_cs; - unsigned i, ncopy, count, max_size, sub_cmd, shift; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&sdst->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += sdst->gpu_address; - src_offset += ssrc->gpu_address; - - /* see whether we should use the dword-aligned or byte-aligned copy */ - if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { - sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; - shift = 2; - max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; - } else { - sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; - shift = 0; - max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; - } - - ncopy = DIV_ROUND_UP(size, max_size); - si_need_dma_space(ctx, ncopy * 5, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - count = MIN2(size, max_size); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, - count >> shift)); - radeon_emit(cs, dst_offset); - radeon_emit(cs, src_offset); - radeon_emit(cs, (dst_offset >> 32UL) & 0xff); - radeon_emit(cs, (src_offset >> 32UL) & 0xff); - dst_offset += count; - src_offset += count; - size -= count; - } -} - -static void si_dma_copy_tile(struct si_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dst_x, - unsigned dst_y, - unsigned dst_z, - struct pipe_resource *src, - unsigned src_level, - unsigned src_x, - unsigned src_y, - unsigned src_z, - unsigned copy_height, - unsigned pitch, - unsigned bpp) -{ - struct radeon_cmdbuf *cs = ctx->dma_cs; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; - bool detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED; - struct si_texture *linear = detile ? sdst : ssrc; - struct si_texture *tiled = detile ? ssrc : sdst; - unsigned linear_lvl = detile ? dst_level : src_level; - unsigned tiled_lvl = detile ? src_level : dst_level; - struct radeon_info *info = &ctx->screen->info; - unsigned index = tiled->surface.u.legacy.tiling_index[tiled_lvl]; - unsigned tile_mode = info->si_tile_mode_array[index]; - unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; - unsigned ncopy, height, cheight, i; - unsigned linear_x, linear_y, linear_z, tiled_x, tiled_y, tiled_z; - unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt; - uint64_t base, addr; - unsigned pipe_config; - - assert(dst_mode != ssrc->surface.u.legacy.level[src_level].mode); - - sub_cmd = SI_DMA_COPY_TILED; - lbpp = util_logbase2(bpp); - pitch_tile_max = ((pitch / bpp) / 8) - 1; - - linear_x = detile ? dst_x : src_x; - linear_y = detile ? dst_y : src_y; - linear_z = detile ? dst_z : src_z; - tiled_x = detile ? src_x : dst_x; - tiled_y = detile ? src_y : dst_y; - tiled_z = detile ? src_z : dst_z; - - assert(!util_format_is_depth_and_stencil(tiled->buffer.b.b.format)); - - array_mode = G_009910_ARRAY_MODE(tile_mode); - slice_tile_max = (tiled->surface.u.legacy.level[tiled_lvl].nblk_x * - tiled->surface.u.legacy.level[tiled_lvl].nblk_y) / (8*8) - 1; - /* linear height must be the same as the slice tile max height, it's ok even - * if the linear destination/source have smaller heigh as the size of the - * dma packet will be using the copy_height which is always smaller or equal - * to the linear height - */ - height = tiled->surface.u.legacy.level[tiled_lvl].nblk_y; - base = tiled->surface.u.legacy.level[tiled_lvl].offset; - addr = linear->surface.u.legacy.level[linear_lvl].offset; - addr += (uint64_t)linear->surface.u.legacy.level[linear_lvl].slice_size_dw * 4 * linear_z; - addr += linear_y * pitch + linear_x * bpp; - bank_h = G_009910_BANK_HEIGHT(tile_mode); - bank_w = G_009910_BANK_WIDTH(tile_mode); - mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode); - /* Non-depth modes don't have TILE_SPLIT set. */ - tile_split = util_logbase2(tiled->surface.u.legacy.tile_split >> 6); - nbanks = G_009910_NUM_BANKS(tile_mode); - base += tiled->buffer.gpu_address; - addr += linear->buffer.gpu_address; - - pipe_config = G_009910_PIPE_CONFIG(tile_mode); - mt = G_009910_MICRO_TILE_MODE(tile_mode); - size = copy_height * pitch; - ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - si_need_dma_space(ctx, ncopy * 9, &sdst->buffer, &ssrc->buffer); - - for (i = 0; i < ncopy; i++) { - cheight = copy_height; - if (cheight * pitch > SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE) { - cheight = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE / pitch; - } - size = cheight * pitch; - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size / 4)); - radeon_emit(cs, base >> 8); - radeon_emit(cs, (detile << 31) | (array_mode << 27) | - (lbpp << 24) | (bank_h << 21) | - (bank_w << 18) | (mt_aspect << 16)); - radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16)); - radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26)); - radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18)); - radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27)); - radeon_emit(cs, addr & 0xfffffffc); - radeon_emit(cs, (addr >> 32UL) & 0xff); - copy_height -= cheight; - addr += cheight * pitch; - tiled_y += cheight; - } -} - -static void si_dma_copy(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode; - unsigned src_w, dst_w; - unsigned src_x, src_y; - unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - - if (sctx->dma_cs == NULL || - src->flags & PIPE_RESOURCE_FLAG_SPARSE || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { - goto fallback; - } - - if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { - si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width); - return; - } - - /* XXX: Using the asynchronous DMA engine for multi-dimensional - * operations seems to cause random GPU lockups for various people. - * While the root cause for this might need to be fixed in the kernel, - * let's disable it for now. - * - * Before re-enabling this, please make sure you can hit all newly - * enabled paths in your testing, preferably with both piglit and real - * world apps, and get in touch with people on the bug reports below - * for stability testing. - * - * https://bugs.freedesktop.org/show_bug.cgi?id=85647 - * https://bugs.freedesktop.org/show_bug.cgi?id=83500 - */ - goto fallback; - - if (src_box->depth > 1 || - !si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, - dstz, ssrc, src_level, src_box)) - goto fallback; - - src_x = util_format_get_nblocksx(src->format, src_box->x); - dst_x = util_format_get_nblocksx(src->format, dst_x); - src_y = util_format_get_nblocksy(src->format, src_box->y); - dst_y = util_format_get_nblocksy(src->format, dst_y); - - bpp = sdst->surface.bpe; - dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x * sdst->surface.bpe; - src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x * ssrc->surface.bpe; - src_w = u_minify(ssrc->buffer.b.b.width0, src_level); - dst_w = u_minify(sdst->buffer.b.b.width0, dst_level); - - dst_mode = sdst->surface.u.legacy.level[dst_level].mode; - src_mode = ssrc->surface.u.legacy.level[src_level].mode; - - if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w || - src_box->width != src_w || - src_box->height != u_minify(ssrc->buffer.b.b.height0, src_level) || - src_box->height != u_minify(sdst->buffer.b.b.height0, dst_level) || - ssrc->surface.u.legacy.level[src_level].nblk_y != - sdst->surface.u.legacy.level[dst_level].nblk_y) { - /* FIXME si can do partial blit */ - goto fallback; - } - /* the x test here are currently useless (because we don't support partial blit) - * but keep them around so we don't forget about those - */ - if ((src_pitch % 8) || (src_box->x % 8) || (dst_x % 8) || - (src_box->y % 8) || (dst_y % 8) || (src_box->height % 8)) { - goto fallback; - } - - if (src_mode == dst_mode) { - uint64_t dst_offset, src_offset; - /* simple dma blit would do NOTE code here assume : - * src_box.x/y == 0 - * dst_x/y == 0 - * dst_pitch == src_pitch - */ - src_offset= ssrc->surface.u.legacy.level[src_level].offset; - src_offset += (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4 * src_box->z; - src_offset += src_y * src_pitch + src_x * bpp; - dst_offset = sdst->surface.u.legacy.level[dst_level].offset; - dst_offset += (uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4 * dst_z; - dst_offset += dst_y * dst_pitch + dst_x * bpp; - si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, - (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4); - } else { - si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z, - src, src_level, src_x, src_y, src_box->z, - src_box->height / ssrc->surface.blk_h, - dst_pitch, bpp); - } - return; - -fallback: - si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); -} - -void si_init_dma_functions(struct si_context *sctx) -{ - sctx->dma_copy = si_dma_copy; -} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_dma_cs.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_dma_cs.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_dma_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_dma_cs.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ static void si_dma_emit_wait_idle(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; /* NOP waits for idle. */ if (sctx->chip_class >= GFX7) @@ -39,7 +39,7 @@ void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; uint64_t va = dst->gpu_address + offset; if (sctx->chip_class == GFX6) { @@ -50,7 +50,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&dst->valid_buffer_range, offset, offset + 8); + util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8); assert(va % 8 == 0); @@ -67,7 +67,7 @@ void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned clear_value) { - struct radeon_cmdbuf *cs = sctx->dma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; unsigned i, ncopy, csize; struct si_resource *sdst = si_resource(dst); @@ -75,7 +75,8 @@ assert(size); assert(size % 4 == 0); - if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || + sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) { sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); return; } @@ -83,7 +84,7 @@ /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - util_range_add(&sdst->valid_buffer_range, offset, offset + size); + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); offset += sdst->gpu_address; @@ -117,18 +118,108 @@ radeon_emit(cs, offset); radeon_emit(cs, offset >> 32); radeon_emit(cs, clear_value); - radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); + /* dw count */ + radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc); offset += csize; size -= csize; } } +void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, + uint64_t src_offset, uint64_t size) +{ + struct radeon_cmdbuf *cs = sctx->sdma_cs; + unsigned i, ncopy, csize; + struct si_resource *sdst = si_resource(dst); + struct si_resource *ssrc = si_resource(src); + + if (!cs || + dst->flags & PIPE_RESOURCE_FLAG_SPARSE || + src->flags & PIPE_RESOURCE_FLAG_SPARSE) { + si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(dst, &sdst->valid_buffer_range, dst_offset, + dst_offset + size); + + dst_offset += sdst->gpu_address; + src_offset += ssrc->gpu_address; + + if (sctx->chip_class == GFX6) { + unsigned max_size, sub_cmd, shift; + + /* see whether we should use the dword-aligned or byte-aligned copy */ + if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { + sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; + shift = 2; + max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; + } else { + sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; + shift = 0; + max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; + } + + ncopy = DIV_ROUND_UP(size, max_size); + si_need_dma_space(sctx, ncopy * 5, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, max_size); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, + csize >> shift)); + radeon_emit(cs, dst_offset); + radeon_emit(cs, src_offset); + radeon_emit(cs, (dst_offset >> 32UL) & 0xff); + radeon_emit(cs, (src_offset >> 32UL) & 0xff); + dst_offset += csize; + src_offset += csize; + size -= csize; + } + return; + } + + /* The following code is for CI and later. */ + unsigned align = ~0u; + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + + /* Align copy size to dw if src/dst address are dw aligned */ + if ((src_offset & 0x3) == 0 && + (dst_offset & 0x3) == 0 && + size > 4 && + (size & 3) != 0) { + align = ~0x3u; + ncopy++; + } + + si_need_dma_space(sctx, ncopy * 7, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size; + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_LINEAR, + 0)); + radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); + radeon_emit(cs, 0); /* src/dst endian swap */ + radeon_emit(cs, src_offset); + radeon_emit(cs, src_offset >> 32); + radeon_emit(cs, dst_offset); + radeon_emit(cs, dst_offset >> 32); + dst_offset += csize; + src_offset += csize; + size -= csize; + } +} + void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, struct si_resource *src) { struct radeon_winsys *ws = ctx->ws; - uint64_t vram = ctx->dma_cs->used_vram; - uint64_t gtt = ctx->dma_cs->used_gart; + uint64_t vram = ctx->sdma_cs->used_vram; + uint64_t gtt = ctx->sdma_cs->used_gart; if (dst) { vram += dst->vram_usage; @@ -164,31 +255,31 @@ */ num_dw++; /* for emit_wait_idle below */ if (!ctx->sdma_uploads_in_progress && - (!ws->cs_check_space(ctx->dma_cs, num_dw, false) || - ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 || - !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) { + (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) || + ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 || + !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) { si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw); + assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw); } /* Wait for idle if either buffer has been used in the IB before to * prevent read-after-write hazards. */ if ((dst && - ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf, + ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) || (src && - ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf, + ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE))) si_dma_emit_wait_idle(ctx); unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED; if (dst) { - ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync, + ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0); } if (src) { - ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync, + ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0); } @@ -199,7 +290,7 @@ void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { - struct radeon_cmdbuf *cs = ctx->dma_cs; + struct radeon_cmdbuf *cs = ctx->sdma_cs; struct radeon_saved_cs saved; bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0; @@ -232,8 +323,8 @@ { struct si_context *ctx = (struct si_context*)sscreen->aux_context; - mtx_lock(&sscreen->aux_context_lock); + simple_mtx_lock(&sscreen->aux_context_lock); si_sdma_clear_buffer(ctx, dst, offset, size, value); sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - mtx_unlock(&sscreen->aux_context_lock); + simple_mtx_unlock(&sscreen->aux_context_lock); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_fence.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_fence.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -180,8 +180,8 @@ { struct radeon_winsys *ws = sctx->ws; - if (sctx->dma_cs) - ws->cs_add_fence_dependency(sctx->dma_cs, fence, 0); + if (sctx->sdma_cs) + ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0); ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0); } @@ -513,7 +513,7 @@ } /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ - if (sctx->dma_cs) + if (sctx->sdma_cs) si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL); if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) { @@ -577,8 +577,8 @@ assert(!fine.buf); finish: if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) { - if (sctx->dma_cs) - ws->cs_sync_flush(sctx->dma_cs); + if (sctx->sdma_cs) + ws->cs_sync_flush(sctx->sdma_cs); ws->cs_sync_flush(sctx->gfx_cs); } } @@ -606,7 +606,11 @@ * Therefore, we must make sure that we flush the pipe to avoid * new work being emitted and getting executed before the signal * operation. + * + * Set sctx->initial_gfx_cs_size to force IB submission even if + * it is empty. */ + sctx->initial_gfx_cs_size = 0; si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_get.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_get.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_get.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_get.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,880 +26,854 @@ #include "radeon/radeon_video.h" #include "radeon/radeon_vce.h" #include "radeon/radeon_uvd_enc.h" -#include "ac_llvm_util.h" #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "util/u_screen.h" #include "util/u_video.h" +#include "mesa/main/macros.h" #include "compiler/nir/nir.h" #include static const char *si_get_vendor(struct pipe_screen *pscreen) { - /* Don't change this. Games such as Alien Isolation are broken if this - * returns "Advanced Micro Devices, Inc." - */ - return "X.Org"; + /* Don't change this. Games such as Alien Isolation are broken if this + * returns "Advanced Micro Devices, Inc." + */ + return "X.Org"; } static const char *si_get_device_vendor(struct pipe_screen *pscreen) { - return "AMD"; + return "AMD"; } static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - switch (param) { - /* Supported features (boolean caps). */ - case PIPE_CAP_ACCELERATED: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_VERTEX_SHADER_SATURATE: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: - case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: - case PIPE_CAP_TGSI_DIV: - return 1; - - case PIPE_CAP_QUERY_SO_OVERFLOW: - return !sscreen->use_ngg_streamout; - - case PIPE_CAP_POST_DEPTH_COVERAGE: - return sscreen->info.chip_class >= GFX10; - - case PIPE_CAP_GRAPHICS: - return sscreen->info.has_graphics; + switch (param) { + /* Supported features (boolean caps). */ + case PIPE_CAP_ACCELERATED: + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_START_INSTANCE: + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_COMPUTE: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_TEXTURE_GATHER_SM5: + case PIPE_CAP_TGSI_TXQS: + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: + case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_CULL_DISTANCE: + case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + case PIPE_CAP_DOUBLES: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: + case PIPE_CAP_BINDLESS_TEXTURE: + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_QUERY_TIME_ELAPSED: + case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: + case PIPE_CAP_MEMOBJ: + case PIPE_CAP_LOAD_CONSTBUF: + case PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: + case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_FBFETCH: + case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: + case PIPE_CAP_TGSI_DIV: + case PIPE_CAP_PACKED_UNIFORMS: + case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: + case PIPE_CAP_GL_SPIRV: + return 1; + + case PIPE_CAP_QUERY_SO_OVERFLOW: + return !sscreen->use_ngg_streamout; + + case PIPE_CAP_POST_DEPTH_COVERAGE: + return sscreen->info.chip_class >= GFX10; + + case PIPE_CAP_GRAPHICS: + return sscreen->info.has_graphics; - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - return !SI_BIG_ENDIAN && sscreen->info.has_userptr; + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + return !SI_BIG_ENDIAN && sscreen->info.has_userptr; - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - return sscreen->info.has_gpu_reset_status_query; + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return sscreen->info.has_gpu_reset_status_query; - case PIPE_CAP_TEXTURE_MULTISAMPLE: - return sscreen->info.has_2d_tiling; + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return sscreen->info.has_2d_tiling; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return SI_MAP_BUFFER_ALIGNMENT; - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - return 4; - - case PIPE_CAP_GLSL_FEATURE_LEVEL: - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - if (sscreen->info.has_indirect_compute_dispatch) - return 450; - return 420; - - case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: - /* Optimal number for good TexSubImage performance on Polaris10. */ - return 64 * 1024 * 1024; - - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX); - - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return HAVE_LLVM < 0x0900 && !sscreen->info.has_unaligned_shader_loads; - - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - return sscreen->info.has_sparse_vm_mappings ? - RADEON_SPARSE_PAGE_SIZE : 0; - - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: - if (sscreen->options.enable_nir) - return 1; - return 0; - - /* Unsupported features. */ - case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_FAKE_SW_MSAA: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_MUL_ZERO_WINS: - case PIPE_CAP_UMA: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_TILE_RASTER_ORDER: - case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES: - case PIPE_CAP_CONTEXT_PRIORITY_MASK: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES: - case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES: - case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE: - case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS: - case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: - return 0; - - case PIPE_CAP_FENCE_SIGNAL: - return sscreen->info.has_syncobj; - - case PIPE_CAP_CONSTBUF0_FLAGS: - return SI_RESOURCE_FLAG_32BIT; - - case PIPE_CAP_NATIVE_FENCE_FD: - return sscreen->info.has_fence_to_handle; - - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - return sscreen->has_draw_indirect_multi; - - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - return 30; - - case PIPE_CAP_MAX_VARYINGS: - return 32; - - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - return sscreen->info.chip_class <= GFX8 ? - PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 32*4; - - /* Geometry shader output. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - /* gfx9 has to report 256 to make piglit/gs-max-output pass. - * gfx8 and earlier can do 1024. - */ - return 256; - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 4095; - case PIPE_CAP_MAX_GS_INVOCATIONS: - /* The closed driver exposes 127, but 125 is the greatest - * number that works. */ - return 125; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - /* Texturing. */ - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 16384; - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 15; /* 16384 */ - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - if (sscreen->info.chip_class >= GFX10) - return 14; - /* textures support 8192, but layered rendering supports 2048 */ - return 12; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - if (sscreen->info.chip_class >= GFX10) - return 8192; - /* textures support 8192, but layered rendering supports 2048 */ - return 2048; - - /* Viewports and render targets. */ - case PIPE_CAP_MAX_VIEWPORTS: - return SI_MAX_VIEWPORTS; - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: - case PIPE_CAP_MAX_RENDER_TARGETS: - return 8; - case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: - return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return -32; - - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - return 31; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_VENDOR_ID: - return ATI_VENDOR_ID; - case PIPE_CAP_DEVICE_ID: - return sscreen->info.pci_id; - case PIPE_CAP_VIDEO_MEMORY: - return sscreen->info.vram_size >> 20; - case PIPE_CAP_PCI_GROUP: - return sscreen->info.pci_domain; - case PIPE_CAP_PCI_BUS: - return sscreen->info.pci_bus; - case PIPE_CAP_PCI_DEVICE: - return sscreen->info.pci_dev; - case PIPE_CAP_PCI_FUNCTION: - return sscreen->info.pci_func; - case PIPE_CAP_TGSI_ATOMINC_WRAP: - return HAVE_LLVM >= 0x1000; - - default: - return u_pipe_screen_get_param_defaults(pscreen, param); - } + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + case PIPE_CAP_MAX_VERTEX_STREAMS: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_WINDOW_RECTANGLES: + return 4; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + if (!sscreen->info.has_indirect_compute_dispatch) + return 420; + return 460; + + case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + /* Optimal number for good TexSubImage performance on Polaris10. */ + return 64 * 1024 * 1024; + + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: + /* Align it down to 256 bytes. I've chosen the number randomly. */ + return ROUND_DOWN_TO(MIN2(sscreen->info.max_alloc_size, INT_MAX), 256); + + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads; + + case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: + return sscreen->info.has_sparse_vm_mappings ? + RADEON_SPARSE_PAGE_SIZE : 0; + + + case PIPE_CAP_UMA: + case PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF: + return 0; + + case PIPE_CAP_FENCE_SIGNAL: + return sscreen->info.has_syncobj; + + case PIPE_CAP_CONSTBUF0_FLAGS: + return SI_RESOURCE_FLAG_32BIT; + + case PIPE_CAP_NATIVE_FENCE_FD: + return sscreen->info.has_fence_to_handle; + + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + return sscreen->has_draw_indirect_multi; + + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; + + case PIPE_CAP_MAX_VARYINGS: + return 32; + + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: + return sscreen->info.chip_class <= GFX8 ? + PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; + + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 32*4; + + /* Geometry shader output. */ + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + /* gfx9 has to report 256 to make piglit/gs-max-output pass. + * gfx8 and earlier can do 1024. + */ + return 256; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 4095; + case PIPE_CAP_MAX_GS_INVOCATIONS: + /* The closed driver exposes 127, but 125 is the greatest + * number that works. */ + return 125; + + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + + /* Texturing. */ + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + return 16384; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 15; /* 16384 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + if (sscreen->info.chip_class >= GFX10) + return 14; + /* textures support 8192, but layered rendering supports 2048 */ + return 12; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + if (sscreen->info.chip_class >= GFX10) + return 8192; + /* textures support 8192, but layered rendering supports 2048 */ + return 2048; + + /* Viewports and render targets. */ + case PIPE_CAP_MAX_VIEWPORTS: + return SI_MAX_VIEWPORTS; + case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: + case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: + case PIPE_CAP_MAX_RENDER_TARGETS: + return 8; + case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: + return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; + + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -32; + + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 31; + + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_LITTLE; + + case PIPE_CAP_VENDOR_ID: + return ATI_VENDOR_ID; + case PIPE_CAP_DEVICE_ID: + return sscreen->info.pci_id; + case PIPE_CAP_VIDEO_MEMORY: + return sscreen->info.vram_size >> 20; + case PIPE_CAP_PCI_GROUP: + return sscreen->info.pci_domain; + case PIPE_CAP_PCI_BUS: + return sscreen->info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return sscreen->info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return sscreen->info.pci_func; + case PIPE_CAP_TGSI_ATOMINC_WRAP: + return LLVM_VERSION_MAJOR >= 10; + + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } } static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param) { - switch (param) { - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - /* This depends on the quant mode, though the precise interactions - * are unknown. */ - return 2048; - case PIPE_CAPF_MAX_POINT_WIDTH: - case PIPE_CAPF_MAX_POINT_WIDTH_AA: - return SI_MAX_POINT_SIZE; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 16.0f; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0f; - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; - } - return 0.0f; + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + /* This depends on the quant mode, though the precise interactions + * are unknown. */ + return 2048; + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return SI_MAX_POINT_SIZE; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; + } + return 0.0f; } static int si_get_shader_param(struct pipe_screen* pscreen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) + enum pipe_shader_type shader, + enum pipe_shader_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_GEOMETRY: - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_COMPUTE: - switch (param) { - case PIPE_SHADER_CAP_SUPPORTED_IRS: { - int ir = 1 << PIPE_SHADER_IR_NATIVE; - - if (sscreen->info.has_indirect_compute_dispatch) - ir |= 1 << PIPE_SHADER_IR_TGSI; - - return ir; - } - - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { - uint64_t max_const_buffer_size; - pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_TGSI, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_const_buffer_size); - return MIN2(max_const_buffer_size, INT_MAX); - } - default: - /* If compute shaders don't require a special value - * for this cap, we can return the same value we - * do for other shader types. */ - break; - } - break; - default: - return 0; - } - - switch (param) { - /* Shader limits. */ - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 16384; - case PIPE_SHADER_CAP_MAX_INPUTS: - return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 256; /* Max native temporaries. */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return SI_NUM_CONST_BUFFERS; - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return SI_NUM_SAMPLERS; - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return SI_NUM_SHADER_BUFFERS; - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - return SI_NUM_IMAGES; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - if (sscreen->options.enable_nir) - return 0; - return 32; - case PIPE_SHADER_CAP_PREFERRED_IR: - if (sscreen->options.enable_nir) - return PIPE_SHADER_IR_NIR; - return PIPE_SHADER_IR_TGSI; - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: - return 4; - - /* Supported boolean features. */ - case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - case PIPE_SHADER_CAP_INTEGERS: - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_FP16: - case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: - case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: - return 1; - - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - /* TODO: Indirect indexing of GS inputs is unimplemented. */ - if (shader == PIPE_SHADER_GEOMETRY) - return 0; - - if (shader == PIPE_SHADER_VERTEX && - !sscreen->llvm_has_working_vgpr_indexing) - return 0; - - /* TCS and TES load inputs directly from LDS or offchip - * memory, so indirect indexing is always supported. - * PS has to support indirect indexing, because we can't - * lower that to TEMPs for INTERP instructions. - */ - return 1; - - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return sscreen->llvm_has_working_vgpr_indexing || - /* TCS stores outputs directly to memory. */ - shader == PIPE_SHADER_TESS_CTRL; - - /* Unsupported boolean features. */ - case PIPE_SHADER_CAP_SUBROUTINES: - case PIPE_SHADER_CAP_SUPPORTED_IRS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; - } - return 0; + switch(shader) + { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_GEOMETRY: + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + break; + case PIPE_SHADER_COMPUTE: + switch (param) { + case PIPE_SHADER_CAP_SUPPORTED_IRS: { + int ir = 1 << PIPE_SHADER_IR_NATIVE; + + if (sscreen->info.has_indirect_compute_dispatch) + ir |= 1 << PIPE_SHADER_IR_NIR; + + return ir; + } + default: + /* If compute shaders don't require a special value + * for this cap, we can return the same value we + * do for other shader types. */ + break; + } + break; + default: + return 0; + } + + switch (param) { + /* Shader limits. */ + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 16384; + case PIPE_SHADER_CAP_MAX_INPUTS: + return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* Max native temporaries. */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + return si_get_param(pscreen, PIPE_CAP_MAX_SHADER_BUFFER_SIZE); + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return SI_NUM_CONST_BUFFERS; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return SI_NUM_SAMPLERS; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return SI_NUM_SHADER_BUFFERS; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return SI_NUM_IMAGES; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 0; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + return 4; + + /* Supported boolean features. */ + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_INTEGERS: + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_FP16: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + return 1; + + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + /* TODO: Indirect indexing of GS inputs is unimplemented. */ + if (shader == PIPE_SHADER_GEOMETRY) + return 0; + + if (shader == PIPE_SHADER_VERTEX && + !sscreen->llvm_has_working_vgpr_indexing) + return 0; + + /* TCS and TES load inputs directly from LDS or offchip + * memory, so indirect indexing is always supported. + * PS has to support indirect indexing, because we can't + * lower that to TEMPs for INTERP instructions. + */ + return 1; + + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + return sscreen->llvm_has_working_vgpr_indexing || + /* TCS stores outputs directly to memory. */ + shader == PIPE_SHADER_TESS_CTRL; + + /* Unsupported boolean features. */ + case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_SUPPORTED_IRS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; + } + return 0; } static const struct nir_shader_compiler_options nir_options = { - .lower_scmp = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_fsat = true, - .lower_fdiv = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, - .lower_sub = true, - .lower_ffma = true, - .lower_fmod = true, - .lower_pack_snorm_4x8 = true, - .lower_pack_unorm_4x8 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_rotate = true, - .optimize_sample_mask_in = true, - .max_unroll_iterations = 32, - .use_interpolated_input_intrinsics = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fsat = true, + .lower_fdiv = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_sub = true, + .fuse_ffma = true, + .lower_fmod = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + .lower_to_scalar = true, + .optimize_sample_mask_in = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, }; static const void * si_get_compiler_options(struct pipe_screen *screen, - enum pipe_shader_ir ir, - enum pipe_shader_type shader) + enum pipe_shader_ir ir, + enum pipe_shader_type shader) { - assert(ir == PIPE_SHADER_IR_NIR); - return &nir_options; + assert(ir == PIPE_SHADER_IR_NIR); + return &nir_options; } static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) { - ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); + ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); } static void si_get_device_uuid(struct pipe_screen *pscreen, char *uuid) { - struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); + ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); } static const char* si_get_name(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen*)pscreen; - return sscreen->renderer_string; + return sscreen->renderer_string; } static int si_get_video_param_no_decode(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) -{ - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return vl_profile_supported(screen, profile, entrypoint); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return vl_video_buffer_max_size(screen); - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - return vl_level_supported(screen, profile); - default: - return 0; - } + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) +{ + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return vl_profile_supported(screen, profile, entrypoint); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return vl_video_buffer_max_size(screen); + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + return vl_level_supported(screen, profile); + default: + return 0; + } } static int si_get_video_param(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) -{ - struct si_screen *sscreen = (struct si_screen *)screen; - enum pipe_video_format codec = u_reduce_video_profile(profile); - - if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && - (si_vce_is_fw_version_supported(sscreen) || - sscreen->info.family >= CHIP_RAVEN)) || - (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && - (sscreen->info.family >= CHIP_RAVEN || - si_radeon_uvd_enc_supported(sscreen))); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_STACKED_FRAMES: - return (sscreen->info.family < CHIP_TONGA) ? 1 : 2; - default: - return 0; - } - } - - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - switch (codec) { - case PIPE_VIDEO_FORMAT_MPEG12: - return profile != PIPE_VIDEO_PROFILE_MPEG1; - case PIPE_VIDEO_FORMAT_MPEG4: - return 1; - case PIPE_VIDEO_FORMAT_MPEG4_AVC: - if ((sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11) && - sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) { - RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VC1: - return true; - case PIPE_VIDEO_FORMAT_HEVC: - /* Carrizo only supports HEVC Main */ - if (sscreen->info.family >= CHIP_STONEY) - return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); - else if (sscreen->info.family >= CHIP_CARRIZO) - return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; - return false; - case PIPE_VIDEO_FORMAT_JPEG: - if (sscreen->info.family == CHIP_RAVEN || - sscreen->info.family == CHIP_RAVEN2 || - sscreen->info.family == CHIP_NAVI10) - return true; - if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) - return false; - if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { - RVID_ERR("No MJPEG support for the kernel version\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VP9: - if (sscreen->info.family < CHIP_RAVEN) - return false; - return true; - default: - return false; - } - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 || - profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return PIPE_FORMAT_P016; - else - return PIPE_FORMAT_NV12; - - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { - enum pipe_video_format format = u_reduce_video_profile(profile); - - if (format == PIPE_VIDEO_FORMAT_HEVC) - return false; //The firmware doesn't support interlaced HEVC. - else if (format == PIPE_VIDEO_FORMAT_JPEG) - return false; - else if (format == PIPE_VIDEO_FORMAT_VP9) - return false; - return true; - } - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - switch (profile) { - case PIPE_VIDEO_PROFILE_MPEG1: - return 0; - case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: - case PIPE_VIDEO_PROFILE_MPEG2_MAIN: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: - return 5; - case PIPE_VIDEO_PROFILE_VC1_SIMPLE: - return 1; - case PIPE_VIDEO_PROFILE_VC1_MAIN: - return 2; - case PIPE_VIDEO_PROFILE_VC1_ADVANCED: - return 4; - case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: - return (sscreen->info.family < CHIP_TONGA) ? 41 : 52; - case PIPE_VIDEO_PROFILE_HEVC_MAIN: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: - return 186; - default: - return 0; - } - default: - return 0; - } + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + enum pipe_video_format codec = u_reduce_video_profile(profile); + + if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && + (si_vce_is_fw_version_supported(sscreen) || + sscreen->info.family >= CHIP_RAVEN)) || + (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && + (sscreen->info.family >= CHIP_RAVEN || + si_radeon_uvd_enc_supported(sscreen))); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304; + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return (sscreen->info.family < CHIP_TONGA) ? 1 : 2; + default: + return 0; + } + } + + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + switch (codec) { + case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; + case PIPE_VIDEO_FORMAT_MPEG4: + return 1; + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + if ((sscreen->info.family == CHIP_POLARIS10 || + sscreen->info.family == CHIP_POLARIS11) && + sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) { + RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VC1: + return true; + case PIPE_VIDEO_FORMAT_HEVC: + /* Carrizo only supports HEVC Main */ + if (sscreen->info.family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (sscreen->info.family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + return false; + case PIPE_VIDEO_FORMAT_JPEG: + if (sscreen->info.family >= CHIP_RAVEN) + return true; + if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) + return false; + if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { + RVID_ERR("No MJPEG support for the kernel version\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VP9: + if (sscreen->info.family < CHIP_RAVEN) + return false; + return true; + default: + return false; + } + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) ? + ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) : + 8192; + default: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + } + case PIPE_VIDEO_CAP_MAX_HEIGHT: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) ? + ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) : + 4352; + default: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; + } + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return PIPE_FORMAT_P010; + else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return PIPE_FORMAT_P016; + else + return PIPE_FORMAT_NV12; + + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { + enum pipe_video_format format = u_reduce_video_profile(profile); + + if (format == PIPE_VIDEO_FORMAT_HEVC) + return false; //The firmware doesn't support interlaced HEVC. + else if (format == PIPE_VIDEO_FORMAT_JPEG) + return false; + else if (format == PIPE_VIDEO_FORMAT_VP9) + return false; + return true; + } + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + switch (profile) { + case PIPE_VIDEO_PROFILE_MPEG1: + return 0; + case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: + case PIPE_VIDEO_PROFILE_MPEG2_MAIN: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: + return 5; + case PIPE_VIDEO_PROFILE_VC1_SIMPLE: + return 1; + case PIPE_VIDEO_PROFILE_VC1_MAIN: + return 2; + case PIPE_VIDEO_PROFILE_VC1_ADVANCED: + return 4; + case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: + return (sscreen->info.family < CHIP_TONGA) ? 41 : 52; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return 186; + default: + return 0; + } + default: + return 0; + } } static bool si_vid_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint) + enum pipe_format format, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint) { - /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */ - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return (format == PIPE_FORMAT_NV12) || - (format == PIPE_FORMAT_P016); + /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */ + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return (format == PIPE_FORMAT_NV12) || + (format == PIPE_FORMAT_P016); - /* Vp9 profile 2 supports 10 bit decoding using P016 */ - if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return format == PIPE_FORMAT_P016; + /* Vp9 profile 2 supports 10 bit decoding using P016 */ + if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return format == PIPE_FORMAT_P016; - /* we can only handle this one with UVD */ - if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) - return format == PIPE_FORMAT_NV12; + /* we can only handle this one with UVD */ + if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) + return format == PIPE_FORMAT_NV12; - return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); + return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); } static unsigned get_max_threads_per_block(struct si_screen *screen, - enum pipe_shader_ir ir_type) + enum pipe_shader_ir ir_type) { - if (ir_type == PIPE_SHADER_IR_NATIVE) - return 256; + if (ir_type == PIPE_SHADER_IR_NATIVE) + return 256; - /* Only 16 waves per thread-group on gfx9. */ - if (screen->info.chip_class >= GFX9) - return 1024; - - /* Up to 40 waves per thread-group on GCN < gfx9. Expose a nice - * round number. - */ - return 2048; + /* LLVM 10 only supports 1024 threads per block. */ + return 1024; } static int si_get_compute_param(struct pipe_screen *screen, - enum pipe_shader_ir ir_type, - enum pipe_compute_cap param, - void *ret) -{ - struct si_screen *sscreen = (struct si_screen *)screen; - - //TODO: select these params by asic - switch (param) { - case PIPE_COMPUTE_CAP_IR_TARGET: { - const char *gpu, *triple; - - triple = "amdgcn-mesa-mesa3d"; - gpu = ac_get_llvm_processor_name(sscreen->info.family); - if (ret) { - sprintf(ret, "%s-%s", gpu, triple); - } - /* +2 for dash and terminating NIL byte */ - return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); - } - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - if (ret) { - uint64_t *grid_dimension = ret; - grid_dimension[0] = 3; - } - return 1 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - if (ret) { - uint64_t *grid_size = ret; - grid_size[0] = 65535; - grid_size[1] = 65535; - grid_size[2] = 65535; - } - return 3 * sizeof(uint64_t) ; - - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - if (ret) { - uint64_t *block_size = ret; - unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); - block_size[0] = threads_per_block; - block_size[1] = threads_per_block; - block_size[2] = threads_per_block; - } - return 3 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_threads_per_block = ret; - *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); - } - return sizeof(uint64_t); - case PIPE_COMPUTE_CAP_ADDRESS_BITS: - if (ret) { - uint32_t *address_bits = ret; - address_bits[0] = 64; - } - return 1 * sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - if (ret) { - uint64_t *max_global_size = ret; - uint64_t max_mem_alloc_size; - - si_get_compute_param(screen, ir_type, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_mem_alloc_size); - - /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least - * 1/4 of the MAX_GLOBAL_SIZE. Since the - * MAX_MEM_ALLOC_SIZE is fixed for older kernels, - * make sure we never report more than - * 4 * MAX_MEM_ALLOC_SIZE. - */ - *max_global_size = MIN2(4 * max_mem_alloc_size, - MAX2(sscreen->info.gart_size, - sscreen->info.vram_size)); - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - if (ret) { - uint64_t *max_local_size = ret; - /* Value reported by the closed source driver. */ - *max_local_size = 32768; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - if (ret) { - uint64_t *max_input_size = ret; - /* Value reported by the closed source driver. */ - *max_input_size = 1024; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - if (ret) { - uint64_t *max_mem_alloc_size = ret; - - *max_mem_alloc_size = sscreen->info.max_alloc_size; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - if (ret) { - uint32_t *max_clock_frequency = ret; - *max_clock_frequency = sscreen->info.max_shader_clock; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - if (ret) { - uint32_t *max_compute_units = ret; - *max_compute_units = sscreen->info.num_good_compute_units; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - if (ret) { - uint32_t *images_supported = ret; - *images_supported = 0; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - break; /* unused */ - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - if (ret) { - uint32_t *subgroup_size = ret; - *subgroup_size = sscreen->compute_wave_size; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_variable_threads_per_block = ret; - if (ir_type == PIPE_SHADER_IR_NATIVE) - *max_variable_threads_per_block = 0; - else - *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return sizeof(uint64_t); - } + enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, + void *ret) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + //TODO: select these params by asic + switch (param) { + case PIPE_COMPUTE_CAP_IR_TARGET: { + const char *gpu, *triple; + + triple = "amdgcn-mesa-mesa3d"; + gpu = ac_get_llvm_processor_name(sscreen->info.family); + if (ret) { + sprintf(ret, "%s-%s", gpu, triple); + } + /* +2 for dash and terminating NIL byte */ + return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); + } + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + if (ret) { + uint64_t *grid_dimension = ret; + grid_dimension[0] = 3; + } + return 1 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + if (ret) { + uint64_t *grid_size = ret; + grid_size[0] = 65535; + grid_size[1] = 65535; + grid_size[2] = 65535; + } + return 3 * sizeof(uint64_t) ; + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + if (ret) { + uint64_t *block_size = ret; + unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); + block_size[0] = threads_per_block; + block_size[1] = threads_per_block; + block_size[2] = threads_per_block; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_threads_per_block = ret; + *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + address_bits[0] = 64; + } + return 1 * sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + if (ret) { + uint64_t *max_global_size = ret; + uint64_t max_mem_alloc_size; + + si_get_compute_param(screen, ir_type, + PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, + &max_mem_alloc_size); + + /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least + * 1/4 of the MAX_GLOBAL_SIZE. Since the + * MAX_MEM_ALLOC_SIZE is fixed for older kernels, + * make sure we never report more than + * 4 * MAX_MEM_ALLOC_SIZE. + */ + *max_global_size = MIN2(4 * max_mem_alloc_size, + MAX2(sscreen->info.gart_size, + sscreen->info.vram_size)); + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + if (ret) { + uint64_t *max_local_size = ret; + /* Value reported by the closed source driver. */ + *max_local_size = 32768; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + if (ret) { + uint64_t *max_input_size = ret; + /* Value reported by the closed source driver. */ + *max_input_size = 1024; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + if (ret) { + uint64_t *max_mem_alloc_size = ret; + + *max_mem_alloc_size = sscreen->info.max_alloc_size; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + if (ret) { + uint32_t *max_clock_frequency = ret; + *max_clock_frequency = sscreen->info.max_shader_clock; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + if (ret) { + uint32_t *max_compute_units = ret; + *max_compute_units = sscreen->info.num_good_compute_units; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + if (ret) { + uint32_t *images_supported = ret; + *images_supported = 0; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = sscreen->compute_wave_size; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (ir_type == PIPE_SHADER_IR_NATIVE) + *max_variable_threads_per_block = 0; + else + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return sizeof(uint64_t); + } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); return 0; @@ -907,109 +881,109 @@ static uint64_t si_get_timestamp(struct pipe_screen *screen) { - struct si_screen *sscreen = (struct si_screen*)screen; + struct si_screen *sscreen = (struct si_screen*)screen; - return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / - sscreen->info.clock_crystal_freq; + return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / + sscreen->info.clock_crystal_freq; } static void si_query_memory_info(struct pipe_screen *screen, - struct pipe_memory_info *info) + struct pipe_memory_info *info) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - unsigned vram_usage, gtt_usage; - - info->total_device_memory = sscreen->info.vram_size / 1024; - info->total_staging_memory = sscreen->info.gart_size / 1024; - - /* The real TTM memory usage is somewhat random, because: - * - * 1) TTM delays freeing memory, because it can only free it after - * fences expire. - * - * 2) The memory usage can be really low if big VRAM evictions are - * taking place, but the real usage is well above the size of VRAM. - * - * Instead, return statistics of this process. - */ - vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; - gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; - - info->avail_device_memory = - vram_usage <= info->total_device_memory ? - info->total_device_memory - vram_usage : 0; - info->avail_staging_memory = - gtt_usage <= info->total_staging_memory ? - info->total_staging_memory - gtt_usage : 0; - - info->device_memory_evicted = - ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; - - if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) - info->nr_device_memory_evictions = - ws->query_value(ws, RADEON_NUM_EVICTIONS); - else - /* Just return the number of evicted 64KB pages. */ - info->nr_device_memory_evictions = info->device_memory_evicted / 64; + struct si_screen *sscreen = (struct si_screen*)screen; + struct radeon_winsys *ws = sscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = sscreen->info.vram_size / 1024; + info->total_staging_memory = sscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; + gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? + info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? + info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = + ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = + ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. */ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; } static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen*)pscreen; - return sscreen->disk_shader_cache; + return sscreen->disk_shader_cache; } static void si_init_renderer_string(struct si_screen *sscreen) { - char first_name[256], second_name[32] = {}, kernel_version[128] = {}; - struct utsname uname_data; + char first_name[256], second_name[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; - if (sscreen->info.marketing_name) { - snprintf(first_name, sizeof(first_name), "%s", - sscreen->info.marketing_name); - snprintf(second_name, sizeof(second_name), "%s, ", - sscreen->info.name); - } else { - snprintf(first_name, sizeof(first_name), "AMD %s", - sscreen->info.name); - } - - if (uname(&uname_data) == 0) - snprintf(kernel_version, sizeof(kernel_version), - ", %s", uname_data.release); - - snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), - "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", - first_name, second_name, sscreen->info.drm_major, - sscreen->info.drm_minor, sscreen->info.drm_patchlevel, - kernel_version); + if (sscreen->info.marketing_name) { + snprintf(first_name, sizeof(first_name), "%s", + sscreen->info.marketing_name); + snprintf(second_name, sizeof(second_name), "%s, ", + sscreen->info.name); + } else { + snprintf(first_name, sizeof(first_name), "AMD %s", + sscreen->info.name); + } + + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), + ", %s", uname_data.release); + + snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), + "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", + first_name, second_name, sscreen->info.drm_major, + sscreen->info.drm_minor, sscreen->info.drm_patchlevel, + kernel_version); } void si_init_screen_get_functions(struct si_screen *sscreen) { - sscreen->b.get_name = si_get_name; - sscreen->b.get_vendor = si_get_vendor; - sscreen->b.get_device_vendor = si_get_device_vendor; - sscreen->b.get_param = si_get_param; - sscreen->b.get_paramf = si_get_paramf; - sscreen->b.get_compute_param = si_get_compute_param; - sscreen->b.get_timestamp = si_get_timestamp; - sscreen->b.get_shader_param = si_get_shader_param; - sscreen->b.get_compiler_options = si_get_compiler_options; - sscreen->b.get_device_uuid = si_get_device_uuid; - sscreen->b.get_driver_uuid = si_get_driver_uuid; - sscreen->b.query_memory_info = si_query_memory_info; - sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; - - if (sscreen->info.has_hw_decode) { - sscreen->b.get_video_param = si_get_video_param; - sscreen->b.is_video_format_supported = si_vid_is_format_supported; - } else { - sscreen->b.get_video_param = si_get_video_param_no_decode; - sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; - } + sscreen->b.get_name = si_get_name; + sscreen->b.get_vendor = si_get_vendor; + sscreen->b.get_device_vendor = si_get_device_vendor; + sscreen->b.get_param = si_get_param; + sscreen->b.get_paramf = si_get_paramf; + sscreen->b.get_compute_param = si_get_compute_param; + sscreen->b.get_timestamp = si_get_timestamp; + sscreen->b.get_shader_param = si_get_shader_param; + sscreen->b.get_compiler_options = si_get_compiler_options; + sscreen->b.get_device_uuid = si_get_device_uuid; + sscreen->b.get_driver_uuid = si_get_driver_uuid; + sscreen->b.query_memory_info = si_query_memory_info; + sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; + + if (sscreen->info.has_hw_decode) { + sscreen->b.get_video_param = si_get_video_param; + sscreen->b.is_video_format_supported = si_vid_is_format_supported; + } else { + sscreen->b.get_video_param = si_get_video_param_no_decode; + sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; + } - si_init_renderer_string(sscreen); + si_init_renderer_string(sscreen); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_gfx_cs.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_gfx_cs.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_gfx_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_gfx_cs.c 2020-06-12 01:21:17.000000000 +0000 @@ -97,7 +97,7 @@ (!wait_flags || !ctx->gfx_last_ib_is_busy)) return; - if (si_check_device_reset(ctx)) + if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET) return; if (ctx->screen->debug_flags & DBG(CHECK_VM)) @@ -110,7 +110,7 @@ * If the driver flushes the GFX IB internally, and it should never ask * for a fence handle. */ - assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL); + assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL); /* Update the sdma_uploads list by flushing the uploader. */ u_upload_unmap(ctx->b.const_uploader); @@ -119,20 +119,18 @@ ctx->sdma_uploads_in_progress = true; for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { struct si_sdma_upload *up = &ctx->sdma_uploads[i]; - struct pipe_box box; assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0); - u_box_1d(up->src_offset, up->size, &box); - ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0, - &up->src->b.b, 0, &box); + si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, + up->dst_offset, up->src_offset, up->size); } ctx->sdma_uploads_in_progress = false; si_unref_sdma_uploads(ctx); /* Flush SDMA (preamble IB). */ - if (radeon_emitted(ctx->dma_cs, 0)) + if (radeon_emitted(ctx->sdma_cs, 0)) si_flush_dma_cs(ctx, flags, NULL); if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { @@ -154,7 +152,7 @@ } if (ctx->has_graphics) { - if (!LIST_IS_EMPTY(&ctx->active_queries)) + if (!list_is_empty(&ctx->active_queries)) si_suspend_queries(ctx); ctx->streamout.suspended = false; @@ -372,7 +370,7 @@ ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ - bool has_clear_state = ctx->screen->has_clear_state; + bool has_clear_state = ctx->screen->info.has_clear_state; if (has_clear_state) { ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); @@ -426,7 +424,7 @@ si_streamout_buffers_dirty(ctx); } - if (!LIST_IS_EMPTY(&ctx->active_queries)) + if (!list_is_empty(&ctx->active_queries)) si_resume_queries(ctx); assert(!ctx->gfx_cs->prev_dw); @@ -440,9 +438,6 @@ ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; ctx->last_prim = -1; ctx->last_multi_vgt_param = -1; - ctx->last_rast_prim = -1; - ctx->last_flatshade_first = -1; - ctx->last_sc_line_stipple = ~0; ctx->last_vs_state = ~0; ctx->last_ls = NULL; ctx->last_tcs = NULL; @@ -450,6 +445,7 @@ ctx->last_num_tcs_input_cp = -1; ctx->last_ls_hs_config = -1; /* impossible value */ ctx->last_binning_enabled = -1; + ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; ctx->prim_discard_compute_ib_initialized = false; @@ -494,6 +490,7 @@ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; @@ -527,11 +524,13 @@ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ - /* Set all saved registers state to saved. */ - ctx->tracked_regs.reg_saved = 0xffffffffffffffff; + /* Set all cleared context registers to saved. */ + ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } else { - /* Set all saved registers state to unknown. */ + /* Set all register values to unknown. */ ctx->tracked_regs.reg_saved = 0; + ctx->last_gs_out_prim = -1; /* unknown */ } /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */ diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_gpu_load.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_gpu_load.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_gpu_load.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_gpu_load.c 2020-06-12 01:21:17.000000000 +0000 @@ -175,12 +175,12 @@ { /* Start the thread if needed. */ if (!sscreen->gpu_load_thread) { - mtx_lock(&sscreen->gpu_load_mutex); + simple_mtx_lock(&sscreen->gpu_load_mutex); /* Check again inside the mutex. */ if (!sscreen->gpu_load_thread) sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen); - mtx_unlock(&sscreen->gpu_load_mutex); + simple_mtx_unlock(&sscreen->gpu_load_mutex); } unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]); diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_perfcounter.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_perfcounter.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_perfcounter.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_perfcounter.c 2020-06-12 01:21:17.000000000 +0000 @@ -841,7 +841,7 @@ si_query_buffer_reset(ctx, &query->buffer); - LIST_ADDTAIL(&query->b.active_list, &ctx->active_queries); + list_addtail(&query->b.active_list, &ctx->active_queries); ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; si_pc_query_resume(ctx, squery); @@ -855,7 +855,7 @@ si_pc_query_suspend(ctx, squery); - LIST_DEL(&squery->active_list); + list_del(&squery->active_list); ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend; return query->buffer.buf != NULL; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_pipe.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_pipe.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_pipe.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_pipe.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,6 @@ #include "si_compute.h" #include "sid.h" -#include "ac_llvm_util.h" #include "radeon/radeon_uvd.h" #include "util/disk_cache.h" #include "util/u_log.h" @@ -45,6 +44,9 @@ #include "gallium/winsys/amdgpu/drm/amdgpu_public.h" #include +static struct pipe_context *si_create_context(struct pipe_screen *screen, + unsigned flags); + static const struct debug_named_value debug_options[] = { /* Shader logging options: */ { "vs", DBG(VS), "Print vertex shaders" }, @@ -54,7 +56,7 @@ { "tes", DBG(TES), "Print tessellation evaluation shaders" }, { "cs", DBG(CS), "Print compute shaders" }, { "noir", DBG(NO_IR), "Don't print the LLVM IR"}, - { "notgsi", DBG(NO_TGSI), "Don't print the TGSI"}, + { "nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"}, { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" }, @@ -78,10 +80,13 @@ { "tex", DBG(TEX), "Print texture info" }, { "compute", DBG(COMPUTE), "Print compute info" }, { "vm", DBG(VM), "Print virtual addresses when creating resources" }, + { "cache_stats", DBG(CACHE_STATS), "Print shader cache statistics." }, /* Driver options: */ - { "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." }, - { "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" }, + { "forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible." }, + { "nodma", DBG(NO_SDMA), "Disable SDMA" }, + { "nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears" }, + { "nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies" }, { "nowc", DBG(NO_WC), "Disable GTT write combining" }, { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." }, { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." }, @@ -90,6 +95,8 @@ /* 3D engine options: */ { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." }, { "nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline." }, + { "nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt." }, + { "nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling." }, { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." }, { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." }, { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." }, @@ -109,6 +116,10 @@ { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" }, { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" }, + DEBUG_NAMED_VALUE_END /* must be last */ +}; + +static const struct debug_named_value test_options[] = { /* Tests: */ { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." }, { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, @@ -122,8 +133,7 @@ DEBUG_NAMED_VALUE_END /* must be last */ }; -static void si_init_compiler(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler) +void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler) { /* Only create the less-optimizing version of the compiler on APUs * predating Ryzen (Raven). */ @@ -162,9 +172,6 @@ struct si_context *sctx = (struct si_context *)context; int i; - util_queue_finish(&sctx->screen->shader_compiler_queue); - util_queue_finish(&sctx->screen->shader_compiler_queue_low_priority); - /* Unreference the framebuffer normally to disable related logic * properly. */ @@ -187,6 +194,7 @@ si_resource_reference(&sctx->scratch_buffer, NULL); si_resource_reference(&sctx->compute_scratch_buffer, NULL); si_resource_reference(&sctx->wait_mem_scratch, NULL); + si_resource_reference(&sctx->small_prim_cull_info_buf, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); if (sctx->init_config_gs_rings) @@ -228,9 +236,20 @@ sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target); if (sctx->cs_clear_render_target_1d_array) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); + if (sctx->cs_clear_12bytes_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); if (sctx->cs_dcc_retile) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) { + for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) { + if (sctx->cs_fmask_expand[i][j]) { + sctx->b.delete_compute_state(&sctx->b, + sctx->cs_fmask_expand[i][j]); + } + } + } + if (sctx->blitter) util_blitter_destroy(sctx->blitter); @@ -253,8 +272,8 @@ if (sctx->gfx_cs) sctx->ws->cs_destroy(sctx->gfx_cs); - if (sctx->dma_cs) - sctx->ws->cs_destroy(sctx->dma_cs); + if (sctx->sdma_cs) + sctx->ws->cs_destroy(sctx->sdma_cs); if (sctx->ctx) sctx->ws->ctx_destroy(sctx->ctx); @@ -294,14 +313,39 @@ util_dynarray_fini(&sctx->resident_img_needs_color_decompress); util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); si_unref_sdma_uploads(sctx); + free(sctx->sdma_uploads); FREE(sctx); } static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx); - return sctx->ws->ctx_query_reset_status(sctx->ctx); + if (status != PIPE_NO_RESET) { + /* Call the state tracker to set a no-op API dispatch. */ + if (sctx->device_reset_callback.reset) { + sctx->device_reset_callback.reset(sctx->device_reset_callback.data, + status); + } + + /* Re-create the auxiliary context, because it won't submit + * any new IBs due to a GPU reset. + */ + simple_mtx_lock(&sscreen->aux_context_lock); + + struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; + sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); + sscreen->aux_context->destroy(sscreen->aux_context); + + sscreen->aux_context = si_create_context(&sscreen->b, + (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY)); + sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); + simple_mtx_unlock(&sscreen->aux_context_lock); + } + return status; } static void si_set_device_reset_callback(struct pipe_context *ctx, @@ -316,21 +360,6 @@ sizeof(sctx->device_reset_callback)); } -bool si_check_device_reset(struct si_context *sctx) -{ - enum pipe_reset_status status; - - if (!sctx->device_reset_callback.reset) - return false; - - status = sctx->ws->ctx_query_reset_status(sctx->ctx); - if (status == PIPE_NO_RESET) - return false; - - sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); - return true; -} - /* Apitrace profiling: * 1) qapitrace : Tools -> Profile: Measure CPU & GPU times * 2) In the middle panel, zoom in (mouse wheel) on some bad draw call @@ -393,6 +422,7 @@ unsigned flags) { struct si_screen* sscreen = (struct si_screen *)screen; + STATIC_ASSERT(DBG_COUNT <= 64); /* Don't create a context if it's not compute-only and hw is compute-only. */ if (!sscreen->info.has_graphics && @@ -460,19 +490,24 @@ if (!sctx->ctx) goto fail; - if (sscreen->info.num_sdma_rings && - !(sscreen->debug_flags & DBG(NO_ASYNC_DMA)) && + if (sscreen->info.num_rings[RING_DMA] && + !(sscreen->debug_flags & DBG(NO_SDMA)) && + /* SDMA causes corruption on RX 580: + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1399 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1889 + */ + (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) && /* SDMA timeouts sometimes on gfx10 so disable it for now. See: * https://bugs.freedesktop.org/show_bug.cgi?id=111481 - * https://gitlab.freedesktop.org/mesa/mesa/issues/1907 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1907 */ - (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_DMA))) { - sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, + (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) { + sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void*)si_flush_dma_cs, sctx, stop_exec_on_failure); } - bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs; + bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs; sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT, SI_RESOURCE_FLAG_32BIT | @@ -561,16 +596,21 @@ sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; si_init_draw_functions(sctx); - si_initialize_prim_discard_tunables(sctx); + + /* If aux_context == NULL, we are initializing aux_context right now. */ + bool is_aux_context = !sscreen->aux_context; + si_initialize_prim_discard_tunables(sscreen, is_aux_context, + &sctx->prim_discard_vertex_count_threshold, + &sctx->index_ring_size_per_ib); } /* Initialize SDMA functions. */ if (sctx->chip_class >= GFX7) cik_init_sdma_functions(sctx); else - si_init_dma_functions(sctx); + sctx->dma_copy = si_resource_copy_region; - if (sscreen->debug_flags & DBG(FORCE_DMA)) + if (sscreen->debug_flags & DBG(FORCE_SDMA)) sctx->b.resource_copy_region = sctx->dma_copy; sctx->sample_mask = 0xffff; @@ -628,7 +668,7 @@ } uint64_t max_threads_per_block; - screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI, + screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, &max_threads_per_block); @@ -647,8 +687,6 @@ sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); - si_init_compiler(sscreen, &sctx->compiler); - /* Bindless handles. */ sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -736,7 +774,19 @@ if (!sscreen->ws->unref(sscreen->ws)) return; - mtx_destroy(&sscreen->aux_context_lock); + if (sscreen->debug_flags & DBG(CACHE_STATS)) { + printf("live shader cache: hits = %u, misses = %u\n", + sscreen->live_shader_cache.hits, + sscreen->live_shader_cache.misses); + printf("memory shader cache: hits = %u, misses = %u\n", + sscreen->num_memory_shader_cache_hits, + sscreen->num_memory_shader_cache_misses); + printf("disk shader cache: hits = %u, misses = %u\n", + sscreen->num_disk_shader_cache_hits, + sscreen->num_disk_shader_cache_misses); + } + + simple_mtx_destroy(&sscreen->aux_context_lock); struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; if (aux_log) { @@ -750,6 +800,9 @@ util_queue_destroy(&sscreen->shader_compiler_queue); util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); + /* Release the reference on glsl types of the compiler threads. */ + glsl_type_singleton_decref(); + for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) si_destroy_compiler(&sscreen->compiler[i]); @@ -766,17 +819,18 @@ FREE(part); } } - mtx_destroy(&sscreen->shader_parts_mutex); + simple_mtx_destroy(&sscreen->shader_parts_mutex); si_destroy_shader_cache(sscreen); si_destroy_perfcounters(sscreen); si_gpu_load_kill_thread(sscreen); - mtx_destroy(&sscreen->gpu_load_mutex); + simple_mtx_destroy(&sscreen->gpu_load_mutex); slab_destroy_parent(&sscreen->pool_transfers); disk_cache_destroy(sscreen->disk_shader_cache); + util_live_shader_cache_deinit(&sscreen->live_shader_cache); sscreen->ws->destroy(sscreen->ws); FREE(sscreen); } @@ -787,7 +841,7 @@ sscreen->info.family); } -static void si_test_vmfault(struct si_screen *sscreen) +static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) { struct pipe_context *ctx = sscreen->aux_context; struct si_context *sctx = (struct si_context *)ctx; @@ -801,18 +855,18 @@ si_resource(buf)->gpu_address = 0; /* cause a VM fault */ - if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) { + if (test_flags & DBG(TEST_VMFAULT_CP)) { si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, SI_COHERENCY_NONE, L2_BYPASS); ctx->flush(ctx, NULL, 0); puts("VM fault test: CP - done."); } - if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) { + if (test_flags & DBG(TEST_VMFAULT_SDMA)) { si_sdma_clear_buffer(sctx, buf, 0, 4, 0); ctx->flush(ctx, NULL, 0); puts("VM fault test: SDMA - done."); } - if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) { + if (test_flags & DBG(TEST_VMFAULT_SHADER)) { util_test_constant_buffer(ctx, buf); puts("VM fault test: Shader - done."); } @@ -877,10 +931,6 @@ /* These flags affect shader compilation. */ #define ALL_FLAGS (DBG(SI_SCHED) | DBG(GISEL)) uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; - /* Reserve left-most bit for tgsi/nir selector */ - assert(!(shader_debug_flags & (1u << 31))); - shader_debug_flags |= (uint32_t) - ((sscreen->options.enable_nir & 0x1) << 31); /* Add the high bits of 32-bit addresses, which affects * how 32-bit addresses are expanded to 64 bits. @@ -921,7 +971,8 @@ const struct pipe_screen_config *config) { struct si_screen *sscreen = CALLOC_STRUCT(si_screen); - unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i; + unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; + uint64_t test_flags; if (!sscreen) { return NULL; @@ -930,7 +981,7 @@ sscreen->ws = ws; ws->query_info(ws, &sscreen->info); - if (sscreen->info.chip_class == GFX10 && HAVE_LLVM < 0x0900) { + if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); FREE(sscreen); return NULL; @@ -949,6 +1000,8 @@ debug_options, 0); sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", debug_options, 0); + test_flags = debug_get_flags_option("AMD_TEST", + test_options, 0); if (sscreen->debug_flags & DBG(NO_GFX)) sscreen->info.has_graphics = false; @@ -960,6 +1013,7 @@ si_set_max_shader_compiler_threads; sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished; + sscreen->b.finalize_nir = si_finalize_nir; si_init_screen_get_functions(sscreen); si_init_screen_buffer_functions(sscreen); @@ -967,6 +1021,7 @@ si_init_screen_state_functions(sscreen); si_init_screen_texture_functions(sscreen); si_init_screen_query_functions(sscreen); + si_init_screen_live_shader_cache(sscreen); /* Set these flags in debug_flags early, so that the shader cache takes * them into account. @@ -994,8 +1049,8 @@ 1 << util_logbase2(sscreen->force_aniso)); } - (void) mtx_init(&sscreen->aux_context_lock, mtx_plain); - (void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain); + (void) simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void) simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); si_init_gs_info(sscreen); if (!si_init_shader_cache(sscreen)) { @@ -1034,12 +1089,16 @@ num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp)); + /* Take a reference on the glsl types for the compiler threads. */ + glsl_type_singleton_init_or_ref(); + if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads, UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { si_destroy_shader_cache(sscreen); FREE(sscreen); + glsl_type_singleton_decref(); return NULL; } @@ -1051,12 +1110,21 @@ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { si_destroy_shader_cache(sscreen); FREE(sscreen); + glsl_type_singleton_decref(); return NULL; } if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); + unsigned prim_discard_vertex_count_threshold, tmp; + si_initialize_prim_discard_tunables(sscreen, false, + &prim_discard_vertex_count_threshold, + &tmp); + /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ + if (prim_discard_vertex_count_threshold == UINT_MAX) + sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; + /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && sscreen->info.family != CHIP_CARRIZO && @@ -1106,16 +1174,6 @@ S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); } - /* The mere presense of CLEAR_STATE in the IB causes random GPU hangs - * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc. - * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. */ - sscreen->has_clear_state = sscreen->info.chip_class >= GFX7 && - sscreen->info.is_amdgpu; - - sscreen->has_distributed_tess = - sscreen->info.chip_class >= GFX8 && - sscreen->info.max_se >= 2; - sscreen->has_draw_indirect_multi = (sscreen->info.family >= CHIP_POLARIS10) || (sscreen->info.chip_class == GFX8 && @@ -1128,28 +1186,20 @@ sscreen->info.pfp_fw_version >= 79 && sscreen->info.me_fw_version >= 142); - sscreen->has_out_of_order_rast = sscreen->info.chip_class >= GFX8 && - sscreen->info.max_se >= 2 && + sscreen->has_out_of_order_rast = sscreen->info.has_out_of_order_rast && !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)); sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights"); sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add"); - sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN; - sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 && - sscreen->info.family <= CHIP_POLARIS12) || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN; - sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN; - sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2 || - sscreen->info.family == CHIP_RENOIR || - sscreen->info.chip_class >= GFX10; sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 && !(sscreen->debug_flags & DBG(NO_NGG)); + sscreen->use_ngg_culling = sscreen->use_ngg && + !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); + sscreen->always_use_ngg_culling = sscreen->use_ngg_culling && + sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING); sscreen->use_ngg_streamout = false; /* Only enable primitive binning on APUs by default. */ @@ -1181,28 +1231,10 @@ */ sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; - /* Some chips have RB+ registers, but don't support RB+. Those must - * always disable it. - */ - if (sscreen->info.family == CHIP_STONEY || - sscreen->info.chip_class >= GFX9) { - sscreen->has_rbplus = true; - - sscreen->rbplus_allowed = - !(sscreen->debug_flags & DBG(NO_RB_PLUS)) && - (sscreen->info.family == CHIP_STONEY || - sscreen->info.family == CHIP_VEGA12 || - sscreen->info.family == CHIP_RAVEN || - sscreen->info.family == CHIP_RAVEN2 || - sscreen->info.family == CHIP_RENOIR); - } - sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA)); - sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= GFX8; - - (void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain); + (void) simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; @@ -1241,11 +1273,6 @@ } } - for (i = 0; i < num_comp_hi_threads; i++) - si_init_compiler(sscreen, &sscreen->compiler[i]); - for (i = 0; i < num_comp_lo_threads; i++) - si_init_compiler(sscreen, &sscreen->compiler_lowp[i]); - sscreen->ge_wave_size = 64; sscreen->ps_wave_size = 64; sscreen->compute_wave_size = 64; @@ -1281,30 +1308,31 @@ sscreen->aux_context->set_log_context(sscreen->aux_context, log); } - if (sscreen->debug_flags & DBG(TEST_DMA)) + if (test_flags & DBG(TEST_DMA)) si_test_dma(sscreen); - if (sscreen->debug_flags & DBG(TEST_DMA_PERF)) { + if (test_flags & DBG(TEST_DMA_PERF)) { si_test_dma_perf(sscreen); } - if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) | + if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) - si_test_vmfault(sscreen); + si_test_vmfault(sscreen, test_flags); - if (sscreen->debug_flags & DBG(TEST_GDS)) + if (test_flags & DBG(TEST_GDS)) si_test_gds((struct si_context*)sscreen->aux_context); - if (sscreen->debug_flags & DBG(TEST_GDS_MM)) { + if (test_flags & DBG(TEST_GDS_MM)) { si_test_gds_memory_management((struct si_context*)sscreen->aux_context, 32 * 1024, 4, RADEON_DOMAIN_GDS); } - if (sscreen->debug_flags & DBG(TEST_GDS_OA_MM)) { + if (test_flags & DBG(TEST_GDS_OA_MM)) { si_test_gds_memory_management((struct si_context*)sscreen->aux_context, 4, 1, RADEON_DOMAIN_OA); } + STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4); return &sscreen->b; } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_pipe.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_pipe.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_pipe.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_pipe.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ #include "util/u_idalloc.h" #include "util/u_threaded_context.h" -#ifdef PIPE_ARCH_BIG_ENDIAN +#if UTIL_ARCH_BIG_ENDIAN #define SI_BIG_ENDIAN 1 #else #define SI_BIG_ENDIAN 0 @@ -116,6 +116,11 @@ #define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) /* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */ #define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) +/* Set a micro tile mode: */ +#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) (((x) & 0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) enum si_clear_code { @@ -139,7 +144,7 @@ DBG_TES = PIPE_SHADER_TESS_EVAL, DBG_CS = PIPE_SHADER_COMPUTE, DBG_NO_IR, - DBG_NO_TGSI, + DBG_NO_NIR, DBG_NO_ASM, DBG_PREOPT_IR, @@ -164,10 +169,13 @@ DBG_TEX, DBG_COMPUTE, DBG_VM, + DBG_CACHE_STATS, /* Driver options: */ - DBG_FORCE_DMA, - DBG_NO_ASYNC_DMA, + DBG_FORCE_SDMA, + DBG_NO_SDMA, + DBG_NO_SDMA_CLEARS, + DBG_NO_SDMA_COPY_IMAGE, DBG_NO_WC, DBG_CHECK_VM, DBG_RESERVE_VMID, @@ -176,6 +184,8 @@ /* 3D engine options: */ DBG_NO_GFX, DBG_NO_NGG, + DBG_ALWAYS_NGG_CULLING, + DBG_NO_NGG_CULLING, DBG_ALWAYS_PD, DBG_PD, DBG_NO_PD, @@ -195,6 +205,10 @@ DBG_NO_DCC_MSAA, DBG_NO_FMASK, + DBG_COUNT +}; + +enum { /* Tests: */ DBG_TEST_DMA, DBG_TEST_VMFAULT_CP, @@ -288,7 +302,6 @@ struct si_resource buffer; struct radeon_surf surface; - uint64_t size; struct si_texture *flushed_depth_texture; /* One texture allocation can contain these buffers: @@ -300,25 +313,22 @@ * - displayable DCC buffer (if the DCC buffer is not displayable) * - DCC retile mapping buffer (if the DCC buffer is not displayable) */ - uint64_t fmask_offset; - uint64_t cmask_offset; uint64_t cmask_base_address_reg; struct si_resource *cmask_buffer; - uint64_t dcc_offset; /* 0 = disabled */ - uint64_t display_dcc_offset; - uint64_t dcc_retile_map_offset; unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; unsigned last_msaa_resolve_target_micro_mode; unsigned num_level0_transfers; + unsigned plane_index; /* other planes are different pipe_resources */ + unsigned num_planes; /* Depth buffer compression and fast clear. */ - uint64_t htile_offset; float depth_clear_value; uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ enum pipe_format db_render_format:16; uint8_t stencil_clear_value; + bool fmask_is_identity:1; bool tc_compatible_htile:1; bool htile_stencil_disabled:1; bool depth_cleared:1; /* if it was cleared at least once */ @@ -336,6 +346,8 @@ * for a possible future enablement. */ bool separate_dcc_dirty:1; + bool displayable_dcc_dirty:1; + /* Statistics gathering for the DCC enablement heuristic. */ bool dcc_gather_statistics:1; /* Counter that should be non-zero if the texture is bound to a @@ -481,6 +493,7 @@ uint32_t *state, uint32_t *fmask_state); + unsigned num_vbos_in_user_sgprs; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; @@ -492,20 +505,16 @@ unsigned eqaa_force_coverage_samples; unsigned eqaa_force_z_samples; unsigned eqaa_force_color_samples; - bool has_clear_state; - bool has_distributed_tess; bool has_draw_indirect_multi; bool has_out_of_order_rast; bool assume_no_z_fights; bool commutative_blend_add; - bool has_gfx9_scissor_bug; - bool has_msaa_sample_loc_bug; - bool has_ls_vgpr_init_bug; - bool has_dcc_constant_encode; bool dpbb_allowed; bool dfsm_allowed; bool llvm_has_working_vgpr_indexing; bool use_ngg; + bool use_ngg_culling; + bool always_use_ngg_culling; bool use_ngg_streamout; struct { @@ -516,10 +525,7 @@ /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; bool record_llvm_ir; - bool has_rbplus; /* if RB+ registers exist */ - bool rbplus_allowed; /* if RB+ is allowed */ bool dcc_msaa_allowed; - bool cpdma_prefetch_writes_memory; struct slab_parent_pool pool_transfers; @@ -529,7 +535,7 @@ /* Auxiliary context. Mainly used to initialize resources. * It must be locked prior to using and flushed before unlocking. */ struct pipe_context *aux_context; - mtx_t aux_context_lock; + simple_mtx_t aux_context_lock; /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. @@ -539,10 +545,13 @@ * are loading shaders on demand. This is a monotonic counter. */ unsigned num_shaders_created; - unsigned num_shader_cache_hits; + unsigned num_memory_shader_cache_hits; + unsigned num_memory_shader_cache_misses; + unsigned num_disk_shader_cache_hits; + unsigned num_disk_shader_cache_misses; /* GPU load thread. */ - mtx_t gpu_load_mutex; + simple_mtx_t gpu_load_mutex; thrd_t gpu_load_thread; union si_mmio_counters mmio_counters; volatile unsigned gpu_load_stop_thread; /* bool */ @@ -578,7 +587,7 @@ unsigned L2_to_cp; } barrier_flags; - mtx_t shader_parts_mutex; + simple_mtx_t shader_parts_mutex; struct si_shader_part *vs_prologs; struct si_shader_part *tcs_epilogs; struct si_shader_part *gs_prologs; @@ -589,7 +598,7 @@ * * Design & limitations: * - The shader cache is per screen (= per process), never saved to - * disk, and skips redundant shader compilations from TGSI to bytecode. + * disk, and skips redundant shader compilations from NIR to bytecode. * - It can only be used with one-variant-per-shader support, in which * case only the main (typically middle) part of shaders is cached. * - Only VS, TCS, TES, PS are cached, out of which only the hw VS @@ -597,9 +606,12 @@ * - GS and CS aren't cached, but it's certainly possible to cache * those as well. */ - mtx_t shader_cache_mutex; + simple_mtx_t shader_cache_mutex; struct hash_table *shader_cache; + /* Shader cache of live shaders. */ + struct util_live_shader_cache live_shader_cache; + /* Shader compiler queue for multithreaded compilation. */ struct util_queue shader_compiler_queue; /* Use at most 3 normal compiler threads on quadcore and better. @@ -683,6 +695,7 @@ ubyte nr_color_samples; /* at most 8xAA */ ubyte compressed_cb_mask; ubyte uncompressed_cb_mask; + ubyte displayable_dcc_cb_mask; ubyte color_is_int8; ubyte color_is_int10; ubyte dirty_cbufs; @@ -773,7 +786,7 @@ */ union si_vgt_param_key { struct { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned prim:4; unsigned uses_instancing:1; unsigned multi_instances_smaller_than_primgroup:1; @@ -784,7 +797,7 @@ unsigned tess_uses_prim_id:1; unsigned uses_gs:1; unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; -#else /* PIPE_ARCH_BIG_ENDIAN */ +#else /* UTIL_ARCH_BIG_ENDIAN */ unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; unsigned uses_gs:1; unsigned tess_uses_prim_id:1; @@ -800,7 +813,7 @@ uint32_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 4 +#define SI_NUM_VGT_STAGES_KEY_BITS 6 #define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) /* The VGT_SHADER_STAGES key used to index the table of precomputed values. @@ -808,16 +821,20 @@ */ union si_vgt_stages_key { struct { -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if UTIL_ARCH_LITTLE_ENDIAN unsigned tess:1; unsigned gs:1; + unsigned ngg_gs_fast_launch:1; + unsigned ngg_passthrough:1; unsigned ngg:1; /* gfx10+ */ unsigned streamout:1; /* only used with NGG */ unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* PIPE_ARCH_BIG_ENDIAN */ +#else /* UTIL_ARCH_BIG_ENDIAN */ unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; unsigned streamout:1; unsigned ngg:1; + unsigned ngg_passthrough:1; + unsigned ngg_gs_fast_launch:1; unsigned gs:1; unsigned tess:1; #endif @@ -862,6 +879,10 @@ unsigned size; }; +struct si_small_prim_cull_info { + float scale[2], translate[2]; +}; + struct si_context { struct pipe_context b; /* base class */ @@ -871,7 +892,7 @@ struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ - struct radeon_cmdbuf *dma_cs; + struct radeon_cmdbuf *sdma_cs; struct pipe_fence_handle *last_gfx_fence; struct pipe_fence_handle *last_sdma_fence; struct si_resource *eop_bug_scratch; @@ -907,7 +928,9 @@ void *cs_copy_image_1d_array; void *cs_clear_render_target; void *cs_clear_render_target_1d_array; + void *cs_clear_12bytes_buffer; void *cs_dcc_retile; + void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; struct pipe_debug_callback debug; struct ac_llvm_compiler compiler; /* only non-threaded compilation */ @@ -999,16 +1022,12 @@ /* shader information */ struct si_vertex_elements *vertex_elements; + unsigned num_vertex_elements; unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; bool flatshade; bool do_update_shaders; - /* vertex buffer descriptors */ - uint32_t *vb_descriptors_gpu_list; - struct si_resource *vb_descriptors_buffer; - unsigned vb_descriptors_offset; - /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; unsigned descriptors_dirty; @@ -1035,11 +1054,16 @@ uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; uint32_t cs_user_data[4]; - /* Vertex and index buffers. */ + /* Vertex buffers. */ bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; + bool vertex_buffer_user_sgprs_dirty; struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ + uint32_t *vb_descriptors_gpu_list; + struct si_resource *vb_descriptors_buffer; + unsigned vb_descriptors_offset; + unsigned vb_descriptor_user_sgprs[5*4]; /* MSAA config state. */ int ps_iter_samples; @@ -1065,6 +1089,7 @@ bool ls_vgpr_fix:1; bool prim_discard_cs_instancing:1; bool ngg:1; + uint8_t ngg_culling; int last_index_size; int last_base_vertex; int last_start_instance; @@ -1075,14 +1100,17 @@ int last_restart_index; int last_prim; int last_multi_vgt_param; - int last_rast_prim; - int last_flatshade_first; + int last_gs_out_prim; int last_binning_enabled; - unsigned last_sc_line_stipple; unsigned current_vs_state; unsigned last_vs_state; enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ + struct si_small_prim_cull_info last_small_prim_cull_info; + struct si_resource *small_prim_cull_info_buf; + uint64_t small_prim_cull_info_address; + bool small_prim_cull_info_dirty; + /* Scratch buffer */ struct si_resource *scratch_buffer; unsigned scratch_waves; @@ -1252,6 +1280,10 @@ void si_blitter_end(struct si_context *sctx); void si_init_blit_functions(struct si_context *sctx); void si_decompress_textures(struct si_context *sctx, unsigned shader_mask); +void si_decompress_subresource(struct pipe_context *ctx, + struct pipe_resource *tex, + unsigned planes, unsigned level, + unsigned first_layer, unsigned last_layer); void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, @@ -1317,6 +1349,7 @@ unsigned width, unsigned height, bool render_condition_enabled); void si_retile_dcc(struct si_context *sctx, struct si_texture *tex); +void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex); void si_init_compute_blit_functions(struct si_context *sctx); /* si_cp_dma.c */ @@ -1366,14 +1399,14 @@ struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); -/* si_dma.c */ -void si_init_dma_functions(struct si_context *sctx); - /* si_dma_cs.c */ void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset); void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned clear_value); +void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, + uint64_t src_offset, uint64_t size); void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, struct si_resource *src); void si_flush_dma_cs(struct si_context *ctx, unsigned flags, @@ -1435,15 +1468,18 @@ unsigned base_vertex, uint64_t input_indexbuf_va, unsigned input_indexbuf_max_elements); -void si_initialize_prim_discard_tunables(struct si_context *sctx); +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, + bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib); + +/* si_pipe.c */ +void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); void si_destroy_perfcounters(struct si_screen *screen); -/* si_pipe.c */ -bool si_check_device_reset(struct si_context *sctx); - /* si_query.c */ void si_init_screen_query_functions(struct si_screen *sscreen); void si_init_query_functions(struct si_context *sctx); @@ -1461,7 +1497,10 @@ void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); +void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); void *si_create_dcc_retile_cs(struct pipe_context *ctx); +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, + bool is_array); void *si_create_query_result_cs(struct si_context *sctx); void *gfx10_create_sh_query_result_cs(struct si_context *sctx); @@ -1483,6 +1522,9 @@ const struct pipe_video_buffer *tmpl); /* si_viewport.c */ +void si_update_ngg_small_prim_precision(struct si_context *ctx); +void si_get_small_prim_cull_info(struct si_context *sctx, + struct si_small_prim_cull_info *out); void si_update_vs_viewport_state(struct si_context *ctx); void si_init_viewport_functions(struct si_context *ctx); @@ -1494,8 +1536,8 @@ struct si_texture *src, unsigned src_level, const struct pipe_box *src_box); -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex); +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex, + bool *ctx_flushed); void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex); bool si_init_flushed_depth_texture(struct pipe_context *ctx, @@ -1556,10 +1598,23 @@ pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b); } +static inline void +si_shader_selector_reference(struct si_context *sctx, /* sctx can optionally be NULL */ + struct si_shader_selector **dst, + struct si_shader_selector *src) +{ + if (*dst == src) + return; + + struct si_screen *sscreen = src ? src->screen : (*dst)->screen; + util_shader_reference(&sctx->b, &sscreen->live_shader_cache, + (void**)dst, src); +} + static inline bool vi_dcc_enabled(struct si_texture *tex, unsigned level) { - return tex->dcc_offset && level < tex->surface.num_dcc_levels; + return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels; } static inline unsigned @@ -1638,7 +1693,7 @@ return &sctx->vs_shader; } -static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) +static inline struct si_shader_info *si_get_vs_info(struct si_context *sctx) { struct si_shader_ctx_state *vs = si_get_vs(sctx); @@ -1760,13 +1815,13 @@ if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled) return false; - return tex->htile_offset && level == 0; + return tex->surface.htile_offset && level == 0; } static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) { - assert(!tex->tc_compatible_htile || tex->htile_offset); + assert(!tex->tc_compatible_htile || tex->surface.htile_offset); return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_query.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_query.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -249,9 +249,23 @@ case SI_QUERY_NUM_SHADERS_CREATED: query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created); break; - case SI_QUERY_NUM_SHADER_CACHE_HITS: - query->begin_result = - p_atomic_read(&sctx->screen->num_shader_cache_hits); + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_disk_shader_cache_misses; break; case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: query->begin_result = sctx->compute_num_verts_accepted; @@ -423,9 +437,23 @@ case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: query->end_result = sctx->last_tex_ps_draw_ratio; break; - case SI_QUERY_NUM_SHADER_CACHE_HITS: - query->end_result = - p_atomic_read(&sctx->screen->num_shader_cache_hits); + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->end_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_disk_shader_cache_misses; break; case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: query->end_result = sctx->compute_num_verts_accepted; @@ -1147,7 +1175,7 @@ if (!query->buffer.buf) return false; - LIST_ADDTAIL(&query->b.active_list, &sctx->active_queries); + list_addtail(&query->b.active_list, &sctx->active_queries); sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; return true; } @@ -1171,7 +1199,7 @@ si_query_hw_emit_stop(sctx, query); if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) { - LIST_DELINIT(&query->b.active_list); + list_delinit(&query->b.active_list); sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend; } @@ -1720,7 +1748,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), - X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE), X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), @@ -1760,6 +1787,12 @@ X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), + X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE), /* GPIN queries are for the benefit of old versions of GPUPerfStudio, * which use it as a fallback path to detect the GPU type. @@ -1923,7 +1956,7 @@ sctx->b.render_condition = si_render_condition; } - LIST_INITHEAD(&sctx->active_queries); + list_inithead(&sctx->active_queries); } void si_init_screen_query_functions(struct si_screen *sscreen) diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_query.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_query.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_query.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_query.h 2020-06-12 01:21:17.000000000 +0000 @@ -106,7 +106,6 @@ SI_QUERY_NUM_COMPILATIONS, SI_QUERY_NUM_SHADERS_CREATED, SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, - SI_QUERY_NUM_SHADER_CACHE_HITS, SI_QUERY_GPIN_ASIC_ID, SI_QUERY_GPIN_NUM_SIMD, SI_QUERY_GPIN_NUM_RB, @@ -117,6 +116,12 @@ SI_QUERY_PD_NUM_PRIMS_ACCEPTED, SI_QUERY_PD_NUM_PRIMS_REJECTED, SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, + SI_QUERY_LIVE_SHADER_CACHE_HITS, + SI_QUERY_LIVE_SHADER_CACHE_MISSES, + SI_QUERY_MEMORY_SHADER_CACHE_HITS, + SI_QUERY_MEMORY_SHADER_CACHE_MISSES, + SI_QUERY_DISK_SHADER_CACHE_HITS, + SI_QUERY_DISK_SHADER_CACHE_MISSES, SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, }; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,23 +23,17 @@ */ #include "util/u_memory.h" -#include "util/u_string.h" -#include "tgsi/tgsi_build.h" #include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_from_mesa.h" -#include "ac_binary.h" #include "ac_exp_param.h" -#include "ac_shader_util.h" #include "ac_rtld.h" -#include "ac_llvm_util.h" #include "si_shader_internal.h" #include "si_pipe.h" #include "sid.h" #include "compiler/nir/nir.h" +#include "compiler/nir/nir_serialize.h" static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; @@ -47,103 +41,24 @@ static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; -static void si_init_shader_ctx(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size, - bool nir); - -static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data); - static void si_dump_shader_key(const struct si_shader *shader, FILE *f); -static void si_build_vs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); -static void si_build_tcs_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); -static void si_build_ps_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); -static void si_build_ps_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); -static void si_fix_resource_usage(struct si_screen *sscreen, - struct si_shader *shader); - -/* Ideally pass the sample mask input to the PS epilog as v14, which - * is its usual location, so that the shader doesn't have to add v_mov. - */ -#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 - -static bool llvm_type_is_64bit(struct si_shader_context *ctx, - LLVMTypeRef type) -{ - if (type == ctx->ac.i64 || type == ctx->ac.f64) - return true; - - return false; -} - /** Whether the shader runs as a combination of multiple API shaders */ -static bool is_multi_part_shader(struct si_shader_context *ctx) +bool si_is_multi_part_shader(struct si_shader *shader) { - if (ctx->screen->info.chip_class <= GFX8) + if (shader->selector->screen->info.chip_class <= GFX8) return false; - return ctx->shader->key.as_ls || - ctx->shader->key.as_es || - ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY; + return shader->key.as_ls || + shader->key.as_es || + shader->selector->type == PIPE_SHADER_TESS_CTRL || + shader->selector->type == PIPE_SHADER_GEOMETRY; } /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ -static bool is_merged_shader(struct si_shader_context *ctx) -{ - return ctx->shader->key.as_ngg || is_multi_part_shader(ctx); -} - -void si_init_function_info(struct si_function_info *fninfo) -{ - fninfo->num_params = 0; - fninfo->num_sgpr_params = 0; -} - -unsigned add_arg_assign(struct si_function_info *fninfo, - enum si_arg_regfile regfile, LLVMTypeRef type, - LLVMValueRef *assign) +bool si_is_merged_shader(struct si_shader *shader) { - assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); - - unsigned idx = fninfo->num_params++; - assert(idx < ARRAY_SIZE(fninfo->types)); - - if (regfile == ARG_SGPR) - fninfo->num_sgpr_params = fninfo->num_params; - - fninfo->types[idx] = type; - fninfo->assign[idx] = assign; - return idx; -} - -static unsigned add_arg(struct si_function_info *fninfo, - enum si_arg_regfile regfile, LLVMTypeRef type) -{ - return add_arg_assign(fninfo, regfile, type, NULL); -} - -static void add_arg_assign_checked(struct si_function_info *fninfo, - enum si_arg_regfile regfile, LLVMTypeRef type, - LLVMValueRef *assign, unsigned idx) -{ - ASSERTED unsigned actual = add_arg_assign(fninfo, regfile, type, assign); - assert(actual == idx); -} - -static void add_arg_checked(struct si_function_info *fninfo, - enum si_arg_regfile regfile, LLVMTypeRef type, - unsigned idx) -{ - add_arg_assign_checked(fninfo, regfile, type, NULL, idx); + return shader->key.as_ngg || si_is_multi_part_shader(shader); } /** @@ -230,6693 +145,1547 @@ } } -/** - * Get the value of a shader input parameter and extract a bitfield. - */ -static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, - LLVMValueRef value, unsigned rshift, - unsigned bitwidth) +static void si_dump_streamout(struct pipe_stream_output_info *so) { - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = ac_to_integer(&ctx->ac, value); + unsigned i; - if (rshift) - value = LLVMBuildLShr(ctx->ac.builder, value, - LLVMConstInt(ctx->i32, rshift, 0), ""); + if (so->num_outputs) + fprintf(stderr, "STREAMOUT\n"); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->ac.builder, value, - LLVMConstInt(ctx->i32, mask, 0), ""); + for (i = 0; i < so->num_outputs; i++) { + unsigned mask = ((1 << so->output[i].num_components) - 1) << + so->output[i].start_component; + fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", + i, so->output[i].output_buffer, + so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, + so->output[i].register_index, + mask & 1 ? "x" : "", + mask & 2 ? "y" : "", + mask & 4 ? "z" : "", + mask & 8 ? "w" : ""); } - - return value; } -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - unsigned param, unsigned rshift, - unsigned bitwidth) +static void declare_streamout_params(struct si_shader_context *ctx, + struct pipe_stream_output_info *so) { - LLVMValueRef value = LLVMGetParam(ctx->main_fn, param); - - return unpack_llvm_param(ctx, value, rshift, bitwidth); -} + if (ctx->screen->use_ngg_streamout) { + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return; + } -static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) -{ - switch (ctx->type) { - case PIPE_SHADER_TESS_CTRL: - return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8); + /* Streamout SGPRs. */ + if (so->num_outputs) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + } - case PIPE_SHADER_TESS_EVAL: - return LLVMGetParam(ctx->main_fn, - ctx->param_tes_rel_patch_id); + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (int i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; - default: - assert(0); - return NULL; + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]); } } -/* Tessellation shaders pass outputs to the next shader using LDS. - * - * LS outputs = TCS inputs - * TCS outputs = TES inputs - * - * The LDS layout is: - * - TCS inputs for patch 0 - * - TCS inputs for patch 1 - * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) - * - ... - * - TCS outputs for patch 0 = get_tcs_out_patch0_offset - * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset - * - TCS outputs for patch 1 - * - Per-patch TCS outputs for patch 1 - * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) - * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) - * - ... - * - * All three shaders VS(LS), TCS, TES share the same LDS space. - */ - -static LLVMValueRef -get_tcs_in_patch_stride(struct si_shader_context *ctx) +unsigned si_get_max_workgroup_size(const struct si_shader *shader) { - return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13); -} + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + return shader->key.as_ngg ? 128 : 0; -static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) -{ - assert(ctx->type == PIPE_SHADER_TESS_CTRL); + case PIPE_SHADER_TESS_CTRL: + /* Return this so that LLVM doesn't remove s_barrier + * instructions on chips where we use s_barrier. */ + return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + case PIPE_SHADER_GEOMETRY: + return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; - return util_last_bit64(ctx->shader->selector->outputs_written) * 4; -} + case PIPE_SHADER_COMPUTE: + break; /* see below */ -static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) -{ - unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); + default: + return 0; + } + + const unsigned *properties = shader->selector->info.properties; + unsigned max_work_group_size = + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - return LLVMConstInt(ctx->i32, stride, 0); + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return max_work_group_size; } -static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) +static void declare_const_and_shader_buffers(struct si_shader_context *ctx, + bool assign_params) { - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13); + enum ac_arg_type const_shader_buf_type; - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); - unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); - unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + - num_patch_outputs * 4; - return LLVMConstInt(ctx->i32, patch_dw_stride, 0); -} + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; + else + const_shader_buf_type = AC_ARG_CONST_DESC_PTR; -static LLVMValueRef -get_tcs_out_patch0_offset(struct si_shader_context *ctx) -{ - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, - ctx->param_tcs_out_lds_offsets, - 0, 16), - LLVMConstInt(ctx->i32, 4, 0), ""); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, + assign_params ? &ctx->const_and_shader_buffers : + &ctx->other_const_and_shader_buffers); } -static LLVMValueRef -get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) +static void declare_samplers_and_images(struct si_shader_context *ctx, + bool assign_params) { - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, - ctx->param_tcs_out_lds_offsets, - 16, 16), - LLVMConstInt(ctx->i32, 4, 0), ""); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + assign_params ? &ctx->samplers_and_images : + &ctx->other_samplers_and_images); } -static LLVMValueRef -get_tcs_in_current_patch_offset(struct si_shader_context *ctx) +static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, + bool assign_params) { - LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - - return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); + declare_const_and_shader_buffers(ctx, assign_params); + declare_samplers_and_images(ctx, assign_params); } -static LLVMValueRef -get_tcs_out_current_patch_offset(struct si_shader_context *ctx) +static void declare_global_desc_pointers(struct si_shader_context *ctx) { - LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + &ctx->bindless_samplers_and_images); } -static LLVMValueRef -get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) +static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) { - LLVMValueRef patch0_patch_data_offset = - get_tcs_out_patch0_patch_data_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + if (!ctx->shader->is_gs_copy_shader) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); + } } -static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) +static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) { - unsigned tcs_out_vertices = - ctx->shader->selector ? - ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; - - /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ - if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) - return LLVMConstInt(ctx->i32, tcs_out_vertices, 0); - - return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6); -} + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); -static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) -{ - unsigned stride; + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + if (num_vbos_in_user_sgprs) { + unsigned user_sgprs = ctx->args.num_sgprs_used; - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - stride = ctx->shader->selector->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->i32, stride, 0); + if (si_is_merged_shader(ctx->shader)) + user_sgprs -= 8; + assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - case PIPE_SHADER_TESS_CTRL: - if (ctx->screen->info.chip_class >= GFX9 && - ctx->shader->is_monolithic) { - stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->i32, stride, 0); - } - return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); + /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ + for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - default: - assert(0); - return NULL; + assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); + for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); } } -static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, - LLVMValueRef i32, unsigned index) -{ - assert(index <= 1); - - if (index == 1) - return LLVMBuildAShr(ctx->ac.builder, i32, - LLVMConstInt(ctx->i32, 16, 0), ""); - - return LLVMBuildSExt(ctx->ac.builder, - LLVMBuildTrunc(ctx->ac.builder, i32, - ctx->ac.i16, ""), - ctx->i32, ""); -} - -void si_llvm_load_input_vs( - struct si_shader_context *ctx, - unsigned input_index, - LLVMValueRef out[4]) +static void declare_vs_input_vgprs(struct si_shader_context *ctx, + unsigned *num_prolog_vgprs, + bool ngg_cull_shader) { - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + struct si_shader *shader = ctx->shader; - if (vs_blit_property) { - LLVMValueRef vertex_id = ctx->abi.vertex_id; - LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntULE, vertex_id, - ctx->i32_1, ""); - /* Use LLVMIntNE, because we have 3 vertices and only - * the middle one should use y2. - */ - LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntNE, vertex_id, - ctx->i32_1, ""); - - if (input_index == 0) { - /* Position: */ - LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs); - LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 1); - - LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); - LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); - LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); - LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); - - LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - - out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, ""); - out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, ""); - out[2] = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 2); - out[3] = ctx->ac.f32_1; - return; - } - - /* Color or texture coordinates: */ - assert(input_index == 1); - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - for (int i = 0; i < 4; i++) { - out[i] = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 3 + i); - } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); + if (shader->key.as_ls) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id); + if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); } else { - assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); - LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 3); - LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 4); - LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 5); - LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 6); - - out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - out[2] = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 7); - out[3] = LLVMGetParam(ctx->main_fn, - ctx->param_vs_blit_inputs + 8); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ } - return; - } - - union si_vs_fix_fetch fix_fetch; - LLVMValueRef t_list_ptr; - LLVMValueRef t_offset; - LLVMValueRef t_list; - LLVMValueRef vertex_index; - LLVMValueRef tmp; - - /* Load the T list */ - t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); - - t_offset = LLVMConstInt(ctx->i32, input_index, 0); - - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - - vertex_index = LLVMGetParam(ctx->main_fn, - ctx->param_vertex_index0 + - input_index); - - /* Use the open-coded implementation for all loads of doubles and - * of dword-sized data that needs fixups. We need to insert conversion - * code anyway, and the amd/common code does it for us. - * - * Note: On LLVM <= 8, we can only open-code formats with - * channel size >= 4 bytes. - */ - bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; - if (opencode || - (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || - (fix_fetch.u.log_size == 2)) { - tmp = ac_build_opencoded_load_format( - &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, - fix_fetch.u.format, fix_fetch.u.reverse, !opencode, - t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); - for (unsigned i = 0; i < 4; ++i) - out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), ""); - return; - } - - /* Do multiple loads for special formats. */ - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); - LLVMValueRef fetches[4]; - unsigned num_fetches; - unsigned fetch_stride; - unsigned channels_per_fetch; - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { - num_fetches = MIN2(required_channels, 3); - fetch_stride = 1 << fix_fetch.u.log_size; - channels_per_fetch = 1; - } else { - num_fetches = 1; - fetch_stride = 0; - channels_per_fetch = required_channels; - } - - for (unsigned i = 0; i < num_fetches; ++i) { - LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset, - channels_per_fetch, 0, true); + } else if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ } - if (num_fetches == 1 && channels_per_fetch > 1) { - LLVMValueRef fetch = fetches[0]; - for (unsigned i = 0; i < channels_per_fetch; ++i) { - tmp = LLVMConstInt(ctx->i32, i, false); - fetches[i] = LLVMBuildExtractElement( - ctx->ac.builder, fetch, tmp, ""); + if (!shader->is_gs_copy_shader) { + if (shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->ngg_old_thread_id); } - num_fetches = channels_per_fetch; - channels_per_fetch = 1; - } - - for (unsigned i = num_fetches; i < 4; ++i) - fetches[i] = LLVMGetUndef(ctx->f32); - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && - required_channels == 4) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) - fetches[3] = ctx->ac.i32_1; - else - fetches[3] = ctx->ac.f32_1; - } else if (fix_fetch.u.log_size == 3 && - (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || - fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && - required_channels == 4) { - /* For 2_10_10_10, the hardware returns an unsigned value; - * convert it to a signed one. - */ - LLVMValueRef tmp = fetches[3]; - LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); - - /* First, recover the sign-extended signed integer value. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) - tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); - else - tmp = ac_to_integer(&ctx->ac, tmp); - - /* For the integer-like cases, do a natural sign extension. - * - * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 - * and happen to contain 0, 1, 2, 3 as the two LSBs of the - * exponent. - */ - tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? - LLVMConstInt(ctx->i32, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); - /* Convert back to the right type. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { - LLVMValueRef clamp; - LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); - clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); + /* Vertex load indices. */ + if (shader->selector->info.num_inputs) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->vertex_index0); + for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); } - - fetches[3] = tmp; + *num_prolog_vgprs += shader->selector->info.num_inputs; } - - for (unsigned i = 0; i < 4; ++i) - out[i] = ac_to_float(&ctx->ac, fetches[i]); } -static void declare_input_vs( - struct si_shader_context *ctx, - unsigned input_index, - const struct tgsi_full_declaration *decl, - LLVMValueRef out[4]) +static void declare_vs_blit_inputs(struct si_shader_context *ctx, + unsigned vs_blit_property) { - si_llvm_load_input_vs(ctx, input_index, out); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->vs_blit_inputs); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ + } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ + } } -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, - unsigned swizzle) +static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader) { - if (swizzle > 0) - return ctx->i32_0; + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - return LLVMGetParam(ctx->main_fn, - ctx->param_vs_prim_id); - case PIPE_SHADER_TESS_CTRL: - return ctx->abi.tcs_patch_id; - case PIPE_SHADER_TESS_EVAL: - return ctx->abi.tes_patch_id; - case PIPE_SHADER_GEOMETRY: - return ctx->abi.gs_prim_id; - default: - assert(0); - return ctx->i32_0; + if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->ngg_old_thread_id); } } -/** - * Return the value of tgsi_ind_register for indexing. - * This is the indirect index with the constant offset added to it. - */ -LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - unsigned addr_mul, - int rel_index) -{ - LLVMValueRef result; - - if (ind->File == TGSI_FILE_ADDRESS) { - result = ctx->addrs[ind->Index][ind->Swizzle]; - result = LLVMBuildLoad(ctx->ac.builder, result, ""); - } else { - struct tgsi_full_src_register src = {}; - - src.Register.File = ind->File; - src.Register.Index = ind->Index; - - /* Set the second index to 0 for constants. */ - if (ind->File == TGSI_FILE_CONSTANT) - src.Register.Dimension = 1; - - result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src, - TGSI_TYPE_SIGNED, - ind->Swizzle); - result = ac_to_integer(&ctx->ac, result); - } +enum { + /* Convenient merged shader definitions. */ + SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, + SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, +}; - return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0), - LLVMConstInt(ctx->i32, rel_index, 0)); +void si_add_arg_checked(struct ac_shader_args *args, + enum ac_arg_regfile file, + unsigned registers, enum ac_arg_type type, + struct ac_arg *arg, + unsigned idx) +{ + assert(args->arg_count == idx); + ac_add_arg(args, file, registers, type, arg); } -/** - * Like si_get_indirect_index, but restricts the return value to a (possibly - * undefined) value inside [0..num). - */ -LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - int rel_index, unsigned num) +void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) { - LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index); + struct si_shader *shader = ctx->shader; + LLVMTypeRef returns[AC_MAX_ARGS]; + unsigned i, num_return_sgprs; + unsigned num_returns = 0; + unsigned num_prolog_vgprs = 0; + unsigned type = ctx->type; + unsigned vs_blit_property = + shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - return si_llvm_bound_index(ctx, result, num); -} + memset(&ctx->args, 0, sizeof(ctx->args)); -static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, - LLVMValueRef vertex_dw_stride, - LLVMValueRef base_addr, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned input_index, - ubyte *name, - ubyte *index, - bool is_patch) -{ - if (vertex_dw_stride) { - base_addr = ac_build_imad(&ctx->ac, vertex_index, - vertex_dw_stride, base_addr); + /* Set MERGED shaders. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) + type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ + else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) + type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; } - if (param_index) { - base_addr = ac_build_imad(&ctx->ac, param_index, - LLVMConstInt(ctx->i32, 4, 0), base_addr); - } + switch (type) { + case PIPE_SHADER_VERTEX: + declare_global_desc_pointers(ctx); - int param = is_patch ? - si_shader_io_get_unique_index_patch(name[input_index], - index[input_index]) : - si_shader_io_get_unique_index(name[input_index], - index[input_index], false); + if (vs_blit_property) { + declare_vs_blit_inputs(ctx, vs_blit_property); - /* Add the base address of the element. */ - return LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMConstInt(ctx->i32, param * 4, 0), ""); -} + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + break; + } -/** - * Calculate a dword address given an input or output register and a stride. - */ -static LLVMValueRef get_dw_address(struct si_shader_context *ctx, - const struct tgsi_full_dst_register *dst, - const struct tgsi_full_src_register *src, - LLVMValueRef vertex_dw_stride, - LLVMValueRef base_addr) -{ - struct tgsi_shader_info *info = &ctx->shader->selector->info; - ubyte *name, *index, *array_first; - int input_index; - struct tgsi_full_dst_register reg; - LLVMValueRef vertex_index = NULL; - LLVMValueRef ind_index = NULL; - - /* Set the register description. The address computation is the same - * for sources and destinations. */ - if (src) { - reg.Register.File = src->Register.File; - reg.Register.Index = src->Register.Index; - reg.Register.Indirect = src->Register.Indirect; - reg.Register.Dimension = src->Register.Dimension; - reg.Indirect = src->Indirect; - reg.Dimension = src->Dimension; - reg.DimIndirect = src->DimIndirect; - } else - reg = *dst; - - /* If the register is 2-dimensional (e.g. an array of vertices - * in a primitive), calculate the base address of the vertex. */ - if (reg.Register.Dimension) { - if (reg.Dimension.Indirect) - vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, - 1, reg.Dimension.Index); - else - vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); - } + declare_per_stage_desc_pointers(ctx, true); + declare_vs_specific_input_sgprs(ctx); + if (!shader->is_gs_copy_shader) + declare_vb_descriptor_input_sgprs(ctx); - /* Get information about the register. */ - if (reg.Register.File == TGSI_FILE_INPUT) { - name = info->input_semantic_name; - index = info->input_semantic_index; - array_first = info->input_array_first; - } else if (reg.Register.File == TGSI_FILE_OUTPUT) { - name = info->output_semantic_name; - index = info->output_semantic_index; - array_first = info->output_array_first; - } else { - assert(0); - return NULL; - } + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->es2gs_offset); + } else if (shader->key.as_ls) { + /* no extra parameters */ + } else { + /* The locations of the other parameters are assigned dynamically. */ + declare_streamout_params(ctx, &shader->selector->so); + } - if (reg.Register.Indirect) { - /* Add the relative address of the element. */ - if (reg.Indirect.ArrayID) - input_index = array_first[reg.Indirect.ArrayID]; - else - input_index = reg.Register.Index; + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - ind_index = si_get_indirect_index(ctx, ®.Indirect, - 1, reg.Register.Index - input_index); - } else { - input_index = reg.Register.Index; - } + /* Return values */ + if (shader->key.opt.vs_as_prim_discard_cs) { + for (i = 0; i < 4; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; - return get_dw_address_from_generic_indices(ctx, vertex_dw_stride, - base_addr, vertex_index, - ind_index, input_index, - name, index, - !reg.Register.Dimension); -} + case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); -/* The offchip buffer layout for TCS->TES is - * - * - attribute 0 of patch 0 vertex 0 - * - attribute 0 of patch 0 vertex 1 - * - attribute 0 of patch 0 vertex 2 - * ... - * - attribute 0 of patch 1 vertex 0 - * - attribute 0 of patch 1 vertex 1 - * ... - * - attribute 1 of patch 0 vertex 0 - * - attribute 1 of patch 0 vertex 1 - * ... - * - per patch attribute 0 of patch 0 - * - per patch attribute 0 of patch 1 - * ... - * - * Note that every attribute has 4 components. - */ -static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, - LLVMValueRef rel_patch_id, - LLVMValueRef vertex_index, - LLVMValueRef param_index) -{ - LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; - LLVMValueRef param_stride, constant16; - - vertices_per_patch = get_num_tcs_out_vertices(ctx); - num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6); - total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, - num_patches, ""); - - constant16 = LLVMConstInt(ctx->i32, 16, 0); - if (vertex_index) { - base_addr = ac_build_imad(&ctx->ac, rel_patch_id, - vertices_per_patch, vertex_index); - param_stride = total_vertices; - } else { - base_addr = rel_patch_id; - param_stride = num_patches; - } + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); - base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); - base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); + /* param_tcs_offchip_offset and param_tcs_factor_offset are + * placed after the user SGPRs. + */ + for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + break; - if (!vertex_index) { - LLVMValueRef patch_data_offset = - si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20); + case SI_SHADER_MERGED_VERTEX_TESSCTRL: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ + declare_per_stage_desc_pointers(ctx, + ctx->type == PIPE_SHADER_TESS_CTRL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - patch_data_offset, ""); - } - return base_addr; -} - -/* This is a generic helper that can be shared by the NIR and TGSI backends */ -static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( - struct si_shader_context *ctx, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned param_base, - ubyte *name, - ubyte *index, - bool is_patch) -{ - unsigned param_index_base; - - param_index_base = is_patch ? - si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) : - si_shader_io_get_unique_index(name[param_base], index[param_base], false); + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, + ctx->type == PIPE_SHADER_VERTEX); + declare_vs_specific_input_sgprs(ctx); - if (param_index) { - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->i32, param_index_base, 0), - ""); - } else { - param_index = LLVMConstInt(ctx->i32, param_index_base, 0); - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + declare_vb_descriptor_input_sgprs(ctx); - return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), - vertex_index, param_index); -} + /* VGPRs (first TCS, then VS) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); -static LLVMValueRef get_tcs_tes_buffer_address_from_reg( - struct si_shader_context *ctx, - const struct tgsi_full_dst_register *dst, - const struct tgsi_full_src_register *src) -{ - struct tgsi_shader_info *info = &ctx->shader->selector->info; - ubyte *name, *index, *array_first; - struct tgsi_full_src_register reg; - LLVMValueRef vertex_index = NULL; - LLVMValueRef param_index = NULL; - unsigned param_base; + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - reg = src ? *src : tgsi_full_src_register_from_dst(dst); + /* LS return values are inputs to the TCS main shader part. */ + for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 2; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } else { + /* TCS return values are inputs to the TCS epilog. + * + * param_tcs_offchip_offset, param_tcs_factor_offset, + * param_tcs_offchip_layout, and param_rw_buffers + * should be passed to the epilog. + */ + for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; - if (reg.Register.Dimension) { + case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ + declare_per_stage_desc_pointers(ctx, + ctx->type == PIPE_SHADER_GEOMETRY); - if (reg.Dimension.Indirect) - vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, - 1, reg.Dimension.Index); + if (ctx->shader->key.as_ngg) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info); else - vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); - /* Get information about the register. */ - if (reg.Register.File == TGSI_FILE_INPUT) { - name = info->input_semantic_name; - index = info->input_semantic_index; - array_first = info->input_array_first; - } else if (reg.Register.File == TGSI_FILE_OUTPUT) { - name = info->output_semantic_name; - index = info->output_semantic_index; - array_first = info->output_array_first; - } else { - assert(0); - return NULL; - } - - if (reg.Register.Indirect) { - if (reg.Indirect.ArrayID) - param_base = array_first[reg.Indirect.ArrayID]; - else - param_base = reg.Register.Index; + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ - param_index = si_get_indirect_index(ctx, ®.Indirect, - 1, reg.Register.Index - param_base); + declare_global_desc_pointers(ctx); + if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { + declare_per_stage_desc_pointers(ctx, + (ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_TESS_EVAL)); + } - } else { - param_base = reg.Register.Index; - } + if (ctx->type == PIPE_SHADER_VERTEX) { + if (vs_blit_property) + declare_vs_blit_inputs(ctx, vs_blit_property); + else + declare_vs_specific_input_sgprs(ctx); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + /* Declare as many input SGPRs as the VS has. */ + } - return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, param_base, - name, index, !reg.Register.Dimension); -} + if (ctx->type == PIPE_SHADER_VERTEX) + declare_vb_descriptor_input_sgprs(ctx); -static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef buffer, LLVMValueRef offset, - LLVMValueRef base, bool can_speculate) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef value, value2; - LLVMTypeRef vec_type = LLVMVectorType(type, 4); + /* VGPRs (first GS, then VS/TES) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); - if (swizzle == ~0) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + declare_tes_input_vgprs(ctx, ngg_cull_shader); + } - return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - } + if ((ctx->shader->key.as_es || ngg_cull_shader) && + (ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_TESS_EVAL)) { + unsigned num_user_sgprs, num_vgprs; - if (!llvm_type_is_64bit(ctx, type)) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (ctx->type == PIPE_SHADER_VERTEX) { + /* For the NGG cull shader, add 1 SGPR to hold + * the vertex buffer pointer. + */ + num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; - value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - return LLVMBuildExtractElement(ctx->ac.builder, value, - LLVMConstInt(ctx->i32, swizzle, 0), ""); - } + if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { + assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST + + shader->selector->num_vbos_in_user_sgprs * 4; + } + } else { + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + } - value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4, ac_glc, can_speculate, false); + /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. + * + * The normal merged ESGS shader only has to return the 5 VGPRs + * for the GS stage. + */ + num_vgprs = ngg_cull_shader ? 10 : 5; - value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4 + 4, ac_glc, can_speculate, false); + /* ES return values are inputs to GS. */ + for (i = 0; i < 8 + num_user_sgprs; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < num_vgprs; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; - return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); -} + case PIPE_SHADER_TESS_EVAL: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); -/** - * Load from LSHS LDS storage. - * - * \param type output value type - * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 - * \param dw_addr address in dwords - */ -static LLVMValueRef lshs_lds_load(struct lp_build_tgsi_context *bld_base, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef dw_addr) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef value; + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); + } else { + declare_streamout_params(ctx, &shader->selector->so); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + } - if (swizzle == ~0) { - LLVMValueRef values[TGSI_NUM_CHANNELS]; + /* VGPRs */ + declare_tes_input_vgprs(ctx, ngg_cull_shader); + break; - for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) - values[chan] = lshs_lds_load(bld_base, type, chan, dw_addr); + case PIPE_SHADER_GEOMETRY: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id); - return ac_build_gather_values(&ctx->ac, values, - TGSI_NUM_CHANNELS); - } + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + break; - /* Split 64-bit loads. */ - if (llvm_type_is_64bit(ctx, type)) { - LLVMValueRef lo, hi; + case PIPE_SHADER_FRAGMENT: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, + SI_PARAM_ALPHA_REF); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->args.prim_mask, SI_PARAM_PRIM_MASK); - lo = lshs_lds_load(bld_base, ctx->i32, swizzle, dw_addr); - hi = lshs_lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr); - return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi); - } + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, + SI_PARAM_PERSP_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, + &ctx->args.persp_center, SI_PARAM_PERSP_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, + &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, + NULL, SI_PARAM_PERSP_PULL_MODEL); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, + &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, + &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, + &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, + NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, + &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, + &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, + &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, + &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT); + shader->info.face_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->args.front_face, SI_PARAM_FRONT_FACE); + shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->args.ancillary, SI_PARAM_ANCILLARY); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, + &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT); - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->i32, swizzle, 0), ""); + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); - value = ac_lds_load(&ctx->ac, dw_addr); + for (i = 0; i < num_color_elements; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); -} + num_prolog_vgprs += num_color_elements; + } -/** - * Store to LSHS LDS storage. - * - * \param swizzle offset (typically 0..3) - * \param dw_addr address in dwords - * \param value value to store - */ -static void lshs_lds_store(struct si_shader_context *ctx, - unsigned dw_offset_imm, LLVMValueRef dw_addr, - LLVMValueRef value) -{ - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->i32, dw_offset_imm, 0), ""); + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; - ac_lds_store(&ctx->ac, dw_addr, value); -} + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); -enum si_tess_ring { - TCS_FACTOR_RING, - TESS_OFFCHIP_RING_TCS, - TESS_OFFCHIP_RING_TES, -}; + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->ac.i32; + for (; i < num_returns; i++) + returns[i] = ctx->ac.f32; + break; -static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, - enum si_tess_ring ring) -{ - LLVMBuilderRef builder = ctx->ac.builder; - unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr : - ctx->param_tcs_out_lds_layout; - LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); - - /* TCS only receives high 13 bits of the address. */ - if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { - addr = LLVMBuildAnd(builder, addr, - LLVMConstInt(ctx->i32, 0xfff80000, 0), ""); - } - - if (ring == TCS_FACTOR_RING) { - unsigned tf_offset = ctx->screen->tess_offchip_ring_size; - addr = LLVMBuildAdd(builder, addr, - LLVMConstInt(ctx->i32, tf_offset, 0), ""); - } - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + case PIPE_SHADER_COMPUTE: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + if (shader->selector->info.uses_grid_size) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, + &ctx->args.num_work_groups); + if (shader->selector->info.uses_block_size && + shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); - LLVMValueRef desc[4]; - desc[0] = addr; - desc[1] = LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0); - desc[3] = LLVMConstInt(ctx->i32, rsrc3, false); - - return ac_build_gather_values(&ctx->ac, desc, 4); -} - -static LLVMValueRef fetch_input_tcs( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef dw_addr, stride; - unsigned swizzle = swizzle_in & 0xffff; - stride = get_tcs_in_vertex_dw_stride(ctx); - dw_addr = get_tcs_in_current_patch_offset(ctx); - dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); - - return lshs_lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); -} - -static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - LLVMValueRef dw_addr, stride; - - driver_location = driver_location / 4; - - if (load_input) { - stride = get_tcs_in_vertex_dw_stride(ctx); - dw_addr = get_tcs_in_current_patch_offset(ctx); - } else { - if (is_patch) { - stride = NULL; - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - } else { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); + unsigned cs_user_data_dwords = + shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + if (cs_user_data_dwords) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, + &ctx->cs_user_data); } - } - if (param_index) { - /* Add the constant index to the indirect index */ - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->i32, const_index, 0), ""); - } else { - param_index = LLVMConstInt(ctx->i32, const_index, 0); - } + /* Hardware SGPRs. */ + for (i = 0; i < 3; i++) { + if (shader->selector->info.uses_block_id[i]) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->args.workgroup_ids[i]); + } + } + if (shader->selector->info.uses_subgroup_info) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); - ubyte *names; - ubyte *indices; - if (load_input) { - names = info->input_semantic_name; - indices = info->input_semantic_index; - } else { - names = info->output_semantic_name; - indices = info->output_semantic_index; + /* Hardware VGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, + &ctx->args.local_invocation_ids); + break; + default: + assert(0 && "unimplemented shader"); + return; } - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - driver_location, - names, indices, - is_patch); - - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (llvm_type_is_64bit(ctx, type)) - offset *= 2; + si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", + returns, num_returns, si_get_max_workgroup_size(shader)); - offset += component; - value[i + component] = lshs_lds_load(bld_base, type, offset, dw_addr); + /* Reserve register locations for VGPR inputs the PS prolog may need. */ + if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, + "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | + S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | + S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | + S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_ANCILLARY_ENA(1) | + S_0286D0_POS_FIXED_PT_ENA(1)); } - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); -} + shader->info.num_input_sgprs = ctx->args.num_sgprs_used; + shader->info.num_input_vgprs = ctx->args.num_vgprs_used; -static LLVMValueRef fetch_output_tcs( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef dw_addr, stride; - unsigned swizzle = (swizzle_in & 0xffff); + assert(shader->info.num_input_vgprs >= num_prolog_vgprs); + shader->info.num_input_vgprs -= num_prolog_vgprs; - if (reg->Register.Dimension) { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); - } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); + if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* The LSHS size is not known until draw time, so we append it + * at the end of whatever LDS use there may be in the rest of + * the shader (currently none, unless LLVM decides to do its + * own LDS-based lowering). + */ + ctx->ac.lds = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "__lds_end", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(ctx->ac.lds, 256); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + } } - return lshs_lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); + /* Unlike radv, we override these arguments in the prolog, so to the + * API shader they appear as normal arguments. + */ + if (ctx->type == PIPE_SHADER_VERTEX) { + ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); + ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); + } else if (ctx->type == PIPE_SHADER_FRAGMENT) { + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); + } } -static LLVMValueRef fetch_input_tes( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef base, addr; - unsigned swizzle = (swizzle_in & 0xffff); +/* For the UMR disassembler. */ +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); +static bool si_shader_binary_open(struct si_screen *screen, + struct si_shader *shader, + struct ac_rtld_binary *rtld) +{ + const struct si_shader_selector *sel = shader->selector; + const char *part_elfs[5]; + size_t part_sizes[5]; + unsigned num_parts = 0; - return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, - ctx->tess_offchip_ring, base, addr, true); -} +#define add_part(shader_or_part) \ + if (shader_or_part) { \ + part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ + part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ + num_parts++; \ + } -LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef base, addr; + add_part(shader->prolog); + add_part(shader->previous_stage); + add_part(shader->prolog2); + add_part(shader); + add_part(shader->epilog); - driver_location = driver_location / 4; +#undef add_part - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); + struct ac_rtld_symbol lds_symbols[2]; + unsigned num_lds_symbols = 0; - if (param_index) { - /* Add the constant index to the indirect index */ - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->i32, const_index, 0), ""); - } else { - param_index = LLVMConstInt(ctx->i32, const_index, 0); + if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && + (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. + */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = shader->gs_info.esgs_ring_size; + sym->align = 64 * 1024; } - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, driver_location, - info->input_semantic_name, - info->input_semantic_index, - is_patch); - - /* TODO: This will generate rather ordinary llvm code, although it - * should be easy for the optimiser to fix up. In future we might want - * to refactor buffer_load(), but for now this maximises code sharing - * between the NIR and TGSI backends. - */ - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (llvm_type_is_64bit(ctx, type)) { - offset *= 2; - if (offset == 4) { - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - driver_location + 1, - info->input_semantic_name, - info->input_semantic_index, - is_patch); - } + if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = shader->ngg.ngg_emit_size * 4; + sym->align = 4; + } - offset = offset % 4; - } + bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ + .info = &screen->info, + .options = { + .halt_at_entry = screen->options.halt_shaders, + }, + .shader_type = tgsi_processor_to_shader_stage(sel->type), + .wave_size = si_get_shader_wave_size(shader), + .num_parts = num_parts, + .elf_ptrs = part_elfs, + .elf_sizes = part_sizes, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols }); - offset += component; - value[i + component] = buffer_load(&ctx->bld_base, type, offset, - ctx->tess_offchip_ring, base, addr, true); + if (rtld->lds_size > 0) { + unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; + shader->config.lds_size = + align(rtld->lds_size, alloc_granularity) / alloc_granularity; } - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + return ok; } -static void store_output_tcs(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_instruction *inst, - const struct tgsi_opcode_info *info, - unsigned index, - LLVMValueRef dst[4]) +static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) { - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_dst_register *reg = &inst->Dst[index]; - const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; - unsigned chan_index; - LLVMValueRef dw_addr, stride; - LLVMValueRef buffer, base, buf_addr; - LLVMValueRef values[4]; - bool skip_lds_store; - bool is_tess_factor = false, is_tess_inner = false; + struct ac_rtld_binary rtld; + si_shader_binary_open(screen, shader, &rtld); + return rtld.exec_size; +} - /* Only handle per-patch and per-vertex outputs here. - * Vectors will be lowered to scalars and this function will be called again. - */ - if (reg->Register.File != TGSI_FILE_OUTPUT || - (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { - si_llvm_emit_store(bld_base, inst, info, index, dst); - return; - } +static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) +{ + uint64_t *scratch_va = data; - if (reg->Register.Dimension) { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); - skip_lds_store = !sh_info->reads_pervertex_outputs; - } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); - skip_lds_store = !sh_info->reads_perpatch_outputs; - - if (!reg->Register.Indirect) { - int name = sh_info->output_semantic_name[reg->Register.Index]; - - /* Always write tess factors into LDS for the TCS epilog. */ - if (name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) { - /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ - skip_lds_store = !sh_info->reads_tessfactor_outputs && - ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; - is_tess_factor = true; - is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; - } - } + if (!strcmp(scratch_rsrc_dword0_symbol, name)) { + *value = (uint32_t)*scratch_va; + return true; } - - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); - - uint32_t writemask = reg->Register.WriteMask; - while (writemask) { - chan_index = u_bit_scan(&writemask); - LLVMValueRef value = dst[chan_index]; - - if (inst->Instruction.Saturate) - value = ac_build_clamp(&ctx->ac, value); - - /* Skip LDS stores if there is no LDS read of this output. */ - if (!skip_lds_store) - lshs_lds_store(ctx, chan_index, dw_addr, value); - - value = ac_to_integer(&ctx->ac, value); - values[chan_index] = value; - - if (reg->Register.WriteMask != 0xF && !is_tess_factor) { - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, - buf_addr, base, - 4 * chan_index, ac_glc, false); - } - - /* Write tess factors into VGPRs for the epilog. */ - if (is_tess_factor && - ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { - if (!is_tess_inner) { - LLVMBuildStore(ctx->ac.builder, value, /* outer */ - ctx->invoc0_tess_factors[chan_index]); - } else if (chan_index < 2) { - LLVMBuildStore(ctx->ac.builder, value, /* inner */ - ctx->invoc0_tess_factors[4 + chan_index]); - } - } + if (!strcmp(scratch_rsrc_dword1_symbol, name)) { + /* Enable scratch coalescing. */ + *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1); + return true; } - if (reg->Register.WriteMask == 0xF && !is_tess_factor) { - LLVMValueRef value = ac_build_gather_values(&ctx->ac, - values, 4); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, - base, 0, ac_glc, false); - } + return false; } -static void si_nir_store_output_tcs(struct ac_shader_abi *abi, - const struct nir_variable *var, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - LLVMValueRef src, - unsigned writemask) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - const unsigned component = var->data.location_frac; - const bool is_patch = var->data.patch; - unsigned driver_location = var->data.driver_location; - LLVMValueRef dw_addr, stride; - LLVMValueRef buffer, base, addr; - LLVMValueRef values[8]; - bool skip_lds_store; - bool is_tess_factor = false, is_tess_inner = false; - - driver_location = driver_location / 4; - - if (param_index) { - /* Add the constant index to the indirect index */ - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->i32, const_index, 0), ""); - } else { - if (const_index != 0) - param_index = LLVMConstInt(ctx->i32, const_index, 0); - } - - if (!is_patch) { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - driver_location, - info->output_semantic_name, - info->output_semantic_index, - is_patch); - - skip_lds_store = !info->reads_pervertex_outputs; - } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, - vertex_index, param_index, - driver_location, - info->output_semantic_name, - info->output_semantic_index, - is_patch); - - skip_lds_store = !info->reads_perpatch_outputs; - - if (!param_index) { - int name = info->output_semantic_name[driver_location]; - - /* Always write tess factors into LDS for the TCS epilog. */ - if (name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) { - /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ - skip_lds_store = !info->reads_tessfactor_outputs && - ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; - is_tess_factor = true; - is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; - } - } - } - - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); +bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, + uint64_t scratch_va) +{ + struct ac_rtld_binary binary; + if (!si_shader_binary_open(sscreen, shader, &binary)) + return false; - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); + si_resource_reference(&shader->bo, NULL); + shader->bo = si_aligned_buffer_create(&sscreen->b, + sscreen->info.cpdma_prefetch_writes_memory ? + 0 : SI_RESOURCE_FLAG_READ_ONLY, + PIPE_USAGE_IMMUTABLE, + align(binary.rx_size, SI_CPDMA_ALIGNMENT), + 256); + if (!shader->bo) + return false; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, driver_location, - info->output_semantic_name, - info->output_semantic_index, - is_patch); + /* Upload. */ + struct ac_rtld_upload_info u = {}; + u.binary = &binary; + u.get_external_symbol = si_get_external_symbol; + u.cb_data = &scratch_va; + u.rx_va = shader->bo->gpu_address; + u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, + PIPE_TRANSFER_READ_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED | + RADEON_TRANSFER_TEMPORARY); + if (!u.rx_ptr) + return false; - for (unsigned chan = 0; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + bool ok = ac_rtld_upload(&u); - unsigned buffer_store_offset = chan % 4; - if (chan == 4) { - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - driver_location + 1, - info->output_semantic_name, - info->output_semantic_index, - is_patch); - } - - /* Skip LDS stores if there is no LDS read of this output. */ - if (!skip_lds_store) - lshs_lds_store(ctx, chan, dw_addr, value); - - value = ac_to_integer(&ctx->ac, value); - values[chan] = value; - - if (writemask != 0xF && !is_tess_factor) { - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, - addr, base, - 4 * buffer_store_offset, - ac_glc, false); - } - - /* Write tess factors into VGPRs for the epilog. */ - if (is_tess_factor && - ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { - if (!is_tess_inner) { - LLVMBuildStore(ctx->ac.builder, value, /* outer */ - ctx->invoc0_tess_factors[chan]); - } else if (chan < 2) { - LLVMBuildStore(ctx->ac.builder, value, /* inner */ - ctx->invoc0_tess_factors[4 + chan]); - } - } - } + sscreen->ws->buffer_unmap(shader->bo->buf); + ac_rtld_close(&binary); - if (writemask == 0xF && !is_tess_factor) { - LLVMValueRef value = ac_build_gather_values(&ctx->ac, - values, 4); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, - base, 0, ac_glc, false); - } + return ok; } -LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, - unsigned input_index, - unsigned vtx_offset_param, - LLVMTypeRef type, - unsigned swizzle) +static void si_shader_dump_disassembly(struct si_screen *screen, + const struct si_shader_binary *binary, + enum pipe_shader_type shader_type, + unsigned wave_size, + struct pipe_debug_callback *debug, + const char *name, FILE *file) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct si_shader *shader = ctx->shader; - LLVMValueRef vtx_offset, soffset; - struct tgsi_shader_info *info = &shader->selector->info; - unsigned semantic_name = info->input_semantic_name[input_index]; - unsigned semantic_index = info->input_semantic_index[input_index]; - unsigned param; - LLVMValueRef value; - - param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - unsigned index = vtx_offset_param; + struct ac_rtld_binary rtld_binary; - switch (index / 2) { - case 0: - vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset, - index % 2 ? 16 : 0, 16); - break; - case 1: - vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset, - index % 2 ? 16 : 0, 16); - break; - case 2: - vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset, - index % 2 ? 16 : 0, 16); - break; - default: - assert(0); - return NULL; - } + if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size })) + return; - unsigned offset = param * 4 + swizzle; - vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, - LLVMConstInt(ctx->i32, offset, false), ""); + const char *disasm; + size_t nbytes; - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - if (llvm_type_is_64bit(ctx, type)) { - ptr = LLVMBuildGEP(ctx->ac.builder, ptr, - &ctx->ac.i32_1, 1, ""); - LLVMValueRef values[2] = { - value, - LLVMBuildLoad(ctx->ac.builder, ptr, "") - }; - value = ac_build_gather_values(&ctx->ac, values, 2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); - } + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + goto out; - /* GFX6: input load from the ESGS ring in memory. */ - if (swizzle == ~0) { - LLVMValueRef values[TGSI_NUM_CHANNELS]; - unsigned chan; - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, - type, chan); - } - return ac_build_gather_values(&ctx->ac, values, - TGSI_NUM_CHANNELS); - } + if (nbytes > INT_MAX) + goto out; - /* Get the vertex offset parameter on GFX6. */ - LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param]; + if (debug && debug->debug_message) { + /* Very long debug messages are cut off, so send the + * disassembly one line at a time. This causes more + * overhead, but on the plus side it simplifies + * parsing of resulting logs. + */ + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly Begin"); - vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, - LLVMConstInt(ctx->i32, 4, 0), ""); + uint64_t line = 0; + while (line < nbytes) { + int count = nbytes - line; + const char *nl = memchr(disasm + line, '\n', nbytes - line); + if (nl) + count = nl - (disasm + line); - soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); + if (count) { + pipe_debug_message(debug, SHADER_INFO, + "%.*s", count, disasm + line); + } - value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, - vtx_offset, soffset, 0, ac_glc, true, false); - if (llvm_type_is_64bit(ctx, type)) { - LLVMValueRef value2; - soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); + line += count + 1; + } - value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, - ctx->i32_0, vtx_offset, soffset, - 0, ac_glc, true, false); - return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); + pipe_debug_message(debug, SHADER_INFO, + "Shader Disassembly End"); } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); -} -static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (llvm_type_is_64bit(ctx, type)) - offset *= 2; - - offset += component; - value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4, - vertex_index, type, offset); + if (file) { + fprintf(file, "Shader %s disassembly:\n", name); + fprintf(file, "%*s", (int)nbytes, disasm); } - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); +out: + ac_rtld_close(&rtld_binary); } -static LLVMValueRef fetch_input_gs( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle_in) +static void si_calculate_max_simd_waves(struct si_shader *shader) { - struct si_shader_context *ctx = si_shader_context(bld_base); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - unsigned swizzle = swizzle_in & 0xffff; - - unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; - if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) - return si_get_primitive_id(ctx, swizzle); - - if (!reg->Register.Dimension) - return NULL; - - return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index, - reg->Dimension.Index, - tgsi2llvmtype(bld_base, type), - swizzle); -} + struct si_screen *sscreen = shader->selector->screen; + struct ac_shader_config *conf = &shader->config; + unsigned num_inputs = shader->selector->info.num_inputs; + unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; + unsigned lds_per_wave = 0; + unsigned max_simd_waves; -static int lookup_interp_param_index(unsigned interpolate, unsigned location) -{ - switch (interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - return 0; + max_simd_waves = sscreen->info.max_wave64_per_simd; - case TGSI_INTERPOLATE_LINEAR: - if (location == TGSI_INTERPOLATE_LOC_SAMPLE) - return SI_PARAM_LINEAR_SAMPLE; - else if (location == TGSI_INTERPOLATE_LOC_CENTROID) - return SI_PARAM_LINEAR_CENTROID; - else - return SI_PARAM_LINEAR_CENTER; + /* Compute LDS usage for PS. */ + switch (shader->selector->type) { + case PIPE_SHADER_FRAGMENT: + /* The minimum usage per wave is (num_inputs * 48). The maximum + * usage is (num_inputs * 48 * 16). + * We can get anything in between and it varies between waves. + * + * The 48 bytes per input for a single primitive is equal to + * 4 bytes/component * 4 components/input * 3 points. + * + * Other stages don't know the size at compile time or don't + * allocate LDS per wave, but instead they do it per thread group. + */ + lds_per_wave = conf->lds_size * lds_increment + + align(num_inputs * 48, lds_increment); break; - case TGSI_INTERPOLATE_COLOR: - case TGSI_INTERPOLATE_PERSPECTIVE: - if (location == TGSI_INTERPOLATE_LOC_SAMPLE) - return SI_PARAM_PERSP_SAMPLE; - else if (location == TGSI_INTERPOLATE_LOC_CENTROID) - return SI_PARAM_PERSP_CENTROID; - else - return SI_PARAM_PERSP_CENTER; + case PIPE_SHADER_COMPUTE: + if (shader->selector) { + unsigned max_workgroup_size = + si_get_max_workgroup_size(shader); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, + sscreen->compute_wave_size); + } break; - default: - fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); - return -1; - } -} - -static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, - unsigned attr_index, unsigned chan, - LLVMValueRef prim_mask, - LLVMValueRef i, LLVMValueRef j) -{ - if (i || j) { - return ac_build_fs_interp(&ctx->ac, - LLVMConstInt(ctx->i32, chan, 0), - LLVMConstInt(ctx->i32, attr_index, 0), - prim_mask, i, j); - } - return ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - LLVMConstInt(ctx->i32, chan, 0), - LLVMConstInt(ctx->i32, attr_index, 0), - prim_mask); -} - -/** - * Interpolate a fragment shader input. - * - * @param ctx context - * @param input_index index of the input in hardware - * @param semantic_name TGSI_SEMANTIC_* - * @param semantic_index semantic index - * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) - * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) - * @param interp_param interpolation weights (i,j) - * @param prim_mask SI_PARAM_PRIM_MASK - * @param face SI_PARAM_FRONT_FACE - * @param result the return value (4 components) - */ -static void interp_fs_input(struct si_shader_context *ctx, - unsigned input_index, - unsigned semantic_name, - unsigned semantic_index, - unsigned num_interp_inputs, - unsigned colors_read_mask, - LLVMValueRef interp_param, - LLVMValueRef prim_mask, - LLVMValueRef face, - LLVMValueRef result[4]) -{ - LLVMValueRef i = NULL, j = NULL; - unsigned chan; - - /* fs.constant returns the param from the middle vertex, so it's not - * really useful for flat shading. It's meant to be used for custom - * interpolation (but the intrinsic can't fetch from the other two - * vertices). - * - * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state - * to do the right thing. The only reason we use fs.constant is that - * fs.interp cannot be used on integers, because they can be equal - * to NaN. - * - * When interp is false we will use fs.constant or for newer llvm, - * amdgcn.interp.mov. - */ - bool interp = interp_param != NULL; - - if (interp) { - interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, - LLVMVectorType(ctx->f32, 2), ""); - - i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->i32_0, ""); - j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->i32_1, ""); + default:; } - if (semantic_name == TGSI_SEMANTIC_COLOR && - ctx->shader->key.part.ps.prolog.color_two_side) { - LLVMValueRef is_face_positive; - - /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", - * otherwise it's at offset "num_inputs". - */ - unsigned back_attr_offset = num_interp_inputs; - if (semantic_index == 1 && colors_read_mask & 0xf) - back_attr_offset += 1; - - is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - face, ctx->i32_0, ""); - - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef front, back; - - front = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - back = si_build_fs_interp(ctx, - back_attr_offset, chan, - prim_mask, i, j); - - result[chan] = LLVMBuildSelect(ctx->ac.builder, - is_face_positive, - front, - back, - ""); - } - } else if (semantic_name == TGSI_SEMANTIC_FOG) { - result[0] = si_build_fs_interp(ctx, input_index, - 0, prim_mask, i, j); - result[1] = - result[2] = LLVMConstReal(ctx->f32, 0.0f); - result[3] = LLVMConstReal(ctx->f32, 1.0f); - } else { - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - result[chan] = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - } + /* Compute the per-SIMD wave counts. */ + if (conf->num_sgprs) { + max_simd_waves = + MIN2(max_simd_waves, + sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); } -} -void si_llvm_load_input_fs( - struct si_shader_context *ctx, - unsigned input_index, - LLVMValueRef out[4]) -{ - struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - LLVMValueRef main_fn = ctx->main_fn; - LLVMValueRef interp_param = NULL; - int interp_param_idx; - enum tgsi_semantic semantic_name = info->input_semantic_name[input_index]; - unsigned semantic_index = info->input_semantic_index[input_index]; - enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index]; - enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index]; - - /* Get colors from input VGPRs (set by the prolog). */ - if (semantic_name == TGSI_SEMANTIC_COLOR) { - unsigned colors_read = shader->selector->info.colors_read; - unsigned mask = colors_read >> (semantic_index * 4); - unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + - (semantic_index ? util_bitcount(colors_read & 0xf) : 0); - LLVMValueRef undef = LLVMGetUndef(ctx->f32); - - out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - return; + if (conf->num_vgprs) { + /* Always print wave limits as Wave64, so that we can compare + * Wave32 and Wave64 with shader-db fairly. */ + unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; + max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); } - interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc); - if (interp_param_idx == -1) - return; - else if (interp_param_idx) { - interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); - } + /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above + * 16KB makes some SIMDs unoccupied). + * + * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. + */ + unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024; + unsigned max_lds_per_simd = max_lds_size / 4; + if (lds_per_wave) + max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); - interp_fs_input(ctx, input_index, semantic_name, - semantic_index, 0, /* this param is unused */ - shader->selector->info.colors_read, interp_param, - ctx->abi.prim_mask, - LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), - &out[0]); + shader->info.max_simd_waves = max_simd_waves; } -static void declare_input_fs( - struct si_shader_context *ctx, - unsigned input_index, - const struct tgsi_full_declaration *decl, - LLVMValueRef out[4]) +void si_shader_dump_stats_for_shader_db(struct si_screen *screen, + struct si_shader *shader, + struct pipe_debug_callback *debug) { - si_llvm_load_input_fs(ctx, input_index, out); -} + const struct ac_shader_config *conf = &shader->config; -LLVMValueRef si_get_sample_id(struct si_shader_context *ctx) -{ - return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4); + if (screen->options.debug_disassembly) + si_shader_dump_disassembly(screen, &shader->binary, + shader->selector->type, + si_get_shader_wave_size(shader), + debug, "main", NULL); + + pipe_debug_message(debug, SHADER_INFO, + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " + "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " + "Spilled VGPRs: %d PrivMem VGPRs: %d", + conf->num_sgprs, conf->num_vgprs, + si_get_shader_binary_size(screen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, + shader->info.max_simd_waves, conf->spilled_sgprs, + conf->spilled_vgprs, shader->info.private_mem_vgprs); } -static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) +static void si_shader_dump_stats(struct si_screen *sscreen, + struct si_shader *shader, + FILE *file, + bool check_debug_option) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - /* For non-indexed draws, the base vertex set by the driver - * (for direct draws) or the CP (for indirect draws) is the - * first vertex ID, but GLSL expects 0 to be returned. - */ - LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, - ctx->param_vs_state_bits); - LLVMValueRef indexed; + const struct ac_shader_config *conf = &shader->config; - indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, ""); - indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, ""); + if (!check_debug_option || + si_can_dump_shader(sscreen, shader->selector->type)) { + if (shader->selector->type == PIPE_SHADER_FRAGMENT) { + fprintf(file, "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } - return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex, - ctx->i32_0, ""); + fprintf(file, "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "Private memory VGPRs: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, + conf->spilled_sgprs, conf->spilled_vgprs, + shader->info.private_mem_vgprs, + si_get_shader_binary_size(sscreen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, + shader->info.max_simd_waves); + } } -static LLVMValueRef get_block_size(struct ac_shader_abi *abi) +const char *si_get_shader_name(const struct si_shader *shader) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - LLVMValueRef values[3]; - LLVMValueRef result; - unsigned i; - unsigned *properties = ctx->shader->selector->info.properties; - - if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { - unsigned sizes[3] = { - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] - }; - - for (i = 0; i < 3; ++i) - values[i] = LLVMConstInt(ctx->i32, sizes[i], 0); - - result = ac_build_gather_values(&ctx->ac, values, 3); - } else { - result = LLVMGetParam(ctx->main_fn, ctx->param_block_size); + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_es) + return "Vertex Shader as ES"; + else if (shader->key.as_ls) + return "Vertex Shader as LS"; + else if (shader->key.opt.vs_as_prim_discard_cs) + return "Vertex Shader as Primitive Discard CS"; + else if (shader->key.as_ngg) + return "Vertex Shader as ESGS"; + else + return "Vertex Shader as VS"; + case PIPE_SHADER_TESS_CTRL: + return "Tessellation Control Shader"; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) + return "Tessellation Evaluation Shader as ES"; + else if (shader->key.as_ngg) + return "Tessellation Evaluation Shader as ESGS"; + else + return "Tessellation Evaluation Shader as VS"; + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case PIPE_SHADER_FRAGMENT: + return "Pixel Shader"; + case PIPE_SHADER_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; } - - return result; -} - -/** - * Load a dword from a constant buffer. - */ -static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, - LLVMValueRef resource, - LLVMValueRef offset) -{ - return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, - 0, 0, true, true); } -static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) +void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, + struct pipe_debug_callback *debug, + FILE *file, bool check_debug_option) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); - LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); - LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); - - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), ""); - LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); - - LLVMValueRef pos[4] = { - buffer_load_const(ctx, resource, offset0), - buffer_load_const(ctx, resource, offset1), - LLVMConstReal(ctx->f32, 0), - LLVMConstReal(ctx->f32, 0) - }; - - return ac_build_gather_values(&ctx->ac, pos, 4); -} + enum pipe_shader_type shader_type = shader->selector->type; -static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - return ac_to_integer(&ctx->ac, abi->sample_coverage); -} + if (!check_debug_option || + si_can_dump_shader(sscreen, shader_type)) + si_dump_shader_key(shader, file); -static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef coord[4] = { - LLVMGetParam(ctx->main_fn, ctx->param_tes_u), - LLVMGetParam(ctx->main_fn, ctx->param_tes_v), - ctx->ac.f32_0, - ctx->ac.f32_0 - }; + if (!check_debug_option && shader->binary.llvm_ir_string) { + if (shader->previous_stage && + shader->previous_stage->binary.llvm_ir_string) { + fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", + si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); + } - /* For triangles, the vector should be (u, v, 1-u-v). */ - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == - PIPE_PRIM_TRIANGLES) { - coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, - LLVMBuildFAdd(ctx->ac.builder, - coord[0], coord[1], ""), ""); + fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", + si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->binary.llvm_ir_string); } - return ac_build_gather_values(&ctx->ac, coord, 4); -} - -static LLVMValueRef load_tess_level(struct si_shader_context *ctx, - unsigned semantic_name) -{ - LLVMValueRef base, addr; - int param = si_shader_io_get_unique_index_patch(semantic_name, 0); - - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, - LLVMConstInt(ctx->i32, param, 0)); + if (!check_debug_option || + (si_can_dump_shader(sscreen, shader_type) && + !(sscreen->debug_flags & DBG(NO_ASM)))) { + unsigned wave_size = si_get_shader_wave_size(shader); - return buffer_load(&ctx->bld_base, ctx->f32, - ~0, ctx->tess_offchip_ring, base, addr, true); + fprintf(file, "\n%s:\n", si_get_shader_name(shader)); -} + if (shader->prolog) + si_shader_dump_disassembly(sscreen, &shader->prolog->binary, + shader_type, wave_size, debug, "prolog", file); + if (shader->previous_stage) + si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, + shader_type, wave_size, debug, "previous stage", file); + if (shader->prolog2) + si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, + shader_type, wave_size, debug, "prolog2", file); -static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, - unsigned semantic_name) -{ - LLVMValueRef buf, slot, val[4]; - int i, offset; + si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, + wave_size, debug, "main", file); - slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); - buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); - buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); - offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0; + if (shader->epilog) + si_shader_dump_disassembly(sscreen, &shader->epilog->binary, + shader_type, wave_size, debug, "epilog", file); + fprintf(file, "\n"); + } - for (i = 0; i < 4; i++) - val[i] = buffer_load_const(ctx, buf, - LLVMConstInt(ctx->i32, (offset + i) * 4, 0)); - return ac_build_gather_values(&ctx->ac, val, 4); + si_shader_dump_stats(sscreen, shader, file, check_debug_option); } -static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, - unsigned varying_id, - bool load_default_state) +static void si_dump_shader_key_vs(const struct si_shader_key *key, + const struct si_vs_prolog_bits *prolog, + const char *prefix, FILE *f) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - unsigned semantic_name; - - if (load_default_state) { - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; - break; - default: - unreachable("unknown tess level"); - } - return load_tess_level_default(ctx, semantic_name); - } + fprintf(f, " %s.instance_divisor_is_one = %u\n", + prefix, prolog->instance_divisor_is_one); + fprintf(f, " %s.instance_divisor_is_fetched = %u\n", + prefix, prolog->instance_divisor_is_fetched); + fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", + prefix, prolog->unpack_instance_id_from_vertex_id); + fprintf(f, " %s.ls_vgpr_fix = %u\n", + prefix, prolog->ls_vgpr_fix); - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESSINNER; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESSOUTER; - break; - default: - unreachable("unknown tess level"); + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); + fprintf(f, " mono.vs.fix_fetch = {"); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) { + union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + if (i) + fprintf(f, ", "); + if (!fix.bits) + fprintf(f, "0"); + else + fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, + fix.u.num_channels_m1, fix.u.format); } - - return load_tess_level(ctx, semantic_name); - -} - -static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->type == PIPE_SHADER_TESS_CTRL) - return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6); - else if (ctx->type == PIPE_SHADER_TESS_EVAL) - return get_num_tcs_out_vertices(ctx); - else - unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); + fprintf(f, "}\n"); } -void si_load_system_value(struct si_shader_context *ctx, - unsigned index, - const struct tgsi_full_declaration *decl) +static void si_dump_shader_key(const struct si_shader *shader, FILE *f) { - LLVMValueRef value = 0; - - assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES); - - switch (decl->Semantic.Name) { - case TGSI_SEMANTIC_INSTANCEID: - value = ctx->abi.instance_id; - break; - - case TGSI_SEMANTIC_VERTEXID: - value = LLVMBuildAdd(ctx->ac.builder, - ctx->abi.vertex_id, - ctx->abi.base_vertex, ""); - break; - - case TGSI_SEMANTIC_VERTEXID_NOBASE: - /* Unused. Clarify the meaning in indexed vs. non-indexed - * draws if this is ever used again. */ - assert(false); - break; - - case TGSI_SEMANTIC_BASEVERTEX: - value = get_base_vertex(&ctx->abi); - break; - - case TGSI_SEMANTIC_BASEINSTANCE: - value = ctx->abi.start_instance; - break; - - case TGSI_SEMANTIC_DRAWID: - value = ctx->abi.draw_id; - break; - - case TGSI_SEMANTIC_INVOCATIONID: - if (ctx->type == PIPE_SHADER_TESS_CTRL) { - value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); - } else if (ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->screen->info.chip_class >= GFX10) { - value = LLVMBuildAnd(ctx->ac.builder, - ctx->abi.gs_invocation_id, - LLVMConstInt(ctx->i32, 127, 0), ""); - } else { - value = ctx->abi.gs_invocation_id; - } - } else { - assert(!"INVOCATIONID not implemented"); - } - break; - - case TGSI_SEMANTIC_POSITION: - { - LLVMValueRef pos[4] = { - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT), - ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)), - }; - value = ac_build_gather_values(&ctx->ac, pos, 4); - break; - } - - case TGSI_SEMANTIC_FACE: - value = ctx->abi.front_face; - break; - - case TGSI_SEMANTIC_SAMPLEID: - value = si_get_sample_id(ctx); - break; - - case TGSI_SEMANTIC_SAMPLEPOS: { - LLVMValueRef pos[4] = { - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), - LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), - LLVMConstReal(ctx->f32, 0), - LLVMConstReal(ctx->f32, 0) - }; - pos[0] = ac_build_fract(&ctx->ac, pos[0], 32); - pos[1] = ac_build_fract(&ctx->ac, pos[1], 32); - value = ac_build_gather_values(&ctx->ac, pos, 4); - break; - } - - case TGSI_SEMANTIC_SAMPLEMASK: - /* This can only occur with the OpenGL Core profile, which - * doesn't support smoothing. - */ - value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE); - break; - - case TGSI_SEMANTIC_TESSCOORD: - value = si_load_tess_coord(&ctx->abi); - break; - - case TGSI_SEMANTIC_VERTICESIN: - value = si_load_patch_vertices_in(&ctx->abi); - break; - - case TGSI_SEMANTIC_TESSINNER: - case TGSI_SEMANTIC_TESSOUTER: - value = load_tess_level(ctx, decl->Semantic.Name); - break; - - case TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL: - case TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL: - value = load_tess_level_default(ctx, decl->Semantic.Name); - break; - - case TGSI_SEMANTIC_PRIMID: - value = si_get_primitive_id(ctx, 0); - break; + const struct si_shader_key *key = &shader->key; + enum pipe_shader_type shader_type = shader->selector->type; - case TGSI_SEMANTIC_GRID_SIZE: - value = ctx->abi.num_work_groups; - break; + fprintf(f, "SHADER KEY\n"); - case TGSI_SEMANTIC_BLOCK_SIZE: - value = get_block_size(&ctx->abi); + switch (shader_type) { + case PIPE_SHADER_VERTEX: + si_dump_shader_key_vs(key, &key->part.vs.prolog, + "part.vs.prolog", f); + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ls = %u\n", key->as_ls); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", + key->mono.u.vs_export_prim_id); + fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", + key->opt.vs_as_prim_discard_cs); + fprintf(f, " opt.cs_prim_type = %s\n", + tgsi_primitive_names[key->opt.cs_prim_type]); + fprintf(f, " opt.cs_indexed = %u\n", + key->opt.cs_indexed); + fprintf(f, " opt.cs_instancing = %u\n", + key->opt.cs_instancing); + fprintf(f, " opt.cs_primitive_restart = %u\n", + key->opt.cs_primitive_restart); + fprintf(f, " opt.cs_provoking_vertex_first = %u\n", + key->opt.cs_provoking_vertex_first); + fprintf(f, " opt.cs_need_correct_orientation = %u\n", + key->opt.cs_need_correct_orientation); + fprintf(f, " opt.cs_cull_front = %u\n", + key->opt.cs_cull_front); + fprintf(f, " opt.cs_cull_back = %u\n", + key->opt.cs_cull_back); + fprintf(f, " opt.cs_cull_z = %u\n", + key->opt.cs_cull_z); + fprintf(f, " opt.cs_halfz_clip_space = %u\n", + key->opt.cs_halfz_clip_space); break; - case TGSI_SEMANTIC_BLOCK_ID: - { - LLVMValueRef values[3]; - - for (int i = 0; i < 3; i++) { - values[i] = ctx->i32_0; - if (ctx->abi.workgroup_ids[i]) { - values[i] = ctx->abi.workgroup_ids[i]; - } + case PIPE_SHADER_TESS_CTRL: + if (shader->selector->screen->info.chip_class >= GFX9) { + si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, + "part.tcs.ls_prolog", f); } - value = ac_build_gather_values(&ctx->ac, values, 3); - break; - } - - case TGSI_SEMANTIC_THREAD_ID: - value = ctx->abi.local_invocation_ids; - break; - - case TGSI_SEMANTIC_HELPER_INVOCATION: - value = ac_build_load_helper_invocation(&ctx->ac); - break; - - case TGSI_SEMANTIC_SUBGROUP_SIZE: - value = LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0); + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); + fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); break; - case TGSI_SEMANTIC_SUBGROUP_INVOCATION: - value = ac_get_thread_id(&ctx->ac); + case PIPE_SHADER_TESS_EVAL: + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", + key->mono.u.vs_export_prim_id); break; - case TGSI_SEMANTIC_SUBGROUP_EQ_MASK: - { - LLVMValueRef id = ac_get_thread_id(&ctx->ac); - if (ctx->ac.wave_size == 64) - id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); - value = LLVMBuildShl(ctx->ac.builder, - LLVMConstInt(ctx->ac.iN_wavemask, 1, 0), id, ""); - if (ctx->ac.wave_size == 32) - value = LLVMBuildZExt(ctx->ac.builder, value, ctx->i64, ""); - value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); - break; - } + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + break; - case TGSI_SEMANTIC_SUBGROUP_GE_MASK: - case TGSI_SEMANTIC_SUBGROUP_GT_MASK: - case TGSI_SEMANTIC_SUBGROUP_LE_MASK: - case TGSI_SEMANTIC_SUBGROUP_LT_MASK: - { - LLVMValueRef id = ac_get_thread_id(&ctx->ac); - if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK || - decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) { - /* All bits set except LSB */ - value = LLVMConstInt(ctx->ac.iN_wavemask, -2, 0); - } else { - /* All bits set */ - value = LLVMConstInt(ctx->ac.iN_wavemask, -1, 0); + if (shader->selector->screen->info.chip_class >= GFX9 && + key->part.gs.es->type == PIPE_SHADER_VERTEX) { + si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, + "part.gs.vs_prolog", f); } - if (ctx->ac.wave_size == 64) - id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); - value = LLVMBuildShl(ctx->ac.builder, value, id, ""); - if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK || - decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK) - value = LLVMBuildNot(ctx->ac.builder, value, ""); - if (ctx->ac.wave_size == 32) - value = LLVMBuildZExt(ctx->ac.builder, value, ctx->i64, ""); - value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); + fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); + fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); + fprintf(f, " as_ngg = %u\n", key->as_ngg); break; - } - case TGSI_SEMANTIC_CS_USER_DATA_AMD: - value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data); + case PIPE_SHADER_COMPUTE: break; - default: - assert(!"unknown system value"); - return; - } - - ctx->system_values[index] = value; -} - -void si_declare_compute_memory(struct si_shader_context *ctx) -{ - struct si_shader_selector *sel = ctx->shader->selector; - unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - - LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS); - LLVMValueRef var; - - assert(!ctx->ac.lds); - - var = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->i8, lds_size), - "compute_lds", - AC_ADDR_SPACE_LDS); - LLVMSetAlignment(var, 64 * 1024); - - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); -} - -void si_tgsi_declare_compute_memory(struct si_shader_context *ctx, - const struct tgsi_full_declaration *decl) -{ - assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); - assert(decl->Range.First == decl->Range.Last); - - si_declare_compute_memory(ctx); -} - -static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) -{ - LLVMValueRef ptr = - LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); - struct si_shader_selector *sel = ctx->shader->selector; - - /* Do the bounds checking with a descriptor, because - * doing computation and manual bounds checking of 64-bit - * addresses generates horrible VALU code with very high - * VGPR usage and very low SIMD occupancy. - */ - ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); - - LLVMValueRef desc0, desc1; - desc0 = ptr; - desc1 = LLVMConstInt(ctx->i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - LLVMValueRef desc_elems[] = { - desc0, - desc1, - LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), - LLVMConstInt(ctx->i32, rsrc3, false) - }; - - return ac_build_gather_values(&ctx->ac, desc_elems, 4); -} - -static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) -{ - LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_const_and_shader_buffers); - - return ac_build_load_to_sgpr(&ctx->ac, list_ptr, - LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0)); -} - -static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_selector *sel = ctx->shader->selector; - - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); - - if (sel->info.const_buffers_declared == 1 && - sel->info.shader_buffers_declared == 0) { - return load_const_buffer_desc_fast_path(ctx); - } - - index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); - - return ac_build_load_to_sgpr(&ctx->ac, ptr, index); -} - -static LLVMValueRef -load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_const_and_shader_buffers); - - index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0), - index, ""); - - return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); -} - -static LLVMValueRef fetch_constant( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct si_shader_selector *sel = ctx->shader->selector; - const struct tgsi_ind_register *ireg = ®->Indirect; - unsigned buf, idx; - unsigned swizzle = swizzle_in & 0xffff; - - LLVMValueRef addr, bufp; - - if (swizzle_in == LP_CHAN_ALL) { - unsigned chan; - LLVMValueRef values[4]; - for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) - values[chan] = fetch_constant(bld_base, reg, type, chan); - - return ac_build_gather_values(&ctx->ac, values, 4); - } - - /* Split 64-bit loads. */ - if (tgsi_type_is_64bit(type)) { - LLVMValueRef lo, hi; - - lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); - hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16)); - return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - lo, hi); - } - - idx = reg->Register.Index * 4 + swizzle; - if (reg->Register.Indirect) { - addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); - } else { - addr = LLVMConstInt(ctx->i32, idx * 4, 0); - } - - /* Fast path when user data SGPRs point to constant buffer 0 directly. */ - if (sel->info.const_buffers_declared == 1 && - sel->info.shader_buffers_declared == 0) { - LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx); - LLVMValueRef result = buffer_load_const(ctx, desc, addr); - return bitcast(bld_base, type, result); - } - - assert(reg->Register.Dimension); - buf = reg->Dimension.Index; - - if (reg->Dimension.Indirect) { - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); - LLVMValueRef index; - index = si_get_bounded_indirect_index(ctx, ®->DimIndirect, - reg->Dimension.Index, - ctx->num_const_buffers); - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); - bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index); - } else - bufp = load_const_buffer_desc(ctx, buf); - - return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); -} - -/* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) -{ - LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); - unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; - unsigned chan; - bool is_int8, is_int10; - - /* Default is 0xf. Adjusted below depending on the format. */ - args->enabled_channels = 0xf; /* writemask */ - - /* Specify whether the EXEC mask represents the valid mask */ - args->valid_mask = 0; - - /* Specify whether this is the last export */ - args->done = 0; - - /* Specify the target we are exporting */ - args->target = target; - - if (ctx->type == PIPE_SHADER_FRAGMENT) { - const struct si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; - int cbuf = target - V_008DFC_SQ_EXP_MRT; - - assert(cbuf >= 0 && cbuf < 8); - spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; - is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; - } - - args->compr = false; - args->out[0] = f32undef; - args->out[1] = f32undef; - args->out[2] = f32undef; - args->out[3] = f32undef; - - LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; - LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], - unsigned bits, bool hi) = NULL; - - switch (spi_shader_col_format) { - case V_028714_SPI_SHADER_ZERO: - args->enabled_channels = 0; /* writemask */ - args->target = V_008DFC_SQ_EXP_NULL; - break; - - case V_028714_SPI_SHADER_32_R: - args->enabled_channels = 1; /* writemask */ - args->out[0] = values[0]; - break; - - case V_028714_SPI_SHADER_32_GR: - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[1]; - break; - - case V_028714_SPI_SHADER_32_AR: - if (ctx->screen->info.chip_class >= GFX10) { - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[3]; - } else { - args->enabled_channels = 0x9; /* writemask */ - args->out[0] = values[0]; - args->out[3] = values[3]; - } - break; - - case V_028714_SPI_SHADER_FP16_ABGR: - packf = ac_build_cvt_pkrtz_f16; - break; - - case V_028714_SPI_SHADER_UNORM16_ABGR: - packf = ac_build_cvt_pknorm_u16; - break; - - case V_028714_SPI_SHADER_SNORM16_ABGR: - packf = ac_build_cvt_pknorm_i16; - break; - - case V_028714_SPI_SHADER_UINT16_ABGR: - packi = ac_build_cvt_pk_u16; - break; - - case V_028714_SPI_SHADER_SINT16_ABGR: - packi = ac_build_cvt_pk_i16; - break; - - case V_028714_SPI_SHADER_32_ABGR: - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - break; - } - - /* Pack f16 or norm_i16/u16. */ - if (packf) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - values[2 * chan], - values[2 * chan + 1] - }; - LLVMValueRef packed; - - packed = packf(&ctx->ac, pack_args); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } - /* Pack i16/u16. */ - if (packi) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - ac_to_integer(&ctx->ac, values[2 * chan]), - ac_to_integer(&ctx->ac, values[2 * chan + 1]) - }; - LLVMValueRef packed; - - packed = packi(&ctx->ac, pack_args, - is_int8 ? 8 : is_int10 ? 10 : 16, - chan == 1); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } -} - -static void si_alpha_test(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { - static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { - [PIPE_FUNC_LESS] = LLVMRealOLT, - [PIPE_FUNC_EQUAL] = LLVMRealOEQ, - [PIPE_FUNC_LEQUAL] = LLVMRealOLE, - [PIPE_FUNC_GREATER] = LLVMRealOGT, - [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, - [PIPE_FUNC_GEQUAL] = LLVMRealOGE, - }; - LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; - assert(cond); - - LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF); - LLVMValueRef alpha_pass = - LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); - ac_build_kill_if_false(&ctx->ac, alpha_pass); - } else { - ac_build_kill_if_false(&ctx->ac, ctx->i1false); - } -} - -static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha, - unsigned samplemask_param) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef coverage; - - /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ - coverage = LLVMGetParam(ctx->main_fn, - samplemask_param); - coverage = ac_to_integer(&ctx->ac, coverage); - - coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", - ctx->i32, - &coverage, 1, AC_FUNC_ATTR_READNONE); - - coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, - ctx->f32, ""); - - coverage = LLVMBuildFMul(ctx->ac.builder, coverage, - LLVMConstReal(ctx->f32, - 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); - - return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); -} - -static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, - struct ac_export_args *pos, LLVMValueRef *out_elts) -{ - unsigned reg_index; - unsigned chan; - unsigned const_chan; - LLVMValueRef base_elt; - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); - LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32, - SI_VS_CONST_CLIP_PLANES, 0); - LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); - - for (reg_index = 0; reg_index < 2; reg_index ++) { - struct ac_export_args *args = &pos[2 + reg_index]; - - args->out[0] = - args->out[1] = - args->out[2] = - args->out[3] = LLVMConstReal(ctx->f32, 0.0f); - - /* Compute dot products of position and user clip plane vectors */ - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) { - LLVMValueRef addr = - LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 + - const_chan) * 4, 0); - base_elt = buffer_load_const(ctx, const_resource, - addr); - args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, - out_elts[const_chan], args->out[chan]); - } - } - - args->enabled_channels = 0xf; - args->valid_mask = 0; - args->done = 0; - args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; - args->compr = 0; - } -} - -static void si_dump_streamout(struct pipe_stream_output_info *so) -{ - unsigned i; - - if (so->num_outputs) - fprintf(stderr, "STREAMOUT\n"); - - for (i = 0; i < so->num_outputs; i++) { - unsigned mask = ((1 << so->output[i].num_components) - 1) << - so->output[i].start_component; - fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", - i, so->output[i].output_buffer, - so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, - so->output[i].register_index, - mask & 1 ? "x" : "", - mask & 2 ? "y" : "", - mask & 4 ? "z" : "", - mask & 8 ? "w" : ""); - } -} - -void si_emit_streamout_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out) -{ - unsigned buf_idx = stream_out->output_buffer; - unsigned start = stream_out->start_component; - unsigned num_comps = stream_out->num_components; - LLVMValueRef out[4]; - - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - return; - - /* Load the output as int. */ - for (int j = 0; j < num_comps; j++) { - assert(stream_out->stream == shader_out->vertex_stream[start + j]); - - out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); - } - - /* Pack the output. */ - LLVMValueRef vdata = NULL; - - switch (num_comps) { - case 1: /* as i32 */ - vdata = out[0]; - break; - case 2: /* as v2i32 */ - case 3: /* as v3i32 */ - if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { - vdata = ac_build_gather_values(&ctx->ac, out, num_comps); - break; - } - /* as v4i32 (aligned to 4) */ - out[3] = LLVMGetUndef(ctx->i32); - /* fall through */ - case 4: /* as v4i32 */ - vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); - break; - } - - ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], - vdata, num_comps, - so_write_offsets[buf_idx], - ctx->i32_0, - stream_out->dst_offset * 4, ac_glc | ac_slc, false); -} - -/** - * Write streamout data to buffers for vertex stream @p stream (different - * vertex streams can occur for GS copy shaders). - */ -static void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream) -{ - struct si_shader_selector *sel = ctx->shader->selector; - struct pipe_stream_output_info *so = &sel->so; - LLVMBuilderRef builder = ctx->ac.builder; - int i; - - /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - LLVMValueRef so_vtx_count = - si_unpack_param(ctx, ctx->param_streamout_config, 16, 7); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* can_emit = tid < so_vtx_count; */ - LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); - - /* Emit the streamout code conditionally. This actually avoids - * out-of-bounds buffer access. The hw tells us via the SGPR - * (so_vtx_count) which threads are allowed to emit streamout data. */ - ac_build_ifcc(&ctx->ac, can_emit, 6501); - { - /* The buffer offset is computed as follows: - * ByteOffset = streamout_offset[buffer_id]*4 + - * (streamout_write_index + thread_id)*stride[buffer_id] + - * attrib_offset - */ - - LLVMValueRef so_write_index = - LLVMGetParam(ctx->main_fn, - ctx->param_streamout_write_index); - - /* Compute (streamout_write_index + thread_id). */ - so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); - - /* Load the descriptor and compute the write offset for each - * enabled buffer. */ - LLVMValueRef so_write_offset[4] = {}; - LLVMValueRef so_buffers[4]; - LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_rw_buffers); - - for (i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->i32, - SI_VS_STREAMOUT_BUF0 + i, 0); - - so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, - ctx->param_streamout_offset[i]); - so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); - - so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, - LLVMConstInt(ctx->i32, so->stride[i]*4, 0), - so_offset); - } - - /* Write streamout data. */ - for (i = 0; i < so->num_outputs; i++) { - unsigned reg = so->output[i].register_index; - - if (reg >= noutput) - continue; - - if (stream != so->output[i].stream) - continue; - - si_emit_streamout_output(ctx, so_buffers, so_write_offset, - &so->output[i], &outputs[reg]); - } - } - ac_build_endif(&ctx->ac, 6501); -} - -static void si_export_param(struct si_shader_context *ctx, unsigned index, - LLVMValueRef *values) -{ - struct ac_export_args args; - - si_llvm_init_export_args(ctx, values, - V_008DFC_SQ_EXP_PARAM + index, &args); - ac_build_export(&ctx->ac, &args); -} - -static void si_build_param_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - struct si_shader *shader = ctx->shader; - unsigned param_count = 0; - - for (unsigned i = 0; i < noutput; i++) { - unsigned semantic_name = outputs[i].semantic_name; - unsigned semantic_index = outputs[i].semantic_index; - - if (outputs[i].vertex_stream[0] != 0 && - outputs[i].vertex_stream[1] != 0 && - outputs[i].vertex_stream[2] != 0 && - outputs[i].vertex_stream[3] != 0) - continue; - - switch (semantic_name) { - case TGSI_SEMANTIC_LAYER: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_PRIMID: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - case TGSI_SEMANTIC_GENERIC: - break; - default: - continue; - } - - if ((semantic_name != TGSI_SEMANTIC_GENERIC || - semantic_index < SI_MAX_IO_GENERIC) && - shader->key.opt.kill_outputs & - (1ull << si_shader_io_get_unique_index(semantic_name, - semantic_index, true))) - continue; - - si_export_param(ctx, param_count, outputs[i].values); - - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count++; - } - - shader->info.nr_param_exports = param_count; -} - -/** - * Vertex color clamping. - * - * This uses a state constant loaded in a user data SGPR and - * an IF statement is added that clamps all colors if the constant - * is true. - */ -static void si_vertex_color_clamping(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; - bool has_colors = false; - - /* Store original colors to alloca variables. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, ""); - LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); - } - has_colors = true; - } - - if (!has_colors) - return; - - /* The state is in the first bit of the user SGPR. */ - LLVMValueRef cond = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits); - cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, ""); - - ac_build_ifcc(&ctx->ac, cond, 6502); - - /* Store clamped colors to alloca variables within the conditional block. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - LLVMBuildStore(ctx->ac.builder, - ac_build_clamp(&ctx->ac, outputs[i].values[j]), - addr[i][j]); - } - } - ac_build_endif(&ctx->ac, 6502); - - /* Load clamped colors */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); - } - } -} - -/* Generate export instructions for hardware VS shader stage or NGG GS stage - * (position and parameter data only). - */ -void si_llvm_export_vs(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - struct si_shader *shader = ctx->shader; - struct ac_export_args pos_args[4] = {}; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; - unsigned pos_idx; - int i; - - si_vertex_color_clamping(ctx, outputs, noutput); - - /* Build position exports. */ - for (i = 0; i < noutput; i++) { - switch (outputs[i].semantic_name) { - case TGSI_SEMANTIC_POSITION: - si_llvm_init_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS, &pos_args[0]); - break; - case TGSI_SEMANTIC_PSIZE: - psize_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_LAYER: - layer_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - viewport_index_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_EDGEFLAG: - edgeflag_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_CLIPDIST: - if (!shader->key.opt.clip_disable) { - unsigned index = 2 + outputs[i].semantic_index; - si_llvm_init_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS + index, - &pos_args[index]); - } - break; - case TGSI_SEMANTIC_CLIPVERTEX: - if (!shader->key.opt.clip_disable) { - si_llvm_emit_clipvertex(ctx, pos_args, - outputs[i].values); - } - break; - } - } - - /* We need to add the position output manually if it's missing. */ - if (!pos_args[0].out[0]) { - pos_args[0].enabled_channels = 0xf; /* writemask */ - pos_args[0].valid_mask = 0; /* EXEC mask */ - pos_args[0].done = 0; /* last export? */ - pos_args[0].target = V_008DFC_SQ_EXP_POS; - pos_args[0].compr = 0; /* COMPR flag */ - pos_args[0].out[0] = ctx->ac.f32_0; /* X */ - pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[0].out[3] = ctx->ac.f32_1; /* W */ - } - - bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && - !shader->key.as_ngg; - - /* Write the misc vector (point size, edgeflag, layer, viewport). */ - if (shader->selector->info.writes_psize || - pos_writes_edgeflag || - shader->selector->info.writes_viewport_index || - shader->selector->info.writes_layer) { - pos_args[1].enabled_channels = shader->selector->info.writes_psize | - (pos_writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2); - - pos_args[1].valid_mask = 0; /* EXEC mask */ - pos_args[1].done = 0; /* last export? */ - pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; - pos_args[1].compr = 0; /* COMPR flag */ - pos_args[1].out[0] = ctx->ac.f32_0; /* X */ - pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[1].out[3] = ctx->ac.f32_0; /* W */ - - if (shader->selector->info.writes_psize) - pos_args[1].out[0] = psize_value; - - if (pos_writes_edgeflag) { - /* The output is a float, but the hw expects an integer - * with the first bit containing the edge flag. */ - edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, - edgeflag_value, - ctx->i32, ""); - edgeflag_value = ac_build_umin(&ctx->ac, - edgeflag_value, - ctx->i32_1); - - /* The LLVM intrinsic expects a float. */ - pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); - } - - if (ctx->screen->info.chip_class >= GFX9) { - /* GFX9 has the layer in out.z[10:0] and the viewport - * index in out.z[19:16]. - */ - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - LLVMValueRef v = viewport_index_value; - - v = ac_to_integer(&ctx->ac, v); - v = LLVMBuildShl(ctx->ac.builder, v, - LLVMConstInt(ctx->i32, 16, 0), ""); - v = LLVMBuildOr(ctx->ac.builder, v, - ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); - pos_args[1].out[2] = ac_to_float(&ctx->ac, v); - pos_args[1].enabled_channels |= 1 << 2; - } - } else { - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - pos_args[1].out[3] = viewport_index_value; - pos_args[1].enabled_channels |= 1 << 3; - } - } - } - - for (i = 0; i < 4; i++) - if (pos_args[i].out[0]) - shader->info.nr_pos_exports++; - - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. - * Setting valid_mask=1 prevents it and has no other effect. - */ - if (ctx->screen->info.family == CHIP_NAVI10 || - ctx->screen->info.family == CHIP_NAVI12 || - ctx->screen->info.family == CHIP_NAVI14) - pos_args[0].valid_mask = 1; - - pos_idx = 0; - for (i = 0; i < 4; i++) { - if (!pos_args[i].out[0]) - continue; - - /* Specify the target we are exporting */ - pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; - - if (pos_idx == shader->info.nr_pos_exports) - /* Specify that this is the last export */ - pos_args[i].done = 1; - - ac_build_export(&ctx->ac, &pos_args[i]); - } - - /* Build parameter exports. */ - si_build_param_exports(ctx, outputs, noutput); -} - -/** - * Forward all outputs from the vertex shader to the TES. This is only used - * for the fixed function TCS. - */ -static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef invocation_id, buffer, buffer_offset; - LLVMValueRef lds_vertex_stride, lds_base; - uint64_t inputs; - - invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - - lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); - lds_base = get_tcs_in_current_patch_offset(ctx); - lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, - lds_base); - - inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; - while (inputs) { - unsigned i = u_bit_scan64(&inputs); - - LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->i32, 4 * i, 0), - ""); - - LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, - get_rel_patch_id(ctx), - invocation_id, - LLVMConstInt(ctx->i32, i, 0)); - - LLVMValueRef value = lshs_lds_load(bld_base, ctx->ac.i32, ~0, lds_ptr); - - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, - buffer_offset, 0, ac_glc, false); - } -} - -static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, - LLVMValueRef rel_patch_id, - LLVMValueRef invocation_id, - LLVMValueRef tcs_out_current_patch_data_offset, - LLVMValueRef invoc0_tf_outer[4], - LLVMValueRef invoc0_tf_inner[2]) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct si_shader *shader = ctx->shader; - unsigned tess_inner_index, tess_outer_index; - LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; - LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; - unsigned stride, outer_comps, inner_comps, i, offset; - - /* Add a barrier before loading tess factors from LDS. */ - if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) - si_llvm_emit_barrier(NULL, bld_base, NULL); - - /* Do this only for invocation 0, because the tess levels are per-patch, - * not per-vertex. - * - * This can't jump, because invocation 0 executes this. It should - * at least mask out the loads and stores for other invocations. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - invocation_id, ctx->i32_0, ""), 6503); - - /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.part.tcs.epilog.prim_mode) { - case PIPE_PRIM_LINES: - stride = 2; /* 2 dwords, 1 vec2 store */ - outer_comps = 2; - inner_comps = 0; - break; - case PIPE_PRIM_TRIANGLES: - stride = 4; /* 4 dwords, 1 vec4 store */ - outer_comps = 3; - inner_comps = 1; - break; - case PIPE_PRIM_QUADS: - stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ - outer_comps = 4; - inner_comps = 2; - break; - default: - assert(0); - return; - } - - for (i = 0; i < 4; i++) { - inner[i] = LLVMGetUndef(ctx->i32); - outer[i] = LLVMGetUndef(ctx->i32); - } - - if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { - /* Tess factors are in VGPRs. */ - for (i = 0; i < outer_comps; i++) - outer[i] = out[i] = invoc0_tf_outer[i]; - for (i = 0; i < inner_comps; i++) - inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; - } else { - /* Load tess_inner and tess_outer from LDS. - * Any invocation can write them, so we can't get them from a temporary. - */ - tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); - tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); - - lds_base = tcs_out_current_patch_data_offset; - lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->i32, - tess_inner_index * 4, 0), ""); - lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->i32, - tess_outer_index * 4, 0), ""); - - for (i = 0; i < outer_comps; i++) { - outer[i] = out[i] = - lshs_lds_load(bld_base, ctx->ac.i32, i, lds_outer); - } - for (i = 0; i < inner_comps; i++) { - inner[i] = out[outer_comps+i] = - lshs_lds_load(bld_base, ctx->ac.i32, i, lds_inner); - } - } - - if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { - /* For isolines, the hardware expects tess factors in the - * reverse order from what GLSL / TGSI specify. - */ - LLVMValueRef tmp = out[0]; - out[0] = out[1]; - out[1] = tmp; - } - - /* Convert the outputs to vectors for stores. */ - vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); - vec1 = NULL; - - if (stride > 4) - vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); - - /* Get the buffer. */ - buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); - - /* Get the offset. */ - tf_base = LLVMGetParam(ctx->main_fn, - ctx->param_tcs_factor_offset); - byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, - LLVMConstInt(ctx->i32, 4 * stride, 0), ""); - - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - rel_patch_id, ctx->i32_0, ""), 6504); - - /* Store the dynamic HS control word. */ - offset = 0; - if (ctx->screen->info.chip_class <= GFX8) { - ac_build_buffer_store_dword(&ctx->ac, buffer, - LLVMConstInt(ctx->i32, 0x80000000, 0), - 1, ctx->i32_0, tf_base, - offset, ac_glc, false); - offset += 4; - } - - ac_build_endif(&ctx->ac, 6504); - - /* Store the tessellation factors. */ - ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, - MIN2(stride, 4), byteoffset, tf_base, - offset, ac_glc, false); - offset += 16; - if (vec1) - ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, - stride - 4, byteoffset, tf_base, - offset, ac_glc, false); - - /* Store the tess factors into the offchip buffer if TES reads them. */ - if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { - LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; - LLVMValueRef tf_inner_offset; - unsigned param_outer, param_inner; - - buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - - param_outer = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSOUTER, 0); - tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->i32, param_outer, 0)); - - unsigned outer_vec_size = - ac_has_vec3_support(ctx->screen->info.chip_class, false) ? - outer_comps : util_next_power_of_two(outer_comps); - outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); - - ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, - outer_comps, tf_outer_offset, - base, 0, ac_glc, false); - if (inner_comps) { - param_inner = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSINNER, 0); - tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->i32, param_inner, 0)); - - inner_vec = inner_comps == 1 ? inner[0] : - ac_build_gather_values(&ctx->ac, inner, inner_comps); - ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, - inner_comps, tf_inner_offset, - base, 0, ac_glc, false); - } - } - - ac_build_endif(&ctx->ac, 6503); -} - -static LLVMValueRef -si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - unsigned param, unsigned return_index) -{ - return LLVMBuildInsertValue(ctx->ac.builder, ret, - LLVMGetParam(ctx->main_fn, param), - return_index, ""); -} - -static LLVMValueRef -si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - unsigned param, unsigned return_index) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef p = LLVMGetParam(ctx->main_fn, param); - - return LLVMBuildInsertValue(builder, ret, - ac_to_float(&ctx->ac, p), - return_index, ""); -} - -static LLVMValueRef -si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - unsigned param, unsigned return_index) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); -} - -/* This only writes the tessellation factor levels. */ -static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; - - si_copy_tcs_inputs(bld_base); - - rel_patch_id = get_rel_patch_id(ctx); - invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); - tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - - if (ctx->screen->info.chip_class >= GFX9) { - LLVMBasicBlockRef blocks[2] = { - LLVMGetInsertBlock(builder), - ctx->merged_wrap_if_entry_block - }; - LLVMValueRef values[2]; - - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - values[0] = rel_patch_id; - values[1] = LLVMGetUndef(ctx->i32); - rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); - - values[0] = tf_lds_offset; - values[1] = LLVMGetUndef(ctx->i32); - tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); - - values[0] = invocation_id; - values[1] = ctx->i32_1; /* cause the epilog to skip threads */ - invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); - } - - /* Return epilog parameters from this function. */ - LLVMValueRef ret = ctx->return_value; - unsigned vgpr; - - if (ctx->screen->info.chip_class >= GFX9) { - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are at the beginning. */ - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); - vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; - } else { - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, - GFX6_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, - GFX6_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are after user SGPRs. */ - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, - GFX6_TCS_NUM_USER_SGPR); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, - GFX6_TCS_NUM_USER_SGPR + 1); - vgpr = GFX6_TCS_NUM_USER_SGPR + 2; - } - - /* VGPRs */ - rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); - invocation_id = ac_to_float(&ctx->ac, invocation_id); - tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); - - /* Leave a hole corresponding to the two input VGPRs. This ensures that - * the invocation_id output does not alias the tcs_rel_ids input, - * which saves a V_MOV on gfx9. - */ - vgpr += 2; - - ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - - if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { - vgpr++; /* skip the tess factor LDS offset */ - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef value = - LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); - value = ac_to_float(&ctx->ac, value); - ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); - } - } else { - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); - } - ctx->return_value = ret; -} - -/* Pass TCS inputs from LS to TCS on GFX9. */ -static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) -{ - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, 0, 0); - ret = si_insert_input_ptr(ctx, ret, 1, 1); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); - ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->param_bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - - ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets, - 8 + GFX9_SGPR_TCS_OUT_OFFSETS); - ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - - unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id), - vgpr++, ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids), - vgpr++, ""); - ctx->return_value = ret; -} - -/* Pass GS inputs from ES to GS on GFX9. */ -static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, 0, 0); - ret = si_insert_input_ptr(ctx, ret, 1, 1); - if (ctx->shader->key.as_ngg) - ret = LLVMBuildInsertValue(builder, ret, ctx->gs_tg_info, 2, ""); - else - ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->param_bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - if (ctx->screen->use_ngg) { - ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - - for (unsigned i = 0; i < 5; i++) { - unsigned param = ctx->param_gs_vtx01_offset + i; - ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); - } - ctx->return_value = ret; -} - -static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - unsigned i, chan; - LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, - ctx->param_rel_auto_id); - LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); - LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, - vertex_dw_stride, ""); - - /* Write outputs to LDS. The next shader (TCS aka HS) will read - * its inputs from it. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned name = info->output_semantic_name[i]; - unsigned index = info->output_semantic_index[i]; - - /* The ARB_shader_viewport_layer_array spec contains the - * following issue: - * - * 2) What happens if gl_ViewportIndex or gl_Layer is - * written in the vertex shader and a geometry shader is - * present? - * - * RESOLVED: The value written by the last vertex processing - * stage is used. If the last vertex processing stage - * (vertex, tessellation evaluation or geometry) does not - * statically assign to gl_ViewportIndex or gl_Layer, index - * or layer zero is assumed. - * - * So writes to those outputs in VS-as-LS are simply ignored. - */ - if (name == TGSI_SEMANTIC_LAYER || - name == TGSI_SEMANTIC_VIEWPORT_INDEX) - continue; - - int param = si_shader_io_get_unique_index(name, index, false); - LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, - LLVMConstInt(ctx->i32, param * 4, 0), ""); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - lshs_lds_store(ctx, chan, dw_addr, - LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_ls_return_value_for_tcs(ctx); -} - -static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *es = ctx->shader; - struct tgsi_shader_info *info = &es->selector->info; - LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, - ctx->param_es2gs_offset); - LLVMValueRef lds_base = NULL; - unsigned chan; - int i; - - if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { - unsigned itemsize_dw = es->selector->esgs_itemsize / 4; - LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); - vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, - LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), ""); - lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, - LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); - } - - for (i = 0; i < info->num_outputs; i++) { - int param; - - if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || - info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) - continue; - - param = si_shader_io_get_unique_index(info->output_semantic_name[i], - info->output_semantic_index[i], false); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false); - idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); - ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); - continue; - } - - ac_build_buffer_store_dword(&ctx->ac, - ctx->esgs_ring, - out_val, 1, NULL, soffset, - (4 * param + chan) * 4, - ac_glc | ac_slc, true); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_es_return_value_for_gs(ctx); -} - -static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) -{ - if (ctx->screen->info.chip_class >= GFX9) - return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8); - else - return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); -} - -static void emit_gs_epilogue(struct si_shader_context *ctx) -{ - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_epilogue(ctx); - return; - } - - if (ctx->screen->info.chip_class >= GFX10) - LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); - - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, - si_get_gs_wave_id(ctx)); - - if (ctx->screen->info.chip_class >= GFX9) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); -} - -static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info; - - assert(info->num_outputs <= max_outputs); - - emit_gs_epilogue(ctx); -} - -static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_gs_epilogue(ctx); -} - -static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - struct si_shader_output_values *outputs = NULL; - int i,j; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); - - for (i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], - ""); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - if (!ctx->screen->use_ngg_streamout && - ctx->shader->selector->so.num_outputs) - si_llvm_emit_streamout(ctx, outputs, i, 0); - - /* Export PrimitiveID. */ - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); - for (j = 1; j < 4; j++) - outputs[i].values[j] = LLVMConstReal(ctx->f32, 0); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_export_vs(ctx, outputs, i); - FREE(outputs); -} - -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct tgsi_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= max_outputs); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; -} - -static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS, - &ctx->outputs[0][0]); -} - -struct si_ps_exports { - unsigned num; - struct ac_export_args args[10]; -}; - -static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, - LLVMValueRef depth, LLVMValueRef stencil, - LLVMValueRef samplemask, struct si_ps_exports *exp) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct ac_export_args args; - - ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); - - memcpy(&exp->args[exp->num++], &args, sizeof(args)); -} - -static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, - LLVMValueRef *color, unsigned index, - unsigned samplemask_param, - bool is_last, struct si_ps_exports *exp) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - int i; - - /* Clamp color */ - if (ctx->shader->key.part.ps.epilog.clamp_color) - for (i = 0; i < 4; i++) - color[i] = ac_build_clamp(&ctx->ac, color[i]); - - /* Alpha to one */ - if (ctx->shader->key.part.ps.epilog.alpha_to_one) - color[3] = ctx->ac.f32_1; - - /* Alpha test */ - if (index == 0 && - ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) - si_alpha_test(bld_base, color[3]); - - /* Line & polygon smoothing */ - if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], - samplemask_param); - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { - struct ac_export_args args[8]; - int c, last = -1; - - /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - si_llvm_init_export_args(ctx, color, - V_008DFC_SQ_EXP_MRT + c, &args[c]); - if (args[c].enabled_channels) - last = c; - } - - /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - if (is_last && last == c) { - args[c].valid_mask = 1; /* whether the EXEC mask is valid */ - args[c].done = 1; /* DONE bit */ - } else if (!args[c].enabled_channels) - continue; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); - } - } else { - struct ac_export_args args; - - /* Export */ - si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, - &args); - if (is_last) { - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - } else if (!args.enabled_channels) - return; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args, sizeof(args)); - } -} - -static void si_emit_ps_exports(struct si_shader_context *ctx, - struct si_ps_exports *exp) -{ - for (unsigned i = 0; i < exp->num; i++) - ac_build_export(&ctx->ac, &exp->args[i]); -} - -/** - * Return PS outputs in this order: - * - * v[0:3] = color0.xyzw - * v[4:7] = color1.xyzw - * ... - * vN+0 = Depth - * vN+1 = Stencil - * vN+2 = SampleMask - * vN+3 = SampleMaskIn (used for OpenGL smoothing) - * - * The alpha-ref SGPR is returned via its original location. - */ -static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = ctx->ac.builder; - unsigned i, j, first_vgpr, vgpr; - - LLVMValueRef color[8][4] = {}; - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - LLVMValueRef ret; - - if (ctx->postponed_kill) - ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); - - /* Read the output values. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned semantic_name = info->output_semantic_name[i]; - unsigned semantic_index = info->output_semantic_index[i]; - - switch (semantic_name) { - case TGSI_SEMANTIC_COLOR: - assert(semantic_index < 8); - for (j = 0; j < 4; j++) { - LLVMValueRef ptr = addrs[4 * i + j]; - LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); - color[semantic_index][j] = result; - } - break; - case TGSI_SEMANTIC_POSITION: - depth = LLVMBuildLoad(builder, - addrs[4 * i + 2], ""); - break; - case TGSI_SEMANTIC_STENCIL: - stencil = LLVMBuildLoad(builder, - addrs[4 * i + 1], ""); - break; - case TGSI_SEMANTIC_SAMPLEMASK: - samplemask = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - default: - fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", - semantic_name); - } - } - - /* Fill the return structure. */ - ret = ctx->return_value; - - /* Set SGPRs. */ - ret = LLVMBuildInsertValue(builder, ret, - ac_to_integer(&ctx->ac, - LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF)), - SI_SGPR_ALPHA_REF, ""); - - /* Set VGPRs */ - first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; - for (i = 0; i < ARRAY_SIZE(color); i++) { - if (!color[i][0]) - continue; - - for (j = 0; j < 4; j++) - ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); - } - if (depth) - ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); - if (stencil) - ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); - if (samplemask) - ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); - - /* Add the input sample mask for smoothing at the end. */ - if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) - vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; - ret = LLVMBuildInsertValue(builder, ret, - LLVMGetParam(ctx->main_fn, - SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); - - ctx->return_value = ret; -} - -static void membar_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); - unsigned flags = LLVMConstIntGetZExtValue(src0); - unsigned wait_flags = 0; - - if (flags & TGSI_MEMBAR_THREAD_GROUP) - wait_flags |= AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; - - if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | - TGSI_MEMBAR_SHADER_BUFFER | - TGSI_MEMBAR_SHADER_IMAGE)) - wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE; - - if (flags & TGSI_MEMBAR_SHARED) - wait_flags |= AC_WAIT_LGKM; - - ac_build_waitcnt(&ctx->ac, wait_flags); -} - -static void clock_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac); - - emit_data->output[0] = - LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, ""); - emit_data->output[1] = - LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, ""); -} - -static void si_llvm_emit_ddxy( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - unsigned opcode = emit_data->info->opcode; - LLVMValueRef val; - int idx; - unsigned mask; - - if (opcode == TGSI_OPCODE_DDX_FINE) - mask = AC_TID_MASK_LEFT; - else if (opcode == TGSI_OPCODE_DDY_FINE) - mask = AC_TID_MASK_TOP; - else - mask = AC_TID_MASK_TOP_LEFT; - - /* for DDX we want to next X pixel, DDY next Y pixel. */ - idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; - - val = ac_to_integer(&ctx->ac, emit_data->args[0]); - val = ac_build_ddxy(&ctx->ac, mask, idx, val); - emit_data->output[emit_data->chan] = val; -} - -static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct si_shader *shader = ctx->shader; - const struct tgsi_shader_info *info = &shader->selector->info; - LLVMValueRef interp_param; - const struct tgsi_full_instruction *inst = emit_data->inst; - const struct tgsi_full_src_register *input = &inst->Src[0]; - int input_base, input_array_size; - int chan; - int i; - LLVMValueRef prim_mask = ctx->abi.prim_mask; - LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL; - int interp_param_idx; - unsigned interp; - unsigned location; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { - /* offset is in second src, first two channels */ - offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1, - TGSI_CHAN_X); - offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1, - TGSI_CHAN_Y); - } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { - LLVMValueRef sample_position; - LLVMValueRef sample_id; - LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f); - - /* fetch sample ID, then fetch its sample position, - * and place into first two channels. - */ - sample_id = lp_build_emit_fetch(bld_base, - emit_data->inst, 1, TGSI_CHAN_X); - sample_id = ac_to_integer(&ctx->ac, sample_id); - - /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading - * Language 4.50 spec says about interpolateAtSample: - * - * "Returns the value of the input interpolant variable at - * the location of sample number sample. If multisample - * buffers are not available, the input variable will be - * evaluated at the center of the pixel. If sample sample - * does not exist, the position used to interpolate the - * input variable is undefined." - * - * This means that sample_id values outside of the valid are - * in fact valid input, and the usual mechanism for loading the - * sample position doesn't work. - */ - if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) { - LLVMValueRef center[4] = { - LLVMConstReal(ctx->f32, 0.5), - LLVMConstReal(ctx->f32, 0.5), - ctx->ac.f32_0, - ctx->ac.f32_0, - }; - - sample_position = ac_build_gather_values(&ctx->ac, center, 4); - } else { - sample_position = load_sample_position(&ctx->abi, sample_id); - } - - offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position, - ctx->i32_0, ""); - - offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, ""); - offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position, - ctx->i32_1, ""); - offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, ""); - } - - assert(input->Register.File == TGSI_FILE_INPUT); - - if (input->Register.Indirect) { - unsigned array_id = input->Indirect.ArrayID; - - if (array_id) { - input_base = info->input_array_first[array_id]; - input_array_size = info->input_array_last[array_id] - input_base + 1; - } else { - input_base = inst->Src[0].Register.Index; - input_array_size = info->num_inputs - input_base; - } - - array_idx = si_get_indirect_index(ctx, &input->Indirect, - 1, input->Register.Index - input_base); - } else { - input_base = inst->Src[0].Register.Index; - input_array_size = 1; - array_idx = ctx->i32_0; - } - - interp = shader->selector->info.input_interpolate[input_base]; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) - location = TGSI_INTERPOLATE_LOC_CENTER; - else - location = TGSI_INTERPOLATE_LOC_CENTROID; - - interp_param_idx = lookup_interp_param_index(interp, location); - if (interp_param_idx == -1) - return; - else if (interp_param_idx) - interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); - else - interp_param = NULL; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { - LLVMValueRef ij_out[2]; - LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); - - /* - * take the I then J parameters, and the DDX/Y for it, and - * calculate the IJ inputs for the interpolator. - * temp1 = ddx * offset/sample.x + I; - * interp_param.I = ddy * offset/sample.y + temp1; - * temp1 = ddx * offset/sample.x + J; - * interp_param.J = ddy * offset/sample.y + temp1; - */ - for (i = 0; i < 2; i++) { - LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0); - LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0); - LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, ix_ll, ""); - LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, iy_ll, ""); - LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, - interp_param, ix_ll, ""); - LLVMValueRef temp; - - interp_el = ac_to_float(&ctx->ac, interp_el); - - temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el); - ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp); - } - interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); - } - - if (interp_param) - interp_param = ac_to_float(&ctx->ac, interp_param); - - for (chan = 0; chan < 4; chan++) { - LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size)); - unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); - - for (unsigned idx = 0; idx < input_array_size; ++idx) { - LLVMValueRef v, i = NULL, j = NULL; - - if (interp_param) { - i = LLVMBuildExtractElement( - ctx->ac.builder, interp_param, ctx->i32_0, ""); - j = LLVMBuildExtractElement( - ctx->ac.builder, interp_param, ctx->i32_1, ""); - } - v = si_build_fs_interp(ctx, input_base + idx, schan, - prim_mask, i, j); - - gather = LLVMBuildInsertElement(ctx->ac.builder, - gather, v, LLVMConstInt(ctx->i32, idx, false), ""); - } - - emit_data->output[chan] = LLVMBuildExtractElement( - ctx->ac.builder, gather, array_idx, ""); - } -} - -static void vote_all_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]); - emit_data->output[emit_data->chan] = - LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); -} - -static void vote_any_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]); - emit_data->output[emit_data->chan] = - LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); -} - -static void vote_eq_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]); - emit_data->output[emit_data->chan] = - LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); -} - -static void ballot_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); - tmp = ac_build_ballot(&ctx->ac, tmp); - - emit_data->output[0] = LLVMBuildTrunc(builder, tmp, ctx->i32, ""); - - if (ctx->ac.wave_size == 32) { - emit_data->output[1] = ctx->i32_0; - } else { - tmp = LLVMBuildLShr(builder, tmp, LLVMConstInt(ctx->i64, 32, 0), ""); - emit_data->output[1] = LLVMBuildTrunc(builder, tmp, ctx->i32, ""); - } -} - -static void read_lane_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) { - emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, - 0, emit_data->src_chan); - - /* Always read the source invocation (= lane) from the X channel. */ - emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, - 1, TGSI_CHAN_X); - emit_data->arg_count = 2; - } - - /* We currently have no other way to prevent LLVM from lifting the icmp - * calls to a dominating basic block. - */ - ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]); - - for (unsigned i = 0; i < emit_data->arg_count; ++i) - emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]); - - emit_data->output[emit_data->chan] = - ac_build_intrinsic(&ctx->ac, action->intr_name, - ctx->i32, emit_data->args, emit_data->arg_count, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); -} - -static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; - LLVMValueRef imm; - unsigned stream; - - assert(src0.File == TGSI_FILE_IMMEDIATE); - - imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; - stream = LLVMConstIntGetZExtValue(imm) & 0x3; - return stream; -} - -/* Emit one vertex from the geometry shader */ -static void si_llvm_emit_vertex(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); - return; - } - - struct tgsi_shader_info *info = &ctx->shader->selector->info; - struct si_shader *shader = ctx->shader; - LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, - ctx->param_gs2vs_offset); - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; - unsigned chan, offset; - int i; - - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - * - * If the shader has no writes to memory, kill it instead. This skips - * further memory loads and may allow LLVM to skip to the end - * altogether. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->i32, - shader->selector->gs_max_out_vertices, 0), ""); - - bool use_kill = !info->writes_memory; - if (use_kill) { - ac_build_kill_if_false(&ctx->ac, can_emit); - } else { - ac_build_ifcc(&ctx->ac, can_emit, 6505); - } - - offset = 0; - for (i = 0; i < info->num_outputs; i++) { - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - LLVMValueRef voffset = - LLVMConstInt(ctx->i32, offset * - shader->selector->gs_max_out_vertices, 0); - offset++; - - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); - voffset = LLVMBuildMul(ctx->ac.builder, voffset, - LLVMConstInt(ctx->i32, 4, 0), ""); - - out_val = ac_to_integer(&ctx->ac, out_val); - - ac_build_buffer_store_dword(&ctx->ac, - ctx->gsvs_ring[stream], - out_val, 1, - voffset, soffset, 0, - ac_glc | ac_slc, true); - } - } - - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - - /* Signal vertex emission if vertex data was written. */ - if (offset) { - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); - } - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); -} - -/* Emit one vertex from the geometry shader */ -static void si_tgsi_emit_vertex( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - unsigned stream = si_llvm_get_stream(bld_base, emit_data); - - si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]); -} - -/* Cut one primitive from the geometry shader */ -static void si_llvm_emit_primitive(struct ac_shader_abi *abi, - unsigned stream) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); - return; - } - - /* Signal primitive cut */ - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); -} - -/* Cut one primitive from the geometry shader */ -static void si_tgsi_emit_primitive( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data)); -} - -static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ctx->screen->info.chip_class == GFX6 && - ctx->type == PIPE_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - - ac_build_s_barrier(&ctx->ac); -} - -void si_create_function(struct si_shader_context *ctx, - const char *name, - LLVMTypeRef *returns, unsigned num_returns, - struct si_function_info *fninfo, - unsigned max_workgroup_size) -{ - int i; - - si_llvm_create_func(ctx, name, returns, num_returns, - fninfo->types, fninfo->num_params); - ctx->return_value = LLVMGetUndef(ctx->return_type); - - for (i = 0; i < fninfo->num_sgpr_params; ++i) { - LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); - - /* The combination of: - * - noalias - * - dereferenceable - * - invariant.load - * allows the optimization passes to move loads and reduces - * SGPR spilling significantly. - */ - ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, - AC_FUNC_ATTR_INREG); - - if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1, - AC_FUNC_ATTR_NOALIAS); - ac_add_attr_dereferenceable(P, UINT64_MAX); - } - } - - for (i = 0; i < fninfo->num_params; ++i) { - if (fninfo->assign[i]) - *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i); - } - - if (ctx->screen->info.address32_hi) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "amdgpu-32bit-address-high-bits", - ctx->screen->info.address32_hi); - } - - ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); - - LLVMAddTargetDependentFunctionAttr(ctx->main_fn, - "no-signed-zeros-fp-math", - "true"); -} - -static void declare_streamout_params(struct si_shader_context *ctx, - struct pipe_stream_output_info *so, - struct si_function_info *fninfo) -{ - if (ctx->screen->use_ngg_streamout) - return; - - /* Streamout SGPRs. */ - if (so->num_outputs) { - if (ctx->type != PIPE_SHADER_TESS_EVAL) - ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); - else - ctx->param_streamout_config = fninfo->num_params - 1; - - ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); - } - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (int i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); - } -} - -static unsigned si_get_max_workgroup_size(const struct si_shader *shader) -{ - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - return shader->key.as_ngg ? 128 : 0; - - case PIPE_SHADER_TESS_CTRL: - /* Return this so that LLVM doesn't remove s_barrier - * instructions on chips where we use s_barrier. */ - return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; - - case PIPE_SHADER_GEOMETRY: - return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; - - case PIPE_SHADER_COMPUTE: - break; /* see below */ - - default: - return 0; - } - - const unsigned *properties = shader->selector->info.properties; - unsigned max_work_group_size = - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - - if (!max_work_group_size) { - /* This is a variable group size compute shader, - * compile it for the maximum possible group size. - */ - max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return max_work_group_size; -} - -static void declare_const_and_shader_buffers(struct si_shader_context *ctx, - struct si_function_info *fninfo, - bool assign_params) -{ - LLVMTypeRef const_shader_buf_type; - - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_shader_buf_type = ctx->f32; - else - const_shader_buf_type = ctx->v4i32; - - unsigned const_and_shader_buffers = - add_arg(fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(const_shader_buf_type)); - - if (assign_params) - ctx->param_const_and_shader_buffers = const_and_shader_buffers; -} - -static void declare_samplers_and_images(struct si_shader_context *ctx, - struct si_function_info *fninfo, - bool assign_params) -{ - unsigned samplers_and_images = - add_arg(fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v8i32)); - - if (assign_params) - ctx->param_samplers_and_images = samplers_and_images; -} - -static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, - struct si_function_info *fninfo, - bool assign_params) -{ - declare_const_and_shader_buffers(ctx, fninfo, assign_params); - declare_samplers_and_images(ctx, fninfo, assign_params); -} - -static void declare_global_desc_pointers(struct si_shader_context *ctx, - struct si_function_info *fninfo) -{ - ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v4i32)); - ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v8i32)); -} - -static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx, - struct si_function_info *fninfo) -{ - ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); - add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); - add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance); - add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id); -} - -static void declare_vs_input_vgprs(struct si_shader_context *ctx, - struct si_function_info *fninfo, - unsigned *num_prolog_vgprs) -{ - struct si_shader *shader = ctx->shader; - - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); - if (shader->key.as_ls) { - ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32); - if (ctx->screen->info.chip_class >= GFX10) { - add_arg(fninfo, ARG_VGPR, ctx->i32); /* user VGPR */ - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); - } else { - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); - add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ - } - } else if (ctx->screen->info.chip_class >= GFX10) { - add_arg(fninfo, ARG_VGPR, ctx->i32); /* user vgpr */ - ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); /* user vgpr or PrimID (legacy) */ - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); - } else { - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); - ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); - add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ - } - - if (!shader->is_gs_copy_shader) { - /* Vertex load indices. */ - ctx->param_vertex_index0 = fninfo->num_params; - for (unsigned i = 0; i < shader->selector->info.num_inputs; i++) - add_arg(fninfo, ARG_VGPR, ctx->i32); - *num_prolog_vgprs += shader->selector->info.num_inputs; - } -} - -static void declare_vs_blit_inputs(struct si_shader_context *ctx, - struct si_function_info *fninfo, - unsigned vs_blit_property) -{ - ctx->param_vs_blit_inputs = fninfo->num_params; - add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ - add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */ - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */ - } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ - add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ - } -} - -static void declare_tes_input_vgprs(struct si_shader_context *ctx, - struct si_function_info *fninfo) -{ - ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); - ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32); - ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); - add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id); -} - -enum { - /* Convenient merged shader definitions. */ - SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, - SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, -}; - -static void create_function(struct si_shader_context *ctx) -{ - struct si_shader *shader = ctx->shader; - struct si_function_info fninfo; - LLVMTypeRef returns[16+32*4]; - unsigned i, num_return_sgprs; - unsigned num_returns = 0; - unsigned num_prolog_vgprs = 0; - unsigned type = ctx->type; - unsigned vs_blit_property = - shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - si_init_function_info(&fninfo); - - /* Set MERGED shaders. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) - type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ - else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) - type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; - } - - LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3); - - switch (type) { - case PIPE_SHADER_VERTEX: - declare_global_desc_pointers(ctx, &fninfo); - - if (vs_blit_property) { - declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property); - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); - break; - } - - declare_per_stage_desc_pointers(ctx, &fninfo, true); - declare_vs_specific_input_sgprs(ctx, &fninfo); - ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v4i32)); - - if (shader->key.as_es) { - ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - } else if (shader->key.as_ls) { - /* no extra parameters */ - } else { - if (shader->is_gs_copy_shader) { - fninfo.num_params = ctx->param_vs_state_bits + 1; - fninfo.num_sgpr_params = fninfo.num_params; - } - - /* The locations of the other parameters are assigned dynamically. */ - declare_streamout_params(ctx, &shader->selector->so, - &fninfo); - } - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, true); - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - - /* VGPRs */ - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); - - /* param_tcs_offchip_offset and param_tcs_factor_offset are - * placed after the user SGPRs. - */ - for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) - returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - break; - - case SI_SHADER_MERGED_VERTEX_TESSCTRL: - /* Merged stages have 8 system SGPRs at the beginning. */ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ - declare_per_stage_desc_pointers(ctx, &fninfo, - ctx->type == PIPE_SHADER_TESS_CTRL); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ - - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, - ctx->type == PIPE_SHADER_VERTEX); - declare_vs_specific_input_sgprs(ctx, &fninfo); - - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v4i32)); - - /* VGPRs (first TCS, then VS) */ - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &fninfo, - &num_prolog_vgprs); - - /* LS return values are inputs to the TCS main shader part. */ - for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) - returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 2; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - } else { - /* TCS return values are inputs to the TCS epilog. - * - * param_tcs_offchip_offset, param_tcs_factor_offset, - * param_tcs_offchip_layout, and param_rw_buffers - * should be passed to the epilog. - */ - for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) - returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - } - break; - - case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: - /* Merged stages have 8 system SGPRs at the beginning. */ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ - declare_per_stage_desc_pointers(ctx, &fninfo, - ctx->type == PIPE_SHADER_GEOMETRY); - - if (ctx->shader->key.as_ngg) - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->gs_tg_info); - else - ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - - ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ - - declare_global_desc_pointers(ctx, &fninfo); - if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { - declare_per_stage_desc_pointers(ctx, &fninfo, - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)); - } - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (vs_blit_property) - declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property); - else - declare_vs_specific_input_sgprs(ctx, &fninfo); - } else { - ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); - /* Declare as many input SGPRs as the VS has. */ - } - - if (ctx->type == PIPE_SHADER_VERTEX) { - ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->v4i32)); - } - - /* VGPRs (first GS, then VS/TES) */ - ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); - ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); - ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &fninfo, - &num_prolog_vgprs); - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - declare_tes_input_vgprs(ctx, &fninfo); - } - - if (ctx->shader->key.as_es && - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)) { - unsigned num_user_sgprs; - - if (ctx->type == PIPE_SHADER_VERTEX) - num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR; - else - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - - /* ES return values are inputs to GS. */ - for (i = 0; i < 8 + num_user_sgprs; i++) - returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 5; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_EVAL: - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, true); - ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); - - if (shader->key.as_es) { - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - } else { - add_arg(&fninfo, ARG_SGPR, ctx->i32); - declare_streamout_params(ctx, &shader->selector->so, - &fninfo); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - } - - /* VGPRs */ - declare_tes_input_vgprs(ctx, &fninfo); - break; - - case PIPE_SHADER_GEOMETRY: - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, true); - ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32); - - /* VGPRs */ - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]); - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); - break; - - case PIPE_SHADER_FRAGMENT: - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, true); - add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); - add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32, - &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK); - - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE); - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER); - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID); - add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL); - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE); - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER); - add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID); - add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX); - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, - &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT); - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, - &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT); - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, - &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT); - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, - &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT); - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, - &ctx->abi.front_face, SI_PARAM_FRONT_FACE); - shader->info.face_vgpr_index = 20; - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, - &ctx->abi.ancillary, SI_PARAM_ANCILLARY); - shader->info.ancillary_vgpr_index = 21; - add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, - &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); - add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT); - - /* Color inputs from the prolog. */ - if (shader->selector->info.colors_read) { - unsigned num_color_elements = - util_bitcount(shader->selector->info.colors_read); - - assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types)); - for (i = 0; i < num_color_elements; i++) - add_arg(&fninfo, ARG_VGPR, ctx->f32); - - num_prolog_vgprs += num_color_elements; - } - - /* Outputs for the epilog. */ - num_return_sgprs = SI_SGPR_ALPHA_REF + 1; - num_returns = - num_return_sgprs + - util_bitcount(shader->selector->info.colors_written) * 4 + - shader->selector->info.writes_z + - shader->selector->info.writes_stencil + - shader->selector->info.writes_samplemask + - 1 /* SampleMaskIn */; - - num_returns = MAX2(num_returns, - num_return_sgprs + - PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - for (i = 0; i < num_return_sgprs; i++) - returns[i] = ctx->i32; - for (; i < num_returns; i++) - returns[i] = ctx->f32; - break; - - case PIPE_SHADER_COMPUTE: - declare_global_desc_pointers(ctx, &fninfo); - declare_per_stage_desc_pointers(ctx, &fninfo, true); - if (shader->selector->info.uses_grid_size) - add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups); - if (shader->selector->info.uses_block_size && - shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) - ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); - - unsigned cs_user_data_dwords = - shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - if (cs_user_data_dwords) { - ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR, - LLVMVectorType(ctx->i32, cs_user_data_dwords)); - } - - for (i = 0; i < 3; i++) { - ctx->abi.workgroup_ids[i] = NULL; - if (shader->selector->info.uses_block_id[i]) - add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]); - } - - add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids); - break; - default: - assert(0 && "unimplemented shader"); - return; - } - - si_create_function(ctx, "main", returns, num_returns, &fninfo, - si_get_max_workgroup_size(shader)); - - /* Reserve register locations for VGPR inputs the PS prolog may need. */ - if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", - S_0286D0_PERSP_SAMPLE_ENA(1) | - S_0286D0_PERSP_CENTER_ENA(1) | - S_0286D0_PERSP_CENTROID_ENA(1) | - S_0286D0_LINEAR_SAMPLE_ENA(1) | - S_0286D0_LINEAR_CENTER_ENA(1) | - S_0286D0_LINEAR_CENTROID_ENA(1) | - S_0286D0_FRONT_FACE_ENA(1) | - S_0286D0_ANCILLARY_ENA(1) | - S_0286D0_POS_FIXED_PT_ENA(1)); - } - - shader->info.num_input_sgprs = 0; - shader->info.num_input_vgprs = 0; - - for (i = 0; i < fninfo.num_sgpr_params; ++i) - shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; - - for (; i < fninfo.num_params; ++i) - shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; - - assert(shader->info.num_input_vgprs >= num_prolog_vgprs); - shader->info.num_input_vgprs -= num_prolog_vgprs; - - if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { - if (USE_LDS_SYMBOLS && HAVE_LLVM >= 0x0900) { - /* The LSHS size is not known until draw time, so we append it - * at the end of whatever LDS use there may be in the rest of - * the shader (currently none, unless LLVM decides to do its - * own LDS-based lowering). - */ - ctx->ac.lds = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->i32, 0), - "__lds_end", AC_ADDR_SPACE_LDS); - LLVMSetAlignment(ctx->ac.lds, 256); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - } - } -} - -/* Ensure that the esgs ring is declared. - * - * We declare it with 64KB alignment as a hint that the - * pointer value will always be 0. - */ -static void declare_esgs_ring(struct si_shader_context *ctx) -{ - if (ctx->esgs_ring) - return; - - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - - ctx->esgs_ring = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->i32, 0), - "esgs_ring", - AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); -} - -/** - * Load ESGS and GSVS ring buffer resource descriptors and save the variables - * for later use. - */ -static void preload_ring_buffers(struct si_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_rw_buffers); - - if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->screen->info.chip_class <= GFX8) { - unsigned ring = - ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS - : SI_ES_RING_ESGS; - LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); - - ctx->esgs_ring = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else { - if (USE_LDS_SYMBOLS && HAVE_LLVM >= 0x0900) { - /* Declare the ESGS ring as an explicit LDS symbol. */ - declare_esgs_ring(ctx); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - ctx->esgs_ring = ctx->ac.lds; - } - } - } - - if (ctx->shader->is_gs_copy_shader) { - LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); - - ctx->gsvs_ring[0] = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else if (ctx->type == PIPE_SHADER_GEOMETRY) { - const struct si_shader_selector *sel = ctx->shader->selector; - LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); - LLVMValueRef base_ring; - - base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - /* The conceptual layout of the GSVS ring is - * v0c0 .. vLv0 v0c1 .. vLc1 .. - * but the real memory layout is swizzled across - * threads: - * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL - * t16v0c0 .. - * Override the buffer descriptor accordingly. - */ - LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); - uint64_t stream_offset = 0; - - for (unsigned stream = 0; stream < 4; ++stream) { - unsigned num_components; - unsigned stride; - unsigned num_records; - LLVMValueRef ring, tmp; - - num_components = sel->info.num_stream_output_components[stream]; - if (!num_components) - continue; - - stride = 4 * num_components * sel->gs_max_out_vertices; - - /* Limit on the stride field for <= GFX7. */ - assert(stride < (1 << 14)); - - num_records = ctx->ac.wave_size; - - ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); - tmp = LLVMBuildAdd(builder, tmp, - LLVMConstInt(ctx->i64, - stream_offset, 0), ""); - stream_offset += stride * ctx->ac.wave_size; - - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); - ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMConstInt(ctx->i32, - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(1), 0), ""); - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->i32, num_records, 0), - LLVMConstInt(ctx->i32, 2, 0), ""); - - uint32_t rsrc3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ - S_008F0C_ADD_TID_ENABLE(1); - - if (ctx->ac.chip_class >= GFX10) { - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(2) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ - } - - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->i32, rsrc3, false), - LLVMConstInt(ctx->i32, 3, 0), ""); - - ctx->gsvs_ring[stream] = ring; - } - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); - } -} - -static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, - LLVMValueRef param_rw_buffers, - unsigned param_pos_fixed_pt) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef slot, desc, offset, row, bit, address[2]; - - /* Use the fixed-point gl_FragCoord input. - * Since the stipple pattern is 32x32 and it repeats, just get 5 bits - * per coordinate to get the repeating effect. - */ - address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); - address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); - - /* Load the buffer descriptor. */ - slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0); - desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); - - /* The stipple pattern is 32x32, each row has 32 bits. */ - offset = LLVMBuildMul(builder, address[1], - LLVMConstInt(ctx->i32, 4, 0), ""); - row = buffer_load_const(ctx, desc, offset); - row = ac_to_integer(&ctx->ac, row); - bit = LLVMBuildLShr(builder, row, address[0], ""); - bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); - ac_build_kill_if_false(&ctx->ac, bit); -} - -/* For the UMR disassembler. */ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 - -static bool si_shader_binary_open(struct si_screen *screen, - struct si_shader *shader, - struct ac_rtld_binary *rtld) -{ - const struct si_shader_selector *sel = shader->selector; - const char *part_elfs[5]; - size_t part_sizes[5]; - unsigned num_parts = 0; - -#define add_part(shader_or_part) \ - if (shader_or_part) { \ - part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ - part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ - num_parts++; \ - } - - add_part(shader->prolog); - add_part(shader->previous_stage); - add_part(shader->prolog2); - add_part(shader); - add_part(shader->epilog); - -#undef add_part - - struct ac_rtld_symbol lds_symbols[2]; - unsigned num_lds_symbols = 0; - - if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && - (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "esgs_ring"; - sym->size = shader->gs_info.esgs_ring_size; - sym->align = 64 * 1024; - } - - if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_emit"; - sym->size = shader->ngg.ngg_emit_size * 4; - sym->align = 4; - } - - bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ - .info = &screen->info, - .options = { - .halt_at_entry = screen->options.halt_shaders, - }, - .shader_type = tgsi_processor_to_shader_stage(sel->type), - .wave_size = si_get_shader_wave_size(shader), - .num_parts = num_parts, - .elf_ptrs = part_elfs, - .elf_sizes = part_sizes, - .num_shared_lds_symbols = num_lds_symbols, - .shared_lds_symbols = lds_symbols }); - - if (rtld->lds_size > 0) { - unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; - shader->config.lds_size = - align(rtld->lds_size, alloc_granularity) / alloc_granularity; - } - - return ok; -} - -static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) -{ - struct ac_rtld_binary rtld; - si_shader_binary_open(screen, shader, &rtld); - return rtld.rx_size; -} - -static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) -{ - uint64_t *scratch_va = data; - - if (!strcmp(scratch_rsrc_dword0_symbol, name)) { - *value = (uint32_t)*scratch_va; - return true; - } - if (!strcmp(scratch_rsrc_dword1_symbol, name)) { - /* Enable scratch coalescing. */ - *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - if (HAVE_LLVM < 0x0800) { - /* Old LLVM created an R_ABS32_HI relocation for - * this symbol. */ - *value <<= 32; - } - return true; - } - - return false; -} - -bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va) -{ - struct ac_rtld_binary binary; - if (!si_shader_binary_open(sscreen, shader, &binary)) - return false; - - si_resource_reference(&shader->bo, NULL); - shader->bo = si_aligned_buffer_create(&sscreen->b, - sscreen->cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY, - PIPE_USAGE_IMMUTABLE, - align(binary.rx_size, SI_CPDMA_ALIGNMENT), - 256); - if (!shader->bo) - return false; - - /* Upload. */ - struct ac_rtld_upload_info u = {}; - u.binary = &binary; - u.get_external_symbol = si_get_external_symbol; - u.cb_data = &scratch_va; - u.rx_va = shader->bo->gpu_address; - u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_READ_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED | - RADEON_TRANSFER_TEMPORARY); - if (!u.rx_ptr) - return false; - - bool ok = ac_rtld_upload(&u); - - sscreen->ws->buffer_unmap(shader->bo->buf); - ac_rtld_close(&binary); - - return ok; -} - -static void si_shader_dump_disassembly(struct si_screen *screen, - const struct si_shader_binary *binary, - enum pipe_shader_type shader_type, - unsigned wave_size, - struct pipe_debug_callback *debug, - const char *name, FILE *file) -{ - struct ac_rtld_binary rtld_binary; - - if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return; - - const char *disasm; - size_t nbytes; - - if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) - goto out; - - if (nbytes > INT_MAX) - goto out; - - if (debug && debug->debug_message) { - /* Very long debug messages are cut off, so send the - * disassembly one line at a time. This causes more - * overhead, but on the plus side it simplifies - * parsing of resulting logs. - */ - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly Begin"); - - uint64_t line = 0; - while (line < nbytes) { - int count = nbytes - line; - const char *nl = memchr(disasm + line, '\n', nbytes - line); - if (nl) - count = nl - (disasm + line); - - if (count) { - pipe_debug_message(debug, SHADER_INFO, - "%.*s", count, disasm + line); - } - - line += count + 1; - } - - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly End"); - } - - if (file) { - fprintf(file, "Shader %s disassembly:\n", name); - fprintf(file, "%*s", (int)nbytes, disasm); - } - -out: - ac_rtld_close(&rtld_binary); -} - -static void si_calculate_max_simd_waves(struct si_shader *shader) -{ - struct si_screen *sscreen = shader->selector->screen; - struct ac_shader_config *conf = &shader->config; - unsigned num_inputs = shader->selector->info.num_inputs; - unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; - unsigned lds_per_wave = 0; - unsigned max_simd_waves; - - max_simd_waves = ac_get_max_wave64_per_simd(sscreen->info.family); - - /* Compute LDS usage for PS. */ - switch (shader->selector->type) { - case PIPE_SHADER_FRAGMENT: - /* The minimum usage per wave is (num_inputs * 48). The maximum - * usage is (num_inputs * 48 * 16). - * We can get anything in between and it varies between waves. - * - * The 48 bytes per input for a single primitive is equal to - * 4 bytes/component * 4 components/input * 3 points. - * - * Other stages don't know the size at compile time or don't - * allocate LDS per wave, but instead they do it per thread group. - */ - lds_per_wave = conf->lds_size * lds_increment + - align(num_inputs * 48, lds_increment); - break; - case PIPE_SHADER_COMPUTE: - if (shader->selector) { - unsigned max_workgroup_size = - si_get_max_workgroup_size(shader); - lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, - sscreen->compute_wave_size); - } - break; - default:; - } - - /* Compute the per-SIMD wave counts. */ - if (conf->num_sgprs) { - max_simd_waves = - MIN2(max_simd_waves, - ac_get_num_physical_sgprs(&sscreen->info) / conf->num_sgprs); - } - - if (conf->num_vgprs) { - /* Always print wave limits as Wave64, so that we can compare - * Wave32 and Wave64 with shader-db fairly. */ - unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, 64); - max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); - } - - /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above - * 16KB makes some SIMDs unoccupied). - * - * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. - */ - unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024; - unsigned max_lds_per_simd = max_lds_size / 4; - if (lds_per_wave) - max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); - - shader->info.max_simd_waves = max_simd_waves; -} - -void si_shader_dump_stats_for_shader_db(struct si_screen *screen, - struct si_shader *shader, - struct pipe_debug_callback *debug) -{ - const struct ac_shader_config *conf = &shader->config; - - if (screen->options.debug_disassembly) - si_shader_dump_disassembly(screen, &shader->binary, - shader->selector->type, - si_get_shader_wave_size(shader), - debug, "main", NULL); - - pipe_debug_message(debug, SHADER_INFO, - "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " - "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " - "Spilled VGPRs: %d PrivMem VGPRs: %d", - conf->num_sgprs, conf->num_vgprs, - si_get_shader_binary_size(screen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves, conf->spilled_sgprs, - conf->spilled_vgprs, shader->info.private_mem_vgprs); -} - -static void si_shader_dump_stats(struct si_screen *sscreen, - struct si_shader *shader, - FILE *file, - bool check_debug_option) -{ - const struct ac_shader_config *conf = &shader->config; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader->selector->type)) { - if (shader->selector->type == PIPE_SHADER_FRAGMENT) { - fprintf(file, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); - } - - fprintf(file, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Private memory VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - "********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, - shader->info.private_mem_vgprs, - si_get_shader_binary_size(sscreen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves); - } -} - -const char *si_get_shader_name(const struct si_shader *shader) -{ - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_es) - return "Vertex Shader as ES"; - else if (shader->key.as_ls) - return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; - else if (shader->key.as_ngg) - return "Vertex Shader as ESGS"; - else - return "Vertex Shader as VS"; - case PIPE_SHADER_TESS_CTRL: - return "Tessellation Control Shader"; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) - return "Tessellation Evaluation Shader as ES"; - else if (shader->key.as_ngg) - return "Tessellation Evaluation Shader as ESGS"; - else - return "Tessellation Evaluation Shader as VS"; - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - return "GS Copy Shader as VS"; - else - return "Geometry Shader"; - case PIPE_SHADER_FRAGMENT: - return "Pixel Shader"; - case PIPE_SHADER_COMPUTE: - return "Compute Shader"; - default: - return "Unknown Shader"; - } -} - -void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, - FILE *file, bool check_debug_option) -{ - enum pipe_shader_type shader_type = shader->selector->type; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader_type)) - si_dump_shader_key(shader, file); - - if (!check_debug_option && shader->binary.llvm_ir_string) { - if (shader->previous_stage && - shader->previous_stage->binary.llvm_ir_string) { - fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); - } - - fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->binary.llvm_ir_string); - } - - if (!check_debug_option || - (si_can_dump_shader(sscreen, shader_type) && - !(sscreen->debug_flags & DBG(NO_ASM)))) { - unsigned wave_size = si_get_shader_wave_size(shader); - - fprintf(file, "\n%s:\n", si_get_shader_name(shader)); - - if (shader->prolog) - si_shader_dump_disassembly(sscreen, &shader->prolog->binary, - shader_type, wave_size, debug, "prolog", file); - if (shader->previous_stage) - si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, - shader_type, wave_size, debug, "previous stage", file); - if (shader->prolog2) - si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, - shader_type, wave_size, debug, "prolog2", file); - - si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, - wave_size, debug, "main", file); - - if (shader->epilog) - si_shader_dump_disassembly(sscreen, &shader->epilog->binary, - shader_type, wave_size, debug, "epilog", file); - fprintf(file, "\n"); - } - - si_shader_dump_stats(sscreen, shader, file, check_debug_option); -} - -static int si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - LLVMModuleRef mod, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - unsigned wave_size, - const char *name, - bool less_optimized) -{ - unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - - if (si_can_dump_shader(sscreen, shader_type)) { - fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - - if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { - fprintf(stderr, "%s LLVM IR:\n\n", name); - ac_dump_module(mod); - fprintf(stderr, "\n"); - } - } - - if (sscreen->record_llvm_ir) { - char *ir = LLVMPrintModuleToString(mod); - binary->llvm_ir_string = strdup(ir); - LLVMDisposeMessage(ir); - } - - if (!si_replace_shader(count, binary)) { - unsigned r = si_llvm_compile(mod, binary, compiler, debug, - less_optimized, wave_size); - if (r) - return r; - } - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sscreen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return -1; - - bool ok = ac_rtld_read_config(&rtld, conf); - ac_rtld_close(&rtld); - if (!ok) - return -1; - - /* Enable 64-bit and 16-bit denormals, because there is no performance - * cost. - * - * If denormals are enabled, all floating-point output modifiers are - * ignored. - * - * Don't enable denormals for 32-bit floats, because: - * - Floating-point output modifiers would be ignored by the hw. - * - Some opcodes don't support denormals, such as v_mad_f32. We would - * have to stop using those. - * - GFX6 & GFX7 would be very slow. - */ - conf->float_mode |= V_00B028_FP_64_DENORMS; - - return 0; -} - -static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) -{ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->ac.builder); - else - LLVMBuildRet(ctx->ac.builder, ret); -} - -/* Generate code for the hardware VS shader stage to go with a geometry shader */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug) -{ - struct si_shader_context ctx; - struct si_shader *shader; - LLVMBuilderRef builder; - struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; - struct tgsi_shader_info *gsinfo = &gs_selector->info; - int i; - - - shader = CALLOC_STRUCT(si_shader); - if (!shader) - return NULL; - - /* We can leave the fence as permanently signaled because the GS copy - * shader only becomes visible globally after it has been compiled. */ - util_queue_fence_init(&shader->ready); - - shader->selector = gs_selector; - shader->is_gs_copy_shader = true; - - si_init_shader_ctx(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false), - false); - ctx.shader = shader; - ctx.type = PIPE_SHADER_VERTEX; - - builder = ctx.ac.builder; - - create_function(&ctx); - preload_ring_buffers(&ctx); - - LLVMValueRef voffset = - LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, - LLVMConstInt(ctx.i32, 4, 0), ""); - - /* Fetch the vertex stream ID.*/ - LLVMValueRef stream_id; - - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) - stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2); - else - stream_id = ctx.i32_0; - - /* Fill in output information. */ - for (i = 0; i < gsinfo->num_outputs; ++i) { - outputs[i].semantic_name = gsinfo->output_semantic_name[i]; - outputs[i].semantic_index = gsinfo->output_semantic_index[i]; - - for (int chan = 0; chan < 4; chan++) { - outputs[i].vertex_stream[chan] = - (gsinfo->output_streams[i] >> (2 * chan)) & 3; - } - } - - LLVMBasicBlockRef end_bb; - LLVMValueRef switch_inst; - - end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); - switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); - - for (int stream = 0; stream < 4; stream++) { - LLVMBasicBlockRef bb; - unsigned offset; - - if (!gsinfo->num_stream_output_components[stream]) - continue; - - if (stream > 0 && !gs_selector->so.num_outputs) - continue; - - bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); - LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); - LLVMPositionBuilderAtEnd(builder, bb); - - /* Fetch vertex data from GSVS ring */ - offset = 0; - for (i = 0; i < gsinfo->num_outputs; ++i) { - for (unsigned chan = 0; chan < 4; chan++) { - if (!(gsinfo->output_usagemask[i] & (1 << chan)) || - outputs[i].vertex_stream[chan] != stream) { - outputs[i].values[chan] = LLVMGetUndef(ctx.f32); - continue; - } - - LLVMValueRef soffset = LLVMConstInt(ctx.i32, - offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); - offset++; - - outputs[i].values[chan] = - ac_build_buffer_load(&ctx.ac, - ctx.gsvs_ring[0], 1, - ctx.i32_0, voffset, - soffset, 0, ac_glc | ac_slc, - true, false); - } - } - - /* Streamout and exports. */ - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { - si_llvm_emit_streamout(&ctx, outputs, - gsinfo->num_outputs, - stream); - } - - if (stream == 0) - si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); - - LLVMBuildBr(builder, end_bb); - } - - LLVMPositionBuilderAtEnd(builder, end_bb); - - LLVMBuildRetVoid(ctx.ac.builder); - - ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ - si_llvm_optimize_module(&ctx); - - bool ok = false; - if (si_compile_llvm(sscreen, &ctx.shader->binary, - &ctx.shader->config, ctx.compiler, - ctx.ac.module, - debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size, - "GS Copy Shader", false) == 0) { - if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) - fprintf(stderr, "GS Copy Shader:\n"); - si_shader_dump(sscreen, ctx.shader, debug, stderr, true); - - if (!ctx.shader->config.scratch_bytes_per_wave) - ok = si_shader_binary_upload(sscreen, ctx.shader, 0); - else - ok = true; - } - - si_llvm_dispose(&ctx); - - if (!ok) { - FREE(shader); - shader = NULL; - } else { - si_fix_resource_usage(sscreen, shader); - } - return shader; -} - -static void si_dump_shader_key_vs(const struct si_shader_key *key, - const struct si_vs_prolog_bits *prolog, - const char *prefix, FILE *f) -{ - fprintf(f, " %s.instance_divisor_is_one = %u\n", - prefix, prolog->instance_divisor_is_one); - fprintf(f, " %s.instance_divisor_is_fetched = %u\n", - prefix, prolog->instance_divisor_is_fetched); - fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", - prefix, prolog->unpack_instance_id_from_vertex_id); - fprintf(f, " %s.ls_vgpr_fix = %u\n", - prefix, prolog->ls_vgpr_fix); - - fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); - fprintf(f, " mono.vs.fix_fetch = {"); - for (int i = 0; i < SI_MAX_ATTRIBS; i++) { - union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; - if (i) - fprintf(f, ", "); - if (!fix.bits) - fprintf(f, "0"); - else - fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, - fix.u.num_channels_m1, fix.u.format); - } - fprintf(f, "}\n"); -} - -static void si_dump_shader_key(const struct si_shader *shader, FILE *f) -{ - const struct si_shader_key *key = &shader->key; - enum pipe_shader_type shader_type = shader->selector->type; - - fprintf(f, "SHADER KEY\n"); - - switch (shader_type) { - case PIPE_SHADER_VERTEX: - si_dump_shader_key_vs(key, &key->part.vs.prolog, - "part.vs.prolog", f); - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ls = %u\n", key->as_ls); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", - key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", - tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", - key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", - key->opt.cs_instancing); - fprintf(f, " opt.cs_primitive_restart = %u\n", - key->opt.cs_primitive_restart); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", - key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_need_correct_orientation = %u\n", - key->opt.cs_need_correct_orientation); - fprintf(f, " opt.cs_cull_front = %u\n", - key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", - key->opt.cs_cull_back); - fprintf(f, " opt.cs_cull_z = %u\n", - key->opt.cs_cull_z); - fprintf(f, " opt.cs_halfz_clip_space = %u\n", - key->opt.cs_halfz_clip_space); - break; - - case PIPE_SHADER_TESS_CTRL: - if (shader->selector->screen->info.chip_class >= GFX9) { - si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, - "part.tcs.ls_prolog", f); - } - fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); - fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); - break; - - case PIPE_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - break; - - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - break; - - if (shader->selector->screen->info.chip_class >= GFX9 && - key->part.gs.es->type == PIPE_SHADER_VERTEX) { - si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, - "part.gs.vs_prolog", f); - } - fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); - fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - break; - - case PIPE_SHADER_COMPUTE: - break; - - case PIPE_SHADER_FRAGMENT: - fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); - fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); - fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); - fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); - fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); - fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); - fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); - fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); - fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); - fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter); - fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); - fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); - fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); - fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); - fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); - fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); - fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); - fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); - fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center); - fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); - fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); - fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); - break; - - default: - assert(0); - } - - if ((shader_type == PIPE_SHADER_GEOMETRY || - shader_type == PIPE_SHADER_TESS_EVAL || - shader_type == PIPE_SHADER_VERTEX) && - !key->as_es && !key->as_ls) { - fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); - fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); - } -} - -static void si_init_shader_ctx(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size, - bool nir) -{ - struct lp_build_tgsi_context *bld_base; - - si_llvm_context_init(ctx, sscreen, compiler, wave_size, - nir ? 64 : wave_size); - - bld_base = &ctx->bld_base; - bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; - - bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic; - bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic; - bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic; - - bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; - - bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit; - - bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; - bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; - bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; - bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; - - bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; - bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; - bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; - bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit; - bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane"; - bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit; - bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane"; - bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit; - - bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex; - bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive; - bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; -} - -static void si_optimize_vs_outputs(struct si_shader_context *ctx) -{ - struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - - if ((ctx->type != PIPE_SHADER_VERTEX && - ctx->type != PIPE_SHADER_TESS_EVAL) || - shader->key.as_ls || - shader->key.as_es) - return; - - ac_optimize_vs_outputs(&ctx->ac, - ctx->main_fn, - shader->info.vs_output_param_offset, - info->num_outputs, - &shader->info.nr_param_exports); -} - -static void si_init_exec_from_input(struct si_shader_context *ctx, - unsigned param, unsigned bitoffset) -{ - LLVMValueRef args[] = { - LLVMGetParam(ctx->main_fn, param), - LLVMConstInt(ctx->i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, - "llvm.amdgcn.init.exec.from.input", - ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); -} - -static bool si_vs_needs_prolog(const struct si_shader_selector *sel, - const struct si_vs_prolog_bits *key) -{ - /* VGPR initialization fixup for Vega10 and Raven is always done in the - * VS prolog. */ - return sel->vs_needs_prolog || key->ls_vgpr_fix; -} - -static bool si_compile_tgsi_main(struct si_shader_context *ctx) -{ - struct si_shader *shader = ctx->shader; - struct si_shader_selector *sel = shader->selector; - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - - // TODO clean all this up! - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - ctx->load_input = declare_input_vs; - if (shader->key.as_ls) - ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; - else if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; - else if (shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; - bld_base->emit_epilogue = si_tgsi_emit_epilogue; - ctx->abi.load_base_vertex = get_base_vertex; - break; - case PIPE_SHADER_TESS_CTRL: - bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; - ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; - ctx->abi.load_tess_level = si_load_tess_level; - bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; - bld_base->emit_store = store_output_tcs; - ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; - ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; - bld_base->emit_epilogue = si_tgsi_emit_epilogue; - break; - case PIPE_SHADER_TESS_EVAL: - bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; - ctx->abi.load_tess_varyings = si_nir_load_input_tes; - ctx->abi.load_tess_coord = si_load_tess_coord; - ctx->abi.load_tess_level = si_load_tess_level; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; - if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; - bld_base->emit_epilogue = si_tgsi_emit_epilogue; - break; - case PIPE_SHADER_GEOMETRY: - bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; - ctx->abi.load_inputs = si_nir_load_input_gs; - ctx->abi.emit_vertex = si_llvm_emit_vertex; - ctx->abi.emit_primitive = si_llvm_emit_primitive; - ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; - bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue; - break; - case PIPE_SHADER_FRAGMENT: - ctx->load_input = declare_input_fs; - ctx->abi.emit_outputs = si_llvm_return_fs_outputs; - bld_base->emit_epilogue = si_tgsi_emit_epilogue; - ctx->abi.lookup_interp_param = si_nir_lookup_interp_param; - ctx->abi.load_sample_position = load_sample_position; - ctx->abi.load_sample_mask_in = load_sample_mask_in; - ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; - ctx->abi.emit_kill = si_llvm_emit_kill; - break; - case PIPE_SHADER_COMPUTE: - ctx->abi.load_local_group_size = get_block_size; - break; - default: - assert(!"Unsupported shader type"); - return false; - } - - ctx->abi.load_ubo = load_ubo; - ctx->abi.load_ssbo = load_ssbo; - - create_function(ctx); - preload_ring_buffers(ctx); - - if (ctx->type == PIPE_SHADER_TESS_CTRL && - sel->tcs_info.tessfactors_are_def_in_all_invocs) { - for (unsigned i = 0; i < 6; i++) { - ctx->invoc0_tess_factors[i] = - ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); - } - } - - if (ctx->type == PIPE_SHADER_GEOMETRY) { - for (unsigned i = 0; i < 4; i++) { - ctx->gs_next_vertex[i] = - ac_build_alloca(&ctx->ac, ctx->i32, ""); - } - if (shader->key.as_ngg) { - for (unsigned i = 0; i < 4; ++i) { - ctx->gs_curprim_verts[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - ctx->gs_generated_prims[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - - unsigned scratch_size = 8; - if (sel->so.num_outputs) - scratch_size = 44; - - LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - - ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); - LLVMSetAlignment(ctx->gs_ngg_emit, 4); - } - } - - if (ctx->type != PIPE_SHADER_GEOMETRY && - (shader->key.as_ngg && !shader->key.as_es)) { - /* Unconditionally declare scratch space base for streamout and - * vertex compaction. Whether space is actually allocated is - * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. - */ - declare_esgs_ring(ctx); - - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - } - - /* For GFX9 merged shaders: - * - Set EXEC for the first shader. If the prolog is present, set - * EXEC there instead. - * - Add a barrier before the second shader. - * - In the second shader, reset EXEC to ~0 and wrap the main part in - * an if-statement. This is required for correctness in geometry - * shaders, to ensure that empty GS waves do not send GS_EMIT and - * GS_CUT messages. - * - * For monolithic merged shaders, the first shader is wrapped in an - * if-block together with its prolog in si_build_wrapper_function. - * - * NGG vertex and tess eval shaders running as the last - * vertex/geometry stage handle execution explicitly using - * if-statements. - */ - if (ctx->screen->info.chip_class >= GFX9) { - if (!shader->is_monolithic && - sel->info.num_instructions > 1 && /* not empty shader */ - (shader->key.as_es || shader->key.as_ls) && - (ctx->type == PIPE_SHADER_TESS_EVAL || - (ctx->type == PIPE_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { - si_init_exec_from_input(ctx, - ctx->param_merged_wave_info, 0); - } else if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY || - (shader->key.as_ngg && !shader->key.as_es)) { - LLVMValueRef num_threads; - bool nested_barrier; - - if (!shader->is_monolithic || - (ctx->type == PIPE_SHADER_TESS_EVAL && - (shader->key.as_ngg && !shader->key.as_es))) - ac_init_exec_full_mask(&ctx->ac); - - if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { - gfx10_ngg_gs_emit_prologue(ctx); - nested_barrier = false; - } else { - nested_barrier = true; - } - - /* Number of patches / primitives */ - num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); - } else { - /* Number of vertices */ - num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8); - nested_barrier = false; - } - - LLVMValueRef ena = - LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), num_threads, ""); - - ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); - ctx->merged_wrap_if_label = 11500; - ac_build_ifcc(&ctx->ac, ena, ctx->merged_wrap_if_label); - - if (nested_barrier) { - /* Execute a barrier before the second shader in - * a merged shader. - * - * Execute the barrier inside the conditional block, - * so that empty waves can jump directly to s_endpgm, - * which will also signal the barrier. - * - * This is possible in gfx9, because an empty wave - * for the second shader does not participate in - * the epilogue. With NGG, empty waves may still - * be required to export data (e.g. GS output vertices), - * so we cannot let them exit early. - * - * If the shader is TCS and the TCS epilog is present - * and contains a barrier, it will wait there and then - * reach s_endpgm. - */ - si_llvm_emit_barrier(NULL, bld_base, NULL); - } - } - } - - if (sel->force_correct_derivs_after_kill) { - ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, ""); - /* true = don't kill. */ - LLVMBuildStore(ctx->ac.builder, ctx->i1true, - ctx->postponed_kill); - } - - if (sel->tokens) { - if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { - fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); - return false; - } - } else { - if (!si_nir_build_llvm(ctx, sel->nir)) { - fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); - return false; - } - } - - si_llvm_build_ret(ctx, ctx->return_value); - return true; -} - -/** - * Compute the VS prolog key, which contains all the information needed to - * build the VS prolog function, and set shader->info bits where needed. - * - * \param info Shader info of the vertex shader. - * \param num_input_sgprs Number of input SGPRs for the vertex shader. - * \param prolog_key Key of the VS prolog - * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. - * \param key Output shader part key. - */ -static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, - unsigned num_input_sgprs, - const struct si_vs_prolog_bits *prolog_key, - struct si_shader *shader_out, - union si_shader_part_key *key) -{ - memset(key, 0, sizeof(*key)); - key->vs_prolog.states = *prolog_key; - key->vs_prolog.num_input_sgprs = num_input_sgprs; - key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; - key->vs_prolog.as_ls = shader_out->key.as_ls; - key->vs_prolog.as_es = shader_out->key.as_es; - key->vs_prolog.as_ngg = shader_out->key.as_ngg; - - if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { - key->vs_prolog.as_ls = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 2; - } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { - key->vs_prolog.as_es = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } else if (shader_out->key.as_ngg) { - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } - - /* Enable loading the InstanceID VGPR. */ - uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); - - if ((key->vs_prolog.states.instance_divisor_is_one | - key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) - shader_out->info.uses_instanceid = true; -} - -/** - * Compute the PS prolog key, which contains all the information needed to - * build the PS prolog function, and set related bits in shader->config. - */ -static void si_get_ps_prolog_key(struct si_shader *shader, - union si_shader_part_key *key, - bool separate_prolog) -{ - struct tgsi_shader_info *info = &shader->selector->info; - - memset(key, 0, sizeof(*key)); - key->ps_prolog.states = shader->key.part.ps.prolog; - key->ps_prolog.colors_read = info->colors_read; - key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; - key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; - key->ps_prolog.wqm = info->uses_derivatives && - (key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear); - key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; - - if (info->colors_read) { - unsigned *color = shader->selector->color_attr_index; - - if (shader->key.part.ps.prolog.color_two_side) { - /* BCOLORs are stored after the last input. */ - key->ps_prolog.num_interp_inputs = info->num_inputs; - key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; - if (separate_prolog) - shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); - } - - for (unsigned i = 0; i < 2; i++) { - unsigned interp = info->input_interpolate[color[i]]; - unsigned location = info->input_interpolate_loc[color[i]]; - - if (!(info->colors_read & (0xf << i*4))) - continue; - - key->ps_prolog.color_attr_index[i] = color[i]; - - if (shader->key.part.ps.prolog.flatshade_colors && - interp == TGSI_INTERPOLATE_COLOR) - interp = TGSI_INTERPOLATE_CONSTANT; - - switch (interp) { - case TGSI_INTERPOLATE_CONSTANT: - key->ps_prolog.color_interp_vgpr_index[i] = -1; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - case TGSI_INTERPOLATE_COLOR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_persp_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = 0; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = 2; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = 4; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - case TGSI_INTERPOLATE_LINEAR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_linear_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_linear_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - /* The VGPR assignment for non-monolithic shaders - * works because InitialPSInputAddr is set on the - * main shader and PERSP_PULL_MODEL is never used. - */ - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 6 : 9; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 8 : 11; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 10 : 13; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - default: - assert(0); - } - } - } -} - -/** - * Check whether a PS prolog is required based on the key. - */ -static bool si_need_ps_prolog(const union si_shader_part_key *key) -{ - return key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear || - key->ps_prolog.states.poly_stipple || - key->ps_prolog.states.samplemask_log_ps_iter; -} - -/** - * Compute the PS epilog key, which contains all the information needed to - * build the PS epilog function. - */ -static void si_get_ps_epilog_key(struct si_shader *shader, - union si_shader_part_key *key) -{ - struct tgsi_shader_info *info = &shader->selector->info; - memset(key, 0, sizeof(*key)); - key->ps_epilog.colors_written = info->colors_written; - key->ps_epilog.writes_z = info->writes_z; - key->ps_epilog.writes_stencil = info->writes_stencil; - key->ps_epilog.writes_samplemask = info->writes_samplemask; - key->ps_epilog.states = shader->key.part.ps.epilog; -} - -/** - * Build the GS prolog function. Rotate the input vertices for triangle strips - * with adjacency. - */ -static void si_build_gs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - unsigned num_sgprs, num_vgprs; - struct si_function_info fninfo; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef returns[48]; - LLVMValueRef func, ret; - - si_init_function_info(&fninfo); - - if (ctx->screen->info.chip_class >= GFX9) { - if (key->gs_prolog.states.gfx9_prev_is_vs) - num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; - num_vgprs = 5; /* ES inputs are not needed by GS */ - } else { - num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - num_vgprs = 8; - } - - for (unsigned i = 0; i < num_sgprs; ++i) { - add_arg(&fninfo, ARG_SGPR, ctx->i32); - returns[i] = ctx->i32; - } - - for (unsigned i = 0; i < num_vgprs; ++i) { - add_arg(&fninfo, ARG_VGPR, ctx->i32); - returns[num_sgprs + i] = ctx->f32; - } - - /* Create the function. */ - si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, - &fninfo, 0); - func = ctx->main_fn; - - /* Set the full EXEC mask for the prolog, because we are only fiddling - * with registers here. The main shader part will set the correct EXEC - * mask. - */ - if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) - ac_init_exec_full_mask(&ctx->ac); - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (unsigned i = 0; i < num_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(builder, ret, p, i, ""); - } - for (unsigned i = 0; i < num_vgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); - } - - if (key->gs_prolog.states.tri_strip_adj_fix) { - /* Remap the input vertices for every other primitive. */ - const unsigned gfx6_vtx_params[6] = { - num_sgprs, - num_sgprs + 1, - num_sgprs + 3, - num_sgprs + 4, - num_sgprs + 5, - num_sgprs + 6 - }; - const unsigned gfx9_vtx_params[3] = { - num_sgprs, - num_sgprs + 1, - num_sgprs + 4, - }; - LLVMValueRef vtx_in[6], vtx_out[6]; - LLVMValueRef prim_id, rotate; - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); - vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); - } - } else { - for (unsigned i = 0; i < 6; i++) - vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]); - } - - prim_id = LLVMGetParam(func, num_sgprs + 2); - rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); - - for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated; - base = vtx_in[i]; - rotated = vtx_in[(i + 4) % 6]; - vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); - } - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef hi, out; - - hi = LLVMBuildShl(builder, vtx_out[i*2+1], - LLVMConstInt(ctx->i32, 16, 0), ""); - out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); - out = ac_to_float(&ctx->ac, out); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx9_vtx_params[i], ""); - } - } else { - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef out; + case PIPE_SHADER_FRAGMENT: + fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); + fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); + fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); + fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); + fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); + fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); + fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); + fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); + fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); + fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter); + fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); + fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); + fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); + fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); + fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); + fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); + fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); + fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); + fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center); + fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); + fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); + fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); + break; - out = ac_to_float(&ctx->ac, vtx_out[i]); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx6_vtx_params[i], ""); - } - } + default: + assert(0); } - LLVMBuildRet(builder, ret); + if ((shader_type == PIPE_SHADER_GEOMETRY || + shader_type == PIPE_SHADER_TESS_EVAL || + shader_type == PIPE_SHADER_VERTEX) && + !key->as_es && !key->as_ls) { + fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); + fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); + if (shader_type != PIPE_SHADER_GEOMETRY) + fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); + } } -/** - * Given a list of shader part functions, build a wrapper function that - * runs them in sequence to form a monolithic shader. - */ -static void si_build_wrapper_function(struct si_shader_context *ctx, - LLVMValueRef *parts, - unsigned num_parts, - unsigned main_part, - unsigned next_shader_first_part) -{ - LLVMBuilderRef builder = ctx->ac.builder; - /* PS epilog has one arg per color component; gfx9 merged shader - * prologs need to forward 32 user SGPRs. - */ - struct si_function_info fninfo; - LLVMValueRef initial[64], out[64]; - LLVMTypeRef function_type; - unsigned num_first_params; - unsigned num_out, initial_num_out; - ASSERTED unsigned num_out_sgpr; /* used in debug checks */ - ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ - unsigned num_sgprs, num_vgprs; - unsigned gprs; - - si_init_function_info(&fninfo); - - for (unsigned i = 0; i < num_parts; ++i) { - ac_add_function_attr(ctx->ac.context, parts[i], -1, - AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(parts[i], LLVMPrivateLinkage); - } - - /* The parameters of the wrapper function correspond to those of the - * first part in terms of SGPRs and VGPRs, but we use the types of the - * main part to get the right types. This is relevant for the - * dereferenceable attribute on descriptor table pointers. - */ - num_sgprs = 0; - num_vgprs = 0; - - function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_first_params = LLVMCountParamTypes(function_type); +static void si_optimize_vs_outputs(struct si_shader_context *ctx) +{ + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + unsigned skip_vs_optim_mask = 0; - for (unsigned i = 0; i < num_first_params; ++i) { - LLVMValueRef param = LLVMGetParam(parts[0], i); + if ((ctx->type != PIPE_SHADER_VERTEX && + ctx->type != PIPE_SHADER_TESS_EVAL) || + shader->key.as_ls || + shader->key.as_es) + return; - if (ac_is_sgpr_param(param)) { - assert(num_vgprs == 0); - num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } else { - num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + /* Optimizing these outputs is not possible, since they might be overriden + * at runtime with S_028644_PT_SPRITE_TEX. */ + for (int i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] == TGSI_SEMANTIC_PCOORD || + info->output_semantic_name[i] == TGSI_SEMANTIC_TEXCOORD) { + skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i]; } } - gprs = 0; - while (gprs < num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params); - LLVMTypeRef type = LLVMTypeOf(param); - unsigned size = ac_get_type_size(type) / 4; + ac_optimize_vs_outputs(&ctx->ac, + ctx->main_fn, + shader->info.vs_output_param_offset, + info->num_outputs, + skip_vs_optim_mask, + &shader->info.nr_param_exports); +} + +static bool si_vs_needs_prolog(const struct si_shader_selector *sel, + const struct si_vs_prolog_bits *prolog_key, + const struct si_shader_key *key, + bool ngg_cull_shader) +{ + /* VGPR initialization fixup for Vega10 and Raven is always done in the + * VS prolog. */ + return sel->vs_needs_prolog || + prolog_key->ls_vgpr_fix || + prolog_key->unpack_instance_id_from_vertex_id || + (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); +} - add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type); +static bool si_build_main_function(struct si_shader_context *ctx, + struct si_shader *shader, + struct nir_shader *nir, bool free_nir, + bool ngg_cull_shader) +{ + struct si_shader_selector *sel = shader->selector; + const struct si_shader_info *info = &sel->info; - assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); - assert(gprs + size <= num_sgprs + num_vgprs && - (gprs >= num_sgprs || gprs + size <= num_sgprs)); + ctx->shader = shader; + ctx->type = sel->type; - gprs += size; - } + ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); + ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); - /* Prepare the return type. */ - unsigned num_returns = 0; - LLVMTypeRef returns[32], last_func_type, return_type; + ctx->num_samplers = util_last_bit(info->samplers_declared); + ctx->num_images = util_last_bit(info->images_declared); - last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); - return_type = LLVMGetReturnType(last_func_type); + si_llvm_init_resource_callbacks(ctx); - switch (LLVMGetTypeKind(return_type)) { - case LLVMStructTypeKind: - num_returns = LLVMCountStructElementTypes(return_type); - assert(num_returns <= ARRAY_SIZE(returns)); - LLVMGetStructElementTypes(return_type, returns); + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); + break; + case PIPE_SHADER_TESS_CTRL: + si_llvm_init_tcs_callbacks(ctx); + break; + case PIPE_SHADER_TESS_EVAL: + si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); break; - case LLVMVoidTypeKind: + case PIPE_SHADER_GEOMETRY: + si_llvm_init_gs_callbacks(ctx); + break; + case PIPE_SHADER_FRAGMENT: + si_llvm_init_ps_callbacks(ctx); + break; + case PIPE_SHADER_COMPUTE: + ctx->abi.load_local_group_size = si_llvm_get_block_size; break; default: - unreachable("unexpected type"); + assert(!"Unsupported shader type"); + return false; } - si_create_function(ctx, "wrapper", returns, num_returns, &fninfo, - si_get_max_workgroup_size(ctx->shader)); - - if (is_merged_shader(ctx)) - ac_init_exec_full_mask(&ctx->ac); + si_create_function(ctx, ngg_cull_shader); - /* Record the arguments of the function as if they were an output of - * a previous part. - */ - num_out = 0; - num_out_sgpr = 0; + if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_esgs_ring(ctx); - for (unsigned i = 0; i < fninfo.num_params; ++i) { - LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); - LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32; - unsigned size = ac_get_type_size(param_type) / 4; - - if (size == 1) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->i32, ""); - param_type = ctx->i32; - } + if (ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_gs_rings(ctx); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + si_llvm_preload_tes_rings(ctx); - if (param_type != out_type) - param = LLVMBuildBitCast(builder, param, out_type, ""); - out[num_out++] = param; - } else { - LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + if (ctx->type == PIPE_SHADER_TESS_CTRL && + sel->info.tessfactors_are_def_in_all_invocs) { + for (unsigned i = 0; i < 6; i++) { + ctx->invoc0_tess_factors[i] = + ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + } + } - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->i64, ""); - param_type = ctx->i64; + if (ctx->type == PIPE_SHADER_GEOMETRY) { + for (unsigned i = 0; i < 4; i++) { + ctx->gs_next_vertex[i] = + ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + } + if (shader->key.as_ngg) { + for (unsigned i = 0; i < 4; ++i) { + ctx->gs_curprim_verts[i] = + ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + ctx->gs_generated_prims[i] = + ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); } - if (param_type != vector_type) - param = LLVMBuildBitCast(builder, param, vector_type, ""); + unsigned scratch_size = 8; + if (sel->so.num_outputs) + scratch_size = 44; - for (unsigned j = 0; j < size; ++j) - out[num_out++] = LLVMBuildExtractElement( - builder, param, LLVMConstInt(ctx->i32, j, 0), ""); - } + assert(!ctx->gs_ngg_scratch); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size); + ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, + ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - if (i < fninfo.num_sgpr_params) - num_out_sgpr = num_out; + ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module, + LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx->gs_ngg_emit, 4); + } } - memcpy(initial, out, sizeof(out)); - initial_num_out = num_out; - initial_num_out_sgpr = num_out_sgpr; - - /* Now chain the parts. */ - LLVMValueRef ret = NULL; - for (unsigned part = 0; part < num_parts; ++part) { - LLVMValueRef in[48]; - LLVMTypeRef ret_type; - unsigned out_idx = 0; - unsigned num_params = LLVMCountParams(parts[part]); - - /* Merged shaders are executed conditionally depending - * on the number of enabled threads passed in the input SGPRs. */ - if (is_multi_part_shader(ctx) && part == 0) { - LLVMValueRef ena, count = initial[3]; + if (ctx->type != PIPE_SHADER_GEOMETRY && + (shader->key.as_ngg && !shader->key.as_es)) { + /* Unconditionally declare scratch space base for streamout and + * vertex compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + if (!gfx10_is_ngg_passthrough(shader)) + si_llvm_declare_esgs_ring(ctx); - count = LLVMBuildAnd(builder, count, - LLVMConstInt(ctx->i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); + /* This is really only needed when streamout and / or vertex + * compaction is enabled. + */ + if (!ctx->gs_ngg_scratch && + (sel->so.num_outputs || shader->key.opt.ngg_culling)) { + LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8); + ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); } + } - /* Derive arguments for the next part from outputs of the - * previous one. - */ - for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { - LLVMValueRef param; - LLVMTypeRef param_type; - bool is_sgpr; - unsigned param_size; - LLVMValueRef arg = NULL; - - param = LLVMGetParam(parts[part], param_idx); - param_type = LLVMTypeOf(param); - param_size = ac_get_type_size(param_type) / 4; - is_sgpr = ac_is_sgpr_param(param); - - if (is_sgpr) { - ac_add_function_attr(ctx->ac.context, parts[part], - param_idx + 1, AC_FUNC_ATTR_INREG); - } else if (out_idx < num_out_sgpr) { - /* Skip returned SGPRs the current part doesn't - * declare on the input. */ - out_idx = num_out_sgpr; - } + /* For GFX9 merged shaders: + * - Set EXEC for the first shader. If the prolog is present, set + * EXEC there instead. + * - Add a barrier before the second shader. + * - In the second shader, reset EXEC to ~0 and wrap the main part in + * an if-statement. This is required for correctness in geometry + * shaders, to ensure that empty GS waves do not send GS_EMIT and + * GS_CUT messages. + * + * For monolithic merged shaders, the first shader is wrapped in an + * if-block together with its prolog in si_build_wrapper_function. + * + * NGG vertex and tess eval shaders running as the last + * vertex/geometry stage handle execution explicitly using + * if-statements. + */ + if (ctx->screen->info.chip_class >= GFX9) { + if (!shader->is_monolithic && + (shader->key.as_es || shader->key.as_ls) && + (ctx->type == PIPE_SHADER_TESS_EVAL || + (ctx->type == PIPE_SHADER_VERTEX && + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, ngg_cull_shader)))) { + si_init_exec_from_input(ctx, + ctx->merged_wave_info, 0); + } else if (ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_GEOMETRY || + (shader->key.as_ngg && !shader->key.as_es)) { + LLVMValueRef thread_enabled; + bool nested_barrier; - assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + if (!shader->is_monolithic || + (ctx->type == PIPE_SHADER_TESS_EVAL && + shader->key.as_ngg && !shader->key.as_es && + !shader->key.opt.ngg_culling)) + ac_init_exec_full_mask(&ctx->ac); - if (param_size == 1) - arg = out[out_idx]; - else - arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + if ((ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_TESS_EVAL) && + shader->key.as_ngg && !shader->key.as_es && + !shader->key.opt.ngg_culling) { + gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); - if (LLVMTypeOf(arg) != param_type) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(param_type) == - AC_ADDR_SPACE_CONST_32BIT) { - arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } else { - arg = LLVMBuildBitCast(builder, arg, ctx->i64, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } + /* Build the primitive export at the beginning + * of the shader if possible. + */ + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, NULL); + } + + if (ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_GEOMETRY) { + if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { + gfx10_ngg_gs_emit_prologue(ctx); + nested_barrier = false; } else { - arg = LLVMBuildBitCast(builder, arg, param_type, ""); + nested_barrier = true; } + + thread_enabled = si_is_gs_thread(ctx); + } else { + thread_enabled = si_is_es_thread(ctx); + nested_barrier = false; } - in[param_idx] = arg; - out_idx += param_size; + ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); + ctx->merged_wrap_if_label = 11500; + ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); + + if (nested_barrier) { + /* Execute a barrier before the second shader in + * a merged shader. + * + * Execute the barrier inside the conditional block, + * so that empty waves can jump directly to s_endpgm, + * which will also signal the barrier. + * + * This is possible in gfx9, because an empty wave + * for the second shader does not participate in + * the epilogue. With NGG, empty waves may still + * be required to export data (e.g. GS output vertices), + * so we cannot let them exit early. + * + * If the shader is TCS and the TCS epilog is present + * and contains a barrier, it will wait there and then + * reach s_endpgm. + */ + si_llvm_emit_barrier(ctx); + } } + } - ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + if (sel->force_correct_derivs_after_kill) { + ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); + /* true = don't kill. */ + LLVMBuildStore(ctx->ac.builder, ctx->ac.i1true, + ctx->postponed_kill); + } - if (is_multi_part_shader(ctx) && - part + 1 == next_shader_first_part) { - ac_build_endif(&ctx->ac, 6506); + bool success = si_nir_build_llvm(ctx, nir); + if (free_nir) + ralloc_free(nir); + if (!success) { + fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); + return false; + } - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. - * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - continue; - } + si_llvm_build_ret(ctx, ctx->return_value); + return true; +} - /* Extract the returned GPRs. */ - ret_type = LLVMTypeOf(ret); - num_out = 0; - num_out_sgpr = 0; - - if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { - assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); - - unsigned ret_size = LLVMCountStructElementTypes(ret_type); - - for (unsigned i = 0; i < ret_size; ++i) { - LLVMValueRef val = - LLVMBuildExtractValue(builder, ret, i, ""); - - assert(num_out < ARRAY_SIZE(out)); - out[num_out++] = val; - - if (LLVMTypeOf(val) == ctx->i32) { - assert(num_out_sgpr + 1 == num_out); - num_out_sgpr = num_out; - } - } - } +/** + * Compute the VS prolog key, which contains all the information needed to + * build the VS prolog function, and set shader->info bits where needed. + * + * \param info Shader info of the vertex shader. + * \param num_input_sgprs Number of input SGPRs for the vertex shader. + * \param has_old_ Whether the preceding shader part is the NGG cull shader. + * \param prolog_key Key of the VS prolog + * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. + * \param key Output shader part key. + */ +static void si_get_vs_prolog_key(const struct si_shader_info *info, + unsigned num_input_sgprs, + bool ngg_cull_shader, + const struct si_vs_prolog_bits *prolog_key, + struct si_shader *shader_out, + union si_shader_part_key *key) +{ + memset(key, 0, sizeof(*key)); + key->vs_prolog.states = *prolog_key; + key->vs_prolog.num_input_sgprs = num_input_sgprs; + key->vs_prolog.num_inputs = info->num_inputs; + key->vs_prolog.as_ls = shader_out->key.as_ls; + key->vs_prolog.as_es = shader_out->key.as_es; + key->vs_prolog.as_ngg = shader_out->key.as_ngg; + + if (ngg_cull_shader) { + key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); + key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); + } else { + key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; } - /* Return the value from the last part. */ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(builder); - else - LLVMBuildRet(builder, ret); + if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { + key->vs_prolog.as_ls = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 2; + } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { + key->vs_prolog.as_es = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } else if (shader_out->key.as_ngg) { + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } + + /* Enable loading the InstanceID VGPR. */ + uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); + + if ((key->vs_prolog.states.instance_divisor_is_one | + key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) + shader_out->info.uses_instanceid = true; } static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, @@ -6935,67 +1704,112 @@ sel->info.num_memory_instructions > 1000; } -int si_compile_tgsi_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, + bool *free_nir) +{ + *free_nir = false; + + if (sel->nir) { + return sel->nir; + } else if (sel->nir_binary) { + struct pipe_screen *screen = &sel->screen->b; + const void *options = + screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, + sel->type); + + struct blob_reader blob_reader; + blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); + *free_nir = true; + return nir_deserialize(NULL, options, &blob_reader); + } + return NULL; +} + +static bool si_llvm_compile_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *shader, + struct pipe_debug_callback *debug, + struct nir_shader *nir, + bool free_nir) { struct si_shader_selector *sel = shader->selector; struct si_shader_context ctx; - int r = -1; - - /* Dump TGSI code before doing TGSI->LLVM conversion in case the - * conversion fails. */ - if (si_can_dump_shader(sscreen, sel->type) && - !(sscreen->debug_flags & DBG(NO_TGSI))) { - if (sel->tokens) - tgsi_dump(sel->tokens, 0); - else - nir_print_shader(sel->nir, stderr); - si_dump_streamout(&sel->so); - } - si_init_shader_ctx(&ctx, sscreen, compiler, si_get_shader_wave_size(shader), - sel->nir != NULL); - si_llvm_context_set_ir(&ctx, shader); + si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); - - shader->info.uses_instanceid = sel->info.uses_instanceid; + LLVMValueRef ngg_cull_main_fn = NULL; + if (shader->key.opt.ngg_culling) { + if (!si_build_main_function(&ctx, shader, nir, false, true)) { + si_llvm_dispose(&ctx); + return false; + } + ngg_cull_main_fn = ctx.main_fn; + ctx.main_fn = NULL; + } - if (!si_compile_tgsi_main(&ctx)) { + if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) { si_llvm_dispose(&ctx); - return -1; + return false; } if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { - LLVMValueRef parts[2]; - bool need_prolog = sel->vs_needs_prolog; - - parts[1] = ctx.main_fn; + LLVMValueRef parts[4]; + unsigned num_parts = 0; + bool has_prolog = false; + LLVMValueRef main_fn = ctx.main_fn; + + if (ngg_cull_main_fn) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, true)) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&sel->info, + shader->info.num_input_sgprs, + true, + &shader->key.part.vs.prolog, + shader, &prolog_key); + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; + } + parts[num_parts++] = ngg_cull_main_fn; + } - if (need_prolog) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, + &shader->key, false)) { union si_shader_part_key prolog_key; si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, + false, &shader->key.part.vs.prolog, shader, &prolog_key); - si_build_vs_prolog_function(&ctx, &prolog_key); - parts[0] = ctx.main_fn; + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; } + parts[num_parts++] = main_fn; + + si_build_wrapper_function(&ctx, parts, num_parts, + has_prolog ? 1 : 0, 0); + + if (ctx.shader->key.opt.vs_as_prim_discard_cs) + si_build_prim_discard_compute_shader(&ctx); + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && + ngg_cull_main_fn) { + LLVMValueRef parts[2]; - si_build_wrapper_function(&ctx, parts + !need_prolog, - 1 + need_prolog, need_prolog, 0); + parts[0] = ngg_cull_main_fn; + parts[1] = ctx.main_fn; - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); + si_build_wrapper_function(&ctx, parts, 2, 0, 0); } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; LLVMValueRef parts[4]; bool vs_needs_prolog = - si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, + &shader->key, false); /* TCS main part */ parts[2] = ctx.main_fn; @@ -7004,21 +1818,21 @@ union si_shader_part_key tcs_epilog_key; memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); + si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); parts[3] = ctx.main_fn; /* VS as LS main part */ + nir = get_nir_shader(ls, &free_nir); struct si_shader shader_ls = {}; shader_ls.selector = ls; shader_ls.key.as_ls = 1; shader_ls.key.mono = shader->key.mono; shader_ls.key.opt = shader->key.opt; shader_ls.is_monolithic = true; - si_llvm_context_set_ir(&ctx, &shader_ls); - if (!si_compile_tgsi_main(&ctx)) { + if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) { si_llvm_dispose(&ctx); - return -1; + return false; } shader->info.uses_instanceid |= ls->info.uses_instanceid; parts[1] = ctx.main_fn; @@ -7028,10 +1842,11 @@ union si_shader_part_key vs_prolog_key; si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, + false, &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &vs_prolog_key); + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); parts[0] = ctx.main_fn; } @@ -7051,7 +1866,7 @@ memset(&epilog_key, 0, sizeof(epilog_key)); epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_build_tcs_epilog_function(&ctx, &epilog_key); + si_llvm_build_tcs_epilog(&ctx, &epilog_key); parts[1] = ctx.main_fn; si_build_wrapper_function(&ctx, parts, 2, 0, 0); @@ -7070,10 +1885,11 @@ gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; gs_prolog_key.gs_prolog.is_monolithic = true; gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - si_build_gs_prolog_function(&ctx, &gs_prolog_key); + si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); gs_prolog = ctx.main_fn; /* ES main part */ + nir = get_nir_shader(es, &free_nir); struct si_shader shader_es = {}; shader_es.selector = es; shader_es.key.as_es = 1; @@ -7081,24 +1897,26 @@ shader_es.key.mono = shader->key.mono; shader_es.key.opt = shader->key.opt; shader_es.is_monolithic = true; - si_llvm_context_set_ir(&ctx, &shader_es); - if (!si_compile_tgsi_main(&ctx)) { + if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) { si_llvm_dispose(&ctx); - return -1; + return false; } shader->info.uses_instanceid |= es->info.uses_instanceid; es_main = ctx.main_fn; /* ES prolog */ - if (es->vs_needs_prolog) { + if (es->type == PIPE_SHADER_VERTEX && + si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, + &shader->key, false)) { union si_shader_part_key vs_prolog_key; si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, + false, &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &vs_prolog_key); + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); es_prolog = ctx.main_fn; } @@ -7127,33 +1945,13 @@ memset(&prolog_key, 0, sizeof(prolog_key)); prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_build_gs_prolog_function(&ctx, &prolog_key); + si_llvm_build_gs_prolog(&ctx, &prolog_key); parts[0] = ctx.main_fn; si_build_wrapper_function(&ctx, parts, 2, 1, 0); } } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { - LLVMValueRef parts[3]; - union si_shader_part_key prolog_key; - union si_shader_part_key epilog_key; - bool need_prolog; - - si_get_ps_prolog_key(shader, &prolog_key, false); - need_prolog = si_need_ps_prolog(&prolog_key); - - parts[need_prolog ? 1 : 0] = ctx.main_fn; - - if (need_prolog) { - si_build_ps_prolog_function(&ctx, &prolog_key); - parts[0] = ctx.main_fn; - } - - si_get_ps_epilog_key(shader, &epilog_key); - si_build_ps_epilog_function(&ctx, &epilog_key); - parts[need_prolog ? 2 : 1] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, - need_prolog ? 1 : 0, 0); + si_llvm_build_monolithic_ps(&ctx, shader); } si_llvm_optimize_module(&ctx); @@ -7172,24 +1970,55 @@ LLVMPointerTypeKind); /* Compile to bytecode. */ - r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - ctx.ac.module, debug, ctx.type, ctx.ac.wave_size, - si_get_shader_name(shader), - si_should_optimize_less(compiler, shader->selector)); - si_llvm_dispose(&ctx); - if (r) { + if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, + &ctx.ac, debug, ctx.type, si_get_shader_name(shader), + si_should_optimize_less(compiler, shader->selector))) { + si_llvm_dispose(&ctx); fprintf(stderr, "LLVM failed to compile shader\n"); - return r; + return false; + } + + si_llvm_dispose(&ctx); + return true; +} + +bool si_compile_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct si_shader_selector *sel = shader->selector; + bool free_nir; + struct nir_shader *nir = get_nir_shader(sel, &free_nir); + + /* Dump NIR before doing NIR->LLVM conversion in case the + * conversion fails. */ + if (si_can_dump_shader(sscreen, sel->type) && + !(sscreen->debug_flags & DBG(NO_NIR))) { + nir_print_shader(nir, stderr); + si_dump_streamout(&sel->so); } + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(shader->info.vs_output_param_offset)); + + shader->info.uses_instanceid = sel->info.uses_instanceid; + + /* TODO: ACO could compile non-monolithic shaders here (starting + * with PS and NGG VS), but monolithic shaders should be compiled + * by LLVM due to more complicated compilation. + */ + if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) + return false; + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. * LLVM 3.9svn has this bug. */ if (sel->type == PIPE_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; - unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, - wave_size); - unsigned max_sgprs = ac_get_num_physical_sgprs(&sscreen->info); + unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd * + (wave_size == 32 ? 2 : 1); + unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; unsigned max_sgprs_per_wave = 128; unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ unsigned threads_per_tg = si_get_max_workgroup_size(shader); @@ -7216,56 +2045,19 @@ } /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx)) + if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) shader->info.num_input_sgprs += 1; /* scratch byte offset */ /* Calculate the number of fragment input VGPRs. */ - if (ctx.type == PIPE_SHADER_FRAGMENT) { - shader->info.num_input_vgprs = 0; - shader->info.face_vgpr_index = -1; - shader->info.ancillary_vgpr_index = -1; - - if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 3; - if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 2; - if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { - shader->info.face_vgpr_index = shader->info.num_input_vgprs; - shader->info.num_input_vgprs += 1; - } - if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) { - shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs; - shader->info.num_input_vgprs += 1; - } - if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; - if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) - shader->info.num_input_vgprs += 1; + if (sel->type == PIPE_SHADER_FRAGMENT) { + shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config, + &shader->info.face_vgpr_index, + &shader->info.ancillary_vgpr_index); } si_calculate_max_simd_waves(shader); si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - return 0; + return true; } /** @@ -7295,12 +2087,12 @@ { struct si_shader_part *result; - mtx_lock(&sscreen->shader_parts_mutex); + simple_mtx_lock(&sscreen->shader_parts_mutex); /* Find existing. */ for (result = *list; result; result = result->next) { if (memcmp(&result->key, key, sizeof(*key)) == 0) { - mtx_unlock(&sscreen->shader_parts_mutex); + simple_mtx_unlock(&sscreen->shader_parts_mutex); return result; } } @@ -7309,7 +2101,11 @@ result = CALLOC_STRUCT(si_shader_part); result->key = *key; + struct si_shader_selector sel = {}; + sel.screen = sscreen; + struct si_shader shader = {}; + shader.selector = &sel; switch (type) { case PIPE_SHADER_VERTEX: @@ -7336,10 +2132,9 @@ } struct si_shader_context ctx; - si_init_shader_ctx(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, type, shader.key.as_ngg, - shader.key.as_es), - false); + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, type, shader.key.as_ngg, + shader.key.as_es)); ctx.shader = &shader; ctx.type = type; @@ -7348,9 +2143,8 @@ /* Compile. */ si_llvm_optimize_module(&ctx); - if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler, - ctx.ac.module, debug, ctx.type, ctx.ac.wave_size, - name, false)) { + if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, + &ctx.ac, debug, ctx.type, name, false)) { FREE(result); result = NULL; goto out; @@ -7361,230 +2155,10 @@ out: si_llvm_dispose(&ctx); - mtx_unlock(&sscreen->shader_parts_mutex); + simple_mtx_unlock(&sscreen->shader_parts_mutex); return result; } -static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) -{ - LLVMValueRef ptr[2], list; - bool merged_shader = is_merged_shader(ctx); - - ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], - ac_array_in_const32_addr_space(ctx->v4i32), ""); - return list; -} - -/** - * Build the vertex shader prolog function. - * - * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). - * All inputs are returned unmodified. The vertex load indices are - * stored after them, which will be used by the API VS for fetching inputs. - * - * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: - * input_v0, - * input_v1, - * input_v2, - * input_v3, - * (VertexID + BaseVertex), - * (InstanceID + StartInstance), - * (InstanceID / 2 + StartInstance) - */ -static void si_build_vs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - struct si_function_info fninfo; - LLVMTypeRef *returns; - LLVMValueRef ret, func; - int num_returns, i; - unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; - unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; - LLVMValueRef input_vgprs[9]; - unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + - num_input_vgprs; - unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; - - si_init_function_info(&fninfo); - - /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ - returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) * - sizeof(LLVMTypeRef)); - num_returns = 0; - - /* Declare input and output SGPRs. */ - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - add_arg(&fninfo, ARG_SGPR, ctx->i32); - returns[num_returns++] = ctx->i32; - } - - /* Preloaded VGPRs (outputs must be floats) */ - for (i = 0; i < num_input_vgprs; i++) { - add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); - returns[num_returns++] = ctx->f32; - } - - /* Vertex load indices. */ - for (i = 0; i <= key->vs_prolog.last_input; i++) - returns[num_returns++] = ctx->f32; - - /* Create the function. */ - si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0); - func = ctx->main_fn; - - if (key->vs_prolog.num_merged_next_stage_vgprs) { - if (!key->vs_prolog.is_monolithic) - si_init_exec_from_input(ctx, 3, 0); - - if (key->vs_prolog.as_ls && - ctx->screen->has_ls_vgpr_init_bug) { - /* If there are no HS threads, SPI loads the LS VGPRs - * starting at VGPR 0. Shift them back to where they - * belong. - */ - LLVMValueRef has_hs_threads = - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - si_unpack_param(ctx, 3, 8, 8), - ctx->i32_0, ""); - - for (i = 4; i > 0; --i) { - input_vgprs[i + 1] = - LLVMBuildSelect(ctx->ac.builder, has_hs_threads, - input_vgprs[i + 1], - input_vgprs[i - 1], ""); - } - } - } - - unsigned vertex_id_vgpr = first_vs_vgpr; - unsigned instance_id_vgpr = - ctx->screen->info.chip_class >= GFX10 ? - first_vs_vgpr + 3 : - first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1); - - ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; - ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; - - /* InstanceID = VertexID >> 16; - * VertexID = VertexID & 0xffff; - */ - if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { - ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->i32, 16, 0), ""); - ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->i32, 0xffff, 0), ""); - } - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - for (i = 0; i < num_input_vgprs; i++) { - LLVMValueRef p = input_vgprs[i]; - - if (i == vertex_id_vgpr) - p = ctx->abi.vertex_id; - else if (i == instance_id_vgpr) - p = ctx->abi.instance_id; - - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, - key->vs_prolog.num_input_sgprs + i, ""); - } - - LLVMValueRef original_ret = ret; - bool wrapped = false; - LLVMBasicBlockRef if_entry_block = NULL; - - if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) { - LLVMValueRef num_threads; - LLVMValueRef ena; - - num_threads = si_unpack_param(ctx, 3, 0, 8); - ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), num_threads, ""); - if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); - ac_build_ifcc(&ctx->ac, ena, 11501); - wrapped = true; - } - - /* Compute vertex load indices from instance divisors. */ - LLVMValueRef instance_divisor_constbuf = NULL; - - if (key->vs_prolog.states.instance_divisor_is_fetched) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - LLVMValueRef buf_index = - LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); - instance_divisor_constbuf = - ac_build_load_to_sgpr(&ctx->ac, list, buf_index); - } - - for (i = 0; i <= key->vs_prolog.last_input; i++) { - bool divisor_is_one = - key->vs_prolog.states.instance_divisor_is_one & (1u << i); - bool divisor_is_fetched = - key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index = NULL; - - if (divisor_is_one) { - index = ctx->abi.instance_id; - } else if (divisor_is_fetched) { - LLVMValueRef udiv_factors[4]; - - for (unsigned j = 0; j < 4; j++) { - udiv_factors[j] = - buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->i32, i*16 + j*4, 0)); - udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); - } - /* The faster NUW version doesn't work when InstanceID == UINT_MAX. - * Such InstanceID might not be achievable in a reasonable time though. - */ - index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, - udiv_factors[0], udiv_factors[1], - udiv_factors[2], udiv_factors[3]); - } - - if (divisor_is_one || divisor_is_fetched) { - /* Add StartInstance. */ - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMGetParam(ctx->main_fn, user_sgpr_base + - SI_SGPR_START_INSTANCE), ""); - } else { - /* VertexID + BaseVertex */ - index = LLVMBuildAdd(ctx->ac.builder, - ctx->abi.vertex_id, - LLVMGetParam(func, user_sgpr_base + - SI_SGPR_BASE_VERTEX), ""); - } - - index = ac_to_float(&ctx->ac, index); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, - fninfo.num_params + i, ""); - } - - if (wrapped) { - LLVMBasicBlockRef bbs[2] = { - LLVMGetInsertBlock(ctx->ac.builder), - if_entry_block, - }; - ac_build_endif(&ctx->ac, 11501); - - LLVMValueRef values[2] = { - ret, - original_ret - }; - ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs); - } - - si_llvm_build_ret(ctx, ret); -} - static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, @@ -7594,18 +2168,18 @@ { struct si_shader_selector *vs = main_part->selector; - if (!si_vs_needs_prolog(vs, key)) + if (!si_vs_needs_prolog(vs, key, &shader->key, false)) return true; /* Get the prolog. */ union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, + si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader, &prolog_key); shader->prolog = si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key, compiler, - debug, si_build_vs_prolog_function, + debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); return shader->prolog != NULL; } @@ -7623,81 +2197,6 @@ } /** - * Compile the TCS epilog function. This writes tesselation factors to memory - * based on the output primitive type of the tesselator (determined by TES). - */ -static void si_build_tcs_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct si_function_info fninfo; - LLVMValueRef func; - - si_init_function_info(&fninfo); - - if (ctx->screen->info.chip_class >= GFX9) { - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */ - ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - } else { - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); - } - - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ - unsigned tess_factors_idx = - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */ - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ - - for (unsigned i = 0; i < 6; i++) - add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ - - /* Create the function. */ - si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, - ctx->screen->info.chip_class >= GFX7 ? 128 : 0); - ac_declare_lds_as_pointer(&ctx->ac); - func = ctx->main_fn; - - LLVMValueRef invoc0_tess_factors[6]; - for (unsigned i = 0; i < 6; i++) - invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); - - si_write_tess_factors(bld_base, - LLVMGetParam(func, tess_factors_idx), - LLVMGetParam(func, tess_factors_idx + 1), - LLVMGetParam(func, tess_factors_idx + 2), - invoc0_tess_factors, invoc0_tess_factors + 4); - - LLVMBuildRetVoid(ctx->ac.builder); -} - -/** * Select and compile (or reuse) TCS parts (epilog). */ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, @@ -7724,7 +2223,7 @@ shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, PIPE_SHADER_TESS_CTRL, false, &epilog_key, compiler, debug, - si_build_tcs_epilog_function, + si_llvm_build_tcs_epilog, "Tessellation Control Shader Epilog"); return shader->epilog != NULL; } @@ -7765,387 +2264,175 @@ shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key, compiler, debug, - si_build_gs_prolog_function, + si_llvm_build_gs_prolog, "Geometry Shader Prolog"); return shader->prolog2 != NULL; } /** - * Build the pixel shader prolog function. This handles: - * - two-side color selection and interpolation - * - overriding interpolation parameters for the API PS - * - polygon stippling - * - * All preloaded SGPRs and VGPRs are passed through unmodified unless they are - * overriden by other states. (e.g. per-sample interpolation) - * Interpolated colors are stored after the preloaded VGPRs. + * Compute the PS prolog key, which contains all the information needed to + * build the PS prolog function, and set related bits in shader->config. */ -static void si_build_ps_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_get_ps_prolog_key(struct si_shader *shader, + union si_shader_part_key *key, + bool separate_prolog) { - struct si_function_info fninfo; - LLVMValueRef ret, func; - int num_returns, i, num_color_channels; - - assert(si_need_ps_prolog(key)); - - si_init_function_info(&fninfo); - - /* Declare inputs. */ - for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) - add_arg(&fninfo, ARG_SGPR, ctx->i32); - - for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) - add_arg(&fninfo, ARG_VGPR, ctx->f32); - - /* Declare outputs (same as inputs + add colors if needed) */ - num_returns = fninfo.num_params; - num_color_channels = util_bitcount(key->ps_prolog.colors_read); - for (i = 0; i < num_color_channels; i++) - fninfo.types[num_returns++] = ctx->f32; + struct si_shader_info *info = &shader->selector->info; - /* Create the function. */ - si_create_function(ctx, "ps_prolog", fninfo.types, num_returns, - &fninfo, 0); - func = ctx->main_fn; + memset(key, 0, sizeof(*key)); + key->ps_prolog.states = shader->key.part.ps.prolog; + key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + key->ps_prolog.wqm = info->uses_derivatives && + (key->ps_prolog.colors_read || + key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear); + key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < fninfo.num_params; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - - /* Polygon stippling. */ - if (key->ps_prolog.states.poly_stipple) { - /* POS_FIXED_PT is always last. */ - unsigned pos = key->ps_prolog.num_input_sgprs + - key->ps_prolog.num_input_vgprs - 1; - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - - si_llvm_emit_polygon_stipple(ctx, list, pos); - } - - if (key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef center[2], centroid[2], tmp, bc_optimize; - - /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; - * The hw doesn't compute CENTROID if the whole wave only - * contains fully-covered quads. - * - * PRIM_MASK is after user SGPRs. - */ - bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, - LLVMConstInt(ctx->i32, 31, 0), ""); - bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, - ctx->i1, ""); + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; - if (key->ps_prolog.states.bc_optimize_for_persp) { - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 2 + i); - /* Read PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 4 + i); - /* Select PERSP_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 4 + i, ""); - } - } - if (key->ps_prolog.states.bc_optimize_for_linear) { - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 8 + i); - /* Read LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 10 + i); - /* Select LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 10 + i, ""); - } + if (shader->key.part.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + key->ps_prolog.num_interp_inputs = info->num_inputs; + key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + if (separate_prolog) + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); } - } - - /* Force per-sample interpolation. */ - if (key->ps_prolog.states.force_persp_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_sample[2]; - - /* Read PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - persp_sample[i] = LLVMGetParam(func, base + i); - /* Overwrite PERSP_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 2 + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_sample[2]; - - /* Read LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - linear_sample[i] = LLVMGetParam(func, base + 6 + i); - /* Overwrite LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 8 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 10 + i, ""); - } - - /* Force center interpolation. */ - if (key->ps_prolog.states.force_persp_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_center[2]; - - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - persp_center[i] = LLVMGetParam(func, base + 2 + i); - /* Overwrite PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_center[2]; - - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - linear_center[i] = LLVMGetParam(func, base + 8 + i); - /* Overwrite LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 6 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 10 + i, ""); - } - - /* Interpolate colors. */ - unsigned color_out_idx = 0; - for (i = 0; i < 2; i++) { - unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; - unsigned face_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.face_vgpr_index; - LLVMValueRef interp[2], color[4]; - LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; - if (!writemask) - continue; + for (unsigned i = 0; i < 2; i++) { + unsigned interp = info->input_interpolate[color[i]]; + unsigned location = info->input_interpolate_loc[color[i]]; - /* If the interpolation qualifier is not CONSTANT (-1). */ - if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { - unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.color_interp_vgpr_index[i]; + if (!(info->colors_read & (0xf << i*4))) + continue; - /* Get the (i,j) updated by bc_optimize handling. */ - interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr, ""); - interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr + 1, ""); - interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); - } + key->ps_prolog.color_attr_index[i] = color[i]; - /* Use the absolute location of the input. */ - prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + if (shader->key.part.ps.prolog.flatshade_colors && + interp == TGSI_INTERPOLATE_COLOR) + interp = TGSI_INTERPOLATE_CONSTANT; - if (key->ps_prolog.states.color_two_side) { - face = LLVMGetParam(func, face_vgpr); - face = ac_to_integer(&ctx->ac, face); - } + switch (interp) { + case TGSI_INTERPOLATE_CONSTANT: + key->ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_persp_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; - interp_fs_input(ctx, - key->ps_prolog.color_attr_index[i], - TGSI_SEMANTIC_COLOR, i, - key->ps_prolog.num_interp_inputs, - key->ps_prolog.colors_read, interp_ij, - prim_mask, face, color); + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = 0; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = 2; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = 4; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_linear_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_linear_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; - while (writemask) { - unsigned chan = u_bit_scan(&writemask); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], - fninfo.num_params + color_out_idx++, ""); + /* The VGPR assignment for non-monolithic shaders + * works because InitialPSInputAddr is set on the + * main shader and PERSP_PULL_MODEL is never used. + */ + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 6 : 9; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 8 : 11; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 10 : 13; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + default: + assert(0); + } } } - - /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec - * says: - * - * "When per-sample shading is active due to the use of a fragment - * input qualified by sample or due to the use of the gl_SampleID - * or gl_SamplePosition variables, only the bit for the current - * sample is set in gl_SampleMaskIn. When state specifies multiple - * fragment shader invocations for a given fragment, the sample - * mask for any single fragment shader invocation may specify a - * subset of the covered samples for the fragment. In this case, - * the bit corresponding to each covered sample will be set in - * exactly one fragment shader invocation." - * - * The samplemask loaded by hardware is always the coverage of the - * entire pixel/fragment, so mask bits out based on the sample ID. - */ - if (key->ps_prolog.states.samplemask_log_ps_iter) { - /* The bit pattern matches that used by fixed function fragment - * processing. */ - static const uint16_t ps_iter_masks[] = { - 0xffff, /* not used */ - 0x5555, - 0x1111, - 0x0101, - 0x0001, - }; - assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); - - uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; - unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.ancillary_vgpr_index; - LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4); - LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1); - - samplemask = ac_to_integer(&ctx->ac, samplemask); - samplemask = LLVMBuildAnd( - ctx->ac.builder, - samplemask, - LLVMBuildShl(ctx->ac.builder, - LLVMConstInt(ctx->i32, ps_iter_mask, false), - sampleid, ""), - ""); - samplemask = ac_to_float(&ctx->ac, samplemask); - - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, - ancillary_vgpr + 1, ""); - } - - /* Tell LLVM to insert WQM instruction sequence when needed. */ - if (key->ps_prolog.wqm) { - LLVMAddTargetDependentFunctionAttr(func, - "amdgpu-ps-wqm-outputs", ""); - } - - si_llvm_build_ret(ctx, ret); } /** - * Build the pixel shader epilog function. This handles everything that must be - * emulated for pixel shader exports. (alpha-test, format conversions, etc) + * Check whether a PS prolog is required based on the key. */ -static void si_build_ps_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) +bool si_need_ps_prolog(const union si_shader_part_key *key) { - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct si_function_info fninfo; - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - int i; - struct si_ps_exports exp = {}; - - si_init_function_info(&fninfo); - - /* Declare input SGPRs. */ - ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr); - add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); - - /* Declare input VGPRs. */ - unsigned required_num_params = - fninfo.num_sgpr_params + - util_bitcount(key->ps_epilog.colors_written) * 4 + - key->ps_epilog.writes_z + - key->ps_epilog.writes_stencil + - key->ps_epilog.writes_samplemask; - - required_num_params = MAX2(required_num_params, - fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - while (fninfo.num_params < required_num_params) - add_arg(&fninfo, ARG_VGPR, ctx->f32); - - /* Create the function. */ - si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0); - /* Disable elimination of unused inputs. */ - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", 0xffffff); - - /* Process colors. */ - unsigned vgpr = fninfo.num_sgpr_params; - unsigned colors_written = key->ps_epilog.colors_written; - int last_color_export = -1; - - /* Find the last color export. */ - if (!key->ps_epilog.writes_z && - !key->ps_epilog.writes_stencil && - !key->ps_epilog.writes_samplemask) { - unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { - /* Just set this if any of the colorbuffers are enabled. */ - if (spi_format & - ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) - last_color_export = 0; - } else { - for (i = 0; i < 8; i++) - if (colors_written & (1 << i) && - (spi_format >> (i * 4)) & 0xf) - last_color_export = i; - } - } - - while (colors_written) { - LLVMValueRef color[4]; - int mrt = u_bit_scan(&colors_written); - - for (i = 0; i < 4; i++) - color[i] = LLVMGetParam(ctx->main_fn, vgpr++); - - si_export_mrt_color(bld_base, color, mrt, - fninfo.num_params - 1, - mrt == last_color_export, &exp); - } - - /* Process depth, stencil, samplemask. */ - if (key->ps_epilog.writes_z) - depth = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_stencil) - stencil = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_samplemask) - samplemask = LLVMGetParam(ctx->main_fn, vgpr++); - - if (depth || stencil || samplemask) - si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); - else if (last_color_export == -1) - ac_build_export_null(&ctx->ac); - - if (exp.num) - si_emit_ps_exports(ctx, &exp); + return key->ps_prolog.colors_read || + key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear || + key->ps_prolog.states.poly_stipple || + key->ps_prolog.states.samplemask_log_ps_iter; +} - /* Compile. */ - LLVMBuildRetVoid(ctx->ac.builder); +/** + * Compute the PS epilog key, which contains all the information needed to + * build the PS epilog function. + */ +void si_get_ps_epilog_key(struct si_shader *shader, + union si_shader_part_key *key) +{ + struct si_shader_info *info = &shader->selector->info; + memset(key, 0, sizeof(*key)); + key->ps_epilog.colors_written = info->colors_written; + key->ps_epilog.writes_z = info->writes_z; + key->ps_epilog.writes_stencil = info->writes_stencil; + key->ps_epilog.writes_samplemask = info->writes_samplemask; + key->ps_epilog.states = shader->key.part.ps.epilog; } /** @@ -8168,7 +2455,7 @@ si_get_shader_part(sscreen, &sscreen->ps_prologs, PIPE_SHADER_FRAGMENT, true, &prolog_key, compiler, debug, - si_build_ps_prolog_function, + si_llvm_build_ps_prolog, "Fragment Shader Prolog"); if (!shader->prolog) return false; @@ -8181,7 +2468,7 @@ si_get_shader_part(sscreen, &sscreen->ps_epilogs, PIPE_SHADER_FRAGMENT, false, &epilog_key, compiler, debug, - si_build_ps_epilog_function, + si_llvm_build_ps_epilog, "Fragment Shader Epilog"); if (!shader->epilog) return false; @@ -8268,8 +2555,7 @@ *lds_size = MAX2(*lds_size, 8); } -static void si_fix_resource_usage(struct si_screen *sscreen, - struct si_shader *shader) +void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) { unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ @@ -8282,13 +2568,13 @@ } } -bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +bool si_create_shader_variant(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *shader, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); - int r; /* LS, ES, VS are compiled on demand if the main part hasn't been * compiled for that stage. @@ -8303,8 +2589,7 @@ /* Monolithic shader (compiled as a whole, has many variants, * may take a long time to compile). */ - r = si_compile_tgsi_shader(sscreen, compiler, shader, debug); - if (r) + if (!si_compile_shader(sscreen, compiler, shader, debug)) return false; } else { /* The shader consists of several parts: @@ -8325,7 +2610,7 @@ if (!mainp) return false; - /* Copy the compiled TGSI shader data over. */ + /* Copy the compiled shader data over. */ shader->is_binary_shared = true; shader->binary = mainp->binary; shader->config = mainp->config; @@ -8430,6 +2715,15 @@ return true; } +void si_shader_binary_clean(struct si_shader_binary *binary) +{ + free((void *)binary->elf_buffer); + binary->elf_buffer = NULL; + + free(binary->llvm_ir_string); + binary->llvm_ir_string = NULL; +} + void si_shader_destroy(struct si_shader *shader) { if (shader->scratch_bo) diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader.h 2020-06-12 01:21:17.000000000 +0000 @@ -131,11 +131,10 @@ #ifndef SI_SHADER_H #define SI_SHADER_H -#include /* LLVMModuleRef */ -#include -#include "tgsi/tgsi_scan.h" #include "util/u_inlines.h" +#include "util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "util/simple_mtx.h" #include "ac_binary.h" #include "ac_llvm_build.h" @@ -159,6 +158,8 @@ */ #define SI_MAX_IO_GENERIC 32 +#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) + /* SGPR user data indices */ enum { SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ @@ -211,6 +212,11 @@ /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, + + /* The value has to be 12, because the hw requires that descriptors + * are aligned to 4 SGPRs. + */ + SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, }; /* LLVM function parameter indices */ @@ -251,8 +257,10 @@ #define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF #define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x) & 0x1) << 6) #define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF -#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 8) -#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFFE000FF +#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x) & 0xF) << 7) +#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F +#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 11) +#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF #define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x) & 0xFF) << 24) #define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF @@ -266,6 +274,13 @@ SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; +#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ +#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ + /** * For VS shader keys, describe any fixups required for vertex fetch. * @@ -301,21 +316,106 @@ bool is_debug_context; }; +struct si_shader_info { + ubyte num_inputs; + ubyte num_outputs; + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; + ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; + + ubyte processor; + + int constbuf0_num_slots; + unsigned const_buffers_declared; /**< bitmask of declared const buffers */ + unsigned samplers_declared; /**< bitmask of declared samplers */ + ubyte num_stream_output_components[4]; + + uint num_memory_instructions; /**< sampler, buffer, and image instructions */ + + /** + * If a tessellation control shader reads outputs, this describes which ones. + */ + bool reads_pervertex_outputs; + bool reads_perpatch_outputs; + bool reads_tessfactor_outputs; + + ubyte colors_read; /**< which color components are read by the FS */ + ubyte colors_written; + bool reads_samplemask; /**< does fragment shader read sample mask? */ + bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ + bool writes_z; /**< does fragment shader write Z value? */ + bool writes_stencil; /**< does fragment shader write stencil value? */ + bool writes_samplemask; /**< does fragment shader write sample mask? */ + bool writes_edgeflag; /**< vertex shader outputs edgeflag */ + bool uses_kill; /**< KILL or KILL_IF instruction used? */ + bool uses_persp_center; + bool uses_persp_centroid; + bool uses_persp_sample; + bool uses_linear_center; + bool uses_linear_centroid; + bool uses_linear_sample; + bool uses_persp_opcode_interp_sample; + bool uses_linear_opcode_interp_sample; + bool uses_instanceid; + bool uses_vertexid; + bool uses_vertexid_nobase; + bool uses_basevertex; + bool uses_drawid; + bool uses_primid; + bool uses_frontface; + bool uses_invocationid; + bool uses_thread_id[3]; + bool uses_block_id[3]; + bool uses_block_size; + bool uses_grid_size; + bool uses_subgroup_info; + bool writes_position; + bool writes_psize; + bool writes_clipvertex; + bool writes_primid; + bool writes_viewport_index; + bool writes_layer; + bool writes_memory; /**< contains stores or atomics to buffers or images */ + bool uses_derivatives; + bool uses_bindless_samplers; + bool uses_bindless_images; + bool uses_fbfetch; + unsigned clipdist_writemask; + unsigned culldist_writemask; + unsigned num_written_culldistance; + unsigned num_written_clipdistance; + + unsigned images_declared; /**< bitmask of declared images */ + unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ + unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ + + unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */ + + /** Whether all codepaths write tess factors in all invocations. */ + bool tessfactors_are_def_in_all_invocs; +}; + /* A shader selector is a gallium CSO and contains shader variants and - * binaries for one TGSI program. This can be shared by multiple contexts. + * binaries for one NIR program. This can be shared by multiple contexts. */ struct si_shader_selector { - struct pipe_reference reference; + struct util_live_shader base; struct si_screen *screen; struct util_queue_fence ready; struct si_compiler_ctx_state compiler_ctx_state; - mtx_t mutex; + simple_mtx_t mutex; struct si_shader *first_variant; /* immutable after the first variant */ struct si_shader *last_variant; /* mutable */ - /* The compiled TGSI shader expecting a prolog and/or epilog (not - * uploaded to a buffer). + /* The compiled NIR shader without a prolog and/or epilog (not + * uploaded to a buffer object). */ struct si_shader *main_shader_part; struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ @@ -325,17 +425,21 @@ struct si_shader *gs_copy_shader; - struct tgsi_token *tokens; struct nir_shader *nir; + void *nir_binary; + unsigned nir_size; + struct pipe_stream_output_info so; - struct tgsi_shader_info info; - struct tgsi_tessctrl_info tcs_info; + struct si_shader_info info; /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ enum pipe_shader_type type; bool vs_needs_prolog; bool force_correct_derivs_after_kill; bool prim_discard_cs_allowed; + bool ngg_culling_allowed; + unsigned num_vs_inputs; + unsigned num_vbos_in_user_sgprs; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; @@ -459,10 +563,13 @@ unsigned num_input_sgprs:6; /* For merged stages such as LS-HS, HS input VGPRs are first. */ unsigned num_merged_next_stage_vgprs:3; - unsigned last_input:4; + unsigned num_inputs:5; unsigned as_ls:1; unsigned as_es:1; unsigned as_ngg:1; + unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */ + unsigned gs_fast_launch_tri_list:1; /* for NGG culling */ + unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */ /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic:1; } vs_prolog; @@ -553,6 +660,9 @@ uint64_t kill_outputs; /* "get_unique_index" bits */ unsigned clip_disable:1; + /* For NGG VS and TES. */ + unsigned ngg_culling:5; /* SI_NGG_CULL_* */ + /* For shaders where monolithic variants have better code. * * This is a flag that has no effect on code generation, @@ -580,7 +690,7 @@ #pragma pack(pop) /* GCN-specific shader info. */ -struct si_shader_info { +struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; @@ -634,7 +744,7 @@ /* The following data is all that's needed for binary shaders. */ struct si_shader_binary binary; struct ac_shader_config config; - struct si_shader_info info; + struct si_shader_binary_info info; struct { uint16_t ngg_emit_size; /* in dwords */ @@ -684,6 +794,7 @@ unsigned pa_cl_vte_cntl; unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ + unsigned ge_pc_alloc; /* uconfig register */ } ngg; struct { @@ -693,6 +804,7 @@ unsigned spi_vs_out_config; unsigned spi_shader_pos_format; unsigned pa_cl_vte_cntl; + unsigned ge_pc_alloc; /* uconfig register */ } vs; struct { @@ -721,18 +833,14 @@ }; /* si_shader.c */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug); -int si_compile_tgsi_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); -bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); +bool si_compile_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *shader, + struct pipe_debug_callback *debug); +bool si_create_shader_variant(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader *shader, + struct pipe_debug_callback *debug); void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, @@ -750,14 +858,18 @@ const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); +/* si_shader_llvm_gs.c */ +struct si_shader * +si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug); + /* si_shader_nir.c */ void si_nir_scan_shader(const struct nir_shader *nir, - struct tgsi_shader_info *info); -void si_nir_scan_tess_ctrl(const struct nir_shader *nir, - struct tgsi_tessctrl_info *out); -void si_nir_lower_ps_inputs(struct nir_shader *nir); -void si_lower_nir(struct si_shader_selector *sel); -void si_nir_opts(struct nir_shader *nir); + struct si_shader_info *info); +void si_nir_adjust_driver_locations(struct nir_shader *nir); +void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); /* si_state_shaders.c */ void gfx9_get_gs_info(struct si_shader_selector *es, @@ -783,6 +895,19 @@ } static inline bool +gfx10_is_ngg_passthrough(struct si_shader *shader) +{ + struct si_shader_selector *sel = shader->selector; + + return sel->type != PIPE_SHADER_GEOMETRY && + !sel->so.num_outputs && + !sel->info.writes_edgeflag && + !shader->key.opt.ngg_culling && + (sel->type != PIPE_SHADER_VERTEX || + !shader->key.mono.u.vs_export_prim_id); +} + +static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { return selector ? selector->info.uses_bindless_samplers : false; @@ -794,18 +919,4 @@ return selector ? selector->info.uses_bindless_images : false; } -void si_destroy_shader_selector(struct si_context *sctx, - struct si_shader_selector *sel); - -static inline void -si_shader_selector_reference(struct si_context *sctx, - struct si_shader_selector **dst, - struct si_shader_selector *src) -{ - if (pipe_reference(&(*dst)->reference, &src->reference)) - si_destroy_shader_selector(sctx, *dst); - - *dst = src; -} - #endif diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_internal.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_internal.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_internal.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_internal.h 2020-06-12 01:21:17.000000000 +0000 @@ -26,39 +26,16 @@ #define SI_SHADER_PRIVATE_H #include "si_shader.h" -#include "gallivm/lp_bld_flow.h" -#include "gallivm/lp_bld_init.h" -#include "gallivm/lp_bld_tgsi.h" -#include "tgsi/tgsi_parse.h" #include "ac_shader_abi.h" -#include -#include - struct pipe_debug_callback; -#define RADEON_LLVM_MAX_INPUT_SLOTS 32 #define RADEON_LLVM_MAX_INPUTS 32 * 4 -#define RADEON_LLVM_MAX_OUTPUTS 32 * 4 - -#define RADEON_LLVM_MAX_SYSTEM_VALUES 11 -#define RADEON_LLVM_MAX_ADDRS 16 - -enum si_arg_regfile { - ARG_SGPR, - ARG_VGPR -}; -/** - * Used to collect types and other info about arguments of the LLVM function - * before the function is created. +/* Ideally pass the sample mask input to the PS epilog as v14, which + * is its usual location, so that the shader doesn't have to add v_mov. */ -struct si_function_info { - LLVMTypeRef types[100]; - LLVMValueRef *assign[100]; - unsigned num_sgpr_params; - unsigned num_params; -}; +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 struct si_shader_output_values { LLVMValueRef values[4]; @@ -68,8 +45,6 @@ }; struct si_shader_context { - struct lp_build_tgsi_context bld_base; - struct gallivm_state gallivm; struct ac_llvm_context ac; struct si_shader *shader; struct si_screen *screen; @@ -82,74 +57,64 @@ unsigned num_images; unsigned num_samplers; + struct ac_shader_args args; struct ac_shader_abi abi; - /** This function is responsible for initilizing the inputs array and will be - * called once for each input declared in the TGSI shader. - */ - void (*load_input)(struct si_shader_context *, - unsigned input_index, - const struct tgsi_full_declaration *decl, - LLVMValueRef out[4]); - - /** This array contains the input values for the shader. Typically these - * values will be in the form of a target intrinsic that will inform the - * backend how to load the actual inputs to the shader. - */ - struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS]; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef addrs[RADEON_LLVM_MAX_ADDRS][TGSI_NUM_CHANNELS]; - - /** This pointer is used to contain the temporary values. - * The amount of temporary used in tgsi can't be bound to a max value and - * thus we must allocate this array at runtime. - */ - LLVMValueRef *temps; - unsigned temps_count; - LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES]; - - LLVMValueRef *imms; - unsigned imms_num; LLVMBasicBlockRef merged_wrap_if_entry_block; int merged_wrap_if_label; - struct tgsi_array_info *temp_arrays; - LLVMValueRef *temp_array_allocas; - - LLVMValueRef undef_alloca; - LLVMValueRef main_fn; LLVMTypeRef return_type; - /* Parameter indices for LLVMGetParam. */ - int param_rw_buffers; - int param_const_and_shader_buffers; - int param_samplers_and_images; - int param_bindless_samplers_and_images; + struct ac_arg const_and_shader_buffers; + struct ac_arg samplers_and_images; + + /* For merged shaders, the per-stage descriptors for the stage other + * than the one we're processing, used to pass them through from the + * first stage to the second. + */ + struct ac_arg other_const_and_shader_buffers; + struct ac_arg other_samplers_and_images; + + struct ac_arg rw_buffers; + struct ac_arg bindless_samplers_and_images; /* Common inputs for merged shaders. */ - int param_merged_wave_info; - int param_merged_scratch_offset; + struct ac_arg merged_wave_info; + struct ac_arg merged_scratch_offset; + struct ac_arg small_prim_cull_info; /* API VS */ - int param_vertex_buffers; - int param_rel_auto_id; - int param_vs_prim_id; - int param_vertex_index0; + struct ac_arg vertex_buffers; + struct ac_arg vb_descriptors[5]; + struct ac_arg rel_auto_id; + struct ac_arg vs_prim_id; + struct ac_arg vertex_index0; /* VS states and layout of LS outputs / TCS inputs at the end * [0] = clamp vertex color * [1] = indexed - * [8:20] = stride between patches in DW = num_inputs * num_vertices * 4 - * max = 32*32*4 + 32*4 + * [2:3] = NGG: output primitive type + * [4:5] = NGG: provoking vertex index + * [6] = NGG: streamout queries enabled + * [7:10] = NGG: small prim filter precision = num_samples / quant_mode, + * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12 + * Only the first 4 bits of the exponent are stored. + * Set it like this: (fui(num_samples / quant_mode) >> 23) + * Expand to FP32 like this: ((0x70 | value) << 23); + * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15) + * = 1/2^(15 - value) in FP32 + * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4 + * max = 32*32*4 + 32*4 * [24:31] = stride between vertices in DW = num_inputs * 4 * max = 32*4 */ - int param_vs_state_bits; - int param_vs_blit_inputs; + struct ac_arg vs_state_bits; + struct ac_arg vs_blit_inputs; + struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ /* HW VS */ - int param_streamout_config; - int param_streamout_write_index; - int param_streamout_offset[4]; + struct ac_arg streamout_config; + struct ac_arg streamout_write_index; + struct ac_arg streamout_offset[4]; /* API TCS & TES */ /* Layout of TCS outputs in the offchip buffer @@ -161,7 +126,7 @@ * [12:31] = the offset of per patch attributes in the buffer in bytes. * max = NUM_PATCHES*32*32*16 */ - int param_tcs_offchip_layout; + struct ac_arg tcs_offchip_layout; /* API TCS */ /* Offsets where TCS outputs and TCS patch outputs live in LDS: @@ -169,41 +134,43 @@ * [16:31] = TCS output patch0 offset for per-patch / 16 * max = (NUM_PATCHES + 1) * 32*32 */ - int param_tcs_out_lds_offsets; + struct ac_arg tcs_out_lds_offsets; /* Layout of TCS outputs / TES inputs: * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 * max = 32*32*4 + 32*4 * [13:18] = gl_PatchVerticesIn, max = 32 * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers */ - int param_tcs_out_lds_layout; - int param_tcs_offchip_offset; - int param_tcs_factor_offset; + struct ac_arg tcs_out_lds_layout; + struct ac_arg tcs_offchip_offset; + struct ac_arg tcs_factor_offset; /* API TES */ - int param_tes_offchip_addr; - int param_tes_u; - int param_tes_v; - int param_tes_rel_patch_id; + struct ac_arg tes_offchip_addr; + struct ac_arg tes_u; + struct ac_arg tes_v; + struct ac_arg tes_rel_patch_id; /* HW ES */ - int param_es2gs_offset; + struct ac_arg es2gs_offset; /* HW GS */ /* On gfx10: - * - bits 0..10: ordered_wave_id + * - bits 0..11: ordered_wave_id * - bits 12..20: number of vertices in group * - bits 22..30: number of primitives in group */ - LLVMValueRef gs_tg_info; + struct ac_arg gs_tg_info; /* API GS */ - int param_gs2vs_offset; - int param_gs_wave_id; /* GFX6 */ - LLVMValueRef gs_vtx_offset[6]; /* in dwords (GFX6) */ - int param_gs_vtx01_offset; /* in dwords (GFX9) */ - int param_gs_vtx23_offset; /* in dwords (GFX9) */ - int param_gs_vtx45_offset; /* in dwords (GFX9) */ + struct ac_arg gs2vs_offset; + struct ac_arg gs_wave_id; /* GFX6 */ + struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */ + struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ + /* PS */ + struct ac_arg pos_fixed_pt; /* CS */ - int param_block_size; - int param_cs_user_data; + struct ac_arg block_size; + struct ac_arg cs_user_data; struct ac_llvm_compiler *compiler; @@ -220,181 +187,148 @@ LLVMValueRef gs_ngg_scratch; LLVMValueRef postponed_kill; LLVMValueRef return_value; - - LLVMTypeRef voidt; - LLVMTypeRef i1; - LLVMTypeRef i8; - LLVMTypeRef i32; - LLVMTypeRef i64; - LLVMTypeRef i128; - LLVMTypeRef f32; - LLVMTypeRef v2i32; - LLVMTypeRef v4i32; - LLVMTypeRef v4f32; - LLVMTypeRef v8i32; - - LLVMValueRef i32_0; - LLVMValueRef i32_1; - LLVMValueRef i1false; - LLVMValueRef i1true; }; static inline struct si_shader_context * -si_shader_context(struct lp_build_tgsi_context *bld_base) -{ - return (struct si_shader_context*)bld_base; -} - -static inline struct si_shader_context * si_shader_context_from_abi(struct ac_shader_abi *abi) { struct si_shader_context *ctx = NULL; return container_of(abi, ctx, abi); } -void si_init_function_info(struct si_function_info *fninfo); -unsigned add_arg_assign(struct si_function_info *fninfo, - enum si_arg_regfile regfile, LLVMTypeRef type, - LLVMValueRef *assign); -void si_create_function(struct si_shader_context *ctx, - const char *name, - LLVMTypeRef *returns, unsigned num_returns, - struct si_function_info *fninfo, - unsigned max_workgroup_size); -unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, - struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug, - bool less_optimized, unsigned wave_size); - -LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base, - enum tgsi_opcode_type type); - -LLVMValueRef bitcast(struct lp_build_tgsi_context *bld_base, - enum tgsi_opcode_type type, LLVMValueRef value); - -LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, - LLVMValueRef index, - unsigned num); +bool si_is_multi_part_shader(struct si_shader *shader); +bool si_is_merged_shader(struct si_shader *shader); +void si_add_arg_checked(struct ac_shader_args *args, + enum ac_arg_regfile file, + unsigned registers, enum ac_arg_type type, + struct ac_arg *arg, + unsigned idx); +unsigned si_get_max_workgroup_size(const struct si_shader *shader); +bool si_need_ps_prolog(const union si_shader_part_key *key); +void si_get_ps_prolog_key(struct si_shader *shader, + union si_shader_part_key *key, + bool separate_prolog); +void si_get_ps_epilog_key(struct si_shader *shader, + union si_shader_part_key *key); +void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); +void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader); + +bool gfx10_ngg_export_prim_early(struct si_shader *shader); +void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, + LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough); +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs); +void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, + unsigned stream, + LLVMValueRef *addrs); +void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); +void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); +void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); +/* si_shader_llvm.c */ +bool si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + const char *name, + bool less_optimized); void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, struct ac_llvm_compiler *compiler, - unsigned wave_size, - unsigned ballot_mask_bits); -void si_llvm_context_set_ir(struct si_shader_context *ctx, - struct si_shader *shader); - -void si_llvm_create_func(struct si_shader_context *ctx, - const char *name, + unsigned wave_size); +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, unsigned num_return_elems, - LLVMTypeRef *ParamTypes, unsigned ParamCount); - -void si_llvm_dispose(struct si_shader_context *ctx); - + unsigned max_workgroup_size); void si_llvm_optimize_module(struct si_shader_context *ctx); - -LLVMValueRef si_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base, - LLVMTypeRef type, - LLVMValueRef ptr, - LLVMValueRef ptr2); - -LLVMValueRef si_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle); - -void si_llvm_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible); - -LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input); - -LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, - unsigned input_index, - unsigned vtx_offset_param, - LLVMTypeRef type, - unsigned swizzle); - -LLVMValueRef si_nir_lookup_interp_param(struct ac_shader_abi *abi, - enum glsl_interp_mode interp, - unsigned location); - -void si_llvm_emit_store(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_instruction *inst, - const struct tgsi_opcode_info *info, - unsigned index, - LLVMValueRef dst[4]); - -LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - unsigned addr_mul, int rel_index); -LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - int rel_index, unsigned num); -LLVMValueRef si_get_sample_id(struct si_shader_context *ctx); - -void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base); -void si_shader_context_init_mem(struct si_shader_context *ctx); - -LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type type); -LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool uses_store, bool bindless); -LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi); - -void si_load_system_value(struct si_shader_context *ctx, - unsigned index, - const struct tgsi_full_declaration *decl); -void si_declare_compute_memory(struct si_shader_context *ctx); -void si_tgsi_declare_compute_memory(struct si_shader_context *ctx, - const struct tgsi_full_declaration *decl); - +void si_llvm_dispose(struct si_shader_context *ctx); +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, + LLVMValueRef resource, LLVMValueRef offset); +void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret); +LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index); +LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index); +LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index); +LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx); +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef val1, + LLVMValueRef val2); +void si_llvm_emit_barrier(struct si_shader_context *ctx); +void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset); +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, + struct ac_arg param, unsigned rshift, + unsigned bitwidth); LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); -void si_llvm_export_vs(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput); -void si_emit_streamout_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out); - -void si_llvm_load_input_vs( - struct si_shader_context *ctx, - unsigned input_index, - LLVMValueRef out[4]); -void si_llvm_load_input_fs( - struct si_shader_context *ctx, - unsigned input_index, - LLVMValueRef out[4]); - +LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi); +void si_llvm_declare_compute_memory(struct si_shader_context *ctx); bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); +void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part); + +/* si_shader_llvm_gs.c */ +LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); +LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void si_preload_esgs_ring(struct si_shader_context *ctx); +void si_preload_gs_rings(struct si_shader_context *ctx); +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); + +/* si_shader_llvm_tess.c */ +void si_llvm_preload_tes_rings(struct si_shader_context *ctx); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx); +void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - unsigned param, unsigned rshift, - unsigned bitwidth); - -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, - unsigned stream, +/* si_shader_llvm_ps.c */ +LLVMValueRef si_get_sample_id(struct si_shader_context *ctx); +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, + struct si_shader *shader); +void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); + +/* si_shader_llvm_resources.c */ +void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); + +/* si_shader_llvm_vs.c */ +void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); +void si_llvm_streamout_store_output(struct si_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out); +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream); +void si_llvm_build_vs_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); -void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); -void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); -void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c 2020-06-12 01:21:17.000000000 +0000 @@ -665,6 +665,108 @@ return ctx->create_compute_state(ctx, &state); } +void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL BUFFER[0]\n" + "DCL CONST[0][0..0]\n" // 0:xyzw + "DCL TEMP[0..0]\n" + "IMM[0] UINT32 {64, 1, 12, 0}\n" + "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes + "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + + +/* Load samples from the image, and copy them to the same image. This looks like + * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are + * reordered to match expanded FMASK. + * + * After the shader finishes, FMASK should be cleared to identity. + */ +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, + bool is_array) +{ + enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : + TGSI_TEXTURE_2D_MSAA; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the image coordinates. */ + struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false); + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), + TGSI_WRITEMASK_XYZW); + ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), + ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8), + ureg_swizzle(tid, 0, 1, 1, 1)); + if (is_array) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), + ureg_scalar(blk, TGSI_SWIZZLE_Z)); + } + + /* Load samples, resolving FMASK. */ + struct ureg_dst sample[8]; + assert(num_samples <= ARRAY_SIZE(sample)); + + for (unsigned i = 0; i < num_samples; i++) { + sample[i] = ureg_DECL_temporary(ureg); + + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), + ureg_imm1u(ureg, i)); + + struct ureg_src srcs[] = {image, ureg_src(coord)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, + TGSI_MEMORY_RESTRICT, target, 0); + } + + /* Store samples, ignoring FMASK. */ + for (unsigned i = 0; i < num_samples; i++) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), + ureg_imm1u(ureg, i)); + + struct ureg_dst dst_image = ureg_dst(image); + struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, + TGSI_MEMORY_RESTRICT, target, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; +} + /* Create the compute shader that is used to collect the results of gfx10+ * shader queries. * diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,805 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "ac_rtld.h" +#include "ac_nir_to_llvm.h" +#include "sid.h" + +#include "tgsi/tgsi_from_mesa.h" +#include "util/u_memory.h" + +struct si_llvm_diagnostics { + struct pipe_debug_callback *debug; + unsigned retval; +}; + +static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) +{ + struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + case LLVMDSNote: + default: + return; + } + + char *description = LLVMGetDiagInfoDescription(di); + + pipe_debug_message(diag->debug, SHADER_INFO, + "LLVM diagnostic (%s): %s", severity_str, description); + + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); + } + + LLVMDisposeMessage(description); +} + +bool si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + const char *name, + bool less_optimized) +{ + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); + + if (si_can_dump_shader(sscreen, shader_type)) { + fprintf(stderr, "radeonsi: Compiling shader %d\n", count); + + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); + ac_dump_module(ac->module); + fprintf(stderr, "\n"); + } + } + + if (sscreen->record_llvm_ir) { + char *ir = LLVMPrintModuleToString(ac->module); + binary->llvm_ir_string = strdup(ir); + LLVMDisposeMessage(ir); + } + + if (!si_replace_shader(count, binary)) { + struct ac_compiler_passes *passes = compiler->passes; + + if (ac->wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + + struct si_llvm_diagnostics diag = {debug}; + LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); + + if (!ac_compile_module_to_elf(passes, ac->module, + (char **)&binary->elf_buffer, + &binary->elf_size)) + diag.retval = 1; + + if (diag.retval != 0) { + pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); + return false; + } + } + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ + .info = &sscreen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = ac->wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size })) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + ac_rtld_close(&rtld); + if (!ok) + return false; + + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * If denormals are enabled, all floating-point output modifiers are + * ignored. + * + * Don't enable denormals for 32-bit floats, because: + * - Floating-point output modifiers would be ignored by the hw. + * - Some opcodes don't support denormals, such as v_mad_f32. We would + * have to stop using those. + * - GFX6 & GFX7 would be very slow. + */ + conf->float_mode |= V_00B028_FP_64_DENORMS; + + return true; +} + +void si_llvm_context_init(struct si_shader_context *ctx, + struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + unsigned wave_size) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->screen = sscreen; + ctx->compiler = compiler; + + ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, + sscreen->info.family, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + wave_size, 64); +} + +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, + LLVMTypeRef *return_types, unsigned num_return_elems, + unsigned max_workgroup_size) +{ + LLVMTypeRef ret_type; + enum ac_llvm_calling_convention call_conv; + enum pipe_shader_type real_shader_type; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx->ac.context, + return_types, + num_return_elems, true); + else + ret_type = ctx->ac.voidt; + + real_shader_type = ctx->type; + + /* LS is merged into HS (TCS), and ES is merged into GS. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (ctx->shader->key.as_ls) + real_shader_type = PIPE_SHADER_TESS_CTRL; + else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + real_shader_type = PIPE_SHADER_GEOMETRY; + } + + switch (real_shader_type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + call_conv = AC_LLVM_AMDGPU_VS; + break; + case PIPE_SHADER_TESS_CTRL: + call_conv = AC_LLVM_AMDGPU_HS; + break; + case PIPE_SHADER_GEOMETRY: + call_conv = AC_LLVM_AMDGPU_GS; + break; + case PIPE_SHADER_FRAGMENT: + call_conv = AC_LLVM_AMDGPU_PS; + break; + case PIPE_SHADER_COMPUTE: + call_conv = AC_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + /* Setup the function */ + ctx->return_type = ret_type; + ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, + ret_type, ctx->ac.module); + ctx->return_value = LLVMGetUndef(ctx->return_type); + + if (ctx->screen->info.address32_hi) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, + "amdgpu-32bit-address-high-bits", + ctx->screen->info.address32_hi); + } + + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, + "no-signed-zeros-fp-math", + "true"); + + ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); +} + +void si_llvm_optimize_module(struct si_shader_context *ctx) +{ + /* Dump LLVM IR before any optimization passes */ + if (ctx->screen->debug_flags & DBG(PREOPT_IR) && + si_can_dump_shader(ctx->screen, ctx->type)) + LLVMDumpModule(ctx->ac.module); + + /* Run the pass */ + LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); + LLVMDisposeBuilder(ctx->ac.builder); +} + +void si_llvm_dispose(struct si_shader_context *ctx) +{ + LLVMDisposeModule(ctx->ac.module); + LLVMContextDispose(ctx->ac.context); + ac_llvm_context_dispose(&ctx->ac); +} + +/** + * Load a dword from a constant buffer. + */ +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, + LLVMValueRef resource, LLVMValueRef offset) +{ + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, + 0, 0, true, true); +} + +void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) +{ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(ctx->ac.builder); + else + LLVMBuildRet(ctx->ac.builder, ret); +} + +LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index) +{ + return LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_get_arg(&ctx->ac, param), + return_index, ""); +} + +LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = ac_get_arg(&ctx->ac, param); + + return LLVMBuildInsertValue(builder, ret, + ac_to_float(&ctx->ac, p), + return_index, ""); +} + +LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); +} + +LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) +{ + LLVMValueRef ptr[2], list; + bool merged_shader = si_is_merged_shader(ctx->shader); + + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); + list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + return list; +} + +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef val1, + LLVMValueRef val2) +{ + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, val1), + ac_to_integer(&ctx->ac, val2), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); +} + +void si_llvm_emit_barrier(struct si_shader_context *ctx) +{ + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ctx->screen->info.chip_class == GFX6 && + ctx->type == PIPE_SHADER_TESS_CTRL) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + + ac_build_s_barrier(&ctx->ac); +} + +/* Ensure that the esgs ring is declared. + * + * We declare it with 64KB alignment as a hint that the + * pointer value will always be 0. + */ +void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) +{ + if (ctx->esgs_ring) + return; + + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); +} + +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset) +{ + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, + "llvm.amdgcn.init.exec.from.input", + ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); +} + +/** + * Get the value of a shader input parameter and extract a bitfield. + */ +static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, + LLVMValueRef value, unsigned rshift, + unsigned bitwidth) +{ + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = ac_to_integer(&ctx->ac, value); + + if (rshift) + value = LLVMBuildLShr(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, rshift, 0), ""); + + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, mask, 0), ""); + } + + return value; +} + +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, + struct ac_arg param, unsigned rshift, + unsigned bitwidth) +{ + LLVMValueRef value = ac_get_arg(&ctx->ac, param); + + return unpack_llvm_param(ctx, value, rshift, bitwidth); +} + +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, + unsigned swizzle) +{ + if (swizzle > 0) + return ctx->ac.i32_0; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + return ac_get_arg(&ctx->ac, ctx->vs_prim_id); + case PIPE_SHADER_TESS_CTRL: + return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); + case PIPE_SHADER_GEOMETRY: + return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + default: + assert(0); + return ctx->ac.i32_0; + } +} + +LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + LLVMValueRef values[3]; + LLVMValueRef result; + unsigned i; + unsigned *properties = ctx->shader->selector->info.properties; + + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = { + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] + }; + + for (i = 0; i < 3; ++i) + values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0); + + result = ac_build_gather_values(&ctx->ac, values, 3); + } else { + result = ac_get_arg(&ctx->ac, ctx->block_size); + } + + return result; +} + +void si_llvm_declare_compute_memory(struct si_shader_context *ctx) +{ + struct si_shader_selector *sel = ctx->shader->selector; + unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; + + LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef var; + + assert(!ctx->ac.lds); + + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, + LLVMArrayType(ctx->ac.i8, lds_size), + "compute_lds", + AC_ADDR_SPACE_LDS); + LLVMSetAlignment(var, 64 * 1024); + + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); +} + +bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) +{ + if (nir->info.stage == MESA_SHADER_VERTEX) { + si_llvm_load_vs_inputs(ctx, nir); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + unsigned colors_read = + ctx->shader->selector->info.colors_read; + LLVMValueRef main_fn = ctx->main_fn; + + LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); + + unsigned offset = SI_PARAM_POS_FIXED_PT + 1; + + if (colors_read & 0x0f) { + unsigned mask = colors_read & 0x0f; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color0 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + if (colors_read & 0xf0) { + unsigned mask = (colors_read & 0xf0) >> 4; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color1 = + ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); + } + + ctx->abi.interp_at_sample_force_center = + ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + } else if (nir->info.stage == MESA_SHADER_COMPUTE) { + if (nir->info.cs.user_data_components_amd) { + ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); + ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, + nir->info.cs.user_data_components_amd); + } + } + + ctx->abi.inputs = &ctx->inputs[0]; + ctx->abi.clamp_shadow_reference = true; + ctx->abi.robust_buffer_access = true; + + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { + assert(gl_shader_stage_is_compute(nir->info.stage)); + si_llvm_declare_compute_memory(ctx); + } + ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); + + return true; +} + +/** + * Given a list of shader part functions, build a wrapper function that + * runs them in sequence to form a monolithic shader. + */ +void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part) +{ + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 40 SGPRs. + */ + LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; + LLVMTypeRef function_type; + unsigned num_first_params; + unsigned num_out, initial_num_out; + ASSERTED unsigned num_out_sgpr; /* used in debug checks */ + ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned gprs; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + for (unsigned i = 0; i < num_parts; ++i) { + ac_add_function_attr(ctx->ac.context, parts[i], -1, + AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_first_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_first_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } + } + + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; + + /* This is going to get casted anyways, so we don't have to + * have the exact same type. But we do have to preserve the + * pointer-ness so that LLVM knows about it. + */ + enum ac_arg_type arg_type = AC_ARG_INT; + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + if (LLVMGetVectorSize(type) == 4) + arg_type = AC_ARG_CONST_DESC_PTR; + else if (LLVMGetVectorSize(type) == 8) + arg_type = AC_ARG_CONST_IMAGE_PTR; + else + assert(0); + } else if (type == ctx->ac.f32) { + arg_type = AC_ARG_CONST_FLOAT_PTR; + } else { + assert(0); + } + } + + ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, + size, arg_type, NULL); + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + /* Prepare the return type. */ + unsigned num_returns = 0; + LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; + + last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); + return_type = LLVMGetReturnType(last_func_type); + + switch (LLVMGetTypeKind(return_type)) { + case LLVMStructTypeKind: + num_returns = LLVMCountStructElementTypes(return_type); + assert(num_returns <= ARRAY_SIZE(returns)); + LLVMGetStructElementTypes(return_type, returns); + break; + case LLVMVoidTypeKind: + break; + default: + unreachable("unexpected type"); + } + + si_llvm_create_func(ctx, "wrapper", returns, num_returns, + si_get_max_workgroup_size(ctx->shader)); + + if (si_is_merged_shader(ctx->shader)) + ac_init_exec_full_mask(&ctx->ac); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < ctx->args.arg_count; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; + unsigned size = ac_get_type_size(param_type) / 4; + + if (size == 1) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); + param_type = ctx->ac.i32; + } + + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); + param_type = ctx->ac.i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = LLVMBuildExtractElement( + builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); + } + + if (ctx->args.args[i].file == AC_ARG_SGPR) + num_out_sgpr = num_out; + } + + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + + /* Now chain the parts. */ + LLVMValueRef ret = NULL; + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[AC_MAX_ARGS]; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (si_is_multi_part_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, + LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } + + /* Derive arguments for the next part from outputs of the + * previous one. + */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = ac_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { + ac_add_function_attr(ctx->ac.context, parts[part], + param_idx + 1, AC_FUNC_ATTR_INREG); + } else if (out_idx < num_out_sgpr) { + /* Skip returned SGPRs the current part doesn't + * declare on the input. */ + out_idx = num_out_sgpr; + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(param_type) == + AC_ADDR_SPACE_CONST_32BIT) { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + + if (si_is_multi_part_shader(ctx->shader) && + part + 1 == next_shader_first_part) { + ac_build_endif(&ctx->ac, 6506); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } + + /* Extract the returned GPRs. */ + ret_type = LLVMTypeOf(ret); + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = + LLVMBuildExtractValue(builder, ret, i, ""); + + assert(num_out < ARRAY_SIZE(out)); + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->ac.i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + /* Return the value from the last part. */ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(builder); + else + LLVMBuildRet(builder, ret); +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,779 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" +#include "util/u_memory.h" + +LLVMValueRef si_is_es_thread(struct si_shader_context *ctx) +{ + /* Return true if the current thread should execute an ES thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); +} + +LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx) +{ + /* Return true if the current thread should execute a GS thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); +} + +static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, + unsigned input_index, + unsigned vtx_offset_param, + LLVMTypeRef type, + unsigned swizzle) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + LLVMValueRef vtx_offset, soffset; + struct si_shader_info *info = &shader->selector->info; + unsigned semantic_name = info->input_semantic_name[input_index]; + unsigned semantic_index = info->input_semantic_index[input_index]; + unsigned param; + LLVMValueRef value; + + param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + unsigned index = vtx_offset_param; + + switch (index / 2) { + case 0: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, + index % 2 ? 16 : 0, 16); + break; + case 1: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, + index % 2 ? 16 : 0, 16); + break; + case 2: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, + index % 2 ? 16 : 0, 16); + break; + default: + assert(0); + return NULL; + } + + unsigned offset = param * 4 + swizzle; + vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, + LLVMConstInt(ctx->ac.i32, offset, false), ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + if (ac_get_type_size(type) == 8) { + ptr = LLVMBuildGEP(ctx->ac.builder, ptr, + &ctx->ac.i32_1, 1, ""); + LLVMValueRef values[2] = { + value, + LLVMBuildLoad(ctx->ac.builder, ptr, "") + }; + value = ac_build_gather_values(&ctx->ac, values, 2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + } + + /* GFX6: input load from the ESGS ring in memory. */ + if (swizzle == ~0) { + LLVMValueRef values[4]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, + type, chan); + } + return ac_build_gather_values(&ctx->ac, values, 4); + } + + /* Get the vertex offset parameter on GFX6. */ + LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, + ctx->gs_vtx_offset[vtx_offset_param]); + + vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, + LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); + + value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, + vtx_offset, soffset, 0, ac_glc, true, false); + if (ac_get_type_size(type) == 8) { + LLVMValueRef value2; + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); + + value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, + ctx->ac.i32_0, vtx_offset, soffset, + 0, ac_glc, true, false); + return si_build_gather_64bit(ctx, type, value, value2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); +} + +static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + unsigned vertex_index, + unsigned const_index, + LLVMTypeRef type) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; + + offset += component; + value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, + vertex_index, type, offset); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); +} + +/* Pass GS inputs from ES to GS on GFX9. */ +static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) +{ + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + if (ctx->shader->key.as_ngg) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); + else + ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, + ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + if (ctx->screen->use_ngg) { + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, + 8 + SI_SGPR_VS_STATE_BITS); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); + ctx->return_value = ret; +} + +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *es = ctx->shader; + struct si_shader_info *info = &es->selector->info; + LLVMValueRef lds_base = NULL; + unsigned chan; + int i; + + if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { + unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); + LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, + LLVMBuildMul(ctx->ac.builder, wave_idx, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), ""); + lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, + LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); + } + + for (i = 0; i < info->num_outputs; i++) { + int param; + + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i], false); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false); + idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); + ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); + continue; + } + + ac_build_buffer_store_dword(&ctx->ac, + ctx->esgs_ring, + out_val, 1, NULL, + ac_get_arg(&ctx->ac, ctx->es2gs_offset), + (4 * param + chan) * 4, + ac_glc | ac_slc | ac_swizzled); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_es_return_value_for_gs(ctx); +} + +static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) +{ + if (ctx->screen->info.chip_class >= GFX9) + return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); + else + return ac_get_arg(&ctx->ac, ctx->gs_wave_id); +} + +static void emit_gs_epilogue(struct si_shader_context *ctx) +{ + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_epilogue(ctx); + return; + } + + if (ctx->screen->info.chip_class >= GFX10) + LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, + si_get_gs_wave_id(ctx)); + + if (ctx->screen->info.chip_class >= GFX9) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); +} + +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info UNUSED *info = &ctx->shader->selector->info; + + assert(info->num_outputs <= max_outputs); + + emit_gs_epilogue(ctx); +} + +/* Emit one vertex from the geometry shader */ +static void si_llvm_emit_vertex(struct ac_shader_abi *abi, + unsigned stream, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + return; + } + + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader *shader = ctx->shader; + LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); + LLVMValueRef gs_next_vertex; + LLVMValueRef can_emit; + unsigned chan, offset; + int i; + + /* Write vertex attribute values to GSVS ring */ + gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, + ctx->gs_next_vertex[stream], + ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. + */ + can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, + LLVMConstInt(ctx->ac.i32, + shader->selector->gs_max_out_vertices, 0), ""); + + bool use_kill = !info->writes_memory; + if (use_kill) { + ac_build_kill_if_false(&ctx->ac, can_emit); + } else { + ac_build_ifcc(&ctx->ac, can_emit, 6505); + } + + offset = 0; + for (i = 0; i < info->num_outputs; i++) { + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + LLVMValueRef voffset = + LLVMConstInt(ctx->ac.i32, offset * + shader->selector->gs_max_out_vertices, 0); + offset++; + + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildMul(ctx->ac.builder, voffset, + LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + out_val = ac_to_integer(&ctx->ac, out_val); + + ac_build_buffer_store_dword(&ctx->ac, + ctx->gsvs_ring[stream], + out_val, 1, + voffset, soffset, 0, + ac_glc | ac_slc | ac_swizzled); + } + } + + gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); + + /* Signal vertex emission if vertex data was written. */ + if (offset) { + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); + } + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); +} + +/* Cut one primitive from the geometry shader */ +static void si_llvm_emit_primitive(struct ac_shader_abi *abi, + unsigned stream) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); + return; + } + + /* Signal primitive cut */ + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); +} + +void si_preload_esgs_ring(struct si_shader_context *ctx) +{ + if (ctx->screen->info.chip_class <= GFX8) { + unsigned ring = + ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS + : SI_ES_RING_ESGS; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + ctx->esgs_ring = + ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + } else { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* Declare the ESGS ring as an explicit LDS symbol. */ + si_llvm_declare_esgs_ring(ctx); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + ctx->esgs_ring = ctx->ac.lds; + } + } +} + +void si_preload_gs_rings(struct si_shader_context *ctx) +{ + const struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. + */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); + uint64_t stream_offset = 0; + + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; + LLVMValueRef ring, tmp; + + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) + continue; + + stride = 4 * num_components * sel->gs_max_out_vertices; + + /* Limit on the stride field for <= GFX7. */ + assert(stride < (1 << 14)); + + num_records = ctx->ac.wave_size; + + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, ""); + tmp = LLVMBuildAdd(builder, tmp, + LLVMConstInt(ctx->ac.i64, + stream_offset, 0), ""); + stream_offset += stride * ctx->ac.wave_size; + + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, ""); + ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, ""); + tmp = LLVMBuildOr(builder, tmp, + LLVMConstInt(ctx->ac.i32, + S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(1), 0), ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, ""); + ring = LLVMBuildInsertElement(builder, ring, + LLVMConstInt(ctx->ac.i32, num_records, 0), + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + + uint32_t rsrc3 = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ + S_008F0C_ADD_TID_ENABLE(1); + + if (ctx->ac.chip_class >= GFX10) { + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ + } + + ring = LLVMBuildInsertElement(builder, ring, + LLVMConstInt(ctx->ac.i32, rsrc3, false), + LLVMConstInt(ctx->ac.i32, 3, 0), ""); + + ctx->gsvs_ring[stream] = ring; + } +} + +/* Generate code for the hardware VS shader stage to go with a geometry shader */ +struct si_shader * +si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug) +{ + struct si_shader_context ctx; + struct si_shader *shader; + LLVMBuilderRef builder; + struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; + struct si_shader_info *gsinfo = &gs_selector->info; + int i; + + + shader = CALLOC_STRUCT(si_shader); + if (!shader) + return NULL; + + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. */ + util_queue_fence_init(&shader->ready); + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; + + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false)); + ctx.shader = shader; + ctx.type = PIPE_SHADER_VERTEX; + + builder = ctx.ac.builder; + + si_create_function(&ctx, false); + + LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); + ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr, + LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0)); + + LLVMValueRef voffset = + LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, + LLVMConstInt(ctx.ac.i32, 4, 0), ""); + + /* Fetch the vertex stream ID.*/ + LLVMValueRef stream_id; + + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) + stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); + else + stream_id = ctx.ac.i32_0; + + /* Fill in output information. */ + for (i = 0; i < gsinfo->num_outputs; ++i) { + outputs[i].semantic_name = gsinfo->output_semantic_name[i]; + outputs[i].semantic_index = gsinfo->output_semantic_index[i]; + + for (int chan = 0; chan < 4; chan++) { + outputs[i].vertex_stream[chan] = + (gsinfo->output_streams[i] >> (2 * chan)) & 3; + } + } + + LLVMBasicBlockRef end_bb; + LLVMValueRef switch_inst; + + end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); + switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); + + for (int stream = 0; stream < 4; stream++) { + LLVMBasicBlockRef bb; + unsigned offset; + + if (!gsinfo->num_stream_output_components[stream]) + continue; + + if (stream > 0 && !gs_selector->so.num_outputs) + continue; + + bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); + LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb); + LLVMPositionBuilderAtEnd(builder, bb); + + /* Fetch vertex data from GSVS ring */ + offset = 0; + for (i = 0; i < gsinfo->num_outputs; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + if (!(gsinfo->output_usagemask[i] & (1 << chan)) || + outputs[i].vertex_stream[chan] != stream) { + outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32); + continue; + } + + LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32, + offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); + offset++; + + outputs[i].values[chan] = + ac_build_buffer_load(&ctx.ac, + ctx.gsvs_ring[0], 1, + ctx.ac.i32_0, voffset, + soffset, 0, ac_glc | ac_slc, + true, false); + } + } + + /* Streamout and exports. */ + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { + si_llvm_emit_streamout(&ctx, outputs, + gsinfo->num_outputs, + stream); + } + + if (stream == 0) + si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); + + LLVMBuildBr(builder, end_bb); + } + + LLVMPositionBuilderAtEnd(builder, end_bb); + + LLVMBuildRetVoid(ctx.ac.builder); + + ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ + si_llvm_optimize_module(&ctx); + + bool ok = false; + if (si_compile_llvm(sscreen, &ctx.shader->binary, + &ctx.shader->config, ctx.compiler, &ctx.ac, + debug, PIPE_SHADER_GEOMETRY, + "GS Copy Shader", false)) { + if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) + fprintf(stderr, "GS Copy Shader:\n"); + si_shader_dump(sscreen, ctx.shader, debug, stderr, true); + + if (!ctx.shader->config.scratch_bytes_per_wave) + ok = si_shader_binary_upload(sscreen, ctx.shader, 0); + else + ok = true; + } + + si_llvm_dispose(&ctx); + + if (!ok) { + FREE(shader); + shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); + } + return shader; +} + +/** + * Build the GS prolog function. Rotate the input vertices for triangle strips + * with adjacency. + */ +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + unsigned num_sgprs, num_vgprs; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMTypeRef returns[AC_MAX_ARGS]; + LLVMValueRef func, ret; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + if (key->gs_prolog.states.gfx9_prev_is_vs) + num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + + for (unsigned i = 0; i < num_sgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + returns[i] = ctx->ac.i32; + } + + for (unsigned i = 0; i < num_vgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + returns[num_sgprs + i] = ctx->ac.f32; + } + + /* Create the function. */ + si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); + func = ctx->main_fn; + + /* Set the full EXEC mask for the prolog, because we are only fiddling + * with registers here. The main shader part will set the correct EXEC + * mask. + */ + if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) + ac_init_exec_full_mask(&ctx->ac); + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (unsigned i = 0; i < num_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(builder, ret, p, i, ""); + } + for (unsigned i = 0; i < num_vgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); + } + + if (key->gs_prolog.states.tri_strip_adj_fix) { + /* Remap the input vertices for every other primitive. */ + const struct ac_arg gfx6_vtx_params[6] = { + { .used = true, .arg_index = num_sgprs }, + { .used = true, .arg_index = num_sgprs + 1 }, + { .used = true, .arg_index = num_sgprs + 3 }, + { .used = true, .arg_index = num_sgprs + 4 }, + { .used = true, .arg_index = num_sgprs + 5 }, + { .used = true, .arg_index = num_sgprs + 6 }, + }; + const struct ac_arg gfx9_vtx_params[3] = { + { .used = true, .arg_index = num_sgprs }, + { .used = true, .arg_index = num_sgprs + 1 }, + { .used = true, .arg_index = num_sgprs + 4 }, + }; + LLVMValueRef vtx_in[6], vtx_out[6]; + LLVMValueRef prim_id, rotate; + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); + vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); + } + } else { + for (unsigned i = 0; i < 6; i++) + vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); + } + + prim_id = LLVMGetParam(func, num_sgprs + 2); + rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); + + for (unsigned i = 0; i < 6; ++i) { + LLVMValueRef base, rotated; + base = vtx_in[i]; + rotated = vtx_in[(i + 4) % 6]; + vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); + } + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef hi, out; + + hi = LLVMBuildShl(builder, vtx_out[i*2+1], + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); + out = ac_to_float(&ctx->ac, out); + ret = LLVMBuildInsertValue(builder, ret, out, + gfx9_vtx_params[i].arg_index, ""); + } + } else { + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef out; + + out = ac_to_float(&ctx->ac, vtx_out[i]); + ret = LLVMBuildInsertValue(builder, ret, out, + gfx6_vtx_params[i].arg_index, ""); + } + } + } + + LLVMBuildRet(builder, ret); +} + +void si_llvm_init_gs_callbacks(struct si_shader_context *ctx) +{ + ctx->abi.load_inputs = si_nir_load_input_gs; + ctx->abi.emit_vertex = si_llvm_emit_vertex; + ctx->abi.emit_primitive = si_llvm_emit_primitive; + ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1062 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" + +LLVMValueRef si_get_sample_id(struct si_shader_context *ctx) +{ + return si_unpack_param(ctx, ctx->args.ancillary, 8, 4); +} + +static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage)); +} + +static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); + LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); + + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + LLVMValueRef pos[4] = { + si_buffer_load_const(ctx, resource, offset0), + si_buffer_load_const(ctx, resource, offset1), + LLVMConstReal(ctx->ac.f32, 0), + LLVMConstReal(ctx->ac.f32, 0) + }; + + return ac_build_gather_values(&ctx->ac, pos, 4); +} + +static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct ac_image_args args = {}; + LLVMValueRef ptr, image, fmask; + + /* Ignore src0, because KHR_blend_func_extended disallows multiple render + * targets. + */ + + /* Load the image descriptor. */ + STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); + ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr, + ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + image = ac_build_load_to_sgpr(&ctx->ac, ptr, + LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); + + unsigned chan = 0; + + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); + + if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); + + /* Get the current render target layer index. */ + if (ctx->shader->key.mono.u.ps.fbfetch_layered) + args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.coords[chan++] = si_get_sample_id(ctx); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa && + !(ctx->screen->debug_flags & DBG(NO_FMASK))) { + fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, + LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); + + ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, + ctx->shader->key.mono.u.ps.fbfetch_layered); + } + + args.opcode = ac_image_load; + args.resource = image; + args.dmask = 0xf; + args.attributes = AC_FUNC_ATTR_READNONE; + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? + ac_image_2darraymsaa : ac_image_2dmsaa; + else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? + ac_image_1darray : ac_image_1d; + else + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? + ac_image_2darray : ac_image_2d; + + return ac_build_image_opcode(&ctx->ac, &args); +} + +static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, + unsigned attr_index, unsigned chan, + LLVMValueRef prim_mask, + LLVMValueRef i, LLVMValueRef j) +{ + if (i || j) { + return ac_build_fs_interp(&ctx->ac, + LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), + prim_mask, i, j); + } + return ac_build_fs_interp_mov(&ctx->ac, + LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */ + LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), + prim_mask); +} + +/** + * Interpolate a fragment shader input. + * + * @param ctx context + * @param input_index index of the input in hardware + * @param semantic_name TGSI_SEMANTIC_* + * @param semantic_index semantic index + * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) + * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) + * @param interp_param interpolation weights (i,j) + * @param prim_mask SI_PARAM_PRIM_MASK + * @param face SI_PARAM_FRONT_FACE + * @param result the return value (4 components) + */ +static void interp_fs_color(struct si_shader_context *ctx, + unsigned input_index, + unsigned semantic_index, + unsigned num_interp_inputs, + unsigned colors_read_mask, + LLVMValueRef interp_param, + LLVMValueRef prim_mask, + LLVMValueRef face, + LLVMValueRef result[4]) +{ + LLVMValueRef i = NULL, j = NULL; + unsigned chan; + + /* fs.constant returns the param from the middle vertex, so it's not + * really useful for flat shading. It's meant to be used for custom + * interpolation (but the intrinsic can't fetch from the other two + * vertices). + * + * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state + * to do the right thing. The only reason we use fs.constant is that + * fs.interp cannot be used on integers, because they can be equal + * to NaN. + * + * When interp is false we will use fs.constant or for newer llvm, + * amdgcn.interp.mov. + */ + bool interp = interp_param != NULL; + + if (interp) { + interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, + LLVMVectorType(ctx->ac.f32, 2), ""); + + i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, + ctx->ac.i32_0, ""); + j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, + ctx->ac.i32_1, ""); + } + + if (ctx->shader->key.part.ps.prolog.color_two_side) { + LLVMValueRef is_face_positive; + + /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", + * otherwise it's at offset "num_inputs". + */ + unsigned back_attr_offset = num_interp_inputs; + if (semantic_index == 1 && colors_read_mask & 0xf) + back_attr_offset += 1; + + is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + face, ctx->ac.i32_0, ""); + + for (chan = 0; chan < 4; chan++) { + LLVMValueRef front, back; + + front = si_build_fs_interp(ctx, + input_index, chan, + prim_mask, i, j); + back = si_build_fs_interp(ctx, + back_attr_offset, chan, + prim_mask, i, j); + + result[chan] = LLVMBuildSelect(ctx->ac.builder, + is_face_positive, + front, + back, + ""); + } + } else { + for (chan = 0; chan < 4; chan++) { + result[chan] = si_build_fs_interp(ctx, + input_index, chan, + prim_mask, i, j); + } + } +} + +static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha) +{ + if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { + [PIPE_FUNC_LESS] = LLVMRealOLT, + [PIPE_FUNC_EQUAL] = LLVMRealOEQ, + [PIPE_FUNC_LEQUAL] = LLVMRealOLE, + [PIPE_FUNC_GREATER] = LLVMRealOGT, + [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, + [PIPE_FUNC_GEQUAL] = LLVMRealOGE, + }; + LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; + assert(cond); + + LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, + SI_PARAM_ALPHA_REF); + LLVMValueRef alpha_pass = + LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); + ac_build_kill_if_false(&ctx->ac, alpha_pass); + } else { + ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false); + } +} + +static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, + LLVMValueRef alpha, + unsigned samplemask_param) +{ + LLVMValueRef coverage; + + /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ + coverage = LLVMGetParam(ctx->main_fn, + samplemask_param); + coverage = ac_to_integer(&ctx->ac, coverage); + + coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", + ctx->ac.i32, + &coverage, 1, AC_FUNC_ATTR_READNONE); + + coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, + ctx->ac.f32, ""); + + coverage = LLVMBuildFMul(ctx->ac.builder, coverage, + LLVMConstReal(ctx->ac.f32, + 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); + + return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); +} + +struct si_ps_exports { + unsigned num; + struct ac_export_args args[10]; +}; + +static void si_export_mrt_z(struct si_shader_context *ctx, + LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask, struct si_ps_exports *exp) +{ + struct ac_export_args args; + + ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); + + memcpy(&exp->args[exp->num++], &args, sizeof(args)); +} + +/* Initialize arguments for the shader export intrinsic */ +static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, + LLVMValueRef *values, + unsigned target, + struct ac_export_args *args) +{ + const struct si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; + LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); + unsigned spi_shader_col_format; + unsigned chan; + bool is_int8, is_int10; + int cbuf = target - V_008DFC_SQ_EXP_MRT; + + assert(cbuf >= 0 && cbuf < 8); + + spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; + is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; + is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; + + /* Default is 0xf. Adjusted below depending on the format. */ + args->enabled_channels = 0xf; /* writemask */ + + /* Specify whether the EXEC mask represents the valid mask */ + args->valid_mask = 0; + + /* Specify whether this is the last export */ + args->done = 0; + + /* Specify the target we are exporting */ + args->target = target; + + args->compr = false; + args->out[0] = f32undef; + args->out[1] = f32undef; + args->out[2] = f32undef; + args->out[3] = f32undef; + + LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; + LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], + unsigned bits, bool hi) = NULL; + + switch (spi_shader_col_format) { + case V_028714_SPI_SHADER_ZERO: + args->enabled_channels = 0; /* writemask */ + args->target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + args->enabled_channels = 1; /* writemask */ + args->out[0] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->screen->info.chip_class >= GFX10) { + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[3]; + } else { + args->enabled_channels = 0x9; /* writemask */ + args->out[0] = values[0]; + args->out[3] = values[3]; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + packf = ac_build_cvt_pkrtz_f16; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + packf = ac_build_cvt_pknorm_u16; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + packf = ac_build_cvt_pknorm_i16; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: + packi = ac_build_cvt_pk_u16; + break; + + case V_028714_SPI_SHADER_SINT16_ABGR: + packi = ac_build_cvt_pk_i16; + break; + + case V_028714_SPI_SHADER_32_ABGR: + memcpy(&args->out[0], values, sizeof(values[0]) * 4); + break; + } + + /* Pack f16 or norm_i16/u16. */ + if (packf) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + values[2 * chan], + values[2 * chan + 1] + }; + LLVMValueRef packed; + + packed = packf(&ctx->ac, pack_args); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } + /* Pack i16/u16. */ + if (packi) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + ac_to_integer(&ctx->ac, values[2 * chan]), + ac_to_integer(&ctx->ac, values[2 * chan + 1]) + }; + LLVMValueRef packed; + + packed = packi(&ctx->ac, pack_args, + is_int8 ? 8 : is_int10 ? 10 : 16, + chan == 1); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } +} + +static void si_export_mrt_color(struct si_shader_context *ctx, + LLVMValueRef *color, unsigned index, + unsigned samplemask_param, + bool is_last, struct si_ps_exports *exp) +{ + int i; + + /* Clamp color */ + if (ctx->shader->key.part.ps.epilog.clamp_color) + for (i = 0; i < 4; i++) + color[i] = ac_build_clamp(&ctx->ac, color[i]); + + /* Alpha to one */ + if (ctx->shader->key.part.ps.epilog.alpha_to_one) + color[3] = ctx->ac.f32_1; + + /* Alpha test */ + if (index == 0 && + ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) + si_alpha_test(ctx, color[3]); + + /* Line & polygon smoothing */ + if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], + samplemask_param); + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { + struct ac_export_args args[8]; + int c, last = -1; + + /* Get the export arguments, also find out what the last one is. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + si_llvm_init_ps_export_args(ctx, color, + V_008DFC_SQ_EXP_MRT + c, &args[c]); + if (args[c].enabled_channels) + last = c; + } + + /* Emit all exports. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + if (is_last && last == c) { + args[c].valid_mask = 1; /* whether the EXEC mask is valid */ + args[c].done = 1; /* DONE bit */ + } else if (!args[c].enabled_channels) + continue; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); + } + } else { + struct ac_export_args args; + + /* Export */ + si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, + &args); + if (is_last) { + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + } else if (!args.enabled_channels) + return; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args, sizeof(args)); + } +} + +static void si_emit_ps_exports(struct si_shader_context *ctx, + struct si_ps_exports *exp) +{ + for (unsigned i = 0; i < exp->num; i++) + ac_build_export(&ctx->ac, &exp->args[i]); +} + +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = ctx->ac.builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + if (ctx->postponed_kill) + ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = addrs[4 * i + j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + addrs[4 * i + 0], ""); + break; + default: + fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + ac_to_integer(&ctx->ac, + LLVMGetParam(ctx->main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. */ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + +static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, + LLVMValueRef param_rw_buffers, + struct ac_arg param_pos_fixed_pt) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef slot, desc, offset, row, bit, address[2]; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the buffer descriptor. */ + slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0); + desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); + + /* The stipple pattern is 32x32, each row has 32 bits. */ + offset = LLVMBuildMul(builder, address[1], + LLVMConstInt(ctx->ac.i32, 4, 0), ""); + row = si_buffer_load_const(ctx, desc, offset); + row = ac_to_integer(&ctx->ac, row); + bit = LLVMBuildLShr(builder, row, address[0], ""); + bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, ""); + ac_build_kill_if_false(&ctx->ac, bit); +} + +static void si_llvm_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + + if (ctx->shader->selector->force_correct_derivs_after_kill) { + /* Kill immediately while maintaining WQM. */ + ac_build_kill_if_false(&ctx->ac, + ac_build_wqm_vote(&ctx->ac, visible)); + + LLVMValueRef mask = LLVMBuildLoad(builder, ctx->postponed_kill, ""); + mask = LLVMBuildAnd(builder, mask, visible, ""); + LLVMBuildStore(builder, mask, ctx->postponed_kill); + return; + } + + ac_build_kill_if_false(&ctx->ac, visible); +} + +/** + * Build the pixel shader prolog function. This handles: + * - two-side color selection and interpolation + * - overriding interpolation parameters for the API PS + * - polygon stippling + * + * All preloaded SGPRs and VGPRs are passed through unmodified unless they are + * overriden by other states. (e.g. per-sample interpolation) + * Interpolated colors are stored after the preloaded VGPRs. + */ +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + LLVMValueRef ret, func; + int num_returns, i, num_color_channels; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare inputs. */ + LLVMTypeRef return_types[AC_MAX_ARGS]; + num_returns = 0; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + assert(key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs + + num_color_channels <= AC_MAX_ARGS); + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return_types[num_returns++] = ctx->ac.i32; + + } + + struct ac_arg pos_fixed_pt; + struct ac_arg ancillary; + struct ac_arg param_sample_mask; + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) { + struct ac_arg *arg = NULL; + if (i == key->ps_prolog.ancillary_vgpr_index) { + arg = &ancillary; + } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) { + arg = ¶m_sample_mask; + } else if (i == key->ps_prolog.num_input_vgprs - 1) { + /* POS_FIXED_PT is always last. */ + arg = &pos_fixed_pt; + } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg); + return_types[num_returns++] = ctx->ac.f32; + } + + /* Declare outputs (same as inputs + add colors if needed) */ + for (i = 0; i < num_color_channels; i++) + return_types[num_returns++] = ctx->ac.f32; + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0); + func = ctx->main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < ctx->args.arg_count; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + + si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt); + } + + if (key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef center[2], centroid[2], tmp, bc_optimize; + + /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; + * The hw doesn't compute CENTROID if the whole wave only + * contains fully-covered quads. + * + * PRIM_MASK is after user SGPRs. + */ + bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, + LLVMConstInt(ctx->ac.i32, 31, 0), ""); + bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, + ctx->ac.i1, ""); + + if (key->ps_prolog.states.bc_optimize_for_persp) { + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 2 + i); + /* Read PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 4 + i); + /* Select PERSP_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, + center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + tmp, base + 4 + i, ""); + } + } + if (key->ps_prolog.states.bc_optimize_for_linear) { + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 8 + i); + /* Read LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 10 + i); + /* Select LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, + center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + tmp, base + 10 + i, ""); + } + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persp_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + persp_sample[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_sample[2]; + + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + linear_sample[i], base + 10 + i, ""); + } + + /* Force center interpolation. */ + if (key->ps_prolog.states.force_persp_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_center[2]; + + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + persp_center[i] = LLVMGetParam(func, base + 2 + i); + /* Overwrite PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + persp_center[i], base + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + persp_center[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_center[2]; + + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + linear_center[i] = LLVMGetParam(func, base + 8 + i); + /* Overwrite LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + linear_center[i], base + 6 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + linear_center[i], base + 10 + i, ""); + } + + /* Interpolate colors. */ + unsigned color_out_idx = 0; + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.color_interp_vgpr_index[i]; + + /* Get the (i,j) updated by bc_optimize handling. */ + interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, + interp_vgpr, ""); + interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, + interp_vgpr + 1, ""); + interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = ac_to_integer(&ctx->ac, face); + } + + interp_fs_color(ctx, + key->ps_prolog.color_attr_index[i], i, + key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, + prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], + ctx->args.arg_count + color_out_idx++, ""); + } + } + + /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec + * says: + * + * "When per-sample shading is active due to the use of a fragment + * input qualified by sample or due to the use of the gl_SampleID + * or gl_SamplePosition variables, only the bit for the current + * sample is set in gl_SampleMaskIn. When state specifies multiple + * fragment shader invocations for a given fragment, the sample + * mask for any single fragment shader invocation may specify a + * subset of the covered samples for the fragment. In this case, + * the bit corresponding to each covered sample will be set in + * exactly one fragment shader invocation." + * + * The samplemask loaded by hardware is always the coverage of the + * entire pixel/fragment, so mask bits out based on the sample ID. + */ + if (key->ps_prolog.states.samplemask_log_ps_iter) { + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const uint16_t ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, + 0x1111, + 0x0101, + 0x0001, + }; + assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); + + uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; + LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4); + LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask); + + samplemask = ac_to_integer(&ctx->ac, samplemask); + samplemask = LLVMBuildAnd( + ctx->ac.builder, + samplemask, + LLVMBuildShl(ctx->ac.builder, + LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), + sampleid, ""), + ""); + samplemask = ac_to_float(&ctx->ac, samplemask); + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, + param_sample_mask.arg_index, ""); + } + + /* Tell LLVM to insert WQM instruction sequence when needed. */ + if (key->ps_prolog.wqm) { + LLVMAddTargetDependentFunctionAttr(func, + "amdgpu-ps-wqm-outputs", ""); + } + + si_llvm_build_ret(ctx, ret); +} + +/** + * Build the pixel shader epilog function. This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int i; + struct si_ps_exports exp = {}; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare input SGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->bindless_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->const_and_shader_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->samplers_and_images); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, + NULL, SI_PARAM_ALPHA_REF); + + /* Declare input VGPRs. */ + unsigned required_num_params = + ctx->args.num_sgprs_used + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + required_num_params = MAX2(required_num_params, + ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + while (ctx->args.arg_count < required_num_params) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0); + /* Disable elimination of unused inputs. */ + ac_llvm_add_target_dep_function_attr(ctx->main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = ctx->args.num_sgprs_used; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx->main_fn, vgpr++); + + si_export_mrt_color(ctx, color, mrt, + ctx->args.arg_count - 1, + mrt == last_color_export, &exp); + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx->main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(ctx, depth, stencil, samplemask, &exp); + else if (last_color_export == -1) + ac_build_export_null(&ctx->ac); + + if (exp.num) + si_emit_ps_exports(ctx, &exp); + + /* Compile. */ + LLVMBuildRetVoid(ctx->ac.builder); +} + +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, + struct si_shader *shader) +{ + LLVMValueRef parts[3]; + unsigned num_parts = 0, main_index; + LLVMValueRef main_fn = ctx->main_fn; + + union si_shader_part_key prolog_key; + si_get_ps_prolog_key(shader, &prolog_key, false); + + if (si_need_ps_prolog(&prolog_key)) { + si_llvm_build_ps_prolog(ctx, &prolog_key); + parts[num_parts++] = ctx->main_fn; + } + + main_index = num_parts; + parts[num_parts++] = main_fn; + + union si_shader_part_key epilog_key; + si_get_ps_epilog_key(shader, &epilog_key); + si_llvm_build_ps_epilog(ctx, &epilog_key); + parts[num_parts++] = ctx->main_fn; + + si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); +} + +void si_llvm_init_ps_callbacks(struct si_shader_context *ctx) +{ + ctx->abi.emit_outputs = si_llvm_return_fs_outputs; + ctx->abi.load_sample_position = load_sample_position; + ctx->abi.load_sample_mask_in = load_sample_mask_in; + ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; + ctx->abi.emit_kill = si_llvm_emit_kill; +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,320 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" + +/** + * Return a value that is equal to the given i32 \p index if it lies in [0,num) + * or an undefined value in the same interval otherwise. + */ +static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, + LLVMValueRef index, + unsigned num) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); + LLVMValueRef cc; + + if (util_is_power_of_two_or_zero(num)) { + index = LLVMBuildAnd(builder, index, c_max, ""); + } else { + /* In theory, this MAX pattern should result in code that is + * as good as the bit-wise AND above. + * + * In practice, LLVM generates worse code (at the time of + * writing), because its value tracking is not strong enough. + */ + cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); + index = LLVMBuildSelect(builder, cc, index, c_max, ""); + } + + return index; +} + +static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) +{ + LLVMValueRef ptr = + ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + struct si_shader_selector *sel = ctx->shader->selector; + + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. + */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); + + LLVMValueRef desc0, desc1; + desc0 = ptr; + desc1 = LLVMConstInt(ctx->ac.i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc_elems[] = { + desc0, + desc1, + LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0), + LLVMConstInt(ctx->ac.i32, rsrc3, false) + }; + + return ac_build_gather_values(&ctx->ac, desc_elems, 4); +} + +static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_selector *sel = ctx->shader->selector; + + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + + if (sel->info.const_buffers_declared == 1 && + sel->info.shader_buffers_declared == 0) { + return load_const_buffer_desc_fast_path(ctx); + } + + index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); + + return ac_build_load_to_sgpr(&ctx->ac, ptr, index); +} + +static LLVMValueRef +load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, + ctx->const_and_shader_buffers); + + index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); + index = LLVMBuildSub(ctx->ac.builder, + LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), + index, ""); + + return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); +} + +/** + * Given a 256-bit resource descriptor, force the DCC enable bit to off. + * + * At least on Tonga, executing image stores on images with DCC enabled and + * non-trivial can eventually lead to lockups. This can occur when an + * application binds an image as read-only but then uses a shader that writes + * to it. The OpenGL spec allows almost arbitrarily bad behavior (including + * program termination) in this case, but it doesn't cost much to be a bit + * nicer: disabling DCC in the shader still leads to undefined results but + * avoids the lockup. + */ +static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, + LLVMValueRef rsrc) +{ + if (ctx->screen->info.chip_class <= GFX7) { + return rsrc; + } else { + LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); + LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); + LLVMValueRef tmp; + + tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); + tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); + return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); + } +} + +/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should + * adjust "index" to point to FMASK. */ +static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum ac_descriptor_type desc_type, + bool uses_store, bool bindless) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rsrc; + + if (desc_type == AC_DESC_BUFFER) { + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), + ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + } else { + assert(desc_type == AC_DESC_IMAGE || + desc_type == AC_DESC_FMASK); + } + + if (bindless) + rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); + else + rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); + + if (desc_type == AC_DESC_IMAGE && uses_store) + rsrc = force_dcc_off(ctx, rsrc); + return rsrc; +} + +/** + * Load an image view, fmask view. or sampler state descriptor. + */ +static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum ac_descriptor_type type) +{ + LLVMBuilderRef builder = ctx->ac.builder; + + switch (type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), + ctx->ac.i32_1); + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, 3, 0)); + list = LLVMBuildPointerCast(builder, list, + ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_PLANE_0: + case AC_DESC_PLANE_1: + case AC_DESC_PLANE_2: + /* Only used for the multiplane image support for Vulkan. Should + * never be reached in radeonsi. + */ + unreachable("Plane descriptor requested in radeonsi."); + } + + return ac_build_load_to_sgpr(&ctx->ac, list, index); +} + +static LLVMValueRef +si_nir_load_sampler_desc(struct ac_shader_abi *abi, + unsigned descriptor_set, unsigned base_index, + unsigned constant_index, LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, bool image, + bool write, bool bindless) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + unsigned const_index = base_index + constant_index; + + assert(!descriptor_set); + assert(desc_type <= AC_DESC_BUFFER); + + if (bindless) { + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); + + /* dynamic_index is the bindless handle */ + if (image) { + /* Bindless image descriptors use 16-dword slots. */ + dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, + LLVMConstInt(ctx->ac.i64, 2, 0), ""); + /* FMASK is right after the image. */ + if (desc_type == AC_DESC_FMASK) { + dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, + ctx->ac.i32_1, ""); + } + + return si_load_image_desc(ctx, list, dynamic_index, desc_type, + write, true); + } + + /* Since bindless handle arithmetic can contain an unsigned integer + * wraparound and si_load_sampler_desc assumes there isn't any, + * use GEP without "inbounds" (inside ac_build_pointer_add) + * to prevent incorrect code generation and hangs. + */ + dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, + LLVMConstInt(ctx->ac.i64, 2, 0), ""); + list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); + return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); + } + + unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; + assert(const_index < num_slots || dynamic_index); + + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); + + if (dynamic_index) { + index = LLVMBuildAdd(builder, index, dynamic_index, ""); + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + index = si_llvm_bound_index(ctx, index, num_slots); + } + + if (image) { + /* FMASKs are separate from images. */ + if (desc_type == AC_DESC_FMASK) { + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); + } + index = LLVMBuildSub(ctx->ac.builder, + LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), + index, ""); + return si_load_image_desc(ctx, list, index, desc_type, write, false); + } + + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); + return si_load_sampler_desc(ctx, list, index, desc_type); +} + +void si_llvm_init_resource_callbacks(struct si_shader_context *ctx) +{ + ctx->abi.load_ubo = load_ubo; + ctx->abi.load_ssbo = load_ssbo; + ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1295 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" + +static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) +{ + switch (ctx->type) { + case PIPE_SHADER_TESS_CTRL: + return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8); + + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id); + + default: + assert(0); + return NULL; + } +} + +/* Tessellation shaders pass outputs to the next shader using LDS. + * + * LS outputs = TCS inputs + * TCS outputs = TES inputs + * + * The LDS layout is: + * - TCS inputs for patch 0 + * - TCS inputs for patch 1 + * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) + * - ... + * - TCS outputs for patch 0 = get_tcs_out_patch0_offset + * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset + * - TCS outputs for patch 1 + * - Per-patch TCS outputs for patch 1 + * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) + * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) + * - ... + * + * All three shaders VS(LS), TCS, TES share the same LDS space. + */ + +static LLVMValueRef +get_tcs_in_patch_stride(struct si_shader_context *ctx) +{ + return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13); +} + +static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) +{ + assert(ctx->type == PIPE_SHADER_TESS_CTRL); + + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + + return util_last_bit64(ctx->shader->selector->outputs_written) * 4; +} + +static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) +{ + unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); + + return LLVMConstInt(ctx->ac.i32, stride, 0); +} + +static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) +{ + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); + + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); + unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + + num_patch_outputs * 4; + return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); +} + +static LLVMValueRef +get_tcs_out_patch0_offset(struct si_shader_context *ctx) +{ + return LLVMBuildMul(ctx->ac.builder, + si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); +} + +static LLVMValueRef +get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) +{ + return LLVMBuildMul(ctx->ac.builder, + si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); +} + +static LLVMValueRef +get_tcs_in_current_patch_offset(struct si_shader_context *ctx) +{ + LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + + return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); +} + +static LLVMValueRef +get_tcs_out_current_patch_offset(struct si_shader_context *ctx) +{ + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); +} + +static LLVMValueRef +get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) +{ + LLVMValueRef patch0_patch_data_offset = + get_tcs_out_patch0_patch_data_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); +} + +static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) +{ + unsigned tcs_out_vertices = + ctx->shader->selector ? + ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; + + /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ + if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) + return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); + + return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); +} + +static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) +{ + unsigned stride; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + stride = ctx->shader->selector->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + + case PIPE_SHADER_TESS_CTRL: + if (ctx->screen->info.chip_class >= GFX9 && + ctx->shader->is_monolithic) { + stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + } + return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); + + default: + assert(0); + return NULL; + } +} + +static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, + LLVMValueRef vertex_dw_stride, + LLVMValueRef base_addr, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + ubyte name, ubyte index) +{ + if (vertex_dw_stride) { + base_addr = ac_build_imad(&ctx->ac, vertex_index, + vertex_dw_stride, base_addr); + } + + if (param_index) { + base_addr = ac_build_imad(&ctx->ac, param_index, + LLVMConstInt(ctx->ac.i32, 4, 0), base_addr); + } + + int param = name == TGSI_SEMANTIC_PATCH || + name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER ? + si_shader_io_get_unique_index_patch(name, index) : + si_shader_io_get_unique_index(name, index, false); + + /* Add the base address of the element. */ + return LLVMBuildAdd(ctx->ac.builder, base_addr, + LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); +} + +/* The offchip buffer layout for TCS->TES is + * + * - attribute 0 of patch 0 vertex 0 + * - attribute 0 of patch 0 vertex 1 + * - attribute 0 of patch 0 vertex 2 + * ... + * - attribute 0 of patch 1 vertex 0 + * - attribute 0 of patch 1 vertex 1 + * ... + * - attribute 1 of patch 0 vertex 0 + * - attribute 1 of patch 0 vertex 1 + * ... + * - per patch attribute 0 of patch 0 + * - per patch attribute 0 of patch 1 + * ... + * + * Note that every attribute has 4 components. + */ +static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, + LLVMValueRef rel_patch_id, + LLVMValueRef vertex_index, + LLVMValueRef param_index) +{ + LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; + LLVMValueRef param_stride, constant16; + + vertices_per_patch = get_num_tcs_out_vertices(ctx); + num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); + total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, + num_patches, ""); + + constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); + if (vertex_index) { + base_addr = ac_build_imad(&ctx->ac, rel_patch_id, + vertices_per_patch, vertex_index); + param_stride = total_vertices; + } else { + base_addr = rel_patch_id; + param_stride = num_patches; + } + + base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); + base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); + + if (!vertex_index) { + LLVMValueRef patch_data_offset = + si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); + + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, + patch_data_offset, ""); + } + return base_addr; +} + +static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( + struct si_shader_context *ctx, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + ubyte name, ubyte index) +{ + unsigned param_index_base; + + param_index_base = name == TGSI_SEMANTIC_PATCH || + name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER ? + si_shader_io_get_unique_index_patch(name, index) : + si_shader_io_get_unique_index(name, index, false); + + if (param_index) { + param_index = LLVMBuildAdd(ctx->ac.builder, param_index, + LLVMConstInt(ctx->ac.i32, param_index_base, 0), + ""); + } else { + param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0); + } + + return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), + vertex_index, param_index); +} + +static LLVMValueRef buffer_load(struct si_shader_context *ctx, + LLVMTypeRef type, unsigned swizzle, + LLVMValueRef buffer, LLVMValueRef offset, + LLVMValueRef base, bool can_speculate) +{ + LLVMValueRef value, value2; + LLVMTypeRef vec_type = LLVMVectorType(type, 4); + + if (swizzle == ~0) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, + 0, ac_glc, can_speculate, false); + + return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + } + + if (ac_get_type_size(type) != 8) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, + 0, ac_glc, can_speculate, false); + + value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + return LLVMBuildExtractElement(ctx->ac.builder, value, + LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); + } + + value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, + swizzle * 4, ac_glc, can_speculate, false); + + value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, + swizzle * 4 + 4, ac_glc, can_speculate, false); + + return si_build_gather_64bit(ctx, type, value, value2); +} + +/** + * Load from LSHS LDS storage. + * + * \param type output value type + * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 + * \param dw_addr address in dwords + */ +static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, + LLVMTypeRef type, unsigned swizzle, + LLVMValueRef dw_addr) +{ + LLVMValueRef value; + + if (swizzle == ~0) { + LLVMValueRef values[4]; + + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = lshs_lds_load(ctx, type, chan, dw_addr); + + return ac_build_gather_values(&ctx->ac, values, 4); + } + + /* Split 64-bit loads. */ + if (ac_get_type_size(type) == 8) { + LLVMValueRef lo, hi; + + lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); + hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); + return si_build_gather_64bit(ctx, type, lo, hi); + } + + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, + LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); + + value = ac_lds_load(&ctx->ac, dw_addr); + + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); +} + +/** + * Store to LSHS LDS storage. + * + * \param swizzle offset (typically 0..3) + * \param dw_addr address in dwords + * \param value value to store + */ +static void lshs_lds_store(struct si_shader_context *ctx, + unsigned dw_offset_imm, LLVMValueRef dw_addr, + LLVMValueRef value) +{ + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, + LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), ""); + + ac_lds_store(&ctx->ac, dw_addr, value); +} + +enum si_tess_ring { + TCS_FACTOR_RING, + TESS_OFFCHIP_RING_TCS, + TESS_OFFCHIP_RING_TES, +}; + +static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, + enum si_tess_ring ring) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef addr = ac_get_arg(&ctx->ac, + ring == TESS_OFFCHIP_RING_TES ? + ctx->tes_offchip_addr : + ctx->tcs_out_lds_layout); + + /* TCS only receives high 13 bits of the address. */ + if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { + addr = LLVMBuildAnd(builder, addr, + LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), ""); + } + + if (ring == TCS_FACTOR_RING) { + unsigned tf_offset = ctx->screen->tess_offchip_ring_size; + addr = LLVMBuildAdd(builder, addr, + LLVMConstInt(ctx->ac.i32, tf_offset, 0), ""); + } + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | + S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc[4]; + desc[0] = addr; + desc[1] = LLVMConstInt(ctx->ac.i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0); + desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false); + + return ac_build_gather_values(&ctx->ac, desc, 4); +} + +void si_llvm_preload_tes_rings(struct si_shader_context *ctx) +{ + ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); +} + +static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, + LLVMTypeRef type, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact, + bool load_input) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef dw_addr, stride; + ubyte name, index; + + driver_location = driver_location / 4; + + if (load_input) { + name = info->input_semantic_name[driver_location]; + index = info->input_semantic_index[driver_location]; + } else { + name = info->output_semantic_name[driver_location]; + index = info->output_semantic_index[driver_location]; + } + + assert((name == TGSI_SEMANTIC_PATCH || + name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + if (load_input) { + stride = get_tcs_in_vertex_dw_stride(ctx); + dw_addr = get_tcs_in_current_patch_offset(ctx); + } else { + if (is_patch) { + stride = NULL; + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + } else { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + } + } + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, + vertex_index, param_index, + name, index); + + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; + + offset += component; + value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); +} + +static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, + LLVMTypeRef type, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + bool is_patch, + bool is_compact, + bool load_input) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef base, addr; + + driver_location = driver_location / 4; + ubyte name = info->input_semantic_name[driver_location]; + ubyte index = info->input_semantic_index[driver_location]; + + assert((name == TGSI_SEMANTIC_PATCH || + name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, + param_index, + name, index); + + /* TODO: This will generate rather ordinary llvm code, although it + * should be easy for the optimiser to fix up. In future we might want + * to refactor buffer_load(). + */ + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) { + offset *= 2; + if (offset == 4) { + ubyte name = info->input_semantic_name[driver_location + 1]; + ubyte index = info->input_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, + vertex_index, + param_index, + name, index); + } + + offset = offset % 4; + } + + offset += component; + value[i + component] = buffer_load(ctx, type, offset, + ctx->tess_offchip_ring, base, addr, true); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); +} + +static void si_nir_store_output_tcs(struct ac_shader_abi *abi, + const struct nir_variable *var, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + unsigned const_index, + LLVMValueRef src, + unsigned writemask) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + const unsigned component = var->data.location_frac; + unsigned driver_location = var->data.driver_location; + LLVMValueRef dw_addr, stride; + LLVMValueRef buffer, base, addr; + LLVMValueRef values[8]; + bool skip_lds_store; + bool is_tess_factor = false, is_tess_inner = false; + + driver_location = driver_location / 4; + ubyte name = info->output_semantic_name[driver_location]; + ubyte index = info->output_semantic_index[driver_location]; + + bool is_const = !param_index; + if (!param_index) + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + + const bool is_patch = var->data.patch || + var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + + /* Invalid SPIR-V can cause this. */ + if ((name == TGSI_SEMANTIC_PATCH || + name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) != is_patch) + return; + + if (!is_patch) { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, + vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_pervertex_outputs; + } else { + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, + vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_perpatch_outputs; + + if (is_const && const_index == 0) { + int name = info->output_semantic_name[driver_location]; + + /* Always write tess factors into LDS for the TCS epilog. */ + if (name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) { + /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ + skip_lds_store = !info->reads_tessfactor_outputs && + ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; + is_tess_factor = true; + is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; + } + } + } + + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, + param_index, name, index); + + for (unsigned chan = component; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + + unsigned buffer_store_offset = chan % 4; + if (chan == 4) { + ubyte name = info->output_semantic_name[driver_location + 1]; + ubyte index = info->output_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, + vertex_index, + param_index, + name, index); + } + + /* Skip LDS stores if there is no LDS read of this output. */ + if (!skip_lds_store) + lshs_lds_store(ctx, chan, dw_addr, value); + + value = ac_to_integer(&ctx->ac, value); + values[chan] = value; + + if (writemask != 0xF && !is_tess_factor) { + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, + addr, base, + 4 * buffer_store_offset, + ac_glc); + } + + /* Write tess factors into VGPRs for the epilog. */ + if (is_tess_factor && + ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + if (!is_tess_inner) { + LLVMBuildStore(ctx->ac.builder, value, /* outer */ + ctx->invoc0_tess_factors[chan]); + } else if (chan < 2) { + LLVMBuildStore(ctx->ac.builder, value, /* inner */ + ctx->invoc0_tess_factors[4 + chan]); + } + } + } + + if (writemask == 0xF && !is_tess_factor) { + LLVMValueRef value = ac_build_gather_values(&ctx->ac, + values, 4); + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, + base, 0, ac_glc); + } +} + +static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef coord[4] = { + ac_get_arg(&ctx->ac, ctx->tes_u), + ac_get_arg(&ctx->ac, ctx->tes_v), + ctx->ac.f32_0, + ctx->ac.f32_0 + }; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == + PIPE_PRIM_TRIANGLES) { + coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, + LLVMBuildFAdd(ctx->ac.builder, + coord[0], coord[1], ""), ""); + } + return ac_build_gather_values(&ctx->ac, coord, 4); +} + +static LLVMValueRef load_tess_level(struct si_shader_context *ctx, + unsigned semantic_name) +{ + LLVMValueRef base, addr; + + int param = si_shader_io_get_unique_index_patch(semantic_name, 0); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, + LLVMConstInt(ctx->ac.i32, param, 0)); + + return buffer_load(ctx, ctx->ac.f32, + ~0, ctx->tess_offchip_ring, base, addr, true); + +} + +static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, + unsigned semantic_name) +{ + LLVMValueRef buf, slot, val[4]; + int i, offset; + + slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); + buf = ac_get_arg(&ctx->ac, ctx->rw_buffers); + buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); + offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0; + + for (i = 0; i < 4; i++) + val[i] = si_buffer_load_const(ctx, buf, + LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0)); + return ac_build_gather_values(&ctx->ac, val, 4); +} + +static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, + unsigned varying_id, + bool load_default_state) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + unsigned semantic_name; + + if (load_default_state) { + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; + break; + default: + unreachable("unknown tess level"); + } + return load_tess_level_default(ctx, semantic_name); + } + + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESSINNER; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESSOUTER; + break; + default: + unreachable("unknown tess level"); + } + + return load_tess_level(ctx, semantic_name); + +} + +static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + if (ctx->type == PIPE_SHADER_TESS_CTRL) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + return get_num_tcs_out_vertices(ctx); + else + unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); +} + +/** + * Forward all outputs from the vertex shader to the TES. This is only used + * for the fixed function TCS. + */ +static void si_copy_tcs_inputs(struct si_shader_context *ctx) +{ + LLVMValueRef invocation_id, buffer, buffer_offset; + LLVMValueRef lds_vertex_stride, lds_base; + uint64_t inputs; + + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); + lds_base = get_tcs_in_current_patch_offset(ctx); + lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, + lds_base); + + inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; + while (inputs) { + unsigned i = u_bit_scan64(&inputs); + + LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, 4 * i, 0), + ""); + + LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, + get_rel_patch_id(ctx), + invocation_id, + LLVMConstInt(ctx->ac.i32, i, 0)); + + LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr); + + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, + buffer_offset, 0, ac_glc); + } +} + +static void si_write_tess_factors(struct si_shader_context *ctx, + LLVMValueRef rel_patch_id, + LLVMValueRef invocation_id, + LLVMValueRef tcs_out_current_patch_data_offset, + LLVMValueRef invoc0_tf_outer[4], + LLVMValueRef invoc0_tf_inner[2]) +{ + struct si_shader *shader = ctx->shader; + unsigned tess_inner_index, tess_outer_index; + LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; + LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; + unsigned stride, outer_comps, inner_comps, i, offset; + + /* Add a barrier before loading tess factors from LDS. */ + if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + si_llvm_emit_barrier(ctx); + + /* Do this only for invocation 0, because the tess levels are per-patch, + * not per-vertex. + * + * This can't jump, because invocation 0 executes this. It should + * at least mask out the loads and stores for other invocations. + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + invocation_id, ctx->ac.i32_0, ""), 6503); + + /* Determine the layout of one tess factor element in the buffer. */ + switch (shader->key.part.tcs.epilog.prim_mode) { + case PIPE_PRIM_LINES: + stride = 2; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 4; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + break; + case PIPE_PRIM_QUADS: + stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + break; + default: + assert(0); + return; + } + + for (i = 0; i < 4; i++) { + inner[i] = LLVMGetUndef(ctx->ac.i32); + outer[i] = LLVMGetUndef(ctx->ac.i32); + } + + if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + /* Tess factors are in VGPRs. */ + for (i = 0; i < outer_comps; i++) + outer[i] = out[i] = invoc0_tf_outer[i]; + for (i = 0; i < inner_comps; i++) + inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; + } else { + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. + */ + tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = tcs_out_current_patch_data_offset; + lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, + tess_inner_index * 4, 0), ""); + lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, + tess_outer_index * 4, 0), ""); + + for (i = 0; i < outer_comps; i++) { + outer[i] = out[i] = + lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer); + } + for (i = 0; i < inner_comps; i++) { + inner[i] = out[outer_comps+i] = + lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner); + } + } + + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what NIR specifies. + */ + LLVMValueRef tmp = out[0]; + out[0] = out[1]; + out[1] = tmp; + } + + /* Convert the outputs to vectors for stores. */ + vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); + vec1 = NULL; + + if (stride > 4) + vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); + + /* Get the buffer. */ + buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); + + /* Get the offset. */ + tf_base = ac_get_arg(&ctx->ac, + ctx->tcs_factor_offset); + byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, + LLVMConstInt(ctx->ac.i32, 4 * stride, 0), ""); + + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + rel_patch_id, ctx->ac.i32_0, ""), 6504); + + /* Store the dynamic HS control word. */ + offset = 0; + if (ctx->screen->info.chip_class <= GFX8) { + ac_build_buffer_store_dword(&ctx->ac, buffer, + LLVMConstInt(ctx->ac.i32, 0x80000000, 0), + 1, ctx->ac.i32_0, tf_base, + offset, ac_glc); + offset += 4; + } + + ac_build_endif(&ctx->ac, 6504); + + /* Store the tessellation factors. */ + ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, + MIN2(stride, 4), byteoffset, tf_base, + offset, ac_glc); + offset += 16; + if (vec1) + ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, + stride - 4, byteoffset, tf_base, + offset, ac_glc); + + /* Store the tess factors into the offchip buffer if TES reads them. */ + if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { + LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; + LLVMValueRef tf_inner_offset; + unsigned param_outer, param_inner; + + buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + param_outer = si_shader_io_get_unique_index_patch( + TGSI_SEMANTIC_TESSOUTER, 0); + tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_outer, 0)); + + unsigned outer_vec_size = + ac_has_vec3_support(ctx->screen->info.chip_class, false) ? + outer_comps : util_next_power_of_two(outer_comps); + outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); + + ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, + outer_comps, tf_outer_offset, + base, 0, ac_glc); + if (inner_comps) { + param_inner = si_shader_io_get_unique_index_patch( + TGSI_SEMANTIC_TESSINNER, 0); + tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_inner, 0)); + + inner_vec = inner_comps == 1 ? inner[0] : + ac_build_gather_values(&ctx->ac, inner, inner_comps); + ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, + inner_comps, tf_inner_offset, + base, 0, ac_glc); + } + } + + ac_build_endif(&ctx->ac, 6503); +} + +/* This only writes the tessellation factor levels. */ +static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + + si_copy_tcs_inputs(ctx); + + rel_patch_id = get_rel_patch_id(ctx); + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); + + if (ctx->screen->info.chip_class >= GFX9) { + LLVMBasicBlockRef blocks[2] = { + LLVMGetInsertBlock(builder), + ctx->merged_wrap_if_entry_block + }; + LLVMValueRef values[2]; + + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + values[0] = rel_patch_id; + values[1] = LLVMGetUndef(ctx->ac.i32); + rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = tf_lds_offset; + values[1] = LLVMGetUndef(ctx->ac.i32); + tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = invocation_id; + values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */ + invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + } + + /* Return epilog parameters from this function. */ + LLVMValueRef ret = ctx->return_value; + unsigned vgpr; + + if (ctx->screen->info.chip_class >= GFX9) { + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, + 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, + 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are at the beginning. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; + } else { + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, + GFX6_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, + GFX6_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are after user SGPRs. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, + GFX6_TCS_NUM_USER_SGPR); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, + GFX6_TCS_NUM_USER_SGPR + 1); + vgpr = GFX6_TCS_NUM_USER_SGPR + 2; + } + + /* VGPRs */ + rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); + invocation_id = ac_to_float(&ctx->ac, invocation_id); + tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); + + /* Leave a hole corresponding to the two input VGPRs. This ensures that + * the invocation_id output does not alias the tcs_rel_ids input, + * which saves a V_MOV on gfx9. + */ + vgpr += 2; + + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + + if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + vgpr++; /* skip the tess factor LDS offset */ + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef value = + LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); + value = ac_to_float(&ctx->ac, value); + ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); + } + } else { + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + } + ctx->return_value = ret; +} + +/* Pass TCS inputs from LS to TCS on GFX9. */ +static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) +{ + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, + ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + + ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, + 8 + SI_SGPR_VS_STATE_BITS); + + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, + 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, + 8 + GFX9_SGPR_TCS_OUT_OFFSETS); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, + 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + + unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)), + vgpr++, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)), + vgpr++, ""); + ctx->return_value = ret; +} + +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + unsigned i, chan; + LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); + LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); + LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, + vertex_dw_stride, ""); + + /* Write outputs to LDS. The next shader (TCS aka HS) will read + * its inputs from it. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned name = info->output_semantic_name[i]; + unsigned index = info->output_semantic_index[i]; + + /* The ARB_shader_viewport_layer_array spec contains the + * following issue: + * + * 2) What happens if gl_ViewportIndex or gl_Layer is + * written in the vertex shader and a geometry shader is + * present? + * + * RESOLVED: The value written by the last vertex processing + * stage is used. If the last vertex processing stage + * (vertex, tessellation evaluation or geometry) does not + * statically assign to gl_ViewportIndex or gl_Layer, index + * or layer zero is assumed. + * + * So writes to those outputs in VS-as-LS are simply ignored. + */ + if (name == TGSI_SEMANTIC_LAYER || + name == TGSI_SEMANTIC_VIEWPORT_INDEX) + continue; + + int param = si_shader_io_get_unique_index(name, index, false); + LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, + LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + lshs_lds_store(ctx, chan, dw_addr, + LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_ls_return_value_for_tcs(ctx); +} + +/** + * Compile the TCS epilog function. This writes tesselation factors to memory + * based on the output primitive type of the tesselator (determined by TES). + */ +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_out_lds_layout); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &ctx->tcs_factor_offset); + } + + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id); + struct ac_arg invocation_id; /* invocation ID within the patch */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id); + struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &tcs_out_current_patch_data_offset); + + struct ac_arg tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]); + + /* Create the function. */ + si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, + ctx->screen->info.chip_class >= GFX7 ? 128 : 0); + ac_declare_lds_as_pointer(&ctx->ac); + + LLVMValueRef invoc0_tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); + + si_write_tess_factors(ctx, + ac_get_arg(&ctx->ac, rel_patch_id), + ac_get_arg(&ctx->ac, invocation_id), + ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), + invoc0_tess_factors, invoc0_tess_factors + 4); + + LLVMBuildRetVoid(ctx->ac.builder); +} + +void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx) +{ + ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; + ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; +} + +void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) +{ + ctx->abi.load_tess_varyings = si_nir_load_input_tes; + ctx->abi.load_tess_coord = si_load_tess_coord; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + + if (ctx->shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (ctx->shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1131 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" +#include "util/u_memory.h" + +static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, + LLVMValueRef i32, unsigned index) +{ + assert(index <= 1); + + if (index == 1) + return LLVMBuildAShr(ctx->ac.builder, i32, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + return LLVMBuildSExt(ctx->ac.builder, + LLVMBuildTrunc(ctx->ac.builder, i32, + ctx->ac.i16, ""), + ctx->ac.i32, ""); +} + +static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, + LLVMValueRef out[4]) +{ + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + if (vs_blit_property) { + LLVMValueRef vertex_id = ctx->abi.vertex_id; + LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntULE, vertex_id, + ctx->ac.i32_1, ""); + /* Use LLVMIntNE, because we have 3 vertices and only + * the middle one should use y2. + */ + LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntNE, vertex_id, + ctx->ac.i32_1, ""); + + unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; + if (input_index == 0) { + /* Position: */ + LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs); + LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 1); + + LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); + LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); + LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); + LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); + + LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + + out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); + out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); + out[2] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 2); + out[3] = ctx->ac.f32_1; + return; + } + + /* Color or texture coordinates: */ + assert(input_index == 1); + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) { + out[i] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 3 + i); + } + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 3); + LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 4); + LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 5); + LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 6); + + out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + out[2] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 7); + out[3] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 8); + } + return; + } + + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + union si_vs_fix_fetch fix_fetch; + LLVMValueRef vb_desc; + LLVMValueRef vertex_index; + LLVMValueRef tmp; + + if (input_index < num_vbos_in_user_sgprs) { + vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); + } else { + unsigned index= input_index - num_vbos_in_user_sgprs; + vb_desc = ac_build_load_to_sgpr(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->vertex_buffers), + LLVMConstInt(ctx->ac.i32, index, 0)); + } + + vertex_index = LLVMGetParam(ctx->main_fn, + ctx->vertex_index0.arg_index + + input_index); + + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || + (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format( + &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, + fix_fetch.u.format, fix_fetch.u.reverse, !opencode, + vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); + return; + } + + /* Do multiple loads for special formats. */ + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { + num_fetches = 1; + fetch_stride = 0; + channels_per_fetch = required_channels; + } + + for (unsigned i = 0; i < num_fetches; ++i) { + LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, + channels_per_fetch, 0, true); + } + + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->ac.i32, i, false); + fetches[i] = LLVMBuildExtractElement( + ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; + } + + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->ac.f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && + required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. + */ + LLVMValueRef tmp = fetches[3]; + LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + + /* First, recover the sign-extended signed integer value. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) + tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); + else + tmp = ac_to_integer(&ctx->ac, tmp); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + tmp = LLVMBuildShl(ctx->ac.builder, tmp, + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? + LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); + tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); + + /* Convert back to the right type. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { + LLVMValueRef clamp; + LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + } + + fetches[3] = tmp; + } + + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); +} + +static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index) +{ + LLVMValueRef input[4]; + + load_input_vs(ctx, input_index / 4, input); + + for (unsigned chan = 0; chan < 4; chan++) { + ctx->inputs[input_index + chan] = + LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); + } +} + +void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir) +{ + uint64_t processed_inputs = 0; + + nir_foreach_variable(variable, &nir->inputs) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, + true); + unsigned input_idx = variable->data.driver_location; + unsigned loc = variable->data.location; + + for (unsigned i = 0; i < attrib_count; i++) { + /* Packed components share the same location so skip + * them if we have already processed the location. + */ + if (processed_inputs & ((uint64_t)1 << (loc + i))) { + input_idx += 4; + continue; + } + + declare_input_vs(ctx, input_idx); + if (glsl_type_is_dual_slot(variable->type)) { + input_idx += 4; + declare_input_vs(ctx, input_idx); + } + + processed_inputs |= ((uint64_t)1 << (loc + i)); + input_idx += 4; + } + } +} + +void si_llvm_streamout_store_output(struct si_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out) +{ + unsigned buf_idx = stream_out->output_buffer; + unsigned start = stream_out->start_component; + unsigned num_comps = stream_out->num_components; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Load the output as int. */ + for (int j = 0; j < num_comps; j++) { + assert(stream_out->stream == shader_out->vertex_stream[start + j]); + + out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v3i32 */ + if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { + vdata = ac_build_gather_values(&ctx->ac, out, num_comps); + break; + } + /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->ac.i32); + /* fall through */ + case 4: /* as v4i32 */ + vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); + break; + } + + ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], + vdata, num_comps, + so_write_offsets[buf_idx], + ctx->ac.i32_0, + stream_out->dst_offset * 4, ac_glc | ac_slc); +} + +/** + * Write streamout data to buffers for vertex stream @p stream (different + * vertex streams can occur for GS copy shaders). + */ +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) +{ + struct si_shader_selector *sel = ctx->shader->selector; + struct pipe_stream_output_info *so = &sel->so; + LLVMBuilderRef builder = ctx->ac.builder; + int i; + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + LLVMValueRef so_vtx_count = + si_unpack_param(ctx, ctx->streamout_config, 16, 7); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. */ + ac_build_ifcc(&ctx->ac, can_emit, 6501); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + + LLVMValueRef so_write_index = + ac_get_arg(&ctx->ac, + ctx->streamout_write_index); + + /* Compute (streamout_write_index + thread_id). */ + so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); + + /* Load the descriptor and compute the write offset for each + * enabled buffer. */ + LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4]; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, + ctx->rw_buffers); + + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, + SI_VS_STREAMOUT_BUF0 + i, 0); + + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + LLVMValueRef so_offset = ac_get_arg(&ctx->ac, + ctx->streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, + LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0), + so_offset); + } + + /* Write streamout data. */ + for (i = 0; i < so->num_outputs; i++) { + unsigned reg = so->output[i].register_index; + + if (reg >= noutput) + continue; + + if (stream != so->output[i].stream) + continue; + + si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, + &so->output[i], &outputs[reg]); + } + } + ac_build_endif(&ctx->ac, 6501); +} + +static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, + struct ac_export_args *pos, LLVMValueRef *out_elts) +{ + unsigned reg_index; + unsigned chan; + unsigned const_chan; + LLVMValueRef base_elt; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, + SI_VS_CONST_CLIP_PLANES, 0); + LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); + + for (reg_index = 0; reg_index < 2; reg_index ++) { + struct ac_export_args *args = &pos[2 + reg_index]; + + args->out[0] = + args->out[1] = + args->out[2] = + args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); + + /* Compute dot products of position and user clip plane vectors */ + for (chan = 0; chan < 4; chan++) { + for (const_chan = 0; const_chan < 4; const_chan++) { + LLVMValueRef addr = + LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + + const_chan) * 4, 0); + base_elt = si_buffer_load_const(ctx, const_resource, + addr); + args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, + out_elts[const_chan], args->out[chan]); + } + } + + args->enabled_channels = 0xf; + args->valid_mask = 0; + args->done = 0; + args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; + args->compr = 0; + } +} + +/* Initialize arguments for the shader export intrinsic */ +static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, + LLVMValueRef *values, + unsigned target, + struct ac_export_args *args) +{ + args->enabled_channels = 0xf; /* writemask - default is 0xf */ + args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ + args->done = 0; /* Specify whether this is the last export */ + args->target = target; /* Specify the target we are exporting */ + args->compr = false; + + memcpy(&args->out[0], values, sizeof(values[0]) * 4); +} + +static void si_export_param(struct si_shader_context *ctx, unsigned index, + LLVMValueRef *values) +{ + struct ac_export_args args; + + si_llvm_init_vs_export_args(ctx, values, + V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); +} + +static void si_build_param_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + struct si_shader *shader = ctx->shader; + unsigned param_count = 0; + + for (unsigned i = 0; i < noutput; i++) { + unsigned semantic_name = outputs[i].semantic_name; + unsigned semantic_index = outputs[i].semantic_index; + + if (outputs[i].vertex_stream[0] != 0 && + outputs[i].vertex_stream[1] != 0 && + outputs[i].vertex_stream[2] != 0 && + outputs[i].vertex_stream[3] != 0) + continue; + + switch (semantic_name) { + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORT_INDEX: + case TGSI_SEMANTIC_CLIPDIST: + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_PRIMID: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_GENERIC: + break; + default: + continue; + } + + if ((semantic_name != TGSI_SEMANTIC_GENERIC || + semantic_index < SI_MAX_IO_GENERIC) && + shader->key.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, + semantic_index, true))) + continue; + + si_export_param(ctx, param_count, outputs[i].values); + + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count++; + } + + shader->info.nr_param_exports = param_count; +} + +/** + * Vertex color clamping. + * + * This uses a state constant loaded in a user data SGPR and + * an IF statement is added that clamps all colors if the constant + * is true. + */ +static void si_vertex_color_clamping(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; + bool has_colors = false; + + /* Store original colors to alloca variables. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); + } + has_colors = true; + } + + if (!has_colors) + return; + + /* The state is in the first bit of the user SGPR. */ + LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + + ac_build_ifcc(&ctx->ac, cond, 6502); + + /* Store clamped colors to alloca variables within the conditional block. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + LLVMBuildStore(ctx->ac.builder, + ac_build_clamp(&ctx->ac, outputs[i].values[j]), + addr[i][j]); + } + } + ac_build_endif(&ctx->ac, 6502); + + /* Load clamped colors */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = + LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); + } + } +} + +/* Generate export instructions for hardware VS shader stage or NGG GS stage + * (position and parameter data only). + */ +void si_llvm_build_vs_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + struct si_shader *shader = ctx->shader; + struct ac_export_args pos_args[4] = {}; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; + unsigned pos_idx; + int i; + + si_vertex_color_clamping(ctx, outputs, noutput); + + /* Build position exports. */ + for (i = 0; i < noutput; i++) { + switch (outputs[i].semantic_name) { + case TGSI_SEMANTIC_POSITION: + si_llvm_init_vs_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case TGSI_SEMANTIC_PSIZE: + psize_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_LAYER: + layer_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_EDGEFLAG: + edgeflag_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_CLIPDIST: + if (!shader->key.opt.clip_disable) { + unsigned index = 2 + outputs[i].semantic_index; + si_llvm_init_vs_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + } + break; + case TGSI_SEMANTIC_CLIPVERTEX: + if (!shader->key.opt.clip_disable) { + si_llvm_emit_clipvertex(ctx, pos_args, + outputs[i].values); + } + break; + } + } + + /* We need to add the position output manually if it's missing. */ + if (!pos_args[0].out[0]) { + pos_args[0].enabled_channels = 0xf; /* writemask */ + pos_args[0].valid_mask = 0; /* EXEC mask */ + pos_args[0].done = 0; /* last export? */ + pos_args[0].target = V_008DFC_SQ_EXP_POS; + pos_args[0].compr = 0; /* COMPR flag */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ + } + + bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && + !shader->key.as_ngg; + + /* Write the misc vector (point size, edgeflag, layer, viewport). */ + if (shader->selector->info.writes_psize || + pos_writes_edgeflag || + shader->selector->info.writes_viewport_index || + shader->selector->info.writes_layer) { + pos_args[1].enabled_channels = shader->selector->info.writes_psize | + (pos_writes_edgeflag << 1) | + (shader->selector->info.writes_layer << 2); + + pos_args[1].valid_mask = 0; /* EXEC mask */ + pos_args[1].done = 0; /* last export? */ + pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; + pos_args[1].compr = 0; /* COMPR flag */ + pos_args[1].out[0] = ctx->ac.f32_0; /* X */ + pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[1].out[3] = ctx->ac.f32_0; /* W */ + + if (shader->selector->info.writes_psize) + pos_args[1].out[0] = psize_value; + + if (pos_writes_edgeflag) { + /* The output is a float, but the hw expects an integer + * with the first bit containing the edge flag. */ + edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, + edgeflag_value, + ctx->ac.i32, ""); + edgeflag_value = ac_build_umin(&ctx->ac, + edgeflag_value, + ctx->ac.i32_1); + + /* The LLVM intrinsic expects a float. */ + pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); + } + + if (ctx->screen->info.chip_class >= GFX9) { + /* GFX9 has the layer in out.z[10:0] and the viewport + * index in out.z[19:16]. + */ + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + LLVMValueRef v = viewport_index_value; + + v = ac_to_integer(&ctx->ac, v); + v = LLVMBuildShl(ctx->ac.builder, v, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + v = LLVMBuildOr(ctx->ac.builder, v, + ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); + pos_args[1].out[2] = ac_to_float(&ctx->ac, v); + pos_args[1].enabled_channels |= 1 << 2; + } + } else { + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + pos_args[1].out[3] = viewport_index_value; + pos_args[1].enabled_channels |= 1 << 3; + } + } + } + + for (i = 0; i < 4; i++) + if (pos_args[i].out[0]) + shader->info.nr_pos_exports++; + + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. + */ + if (ctx->screen->info.family == CHIP_NAVI10 || + ctx->screen->info.family == CHIP_NAVI12 || + ctx->screen->info.family == CHIP_NAVI14) + pos_args[0].valid_mask = 1; + + pos_idx = 0; + for (i = 0; i < 4; i++) { + if (!pos_args[i].out[0]) + continue; + + /* Specify the target we are exporting */ + pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; + + if (pos_idx == shader->info.nr_pos_exports) + /* Specify that this is the last export */ + pos_args[i].done = 1; + + ac_build_export(&ctx->ac, &pos_args[i]); + } + + /* Build parameter exports. */ + si_build_param_exports(ctx, outputs, noutput); +} + +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader_output_values *outputs = NULL; + int i,j; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + + for (i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (j = 0; j < 4; j++) { + outputs[i].values[j] = + LLVMBuildLoad(ctx->ac.builder, + addrs[4 * i + j], + ""); + outputs[i].vertex_stream[j] = + (info->output_streams[i] >> (2 * j)) & 3; + } + } + + if (!ctx->screen->use_ngg_streamout && + ctx->shader->selector->so.num_outputs) + si_llvm_emit_streamout(ctx, outputs, i, 0); + + /* Export PrimitiveID. */ + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); + for (j = 1; j < 4; j++) + outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); + + memset(outputs[i].vertex_stream, 0, + sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + FREE(outputs); +} + +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. */ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; +} + +/** + * Build the vertex shader prolog function. + * + * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). + * All inputs are returned unmodified. The vertex load indices are + * stored after them, which will be used by the API VS for fetching inputs. + * + * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: + * input_v0, + * input_v1, + * input_v2, + * input_v3, + * (VertexID + BaseVertex), + * (InstanceID + StartInstance), + * (InstanceID / 2 + StartInstance) + */ +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + LLVMTypeRef *returns; + LLVMValueRef ret, func; + int num_returns, i; + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; + unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 + + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); + struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; + struct ac_arg input_vgpr_param[10]; + LLVMValueRef input_vgprs[10]; + unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + + num_input_vgprs; + unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * + sizeof(LLVMTypeRef)); + num_returns = 0; + + /* Declare input and output SGPRs. */ + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &input_sgpr_param[i]); + returns[num_returns++] = ctx->ac.i32; + } + + struct ac_arg merged_wave_info = input_sgpr_param[3]; + + /* Preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < num_input_vgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); + returns[num_returns++] = ctx->ac.f32; + } + + /* Vertex load indices. */ + for (i = 0; i < key->vs_prolog.num_inputs; i++) + returns[num_returns++] = ctx->ac.f32; + + /* Create the function. */ + si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); + func = ctx->main_fn; + + for (i = 0; i < num_input_vgprs; i++) { + input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); + } + + if (key->vs_prolog.num_merged_next_stage_vgprs) { + if (!key->vs_prolog.is_monolithic) + si_init_exec_from_input(ctx, merged_wave_info, 0); + + if (key->vs_prolog.as_ls && + ctx->screen->info.has_ls_vgpr_init_bug) { + /* If there are no HS threads, SPI loads the LS VGPRs + * starting at VGPR 0. Shift them back to where they + * belong. + */ + LLVMValueRef has_hs_threads = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + si_unpack_param(ctx, input_sgpr_param[3], 8, 8), + ctx->ac.i32_0, ""); + + for (i = 4; i > 0; --i) { + input_vgprs[i + 1] = + LLVMBuildSelect(ctx->ac.builder, has_hs_threads, + input_vgprs[i + 1], + input_vgprs[i - 1], ""); + } + } + } + + if (key->vs_prolog.gs_fast_launch_tri_list || + key->vs_prolog.gs_fast_launch_tri_strip) { + LLVMValueRef wave_id, thread_id_in_tg; + + wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); + thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), + ac_get_thread_id(&ctx->ac)); + + /* The GS fast launch initializes all VGPRs to the value of + * the first thread, so we have to add the thread ID. + * + * Only these are initialized by the hw: + * VGPR2: Base Primitive ID + * VGPR5: Base Vertex ID + * VGPR6: Instance ID + */ + + /* Put the vertex thread IDs into VGPRs as-is instead of packing them. + * The NGG cull shader will read them from there. + */ + if (key->vs_prolog.gs_fast_launch_tri_list) { + input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ + LLVMConstInt(ctx->ac.i32, 0, 0)); + input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ + LLVMConstInt(ctx->ac.i32, 1, 0)); + input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ + LLVMConstInt(ctx->ac.i32, 2, 0)); + } else { + assert(key->vs_prolog.gs_fast_launch_tri_strip); + LLVMBuilderRef builder = ctx->ac.builder; + /* Triangle indices: */ + LLVMValueRef index[3] = { + thread_id_in_tg, + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->ac.i32, 1, 0), ""), + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->ac.i32, 2, 0), ""), + }; + LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, + thread_id_in_tg, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = + LLVMBuildICmp(builder, LLVMIntEQ, + si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), + ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, + flatshade_first, index); + input_vgprs[0] = index[0]; + input_vgprs[1] = index[1]; + input_vgprs[4] = index[2]; + } + + /* Triangles always have all edge flags set initially. */ + input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); + + input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], + thread_id_in_tg, ""); /* PrimID */ + input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], + thread_id_in_tg, ""); /* VertexID */ + input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + } + + unsigned vertex_id_vgpr = first_vs_vgpr; + unsigned instance_id_vgpr = + ctx->screen->info.chip_class >= GFX10 ? + first_vs_vgpr + 3 : + first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1); + + ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; + ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; + + /* InstanceID = VertexID >> 16; + * VertexID = VertexID & 0xffff; + */ + if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { + ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); + } + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + for (i = 0; i < num_input_vgprs; i++) { + LLVMValueRef p = input_vgprs[i]; + + if (i == vertex_id_vgpr) + p = ctx->abi.vertex_id; + else if (i == instance_id_vgpr) + p = ctx->abi.instance_id; + + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, + key->vs_prolog.num_input_sgprs + i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + LLVMValueRef instance_divisor_constbuf = NULL; + + if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = + LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = + ac_build_load_to_sgpr(&ctx->ac, list, buf_index); + } + + for (i = 0; i < key->vs_prolog.num_inputs; i++) { + bool divisor_is_one = + key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = + key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = + si_buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); + } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, + udiv_factors[0], udiv_factors[1], + udiv_factors[2], udiv_factors[3]); + } + + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + + SI_SGPR_START_INSTANCE), ""); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + LLVMGetParam(func, user_sgpr_base + + SI_SGPR_BASE_VERTEX), ""); + } + + index = ac_to_float(&ctx->ac, index); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, + ctx->args.arg_count + i, ""); + } + + si_llvm_build_ret(ctx, ret); +} + +static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + /* For non-indexed draws, the base vertex set by the driver + * (for direct draws) or the CP (for indirect draws) is the + * first vertex ID, but GLSL expects 0 to be returned. + */ + LLVMValueRef vs_state = ac_get_arg(&ctx->ac, + ctx->vs_state_bits); + LLVMValueRef indexed; + + indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); + indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); + + return LLVMBuildSelect(ctx->ac.builder, indexed, + ac_get_arg(&ctx->ac, ctx->args.base_vertex), + ctx->ac.i32_0, ""); +} + +void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) +{ + struct si_shader *shader = ctx->shader; + + if (shader->key.as_ls) + ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; + else if (shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + + ctx->abi.load_base_vertex = get_base_vertex; +} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_nir.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_nir.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_nir.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,7 @@ #include "compiler/nir/nir.h" #include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_deref.h" static nir_variable* tex_get_texture_var(nir_tex_instr *instr) { @@ -52,39 +53,87 @@ return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); } -static void gather_intrinsic_load_deref_input_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - nir_variable *var, - struct tgsi_shader_info *info) -{ - assert(var && var->data.mode == nir_var_shader_in); - - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: { - unsigned i = var->data.driver_location; - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); - uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa); - - for (unsigned j = 0; j < attrib_count; j++, i++) { - if (glsl_type_is_64bit(glsl_without_array(var->type))) { - unsigned dmask = mask; +static void gather_usage_helper(const nir_deref_instr **deref_ptr, + unsigned location, + uint8_t mask, + uint8_t *usage_mask) +{ + for (; *deref_ptr; deref_ptr++) { + const nir_deref_instr *deref = *deref_ptr; + switch (deref->deref_type) { + case nir_deref_type_array: { + unsigned elem_size = + glsl_count_attribute_slots(deref->type, false); + if (nir_src_is_const(deref->arr.index)) { + location += elem_size * nir_src_as_uint(deref->arr.index); + } else { + unsigned array_elems = + glsl_get_length(deref_ptr[-1]->type); + for (unsigned i = 0; i < array_elems; i++) { + gather_usage_helper(deref_ptr + 1, + location + elem_size * i, + mask, usage_mask); + } + return; + } + break; + } + case nir_deref_type_struct: { + const struct glsl_type *parent_type = + deref_ptr[-1]->type; + unsigned index = deref->strct.index; + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + location += glsl_count_attribute_slots(ft, false); + } + break; + } + default: + unreachable("Unhandled deref type in gather_components_used_helper"); + } + } - if (glsl_type_is_dual_slot(glsl_without_array(var->type)) && j % 2) - dmask >>= 2; + usage_mask[location] |= mask & 0xf; + if (mask & 0xf0) + usage_mask[location + 1] |= (mask >> 4) & 0xf; +} - dmask <<= var->data.location_frac / 2; +static void gather_usage(const nir_deref_instr *deref, + uint8_t mask, + uint8_t *usage_mask) +{ + nir_deref_path path; + nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); - if (dmask & 0x1) - info->input_usage_mask[i] |= TGSI_WRITEMASK_XY; - if (dmask & 0x2) - info->input_usage_mask[i] |= TGSI_WRITEMASK_ZW; - } else { - info->input_usage_mask[i] |= - (mask << var->data.location_frac) & 0xf; - } + unsigned location_frac = path.path[0]->var->data.location_frac; + if (glsl_type_is_64bit(deref->type)) { + uint8_t new_mask = 0; + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + new_mask |= 0x3 << (2 * i); } - break; + mask = new_mask << location_frac; + } else { + mask <<= location_frac; + mask &= 0xf; } + + gather_usage_helper((const nir_deref_instr **)&path.path[1], + path.path[0]->var->data.driver_location, + mask, usage_mask); + + nir_deref_path_finish(&path); +} + +static void gather_intrinsic_load_deref_input_info(const nir_shader *nir, + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + struct si_shader_info *info) +{ + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), + info->input_usage_mask); default:; } } @@ -92,7 +141,7 @@ static void gather_intrinsic_load_deref_output_info(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_variable *var, - struct tgsi_shader_info *info) + struct si_shader_info *info) { assert(var && var->data.mode == nir_var_shader_out); @@ -117,48 +166,22 @@ static void gather_intrinsic_store_deref_output_info(const nir_shader *nir, const nir_intrinsic_instr *instr, - nir_variable *var, - struct tgsi_shader_info *info) + const nir_deref_instr *deref, + struct si_shader_info *info) { - assert(var && var->data.mode == nir_var_shader_out); - switch (nir->info.stage) { case MESA_SHADER_VERTEX: /* needed by LS, ES */ case MESA_SHADER_TESS_EVAL: /* needed by ES */ - case MESA_SHADER_GEOMETRY: { - unsigned i = var->data.driver_location; - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); - unsigned mask = nir_intrinsic_write_mask(instr); - - assert(!var->data.compact); - - for (unsigned j = 0; j < attrib_count; j++, i++) { - if (glsl_type_is_64bit(glsl_without_array(var->type))) { - unsigned dmask = mask; - - if (glsl_type_is_dual_slot(glsl_without_array(var->type)) && j % 2) - dmask >>= 2; - - dmask <<= var->data.location_frac / 2; - - if (dmask & 0x1) - info->output_usagemask[i] |= TGSI_WRITEMASK_XY; - if (dmask & 0x2) - info->output_usagemask[i] |= TGSI_WRITEMASK_ZW; - } else { - info->output_usagemask[i] |= - (mask << var->data.location_frac) & 0xf; - } - - } + case MESA_SHADER_GEOMETRY: + gather_usage(deref, nir_intrinsic_write_mask(instr), + info->output_usagemask); break; - } default:; } } static void scan_instruction(const struct nir_shader *nir, - struct tgsi_shader_info *info, + struct si_shader_info *info, nir_instr *instr) { if (instr->type == nir_instr_type_alu) { @@ -213,6 +236,11 @@ case nir_intrinsic_load_num_work_groups: info->uses_grid_size = true; break; + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + info->uses_subgroup_info = true; + break; case nir_intrinsic_load_local_group_size: /* The block size is translated to IMM with a fixed block size. */ if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) @@ -254,25 +282,12 @@ info->reads_tess_factors = true; break; case nir_intrinsic_bindless_image_load: - info->uses_bindless_images = true; - - if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) - info->uses_bindless_buffer_load = true; - else - info->uses_bindless_image_load = true; - break; case nir_intrinsic_bindless_image_size: case nir_intrinsic_bindless_image_samples: info->uses_bindless_images = true; break; case nir_intrinsic_bindless_image_store: info->uses_bindless_images = true; - - if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) - info->uses_bindless_buffer_store = true; - else - info->uses_bindless_image_store = true; - info->writes_memory = true; info->num_memory_instructions++; /* we only care about stores */ break; @@ -281,26 +296,24 @@ info->num_memory_instructions++; /* we only care about stores */ break; case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: case nir_intrinsic_bindless_image_atomic_exchange: case nir_intrinsic_bindless_image_atomic_comp_swap: info->uses_bindless_images = true; - - if (nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF) - info->uses_bindless_buffer_atomic = true; - else - info->uses_bindless_image_atomic = true; - info->writes_memory = true; info->num_memory_instructions++; /* we only care about stores */ break; case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -372,7 +385,8 @@ if (mode == nir_var_shader_in) { /* PS inputs use the interpolated load intrinsics. */ assert(nir->info.stage != MESA_SHADER_FRAGMENT); - gather_intrinsic_load_deref_input_info(nir, intr, var, info); + gather_intrinsic_load_deref_input_info(nir, intr, + nir_src_as_deref(intr->src[0]), info); } else if (mode == nir_var_shader_out) { gather_intrinsic_load_deref_output_info(nir, intr, var, info); } @@ -382,7 +396,8 @@ nir_variable *var = intrinsic_get_var(intr); if (var->data.mode == nir_var_shader_out) - gather_intrinsic_store_deref_output_info(nir, intr, var, info); + gather_intrinsic_store_deref_output_info(nir, intr, + nir_src_as_deref(intr->src[0]), info); break; } case nir_intrinsic_interp_deref_at_centroid: @@ -396,27 +411,161 @@ } } -void si_nir_scan_tess_ctrl(const struct nir_shader *nir, - struct tgsi_tessctrl_info *out) +static void scan_output_slot(const nir_variable *var, + unsigned var_idx, + unsigned component, unsigned num_components, + struct si_shader_info *info) { - memset(out, 0, sizeof(*out)); + assert(component + num_components <= 4); + assert(component < 4); - if (nir->info.stage != MESA_SHADER_TESS_CTRL) - return; + unsigned semantic_name, semantic_index; + + unsigned location = var->data.location + var_idx; + unsigned drv_location = var->data.driver_location + var_idx; - out->tessfactors_are_def_in_all_invocs = - ac_are_tessfactors_def_in_all_invocs(nir); + if (info->processor == PIPE_SHADER_FRAGMENT) { + tgsi_get_gl_frag_result_semantic(location, + &semantic_name, &semantic_index); + + /* Adjust for dual source blending */ + if (var->data.index > 0) { + semantic_index++; + } + } else { + tgsi_get_gl_varying_semantic(location, true, + &semantic_name, &semantic_index); + } + + ubyte usagemask = ((1 << num_components) - 1) << component; + + unsigned gs_out_streams; + if (var->data.stream & NIR_STREAM_PACKED) { + gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED; + } else { + assert(var->data.stream < 4); + gs_out_streams = 0; + for (unsigned j = 0; j < num_components; ++j) + gs_out_streams |= var->data.stream << (2 * (component + j)); + } + + unsigned streamx = gs_out_streams & 3; + unsigned streamy = (gs_out_streams >> 2) & 3; + unsigned streamz = (gs_out_streams >> 4) & 3; + unsigned streamw = (gs_out_streams >> 6) & 3; + + if (usagemask & TGSI_WRITEMASK_X) { + info->output_streams[drv_location] |= streamx; + info->num_stream_output_components[streamx]++; + } + if (usagemask & TGSI_WRITEMASK_Y) { + info->output_streams[drv_location] |= streamy << 2; + info->num_stream_output_components[streamy]++; + } + if (usagemask & TGSI_WRITEMASK_Z) { + info->output_streams[drv_location] |= streamz << 4; + info->num_stream_output_components[streamz]++; + } + if (usagemask & TGSI_WRITEMASK_W) { + info->output_streams[drv_location] |= streamw << 6; + info->num_stream_output_components[streamw]++; + } + + info->output_semantic_name[drv_location] = semantic_name; + info->output_semantic_index[drv_location] = semantic_index; + + switch (semantic_name) { + case TGSI_SEMANTIC_PRIMID: + info->writes_primid = true; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + info->writes_viewport_index = true; + break; + case TGSI_SEMANTIC_LAYER: + info->writes_layer = true; + break; + case TGSI_SEMANTIC_PSIZE: + info->writes_psize = true; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + info->writes_clipvertex = true; + break; + case TGSI_SEMANTIC_COLOR: + info->colors_written |= 1 << semantic_index; + break; + case TGSI_SEMANTIC_STENCIL: + info->writes_stencil = true; + break; + case TGSI_SEMANTIC_SAMPLEMASK: + info->writes_samplemask = true; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->writes_edgeflag = true; + break; + case TGSI_SEMANTIC_POSITION: + if (info->processor == PIPE_SHADER_FRAGMENT) + info->writes_z = true; + else + info->writes_position = true; + break; + } +} + +static void scan_output_helper(const nir_variable *var, + unsigned location, + const struct glsl_type *type, + struct si_shader_info *info) +{ + if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) { + for (unsigned i = 0; i < glsl_get_length(type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(type, i); + scan_output_helper(var, location, ft, info); + location += glsl_count_attribute_slots(ft, false); + } + } else if (glsl_type_is_array_or_matrix(type)) { + const struct glsl_type *elem_type = + glsl_get_array_element(type); + unsigned num_elems = glsl_get_length(type); + if (var->data.compact) { + assert(glsl_type_is_scalar(elem_type)); + assert(glsl_get_bit_size(elem_type) == 32); + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, + MIN2(num_elems, 4 - component), info); + if (component + num_elems > 4) { + scan_output_slot(var, location + 1, 0, + component + num_elems - 4, info); + } + + } else { + unsigned elem_count = glsl_count_attribute_slots(elem_type, false); + for (unsigned i = 0; i < num_elems; i++) { + scan_output_helper(var, location, elem_type, info); + location += elem_count; + } + } + } else if (glsl_type_is_dual_slot(type)) { + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, 4 - component, info); + scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, + info); + } else { + unsigned component = var->data.location_frac; + assert(glsl_type_is_vector_or_scalar(type)); + unsigned num_components = glsl_get_components(type); + if (glsl_type_is_64bit(type)) + num_components *= 2; + scan_output_slot(var, location, component, num_components, info); + } } void si_nir_scan_shader(const struct nir_shader *nir, - struct tgsi_shader_info *info) + struct si_shader_info *info) { nir_function *func; unsigned i; info->processor = pipe_shader_type_from_mesa(nir->info.stage); - info->num_tokens = 2; /* indicate that the shader is non-empty */ - info->num_instructions = 2; info->properties[TGSI_PROPERTY_NEXT_SHADER] = pipe_shader_type_from_mesa(nir->info.next_stage); @@ -514,13 +663,8 @@ * tracker has already mapped them to attributes via * variable->data.driver_location. */ - if (nir->info.stage == MESA_SHADER_VERTEX) { - processed_inputs |= 1ull << i; - - if (glsl_type_is_dual_slot(glsl_without_array(variable->type))) - processed_inputs |= 2ull << i; + if (nir->info.stage == MESA_SHADER_VERTEX) continue; - } for (unsigned j = 0; j < attrib_count; j++, i++) { @@ -583,149 +727,15 @@ } } - i = 0; - uint64_t processed_outputs = 0; nir_foreach_variable(variable, &nir->outputs) { - unsigned semantic_name, semantic_index; - - i = variable->data.driver_location; - const struct glsl_type *type = variable->type; if (nir_is_per_vertex_io(variable, nir->info.stage)) { assert(glsl_type_is_array(type)); type = glsl_get_array_element(type); } - unsigned attrib_count = glsl_count_attribute_slots(type, false); - for (unsigned k = 0; k < attrib_count; k++, i++) { - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - tgsi_get_gl_frag_result_semantic(variable->data.location + k, - &semantic_name, &semantic_index); - - /* Adjust for dual source blending */ - if (variable->data.index > 0) { - semantic_index++; - } - } else { - tgsi_get_gl_varying_semantic(variable->data.location + k, true, - &semantic_name, &semantic_index); - } - - unsigned num_components = 4; - unsigned vector_elements = glsl_get_vector_elements(glsl_without_array(variable->type)); - if (vector_elements) - num_components = vector_elements; - - unsigned component = variable->data.location_frac; - if (glsl_type_is_64bit(glsl_without_array(variable->type))) { - if (glsl_type_is_dual_slot(glsl_without_array(variable->type)) && k % 2) { - num_components = (num_components * 2) - 4; - component = 0; - } else { - num_components = MIN2(num_components * 2, 4); - } - } - - ubyte usagemask = 0; - for (unsigned j = component; j < num_components + component; j++) { - switch (j) { - case 0: - usagemask |= TGSI_WRITEMASK_X; - break; - case 1: - usagemask |= TGSI_WRITEMASK_Y; - break; - case 2: - usagemask |= TGSI_WRITEMASK_Z; - break; - case 3: - usagemask |= TGSI_WRITEMASK_W; - break; - default: - unreachable("error calculating component index"); - } - } - - unsigned gs_out_streams; - if (variable->data.stream & (1u << 31)) { - gs_out_streams = variable->data.stream & ~(1u << 31); - } else { - assert(variable->data.stream < 4); - gs_out_streams = 0; - for (unsigned j = 0; j < num_components; ++j) - gs_out_streams |= variable->data.stream << (2 * (component + j)); - } - - unsigned streamx = gs_out_streams & 3; - unsigned streamy = (gs_out_streams >> 2) & 3; - unsigned streamz = (gs_out_streams >> 4) & 3; - unsigned streamw = (gs_out_streams >> 6) & 3; - - if (usagemask & TGSI_WRITEMASK_X) { - info->output_streams[i] |= streamx; - info->num_stream_output_components[streamx]++; - } - if (usagemask & TGSI_WRITEMASK_Y) { - info->output_streams[i] |= streamy << 2; - info->num_stream_output_components[streamy]++; - } - if (usagemask & TGSI_WRITEMASK_Z) { - info->output_streams[i] |= streamz << 4; - info->num_stream_output_components[streamz]++; - } - if (usagemask & TGSI_WRITEMASK_W) { - info->output_streams[i] |= streamw << 6; - info->num_stream_output_components[streamw]++; - } - - /* make sure we only count this location once against - * the num_outputs counter. - */ - if (processed_outputs & ((uint64_t)1 << i)) - continue; - - processed_outputs |= ((uint64_t)1 << i); - - info->output_semantic_name[i] = semantic_name; - info->output_semantic_index[i] = semantic_index; - - switch (semantic_name) { - case TGSI_SEMANTIC_PRIMID: - info->writes_primid = true; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - info->writes_viewport_index = true; - break; - case TGSI_SEMANTIC_LAYER: - info->writes_layer = true; - break; - case TGSI_SEMANTIC_PSIZE: - info->writes_psize = true; - break; - case TGSI_SEMANTIC_CLIPVERTEX: - info->writes_clipvertex = true; - break; - case TGSI_SEMANTIC_COLOR: - info->colors_written |= 1 << semantic_index; - break; - case TGSI_SEMANTIC_STENCIL: - info->writes_stencil = true; - break; - case TGSI_SEMANTIC_SAMPLEMASK: - info->writes_samplemask = true; - break; - case TGSI_SEMANTIC_EDGEFLAG: - info->writes_edgeflag = true; - break; - case TGSI_SEMANTIC_POSITION: - if (info->processor == PIPE_SHADER_FRAGMENT) - info->writes_z = true; - else - info->writes_position = true; - break; - } - } + ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false); + scan_output_helper(variable, 0, type, info); unsigned loc = variable->data.location; if (nir->info.stage == MESA_SHADER_FRAGMENT && @@ -736,99 +746,17 @@ } } - info->num_inputs = util_last_bit64(processed_inputs); - info->num_outputs = util_last_bit64(processed_outputs); - - /* Inputs and outputs can't have holes. If this fails, use - * nir_assign_io_var_locations to re-assign driver_location. - */ - assert(processed_inputs == u_bit_consecutive64(0, info->num_inputs)); - assert(processed_outputs == u_bit_consecutive64(0, info->num_outputs)); - - struct set *ubo_set = _mesa_set_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - struct set *ssbo_set = _mesa_set_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - /* Intialise const_file_max[0] */ - info->const_file_max[0] = -1; - - /* The first 8 are reserved for atomic counters using ssbo */ - unsigned ssbo_idx = 8; + info->num_inputs = nir->num_inputs; + info->num_outputs = nir->num_outputs; - unsigned ubo_idx = 1; - nir_foreach_variable(variable, &nir->uniforms) { - const struct glsl_type *type = variable->type; - enum glsl_base_type base_type = - glsl_get_base_type(glsl_without_array(type)); - unsigned aoa_size = MAX2(1, glsl_get_aoa_size(type)); - unsigned loc = variable->data.driver_location / 4; - int slot_count = glsl_count_attribute_slots(type, false); - int max_slot = MAX2(info->const_file_max[0], (int) loc) + slot_count; - - /* Gather buffers declared bitmasks. Note: radeonsi doesn't - * really use the mask (other than ubo_idx == 1 for regular - * uniforms) its really only used for getting the buffer count - * so we don't need to worry about the ordering. - */ - if (variable->interface_type != NULL) { - if (variable->data.mode == nir_var_uniform || - variable->data.mode == nir_var_mem_ubo || - variable->data.mode == nir_var_mem_ssbo) { - - struct set *buf_set = variable->data.mode == nir_var_mem_ssbo ? - ssbo_set : ubo_set; - - unsigned block_count; - if (base_type != GLSL_TYPE_INTERFACE) { - struct set_entry *entry = - _mesa_set_search(buf_set, variable->interface_type); - - /* Check if we have already processed - * a member from this ubo. - */ - if (entry) - continue; - - block_count = 1; - } else { - block_count = aoa_size; - } - - if (variable->data.mode == nir_var_uniform || - variable->data.mode == nir_var_mem_ubo) { - info->const_buffers_declared |= u_bit_consecutive(ubo_idx, block_count); - ubo_idx += block_count; - } else { - assert(variable->data.mode == nir_var_mem_ssbo); - - info->shader_buffers_declared |= u_bit_consecutive(ssbo_idx, block_count); - ssbo_idx += block_count; - } - - _mesa_set_add(buf_set, variable->interface_type); - } - - continue; - } - - /* We rely on the fact that nir_lower_samplers_as_deref has - * eliminated struct dereferences. - */ - if (base_type == GLSL_TYPE_SAMPLER && !variable->data.bindless) { - info->samplers_declared |= - u_bit_consecutive(variable->data.binding, aoa_size); - } else if (base_type == GLSL_TYPE_IMAGE && !variable->data.bindless) { - info->images_declared |= - u_bit_consecutive(variable->data.binding, aoa_size); - } else if (base_type != GLSL_TYPE_ATOMIC_UINT) { - info->const_buffers_declared |= 1; - info->const_file_max[0] = max_slot; - } - } - - _mesa_set_destroy(ubo_set, NULL); - _mesa_set_destroy(ssbo_set, NULL); + info->constbuf0_num_slots = nir->num_uniforms; + info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos); + info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); + if (nir->num_uniforms > 0) + info->const_buffers_declared |= 1; + info->images_declared = u_bit_consecutive(0, nir->info.num_images); + info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1); + info->samplers_declared = nir->info.textures_used; info->num_written_clipdistance = nir->info.clip_distance_array_size; info->num_written_culldistance = nir->info.cull_distance_array_size; @@ -838,6 +766,11 @@ if (info->processor == PIPE_SHADER_FRAGMENT) info->uses_kill = nir->info.fs.uses_discard; + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->tessfactors_are_def_in_all_invocs = + ac_are_tessfactors_def_in_all_invocs(nir); + } + func = (struct nir_function *)exec_list_get_head_const(&nir->functions); nir_foreach_block(block, func->impl) { nir_foreach_instr(instr, block) @@ -845,14 +778,10 @@ } } -void +static void si_nir_opts(struct nir_shader *nir) { bool progress; - unsigned lower_flrp = - (nir->options->lower_flrp16 ? 16 : 0) | - (nir->options->lower_flrp32 ? 32 : 0) | - (nir->options->lower_flrp64 ? 64 : 0); do { progress = false; @@ -862,7 +791,7 @@ NIR_PASS(progress, nir, nir_opt_copy_prop_vars); NIR_PASS(progress, nir, nir_opt_dead_write_vars); - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(nir, nir_lower_phis_to_scalar); /* (Constant) copy propagation is needed for txf with offsets. */ @@ -883,7 +812,12 @@ NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); - if (lower_flrp != 0) { + if (!nir->info.flrp_lowered) { + unsigned lower_flrp = + (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + assert(lower_flrp); bool lower_flrp_progress = false; NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, @@ -899,7 +833,7 @@ /* Nothing should rematerialize any flrps, so we only * need to do this lowering once. */ - lower_flrp = 0; + nir->info.flrp_lowered = true; } NIR_PASS(progress, nir, nir_opt_undef); @@ -957,7 +891,7 @@ } } -void si_nir_lower_ps_inputs(struct nir_shader *nir) +static void si_nir_lower_ps_inputs(struct nir_shader *nir) { if (nir->info.stage != MESA_SHADER_FRAGMENT) return; @@ -982,32 +916,27 @@ nir_var_shader_in); } -/** - * Perform "lowering" operations on the NIR that are run once when the shader - * selector is created. - */ -void si_lower_nir(struct si_shader_selector *sel) +void si_nir_adjust_driver_locations(struct nir_shader *nir) { /* Adjust the driver location of inputs and outputs. The state tracker * interprets them as slots, while the ac/nir backend interprets them * as individual components. */ - if (sel->nir->info.stage != MESA_SHADER_FRAGMENT) { - nir_foreach_variable(variable, &sel->nir->inputs) + if (nir->info.stage != MESA_SHADER_FRAGMENT) { + nir_foreach_variable(variable, &nir->inputs) variable->data.driver_location *= 4; } - nir_foreach_variable(variable, &sel->nir->outputs) { + nir_foreach_variable(variable, &nir->outputs) variable->data.driver_location *= 4; +} - if (sel->nir->info.stage == MESA_SHADER_FRAGMENT) { - if (variable->data.location == FRAG_RESULT_DEPTH) - variable->data.driver_location += 2; - else if (variable->data.location == FRAG_RESULT_STENCIL) - variable->data.driver_location += 1; - } - } - +/** + * Perform "lowering" operations on the NIR that are run once when the shader + * selector is created. + */ +static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) +{ /* Perform lowerings (and optimizations) of code. * * Performance considerations aside, we must: @@ -1019,7 +948,7 @@ static const struct nir_lower_tex_options lower_tex_options = { .lower_txp = ~0u, }; - NIR_PASS_V(sel->nir, nir_lower_tex, &lower_tex_options); + NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); const nir_lower_subgroups_options subgroups_options = { .subgroup_size = 64, @@ -1029,238 +958,41 @@ .lower_vote_trivial = false, .lower_vote_eq_to_ballot = true, }; - NIR_PASS_V(sel->nir, nir_lower_subgroups, &subgroups_options); + NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); - ac_lower_indirect_derefs(sel->nir, sel->screen->info.chip_class); - - si_nir_opts(sel->nir); - - NIR_PASS_V(sel->nir, nir_lower_bool_to_int32); - - /* Strip the resulting shader so that the shader cache is more likely - * to hit from other similar shaders. + /* Lower load constants to scalar and then clean up the mess */ + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_pack); + NIR_PASS_V(nir, nir_opt_access); + si_nir_opts(nir); + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + * + * st/mesa calls finalize_nir twice, but we can't call this pass twice. */ - nir_strip(sel->nir); -} - -static void declare_nir_input_vs(struct si_shader_context *ctx, - struct nir_variable *variable, - unsigned input_index, - LLVMValueRef out[4]) -{ - si_llvm_load_input_vs(ctx, input_index, out); -} - -LLVMValueRef -si_nir_lookup_interp_param(struct ac_shader_abi *abi, - enum glsl_interp_mode interp, unsigned location) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - int interp_param_idx = -1; - - switch (interp) { - case INTERP_MODE_FLAT: - return NULL; - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (location == INTERP_CENTER) - interp_param_idx = SI_PARAM_PERSP_CENTER; - else if (location == INTERP_CENTROID) - interp_param_idx = SI_PARAM_PERSP_CENTROID; - else if (location == INTERP_SAMPLE) - interp_param_idx = SI_PARAM_PERSP_SAMPLE; - break; - case INTERP_MODE_NOPERSPECTIVE: - if (location == INTERP_CENTER) - interp_param_idx = SI_PARAM_LINEAR_CENTER; - else if (location == INTERP_CENTROID) - interp_param_idx = SI_PARAM_LINEAR_CENTROID; - else if (location == INTERP_SAMPLE) - interp_param_idx = SI_PARAM_LINEAR_SAMPLE; - break; - default: - assert(!"Unhandled interpolation mode."); - return NULL; - } - - return interp_param_idx != -1 ? - LLVMGetParam(ctx->main_fn, interp_param_idx) : NULL; -} - -static LLVMValueRef -si_nir_load_sampler_desc(struct ac_shader_abi *abi, - unsigned descriptor_set, unsigned base_index, - unsigned constant_index, LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, bool image, - bool write, bool bindless) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned const_index = base_index + constant_index; - - assert(!descriptor_set); - assert(!image || desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER); - - if (bindless) { - LLVMValueRef list = - LLVMGetParam(ctx->main_fn, ctx->param_bindless_samplers_and_images); - - /* dynamic_index is the bindless handle */ - if (image) { - /* For simplicity, bindless image descriptors use fixed - * 16-dword slots for now. - */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->i64, 2, 0), ""); - - return si_load_image_desc(ctx, list, dynamic_index, desc_type, - write, true); - } - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. - */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->i64, 2, 0), ""); - list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); - return si_load_sampler_desc(ctx, list, ctx->i32_0, desc_type); - } - - unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; - assert(const_index < num_slots || dynamic_index); - - LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); - - if (dynamic_index) { - index = LLVMBuildAdd(builder, index, dynamic_index, ""); - - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_llvm_bound_index(ctx, index, num_slots); - } - - if (image) { - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), - index, ""); - return si_load_image_desc(ctx, list, index, desc_type, write, false); + bool changed = false; + if (!nir->constant_data) { + NIR_PASS(changed, nir, nir_opt_large_constants, + glsl_get_natural_size_align_bytes, 16); } - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), ""); - return si_load_sampler_desc(ctx, list, index, desc_type); -} + changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class); + if (changed) + si_nir_opts(nir); -static void bitcast_inputs(struct si_shader_context *ctx, - LLVMValueRef data[4], - unsigned input_idx) -{ - for (unsigned chan = 0; chan < 4; chan++) { - ctx->inputs[input_idx + chan] = - LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, ""); - } + NIR_PASS_V(nir, nir_lower_bool_to_int32); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); } -bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) +void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize) { - struct tgsi_shader_info *info = &ctx->shader->selector->info; - - if (nir->info.stage == MESA_SHADER_VERTEX) { - uint64_t processed_inputs = 0; - nir_foreach_variable(variable, &nir->inputs) { - unsigned attrib_count = glsl_count_attribute_slots(variable->type, - true); - unsigned input_idx = variable->data.driver_location; - - LLVMValueRef data[4]; - unsigned loc = variable->data.location; - - for (unsigned i = 0; i < attrib_count; i++) { - /* Packed components share the same location so skip - * them if we have already processed the location. - */ - if (processed_inputs & ((uint64_t)1 << (loc + i))) { - input_idx += 4; - continue; - } - - declare_nir_input_vs(ctx, variable, input_idx / 4, data); - bitcast_inputs(ctx, data, input_idx); - if (glsl_type_is_dual_slot(variable->type)) { - input_idx += 4; - declare_nir_input_vs(ctx, variable, input_idx / 4, data); - bitcast_inputs(ctx, data, input_idx); - } - - processed_inputs |= ((uint64_t)1 << (loc + i)); - input_idx += 4; - } - } - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - unsigned colors_read = - ctx->shader->selector->info.colors_read; - LLVMValueRef main_fn = ctx->main_fn; - - LLVMValueRef undef = LLVMGetUndef(ctx->f32); - - unsigned offset = SI_PARAM_POS_FIXED_PT + 1; - - if (colors_read & 0x0f) { - unsigned mask = colors_read & 0x0f; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color0 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - if (colors_read & 0xf0) { - unsigned mask = (colors_read & 0xf0) >> 4; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color1 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - - ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; - } else if (nir->info.stage == MESA_SHADER_COMPUTE) { - if (nir->info.cs.user_data_components_amd) { - ctx->abi.user_data = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data); - ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, - nir->info.cs.user_data_components_amd); - } - } - - ctx->abi.inputs = &ctx->inputs[0]; - ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; - ctx->abi.clamp_shadow_reference = true; - ctx->abi.robust_buffer_access = true; - - ctx->num_samplers = util_last_bit(info->samplers_declared); - ctx->num_images = util_last_bit(info->images_declared); - - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { - assert(gl_shader_stage_is_compute(nir->info.stage)); - si_declare_compute_memory(ctx); - } - ac_nir_translate(&ctx->ac, &ctx->abi, nir); + struct si_screen *sscreen = (struct si_screen *)screen; + struct nir_shader *nir = (struct nir_shader *)nirptr; - return true; + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + si_nir_lower_ps_inputs(nir); + si_lower_nir(sscreen, nir); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,823 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "si_shader_internal.h" -#include "ac_llvm_util.h" - -void si_llvm_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - - if (ctx->shader->selector->force_correct_derivs_after_kill) { - /* Kill immediately while maintaining WQM. */ - ac_build_kill_if_false(&ctx->ac, - ac_build_wqm_vote(&ctx->ac, visible)); - - LLVMValueRef mask = LLVMBuildLoad(builder, ctx->postponed_kill, ""); - mask = LLVMBuildAnd(builder, mask, visible, ""); - LLVMBuildStore(builder, mask, ctx->postponed_kill); - return; - } - - ac_build_kill_if_false(&ctx->ac, visible); -} - -static void kil_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef visible; - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_KILL_IF) { - const struct tgsi_full_instruction *inst = emit_data->inst; - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned i; - LLVMValueRef conds[TGSI_NUM_CHANNELS]; - - for (i = 0; i < TGSI_NUM_CHANNELS; i++) { - LLVMValueRef value = lp_build_emit_fetch(bld_base, inst, 0, i); - /* UGE because NaN shouldn't get killed */ - conds[i] = LLVMBuildFCmp(builder, LLVMRealUGE, value, - ctx->ac.f32_0, ""); - } - - /* And the conditions together */ - for (i = TGSI_NUM_CHANNELS - 1; i > 0; i--) { - conds[i - 1] = LLVMBuildAnd(builder, conds[i], conds[i - 1], ""); - } - visible = conds[0]; - } else { - assert(emit_data->inst->Instruction.Opcode == TGSI_OPCODE_KILL); - visible = ctx->i1false; - } - - si_llvm_emit_kill(&ctx->abi, visible); -} - -static void emit_icmp(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - unsigned pred; - struct si_shader_context *ctx = si_shader_context(bld_base); - - switch (emit_data->inst->Instruction.Opcode) { - case TGSI_OPCODE_USEQ: - case TGSI_OPCODE_U64SEQ: pred = LLVMIntEQ; break; - case TGSI_OPCODE_USNE: - case TGSI_OPCODE_U64SNE: pred = LLVMIntNE; break; - case TGSI_OPCODE_USGE: - case TGSI_OPCODE_U64SGE: pred = LLVMIntUGE; break; - case TGSI_OPCODE_USLT: - case TGSI_OPCODE_U64SLT: pred = LLVMIntULT; break; - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_I64SGE: pred = LLVMIntSGE; break; - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_I64SLT: pred = LLVMIntSLT; break; - default: - assert(!"unknown instruction"); - pred = 0; - break; - } - - LLVMValueRef v = LLVMBuildICmp(ctx->ac.builder, pred, - emit_data->args[0], emit_data->args[1],""); - - v = LLVMBuildSExtOrBitCast(ctx->ac.builder, v, ctx->i32, ""); - - emit_data->output[emit_data->chan] = v; -} - -static void emit_ucmp(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef arg0 = ac_to_integer(&ctx->ac, emit_data->args[0]); - - LLVMValueRef v = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, arg0, - ctx->i32_0, ""); - - emit_data->output[emit_data->chan] = - LLVMBuildSelect(ctx->ac.builder, v, emit_data->args[1], emit_data->args[2], ""); -} - -static void emit_cmp(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef cond, *args = emit_data->args; - - cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, args[0], - ctx->ac.f32_0, ""); - - emit_data->output[emit_data->chan] = - LLVMBuildSelect(ctx->ac.builder, cond, args[1], args[2], ""); -} - -static void emit_set_cond(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMRealPredicate pred; - LLVMValueRef cond; - - /* Use ordered for everything but NE (which is usual for - * float comparisons) - */ - switch (emit_data->inst->Instruction.Opcode) { - case TGSI_OPCODE_SGE: pred = LLVMRealOGE; break; - case TGSI_OPCODE_SEQ: pred = LLVMRealOEQ; break; - case TGSI_OPCODE_SLE: pred = LLVMRealOLE; break; - case TGSI_OPCODE_SLT: pred = LLVMRealOLT; break; - case TGSI_OPCODE_SNE: pred = LLVMRealUNE; break; - case TGSI_OPCODE_SGT: pred = LLVMRealOGT; break; - default: assert(!"unknown instruction"); pred = 0; break; - } - - cond = LLVMBuildFCmp(ctx->ac.builder, - pred, emit_data->args[0], emit_data->args[1], ""); - - emit_data->output[emit_data->chan] = LLVMBuildSelect(ctx->ac.builder, - cond, ctx->ac.f32_1, ctx->ac.f32_0, ""); -} - -static void emit_fcmp(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMRealPredicate pred; - - /* Use ordered for everything but NE (which is usual for - * float comparisons) - */ - switch (emit_data->inst->Instruction.Opcode) { - case TGSI_OPCODE_FSEQ: pred = LLVMRealOEQ; break; - case TGSI_OPCODE_FSGE: pred = LLVMRealOGE; break; - case TGSI_OPCODE_FSLT: pred = LLVMRealOLT; break; - case TGSI_OPCODE_FSNE: pred = LLVMRealUNE; break; - default: assert(!"unknown instruction"); pred = 0; break; - } - - LLVMValueRef v = LLVMBuildFCmp(ctx->ac.builder, pred, - emit_data->args[0], emit_data->args[1],""); - - v = LLVMBuildSExtOrBitCast(ctx->ac.builder, v, ctx->i32, ""); - - emit_data->output[emit_data->chan] = v; -} - -static void emit_dcmp(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMRealPredicate pred; - - /* Use ordered for everything but NE (which is usual for - * float comparisons) - */ - switch (emit_data->inst->Instruction.Opcode) { - case TGSI_OPCODE_DSEQ: pred = LLVMRealOEQ; break; - case TGSI_OPCODE_DSGE: pred = LLVMRealOGE; break; - case TGSI_OPCODE_DSLT: pred = LLVMRealOLT; break; - case TGSI_OPCODE_DSNE: pred = LLVMRealUNE; break; - default: assert(!"unknown instruction"); pred = 0; break; - } - - LLVMValueRef v = LLVMBuildFCmp(ctx->ac.builder, pred, - emit_data->args[0], emit_data->args[1],""); - - v = LLVMBuildSExtOrBitCast(ctx->ac.builder, v, ctx->i32, ""); - - emit_data->output[emit_data->chan] = v; -} - -static void emit_not(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef v = ac_to_integer(&ctx->ac, emit_data->args[0]); - emit_data->output[emit_data->chan] = LLVMBuildNot(ctx->ac.builder, v, ""); -} - -static void emit_arl(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef floor_index = - ac_build_intrinsic(&ctx->ac, "llvm.floor.f32", ctx->f32, - &emit_data->args[0], 1, AC_FUNC_ATTR_READNONE); - emit_data->output[emit_data->chan] = LLVMBuildFPToSI(ctx->ac.builder, - floor_index, ctx->i32, ""); -} - -static void emit_and(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildAnd(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_or(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildOr(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_uadd(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildAdd(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_udiv(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildUDiv(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_idiv(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildSDiv(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_mod(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildSRem(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_umod(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildURem(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_shl(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildShl(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_ushr(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildLShr(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} -static void emit_ishr(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildAShr(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_xor(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildXor(ctx->ac.builder, - emit_data->args[0], emit_data->args[1], ""); -} - -static void emit_ssg(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - LLVMValueRef val; - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_I64SSG) { - val = ac_build_isign(&ctx->ac, emit_data->args[0], 64); - } else if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ISSG) { - val = ac_build_isign(&ctx->ac, emit_data->args[0], 32); - } else if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_DSSG) { - val = ac_build_fsign(&ctx->ac, emit_data->args[0], 64); - } else { - val = ac_build_fsign(&ctx->ac, emit_data->args[0], 32); - } - - emit_data->output[emit_data->chan] = val; -} - -static void emit_ineg(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildNeg(ctx->ac.builder, - emit_data->args[0], ""); -} - -static void emit_dneg(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildFNeg(ctx->ac.builder, - emit_data->args[0], ""); -} - -static void emit_frac(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - unsigned bitsize; - - if (emit_data->info->opcode == TGSI_OPCODE_FRC) - bitsize = 32; - else if (emit_data->info->opcode == TGSI_OPCODE_DFRAC) - bitsize = 64; - else { - assert(0); - return; - } - - emit_data->output[emit_data->chan] = - ac_build_fract(&ctx->ac, emit_data->args[0], bitsize); -} - -static void emit_f2i(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildFPToSI(ctx->ac.builder, - emit_data->args[0], ctx->i32, ""); -} - -static void emit_f2u(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildFPToUI(ctx->ac.builder, - emit_data->args[0], ctx->i32, ""); -} - -static void emit_i2f(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildSIToFP(ctx->ac.builder, - emit_data->args[0], ctx->f32, ""); -} - -static void emit_u2f(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = LLVMBuildUIToFP(ctx->ac.builder, - emit_data->args[0], ctx->f32, ""); -} - -static void -build_tgsi_intrinsic_nomem(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = - ac_build_intrinsic(&ctx->ac, action->intr_name, - emit_data->dst_type, emit_data->args, - emit_data->arg_count, AC_FUNC_ATTR_READNONE); -} - -static void emit_bfi(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef bfi_args[3]; - LLVMValueRef bfi_sm5; - LLVMValueRef cond; - - // Calculate the bitmask: (((1 << src3) - 1) << src2 - bfi_args[0] = LLVMBuildShl(builder, - LLVMBuildSub(builder, - LLVMBuildShl(builder, - ctx->i32_1, - emit_data->args[3], ""), - ctx->i32_1, ""), - emit_data->args[2], ""); - - bfi_args[1] = LLVMBuildShl(builder, emit_data->args[1], - emit_data->args[2], ""); - - bfi_args[2] = emit_data->args[0]; - - /* Calculate: - * (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2) - * Use the right-hand side, which the LLVM backend can convert to V_BFI. - */ - bfi_sm5 = - LLVMBuildXor(builder, bfi_args[2], - LLVMBuildAnd(builder, bfi_args[0], - LLVMBuildXor(builder, bfi_args[1], bfi_args[2], - ""), ""), ""); - - /* Since shifts of >= 32 bits are undefined in LLVM IR, the backend - * uses the convenient V_BFI lowering for the above, which follows SM5 - * and disagrees with GLSL semantics when bits (src3) is 32. - */ - cond = LLVMBuildICmp(builder, LLVMIntUGE, emit_data->args[3], - LLVMConstInt(ctx->i32, 32, 0), ""); - emit_data->output[emit_data->chan] = - LLVMBuildSelect(builder, cond, emit_data->args[1], bfi_sm5, ""); -} - -static void emit_bfe(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - /* FIXME: LLVM 7 returns incorrect result when count is 0. - * https://bugs.freedesktop.org/show_bug.cgi?id=107276 - */ - LLVMValueRef zero = ctx->i32_0; - LLVMValueRef bfe_sm5 = - ac_build_bfe(&ctx->ac, emit_data->args[0], - emit_data->args[1], emit_data->args[2], - emit_data->info->opcode == TGSI_OPCODE_IBFE); - - /* Correct for GLSL semantics. */ - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntUGE, emit_data->args[2], - LLVMConstInt(ctx->i32, 32, 0), ""); - LLVMValueRef cond2 = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, emit_data->args[2], - zero, ""); - bfe_sm5 = LLVMBuildSelect(ctx->ac.builder, cond, emit_data->args[0], bfe_sm5, ""); - emit_data->output[emit_data->chan] = - LLVMBuildSelect(ctx->ac.builder, cond2, zero, bfe_sm5, ""); -} - -/* this is ffs in C */ -static void emit_lsb(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - emit_data->output[emit_data->chan] = ac_find_lsb(&ctx->ac, emit_data->dst_type, emit_data->args[0]); -} - -/* Find the last bit set. */ -static void emit_umsb(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - emit_data->output[emit_data->chan] = - ac_build_umsb(&ctx->ac, emit_data->args[0], emit_data->dst_type); -} - -/* Find the last bit opposite of the sign bit. */ -static void emit_imsb(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - emit_data->output[emit_data->chan] = - ac_build_imsb(&ctx->ac, emit_data->args[0], - emit_data->dst_type); -} - -static void emit_iabs(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - emit_data->output[emit_data->chan] = - ac_build_imax(&ctx->ac, emit_data->args[0], - LLVMBuildNeg(ctx->ac.builder, emit_data->args[0], "")); -} - -static void emit_minmax_int(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMIntPredicate op; - - switch (emit_data->info->opcode) { - default: - assert(0); - case TGSI_OPCODE_IMAX: - case TGSI_OPCODE_I64MAX: - op = LLVMIntSGT; - break; - case TGSI_OPCODE_IMIN: - case TGSI_OPCODE_I64MIN: - op = LLVMIntSLT; - break; - case TGSI_OPCODE_UMAX: - case TGSI_OPCODE_U64MAX: - op = LLVMIntUGT; - break; - case TGSI_OPCODE_UMIN: - case TGSI_OPCODE_U64MIN: - op = LLVMIntULT; - break; - } - - emit_data->output[emit_data->chan] = - LLVMBuildSelect(ctx->ac.builder, - LLVMBuildICmp(ctx->ac.builder, op, emit_data->args[0], - emit_data->args[1], ""), - emit_data->args[0], - emit_data->args[1], ""); -} - -static void emit_pk2h(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef v[] = { - lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X), - lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_Y), - }; - - - /* From the GLSL 4.50 spec: - * "The rounding mode cannot be set and is undefined." - * - * v_cvt_pkrtz_f16 rounds to zero, but it's fastest. - */ - emit_data->output[emit_data->chan] = - LLVMBuildBitCast(ctx->ac.builder, ac_build_cvt_pkrtz_f16(&ctx->ac, v), - ctx->i32, ""); -} - -static void emit_up2h(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMTypeRef i16; - LLVMValueRef const16, input, val; - unsigned i; - - i16 = LLVMInt16TypeInContext(ctx->ac.context); - const16 = LLVMConstInt(ctx->i32, 16, 0); - input = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); - - for (i = 0; i < 2; i++) { - val = i == 1 ? LLVMBuildLShr(ctx->ac.builder, input, const16, "") : input; - val = LLVMBuildTrunc(ctx->ac.builder, val, i16, ""); - val = ac_to_float(&ctx->ac, val); - emit_data->output[i] = LLVMBuildFPExt(ctx->ac.builder, val, ctx->f32, ""); - } -} - -static void emit_fdiv(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - emit_data->output[emit_data->chan] = - ac_build_fdiv(&ctx->ac, emit_data->args[0], emit_data->args[1]); -} - -/* 1/sqrt is translated to rsq for f32 if fp32 denormals are not enabled in - * the target machine. f64 needs global unsafe math flags to get rsq. */ -static void emit_rsq(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - LLVMValueRef sqrt = - ac_build_intrinsic(&ctx->ac, "llvm.sqrt.f32", ctx->f32, - &emit_data->args[0], 1, AC_FUNC_ATTR_READNONE); - - emit_data->output[emit_data->chan] = - ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, sqrt); -} - -static void dfracexp_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef in = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); - - emit_data->output[emit_data->chan] = - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.frexp.mant.f64", - ctx->ac.f64, &in, 1, 0); - emit_data->output1[emit_data->chan] = - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.frexp.exp.i32.f64", - ctx->ac.i32, &in, 1, 0); -} - -void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base) -{ - lp_set_default_actions(bld_base); - - bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and; - bld_base->op_actions[TGSI_OPCODE_ARL].emit = emit_arl; - bld_base->op_actions[TGSI_OPCODE_BFI].emit = emit_bfi; - bld_base->op_actions[TGSI_OPCODE_BREV].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_BREV].intr_name = "llvm.bitreverse.i32"; - bld_base->op_actions[TGSI_OPCODE_CEIL].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32"; - bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32"; - bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "llvm.fabs.f64"; - bld_base->op_actions[TGSI_OPCODE_DCEIL].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DCEIL].intr_name = "llvm.ceil.f64"; - bld_base->op_actions[TGSI_OPCODE_DFLR].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DFLR].intr_name = "llvm.floor.f64"; - bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64"; - bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac; - bld_base->op_actions[TGSI_OPCODE_DIV].emit = emit_fdiv; - bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg; - bld_base->op_actions[TGSI_OPCODE_DROUND].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DROUND].intr_name = "llvm.rint.f64"; - bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp; - bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp; - bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp; - bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp; - bld_base->op_actions[TGSI_OPCODE_DSSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.amdgcn.rsq.f64"; - bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64"; - bld_base->op_actions[TGSI_OPCODE_DTRUNC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DTRUNC].intr_name = "llvm.trunc.f64"; - bld_base->op_actions[TGSI_OPCODE_DFRACEXP].emit = dfracexp_emit; - bld_base->op_actions[TGSI_OPCODE_DLDEXP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_DLDEXP].intr_name = "llvm.amdgcn.ldexp.f64"; - bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32"; - bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32"; - bld_base->op_actions[TGSI_OPCODE_FMA].emit = - bld_base->op_actions[TGSI_OPCODE_MAD].emit; - bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac; - bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i; - bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u; - bld_base->op_actions[TGSI_OPCODE_FSEQ].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_FSGE].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_FSLT].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_FSNE].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_IABS].emit = emit_iabs; - bld_base->op_actions[TGSI_OPCODE_IBFE].emit = emit_bfe; - bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv; - bld_base->op_actions[TGSI_OPCODE_IMAX].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_IMIN].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_IMSB].emit = emit_imsb; - bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg; - bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr; - bld_base->op_actions[TGSI_OPCODE_ISGE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_ISLT].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_ISSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_I2F].emit = emit_i2f; - bld_base->op_actions[TGSI_OPCODE_KILL_IF].emit = kil_emit; - bld_base->op_actions[TGSI_OPCODE_KILL].emit = kil_emit; - bld_base->op_actions[TGSI_OPCODE_LDEXP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_LDEXP].intr_name = "llvm.amdgcn.ldexp.f32"; - bld_base->op_actions[TGSI_OPCODE_LSB].emit = emit_lsb; - bld_base->op_actions[TGSI_OPCODE_LG2].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.log2.f32"; - bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32"; - bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; - bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod; - bld_base->op_actions[TGSI_OPCODE_UMSB].emit = emit_umsb; - bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not; - bld_base->op_actions[TGSI_OPCODE_OR].emit = emit_or; - bld_base->op_actions[TGSI_OPCODE_PK2H].emit = emit_pk2h; - bld_base->op_actions[TGSI_OPCODE_POPC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_POPC].intr_name = "llvm.ctpop.i32"; - bld_base->op_actions[TGSI_OPCODE_POW].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32"; - bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32"; - bld_base->op_actions[TGSI_OPCODE_RSQ].emit = emit_rsq; - bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl; - bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_set_cond; - bld_base->op_actions[TGSI_OPCODE_SIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.sin.f32"; - bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32"; - bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.trunc.f32"; - bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd; - bld_base->op_actions[TGSI_OPCODE_UBFE].emit = emit_bfe; - bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv; - bld_base->op_actions[TGSI_OPCODE_UMAX].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_UMIN].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_UMOD].emit = emit_umod; - bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_USHR].emit = emit_ushr; - bld_base->op_actions[TGSI_OPCODE_USLT].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_USNE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f; - bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor; - bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp; - bld_base->op_actions[TGSI_OPCODE_UP2H].emit = emit_up2h; - - bld_base->op_actions[TGSI_OPCODE_I64MAX].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_I64MIN].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_U64MAX].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_U64MIN].emit = emit_minmax_int; - bld_base->op_actions[TGSI_OPCODE_I64ABS].emit = emit_iabs; - bld_base->op_actions[TGSI_OPCODE_I64SSG].emit = emit_ssg; - bld_base->op_actions[TGSI_OPCODE_I64NEG].emit = emit_ineg; - - bld_base->op_actions[TGSI_OPCODE_U64SEQ].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_U64SNE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_U64SGE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_U64SLT].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_I64SGE].emit = emit_icmp; - bld_base->op_actions[TGSI_OPCODE_I64SLT].emit = emit_icmp; - - bld_base->op_actions[TGSI_OPCODE_U64ADD].emit = emit_uadd; - bld_base->op_actions[TGSI_OPCODE_U64SHL].emit = emit_shl; - bld_base->op_actions[TGSI_OPCODE_U64SHR].emit = emit_ushr; - bld_base->op_actions[TGSI_OPCODE_I64SHR].emit = emit_ishr; - - bld_base->op_actions[TGSI_OPCODE_U64MOD].emit = emit_umod; - bld_base->op_actions[TGSI_OPCODE_I64MOD].emit = emit_mod; - bld_base->op_actions[TGSI_OPCODE_U64DIV].emit = emit_udiv; - bld_base->op_actions[TGSI_OPCODE_I64DIV].emit = emit_idiv; -} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1835 +0,0 @@ -/* - * Copyright 2017 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "sid.h" -#include "tgsi/tgsi_build.h" -#include "tgsi/tgsi_util.h" -#include "ac_llvm_util.h" - -static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data, - LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, - LLVMValueRef *fmask_ptr); - -/** - * Given a v8i32 resource descriptor for a buffer, extract the size of the - * buffer in number of elements and return it as an i32. - */ -static LLVMValueRef get_buffer_size( - struct lp_build_tgsi_context *bld_base, - LLVMValueRef descriptor) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef size = - LLVMBuildExtractElement(builder, descriptor, - LLVMConstInt(ctx->i32, 2, 0), ""); - - if (ctx->screen->info.chip_class == GFX8) { - /* On GFX8, the descriptor contains the size in bytes, - * but TXQ must return the size in elements. - * The stride is always non-zero for resources using TXQ. - */ - LLVMValueRef stride = - LLVMBuildExtractElement(builder, descriptor, - ctx->i32_1, ""); - stride = LLVMBuildLShr(builder, stride, - LLVMConstInt(ctx->i32, 16, 0), ""); - stride = LLVMBuildAnd(builder, stride, - LLVMConstInt(ctx->i32, 0x3FFF, 0), ""); - - size = LLVMBuildUDiv(builder, size, stride, ""); - } - - return size; -} - -static LLVMValueRef -shader_buffer_fetch_rsrc(struct si_shader_context *ctx, - const struct tgsi_full_src_register *reg, - bool ubo) -{ - LLVMValueRef index; - - if (!reg->Register.Indirect) { - index = LLVMConstInt(ctx->i32, reg->Register.Index, false); - } else { - index = si_get_indirect_index(ctx, ®->Indirect, - 1, reg->Register.Index); - } - - if (ubo) - return ctx->abi.load_ubo(&ctx->abi, index); - else - return ctx->abi.load_ssbo(&ctx->abi, index, false); -} - -static enum ac_image_dim -ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target) -{ - switch (target) { - case TGSI_TEXTURE_1D: - case TGSI_TEXTURE_SHADOW1D: - if (screen->info.chip_class == GFX9) - return ac_image_2d; - return ac_image_1d; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_SHADOWRECT: - return ac_image_2d; - case TGSI_TEXTURE_3D: - return ac_image_3d; - case TGSI_TEXTURE_CUBE: - case TGSI_TEXTURE_SHADOWCUBE: - case TGSI_TEXTURE_CUBE_ARRAY: - case TGSI_TEXTURE_SHADOWCUBE_ARRAY: - return ac_image_cube; - case TGSI_TEXTURE_1D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - if (screen->info.chip_class == GFX9) - return ac_image_2darray; - return ac_image_1darray; - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - return ac_image_2darray; - case TGSI_TEXTURE_2D_MSAA: - return ac_image_2dmsaa; - case TGSI_TEXTURE_2D_ARRAY_MSAA: - return ac_image_2darraymsaa; - default: - unreachable("unhandled texture type"); - } -} - -static enum ac_image_dim -ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target) -{ - enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target); - - /* Match the resource type set in the descriptor. */ - if (dim == ac_image_cube || - (screen->info.chip_class <= GFX8 && dim == ac_image_3d)) - dim = ac_image_2darray; - else if (target == TGSI_TEXTURE_2D && screen->info.chip_class == GFX9) { - /* When a single layer of a 3D texture is bound, the shader - * will refer to a 2D target, but the descriptor has a 3D type. - * Since the HW ignores BASE_ARRAY in this case, we need to - * send 3 coordinates. This doesn't hurt when the underlying - * texture is non-3D. - */ - dim = ac_image_3d; - } - - return dim; -} - -/** - * Given a 256-bit resource descriptor, force the DCC enable bit to off. - * - * At least on Tonga, executing image stores on images with DCC enabled and - * non-trivial can eventually lead to lockups. This can occur when an - * application binds an image as read-only but then uses a shader that writes - * to it. The OpenGL spec allows almost arbitrarily bad behavior (including - * program termination) in this case, but it doesn't cost much to be a bit - * nicer: disabling DCC in the shader still leads to undefined results but - * avoids the lockup. - */ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, - LLVMValueRef rsrc) -{ - if (ctx->screen->info.chip_class <= GFX7) { - return rsrc; - } else { - LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); - } -} - -LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool uses_store, bool bindless) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rsrc; - - if (desc_type == AC_DESC_BUFFER) { - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0), - ctx->i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->v4i32), ""); - } else { - assert(desc_type == AC_DESC_IMAGE); - } - - if (bindless) - rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); - else - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - - if (desc_type == AC_DESC_IMAGE && uses_store) - rsrc = force_dcc_off(ctx, rsrc); - return rsrc; -} - -/** - * Load the resource descriptor for \p image. - */ -static void -image_fetch_rsrc( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *image, - bool is_store, unsigned target, - LLVMValueRef *rsrc) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_samplers_and_images); - LLVMValueRef index; - - if (!image->Register.Indirect) { - index = LLVMConstInt(ctx->i32, - si_get_image_slot(image->Register.Index), 0); - } else { - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_get_bounded_indirect_index(ctx, &image->Indirect, - image->Register.Index, - ctx->num_images); - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), - index, ""); - } - - bool bindless = false; - - if (image->Register.File != TGSI_FILE_IMAGE) { - /* Bindless descriptors are accessible from a different pair of - * user SGPR indices. - */ - rsrc_ptr = LLVMGetParam(ctx->main_fn, - ctx->param_bindless_samplers_and_images); - index = lp_build_emit_fetch_src(bld_base, image, - TGSI_TYPE_UNSIGNED, 0); - - /* For simplicity, bindless image descriptors use fixed - * 16-dword slots for now. - */ - index = LLVMBuildMul(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, 2, 0), ""); - bindless = true; - } - - *rsrc = si_load_image_desc(ctx, rsrc_ptr, index, - target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE, - is_store, bindless); -} - -static void image_fetch_coords( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_instruction *inst, - unsigned src, LLVMValueRef desc, - LLVMValueRef *coords) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned target = inst->Memory.Texture; - unsigned num_coords = tgsi_util_get_texture_coord_dim(target); - LLVMValueRef tmp; - int chan; - - if (target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - /* Need the sample index as well. */ - num_coords++; - } - - for (chan = 0; chan < num_coords; ++chan) { - tmp = lp_build_emit_fetch(bld_base, inst, src, chan); - tmp = ac_to_integer(&ctx->ac, tmp); - coords[chan] = tmp; - } - - if (ctx->screen->info.chip_class == GFX9) { - /* 1D textures are allocated and used as 2D on GFX9. */ - if (target == TGSI_TEXTURE_1D) { - coords[1] = ctx->i32_0; - } else if (target == TGSI_TEXTURE_1D_ARRAY) { - coords[2] = coords[1]; - coords[1] = ctx->i32_0; - } else if (target == TGSI_TEXTURE_2D) { - /* The hw can't bind a slice of a 3D image as a 2D - * image, because it ignores BASE_ARRAY if the target - * is 3D. The workaround is to read BASE_ARRAY and set - * it as the 3rd address operand for all 2D images. - */ - LLVMValueRef first_layer, const5, mask; - - const5 = LLVMConstInt(ctx->i32, 5, 0); - mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0); - first_layer = LLVMBuildExtractElement(builder, desc, const5, ""); - first_layer = LLVMBuildAnd(builder, first_layer, mask, ""); - - coords[2] = first_layer; - } - } -} - -static unsigned get_cache_policy(struct si_shader_context *ctx, - const struct tgsi_full_instruction *inst, - bool atomic, bool may_store_unaligned, - bool writeonly_memory) -{ - unsigned cache_policy = 0; - - if (!atomic && - /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. - * All store opcodes not aligned to a dword are affected. - * The only way to get unaligned stores in radeonsi is through - * shader images. */ - ((may_store_unaligned && ctx->screen->info.chip_class == GFX6) || - /* If this is write-only, don't keep data in L1 to prevent - * evicting L1 cache lines that may be needed by other - * instructions. */ - writeonly_memory || - inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) { - cache_policy |= ac_glc; - } - - if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY) - cache_policy |= ac_slc; - - return cache_policy; -} - -static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx, - const struct tgsi_full_instruction *inst, - LLVMTypeRef type, int arg) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset, ptr; - int addr_space; - - offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0); - offset = ac_to_integer(&ctx->ac, offset); - - ptr = ctx->ac.lds; - ptr = LLVMBuildGEP(builder, ptr, &offset, 1, ""); - addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), ""); - - return ptr; -} - -static void load_emit_memory( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - unsigned writemask = inst->Dst[0].Register.WriteMask; - LLVMValueRef channels[4], ptr, derived_ptr, index; - int chan; - - ptr = get_memory_ptr(ctx, inst, ctx->f32, 1); - - for (chan = 0; chan < 4; ++chan) { - if (!(writemask & (1 << chan))) { - channels[chan] = LLVMGetUndef(ctx->f32); - continue; - } - - index = LLVMConstInt(ctx->i32, chan, 0); - derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); - channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); - } - emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4); -} - -/** - * Return true if the memory accessed by a LOAD or STORE instruction is - * read-only or write-only, respectively. - * - * \param shader_buffers_reverse_access_mask - * For LOAD, set this to (store | atomic) slot usage in the shader. - * For STORE, set this to (load | atomic) slot usage in the shader. - * \param images_reverse_access_mask Same as above, but for images. - * \param bindless_buffer_reverse_access_mask Same as above, but for bindless image buffers. - * \param bindless_image_reverse_access_mask Same as above, but for bindless images. - */ -static bool is_oneway_access_only(const struct tgsi_full_instruction *inst, - const struct tgsi_shader_info *info, - unsigned shader_buffers_reverse_access_mask, - unsigned images_reverse_access_mask, - bool bindless_buffer_reverse_access_mask, - bool bindless_image_reverse_access_mask) -{ - enum tgsi_file_type resource_file; - unsigned resource_index; - bool resource_indirect; - - if (inst->Instruction.Opcode == TGSI_OPCODE_STORE) { - resource_file = inst->Dst[0].Register.File; - resource_index = inst->Dst[0].Register.Index; - resource_indirect = inst->Dst[0].Register.Indirect; - } else { - resource_file = inst->Src[0].Register.File; - resource_index = inst->Src[0].Register.Index; - resource_indirect = inst->Src[0].Register.Indirect; - } - - assert(resource_file == TGSI_FILE_BUFFER || - resource_file == TGSI_FILE_IMAGE || - /* bindless image */ - resource_file == TGSI_FILE_INPUT || - resource_file == TGSI_FILE_OUTPUT || - resource_file == TGSI_FILE_CONSTANT || - resource_file == TGSI_FILE_TEMPORARY || - resource_file == TGSI_FILE_IMMEDIATE); - - assert(resource_file != TGSI_FILE_BUFFER || - inst->Memory.Texture == TGSI_TEXTURE_BUFFER); - - bool bindless = resource_file != TGSI_FILE_BUFFER && - resource_file != TGSI_FILE_IMAGE; - - /* RESTRICT means NOALIAS. - * If there are no writes, we can assume the accessed memory is read-only. - * If there are no reads, we can assume the accessed memory is write-only. - */ - if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT && !bindless) { - unsigned reverse_access_mask; - - if (resource_file == TGSI_FILE_BUFFER) { - reverse_access_mask = shader_buffers_reverse_access_mask; - } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - reverse_access_mask = info->images_buffers & - images_reverse_access_mask; - } else { - reverse_access_mask = ~info->images_buffers & - images_reverse_access_mask; - } - - if (resource_indirect) { - if (!reverse_access_mask) - return true; - } else { - if (!(reverse_access_mask & - (1u << resource_index))) - return true; - } - } - - /* If there are no buffer writes (for both shader buffers & image - * buffers), it implies that buffer memory is read-only. - * If there are no buffer reads (for both shader buffers & image - * buffers), it implies that buffer memory is write-only. - * - * Same for the case when there are no writes/reads for non-buffer - * images. - */ - if (resource_file == TGSI_FILE_BUFFER || - inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - if (!shader_buffers_reverse_access_mask && - !(info->images_buffers & images_reverse_access_mask) && - !bindless_buffer_reverse_access_mask) - return true; - } else { - if (!(~info->images_buffers & images_reverse_access_mask) && - !bindless_image_reverse_access_mask) - return true; - } - return false; -} - -static void load_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction * inst = emit_data->inst; - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - bool can_speculate = false; - LLVMValueRef vindex = ctx->i32_0; - LLVMValueRef voffset = ctx->i32_0; - struct ac_image_args args = {}; - - if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { - load_emit_memory(ctx, emit_data); - return; - } - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || - inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) { - bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF; - args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo); - voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0)); - } else { - unsigned target = inst->Memory.Texture; - - image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &args.resource); - image_fetch_coords(bld_base, inst, 1, args.resource, args.coords); - vindex = args.coords[0]; /* for buffers only */ - } - - if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) { - emit_data->output[emit_data->chan] = - ac_build_buffer_load(&ctx->ac, args.resource, - util_last_bit(inst->Dst[0].Register.WriteMask), - NULL, voffset, NULL, 0, 0, true, true); - return; - } - - if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE); - - can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) && - is_oneway_access_only(inst, info, - info->shader_buffers_store | - info->shader_buffers_atomic, - info->images_store | - info->images_atomic, - info->uses_bindless_buffer_store | - info->uses_bindless_buffer_atomic, - info->uses_bindless_image_store | - info->uses_bindless_image_atomic); - args.cache_policy = get_cache_policy(ctx, inst, false, false, false); - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - /* Don't use SMEM for shader buffer loads, because LLVM doesn't - * select SMEM for SI.load.const with a non-constant offset, and - * constant offsets practically don't exist with shader buffers. - * - * Also, SI.load.const doesn't use inst_offset when it's lowered - * to VMEM, so we just end up with more VALU instructions in the end - * and no benefit. - * - * TODO: Remove this line once LLVM can select SMEM with a non-constant - * offset, and can derive inst_offset when VMEM is selected. - * After that, si_memory_barrier should invalidate sL1 for shader - * buffers. - */ - emit_data->output[emit_data->chan] = - ac_build_buffer_load(&ctx->ac, args.resource, - util_last_bit(inst->Dst[0].Register.WriteMask), - NULL, voffset, NULL, 0, - args.cache_policy, can_speculate, false); - return; - } - - if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask); - LLVMValueRef result = - ac_build_buffer_load_format(&ctx->ac, - args.resource, - vindex, - ctx->i32_0, - num_channels, - args.cache_policy, - can_speculate); - emit_data->output[emit_data->chan] = - ac_build_expand_to_vec4(&ctx->ac, result, num_channels); - } else { - args.opcode = ac_image_load; - args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture); - args.attributes = ac_get_load_intr_attribs(can_speculate); - args.dmask = 0xf; - - emit_data->output[emit_data->chan] = - ac_build_image_opcode(&ctx->ac, &args); - } -} - -static void store_emit_buffer(struct si_shader_context *ctx, - LLVMValueRef resource, - unsigned writemask, - LLVMValueRef value, - LLVMValueRef voffset, - unsigned cache_policy, - bool writeonly_memory) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef base_data = value; - LLVMValueRef base_offset = voffset; - - while (writemask) { - int start, count; - LLVMValueRef data, voff; - - u_bit_scan_consecutive_range(&writemask, &start, &count); - - if (count == 3 && ac_has_vec3_support(ctx->ac.chip_class, false)) { - LLVMValueRef values[3] = { - LLVMBuildExtractElement(builder, base_data, - LLVMConstInt(ctx->i32, start, 0), ""), - LLVMBuildExtractElement(builder, base_data, - LLVMConstInt(ctx->i32, start + 1, 0), ""), - LLVMBuildExtractElement(builder, base_data, - LLVMConstInt(ctx->i32, start + 2, 0), ""), - }; - data = ac_build_gather_values(&ctx->ac, values, 3); - } else if (count >= 3) { - data = base_data; - } else if (count == 2) { - LLVMValueRef values[2] = { - LLVMBuildExtractElement(builder, base_data, - LLVMConstInt(ctx->i32, start, 0), ""), - LLVMBuildExtractElement(builder, base_data, - LLVMConstInt(ctx->i32, start + 1, 0), ""), - }; - - data = ac_build_gather_values(&ctx->ac, values, 2); - } else { - assert(count == 1); - data = LLVMBuildExtractElement( - builder, base_data, - LLVMConstInt(ctx->i32, start, 0), ""); - } - - voff = base_offset; - if (start != 0) { - voff = LLVMBuildAdd( - builder, voff, - LLVMConstInt(ctx->i32, start * 4, 0), ""); - } - - ac_build_buffer_store_dword(&ctx->ac, resource, data, count, - voff, ctx->i32_0, 0, cache_policy, - false); - } -} - -static void store_emit_memory( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - LLVMBuilderRef builder = ctx->ac.builder; - unsigned writemask = inst->Dst[0].Register.WriteMask; - LLVMValueRef ptr, derived_ptr, data, index; - int chan; - - ptr = get_memory_ptr(ctx, inst, ctx->f32, 0); - - for (chan = 0; chan < 4; ++chan) { - if (!(writemask & (1 << chan))) { - continue; - } - data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan); - index = LLVMConstInt(ctx->i32, chan, 0); - derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); - LLVMBuildStore(builder, data, derived_ptr); - } -} - -static void store_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction * inst = emit_data->inst; - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - struct tgsi_full_src_register resource_reg = - tgsi_full_src_register_from_dst(&inst->Dst[0]); - unsigned target = inst->Memory.Texture; - - if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) { - store_emit_memory(ctx, emit_data); - return; - } - - bool writeonly_memory = is_oneway_access_only(inst, info, - info->shader_buffers_load | - info->shader_buffers_atomic, - info->images_load | - info->images_atomic, - info->uses_bindless_buffer_load | - info->uses_bindless_buffer_atomic, - info->uses_bindless_image_load | - info->uses_bindless_image_atomic); - LLVMValueRef chans[4]; - LLVMValueRef vindex = ctx->i32_0; - LLVMValueRef voffset = ctx->i32_0; - struct ac_image_args args = {}; - - for (unsigned chan = 0; chan < 4; ++chan) - chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan); - - if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { - args.resource = shader_buffer_fetch_rsrc(ctx, &resource_reg, false); - voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 0, 0)); - } else { - image_fetch_rsrc(bld_base, &resource_reg, true, target, &args.resource); - image_fetch_coords(bld_base, inst, 0, args.resource, args.coords); - vindex = args.coords[0]; /* for buffers only */ - } - - if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE); - - bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER; - args.cache_policy = get_cache_policy(ctx, inst, - false, /* atomic */ - is_image, /* may_store_unaligned */ - writeonly_memory); - - if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { - store_emit_buffer(ctx, args.resource, inst->Dst[0].Register.WriteMask, - ac_build_gather_values(&ctx->ac, chans, 4), - voffset, args.cache_policy, writeonly_memory); - return; - } - - if (target == TGSI_TEXTURE_BUFFER) { - unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask); - - ac_build_buffer_store_format(&ctx->ac, args.resource, - ac_build_gather_values(&ctx->ac, chans, num_channels), - vindex, ctx->i32_0 /* voffset */, - num_channels, - args.cache_policy); - } else { - args.opcode = ac_image_store; - args.data[0] = ac_build_gather_values(&ctx->ac, chans, 4); - args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture); - args.attributes = AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY; - args.dmask = 0xf; - - emit_data->output[emit_data->chan] = - ac_build_image_opcode(&ctx->ac, &args); - } -} - -static void atomic_emit_memory(struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) { - LLVMBuilderRef builder = ctx->ac.builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef ptr, result, arg; - const char *sync_scope = HAVE_LLVM >= 0x0900 ? "workgroup-one-as" : "workgroup"; - - ptr = get_memory_ptr(ctx, inst, ctx->i32, 1); - - arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0); - arg = ac_to_integer(&ctx->ac, arg); - - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { - LLVMValueRef new_data; - new_data = lp_build_emit_fetch(&ctx->bld_base, - inst, 3, 0); - - new_data = ac_to_integer(&ctx->ac, new_data); - - result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, new_data, - sync_scope); - result = LLVMBuildExtractValue(builder, result, 0, ""); - } else { - LLVMAtomicRMWBinOp op; - - switch(inst->Instruction.Opcode) { - case TGSI_OPCODE_ATOMUADD: - op = LLVMAtomicRMWBinOpAdd; - break; - case TGSI_OPCODE_ATOMXCHG: - op = LLVMAtomicRMWBinOpXchg; - break; - case TGSI_OPCODE_ATOMAND: - op = LLVMAtomicRMWBinOpAnd; - break; - case TGSI_OPCODE_ATOMOR: - op = LLVMAtomicRMWBinOpOr; - break; - case TGSI_OPCODE_ATOMXOR: - op = LLVMAtomicRMWBinOpXor; - break; - case TGSI_OPCODE_ATOMUMIN: - op = LLVMAtomicRMWBinOpUMin; - break; - case TGSI_OPCODE_ATOMUMAX: - op = LLVMAtomicRMWBinOpUMax; - break; - case TGSI_OPCODE_ATOMIMIN: - op = LLVMAtomicRMWBinOpMin; - break; - case TGSI_OPCODE_ATOMIMAX: - op = LLVMAtomicRMWBinOpMax; - break; - default: - unreachable("unknown atomic opcode"); - } - - result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, sync_scope); - } - emit_data->output[emit_data->chan] = - LLVMBuildBitCast(builder, result, ctx->f32, ""); -} - -static void atomic_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction * inst = emit_data->inst; - struct ac_image_args args = {}; - unsigned num_data = 0; - LLVMValueRef vindex = ctx->i32_0; - LLVMValueRef voffset = ctx->i32_0; - - if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { - atomic_emit_memory(ctx, emit_data); - return; - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { - /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order - * of arguments, which is reversed relative to TGSI (and GLSL) - */ - args.data[num_data++] = - ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0)); - } - - args.data[num_data++] = - ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0)); - - args.cache_policy = get_cache_policy(ctx, inst, true, false, false); - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false); - voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0)); - } else { - image_fetch_rsrc(bld_base, &inst->Src[0], true, - inst->Memory.Texture, &args.resource); - image_fetch_coords(bld_base, inst, 1, args.resource, args.coords); - vindex = args.coords[0]; /* for buffers only */ - } - - if (HAVE_LLVM >= 0x0800 && - inst->Src[0].Register.File != TGSI_FILE_BUFFER && - inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - LLVMValueRef buf_args[7]; - unsigned num_args = 0; - - buf_args[num_args++] = args.data[0]; - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) - buf_args[num_args++] = args.data[1]; - - buf_args[num_args++] = args.resource; - buf_args[num_args++] = vindex; - buf_args[num_args++] = voffset; - buf_args[num_args++] = ctx->i32_0; /* soffset */ - buf_args[num_args++] = LLVMConstInt(ctx->i32, args.cache_policy & ac_slc, 0); - - char intrinsic_name[64]; - snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.struct.buffer.atomic.%s", action->intr_name); - emit_data->output[emit_data->chan] = - ac_to_float(&ctx->ac, - ac_build_intrinsic(&ctx->ac, intrinsic_name, - ctx->i32, buf_args, num_args, 0)); - return; - } - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || - (HAVE_LLVM < 0x0800 && - inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { - LLVMValueRef buf_args[7]; - unsigned num_args = 0; - - buf_args[num_args++] = args.data[0]; - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) - buf_args[num_args++] = args.data[1]; - - buf_args[num_args++] = args.resource; - buf_args[num_args++] = vindex; - buf_args[num_args++] = voffset; - buf_args[num_args++] = args.cache_policy & ac_slc ? ctx->i1true : ctx->i1false; - - char intrinsic_name[40]; - snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.buffer.atomic.%s", action->intr_name); - emit_data->output[emit_data->chan] = - ac_to_float(&ctx->ac, - ac_build_intrinsic(&ctx->ac, intrinsic_name, - ctx->i32, buf_args, num_args, 0)); - } else { - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { - args.opcode = ac_image_atomic_cmpswap; - } else { - args.opcode = ac_image_atomic; - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break; - case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break; - case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break; - case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break; - case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break; - case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break; - case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break; - case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break; - case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break; - case TGSI_OPCODE_ATOMINC_WRAP: - args.atomic = ac_atomic_inc_wrap; - break; - case TGSI_OPCODE_ATOMDEC_WRAP: - args.atomic = ac_atomic_dec_wrap; - break; - default: unreachable("unhandled image atomic"); - } - } - - args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture); - emit_data->output[emit_data->chan] = - ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args)); - } -} - -static LLVMValueRef fix_resinfo(struct si_shader_context *ctx, - unsigned target, LLVMValueRef out) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->info.chip_class == GFX9 && - (target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_SHADOW1D_ARRAY)) { - LLVMValueRef layers = - LLVMBuildExtractElement(builder, out, - LLVMConstInt(ctx->i32, 2, 0), ""); - out = LLVMBuildInsertElement(builder, out, layers, - ctx->i32_1, ""); - } - - /* Divide the number of layers by 6 to get the number of cubes. */ - if (target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0); - - LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, ""); - z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), ""); - - out = LLVMBuildInsertElement(builder, out, z, imm2, ""); - } - return out; -} - -static void resq_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - const struct tgsi_full_instruction *inst = emit_data->inst; - const struct tgsi_full_src_register *reg = - &inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0]; - - if (reg->Register.File == TGSI_FILE_BUFFER) { - LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false); - - emit_data->output[emit_data->chan] = - LLVMBuildExtractElement(builder, rsrc, - LLVMConstInt(ctx->i32, 2, 0), ""); - return; - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && - inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { - LLVMValueRef rsrc; - - tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL); - /* Read the size from the buffer descriptor directly. */ - emit_data->output[emit_data->chan] = - get_buffer_size(bld_base, rsrc); - return; - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ && - inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - LLVMValueRef rsrc; - - image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc); - emit_data->output[emit_data->chan] = - get_buffer_size(bld_base, rsrc); - return; - } - - unsigned target; - - if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { - target = inst->Texture.Texture; - } else { - if (inst->Memory.Texture == TGSI_TEXTURE_3D) - target = TGSI_TEXTURE_2D_ARRAY; - else - target = inst->Memory.Texture; - } - - struct ac_image_args args = {}; - args.opcode = ac_image_get_resinfo; - args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target); - args.dmask = 0xf; - args.attributes = AC_FUNC_ATTR_READNONE; - - if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { - tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL); - args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); - } else { - image_fetch_rsrc(bld_base, reg, false, target, &args.resource); - args.lod = ctx->i32_0; - } - - emit_data->output[emit_data->chan] = - fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args)); -} - -/** - * Load an image view, fmask view. or sampler state descriptor. - */ -LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type type) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - switch (type) { - case AC_DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); - break; - case AC_DESC_BUFFER: - /* The buffer is in [4:7]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0), - ctx->i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->v4i32), ""); - break; - case AC_DESC_FMASK: - /* The FMASK is at [8:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0), - ctx->i32_1); - break; - case AC_DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0), - LLVMConstInt(ctx->i32, 3, 0)); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->v4i32), ""); - break; - case AC_DESC_PLANE_0: - case AC_DESC_PLANE_1: - case AC_DESC_PLANE_2: - /* Only used for the multiplane image support for Vulkan. Should - * never be reached in radeonsi. - */ - unreachable("Plane descriptor requested in radeonsi."); - } - - return ac_build_load_to_sgpr(&ctx->ac, list, index); -} - -/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. - * - * GFX6-GFX7: - * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic - * filtering manually. The driver sets img7 to a mask clearing - * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: - * s_and_b32 samp0, samp0, img7 - * - * GFX8: - * The ANISO_OVERRIDE sampler field enables this fix in TA. - */ -static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx, - LLVMValueRef res, LLVMValueRef samp) -{ - LLVMValueRef img7, samp0; - - if (ctx->screen->info.chip_class >= GFX8) - return samp; - - img7 = LLVMBuildExtractElement(ctx->ac.builder, res, - LLVMConstInt(ctx->i32, 7, 0), ""); - samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp, - ctx->i32_0, ""); - samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, ""); - return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0, - ctx->i32_0, ""); -} - -static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data, - LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, - LLVMValueRef *fmask_ptr) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images); - const struct tgsi_full_instruction *inst = emit_data->inst; - const struct tgsi_full_src_register *reg; - unsigned target = inst->Texture.Texture; - unsigned sampler_src; - LLVMValueRef index; - - sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; - reg = &emit_data->inst->Src[sampler_src]; - - if (reg->Register.Indirect) { - index = si_get_bounded_indirect_index(ctx, - ®->Indirect, - reg->Register.Index, - ctx->num_samplers); - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), ""); - } else { - index = LLVMConstInt(ctx->i32, - si_get_sampler_slot(reg->Register.Index), 0); - } - - if (reg->Register.File != TGSI_FILE_SAMPLER) { - /* Bindless descriptors are accessible from a different pair of - * user SGPR indices. - */ - list = LLVMGetParam(ctx->main_fn, - ctx->param_bindless_samplers_and_images); - index = lp_build_emit_fetch_src(bld_base, reg, - TGSI_TYPE_UNSIGNED, 0); - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. - */ - index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); - list = ac_build_pointer_add(&ctx->ac, list, index); - index = ctx->i32_0; - } - - if (target == TGSI_TEXTURE_BUFFER) - *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER); - else - *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE); - - if (samp_ptr) - *samp_ptr = NULL; - if (fmask_ptr) - *fmask_ptr = NULL; - - if (target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - if (fmask_ptr) - *fmask_ptr = si_load_sampler_desc(ctx, list, index, - AC_DESC_FMASK); - } else if (target != TGSI_TEXTURE_BUFFER) { - if (samp_ptr) { - *samp_ptr = si_load_sampler_desc(ctx, list, index, - AC_DESC_SAMPLER); - *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); - } - } -} - -/* Gather4 should follow the same rules as bilinear filtering, but the hardware - * incorrectly forces nearest filtering if the texture format is integer. - * The only effect it has on Gather4, which always returns 4 texels for - * bilinear filtering, is that the final coordinates are off by 0.5 of - * the texel size. - * - * The workaround is to subtract 0.5 from the unnormalized coordinates, - * or (0.5 / size) from the normalized coordinates. - * - * However, cube textures with 8_8_8_8 data formats require a different - * workaround of overriding the num format to USCALED/SSCALED. This would lose - * precision in 32-bit data formats, so it needs to be applied dynamically at - * runtime. In this case, return an i1 value that indicates whether the - * descriptor was overridden (and hence a fixup of the sampler result is needed). - */ -static LLVMValueRef -si_lower_gather4_integer(struct si_shader_context *ctx, - struct ac_image_args *args, - unsigned target, - enum tgsi_return_type return_type) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef wa_8888 = NULL; - LLVMValueRef half_texel[2]; - - assert(return_type == TGSI_RETURN_TYPE_SINT || - return_type == TGSI_RETURN_TYPE_UINT); - - if (target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_CUBE_ARRAY) { - LLVMValueRef formats; - LLVMValueRef data_format; - LLVMValueRef wa_formats; - - formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, ""); - - data_format = LLVMBuildLShr(builder, formats, - LLVMConstInt(ctx->i32, 20, false), ""); - data_format = LLVMBuildAnd(builder, data_format, - LLVMConstInt(ctx->i32, (1u << 6) - 1, false), ""); - wa_8888 = LLVMBuildICmp( - builder, LLVMIntEQ, data_format, - LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), - ""); - - uint32_t wa_num_format = - return_type == TGSI_RETURN_TYPE_UINT ? - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) : - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED); - wa_formats = LLVMBuildAnd(builder, formats, - LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), - ""); - wa_formats = LLVMBuildOr(builder, wa_formats, - LLVMConstInt(ctx->i32, wa_num_format, false), ""); - - formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, ""); - args->resource = LLVMBuildInsertElement( - builder, args->resource, formats, ctx->i32_1, ""); - } - - if (target == TGSI_TEXTURE_RECT || - target == TGSI_TEXTURE_SHADOWRECT) { - assert(!wa_8888); - half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); - } else { - struct ac_image_args resinfo = {}; - struct lp_build_if_state if_ctx; - - if (wa_8888) { - /* Skip the texture size query entirely if we don't need it. */ - lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, "")); - } - - /* Query the texture size. */ - resinfo.opcode = ac_image_get_resinfo; - resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target); - resinfo.resource = args->resource; - resinfo.sampler = args->sampler; - resinfo.lod = ctx->ac.i32_0; - resinfo.dmask = 0xf; - resinfo.attributes = AC_FUNC_ATTR_READNONE; - - LLVMValueRef texsize = - fix_resinfo(ctx, target, - ac_build_image_opcode(&ctx->ac, &resinfo)); - - /* Compute -0.5 / size. */ - for (unsigned c = 0; c < 2; c++) { - half_texel[c] = - LLVMBuildExtractElement(builder, texsize, - LLVMConstInt(ctx->i32, c, 0), ""); - half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, ""); - half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]); - half_texel[c] = LLVMBuildFMul(builder, half_texel[c], - LLVMConstReal(ctx->f32, -0.5), ""); - } - - if (wa_8888) { - lp_build_endif(&if_ctx); - - LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block }; - - for (unsigned c = 0; c < 2; c++) { - LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 }; - half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2, - values, bb); - } - } - } - - for (unsigned c = 0; c < 2; c++) { - LLVMValueRef tmp; - tmp = ac_to_float(&ctx->ac, args->coords[c]); - tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], ""); - args->coords[c] = ac_to_integer(&ctx->ac, tmp); - } - - return wa_8888; -} - -/* The second half of the cube texture 8_8_8_8 integer workaround: adjust the - * result after the gather operation. - */ -static LLVMValueRef -si_fix_gather4_integer_result(struct si_shader_context *ctx, - LLVMValueRef result, - enum tgsi_return_type return_type, - LLVMValueRef wa) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - assert(return_type == TGSI_RETURN_TYPE_SINT || - return_type == TGSI_RETURN_TYPE_UINT); - - for (unsigned chan = 0; chan < 4; ++chan) { - LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false); - LLVMValueRef value; - LLVMValueRef wa_value; - - value = LLVMBuildExtractElement(builder, result, chanv, ""); - - if (return_type == TGSI_RETURN_TYPE_UINT) - wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, ""); - else - wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, ""); - wa_value = ac_to_float(&ctx->ac, wa_value); - value = LLVMBuildSelect(builder, wa, wa_value, value, ""); - - result = LLVMBuildInsertElement(builder, result, value, chanv, ""); - } - - return result; -} - -static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction *inst = emit_data->inst; - unsigned opcode = inst->Instruction.Opcode; - unsigned target = inst->Texture.Texture; - struct ac_image_args args = {}; - int ref_pos = tgsi_util_get_shadow_ref_src_index(target); - unsigned chan; - bool has_offset = inst->Texture.NumOffsets > 0; - LLVMValueRef fmask_ptr = NULL; - - tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr); - - if (target == TGSI_TEXTURE_BUFFER) { - LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); - unsigned num_channels = - util_last_bit(inst->Dst[0].Register.WriteMask); - LLVMValueRef result = - ac_build_buffer_load_format(&ctx->ac, - args.resource, - vindex, - ctx->i32_0, - num_channels, 0, true); - emit_data->output[emit_data->chan] = - ac_build_expand_to_vec4(&ctx->ac, result, num_channels); - return; - } - - /* Fetch and project texture coordinates */ - args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W); - for (chan = 0; chan < 3; chan++) { - args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); - if (opcode == TGSI_OPCODE_TXP) - args.coords[chan] = ac_build_fdiv(&ctx->ac, - args.coords[chan], args.coords[3]); - } - - if (opcode == TGSI_OPCODE_TXP) - args.coords[3] = ctx->ac.f32_1; - - /* Pack offsets. */ - if (has_offset && - opcode != TGSI_OPCODE_TXF && - opcode != TGSI_OPCODE_TXF_LZ) { - /* The offsets are six-bit signed integers packed like this: - * X=[5:0], Y=[13:8], and Z=[21:16]. - */ - LLVMValueRef offset[3], pack; - - assert(inst->Texture.NumOffsets == 1); - - for (chan = 0; chan < 3; chan++) { - offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan); - offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->i32, 0x3f, 0), ""); - if (chan) - offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->i32, chan*8, 0), ""); - } - - pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); - pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); - args.offset = pack; - } - - /* Pack LOD bias value */ - if (opcode == TGSI_OPCODE_TXB) - args.bias = args.coords[3]; - if (opcode == TGSI_OPCODE_TXB2) - args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - - /* Pack depth comparison value */ - if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) { - LLVMValueRef z; - - if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - } else { - assert(ref_pos >= 0); - z = args.coords[ref_pos]; - } - - /* Section 8.23.1 (Depth Texture Comparison Mode) of the - * OpenGL 4.5 spec says: - * - * "If the texture’s internal format indicates a fixed-point - * depth texture, then D_t and D_ref are clamped to the - * range [0, 1]; otherwise no clamping is performed." - * - * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, - * so the depth comparison value isn't clamped for Z16 and - * Z24 anymore. Do it manually here for GFX8-9; GFX10 has - * an explicitly clamped 32-bit float format. - */ - if (ctx->screen->info.chip_class >= GFX8 && - ctx->screen->info.chip_class <= GFX9) { - LLVMValueRef upgraded; - LLVMValueRef clamped; - upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, - LLVMConstInt(ctx->i32, 3, false), ""); - upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, - LLVMConstInt(ctx->i32, 29, false), ""); - upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, ""); - clamped = ac_build_clamp(&ctx->ac, z); - z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, ""); - } - - args.compare = z; - } - - /* Pack user derivatives */ - if (opcode == TGSI_OPCODE_TXD) { - int param, num_src_deriv_channels, num_dst_deriv_channels; - - switch (target) { - case TGSI_TEXTURE_3D: - num_src_deriv_channels = 3; - num_dst_deriv_channels = 3; - break; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - num_src_deriv_channels = 2; - num_dst_deriv_channels = 2; - break; - case TGSI_TEXTURE_CUBE: - case TGSI_TEXTURE_SHADOWCUBE: - case TGSI_TEXTURE_CUBE_ARRAY: - case TGSI_TEXTURE_SHADOWCUBE_ARRAY: - /* Cube derivatives will be converted to 2D. */ - num_src_deriv_channels = 3; - num_dst_deriv_channels = 3; - break; - case TGSI_TEXTURE_1D: - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_1D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - num_src_deriv_channels = 1; - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->info.chip_class == GFX9) { - num_dst_deriv_channels = 2; - } else { - num_dst_deriv_channels = 1; - } - break; - default: - unreachable("invalid target"); - } - - for (param = 0; param < 2; param++) { - for (chan = 0; chan < num_src_deriv_channels; chan++) - args.derivs[param * num_dst_deriv_channels + chan] = - lp_build_emit_fetch(bld_base, inst, param+1, chan); - - /* Fill in the rest with zeros. */ - for (chan = num_src_deriv_channels; - chan < num_dst_deriv_channels; chan++) - args.derivs[param * num_dst_deriv_channels + chan] = - ctx->ac.f32_0; - } - } - - if (target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - ac_prepare_cube_coords(&ctx->ac, - opcode == TGSI_OPCODE_TXD, - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY, - opcode == TGSI_OPCODE_LODQ, - args.coords, args.derivs); - } else if (tgsi_is_array_sampler(target) && - opcode != TGSI_OPCODE_TXF && - opcode != TGSI_OPCODE_TXF_LZ && - ctx->screen->info.chip_class <= GFX8) { - unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2; - args.coords[array_coord] = ac_build_round(&ctx->ac, args.coords[array_coord]); - } - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->info.chip_class == GFX9) { - LLVMValueRef filler; - - /* Use 0.5, so that we don't sample the border color. */ - if (opcode == TGSI_OPCODE_TXF || - opcode == TGSI_OPCODE_TXF_LZ) - filler = ctx->i32_0; - else - filler = LLVMConstReal(ctx->f32, 0.5); - - if (target == TGSI_TEXTURE_1D || - target == TGSI_TEXTURE_SHADOW1D) { - args.coords[1] = filler; - } else if (target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_SHADOW1D_ARRAY) { - args.coords[2] = args.coords[1]; - args.coords[1] = filler; - } - } - - /* Pack LOD or sample index */ - if (opcode == TGSI_OPCODE_TXL) - args.lod = args.coords[3]; - else if (opcode == TGSI_OPCODE_TXL2) - args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - else if (opcode == TGSI_OPCODE_TXF) { - if (target == TGSI_TEXTURE_2D_MSAA) { - /* No LOD, but move sample index into the right place. */ - args.coords[2] = args.coords[3]; - } else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) { - args.lod = args.coords[3]; - } - } - - if ((target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA) && - !(ctx->screen->debug_flags & DBG(NO_FMASK))) { - ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords, - target == TGSI_TEXTURE_2D_ARRAY_MSAA); - } - - if (opcode == TGSI_OPCODE_TXF || - opcode == TGSI_OPCODE_TXF_LZ) { - /* add tex offsets */ - if (inst->Texture.NumOffsets) { - const struct tgsi_texture_offset *off = inst->TexOffsets; - - assert(inst->Texture.NumOffsets == 1); - - switch (target) { - case TGSI_TEXTURE_3D: - args.coords[2] = - LLVMBuildAdd(ctx->ac.builder, args.coords[2], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], ""); - /* fall through */ - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - args.coords[1] = - LLVMBuildAdd(ctx->ac.builder, args.coords[1], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], ""); - /* fall through */ - case TGSI_TEXTURE_1D: - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_1D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - args.coords[0] = - LLVMBuildAdd(ctx->ac.builder, args.coords[0], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], ""); - break; - /* texture offsets do not apply to other texture targets */ - } - } - } - - if (opcode == TGSI_OPCODE_TG4) { - unsigned gather_comp = 0; - - /* DMASK was repurposed for GATHER4. 4 components are always - * returned and DMASK works like a swizzle - it selects - * the component to fetch. The only valid DMASK values are - * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - * (red,red,red,red) etc.) The ISA document doesn't mention - * this. - */ - - /* Get the component index from src1.x for Gather4. */ - if (!tgsi_is_shadow_target(target)) { - LLVMValueRef comp_imm; - struct tgsi_src_register src1 = inst->Src[1].Register; - - assert(src1.File == TGSI_FILE_IMMEDIATE); - - comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX]; - gather_comp = LLVMConstIntGetZExtValue(comp_imm); - gather_comp = CLAMP(gather_comp, 0, 3); - } - - args.dmask = 1 << gather_comp; - } else { - args.dmask = 0xf; - } - - args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target); - args.unorm = target == TGSI_TEXTURE_RECT || - target == TGSI_TEXTURE_SHADOWRECT; - args.opcode = ac_image_sample; - - switch (opcode) { - case TGSI_OPCODE_TXF: - case TGSI_OPCODE_TXF_LZ: - args.opcode = opcode == TGSI_OPCODE_TXF_LZ || - target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA ? - ac_image_load : ac_image_load_mip; - break; - case TGSI_OPCODE_LODQ: - args.opcode = ac_image_get_lod; - break; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TEX2: - case TGSI_OPCODE_TXP: - if (ctx->type != PIPE_SHADER_FRAGMENT) - args.level_zero = true; - break; - case TGSI_OPCODE_TEX_LZ: - args.level_zero = true; - break; - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXB2: - assert(ctx->type == PIPE_SHADER_FRAGMENT); - break; - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXL2: - break; - case TGSI_OPCODE_TXD: - break; - case TGSI_OPCODE_TG4: - args.opcode = ac_image_gather4; - args.level_zero = true; - break; - default: - assert(0); - return; - } - - /* The hardware needs special lowering for Gather4 with integer formats. */ - LLVMValueRef gather4_int_result_workaround = NULL; - - if (ctx->screen->info.chip_class <= GFX8 && - opcode == TGSI_OPCODE_TG4) { - assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN); - - if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || - inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { - gather4_int_result_workaround = - si_lower_gather4_integer(ctx, &args, target, - inst->Texture.ReturnType); - } - } - - args.attributes = AC_FUNC_ATTR_READNONE; - LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args); - - if (gather4_int_result_workaround) { - result = si_fix_gather4_integer_result(ctx, result, - inst->Texture.ReturnType, - gather4_int_result_workaround); - } - - emit_data->output[emit_data->chan] = result; -} - -static void si_llvm_emit_txqs( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef res, samples; - LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; - - tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); - - /* Read the samples from the descriptor directly. */ - res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->v8i32, ""); - samples = LLVMBuildExtractElement(ctx->ac.builder, res, - LLVMConstInt(ctx->i32, 3, 0), ""); - samples = LLVMBuildLShr(ctx->ac.builder, samples, - LLVMConstInt(ctx->i32, 16, 0), ""); - samples = LLVMBuildAnd(ctx->ac.builder, samples, - LLVMConstInt(ctx->i32, 0xf, 0), ""); - samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1, - samples, ""); - - emit_data->output[emit_data->chan] = samples; -} - -static LLVMValueRef si_llvm_emit_fbfetch(struct si_shader_context *ctx) -{ - struct ac_image_args args = {}; - LLVMValueRef ptr, image, fmask; - - /* Ignore src0, because KHR_blend_func_extended disallows multiple render - * targets. - */ - - /* Load the image descriptor. */ - STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); - ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); - ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr, - ac_array_in_const32_addr_space(ctx->v8i32), ""); - image = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); - - unsigned chan = 0; - - args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16); - - if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16); - - /* Get the current render target layer index. */ - if (ctx->shader->key.mono.u.ps.fbfetch_layered) - args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.coords[chan++] = si_get_sample_id(ctx); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa && - !(ctx->screen->debug_flags & DBG(NO_FMASK))) { - fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); - - ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, - ctx->shader->key.mono.u.ps.fbfetch_layered); - } - - args.opcode = ac_image_load; - args.resource = image; - args.dmask = 0xf; - args.attributes = AC_FUNC_ATTR_READNONE; - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darraymsaa : ac_image_2dmsaa; - else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_1darray : ac_image_1d; - else - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darray : ac_image_2d; - - return ac_build_image_opcode(&ctx->ac, &args); -} - -static void si_tgsi_emit_fbfetch(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - emit_data->output[emit_data->chan] = si_llvm_emit_fbfetch(ctx); -} - -LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - return si_llvm_emit_fbfetch(ctx); -} - -/** - * Setup actions for TGSI memory opcode, including texture opcodes. - */ -void si_shader_context_init_mem(struct si_shader_context *ctx) -{ - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - - bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit; - bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic; - bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs; - - bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_tgsi_emit_fbfetch; - - bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit; - bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit; - bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit; - - bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add"; - bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap"; - bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap"; - bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and"; - bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or"; - bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor"; - bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin"; - bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax"; - bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin"; - bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax"; - bld_base->op_actions[TGSI_OPCODE_ATOMINC_WRAP].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMINC_WRAP].intr_name = "inc"; - bld_base->op_actions[TGSI_OPCODE_ATOMDEC_WRAP].emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMDEC_WRAP].intr_name = "dec"; -} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,1180 +0,0 @@ -/* - * Copyright 2016 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "ac_llvm_util.h" -#include "util/u_memory.h" - -enum si_llvm_calling_convention { - RADEON_LLVM_AMDGPU_VS = 87, - RADEON_LLVM_AMDGPU_GS = 88, - RADEON_LLVM_AMDGPU_PS = 89, - RADEON_LLVM_AMDGPU_CS = 90, - RADEON_LLVM_AMDGPU_HS = 93, -}; - -struct si_llvm_diagnostics { - struct pipe_debug_callback *debug; - unsigned retval; -}; - -static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) -{ - struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; - LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); - char *description = LLVMGetDiagInfoDescription(di); - const char *severity_str = NULL; - - switch (severity) { - case LLVMDSError: - severity_str = "error"; - break; - case LLVMDSWarning: - severity_str = "warning"; - break; - case LLVMDSRemark: - severity_str = "remark"; - break; - case LLVMDSNote: - severity_str = "note"; - break; - default: - severity_str = "unknown"; - } - - pipe_debug_message(diag->debug, SHADER_INFO, - "LLVM diagnostic (%s): %s", severity_str, description); - - if (severity == LLVMDSError) { - diag->retval = 1; - fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); - } - - LLVMDisposeMessage(description); -} - -/** - * Compile an LLVM module to machine code. - * - * @returns 0 for success, 1 for failure - */ -unsigned si_llvm_compile(LLVMModuleRef M, struct si_shader_binary *binary, - struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug, - bool less_optimized, unsigned wave_size) -{ - struct ac_compiler_passes *passes = compiler->passes; - - if (wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) - passes = compiler->low_opt_passes; - - struct si_llvm_diagnostics diag; - LLVMContextRef llvm_ctx; - - diag.debug = debug; - diag.retval = 0; - - /* Setup Diagnostic Handler*/ - llvm_ctx = LLVMGetModuleContext(M); - - LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag); - - /* Compile IR. */ - if (!ac_compile_module_to_elf(passes, M, (char **)&binary->elf_buffer, - &binary->elf_size)) - diag.retval = 1; - - if (diag.retval != 0) - pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed"); - return diag.retval; -} - -void si_shader_binary_clean(struct si_shader_binary *binary) -{ - free((void *)binary->elf_buffer); - binary->elf_buffer = NULL; - - free(binary->llvm_ir_string); - binary->llvm_ir_string = NULL; -} - -LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base, - enum tgsi_opcode_type type) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - switch (type) { - case TGSI_TYPE_UNSIGNED: - case TGSI_TYPE_SIGNED: - return ctx->ac.i32; - case TGSI_TYPE_UNSIGNED64: - case TGSI_TYPE_SIGNED64: - return ctx->ac.i64; - case TGSI_TYPE_DOUBLE: - return ctx->ac.f64; - case TGSI_TYPE_UNTYPED: - case TGSI_TYPE_FLOAT: - return ctx->ac.f32; - default: break; - } - return 0; -} - -LLVMValueRef bitcast(struct lp_build_tgsi_context *bld_base, - enum tgsi_opcode_type type, LLVMValueRef value) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMTypeRef dst_type = tgsi2llvmtype(bld_base, type); - - if (dst_type) - return LLVMBuildBitCast(ctx->ac.builder, value, dst_type, ""); - else - return value; -} - -/** - * Return a value that is equal to the given i32 \p index if it lies in [0,num) - * or an undefined value in the same interval otherwise. - */ -LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, - LLVMValueRef index, - unsigned num) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0); - LLVMValueRef cc; - - if (util_is_power_of_two_or_zero(num)) { - index = LLVMBuildAnd(builder, index, c_max, ""); - } else { - /* In theory, this MAX pattern should result in code that is - * as good as the bit-wise AND above. - * - * In practice, LLVM generates worse code (at the time of - * writing), because its value tracking is not strong enough. - */ - cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); - index = LLVMBuildSelect(builder, cc, index, c_max, ""); - } - - return index; -} - -static LLVMValueRef emit_swizzle(struct lp_build_tgsi_context *bld_base, - LLVMValueRef value, - unsigned swizzle_x, - unsigned swizzle_y, - unsigned swizzle_z, - unsigned swizzle_w) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef swizzles[4]; - - swizzles[0] = LLVMConstInt(ctx->i32, swizzle_x, 0); - swizzles[1] = LLVMConstInt(ctx->i32, swizzle_y, 0); - swizzles[2] = LLVMConstInt(ctx->i32, swizzle_z, 0); - swizzles[3] = LLVMConstInt(ctx->i32, swizzle_w, 0); - - return LLVMBuildShuffleVector(ctx->ac.builder, - value, - LLVMGetUndef(LLVMTypeOf(value)), - LLVMConstVector(swizzles, 4), ""); -} - -/** - * Return the description of the array covering the given temporary register - * index. - */ -static unsigned -get_temp_array_id(struct lp_build_tgsi_context *bld_base, - unsigned reg_index, - const struct tgsi_ind_register *reg) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - unsigned num_arrays = ctx->bld_base.info->array_max[TGSI_FILE_TEMPORARY]; - unsigned i; - - if (reg && reg->ArrayID > 0 && reg->ArrayID <= num_arrays) - return reg->ArrayID; - - for (i = 0; i < num_arrays; i++) { - const struct tgsi_array_info *array = &ctx->temp_arrays[i]; - - if (reg_index >= array->range.First && reg_index <= array->range.Last) - return i + 1; - } - - return 0; -} - -static struct tgsi_declaration_range -get_array_range(struct lp_build_tgsi_context *bld_base, - unsigned File, unsigned reg_index, - const struct tgsi_ind_register *reg) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct tgsi_declaration_range range; - - if (File == TGSI_FILE_TEMPORARY) { - unsigned array_id = get_temp_array_id(bld_base, reg_index, reg); - if (array_id) - return ctx->temp_arrays[array_id - 1].range; - } - - range.First = 0; - range.Last = bld_base->info->file_max[File]; - return range; -} - -/** - * For indirect registers, construct a pointer directly to the requested - * element using getelementptr if possible. - * - * Returns NULL if the insertelement/extractelement fallback for array access - * must be used. - */ -static LLVMValueRef -get_pointer_into_array(struct si_shader_context *ctx, - unsigned file, - unsigned swizzle, - unsigned reg_index, - const struct tgsi_ind_register *reg_indirect) -{ - unsigned array_id; - struct tgsi_array_info *array; - LLVMValueRef idxs[2]; - LLVMValueRef index; - LLVMValueRef alloca; - - if (file != TGSI_FILE_TEMPORARY) - return NULL; - - array_id = get_temp_array_id(&ctx->bld_base, reg_index, reg_indirect); - if (!array_id) - return NULL; - - alloca = ctx->temp_array_allocas[array_id - 1]; - if (!alloca) - return NULL; - - array = &ctx->temp_arrays[array_id - 1]; - - if (!(array->writemask & (1 << swizzle))) - return ctx->undef_alloca; - - index = si_get_indirect_index(ctx, reg_indirect, 1, - reg_index - ctx->temp_arrays[array_id - 1].range.First); - - /* Ensure that the index is within a valid range, to guard against - * VM faults and overwriting critical data (e.g. spilled resource - * descriptors). - * - * TODO It should be possible to avoid the additional instructions - * if LLVM is changed so that it guarantuees: - * 1. the scratch space descriptor isolates the current wave (this - * could even save the scratch offset SGPR at the cost of an - * additional SALU instruction) - * 2. the memory for allocas must be allocated at the _end_ of the - * scratch space (after spilled registers) - */ - index = si_llvm_bound_index(ctx, index, array->range.Last - array->range.First + 1); - - index = ac_build_imad(&ctx->ac, index, - LLVMConstInt(ctx->i32, util_bitcount(array->writemask), 0), - LLVMConstInt(ctx->i32, - util_bitcount(array->writemask & ((1 << swizzle) - 1)), 0)); - idxs[0] = ctx->i32_0; - idxs[1] = index; - return LLVMBuildGEP(ctx->ac.builder, alloca, idxs, 2, ""); -} - -LLVMValueRef -si_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base, - LLVMTypeRef type, - LLVMValueRef ptr, - LLVMValueRef ptr2) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, ptr), - ac_to_integer(&ctx->ac, ptr2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); -} - -static LLVMValueRef -emit_array_fetch(struct lp_build_tgsi_context *bld_base, - unsigned File, enum tgsi_opcode_type type, - struct tgsi_declaration_range range, - unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - unsigned i, size = range.Last - range.First + 1; - LLVMTypeRef vec = LLVMVectorType(tgsi2llvmtype(bld_base, type), size); - LLVMValueRef result = LLVMGetUndef(vec); - unsigned swizzle = swizzle_in; - struct tgsi_full_src_register tmp_reg = {}; - tmp_reg.Register.File = File; - if (tgsi_type_is_64bit(type)) - swizzle |= (swizzle_in + 1) << 16; - - for (i = 0; i < size; ++i) { - tmp_reg.Register.Index = i + range.First; - - LLVMValueRef temp = si_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle); - result = LLVMBuildInsertElement(ctx->ac.builder, result, temp, - LLVMConstInt(ctx->i32, i, 0), "array_vector"); - } - return result; -} - -static LLVMValueRef -load_value_from_array(struct lp_build_tgsi_context *bld_base, - unsigned file, - enum tgsi_opcode_type type, - unsigned swizzle, - unsigned reg_index, - const struct tgsi_ind_register *reg_indirect) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr; - - ptr = get_pointer_into_array(ctx, file, swizzle, reg_index, reg_indirect); - if (ptr) { - LLVMValueRef val = LLVMBuildLoad(builder, ptr, ""); - if (tgsi_type_is_64bit(type)) { - LLVMValueRef ptr_hi, val_hi; - ptr_hi = LLVMBuildGEP(builder, ptr, &ctx->i32_1, 1, ""); - val_hi = LLVMBuildLoad(builder, ptr_hi, ""); - val = si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - val, val_hi); - } - - return val; - } else { - struct tgsi_declaration_range range = - get_array_range(bld_base, file, reg_index, reg_indirect); - LLVMValueRef index = - si_get_indirect_index(ctx, reg_indirect, 1, reg_index - range.First); - LLVMValueRef array = - emit_array_fetch(bld_base, file, type, range, swizzle); - return LLVMBuildExtractElement(builder, array, index, ""); - } -} - -static void -store_value_to_array(struct lp_build_tgsi_context *bld_base, - LLVMValueRef value, - unsigned file, - unsigned chan_index, - unsigned reg_index, - const struct tgsi_ind_register *reg_indirect) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr; - - ptr = get_pointer_into_array(ctx, file, chan_index, reg_index, reg_indirect); - if (ptr) { - LLVMBuildStore(builder, value, ptr); - } else { - unsigned i, size; - struct tgsi_declaration_range range = get_array_range(bld_base, file, reg_index, reg_indirect); - LLVMValueRef index = si_get_indirect_index(ctx, reg_indirect, 1, reg_index - range.First); - LLVMValueRef array = - emit_array_fetch(bld_base, file, TGSI_TYPE_FLOAT, range, chan_index); - LLVMValueRef temp_ptr; - - array = LLVMBuildInsertElement(builder, array, value, index, ""); - - size = range.Last - range.First + 1; - for (i = 0; i < size; ++i) { - switch(file) { - case TGSI_FILE_OUTPUT: - temp_ptr = ctx->outputs[i + range.First][chan_index]; - break; - - case TGSI_FILE_TEMPORARY: - if (range.First + i >= ctx->temps_count) - continue; - temp_ptr = ctx->temps[(i + range.First) * TGSI_NUM_CHANNELS + chan_index]; - break; - - default: - continue; - } - value = LLVMBuildExtractElement(builder, array, - LLVMConstInt(ctx->i32, i, 0), ""); - LLVMBuildStore(builder, value, temp_ptr); - } - } -} - -/* If this is true, preload FS inputs at the beginning of shaders. Otherwise, - * reload them at each use. This must be true if the shader is using - * derivatives and KILL, because KILL can leave the WQM and then a lazy - * input load isn't in the WQM anymore. - */ -static bool si_preload_fs_inputs(struct si_shader_context *ctx) -{ - struct si_shader_selector *sel = ctx->shader->selector; - - return sel->info.uses_derivatives && - sel->info.uses_kill; -} - -static LLVMValueRef -get_output_ptr(struct lp_build_tgsi_context *bld_base, unsigned index, - unsigned chan) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - - assert(index <= ctx->bld_base.info->file_max[TGSI_FILE_OUTPUT]); - return ctx->outputs[index][chan]; -} - -LLVMValueRef si_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef result = NULL, ptr, ptr2; - unsigned swizzle = swizzle_in & 0xffff; - - if (swizzle_in == ~0) { - LLVMValueRef values[TGSI_NUM_CHANNELS]; - unsigned chan; - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - values[chan] = si_llvm_emit_fetch(bld_base, reg, type, chan); - } - return ac_build_gather_values(&ctx->ac, values, - TGSI_NUM_CHANNELS); - } - - if (reg->Register.Indirect) { - LLVMValueRef load = load_value_from_array(bld_base, reg->Register.File, type, - swizzle, reg->Register.Index, ®->Indirect); - return bitcast(bld_base, type, load); - } - - switch(reg->Register.File) { - case TGSI_FILE_IMMEDIATE: { - LLVMTypeRef ctype = tgsi2llvmtype(bld_base, type); - if (tgsi_type_is_64bit(type)) { - result = LLVMGetUndef(LLVMVectorType(ctx->i32, 2)); - result = LLVMConstInsertElement(result, - ctx->imms[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle], - ctx->i32_0); - result = LLVMConstInsertElement(result, - ctx->imms[reg->Register.Index * TGSI_NUM_CHANNELS + (swizzle_in >> 16)], - ctx->i32_1); - return LLVMConstBitCast(result, ctype); - } else { - return LLVMConstBitCast(ctx->imms[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle], ctype); - } - } - - case TGSI_FILE_INPUT: { - unsigned index = reg->Register.Index; - LLVMValueRef input[4]; - - /* I don't think doing this for vertex shaders is beneficial. - * For those, we want to make sure the VMEM loads are executed - * only once. Fragment shaders don't care much, because - * v_interp instructions are much cheaper than VMEM loads. - */ - if (!si_preload_fs_inputs(ctx) && - ctx->bld_base.info->processor == PIPE_SHADER_FRAGMENT) - ctx->load_input(ctx, index, &ctx->input_decls[index], input); - else - memcpy(input, &ctx->inputs[index * 4], sizeof(input)); - - result = input[swizzle]; - - if (tgsi_type_is_64bit(type)) { - ptr = result; - ptr2 = input[swizzle_in >> 16]; - return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - ptr, ptr2); - } - break; - } - - case TGSI_FILE_TEMPORARY: - if (reg->Register.Index >= ctx->temps_count) - return LLVMGetUndef(tgsi2llvmtype(bld_base, type)); - ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle]; - if (tgsi_type_is_64bit(type)) { - ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + (swizzle_in >> 16)]; - return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - LLVMBuildLoad(builder, ptr, ""), - LLVMBuildLoad(builder, ptr2, "")); - } - result = LLVMBuildLoad(builder, ptr, ""); - break; - - case TGSI_FILE_OUTPUT: - ptr = get_output_ptr(bld_base, reg->Register.Index, swizzle); - if (tgsi_type_is_64bit(type)) { - ptr2 = get_output_ptr(bld_base, reg->Register.Index, (swizzle_in >> 16)); - return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - LLVMBuildLoad(builder, ptr, ""), - LLVMBuildLoad(builder, ptr2, "")); - } - result = LLVMBuildLoad(builder, ptr, ""); - break; - - default: - return LLVMGetUndef(tgsi2llvmtype(bld_base, type)); - } - - return bitcast(bld_base, type, result); -} - -static LLVMValueRef fetch_system_value(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle_in) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef cval = ctx->system_values[reg->Register.Index]; - unsigned swizzle = swizzle_in & 0xffff; - - if (tgsi_type_is_64bit(type)) { - LLVMValueRef lo, hi; - - assert(swizzle == 0 || swizzle == 2); - - lo = LLVMBuildExtractElement( - builder, cval, LLVMConstInt(ctx->i32, swizzle, 0), ""); - hi = LLVMBuildExtractElement( - builder, cval, LLVMConstInt(ctx->i32, (swizzle_in >> 16), 0), ""); - - return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), - lo, hi); - } - - if (LLVMGetTypeKind(LLVMTypeOf(cval)) == LLVMVectorTypeKind) { - cval = LLVMBuildExtractElement( - builder, cval, LLVMConstInt(ctx->i32, swizzle, 0), ""); - } else { - assert(swizzle == 0); - } - - return bitcast(bld_base, type, cval); -} - -static void emit_declaration(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_declaration *decl) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned first, last, i; - switch(decl->Declaration.File) { - case TGSI_FILE_ADDRESS: - { - unsigned idx; - for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - unsigned chan; - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - ctx->addrs[idx][chan] = ac_build_alloca_undef( - &ctx->ac, ctx->i32, ""); - } - } - break; - } - - case TGSI_FILE_TEMPORARY: - { - char name[18] = ""; - LLVMValueRef array_alloca = NULL; - unsigned decl_size; - unsigned writemask = decl->Declaration.UsageMask; - first = decl->Range.First; - last = decl->Range.Last; - decl_size = 4 * ((last - first) + 1); - - if (decl->Declaration.Array) { - unsigned id = decl->Array.ArrayID - 1; - unsigned array_size; - - writemask &= ctx->temp_arrays[id].writemask; - ctx->temp_arrays[id].writemask = writemask; - array_size = ((last - first) + 1) * util_bitcount(writemask); - - /* If the array has more than 16 elements, store it - * in memory using an alloca that spans the entire - * array. - * - * Otherwise, store each array element individually. - * We will then generate vectors (per-channel, up to - * <16 x float> if the usagemask is a single bit) for - * indirect addressing. - * - * Note that 16 is the number of vector elements that - * LLVM will store in a register, so theoretically an - * array with up to 4 * 16 = 64 elements could be - * handled this way, but whether that's a good idea - * depends on VGPR register pressure elsewhere. - * - * FIXME: We shouldn't need to have the non-alloca - * code path for arrays. LLVM should be smart enough to - * promote allocas into registers when profitable. - */ - if (array_size > 16 || - !ctx->screen->llvm_has_working_vgpr_indexing) { - array_alloca = ac_build_alloca_undef(&ctx->ac, - LLVMArrayType(ctx->f32, - array_size), "array"); - ctx->temp_array_allocas[id] = array_alloca; - } - } - - if (!ctx->temps_count) { - ctx->temps_count = bld_base->info->file_max[TGSI_FILE_TEMPORARY] + 1; - ctx->temps = MALLOC(TGSI_NUM_CHANNELS * ctx->temps_count * sizeof(LLVMValueRef)); - } - if (!array_alloca) { - for (i = 0; i < decl_size; ++i) { -#ifndef NDEBUG - snprintf(name, sizeof(name), "TEMP%d.%c", - first + i / 4, "xyzw"[i % 4]); -#endif - ctx->temps[first * TGSI_NUM_CHANNELS + i] = - ac_build_alloca_undef(&ctx->ac, - ctx->f32, - name); - } - } else { - LLVMValueRef idxs[2] = { - ctx->i32_0, - NULL - }; - unsigned j = 0; - - if (writemask != TGSI_WRITEMASK_XYZW && - !ctx->undef_alloca) { - /* Create a dummy alloca. We use it so that we - * have a pointer that is safe to load from if - * a shader ever reads from a channel that - * it never writes to. - */ - ctx->undef_alloca = ac_build_alloca_undef( - &ctx->ac, ctx->f32, "undef"); - } - - for (i = 0; i < decl_size; ++i) { - LLVMValueRef ptr; - if (writemask & (1 << (i % 4))) { -#ifndef NDEBUG - snprintf(name, sizeof(name), "TEMP%d.%c", - first + i / 4, "xyzw"[i % 4]); -#endif - idxs[1] = LLVMConstInt(ctx->i32, j, 0); - ptr = LLVMBuildGEP(builder, array_alloca, idxs, 2, name); - j++; - } else { - ptr = ctx->undef_alloca; - } - ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr; - } - } - break; - } - case TGSI_FILE_INPUT: - { - unsigned idx; - for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - if (ctx->load_input && - ctx->input_decls[idx].Declaration.File != TGSI_FILE_INPUT) { - ctx->input_decls[idx] = *decl; - ctx->input_decls[idx].Range.First = idx; - ctx->input_decls[idx].Range.Last = idx; - ctx->input_decls[idx].Semantic.Index += idx - decl->Range.First; - - if (si_preload_fs_inputs(ctx) || - bld_base->info->processor != PIPE_SHADER_FRAGMENT) - ctx->load_input(ctx, idx, &ctx->input_decls[idx], - &ctx->inputs[idx * 4]); - } - } - } - break; - - case TGSI_FILE_SYSTEM_VALUE: - { - unsigned idx; - for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - si_load_system_value(ctx, idx, decl); - } - } - break; - - case TGSI_FILE_OUTPUT: - { - char name[16] = ""; - unsigned idx; - for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - unsigned chan; - assert(idx < RADEON_LLVM_MAX_OUTPUTS); - if (ctx->outputs[idx][0]) - continue; - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { -#ifndef NDEBUG - snprintf(name, sizeof(name), "OUT%d.%c", - idx, "xyzw"[chan % 4]); -#endif - ctx->outputs[idx][chan] = ac_build_alloca_undef( - &ctx->ac, ctx->f32, name); - } - } - break; - } - - case TGSI_FILE_MEMORY: - si_tgsi_declare_compute_memory(ctx, decl); - break; - - default: - break; - } -} - -void si_llvm_emit_store(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_instruction *inst, - const struct tgsi_opcode_info *info, - unsigned index, - LLVMValueRef dst[4]) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_dst_register *reg = &inst->Dst[index]; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef temp_ptr, temp_ptr2 = NULL; - bool is_vec_store = false; - enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode, index); - - if (dst[0]) { - LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0])); - is_vec_store = (k == LLVMVectorTypeKind); - } - - if (is_vec_store) { - LLVMValueRef values[4] = {}; - uint32_t writemask = reg->Register.WriteMask; - while (writemask) { - unsigned chan = u_bit_scan(&writemask); - LLVMValueRef index = LLVMConstInt(ctx->i32, chan, 0); - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - dst[0], index, ""); - } - bld_base->emit_store(bld_base, inst, info, index, values); - return; - } - - uint32_t writemask = reg->Register.WriteMask; - while (writemask) { - unsigned chan_index = u_bit_scan(&writemask); - LLVMValueRef value = dst[chan_index]; - - if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3)) - continue; - if (inst->Instruction.Saturate) - value = ac_build_clamp(&ctx->ac, value); - - if (reg->Register.File == TGSI_FILE_ADDRESS) { - temp_ptr = ctx->addrs[reg->Register.Index][chan_index]; - LLVMBuildStore(builder, value, temp_ptr); - continue; - } - - if (!tgsi_type_is_64bit(dtype)) - value = ac_to_float(&ctx->ac, value); - - if (reg->Register.Indirect) { - unsigned file = reg->Register.File; - unsigned reg_index = reg->Register.Index; - store_value_to_array(bld_base, value, file, chan_index, - reg_index, ®->Indirect); - } else { - switch(reg->Register.File) { - case TGSI_FILE_OUTPUT: - temp_ptr = ctx->outputs[reg->Register.Index][chan_index]; - if (tgsi_type_is_64bit(dtype)) - temp_ptr2 = ctx->outputs[reg->Register.Index][chan_index + 1]; - break; - - case TGSI_FILE_TEMPORARY: - { - if (reg->Register.Index >= ctx->temps_count) - continue; - - temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index]; - if (tgsi_type_is_64bit(dtype)) - temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1]; - - break; - } - default: - return; - } - if (!tgsi_type_is_64bit(dtype)) - LLVMBuildStore(builder, value, temp_ptr); - else { - LLVMValueRef ptr = LLVMBuildBitCast(builder, value, - LLVMVectorType(ctx->i32, 2), ""); - LLVMValueRef val2; - value = LLVMBuildExtractElement(builder, ptr, - ctx->i32_0, ""); - val2 = LLVMBuildExtractElement(builder, ptr, - ctx->i32_1, ""); - - LLVMBuildStore(builder, ac_to_float(&ctx->ac, value), temp_ptr); - LLVMBuildStore(builder, ac_to_float(&ctx->ac, val2), temp_ptr2); - } - } - } -} - -static int get_line(int pc) -{ - /* Subtract 1 so that the number shown is that of the corresponding - * opcode in the TGSI dump, e.g. an if block has the same suffix as - * the instruction number of the corresponding TGSI IF. - */ - return pc - 1; -} - -static void bgnloop_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_bgnloop(&ctx->ac, get_line(bld_base->pc)); -} - -static void brk_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_break(&ctx->ac); -} - -static void cont_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_continue(&ctx->ac); -} - -static void else_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_else(&ctx->ac, get_line(bld_base->pc)); -} - -static void endif_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_endif(&ctx->ac, get_line(bld_base->pc)); -} - -static void endloop_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_endloop(&ctx->ac, get_line(bld_base->pc)); -} - -static void if_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_if(&ctx->ac, emit_data->args[0], get_line(bld_base->pc)); -} - -static void uif_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - ac_build_uif(&ctx->ac, emit_data->args[0], get_line(bld_base->pc)); -} - -static void emit_immediate(struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_immediate *imm) -{ - unsigned i; - struct si_shader_context *ctx = si_shader_context(bld_base); - - for (i = 0; i < 4; ++i) { - ctx->imms[ctx->imms_num * TGSI_NUM_CHANNELS + i] = - LLVMConstInt(ctx->i32, imm->u[i].Uint, false ); - } - - ctx->imms_num++; -} - -void si_llvm_context_init(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size, - unsigned ballot_mask_bits) -{ - struct lp_type type; - - /* Initialize the gallivm object: - * We are only using the module, context, and builder fields of this struct. - * This should be enough for us to be able to pass our gallivm struct to the - * helper functions in the gallivm module. - */ - memset(ctx, 0, sizeof(*ctx)); - ctx->screen = sscreen; - ctx->compiler = compiler; - - ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, - sscreen->info.family, - AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, - wave_size, ballot_mask_bits); - - ctx->gallivm.context = ctx->ac.context; - ctx->gallivm.module = ctx->ac.module; - ctx->gallivm.builder = ctx->ac.builder; - - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - - type.floating = true; - type.fixed = false; - type.sign = true; - type.norm = false; - type.width = 32; - type.length = 1; - - lp_build_context_init(&bld_base->base, &ctx->gallivm, type); - lp_build_context_init(&ctx->bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type)); - lp_build_context_init(&ctx->bld_base.int_bld, &ctx->gallivm, lp_int_type(type)); - type.width *= 2; - lp_build_context_init(&ctx->bld_base.dbl_bld, &ctx->gallivm, type); - lp_build_context_init(&ctx->bld_base.uint64_bld, &ctx->gallivm, lp_uint_type(type)); - lp_build_context_init(&ctx->bld_base.int64_bld, &ctx->gallivm, lp_int_type(type)); - - bld_base->soa = 1; - bld_base->emit_swizzle = emit_swizzle; - bld_base->emit_declaration = emit_declaration; - bld_base->emit_immediate = emit_immediate; - - bld_base->op_actions[TGSI_OPCODE_BGNLOOP].emit = bgnloop_emit; - bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit; - bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; - bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit; - bld_base->op_actions[TGSI_OPCODE_UIF].emit = uif_emit; - bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit; - bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; - bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; - - si_shader_context_init_alu(&ctx->bld_base); - si_shader_context_init_mem(ctx); - - ctx->voidt = LLVMVoidTypeInContext(ctx->ac.context); - ctx->i1 = LLVMInt1TypeInContext(ctx->ac.context); - ctx->i8 = LLVMInt8TypeInContext(ctx->ac.context); - ctx->i32 = LLVMInt32TypeInContext(ctx->ac.context); - ctx->i64 = LLVMInt64TypeInContext(ctx->ac.context); - ctx->i128 = LLVMIntTypeInContext(ctx->ac.context, 128); - ctx->f32 = LLVMFloatTypeInContext(ctx->ac.context); - ctx->v2i32 = LLVMVectorType(ctx->i32, 2); - ctx->v4i32 = LLVMVectorType(ctx->i32, 4); - ctx->v4f32 = LLVMVectorType(ctx->f32, 4); - ctx->v8i32 = LLVMVectorType(ctx->i32, 8); - - ctx->i32_0 = LLVMConstInt(ctx->i32, 0, 0); - ctx->i32_1 = LLVMConstInt(ctx->i32, 1, 0); - ctx->i1false = LLVMConstInt(ctx->i1, 0, 0); - ctx->i1true = LLVMConstInt(ctx->i1, 1, 0); -} - -/* Set the context to a certain TGSI shader. Can be called repeatedly - * to change the shader. */ -void si_llvm_context_set_ir(struct si_shader_context *ctx, - struct si_shader *shader) -{ - struct si_shader_selector *sel = shader->selector; - const struct tgsi_shader_info *info = &sel->info; - - ctx->shader = shader; - ctx->type = sel->type; - ctx->bld_base.info = info; - - /* Clean up the old contents. */ - FREE(ctx->temp_arrays); - ctx->temp_arrays = NULL; - FREE(ctx->temp_array_allocas); - ctx->temp_array_allocas = NULL; - - FREE(ctx->imms); - ctx->imms = NULL; - ctx->imms_num = 0; - - FREE(ctx->temps); - ctx->temps = NULL; - ctx->temps_count = 0; - - ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); - ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); - - ctx->num_samplers = util_last_bit(info->samplers_declared); - ctx->num_images = util_last_bit(info->images_declared); - - if (sel->nir) - return; - - if (info->array_max[TGSI_FILE_TEMPORARY] > 0) { - int size = info->array_max[TGSI_FILE_TEMPORARY]; - - ctx->temp_arrays = CALLOC(size, sizeof(ctx->temp_arrays[0])); - ctx->temp_array_allocas = CALLOC(size, sizeof(ctx->temp_array_allocas[0])); - - tgsi_scan_arrays(sel->tokens, TGSI_FILE_TEMPORARY, size, - ctx->temp_arrays); - } - if (info->file_max[TGSI_FILE_IMMEDIATE] >= 0) { - int size = info->file_max[TGSI_FILE_IMMEDIATE] + 1; - ctx->imms = MALLOC(size * TGSI_NUM_CHANNELS * sizeof(LLVMValueRef)); - } - - /* Re-set these to start with a clean slate. */ - ctx->bld_base.num_instructions = 0; - ctx->bld_base.pc = 0; - memset(ctx->outputs, 0, sizeof(ctx->outputs)); - - ctx->bld_base.emit_store = si_llvm_emit_store; - ctx->bld_base.emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = si_llvm_emit_fetch; - ctx->bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = si_llvm_emit_fetch; - ctx->bld_base.emit_fetch_funcs[TGSI_FILE_TEMPORARY] = si_llvm_emit_fetch; - ctx->bld_base.emit_fetch_funcs[TGSI_FILE_OUTPUT] = si_llvm_emit_fetch; - ctx->bld_base.emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value; -} - -void si_llvm_create_func(struct si_shader_context *ctx, - const char *name, - LLVMTypeRef *return_types, unsigned num_return_elems, - LLVMTypeRef *ParamTypes, unsigned ParamCount) -{ - LLVMTypeRef main_fn_type, ret_type; - LLVMBasicBlockRef main_fn_body; - enum si_llvm_calling_convention call_conv; - enum pipe_shader_type real_shader_type; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx->ac.context, - return_types, - num_return_elems, true); - else - ret_type = ctx->voidt; - - /* Setup the function */ - ctx->return_type = ret_type; - main_fn_type = LLVMFunctionType(ret_type, ParamTypes, ParamCount, 0); - ctx->main_fn = LLVMAddFunction(ctx->gallivm.module, name, main_fn_type); - main_fn_body = LLVMAppendBasicBlockInContext(ctx->ac.context, - ctx->main_fn, "main_body"); - LLVMPositionBuilderAtEnd(ctx->ac.builder, main_fn_body); - - real_shader_type = ctx->type; - - /* LS is merged into HS (TCS), and ES is merged into GS. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) - real_shader_type = PIPE_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) - real_shader_type = PIPE_SHADER_GEOMETRY; - } - - switch (real_shader_type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - call_conv = RADEON_LLVM_AMDGPU_VS; - break; - case PIPE_SHADER_TESS_CTRL: - call_conv = RADEON_LLVM_AMDGPU_HS; - break; - case PIPE_SHADER_GEOMETRY: - call_conv = RADEON_LLVM_AMDGPU_GS; - break; - case PIPE_SHADER_FRAGMENT: - call_conv = RADEON_LLVM_AMDGPU_PS; - break; - case PIPE_SHADER_COMPUTE: - call_conv = RADEON_LLVM_AMDGPU_CS; - break; - default: - unreachable("Unhandle shader type"); - } - - LLVMSetFunctionCallConv(ctx->main_fn, call_conv); -} - -void si_llvm_optimize_module(struct si_shader_context *ctx) -{ - /* Dump LLVM IR before any optimization passes */ - if (ctx->screen->debug_flags & DBG(PREOPT_IR) && - si_can_dump_shader(ctx->screen, ctx->type)) - LLVMDumpModule(ctx->gallivm.module); - - /* Run the pass */ - LLVMRunPassManager(ctx->compiler->passmgr, ctx->gallivm.module); - LLVMDisposeBuilder(ctx->ac.builder); -} - -void si_llvm_dispose(struct si_shader_context *ctx) -{ - LLVMDisposeModule(ctx->gallivm.module); - LLVMContextDispose(ctx->gallivm.context); - FREE(ctx->temp_arrays); - ctx->temp_arrays = NULL; - FREE(ctx->temp_array_allocas); - ctx->temp_array_allocas = NULL; - FREE(ctx->temps); - ctx->temps = NULL; - ctx->temps_count = 0; - FREE(ctx->imms); - ctx->imms = NULL; - ctx->imms_num = 0; - ac_llvm_context_dispose(&ctx->ac); -} diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_binning.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_binning.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_binning.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_binning.c 2020-06-12 01:21:17.000000000 +0000 @@ -313,20 +313,6 @@ struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size) { - unsigned num_sdp_interfaces = 0; - - switch (sctx->family) { - case CHIP_NAVI10: - case CHIP_NAVI12: - num_sdp_interfaces = 16; - break; - case CHIP_NAVI14: - num_sdp_interfaces = 8; - break; - default: - assert(0); - } - const unsigned ZsTagSize = 64; const unsigned ZsNumTags = 312; const unsigned CcTagSize = 1024; @@ -335,7 +321,7 @@ const unsigned FcReadTags = 44; const unsigned num_rbs = sctx->screen->info.num_render_backends; - const unsigned num_pipes = MAX2(num_rbs, num_sdp_interfaces); + const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces); const unsigned depthBinSizeTagPart = ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes)); const unsigned colorBinSizeTagPart = ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes)); @@ -567,7 +553,7 @@ * https://bugs.freedesktop.org/show_bug.cgi?id=110214 * (an alternative is to insert manual BATCH_BREAK event when * a context_roll is detected). */ - context_states_per_bin = sctx->screen->has_gfx9_scissor_bug ? 1 : 6; + context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6; /* Using 32 here can cause GPU hangs on RAVEN1 */ persistent_states_per_bin = 16; } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,8 +27,8 @@ #include "si_query.h" #include "util/u_dual_blend.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -47,27 +47,27 @@ static unsigned si_map_swizzle(unsigned swizzle) { - switch (swizzle) { - case PIPE_SWIZZLE_Y: - return V_008F0C_SQ_SEL_Y; - case PIPE_SWIZZLE_Z: - return V_008F0C_SQ_SEL_Z; - case PIPE_SWIZZLE_W: - return V_008F0C_SQ_SEL_W; - case PIPE_SWIZZLE_0: - return V_008F0C_SQ_SEL_0; - case PIPE_SWIZZLE_1: - return V_008F0C_SQ_SEL_1; - default: /* PIPE_SWIZZLE_X */ - return V_008F0C_SQ_SEL_X; - } + switch (swizzle) { + case PIPE_SWIZZLE_Y: + return V_008F0C_SQ_SEL_Y; + case PIPE_SWIZZLE_Z: + return V_008F0C_SQ_SEL_Z; + case PIPE_SWIZZLE_W: + return V_008F0C_SQ_SEL_W; + case PIPE_SWIZZLE_0: + return V_008F0C_SQ_SEL_0; + case PIPE_SWIZZLE_1: + return V_008F0C_SQ_SEL_1; + default: /* PIPE_SWIZZLE_X */ + return V_008F0C_SQ_SEL_X; + } } /* 12.4 fixed-point */ static unsigned si_pack_float_12p4(float x) { - return x <= 0 ? 0 : - x >= 4096 ? 0xffff : x * 16; + return x <= 0 ? 0 : + x >= 4096 ? 0xffff : x * 16; } /* @@ -78,202 +78,202 @@ */ static void si_emit_cb_render_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_blend *blend = sctx->queued.named.blend; - /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, - * but you never know. */ - uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & - blend->cb_target_mask; - unsigned i; - - /* Avoid a hang that happens when dual source blending is enabled - * but there is not enough color outputs. This is undefined behavior, - * so disable color writes completely. - * - * Reproducible with Unigine Heaven 4.0 and drirc missing. - */ - if (blend->dual_src_blend && - sctx->ps_shader.cso && - (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) - cb_target_mask = 0; - - /* GFX9: Flush DFSM when CB_TARGET_MASK changes. - * I think we don't have to do anything between IBs. - */ - if (sctx->screen->dpbb_allowed && - sctx->last_cb_target_mask != cb_target_mask) { - sctx->last_cb_target_mask = cb_target_mask; - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } - - unsigned initial_cdw = cs->current.cdw; - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, - SI_TRACKED_CB_TARGET_MASK, cb_target_mask); - - if (sctx->chip_class >= GFX8) { - /* DCC MSAA workaround. - * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- - * COMBINER_DISABLE, but that would be more complicated. - */ - bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && - sctx->framebuffer.nr_samples >= 2; - unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; - - radeon_opt_set_context_reg( - sctx, R_028424_CB_DCC_CONTROL, - SI_TRACKED_CB_DCC_CONTROL, - S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | - S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | - S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | - S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->has_dcc_constant_encode)); - } - - /* RB+ register settings. */ - if (sctx->screen->rbplus_allowed) { - unsigned spi_shader_col_format = - sctx->ps_shader.cso ? - sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; - unsigned sx_ps_downconvert = 0; - unsigned sx_blend_opt_epsilon = 0; - unsigned sx_blend_opt_control = 0; - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - struct si_surface *surf = - (struct si_surface*)sctx->framebuffer.state.cbufs[i]; - unsigned format, swap, spi_format, colormask; - bool has_alpha, has_rgb; - - if (!surf) { - /* If the color buffer is not set, the driver sets 32_R - * as the SPI color format, because the hw doesn't allow - * holes between color outputs, so also set this to - * enable RB+. - */ - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - continue; - } - - format = G_028C70_FORMAT(surf->cb_color_info); - swap = G_028C70_COMP_SWAP(surf->cb_color_info); - spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; - colormask = (cb_target_mask >> (i * 4)) & 0xf; - - /* Set if RGB and A are present. */ - has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); - - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_16 || - format == V_028C70_COLOR_32) - has_rgb = !has_alpha; - else - has_rgb = true; - - /* Check the colormask and export format. */ - if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) - has_rgb = false; - if (!(colormask & PIPE_MASK_A)) - has_alpha = false; - - if (spi_format == V_028714_SPI_SHADER_ZERO) { - has_rgb = false; - has_alpha = false; - } - - /* Disable value checking for disabled channels. */ - if (!has_rgb) - sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); - if (!has_alpha) - sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); - - /* Enable down-conversion for 32bpp and smaller formats. */ - switch (format) { - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - /* For 1 and 2-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); - sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_5_6_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_1_5_5_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_4_4_4_4: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); - sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD && - spi_format == V_028714_SPI_SHADER_32_R) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - else if (swap == V_028C70_SWAP_ALT_REV && - spi_format == V_028714_SPI_SHADER_32_AR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - /* For 1-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - if (swap == V_028C70_SWAP_STD || - swap == V_028C70_SWAP_STD_REV) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); - else - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); - } - break; - - case V_028C70_COLOR_10_11_11: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); - break; - - case V_028C70_COLOR_2_10_10_10: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); - sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); - } - break; - } - } - - /* If there are no color outputs, the first color export is - * always enabled as 32_R, so also set this to enable RB+. - */ - if (!sx_ps_downconvert) - sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; - - /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ - radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, - SI_TRACKED_SX_PS_DOWNCONVERT, - sx_ps_downconvert, sx_blend_opt_epsilon, - sx_blend_opt_control); - } - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_blend *blend = sctx->queued.named.blend; + /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, + * but you never know. */ + uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & + blend->cb_target_mask; + unsigned i; + + /* Avoid a hang that happens when dual source blending is enabled + * but there is not enough color outputs. This is undefined behavior, + * so disable color writes completely. + * + * Reproducible with Unigine Heaven 4.0 and drirc missing. + */ + if (blend->dual_src_blend && + sctx->ps_shader.cso && + (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) + cb_target_mask = 0; + + /* GFX9: Flush DFSM when CB_TARGET_MASK changes. + * I think we don't have to do anything between IBs. + */ + if (sctx->screen->dpbb_allowed && + sctx->last_cb_target_mask != cb_target_mask) { + sctx->last_cb_target_mask = cb_target_mask; + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + unsigned initial_cdw = cs->current.cdw; + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, + SI_TRACKED_CB_TARGET_MASK, cb_target_mask); + + if (sctx->chip_class >= GFX8) { + /* DCC MSAA workaround. + * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- + * COMBINER_DISABLE, but that would be more complicated. + */ + bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && + sctx->framebuffer.nr_samples >= 2; + unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; + + radeon_opt_set_context_reg( + sctx, R_028424_CB_DCC_CONTROL, + SI_TRACKED_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | + S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | + S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | + S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); + } + + /* RB+ register settings. */ + if (sctx->screen->info.rbplus_allowed) { + unsigned spi_shader_col_format = + sctx->ps_shader.cso ? + sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; + unsigned sx_ps_downconvert = 0; + unsigned sx_blend_opt_epsilon = 0; + unsigned sx_blend_opt_control = 0; + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + struct si_surface *surf = + (struct si_surface*)sctx->framebuffer.state.cbufs[i]; + unsigned format, swap, spi_format, colormask; + bool has_alpha, has_rgb; + + if (!surf) { + /* If the color buffer is not set, the driver sets 32_R + * as the SPI color format, because the hw doesn't allow + * holes between color outputs, so also set this to + * enable RB+. + */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + continue; + } + + format = G_028C70_FORMAT(surf->cb_color_info); + swap = G_028C70_COMP_SWAP(surf->cb_color_info); + spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; + colormask = (cb_target_mask >> (i * 4)) & 0xf; + + /* Set if RGB and A are present. */ + has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); + + if (format == V_028C70_COLOR_8 || + format == V_028C70_COLOR_16 || + format == V_028C70_COLOR_32) + has_rgb = !has_alpha; + else + has_rgb = true; + + /* Check the colormask and export format. */ + if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) + has_rgb = false; + if (!(colormask & PIPE_MASK_A)) + has_alpha = false; + + if (spi_format == V_028714_SPI_SHADER_ZERO) { + has_rgb = false; + has_alpha = false; + } + + /* Disable value checking for disabled channels. */ + if (!has_rgb) + sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); + if (!has_alpha) + sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); + + /* Enable down-conversion for 32bpp and smaller formats. */ + switch (format) { + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + /* For 1 and 2-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); + sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_5_6_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_1_5_5_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_4_4_4_4: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); + sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD && + spi_format == V_028714_SPI_SHADER_32_R) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + else if (swap == V_028C70_SWAP_ALT_REV && + spi_format == V_028714_SPI_SHADER_32_AR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + /* For 1-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + if (swap == V_028C70_SWAP_STD || + swap == V_028C70_SWAP_STD_REV) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); + else + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); + } + break; + + case V_028C70_COLOR_10_11_11: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); + break; + + case V_028C70_COLOR_2_10_10_10: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); + sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); + } + break; + } + } + + /* If there are no color outputs, the first color export is + * always enabled as 32_R, so also set this to enable RB+. + */ + if (!sx_ps_downconvert) + sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + + /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ + radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, + SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert, sx_blend_opt_epsilon, + sx_blend_opt_control); + } + if (initial_cdw != cs->current.cdw) + sctx->context_roll = true; } /* @@ -282,157 +282,157 @@ static uint32_t si_translate_blend_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028780_COMB_DST_PLUS_SRC; - case PIPE_BLEND_SUBTRACT: - return V_028780_COMB_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028780_COMB_DST_MINUS_SRC; - case PIPE_BLEND_MIN: - return V_028780_COMB_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return V_028780_COMB_MAX_DST_SRC; - default: - PRINT_ERR("Unknown blend function %d\n", blend_func); - assert(0); - break; - } - return 0; + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028780_COMB_DST_PLUS_SRC; + case PIPE_BLEND_SUBTRACT: + return V_028780_COMB_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028780_COMB_DST_MINUS_SRC; + case PIPE_BLEND_MIN: + return V_028780_COMB_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return V_028780_COMB_MAX_DST_SRC; + default: + PRINT_ERR("Unknown blend function %d\n", blend_func); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_factor(int blend_fact) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ONE: - return V_028780_BLEND_ONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return V_028780_BLEND_SRC_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028780_BLEND_SRC_ALPHA; - case PIPE_BLENDFACTOR_DST_ALPHA: - return V_028780_BLEND_DST_ALPHA; - case PIPE_BLENDFACTOR_DST_COLOR: - return V_028780_BLEND_DST_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return V_028780_BLEND_SRC_ALPHA_SATURATE; - case PIPE_BLENDFACTOR_CONST_COLOR: - return V_028780_BLEND_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_CONST_ALPHA: - return V_028780_BLEND_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_ZERO: - return V_028780_BLEND_ZERO; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return V_028780_BLEND_ONE_MINUS_SRC_COLOR; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return V_028780_BLEND_ONE_MINUS_DST_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return V_028780_BLEND_ONE_MINUS_DST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_SRC1_COLOR: - return V_028780_BLEND_SRC1_COLOR; - case PIPE_BLENDFACTOR_SRC1_ALPHA: - return V_028780_BLEND_SRC1_ALPHA; - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return V_028780_BLEND_INV_SRC1_COLOR; - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return V_028780_BLEND_INV_SRC1_ALPHA; - default: - PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); - assert(0); - break; - } - return 0; + switch (blend_fact) { + case PIPE_BLENDFACTOR_ONE: + return V_028780_BLEND_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return V_028780_BLEND_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028780_BLEND_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return V_028780_BLEND_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return V_028780_BLEND_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return V_028780_BLEND_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return V_028780_BLEND_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return V_028780_BLEND_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return V_028780_BLEND_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return V_028780_BLEND_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return V_028780_BLEND_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return V_028780_BLEND_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return V_028780_BLEND_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return V_028780_BLEND_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return V_028780_BLEND_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return V_028780_BLEND_INV_SRC1_ALPHA; + default: + PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_opt_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028760_OPT_COMB_ADD; - case PIPE_BLEND_SUBTRACT: - return V_028760_OPT_COMB_SUBTRACT; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028760_OPT_COMB_REVSUBTRACT; - case PIPE_BLEND_MIN: - return V_028760_OPT_COMB_MIN; - case PIPE_BLEND_MAX: - return V_028760_OPT_COMB_MAX; - default: - return V_028760_OPT_COMB_BLEND_DISABLED; - } + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028760_OPT_COMB_ADD; + case PIPE_BLEND_SUBTRACT: + return V_028760_OPT_COMB_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028760_OPT_COMB_REVSUBTRACT; + case PIPE_BLEND_MIN: + return V_028760_OPT_COMB_MIN; + case PIPE_BLEND_MAX: + return V_028760_OPT_COMB_MAX; + default: + return V_028760_OPT_COMB_BLEND_DISABLED; + } } static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ZERO: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; - case PIPE_BLENDFACTOR_ONE: - return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 - : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 - : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE - : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - default: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - } + switch (blend_fact) { + case PIPE_BLENDFACTOR_ZERO: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; + case PIPE_BLENDFACTOR_ONE: + return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 + : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 + : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE + : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + default: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + } } static void si_blend_check_commutativity(struct si_screen *sscreen, - struct si_state_blend *blend, - enum pipe_blend_func func, - enum pipe_blendfactor src, - enum pipe_blendfactor dst, - unsigned chanmask) -{ - /* Src factor is allowed when it does not depend on Dst */ - static const uint32_t src_allowed = - (1u << PIPE_BLENDFACTOR_ONE) | - (1u << PIPE_BLENDFACTOR_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | - (1u << PIPE_BLENDFACTOR_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | - (1u << PIPE_BLENDFACTOR_ZERO) | - (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); - - if (dst == PIPE_BLENDFACTOR_ONE && - (src_allowed & (1u << src))) { - /* Addition is commutative, but floating point addition isn't - * associative: subtle changes can be introduced via different - * rounding. - * - * Out-of-order is also non-deterministic, which means that - * this breaks OpenGL invariance requirements. So only enable - * out-of-order additive blending if explicitly allowed by a - * setting. - */ - if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || - (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) - blend->commutative_4bit |= chanmask; - } + struct si_state_blend *blend, + enum pipe_blend_func func, + enum pipe_blendfactor src, + enum pipe_blendfactor dst, + unsigned chanmask) +{ + /* Src factor is allowed when it does not depend on Dst */ + static const uint32_t src_allowed = + (1u << PIPE_BLENDFACTOR_ONE) | + (1u << PIPE_BLENDFACTOR_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | + (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | + (1u << PIPE_BLENDFACTOR_CONST_COLOR) | + (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | + (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | + (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | + (1u << PIPE_BLENDFACTOR_ZERO) | + (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | + (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | + (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); + + if (dst == PIPE_BLENDFACTOR_ONE && + (src_allowed & (1u << src))) { + /* Addition is commutative, but floating point addition isn't + * associative: subtle changes can be introduced via different + * rounding. + * + * Out-of-order is also non-deterministic, which means that + * this breaks OpenGL invariance requirements. So only enable + * out-of-order additive blending if explicitly allowed by a + * setting. + */ + if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || + (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) + blend->commutative_4bit |= chanmask; + } } /** @@ -440,305 +440,305 @@ * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) */ static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, - unsigned *dst_factor, unsigned expected_dst, - unsigned replacement_src) + unsigned *dst_factor, unsigned expected_dst, + unsigned replacement_src) { - if (*src_factor == expected_dst && - *dst_factor == PIPE_BLENDFACTOR_ZERO) { - *src_factor = PIPE_BLENDFACTOR_ZERO; - *dst_factor = replacement_src; - - /* Commuting the operands requires reversing subtractions. */ - if (*func == PIPE_BLEND_SUBTRACT) - *func = PIPE_BLEND_REVERSE_SUBTRACT; - else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) - *func = PIPE_BLEND_SUBTRACT; - } + if (*src_factor == expected_dst && + *dst_factor == PIPE_BLENDFACTOR_ZERO) { + *src_factor = PIPE_BLENDFACTOR_ZERO; + *dst_factor = replacement_src; + + /* Commuting the operands requires reversing subtractions. */ + if (*func == PIPE_BLEND_SUBTRACT) + *func = PIPE_BLEND_REVERSE_SUBTRACT; + else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) + *func = PIPE_BLEND_SUBTRACT; + } } static bool si_blend_factor_uses_dst(unsigned factor) { - return factor == PIPE_BLENDFACTOR_DST_COLOR || - factor == PIPE_BLENDFACTOR_DST_ALPHA || - factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || - factor == PIPE_BLENDFACTOR_INV_DST_COLOR; + return factor == PIPE_BLENDFACTOR_DST_COLOR || + factor == PIPE_BLENDFACTOR_DST_ALPHA || + factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || + factor == PIPE_BLENDFACTOR_INV_DST_COLOR; } static void *si_create_blend_state_mode(struct pipe_context *ctx, - const struct pipe_blend_state *state, - unsigned mode) + const struct pipe_blend_state *state, + unsigned mode) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); - struct si_pm4_state *pm4 = &blend->pm4; - uint32_t sx_mrt_blend_opt[8] = {0}; - uint32_t color_control = 0; - bool logicop_enable = state->logicop_enable && - state->logicop_func != PIPE_LOGICOP_COPY; - - if (!blend) - return NULL; - - blend->alpha_to_coverage = state->alpha_to_coverage; - blend->alpha_to_one = state->alpha_to_one; - blend->dual_src_blend = util_blend_state_is_dual(state, 0); - blend->logicop_enable = logicop_enable; - - if (logicop_enable) { - color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); - } else { - color_control |= S_028808_ROP3(0xcc); - } - - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, - S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | - S_028B70_ALPHA_TO_MASK_OFFSET0(3) | - S_028B70_ALPHA_TO_MASK_OFFSET1(1) | - S_028B70_ALPHA_TO_MASK_OFFSET2(0) | - S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(1)); - - if (state->alpha_to_coverage) - blend->need_src_alpha_4bit |= 0xf; - - blend->cb_target_mask = 0; - blend->cb_target_enabled_4bit = 0; - - for (int i = 0; i < 8; i++) { - /* state->rt entries > 0 only written if independent blending */ - const int j = state->independent_blend_enable ? i : 0; - - unsigned eqRGB = state->rt[j].rgb_func; - unsigned srcRGB = state->rt[j].rgb_src_factor; - unsigned dstRGB = state->rt[j].rgb_dst_factor; - unsigned eqA = state->rt[j].alpha_func; - unsigned srcA = state->rt[j].alpha_src_factor; - unsigned dstA = state->rt[j].alpha_dst_factor; - - unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; - unsigned blend_cntl = 0; - - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); - - /* Only set dual source blending for MRT0 to avoid a hang. */ - if (i >= 1 && blend->dual_src_blend) { - /* Vulkan does this for dual source blending. */ - if (i == 1) - blend_cntl |= S_028780_ENABLE(1); - - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* Only addition and subtraction equations are supported with - * dual source blending. - */ - if (blend->dual_src_blend && - (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || - eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { - assert(!"Unsupported equation for dual source blending"); - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* cb_render_state will disable unused ones */ - blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); - if (state->rt[j].colormask) - blend->cb_target_enabled_4bit |= 0xf << (4 * i); - - if (!state->rt[j].colormask || !state->rt[j].blend_enable) { - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - si_blend_check_commutativity(sctx->screen, blend, - eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); - si_blend_check_commutativity(sctx->screen, blend, - eqA, srcA, dstA, 0x8 << (4 * i)); - - /* Blending optimizations for RB+. - * These transformations don't change the behavior. - * - * First, get rid of DST in the blend factors: - * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) - */ - si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_ALPHA, - PIPE_BLENDFACTOR_SRC_ALPHA); - - /* Look up the ideal settings from tables. */ - srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); - dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); - srcA_opt = si_translate_blend_opt_factor(srcA, true); - dstA_opt = si_translate_blend_opt_factor(dstA, true); - - /* Handle interdependencies. */ - if (si_blend_factor_uses_dst(srcRGB)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - if (si_blend_factor_uses_dst(srcA)) - dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && - (dstRGB == PIPE_BLENDFACTOR_ZERO || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - - /* Set the final value. */ - sx_mrt_blend_opt[i] = - S_028760_COLOR_SRC_OPT(srcRGB_opt) | - S_028760_COLOR_DST_OPT(dstRGB_opt) | - S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | - S_028760_ALPHA_SRC_OPT(srcA_opt) | - S_028760_ALPHA_DST_OPT(dstA_opt) | - S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); - - /* Set blend state. */ - blend_cntl |= S_028780_ENABLE(1); - blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); - blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); - blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); - - if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { - blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); - blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); - blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); - blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); - } - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - - blend->blend_enable_4bit |= 0xfu << (i * 4); - - if (sctx->family <= CHIP_NAVI14) - blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); - - /* This is only important for formats without alpha. */ - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) - blend->need_src_alpha_4bit |= 0xfu << (i * 4); - } - - if (sctx->family <= CHIP_NAVI14 && logicop_enable) - blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; - - if (blend->cb_target_mask) { - color_control |= S_028808_MODE(mode); - } else { - color_control |= S_028808_MODE(V_028808_CB_DISABLE); - } - - if (sctx->screen->rbplus_allowed) { - /* Disable RB+ blend optimizations for dual source blending. - * Vulkan does this. - */ - if (blend->dual_src_blend) { - for (int i = 0; i < 8; i++) { - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); - } - } - - for (int i = 0; i < 8; i++) - si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, - sx_mrt_blend_opt[i]); - - /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */ - if (blend->dual_src_blend || logicop_enable || - mode == V_028808_CB_RESOLVE) - color_control |= S_028808_DISABLE_DUAL_QUAD(1); - } + struct si_context *sctx = (struct si_context*)ctx; + struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); + struct si_pm4_state *pm4 = &blend->pm4; + uint32_t sx_mrt_blend_opt[8] = {0}; + uint32_t color_control = 0; + bool logicop_enable = state->logicop_enable && + state->logicop_func != PIPE_LOGICOP_COPY; + + if (!blend) + return NULL; + + blend->alpha_to_coverage = state->alpha_to_coverage; + blend->alpha_to_one = state->alpha_to_one; + blend->dual_src_blend = util_blend_state_is_dual(state, 0); + blend->logicop_enable = logicop_enable; + + if (logicop_enable) { + color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); + } else { + color_control |= S_028808_ROP3(0xcc); + } + + si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, + S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | + S_028B70_ALPHA_TO_MASK_OFFSET0(3) | + S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | + S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1)); + + if (state->alpha_to_coverage) + blend->need_src_alpha_4bit |= 0xf; + + blend->cb_target_mask = 0; + blend->cb_target_enabled_4bit = 0; + + for (int i = 0; i < 8; i++) { + /* state->rt entries > 0 only written if independent blending */ + const int j = state->independent_blend_enable ? i : 0; + + unsigned eqRGB = state->rt[j].rgb_func; + unsigned srcRGB = state->rt[j].rgb_src_factor; + unsigned dstRGB = state->rt[j].rgb_dst_factor; + unsigned eqA = state->rt[j].alpha_func; + unsigned srcA = state->rt[j].alpha_src_factor; + unsigned dstA = state->rt[j].alpha_dst_factor; + + unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; + unsigned blend_cntl = 0; + + sx_mrt_blend_opt[i] = + S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + + /* Only set dual source blending for MRT0 to avoid a hang. */ + if (i >= 1 && blend->dual_src_blend) { + /* Vulkan does this for dual source blending. */ + if (i == 1) + blend_cntl |= S_028780_ENABLE(1); + + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* Only addition and subtraction equations are supported with + * dual source blending. + */ + if (blend->dual_src_blend && + (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || + eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { + assert(!"Unsupported equation for dual source blending"); + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* cb_render_state will disable unused ones */ + blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); + if (state->rt[j].colormask) + blend->cb_target_enabled_4bit |= 0xf << (4 * i); + + if (!state->rt[j].colormask || !state->rt[j].blend_enable) { + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + si_blend_check_commutativity(sctx->screen, blend, + eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); + si_blend_check_commutativity(sctx->screen, blend, + eqA, srcA, dstA, 0x8 << (4 * i)); + + /* Blending optimizations for RB+. + * These transformations don't change the behavior. + * + * First, get rid of DST in the blend factors: + * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) + */ + si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, + PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, + PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, + PIPE_BLENDFACTOR_DST_ALPHA, + PIPE_BLENDFACTOR_SRC_ALPHA); + + /* Look up the ideal settings from tables. */ + srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); + dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); + srcA_opt = si_translate_blend_opt_factor(srcA, true); + dstA_opt = si_translate_blend_opt_factor(dstA, true); + + /* Handle interdependencies. */ + if (si_blend_factor_uses_dst(srcRGB)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + if (si_blend_factor_uses_dst(srcA)) + dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && + (dstRGB == PIPE_BLENDFACTOR_ZERO || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + + /* Set the final value. */ + sx_mrt_blend_opt[i] = + S_028760_COLOR_SRC_OPT(srcRGB_opt) | + S_028760_COLOR_DST_OPT(dstRGB_opt) | + S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | + S_028760_ALPHA_SRC_OPT(srcA_opt) | + S_028760_ALPHA_DST_OPT(dstA_opt) | + S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); + + /* Set blend state. */ + blend_cntl |= S_028780_ENABLE(1); + blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + + blend->blend_enable_4bit |= 0xfu << (i * 4); + + if (sctx->family <= CHIP_NAVI14) + blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); + + /* This is only important for formats without alpha. */ + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) + blend->need_src_alpha_4bit |= 0xfu << (i * 4); + } + + if (sctx->family <= CHIP_NAVI14 && logicop_enable) + blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; + + if (blend->cb_target_mask) { + color_control |= S_028808_MODE(mode); + } else { + color_control |= S_028808_MODE(V_028808_CB_DISABLE); + } + + if (sctx->screen->info.rbplus_allowed) { + /* Disable RB+ blend optimizations for dual source blending. + * Vulkan does this. + */ + if (blend->dual_src_blend) { + for (int i = 0; i < 8; i++) { + sx_mrt_blend_opt[i] = + S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); + } + } + + for (int i = 0; i < 8; i++) + si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, + sx_mrt_blend_opt[i]); + + /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */ + if (blend->dual_src_blend || logicop_enable || + mode == V_028808_CB_RESOLVE) + color_control |= S_028808_DISABLE_DUAL_QUAD(1); + } - si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); - return blend; + si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + return blend; } static void *si_create_blend_state(struct pipe_context *ctx, - const struct pipe_blend_state *state) + const struct pipe_blend_state *state) { - return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); + return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } static void si_bind_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_blend *old_blend = sctx->queued.named.blend; - struct si_state_blend *blend = (struct si_state_blend *)state; - - if (!blend) - blend = (struct si_state_blend *)sctx->noop_blend; - - si_pm4_bind_state(sctx, blend, blend); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->dual_src_blend != blend->dual_src_blend || - (old_blend->blend_enable_4bit != blend->blend_enable_4bit && - sctx->framebuffer.nr_samples >= 2 && - sctx->screen->dcc_msaa_allowed)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->alpha_to_one != blend->alpha_to_one || - old_blend->dual_src_blend != blend->dual_src_blend || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - (old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || - old_blend->commutative_4bit != blend->commutative_4bit || - old_blend->logicop_enable != blend->logicop_enable))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_blend *old_blend = sctx->queued.named.blend; + struct si_state_blend *blend = (struct si_state_blend *)state; + + if (!blend) + blend = (struct si_state_blend *)sctx->noop_blend; + + si_pm4_bind_state(sctx, blend, blend); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->dual_src_blend != blend->dual_src_blend || + (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit && + sctx->framebuffer.nr_samples >= 2 && + sctx->screen->dcc_msaa_allowed)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->alpha_to_one != blend->alpha_to_one || + old_blend->dual_src_blend != blend->dual_src_blend || + old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && + (old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->commutative_4bit != blend->commutative_4bit || + old_blend->logicop_enable != blend->logicop_enable))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.blend == state) - si_bind_blend_state(ctx, sctx->noop_blend); + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); - si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } static void si_set_blend_color(struct pipe_context *ctx, - const struct pipe_blend_color *state) + const struct pipe_blend_color *state) { - struct si_context *sctx = (struct si_context *)ctx; - static const struct pipe_blend_color zeros; + struct si_context *sctx = (struct si_context *)ctx; + static const struct pipe_blend_color zeros; - sctx->blend_color.state = *state; - sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); + sctx->blend_color.state = *state; + sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); } static void si_emit_blend_color(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); + radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); } /* @@ -746,87 +746,87 @@ */ static void si_set_clip_state(struct pipe_context *ctx, - const struct pipe_clip_state *state) + const struct pipe_clip_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - static const struct pipe_clip_state zeros; - - if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) - return; - - sctx->clip_state.state = *state; - sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); - - cb.buffer = NULL; - cb.user_buffer = state->ucp; - cb.buffer_offset = 0; - cb.buffer_size = 4*4*8; - si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); - pipe_resource_reference(&cb.buffer, NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + static const struct pipe_clip_state zeros; + + if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) + return; + + sctx->clip_state.state = *state; + sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); + + cb.buffer = NULL; + cb.user_buffer = state->ucp; + cb.buffer_offset = 0; + cb.buffer_size = 4*4*8; + si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_emit_clip_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); - radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); + radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); + radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); } static void si_emit_clip_regs(struct si_context *sctx) { - struct si_shader *vs = si_get_vs_state(sctx); - struct si_shader_selector *vs_sel = vs->selector; - struct tgsi_shader_info *info = &vs_sel->info; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - unsigned clipdist_mask = vs_sel->clipdist_mask; - unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; - unsigned culldist_mask = vs_sel->culldist_mask; - unsigned total_mask; - - if (vs->key.opt.clip_disable) { - assert(!info->culldist_writemask); - clipdist_mask = 0; - culldist_mask = 0; - } - total_mask = clipdist_mask | culldist_mask; - - /* Clip distances on points have no effect, so need to be implemented - * as cull distances. This applies for the clipvertex case as well. - * - * Setting this for primitives other than points should have no adverse - * effects. - */ - clipdist_mask &= rs->clip_plane_enable; - culldist_mask |= clipdist_mask; - - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | - clipdist_mask | (culldist_mask << 8); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } - radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, - SI_TRACKED_PA_CL_CLIP_CNTL, - rs->pa_cl_clip_cntl | - ucp_mask | - S_028810_CLIP_DISABLE(window_space)); + struct si_shader *vs = si_get_vs_state(sctx); + struct si_shader_selector *vs_sel = vs->selector; + struct si_shader_info *info = &vs_sel->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned window_space = + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + unsigned clipdist_mask = vs_sel->clipdist_mask; + unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; + unsigned culldist_mask = vs_sel->culldist_mask; + unsigned total_mask; + + if (vs->key.opt.clip_disable) { + assert(!info->culldist_writemask); + clipdist_mask = 0; + culldist_mask = 0; + } + total_mask = clipdist_mask | culldist_mask; + + /* Clip distances on points have no effect, so need to be implemented + * as cull distances. This applies for the clipvertex case as well. + * + * Setting this for primitives other than points should have no adverse + * effects. + */ + clipdist_mask &= rs->clip_plane_enable; + culldist_mask |= clipdist_mask; + + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | + clipdist_mask | (culldist_mask << 8); + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + pa_cl_cntl, + ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } else { + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); + } + radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, + SI_TRACKED_PA_CL_CLIP_CNTL, + rs->pa_cl_clip_cntl | + ucp_mask | + S_028810_CLIP_DISABLE(window_space)); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -834,28 +834,28 @@ */ static void si_update_poly_offset_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { - si_pm4_bind_state(sctx, poly_offset, NULL); - return; - } - - /* Use the user format, not db_render_format, so that the polygon - * offset behaves as expected by applications. - */ - switch (sctx->framebuffer.state.zsbuf->texture->format) { - case PIPE_FORMAT_Z16_UNORM: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); - break; - default: /* 24-bit */ - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); - break; - } + if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { + si_pm4_bind_state(sctx, poly_offset, NULL); + return; + } + + /* Use the user format, not db_render_format, so that the polygon + * offset behaves as expected by applications. + */ + switch (sctx->framebuffer.state.zsbuf->texture->format) { + case PIPE_FORMAT_Z16_UNORM: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); + break; + default: /* 24-bit */ + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); + break; + } } /* @@ -864,241 +864,245 @@ static uint32_t si_translate_fill(uint32_t func) { - switch(func) { - case PIPE_POLYGON_MODE_FILL: - return V_028814_X_DRAW_TRIANGLES; - case PIPE_POLYGON_MODE_LINE: - return V_028814_X_DRAW_LINES; - case PIPE_POLYGON_MODE_POINT: - return V_028814_X_DRAW_POINTS; - default: - assert(0); - return V_028814_X_DRAW_POINTS; - } + switch(func) { + case PIPE_POLYGON_MODE_FILL: + return V_028814_X_DRAW_TRIANGLES; + case PIPE_POLYGON_MODE_LINE: + return V_028814_X_DRAW_LINES; + case PIPE_POLYGON_MODE_POINT: + return V_028814_X_DRAW_POINTS; + default: + assert(0); + return V_028814_X_DRAW_POINTS; + } } static void *si_create_rs_state(struct pipe_context *ctx, - const struct pipe_rasterizer_state *state) + const struct pipe_rasterizer_state *state) { - struct si_screen *sscreen = ((struct si_context *)ctx)->screen; - struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); - struct si_pm4_state *pm4 = &rs->pm4; - unsigned tmp, i; - float psize_min, psize_max; - - if (!rs) { - return NULL; - } - - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; - rs->scissor_enable = state->scissor; - rs->clip_halfz = state->clip_halfz; - rs->two_side = state->light_twoside; - rs->multisample_enable = state->multisample; - rs->force_persample_interp = state->force_persample_interp; - rs->clip_plane_enable = state->clip_plane_enable; - rs->half_pixel_center = state->half_pixel_center; - rs->line_stipple_enable = state->line_stipple_enable; - rs->poly_stipple_enable = state->poly_stipple_enable; - rs->line_smooth = state->line_smooth; - rs->line_width = state->line_width; - rs->poly_smooth = state->poly_smooth; - rs->uses_poly_offset = state->offset_point || state->offset_line || - state->offset_tri; - rs->clamp_fragment_color = state->clamp_fragment_color; - rs->clamp_vertex_color = state->clamp_vertex_color; - rs->flatshade = state->flatshade; - rs->flatshade_first = state->flatshade_first; - rs->sprite_coord_enable = state->sprite_coord_enable; - rs->rasterizer_discard = state->rasterizer_discard; - rs->pa_sc_line_stipple = state->line_stipple_enable ? - S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | - S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; - rs->pa_cl_clip_cntl = - S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | - S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | - S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | - S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | - S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); - - si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, - S_0286D4_FLAT_SHADE_ENA(1) | - S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | - S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); - - /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0); - si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); - - if (state->point_size_per_vertex) { - psize_min = util_get_min_point_size(state); - psize_max = SI_MAX_POINT_SIZE; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = state->point_size; - psize_max = state->point_size; - } - rs->max_point_size = psize_max; - - /* Divide by two, because 0.5 = 1 pixel. */ - si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, - S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) | - S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2))); - - si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2))); - si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0, - S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | - S_028A48_MSAA_ENABLE(state->multisample || - state->poly_smooth || - state->line_smooth) | - S_028A48_VPORT_SCISSOR_ENABLE(1) | - S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); - si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, - S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | - S_028814_FACE(!state->front_ccw) | - S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | - S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | - S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL || - state->fill_back != PIPE_POLYGON_MODE_FILL) | - S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | - S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); - - if (!rs->uses_poly_offset) - return rs; - - rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); - if (!rs->pm4_poly_offset) { - FREE(rs); - return NULL; - } - - /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ - for (i = 0; i < 3; i++) { - struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; - float offset_units = state->offset_units; - float offset_scale = state->offset_scale * 16.0f; - uint32_t pa_su_poly_offset_db_fmt_cntl = 0; - - if (!state->offset_units_unscaled) { - switch (i) { - case 0: /* 16-bit zbuffer */ - offset_units *= 4.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); - break; - case 1: /* 24-bit zbuffer */ - offset_units *= 2.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); - break; - case 2: /* 32-bit zbuffer */ - offset_units *= 1.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | - S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); - break; - } - } - - si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, - pa_su_poly_offset_db_fmt_cntl); - } + struct si_screen *sscreen = ((struct si_context *)ctx)->screen; + struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); + struct si_pm4_state *pm4 = &rs->pm4; + unsigned tmp, i; + float psize_min, psize_max; + + if (!rs) { + return NULL; + } + + if (!state->front_ccw) { + rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); + } + rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; + rs->provoking_vertex_first = state->flatshade_first; + rs->scissor_enable = state->scissor; + rs->clip_halfz = state->clip_halfz; + rs->two_side = state->light_twoside; + rs->multisample_enable = state->multisample; + rs->force_persample_interp = state->force_persample_interp; + rs->clip_plane_enable = state->clip_plane_enable; + rs->half_pixel_center = state->half_pixel_center; + rs->line_stipple_enable = state->line_stipple_enable; + rs->poly_stipple_enable = state->poly_stipple_enable; + rs->line_smooth = state->line_smooth; + rs->line_width = state->line_width; + rs->poly_smooth = state->poly_smooth; + rs->uses_poly_offset = state->offset_point || state->offset_line || + state->offset_tri; + rs->clamp_fragment_color = state->clamp_fragment_color; + rs->clamp_vertex_color = state->clamp_vertex_color; + rs->flatshade = state->flatshade; + rs->flatshade_first = state->flatshade_first; + rs->sprite_coord_enable = state->sprite_coord_enable; + rs->rasterizer_discard = state->rasterizer_discard; + rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && + !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && + !(state->cull_face & PIPE_FACE_BACK)); + rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && + !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back == PIPE_POLYGON_MODE_LINE && + !(state->cull_face & PIPE_FACE_BACK)); + rs->pa_sc_line_stipple = state->line_stipple_enable ? + S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | + S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; + rs->pa_cl_clip_cntl = + S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | + S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | + S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | + S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | + S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + + si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, + S_0286D4_FLAT_SHADE_ENA(1) | + S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); + + /* point size 12.4 fixed point */ + tmp = (unsigned)(state->point_size * 8.0); + si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); + + if (state->point_size_per_vertex) { + psize_min = util_get_min_point_size(state); + psize_max = SI_MAX_POINT_SIZE; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = state->point_size; + psize_max = state->point_size; + } + rs->max_point_size = psize_max; + + /* Divide by two, because 0.5 = 1 pixel. */ + si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, + S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) | + S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2))); + + si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2))); + si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0, + S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | + S_028A48_MSAA_ENABLE(state->multisample || + state->poly_smooth || + state->line_smooth) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | + S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); + + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, + S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | + S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | + S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | + S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); + + if (!rs->uses_poly_offset) + return rs; + + rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); + if (!rs->pm4_poly_offset) { + FREE(rs); + return NULL; + } + + /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ + for (i = 0; i < 3; i++) { + struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; + float offset_units = state->offset_units; + float offset_scale = state->offset_scale * 16.0f; + uint32_t pa_su_poly_offset_db_fmt_cntl = 0; + + if (!state->offset_units_unscaled) { + switch (i) { + case 0: /* 16-bit zbuffer */ + offset_units *= 4.0f; + pa_su_poly_offset_db_fmt_cntl = + S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + break; + case 1: /* 24-bit zbuffer */ + offset_units *= 2.0f; + pa_su_poly_offset_db_fmt_cntl = + S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + break; + case 2: /* 32-bit zbuffer */ + offset_units *= 1.0f; + pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | + S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + break; + } + } + + si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, + fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, + fui(offset_units)); + si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, + fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, + fui(offset_units)); + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + pa_su_poly_offset_db_fmt_cntl); + } - return rs; + return rs; } static void si_bind_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *old_rs = - (struct si_state_rasterizer*)sctx->queued.named.rasterizer; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - - if (!rs) - rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; - - if (old_rs->multisample_enable != rs->multisample_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - /* Update the small primitive filter workaround if necessary. */ - if (sctx->screen->has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; - sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); - - si_pm4_bind_state(sctx, rasterizer, rs); - si_update_poly_offset_state(sctx); - - if (old_rs->scissor_enable != rs->scissor_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - - if (old_rs->line_width != rs->line_width || - old_rs->max_point_size != rs->max_point_size || - old_rs->half_pixel_center != rs->half_pixel_center) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - - if (old_rs->clip_halfz != rs->clip_halfz) - si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - sctx->ia_multi_vgt_param_key.u.line_stipple_enabled = - rs->line_stipple_enable; - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->rasterizer_discard != rs->rasterizer_discard || - old_rs->sprite_coord_enable != rs->sprite_coord_enable || - old_rs->flatshade != rs->flatshade || - old_rs->two_side != rs->two_side || - old_rs->multisample_enable != rs->multisample_enable || - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || - old_rs->line_smooth != rs->line_smooth || - old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *old_rs = + (struct si_state_rasterizer*)sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; + + if (!rs) + rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; + + if (old_rs->multisample_enable != rs->multisample_enable) { + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + /* Update the small primitive filter workaround if necessary. */ + if (sctx->screen->info.has_msaa_sample_loc_bug && + sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; + sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); + + si_pm4_bind_state(sctx, rasterizer, rs); + si_update_poly_offset_state(sctx); + + if (old_rs->scissor_enable != rs->scissor_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); + + if (old_rs->line_width != rs->line_width || + old_rs->max_point_size != rs->max_point_size || + old_rs->half_pixel_center != rs->half_pixel_center) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + if (old_rs->clip_halfz != rs->clip_halfz) + si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->rasterizer_discard != rs->rasterizer_discard || + old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade || + old_rs->two_side != rs->two_side || + old_rs->multisample_enable != rs->multisample_enable || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->line_smooth != rs->line_smooth || + old_rs->clamp_fragment_color != rs->clamp_fragment_color || + old_rs->force_persample_interp != rs->force_persample_interp) + sctx->do_update_shaders = true; } static void si_delete_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - if (sctx->queued.named.rasterizer == state) - si_bind_rs_state(ctx, sctx->discard_rasterizer_state); + if (sctx->queued.named.rasterizer == state) + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); - FREE(rs->pm4_poly_offset); - si_pm4_delete_state(sctx, rasterizer, rs); + FREE(rs->pm4_poly_offset); + si_pm4_delete_state(sctx, rasterizer, rs); } /* @@ -1106,31 +1110,31 @@ */ static void si_emit_stencil_ref(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; - struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; - - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | - S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; + struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; + + radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); } static void si_set_stencil_ref(struct pipe_context *ctx, - const struct pipe_stencil_ref *state) + const struct pipe_stencil_ref *state) { struct si_context *sctx = (struct si_context *)ctx; - if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) - return; + if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) + return; - sctx->stencil_ref.state = *state; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + sctx->stencil_ref.state = *state; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } @@ -1140,47 +1144,47 @@ static uint32_t si_translate_stencil_op(int s_op) { - switch (s_op) { - case PIPE_STENCIL_OP_KEEP: - return V_02842C_STENCIL_KEEP; - case PIPE_STENCIL_OP_ZERO: - return V_02842C_STENCIL_ZERO; - case PIPE_STENCIL_OP_REPLACE: - return V_02842C_STENCIL_REPLACE_TEST; - case PIPE_STENCIL_OP_INCR: - return V_02842C_STENCIL_ADD_CLAMP; - case PIPE_STENCIL_OP_DECR: - return V_02842C_STENCIL_SUB_CLAMP; - case PIPE_STENCIL_OP_INCR_WRAP: - return V_02842C_STENCIL_ADD_WRAP; - case PIPE_STENCIL_OP_DECR_WRAP: - return V_02842C_STENCIL_SUB_WRAP; - case PIPE_STENCIL_OP_INVERT: - return V_02842C_STENCIL_INVERT; - default: - PRINT_ERR("Unknown stencil op %d", s_op); - assert(0); - break; - } - return 0; + switch (s_op) { + case PIPE_STENCIL_OP_KEEP: + return V_02842C_STENCIL_KEEP; + case PIPE_STENCIL_OP_ZERO: + return V_02842C_STENCIL_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return V_02842C_STENCIL_REPLACE_TEST; + case PIPE_STENCIL_OP_INCR: + return V_02842C_STENCIL_ADD_CLAMP; + case PIPE_STENCIL_OP_DECR: + return V_02842C_STENCIL_SUB_CLAMP; + case PIPE_STENCIL_OP_INCR_WRAP: + return V_02842C_STENCIL_ADD_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return V_02842C_STENCIL_SUB_WRAP; + case PIPE_STENCIL_OP_INVERT: + return V_02842C_STENCIL_INVERT; + default: + PRINT_ERR("Unknown stencil op %d", s_op); + assert(0); + break; + } + return 0; } static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s) { - return s->enabled && s->writemask && - (s->fail_op != PIPE_STENCIL_OP_KEEP || - s->zfail_op != PIPE_STENCIL_OP_KEEP || - s->zpass_op != PIPE_STENCIL_OP_KEEP); + return s->enabled && s->writemask && + (s->fail_op != PIPE_STENCIL_OP_KEEP || + s->zfail_op != PIPE_STENCIL_OP_KEEP || + s->zpass_op != PIPE_STENCIL_OP_KEEP); } static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) { - /* REPLACE is normally order invariant, except when the stencil - * reference value is written by the fragment shader. Tracking this - * interaction does not seem worth the effort, so be conservative. */ - return op != PIPE_STENCIL_OP_INCR && - op != PIPE_STENCIL_OP_DECR && - op != PIPE_STENCIL_OP_REPLACE; + /* REPLACE is normally order invariant, except when the stencil + * reference value is written by the fragment shader. Tracking this + * interaction does not seem worth the effort, so be conservative. */ + return op != PIPE_STENCIL_OP_INCR && + op != PIPE_STENCIL_OP_DECR && + op != PIPE_STENCIL_OP_REPLACE; } /* Compute whether, assuming Z writes are disabled, this stencil state is order @@ -1188,325 +1192,320 @@ * final stencil buffer result does not depend on the order of fragments. */ static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state) { - return !state->enabled || !state->writemask || - /* The following assumes that Z writes are disabled. */ - (state->func == PIPE_FUNC_ALWAYS && - si_order_invariant_stencil_op(state->zpass_op) && - si_order_invariant_stencil_op(state->zfail_op)) || - (state->func == PIPE_FUNC_NEVER && - si_order_invariant_stencil_op(state->fail_op)); + return !state->enabled || !state->writemask || + /* The following assumes that Z writes are disabled. */ + (state->func == PIPE_FUNC_ALWAYS && + si_order_invariant_stencil_op(state->zpass_op) && + si_order_invariant_stencil_op(state->zfail_op)) || + (state->func == PIPE_FUNC_NEVER && + si_order_invariant_stencil_op(state->fail_op)); } static void *si_create_dsa_state(struct pipe_context *ctx, - const struct pipe_depth_stencil_alpha_state *state) + const struct pipe_depth_stencil_alpha_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); - struct si_pm4_state *pm4 = &dsa->pm4; - unsigned db_depth_control; - uint32_t db_stencil_control = 0; - - if (!dsa) { - return NULL; - } - - dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; - dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; - dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; - dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; - - db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | - S_028800_Z_WRITE_ENABLE(state->depth.writemask) | - S_028800_ZFUNC(state->depth.func) | - S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); - - /* stencil */ - if (state->stencil[0].enabled) { - db_depth_control |= S_028800_STENCIL_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); - db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); - - if (state->stencil[1].enabled) { - db_depth_control |= S_028800_BACKFACE_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); - db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); - } - } - - /* alpha */ - if (state->alpha.enabled) { - dsa->alpha_func = state->alpha.func; - - si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + - SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value)); - } else { - dsa->alpha_func = PIPE_FUNC_ALWAYS; - } - - si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); - if (state->stencil[0].enabled) - si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); - if (state->depth.bounds_test) { - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); - } - - dsa->depth_enabled = state->depth.enabled; - dsa->depth_write_enabled = state->depth.enabled && - state->depth.writemask; - dsa->stencil_enabled = state->stencil[0].enabled; - dsa->stencil_write_enabled = state->stencil[0].enabled && - (si_dsa_writes_stencil(&state->stencil[0]) || - si_dsa_writes_stencil(&state->stencil[1])); - dsa->db_can_write = dsa->depth_write_enabled || - dsa->stencil_write_enabled; - - bool zfunc_is_ordered = - state->depth.func == PIPE_FUNC_NEVER || - state->depth.func == PIPE_FUNC_LESS || - state->depth.func == PIPE_FUNC_LEQUAL || - state->depth.func == PIPE_FUNC_GREATER || - state->depth.func == PIPE_FUNC_GEQUAL; - - bool nozwrite_and_order_invariant_stencil = - !dsa->db_can_write || - (!dsa->depth_write_enabled && - si_order_invariant_stencil_state(&state->stencil[0]) && - si_order_invariant_stencil_state(&state->stencil[1])); - - dsa->order_invariance[1].zs = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && zfunc_is_ordered); - dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; - - dsa->order_invariance[1].pass_set = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER)); - dsa->order_invariance[0].pass_set = - !dsa->depth_write_enabled || - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER); - - dsa->order_invariance[1].pass_last = - sctx->screen->assume_no_z_fights && - !dsa->stencil_write_enabled && - dsa->depth_write_enabled && zfunc_is_ordered; - dsa->order_invariance[0].pass_last = - sctx->screen->assume_no_z_fights && - dsa->depth_write_enabled && zfunc_is_ordered; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); + struct si_pm4_state *pm4 = &dsa->pm4; + unsigned db_depth_control; + uint32_t db_stencil_control = 0; + + if (!dsa) { + return NULL; + } + + dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; + dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; + dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; + dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; + + db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | + S_028800_Z_WRITE_ENABLE(state->depth.writemask) | + S_028800_ZFUNC(state->depth.func) | + S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); + + /* stencil */ + if (state->stencil[0].enabled) { + db_depth_control |= S_028800_STENCIL_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); + db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); + db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); + db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); + + if (state->stencil[1].enabled) { + db_depth_control |= S_028800_BACKFACE_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); + db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); + db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); + db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); + } + } + + /* alpha */ + if (state->alpha.enabled) { + dsa->alpha_func = state->alpha.func; + + si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + + SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value)); + } else { + dsa->alpha_func = PIPE_FUNC_ALWAYS; + } + + si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); + if (state->stencil[0].enabled) + si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); + if (state->depth.bounds_test) { + si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); + si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); + } + + dsa->depth_enabled = state->depth.enabled; + dsa->depth_write_enabled = state->depth.enabled && + state->depth.writemask; + dsa->stencil_enabled = state->stencil[0].enabled; + dsa->stencil_write_enabled = state->stencil[0].enabled && + (si_dsa_writes_stencil(&state->stencil[0]) || + si_dsa_writes_stencil(&state->stencil[1])); + dsa->db_can_write = dsa->depth_write_enabled || + dsa->stencil_write_enabled; + + bool zfunc_is_ordered = + state->depth.func == PIPE_FUNC_NEVER || + state->depth.func == PIPE_FUNC_LESS || + state->depth.func == PIPE_FUNC_LEQUAL || + state->depth.func == PIPE_FUNC_GREATER || + state->depth.func == PIPE_FUNC_GEQUAL; + + bool nozwrite_and_order_invariant_stencil = + !dsa->db_can_write || + (!dsa->depth_write_enabled && + si_order_invariant_stencil_state(&state->stencil[0]) && + si_order_invariant_stencil_state(&state->stencil[1])); + + dsa->order_invariance[1].zs = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && zfunc_is_ordered); + dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; + + dsa->order_invariance[1].pass_set = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && + (state->depth.func == PIPE_FUNC_ALWAYS || + state->depth.func == PIPE_FUNC_NEVER)); + dsa->order_invariance[0].pass_set = + !dsa->depth_write_enabled || + (state->depth.func == PIPE_FUNC_ALWAYS || + state->depth.func == PIPE_FUNC_NEVER); + + dsa->order_invariance[1].pass_last = + sctx->screen->assume_no_z_fights && + !dsa->stencil_write_enabled && + dsa->depth_write_enabled && zfunc_is_ordered; + dsa->order_invariance[0].pass_last = + sctx->screen->assume_no_z_fights && + dsa->depth_write_enabled && zfunc_is_ordered; - return dsa; + return dsa; } static void si_bind_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *old_dsa = sctx->queued.named.dsa; + struct si_state_dsa *old_dsa = sctx->queued.named.dsa; struct si_state_dsa *dsa = state; if (!dsa) dsa = (struct si_state_dsa *)sctx->noop_dsa; - si_pm4_bind_state(sctx, dsa, dsa); + si_pm4_bind_state(sctx, dsa, dsa); - if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, - sizeof(struct si_dsa_stencil_ref_part)) != 0) { - sctx->stencil_ref.dsa_part = dsa->stencil_ref; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); - } - - if (old_dsa->alpha_func != dsa->alpha_func) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - ((old_dsa->depth_enabled != dsa->depth_enabled || - old_dsa->stencil_enabled != dsa->stencil_enabled || - old_dsa->db_can_write != dsa->db_can_write))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - (memcmp(old_dsa->order_invariance, dsa->order_invariance, - sizeof(old_dsa->order_invariance)))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, + sizeof(struct si_dsa_stencil_ref_part)) != 0) { + sctx->stencil_ref.dsa_part = dsa->stencil_ref; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + } + + if (old_dsa->alpha_func != dsa->alpha_func) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && + ((old_dsa->depth_enabled != dsa->depth_enabled || + old_dsa->stencil_enabled != dsa->stencil_enabled || + old_dsa->db_can_write != dsa->db_can_write))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + (memcmp(old_dsa->order_invariance, dsa->order_invariance, + sizeof(old_dsa->order_invariance)))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.dsa == state) - si_bind_dsa_state(ctx, sctx->noop_dsa); + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); - si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } static void *si_create_db_flush_dsa(struct si_context *sctx) { - struct pipe_depth_stencil_alpha_state dsa = {}; + struct pipe_depth_stencil_alpha_state dsa = {}; - return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); + return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); } /* DB RENDER STATE */ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) { - struct si_context *sctx = (struct si_context*)ctx; + struct si_context *sctx = (struct si_context*)ctx; - /* Pipeline stat & streamout queries. */ - if (enable) { - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; - } else { - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; - } - - /* Occlusion queries. */ - if (sctx->occlusion_queries_disabled != !enable) { - sctx->occlusion_queries_disabled = !enable; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + /* Pipeline stat & streamout queries. */ + if (enable) { + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + } else { + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + } + + /* Occlusion queries. */ + if (sctx->occlusion_queries_disabled != !enable) { + sctx->occlusion_queries_disabled = !enable; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } } void si_set_occlusion_query_state(struct si_context *sctx, - bool old_perfect_enable) + bool old_perfect_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; + bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; - if (perfect_enable != old_perfect_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (perfect_enable != old_perfect_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - st->saved_compute = sctx->cs_shader_state.program; + st->saved_compute = sctx->cs_shader_state.program; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); - st->saved_ssbo_writable_mask = 0; + st->saved_ssbo_writable_mask = 0; - for (unsigned i = 0; i < 3; i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - st->saved_ssbo_writable_mask |= 1 << i; - } + for (unsigned i = 0; i < 3; i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + st->saved_ssbo_writable_mask |= 1 << i; + } } void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - sctx->b.bind_compute_state(&sctx->b, st->saved_compute); + sctx->b.bind_compute_state(&sctx->b, st->saved_compute); - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - pipe_resource_reference(&st->saved_const0.buffer, NULL); + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, - st->saved_ssbo_writable_mask); - for (unsigned i = 0; i < 3; ++i) - pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, + st->saved_ssbo_writable_mask); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); } static void si_emit_db_render_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned db_shader_control, db_render_control, db_count_control; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - /* DB_RENDER_CONTROL */ - if (sctx->dbcb_depth_copy_enabled || - sctx->dbcb_stencil_copy_enabled) { - db_render_control = - S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | - S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | - S_028000_COPY_CENTROID(1) | - S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); - } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { - db_render_control = - S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | - S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); - } else { - db_render_control = - S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | - S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); - } - - /* DB_COUNT_CONTROL (occlusion queries) */ - if (sctx->num_occlusion_queries > 0 && - !sctx->occlusion_queries_disabled) { - bool perfect = sctx->num_perfect_occlusion_queries > 0; - bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; - - if (sctx->chip_class >= GFX7) { - unsigned log_sample_rate = sctx->framebuffer.log_samples; - - /* Stoney doesn't increment occlusion query counters - * if the sample rate is 16x. Use 8x sample rate instead. - */ - if (sctx->family == CHIP_STONEY) - log_sample_rate = MIN2(log_sample_rate, 3); - - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | - S_028004_SAMPLE_RATE(log_sample_rate) | - S_028004_ZPASS_ENABLE(1) | - S_028004_SLICE_EVEN_ENABLE(1) | - S_028004_SLICE_ODD_ENABLE(1); - } else { - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); - } - } else { - /* Disable occlusion queries. */ - if (sctx->chip_class >= GFX7) { - db_count_control = 0; - } else { - db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); - } - } - - radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, - SI_TRACKED_DB_RENDER_CONTROL, db_render_control, - db_count_control); - - /* DB_RENDER_OVERRIDE2 */ - radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_RENDER_OVERRIDE2, - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | - S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); - - db_shader_control = sctx->ps_db_shader_control; - - /* Bug workaround for smoothing (overrasterization) on GFX6. */ - if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { - db_shader_control &= C_02880C_Z_ORDER; - db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); - } - - /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */ - if (!rs->multisample_enable) - db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; - - if (sctx->screen->has_rbplus && - !sctx->screen->rbplus_allowed) - db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned db_shader_control, db_render_control, db_count_control; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + /* DB_RENDER_CONTROL */ + if (sctx->dbcb_depth_copy_enabled || + sctx->dbcb_stencil_copy_enabled) { + db_render_control = + S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | + S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | + S_028000_COPY_CENTROID(1) | + S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); + } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { + db_render_control = + S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | + S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); + } else { + db_render_control = + S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); + } + + /* DB_COUNT_CONTROL (occlusion queries) */ + if (sctx->num_occlusion_queries > 0 && + !sctx->occlusion_queries_disabled) { + bool perfect = sctx->num_perfect_occlusion_queries > 0; + bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; + + if (sctx->chip_class >= GFX7) { + unsigned log_sample_rate = sctx->framebuffer.log_samples; + + + db_count_control = + S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | + S_028004_SAMPLE_RATE(log_sample_rate) | + S_028004_ZPASS_ENABLE(1) | + S_028004_SLICE_EVEN_ENABLE(1) | + S_028004_SLICE_ODD_ENABLE(1); + } else { + db_count_control = + S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); + } + } else { + /* Disable occlusion queries. */ + if (sctx->chip_class >= GFX7) { + db_count_control = 0; + } else { + db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); + } + } + + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, + SI_TRACKED_DB_RENDER_CONTROL, db_render_control, + db_count_control); + + /* DB_RENDER_OVERRIDE2 */ + radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, + SI_TRACKED_DB_RENDER_OVERRIDE2, + S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | + S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); + + db_shader_control = sctx->ps_db_shader_control; + + /* Bug workaround for smoothing (overrasterization) on GFX6. */ + if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { + db_shader_control &= C_02880C_Z_ORDER; + db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); + } + + /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */ + if (!rs->multisample_enable) + db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; + + if (sctx->screen->info.has_rbplus && + !sctx->screen->info.rbplus_allowed) + db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); - radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, - SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, + SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -1514,143 +1513,143 @@ */ static uint32_t si_translate_colorformat(enum pipe_format format) { - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return V_028C70_COLOR_INVALID; + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return V_028C70_COLOR_INVALID; #define HAS_SIZE(x,y,z,w) \ - (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ + (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ desc->channel[2].size == (z) && desc->channel[3].size == (w)) - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_COLOR_10_11_11; + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_COLOR_10_11_11; - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return V_028C70_COLOR_INVALID; + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return V_028C70_COLOR_INVALID; - /* hw cannot support mixed formats (except depth/stencil, since - * stencil is not written to). */ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - return V_028C70_COLOR_INVALID; - - switch (desc->nr_channels) { - case 1: - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8; - case 16: - return V_028C70_COLOR_16; - case 32: - return V_028C70_COLOR_32; - } - break; - case 2: - if (desc->channel[0].size == desc->channel[1].size) { - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8_8; - case 16: - return V_028C70_COLOR_16_16; - case 32: - return V_028C70_COLOR_32_32; - } - } else if (HAS_SIZE(8,24,0,0)) { - return V_028C70_COLOR_24_8; - } else if (HAS_SIZE(24,8,0,0)) { - return V_028C70_COLOR_8_24; - } - break; - case 3: - if (HAS_SIZE(5,6,5,0)) { - return V_028C70_COLOR_5_6_5; - } else if (HAS_SIZE(32,8,24,0)) { - return V_028C70_COLOR_X24_8_32_FLOAT; - } - break; - case 4: - if (desc->channel[0].size == desc->channel[1].size && - desc->channel[0].size == desc->channel[2].size && - desc->channel[0].size == desc->channel[3].size) { - switch (desc->channel[0].size) { - case 4: - return V_028C70_COLOR_4_4_4_4; - case 8: - return V_028C70_COLOR_8_8_8_8; - case 16: - return V_028C70_COLOR_16_16_16_16; - case 32: - return V_028C70_COLOR_32_32_32_32; - } - } else if (HAS_SIZE(5,5,5,1)) { - return V_028C70_COLOR_1_5_5_5; - } else if (HAS_SIZE(1,5,5,5)) { - return V_028C70_COLOR_5_5_5_1; - } else if (HAS_SIZE(10,10,10,2)) { - return V_028C70_COLOR_2_10_10_10; - } - break; - } - return V_028C70_COLOR_INVALID; + /* hw cannot support mixed formats (except depth/stencil, since + * stencil is not written to). */ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + return V_028C70_COLOR_INVALID; + + switch (desc->nr_channels) { + case 1: + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8; + case 16: + return V_028C70_COLOR_16; + case 32: + return V_028C70_COLOR_32; + } + break; + case 2: + if (desc->channel[0].size == desc->channel[1].size) { + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8_8; + case 16: + return V_028C70_COLOR_16_16; + case 32: + return V_028C70_COLOR_32_32; + } + } else if (HAS_SIZE(8,24,0,0)) { + return V_028C70_COLOR_24_8; + } else if (HAS_SIZE(24,8,0,0)) { + return V_028C70_COLOR_8_24; + } + break; + case 3: + if (HAS_SIZE(5,6,5,0)) { + return V_028C70_COLOR_5_6_5; + } else if (HAS_SIZE(32,8,24,0)) { + return V_028C70_COLOR_X24_8_32_FLOAT; + } + break; + case 4: + if (desc->channel[0].size == desc->channel[1].size && + desc->channel[0].size == desc->channel[2].size && + desc->channel[0].size == desc->channel[3].size) { + switch (desc->channel[0].size) { + case 4: + return V_028C70_COLOR_4_4_4_4; + case 8: + return V_028C70_COLOR_8_8_8_8; + case 16: + return V_028C70_COLOR_16_16_16_16; + case 32: + return V_028C70_COLOR_32_32_32_32; + } + } else if (HAS_SIZE(5,5,5,1)) { + return V_028C70_COLOR_1_5_5_5; + } else if (HAS_SIZE(1,5,5,5)) { + return V_028C70_COLOR_5_5_5_1; + } else if (HAS_SIZE(10,10,10,2)) { + return V_028C70_COLOR_2_10_10_10; + } + break; + } + return V_028C70_COLOR_INVALID; } static uint32_t si_colorformat_endian_swap(uint32_t colorformat) { - if (SI_BIG_ENDIAN) { - switch(colorformat) { - /* 8-bit buffers. */ - case V_028C70_COLOR_8: - return V_028C70_ENDIAN_NONE; - - /* 16-bit buffers. */ - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_16: - case V_028C70_COLOR_8_8: - return V_028C70_ENDIAN_8IN16; - - /* 32-bit buffers. */ - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_2_10_10_10: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_16_16: - return V_028C70_ENDIAN_8IN32; - - /* 64-bit buffers. */ - case V_028C70_COLOR_16_16_16_16: - return V_028C70_ENDIAN_8IN16; - - case V_028C70_COLOR_32_32: - return V_028C70_ENDIAN_8IN32; - - /* 128-bit buffers. */ - case V_028C70_COLOR_32_32_32_32: - return V_028C70_ENDIAN_8IN32; - default: - return V_028C70_ENDIAN_NONE; /* Unsupported. */ - } - } else { - return V_028C70_ENDIAN_NONE; - } + if (SI_BIG_ENDIAN) { + switch(colorformat) { + /* 8-bit buffers. */ + case V_028C70_COLOR_8: + return V_028C70_ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return V_028C70_ENDIAN_8IN16; + + /* 32-bit buffers. */ + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_16_16: + return V_028C70_ENDIAN_8IN32; + + /* 64-bit buffers. */ + case V_028C70_COLOR_16_16_16_16: + return V_028C70_ENDIAN_8IN16; + + case V_028C70_COLOR_32_32: + return V_028C70_ENDIAN_8IN32; + + /* 128-bit buffers. */ + case V_028C70_COLOR_32_32_32_32: + return V_028C70_ENDIAN_8IN32; + default: + return V_028C70_ENDIAN_NONE; /* Unsupported. */ + } + } else { + return V_028C70_ENDIAN_NONE; + } } static uint32_t si_translate_dbformat(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_028040_Z_16; - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_028040_Z_24; /* deprecated on AMD GCN */ - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_028040_Z_32_FLOAT; - default: - return V_028040_Z_INVALID; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_028040_Z_16; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028040_Z_24; /* deprecated on AMD GCN */ + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_028040_Z_32_FLOAT; + default: + return V_028040_Z_INVALID; + } } /* @@ -1658,370 +1657,370 @@ */ static uint32_t si_translate_texformat(struct pipe_screen *screen, - enum pipe_format format, - const struct util_format_description *desc, - int first_non_void) -{ - struct si_screen *sscreen = (struct si_screen*)screen; - bool uniform = true; - int i; - - assert(sscreen->info.chip_class <= GFX9); - - /* Colorspace (return non-RGB formats directly). */ - switch (desc->colorspace) { - /* Depth stencil formats */ - case UTIL_FORMAT_COLORSPACE_ZS: - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_008F14_IMG_DATA_FORMAT_16; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - /* - * Implemented as an 8_8_8_8 data format to fix texture - * gathers in stencil sampling. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. - */ - if (sscreen->info.chip_class <= GFX8) - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - - if (format == PIPE_FORMAT_X24S8_UINT) - return V_008F14_IMG_DATA_FORMAT_8_24; - else - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8_24; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8; - case PIPE_FORMAT_Z32_FLOAT: - return V_008F14_IMG_DATA_FORMAT_32; - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_008F14_IMG_DATA_FORMAT_X24_8_32; - default: - goto out_unknown; - } - - case UTIL_FORMAT_COLORSPACE_YUV: - goto out_unknown; /* TODO */ - - case UTIL_FORMAT_COLORSPACE_SRGB: - if (desc->nr_channels != 4 && desc->nr_channels != 1) - goto out_unknown; - break; - - default: - break; - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC1_UNORM: - case PIPE_FORMAT_LATC1_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC4; - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_RGTC2_UNORM: - case PIPE_FORMAT_LATC2_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC5; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && - (sscreen->info.family == CHIP_STONEY || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN)) { - switch (format) { - case PIPE_FORMAT_ETC1_RGB8: - case PIPE_FORMAT_ETC2_RGB8: - case PIPE_FORMAT_ETC2_SRGB8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; - case PIPE_FORMAT_ETC2_RGB8A1: - case PIPE_FORMAT_ETC2_SRGB8A1: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; - case PIPE_FORMAT_ETC2_RGBA8: - case PIPE_FORMAT_ETC2_SRGBA8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; - case PIPE_FORMAT_ETC2_R11_UNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_R; - case PIPE_FORMAT_ETC2_RG11_UNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_BPTC_RGBA_UNORM: - case PIPE_FORMAT_BPTC_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC7; - case PIPE_FORMAT_BPTC_RGB_FLOAT: - case PIPE_FORMAT_BPTC_RGB_UFLOAT: - return V_008F14_IMG_DATA_FORMAT_BC6; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - switch (format) { - case PIPE_FORMAT_R8G8_B8G8_UNORM: - case PIPE_FORMAT_G8R8_B8R8_UNORM: - return V_008F14_IMG_DATA_FORMAT_GB_GR; - case PIPE_FORMAT_G8R8_G8B8_UNORM: - case PIPE_FORMAT_R8G8_R8B8_UNORM: - return V_008F14_IMG_DATA_FORMAT_BG_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC1; - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT3_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC2; - case PIPE_FORMAT_DXT5_RGBA: - case PIPE_FORMAT_DXT5_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC3; - default: - goto out_unknown; - } - } - - if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_5_9_9_9; - } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_10_11_11; - } - - /* R8G8Bx_SNORM - TODO CxV8U8 */ - - /* hw cannot support mixed formats (except depth/stencil, since only - * depth is read).*/ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - goto out_unknown; - - /* See whether the components are of the same size. */ - for (i = 1; i < desc->nr_channels; i++) { - uniform = uniform && desc->channel[0].size == desc->channel[i].size; - } - - /* Non-uniform formats. */ - if (!uniform) { - switch(desc->nr_channels) { - case 3: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 6 && - desc->channel[2].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_6_5; - } - goto out_unknown; - case 4: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 1) { - return V_008F14_IMG_DATA_FORMAT_1_5_5_5; - } - if (desc->channel[0].size == 1 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_5_5_1; - } - if (desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) { - return V_008F14_IMG_DATA_FORMAT_2_10_10_10; - } - goto out_unknown; - } - goto out_unknown; - } - - if (first_non_void < 0 || first_non_void > 3) - goto out_unknown; - - /* uniform formats */ - switch (desc->channel[first_non_void].size) { - case 4: - switch (desc->nr_channels) { + enum pipe_format format, + const struct util_format_description *desc, + int first_non_void) +{ + struct si_screen *sscreen = (struct si_screen*)screen; + bool uniform = true; + int i; + + assert(sscreen->info.chip_class <= GFX9); + + /* Colorspace (return non-RGB formats directly). */ + switch (desc->colorspace) { + /* Depth stencil formats */ + case UTIL_FORMAT_COLORSPACE_ZS: + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_008F14_IMG_DATA_FORMAT_16; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + /* + * Implemented as an 8_8_8_8 data format to fix texture + * gathers in stencil sampling. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. + */ + if (sscreen->info.chip_class <= GFX8) + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + + if (format == PIPE_FORMAT_X24S8_UINT) + return V_008F14_IMG_DATA_FORMAT_8_24; + else + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8_24; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8; + case PIPE_FORMAT_Z32_FLOAT: + return V_008F14_IMG_DATA_FORMAT_32; + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_008F14_IMG_DATA_FORMAT_X24_8_32; + default: + goto out_unknown; + } + + case UTIL_FORMAT_COLORSPACE_YUV: + goto out_unknown; /* TODO */ + + case UTIL_FORMAT_COLORSPACE_SRGB: + if (desc->nr_channels != 4 && desc->nr_channels != 1) + goto out_unknown; + break; + + default: + break; + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC1_UNORM: + case PIPE_FORMAT_LATC1_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC4; + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_RGTC2_UNORM: + case PIPE_FORMAT_LATC2_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC5; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && + (sscreen->info.family == CHIP_STONEY || + sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN)) { + switch (format) { + case PIPE_FORMAT_ETC1_RGB8: + case PIPE_FORMAT_ETC2_RGB8: + case PIPE_FORMAT_ETC2_SRGB8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; + case PIPE_FORMAT_ETC2_RGB8A1: + case PIPE_FORMAT_ETC2_SRGB8A1: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; + case PIPE_FORMAT_ETC2_RGBA8: + case PIPE_FORMAT_ETC2_SRGBA8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; + case PIPE_FORMAT_ETC2_R11_UNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_R; + case PIPE_FORMAT_ETC2_RG11_UNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_BPTC_RGBA_UNORM: + case PIPE_FORMAT_BPTC_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC7; + case PIPE_FORMAT_BPTC_RGB_FLOAT: + case PIPE_FORMAT_BPTC_RGB_UFLOAT: + return V_008F14_IMG_DATA_FORMAT_BC6; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + switch (format) { + case PIPE_FORMAT_R8G8_B8G8_UNORM: + case PIPE_FORMAT_G8R8_B8R8_UNORM: + return V_008F14_IMG_DATA_FORMAT_GB_GR; + case PIPE_FORMAT_G8R8_G8B8_UNORM: + case PIPE_FORMAT_R8G8_R8B8_UNORM: + return V_008F14_IMG_DATA_FORMAT_BG_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC1; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC2; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC3; + default: + goto out_unknown; + } + } + + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_5_9_9_9; + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_10_11_11; + } + + /* R8G8Bx_SNORM - TODO CxV8U8 */ + + /* hw cannot support mixed formats (except depth/stencil, since only + * depth is read).*/ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + goto out_unknown; + + /* See whether the components are of the same size. */ + for (i = 1; i < desc->nr_channels; i++) { + uniform = uniform && desc->channel[0].size == desc->channel[i].size; + } + + /* Non-uniform formats. */ + if (!uniform) { + switch(desc->nr_channels) { + case 3: + if (desc->channel[0].size == 5 && + desc->channel[1].size == 6 && + desc->channel[2].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_6_5; + } + goto out_unknown; + case 4: + if (desc->channel[0].size == 5 && + desc->channel[1].size == 5 && + desc->channel[2].size == 5 && + desc->channel[3].size == 1) { + return V_008F14_IMG_DATA_FORMAT_1_5_5_5; + } + if (desc->channel[0].size == 1 && + desc->channel[1].size == 5 && + desc->channel[2].size == 5 && + desc->channel[3].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_5_5_1; + } + if (desc->channel[0].size == 10 && + desc->channel[1].size == 10 && + desc->channel[2].size == 10 && + desc->channel[3].size == 2) { + return V_008F14_IMG_DATA_FORMAT_2_10_10_10; + } + goto out_unknown; + } + goto out_unknown; + } + + if (first_non_void < 0 || first_non_void > 3) + goto out_unknown; + + /* uniform formats */ + switch (desc->channel[first_non_void].size) { + case 4: + switch (desc->nr_channels) { #if 0 /* Not supported for render targets */ - case 2: - return V_008F14_IMG_DATA_FORMAT_4_4; + case 2: + return V_008F14_IMG_DATA_FORMAT_4_4; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_4_4_4_4; - } - break; - case 8: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_8; - case 2: - return V_008F14_IMG_DATA_FORMAT_8_8; - case 4: - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_16; - case 2: - return V_008F14_IMG_DATA_FORMAT_16_16; - case 4: - return V_008F14_IMG_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_32; - case 2: - return V_008F14_IMG_DATA_FORMAT_32_32; + case 4: + return V_008F14_IMG_DATA_FORMAT_4_4_4_4; + } + break; + case 8: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_8; + case 2: + return V_008F14_IMG_DATA_FORMAT_8_8; + case 4: + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_16; + case 2: + return V_008F14_IMG_DATA_FORMAT_16_16; + case 4: + return V_008F14_IMG_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_32; + case 2: + return V_008F14_IMG_DATA_FORMAT_32_32; #if 0 /* Not supported for render targets */ - case 3: - return V_008F14_IMG_DATA_FORMAT_32_32_32; + case 3: + return V_008F14_IMG_DATA_FORMAT_32_32_32; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_32_32_32_32; - } - } + case 4: + return V_008F14_IMG_DATA_FORMAT_32_32_32_32; + } + } out_unknown: - return ~0; + return ~0; } static unsigned si_tex_wrap(unsigned wrap) { - switch (wrap) { - default: - case PIPE_TEX_WRAP_REPEAT: - return V_008F30_SQ_TEX_WRAP; - case PIPE_TEX_WRAP_CLAMP: - return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return V_008F30_SQ_TEX_CLAMP_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return V_008F30_SQ_TEX_MIRROR; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; - } + switch (wrap) { + default: + case PIPE_TEX_WRAP_REPEAT: + return V_008F30_SQ_TEX_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return V_008F30_SQ_TEX_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; + } } static unsigned si_tex_mipfilter(unsigned filter) { - switch (filter) { - case PIPE_TEX_MIPFILTER_NEAREST: - return V_008F38_SQ_TEX_Z_FILTER_POINT; - case PIPE_TEX_MIPFILTER_LINEAR: - return V_008F38_SQ_TEX_Z_FILTER_LINEAR; - default: - case PIPE_TEX_MIPFILTER_NONE: - return V_008F38_SQ_TEX_Z_FILTER_NONE; - } + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + return V_008F38_SQ_TEX_Z_FILTER_POINT; + case PIPE_TEX_MIPFILTER_LINEAR: + return V_008F38_SQ_TEX_Z_FILTER_LINEAR; + default: + case PIPE_TEX_MIPFILTER_NONE: + return V_008F38_SQ_TEX_Z_FILTER_NONE; + } } static unsigned si_tex_compare(unsigned compare) { - switch (compare) { - default: - case PIPE_FUNC_NEVER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; - case PIPE_FUNC_LESS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; - case PIPE_FUNC_EQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; - case PIPE_FUNC_LEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; - case PIPE_FUNC_GREATER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; - case PIPE_FUNC_NOTEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; - case PIPE_FUNC_GEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; - case PIPE_FUNC_ALWAYS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; - } + switch (compare) { + default: + case PIPE_FUNC_NEVER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + case PIPE_FUNC_LESS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; + case PIPE_FUNC_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; + case PIPE_FUNC_LEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; + case PIPE_FUNC_GREATER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; + case PIPE_FUNC_NOTEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; + case PIPE_FUNC_ALWAYS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; + } } static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, - unsigned view_target, unsigned nr_samples) + unsigned view_target, unsigned nr_samples) { - unsigned res_target = tex->buffer.b.b.target; + unsigned res_target = tex->buffer.b.b.target; - if (view_target == PIPE_TEXTURE_CUBE || - view_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = view_target; - /* If interpreting cubemaps as something else, set 2D_ARRAY. */ - else if (res_target == PIPE_TEXTURE_CUBE || - res_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = PIPE_TEXTURE_2D_ARRAY; - - /* GFX9 allocates 1D textures as 2D. */ - if ((res_target == PIPE_TEXTURE_1D || - res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class == GFX9 && - tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { - if (res_target == PIPE_TEXTURE_1D) - res_target = PIPE_TEXTURE_2D; - else - res_target = PIPE_TEXTURE_2D_ARRAY; - } - - switch (res_target) { - default: - case PIPE_TEXTURE_1D: - return V_008F1C_SQ_RSRC_IMG_1D; - case PIPE_TEXTURE_1D_ARRAY: - return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : - V_008F1C_SQ_RSRC_IMG_2D; - case PIPE_TEXTURE_2D_ARRAY: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : - V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - case PIPE_TEXTURE_3D: - return V_008F1C_SQ_RSRC_IMG_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return V_008F1C_SQ_RSRC_IMG_CUBE; - } + if (view_target == PIPE_TEXTURE_CUBE || + view_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = view_target; + /* If interpreting cubemaps as something else, set 2D_ARRAY. */ + else if (res_target == PIPE_TEXTURE_CUBE || + res_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = PIPE_TEXTURE_2D_ARRAY; + + /* GFX9 allocates 1D textures as 2D. */ + if ((res_target == PIPE_TEXTURE_1D || + res_target == PIPE_TEXTURE_1D_ARRAY) && + sscreen->info.chip_class == GFX9 && + tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { + if (res_target == PIPE_TEXTURE_1D) + res_target = PIPE_TEXTURE_2D; + else + res_target = PIPE_TEXTURE_2D_ARRAY; + } + + switch (res_target) { + default: + case PIPE_TEXTURE_1D: + return V_008F1C_SQ_RSRC_IMG_1D; + case PIPE_TEXTURE_1D_ARRAY: + return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : + V_008F1C_SQ_RSRC_IMG_2D; + case PIPE_TEXTURE_2D_ARRAY: + return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : + V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + case PIPE_TEXTURE_3D: + return V_008F1C_SQ_RSRC_IMG_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return V_008F1C_SQ_RSRC_IMG_CUBE; + } } /* @@ -2030,292 +2029,293 @@ static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format) { - struct si_screen *sscreen = (struct si_screen *)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->buffers_only) - return false; - return true; - } - - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return false; + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->buffers_only) + return false; + return true; + } + + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return false; - return si_translate_texformat(screen, format, desc, - util_format_get_first_non_void_channel(format)) != ~0U; + return si_translate_texformat(screen, format, desc, + util_format_get_first_non_void_channel(format)) != ~0U; } static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - int i; + int i; - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_DATA_FORMAT_10_11_11; - - assert(first_non_void >= 0); - - if (desc->nr_channels == 4 && - desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) - return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; - - /* See whether the components are of the same size. */ - for (i = 0; i < desc->nr_channels; i++) { - if (desc->channel[first_non_void].size != desc->channel[i].size) - return V_008F0C_BUF_DATA_FORMAT_INVALID; - } - - switch (desc->channel[first_non_void].size) { - case 8: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_8; - case 2: - return V_008F0C_BUF_DATA_FORMAT_8_8; - case 4: - return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_16; - case 2: - return V_008F0C_BUF_DATA_FORMAT_16_16; - case 4: - return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F0C_BUF_DATA_FORMAT_32; - case 2: - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 3: - return V_008F0C_BUF_DATA_FORMAT_32_32_32; - case 4: - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - case 64: - /* Legacy double formats. */ - switch (desc->nr_channels) { - case 1: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 2: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 4: /* 2 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - } + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_DATA_FORMAT_10_11_11; + + assert(first_non_void >= 0); + + if (desc->nr_channels == 4 && + desc->channel[0].size == 10 && + desc->channel[1].size == 10 && + desc->channel[2].size == 10 && + desc->channel[3].size == 2) + return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; + + /* See whether the components are of the same size. */ + for (i = 0; i < desc->nr_channels; i++) { + if (desc->channel[first_non_void].size != desc->channel[i].size) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + } + + switch (desc->channel[first_non_void].size) { + case 8: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_8; + case 2: + return V_008F0C_BUF_DATA_FORMAT_8_8; + case 4: + return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_16; + case 2: + return V_008F0C_BUF_DATA_FORMAT_16_16; + case 4: + return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_32; + case 2: + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 3: + return V_008F0C_BUF_DATA_FORMAT_32_32_32; + case 4: + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + case 64: + /* Legacy double formats. */ + switch (desc->nr_channels) { + case 1: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 2: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 4: /* 2 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + } - return V_008F0C_BUF_DATA_FORMAT_INVALID; + return V_008F0C_BUF_DATA_FORMAT_INVALID; } static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_NUM_FORMAT_FLOAT; + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_NUM_FORMAT_FLOAT; - assert(first_non_void >= 0); - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_SIGNED: - case UTIL_FORMAT_TYPE_FIXED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_SINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_SNORM; - else - return V_008F0C_BUF_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_UINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_UNORM; - else - return V_008F0C_BUF_NUM_FORMAT_USCALED; - break; - case UTIL_FORMAT_TYPE_FLOAT: - default: - return V_008F0C_BUF_NUM_FORMAT_FLOAT; - } + assert(first_non_void >= 0); + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_SIGNED: + case UTIL_FORMAT_TYPE_FIXED: + if (desc->channel[first_non_void].size >= 32 || + desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_SINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_SNORM; + else + return V_008F0C_BUF_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].size >= 32 || + desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_UINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_UNORM; + else + return V_008F0C_BUF_NUM_FORMAT_USCALED; + break; + case UTIL_FORMAT_TYPE_FLOAT: + default: + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + } } static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, - enum pipe_format format, - unsigned usage) + enum pipe_format format, + unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; - const struct util_format_description *desc; - int first_non_void; - unsigned data_format; - - assert((usage & ~(PIPE_BIND_SHADER_IMAGE | - PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_VERTEX_BUFFER)) == 0); - - desc = util_format_description(format); - if (!desc) - return 0; - - /* There are no native 8_8_8 or 16_16_16 data formats, and we currently - * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well - * for read-only access (with caveats surrounding bounds checks), but - * obviously fails for write access which we have to implement for - * shader images. Luckily, OpenGL doesn't expect this to be supported - * anyway, and so the only impact is on PBO uploads / downloads, which - * shouldn't be expected to be fast for GL_RGB anyway. - */ - if (desc->block.bits == 3 * 8 || - desc->block.bits == 3 * 16) { - if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { - usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); - if (!usage) - return 0; - } - } - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->img_format >= 128) - return 0; - return usage; - } - - first_non_void = util_format_get_first_non_void_channel(format); - data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); - if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) - return 0; + struct si_screen *sscreen = (struct si_screen *)screen; + const struct util_format_description *desc; + int first_non_void; + unsigned data_format; + + assert((usage & ~(PIPE_BIND_SHADER_IMAGE | + PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_VERTEX_BUFFER)) == 0); + + desc = util_format_description(format); + if (!desc) + return 0; + + /* There are no native 8_8_8 or 16_16_16 data formats, and we currently + * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well + * for read-only access (with caveats surrounding bounds checks), but + * obviously fails for write access which we have to implement for + * shader images. Luckily, OpenGL doesn't expect this to be supported + * anyway, and so the only impact is on PBO uploads / downloads, which + * shouldn't be expected to be fast for GL_RGB anyway. + */ + if (desc->block.bits == 3 * 8 || + desc->block.bits == 3 * 16) { + if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { + usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); + if (!usage) + return 0; + } + } + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->img_format >= 128) + return 0; + return usage; + } + + first_non_void = util_format_get_first_non_void_channel(format); + data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); + if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) + return 0; - return usage; + return usage; } static bool si_is_colorbuffer_format_supported(enum pipe_format format) { - return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && - si_translate_colorswap(format, false) != ~0U; + return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && + si_translate_colorswap(format, false) != ~0U; } static bool si_is_zs_format_supported(enum pipe_format format) { - return si_translate_dbformat(format) != V_028040_Z_INVALID; + return si_translate_dbformat(format) != V_028040_Z_INVALID; } static bool si_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) -{ - struct si_screen *sscreen = (struct si_screen *)screen; - unsigned retval = 0; - - if (target >= PIPE_MAX_TEXTURE_TYPES) { - PRINT_ERR("radeonsi: unsupported texture type %d\n", target); - return false; - } - - if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) - return false; - - if (sample_count > 1) { - if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) - return false; - - if (usage & PIPE_BIND_SHADER_IMAGE) - return false; - - /* Only power-of-two sample counts are supported. */ - if (!util_is_power_of_two_or_zero(sample_count) || - !util_is_power_of_two_or_zero(storage_sample_count)) - return false; - - /* MSAA support without framebuffer attachments. */ - if (format == PIPE_FORMAT_NONE && sample_count <= 16) - return true; - - if (!sscreen->info.has_eqaa_surface_allocator || - util_format_is_depth_or_stencil(format)) { - /* Color without EQAA or depth/stencil. */ - if (sample_count > 8 || - sample_count != storage_sample_count) - return false; - } else { - /* Color with EQAA. */ - if (sample_count > 16 || - storage_sample_count > 8) - return false; - } - } - - if (usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)) { - if (target == PIPE_BUFFER) { - retval |= si_is_vertex_format_supported( - screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)); - } else { - if (si_is_sampler_format_supported(screen, format)) - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE); - } - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_BLENDABLE)) && - si_is_colorbuffer_format_supported(format)) { - retval |= usage & - (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - if (!util_format_is_pure_integer(format) && - !util_format_is_depth_or_stencil(format)) - retval |= usage & PIPE_BIND_BLENDABLE; - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - si_is_zs_format_supported(format)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if (usage & PIPE_BIND_VERTEX_BUFFER) { - retval |= si_is_vertex_format_supported(screen, format, - PIPE_BIND_VERTEX_BUFFER); - } - - if ((usage & PIPE_BIND_LINEAR) && - !util_format_is_compressed(format) && - !(usage & PIPE_BIND_DEPTH_STENCIL)) - retval |= PIPE_BIND_LINEAR; + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned usage) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned retval = 0; + + if (target >= PIPE_MAX_TEXTURE_TYPES) { + PRINT_ERR("radeonsi: unsupported texture type %d\n", target); + return false; + } + + if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) + return false; + + if (sample_count > 1) { + if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) + return false; + + /* Only power-of-two sample counts are supported. */ + if (!util_is_power_of_two_or_zero(sample_count) || + !util_is_power_of_two_or_zero(storage_sample_count)) + return false; + + /* Chips with 1 RB don't increment occlusion queries at 16x MSAA sample rate, + * so don't expose 16 samples there. + */ + const unsigned max_eqaa_samples = sscreen->info.num_render_backends == 1 ? 8 : 16; + const unsigned max_samples = 8; + + /* MSAA support without framebuffer attachments. */ + if (format == PIPE_FORMAT_NONE && sample_count <= max_eqaa_samples) + return true; + + if (!sscreen->info.has_eqaa_surface_allocator || + util_format_is_depth_or_stencil(format)) { + /* Color without EQAA or depth/stencil. */ + if (sample_count > max_samples || sample_count != storage_sample_count) + return false; + } else { + /* Color with EQAA. */ + if (sample_count > max_eqaa_samples || storage_sample_count > max_samples) + return false; + } + } + + if (usage & (PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SHADER_IMAGE)) { + if (target == PIPE_BUFFER) { + retval |= si_is_vertex_format_supported( + screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SHADER_IMAGE)); + } else { + if (si_is_sampler_format_supported(screen, format)) + retval |= usage & (PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SHADER_IMAGE); + } + } + + if ((usage & (PIPE_BIND_RENDER_TARGET | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED | + PIPE_BIND_BLENDABLE)) && + si_is_colorbuffer_format_supported(format)) { + retval |= usage & + (PIPE_BIND_RENDER_TARGET | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED); + if (!util_format_is_pure_integer(format) && + !util_format_is_depth_or_stencil(format)) + retval |= usage & PIPE_BIND_BLENDABLE; + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + si_is_zs_format_supported(format)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if (usage & PIPE_BIND_VERTEX_BUFFER) { + retval |= si_is_vertex_format_supported(screen, format, + PIPE_BIND_VERTEX_BUFFER); + } + + if ((usage & PIPE_BIND_LINEAR) && + !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; - return retval == usage; + return retval == usage; } /* @@ -2323,1451 +2323,1458 @@ */ static void si_choose_spi_color_formats(struct si_surface *surf, - unsigned format, unsigned swap, - unsigned ntype, bool is_depth) + unsigned format, unsigned swap, + unsigned ntype, bool is_depth) { - /* Alpha is needed for alpha-to-coverage. - * Blending may be with or without alpha. - */ - unsigned normal = 0; /* most optimal, may not support blending or export alpha */ - unsigned alpha = 0; /* exports alpha, but may not support blending */ - unsigned blend = 0; /* supports blending, but may not export alpha */ - unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ - - /* Choose the SPI color formats. These are required values for RB+. - * Other chips have multiple choices, though they are not necessarily better. - */ - switch (format) { - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_5_5_5_1: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_10_11_11: - case V_028C70_COLOR_11_11_10: - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_10_10_10_2: - case V_028C70_COLOR_2_10_10_10: - if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - case V_028C70_COLOR_16_16_16_16: - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM) { - /* UNORM16 and SNORM16 don't support blending */ - if (ntype == V_028C70_NUMBER_UNORM) - normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; - else - normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; - - /* Use 32 bits per channel for blending. */ - if (format == V_028C70_COLOR_16) { - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = V_028714_SPI_SHADER_32_R; - blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else if (format == V_028C70_COLOR_16_16) { - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = V_028714_SPI_SHADER_32_GR; - blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else /* 16_16_16_16 */ - blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else if (ntype == V_028C70_NUMBER_FLOAT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - else - assert(0); - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = normal = V_028714_SPI_SHADER_32_R; - alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32: - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = normal = V_028714_SPI_SHADER_32_GR; - alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32_32_32: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_X24_8_32_FLOAT: - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - break; - - default: - assert(0); - return; - } - - /* The DB->CB copy needs 32_ABGR. */ - if (is_depth) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - - surf->spi_shader_col_format = normal; - surf->spi_shader_col_format_alpha = alpha; - surf->spi_shader_col_format_blend = blend; - surf->spi_shader_col_format_blend_alpha = blend_alpha; + /* Alpha is needed for alpha-to-coverage. + * Blending may be with or without alpha. + */ + unsigned normal = 0; /* most optimal, may not support blending or export alpha */ + unsigned alpha = 0; /* exports alpha, but may not support blending */ + unsigned blend = 0; /* supports blending, but may not export alpha */ + unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ + + /* Choose the SPI color formats. These are required values for RB+. + * Other chips have multiple choices, though they are not necessarily better. + */ + switch (format) { + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_5_5_5_1: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_10_11_11: + case V_028C70_COLOR_11_11_10: + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_10_10_10_2: + case V_028C70_COLOR_2_10_10_10: + if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + case V_028C70_COLOR_16_16_16_16: + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM) { + /* UNORM16 and SNORM16 don't support blending */ + if (ntype == V_028C70_NUMBER_UNORM) + normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; + else + normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; + + /* Use 32 bits per channel for blending. */ + if (format == V_028C70_COLOR_16) { + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = V_028714_SPI_SHADER_32_R; + blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else if (format == V_028C70_COLOR_16_16) { + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = V_028714_SPI_SHADER_32_GR; + blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else /* 16_16_16_16 */ + blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else if (ntype == V_028C70_NUMBER_FLOAT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + else + assert(0); + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = normal = V_028714_SPI_SHADER_32_R; + alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32: + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = normal = V_028714_SPI_SHADER_32_GR; + alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32_32_32: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_X24_8_32_FLOAT: + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + break; + + default: + assert(0); + return; + } + + /* The DB->CB copy needs 32_ABGR. */ + if (is_depth) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + + surf->spi_shader_col_format = normal; + surf->spi_shader_col_format_alpha = alpha; + surf->spi_shader_col_format_blend = blend; + surf->spi_shader_col_format_blend_alpha = blend_alpha; } static void si_initialize_color_surface(struct si_context *sctx, - struct si_surface *surf) + struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned color_info, color_attrib; - unsigned format, swap, ntype, endian; - const struct util_format_description *desc; - int firstchan; - unsigned blend_clamp = 0, blend_bypass = 0; - - desc = util_format_description(surf->base.format); - for (firstchan = 0; firstchan < 4; firstchan++) { - if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { - break; - } - } - if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { - ntype = V_028C70_NUMBER_FLOAT; - } else { - ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - ntype = V_028C70_NUMBER_SRGB; - else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_SINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_SNORM; - } - } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_UINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_UNORM; - } - } - } - - format = si_translate_colorformat(surf->base.format); - if (format == V_028C70_COLOR_INVALID) { - PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); - } - assert(format != V_028C70_COLOR_INVALID); - swap = si_translate_colorswap(surf->base.format, false); - endian = si_colorformat_endian_swap(format); - - /* blend clamp should be set for all NORM/SRGB types */ - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM || - ntype == V_028C70_NUMBER_SRGB) - blend_clamp = 1; - - /* set blend bypass according to docs if SINT/UINT or - 8/24 COLOR variants */ - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || - format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || - format == V_028C70_COLOR_X24_8_32_FLOAT) { - blend_clamp = 0; - blend_bypass = 1; - } - - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_8_8 || - format == V_028C70_COLOR_8_8_8_8) - surf->color_is_int8 = true; - else if (format == V_028C70_COLOR_10_10_10_2 || - format == V_028C70_COLOR_2_10_10_10) - surf->color_is_int10 = true; - } - - color_info = S_028C70_FORMAT(format) | - S_028C70_COMP_SWAP(swap) | - S_028C70_BLEND_CLAMP(blend_clamp) | - S_028C70_BLEND_BYPASS(blend_bypass) | - S_028C70_SIMPLE_FLOAT(1) | - S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && - ntype != V_028C70_NUMBER_SNORM && - ntype != V_028C70_NUMBER_SRGB && - format != V_028C70_COLOR_8_24 && - format != V_028C70_COLOR_24_8) | - S_028C70_NUMBER_TYPE(ntype) | - S_028C70_ENDIAN(endian); - - /* Intensity is implemented as Red, so treat it that way. */ - color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || - util_format_is_intensity(surf->base.format)); - - if (tex->buffer.b.b.nr_samples > 1) { - unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); - unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); - - color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | - S_028C74_NUM_FRAGMENTS(log_fragments); - - if (tex->fmask_offset) { - color_info |= S_028C70_COMPRESSION(1); - unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); - - if (sctx->chip_class == GFX6) { - /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); - } - } - } - - if (sctx->chip_class >= GFX10) { - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - surf->cb_dcc_control = - S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(0) | - S_028C78_INDEPENDENT_128B_BLOCKS(1); - } else if (sctx->chip_class >= GFX8) { - unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - if (tex->buffer.b.b.nr_storage_samples > 1) { - if (tex->surface.bpe == 1) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - else if (tex->surface.bpe == 2) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; - } - - surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(1); - } - - /* This must be set for fast clear to work without FMASK. */ - if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { - unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); - } - - /* GFX10 field has the same base shift as the GFX6 field */ - unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | - S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); - unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); - - if (sctx->chip_class >= GFX10) { - color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); - - surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | - S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | - S_028EE0_RESOURCE_LEVEL(1); - } else if (sctx->chip_class == GFX9) { - color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); - color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | - S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); - } - - if (sctx->chip_class >= GFX9) { - surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | - S_028C68_MIP0_HEIGHT(surf->height0 - 1) | - S_028C68_MAX_MIP(tex->buffer.b.b.last_level); - } - - surf->cb_color_view = color_view; - surf->cb_color_info = color_info; - surf->cb_color_attrib = color_attrib; + struct si_texture *tex = (struct si_texture*)surf->base.texture; + unsigned color_info, color_attrib; + unsigned format, swap, ntype, endian; + const struct util_format_description *desc; + int firstchan; + unsigned blend_clamp = 0, blend_bypass = 0; + + desc = util_format_description(surf->base.format); + for (firstchan = 0; firstchan < 4; firstchan++) { + if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { + break; + } + } + if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; + } else { + ntype = V_028C70_NUMBER_UNORM; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + ntype = V_028C70_NUMBER_SRGB; + else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_SINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_SNORM; + } + } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_UINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_UNORM; + } + } + } + + format = si_translate_colorformat(surf->base.format); + if (format == V_028C70_COLOR_INVALID) { + PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); + } + assert(format != V_028C70_COLOR_INVALID); + swap = si_translate_colorswap(surf->base.format, false); + endian = si_colorformat_endian_swap(format); + + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM || + ntype == V_028C70_NUMBER_SRGB) + blend_clamp = 1; + + /* set blend bypass according to docs if SINT/UINT or + 8/24 COLOR variants */ + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || + format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || + format == V_028C70_COLOR_X24_8_32_FLOAT) { + blend_clamp = 0; + blend_bypass = 1; + } + + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { + if (format == V_028C70_COLOR_8 || + format == V_028C70_COLOR_8_8 || + format == V_028C70_COLOR_8_8_8_8) + surf->color_is_int8 = true; + else if (format == V_028C70_COLOR_10_10_10_2 || + format == V_028C70_COLOR_2_10_10_10) + surf->color_is_int10 = true; + } + + color_info = S_028C70_FORMAT(format) | + S_028C70_COMP_SWAP(swap) | + S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | + S_028C70_SIMPLE_FLOAT(1) | + S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && + ntype != V_028C70_NUMBER_SNORM && + ntype != V_028C70_NUMBER_SRGB && + format != V_028C70_COLOR_8_24 && + format != V_028C70_COLOR_24_8) | + S_028C70_NUMBER_TYPE(ntype) | + S_028C70_ENDIAN(endian); + + /* Intensity is implemented as Red, so treat it that way. */ + color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || + util_format_is_intensity(surf->base.format)); + + if (tex->buffer.b.b.nr_samples > 1) { + unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); + unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); + + color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | + S_028C74_NUM_FRAGMENTS(log_fragments); + + if (tex->surface.fmask_offset) { + color_info |= S_028C70_COMPRESSION(1); + unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); + + if (sctx->chip_class == GFX6) { + /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); + } + } + } + + if (sctx->chip_class >= GFX10) { + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + surf->cb_dcc_control = + S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(0) | + S_028C78_INDEPENDENT_128B_BLOCKS(1); + } else if (sctx->chip_class >= GFX8) { + unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + if (tex->buffer.b.b.nr_storage_samples > 1) { + if (tex->surface.bpe == 1) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + else if (tex->surface.bpe == 2) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } + + surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(1); + } + + /* This must be set for fast clear to work without FMASK. */ + if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { + unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); + } + + /* GFX10 field has the same base shift as the GFX6 field */ + unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | + S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); + unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); + + if (sctx->chip_class >= GFX10) { + color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); + + surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | + S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | + S_028EE0_RESOURCE_LEVEL(1); + } else if (sctx->chip_class == GFX9) { + color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); + color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | + S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); + } + + if (sctx->chip_class >= GFX9) { + surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | + S_028C68_MIP0_HEIGHT(surf->height0 - 1) | + S_028C68_MAX_MIP(tex->buffer.b.b.last_level); + } + + surf->cb_color_view = color_view; + surf->cb_color_info = color_info; + surf->cb_color_attrib = color_attrib; - /* Determine pixel shader export format */ - si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); + /* Determine pixel shader export format */ + si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); - surf->color_initialized = true; + surf->color_initialized = true; } static void si_init_depth_surface(struct si_context *sctx, - struct si_surface *surf) + struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned level = surf->base.u.tex.level; - unsigned format, stencil_format; - uint32_t z_info, s_info; - - format = si_translate_dbformat(tex->db_render_format); - stencil_format = tex->surface.has_stencil ? - V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; - - assert(format != V_028040_Z_INVALID); - if (format == V_028040_Z_INVALID) - PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); - - surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | - S_028008_SLICE_MAX(surf->base.u.tex.last_layer); - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; - - if (sctx->chip_class >= GFX10) { - surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | - S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); - } - - if (sctx->chip_class >= GFX9) { - assert(tex->surface.u.gfx9.surf_offset == 0); - surf->db_depth_base = tex->buffer.gpu_address >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.gfx9.stencil_offset) >> 8; - z_info = S_028038_FORMAT(format) | - S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | - S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028038_MAXMIP(tex->buffer.b.b.last_level); - s_info = S_02803C_FORMAT(stencil_format) | - S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - - if (sctx->chip_class == GFX9) { - surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); - surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); - } - surf->db_depth_view |= S_028008_MIPID(level); - surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | - S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028038_TILE_SURFACE_ENABLE(1) | - S_028038_ALLOW_EXPCLEAR(1); - - if (tex->tc_compatible_htile) { - unsigned max_zplanes = 4; - - if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && - tex->buffer.b.b.nr_samples > 1) - max_zplanes = 2; - - z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); - - if (sctx->chip_class >= GFX10) { - z_info |= S_028040_ITERATE_FLUSH(1); - s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); - } else { - z_info |= S_028038_ITERATE_FLUSH(1); - s_info |= S_02803C_ITERATE_FLUSH(1); - } - } - - if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { - /* Stencil buffer workaround ported from the GFX6-GFX8 code. - * See that for explanation. - */ - s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); - } else { - /* Use all HTILE for depth if there's no stencil. */ - s_info |= S_02803C_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | - S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); - if (sctx->chip_class == GFX9) { - surf->db_htile_surface |= - S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); - } - } - } else { - /* GFX6-GFX8 */ - struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; - - assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); - - surf->db_depth_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.level[level].offset) >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.stencil_level[level].offset) >> 8; - - z_info = S_028040_FORMAT(format) | - S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); - s_info = S_028044_FORMAT(stencil_format); - surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); - - if (sctx->chip_class >= GFX7) { - struct radeon_info *info = &sctx->screen->info; - unsigned index = tex->surface.u.legacy.tiling_index[level]; - unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; - unsigned macro_index = tex->surface.u.legacy.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[index]; - unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; - unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; - - surf->db_depth_info |= - S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | - S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | - S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | - S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | - S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | - S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); - z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); - s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); - } else { - unsigned tile_mode_index = si_tile_mode_index(tex, level, false); - z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); - tile_mode_index = si_tile_mode_index(tex, level, true); - s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); - } - - surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | - S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); - surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * - levelinfo->nblk_y) / 64 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028040_TILE_SURFACE_ENABLE(1) | - S_028040_ALLOW_EXPCLEAR(1); - - if (tex->surface.has_stencil) { - /* Workaround: For a not yet understood reason, the - * combination of MSAA, fast stencil clear and stencil - * decompress messes with subsequent stencil buffer - * uses. Problem was reproduced on Verde, Bonaire, - * Tonga, and Carrizo. - * - * Disabling EXPCLEAR works around the problem. - * - * Check piglit's arb_texture_multisample-stencil-clear - * test if you want to try changing this. - */ - if (tex->buffer.b.b.nr_samples <= 1) - s_info |= S_028044_ALLOW_EXPCLEAR(1); - } else if (!tex->tc_compatible_htile) { - /* Use all of the htile_buffer for depth if there's no stencil. - * This must not be set when TC-compatible HTILE is enabled - * due to a hw bug. - */ - s_info |= S_028044_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1); - - if (tex->tc_compatible_htile) { - surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); - - /* 0 = full compression. N = only compress up to N-1 Z planes. */ - if (tex->buffer.b.b.nr_samples <= 1) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); - else if (tex->buffer.b.b.nr_samples <= 4) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); - else - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); - } - } - } + struct si_texture *tex = (struct si_texture*)surf->base.texture; + unsigned level = surf->base.u.tex.level; + unsigned format, stencil_format; + uint32_t z_info, s_info; + + format = si_translate_dbformat(tex->db_render_format); + stencil_format = tex->surface.has_stencil ? + V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; + + assert(format != V_028040_Z_INVALID); + if (format == V_028040_Z_INVALID) + PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); + + surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | + S_028008_SLICE_MAX(surf->base.u.tex.last_layer); + surf->db_htile_data_base = 0; + surf->db_htile_surface = 0; + + if (sctx->chip_class >= GFX10) { + surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | + S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); + } + + if (sctx->chip_class >= GFX9) { + assert(tex->surface.u.gfx9.surf_offset == 0); + surf->db_depth_base = tex->buffer.gpu_address >> 8; + surf->db_stencil_base = (tex->buffer.gpu_address + + tex->surface.u.gfx9.stencil_offset) >> 8; + z_info = S_028038_FORMAT(format) | + S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | + S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028038_MAXMIP(tex->buffer.b.b.last_level); + s_info = S_02803C_FORMAT(stencil_format) | + S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + + if (sctx->chip_class == GFX9) { + surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); + surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); + } + surf->db_depth_view |= S_028008_MIPID(level); + surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | + S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028038_TILE_SURFACE_ENABLE(1) | + S_028038_ALLOW_EXPCLEAR(1); + + if (tex->tc_compatible_htile) { + unsigned max_zplanes = 4; + + if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && + tex->buffer.b.b.nr_samples > 1) + max_zplanes = 2; + + z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); + + if (sctx->chip_class >= GFX10) { + z_info |= S_028040_ITERATE_FLUSH(1); + s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); + } else { + z_info |= S_028038_ITERATE_FLUSH(1); + s_info |= S_02803C_ITERATE_FLUSH(1); + } + } + + if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { + /* Stencil buffer workaround ported from the GFX6-GFX8 code. + * See that for explanation. + */ + s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); + } else { + /* Use all HTILE for depth if there's no stencil. */ + s_info |= S_02803C_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | + S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); + if (sctx->chip_class == GFX9) { + surf->db_htile_surface |= + S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); + } + } + } else { + /* GFX6-GFX8 */ + struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; + + assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); + + surf->db_depth_base = (tex->buffer.gpu_address + + tex->surface.u.legacy.level[level].offset) >> 8; + surf->db_stencil_base = (tex->buffer.gpu_address + + tex->surface.u.legacy.stencil_level[level].offset) >> 8; + + z_info = S_028040_FORMAT(format) | + S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); + s_info = S_028044_FORMAT(stencil_format); + surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); + + if (sctx->chip_class >= GFX7) { + struct radeon_info *info = &sctx->screen->info; + unsigned index = tex->surface.u.legacy.tiling_index[level]; + unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; + unsigned macro_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[index]; + unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; + unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; + + surf->db_depth_info |= + S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + } else { + unsigned tile_mode_index = si_tile_mode_index(tex, level, false); + z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + tile_mode_index = si_tile_mode_index(tex, level, true); + s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); + } + + surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | + S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); + surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * + levelinfo->nblk_y) / 64 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028040_TILE_SURFACE_ENABLE(1) | + S_028040_ALLOW_EXPCLEAR(1); + + if (tex->surface.has_stencil) { + /* Workaround: For a not yet understood reason, the + * combination of MSAA, fast stencil clear and stencil + * decompress messes with subsequent stencil buffer + * uses. Problem was reproduced on Verde, Bonaire, + * Tonga, and Carrizo. + * + * Disabling EXPCLEAR works around the problem. + * + * Check piglit's arb_texture_multisample-stencil-clear + * test if you want to try changing this. + */ + if (tex->buffer.b.b.nr_samples <= 1) + s_info |= S_028044_ALLOW_EXPCLEAR(1); + } else if (!tex->tc_compatible_htile) { + /* Use all of the htile_buffer for depth if there's no stencil. + * This must not be set when TC-compatible HTILE is enabled + * due to a hw bug. + */ + s_info |= S_028044_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = S_028ABC_FULL_CACHE(1); + + if (tex->tc_compatible_htile) { + surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); + + /* 0 = full compression. N = only compress up to N-1 Z planes. */ + if (tex->buffer.b.b.nr_samples <= 1) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); + else if (tex->buffer.b.b.nr_samples <= 4) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); + else + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); + } + } + } - surf->db_z_info = z_info; - surf->db_stencil_info = s_info; + surf->db_z_info = z_info; + surf->db_stencil_info = s_info; - surf->depth_initialized = true; + surf->depth_initialized = true; } void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) { - if (sctx->decompression_enabled) - return; + if (sctx->decompression_enabled) + return; - if (sctx->framebuffer.state.zsbuf) { - struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; - struct si_texture *tex = (struct si_texture *)surf->texture; - - tex->dirty_level_mask |= 1 << surf->u.tex.level; - - if (tex->surface.has_stencil) - tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; - } - - unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; - while (compressed_cb_mask) { - unsigned i = u_bit_scan(&compressed_cb_mask); - struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; - struct si_texture *tex = (struct si_texture*)surf->texture; - - if (tex->fmask_offset) - tex->dirty_level_mask |= 1 << surf->u.tex.level; - if (tex->dcc_gather_statistics) - tex->separate_dcc_dirty = true; - } + if (sctx->framebuffer.state.zsbuf) { + struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; + struct si_texture *tex = (struct si_texture *)surf->texture; + + tex->dirty_level_mask |= 1 << surf->u.tex.level; + + if (tex->surface.has_stencil) + tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; + } + + unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; + while (compressed_cb_mask) { + unsigned i = u_bit_scan(&compressed_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture*)surf->texture; + + if (tex->surface.fmask_offset) { + tex->dirty_level_mask |= 1 << surf->u.tex.level; + tex->fmask_is_identity = false; + } + if (tex->dcc_gather_statistics) + tex->separate_dcc_dirty = true; + } } static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state) { - for (int i = 0; i < state->nr_cbufs; ++i) { - struct si_surface *surf = NULL; - struct si_texture *tex; - - if (!state->cbufs[i]) - continue; - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; + for (int i = 0; i < state->nr_cbufs; ++i) { + struct si_surface *surf = NULL; + struct si_texture *tex; + + if (!state->cbufs[i]) + continue; + surf = (struct si_surface*)state->cbufs[i]; + tex = (struct si_texture*)surf->base.texture; - p_atomic_dec(&tex->framebuffers_bound); - } + p_atomic_dec(&tex->framebuffers_bound); + } } static void si_set_framebuffer_state(struct pipe_context *ctx, - const struct pipe_framebuffer_state *state) + const struct pipe_framebuffer_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_surface *surf = NULL; - struct si_texture *tex; - bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; - unsigned old_nr_samples = sctx->framebuffer.nr_samples; - unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit; - bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; - bool old_has_stencil = - old_has_zsbuf && - ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; - bool unbound = false; - int i; - - /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs - * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. - * We could implement the full workaround here, but it's a useless case. - */ - if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { - unreachable("the framebuffer shouldn't have zero area"); - return; - } - - si_update_fb_dirtiness_after_rendering(sctx); - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - if (tex->dcc_gather_statistics) - vi_separate_dcc_stop_query(sctx, tex); - } - - /* Disable DCC if the formats are incompatible. */ - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->dcc_incompatible) - continue; - - /* Since the DCC decompression calls back into set_framebuffer- - * _state, we need to unbind the framebuffer, so that - * vi_separate_dcc_stop_query isn't called twice with the same - * color buffer. - */ - if (!unbound) { - util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); - unbound = true; - } - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - surf->dcc_incompatible = false; - } - - /* Only flush TC when changing the framebuffer state, because - * the only client not using TC that can change textures is - * the framebuffer. - * - * Wait for compute shaders because of possible transitions: - * - FB write -> shader read - * - shader write -> FB read - * - * DB caches are flushed on demand (using si_decompress_textures). - * - * When MSAA is enabled, CB and TC caches are flushed on demand - * (after FMASK decompression). Shader write -> FB read transitions - * cannot happen for MSAA textures, because MSAA shader images are - * not supported. - * - * Only flush and wait for CB if there is actually a bound color buffer. - */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - - /* u_blitter doesn't invoke depth decompression when it does multiple - * blits in a row, but the only case when it matters for DB is when - * doing generate_mipmap. So here we flush DB manually between - * individual generate_mipmap blits. - * Note that lower mipmap levels aren't compressed. - */ - if (sctx->generate_mipmap_for_depth) { - si_make_DB_shader_coherent(sctx, 1, false, - sctx->framebuffer.DB_has_shader_readable_metadata); - } else if (sctx->chip_class == GFX9) { - /* It appears that DB metadata "leaks" in a sequence of: - * - depth clear - * - DCC decompress for shader image writes (with DB disabled) - * - render with DEPTH_BEFORE_SHADER=1 - * Flushing DB metadata works around the problem. - */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; - } - - /* Take the maximum of the old and new count. If the new count is lower, - * dirtying is needed to disable the unbound colorbuffers. - */ - sctx->framebuffer.dirty_cbufs |= - (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; - sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; - - si_dec_framebuffer_counters(&sctx->framebuffer.state); - util_copy_framebuffer_state(&sctx->framebuffer.state, state); - - sctx->framebuffer.colorbuf_enabled_4bit = 0; - sctx->framebuffer.spi_shader_col_format = 0; - sctx->framebuffer.spi_shader_col_format_alpha = 0; - sctx->framebuffer.spi_shader_col_format_blend = 0; - sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; - sctx->framebuffer.color_is_int8 = 0; - sctx->framebuffer.color_is_int10 = 0; - - sctx->framebuffer.compressed_cb_mask = 0; - sctx->framebuffer.uncompressed_cb_mask = 0; - sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); - sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; - sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); - sctx->framebuffer.any_dst_linear = false; - sctx->framebuffer.CB_has_shader_readable_metadata = false; - sctx->framebuffer.DB_has_shader_readable_metadata = false; - sctx->framebuffer.all_DCC_pipe_aligned = true; - sctx->framebuffer.min_bytes_per_pixel = 0; - - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->color_initialized) { - si_initialize_color_surface(sctx, surf); - } - - sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); - sctx->framebuffer.spi_shader_col_format |= - surf->spi_shader_col_format << (i * 4); - sctx->framebuffer.spi_shader_col_format_alpha |= - surf->spi_shader_col_format_alpha << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend |= - surf->spi_shader_col_format_blend << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend_alpha |= - surf->spi_shader_col_format_blend_alpha << (i * 4); - - if (surf->color_is_int8) - sctx->framebuffer.color_is_int8 |= 1 << i; - if (surf->color_is_int10) - sctx->framebuffer.color_is_int10 |= 1 << i; - - if (tex->fmask_offset) - sctx->framebuffer.compressed_cb_mask |= 1 << i; - else - sctx->framebuffer.uncompressed_cb_mask |= 1 << i; - - /* Don't update nr_color_samples for non-AA buffers. - * (e.g. destination of MSAA resolve) - */ - if (tex->buffer.b.b.nr_samples >= 2 && - tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { - sctx->framebuffer.nr_color_samples = - MIN2(sctx->framebuffer.nr_color_samples, - tex->buffer.b.b.nr_storage_samples); - sctx->framebuffer.nr_color_samples = - MAX2(1, sctx->framebuffer.nr_color_samples); - } - - if (tex->surface.is_linear) - sctx->framebuffer.any_dst_linear = true; - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { - sctx->framebuffer.CB_has_shader_readable_metadata = true; - - if (sctx->chip_class >= GFX9 && - !tex->surface.u.gfx9.dcc.pipe_aligned) - sctx->framebuffer.all_DCC_pipe_aligned = false; - } - - si_context_add_resource_size(sctx, surf->base.texture); - - p_atomic_inc(&tex->framebuffers_bound); - - if (tex->dcc_gather_statistics) { - /* Dirty tracking must be enabled for DCC usage analysis. */ - sctx->framebuffer.compressed_cb_mask |= 1 << i; - vi_separate_dcc_start_query(sctx, tex); - } - - /* Update the minimum but don't keep 0. */ - if (!sctx->framebuffer.min_bytes_per_pixel || - tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; - } - - /* For optimal DCC performance. */ - if (sctx->chip_class >= GFX10) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; - else - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; - - struct si_texture *zstex = NULL; - - if (state->zsbuf) { - surf = (struct si_surface*)state->zsbuf; - zstex = (struct si_texture*)surf->base.texture; - - if (!surf->depth_initialized) { - si_init_depth_surface(sctx, surf); - } - - if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, - PIPE_MASK_ZS)) - sctx->framebuffer.DB_has_shader_readable_metadata = true; - - si_context_add_resource_size(sctx, surf->base.texture); - - /* Update the minimum but don't keep 0. */ - if (!sctx->framebuffer.min_bytes_per_pixel || - zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; - } - - si_update_ps_colorbuf0_slot(sctx); - si_update_poly_offset_state(sctx); - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->screen->has_out_of_order_rast && - (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || - !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || - (zstex && zstex->surface.has_stencil != old_has_stencil))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->framebuffer.nr_samples != old_nr_samples) { - struct pipe_constant_buffer constbuf = {0}; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - constbuf.buffer = sctx->sample_pos_buffer; - - /* Set sample locations as fragment shader constants. */ - switch (sctx->framebuffer.nr_samples) { - case 1: - constbuf.buffer_offset = 0; - break; - case 2: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 - - (ubyte*)sctx->sample_positions.x1; - break; - case 4: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 - - (ubyte*)sctx->sample_positions.x1; - break; - case 8: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 - - (ubyte*)sctx->sample_positions.x1; - break; - case 16: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 - - (ubyte*)sctx->sample_positions.x1; - break; - default: - PRINT_ERR("Requested an invalid number of samples %i.\n", - sctx->framebuffer.nr_samples); - assert(0); - } - constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; - si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->do_update_shaders = true; - - if (!sctx->decompression_enabled) { - /* Prevent textures decompression when the framebuffer state - * changes come from the decompression passes themselves. - */ - sctx->need_check_render_feedback = true; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_surface *surf = NULL; + struct si_texture *tex; + bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; + unsigned old_nr_samples = sctx->framebuffer.nr_samples; + unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit; + bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; + bool old_has_stencil = + old_has_zsbuf && + ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; + bool unbound = false; + int i; + + /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs + * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. + * We could implement the full workaround here, but it's a useless case. + */ + if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { + unreachable("the framebuffer shouldn't have zero area"); + return; + } + + si_update_fb_dirtiness_after_rendering(sctx); + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; + if (tex->dcc_gather_statistics) + vi_separate_dcc_stop_query(sctx, tex); + } + + /* Disable DCC if the formats are incompatible. */ + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface*)state->cbufs[i]; + tex = (struct si_texture*)surf->base.texture; + + if (!surf->dcc_incompatible) + continue; + + /* Since the DCC decompression calls back into set_framebuffer- + * _state, we need to unbind the framebuffer, so that + * vi_separate_dcc_stop_query isn't called twice with the same + * color buffer. + */ + if (!unbound) { + util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); + unbound = true; + } + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + surf->dcc_incompatible = false; + } + + /* Only flush TC when changing the framebuffer state, because + * the only client not using TC that can change textures is + * the framebuffer. + * + * Wait for compute shaders because of possible transitions: + * - FB write -> shader read + * - shader write -> FB read + * + * DB caches are flushed on demand (using si_decompress_textures). + * + * When MSAA is enabled, CB and TC caches are flushed on demand + * (after FMASK decompression). Shader write -> FB read transitions + * cannot happen for MSAA textures, because MSAA shader images are + * not supported. + * + * Only flush and wait for CB if there is actually a bound color buffer. + */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + /* u_blitter doesn't invoke depth decompression when it does multiple + * blits in a row, but the only case when it matters for DB is when + * doing generate_mipmap. So here we flush DB manually between + * individual generate_mipmap blits. + * Note that lower mipmap levels aren't compressed. + */ + if (sctx->generate_mipmap_for_depth) { + si_make_DB_shader_coherent(sctx, 1, false, + sctx->framebuffer.DB_has_shader_readable_metadata); + } else if (sctx->chip_class == GFX9) { + /* It appears that DB metadata "leaks" in a sequence of: + * - depth clear + * - DCC decompress for shader image writes (with DB disabled) + * - render with DEPTH_BEFORE_SHADER=1 + * Flushing DB metadata works around the problem. + */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; + } + + /* Take the maximum of the old and new count. If the new count is lower, + * dirtying is needed to disable the unbound colorbuffers. + */ + sctx->framebuffer.dirty_cbufs |= + (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; + sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; + + si_dec_framebuffer_counters(&sctx->framebuffer.state); + util_copy_framebuffer_state(&sctx->framebuffer.state, state); + + sctx->framebuffer.colorbuf_enabled_4bit = 0; + sctx->framebuffer.spi_shader_col_format = 0; + sctx->framebuffer.spi_shader_col_format_alpha = 0; + sctx->framebuffer.spi_shader_col_format_blend = 0; + sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; + sctx->framebuffer.color_is_int8 = 0; + sctx->framebuffer.color_is_int10 = 0; + + sctx->framebuffer.compressed_cb_mask = 0; + sctx->framebuffer.uncompressed_cb_mask = 0; + sctx->framebuffer.displayable_dcc_cb_mask = 0; + sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); + sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; + sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); + sctx->framebuffer.any_dst_linear = false; + sctx->framebuffer.CB_has_shader_readable_metadata = false; + sctx->framebuffer.DB_has_shader_readable_metadata = false; + sctx->framebuffer.all_DCC_pipe_aligned = true; + sctx->framebuffer.min_bytes_per_pixel = 0; + + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface*)state->cbufs[i]; + tex = (struct si_texture*)surf->base.texture; + + if (!surf->color_initialized) { + si_initialize_color_surface(sctx, surf); + } + + sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); + sctx->framebuffer.spi_shader_col_format |= + surf->spi_shader_col_format << (i * 4); + sctx->framebuffer.spi_shader_col_format_alpha |= + surf->spi_shader_col_format_alpha << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend |= + surf->spi_shader_col_format_blend << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend_alpha |= + surf->spi_shader_col_format_blend_alpha << (i * 4); + + if (surf->color_is_int8) + sctx->framebuffer.color_is_int8 |= 1 << i; + if (surf->color_is_int10) + sctx->framebuffer.color_is_int10 |= 1 << i; + + if (tex->surface.fmask_offset) + sctx->framebuffer.compressed_cb_mask |= 1 << i; + else + sctx->framebuffer.uncompressed_cb_mask |= 1 << i; + + if (tex->surface.dcc_offset) + sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; + + /* Don't update nr_color_samples for non-AA buffers. + * (e.g. destination of MSAA resolve) + */ + if (tex->buffer.b.b.nr_samples >= 2 && + tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { + sctx->framebuffer.nr_color_samples = + MIN2(sctx->framebuffer.nr_color_samples, + tex->buffer.b.b.nr_storage_samples); + sctx->framebuffer.nr_color_samples = + MAX2(1, sctx->framebuffer.nr_color_samples); + } + + if (tex->surface.is_linear) + sctx->framebuffer.any_dst_linear = true; + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { + sctx->framebuffer.CB_has_shader_readable_metadata = true; + + if (sctx->chip_class >= GFX9 && + !tex->surface.u.gfx9.dcc.pipe_aligned) + sctx->framebuffer.all_DCC_pipe_aligned = false; + } + + si_context_add_resource_size(sctx, surf->base.texture); + + p_atomic_inc(&tex->framebuffers_bound); + + if (tex->dcc_gather_statistics) { + /* Dirty tracking must be enabled for DCC usage analysis. */ + sctx->framebuffer.compressed_cb_mask |= 1 << i; + vi_separate_dcc_start_query(sctx, tex); + } + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; + } + + /* For optimal DCC performance. */ + if (sctx->chip_class >= GFX10) + sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; + else + sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; + + struct si_texture *zstex = NULL; + + if (state->zsbuf) { + surf = (struct si_surface*)state->zsbuf; + zstex = (struct si_texture*)surf->base.texture; + + if (!surf->depth_initialized) { + si_init_depth_surface(sctx, surf); + } + + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, + PIPE_MASK_ZS)) + sctx->framebuffer.DB_has_shader_readable_metadata = true; + + si_context_add_resource_size(sctx, surf->base.texture); + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; + } + + si_update_ps_colorbuf0_slot(sctx); + si_update_poly_offset_state(sctx); + si_update_ngg_small_prim_precision(sctx); + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->screen->has_out_of_order_rast && + (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || + !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || + (zstex && zstex->surface.has_stencil != old_has_stencil))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->framebuffer.nr_samples != old_nr_samples) { + struct pipe_constant_buffer constbuf = {0}; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + constbuf.buffer = sctx->sample_pos_buffer; + + /* Set sample locations as fragment shader constants. */ + switch (sctx->framebuffer.nr_samples) { + case 1: + constbuf.buffer_offset = 0; + break; + case 2: + constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 - + (ubyte*)sctx->sample_positions.x1; + break; + case 4: + constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 - + (ubyte*)sctx->sample_positions.x1; + break; + case 8: + constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 - + (ubyte*)sctx->sample_positions.x1; + break; + case 16: + constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 - + (ubyte*)sctx->sample_positions.x1; + break; + default: + PRINT_ERR("Requested an invalid number of samples %i.\n", + sctx->framebuffer.nr_samples); + assert(0); + } + constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; + si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->do_update_shaders = true; + + if (!sctx->decompression_enabled) { + /* Prevent textures decompression when the framebuffer state + * changes come from the decompression passes themselves. + */ + sctx->need_check_render_feedback = true; + } } static void si_emit_framebuffer_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_framebuffer_state *state = &sctx->framebuffer.state; - unsigned i, nr_cbufs = state->nr_cbufs; - struct si_texture *tex = NULL; - struct si_surface *cb = NULL; - unsigned cb_color_info = 0; - - /* Colorbuffers. */ - for (i = 0; i < nr_cbufs; i++) { - uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; - unsigned cb_color_attrib; - - if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) - continue; - - cb = (struct si_surface*)state->cbufs[i]; - if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); - continue; - } - - tex = (struct si_texture *)cb->base.texture; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - tex->buffer.b.b.nr_samples > 1 ? - RADEON_PRIO_COLOR_BUFFER_MSAA : - RADEON_PRIO_COLOR_BUFFER); - - if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->cmask_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - } - - if (tex->dcc_separate_buffer) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->dcc_separate_buffer, - RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - - /* Compute mutable surface parameters. */ - cb_color_base = tex->buffer.gpu_address >> 8; - cb_color_fmask = 0; - cb_color_cmask = tex->cmask_base_address_reg; - cb_dcc_base = 0; - cb_color_info = cb->cb_color_info | tex->cb_color_info; - cb_color_attrib = cb->cb_color_attrib; - - if (cb->base.u.tex.level > 0) - cb_color_info &= C_028C70_FAST_CLEAR; - - if (tex->fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->fmask_offset) >> 8; - cb_color_fmask |= tex->surface.fmask_tile_swizzle; - } - - /* Set up DCC. */ - if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { - bool is_msaa_resolve_dst = state->cbufs[0] && - state->cbufs[0]->texture->nr_samples > 1 && - state->cbufs[1] == &cb->base && - state->cbufs[1]->texture->nr_samples <= 1; - - if (!is_msaa_resolve_dst) - cb_color_info |= S_028C70_DCC_ENABLE(1); - - cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->dcc_offset) >> 8; - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; - dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; - cb_dcc_base |= dcc_tile_swizzle; - } - - if (sctx->chip_class >= GFX10) { - unsigned cb_color_attrib3; - - /* Set mutable surface parameters. */ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - - cb_color_attrib3 = cb->cb_color_attrib3 | - S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, - cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, - cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, - cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, - cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, - cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, - cb_color_attrib3); - } else if (sctx->chip_class == GFX9) { - struct gfx9_surf_meta_flags meta; - - if (tex->dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.cmask; - - /* Set mutable surface parameters. */ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028C74_RB_ALIGNED(meta.rb_aligned) | - S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, - S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); - } else { - /* Compute mutable surface parameters (GFX6-GFX8). */ - const struct legacy_surf_level *level_info = - &tex->surface.u.legacy.level[cb->base.u.tex.level]; - unsigned pitch_tile_max, slice_tile_max, tile_mode_index; - unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; - - cb_color_base += level_info->offset >> 8; - /* Only macrotiled modes can set tile swizzle. */ - if (level_info->mode == RADEON_SURF_MODE_2D) - cb_color_base |= tex->surface.tile_swizzle; - - if (!tex->fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - if (cb_dcc_base) - cb_dcc_base += level_info->dcc_offset >> 8; - - pitch_tile_max = level_info->nblk_x / 8 - 1; - slice_tile_max = level_info->nblk_x * - level_info->nblk_y / 64 - 1; - tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); - - cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); - cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); - cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); - - if (tex->fmask_offset) { - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); - } else { - /* This must be set for fast clear to work without FMASK. */ - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); - } - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, - sctx->chip_class >= GFX8 ? 14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - - if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); - } - } - for (; i < 8 ; i++) - if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); - - /* ZS buffer. */ - if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { - struct si_surface *zb = (struct si_surface*)state->zsbuf; - struct si_texture *tex = (struct si_texture*)zb->base.texture; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - zb->base.texture->nr_samples > 1 ? - RADEON_PRIO_DEPTH_BUFFER_MSAA : - RADEON_PRIO_DEPTH_BUFFER); - - if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ - } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ - } else { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ - } - - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ - - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); - } else if (sctx->framebuffer.dirty_zsbuf) { - if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); - else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); - - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + unsigned cb_color_info = 0; + + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; + unsigned cb_color_attrib; + + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; + + cb = (struct si_surface*)state->cbufs[i]; + if (!cb) { + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + &tex->buffer, RADEON_USAGE_READWRITE, + tex->buffer.b.b.nr_samples > 1 ? + RADEON_PRIO_COLOR_BUFFER_MSAA : + RADEON_PRIO_COLOR_BUFFER); + + if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + tex->cmask_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SEPARATE_META); + } + + if (tex->dcc_separate_buffer) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + tex->dcc_separate_buffer, + RADEON_USAGE_READWRITE, + RADEON_PRIO_SEPARATE_META); + + /* Compute mutable surface parameters. */ + cb_color_base = tex->buffer.gpu_address >> 8; + cb_color_fmask = 0; + cb_color_cmask = tex->cmask_base_address_reg; + cb_dcc_base = 0; + cb_color_info = cb->cb_color_info | tex->cb_color_info; + cb_color_attrib = cb->cb_color_attrib; + + if (cb->base.u.tex.level > 0) + cb_color_info &= C_028C70_FAST_CLEAR; + + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } + + /* Set up DCC. */ + if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { + bool is_msaa_resolve_dst = state->cbufs[0] && + state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] == &cb->base && + state->cbufs[1]->texture->nr_samples <= 1; + + if (!is_msaa_resolve_dst) + cb_color_info |= S_028C70_DCC_ENABLE(1); + + cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + + tex->surface.dcc_offset) >> 8; + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; + dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; + cb_dcc_base |= dcc_tile_swizzle; + } + + if (sctx->chip_class >= GFX10) { + unsigned cb_color_attrib3; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + + radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, + cb_color_base >> 32); + radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + cb_color_cmask >> 32); + radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + cb_color_fmask >> 32); + radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, + cb_dcc_base >> 32); + radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, + cb->cb_color_attrib2); + radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, + cb_color_attrib3); + } else if (sctx->chip_class == GFX9) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.cmask; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028C74_RB_ALIGNED(meta.rb_aligned) | + S_028C74_PIPE_ALIGNED(meta.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); + } else { + /* Compute mutable surface parameters (GFX6-GFX8). */ + const struct legacy_surf_level *level_info = + &tex->surface.u.legacy.level[cb->base.u.tex.level]; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; + + cb_color_base += level_info->offset >> 8; + /* Only macrotiled modes can set tile swizzle. */ + if (level_info->mode == RADEON_SURF_MODE_2D) + cb_color_base |= tex->surface.tile_swizzle; + + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + if (cb_dcc_base) + cb_dcc_base += level_info->dcc_offset >> 8; + + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = level_info->nblk_x * + level_info->nblk_y / 64 - 1; + tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); + + cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); + cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); + cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); + + if (tex->surface.fmask_offset) { + if (sctx->chip_class >= GFX7) + cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); + cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); + } else { + /* This must be set for fast clear to work without FMASK. */ + if (sctx->chip_class >= GFX7) + cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); + cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); + } + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->chip_class >= GFX8 ? 14 : 13); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ + radeon_emit(cs, cb_dcc_base); + } + } + for (; i < 8 ; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface*)state->zsbuf; + struct si_texture *tex = (struct si_texture*)zb->base.texture; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + &tex->buffer, RADEON_USAGE_READWRITE, + zb->base.texture->nr_samples > 1 ? + RADEON_PRIO_DEPTH_BUFFER_MSAA : + RADEON_PRIO_DEPTH_BUFFER); + + if (sctx->chip_class >= GFX10) { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + } else if (sctx->chip_class == GFX9) { + radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); + radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + } else { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + } + + radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ + radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ + + radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); + } else if (sctx->framebuffer.dirty_zsbuf) { + if (sctx->chip_class == GFX9) + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + else + radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + + radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + } - /* Framebuffer dimensions. */ + /* Framebuffer dimensions. */ /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, - S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } - sctx->framebuffer.dirty_cbufs = 0; - sctx->framebuffer.dirty_zsbuf = false; + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; } static void si_emit_msaa_sample_locs(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->has_msaa_sample_loc_bug; - - /* Smoothing (only possible with nr_samples == 1) uses the same - * sample locations as the MSAA it simulates. - */ - if (nr_samples <= 1 && sctx->smoothing_enabled) - nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; - - /* On Polaris, the small primitive filter uses the sample locations - * even when MSAA is off, so we need to make sure they're set to 0. - * - * GFX10 uses sample locations unconditionally, so they always need - * to be set up. - */ - if ((nr_samples >= 2 || has_msaa_sample_loc_bug || - sctx->chip_class >= GFX10) && - nr_samples != sctx->sample_locs_num_samples) { - sctx->sample_locs_num_samples = nr_samples; - si_emit_sample_locations(cs, nr_samples); - } - - if (sctx->family >= CHIP_POLARIS10) { - unsigned small_prim_filter_cntl = - S_028830_SMALL_PRIM_FILTER_ENABLE(1) | - /* line bug */ - S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); - - /* The alternative of setting sample locations to 0 would - * require a DB flush to avoid Z errors, see - * https://bugs.freedesktop.org/show_bug.cgi?id=96908 - */ - if (has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1 && - !rs->multisample_enable) - small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; - - radeon_opt_set_context_reg(sctx, - R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - small_prim_filter_cntl); - } - - /* The exclusion bits can be set to improve rasterization efficiency - * if no sample lies on the pixel boundary (-8 sample offset). - */ - bool exclusion = sctx->chip_class >= GFX7 && - (!rs->multisample_enable || nr_samples != 16); - radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | - S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned nr_samples = sctx->framebuffer.nr_samples; + bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; + + /* Smoothing (only possible with nr_samples == 1) uses the same + * sample locations as the MSAA it simulates. + */ + if (nr_samples <= 1 && sctx->smoothing_enabled) + nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; + + /* On Polaris, the small primitive filter uses the sample locations + * even when MSAA is off, so we need to make sure they're set to 0. + * + * GFX10 uses sample locations unconditionally, so they always need + * to be set up. + */ + if ((nr_samples >= 2 || has_msaa_sample_loc_bug || + sctx->chip_class >= GFX10) && + nr_samples != sctx->sample_locs_num_samples) { + sctx->sample_locs_num_samples = nr_samples; + si_emit_sample_locations(cs, nr_samples); + } + + if (sctx->family >= CHIP_POLARIS10) { + unsigned small_prim_filter_cntl = + S_028830_SMALL_PRIM_FILTER_ENABLE(1) | + /* line bug */ + S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); + + /* The alternative of setting sample locations to 0 would + * require a DB flush to avoid Z errors, see + * https://bugs.freedesktop.org/show_bug.cgi?id=96908 + */ + if (has_msaa_sample_loc_bug && + sctx->framebuffer.nr_samples > 1 && + !rs->multisample_enable) + small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; + + radeon_opt_set_context_reg(sctx, + R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, + small_prim_filter_cntl); + } + + /* The exclusion bits can be set to improve rasterization efficiency + * if no sample lies on the pixel boundary (-8 sample offset). + */ + bool exclusion = sctx->chip_class >= GFX7 && + (!rs->multisample_enable || nr_samples != 16); + radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | + S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); } static bool si_out_of_order_rasterization(struct si_context *sctx) { - struct si_state_blend *blend = sctx->queued.named.blend; - struct si_state_dsa *dsa = sctx->queued.named.dsa; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; - if (!sctx->screen->has_out_of_order_rast) - return false; + if (!sctx->screen->has_out_of_order_rast) + return false; - unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; + unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; - colormask &= blend->cb_target_enabled_4bit; - - /* Conservative: No logic op. */ - if (colormask && blend->logicop_enable) - return false; - - struct si_dsa_order_invariance dsa_order_invariant = { - .zs = true, .pass_set = true, .pass_last = false - }; - - if (sctx->framebuffer.state.zsbuf) { - struct si_texture *zstex = - (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - bool has_stencil = zstex->surface.has_stencil; - dsa_order_invariant = dsa->order_invariance[has_stencil]; - if (!dsa_order_invariant.zs) - return false; - - /* The set of PS invocations is always order invariant, - * except when early Z/S tests are requested. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.writes_memory && - sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && - !dsa_order_invariant.pass_set) - return false; - - if (sctx->num_perfect_occlusion_queries != 0 && - !dsa_order_invariant.pass_set) - return false; - } - - if (!colormask) - return true; - - unsigned blendmask = colormask & blend->blend_enable_4bit; - - if (blendmask) { - /* Only commutative blending. */ - if (blendmask & ~blend->commutative_4bit) - return false; - - if (!dsa_order_invariant.pass_set) - return false; - } - - if (colormask & ~blendmask) { - if (!dsa_order_invariant.pass_last) - return false; - } + colormask &= blend->cb_target_enabled_4bit; + + /* Conservative: No logic op. */ + if (colormask && blend->logicop_enable) + return false; + + struct si_dsa_order_invariance dsa_order_invariant = { + .zs = true, .pass_set = true, .pass_last = false + }; + + if (sctx->framebuffer.state.zsbuf) { + struct si_texture *zstex = + (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; + bool has_stencil = zstex->surface.has_stencil; + dsa_order_invariant = dsa->order_invariance[has_stencil]; + if (!dsa_order_invariant.zs) + return false; + + /* The set of PS invocations is always order invariant, + * except when early Z/S tests are requested. */ + if (sctx->ps_shader.cso && + sctx->ps_shader.cso->info.writes_memory && + sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && + !dsa_order_invariant.pass_set) + return false; + + if (sctx->num_perfect_occlusion_queries != 0 && + !dsa_order_invariant.pass_set) + return false; + } + + if (!colormask) + return true; + + unsigned blendmask = colormask & blend->blend_enable_4bit; + + if (blendmask) { + /* Only commutative blending. */ + if (blendmask & ~blend->commutative_4bit) + return false; + + if (!dsa_order_invariant.pass_set) + return false; + } + + if (colormask & ~blendmask) { + if (!dsa_order_invariant.pass_last) + return false; + } - return true; + return true; } static void si_emit_msaa_config(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; - /* 33% faster rendering to linear color buffers */ - bool dst_is_linear = sctx->framebuffer.any_dst_linear; - bool out_of_order_rast = si_out_of_order_rasterization(sctx); - unsigned sc_mode_cntl_1 = - S_028A4C_WALK_SIZE(dst_is_linear) | - S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | - S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | - S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | - S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | - /* always 1: */ - S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | - S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | - S_028A4C_TILE_WALK_ORDER_ENABLE(1) | - S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | - S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | - S_028A4C_FORCE_EOV_REZ_ENABLE(1); - unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | - S_028804_INCOHERENT_EQAA_READS(1) | - S_028804_INTERPOLATE_COMP_Z(1) | - S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); - unsigned coverage_samples, color_samples, z_samples; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - /* S: Coverage samples (up to 16x): - * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) - * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) - * - * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): - * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) - * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) - * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or - * # from the closest defined sample if Z is uncompressed (same quality as the number of - * # Z samples). - * - * F: Color samples (up to 8x, must be <= coverage samples): - * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) - * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) - * - * Can be anything between coverage and color samples: - * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) - * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) - * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) - * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) - * # All are currently set the same as coverage samples. - * - * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" - * flag for undefined color samples. A shader-based resolve must handle unknowns - * or mask them out with AND. Unknowns can also be guessed from neighbors via - * an edge-detect shader-based resolve, which is required to make "color samples = 1" - * useful. The CB resolve always drops unknowns. - * - * Sensible AA configurations: - * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 8s 8z 8f = 8x MSAA - * EQAA 8s 8z 4f - might look the same as 8x MSAA - * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry - * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed - * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed - * EQAA 4s 4z 4f = 4x MSAA - * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry - * EQAA 2s 2z 2f = 2x MSAA - */ - if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { - coverage_samples = sctx->framebuffer.nr_samples; - color_samples = sctx->framebuffer.nr_color_samples; - - if (sctx->framebuffer.state.zsbuf) { - z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; - z_samples = MAX2(1, z_samples); - } else { - z_samples = coverage_samples; - } - } else if (sctx->smoothing_enabled) { - coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; - } else { - coverage_samples = color_samples = z_samples = 1; - } - - /* Required by OpenGL line rasterization. - * - * TODO: We should also enable perpendicular endcaps for AA lines, - * but that requires implementing line stippling in the pixel - * shader. SC can only do line stippling with axis-aligned - * endcaps. - */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); - unsigned sc_aa_config = 0; - - if (coverage_samples > 1) { - /* distance from the pixel center, indexed by log2(nr_samples) */ - static unsigned max_dist[] = { - 0, /* unused */ - 4, /* 2x MSAA */ - 6, /* 4x MSAA */ - 7, /* 8x MSAA */ - 8, /* 16x MSAA */ - }; - unsigned log_samples = util_logbase2(coverage_samples); - unsigned log_z_samples = util_logbase2(z_samples); - unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); - unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); - - sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); - sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | - S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); - - if (sctx->framebuffer.nr_samples > 1) { - db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | - S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | - S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | - S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); - sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); - } else if (sctx->smoothing_enabled) { - db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); - } - } - - unsigned initial_cdw = cs->current.cdw; - - /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ - radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, - SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl, - sc_aa_config); - /* R_028804_DB_EQAA */ - radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, - db_eqaa); - /* R_028A4C_PA_SC_MODE_CNTL_1 */ - radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, - SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - - if (initial_cdw != cs->current.cdw) { - sctx->context_roll = true; - - /* GFX9: Flush DFSM when the AA mode changes. */ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; + /* 33% faster rendering to linear color buffers */ + bool dst_is_linear = sctx->framebuffer.any_dst_linear; + bool out_of_order_rast = si_out_of_order_rasterization(sctx); + unsigned sc_mode_cntl_1 = + S_028A4C_WALK_SIZE(dst_is_linear) | + S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | + S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | + S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | + S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | + /* always 1: */ + S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | + S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | + S_028A4C_TILE_WALK_ORDER_ENABLE(1) | + S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | + S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + S_028A4C_FORCE_EOV_REZ_ENABLE(1); + unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_INCOHERENT_EQAA_READS(1) | + S_028804_INTERPOLATE_COMP_Z(1) | + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); + unsigned coverage_samples, color_samples, z_samples; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + /* S: Coverage samples (up to 16x): + * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) + * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) + * + * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): + * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) + * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) + * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or + * # from the closest defined sample if Z is uncompressed (same quality as the number of + * # Z samples). + * + * F: Color samples (up to 8x, must be <= coverage samples): + * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) + * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) + * + * Can be anything between coverage and color samples: + * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) + * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) + * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) + * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) + * # All are currently set the same as coverage samples. + * + * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" + * flag for undefined color samples. A shader-based resolve must handle unknowns + * or mask them out with AND. Unknowns can also be guessed from neighbors via + * an edge-detect shader-based resolve, which is required to make "color samples = 1" + * useful. The CB resolve always drops unknowns. + * + * Sensible AA configurations: + * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 8s 8z 8f = 8x MSAA + * EQAA 8s 8z 4f - might look the same as 8x MSAA + * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry + * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed + * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed + * EQAA 4s 4z 4f = 4x MSAA + * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry + * EQAA 2s 2z 2f = 2x MSAA + */ + if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { + coverage_samples = sctx->framebuffer.nr_samples; + color_samples = sctx->framebuffer.nr_color_samples; + + if (sctx->framebuffer.state.zsbuf) { + z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; + z_samples = MAX2(1, z_samples); + } else { + z_samples = coverage_samples; + } + } else if (sctx->smoothing_enabled) { + coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; + } else { + coverage_samples = color_samples = z_samples = 1; + } + + /* Required by OpenGL line rasterization. + * + * TODO: We should also enable perpendicular endcaps for AA lines, + * but that requires implementing line stippling in the pixel + * shader. SC can only do line stippling with axis-aligned + * endcaps. + */ + unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_aa_config = 0; + + if (coverage_samples > 1) { + /* distance from the pixel center, indexed by log2(nr_samples) */ + static unsigned max_dist[] = { + 0, /* unused */ + 4, /* 2x MSAA */ + 6, /* 4x MSAA */ + 7, /* 8x MSAA */ + 8, /* 16x MSAA */ + }; + unsigned log_samples = util_logbase2(coverage_samples); + unsigned log_z_samples = util_logbase2(z_samples); + unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); + unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); + + sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); + sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); + + if (sctx->framebuffer.nr_samples > 1) { + db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); + } else if (sctx->smoothing_enabled) { + db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); + } + } + + unsigned initial_cdw = cs->current.cdw; + + /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ + radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, + SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl, + sc_aa_config); + /* R_028804_DB_EQAA */ + radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, + db_eqaa); + /* R_028A4C_PA_SC_MODE_CNTL_1 */ + radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, + SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); + + if (initial_cdw != cs->current.cdw) { + sctx->context_roll = true; + + /* GFX9: Flush DFSM when the AA mode changes. */ + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + } } void si_update_ps_iter_samples(struct si_context *sctx) { - if (sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + if (sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); } static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - /* The hardware can only do sample shading with 2^n samples. */ - min_samples = util_next_power_of_two(min_samples); + /* The hardware can only do sample shading with 2^n samples. */ + min_samples = util_next_power_of_two(min_samples); - if (sctx->ps_iter_samples == min_samples) - return; + if (sctx->ps_iter_samples == min_samples) + return; - sctx->ps_iter_samples = min_samples; - sctx->do_update_shaders = true; + sctx->ps_iter_samples = min_samples; + sctx->do_update_shaders = true; - si_update_ps_iter_samples(sctx); + si_update_ps_iter_samples(sctx); } /* @@ -3780,114 +3787,107 @@ */ void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, - unsigned offset, unsigned size, - uint32_t *state) -{ - const struct util_format_description *desc; - unsigned stride; - unsigned num_records; - - desc = util_format_description(format); - stride = desc->block.bits / 8; - - num_records = size / stride; - num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); - - /* The NUM_RECORDS field has a different meaning depending on the chip, - * instruction type, STRIDE, and SWIZZLE_ENABLE. - * - * GFX6-7,10: - * - If STRIDE == 0, it's in byte units. - * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. - * - * GFX8: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. - * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. - * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- - * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when - * using SMEM. This can be done in the shader by clearing STRIDE with s_and. - * That way the same descriptor can be used by both SMEM and VMEM. - * - * GFX9: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. - * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. - */ - if (screen->info.chip_class == GFX9 && HAVE_LLVM < 0x0800) - /* When vindex == 0, LLVM < 8.0 sets IDXEN = 0, thus changing units - * from STRIDE to bytes. This works around it by setting - * NUM_RECORDS to at least the size of one element, so that - * the first element is readable when IDXEN == 0. - */ - num_records = num_records ? MAX2(num_records, stride) : 0; - else if (screen->info.chip_class == GFX8) - num_records *= stride; - - state[4] = 0; - state[5] = S_008F04_STRIDE(stride); - state[6] = num_records; - state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (screen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) - * - 1: index >= NUM_RECORDS - * - 2: NUM_RECORDS == 0 - * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS - * else: swizzle_address >= NUM_RECORDS - */ - state[7] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_OOB_SELECT(0) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - int first_non_void; - unsigned num_format, data_format; - - first_non_void = util_format_get_first_non_void_channel(format); - num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); - data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); - - state[7] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } + enum pipe_format format, + unsigned offset, unsigned size, + uint32_t *state) +{ + const struct util_format_description *desc; + unsigned stride; + unsigned num_records; + + desc = util_format_description(format); + stride = desc->block.bits / 8; + + num_records = size / stride; + num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); + + /* The NUM_RECORDS field has a different meaning depending on the chip, + * instruction type, STRIDE, and SWIZZLE_ENABLE. + * + * GFX6-7,10: + * - If STRIDE == 0, it's in byte units. + * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. + * + * GFX8: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. + * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. + * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- + * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when + * using SMEM. This can be done in the shader by clearing STRIDE with s_and. + * That way the same descriptor can be used by both SMEM and VMEM. + * + * GFX9: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. + * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. + */ + if (screen->info.chip_class == GFX8) + num_records *= stride; + + state[4] = 0; + state[5] = S_008F04_STRIDE(stride); + state[6] = num_records; + state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (screen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS + * else: swizzle_address >= NUM_RECORDS + */ + state[7] |= S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + int first_non_void; + unsigned num_format, data_format; + + first_non_void = util_format_get_first_non_void_channel(format); + num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); + data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); + + state[7] |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); + } } static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4]) { - unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - if (swizzle[3] == PIPE_SWIZZLE_X) { - /* For the pre-defined border color values (white, opaque - * black, transparent black), the only thing that matters is - * that the alpha channel winds up in the correct place - * (because the RGB channels are all the same) so either of - * these enumerations will work. - */ - if (swizzle[2] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; - else - bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; - } else if (swizzle[0] == PIPE_SWIZZLE_X) { - if (swizzle[1] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - else - bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; - } else if (swizzle[1] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; - } else if (swizzle[2] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; - } + if (swizzle[3] == PIPE_SWIZZLE_X) { + /* For the pre-defined border color values (white, opaque + * black, transparent black), the only thing that matters is + * that the alpha channel winds up in the correct place + * (because the RGB channels are all the same) so either of + * these enumerations will work. + */ + if (swizzle[2] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; + else + bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; + } else if (swizzle[0] == PIPE_SWIZZLE_X) { + if (swizzle[1] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + else + bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; + } else if (swizzle[1] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; + } else if (swizzle[2] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; + } - return bc_swizzle; + return bc_swizzle; } /** @@ -3895,190 +3895,190 @@ */ static void gfx10_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) -{ - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned img_format; - unsigned char swizzle[4]; - unsigned type; - uint64_t va; - - desc = util_format_description(pipe_format); - img_format = gfx10_format_table[pipe_format].img_format; - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - bool is_stencil = false; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - is_stencil = true; - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. - */ - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - is_stencil = true; - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; - } - - if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); - img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY)) { - /* For the purpose of shader images, treat cube maps as 2D - * arrays. - */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - } else { - type = si_tex_dim(screen, tex, target, res->nr_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = S_00A004_FORMAT(img_format) | - S_00A004_WIDTH_LO(width - 1); - state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? - 0 : first_level) | - S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? - util_logbase2(res->nr_samples) : - last_level) | - S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | - S_00A00C_TYPE(type); - /* Depth is the the last accessible layer on gfx9+. The hw doesn't need - * to know the total number of layers. - */ - state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) - ? depth - 1 : last_layer) | - S_00A010_BASE_ARRAY(first_layer); - state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | - S_00A014_MAX_MIP(res->nr_samples > 1 ? - util_logbase2(res->nr_samples) : - tex->buffer.b.b.last_level) | - S_00A014_PERF_MOD(4); - state[6] = 0; - state[7] = 0; - - if (tex->dcc_offset) { - state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } - - /* Initialize the sampler view for FMASK. */ - if (tex->fmask_offset) { - uint32_t format; + struct si_texture *tex, + bool sampler, + enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state) +{ + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned img_format; + unsigned char swizzle[4]; + unsigned type; + uint64_t va; + + desc = util_format_description(pipe_format); + img_format = gfx10_format_table[pipe_format].img_format; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + bool is_stencil = false; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + is_stencil = true; + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. + */ + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + is_stencil = true; + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; + } + + if (tex->upgraded_depth && !is_stencil) { + assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); + img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + if (!sampler && + (res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY)) { + /* For the purpose of shader images, treat cube maps as 2D + * arrays. + */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + } else { + type = si_tex_dim(screen, tex, target, res->nr_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || + type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = S_00A004_FORMAT(img_format) | + S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | + S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? + 0 : first_level) | + S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? + util_logbase2(res->nr_samples) : + last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | + S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) + ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | + S_00A014_MAX_MIP(res->nr_samples > 1 ? + util_logbase2(res->nr_samples) : + tex->buffer.b.b.last_level) | + S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + + if (tex->surface.dcc_offset) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } + + /* Initialize the sampler view for FMASK. */ + if (tex->surface.fmask_offset) { + uint32_t format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2,1): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2,2): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4,1): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4,2): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4,4): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8,1): + format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8,2): + format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8,4): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8,8): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16,1): + format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16,2): + format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16,4): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16,8): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | - S_00A004_FORMAT(format) | - S_00A004_WIDTH_LO(width - 1); - fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = S_00A010_DEPTH(last_layer) | - S_00A010_BASE_ARRAY(first_layer); - fmask_state[5] = 0; - fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); - fmask_state[7] = 0; - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | + S_00A004_FORMAT(format) | + S_00A004_WIDTH_LO(width - 1); + fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | + S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = S_00A010_DEPTH(last_layer) | + S_00A010_BASE_ARRAY(first_layer); + fmask_state[5] = 0; + fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); + fmask_state[7] = 0; + } } /** @@ -4086,745 +4086,747 @@ */ static void si_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) -{ - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned char swizzle[4]; - int first_non_void; - unsigned num_format, data_format, type, num_samples; - uint64_t va; - - desc = util_format_description(pipe_format); - - num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? - MAX2(1, res->nr_samples) : - MAX2(1, res->nr_storage_samples); - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. - */ - if (screen->info.chip_class <= GFX8) - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - else - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - first_non_void = util_format_get_first_non_void_channel(pipe_format); - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - default: - if (first_non_void < 0) { - if (util_format_is_compressed(pipe_format)) { - switch (pipe_format) { - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - case PIPE_FORMAT_DXT3_SRGBA: - case PIPE_FORMAT_DXT5_SRGBA: - case PIPE_FORMAT_BPTC_SRGBA: - case PIPE_FORMAT_ETC2_SRGB8: - case PIPE_FORMAT_ETC2_SRGB8A1: - case PIPE_FORMAT_ETC2_SRGBA8: - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - break; - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - /* implies float, so use SNORM/UNORM to determine - whether data is signed or not */ - case PIPE_FORMAT_BPTC_RGB_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - break; - default: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - } - } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - } - } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - break; - case UTIL_FORMAT_TYPE_SIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_SINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_USCALED; - } - } - } - - data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); - if (data_format == ~0) { - data_format = 0; - } - - /* S8 with Z32 HTILE needs a special format. */ - if (screen->info.chip_class == GFX9 && - pipe_format == PIPE_FORMAT_S8_UINT && - tex->tc_compatible_htile) - data_format = V_008F14_IMG_DATA_FORMAT_S8_32; - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY || - (screen->info.chip_class <= GFX8 && - res->target == PIPE_TEXTURE_3D))) { - /* For the purpose of shader images, treat cube maps and 3D - * textures as 2D arrays. For 3D textures, the address - * calculations for mipmaps are different, so we rely on the - * caller to effectively disable mipmaps. - */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - - assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); - } else { - type = si_tex_dim(screen, tex, target, num_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = (S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format)); - state[2] = (S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1) | - S_008F18_PERF_MOD(4)); - state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | - S_008F1C_LAST_LEVEL(num_samples > 1 ? - util_logbase2(num_samples) : - last_level) | - S_008F1C_TYPE(type)); - state[4] = 0; - state[5] = S_008F24_BASE_ARRAY(first_layer); - state[6] = 0; - state[7] = 0; - - if (screen->info.chip_class == GFX9) { - unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); - - /* Depth is the the last accessible layer on Gfx9. - * The hw doesn't need to know the total number of layers. - */ - if (type == V_008F1C_SQ_RSRC_IMG_3D) - state[4] |= S_008F20_DEPTH(depth - 1); - else - state[4] |= S_008F20_DEPTH(last_layer); - - state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); - state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? - util_logbase2(num_samples) : - tex->buffer.b.b.last_level); - } else { - state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); - state[4] |= S_008F20_DEPTH(depth - 1); - state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - - if (tex->dcc_offset) { - state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } else { - /* The last dword is unused by hw. The shader uses it to clear - * bits in the first dword of sampler state. - */ - if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { - if (first_level == last_level) - state[7] = C_008F30_MAX_ANISO_RATIO; - else - state[7] = 0xffffffff; - } - } - - /* Initialize the sampler view for FMASK. */ - if (tex->fmask_offset) { - uint32_t data_format, num_format; + struct si_texture *tex, + bool sampler, + enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state) +{ + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned char swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type, num_samples; + uint64_t va; + + desc = util_format_description(pipe_format); + + num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? + MAX2(1, res->nr_samples) : + MAX2(1, res->nr_storage_samples); + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. + */ + if (screen->info.chip_class <= GFX8) + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + else + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + first_non_void = util_format_get_first_non_void_channel(pipe_format); + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + default: + if (first_non_void < 0) { + if (util_format_is_compressed(pipe_format)) { + switch (pipe_format) { + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + case PIPE_FORMAT_DXT3_SRGBA: + case PIPE_FORMAT_DXT5_SRGBA: + case PIPE_FORMAT_BPTC_SRGBA: + case PIPE_FORMAT_ETC2_SRGB8: + case PIPE_FORMAT_ETC2_SRGB8A1: + case PIPE_FORMAT_ETC2_SRGBA8: + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + break; + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + /* implies float, so use SNORM/UNORM to determine + whether data is signed or not */ + case PIPE_FORMAT_BPTC_RGB_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + break; + default: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + } + } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + } + } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_SINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_USCALED; + } + } + } + + data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + /* S8 with Z32 HTILE needs a special format. */ + if (screen->info.chip_class == GFX9 && + pipe_format == PIPE_FORMAT_S8_UINT && + tex->tc_compatible_htile) + data_format = V_008F14_IMG_DATA_FORMAT_S8_32; + + if (!sampler && + (res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY || + (screen->info.chip_class <= GFX8 && + res->target == PIPE_TEXTURE_3D))) { + /* For the purpose of shader images, treat cube maps and 3D + * textures as 2D arrays. For 3D textures, the address + * calculations for mipmaps are different, so we rely on the + * caller to effectively disable mipmaps. + */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + + assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); + } else { + type = si_tex_dim(screen, tex, target, num_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || + type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = (S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1) | + S_008F18_PERF_MOD(4)); + state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | + S_008F1C_LAST_LEVEL(num_samples > 1 ? + util_logbase2(num_samples) : + last_level) | + S_008F1C_TYPE(type)); + state[4] = 0; + state[5] = S_008F24_BASE_ARRAY(first_layer); + state[6] = 0; + state[7] = 0; + + if (screen->info.chip_class == GFX9) { + unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); + + /* Depth is the the last accessible layer on Gfx9. + * The hw doesn't need to know the total number of layers. + */ + if (type == V_008F1C_SQ_RSRC_IMG_3D) + state[4] |= S_008F20_DEPTH(depth - 1); + else + state[4] |= S_008F20_DEPTH(last_layer); + + state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); + state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? + util_logbase2(num_samples) : + tex->buffer.b.b.last_level); + } else { + state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); + state[4] |= S_008F20_DEPTH(depth - 1); + state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + + if (tex->surface.dcc_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } else { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + + /* Initialize the sampler view for FMASK. */ + if (tex->surface.fmask_offset) { + uint32_t data_format, num_format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - if (screen->info.chip_class == GFX9) { - data_format = V_008F14_IMG_DATA_FORMAT_FMASK; - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - num_format = V_008F14_IMG_FMASK_8_2_1; - break; - case FMASK(2,2): - num_format = V_008F14_IMG_FMASK_8_2_2; - break; - case FMASK(4,1): - num_format = V_008F14_IMG_FMASK_8_4_1; - break; - case FMASK(4,2): - num_format = V_008F14_IMG_FMASK_8_4_2; - break; - case FMASK(4,4): - num_format = V_008F14_IMG_FMASK_8_4_4; - break; - case FMASK(8,1): - num_format = V_008F14_IMG_FMASK_8_8_1; - break; - case FMASK(8,2): - num_format = V_008F14_IMG_FMASK_16_8_2; - break; - case FMASK(8,4): - num_format = V_008F14_IMG_FMASK_32_8_4; - break; - case FMASK(8,8): - num_format = V_008F14_IMG_FMASK_32_8_8; - break; - case FMASK(16,1): - num_format = V_008F14_IMG_FMASK_16_16_1; - break; - case FMASK(16,2): - num_format = V_008F14_IMG_FMASK_32_16_2; - break; - case FMASK(16,4): - num_format = V_008F14_IMG_FMASK_64_16_4; - break; - case FMASK(16,8): - num_format = V_008F14_IMG_FMASK_64_16_8; - break; - default: - unreachable("invalid nr_samples"); - } - } else { - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - } + if (screen->info.chip_class == GFX9) { + data_format = V_008F14_IMG_DATA_FORMAT_FMASK; + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2,1): + num_format = V_008F14_IMG_FMASK_8_2_1; + break; + case FMASK(2,2): + num_format = V_008F14_IMG_FMASK_8_2_2; + break; + case FMASK(4,1): + num_format = V_008F14_IMG_FMASK_8_4_1; + break; + case FMASK(4,2): + num_format = V_008F14_IMG_FMASK_8_4_2; + break; + case FMASK(4,4): + num_format = V_008F14_IMG_FMASK_8_4_4; + break; + case FMASK(8,1): + num_format = V_008F14_IMG_FMASK_8_8_1; + break; + case FMASK(8,2): + num_format = V_008F14_IMG_FMASK_16_8_2; + break; + case FMASK(8,4): + num_format = V_008F14_IMG_FMASK_32_8_4; + break; + case FMASK(8,8): + num_format = V_008F14_IMG_FMASK_32_8_8; + break; + case FMASK(16,1): + num_format = V_008F14_IMG_FMASK_16_16_1; + break; + case FMASK(16,2): + num_format = V_008F14_IMG_FMASK_32_16_2; + break; + case FMASK(16,4): + num_format = V_008F14_IMG_FMASK_64_16_4; + break; + case FMASK(16,8): + num_format = V_008F14_IMG_FMASK_64_16_8; + break; + default: + unreachable("invalid nr_samples"); + } + } else { + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2,1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2,2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4,1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4,2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4,4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8,1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8,2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8,4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8,8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16,1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16,2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16,4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16,8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format); - fmask_state[2] = S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1); - fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = 0; - fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); - fmask_state[6] = 0; - fmask_state[7] = 0; - - if (screen->info.chip_class == GFX9) { - fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); - fmask_state[4] |= S_008F20_DEPTH(last_layer) | - S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); - fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); - } else { - fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); - fmask_state[4] |= S_008F20_DEPTH(depth - 1) | - S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); - fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format); + fmask_state[2] = S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1); + fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = 0; + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + + if (screen->info.chip_class == GFX9) { + fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); + fmask_state[4] |= S_008F20_DEPTH(last_layer) | + S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); + fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); + } else { + fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); + fmask_state[4] |= S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); + fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + } } /** * Create a sampler view. * - * @param ctx context - * @param texture texture - * @param state sampler view template - * @param width0 width0 override (for compressed textures as int) - * @param height0 height0 override (for compressed textures as int) + * @param ctx context + * @param texture texture + * @param state sampler view template + * @param width0 width0 override (for compressed textures as int) + * @param height0 height0 override (for compressed textures as int) * @param force_level set the base address to the level (for compressed textures) */ struct pipe_sampler_view * si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) -{ - struct si_context *sctx = (struct si_context*)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); - struct si_texture *tex = (struct si_texture*)texture; - unsigned base_level, first_level, last_level; - unsigned char state_swizzle[4]; - unsigned height, depth, width; - unsigned last_layer = state->u.tex.last_layer; - enum pipe_format pipe_format; - const struct legacy_surf_level *surflevel; - - if (!view) - return NULL; - - /* initialize base object */ - view->base = *state; - view->base.texture = NULL; - view->base.reference.count = 1; - view->base.context = ctx; - - assert(texture); - pipe_resource_reference(&view->base.texture, texture); - - if (state->format == PIPE_FORMAT_X24S8_UINT || - state->format == PIPE_FORMAT_S8X24_UINT || - state->format == PIPE_FORMAT_X32_S8X24_UINT || - state->format == PIPE_FORMAT_S8_UINT) - view->is_stencil_sampler = true; - - /* Buffer resource. */ - if (texture->target == PIPE_BUFFER) { - si_make_buffer_descriptor(sctx->screen, - si_resource(texture), - state->format, - state->u.buf.offset, - state->u.buf.size, - view->state); - return &view->base; - } - - state_swizzle[0] = state->swizzle_r; - state_swizzle[1] = state->swizzle_g; - state_swizzle[2] = state->swizzle_b; - state_swizzle[3] = state->swizzle_a; - - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (sctx->chip_class <= GFX8 && force_level) { - assert(force_level == first_level && - force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); - } - - /* This is not needed if state trackers set last_layer correctly. */ - if (state->target == PIPE_TEXTURE_1D || - state->target == PIPE_TEXTURE_2D || - state->target == PIPE_TEXTURE_RECT || - state->target == PIPE_TEXTURE_CUBE) - last_layer = state->u.tex.first_layer; - - /* Texturing with separate depth and stencil. */ - pipe_format = state->format; - - /* Depth/stencil texturing sometimes needs separate texture. */ - if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { - if (!tex->flushed_depth_texture && - !si_init_flushed_depth_texture(ctx, texture)) { - pipe_resource_reference(&view->base.texture, NULL); - FREE(view); - return NULL; - } - - assert(tex->flushed_depth_texture); - - /* Override format for the case where the flushed texture - * contains only Z or only S. - */ - if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) - pipe_format = tex->flushed_depth_texture->buffer.b.b.format; - - tex = tex->flushed_depth_texture; - } - - surflevel = tex->surface.u.legacy.level; - - if (tex->db_compatible) { - if (!view->is_stencil_sampler) - pipe_format = tex->db_render_format; - - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Z24 is always stored like this for DB - * compatibility. - */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - case PIPE_FORMAT_X32_S8X24_UINT: - pipe_format = PIPE_FORMAT_S8_UINT; - surflevel = tex->surface.u.legacy.stencil_level; - break; - default:; - } - } - - view->dcc_incompatible = - vi_dcc_formats_are_incompatible(texture, - state->u.tex.first_level, - state->format); - - sctx->screen->make_texture_descriptor(sctx->screen, tex, true, - state->target, pipe_format, state_swizzle, - first_level, last_level, - state->u.tex.first_layer, last_layer, - width, height, depth, - view->state, view->fmask_state); - - const struct util_format_description *desc = util_format_description(pipe_format); - view->is_integer = false; - - for (unsigned i = 0; i < desc->nr_channels; ++i) { - if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) - continue; - - /* Whether the number format is {U,S}{SCALED,INT} */ - view->is_integer = - (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || - desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && - (desc->channel[i].pure_integer || !desc->channel[i].normalized); - break; - } - - view->base_level_info = &surflevel[base_level]; - view->base_level = base_level; - view->block_width = util_format_get_blockwidth(pipe_format); - return &view->base; + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) +{ + struct si_context *sctx = (struct si_context*)ctx; + struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_texture *tex = (struct si_texture*)texture; + unsigned base_level, first_level, last_level; + unsigned char state_swizzle[4]; + unsigned height, depth, width; + unsigned last_layer = state->u.tex.last_layer; + enum pipe_format pipe_format; + const struct legacy_surf_level *surflevel; + + if (!view) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + view->base.reference.count = 1; + view->base.context = ctx; + + assert(texture); + pipe_resource_reference(&view->base.texture, texture); + + if (state->format == PIPE_FORMAT_X24S8_UINT || + state->format == PIPE_FORMAT_S8X24_UINT || + state->format == PIPE_FORMAT_X32_S8X24_UINT || + state->format == PIPE_FORMAT_S8_UINT) + view->is_stencil_sampler = true; + + /* Buffer resource. */ + if (texture->target == PIPE_BUFFER) { + si_make_buffer_descriptor(sctx->screen, + si_resource(texture), + state->format, + state->u.buf.offset, + state->u.buf.size, + view->state); + return &view->base; + } + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; + + if (sctx->chip_class <= GFX8 && force_level) { + assert(force_level == first_level && + force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + /* This is not needed if state trackers set last_layer correctly. */ + if (state->target == PIPE_TEXTURE_1D || + state->target == PIPE_TEXTURE_2D || + state->target == PIPE_TEXTURE_RECT || + state->target == PIPE_TEXTURE_CUBE) + last_layer = state->u.tex.first_layer; + + /* Texturing with separate depth and stencil. */ + pipe_format = state->format; + + /* Depth/stencil texturing sometimes needs separate texture. */ + if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { + if (!tex->flushed_depth_texture && + !si_init_flushed_depth_texture(ctx, texture)) { + pipe_resource_reference(&view->base.texture, NULL); + FREE(view); + return NULL; + } + + assert(tex->flushed_depth_texture); + + /* Override format for the case where the flushed texture + * contains only Z or only S. + */ + if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) + pipe_format = tex->flushed_depth_texture->buffer.b.b.format; + + tex = tex->flushed_depth_texture; + } + + surflevel = tex->surface.u.legacy.level; + + if (tex->db_compatible) { + if (!view->is_stencil_sampler) + pipe_format = tex->db_render_format; + + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Z24 is always stored like this for DB + * compatibility. + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + case PIPE_FORMAT_X32_S8X24_UINT: + pipe_format = PIPE_FORMAT_S8_UINT; + surflevel = tex->surface.u.legacy.stencil_level; + break; + default:; + } + } + + view->dcc_incompatible = + vi_dcc_formats_are_incompatible(texture, + state->u.tex.first_level, + state->format); + + sctx->screen->make_texture_descriptor(sctx->screen, tex, true, + state->target, pipe_format, state_swizzle, + first_level, last_level, + state->u.tex.first_layer, last_layer, + width, height, depth, + view->state, view->fmask_state); + + const struct util_format_description *desc = util_format_description(pipe_format); + view->is_integer = false; + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) + continue; + + /* Whether the number format is {U,S}{SCALED,INT} */ + view->is_integer = + (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || + desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && + (desc->channel[i].pure_integer || !desc->channel[i].normalized); + break; + } + + view->base_level_info = &surflevel[base_level]; + view->base_level = base_level; + view->block_width = util_format_get_blockwidth(pipe_format); + return &view->base; } static struct pipe_sampler_view * si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) + struct pipe_resource *texture, + const struct pipe_sampler_view *state) { - return si_create_sampler_view_custom(ctx, texture, state, - texture ? texture->width0 : 0, - texture ? texture->height0 : 0, 0); + return si_create_sampler_view_custom(ctx, texture, state, + texture ? texture->width0 : 0, + texture ? texture->height0 : 0, 0); } static void si_sampler_view_destroy(struct pipe_context *ctx, - struct pipe_sampler_view *state) + struct pipe_sampler_view *state) { - struct si_sampler_view *view = (struct si_sampler_view *)state; + struct si_sampler_view *view = (struct si_sampler_view *)state; - pipe_resource_reference(&state->texture, NULL); - FREE(view); + pipe_resource_reference(&state->texture, NULL); + FREE(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) { - return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || - (linear_filter && - (wrap == PIPE_TEX_WRAP_CLAMP || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); + return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || + wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || + (linear_filter && + (wrap == PIPE_TEX_WRAP_CLAMP || + wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); } static uint32_t si_translate_border_color(struct si_context *sctx, - const struct pipe_sampler_state *state, - const union pipe_color_union *color, - bool is_integer) -{ - bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || - state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; - - if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + const struct pipe_sampler_state *state, + const union pipe_color_union *color, + bool is_integer) +{ + bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || + state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; + + if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); #define simple_border_types(elt) \ do { \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 0) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ - if (color->elt[0] == 1 && color->elt[1] == 1 && \ - color->elt[2] == 1 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ + if (color->elt[0] == 0 && color->elt[1] == 0 && \ + color->elt[2] == 0 && color->elt[3] == 0) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ + if (color->elt[0] == 0 && color->elt[1] == 0 && \ + color->elt[2] == 0 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ + if (color->elt[0] == 1 && color->elt[1] == 1 && \ + color->elt[2] == 1 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ } while (false) - if (is_integer) - simple_border_types(ui); - else - simple_border_types(f); + if (is_integer) + simple_border_types(ui); + else + simple_border_types(f); #undef simple_border_types - int i; + int i; - /* Check if the border has been uploaded already. */ - for (i = 0; i < sctx->border_color_count; i++) - if (memcmp(&sctx->border_color_table[i], color, - sizeof(*color)) == 0) - break; - - if (i >= SI_MAX_BORDER_COLORS) { - /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. " - "Please file a bug.\n"); - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); - } - - if (i == sctx->border_color_count) { - /* Upload a new border color. */ - memcpy(&sctx->border_color_table[i], color, - sizeof(*color)); - util_memcpy_cpu_to_le32(&sctx->border_color_map[i], - color, sizeof(*color)); - sctx->border_color_count++; - } + /* Check if the border has been uploaded already. */ + for (i = 0; i < sctx->border_color_count; i++) + if (memcmp(&sctx->border_color_table[i], color, + sizeof(*color)) == 0) + break; + + if (i >= SI_MAX_BORDER_COLORS) { + /* Getting 4096 unique border colors is very unlikely. */ + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "Please file a bug.\n"); + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + } + + if (i == sctx->border_color_count) { + /* Upload a new border color. */ + memcpy(&sctx->border_color_table[i], color, + sizeof(*color)); + util_memcpy_cpu_to_le32(&sctx->border_color_map[i], + color, sizeof(*color)); + sctx->border_color_count++; + } - return S_008F3C_BORDER_COLOR_PTR(i) | - S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); + return S_008F3C_BORDER_COLOR_PTR(i) | + S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); } static inline int S_FIXED(float value, unsigned frac_bits) { - return value * (1 << frac_bits); + return value * (1 << frac_bits); } static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso) { - if (filter == PIPE_TEX_FILTER_LINEAR) - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR - : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; - else - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT - : V_008F38_SQ_TEX_XY_FILTER_POINT; + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR + : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT + : V_008F38_SQ_TEX_XY_FILTER_POINT; } static inline unsigned si_tex_aniso_filter(unsigned filter) { - if (filter < 2) - return 0; - if (filter < 4) - return 1; - if (filter < 8) - return 2; - if (filter < 16) - return 3; - return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static void *si_create_sampler_state(struct pipe_context *ctx, - const struct pipe_sampler_state *state) + const struct pipe_sampler_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = sctx->screen; - struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); - unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso - : state->max_anisotropy; - unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); - union pipe_color_union clamped_border_color; - - if (!rstate) { - return NULL; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); + unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso + : state->max_anisotropy; + unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); + bool trunc_coord = state->min_img_filter == PIPE_TEX_FILTER_NEAREST && + state->mag_img_filter == PIPE_TEX_FILTER_NEAREST && + state->compare_mode == PIPE_TEX_COMPARE_NONE; + union pipe_color_union clamped_border_color; + + if (!rstate) { + return NULL; + } #ifndef NDEBUG - rstate->magic = SI_SAMPLER_STATE_MAGIC; + rstate->magic = SI_SAMPLER_STATE_MAGIC; #endif - rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | - S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | - S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | - S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | - S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | - S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | - S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | - S_008F30_ANISO_BIAS(max_aniso_ratio) | - S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | - S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); - rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | - S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | - S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); - rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | - S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | - S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | - S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | - S_008F38_MIP_POINT_PRECLAMP(0)); - rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); - - if (sscreen->info.chip_class >= GFX10) { - rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); - } else { - rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | - S_008F38_FILTER_PREC_FIX(1) | - S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); - } - - /* Create sampler resource for integer textures. */ - memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); - rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); - - /* Create sampler resource for upgraded depth textures. */ - memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); - - for (unsigned i = 0; i < 4; ++i) { - /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE - * when the border color is 1.0. */ - clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); - } - - if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { - if (sscreen->info.chip_class <= GFX9) - rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); - } else { - rstate->upgraded_depth_val[3] = - si_translate_border_color(sctx, state, &clamped_border_color, false); - } + rstate->val[0] = + (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | + S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | + S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | + S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | + S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | + S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | + S_008F30_TRUNC_COORD(trunc_coord) | + S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); + rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | + S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | + S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); + rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | + S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | + S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | + S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | + S_008F38_MIP_POINT_PRECLAMP(0)); + rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); + + if (sscreen->info.chip_class >= GFX10) { + rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); + } else { + rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | + S_008F38_FILTER_PREC_FIX(1) | + S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); + } + + /* Create sampler resource for integer textures. */ + memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); + rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); + + /* Create sampler resource for upgraded depth textures. */ + memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); + + for (unsigned i = 0; i < 4; ++i) { + /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE + * when the border color is 1.0. */ + clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); + } + + if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { + if (sscreen->info.chip_class <= GFX9) + rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); + } else { + rstate->upgraded_depth_val[3] = + si_translate_border_color(sctx, state, &clamped_border_color, false); + } - return rstate; + return rstate; } static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->sample_mask == (uint16_t)sample_mask) - return; + if (sctx->sample_mask == (uint16_t)sample_mask) + return; - sctx->sample_mask = sample_mask; - si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); + sctx->sample_mask = sample_mask; + si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); } static void si_emit_sample_mask(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->sample_mask; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->sample_mask; - /* Needed for line and polygon smoothing as well as for the Polaris - * small primitive filter. We expect the state tracker to take care of - * this for us. - */ - assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || - (mask & 1 && sctx->blitter->running)); - - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + /* Needed for line and polygon smoothing as well as for the Polaris + * small primitive filter. We expect the state tracker to take care of + * this for us. + */ + assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || + (mask & 1 && sctx->blitter->running)); + + radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(cs, mask | (mask << 16)); + radeon_emit(cs, mask | (mask << 16)); } static void si_delete_sampler_state(struct pipe_context *ctx, void *state) { #ifndef NDEBUG - struct si_sampler_state *s = state; + struct si_sampler_state *s = state; - assert(s->magic == SI_SAMPLER_STATE_MAGIC); - s->magic = 0; + assert(s->magic == SI_SAMPLER_STATE_MAGIC); + s->magic = 0; #endif - free(state); + free(state); } /* @@ -4834,331 +4836,344 @@ struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits) { - struct util_fast_udiv_info info = - util_compute_fast_udiv_info(D, num_bits, 32); + struct util_fast_udiv_info info = + util_compute_fast_udiv_info(D, num_bits, 32); - struct si_fast_udiv_info32 result = { - info.multiplier, - info.pre_shift, - info.post_shift, - info.increment, - }; - return result; + struct si_fast_udiv_info32 result = { + info.multiplier, + info.pre_shift, + info.post_shift, + info.increment, + }; + return result; } static void *si_create_vertex_elements(struct pipe_context *ctx, - unsigned count, - const struct pipe_vertex_element *elements) + unsigned count, + const struct pipe_vertex_element *elements) { - struct si_screen *sscreen = (struct si_screen*)ctx->screen; - struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); - bool used[SI_NUM_VERTEX_BUFFERS] = {}; - struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; - STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); - STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); - int i; - - assert(count <= SI_MAX_ATTRIBS); - if (!v) - return NULL; - - v->count = count; - v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT); - - for (i = 0; i < count; ++i) { - const struct util_format_description *desc; - const struct util_format_channel_description *channel; - int first_non_void; - unsigned vbo_index = elements[i].vertex_buffer_index; - - if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { - FREE(v); - return NULL; - } - - unsigned instance_divisor = elements[i].instance_divisor; - if (instance_divisor) { - v->uses_instance_divisors = true; - - if (instance_divisor == 1) { - v->instance_divisor_is_one |= 1u << i; - } else { - v->instance_divisor_is_fetched |= 1u << i; - divisor_factors[i] = - si_compute_fast_udiv_info32(instance_divisor, 32); - } - } - - if (!used[vbo_index]) { - v->first_vb_use_mask |= 1 << i; - used[vbo_index] = true; - } - - desc = util_format_description(elements[i].src_format); - first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); - channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; - - v->format_size[i] = desc->block.bits / 8; - v->src_offset[i] = elements[i].src_offset; - v->vertex_buffer_index[i] = vbo_index; - - bool always_fix = false; - union si_vs_fix_fetch fix_fetch; - unsigned log_hw_load_size; /* the load element size as seen by the hardware */ - - fix_fetch.bits = 0; - log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); - - if (channel) { - switch (channel->type) { - case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; - case UTIL_FORMAT_TYPE_SIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_SINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; - break; - } - case UTIL_FORMAT_TYPE_UNSIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_UINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; - break; - } - default: unreachable("bad format type"); - } - } else { - switch (elements[i].src_format) { - case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - default: unreachable("bad other format"); - } - } - - if (desc->channel[0].size == 10) { - fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ - log_hw_load_size = 2; - - /* The hardware always treats the 2-bit alpha channel as - * unsigned, so a shader workaround is needed. The affected - * chips are GFX8 and older except Stoney (GFX8.1). - */ - always_fix = sscreen->info.chip_class <= GFX8 && - sscreen->info.family != CHIP_STONEY && - channel->type == UTIL_FORMAT_TYPE_SIGNED; - } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { - fix_fetch.u.log_size = 3; /* special encoding */ - fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; - log_hw_load_size = 2; - } else { - fix_fetch.u.log_size = util_logbase2(channel->size) - 3; - fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; - - /* Always fix up: - * - doubles (multiple loads + truncate to float) - * - 32-bit requiring a conversion - */ - always_fix = - (fix_fetch.u.log_size == 3) || - (fix_fetch.u.log_size == 2 && - fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && - fix_fetch.u.format != AC_FETCH_FORMAT_UINT && - fix_fetch.u.format != AC_FETCH_FORMAT_SINT); - - /* Also fixup 8_8_8 and 16_16_16. */ - if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { - always_fix = true; - log_hw_load_size = fix_fetch.u.log_size; - } - } - - if (desc->swizzle[0] != PIPE_SWIZZLE_X) { - assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && - (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); - fix_fetch.u.reverse = 1; - } - - /* Force the workaround for unaligned access here already if the - * offset relative to the vertex buffer base is unaligned. - * - * There is a theoretical case in which this is too conservative: - * if the vertex buffer's offset is also unaligned in just the - * right way, we end up with an aligned address after all. - * However, this case should be extremely rare in practice (it - * won't happen in well-behaved applications), and taking it - * into account would complicate the fast path (where everything - * is nicely aligned). - */ - bool check_alignment = - log_hw_load_size >= 1 && - (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10); - bool opencode = sscreen->options.vs_fetch_always_opencode; - - if (check_alignment && - (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) - opencode = true; - - if (always_fix || check_alignment || opencode) - v->fix_fetch[i] = fix_fetch.bits; - - if (opencode) - v->fix_fetch_opencode |= 1 << i; - if (opencode || always_fix) - v->fix_fetch_always |= 1 << i; - - if (check_alignment && !opencode) { - assert(log_hw_load_size == 1 || log_hw_load_size == 2); - - v->fix_fetch_unaligned |= 1 << i; - v->hw_load_is_dword |= (log_hw_load_size - 1) << i; - v->vb_alignment_check_mask |= 1 << vbo_index; - } - - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = - &gfx10_format_table[elements[i].src_format]; - assert(fmt->img_format != 0 && fmt->img_format < 128); - v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - unsigned data_format, num_format; - data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); - v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } - } - - if (v->instance_divisor_is_fetched) { - unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); - - v->instance_divisor_factor_buffer = - (struct si_resource*) - pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, - num_divisors * sizeof(divisor_factors[0])); - if (!v->instance_divisor_factor_buffer) { - FREE(v); - return NULL; - } - void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, - NULL, PIPE_TRANSFER_WRITE); - memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0])); - } - return v; + struct si_screen *sscreen = (struct si_screen*)ctx->screen; + struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); + bool used[SI_NUM_VERTEX_BUFFERS] = {}; + struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; + STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); + STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); + int i; + + assert(count <= SI_MAX_ATTRIBS); + if (!v) + return NULL; + + v->count = count; + + unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ? + count - sscreen->num_vbos_in_user_sgprs : 0; + v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); + + for (i = 0; i < count; ++i) { + const struct util_format_description *desc; + const struct util_format_channel_description *channel; + int first_non_void; + unsigned vbo_index = elements[i].vertex_buffer_index; + + if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { + FREE(v); + return NULL; + } + + unsigned instance_divisor = elements[i].instance_divisor; + if (instance_divisor) { + v->uses_instance_divisors = true; + + if (instance_divisor == 1) { + v->instance_divisor_is_one |= 1u << i; + } else { + v->instance_divisor_is_fetched |= 1u << i; + divisor_factors[i] = + si_compute_fast_udiv_info32(instance_divisor, 32); + } + } + + if (!used[vbo_index]) { + v->first_vb_use_mask |= 1 << i; + used[vbo_index] = true; + } + + desc = util_format_description(elements[i].src_format); + first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); + channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; + + v->format_size[i] = desc->block.bits / 8; + v->src_offset[i] = elements[i].src_offset; + v->vertex_buffer_index[i] = vbo_index; + + bool always_fix = false; + union si_vs_fix_fetch fix_fetch; + unsigned log_hw_load_size; /* the load element size as seen by the hardware */ + + fix_fetch.bits = 0; + log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); + + if (channel) { + switch (channel->type) { + case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; + case UTIL_FORMAT_TYPE_SIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_SINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; + break; + } + case UTIL_FORMAT_TYPE_UNSIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_UINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; + break; + } + default: unreachable("bad format type"); + } + } else { + switch (elements[i].src_format) { + case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + default: unreachable("bad other format"); + } + } + + if (desc->channel[0].size == 10) { + fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ + log_hw_load_size = 2; + + /* The hardware always treats the 2-bit alpha channel as + * unsigned, so a shader workaround is needed. The affected + * chips are GFX8 and older except Stoney (GFX8.1). + */ + always_fix = sscreen->info.chip_class <= GFX8 && + sscreen->info.family != CHIP_STONEY && + channel->type == UTIL_FORMAT_TYPE_SIGNED; + } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { + fix_fetch.u.log_size = 3; /* special encoding */ + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + log_hw_load_size = 2; + } else { + fix_fetch.u.log_size = util_logbase2(channel->size) - 3; + fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; + + /* Always fix up: + * - doubles (multiple loads + truncate to float) + * - 32-bit requiring a conversion + */ + always_fix = + (fix_fetch.u.log_size == 3) || + (fix_fetch.u.log_size == 2 && + fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && + fix_fetch.u.format != AC_FETCH_FORMAT_UINT && + fix_fetch.u.format != AC_FETCH_FORMAT_SINT); + + /* Also fixup 8_8_8 and 16_16_16. */ + if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { + always_fix = true; + log_hw_load_size = fix_fetch.u.log_size; + } + } + + if (desc->swizzle[0] != PIPE_SWIZZLE_X) { + assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && + (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); + fix_fetch.u.reverse = 1; + } + + /* Force the workaround for unaligned access here already if the + * offset relative to the vertex buffer base is unaligned. + * + * There is a theoretical case in which this is too conservative: + * if the vertex buffer's offset is also unaligned in just the + * right way, we end up with an aligned address after all. + * However, this case should be extremely rare in practice (it + * won't happen in well-behaved applications), and taking it + * into account would complicate the fast path (where everything + * is nicely aligned). + */ + bool check_alignment = + log_hw_load_size >= 1 && + (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10); + bool opencode = sscreen->options.vs_fetch_always_opencode; + + if (check_alignment && + (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + opencode = true; + + if (always_fix || check_alignment || opencode) + v->fix_fetch[i] = fix_fetch.bits; + + if (opencode) + v->fix_fetch_opencode |= 1 << i; + if (opencode || always_fix) + v->fix_fetch_always |= 1 << i; + + if (check_alignment && !opencode) { + assert(log_hw_load_size == 1 || log_hw_load_size == 2); + + v->fix_fetch_unaligned |= 1 << i; + v->hw_load_is_dword |= (log_hw_load_size - 1) << i; + v->vb_alignment_check_mask |= 1 << vbo_index; + } + + v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = + &gfx10_format_table[elements[i].src_format]; + assert(fmt->img_format != 0 && fmt->img_format < 128); + v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + unsigned data_format, num_format; + data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); + num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); + } + } + + if (v->instance_divisor_is_fetched) { + unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); + + v->instance_divisor_factor_buffer = + (struct si_resource*) + pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, + num_divisors * sizeof(divisor_factors[0])); + if (!v->instance_divisor_factor_buffer) { + FREE(v); + return NULL; + } + void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, + NULL, PIPE_TRANSFER_WRITE); + memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0])); + } + return v; } static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *old = sctx->vertex_elements; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; - - sctx->vertex_elements = v; - sctx->vertex_buffers_dirty = true; - - if (v && - (!old || - old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || - (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || - ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && - memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || - /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are - * functions of fix_fetch and the src_offset alignment. - * If they change and fix_fetch doesn't, it must be due to different - * src_offset alignment, which is reflected in fix_fetch_opencode. */ - old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) - sctx->do_update_shaders = true; - - if (v && v->instance_divisor_is_fetched) { - struct pipe_constant_buffer cb; - - cb.buffer = &v->instance_divisor_factor_buffer->b.b; - cb.user_buffer = NULL; - cb.buffer_offset = 0; - cb.buffer_size = 0xffffffff; - si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *old = sctx->vertex_elements; + struct si_vertex_elements *v = (struct si_vertex_elements*)state; + + sctx->vertex_elements = v; + sctx->num_vertex_elements = v ? v->count : 0; + + if (sctx->num_vertex_elements) { + sctx->vertex_buffers_dirty = true; + } else { + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (v && + (!old || + old->count != v->count || + old->uses_instance_divisors != v->uses_instance_divisors || + /* we don't check which divisors changed */ + v->uses_instance_divisors || + (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || + ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && + memcmp(old->vertex_buffer_index, v->vertex_buffer_index, + sizeof(v->vertex_buffer_index[0]) * v->count)) || + /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are + * functions of fix_fetch and the src_offset alignment. + * If they change and fix_fetch doesn't, it must be due to different + * src_offset alignment, which is reflected in fix_fetch_opencode. */ + old->fix_fetch_opencode != v->fix_fetch_opencode || + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) + sctx->do_update_shaders = true; + + if (v && v->instance_divisor_is_fetched) { + struct pipe_constant_buffer cb; + + cb.buffer = &v->instance_divisor_factor_buffer->b.b; + cb.user_buffer = NULL; + cb.buffer_offset = 0; + cb.buffer_size = 0xffffffff; + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); + } } static void si_delete_vertex_element(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *v = (struct si_vertex_elements*)state; - if (sctx->vertex_elements == state) - sctx->vertex_elements = NULL; - si_resource_reference(&v->instance_divisor_factor_buffer, NULL); - FREE(state); + if (sctx->vertex_elements == state) { + sctx->vertex_elements = NULL; + sctx->num_vertex_elements = 0; + } + si_resource_reference(&v->instance_divisor_factor_buffer, NULL); + FREE(state); } static void si_set_vertex_buffers(struct pipe_context *ctx, - unsigned start_slot, unsigned count, - const struct pipe_vertex_buffer *buffers) + unsigned start_slot, unsigned count, + const struct pipe_vertex_buffer *buffers) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; - uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; - uint32_t unaligned = orig_unaligned; - int i; - - assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); - - if (buffers) { - for (i = 0; i < count; i++) { - const struct pipe_vertex_buffer *src = buffers + i; - struct pipe_vertex_buffer *dsti = dst + i; - struct pipe_resource *buf = src->buffer.resource; - - pipe_resource_reference(&dsti->buffer.resource, buf); - dsti->buffer_offset = src->buffer_offset; - dsti->stride = src->stride; - if (dsti->buffer_offset & 3 || dsti->stride & 3) - unaligned |= 1 << (start_slot + i); - else - unaligned &= ~(1 << (start_slot + i)); - - si_context_add_resource_size(sctx, buf); - if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; - } - } else { - for (i = 0; i < count; i++) { - pipe_resource_reference(&dst[i].buffer.resource, NULL); - } - unaligned &= ~u_bit_consecutive(start_slot, count); - } - sctx->vertex_buffers_dirty = true; - sctx->vertex_buffer_unaligned = unaligned; - - /* Check whether alignment may have changed in a way that requires - * shader changes. This check is conservative: a vertex buffer can only - * trigger a shader change if the misalignment amount changes (e.g. - * from byte-aligned to short-aligned), but we only keep track of - * whether buffers are at least dword-aligned, since that should always - * be the case in well-behaved applications anyway. - */ - if (sctx->vertex_elements && - (sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count))) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + unsigned updated_mask = u_bit_consecutive(start_slot, count); + uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; + uint32_t unaligned = 0; + int i; + + assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); + + if (buffers) { + for (i = 0; i < count; i++) { + const struct pipe_vertex_buffer *src = buffers + i; + struct pipe_vertex_buffer *dsti = dst + i; + struct pipe_resource *buf = src->buffer.resource; + unsigned slot_bit = 1 << (start_slot + i); + + pipe_resource_reference(&dsti->buffer.resource, buf); + dsti->buffer_offset = src->buffer_offset; + dsti->stride = src->stride; + + if (dsti->buffer_offset & 3 || dsti->stride & 3) + unaligned |= slot_bit; + + si_context_add_resource_size(sctx, buf); + if (buf) + si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; + } + } else { + for (i = 0; i < count; i++) { + pipe_resource_reference(&dst[i].buffer.resource, NULL); + } + unaligned &= ~updated_mask; + } + sctx->vertex_buffers_dirty = true; + sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; + + /* Check whether alignment may have changed in a way that requires + * shader changes. This check is conservative: a vertex buffer can only + * trigger a shader change if the misalignment amount changes (e.g. + * from byte-aligned to short-aligned), but we only keep track of + * whether buffers are at least dword-aligned, since that should always + * be the case in well-behaved applications anyway. + */ + if (sctx->vertex_elements && + (sctx->vertex_elements->vb_alignment_check_mask & + (unaligned | orig_unaligned) & updated_mask)) + sctx->do_update_shaders = true; } /* @@ -5166,571 +5181,524 @@ */ static void si_set_tess_state(struct pipe_context *ctx, - const float default_outer_level[4], - const float default_inner_level[2]) + const float default_outer_level[4], + const float default_inner_level[2]) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - float array[8]; - - memcpy(array, default_outer_level, sizeof(float) * 4); - memcpy(array+4, default_inner_level, sizeof(float) * 2); - - cb.buffer = NULL; - cb.user_buffer = NULL; - cb.buffer_size = sizeof(array); - - si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer, - (void*)array, sizeof(array), - &cb.buffer_offset); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + float array[8]; + + memcpy(array, default_outer_level, sizeof(float) * 4); + memcpy(array+4, default_inner_level, sizeof(float) * 2); + + cb.buffer = NULL; + cb.user_buffer = NULL; + cb.buffer_size = sizeof(array); + + si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer, + (void*)array, sizeof(array), + &cb.buffer_offset); - si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); - pipe_resource_reference(&cb.buffer, NULL); + si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - si_update_fb_dirtiness_after_rendering(sctx); + si_update_fb_dirtiness_after_rendering(sctx); - /* Multisample surfaces are flushed in si_decompress_textures. */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } + /* Multisample surfaces are flushed in si_decompress_textures. */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } } /* This only ensures coherency for shader image/buffer stores. */ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (!(flags & ~PIPE_BARRIER_UPDATE)) - return; + if (!(flags & ~PIPE_BARRIER_UPDATE)) + return; - /* Subsequent commands must wait for all shader invocations to - * complete. */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - sctx->flags |= SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - - if (flags & (PIPE_BARRIER_VERTEX_BUFFER | - PIPE_BARRIER_SHADER_BUFFER | - PIPE_BARRIER_TEXTURE | - PIPE_BARRIER_IMAGE | - PIPE_BARRIER_STREAMOUT_BUFFER | - PIPE_BARRIER_GLOBAL_BUFFER)) { - /* As far as I can tell, L1 contents are written back to L2 - * automatically at end of shader, but the contents of other - * L1 caches might still be stale. */ - sctx->flags |= SI_CONTEXT_INV_VCACHE; - } - - if (flags & PIPE_BARRIER_INDEX_BUFFER) { - /* Indices are read through TC L2 since GFX8. - * L1 isn't used. - */ - if (sctx->screen->info.chip_class <= GFX7) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* MSAA color, any depth and any stencil are flushed in - * si_decompress_textures when needed. - */ - if (flags & PIPE_BARRIER_FRAMEBUFFER && - sctx->framebuffer.uncompressed_cb_mask) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - if (sctx->chip_class <= GFX8) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->screen->info.chip_class <= GFX8 && - flags & PIPE_BARRIER_INDIRECT_BUFFER) - sctx->flags |= SI_CONTEXT_WB_L2; + /* Subsequent commands must wait for all shader invocations to + * complete. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) + sctx->flags |= SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE; + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | + PIPE_BARRIER_SHADER_BUFFER | + PIPE_BARRIER_TEXTURE | + PIPE_BARRIER_IMAGE | + PIPE_BARRIER_STREAMOUT_BUFFER | + PIPE_BARRIER_GLOBAL_BUFFER)) { + /* As far as I can tell, L1 contents are written back to L2 + * automatically at end of shader, but the contents of other + * L1 caches might still be stale. */ + sctx->flags |= SI_CONTEXT_INV_VCACHE; + } + + if (flags & PIPE_BARRIER_INDEX_BUFFER) { + /* Indices are read through TC L2 since GFX8. + * L1 isn't used. + */ + if (sctx->screen->info.chip_class <= GFX7) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* MSAA color, any depth and any stencil are flushed in + * si_decompress_textures when needed. + */ + if (flags & PIPE_BARRIER_FRAMEBUFFER && + sctx->framebuffer.uncompressed_cb_mask) { + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + if (sctx->chip_class <= GFX8) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* Indirect buffers use TC L2 on GFX9, but not older hw. */ + if (sctx->screen->info.chip_class <= GFX8 && + flags & PIPE_BARRIER_INDIRECT_BUFFER) + sctx->flags |= SI_CONTEXT_WB_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { - struct pipe_blend_state blend; + struct pipe_blend_state blend; - memset(&blend, 0, sizeof(blend)); - blend.independent_blend_enable = true; - blend.rt[0].colormask = 0xf; - return si_create_blend_state_mode(&sctx->b, &blend, mode); + memset(&blend, 0, sizeof(blend)); + blend.independent_blend_enable = true; + blend.rt[0].colormask = 0xf; + return si_create_blend_state_mode(&sctx->b, &blend, mode); } static void si_init_config(struct si_context *sctx); void si_init_state_compute_functions(struct si_context *sctx) { - sctx->b.create_sampler_state = si_create_sampler_state; - sctx->b.delete_sampler_state = si_delete_sampler_state; - sctx->b.create_sampler_view = si_create_sampler_view; - sctx->b.sampler_view_destroy = si_sampler_view_destroy; - sctx->b.memory_barrier = si_memory_barrier; + sctx->b.create_sampler_state = si_create_sampler_state; + sctx->b.delete_sampler_state = si_delete_sampler_state; + sctx->b.create_sampler_view = si_create_sampler_view; + sctx->b.sampler_view_destroy = si_sampler_view_destroy; + sctx->b.memory_barrier = si_memory_barrier; } void si_init_state_functions(struct si_context *sctx) { - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; - sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; - sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; - sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; - sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; - sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; - sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; - sctx->atoms.s.blend_color.emit = si_emit_blend_color; - sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; - sctx->atoms.s.clip_state.emit = si_emit_clip_state; - sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; - - sctx->b.create_blend_state = si_create_blend_state; - sctx->b.bind_blend_state = si_bind_blend_state; - sctx->b.delete_blend_state = si_delete_blend_state; - sctx->b.set_blend_color = si_set_blend_color; - - sctx->b.create_rasterizer_state = si_create_rs_state; - sctx->b.bind_rasterizer_state = si_bind_rs_state; - sctx->b.delete_rasterizer_state = si_delete_rs_state; - - sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; - sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; - sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; - - sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); - sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); - sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); - sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); - sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); - - sctx->b.set_clip_state = si_set_clip_state; - sctx->b.set_stencil_ref = si_set_stencil_ref; - - sctx->b.set_framebuffer_state = si_set_framebuffer_state; - - sctx->b.set_sample_mask = si_set_sample_mask; - - sctx->b.create_vertex_elements_state = si_create_vertex_elements; - sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; - sctx->b.delete_vertex_elements_state = si_delete_vertex_element; - sctx->b.set_vertex_buffers = si_set_vertex_buffers; - - sctx->b.texture_barrier = si_texture_barrier; - sctx->b.set_min_samples = si_set_min_samples; - sctx->b.set_tess_state = si_set_tess_state; + sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; + sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; + sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; + sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; + sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; + sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; + sctx->atoms.s.blend_color.emit = si_emit_blend_color; + sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; + sctx->atoms.s.clip_state.emit = si_emit_clip_state; + sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; + + sctx->b.create_blend_state = si_create_blend_state; + sctx->b.bind_blend_state = si_bind_blend_state; + sctx->b.delete_blend_state = si_delete_blend_state; + sctx->b.set_blend_color = si_set_blend_color; + + sctx->b.create_rasterizer_state = si_create_rs_state; + sctx->b.bind_rasterizer_state = si_bind_rs_state; + sctx->b.delete_rasterizer_state = si_delete_rs_state; + + sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; + sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; + sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; + + sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); + sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); + sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); + sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); + + sctx->b.set_clip_state = si_set_clip_state; + sctx->b.set_stencil_ref = si_set_stencil_ref; + + sctx->b.set_framebuffer_state = si_set_framebuffer_state; + + sctx->b.set_sample_mask = si_set_sample_mask; + + sctx->b.create_vertex_elements_state = si_create_vertex_elements; + sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; + sctx->b.delete_vertex_elements_state = si_delete_vertex_element; + sctx->b.set_vertex_buffers = si_set_vertex_buffers; + + sctx->b.texture_barrier = si_texture_barrier; + sctx->b.set_min_samples = si_set_min_samples; + sctx->b.set_tess_state = si_set_tess_state; - sctx->b.set_active_query_state = si_set_active_query_state; + sctx->b.set_active_query_state = si_set_active_query_state; - si_init_config(sctx); + si_init_config(sctx); } void si_init_screen_state_functions(struct si_screen *sscreen) { - sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.is_format_supported = si_is_format_supported; - if (sscreen->info.chip_class >= GFX10) { - sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; - } else { - sscreen->make_texture_descriptor = si_make_texture_descriptor; - } + if (sscreen->info.chip_class >= GFX10) { + sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; + } else { + sscreen->make_texture_descriptor = si_make_texture_descriptor; + } } static void si_set_grbm_gfx_index(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned value) + struct si_pm4_state *pm4, unsigned value) { - unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : - R_00802C_GRBM_GFX_INDEX; - si_pm4_set_reg(pm4, reg, value); + unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : + R_00802C_GRBM_GFX_INDEX; + si_pm4_set_reg(pm4, reg, value); } static void si_set_grbm_gfx_index_se(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned se) + struct si_pm4_state *pm4, unsigned se) { - assert(se == ~0 || se < sctx->screen->info.max_se); - si_set_grbm_gfx_index(sctx, pm4, - (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : - S_030800_SE_INDEX(se)) | - S_030800_SH_BROADCAST_WRITES(1) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); + assert(se == ~0 || se < sctx->screen->info.max_se); + si_set_grbm_gfx_index(sctx, pm4, + (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : + S_030800_SE_INDEX(se)) | + S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); } static void si_write_harvested_raster_configs(struct si_context *sctx, - struct si_pm4_state *pm4, - unsigned raster_config, - unsigned raster_config_1) -{ - unsigned num_se = MAX2(sctx->screen->info.max_se, 1); - unsigned raster_config_se[4]; - unsigned se; - - ac_get_harvested_configs(&sctx->screen->info, - raster_config, - &raster_config_1, - raster_config_se); - - for (se = 0; se < num_se; se++) { - si_set_grbm_gfx_index_se(sctx, pm4, se); - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); - } - si_set_grbm_gfx_index(sctx, pm4, ~0); - - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); - } + struct si_pm4_state *pm4, + unsigned raster_config, + unsigned raster_config_1) +{ + unsigned num_se = MAX2(sctx->screen->info.max_se, 1); + unsigned raster_config_se[4]; + unsigned se; + + ac_get_harvested_configs(&sctx->screen->info, + raster_config, + &raster_config_1, + raster_config_se); + + for (se = 0; se < num_se; se++) { + si_set_grbm_gfx_index_se(sctx, pm4, se); + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); + } + si_set_grbm_gfx_index(sctx, pm4, ~0); + + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } } static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4) { - struct si_screen *sscreen = sctx->screen; - unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); - unsigned rb_mask = sscreen->info.enabled_rb_mask; - unsigned raster_config = sscreen->pa_sc_raster_config; - unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; - - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { - /* Always use the default config when all backends are enabled - * (or when we failed to determine the enabled backends). - */ - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, - raster_config); - if (sctx->chip_class >= GFX7) - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, - raster_config_1); - } else { - si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); - } + struct si_screen *sscreen = sctx->screen; + unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); + unsigned rb_mask = sscreen->info.enabled_rb_mask; + unsigned raster_config = sscreen->pa_sc_raster_config; + unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; + + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). + */ + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, + raster_config); + if (sctx->chip_class >= GFX7) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, + raster_config_1); + } else { + si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); + } } static void si_init_config(struct si_context *sctx) { - struct si_screen *sscreen = sctx->screen; - uint64_t border_color_va = sctx->border_color_buffer->gpu_address; - bool has_clear_state = sscreen->has_clear_state; - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - - if (!pm4) - return; - - si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - si_pm4_cmd_end(pm4, false); - - if (has_clear_state) { - si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); - si_pm4_cmd_add(pm4, 0); - si_pm4_cmd_end(pm4, false); - } - - if (sctx->chip_class <= GFX8) - si_set_raster_config(sctx, pm4); - - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); - - /* FIXME calculate these values somehow ??? */ - if (sctx->chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); - si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); - } - - if (sscreen->info.chip_class <= GFX9) - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); - if (sctx->chip_class < GFX7) - si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | - S_008A14_CLIP_VTX_REORDER_ENA(1)); - - /* CLEAR_STATE doesn't clear these correctly on certain generations. - * I don't know why. Deduced by trial and error. - */ - if (sctx->chip_class <= GFX7 || !has_clear_state) { - si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, - S_028244_BR_X(16384) | S_028244_BR_Y(16384)); - si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); - si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, - S_028034_BR_X(16384) | S_028034_BR_Y(16384)); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | - S_028230_ER_POINT(0xA) | - S_028230_ER_RECT(0xA) | - /* Required by DX10_DIAMOND_TEST_ENA: */ - S_028230_ER_LINE_LR(0x1A) | - S_028230_ER_LINE_RL(0x26) | - S_028230_ER_LINE_TB(0xA) | - S_028230_ER_LINE_BT(0xA)); - si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); - si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); - si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); - si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); - si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); - } - - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); - si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); - si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); - } else if (sctx->chip_class == GFX9) { - si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); - } else { - /* These registers, when written, also overwrite the CLEAR_STATE - * context, so we can't rely on CLEAR_STATE setting them. - * It would be an issue if there was another UMD changing them. - */ - si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); - } - - if (sctx->chip_class >= GFX7) { - if (sctx->chip_class >= GFX10) { - /* Logical CUs 16 - 31 */ - si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, - S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, - S_00B104_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, - S_00B004_CU_EN(0xffff)); - } - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); - } else { - si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); - - /* If this is 0, Bonaire can hang even if GS isn't being used. - * Other chips are unaffected. These are suboptimal values, - * but we don't use on-chip GS. - */ - si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(64) | - S_028A44_GS_PRIMS_PER_SUBGRP(4)); - } - - /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_limit; /* The limit is per SH. */ - - if (sctx->family == CHIP_KABINI) { - late_alloc_limit = 0; /* Potential hang on Kabini. */ - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SH. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_limit = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_limit = (num_cu_per_sh - 2) * 4; - } - - unsigned late_alloc_limit_gs = late_alloc_limit; - unsigned cu_mask_vs = 0xffff; - unsigned cu_mask_gs = 0xffff; - - if (late_alloc_limit > 2) { - if (sctx->chip_class >= GFX10) { - /* CU2 & CU3 disabled because of the dual CU design */ - cu_mask_vs = 0xfff3; - cu_mask_gs = 0xfff3; /* NGG only */ - } else { - cu_mask_vs = 0xfffe; /* 1 CU disabled */ - } - } - - /* Don't use late alloc for NGG on Navi14 due to a hw bug. - * If NGG is never used, enable all CUs. - */ - if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) { - late_alloc_limit_gs = 0; - cu_mask_gs = 0xffff; - } - - /* VS can't execute on one CU if the limit is > 2. */ - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask_vs) | - S_00B118_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, - S_00B11C_LIMIT(late_alloc_limit)); - - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); - - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs)); - } - - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); - } - - if (sctx->chip_class >= GFX10) { - /* Break up a pixel wave if it contains deallocs for more than - * half the parameter cache. - * - * To avoid a deadlock where pixel waves aren't launched - * because they're waiting for more pixels while the frontend - * is stuck waiting for PC space, the maximum allowed value is - * the size of the PC minus the largest possible allocation for - * a single primitive shader subgroup. - */ - si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, - S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); - } - - si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, - S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_HTILE_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_HTILE_RD_POLICY(V_02807C_CACHE_NOA_RD)); - - si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, - S_028410_CMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_DCC_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_CMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | - S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | - S_028410_DCC_RD_POLICY(V_028410_CACHE_NOA_RD) | - S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); - si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); - - si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, - S_00B0C0_SOFT_GROUPING_EN(1) | - S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); - si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); - - if (sctx->family == CHIP_NAVI10 || - sctx->family == CHIP_NAVI12 || - sctx->family == CHIP_NAVI14) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); - si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - si_pm4_cmd_end(pm4, false); - } - /* TODO: For culling, replace 128 with 256. */ - si_pm4_set_reg(pm4, R_030980_GE_PC_ALLOC, - S_030980_OVERSUB_EN(1) | - S_030980_NUM_PC_LINES(128 * sscreen->info.max_se - 1)); - } - - if (sctx->chip_class >= GFX8) { - unsigned vgt_tess_distribution; - - vgt_tess_distribution = - S_028B50_ACCUM_ISOLINE(32) | - S_028B50_ACCUM_TRI(11) | - S_028B50_ACCUM_QUAD(11) | - S_028B50_DONUT_SPLIT(16); - - /* Testing with Unigine Heaven extreme tesselation yielded best results - * with TRAP_SPLIT = 3. - */ - if (sctx->family == CHIP_FIJI || - sctx->family >= CHIP_POLARIS10) - vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); - - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); - } else if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); - } - - si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, - S_028084_ADDRESS(border_color_va >> 40)); - } - si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, - RADEON_PRIO_BORDER_COLORS); - - if (sctx->chip_class >= GFX9) { - unsigned num_se = sscreen->info.max_se; - unsigned pc_lines = 0; - unsigned max_alloc_count = 0; - - switch (sctx->family) { - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - pc_lines = 2048; - break; - case CHIP_RAVEN: - case CHIP_RAVEN2: - case CHIP_RENOIR: - case CHIP_NAVI10: - case CHIP_NAVI12: - pc_lines = 1024; - break; - case CHIP_NAVI14: - pc_lines = 512; - break; - default: - assert(0); - } - - if (sctx->chip_class >= GFX10) { - max_alloc_count = pc_lines / 3; - } else { - max_alloc_count = MIN2(128, pc_lines / (4 * num_se)); - } - - si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, - S_028C48_MAX_ALLOC_COUNT(max_alloc_count - 1) | - S_028C48_MAX_PRIM_PER_BATCH(1023)); - si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, - S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); - si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); - } + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + bool has_clear_state = sscreen->info.has_clear_state; + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + + if (!pm4) + return; + + si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + si_pm4_cmd_end(pm4, false); + + if (has_clear_state) { + si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); + si_pm4_cmd_add(pm4, 0); + si_pm4_cmd_end(pm4, false); + } + + if (sctx->chip_class <= GFX8) + si_set_raster_config(sctx, pm4); + + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); + + /* FIXME calculate these values somehow ??? */ + if (sctx->chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); + si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); + si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + } + + if (sscreen->info.chip_class <= GFX9) + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); + if (sctx->chip_class < GFX7) + si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | + S_008A14_CLIP_VTX_REORDER_ENA(1)); + + /* CLEAR_STATE doesn't restore these correctly. */ + si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(16384) | S_028244_BR_Y(16384)); + + /* CLEAR_STATE doesn't clear these correctly on certain generations. + * I don't know why. Deduced by trial and error. + */ + if (sctx->chip_class <= GFX7 || !has_clear_state) { + si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, + S_028034_BR_X(16384) | S_028034_BR_Y(16384)); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, + S_028230_ER_TRI(0xA) | + S_028230_ER_POINT(0xA) | + S_028230_ER_RECT(0xA) | + /* Required by DX10_DIAMOND_TEST_ENA: */ + S_028230_ER_LINE_LR(0x1A) | + S_028230_ER_LINE_RL(0x26) | + S_028230_ER_LINE_TB(0xA) | + S_028230_ER_LINE_BT(0xA)); + si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); + si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); + si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); + si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); + si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); + } + + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + } else if (sctx->chip_class == GFX9) { + si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); + } else { + /* These registers, when written, also overwrite the CLEAR_STATE + * context, so we can't rely on CLEAR_STATE setting them. + * It would be an issue if there was another UMD changing them. + */ + si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); + } + + if (sctx->chip_class >= GFX7) { + if (sctx->chip_class >= GFX10) { + /* Logical CUs 16 - 31 */ + si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, + S_00B404_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, + S_00B104_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, + S_00B004_CU_EN(0xffff)); + } + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + } else { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, + S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + S_00B41C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, + S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); + + /* If this is 0, Bonaire can hang even if GS isn't being used. + * Other chips are unaffected. These are suboptimal values, + * but we don't use on-chip GS. + */ + si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(64) | + S_028A44_GS_PRIMS_PER_SUBGRP(4)); + } + + /* Compute LATE_ALLOC_VS.LIMIT. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ + unsigned cu_mask_vs = 0xffff; + unsigned cu_mask_gs = 0xffff; + + if (sctx->chip_class >= GFX10) { + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + */ + if (num_cu_per_sh <= 6) { + late_alloc_wave64 = num_cu_per_sh - 2; + } else { + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* CU2 & CU3 disabled because of the dual CU design */ + /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ + cu_mask_vs = 0xfff3; + cu_mask_gs = sscreen->use_ngg && + sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; + } + } else { + if (sctx->family == CHIP_KABINI) { + late_alloc_wave64 = 0; /* Potential hang on Kabini. */ + } else if (num_cu_per_sh <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on one CU could hurt us more than late VS + * allocation would help. + * + * 2 is the highest safe number that allows us to keep + * all CUs enabled. + */ + late_alloc_wave64 = 2; + } else { + /* This is a good initial value, allowing 1 late_alloc + * wave per SIMD on num_cu - 2. + */ + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + } + + if (late_alloc_wave64 > 2) + cu_mask_vs = 0xfffe; /* 1 CU disabled */ + } + + /* VS can't execute on one CU if the limit is > 2. */ + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask_vs) | + S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, + S_00B11C_LIMIT(late_alloc_wave64)); + + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); + + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); + } + + if (sctx->chip_class >= GFX10) { + /* Break up a pixel wave if it contains deallocs for more than + * half the parameter cache. + * + * To avoid a deadlock where pixel waves aren't launched + * because they're waiting for more pixels while the frontend + * is stuck waiting for PC space, the maximum allowed value is + * the size of the PC minus the largest possible allocation for + * a single primitive shader subgroup. + */ + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, + S_028C50_MAX_DEALLOCS_IN_WAVE(512)); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } + + si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, + S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_HTILE_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_HTILE_RD_POLICY(V_02807C_CACHE_NOA_RD)); + + si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_CMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_DCC_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_CMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_DCC_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + } + + if (sctx->chip_class >= GFX8) { + unsigned vgt_tess_distribution; + + vgt_tess_distribution = + S_028B50_ACCUM_ISOLINE(32) | + S_028B50_ACCUM_TRI(11) | + S_028B50_ACCUM_QUAD(11) | + S_028B50_DONUT_SPLIT(16); + + /* Testing with Unigine Heaven extreme tesselation yielded best results + * with TRAP_SPLIT = 3. + */ + if (sctx->family == CHIP_FIJI || + sctx->family >= CHIP_POLARIS10) + vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); + + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); + } else if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); + } + + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, + S_028084_ADDRESS(border_color_va >> 40)); + } + si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, + RADEON_PRIO_BORDER_COLORS); + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, + S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | + S_028C48_MAX_PRIM_PER_BATCH(1023)); + si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, + S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + } - si_pm4_upload_indirect_buffer(sctx, pm4); - sctx->init_config = pm4; + si_pm4_upload_indirect_buffer(sctx, pm4); + sctx->init_config = pm4; } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_draw.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_draw.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -175,7 +175,7 @@ /* When distributed tessellation is unsupported, switch between SEs * at a higher frequency to compensate for it. */ - if (!sctx->screen->has_distributed_tess && sctx->screen->info.max_se > 1) + if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) *num_patches = MIN2(*num_patches, 16); /* recommended */ /* Make sure that vector lanes are reasonably occupied. It probably @@ -363,7 +363,7 @@ partial_vs_wave = true; /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */ - if (sscreen->has_distributed_tess) { + if (sscreen->info.has_distributed_tess) { if (key->u.uses_gs) { if (sscreen->info.chip_class == GFX8) partial_es_wave = true; @@ -501,6 +501,16 @@ } } +static bool si_is_line_stipple_enabled(struct si_context *sctx) +{ + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + return rs->line_stipple_enable && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + (rs->polygon_mode_is_lines || + util_prim_is_lines(sctx->current_rast_prim)); +} + static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, enum pipe_prim_type prim, @@ -529,6 +539,7 @@ si_num_prims_for_vertices(info, prim) < primgroup_size)); key.u.primitive_restart = primitive_restart; key.u.count_from_stream_output = info->count_from_stream_output != NULL; + key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); @@ -586,46 +597,37 @@ struct radeon_cmdbuf *cs = sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - bool use_ngg = sctx->screen->use_ngg; - - if (likely(rast_prim == sctx->last_rast_prim && - rs->pa_sc_line_stipple == sctx->last_sc_line_stipple && - (!use_ngg || - rs->flatshade_first == sctx->last_flatshade_first))) - return; + unsigned initial_cdw = cs->current.cdw; - if (util_prim_is_lines(rast_prim)) { + if (unlikely(si_is_line_stipple_enabled(sctx))) { /* For lines, reset the stipple pattern at each primitive. Otherwise, * reset the stipple pattern at each packet (line strips, line loops). */ - radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE, - rs->pa_sc_line_stipple | - S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2)); - sctx->context_roll = true; + unsigned value = rs->pa_sc_line_stipple | + S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2); + + radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, + SI_TRACKED_PA_SC_LINE_STIPPLE, value); } - unsigned gs_out = si_conv_prim_to_gs_out(sctx->current_rast_prim); + unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); + if (unlikely(gs_out_prim != sctx->last_gs_out_prim && + (sctx->ngg || sctx->gs_shader.cso))) { + radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); + sctx->last_gs_out_prim = gs_out_prim; + } - if (rast_prim != sctx->last_rast_prim && - (sctx->ngg || sctx->gs_shader.cso)) { - radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out); + if (initial_cdw != cs->current.cdw) sctx->context_roll = true; - if (use_ngg) { - sctx->current_vs_state &= C_VS_STATE_OUTPRIM; - sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out); - } - } + if (sctx->ngg) { + unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim; - if (use_ngg) { - unsigned vtx_index = rs->flatshade_first ? 0 : gs_out; - sctx->current_vs_state &= C_VS_STATE_PROVOKING_VTX_INDEX; - sctx->current_vs_state |= S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); + sctx->current_vs_state &= C_VS_STATE_OUTPRIM & + C_VS_STATE_PROVOKING_VTX_INDEX; + sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim) | + S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); } - - sctx->last_rast_prim = rast_prim; - sctx->last_sc_line_stipple = rs->pa_sc_line_stipple; - sctx->last_flatshade_first = rs->flatshade_first; } static void si_emit_vs_state(struct si_context *sctx, @@ -724,25 +726,22 @@ if (sctx->ngg) { if (sctx->tes_shader.cso) { ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) | - S_03096C_VERT_GRP_SIZE(0) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id); } else { ge_cntl = si_get_vs_state(sctx)->ge_cntl; } } else { unsigned primgroup_size; - unsigned vertgroup_size; + unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */; if (sctx->tes_shader.cso) { primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ - vertgroup_size = 0; } else if (sctx->gs_shader.cso) { unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl; primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); - vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl); } else { primgroup_size = 128; /* recommended without a GS and tess */ - vertgroup_size = 0; } ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | @@ -750,7 +749,7 @@ S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id); } - ge_cntl |= S_03096C_PACKET_TO_ONE_PA(key.u.line_stipple_enabled); + ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); if (ge_cntl != sctx->last_multi_vgt_param) { radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); @@ -1790,7 +1789,9 @@ return; } - if (unlikely(!sctx->vs_shader.cso || + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (unlikely(!vs || + sctx->num_vertex_elements < vs->num_vs_inputs || (!sctx->ps_shader.cso && !rs->rasterizer_discard) || (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) { assert(0); @@ -1845,7 +1846,7 @@ } if (sctx->tes_shader.cso && - sctx->screen->has_ls_vgpr_init_bug) { + sctx->screen->info.has_ls_vgpr_init_bug) { /* Determine whether the LS VGPR fix should be applied. * * It is only required when num input CPs > num output CPs, @@ -1996,6 +1997,7 @@ (!sctx->tes_shader.cso || pd_msg("uses tess")) && (!sctx->gs_shader.cso || pd_msg("uses GS")) && (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && + !rs->polygon_mode_enabled && #if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && @@ -2036,6 +2038,61 @@ sctx->do_update_shaders = true; } + /* Update NGG culling settings. */ + if (sctx->ngg && + rast_prim == PIPE_PRIM_TRIANGLES && + (sctx->screen->always_use_ngg_culling || + /* At least 1024 non-indexed vertices (8 subgroups) are needed + * per draw call (no TES/GS) to enable NGG culling. + */ + (!index_size && direct_count >= 1024 && + (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && + !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && + si_get_vs(sctx)->cso->ngg_culling_allowed) { + unsigned ngg_culling = 0; + + if (rs->rasterizer_discard) { + ngg_culling |= SI_NGG_CULL_FRONT_FACE | + SI_NGG_CULL_BACK_FACE; + } else { + /* Polygon mode can't use view and small primitive culling, + * because it draws points or lines where the culling depends + * on the point or line width. + */ + if (!rs->polygon_mode_enabled) + ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS; + + if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front) + ngg_culling |= SI_NGG_CULL_FRONT_FACE; + if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) + ngg_culling |= SI_NGG_CULL_BACK_FACE; + } + + /* Use NGG fast launch for certain non-indexed primitive types. + * A draw must have at least 1 full primitive. + */ + if (ngg_culling && !index_size && direct_count >= 3 && + !sctx->tes_shader.cso && !sctx->gs_shader.cso) { + if (prim == PIPE_PRIM_TRIANGLES) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; + else if (prim == PIPE_PRIM_TRIANGLE_STRIP) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; + } + + if (ngg_culling != sctx->ngg_culling) { + /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. + * See issues #2418, #2426, #2434 + */ + if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + sctx->ngg_culling = ngg_culling; + sctx->do_update_shaders = true; + } + } else if (sctx->ngg_culling) { + sctx->ngg_culling = false; + sctx->do_update_shaders = true; + } + if (sctx->do_update_shaders && !si_update_shaders(sctx)) goto return_cleanup; @@ -2055,10 +2112,9 @@ * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR * registers must be written too. */ - bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug; unsigned masked_atoms = 0; - if (has_gfx9_scissor_bug) { + if (sctx->screen->info.has_gfx9_scissor_bug) { masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); if (info->count_from_stream_output || @@ -2092,7 +2148,7 @@ if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) sctx->atoms.s.render_cond.emit(sctx); - if (has_gfx9_scissor_bug && + if (sctx->screen->info.has_gfx9_scissor_bug && (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) sctx->atoms.s.scissors.emit(sctx); @@ -2126,7 +2182,7 @@ si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); - if (has_gfx9_scissor_bug && + if (sctx->screen->info.has_gfx9_scissor_bug && (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) sctx->atoms.s.scissors.emit(sctx); @@ -2143,6 +2199,20 @@ cik_emit_prefetch_L2(sctx, false); } + /* Mark the displayable dcc buffer as dirty in order to update + * it on the next call to si_flush_resource. */ + if (sctx->screen->info.use_display_dcc_with_retile_blit) { + /* Don't use si_update_fb_dirtiness_after_rendering because it'll + * cause unnecessary texture decompressions on each draw. */ + unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask; + while (displayable_dcc_cb_mask) { + unsigned i = u_bit_scan(&displayable_dcc_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture*) surf->texture; + tex->displayable_dcc_dirty = true; + } + } + /* Clear the context roll flag after the draw call. */ sctx->context_roll = false; @@ -2219,6 +2289,7 @@ /* Don't set per-stage shader pointers for VS. */ sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; si_draw_vbo(pipe, &info); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state.h mesa-20.0.8/src/gallium/drivers/radeonsi/si_state.h --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -37,6 +37,7 @@ #define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ #define SI_NUM_CONST_BUFFERS 16 #define SI_NUM_IMAGES 16 +#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ #define SI_NUM_SHADER_BUFFERS 16 struct si_screen; @@ -93,6 +94,8 @@ unsigned cull_back:1; unsigned depth_clamp_any:1; unsigned provoking_vertex_first:1; + unsigned polygon_mode_enabled:1; + unsigned polygon_mode_is_lines:1; }; struct si_dsa_stencil_ref_part { @@ -170,7 +173,7 @@ uint16_t first_vb_use_mask; /* Vertex buffer descriptor list size aligned for optimal prefetch. */ - uint16_t desc_list_byte_size; + uint16_t vb_desc_list_alloc_size; uint16_t instance_divisor_is_one; /* bitmask of inputs */ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ }; @@ -308,6 +311,8 @@ SI_TRACKED_PA_SC_CLIPRECT_RULE, + SI_TRACKED_PA_SC_LINE_STIPPLE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */ @@ -350,6 +355,8 @@ SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_GE_PC_ALLOC, + SI_NUM_TRACKED_REGS, }; @@ -582,13 +589,17 @@ void si_emit_dpbb_state(struct si_context *sctx); /* si_state_shaders.c */ -void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es); -bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary, +void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, + unsigned char ir_sha1_cache_key[20]); +bool si_shader_cache_load_shader(struct si_screen *sscreen, + unsigned char ir_sha1_cache_key[20], struct si_shader *shader); -bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary, +void si_shader_cache_insert_shader(struct si_screen *sscreen, + unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_update_shaders(struct si_context *sctx); +void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); void si_destroy_shader_cache(struct si_screen *sscreen); @@ -596,7 +607,7 @@ struct util_queue_fence *ready_fence, struct si_compiler_ctx_state *compiler_ctx_state, void *job, util_queue_execute_func execute); -void si_get_active_slot_masks(const struct tgsi_shader_info *info, +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, uint64_t *samplers_and_images); int si_shader_select_with_key(struct si_screen *sscreen, @@ -647,14 +658,16 @@ static inline unsigned si_get_sampler_slot(unsigned slot) { - /* samplers are in slots [8..39], ascending */ - return SI_NUM_IMAGES / 2 + slot; + /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ + /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ + return SI_NUM_IMAGE_SLOTS / 2 + slot; } static inline unsigned si_get_image_slot(unsigned slot) { - /* images are in slots [15..0] (sampler slots [7..0]), descending */ - return SI_NUM_IMAGES - 1 - slot; + /* image slots are in [31..0] (sampler slots [15..0]), descending */ + /* images are in slots [31..16], while FMASKs are in slots [15..0] */ + return SI_NUM_IMAGE_SLOTS - 1 - slot; } #endif diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_shaders.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_shaders.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_shaders.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_shaders.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,6 @@ #include "compiler/nir/nir_serialize.h" #include "nir/tgsi_to_nir.h" -#include "tgsi/tgsi_parse.h" #include "util/hash_table.h" #include "util/crc32.h" #include "util/u_async_debug.h" @@ -42,24 +41,23 @@ /* SHADER_CACHE */ /** - * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its - * size as integer. + * Return the IR key for the shader cache. */ -void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es) +void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, + unsigned char ir_sha1_cache_key[20]) { - struct blob blob; + struct blob blob = {}; unsigned ir_size; void *ir_binary; - if (sel->tokens) { - ir_binary = sel->tokens; - ir_size = tgsi_num_tokens(sel->tokens) * - sizeof(struct tgsi_token); + if (sel->nir_binary) { + ir_binary = sel->nir_binary; + ir_size = sel->nir_size; } else { assert(sel->nir); blob_init(&blob); - nir_serialize(&blob, sel->nir); + nir_serialize(&blob, sel->nir, true); ir_binary = blob.data; ir_size = blob.size; } @@ -78,20 +76,18 @@ if (sel->force_correct_derivs_after_kill) shader_variant_flags |= 1 << 3; - unsigned size = 4 + 4 + ir_size + sizeof(sel->so); - char *result = (char*)MALLOC(size); - if (!result) - return NULL; - - ((uint32_t*)result)[0] = size; - ((uint32_t*)result)[1] = shader_variant_flags; - memcpy(result + 8, ir_binary, ir_size); - memcpy(result + 8 + ir_size, &sel->so, sizeof(sel->so)); + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + _mesa_sha1_update(&ctx, &shader_variant_flags, 4); + _mesa_sha1_update(&ctx, ir_binary, ir_size); + if (sel->type == PIPE_SHADER_VERTEX || + sel->type == PIPE_SHADER_TESS_EVAL || + sel->type == PIPE_SHADER_GEOMETRY) + _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); + _mesa_sha1_final(&ctx, ir_sha1_cache_key); - if (sel->nir) + if (ir_binary == blob.data) blob_finish(&blob); - - return result; } /** Copy "data" to "ptr" and return the next dword following copied data. */ @@ -208,10 +204,9 @@ /** * Insert a shader into the cache. It's assumed the shader is not in the cache. * Use si_shader_cache_load_shader before calling this. - * - * Returns false on failure, in which case the ir_binary should be freed. */ -bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary, +void si_shader_cache_insert_shader(struct si_screen *sscreen, + unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache) { @@ -219,104 +214,88 @@ struct hash_entry *entry; uint8_t key[CACHE_KEY_SIZE]; - entry = _mesa_hash_table_search(sscreen->shader_cache, ir_binary); + entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); if (entry) - return false; /* already added */ + return; /* already added */ hw_binary = si_get_shader_binary(shader); if (!hw_binary) - return false; + return; - if (_mesa_hash_table_insert(sscreen->shader_cache, ir_binary, + if (_mesa_hash_table_insert(sscreen->shader_cache, + mem_dup(ir_sha1_cache_key, 20), hw_binary) == NULL) { FREE(hw_binary); - return false; + return; } if (sscreen->disk_shader_cache && insert_into_disk_cache) { - disk_cache_compute_key(sscreen->disk_shader_cache, ir_binary, - *((uint32_t *)ir_binary), key); + disk_cache_compute_key(sscreen->disk_shader_cache, + ir_sha1_cache_key, 20, key); disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *) hw_binary), NULL); } - - return true; } -bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary, +bool si_shader_cache_load_shader(struct si_screen *sscreen, + unsigned char ir_sha1_cache_key[20], struct si_shader *shader) { struct hash_entry *entry = - _mesa_hash_table_search(sscreen->shader_cache, ir_binary); - if (!entry) { - if (sscreen->disk_shader_cache) { - unsigned char sha1[CACHE_KEY_SIZE]; - size_t tg_size = *((uint32_t *) ir_binary); - - disk_cache_compute_key(sscreen->disk_shader_cache, - ir_binary, tg_size, sha1); - - size_t binary_size; - uint8_t *buffer = - disk_cache_get(sscreen->disk_shader_cache, - sha1, &binary_size); - if (!buffer) - return false; - - if (binary_size < sizeof(uint32_t) || - *((uint32_t*)buffer) != binary_size) { - /* Something has gone wrong discard the item - * from the cache and rebuild/link from - * source. - */ - assert(!"Invalid radeonsi shader disk cache " - "item!"); + _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); - disk_cache_remove(sscreen->disk_shader_cache, - sha1); - free(buffer); + if (entry) { + if (si_load_shader_binary(shader, entry->data)) { + p_atomic_inc(&sscreen->num_memory_shader_cache_hits); + return true; + } + } + p_atomic_inc(&sscreen->num_memory_shader_cache_misses); - return false; - } + if (!sscreen->disk_shader_cache) + return false; - if (!si_load_shader_binary(shader, buffer)) { + unsigned char sha1[CACHE_KEY_SIZE]; + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, + 20, sha1); + + size_t binary_size; + uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, + &binary_size); + if (buffer) { + if (binary_size >= sizeof(uint32_t) && + *((uint32_t*)buffer) == binary_size) { + if (si_load_shader_binary(shader, buffer)) { free(buffer); - return false; + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, + shader, false); + p_atomic_inc(&sscreen->num_disk_shader_cache_hits); + return true; } - free(buffer); - - if (!si_shader_cache_insert_shader(sscreen, ir_binary, - shader, false)) - FREE(ir_binary); } else { - return false; + /* Something has gone wrong discard the item from the cache and + * rebuild/link from source. + */ + assert(!"Invalid radeonsi shader disk cache item!"); + disk_cache_remove(sscreen->disk_shader_cache, sha1); } - } else { - if (si_load_shader_binary(shader, entry->data)) - FREE(ir_binary); - else - return false; } - p_atomic_inc(&sscreen->num_shader_cache_hits); - return true; + + free(buffer); + p_atomic_inc(&sscreen->num_disk_shader_cache_misses); + return false; } static uint32_t si_shader_cache_key_hash(const void *key) { - /* The first dword is the key size. */ - return util_hash_crc32(key, *(uint32_t*)key); + /* Take the first dword of SHA1. */ + return *(uint32_t*)key; } static bool si_shader_cache_key_equals(const void *a, const void *b) { - uint32_t *keya = (uint32_t*)a; - uint32_t *keyb = (uint32_t*)b; - - /* The first dword is the key size. */ - if (*keya != *keyb) - return false; - - return memcmp(keya, keyb, *keya) == 0; + /* Compare SHA1s. */ + return memcmp(a, b, 20) == 0; } static void si_destroy_shader_cache_entry(struct hash_entry *entry) @@ -327,7 +306,7 @@ bool si_init_shader_cache(struct si_screen *sscreen) { - (void) mtx_init(&sscreen->shader_cache_mutex, mtx_plain); + (void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); sscreen->shader_cache = _mesa_hash_table_create(NULL, si_shader_cache_key_hash, @@ -341,7 +320,7 @@ if (sscreen->shader_cache) _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry); - mtx_destroy(&sscreen->shader_cache_mutex); + simple_mtx_destroy(&sscreen->shader_cache_mutex); } /* SHADER STATES */ @@ -350,7 +329,7 @@ const struct si_shader_selector *tes, struct si_pm4_state *pm4) { - const struct tgsi_shader_info *info = &tes->info; + const struct si_shader_info *info = &tes->info; unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; @@ -397,7 +376,7 @@ else topology = V_028B6C_OUTPUT_TRIANGLE_CW; - if (sscreen->has_distributed_tess) { + if (sscreen->info.has_distributed_tess) { if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10) distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; @@ -472,8 +451,19 @@ } } -static unsigned si_get_num_vs_user_sgprs(unsigned num_always_on_user_sgprs) +static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, + unsigned num_always_on_user_sgprs) { + struct si_shader_selector *vs = shader->previous_stage_sel ? + shader->previous_stage_sel : shader->selector; + unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + + /* 1 SGPR is reserved for the vertex buffer pointer. */ + assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); + + if (num_vbos_in_user_sgprs) + return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; + /* Add the pointer to VBO descriptors. */ return num_always_on_user_sgprs + 1; } @@ -525,7 +515,7 @@ S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode); - shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR)) | + shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } @@ -551,7 +541,7 @@ } unsigned num_user_sgprs = - si_get_num_vs_user_sgprs(GFX9_TCS_NUM_USER_SGPR); + si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) | @@ -635,7 +625,7 @@ if (shader->selector->type == PIPE_SHADER_VERTEX) { vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; num_user_sgprs = SI_TES_NUM_USER_SGPR; @@ -902,7 +892,7 @@ unsigned num_user_sgprs; if (es_type == PIPE_SHADER_VERTEX) - num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); else num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; @@ -938,6 +928,12 @@ si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); + } + shader->ctx_reg.gs.vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | @@ -967,6 +963,29 @@ } } +static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) +{ + enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (sctx->family == CHIP_NAVI10 || + sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) { + /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + } + + radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); + + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } +} + /* Common tail code for NGG primitive shaders. */ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader, @@ -1011,6 +1030,9 @@ if (initial_cdw != sctx->gfx_cs->current.cdw) sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) @@ -1109,11 +1131,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) { const struct si_shader_selector *gs_sel = shader->selector; - const struct tgsi_shader_info *gs_info = &gs_sel->info; + const struct si_shader_info *gs_info = &gs_sel->info; enum pipe_shader_type gs_type = shader->selector->type; const struct si_shader_selector *es_sel = shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; - const struct tgsi_shader_info *es_info = &es_sel->info; + const struct si_shader_info *es_info = &es_sel->info; enum pipe_shader_type es_type = es_sel->type; unsigned num_user_sgprs; unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; @@ -1146,7 +1168,7 @@ num_user_sgprs = SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; } else { - num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); } } else { assert(es_type == PIPE_SHADER_TESS_EVAL); @@ -1164,11 +1186,13 @@ * pass edge flags for decomposed primitives (such as quads) to the PA * for the GL_LINE polygon mode to skip rendering lines on inner edges. */ - if (gs_info->uses_invocationid || gs_type == PIPE_SHADER_VERTEX) + if (gs_info->uses_invocationid || + (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ - else if (gs_info->uses_primid) + else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) || + (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - else if (input_prim >= PIPE_PRIM_TRIANGLES) + else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ else gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ @@ -1191,6 +1215,34 @@ S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | S_00B22C_LDS_SIZE(shader->config.lds_size)); + /* Determine LATE_ALLOC_GS. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64; /* The limit is per SH. */ + + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + * + * Don't use late alloc for NGG on Navi14 due to a hw bug. + */ + if (sscreen->info.family == CHIP_NAVI14) + late_alloc_wave64 = 0; + else if (num_cu_per_sh <= 6) + late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ + else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + late_alloc_wave64 = (num_cu_per_sh - 2) * 6; + else + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */ + if (sscreen->info.family == CHIP_NAVI10 || + sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) + late_alloc_wave64 = MIN2(late_alloc_wave64, 64); + + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + nparams = MAX2(shader->info.nr_param_exports, 1); shader->ctx_reg.ngg.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1) | @@ -1212,7 +1264,8 @@ shader->ctx_reg.ngg.vgt_primitiveid_en = S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | - S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id); + S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id || + gs_sel->info.writes_primid); if (gs_type == PIPE_SHADER_GEOMETRY) { shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; @@ -1248,26 +1301,53 @@ S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true); - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + /* Oversubscribe PC. This improves performance when there are too many varyings. */ + float oversub_pc_factor = 0.25; - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 - */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + if (shader->key.opt.ngg_culling) { + /* Be more aggressive with NGG culling. */ + if (shader->info.nr_param_exports > 4) + oversub_pc_factor = 1; + else if (shader->info.nr_param_exports > 2) + oversub_pc_factor = 0.75; + else + oversub_pc_factor = 0.5; + } + + unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1); + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); + } else { + shader->ge_cntl = + S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + + /* Bug workaround for a possible hang with non-tessellation cases. + * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 + * + * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + */ + if ((sscreen->info.family == CHIP_NAVI10 || + sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256) { + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + + if (shader->ngg.hw_max_esverts > 5) { + shader->ge_cntl |= + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + } } } @@ -1326,9 +1406,6 @@ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - /* Required programming for tessellation. (legacy pipeline only) */ if (sctx->chip_class == GFX10 && shader->selector->type == PIPE_SHADER_TESS_EVAL) { @@ -1345,6 +1422,13 @@ shader->pa_cl_vs_out_cntl, SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); } + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ + if (sctx->chip_class >= GFX10) + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); } /** @@ -1357,7 +1441,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, struct si_shader_selector *gs) { - const struct tgsi_shader_info *info = &shader->selector->info; + const struct si_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned num_user_sgprs, vgpr_comp_cnt; uint64_t va; @@ -1413,7 +1497,7 @@ num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; } else { - num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); } } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = enable_prim_id ? 3 : 2; @@ -1441,6 +1525,8 @@ S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP : V_02870C_SPI_SHADER_NONE); + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(1) | + S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false); oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; @@ -1458,6 +1544,11 @@ S_00B12C_OC_LDS_EN(oc_lds_en) | S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + if (sscreen->info.chip_class >= GFX10) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else if (sscreen->info.chip_class == GFX9) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + if (sscreen->info.chip_class <= GFX9) rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); @@ -1490,7 +1581,7 @@ static unsigned si_get_ps_num_interp(struct si_shader *ps) { - struct tgsi_shader_info *info = &ps->selector->info; + struct si_shader_info *info = &ps->selector->info; unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); unsigned num_interp = ps->selector->info.num_inputs + @@ -1552,7 +1643,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) { - struct tgsi_shader_info *info = &shader->selector->info; + struct si_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); @@ -1836,6 +1927,7 @@ uint64_t linked = outputs_written & inputs_read; key->opt.kill_outputs = ~linked & outputs_written; + key->opt.ngg_culling = sctx->ngg_culling; } /* Compute the key for the hw shader variant */ @@ -1889,7 +1981,7 @@ key->part.tcs.epilog.prim_mode = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->tcs_info.tessfactors_are_def_in_all_invocs; + sel->info.tessfactors_are_def_in_all_invocs; key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors; @@ -2105,7 +2197,10 @@ compiler = shader->compiler_ctx_state.compiler; } - if (unlikely(!si_shader_create(sscreen, compiler, shader, debug))) { + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->type); shader->compilation_failed = true; @@ -2159,8 +2254,8 @@ main_part->key.as_ngg = key->as_ngg; main_part->is_monolithic = false; - if (si_compile_tgsi_shader(sscreen, compiler_state->compiler, - main_part, &compiler_state->debug) != 0) { + if (!si_compile_shader(sscreen, compiler_state->compiler, + main_part, &compiler_state->debug)) { FREE(main_part); return false; } @@ -2221,14 +2316,14 @@ if (thread_index < 0) util_queue_fence_wait(&sel->ready); - mtx_lock(&sel->mutex); + simple_mtx_lock(&sel->mutex); /* Find the shader variant. */ for (iter = sel->first_variant; iter; iter = iter->next_variant) { /* Don't check the "current" shader. We checked it above. */ if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) { - mtx_unlock(&sel->mutex); + simple_mtx_unlock(&sel->mutex); if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { /* If it's an optimized shader and its compilation has @@ -2257,7 +2352,7 @@ /* Build a new shader. */ shader = CALLOC_STRUCT(si_shader); if (!shader) { - mtx_unlock(&sel->mutex); + simple_mtx_unlock(&sel->mutex); return -ENOMEM; } @@ -2314,11 +2409,11 @@ assert(0); } - mtx_lock(&previous_stage_sel->mutex); + simple_mtx_lock(&previous_stage_sel->mutex); ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); - mtx_unlock(&previous_stage_sel->mutex); + simple_mtx_unlock(&previous_stage_sel->mutex); } if (ok) { @@ -2328,7 +2423,7 @@ if (!ok) { FREE(shader); - mtx_unlock(&sel->mutex); + simple_mtx_unlock(&sel->mutex); return -ENOMEM; /* skip the draw call */ } } @@ -2358,7 +2453,8 @@ /* Compile it asynchronously. */ util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready, - si_build_shader_variant_low_priority, NULL); + si_build_shader_variant_low_priority, NULL, + 0); /* Add only after the ready fence was reset, to guard against a * race with si_bind_XX_shader. */ @@ -2372,7 +2468,7 @@ /* Use the default (unoptimized) shader for now. */ memset(&key->opt, 0, sizeof(key->opt)); - mtx_unlock(&sel->mutex); + simple_mtx_unlock(&sel->mutex); if (sscreen->options.sync_compile) util_queue_fence_wait(&shader->ready); @@ -2393,7 +2489,7 @@ sel->last_variant = shader; } - mtx_unlock(&sel->mutex); + simple_mtx_unlock(&sel->mutex); assert(!shader->is_optimized); si_build_shader_variant(shader, thread_index, false); @@ -2419,7 +2515,7 @@ &key, -1, false); } -static void si_parse_next_shader_property(const struct tgsi_shader_info *info, +static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, struct si_shader_key *key) { @@ -2471,8 +2567,25 @@ assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (sel->nir) - si_lower_nir(sel); + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + /* Serialize NIR to save memory. Monolithic shader variants + * have to deserialize NIR before compilation. + */ + if (sel->nir) { + struct blob blob; + size_t size; + + blob_init(&blob); + /* true = remove optional debugging data to increase + * the likehood of getting more shader cache hits. + * It also drops variable names, so we'll save more memory. + */ + nir_serialize(&blob, sel->nir, true); + blob_finish_get_buffer(&blob, &sel->nir_binary, &size); + sel->nir_size = size; + } /* Compile the main shader part for use with a prolog and/or epilog. * If this fails, the driver will try to compile a monolithic shader @@ -2480,7 +2593,7 @@ */ if (!sscreen->use_monolithic_shaders) { struct si_shader *shader = CALLOC_STRUCT(si_shader); - void *ir_binary = NULL; + unsigned char ir_sha1_cache_key[20]; if (!shader) { fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); @@ -2504,36 +2617,31 @@ sel->type == PIPE_SHADER_GEOMETRY)) shader->key.as_ngg = 1; - if (sel->tokens || sel->nir) { - ir_binary = si_get_ir_binary(sel, shader->key.as_ngg, - shader->key.as_es); + if (sel->nir) { + si_get_ir_cache_key(sel, shader->key.as_ngg, + shader->key.as_es, ir_sha1_cache_key); } /* Try to load the shader from the shader cache. */ - mtx_lock(&sscreen->shader_cache_mutex); + simple_mtx_lock(&sscreen->shader_cache_mutex); - if (ir_binary && - si_shader_cache_load_shader(sscreen, ir_binary, shader)) { - mtx_unlock(&sscreen->shader_cache_mutex); + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); si_shader_dump_stats_for_shader_db(sscreen, shader, debug); } else { - mtx_unlock(&sscreen->shader_cache_mutex); + simple_mtx_unlock(&sscreen->shader_cache_mutex); /* Compile the shader if it hasn't been loaded from the cache. */ - if (si_compile_tgsi_shader(sscreen, compiler, shader, - debug) != 0) { + if (!si_compile_shader(sscreen, compiler, shader, debug)) { FREE(shader); - FREE(ir_binary); fprintf(stderr, "radeonsi: can't compile a main shader part\n"); return; } - if (ir_binary) { - mtx_lock(&sscreen->shader_cache_mutex); - if (!si_shader_cache_insert_shader(sscreen, ir_binary, shader, true)) - FREE(ir_binary); - mtx_unlock(&sscreen->shader_cache_mutex); - } + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, + shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); } *si_get_main_shader_part(sel, &shader->key) = shader; @@ -2594,6 +2702,12 @@ si_shader_vs(sscreen, sel->gs_copy_shader, sel); } + + /* Free NIR. We only keep serialized NIR after this point. */ + if (sel->nir) { + ralloc_free(sel->nir); + sel->nir = NULL; + } } void si_schedule_initial_compile(struct si_context *sctx, unsigned processor, @@ -2615,7 +2729,7 @@ } util_queue_add_job(&sctx->screen->shader_compiler_queue, job, - ready_fence, execute, NULL); + ready_fence, execute, NULL, 0); if (debug) { util_queue_fence_wait(ready_fence); @@ -2628,16 +2742,17 @@ } /* Return descriptor slot usage masks from the given shader info. */ -void si_get_active_slot_masks(const struct tgsi_shader_info *info, +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, uint64_t *samplers_and_images) { - unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers; + unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; num_shaderbufs = util_last_bit(info->shader_buffers_declared); num_constbufs = util_last_bit(info->const_buffers_declared); /* two 8-byte images share one 16-byte slot */ num_images = align(util_last_bit(info->images_declared), 2); + num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); num_samplers = util_last_bit(info->samplers_declared); /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ @@ -2645,7 +2760,18 @@ *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs); - /* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */ + /* The layout is: + * - fmask[last] ... fmask[0] go to [15-last .. 15] + * - image[last] ... image[0] go to [31-last .. 31] + * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] + * + * FMASKs for images are placed separately, because MSAA images are rare, + * and so we can benefit from a better cache hit rate if we keep image + * descriptors together. + */ + if (num_msaa_images) + num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ + start = si_get_image_slot(num_images - 1) / 2; *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers); @@ -2662,52 +2788,22 @@ if (!sel) return NULL; - pipe_reference_init(&sel->reference, 1); sel->screen = sscreen; sel->compiler_ctx_state.debug = sctx->debug; sel->compiler_ctx_state.is_debug_context = sctx->is_debug; sel->so = state->stream_output; - if (state->type == PIPE_SHADER_IR_TGSI && - !sscreen->options.enable_nir) { - sel->tokens = tgsi_dup_tokens(state->tokens); - if (!sel->tokens) { - FREE(sel); - return NULL; - } - - tgsi_scan_shader(state->tokens, &sel->info); - tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info); - - /* Fixup for TGSI: Set which opcode uses which (i,j) pair. */ - if (sel->info.uses_persp_opcode_interp_centroid) - sel->info.uses_persp_centroid = true; - - if (sel->info.uses_linear_opcode_interp_centroid) - sel->info.uses_linear_centroid = true; - - if (sel->info.uses_persp_opcode_interp_offset || - sel->info.uses_persp_opcode_interp_sample) - sel->info.uses_persp_center = true; - - if (sel->info.uses_linear_opcode_interp_offset || - sel->info.uses_linear_opcode_interp_sample) - sel->info.uses_linear_center = true; + if (state->type == PIPE_SHADER_IR_TGSI) { + sel->nir = tgsi_to_nir(state->tokens, ctx->screen); } else { - if (state->type == PIPE_SHADER_IR_TGSI) { - sel->nir = tgsi_to_nir(state->tokens, ctx->screen); - } else { - assert(state->type == PIPE_SHADER_IR_NIR); - sel->nir = state->ir.nir; - } - - si_nir_lower_ps_inputs(sel->nir); - si_nir_opts(sel->nir); - si_nir_scan_shader(sel->nir, &sel->info); - si_nir_scan_tess_ctrl(sel->nir, &sel->tcs_info); + assert(state->type == PIPE_SHADER_IR_NIR); + sel->nir = state->ir.nir; } + si_nir_scan_shader(sel->nir, &sel->info); + si_nir_adjust_driver_locations(sel->nir); + sel->type = sel->info.processor; p_atomic_inc(&sscreen->num_shaders_created); si_get_active_slot_masks(&sel->info, @@ -2721,6 +2817,12 @@ (sel->so.output[i].stream * 4); } + sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ? + sel->info.num_inputs : 0; + sel->num_vbos_in_user_sgprs = + MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + /* The prolog is a no-op if there are no inputs. */ sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && sel->info.num_inputs && @@ -2769,9 +2871,7 @@ /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */ sel->tess_turns_off_ngg = - (sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && + sscreen->info.chip_class == GFX10 && sel->gs_num_invocations * sel->gs_max_out_vertices > 256; break; @@ -2870,6 +2970,20 @@ default:; } + sel->ngg_culling_allowed = + sscreen->info.chip_class == GFX10 && + sscreen->info.has_dedicated_vram && + sscreen->use_ngg_culling && + /* Disallow TES by default, because TessMark results are mixed. */ + (sel->type == PIPE_SHADER_VERTEX || + (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) && + sel->info.writes_position && + !sel->info.writes_viewport_index && /* cull only against viewport 0 */ + !sel->info.writes_memory && + !sel->so.num_outputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + /* PA_CL_VS_OUT_CNTL */ if (sctx->chip_class <= GFX9) sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false); @@ -2932,7 +3046,7 @@ if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE]) sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); - (void) mtx_init(&sel->mutex, mtx_plain); + (void) simple_mtx_init(&sel->mutex, mtx_plain); si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, &sel->compiler_ctx_state, sel, @@ -2940,6 +3054,14 @@ return sel; } +static void *si_create_shader(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state); +} + static void si_update_streamout_state(struct si_context *sctx) { struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; @@ -3059,7 +3181,7 @@ sctx->flags |= SI_CONTEXT_VGT_FLUSH; sctx->ngg = new_ngg; - sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ return true; } return false; @@ -3082,7 +3204,7 @@ sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; si_update_common_shader_state(sctx); - sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ ngg_changed = si_update_ngg(sctx); if (ngg_changed || enable_changed) @@ -3136,7 +3258,7 @@ si_update_tess_uses_prim_id(sctx); si_update_common_shader_state(sctx); - sctx->last_rast_prim = -1; /* reset this so that it gets updated */ + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ bool ngg_changed = si_update_ngg(sctx); if (ngg_changed || enable_changed) @@ -3243,9 +3365,10 @@ free(shader); } -void si_destroy_shader_selector(struct si_context *sctx, - struct si_shader_selector *sel) +static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) { + struct si_context *sctx = (struct si_context*)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)cso; struct si_shader *p = sel->first_variant, *c; struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { [PIPE_SHADER_VERTEX] = &sctx->vs_shader, @@ -3280,9 +3403,9 @@ si_delete_shader(sctx, sel->gs_copy_shader); util_queue_fence_destroy(&sel->ready); - mtx_destroy(&sel->mutex); - free(sel->tokens); + simple_mtx_destroy(&sel->mutex); ralloc_free(sel->nir); + free(sel->nir_binary); free(sel); } @@ -3298,7 +3421,7 @@ struct si_shader *vs, unsigned name, unsigned index, unsigned interpolate) { - struct tgsi_shader_info *vsinfo = &vs->selector->info; + struct si_shader_info *vsinfo = &vs->selector->info; unsigned j, offset, ps_input_cntl = 0; if (interpolate == TGSI_INTERPOLATE_CONSTANT || @@ -3357,7 +3480,7 @@ { struct si_shader *ps = sctx->ps_shader.current; struct si_shader *vs = si_get_vs_state(sctx); - struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL; + struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; unsigned i, num_interp, num_written = 0, bcol_interp[2]; unsigned spi_ps_input_cntl[32]; @@ -3487,7 +3610,8 @@ pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - esgs_ring_size, alignment); + esgs_ring_size, + sctx->screen->info.pte_fragment_size); if (!sctx->esgs_ring) return false; } @@ -3498,7 +3622,8 @@ pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - gsvs_ring_size, alignment); + gsvs_ring_size, + sctx->screen->info.pte_fragment_size); if (!sctx->gsvs_ring) return false; } @@ -3561,18 +3686,18 @@ static void si_shader_lock(struct si_shader *shader) { - mtx_lock(&shader->selector->mutex); + simple_mtx_lock(&shader->selector->mutex); if (shader->previous_stage_sel) { assert(shader->previous_stage_sel != shader->selector); - mtx_lock(&shader->previous_stage_sel->mutex); + simple_mtx_lock(&shader->previous_stage_sel->mutex); } } static void si_shader_unlock(struct si_shader *shader) { if (shader->previous_stage_sel) - mtx_unlock(&shader->previous_stage_sel->mutex); - mtx_unlock(&shader->selector->mutex); + simple_mtx_unlock(&shader->previous_stage_sel->mutex); + simple_mtx_unlock(&shader->selector->mutex); } /** @@ -3737,9 +3862,10 @@ sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed_size, 256); + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + scratch_needed_size, + sctx->screen->info.pte_fragment_size); if (!sctx->scratch_buffer) return false; @@ -3846,9 +3972,10 @@ } if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1); - if (key.u.streamout) - stages |= S_028B54_NGG_WAVE_ID_EN(1); + stages |= S_028B54_PRIMGEN_EN(1) | + S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | + S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); } else if (key.u.gs) stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); @@ -3888,6 +4015,9 @@ old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; int r; + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + compiler_state.compiler = &sctx->compiler; compiler_state.debug = sctx->debug; compiler_state.is_debug_context = sctx->is_debug; @@ -3998,6 +4128,15 @@ } } + /* This must be done after the shader variant is selected. */ + if (sctx->ngg) { + struct si_shader *vs = si_get_vs(sctx)->current; + + key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); + key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & + SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + } + si_update_vgt_shader_config(sctx, key); if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) @@ -4025,7 +4164,7 @@ si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); } - if (sctx->screen->rbplus_allowed && + if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) && (!old_ps || old_spi_shader_col_format != @@ -4111,16 +4250,23 @@ } } +void si_init_screen_live_shader_cache(struct si_screen *sscreen) +{ + util_live_shader_cache_init(&sscreen->live_shader_cache, + si_create_shader_selector, + si_destroy_shader_selector); +} + void si_init_shader_functions(struct si_context *sctx) { sctx->atoms.s.spi_map.emit = si_emit_spi_map; sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; - sctx->b.create_vs_state = si_create_shader_selector; - sctx->b.create_tcs_state = si_create_shader_selector; - sctx->b.create_tes_state = si_create_shader_selector; - sctx->b.create_gs_state = si_create_shader_selector; - sctx->b.create_fs_state = si_create_shader_selector; + sctx->b.create_vs_state = si_create_shader; + sctx->b.create_tcs_state = si_create_shader; + sctx->b.create_tes_state = si_create_shader; + sctx->b.create_gs_state = si_create_shader; + sctx->b.create_fs_state = si_create_shader; sctx->b.bind_vs_state = si_bind_vs_shader; sctx->b.bind_tcs_state = si_bind_tcs_shader; diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_streamout.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_streamout.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_streamout.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_streamout.c 2020-06-12 01:21:17.000000000 +0000 @@ -65,7 +65,7 @@ t->b.buffer_offset = buffer_offset; t->b.buffer_size = buffer_size; - util_range_add(&buf->valid_buffer_range, buffer_offset, + util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); return &t->b; } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_viewport.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_viewport.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_state_viewport.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_state_viewport.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,11 +23,72 @@ */ #include "si_build_pm4.h" +#include "util/u_upload_mgr.h" #include "util/u_viewport.h" -#include "tgsi/tgsi_scan.h" #define SI_MAX_SCISSOR 16384 +void si_update_ngg_small_prim_precision(struct si_context *ctx) +{ + if (!ctx->screen->use_ngg_culling) + return; + + /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */ + unsigned num_samples = ctx->framebuffer.nr_samples; + unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; + float precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + precision = num_samples / 1024.0; + else + precision = num_samples / 256.0; + + ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; + ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); +} + +void si_get_small_prim_cull_info(struct si_context *sctx, + struct si_small_prim_cull_info *out) +{ + /* This is needed by the small primitive culling, because it's done + * in screen space. + */ + struct si_small_prim_cull_info info; + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + info.scale[0] = sctx->viewports.states[0].scale[0]; + info.scale[1] = sctx->viewports.states[0].scale[1]; + info.translate[0] = sctx->viewports.states[0].translate[0]; + info.translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. + * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + info.scale[1] = -info.scale[1]; + info.translate[1] = -info.translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + for (unsigned i = 0; i < 2; i++) { + info.scale[i] *= num_samples; + info.translate[i] *= num_samples; + } + *out = info; +} + static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot, unsigned num_scissors, @@ -282,6 +343,8 @@ vp_as_scissor.quant_mode)); if (initial_cdw != ctx->gfx_cs->current.cdw) ctx->context_roll = true; + + si_update_ngg_small_prim_precision(ctx); } static void si_emit_scissors(struct si_context *ctx) @@ -409,6 +472,35 @@ struct radeon_cmdbuf *cs = ctx->gfx_cs; struct pipe_viewport_state *states = ctx->viewports.states; + if (ctx->screen->use_ngg_culling) { + /* Set the viewport info for small primitive culling. */ + struct si_small_prim_cull_info info; + si_get_small_prim_cull_info(ctx, &info); + + if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { + unsigned offset = 0; + + /* Align to 256, because the address is shifted by 8 bits. */ + u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, + &info, &offset, + (struct pipe_resource**)&ctx->small_prim_cull_info_buf); + + ctx->small_prim_cull_info_address = + ctx->small_prim_cull_info_buf->gpu_address + offset; + ctx->last_small_prim_cull_info = info; + ctx->small_prim_cull_info_dirty = true; + } + + if (ctx->small_prim_cull_info_dirty) { + /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, + ctx->small_prim_cull_info_address >> 8); + ctx->small_prim_cull_info_dirty = false; + } + } + /* The simple case: Only 1 viewport is active. */ if (!ctx->vs_writes_viewport_index) { radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); @@ -487,7 +579,7 @@ */ void si_update_vs_viewport_state(struct si_context *ctx) { - struct tgsi_shader_info *info = si_get_vs_info(ctx); + struct si_shader_info *info = si_get_vs_info(ctx); bool vs_window_space; if (!info) diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_test_dma.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_test_dma.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_test_dma.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_test_dma.c 2020-06-12 01:21:17.000000000 +0000 @@ -223,7 +223,7 @@ struct si_texture *ssrc; struct cpu_texture src_cpu, dst_cpu; unsigned bpp, max_width, max_height, max_depth, j, num; - unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen; + unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; unsigned max_tex_layers; bool pass; bool do_partial_copies = rand() & 1; @@ -323,6 +323,7 @@ struct pipe_box box; unsigned old_num_draw_calls = sctx->num_draw_calls; unsigned old_num_dma_calls = sctx->num_dma_calls; + unsigned old_num_cs_calls = sctx->num_compute_calls; if (!do_partial_copies) { /* copy whole src to dst */ @@ -382,6 +383,7 @@ /* See which engine was used. */ gfx_blits += sctx->num_draw_calls > old_num_draw_calls; dma_blits += sctx->num_dma_calls > old_num_dma_calls; + cs_blits += sctx->num_compute_calls > old_num_cs_calls; /* CPU copy */ util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, @@ -398,8 +400,8 @@ else num_fail++; - printf("BLITs: GFX = %2u, DMA = %2u, %s [%u/%u]\n", - gfx_blits, dma_blits, pass ? "pass" : "fail", + printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", + gfx_blits, dma_blits, cs_blits, pass ? "pass" : "fail", num_pass, num_pass+num_fail); /* cleanup */ diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_test_dma_perf.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_test_dma_perf.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_test_dma_perf.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_test_dma_perf.c 2020-06-12 01:21:17.000000000 +0000 @@ -112,7 +112,7 @@ unsigned cs_dwords_per_thread = test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; - if (test_sdma && !sctx->dma_cs) + if (test_sdma && !sctx->sdma_cs) continue; if (sctx->chip_class == GFX6) { @@ -191,9 +191,7 @@ } else if (test_sdma) { /* SDMA */ if (is_copy) { - struct pipe_box box; - u_box_1d(0, size, &box); - sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box); + si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); } else { si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_texture.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_texture.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "si_pipe.h" #include "si_query.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_pack_color.h" @@ -38,429 +38,453 @@ #include "state_tracker/drm_driver.h" #include "sid.h" #include "amd/addrlib/inc/addrinterface.h" +#include "drm-uapi/drm_fourcc.h" static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile); + const struct pipe_resource *templ, bool tc_compatible_htile); bool si_prepare_for_dma_blit(struct si_context *sctx, - struct si_texture *dst, - unsigned dst_level, unsigned dstx, - unsigned dsty, unsigned dstz, - struct si_texture *src, - unsigned src_level, - const struct pipe_box *src_box) -{ - if (!sctx->dma_cs) - return false; - - if (dst->surface.bpe != src->surface.bpe) - return false; - - /* MSAA: Blits don't exist in the real world. */ - if (src->buffer.b.b.nr_samples > 1 || - dst->buffer.b.b.nr_samples > 1) - return false; - - /* Depth-stencil surfaces: - * When dst is linear, the DB->CB copy preserves HTILE. - * When dst is tiled, the 3D path must be used to update HTILE. - */ - if (src->is_depth || dst->is_depth) - return false; - - /* DCC as: - * src: Use the 3D path. DCC decompression is expensive. - * dst: Use the 3D path to compress the pixels with DCC. - */ - if (vi_dcc_enabled(src, src_level) || - vi_dcc_enabled(dst, dst_level)) - return false; - - /* CMASK as: - * src: Both texture and SDMA paths need decompression. Use SDMA. - * dst: If overwriting the whole texture, discard CMASK and use - * SDMA. Otherwise, use the 3D path. - */ - if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { - /* The CMASK clear is only enabled for the first level. */ - assert(dst_level == 0); - if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, - dstx, dsty, dstz, src_box->width, - src_box->height, src_box->depth)) - return false; - - si_texture_discard_cmask(sctx->screen, dst); - } - - /* All requirements are met. Prepare textures for SDMA. */ - if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) - sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); + struct si_texture *dst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct si_texture *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (!sctx->sdma_cs) + return false; + + if (dst->surface.bpe != src->surface.bpe) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (src->buffer.b.b.nr_samples > 1 || + dst->buffer.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (src->is_depth || dst->is_depth) + return false; + + /* DCC as: + * src: Use the 3D path. DCC decompression is expensive. + * dst: Use the 3D path to compress the pixels with DCC. + */ + if (vi_dcc_enabled(src, src_level) || + vi_dcc_enabled(dst, dst_level)) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, + dstx, dsty, dstz, src_box->width, + src_box->height, src_box->depth)) + return false; + + si_texture_discard_cmask(sctx->screen, dst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) + sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); - assert(!(src->dirty_level_mask & (1 << src_level))); - assert(!(dst->dirty_level_mask & (1 << dst_level))); + assert(!(src->dirty_level_mask & (1 << src_level))); + assert(!(dst->dirty_level_mask & (1 << dst_level))); - return true; + return true; } /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */ static void si_copy_region_with_blit(struct pipe_context *pipe, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) -{ - struct pipe_blit_info blit; - - memset(&blit, 0, sizeof(blit)); - blit.src.resource = src; - blit.src.format = src->format; - blit.src.level = src_level; - blit.src.box = *src_box; - blit.dst.resource = dst; - blit.dst.format = dst->format; - blit.dst.level = dst_level; - blit.dst.box.x = dstx; - blit.dst.box.y = dsty; - blit.dst.box.z = dstz; - blit.dst.box.width = src_box->width; - blit.dst.box.height = src_box->height; - blit.dst.box.depth = src_box->depth; - blit.mask = util_format_get_mask(dst->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - if (blit.mask) { - pipe->blit(pipe, &blit); - } + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct pipe_blit_info blit; + + memset(&blit, 0, sizeof(blit)); + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = src_level; + blit.src.box = *src_box; + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = dst_level; + blit.dst.box.x = dstx; + blit.dst.box.y = dsty; + blit.dst.box.z = dstz; + blit.dst.box.width = src_box->width; + blit.dst.box.height = src_box->height; + blit.dst.box.depth = src_box->depth; + blit.mask = util_format_get_mask(dst->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + if (blit.mask) { + pipe->blit(pipe, &blit); + } } /* Copy from a full GPU texture to a transfer's staging one. */ static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = &stransfer->staging->b.b; - struct pipe_resource *src = transfer->resource; - - if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) { - si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, - src, transfer->level, &transfer->box); - return; - } + struct si_context *sctx = (struct si_context*)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; + struct pipe_resource *dst = &stransfer->staging->b.b; + struct pipe_resource *src = transfer->resource; + + if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) { + si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, + src, transfer->level, &transfer->box); + return; + } - sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, - &transfer->box); + sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, + &transfer->box); } /* Copy from a transfer's staging texture to a full GPU one. */ static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = transfer->resource; - struct pipe_resource *src = &stransfer->staging->b.b; - struct pipe_box sbox; - - u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); - - if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) { - si_copy_region_with_blit(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); - return; - } - - if (util_format_is_compressed(dst->format)) { - sbox.width = util_format_get_nblocksx(dst->format, sbox.width); - sbox.height = util_format_get_nblocksx(dst->format, sbox.height); - } - - sctx->dma_copy(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); + struct si_context *sctx = (struct si_context*)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; + struct pipe_resource *dst = transfer->resource; + struct pipe_resource *src = &stransfer->staging->b.b; + struct pipe_box sbox; + + u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); + + if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) { + si_copy_region_with_blit(ctx, dst, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + src, 0, &sbox); + return; + } + + if (util_format_is_compressed(dst->format)) { + sbox.width = util_format_get_nblocksx(dst->format, sbox.width); + sbox.height = util_format_get_nblocksx(dst->format, sbox.height); + } + + sctx->dma_copy(ctx, dst, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + src, 0, &sbox); } static unsigned si_texture_get_offset(struct si_screen *sscreen, - struct si_texture *tex, unsigned level, - const struct pipe_box *box, - unsigned *stride, - unsigned *layer_stride) -{ - if (sscreen->info.chip_class >= GFX9) { - *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; - *layer_stride = tex->surface.u.gfx9.surf_slice_size; - - if (!box) - return 0; - - /* Each texture is an array of slices. Each slice is an array - * of mipmap levels. */ - return box->z * tex->surface.u.gfx9.surf_slice_size + - tex->surface.u.gfx9.offset[level] + - (box->y / tex->surface.blk_h * - tex->surface.u.gfx9.surf_pitch + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } else { - *stride = tex->surface.u.legacy.level[level].nblk_x * - tex->surface.bpe; - assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); - *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; - - if (!box) - return tex->surface.u.legacy.level[level].offset; - - /* Each texture is an array of mipmap levels. Each level is - * an array of slices. */ - return tex->surface.u.legacy.level[level].offset + - box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + - (box->y / tex->surface.blk_h * - tex->surface.u.legacy.level[level].nblk_x + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } + struct si_texture *tex, unsigned level, + const struct pipe_box *box, + unsigned *stride, + unsigned *layer_stride) +{ + if (sscreen->info.chip_class >= GFX9) { + *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + *layer_stride = tex->surface.u.gfx9.surf_slice_size; + + if (!box) + return 0; + + /* Each texture is an array of slices. Each slice is an array + * of mipmap levels. */ + return tex->surface.u.gfx9.surf_offset + + box->z * tex->surface.u.gfx9.surf_slice_size + + tex->surface.u.gfx9.offset[level] + + (box->y / tex->surface.blk_h * + tex->surface.u.gfx9.surf_pitch + + box->x / tex->surface.blk_w) * tex->surface.bpe; + } else { + *stride = tex->surface.u.legacy.level[level].nblk_x * + tex->surface.bpe; + assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); + *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; + + if (!box) + return tex->surface.u.legacy.level[level].offset; + + /* Each texture is an array of mipmap levels. Each level is + * an array of slices. */ + return tex->surface.u.legacy.level[level].offset + + box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + + (box->y / tex->surface.blk_h * + tex->surface.u.legacy.level[level].nblk_x + + box->x / tex->surface.blk_w) * tex->surface.bpe; + } } static int si_init_surface(struct si_screen *sscreen, - struct radeon_surf *surface, - const struct pipe_resource *ptex, - enum radeon_surf_mode array_mode, - unsigned pitch_in_bytes_override, - unsigned offset, - bool is_imported, - bool is_scanout, - bool is_flushed_depth, - bool tc_compatible_htile) -{ - const struct util_format_description *desc = - util_format_description(ptex->format); - bool is_depth, is_stencil; - int r; - unsigned i, bpe, flags = 0; - - is_depth = util_format_has_depth(desc); - is_stencil = util_format_has_stencil(desc); - - if (!is_flushed_depth && - ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - bpe = 4; /* stencil is allocated separately */ - } else { - bpe = util_format_get_blocksize(ptex->format); - assert(util_is_power_of_two_or_zero(bpe)); - } - - if (!is_flushed_depth && is_depth) { - flags |= RADEON_SURF_ZBUFFER; - - if (tc_compatible_htile && - (sscreen->info.chip_class >= GFX9 || - array_mode == RADEON_SURF_MODE_2D)) { - /* TC-compatible HTILE only supports Z32_FLOAT. - * GFX9 also supports Z16_UNORM. - * On GFX8, promote Z16 to Z32. DB->CB copies will convert - * the format for transfers. - */ - if (sscreen->info.chip_class == GFX8) - bpe = 4; - - flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - if (is_stencil) - flags |= RADEON_SURF_SBUFFER; - } - - if (sscreen->info.chip_class >= GFX8 && - (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || - ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || - (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) - flags |= RADEON_SURF_DISABLE_DCC; - - /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */ - if (sscreen->info.family == CHIP_STONEY && - bpe == 16 && ptex->nr_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ - if (sscreen->info.chip_class == GFX8 && - ptex->nr_storage_samples >= 4 && - ptex->array_size > 1) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ - if (sscreen->info.chip_class == GFX9 && - ptex->nr_storage_samples >= 4) - flags |= RADEON_SURF_DISABLE_DCC; - - /* TODO: GFX10: DCC causes corruption with MSAA. */ - if (sscreen->info.chip_class >= GFX10 && - ptex->nr_storage_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) { - /* This should catch bugs in gallium users setting incorrect flags. */ - assert(ptex->nr_samples <= 1 && - ptex->array_size == 1 && - ptex->depth0 == 1 && - ptex->last_level == 0 && - !(flags & RADEON_SURF_Z_OR_SBUFFER)); - - flags |= RADEON_SURF_SCANOUT; - } - - if (ptex->bind & PIPE_BIND_SHARED) - flags |= RADEON_SURF_SHAREABLE; - if (is_imported) - flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; - if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) - flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; - - if (sscreen->info.chip_class >= GFX10 && - (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { - flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; - surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; - } - - r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, - array_mode, surface); - if (r) { - return r; - } - - unsigned pitch = pitch_in_bytes_override / bpe; - - if (sscreen->info.chip_class >= GFX9) { - if (pitch) { - surface->u.gfx9.surf_pitch = pitch; - surface->u.gfx9.surf_slice_size = - (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; - } - surface->u.gfx9.surf_offset = offset; - } else { - if (pitch) { - surface->u.legacy.level[0].nblk_x = pitch; - surface->u.legacy.level[0].slice_size_dw = - ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; - } - if (offset) { - for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) - surface->u.legacy.level[i].offset += offset; - } - } - return 0; + struct radeon_surf *surface, + const struct pipe_resource *ptex, + enum radeon_surf_mode array_mode, + unsigned pitch_in_bytes_override, + bool is_imported, + bool is_scanout, + bool is_flushed_depth, + bool tc_compatible_htile) +{ + const struct util_format_description *desc = + util_format_description(ptex->format); + bool is_depth, is_stencil; + int r; + unsigned bpe, flags = 0; + + is_depth = util_format_has_depth(desc); + is_stencil = util_format_has_stencil(desc); + + if (!is_flushed_depth && + ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + bpe = 4; /* stencil is allocated separately */ + } else { + bpe = util_format_get_blocksize(ptex->format); + assert(util_is_power_of_two_or_zero(bpe)); + } + + if (!is_flushed_depth && is_depth) { + flags |= RADEON_SURF_ZBUFFER; + + if (sscreen->debug_flags & DBG(NO_HYPERZ)) { + flags |= RADEON_SURF_NO_HTILE; + } else if (tc_compatible_htile && + (sscreen->info.chip_class >= GFX9 || + array_mode == RADEON_SURF_MODE_2D)) { + /* TC-compatible HTILE only supports Z32_FLOAT. + * GFX9 also supports Z16_UNORM. + * On GFX8, promote Z16 to Z32. DB->CB copies will convert + * the format for transfers. + */ + if (sscreen->info.chip_class == GFX8) + bpe = 4; + + flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + if (is_stencil) + flags |= RADEON_SURF_SBUFFER; + } + + if (sscreen->info.chip_class >= GFX8 && + (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || + ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || + (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */ + if (sscreen->info.family == CHIP_STONEY && + bpe == 16 && ptex->nr_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ + if (sscreen->info.chip_class == GFX8 && + ptex->nr_storage_samples >= 4 && + ptex->array_size > 1) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ + if (sscreen->info.chip_class == GFX9 && + (ptex->nr_storage_samples >= 4 || + (sscreen->info.family == CHIP_RAVEN && + ptex->nr_storage_samples >= 2 && bpe < 4))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* TODO: GFX10: DCC causes corruption with MSAA. */ + if (sscreen->info.chip_class >= GFX10 && + ptex->nr_storage_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Shared textures must always set up DCC. + * If it's not present, it will be disabled by + * si_get_opaque_metadata later. + */ + if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC))) + flags |= RADEON_SURF_DISABLE_DCC; + + if (is_scanout) { + /* This should catch bugs in gallium users setting incorrect flags. */ + assert(ptex->nr_samples <= 1 && + ptex->array_size == 1 && + ptex->depth0 == 1 && + ptex->last_level == 0 && + !(flags & RADEON_SURF_Z_OR_SBUFFER)); + + flags |= RADEON_SURF_SCANOUT; + } + + if (ptex->bind & PIPE_BIND_SHARED) + flags |= RADEON_SURF_SHAREABLE; + if (is_imported) + flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; + if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) + flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; + if (sscreen->debug_flags & DBG(NO_FMASK)) + flags |= RADEON_SURF_NO_FMASK; + + if (sscreen->info.chip_class == GFX9 && + (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) { + flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE; + surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags); + } + + if (sscreen->info.chip_class >= GFX10 && + (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { + flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; + surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; + } + + r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, + array_mode, surface); + if (r) { + return r; + } + + unsigned pitch = pitch_in_bytes_override / bpe; + + if (sscreen->info.chip_class >= GFX9) { + if (pitch) { + surface->u.gfx9.surf_pitch = pitch; + if (ptex->last_level == 0) + surface->u.gfx9.surf.epitch = pitch - 1; + surface->u.gfx9.surf_slice_size = + (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; + } + } else { + if (pitch) { + surface->u.legacy.level[0].nblk_x = pitch; + surface->u.legacy.level[0].slice_size_dw = + ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; + } + } + return 0; } static void si_get_display_metadata(struct si_screen *sscreen, - struct radeon_surf *surf, - struct radeon_bo_metadata *metadata, - enum radeon_surf_mode *array_mode, - bool *is_scanout) -{ - if (sscreen->info.chip_class >= GFX9) { - if (metadata->u.gfx9.swizzle_mode > 0) - *array_mode = RADEON_SURF_MODE_2D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - *is_scanout = metadata->u.gfx9.swizzle_mode == 0 || - metadata->u.gfx9.swizzle_mode % 4 == 2; - - surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; - - if (metadata->u.gfx9.dcc_offset_256B) { - surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; - assert(metadata->u.gfx9.dcc_independent_64B == 1); - } - } else { - surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; - surf->u.legacy.bankw = metadata->u.legacy.bankw; - surf->u.legacy.bankh = metadata->u.legacy.bankh; - surf->u.legacy.tile_split = metadata->u.legacy.tile_split; - surf->u.legacy.mtilea = metadata->u.legacy.mtilea; - surf->u.legacy.num_banks = metadata->u.legacy.num_banks; - - if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_2D; - else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_1D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - *is_scanout = metadata->u.legacy.scanout; - } -} - -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex) -{ - struct si_screen *sscreen = sctx->screen; - struct pipe_context *ctx = &sctx->b; - - if (ctx == sscreen->aux_context) - mtx_lock(&sscreen->aux_context_lock); - - unsigned n = sctx->num_decompress_calls; - ctx->flush_resource(ctx, &tex->buffer.b.b); - - /* Flush only if any fast clear elimination took place. */ - if (n != sctx->num_decompress_calls) - ctx->flush(ctx, NULL, 0); + struct radeon_surf *surf, + struct radeon_bo_metadata *metadata, + enum radeon_surf_mode *array_mode, + bool *is_scanout) +{ + if (sscreen->info.chip_class >= GFX9) { + if (metadata->u.gfx9.swizzle_mode > 0) + *array_mode = RADEON_SURF_MODE_2D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; + *is_scanout = metadata->u.gfx9.scanout; + + if (metadata->u.gfx9.dcc_offset_256B) { + surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; + assert(metadata->u.gfx9.dcc_independent_64B == 1); + } + } else { + surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; + surf->u.legacy.bankw = metadata->u.legacy.bankw; + surf->u.legacy.bankh = metadata->u.legacy.bankh; + surf->u.legacy.tile_split = metadata->u.legacy.tile_split; + surf->u.legacy.mtilea = metadata->u.legacy.mtilea; + surf->u.legacy.num_banks = metadata->u.legacy.num_banks; + + if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_2D; + else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_1D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + *is_scanout = metadata->u.legacy.scanout; + } +} + +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex, + bool *ctx_flushed) +{ + struct si_screen *sscreen = sctx->screen; + struct pipe_context *ctx = &sctx->b; + + if (ctx == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); + + unsigned n = sctx->num_decompress_calls; + ctx->flush_resource(ctx, &tex->buffer.b.b); + + /* Flush only if any fast clear elimination took place. */ + bool flushed = false; + if (n != sctx->num_decompress_calls) + { + ctx->flush(ctx, NULL, 0); + flushed = true; + } + if (ctx_flushed) + *ctx_flushed = flushed; - if (ctx == sscreen->aux_context) - mtx_unlock(&sscreen->aux_context_lock); + if (ctx == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); } void si_texture_discard_cmask(struct si_screen *sscreen, - struct si_texture *tex) + struct si_texture *tex) { - if (!tex->cmask_buffer) - return; + if (!tex->cmask_buffer) + return; - assert(tex->buffer.b.b.nr_samples <= 1); + assert(tex->buffer.b.b.nr_samples <= 1); - /* Disable CMASK. */ - tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; - tex->dirty_level_mask = 0; + /* Disable CMASK. */ + tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; + tex->dirty_level_mask = 0; - tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); + tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); - if (tex->cmask_buffer != &tex->buffer) - si_resource_reference(&tex->cmask_buffer, NULL); + if (tex->cmask_buffer != &tex->buffer) + si_resource_reference(&tex->cmask_buffer, NULL); - tex->cmask_buffer = NULL; + tex->cmask_buffer = NULL; - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - p_atomic_inc(&sscreen->compressed_colortex_counter); + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->compressed_colortex_counter); } static bool si_can_disable_dcc(struct si_texture *tex) { - /* We can't disable DCC if it can be written by another process. */ - return tex->dcc_offset && - (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); + /* We can't disable DCC if it can be written by another process. */ + return tex->surface.dcc_offset && + (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); +} + +static void si_texture_zero_dcc_fields(struct si_texture *tex) +{ + tex->surface.dcc_offset = 0; + tex->surface.display_dcc_offset = 0; + tex->surface.dcc_retile_map_offset = 0; } static bool si_texture_discard_dcc(struct si_screen *sscreen, - struct si_texture *tex) + struct si_texture *tex) { - if (!si_can_disable_dcc(tex)) { - assert(tex->display_dcc_offset == 0); - return false; - } - - assert(tex->dcc_separate_buffer == NULL); - - /* Disable DCC. */ - tex->dcc_offset = 0; - tex->display_dcc_offset = 0; - tex->dcc_retile_map_offset = 0; - - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - return true; + if (!si_can_disable_dcc(tex)) + return false; + + assert(tex->dcc_separate_buffer == NULL); + + /* Disable DCC. */ + si_texture_zero_dcc_fields(tex); + + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + return true; } /** @@ -485,1379 +509,1405 @@ * if you don't. */ bool si_texture_disable_dcc(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - if (!sctx->has_graphics) - return si_texture_discard_dcc(sscreen, tex); + if (!sctx->has_graphics) + return si_texture_discard_dcc(sscreen, tex); - if (!si_can_disable_dcc(tex)) - return false; + if (!si_can_disable_dcc(tex)) + return false; - if (&sctx->b == sscreen->aux_context) - mtx_lock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); - /* Decompress DCC. */ - si_decompress_dcc(sctx, tex); - sctx->b.flush(&sctx->b, NULL, 0); + /* Decompress DCC. */ + si_decompress_dcc(sctx, tex); + sctx->b.flush(&sctx->b, NULL, 0); - if (&sctx->b == sscreen->aux_context) - mtx_unlock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); - return si_texture_discard_dcc(sscreen, tex); + return si_texture_discard_dcc(sscreen, tex); } static void si_reallocate_texture_inplace(struct si_context *sctx, - struct si_texture *tex, - unsigned new_bind_flag, - bool invalidate_storage) -{ - struct pipe_screen *screen = sctx->b.screen; - struct si_texture *new_tex; - struct pipe_resource templ = tex->buffer.b.b; - unsigned i; - - templ.bind |= new_bind_flag; - - if (tex->buffer.b.is_shared) - return; - - if (new_bind_flag == PIPE_BIND_LINEAR) { - if (tex->surface.is_linear) - return; - - /* This fails with MSAA, depth, and compressed textures. */ - if (si_choose_tiling(sctx->screen, &templ, false) != - RADEON_SURF_MODE_LINEAR_ALIGNED) - return; - } - - new_tex = (struct si_texture*)screen->resource_create(screen, &templ); - if (!new_tex) - return; - - /* Copy the pixels to the new texture. */ - if (!invalidate_storage) { - for (i = 0; i <= templ.last_level; i++) { - struct pipe_box box; - - u_box_3d(0, 0, 0, - u_minify(templ.width0, i), u_minify(templ.height0, i), - util_num_layers(&templ, i), &box); - - sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, - &tex->buffer.b.b, i, &box); - } - } - - if (new_bind_flag == PIPE_BIND_LINEAR) { - si_texture_discard_cmask(sctx->screen, tex); - si_texture_discard_dcc(sctx->screen, tex); - } - - /* Replace the structure fields of tex. */ - tex->buffer.b.b.bind = templ.bind; - pb_reference(&tex->buffer.buf, new_tex->buffer.buf); - tex->buffer.gpu_address = new_tex->buffer.gpu_address; - tex->buffer.vram_usage = new_tex->buffer.vram_usage; - tex->buffer.gart_usage = new_tex->buffer.gart_usage; - tex->buffer.bo_size = new_tex->buffer.bo_size; - tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; - tex->buffer.domains = new_tex->buffer.domains; - tex->buffer.flags = new_tex->buffer.flags; - - tex->surface = new_tex->surface; - tex->size = new_tex->size; - si_texture_reference(&tex->flushed_depth_texture, - new_tex->flushed_depth_texture); - - tex->fmask_offset = new_tex->fmask_offset; - tex->cmask_offset = new_tex->cmask_offset; - tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; - - if (tex->cmask_buffer == &tex->buffer) - tex->cmask_buffer = NULL; - else - si_resource_reference(&tex->cmask_buffer, NULL); - - if (new_tex->cmask_buffer == &new_tex->buffer) - tex->cmask_buffer = &tex->buffer; - else - si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); - - tex->dcc_offset = new_tex->dcc_offset; - tex->cb_color_info = new_tex->cb_color_info; - memcpy(tex->color_clear_value, new_tex->color_clear_value, - sizeof(tex->color_clear_value)); - tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; - - tex->htile_offset = new_tex->htile_offset; - tex->depth_clear_value = new_tex->depth_clear_value; - tex->dirty_level_mask = new_tex->dirty_level_mask; - tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; - tex->db_render_format = new_tex->db_render_format; - tex->stencil_clear_value = new_tex->stencil_clear_value; - tex->tc_compatible_htile = new_tex->tc_compatible_htile; - tex->depth_cleared = new_tex->depth_cleared; - tex->stencil_cleared = new_tex->stencil_cleared; - tex->upgraded_depth = new_tex->upgraded_depth; - tex->db_compatible = new_tex->db_compatible; - tex->can_sample_z = new_tex->can_sample_z; - tex->can_sample_s = new_tex->can_sample_s; - - tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; - tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; - si_resource_reference(&tex->dcc_separate_buffer, - new_tex->dcc_separate_buffer); - si_resource_reference(&tex->last_dcc_separate_buffer, - new_tex->last_dcc_separate_buffer); - - if (new_bind_flag == PIPE_BIND_LINEAR) { - assert(!tex->htile_offset); - assert(!tex->cmask_buffer); - assert(!tex->surface.fmask_size); - assert(!tex->dcc_offset); - assert(!tex->is_depth); - } + struct si_texture *tex, + unsigned new_bind_flag, + bool invalidate_storage) +{ + struct pipe_screen *screen = sctx->b.screen; + struct si_texture *new_tex; + struct pipe_resource templ = tex->buffer.b.b; + unsigned i; + + templ.bind |= new_bind_flag; + + if (tex->buffer.b.is_shared || tex->num_planes > 1) + return; + + if (new_bind_flag == PIPE_BIND_LINEAR) { + if (tex->surface.is_linear) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (si_choose_tiling(sctx->screen, &templ, false) != + RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + } + + new_tex = (struct si_texture*)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. */ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, + u_minify(templ.width0, i), u_minify(templ.height0, i), + util_num_layers(&templ, i), &box); + + sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, + &tex->buffer.b.b, i, &box); + } + } + + if (new_bind_flag == PIPE_BIND_LINEAR) { + si_texture_discard_cmask(sctx->screen, tex); + si_texture_discard_dcc(sctx->screen, tex); + } + + /* Replace the structure fields of tex. */ + tex->buffer.b.b.bind = templ.bind; + pb_reference(&tex->buffer.buf, new_tex->buffer.buf); + tex->buffer.gpu_address = new_tex->buffer.gpu_address; + tex->buffer.vram_usage = new_tex->buffer.vram_usage; + tex->buffer.gart_usage = new_tex->buffer.gart_usage; + tex->buffer.bo_size = new_tex->buffer.bo_size; + tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; + tex->buffer.domains = new_tex->buffer.domains; + tex->buffer.flags = new_tex->buffer.flags; + + tex->surface = new_tex->surface; + si_texture_reference(&tex->flushed_depth_texture, + new_tex->flushed_depth_texture); + + tex->surface.fmask_offset = new_tex->surface.fmask_offset; + tex->surface.cmask_offset = new_tex->surface.cmask_offset; + tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; + + if (tex->cmask_buffer == &tex->buffer) + tex->cmask_buffer = NULL; + else + si_resource_reference(&tex->cmask_buffer, NULL); + + if (new_tex->cmask_buffer == &new_tex->buffer) + tex->cmask_buffer = &tex->buffer; + else + si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); + + tex->surface.dcc_offset = new_tex->surface.dcc_offset; + tex->cb_color_info = new_tex->cb_color_info; + memcpy(tex->color_clear_value, new_tex->color_clear_value, + sizeof(tex->color_clear_value)); + tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; + + tex->surface.htile_offset = new_tex->surface.htile_offset; + tex->depth_clear_value = new_tex->depth_clear_value; + tex->dirty_level_mask = new_tex->dirty_level_mask; + tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; + tex->db_render_format = new_tex->db_render_format; + tex->stencil_clear_value = new_tex->stencil_clear_value; + tex->tc_compatible_htile = new_tex->tc_compatible_htile; + tex->depth_cleared = new_tex->depth_cleared; + tex->stencil_cleared = new_tex->stencil_cleared; + tex->upgraded_depth = new_tex->upgraded_depth; + tex->db_compatible = new_tex->db_compatible; + tex->can_sample_z = new_tex->can_sample_z; + tex->can_sample_s = new_tex->can_sample_s; + + tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; + tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty; + tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; + si_resource_reference(&tex->dcc_separate_buffer, + new_tex->dcc_separate_buffer); + si_resource_reference(&tex->last_dcc_separate_buffer, + new_tex->last_dcc_separate_buffer); + + if (new_bind_flag == PIPE_BIND_LINEAR) { + assert(!tex->surface.htile_offset); + assert(!tex->cmask_buffer); + assert(!tex->surface.fmask_size); + assert(!tex->surface.dcc_offset); + assert(!tex->is_depth); + } - si_texture_reference(&new_tex, NULL); + si_texture_reference(&new_tex, NULL); - p_atomic_inc(&sctx->screen->dirty_tex_counter); + p_atomic_inc(&sctx->screen->dirty_tex_counter); } static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen) { - return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; + return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; } static void si_set_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex) + struct si_texture *tex) { - struct radeon_surf *surface = &tex->surface; - struct pipe_resource *res = &tex->buffer.b.b; - struct radeon_bo_metadata md; - - memset(&md, 0, sizeof(md)); - - if (sscreen->info.chip_class >= GFX9) { - md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; - - if (tex->dcc_offset && !tex->dcc_separate_buffer) { - uint64_t dcc_offset = - tex->display_dcc_offset ? tex->display_dcc_offset - : tex->dcc_offset; - - assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); - md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; - md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; - md.u.gfx9.dcc_independent_64B = 1; - } - } else { - md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.pipe_config = surface->u.legacy.pipe_config; - md.u.legacy.bankw = surface->u.legacy.bankw; - md.u.legacy.bankh = surface->u.legacy.bankh; - md.u.legacy.tile_split = surface->u.legacy.tile_split; - md.u.legacy.mtilea = surface->u.legacy.mtilea; - md.u.legacy.num_banks = surface->u.legacy.num_banks; - md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; - md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; - } - - assert(tex->dcc_separate_buffer == NULL); - assert(tex->surface.fmask_size == 0); - - /* Metadata image format format version 1: - * [0] = 1 (metadata format identifier) - * [1] = (VENDOR_ID << 16) | PCI_ID - * [2:9] = image descriptor for the whole resource - * [2] is always 0, because the base address is cleared - * [9] is the DCC offset bits [39:8] from the beginning of - * the buffer - * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level - */ - - md.metadata[0] = 1; /* metadata image format version 1 */ - - /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ - md.metadata[1] = si_get_bo_metadata_word1(sscreen); - - static const unsigned char swizzle[] = { - PIPE_SWIZZLE_X, - PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, - PIPE_SWIZZLE_W - }; - bool is_array = util_texture_is_array(res->target); - uint32_t desc[8]; - - sscreen->make_texture_descriptor(sscreen, tex, true, - res->target, res->format, - swizzle, 0, res->last_level, 0, - is_array ? res->array_size - 1 : 0, - res->width0, res->height0, res->depth0, - desc, NULL); - - si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], - 0, 0, tex->surface.blk_w, false, desc); - - /* Clear the base address and set the relative DCC offset. */ - desc[0] = 0; - desc[1] &= C_008F14_BASE_ADDRESS_HI; - - switch (sscreen->info.chip_class) { - case GFX6: - case GFX7: - break; - case GFX8: - desc[7] = tex->dcc_offset >> 8; - break; - case GFX9: - desc[7] = tex->dcc_offset >> 8; - desc[5] &= C_008F24_META_DATA_ADDRESS; - desc[5] |= S_008F24_META_DATA_ADDRESS(tex->dcc_offset >> 40); - break; - case GFX10: - desc[6] &= C_00A018_META_DATA_ADDRESS_LO; - desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->dcc_offset >> 8); - desc[7] = tex->dcc_offset >> 16; - break; - default: - assert(0); - } - - - /* Dwords [2:9] contain the image descriptor. */ - memcpy(&md.metadata[2], desc, sizeof(desc)); - md.size_metadata = 10 * 4; - - /* Dwords [10:..] contain the mipmap level offsets. */ - if (sscreen->info.chip_class <= GFX8) { - for (unsigned i = 0; i <= res->last_level; i++) - md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8; + struct radeon_surf *surface = &tex->surface; + struct pipe_resource *res = &tex->buffer.b.b; + struct radeon_bo_metadata md; + + memset(&md, 0, sizeof(md)); + + if (sscreen->info.chip_class >= GFX9) { + md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; + md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + + if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) { + uint64_t dcc_offset = + tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset + : tex->surface.dcc_offset; + + assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); + md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; + md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; + md.u.gfx9.dcc_independent_64B = 1; + } + } else { + md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + md.u.legacy.pipe_config = surface->u.legacy.pipe_config; + md.u.legacy.bankw = surface->u.legacy.bankw; + md.u.legacy.bankh = surface->u.legacy.bankh; + md.u.legacy.tile_split = surface->u.legacy.tile_split; + md.u.legacy.mtilea = surface->u.legacy.mtilea; + md.u.legacy.num_banks = surface->u.legacy.num_banks; + md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; + md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + } + + assert(tex->dcc_separate_buffer == NULL); + assert(tex->surface.fmask_size == 0); + + /* Metadata image format format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + + md.metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md.metadata[1] = si_get_bo_metadata_word1(sscreen); + + static const unsigned char swizzle[] = { + PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W + }; + bool is_array = util_texture_is_array(res->target); + uint32_t desc[8]; + + sscreen->make_texture_descriptor(sscreen, tex, true, + res->target, res->format, + swizzle, 0, res->last_level, 0, + is_array ? res->array_size - 1 : 0, + res->width0, res->height0, res->depth0, + desc, NULL); + + si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], + 0, 0, tex->surface.blk_w, false, desc); + + /* Clear the base address and set the relative DCC offset. */ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + + switch (sscreen->info.chip_class) { + case GFX6: + case GFX7: + break; + case GFX8: + desc[7] = tex->surface.dcc_offset >> 8; + break; + case GFX9: + desc[7] = tex->surface.dcc_offset >> 8; + desc[5] &= C_008F24_META_DATA_ADDRESS; + desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40); + break; + case GFX10: + desc[6] &= C_00A018_META_DATA_ADDRESS_LO; + desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8); + desc[7] = tex->surface.dcc_offset >> 16; + break; + default: + assert(0); + } + + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md.metadata[2], desc, sizeof(desc)); + md.size_metadata = 10 * 4; + + /* Dwords [10:..] contain the mipmap level offsets. */ + if (sscreen->info.chip_class <= GFX8) { + for (unsigned i = 0; i <= res->last_level; i++) + md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8; - md.size_metadata += (1 + res->last_level) * 4; - } + md.size_metadata += (1 + res->last_level) * 4; + } - sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); + sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); } static bool si_read_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex, - struct radeon_bo_metadata *md) -{ - uint32_t *desc = &md->metadata[2]; - - if (md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ - md->metadata[0] == 0 || /* invalid version number */ - md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { - /* Don't report an error if the texture comes from an incompatible driver, - * but this might not work. - */ - return true; - } - - /* Validate that sample counts and the number of mipmap levels match. */ - unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); - unsigned type = G_008F1C_TYPE(desc[3]); - - if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - unsigned log_samples = - util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); - - if (last_level != log_samples) { - fprintf(stderr, "radeonsi: invalid MSAA texture import, " - "metadata has log2(samples) = %u, the caller set %u\n", - last_level, log_samples); - return false; - } - } else { - if (last_level != tex->buffer.b.b.last_level) { - fprintf(stderr, "radeonsi: invalid mipmapped texture import, " - "metadata has last_level = %u, the caller set %u\n", - last_level, tex->buffer.b.b.last_level); - return false; - } - } - - if (sscreen->info.chip_class >= GFX8 && - G_008F28_COMPRESSION_EN(desc[6])) { - /* Read DCC information. - * - * Some state trackers don't set the SCANOUT flag when - * importing displayable images, which affects PIPE_ALIGNED - * and RB_ALIGNED, so we need to recover them here. - */ - switch (sscreen->info.chip_class) { - case GFX8: - tex->dcc_offset = (uint64_t)desc[7] << 8; - break; - - case GFX9: - tex->dcc_offset = - ((uint64_t)desc[7] << 8) | - ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_008F24_META_PIPE_ALIGNED(desc[5]); - tex->surface.u.gfx9.dcc.rb_aligned = - G_008F24_META_RB_ALIGNED(desc[5]); - - /* If DCC is unaligned, this can only be a displayable image. */ - if (!tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - tex->surface.is_displayable = true; - break; - - case GFX10: - tex->dcc_offset = - ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | - ((uint64_t)desc[7] << 16); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_00A018_META_PIPE_ALIGNED(desc[6]); - break; - - default: - assert(0); - return false; - } - } else { - /* Disable DCC. dcc_offset is always set by texture_from_handle - * and must be cleared here. - */ - tex->dcc_offset = 0; - } + struct si_texture *tex, + uint64_t offset, + struct radeon_bo_metadata *md) +{ + uint32_t *desc = &md->metadata[2]; + + if (offset || /* Non-zero planes ignore metadata. */ + md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ + md->metadata[0] == 0 || /* invalid version number */ + md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { + /* Disable DCC because it might not be enabled. */ + si_texture_zero_dcc_fields(tex); + + /* Don't report an error if the texture comes from an incompatible driver, + * but this might not work. + */ + return true; + } + + /* Validate that sample counts and the number of mipmap levels match. */ + unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); + unsigned type = G_008F1C_TYPE(desc[3]); + + if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || + type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + unsigned log_samples = + util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); + + if (last_level != log_samples) { + fprintf(stderr, "radeonsi: invalid MSAA texture import, " + "metadata has log2(samples) = %u, the caller set %u\n", + last_level, log_samples); + return false; + } + } else { + if (last_level != tex->buffer.b.b.last_level) { + fprintf(stderr, "radeonsi: invalid mipmapped texture import, " + "metadata has last_level = %u, the caller set %u\n", + last_level, tex->buffer.b.b.last_level); + return false; + } + } + + if (sscreen->info.chip_class >= GFX8 && + G_008F28_COMPRESSION_EN(desc[6])) { + /* Read DCC information. */ + switch (sscreen->info.chip_class) { + case GFX8: + tex->surface.dcc_offset = (uint64_t)desc[7] << 8; + break; + + case GFX9: + tex->surface.dcc_offset = + ((uint64_t)desc[7] << 8) | + ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); + tex->surface.u.gfx9.dcc.pipe_aligned = + G_008F24_META_PIPE_ALIGNED(desc[5]); + tex->surface.u.gfx9.dcc.rb_aligned = + G_008F24_META_RB_ALIGNED(desc[5]); + + /* If DCC is unaligned, this can only be a displayable image. */ + if (!tex->surface.u.gfx9.dcc.pipe_aligned && + !tex->surface.u.gfx9.dcc.rb_aligned) + assert(tex->surface.is_displayable); + break; + + case GFX10: + tex->surface.dcc_offset = + ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | + ((uint64_t)desc[7] << 16); + tex->surface.u.gfx9.dcc.pipe_aligned = + G_00A018_META_PIPE_ALIGNED(desc[6]); + break; + + default: + assert(0); + return false; + } + } else { + /* Disable DCC. dcc_offset is always set by texture_from_handle + * and must be cleared here. + */ + si_texture_zero_dcc_fields(tex); + } - return true; + return true; } static bool si_has_displayable_dcc(struct si_texture *tex) { - struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen; - - if (sscreen->info.chip_class <= GFX8) - return false; + struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen; - /* This needs a cache flush before scanout. - * (it can't be scanned out and rendered to simultaneously) - */ - if (sscreen->info.use_display_dcc_unaligned && - tex->dcc_offset && - !tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - return true; - - /* This needs an explicit flush (flush_resource). */ - if (sscreen->info.use_display_dcc_with_retile_blit && - tex->display_dcc_offset) - return true; + if (sscreen->info.chip_class <= GFX8) + return false; - return false; + /* This needs a cache flush before scanout. + * (it can't be scanned out and rendered to simultaneously) + */ + if (sscreen->info.use_display_dcc_unaligned && + tex->surface.dcc_offset && + !tex->surface.u.gfx9.dcc.pipe_aligned && + !tex->surface.u.gfx9.dcc.rb_aligned) + return true; + + /* This needs an explicit flush (flush_resource). */ + if (sscreen->info.use_display_dcc_with_retile_blit && + tex->surface.display_dcc_offset) + return true; + + return false; +} + +static bool si_resource_get_param(struct pipe_screen *screen, + struct pipe_context *context, + struct pipe_resource *resource, + unsigned plane, + unsigned layer, + enum pipe_resource_param param, + unsigned handle_usage, + uint64_t *value) +{ + for (unsigned i = 0; i < plane; i++) + resource = resource->next; + + struct si_screen *sscreen = (struct si_screen*)screen; + struct si_texture *tex = (struct si_texture*)resource; + struct winsys_handle whandle; + + switch (param) { + case PIPE_RESOURCE_PARAM_NPLANES: + *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes; + return true; + + case PIPE_RESOURCE_PARAM_STRIDE: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + else + *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe; + return true; + + case PIPE_RESOURCE_PARAM_OFFSET: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_offset + + layer * tex->surface.u.gfx9.surf_slice_size; + else + *value = tex->surface.u.legacy.level[0].offset + + layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + return true; + + case PIPE_RESOURCE_PARAM_MODIFIER: + *value = DRM_FORMAT_MOD_INVALID; + return true; + + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: + memset(&whandle, 0, sizeof(whandle)); + + if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED) + whandle.type = WINSYS_HANDLE_TYPE_SHARED; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS) + whandle.type = WINSYS_HANDLE_TYPE_KMS; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD) + whandle.type = WINSYS_HANDLE_TYPE_FD; + + if (!screen->resource_get_handle(screen, context, resource, + &whandle, handle_usage)) + return false; + + *value = whandle.handle; + return true; + } + return false; } static void si_texture_get_info(struct pipe_screen* screen, - struct pipe_resource *resource, - unsigned *pstride, - unsigned *poffset) -{ - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)resource; - unsigned stride = 0; - unsigned offset = 0; - - if (!sscreen || !tex) - return; - - if (resource->target != PIPE_BUFFER) { - if (sscreen->info.chip_class >= GFX9) { - offset = tex->surface.u.gfx9.surf_offset; - stride = tex->surface.u.gfx9.surf_pitch * - tex->surface.bpe; - } else { - offset = tex->surface.u.legacy.level[0].offset; - stride = tex->surface.u.legacy.level[0].nblk_x * - tex->surface.bpe; - } - } - - if (pstride) - *pstride = stride; - - if (poffset) - *poffset = offset; + struct pipe_resource *resource, + unsigned *pstride, + unsigned *poffset) +{ + uint64_t value; + + if (pstride) { + si_resource_get_param(screen, NULL, resource, 0, 0, + PIPE_RESOURCE_PARAM_STRIDE, 0, &value); + *pstride = value; + } + + if (poffset) { + si_resource_get_param(screen, NULL, resource, 0, 0, + PIPE_RESOURCE_PARAM_OFFSET, 0, &value); + *poffset = value; + } } static bool si_texture_get_handle(struct pipe_screen* screen, - struct pipe_context *ctx, - struct pipe_resource *resource, - struct winsys_handle *whandle, - unsigned usage) -{ - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_context *sctx; - struct si_resource *res = si_resource(resource); - struct si_texture *tex = (struct si_texture*)resource; - bool update_metadata = false; - unsigned stride, offset, slice_size; - bool flush = false; - - ctx = threaded_context_unwrap_sync(ctx); - sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context); - - if (resource->target != PIPE_BUFFER) { - /* This is not supported now, but it might be required for OpenCL - * interop in the future. - */ - if (resource->nr_samples > 1 || tex->is_depth) - return false; - - /* Move a suballocated texture into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - tex->surface.tile_swizzle || - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_SHARED, false); - flush = true; - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); - assert(tex->surface.tile_swizzle == 0); - } - - /* Since shader image stores don't support DCC on GFX8, - * disable it for external clients that want write - * access. - */ - if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->dcc_offset) || - /* Displayable DCC requires an explicit flush. */ - (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex))) { - if (si_texture_disable_dcc(sctx, tex)) { - update_metadata = true; - /* si_texture_disable_dcc flushes the context */ - flush = false; - } - } - - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - (tex->cmask_buffer || tex->dcc_offset)) { - /* Eliminate fast clear (both CMASK and DCC) */ - si_eliminate_fast_color_clear(sctx, tex); - /* eliminate_fast_color_clear flushes the context */ - flush = false; - - /* Disable CMASK if flush_resource isn't going - * to be called. - */ - if (tex->cmask_buffer) - si_texture_discard_cmask(sscreen, tex); - } - - /* Set metadata. */ - if (!res->b.is_shared || update_metadata) - si_set_tex_bo_metadata(sscreen, tex); - - if (sscreen->info.chip_class >= GFX9) { - slice_size = tex->surface.u.gfx9.surf_slice_size; - } else { - slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; - } - } else { - /* Buffer exports are for the OpenCL interop. */ - /* Move a suballocated buffer into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - /* A DMABUF export always fails if the BO is local. */ - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - - /* Allocate a new buffer with PIPE_BIND_SHARED. */ - struct pipe_resource templ = res->b.b; - templ.bind |= PIPE_BIND_SHARED; - - struct pipe_resource *newb = - screen->resource_create(screen, &templ); - if (!newb) - return false; - - /* Copy the old buffer contents to the new one. */ - struct pipe_box box; - u_box_1d(0, newb->width0, &box); - sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, - &res->b.b, 0, &box); - flush = true; - /* Move the new buffer storage to the old pipe_resource. */ - si_replace_buffer_storage(&sctx->b, &res->b.b, newb); - pipe_resource_reference(&newb, NULL); - - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - } - - /* Buffers */ - slice_size = 0; - } - - si_texture_get_info(screen, resource, &stride, &offset); - - if (flush) - sctx->b.flush(&sctx->b, NULL, 0); - - if (res->b.is_shared) { - /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user - * doesn't set it. - */ - res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) - res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - } else { - res->b.is_shared = true; - res->external_usage = usage; - } + struct pipe_context *ctx, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ + struct si_screen *sscreen = (struct si_screen*)screen; + struct si_context *sctx; + struct si_resource *res = si_resource(resource); + struct si_texture *tex = (struct si_texture*)resource; + bool update_metadata = false; + unsigned stride, offset, slice_size; + bool flush = false; + + ctx = threaded_context_unwrap_sync(ctx); + sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context); + + if (resource->target != PIPE_BUFFER) { + /* Individual planes are chained pipe_resource instances. */ + for (unsigned i = 0; i < whandle->plane; i++) { + resource = resource->next; + res = si_resource(resource); + tex = (struct si_texture*)resource; + } + + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->nr_samples > 1 || tex->is_depth) + return false; + + /* Move a suballocated texture into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || + tex->surface.tile_swizzle || + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + si_reallocate_texture_inplace(sctx, tex, + PIPE_BIND_SHARED, false); + flush = true; + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); + assert(tex->surface.tile_swizzle == 0); + } + + /* Since shader image stores don't support DCC on GFX8, + * disable it for external clients that want write + * access. + */ + if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) || + /* Displayable DCC requires an explicit flush. */ + (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + si_has_displayable_dcc(tex))) { + if (si_texture_disable_dcc(sctx, tex)) { + update_metadata = true; + /* si_texture_disable_dcc flushes the context */ + flush = false; + } + } + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + (tex->cmask_buffer || tex->surface.dcc_offset)) { + /* Eliminate fast clear (both CMASK and DCC) */ + bool flushed; + si_eliminate_fast_color_clear(sctx, tex, &flushed); + /* eliminate_fast_color_clear sometimes flushes the context */ + if (flushed) + flush = false; + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (tex->cmask_buffer) + si_texture_discard_cmask(sscreen, tex); + } + + /* Set metadata. */ + if ((!res->b.is_shared || update_metadata) && whandle->offset == 0) + si_set_tex_bo_metadata(sscreen, tex); + + if (sscreen->info.chip_class >= GFX9) { + slice_size = tex->surface.u.gfx9.surf_slice_size; + } else { + slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + } + } else { + /* Buffer exports are for the OpenCL interop. */ + /* Move a suballocated buffer into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || + /* A DMABUF export always fails if the BO is local. */ + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + + /* Allocate a new buffer with PIPE_BIND_SHARED. */ + struct pipe_resource templ = res->b.b; + templ.bind |= PIPE_BIND_SHARED; + + struct pipe_resource *newb = + screen->resource_create(screen, &templ); + if (!newb) + return false; + + /* Copy the old buffer contents to the new one. */ + struct pipe_box box; + u_box_1d(0, newb->width0, &box); + sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, + &res->b.b, 0, &box); + flush = true; + /* Move the new buffer storage to the old pipe_resource. */ + si_replace_buffer_storage(&sctx->b, &res->b.b, newb); + pipe_resource_reference(&newb, NULL); + + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + } + + /* Buffers */ + slice_size = 0; + } + + si_texture_get_info(screen, resource, &stride, &offset); + + if (flush) + sctx->b.flush(&sctx->b, NULL, 0); + + if (res->b.is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. + */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->b.is_shared = true; + res->external_usage = usage; + } - return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, stride, - offset, slice_size, whandle); + whandle->stride = stride; + whandle->offset = offset + slice_size * whandle->layer; + + return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle); } static void si_texture_destroy(struct pipe_screen *screen, - struct pipe_resource *ptex) + struct pipe_resource *ptex) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)ptex; - struct si_resource *resource = &tex->buffer; - - if (sscreen->info.chip_class >= GFX9) - free(tex->surface.u.gfx9.dcc_retile_map); - - si_texture_reference(&tex->flushed_depth_texture, NULL); - - if (tex->cmask_buffer != &tex->buffer) { - si_resource_reference(&tex->cmask_buffer, NULL); - } - pb_reference(&resource->buf, NULL); - si_resource_reference(&tex->dcc_separate_buffer, NULL); - si_resource_reference(&tex->last_dcc_separate_buffer, NULL); - FREE(tex); + struct si_screen *sscreen = (struct si_screen*)screen; + struct si_texture *tex = (struct si_texture*)ptex; + struct si_resource *resource = &tex->buffer; + + if (sscreen->info.chip_class >= GFX9) + free(tex->surface.u.gfx9.dcc_retile_map); + + si_texture_reference(&tex->flushed_depth_texture, NULL); + + if (tex->cmask_buffer != &tex->buffer) { + si_resource_reference(&tex->cmask_buffer, NULL); + } + pb_reference(&resource->buf, NULL); + si_resource_reference(&tex->dcc_separate_buffer, NULL); + si_resource_reference(&tex->last_dcc_separate_buffer, NULL); + FREE(tex); } static const struct u_resource_vtbl si_texture_vtbl; -static void si_texture_get_htile_size(struct si_screen *sscreen, - struct si_texture *tex) -{ - unsigned cl_width, cl_height, width, height; - unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; - unsigned num_pipes = sscreen->info.num_tile_pipes; - - assert(sscreen->info.chip_class <= GFX8); - - tex->surface.htile_size = 0; - - if (tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && - !sscreen->info.htile_cmask_support_1d_tiling) - return; - - /* Overalign HTILE on P2 configs to work around GPU hangs in - * piglit/depthstencil-render-miplevels 585. - * - * This has been confirmed to help Kabini & Stoney, where the hangs - * are always reproducible. I think I have seen the test hang - * on Carrizo too, though it was very rare there. - */ - if (sscreen->info.chip_class >= GFX7 && num_pipes < 4) - num_pipes = 4; - - switch (num_pipes) { - case 1: - cl_width = 32; - cl_height = 16; - break; - case 2: - cl_width = 32; - cl_height = 32; - break; - case 4: - cl_width = 64; - cl_height = 32; - break; - case 8: - cl_width = 64; - cl_height = 64; - break; - case 16: - cl_width = 128; - cl_height = 64; - break; - default: - assert(0); - return; - } - - width = align(tex->surface.u.legacy.level[0].nblk_x, cl_width * 8); - height = align(tex->surface.u.legacy.level[0].nblk_y, cl_height * 8); - - slice_elements = (width * height) / (8 * 8); - slice_bytes = slice_elements * 4; - - pipe_interleave_bytes = sscreen->info.pipe_interleave_bytes; - base_align = num_pipes * pipe_interleave_bytes; - - tex->surface.htile_alignment = base_align; - tex->surface.htile_size = - util_num_layers(&tex->buffer.b.b, 0) * - align(slice_bytes, base_align); -} - -static void si_texture_allocate_htile(struct si_screen *sscreen, - struct si_texture *tex) -{ - if (sscreen->info.chip_class <= GFX8 && !tex->tc_compatible_htile) - si_texture_get_htile_size(sscreen, tex); - - if (!tex->surface.htile_size) - return; - - tex->htile_offset = align(tex->size, tex->surface.htile_alignment); - tex->size = tex->htile_offset + tex->surface.htile_size; -} - void si_print_texture_info(struct si_screen *sscreen, - struct si_texture *tex, struct u_log_context *log) + struct si_texture *tex, struct u_log_context *log) { - int i; + int i; - /* Common parameters. */ - u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - tex->buffer.b.b.width0, tex->buffer.b.b.height0, - tex->buffer.b.b.depth0, tex->surface.blk_w, - tex->surface.blk_h, - tex->buffer.b.b.array_size, tex->buffer.b.b.last_level, - tex->surface.bpe, tex->buffer.b.b.nr_samples, - tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); - - if (sscreen->info.chip_class >= GFX9) { - u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", - tex->surface.surf_size, - tex->surface.u.gfx9.surf_slice_size, - tex->surface.surf_alignment, - tex->surface.u.gfx9.surf.swizzle_mode, - tex->surface.u.gfx9.surf.epitch, - tex->surface.u.gfx9.surf_pitch); - - if (tex->fmask_offset) { - u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u\n", - tex->fmask_offset, - tex->surface.fmask_size, - tex->surface.fmask_alignment, - tex->surface.u.gfx9.fmask.swizzle_mode, - tex->surface.u.gfx9.fmask.epitch); - } - - if (tex->cmask_buffer) { - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, " - "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", - tex->cmask_offset, - tex->surface.cmask_size, - tex->surface.cmask_alignment, - tex->surface.u.gfx9.cmask.rb_aligned, - tex->surface.u.gfx9.cmask.pipe_aligned); - } - - if (tex->htile_offset) { - u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, " - "rb_aligned=%u, pipe_aligned=%u\n", - tex->htile_offset, - tex->surface.htile_size, - tex->surface.htile_alignment, - tex->surface.u.gfx9.htile.rb_aligned, - tex->surface.u.gfx9.htile.pipe_aligned); - } - - if (tex->dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, " - "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", - tex->dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment, - tex->surface.u.gfx9.display_dcc_pitch_max, - tex->surface.num_dcc_levels); - } - - if (tex->surface.u.gfx9.stencil_offset) { - u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n", - tex->surface.u.gfx9.stencil_offset, - tex->surface.u.gfx9.stencil.swizzle_mode, - tex->surface.u.gfx9.stencil.epitch); - } - return; - } - - u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " - "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", - tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, - tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea, - tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config, - (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); - - if (tex->fmask_offset) - u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " - "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", - tex->fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, - tex->surface.u.legacy.fmask.pitch_in_pixels, - tex->surface.u.legacy.fmask.bankh, - tex->surface.u.legacy.fmask.slice_tile_max, - tex->surface.u.legacy.fmask.tiling_index); - - if (tex->cmask_buffer) - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, " - "slice_tile_max=%u\n", - tex->cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, - tex->surface.u.legacy.cmask_slice_tile_max); - - if (tex->htile_offset) - u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, " - "alignment=%u, TC_compatible = %u\n", - tex->htile_offset, tex->surface.htile_size, - tex->surface.htile_alignment, - tex->tc_compatible_htile); - - if (tex->dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n", - tex->dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, " - "fast_clear_size=%u\n", - i, i < tex->surface.num_dcc_levels, - tex->surface.u.legacy.level[i].dcc_offset, - tex->surface.u.legacy.level[i].dcc_fast_clear_size); - } - - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " - "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.level[i].offset, - (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.level[i].nblk_x, - tex->surface.u.legacy.level[i].nblk_y, - tex->surface.u.legacy.level[i].mode, - tex->surface.u.legacy.tiling_index[i]); - - if (tex->surface.has_stencil) { - u_log_printf(log, " StencilLayout: tilesplit=%u\n", - tex->surface.u.legacy.stencil_tile_split); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) { - u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", " - "slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.stencil_level[i].offset, - (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.stencil_level[i].nblk_x, - tex->surface.u.legacy.stencil_level[i].nblk_y, - tex->surface.u.legacy.stencil_level[i].mode, - tex->surface.u.legacy.stencil_tiling_index[i]); - } - } + /* Common parameters. */ + u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + tex->buffer.b.b.width0, tex->buffer.b.b.height0, + tex->buffer.b.b.depth0, tex->surface.blk_w, + tex->surface.blk_h, + tex->buffer.b.b.array_size, tex->buffer.b.b.last_level, + tex->surface.bpe, tex->buffer.b.b.nr_samples, + tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); + + if (sscreen->info.chip_class >= GFX9) { + u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", " + "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", + tex->surface.surf_size, + tex->surface.u.gfx9.surf_slice_size, + tex->surface.surf_alignment, + tex->surface.u.gfx9.surf.swizzle_mode, + tex->surface.u.gfx9.surf.epitch, + tex->surface.u.gfx9.surf_pitch); + + if (tex->surface.fmask_offset) { + u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", " + "alignment=%u, swmode=%u, epitch=%u\n", + tex->surface.fmask_offset, + tex->surface.fmask_size, + tex->surface.fmask_alignment, + tex->surface.u.gfx9.fmask.swizzle_mode, + tex->surface.u.gfx9.fmask.epitch); + } + + if (tex->cmask_buffer) { + u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, " + "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.cmask_offset, + tex->surface.cmask_size, + tex->surface.cmask_alignment, + tex->surface.u.gfx9.cmask.rb_aligned, + tex->surface.u.gfx9.cmask.pipe_aligned); + } + + if (tex->surface.htile_offset) { + u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, " + "rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.htile_offset, + tex->surface.htile_size, + tex->surface.htile_alignment, + tex->surface.u.gfx9.htile.rb_aligned, + tex->surface.u.gfx9.htile.pipe_aligned); + } + + if (tex->surface.dcc_offset) { + u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, " + "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, + tex->surface.dcc_alignment, + tex->surface.u.gfx9.display_dcc_pitch_max, + tex->surface.num_dcc_levels); + } + + if (tex->surface.u.gfx9.stencil_offset) { + u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n", + tex->surface.u.gfx9.stencil_offset, + tex->surface.u.gfx9.stencil.swizzle_mode, + tex->surface.u.gfx9.stencil.epitch); + } + return; + } + + u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, + tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea, + tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config, + (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (tex->surface.fmask_offset) + u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, + tex->surface.u.legacy.fmask.pitch_in_pixels, + tex->surface.u.legacy.fmask.bankh, + tex->surface.u.legacy.fmask.slice_tile_max, + tex->surface.u.legacy.fmask.tiling_index); + + if (tex->cmask_buffer) + u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, " + "slice_tile_max=%u\n", + tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, + tex->surface.u.legacy.cmask_slice_tile_max); + + if (tex->surface.htile_offset) + u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, " + "alignment=%u, TC_compatible = %u\n", + tex->surface.htile_offset, tex->surface.htile_size, + tex->surface.htile_alignment, + tex->tc_compatible_htile); + + if (tex->surface.dcc_offset) { + u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, + tex->surface.dcc_alignment); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, " + "fast_clear_size=%u\n", + i, i < tex->surface.num_dcc_levels, + tex->surface.u.legacy.level[i].dcc_offset, + tex->surface.u.legacy.level[i].dcc_fast_clear_size); + } + + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.level[i].offset, + (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), + u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), + tex->surface.u.legacy.level[i].nblk_x, + tex->surface.u.legacy.level[i].nblk_y, + tex->surface.u.legacy.level[i].mode, + tex->surface.u.legacy.tiling_index[i]); + + if (tex->surface.has_stencil) { + u_log_printf(log, " StencilLayout: tilesplit=%u\n", + tex->surface.u.legacy.stencil_tile_split); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) { + u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", " + "slice_size=%"PRIu64", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.stencil_level[i].offset, + (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), + u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), + tex->surface.u.legacy.stencil_level[i].nblk_x, + tex->surface.u.legacy.stencil_level[i].nblk_y, + tex->surface.u.legacy.stencil_level[i].mode, + tex->surface.u.legacy.stencil_tiling_index[i]); + } + } } -/* Common processing for si_texture_create and si_texture_from_handle */ +/** + * Common function for si_texture_create and si_texture_from_handle. + * + * \param screen screen + * \param base resource template + * \param surface radeon_surf + * \param plane0 if a non-zero plane is being created, this is the first plane + * \param imported_buf from si_texture_from_handle + * \param offset offset for non-zero planes or imported buffers + * \param alloc_size the size to allocate if plane0 != NULL + * \param alignment alignment for the allocation + */ static struct si_texture * si_texture_create_object(struct pipe_screen *screen, - const struct pipe_resource *base, - struct pb_buffer *buf, - struct radeon_surf *surface) -{ - struct si_texture *tex; - struct si_resource *resource; - struct si_screen *sscreen = (struct si_screen*)screen; - - tex = CALLOC_STRUCT(si_texture); - if (!tex) - goto error; - - resource = &tex->buffer; - resource->b.b = *base; - resource->b.b.next = NULL; - resource->b.vtbl = &si_texture_vtbl; - pipe_reference_init(&resource->b.b.reference, 1); - resource->b.b.screen = screen; - - /* don't include stencil-only formats which we don't support for rendering */ - tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); - - tex->surface = *surface; - tex->size = tex->surface.surf_size; - - tex->tc_compatible_htile = tex->surface.htile_size != 0 && - (tex->surface.flags & - RADEON_SURF_TC_COMPATIBLE_HTILE); - - /* TC-compatible HTILE: - * - GFX8 only supports Z32_FLOAT. - * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ - if (tex->tc_compatible_htile) { - if (sscreen->info.chip_class >= GFX9 && - base->format == PIPE_FORMAT_Z16_UNORM) - tex->db_render_format = base->format; - else { - tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; - tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && - base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; - } - } else { - tex->db_render_format = base->format; - } - - /* Applies to GCN. */ - tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; - - /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers - * between frames, so the only thing that can enable separate DCC - * with DRI2 is multiple slow clears within a frame. - */ - tex->ps_draw_ratio = 0; - - if (tex->is_depth) { - if (sscreen->info.chip_class >= GFX9) { - tex->can_sample_z = true; - tex->can_sample_s = true; - - /* Stencil texturing with HTILE doesn't work - * with mipmapping on Navi10-14. */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - base->last_level > 0) - tex->htile_stencil_disabled = true; - } else { - tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; - tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; - } - - if (!(base->flags & (SI_RESOURCE_FLAG_TRANSFER | - SI_RESOURCE_FLAG_FLUSHED_DEPTH))) { - tex->db_compatible = true; - - if (!(sscreen->debug_flags & DBG(NO_HYPERZ))) - si_texture_allocate_htile(sscreen, tex); - } - } else { - if (base->nr_samples > 1 && - !buf && - !(sscreen->debug_flags & DBG(NO_FMASK))) { - /* Allocate FMASK. */ - tex->fmask_offset = align64(tex->size, - tex->surface.fmask_alignment); - tex->size = tex->fmask_offset + tex->surface.fmask_size; - - /* Allocate CMASK. */ - tex->cmask_offset = align64(tex->size, tex->surface.cmask_alignment); - tex->size = tex->cmask_offset + tex->surface.cmask_size; - tex->cb_color_info |= S_028C70_FAST_CLEAR(1); - tex->cmask_buffer = &tex->buffer; - - if (!tex->surface.fmask_size || !tex->surface.cmask_size) - goto error; - } - - /* Shared textures must always set up DCC here. - * If it's not present, it will be disabled by - * si_get_opaque_metadata later. - */ - if (tex->surface.dcc_size && - (buf || !(sscreen->debug_flags & DBG(NO_DCC))) && - (sscreen->info.use_display_dcc_unaligned || - sscreen->info.use_display_dcc_with_retile_blit || - !(tex->surface.flags & RADEON_SURF_SCANOUT))) { - /* Add space for the DCC buffer. */ - tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment); - tex->size = tex->dcc_offset + tex->surface.dcc_size; - - if (sscreen->info.chip_class >= GFX9 && - tex->surface.u.gfx9.dcc_retile_num_elements) { - /* Add space for the displayable DCC buffer. */ - tex->display_dcc_offset = - align64(tex->size, tex->surface.u.gfx9.display_dcc_alignment); - tex->size = tex->display_dcc_offset + - tex->surface.u.gfx9.display_dcc_size; - - /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */ - tex->dcc_retile_map_offset = - align64(tex->size, sscreen->info.tcc_cache_line_size); - - if (tex->surface.u.gfx9.dcc_retile_use_uint16) { - tex->size = tex->dcc_retile_map_offset + - tex->surface.u.gfx9.dcc_retile_num_elements * 2; - } else { - tex->size = tex->dcc_retile_map_offset + - tex->surface.u.gfx9.dcc_retile_num_elements * 4; - } - } - } - } - - /* Now create the backing buffer. */ - if (!buf) { - si_init_resource_fields(sscreen, resource, tex->size, - tex->surface.surf_alignment); - - if (!si_alloc_resource(sscreen, resource)) - goto error; - } else { - resource->buf = buf; - resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); - resource->bo_size = buf->size; - resource->bo_alignment = buf->alignment; - resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); - if (resource->domains & RADEON_DOMAIN_VRAM) - resource->vram_usage = buf->size; - else if (resource->domains & RADEON_DOMAIN_GTT) - resource->gart_usage = buf->size; - } - - if (tex->cmask_buffer) { - /* Initialize the cmask to 0xCC (= compressed state). */ - si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, - tex->cmask_offset, tex->surface.cmask_size, - 0xCCCCCCCC); - } - if (tex->htile_offset) { - uint32_t clear_value = 0; - - if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) - clear_value = 0x0000030F; - - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->htile_offset, - tex->surface.htile_size, - clear_value); - } - - /* Initialize DCC only if the texture is not being imported. */ - if (!buf && tex->dcc_offset) { - /* Clear DCC to black for all tiles with DCC enabled. - * - * This fixes corruption in 3DMark Slingshot Extreme, which - * uses uninitialized textures, causing corruption. - */ - if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && - tex->buffer.b.b.nr_samples <= 2) { - /* Simple case - all tiles have DCC enabled. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->dcc_offset, - tex->surface.dcc_size, - DCC_CLEAR_COLOR_0000); - } else if (sscreen->info.chip_class >= GFX9) { - /* Clear to uncompressed. Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* GFX8: Initialize mipmap levels and multisamples separately. */ - if (tex->buffer.b.b.nr_samples >= 2) { - /* Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* Clear the enabled mipmap levels to black. */ - unsigned size = 0; - - for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { - if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) - break; - - size = tex->surface.u.legacy.level[i].dcc_offset + - tex->surface.u.legacy.level[i].dcc_fast_clear_size; - } - - /* Mipmap levels with DCC. */ - if (size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->dcc_offset, size, - DCC_CLEAR_COLOR_0000); - } - /* Mipmap levels without DCC. */ - if (size != tex->surface.dcc_size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->dcc_offset + size, - tex->surface.dcc_size - size, - DCC_UNCOMPRESSED); - } - } - } - - /* Upload the DCC retile map. */ - if (tex->dcc_retile_map_offset) { - /* Use a staging buffer for the upload, because - * the buffer backing the texture is unmappable. - */ - bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; - unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; - struct si_resource *buf = - si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, - num_elements * (use_uint16 ? 2 : 4), - sscreen->info.tcc_cache_line_size); - uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL, - PIPE_TRANSFER_WRITE); - uint16_t *us = (uint16_t*)ui; - - /* Upload the retile map into a staging buffer. */ - if (use_uint16) { - for (unsigned i = 0; i < num_elements; i++) - us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } else { - for (unsigned i = 0; i < num_elements; i++) - ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } - - /* Copy the staging buffer to the buffer backing the texture. */ - struct si_context *sctx = (struct si_context*)sscreen->aux_context; - struct pipe_box box; - u_box_1d(0, buf->b.b.width0, &box); - - assert(tex->dcc_retile_map_offset <= UINT_MAX); - mtx_lock(&sscreen->aux_context_lock); - sctx->dma_copy(&sctx->b, &tex->buffer.b.b, 0, - tex->dcc_retile_map_offset, 0, 0, - &buf->b.b, 0, &box); - sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - mtx_unlock(&sscreen->aux_context_lock); - - si_resource_reference(&buf, NULL); - } - } - - /* Initialize the CMASK base register value. */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->cmask_offset) >> 8; - - if (sscreen->debug_flags & DBG(VM)) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n", - tex->buffer.gpu_address, - tex->buffer.gpu_address + tex->buffer.buf->size, - base->width0, base->height0, util_num_layers(base, 0), base->last_level+1, - base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); - } - - if (sscreen->debug_flags & DBG(TEX)) { - puts("Texture:"); - struct u_log_context log; - u_log_context_init(&log); - si_print_texture_info(sscreen, tex, &log); - u_log_new_page_print(&log, stdout); - fflush(stdout); - u_log_context_destroy(&log); - } + const struct pipe_resource *base, + const struct radeon_surf *surface, + const struct si_texture *plane0, + struct pb_buffer *imported_buf, + uint64_t offset, + uint64_t alloc_size, + unsigned alignment) +{ + struct si_texture *tex; + struct si_resource *resource; + struct si_screen *sscreen = (struct si_screen*)screen; + + tex = CALLOC_STRUCT(si_texture); + if (!tex) + goto error; + + resource = &tex->buffer; + resource->b.b = *base; + resource->b.b.next = NULL; + resource->b.vtbl = &si_texture_vtbl; + pipe_reference_init(&resource->b.b.reference, 1); + resource->b.b.screen = screen; + + /* don't include stencil-only formats which we don't support for rendering */ + tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); + tex->surface = *surface; + tex->tc_compatible_htile = tex->surface.htile_size != 0 && + (tex->surface.flags & + RADEON_SURF_TC_COMPATIBLE_HTILE); + + /* TC-compatible HTILE: + * - GFX8 only supports Z32_FLOAT. + * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ + if (tex->tc_compatible_htile) { + if (sscreen->info.chip_class >= GFX9 && + base->format == PIPE_FORMAT_Z16_UNORM) + tex->db_render_format = base->format; + else { + tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && + base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; + } + } else { + tex->db_render_format = base->format; + } + + /* Applies to GCN. */ + tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; + + /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers + * between frames, so the only thing that can enable separate DCC + * with DRI2 is multiple slow clears within a frame. + */ + tex->ps_draw_ratio = 0; + + if (sscreen->info.chip_class >= GFX9) { + tex->surface.u.gfx9.surf_offset = offset; + } else { + for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) + tex->surface.u.legacy.level[i].offset += offset; + } + + if (tex->is_depth) { + if (sscreen->info.chip_class >= GFX9) { + tex->can_sample_z = true; + tex->can_sample_s = true; + + /* Stencil texturing with HTILE doesn't work + * with mipmapping on Navi10-14. */ + if ((sscreen->info.family == CHIP_NAVI10 || + sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + base->last_level > 0) + tex->htile_stencil_disabled = true; + } else { + tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; + tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; + } + + tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER; + } else { + if (tex->surface.cmask_offset) { + tex->cb_color_info |= S_028C70_FAST_CLEAR(1); + tex->cmask_buffer = &tex->buffer; + } + } + + if (plane0) { + /* The buffer is shared with the first plane. */ + resource->bo_size = plane0->buffer.bo_size; + resource->bo_alignment = plane0->buffer.bo_alignment; + resource->flags = plane0->buffer.flags; + resource->domains = plane0->buffer.domains; + resource->vram_usage = plane0->buffer.vram_usage; + resource->gart_usage = plane0->buffer.gart_usage; + + pb_reference(&resource->buf, plane0->buffer.buf); + resource->gpu_address = plane0->buffer.gpu_address; + } else if (!(surface->flags & RADEON_SURF_IMPORTED)) { + /* Create the backing buffer. */ + si_init_resource_fields(sscreen, resource, alloc_size, alignment); + + if (!si_alloc_resource(sscreen, resource)) + goto error; + } else { + resource->buf = imported_buf; + resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = imported_buf->size; + resource->bo_alignment = imported_buf->alignment; + resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = resource->bo_size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = resource->bo_size; + if (sscreen->ws->buffer_get_flags) + resource->flags = sscreen->ws->buffer_get_flags(resource->buf); + } + + if (tex->cmask_buffer) { + /* Initialize the cmask to 0xCC (= compressed state). */ + si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, + tex->surface.cmask_offset, tex->surface.cmask_size, + 0xCCCCCCCC); + } + if (tex->surface.htile_offset) { + uint32_t clear_value = 0; + + if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) + clear_value = 0x0000030F; + + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.htile_offset, + tex->surface.htile_size, + clear_value); + } + + /* Initialize DCC only if the texture is not being imported. */ + if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) { + /* Clear DCC to black for all tiles with DCC enabled. + * + * This fixes corruption in 3DMark Slingshot Extreme, which + * uses uninitialized textures, causing corruption. + */ + if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && + tex->buffer.b.b.nr_samples <= 2) { + /* Simple case - all tiles have DCC enabled. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.dcc_offset, + tex->surface.dcc_size, + DCC_CLEAR_COLOR_0000); + } else if (sscreen->info.chip_class >= GFX9) { + /* Clear to uncompressed. Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.dcc_offset, + tex->surface.dcc_size, + DCC_UNCOMPRESSED); + } else { + /* GFX8: Initialize mipmap levels and multisamples separately. */ + if (tex->buffer.b.b.nr_samples >= 2) { + /* Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.dcc_offset, + tex->surface.dcc_size, + DCC_UNCOMPRESSED); + } else { + /* Clear the enabled mipmap levels to black. */ + unsigned size = 0; + + for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { + if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) + break; + + size = tex->surface.u.legacy.level[i].dcc_offset + + tex->surface.u.legacy.level[i].dcc_fast_clear_size; + } + + /* Mipmap levels with DCC. */ + if (size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.dcc_offset, size, + DCC_CLEAR_COLOR_0000); + } + /* Mipmap levels without DCC. */ + if (size != tex->surface.dcc_size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.dcc_offset + size, + tex->surface.dcc_size - size, + DCC_UNCOMPRESSED); + } + } + } + + /* Initialize displayable DCC that requires the retile blit. */ + if (tex->surface.dcc_retile_map_offset) { + /* Uninitialized DCC can hang the display hw. + * Clear to white to indicate that. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, + tex->surface.display_dcc_offset, + tex->surface.u.gfx9.display_dcc_size, + DCC_CLEAR_COLOR_1111); + + /* Upload the DCC retile map. + * Use a staging buffer for the upload, because + * the buffer backing the texture is unmappable. + */ + bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; + unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; + struct si_resource *buf = + si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, + num_elements * (use_uint16 ? 2 : 4), + sscreen->info.tcc_cache_line_size); + uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL, + PIPE_TRANSFER_WRITE); + uint16_t *us = (uint16_t*)ui; + + /* Upload the retile map into a staging buffer. */ + if (use_uint16) { + for (unsigned i = 0; i < num_elements; i++) + us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } else { + for (unsigned i = 0; i < num_elements; i++) + ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } + + /* Copy the staging buffer to the buffer backing the texture. */ + struct si_context *sctx = (struct si_context*)sscreen->aux_context; + + assert(tex->surface.dcc_retile_map_offset <= UINT_MAX); + simple_mtx_lock(&sscreen->aux_context_lock); + si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, + tex->surface.dcc_retile_map_offset, + 0, buf->b.b.width0); + sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); + simple_mtx_unlock(&sscreen->aux_context_lock); + + si_resource_reference(&buf, NULL); + } + } + + /* Initialize the CMASK base register value. */ + tex->cmask_base_address_reg = + (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; + + if (sscreen->debug_flags & DBG(VM)) { + fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n", + tex->buffer.gpu_address, + tex->buffer.gpu_address + tex->buffer.buf->size, + base->width0, base->height0, util_num_layers(base, 0), base->last_level+1, + base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); + } + + if (sscreen->debug_flags & DBG(TEX)) { + puts("Texture:"); + struct u_log_context log; + u_log_context_init(&log); + si_print_texture_info(sscreen, tex, &log); + u_log_new_page_print(&log, stdout); + fflush(stdout); + u_log_context_destroy(&log); + } - return tex; + return tex; error: - FREE(tex); - if (sscreen->info.chip_class >= GFX9) - free(surface->u.gfx9.dcc_retile_map); - return NULL; + FREE(tex); + if (sscreen->info.chip_class >= GFX9) + free(surface->u.gfx9.dcc_retile_map); + return NULL; } static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile) + const struct pipe_resource *templ, bool tc_compatible_htile) { - const struct util_format_description *desc = util_format_description(templ->format); - bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; - bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && - !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); - - /* MSAA resources must be 2D tiled. */ - if (templ->nr_samples > 1) - return RADEON_SURF_MODE_2D; - - /* Transfer resources should be linear. */ - if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, - * which requires 2D tiling. - */ - if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) - return RADEON_SURF_MODE_2D; - - /* Handle common candidates for the linear mode. - * Compressed textures and DB surfaces must always be tiled. - */ - if (!force_tiling && - !is_depth_stencil && - !util_format_is_compressed(templ->format)) { - if (sscreen->debug_flags & DBG(NO_TILING)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Cursors are linear on AMD GCN. - * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ - if (templ->bind & PIPE_BIND_CURSOR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - if (templ->bind & PIPE_BIND_LINEAR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures with a very small height are recommended to be linear. */ - if (templ->target == PIPE_TEXTURE_1D || - templ->target == PIPE_TEXTURE_1D_ARRAY || - /* Only very thin and long 2D textures should benefit from - * linear_aligned. */ - (templ->width0 > 8 && templ->height0 <= 2)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures likely to be mapped often. */ - if (templ->usage == PIPE_USAGE_STAGING || - templ->usage == PIPE_USAGE_STREAM) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - } - - /* Make small textures 1D tiled. */ - if (templ->width0 <= 16 || templ->height0 <= 16 || - (sscreen->debug_flags & DBG(NO_2D_TILING))) - return RADEON_SURF_MODE_1D; + const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; + bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && + !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); + + /* MSAA resources must be 2D tiled. */ + if (templ->nr_samples > 1) + return RADEON_SURF_MODE_2D; + + /* Transfer resources should be linear. */ + if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, + * which requires 2D tiling. + */ + if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) + return RADEON_SURF_MODE_2D; + + /* Handle common candidates for the linear mode. + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && + !is_depth_stencil && + !util_format_is_compressed(templ->format)) { + if (sscreen->debug_flags & DBG(NO_TILING)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Cursors are linear on AMD GCN. + * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ + if (templ->bind & PIPE_BIND_CURSOR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + if (templ->bind & PIPE_BIND_LINEAR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures with a very small height are recommended to be linear. */ + if (templ->target == PIPE_TEXTURE_1D || + templ->target == PIPE_TEXTURE_1D_ARRAY || + /* Only very thin and long 2D textures should benefit from + * linear_aligned. */ + (templ->width0 > 8 && templ->height0 <= 2)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures likely to be mapped often. */ + if (templ->usage == PIPE_USAGE_STAGING || + templ->usage == PIPE_USAGE_STREAM) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + } + + /* Make small textures 1D tiled. */ + if (templ->width0 <= 16 || templ->height0 <= 16 || + (sscreen->debug_flags & DBG(NO_2D_TILING))) + return RADEON_SURF_MODE_1D; - /* The allocator will switch to 1D if needed. */ - return RADEON_SURF_MODE_2D; + /* The allocator will switch to 1D if needed. */ + return RADEON_SURF_MODE_2D; } struct pipe_resource *si_texture_create(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ) { - struct si_screen *sscreen = (struct si_screen*)screen; - bool is_zs = util_format_is_depth_or_stencil(templ->format); + struct si_screen *sscreen = (struct si_screen*)screen; + bool is_zs = util_format_is_depth_or_stencil(templ->format); - if (templ->nr_samples >= 2) { - /* This is hackish (overwriting the const pipe_resource template), - * but should be harmless and state trackers can also see - * the overriden number of samples in the created pipe_resource. - */ - if (is_zs && sscreen->eqaa_force_z_samples) { - ((struct pipe_resource*)templ)->nr_samples = - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_z_samples; - } else if (!is_zs && sscreen->eqaa_force_color_samples) { - ((struct pipe_resource*)templ)->nr_samples = - sscreen->eqaa_force_coverage_samples; - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_color_samples; - } - } - - struct radeon_surf surface = {0}; - bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH; - bool tc_compatible_htile = - sscreen->info.chip_class >= GFX8 && - /* There are issues with TC-compatible HTILE on Tonga (and - * Iceland is the same design), and documented bug workarounds - * don't help. For example, this fails: - * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto - */ - sscreen->info.family != CHIP_TONGA && - sscreen->info.family != CHIP_ICELAND && - (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && - !(sscreen->debug_flags & DBG(NO_HYPERZ)) && - !is_flushed_depth && - templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ - is_zs; - int r; - - r = si_init_surface(sscreen, &surface, templ, - si_choose_tiling(sscreen, templ, tc_compatible_htile), - 0, 0, false, false, is_flushed_depth, - tc_compatible_htile); - if (r) { - return NULL; - } + if (templ->nr_samples >= 2) { + /* This is hackish (overwriting the const pipe_resource template), + * but should be harmless and state trackers can also see + * the overriden number of samples in the created pipe_resource. + */ + if (is_zs && sscreen->eqaa_force_z_samples) { + ((struct pipe_resource*)templ)->nr_samples = + ((struct pipe_resource*)templ)->nr_storage_samples = + sscreen->eqaa_force_z_samples; + } else if (!is_zs && sscreen->eqaa_force_color_samples) { + ((struct pipe_resource*)templ)->nr_samples = + sscreen->eqaa_force_coverage_samples; + ((struct pipe_resource*)templ)->nr_storage_samples = + sscreen->eqaa_force_color_samples; + } + } + + bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || + templ->flags & SI_RESOURCE_FLAG_TRANSFER; + bool tc_compatible_htile = + sscreen->info.chip_class >= GFX8 && + /* There are issues with TC-compatible HTILE on Tonga (and + * Iceland is the same design), and documented bug workarounds + * don't help. For example, this fails: + * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto + */ + sscreen->info.family != CHIP_TONGA && + sscreen->info.family != CHIP_ICELAND && + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && + !(sscreen->debug_flags & DBG(NO_HYPERZ)) && + !is_flushed_depth && + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ + is_zs; + enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, + tc_compatible_htile); + + /* This allocates textures with multiple planes like NV12 in 1 buffer. */ + enum { SI_TEXTURE_MAX_PLANES = 3 }; + struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {}; + struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES]; + uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {}; + uint64_t total_size = 0; + unsigned max_alignment = 0; + unsigned num_planes = util_format_get_num_planes(templ->format); + assert(num_planes <= SI_TEXTURE_MAX_PLANES); + + /* Compute texture or plane layouts and offsets. */ + for (unsigned i = 0; i < num_planes; i++) { + plane_templ[i] = *templ; + plane_templ[i].format = util_format_get_plane_format(templ->format, i); + plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0); + plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0); + + /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't + * reallocate the storage to add PIPE_BIND_SHARED, because it's + * shared by 3 pipe_resources. + */ + if (num_planes > 1) + plane_templ[i].bind |= PIPE_BIND_SHARED; + + if (si_init_surface(sscreen, &surface[i], &plane_templ[i], + tile_mode, 0, false, + plane_templ[i].bind & PIPE_BIND_SCANOUT, + is_flushed_depth, tc_compatible_htile)) + return NULL; + + plane_offset[i] = align64(total_size, surface[i].surf_alignment); + total_size = plane_offset[i] + surface[i].total_size; + max_alignment = MAX2(max_alignment, surface[i].surf_alignment); + } + + struct si_texture *plane0 = NULL, *last_plane = NULL; + + for (unsigned i = 0; i < num_planes; i++) { + struct si_texture *tex = + si_texture_create_object(screen, &plane_templ[i], &surface[i], + plane0, NULL, plane_offset[i], + total_size, max_alignment); + if (!tex) { + si_texture_reference(&plane0, NULL); + return NULL; + } + + tex->plane_index = i; + tex->num_planes = num_planes; + + if (!plane0) { + plane0 = last_plane = tex; + } else { + last_plane->buffer.b.b.next = &tex->buffer.b.b; + last_plane = tex; + } + } - return (struct pipe_resource *) - si_texture_create_object(screen, templ, NULL, &surface); + return (struct pipe_resource *)plane0; } static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen, - const struct pipe_resource *templ, - struct pb_buffer *buf, - unsigned stride, - unsigned offset, - unsigned usage, - bool dedicated) -{ - enum radeon_surf_mode array_mode; - struct radeon_surf surface = {}; - struct radeon_bo_metadata metadata = {}; - struct si_texture *tex; - bool is_scanout; - int r; - - if (dedicated) { - sscreen->ws->buffer_get_metadata(buf, &metadata); - si_get_display_metadata(sscreen, &surface, &metadata, - &array_mode, &is_scanout); - } else { - /** - * The bo metadata is unset for un-dedicated images. So we fall - * back to linear. See answer to question 5 of the - * VK_KHX_external_memory spec for some details. - * - * It is possible that this case isn't going to work if the - * surface pitch isn't correctly aligned by default. - * - * In order to support it correctly we require multi-image - * metadata to be syncrhonized between radv and radeonsi. The - * semantics of associating multiple image metadata to a memory - * object on the vulkan export side are not concretely defined - * either. - * - * All the use cases we are aware of at the moment for memory - * objects use dedicated allocations. So lets keep the initial - * implementation simple. - * - * A possible alternative is to attempt to reconstruct the - * tiling information when the TexParameter TEXTURE_TILING_EXT - * is set. - */ - array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - is_scanout = false; - } - - r = si_init_surface(sscreen, &surface, templ, - array_mode, stride, offset, true, is_scanout, - false, false); - if (r) - return NULL; - - tex = si_texture_create_object(&sscreen->b, templ, buf, &surface); - if (!tex) - return NULL; - - tex->buffer.b.is_shared = true; - tex->buffer.external_usage = usage; - - if (!si_read_tex_bo_metadata(sscreen, tex, &metadata)) { - si_texture_reference(&tex, NULL); - return NULL; - } - - /* Displayable DCC requires an explicit flush. */ - if (dedicated && - !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex)) { - /* TODO: do we need to decompress DCC? */ - if (si_texture_discard_dcc(sscreen, tex)) { - /* Update BO metadata after disabling DCC. */ - si_set_tex_bo_metadata(sscreen, tex); - } - } + const struct pipe_resource *templ, + struct pb_buffer *buf, + unsigned stride, + unsigned offset, + unsigned usage, + bool dedicated) +{ + enum radeon_surf_mode array_mode; + struct radeon_surf surface = {}; + struct radeon_bo_metadata metadata = {}; + struct si_texture *tex; + bool is_scanout; + int r; + + /* Ignore metadata for non-zero planes. */ + if (offset != 0) + dedicated = false; + + if (dedicated) { + sscreen->ws->buffer_get_metadata(buf, &metadata); + si_get_display_metadata(sscreen, &surface, &metadata, + &array_mode, &is_scanout); + } else { + /** + * The bo metadata is unset for un-dedicated images. So we fall + * back to linear. See answer to question 5 of the + * VK_KHX_external_memory spec for some details. + * + * It is possible that this case isn't going to work if the + * surface pitch isn't correctly aligned by default. + * + * In order to support it correctly we require multi-image + * metadata to be syncrhonized between radv and radeonsi. The + * semantics of associating multiple image metadata to a memory + * object on the vulkan export side are not concretely defined + * either. + * + * All the use cases we are aware of at the moment for memory + * objects use dedicated allocations. So lets keep the initial + * implementation simple. + * + * A possible alternative is to attempt to reconstruct the + * tiling information when the TexParameter TEXTURE_TILING_EXT + * is set. + */ + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + is_scanout = false; + } + + r = si_init_surface(sscreen, &surface, templ, + array_mode, stride, true, is_scanout, + false, false); + if (r) + return NULL; + + tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, + offset, 0, 0); + if (!tex) + return NULL; + + tex->buffer.b.is_shared = true; + tex->buffer.external_usage = usage; + tex->num_planes = 1; + + if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) { + si_texture_reference(&tex, NULL); + return NULL; + } + + /* Displayable DCC requires an explicit flush. */ + if (dedicated && offset == 0 && + !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + si_has_displayable_dcc(tex)) { + /* TODO: do we need to decompress DCC? */ + if (si_texture_discard_dcc(sscreen, tex)) { + /* Update BO metadata after disabling DCC. */ + si_set_tex_bo_metadata(sscreen, tex); + } + } - assert(tex->surface.tile_swizzle == 0); - return &tex->buffer.b.b; + assert(tex->surface.tile_swizzle == 0); + return &tex->buffer.b.b; } static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct winsys_handle *whandle, - unsigned usage) -{ - struct si_screen *sscreen = (struct si_screen*)screen; - struct pb_buffer *buf = NULL; - unsigned stride = 0, offset = 0; - - /* Support only 2D textures without mipmaps */ - if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && - templ->target != PIPE_TEXTURE_2D_ARRAY) || - templ->last_level != 0) - return NULL; - - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment, - &stride, &offset); - if (!buf) - return NULL; - - return si_texture_from_winsys_buffer(sscreen, templ, buf, stride, - offset, usage, true); + const struct pipe_resource *templ, + struct winsys_handle *whandle, + unsigned usage) +{ + struct si_screen *sscreen = (struct si_screen*)screen; + struct pb_buffer *buf = NULL; + + /* Support only 2D textures without mipmaps */ + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && + templ->target != PIPE_TEXTURE_2D_ARRAY) || + templ->last_level != 0) + return NULL; + + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, + sscreen->info.max_alignment); + if (!buf) + return NULL; + + return si_texture_from_winsys_buffer(sscreen, templ, buf, + whandle->stride, whandle->offset, + usage, true); } bool si_init_flushed_depth_texture(struct pipe_context *ctx, - struct pipe_resource *texture) + struct pipe_resource *texture) { - struct si_texture *tex = (struct si_texture*)texture; - struct pipe_resource resource; - enum pipe_format pipe_format = texture->format; - - assert(!tex->flushed_depth_texture); - - if (!tex->can_sample_z && tex->can_sample_s) { - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - /* Save memory by not allocating the S plane. */ - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Save memory bandwidth by not copying the - * stencil part during flush. - * - * This potentially increases memory bandwidth - * if an application uses both Z and S texturing - * simultaneously (a flushed Z24S8 texture - * would be stored compactly), but how often - * does that really happen? - */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - default:; - } - } else if (!tex->can_sample_s && tex->can_sample_z) { - assert(util_format_has_stencil(util_format_description(pipe_format))); - - /* DB->CB copies to an 8bpp surface don't work. */ - pipe_format = PIPE_FORMAT_X24S8_UINT; - } - - memset(&resource, 0, sizeof(resource)); - resource.target = texture->target; - resource.format = pipe_format; - resource.width0 = texture->width0; - resource.height0 = texture->height0; - resource.depth0 = texture->depth0; - resource.array_size = texture->array_size; - resource.last_level = texture->last_level; - resource.nr_samples = texture->nr_samples; - resource.usage = PIPE_USAGE_DEFAULT; - resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; - resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; - - tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); - if (!tex->flushed_depth_texture) { - PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); - return false; - } - return true; + struct si_texture *tex = (struct si_texture*)texture; + struct pipe_resource resource; + enum pipe_format pipe_format = texture->format; + + assert(!tex->flushed_depth_texture); + + if (!tex->can_sample_z && tex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!tex->can_sample_s && tex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); + + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + + memset(&resource, 0, sizeof(resource)); + resource.target = texture->target; + resource.format = pipe_format; + resource.width0 = texture->width0; + resource.height0 = texture->height0; + resource.depth0 = texture->depth0; + resource.array_size = texture->array_size; + resource.last_level = texture->last_level; + resource.nr_samples = texture->nr_samples; + resource.usage = PIPE_USAGE_DEFAULT; + resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; + resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; + + tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (!tex->flushed_depth_texture) { + PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); + return false; + } + return true; } /** @@ -1866,678 +1916,678 @@ * mipmap level. */ static void si_init_temp_resource_from_box(struct pipe_resource *res, - struct pipe_resource *orig, - const struct pipe_box *box, - unsigned level, unsigned flags) -{ - memset(res, 0, sizeof(*res)); - res->format = orig->format; - res->width0 = box->width; - res->height0 = box->height; - res->depth0 = 1; - res->array_size = 1; - res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; - res->flags = flags; - - if (flags & SI_RESOURCE_FLAG_TRANSFER && - util_format_is_compressed(orig->format)) { - /* Transfer resources are allocated with linear tiling, which is - * not supported for compressed formats. - */ - unsigned blocksize = - util_format_get_blocksize(orig->format); - - if (blocksize == 8) { - res->format = PIPE_FORMAT_R16G16B16A16_UINT; - } else { - assert(blocksize == 16); - res->format = PIPE_FORMAT_R32G32B32A32_UINT; - } - - res->width0 = util_format_get_nblocksx(orig->format, box->width); - res->height0 = util_format_get_nblocksy(orig->format, box->height); - } - - /* We must set the correct texture target and dimensions for a 3D box. */ - if (box->depth > 1 && util_max_layer(orig, level) > 0) { - res->target = PIPE_TEXTURE_2D_ARRAY; - res->array_size = box->depth; - } else { - res->target = PIPE_TEXTURE_2D; - } + struct pipe_resource *orig, + const struct pipe_box *box, + unsigned level, unsigned flags) +{ + memset(res, 0, sizeof(*res)); + res->format = orig->format; + res->width0 = box->width; + res->height0 = box->height; + res->depth0 = 1; + res->array_size = 1; + res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; + res->flags = flags; + + if (flags & SI_RESOURCE_FLAG_TRANSFER && + util_format_is_compressed(orig->format)) { + /* Transfer resources are allocated with linear tiling, which is + * not supported for compressed formats. + */ + unsigned blocksize = + util_format_get_blocksize(orig->format); + + if (blocksize == 8) { + res->format = PIPE_FORMAT_R16G16B16A16_UINT; + } else { + assert(blocksize == 16); + res->format = PIPE_FORMAT_R32G32B32A32_UINT; + } + + res->width0 = util_format_get_nblocksx(orig->format, box->width); + res->height0 = util_format_get_nblocksy(orig->format, box->height); + } + + /* We must set the correct texture target and dimensions for a 3D box. */ + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; + res->array_size = box->depth; + } else { + res->target = PIPE_TEXTURE_2D; + } } static bool si_can_invalidate_texture(struct si_screen *sscreen, - struct si_texture *tex, - unsigned transfer_usage, - const struct pipe_box *box) -{ - return !tex->buffer.b.is_shared && - !(tex->surface.flags & RADEON_SURF_IMPORTED) && - !(transfer_usage & PIPE_TRANSFER_READ) && - tex->buffer.b.b.last_level == 0 && - util_texrange_covers_whole_level(&tex->buffer.b.b, 0, - box->x, box->y, box->z, - box->width, box->height, - box->depth); + struct si_texture *tex, + unsigned transfer_usage, + const struct pipe_box *box) +{ + return !tex->buffer.b.is_shared && + !(tex->surface.flags & RADEON_SURF_IMPORTED) && + !(transfer_usage & PIPE_TRANSFER_READ) && + tex->buffer.b.b.last_level == 0 && + util_texrange_covers_whole_level(&tex->buffer.b.b, 0, + box->x, box->y, box->z, + box->width, box->height, + box->depth); } static void si_texture_invalidate_storage(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - /* There is no point in discarding depth and tiled buffers. */ - assert(!tex->is_depth); - assert(tex->surface.is_linear); + /* There is no point in discarding depth and tiled buffers. */ + assert(!tex->is_depth); + assert(tex->surface.is_linear); - /* Reallocate the buffer in the same pipe_resource. */ - si_alloc_resource(sscreen, &tex->buffer); + /* Reallocate the buffer in the same pipe_resource. */ + si_alloc_resource(sscreen, &tex->buffer); - /* Initialize the CMASK base address (needed even without CMASK). */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->cmask_offset) >> 8; + /* Initialize the CMASK base address (needed even without CMASK). */ + tex->cmask_base_address_reg = + (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; - p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->dirty_tex_counter); - sctx->num_alloc_tex_transfer_bytes += tex->size; + sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size; } static void *si_texture_transfer_map(struct pipe_context *ctx, - struct pipe_resource *texture, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer) -{ - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *tex = (struct si_texture*)texture; - struct si_transfer *trans; - struct si_resource *buf; - unsigned offset = 0; - char *map; - bool use_staging_texture = false; - - assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); - assert(box->width && box->height && box->depth); - - if (tex->is_depth) { - /* Depth textures use staging unconditionally. */ - use_staging_texture = true; - } else { - /* Degrade the tile mode if we get too many transfers on APUs. - * On dGPUs, the staging texture is always faster. - * Only count uploads that are at least 4x4 pixels large. - */ - if (!sctx->screen->info.has_dedicated_vram && - level == 0 && - box->width >= 4 && box->height >= 4 && - p_atomic_inc_return(&tex->num_level0_transfers) == 10) { - bool can_invalidate = - si_can_invalidate_texture(sctx->screen, tex, - usage, box); - - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_LINEAR, - can_invalidate); - } - - /* Tiled textures need to be converted into a linear texture for CPU - * access. The staging texture is always linear and is placed in GART. - * - * Reading from VRAM or GTT WC is slow, always use the staging - * texture in this case. - * - * Use the staging texture for uploads if the underlying BO - * is busy. - */ - if (!tex->surface.is_linear) - use_staging_texture = true; - else if (usage & PIPE_TRANSFER_READ) - use_staging_texture = - tex->buffer.domains & RADEON_DOMAIN_VRAM || - tex->buffer.flags & RADEON_FLAG_GTT_WC; - /* Write & linear only: */ - else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, - RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(tex->buffer.buf, 0, - RADEON_USAGE_READWRITE)) { - /* It's busy. */ - if (si_can_invalidate_texture(sctx->screen, tex, - usage, box)) - si_texture_invalidate_storage(sctx, tex); - else - use_staging_texture = true; - } - } - - trans = CALLOC_STRUCT(si_transfer); - if (!trans) - return NULL; - pipe_resource_reference(&trans->b.b.resource, texture); - trans->b.b.level = level; - trans->b.b.usage = usage; - trans->b.b.box = *box; - - if (use_staging_texture) { - struct pipe_resource resource; - struct si_texture *staging; - - si_init_temp_resource_from_box(&resource, texture, box, level, - SI_RESOURCE_FLAG_TRANSFER); - resource.usage = (usage & PIPE_TRANSFER_READ) ? - PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; - - /* Since depth-stencil textures don't support linear tiling, - * blit from ZS to color and vice versa. u_blitter will do - * the packing for these formats. - */ - if (tex->is_depth) - resource.format = util_blitter_get_color_format_for_zs(resource.format); - - /* Create the temporary texture. */ - staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource); - if (!staging) { - PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); - goto fail_trans; - } - trans->staging = &staging->buffer; - - /* Just get the strides. */ - si_texture_get_offset(sctx->screen, staging, 0, NULL, - &trans->b.b.stride, - &trans->b.b.layer_stride); - - if (usage & PIPE_TRANSFER_READ) - si_copy_to_staging_texture(ctx, trans); - else - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - - buf = trans->staging; - } else { - /* the resource is mapped directly */ - offset = si_texture_get_offset(sctx->screen, tex, level, box, - &trans->b.b.stride, - &trans->b.b.layer_stride); - buf = &tex->buffer; - } - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) - usage |= RADEON_TRANSFER_TEMPORARY; + struct pipe_resource *texture, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **ptransfer) +{ + struct si_context *sctx = (struct si_context*)ctx; + struct si_texture *tex = (struct si_texture*)texture; + struct si_transfer *trans; + struct si_resource *buf; + unsigned offset = 0; + char *map; + bool use_staging_texture = false; + + assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); + assert(box->width && box->height && box->depth); + + if (tex->is_depth) { + /* Depth textures use staging unconditionally. */ + use_staging_texture = true; + } else { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!sctx->screen->info.has_dedicated_vram && + level == 0 && + box->width >= 4 && box->height >= 4 && + p_atomic_inc_return(&tex->num_level0_transfers) == 10) { + bool can_invalidate = + si_can_invalidate_texture(sctx->screen, tex, + usage, box); + + si_reallocate_texture_inplace(sctx, tex, + PIPE_BIND_LINEAR, + can_invalidate); + } + + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM or GTT WC is slow, always use the staging + * texture in this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (!tex->surface.is_linear) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = + tex->buffer.domains & RADEON_DOMAIN_VRAM || + tex->buffer.flags & RADEON_FLAG_GTT_WC; + /* Write & linear only: */ + else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, + RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(tex->buffer.buf, 0, + RADEON_USAGE_READWRITE)) { + /* It's busy. */ + if (si_can_invalidate_texture(sctx->screen, tex, + usage, box)) + si_texture_invalidate_storage(sctx, tex); + else + use_staging_texture = true; + } + } + + trans = CALLOC_STRUCT(si_transfer); + if (!trans) + return NULL; + pipe_resource_reference(&trans->b.b.resource, texture); + trans->b.b.level = level; + trans->b.b.usage = usage; + trans->b.b.box = *box; + + if (use_staging_texture) { + struct pipe_resource resource; + struct si_texture *staging; + + si_init_temp_resource_from_box(&resource, texture, box, level, + SI_RESOURCE_FLAG_TRANSFER); + resource.usage = (usage & PIPE_TRANSFER_READ) ? + PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; + + /* Since depth-stencil textures don't support linear tiling, + * blit from ZS to color and vice versa. u_blitter will do + * the packing for these formats. + */ + if (tex->is_depth) + resource.format = util_blitter_get_color_format_for_zs(resource.format); + + /* Create the temporary texture. */ + staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource); + if (!staging) { + PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); + goto fail_trans; + } + trans->staging = &staging->buffer; + + /* Just get the strides. */ + si_texture_get_offset(sctx->screen, staging, 0, NULL, + &trans->b.b.stride, + &trans->b.b.layer_stride); + + if (usage & PIPE_TRANSFER_READ) + si_copy_to_staging_texture(ctx, trans); + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; + } else { + /* the resource is mapped directly */ + offset = si_texture_get_offset(sctx->screen, tex, level, box, + &trans->b.b.stride, + &trans->b.b.layer_stride); + buf = &tex->buffer; + } + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void*) == 4) + usage |= RADEON_TRANSFER_TEMPORARY; - if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) - goto fail_trans; + if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) + goto fail_trans; - *ptransfer = &trans->b.b; - return map + offset; + *ptransfer = &trans->b.b; + return map + offset; fail_trans: - si_resource_reference(&trans->staging, NULL); - pipe_resource_reference(&trans->b.b.resource, NULL); - FREE(trans); - return NULL; + si_resource_reference(&trans->staging, NULL); + pipe_resource_reference(&trans->b.b.resource, NULL); + FREE(trans); + return NULL; } static void si_texture_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer* transfer) + struct pipe_transfer* transfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; - struct pipe_resource *texture = transfer->resource; - struct si_texture *tex = (struct si_texture*)texture; - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) { - struct si_resource *buf = - stransfer->staging ? stransfer->staging : &tex->buffer; - - sctx->ws->buffer_unmap(buf->buf); - } - - if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) - si_copy_from_staging_texture(ctx, stransfer); - - if (stransfer->staging) { - sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; - si_resource_reference(&stransfer->staging, NULL); - } - - /* Heuristic for {upload, draw, upload, draw, ..}: - * - * Flush the gfx IB if we've allocated too much texture storage. - * - * The idea is that we don't want to build IBs that use too much - * memory and put pressure on the kernel memory manager and we also - * want to make temporary and invalidated buffers go idle ASAP to - * decrease the total memory usage or make them reusable. The memory - * usage will be slightly higher than given here because of the buffer - * cache in the winsys. - * - * The result is that the kernel memory manager is never a bottleneck. - */ - if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - sctx->num_alloc_tex_transfer_bytes = 0; - } + struct si_context *sctx = (struct si_context*)ctx; + struct si_transfer *stransfer = (struct si_transfer*)transfer; + struct pipe_resource *texture = transfer->resource; + struct si_texture *tex = (struct si_texture*)texture; + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void*) == 4) { + struct si_resource *buf = + stransfer->staging ? stransfer->staging : &tex->buffer; + + sctx->ws->buffer_unmap(buf->buf); + } + + if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) + si_copy_from_staging_texture(ctx, stransfer); + + if (stransfer->staging) { + sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; + si_resource_reference(&stransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. + * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + sctx->num_alloc_tex_transfer_bytes = 0; + } - pipe_resource_reference(&transfer->resource, NULL); - FREE(transfer); + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); } static const struct u_resource_vtbl si_texture_vtbl = { - NULL, /* get_handle */ - si_texture_destroy, /* resource_destroy */ - si_texture_transfer_map, /* transfer_map */ - u_default_transfer_flush_region, /* transfer_flush_region */ - si_texture_transfer_unmap, /* transfer_unmap */ + NULL, /* get_handle */ + si_texture_destroy, /* resource_destroy */ + si_texture_transfer_map, /* transfer_map */ + u_default_transfer_flush_region, /* transfer_flush_region */ + si_texture_transfer_unmap, /* transfer_unmap */ }; /* Return if it's allowed to reinterpret one format as another with DCC enabled. */ bool vi_dcc_formats_compatible(struct si_screen *sscreen, - enum pipe_format format1, - enum pipe_format format2) + enum pipe_format format1, + enum pipe_format format2) { - const struct util_format_description *desc1, *desc2; + const struct util_format_description *desc1, *desc2; - /* No format change - exit early. */ - if (format1 == format2) - return true; - - format1 = si_simplify_cb_format(format1); - format2 = si_simplify_cb_format(format2); - - /* Check again after format adjustments. */ - if (format1 == format2) - return true; - - desc1 = util_format_description(format1); - desc2 = util_format_description(format2); - - if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || - desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return false; - - /* Float and non-float are totally incompatible. */ - if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != - (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) - return false; - - /* Channel sizes must match across DCC formats. - * Comparing just the first 2 channels should be enough. - */ - if (desc1->channel[0].size != desc2->channel[0].size || - (desc1->nr_channels >= 2 && - desc1->channel[1].size != desc2->channel[1].size)) - return false; - - /* Everything below is not needed if the driver never uses the DCC - * clear code with the value of 1. - */ - - /* If the clear values are all 1 or all 0, this constraint can be - * ignored. */ - if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) - return false; - - /* Channel types must match if the clear value of 1 is used. - * The type categories are only float, signed, unsigned. - * NORM and INT are always compatible. - */ - if (desc1->channel[0].type != desc2->channel[0].type || - (desc1->nr_channels >= 2 && - desc1->channel[1].type != desc2->channel[1].type)) - return false; + /* No format change - exit early. */ + if (format1 == format2) + return true; + + format1 = si_simplify_cb_format(format1); + format2 = si_simplify_cb_format(format2); + + /* Check again after format adjustments. */ + if (format1 == format2) + return true; + + desc1 = util_format_description(format1); + desc2 = util_format_description(format2); + + if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || + desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return false; + + /* Float and non-float are totally incompatible. */ + if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != + (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) + return false; + + /* Channel sizes must match across DCC formats. + * Comparing just the first 2 channels should be enough. + */ + if (desc1->channel[0].size != desc2->channel[0].size || + (desc1->nr_channels >= 2 && + desc1->channel[1].size != desc2->channel[1].size)) + return false; + + /* Everything below is not needed if the driver never uses the DCC + * clear code with the value of 1. + */ + + /* If the clear values are all 1 or all 0, this constraint can be + * ignored. */ + if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) + return false; + + /* Channel types must match if the clear value of 1 is used. + * The type categories are only float, signed, unsigned. + * NORM and INT are always compatible. + */ + if (desc1->channel[0].type != desc2->channel[0].type || + (desc1->nr_channels >= 2 && + desc1->channel[1].type != desc2->channel[1].type)) + return false; - return true; + return true; } bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) + unsigned level, + enum pipe_format view_format) { - struct si_texture *stex = (struct si_texture *)tex; + struct si_texture *stex = (struct si_texture *)tex; - return vi_dcc_enabled(stex, level) && - !vi_dcc_formats_compatible((struct si_screen*)tex->screen, - tex->format, view_format); + return vi_dcc_enabled(stex, level) && + !vi_dcc_formats_compatible((struct si_screen*)tex->screen, + tex->format, view_format); } /* This can't be merged with the above function, because * vi_dcc_formats_compatible should be called only when DCC is enabled. */ void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, - struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) -{ - struct si_texture *stex = (struct si_texture *)tex; - - if (vi_dcc_formats_are_incompatible(tex, level, view_format)) - if (!si_texture_disable_dcc(sctx, stex)) - si_decompress_dcc(sctx, stex); + struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format) +{ + struct si_texture *stex = (struct si_texture *)tex; + + if (vi_dcc_formats_are_incompatible(tex, level, view_format)) + if (!si_texture_disable_dcc(sctx, stex)) + si_decompress_dcc(sctx, stex); } struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_surface *templ, - unsigned width0, unsigned height0, - unsigned width, unsigned height) -{ - struct si_surface *surface = CALLOC_STRUCT(si_surface); - - if (!surface) - return NULL; - - assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); - assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); - - pipe_reference_init(&surface->base.reference, 1); - pipe_resource_reference(&surface->base.texture, texture); - surface->base.context = pipe; - surface->base.format = templ->format; - surface->base.width = width; - surface->base.height = height; - surface->base.u = templ->u; - - surface->width0 = width0; - surface->height0 = height0; - - surface->dcc_incompatible = - texture->target != PIPE_BUFFER && - vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, - templ->format); - return &surface->base; + struct pipe_resource *texture, + const struct pipe_surface *templ, + unsigned width0, unsigned height0, + unsigned width, unsigned height) +{ + struct si_surface *surface = CALLOC_STRUCT(si_surface); + + if (!surface) + return NULL; + + assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); + assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); + + pipe_reference_init(&surface->base.reference, 1); + pipe_resource_reference(&surface->base.texture, texture); + surface->base.context = pipe; + surface->base.format = templ->format; + surface->base.width = width; + surface->base.height = height; + surface->base.u = templ->u; + + surface->width0 = width0; + surface->height0 = height0; + + surface->dcc_incompatible = + texture->target != PIPE_BUFFER && + vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, + templ->format); + return &surface->base; } static struct pipe_surface *si_create_surface(struct pipe_context *pipe, - struct pipe_resource *tex, - const struct pipe_surface *templ) + struct pipe_resource *tex, + const struct pipe_surface *templ) { - unsigned level = templ->u.tex.level; - unsigned width = u_minify(tex->width0, level); - unsigned height = u_minify(tex->height0, level); - unsigned width0 = tex->width0; - unsigned height0 = tex->height0; - - if (tex->target != PIPE_BUFFER && templ->format != tex->format) { - const struct util_format_description *tex_desc - = util_format_description(tex->format); - const struct util_format_description *templ_desc - = util_format_description(templ->format); - - assert(tex_desc->block.bits == templ_desc->block.bits); - - /* Adjust size of surface if and only if the block width or - * height is changed. */ - if (tex_desc->block.width != templ_desc->block.width || - tex_desc->block.height != templ_desc->block.height) { - unsigned nblks_x = util_format_get_nblocksx(tex->format, width); - unsigned nblks_y = util_format_get_nblocksy(tex->format, height); - - width = nblks_x * templ_desc->block.width; - height = nblks_y * templ_desc->block.height; - - width0 = util_format_get_nblocksx(tex->format, width0); - height0 = util_format_get_nblocksy(tex->format, height0); - } - } - - return si_create_surface_custom(pipe, tex, templ, - width0, height0, - width, height); + unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + unsigned width0 = tex->width0; + unsigned height0 = tex->height0; + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc + = util_format_description(tex->format); + const struct util_format_description *templ_desc + = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. */ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + + width0 = util_format_get_nblocksx(tex->format, width0); + height0 = util_format_get_nblocksy(tex->format, height0); + } + } + + return si_create_surface_custom(pipe, tex, templ, + width0, height0, + width, height); } static void si_surface_destroy(struct pipe_context *pipe, - struct pipe_surface *surface) + struct pipe_surface *surface) { - pipe_resource_reference(&surface->texture, NULL); - FREE(surface); + pipe_resource_reference(&surface->texture, NULL); + FREE(surface); } unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap) { - const struct util_format_description *desc = util_format_description(format); + const struct util_format_description *desc = util_format_description(format); #define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_SWAP_STD; + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_SWAP_STD; - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return ~0U; + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return ~0U; - switch (desc->nr_channels) { - case 1: - if (HAS_SWIZZLE(0,X)) - return V_028C70_SWAP_STD; /* X___ */ - else if (HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* ___X */ - break; - case 2: - if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || - (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) - return V_028C70_SWAP_STD; /* XY__ */ - else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || - (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) - /* YX__ */ - return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); - else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) - return V_028C70_SWAP_ALT; /* X__Y */ - else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* Y__X */ - break; - case 3: - if (HAS_SWIZZLE(0,X)) - return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); - else if (HAS_SWIZZLE(0,Z)) - return V_028C70_SWAP_STD_REV; /* ZYX */ - break; - case 4: - /* check the middle channels, the 1st and 4th channel can be NONE */ - if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { - return V_028C70_SWAP_STD; /* XYZW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { - return V_028C70_SWAP_STD_REV; /* WZYX */ - } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { - return V_028C70_SWAP_ALT; /* ZYXW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { - /* YZWX */ - if (desc->is_array) - return V_028C70_SWAP_ALT_REV; - else - return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); - } - break; - } - return ~0U; + switch (desc->nr_channels) { + case 1: + if (HAS_SWIZZLE(0,X)) + return V_028C70_SWAP_STD; /* X___ */ + else if (HAS_SWIZZLE(3,X)) + return V_028C70_SWAP_ALT_REV; /* ___X */ + break; + case 2: + if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || + (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) + return V_028C70_SWAP_STD; /* XY__ */ + else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || + (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) + /* YX__ */ + return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); + else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) + return V_028C70_SWAP_ALT; /* X__Y */ + else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) + return V_028C70_SWAP_ALT_REV; /* Y__X */ + break; + case 3: + if (HAS_SWIZZLE(0,X)) + return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); + else if (HAS_SWIZZLE(0,Z)) + return V_028C70_SWAP_STD_REV; /* ZYX */ + break; + case 4: + /* check the middle channels, the 1st and 4th channel can be NONE */ + if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { + return V_028C70_SWAP_STD; /* XYZW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { + return V_028C70_SWAP_STD_REV; /* WZYX */ + } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { + return V_028C70_SWAP_ALT; /* ZYXW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { + /* YZWX */ + if (desc->is_array) + return V_028C70_SWAP_ALT_REV; + else + return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); + } + break; + } + return ~0U; } /* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ static void vi_dcc_clean_up_context_slot(struct si_context *sctx, - int slot) + int slot) { - int i; + int i; - if (sctx->dcc_stats[slot].query_active) - vi_separate_dcc_stop_query(sctx, - sctx->dcc_stats[slot].tex); - - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) - if (sctx->dcc_stats[slot].ps_stats[i]) { - sctx->b.destroy_query(&sctx->b, - sctx->dcc_stats[slot].ps_stats[i]); - sctx->dcc_stats[slot].ps_stats[i] = NULL; - } + if (sctx->dcc_stats[slot].query_active) + vi_separate_dcc_stop_query(sctx, + sctx->dcc_stats[slot].tex); + + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) + if (sctx->dcc_stats[slot].ps_stats[i]) { + sctx->b.destroy_query(&sctx->b, + sctx->dcc_stats[slot].ps_stats[i]); + sctx->dcc_stats[slot].ps_stats[i] = NULL; + } - si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); + si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); } /** * Return the per-context slot where DCC statistics queries for the texture live. */ static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - int i, empty_slot = -1; + int i, empty_slot = -1; - /* Remove zombie textures (textures kept alive by this array only). */ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[i].tex && - sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) - vi_dcc_clean_up_context_slot(sctx, i); - - /* Find the texture. */ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - /* Return if found. */ - if (sctx->dcc_stats[i].tex == tex) { - sctx->dcc_stats[i].last_use_timestamp = os_time_get(); - return i; - } - - /* Record the first seen empty slot. */ - if (empty_slot == -1 && !sctx->dcc_stats[i].tex) - empty_slot = i; - } - - /* Not found. Remove the oldest member to make space in the array. */ - if (empty_slot == -1) { - int oldest_slot = 0; - - /* Find the oldest slot. */ - for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[oldest_slot].last_use_timestamp > - sctx->dcc_stats[i].last_use_timestamp) - oldest_slot = i; - - /* Clean up the oldest slot. */ - vi_dcc_clean_up_context_slot(sctx, oldest_slot); - empty_slot = oldest_slot; - } - - /* Add the texture to the new slot. */ - si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); - sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); - return empty_slot; + /* Remove zombie textures (textures kept alive by this array only). */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[i].tex && + sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) + vi_dcc_clean_up_context_slot(sctx, i); + + /* Find the texture. */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + /* Return if found. */ + if (sctx->dcc_stats[i].tex == tex) { + sctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !sctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[oldest_slot].last_use_timestamp > + sctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + vi_dcc_clean_up_context_slot(sctx, oldest_slot); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); + sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; } static struct pipe_query * vi_create_resuming_pipestats_query(struct si_context *sctx) { - struct si_query_hw *query = (struct si_query_hw*) - sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); + struct si_query_hw *query = (struct si_query_hw*) + sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); - query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; - return (struct pipe_query*)query; + query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query*)query; } /** * Called when binding a color buffer. */ void vi_separate_dcc_start_query(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(!sctx->dcc_stats[i].query_active); + assert(!sctx->dcc_stats[i].query_active); - if (!sctx->dcc_stats[i].ps_stats[0]) - sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); + if (!sctx->dcc_stats[i].ps_stats[0]) + sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); - /* begin or resume the query */ - sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = true; + /* begin or resume the query */ + sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = true; } /** * Called when unbinding a color buffer. */ void vi_separate_dcc_stop_query(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(sctx->dcc_stats[i].query_active); - assert(sctx->dcc_stats[i].ps_stats[0]); + assert(sctx->dcc_stats[i].query_active); + assert(sctx->dcc_stats[i].ps_stats[0]); - /* pause or end the query */ - sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = false; + /* pause or end the query */ + sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = false; } static bool vi_should_enable_separate_dcc(struct si_texture *tex) { - /* The minimum number of fullscreen draws per frame that is required - * to enable DCC. */ - return tex->ps_draw_ratio + tex->num_slow_clears >= 5; + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; } /* Called by fast clear. */ void vi_separate_dcc_try_enable(struct si_context *sctx, - struct si_texture *tex) + struct si_texture *tex) { - /* The intent is to use this with shared displayable back buffers, - * but it's not strictly limited only to them. - */ - if (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || - tex->buffer.b.b.target != PIPE_TEXTURE_2D || - tex->buffer.b.b.last_level > 0 || - !tex->surface.dcc_size || - sctx->screen->debug_flags & DBG(NO_DCC) || - sctx->screen->debug_flags & DBG(NO_DCC_FB)) - return; - - assert(sctx->chip_class >= GFX8); - - if (tex->dcc_offset) - return; /* already enabled */ - - /* Enable the DCC stat gathering. */ - if (!tex->dcc_gather_statistics) { - tex->dcc_gather_statistics = true; - vi_separate_dcc_start_query(sctx, tex); - } - - if (!vi_should_enable_separate_dcc(tex)) - return; /* stats show that DCC decompression is too expensive */ - - assert(tex->surface.num_dcc_levels); - assert(!tex->dcc_separate_buffer); - - si_texture_discard_cmask(sctx->screen, tex); - - /* Get a DCC buffer. */ - if (tex->last_dcc_separate_buffer) { - assert(tex->dcc_gather_statistics); - assert(!tex->dcc_separate_buffer); - tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; - tex->last_dcc_separate_buffer = NULL; - } else { - tex->dcc_separate_buffer = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - tex->surface.dcc_size, - tex->surface.dcc_alignment); - if (!tex->dcc_separate_buffer) - return; - } - - /* dcc_offset is the absolute GPUVM address. */ - tex->dcc_offset = tex->dcc_separate_buffer->gpu_address; - - /* no need to flag anything since this is called by fast clear that - * flags framebuffer state - */ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->buffer.b.b.target != PIPE_TEXTURE_2D || + tex->buffer.b.b.last_level > 0 || + !tex->surface.dcc_size || + sctx->screen->debug_flags & DBG(NO_DCC) || + sctx->screen->debug_flags & DBG(NO_DCC_FB)) + return; + + assert(sctx->chip_class >= GFX8); + + if (tex->surface.dcc_offset) + return; /* already enabled */ + + /* Enable the DCC stat gathering. */ + if (!tex->dcc_gather_statistics) { + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(sctx, tex); + } + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.num_dcc_levels); + assert(!tex->dcc_separate_buffer); + + si_texture_discard_cmask(sctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = + si_aligned_buffer_create(sctx->b.screen, + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, + tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ } /** @@ -2545,157 +2595,156 @@ * takes place. */ void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, - struct si_texture *tex) + struct si_texture *tex) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_query *tmp; - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - bool query_active = sctx->dcc_stats[i].query_active; - bool disable = false; - - if (sctx->dcc_stats[i].ps_stats[2]) { - union pipe_query_result result; - - /* Read the results. */ - struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; - ctx->get_query_result(ctx, query, - true, &result); - si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer); - - /* Compute the approximate number of fullscreen draws. */ - tex->ps_draw_ratio = - result.pipeline_statistics.ps_invocations / - (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); - sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; - - disable = tex->dcc_separate_buffer && - !vi_should_enable_separate_dcc(tex); - } - - tex->num_slow_clears = 0; - - /* stop the statistics query for ps_stats[0] */ - if (query_active) - vi_separate_dcc_stop_query(sctx, tex); - - /* Move the queries in the queue by one. */ - tmp = sctx->dcc_stats[i].ps_stats[2]; - sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; - sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; - sctx->dcc_stats[i].ps_stats[0] = tmp; - - /* create and start a new query as ps_stats[0] */ - if (query_active) - vi_separate_dcc_start_query(sctx, tex); - - if (disable) { - assert(!tex->last_dcc_separate_buffer); - tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; - tex->dcc_separate_buffer = NULL; - tex->dcc_offset = 0; - /* no need to flag anything since this is called after - * decompression that re-sets framebuffer state - */ - } + struct si_context *sctx = (struct si_context*)ctx; + struct pipe_query *tmp; + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + bool query_active = sctx->dcc_stats[i].query_active; + bool disable = false; + + if (sctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; + ctx->get_query_result(ctx, query, + true, &result); + si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer); + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = + result.pipeline_statistics.ps_invocations / + (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); + sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; + + disable = tex->dcc_separate_buffer && + !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(sctx, tex); + + /* Move the queries in the queue by one. */ + tmp = sctx->dcc_stats[i].ps_stats[2]; + sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; + sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; + sctx->dcc_stats[i].ps_stats[0] = tmp; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(sctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->surface.dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } } static struct pipe_memory_object * si_memobj_from_handle(struct pipe_screen *screen, - struct winsys_handle *whandle, - bool dedicated) + struct winsys_handle *whandle, + bool dedicated) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); - struct pb_buffer *buf = NULL; - uint32_t stride, offset; - - if (!memobj) - return NULL; - - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment, - &stride, &offset); - if (!buf) { - free(memobj); - return NULL; - } - - memobj->b.dedicated = dedicated; - memobj->buf = buf; - memobj->stride = stride; + struct si_screen *sscreen = (struct si_screen*)screen; + struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); + struct pb_buffer *buf = NULL; + + if (!memobj) + return NULL; + + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, + sscreen->info.max_alignment); + if (!buf) { + free(memobj); + return NULL; + } + + memobj->b.dedicated = dedicated; + memobj->buf = buf; + memobj->stride = whandle->stride; - return (struct pipe_memory_object *)memobj; + return (struct pipe_memory_object *)memobj; } static void si_memobj_destroy(struct pipe_screen *screen, - struct pipe_memory_object *_memobj) + struct pipe_memory_object *_memobj) { - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; + struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - pb_reference(&memobj->buf, NULL); - free(memobj); + pb_reference(&memobj->buf, NULL); + free(memobj); } static struct pipe_resource * si_texture_from_memobj(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct pipe_memory_object *_memobj, - uint64_t offset) -{ - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - struct pipe_resource *tex = - si_texture_from_winsys_buffer(sscreen, templ, memobj->buf, - memobj->stride, offset, - PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | - PIPE_HANDLE_USAGE_SHADER_WRITE, - memobj->b.dedicated); - if (!tex) - return NULL; - - /* si_texture_from_winsys_buffer doesn't increment refcount of - * memobj->buf, so increment it here. - */ - struct pb_buffer *buf = NULL; - pb_reference(&buf, memobj->buf); - return tex; + const struct pipe_resource *templ, + struct pipe_memory_object *_memobj, + uint64_t offset) +{ + struct si_screen *sscreen = (struct si_screen*)screen; + struct si_memory_object *memobj = (struct si_memory_object *)_memobj; + struct pipe_resource *tex = + si_texture_from_winsys_buffer(sscreen, templ, memobj->buf, + memobj->stride, offset, + PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | + PIPE_HANDLE_USAGE_SHADER_WRITE, + memobj->b.dedicated); + if (!tex) + return NULL; + + /* si_texture_from_winsys_buffer doesn't increment refcount of + * memobj->buf, so increment it here. + */ + struct pb_buffer *buf = NULL; + pb_reference(&buf, memobj->buf); + return tex; } static bool si_check_resource_capability(struct pipe_screen *screen, - struct pipe_resource *resource, - unsigned bind) + struct pipe_resource *resource, + unsigned bind) { - struct si_texture *tex = (struct si_texture*)resource; + struct si_texture *tex = (struct si_texture*)resource; - /* Buffers only support the linear flag. */ - if (resource->target == PIPE_BUFFER) - return (bind & ~PIPE_BIND_LINEAR) == 0; + /* Buffers only support the linear flag. */ + if (resource->target == PIPE_BUFFER) + return (bind & ~PIPE_BIND_LINEAR) == 0; - if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear) - return false; + if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear) + return false; - if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable) - return false; + if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable) + return false; - /* TODO: PIPE_BIND_CURSOR - do we care? */ - return true; + /* TODO: PIPE_BIND_CURSOR - do we care? */ + return true; } void si_init_screen_texture_functions(struct si_screen *sscreen) { - sscreen->b.resource_from_handle = si_texture_from_handle; - sscreen->b.resource_get_handle = si_texture_get_handle; - sscreen->b.resource_get_info = si_texture_get_info; - sscreen->b.resource_from_memobj = si_texture_from_memobj; - sscreen->b.memobj_create_from_handle = si_memobj_from_handle; - sscreen->b.memobj_destroy = si_memobj_destroy; - sscreen->b.check_resource_capability = si_check_resource_capability; + sscreen->b.resource_from_handle = si_texture_from_handle; + sscreen->b.resource_get_handle = si_texture_get_handle; + sscreen->b.resource_get_param = si_resource_get_param; + sscreen->b.resource_get_info = si_texture_get_info; + sscreen->b.resource_from_memobj = si_texture_from_memobj; + sscreen->b.memobj_create_from_handle = si_memobj_from_handle; + sscreen->b.memobj_destroy = si_memobj_destroy; + sscreen->b.check_resource_capability = si_check_resource_capability; } void si_init_context_texture_functions(struct si_context *sctx) { - sctx->b.create_surface = si_create_surface; - sctx->b.surface_destroy = si_surface_destroy; + sctx->b.create_surface = si_create_surface; + sctx->b.surface_destroy = si_surface_destroy; } diff -Nru mesa-19.2.8/src/gallium/drivers/radeonsi/si_uvd.c mesa-20.0.8/src/gallium/drivers/radeonsi/si_uvd.c --- mesa-19.2.8/src/gallium/drivers/radeonsi/si_uvd.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/radeonsi/si_uvd.c 2020-06-12 01:21:17.000000000 +0000 @@ -40,73 +40,11 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, const struct pipe_video_buffer *tmpl) { - struct si_context *ctx = (struct si_context *)pipe; - struct si_texture *resources[VL_NUM_COMPONENTS] = {}; - struct radeon_surf *surfaces[VL_NUM_COMPONENTS] = {}; - struct pb_buffer **pbs[VL_NUM_COMPONENTS] = {}; - const enum pipe_format *resource_formats; - struct pipe_video_buffer vidtemplate; - struct pipe_resource templ; - unsigned i, array_size; + struct pipe_video_buffer vidbuf = *tmpl; + /* TODO: get tiling working */ + vidbuf.bind |= PIPE_BIND_LINEAR; - assert(pipe); - - /* first create the needed resources as "normal" textures */ - resource_formats = vl_video_buffer_formats(pipe->screen, tmpl->buffer_format); - if (!resource_formats) - return NULL; - - array_size = tmpl->interlaced ? 2 : 1; - vidtemplate = *tmpl; - vidtemplate.width = align(tmpl->width, VL_MACROBLOCK_WIDTH); - vidtemplate.height = align(tmpl->height / array_size, VL_MACROBLOCK_HEIGHT); - - assert(resource_formats[0] != PIPE_FORMAT_NONE); - - for (i = 0; i < VL_NUM_COMPONENTS; ++i) { - if (resource_formats[i] != PIPE_FORMAT_NONE) { - vl_video_buffer_template(&templ, &vidtemplate, - resource_formats[i], 1, - array_size, PIPE_USAGE_DEFAULT, i); - /* Set PIPE_BIND_SHARED to avoid reallocation in si_texture_get_handle, - * which can't handle joined surfaces. */ - /* TODO: get tiling working */ - templ.bind = PIPE_BIND_LINEAR | PIPE_BIND_SHARED; - resources[i] = (struct si_texture *) - pipe->screen->resource_create(pipe->screen, &templ); - if (!resources[i]) - goto error; - } - } - - for (i = 0; i < VL_NUM_COMPONENTS; ++i) { - if (!resources[i]) - continue; - - surfaces[i] = & resources[i]->surface; - pbs[i] = &resources[i]->buffer.buf; - } - - si_vid_join_surfaces(ctx, pbs, surfaces); - - for (i = 0; i < VL_NUM_COMPONENTS; ++i) { - if (!resources[i]) - continue; - - /* reset the address */ - resources[i]->buffer.gpu_address = ctx->ws->buffer_get_virtual_address( - resources[i]->buffer.buf); - resources[i]->buffer.bo_size = resources[i]->buffer.buf->size; - } - - vidtemplate.height *= array_size; - return vl_video_buffer_create_ex2(pipe, &vidtemplate, (struct pipe_resource **)resources); - -error: - for (i = 0; i < VL_NUM_COMPONENTS; ++i) - si_texture_reference(&resources[i], NULL); - - return NULL; + return vl_video_buffer_create_as_resource(pipe, &vidbuf); } /* set the decoding target buffer offsets */ diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_buffer.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_buffer.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "sp_buffer.h" #include "sp_texture.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static bool get_dimensions(const struct pipe_shader_buffer *bview, diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_image.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_image.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_image.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "sp_image.h" #include "sp_texture.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /* * Get the offset into the base image diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_quad_blend.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_quad_blend.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_quad_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_quad_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_dual_blend.h" #include "sp_context.h" #include "sp_state.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_quad_depth_test.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_quad_depth_test.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_quad_depth_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_quad_depth_test.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ */ #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "tgsi/tgsi_scan.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_screen.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_screen.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,8 +27,8 @@ #include "util/u_memory.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "util/u_video.h" #include "util/os_misc.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_sampler.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_sampler.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_sampler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_sampler.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "draw/draw_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_so.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_so.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_so.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_so.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "sp_state.h" #include "sp_texture.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "draw/draw_context.h" #include "pipebuffer/pb_buffer.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_surface.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_surface.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_state_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_state_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,7 @@ #include "draw/draw_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_surface.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_surface.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ * **************************************************************************/ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "sp_context.h" #include "sp_surface.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_tex_sample.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_tex_sample.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_tex_sample.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_tex_sample.c 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,7 @@ #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "sp_quad.h" /* only for #define QUAD_* tokens */ diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_tex_tile_cache.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_tex_tile_cache.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_tex_tile_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_tex_tile_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -35,7 +35,7 @@ #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_tile.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "sp_context.h" #include "sp_texture.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_texture.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_texture.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_transfer.h" diff -Nru mesa-19.2.8/src/gallium/drivers/softpipe/sp_tile_cache.c mesa-20.0.8/src/gallium/drivers/softpipe/sp_tile_cache.c --- mesa-19.2.8/src/gallium/drivers/softpipe/sp_tile_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/softpipe/sp_tile_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ */ #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_tile.h" #include "sp_tile_cache.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_context.c mesa-20.0.8/src/gallium/drivers/svga/svga_context.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -136,7 +136,7 @@ if (!svga) goto done; - LIST_INITHEAD(&svga->dirty_buffers); + list_inithead(&svga->dirty_buffers); svga->pipe.screen = screen; svga->pipe.priv = priv; diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_format.c mesa-20.0.8/src/gallium/drivers/svga/svga_format.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "pipe/p_format.h" #include "util/u_debug.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "svga_winsys.h" @@ -207,6 +207,8 @@ [ PIPE_FORMAT_L32_SINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R32_SINT, TF_XXX1 }, [ PIPE_FORMAT_L32A32_SINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R32G32_SINT, TF_XXXY }, [ PIPE_FORMAT_R10G10B10A2_UINT ] = { SVGA3D_R10G10B10A2_UINT, SVGA3D_R10G10B10A2_UINT, SVGA3D_R10G10B10A2_UINT, 0 }, + /* Must specify following entry to give the sense of size of format_conversion_table[] */ + [ PIPE_FORMAT_COUNT ] = {SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 }, }; diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_blit.c mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_blit.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ #include "svga_surface.h" //#include "util/u_blit_sw.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #define FILE_DEBUG_FLAG DEBUG_BLIT diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_query.c mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_query.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -670,6 +670,7 @@ { struct svga_context *svga = svga_context(pipe); struct svga_query *sq; + enum pipe_error ret; assert(query_type < SVGA_QUERY_MAX); @@ -689,7 +690,10 @@ case PIPE_QUERY_OCCLUSION_COUNTER: sq->svga_type = SVGA3D_QUERYTYPE_OCCLUSION; if (svga_have_vgpu10(svga)) { - define_query_vgpu10(svga, sq, sizeof(SVGADXOcclusionQueryResult)); + ret = define_query_vgpu10(svga, sq, + sizeof(SVGADXOcclusionQueryResult)); + if (ret != PIPE_OK) + goto fail; /** * In OpenGL, occlusion counter query can be used in conditional @@ -703,17 +707,24 @@ sq->predicate = svga_create_query(pipe, PIPE_QUERY_OCCLUSION_PREDICATE, index); } else { - define_query_vgpu9(svga, sq); + ret = define_query_vgpu9(svga, sq); + if (ret != PIPE_OK) + goto fail; } break; case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: if (svga_have_vgpu10(svga)) { sq->svga_type = SVGA3D_QUERYTYPE_OCCLUSIONPREDICATE; - define_query_vgpu10(svga, sq, sizeof(SVGADXOcclusionPredicateQueryResult)); + ret = define_query_vgpu10(svga, sq, + sizeof(SVGADXOcclusionPredicateQueryResult)); + if (ret != PIPE_OK) + goto fail; } else { sq->svga_type = SVGA3D_QUERYTYPE_OCCLUSION; - define_query_vgpu9(svga, sq); + ret = define_query_vgpu9(svga, sq); + if (ret != PIPE_OK) + goto fail; } break; case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -721,14 +732,18 @@ case PIPE_QUERY_SO_STATISTICS: assert(svga_have_vgpu10(svga)); sq->svga_type = SVGA3D_QUERYTYPE_STREAMOUTPUTSTATS; - define_query_vgpu10(svga, sq, - sizeof(SVGADXStreamOutStatisticsQueryResult)); + ret = define_query_vgpu10(svga, sq, + sizeof(SVGADXStreamOutStatisticsQueryResult)); + if (ret != PIPE_OK) + goto fail; break; case PIPE_QUERY_TIMESTAMP: assert(svga_have_vgpu10(svga)); sq->svga_type = SVGA3D_QUERYTYPE_TIMESTAMP; - define_query_vgpu10(svga, sq, - sizeof(SVGADXTimestampQueryResult)); + ret = define_query_vgpu10(svga, sq, + sizeof(SVGADXTimestampQueryResult)); + if (ret != PIPE_OK) + goto fail; break; case SVGA_QUERY_NUM_DRAW_CALLS: case SVGA_QUERY_NUM_FALLBACKS: diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_sampler.c mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_sampler.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_sampler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_sampler.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_defines.h" #include "util/u_bitmask.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_vertex.c mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_vertex.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_pipe_vertex.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_pipe_vertex.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_defines.h" #include "util/u_bitmask.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_math.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer.c mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -442,7 +442,7 @@ sbuf->b.b.screen = screen; bind_flags = template->bind & ~PIPE_BIND_CUSTOM; - LIST_INITHEAD(&sbuf->surfaces); + list_inithead(&sbuf->surfaces); if (bind_flags & PIPE_BIND_CONSTANT_BUFFER) { /* Constant buffers can only have the PIPE_BIND_CONSTANT_BUFFER diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer.h mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer.h --- mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer.h 2020-06-12 01:21:17.000000000 +0000 @@ -285,7 +285,23 @@ svga->hud.num_buffers_mapped++; if (sws->have_gb_objects) { - return svga->swc->surface_map(svga->swc, sbuf->handle, flags, retry); + struct svga_winsys_context *swc = svga->swc; + boolean rebind; + void *map; + + map = swc->surface_map(swc, sbuf->handle, flags, retry, &rebind); + if (map && rebind) { + enum pipe_error ret; + + ret = SVGA3D_BindGBSurface(swc, sbuf->handle); + if (ret != PIPE_OK) { + svga_context_flush(svga, NULL); + ret = SVGA3D_BindGBSurface(swc, sbuf->handle); + assert(ret == PIPE_OK); + } + svga_context_flush(svga, NULL); + } + return map; } else { *retry = FALSE; return sws->buffer_map(sws, sbuf->hwbuf, flags); diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer_upload.c mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer_upload.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_resource_buffer_upload.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_resource_buffer_upload.c 2020-06-12 01:21:17.000000000 +0000 @@ -322,7 +322,7 @@ bufsurf->key = *key; /* add the surface to the surface list */ - LIST_ADD(&bufsurf->list, &sbuf->surfaces); + list_add(&bufsurf->list, &sbuf->surfaces); /* Set the new bind flags for this buffer resource */ sbuf->bind_flags = bind_flags; @@ -410,7 +410,7 @@ svga_screen_surface_destroy(svga_screen(sbuf->b.b.screen), &bufsurf->key, &bufsurf->handle); - LIST_DEL(&bufsurf->list); + list_del(&bufsurf->list); FREE(bufsurf); } } else { @@ -728,7 +728,7 @@ sbuf->map.num_ranges = 0; assert(sbuf->head.prev && sbuf->head.next); - LIST_DEL(&sbuf->head); /* remove from svga->dirty_buffers list */ + list_del(&sbuf->head); /* remove from svga->dirty_buffers list */ #ifdef DEBUG sbuf->head.next = sbuf->head.prev = NULL; #endif @@ -1065,7 +1065,7 @@ if (ret == PIPE_OK) { sbuf->dma.pending = TRUE; assert(!sbuf->head.prev && !sbuf->head.next); - LIST_ADDTAIL(&sbuf->head, &svga->dirty_buffers); + list_addtail(&sbuf->head, &svga->dirty_buffers); } } else if (ret == PIPE_ERROR_OUT_OF_MEMORY) { diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_resource_texture.c mesa-20.0.8/src/gallium/drivers/svga/svga_resource_texture.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_resource_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_resource_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" #include "os/os_thread.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -133,25 +133,26 @@ } } else { - int y, h, srcy; + int y, h, y_max; unsigned blockheight = util_format_get_blockheight(st->base.resource->format); h = st->hw_nblocksy * blockheight; - srcy = 0; + y_max = st->box.y + st->box.h; - for (y = 0; y < st->box.h; y += h) { + for (y = st->box.y; y < y_max; y += h) { unsigned offset, length; void *hw, *sw; - if (y + h > st->box.h) - h = st->box.h - y; + if (y + h > y_max) + h = y_max - y; /* Transfer band must be aligned to pixel block boundaries */ assert(y % blockheight == 0); assert(h % blockheight == 0); - offset = y * st->base.stride / blockheight; + /* First band starts at the top of the SW buffer. */ + offset = (y - st->box.y) * st->base.stride / blockheight; length = h * st->base.stride / blockheight; sw = (uint8_t *) st->swbuf + offset; @@ -159,9 +160,9 @@ if (transfer == SVGA3D_WRITE_HOST_VRAM) { unsigned usage = PIPE_TRANSFER_WRITE; - /* Wait for the previous DMAs to complete */ - /* TODO: keep one DMA (at half the size) in the background */ - if (y) { + /* Don't write to an in-flight DMA buffer. Synchronize or + * discard in-flight storage. */ + if (y != st->box.y) { svga_context_flush(svga, NULL); usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; } @@ -177,7 +178,7 @@ svga_transfer_dma_band(svga, st, transfer, st->box.x, y, st->box.z, st->box.w, h, st->box.d, - 0, srcy, 0, flags); + 0, 0, 0, flags); /* * Prevent the texture contents to be discarded on the next band @@ -457,10 +458,11 @@ { SVGA3dSize baseLevelSize; uint8_t *map; - boolean retry; + boolean retry, rebind; unsigned offset, mip_width, mip_height; + struct svga_winsys_context *swc = svga->swc; - map = svga->swc->surface_map(svga->swc, surf, usage, &retry); + map = swc->surface_map(swc, surf, usage, &retry, &rebind); if (map == NULL && retry) { /* * At this point, the svga_surfaces_flush() should already have @@ -468,7 +470,18 @@ */ svga->hud.surface_write_flushes++; svga_context_flush(svga, NULL); - map = svga->swc->surface_map(svga->swc, surf, usage, &retry); + map = swc->surface_map(swc, surf, usage, &retry, &rebind); + } + if (map && rebind) { + enum pipe_error ret; + + ret = SVGA3D_BindGBSurface(swc, surf); + if (ret != PIPE_OK) { + svga_context_flush(svga, NULL); + ret = SVGA3D_BindGBSurface(swc, surf); + assert(ret == PIPE_OK); + } + svga_context_flush(svga, NULL); } /* @@ -531,7 +544,7 @@ struct svga_transfer *st; struct svga_winsys_surface *surf = tex->handle; boolean use_direct_map = svga_have_gb_objects(svga) && - !svga_have_gb_dma(svga); + (!svga_have_gb_dma(svga) || (usage & PIPE_TRANSFER_WRITE)); void *map = NULL; int64_t begin = svga_get_time(svga); diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_sampler_view.c mesa-20.0.8/src/gallium/drivers/svga/svga_sampler_view.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_sampler_view.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_sampler_view.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "os/os_thread.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_string.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_screen.c mesa-20.0.8/src/gallium/drivers/svga/svga_screen.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ **********************************************************/ #include "git_sha1.h" /* For MESA_GIT_SHA1 */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_screen.h" @@ -94,7 +94,7 @@ #else build = "build: RELEASE;"; #endif -#ifdef HAVE_LLVM +#ifdef LLVM_AVAILABLE llvm = "LLVM;"; #endif @@ -572,8 +572,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; } @@ -641,8 +639,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; } @@ -744,8 +740,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; default: diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_screen_cache.c mesa-20.0.8/src/gallium/drivers/svga/svga_screen_cache.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_screen_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_screen_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -133,13 +133,13 @@ entry->handle = NULL; /* Remove from hash table */ - LIST_DEL(&entry->bucket_head); + list_del(&entry->bucket_head); /* remove from LRU list */ - LIST_DEL(&entry->head); + list_del(&entry->head); /* Add the cache entry (but not the surface!) to the empty list */ - LIST_ADD(&entry->head, &cache->empty); + list_add(&entry->head, &cache->empty); /* update the cache size */ surf_size = surface_size(&entry->key); @@ -192,9 +192,9 @@ assert(entry->handle); sws->surface_reference(sws, &entry->handle, NULL); - LIST_DEL(&entry->bucket_head); - LIST_DEL(&entry->head); - LIST_ADD(&entry->head, &cache->empty); + list_del(&entry->bucket_head); + list_del(&entry->head); + list_add(&entry->head, &cache->empty); if (cache->total_size <= target_size) { /* all done */ @@ -256,7 +256,7 @@ } } - if (!LIST_IS_EMPTY(&cache->empty)) { + if (!list_is_empty(&cache->empty)) { /* An empty entry has no surface associated with it. * Use the first empty entry. */ @@ -264,9 +264,9 @@ cache->empty.next, head); /* Remove from LRU list */ - LIST_DEL(&entry->head); + list_del(&entry->head); } - else if (!LIST_IS_EMPTY(&cache->unused)) { + else if (!list_is_empty(&cache->unused)) { /* free the last used buffer and reuse its entry */ entry = LIST_ENTRY(struct svga_host_surface_cache_entry, cache->unused.prev, head); @@ -278,10 +278,10 @@ sws->surface_reference(sws, &entry->handle, NULL); /* Remove from hash table */ - LIST_DEL(&entry->bucket_head); + list_del(&entry->bucket_head); /* Remove from LRU list */ - LIST_DEL(&entry->head); + list_del(&entry->head); } if (entry) { @@ -294,9 +294,9 @@ /* If we don't have gb objects, we don't need to invalidate. */ if (sws->have_gb_objects) - LIST_ADD(&entry->head, &cache->validated); + list_add(&entry->head, &cache->validated); else - LIST_ADD(&entry->head, &cache->invalidated); + list_add(&entry->head, &cache->invalidated); cache->total_size += surf_size; } @@ -338,16 +338,16 @@ if (sws->surface_is_flushed(sws, entry->handle)) { /* remove entry from the invalidated list */ - LIST_DEL(&entry->head); + list_del(&entry->head); sws->fence_reference(sws, &entry->fence, fence); /* Add entry to the unused list */ - LIST_ADD(&entry->head, &cache->unused); + list_add(&entry->head, &cache->unused); /* Add entry to the hash table bucket */ bucket = svga_screen_cache_bucket(&entry->key); - LIST_ADD(&entry->bucket_head, &cache->bucket[bucket]); + list_add(&entry->bucket_head, &cache->bucket[bucket]); } curr = next; @@ -364,7 +364,7 @@ if (sws->surface_is_flushed(sws, entry->handle)) { /* remove entry from the validated list */ - LIST_DEL(&entry->head); + list_del(&entry->head); /* It is now safe to invalidate the surface content. * It will be done using the current context. @@ -386,7 +386,7 @@ } /* add the entry to the invalidated list */ - LIST_ADD(&entry->head, &cache->invalidated); + list_add(&entry->head, &cache->invalidated); } curr = next; @@ -436,17 +436,17 @@ (void) mtx_init(&cache->mutex, mtx_plain); for (i = 0; i < SVGA_HOST_SURFACE_CACHE_BUCKETS; ++i) - LIST_INITHEAD(&cache->bucket[i]); + list_inithead(&cache->bucket[i]); - LIST_INITHEAD(&cache->unused); + list_inithead(&cache->unused); - LIST_INITHEAD(&cache->validated); + list_inithead(&cache->validated); - LIST_INITHEAD(&cache->invalidated); + list_inithead(&cache->invalidated); - LIST_INITHEAD(&cache->empty); + list_inithead(&cache->empty); for (i = 0; i < SVGA_HOST_SURFACE_CACHE_SIZE; ++i) - LIST_ADDTAIL(&cache->entries[i].head, &cache->empty); + list_addtail(&cache->entries[i].head, &cache->empty); return PIPE_OK; } diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_shader.c mesa-20.0.8/src/gallium/drivers/svga/svga_shader.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_shader.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "util/u_bitmask.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "svga_context.h" #include "svga_cmd.h" #include "svga_format.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_constants.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_constants.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_constants.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_constants.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ * **********************************************************/ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "pipe/p_defines.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_framebuffer.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_framebuffer.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_framebuffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_framebuffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include "util/u_inlines.h" #include "pipe/p_defines.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "svga_context.h" #include "svga_state.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_fs.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_fs.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_fs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_fs.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "util/u_inlines.h" #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_bitmask.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_rss.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_rss.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_rss.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_rss.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_defines.h" #include "util/u_bitmask.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_math.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_sampler.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_sampler.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_sampler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_sampler.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "pipe/p_defines.h" #include "util/u_bitmask.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_state_tgsi_transform.c mesa-20.0.8/src/gallium/drivers/svga/svga_state_tgsi_transform.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_state_tgsi_transform.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_state_tgsi_transform.c 2020-06-12 01:21:17.000000000 +0000 @@ -69,7 +69,7 @@ const struct tgsi_token *orig_tokens; struct svga_geometry_shader *orig_gs = (struct svga_geometry_shader *)shader; struct svga_geometry_shader *gs = NULL; - struct pipe_shader_state templ; + struct pipe_shader_state templ = {0}; struct svga_stream_output *streamout = NULL; int pos_out_index = -1; int aa_point_coord_index = -1; @@ -131,7 +131,7 @@ tgsi_dump(new_tokens, 0); } - templ.tokens = new_tokens; + pipe_shader_state_from_tgsi(&templ, new_tokens); templ.stream_output.num_outputs = 0; if (streamout) { diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_surface.c mesa-20.0.8/src/gallium/drivers/svga/svga_surface.c --- mesa-19.2.8/src/gallium/drivers/svga/svga_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "util/u_inlines.h" #include "os/os_thread.h" #include "util/u_bitmask.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/svga/svga_winsys.h mesa-20.0.8/src/gallium/drivers/svga/svga_winsys.h --- mesa-19.2.8/src/gallium/drivers/svga/svga_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/svga/svga_winsys.h 2020-06-12 01:21:17.000000000 +0000 @@ -390,7 +390,11 @@ /** * Map a guest-backed surface. + * \param swc The winsys context + * \param surface The surface to map * \param flags bitmaks of PIPE_TRANSFER_x flags + * \param retry Whether to flush and retry the map + * \param rebind Whether to issue an immediate rebind and flush. * * The surface_map() member is allowed to fail due to a * shortage of command buffer space, if the @@ -401,7 +405,8 @@ void * (*surface_map)(struct svga_winsys_context *swc, struct svga_winsys_surface *surface, - unsigned flags, boolean *retry); + unsigned flags, boolean *retry, + boolean *rebind); /** * Unmap a guest-backed surface. diff -Nru mesa-19.2.8/src/gallium/drivers/swr/Makefile.sources mesa-20.0.8/src/gallium/drivers/swr/Makefile.sources --- mesa-19.2.8/src/gallium/drivers/swr/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -128,6 +128,8 @@ rasterizer/core/state.h \ rasterizer/core/state_funcs.h \ rasterizer/core/tessellator.h \ + rasterizer/core/tessellator.hpp \ + rasterizer/core/tessellator.cpp \ rasterizer/core/threads.cpp \ rasterizer/core/threads.h \ rasterizer/core/tilemgr.cpp \ diff -Nru mesa-19.2.8/src/gallium/drivers/swr/meson.build mesa-20.0.8/src/gallium/drivers/swr/meson.build --- mesa-19.2.8/src/gallium/drivers/swr/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2017-2018 Intel Corporation +# Copyright © 2017-2020 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -129,6 +129,8 @@ 'rasterizer/core/state.h', 'rasterizer/core/state_funcs.h', 'rasterizer/core/tessellator.h', + 'rasterizer/core/tessellator.hpp', + 'rasterizer/core/tessellator.cpp', 'rasterizer/core/threads.cpp', 'rasterizer/core/threads.h', 'rasterizer/core/tilemgr.cpp', @@ -191,51 +193,61 @@ endif swr_arch_libs = [] -swr_arch_defines = [] +swr_defines = [] swr_avx_args = cpp.first_supported_argument( - '-mavx', '-target-cpu=sandybridge', '-march=core-avx', '-tp=sandybridge', + '-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge', + '/arch:AVX', ) if swr_avx_args == [] error('Cannot find AVX support for swr. (these are required for SWR an all architectures.)') endif -if with_swr_arches.contains('avx') - swr_arch_defines += '-DHAVE_SWR_AVX' - swr_arch_libs += shared_library( - 'swrAVX', - [files_swr_common, files_swr_arch], - cpp_args : [swr_cpp_args, swr_avx_args, '-DKNOB_ARCH=KNOB_ARCH_AVX'], - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - install : true, - ) + +shared_swr = get_option('shared-swr') +if not shared_swr + if with_swr_arches.length() > 1 + error('When SWR is linked statically only one architecture is allowed.') + endif + swr_defines += '-DHAVE_SWR_BUILTIN' endif -if with_swr_arches.contains('avx2') - swr_avx2_args = cpp.first_supported_argument( - '-march=core-avx2', '-target-cpu=haswell', '-tp=haswell', +if with_swr_arches.contains('skx') + swr_skx_args = cpp.first_supported_argument( + '-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512', ) - if swr_avx2_args == [] - if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c']) - swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c'] - else - error('Cannot find AVX2 support for swr.') - endif + if swr_skx_args == [] + error('Cannot find SKX support for swr.') endif - swr_arch_defines += '-DHAVE_SWR_AVX2' - swr_arch_libs += shared_library( - 'swrAVX2', - [files_swr_common, files_swr_arch], - cpp_args : [swr_cpp_args, swr_avx2_args, '-DKNOB_ARCH=KNOB_ARCH_AVX2'], - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - install : true, - ) + swr_defines += '-DHAVE_SWR_SKX' + if shared_swr + swr_arch_libs += shared_library( + 'swrSKX', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_skx_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX512', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + version : '0.0.0', + soversion : host_machine.system() == 'windows' ? '' : '0', + install : true, + ) + else + swr_arch_libs += static_library( + 'swrSKX', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_skx_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX512', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + ) + endif endif if with_swr_arches.contains('knl') @@ -246,43 +258,114 @@ error('Cannot find KNL support for swr.') endif - swr_arch_defines += '-DHAVE_SWR_KNL' - swr_arch_libs += shared_library( - 'swrKNL', - [files_swr_common, files_swr_arch], - cpp_args : [ - swr_cpp_args, swr_knl_args, '-DKNOB_ARCH=KNOB_ARCH_AVX512', - '-DSIMD_ARCH_KNIGHTS', - ], - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - install : true, - ) + swr_defines += '-DHAVE_SWR_KNL' + if shared_swr + swr_arch_libs += shared_library( + 'swrKNL', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_knl_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + version : '0.0.0', + soversion : host_machine.system() == 'windows' ? '' : '0', + install : true, + ) + else + swr_arch_libs += static_library( + 'swrKNL', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_knl_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + ) + endif endif -if with_swr_arches.contains('skx') - swr_skx_args = cpp.first_supported_argument( - '-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512', + +if with_swr_arches.contains('avx2') + swr_avx2_args = cpp.first_supported_argument( + '-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2', ) - if swr_skx_args == [] - error('Cannot find SKX support for swr.') + if swr_avx2_args == [] + if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c']) + swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c'] + else + error('Cannot find AVX2 support for swr.') + endif endif - swr_arch_defines += '-DHAVE_SWR_SKX' - swr_arch_libs += shared_library( - 'swrSKX', - [files_swr_common, files_swr_arch], - cpp_args : [swr_cpp_args, swr_skx_args, '-DKNOB_ARCH=KNOB_ARCH_AVX512'], - link_args : [ld_args_gc_sections], - include_directories : [swr_incs], - dependencies : [dep_thread, dep_llvm], - version : '0.0.0', - install : true, - ) + swr_defines += '-DHAVE_SWR_AVX2' + if shared_swr + swr_arch_libs += shared_library( + 'swrAVX2', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX2', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + version : '0.0.0', + soversion : host_machine.system() == 'windows' ? '' : '0', + install : true, + ) + else + swr_arch_libs += static_library( + 'swrAVX2', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX2', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + ) + endif endif +if with_swr_arches.contains('avx') + swr_defines += '-DHAVE_SWR_AVX' + if shared_swr + swr_arch_libs += shared_library( + 'swrAVX', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_avx_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + version : '0.0.0', + soversion : host_machine.system() == 'windows' ? '' : '0', + install : true, + ) + else + swr_arch_libs += static_library( + 'swrAVX', + [files_swr_common, files_swr_arch], + cpp_args : [ + cpp_msvc_compat_args, swr_cpp_args, swr_avx_args, + '-DKNOB_ARCH=KNOB_ARCH_AVX', + ], + link_args : [ld_args_gc_sections], + include_directories : [swr_incs], + dependencies : [dep_thread, dep_llvm], + ) + endif +endif + + if swr_arch_libs == [] error('SWR configured, but no SWR architectures configured') endif @@ -292,12 +375,20 @@ 'mesaswr', [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp, gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp], - cpp_args : [cpp_vis_args, swr_cpp_args, swr_avx_args, swr_arch_defines], + cpp_args : [ + cpp_msvc_compat_args, cpp_vis_args, swr_cpp_args, swr_avx_args, + swr_defines, + ], include_directories : [inc_common, swr_incs], dependencies : dep_llvm, ) +link_libs = [libmesaswr] +if not shared_swr + link_libs += swr_arch_libs +endif + driver_swr = declare_dependency( compile_args : '-DGALLIUM_SWR', - link_with : libmesaswr, + link_with : link_libs ) diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py mesa-20.0.8/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py 2020-06-12 01:21:17.000000000 +0000 @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. +# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -43,9 +43,6 @@ //======================================================== void KnobBase::autoExpandEnvironmentVariables(std::string& text) { -#if (__GNUC__) && (GCC_VERSION < 409000) - // isn't implemented prior to gcc-4.9.0 - // unix style variable replacement size_t start; while ((start = text.find("${'${'}")) != std::string::npos) { @@ -64,32 +61,6 @@ const std::string var = GetEnv(text.substr(start + 1, end - start - 1)); text.replace(start, end - start + 1, var); } -#else - { - // unix style variable replacement - static std::regex env("\\$\\{([^}]+?)\\}"); - std::smatch match; - while (std::regex_search(text, match, env)) - { - const std::string var = GetEnv(match[1].str()); - // certain combinations of gcc/libstd++ have problems with this - // text.replace(match[0].first, match[0].second, var); - text.replace(match.prefix().length(), match[0].length(), var); - } - } - { - // win32 style variable replacement - static std::regex env("%([^%]+?)%"); - std::smatch match; - while (std::regex_search(text, match, env)) - { - const std::string var = GetEnv(match[1].str()); - // certain combinations of gcc/libstd++ have problems with this - // text.replace(match[0].first, match[0].second, var); - text.replace(match.prefix().length(), match[0].length(), var); - } - } -#endif } //======================================================== diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/common/os.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/common/os.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/common/os.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/common/os.h 2020-06-12 01:21:17.000000000 +0000 @@ -182,7 +182,7 @@ } // gcc prior to 4.9 doesn't have _mm*_undefined_* -#if (__GNUC__) && (GCC_VERSION < 409000) +#if (__GNUC__) && (GCC_VERSION < 40900) #define _mm_undefined_si128 _mm_setzero_si128 #define _mm256_undefined_ps _mm256_setzero_ps #endif diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/api.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/api.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/api.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/api.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -82,6 +82,7 @@ pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; + pContext->pfnUpdateStreamOut = pCreateInfo->pfnUpdateStreamOut; pContext->hExternalMemory = pCreateInfo->hExternalMemory; @@ -616,9 +617,17 @@ { API_STATE* pState = GetDrawState(GetContext(hContext)); - SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); + SWR_ASSERT((slot < MAX_SO_STREAMS), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); - pState->soBuffer[slot] = *pSoBuffer; + // remember buffer status in case of future resume StreamOut + if ((pState->soBuffer[slot].pBuffer != 0) && (pSoBuffer->pBuffer == 0)) + pState->soPausedBuffer[slot] = pState->soBuffer[slot]; + + // resume + if (pState->soPausedBuffer[slot].pBuffer == pSoBuffer->pBuffer) + pState->soBuffer[slot] = pState->soPausedBuffer[slot]; + else + pState->soBuffer[slot] = *pSoBuffer; } void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc) diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/api.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/api.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/api.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/api.h 2020-06-12 01:21:17.000000000 +0000 @@ -188,6 +188,12 @@ typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats); ////////////////////////////////////////////////////////////////////////// +/// @brief Callback to allow driver to update StreamOut status +/// @param hPrivateContext - handle to private data +/// @param numPrims - number of primitives written to StreamOut buffer +typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims); + +////////////////////////////////////////////////////////////////////////// /// BucketManager /// Forward Declaration (see rdtsc_buckets.h for full definition) ///////////////////////////////////////////////////////////////////////// @@ -272,6 +278,7 @@ PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; PFN_UPDATE_STATS pfnUpdateStats; PFN_UPDATE_STATS_FE pfnUpdateStatsFE; + PFN_UPDATE_STREAMOUT pfnUpdateStreamOut; // Pointer to rdtsc buckets mgr returned to the caller. diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/backend_impl.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/backend_impl.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,6 +29,9 @@ #pragma once #include "tilemgr.h" +#include "state.h" +#include "context.h" + void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); void InitBackendSampleFuncTable( @@ -648,6 +651,7 @@ psContext->pPerspAttribs = work.pPerspAttribs; psContext->frontFace = work.triFlags.frontFacing; psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; + psContext->viewportIndex = work.triFlags.viewportIndex; // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull // attribs diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/binner.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/binner.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/binner.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/binner.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -347,7 +347,8 @@ /// @param oneTileMask - defines triangles for ER to work on /// (tris that fit into ER tile) template -uint32_t SIMDCALL EarlyRasterizer(SIMDBBOX_T& er_bbox, +uint32_t SIMDCALL EarlyRasterizer(DRAW_CONTEXT* pDC, + SIMDBBOX_T& er_bbox, Integer (&vAi)[3], Integer (&vBi)[3], Integer (&vXi)[3], @@ -1025,7 +1026,7 @@ // Try early rasterization triMask = EarlyRasterizer( - er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask); + pDC, er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask); if (!triMask) { diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/context.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/context.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/context.h 2020-06-12 01:21:17.000000000 +0000 @@ -276,6 +276,7 @@ // Streamout state SWR_STREAMOUT_STATE soState; mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; + mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS]; // Tessellation State PFN_HS_FUNC pfnHsFunc; @@ -422,6 +423,7 @@ SWR_STATS_FE statsFE; // Only one FE thread per DC. SWR_STATS* pStats; + uint64_t soPrims; // number of primitives written to StremOut buffer }; // Draw Context @@ -540,6 +542,7 @@ PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; PFN_UPDATE_STATS pfnUpdateStats; PFN_UPDATE_STATS_FE pfnUpdateStatsFE; + PFN_UPDATE_STREAMOUT pfnUpdateStreamOut; // Global Stats diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/frontend.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/frontend.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -589,6 +589,8 @@ } } + pDC->dynState.soPrims += soContext.numPrimsWritten; + UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); @@ -702,8 +704,8 @@ { SWR_ASSERT(stream < MAX_SO_STREAMS); - uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8; - uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U); + uint32_t numInputBytes = AlignUp(numEmittedVerts * 2, 8) / 8; + uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8; for (uint32_t b = 0; b < numOutputBytes; ++b) { @@ -851,29 +853,21 @@ gsContext.inputVertStride = pState->inputVertStride; for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) { - uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; - uint32_t attribSlot = pState->vertexAttribOffset + slot; - pa.Assemble(srcAttribSlot, attrib); + uint32_t attribOffset = slot + pState->vertexAttribOffset; + pa.Assemble(attribOffset, attrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i]; + gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i]; } } - // assemble position - pa.Assemble(VERTEX_POSITION_SLOT, attrib); - for (uint32_t i = 0; i < numVertsPerPrim; ++i) - { - gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i]; - } - // record valid prims from the frontend to avoid over binning the newly generated // prims from the GS #if USE_SIMD16_FRONTEND uint32_t numInputPrims = numPrims_simd8; #else - uint32_t numInputPrims = pa.NumPrims(); + uint32_t numInputPrims = pa.NumPrims(); #endif for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) @@ -1343,6 +1337,13 @@ // Max storage for one attribute for an entire simdprimitive simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; + // Assemble position separately + // TESS_TODO: this could be avoided - fix it + pa.Assemble(VERTEX_POSITION_SLOT, simdattrib); + for (uint32_t i = 0; i < numVertsPerPrim; ++i) { + hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i]; + } + // assemble all attributes for the input primitives for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) { @@ -1370,6 +1371,7 @@ #if defined(_DEBUG) //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); #endif + memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); #if USE_SIMD16_FRONTEND uint32_t numPrims = numPrims_simd8; @@ -1395,7 +1397,7 @@ SWR_TESSELLATION_FACTORS tessFactors; tessFactors = hsContext.pCPout[p].tessFactors; - // Run Tessellator + // Run Tessellator SWR_TS_TESSELLATED_DATA tsData = {0}; RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId); TSTessellate(tsCtx, tessFactors, tsData); diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/state.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/state.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/state.h 2020-06-12 01:21:17.000000000 +0000 @@ -387,6 +387,7 @@ uint32_t frontFace; // IN: front- 1, back- 0 uint32_t sampleIndex; // IN: sampleIndex uint32_t renderTargetArrayIndex; // IN: render target array index from GS + uint32_t viewportIndex; // IN: viewport index from GS uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles @@ -746,13 +747,11 @@ // Total amount of memory to allocate for one instance of the shader output in bytes uint32_t allocationSize; - // Offset to the start of the attributes of the input vertices, in simdvector units, as read by - // the GS + // Offset to start reading data per input vertex in simdvector units. This can be used to + // skip over any vertex data output from the previous stage that is unused in the GS, removing + // unnecessary vertex processing. uint32_t vertexAttribOffset; - // Offset to the attributes as stored by the preceding shader stage. - uint32_t srcVertexAttribOffset; - // Size of the control data section which contains cut or streamID data, in simdscalar units. // Should be sized to handle the maximum number of verts output by the GS. Can be 0 if there are // no cuts or streamID bits. @@ -771,10 +770,7 @@ // shader is expected to store the final vertex count in the first dword of the gs output // stream. uint32_t staticVertexCount; - - uint32_t pad; }; -static_assert(sizeof(SWR_GS_STATE) == 64, "Adjust padding to keep size (or remove this assert)"); ////////////////////////////////////////////////////////////////////////// /// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,2696 @@ +/* + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and + associated documentation files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include "tessellator.hpp" +#if defined(_WIN32) || defined(_WIN64) +#include // ceil +#else +#include +#endif +//#include // Just used for some commented out debug stat printing. +//#include // Ditto. +#define min(x,y) (x < y ? x : y) +#define max(x,y) (x > y ? x : y) + +//================================================================================================================================= +// Some D3D Compliant Float Math (reference rasterizer implements these in RefALU class) +//================================================================================================================================= +// +//--------------------------------------------------------------------------------------------------------------------------------- +// isNaN +//--------------------------------------------------------------------------------------------------------------------------------- +static bool tess_isNaN( float a ) +{ + static const int exponentMask = 0x7f800000; + static const int mantissaMask = 0x007fffff; + int u = *(int*)&a; + return ( ( ( u & exponentMask ) == exponentMask ) && ( u & mantissaMask ) ); // NaN +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// flush (denorm) +//--------------------------------------------------------------------------------------------------------------------------------- +static float tess_flush( float a ) +{ + static const int minNormalizedFloat = 0x00800000; + static const int signBit = 0x80000000; + static const int signBitComplement = 0x7fffffff; + int b = (*(int*)&a) & signBitComplement; // fabs() + if( b < minNormalizedFloat ) // UINT comparison. NaN/INF do test false here + { + b = signBit & (*(int*)&a); + return *(float*)&b; + } + return a; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// IEEE754R min +//--------------------------------------------------------------------------------------------------------------------------------- +static float tess_fmin( float a, float b ) +{ + float _a = tess_flush( a ); + float _b = tess_flush( b ); + if( tess_isNaN( _b ) ) + { + return a; + } + else if( ( _a == 0 ) && ( _b == 0 ) ) + { + return ( (*(int*)&_a) & 0x80000000 ) ? a : b; + } + return _a < _b ? a : b; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// IEEE754R max +//--------------------------------------------------------------------------------------------------------------------------------- +static float tess_fmax( float a, float b ) +{ + float _a = tess_flush( a ); + float _b = tess_flush( b ); + + if( tess_isNaN( _b ) ) + { + return a; + } + else if( ( _a == 0 ) && ( _b == 0 ) ) + { + return ( (*(int*)&_b) & 0x80000000 ) ? a : b; + } + return _a >= _b ? a : b; +} + +//================================================================================================================================= +// Fixed Point Math +//================================================================================================================================= + +//----------------------------------------------------------------------------------------------------------------------------- +// floatToFixedPoint +// +// Convert 32-bit float to 32-bit fixed point integer, using only +// integer arithmetic + bitwise operations. +// +// c_uIBits: UINT8 : Width of i (aka. integer bits) +// c_uFBits: UINT8 : Width of f (aka. fractional bits) +// c_bSigned: bool : Whether the integer bits are a 2's complement signed value +// input: float : All values valid. +// output: INT32 : At most 24 bits from LSB are meaningful, depending +// on the fixed point bit representation chosen (see +// below). Extra bits are sign extended from the most +// meaningful bit. +// +//----------------------------------------------------------------------------------------------------------------------------- + +typedef unsigned char UINT8; +typedef int INT32; +template< const UINT8 c_uIBits, const UINT8 c_uFBits, const bool c_bSigned > +INT32 floatToIDotF( const float& input ) +{ + // ------------------------------------------------------------------------ + // output fixed point format + // 32-bit result: + // + // [sign-extend]i.f + // | | + // MSB(31)...LSB(0) + // + // f fractional part of the number, an unsigned + // value with _fxpFracBitCount bits (defined below) + // + // . implied decimal + // + // i integer part of the number, a 2's complement + // value with _fxpIntBitCount bits (defined below) + // + // [sign-extend] MSB of i conditionally replicated + // + // ------------------------------------------------------------------------ + // Define fixed point bit counts + // + + // Commenting out C_ASSERT below to minimise #includes: + // C_ASSERT( 2 <= c_uIBits && c_uIBits <= 32 && c_uFBits <= 32 && c_uIBits + c_uFBits <= 32 ); + + // Define most negative and most positive fixed point values + const INT32 c_iMinResult = (c_bSigned ? INT32( -1 ) << (c_uIBits + c_uFBits - 1) : 0); + const INT32 c_iMaxResult = ~c_iMinResult; + + // ------------------------------------------------------------------------ + // constant float properties + // ------------------------------------------------------------------------ + const UINT8 _fltMantissaBitCount = 23; + const UINT8 _fltExponentBitCount = 8; + const INT32 _fltExponentBias = (INT32( 1 ) << (_fltExponentBitCount - 1)) - 1; + const INT32 _fltHiddenBit = INT32( 1 ) << _fltMantissaBitCount; + const INT32 _fltMantissaMask = _fltHiddenBit - 1; + const INT32 _fltExponentMask = ((INT32( 1 ) << _fltExponentBitCount) - 1) << _fltMantissaBitCount; + const INT32 _fltSignBit = INT32( 1 ) << (_fltExponentBitCount + _fltMantissaBitCount); + + // ------------------------------------------------------------------------ + // define min and max values as floats (clamp to these bounds) + // ------------------------------------------------------------------------ + INT32 _fxpMaxPosValueFloat; + INT32 _fxpMaxNegValueFloat; + + if (c_bSigned) + { + // The maximum positive fixed point value is 2^(i-1) - 2^(-f). + // The following constructs the floating point bit pattern for this value, + // as long as i >= 2. + _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits - 1) <<_fltMantissaBitCount; + const INT32 iShift = _fltMantissaBitCount + 2 - c_uIBits - c_uFBits; + if (iShift >= 0) + { +// assert( iShift < 32 ); +#pragma warning( suppress : 4293 ) + _fxpMaxPosValueFloat -= INT32( 1 ) << iShift; + } + + // The maximum negative fixed point value is -2^(i-1). + // The following constructs the floating point bit pattern for this value, + // as long as i >= 2. + // We need this number without the sign bit + _fxpMaxNegValueFloat = (_fltExponentBias + c_uIBits - 1) << _fltMantissaBitCount; + } + else + { + // The maximum positive fixed point value is 2^(i) - 2^(-f). + // The following constructs the floating point bit pattern for this value, + // as long as i >= 2. + _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits) <<_fltMantissaBitCount; + const INT32 iShift = _fltMantissaBitCount + 1 - c_uIBits - c_uFBits; + if (iShift >= 0) + { +// assert( iShift < 32 ); +#pragma warning( suppress : 4293 ) + _fxpMaxPosValueFloat -= INT32( 1 ) << iShift; + } + + // The maximum negative fixed point value is 0. + _fxpMaxNegValueFloat = 0; + } + + // ------------------------------------------------------------------------ + // float -> fixed conversion + // ------------------------------------------------------------------------ + + // ------------------------------------------------------------------------ + // examine input float + // ------------------------------------------------------------------------ + INT32 output = *(INT32*)&input; + INT32 unbiasedExponent = ((output & _fltExponentMask) >> _fltMantissaBitCount) - _fltExponentBias; + INT32 isNegative = output & _fltSignBit; + + // ------------------------------------------------------------------------ + // nan + // ------------------------------------------------------------------------ + if (unbiasedExponent == (_fltExponentBias + 1) && (output & _fltMantissaMask)) + { + // nan converts to 0 + output = 0; + } + // ------------------------------------------------------------------------ + // too large positive + // ------------------------------------------------------------------------ + else if (!isNegative && output >= _fxpMaxPosValueFloat) // integer compare + { + output = c_iMaxResult; + } + // ------------------------------------------------------------------------ + // too large negative + // ------------------------------------------------------------------------ + // integer compare + else if (isNegative && (output & ~_fltSignBit) >= _fxpMaxNegValueFloat) + { + output = c_iMinResult; + } + // ------------------------------------------------------------------------ + // too small + // ------------------------------------------------------------------------ + else if (unbiasedExponent < -c_uFBits - 1) + { + // clamp to 0 + output = 0; + } + // ------------------------------------------------------------------------ + // within range + // ------------------------------------------------------------------------ + else + { + // copy mantissa, add hidden bit + output = (output & _fltMantissaMask) | _fltHiddenBit; + + INT32 extraBits = _fltMantissaBitCount - c_uFBits - unbiasedExponent; + if (extraBits >= 0) + { + // 2's complement if negative + if (isNegative) + { + output = ~output + 1; + } + + // From the range checks that led here, it is known that + // unbiasedExponent < c_uIBits. So, at most: + // (a) unbiasedExponent == c_uIBits - 1. + // + // From compile validation above, it is known that + // c_uIBits + c_uFBits <= _fltMantissaBitCount + 1). + // So, at minimum: + // (b) _fltMantissaBitCount == _fxtIntBitCount + c_uFBits - 1 + // + // Substituting (a) and (b) into extraBits calculation above: + // extraBits >= (_fxtIntBitCount + c_uFBits - 1) + // - c_uFBits - (c_uIBits - 1) + // extraBits >= 0 + // + // Thus we only have to worry about shifting right by 0 or more + // bits to get the decimal to the right place, and never have + // to shift left. + + INT32 LSB = 1 << extraBits; // last bit being kept + INT32 extraBitsMask = LSB - 1; + INT32 half = LSB >> 1; // round bias + + // round to nearest-even at LSB + if ((output & LSB) || (output & extraBitsMask) > half) + { + output += half; + } + + // shift off the extra bits (sign extending) + output >>= extraBits; + } + else + { + output <<= -extraBits; + + // 2's complement if negative + if (isNegative) + { + output = ~output + 1; + } + } + } + return output; +} +//----------------------------------------------------------------------------------------------------------------------------- + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_THREE (3<( input ); +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// fixedToFloat +//--------------------------------------------------------------------------------------------------------------------------------- +float fixedToFloat(const FXP& input) +{ + // not worrying about denorm flushing the float operations (the DX spec behavior for div), since the numbers will not be that small during tessellation. + return ((float)(input>>FXP_FRACTION_BITS) + (float)(input&FXP_FRACTION_MASK)/(1< 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || + !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0) ) + { + processedTessFactors.bPatchCulled = true; + return; + } + else + { + processedTessFactors.bPatchCulled = false; + } + + // Clamp edge TessFactors + float lowerBound, upperBound; + switch(m_originalPartitioning) + { + case D3D11_TESSELLATOR_PARTITIONING_INTEGER: + case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: + lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; + break; + } + + tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) ); + tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq0 ) ); + tessFactor_Ueq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq1 ) ); + tessFactor_Veq1 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq1 ) ); + + if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) + { + tessFactor_Ueq0 = ceil(tessFactor_Ueq0); + tessFactor_Veq0 = ceil(tessFactor_Veq0); + tessFactor_Ueq1 = ceil(tessFactor_Ueq1); + tessFactor_Veq1 = ceil(tessFactor_Veq1); + } + + // Clamp inside TessFactors + if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning) + { +#define EPSILON 0.0000152587890625f // 2^(-16), min positive fixed point fraction +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2) + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. + if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (tessFactor_Ueq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (tessFactor_Veq1 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (insideTessFactor_U > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (insideTessFactor_V > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ) + { + // Force picture frame + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; + } + } + + insideTessFactor_U = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_U ) ); + insideTessFactor_V = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor_V ) ); + // Note the above clamps map NaN to lowerBound + + + if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) + { + insideTessFactor_U = ceil(insideTessFactor_U); + insideTessFactor_V = ceil(insideTessFactor_V); + } + + // Reset our vertex and index buffers. We have enough storage for the max tessFactor. + m_NumPoints = 0; + m_NumIndices = 0; + + // Process tessFactors + float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor[QUAD_AXES] = {insideTessFactor_U,insideTessFactor_V}; + int edge, axis; + if( HWIntegerPartitioning() ) + { + for( edge = 0; edge < QUAD_EDGES; edge++ ) + { + int edgeEven = isEven(outsideTessFactor[edge]); + processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + for( axis = 0; axis < QUAD_AXES; axis++ ) + { + processedTessFactors.insideTessFactorParity[axis] = + (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) ) + ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + else + { + for( edge = 0; edge < QUAD_EDGES; edge++ ) + { + processedTessFactors.outsideTessFactorParity[edge] = m_originalParity; + } + processedTessFactors.insideTessFactorParity[U] = processedTessFactors.insideTessFactorParity[V] = m_originalParity; + } + + // Save fixed point TessFactors + for( edge = 0; edge < QUAD_EDGES; edge++ ) + { + processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]); + } + for( axis = 0; axis < QUAD_AXES; axis++ ) + { + processedTessFactors.insideTessFactor[axis] = floatToFixed(insideTessFactor[axis]); + } + + if( HWIntegerPartitioning() || Odd() ) + { + // Special case if all TessFactors are 1 + if( (FXP_ONE == processedTessFactors.insideTessFactor[U]) && + (FXP_ONE == processedTessFactors.insideTessFactor[V]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq1]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Veq1]) ) + { + processedTessFactors.bJustDoMinimumTessFactor = true; + return; + } + } + processedTessFactors.bJustDoMinimumTessFactor = false; + + // Compute TessFactor-specific metadata + for(int edge = 0; edge < QUAD_EDGES; edge++ ) + { + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]); + } + + for(int axis = 0; axis < QUAD_AXES; axis++) + { + SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]); + ComputeTessFactorContext(processedTessFactors.insideTessFactor[axis], processedTessFactors.insideTessFactorCtx[axis]); + } + + // Compute some initial data. + + // outside edge offsets and storage + for(int edge = 0; edge < QUAD_EDGES; edge++ ) + { + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]); + m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge]; + } + m_NumPoints -= 4; + + // inside edge offsets + for(int axis = 0; axis < QUAD_AXES; axis++) + { + SetTessellationParity(processedTessFactors.insideTessFactorParity[axis]); + processedTessFactors.numPointsForInsideTessFactor[axis] = NumPointsForTessFactor(processedTessFactors.insideTessFactor[axis]); + int pointCountMin = ( TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[axis] ) ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + processedTessFactors.numPointsForInsideTessFactor[axis] = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor[axis]); + } + + processedTessFactors.insideEdgePointBaseOffset = m_NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (processedTessFactors.numPointsForInsideTessFactor[U] - 2)*(processedTessFactors.numPointsForInsideTessFactor[V]-2); + m_NumPoints += numInteriorPoints; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::QuadGeneratePoints +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ) +{ + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + int edge; + for(edge = 0; edge < QUAD_EDGES; edge++ ) + { + int parity = edge&0x1; + int startPoint = 0; + int endPoint = processedTessFactors.numPointsForOutsideEdge[edge] - 1; + for(int p = startPoint; p < endPoint; p++,pointOffset++) // don't include end, since next edge starts with it. + { + FXP fxpParam; + int q = ((edge==1)||(edge==2)) ? p : endPoint - p; // reverse order + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam); + if( parity ) + { + DefinePoint(/*U*/fxpParam, + /*V*/(edge == 3) ? FXP_ONE : 0, + /*pointStorageOffset*/pointOffset); + } + else + { + DefinePoint(/*U*/(edge == 2) ? FXP_ONE : 0, + /*V*/fxpParam, + /*pointStorageOffset*/pointOffset); + } + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) spiralling toward center + static const int startRing = 1; + int minNumPointsForTessFactor = min(processedTessFactors.numPointsForInsideTessFactor[U],processedTessFactors.numPointsForInsideTessFactor[V]); + int numRings = (minNumPointsForTessFactor >> 1); // note for even tess we aren't counting center point here. + for(int ring = startRing; ring < numRings; ring++) + { + int startPoint = ring; + int endPoint[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint, + processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint}; + + for(edge = 0; edge < QUAD_EDGES; edge++ ) + { + int parity[QUAD_AXES] = {edge&0x1,((edge+1)&0x1)}; + int perpendicularAxisPoint = (edge < 2) ? startPoint : endPoint[parity[0]]; + FXP fxpPerpParam; + SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[0]]); + PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[0]],perpendicularAxisPoint,fxpPerpParam); + SetTessellationParity(processedTessFactors.insideTessFactorParity[parity[1]]); + for(int p = startPoint; p < endPoint[parity[1]]; p++, pointOffset++) // don't include end: next edge starts with it. + { + FXP fxpParam; + int q = ((edge == 1)||(edge==2)) ? p : endPoint[parity[1]] - (p - startPoint); + PlacePointIn1D(processedTessFactors.insideTessFactorCtx[parity[1]],q,fxpParam); + if( parity[1] ) + { + DefinePoint(/*U*/fxpPerpParam, + /*V*/fxpParam, + /*pointStorageOffset*/pointOffset); + } + else + { + DefinePoint(/*U*/fxpParam, + /*V*/fxpPerpParam, + /*pointStorageOffset*/pointOffset); + } + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) && + (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ) + { + int startPoint = numRings; + int endPoint = processedTessFactors.numPointsForInsideTessFactor[U] - 1 - startPoint; + SetTessellationParity(processedTessFactors.insideTessFactorParity[U]); + for( int p = startPoint; p <= endPoint; p++, pointOffset++ ) + { + FXP fxpParam; + PlacePointIn1D(processedTessFactors.insideTessFactorCtx[U],p,fxpParam); + DefinePoint(/*U*/fxpParam, + /*V*/FXP_ONE_HALF, // middle + /*pointStorageOffset*/pointOffset); + } + } + else if( (processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) && + (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) ) + { + int startPoint = numRings; + int endPoint; + FXP fxpParam; + endPoint = processedTessFactors.numPointsForInsideTessFactor[V] - 1 - startPoint; + SetTessellationParity(processedTessFactors.insideTessFactorParity[V]); + for( int p = endPoint; p >= startPoint; p--, pointOffset++ ) + { + PlacePointIn1D(processedTessFactors.insideTessFactorCtx[V],p,fxpParam); + DefinePoint(/*U*/FXP_ONE_HALF, // middle + /*V*/fxpParam, + /*pointStorageOffset*/pointOffset); + } + } +} +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::QuadGenerateConnectivity +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ) +{ + // Generate primitives for all the concentric rings, one side at a time for each ring + static const int startRing = 1; + int numPointRowsToCenter[QUAD_AXES] = {((processedTessFactors.numPointsForInsideTessFactor[U]+1) >> 1), + ((processedTessFactors.numPointsForInsideTessFactor[V]+1) >> 1)}; // +1 is so even tess includes the center point + int numRings = min(numPointRowsToCenter[U],numPointRowsToCenter[V]); + int degeneratePointRing[QUAD_AXES] = { // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ? numPointRowsToCenter[V] - 1 : -1, + (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U]) ? numPointRowsToCenter[U] - 1 : -1 }; + + const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[QUAD_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0], + &processedTessFactors.outsideTessFactorCtx[Veq0], + &processedTessFactors.outsideTessFactorCtx[Ueq1], + &processedTessFactors.outsideTessFactorCtx[Veq1]}; + TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0], + processedTessFactors.outsideTessFactorParity[Veq0], + processedTessFactors.outsideTessFactorParity[Ueq1], + processedTessFactors.outsideTessFactorParity[Veq1]}; + int numPointsForOutsideEdge[QUAD_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0], + processedTessFactors.numPointsForOutsideEdge[Veq0], + processedTessFactors.numPointsForOutsideEdge[Ueq1], + processedTessFactors.numPointsForOutsideEdge[Veq1]}; + + int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + int edge; + for(int ring = startRing; ring < numRings; ring++) + { + int numPointsForInsideEdge[QUAD_AXES] = {processedTessFactors.numPointsForInsideTessFactor[U] - 2*ring, + processedTessFactors.numPointsForInsideTessFactor[V] - 2*ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for(edge = 0; edge < QUAD_EDGES; edge++ ) + { + int parity = (edge+1)&0x1; + + int numTriangles = numPointsForInsideEdge[parity] + numPointsForOutsideEdge[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + if( edge == 3 ) // We need to patch the indexing so Stitch() can think it sees + // 2 sequentially increasing rows of points, even though we have wrapped around + // to the end of the inner and outer ring's points, so the last point is really + // the first point for the ring. + // We make it so that when Stitch() calls AddIndex(), that function + // will do any necessary index adjustment. + { + if( ring == degeneratePointRing[parity] ) + { + m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset + 1; + m_IndexPatchContext2.cornerCaseBadValue = outsideEdgePointBaseOffset + numPointsForOutsideEdge[edge] - 1; + m_IndexPatchContext2.cornerCaseReplacementValue = edge0OutsidePointBaseOffset; + m_IndexPatchContext2.indexInversionEndPoint = (m_IndexPatchContext2.baseIndexToInvert << 1) - 1; + insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + SetUsingPatchedIndices2(true); + } + else + { + m_IndexPatchContext.insidePointIndexDeltaToRealValue = insideEdgePointBaseOffset; + m_IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge[parity] - 1; + m_IndexPatchContext.insidePointIndexReplacementValue = edge0InsidePointBaseOffset; + m_IndexPatchContext.outsidePointIndexPatchBase = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range + m_IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideEdgePointBaseOffset + - m_IndexPatchContext.outsidePointIndexPatchBase; + m_IndexPatchContext.outsidePointIndexBadValue = m_IndexPatchContext.outsidePointIndexPatchBase + + numPointsForOutsideEdge[edge] - 1; + m_IndexPatchContext.outsidePointIndexReplacementValue = edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase; + SetUsingPatchedIndices(true); + } + } + else if( (edge == 2) && (ring == degeneratePointRing[parity]) ) + { + m_IndexPatchContext2.baseIndexToInvert = insideEdgePointBaseOffset; + m_IndexPatchContext2.cornerCaseBadValue = -1; // unused + m_IndexPatchContext2.cornerCaseReplacementValue = -1; // unused + m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert << 1; + insideBaseOffset = m_IndexPatchContext2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + SetUsingPatchedIndices2(true); + } + else + { + insideBaseOffset = insideEdgePointBaseOffset; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if( ring == startRing ) + { + StitchTransition(/*baseIndexOffset: */m_NumIndices, + insideBaseOffset,processedTessFactors.insideTessFactorCtx[parity].numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity[parity], + outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]); + } + else + { + StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */m_NumIndices, + numPointsForInsideEdge[parity], + insideBaseOffset,outsideBaseOffset); + } + SetUsingPatchedIndices(false); + SetUsingPatchedIndices2(false); + m_NumIndices += numTriangles*3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1; + if( (edge == 2) && (ring == degeneratePointRing[parity]) ) + { + insideEdgePointBaseOffset -= numPointsForInsideEdge[parity] - 1; + } + else + { + insideEdgePointBaseOffset += numPointsForInsideEdge[parity] - 1; + } + numPointsForOutsideEdge[edge] = numPointsForInsideEdge[parity]; + } + if( startRing == ring ) + { + for(edge = 0; edge < QUAD_EDGES; edge++ ) + { + outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx[edge&1]; + outsideTessFactorParity[edge] = processedTessFactors.insideTessFactorParity[edge&1]; + } + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if( (processedTessFactors.numPointsForInsideTessFactor[U] > processedTessFactors.numPointsForInsideTessFactor[V]) && + (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[V] ) ) + { + SetUsingPatchedIndices2(true); + int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[U]>>1) - (processedTessFactors.numPointsForInsideTessFactor[V]>>1))<<1)+ + ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[U] ) ? 2 : 1); + m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 2; + m_IndexPatchContext2.cornerCaseBadValue = m_IndexPatchContext2.baseIndexToInvert; + m_IndexPatchContext2.cornerCaseReplacementValue = outsideEdgePointBaseOffset; + m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert + + m_IndexPatchContext2.baseIndexToInvert + stripNumQuads; + StitchRegular(/*bTrapezoid*/false,DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1, + /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert, + outsideEdgePointBaseOffset+1); + SetUsingPatchedIndices2(false); + m_NumIndices += stripNumQuads*6; + } + else if((processedTessFactors.numPointsForInsideTessFactor[V] >= processedTessFactors.numPointsForInsideTessFactor[U]) && + (TESSELLATOR_PARITY_ODD == processedTessFactors.insideTessFactorParity[U]) ) + { + SetUsingPatchedIndices2(true); + int stripNumQuads = (((processedTessFactors.numPointsForInsideTessFactor[V]>>1) - (processedTessFactors.numPointsForInsideTessFactor[U]>>1))<<1)+ + ((TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V] ) ? 2 : 1); + m_IndexPatchContext2.baseIndexToInvert = outsideEdgePointBaseOffset + stripNumQuads + 1; + m_IndexPatchContext2.cornerCaseBadValue = -1; // unused + m_IndexPatchContext2.indexInversionEndPoint = m_IndexPatchContext2.baseIndexToInvert + + m_IndexPatchContext2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = (TESSELLATOR_PARITY_EVEN == processedTessFactors.insideTessFactorParity[V]) ? + DIAGONALS_INSIDE_TO_OUTSIDE : DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE; + StitchRegular(/*bTrapezoid*/false,diag, + /*baseIndexOffset: */m_NumIndices, /*numInsideEdgePoints:*/stripNumQuads+1, + /*insideEdgePointBaseOffset*/m_IndexPatchContext2.baseIndexToInvert, + outsideEdgePointBaseOffset); + SetUsingPatchedIndices2(false); + m_NumIndices += stripNumQuads*6; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::TessellateTriDomain +// User calls this +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, + float insideTessFactor ) +{ + PROCESSED_TESS_FACTORS_TRI processedTessFactors; + TriProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactor,processedTessFactors); + + if( processedTessFactors.bPatchCulled ) + { + m_NumPoints = 0; + m_NumIndices = 0; + return; + } + else if( processedTessFactors.bJustDoMinimumTessFactor ) + { + DefinePoint(/*U*/0,/*V*/FXP_ONE,/*pointStorageOffset*/0); //V=1 (beginning of Ueq0 edge VW) + DefinePoint(/*U*/0,/*V*/0,/*pointStorageOffset*/1); //W=1 (beginning of Veq0 edge WU) + DefinePoint(/*U*/FXP_ONE,/*V*/0,/*pointStorageOffset*/2); //U=1 (beginning of Weq0 edge UV) + m_NumPoints = 3; + + switch(m_outputPrimitive) + { + case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW: + case D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW: + // function orients them CCW if needed + DefineClockwiseTriangle(0,1,2,/*indexStorageBaseOffset*/m_NumIndices); + m_NumIndices = 3; + break; + case D3D11_TESSELLATOR_OUTPUT_POINT: + DumpAllPoints(); + break; + case D3D11_TESSELLATOR_OUTPUT_LINE: + DumpAllPointsAsInOrderLineList(); + break; + } + return; + } + + TriGeneratePoints(processedTessFactors); + + if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) + { + DumpAllPoints(); + return; + } + if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_LINE ) + { + DumpAllPointsAsInOrderLineList(); + return; + } + + TriGenerateConnectivity(processedTessFactors); // can be done in parallel to TriGeneratePoints() +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::TriProcessTessFactors +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::TriProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, + float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) +{ + // Is the patch culled? + if( !(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || + !(tessFactor_Weq0 > 0) ) + { + processedTessFactors.bPatchCulled = true; + return; + } + else + { + processedTessFactors.bPatchCulled = false; + } + + // Clamp edge TessFactors + float lowerBound, upperBound; + switch(m_originalPartitioning) + { + case D3D11_TESSELLATOR_PARTITIONING_INTEGER: + case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: + lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; + break; + } + + tessFactor_Ueq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Ueq0 ) ); + tessFactor_Veq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Veq0 ) ); + tessFactor_Weq0 = tess_fmin( upperBound, tess_fmax( lowerBound, tessFactor_Weq0 ) ); + + if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) + { + tessFactor_Ueq0 = ceil(tessFactor_Ueq0); + tessFactor_Veq0 = ceil(tessFactor_Veq0); + tessFactor_Weq0 = ceil(tessFactor_Weq0); + } + + // Clamp inside TessFactors + if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == m_originalPartitioning) + { + if( (tessFactor_Ueq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (tessFactor_Veq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) || + (tessFactor_Weq0 > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON)) + // Don't need the same check for insideTessFactor for tri patches, + // since there is only one insideTessFactor, as opposed to quad + // patches which have 2 insideTessFactors. + { + // Force picture frame + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; + } + } + + insideTessFactor = tess_fmin( upperBound, tess_fmax( lowerBound, insideTessFactor ) ); + // Note the above clamps map NaN to lowerBound + + if( HWIntegerPartitioning()) // pow2 or integer, round to next int (hw doesn't care about pow2 distinction) + { + insideTessFactor = ceil(insideTessFactor); + } + + // Reset our vertex and index buffers. We have enough storage for the max tessFactor. + m_NumPoints = 0; + m_NumIndices = 0; + + // Process tessFactors + float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0}; + int edge; + if( HWIntegerPartitioning() ) + { + for( edge = 0; edge < TRI_EDGES; edge++ ) + { + int edgeEven = isEven(outsideTessFactor[edge]); + processedTessFactors.outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + processedTessFactors.insideTessFactorParity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor)) + ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + else + { + for( edge = 0; edge < TRI_EDGES; edge++ ) + { + processedTessFactors.outsideTessFactorParity[edge] = m_originalParity; + } + processedTessFactors.insideTessFactorParity = m_originalParity; + } + + // Save fixed point TessFactors + for( edge = 0; edge < TRI_EDGES; edge++ ) + { + processedTessFactors.outsideTessFactor[edge] = floatToFixed(outsideTessFactor[edge]); + } + processedTessFactors.insideTessFactor = floatToFixed(insideTessFactor); + + if( HWIntegerPartitioning() || Odd() ) + { + // Special case if all TessFactors are 1 + if( (FXP_ONE == processedTessFactors.insideTessFactor) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Ueq0]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Veq0]) && + (FXP_ONE == processedTessFactors.outsideTessFactor[Weq0]) ) + { + processedTessFactors.bJustDoMinimumTessFactor = true; + return; + } + } + processedTessFactors.bJustDoMinimumTessFactor = false; + + // Compute per-TessFactor metadata + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + ComputeTessFactorContext(processedTessFactors.outsideTessFactor[edge], processedTessFactors.outsideTessFactorCtx[edge]); + } + SetTessellationParity(processedTessFactors.insideTessFactorParity); + ComputeTessFactorContext(processedTessFactors.insideTessFactor, processedTessFactors.insideTessFactorCtx); + + // Compute some initial data. + + // outside edge offsets and storage + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + processedTessFactors.numPointsForOutsideEdge[edge] = NumPointsForTessFactor(processedTessFactors.outsideTessFactor[edge]); + m_NumPoints += processedTessFactors.numPointsForOutsideEdge[edge]; + } + m_NumPoints -= 3; + + // inside edge offsets + SetTessellationParity(processedTessFactors.insideTessFactorParity); + processedTessFactors.numPointsForInsideTessFactor = NumPointsForTessFactor(processedTessFactors.insideTessFactor); + { + int pointCountMin = Odd() ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + processedTessFactors.numPointsForInsideTessFactor = max(pointCountMin,processedTessFactors.numPointsForInsideTessFactor); + } + + processedTessFactors.insideEdgePointBaseOffset = m_NumPoints; + + // inside storage, including interior edges above + { + int numInteriorRings = (processedTessFactors.numPointsForInsideTessFactor >> 1) - 1; + int numInteriorPoints; + if( Odd() ) + { + numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1) - numInteriorRings); + } + else + { + numInteriorPoints = TRI_EDGES*(numInteriorRings*(numInteriorRings+1)) + 1; + } + m_NumPoints += numInteriorPoints; + } + +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::TriGeneratePoints +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) +{ + // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge) + int pointOffset = 0; + int edge; + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + int parity = edge&0x1; + int startPoint = 0; + int endPoint = processedTessFactors.numPointsForOutsideEdge[edge] - 1; + for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end, since next edge starts with it. + { + FXP fxpParam; + int q = (parity) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit): + // edge0, VW, has V decreasing, so reverse 1D points below + // edge1, WU, has U increasing, so don't reverse 1D points below + // edge2, UV, has U decreasing, so reverse 1D points below + SetTessellationParity(processedTessFactors.outsideTessFactorParity[edge]); + PlacePointIn1D(processedTessFactors.outsideTessFactorCtx[edge],q,fxpParam); + if( edge == 0 ) + { + DefinePoint(/*U*/0, + /*V*/fxpParam, + /*pointStorageOffset*/pointOffset); + } + else + { + DefinePoint(/*U*/fxpParam, + /*V*/(edge == 2) ? FXP_ONE - fxpParam : 0, + /*pointStorageOffset*/pointOffset); + } + } + } + + // Generate interior ring points, clockwise spiralling in + SetTessellationParity(processedTessFactors.insideTessFactorParity); + static const int startRing = 1; + int numRings = (processedTessFactors.numPointsForInsideTessFactor >> 1); + for(int ring = startRing; ring < numRings; ring++) + { + int startPoint = ring; + int endPoint = processedTessFactors.numPointsForInsideTessFactor - 1 - startPoint; + + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + int parity = edge&0x1; + int perpendicularAxisPoint = startPoint; + FXP fxpPerpParam; + PlacePointIn1D(processedTessFactors.insideTessFactorCtx,perpendicularAxisPoint,fxpPerpParam); + fxpPerpParam *= FXP_TWO_THIRDS; // Map location to the right size in barycentric space. + // I (amarp) can draw a picture to explain. + // We know this fixed point math won't over/underflow + fxpPerpParam = (fxpPerpParam+FXP_ONE_HALF/*round*/)>>FXP_FRACTION_BITS; // get back to n.16 + for(int p = startPoint; p < endPoint; p++, pointOffset++) // don't include end: next edge starts with it. + { + FXP fxpParam; + int q = (parity) ? p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit): + // edge0, VW, has V decreasing, so reverse 1D points below + // edge1, WU, has U increasing, so don't reverse 1D points below + // edge2, UV, has U decreasing, so reverse 1D points below + PlacePointIn1D(processedTessFactors.insideTessFactorCtx,q,fxpParam); + // edge0 VW, has perpendicular parameter U constant + // edge1 WU, has perpendicular parameter V constant + // edge2 UV, has perpendicular parameter W constant + const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle + switch(edge) + { + case 0: + DefinePoint(/*U*/fxpPerpParam, + /*V*/fxpParam - (fxpPerpParam+1/*round*/)/deriv, // we know this fixed point math won't over/underflow + /*pointStorageOffset*/pointOffset); + break; + case 1: + DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow + /*V*/fxpPerpParam, + /*pointStorageOffset*/pointOffset); + break; + case 2: + DefinePoint(/*U*/fxpParam - (fxpPerpParam+1/*round*/)/deriv,// we know this fixed point math won't over/underflow + /*V*/FXP_ONE - (fxpParam - (fxpPerpParam+1/*round*/)/deriv) - fxpPerpParam,// we know this fixed point math won't over/underflow + /*pointStorageOffset*/pointOffset); + break; + } + } + } + } + if( !Odd() ) + { + // Last point is the point at the center. + DefinePoint(/*U*/FXP_ONE_THIRD, + /*V*/FXP_ONE_THIRD, + /*pointStorageOffset*/pointOffset); + } +} +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::TriGenerateConnectivity +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ) +{ + // Generate primitives for all the concentric rings, one side at a time for each ring + static const int startRing = 1; + int numRings = ((processedTessFactors.numPointsForInsideTessFactor+1) >> 1); // +1 is so even tess includes the center point, which we want to now + const TESS_FACTOR_CONTEXT* outsideTessFactorCtx[TRI_EDGES] = {&processedTessFactors.outsideTessFactorCtx[Ueq0], + &processedTessFactors.outsideTessFactorCtx[Veq0], + &processedTessFactors.outsideTessFactorCtx[Weq0]}; + TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES] = {processedTessFactors.outsideTessFactorParity[Ueq0], + processedTessFactors.outsideTessFactorParity[Veq0], + processedTessFactors.outsideTessFactorParity[Weq0]}; + int numPointsForOutsideEdge[TRI_EDGES] = {processedTessFactors.numPointsForOutsideEdge[Ueq0], + processedTessFactors.numPointsForOutsideEdge[Veq0], + processedTessFactors.numPointsForOutsideEdge[Weq0]}; + + int insideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + int edge; + for(int ring = startRing; ring < numRings; ring++) + { + int numPointsForInsideEdge = processedTessFactors.numPointsForInsideTessFactor - 2*ring; + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + int numTriangles = numPointsForInsideEdge + numPointsForOutsideEdge[edge] - 2; + + int insideBaseOffset; + int outsideBaseOffset; + if( edge == 2 ) + { + m_IndexPatchContext.insidePointIndexDeltaToRealValue = insideEdgePointBaseOffset; + m_IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1; + m_IndexPatchContext.insidePointIndexReplacementValue = edge0InsidePointBaseOffset; + m_IndexPatchContext.outsidePointIndexPatchBase = m_IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range + m_IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideEdgePointBaseOffset + - m_IndexPatchContext.outsidePointIndexPatchBase; + m_IndexPatchContext.outsidePointIndexBadValue = m_IndexPatchContext.outsidePointIndexPatchBase + + numPointsForOutsideEdge[edge] - 1; + m_IndexPatchContext.outsidePointIndexReplacementValue = edge0OutsidePointBaseOffset; + SetUsingPatchedIndices(true); + insideBaseOffset = 0; + outsideBaseOffset = m_IndexPatchContext.outsidePointIndexPatchBase; + } + else + { + insideBaseOffset = insideEdgePointBaseOffset; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if( ring == startRing ) + { + StitchTransition(/*baseIndexOffset: */m_NumIndices, + insideBaseOffset,processedTessFactors.insideTessFactorCtx.numHalfTessFactorPoints,processedTessFactors.insideTessFactorParity, + outsideBaseOffset,outsideTessFactorCtx[edge]->numHalfTessFactorPoints,outsideTessFactorParity[edge]); + } + else + { + StitchRegular(/*bTrapezoid*/true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */m_NumIndices, + numPointsForInsideEdge, + insideBaseOffset,outsideBaseOffset); + } + if( 2 == edge ) + { + SetUsingPatchedIndices(false); + } + m_NumIndices += numTriangles*3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge[edge] - 1; + insideEdgePointBaseOffset += numPointsForInsideEdge - 1; + numPointsForOutsideEdge[edge] = numPointsForInsideEdge; + } + if( startRing == ring ) + { + for(edge = 0; edge < TRI_EDGES; edge++ ) + { + outsideTessFactorCtx[edge] = &processedTessFactors.insideTessFactorCtx; + outsideTessFactorParity[edge] = processedTessFactors.insideTessFactorParity; + } + } + } + if( Odd() ) + { + // Triangulate center (a single triangle) + DefineClockwiseTriangle(outsideEdgePointBaseOffset, outsideEdgePointBaseOffset+1, outsideEdgePointBaseOffset+2, + m_NumIndices); + m_NumIndices += 3; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::TessellateIsoLineDomain +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::TessellateIsoLineDomain( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ) +{ + PROCESSED_TESS_FACTORS_ISOLINE processedTessFactors; + IsoLineProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail,processedTessFactors); + if( processedTessFactors.bPatchCulled ) + { + m_NumPoints = 0; + m_NumIndices = 0; + return; + } + IsoLineGeneratePoints(processedTessFactors); + IsoLineGenerateConnectivity(processedTessFactors); // can be done in parallel to IsoLineGeneratePoints +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::IsoLineProcessTessFactors +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail, + PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) +{ + // Is the patch culled? + if( !(TessFactor_V_LineDensity > 0) || // NaN will pass + !(TessFactor_U_LineDetail > 0) ) + { + processedTessFactors.bPatchCulled = true; + return; + } + else + { + processedTessFactors.bPatchCulled = false; + } + + // Clamp edge TessFactors + float lowerBound, upperBound; + switch(m_originalPartitioning) + { + case D3D11_TESSELLATOR_PARTITIONING_INTEGER: + case D3D11_TESSELLATOR_PARTITIONING_POW2: // don�t care about pow2 distinction for validation, just treat as integer + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: + lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR; + break; + + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: + lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR; + upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR; + break; + } + + TessFactor_V_LineDensity = tess_fmin( D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR, + tess_fmax( D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, TessFactor_V_LineDensity ) ); + TessFactor_U_LineDetail = tess_fmin( upperBound, tess_fmax( lowerBound, TessFactor_U_LineDetail ) ); + + // Reset our vertex and index buffers. We have enough storage for the max tessFactor. + m_NumPoints = 0; + m_NumIndices = 0; + + // Process tessFactors + if( HWIntegerPartitioning() ) + { + TessFactor_U_LineDetail = ceil(TessFactor_U_LineDetail); + processedTessFactors.lineDetailParity = isEven(TessFactor_U_LineDetail) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + else + { + processedTessFactors.lineDetailParity = m_originalParity; + } + + FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); + + SetTessellationParity(processedTessFactors.lineDetailParity); + + ComputeTessFactorContext(fxpTessFactor_U_LineDetail, processedTessFactors.lineDetailTessFactorCtx); + processedTessFactors.numPointsPerLine = NumPointsForTessFactor(fxpTessFactor_U_LineDetail); + + OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER); + + TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); + processedTessFactors.lineDensityParity = isEven(TessFactor_V_LineDensity) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + SetTessellationParity(processedTessFactors.lineDensityParity); + FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); + ComputeTessFactorContext(fxpTessFactor_V_LineDensity, processedTessFactors.lineDensityTessFactorCtx); + + processedTessFactors.numLines = NumPointsForTessFactor(fxpTessFactor_V_LineDensity) - 1; // don't draw last line at V == 1. + + RestorePartitioning(); + + // Compute some initial data. + + // outside edge offsets + m_NumPoints = processedTessFactors.numPointsPerLine * processedTessFactors.numLines; + if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) + { + m_NumIndices = m_NumPoints; + } + else // line + { + m_NumIndices = processedTessFactors.numLines*(processedTessFactors.numPointsPerLine-1)*2; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::IsoLineGeneratePoints +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) +{ + int line, pointOffset; + for(line = 0, pointOffset = 0; line < processedTessFactors.numLines; line++) + { + for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) + { + FXP fxpU,fxpV; + SetTessellationParity(processedTessFactors.lineDensityParity); + PlacePointIn1D(processedTessFactors.lineDensityTessFactorCtx,line,fxpV); + + SetTessellationParity(processedTessFactors.lineDetailParity); + PlacePointIn1D(processedTessFactors.lineDetailTessFactorCtx,point,fxpU); + + DefinePoint(fxpU,fxpV,pointOffset++); + } + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::IsoLineGenerateConnectivity +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ) +{ + int line, pointOffset, indexOffset; + if( m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_POINT ) + { + for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++) + { + for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) + { + DefineIndex(pointOffset++,indexOffset++); + } + } + } + else // line + { + for(line = 0, pointOffset = 0, indexOffset = 0; line < processedTessFactors.numLines; line++) + { + for(int point = 0; point < processedTessFactors.numPointsPerLine; point++) + { + if( point > 0 ) + { + DefineIndex(pointOffset-1,indexOffset++); + DefineIndex(pointOffset,indexOffset++); + } + pointOffset++; + } + } + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::GetPointCount +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +int CHWTessellator::GetPointCount() +{ + return m_NumPoints; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::GetIndexCount() +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +int CHWTessellator::GetIndexCount() +{ + return m_NumIndices; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::GetPoints() +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +DOMAIN_POINT* CHWTessellator::GetPoints() +{ + return m_Point; +} +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::GetIndices() +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +int* CHWTessellator::GetIndices() +{ + return m_Index; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::DefinePoint() +//--------------------------------------------------------------------------------------------------------------------------------- +int CHWTessellator::DefinePoint(FXP fxpU, FXP fxpV, int pointStorageOffset) +{ +// WCHAR foo[80]; +// StringCchPrintf(foo,80,L"off:%d, uv=(%f,%f)\n",pointStorageOffset,fixedToFloat(fxpU),fixedToFloat(fxpV)); +// OutputDebugString(foo); + m_Point[pointStorageOffset].u = fixedToFloat(fxpU); + m_Point[pointStorageOffset].v = fixedToFloat(fxpV); + return pointStorageOffset; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::DefineIndex() +//-------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::DefineIndex(int index, int indexStorageOffset) +{ + index = PatchIndexValue(index); +// WCHAR foo[80]; +// StringCchPrintf(foo,80,L"off:%d, idx=%d, uv=(%f,%f)\n",indexStorageOffset,index,m_Point[index].u,m_Point[index].v); +// OutputDebugString(foo); + m_Index[indexStorageOffset] = index; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::DefineClockwiseTriangle() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset) +{ + // inputs a clockwise triangle, stores a CW or CCW triangle depending on the state + DefineIndex(index0,indexStorageBaseOffset); + bool bWantClockwise = (m_outputPrimitive == D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW) ? true : false; + if( bWantClockwise ) + { + DefineIndex(index1,indexStorageBaseOffset+1); + DefineIndex(index2,indexStorageBaseOffset+2); + } + else + { + DefineIndex(index2,indexStorageBaseOffset+1); + DefineIndex(index1,indexStorageBaseOffset+2); + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::DumpAllPoints() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::DumpAllPoints() +{ + for( int p = 0; p < m_NumPoints; p++ ) + { + DefineIndex(p,m_NumIndices++); + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::DumpAllPointsAsInOrderLineList() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::DumpAllPointsAsInOrderLineList() +{ + for( int p = 1; p < m_NumPoints; p++ ) + { + DefineIndex(p-1,m_NumIndices++); + DefineIndex(p,m_NumIndices++); + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// RemoveMSB +//--------------------------------------------------------------------------------------------------------------------------------- +int RemoveMSB(int val) +{ + int check; + if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; } + else { check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000; } + for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return (val & ~check); } + return 0; +} +//--------------------------------------------------------------------------------------------------------------------------------- +// GetMSB +//--------------------------------------------------------------------------------------------------------------------------------- +int GetMSB(int val) +{ + int check; + if( val <= 0x0000ffff ) { check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000; } + else { check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000; } + for( int i = 0; i < 8; i++, check >>= 1 ) { if( val & check ) return check; } + return 0; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::CleanseParameter() +//--------------------------------------------------------------------------------------------------------------------------------- +/* NOTHING TO DO FOR FIXED POINT ARITHMETIC! +void CHWTessellator::CleanseParameter(float& parameter) +{ + // Clean up [0..1] parameter to guarantee that (1 - (1 - parameter)) == parameter. + parameter = 1.0f - parameter; + parameter = 1.0f - parameter; + +} +*/ +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::NumPointsForTessFactor() +//--------------------------------------------------------------------------------------------------------------------------------- +int CHWTessellator::NumPointsForTessFactor( FXP fxpTessFactor ) +{ + int numPoints; + if( Odd() ) + { + numPoints = (fxpCeil(FXP_ONE_HALF + (fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS; + } + else + { + numPoints = ((fxpCeil((fxpTessFactor+1/*round*/)/2)*2)>>FXP_FRACTION_BITS)+1; + } + return numPoints; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::ComputeTessFactorContext() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx ) +{ + FXP fxpHalfTessFactor = (fxpTessFactor+1/*round*/)/2; + if( Odd() || (fxpHalfTessFactor == FXP_ONE_HALF)) // fxpHalfTessFactor == 1/2 if TessFactor is 1, but we're pretending we are even. + { + fxpHalfTessFactor += FXP_ONE_HALF; + } + FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); + FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); + TessFactorCtx.fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; + //CleanseParameter(TessFactorCtx.fxpHalfTessFactorFraction); + TessFactorCtx.numHalfTessFactorPoints = (fxpCeilHalfTessFactor>>FXP_FRACTION_BITS); // for EVEN, we don't include the point always fixed at the midpoint of the TessFactor + if( fxpCeilHalfTessFactor == fxpFloorHalfTessFactor ) + { + TessFactorCtx.splitPointOnFloorHalfTessFactor = /*pick value to cause this to be ignored*/ TessFactorCtx.numHalfTessFactorPoints+1; + } + else if( Odd() ) + { + if( fxpFloorHalfTessFactor == FXP_ONE ) + { + TessFactorCtx.splitPointOnFloorHalfTessFactor = 0; + } + else + { +#ifdef ALLOW_XBOX_360_COMPARISON + if( m_bXBox360Mode ) + TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-2; + else +#endif + TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB((fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)-1)<<1) + 1; + } + } + else + { +#ifdef ALLOW_XBOX_360_COMPARISON + if( m_bXBox360Mode ) + TessFactorCtx.splitPointOnFloorHalfTessFactor = TessFactorCtx.numHalfTessFactorPoints-1; + else +#endif + TessFactorCtx.splitPointOnFloorHalfTessFactor = (RemoveMSB(fxpFloorHalfTessFactor>>FXP_FRACTION_BITS)<<1) + 1; + } + int numFloorSegments = (fxpFloorHalfTessFactor * 2)>>FXP_FRACTION_BITS; + int numCeilSegments = (fxpCeilHalfTessFactor * 2)>>FXP_FRACTION_BITS; + if( Odd() ) + { + numFloorSegments -= 1; + numCeilSegments -= 1; + } + TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor = s_fixedReciprocal[numFloorSegments]; + TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor = s_fixedReciprocal[numCeilSegments]; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::PlacePointIn1D() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation ) +{ + bool bFlip; + if( point >= TessFactorCtx.numHalfTessFactorPoints ) + { + point = (TessFactorCtx.numHalfTessFactorPoints << 1) - point; + if( Odd() ) + { + point -= 1; + } + bFlip = true; + } + else + { + bFlip = false; + } + if( point == TessFactorCtx.numHalfTessFactorPoints ) + { + fxpLocation = FXP_ONE_HALF; // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly + return; + } + unsigned int indexOnCeilHalfTessFactor = point; + unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; + if( point > TessFactorCtx.splitPointOnFloorHalfTessFactor ) + { + indexOnFloorHalfTessFactor -= 1; + } + // For the fixed point multiplies below, we know the results are <= 16 bits because + // the locations on the halfTessFactor are <= half the number of segments for the total TessFactor. + // So a number divided by a number that is at least twice as big will give + // a result no bigger than 0.5 (which in fixed point is 16 bits in our case) + FXP fxpLocationOnFloorHalfTessFactor = indexOnFloorHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpLocationOnCeilHalfTessFactor = indexOnCeilHalfTessFactor * TessFactorCtx.fxpInvNumSegmentsOnCeilTessFactor; + + // Since we know the numbers calculated above are <= fixed point 0.5, and the equation + // below is just lerping between two values <= fixed point 0.5 (0x00008000), then we know + // that the final result before shifting by 16 bits is no larger than 0x80000000. Once we + // shift that down by 16, we get the result of lerping 2 numbers <= 0.5, which is obviously + // at most 0.5 (0x00008000) + fxpLocation = fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx.fxpHalfTessFactorFraction) + + fxpLocationOnCeilHalfTessFactor * (TessFactorCtx.fxpHalfTessFactorFraction); + fxpLocation = (fxpLocation + FXP_ONE_HALF/*round*/) >> FXP_FRACTION_BITS; // get back to n.16 + /* Commenting out floating point version. Note the parameter cleansing it does is not needed in fixed point. + if( bFlip ) + location = 1.0f - location; // complement produces cleansed result. + else + CleanseParameter(location); + */ + if( bFlip ) + { + fxpLocation = FXP_ONE - fxpLocation; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::StitchRegular +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::StitchRegular(bool bTrapezoid,DIAGONALS diagonals, + int baseIndexOffset, int numInsideEdgePoints, + int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) +{ + int insidePoint = insideEdgePointBaseOffset; + int outsidePoint = outsideEdgePointBaseOffset; + if( bTrapezoid ) + { + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; outsidePoint++; + } + int p; + switch( diagonals ) + { + case DIAGONALS_INSIDE_TO_OUTSIDE: + // Diagonals pointing from inside edge forward towards outside edge + for( p = 0; p < numInsideEdgePoints-1; p++ ) + { + DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + + DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; + } + break; + case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation + // Diagonals pointing from outside edge forward towards inside edge + + // First half + for( p = 0; p < numInsideEdgePoints/2-1; p++ ) + { + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; + } + + // Middle + DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; p+=2; + + // Second half + for( ; p < numInsideEdgePoints; p++ ) + { + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of inside edge + for( p = 0; p < numInsideEdgePoints/2; p++ ) + { + DefineClockwiseTriangle(outsidePoint,insidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of outside edge + for( ; p < numInsideEdgePoints-1; p++ ) + { + DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(insidePoint,outsidePoint+1,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; outsidePoint++; + } + break; + } + if( bTrapezoid ) + { + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::StitchTransition() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHWTessellator::StitchTransition(int baseIndexOffset, + int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints, + TESSELLATOR_PARITY insideEdgeTessFactorParity, + int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints, + TESSELLATOR_PARITY outsideTessFactorParity +) +{ + +#ifdef ALLOW_XBOX_360_COMPARISON + // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors. + // The stitching order is governed by Ruler Function vertex split ordering (see external documentation). + // + // The contents of the finalPointPositionTable are where vertex i [0..32] ends up on the half-edge + // at the max tessellation amount given ruler-function split order. + // Recall the other half of an edge is mirrored, so we only need to deal with one half. + // This table is used to decide when to advance a point on the interior or exterior. + // It supports odd TessFactor up to 65 and even TessFactor up to 64. + static const int _finalPointPositionTable[33] = + { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23, + 1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 }; + // The loopStart and loopEnd tables below just provide optimal loop bounds for the + // stitching algorithm further below, for any given halfTssFactor. + // There is probably a better way to encode this... + + // loopStart[halfTessFactor] encodes the FIRST entry other that [0] in finalPointPositionTable[] above which is + // less than halfTessFactor. Exceptions are entry 0 and 1, which are set up to skip the loop. + static const int _loopStart[33] = + {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + // loopStart[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is + // less than halfTessFactor. Exceptions are entry 0 and 1, which are set up to skip the loop. + static const int _loopEnd[33] = + {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32}; + const int* finalPointPositionTable; + const int* loopStart; + const int* loopEnd; + if( m_bXBox360Mode ) + { + // The XBox360 vertex introduction order is always from the center of the edge. + // So the final positions of points on the half-edge are this trivial table. + static const int XBOXfinalPointPositionTable[33] = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 }; + // loopStart and loopEnd (meaning described above) also become trivial for XBox360 splitting. + static const int XBOXloopStart[33] = + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; + static const int XBOXloopEnd[33] = + {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + + finalPointPositionTable = XBOXfinalPointPositionTable; + loopStart = XBOXloopStart; + loopEnd = XBOXloopEnd; + } + else + { + finalPointPositionTable = _finalPointPositionTable; + loopStart = _loopStart; + loopEnd =_loopEnd; + } +#else + // Tables to assist in the stitching of 2 rows of points having arbitrary TessFactors. + // The stitching order is governed by Ruler Function vertex split ordering (see external documentation). + // + // The contents of the finalPointPositionTable are where vertex i [0..33] ends up on the half-edge + // at the max tessellation amount given ruler-function split order. + // Recall the other half of an edge is mirrored, so we only need to deal with one half. + // This table is used to decide when to advance a point on the interior or exterior. + // It supports odd TessFactor up to 65 and even TessFactor up to 64. + static const int finalPointPositionTable[33] = + { 0, 32, 16, 8, 17, 4, 18, 9, 19, 2, 20, 10, 21, 5, 22, 11, 23, + 1, 24, 12, 25, 6, 26, 13, 27, 3, 28, 14, 29, 7, 30, 15, 31 }; + + // The loopStart and loopEnd tables below just provide optimal loop bounds for the + // stitching algorithm further below, for any given halfTssFactor. + // There is probably a better way to encode this... + + // loopStart[halfTessFactor] encodes the FIRST entry in finalPointPositionTable[] above which is + // less than halfTessFactor. Exceptions are entry 0 and 1, which are set up to skip the loop. + static const int loopStart[33] = + {1,1,17,9,9,5,5,5,5,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + // loopStart[halfTessFactor] encodes the LAST entry in finalPointPositionTable[] above which is + // less than halfTessFactor. Exceptions are entry 0 and 1, which are set up to skip the loop. + static const int loopEnd[33] = + {0,0,17,17,25,25,25,25,29,29,29,29,29,29,29,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,32}; +#endif + if( TESSELLATOR_PARITY_ODD == insideEdgeTessFactorParity ) + { + insideNumHalfTessFactorPoints -= 1; + } + if( TESSELLATOR_PARITY_ODD == outsideTessFactorParity ) + { + outsideNumHalfTessFactorPoints -= 1; + } + // Walk first half + int outsidePoint = outsideEdgePointBaseOffset; + int insidePoint = insideEdgePointBaseOffset; + + // iStart,iEnd are a small optimization so the loop below doesn't have to go from 0 up to 31 + int iStart = min(loopStart[insideNumHalfTessFactorPoints],loopStart[outsideNumHalfTessFactorPoints]); + int iEnd = max(loopEnd[insideNumHalfTessFactorPoints],loopEnd[outsideNumHalfTessFactorPoints]); + + if( finalPointPositionTable[0] < outsideNumHalfTessFactorPoints ) // since we dont' start the loop at 0 below, we need a special case. + { + // Advance outside + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; outsidePoint++; + } + + for(int i = iStart; i <= iEnd; i++) + { + if( /*(i>0) && <-- not needed since iStart is never 0*/(finalPointPositionTable[i] < insideNumHalfTessFactorPoints)) + { + // Advance inside + DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; insidePoint++; + } + if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints)) + { + // Advance outside + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; outsidePoint++; + } + } + + if( (insideEdgeTessFactorParity != outsideTessFactorParity) || (insideEdgeTessFactorParity == TESSELLATOR_PARITY_ODD)) + { + if( insideEdgeTessFactorParity == outsideTessFactorParity ) + { + // Quad in the middle + DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(insidePoint+1,outsidePoint,outsidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + else if( TESSELLATOR_PARITY_EVEN == insideEdgeTessFactorParity ) + { + // Triangle pointing inside + DefineClockwiseTriangle(insidePoint,outsidePoint,outsidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + else + { + // Triangle pointing outside + DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + + // Walk second half. + for(int i = iEnd; i >= iStart; i--) + { + if((finalPointPositionTable[i] < outsideNumHalfTessFactorPoints)) + { + // Advance outside + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; outsidePoint++; + } + if( /*(i>0) && <-- not needed since iStart is never 0*/ (finalPointPositionTable[i] < insideNumHalfTessFactorPoints)) + { + // Advance inside + DefineClockwiseTriangle(insidePoint,outsidePoint,insidePoint+1,baseIndexOffset); + baseIndexOffset += 3; insidePoint++; + } + } + // Below case is not needed if we didn't optimize loop above and made it run from 31 down to 0. + if((finalPointPositionTable[0] < outsideNumHalfTessFactorPoints)) + { + DefineClockwiseTriangle(outsidePoint,outsidePoint+1,insidePoint,baseIndexOffset); + baseIndexOffset += 3; outsidePoint++; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHWTessellator::PatchIndexValue() +//-------------------------------------------------------------------------------------------------------------------------------- +int CHWTessellator::PatchIndexValue(int index) +{ + if( m_bUsingPatchedIndices ) + { + if( index >= m_IndexPatchContext.outsidePointIndexPatchBase ) // assumed remapped outide indices are > remapped inside vertices + { + if( index == m_IndexPatchContext.outsidePointIndexBadValue ) + index = m_IndexPatchContext.outsidePointIndexReplacementValue; + else + index += m_IndexPatchContext.outsidePointIndexDeltaToRealValue; + } + else + { + if( index == m_IndexPatchContext.insidePointIndexBadValue ) + index = m_IndexPatchContext.insidePointIndexReplacementValue; + else + index += m_IndexPatchContext.insidePointIndexDeltaToRealValue; + } + } + else if( m_bUsingPatchedIndices2 ) + { + if( index >= m_IndexPatchContext2.baseIndexToInvert ) + { + if( index == m_IndexPatchContext2.cornerCaseBadValue ) + { + index = m_IndexPatchContext2.cornerCaseReplacementValue; + } + else + { + index = m_IndexPatchContext2.indexInversionEndPoint - index; + } + } + else if( index == m_IndexPatchContext2.cornerCaseBadValue ) + { + index = m_IndexPatchContext2.cornerCaseReplacementValue; + } + } + return index; +} + + +//================================================================================================================================= +// CHLSLTessellator +//================================================================================================================================= + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::CHLSLTessellator +//--------------------------------------------------------------------------------------------------------------------------------- +CHLSLTessellator::CHLSLTessellator() +{ + m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] = + m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::Init +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::Init( + D3D11_TESSELLATOR_PARTITIONING partitioning, + D3D11_TESSELLATOR_REDUCTION insideTessFactorReduction, + D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS quadInsideTessFactorReductionAxis, + D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive) +{ + CHWTessellator::Init(partitioning,outputPrimitive); + m_LastComputedTessFactors[0] = m_LastComputedTessFactors[1] = m_LastComputedTessFactors[2] = + m_LastComputedTessFactors[3] = m_LastComputedTessFactors[4] = m_LastComputedTessFactors[5] = 0; + m_partitioning = partitioning; + m_originalPartitioning = partitioning; + switch( partitioning ) + { + case D3D11_TESSELLATOR_PARTITIONING_INTEGER: + default: + break; + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD: + m_parity = TESSELLATOR_PARITY_ODD; + break; + case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN: + m_parity = TESSELLATOR_PARITY_EVEN; + break; + } + m_originalParity = m_parity; + m_outputPrimitive = outputPrimitive; + m_insideTessFactorReduction = insideTessFactorReduction; + m_quadInsideTessFactorReductionAxis = quadInsideTessFactorReductionAxis; +} +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::TessellateQuadDomain +// User calls this +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::TessellateQuadDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1, + float insideTessFactorScaleU, float insideTessFactorScaleV ) +{ + QuadHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Ueq1,tessFactor_Veq1,insideTessFactorScaleU,insideTessFactorScaleV); + + CHWTessellator::TessellateQuadDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3], + m_LastComputedTessFactors[4],m_LastComputedTessFactors[5]); +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::QuadHLSLProcessTessFactors +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::QuadHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1, + float insideTessFactorScaleU, float insideTessFactorScaleV ) +{ + if( !(tessFactor_Ueq0 > 0) ||// NaN will pass + !(tessFactor_Veq0 > 0) || + !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0) ) + { + m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; + m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; + m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1; + m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1; + m_LastUnRoundedComputedTessFactors[4] = 0; + m_LastUnRoundedComputedTessFactors[5] = 0; + m_LastComputedTessFactors[0] = + m_LastComputedTessFactors[1] = + m_LastComputedTessFactors[2] = + m_LastComputedTessFactors[3] = + m_LastComputedTessFactors[4] = + m_LastComputedTessFactors[5] = 0; + return; + } + + CleanupFloatTessFactor(tessFactor_Ueq0);// clamp to [1.0f..INF], NaN->1.0f + CleanupFloatTessFactor(tessFactor_Veq0); + CleanupFloatTessFactor(tessFactor_Ueq1); + CleanupFloatTessFactor(tessFactor_Veq1); + + // Save off tessFactors so they can be returned to app + m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; + m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; + m_LastUnRoundedComputedTessFactors[2] = tessFactor_Ueq1; + m_LastUnRoundedComputedTessFactors[3] = tessFactor_Veq1; + + // Process outside tessFactors + float outsideTessFactor[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Ueq1, tessFactor_Veq1}; + int edge, axis; + TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES], outsideTessFactorParity[QUAD_EDGES]; + if( Pow2Partitioning() || IntegerPartitioning() ) + { + for( edge = 0; edge < QUAD_EDGES; edge++ ) + { + RoundUpTessFactor(outsideTessFactor[edge]); + ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode + int edgeEven = isEven(outsideTessFactor[edge]); + outsideTessFactorParity[edge] = edgeEven ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + else + { + SetTessellationParity(m_originalParity); // ClampTessFactor needs it + for( edge = 0; edge < QUAD_EDGES; edge++ ) + { + ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode + outsideTessFactorParity[edge] = m_originalParity; + } + } + + // Compute inside TessFactors + float insideTessFactor[QUAD_AXES]; + if( m_quadInsideTessFactorReductionAxis == D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS ) + { + switch( m_insideTessFactorReduction ) + { + case D3D11_TESSELLATOR_REDUCTION_MIN: + insideTessFactor[U] = tess_fmin(tess_fmin(tessFactor_Veq0,tessFactor_Veq1),tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1)); + break; + case D3D11_TESSELLATOR_REDUCTION_MAX: + insideTessFactor[U] = tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1)); + break; + case D3D11_TESSELLATOR_REDUCTION_AVERAGE: + insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4; + break; + } + // Scale inside tessFactor based on user scale factor. + + ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0 + insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU; + + // Compute inside parity + if( Pow2Partitioning() || IntegerPartitioning() ) + { + ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app + RoundUpTessFactor(insideTessFactor[U]); + insideTessFactorParity[U] = + insideTessFactorParity[V] = + (isEven(insideTessFactor[U]) || (FLOAT_ONE == insideTessFactor[U]) ) + ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + else + { + ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app + // no parity changes for fractional tessellation - just use what the user requested + insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity; + } + + // To prevent snapping on edges, the "picture frame" comes + // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3. + if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) && + (insideTessFactor[U] < FLOAT_THREE) ) + { + if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction) + { + insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tess_fmax(tessFactor_Veq0,tessFactor_Veq1),tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1))); + } + else + { + insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1 + tessFactor_Ueq0 + tessFactor_Ueq1) / 4); + } + ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4] = m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app + if( IntegerPartitioning()) + { + RoundUpTessFactor(insideTessFactor[U]); + insideTessFactorParity[U] = + insideTessFactorParity[V] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + insideTessFactor[V] = insideTessFactor[U]; + } + else + { + switch( m_insideTessFactorReduction ) + { + case D3D11_TESSELLATOR_REDUCTION_MIN: + insideTessFactor[U] = tess_fmin(tessFactor_Veq0,tessFactor_Veq1); + insideTessFactor[V] = tess_fmin(tessFactor_Ueq0,tessFactor_Ueq1); + break; + case D3D11_TESSELLATOR_REDUCTION_MAX: + insideTessFactor[U] = tess_fmax(tessFactor_Veq0,tessFactor_Veq1); + insideTessFactor[V] = tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1); + break; + case D3D11_TESSELLATOR_REDUCTION_AVERAGE: + insideTessFactor[U] = (tessFactor_Veq0 + tessFactor_Veq1) / 2; + insideTessFactor[V] = (tessFactor_Ueq0 + tessFactor_Ueq1) / 2; + break; + } + // Scale inside tessFactors based on user scale factor. + + ClampFloatTessFactorScale(insideTessFactorScaleU); // clamp scale value to [0..1], NaN->0 + ClampFloatTessFactorScale(insideTessFactorScaleV); + insideTessFactor[U] = insideTessFactor[U]*insideTessFactorScaleU; + insideTessFactor[V] = insideTessFactor[V]*insideTessFactorScaleV; + + // Compute inside parity + if( Pow2Partitioning() || IntegerPartitioning() ) + { + for( axis = 0; axis < QUAD_AXES; axis++ ) + { + ClampTessFactor(insideTessFactor[axis]); // clamp reduction + scale result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app + RoundUpTessFactor(insideTessFactor[axis]); + insideTessFactorParity[axis] = + (isEven(insideTessFactor[axis]) || (FLOAT_ONE == insideTessFactor[axis]) ) + ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + else + { + ClampTessFactor(insideTessFactor[U]); // clamp reduction + scale result that is based on unbounded user input + ClampTessFactor(insideTessFactor[V]); // clamp reduction + scale result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app + m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app + // no parity changes for fractional tessellation - just use what the user requested + insideTessFactorParity[U] = insideTessFactorParity[V] = m_originalParity; + } + + // To prevent snapping on edges, the "picture frame" comes + // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3. + if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[U]) && + (insideTessFactor[U] < FLOAT_THREE) ) + { + if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction) + { + insideTessFactor[U] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Veq0,tessFactor_Veq1)); + } + else + { + insideTessFactor[U] = tess_fmin(FLOAT_THREE,(tessFactor_Veq0 + tessFactor_Veq1) / 2); + } + ClampTessFactor(insideTessFactor[U]); // clamp reduction result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[4] = insideTessFactor[U]; // Save off TessFactors so they can be returned to app + if( IntegerPartitioning()) + { + RoundUpTessFactor(insideTessFactor[U]); + insideTessFactorParity[U] = isEven(insideTessFactor[U]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + + if( (TESSELLATOR_PARITY_ODD == insideTessFactorParity[V]) && + (insideTessFactor[V] < FLOAT_THREE) ) + { + if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction) + { + insideTessFactor[V] = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tessFactor_Ueq1)); + } + else + { + insideTessFactor[V] = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Ueq1) / 2); + } + ClampTessFactor(insideTessFactor[V]);// clamp reduction result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[5] = insideTessFactor[V]; // Save off TessFactors so they can be returned to app + if( IntegerPartitioning()) + { + RoundUpTessFactor(insideTessFactor[V]); + insideTessFactorParity[V] = isEven(insideTessFactor[V]) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + } + + for( axis = 0; axis < QUAD_AXES; axis++ ) + { + if( TESSELLATOR_PARITY_ODD == insideTessFactorParity[axis] ) + { + // Ensure the first ring ("picture frame") interpolates in on all sides + // as much as the side with the minimum TessFactor. Prevents snapping to edge. + if( (insideTessFactor[axis] < FLOAT_THREE) && (insideTessFactor[axis] < insideTessFactor[(axis+1)&0x1])) + { + insideTessFactor[axis] = tess_fmin(insideTessFactor[(axis+1)&0x1],FLOAT_THREE); + m_LastUnRoundedComputedTessFactors[4+axis] = insideTessFactor[axis]; // Save off TessFactors so they can be returned to app + } + } + } + } + + // Save off TessFactors so they can be returned to app + m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0]; + m_LastComputedTessFactors[1] = outsideTessFactor[Veq0]; + m_LastComputedTessFactors[2] = outsideTessFactor[Ueq1]; + m_LastComputedTessFactors[3] = outsideTessFactor[Veq1]; + m_LastComputedTessFactors[4] = insideTessFactor[U]; + m_LastComputedTessFactors[5] = insideTessFactor[V]; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::TessellateTriDomain +// User calls this +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::TessellateTriDomain( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, + float insideTessFactorScale ) +{ + TriHLSLProcessTessFactors(tessFactor_Ueq0,tessFactor_Veq0,tessFactor_Weq0,insideTessFactorScale); + + CHWTessellator::TessellateTriDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1],m_LastComputedTessFactors[2],m_LastComputedTessFactors[3]); +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::TriHLSLProcessTessFactors +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::TriHLSLProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Weq0, + float insideTessFactorScale ) +{ + if( !(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || + !(tessFactor_Weq0 > 0) ) + { + m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; + m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; + m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0; + m_LastUnRoundedComputedTessFactors[3] = + m_LastComputedTessFactors[0] = + m_LastComputedTessFactors[1] = + m_LastComputedTessFactors[2] = + m_LastComputedTessFactors[3] = 0; + return; + } + + CleanupFloatTessFactor(tessFactor_Ueq0); // clamp to [1.0f..INF], NaN->1.0f + CleanupFloatTessFactor(tessFactor_Veq0); + CleanupFloatTessFactor(tessFactor_Weq0); + + // Save off TessFactors so they can be returned to app + m_LastUnRoundedComputedTessFactors[0] = tessFactor_Ueq0; + m_LastUnRoundedComputedTessFactors[1] = tessFactor_Veq0; + m_LastUnRoundedComputedTessFactors[2] = tessFactor_Weq0; + + // Process outside TessFactors + float outsideTessFactor[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, tessFactor_Weq0}; + int edge; + if( Pow2Partitioning() || IntegerPartitioning() ) + { + for( edge = 0; edge < TRI_EDGES; edge++ ) + { + RoundUpTessFactor(outsideTessFactor[edge]); // for pow2 this rounds to pow2 + ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode + } + } + else + { + for( edge = 0; edge < TRI_EDGES; edge++ ) + { + ClampTessFactor(outsideTessFactor[edge]); // clamp unbounded user input based on tessellation mode + } + } + + // Compute inside TessFactor + float insideTessFactor; + switch( m_insideTessFactorReduction ) + { + case D3D11_TESSELLATOR_REDUCTION_MIN: + insideTessFactor = tess_fmin(tess_fmin(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0); + break; + case D3D11_TESSELLATOR_REDUCTION_MAX: + insideTessFactor = tess_fmax(tess_fmax(tessFactor_Ueq0,tessFactor_Veq0),tessFactor_Weq0); + break; + case D3D11_TESSELLATOR_REDUCTION_AVERAGE: + insideTessFactor = (tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3; + break; + } + + // Scale inside TessFactor based on user scale factor. + ClampFloatTessFactorScale(insideTessFactorScale); // clamp scale value to [0..1], NaN->0 + insideTessFactor = insideTessFactor*tess_fmin(FLOAT_ONE,insideTessFactorScale); + + ClampTessFactor(insideTessFactor); // clamp reduction + scale result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app + TESSELLATOR_PARITY parity; + if( Pow2Partitioning() || IntegerPartitioning() ) + { + RoundUpTessFactor(insideTessFactor); + parity = (isEven(insideTessFactor) || (FLOAT_ONE == insideTessFactor)) + ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + else + { + parity = m_originalParity; + } + + if( (TESSELLATOR_PARITY_ODD == parity) && + (insideTessFactor < FLOAT_THREE)) + { + // To prevent snapping on edges, the "picture frame" comes + // in using avg or max (and ignore inside TessFactor scaling) until it is at least 3. + if(D3D11_TESSELLATOR_REDUCTION_MAX == m_insideTessFactorReduction) + { + insideTessFactor = tess_fmin(FLOAT_THREE,tess_fmax(tessFactor_Ueq0,tess_fmax(tessFactor_Veq0,tessFactor_Weq0))); + } + else + { + insideTessFactor = tess_fmin(FLOAT_THREE,(tessFactor_Ueq0 + tessFactor_Veq0 + tessFactor_Weq0) / 3); + } + ClampTessFactor(insideTessFactor); // clamp reduction result that is based on unbounded user input + m_LastUnRoundedComputedTessFactors[3] = insideTessFactor;// Save off TessFactors so they can be returned to app + if( IntegerPartitioning()) + { + RoundUpTessFactor(insideTessFactor); + } + } + + // Save off TessFactors so they can be returned to app + m_LastComputedTessFactors[0] = outsideTessFactor[Ueq0]; + m_LastComputedTessFactors[1] = outsideTessFactor[Veq0]; + m_LastComputedTessFactors[2] = outsideTessFactor[Weq0]; + m_LastComputedTessFactors[3] = insideTessFactor; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::TessellateIsoLineDomain +// User calls this. +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::TessellateIsoLineDomain( float TessFactor_U_LineDetail, float TessFactor_V_LineDensity ) +{ + IsoLineHLSLProcessTessFactors(TessFactor_V_LineDensity,TessFactor_U_LineDetail); + CHWTessellator::TessellateIsoLineDomain(m_LastComputedTessFactors[0],m_LastComputedTessFactors[1]); +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::IsoLineHLSLProcessTessFactors +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ) +{ + if( !(TessFactor_V_LineDensity > 0) || // NaN will pass + !(TessFactor_U_LineDetail > 0) ) + { + m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity; + m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail; + m_LastComputedTessFactors[0] = + m_LastComputedTessFactors[1] = 0; + return; + } + + CleanupFloatTessFactor(TessFactor_V_LineDensity); // clamp to [1.0f..INF], NaN->1.0f + CleanupFloatTessFactor(TessFactor_U_LineDetail); // clamp to [1.0f..INF], NaN->1.0f + + ClampTessFactor(TessFactor_U_LineDetail); // clamp unbounded user input based on tessellation mode + + m_LastUnRoundedComputedTessFactors[1] = TessFactor_U_LineDetail; // Save off TessFactors so they can be returned to app + + TESSELLATOR_PARITY parity; + if(Pow2Partitioning()||IntegerPartitioning()) + { + RoundUpTessFactor(TessFactor_U_LineDetail); + parity = isEven(TessFactor_U_LineDetail) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; + } + else + { + parity = m_originalParity; + } + + FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); + + OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING_INTEGER); + + ClampTessFactor(TessFactor_V_LineDensity); // Clamp unbounded user input to integer + m_LastUnRoundedComputedTessFactors[0] = TessFactor_V_LineDensity; // Save off TessFactors so they can be returned to app + + RoundUpTessFactor(TessFactor_V_LineDensity); + + RestorePartitioning(); + + // Save off TessFactors so they can be returned to app + m_LastComputedTessFactors[0] = TessFactor_V_LineDensity; + m_LastComputedTessFactors[1] = TessFactor_U_LineDetail; +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::ClampTessFactor() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::ClampTessFactor(float& TessFactor) +{ + if( Pow2Partitioning() ) + { + TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); + } + else if( IntegerPartitioning() ) + { + TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); + } + else if( Odd() ) + { + TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR) ); + } + else // even + { + TessFactor = tess_fmin( D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR, tess_fmax( TessFactor, D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR) ); + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::CleanupFloatTessFactor() +//--------------------------------------------------------------------------------------------------------------------------------- +static const int exponentMask = 0x7f800000; +static const int mantissaMask = 0x007fffff; +void CHLSLTessellator::CleanupFloatTessFactor(float& input) +{ + // If input is < 1.0f or NaN, clamp to 1.0f. + // In other words, clamp input to [1.0f...+INF] + int bits = *(int*)&input; + if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan? + (input < 1.0f) ) + { + input = 1; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::ClampFloatTessFactorScale() +//--------------------------------------------------------------------------------------------------------------------------------- +void CHLSLTessellator::ClampFloatTessFactorScale(float& input) +{ + // If input is < 0.0f or NaN, clamp to 0.0f. > 1 clamps to 1. + // In other words, clamp input to [0.0f...1.0f] + int bits = *(int*)&input; + if( ( ( ( bits & exponentMask ) == exponentMask ) && ( bits & mantissaMask ) ) ||// nan? + (input < 0.0f) ) + { + input = 0; + } + else if( input > 1 ) + { + input = 1; + } +} + +//--------------------------------------------------------------------------------------------------------------------------------- +// CHLSLTessellator::RoundUpTessFactor() +//--------------------------------------------------------------------------------------------------------------------------------- +static const int exponentLSB = 0x00800000; +void CHLSLTessellator::RoundUpTessFactor(float& TessFactor) +{ + // Assume TessFactor is in [1.0f..+INF] + if( Pow2Partitioning() ) + { + int bits = *(int*)&TessFactor; + if( bits & mantissaMask ) + { + *(int*)&TessFactor = (bits & exponentMask) + exponentLSB; + } + } + else if( IntegerPartitioning() ) + { + TessFactor = ceil(TessFactor); + } +} diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.h mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.h --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.h 2020-06-12 01:21:17.000000000 +0000 @@ -1,9 +1,5 @@ /**************************************************************************** - * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation + * Copyright (C) 2014-2019 without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: @@ -27,16 +23,7 @@ ******************************************************************************/ #pragma once -/// Allocate and initialize a new tessellation context -HANDLE SWR_API - TSInitCtx(SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) - SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology - void* pContextMem, ///< [IN] Memory to use for the context - size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required - -/// Destroy & de-allocate tessellation context -void SWR_API TSDestroyCtx(HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed +#include "tessellator.hpp" struct SWR_TS_TESSELLATED_DATA { @@ -49,34 +36,170 @@ // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] }; -/// Perform Tessellation -void SWR_API - TSTessellate(HANDLE tsCtx, ///< [IN] Tessellation Context - const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors - SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data - +namespace Tessellator +{ + /// Wrapper class for the CHWTessellator reference tessellator from MSFT + /// This class will store data not originally stored in CHWTessellator + class SWR_TS : private CHWTessellator + { + private: + typedef CHWTessellator SUPER; + SWR_TS_DOMAIN Domain; + OSALIGNSIMD(float) DomainPointsU[MAX_POINT_COUNT]; + OSALIGNSIMD(float) DomainPointsV[MAX_POINT_COUNT]; + uint32_t NumDomainPoints; + OSALIGNSIMD(uint32_t) Indices[3][MAX_INDEX_COUNT / 3]; + uint32_t NumIndices; + + public: + void Init(SWR_TS_DOMAIN tsDomain, + SWR_TS_PARTITIONING tsPartitioning, + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology) + { + static D3D11_TESSELLATOR_PARTITIONING CVT_TS_D3D_PARTITIONING[] = { + D3D11_TESSELLATOR_PARTITIONING_INTEGER, // SWR_TS_INTEGER + D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD, // SWR_TS_ODD_FRACTIONAL + D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN, // SWR_TS_EVEN_FRACTIONAL + D3D11_TESSELLATOR_PARTITIONING_POW2 // SWR_TS_POW2 + }; + + static D3D11_TESSELLATOR_OUTPUT_PRIMITIVE CVT_TS_D3D_OUTPUT_TOPOLOGY[] = { + D3D11_TESSELLATOR_OUTPUT_POINT, // SWR_TS_OUTPUT_POINT + D3D11_TESSELLATOR_OUTPUT_LINE, // SWR_TS_OUTPUT_LINE + D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW, // SWR_TS_OUTPUT_TRI_CW - inverted logic, because DX + D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW // SWR_TS_OUTPUT_TRI_CCW - inverted logic, because DX + }; + + SUPER::Init(CVT_TS_D3D_PARTITIONING[tsPartitioning], + CVT_TS_D3D_OUTPUT_TOPOLOGY[tsOutputTopology]); + + Domain = tsDomain; + NumDomainPoints = 0; + NumIndices = 0; + } + + void Tessellate(const SWR_TESSELLATION_FACTORS& tsTessFactors, + SWR_TS_TESSELLATED_DATA& tsTessellatedData) + { + uint32_t IndexDiv = 0; + switch (Domain) + { + case SWR_TS_QUAD: + IndexDiv = 3; + SUPER::TessellateQuadDomain( + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL], + tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W], + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], + tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ1], + tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE], + tsTessFactors.InnerTessFactors[SWR_QUAD_V_INSIDE]); + break; + + case SWR_TS_TRI: + IndexDiv = 3; + SUPER::TessellateTriDomain( + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL], + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], + tsTessFactors.OuterTessFactors[SWR_QUAD_V_EQ0_TRI_W], + tsTessFactors.InnerTessFactors[SWR_QUAD_U_TRI_INSIDE]); + break; + + case SWR_TS_ISOLINE: + IndexDiv = 2; + SUPER::TessellateIsoLineDomain( + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY], + tsTessFactors.OuterTessFactors[SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL]); + break; + + default: + SWR_INVALID("Invalid Tessellation Domain: %d", Domain); + } + + NumDomainPoints = (uint32_t)SUPER::GetPointCount(); + + DOMAIN_POINT* pPoints = SUPER::GetPoints(); + for (uint32_t i = 0; i < NumDomainPoints; i++) { + DomainPointsU[i] = pPoints[i].u; + DomainPointsV[i] = pPoints[i].v; + } + tsTessellatedData.NumDomainPoints = NumDomainPoints; + tsTessellatedData.pDomainPointsU = &DomainPointsU[0]; + tsTessellatedData.pDomainPointsV = &DomainPointsV[0]; + + NumIndices = (uint32_t)SUPER::GetIndexCount(); + + assert(NumIndices % IndexDiv == 0); + tsTessellatedData.NumPrimitives = NumIndices / IndexDiv; + + uint32_t* pIndices = (uint32_t*)SUPER::GetIndices(); + for (uint32_t i = 0; i < NumIndices; i++) { + Indices[i % IndexDiv][i / IndexDiv] = pIndices[i]; + } + + tsTessellatedData.ppIndices[0] = &Indices[0][0]; + tsTessellatedData.ppIndices[1] = &Indices[1][0]; + tsTessellatedData.ppIndices[2] = &Indices[2][0]; + } + }; +} // namespace Tessellator -/// @TODO - Implement OSS tessellator - -INLINE HANDLE SWR_API TSInitCtx(SWR_TS_DOMAIN tsDomain, - SWR_TS_PARTITIONING tsPartitioning, - SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, - void* pContextMem, - size_t& memSize) +/// Allocate and initialize a new tessellation context +INLINE HANDLE SWR_API + TSInitCtx(SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) + SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology + void* pContextMem, ///< [IN] Memory to use for the context + size_t& memSize) ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required { - SWR_NOT_IMPL; - return NULL; + using Tessellator::SWR_TS; + SWR_ASSERT(tsDomain < SWR_TS_DOMAIN_COUNT); + SWR_ASSERT(tsPartitioning < SWR_TS_PARTITIONING_COUNT); + SWR_ASSERT(tsOutputTopology < SWR_TS_OUTPUT_TOPOLOGY_COUNT); + + size_t origMemSize = memSize; + memSize = AlignUp(sizeof(SWR_TS), 64); + + if (nullptr == pContextMem || memSize > origMemSize) + { + return nullptr; + } + + HANDLE tsCtx = pContextMem; + if (!tsCtx) + { + return tsCtx; + } + + SWR_TS* pTessellator = new (tsCtx) SWR_TS(); + SWR_ASSERT(pTessellator == tsCtx); + + pTessellator->Init(tsDomain, tsPartitioning, tsOutputTopology); + + return tsCtx; } -INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) +/// Destroy & de-allocate tessellation context +INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) ///< [IN] Tessellation context to be destroyed { - SWR_NOT_IMPL; + using Tessellator::SWR_TS; + SWR_TS* pTessellator = (SWR_TS*)tsCtx; + + if (pTessellator) + { + pTessellator->~SWR_TS(); + } } -INLINE void SWR_API TSTessellate(HANDLE tsCtx, - const SWR_TESSELLATION_FACTORS& tsTessFactors, - SWR_TS_TESSELLATED_DATA& tsTessellatedData) +/// Perform Tessellation +INLINE void SWR_API + TSTessellate(HANDLE tsCtx, ///< [IN] Tessellation Context + const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors + SWR_TS_TESSELLATED_DATA& tsTessellatedData) ///< [OUT] Tessellated Data { - SWR_NOT_IMPL; + using Tessellator::SWR_TS; + SWR_TS* pTessellator = (SWR_TS*)tsCtx; + SWR_ASSERT(pTessellator); + + pTessellator->Tessellate(tsTessFactors, tsTessellatedData); } diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/tessellator.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,471 @@ +/* + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and + associated documentation files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#pragma once +//================================================================================================================================= +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com +// +// CHWTessellator demonstrates what is expected of hardware in the D3D11 fixed function Tessellator stage. Hardware +// implementers need only look at this class. +// +// CHLSLTessellator is a wrapper for CHWTessellator, representing the effect of shader code that will +// be autogenerated by HLSL in the Hull Shader, both for plumbing data around, and to precondition TessFactor values before they +// are passed to the hardware (such as deriving inside TessFactors from edge TessFactors). The algorithms used +// in CHLSLTessellator are subject to change, but since they represent shader code auto-generated by the HLSL compiler, +// CHLSLTessellator has no effect on hardware design at all. Note the HLSL compiler will expose all the raw hardware +// control illustrated by CHWTessellator for those who don't need the helper functionality illustrated by CHLSLTessellator. +// +// Usage: (1) Create either a CHLSLTessellator or CHWTessellator object, depending on which you want to verify. +// (2) Call C*Tessellator::Init() +// (3) Call C*Tessellator::Tessellate[IsoLine|Tri|Quad]Domain() +// - Here you pass in TessFactors (how much to tessellate) +// (4) Call C*Tessellator::GetPointCount(), C*Tessellator::GetIndexCount() to see how much data was generated. +// (5) Call C*Tessellator::GetPoints() and C*Tessellator::GetIndices() to get pointers to the data. +// The pointers are fixed for the lifetime of the object (storage for max tessellation), +// so if you ::Tessellate again, the data in the buffers is overwritten. +// (6) There are various other Get() methods to retrieve TessFactors that have been processed from +// what you passed in at step 3. You can retrieve separate TessFactors that the tessellator +// produced after clamping but before rounding, and also after rounding (say in pow2 mode). +// These numbers can be useful information if you are geomorphing displacement maps. +// (7) Goto Step 2 or 3 if you want to animate TessFactors or tessellate a different patch +// +// Code implementation details: +// +// There is lots of headroom to make this code run faster on CPUs. It was written merely as a reference for +// what results hardware should produce, with CPU performance not a consideration. It is nice that this implementation +// only generates the exact number of vertices needed (no duplicates) in the output vertex buffer. Also, the number +// of calculations done for each U/V domain coordinate is minimized by doing some precalculation of some patch or edge +// invariant numbers (see TESS_FACTOR_CONTEXT). All the vertex coordinate calculations could be computed with as much +// parallelism as you like. Similarly the calculation of connectivity itself is highly parallelizable, and can also +// be done independent of the vertex calculations. +// +//================================================================================================================================= + +#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR 1 +#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR 63 +#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR 2 +#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR 64 + +#define D3D11_TESSELLATOR_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1 +#define D3D11_TESSELLATOR_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64 + +#define D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR 64 // max of even and odd tessFactors + +#define MAX_POINT_COUNT ((D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)*(D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR+1)) +#define MAX_INDEX_COUNT (D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*D3D11_TESSELLATOR_MAX_TESSELLATION_FACTOR*2*3) + +//================================================================================================================================= +// Data types for the caller +//================================================================================================================================= +typedef enum D3D11_TESSELLATOR_PARTITIONING +{ + D3D11_TESSELLATOR_PARTITIONING_INTEGER, + D3D11_TESSELLATOR_PARTITIONING_POW2, + D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD, + D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN +}; + +typedef enum D3D11_TESSELLATOR_REDUCTION +{ + D3D11_TESSELLATOR_REDUCTION_MIN, + D3D11_TESSELLATOR_REDUCTION_MAX, + D3D11_TESSELLATOR_REDUCTION_AVERAGE +}; + +typedef enum D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS +{ + D3D11_TESSELLATOR_QUAD_REDUCTION_1_AXIS, + D3D11_TESSELLATOR_QUAD_REDUCTION_2_AXIS +}; + +typedef enum D3D11_TESSELLATOR_OUTPUT_PRIMITIVE +{ + D3D11_TESSELLATOR_OUTPUT_POINT, + D3D11_TESSELLATOR_OUTPUT_LINE, + D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CW, + D3D11_TESSELLATOR_OUTPUT_TRIANGLE_CCW, +}; + +typedef struct DOMAIN_POINT +{ + float u; + float v; // for tri, w = 1 - u - v; +} DOMAIN_POINT; + +//================================================================================================================================= +// CHWTessellator: D3D11 Tessellation Fixed Function Hardware Reference +//================================================================================================================================= +typedef unsigned int FXP; // fixed point number + +class CHWTessellator +{ + +//--------------------------------------------------------------------------------------------------------------------------------- +public: + void Init( D3D11_TESSELLATOR_PARTITIONING partitioning, + D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive); + + void TessellateIsoLineDomain( float TessFactor_V_LineDensity, + float TessFactor_U_LineDetail ); + + void TessellateTriDomain( float TessFactor_Ueq0, + float TessFactor_Veq0, + float TessFactor_Weq0, + float TessFactor_Inside ); + + void TessellateQuadDomain( float TessFactor_Ueq0, + float TessFactor_Veq0, + float TessFactor_Ueq1, + float TessFactor_Veq1, + float TessFactor_InsideU, + float TessFactor_InsideV ); + + int GetPointCount(); + int GetIndexCount(); + + DOMAIN_POINT* GetPoints(); // Get CHWTessellator owned pointer to vertices (UV values). + // Pointer is fixed for lifetime of CHWTessellator object. + int* GetIndices(); // Get CHWTessellator owned pointer to vertex indices. + // Pointer is fixed for lifetime of CHWTessellator object. + +#define ALLOW_XBOX_360_COMPARISON // Different vertex splitting order. This is NOT D3D11 behavior, just available here for comparison. + // Setting this define true just allows the XBox split style to be enabled via + // SetXBox360Mode() below, but by default this XBox360 mode still always starts off DISABLED. + // The XBox360 always splits from the center of an edge (D3D11 uses ruler function). Splitting + // from the center causes sliver triangles in transition areas, which cause numerous problems. + // Note the XBox360 only supports adaptive tessellation via fractional_even partitioning, + // though this #define lets you try the XBox vertex splitting order with any of the + // partitioning modes: even, odd, integer or pow2. +#ifdef ALLOW_XBOX_360_COMPARISON + void SetXBox360Mode(bool bXboxMode) {m_bXBox360Mode = bXboxMode;} +#endif + CHWTessellator(); + ~CHWTessellator(); +//--------------------------------------------------------------------------------------------------------------------------------- + //============================================================================================================================= + // Some defines so that numbers are usually self commenting + //============================================================================================================================= + static const int U = 0; // points on a tri patch + static const int V = 1; + static const int W = 2; + static const int Ueq0 = 0; // edges on a tri patch + static const int Veq0 = 1; + static const int Weq0 = 2; + + static const int Ueq1 = 2; // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + static const int Veq1 = 3; + + static const int QUAD_AXES = 2; + static const int QUAD_EDGES = 4; + static const int TRI_EDGES = 3; + //============================================================================================================================= + + typedef enum TESSELLATOR_PARITY // derived from D3D11_TESSELLATOR_PARTITIONING + { // (note: for integer tessellation, both parities are used) + TESSELLATOR_PARITY_EVEN, + TESSELLATOR_PARITY_ODD + }; +private: + TESSELLATOR_PARITY m_originalParity; // user chosen parity + TESSELLATOR_PARITY m_parity; // current parity: if allowing mix of even/odd during discrete + // tessellation, this can vary from the user defined parity + D3D11_TESSELLATOR_PARTITIONING m_originalPartitioning; // user chosen partitioning + D3D11_TESSELLATOR_PARTITIONING m_partitioning; // current partitioning. IsoLines overrides for line density + D3D11_TESSELLATOR_OUTPUT_PRIMITIVE m_outputPrimitive; + DOMAIN_POINT* m_Point; // array where we will store u/v's for the points we generate + int* m_Index; // array where we will store index topology + int m_NumPoints; + int m_NumIndices; +#ifdef ALLOW_XBOX_360_COMPARISON + bool m_bXBox360Mode; +#endif + // PlacePointIn1D below is the workhorse for all position placement. + // It is code that could run as preamble in a Domain Shader, so the tessellator itself + // doesn't necessarily need to have floating point. + // Some per-TessFactor fixed context is needed, and that can be computed wherever + // the TessFactor reduction is done, perhaps as Hull Shader postamble - this is shared + // for all point evaluation. + typedef struct TESS_FACTOR_CONTEXT + { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; + } TESS_FACTOR_CONTEXT; + void ComputeTessFactorContext( FXP fxpTessFactor, TESS_FACTOR_CONTEXT& TessFactorCtx ); + void PlacePointIn1D( const TESS_FACTOR_CONTEXT& TessFactorCtx, int point, FXP& fxpLocation ); + + int NumPointsForTessFactor(FXP fxpTessFactor); + + // Tessellation parity control + bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? true : false;} + void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;} + + // HWIntegerPartitioning() - hardware doesn't care about what pow2 partitioning is - the query below is true for + // both integer and pow2. + bool HWIntegerPartitioning() {return ((m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| + (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) ? true : false;} + + // Tesselation Partitioning control + void RestorePartitioning() {m_partitioning = m_originalPartitioning;}; + void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density + + // Call these to generate new points and indices. Max TessFactor storage is already allocated. + int DefinePoint(FXP u, FXP v, int pointStorageOffset); + void DefineIndex(int index, int indexStorageOffset); + void DefineClockwiseTriangle(int index0, int index1, int index2, int indexStorageBaseOffset); + + // Couple of trivial ways to generate index data just given points and no other connectivity. + void DumpAllPoints(); // Make point indices for point rendering mode - + // redundant, but just here for orthogonality. + void DumpAllPointsAsInOrderLineList(); // A debug visualization of all the points connected + // in the order they were generated. + // Asking to draw line topology on a tri or quad patch will do this + + + // The structures below define the data that is derived given input TessFactors and which + // is used by point generation and connectivity generation steps (each of which are independent) + typedef struct PROCESSED_TESS_FACTORS_ISOLINE + { + TESSELLATOR_PARITY lineDensityParity; + TESSELLATOR_PARITY lineDetailParity; + TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; + TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; + bool bPatchCulled; + int numPointsPerLine; + int numLines; + } PROCESSED_TESS_FACTORS_ISOLINE; + typedef struct PROCESSED_TESS_FACTORS_TRI + { + FXP outsideTessFactor[TRI_EDGES]; + FXP insideTessFactor; + TESSELLATOR_PARITY outsideTessFactorParity[TRI_EDGES]; + TESSELLATOR_PARITY insideTessFactorParity; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx; + bool bJustDoMinimumTessFactor; + bool bPatchCulled; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[TRI_EDGES]; + int numPointsForInsideTessFactor; + int insideEdgePointBaseOffset; + } PROCESSED_TESS_FACTORS_TRI; + typedef struct PROCESSED_TESS_FACTORS_QUAD + { + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + TESSELLATOR_PARITY outsideTessFactorParity[QUAD_EDGES]; + TESSELLATOR_PARITY insideTessFactorParity[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + bool bJustDoMinimumTessFactor; + bool bPatchCulled; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + } PROCESSED_TESS_FACTORS_QUAD; + + // These are the workhorse functions for tessellation: + // (1) Process input TessFactors + // (2) Generate points + // (3) Generate connectivity (can be done in parallel to (2)) + void IsoLineProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail, PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); + void IsoLineGeneratePoints( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); + void IsoLineGenerateConnectivity( const PROCESSED_TESS_FACTORS_ISOLINE& processedTessFactors ); + void TriProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor, PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); + void TriGeneratePoints( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); + void TriGenerateConnectivity( const PROCESSED_TESS_FACTORS_TRI& processedTessFactors ); + void QuadProcessTessFactors( float tessFactor_Ueq0, float tessFactor_Veq0, float tessFactor_Ueq1, float tessFactor_Veq1, + float insideTessFactor_U, float insideTessFactor_V, PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); + void QuadGeneratePoints( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); + void QuadGenerateConnectivity( const PROCESSED_TESS_FACTORS_QUAD& processedTessFactors ); + + // Stitching + // --------- + // Given pointers to the beginning of 2 parallel rows of points, and TessFactors for each, stitch them. + // The assumption is the stitch is symmetric. + void StitchTransition(int baseIndexOffset, int insideEdgePointBaseOffset, int insideNumHalfTessFactorPoints, + TESSELLATOR_PARITY insideEdgeTessFactorParity, + int outsideEdgePointBaseOffset, int outsideNumHalfTessFactorPoints, + TESSELLATOR_PARITY outsideEdgeTessFactorParity ); + // The interior can just use a simpler stitch. + typedef enum DIAGONALS + { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED + }; + + void StitchRegular(bool bTrapezoid, DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, + int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset); + +//--------------------------------------------------------------------------------------------------------------------------------- + // Index Patching + // -------------- + // The code below patches index values produces during triangulation, so triangulation doesn't have to know + // where points should go. I happened to never produce duplicate vertices, but the patching would + // be simpler if some duplicate vertices were introduced in practice. During point rendering mode however, + // it is not permitted for duplicate points to show up. + + // Since the points are generated in concentric rings, most of the time, the point locations are + // sequentially increasing in memory for each side of a ring, which the stitch can take advantage of. + // However, there are exceptions where the points are not sequentially increasing, such as + // the 4th row in a given ring, where the last point on the outside of each row is actually the beginning + // point. + // So we let the stitching code think it sees sequential vertices, and when it emits a vertex index, + // we patch it to be the real location. + int PatchIndexValue(int index); + typedef struct INDEX_PATCH_CONTEXT + { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; + } INDEX_PATCH_CONTEXT; + void SetUsingPatchedIndices(bool bUsingPatchedIndices) {m_bUsingPatchedIndices = bUsingPatchedIndices;} + + // A second index patch we have to do handles the leftover strip of quads in the middle of an odd quad patch after + // finishing all the concentric rings. + // This also handles the leftover strip of points in the middle of an even quad + // patch, when stitching the row of triangles up the left side (V major quad) or bottom (U major quad) of the + // inner ring + typedef struct INDEX_PATCH_CONTEXT2 + { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; + } INDEX_PATCH_CONTEXT2; + void SetUsingPatchedIndices2(bool bUsingPatchedIndices) {m_bUsingPatchedIndices2 = bUsingPatchedIndices;} + bool m_bUsingPatchedIndices; + bool m_bUsingPatchedIndices2; + INDEX_PATCH_CONTEXT m_IndexPatchContext; + INDEX_PATCH_CONTEXT2 m_IndexPatchContext2; + +}; + +//================================================================================================================================= +// CHLSLTessellator: D3D11 Tessellation HLSL Tessellator Interface +// Demonstrates TessFactor preconditioning code auto-generated by HLSL. Subject to change, but this +// just represents the effect of shader code the HLSL compiler will generate in the Hull Shader, +// so it does not affect hardware design at all. +//================================================================================================================================= +class CHLSLTessellator : public CHWTessellator +{ +public: + void Init( D3D11_TESSELLATOR_PARTITIONING partitioning, + D3D11_TESSELLATOR_REDUCTION insideTessFactorReduction, + D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS quadInsideTessFactorReductionAxis, + D3D11_TESSELLATOR_OUTPUT_PRIMITIVE outputPrimitive); + + void TessellateIsoLineDomain( float TessFactor_V_LineDensity, + float TessFactor_U_LineDetail ); + + void TessellateTriDomain( float tessFactor_Ueq0, + float TessFactor_Veq0, + float TessFactor_Weq0, + float insideTessFactorScale /*[0..1]*/ ); + + void TessellateQuadDomain( float TessFactorUeq0, + float TessFactorVeq0, + float TessFactorUeq1, + float TessFactorVeq1, + float insideTessFactorScaleU /*[0..1]*/, + float insideTessFactorScaleV /*[0..1]*/ ); + + int GetPointCount() {return CHWTessellator::GetPointCount();}; + int GetIndexCount() {return CHWTessellator::GetIndexCount();} + + DOMAIN_POINT* GetPoints() {return CHWTessellator::GetPoints();} // Get CHLSLTessellator owned pointer to vertices (UV values). + // Pointer is fixed for lifetime of CHLSLTessellator object. + int* GetIndices() {return CHWTessellator::GetIndices();} // Get CHLSLTessellator owned pointer to vertex indices. + // Pointer is fixed for lifetime of CHLSLTessellator object. + + // Retrieve TessFactors actually used by the "hardware" + // This includes clamping to valid range, and more interestingly + // if integer or pow2 partitioning is being done, the rounded TessFactors can be retrieved. + // Getting the rounded TessFactors can be useful for geomorphing of displacement maps. + float GetIsoLineDensityTessFactor() {return m_LastComputedTessFactors[0];} + float GetIsoLineDetailTessFactor() {return m_LastComputedTessFactors[1];} + float GetTriUeq0TessFactor() {return m_LastComputedTessFactors[0];} + float GetTriVeq0TessFactor() {return m_LastComputedTessFactors[1];} + float GetTriWeq0TessFactor() {return m_LastComputedTessFactors[2];} + float GetTriInsideTessFactor() {return m_LastComputedTessFactors[3];} + float GetQuadUeq0TessFactor() {return m_LastComputedTessFactors[0];} + float GetQuadVeq0TessFactor() {return m_LastComputedTessFactors[1];} + float GetQuadUeq1TessFactor() {return m_LastComputedTessFactors[2];} + float GetQuadVeq1TessFactor() {return m_LastComputedTessFactors[3];} + float GetQuadInsideUTessFactor() {return m_LastComputedTessFactors[4];} + float GetQuadInsideVTessFactor() {return m_LastComputedTessFactors[5];} + float GetUnRoundedIsoLineDensityTessFactor() {return m_LastUnRoundedComputedTessFactors[0];} + float GetUnRoundedIsoLineDetailTessFactor() {return m_LastUnRoundedComputedTessFactors[1];} + float GetUnRoundedTriUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];} + float GetUnRoundedTriVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];} + float GetUnRoundedTriWeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[2];} + float GetUnRoundedTriInsideTessFactor() {return m_LastUnRoundedComputedTessFactors[3];} + float GetUnRoundedQuadUeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[0];} + float GetUnRoundedQuadVeq0TessFactor() {return m_LastUnRoundedComputedTessFactors[1];} + float GetUnRoundedQuadUeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[2];} + float GetUnRoundedQuadVeq1TessFactor() {return m_LastUnRoundedComputedTessFactors[3];} + float GetUnRoundedQuadInsideUTessFactor() {return m_LastUnRoundedComputedTessFactors[4];} + float GetUnRoundedQuadInsideVTessFactor() {return m_LastUnRoundedComputedTessFactors[5];} + + CHLSLTessellator(); +//--------------------------------------------------------------------------------------------------------------------------------- +private: + TESSELLATOR_PARITY m_originalParity; // user chosen parity + TESSELLATOR_PARITY m_parity; // current parity: if allowing mix of even/odd during discrete + // tessellation, this can vary from the user defined parity + D3D11_TESSELLATOR_PARTITIONING m_originalPartitioning; // user chosen partitioning + D3D11_TESSELLATOR_PARTITIONING m_partitioning; // current partitioning. IsoLines overrides for line density + D3D11_TESSELLATOR_OUTPUT_PRIMITIVE m_outputPrimitive; + D3D11_TESSELLATOR_REDUCTION m_insideTessFactorReduction; + D3D11_TESSELLATOR_QUAD_REDUCTION_AXIS m_quadInsideTessFactorReductionAxis; + float m_LastComputedTessFactors[6]; // TessFactors used for last tessellation + float m_LastUnRoundedComputedTessFactors[6]; // TessFactors used for last tessellation (before they were rounded) + bool IntegerPartitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER) ? true : false;} + bool Pow2Partitioning() {return (m_partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)? true : false;} + void ClampTessFactor(float& TessFactor); + void RoundUpTessFactor(float& TessFactor); + void CleanupFloatTessFactor(float& input); // clamp float to [1.0f... +INF] (incl NaN->1.0f) + void ClampFloatTessFactorScale(float& input); // clamp float to [0.0f... +INF] (incl NaN->0.0f) + + // Tessellation parity control + bool Odd() {return (m_parity == TESSELLATOR_PARITY_ODD) ? true : false;} + void SetTessellationParity(TESSELLATOR_PARITY parity) {m_parity = parity;} + + // Tesselation Partitioning control + void RestorePartitioning() {m_partitioning = m_originalPartitioning;}; + void OverridePartitioning(D3D11_TESSELLATOR_PARTITIONING partitioning) {m_partitioning = partitioning;} //isoline uses this for density + + void IsoLineHLSLProcessTessFactors( float TessFactor_V_LineDensity, float TessFactor_U_LineDetail ); + void TriHLSLProcessTessFactors( float tessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Weq0, float insideTessFactor ); + void QuadHLSLProcessTessFactors( float TessFactor_Ueq0, float TessFactor_Veq0, float TessFactor_Ueq1, float TessFactor_Veq1, + float insideTessFactor_U, float insideTessFactor_V ); + +}; + diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/threads.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/threads.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/core/threads.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/core/threads.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -714,6 +714,9 @@ } } + if (pContext->pfnUpdateStreamOut) + pContext->pfnUpdateStreamOut(GetPrivateState(pDC), pDC->dynState.soPrims); + // Ensure all streaming writes are globally visible before marking this FE done _mm_mfence(); pDC->doneFE = true; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -423,7 +423,10 @@ legacy::PassManager* pMPasses = new legacy::PassManager(); auto* pTarget = mpExec->getTargetMachine(); pTarget->Options.MCOptions.AsmVerbose = true; -#if LLVM_VERSION_MAJOR >= 7 +#if LLVM_VERSION_MAJOR >= 10 + pTarget->addPassesToEmitFile( + *pMPasses, filestream, nullptr, CGFT_AssemblyFile); +#elif LLVM_VERSION_MAJOR >= 7 pTarget->addPassesToEmitFile( *pMPasses, filestream, nullptr, TargetMachine::CGFT_AssemblyFile); #else diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -34,9 +34,13 @@ #pragma warning(disable : 4146 4244 4267 4800 4996) #endif +#include + +#if LLVM_VERSION_MAJOR < 7 // llvm 3.7+ reuses "DEBUG" as an enum value #pragma push_macro("DEBUG") #undef DEBUG +#endif #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" @@ -45,10 +49,11 @@ #include "llvm/IR/Type.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" +#if LLVM_VERSION_MAJOR >= 10 +#include "llvm/IR/IntrinsicsX86.h" +#endif #include "llvm/ExecutionEngine/ObjectCache.h" -#include "llvm/Config/llvm-config.h" - #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/Support/FileSystem.h" @@ -133,7 +138,9 @@ } #endif +#if LLVM_VERSION_MAJOR < 7 #pragma pop_macro("DEBUG") +#endif #include #include diff -Nru mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/meson.build mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/meson.build --- mesa-19.2.8/src/gallium/drivers/swr/rasterizer/jitter/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/rasterizer/jitter/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -18,15 +18,21 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +if dep_llvm.type_name() == 'internal' + _irbuilder_h = subproject('llvm').get_variable('irbuilder_h') +else + if meson.version().version_compare('>=0.51') + _llvm_includedir = dep_llvm.get_variable(configtool : 'includedir', cmake : 'LLVM_INCLUDE_DIR') + else + _llvm_includedir = dep_llvm.get_configtool_variable('includedir') + endif + _irbuilder_h = join_paths(_llvm_includedir, 'llvm', 'IR', 'IRBuilder.h') +endif gen_builder_hpp = custom_target( 'gen_builder.hpp', input : [ - swr_gen_llvm_ir_macros_py, - join_paths( - dep_llvm.get_configtool_variable('includedir'), 'llvm', 'IR', - 'IRBuilder.h' - ) + swr_gen_llvm_ir_macros_py, _irbuilder_h, ], output : 'gen_builder.hpp', command : [ diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_context.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_context.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_context.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_context.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_atomic.h" #include "util/u_upload_mgr.h" #include "util/u_transfer.h" @@ -320,6 +320,8 @@ util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs); + util_blitter_save_tessctrl_shader(ctx->blitter, (void*)ctx->tcs); + util_blitter_save_tesseval_shader(ctx->blitter, (void*)ctx->tes); util_blitter_save_so_targets( ctx->blitter, ctx->num_so_targets, @@ -472,6 +474,18 @@ } } +static void +swr_UpdateStreamOut(HANDLE hPrivateContext, uint64_t numPrims) +{ + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + + if (!pDC) + return; + + if (pDC->soPrims) + *pDC->soPrims += numPrims; +} + struct pipe_context * swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) { @@ -496,6 +510,7 @@ createInfo.pfnStoreTile = swr_StoreHotTile; createInfo.pfnUpdateStats = swr_UpdateStats; createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE; + createInfo.pfnUpdateStreamOut = swr_UpdateStreamOut; createInfo.pfnMakeGfxPtr = swr_MakeGfxPtr; SWR_THREADING_INFO threadingInfo {0}; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_context.h mesa-20.0.8/src/gallium/drivers/swr/swr_context.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -53,7 +53,12 @@ #define SWR_NEW_FRAMEBUFFER (1 << 15) #define SWR_NEW_CLIP (1 << 16) #define SWR_NEW_SO (1 << 17) -#define SWR_LARGE_CLIENT_DRAW (1<<18) // Indicates client draw will block +#define SWR_BLOCK_CLIENT_DRAW ( 1 << 18) // Indicates client draw will block +#define SWR_NEW_TCS (1 << 19) +#define SWR_NEW_TES (1 << 20) +#define SWR_NEW_TS (1 << 21) +#define SWR_NEW_TCSCONSTANTS (1 << 22) +#define SWR_NEW_TESCONSTANTS (1 << 23) namespace std { @@ -91,6 +96,10 @@ uint32_t num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; const float *constantGS[PIPE_MAX_CONSTANT_BUFFERS]; uint32_t num_constantsGS[PIPE_MAX_CONSTANT_BUFFERS]; + const float *constantTCS[PIPE_MAX_CONSTANT_BUFFERS]; + uint32_t num_constantsTCS[PIPE_MAX_CONSTANT_BUFFERS]; + const float *constantTES[PIPE_MAX_CONSTANT_BUFFERS]; + uint32_t num_constantsTES[PIPE_MAX_CONSTANT_BUFFERS]; swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; @@ -98,6 +107,10 @@ swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; swr_jit_texture texturesGS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; swr_jit_sampler samplersGS[PIPE_MAX_SAMPLERS]; + swr_jit_texture texturesTCS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersTCS[PIPE_MAX_SAMPLERS]; + swr_jit_texture texturesTES[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersTES[PIPE_MAX_SAMPLERS]; float userClipPlanes[PIPE_MAX_CLIP_PLANES][4]; @@ -107,6 +120,8 @@ struct swr_query_result *pStats; // @llvm_struct SWR_INTERFACE *pAPI; // @llvm_struct - Needed for the swr_memory callbacks SWR_TILE_INTERFACE *pTileAPI; // @llvm_struct - Needed for the swr_memory callbacks + + uint64_t* soPrims; //number of primitives written to StreamOut buffer }; /* gen_llvm_types FINI */ @@ -116,6 +131,8 @@ HANDLE swrContext; + SWR_TS_STATE tsState; + /** Constant state objects */ struct swr_blend_state *blend; struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; @@ -125,6 +142,8 @@ struct swr_vertex_shader *vs; struct swr_fragment_shader *fs; struct swr_geometry_shader *gs; + struct swr_tess_control_shader *tcs; + struct swr_tess_evaluation_shader *tes; struct swr_vertex_element_state *velems; /** Other rendering state */ @@ -160,6 +179,7 @@ // streamout pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; uint32_t num_so_targets; + uint64_t so_primCounter; // number of primitives written to StreamOut buffer /* Temp storage for user_buffer constants */ struct swr_scratch_buffers *scratch; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_draw.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_draw.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_draw.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_draw.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -31,6 +31,8 @@ #include "util/u_draw.h" #include "util/u_prim.h" +#include +#include /* * Draw vertex arrays, with optional indexing, optional instancing. */ @@ -62,6 +64,16 @@ swr_update_draw_context(ctx); + struct pipe_draw_info resolved_info; + /* DrawTransformFeedback */ + if (info->count_from_stream_output) { + // trick copied from softpipe to modify const struct *info + memcpy(&resolved_info, (void*)info, sizeof(struct pipe_draw_info)); + resolved_info.count = ctx->so_primCounter * resolved_info.vertices_per_patch; + resolved_info.max_index = resolved_info.count - 1; + info = &resolved_info; + } + if (ctx->vs->pipe.stream_output.num_outputs) { if (!ctx->vs->soFunc[info->mode]) { STREAMOUT_COMPILE_STATE state = {0}; @@ -144,16 +156,22 @@ // between all the shader stages, so it has to be large enough to // incorporate all interfaces between stages - // max of gs and vs num_outputs + // max of frontend shaders num_outputs feState.vsVertexSize = ctx->vs->info.base.num_outputs; - if (ctx->gs && - ctx->gs->info.base.num_outputs > feState.vsVertexSize) { - feState.vsVertexSize = ctx->gs->info.base.num_outputs; + if (ctx->gs) { + feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->gs->info.base.num_outputs); + } + if (ctx->tcs) { + feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tcs->info.base.num_outputs); } + if (ctx->tes) { + feState.vsVertexSize = std::max(feState.vsVertexSize, (uint32_t)ctx->tes->info.base.num_outputs); + } + if (ctx->vs->info.base.num_outputs) { // gs does not adjust for position in SGV slot at input from vs - if (!ctx->gs) + if (!ctx->gs && !ctx->tcs && !ctx->tes) feState.vsVertexSize--; } @@ -170,7 +188,6 @@ if (ctx->rasterizer->sprite_coord_enable) feState.vsVertexSize++; - if (ctx->rasterizer->flatshade_first) { feState.provokingVertex = {1, 0, 0}; } else { @@ -212,7 +229,7 @@ if (info->index_size) ctx->api.pfnSwrDrawIndexedInstanced(ctx->swrContext, - swr_convert_prim_topology(info->mode), + swr_convert_prim_topology(info->mode, info->vertices_per_patch), info->count, info->instance_count, info->start, @@ -220,16 +237,16 @@ info->start_instance); else ctx->api.pfnSwrDrawInstanced(ctx->swrContext, - swr_convert_prim_topology(info->mode), + swr_convert_prim_topology(info->mode, info->vertices_per_patch), info->count, info->instance_count, info->start, info->start_instance); - /* On large client-buffer draw, we used client buffer directly, without + /* On client-buffer draw, we used client buffer directly, without * copy. Block until draw is finished. * VMD is an example application that benefits from this. */ - if (ctx->dirty & SWR_LARGE_CLIENT_DRAW) { + if (ctx->dirty & SWR_BLOCK_CLIENT_DRAW) { struct swr_screen *screen = swr_screen(pipe->screen); swr_fence_submit(ctx, screen->flush_fence); swr_fence_finish(pipe->screen, NULL, screen->flush_fence, 0); diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_fence_work.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_fence_work.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_fence_work.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_fence_work.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -83,7 +83,7 @@ { if (aligned_free) AlignedFree(work->free.data); - else + else FREE(work->free.data); } @@ -105,6 +105,19 @@ delete work->free.swr_gs; } +static void +swr_delete_tcs_cb(struct swr_fence_work *work) +{ + delete work->free.swr_tcs; +} + +static void +swr_delete_tes_cb(struct swr_fence_work *work) +{ + delete work->free.swr_tes; +} + + bool swr_fence_work_free(struct pipe_fence_handle *fence, void *data, bool aligned_free) @@ -167,3 +180,34 @@ return true; } + +bool +swr_fence_work_delete_tcs(struct pipe_fence_handle *fence, + struct swr_tess_control_shader *swr_tcs) +{ + struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); + if (!work) + return false; + work->callback = swr_delete_tcs_cb; + work->free.swr_tcs = swr_tcs; + + swr_add_fence_work(fence, work); + + return true; +} + + +bool +swr_fence_work_delete_tes(struct pipe_fence_handle *fence, + struct swr_tess_evaluation_shader *swr_tes) +{ + struct swr_fence_work *work = CALLOC_STRUCT(swr_fence_work); + if (!work) + return false; + work->callback = swr_delete_tes_cb; + work->free.swr_tes = swr_tes; + + swr_add_fence_work(fence, work); + + return true; +} \ No newline at end of file diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_fence_work.h mesa-20.0.8/src/gallium/drivers/swr/swr_fence_work.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_fence_work.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_fence_work.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,8 @@ struct swr_vertex_shader *swr_vs; struct swr_fragment_shader *swr_fs; struct swr_geometry_shader *swr_gs; + struct swr_tess_control_shader *swr_tcs; + struct swr_tess_evaluation_shader *swr_tes; } free; struct swr_fence_work *next; @@ -47,4 +49,8 @@ struct swr_fragment_shader *swr_vs); bool swr_fence_work_delete_gs(struct pipe_fence_handle *fence, struct swr_geometry_shader *swr_gs); +bool swr_fence_work_delete_tcs(struct pipe_fence_handle *fence, + struct swr_tess_control_shader *swr_tcs); +bool swr_fence_work_delete_tes(struct pipe_fence_handle *fence, + struct swr_tess_evaluation_shader *swr_tes); #endif diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_loader.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_loader.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_loader.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_loader.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -36,9 +36,9 @@ #ifdef HAVE_SWR_BUILTIN screen->pLibrary = NULL; screen->pfnSwrGetInterface = SwrGetInterface; - screen->pfnSwrGetInterface = SwrGetTileInterface; + screen->pfnSwrGetTileInterface = SwrGetTileIterface; InitTilesTable(); - fprintf(stderr, "(using: builtin).\n"); + swr_print_info("(using: builtin).\n"); #else char filename[256] = { 0 }; sprintf(filename, "%sswr%s%s", UTIL_DL_PREFIX, arch, UTIL_DL_EXT); @@ -71,8 +71,9 @@ pInitFunc(); - fprintf(stderr, "(using: %s).\n", filename); + swr_print_info("(using: %s).\n", filename); #endif + return true; } @@ -91,9 +92,9 @@ util_cpu_detect(); if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) { - fprintf(stderr, "SWR detected KNL instruction support "); + swr_print_info("SWR detected KNL instruction support "); #ifndef HAVE_SWR_KNL - fprintf(stderr, "(skipping: not built).\n"); + swr_print_info("(skipping: not built).\n"); #else if (swr_initialize_screen_interface(screen, "KNL")) { screen->is_knl = true; @@ -103,9 +104,9 @@ } if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) { - fprintf(stderr, "SWR detected SKX instruction support "); + swr_print_info("SWR detected SKX instruction support "); #ifndef HAVE_SWR_SKX - fprintf(stderr, "(skipping not built).\n"); + swr_print_info("(skipping not built).\n"); #else if (swr_initialize_screen_interface(screen, "SKX")) return p_screen; @@ -113,9 +114,9 @@ } if (util_cpu_caps.has_avx2) { - fprintf(stderr, "SWR detected AVX2 instruction support "); + swr_print_info("SWR detected AVX2 instruction support "); #ifndef HAVE_SWR_AVX2 - fprintf(stderr, "(skipping not built).\n"); + swr_print_info("(skipping not built).\n"); #else if (swr_initialize_screen_interface(screen, "AVX2")) return p_screen; @@ -123,9 +124,9 @@ } if (util_cpu_caps.has_avx) { - fprintf(stderr, "SWR detected AVX instruction support "); + swr_print_info("SWR detected AVX instruction support "); #ifndef HAVE_SWR_AVX - fprintf(stderr, "(skipping not built).\n"); + swr_print_info("(skipping not built).\n"); #else if (swr_initialize_screen_interface(screen, "AVX")) return p_screen; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_scratch.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_scratch.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_scratch.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_scratch.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -94,6 +94,8 @@ AlignedFree(scratch->vs_constants.base); AlignedFree(scratch->fs_constants.base); AlignedFree(scratch->gs_constants.base); + AlignedFree(scratch->tcs_constants.base); + AlignedFree(scratch->tes_constants.base); AlignedFree(scratch->vertex_buffer.base); AlignedFree(scratch->index_buffer.base); FREE(scratch); diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_scratch.h mesa-20.0.8/src/gallium/drivers/swr/swr_scratch.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_scratch.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_scratch.h 2020-06-12 01:21:17.000000000 +0000 @@ -36,6 +36,8 @@ struct swr_scratch_space vs_constants; struct swr_scratch_space fs_constants; struct swr_scratch_space gs_constants; + struct swr_scratch_space tcs_constants; + struct swr_scratch_space tes_constants; struct swr_scratch_space vertex_buffer; struct swr_scratch_space index_buffer; }; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_screen.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_screen.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_screen.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_screen.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -31,10 +31,10 @@ #include "pipe/p_screen.h" #include "pipe/p_defines.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_cpu_detect.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format_s3tc.h" #include "util/u_string.h" #include "util/u_screen.h" @@ -69,8 +69,7 @@ swr_get_name(struct pipe_screen *screen) { static char buf[100]; - snprintf(buf, sizeof(buf), "SWR (LLVM %u.%u, %u bits)", - HAVE_LLVM >> 8, HAVE_LLVM & 0xff, + snprintf(buf, sizeof(buf), "SWR (LLVM " MESA_LLVM_VERSION_STRING ", %u bits)", lp_native_vector_width); return buf; } @@ -142,8 +141,9 @@ return false; } - if (format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC || - format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { + if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC || + format_desc->layout == UTIL_FORMAT_LAYOUT_FXT1) + { return false; } @@ -192,7 +192,7 @@ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: return 1024; case PIPE_CAP_MAX_VERTEX_STREAMS: - return 1; + return 4; case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: @@ -275,6 +275,9 @@ case PIPE_CAP_DOUBLES: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE: + case PIPE_CAP_QUERY_SO_OVERFLOW: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: return 1; /* MSAA support @@ -345,7 +348,6 @@ case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: case PIPE_CAP_TGSI_ARRAY_COMPONENTS: case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: case PIPE_CAP_NATIVE_FENCE_FD: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: case PIPE_CAP_FBFETCH: @@ -363,7 +365,6 @@ case PIPE_CAP_POST_DEPTH_COVERAGE: case PIPE_CAP_BINDLESS_TEXTURE: case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: case PIPE_CAP_MEMOBJ: case PIPE_CAP_LOAD_CONSTBUF: case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: @@ -420,10 +421,13 @@ { if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT || - shader == PIPE_SHADER_GEOMETRY) + shader == PIPE_SHADER_GEOMETRY + || shader == PIPE_SHADER_TESS_CTRL || + shader == PIPE_SHADER_TESS_EVAL + ) return gallivm_get_shader_param(param); - // Todo: tesselation, compute + // Todo: compute return 0; } @@ -782,7 +786,7 @@ * surface sample count. */ if (screen->msaa_force_enable) { res->swr.numSamples = screen->msaa_max_count; - fprintf(stderr,"swr_texture_layout: forcing sample count: %d\n", + swr_print_info("swr_texture_layout: forcing sample count: %d\n", res->swr.numSamples); } } else { @@ -1122,7 +1126,7 @@ struct swr_screen *screen = swr_screen(p_screen); struct sw_winsys *winsys = screen->winsys; - fprintf(stderr, "SWR destroy screen!\n"); + swr_print_info("SWR destroy screen!\n"); if (winsys->destroy) winsys->destroy(winsys); @@ -1156,12 +1160,11 @@ fprintf(stderr, "must be power of 2 between 1 and %d" \ " (or 1 to disable msaa)\n", SWR_MAX_NUM_MULTISAMPLES); + fprintf(stderr, "(msaa disabled)\n"); msaa_max_count = 1; } - fprintf(stderr, "SWR_MSAA_MAX_COUNT: %d\n", msaa_max_count); - if (msaa_max_count == 1) - fprintf(stderr, "(msaa disabled)\n"); + swr_print_info("SWR_MSAA_MAX_COUNT: %d\n", msaa_max_count); screen->msaa_max_count = msaa_max_count; } @@ -1169,7 +1172,7 @@ screen->msaa_force_enable = debug_get_bool_option( "SWR_MSAA_FORCE_ENABLE", false); if (screen->msaa_force_enable) - fprintf(stderr, "SWR_MSAA_FORCE_ENABLE: true\n"); + swr_print_info("SWR_MSAA_FORCE_ENABLE: true\n"); } diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_screen.h mesa-20.0.8/src/gallium/drivers/swr/swr_screen.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,11 +29,13 @@ #include "pipe/p_screen.h" #include "pipe/p_defines.h" #include "util/u_dl.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "api.h" #include "memory/TilingFunctions.h" #include "memory/InitMemory.h" +#include +#include struct sw_winsys; @@ -70,4 +72,15 @@ SWR_FORMAT mesa_to_swr_format(enum pipe_format format); +static void swr_print_info(const char *format, ...) +{ + static bool print_info = debug_get_bool_option("SWR_PRINT_INFO", false); + if(print_info) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } +} + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_shader.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_shader.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_shader.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_shader.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -21,14 +21,22 @@ * IN THE SOFTWARE. ***************************************************************************/ +#include + +#if LLVM_VERSION_MAJOR < 7 // llvm redefines DEBUG #pragma push_macro("DEBUG") #undef DEBUG +#endif + #include "JitManager.h" #include "llvm-c/Core.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/IR/LegacyPassManager.h" + +#if LLVM_VERSION_MAJOR < 7 #pragma pop_macro("DEBUG") +#endif #include "state.h" #include "gen_state_llvm.h" @@ -36,12 +44,14 @@ #include "functionpasses/passes.h" #include "tgsi/tgsi_strings.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_prim.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_struct.h" #include "gallivm/lp_bld_tgsi.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_printf.h" #include "swr_context.h" #include "gen_surf_state_llvm.h" @@ -50,8 +60,25 @@ #include "swr_state.h" #include "swr_screen.h" + +///////////////////////////////////////////////////////////////////////// + +#include +#include + +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "util/u_string.h" + +#include "gallivm/lp_bld_type.h" + +#ifdef DEBUG +constexpr bool verbose_shader = true; +#else +constexpr bool verbose_shader = false; +#endif + using namespace SwrJit; -using namespace llvm; static unsigned locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info); @@ -76,6 +103,17 @@ return !memcmp(&lhs, &rhs, sizeof(lhs)); } +bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs) +{ + return !memcmp(&lhs, &rhs, sizeof(lhs)); +} + +bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs) +{ + return !memcmp(&lhs, &rhs, sizeof(lhs)); +} + + static void swr_generate_sampler_key(const struct lp_tgsi_info &info, struct swr_context *ctx, @@ -151,6 +189,8 @@ struct tgsi_shader_info *pPrevShader; if (ctx->gs) pPrevShader = &ctx->gs->info.base; + else if (ctx->tes) + pPrevShader = &ctx->tes->info.base; else pPrevShader = &ctx->vs->info.base; @@ -198,7 +238,13 @@ { memset(&key, 0, sizeof(key)); - struct tgsi_shader_info *pPrevShader = &ctx->vs->info.base; + struct tgsi_shader_info *pPrevShader = nullptr; + + if (ctx->tes) { + pPrevShader = &ctx->tes->info.base; + } else { + pPrevShader = &ctx->vs->info.base; + } memcpy(&key.vs_output_semantic_name, &pPrevShader->output_semantic_name, @@ -210,6 +256,63 @@ swr_generate_sampler_key(swr_gs->info, ctx, PIPE_SHADER_GEOMETRY, key); } +void +swr_generate_tcs_key(struct swr_jit_tcs_key &key, + struct swr_context *ctx, + swr_tess_control_shader *swr_tcs) +{ + memset(&key, 0, sizeof(key)); + + struct tgsi_shader_info *pPrevShader = &ctx->vs->info.base; + + memcpy(&key.vs_output_semantic_name, + &pPrevShader->output_semantic_name, + sizeof(key.vs_output_semantic_name)); + memcpy(&key.vs_output_semantic_idx, + &pPrevShader->output_semantic_index, + sizeof(key.vs_output_semantic_idx)); + + key.clip_plane_mask = + swr_tcs->info.base.clipdist_writemask ? + swr_tcs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : + ctx->rasterizer->clip_plane_enable; + + swr_generate_sampler_key(swr_tcs->info, ctx, PIPE_SHADER_TESS_CTRL, key); +} + +void +swr_generate_tes_key(struct swr_jit_tes_key &key, + struct swr_context *ctx, + swr_tess_evaluation_shader *swr_tes) +{ + memset(&key, 0, sizeof(key)); + + struct tgsi_shader_info *pPrevShader = nullptr; + + if (ctx->tcs) { + pPrevShader = &ctx->tcs->info.base; + } + else { + pPrevShader = &ctx->vs->info.base; + } + + SWR_ASSERT(pPrevShader != nullptr, "TES: No TCS or VS defined"); + + memcpy(&key.prev_output_semantic_name, + &pPrevShader->output_semantic_name, + sizeof(key.prev_output_semantic_name)); + memcpy(&key.prev_output_semantic_idx, + &pPrevShader->output_semantic_index, + sizeof(key.prev_output_semantic_idx)); + + key.clip_plane_mask = + swr_tes->info.base.clipdist_writemask ? + swr_tes->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : + ctx->rasterizer->clip_plane_enable; + + swr_generate_sampler_key(swr_tes->info, ctx, PIPE_SHADER_TESS_EVAL, key); +} + struct BuilderSWR : public Builder { BuilderSWR(JitManager *pJitMgr, const char *pName) : Builder(pJitMgr) @@ -230,37 +333,97 @@ PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key); PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key); PFN_GS_FUNC CompileGS(struct swr_context *ctx, swr_jit_gs_key &key); + PFN_TCS_FUNC CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key); + PFN_TES_FUNC CompileTES(struct swr_context *ctx, swr_jit_tes_key &key); + // GS-specific emit functions LLVMValueRef - swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, + swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, boolean is_vindex_indirect, LLVMValueRef vertex_index, boolean is_aindex_indirect, LLVMValueRef attrib_index, LLVMValueRef swizzle_index); void - swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, + swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec); + LLVMValueRef emitted_vertices_vec, + LLVMValueRef stream_id); void - swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, + swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, + LLVMValueRef total_emitted_vertices_vec_ptr, LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec); + LLVMValueRef emitted_prims_vec, + LLVMValueRef mask_vec); void - swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, + swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, LLVMValueRef total_emitted_vertices_vec, LLVMValueRef emitted_prims_vec); + // TCS-specific emit functions + void swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld); + void swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld); + + LLVMValueRef + swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); + + LLVMValueRef + swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + uint32_t name); + + void + swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context * bld_base, + unsigned name, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + LLVMValueRef value); + + // Barrier implementation (available only in TCS) + void + swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context *bld_base); + + // TES-specific emit functions + LLVMValueRef + swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); + + LLVMValueRef + swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index); }; struct swr_gs_llvm_iface { - struct lp_build_tgsi_gs_iface base; + struct lp_build_gs_iface base; struct tgsi_shader_info *info; BuilderSWR *pBuilder; @@ -273,10 +436,43 @@ Value *pVtxAttribMap; }; +struct swr_tcs_llvm_iface { + struct lp_build_tcs_iface base; + struct tgsi_shader_info *info; + + BuilderSWR *pBuilder; + + Value *pTcsCtx; + SWR_TS_STATE *pTsState; + + uint32_t output_vertices; + + struct lp_build_for_loop_state loop_state; + + Value *pVtxAttribMap; + Value *pVtxOutputAttribMap; + Value *pPatchOutputAttribMap; +}; + +struct swr_tes_llvm_iface { + struct lp_build_tes_iface base; + struct tgsi_shader_info *info; + + BuilderSWR *pBuilder; + + Value *pTesCtx; + SWR_TS_STATE *pTsState; + + uint32_t num_outputs; + + Value *pVtxAttribMap; + Value *pPatchAttribMap; +}; + // trampoline functions so we can use the builder llvm construction methods static LLVMValueRef -swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, +swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, boolean is_vindex_indirect, LLVMValueRef vertex_index, boolean is_aindex_indirect, @@ -285,7 +481,7 @@ { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_iface; - return iface->pBuilder->swr_gs_llvm_fetch_input(gs_iface, bld_base, + return iface->pBuilder->swr_gs_llvm_fetch_input(gs_iface, bld, is_vindex_indirect, vertex_index, is_aindex_indirect, @@ -294,47 +490,183 @@ } static void -swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec) + LLVMValueRef emitted_vertices_vec, + LLVMValueRef stream_id) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - iface->pBuilder->swr_gs_llvm_emit_vertex(gs_base, bld_base, + iface->pBuilder->swr_gs_llvm_emit_vertex(gs_base, bld, outputs, - emitted_vertices_vec); + emitted_vertices_vec, + stream_id); } static void -swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, + LLVMValueRef total_emitted_vertices_vec_ptr, LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec) + LLVMValueRef emitted_prims_vec, + LLVMValueRef mask_vec) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - iface->pBuilder->swr_gs_llvm_end_primitive(gs_base, bld_base, + iface->pBuilder->swr_gs_llvm_end_primitive(gs_base, bld, + total_emitted_vertices_vec_ptr, verts_per_prim_vec, - emitted_prims_vec); + emitted_prims_vec, + mask_vec); } static void -swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, LLVMValueRef total_emitted_vertices_vec, LLVMValueRef emitted_prims_vec) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; - iface->pBuilder->swr_gs_llvm_epilogue(gs_base, bld_base, + iface->pBuilder->swr_gs_llvm_epilogue(gs_base, total_emitted_vertices_vec, emitted_prims_vec); } +static LLVMValueRef +swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; + + return iface->pBuilder->swr_tcs_llvm_fetch_input(tcs_iface, bld_base, + is_vindex_indirect, + vertex_index, + is_aindex_indirect, + attrib_index, + swizzle_index); +} + +static LLVMValueRef +swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + uint32_t name) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; + + return iface->pBuilder->swr_tcs_llvm_fetch_output(tcs_iface, bld_base, + is_vindex_indirect, + vertex_index, + is_aindex_indirect, + attrib_index, + swizzle_index, + name); +} + + +static void +swr_tcs_llvm_emit_prologue(struct lp_build_context* bld) +{ + lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; + iface->pBuilder->swr_tcs_llvm_emit_prologue(bld_base); +} + +static void +swr_tcs_llvm_emit_epilogue(struct lp_build_context* bld) +{ + lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; + iface->pBuilder->swr_tcs_llvm_emit_epilogue(bld_base); +} + +static +void swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_context * bld, + unsigned name, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + LLVMValueRef value) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; + + iface->pBuilder->swr_tcs_llvm_store_output(tcs_iface, + bld_base, + name, + is_vindex_indirect, + vertex_index, + is_aindex_indirect, + attrib_index, + swizzle_index, + value); +} + + +static +void swr_tcs_llvm_emit_barrier(struct lp_build_context *bld) +{ + lp_build_tgsi_soa_context* bld_base = (lp_build_tgsi_soa_context*)bld; + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld_base->tcs_iface; + + iface->pBuilder->swr_tcs_llvm_emit_barrier(bld_base->tcs_iface, &bld_base->bld_base); +} + + +static LLVMValueRef +swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_context * bld, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; + struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; + + return iface->pBuilder->swr_tes_llvm_fetch_vtx_input(tes_iface, bld_base, + is_vindex_indirect, + vertex_index, + is_aindex_indirect, + attrib_index, + swizzle_index); +} + +static LLVMValueRef +swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_context * bld, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; + struct lp_build_tgsi_context *bld_base = (struct lp_build_tgsi_context*)bld; + + return iface->pBuilder->swr_tes_llvm_fetch_patch_input(tes_iface, bld_base, + is_aindex_indirect, + attrib_index, + swizzle_index); +} + LLVMValueRef -BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_iface, - struct lp_build_tgsi_context * bld_base, +BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_gs_iface *gs_iface, + struct lp_build_context * bld, boolean is_vindex_indirect, LLVMValueRef vertex_index, boolean is_aindex_indirect, @@ -349,8 +681,8 @@ if (is_vindex_indirect || is_aindex_indirect) { int i; - Value *res = unwrap(bld_base->base.zero); - struct lp_type type = bld_base->base.type; + Value *res = unwrap(bld->zero); + struct lp_type type = bld->type; for (i = 0; i < type.length; i++) { Value *vert_chan_index = vert_index; @@ -396,15 +728,15 @@ #define CONTROL_HEADER_SIZE (8*32) void -BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, +BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, LLVMValueRef (*outputs)[4], - LLVMValueRef emitted_vertices_vec) + LLVMValueRef emitted_vertices_vec, + LLVMValueRef stream_id) { swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE; const uint32_t attribSize = 4 * sizeof(float); const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS; @@ -437,149 +769,1221 @@ } } - Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ? + Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ? + + for (uint32_t lane = 0; lane < mVWidth; ++lane) { + Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane)); + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + Value *pStreamOffset = GEP(pStream, pLaneOffset); + pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy); + + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + + for (uint32_t channel = 0; channel < 4; ++channel) { + Value *vData; + + if (attribSlot == VERTEX_SGV_SLOT) + vData = LOAD(unwrap(outputs[attrib][0])); + else + vData = LOAD(unwrap(outputs[attrib][channel])); + + if (attribSlot != VERTEX_SGV_SLOT || + sgvChannel == channel) { + vData = VEXTRACT(vData, C(lane)); + STORE(vData, pStreamOffset); + } + pStreamOffset = GEP(pStreamOffset, C(1)); + } + } + } + + /* When the output type is not points, the geometry shader may not + * output data to multiple streams. So early exit here. + */ + if(iface->pGsState->outputTopology != TOP_POINT_LIST) { + STACKRESTORE(pStack); + return; + } + + // Info about stream id for each vertex + // is coded in 2 bits (4 vert per byte "box"): + // ----------------- ----------------- ---- + // |d|d|c|c|b|b|a|a| |h|h|g|g|f|f|e|e| |... + // ----------------- ----------------- ---- + + // Calculate where need to put stream id for current vert + // in 1 byte "box". + Value *pShiftControl = MUL(unwrap(emitted_vertices_vec), VIMMED1(2)); + + // Calculate in which box put stream id for current vert. + Value *pOffsetControl = LSHR(unwrap(emitted_vertices_vec), VIMMED1(2)); + + // Skip count header + Value *pStreamIdOffset = ADD(pOffsetControl, VIMMED1(VERTEX_COUNT_SIZE)); + + for (uint32_t lane = 0; lane < mVWidth; ++lane) { + Value *pShift = TRUNC(VEXTRACT(pShiftControl, C(lane)), mInt8Ty); + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + + Value *pStreamOffset = GEP(pStream, VEXTRACT(pStreamIdOffset, C(lane))); + + // Just make sure that not overflow max - stream id = (0,1,2,3) + Value *vVal = TRUNC(AND(VEXTRACT(unwrap(stream_id), C(0)), C(0x3)), mInt8Ty); + + // Shift it to correct position in byte "box" + vVal = SHL(vVal, pShift); + + // Info about other vertices can be already stored + // so we need to read and add bits from current vert info. + Value *storedValue = LOAD(pStreamOffset); + vVal = OR(storedValue, vVal); + STORE(vVal, pStreamOffset); + } + + STACKRESTORE(pStack); +} + +void +BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_gs_iface *gs_base, + struct lp_build_context * bld, + LLVMValueRef total_emitted_vertices_vec, + LLVMValueRef verts_per_prim_vec, + LLVMValueRef emitted_prims_vec, + LLVMValueRef mask_vec) +{ + swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; + + /* When the output type is points, the geometry shader may output data + * to multiple streams, and end_primitive has no effect. Info about + * stream id for vertices is stored into the same place in memory where + * end primitive info is stored so early exit in this case. + */ + if (iface->pGsState->outputTopology == TOP_POINT_LIST) { + return; + } + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); + Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); + + uint32_t vertsPerPrim = iface->num_verts_per_prim; + + Value *vCount = + ADD(MUL(unwrap(emitted_prims_vec), VIMMED1(vertsPerPrim)), + unwrap(verts_per_prim_vec)); + + vCount = unwrap(total_emitted_vertices_vec); + + Value *mask = unwrap(mask_vec); + Value *cmpMask = VMASK(ICMP_NE(unwrap(verts_per_prim_vec), VIMMED1(0))); + mask = AND(mask, cmpMask); + vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8)); + + vCount = SUB(vCount, VIMMED1(1)); + Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE)); + Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8))); + + vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8)); + + Value *pStack = STACKSAVE(); + Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking + + for (uint32_t lane = 0; lane < mVWidth; ++lane) { + Value *vLaneOffset = VEXTRACT(vOffset, C(lane)); + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + Value *pStreamOffset = GEP(pStream, vLaneOffset); + + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + + Value *vVal = LOAD(pStreamOffset); + vVal = OR(vVal, VEXTRACT(vValue, C(lane))); + STORE(vVal, pStreamOffset); + } + + STACKRESTORE(pStack); +} + +void +BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_gs_iface *gs_base, + LLVMValueRef total_emitted_vertices_vec, + LLVMValueRef emitted_prims_vec) +{ + swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + // Store emit count to each output stream in the first DWORD + for (uint32_t lane = 0; lane < mVWidth; ++lane) + { + Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); + pStream = BITCAST(pStream, mInt32PtrTy); + Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane)); + STORE(pLaneCount, pStream); + } +} + +void +BuilderSWR::swr_tcs_llvm_emit_prologue(struct lp_build_tgsi_soa_context* bld) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld->tcs_iface; + + // Iterate for all the vertices in the output patch + lp_build_for_loop_begin(&iface->loop_state, gallivm, + lp_build_const_int32(gallivm, 0), + LLVMIntULT, + lp_build_const_int32(gallivm, iface->output_vertices), + lp_build_const_int32(gallivm, 1)); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + bld->system_values.invocation_id = wrap(VBROADCAST(unwrap(iface->loop_state.counter))); + + if (verbose_shader) { + lp_build_printf(gallivm, "Prologue LOOP: Iteration %d BEGIN\n", iface->loop_state.counter); + lp_build_print_value(gallivm, "LOOP: InvocationId: \n", bld->system_values.invocation_id); + } +} + +void +BuilderSWR::swr_tcs_llvm_emit_epilogue(struct lp_build_tgsi_soa_context* bld) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)bld->tcs_iface; + + if (verbose_shader) { + lp_build_printf(gallivm, "Epilogue LOOP: Iteration %d END\n", iface->loop_state.counter); + } + lp_build_for_loop_end(&iface->loop_state); +} + +LLVMValueRef +BuilderSWR::swr_tcs_llvm_fetch_input(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + Value *vert_index = unwrap(vertex_index); + Value *attr_index = unwrap(attrib_index); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS: Vertex index: ", vertex_index); + lp_build_print_value(gallivm, "TCS: Attrib index: ", attrib_index); + lp_build_print_value(gallivm, "TCS: Swizzle index: ", swizzle_index); + } + + if (is_vindex_indirect) { + vert_index = VEXTRACT(vert_index, C(0)); + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS: Extracted vertex index: ", vertex_index); + } + } + + if (is_aindex_indirect) { + attr_index = VEXTRACT(attr_index, C(0)); + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS: Extracted attrib index: ", attrib_index); + } + } + + Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS: Attrib index loaded from map: ", wrap(attrib)); + } + + Value *pBase = GEP(iface->pTcsCtx, + { C(0), C(SWR_HS_CONTEXT_vert), vert_index, + C(simdvertex_attrib), attrib /*attr_index*/, unwrap(swizzle_index) }); + + LLVMValueRef res = wrap(LOAD(pBase)); + + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS input fetched: ", res); + } + return res; +} + +LLVMValueRef +BuilderSWR::swr_tcs_llvm_fetch_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + uint32_t name) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + + Value *vert_index = unwrap(vertex_index); + Value *attr_index = unwrap(attrib_index); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_print_value(gallivm, "++TCSo: Vertex index: ", vertex_index); + lp_build_print_value(gallivm, "++TCSo: Attrib index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "++TCSo: Swizzle index: ", swizzle_index); + } + + if (is_vindex_indirect) { + vert_index = VEXTRACT(vert_index, C(0)); + if (verbose_shader) + { + lp_build_print_value(gallivm, "TCSo: Extracted vertex index: ", vertex_index); + } + } + + if (is_aindex_indirect) { + attr_index = VEXTRACT(attr_index, C(0)); + if (verbose_shader) { + lp_build_print_value(gallivm, "TCSo: Extracted attrib index: ", attrib_index); + } + } + + Value* res = unwrap(bld_base->base.zero); + + for (uint32_t lane = 0; lane < mVWidth; lane++) { + Value* p1 = LOAD(iface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout}); + Value* pCpOut = GEP(p1, {lane}); + + if (name == TGSI_SEMANTIC_TESSOUTER || name == TGSI_SEMANTIC_TESSINNER) { + + Value* tessFactors = GEP(pCpOut, {(uint32_t)0, ScalarPatch_tessFactors}); + Value* tessFactorArray = nullptr; + if (name == TGSI_SEMANTIC_TESSOUTER) { + tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors}); + } else { + tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors}); + } + Value* tessFactor = GEP(tessFactorArray, {C(0), unwrap(swizzle_index)}); + res = VINSERT(res, LOAD(tessFactor), C(lane)); + + } else if (name == TGSI_SEMANTIC_PATCH) { + lp_build_print_value(gallivm, "bbbbb TCS per-patch attr_index: ", wrap(attr_index)); + Value* attr = GEP(pCpOut, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), attr_index, unwrap(swizzle_index)}); + res = VINSERT(res, LOAD(attr), C(lane)); + if (verbose_shader) { + lp_build_print_value(gallivm, "++TCSo per-patch lane (patch-id): ", wrap(C(lane))); + lp_build_print_value(gallivm, "++TCSo per-patch loaded value: ", wrap(res)); + } + } else { + // Generic attribute + Value *attrib = + LOAD(GEP(iface->pVtxOutputAttribMap, {C(0), attr_index})); + if (verbose_shader) + { + lp_build_print_value(gallivm, "TCSo: Attrib index from map: ", wrap(attrib)); + } + Value* attr_chan = GEP(pCpOut, {C(0), C(ScalarPatch_cp), vert_index, + C(ScalarCPoint_attrib), attrib, unwrap(swizzle_index)}); + + res = VINSERT(res, LOAD(attr_chan), C(lane)); + } + } + + if (verbose_shader) { + lp_build_print_value(gallivm, "TCSo: output fetched: ", wrap(res)); + } + return wrap(res); +} + +void +BuilderSWR::swr_tcs_llvm_store_output(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context *bld_base, + unsigned name, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index, + LLVMValueRef value) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base; + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_printf(gallivm, "[TCS OUT] =============================================\n"); + } + + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT] Store mask: ", bld->exec_mask.exec_mask); + lp_build_print_value(gallivm, "[TCS OUT] Store value: ", value); + } + + Value *vert_index = unwrap(vertex_index); + Value *attr_index = unwrap(attrib_index); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT] Vertex index: ", vertex_index); + lp_build_print_value(gallivm, "[TCS OUT] Attrib index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TCS OUT] Swizzle index: ", swizzle_index); + } + + if (is_vindex_indirect) { + vert_index = VEXTRACT(vert_index, C(0)); + if (verbose_shader) + { + lp_build_print_value(gallivm, "[TCS OUT] Extracted vertex index: ", vertex_index); + } + } + + if (is_aindex_indirect) { + attr_index = VEXTRACT(attr_index, C(0)); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT] Extracted attrib index: ", wrap(attr_index)); + } + } + + for (uint32_t lane = 0; lane < mVWidth; lane++) { + Value* p1 = LOAD(iface->pTcsCtx, {0, SWR_HS_CONTEXT_pCPout}); + Value* pCpOut = GEP(p1, {lane}); + + if (name == TGSI_SEMANTIC_TESSOUTER || name == TGSI_SEMANTIC_TESSINNER) { + Value* tessFactors = GEP(pCpOut, {(uint32_t)0, ScalarPatch_tessFactors}); + Value* tessFactorArray = nullptr; + if (name == TGSI_SEMANTIC_TESSOUTER) { + tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_OuterTessFactors}); + } else { + tessFactorArray = GEP(tessFactors, {(uint32_t)0, SWR_TESSELLATION_FACTORS_InnerTessFactors}); + } + Value* tessFactor = GEP(tessFactorArray, {C(0), unwrap(swizzle_index)}); + Value* valueToStore = VEXTRACT(unwrap(value), C(lane)); + struct lp_exec_mask *mask = &bld->exec_mask; + if (mask->has_mask) { + Value *originalVal = LOAD(tessFactor); + Value *vMask = TRUNC(VEXTRACT(unwrap(mask->exec_mask), C(lane)), mInt1Ty); + valueToStore = SELECT(vMask, valueToStore, originalVal); + } + STORE(valueToStore, tessFactor); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][FACTOR] Stored value: ", wrap(valueToStore)); + } + } else if (name == TGSI_SEMANTIC_PATCH) { + Value* attrib = LOAD(GEP(iface->pPatchOutputAttribMap, {C(0), attr_index})); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index: ", wrap(vert_index)); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] vert_index_indirect: ", wrap(C(is_vindex_indirect))); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr_index_indirect: ", wrap(C(is_aindex_indirect))); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] attr index loaded from map: ", wrap(attrib)); + } + Value* attr = GEP(pCpOut, {C(0), C(ScalarPatch_patchData), C(ScalarCPoint_attrib), attrib}); + Value* value_to_store = VEXTRACT(unwrap(value), C(lane)); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][PATCH] lane (patch-id): ", wrap(C(lane))); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] value to store: ", value); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] per-patch value to store: ", wrap(value_to_store)); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] chan_index: ", swizzle_index); + } + struct lp_exec_mask *mask = &bld->exec_mask; + if (mask->has_mask) { + Value *originalVal = LOADV(attr, {C(0), unwrap(swizzle_index)}); + Value *vMask = TRUNC(VEXTRACT(unwrap(mask->exec_mask), C(lane)), mInt1Ty); + value_to_store = SELECT(vMask, BITCAST(value_to_store, mFP32Ty), originalVal); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][PATCH] store mask: ", bld->exec_mask.exec_mask); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] loaded original value: ", wrap(originalVal)); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] vMask: ", wrap(vMask)); + lp_build_print_value(gallivm, "[TCS OUT][PATCH] selected value to store: ", wrap(value_to_store)); + } + } + STOREV(value_to_store, attr, {C(0), unwrap(swizzle_index)}); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][PATCH] stored value: ", wrap(value_to_store)); + } + } else { + Value* value_to_store = VEXTRACT(unwrap(value), C(lane)); + Value* attrib = LOAD(GEP(iface->pVtxOutputAttribMap, {C(0), attr_index})); + + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][VTX] invocation_id: ", bld->system_values.invocation_id); + lp_build_print_value(gallivm, "[TCS OUT][VTX] attribIndex: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TCS OUT][VTX] attrib read from map: ", wrap(attrib)); + lp_build_print_value(gallivm, "[TCS OUT][VTX] chan_index: ", swizzle_index); + lp_build_print_value(gallivm, "[TCS OUT][VTX] value: ", value); + lp_build_print_value(gallivm, "[TCS OUT][VTX] value_to_store: ", wrap(value_to_store)); + } + + Value* attr_chan = GEP(pCpOut, {C(0), C(ScalarPatch_cp), + VEXTRACT(unwrap(bld->system_values.invocation_id), C(0)), + C(ScalarCPoint_attrib), attrib, unwrap(swizzle_index)}); + + // Mask output values if needed + struct lp_exec_mask *mask = &bld->exec_mask; + if (mask->has_mask) { + Value *originalVal = LOAD(attr_chan); + Value *vMask = TRUNC(VEXTRACT(unwrap(mask->exec_mask), C(lane)), mInt1Ty); + // convert input to float before trying to store + value_to_store = SELECT(vMask, BITCAST(value_to_store, mFP32Ty), originalVal); + } + STORE(value_to_store, attr_chan); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TCS OUT][VTX] stored: ", wrap(value_to_store)); + } + } + } +} + + + +void +BuilderSWR::swr_tcs_llvm_emit_barrier(const struct lp_build_tcs_iface *tcs_iface, + struct lp_build_tgsi_context *bld_base) +{ + swr_tcs_llvm_iface *iface = (swr_tcs_llvm_iface*)tcs_iface; + struct lp_build_tgsi_soa_context* bld = (struct lp_build_tgsi_soa_context*)bld_base; + + if (verbose_shader) { + lp_build_printf(gallivm, "Barrier LOOP: Iteration %d END\n", iface->loop_state.counter); + } + + // End previous loop + lp_build_for_loop_end(&iface->loop_state); + + // Start new one + lp_build_for_loop_begin(&iface->loop_state, gallivm, + lp_build_const_int32(gallivm, 0), + LLVMIntULT, + lp_build_const_int32(gallivm, iface->output_vertices), + lp_build_const_int32(gallivm, 1)); + + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + bld->system_values.invocation_id = wrap(VBROADCAST(unwrap(iface->loop_state.counter))); + + if (verbose_shader) { + lp_build_printf(gallivm, "Barrier LOOP: Iteration %d BEGIN\n", iface->loop_state.counter); + lp_build_print_value(gallivm, "LOOP: InvocationId: \n", bld->system_values.invocation_id); + } +} + + +LLVMValueRef +BuilderSWR::swr_tes_llvm_fetch_patch_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; + Value *attr_index = unwrap(attrib_index); + Value *res = unwrap(bld_base->base.zero); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_printf(gallivm, "[TES IN][PATCH] --------------------------------------\n"); + } + + if (is_aindex_indirect) { + int i; + struct lp_type type = bld_base->base.type; + + for (i = 0; i < type.length; i++) { + Value *attr_chan_index = attr_index; + + if (is_aindex_indirect) { + attr_chan_index = VEXTRACT(attr_index, C(i)); + } + + Value *attrib = + LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_chan_index})); + + Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); + Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData}); + Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib}); + Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)}); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index); + lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_chan_index)); + lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib)); + lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index); + lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val)); + } + res = VINSERT(res, Val, C(i)); + } + } else { + Value *attrib = LOAD(GEP(iface->pPatchAttribMap, {C(0), attr_index})); + + Value *pCpIn = LOAD(iface->pTesCtx, {(uint32_t)0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); + Value *pPatchData = GEP(pCpIn, {(uint32_t)0, ScalarPatch_patchData}); + Value *pAttr = GEP(pPatchData, {(uint32_t)0, ScalarCPoint_attrib}); + Value *Val = LOADV(pAttr, {C(0), attrib, unwrap(swizzle_index)}); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][PATCH] attrib_index: ", attrib_index); + lp_build_print_value(gallivm, "[TES IN][PATCH] attr_chan_index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TES IN][PATCH] attrib read from map: ", wrap(attrib)); + lp_build_print_value(gallivm, "[TES IN][PATCH] swizzle_index: ", swizzle_index); + lp_build_print_value(gallivm, "[TES IN][PATCH] Loaded: ", wrap(Val)); + } + res = VBROADCAST(Val); + } + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][PATCH] returning: ", wrap(res)); + } + return wrap(res); +} + + + +LLVMValueRef +BuilderSWR::swr_tes_llvm_fetch_vtx_input(const struct lp_build_tes_iface *tes_iface, + struct lp_build_tgsi_context * bld_base, + boolean is_vindex_indirect, + LLVMValueRef vertex_index, + boolean is_aindex_indirect, + LLVMValueRef attrib_index, + LLVMValueRef swizzle_index) +{ + swr_tes_llvm_iface *iface = (swr_tes_llvm_iface*)tes_iface; + Value *vert_index = unwrap(vertex_index); + Value *attr_index = unwrap(attrib_index); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (verbose_shader) { + lp_build_printf(gallivm, "[TES IN][VTX] --------------------------------------\n"); + } + + Value *res = unwrap(bld_base->base.zero); + if (is_vindex_indirect || is_aindex_indirect) { + int i; + struct lp_type type = bld_base->base.type; + + for (i = 0; i < type.length; i++) { + Value *vert_chan_index = vert_index; + Value *attr_chan_index = attr_index; + + if (is_vindex_indirect) { + vert_chan_index = VEXTRACT(vert_index, C(i)); + } + if (is_aindex_indirect) { + attr_chan_index = VEXTRACT(attr_index, C(i)); + } + + Value *attrib = + LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_chan_index})); + + Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); + Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp}); + Value *pVertex = GEP(pCp, {(Value*)C(0), vert_chan_index}); + Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)}); + Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib}); + Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)}); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index); + lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib)); + lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index); + lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val)); + } + res = VINSERT(res, Val, C(i)); + } + } else { + Value *attrib = LOAD(GEP(iface->pVtxAttribMap, {C(0), attr_index})); + + Value *pCpIn = LOAD(iface->pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}, "pCpIn"); + Value *pCp = GEP(pCpIn, {0, ScalarPatch_cp}); + Value *pVertex = GEP(pCp, {(Value*)C(0), vert_index}); + Value *pAttrTab = GEP(pVertex, {uint32_t(0), uint32_t(0)}); + Value *pAttr = GEP(pAttrTab, {(Value*)C(0), attrib}); + Value *Val = LOADV(pAttr, {C(0), unwrap(swizzle_index)}); + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][VTX] attrib_index: ", attrib_index); + lp_build_print_value(gallivm, "[TES IN][VTX] attr_chan_index: ", wrap(attr_index)); + lp_build_print_value(gallivm, "[TES IN][VTX] attrib read from map: ", wrap(attrib)); + lp_build_print_value(gallivm, "[TES IN][VTX] swizzle_index: ", swizzle_index); + lp_build_print_value(gallivm, "[TES IN][VTX] Loaded: ", wrap(Val)); + } + res = VBROADCAST(Val); + } + if (verbose_shader) { + lp_build_print_value(gallivm, "[TES IN][VTX] returning: ", wrap(res)); + } + return wrap(res); +} + + + + +PFN_GS_FUNC +BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) +{ + SWR_GS_STATE *pGS = &ctx->gs->gsState; + struct tgsi_shader_info *info = &ctx->gs->info.base; + + memset(pGS, 0, sizeof(*pGS)); + + pGS->gsEnable = true; + + pGS->numInputAttribs = (VERTEX_ATTRIB_START_SLOT - VERTEX_POSITION_SLOT) + info->num_inputs; + pGS->outputTopology = + swr_convert_prim_topology(info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM], 0); + + /* It's +1 because emit_vertex in swr is always called exactly one time more + * than max_vertices passed in Geometry Shader. We need to allocate more memory + * to avoid crash/memory overwritten. + */ + pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] + 1; + pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS]; + + // If point primitive then assume to use multiple streams + if(pGS->outputTopology == TOP_POINT_LIST) { + pGS->isSingleStream = false; + } else { + pGS->isSingleStream = true; + pGS->singleStreamID = 0; + } + + pGS->vertexAttribOffset = VERTEX_POSITION_SLOT; + pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; + pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; + pGS->controlDataSize = 8; // GS ouputs max of 8 32B units + pGS->controlDataOffset = VERTEX_COUNT_SIZE; + pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; + + pGS->allocationSize = + VERTEX_COUNT_SIZE + // vertex count + CONTROL_HEADER_SIZE + // control header + (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex + pGS->maxNumVerts; // num verts + + struct swr_geometry_shader *gs = ctx->gs; + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(outputs, 0, sizeof(outputs)); + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + + std::vector gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), + PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)}; + FunctionType *vsFuncType = + FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false); + + // create new vertex shader function + auto pFunction = Function::Create(vsFuncType, + GlobalValue::ExternalLinkage, + "GS", + JM()->mpCurrentModule); +#if LLVM_VERSION_MAJOR < 5 + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); +#else + pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); +#endif + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto argitr = pFunction->arg_begin(); + Value *hPrivateData = &*argitr++; + hPrivateData->setName("hPrivateData"); + Value *pWorkerData = &*argitr++; + pWorkerData->setName("pWorkerData"); + Value *pGsCtx = &*argitr++; + pGsCtx->setName("gsCtx"); + + Value *consts_ptr = + GEP(hPrivateData, {C(0), C(swr_draw_context_constantGS)}); + consts_ptr->setName("gs_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsGS}); + const_sizes_ptr->setName("num_gs_constants"); + + struct lp_build_sampler_soa *sampler = + swr_sampler_soa_create(key.sampler, PIPE_SHADER_GEOMETRY); + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + system_values.prim_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_PrimitiveID})); + system_values.invocation_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_InstanceID})); + + std::vector mapConstants; + Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + for (unsigned slot = 0; slot < info->num_inputs; slot++) { + ubyte semantic_name = info->input_semantic_name[slot]; + ubyte semantic_idx = info->input_semantic_index[slot]; + + unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); + + vs_slot += VERTEX_ATTRIB_START_SLOT; + + if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) + vs_slot--; + + if (semantic_name == TGSI_SEMANTIC_POSITION) + vs_slot = VERTEX_POSITION_SLOT; + + STORE(C(vs_slot), vtxAttribMap, {0, slot}); + mapConstants.push_back(C(vs_slot)); + } + + struct lp_build_mask_context mask; + Value *mask_val = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_mask}, "gsMask"); + lp_build_mask_begin(&mask, gallivm, + lp_type_float_vec(32, 32 * 8), wrap(mask_val)); + + // zero out cut buffer so we can load/modify/store bits + for (uint32_t lane = 0; lane < mVWidth; ++lane) + { + Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); +#if LLVM_VERSION_MAJOR >= 10 + MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, MaybeAlign(sizeof(float) * KNOB_SIMD_WIDTH)); +#else + MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH); +#endif + } + + struct swr_gs_llvm_iface gs_iface; + gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input; + gs_iface.base.emit_vertex = ::swr_gs_llvm_emit_vertex; + gs_iface.base.end_primitive = ::swr_gs_llvm_end_primitive; + gs_iface.base.gs_epilogue = ::swr_gs_llvm_epilogue; + gs_iface.pBuilder = this; + gs_iface.pGsCtx = pGsCtx; + gs_iface.pGsState = pGS; + gs_iface.num_outputs = gs->info.base.num_outputs; + gs_iface.num_verts_per_prim = + u_vertices_per_prim((pipe_prim_type)info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]); + gs_iface.info = info; + gs_iface.pVtxAttribMap = vtxAttribMap; + + struct lp_build_tgsi_params params; + memset(¶ms, 0, sizeof(params)); + params.type = lp_type_float_vec(32, 32 * 8); + params.mask = & mask; + params.consts_ptr = wrap(consts_ptr); + params.const_sizes_ptr = wrap(const_sizes_ptr); + params.system_values = &system_values; + params.inputs = inputs; + params.context_ptr = wrap(hPrivateData); + params.sampler = sampler; + params.info = &gs->info.base; + params.gs_iface = &gs_iface.base; + + lp_build_tgsi_soa(gallivm, + gs->pipe.tokens, + ¶ms, + outputs); + + lp_build_mask_end(&mask); + + sampler->destroy(sampler); + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + RET_VOID(); + + gallivm_verify_function(gallivm, wrap(pFunction)); + gallivm_compile_module(gallivm); + + PFN_GS_FUNC pFunc = + (PFN_GS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); + + debug_printf("geom shader %p\n", pFunc); + assert(pFunc && "Error: GeomShader = NULL"); + + JM()->mIsModuleFinalized = true; + + return pFunc; +} + +PFN_TES_FUNC +BuilderSWR::CompileTES(struct swr_context *ctx, swr_jit_tes_key &key) +{ + SWR_TS_STATE *pTS = &ctx->tsState; + struct tgsi_shader_info *info = &ctx->tes->info.base; + + // tessellation is enabled if TES is present + // clear tessellation state here then + memset(pTS, 0, sizeof(*pTS)); + + pTS->tsEnable = true; + + unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; + unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; + bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; + bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; + SWR_TS_DOMAIN type; + SWR_TS_PARTITIONING partitioning; + SWR_TS_OUTPUT_TOPOLOGY topology; + PRIMITIVE_TOPOLOGY postDSTopology; + + // TESS_TODO: move this to helper functions to improve readability + switch (tes_prim_mode) { + case PIPE_PRIM_LINES: + type = SWR_TS_ISOLINE; + postDSTopology = TOP_LINE_LIST; + break; + case PIPE_PRIM_TRIANGLES: + type = SWR_TS_TRI; + postDSTopology = TOP_TRIANGLE_LIST; + break; + case PIPE_PRIM_QUADS: + type = SWR_TS_QUAD; + // See OpenGL spec - quads are tessellated into triangles + postDSTopology = TOP_TRIANGLE_LIST; + break; + default: + assert(0); + } + + switch (tes_spacing) { + case PIPE_TESS_SPACING_FRACTIONAL_ODD: + partitioning = SWR_TS_ODD_FRACTIONAL; + break; + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: + partitioning = SWR_TS_EVEN_FRACTIONAL; + break; + case PIPE_TESS_SPACING_EQUAL: + partitioning = SWR_TS_INTEGER; + break; + default: + assert(0); + } + + if (tes_point_mode) { + topology = SWR_TS_OUTPUT_POINT; + postDSTopology = TOP_POINT_LIST; + } + else if (tes_prim_mode == PIPE_PRIM_LINES) { + topology = SWR_TS_OUTPUT_LINE; + } + else if (tes_vertex_order_cw) { + topology = SWR_TS_OUTPUT_TRI_CW; + } + else { + topology = SWR_TS_OUTPUT_TRI_CCW; + } + + pTS->domain = type; + pTS->tsOutputTopology = topology; + pTS->partitioning = partitioning; + pTS->numDsOutputAttribs = info->num_outputs; + pTS->postDSTopology = postDSTopology; + + pTS->dsAllocationSize = SWR_VTX_NUM_SLOTS * MAX_NUM_VERTS_PER_PRIM; + pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; + pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; + pTS->dsOutVtxAttribOffset = VERTEX_ATTRIB_START_SLOT; + + struct swr_tess_evaluation_shader *tes = ctx->tes; + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(outputs, 0, sizeof(outputs)); + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + + std::vector tesArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), + PointerType::get(Gen_SWR_DS_CONTEXT(JM()), 0)}; + FunctionType *tesFuncType = + FunctionType::get(Type::getVoidTy(JM()->mContext), tesArgs, false); + + // create new vertex shader function + auto pFunction = Function::Create(tesFuncType, + GlobalValue::ExternalLinkage, + "TES", + JM()->mpCurrentModule); + +#if LLVM_VERSION_MAJOR < 5 + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); +#else + pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder); +#endif + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto argitr = pFunction->arg_begin(); + Value *hPrivateData = &*argitr++; + hPrivateData->setName("hPrivateData"); + Value *pWorkerData = &*argitr++; + pWorkerData->setName("pWorkerData"); + Value *pTesCtx = &*argitr++; + pTesCtx->setName("tesCtx"); + + Value *consts_ptr = + GEP(hPrivateData, {C(0), C(swr_draw_context_constantTES)}); + consts_ptr->setName("tes_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsTES}); + const_sizes_ptr->setName("num_tes_constants"); + + struct lp_build_sampler_soa *sampler = + swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_EVAL); + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + + // Load and calculate system values + // Tessellation coordinates (gl_TessCoord) + Value *vecOffset = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset}, "vecOffset"); + Value *vecStride = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorStride}, "vecStride"); + Value *vecIndex = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_vectorOffset}); + + Value* tess_coord = ALLOCA(ArrayType::get(mSimdFP32Ty, 3)); + + Value *tessCoordU = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainU}), {vecIndex}, "tessCoordU"); + STORE(tessCoordU, tess_coord, {0, 0}); + Value *tessCoordV = LOADV(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pDomainV}), {vecIndex}, "tessCoordV"); + STORE(tessCoordV, tess_coord, {0, 1}); + Value *tessCoordW = FSUB(FSUB(VIMMED1(1.0f), tessCoordU), tessCoordV, "tessCoordW"); + STORE(tessCoordW, tess_coord, {0, 2}); + system_values.tess_coord = wrap(tess_coord); + + // Primitive ID + system_values.prim_id = wrap(VBROADCAST(LOAD(pTesCtx, {0, SWR_DS_CONTEXT_PrimitiveID}), "PrimitiveID")); + + // Tessellation factors + Value* pPatch = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pCpIn}); + Value* pTessFactors = GEP(pPatch, {C(0), C(ScalarPatch_tessFactors)}); + + assert(SWR_NUM_OUTER_TESS_FACTORS == 4); + Value* sys_value_outer_factors = UndefValue::get(VectorType::get(mFP32Ty, 4)); + for (unsigned i = 0; i < SWR_NUM_OUTER_TESS_FACTORS; i++) { + Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_OuterTessFactors, i}); + sys_value_outer_factors = VINSERT(sys_value_outer_factors, v, i, "gl_TessLevelOuter"); + } + system_values.tess_outer = wrap(sys_value_outer_factors); + + assert(SWR_NUM_INNER_TESS_FACTORS == 2); + Value* sys_value_inner_factors = UndefValue::get(VectorType::get(mFP32Ty, 4)); + for (unsigned i = 0; i < SWR_NUM_INNER_TESS_FACTORS; i++) { + Value* v = LOAD(pTessFactors, {0, SWR_TESSELLATION_FACTORS_InnerTessFactors, i}); + sys_value_inner_factors = VINSERT(sys_value_inner_factors, v, i, "gl_TessLevelInner"); + } + system_values.tess_inner = wrap(sys_value_inner_factors); + + if (verbose_shader) + { + lp_build_print_value(gallivm, "tess_coord = ", system_values.tess_coord); + } - for (uint32_t lane = 0; lane < mVWidth; ++lane) { - Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane)); - Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - Value *pStreamOffset = GEP(pStream, pLaneOffset); - pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy); + struct tgsi_shader_info *pPrevShader = nullptr; - Value *pLaneMask = VEXTRACT(vMask1, C(lane)); - pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + if (ctx->tcs) { + pPrevShader = &ctx->tcs->info.base; + } + else { + pPrevShader = &ctx->vs->info.base; + } - for (uint32_t channel = 0; channel < 4; ++channel) { - Value *vData; + // Figure out how many per-patch attributes we have + unsigned perPatchAttrs = 0; + unsigned genericAttrs = 0; + unsigned tessLevelAttrs = 0; + unsigned sgvAttrs = 0; + for (unsigned slot = 0; slot < pPrevShader->num_outputs; slot++) { + switch (pPrevShader->output_semantic_name[slot]) { + case TGSI_SEMANTIC_PATCH: + perPatchAttrs++; + break; + case TGSI_SEMANTIC_GENERIC: + genericAttrs++; + break; + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + tessLevelAttrs++; + break; + case TGSI_SEMANTIC_POSITION: + case TGSI_SEMANTIC_CLIPDIST: + case TGSI_SEMANTIC_PSIZE: + sgvAttrs++; + break; + default: + assert(!"Unknown semantic input in TES"); + } + } - if (attribSlot == VERTEX_SGV_SLOT) - vData = LOAD(unwrap(outputs[attrib][0])); - else - vData = LOAD(unwrap(outputs[attrib][channel])); + std::vector mapConstants; + Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + Value *patchAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + for (unsigned slot = 0; slot < info->num_inputs; slot++) { + ubyte semantic_name = info->input_semantic_name[slot]; + ubyte semantic_idx = info->input_semantic_index[slot]; - if (attribSlot != VERTEX_SGV_SLOT || - sgvChannel == channel) { - vData = VEXTRACT(vData, C(lane)); - STORE(vData, pStreamOffset); - } - pStreamOffset = GEP(pStreamOffset, C(1)); - } - } - } + // Where in TCS output is my attribute? + // TESS_TODO: revisit after implement pass-through TCS + unsigned tcs_slot = locate_linkage(semantic_name, semantic_idx, pPrevShader); + + // Skip tessellation levels - these go to the tessellator, not TES + switch (semantic_name) { + case TGSI_SEMANTIC_GENERIC: + tcs_slot = tcs_slot + VERTEX_ATTRIB_START_SLOT - sgvAttrs - tessLevelAttrs; + break; + case TGSI_SEMANTIC_PATCH: + tcs_slot = semantic_idx; + break; + case TGSI_SEMANTIC_POSITION: + tcs_slot = VERTEX_POSITION_SLOT; + break; + case TGSI_SEMANTIC_CLIPDIST: + case TGSI_SEMANTIC_PSIZE: + break; + default: + assert(!"Unexpected semantic found while builiding TES input map"); + } + if (semantic_name == TGSI_SEMANTIC_PATCH) { + STORE(C(tcs_slot), patchAttribMap, {0, slot}); + } else { + STORE(C(tcs_slot), vtxAttribMap, {0, slot}); + } + mapConstants.push_back(C(tcs_slot)); + } - STACKRESTORE(pStack); -} + // Build execution mask + struct lp_build_mask_context mask; + Value *mask_val = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_mask}, "tesMask"); -void -BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, - LLVMValueRef verts_per_prim_vec, - LLVMValueRef emitted_prims_vec) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; + if (verbose_shader) + lp_build_print_value(gallivm, "TES execution mask: ", wrap(mask_val)); - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + lp_build_mask_begin(&mask, gallivm, + lp_type_float_vec(32, 32 * 8), wrap(mask_val)); - Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); - Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); + struct swr_tes_llvm_iface tes_iface; - uint32_t vertsPerPrim = iface->num_verts_per_prim; + tes_iface.base.fetch_vertex_input = ::swr_tes_llvm_fetch_vtx_input; + tes_iface.base.fetch_patch_input = ::swr_tes_llvm_fetch_patch_input; - Value *vCount = - ADD(MUL(unwrap(emitted_prims_vec), VIMMED1(vertsPerPrim)), - unwrap(verts_per_prim_vec)); + tes_iface.pBuilder = this; + tes_iface.pTesCtx = pTesCtx; + tes_iface.pTsState = pTS; + tes_iface.num_outputs = tes->info.base.num_outputs; + tes_iface.info = info; + tes_iface.pVtxAttribMap = vtxAttribMap; + tes_iface.pPatchAttribMap = patchAttribMap; - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - vCount = LOAD(unwrap(bld->total_emitted_vertices_vec_ptr)); + struct lp_build_tgsi_params params; + memset(¶ms, 0, sizeof(params)); + params.type = lp_type_float_vec(32, 32 * 8); + params.mask = & mask; + params.consts_ptr = wrap(consts_ptr); + params.const_sizes_ptr = wrap(const_sizes_ptr); + params.system_values = &system_values; + params.inputs = inputs; + params.context_ptr = wrap(hPrivateData); + params.sampler = sampler; + params.info = &tes->info.base; + params.tes_iface = &tes_iface.base; - struct lp_exec_mask *exec_mask = &bld->exec_mask; - Value *mask = unwrap(lp_build_mask_value(bld->mask)); - if (exec_mask->has_mask) - mask = AND(mask, unwrap(exec_mask->exec_mask)); + // Build LLVM IR + lp_build_tgsi_soa(gallivm, + tes->pipe.tokens, + ¶ms, + outputs); - Value *cmpMask = VMASK(ICMP_NE(unwrap(verts_per_prim_vec), VIMMED1(0))); - mask = AND(mask, cmpMask); - vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8)); + lp_build_mask_end(&mask); - vCount = SUB(vCount, VIMMED1(1)); - Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE)); - Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8))); + sampler->destroy(sampler); - vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8)); + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - Value *pStack = STACKSAVE(); - Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking + // Write output attributes + Value *dclOut = LOAD(pTesCtx, {0, SWR_DS_CONTEXT_pOutputData}, "dclOut"); - for (uint32_t lane = 0; lane < mVWidth; ++lane) { - Value *vLaneOffset = VEXTRACT(vOffset, C(lane)); - Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - Value *pStreamOffset = GEP(pStream, vLaneOffset); + for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (!outputs[attrib][channel]) + continue; - Value *pLaneMask = VEXTRACT(vMask1, C(lane)); - pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); + Value *val = LOAD(unwrap(outputs[attrib][channel]));; + Value *attribOffset = + LOAD(pTesCtx, {0, SWR_DS_CONTEXT_outVertexAttribOffset}); + + // Assume we write possition + Value* outputSlot = C(VERTEX_POSITION_SLOT); + if (tes->info.base.output_semantic_name[attrib] != TGSI_SEMANTIC_POSITION) { + // No, it's a generic attribute, not a position - let's calculate output slot + uint32_t outSlot = attrib; + if (tes->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) { + // this shader will write position, so in shader's term + // output starts at attrib 1, but we will handle that separately, + // so let's fix the outSlot + outSlot--; + } + outputSlot = ADD(attribOffset, C(outSlot)); + } - Value *vVal = LOAD(pStreamOffset); - vVal = OR(vVal, VEXTRACT(vValue, C(lane))); - STORE(vVal, pStreamOffset); - } + Value *attribVecIndex = + ADD(MUL(vecStride, MUL(outputSlot, C(4))), vecOffset); - STACKRESTORE(pStack); -} + uint32_t outputComponent = 0; + uint32_t curComp = outputComponent + channel; + auto outValIndex = ADD(attribVecIndex, MUL(vecStride, C(curComp))); + STOREV(val, dclOut, {outValIndex}); + + if (verbose_shader) { + lp_build_printf(gallivm, + "TES output [%d][%d]", + C(attrib), + C(channel)); + lp_build_print_value(gallivm, " = ", wrap(val)); + } + } + } -void -BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base, - struct lp_build_tgsi_context * bld_base, - LLVMValueRef total_emitted_vertices_vec, - LLVMValueRef emitted_prims_vec) -{ - swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; + RET_VOID(); - IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + JM()->DumpToFile(pFunction, "src"); + gallivm_verify_function(gallivm, wrap(pFunction)); - // Store emit count to each output stream in the first DWORD - for (uint32_t lane = 0; lane < mVWidth; ++lane) - { - Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - pStream = BITCAST(pStream, mInt32PtrTy); - Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane)); - STORE(pLaneCount, pStream); - } -} + gallivm_compile_module(gallivm); + JM()->DumpToFile(pFunction, "optimized"); -PFN_GS_FUNC -BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) -{ - SWR_GS_STATE *pGS = &ctx->gs->gsState; - struct tgsi_shader_info *info = &ctx->gs->info.base; + PFN_TES_FUNC pFunc = + (PFN_TES_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - memset(pGS, 0, sizeof(*pGS)); + debug_printf("tess evaluation shader %p\n", pFunc); + assert(pFunc && "Error: TessEvaluationShader = NULL"); - pGS->gsEnable = true; + JM()->DumpAsm(pFunction, "asm"); - pGS->numInputAttribs = info->num_inputs; - pGS->outputTopology = - swr_convert_prim_topology(info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]); - pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; - pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS]; + JM()->mIsModuleFinalized = true; - // XXX: single stream for now... - pGS->isSingleStream = true; - pGS->singleStreamID = 0; + return pFunc; +} - pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize - pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize - pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; - pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; - pGS->controlDataSize = 8; // GS ouputs max of 8 32B units - pGS->controlDataOffset = VERTEX_COUNT_SIZE; - pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; +PFN_TCS_FUNC +BuilderSWR::CompileTCS(struct swr_context *ctx, swr_jit_tcs_key &key) +{ + SWR_TS_STATE *pTS = &ctx->tsState; + struct tgsi_shader_info *info = &ctx->tcs->info.base; - pGS->allocationSize = - VERTEX_COUNT_SIZE + // vertex count - CONTROL_HEADER_SIZE + // control header - (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex - pGS->maxNumVerts; // num verts + pTS->numHsInputAttribs = info->num_inputs; + pTS->numHsOutputAttribs = info->num_outputs; - struct swr_geometry_shader *gs = ctx->gs; + pTS->hsAllocationSize = sizeof(ScalarPatch); + + pTS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; + pTS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; + + struct swr_tess_control_shader *tcs = ctx->tcs; LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; @@ -589,18 +1993,20 @@ AttrBuilder attrBuilder; attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); - std::vector gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), - PointerType::get(mInt8Ty, 0), - PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)}; - FunctionType *vsFuncType = - FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false); + std::vector tcsArgs{ + PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(mInt8Ty, 0), + PointerType::get(Gen_SWR_HS_CONTEXT(JM()), 0)}; + FunctionType *tcsFuncType = + FunctionType::get(Type::getVoidTy(JM()->mContext), tcsArgs, false); // create new vertex shader function - auto pFunction = Function::Create(vsFuncType, + auto pFunction = Function::Create(tcsFuncType, GlobalValue::ExternalLinkage, - "GS", + "TCS", JM()->mpCurrentModule); -#if HAVE_LLVM < 0x0500 + +#if LLVM_VERSION_MAJOR < 5 AttributeSet attrSet = AttributeSet::get( JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); @@ -617,35 +2023,57 @@ hPrivateData->setName("hPrivateData"); Value *pWorkerData = &*argitr++; pWorkerData->setName("pWorkerData"); - Value *pGsCtx = &*argitr++; - pGsCtx->setName("gsCtx"); + Value *pTcsCtx = &*argitr++; + pTcsCtx->setName("tcsCtx"); Value *consts_ptr = - GEP(hPrivateData, {C(0), C(swr_draw_context_constantGS)}); - consts_ptr->setName("gs_constants"); + GEP(hPrivateData, {C(0), C(swr_draw_context_constantTCS)}); + consts_ptr->setName("tcs_constants"); Value *const_sizes_ptr = - GEP(hPrivateData, {0, swr_draw_context_num_constantsGS}); - const_sizes_ptr->setName("num_gs_constants"); + GEP(hPrivateData, {0, swr_draw_context_num_constantsTCS}); + const_sizes_ptr->setName("num_tcs_constants"); struct lp_build_sampler_soa *sampler = - swr_sampler_soa_create(key.sampler, PIPE_SHADER_GEOMETRY); + swr_sampler_soa_create(key.sampler, PIPE_SHADER_TESS_CTRL); struct lp_bld_tgsi_system_values system_values; memset(&system_values, 0, sizeof(system_values)); - system_values.prim_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_PrimitiveID})); - system_values.instance_id = wrap(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_InstanceID})); - std::vector mapConstants; - Value *vtxAttribMap = ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + system_values.prim_id = + wrap(LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_PrimitiveID})); + + Constant *vInvocationId; + if (mVWidth == 8) { + vInvocationId = C({0, 1, 2, 3, 4, 5, 6, 7}); + } else { + vInvocationId = + C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + } + + system_values.invocation_id = wrap(vInvocationId); + system_values.vertices_in = wrap(C(tcs->vertices_per_patch)); + + if (verbose_shader) { + lp_build_print_value(gallivm, "TCS::prim_id = ", system_values.prim_id); + lp_build_print_value(gallivm, "TCS::invocation_id = ", system_values.invocation_id); + lp_build_print_value(gallivm, "TCS::vertices_in = ", system_values.vertices_in); + } + + std::vector mapConstants; + Value *vtxAttribMap = + ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + for (unsigned slot = 0; slot < info->num_inputs; slot++) { ubyte semantic_name = info->input_semantic_name[slot]; ubyte semantic_idx = info->input_semantic_index[slot]; - unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); + unsigned vs_slot = + locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); vs_slot += VERTEX_ATTRIB_START_SLOT; - if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) + if (ctx->vs->info.base.output_semantic_name[0] + == TGSI_SEMANTIC_POSITION) vs_slot--; if (semantic_name == TGSI_SEMANTIC_POSITION) @@ -655,72 +2083,105 @@ mapConstants.push_back(C(vs_slot)); } - struct lp_build_mask_context mask; - Value *mask_val = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_mask}, "gsMask"); - lp_build_mask_begin(&mask, gallivm, - lp_type_float_vec(32, 32 * 8), wrap(mask_val)); - - // zero out cut buffer so we can load/modify/store bits - for (uint32_t lane = 0; lane < mVWidth; ++lane) - { - Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); - MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH); + // Prepare map of output attributes. Needed when shader instance wants + // to read own output or output of other instance, which is allowed in TCS + Value *vtxOutputAttribMap = + ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + // Map for per-patch attributes + Value *patchOutputAttribMap = + ALLOCA(ArrayType::get(mInt32Ty, PIPE_MAX_SHADER_INPUTS)); + for (unsigned slot = 0; slot < info->num_outputs; slot++) { + ubyte name = info->output_semantic_name[slot]; + int32_t idx = info->output_semantic_index[slot]; + if (name == TGSI_SEMANTIC_PATCH) { + STORE(C(idx), patchOutputAttribMap, {0, slot}); + } else { + int32_t target_slot = slot; + if (name == TGSI_SEMANTIC_GENERIC) { + target_slot += VERTEX_ATTRIB_START_SLOT; + } + // Now normalize target slot + for (ubyte as = 0; as < slot; as++) { + ubyte name = info->output_semantic_name[as]; + switch (name) { + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_PATCH: + case TGSI_SEMANTIC_POSITION: + target_slot--; + } + } + if (name == TGSI_SEMANTIC_POSITION) { + target_slot = VERTEX_POSITION_SLOT; + } + STORE(C(target_slot), vtxOutputAttribMap, {0, slot}); + mapConstants.push_back(C(target_slot)); + } } - struct swr_gs_llvm_iface gs_iface; - gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input; - gs_iface.base.emit_vertex = ::swr_gs_llvm_emit_vertex; - gs_iface.base.end_primitive = ::swr_gs_llvm_end_primitive; - gs_iface.base.gs_epilogue = ::swr_gs_llvm_epilogue; - gs_iface.pBuilder = this; - gs_iface.pGsCtx = pGsCtx; - gs_iface.pGsState = pGS; - gs_iface.num_outputs = gs->info.base.num_outputs; - gs_iface.num_verts_per_prim = - u_vertices_per_prim((pipe_prim_type)info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]); - gs_iface.info = info; - gs_iface.pVtxAttribMap = vtxAttribMap; + struct lp_build_mask_context mask; + Value *mask_val = LOAD(pTcsCtx, {0, SWR_HS_CONTEXT_mask}, "tcsMask"); + lp_build_mask_begin( + &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); + + struct swr_tcs_llvm_iface tcs_iface; + + tcs_iface.base.emit_store_output = ::swr_tcs_llvm_store_output; + tcs_iface.base.emit_fetch_input = ::swr_tcs_llvm_fetch_input; + tcs_iface.base.emit_fetch_output = ::swr_tcs_llvm_fetch_output; + tcs_iface.base.emit_barrier = ::swr_tcs_llvm_emit_barrier; + tcs_iface.base.emit_prologue = ::swr_tcs_llvm_emit_prologue; + tcs_iface.base.emit_epilogue = ::swr_tcs_llvm_emit_epilogue; + + tcs_iface.pBuilder = this; + tcs_iface.pTcsCtx = pTcsCtx; + tcs_iface.pTsState = pTS; + tcs_iface.output_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + tcs_iface.info = info; + tcs_iface.pVtxAttribMap = vtxAttribMap; + tcs_iface.pVtxOutputAttribMap = vtxOutputAttribMap; + tcs_iface.pPatchOutputAttribMap = patchOutputAttribMap; struct lp_build_tgsi_params params; memset(¶ms, 0, sizeof(params)); params.type = lp_type_float_vec(32, 32 * 8); - params.mask = & mask; + params.mask = &mask; params.consts_ptr = wrap(consts_ptr); params.const_sizes_ptr = wrap(const_sizes_ptr); params.system_values = &system_values; params.inputs = inputs; params.context_ptr = wrap(hPrivateData); params.sampler = sampler; - params.info = &gs->info.base; - params.gs_iface = &gs_iface.base; + params.info = &tcs->info.base; + params.tcs_iface = &tcs_iface.base; - lp_build_tgsi_soa(gallivm, - gs->pipe.tokens, - ¶ms, - outputs); + lp_build_tgsi_soa(gallivm, tcs->pipe.tokens, ¶ms, outputs); lp_build_mask_end(&mask); sampler->destroy(sampler); IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); - RET_VOID(); + JM()->DumpToFile(pFunction, "src"); gallivm_verify_function(gallivm, wrap(pFunction)); gallivm_compile_module(gallivm); + JM()->DumpToFile(pFunction, "optimized"); - PFN_GS_FUNC pFunc = - (PFN_GS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); + PFN_TCS_FUNC pFunc = + (PFN_TCS_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); - debug_printf("geom shader %p\n", pFunc); - assert(pFunc && "Error: GeomShader = NULL"); + debug_printf("tess control shader %p\n", pFunc); + assert(pFunc && "Error: TessControlShader = NULL"); + JM()->DumpAsm(pFunction, "asm"); JM()->mIsModuleFinalized = true; return pFunc; } + PFN_GS_FUNC swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key) { @@ -729,7 +2190,35 @@ "GS"); PFN_GS_FUNC func = builder.CompileGS(ctx, key); - ctx->gs->map.insert(std::make_pair(key, make_unique(builder.gallivm, func))); + ctx->gs->map.insert(std::make_pair(key, std::unique_ptr(new VariantGS(builder.gallivm, func)))); + return func; +} + +PFN_TCS_FUNC +swr_compile_tcs(struct swr_context *ctx, swr_jit_tcs_key &key) +{ + BuilderSWR builder( + reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr), + "TCS"); + PFN_TCS_FUNC func = builder.CompileTCS(ctx, key); + + ctx->tcs->map.insert( + std::make_pair(key, std::unique_ptr(new VariantTCS(builder.gallivm, func)))); + + return func; +} + +PFN_TES_FUNC +swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key) +{ + BuilderSWR builder( + reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr), + "TES"); + PFN_TES_FUNC func = builder.CompileTES(ctx, key); + + ctx->tes->map.insert( + std::make_pair(key, std::unique_ptr(new VariantTES(builder.gallivm, func)))); + return func; } @@ -747,6 +2236,10 @@ #else Value *pOut = GEP(pVtxOutput, {0, 0, slot}); STORE(pVal, pOut, {0, channel}); + if (verbose_shader) { + lp_build_printf(gallivm, "VS: Storing on slot %d, channel %d: ", C(slot), C(channel)); + lp_build_print_value(gallivm, "", wrap(pVal)); + } #endif } @@ -774,7 +2267,7 @@ GlobalValue::ExternalLinkage, "VS", JM()->mpCurrentModule); -#if HAVE_LLVM < 0x0500 +#if LLVM_VERSION_MAJOR < 5 AttributeSet attrSet = AttributeSet::get( JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); @@ -793,7 +2286,7 @@ pWorkerData->setName("pWorkerData"); Value *pVsCtx = &*argitr++; pVsCtx->setName("vsCtx"); - + Value *consts_ptr = GEP(hPrivateData, {C(0), C(swr_draw_context_constantVS)}); consts_ptr->setName("vs_constants"); @@ -909,12 +2402,23 @@ LLVMValueRef cz = LLVMBuildLoad(gallivm->builder, outputs[cv][2], ""); LLVMValueRef cw = LLVMBuildLoad(gallivm->builder, outputs[cv][3], ""); + tgsi_shader_info *pLastFE = &ctx->vs->info.base; + + if (ctx->gs) { + pLastFE = &ctx->gs->info.base; + } + else if (ctx->tes) { + pLastFE = &ctx->tes->info.base; + } + else if (ctx->tcs) { + pLastFE = &ctx->tcs->info.base; + } + for (unsigned val = 0; val < PIPE_MAX_CLIP_PLANES; val++) { // clip distance overrides user clip planes - if ((swr_vs->info.base.clipdist_writemask & clip_mask & (1 << val)) || - ((swr_vs->info.base.culldist_writemask << swr_vs->info.base.num_written_clipdistance) & (1 << val))) { - unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1, - &swr_vs->info.base); + if ((pLastFE->clipdist_writemask & clip_mask & (1 << val)) || + ((pLastFE->culldist_writemask << pLastFE->num_written_clipdistance) & (1 << val))) { + unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1, pLastFE); if (val < 4) { LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], ""); WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val); @@ -957,14 +2461,17 @@ RET_VOID(); + JM()->DumpToFile(pFunction, "vs_function1"); gallivm_verify_function(gallivm, wrap(pFunction)); gallivm_compile_module(gallivm); + JM()->DumpToFile(pFunction, "vs_function2"); // lp_debug_dump_value(func); PFN_VERTEX_FUNC pFunc = (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); + JM()->DumpAsm(pFunction, "vs_function_asm"); debug_printf("vert shader %p\n", pFunc); assert(pFunc && "Error: VertShader = NULL"); @@ -984,7 +2491,7 @@ "VS"); PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key); - ctx->vs->map.insert(std::make_pair(key, make_unique(builder.gallivm, func))); + ctx->vs->map.insert(std::make_pair(key, std::unique_ptr(new VariantVS(builder.gallivm, func)))); return func; } @@ -1036,6 +2543,8 @@ struct tgsi_shader_info *pPrevShader; if (ctx->gs) pPrevShader = &ctx->gs->info.base; + else if (ctx->tes) + pPrevShader = &ctx->tes->info.base; else pPrevShader = &ctx->vs->info.base; @@ -1060,7 +2569,7 @@ GlobalValue::ExternalLinkage, "FS", JM()->mpCurrentModule); -#if HAVE_LLVM < 0x0500 +#if LLVM_VERSION_MAJOR < 5 AttributeSet attrSet = AttributeSet::get( JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); @@ -1170,8 +2679,23 @@ inputs[attrib][3] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW")); continue; + } else if (semantic_name == TGSI_SEMANTIC_LAYER) { // gl_Layer + Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_renderTargetArrayIndex}); + ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vRenderTargetArrayIndex"); + inputs[attrib][0] = wrap(ff); + inputs[attrib][1] = wrap(VIMMED1(0.0f)); + inputs[attrib][2] = wrap(VIMMED1(0.0f)); + inputs[attrib][3] = wrap(VIMMED1(0.0f)); + continue; + } else if (semantic_name == TGSI_SEMANTIC_VIEWPORT_INDEX) { // gl_ViewportIndex + Value *ff = LOAD(pPS, {0, SWR_PS_CONTEXT_viewportIndex}); + ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vViewportIndex"); + inputs[attrib][0] = wrap(ff); + inputs[attrib][1] = wrap(VIMMED1(0.0f)); + inputs[attrib][2] = wrap(VIMMED1(0.0f)); + inputs[attrib][3] = wrap(VIMMED1(0.0f)); + continue; } - unsigned linkedAttrib = locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1; @@ -1436,6 +2960,6 @@ "FS"); PFN_PIXEL_KERNEL func = builder.CompileFS(ctx, key); - ctx->fs->map.insert(std::make_pair(key, make_unique(builder.gallivm, func))); + ctx->fs->map.insert(std::make_pair(key, std::unique_ptr(new VariantFS(builder.gallivm, func)))); return func; } diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_shader.h mesa-20.0.8/src/gallium/drivers/swr/swr_shader.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_shader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_shader.h 2020-06-12 01:21:17.000000000 +0000 @@ -26,9 +26,17 @@ struct swr_vertex_shader; struct swr_fragment_shader; struct swr_geometry_shader; +struct swr_tess_control_shader; +struct swr_tess_evaluation_shader; + struct swr_jit_fs_key; struct swr_jit_vs_key; struct swr_jit_gs_key; +struct swr_jit_tcs_key; +struct swr_jit_tes_key; + +using PFN_TCS_FUNC = PFN_HS_FUNC; +using PFN_TES_FUNC = PFN_DS_FUNC; unsigned swr_so_adjust_attrib(unsigned in_attrib, swr_vertex_shader *swr_vs); @@ -42,6 +50,12 @@ PFN_GS_FUNC swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key); +PFN_TCS_FUNC +swr_compile_tcs(struct swr_context *ctx, swr_jit_tcs_key &key); + +PFN_TES_FUNC +swr_compile_tes(struct swr_context *ctx, swr_jit_tes_key &key); + void swr_generate_fs_key(struct swr_jit_fs_key &key, struct swr_context *ctx, swr_fragment_shader *swr_fs); @@ -57,6 +71,14 @@ struct swr_context *ctx, swr_geometry_shader *swr_gs); +void swr_generate_tcs_key(struct swr_jit_tcs_key &key, + struct swr_context *ctx, + swr_tess_control_shader *swr_tcs); + +void swr_generate_tes_key(struct swr_jit_tes_key &key, + struct swr_context *ctx, + swr_tess_evaluation_shader *swr_tes); + struct swr_jit_sampler_key { unsigned nr_samplers; unsigned nr_sampler_views; @@ -85,6 +107,21 @@ ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; }; +// TESS_TODO: revisit this - we probably need to use +// primitive modes, number of vertices emitted, etc. +struct swr_jit_tcs_key : swr_jit_sampler_key { + ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; + unsigned clip_plane_mask; // from rasterizer state & tcs_info +}; + +// TESS_TODO: revisit this +struct swr_jit_tes_key : swr_jit_sampler_key { + ubyte prev_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte prev_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; + unsigned clip_plane_mask; // from rasterizer state & tes_info +}; + namespace std { template <> struct hash { @@ -114,9 +151,25 @@ return util_hash_crc32(&k, sizeof(k)); } }; + +template <> struct hash { + std::size_t operator()(const swr_jit_tcs_key &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; + +template <> struct hash { + std::size_t operator()(const swr_jit_tes_key &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; }; bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs); bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs); bool operator==(const swr_jit_fetch_key &lhs, const swr_jit_fetch_key &rhs); bool operator==(const swr_jit_gs_key &lhs, const swr_jit_gs_key &rhs); +bool operator==(const swr_jit_tcs_key &lhs, const swr_jit_tcs_key &rhs); +bool operator==(const swr_jit_tes_key &lhs, const swr_jit_tes_key &rhs); diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_state.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_state.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_state.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_state.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -21,13 +21,20 @@ * IN THE SOFTWARE. ***************************************************************************/ +#include + +#if LLVM_VERSION_MAJOR < 7 // llvm redefines DEBUG #pragma push_macro("DEBUG") #undef DEBUG +#endif #include #include "JitManager.h" + +#if LLVM_VERSION_MAJOR < 7 #pragma pop_macro("DEBUG") +#endif #include "common/os.h" #include "jit_api.h" @@ -36,7 +43,7 @@ #include "core/state_funcs.h" #include "gallivm/lp_bld_tgsi.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" @@ -432,7 +439,6 @@ return swr_gs; } - static void swr_bind_gs_state(struct pipe_context *pipe, void *gs) { @@ -456,6 +462,86 @@ swr_fence_work_delete_gs(screen->flush_fence, swr_gs); } +static void * +swr_create_tcs_state(struct pipe_context *pipe, + const struct pipe_shader_state *tcs) +{ + struct swr_tess_control_shader *swr_tcs = new swr_tess_control_shader; + if (!swr_tcs) + return NULL; + + swr_tcs->pipe.tokens = tgsi_dup_tokens(tcs->tokens); + lp_build_tgsi_info(tcs->tokens, &swr_tcs->info); + return swr_tcs; +} + +static void +swr_bind_tcs_state(struct pipe_context *pipe, void *tcs) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->tcs == tcs) + return; + + ctx->tcs = (swr_tess_control_shader *)tcs; + ctx->dirty |= SWR_NEW_TCS; + ctx->dirty |= SWR_NEW_TS; +} + +static void +swr_delete_tcs_state(struct pipe_context *pipe, void *tcs) +{ + struct swr_tess_control_shader *swr_tcs = (swr_tess_control_shader *)tcs; + FREE((void *)swr_tcs->pipe.tokens); + struct swr_screen *screen = swr_screen(pipe->screen); + + /* Defer deleton of tcs state */ + swr_fence_work_delete_tcs(screen->flush_fence, swr_tcs); +} + +static void * +swr_create_tes_state(struct pipe_context *pipe, + const struct pipe_shader_state *tes) +{ + struct swr_tess_evaluation_shader *swr_tes = new swr_tess_evaluation_shader; + if (!swr_tes) + return NULL; + + swr_tes->pipe.tokens = tgsi_dup_tokens(tes->tokens); + lp_build_tgsi_info(tes->tokens, &swr_tes->info); + return swr_tes; +} + +static void +swr_bind_tes_state(struct pipe_context *pipe, void *tes) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->tes == tes) + return; + + // Save current tessellator state first + if (ctx->tes != nullptr) { + ctx->tes->ts_state = ctx->tsState; + } + + ctx->tes = (swr_tess_evaluation_shader *)tes; + + ctx->dirty |= SWR_NEW_TES; + ctx->dirty |= SWR_NEW_TS; +} + +static void +swr_delete_tes_state(struct pipe_context *pipe, void *tes) +{ + struct swr_tess_evaluation_shader *swr_tes = (swr_tess_evaluation_shader *)tes; + FREE((void *)swr_tes->pipe.tokens); + struct swr_screen *screen = swr_screen(pipe->screen); + + /* Defer deleton of tes state */ + swr_fence_work_delete_tes(screen->flush_fence, swr_tes); +} + static void swr_set_constant_buffer(struct pipe_context *pipe, enum pipe_shader_type shader, @@ -477,8 +563,11 @@ ctx->dirty |= SWR_NEW_FSCONSTANTS; } else if (shader == PIPE_SHADER_GEOMETRY) { ctx->dirty |= SWR_NEW_GSCONSTANTS; + } else if (shader == PIPE_SHADER_TESS_CTRL) { + ctx->dirty |= SWR_NEW_TCSCONSTANTS; + } else if (shader == PIPE_SHADER_TESS_EVAL) { + ctx->dirty |= SWR_NEW_TESCONSTANTS; } - if (cb && cb->user_buffer) { pipe_resource_reference(&constants, NULL); } @@ -869,8 +958,18 @@ num_constants = pDC->num_constantsGS; scratch = &ctx->scratch->gs_constants; break; + case PIPE_SHADER_TESS_CTRL: + constant = pDC->constantTCS; + num_constants = pDC->num_constantsTCS; + scratch = &ctx->scratch->tcs_constants; + break; + case PIPE_SHADER_TESS_EVAL: + constant = pDC->constantTES; + num_constants = pDC->num_constantsTES; + scratch = &ctx->scratch->tes_constants; + break; default: - debug_printf("Unsupported shader type constants\n"); + assert(0 && "Unsupported shader type constants"); return; } @@ -1034,6 +1133,25 @@ sizeof(ctx->poly_stipple.pipe.stipple)); } + +static struct tgsi_shader_info * +swr_get_last_fe(const struct swr_context *ctx) +{ + tgsi_shader_info *pLastFE = &ctx->vs->info.base; + + if (ctx->gs) { + pLastFE = &ctx->gs->info.base; + } + else if (ctx->tes) { + pLastFE = &ctx->tes->info.base; + } + else if (ctx->tcs) { + pLastFE = &ctx->tcs->info.base; + } + return pLastFE; +} + + void swr_update_derived(struct pipe_context *pipe, const struct pipe_draw_info *p_draw_info) @@ -1121,6 +1239,8 @@ /* Raster state */ if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_VS | // clipping + SWR_NEW_TES | + SWR_NEW_TCS | SWR_NEW_FRAMEBUFFER)) { pipe_rasterizer_state *rasterizer = ctx->rasterizer; pipe_framebuffer_state *fb = &ctx->framebuffer; @@ -1224,6 +1344,14 @@ util_viewport_zmin_zmax(state, rasterizer->clip_halfz, &vp->minZ, &vp->maxZ); + if (rasterizer->depth_clip_near) { + vp->minZ = 0.0f; + } + + if (rasterizer->depth_clip_far) { + vp->maxZ = 1.0f; + } + vpm->m00[i] = state->scale[0]; vpm->m11[i] = state->scale[1]; vpm->m22[i] = state->scale[2]; @@ -1292,20 +1420,12 @@ partial_inbounds = 0; min_vertex_index = info.min_index + info.index_bias; - size = AlignUp(size, 4); - /* If size of client memory copy is too large, don't copy. The - * draw will access user-buffer directly and then block. This is - * faster than queuing many large client draws. */ - if (size >= screen->client_copy_limit) { - post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW; - p_data = (const uint8_t *) vb->buffer.user; - } else { - /* Copy only needed vertices to scratch space */ - const void *ptr = (const uint8_t *) vb->buffer.user + base; - ptr = (uint8_t *)swr_copy_to_scratch_space( - ctx, &ctx->scratch->vertex_buffer, ptr, size); - p_data = (const uint8_t *)ptr - base; - } + /* Use user memory directly. The draw will access user-buffer + * directly and then block. It's easier and usually + * faster than copying. + */ + post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW; + p_data = (const uint8_t *) vb->buffer.user; } else if (vb->buffer.resource) { /* VBO */ if (!pitch) { @@ -1365,20 +1485,13 @@ post_update_dirty_flags |= SWR_NEW_VERTEX; size = info.count * pitch; - size = AlignUp(size, 4); - /* If size of client memory copy is too large, don't copy. The - * draw will access user-buffer directly and then block. This is - * faster than queuing many large client draws. */ - if (size >= screen->client_copy_limit) { - post_update_dirty_flags |= SWR_LARGE_CLIENT_DRAW; - p_data = (const uint8_t *) info.index.user; - } else { - /* Copy indices to scratch space */ - const void *ptr = info.index.user; - ptr = swr_copy_to_scratch_space( - ctx, &ctx->scratch->index_buffer, ptr, size); - p_data = (const uint8_t *)ptr; - } + + /* Use user memory directly. The draw will access user-buffer + * directly and then block. It's easier and usually + * faster than copying. + */ + post_update_dirty_flags |= SWR_BLOCK_CLIENT_DRAW; + p_data = (const uint8_t *) info.index.user; } SWR_INDEX_BUFFER_STATE swrIndexBuffer; @@ -1399,6 +1512,8 @@ /* GeometryShader */ if (ctx->dirty & (SWR_NEW_GS | SWR_NEW_VS | + SWR_NEW_TCS | + SWR_NEW_TES | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW)) { if (ctx->gs) { @@ -1437,12 +1552,114 @@ } } - /* VertexShader */ - if (ctx->dirty & (SWR_NEW_VS | - SWR_NEW_RASTERIZER | // for clip planes + // We may need to restore tessellation state + // This restored state may be however overwritten + // during shader compilation + if (ctx->dirty & SWR_NEW_TS) { + if (ctx->tes != nullptr) { + ctx->tsState = ctx->tes->ts_state; + ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); + } else { + SWR_TS_STATE state = { 0 }; + ctx->api.pfnSwrSetTsState(ctx->swrContext, &state); + } + } + + // Tessellation Evaluation Shader + // Compile TES first, because TCS is optional + if (ctx->dirty & (SWR_NEW_GS | + SWR_NEW_VS | + SWR_NEW_TCS | + SWR_NEW_TES | SWR_NEW_SAMPLER | - SWR_NEW_SAMPLER_VIEW | - SWR_NEW_FRAMEBUFFER)) { + SWR_NEW_SAMPLER_VIEW)) { + if (ctx->tes) { + swr_jit_tes_key key; + swr_generate_tes_key(key, ctx, ctx->tes); + + auto search = ctx->tes->map.find(key); + PFN_TES_FUNC func; + if (search != ctx->tes->map.end()) { + func = search->second->shader; + } else { + func = swr_compile_tes(ctx, key); + } + + ctx->api.pfnSwrSetDsFunc(ctx->swrContext, func); + + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_update_sampler_state(ctx, + PIPE_SHADER_TESS_EVAL, + key.nr_samplers, + ctx->swrDC.samplersTES); + } + + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_update_texture_state(ctx, + PIPE_SHADER_TESS_EVAL, + key.nr_sampler_views, + ctx->swrDC.texturesTES); + } + + // Update tessellation state in case it's been updated + ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); + } else { + ctx->api.pfnSwrSetDsFunc(ctx->swrContext, NULL); + } + } + + /* Tessellation Control Shader */ + if (ctx->dirty & (SWR_NEW_GS | + SWR_NEW_VS | + SWR_NEW_TCS | + SWR_NEW_TES | + SWR_NEW_SAMPLER | + SWR_NEW_SAMPLER_VIEW)) { + if (ctx->tcs) { + ctx->tcs->vertices_per_patch = p_draw_info->vertices_per_patch; + + swr_jit_tcs_key key; + swr_generate_tcs_key(key, ctx, ctx->tcs); + + auto search = ctx->tcs->map.find(key); + PFN_TCS_FUNC func; + if (search != ctx->tcs->map.end()) { + func = search->second->shader; + } else { + func = swr_compile_tcs(ctx, key); + } + + ctx->api.pfnSwrSetHsFunc(ctx->swrContext, func); + + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_update_sampler_state(ctx, + PIPE_SHADER_TESS_CTRL, + key.nr_samplers, + ctx->swrDC.samplersTCS); + } + + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_update_texture_state(ctx, + PIPE_SHADER_TESS_CTRL, + key.nr_sampler_views, + ctx->swrDC.texturesTCS); + } + + // Update tessellation state in case it's been updated + ctx->api.pfnSwrSetTsState(ctx->swrContext, &ctx->tsState); + } else { + ctx->api.pfnSwrSetHsFunc(ctx->swrContext, NULL); + } + } + + /* VertexShader */ + if (ctx->dirty + & (SWR_NEW_VS | SWR_NEW_RASTERIZER | // for clip planes + SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { swr_jit_vs_key key; swr_generate_vs_key(key, ctx, ctx->vs); auto search = ctx->vs->map.find(key); @@ -1456,10 +1673,8 @@ /* JIT sampler state */ if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_update_sampler_state(ctx, - PIPE_SHADER_VERTEX, - key.nr_samplers, - ctx->swrDC.samplersVS); + swr_update_sampler_state( + ctx, PIPE_SHADER_VERTEX, key.nr_samplers, ctx->swrDC.samplersVS); } /* JIT sampler view state */ @@ -1488,6 +1703,8 @@ if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_VS | SWR_NEW_GS | + SWR_NEW_TES | + SWR_NEW_TCS | SWR_NEW_RASTERIZER | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW | @@ -1578,6 +1795,16 @@ swr_update_constants(ctx, PIPE_SHADER_GEOMETRY); } + /* Tessellation Control Shader Constants */ + if (ctx->dirty & SWR_NEW_TCSCONSTANTS) { + swr_update_constants(ctx, PIPE_SHADER_TESS_CTRL); + } + + /* Tessellation Evaluation Shader Constants */ + if (ctx->dirty & SWR_NEW_TESCONSTANTS) { + swr_update_constants(ctx, PIPE_SHADER_TESS_EVAL); + } + /* Depth/stencil state */ if (ctx->dirty & (SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_FRAMEBUFFER)) { struct pipe_depth_state *depth = &(ctx->depth_stencil->depth); @@ -1718,7 +1945,7 @@ compileState.alphaTestFormat = ALPHA_TEST_FLOAT32; // xxx compileState.Canonicalize(); - + PFN_BLEND_JIT_FUNC func = NULL; auto search = ctx->blendJIT->find(compileState); if (search != ctx->blendJIT->end()) { @@ -1741,33 +1968,34 @@ swr_update_poly_stipple(ctx); } - if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { + if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_TCS | SWR_NEW_TES | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { ctx->vs->soState.rasterizerDisable = ctx->rasterizer->rasterizer_discard; ctx->api.pfnSwrSetSoState(ctx->swrContext, &ctx->vs->soState); pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; - for (uint32_t i = 0; i < ctx->num_so_targets; i++) { + for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { SWR_STREAMOUT_BUFFER buffer = {0}; - if (!ctx->so_targets[i]) - continue; - buffer.enable = true; - buffer.pBuffer = - (gfxptr_t)(swr_resource_data(ctx->so_targets[i]->buffer) + - ctx->so_targets[i]->buffer_offset); - buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; - buffer.pitch = stream_output->stride[i]; - buffer.streamOffset = 0; + if (ctx->so_targets[i]) { + buffer.enable = true; + buffer.pBuffer = + (gfxptr_t)(swr_resource_data(ctx->so_targets[i]->buffer) + + ctx->so_targets[i]->buffer_offset); + buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; + buffer.pitch = stream_output->stride[i]; + buffer.streamOffset = 0; + } ctx->api.pfnSwrSetSoBuffers(ctx->swrContext, &buffer, i); } } + if (ctx->dirty & (SWR_NEW_CLIP | SWR_NEW_RASTERIZER | SWR_NEW_VS)) { // shader exporting clip distances overrides all user clip planes if (ctx->rasterizer->clip_plane_enable && - !ctx->vs->info.base.num_written_clipdistance) + !swr_get_last_fe(ctx)->num_written_clipdistance) { swr_draw_context *pDC = &ctx->swrDC; memcpy(pDC->userClipPlanes, @@ -1780,7 +2008,12 @@ SWR_BACKEND_STATE backendState = {0}; if (ctx->gs) { backendState.numAttributes = ctx->gs->info.base.num_outputs - 1; - } else { + } else + if (ctx->tes) { + backendState.numAttributes = ctx->tes->info.base.num_outputs - 1; + // no case for TCS, because if TCS is active, TES must be active + // as well - pipeline stages after tessellation does not support patches + } else { backendState.numAttributes = ctx->vs->info.base.num_outputs - 1; if (ctx->fs->info.base.uses_primid) { backendState.numAttributes++; @@ -1804,21 +2037,19 @@ (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0); backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask; - struct tgsi_shader_info *pLastFE = - ctx->gs ? - &ctx->gs->info.base : - &ctx->vs->info.base; + struct tgsi_shader_info *pLastFE = swr_get_last_fe(ctx); + backendState.readRenderTargetArrayIndex = pLastFE->writes_layer; backendState.readViewportArrayIndex = pLastFE->writes_viewport_index; backendState.vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize backendState.clipDistanceMask = - ctx->vs->info.base.num_written_clipdistance ? - ctx->vs->info.base.clipdist_writemask & ctx->rasterizer->clip_plane_enable : + pLastFE->num_written_clipdistance ? + pLastFE->clipdist_writemask & ctx->rasterizer->clip_plane_enable : ctx->rasterizer->clip_plane_enable; backendState.cullDistanceMask = - ctx->vs->info.base.culldist_writemask << ctx->vs->info.base.num_written_clipdistance; + pLastFE->culldist_writemask << pLastFE->num_written_clipdistance; // Assume old layout of SGV, POSITION, CLIPCULL, ATTRIB backendState.vertexClipCullOffset = backendState.vertexAttribOffset - 2; @@ -1887,6 +2118,7 @@ } swr->num_so_targets = num_targets; + swr->swrDC.soPrims = &swr->so_primCounter; swr->dirty |= SWR_NEW_SO; } @@ -1927,6 +2159,14 @@ pipe->bind_gs_state = swr_bind_gs_state; pipe->delete_gs_state = swr_delete_gs_state; + pipe->create_tcs_state = swr_create_tcs_state; + pipe->bind_tcs_state = swr_bind_tcs_state; + pipe->delete_tcs_state = swr_delete_tcs_state; + + pipe->create_tes_state = swr_create_tes_state; + pipe->bind_tes_state = swr_bind_tes_state; + pipe->delete_tes_state = swr_delete_tes_state; + pipe->set_constant_buffer = swr_set_constant_buffer; pipe->create_vertex_elements_state = swr_create_vertex_elements_state; diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_state.h mesa-20.0.8/src/gallium/drivers/swr/swr_state.h --- mesa-19.2.8/src/gallium/drivers/swr/swr_state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -46,9 +46,14 @@ ~ShaderVariant() { gallivm_destroy(gallivm); } }; +using PFN_TCS_FUNC = PFN_HS_FUNC; +using PFN_TES_FUNC = PFN_DS_FUNC; + typedef ShaderVariant VariantVS; typedef ShaderVariant VariantFS; typedef ShaderVariant VariantGS; +typedef ShaderVariant VariantTCS; +typedef ShaderVariant VariantTES; /* skeleton */ struct swr_vertex_shader { @@ -76,6 +81,23 @@ std::unordered_map> map; }; +struct swr_tess_control_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + uint32_t vertices_per_patch; + + std::unordered_map> map; +}; + +struct swr_tess_evaluation_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + SWR_TS_STATE ts_state; + + std::unordered_map> map; +}; + + /* Vertex element state */ struct swr_vertex_element_state { FETCH_COMPILE_STATE fsState; @@ -340,7 +362,7 @@ * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY */ static INLINE enum PRIMITIVE_TOPOLOGY -swr_convert_prim_topology(const unsigned mode) +swr_convert_prim_topology(const unsigned mode, const unsigned tcs_verts) { switch (mode) { case PIPE_PRIM_POINTS: @@ -371,6 +393,9 @@ return TOP_TRI_LIST_ADJ; case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: return TOP_TRI_STRIP_ADJ; + case PIPE_PRIM_PATCHES: + // rasterizer has a separate type for each possible number of patch vertices + return (PRIMITIVE_TOPOLOGY)((unsigned)TOP_PATCHLIST_BASE + tcs_verts); default: assert(0 && "Unknown topology"); return TOP_UNKNOWN; @@ -396,4 +421,5 @@ } } + #endif diff -Nru mesa-19.2.8/src/gallium/drivers/swr/swr_tex_sample.cpp mesa-20.0.8/src/gallium/drivers/swr/swr_tex_sample.cpp --- mesa-19.2.8/src/gallium/drivers/swr/swr_tex_sample.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/swr/swr_tex_sample.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -127,6 +127,12 @@ case PIPE_SHADER_GEOMETRY: indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesGS); break; + case PIPE_SHADER_TESS_CTRL: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTCS); + break; + case PIPE_SHADER_TESS_EVAL: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesTES); + break; default: assert(0 && "unsupported shader type"); break; @@ -224,6 +230,12 @@ case PIPE_SHADER_GEOMETRY: indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersGS); break; + case PIPE_SHADER_TESS_CTRL: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTCS); + break; + case PIPE_SHADER_TESS_EVAL: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersTES); + break; default: assert(0 && "unsupported shader type"); break; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/meson.build mesa-20.0.8/src/gallium/drivers/v3d/meson.build --- mesa-19.2.8/src/gallium/drivers/v3d/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -73,14 +73,14 @@ foreach ver : v3d_versions per_version_libs += static_library( 'v3d-v' + ver, - [files_per_version, v3d_xml_pack, nir_opcodes_h, nir_builder_opcodes_h], + [files_per_version, v3d_xml_pack], include_directories : [ inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, inc_gallium_drivers, ], c_args : [c_vis_args, v3d_args, '-DV3D_VERSION=' + ver], cpp_args : [cpp_vis_args], - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind], + dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers], ) endforeach diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_blit.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_blit.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "util/u_blitter.h" #include "v3d_context.h" @@ -128,7 +128,7 @@ struct pipe_surface *src_surf = v3d_get_blit_surface(pctx, info->src.resource, info->src.level); - v3d_flush_jobs_reading_resource(v3d, info->src.resource); + v3d_flush_jobs_reading_resource(v3d, info->src.resource, false); struct v3d_job *job = v3d_get_job(v3d, dst_surf, NULL); pipe_surface_reference(&job->color_read, src_surf); @@ -172,6 +172,7 @@ util_blitter_save_vertex_buffer_slot(v3d->blitter, v3d->vertexbuf.vb); util_blitter_save_vertex_elements(v3d->blitter, v3d->vtx); util_blitter_save_vertex_shader(v3d->blitter, v3d->prog.bind_vs); + util_blitter_save_geometry_shader(v3d->blitter, v3d->prog.bind_gs); util_blitter_save_so_targets(v3d->blitter, v3d->streamout.num_targets, v3d->streamout.targets); util_blitter_save_rasterizer(v3d->blitter, v3d->rasterizer); @@ -380,8 +381,8 @@ if (dst_base_slice->tiling == VC5_TILING_RASTER) return false; - v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT); - v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT); + v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); + v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); struct drm_v3d_submit_tfu tfu = { .ios = (height << 16) | width, @@ -538,5 +539,5 @@ * texture uploads before using the textures. */ v3d_flush_jobs_writing_resource(v3d, info.dst.resource, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, false); } diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_bufmgr.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_bufmgr.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_bufmgr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_bufmgr.c 2020-06-12 01:21:17.000000000 +0000 @@ -65,7 +65,7 @@ fprintf(stderr, " BOs cached: %d\n", cache_count); fprintf(stderr, " BOs cached size: %dkb\n", cache_size / 1024); - if (!list_empty(&cache->time_list)) { + if (!list_is_empty(&cache->time_list)) { struct v3d_bo *first = list_first_entry(&cache->time_list, struct v3d_bo, time_list); @@ -103,7 +103,7 @@ struct v3d_bo *bo = NULL; mtx_lock(&cache->lock); - if (!list_empty(&cache->size_list[page_index])) { + if (!list_is_empty(&cache->size_list[page_index])) { bo = list_first_entry(&cache->size_list[page_index], struct v3d_bo, size_list); @@ -170,7 +170,7 @@ bo->offset = create.offset; if (ret != 0) { - if (!list_empty(&screen->bo_cache.time_list) && + if (!list_is_empty(&screen->bo_cache.time_list) && !cleared_and_retried) { cleared_and_retried = true; v3d_bo_cache_free_all(&screen->bo_cache); @@ -300,7 +300,7 @@ */ for (int i = 0; i < cache->size_list_size; i++) { struct list_head *old_head = &cache->size_list[i]; - if (list_empty(old_head)) + if (list_is_empty(old_head)) list_inithead(&new_list[i]); else { new_list[i].next = old_head->next; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_cl.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_cl.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_cl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_cl.c 2020-06-12 01:21:17.000000000 +0000 @@ -63,7 +63,7 @@ if (cl_offset(cl) + space + cl_packet_length(BRANCH) <= cl->size) return; - struct v3d_bo *new_bo = v3d_bo_alloc(cl->job->v3d->screen, 4096, "CL"); + struct v3d_bo *new_bo = v3d_bo_alloc(cl->job->v3d->screen, space, "CL"); assert(space <= new_bo->size); /* Chain to the new BO from the old one. */ @@ -74,7 +74,7 @@ v3d_bo_unreference(&cl->bo); } else { /* Root the first RCL/BCL BO in the job. */ - v3d_job_add_bo(cl->job, cl->bo); + v3d_job_add_bo(cl->job, new_bo); } cl->bo = new_bo; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_context.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_context.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -123,7 +123,7 @@ * output targets. */ void -v3d_tf_update_counters(struct v3d_context *v3d) +v3d_update_primitive_counters(struct v3d_context *v3d) { struct v3d_job *job = v3d_get_job_for_fbo(v3d); if (job->draw_calls_queued == 0) diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_context.h mesa-20.0.8/src/gallium/drivers/v3d/v3d_context.h --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -54,37 +54,46 @@ #define using_v3d_simulator false #endif -#define VC5_DIRTY_BLEND (1 << 0) -#define VC5_DIRTY_RASTERIZER (1 << 1) -#define VC5_DIRTY_ZSA (1 << 2) -#define VC5_DIRTY_FRAGTEX (1 << 3) -#define VC5_DIRTY_VERTTEX (1 << 4) -#define VC5_DIRTY_SHADER_IMAGE (1 << 5) - -#define VC5_DIRTY_BLEND_COLOR (1 << 7) -#define VC5_DIRTY_STENCIL_REF (1 << 8) -#define VC5_DIRTY_SAMPLE_STATE (1 << 9) -#define VC5_DIRTY_FRAMEBUFFER (1 << 10) -#define VC5_DIRTY_STIPPLE (1 << 11) -#define VC5_DIRTY_VIEWPORT (1 << 12) -#define VC5_DIRTY_CONSTBUF (1 << 13) -#define VC5_DIRTY_VTXSTATE (1 << 14) -#define VC5_DIRTY_VTXBUF (1 << 15) -#define VC5_DIRTY_SCISSOR (1 << 17) -#define VC5_DIRTY_FLAT_SHADE_FLAGS (1 << 18) -#define VC5_DIRTY_PRIM_MODE (1 << 19) -#define VC5_DIRTY_CLIP (1 << 20) -#define VC5_DIRTY_UNCOMPILED_VS (1 << 21) -#define VC5_DIRTY_UNCOMPILED_FS (1 << 22) -#define VC5_DIRTY_COMPILED_CS (1 << 23) -#define VC5_DIRTY_COMPILED_VS (1 << 24) -#define VC5_DIRTY_COMPILED_FS (1 << 25) -#define VC5_DIRTY_FS_INPUTS (1 << 26) -#define VC5_DIRTY_STREAMOUT (1 << 27) -#define VC5_DIRTY_OQ (1 << 28) -#define VC5_DIRTY_CENTROID_FLAGS (1 << 29) -#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30) -#define VC5_DIRTY_SSBO (1 << 31) +#define VC5_DIRTY_BLEND (1ull << 0) +#define VC5_DIRTY_RASTERIZER (1ull << 1) +#define VC5_DIRTY_ZSA (1ull << 2) +#define VC5_DIRTY_COMPTEX (1ull << 3) +#define VC5_DIRTY_VERTTEX (1ull << 4) +#define VC5_DIRTY_GEOMTEX (1ull << 5) +#define VC5_DIRTY_FRAGTEX (1ull << 6) + +#define VC5_DIRTY_SHADER_IMAGE (1ull << 9) +#define VC5_DIRTY_BLEND_COLOR (1ull << 10) +#define VC5_DIRTY_STENCIL_REF (1ull << 11) +#define VC5_DIRTY_SAMPLE_STATE (1ull << 12) +#define VC5_DIRTY_FRAMEBUFFER (1ull << 13) +#define VC5_DIRTY_STIPPLE (1ull << 14) +#define VC5_DIRTY_VIEWPORT (1ull << 15) +#define VC5_DIRTY_CONSTBUF (1ull << 16) +#define VC5_DIRTY_VTXSTATE (1ull << 17) +#define VC5_DIRTY_VTXBUF (1ull << 18) +#define VC5_DIRTY_SCISSOR (1ull << 19) +#define VC5_DIRTY_FLAT_SHADE_FLAGS (1ull << 20) +#define VC5_DIRTY_PRIM_MODE (1ull << 21) +#define VC5_DIRTY_CLIP (1ull << 22) +#define VC5_DIRTY_UNCOMPILED_CS (1ull << 23) +#define VC5_DIRTY_UNCOMPILED_VS (1ull << 24) +#define VC5_DIRTY_UNCOMPILED_GS (1ull << 25) +#define VC5_DIRTY_UNCOMPILED_FS (1ull << 26) + +#define VC5_DIRTY_COMPILED_CS (1ull << 29) +#define VC5_DIRTY_COMPILED_VS (1ull << 30) +#define VC5_DIRTY_COMPILED_GS_BIN (1ULL << 31) +#define VC5_DIRTY_COMPILED_GS (1ULL << 32) +#define VC5_DIRTY_COMPILED_FS (1ull << 33) + +#define VC5_DIRTY_FS_INPUTS (1ull << 38) +#define VC5_DIRTY_GS_INPUTS (1ull << 39) +#define VC5_DIRTY_STREAMOUT (1ull << 40) +#define VC5_DIRTY_OQ (1ull << 41) +#define VC5_DIRTY_CENTROID_FLAGS (1ull << 42) +#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 43) +#define VC5_DIRTY_SSBO (1ull << 44) #define VC5_MAX_FS_INPUTS 64 @@ -202,6 +211,7 @@ union { struct v3d_prog_data *base; struct v3d_vs_prog_data *vs; + struct v3d_gs_prog_data *gs; struct v3d_fs_prog_data *fs; struct v3d_compute_prog_data *compute; } prog_data; @@ -211,12 +221,12 @@ * uniforms have to be rewritten (and therefore the shader state * reemitted). */ - uint32_t uniform_dirty_bits; + uint64_t uniform_dirty_bits; }; struct v3d_program_stateobj { - struct v3d_uncompiled_shader *bind_vs, *bind_fs, *bind_compute; - struct v3d_compiled_shader *cs, *vs, *fs, *compute; + struct v3d_uncompiled_shader *bind_vs, *bind_gs, *bind_fs, *bind_compute; + struct v3d_compiled_shader *cs, *vs, *gs_bin, *gs, *fs, *compute; struct hash_table *cache[MESA_SHADER_STAGES]; @@ -305,7 +315,6 @@ struct v3d_cl indirect; struct v3d_bo *tile_alloc; struct v3d_bo *tile_state; - uint32_t shader_rec_count; struct drm_v3d_submit_cl submit; @@ -345,6 +354,8 @@ */ uint32_t draw_width; uint32_t draw_height; + uint32_t num_layers; + /** @} */ /** @{ Tile information, depending on MSAA and float color buffer. */ uint32_t draw_tiles_x; /** @< Number of tiles wide for framebuffer. */ @@ -409,6 +420,12 @@ */ uint32_t draw_calls_queued; + /** + * Number of draw calls (not counting full buffer clears) queued in + * the current job during active transform feedback. + */ + uint32_t tf_draw_calls_queued; + struct v3d_job_key key; }; @@ -437,7 +454,7 @@ struct blitter_context *blitter; /** bitfield of VC5_DIRTY_* */ - uint32_t dirty; + uint64_t dirty; struct primconvert_context *primconvert; @@ -502,6 +519,12 @@ bool active_queries; + /** + * If a compute job writes a resource read by a non-compute stage we + * should sync on the last compute job. + */ + bool sync_on_last_compute_job; + uint32_t tf_prims_generated; uint32_t prims_generated; @@ -610,17 +633,21 @@ static inline bool v3d_transform_feedback_enabled(struct v3d_context *v3d) { - return v3d->prog.bind_vs->num_tf_specs != 0 && + return (v3d->prog.bind_vs->num_tf_specs != 0 || + (v3d->prog.bind_gs && v3d->prog.bind_gs->num_tf_specs != 0)) && v3d->active_queries; } void v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader); struct v3d_cl_reloc v3d_write_uniforms(struct v3d_context *v3d, + struct v3d_job *job, struct v3d_compiled_shader *shader, enum pipe_shader_type stage); void v3d_flush(struct pipe_context *pctx); void v3d_job_init(struct v3d_context *v3d); +struct v3d_job *v3d_job_create(struct v3d_context *v3d); +void v3d_job_free(struct v3d_context *v3d, struct v3d_job *job); struct v3d_job *v3d_get_job(struct v3d_context *v3d, struct pipe_surface **cbufs, struct pipe_surface *zsbuf); @@ -632,10 +659,12 @@ void v3d_flush_jobs_using_bo(struct v3d_context *v3d, struct v3d_bo *bo); void v3d_flush_jobs_writing_resource(struct v3d_context *v3d, struct pipe_resource *prsc, - enum v3d_flush_cond flush_cond); + enum v3d_flush_cond flush_cond, + bool is_compute_pipeline); void v3d_flush_jobs_reading_resource(struct v3d_context *v3d, struct pipe_resource *prsc, - enum v3d_flush_cond flush_cond); + enum v3d_flush_cond flush_cond, + bool is_compute_pipeline); void v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode); void v3d_update_compiled_cs(struct v3d_context *v3d); @@ -672,7 +701,7 @@ struct v3d_fence *v3d_fence_create(struct v3d_context *v3d); -void v3d_tf_update_counters(struct v3d_context *v3d); +void v3d_update_primitive_counters(struct v3d_context *v3d); #ifdef v3dX # include "v3dx_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_fence.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_fence.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -79,7 +79,7 @@ return false; } - drmSyncobjImportSyncFile(screen->fd, syncobj, f->fd); + ret = drmSyncobjImportSyncFile(screen->fd, syncobj, f->fd); if (ret) { fprintf(stderr, "Failed to import fence to syncobj: %d\n", ret); return false; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_job.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_job.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_job.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_job.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ #include "util/set.h" #include "broadcom/clif/clif_dump.h" -static void +void v3d_job_free(struct v3d_context *v3d, struct v3d_job *job) { set_foreach(job->bos, entry) { @@ -85,7 +85,7 @@ ralloc_free(job); } -static struct v3d_job * +struct v3d_job * v3d_job_create(struct v3d_context *v3d) { struct v3d_job *job = rzalloc(v3d, struct v3d_job); @@ -184,10 +184,23 @@ void v3d_flush_jobs_writing_resource(struct v3d_context *v3d, struct pipe_resource *prsc, - enum v3d_flush_cond flush_cond) + enum v3d_flush_cond flush_cond, + bool is_compute_pipeline) { struct hash_entry *entry = _mesa_hash_table_search(v3d->write_jobs, prsc); + struct v3d_resource *rsc = v3d_resource(prsc); + + /* We need to sync if graphics pipeline reads a resource written + * by the compute pipeline. The same would be needed for the case of + * graphics-compute dependency but nowadays all compute jobs + * are serialized with the previous submitted job. + */ + if (!is_compute_pipeline && rsc->bo != NULL && rsc->compute_written) { + v3d->sync_on_last_compute_job = true; + rsc->compute_written = false; + } + if (!entry) return; @@ -220,7 +233,8 @@ void v3d_flush_jobs_reading_resource(struct v3d_context *v3d, struct pipe_resource *prsc, - enum v3d_flush_cond flush_cond) + enum v3d_flush_cond flush_cond, + bool is_compute_pipeline) { struct v3d_resource *rsc = v3d_resource(prsc); @@ -230,7 +244,8 @@ * caller intends to write to the resource, so we don't care if * there was a previous TF write to it. */ - v3d_flush_jobs_writing_resource(v3d, prsc, flush_cond); + v3d_flush_jobs_writing_resource(v3d, prsc, flush_cond, + is_compute_pipeline); hash_table_foreach(v3d->jobs, entry) { struct v3d_job *job = entry->data; @@ -329,7 +344,8 @@ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { if (cbufs[i]) { v3d_flush_jobs_reading_resource(v3d, cbufs[i]->texture, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); pipe_surface_reference(&job->cbufs[i], cbufs[i]); if (cbufs[i]->texture->nr_samples > 1) @@ -338,7 +354,8 @@ } if (zsbuf) { v3d_flush_jobs_reading_resource(v3d, zsbuf->texture, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); pipe_surface_reference(&job->zsbuf, zsbuf); if (zsbuf->texture->nr_samples > 1) job->msaa = true; @@ -356,7 +373,8 @@ if (rsc->separate_stencil) { v3d_flush_jobs_reading_resource(v3d, &rsc->separate_stencil->base, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); _mesa_hash_table_insert(v3d->write_jobs, &rsc->separate_stencil->base, job); @@ -454,11 +472,16 @@ { assert(v3d->prim_counts); - perf_debug("stalling on TF counts readback"); + perf_debug("stalling on TF counts readback\n"); struct v3d_resource *rsc = v3d_resource(v3d->prim_counts); if (v3d_bo_wait(rsc->bo, PIPE_TIMEOUT_INFINITE, "prim-counts")) { uint32_t *map = v3d_bo_map(rsc->bo) + v3d->prim_counts_offset; v3d->tf_prims_generated += map[V3D_PRIM_COUNTS_TF_WRITTEN]; + /* When we only have a vertex shader we determine the primitive + * count in the CPU so don't update it here again. + */ + if (v3d->prog.gs) + v3d->prims_generated += map[V3D_PRIM_COUNTS_WRITTEN]; } } @@ -497,6 +520,10 @@ job->submit.bcl_end = job->bcl.bo->offset + cl_offset(&job->bcl); job->submit.rcl_end = job->rcl.bo->offset + cl_offset(&job->rcl); + job->submit.flags = 0; + if (job->tmu_dirty_rcl && screen->has_cache_flush) + job->submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE; + /* On V3D 4.1, the tile alloc/state setup moved to register writes * instead of binner packets. */ @@ -526,8 +553,16 @@ * feedback we need to read the primitive counts and accumulate * them, otherwise they will be reset at the start of the next * draw when we emit the Tile Binning Mode Configuration packet. + * + * If the job doesn't have any TF draw calls, then we know + * the primitive count must be zero and we can skip stalling + * for this. This also fixes a problem because it seems that + * in this scenario the counters are not reset with the Tile + * Binning Mode Configuration packet, which would translate + * to us reading an obsolete (possibly non-zero) value from + * the GPU counters. */ - if (v3d->streamout.num_targets) + if (v3d->streamout.num_targets && job->tf_draw_calls_queued > 0) v3d_read_and_accumulate_primitive_counters(v3d); } diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_program.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_program.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -22,7 +22,7 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -175,6 +175,27 @@ return glsl_count_attribute_slots(type, false); } +static void +precompile_all_outputs(nir_shader *s, + struct v3d_varying_slot *outputs, + uint8_t *num_outputs) +{ + nir_foreach_variable(var, &s->outputs) { + const int array_len = MAX2(glsl_get_length(var->type), 1); + for (int j = 0; j < array_len; j++) { + const int slot = var->data.location + j; + const int num_components = + glsl_get_components(var->type); + for (int i = 0; i < num_components; i++) { + const int swiz = var->data.location_frac + i; + outputs[(*num_outputs)++] = + v3d_slot_from_slot_and_component(slot, + swiz); + } + } + } +} + /** * Precompiles a shader variant at shader state creation time if * V3D_DEBUG=precompile is set. Used for shader-db @@ -204,35 +225,50 @@ v3d_setup_shared_precompile_key(so, &key.base); v3d_get_compiled_shader(v3d, &key.base, sizeof(key)); + } else if (s->info.stage == MESA_SHADER_GEOMETRY) { + struct v3d_gs_key key = { + .base.shader_state = so, + .base.is_last_geometry_stage = true, + }; + + v3d_setup_shared_precompile_key(so, &key.base); + + precompile_all_outputs(s, + key.used_outputs, + &key.num_used_outputs); + + v3d_get_compiled_shader(v3d, &key.base, sizeof(key)); + + /* Compile GS bin shader: only position (XXX: include TF) */ + key.is_coord = true; + key.num_used_outputs = 0; + for (int i = 0; i < 4; i++) { + key.used_outputs[key.num_used_outputs++] = + v3d_slot_from_slot_and_component(VARYING_SLOT_POS, + i); + } + v3d_get_compiled_shader(v3d, &key.base, sizeof(key)); } else { + assert(s->info.stage == MESA_SHADER_VERTEX); struct v3d_vs_key key = { .base.shader_state = so, + /* Emit fixed function outputs */ + .base.is_last_geometry_stage = true, }; v3d_setup_shared_precompile_key(so, &key.base); - /* Compile VS: All outputs */ - nir_foreach_variable(var, &s->outputs) { - unsigned array_len = MAX2(glsl_get_length(var->type), 1); - assert(array_len == 1); - (void)array_len; - - int slot = var->data.location; - for (int i = 0; i < glsl_get_components(var->type); i++) { - int swiz = var->data.location_frac + i; - key.fs_inputs[key.num_fs_inputs++] = - v3d_slot_from_slot_and_component(slot, - swiz); - } - } + precompile_all_outputs(s, + key.used_outputs, + &key.num_used_outputs); v3d_get_compiled_shader(v3d, &key.base, sizeof(key)); /* Compile VS bin shader: only position (XXX: include TF) */ key.is_coord = true; - key.num_fs_inputs = 0; + key.num_used_outputs = 0; for (int i = 0; i < 4; i++) { - key.fs_inputs[key.num_fs_inputs++] = + key.used_outputs[key.num_used_outputs++] = v3d_slot_from_slot_and_component(VARYING_SLOT_POS, i); } @@ -271,8 +307,10 @@ } nir_variable_mode lower_mode = nir_var_all & ~nir_var_uniform; - if (s->info.stage == MESA_SHADER_VERTEX) + if (s->info.stage == MESA_SHADER_VERTEX || + s->info.stage == MESA_SHADER_GEOMETRY) { lower_mode &= ~(nir_var_shader_in | nir_var_shader_out); + } NIR_PASS_V(s, nir_lower_io, lower_mode, type_size, (nir_lower_io_options)0); @@ -609,17 +647,92 @@ } static void +v3d_update_compiled_gs(struct v3d_context *v3d, uint8_t prim_mode) +{ + struct v3d_gs_key local_key; + struct v3d_gs_key *key = &local_key; + + if (!(v3d->dirty & (VC5_DIRTY_GEOMTEX | + VC5_DIRTY_RASTERIZER | + VC5_DIRTY_UNCOMPILED_GS | + VC5_DIRTY_PRIM_MODE | + VC5_DIRTY_FS_INPUTS))) { + return; + } + + if (!v3d->prog.bind_gs) { + v3d->prog.gs = NULL; + v3d->prog.gs_bin = NULL; + return; + } + + memset(key, 0, sizeof(*key)); + v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_GEOMETRY]); + key->base.shader_state = v3d->prog.bind_gs; + key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable; + key->base.is_last_geometry_stage = true; + key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs; + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(v3d->prog.fs->prog_data.fs->input_slots)); + memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots, + sizeof(key->used_outputs)); + + key->per_vertex_point_size = + (prim_mode == PIPE_PRIM_POINTS && + v3d->rasterizer->base.point_size_per_vertex); + + struct v3d_compiled_shader *gs = + v3d_get_compiled_shader(v3d, &key->base, sizeof(*key)); + if (gs != v3d->prog.gs) { + v3d->prog.gs = gs; + v3d->dirty |= VC5_DIRTY_COMPILED_GS; + } + + key->is_coord = true; + + /* The last bin-mode shader in the geometry pipeline only outputs + * varyings used by transform feedback. + */ + struct v3d_uncompiled_shader *shader_state = key->base.shader_state; + memcpy(key->used_outputs, shader_state->tf_outputs, + sizeof(*key->used_outputs) * shader_state->num_tf_outputs); + if (shader_state->num_tf_outputs < key->num_used_outputs) { + uint32_t size = sizeof(*key->used_outputs) * + (key->num_used_outputs - + shader_state->num_tf_outputs); + memset(&key->used_outputs[shader_state->num_tf_outputs], + 0, size); + } + key->num_used_outputs = shader_state->num_tf_outputs; + + struct v3d_compiled_shader *old_gs = v3d->prog.gs; + struct v3d_compiled_shader *gs_bin = + v3d_get_compiled_shader(v3d, &key->base, sizeof(*key)); + if (gs_bin != old_gs) { + v3d->prog.gs_bin = gs_bin; + v3d->dirty |= VC5_DIRTY_COMPILED_GS_BIN; + } + + if (old_gs && memcmp(v3d->prog.gs->prog_data.gs->input_slots, + old_gs->prog_data.gs->input_slots, + sizeof(v3d->prog.gs->prog_data.gs->input_slots))) { + v3d->dirty |= VC5_DIRTY_GS_INPUTS; + } +} + +static void v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode) { struct v3d_vs_key local_key; struct v3d_vs_key *key = &local_key; - if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE | - VC5_DIRTY_RASTERIZER | - VC5_DIRTY_VERTTEX | + if (!(v3d->dirty & (VC5_DIRTY_VERTTEX | VC5_DIRTY_VTXSTATE | VC5_DIRTY_UNCOMPILED_VS | - VC5_DIRTY_FS_INPUTS))) { + (v3d->prog.bind_gs ? 0 : VC5_DIRTY_RASTERIZER) | + (v3d->prog.bind_gs ? 0 : VC5_DIRTY_PRIM_MODE) | + (v3d->prog.bind_gs ? VC5_DIRTY_GS_INPUTS : + VC5_DIRTY_FS_INPUTS)))) { return; } @@ -627,11 +740,22 @@ v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]); key->base.shader_state = v3d->prog.bind_vs; key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable; - key->num_fs_inputs = v3d->prog.fs->prog_data.fs->num_inputs; - STATIC_ASSERT(sizeof(key->fs_inputs) == - sizeof(v3d->prog.fs->prog_data.fs->input_slots)); - memcpy(key->fs_inputs, v3d->prog.fs->prog_data.fs->input_slots, - sizeof(key->fs_inputs)); + key->base.is_last_geometry_stage = !v3d->prog.bind_gs; + + if (!v3d->prog.bind_gs) { + key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs; + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(v3d->prog.fs->prog_data.fs->input_slots)); + memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots, + sizeof(key->used_outputs)); + } else { + key->num_used_outputs = v3d->prog.gs->prog_data.gs->num_inputs; + STATIC_ASSERT(sizeof(key->used_outputs) == + sizeof(v3d->prog.gs->prog_data.gs->input_slots)); + memcpy(key->used_outputs, v3d->prog.gs->prog_data.gs->input_slots, + sizeof(key->used_outputs)); + } + key->clamp_color = v3d->rasterizer->base.clamp_vertex_color; key->per_vertex_point_size = @@ -646,17 +770,29 @@ } key->is_coord = true; - /* Coord shaders only output varyings used by transform feedback. */ - struct v3d_uncompiled_shader *shader_state = key->base.shader_state; - memcpy(key->fs_inputs, shader_state->tf_outputs, - sizeof(*key->fs_inputs) * shader_state->num_tf_outputs); - if (shader_state->num_tf_outputs < key->num_fs_inputs) { - memset(&key->fs_inputs[shader_state->num_tf_outputs], - 0, - sizeof(*key->fs_inputs) * (key->num_fs_inputs - - shader_state->num_tf_outputs)); + + /* Coord shaders only output varyings used by transform feedback, + * unless they are linked to other shaders in the geometry side + * of the pipeline, since in that case any of the output varyings + * could be required in later geometry stages to compute + * gl_Position or TF outputs. + */ + if (!v3d->prog.bind_gs) { + struct v3d_uncompiled_shader *shader_state = + key->base.shader_state; + memcpy(key->used_outputs, shader_state->tf_outputs, + sizeof(*key->used_outputs) * + shader_state->num_tf_outputs); + if (shader_state->num_tf_outputs < key->num_used_outputs) { + uint32_t tail_bytes = + sizeof(*key->used_outputs) * + (key->num_used_outputs - + shader_state->num_tf_outputs); + memset(&key->used_outputs[shader_state->num_tf_outputs], + 0, tail_bytes); + } + key->num_used_outputs = shader_state->num_tf_outputs; } - key->num_fs_inputs = shader_state->num_tf_outputs; struct v3d_compiled_shader *cs = v3d_get_compiled_shader(v3d, &key->base, sizeof(*key)); @@ -670,6 +806,7 @@ v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode) { v3d_update_compiled_fs(v3d, prim_mode); + v3d_update_compiled_gs(v3d, prim_mode); v3d_update_compiled_vs(v3d, prim_mode); } @@ -679,9 +816,8 @@ struct v3d_key local_key; struct v3d_key *key = &local_key; - if (!(v3d->dirty & (~0 | /* XXX */ - VC5_DIRTY_VERTTEX | - VC5_DIRTY_UNCOMPILED_FS))) { + if (!(v3d->dirty & (VC5_DIRTY_UNCOMPILED_CS | + VC5_DIRTY_COMPTEX))) { return; } @@ -704,6 +840,12 @@ } static uint32_t +gs_cache_hash(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct v3d_gs_key)); +} + +static uint32_t vs_cache_hash(const void *key) { return _mesa_hash_data(key, sizeof(struct v3d_vs_key)); @@ -722,6 +864,12 @@ } static bool +gs_cache_compare(const void *key1, const void *key2) +{ + return memcmp(key1, key2, sizeof(struct v3d_gs_key)) == 0; +} + +static bool vs_cache_compare(const void *key1, const void *key2) { return memcmp(key1, key2, sizeof(struct v3d_vs_key)) == 0; @@ -773,6 +921,14 @@ } static void +v3d_gp_state_bind(struct pipe_context *pctx, void *hwcso) +{ + struct v3d_context *v3d = v3d_context(pctx); + v3d->prog.bind_gs = hwcso; + v3d->dirty |= VC5_DIRTY_UNCOMPILED_GS; +} + +static void v3d_vp_state_bind(struct pipe_context *pctx, void *hwcso) { struct v3d_context *v3d = v3d_context(pctx); @@ -786,6 +942,7 @@ struct v3d_context *v3d = v3d_context(pctx); v3d->prog.bind_compute = state; + v3d->dirty |= VC5_DIRTY_UNCOMPILED_CS; } static void * @@ -804,10 +961,14 @@ pctx->create_vs_state = v3d_shader_state_create; pctx->delete_vs_state = v3d_shader_state_delete; + pctx->create_gs_state = v3d_shader_state_create; + pctx->delete_gs_state = v3d_shader_state_delete; + pctx->create_fs_state = v3d_shader_state_create; pctx->delete_fs_state = v3d_shader_state_delete; pctx->bind_fs_state = v3d_fp_state_bind; + pctx->bind_gs_state = v3d_gp_state_bind; pctx->bind_vs_state = v3d_vp_state_bind; if (v3d->screen->has_csd) { @@ -818,6 +979,8 @@ v3d->prog.cache[MESA_SHADER_VERTEX] = _mesa_hash_table_create(pctx, vs_cache_hash, vs_cache_compare); + v3d->prog.cache[MESA_SHADER_GEOMETRY] = + _mesa_hash_table_create(pctx, gs_cache_hash, gs_cache_compare); v3d->prog.cache[MESA_SHADER_FRAGMENT] = _mesa_hash_table_create(pctx, fs_cache_hash, fs_cache_compare); v3d->prog.cache[MESA_SHADER_COMPUTE] = diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_query.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_query.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -72,6 +72,13 @@ switch (q->type) { case PIPE_QUERY_PRIMITIVES_GENERATED: + /* If we are using PRIMITIVE_COUNTS_FEEDBACK to retrieve + * primitive counts from the GPU (which we need when a GS + * is present), then we need to update our counters now + * to discard any primitives generated before this. + */ + if (v3d->prog.gs) + v3d_update_primitive_counters(v3d); q->start = v3d->prims_generated; break; case PIPE_QUERY_PRIMITIVES_EMITTED: @@ -79,7 +86,7 @@ * primitive counts to skip primtives recorded before this. */ if (v3d->streamout.num_targets > 0) - v3d_tf_update_counters(v3d); + v3d_update_primitive_counters(v3d); q->start = v3d->tf_prims_generated; break; case PIPE_QUERY_OCCLUSION_COUNTER: @@ -107,6 +114,12 @@ switch (q->type) { case PIPE_QUERY_PRIMITIVES_GENERATED: + /* If we are using PRIMITIVE_COUNTS_FEEDBACK to retrieve + * primitive counts from the GPU (which we need when a GS + * is present), then we need to update our counters now. + */ + if (v3d->prog.gs) + v3d_update_primitive_counters(v3d); q->end = v3d->prims_generated; break; case PIPE_QUERY_PRIMITIVES_EMITTED: @@ -115,7 +128,7 @@ * time. Otherwise, we have to do it now. */ if (v3d->streamout.num_targets > 0) - v3d_tf_update_counters(v3d); + v3d_update_primitive_counters(v3d); q->end = v3d->tf_prims_generated; break; case PIPE_QUERY_OCCLUSION_COUNTER: diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_resource.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_resource.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,12 +25,12 @@ #include "pipe/p_defines.h" #include "util/u_blit.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" #include "util/u_transfer_helper.h" #include "util/u_upload_mgr.h" -#include "util/u_format_zs.h" +#include "util/format/u_format_zs.h" #include "util/u_drm.h" #include "drm-uapi/drm_fourcc.h" @@ -170,19 +170,23 @@ * don't violate any syncing requirements. */ v3d_flush_jobs_reading_resource(v3d, prsc, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); } } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { /* If we're writing and the buffer is being used by the CL, we * have to flush the CL first. If we're only reading, we need * to flush if the CL has written our buffer. */ - if (usage & PIPE_TRANSFER_WRITE) + if (usage & PIPE_TRANSFER_WRITE) { v3d_flush_jobs_reading_resource(v3d, prsc, - V3D_FLUSH_ALWAYS); - else + V3D_FLUSH_ALWAYS, + false); + } else { v3d_flush_jobs_writing_resource(v3d, prsc, - V3D_FLUSH_ALWAYS); + V3D_FLUSH_ALWAYS, + false); + } } if (usage & PIPE_TRANSFER_WRITE) { @@ -992,8 +996,6 @@ if (!surface) return NULL; - assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); - struct pipe_surface *psurf = &surface->base; unsigned level = surf_tmpl->u.tex.level; struct v3d_resource_slice *slice = &rsc->slices[level]; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_resource.h mesa-20.0.8/src/gallium/drivers/v3d/v3d_resource.h --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -130,6 +130,11 @@ bool tiled; /** + * Indicates if the CS has written the resource + */ + bool compute_written; + + /** * Number of times the resource has been written to. * * This is used to track whether we need to load the surface on first diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_screen.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_screen.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ #include "util/u_debug.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_hash_table.h" #include "util/u_screen.h" #include "util/u_transfer_helper.h" @@ -175,11 +175,17 @@ return 4; case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - return 4; + if (screen->has_cache_flush) + return 4; + else + return 0; /* Disables shader storage */ case PIPE_CAP_GLSL_FEATURE_LEVEL: return 330; + case PIPE_CAP_ESSL_FEATURE_LEVEL: + return 310; + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: return 140; @@ -245,6 +251,16 @@ case PIPE_CAP_UMA: return 1; + /* Geometry shaders */ + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + /* Minimum required by GLES 3.2 */ + return 1024; + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + /* MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS / 4 */ + return 256; + case PIPE_CAP_MAX_GS_INVOCATIONS: + return 32; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -291,6 +307,10 @@ if (!screen->has_csd) return 0; break; + case PIPE_SHADER_GEOMETRY: + if (screen->devinfo.ver < 41) + return 0; + break; default: return 0; } @@ -307,10 +327,16 @@ return UINT_MAX; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_FRAGMENT) - return V3D_MAX_FS_INPUTS / 4; - else + switch (shader) { + case PIPE_SHADER_VERTEX: return V3D_MAX_VS_INPUTS / 4; + case PIPE_SHADER_GEOMETRY: + return V3D_MAX_GS_INPUTS / 4; + case PIPE_SHADER_FRAGMENT: + return V3D_MAX_FS_INPUTS / 4; + default: + return 0; + }; case PIPE_SHADER_CAP_MAX_OUTPUTS: if (shader == PIPE_SHADER_FRAGMENT) return 4; @@ -328,6 +354,7 @@ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 0; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + return 1; case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: return 0; case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: @@ -348,23 +375,30 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return V3D_MAX_TEXTURE_SAMPLERS; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - if (shader == PIPE_SHADER_VERTEX) + if (screen->has_cache_flush) { + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_GEOMETRY) { + return 0; + } + return PIPE_MAX_SHADER_BUFFERS; + } else { return 0; - - return PIPE_MAX_SHADER_BUFFERS; + } case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - if (screen->devinfo.ver < 41) + if (screen->has_cache_flush) { + if (screen->devinfo.ver < 41) + return 0; + else + return PIPE_MAX_SHADER_IMAGES; + } else { return 0; - else - return PIPE_MAX_SHADER_IMAGES; + } case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_NIR; @@ -668,7 +702,9 @@ slab_create_parent(&screen->transfer_pool, sizeof(struct v3d_transfer), 16); - screen->has_csd = false; /* until the UABI is enabled. */ + screen->has_csd = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CSD); + screen->has_cache_flush = + v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH); v3d_fence_init(screen); diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_screen.h mesa-20.0.8/src/gallium/drivers/v3d/v3d_screen.h --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -78,6 +78,7 @@ uint32_t bo_count; bool has_csd; + bool has_cache_flush; bool nonmsaa_texture_size_limit; struct v3d_simulator_file *sim_file; diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_simulator.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_simulator.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_simulator.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_simulator.c 2020-06-12 01:21:17.000000000 +0000 @@ -515,6 +515,28 @@ return ret; } +static int +v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) +{ + struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd); + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; + int ret; + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_in_handle(file, bo_handles[i]); + + if (sim_state.ver >= 41) + ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, + file->gmp->ofs); + else + ret = -1; + + for (int i = 0; i < args->bo_handle_count; i++) + v3d_simulator_copy_out_handle(file, bo_handles[i]); + + return ret; +} + int v3d_simulator_ioctl(int fd, unsigned long request, void *args) { @@ -545,6 +567,9 @@ case DRM_IOCTL_V3D_SUBMIT_TFU: return v3d_simulator_submit_tfu_ioctl(fd, args); + case DRM_IOCTL_V3D_SUBMIT_CSD: + return v3d_simulator_submit_csd_ioctl(fd, args); + case DRM_IOCTL_GEM_OPEN: case DRM_IOCTL_GEM_FLINK: return drmIoctl(fd, request, args); diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3d_uniforms.c mesa-20.0.8/src/gallium/drivers/v3d/v3d_uniforms.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3d_uniforms.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3d_uniforms.c 2020-06-12 01:21:17.000000000 +0000 @@ -206,19 +206,24 @@ } struct v3d_cl_reloc -v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader, +v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, + struct v3d_compiled_shader *shader, enum pipe_shader_type stage) { struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage]; struct v3d_texture_stateobj *texstate = &v3d->tex[stage]; struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; - struct v3d_job *job = v3d->job; const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; - /* We always need to return some space for uniforms, because the HW - * will be prefetching, even if we don't read any in the program. + /* The hardware always pre-fetches the next uniform (also when there + * aren't any), so we always allocate space for an extra slot. This + * fixes MMU exceptions reported since Linux kernel 5.4 when the + * uniforms fill up the tail bytes of a page in the indirect + * BO. In that scenario, when the hardware pre-fetches after reading + * the last uniform it will read beyond the end of the page and trigger + * the MMU exception. */ - v3d_cl_ensure_space(&job->indirect, MAX2(uinfo->count, 1) * 4, 4); + v3d_cl_ensure_space(&job->indirect, (uinfo->count + 1) * 4, 4); struct v3d_cl_reloc uniform_stream = cl_get_address(&job->indirect); v3d_bo_reference(uniform_stream.bo); @@ -368,6 +373,10 @@ v3d->compute_shared_memory, 0); break; + case QUNIFORM_FB_LAYERS: + cl_aligned_u32(&uniforms, job->num_layers); + break; + default: assert(quniform_contents_is_texture_p0(uinfo->contents[i])); @@ -434,7 +443,8 @@ /* We could flag this on just the stage we're * compiling for, but it's not passed in. */ - dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX; + dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX | + VC5_DIRTY_GEOMTEX | VC5_DIRTY_COMPTEX; break; case QUNIFORM_SSBO_OFFSET: @@ -459,9 +469,14 @@ /* Compute always recalculates uniforms. */ break; + case QUNIFORM_FB_LAYERS: + dirty |= VC5_DIRTY_FRAMEBUFFER; + break; + default: assert(quniform_contents_is_texture_p0(shader->prog_data.base->uniforms.contents[i])); - dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX; + dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX | + VC5_DIRTY_GEOMTEX | VC5_DIRTY_COMPTEX; break; } } diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_context.h mesa-20.0.8/src/gallium/drivers/v3d/v3dx_context.h --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -44,6 +44,9 @@ uint32_t gmp_offset); int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_tfu *args); +int v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_csd *args, + uint32_t gmp_offset); const struct v3d_format *v3dX(get_format_desc)(enum pipe_format f); void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, uint32_t *type, diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_draw.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_draw.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "util/u_blitter.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_pack_color.h" #include "util/u_prim_restart.h" #include "util/u_upload_mgr.h" @@ -55,11 +55,14 @@ job->submit.bcl_start = job->bcl.bo->offset; v3d_job_add_bo(job, job->bcl.bo); + uint32_t fb_layers = util_framebuffer_get_num_layers(&v3d->framebuffer); + /* The PTB will request the tile alloc initial size per tile at start * of tile binning. */ - uint32_t tile_alloc_size = (job->draw_tiles_x * - job->draw_tiles_y) * 64; + uint32_t tile_alloc_size = + MAX2(fb_layers, 1) * job->draw_tiles_x * job->draw_tiles_y * 64; + /* The PTB allocates in aligned 4k chunks after the initial setup. */ tile_alloc_size = align(tile_alloc_size, 4096); @@ -79,10 +82,21 @@ "tile_alloc"); uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64; job->tile_state = v3d_bo_alloc(v3d->screen, + MAX2(fb_layers, 1) * job->draw_tiles_y * job->draw_tiles_x * tsda_per_tile_size, "TSDA"); +#if V3D_VERSION >= 41 + /* This must go before the binning mode configuration. It is + * required for layered framebuffers to work. + */ + if (fb_layers > 0) { + cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) { + config.number_of_layers = fb_layers; + } + } +#endif #if V3D_VERSION >= 40 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { @@ -137,6 +151,7 @@ job->needs_flush = true; job->draw_width = v3d->framebuffer.width; job->draw_height = v3d->framebuffer.height; + job->num_layers = fb_layers; } static void @@ -157,7 +172,8 @@ v3d_update_shadow_texture(pctx, &view->base); v3d_flush_jobs_writing_resource(v3d, view->texture, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + s == PIPE_SHADER_COMPUTE); } /* Flush writes to UBOs. */ @@ -165,7 +181,8 @@ struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i]; if (cb->buffer) { v3d_flush_jobs_writing_resource(v3d, cb->buffer, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + s == PIPE_SHADER_COMPUTE); } } @@ -174,7 +191,8 @@ struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i]; if (sb->buffer) { v3d_flush_jobs_reading_resource(v3d, sb->buffer, - V3D_FLUSH_NOT_CURRENT_JOB); + V3D_FLUSH_NOT_CURRENT_JOB, + s == PIPE_SHADER_COMPUTE); } } @@ -183,7 +201,8 @@ struct v3d_image_view *view = &v3d->shaderimg[s].si[i]; v3d_flush_jobs_reading_resource(v3d, view->base.resource, - V3D_FLUSH_NOT_CURRENT_JOB); + V3D_FLUSH_NOT_CURRENT_JOB, + s == PIPE_SHADER_COMPUTE); } /* Flush writes to our vertex buffers (i.e. from transform feedback) */ @@ -192,7 +211,8 @@ struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i]; v3d_flush_jobs_writing_resource(v3d, vb->buffer.resource, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); } } } @@ -213,7 +233,8 @@ const struct pipe_stream_output_target *target = so->targets[i]; v3d_flush_jobs_reading_resource(v3d, target->buffer, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, + false); } } } @@ -328,6 +349,279 @@ } } +struct vpm_config { + uint32_t As; + uint32_t Vc; + uint32_t Gs; + uint32_t Gd; + uint32_t Gv; + uint32_t Ve; + uint32_t gs_width; +}; + +#if V3D_VERSION >= 41 +static void +v3d_emit_gs_state_record(struct v3d_job *job, + struct v3d_compiled_shader *gs_bin, + struct v3d_cl_reloc gs_bin_uniforms, + struct v3d_compiled_shader *gs, + struct v3d_cl_reloc gs_render_uniforms) +{ + cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) { + shader.geometry_bin_mode_shader_code_address = + cl_address(v3d_resource(gs_bin->resource)->bo, + gs_bin->offset); + shader.geometry_bin_mode_shader_4_way_threadable = + gs_bin->prog_data.gs->base.threads == 4; + shader.geometry_bin_mode_shader_start_in_final_thread_section = + gs_bin->prog_data.gs->base.single_seg; + shader.geometry_bin_mode_shader_propagate_nans = true; + shader.geometry_bin_mode_shader_uniforms_address = + gs_bin_uniforms; + + shader.geometry_render_mode_shader_code_address = + cl_address(v3d_resource(gs->resource)->bo, gs->offset); + shader.geometry_render_mode_shader_4_way_threadable = + gs->prog_data.gs->base.threads == 4; + shader.geometry_render_mode_shader_start_in_final_thread_section = + gs->prog_data.gs->base.single_seg; + shader.geometry_render_mode_shader_propagate_nans = true; + shader.geometry_render_mode_shader_uniforms_address = + gs_render_uniforms; + } +} + +static uint8_t +v3d_gs_output_primitive(uint32_t prim_type) +{ + switch (prim_type) { + case GL_POINTS: + return GEOMETRY_SHADER_POINTS; + case GL_LINE_STRIP: + return GEOMETRY_SHADER_LINE_STRIP; + case GL_TRIANGLE_STRIP: + return GEOMETRY_SHADER_TRI_STRIP; + default: + unreachable("Unsupported primitive type"); + } +} + +static void +v3d_emit_tes_gs_common_params(struct v3d_job *job, + uint8_t gs_out_prim_type, + uint8_t gs_num_invocations) +{ + /* This, and v3d_emit_tes_gs_shader_params below, fill in default + * values for tessellation fields even though we don't support + * tessellation yet because our packing functions (and the simulator) + * complain if we don't. + */ + cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) { + shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE; + shader.tessellation_point_mode = false; + shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN; + shader.tessellation_clockwise = true; + shader.tessellation_invocations = 1; + + shader.geometry_shader_output_format = + v3d_gs_output_primitive(gs_out_prim_type); + shader.geometry_shader_instances = gs_num_invocations & 0x1F; + } +} + +static uint8_t +simd_width_to_gs_pack_mode(uint32_t width) +{ + switch (width) { + case 16: + return V3D_PACK_MODE_16_WAY; + case 8: + return V3D_PACK_MODE_8_WAY; + case 4: + return V3D_PACK_MODE_4_WAY; + case 1: + return V3D_PACK_MODE_1_WAY; + default: + unreachable("Invalid SIMD width"); + }; +} + +static void +v3d_emit_tes_gs_shader_params(struct v3d_job *job, + uint32_t gs_simd, + uint32_t gs_vpm_output_size, + uint32_t gs_max_vpm_input_size_per_batch) +{ + cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) { + shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED; + shader.per_patch_data_column_depth = 1; + shader.tcs_output_segment_size_in_sectors = 1; + shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; + shader.tes_output_segment_size_in_sectors = 1; + shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; + shader.gs_output_segment_size_in_sectors = gs_vpm_output_size; + shader.gs_output_segment_pack_mode = + simd_width_to_gs_pack_mode(gs_simd); + shader.tbg_max_patches_per_tcs_batch = 1; + shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0; + shader.tbg_min_tcs_output_segments_required_in_play = 1; + shader.tbg_min_per_patch_data_segments_required_in_play = 1; + shader.tpg_max_patches_per_tes_batch = 1; + shader.tpg_max_vertex_segments_per_tes_batch = 0; + shader.tpg_max_tcs_output_segments_per_tes_batch = 1; + shader.tpg_min_tes_output_segments_required_in_play = 1; + shader.gbg_max_tes_output_vertex_segments_per_gs_batch = + gs_max_vpm_input_size_per_batch; + shader.gbg_min_gs_output_segments_required_in_play = 1; + } +} + +static inline uint32_t +compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo) +{ + assert(devinfo->vpm_size > 0); + const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; + return devinfo->vpm_size / sector_size; +} + +/* Computes various parameters affecting VPM memory configuration for programs + * involving geometry shaders to ensure the program fits in memory and honors + * requirements described in section "VPM usage" of the programming manual. + */ +static void +compute_vpm_config_gs(struct v3d_device_info *devinfo, + struct v3d_vs_prog_data *vs, + struct v3d_gs_prog_data *gs, + struct vpm_config *vpm_cfg_out) +{ + const uint32_t A = vs->separate_segments ? 1 : 0; + const uint32_t Ad = vs->vpm_input_size; + const uint32_t Vd = vs->vpm_output_size; + + const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo); + + /* Try to fit program into our VPM memory budget by adjusting + * configurable parameters iteratively. We do this in two phases: + * the first phase tries to fit the program into the total available + * VPM memory. If we suceed at that, then the second phase attempts + * to fit the program into half of that budget so we can run bin and + * render programs in parallel. + */ + struct vpm_config vpm_cfg[2]; + struct vpm_config *final_vpm_cfg = NULL; + uint32_t phase = 0; + + vpm_cfg[phase].As = 1; + vpm_cfg[phase].Gs = 1; + vpm_cfg[phase].Gd = gs->vpm_output_size; + vpm_cfg[phase].gs_width = gs->simd_width; + + /* While there is a requirement that Vc >= [Vn / 16], this is + * always the case when tessellation is not present because in that + * case Vn can only be 6 at most (when input primitive is triangles + * with adjacency). + * + * We always choose Vc=2. We can't go lower than this due to GFXH-1744, + * and Broadcom has not found it worth it to increase it beyond this + * in general. Increasing Vc also increases VPM memory pressure which + * can turn up being detrimental for performance in some scenarios. + */ + vpm_cfg[phase].Vc = 2; + + /* Gv is a constraint on the hardware to not exceed the + * specified number of vertex segments per GS batch. If adding a + * new primitive to a GS batch would result in a range of more + * than Gv vertex segments being referenced by the batch, then + * the hardware will flush the batch and start a new one. This + * means that we can choose any value we want, we just need to + * be aware that larger values improve GS batch utilization + * at the expense of more VPM memory pressure (which can affect + * other performance aspects, such as GS dispatch width). + * We start with the largest value, and will reduce it if we + * find that total memory pressure is too high. + */ + vpm_cfg[phase].Gv = 3; + do { + /* When GS is present in absence of TES, then we need to satisfy + * that Ve >= Gv. We go with the smallest value of Ve to avoid + * increasing memory pressure. + */ + vpm_cfg[phase].Ve = vpm_cfg[phase].Gv; + + uint32_t vpm_sectors = + A * vpm_cfg[phase].As * Ad + + (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd + + vpm_cfg[phase].Gs * vpm_cfg[phase].Gd; + + /* Ideally we want to use no more than half of the available + * memory so we can execute a bin and render program in parallel + * without stalls. If we achieved that then we are done. + */ + if (vpm_sectors <= vpm_size / 2) { + final_vpm_cfg = &vpm_cfg[phase]; + break; + } + + /* At the very least, we should not allocate more than the + * total available VPM memory. If we have a configuration that + * succeeds at this we save it and continue to see if we can + * meet the half-memory-use criteria too. + */ + if (phase == 0 && vpm_sectors <= vpm_size) { + vpm_cfg[1] = vpm_cfg[0]; + phase = 1; + } + + /* Try lowering Gv */ + if (vpm_cfg[phase].Gv > 0) { + vpm_cfg[phase].Gv--; + continue; + } + + /* Try lowering GS dispatch width */ + if (vpm_cfg[phase].gs_width > 1) { + do { + vpm_cfg[phase].gs_width >>= 1; + vpm_cfg[phase].Gd = + align(vpm_cfg[phase].Gd, 2) / 2; + } while (vpm_cfg[phase].gs_width == 2); + + /* Reset Gv to max after dropping dispatch width */ + vpm_cfg[phase].Gv = 3; + continue; + } + + /* We ran out of options to reduce memory pressure. If we + * are at phase 1 we have at least a valid configuration, so we + * we use that. + */ + if (phase == 1) + final_vpm_cfg = &vpm_cfg[0]; + break; + } while (true); + + if (!final_vpm_cfg) { + /* FIXME: maybe return a boolean to indicate failure and use + * that to stop the submission for this draw call. + */ + fprintf(stderr, "Failed to allocate VPM memory.\n"); + abort(); + } + + assert(final_vpm_cfg); + assert(final_vpm_cfg->Gd <= 16); + assert(final_vpm_cfg->Gv < 4); + assert(final_vpm_cfg->Ve < 4); + assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); + assert(final_vpm_cfg->gs_width == 1 || + final_vpm_cfg->gs_width == 4 || + final_vpm_cfg->gs_width == 8 || + final_vpm_cfg->gs_width == 16); + + *vpm_cfg_out = *final_vpm_cfg; +} +#endif + static void v3d_emit_gl_shader_state(struct v3d_context *v3d, const struct pipe_draw_info *info) @@ -340,20 +634,57 @@ /* Upload the uniforms to the indirect CL first */ struct v3d_cl_reloc fs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.fs, + v3d_write_uniforms(v3d, job, v3d->prog.fs, PIPE_SHADER_FRAGMENT); + + struct v3d_cl_reloc gs_uniforms = { NULL, 0 }; + struct v3d_cl_reloc gs_bin_uniforms = { NULL, 0 }; + if (v3d->prog.gs) { + gs_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs, + PIPE_SHADER_GEOMETRY); + } + if (v3d->prog.gs_bin) { + gs_bin_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs_bin, + PIPE_SHADER_GEOMETRY); + } + struct v3d_cl_reloc vs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.vs, + v3d_write_uniforms(v3d, job, v3d->prog.vs, PIPE_SHADER_VERTEX); struct v3d_cl_reloc cs_uniforms = - v3d_write_uniforms(v3d, v3d->prog.cs, + v3d_write_uniforms(v3d, job, v3d->prog.cs, PIPE_SHADER_VERTEX); + /* Update the cache dirty flag based on the shader progs data */ + job->tmu_dirty_rcl |= v3d->prog.cs->prog_data.vs->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= v3d->prog.vs->prog_data.vs->base.tmu_dirty_rcl; + if (v3d->prog.gs_bin) { + job->tmu_dirty_rcl |= + v3d->prog.gs_bin->prog_data.gs->base.tmu_dirty_rcl; + } + if (v3d->prog.gs) { + job->tmu_dirty_rcl |= + v3d->prog.gs->prog_data.gs->base.tmu_dirty_rcl; + } + job->tmu_dirty_rcl |= v3d->prog.fs->prog_data.fs->base.tmu_dirty_rcl; + /* See GFXH-930 workaround below */ uint32_t num_elements_to_emit = MAX2(vtx->num_elements, 1); + + uint32_t shader_state_record_length = + cl_packet_length(GL_SHADER_STATE_RECORD); +#if V3D_VERSION >= 41 + if (v3d->prog.gs) { + shader_state_record_length += + cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) + + cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) + + 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS); + } +#endif + uint32_t shader_rec_offset = - v3d_cl_ensure_space(&job->indirect, - cl_packet_length(GL_SHADER_STATE_RECORD) + + v3d_cl_ensure_space(&job->indirect, + shader_state_record_length + num_elements_to_emit * cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD), 32); @@ -362,6 +693,54 @@ * compile time, so that we mostly just have to OR the VS and FS * records together at draw time. */ + + struct vpm_config vpm_cfg_bin, vpm_cfg; + + assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs); + if (!v3d->prog.gs) { + vpm_cfg_bin.As = 1; + vpm_cfg_bin.Ve = 0; + vpm_cfg_bin.Vc = v3d->prog.cs->prog_data.vs->vcm_cache_size; + + vpm_cfg.As = 1; + vpm_cfg.Ve = 0; + vpm_cfg.Vc = v3d->prog.vs->prog_data.vs->vcm_cache_size; + } +#if V3D_VERSION >= 41 + else { + v3d_emit_gs_state_record(v3d->job, + v3d->prog.gs_bin, gs_bin_uniforms, + v3d->prog.gs, gs_uniforms); + + struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; + struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; + + v3d_emit_tes_gs_common_params(v3d->job, + gs->out_prim_type, + gs->num_invocations); + + /* Bin Tes/Gs params */ + struct v3d_vs_prog_data *vs_bin = v3d->prog.cs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs_bin, gs_bin, &vpm_cfg_bin); + + v3d_emit_tes_gs_shader_params(v3d->job, + vpm_cfg_bin.gs_width, + vpm_cfg_bin.Gd, + vpm_cfg_bin.Gv); + + /* Render Tes/Gs params */ + struct v3d_vs_prog_data *vs = v3d->prog.vs->prog_data.vs; + compute_vpm_config_gs(&v3d->screen->devinfo, + vs, gs, &vpm_cfg); + + v3d_emit_tes_gs_shader_params(v3d->job, + vpm_cfg.gs_width, + vpm_cfg.Gd, + vpm_cfg.Gv); + } +#endif + cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) { shader.enable_clipping = true; /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */ @@ -385,6 +764,12 @@ shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = v3d->prog.fs->prog_data.fs->uses_center_w; +#if V3D_VERSION >= 41 + shader.any_shader_reads_hardware_written_primitive_id = + v3d->prog.gs ? v3d->prog.gs->prog_data.gs->uses_pid : + false; +#endif + #if V3D_VERSION >= 40 shader.do_scoreboard_wait_on_first_thread_switch = v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw; @@ -434,8 +819,15 @@ shader.fragment_shader_uniforms_address = fs_uniforms; #if V3D_VERSION >= 41 - shader.min_coord_shader_input_segments_required_in_play = 1; - shader.min_vertex_shader_input_segments_required_in_play = 1; + shader.min_coord_shader_input_segments_required_in_play = + vpm_cfg_bin.As; + shader.min_vertex_shader_input_segments_required_in_play = + vpm_cfg.As; + + shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg_bin.Ve; + shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = + vpm_cfg.Ve; shader.coordinate_shader_4_way_threadable = v3d->prog.cs->prog_data.vs->base.threads == 4; @@ -539,34 +931,54 @@ } cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) { - vcm.number_of_16_vertex_batches_for_binning = - v3d->prog.cs->prog_data.vs->vcm_cache_size; - vcm.number_of_16_vertex_batches_for_rendering = - v3d->prog.vs->prog_data.vs->vcm_cache_size; + vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc; + vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc; } +#if V3D_VERSION >= 41 + if (v3d->prog.gs) { + cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) { + state.address = cl_address(job->indirect.bo, + shader_rec_offset); + state.number_of_attribute_arrays = num_elements_to_emit; + } + } else { + cl_emit(&job->bcl, GL_SHADER_STATE, state) { + state.address = cl_address(job->indirect.bo, + shader_rec_offset); + state.number_of_attribute_arrays = num_elements_to_emit; + } + } +#else + assert(!v3d->prog.gs); cl_emit(&job->bcl, GL_SHADER_STATE, state) { state.address = cl_address(job->indirect.bo, shader_rec_offset); state.number_of_attribute_arrays = num_elements_to_emit; } +#endif v3d_bo_unreference(&cs_uniforms.bo); v3d_bo_unreference(&vs_uniforms.bo); + if (gs_uniforms.bo) + v3d_bo_unreference(&gs_uniforms.bo); + if (gs_bin_uniforms.bo) + v3d_bo_unreference(&gs_bin_uniforms.bo); v3d_bo_unreference(&fs_uniforms.bo); - - job->shader_rec_count++; } /** - * Updates the number of primitvies generated from the number of vertices - * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because - * using the GPU packet for this might require sync waits and this is trivial - * to handle in the CPU instead. + * Updates the number of primitives generated from the number of vertices + * to draw. This only works when no GS is present, since otherwise the number + * of primitives generated cannot be determined in advance and we need to + * use the PRIMITIVE_COUNTS_FEEDBACK command instead, however, that requires + * a sync wait for the draw to complete, so we only use that when GS is present. */ static void v3d_update_primitives_generated_counter(struct v3d_context *v3d, const struct pipe_draw_info *info) { + assert(!v3d->prog.gs); + if (!v3d->active_queries) return; @@ -618,6 +1030,57 @@ job->first_ez_state = job->ez_state; } +static uint32_t +v3d_hw_prim_type(enum pipe_prim_type prim_type) +{ + switch (prim_type) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_LOOP: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + return prim_type; + + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY); + + default: + unreachable("Unsupported primitive type"); + } +} + +static bool +v3d_check_compiled_shaders(struct v3d_context *v3d) +{ + static bool warned[5] = { 0 }; + + uint32_t failed_stage = MESA_SHADER_NONE; + if (!v3d->prog.vs->resource || !v3d->prog.cs->resource) { + failed_stage = MESA_SHADER_VERTEX; + } else if ((v3d->prog.gs_bin && !v3d->prog.gs_bin->resource) || + (v3d->prog.gs && !v3d->prog.gs->resource)) { + failed_stage = MESA_SHADER_GEOMETRY; + } else if (v3d->prog.fs && !v3d->prog.fs->resource) { + failed_stage = MESA_SHADER_FRAGMENT; + } + + if (likely(failed_stage == MESA_SHADER_NONE)) + return true; + + if (!warned[failed_stage]) { + fprintf(stderr, + "%s shader failed to compile. Expect corruption.\n", + _mesa_shader_stage_to_string(failed_stage)); + warned[failed_stage] = true; + } + return false; +} + static void v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) { @@ -648,7 +1111,7 @@ } } - if (info->mode >= PIPE_PRIM_QUADS) { + if (info->mode >= PIPE_PRIM_QUADS && info->mode <= PIPE_PRIM_POLYGON) { util_primconvert_save_rasterizer_state(v3d->primconvert, &v3d->rasterizer->base); util_primconvert_draw_vbo(v3d->primconvert, info); perf_debug("Fallback conversion for %d %s vertices\n", @@ -664,7 +1127,7 @@ if (info->indirect) { v3d_flush_jobs_writing_resource(v3d, info->indirect->buffer, - V3D_FLUSH_DEFAULT); + V3D_FLUSH_DEFAULT, false); } v3d_predraw_check_outputs(pctx); @@ -677,7 +1140,7 @@ */ if (v3d->streamout.num_targets > 0 && u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) { - v3d_tf_update_counters(v3d); + v3d_update_primitive_counters(v3d); } struct v3d_job *job = v3d_get_job_for_fbo(v3d); @@ -696,8 +1159,16 @@ job->submit.in_sync_bcl = v3d->out_sync; } - /* Mark SSBOs as being written. We don't actually know which ones are - * read vs written, so just assume the worst + /* We also need to ensure that compute is complete when render depends + * on resources written by it. + */ + if (v3d->sync_on_last_compute_job) { + job->submit.in_sync_bcl = v3d->out_sync; + v3d->sync_on_last_compute_job = false; + } + + /* Mark SSBOs and images as being written. We don't actually know + * which ones are read vs written, so just assume the worst. */ for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) { foreach_bit(i, v3d->ssbo[s].enabled_mask) { @@ -725,6 +1196,8 @@ v3d_start_draw(v3d); v3d_update_compiled_shaders(v3d, info->mode); + if (!v3d_check_compiled_shaders(v3d)) + return; v3d_update_job_ez(v3d, job); /* If this job was writing to transform feedback buffers before this @@ -749,9 +1222,15 @@ VC5_DIRTY_RASTERIZER | VC5_DIRTY_COMPILED_CS | VC5_DIRTY_COMPILED_VS | + VC5_DIRTY_COMPILED_GS_BIN | + VC5_DIRTY_COMPILED_GS | VC5_DIRTY_COMPILED_FS | v3d->prog.cs->uniform_dirty_bits | v3d->prog.vs->uniform_dirty_bits | + (v3d->prog.gs_bin ? + v3d->prog.gs_bin->uniform_dirty_bits : 0) | + (v3d->prog.gs ? + v3d->prog.gs->uniform_dirty_bits : 0) | v3d->prog.fs->uniform_dirty_bits)) { v3d_emit_gl_shader_state(v3d, info); } @@ -777,11 +1256,10 @@ prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS); #endif - v3d_update_primitives_generated_counter(v3d, info); + if (!v3d->prog.gs) + v3d_update_primitives_generated_counter(v3d, info); - /* Note that the primitive type fields match with OpenGL/gallium - * definitions, up to but not including QUADS. - */ + uint32_t hw_prim_type = v3d_hw_prim_type(info->mode); if (info->index_size) { uint32_t index_size = info->index_size; uint32_t offset = info->start * index_size; @@ -811,7 +1289,7 @@ prim.address_of_indices_list = cl_address(rsc->bo, offset); #endif /* V3D_VERSION < 40 */ - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.enable_primitive_restarts = info->primitive_restart; prim.number_of_draw_indirect_indexed_records = info->indirect->draw_count; @@ -830,7 +1308,7 @@ prim.address_of_indices_list = cl_address(rsc->bo, offset); #endif /* V3D_VERSION < 40 */ - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.enable_primitive_restarts = info->primitive_restart; prim.number_of_instances = info->instance_count; @@ -847,19 +1325,17 @@ prim.address_of_indices_list = cl_address(rsc->bo, offset); #endif /* V3D_VERSION < 40 */ - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.enable_primitive_restarts = info->primitive_restart; } } - job->draw_calls_queued++; - if (info->has_user_indices) pipe_resource_reference(&prsc, NULL); } else { if (info->indirect) { cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) { - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.number_of_draw_indirect_array_records = info->indirect->draw_count; prim.stride_in_multiples_of_4_bytes = info->indirect->stride >> 2; @@ -873,7 +1349,7 @@ v3d_stream_output_target_get_vertex_count(so) : info->count; cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) { - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.index_of_first_vertex = info->start; prim.number_of_instances = info->instance_count; prim.instance_length = vert_count; @@ -885,7 +1361,7 @@ v3d_stream_output_target_get_vertex_count(so) : info->count; cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) { - prim.mode = info->mode | prim_tf_enable; + prim.mode = hw_prim_type | prim_tf_enable; prim.length = vert_count; prim.index_of_first_vertex = info->start; } @@ -899,6 +1375,8 @@ cl_emit(&job->bcl, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT, flush); job->draw_calls_queued++; + if (v3d->streamout.num_targets) + job->tf_draw_calls_queued++; /* Increment the TF offsets by how many verts we wrote. XXX: This * needs some clamping to the buffer size. @@ -955,6 +1433,178 @@ v3d_flush(pctx); } +#if V3D_VERSION >= 41 +#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 +#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 +/* Allow this dispatch to start while the last one is still running. */ +#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) +/* Maximum supergroup ID. 6 bits. */ +#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 +/* Batches per supergroup minus 1. 8 bits. */ +#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 +/* Workgroups per supergroup, 0 means 16 */ +#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 +#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 + +#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) +#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) +#define V3D_CSD_CFG5_THREADING (1 << 0) + +static void +v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) +{ + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_screen *screen = v3d->screen; + + v3d_predraw_check_stage_inputs(pctx, PIPE_SHADER_COMPUTE); + + v3d_update_compiled_cs(v3d); + + if (!v3d->prog.compute->resource) { + static bool warned = false; + if (!warned) { + fprintf(stderr, + "Compute shader failed to compile. " + "Expect corruption.\n"); + warned = true; + } + return; + } + + /* Some of the units of scale: + * + * - Batches of 16 work items (shader invocations) that will be queued + * to the run on a QPU at once. + * + * - Workgroups composed of work items based on the shader's layout + * declaration. + * + * - Supergroups of 1-16 workgroups. There can only be 16 supergroups + * running at a time on the core, so we want to keep them large to + * keep the QPUs busy, but a whole supergroup will sync at a barrier + * so we want to keep them small if one is present. + */ + struct drm_v3d_submit_csd submit = { 0 }; + struct v3d_job *job = v3d_job_create(v3d); + + /* Set up the actual number of workgroups, synchronously mapping the + * indirect buffer if necessary to get the dimensions. + */ + if (info->indirect) { + struct pipe_transfer *transfer; + uint32_t *map = pipe_buffer_map_range(pctx, info->indirect, + info->indirect_offset, + 3 * sizeof(uint32_t), + PIPE_TRANSFER_READ, + &transfer); + memcpy(v3d->compute_num_workgroups, map, 3 * sizeof(uint32_t)); + pipe_buffer_unmap(pctx, transfer); + + if (v3d->compute_num_workgroups[0] == 0 || + v3d->compute_num_workgroups[1] == 0 || + v3d->compute_num_workgroups[2] == 0) { + /* Nothing to dispatch, so skip the draw (CSD can't + * handle 0 workgroups). + */ + return; + } + } else { + v3d->compute_num_workgroups[0] = info->grid[0]; + v3d->compute_num_workgroups[1] = info->grid[1]; + v3d->compute_num_workgroups[2] = info->grid[2]; + } + + for (int i = 0; i < 3; i++) { + submit.cfg[i] |= (v3d->compute_num_workgroups[i] << + V3D_CSD_CFG012_WG_COUNT_SHIFT); + } + + perf_debug("CSD only using single WG per SG currently, " + "should increase that when possible."); + int wgs_per_sg = 1; + int wg_size = info->block[0] * info->block[1] * info->block[2]; + submit.cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; + submit.cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) << + V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); + submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; + + int batches_per_wg = DIV_ROUND_UP(wg_size, 16); + /* Number of batches the dispatch will invoke (minus 1). */ + submit.cfg[4] = batches_per_wg * (v3d->compute_num_workgroups[0] * + v3d->compute_num_workgroups[1] * + v3d->compute_num_workgroups[2]) - 1; + + /* Make sure we didn't accidentally underflow. */ + assert(submit.cfg[4] != ~0); + + v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); + submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + + v3d->prog.compute->offset); + submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (v3d->prog.compute->prog_data.base->single_seg) + submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; + if (v3d->prog.compute->prog_data.base->threads == 4) + submit.cfg[5] |= V3D_CSD_CFG5_THREADING; + + if (v3d->prog.compute->prog_data.compute->shared_size) { + v3d->compute_shared_memory = + v3d_bo_alloc(v3d->screen, + v3d->prog.compute->prog_data.compute->shared_size * + wgs_per_sg, + "shared_vars"); + } + + struct v3d_cl_reloc uniforms = v3d_write_uniforms(v3d, job, + v3d->prog.compute, + PIPE_SHADER_COMPUTE); + v3d_job_add_bo(job, uniforms.bo); + submit.cfg[6] = uniforms.bo->offset + uniforms.offset; + + /* Pull some job state that was stored in a SUBMIT_CL struct out to + * our SUBMIT_CSD struct + */ + submit.bo_handles = job->submit.bo_handles; + submit.bo_handle_count = job->submit.bo_handle_count; + + /* Serialize this in the rest of our command stream. */ + submit.in_sync = v3d->out_sync; + submit.out_sync = v3d->out_sync; + + if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) { + int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD, + &submit); + static bool warned = false; + if (ret && !warned) { + fprintf(stderr, "CSD submit call returned %s. " + "Expect corruption.\n", strerror(errno)); + warned = true; + } + } + + v3d_job_free(v3d, job); + + /* Mark SSBOs as being written.. we don't actually know which ones are + * read vs written, so just assume the worst + */ + foreach_bit(i, v3d->ssbo[PIPE_SHADER_COMPUTE].enabled_mask) { + struct v3d_resource *rsc = v3d_resource( + v3d->ssbo[PIPE_SHADER_COMPUTE].sb[i].buffer); + rsc->writes++; + rsc->compute_written = true; + } + + foreach_bit(i, v3d->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) { + struct v3d_resource *rsc = v3d_resource( + v3d->shaderimg[PIPE_SHADER_COMPUTE].si[i].base.resource); + rsc->writes++; + rsc->compute_written = true; + } + + v3d_bo_unreference(&uniforms.bo); + v3d_bo_unreference(&v3d->compute_shared_memory); +} +#endif + /** * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles. */ @@ -1131,4 +1781,8 @@ pctx->clear = v3d_clear; pctx->clear_render_target = v3d_clear_render_target; pctx->clear_depth_stencil = v3d_clear_depth_stencil; +#if V3D_VERSION >= 41 + if (v3d_context(pctx)->screen->has_csd) + pctx->launch_grid = v3d_launch_grid; +#endif } diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_emit.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_emit.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_half.h" #include "v3d_context.h" #include "broadcom/common/v3d_macros.h" @@ -401,6 +401,15 @@ return emitted_any; } +static inline struct v3d_uncompiled_shader * +get_tf_shader(struct v3d_context *v3d) +{ + if (v3d->prog.bind_gs) + return v3d->prog.bind_gs; + else + return v3d->prog.bind_vs; +} + void v3dX(emit_state)(struct pipe_context *pctx) { @@ -655,6 +664,9 @@ if (v3d->dirty & VC5_DIRTY_FRAGTEX) emit_textures(v3d, &v3d->tex[PIPE_SHADER_FRAGMENT]); + if (v3d->dirty & VC5_DIRTY_GEOMTEX) + emit_textures(v3d, &v3d->tex[PIPE_SHADER_GEOMETRY]); + if (v3d->dirty & VC5_DIRTY_VERTTEX) emit_textures(v3d, &v3d->tex[PIPE_SHADER_VERTEX]); #endif @@ -692,13 +704,14 @@ VC5_DIRTY_RASTERIZER | VC5_DIRTY_PRIM_MODE)) { struct v3d_streamout_stateobj *so = &v3d->streamout; - if (so->num_targets) { bool psiz_per_vertex = (v3d->prim_mode == PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex); + struct v3d_uncompiled_shader *tf_shader = + get_tf_shader(v3d); uint16_t *tf_specs = (psiz_per_vertex ? - v3d->prog.bind_vs->tf_specs_psiz : - v3d->prog.bind_vs->tf_specs); + tf_shader->tf_specs_psiz : + tf_shader->tf_specs); #if V3D_VERSION >= 40 bool tf_enabled = v3d_transform_feedback_enabled(v3d); @@ -706,7 +719,7 @@ cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) { tfe.number_of_16_bit_output_data_specs_following = - v3d->prog.bind_vs->num_tf_specs; + tf_shader->num_tf_specs; tfe.enable = tf_enabled; }; #else /* V3D_VERSION < 40 */ @@ -714,10 +727,10 @@ tfe.number_of_32_bit_output_buffer_address_following = so->num_targets; tfe.number_of_16_bit_output_data_specs_following = - v3d->prog.bind_vs->num_tf_specs; + tf_shader->num_tf_specs; }; #endif /* V3D_VERSION < 40 */ - for (int i = 0; i < v3d->prog.bind_vs->num_tf_specs; i++) { + for (int i = 0; i < tf_shader->num_tf_specs; i++) { cl_emit_prepacked(&job->bcl, &tf_specs[i]); } } else { @@ -731,14 +744,15 @@ /* Set up the trasnform feedback buffers. */ if (v3d->dirty & VC5_DIRTY_STREAMOUT) { + struct v3d_uncompiled_shader *tf_shader = get_tf_shader(v3d); struct v3d_streamout_stateobj *so = &v3d->streamout; for (int i = 0; i < so->num_targets; i++) { const struct pipe_stream_output_target *target = so->targets[i]; struct v3d_resource *rsc = target ? v3d_resource(target->buffer) : NULL; - struct pipe_shader_state *vs = &v3d->prog.bind_vs->base; - struct pipe_stream_output_info *info = &vs->stream_output; + struct pipe_shader_state *ss = &tf_shader->base; + struct pipe_stream_output_info *info = &ss->stream_output; uint32_t offset = (v3d->streamout.offsets[i] * info->stride[i] * 4); diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_format_table.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_format_table.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_format_table.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_format_table.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "v3d_context.h" #include "broadcom/cle/v3dx_pack.h" @@ -177,8 +177,13 @@ FORMAT(ETC2_RG11_SNORM, NO, SIGNED_RG11_EAC, SWIZ_XY01, 16, 0), FORMAT(DXT1_RGB, NO, BC1, SWIZ_XYZ1, 16, 0), - FORMAT(DXT3_RGBA, NO, BC2, SWIZ_XYZ1, 16, 0), - FORMAT(DXT5_RGBA, NO, BC3, SWIZ_XYZ1, 16, 0), + FORMAT(DXT1_SRGB, NO, BC1, SWIZ_XYZ1, 16, 0), + FORMAT(DXT1_RGBA, NO, BC1, SWIZ_XYZW, 16, 0), + FORMAT(DXT1_SRGBA, NO, BC1, SWIZ_XYZW, 16, 0), + FORMAT(DXT3_RGBA, NO, BC2, SWIZ_XYZW, 16, 0), + FORMAT(DXT3_SRGBA, NO, BC2, SWIZ_XYZW, 16, 0), + FORMAT(DXT5_RGBA, NO, BC3, SWIZ_XYZW, 16, 0), + FORMAT(DXT5_SRGBA, NO, BC3, SWIZ_XYZW, 16, 0), }; const struct v3d_format * diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_rcl.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_rcl.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_rcl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_rcl.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "v3d_context.h" #include "v3d_tiling.h" #include "broadcom/common/v3d_macros.h" @@ -53,7 +53,7 @@ static void load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer, - uint32_t pipe_bit, uint32_t *loads_pending) + int layer, uint32_t pipe_bit, uint32_t *loads_pending) { struct v3d_surface *surf = v3d_surface(psurf); bool separate_stencil = surf->separate_stencil && buffer == STENCIL; @@ -64,9 +64,12 @@ struct v3d_resource *rsc = v3d_resource(psurf->texture); + uint32_t layer_offset = + v3d_layer_offset(&rsc->base, psurf->u.tex.level, + psurf->u.tex.first_layer + layer); cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { load.buffer_to_load = buffer; - load.address = cl_address(rsc->bo, surf->offset); + load.address = cl_address(rsc->bo, layer_offset); #if V3D_VERSION >= 40 load.memory_format = surf->tiling; @@ -109,8 +112,9 @@ static void store_general(struct v3d_job *job, - struct v3d_cl *cl, struct pipe_surface *psurf, int buffer, - int pipe_bit, uint32_t *stores_pending, bool general_color_clear) + struct v3d_cl *cl, struct pipe_surface *psurf, + int layer, int buffer, int pipe_bit, + uint32_t *stores_pending, bool general_color_clear) { struct v3d_surface *surf = v3d_surface(psurf); bool separate_stencil = surf->separate_stencil && buffer == STENCIL; @@ -126,9 +130,12 @@ rsc->writes++; + uint32_t layer_offset = + v3d_layer_offset(&rsc->base, psurf->u.tex.level, + psurf->u.tex.first_layer + layer); cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; - store.address = cl_address(rsc->bo, surf->offset); + store.address = cl_address(rsc->bo, layer_offset); #if V3D_VERSION >= 40 store.clear_buffer_being_stored = false; @@ -203,7 +210,7 @@ } static void -v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl) +v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl, int layer) { uint32_t loads_pending = job->load; @@ -218,7 +225,7 @@ continue; } - load_general(cl, psurf, RENDER_TARGET_0 + i, + load_general(cl, psurf, RENDER_TARGET_0 + i, layer, bit, &loads_pending); } @@ -230,7 +237,7 @@ if (rsc->separate_stencil && (loads_pending & PIPE_CLEAR_STENCIL)) { load_general(cl, job->zsbuf, - STENCIL, + STENCIL, layer, PIPE_CLEAR_STENCIL, &loads_pending); } @@ -238,6 +245,7 @@ if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) { load_general(cl, job->zsbuf, zs_buffer_from_pipe_bits(loads_pending), + layer, loads_pending & PIPE_CLEAR_DEPTHSTENCIL, &loads_pending); } @@ -266,7 +274,7 @@ } static void -v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl) +v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) { #if V3D_VERSION < 40 UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS; @@ -316,7 +324,7 @@ continue; } - store_general(job, cl, psurf, RENDER_TARGET_0 + i, bit, + store_general(job, cl, psurf, layer, RENDER_TARGET_0 + i, bit, &stores_pending, general_color_clear); } @@ -325,20 +333,20 @@ struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture); if (rsc->separate_stencil) { if (job->store & PIPE_CLEAR_DEPTH) { - store_general(job, cl, job->zsbuf, Z, - PIPE_CLEAR_DEPTH, + store_general(job, cl, job->zsbuf, layer, + Z, PIPE_CLEAR_DEPTH, &stores_pending, general_color_clear); } if (job->store & PIPE_CLEAR_STENCIL) { - store_general(job, cl, job->zsbuf, STENCIL, - PIPE_CLEAR_STENCIL, + store_general(job, cl, job->zsbuf, layer, + STENCIL, PIPE_CLEAR_STENCIL, &stores_pending, general_color_clear); } } else { - store_general(job, cl, job->zsbuf, + store_general(job, cl, job->zsbuf, layer, zs_buffer_from_pipe_bits(job->store), job->store & PIPE_CLEAR_DEPTHSTENCIL, &stores_pending, general_color_clear); @@ -400,7 +408,7 @@ } static void -v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf) +v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer) { /* Emit the generic list in our indirect state -- the rcl will just * have pointers into it. @@ -416,7 +424,7 @@ cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); } - v3d_rcl_emit_loads(job, cl); + v3d_rcl_emit_loads(job, cl, layer); if (V3D_VERSION < 40) { /* Tile Coordinates triggers the last reload and sets where @@ -434,7 +442,7 @@ cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - v3d_rcl_emit_stores(job, cl); + v3d_rcl_emit_stores(job, cl, layer); #if V3D_VERSION >= 40 cl_emit(cl, END_OF_TILE_MARKER, end); @@ -495,13 +503,131 @@ #define div_round_up(a, b) (((a) + (b) - 1) / b) +static void +emit_render_layer(struct v3d_job *job, uint32_t layer) +{ + uint32_t supertile_w = 1, supertile_h = 1; + + /* If doing multicore binning, we would need to initialize each + * core's tile list here. + */ + uint32_t tile_alloc_offset = + layer * job->draw_tiles_x * job->draw_tiles_y * 64; + cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { + list.address = cl_address(job->tile_alloc, tile_alloc_offset); + } + + cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { + uint32_t frame_w_in_supertiles, frame_h_in_supertiles; + const uint32_t max_supertiles = 256; + + /* Size up our supertiles until we get under the limit. */ + for (;;) { + frame_w_in_supertiles = div_round_up(job->draw_tiles_x, + supertile_w); + frame_h_in_supertiles = div_round_up(job->draw_tiles_y, + supertile_h); + if (frame_w_in_supertiles * + frame_h_in_supertiles < max_supertiles) { + break; + } + + if (supertile_w < supertile_h) + supertile_w++; + else + supertile_h++; + } + + config.number_of_bin_tile_lists = 1; + config.total_frame_width_in_tiles = job->draw_tiles_x; + config.total_frame_height_in_tiles = job->draw_tiles_y; + + config.supertile_width_in_tiles = supertile_w; + config.supertile_height_in_tiles = supertile_h; + + config.total_frame_width_in_supertiles = frame_w_in_supertiles; + config.total_frame_height_in_supertiles = frame_h_in_supertiles; + } + + /* Start by clearing the tile buffer. */ + cl_emit(&job->rcl, TILE_COORDINATES, coords) { + coords.tile_column_number = 0; + coords.tile_row_number = 0; + } + + /* Emit an initial clear of the tile buffers. This is necessary + * for any buffers that should be cleared (since clearing + * normally happens at the *end* of the generic tile list), but + * it's also nice to clear everything so the first tile doesn't + * inherit any contents from some previous frame. + * + * Also, implement the GFXH-1742 workaround. There's a race in + * the HW between the RCL updating the TLB's internal type/size + * and thespawning of the QPU instances using the TLB's current + * internal type/size. To make sure the QPUs get the right + * state, we need 1 dummy store in between internal type/size + * changes on V3D 3.x, and 2 dummy stores on 4.x. + */ +#if V3D_VERSION < 40 + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } +#else + for (int i = 0; i < 2; i++) { + if (i > 0) + cl_emit(&job->rcl, TILE_COORDINATES, coords); + cl_emit(&job->rcl, END_OF_LOADS, end); + cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { + store.buffer_to_store = NONE; + } + if (i == 0) { + cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { + clear.clear_z_stencil_buffer = true; + clear.clear_all_render_targets = true; + } + } + cl_emit(&job->rcl, END_OF_TILE_MARKER, end); + } +#endif + + cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); + + v3d_rcl_emit_generic_per_tile_list(job, layer); + + /* XXX perf: We should expose GL_MESA_tile_raster_order to + * improve X11 performance, but we should use Morton order + * otherwise to improve cache locality. + */ + uint32_t supertile_w_in_pixels = job->tile_width * supertile_w; + uint32_t supertile_h_in_pixels = job->tile_height * supertile_h; + uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels; + uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels; + + uint32_t max_x_supertile = 0; + uint32_t max_y_supertile = 0; + if (job->draw_max_x != 0 && job->draw_max_y != 0) { + max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels; + max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels; + } + + for (int y = min_y_supertile; y <= max_y_supertile; y++) { + for (int x = min_x_supertile; x <= max_x_supertile; x++) { + cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) { + coords.column_number_in_supertiles = x; + coords.row_number_in_supertiles = y; + } + } + } +} + void v3dX(emit_rcl)(struct v3d_job *job) { /* The RCL list should be empty. */ assert(!job->rcl.bo); - v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 * + v3d_cl_ensure_space_with_branch(&job->rcl, 200 + + MAX2(job->num_layers, 1) * 256 * cl_packet_length(SUPERTILE_COORDINATES)); job->submit.rcl_start = job->rcl.bo->offset; v3d_job_add_bo(job, job->rcl.bo); @@ -684,131 +810,15 @@ TILE_ALLOCATION_BLOCK_SIZE_64B; } - uint32_t supertile_w = 1, supertile_h = 1; - - /* If doing multicore binning, we would need to initialize each core's - * tile list here. - */ - cl_emit(&job->rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) { - list.address = cl_address(job->tile_alloc, 0); - } - - cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { - uint32_t frame_w_in_supertiles, frame_h_in_supertiles; - const uint32_t max_supertiles = 256; - - /* Size up our supertiles until we get under the limit. */ - for (;;) { - frame_w_in_supertiles = div_round_up(job->draw_tiles_x, - supertile_w); - frame_h_in_supertiles = div_round_up(job->draw_tiles_y, - supertile_h); - if (frame_w_in_supertiles * frame_h_in_supertiles < - max_supertiles) { - break; - } - - if (supertile_w < supertile_h) - supertile_w++; - else - supertile_h++; - } - - config.number_of_bin_tile_lists = 1; - config.total_frame_width_in_tiles = job->draw_tiles_x; - config.total_frame_height_in_tiles = job->draw_tiles_y; - - config.supertile_width_in_tiles = supertile_w; - config.supertile_height_in_tiles = supertile_h; - - config.total_frame_width_in_supertiles = frame_w_in_supertiles; - config.total_frame_height_in_supertiles = frame_h_in_supertiles; - } - - /* Start by clearing the tile buffer. */ - cl_emit(&job->rcl, TILE_COORDINATES, coords) { - coords.tile_column_number = 0; - coords.tile_row_number = 0; - } - - /* Emit an initial clear of the tile buffers. This is necessary for - * any buffers that should be cleared (since clearing normally happens - * at the *end* of the generic tile list), but it's also nice to clear - * everything so the first tile doesn't inherit any contents from some - * previous frame. - * - * Also, implement the GFXH-1742 workaround. There's a race in the HW - * between the RCL updating the TLB's internal type/size and the - * spawning of the QPU instances using the TLB's current internal - * type/size. To make sure the QPUs get the right state,, we need 1 - * dummy store in between internal type/size changes on V3D 3.x, and 2 - * dummy stores on 4.x. - */ -#if V3D_VERSION < 40 - cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } -#else - for (int i = 0; i < 2; i++) { - if (i > 0) - cl_emit(&job->rcl, TILE_COORDINATES, coords); - cl_emit(&job->rcl, END_OF_LOADS, end); - cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { - store.buffer_to_store = NONE; - } - if (i == 0) { - cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { - clear.clear_z_stencil_buffer = true; - clear.clear_all_render_targets = true; - } - } - cl_emit(&job->rcl, END_OF_TILE_MARKER, end); - } -#endif - - cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); - - v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1); - - /* XXX perf: We should expose GL_MESA_tile_raster_order to improve X11 - * performance, but we should use Morton order otherwise to improve - * cache locality. - */ - uint32_t supertile_w_in_pixels = job->tile_width * supertile_w; - uint32_t supertile_h_in_pixels = job->tile_height * supertile_h; - uint32_t min_x_supertile = job->draw_min_x / supertile_w_in_pixels; - uint32_t min_y_supertile = job->draw_min_y / supertile_h_in_pixels; - - uint32_t max_x_supertile = 0; - uint32_t max_y_supertile = 0; - if (job->draw_max_x != 0 && job->draw_max_y != 0) { - max_x_supertile = (job->draw_max_x - 1) / supertile_w_in_pixels; - max_y_supertile = (job->draw_max_y - 1) / supertile_h_in_pixels; - } - - for (int y = min_y_supertile; y <= max_y_supertile; y++) { - for (int x = min_x_supertile; x <= max_x_supertile; x++) { - cl_emit(&job->rcl, SUPERTILE_COORDINATES, coords) { - coords.column_number_in_supertiles = x; - coords.row_number_in_supertiles = y; - } - } - } - - if (job->tmu_dirty_rcl) { - cl_emit(&job->rcl, L1_CACHE_FLUSH_CONTROL, flush) { - flush.tmu_config_cache_clear = 0xf; - flush.tmu_data_cache_clear = 0xf; - flush.uniforms_cache_clear = 0xf; - flush.instruction_cache_clear = 0xf; - } - - cl_emit(&job->rcl, L2T_CACHE_FLUSH_CONTROL, flush) { - flush.l2t_flush_mode = L2T_FLUSH_MODE_CLEAN; - flush.l2t_flush_start = cl_address(NULL, 0); - flush.l2t_flush_end = cl_address(NULL, ~0); - } - } + /* ARB_framebuffer_no_attachments allows rendering to happen even when + * the framebuffer has no attachments, the idea being that fragment + * shaders can still do image load/store, ssbo, etc without having to + * write to actual attachments, so always run at least one iteration + * of the loop. + */ + assert(job->num_layers > 0 || (job->load == 0 && job->store == 0)); + for (int layer = 0; layer < MAX2(1, job->num_layers); layer++) + emit_render_layer(job, layer); cl_emit(&job->rcl, END_OF_RENDERING, end); } diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_simulator.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_simulator.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_simulator.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_simulator.c 2020-06-12 01:21:17.000000000 +0000 @@ -85,6 +85,29 @@ (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); } +/* Flushes dirty texture cachelines from the L1 write combiner */ +static void +v3d_flush_l1td(struct v3d_hw *v3d) +{ + V3D_WRITE(V3D_CTL_0_L2TCACTL, + V3D_CTL_0_L2TCACTL_TMUWCF_SET); + + assert(!(V3D_READ(V3D_CTL_0_L2TCACTL) & V3D_CTL_0_L2TCACTL_L2TFLS_SET)); +} + +/* Flushes dirty texture L2 cachelines */ +static void +v3d_flush_l2t(struct v3d_hw *v3d) +{ + V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); + V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); + V3D_WRITE(V3D_CTL_0_L2TCACTL, + V3D_CTL_0_L2TCACTL_L2TFLS_SET | + (2 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); + + assert(!(V3D_READ(V3D_CTL_0_L2TCACTL) & V3D_CTL_0_L2TCACTL_L2TFLS_SET)); +} + /* Invalidates the slice caches. These are read-only caches. */ static void v3d_invalidate_slices(struct v3d_hw *v3d) @@ -116,6 +139,13 @@ } } +static UNUSED void +v3d_flush_caches(struct v3d_hw *v3d) +{ + v3d_flush_l1td(v3d); + v3d_flush_l2t(v3d); +} + int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_tfu *args) @@ -142,6 +172,38 @@ return 0; } +#if V3D_VERSION >= 41 +int +v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, + struct drm_v3d_submit_csd *args, + uint32_t gmp_ofs) +{ + g_gmp_ofs = gmp_ofs; + v3d_reload_gmp(v3d); + + v3d_invalidate_caches(v3d); + + V3D_WRITE(V3D_CSD_0_QUEUED_CFG1, args->cfg[1]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG2, args->cfg[2]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG3, args->cfg[3]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); + V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); + /* CFG0 kicks off the job */ + V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); + + while (V3D_READ(V3D_CSD_0_STATUS) & + (V3D_CSD_0_STATUS_HAVE_CURRENT_DISPATCH_SET | + V3D_CSD_0_STATUS_HAVE_QUEUED_DISPATCH_SET)) { + v3d_hw_tick(v3d); + } + + v3d_flush_caches(v3d); + + return 0; +} +#endif + int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, struct drm_v3d_get_param *args) @@ -160,6 +222,12 @@ case DRM_V3D_PARAM_SUPPORTS_TFU: args->value = 1; return 0; + case DRM_V3D_PARAM_SUPPORTS_CSD: + args->value = V3D_VERSION >= 41; + return 0; + case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: + args->value = 1; + return 0; } if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) { diff -Nru mesa-19.2.8/src/gallium/drivers/v3d/v3dx_state.c mesa-20.0.8/src/gallium/drivers/v3d/v3dx_state.c --- mesa-19.2.8/src/gallium/drivers/v3d/v3dx_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/v3d/v3dx_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ */ #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -778,9 +778,15 @@ case PIPE_SHADER_VERTEX: v3d->dirty |= VC5_DIRTY_VERTTEX; break; + case PIPE_SHADER_GEOMETRY: + v3d->dirty |= VC5_DIRTY_GEOMTEX; + break; case PIPE_SHADER_FRAGMENT: v3d->dirty |= VC5_DIRTY_FRAGTEX; break; + case PIPE_SHADER_COMPUTE: + v3d->dirty |= VC5_DIRTY_COMPTEX; + break; default: unreachable("Unsupported shader stage"); } @@ -1228,7 +1234,7 @@ * draw we need to do it here as well. */ if (num_targets == 0 && so->num_targets > 0) - v3d_tf_update_counters(ctx); + v3d_update_primitive_counters(ctx); for (i = 0; i < num_targets; i++) { if (offsets[i] != -1) @@ -1244,7 +1250,8 @@ /* Create primitive counters BO if needed */ if (num_targets > 0 && !ctx->prim_counts) { - uint32_t zeroes[7] = { 0 }; /* Init all 7 counters to 0 */ + /* Init all 7 counters and 1 padding to 0 */ + uint32_t zeroes[8] = { 0 }; u_upload_data(ctx->uploader, 0, sizeof(zeroes), 32, zeroes, &ctx->prim_counts_offset, diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_blit.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_blit.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "util/u_blitter.h" #include "compiler/nir/nir_builder.h" @@ -360,7 +360,7 @@ util_blitter_unset_running_flag(vc4->blitter); return false; } - dst_surf->width /= 2; + dst_surf->width = align(dst_surf->width, 8) / 2; if (dst->cpp == 1) dst_surf->height /= 2; diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_bufmgr.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_bufmgr.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_bufmgr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_bufmgr.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,14 +36,6 @@ #include "vc4_context.h" #include "vc4_screen.h" -#ifdef HAVE_VALGRIND -#include -#include -#define VG(x) x -#else -#define VG(x) -#endif - static bool dump_stats = false; static void @@ -92,7 +84,7 @@ fprintf(stderr, " BOs cached: %d\n", cache->bo_count); fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 1024); - if (!list_empty(&cache->time_list)) { + if (!list_is_empty(&cache->time_list)) { struct vc4_bo *first = LIST_ENTRY(struct vc4_bo, cache->time_list.next, time_list); @@ -263,7 +255,7 @@ bo->handle = create.handle; if (ret != 0) { - if (!list_empty(&screen->bo_cache.time_list) && + if (!list_is_empty(&screen->bo_cache.time_list) && !cleared_and_retried) { cleared_and_retried = true; vc4_bo_cache_free_all(&screen->bo_cache); diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_draw.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_draw.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_draw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,8 +24,9 @@ #include "util/u_blitter.h" #include "util/u_prim.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_pack_color.h" +#include "util/u_split_draw.h" #include "util/u_upload_mgr.h" #include "indices/u_primconvert.h" @@ -448,45 +449,14 @@ while (count) { uint32_t this_count = count; - uint32_t step = count; + uint32_t step; if (needs_drawarrays_shader_state) { vc4_emit_gl_shader_state(vc4, info, extra_index_bias); } - if (count > max_verts) { - switch (info->mode) { - case PIPE_PRIM_POINTS: - this_count = step = max_verts; - break; - case PIPE_PRIM_LINES: - this_count = step = max_verts - (max_verts % 2); - break; - case PIPE_PRIM_LINE_STRIP: - this_count = max_verts; - step = max_verts - 1; - break; - case PIPE_PRIM_LINE_LOOP: - this_count = max_verts; - step = max_verts - 1; - debug_warn_once("unhandled line loop " - "looping behavior with " - ">65535 verts\n"); - break; - case PIPE_PRIM_TRIANGLES: - this_count = step = max_verts - (max_verts % 3); - break; - case PIPE_PRIM_TRIANGLE_STRIP: - this_count = max_verts; - step = max_verts - 2; - break; - default: - debug_warn_once("unhandled primitive " - "max vert count, truncating\n"); - this_count = step = max_verts; - } - } + u_split_draw(info, max_verts, &this_count, &step); cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, array) { array.primitive_mode = info->mode; diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_formats.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_formats.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_formats.c 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ * in our shader code, and this stores the table for doing so. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/macros.h" #include "vc4_context.h" diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_nir_lower_blend.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_nir_lower_blend.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_nir_lower_blend.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_nir_lower_blend.c 2020-06-12 01:21:17.000000000 +0000 @@ -39,7 +39,7 @@ * Lowers fixed-function blending to a load of the destination color and a * series of ALU operations before the store of the output. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "vc4_qir.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_nir_lower_io.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_nir_lower_io.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_nir_lower_io.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_nir_lower_io.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "vc4_qir.h" #include "compiler/nir/nir_builder.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_program.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_program.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ */ #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/crc32.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -185,7 +185,7 @@ struct qreg result) { struct qinst *last_inst = NULL; - if (!list_empty(&c->cur_block->instructions)) + if (!list_is_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; assert(result.file == QFILE_UNIF || @@ -832,7 +832,7 @@ if (!src->is_ssa) return false; - if (!list_empty(&src->ssa->if_uses)) + if (!list_is_empty(&src->ssa->if_uses)) return false; return (src->ssa->uses.next == &src->use_link && @@ -1530,7 +1530,7 @@ progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); @@ -2191,6 +2191,7 @@ .lower_ldexp = true, .lower_negate = true, .lower_rotate = true, + .lower_to_scalar = true, .max_unroll_iterations = 32, }; @@ -2257,7 +2258,8 @@ NIR_PASS_V(c->s, nir_lower_alpha_test, c->fs_key->alpha_test_func, c->fs_key->sample_alpha_to_one && - c->fs_key->msaa); + c->fs_key->msaa, + NULL); } NIR_PASS_V(c->s, vc4_nir_lower_blend, c); } @@ -2311,10 +2313,11 @@ if (c->key->ucp_enables) { if (stage == QSTAGE_FRAG) { - NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables); + NIR_PASS_V(c->s, nir_lower_clip_fs, + c->key->ucp_enables, false); } else { NIR_PASS_V(c->s, nir_lower_clip_vs, - c->key->ucp_enables, false); + c->key->ucp_enables, false, false, NULL); NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); } @@ -2331,10 +2334,25 @@ NIR_PASS_V(c->s, vc4_nir_lower_io, c); NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast); vc4_optimize_nir(c->s); + /* Do late algebraic optimization to turn add(a, neg(b)) back into + * subs, then the mandatory cleanup after algebraic. Note that it may + * produce fnegs, and if so then we need to keep running to squash + * fneg(fneg(a)). + */ + bool more_late_algebraic = true; + while (more_late_algebraic) { + more_late_algebraic = false; + NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); + NIR_PASS_V(c->s, nir_opt_constant_folding); + NIR_PASS_V(c->s, nir_copy_prop); + NIR_PASS_V(c->s, nir_opt_dce); + NIR_PASS_V(c->s, nir_opt_cse); + } + NIR_PASS_V(c->s, nir_lower_bool_to_int32); NIR_PASS_V(c->s, nir_convert_from_ssa, true); diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir.c 2020-06-12 01:21:17.000000000 +0000 @@ -738,7 +738,7 @@ qir_compile_destroy(struct vc4_compile *c) { qir_for_each_block(block, c) { - while (!list_empty(&block->instructions)) { + while (!list_is_empty(&block->instructions)) { struct qinst *qinst = list_first_entry(&block->instructions, struct qinst, link); @@ -798,7 +798,7 @@ { struct qinst *last_inst = NULL; - if (!list_empty(&c->cur_block->instructions)) + if (!list_is_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; /* We don't have any way to guess which kind of MOV is implied. */ diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir_live_variables.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir_live_variables.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir_live_variables.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir_live_variables.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,18 +34,6 @@ uint8_t channels; }; -static uint32_t -int_hash(const void *key) -{ - return _mesa_hash_data(key, sizeof(int)); -} - -static bool -int_compare(const void *key1, const void *key2) -{ - return *(const int *)key1 == *(const int *)key2; -} - static int qir_reg_to_var(struct qreg reg) { @@ -194,7 +182,7 @@ qir_setup_def_use(struct vc4_compile *c) { struct hash_table *partial_update_ht = - _mesa_hash_table_create(c, int_hash, int_compare); + _mesa_hash_table_create(c, _mesa_hash_int, _mesa_key_int_equal); int ip = 0; qir_for_each_block(block, c) { diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir_schedule.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir_schedule.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_qir_schedule.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_qir_schedule.c 2020-06-12 01:21:17.000000000 +0000 @@ -622,7 +622,7 @@ } state->time = 0; - while (!list_empty(&state->dag->heads)) { + while (!list_is_empty(&state->dag->heads)) { struct schedule_node *chosen = choose_instruction(state); struct qinst *inst = chosen->inst; diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_qpu_schedule.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_qpu_schedule.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_qpu_schedule.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_qpu_schedule.c 2020-06-12 01:21:17.000000000 +0000 @@ -874,7 +874,7 @@ { uint32_t time = 0; - while (!list_empty(&scoreboard->dag->heads)) { + while (!list_is_empty(&scoreboard->dag->heads)) { struct schedule_node *chosen = choose_instruction_to_schedule(scoreboard, schedule_list, @@ -995,7 +995,7 @@ /* Wrap each instruction in a scheduler structure. */ uint32_t next_sched_uniform = *next_uniform; - while (!list_empty(&block->qpu_inst_list)) { + while (!list_is_empty(&block->qpu_inst_list)) { struct queued_qpu_inst *inst = (struct queued_qpu_inst *)block->qpu_inst_list.next; struct schedule_node *n = rzalloc(scoreboard->dag, diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_resource.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_resource.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ #include "pipe/p_defines.h" #include "util/u_blit.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" #include "util/u_transfer_helper.h" diff -Nru mesa-19.2.8/src/gallium/drivers/vc4/vc4_screen.c mesa-20.0.8/src/gallium/drivers/vc4/vc4_screen.c --- mesa-19.2.8/src/gallium/drivers/vc4/vc4_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/vc4/vc4_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "util/u_cpu_detect.h" #include "util/u_debug.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_hash_table.h" #include "util/u_screen.h" #include "util/u_transfer_helper.h" @@ -299,8 +299,6 @@ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; default: fprintf(stderr, "unknown shader param %d\n", param); return 0; diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_buffer.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_buffer.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -71,7 +71,7 @@ * * We'll end up flushing 25 --> 70. */ - util_range_add(&trans->range, box->x, box->x + box->width); + util_range_add(transfer->resource, &trans->range, box->x, box->x + box->width); } static const struct u_resource_vtbl virgl_buffer_vtbl = diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_context.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_context.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,7 +30,7 @@ #include "pipe/p_state.h" #include "util/u_inlines.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_prim.h" #include "util/u_transfer.h" #include "util/u_helpers.h" @@ -1116,7 +1116,7 @@ struct virgl_resource *sres = virgl_resource(src); if (dres->u.b.target == PIPE_BUFFER) - util_range_add(&dres->valid_buffer_range, dstx, dstx + src_box->width); + util_range_add(&dres->u.b, &dres->valid_buffer_range, dstx, dstx + src_box->width); virgl_resource_dirty(dres, dst_level); virgl_encode_resource_copy_region(vctx, dres, diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_encode.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_encode.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_encode.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_encode.c 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ #include #include -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_math.h" #include "pipe/p_state.h" @@ -880,7 +880,12 @@ virgl_encoder_write_dword(ctx->cbuf, state->u.buf.offset / elem_size); virgl_encoder_write_dword(ctx->cbuf, (state->u.buf.offset + state->u.buf.size) / elem_size - 1); } else { - virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_layer | state->u.tex.last_layer << 16); + if (res->metadata.plane) { + debug_assert(state->u.tex.first_layer == 0 && state->u.tex.last_layer == 0); + virgl_encoder_write_dword(ctx->cbuf, res->metadata.plane); + } else { + virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_layer | state->u.tex.last_layer << 16); + } virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_level | state->u.tex.last_level << 8); } tmp = VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_R(state->swizzle_r) | @@ -1214,7 +1219,7 @@ virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_size); virgl_encoder_write_res(ctx, res); - util_range_add(&res->valid_buffer_range, buffers[i].buffer_offset, + util_range_add(&res->u.b, &res->valid_buffer_range, buffers[i].buffer_offset, buffers[i].buffer_offset + buffers[i].buffer_size); virgl_resource_dirty(res, 0); } else { @@ -1241,7 +1246,7 @@ virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_size); virgl_encoder_write_res(ctx, res); - util_range_add(&res->valid_buffer_range, buffers[i].buffer_offset, + util_range_add(&res->u.b, &res->valid_buffer_range, buffers[i].buffer_offset, buffers[i].buffer_offset + buffers[i].buffer_size); virgl_resource_dirty(res, 0); } else { @@ -1273,7 +1278,7 @@ virgl_encoder_write_res(ctx, res); if (res->u.b.target == PIPE_BUFFER) { - util_range_add(&res->valid_buffer_range, images[i].u.buf.offset, + util_range_add(&res->u.b, &res->valid_buffer_range, images[i].u.buf.offset, images[i].u.buf.offset + images[i].u.buf.size); } virgl_resource_dirty(res, images[i].u.tex.level); diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_query.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_query.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -114,7 +114,7 @@ query->result_size = (query_type == PIPE_QUERY_TIMESTAMP || query_type == PIPE_QUERY_TIME_ELAPSED) ? 8 : 4; - util_range_add(&query->buf->valid_buffer_range, 0, + util_range_add(&query->buf->u.b, &query->buf->valid_buffer_range, 0, sizeof(struct virgl_host_query_state)); virgl_resource_dirty(query->buf, 0); diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_resource.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_resource.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_resource.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -20,7 +20,7 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" @@ -446,13 +446,58 @@ } if (usage & PIPE_TRANSFER_WRITE) - util_range_add(&vres->valid_buffer_range, box->x, box->x + box->width); + util_range_add(&vres->u.b, &vres->valid_buffer_range, box->x, box->x + box->width); } *transfer = &trans->base; return map_addr; } +static void virgl_resource_layout(struct pipe_resource *pt, + struct virgl_resource_metadata *metadata, + uint32_t plane, + uint32_t winsys_stride, + uint32_t plane_offset, + uint32_t modifier) +{ + unsigned level, nblocksy; + unsigned width = pt->width0; + unsigned height = pt->height0; + unsigned depth = pt->depth0; + unsigned buffer_size = 0; + + for (level = 0; level <= pt->last_level; level++) { + unsigned slices; + + if (pt->target == PIPE_TEXTURE_CUBE) + slices = 6; + else if (pt->target == PIPE_TEXTURE_3D) + slices = depth; + else + slices = pt->array_size; + + nblocksy = util_format_get_nblocksy(pt->format, height); + metadata->stride[level] = winsys_stride ? winsys_stride : + util_format_get_stride(pt->format, width); + metadata->layer_stride[level] = nblocksy * metadata->stride[level]; + metadata->level_offset[level] = buffer_size; + + buffer_size += slices * metadata->layer_stride[level]; + + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); + } + + metadata->plane = plane; + metadata->plane_offset = plane_offset; + metadata->modifier = modifier; + if (pt->nr_samples <= 1) + metadata->total_size = buffer_size; + else /* don't create guest backing store for MSAA */ + metadata->total_size = 0; +} + static struct pipe_resource *virgl_resource_create(struct pipe_screen *screen, const struct pipe_resource *templ) { @@ -464,7 +509,7 @@ res->u.b.screen = &vs->base; pipe_reference_init(&res->u.b.reference, 1); vbind = pipe_to_virgl_bind(vs, templ->bind, templ->flags); - virgl_resource_layout(&res->u.b, &res->metadata); + virgl_resource_layout(&res->u.b, &res->metadata, 0, 0, 0, 0); if ((vs->caps.caps.v2.capability_bits & VIRGL_CAP_APP_TWEAK_SUPPORT) && vs->tweak_gles_emulate_bgra && @@ -507,6 +552,8 @@ struct winsys_handle *whandle, unsigned usage) { + uint32_t winsys_stride, plane_offset, plane; + uint64_t modifier; struct virgl_screen *vs = virgl_screen(screen); if (templ->target == PIPE_BUFFER) return NULL; @@ -515,9 +562,16 @@ res->u.b = *templ; res->u.b.screen = &vs->base; pipe_reference_init(&res->u.b.reference, 1); - virgl_resource_layout(&res->u.b, &res->metadata); - res->hw_res = vs->vws->resource_create_from_handle(vs->vws, whandle); + plane = winsys_stride = plane_offset = modifier = 0; + res->hw_res = vs->vws->resource_create_from_handle(vs->vws, whandle, + &plane, + &winsys_stride, + &plane_offset, + &modifier); + + virgl_resource_layout(&res->u.b, &res->metadata, plane, winsys_stride, + plane_offset, modifier); if (!res->hw_res) { FREE(res); return NULL; @@ -554,7 +608,7 @@ likely(!(virgl_debug & VIRGL_DEBUG_XFER)) && virgl_transfer_queue_extend_buffer(&vctx->queue, vbuf->hw_res, offset, size, data)) { - util_range_add(&vbuf->valid_buffer_range, offset, offset + size); + util_range_add(&vbuf->u.b, &vbuf->valid_buffer_range, offset, offset + size); return; } @@ -570,42 +624,6 @@ ctx->texture_subdata = u_default_texture_subdata; } -void virgl_resource_layout(struct pipe_resource *pt, - struct virgl_resource_metadata *metadata) -{ - unsigned level, nblocksy; - unsigned width = pt->width0; - unsigned height = pt->height0; - unsigned depth = pt->depth0; - unsigned buffer_size = 0; - - for (level = 0; level <= pt->last_level; level++) { - unsigned slices; - - if (pt->target == PIPE_TEXTURE_CUBE) - slices = 6; - else if (pt->target == PIPE_TEXTURE_3D) - slices = depth; - else - slices = pt->array_size; - - nblocksy = util_format_get_nblocksy(pt->format, height); - metadata->stride[level] = util_format_get_stride(pt->format, width); - metadata->layer_stride[level] = nblocksy * metadata->stride[level]; - metadata->level_offset[level] = buffer_size; - - buffer_size += slices * metadata->layer_stride[level]; - - width = u_minify(width, 1); - height = u_minify(height, 1); - depth = u_minify(depth, 1); - } - - if (pt->nr_samples <= 1) - metadata->total_size = buffer_size; - else /* don't create guest backing store for MSAA */ - metadata->total_size = 0; -} struct virgl_transfer * virgl_resource_create_transfer(struct virgl_context *vctx, @@ -620,7 +638,7 @@ const unsigned blocksy = box->y / util_format_get_blockheight(format); const unsigned blocksx = box->x / util_format_get_blockwidth(format); - unsigned offset = metadata->level_offset[level]; + unsigned offset = metadata->plane_offset + metadata->level_offset[level]; if (pres->target == PIPE_TEXTURE_CUBE || pres->target == PIPE_TEXTURE_CUBE_ARRAY || pres->target == PIPE_TEXTURE_3D || diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_resource.h mesa-20.0.8/src/gallium/drivers/virgl/virgl_resource.h --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_resource.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -42,7 +42,8 @@ unsigned long level_offset[VR_MAX_TEXTURE_2D_LEVELS]; unsigned stride[VR_MAX_TEXTURE_2D_LEVELS]; unsigned layer_stride[VR_MAX_TEXTURE_2D_LEVELS]; - uint32_t total_size; + uint32_t plane, plane_offset, total_size; + uint64_t modifier; }; struct virgl_resource { @@ -155,9 +156,6 @@ const struct pipe_box *box, struct pipe_transfer **transfer); -void virgl_resource_layout(struct pipe_resource *pt, - struct virgl_resource_metadata *metadata); - struct virgl_transfer * virgl_resource_create_transfer(struct virgl_context *vctx, struct pipe_resource *pres, diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_screen.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_screen.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,8 +21,8 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "util/u_memory.h" -#include "util/u_format.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format.h" +#include "util/format/u_format_s3tc.h" #include "util/u_screen.h" #include "util/u_video.h" #include "util/u_math.h" @@ -463,8 +463,6 @@ case PIPE_SHADER_CAP_INT64_ATOMICS: case PIPE_SHADER_CAP_FP16: return 0; - case PIPE_SHADER_CAP_SCALAR_ISA: - return 1; default: return 0; } @@ -661,6 +659,9 @@ if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) return false; + if (!util_is_power_of_two_or_zero(sample_count)) + return false; + assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D || target == PIPE_TEXTURE_1D_ARRAY || diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_streamout.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_streamout.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_streamout.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_streamout.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,7 +50,7 @@ t->handle = handle; res->bind_history |= PIPE_BIND_STREAM_OUTPUT; - util_range_add(&res->valid_buffer_range, buffer_offset, + util_range_add(&res->u.b, &res->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); virgl_resource_dirty(res, 0); diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_texture.c mesa-20.0.8/src/gallium/drivers/virgl/virgl_texture.c --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_texture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_texture.c 2020-06-12 01:21:17.000000000 +0000 @@ -20,7 +20,7 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/drivers/virgl/virgl_winsys.h mesa-20.0.8/src/gallium/drivers/virgl/virgl_winsys.h --- mesa-19.2.8/src/gallium/drivers/virgl/virgl_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/virgl/virgl_winsys.h 2020-06-12 01:21:17.000000000 +0000 @@ -80,7 +80,11 @@ struct virgl_hw_res *res); struct virgl_hw_res *(*resource_create_from_handle)(struct virgl_winsys *vws, - struct winsys_handle *whandle); + struct winsys_handle *whandle, + uint32_t *plane, + uint32_t *stride, + uint32_t *plane_offset, + uint64_t *modifier); boolean (*resource_get_handle)(struct virgl_winsys *vws, struct virgl_hw_res *res, uint32_t stride, diff -Nru mesa-19.2.8/src/gallium/drivers/zink/meson.build mesa-20.0.8/src/gallium/drivers/zink/meson.build --- mesa-19.2.8/src/gallium/drivers/zink/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,53 @@ +# Copyright © 2018 Collabora Ltd + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +files_libzink = files( + 'nir_to_spirv/nir_to_spirv.c', + 'nir_to_spirv/spirv_builder.c', + 'zink_batch.c', + 'zink_blit.c', + 'zink_compiler.c', + 'zink_context.c', + 'zink_draw.c', + 'zink_fence.c', + 'zink_format.c', + 'zink_framebuffer.c', + 'zink_pipeline.c', + 'zink_program.c', + 'zink_query.c', + 'zink_render_pass.c', + 'zink_resource.c', + 'zink_screen.c', + 'zink_state.c', + 'zink_surface.c', +) + +libzink = static_library( + 'zink', + files_libzink, + c_args : c_vis_args, + include_directories : inc_common, + dependencies: [dep_vulkan, idep_nir_headers], +) + +driver_zink = declare_dependency( + compile_args : '-DGALLIUM_ZINK', + link_with : [libzink], +) diff -Nru mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c --- mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1972 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "nir_to_spirv.h" +#include "spirv_builder.h" + +#include "nir.h" +#include "pipe/p_state.h" +#include "util/u_memory.h" +#include "util/hash_table.h" + +struct ntv_context { + struct spirv_builder builder; + + SpvId GLSL_std_450; + + gl_shader_stage stage; + + SpvId ubos[128]; + size_t num_ubos; + SpvId image_types[PIPE_MAX_SAMPLERS]; + SpvId samplers[PIPE_MAX_SAMPLERS]; + size_t num_samplers; + SpvId entry_ifaces[PIPE_MAX_SHADER_INPUTS * 4 + PIPE_MAX_SHADER_OUTPUTS * 4]; + size_t num_entry_ifaces; + + SpvId *defs; + size_t num_defs; + + SpvId *regs; + size_t num_regs; + + struct hash_table *vars; /* nir_variable -> SpvId */ + + const SpvId *block_ids; + size_t num_blocks; + bool block_started; + SpvId loop_break, loop_cont; + + SpvId front_face_var, vertex_id_var; +}; + +static SpvId +get_fvec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, float value); + +static SpvId +get_uvec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, uint32_t value); + +static SpvId +get_ivec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, int32_t value); + +static SpvId +emit_unop(struct ntv_context *ctx, SpvOp op, SpvId type, SpvId src); + +static SpvId +emit_binop(struct ntv_context *ctx, SpvOp op, SpvId type, + SpvId src0, SpvId src1); + +static SpvId +emit_triop(struct ntv_context *ctx, SpvOp op, SpvId type, + SpvId src0, SpvId src1, SpvId src2); + +static SpvId +get_bvec_type(struct ntv_context *ctx, int num_components) +{ + SpvId bool_type = spirv_builder_type_bool(&ctx->builder); + if (num_components > 1) + return spirv_builder_type_vector(&ctx->builder, bool_type, + num_components); + + assert(num_components == 1); + return bool_type; +} + +static SpvId +block_label(struct ntv_context *ctx, nir_block *block) +{ + assert(block->index < ctx->num_blocks); + return ctx->block_ids[block->index]; +} + +static SpvId +emit_float_const(struct ntv_context *ctx, int bit_size, float value) +{ + assert(bit_size == 32); + return spirv_builder_const_float(&ctx->builder, bit_size, value); +} + +static SpvId +emit_uint_const(struct ntv_context *ctx, int bit_size, uint32_t value) +{ + assert(bit_size == 32); + return spirv_builder_const_uint(&ctx->builder, bit_size, value); +} + +static SpvId +emit_int_const(struct ntv_context *ctx, int bit_size, int32_t value) +{ + assert(bit_size == 32); + return spirv_builder_const_int(&ctx->builder, bit_size, value); +} + +static SpvId +get_fvec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_components) +{ + assert(bit_size == 32); // only 32-bit floats supported so far + + SpvId float_type = spirv_builder_type_float(&ctx->builder, bit_size); + if (num_components > 1) + return spirv_builder_type_vector(&ctx->builder, float_type, + num_components); + + assert(num_components == 1); + return float_type; +} + +static SpvId +get_ivec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_components) +{ + assert(bit_size == 1 || bit_size == 32); // only 32-bit ints supported so far + + SpvId int_type = spirv_builder_type_int(&ctx->builder, MAX2(bit_size, 32)); + if (num_components > 1) + return spirv_builder_type_vector(&ctx->builder, int_type, + num_components); + + assert(num_components == 1); + return int_type; +} + +static SpvId +get_uvec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_components) +{ + assert(bit_size == 1 || bit_size == 32); // only 32-bit uints supported so far + + SpvId uint_type = spirv_builder_type_uint(&ctx->builder, MAX2(bit_size, 32)); + if (num_components > 1) + return spirv_builder_type_vector(&ctx->builder, uint_type, + num_components); + + assert(num_components == 1); + return uint_type; +} + +static SpvId +get_dest_uvec_type(struct ntv_context *ctx, nir_dest *dest) +{ + return get_uvec_type(ctx, nir_dest_bit_size(*dest), + nir_dest_num_components(*dest)); +} + +static SpvId +get_glsl_basetype(struct ntv_context *ctx, enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_BOOL: + return spirv_builder_type_bool(&ctx->builder); + + case GLSL_TYPE_FLOAT: + return spirv_builder_type_float(&ctx->builder, 32); + + case GLSL_TYPE_INT: + return spirv_builder_type_int(&ctx->builder, 32); + + case GLSL_TYPE_UINT: + return spirv_builder_type_uint(&ctx->builder, 32); + /* TODO: handle more types */ + + default: + unreachable("unknown GLSL type"); + } +} + +static SpvId +get_glsl_type(struct ntv_context *ctx, const struct glsl_type *type) +{ + assert(type); + if (glsl_type_is_scalar(type)) + return get_glsl_basetype(ctx, glsl_get_base_type(type)); + + if (glsl_type_is_vector(type)) + return spirv_builder_type_vector(&ctx->builder, + get_glsl_basetype(ctx, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + + if (glsl_type_is_array(type)) { + SpvId ret = spirv_builder_type_array(&ctx->builder, + get_glsl_type(ctx, glsl_get_array_element(type)), + emit_uint_const(ctx, 32, glsl_get_length(type))); + uint32_t stride = glsl_get_explicit_stride(type); + if (stride) + spirv_builder_emit_array_stride(&ctx->builder, ret, stride); + return ret; + } + + + unreachable("we shouldn't get here, I think..."); +} + +static void +emit_input(struct ntv_context *ctx, struct nir_variable *var) +{ + SpvId var_type = get_glsl_type(ctx, var->type); + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassInput, + var_type); + SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, + SpvStorageClassInput); + + if (var->name) + spirv_builder_emit_name(&ctx->builder, var_id, var->name); + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + if (var->data.location >= VARYING_SLOT_VAR0) + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.location - + VARYING_SLOT_VAR0 + + VARYING_SLOT_TEX0); + else if ((var->data.location >= VARYING_SLOT_COL0 && + var->data.location <= VARYING_SLOT_TEX7) || + var->data.location == VARYING_SLOT_BFC0 || + var->data.location == VARYING_SLOT_BFC1) { + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.location); + } else { + switch (var->data.location) { + case VARYING_SLOT_POS: + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInFragCoord); + break; + + case VARYING_SLOT_PNTC: + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInPointCoord); + break; + + default: + debug_printf("unknown varying slot: %s\n", gl_varying_slot_name(var->data.location)); + unreachable("unexpected varying slot"); + } + } + } else { + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.driver_location); + } + + if (var->data.location_frac) + spirv_builder_emit_component(&ctx->builder, var_id, + var->data.location_frac); + + if (var->data.interpolation == INTERP_MODE_FLAT) + spirv_builder_emit_decoration(&ctx->builder, var_id, SpvDecorationFlat); + + _mesa_hash_table_insert(ctx->vars, var, (void *)(intptr_t)var_id); + + assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces)); + ctx->entry_ifaces[ctx->num_entry_ifaces++] = var_id; +} + +static void +emit_output(struct ntv_context *ctx, struct nir_variable *var) +{ + SpvId var_type = get_glsl_type(ctx, var->type); + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassOutput, + var_type); + SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, + SpvStorageClassOutput); + if (var->name) + spirv_builder_emit_name(&ctx->builder, var_id, var->name); + + + if (ctx->stage == MESA_SHADER_VERTEX) { + if (var->data.location >= VARYING_SLOT_VAR0) + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.location - + VARYING_SLOT_VAR0 + + VARYING_SLOT_TEX0); + else if ((var->data.location >= VARYING_SLOT_COL0 && + var->data.location <= VARYING_SLOT_TEX7) || + var->data.location == VARYING_SLOT_BFC0 || + var->data.location == VARYING_SLOT_BFC1) { + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.location); + } else { + switch (var->data.location) { + case VARYING_SLOT_POS: + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInPosition); + break; + + case VARYING_SLOT_PSIZ: + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInPointSize); + break; + + case VARYING_SLOT_CLIP_DIST0: + assert(glsl_type_is_array(var->type)); + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInClipDistance); + break; + + default: + debug_printf("unknown varying slot: %s\n", gl_varying_slot_name(var->data.location)); + unreachable("unexpected varying slot"); + } + } + } else if (ctx->stage == MESA_SHADER_FRAGMENT) { + if (var->data.location >= FRAG_RESULT_DATA0) + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.location - FRAG_RESULT_DATA0); + else { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + spirv_builder_emit_location(&ctx->builder, var_id, 0); + break; + + case FRAG_RESULT_DEPTH: + spirv_builder_emit_builtin(&ctx->builder, var_id, SpvBuiltInFragDepth); + break; + + default: + spirv_builder_emit_location(&ctx->builder, var_id, + var->data.driver_location); + } + } + } + + if (var->data.location_frac) + spirv_builder_emit_component(&ctx->builder, var_id, + var->data.location_frac); + + _mesa_hash_table_insert(ctx->vars, var, (void *)(intptr_t)var_id); + + assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces)); + ctx->entry_ifaces[ctx->num_entry_ifaces++] = var_id; +} + +static SpvDim +type_to_dim(enum glsl_sampler_dim gdim, bool *is_ms) +{ + *is_ms = false; + switch (gdim) { + case GLSL_SAMPLER_DIM_1D: + return SpvDim1D; + case GLSL_SAMPLER_DIM_2D: + return SpvDim2D; + case GLSL_SAMPLER_DIM_3D: + return SpvDim3D; + case GLSL_SAMPLER_DIM_CUBE: + return SpvDimCube; + case GLSL_SAMPLER_DIM_RECT: + return SpvDimRect; + case GLSL_SAMPLER_DIM_BUF: + return SpvDimBuffer; + case GLSL_SAMPLER_DIM_EXTERNAL: + return SpvDim2D; /* seems dodgy... */ + case GLSL_SAMPLER_DIM_MS: + *is_ms = true; + return SpvDim2D; + default: + fprintf(stderr, "unknown sampler type %d\n", gdim); + break; + } + return SpvDim2D; +} + +static void +emit_sampler(struct ntv_context *ctx, struct nir_variable *var) +{ + const struct glsl_type *type = glsl_without_array(var->type); + + bool is_ms; + SpvDim dimension = type_to_dim(glsl_get_sampler_dim(type), &is_ms); + + SpvId result_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); + SpvId image_type = spirv_builder_type_image(&ctx->builder, result_type, + dimension, false, + glsl_sampler_type_is_array(type), + is_ms, 1, + SpvImageFormatUnknown); + + SpvId sampled_type = spirv_builder_type_sampled_image(&ctx->builder, + image_type); + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassUniformConstant, + sampled_type); + + if (glsl_type_is_array(var->type)) { + for (int i = 0; i < glsl_get_length(var->type); ++i) { + SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, + SpvStorageClassUniformConstant); + + if (var->name) { + char element_name[100]; + snprintf(element_name, sizeof(element_name), "%s_%d", var->name, i); + spirv_builder_emit_name(&ctx->builder, var_id, var->name); + } + + assert(ctx->num_samplers < ARRAY_SIZE(ctx->image_types)); + ctx->image_types[ctx->num_samplers] = image_type; + + assert(ctx->num_samplers < ARRAY_SIZE(ctx->samplers)); + ctx->samplers[ctx->num_samplers++] = var_id; + + spirv_builder_emit_descriptor_set(&ctx->builder, var_id, + var->data.descriptor_set); + spirv_builder_emit_binding(&ctx->builder, var_id, var->data.binding); + } + } else { + SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, + SpvStorageClassUniformConstant); + + if (var->name) + spirv_builder_emit_name(&ctx->builder, var_id, var->name); + + assert(ctx->num_samplers < ARRAY_SIZE(ctx->image_types)); + ctx->image_types[ctx->num_samplers] = image_type; + + assert(ctx->num_samplers < ARRAY_SIZE(ctx->samplers)); + ctx->samplers[ctx->num_samplers++] = var_id; + + spirv_builder_emit_descriptor_set(&ctx->builder, var_id, + var->data.descriptor_set); + spirv_builder_emit_binding(&ctx->builder, var_id, var->data.binding); + } +} + +static void +emit_ubo(struct ntv_context *ctx, struct nir_variable *var) +{ + uint32_t size = glsl_count_attribute_slots(var->type, false); + SpvId vec4_type = get_uvec_type(ctx, 32, 4); + SpvId array_length = emit_uint_const(ctx, 32, size); + SpvId array_type = spirv_builder_type_array(&ctx->builder, vec4_type, + array_length); + spirv_builder_emit_array_stride(&ctx->builder, array_type, 16); + + // wrap UBO-array in a struct + SpvId struct_type = spirv_builder_type_struct(&ctx->builder, &array_type, 1); + if (var->name) { + char struct_name[100]; + snprintf(struct_name, sizeof(struct_name), "struct_%s", var->name); + spirv_builder_emit_name(&ctx->builder, struct_type, struct_name); + } + + spirv_builder_emit_decoration(&ctx->builder, struct_type, + SpvDecorationBlock); + spirv_builder_emit_member_offset(&ctx->builder, struct_type, 0, 0); + + + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassUniform, + struct_type); + + SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, + SpvStorageClassUniform); + if (var->name) + spirv_builder_emit_name(&ctx->builder, var_id, var->name); + + assert(ctx->num_ubos < ARRAY_SIZE(ctx->ubos)); + ctx->ubos[ctx->num_ubos++] = var_id; + + spirv_builder_emit_descriptor_set(&ctx->builder, var_id, + var->data.descriptor_set); + spirv_builder_emit_binding(&ctx->builder, var_id, var->data.binding); +} + +static void +emit_uniform(struct ntv_context *ctx, struct nir_variable *var) +{ + if (var->data.mode == nir_var_mem_ubo) + emit_ubo(ctx, var); + else { + assert(var->data.mode == nir_var_uniform); + if (glsl_type_is_sampler(glsl_without_array(var->type))) + emit_sampler(ctx, var); + } +} + +static SpvId +get_src_uint_ssa(struct ntv_context *ctx, const nir_ssa_def *ssa) +{ + assert(ssa->index < ctx->num_defs); + assert(ctx->defs[ssa->index] != 0); + return ctx->defs[ssa->index]; +} + +static SpvId +get_var_from_reg(struct ntv_context *ctx, nir_register *reg) +{ + assert(reg->index < ctx->num_regs); + assert(ctx->regs[reg->index] != 0); + return ctx->regs[reg->index]; +} + +static SpvId +get_src_uint_reg(struct ntv_context *ctx, const nir_reg_src *reg) +{ + assert(reg->reg); + assert(!reg->indirect); + assert(!reg->base_offset); + + SpvId var = get_var_from_reg(ctx, reg->reg); + SpvId type = get_uvec_type(ctx, reg->reg->bit_size, reg->reg->num_components); + return spirv_builder_emit_load(&ctx->builder, type, var); +} + +static SpvId +get_src_uint(struct ntv_context *ctx, nir_src *src) +{ + if (src->is_ssa) + return get_src_uint_ssa(ctx, src->ssa); + else + return get_src_uint_reg(ctx, &src->reg); +} + +static SpvId +get_alu_src_uint(struct ntv_context *ctx, nir_alu_instr *alu, unsigned src) +{ + assert(!alu->src[src].negate); + assert(!alu->src[src].abs); + + SpvId def = get_src_uint(ctx, &alu->src[src].src); + + unsigned used_channels = 0; + bool need_swizzle = false; + for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { + if (!nir_alu_instr_channel_used(alu, src, i)) + continue; + + used_channels++; + + if (alu->src[src].swizzle[i] != i) + need_swizzle = true; + } + assert(used_channels != 0); + + unsigned live_channels = nir_src_num_components(alu->src[src].src); + if (used_channels != live_channels) + need_swizzle = true; + + if (!need_swizzle) + return def; + + int bit_size = nir_src_bit_size(alu->src[src].src); + assert(bit_size == 1 || bit_size == 32); + + SpvId uint_type = spirv_builder_type_uint(&ctx->builder, MAX2(bit_size, 32)); + if (used_channels == 1) { + uint32_t indices[] = { alu->src[src].swizzle[0] }; + return spirv_builder_emit_composite_extract(&ctx->builder, uint_type, + def, indices, + ARRAY_SIZE(indices)); + } else if (live_channels == 1) { + SpvId uvec_type = spirv_builder_type_vector(&ctx->builder, uint_type, + used_channels); + + SpvId constituents[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < used_channels; ++i) + constituents[i] = def; + + return spirv_builder_emit_composite_construct(&ctx->builder, uvec_type, + constituents, + used_channels); + } else { + SpvId uvec_type = spirv_builder_type_vector(&ctx->builder, uint_type, + used_channels); + + uint32_t components[NIR_MAX_VEC_COMPONENTS]; + size_t num_components = 0; + for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { + if (!nir_alu_instr_channel_used(alu, src, i)) + continue; + + components[num_components++] = alu->src[src].swizzle[i]; + } + + return spirv_builder_emit_vector_shuffle(&ctx->builder, uvec_type, + def, def, components, num_components); + } +} + +static void +store_ssa_def_uint(struct ntv_context *ctx, nir_ssa_def *ssa, SpvId result) +{ + assert(result != 0); + assert(ssa->index < ctx->num_defs); + ctx->defs[ssa->index] = result; +} + +static SpvId +emit_select(struct ntv_context *ctx, SpvId type, SpvId cond, + SpvId if_true, SpvId if_false) +{ + return emit_triop(ctx, SpvOpSelect, type, cond, if_true, if_false); +} + +static SpvId +bvec_to_uvec(struct ntv_context *ctx, SpvId value, unsigned num_components) +{ + SpvId otype = get_uvec_type(ctx, 32, num_components); + SpvId zero = get_uvec_constant(ctx, 32, num_components, 0); + SpvId one = get_uvec_constant(ctx, 32, num_components, UINT32_MAX); + return emit_select(ctx, otype, value, one, zero); +} + +static SpvId +uvec_to_bvec(struct ntv_context *ctx, SpvId value, unsigned num_components) +{ + SpvId type = get_bvec_type(ctx, num_components); + SpvId zero = get_uvec_constant(ctx, 32, num_components, 0); + return emit_binop(ctx, SpvOpINotEqual, type, value, zero); +} + +static SpvId +emit_bitcast(struct ntv_context *ctx, SpvId type, SpvId value) +{ + return emit_unop(ctx, SpvOpBitcast, type, value); +} + +static SpvId +bitcast_to_uvec(struct ntv_context *ctx, SpvId value, unsigned bit_size, + unsigned num_components) +{ + SpvId type = get_uvec_type(ctx, bit_size, num_components); + return emit_bitcast(ctx, type, value); +} + +static SpvId +bitcast_to_ivec(struct ntv_context *ctx, SpvId value, unsigned bit_size, + unsigned num_components) +{ + SpvId type = get_ivec_type(ctx, bit_size, num_components); + return emit_bitcast(ctx, type, value); +} + +static SpvId +bitcast_to_fvec(struct ntv_context *ctx, SpvId value, unsigned bit_size, + unsigned num_components) +{ + SpvId type = get_fvec_type(ctx, bit_size, num_components); + return emit_bitcast(ctx, type, value); +} + +static void +store_reg_def(struct ntv_context *ctx, nir_reg_dest *reg, SpvId result) +{ + SpvId var = get_var_from_reg(ctx, reg->reg); + assert(var); + spirv_builder_emit_store(&ctx->builder, var, result); +} + +static void +store_dest_uint(struct ntv_context *ctx, nir_dest *dest, SpvId result) +{ + if (dest->is_ssa) + store_ssa_def_uint(ctx, &dest->ssa, result); + else + store_reg_def(ctx, &dest->reg, result); +} + +static void +store_dest(struct ntv_context *ctx, nir_dest *dest, SpvId result, nir_alu_type type) +{ + unsigned num_components = nir_dest_num_components(*dest); + unsigned bit_size = nir_dest_bit_size(*dest); + + switch (nir_alu_type_get_base_type(type)) { + case nir_type_bool: + assert(bit_size == 1); + result = bvec_to_uvec(ctx, result, num_components); + break; + + case nir_type_uint: + break; /* nothing to do! */ + + case nir_type_int: + case nir_type_float: + result = bitcast_to_uvec(ctx, result, bit_size, num_components); + break; + + default: + unreachable("unsupported nir_alu_type"); + } + + store_dest_uint(ctx, dest, result); +} + +static SpvId +emit_unop(struct ntv_context *ctx, SpvOp op, SpvId type, SpvId src) +{ + return spirv_builder_emit_unop(&ctx->builder, op, type, src); +} + +static SpvId +emit_binop(struct ntv_context *ctx, SpvOp op, SpvId type, + SpvId src0, SpvId src1) +{ + return spirv_builder_emit_binop(&ctx->builder, op, type, src0, src1); +} + +static SpvId +emit_triop(struct ntv_context *ctx, SpvOp op, SpvId type, + SpvId src0, SpvId src1, SpvId src2) +{ + return spirv_builder_emit_triop(&ctx->builder, op, type, src0, src1, src2); +} + +static SpvId +emit_builtin_unop(struct ntv_context *ctx, enum GLSLstd450 op, SpvId type, + SpvId src) +{ + SpvId args[] = { src }; + return spirv_builder_emit_ext_inst(&ctx->builder, type, ctx->GLSL_std_450, + op, args, ARRAY_SIZE(args)); +} + +static SpvId +emit_builtin_binop(struct ntv_context *ctx, enum GLSLstd450 op, SpvId type, + SpvId src0, SpvId src1) +{ + SpvId args[] = { src0, src1 }; + return spirv_builder_emit_ext_inst(&ctx->builder, type, ctx->GLSL_std_450, + op, args, ARRAY_SIZE(args)); +} + +static SpvId +emit_builtin_triop(struct ntv_context *ctx, enum GLSLstd450 op, SpvId type, + SpvId src0, SpvId src1, SpvId src2) +{ + SpvId args[] = { src0, src1, src2 }; + return spirv_builder_emit_ext_inst(&ctx->builder, type, ctx->GLSL_std_450, + op, args, ARRAY_SIZE(args)); +} + +static SpvId +get_fvec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, float value) +{ + assert(bit_size == 32); + + SpvId result = emit_float_const(ctx, bit_size, value); + if (num_components == 1) + return result; + + assert(num_components > 1); + SpvId components[num_components]; + for (int i = 0; i < num_components; i++) + components[i] = result; + + SpvId type = get_fvec_type(ctx, bit_size, num_components); + return spirv_builder_const_composite(&ctx->builder, type, components, + num_components); +} + +static SpvId +get_uvec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, uint32_t value) +{ + assert(bit_size == 32); + + SpvId result = emit_uint_const(ctx, bit_size, value); + if (num_components == 1) + return result; + + assert(num_components > 1); + SpvId components[num_components]; + for (int i = 0; i < num_components; i++) + components[i] = result; + + SpvId type = get_uvec_type(ctx, bit_size, num_components); + return spirv_builder_const_composite(&ctx->builder, type, components, + num_components); +} + +static SpvId +get_ivec_constant(struct ntv_context *ctx, unsigned bit_size, + unsigned num_components, int32_t value) +{ + assert(bit_size == 32); + + SpvId result = emit_int_const(ctx, bit_size, value); + if (num_components == 1) + return result; + + assert(num_components > 1); + SpvId components[num_components]; + for (int i = 0; i < num_components; i++) + components[i] = result; + + SpvId type = get_ivec_type(ctx, bit_size, num_components); + return spirv_builder_const_composite(&ctx->builder, type, components, + num_components); +} + +static inline unsigned +alu_instr_src_components(const nir_alu_instr *instr, unsigned src) +{ + if (nir_op_infos[instr->op].input_sizes[src] > 0) + return nir_op_infos[instr->op].input_sizes[src]; + + if (instr->dest.dest.is_ssa) + return instr->dest.dest.ssa.num_components; + else + return instr->dest.dest.reg.reg->num_components; +} + +static SpvId +get_alu_src(struct ntv_context *ctx, nir_alu_instr *alu, unsigned src) +{ + SpvId uint_value = get_alu_src_uint(ctx, alu, src); + + unsigned num_components = alu_instr_src_components(alu, src); + unsigned bit_size = nir_src_bit_size(alu->src[src].src); + nir_alu_type type = nir_op_infos[alu->op].input_types[src]; + + switch (nir_alu_type_get_base_type(type)) { + case nir_type_bool: + assert(bit_size == 1); + return uvec_to_bvec(ctx, uint_value, num_components); + + case nir_type_int: + return bitcast_to_ivec(ctx, uint_value, bit_size, num_components); + + case nir_type_uint: + return uint_value; + + case nir_type_float: + return bitcast_to_fvec(ctx, uint_value, bit_size, num_components); + + default: + unreachable("unknown nir_alu_type"); + } +} + +static void +store_alu_result(struct ntv_context *ctx, nir_alu_instr *alu, SpvId result) +{ + assert(!alu->dest.saturate); + return store_dest(ctx, &alu->dest.dest, result, nir_op_infos[alu->op].output_type); +} + +static SpvId +get_dest_type(struct ntv_context *ctx, nir_dest *dest, nir_alu_type type) +{ + unsigned num_components = nir_dest_num_components(*dest); + unsigned bit_size = nir_dest_bit_size(*dest); + + switch (nir_alu_type_get_base_type(type)) { + case nir_type_bool: + return get_bvec_type(ctx, num_components); + + case nir_type_int: + return get_ivec_type(ctx, bit_size, num_components); + + case nir_type_uint: + return get_uvec_type(ctx, bit_size, num_components); + + case nir_type_float: + return get_fvec_type(ctx, bit_size, num_components); + + default: + unreachable("unsupported nir_alu_type"); + } +} + +static void +emit_alu(struct ntv_context *ctx, nir_alu_instr *alu) +{ + SpvId src[nir_op_infos[alu->op].num_inputs]; + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) + src[i] = get_alu_src(ctx, alu, i); + + SpvId dest_type = get_dest_type(ctx, &alu->dest.dest, + nir_op_infos[alu->op].output_type); + unsigned bit_size = nir_dest_bit_size(alu->dest.dest); + unsigned num_components = nir_dest_num_components(alu->dest.dest); + + SpvId result = 0; + switch (alu->op) { + case nir_op_mov: + assert(nir_op_infos[alu->op].num_inputs == 1); + result = src[0]; + break; + +#define UNOP(nir_op, spirv_op) \ + case nir_op: \ + assert(nir_op_infos[alu->op].num_inputs == 1); \ + result = emit_unop(ctx, spirv_op, dest_type, src[0]); \ + break; + + UNOP(nir_op_ineg, SpvOpSNegate) + UNOP(nir_op_fneg, SpvOpFNegate) + UNOP(nir_op_fddx, SpvOpDPdx) + UNOP(nir_op_fddy, SpvOpDPdy) + UNOP(nir_op_f2i32, SpvOpConvertFToS) + UNOP(nir_op_f2u32, SpvOpConvertFToU) + UNOP(nir_op_i2f32, SpvOpConvertSToF) + UNOP(nir_op_u2f32, SpvOpConvertUToF) + UNOP(nir_op_inot, SpvOpNot) +#undef UNOP + + case nir_op_b2i32: + assert(nir_op_infos[alu->op].num_inputs == 1); + result = emit_select(ctx, dest_type, src[0], + get_ivec_constant(ctx, 32, num_components, 1), + get_ivec_constant(ctx, 32, num_components, 0)); + break; + + case nir_op_b2f32: + assert(nir_op_infos[alu->op].num_inputs == 1); + result = emit_select(ctx, dest_type, src[0], + get_fvec_constant(ctx, 32, num_components, 1), + get_fvec_constant(ctx, 32, num_components, 0)); + break; + +#define BUILTIN_UNOP(nir_op, spirv_op) \ + case nir_op: \ + assert(nir_op_infos[alu->op].num_inputs == 1); \ + result = emit_builtin_unop(ctx, spirv_op, dest_type, src[0]); \ + break; + + BUILTIN_UNOP(nir_op_iabs, GLSLstd450SAbs) + BUILTIN_UNOP(nir_op_fabs, GLSLstd450FAbs) + BUILTIN_UNOP(nir_op_fsqrt, GLSLstd450Sqrt) + BUILTIN_UNOP(nir_op_frsq, GLSLstd450InverseSqrt) + BUILTIN_UNOP(nir_op_flog2, GLSLstd450Log2) + BUILTIN_UNOP(nir_op_fexp2, GLSLstd450Exp2) + BUILTIN_UNOP(nir_op_ffract, GLSLstd450Fract) + BUILTIN_UNOP(nir_op_ffloor, GLSLstd450Floor) + BUILTIN_UNOP(nir_op_fceil, GLSLstd450Ceil) + BUILTIN_UNOP(nir_op_ftrunc, GLSLstd450Trunc) + BUILTIN_UNOP(nir_op_fround_even, GLSLstd450RoundEven) + BUILTIN_UNOP(nir_op_fsign, GLSLstd450FSign) + BUILTIN_UNOP(nir_op_fsin, GLSLstd450Sin) + BUILTIN_UNOP(nir_op_fcos, GLSLstd450Cos) +#undef BUILTIN_UNOP + + case nir_op_frcp: + assert(nir_op_infos[alu->op].num_inputs == 1); + result = emit_binop(ctx, SpvOpFDiv, dest_type, + get_fvec_constant(ctx, bit_size, num_components, 1), + src[0]); + break; + + case nir_op_f2b1: + assert(nir_op_infos[alu->op].num_inputs == 1); + result = emit_binop(ctx, SpvOpFOrdNotEqual, dest_type, src[0], + get_fvec_constant(ctx, + nir_src_bit_size(alu->src[0].src), + num_components, 0)); + break; + + +#define BINOP(nir_op, spirv_op) \ + case nir_op: \ + assert(nir_op_infos[alu->op].num_inputs == 2); \ + result = emit_binop(ctx, spirv_op, dest_type, src[0], src[1]); \ + break; + + BINOP(nir_op_iadd, SpvOpIAdd) + BINOP(nir_op_isub, SpvOpISub) + BINOP(nir_op_imul, SpvOpIMul) + BINOP(nir_op_idiv, SpvOpSDiv) + BINOP(nir_op_udiv, SpvOpUDiv) + BINOP(nir_op_umod, SpvOpUMod) + BINOP(nir_op_fadd, SpvOpFAdd) + BINOP(nir_op_fsub, SpvOpFSub) + BINOP(nir_op_fmul, SpvOpFMul) + BINOP(nir_op_fdiv, SpvOpFDiv) + BINOP(nir_op_fmod, SpvOpFMod) + BINOP(nir_op_ilt, SpvOpSLessThan) + BINOP(nir_op_ige, SpvOpSGreaterThanEqual) + BINOP(nir_op_ieq, SpvOpIEqual) + BINOP(nir_op_ine, SpvOpINotEqual) + BINOP(nir_op_uge, SpvOpUGreaterThanEqual) + BINOP(nir_op_flt, SpvOpFOrdLessThan) + BINOP(nir_op_fge, SpvOpFOrdGreaterThanEqual) + BINOP(nir_op_feq, SpvOpFOrdEqual) + BINOP(nir_op_fne, SpvOpFOrdNotEqual) + BINOP(nir_op_ishl, SpvOpShiftLeftLogical) + BINOP(nir_op_ishr, SpvOpShiftRightArithmetic) + BINOP(nir_op_ushr, SpvOpShiftRightLogical) + BINOP(nir_op_iand, SpvOpBitwiseAnd) + BINOP(nir_op_ior, SpvOpBitwiseOr) +#undef BINOP + +#define BUILTIN_BINOP(nir_op, spirv_op) \ + case nir_op: \ + assert(nir_op_infos[alu->op].num_inputs == 2); \ + result = emit_builtin_binop(ctx, spirv_op, dest_type, src[0], src[1]); \ + break; + + BUILTIN_BINOP(nir_op_fmin, GLSLstd450FMin) + BUILTIN_BINOP(nir_op_fmax, GLSLstd450FMax) +#undef BUILTIN_BINOP + + case nir_op_fdot2: + case nir_op_fdot3: + case nir_op_fdot4: + assert(nir_op_infos[alu->op].num_inputs == 2); + result = emit_binop(ctx, SpvOpDot, dest_type, src[0], src[1]); + break; + + case nir_op_seq: + case nir_op_sne: + case nir_op_slt: + case nir_op_sge: { + assert(nir_op_infos[alu->op].num_inputs == 2); + int num_components = nir_dest_num_components(alu->dest.dest); + SpvId bool_type = get_bvec_type(ctx, num_components); + + SpvId zero = emit_float_const(ctx, bit_size, 0.0f); + SpvId one = emit_float_const(ctx, bit_size, 1.0f); + if (num_components > 1) { + SpvId zero_comps[num_components], one_comps[num_components]; + for (int i = 0; i < num_components; i++) { + zero_comps[i] = zero; + one_comps[i] = one; + } + + zero = spirv_builder_const_composite(&ctx->builder, dest_type, + zero_comps, num_components); + one = spirv_builder_const_composite(&ctx->builder, dest_type, + one_comps, num_components); + } + + SpvOp op; + switch (alu->op) { + case nir_op_seq: op = SpvOpFOrdEqual; break; + case nir_op_sne: op = SpvOpFOrdNotEqual; break; + case nir_op_slt: op = SpvOpFOrdLessThan; break; + case nir_op_sge: op = SpvOpFOrdGreaterThanEqual; break; + default: unreachable("unexpected op"); + } + + result = emit_binop(ctx, op, bool_type, src[0], src[1]); + result = emit_select(ctx, dest_type, result, one, zero); + } + break; + + case nir_op_flrp: + assert(nir_op_infos[alu->op].num_inputs == 3); + result = emit_builtin_triop(ctx, GLSLstd450FMix, dest_type, + src[0], src[1], src[2]); + break; + + case nir_op_fcsel: + result = emit_binop(ctx, SpvOpFOrdGreaterThan, + get_bvec_type(ctx, num_components), + src[0], + get_fvec_constant(ctx, + nir_src_bit_size(alu->src[0].src), + num_components, 0)); + result = emit_select(ctx, dest_type, result, src[1], src[2]); + break; + + case nir_op_bcsel: + assert(nir_op_infos[alu->op].num_inputs == 3); + result = emit_select(ctx, dest_type, src[0], src[1], src[2]); + break; + + case nir_op_bany_fnequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_fnequal4: + assert(nir_op_infos[alu->op].num_inputs == 2); + assert(alu_instr_src_components(alu, 0) == + alu_instr_src_components(alu, 1)); + result = emit_binop(ctx, SpvOpFOrdNotEqual, + get_bvec_type(ctx, alu_instr_src_components(alu, 0)), + src[0], src[1]); + result = emit_unop(ctx, SpvOpAny, dest_type, result); + break; + + case nir_op_ball_fequal2: + case nir_op_ball_fequal3: + case nir_op_ball_fequal4: + assert(nir_op_infos[alu->op].num_inputs == 2); + assert(alu_instr_src_components(alu, 0) == + alu_instr_src_components(alu, 1)); + result = emit_binop(ctx, SpvOpFOrdEqual, + get_bvec_type(ctx, alu_instr_src_components(alu, 0)), + src[0], src[1]); + result = emit_unop(ctx, SpvOpAll, dest_type, result); + break; + + case nir_op_bany_inequal2: + case nir_op_bany_inequal3: + case nir_op_bany_inequal4: + assert(nir_op_infos[alu->op].num_inputs == 2); + assert(alu_instr_src_components(alu, 0) == + alu_instr_src_components(alu, 1)); + result = emit_binop(ctx, SpvOpINotEqual, + get_bvec_type(ctx, alu_instr_src_components(alu, 0)), + src[0], src[1]); + result = emit_unop(ctx, SpvOpAny, dest_type, result); + break; + + case nir_op_ball_iequal2: + case nir_op_ball_iequal3: + case nir_op_ball_iequal4: + assert(nir_op_infos[alu->op].num_inputs == 2); + assert(alu_instr_src_components(alu, 0) == + alu_instr_src_components(alu, 1)); + result = emit_binop(ctx, SpvOpIEqual, + get_bvec_type(ctx, alu_instr_src_components(alu, 0)), + src[0], src[1]); + result = emit_unop(ctx, SpvOpAll, dest_type, result); + break; + + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: { + int num_inputs = nir_op_infos[alu->op].num_inputs; + assert(2 <= num_inputs && num_inputs <= 4); + result = spirv_builder_emit_composite_construct(&ctx->builder, dest_type, + src, num_inputs); + } + break; + + default: + fprintf(stderr, "emit_alu: not implemented (%s)\n", + nir_op_infos[alu->op].name); + + unreachable("unsupported opcode"); + return; + } + + store_alu_result(ctx, alu, result); +} + +static void +emit_load_const(struct ntv_context *ctx, nir_load_const_instr *load_const) +{ + unsigned bit_size = load_const->def.bit_size; + unsigned num_components = load_const->def.num_components; + + SpvId constant; + if (num_components > 1) { + SpvId components[num_components]; + SpvId type; + if (bit_size == 1) { + for (int i = 0; i < num_components; i++) + components[i] = spirv_builder_const_bool(&ctx->builder, + load_const->value[i].b); + + type = get_bvec_type(ctx, num_components); + } else { + for (int i = 0; i < num_components; i++) + components[i] = emit_uint_const(ctx, bit_size, + load_const->value[i].u32); + + type = get_uvec_type(ctx, bit_size, num_components); + } + constant = spirv_builder_const_composite(&ctx->builder, type, + components, num_components); + } else { + assert(num_components == 1); + if (bit_size == 1) + constant = spirv_builder_const_bool(&ctx->builder, + load_const->value[0].b); + else + constant = emit_uint_const(ctx, bit_size, load_const->value[0].u32); + } + + if (bit_size == 1) + constant = bvec_to_uvec(ctx, constant, num_components); + + store_ssa_def_uint(ctx, &load_const->def, constant); +} + +static void +emit_load_ubo(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + nir_const_value *const_block_index = nir_src_as_const_value(intr->src[0]); + assert(const_block_index); // no dynamic indexing for now + assert(const_block_index->u32 == 0); // we only support the default UBO for now + + nir_const_value *const_offset = nir_src_as_const_value(intr->src[1]); + if (const_offset) { + SpvId uvec4_type = get_uvec_type(ctx, 32, 4); + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassUniform, + uvec4_type); + + unsigned idx = const_offset->u32; + SpvId member = emit_uint_const(ctx, 32, 0); + SpvId offset = emit_uint_const(ctx, 32, idx); + SpvId offsets[] = { member, offset }; + SpvId ptr = spirv_builder_emit_access_chain(&ctx->builder, pointer_type, + ctx->ubos[0], offsets, + ARRAY_SIZE(offsets)); + SpvId result = spirv_builder_emit_load(&ctx->builder, uvec4_type, ptr); + + SpvId type = get_dest_uvec_type(ctx, &intr->dest); + unsigned num_components = nir_dest_num_components(intr->dest); + if (num_components == 1) { + uint32_t components[] = { 0 }; + result = spirv_builder_emit_composite_extract(&ctx->builder, + type, + result, components, + 1); + } else if (num_components < 4) { + SpvId constituents[num_components]; + SpvId uint_type = spirv_builder_type_uint(&ctx->builder, 32); + for (uint32_t i = 0; i < num_components; ++i) + constituents[i] = spirv_builder_emit_composite_extract(&ctx->builder, + uint_type, + result, &i, + 1); + + result = spirv_builder_emit_composite_construct(&ctx->builder, + type, + constituents, + num_components); + } + + store_dest_uint(ctx, &intr->dest, result); + } else + unreachable("uniform-addressing not yet supported"); +} + +static void +emit_discard(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + assert(ctx->block_started); + spirv_builder_emit_kill(&ctx->builder); + /* discard is weird in NIR, so let's just create an unreachable block after + it and hope that the vulkan driver will DCE any instructinos in it. */ + spirv_builder_label(&ctx->builder, spirv_builder_new_id(&ctx->builder)); +} + +static void +emit_load_deref(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + /* uint is a bit of a lie here; it's really just a pointer */ + SpvId ptr = get_src_uint(ctx, intr->src); + + nir_variable *var = nir_intrinsic_get_var(intr, 0); + SpvId result = spirv_builder_emit_load(&ctx->builder, + get_glsl_type(ctx, var->type), + ptr); + unsigned num_components = nir_dest_num_components(intr->dest); + unsigned bit_size = nir_dest_bit_size(intr->dest); + result = bitcast_to_uvec(ctx, result, bit_size, num_components); + store_dest_uint(ctx, &intr->dest, result); +} + +static void +emit_store_deref(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + /* uint is a bit of a lie here; it's really just a pointer */ + SpvId ptr = get_src_uint(ctx, &intr->src[0]); + SpvId src = get_src_uint(ctx, &intr->src[1]); + + nir_variable *var = nir_intrinsic_get_var(intr, 0); + SpvId type = get_glsl_type(ctx, glsl_without_array(var->type)); + SpvId result = emit_bitcast(ctx, type, src); + spirv_builder_emit_store(&ctx->builder, ptr, result); +} + +static SpvId +create_builtin_var(struct ntv_context *ctx, SpvId var_type, + SpvStorageClass storage_class, + const char *name, SpvBuiltIn builtin) +{ + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + storage_class, + var_type); + SpvId var = spirv_builder_emit_var(&ctx->builder, pointer_type, + storage_class); + spirv_builder_emit_name(&ctx->builder, var, name); + spirv_builder_emit_builtin(&ctx->builder, var, builtin); + + assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces)); + ctx->entry_ifaces[ctx->num_entry_ifaces++] = var; + return var; +} + +static void +emit_load_front_face(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId var_type = spirv_builder_type_bool(&ctx->builder); + if (!ctx->front_face_var) + ctx->front_face_var = create_builtin_var(ctx, var_type, + SpvStorageClassInput, + "gl_FrontFacing", + SpvBuiltInFrontFacing); + + SpvId result = spirv_builder_emit_load(&ctx->builder, var_type, + ctx->front_face_var); + assert(1 == nir_dest_num_components(intr->dest)); + result = bvec_to_uvec(ctx, result, 1); + store_dest_uint(ctx, &intr->dest, result); +} + +static void +emit_load_vertex_id(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId var_type = spirv_builder_type_uint(&ctx->builder, 32); + if (!ctx->vertex_id_var) + ctx->vertex_id_var = create_builtin_var(ctx, var_type, + SpvStorageClassInput, + "gl_VertexID", + SpvBuiltInVertexIndex); + + SpvId result = spirv_builder_emit_load(&ctx->builder, var_type, + ctx->vertex_id_var); + assert(1 == nir_dest_num_components(intr->dest)); + store_dest_uint(ctx, &intr->dest, result); +} + +static void +emit_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + emit_load_ubo(ctx, intr); + break; + + case nir_intrinsic_discard: + emit_discard(ctx, intr); + break; + + case nir_intrinsic_load_deref: + emit_load_deref(ctx, intr); + break; + + case nir_intrinsic_store_deref: + emit_store_deref(ctx, intr); + break; + + case nir_intrinsic_load_front_face: + emit_load_front_face(ctx, intr); + break; + + case nir_intrinsic_load_vertex_id: + emit_load_vertex_id(ctx, intr); + break; + + default: + fprintf(stderr, "emit_intrinsic: not implemented (%s)\n", + nir_intrinsic_infos[intr->intrinsic].name); + unreachable("unsupported intrinsic"); + } +} + +static void +emit_undef(struct ntv_context *ctx, nir_ssa_undef_instr *undef) +{ + SpvId type = get_uvec_type(ctx, undef->def.bit_size, + undef->def.num_components); + + store_ssa_def_uint(ctx, &undef->def, + spirv_builder_emit_undef(&ctx->builder, type)); +} + +static SpvId +get_src_float(struct ntv_context *ctx, nir_src *src) +{ + SpvId def = get_src_uint(ctx, src); + unsigned num_components = nir_src_num_components(*src); + unsigned bit_size = nir_src_bit_size(*src); + return bitcast_to_fvec(ctx, def, bit_size, num_components); +} + +static SpvId +get_src_int(struct ntv_context *ctx, nir_src *src) +{ + SpvId def = get_src_uint(ctx, src); + unsigned num_components = nir_src_num_components(*src); + unsigned bit_size = nir_src_bit_size(*src); + return bitcast_to_ivec(ctx, def, bit_size, num_components); +} + +static void +emit_tex(struct ntv_context *ctx, nir_tex_instr *tex) +{ + assert(tex->op == nir_texop_tex || + tex->op == nir_texop_txb || + tex->op == nir_texop_txl || + tex->op == nir_texop_txd || + tex->op == nir_texop_txf || + tex->op == nir_texop_txs); + assert(tex->texture_index == tex->sampler_index); + + SpvId coord = 0, proj = 0, bias = 0, lod = 0, dref = 0, dx = 0, dy = 0, + offset = 0; + unsigned coord_components = 0; + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + if (tex->op == nir_texop_txf) + coord = get_src_int(ctx, &tex->src[i].src); + else + coord = get_src_float(ctx, &tex->src[i].src); + coord_components = nir_src_num_components(tex->src[i].src); + break; + + case nir_tex_src_projector: + assert(nir_src_num_components(tex->src[i].src) == 1); + proj = get_src_float(ctx, &tex->src[i].src); + assert(proj != 0); + break; + + case nir_tex_src_offset: + offset = get_src_int(ctx, &tex->src[i].src); + break; + + case nir_tex_src_bias: + assert(tex->op == nir_texop_txb); + bias = get_src_float(ctx, &tex->src[i].src); + assert(bias != 0); + break; + + case nir_tex_src_lod: + assert(nir_src_num_components(tex->src[i].src) == 1); + if (tex->op == nir_texop_txf || + tex->op == nir_texop_txs) + lod = get_src_int(ctx, &tex->src[i].src); + else + lod = get_src_float(ctx, &tex->src[i].src); + assert(lod != 0); + break; + + case nir_tex_src_comparator: + assert(nir_src_num_components(tex->src[i].src) == 1); + dref = get_src_float(ctx, &tex->src[i].src); + assert(dref != 0); + break; + + case nir_tex_src_ddx: + dx = get_src_float(ctx, &tex->src[i].src); + assert(dx != 0); + break; + + case nir_tex_src_ddy: + dy = get_src_float(ctx, &tex->src[i].src); + assert(dy != 0); + break; + + default: + fprintf(stderr, "texture source: %d\n", tex->src[i].src_type); + unreachable("unknown texture source"); + } + } + + if (lod == 0 && ctx->stage != MESA_SHADER_FRAGMENT) { + lod = emit_float_const(ctx, 32, 0.0f); + assert(lod != 0); + } + + SpvId image_type = ctx->image_types[tex->texture_index]; + SpvId sampled_type = spirv_builder_type_sampled_image(&ctx->builder, + image_type); + + assert(tex->texture_index < ctx->num_samplers); + SpvId load = spirv_builder_emit_load(&ctx->builder, sampled_type, + ctx->samplers[tex->texture_index]); + + SpvId dest_type = get_dest_type(ctx, &tex->dest, tex->dest_type); + + if (tex->op == nir_texop_txs) { + SpvId image = spirv_builder_emit_image(&ctx->builder, image_type, load); + SpvId result = spirv_builder_emit_image_query_size(&ctx->builder, + dest_type, image, + lod); + store_dest(ctx, &tex->dest, result, tex->dest_type); + return; + } + + if (proj && coord_components > 0) { + SpvId constituents[coord_components + 1]; + if (coord_components == 1) + constituents[0] = coord; + else { + assert(coord_components > 1); + SpvId float_type = spirv_builder_type_float(&ctx->builder, 32); + for (uint32_t i = 0; i < coord_components; ++i) + constituents[i] = spirv_builder_emit_composite_extract(&ctx->builder, + float_type, + coord, + &i, 1); + } + + constituents[coord_components++] = proj; + + SpvId vec_type = get_fvec_type(ctx, 32, coord_components); + coord = spirv_builder_emit_composite_construct(&ctx->builder, + vec_type, + constituents, + coord_components); + } + + SpvId actual_dest_type = dest_type; + if (dref) + actual_dest_type = spirv_builder_type_float(&ctx->builder, 32); + + SpvId result; + if (tex->op == nir_texop_txf) { + SpvId image = spirv_builder_emit_image(&ctx->builder, image_type, load); + result = spirv_builder_emit_image_fetch(&ctx->builder, dest_type, + image, coord, lod); + } else { + result = spirv_builder_emit_image_sample(&ctx->builder, + actual_dest_type, load, + coord, + proj != 0, + lod, bias, dref, dx, dy, + offset); + } + + spirv_builder_emit_decoration(&ctx->builder, result, + SpvDecorationRelaxedPrecision); + + if (dref && nir_dest_num_components(tex->dest) > 1) { + SpvId components[4] = { result, result, result, result }; + result = spirv_builder_emit_composite_construct(&ctx->builder, + dest_type, + components, + 4); + } + + store_dest(ctx, &tex->dest, result, tex->dest_type); +} + +static void +start_block(struct ntv_context *ctx, SpvId label) +{ + /* terminate previous block if needed */ + if (ctx->block_started) + spirv_builder_emit_branch(&ctx->builder, label); + + /* start new block */ + spirv_builder_label(&ctx->builder, label); + ctx->block_started = true; +} + +static void +branch(struct ntv_context *ctx, SpvId label) +{ + assert(ctx->block_started); + spirv_builder_emit_branch(&ctx->builder, label); + ctx->block_started = false; +} + +static void +branch_conditional(struct ntv_context *ctx, SpvId condition, SpvId then_id, + SpvId else_id) +{ + assert(ctx->block_started); + spirv_builder_emit_branch_conditional(&ctx->builder, condition, + then_id, else_id); + ctx->block_started = false; +} + +static void +emit_jump(struct ntv_context *ctx, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + assert(ctx->loop_break); + branch(ctx, ctx->loop_break); + break; + + case nir_jump_continue: + assert(ctx->loop_cont); + branch(ctx, ctx->loop_cont); + break; + + default: + unreachable("Unsupported jump type\n"); + } +} + +static void +emit_deref_var(struct ntv_context *ctx, nir_deref_instr *deref) +{ + assert(deref->deref_type == nir_deref_type_var); + + struct hash_entry *he = _mesa_hash_table_search(ctx->vars, deref->var); + assert(he); + SpvId result = (SpvId)(intptr_t)he->data; + /* uint is a bit of a lie here, it's really just an opaque type */ + store_dest_uint(ctx, &deref->dest, result); +} + +static void +emit_deref_array(struct ntv_context *ctx, nir_deref_instr *deref) +{ + assert(deref->deref_type == nir_deref_type_array); + nir_variable *var = nir_deref_instr_get_variable(deref); + + SpvStorageClass storage_class; + switch (var->data.mode) { + case nir_var_shader_in: + storage_class = SpvStorageClassInput; + break; + + case nir_var_shader_out: + storage_class = SpvStorageClassOutput; + break; + + default: + unreachable("Unsupported nir_variable_mode\n"); + } + + SpvId index = get_src_uint(ctx, &deref->arr.index); + + SpvId ptr_type = spirv_builder_type_pointer(&ctx->builder, + storage_class, + get_glsl_type(ctx, deref->type)); + + SpvId result = spirv_builder_emit_access_chain(&ctx->builder, + ptr_type, + get_src_uint(ctx, &deref->parent), + &index, 1); + /* uint is a bit of a lie here, it's really just an opaque type */ + store_dest_uint(ctx, &deref->dest, result); +} + +static void +emit_deref(struct ntv_context *ctx, nir_deref_instr *deref) +{ + switch (deref->deref_type) { + case nir_deref_type_var: + emit_deref_var(ctx, deref); + break; + + case nir_deref_type_array: + emit_deref_array(ctx, deref); + break; + + default: + unreachable("unexpected deref_type"); + } +} + +static void +emit_block(struct ntv_context *ctx, struct nir_block *block) +{ + start_block(ctx, block_label(ctx, block)); + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_ssa_undef: + emit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + unreachable("nir_instr_type_phi not supported"); + break; + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + case nir_instr_type_call: + unreachable("nir_instr_type_call not supported"); + break; + case nir_instr_type_parallel_copy: + unreachable("nir_instr_type_parallel_copy not supported"); + break; + case nir_instr_type_deref: + emit_deref(ctx, nir_instr_as_deref(instr)); + break; + } + } +} + +static void +emit_cf_list(struct ntv_context *ctx, struct exec_list *list); + +static SpvId +get_src_bool(struct ntv_context *ctx, nir_src *src) +{ + SpvId def = get_src_uint(ctx, src); + assert(nir_src_bit_size(*src) == 1); + unsigned num_components = nir_src_num_components(*src); + return uvec_to_bvec(ctx, def, num_components); +} + +static void +emit_if(struct ntv_context *ctx, nir_if *if_stmt) +{ + SpvId condition = get_src_bool(ctx, &if_stmt->condition); + + SpvId header_id = spirv_builder_new_id(&ctx->builder); + SpvId then_id = block_label(ctx, nir_if_first_then_block(if_stmt)); + SpvId endif_id = spirv_builder_new_id(&ctx->builder); + SpvId else_id = endif_id; + + bool has_else = !exec_list_is_empty(&if_stmt->else_list); + if (has_else) { + assert(nir_if_first_else_block(if_stmt)->index < ctx->num_blocks); + else_id = block_label(ctx, nir_if_first_else_block(if_stmt)); + } + + /* create a header-block */ + start_block(ctx, header_id); + spirv_builder_emit_selection_merge(&ctx->builder, endif_id, + SpvSelectionControlMaskNone); + branch_conditional(ctx, condition, then_id, else_id); + + emit_cf_list(ctx, &if_stmt->then_list); + + if (has_else) { + if (ctx->block_started) + branch(ctx, endif_id); + + emit_cf_list(ctx, &if_stmt->else_list); + } + + start_block(ctx, endif_id); +} + +static void +emit_loop(struct ntv_context *ctx, nir_loop *loop) +{ + SpvId header_id = spirv_builder_new_id(&ctx->builder); + SpvId begin_id = block_label(ctx, nir_loop_first_block(loop)); + SpvId break_id = spirv_builder_new_id(&ctx->builder); + SpvId cont_id = spirv_builder_new_id(&ctx->builder); + + /* create a header-block */ + start_block(ctx, header_id); + spirv_builder_loop_merge(&ctx->builder, break_id, cont_id, SpvLoopControlMaskNone); + branch(ctx, begin_id); + + SpvId save_break = ctx->loop_break; + SpvId save_cont = ctx->loop_cont; + ctx->loop_break = break_id; + ctx->loop_cont = cont_id; + + emit_cf_list(ctx, &loop->body); + + ctx->loop_break = save_break; + ctx->loop_cont = save_cont; + + branch(ctx, cont_id); + start_block(ctx, cont_id); + branch(ctx, header_id); + + start_block(ctx, break_id); +} + +static void +emit_cf_list(struct ntv_context *ctx, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + emit_block(ctx, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + unreachable("nir_cf_node_function not supported"); + break; + } + } +} + +struct spirv_shader * +nir_to_spirv(struct nir_shader *s) +{ + struct spirv_shader *ret = NULL; + + struct ntv_context ctx = {}; + + switch (s->info.stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_FRAGMENT: + case MESA_SHADER_COMPUTE: + spirv_builder_emit_cap(&ctx.builder, SpvCapabilityShader); + break; + + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + spirv_builder_emit_cap(&ctx.builder, SpvCapabilityTessellation); + break; + + case MESA_SHADER_GEOMETRY: + spirv_builder_emit_cap(&ctx.builder, SpvCapabilityGeometry); + break; + + default: + unreachable("invalid stage"); + } + + // TODO: only enable when needed + if (s->info.stage == MESA_SHADER_FRAGMENT) { + spirv_builder_emit_cap(&ctx.builder, SpvCapabilitySampled1D); + spirv_builder_emit_cap(&ctx.builder, SpvCapabilityImageQuery); + } + + ctx.stage = s->info.stage; + ctx.GLSL_std_450 = spirv_builder_import(&ctx.builder, "GLSL.std.450"); + spirv_builder_emit_source(&ctx.builder, SpvSourceLanguageGLSL, 450); + + spirv_builder_emit_mem_model(&ctx.builder, SpvAddressingModelLogical, + SpvMemoryModelGLSL450); + + SpvExecutionModel exec_model; + switch (s->info.stage) { + case MESA_SHADER_VERTEX: + exec_model = SpvExecutionModelVertex; + break; + case MESA_SHADER_TESS_CTRL: + exec_model = SpvExecutionModelTessellationControl; + break; + case MESA_SHADER_TESS_EVAL: + exec_model = SpvExecutionModelTessellationEvaluation; + break; + case MESA_SHADER_GEOMETRY: + exec_model = SpvExecutionModelGeometry; + break; + case MESA_SHADER_FRAGMENT: + exec_model = SpvExecutionModelFragment; + break; + case MESA_SHADER_COMPUTE: + exec_model = SpvExecutionModelGLCompute; + break; + default: + unreachable("invalid stage"); + } + + SpvId type_void = spirv_builder_type_void(&ctx.builder); + SpvId type_main = spirv_builder_type_function(&ctx.builder, type_void, + NULL, 0); + SpvId entry_point = spirv_builder_new_id(&ctx.builder); + spirv_builder_emit_name(&ctx.builder, entry_point, "main"); + + ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + nir_foreach_variable(var, &s->inputs) + emit_input(&ctx, var); + + nir_foreach_variable(var, &s->outputs) + emit_output(&ctx, var); + + nir_foreach_variable(var, &s->uniforms) + emit_uniform(&ctx, var); + + if (s->info.stage == MESA_SHADER_FRAGMENT) { + spirv_builder_emit_exec_mode(&ctx.builder, entry_point, + SpvExecutionModeOriginUpperLeft); + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + spirv_builder_emit_exec_mode(&ctx.builder, entry_point, + SpvExecutionModeDepthReplacing); + } + + + spirv_builder_function(&ctx.builder, entry_point, type_void, + SpvFunctionControlMaskNone, + type_main); + + nir_function_impl *entry = nir_shader_get_entrypoint(s); + nir_metadata_require(entry, nir_metadata_block_index); + + ctx.defs = (SpvId *)malloc(sizeof(SpvId) * entry->ssa_alloc); + if (!ctx.defs) + goto fail; + ctx.num_defs = entry->ssa_alloc; + + nir_index_local_regs(entry); + ctx.regs = malloc(sizeof(SpvId) * entry->reg_alloc); + if (!ctx.regs) + goto fail; + ctx.num_regs = entry->reg_alloc; + + SpvId *block_ids = (SpvId *)malloc(sizeof(SpvId) * entry->num_blocks); + if (!block_ids) + goto fail; + + for (int i = 0; i < entry->num_blocks; ++i) + block_ids[i] = spirv_builder_new_id(&ctx.builder); + + ctx.block_ids = block_ids; + ctx.num_blocks = entry->num_blocks; + + /* emit a block only for the variable declarations */ + start_block(&ctx, spirv_builder_new_id(&ctx.builder)); + foreach_list_typed(nir_register, reg, node, &entry->registers) { + SpvId type = get_uvec_type(&ctx, reg->bit_size, reg->num_components); + SpvId pointer_type = spirv_builder_type_pointer(&ctx.builder, + SpvStorageClassFunction, + type); + SpvId var = spirv_builder_emit_var(&ctx.builder, pointer_type, + SpvStorageClassFunction); + + ctx.regs[reg->index] = var; + } + + emit_cf_list(&ctx, &entry->body); + + free(ctx.defs); + + spirv_builder_return(&ctx.builder); // doesn't belong here, but whatevz + spirv_builder_function_end(&ctx.builder); + + spirv_builder_emit_entry_point(&ctx.builder, exec_model, entry_point, + "main", ctx.entry_ifaces, + ctx.num_entry_ifaces); + + size_t num_words = spirv_builder_get_num_words(&ctx.builder); + + ret = CALLOC_STRUCT(spirv_shader); + if (!ret) + goto fail; + + ret->words = MALLOC(sizeof(uint32_t) * num_words); + if (!ret->words) + goto fail; + + ret->num_words = spirv_builder_get_words(&ctx.builder, ret->words, num_words); + assert(ret->num_words == num_words); + + return ret; + +fail: + + if (ret) + spirv_shader_delete(ret); + + if (ctx.vars) + _mesa_hash_table_destroy(ctx.vars, NULL); + + return NULL; +} + +void +spirv_shader_delete(struct spirv_shader *s) +{ + FREE(s->words); + FREE(s); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h --- mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,44 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef NIR_TO_SPIRV_H +#define NIR_TO_SPIRV_H + +#include +#include +#include + +struct spirv_shader { + uint32_t *words; + size_t num_words; +}; + +struct nir_shader; + +struct spirv_shader * +nir_to_spirv(struct nir_shader *s); + +void +spirv_shader_delete(struct spirv_shader *s); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c --- mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1067 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "spirv_builder.h" + +#include "util/macros.h" +#include "util/u_bitcast.h" +#include "util/u_memory.h" +#include "util/hash_table.h" + +#include +#include +#include + +static bool +spirv_buffer_grow(struct spirv_buffer *b, size_t needed) +{ + size_t new_room = MAX3(64, (b->room * 3) / 2, needed); + + uint32_t *new_words = realloc(b->words, new_room * sizeof(uint32_t)); + if (!new_words) + return false; + + b->words = new_words; + b->room = new_room; + return true; +} + +static inline bool +spirv_buffer_prepare(struct spirv_buffer *b, size_t needed) +{ + needed += b->num_words; + if (b->room >= b->num_words + needed) + return true; + + return spirv_buffer_grow(b, needed); +} + +static inline void +spirv_buffer_emit_word(struct spirv_buffer *b, uint32_t word) +{ + assert(b->num_words < b->room); + b->words[b->num_words++] = word; +} + +static int +spirv_buffer_emit_string(struct spirv_buffer *b, const char *str) +{ + int pos = 0; + uint32_t word = 0; + while (str[pos] != '\0') { + word |= str[pos] << (8 * (pos % 4)); + if (++pos % 4 == 0) { + spirv_buffer_prepare(b, 1); + spirv_buffer_emit_word(b, word); + word = 0; + } + } + + spirv_buffer_prepare(b, 1); + spirv_buffer_emit_word(b, word); + + return 1 + pos / 4; +} + +void +spirv_builder_emit_cap(struct spirv_builder *b, SpvCapability cap) +{ + spirv_buffer_prepare(&b->capabilities, 2); + spirv_buffer_emit_word(&b->capabilities, SpvOpCapability | (2 << 16)); + spirv_buffer_emit_word(&b->capabilities, cap); +} + +void +spirv_builder_emit_source(struct spirv_builder *b, SpvSourceLanguage lang, + uint32_t version) +{ + spirv_buffer_prepare(&b->debug_names, 3); + spirv_buffer_emit_word(&b->debug_names, SpvOpSource | (3 << 16)); + spirv_buffer_emit_word(&b->debug_names, lang); + spirv_buffer_emit_word(&b->debug_names, version); +} + +void +spirv_builder_emit_mem_model(struct spirv_builder *b, + SpvAddressingModel addr_model, + SpvMemoryModel mem_model) +{ + spirv_buffer_prepare(&b->memory_model, 3); + spirv_buffer_emit_word(&b->memory_model, SpvOpMemoryModel | (3 << 16)); + spirv_buffer_emit_word(&b->memory_model, addr_model); + spirv_buffer_emit_word(&b->memory_model, mem_model); +} + +void +spirv_builder_emit_entry_point(struct spirv_builder *b, + SpvExecutionModel exec_model, SpvId entry_point, + const char *name, const SpvId interfaces[], + size_t num_interfaces) +{ + size_t pos = b->entry_points.num_words; + spirv_buffer_prepare(&b->entry_points, 3); + spirv_buffer_emit_word(&b->entry_points, SpvOpEntryPoint); + spirv_buffer_emit_word(&b->entry_points, exec_model); + spirv_buffer_emit_word(&b->entry_points, entry_point); + int len = spirv_buffer_emit_string(&b->entry_points, name); + b->entry_points.words[pos] |= (3 + len + num_interfaces) << 16; + spirv_buffer_prepare(&b->entry_points, num_interfaces); + for (int i = 0; i < num_interfaces; ++i) + spirv_buffer_emit_word(&b->entry_points, interfaces[i]); +} + +void +spirv_builder_emit_exec_mode(struct spirv_builder *b, SpvId entry_point, + SpvExecutionMode exec_mode) +{ + spirv_buffer_prepare(&b->exec_modes, 3); + spirv_buffer_emit_word(&b->exec_modes, SpvOpExecutionMode | (3 << 16)); + spirv_buffer_emit_word(&b->exec_modes, entry_point); + spirv_buffer_emit_word(&b->exec_modes, exec_mode); +} + +void +spirv_builder_emit_name(struct spirv_builder *b, SpvId target, + const char *name) +{ + size_t pos = b->debug_names.num_words; + spirv_buffer_prepare(&b->debug_names, 2); + spirv_buffer_emit_word(&b->debug_names, SpvOpName); + spirv_buffer_emit_word(&b->debug_names, target); + int len = spirv_buffer_emit_string(&b->debug_names, name); + b->debug_names.words[pos] |= (2 + len) << 16; +} + +static void +emit_decoration(struct spirv_builder *b, SpvId target, + SpvDecoration decoration, const uint32_t extra_operands[], + size_t num_extra_operands) +{ + int words = 3 + num_extra_operands; + spirv_buffer_prepare(&b->decorations, words); + spirv_buffer_emit_word(&b->decorations, SpvOpDecorate | (words << 16)); + spirv_buffer_emit_word(&b->decorations, target); + spirv_buffer_emit_word(&b->decorations, decoration); + for (int i = 0; i < num_extra_operands; ++i) + spirv_buffer_emit_word(&b->decorations, extra_operands[i]); +} + +void +spirv_builder_emit_decoration(struct spirv_builder *b, SpvId target, + SpvDecoration decoration) +{ + emit_decoration(b, target, decoration, NULL, 0); +} + +void +spirv_builder_emit_location(struct spirv_builder *b, SpvId target, + uint32_t location) +{ + uint32_t args[] = { location }; + emit_decoration(b, target, SpvDecorationLocation, args, ARRAY_SIZE(args)); +} + +void +spirv_builder_emit_component(struct spirv_builder *b, SpvId target, + uint32_t component) +{ + uint32_t args[] = { component }; + emit_decoration(b, target, SpvDecorationComponent, args, ARRAY_SIZE(args)); +} + +void +spirv_builder_emit_builtin(struct spirv_builder *b, SpvId target, + SpvBuiltIn builtin) +{ + uint32_t args[] = { builtin }; + emit_decoration(b, target, SpvDecorationBuiltIn, args, ARRAY_SIZE(args)); +} + +void +spirv_builder_emit_descriptor_set(struct spirv_builder *b, SpvId target, + uint32_t descriptor_set) +{ + uint32_t args[] = { descriptor_set }; + emit_decoration(b, target, SpvDecorationDescriptorSet, args, + ARRAY_SIZE(args)); +} + +void +spirv_builder_emit_binding(struct spirv_builder *b, SpvId target, + uint32_t binding) +{ + uint32_t args[] = { binding }; + emit_decoration(b, target, SpvDecorationBinding, args, ARRAY_SIZE(args)); +} + +void +spirv_builder_emit_array_stride(struct spirv_builder *b, SpvId target, + uint32_t stride) +{ + uint32_t args[] = { stride }; + emit_decoration(b, target, SpvDecorationArrayStride, args, ARRAY_SIZE(args)); +} + +static void +emit_member_decoration(struct spirv_builder *b, SpvId target, uint32_t member, + SpvDecoration decoration, const uint32_t extra_operands[], + size_t num_extra_operands) +{ + int words = 4 + num_extra_operands; + spirv_buffer_prepare(&b->decorations, words); + spirv_buffer_emit_word(&b->decorations, + SpvOpMemberDecorate | (words << 16)); + spirv_buffer_emit_word(&b->decorations, target); + spirv_buffer_emit_word(&b->decorations, member); + spirv_buffer_emit_word(&b->decorations, decoration); + for (int i = 0; i < num_extra_operands; ++i) + spirv_buffer_emit_word(&b->decorations, extra_operands[i]); +} + +void +spirv_builder_emit_member_offset(struct spirv_builder *b, SpvId target, + uint32_t member, uint32_t offset) +{ + uint32_t args[] = { offset }; + emit_member_decoration(b, target, member, SpvDecorationOffset, + args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_emit_undef(struct spirv_builder *b, SpvId result_type) +{ + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, 3); + spirv_buffer_emit_word(&b->instructions, SpvOpUndef | (3 << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + return result; +} + +void +spirv_builder_function(struct spirv_builder *b, SpvId result, + SpvId return_type, + SpvFunctionControlMask function_control, + SpvId function_type) +{ + spirv_buffer_prepare(&b->instructions, 5); + spirv_buffer_emit_word(&b->instructions, SpvOpFunction | (5 << 16)); + spirv_buffer_emit_word(&b->instructions, return_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, function_control); + spirv_buffer_emit_word(&b->instructions, function_type); +} + +void +spirv_builder_function_end(struct spirv_builder *b) +{ + spirv_buffer_prepare(&b->instructions, 1); + spirv_buffer_emit_word(&b->instructions, SpvOpFunctionEnd | (1 << 16)); +} + +void +spirv_builder_label(struct spirv_builder *b, SpvId label) +{ + spirv_buffer_prepare(&b->instructions, 2); + spirv_buffer_emit_word(&b->instructions, SpvOpLabel | (2 << 16)); + spirv_buffer_emit_word(&b->instructions, label); +} + +void +spirv_builder_return(struct spirv_builder *b) +{ + spirv_buffer_prepare(&b->instructions, 1); + spirv_buffer_emit_word(&b->instructions, SpvOpReturn | (1 << 16)); +} + +SpvId +spirv_builder_emit_load(struct spirv_builder *b, SpvId result_type, + SpvId pointer) +{ + return spirv_builder_emit_unop(b, SpvOpLoad, result_type, pointer); +} + +void +spirv_builder_emit_store(struct spirv_builder *b, SpvId pointer, SpvId object) +{ + spirv_buffer_prepare(&b->instructions, 3); + spirv_buffer_emit_word(&b->instructions, SpvOpStore | (3 << 16)); + spirv_buffer_emit_word(&b->instructions, pointer); + spirv_buffer_emit_word(&b->instructions, object); +} + +SpvId +spirv_builder_emit_access_chain(struct spirv_builder *b, SpvId result_type, + SpvId base, const SpvId indexes[], + size_t num_indexes) +{ + SpvId result = spirv_builder_new_id(b); + + int words = 4 + num_indexes; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, SpvOpAccessChain | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, base); + for (int i = 0; i < num_indexes; ++i) + spirv_buffer_emit_word(&b->instructions, indexes[i]); + return result; +} + + +SpvId +spirv_builder_emit_unop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand) +{ + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, 4); + spirv_buffer_emit_word(&b->instructions, op | (4 << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, operand); + return result; +} + +SpvId +spirv_builder_emit_binop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand0, SpvId operand1) +{ + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, 5); + spirv_buffer_emit_word(&b->instructions, op | (5 << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, operand0); + spirv_buffer_emit_word(&b->instructions, operand1); + return result; +} + +SpvId +spirv_builder_emit_triop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand0, SpvId operand1, SpvId operand2) +{ + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, 6); + spirv_buffer_emit_word(&b->instructions, op | (6 << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, operand0); + spirv_buffer_emit_word(&b->instructions, operand1); + spirv_buffer_emit_word(&b->instructions, operand2); + return result; +} + +SpvId +spirv_builder_emit_composite_extract(struct spirv_builder *b, SpvId result_type, + SpvId composite, const uint32_t indexes[], + size_t num_indexes) +{ + SpvId result = spirv_builder_new_id(b); + + assert(num_indexes > 0); + int words = 4 + num_indexes; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, + SpvOpCompositeExtract | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, composite); + for (int i = 0; i < num_indexes; ++i) + spirv_buffer_emit_word(&b->instructions, indexes[i]); + return result; +} + +SpvId +spirv_builder_emit_composite_construct(struct spirv_builder *b, + SpvId result_type, + const SpvId constituents[], + size_t num_constituents) +{ + SpvId result = spirv_builder_new_id(b); + + assert(num_constituents > 0); + int words = 3 + num_constituents; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, + SpvOpCompositeConstruct | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + for (int i = 0; i < num_constituents; ++i) + spirv_buffer_emit_word(&b->instructions, constituents[i]); + return result; +} + +SpvId +spirv_builder_emit_vector_shuffle(struct spirv_builder *b, SpvId result_type, + SpvId vector_1, SpvId vector_2, + const uint32_t components[], + size_t num_components) +{ + SpvId result = spirv_builder_new_id(b); + + assert(num_components > 0); + int words = 5 + num_components; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, SpvOpVectorShuffle | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, vector_1); + spirv_buffer_emit_word(&b->instructions, vector_2); + for (int i = 0; i < num_components; ++i) + spirv_buffer_emit_word(&b->instructions, components[i]); + return result; +} + +void +spirv_builder_emit_branch(struct spirv_builder *b, SpvId label) +{ + spirv_buffer_prepare(&b->instructions, 2); + spirv_buffer_emit_word(&b->instructions, SpvOpBranch | (2 << 16)); + spirv_buffer_emit_word(&b->instructions, label); +} + +void +spirv_builder_emit_selection_merge(struct spirv_builder *b, SpvId merge_block, + SpvSelectionControlMask selection_control) +{ + spirv_buffer_prepare(&b->instructions, 3); + spirv_buffer_emit_word(&b->instructions, SpvOpSelectionMerge | (3 << 16)); + spirv_buffer_emit_word(&b->instructions, merge_block); + spirv_buffer_emit_word(&b->instructions, selection_control); +} + +void +spirv_builder_loop_merge(struct spirv_builder *b, SpvId merge_block, + SpvId cont_target, SpvLoopControlMask loop_control) +{ + spirv_buffer_prepare(&b->instructions, 4); + spirv_buffer_emit_word(&b->instructions, SpvOpLoopMerge | (4 << 16)); + spirv_buffer_emit_word(&b->instructions, merge_block); + spirv_buffer_emit_word(&b->instructions, cont_target); + spirv_buffer_emit_word(&b->instructions, loop_control); +} + +void +spirv_builder_emit_branch_conditional(struct spirv_builder *b, SpvId condition, + SpvId true_label, SpvId false_label) +{ + spirv_buffer_prepare(&b->instructions, 4); + spirv_buffer_emit_word(&b->instructions, SpvOpBranchConditional | (4 << 16)); + spirv_buffer_emit_word(&b->instructions, condition); + spirv_buffer_emit_word(&b->instructions, true_label); + spirv_buffer_emit_word(&b->instructions, false_label); +} + +SpvId +spirv_builder_emit_phi(struct spirv_builder *b, SpvId result_type, + size_t num_vars, size_t *position) +{ + SpvId result = spirv_builder_new_id(b); + + assert(num_vars > 0); + int words = 3 + 2 * num_vars; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, SpvOpPhi | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + *position = b->instructions.num_words; + for (int i = 0; i < 2 * num_vars; ++i) + spirv_buffer_emit_word(&b->instructions, 0); + return result; +} + +void +spirv_builder_set_phi_operand(struct spirv_builder *b, size_t position, + size_t index, SpvId variable, SpvId parent) +{ + b->instructions.words[position + index * 2 + 0] = variable; + b->instructions.words[position + index * 2 + 1] = parent; +} + +void +spirv_builder_emit_kill(struct spirv_builder *b) +{ + spirv_buffer_prepare(&b->instructions, 1); + spirv_buffer_emit_word(&b->instructions, SpvOpKill | (1 << 16)); +} + +SpvId +spirv_builder_emit_image_sample(struct spirv_builder *b, + SpvId result_type, + SpvId sampled_image, + SpvId coordinate, + bool proj, + SpvId lod, + SpvId bias, + SpvId dref, + SpvId dx, + SpvId dy, + SpvId offset) +{ + SpvId result = spirv_builder_new_id(b); + + int opcode = SpvOpImageSampleImplicitLod; + int operands = 5; + if (proj) + opcode += SpvOpImageSampleProjImplicitLod - SpvOpImageSampleImplicitLod; + if (lod || (dx && dy)) + opcode += SpvOpImageSampleExplicitLod - SpvOpImageSampleImplicitLod; + if (dref) { + opcode += SpvOpImageSampleDrefImplicitLod - SpvOpImageSampleImplicitLod; + operands++; + } + + SpvImageOperandsMask operand_mask = SpvImageOperandsMaskNone; + SpvId extra_operands[5]; + int num_extra_operands = 0; + if (bias) { + extra_operands[++num_extra_operands] = bias; + operand_mask |= SpvImageOperandsBiasMask; + } + if (lod) { + extra_operands[++num_extra_operands] = lod; + operand_mask |= SpvImageOperandsLodMask; + } else if (dx && dy) { + extra_operands[++num_extra_operands] = dx; + extra_operands[++num_extra_operands] = dy; + operand_mask |= SpvImageOperandsGradMask; + } + if (offset) { + extra_operands[++num_extra_operands] = offset; + operand_mask |= SpvImageOperandsOffsetMask; + } + + /* finalize num_extra_operands / extra_operands */ + if (num_extra_operands > 0) { + extra_operands[0] = operand_mask; + num_extra_operands++; + } + + spirv_buffer_prepare(&b->instructions, operands + num_extra_operands); + spirv_buffer_emit_word(&b->instructions, opcode | ((operands + num_extra_operands) << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, sampled_image); + spirv_buffer_emit_word(&b->instructions, coordinate); + if (dref) + spirv_buffer_emit_word(&b->instructions, dref); + for (int i = 0; i < num_extra_operands; ++i) + spirv_buffer_emit_word(&b->instructions, extra_operands[i]); + return result; +} + +SpvId +spirv_builder_emit_image(struct spirv_builder *b, SpvId result_type, + SpvId sampled_image) +{ + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, 4); + spirv_buffer_emit_word(&b->instructions, SpvOpImage | (4 << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, sampled_image); + return result; +} + +SpvId +spirv_builder_emit_image_fetch(struct spirv_builder *b, + SpvId result_type, + SpvId image, + SpvId coordinate, + SpvId lod) +{ + SpvId result = spirv_builder_new_id(b); + + SpvId extra_operands[2]; + int num_extra_operands = 0; + if (lod) { + extra_operands[0] = SpvImageOperandsLodMask; + extra_operands[1] = lod; + num_extra_operands = 2; + } + + spirv_buffer_prepare(&b->instructions, 5 + num_extra_operands); + spirv_buffer_emit_word(&b->instructions, SpvOpImageFetch | + ((5 + num_extra_operands) << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, image); + spirv_buffer_emit_word(&b->instructions, coordinate); + for (int i = 0; i < num_extra_operands; ++i) + spirv_buffer_emit_word(&b->instructions, extra_operands[i]); + return result; +} + +SpvId +spirv_builder_emit_image_query_size(struct spirv_builder *b, + SpvId result_type, + SpvId image, + SpvId lod) +{ + int opcode = SpvOpImageQuerySize; + int words = 4; + if (lod) { + words++; + opcode = SpvOpImageQuerySizeLod; + } + + SpvId result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, opcode | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, image); + + if (lod) + spirv_buffer_emit_word(&b->instructions, lod); + + return result; +} + +SpvId +spirv_builder_emit_ext_inst(struct spirv_builder *b, SpvId result_type, + SpvId set, uint32_t instruction, + const SpvId *args, size_t num_args) +{ + SpvId result = spirv_builder_new_id(b); + + int words = 5 + num_args; + spirv_buffer_prepare(&b->instructions, words); + spirv_buffer_emit_word(&b->instructions, SpvOpExtInst | (words << 16)); + spirv_buffer_emit_word(&b->instructions, result_type); + spirv_buffer_emit_word(&b->instructions, result); + spirv_buffer_emit_word(&b->instructions, set); + spirv_buffer_emit_word(&b->instructions, instruction); + for (int i = 0; i < num_args; ++i) + spirv_buffer_emit_word(&b->instructions, args[i]); + return result; +} + +struct spirv_type { + SpvOp op; + uint32_t args[8]; + size_t num_args; + + SpvId type; +}; + +static uint32_t +non_aggregate_type_hash(const void *arg) +{ + const struct spirv_type *type = arg; + + uint32_t hash = _mesa_fnv32_1a_offset_bias; + hash = _mesa_fnv32_1a_accumulate(hash, type->op); + hash = _mesa_fnv32_1a_accumulate_block(hash, type->args, sizeof(uint32_t) * + type->num_args); + return hash; +} + +static bool +non_aggregate_type_equals(const void *a, const void *b) +{ + const struct spirv_type *ta = a, *tb = b; + + if (ta->op != tb->op) + return false; + + assert(ta->num_args == tb->num_args); + return memcmp(ta->args, tb->args, sizeof(uint32_t) * ta->num_args) == 0; +} + +static SpvId +get_type_def(struct spirv_builder *b, SpvOp op, const uint32_t args[], + size_t num_args) +{ + /* According to the SPIR-V specification: + * + * "Two different type s form, by definition, two different types. It + * is valid to declare multiple aggregate type s having the same + * opcode and operands. This is to allow multiple instances of aggregate + * types with the same structure to be decorated differently. (Different + * decorations are not required; two different aggregate type s are + * allowed to have identical declarations and decorations, and will still + * be two different types.) Non-aggregate types are different: It is + * invalid to declare multiple type s for the same scalar, vector, or + * matrix type. That is, non-aggregate type declarations must all have + * different opcodes or operands. (Note that non-aggregate types cannot + * be decorated in ways that affect their type.)" + * + * ..so, we need to prevent the same non-aggregate type to be re-defined + * with a new . We do this by putting the definitions in a hash-map, so + * we can easily look up and reuse them. + */ + + struct spirv_type key; + assert(num_args <= ARRAY_SIZE(key.args)); + key.op = op; + memcpy(&key.args, args, sizeof(uint32_t) * num_args); + key.num_args = num_args; + + struct hash_entry *entry; + if (b->types) { + entry = _mesa_hash_table_search(b->types, &key); + if (entry) + return ((struct spirv_type *)entry->data)->type; + } else { + b->types = _mesa_hash_table_create(NULL, non_aggregate_type_hash, + non_aggregate_type_equals); + assert(b->types); + } + + struct spirv_type *type = CALLOC_STRUCT(spirv_type); + if (!type) + return 0; + + type->op = op; + memcpy(&type->args, args, sizeof(uint32_t) * num_args); + type->num_args = num_args; + + type->type = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->types_const_defs, 2 + num_args); + spirv_buffer_emit_word(&b->types_const_defs, op | ((2 + num_args) << 16)); + spirv_buffer_emit_word(&b->types_const_defs, type->type); + for (int i = 0; i < num_args; ++i) + spirv_buffer_emit_word(&b->types_const_defs, args[i]); + + entry = _mesa_hash_table_insert(b->types, type, type); + assert(entry); + + return ((struct spirv_type *)entry->data)->type; +} + +SpvId +spirv_builder_type_void(struct spirv_builder *b) +{ + return get_type_def(b, SpvOpTypeVoid, NULL, 0); +} + +SpvId +spirv_builder_type_bool(struct spirv_builder *b) +{ + return get_type_def(b, SpvOpTypeBool, NULL, 0); +} + +SpvId +spirv_builder_type_int(struct spirv_builder *b, unsigned width) +{ + uint32_t args[] = { width, 1 }; + return get_type_def(b, SpvOpTypeInt, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_uint(struct spirv_builder *b, unsigned width) +{ + uint32_t args[] = { width, 0 }; + return get_type_def(b, SpvOpTypeInt, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_float(struct spirv_builder *b, unsigned width) +{ + uint32_t args[] = { width }; + return get_type_def(b, SpvOpTypeFloat, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_image(struct spirv_builder *b, SpvId sampled_type, + SpvDim dim, bool depth, bool arrayed, bool ms, + unsigned sampled, SpvImageFormat image_format) +{ + assert(sampled < 3); + uint32_t args[] = { + sampled_type, dim, depth ? 1 : 0, arrayed ? 1 : 0, ms ? 1 : 0, sampled, + image_format + }; + return get_type_def(b, SpvOpTypeImage, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_sampled_image(struct spirv_builder *b, SpvId image_type) +{ + uint32_t args[] = { image_type }; + return get_type_def(b, SpvOpTypeSampledImage, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_pointer(struct spirv_builder *b, + SpvStorageClass storage_class, SpvId type) +{ + uint32_t args[] = { storage_class, type }; + return get_type_def(b, SpvOpTypePointer, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_vector(struct spirv_builder *b, SpvId component_type, + unsigned component_count) +{ + assert(component_count > 1); + uint32_t args[] = { component_type, component_count }; + return get_type_def(b, SpvOpTypeVector, args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_type_array(struct spirv_builder *b, SpvId component_type, + SpvId length) +{ + SpvId type = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->types_const_defs, 4); + spirv_buffer_emit_word(&b->types_const_defs, SpvOpTypeArray | (4 << 16)); + spirv_buffer_emit_word(&b->types_const_defs, type); + spirv_buffer_emit_word(&b->types_const_defs, component_type); + spirv_buffer_emit_word(&b->types_const_defs, length); + return type; +} + +SpvId +spirv_builder_type_struct(struct spirv_builder *b, const SpvId member_types[], + size_t num_member_types) +{ + int words = 2 + num_member_types; + SpvId type = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->types_const_defs, words); + spirv_buffer_emit_word(&b->types_const_defs, SpvOpTypeStruct | (words << 16)); + spirv_buffer_emit_word(&b->types_const_defs, type); + for (int i = 0; i < num_member_types; ++i) + spirv_buffer_emit_word(&b->types_const_defs, member_types[i]); + return type; +} + +SpvId +spirv_builder_type_function(struct spirv_builder *b, SpvId return_type, + const SpvId parameter_types[], + size_t num_parameter_types) +{ + int words = 3 + num_parameter_types; + SpvId type = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->types_const_defs, words); + spirv_buffer_emit_word(&b->types_const_defs, SpvOpTypeFunction | (words << 16)); + spirv_buffer_emit_word(&b->types_const_defs, type); + spirv_buffer_emit_word(&b->types_const_defs, return_type); + for (int i = 0; i < num_parameter_types; ++i) + spirv_buffer_emit_word(&b->types_const_defs, parameter_types[i]); + return type; +} + +struct spirv_const { + SpvOp op, type; + uint32_t args[8]; + size_t num_args; + + SpvId result; +}; + +static uint32_t +const_hash(const void *arg) +{ + const struct spirv_const *key = arg; + + uint32_t hash = _mesa_fnv32_1a_offset_bias; + hash = _mesa_fnv32_1a_accumulate(hash, key->op); + hash = _mesa_fnv32_1a_accumulate(hash, key->type); + hash = _mesa_fnv32_1a_accumulate_block(hash, key->args, sizeof(uint32_t) * + key->num_args); + return hash; +} + +static bool +const_equals(const void *a, const void *b) +{ + const struct spirv_const *ca = a, *cb = b; + + if (ca->op != cb->op || + ca->type != cb->type) + return false; + + assert(ca->num_args == cb->num_args); + return memcmp(ca->args, cb->args, sizeof(uint32_t) * ca->num_args) == 0; +} + +static SpvId +get_const_def(struct spirv_builder *b, SpvOp op, SpvId type, + const uint32_t args[], size_t num_args) +{ + struct spirv_const key; + assert(num_args <= ARRAY_SIZE(key.args)); + key.op = op; + key.type = type; + memcpy(&key.args, args, sizeof(uint32_t) * num_args); + key.num_args = num_args; + + struct hash_entry *entry; + if (b->consts) { + entry = _mesa_hash_table_search(b->consts, &key); + if (entry) + return ((struct spirv_const *)entry->data)->result; + } else { + b->consts = _mesa_hash_table_create(NULL, const_hash, const_equals); + assert(b->consts); + } + + struct spirv_const *cnst = CALLOC_STRUCT(spirv_const); + if (!cnst) + return 0; + + cnst->op = op; + cnst->type = type; + memcpy(&cnst->args, args, sizeof(uint32_t) * num_args); + cnst->num_args = num_args; + + cnst->result = spirv_builder_new_id(b); + spirv_buffer_prepare(&b->types_const_defs, 3 + num_args); + spirv_buffer_emit_word(&b->types_const_defs, op | ((3 + num_args) << 16)); + spirv_buffer_emit_word(&b->types_const_defs, type); + spirv_buffer_emit_word(&b->types_const_defs, cnst->result); + for (int i = 0; i < num_args; ++i) + spirv_buffer_emit_word(&b->types_const_defs, args[i]); + + entry = _mesa_hash_table_insert(b->consts, cnst, cnst); + assert(entry); + + return ((struct spirv_const *)entry->data)->result; +} + +SpvId +spirv_builder_const_bool(struct spirv_builder *b, bool val) +{ + return get_const_def(b, val ? SpvOpConstantTrue : SpvOpConstantFalse, + spirv_builder_type_bool(b), NULL, 0); +} + +SpvId +spirv_builder_const_int(struct spirv_builder *b, int width, int32_t val) +{ + assert(width <= 32); + uint32_t args[] = { val }; + return get_const_def(b, SpvOpConstant, spirv_builder_type_int(b, width), + args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_const_uint(struct spirv_builder *b, int width, uint32_t val) +{ + assert(width <= 32); + uint32_t args[] = { val }; + return get_const_def(b, SpvOpConstant, spirv_builder_type_uint(b, width), + args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_const_float(struct spirv_builder *b, int width, float val) +{ + assert(width <= 32); + uint32_t args[] = { u_bitcast_f2u(val) }; + return get_const_def(b, SpvOpConstant, spirv_builder_type_float(b, width), + args, ARRAY_SIZE(args)); +} + +SpvId +spirv_builder_const_composite(struct spirv_builder *b, SpvId result_type, + const SpvId constituents[], + size_t num_constituents) +{ + return get_const_def(b, SpvOpConstantComposite, result_type, + (const uint32_t *)constituents, + num_constituents); +} + +SpvId +spirv_builder_emit_var(struct spirv_builder *b, SpvId type, + SpvStorageClass storage_class) +{ + assert(storage_class != SpvStorageClassGeneric); + struct spirv_buffer *buf = storage_class != SpvStorageClassFunction ? + &b->types_const_defs : &b->instructions; + + SpvId ret = spirv_builder_new_id(b); + spirv_buffer_prepare(buf, 4); + spirv_buffer_emit_word(buf, SpvOpVariable | (4 << 16)); + spirv_buffer_emit_word(buf, type); + spirv_buffer_emit_word(buf, ret); + spirv_buffer_emit_word(buf, storage_class); + return ret; +} + +SpvId +spirv_builder_import(struct spirv_builder *b, const char *name) +{ + SpvId result = spirv_builder_new_id(b); + size_t pos = b->imports.num_words; + spirv_buffer_prepare(&b->imports, 2); + spirv_buffer_emit_word(&b->imports, SpvOpExtInstImport); + spirv_buffer_emit_word(&b->imports, result); + int len = spirv_buffer_emit_string(&b->imports, name); + b->imports.words[pos] |= (2 + len) << 16; + return result; +} + +size_t +spirv_builder_get_num_words(struct spirv_builder *b) +{ + const size_t header_size = 5; + return header_size + + b->capabilities.num_words + + b->imports.num_words + + b->memory_model.num_words + + b->entry_points.num_words + + b->exec_modes.num_words + + b->debug_names.num_words + + b->decorations.num_words + + b->types_const_defs.num_words + + b->instructions.num_words; +} + +size_t +spirv_builder_get_words(struct spirv_builder *b, uint32_t *words, + size_t num_words) +{ + assert(num_words >= spirv_builder_get_num_words(b)); + + size_t written = 0; + words[written++] = SpvMagicNumber; + words[written++] = 0x00010000; + words[written++] = 0; + words[written++] = b->prev_id + 1; + words[written++] = 0; + + const struct spirv_buffer *buffers[] = { + &b->capabilities, + &b->imports, + &b->memory_model, + &b->entry_points, + &b->exec_modes, + &b->debug_names, + &b->decorations, + &b->types_const_defs, + &b->instructions + }; + + for (int i = 0; i < ARRAY_SIZE(buffers); ++i) { + const struct spirv_buffer *buffer = buffers[i]; + for (int j = 0; j < buffer->num_words; ++j) + words[written++] = buffer->words[j]; + } + + assert(written == spirv_builder_get_num_words(b)); + return written; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h --- mesa-19.2.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,318 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPIRV_BUILDER_H +#define SPIRV_BUILDER_H + +#include "compiler/spirv/spirv.h" +#include "compiler/spirv/GLSL.std.450.h" + +#include +#include +#include + +struct hash_table; + +struct spirv_buffer { + uint32_t *words; + size_t num_words, room; +}; + +struct spirv_builder { + struct spirv_buffer capabilities; + struct spirv_buffer imports; + struct spirv_buffer memory_model; + struct spirv_buffer entry_points; + struct spirv_buffer exec_modes; + struct spirv_buffer debug_names; + struct spirv_buffer decorations; + + struct spirv_buffer types_const_defs; + struct hash_table *types; + struct hash_table *consts; + + struct spirv_buffer instructions; + SpvId prev_id; +}; + +static inline SpvId +spirv_builder_new_id(struct spirv_builder *b) +{ + return ++b->prev_id; +} + +void +spirv_builder_emit_cap(struct spirv_builder *b, SpvCapability cap); + +void +spirv_builder_emit_source(struct spirv_builder *b, SpvSourceLanguage lang, + uint32_t version); + +void +spirv_builder_emit_mem_model(struct spirv_builder *b, + SpvAddressingModel addr_model, + SpvMemoryModel mem_model); + +void +spirv_builder_emit_name(struct spirv_builder *b, SpvId target, + const char *name); + +void +spirv_builder_emit_decoration(struct spirv_builder *b, SpvId target, + SpvDecoration decoration); + +void +spirv_builder_emit_location(struct spirv_builder *b, SpvId target, + uint32_t location); + +void +spirv_builder_emit_component(struct spirv_builder *b, SpvId target, + uint32_t component); + +void +spirv_builder_emit_builtin(struct spirv_builder *b, SpvId target, + SpvBuiltIn builtin); + +void +spirv_builder_emit_descriptor_set(struct spirv_builder *b, SpvId target, + uint32_t descriptor_set); + +void +spirv_builder_emit_binding(struct spirv_builder *b, SpvId target, + uint32_t binding); + +void +spirv_builder_emit_array_stride(struct spirv_builder *b, SpvId target, + uint32_t stride); + +void +spirv_builder_emit_member_offset(struct spirv_builder *b, SpvId target, + uint32_t member, uint32_t offset); + +void +spirv_builder_emit_entry_point(struct spirv_builder *b, + SpvExecutionModel exec_model, SpvId entry_point, + const char *name, const SpvId interfaces[], + size_t num_interfaces); + +void +spirv_builder_emit_exec_mode(struct spirv_builder *b, SpvId entry_point, + SpvExecutionMode exec_mode); + +void +spirv_builder_function(struct spirv_builder *b, SpvId result, + SpvId return_type, + SpvFunctionControlMask function_control, + SpvId function_type); + +void +spirv_builder_function_end(struct spirv_builder *b); + +void +spirv_builder_label(struct spirv_builder *b, SpvId label); + +void +spirv_builder_return(struct spirv_builder *b); + +SpvId +spirv_builder_emit_undef(struct spirv_builder *b, SpvId result_type); + +SpvId +spirv_builder_emit_load(struct spirv_builder *b, SpvId result_type, + SpvId pointer); + +void +spirv_builder_emit_store(struct spirv_builder *b, SpvId pointer, SpvId object); + +SpvId +spirv_builder_emit_access_chain(struct spirv_builder *b, SpvId result_type, + SpvId base, const SpvId indexes[], + size_t num_indexes); + +SpvId +spirv_builder_emit_unop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand); + +SpvId +spirv_builder_emit_binop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand0, SpvId operand1); + +SpvId +spirv_builder_emit_triop(struct spirv_builder *b, SpvOp op, SpvId result_type, + SpvId operand0, SpvId operand1, SpvId operand2); + +SpvId +spirv_builder_emit_composite_extract(struct spirv_builder *b, SpvId result_type, + SpvId composite, const uint32_t indexes[], + size_t num_indexes); + +SpvId +spirv_builder_emit_composite_construct(struct spirv_builder *b, + SpvId result_type, + const SpvId constituents[], + size_t num_constituents); + +SpvId +spirv_builder_emit_vector_shuffle(struct spirv_builder *b, SpvId result_type, + SpvId vector_1, SpvId vector_2, + const uint32_t components[], + size_t num_components); + +void +spirv_builder_emit_branch(struct spirv_builder *b, SpvId label); + +void +spirv_builder_emit_selection_merge(struct spirv_builder *b, SpvId merge_block, + SpvSelectionControlMask selection_control); + +void +spirv_builder_loop_merge(struct spirv_builder *b, SpvId merge_block, + SpvId cont_target, SpvLoopControlMask loop_control); + +void +spirv_builder_emit_branch_conditional(struct spirv_builder *b, SpvId condition, + SpvId true_label, SpvId false_label); + +SpvId +spirv_builder_emit_phi(struct spirv_builder *b, SpvId result_type, + size_t num_vars, size_t *position); + +void +spirv_builder_set_phi_operand(struct spirv_builder *b, size_t position, + size_t index, SpvId variable, SpvId parent); + +void +spirv_builder_emit_kill(struct spirv_builder *b); + + +SpvId +spirv_builder_emit_image_sample(struct spirv_builder *b, + SpvId result_type, + SpvId sampled_image, + SpvId coordinate, + bool proj, + SpvId lod, + SpvId bias, + SpvId dref, + SpvId dx, + SpvId dy, + SpvId offset); + +SpvId +spirv_builder_emit_image(struct spirv_builder *b, SpvId result_type, + SpvId sampled_image); + +SpvId +spirv_builder_emit_image_fetch(struct spirv_builder *b, + SpvId result_type, + SpvId image, + SpvId coordinate, + SpvId lod); + +SpvId +spirv_builder_emit_image_query_size(struct spirv_builder *b, + SpvId result_type, + SpvId image, + SpvId lod); + +SpvId +spirv_builder_emit_ext_inst(struct spirv_builder *b, SpvId result_type, + SpvId set, uint32_t instruction, + const SpvId args[], size_t num_args); + +SpvId +spirv_builder_type_void(struct spirv_builder *b); + +SpvId +spirv_builder_type_bool(struct spirv_builder *b); + +SpvId +spirv_builder_type_int(struct spirv_builder *b, unsigned width); + +SpvId +spirv_builder_type_uint(struct spirv_builder *b, unsigned width); + +SpvId +spirv_builder_type_float(struct spirv_builder *b, unsigned width); + +SpvId +spirv_builder_type_image(struct spirv_builder *b, SpvId sampled_type, + SpvDim dim, bool depth, bool arrayed, bool ms, + unsigned sampled, SpvImageFormat image_format); + +SpvId +spirv_builder_type_sampled_image(struct spirv_builder *b, SpvId image_type); + +SpvId +spirv_builder_type_pointer(struct spirv_builder *b, + SpvStorageClass storage_class, SpvId type); + +SpvId +spirv_builder_type_vector(struct spirv_builder *b, SpvId component_type, + unsigned component_count); + +SpvId +spirv_builder_type_array(struct spirv_builder *b, SpvId component_type, + SpvId length); + +SpvId +spirv_builder_type_struct(struct spirv_builder *b, const SpvId member_types[], + size_t num_member_types); + +SpvId +spirv_builder_type_function(struct spirv_builder *b, SpvId return_type, + const SpvId parameter_types[], + size_t num_parameter_types); + +SpvId +spirv_builder_const_bool(struct spirv_builder *b, bool val); + +SpvId +spirv_builder_const_int(struct spirv_builder *b, int width, int32_t val); + +SpvId +spirv_builder_const_uint(struct spirv_builder *b, int width, uint32_t val); + +SpvId +spirv_builder_const_float(struct spirv_builder *b, int width, float val); + +SpvId +spirv_builder_const_composite(struct spirv_builder *b, SpvId result_type, + const SpvId constituents[], + size_t num_constituents); + +SpvId +spirv_builder_emit_var(struct spirv_builder *b, SpvId type, + SpvStorageClass storage_class); + +SpvId +spirv_builder_import(struct spirv_builder *b, const char *name); + +size_t +spirv_builder_get_num_words(struct spirv_builder *b); + +size_t +spirv_builder_get_words(struct spirv_builder *b, uint32_t *words, + size_t num_words); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_batch.c mesa-20.0.8/src/gallium/drivers/zink/zink_batch.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_batch.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_batch.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,119 @@ +#include "zink_batch.h" + +#include "zink_context.h" +#include "zink_fence.h" +#include "zink_framebuffer.h" +#include "zink_query.h" +#include "zink_render_pass.h" +#include "zink_resource.h" +#include "zink_screen.h" + +#include "util/u_debug.h" +#include "util/set.h" + +static void +reset_batch(struct zink_screen *screen, struct zink_batch *batch) +{ + batch->descs_left = ZINK_BATCH_DESC_SIZE; + + // cmdbuf hasn't been submitted before + if (!batch->fence) + return; + + zink_fence_finish(screen, batch->fence, PIPE_TIMEOUT_INFINITE); + zink_fence_reference(screen, &batch->fence, NULL); + + zink_render_pass_reference(screen, &batch->rp, NULL); + zink_framebuffer_reference(screen, &batch->fb, NULL); + + /* unref all used resources */ + set_foreach(batch->resources, entry) { + struct pipe_resource *pres = (struct pipe_resource *)entry->key; + pipe_resource_reference(&pres, NULL); + } + _mesa_set_clear(batch->resources, NULL); + + /* unref all used sampler-views */ + set_foreach(batch->sampler_views, entry) { + struct pipe_sampler_view *pres = (struct pipe_sampler_view *)entry->key; + pipe_sampler_view_reference(&pres, NULL); + } + _mesa_set_clear(batch->sampler_views, NULL); + + util_dynarray_foreach(&batch->zombie_samplers, VkSampler, samp) { + vkDestroySampler(screen->dev, *samp, NULL); + } + util_dynarray_clear(&batch->zombie_samplers); + + if (vkResetDescriptorPool(screen->dev, batch->descpool, 0) != VK_SUCCESS) + fprintf(stderr, "vkResetDescriptorPool failed\n"); +} + +void +zink_start_batch(struct zink_context *ctx, struct zink_batch *batch) +{ + reset_batch(zink_screen(ctx->base.screen), batch); + + VkCommandBufferBeginInfo cbbi = {}; + cbbi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + cbbi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + if (vkBeginCommandBuffer(batch->cmdbuf, &cbbi) != VK_SUCCESS) + debug_printf("vkBeginCommandBuffer failed\n"); + + if (!ctx->queries_disabled) + zink_resume_queries(ctx, batch); +} + +void +zink_end_batch(struct zink_context *ctx, struct zink_batch *batch) +{ + if (!ctx->queries_disabled) + zink_suspend_queries(ctx, batch); + + if (vkEndCommandBuffer(batch->cmdbuf) != VK_SUCCESS) { + debug_printf("vkEndCommandBuffer failed\n"); + return; + } + + assert(batch->fence == NULL); + batch->fence = zink_create_fence(ctx->base.screen); + if (!batch->fence) + return; + + VkSubmitInfo si = {}; + si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + si.waitSemaphoreCount = 0; + si.pWaitSemaphores = NULL; + si.signalSemaphoreCount = 0; + si.pSignalSemaphores = NULL; + si.pWaitDstStageMask = NULL; + si.commandBufferCount = 1; + si.pCommandBuffers = &batch->cmdbuf; + + if (vkQueueSubmit(ctx->queue, 1, &si, batch->fence->fence) != VK_SUCCESS) { + debug_printf("vkQueueSubmit failed\n"); + abort(); + } +} + +void +zink_batch_reference_resoure(struct zink_batch *batch, + struct zink_resource *res) +{ + struct set_entry *entry = _mesa_set_search(batch->resources, res); + if (!entry) { + entry = _mesa_set_add(batch->resources, res); + pipe_reference(NULL, &res->base.reference); + } +} + +void +zink_batch_reference_sampler_view(struct zink_batch *batch, + struct zink_sampler_view *sv) +{ + struct set_entry *entry = _mesa_set_search(batch->sampler_views, sv); + if (!entry) { + entry = _mesa_set_add(batch->sampler_views, sv); + pipe_reference(NULL, &sv->base.reference); + } +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_batch.h mesa-20.0.8/src/gallium/drivers/zink/zink_batch.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_batch.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_batch.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,69 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_BATCH_H +#define ZINK_BATCH_H + +#include + +#include "util/u_dynarray.h" + +struct zink_context; +struct zink_fence; +struct zink_framebuffer; +struct zink_render_pass; +struct zink_resource; +struct zink_sampler_view; + +#define ZINK_BATCH_DESC_SIZE 1000 + +struct zink_batch { + VkCommandBuffer cmdbuf; + VkDescriptorPool descpool; + int descs_left; + struct zink_fence *fence; + + struct zink_render_pass *rp; + struct zink_framebuffer *fb; + + struct set *resources; + struct set *sampler_views; + + struct util_dynarray zombie_samplers; +}; + +void +zink_start_batch(struct zink_context *ctx, struct zink_batch *batch); + +void +zink_end_batch(struct zink_context *ctx, struct zink_batch *batch); + +void +zink_batch_reference_resoure(struct zink_batch *batch, + struct zink_resource *res); + +void +zink_batch_reference_sampler_view(struct zink_batch *batch, + struct zink_sampler_view *sv); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_blit.c mesa-20.0.8/src/gallium/drivers/zink/zink_blit.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_blit.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,205 @@ +#include "zink_context.h" +#include "zink_helpers.h" +#include "zink_resource.h" +#include "zink_screen.h" + +#include "util/u_blitter.h" +#include "util/format/u_format.h" + +static bool +blit_resolve(struct zink_context *ctx, const struct pipe_blit_info *info) +{ + if (info->mask != PIPE_MASK_RGBA || + info->scissor_enable || + info->alpha_blend) + return false; + + struct zink_resource *src = zink_resource(info->src.resource); + struct zink_resource *dst = zink_resource(info->dst.resource); + + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (src->format != zink_get_format(screen, info->src.format) || + dst->format != zink_get_format(screen, info->dst.format)) + return false; + + struct zink_batch *batch = zink_batch_no_rp(ctx); + + zink_batch_reference_resoure(batch, src); + zink_batch_reference_resoure(batch, dst); + + if (src->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, src, src->aspect, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + + if (dst->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, dst, dst->aspect, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + + VkImageResolve region = {}; + + region.srcSubresource.aspectMask = src->aspect; + region.srcSubresource.mipLevel = info->src.level; + region.srcSubresource.baseArrayLayer = 0; // no clue + region.srcSubresource.layerCount = 1; // no clue + region.srcOffset.x = info->src.box.x; + region.srcOffset.y = info->src.box.y; + region.srcOffset.z = info->src.box.z; + + region.dstSubresource.aspectMask = dst->aspect; + region.dstSubresource.mipLevel = info->dst.level; + region.dstSubresource.baseArrayLayer = 0; // no clue + region.dstSubresource.layerCount = 1; // no clue + region.dstOffset.x = info->dst.box.x; + region.dstOffset.y = info->dst.box.y; + region.dstOffset.z = info->dst.box.z; + + region.extent.width = info->dst.box.width; + region.extent.height = info->dst.box.height; + region.extent.depth = info->dst.box.depth; + vkCmdResolveImage(batch->cmdbuf, src->image, src->layout, + dst->image, dst->layout, + 1, ®ion); + + return true; +} + +static bool +blit_native(struct zink_context *ctx, const struct pipe_blit_info *info) +{ + if (info->mask != PIPE_MASK_RGBA || + info->scissor_enable || + info->alpha_blend) + return false; + + struct zink_resource *src = zink_resource(info->src.resource); + struct zink_resource *dst = zink_resource(info->dst.resource); + + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (src->format != zink_get_format(screen, info->src.format) || + dst->format != zink_get_format(screen, info->dst.format)) + return false; + + struct zink_batch *batch = zink_batch_no_rp(ctx); + zink_batch_reference_resoure(batch, src); + zink_batch_reference_resoure(batch, dst); + + if (src == dst) { + /* The Vulkan 1.1 specification says the following about valid usage + * of vkCmdBlitImage: + * + * "srcImageLayout must be VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR, + * VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL or VK_IMAGE_LAYOUT_GENERAL" + * + * and: + * + * "dstImageLayout must be VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR, + * VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL or VK_IMAGE_LAYOUT_GENERAL" + * + * Since we cant have the same image in two states at the same time, + * we're effectively left with VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR or + * VK_IMAGE_LAYOUT_GENERAL. And since this isn't a present-related + * operation, VK_IMAGE_LAYOUT_GENERAL seems most appropriate. + */ + if (src->layout != VK_IMAGE_LAYOUT_GENERAL) + zink_resource_barrier(batch->cmdbuf, src, src->aspect, + VK_IMAGE_LAYOUT_GENERAL); + } else { + if (src->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, src, src->aspect, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + + if (dst->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, dst, dst->aspect, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + } + + VkImageBlit region = {}; + region.srcSubresource.aspectMask = src->aspect; + region.srcSubresource.mipLevel = info->src.level; + region.srcOffsets[0].x = info->src.box.x; + region.srcOffsets[0].y = info->src.box.y; + region.srcOffsets[1].x = info->src.box.x + info->src.box.width; + region.srcOffsets[1].y = info->src.box.y + info->src.box.height; + + if (src->base.array_size > 1) { + region.srcOffsets[0].z = 0; + region.srcOffsets[1].z = 1; + region.srcSubresource.baseArrayLayer = info->src.box.z; + region.srcSubresource.layerCount = info->src.box.depth; + } else { + region.srcOffsets[0].z = info->src.box.z; + region.srcOffsets[1].z = info->src.box.z + info->src.box.depth; + region.srcSubresource.baseArrayLayer = 0; + region.srcSubresource.layerCount = 1; + } + + region.dstSubresource.aspectMask = dst->aspect; + region.dstSubresource.mipLevel = info->dst.level; + region.dstOffsets[0].x = info->dst.box.x; + region.dstOffsets[0].y = info->dst.box.y; + region.dstOffsets[1].x = info->dst.box.x + info->dst.box.width; + region.dstOffsets[1].y = info->dst.box.y + info->dst.box.height; + + if (dst->base.array_size > 1) { + region.dstOffsets[0].z = 0; + region.dstOffsets[1].z = 1; + region.dstSubresource.baseArrayLayer = info->dst.box.z; + region.dstSubresource.layerCount = info->dst.box.depth; + } else { + region.dstOffsets[0].z = info->dst.box.z; + region.dstOffsets[1].z = info->dst.box.z + info->dst.box.depth; + region.dstSubresource.baseArrayLayer = 0; + region.dstSubresource.layerCount = 1; + } + + vkCmdBlitImage(batch->cmdbuf, src->image, src->layout, + dst->image, dst->layout, + 1, ®ion, + zink_filter(info->filter)); + + return true; +} + +void +zink_blit(struct pipe_context *pctx, + const struct pipe_blit_info *info) +{ + struct zink_context *ctx = zink_context(pctx); + if (info->src.resource->nr_samples > 1 && + info->dst.resource->nr_samples <= 1) { + if (blit_resolve(ctx, info)) + return; + } else { + if (blit_native(ctx, info)) + return; + } + + if (!util_blitter_is_blit_supported(ctx->blitter, info)) { + debug_printf("blit unsupported %s -> %s\n", + util_format_short_name(info->src.resource->format), + util_format_short_name(info->dst.resource->format)); + return; + } + + util_blitter_save_blend(ctx->blitter, ctx->gfx_pipeline_state.blend_state); + util_blitter_save_depth_stencil_alpha(ctx->blitter, ctx->gfx_pipeline_state.depth_stencil_alpha_state); + util_blitter_save_vertex_elements(ctx->blitter, ctx->element_state); + util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); + util_blitter_save_rasterizer(ctx->blitter, ctx->rast_state); + util_blitter_save_fragment_shader(ctx->blitter, ctx->gfx_stages[PIPE_SHADER_FRAGMENT]); + util_blitter_save_vertex_shader(ctx->blitter, ctx->gfx_stages[PIPE_SHADER_VERTEX]); + util_blitter_save_framebuffer(ctx->blitter, &ctx->fb_state); + util_blitter_save_viewport(ctx->blitter, ctx->viewport_states); + util_blitter_save_scissor(ctx->blitter, ctx->scissor_states); + util_blitter_save_fragment_sampler_states(ctx->blitter, + ctx->num_samplers[PIPE_SHADER_FRAGMENT], + ctx->sampler_states[PIPE_SHADER_FRAGMENT]); + util_blitter_save_fragment_sampler_views(ctx->blitter, + ctx->num_image_views[PIPE_SHADER_FRAGMENT], + ctx->image_views[PIPE_SHADER_FRAGMENT]); + util_blitter_save_fragment_constant_buffer_slot(ctx->blitter, ctx->ubos[PIPE_SHADER_FRAGMENT]); + util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->buffers); + util_blitter_save_sample_mask(ctx->blitter, ctx->gfx_pipeline_state.sample_mask); + + util_blitter_blit(ctx->blitter, info); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_compiler.c mesa-20.0.8/src/gallium/drivers/zink/zink_compiler.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_compiler.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_compiler.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,319 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_compiler.h" +#include "zink_screen.h" +#include "nir_to_spirv/nir_to_spirv.h" + +#include "pipe/p_state.h" + +#include "nir.h" +#include "compiler/nir/nir_builder.h" + +#include "nir/tgsi_to_nir.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_from_mesa.h" + +#include "util/u_memory.h" + +static bool +lower_instr(nir_intrinsic_instr *instr, nir_builder *b) +{ + b->cursor = nir_before_instr(&instr->instr); + + if (instr->intrinsic == nir_intrinsic_load_ubo) { + nir_ssa_def *old_idx = nir_ssa_for_src(b, instr->src[0], 1); + nir_ssa_def *new_idx = nir_iadd(b, old_idx, nir_imm_int(b, 1)); + nir_instr_rewrite_src(&instr->instr, &instr->src[0], + nir_src_for_ssa(new_idx)); + return true; + } + + if (instr->intrinsic == nir_intrinsic_load_uniform) { + nir_ssa_def *ubo_idx = nir_imm_int(b, 0); + nir_ssa_def *ubo_offset = + nir_iadd(b, nir_imm_int(b, nir_intrinsic_base(instr)), + nir_ssa_for_src(b, instr->src[0], 1)); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo); + load->num_components = instr->num_components; + load->src[0] = nir_src_for_ssa(ubo_idx); + load->src[1] = nir_src_for_ssa(ubo_offset); + nir_ssa_dest_init(&load->instr, &load->dest, + load->num_components, instr->dest.ssa.bit_size, + instr->dest.ssa.name); + nir_builder_instr_insert(b, &load->instr); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&load->dest.ssa)); + + nir_instr_remove(&instr->instr); + return true; + } + + return false; +} + +static bool +lower_uniforms_to_ubo(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_builder builder; + nir_builder_init(&builder, function->impl); + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type == nir_instr_type_intrinsic) + progress |= lower_instr(nir_instr_as_intrinsic(instr), + &builder); + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | + nir_metadata_dominance); + } + } + + if (progress) { + assert(shader->num_uniforms > 0); + const struct glsl_type *type = glsl_array_type(glsl_vec4_type(), + shader->num_uniforms, 0); + nir_variable *ubo = nir_variable_create(shader, nir_var_mem_ubo, type, + "uniform_0"); + ubo->data.binding = 0; + + struct glsl_struct_field field = { + .type = type, + .name = "data", + .location = -1, + }; + ubo->interface_type = + glsl_interface_type(&field, 1, GLSL_INTERFACE_PACKING_STD430, + false, "__ubo0_interface"); + } + + return progress; +} + +static bool +lower_discard_if_instr(nir_intrinsic_instr *instr, nir_builder *b) +{ + if (instr->intrinsic == nir_intrinsic_discard_if) { + b->cursor = nir_before_instr(&instr->instr); + + nir_if *if_stmt = nir_push_if(b, nir_ssa_for_src(b, instr->src[0], 1)); + nir_intrinsic_instr *discard = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard); + nir_builder_instr_insert(b, &discard->instr); + nir_pop_if(b, if_stmt); + nir_instr_remove(&instr->instr); + return true; + } + assert(instr->intrinsic != nir_intrinsic_discard || + nir_block_last_instr(instr->instr.block) == &instr->instr); + + return false; +} + +static bool +lower_discard_if(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_builder builder; + nir_builder_init(&builder, function->impl); + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type == nir_instr_type_intrinsic) + progress |= lower_discard_if_instr( + nir_instr_as_intrinsic(instr), + &builder); + } + } + + nir_metadata_preserve(function->impl, nir_metadata_dominance); + } + } + + return progress; +} + +static const struct nir_shader_compiler_options nir_options = { + .lower_all_io_to_temps = true, + .lower_ffma = true, + .lower_flrp32 = true, + .lower_fpow = true, + .lower_fsat = true, +}; + +const void * +zink_get_compiler_options(struct pipe_screen *screen, + enum pipe_shader_ir ir, + enum pipe_shader_type shader) +{ + assert(ir == PIPE_SHADER_IR_NIR); + return &nir_options; +} + +struct nir_shader * +zink_tgsi_to_nir(struct pipe_screen *screen, const struct tgsi_token *tokens) +{ + if (zink_debug & ZINK_DEBUG_TGSI) { + fprintf(stderr, "TGSI shader:\n---8<---\n"); + tgsi_dump_to_file(tokens, 0, stderr); + fprintf(stderr, "---8<---\n\n"); + } + + return tgsi_to_nir(tokens, screen); +} + +static void +optimize_nir(struct nir_shader *s) +{ + bool progress; + do { + progress = false; + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); + } while (progress); +} + +static uint32_t +zink_binding(enum pipe_shader_type stage, VkDescriptorType type, int index) +{ + if (stage == PIPE_SHADER_COMPUTE) { + unreachable("not supported"); + } else { + uint32_t stage_offset = (uint32_t)stage * (PIPE_MAX_CONSTANT_BUFFERS + + PIPE_MAX_SHADER_SAMPLER_VIEWS); + + switch (type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + assert(index < PIPE_MAX_CONSTANT_BUFFERS); + return stage_offset + index; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + assert(index < PIPE_MAX_SHADER_SAMPLER_VIEWS); + return stage_offset + PIPE_MAX_CONSTANT_BUFFERS + index; + + default: + unreachable("unexpected type"); + } + } +} + +struct zink_shader * +zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir) +{ + struct zink_shader *ret = CALLOC_STRUCT(zink_shader); + + NIR_PASS_V(nir, lower_uniforms_to_ubo); + NIR_PASS_V(nir, nir_lower_clip_halfz); + NIR_PASS_V(nir, nir_lower_regs_to_ssa); + optimize_nir(nir); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); + NIR_PASS_V(nir, lower_discard_if); + NIR_PASS_V(nir, nir_convert_from_ssa, true); + + if (zink_debug & ZINK_DEBUG_NIR) { + fprintf(stderr, "NIR shader:\n---8<---\n"); + nir_print_shader(nir, stderr); + fprintf(stderr, "---8<---\n"); + } + + enum pipe_shader_type stage = pipe_shader_type_from_mesa(nir->info.stage); + + ret->num_bindings = 0; + nir_foreach_variable(var, &nir->uniforms) { + if (var->data.mode == nir_var_mem_ubo) { + ret->bindings[ret->num_bindings].index = var->data.binding; + var->data.binding = zink_binding(stage, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, var->data.binding); + ret->bindings[ret->num_bindings].binding = var->data.binding; + ret->bindings[ret->num_bindings].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + ret->num_bindings++; + } else { + assert(var->data.mode == nir_var_uniform); + if (glsl_type_is_array(var->type) && + glsl_type_is_sampler(glsl_get_array_element(var->type))) { + for (int i = 0; i < glsl_get_length(var->type); ++i) { + ret->bindings[ret->num_bindings].index = var->data.driver_location + i; + var->data.binding = zink_binding(stage, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, var->data.driver_location + i); + ret->bindings[ret->num_bindings].binding = var->data.binding; + ret->bindings[ret->num_bindings].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + ret->num_bindings++; + } + } else if (glsl_type_is_sampler(var->type)) { + ret->bindings[ret->num_bindings].index = var->data.driver_location; + var->data.binding = zink_binding(stage, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, var->data.driver_location); + ret->bindings[ret->num_bindings].binding = var->data.binding; + ret->bindings[ret->num_bindings].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + ret->num_bindings++; + } + } + } + + ret->info = nir->info; + + struct spirv_shader *spirv = nir_to_spirv(nir); + assert(spirv); + + if (zink_debug & ZINK_DEBUG_SPIRV) { + char buf[256]; + static int i; + snprintf(buf, sizeof(buf), "dump%02d.spv", i++); + FILE *fp = fopen(buf, "wb"); + if (fp) { + fwrite(spirv->words, sizeof(uint32_t), spirv->num_words, fp); + fclose(fp); + fprintf(stderr, "wrote '%s'...\n", buf); + } + } + + VkShaderModuleCreateInfo smci = {}; + smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + smci.codeSize = spirv->num_words * sizeof(uint32_t); + smci.pCode = spirv->words; + + if (vkCreateShaderModule(screen->dev, &smci, NULL, &ret->shader_module) != VK_SUCCESS) + return NULL; + + return ret; +} + +void +zink_shader_free(struct zink_screen *screen, struct zink_shader *shader) +{ + vkDestroyShaderModule(screen->dev, shader->shader_module, NULL); + FREE(shader); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_compiler.h mesa-20.0.8/src/gallium/drivers/zink/zink_compiler.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_compiler.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_compiler.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,69 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_COMPILER_H +#define ZINK_COMPILER_H + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "compiler/shader_info.h" + +#include + +struct pipe_screen; +struct zink_screen; + +struct nir_shader_compiler_options; +struct nir_shader; + +struct tgsi_token; + +const void * +zink_get_compiler_options(struct pipe_screen *screen, + enum pipe_shader_ir ir, + enum pipe_shader_type shader); + +struct nir_shader * +zink_tgsi_to_nir(struct pipe_screen *screen, const struct tgsi_token *tokens); + +struct zink_shader { + VkShaderModule shader_module; + + shader_info info; + + struct { + int index; + int binding; + VkDescriptorType type; + } bindings[PIPE_MAX_CONSTANT_BUFFERS + PIPE_MAX_SHADER_SAMPLER_VIEWS]; + size_t num_bindings; +}; + +struct zink_shader * +zink_compile_nir(struct zink_screen *screen, struct nir_shader *nir); + +void +zink_shader_free(struct zink_screen *screen, struct zink_shader *shader); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_context.c mesa-20.0.8/src/gallium/drivers/zink/zink_context.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_context.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1188 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_context.h" + +#include "zink_batch.h" +#include "zink_compiler.h" +#include "zink_fence.h" +#include "zink_framebuffer.h" +#include "zink_helpers.h" +#include "zink_pipeline.h" +#include "zink_render_pass.h" +#include "zink_resource.h" +#include "zink_screen.h" +#include "zink_state.h" +#include "zink_surface.h" + +#include "indices/u_primconvert.h" +#include "util/u_blitter.h" +#include "util/u_debug.h" +#include "util/format/u_format.h" +#include "util/u_framebuffer.h" +#include "util/u_helpers.h" +#include "util/u_inlines.h" + +#include "nir.h" + +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" + +static void +zink_context_destroy(struct pipe_context *pctx) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); + + if (vkQueueWaitIdle(ctx->queue) != VK_SUCCESS) + debug_printf("vkQueueWaitIdle failed\n"); + + for (int i = 0; i < ARRAY_SIZE(ctx->batches); ++i) + vkFreeCommandBuffers(screen->dev, ctx->cmdpool, 1, &ctx->batches[i].cmdbuf); + vkDestroyCommandPool(screen->dev, ctx->cmdpool, NULL); + + util_primconvert_destroy(ctx->primconvert); + u_upload_destroy(pctx->stream_uploader); + slab_destroy_child(&ctx->transfer_pool); + util_blitter_destroy(ctx->blitter); + FREE(ctx); +} + +static VkSamplerMipmapMode +sampler_mipmap_mode(enum pipe_tex_mipfilter filter) +{ + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: return VK_SAMPLER_MIPMAP_MODE_NEAREST; + case PIPE_TEX_MIPFILTER_LINEAR: return VK_SAMPLER_MIPMAP_MODE_LINEAR; + case PIPE_TEX_MIPFILTER_NONE: + unreachable("PIPE_TEX_MIPFILTER_NONE should be dealt with earlier"); + } + unreachable("unexpected filter"); +} + +static VkSamplerAddressMode +sampler_address_mode(enum pipe_tex_wrap filter) +{ + switch (filter) { + case PIPE_TEX_WRAP_REPEAT: return VK_SAMPLER_ADDRESS_MODE_REPEAT; + case PIPE_TEX_WRAP_CLAMP: return VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; /* not technically correct, but kinda works */ + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: return VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; /* not technically correct, but kinda works */ + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; /* not technically correct, but kinda works */ + } + unreachable("unexpected wrap"); +} + +static VkCompareOp +compare_op(enum pipe_compare_func op) +{ + switch (op) { + case PIPE_FUNC_NEVER: return VK_COMPARE_OP_NEVER; + case PIPE_FUNC_LESS: return VK_COMPARE_OP_LESS; + case PIPE_FUNC_EQUAL: return VK_COMPARE_OP_EQUAL; + case PIPE_FUNC_LEQUAL: return VK_COMPARE_OP_LESS_OR_EQUAL; + case PIPE_FUNC_GREATER: return VK_COMPARE_OP_GREATER; + case PIPE_FUNC_NOTEQUAL: return VK_COMPARE_OP_NOT_EQUAL; + case PIPE_FUNC_GEQUAL: return VK_COMPARE_OP_GREATER_OR_EQUAL; + case PIPE_FUNC_ALWAYS: return VK_COMPARE_OP_ALWAYS; + } + unreachable("unexpected compare"); +} + +static void * +zink_create_sampler_state(struct pipe_context *pctx, + const struct pipe_sampler_state *state) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + + VkSamplerCreateInfo sci = {}; + sci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + sci.magFilter = zink_filter(state->mag_img_filter); + sci.minFilter = zink_filter(state->min_img_filter); + + if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + sci.mipmapMode = sampler_mipmap_mode(state->min_mip_filter); + sci.minLod = state->min_lod; + sci.maxLod = state->max_lod; + } else { + sci.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + sci.minLod = 0; + sci.maxLod = 0; + } + + sci.addressModeU = sampler_address_mode(state->wrap_s); + sci.addressModeV = sampler_address_mode(state->wrap_t); + sci.addressModeW = sampler_address_mode(state->wrap_r); + sci.mipLodBias = state->lod_bias; + + if (state->compare_mode == PIPE_TEX_COMPARE_NONE) + sci.compareOp = VK_COMPARE_OP_NEVER; + else { + sci.compareOp = compare_op(state->compare_func); + sci.compareEnable = VK_TRUE; + } + + sci.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; // TODO + sci.unnormalizedCoordinates = !state->normalized_coords; + + if (state->max_anisotropy > 1) { + sci.maxAnisotropy = state->max_anisotropy; + sci.anisotropyEnable = VK_TRUE; + } + + VkSampler *sampler = CALLOC(1, sizeof(VkSampler)); + if (!sampler) + return NULL; + + if (vkCreateSampler(screen->dev, &sci, NULL, sampler) != VK_SUCCESS) { + FREE(sampler); + return NULL; + } + + return sampler; +} + +static void +zink_bind_sampler_states(struct pipe_context *pctx, + enum pipe_shader_type shader, + unsigned start_slot, + unsigned num_samplers, + void **samplers) +{ + struct zink_context *ctx = zink_context(pctx); + for (unsigned i = 0; i < num_samplers; ++i) { + VkSampler *sampler = samplers[i]; + ctx->sampler_states[shader][start_slot + i] = sampler; + ctx->samplers[shader][start_slot + i] = sampler ? *sampler : VK_NULL_HANDLE; + } + ctx->num_samplers[shader] = start_slot + num_samplers; +} + +static void +zink_delete_sampler_state(struct pipe_context *pctx, + void *sampler_state) +{ + struct zink_batch *batch = zink_curr_batch(zink_context(pctx)); + util_dynarray_append(&batch->zombie_samplers, VkSampler, + *(VkSampler *)sampler_state); + FREE(sampler_state); +} + + +static VkImageViewType +image_view_type(enum pipe_texture_target target) +{ + switch (target) { + case PIPE_TEXTURE_1D: return VK_IMAGE_VIEW_TYPE_1D; + case PIPE_TEXTURE_1D_ARRAY: return VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case PIPE_TEXTURE_2D: return VK_IMAGE_VIEW_TYPE_2D; + case PIPE_TEXTURE_2D_ARRAY: return VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case PIPE_TEXTURE_CUBE: return VK_IMAGE_VIEW_TYPE_CUBE; + case PIPE_TEXTURE_CUBE_ARRAY: return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; + case PIPE_TEXTURE_3D: return VK_IMAGE_VIEW_TYPE_3D; + case PIPE_TEXTURE_RECT: return VK_IMAGE_VIEW_TYPE_2D; /* not sure */ + default: + unreachable("unexpected target"); + } +} + +static VkComponentSwizzle +component_mapping(enum pipe_swizzle swizzle) +{ + switch (swizzle) { + case PIPE_SWIZZLE_X: return VK_COMPONENT_SWIZZLE_R; + case PIPE_SWIZZLE_Y: return VK_COMPONENT_SWIZZLE_G; + case PIPE_SWIZZLE_Z: return VK_COMPONENT_SWIZZLE_B; + case PIPE_SWIZZLE_W: return VK_COMPONENT_SWIZZLE_A; + case PIPE_SWIZZLE_0: return VK_COMPONENT_SWIZZLE_ZERO; + case PIPE_SWIZZLE_1: return VK_COMPONENT_SWIZZLE_ONE; + case PIPE_SWIZZLE_NONE: return VK_COMPONENT_SWIZZLE_IDENTITY; // ??? + default: + unreachable("unexpected swizzle"); + } +} + +static VkImageAspectFlags +sampler_aspect_from_format(enum pipe_format fmt) +{ + if (util_format_is_depth_or_stencil(fmt)) { + const struct util_format_description *desc = util_format_description(fmt); + if (util_format_has_depth(desc)) + return VK_IMAGE_ASPECT_DEPTH_BIT; + assert(util_format_has_stencil(desc)); + return VK_IMAGE_ASPECT_STENCIL_BIT; + } else + return VK_IMAGE_ASPECT_COLOR_BIT; +} + +static struct pipe_sampler_view * +zink_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *pres, + const struct pipe_sampler_view *state) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_resource *res = zink_resource(pres); + struct zink_sampler_view *sampler_view = CALLOC_STRUCT(zink_sampler_view); + + sampler_view->base = *state; + sampler_view->base.texture = NULL; + pipe_resource_reference(&sampler_view->base.texture, pres); + sampler_view->base.reference.count = 1; + sampler_view->base.context = pctx; + + VkImageViewCreateInfo ivci = {}; + ivci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ivci.image = res->image; + ivci.viewType = image_view_type(state->target); + ivci.format = zink_get_format(screen, state->format); + ivci.components.r = component_mapping(state->swizzle_r); + ivci.components.g = component_mapping(state->swizzle_g); + ivci.components.b = component_mapping(state->swizzle_b); + ivci.components.a = component_mapping(state->swizzle_a); + + ivci.subresourceRange.aspectMask = sampler_aspect_from_format(state->format); + ivci.subresourceRange.baseMipLevel = state->u.tex.first_level; + ivci.subresourceRange.baseArrayLayer = state->u.tex.first_layer; + ivci.subresourceRange.levelCount = state->u.tex.last_level - state->u.tex.first_level + 1; + ivci.subresourceRange.layerCount = state->u.tex.last_layer - state->u.tex.first_layer + 1; + + VkResult err = vkCreateImageView(screen->dev, &ivci, NULL, &sampler_view->image_view); + if (err != VK_SUCCESS) { + FREE(sampler_view); + return NULL; + } + + return &sampler_view->base; +} + +static void +zink_sampler_view_destroy(struct pipe_context *pctx, + struct pipe_sampler_view *pview) +{ + struct zink_sampler_view *view = zink_sampler_view(pview); + vkDestroyImageView(zink_screen(pctx->screen)->dev, view->image_view, NULL); + FREE(view); +} + +static void * +zink_create_vs_state(struct pipe_context *pctx, + const struct pipe_shader_state *shader) +{ + struct nir_shader *nir; + if (shader->type != PIPE_SHADER_IR_NIR) + nir = zink_tgsi_to_nir(pctx->screen, shader->tokens); + else + nir = (struct nir_shader *)shader->ir.nir; + + return zink_compile_nir(zink_screen(pctx->screen), nir); +} + +static void +bind_stage(struct zink_context *ctx, enum pipe_shader_type stage, + struct zink_shader *shader) +{ + assert(stage < PIPE_SHADER_COMPUTE); + ctx->gfx_stages[stage] = shader; + ctx->dirty_program = true; +} + +static void +zink_bind_vs_state(struct pipe_context *pctx, + void *cso) +{ + bind_stage(zink_context(pctx), PIPE_SHADER_VERTEX, cso); +} + +static void +zink_delete_vs_state(struct pipe_context *pctx, + void *cso) +{ + zink_shader_free(zink_screen(pctx->screen), cso); +} + +static void * +zink_create_fs_state(struct pipe_context *pctx, + const struct pipe_shader_state *shader) +{ + struct nir_shader *nir; + if (shader->type != PIPE_SHADER_IR_NIR) + nir = zink_tgsi_to_nir(pctx->screen, shader->tokens); + else + nir = (struct nir_shader *)shader->ir.nir; + + return zink_compile_nir(zink_screen(pctx->screen), nir); +} + +static void +zink_bind_fs_state(struct pipe_context *pctx, + void *cso) +{ + bind_stage(zink_context(pctx), PIPE_SHADER_FRAGMENT, cso); +} + +static void +zink_delete_fs_state(struct pipe_context *pctx, + void *cso) +{ + zink_shader_free(zink_screen(pctx->screen), cso); +} + +static void +zink_set_polygon_stipple(struct pipe_context *pctx, + const struct pipe_poly_stipple *ps) +{ +} + +static void +zink_set_vertex_buffers(struct pipe_context *pctx, + unsigned start_slot, + unsigned num_buffers, + const struct pipe_vertex_buffer *buffers) +{ + struct zink_context *ctx = zink_context(pctx); + + if (buffers) { + for (int i = 0; i < num_buffers; ++i) { + const struct pipe_vertex_buffer *vb = buffers + i; + ctx->gfx_pipeline_state.bindings[start_slot + i].stride = vb->stride; + } + } + + util_set_vertex_buffers_mask(ctx->buffers, &ctx->buffers_enabled_mask, + buffers, start_slot, num_buffers); +} + +static void +zink_set_viewport_states(struct pipe_context *pctx, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *state) +{ + struct zink_context *ctx = zink_context(pctx); + + for (unsigned i = 0; i < num_viewports; ++i) { + VkViewport viewport = { + state[i].translate[0] - state[i].scale[0], + state[i].translate[1] - state[i].scale[1], + state[i].scale[0] * 2, + state[i].scale[1] * 2, + state[i].translate[2] - state[i].scale[2], + state[i].translate[2] + state[i].scale[2] + }; + ctx->viewport_states[start_slot + i] = state[i]; + ctx->viewports[start_slot + i] = viewport; + } + ctx->num_viewports = start_slot + num_viewports; +} + +static void +zink_set_scissor_states(struct pipe_context *pctx, + unsigned start_slot, unsigned num_scissors, + const struct pipe_scissor_state *states) +{ + struct zink_context *ctx = zink_context(pctx); + + for (unsigned i = 0; i < num_scissors; i++) { + VkRect2D scissor; + + scissor.offset.x = states[i].minx; + scissor.offset.y = states[i].miny; + scissor.extent.width = states[i].maxx - states[i].minx; + scissor.extent.height = states[i].maxy - states[i].miny; + ctx->scissor_states[start_slot + i] = states[i]; + ctx->scissors[start_slot + i] = scissor; + } +} + +static void +zink_set_constant_buffer(struct pipe_context *pctx, + enum pipe_shader_type shader, uint index, + const struct pipe_constant_buffer *cb) +{ + struct zink_context *ctx = zink_context(pctx); + + if (cb) { + struct pipe_resource *buffer = cb->buffer; + unsigned offset = cb->buffer_offset; + if (cb->user_buffer) { + struct zink_screen *screen = zink_screen(pctx->screen); + u_upload_data(ctx->base.const_uploader, 0, cb->buffer_size, + screen->props.limits.minUniformBufferOffsetAlignment, + cb->user_buffer, &offset, &buffer); + } + + pipe_resource_reference(&ctx->ubos[shader][index].buffer, buffer); + ctx->ubos[shader][index].buffer_offset = offset; + ctx->ubos[shader][index].buffer_size = cb->buffer_size; + ctx->ubos[shader][index].user_buffer = NULL; + + if (cb->user_buffer) + pipe_resource_reference(&buffer, NULL); + } else { + pipe_resource_reference(&ctx->ubos[shader][index].buffer, NULL); + ctx->ubos[shader][index].buffer_offset = 0; + ctx->ubos[shader][index].buffer_size = 0; + ctx->ubos[shader][index].user_buffer = NULL; + } +} + +static void +zink_set_sampler_views(struct pipe_context *pctx, + enum pipe_shader_type shader_type, + unsigned start_slot, + unsigned num_views, + struct pipe_sampler_view **views) +{ + struct zink_context *ctx = zink_context(pctx); + assert(views); + for (unsigned i = 0; i < num_views; ++i) { + pipe_sampler_view_reference( + &ctx->image_views[shader_type][start_slot + i], + views[i]); + } + ctx->num_image_views[shader_type] = start_slot + num_views; +} + +static void +zink_set_stencil_ref(struct pipe_context *pctx, + const struct pipe_stencil_ref *ref) +{ + struct zink_context *ctx = zink_context(pctx); + ctx->stencil_ref = *ref; +} + +static void +zink_set_clip_state(struct pipe_context *pctx, + const struct pipe_clip_state *pcs) +{ +} + +static struct zink_render_pass * +get_render_pass(struct zink_context *ctx) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + const struct pipe_framebuffer_state *fb = &ctx->fb_state; + struct zink_render_pass_state state; + + for (int i = 0; i < fb->nr_cbufs; i++) { + struct pipe_resource *res = fb->cbufs[i]->texture; + state.rts[i].format = zink_get_format(screen, fb->cbufs[i]->format); + state.rts[i].samples = res->nr_samples > 0 ? res->nr_samples : + VK_SAMPLE_COUNT_1_BIT; + } + state.num_cbufs = fb->nr_cbufs; + + if (fb->zsbuf) { + struct zink_resource *zsbuf = zink_resource(fb->zsbuf->texture); + state.rts[fb->nr_cbufs].format = zsbuf->format; + state.rts[fb->nr_cbufs].samples = zsbuf->base.nr_samples > 0 ? zsbuf->base.nr_samples : VK_SAMPLE_COUNT_1_BIT; + } + state.have_zsbuf = fb->zsbuf != NULL; + + struct hash_entry *entry = _mesa_hash_table_search(ctx->render_pass_cache, + &state); + if (!entry) { + struct zink_render_pass *rp; + rp = zink_create_render_pass(screen, &state); + entry = _mesa_hash_table_insert(ctx->render_pass_cache, &state, rp); + if (!entry) + return NULL; + } + + return entry->data; +} + +static struct zink_framebuffer * +get_framebuffer(struct zink_context *ctx) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + + struct zink_framebuffer_state state = {}; + state.rp = get_render_pass(ctx); + for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { + struct pipe_surface *psurf = ctx->fb_state.cbufs[i]; + state.attachments[i] = zink_surface(psurf); + } + + state.num_attachments = ctx->fb_state.nr_cbufs; + if (ctx->fb_state.zsbuf) { + struct pipe_surface *psurf = ctx->fb_state.zsbuf; + state.attachments[state.num_attachments++] = zink_surface(psurf); + } + + state.width = ctx->fb_state.width; + state.height = ctx->fb_state.height; + state.layers = MAX2(ctx->fb_state.layers, 1); + + struct hash_entry *entry = _mesa_hash_table_search(ctx->framebuffer_cache, + &state); + if (!entry) { + struct zink_framebuffer *fb = zink_create_framebuffer(screen, &state); + entry = _mesa_hash_table_insert(ctx->framebuffer_cache, &state, fb); + if (!entry) + return NULL; + } + + return entry->data; +} + +void +zink_begin_render_pass(struct zink_context *ctx, struct zink_batch *batch) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + assert(batch == zink_curr_batch(ctx)); + assert(ctx->gfx_pipeline_state.render_pass); + + struct pipe_framebuffer_state *fb_state = &ctx->fb_state; + + VkRenderPassBeginInfo rpbi = {}; + rpbi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + rpbi.renderPass = ctx->gfx_pipeline_state.render_pass->render_pass; + rpbi.renderArea.offset.x = 0; + rpbi.renderArea.offset.y = 0; + rpbi.renderArea.extent.width = fb_state->width; + rpbi.renderArea.extent.height = fb_state->height; + rpbi.clearValueCount = 0; + rpbi.pClearValues = NULL; + rpbi.framebuffer = ctx->framebuffer->fb; + + assert(ctx->gfx_pipeline_state.render_pass && ctx->framebuffer); + assert(!batch->rp || batch->rp == ctx->gfx_pipeline_state.render_pass); + assert(!batch->fb || batch->fb == ctx->framebuffer); + + for (int i = 0; i < fb_state->nr_cbufs; i++) { + struct zink_resource *res = zink_resource(fb_state->cbufs[i]->texture); + if (res->layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + } + + if (fb_state->zsbuf) { + struct zink_resource *res = zink_resource(fb_state->zsbuf->texture); + if (res->layout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); + } + + zink_render_pass_reference(screen, &batch->rp, ctx->gfx_pipeline_state.render_pass); + zink_framebuffer_reference(screen, &batch->fb, ctx->framebuffer); + + vkCmdBeginRenderPass(batch->cmdbuf, &rpbi, VK_SUBPASS_CONTENTS_INLINE); +} + +static void +flush_batch(struct zink_context *ctx) +{ + struct zink_batch *batch = zink_curr_batch(ctx); + if (batch->rp) + vkCmdEndRenderPass(batch->cmdbuf); + + zink_end_batch(ctx, batch); + + ctx->curr_batch++; + if (ctx->curr_batch == ARRAY_SIZE(ctx->batches)) + ctx->curr_batch = 0; + + zink_start_batch(ctx, zink_curr_batch(ctx)); +} + +struct zink_batch * +zink_batch_rp(struct zink_context *ctx) +{ + struct zink_batch *batch = zink_curr_batch(ctx); + if (!batch->rp) { + zink_begin_render_pass(ctx, batch); + assert(batch->rp); + } + return batch; +} + +struct zink_batch * +zink_batch_no_rp(struct zink_context *ctx) +{ + struct zink_batch *batch = zink_curr_batch(ctx); + if (batch->rp) { + /* flush batch and get a new one */ + flush_batch(ctx); + batch = zink_curr_batch(ctx); + assert(!batch->rp); + } + return batch; +} + +static void +zink_set_framebuffer_state(struct pipe_context *pctx, + const struct pipe_framebuffer_state *state) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); + + VkSampleCountFlagBits rast_samples = VK_SAMPLE_COUNT_1_BIT; + for (int i = 0; i < state->nr_cbufs; i++) + rast_samples = MAX2(rast_samples, state->cbufs[i]->texture->nr_samples); + if (state->zsbuf && state->zsbuf->texture->nr_samples) + rast_samples = MAX2(rast_samples, state->zsbuf->texture->nr_samples); + + util_copy_framebuffer_state(&ctx->fb_state, state); + + struct zink_framebuffer *fb = get_framebuffer(ctx); + zink_framebuffer_reference(screen, &ctx->framebuffer, fb); + zink_render_pass_reference(screen, &ctx->gfx_pipeline_state.render_pass, fb->rp); + + ctx->gfx_pipeline_state.rast_samples = rast_samples; + ctx->gfx_pipeline_state.num_attachments = state->nr_cbufs; + + struct zink_batch *batch = zink_batch_no_rp(ctx); + + for (int i = 0; i < state->nr_cbufs; i++) { + struct zink_resource *res = zink_resource(state->cbufs[i]->texture); + if (res->layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); + } + + if (state->zsbuf) { + struct zink_resource *res = zink_resource(state->zsbuf->texture); + if (res->layout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); + } +} + +static void +zink_set_blend_color(struct pipe_context *pctx, + const struct pipe_blend_color *color) +{ + struct zink_context *ctx = zink_context(pctx); + memcpy(ctx->blend_constants, color->color, sizeof(float) * 4); +} + +static void +zink_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask) +{ + struct zink_context *ctx = zink_context(pctx); + ctx->gfx_pipeline_state.sample_mask = sample_mask; +} + +static VkAccessFlags +access_src_flags(VkImageLayout layout) +{ + switch (layout) { + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_GENERAL: + return 0; + + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + return VK_ACCESS_SHADER_READ_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return VK_ACCESS_TRANSFER_READ_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return VK_ACCESS_TRANSFER_WRITE_BIT; + + case VK_IMAGE_LAYOUT_PREINITIALIZED: + return VK_ACCESS_HOST_WRITE_BIT; + + default: + unreachable("unexpected layout"); + } +} + +static VkAccessFlags +access_dst_flags(VkImageLayout layout) +{ + switch (layout) { + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_GENERAL: + return 0; + + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return VK_ACCESS_TRANSFER_READ_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return VK_ACCESS_TRANSFER_WRITE_BIT; + + default: + unreachable("unexpected layout"); + } +} + +static VkPipelineStageFlags +pipeline_dst_stage(VkImageLayout layout) +{ + switch (layout) { + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return VK_PIPELINE_STAGE_TRANSFER_BIT; + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return VK_PIPELINE_STAGE_TRANSFER_BIT; + + default: + return VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + } +} + +static VkPipelineStageFlags +pipeline_src_stage(VkImageLayout layout) +{ + switch (layout) { + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return VK_PIPELINE_STAGE_TRANSFER_BIT; + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return VK_PIPELINE_STAGE_TRANSFER_BIT; + + default: + return VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } +} + + +void +zink_resource_barrier(VkCommandBuffer cmdbuf, struct zink_resource *res, + VkImageAspectFlags aspect, VkImageLayout new_layout) +{ + VkImageSubresourceRange isr = { + aspect, + 0, VK_REMAINING_MIP_LEVELS, + 0, VK_REMAINING_ARRAY_LAYERS + }; + + VkImageMemoryBarrier imb = { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + NULL, + access_src_flags(res->layout), + access_dst_flags(new_layout), + res->layout, + new_layout, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + res->image, + isr + }; + vkCmdPipelineBarrier( + cmdbuf, + pipeline_src_stage(res->layout), + pipeline_dst_stage(new_layout), + 0, + 0, NULL, + 0, NULL, + 1, &imb + ); + + res->layout = new_layout; +} + +static void +zink_clear(struct pipe_context *pctx, + unsigned buffers, + const union pipe_color_union *pcolor, + double depth, unsigned stencil) +{ + struct zink_context *ctx = zink_context(pctx); + struct pipe_framebuffer_state *fb = &ctx->fb_state; + + /* FIXME: this is very inefficient; if no renderpass has been started yet, + * we should record the clear if it's full-screen, and apply it as we + * start the render-pass. Otherwise we can do a partial out-of-renderpass + * clear. + */ + struct zink_batch *batch = zink_batch_rp(ctx); + + VkClearAttachment attachments[1 + PIPE_MAX_COLOR_BUFS]; + int num_attachments = 0; + + if (buffers & PIPE_CLEAR_COLOR) { + VkClearColorValue color; + color.float32[0] = pcolor->f[0]; + color.float32[1] = pcolor->f[1]; + color.float32[2] = pcolor->f[2]; + color.float32[3] = pcolor->f[3]; + + for (unsigned i = 0; i < fb->nr_cbufs; i++) { + if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i]) + continue; + + attachments[num_attachments].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + attachments[num_attachments].colorAttachment = i; + attachments[num_attachments].clearValue.color = color; + ++num_attachments; + } + } + + if (buffers & PIPE_CLEAR_DEPTHSTENCIL && fb->zsbuf) { + VkImageAspectFlags aspect = 0; + if (buffers & PIPE_CLEAR_DEPTH) + aspect |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (buffers & PIPE_CLEAR_STENCIL) + aspect |= VK_IMAGE_ASPECT_STENCIL_BIT; + + attachments[num_attachments].aspectMask = aspect; + attachments[num_attachments].clearValue.depthStencil.depth = depth; + attachments[num_attachments].clearValue.depthStencil.stencil = stencil; + ++num_attachments; + } + + VkClearRect cr; + cr.rect.offset.x = 0; + cr.rect.offset.y = 0; + cr.rect.extent.width = fb->width; + cr.rect.extent.height = fb->height; + cr.baseArrayLayer = 0; + cr.layerCount = util_framebuffer_get_num_layers(fb); + vkCmdClearAttachments(batch->cmdbuf, num_attachments, attachments, 1, &cr); +} + +VkShaderStageFlagBits +zink_shader_stage(enum pipe_shader_type type) +{ + VkShaderStageFlagBits stages[] = { + [PIPE_SHADER_VERTEX] = VK_SHADER_STAGE_VERTEX_BIT, + [PIPE_SHADER_FRAGMENT] = VK_SHADER_STAGE_FRAGMENT_BIT, + [PIPE_SHADER_GEOMETRY] = VK_SHADER_STAGE_GEOMETRY_BIT, + [PIPE_SHADER_TESS_CTRL] = VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, + [PIPE_SHADER_TESS_EVAL] = VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, + [PIPE_SHADER_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, + }; + return stages[type]; +} + +static uint32_t +hash_gfx_program(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct zink_shader *) * (PIPE_SHADER_TYPES - 1)); +} + +static bool +equals_gfx_program(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(struct zink_shader *) * (PIPE_SHADER_TYPES - 1)) == 0; +} + +static uint32_t +hash_render_pass_state(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct zink_render_pass_state)); +} + +static bool +equals_render_pass_state(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(struct zink_render_pass_state)) == 0; +} + +static uint32_t +hash_framebuffer_state(const void *key) +{ + struct zink_framebuffer_state *s = (struct zink_framebuffer_state*)key; + return _mesa_hash_data(key, sizeof(struct zink_framebuffer_state) + sizeof(s->attachments) * s->num_attachments); +} + +static bool +equals_framebuffer_state(const void *a, const void *b) +{ + struct zink_framebuffer_state *s = (struct zink_framebuffer_state*)a; + return memcmp(a, b, sizeof(struct zink_framebuffer_state) + sizeof(s->attachments) * s->num_attachments) == 0; +} + +static void +zink_flush(struct pipe_context *pctx, + struct pipe_fence_handle **pfence, + enum pipe_flush_flags flags) +{ + struct zink_context *ctx = zink_context(pctx); + + struct zink_batch *batch = zink_curr_batch(ctx); + flush_batch(ctx); + + if (pfence) + zink_fence_reference(zink_screen(pctx->screen), + (struct zink_fence **)pfence, + batch->fence); + + /* HACK: + * For some strange reason, we need to finish before presenting, or else + * we start rendering on top of the back-buffer for the next frame. This + * seems like a bug in the DRI-driver to me, because we really should + * be properly protected by fences here, and the back-buffer should + * either be swapped with the front-buffer, or blitted from. But for + * some strange reason, neither of these things happen. + */ + if (flags & PIPE_FLUSH_END_OF_FRAME) + pctx->screen->fence_finish(pctx->screen, pctx, + (struct pipe_fence_handle *)batch->fence, + PIPE_TIMEOUT_INFINITE); +} + +static void +zink_flush_resource(struct pipe_context *pipe, + struct pipe_resource *resource) +{ +} + +static void +zink_resource_copy_region(struct pipe_context *pctx, + struct pipe_resource *pdst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *psrc, + unsigned src_level, const struct pipe_box *src_box) +{ + struct zink_resource *dst = zink_resource(pdst); + struct zink_resource *src = zink_resource(psrc); + struct zink_context *ctx = zink_context(pctx); + if (dst->base.target != PIPE_BUFFER && src->base.target != PIPE_BUFFER) { + VkImageCopy region = {}; + + region.srcSubresource.aspectMask = src->aspect; + region.srcSubresource.mipLevel = src_level; + region.srcSubresource.layerCount = 1; + if (src->base.array_size > 1) { + region.srcSubresource.baseArrayLayer = src_box->z; + region.srcSubresource.layerCount = src_box->depth; + region.extent.depth = 1; + } else { + region.srcOffset.z = src_box->z; + region.srcSubresource.layerCount = 1; + region.extent.depth = src_box->depth; + } + + region.srcOffset.x = src_box->x; + region.srcOffset.y = src_box->y; + + region.dstSubresource.aspectMask = dst->aspect; + region.dstSubresource.mipLevel = dst_level; + if (dst->base.array_size > 1) { + region.dstSubresource.baseArrayLayer = dstz; + region.dstSubresource.layerCount = src_box->depth; + } else { + region.dstOffset.z = dstz; + region.dstSubresource.layerCount = 1; + } + + region.dstOffset.x = dstx; + region.dstOffset.y = dsty; + region.extent.width = src_box->width; + region.extent.height = src_box->height; + + struct zink_batch *batch = zink_batch_no_rp(ctx); + zink_batch_reference_resoure(batch, src); + zink_batch_reference_resoure(batch, dst); + + if (src->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) { + zink_resource_barrier(batch->cmdbuf, src, src->aspect, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + } + + if (dst->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + zink_resource_barrier(batch->cmdbuf, dst, dst->aspect, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + } + + vkCmdCopyImage(batch->cmdbuf, src->image, src->layout, + dst->image, dst->layout, + 1, ®ion); + } else if (dst->base.target == PIPE_BUFFER && + src->base.target == PIPE_BUFFER) { + VkBufferCopy region; + region.srcOffset = src_box->x; + region.dstOffset = dstx; + region.size = src_box->width; + + struct zink_batch *batch = zink_batch_no_rp(ctx); + zink_batch_reference_resoure(batch, src); + zink_batch_reference_resoure(batch, dst); + + vkCmdCopyBuffer(batch->cmdbuf, src->buffer, dst->buffer, 1, ®ion); + } else + debug_printf("zink: TODO resource copy\n"); +} + +struct pipe_context * +zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) +{ + struct zink_screen *screen = zink_screen(pscreen); + struct zink_context *ctx = CALLOC_STRUCT(zink_context); + if (!ctx) + goto fail; + + ctx->base.screen = pscreen; + ctx->base.priv = priv; + + ctx->base.destroy = zink_context_destroy; + + zink_context_state_init(&ctx->base); + + ctx->base.create_sampler_state = zink_create_sampler_state; + ctx->base.bind_sampler_states = zink_bind_sampler_states; + ctx->base.delete_sampler_state = zink_delete_sampler_state; + + ctx->base.create_sampler_view = zink_create_sampler_view; + ctx->base.set_sampler_views = zink_set_sampler_views; + ctx->base.sampler_view_destroy = zink_sampler_view_destroy; + + ctx->base.create_vs_state = zink_create_vs_state; + ctx->base.bind_vs_state = zink_bind_vs_state; + ctx->base.delete_vs_state = zink_delete_vs_state; + + ctx->base.create_fs_state = zink_create_fs_state; + ctx->base.bind_fs_state = zink_bind_fs_state; + ctx->base.delete_fs_state = zink_delete_fs_state; + + ctx->base.set_polygon_stipple = zink_set_polygon_stipple; + ctx->base.set_vertex_buffers = zink_set_vertex_buffers; + ctx->base.set_viewport_states = zink_set_viewport_states; + ctx->base.set_scissor_states = zink_set_scissor_states; + ctx->base.set_constant_buffer = zink_set_constant_buffer; + ctx->base.set_framebuffer_state = zink_set_framebuffer_state; + ctx->base.set_stencil_ref = zink_set_stencil_ref; + ctx->base.set_clip_state = zink_set_clip_state; + ctx->base.set_blend_color = zink_set_blend_color; + + ctx->base.set_sample_mask = zink_set_sample_mask; + + ctx->base.clear = zink_clear; + ctx->base.draw_vbo = zink_draw_vbo; + ctx->base.flush = zink_flush; + + ctx->base.resource_copy_region = zink_resource_copy_region; + ctx->base.blit = zink_blit; + + ctx->base.flush_resource = zink_flush_resource; + zink_context_surface_init(&ctx->base); + zink_context_resource_init(&ctx->base); + zink_context_query_init(&ctx->base); + + slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); + + ctx->base.stream_uploader = u_upload_create_default(&ctx->base); + ctx->base.const_uploader = ctx->base.stream_uploader; + + int prim_hwsupport = 1 << PIPE_PRIM_POINTS | + 1 << PIPE_PRIM_LINES | + 1 << PIPE_PRIM_LINE_STRIP | + 1 << PIPE_PRIM_TRIANGLES | + 1 << PIPE_PRIM_TRIANGLE_STRIP | + 1 << PIPE_PRIM_TRIANGLE_FAN; + + ctx->primconvert = util_primconvert_create(&ctx->base, prim_hwsupport); + if (!ctx->primconvert) + goto fail; + + ctx->blitter = util_blitter_create(&ctx->base); + if (!ctx->blitter) + goto fail; + + VkCommandPoolCreateInfo cpci = {}; + cpci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + cpci.queueFamilyIndex = screen->gfx_queue; + cpci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + if (vkCreateCommandPool(screen->dev, &cpci, NULL, &ctx->cmdpool) != VK_SUCCESS) + goto fail; + + VkCommandBufferAllocateInfo cbai = {}; + cbai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + cbai.commandPool = ctx->cmdpool; + cbai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + cbai.commandBufferCount = 1; + + VkDescriptorPoolSize sizes[] = { + {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, ZINK_BATCH_DESC_SIZE} + }; + VkDescriptorPoolCreateInfo dpci = {}; + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.pPoolSizes = sizes; + dpci.poolSizeCount = ARRAY_SIZE(sizes); + dpci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + dpci.maxSets = ZINK_BATCH_DESC_SIZE; + + for (int i = 0; i < ARRAY_SIZE(ctx->batches); ++i) { + if (vkAllocateCommandBuffers(screen->dev, &cbai, &ctx->batches[i].cmdbuf) != VK_SUCCESS) + goto fail; + + ctx->batches[i].resources = _mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + ctx->batches[i].sampler_views = _mesa_set_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + if (!ctx->batches[i].resources || !ctx->batches[i].sampler_views) + goto fail; + + util_dynarray_init(&ctx->batches[i].zombie_samplers, NULL); + + if (vkCreateDescriptorPool(screen->dev, &dpci, 0, + &ctx->batches[i].descpool) != VK_SUCCESS) + goto fail; + } + + vkGetDeviceQueue(screen->dev, screen->gfx_queue, 0, &ctx->queue); + + ctx->program_cache = _mesa_hash_table_create(NULL, + hash_gfx_program, + equals_gfx_program); + ctx->render_pass_cache = _mesa_hash_table_create(NULL, + hash_render_pass_state, + equals_render_pass_state); + ctx->framebuffer_cache = _mesa_hash_table_create(NULL, + hash_framebuffer_state, + equals_framebuffer_state); + + if (!ctx->program_cache || !ctx->render_pass_cache || + !ctx->framebuffer_cache) + goto fail; + + ctx->dirty_program = true; + + /* start the first batch */ + zink_start_batch(ctx, zink_curr_batch(ctx)); + + return &ctx->base; + +fail: + if (ctx) { + vkDestroyCommandPool(screen->dev, ctx->cmdpool, NULL); + FREE(ctx); + } + return NULL; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_context.h mesa-20.0.8/src/gallium/drivers/zink/zink_context.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_context.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,160 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_CONTEXT_H +#define ZINK_CONTEXT_H + +#include "zink_pipeline.h" +#include "zink_batch.h" + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "util/slab.h" +#include "util/list.h" + +#include + +struct blitter_context; +struct primconvert_context; +struct list_head; + +struct zink_blend_state; +struct zink_depth_stencil_alpha_state; +struct zink_gfx_program; +struct zink_rasterizer_state; +struct zink_resource; +struct zink_vertex_elements_state; + +struct zink_sampler_view { + struct pipe_sampler_view base; + VkImageView image_view; +}; + +static inline struct zink_sampler_view * +zink_sampler_view(struct pipe_sampler_view *pview) +{ + return (struct zink_sampler_view *)pview; +} + +struct zink_context { + struct pipe_context base; + struct slab_child_pool transfer_pool; + struct blitter_context *blitter; + + VkCommandPool cmdpool; + struct zink_batch batches[4]; + unsigned curr_batch; + + VkQueue queue; + + struct pipe_constant_buffer ubos[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; + struct pipe_framebuffer_state fb_state; + + struct zink_vertex_elements_state *element_state; + struct zink_rasterizer_state *rast_state; + + struct zink_shader *gfx_stages[PIPE_SHADER_TYPES - 1]; + struct zink_gfx_pipeline_state gfx_pipeline_state; + struct hash_table *program_cache; + struct zink_gfx_program *curr_program; + + unsigned dirty_program : 1; + + struct hash_table *render_pass_cache; + struct hash_table *framebuffer_cache; + + struct primconvert_context *primconvert; + + struct zink_framebuffer *framebuffer; + + struct pipe_viewport_state viewport_states[PIPE_MAX_VIEWPORTS]; + struct pipe_scissor_state scissor_states[PIPE_MAX_VIEWPORTS]; + VkViewport viewports[PIPE_MAX_VIEWPORTS]; + VkRect2D scissors[PIPE_MAX_VIEWPORTS]; + unsigned num_viewports; + + struct pipe_vertex_buffer buffers[PIPE_MAX_ATTRIBS]; + uint32_t buffers_enabled_mask; + + void *sampler_states[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + VkSampler samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + unsigned num_samplers[PIPE_SHADER_TYPES]; + struct pipe_sampler_view *image_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + unsigned num_image_views[PIPE_SHADER_TYPES]; + + float line_width; + float blend_constants[4]; + + struct pipe_stencil_ref stencil_ref; + + struct list_head active_queries; + bool queries_disabled; +}; + +static inline struct zink_context * +zink_context(struct pipe_context *context) +{ + return (struct zink_context *)context; +} + +static inline struct zink_batch * +zink_curr_batch(struct zink_context *ctx) +{ + assert(ctx->curr_batch < ARRAY_SIZE(ctx->batches)); + return ctx->batches + ctx->curr_batch; +} + +struct zink_batch * +zink_batch_rp(struct zink_context *ctx); + +struct zink_batch * +zink_batch_no_rp(struct zink_context *ctx); + +void +zink_resource_barrier(VkCommandBuffer cmdbuf, struct zink_resource *res, + VkImageAspectFlags aspect, VkImageLayout new_layout); + + void + zink_begin_render_pass(struct zink_context *ctx, + struct zink_batch *batch); + + +VkShaderStageFlagBits +zink_shader_stage(enum pipe_shader_type type); + +struct pipe_context * +zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); + +void +zink_context_query_init(struct pipe_context *ctx); + +void +zink_blit(struct pipe_context *pctx, + const struct pipe_blit_info *info); + +void +zink_draw_vbo(struct pipe_context *pctx, + const struct pipe_draw_info *dinfo); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_draw.c mesa-20.0.8/src/gallium/drivers/zink/zink_draw.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_draw.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_draw.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,307 @@ +#include "zink_compiler.h" +#include "zink_context.h" +#include "zink_program.h" +#include "zink_resource.h" +#include "zink_screen.h" +#include "zink_state.h" + +#include "indices/u_primconvert.h" +#include "util/hash_table.h" +#include "util/u_debug.h" +#include "util/u_helpers.h" +#include "util/u_inlines.h" +#include "util/u_prim.h" + +static VkDescriptorSet +allocate_descriptor_set(struct zink_screen *screen, + struct zink_batch *batch, + struct zink_gfx_program *prog) +{ + assert(batch->descs_left >= prog->num_descriptors); + VkDescriptorSetAllocateInfo dsai; + memset((void *)&dsai, 0, sizeof(dsai)); + dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + dsai.pNext = NULL; + dsai.descriptorPool = batch->descpool; + dsai.descriptorSetCount = 1; + dsai.pSetLayouts = &prog->dsl; + + VkDescriptorSet desc_set; + if (vkAllocateDescriptorSets(screen->dev, &dsai, &desc_set) != VK_SUCCESS) { + debug_printf("ZINK: failed to allocate descriptor set :/"); + return VK_NULL_HANDLE; + } + + batch->descs_left -= prog->num_descriptors; + return desc_set; +} + +static void +zink_bind_vertex_buffers(struct zink_batch *batch, struct zink_context *ctx) +{ + VkBuffer buffers[PIPE_MAX_ATTRIBS]; + VkDeviceSize buffer_offsets[PIPE_MAX_ATTRIBS]; + const struct zink_vertex_elements_state *elems = ctx->element_state; + for (unsigned i = 0; i < elems->hw_state.num_bindings; i++) { + struct pipe_vertex_buffer *vb = ctx->buffers + ctx->element_state->binding_map[i]; + assert(vb && vb->buffer.resource); + struct zink_resource *res = zink_resource(vb->buffer.resource); + buffers[i] = res->buffer; + buffer_offsets[i] = vb->buffer_offset; + zink_batch_reference_resoure(batch, res); + } + + if (elems->hw_state.num_bindings > 0) + vkCmdBindVertexBuffers(batch->cmdbuf, 0, + elems->hw_state.num_bindings, + buffers, buffer_offsets); +} + +static struct zink_gfx_program * +get_gfx_program(struct zink_context *ctx) +{ + if (ctx->dirty_program) { + struct hash_entry *entry = _mesa_hash_table_search(ctx->program_cache, + ctx->gfx_stages); + if (!entry) { + struct zink_gfx_program *prog; + prog = zink_create_gfx_program(zink_screen(ctx->base.screen), + ctx->gfx_stages); + entry = _mesa_hash_table_insert(ctx->program_cache, prog->stages, prog); + if (!entry) + return NULL; + } + ctx->curr_program = entry->data; + ctx->dirty_program = false; + } + + assert(ctx->curr_program); + return ctx->curr_program; +} + +static bool +line_width_needed(enum pipe_prim_type reduced_prim, + VkPolygonMode polygon_mode) +{ + switch (reduced_prim) { + case PIPE_PRIM_POINTS: + return false; + + case PIPE_PRIM_LINES: + return true; + + case PIPE_PRIM_TRIANGLES: + return polygon_mode == VK_POLYGON_MODE_LINE; + + default: + unreachable("unexpected reduced prim"); + } +} + +void +zink_draw_vbo(struct pipe_context *pctx, + const struct pipe_draw_info *dinfo) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_rasterizer_state *rast_state = ctx->rast_state; + + if (dinfo->mode >= PIPE_PRIM_QUADS || + dinfo->mode == PIPE_PRIM_LINE_LOOP || + dinfo->index_size == 1) { + if (!u_trim_pipe_prim(dinfo->mode, (unsigned *)&dinfo->count)) + return; + + util_primconvert_save_rasterizer_state(ctx->primconvert, &rast_state->base); + util_primconvert_draw_vbo(ctx->primconvert, dinfo); + return; + } + + struct zink_gfx_program *gfx_program = get_gfx_program(ctx); + if (!gfx_program) + return; + + VkPipeline pipeline = zink_get_gfx_pipeline(screen, gfx_program, + &ctx->gfx_pipeline_state, + dinfo->mode); + + enum pipe_prim_type reduced_prim = u_reduced_prim(dinfo->mode); + + bool depth_bias = false; + switch (reduced_prim) { + case PIPE_PRIM_POINTS: + depth_bias = rast_state->offset_point; + break; + + case PIPE_PRIM_LINES: + depth_bias = rast_state->offset_line; + break; + + case PIPE_PRIM_TRIANGLES: + depth_bias = rast_state->offset_tri; + break; + + default: + unreachable("unexpected reduced prim"); + } + + unsigned index_offset = 0; + struct pipe_resource *index_buffer = NULL; + if (dinfo->index_size > 0) { + if (dinfo->has_user_indices) { + if (!util_upload_index_buffer(pctx, dinfo, &index_buffer, &index_offset, 4)) { + debug_printf("util_upload_index_buffer() failed\n"); + return; + } + } else + index_buffer = dinfo->index.resource; + } + + VkWriteDescriptorSet wds[PIPE_SHADER_TYPES * PIPE_MAX_CONSTANT_BUFFERS + PIPE_SHADER_TYPES * PIPE_MAX_SHADER_SAMPLER_VIEWS]; + VkDescriptorBufferInfo buffer_infos[PIPE_SHADER_TYPES * PIPE_MAX_CONSTANT_BUFFERS]; + VkDescriptorImageInfo image_infos[PIPE_SHADER_TYPES * PIPE_MAX_SHADER_SAMPLER_VIEWS]; + int num_wds = 0, num_buffer_info = 0, num_image_info = 0; + + struct zink_resource *transitions[PIPE_SHADER_TYPES * PIPE_MAX_SHADER_SAMPLER_VIEWS]; + int num_transitions = 0; + + for (int i = 0; i < ARRAY_SIZE(ctx->gfx_stages); i++) { + struct zink_shader *shader = ctx->gfx_stages[i]; + if (!shader) + continue; + + for (int j = 0; j < shader->num_bindings; j++) { + int index = shader->bindings[j].index; + if (shader->bindings[j].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + assert(ctx->ubos[i][index].buffer_size > 0); + assert(ctx->ubos[i][index].buffer_size <= screen->props.limits.maxUniformBufferRange); + assert(ctx->ubos[i][index].buffer); + struct zink_resource *res = zink_resource(ctx->ubos[i][index].buffer); + buffer_infos[num_buffer_info].buffer = res->buffer; + buffer_infos[num_buffer_info].offset = ctx->ubos[i][index].buffer_offset; + buffer_infos[num_buffer_info].range = ctx->ubos[i][index].buffer_size; + wds[num_wds].pBufferInfo = buffer_infos + num_buffer_info; + ++num_buffer_info; + } else { + struct pipe_sampler_view *psampler_view = ctx->image_views[i][index]; + assert(psampler_view); + struct zink_sampler_view *sampler_view = zink_sampler_view(psampler_view); + + struct zink_resource *res = zink_resource(psampler_view->texture); + VkImageLayout layout = res->layout; + if (layout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL && + layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL && + layout != VK_IMAGE_LAYOUT_GENERAL) { + transitions[num_transitions++] = res; + layout = VK_IMAGE_LAYOUT_GENERAL; + } + image_infos[num_image_info].imageLayout = layout; + image_infos[num_image_info].imageView = sampler_view->image_view; + image_infos[num_image_info].sampler = ctx->samplers[i][index]; + wds[num_wds].pImageInfo = image_infos + num_image_info; + ++num_image_info; + } + + wds[num_wds].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + wds[num_wds].pNext = NULL; + wds[num_wds].dstBinding = shader->bindings[j].binding; + wds[num_wds].dstArrayElement = 0; + wds[num_wds].descriptorCount = 1; + wds[num_wds].descriptorType = shader->bindings[j].type; + ++num_wds; + } + } + + struct zink_batch *batch; + if (num_transitions > 0) { + batch = zink_batch_no_rp(ctx); + + for (int i = 0; i < num_transitions; ++i) + zink_resource_barrier(batch->cmdbuf, transitions[i], + transitions[i]->aspect, + VK_IMAGE_LAYOUT_GENERAL); + } + + batch = zink_batch_rp(ctx); + + if (batch->descs_left < gfx_program->num_descriptors) { + ctx->base.flush(&ctx->base, NULL, 0); + batch = zink_batch_rp(ctx); + assert(batch->descs_left >= gfx_program->num_descriptors); + } + + VkDescriptorSet desc_set = allocate_descriptor_set(screen, batch, + gfx_program); + assert(desc_set != VK_NULL_HANDLE); + + for (int i = 0; i < ARRAY_SIZE(ctx->gfx_stages); i++) { + struct zink_shader *shader = ctx->gfx_stages[i]; + if (!shader) + continue; + + for (int j = 0; j < shader->num_bindings; j++) { + int index = shader->bindings[j].index; + if (shader->bindings[j].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + struct zink_resource *res = zink_resource(ctx->ubos[i][index].buffer); + zink_batch_reference_resoure(batch, res); + } else { + struct zink_sampler_view *sampler_view = zink_sampler_view(ctx->image_views[i][index]); + zink_batch_reference_sampler_view(batch, sampler_view); + } + } + } + + vkCmdSetViewport(batch->cmdbuf, 0, ctx->num_viewports, ctx->viewports); + if (ctx->rast_state->base.scissor) + vkCmdSetScissor(batch->cmdbuf, 0, ctx->num_viewports, ctx->scissors); + else if (ctx->fb_state.width && ctx->fb_state.height) { + VkRect2D fb_scissor = {}; + fb_scissor.extent.width = ctx->fb_state.width; + fb_scissor.extent.height = ctx->fb_state.height; + vkCmdSetScissor(batch->cmdbuf, 0, 1, &fb_scissor); + } + + if (line_width_needed(reduced_prim, rast_state->hw_state.polygon_mode)) { + if (screen->feats.wideLines || ctx->line_width == 1.0f) + vkCmdSetLineWidth(batch->cmdbuf, ctx->line_width); + else + debug_printf("BUG: wide lines not supported, needs fallback!"); + } + + vkCmdSetStencilReference(batch->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, ctx->stencil_ref.ref_value[0]); + vkCmdSetStencilReference(batch->cmdbuf, VK_STENCIL_FACE_BACK_BIT, ctx->stencil_ref.ref_value[1]); + + if (depth_bias) + vkCmdSetDepthBias(batch->cmdbuf, rast_state->offset_units, rast_state->offset_clamp, rast_state->offset_scale); + else + vkCmdSetDepthBias(batch->cmdbuf, 0.0f, 0.0f, 0.0f); + + if (ctx->gfx_pipeline_state.blend_state->need_blend_constants) + vkCmdSetBlendConstants(batch->cmdbuf, ctx->blend_constants); + + if (num_wds > 0) { + for (int i = 0; i < num_wds; ++i) + wds[i].dstSet = desc_set; + vkUpdateDescriptorSets(screen->dev, num_wds, wds, 0, NULL); + } + + vkCmdBindPipeline(batch->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + vkCmdBindDescriptorSets(batch->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, + gfx_program->layout, 0, 1, &desc_set, 0, NULL); + zink_bind_vertex_buffers(batch, ctx); + + if (dinfo->index_size > 0) { + assert(dinfo->index_size != 1); + VkIndexType index_type = dinfo->index_size == 2 ? VK_INDEX_TYPE_UINT16 : VK_INDEX_TYPE_UINT32; + struct zink_resource *res = zink_resource(index_buffer); + vkCmdBindIndexBuffer(batch->cmdbuf, res->buffer, index_offset, index_type); + zink_batch_reference_resoure(batch, res); + vkCmdDrawIndexed(batch->cmdbuf, + dinfo->count, dinfo->instance_count, + dinfo->start, dinfo->index_bias, dinfo->start_instance); + } else + vkCmdDraw(batch->cmdbuf, dinfo->count, dinfo->instance_count, dinfo->start, dinfo->start_instance); + + if (dinfo->index_size > 0 && dinfo->has_user_indices) + pipe_resource_reference(&index_buffer, NULL); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_fence.c mesa-20.0.8/src/gallium/drivers/zink/zink_fence.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_fence.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,106 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_fence.h" + +#include "zink_screen.h" + +#include "util/u_memory.h" + +static void +destroy_fence(struct zink_screen *screen, struct zink_fence *fence) +{ + if (fence->fence) + vkDestroyFence(screen->dev, fence->fence, NULL); + FREE(fence); +} + +struct zink_fence * +zink_create_fence(struct pipe_screen *pscreen) +{ + struct zink_screen *screen = zink_screen(pscreen); + + VkFenceCreateInfo fci = {}; + fci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + + struct zink_fence *ret = CALLOC_STRUCT(zink_fence); + if (!ret) { + debug_printf("CALLOC_STRUCT failed\n"); + return NULL; + } + + if (vkCreateFence(screen->dev, &fci, NULL, &ret->fence) != VK_SUCCESS) { + debug_printf("vkCreateFence failed\n"); + goto fail; + } + + pipe_reference_init(&ret->reference, 1); + return ret; + +fail: + destroy_fence(screen, ret); + return NULL; +} + +void +zink_fence_reference(struct zink_screen *screen, + struct zink_fence **ptr, + struct zink_fence *fence) +{ + if (pipe_reference(&(*ptr)->reference, &fence->reference)) + destroy_fence(screen, *ptr); + + *ptr = fence; +} + +static void +fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle **pptr, + struct pipe_fence_handle *pfence) +{ + zink_fence_reference(zink_screen(pscreen), (struct zink_fence **)pptr, + zink_fence(pfence)); +} + +bool +zink_fence_finish(struct zink_screen *screen, struct zink_fence *fence, + uint64_t timeout_ns) +{ + return vkWaitForFences(screen->dev, 1, &fence->fence, VK_TRUE, + timeout_ns) == VK_SUCCESS; +} + +static bool +fence_finish(struct pipe_screen *pscreen, struct pipe_context *pctx, + struct pipe_fence_handle *pfence, uint64_t timeout_ns) +{ + return zink_fence_finish(zink_screen(pscreen), zink_fence(pfence), + timeout_ns); +} + +void +zink_screen_fence_init(struct pipe_screen *pscreen) +{ + pscreen->fence_reference = fence_reference; + pscreen->fence_finish = fence_finish; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_fence.h mesa-20.0.8/src/gallium/drivers/zink/zink_fence.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_fence.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_fence.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,60 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_FENCE_H +#define ZINK_FENCE_H + +#include "util/u_inlines.h" + +#include + +struct pipe_screen; +struct zink_screen; + +struct zink_fence { + struct pipe_reference reference; + VkFence fence; +}; + +static inline struct zink_fence * +zink_fence(struct pipe_fence_handle *pfence) +{ + return (struct zink_fence *)pfence; +} + +struct zink_fence * +zink_create_fence(struct pipe_screen *pscreen); + +void +zink_fence_reference(struct zink_screen *screen, + struct zink_fence **ptr, + struct zink_fence *fence); + +bool +zink_fence_finish(struct zink_screen *screen, struct zink_fence *fence, + uint64_t timeout_ns); + +void +zink_screen_fence_init(struct pipe_screen *pscreen); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_format.c mesa-20.0.8/src/gallium/drivers/zink/zink_format.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_format.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,153 @@ +#include "zink_screen.h" + +static const VkFormat formats[PIPE_FORMAT_COUNT] = { +#define MAP_FORMAT_NORM(FMT) \ + [PIPE_FORMAT_ ## FMT ## _UNORM] = VK_FORMAT_ ## FMT ## _UNORM, \ + [PIPE_FORMAT_ ## FMT ## _SNORM] = VK_FORMAT_ ## FMT ## _SNORM, + +#define MAP_FORMAT_SCALED(FMT) \ + [PIPE_FORMAT_ ## FMT ## _USCALED] = VK_FORMAT_ ## FMT ## _USCALED, \ + [PIPE_FORMAT_ ## FMT ## _SSCALED] = VK_FORMAT_ ## FMT ## _SSCALED, + +#define MAP_FORMAT_INT(FMT) \ + [PIPE_FORMAT_ ## FMT ## _UINT] = VK_FORMAT_ ## FMT ## _UINT, \ + [PIPE_FORMAT_ ## FMT ## _SINT] = VK_FORMAT_ ## FMT ## _SINT, + +#define MAP_FORMAT_SRGB(FMT) \ + [PIPE_FORMAT_ ## FMT ## _SRGB] = VK_FORMAT_ ## FMT ## _SRGB, + +#define MAP_FORMAT_FLOAT(FMT) \ + [PIPE_FORMAT_ ## FMT ## _FLOAT] = VK_FORMAT_ ## FMT ## _SFLOAT, + + // one component + + // 8-bits + MAP_FORMAT_NORM(R8) + MAP_FORMAT_SCALED(R8) + MAP_FORMAT_INT(R8) + // 16-bits + MAP_FORMAT_NORM(R16) + MAP_FORMAT_SCALED(R16) + MAP_FORMAT_INT(R16) + MAP_FORMAT_FLOAT(R16) + // 32-bits + MAP_FORMAT_INT(R32) + MAP_FORMAT_FLOAT(R32) + + // two components + + // 8-bits + MAP_FORMAT_NORM(R8G8) + MAP_FORMAT_SCALED(R8G8) + MAP_FORMAT_INT(R8G8) + // 16-bits + MAP_FORMAT_NORM(R16G16) + MAP_FORMAT_SCALED(R16G16) + MAP_FORMAT_INT(R16G16) + MAP_FORMAT_FLOAT(R16G16) + // 32-bits + MAP_FORMAT_INT(R32G32) + MAP_FORMAT_FLOAT(R32G32) + + // three components + + // 8-bits + MAP_FORMAT_NORM(R8G8B8) + MAP_FORMAT_SCALED(R8G8B8) + MAP_FORMAT_INT(R8G8B8) + MAP_FORMAT_SRGB(R8G8B8) + // 16-bits + MAP_FORMAT_NORM(R16G16B16) + MAP_FORMAT_SCALED(R16G16B16) + MAP_FORMAT_INT(R16G16B16) + MAP_FORMAT_FLOAT(R16G16B16) + // 32-bits + MAP_FORMAT_INT(R32G32B32) + MAP_FORMAT_FLOAT(R32G32B32) + + // four components + + // 8-bits + MAP_FORMAT_NORM(R8G8B8A8) + MAP_FORMAT_SCALED(R8G8B8A8) + MAP_FORMAT_INT(R8G8B8A8) + MAP_FORMAT_SRGB(R8G8B8A8) + [PIPE_FORMAT_B8G8R8A8_UNORM] = VK_FORMAT_B8G8R8A8_UNORM, + [PIPE_FORMAT_B8G8R8X8_UNORM] = VK_FORMAT_B8G8R8A8_UNORM, + MAP_FORMAT_SRGB(B8G8R8A8) + [PIPE_FORMAT_A8B8G8R8_SRGB] = VK_FORMAT_A8B8G8R8_SRGB_PACK32, + // 16-bits + MAP_FORMAT_NORM(R16G16B16A16) + MAP_FORMAT_SCALED(R16G16B16A16) + MAP_FORMAT_INT(R16G16B16A16) + MAP_FORMAT_FLOAT(R16G16B16A16) + // 32-bits + MAP_FORMAT_INT(R32G32B32A32) + MAP_FORMAT_FLOAT(R32G32B32A32) + + // other color formats + [PIPE_FORMAT_B5G6R5_UNORM] = VK_FORMAT_R5G6B5_UNORM_PACK16, + [PIPE_FORMAT_B5G5R5A1_UNORM] = VK_FORMAT_B5G5R5A1_UNORM_PACK16, + [PIPE_FORMAT_R11G11B10_FLOAT] = VK_FORMAT_B10G11R11_UFLOAT_PACK32, + [PIPE_FORMAT_R9G9B9E5_FLOAT] = VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, + [PIPE_FORMAT_R10G10B10A2_UNORM] = VK_FORMAT_A2B10G10R10_UNORM_PACK32, + [PIPE_FORMAT_B10G10R10A2_UNORM] = VK_FORMAT_A2R10G10B10_UNORM_PACK32, + [PIPE_FORMAT_R10G10B10A2_UINT] = VK_FORMAT_A2B10G10R10_UINT_PACK32, + [PIPE_FORMAT_B10G10R10A2_UINT] = VK_FORMAT_A2R10G10B10_UINT_PACK32, + + // depth/stencil formats + [PIPE_FORMAT_Z32_FLOAT] = VK_FORMAT_D32_SFLOAT, + [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = VK_FORMAT_D32_SFLOAT_S8_UINT, + [PIPE_FORMAT_Z16_UNORM] = VK_FORMAT_D16_UNORM, + [PIPE_FORMAT_Z24X8_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32, + [PIPE_FORMAT_Z24_UNORM_S8_UINT] = VK_FORMAT_D24_UNORM_S8_UINT, + + // compressed formats + [PIPE_FORMAT_DXT1_RGB] = VK_FORMAT_BC1_RGB_UNORM_BLOCK, + [PIPE_FORMAT_DXT1_RGBA] = VK_FORMAT_BC1_RGBA_UNORM_BLOCK, + [PIPE_FORMAT_DXT3_RGBA] = VK_FORMAT_BC2_UNORM_BLOCK, + [PIPE_FORMAT_DXT5_RGBA] = VK_FORMAT_BC3_UNORM_BLOCK, + [PIPE_FORMAT_DXT1_SRGB] = VK_FORMAT_BC1_RGB_SRGB_BLOCK, + [PIPE_FORMAT_DXT1_SRGBA] = VK_FORMAT_BC1_RGBA_SRGB_BLOCK, + [PIPE_FORMAT_DXT3_SRGBA] = VK_FORMAT_BC2_SRGB_BLOCK, + [PIPE_FORMAT_DXT5_SRGBA] = VK_FORMAT_BC3_SRGB_BLOCK, + + [PIPE_FORMAT_RGTC1_UNORM] = VK_FORMAT_BC4_UNORM_BLOCK, + [PIPE_FORMAT_RGTC1_SNORM] = VK_FORMAT_BC4_SNORM_BLOCK, + [PIPE_FORMAT_RGTC2_UNORM] = VK_FORMAT_BC5_UNORM_BLOCK, + [PIPE_FORMAT_RGTC2_SNORM] = VK_FORMAT_BC5_SNORM_BLOCK, + [PIPE_FORMAT_BPTC_RGBA_UNORM] = VK_FORMAT_BC7_UNORM_BLOCK, + [PIPE_FORMAT_BPTC_SRGBA] = VK_FORMAT_BC7_SRGB_BLOCK, + [PIPE_FORMAT_BPTC_RGB_FLOAT] = VK_FORMAT_BC6H_SFLOAT_BLOCK, + [PIPE_FORMAT_BPTC_RGB_UFLOAT] = VK_FORMAT_BC6H_UFLOAT_BLOCK, +}; + +bool +zink_is_depth_format_supported(struct zink_screen *screen, VkFormat format) +{ + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(screen->pdev, format, &props); + return (props.linearTilingFeatures | props.optimalTilingFeatures) & + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; +} + +VkFormat +zink_get_format(struct zink_screen *screen, enum pipe_format format) +{ + VkFormat ret = formats[format]; + + if (ret == VK_FORMAT_X8_D24_UNORM_PACK32 && + !screen->have_X8_D24_UNORM_PACK32) { + assert(zink_is_depth_format_supported(screen, VK_FORMAT_D32_SFLOAT)); + return VK_FORMAT_D32_SFLOAT; + } + + if (ret == VK_FORMAT_D24_UNORM_S8_UINT && + !screen->have_D24_UNORM_S8_UINT) { + assert(zink_is_depth_format_supported(screen, + VK_FORMAT_D32_SFLOAT_S8_UINT)); + return VK_FORMAT_D32_SFLOAT_S8_UINT; + } + + return ret; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_framebuffer.c mesa-20.0.8/src/gallium/drivers/zink/zink_framebuffer.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_framebuffer.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_framebuffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,86 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_framebuffer.h" + +#include "zink_render_pass.h" +#include "zink_screen.h" +#include "zink_surface.h" + +#include "util/u_memory.h" +#include "util/u_string.h" + +void +zink_destroy_framebuffer(struct zink_screen *screen, + struct zink_framebuffer *fbuf) +{ + vkDestroyFramebuffer(screen->dev, fbuf->fb, NULL); + for (int i = 0; i < ARRAY_SIZE(fbuf->surfaces); ++i) + pipe_surface_reference(fbuf->surfaces + i, NULL); + + zink_render_pass_reference(screen, &fbuf->rp, NULL); + + FREE(fbuf); +} + +struct zink_framebuffer * +zink_create_framebuffer(struct zink_screen *screen, + struct zink_framebuffer_state *fb) +{ + struct zink_framebuffer *fbuf = CALLOC_STRUCT(zink_framebuffer); + if (!fbuf) + return NULL; + + pipe_reference_init(&fbuf->reference, 1); + + VkImageView attachments[ARRAY_SIZE(fb->attachments)]; + for (int i = 0; i < fb->num_attachments; i++) { + struct zink_surface *surf = fb->attachments[i]; + pipe_surface_reference(fbuf->surfaces + i, &surf->base); + attachments[i] = surf->image_view; + } + + zink_render_pass_reference(screen, &fbuf->rp, fb->rp); + + VkFramebufferCreateInfo fci = {}; + fci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + fci.renderPass = fbuf->rp->render_pass; + fci.attachmentCount = fb->num_attachments; + fci.pAttachments = attachments; + fci.width = fb->width; + fci.height = fb->height; + fci.layers = fb->layers; + + if (vkCreateFramebuffer(screen->dev, &fci, NULL, &fbuf->fb) != VK_SUCCESS) { + zink_destroy_framebuffer(screen, fbuf); + return NULL; + } + + return fbuf; +} + +void +debug_describe_zink_framebuffer(char* buf, const struct zink_framebuffer *ptr) +{ + sprintf(buf, "zink_framebuffer"); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_framebuffer.h mesa-20.0.8/src/gallium/drivers/zink/zink_framebuffer.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_framebuffer.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_framebuffer.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,75 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_FRAMEBUFFER_H +#define ZINK_FRAMEBUFFER_H + +#include "pipe/p_state.h" +#include + +#include "util/u_inlines.h" + +struct zink_screen; +struct zink_render_pass; + +struct zink_framebuffer_state { + struct zink_render_pass *rp; + uint32_t width; + uint16_t height, layers; + uint8_t num_attachments; + struct zink_surface *attachments[PIPE_MAX_COLOR_BUFS + 1]; +}; + +struct zink_framebuffer { + struct pipe_reference reference; + VkFramebuffer fb; + + struct pipe_surface *surfaces[PIPE_MAX_COLOR_BUFS + 1]; + struct zink_render_pass *rp; +}; + +struct zink_framebuffer * +zink_create_framebuffer(struct zink_screen *screen, + struct zink_framebuffer_state *fb); + +void +zink_destroy_framebuffer(struct zink_screen *screen, + struct zink_framebuffer *fbuf); + +void +debug_describe_zink_framebuffer(char* buf, const struct zink_framebuffer *ptr); + +static inline void +zink_framebuffer_reference(struct zink_screen *screen, + struct zink_framebuffer **dst, + struct zink_framebuffer *src) +{ + struct zink_framebuffer *old_dst = *dst; + + if (pipe_reference_described(&old_dst->reference, &src->reference, + (debug_reference_descriptor)debug_describe_zink_framebuffer)) + zink_destroy_framebuffer(screen, old_dst); + *dst = src; +} + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_helpers.h mesa-20.0.8/src/gallium/drivers/zink/zink_helpers.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_helpers.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,37 @@ +/* + * Copyright 2019 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_HELPERS_H +#define ZINK_HELPERS_H + +static inline VkFilter +zink_filter(enum pipe_tex_filter filter) +{ + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: return VK_FILTER_NEAREST; + case PIPE_TEX_FILTER_LINEAR: return VK_FILTER_LINEAR; + } + unreachable("unexpected filter"); +} + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_pipeline.c mesa-20.0.8/src/gallium/drivers/zink/zink_pipeline.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_pipeline.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_pipeline.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,156 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_pipeline.h" + +#include "zink_compiler.h" +#include "zink_context.h" +#include "zink_program.h" +#include "zink_render_pass.h" +#include "zink_screen.h" +#include "zink_state.h" + +#include "util/u_debug.h" +#include "util/u_prim.h" + +VkPipeline +zink_create_gfx_pipeline(struct zink_screen *screen, + struct zink_gfx_program *prog, + struct zink_gfx_pipeline_state *state, + VkPrimitiveTopology primitive_topology) +{ + VkPipelineVertexInputStateCreateInfo vertex_input_state = {}; + vertex_input_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertex_input_state.pVertexBindingDescriptions = state->bindings; + vertex_input_state.vertexBindingDescriptionCount = state->element_state->num_bindings; + vertex_input_state.pVertexAttributeDescriptions = state->element_state->attribs; + vertex_input_state.vertexAttributeDescriptionCount = state->element_state->num_attribs; + + VkPipelineInputAssemblyStateCreateInfo primitive_state = {}; + primitive_state.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + primitive_state.topology = primitive_topology; + primitive_state.primitiveRestartEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo blend_state = {}; + blend_state.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + blend_state.pAttachments = state->blend_state->attachments; + blend_state.attachmentCount = state->num_attachments; + blend_state.logicOpEnable = state->blend_state->logicop_enable; + blend_state.logicOp = state->blend_state->logicop_func; + + VkPipelineMultisampleStateCreateInfo ms_state = {}; + ms_state.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + ms_state.rasterizationSamples = state->rast_samples; + ms_state.alphaToCoverageEnable = state->blend_state->alpha_to_coverage; + ms_state.alphaToOneEnable = state->blend_state->alpha_to_one; + ms_state.pSampleMask = state->sample_mask ? &state->sample_mask : NULL; + + VkPipelineViewportStateCreateInfo viewport_state = {}; + viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_state.viewportCount = 1; + viewport_state.pViewports = NULL; + viewport_state.scissorCount = 1; + viewport_state.pScissors = NULL; + + VkPipelineRasterizationStateCreateInfo rast_state = {}; + rast_state.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + + rast_state.depthClampEnable = state->rast_state->depth_clamp; + rast_state.rasterizerDiscardEnable = state->rast_state->rasterizer_discard; + rast_state.polygonMode = state->rast_state->polygon_mode; + rast_state.cullMode = state->rast_state->cull_mode; + rast_state.frontFace = state->rast_state->front_face; + + rast_state.depthBiasEnable = VK_TRUE; + rast_state.depthBiasConstantFactor = 0.0; + rast_state.depthBiasClamp = 0.0; + rast_state.depthBiasSlopeFactor = 0.0; + rast_state.lineWidth = 1.0f; + + VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {}; + depth_stencil_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depth_stencil_state.depthTestEnable = state->depth_stencil_alpha_state->depth_test; + depth_stencil_state.depthCompareOp = state->depth_stencil_alpha_state->depth_compare_op; + depth_stencil_state.depthBoundsTestEnable = state->depth_stencil_alpha_state->depth_bounds_test; + depth_stencil_state.minDepthBounds = state->depth_stencil_alpha_state->min_depth_bounds; + depth_stencil_state.maxDepthBounds = state->depth_stencil_alpha_state->max_depth_bounds; + depth_stencil_state.stencilTestEnable = state->depth_stencil_alpha_state->stencil_test; + depth_stencil_state.front = state->depth_stencil_alpha_state->stencil_front; + depth_stencil_state.back = state->depth_stencil_alpha_state->stencil_back; + depth_stencil_state.depthWriteEnable = state->depth_stencil_alpha_state->depth_write; + + VkDynamicState dynamicStateEnables[] = { + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + VK_DYNAMIC_STATE_LINE_WIDTH, + VK_DYNAMIC_STATE_DEPTH_BIAS, + VK_DYNAMIC_STATE_BLEND_CONSTANTS, + VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }; + + VkPipelineDynamicStateCreateInfo pipelineDynamicStateCreateInfo = {}; + pipelineDynamicStateCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + pipelineDynamicStateCreateInfo.pDynamicStates = dynamicStateEnables; + pipelineDynamicStateCreateInfo.dynamicStateCount = ARRAY_SIZE(dynamicStateEnables); + + VkGraphicsPipelineCreateInfo pci = {}; + pci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pci.flags = VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT; + pci.layout = prog->layout; + pci.renderPass = state->render_pass->render_pass; + pci.pVertexInputState = &vertex_input_state; + pci.pInputAssemblyState = &primitive_state; + pci.pRasterizationState = &rast_state; + pci.pColorBlendState = &blend_state; + pci.pMultisampleState = &ms_state; + pci.pViewportState = &viewport_state; + pci.pDepthStencilState = &depth_stencil_state; + pci.pDynamicState = &pipelineDynamicStateCreateInfo; + + VkPipelineShaderStageCreateInfo shader_stages[PIPE_SHADER_TYPES - 1]; + uint32_t num_stages = 0; + for (int i = 0; i < PIPE_SHADER_TYPES - 1; ++i) { + if (!prog->stages[i]) + continue; + + VkPipelineShaderStageCreateInfo stage = {}; + stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stage.stage = zink_shader_stage(i); + stage.module = prog->stages[i]->shader_module; + stage.pName = "main"; + shader_stages[num_stages++] = stage; + } + assert(num_stages > 0); + + pci.pStages = shader_stages; + pci.stageCount = num_stages; + + VkPipeline pipeline; + if (vkCreateGraphicsPipelines(screen->dev, VK_NULL_HANDLE, 1, &pci, + NULL, &pipeline) != VK_SUCCESS) { + debug_printf("vkCreateGraphicsPipelines failed\n"); + return VK_NULL_HANDLE; + } + + return pipeline; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_pipeline.h mesa-20.0.8/src/gallium/drivers/zink/zink_pipeline.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_pipeline.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_pipeline.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,62 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_PIPELINE_H +#define ZINK_PIPELINE_H + +#include + +#include "pipe/p_state.h" + +struct zink_blend_state; +struct zink_depth_stencil_alpha_state; +struct zink_gfx_program; +struct zink_rasterizer_state; +struct zink_render_pass; +struct zink_screen; +struct zink_vertex_elements_state; + +struct zink_gfx_pipeline_state { + struct zink_render_pass *render_pass; + + struct zink_vertex_elements_hw_state *element_state; + VkVertexInputBindingDescription bindings[PIPE_MAX_ATTRIBS]; // combination of element_state and stride + + uint32_t num_attachments; + struct zink_blend_state *blend_state; + + struct zink_rasterizer_hw_state *rast_state; + + struct zink_depth_stencil_alpha_state *depth_stencil_alpha_state; + + VkSampleMask sample_mask; + uint8_t rast_samples; +}; + +VkPipeline +zink_create_gfx_pipeline(struct zink_screen *screen, + struct zink_gfx_program *prog, + struct zink_gfx_pipeline_state *state, + VkPrimitiveTopology primitive_topology); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_program.c mesa-20.0.8/src/gallium/drivers/zink/zink_program.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_program.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_program.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,250 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_program.h" + +#include "zink_compiler.h" +#include "zink_context.h" +#include "zink_render_pass.h" +#include "zink_screen.h" + +#include "util/hash_table.h" +#include "util/set.h" +#include "util/u_debug.h" +#include "util/u_memory.h" + +static VkDescriptorSetLayout +create_desc_set_layout(VkDevice dev, + struct zink_shader *stages[PIPE_SHADER_TYPES - 1], + unsigned *num_descriptors) +{ + VkDescriptorSetLayoutBinding bindings[PIPE_SHADER_TYPES * PIPE_MAX_CONSTANT_BUFFERS]; + int num_bindings = 0; + + for (int i = 0; i < PIPE_SHADER_TYPES - 1; i++) { + struct zink_shader *shader = stages[i]; + if (!shader) + continue; + + VkShaderStageFlagBits stage_flags = zink_shader_stage(i); + for (int j = 0; j < shader->num_bindings; j++) { + assert(num_bindings < ARRAY_SIZE(bindings)); + bindings[num_bindings].binding = shader->bindings[j].binding; + bindings[num_bindings].descriptorType = shader->bindings[j].type; + bindings[num_bindings].descriptorCount = 1; + bindings[num_bindings].stageFlags = stage_flags; + bindings[num_bindings].pImmutableSamplers = NULL; + ++num_bindings; + } + } + + VkDescriptorSetLayoutCreateInfo dcslci = {}; + dcslci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + dcslci.pNext = NULL; + dcslci.flags = 0; + dcslci.bindingCount = num_bindings; + dcslci.pBindings = bindings; + + VkDescriptorSetLayout dsl; + if (vkCreateDescriptorSetLayout(dev, &dcslci, 0, &dsl) != VK_SUCCESS) { + debug_printf("vkCreateDescriptorSetLayout failed\n"); + return VK_NULL_HANDLE; + } + + *num_descriptors = num_bindings; + return dsl; +} + +static VkPipelineLayout +create_pipeline_layout(VkDevice dev, VkDescriptorSetLayout dsl) +{ + assert(dsl != VK_NULL_HANDLE); + + VkPipelineLayoutCreateInfo plci = {}; + plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + + plci.pSetLayouts = &dsl; + plci.setLayoutCount = 1; + + VkPipelineLayout layout; + if (vkCreatePipelineLayout(dev, &plci, NULL, &layout) != VK_SUCCESS) { + debug_printf("vkCreatePipelineLayout failed!\n"); + return VK_NULL_HANDLE; + } + + return layout; +} + +static uint32_t +hash_gfx_pipeline_state(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct zink_gfx_pipeline_state)); +} + +static bool +equals_gfx_pipeline_state(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(struct zink_gfx_pipeline_state)) == 0; +} + +struct zink_gfx_program * +zink_create_gfx_program(struct zink_screen *screen, + struct zink_shader *stages[PIPE_SHADER_TYPES - 1]) +{ + struct zink_gfx_program *prog = CALLOC_STRUCT(zink_gfx_program); + if (!prog) + goto fail; + + for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { + prog->pipelines[i] = _mesa_hash_table_create(NULL, + hash_gfx_pipeline_state, + equals_gfx_pipeline_state); + if (!prog->pipelines[i]) + goto fail; + } + + for (int i = 0; i < PIPE_SHADER_TYPES - 1; ++i) + prog->stages[i] = stages[i]; + + prog->dsl = create_desc_set_layout(screen->dev, stages, + &prog->num_descriptors); + if (!prog->dsl) + goto fail; + + prog->layout = create_pipeline_layout(screen->dev, prog->dsl); + if (!prog->layout) + goto fail; + + prog->render_passes = _mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + if (!prog->render_passes) + goto fail; + + return prog; + +fail: + if (prog) + zink_destroy_gfx_program(screen, prog); + return NULL; +} + +void +zink_destroy_gfx_program(struct zink_screen *screen, + struct zink_gfx_program *prog) +{ + if (prog->layout) + vkDestroyPipelineLayout(screen->dev, prog->layout, NULL); + + if (prog->dsl) + vkDestroyDescriptorSetLayout(screen->dev, prog->dsl, NULL); + + /* unref all used render-passes */ + if (prog->render_passes) { + set_foreach(prog->render_passes, entry) { + struct zink_render_pass *pres = (struct zink_render_pass *)entry->key; + zink_render_pass_reference(screen, &pres, NULL); + } + _mesa_set_destroy(prog->render_passes, NULL); + } + + FREE(prog); +} + +struct pipeline_cache_entry { + struct zink_gfx_pipeline_state state; + VkPipeline pipeline; +}; + +static VkPrimitiveTopology +primitive_topology(enum pipe_prim_type mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: + return VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + + case PIPE_PRIM_LINES: + return VK_PRIMITIVE_TOPOLOGY_LINE_LIST; + + case PIPE_PRIM_LINE_STRIP: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP; + + case PIPE_PRIM_TRIANGLES: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + case PIPE_PRIM_TRIANGLE_STRIP: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; + + case PIPE_PRIM_TRIANGLE_FAN: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN; + + default: + unreachable("unexpected enum pipe_prim_type"); + } +} + +static void +reference_render_pass(struct zink_screen *screen, + struct zink_gfx_program *prog, + struct zink_render_pass *render_pass) +{ + struct set_entry *entry = _mesa_set_search(prog->render_passes, + render_pass); + if (!entry) { + entry = _mesa_set_add(prog->render_passes, render_pass); + pipe_reference(NULL, &render_pass->reference); + } +} + +VkPipeline +zink_get_gfx_pipeline(struct zink_screen *screen, + struct zink_gfx_program *prog, + struct zink_gfx_pipeline_state *state, + enum pipe_prim_type mode) +{ + assert(mode <= ARRAY_SIZE(prog->pipelines)); + + /* TODO: use pre-hashed versions to save some time (can re-hash only when + state changes) */ + struct hash_entry *entry = _mesa_hash_table_search(prog->pipelines[mode], state); + if (!entry) { + VkPrimitiveTopology vkmode = primitive_topology(mode); + VkPipeline pipeline = zink_create_gfx_pipeline(screen, prog, + state, vkmode); + if (pipeline == VK_NULL_HANDLE) + return VK_NULL_HANDLE; + + struct pipeline_cache_entry *pc_entry = CALLOC_STRUCT(pipeline_cache_entry); + if (!pc_entry) + return VK_NULL_HANDLE; + + memcpy(&pc_entry->state, state, sizeof(*state)); + pc_entry->pipeline = pipeline; + + entry = _mesa_hash_table_insert(prog->pipelines[mode], &pc_entry->state, pc_entry); + assert(entry); + + reference_render_pass(screen, prog, state->render_pass); + } + + return ((struct pipeline_cache_entry *)(entry->data))->pipeline; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_program.h mesa-20.0.8/src/gallium/drivers/zink/zink_program.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_program.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_program.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,61 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_PROGRAM_H +#define ZINK_PROGRAM_H + +#include + +#include "pipe/p_state.h" + +struct zink_screen; +struct zink_shader; +struct zink_gfx_pipeline_state; + +struct hash_table; +struct set; + +struct zink_gfx_program { + struct zink_shader *stages[PIPE_SHADER_TYPES - 1]; // compute stage doesn't belong here + VkDescriptorSetLayout dsl; + VkPipelineLayout layout; + unsigned num_descriptors; + struct hash_table *pipelines[PIPE_PRIM_TRIANGLE_FAN + 1]; + struct set *render_passes; +}; + +struct zink_gfx_program * +zink_create_gfx_program(struct zink_screen *screen, + struct zink_shader *stages[PIPE_SHADER_TYPES - 1]); + +void +zink_destroy_gfx_program(struct zink_screen *screen, + struct zink_gfx_program *prog); + +VkPipeline +zink_get_gfx_pipeline(struct zink_screen *screen, + struct zink_gfx_program *prog, + struct zink_gfx_pipeline_state *state, + enum pipe_prim_type mode); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_public.h mesa-20.0.8/src/gallium/drivers/zink/zink_public.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_public.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_public.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,35 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_PUBLIC_H +#define ZINK_PUBLIC_H + +struct pipe_screen; +struct sw_winsys; + +struct pipe_screen * +zink_create_screen(struct sw_winsys *winsys); + +struct pipe_screen * +zink_drm_create_screen(int fd); +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_query.c mesa-20.0.8/src/gallium/drivers/zink/zink_query.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_query.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,262 @@ +#include "zink_query.h" + +#include "zink_context.h" +#include "zink_screen.h" + +#include "util/u_dump.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +struct zink_query { + enum pipe_query_type type; + + VkQueryPool query_pool; + unsigned curr_query, num_queries; + + VkQueryType vkqtype; + bool use_64bit; + bool precise; + + struct list_head active_list; +}; + +static VkQueryType +convert_query_type(unsigned query_type, bool *use_64bit, bool *precise) +{ + *use_64bit = false; + *precise = false; + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + *precise = true; + *use_64bit = true; + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + return VK_QUERY_TYPE_OCCLUSION; + case PIPE_QUERY_TIMESTAMP: + *use_64bit = true; + return VK_QUERY_TYPE_TIMESTAMP; + case PIPE_QUERY_PIPELINE_STATISTICS: + return VK_QUERY_TYPE_PIPELINE_STATISTICS; + default: + debug_printf("unknown query: %s\n", + util_str_query_type(query_type, true)); + unreachable("zink: unknown query type"); + } +} + +static struct pipe_query * +zink_create_query(struct pipe_context *pctx, + unsigned query_type, unsigned index) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_query *query = CALLOC_STRUCT(zink_query); + VkQueryPoolCreateInfo pool_create = {}; + + if (!query) + return NULL; + + query->type = query_type; + query->vkqtype = convert_query_type(query_type, &query->use_64bit, &query->precise); + if (query->vkqtype == -1) + return NULL; + + query->num_queries = query_type == PIPE_QUERY_TIMESTAMP ? 1 : 100; + query->curr_query = 0; + + pool_create.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + pool_create.queryType = query->vkqtype; + pool_create.queryCount = query->num_queries; + + VkResult status = vkCreateQueryPool(screen->dev, &pool_create, NULL, &query->query_pool); + if (status != VK_SUCCESS) { + FREE(query); + return NULL; + } + return (struct pipe_query *)query; +} + +static void +zink_destroy_query(struct pipe_context *pctx, + struct pipe_query *q) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_query *query = (struct zink_query *)q; + + vkDestroyQueryPool(screen->dev, query->query_pool, NULL); +} + +static void +begin_query(struct zink_context *ctx, struct zink_query *q) +{ + VkQueryControlFlags flags = 0; + if (q->precise) + flags |= VK_QUERY_CONTROL_PRECISE_BIT; + + struct zink_batch *batch = zink_curr_batch(ctx); + vkCmdBeginQuery(batch->cmdbuf, q->query_pool, q->curr_query, flags); +} + +static bool +zink_begin_query(struct pipe_context *pctx, + struct pipe_query *q) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_query *query = (struct zink_query *)q; + + /* ignore begin_query for timestamps */ + if (query->type == PIPE_QUERY_TIMESTAMP) + return true; + + /* TODO: resetting on begin isn't ideal, as it forces render-pass exit... + * should instead reset on creation (if possible?)... Or perhaps maintain + * the pool in the batch instead? + */ + struct zink_batch *batch = zink_batch_no_rp(zink_context(pctx)); + vkCmdResetQueryPool(batch->cmdbuf, query->query_pool, 0, query->curr_query); + query->curr_query = 0; + + begin_query(ctx, query); + list_addtail(&query->active_list, &ctx->active_queries); + + return true; +} + +static void +end_query(struct zink_context *ctx, struct zink_query *q) +{ + struct zink_batch *batch = zink_curr_batch(ctx); + assert(q->type != PIPE_QUERY_TIMESTAMP); + vkCmdEndQuery(batch->cmdbuf, q->query_pool, q->curr_query); + if (++q->curr_query == q->num_queries) { + assert(0); + /* need to reset pool! */ + } +} + +static bool +zink_end_query(struct pipe_context *pctx, + struct pipe_query *q) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_query *query = (struct zink_query *)q; + + if (query->type == PIPE_QUERY_TIMESTAMP) { + assert(query->curr_query == 0); + struct zink_batch *batch = zink_curr_batch(ctx); + vkCmdWriteTimestamp(batch->cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + query->query_pool, 0); + } else { + end_query(ctx, query); + list_delinit(&query->active_list); + } + + return true; +} + +static bool +zink_get_query_result(struct pipe_context *pctx, + struct pipe_query *q, + bool wait, + union pipe_query_result *result) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_query *query = (struct zink_query *)q; + VkQueryResultFlagBits flags = 0; + + if (wait) { + struct pipe_fence_handle *fence = NULL; + pctx->flush(pctx, &fence, PIPE_FLUSH_HINT_FINISH); + if (fence) { + pctx->screen->fence_finish(pctx->screen, NULL, fence, + PIPE_TIMEOUT_INFINITE); + pctx->screen->fence_reference(pctx->screen, &fence, NULL); + } + flags |= VK_QUERY_RESULT_WAIT_BIT; + } else + pctx->flush(pctx, NULL, 0); + + if (query->use_64bit) + flags |= VK_QUERY_RESULT_64_BIT; + + // TODO: handle curr_query > 100 + // union pipe_query_result results[100]; + uint64_t results[100]; + memset(results, 0, sizeof(results)); + assert(query->curr_query <= ARRAY_SIZE(results)); + if (vkGetQueryPoolResults(screen->dev, query->query_pool, + 0, query->curr_query, + sizeof(results), + results, + sizeof(uint64_t), + flags) != VK_SUCCESS) + return false; + + util_query_clear_result(result, query->type); + for (int i = 0; i < query->curr_query; ++i) { + switch (query->type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + case PIPE_QUERY_GPU_FINISHED: + result->b |= results[i] != 0; + break; + + case PIPE_QUERY_OCCLUSION_COUNTER: + result->u64 += results[i]; + break; + + default: + debug_printf("unhangled query type: %s\n", + util_str_query_type(query->type, true)); + unreachable("unexpected query type"); + } + } + + return TRUE; +} + +void +zink_suspend_queries(struct zink_context *ctx, struct zink_batch *batch) +{ + struct zink_query *query; + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, active_list) { + end_query(ctx, query); + } +} + +void +zink_resume_queries(struct zink_context *ctx, struct zink_batch *batch) +{ + struct zink_query *query; + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, active_list) { + begin_query(ctx, query); + } +} + +static void +zink_set_active_query_state(struct pipe_context *pctx, bool enable) +{ + struct zink_context *ctx = zink_context(pctx); + ctx->queries_disabled = !enable; + + struct zink_batch *batch = zink_curr_batch(ctx); + if (ctx->queries_disabled) + zink_suspend_queries(ctx, batch); + else + zink_resume_queries(ctx, batch); +} + +void +zink_context_query_init(struct pipe_context *pctx) +{ + struct zink_context *ctx = zink_context(pctx); + list_inithead(&ctx->active_queries); + + pctx->create_query = zink_create_query; + pctx->destroy_query = zink_destroy_query; + pctx->begin_query = zink_begin_query; + pctx->end_query = zink_end_query; + pctx->get_query_result = zink_get_query_result; + pctx->set_active_query_state = zink_set_active_query_state; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_query.h mesa-20.0.8/src/gallium/drivers/zink/zink_query.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_query.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_query.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,36 @@ +/* + * Copyright 2019 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_QUERY_H +#define ZINK_QUERY_H + +struct zink_batch; +struct zink_context; + +void +zink_suspend_queries(struct zink_context *ctx, struct zink_batch *batch); + +void +zink_resume_queries(struct zink_context *ctx, struct zink_batch *batch); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_render_pass.c mesa-20.0.8/src/gallium/drivers/zink/zink_render_pass.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_render_pass.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_render_pass.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,124 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_render_pass.h" + +#include "zink_screen.h" + +#include "util/u_memory.h" +#include "util/u_string.h" + +static VkRenderPass +create_render_pass(VkDevice dev, struct zink_render_pass_state *state) +{ + + VkAttachmentReference color_refs[PIPE_MAX_COLOR_BUFS], zs_ref; + VkAttachmentDescription attachments[PIPE_MAX_COLOR_BUFS + 1]; + + for (int i = 0; i < state->num_cbufs; i++) { + struct zink_rt_attrib *rt = state->rts + i; + attachments[i].flags = 0; + attachments[i].format = rt->format; + attachments[i].samples = rt->samples; + attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachments[i].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + attachments[i].initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + attachments[i].finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + color_refs[i].attachment = i; + color_refs[i].layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + } + + int num_attachments = state->num_cbufs; + if (state->have_zsbuf) { + struct zink_rt_attrib *rt = state->rts + state->num_cbufs; + attachments[num_attachments].flags = 0; + attachments[num_attachments].format = rt->format; + attachments[num_attachments].samples = rt->samples; + attachments[num_attachments].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachments[num_attachments].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[num_attachments].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachments[num_attachments].stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[num_attachments].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + attachments[num_attachments].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + zs_ref.attachment = num_attachments++; + zs_ref.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + } + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = state->num_cbufs; + subpass.pColorAttachments = color_refs; + subpass.pDepthStencilAttachment = state->have_zsbuf ? &zs_ref : NULL; + + VkRenderPassCreateInfo rpci = {}; + rpci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + rpci.attachmentCount = num_attachments; + rpci.pAttachments = attachments; + rpci.subpassCount = 1; + rpci.pSubpasses = &subpass; + + VkRenderPass render_pass; + if (vkCreateRenderPass(dev, &rpci, NULL, &render_pass) != VK_SUCCESS) + return VK_NULL_HANDLE; + + return render_pass; +} + +struct zink_render_pass * +zink_create_render_pass(struct zink_screen *screen, + struct zink_render_pass_state *state) +{ + struct zink_render_pass *rp = CALLOC_STRUCT(zink_render_pass); + if (!rp) + goto fail; + + pipe_reference_init(&rp->reference, 1); + + rp->render_pass = create_render_pass(screen->dev, state); + if (!rp->render_pass) + goto fail; + + return rp; + +fail: + if (rp) + zink_destroy_render_pass(screen, rp); + return NULL; +} + +void +zink_destroy_render_pass(struct zink_screen *screen, + struct zink_render_pass *rp) +{ + vkDestroyRenderPass(screen->dev, rp->render_pass, NULL); + FREE(rp); +} + +void +debug_describe_zink_render_pass(char* buf, const struct zink_render_pass *ptr) +{ + sprintf(buf, "zink_render_pass"); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_render_pass.h mesa-20.0.8/src/gallium/drivers/zink/zink_render_pass.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_render_pass.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_render_pass.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,75 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_RENDERPASS_H +#define ZINK_RENDERPASS_H + +#include + +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +struct zink_screen; + +struct zink_rt_attrib { + VkFormat format; + VkSampleCountFlagBits samples; +}; + +struct zink_render_pass_state { + uint8_t num_cbufs : 4; /* PIPE_MAX_COLOR_BUFS = 8 */ + uint8_t have_zsbuf : 1; + struct zink_rt_attrib rts[PIPE_MAX_COLOR_BUFS + 1]; +}; + +struct zink_render_pass { + struct pipe_reference reference; + + VkRenderPass render_pass; +}; + +struct zink_render_pass * +zink_create_render_pass(struct zink_screen *screen, + struct zink_render_pass_state *state); + +void +zink_destroy_render_pass(struct zink_screen *screen, + struct zink_render_pass *rp); + +void +debug_describe_zink_render_pass(char* buf, const struct zink_render_pass *ptr); + +static inline void +zink_render_pass_reference(struct zink_screen *screen, + struct zink_render_pass **dst, + struct zink_render_pass *src) +{ + struct zink_render_pass *old_dst = *dst; + + if (pipe_reference_described(&old_dst->reference, &src->reference, + (debug_reference_descriptor)debug_describe_zink_render_pass)) + zink_destroy_render_pass(screen, old_dst); + *dst = src; +} + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_resource.c mesa-20.0.8/src/gallium/drivers/zink/zink_resource.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_resource.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_resource.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,560 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_resource.h" + +#include "zink_batch.h" +#include "zink_context.h" +#include "zink_screen.h" + +#include "util/slab.h" +#include "util/u_debug.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#include "state_tracker/sw_winsys.h" + +static void +zink_resource_destroy(struct pipe_screen *pscreen, + struct pipe_resource *pres) +{ + struct zink_screen *screen = zink_screen(pscreen); + struct zink_resource *res = zink_resource(pres); + if (pres->target == PIPE_BUFFER) + vkDestroyBuffer(screen->dev, res->buffer, NULL); + else + vkDestroyImage(screen->dev, res->image, NULL); + + vkFreeMemory(screen->dev, res->mem, NULL); + FREE(res); +} + +static uint32_t +get_memory_type_index(struct zink_screen *screen, + const VkMemoryRequirements *reqs, + VkMemoryPropertyFlags props) +{ + for (uint32_t i = 0u; i < VK_MAX_MEMORY_TYPES; i++) { + if (((reqs->memoryTypeBits >> i) & 1) == 1) { + if ((screen->mem_props.memoryTypes[i].propertyFlags & props) == props) { + return i; + break; + } + } + } + + unreachable("Unsupported memory-type"); + return 0; +} + +static VkImageAspectFlags +aspect_from_format(enum pipe_format fmt) +{ + if (util_format_is_depth_or_stencil(fmt)) { + VkImageAspectFlags aspect = 0; + const struct util_format_description *desc = util_format_description(fmt); + if (util_format_has_depth(desc)) + aspect |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (util_format_has_stencil(desc)) + aspect |= VK_IMAGE_ASPECT_STENCIL_BIT; + return aspect; + } else + return VK_IMAGE_ASPECT_COLOR_BIT; +} + +static struct pipe_resource * +resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *templ, + struct winsys_handle *whandle, + unsigned external_usage) +{ + struct zink_screen *screen = zink_screen(pscreen); + struct zink_resource *res = CALLOC_STRUCT(zink_resource); + + res->base = *templ; + + pipe_reference_init(&res->base.reference, 1); + res->base.screen = pscreen; + + VkMemoryRequirements reqs; + VkMemoryPropertyFlags flags = 0; + if (templ->target == PIPE_BUFFER) { + VkBufferCreateInfo bci = {}; + bci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bci.size = templ->width0; + + bci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + if (templ->bind & PIPE_BIND_VERTEX_BUFFER) + bci.usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + + if (templ->bind & PIPE_BIND_INDEX_BUFFER) + bci.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + + if (templ->bind & PIPE_BIND_CONSTANT_BUFFER) + bci.usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + + if (templ->bind & PIPE_BIND_SHADER_BUFFER) + bci.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + + if (templ->bind & PIPE_BIND_COMMAND_ARGS_BUFFER) + bci.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; + + if (vkCreateBuffer(screen->dev, &bci, NULL, &res->buffer) != + VK_SUCCESS) { + FREE(res); + return NULL; + } + + vkGetBufferMemoryRequirements(screen->dev, res->buffer, &reqs); + flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + } else { + res->format = zink_get_format(screen, templ->format); + + VkImageCreateInfo ici = {}; + ici.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + ici.flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + + switch (templ->target) { + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + ici.imageType = VK_IMAGE_TYPE_1D; + break; + + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + ici.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; + /* fall-through */ + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + ici.imageType = VK_IMAGE_TYPE_2D; + break; + + case PIPE_TEXTURE_3D: + ici.imageType = VK_IMAGE_TYPE_3D; + if (templ->bind & PIPE_BIND_RENDER_TARGET) + ici.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; + break; + + case PIPE_BUFFER: + unreachable("PIPE_BUFFER should already be handled"); + + default: + unreachable("Unknown target"); + } + + ici.format = res->format; + ici.extent.width = templ->width0; + ici.extent.height = templ->height0; + ici.extent.depth = templ->depth0; + ici.mipLevels = templ->last_level + 1; + ici.arrayLayers = templ->array_size; + ici.samples = templ->nr_samples ? templ->nr_samples : VK_SAMPLE_COUNT_1_BIT; + ici.tiling = templ->bind & PIPE_BIND_LINEAR ? VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL; + + if (templ->target == PIPE_TEXTURE_CUBE || + templ->target == PIPE_TEXTURE_CUBE_ARRAY) + ici.arrayLayers *= 6; + + if (templ->bind & (PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED)) { + // assert(ici.tiling == VK_IMAGE_TILING_LINEAR); + ici.tiling = VK_IMAGE_TILING_LINEAR; + } + + if (templ->usage == PIPE_USAGE_STAGING) + ici.tiling = VK_IMAGE_TILING_LINEAR; + + /* sadly, gallium doesn't let us know if it'll ever need this, so we have to assume */ + ici.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT; + + if (templ->bind & PIPE_BIND_SHADER_IMAGE) + ici.usage |= VK_IMAGE_USAGE_STORAGE_BIT; + + if (templ->bind & PIPE_BIND_RENDER_TARGET) + ici.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + if (templ->bind & PIPE_BIND_DEPTH_STENCIL) + ici.usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + ici.usage |= VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT; + + if (templ->bind & PIPE_BIND_STREAM_OUTPUT) + ici.usage |= VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT; + + ici.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + ici.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + res->layout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkResult result = vkCreateImage(screen->dev, &ici, NULL, &res->image); + if (result != VK_SUCCESS) { + FREE(res); + return NULL; + } + + res->optimial_tiling = ici.tiling != VK_IMAGE_TILING_LINEAR; + res->aspect = aspect_from_format(templ->format); + + vkGetImageMemoryRequirements(screen->dev, res->image, &reqs); + if (templ->usage == PIPE_USAGE_STAGING || (screen->winsys && (templ->bind & (PIPE_BIND_SCANOUT|PIPE_BIND_DISPLAY_TARGET|PIPE_BIND_SHARED)))) + flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + else + flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + } + + VkMemoryAllocateInfo mai = {}; + mai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + mai.allocationSize = reqs.size; + mai.memoryTypeIndex = get_memory_type_index(screen, &reqs, flags); + + VkExportMemoryAllocateInfo emai = {}; + if (templ->bind & PIPE_BIND_SHARED) { + emai.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; + emai.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + mai.pNext = &emai; + } + + VkImportMemoryFdInfoKHR imfi = { + VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + NULL, + }; + + if (whandle && whandle->type == WINSYS_HANDLE_TYPE_FD) { + imfi.sType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + imfi.pNext = NULL; + imfi.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + imfi.fd = whandle->handle; + + emai.pNext = &imfi; + } + + if (vkAllocateMemory(screen->dev, &mai, NULL, &res->mem) != VK_SUCCESS) + goto fail; + + res->offset = 0; + res->size = reqs.size; + + if (templ->target == PIPE_BUFFER) + vkBindBufferMemory(screen->dev, res->buffer, res->mem, res->offset); + else + vkBindImageMemory(screen->dev, res->image, res->mem, res->offset); + + if (screen->winsys && (templ->bind & (PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED))) { + struct sw_winsys *winsys = screen->winsys; + res->dt = winsys->displaytarget_create(screen->winsys, + res->base.bind, + res->base.format, + templ->width0, + templ->height0, + 64, NULL, + &res->dt_stride); + } + + return &res->base; + +fail: + if (templ->target == PIPE_BUFFER) + vkDestroyBuffer(screen->dev, res->buffer, NULL); + else + vkDestroyImage(screen->dev, res->image, NULL); + + FREE(res); + + return NULL; +} + +static struct pipe_resource * +zink_resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *templ) +{ + return resource_create(pscreen, templ, NULL, 0); +} + +static bool +zink_resource_get_handle(struct pipe_screen *pscreen, + struct pipe_context *context, + struct pipe_resource *tex, + struct winsys_handle *whandle, + unsigned usage) +{ + struct zink_resource *res = zink_resource(tex); + struct zink_screen *screen = zink_screen(pscreen); + VkMemoryGetFdInfoKHR fd_info = {}; + int fd; + + if (res->base.target != PIPE_BUFFER) { + VkImageSubresource sub_res = {}; + VkSubresourceLayout sub_res_layout = {}; + + sub_res.aspectMask = res->aspect; + + vkGetImageSubresourceLayout(screen->dev, res->image, &sub_res, &sub_res_layout); + + whandle->stride = sub_res_layout.rowPitch; + } + + if (whandle->type == WINSYS_HANDLE_TYPE_FD) { + + if (!screen->vk_GetMemoryFdKHR) + screen->vk_GetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(screen->dev, "vkGetMemoryFdKHR"); + if (!screen->vk_GetMemoryFdKHR) + return false; + fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + fd_info.memory = res->mem; + fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + VkResult result = (*screen->vk_GetMemoryFdKHR)(screen->dev, &fd_info, &fd); + if (result != VK_SUCCESS) + return false; + whandle->handle = fd; + } + return true; +} + +static struct pipe_resource * +zink_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templ, + struct winsys_handle *whandle, + unsigned usage) +{ + return resource_create(pscreen, templ, whandle, usage); +} + +void +zink_screen_resource_init(struct pipe_screen *pscreen) +{ + pscreen->resource_create = zink_resource_create; + pscreen->resource_destroy = zink_resource_destroy; + + if (zink_screen(pscreen)->have_KHR_external_memory_fd) { + pscreen->resource_get_handle = zink_resource_get_handle; + pscreen->resource_from_handle = zink_resource_from_handle; + } +} + +static bool +zink_transfer_copy_bufimage(struct zink_context *ctx, + struct zink_resource *res, + struct zink_resource *staging_res, + struct zink_transfer *trans, + bool buf2img) +{ + struct zink_batch *batch = zink_batch_no_rp(ctx); + + if (buf2img) { + if (res->layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + } + } else { + if (res->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) { + zink_resource_barrier(batch->cmdbuf, res, res->aspect, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + } + } + + VkBufferImageCopy copyRegion = {}; + copyRegion.bufferOffset = staging_res->offset; + copyRegion.bufferRowLength = 0; + copyRegion.bufferImageHeight = 0; + copyRegion.imageSubresource.mipLevel = trans->base.level; + copyRegion.imageSubresource.layerCount = 1; + if (res->base.array_size > 1) { + copyRegion.imageSubresource.baseArrayLayer = trans->base.box.z; + copyRegion.imageSubresource.layerCount = trans->base.box.depth; + copyRegion.imageExtent.depth = 1; + } else { + copyRegion.imageOffset.z = trans->base.box.z; + copyRegion.imageExtent.depth = trans->base.box.depth; + } + copyRegion.imageOffset.x = trans->base.box.x; + copyRegion.imageOffset.y = trans->base.box.y; + + copyRegion.imageExtent.width = trans->base.box.width; + copyRegion.imageExtent.height = trans->base.box.height; + + zink_batch_reference_resoure(batch, res); + zink_batch_reference_resoure(batch, staging_res); + + unsigned aspects = res->aspect; + while (aspects) { + int aspect = 1 << u_bit_scan(&aspects); + copyRegion.imageSubresource.aspectMask = aspect; + + if (buf2img) + vkCmdCopyBufferToImage(batch->cmdbuf, staging_res->buffer, res->image, res->layout, 1, ©Region); + else + vkCmdCopyImageToBuffer(batch->cmdbuf, res->image, res->layout, staging_res->buffer, 1, ©Region); + } + + return true; +} + +static void * +zink_transfer_map(struct pipe_context *pctx, + struct pipe_resource *pres, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **transfer) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_resource *res = zink_resource(pres); + + struct zink_transfer *trans = slab_alloc(&ctx->transfer_pool); + if (!trans) + return NULL; + + memset(trans, 0, sizeof(*trans)); + pipe_resource_reference(&trans->base.resource, pres); + + trans->base.resource = pres; + trans->base.level = level; + trans->base.usage = usage; + trans->base.box = *box; + + void *ptr; + if (pres->target == PIPE_BUFFER) { + VkResult result = vkMapMemory(screen->dev, res->mem, res->offset, res->size, 0, &ptr); + if (result != VK_SUCCESS) + return NULL; + + trans->base.stride = 0; + trans->base.layer_stride = 0; + ptr = ((uint8_t *)ptr) + box->x; + } else { + if (res->optimial_tiling || ((res->base.usage != PIPE_USAGE_STAGING))) { + trans->base.stride = util_format_get_stride(pres->format, box->width); + trans->base.layer_stride = util_format_get_2d_size(pres->format, + trans->base.stride, + box->height); + + struct pipe_resource templ = *pres; + templ.usage = PIPE_USAGE_STAGING; + templ.target = PIPE_BUFFER; + templ.bind = 0; + templ.width0 = trans->base.layer_stride * box->depth; + templ.height0 = templ.depth0 = 0; + templ.last_level = 0; + templ.array_size = 1; + templ.flags = 0; + + trans->staging_res = zink_resource_create(pctx->screen, &templ); + if (!trans->staging_res) + return NULL; + + struct zink_resource *staging_res = zink_resource(trans->staging_res); + + if (usage & PIPE_TRANSFER_READ) { + struct zink_context *ctx = zink_context(pctx); + bool ret = zink_transfer_copy_bufimage(ctx, res, + staging_res, trans, + false); + if (ret == false) + return NULL; + + /* need to wait for rendering to finish */ + struct pipe_fence_handle *fence = NULL; + pctx->flush(pctx, &fence, PIPE_FLUSH_HINT_FINISH); + if (fence) { + pctx->screen->fence_finish(pctx->screen, NULL, fence, + PIPE_TIMEOUT_INFINITE); + pctx->screen->fence_reference(pctx->screen, &fence, NULL); + } + } + + VkResult result = vkMapMemory(screen->dev, staging_res->mem, + staging_res->offset, + staging_res->size, 0, &ptr); + if (result != VK_SUCCESS) + return NULL; + + } else { + assert(!res->optimial_tiling); + VkResult result = vkMapMemory(screen->dev, res->mem, res->offset, res->size, 0, &ptr); + if (result != VK_SUCCESS) + return NULL; + VkImageSubresource isr = { + res->aspect, + level, + 0 + }; + VkSubresourceLayout srl; + vkGetImageSubresourceLayout(screen->dev, res->image, &isr, &srl); + trans->base.stride = srl.rowPitch; + trans->base.layer_stride = srl.arrayPitch; + ptr = ((uint8_t *)ptr) + box->z * srl.depthPitch + + box->y * srl.rowPitch + + box->x; + } + } + + *transfer = &trans->base; + return ptr; +} + +static void +zink_transfer_unmap(struct pipe_context *pctx, + struct pipe_transfer *ptrans) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_resource *res = zink_resource(ptrans->resource); + struct zink_transfer *trans = (struct zink_transfer *)ptrans; + if (trans->staging_res) { + struct zink_resource *staging_res = zink_resource(trans->staging_res); + vkUnmapMemory(screen->dev, staging_res->mem); + + if (trans->base.usage & PIPE_TRANSFER_WRITE) { + struct zink_context *ctx = zink_context(pctx); + + zink_transfer_copy_bufimage(ctx, res, staging_res, trans, true); + } + + pipe_resource_reference(&trans->staging_res, NULL); + } else + vkUnmapMemory(screen->dev, res->mem); + + pipe_resource_reference(&trans->base.resource, NULL); + slab_free(&ctx->transfer_pool, ptrans); +} + +void +zink_context_resource_init(struct pipe_context *pctx) +{ + pctx->transfer_map = zink_transfer_map; + pctx->transfer_unmap = zink_transfer_unmap; + + pctx->transfer_flush_region = u_default_transfer_flush_region; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->texture_subdata = u_default_texture_subdata; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_resource.h mesa-20.0.8/src/gallium/drivers/zink/zink_resource.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_resource.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_resource.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,71 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_RESOURCE_H +#define ZINK_RESOURCE_H + +struct pipe_screen; +struct sw_displaytarget; + +#include "util/u_transfer.h" + +#include + +struct zink_resource { + struct pipe_resource base; + + union { + VkBuffer buffer; + struct { + VkFormat format; + VkImage image; + VkImageLayout layout; + VkImageAspectFlags aspect; + bool optimial_tiling; + }; + }; + VkDeviceMemory mem; + VkDeviceSize offset, size; + + struct sw_displaytarget *dt; + unsigned dt_stride; +}; + +struct zink_transfer { + struct pipe_transfer base; + struct pipe_resource *staging_res; +}; + +static inline struct zink_resource * +zink_resource(struct pipe_resource *r) +{ + return (struct zink_resource *)r; +} + +void +zink_screen_resource_init(struct pipe_screen *pscreen); + +void +zink_context_resource_init(struct pipe_context *pctx); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_screen.c mesa-20.0.8/src/gallium/drivers/zink/zink_screen.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_screen.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,808 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_screen.h" + +#include "zink_compiler.h" +#include "zink_context.h" +#include "zink_fence.h" +#include "zink_public.h" +#include "zink_resource.h" + +#include "os/os_process.h" +#include "util/u_debug.h" +#include "util/format/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_screen.h" +#include "util/u_string.h" + +#include "state_tracker/sw_winsys.h" + +static const struct debug_named_value +debug_options[] = { + { "nir", ZINK_DEBUG_NIR, "Dump NIR during program compile" }, + { "spirv", ZINK_DEBUG_SPIRV, "Dump SPIR-V during program compile" }, + { "tgsi", ZINK_DEBUG_TGSI, "Dump TGSI during program compile" }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(zink_debug, "ZINK_DEBUG", debug_options, 0) + +uint32_t +zink_debug; + +static const char * +zink_get_vendor(struct pipe_screen *pscreen) +{ + return "Collabora Ltd"; +} + +static const char * +zink_get_device_vendor(struct pipe_screen *pscreen) +{ + struct zink_screen *screen = zink_screen(pscreen); + static char buf[1000]; + snprintf(buf, sizeof(buf), "Unknown (vendor-id: 0x%04x)", screen->props.vendorID); + return buf; +} + +static const char * +zink_get_name(struct pipe_screen *pscreen) +{ + struct zink_screen *screen = zink_screen(pscreen); + static char buf[1000]; + snprintf(buf, sizeof(buf), "zink (%s)", screen->props.deviceName); + return buf; +} + +static int +get_video_mem(struct zink_screen *screen) +{ + VkDeviceSize size = 0; + for (uint32_t i = 0; i < screen->mem_props.memoryHeapCount; ++i) + size += screen->mem_props.memoryHeaps[i].size; + return (int)(size >> 20); +} + +static int +zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) +{ + struct zink_screen *screen = zink_screen(pscreen); + + switch (param) { + case PIPE_CAP_NPOT_TEXTURES: + return 1; + + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + return screen->props.limits.maxFragmentDualSrcAttachments; + + case PIPE_CAP_POINT_SPRITE: + return 1; + + case PIPE_CAP_MAX_RENDER_TARGETS: + return screen->props.limits.maxColorAttachments; + + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_QUERY_TIME_ELAPSED: + return 1; +#endif + + case PIPE_CAP_TEXTURE_SWIZZLE: + return 1; + + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + return screen->props.limits.maxImageDimension2D; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 1 + util_logbase2(screen->props.limits.maxImageDimension3D); + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 1 + util_logbase2(screen->props.limits.maxImageDimensionCube); + + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + return 1; + + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + return 1; + + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return screen->props.limits.maxImageArrayLayers; + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_DEPTH_CLIP_DISABLE: + return 0; +#endif + + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + return 1; + + case PIPE_CAP_SEAMLESS_CUBE_MAP: + return 1; + + case PIPE_CAP_MIN_TEXEL_OFFSET: + return screen->props.limits.minTexelOffset; + case PIPE_CAP_MAX_TEXEL_OFFSET: + return screen->props.limits.maxTexelOffset; + + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + return 1; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + return 120; + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_COMPUTE: + return 1; +#endif + + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return screen->props.limits.minUniformBufferOffsetAlignment; + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_QUERY_TIMESTAMP: + return 1; +#endif + + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return screen->props.limits.minMemoryMapAlignment; + + case PIPE_CAP_CUBE_MAP_ARRAY: + return screen->feats.imageCubeArray; + + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return 0; /* unsure */ + + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + return screen->props.limits.maxTexelBufferElements; + + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_NATIVE; /* unsure */ + + case PIPE_CAP_MAX_VIEWPORTS: + return screen->props.limits.maxViewports; + + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + return 1; + + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + return screen->props.limits.maxGeometryOutputVertices; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return screen->props.limits.maxGeometryOutputComponents; + +#if 0 /* TODO: Enable me. Enables ARB_texture_gather */ + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + return 4; +#endif + + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + return screen->props.limits.minTexelGatherOffset; + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + return screen->props.limits.maxTexelGatherOffset; + + case PIPE_CAP_VENDOR_ID: + return screen->props.vendorID; + case PIPE_CAP_DEVICE_ID: + return screen->props.deviceID; + + case PIPE_CAP_ACCELERATED: + return 1; + case PIPE_CAP_VIDEO_MEMORY: + return get_video_mem(screen); + case PIPE_CAP_UMA: + return screen->props.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; + + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return screen->props.limits.maxVertexInputBindingStride; + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 1; +#endif + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_CLIP_HALFZ: + return 1; +#endif + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + return 1; +#endif + + case PIPE_CAP_SHAREABLE_SHADERS: + return 1; + +#if 0 /* TODO: Enable me. Enables GL_ARB_shader_storage_buffer_object */ + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + return screen->props.limits.minStorageBufferOffsetAlignment; +#endif + + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: + return 0; /* TODO: figure these out */ + +#if 0 /* TODO: Enable me */ + case PIPE_CAP_CULL_DISTANCE: + return screen->feats.shaderCullDistance; +#endif + + case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: + return screen->props.limits.viewportSubPixelBits; + + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + return 0; /* not sure */ + + case PIPE_CAP_MAX_GS_INVOCATIONS: + return 0; /* not implemented */ + + case PIPE_CAP_MAX_COMBINED_SHADER_BUFFERS: + return screen->props.limits.maxDescriptorSetStorageBuffers; + + case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: + return screen->props.limits.maxStorageBufferRange; /* unsure */ + + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + return 1; + + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + return 0; + + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + return 0; + + case PIPE_CAP_NIR_COMPACT_ARRAYS: + return 1; + + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + return 1; + + case PIPE_CAP_FLATSHADE: + case PIPE_CAP_ALPHA_TEST: + case PIPE_CAP_CLIP_PLANES: + case PIPE_CAP_POINT_SIZE_FIXED: + case PIPE_CAP_TWO_SIDED_COLOR: + return 0; + + case PIPE_CAP_DMABUF: + return screen->have_KHR_external_memory_fd; + + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } +} + +static float +zink_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) +{ + struct zink_screen *screen = zink_screen(pscreen); + + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + return screen->props.limits.lineWidthRange[1]; + + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return screen->props.limits.pointSizeRange[1]; + + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return screen->props.limits.maxSamplerAnisotropy; + + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return screen->props.limits.maxSamplerLodBias; + + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; /* not implemented */ + } + + /* should only get here on unhandled cases */ + return 0.0; +} + +static int +zink_get_shader_param(struct pipe_screen *pscreen, + enum pipe_shader_type shader, + enum pipe_shader_cap param) +{ + struct zink_screen *screen = zink_screen(pscreen); + + switch (param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_FRAGMENT) + return INT_MAX; + return 0; + + case PIPE_SHADER_CAP_MAX_INPUTS: + switch (shader) { + case PIPE_SHADER_VERTEX: + return MIN2(screen->props.limits.maxVertexInputAttributes, + PIPE_MAX_SHADER_INPUTS); + case PIPE_SHADER_FRAGMENT: + return MIN2(screen->props.limits.maxFragmentInputComponents / 4, + PIPE_MAX_SHADER_INPUTS); + default: + return 0; /* unsupported stage */ + } + + case PIPE_SHADER_CAP_MAX_OUTPUTS: + switch (shader) { + case PIPE_SHADER_VERTEX: + return MIN2(screen->props.limits.maxVertexOutputComponents / 4, + PIPE_MAX_SHADER_OUTPUTS); + case PIPE_SHADER_FRAGMENT: + return MIN2(screen->props.limits.maxColorAttachments, + PIPE_MAX_SHADER_OUTPUTS); + default: + return 0; /* unsupported stage */ + } + + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + /* this might be a bit simplistic... */ + return MIN2(screen->props.limits.maxPerStageDescriptorSamplers, + PIPE_MAX_SAMPLERS); + + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + return MIN2(screen->props.limits.maxUniformBufferRange, INT_MAX); + + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return screen->props.limits.maxPerStageDescriptorUniformBuffers; + + case PIPE_SHADER_CAP_MAX_TEMPS: + return INT_MAX; + + case PIPE_SHADER_CAP_INTEGERS: + return 1; + + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_FP16: + return 0; /* not implemented */ + + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + return 0; /* not implemented */ + + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return MIN2(screen->props.limits.maxPerStageDescriptorSampledImages, + PIPE_MAX_SHADER_SAMPLER_VIEWS); + + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + return 0; /* not implemented */ + + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + return 0; /* no idea */ + + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; /* arbitrary */ + + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + /* TODO: this limitation is dumb, and will need some fixes in mesa */ + return MIN2(screen->props.limits.maxPerStageDescriptorStorageBuffers, 8); + + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); + + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return MIN2(screen->props.limits.maxPerStageDescriptorStorageImages, + PIPE_MAX_SHADER_IMAGES); + + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + return 0; /* unsure */ + + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + return 0; /* not implemented */ + } + + /* should only get here on unhandled cases */ + return 0; +} + +static VkSampleCountFlagBits +vk_sample_count_flags(uint32_t sample_count) +{ + switch (sample_count) { + case 1: return VK_SAMPLE_COUNT_1_BIT; + case 2: return VK_SAMPLE_COUNT_2_BIT; + case 4: return VK_SAMPLE_COUNT_4_BIT; + case 8: return VK_SAMPLE_COUNT_8_BIT; + case 16: return VK_SAMPLE_COUNT_16_BIT; + case 32: return VK_SAMPLE_COUNT_32_BIT; + case 64: return VK_SAMPLE_COUNT_64_BIT; + default: + return 0; + } +} + +static bool +zink_is_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned bind) +{ + struct zink_screen *screen = zink_screen(pscreen); + + if (format == PIPE_FORMAT_NONE) + return screen->props.limits.framebufferNoAttachmentsSampleCounts & + vk_sample_count_flags(sample_count); + + VkFormat vkformat = zink_get_format(screen, format); + if (vkformat == VK_FORMAT_UNDEFINED) + return false; + + if (sample_count >= 1) { + VkSampleCountFlagBits sample_mask = vk_sample_count_flags(sample_count); + if (!sample_mask) + return false; + const struct util_format_description *desc = util_format_description(format); + if (util_format_is_depth_or_stencil(format)) { + if (util_format_has_depth(desc)) { + if (bind & PIPE_BIND_DEPTH_STENCIL && + (screen->props.limits.framebufferDepthSampleCounts & sample_mask) != sample_mask) + return false; + if (bind & PIPE_BIND_SAMPLER_VIEW && + (screen->props.limits.sampledImageDepthSampleCounts & sample_mask) != sample_mask) + return false; + } + if (util_format_has_stencil(desc)) { + if (bind & PIPE_BIND_DEPTH_STENCIL && + (screen->props.limits.framebufferStencilSampleCounts & sample_mask) != sample_mask) + return false; + if (bind & PIPE_BIND_SAMPLER_VIEW && + (screen->props.limits.sampledImageStencilSampleCounts & sample_mask) != sample_mask) + return false; + } + } else if (util_format_is_pure_integer(format)) { + if (bind & PIPE_BIND_RENDER_TARGET && + !(screen->props.limits.framebufferColorSampleCounts & sample_mask)) + return false; + if (bind & PIPE_BIND_SAMPLER_VIEW && + !(screen->props.limits.sampledImageIntegerSampleCounts & sample_mask)) + return false; + } else { + if (bind & PIPE_BIND_RENDER_TARGET && + !(screen->props.limits.framebufferColorSampleCounts & sample_mask)) + return false; + if (bind & PIPE_BIND_SAMPLER_VIEW && + !(screen->props.limits.sampledImageColorSampleCounts & sample_mask)) + return false; + } + } + + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(screen->pdev, vkformat, &props); + + if (target == PIPE_BUFFER) { + if (bind & PIPE_BIND_VERTEX_BUFFER && + !(props.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + return false; + } else { + /* all other targets are texture-targets */ + if (bind & PIPE_BIND_RENDER_TARGET && + !(props.optimalTilingFeatures & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) + return false; + + if (bind & PIPE_BIND_BLENDABLE && + !(props.optimalTilingFeatures & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT)) + return false; + + if (bind & PIPE_BIND_SAMPLER_VIEW && + !(props.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + return false; + + if (bind & PIPE_BIND_DEPTH_STENCIL && + !(props.optimalTilingFeatures & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) + return false; + } + + if (util_format_is_compressed(format)) { + const struct util_format_description *desc = util_format_description(format); + if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC && + !screen->feats.textureCompressionBC) + return false; + } + + return true; +} + +static void +zink_destroy_screen(struct pipe_screen *pscreen) +{ + struct zink_screen *screen = zink_screen(pscreen); + slab_destroy_parent(&screen->transfer_pool); + FREE(screen); +} + +static VkInstance +create_instance() +{ + VkApplicationInfo ai = {}; + ai.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + + char proc_name[128]; + if (os_get_process_name(proc_name, ARRAY_SIZE(proc_name))) + ai.pApplicationName = proc_name; + else + ai.pApplicationName = "unknown"; + + ai.pEngineName = "mesa zink"; + ai.apiVersion = VK_API_VERSION_1_0; + + const char *extensions[] = { + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME, + }; + + VkInstanceCreateInfo ici = {}; + ici.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + ici.pApplicationInfo = &ai; + ici.ppEnabledExtensionNames = extensions; + ici.enabledExtensionCount = ARRAY_SIZE(extensions); + + VkInstance instance = VK_NULL_HANDLE; + VkResult err = vkCreateInstance(&ici, NULL, &instance); + if (err != VK_SUCCESS) + return VK_NULL_HANDLE; + + return instance; +} + +static VkPhysicalDevice +choose_pdev(const VkInstance instance) +{ + uint32_t i, pdev_count; + VkPhysicalDevice *pdevs, pdev; + vkEnumeratePhysicalDevices(instance, &pdev_count, NULL); + assert(pdev_count > 0); + + pdevs = malloc(sizeof(*pdevs) * pdev_count); + vkEnumeratePhysicalDevices(instance, &pdev_count, pdevs); + assert(pdev_count > 0); + + pdev = pdevs[0]; + for (i = 0; i < pdev_count; ++i) { + VkPhysicalDeviceProperties props; + vkGetPhysicalDeviceProperties(pdevs[i], &props); + if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) { + pdev = pdevs[i]; + break; + } + } + free(pdevs); + return pdev; +} + +static uint32_t +find_gfx_queue(const VkPhysicalDevice pdev) +{ + uint32_t num_queues; + vkGetPhysicalDeviceQueueFamilyProperties(pdev, &num_queues, NULL); + assert(num_queues > 0); + + VkQueueFamilyProperties *props = malloc(sizeof(*props) * num_queues); + vkGetPhysicalDeviceQueueFamilyProperties(pdev, &num_queues, props); + + for (uint32_t i = 0; i < num_queues; i++) { + if (props[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + free(props); + return i; + } + } + + return UINT32_MAX; +} + +static void +zink_flush_frontbuffer(struct pipe_screen *pscreen, + struct pipe_resource *pres, + unsigned level, unsigned layer, + void *winsys_drawable_handle, + struct pipe_box *sub_box) +{ + struct zink_screen *screen = zink_screen(pscreen); + struct sw_winsys *winsys = screen->winsys; + struct zink_resource *res = zink_resource(pres); + + if (!winsys) + return; + void *map = winsys->displaytarget_map(winsys, res->dt, 0); + + if (map) { + VkImageSubresource isr = {}; + isr.aspectMask = res->aspect; + isr.mipLevel = level; + isr.arrayLayer = layer; + VkSubresourceLayout layout; + vkGetImageSubresourceLayout(screen->dev, res->image, &isr, &layout); + + void *ptr; + VkResult result = vkMapMemory(screen->dev, res->mem, res->offset, res->size, 0, &ptr); + if (result != VK_SUCCESS) { + debug_printf("failed to map memory for display\n"); + return; + } + for (int i = 0; i < pres->height0; ++i) { + uint8_t *src = (uint8_t *)ptr + i * layout.rowPitch; + uint8_t *dst = (uint8_t *)map + i * res->dt_stride; + memcpy(dst, src, res->dt_stride); + } + vkUnmapMemory(screen->dev, res->mem); + } + + winsys->displaytarget_unmap(winsys, res->dt); + + assert(res->dt); + if (res->dt) + winsys->displaytarget_display(winsys, res->dt, winsys_drawable_handle, sub_box); +} + +static struct pipe_screen * +zink_internal_create_screen(struct sw_winsys *winsys, int fd) +{ + struct zink_screen *screen = CALLOC_STRUCT(zink_screen); + if (!screen) + return NULL; + + zink_debug = debug_get_option_zink_debug(); + + screen->instance = create_instance(); + screen->pdev = choose_pdev(screen->instance); + screen->gfx_queue = find_gfx_queue(screen->pdev); + + vkGetPhysicalDeviceProperties(screen->pdev, &screen->props); + vkGetPhysicalDeviceFeatures(screen->pdev, &screen->feats); + vkGetPhysicalDeviceMemoryProperties(screen->pdev, &screen->mem_props); + + screen->have_X8_D24_UNORM_PACK32 = zink_is_depth_format_supported(screen, + VK_FORMAT_X8_D24_UNORM_PACK32); + screen->have_D24_UNORM_S8_UINT = zink_is_depth_format_supported(screen, + VK_FORMAT_D24_UNORM_S8_UINT); + + uint32_t num_extensions = 0; + if (vkEnumerateDeviceExtensionProperties(screen->pdev, NULL, + &num_extensions, NULL) == VK_SUCCESS && num_extensions > 0) { + VkExtensionProperties *extensions = MALLOC(sizeof(VkExtensionProperties) * + num_extensions); + if (extensions) { + vkEnumerateDeviceExtensionProperties(screen->pdev, NULL, + &num_extensions, extensions); + + for (uint32_t i = 0; i < num_extensions; ++i) { + if (!strcmp(extensions[i].extensionName, + VK_KHR_MAINTENANCE1_EXTENSION_NAME)) + screen->have_KHR_maintenance1 = true; + if (!strcmp(extensions[i].extensionName, + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME)) + screen->have_KHR_external_memory_fd = true; + } + FREE(extensions); + } + } + + if (!screen->have_KHR_maintenance1) { + debug_printf("ZINK: VK_KHR_maintenance1 required!\n"); + goto fail; + } + + VkDeviceQueueCreateInfo qci = {}; + float dummy = 0.0f; + qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + qci.queueFamilyIndex = screen->gfx_queue; + qci.queueCount = 1; + qci.pQueuePriorities = &dummy; + + VkDeviceCreateInfo dci = {}; + dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + dci.queueCreateInfoCount = 1; + dci.pQueueCreateInfos = &qci; + dci.pEnabledFeatures = &screen->feats; + const char *extensions[3] = { + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + }; + num_extensions = 1; + + if (fd >= 0 && !screen->have_KHR_external_memory_fd) { + debug_printf("ZINK: KHR_external_memory_fd required!\n"); + goto fail; + } + + if (screen->have_KHR_external_memory_fd) { + extensions[num_extensions++] = VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME; + extensions[num_extensions++] = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME; + } + assert(num_extensions <= ARRAY_SIZE(extensions)); + + dci.ppEnabledExtensionNames = extensions; + dci.enabledExtensionCount = num_extensions; + if (vkCreateDevice(screen->pdev, &dci, NULL, &screen->dev) != VK_SUCCESS) + goto fail; + + screen->winsys = winsys; + + screen->base.get_name = zink_get_name; + screen->base.get_vendor = zink_get_vendor; + screen->base.get_device_vendor = zink_get_device_vendor; + screen->base.get_param = zink_get_param; + screen->base.get_paramf = zink_get_paramf; + screen->base.get_shader_param = zink_get_shader_param; + screen->base.get_compiler_options = zink_get_compiler_options; + screen->base.is_format_supported = zink_is_format_supported; + screen->base.context_create = zink_context_create; + screen->base.flush_frontbuffer = zink_flush_frontbuffer; + screen->base.destroy = zink_destroy_screen; + + zink_screen_resource_init(&screen->base); + zink_screen_fence_init(&screen->base); + + slab_create_parent(&screen->transfer_pool, sizeof(struct zink_transfer), 16); + + return &screen->base; + +fail: + FREE(screen); + return NULL; +} + +struct pipe_screen * +zink_create_screen(struct sw_winsys *winsys) +{ + return zink_internal_create_screen(winsys, -1); +} + +struct pipe_screen * +zink_drm_create_screen(int fd) +{ + return zink_internal_create_screen(NULL, fd); +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_screen.h mesa-20.0.8/src/gallium/drivers/zink/zink_screen.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_screen.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,76 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_SCREEN_H +#define ZINK_SCREEN_H + +#include "pipe/p_screen.h" +#include "util/slab.h" + +#include + +extern uint32_t zink_debug; + +#define ZINK_DEBUG_NIR 0x1 +#define ZINK_DEBUG_SPIRV 0x2 +#define ZINK_DEBUG_TGSI 0x4 + +struct zink_screen { + struct pipe_screen base; + + struct sw_winsys *winsys; + + struct slab_parent_pool transfer_pool; + + VkInstance instance; + VkPhysicalDevice pdev; + + VkPhysicalDeviceProperties props; + VkPhysicalDeviceFeatures feats; + VkPhysicalDeviceMemoryProperties mem_props; + + bool have_KHR_maintenance1; + bool have_KHR_external_memory_fd; + + bool have_X8_D24_UNORM_PACK32; + bool have_D24_UNORM_S8_UINT; + + uint32_t gfx_queue; + VkDevice dev; + + PFN_vkGetMemoryFdKHR vk_GetMemoryFdKHR; +}; + +static inline struct zink_screen * +zink_screen(struct pipe_screen *pipe) +{ + return (struct zink_screen *)pipe; +} + +VkFormat +zink_get_format(struct zink_screen *screen, enum pipe_format format); + +bool +zink_is_depth_format_supported(struct zink_screen *screen, VkFormat format); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_state.c mesa-20.0.8/src/gallium/drivers/zink/zink_state.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_state.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,455 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_state.h" + +#include "zink_context.h" +#include "zink_screen.h" + +#include "util/u_memory.h" + +#include + +static void * +zink_create_vertex_elements_state(struct pipe_context *pctx, + unsigned num_elements, + const struct pipe_vertex_element *elements) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + unsigned int i; + struct zink_vertex_elements_state *ves = CALLOC_STRUCT(zink_vertex_elements_state); + if (!ves) + return NULL; + + int buffer_map[PIPE_MAX_ATTRIBS]; + for (int i = 0; i < ARRAY_SIZE(buffer_map); ++i) + buffer_map[i] = -1; + + int num_bindings = 0; + for (i = 0; i < num_elements; ++i) { + const struct pipe_vertex_element *elem = elements + i; + assert(!elem->instance_divisor); + + int binding = elem->vertex_buffer_index; + if (buffer_map[binding] < 0) { + ves->binding_map[num_bindings] = binding; + buffer_map[binding] = num_bindings++; + } + binding = buffer_map[binding]; + + + ves->bindings[binding].binding = binding; + ves->bindings[binding].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + ves->hw_state.attribs[i].binding = binding; + ves->hw_state.attribs[i].location = i; // TODO: unsure + ves->hw_state.attribs[i].format = zink_get_format(screen, + elem->src_format); + assert(ves->hw_state.attribs[i].format != VK_FORMAT_UNDEFINED); + ves->hw_state.attribs[i].offset = elem->src_offset; + } + + ves->hw_state.num_bindings = num_bindings; + ves->hw_state.num_attribs = num_elements; + return ves; +} + +static void +zink_bind_vertex_elements_state(struct pipe_context *pctx, + void *cso) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_gfx_pipeline_state *state = &ctx->gfx_pipeline_state; + ctx->element_state = cso; + if (cso) { + state->element_state = &ctx->element_state->hw_state; + struct zink_vertex_elements_state *ves = cso; + for (int i = 0; i < state->element_state->num_bindings; ++i) { + state->bindings[i].binding = ves->bindings[i].binding; + state->bindings[i].inputRate = ves->bindings[i].inputRate; + } + } else + state->element_state = NULL; +} + +static void +zink_delete_vertex_elements_state(struct pipe_context *pctx, + void *ves) +{ +} + +static VkBlendFactor +blend_factor(enum pipe_blendfactor factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ONE: return VK_BLEND_FACTOR_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: return VK_BLEND_FACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: return VK_BLEND_FACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: return VK_BLEND_FACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: return VK_BLEND_FACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return VK_BLEND_FACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: return VK_BLEND_FACTOR_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: return VK_BLEND_FACTOR_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: return VK_BLEND_FACTOR_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: return VK_BLEND_FACTOR_SRC1_ALPHA; + + case PIPE_BLENDFACTOR_ZERO: return VK_BLEND_FACTOR_ZERO; + + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR; + + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA; + } + unreachable("unexpected blend factor"); +} + + +static bool +need_blend_constants(enum pipe_blendfactor factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_CONST_ALPHA: + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return true; + + default: + return false; + } +} + +static VkBlendOp +blend_op(enum pipe_blend_func func) +{ + switch (func) { + case PIPE_BLEND_ADD: return VK_BLEND_OP_ADD; + case PIPE_BLEND_SUBTRACT: return VK_BLEND_OP_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: return VK_BLEND_OP_REVERSE_SUBTRACT; + case PIPE_BLEND_MIN: return VK_BLEND_OP_MIN; + case PIPE_BLEND_MAX: return VK_BLEND_OP_MAX; + } + unreachable("unexpected blend function"); +} + +static VkLogicOp +logic_op(enum pipe_logicop func) +{ + switch (func) { + case PIPE_LOGICOP_CLEAR: return VK_LOGIC_OP_CLEAR; + case PIPE_LOGICOP_NOR: return VK_LOGIC_OP_NOR; + case PIPE_LOGICOP_AND_INVERTED: return VK_LOGIC_OP_AND_INVERTED; + case PIPE_LOGICOP_COPY_INVERTED: return VK_LOGIC_OP_COPY_INVERTED; + case PIPE_LOGICOP_AND_REVERSE: return VK_LOGIC_OP_AND_REVERSE; + case PIPE_LOGICOP_INVERT: return VK_LOGIC_OP_INVERT; + case PIPE_LOGICOP_XOR: return VK_LOGIC_OP_XOR; + case PIPE_LOGICOP_NAND: return VK_LOGIC_OP_NAND; + case PIPE_LOGICOP_AND: return VK_LOGIC_OP_AND; + case PIPE_LOGICOP_EQUIV: return VK_LOGIC_OP_EQUIVALENT; + case PIPE_LOGICOP_NOOP: return VK_LOGIC_OP_NO_OP; + case PIPE_LOGICOP_OR_INVERTED: return VK_LOGIC_OP_OR_INVERTED; + case PIPE_LOGICOP_COPY: return VK_LOGIC_OP_COPY; + case PIPE_LOGICOP_OR_REVERSE: return VK_LOGIC_OP_OR_REVERSE; + case PIPE_LOGICOP_OR: return VK_LOGIC_OP_OR; + case PIPE_LOGICOP_SET: return VK_LOGIC_OP_SET; + } + unreachable("unexpected logicop function"); +} + +static void * +zink_create_blend_state(struct pipe_context *pctx, + const struct pipe_blend_state *blend_state) +{ + struct zink_blend_state *cso = CALLOC_STRUCT(zink_blend_state); + if (!cso) + return NULL; + + if (blend_state->logicop_enable) { + cso->logicop_enable = VK_TRUE; + cso->logicop_func = logic_op(blend_state->logicop_func); + } + + /* TODO: figure out what to do with dither (nothing is probably "OK" for now, + * as dithering is undefined in GL + */ + + /* TODO: these are multisampling-state, and should be set there instead of + * here, as that's closer tied to the update-frequency + */ + cso->alpha_to_coverage = blend_state->alpha_to_coverage; + cso->alpha_to_one = blend_state->alpha_to_one; + + cso->need_blend_constants = false; + + for (int i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) { + const struct pipe_rt_blend_state *rt = blend_state->rt; + if (blend_state->independent_blend_enable) + rt = blend_state->rt + i; + + VkPipelineColorBlendAttachmentState att = { }; + + if (rt->blend_enable) { + att.blendEnable = VK_TRUE; + att.srcColorBlendFactor = blend_factor(rt->rgb_src_factor); + att.dstColorBlendFactor = blend_factor(rt->rgb_dst_factor); + att.colorBlendOp = blend_op(rt->rgb_func); + att.srcAlphaBlendFactor = blend_factor(rt->alpha_src_factor); + att.dstAlphaBlendFactor = blend_factor(rt->alpha_dst_factor); + att.alphaBlendOp = blend_op(rt->alpha_func); + + if (need_blend_constants(rt->rgb_src_factor) || + need_blend_constants(rt->rgb_dst_factor) || + need_blend_constants(rt->alpha_src_factor) || + need_blend_constants(rt->alpha_dst_factor)) + cso->need_blend_constants = true; + } + + if (rt->colormask & PIPE_MASK_R) + att.colorWriteMask |= VK_COLOR_COMPONENT_R_BIT; + if (rt->colormask & PIPE_MASK_G) + att.colorWriteMask |= VK_COLOR_COMPONENT_G_BIT; + if (rt->colormask & PIPE_MASK_B) + att.colorWriteMask |= VK_COLOR_COMPONENT_B_BIT; + if (rt->colormask & PIPE_MASK_A) + att.colorWriteMask |= VK_COLOR_COMPONENT_A_BIT; + + cso->attachments[i] = att; + } + + return cso; +} + +static void +zink_bind_blend_state(struct pipe_context *pctx, void *cso) +{ + zink_context(pctx)->gfx_pipeline_state.blend_state = cso; +} + +static void +zink_delete_blend_state(struct pipe_context *pctx, void *blend_state) +{ + FREE(blend_state); +} + +static VkCompareOp +compare_op(enum pipe_compare_func func) +{ + switch (func) { + case PIPE_FUNC_NEVER: return VK_COMPARE_OP_NEVER; + case PIPE_FUNC_LESS: return VK_COMPARE_OP_LESS; + case PIPE_FUNC_EQUAL: return VK_COMPARE_OP_EQUAL; + case PIPE_FUNC_LEQUAL: return VK_COMPARE_OP_LESS_OR_EQUAL; + case PIPE_FUNC_GREATER: return VK_COMPARE_OP_GREATER; + case PIPE_FUNC_NOTEQUAL: return VK_COMPARE_OP_NOT_EQUAL; + case PIPE_FUNC_GEQUAL: return VK_COMPARE_OP_GREATER_OR_EQUAL; + case PIPE_FUNC_ALWAYS: return VK_COMPARE_OP_ALWAYS; + } + unreachable("unexpected func"); +} + +static VkStencilOp +stencil_op(enum pipe_stencil_op op) +{ + switch (op) { + case PIPE_STENCIL_OP_KEEP: return VK_STENCIL_OP_KEEP; + case PIPE_STENCIL_OP_ZERO: return VK_STENCIL_OP_ZERO; + case PIPE_STENCIL_OP_REPLACE: return VK_STENCIL_OP_REPLACE; + case PIPE_STENCIL_OP_INCR: return VK_STENCIL_OP_INCREMENT_AND_CLAMP; + case PIPE_STENCIL_OP_DECR: return VK_STENCIL_OP_DECREMENT_AND_CLAMP; + case PIPE_STENCIL_OP_INCR_WRAP: return VK_STENCIL_OP_INCREMENT_AND_CLAMP; + case PIPE_STENCIL_OP_DECR_WRAP: return VK_STENCIL_OP_DECREMENT_AND_CLAMP; + case PIPE_STENCIL_OP_INVERT: return VK_STENCIL_OP_INVERT; + } + unreachable("unexpected op"); +} + +static VkStencilOpState +stencil_op_state(const struct pipe_stencil_state *src) +{ + VkStencilOpState ret; + ret.failOp = stencil_op(src->fail_op); + ret.passOp = stencil_op(src->zpass_op); + ret.depthFailOp = stencil_op(src->zfail_op); + ret.compareOp = compare_op(src->func); + ret.compareMask = src->valuemask; + ret.writeMask = src->writemask; + ret.reference = 0; // not used: we'll use a dynamic state for this + return ret; +} + +static void * +zink_create_depth_stencil_alpha_state(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *depth_stencil_alpha) +{ + struct zink_depth_stencil_alpha_state *cso = CALLOC_STRUCT(zink_depth_stencil_alpha_state); + if (!cso) + return NULL; + + if (depth_stencil_alpha->depth.enabled) { + cso->depth_test = VK_TRUE; + cso->depth_compare_op = compare_op(depth_stencil_alpha->depth.func); + } + + if (depth_stencil_alpha->depth.bounds_test) { + cso->depth_bounds_test = VK_TRUE; + cso->min_depth_bounds = depth_stencil_alpha->depth.bounds_min; + cso->max_depth_bounds = depth_stencil_alpha->depth.bounds_max; + } + + if (depth_stencil_alpha->stencil[0].enabled) { + cso->stencil_test = VK_TRUE; + cso->stencil_front = stencil_op_state(depth_stencil_alpha->stencil); + } + + if (depth_stencil_alpha->stencil[0].enabled) + cso->stencil_back = stencil_op_state(depth_stencil_alpha->stencil + 1); + else + cso->stencil_back = cso->stencil_front; + + cso->depth_write = depth_stencil_alpha->depth.writemask; + + return cso; +} + +static void +zink_bind_depth_stencil_alpha_state(struct pipe_context *pctx, void *cso) +{ + zink_context(pctx)->gfx_pipeline_state.depth_stencil_alpha_state = cso; +} + +static void +zink_delete_depth_stencil_alpha_state(struct pipe_context *pctx, + void *depth_stencil_alpha) +{ + FREE(depth_stencil_alpha); +} + +static float +round_to_granularity(float value, float granularity) +{ + return roundf(value / granularity) * granularity; +} + +static float +line_width(float width, float granularity, const float range[2]) +{ + assert(granularity >= 0); + assert(range[0] <= range[1]); + + if (granularity > 0) + width = round_to_granularity(width, granularity); + + return CLAMP(width, range[0], range[1]); +} + +static void * +zink_create_rasterizer_state(struct pipe_context *pctx, + const struct pipe_rasterizer_state *rs_state) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + + struct zink_rasterizer_state *state = CALLOC_STRUCT(zink_rasterizer_state); + if (!state) + return NULL; + + state->base = *rs_state; + + assert(rs_state->depth_clip_far == rs_state->depth_clip_near); + state->hw_state.depth_clamp = rs_state->depth_clip_near == 0; + state->hw_state.rasterizer_discard = rs_state->rasterizer_discard; + + assert(rs_state->fill_front <= PIPE_POLYGON_MODE_POINT); + if (rs_state->fill_back != rs_state->fill_front) + debug_printf("BUG: vulkan doesn't support different front and back fill modes\n"); + state->hw_state.polygon_mode = (VkPolygonMode)rs_state->fill_front; // same values + state->hw_state.cull_mode = (VkCullModeFlags)rs_state->cull_face; // same bits + + state->hw_state.front_face = rs_state->front_ccw ? + VK_FRONT_FACE_COUNTER_CLOCKWISE : + VK_FRONT_FACE_CLOCKWISE; + + state->offset_point = rs_state->offset_point; + state->offset_line = rs_state->offset_line; + state->offset_tri = rs_state->offset_tri; + state->offset_units = rs_state->offset_units; + state->offset_clamp = rs_state->offset_clamp; + state->offset_scale = rs_state->offset_scale; + + state->line_width = line_width(rs_state->line_width, + screen->props.limits.lineWidthGranularity, + screen->props.limits.lineWidthRange); + + return state; +} + +static void +zink_bind_rasterizer_state(struct pipe_context *pctx, void *cso) +{ + struct zink_context *ctx = zink_context(pctx); + ctx->rast_state = cso; + + if (ctx->rast_state) { + ctx->gfx_pipeline_state.rast_state = &ctx->rast_state->hw_state; + ctx->line_width = ctx->rast_state->line_width; + } +} + +static void +zink_delete_rasterizer_state(struct pipe_context *pctx, void *rs_state) +{ + FREE(rs_state); +} + +void +zink_context_state_init(struct pipe_context *pctx) +{ + pctx->create_vertex_elements_state = zink_create_vertex_elements_state; + pctx->bind_vertex_elements_state = zink_bind_vertex_elements_state; + pctx->delete_vertex_elements_state = zink_delete_vertex_elements_state; + + pctx->create_blend_state = zink_create_blend_state; + pctx->bind_blend_state = zink_bind_blend_state; + pctx->delete_blend_state = zink_delete_blend_state; + + pctx->create_depth_stencil_alpha_state = zink_create_depth_stencil_alpha_state; + pctx->bind_depth_stencil_alpha_state = zink_bind_depth_stencil_alpha_state; + pctx->delete_depth_stencil_alpha_state = zink_delete_depth_stencil_alpha_state; + + pctx->create_rasterizer_state = zink_create_rasterizer_state; + pctx->bind_rasterizer_state = zink_bind_rasterizer_state; + pctx->delete_rasterizer_state = zink_delete_rasterizer_state; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_state.h mesa-20.0.8/src/gallium/drivers/zink/zink_state.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_state.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,90 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ZINK_STATE_H +#define ZINK_STATE_H + +#include + +#include "pipe/p_state.h" + +struct zink_vertex_elements_hw_state { + VkVertexInputAttributeDescription attribs[PIPE_MAX_ATTRIBS]; + uint32_t num_bindings, num_attribs; +}; + +struct zink_vertex_elements_state { + struct { + uint32_t binding; + VkVertexInputRate inputRate; + } bindings[PIPE_MAX_ATTRIBS]; + uint8_t binding_map[PIPE_MAX_ATTRIBS]; + struct zink_vertex_elements_hw_state hw_state; +}; + +struct zink_rasterizer_hw_state { + VkBool32 depth_clamp; + VkBool32 rasterizer_discard; + VkFrontFace front_face; + VkPolygonMode polygon_mode; + VkCullModeFlags cull_mode; +}; + +struct zink_rasterizer_state { + struct pipe_rasterizer_state base; + bool offset_point, offset_line, offset_tri; + float offset_units, offset_clamp, offset_scale; + float line_width; + struct zink_rasterizer_hw_state hw_state; +}; + +struct zink_blend_state { + VkPipelineColorBlendAttachmentState attachments[PIPE_MAX_COLOR_BUFS]; + + VkBool32 logicop_enable; + VkLogicOp logicop_func; + + VkBool32 alpha_to_coverage; + VkBool32 alpha_to_one; + + bool need_blend_constants; +}; + +struct zink_depth_stencil_alpha_state { + VkBool32 depth_test; + VkCompareOp depth_compare_op; + + VkBool32 depth_bounds_test; + float min_depth_bounds, max_depth_bounds; + + VkBool32 stencil_test; + VkStencilOpState stencil_front; + VkStencilOpState stencil_back; + + VkBool32 depth_write; +}; + +void +zink_context_state_init(struct pipe_context *pctx); + +#endif diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_surface.c mesa-20.0.8/src/gallium/drivers/zink/zink_surface.c --- mesa-19.2.8/src/gallium/drivers/zink/zink_surface.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,139 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "zink_context.h" +#include "zink_resource.h" +#include "zink_screen.h" +#include "zink_surface.h" + +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +static struct pipe_surface * +zink_create_surface(struct pipe_context *pctx, + struct pipe_resource *pres, + const struct pipe_surface *templ) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + unsigned int level = templ->u.tex.level; + + struct zink_surface *surface = CALLOC_STRUCT(zink_surface); + if (!surface) + return NULL; + + pipe_resource_reference(&surface->base.texture, pres); + pipe_reference_init(&surface->base.reference, 1); + surface->base.context = pctx; + surface->base.format = templ->format; + surface->base.width = u_minify(pres->width0, level); + surface->base.height = u_minify(pres->height0, level); + surface->base.nr_samples = templ->nr_samples; + surface->base.u.tex.level = level; + surface->base.u.tex.first_layer = templ->u.tex.first_layer; + surface->base.u.tex.last_layer = templ->u.tex.last_layer; + + struct zink_resource *res = zink_resource(pres); + + VkImageViewCreateInfo ivci = {}; + ivci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ivci.image = res->image; + + switch (pres->target) { + case PIPE_TEXTURE_1D: + ivci.viewType = VK_IMAGE_VIEW_TYPE_1D; + break; + + case PIPE_TEXTURE_1D_ARRAY: + ivci.viewType = VK_IMAGE_VIEW_TYPE_1D_ARRAY; + break; + + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + ivci.viewType = VK_IMAGE_VIEW_TYPE_2D; + break; + + case PIPE_TEXTURE_2D_ARRAY: + ivci.viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY; + break; + + case PIPE_TEXTURE_CUBE: + ivci.viewType = VK_IMAGE_VIEW_TYPE_CUBE; + break; + + case PIPE_TEXTURE_CUBE_ARRAY: + ivci.viewType = VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; + break; + + case PIPE_TEXTURE_3D: + ivci.viewType = VK_IMAGE_VIEW_TYPE_2D; + break; + + default: + unreachable("unsupported target"); + } + + ivci.format = zink_get_format(screen, templ->format); + + // TODO: format swizzles + ivci.components.r = VK_COMPONENT_SWIZZLE_R; + ivci.components.g = VK_COMPONENT_SWIZZLE_G; + ivci.components.b = VK_COMPONENT_SWIZZLE_B; + ivci.components.a = VK_COMPONENT_SWIZZLE_A; + + ivci.subresourceRange.aspectMask = res->aspect; + ivci.subresourceRange.baseMipLevel = templ->u.tex.level; + ivci.subresourceRange.levelCount = 1; + ivci.subresourceRange.baseArrayLayer = templ->u.tex.first_layer; + ivci.subresourceRange.layerCount = 1 + templ->u.tex.last_layer - templ->u.tex.first_layer; + + if (pres->target == PIPE_TEXTURE_CUBE || + pres->target == PIPE_TEXTURE_CUBE_ARRAY) + ivci.subresourceRange.layerCount *= 6; + + if (vkCreateImageView(screen->dev, &ivci, NULL, + &surface->image_view) != VK_SUCCESS) { + FREE(surface); + return NULL; + } + + return &surface->base; +} + +static void +zink_surface_destroy(struct pipe_context *pctx, + struct pipe_surface *psurface) +{ + struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_surface *surface = zink_surface(psurface); + pipe_resource_reference(&psurface->texture, NULL); + vkDestroyImageView(screen->dev, surface->image_view, NULL); + FREE(surface); +} + +void +zink_context_surface_init(struct pipe_context *context) +{ + context->create_surface = zink_create_surface; + context->surface_destroy = zink_surface_destroy; +} diff -Nru mesa-19.2.8/src/gallium/drivers/zink/zink_surface.h mesa-20.0.8/src/gallium/drivers/zink/zink_surface.h --- mesa-19.2.8/src/gallium/drivers/zink/zink_surface.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/drivers/zink/zink_surface.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,47 @@ +/* + * Copyright 2018 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + #ifndef ZINK_SURFACE_H + #define ZINK_SURFACE_H + +#include "pipe/p_state.h" + +#include + +struct pipe_context; + +struct zink_surface { + struct pipe_surface base; + VkImageView image_view; +}; + +static inline struct zink_surface * +zink_surface(struct pipe_surface *pipe) +{ + return (struct zink_surface *)pipe; +} + +void +zink_context_surface_init(struct pipe_context *context); + +#endif diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_config.h mesa-20.0.8/src/gallium/include/pipe/p_config.h --- mesa-19.2.8/src/gallium/include/pipe/p_config.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_config.h 2020-06-12 01:21:17.000000000 +0000 @@ -127,19 +127,6 @@ */ #include "util/u_endian.h" -#if !defined(PIPE_ARCH_LITTLE_ENDIAN) && !defined(PIPE_ARCH_BIG_ENDIAN) - -#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) -#define PIPE_ARCH_LITTLE_ENDIAN -#elif defined(PIPE_ARCH_PPC) || defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) -#define PIPE_ARCH_BIG_ENDIAN -#endif - -#endif - -#if !defined(PIPE_ARCH_LITTLE_ENDIAN) && !defined(PIPE_ARCH_BIG_ENDIAN) -#error Unknown Endianness -#endif /* * Auto-detect the operating system family. diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_context.h mesa-20.0.8/src/gallium/include/pipe/p_context.h --- mesa-19.2.8/src/gallium/include/pipe/p_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -191,6 +191,50 @@ */ void (*set_active_query_state)(struct pipe_context *pipe, bool enable); + /** + * INTEL Performance Query + */ + /*@{*/ + + unsigned (*init_intel_perf_query_info)(struct pipe_context *pipe); + + void (*get_intel_perf_query_info)(struct pipe_context *pipe, + unsigned query_index, + const char **name, + uint32_t *data_size, + uint32_t *n_counters, + uint32_t *n_active); + + void (*get_intel_perf_query_counter_info)(struct pipe_context *pipe, + unsigned query_index, + unsigned counter_index, + const char **name, + const char **desc, + uint32_t *offset, + uint32_t *data_size, + uint32_t *type_enum, + uint32_t *data_type_enum, + uint64_t *raw_max); + + struct pipe_query *(*new_intel_perf_query_obj)(struct pipe_context *pipe, + unsigned query_index); + + void (*begin_intel_perf_query)(struct pipe_context *pipe, struct pipe_query *q); + + void (*end_intel_perf_query)(struct pipe_context *pipe, struct pipe_query *q); + + void (*delete_intel_perf_query)(struct pipe_context *pipe, struct pipe_query *q); + + void (*wait_intel_perf_query)(struct pipe_context *pipe, struct pipe_query *q); + + bool (*is_intel_perf_query_ready)(struct pipe_context *pipe, struct pipe_query *q); + + void (*get_intel_perf_query_data)(struct pipe_context *pipe, + struct pipe_query *q, + size_t data_size, + uint32_t *data, + uint32_t *bytes_written); + /*@}*/ /** diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_defines.h mesa-20.0.8/src/gallium/include/pipe/p_defines.h --- mesa-19.2.8/src/gallium/include/pipe/p_defines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_defines.h 2020-06-12 01:21:17.000000000 +0000 @@ -494,6 +494,7 @@ #define PIPE_RESOURCE_FLAG_MAP_COHERENT (1 << 1) #define PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY (1 << 2) #define PIPE_RESOURCE_FLAG_SPARSE (1 << 3) +#define PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE (1 << 4) #define PIPE_RESOURCE_FLAG_DRV_PRIV (1 << 8) /* driver/winsys private */ #define PIPE_RESOURCE_FLAG_ST_PRIV (1 << 24) /* state-tracker/winsys private */ @@ -693,6 +694,7 @@ PIPE_CAP_MAX_RENDER_TARGETS, PIPE_CAP_OCCLUSION_QUERY, PIPE_CAP_QUERY_TIME_ELAPSED, + PIPE_CAP_TEXTURE_SHADOW_MAP, PIPE_CAP_TEXTURE_SWIZZLE, PIPE_CAP_MAX_TEXTURE_2D_SIZE, PIPE_CAP_MAX_TEXTURE_3D_LEVELS, @@ -883,7 +885,7 @@ PIPE_CAP_COMPUTE_SHADER_DERIVATIVES, PIPE_CAP_TGSI_SKIP_SHRINK_IO_ARRAYS, PIPE_CAP_IMAGE_LOAD_FORMATTED, - PIPE_CAP_MAX_FRAMES_IN_FLIGHT, + PIPE_CAP_THROTTLE, PIPE_CAP_DMABUF, PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA, PIPE_CAP_FRAGMENT_SHADER_INTERLOCK, @@ -897,6 +899,19 @@ PIPE_CAP_TEXTURE_SHADOW_LOD, PIPE_CAP_SHADER_SAMPLES_IDENTICAL, PIPE_CAP_TGSI_ATOMINC_WRAP, + PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF, + PIPE_CAP_GL_SPIRV, + PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS, + PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION, + PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE, + PIPE_CAP_FLATSHADE, + PIPE_CAP_ALPHA_TEST, + PIPE_CAP_POINT_SIZE_FIXED, + PIPE_CAP_TWO_SIDED_COLOR, + PIPE_CAP_CLIP_PLANES, + PIPE_CAP_MAX_VERTEX_BUFFERS, + PIPE_CAP_OPENCL_INTEGER_FUNCTIONS, + PIPE_CAP_INTEGER_MULTIPLY_32X16, }; /** @@ -917,9 +932,9 @@ { PIPE_ENDIAN_LITTLE = 0, PIPE_ENDIAN_BIG = 1, -#if defined(PIPE_ARCH_LITTLE_ENDIAN) +#if UTIL_ARCH_LITTLE_ENDIAN PIPE_ENDIAN_NATIVE = PIPE_ENDIAN_LITTLE -#elif defined(PIPE_ARCH_BIG_ENDIAN) +#elif UTIL_ARCH_BIG_ENDIAN PIPE_ENDIAN_NATIVE = PIPE_ENDIAN_BIG #endif }; @@ -981,7 +996,6 @@ PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED, PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS, PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS, - PIPE_SHADER_CAP_SCALAR_ISA, }; /** @@ -1000,6 +1014,7 @@ PIPE_SHADER_IR_TGSI = 0, PIPE_SHADER_IR_NATIVE, PIPE_SHADER_IR_NIR, + PIPE_SHADER_IR_NIR_SERIALIZED, }; /** @@ -1234,6 +1249,29 @@ PIPE_DEBUG_TYPE_CONFORMANCE, }; +/** + * counter type and counter data type enums used by INTEL_performance_query + * APIs in gallium drivers. + */ +enum pipe_perf_counter_type +{ + PIPE_PERF_COUNTER_TYPE_EVENT, + PIPE_PERF_COUNTER_TYPE_DURATION_NORM, + PIPE_PERF_COUNTER_TYPE_DURATION_RAW, + PIPE_PERF_COUNTER_TYPE_THROUGHPUT, + PIPE_PERF_COUNTER_TYPE_RAW, + PIPE_PERF_COUNTER_TYPE_TIMESTAMP, +}; + +enum pipe_perf_counter_data_type +{ + PIPE_PERF_COUNTER_DATA_TYPE_BOOL32, + PIPE_PERF_COUNTER_DATA_TYPE_UINT32, + PIPE_PERF_COUNTER_DATA_TYPE_UINT64, + PIPE_PERF_COUNTER_DATA_TYPE_FLOAT, + PIPE_PERF_COUNTER_DATA_TYPE_DOUBLE, +}; + #define PIPE_UUID_SIZE 16 #ifdef __cplusplus diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_format.h mesa-20.0.8/src/gallium/include/pipe/p_format.h --- mesa-19.2.8/src/gallium/include/pipe/p_format.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_format.h 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,9 @@ PIPE_FORMAT_A8R8G8B8_UNORM, PIPE_FORMAT_X8R8G8B8_UNORM, PIPE_FORMAT_B5G5R5A1_UNORM, + PIPE_FORMAT_R4G4B4A4_UNORM, PIPE_FORMAT_B4G4R4A4_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_B5G6R5_UNORM, PIPE_FORMAT_R10G10B10A2_UNORM, PIPE_FORMAT_L8_UNORM, /**< ubyte luminance */ @@ -106,20 +108,29 @@ PIPE_FORMAT_R8_UNORM, PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R8G8B8_UNORM, + PIPE_FORMAT_B8G8R8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_X8B8G8R8_UNORM, PIPE_FORMAT_R8_USCALED, PIPE_FORMAT_R8G8_USCALED, PIPE_FORMAT_R8G8B8_USCALED, + PIPE_FORMAT_B8G8R8_USCALED, PIPE_FORMAT_R8G8B8A8_USCALED, + PIPE_FORMAT_B8G8R8A8_USCALED, + PIPE_FORMAT_A8B8G8R8_USCALED, PIPE_FORMAT_R8_SNORM, PIPE_FORMAT_R8G8_SNORM, PIPE_FORMAT_R8G8B8_SNORM, + PIPE_FORMAT_B8G8R8_SNORM, PIPE_FORMAT_R8G8B8A8_SNORM, + PIPE_FORMAT_B8G8R8A8_SNORM, PIPE_FORMAT_R8_SSCALED, PIPE_FORMAT_R8G8_SSCALED, PIPE_FORMAT_R8G8B8_SSCALED, + PIPE_FORMAT_B8G8R8_SSCALED, PIPE_FORMAT_R8G8B8A8_SSCALED, + PIPE_FORMAT_B8G8R8A8_SSCALED, + PIPE_FORMAT_A8B8G8R8_SSCALED, PIPE_FORMAT_R32_FIXED, PIPE_FORMAT_R32G32_FIXED, PIPE_FORMAT_R32G32B32_FIXED, @@ -131,8 +142,11 @@ /* sRGB formats */ PIPE_FORMAT_L8_SRGB, + PIPE_FORMAT_R8_SRGB, PIPE_FORMAT_L8A8_SRGB, + PIPE_FORMAT_R8G8_SRGB, PIPE_FORMAT_R8G8B8_SRGB, + PIPE_FORMAT_B8G8R8_SRGB, PIPE_FORMAT_A8B8G8R8_SRGB, PIPE_FORMAT_X8B8G8R8_SRGB, PIPE_FORMAT_B8G8R8A8_SRGB, @@ -177,6 +191,8 @@ PIPE_FORMAT_R10G10B10X2_USCALED, PIPE_FORMAT_R10G10B10X2_SNORM, PIPE_FORMAT_L4A4_UNORM, + PIPE_FORMAT_A2R10G10B10_UNORM, + PIPE_FORMAT_A2B10G10R10_UNORM, PIPE_FORMAT_B10G10R10A2_UNORM, PIPE_FORMAT_R10SG10SB10SA2U_NORM, PIPE_FORMAT_R8G8Bx_SNORM, @@ -188,6 +204,7 @@ PIPE_FORMAT_S8X24_UINT, PIPE_FORMAT_X32_S8X24_UINT, + PIPE_FORMAT_R3G3B2_UNORM, PIPE_FORMAT_B2G3R3_UNORM, PIPE_FORMAT_L16A16_UNORM, PIPE_FORMAT_A16_UNORM, @@ -294,7 +311,30 @@ PIPE_FORMAT_L32_SINT, PIPE_FORMAT_L32A32_SINT, + PIPE_FORMAT_B8G8R8_UINT, + PIPE_FORMAT_B8G8R8A8_UINT, + + PIPE_FORMAT_B8G8R8_SINT, + PIPE_FORMAT_B8G8R8A8_SINT, + + PIPE_FORMAT_A8R8G8B8_UINT, + PIPE_FORMAT_A8B8G8R8_UINT, + PIPE_FORMAT_A2R10G10B10_UINT, + PIPE_FORMAT_A2B10G10R10_UINT, PIPE_FORMAT_B10G10R10A2_UINT, + PIPE_FORMAT_B10G10R10A2_SINT, + PIPE_FORMAT_R5G6B5_UINT, + PIPE_FORMAT_B5G6R5_UINT, + PIPE_FORMAT_R5G5B5A1_UINT, + PIPE_FORMAT_B5G5R5A1_UINT, + PIPE_FORMAT_A1R5G5B5_UINT, + PIPE_FORMAT_A1B5G5R5_UINT, + PIPE_FORMAT_R4G4B4A4_UINT, + PIPE_FORMAT_B4G4R4A4_UINT, + PIPE_FORMAT_A4R4G4B4_UINT, + PIPE_FORMAT_A4B4G4R4_UINT, + PIPE_FORMAT_R3G3B2_UINT, + PIPE_FORMAT_B2G3R3_UINT, PIPE_FORMAT_ETC1_RGB8, @@ -327,6 +367,7 @@ PIPE_FORMAT_R32A32_UINT, PIPE_FORMAT_R32A32_SINT, PIPE_FORMAT_R10G10B10A2_UINT, + PIPE_FORMAT_R10G10B10A2_SINT, PIPE_FORMAT_B5G6R5_SRGB, @@ -335,11 +376,6 @@ PIPE_FORMAT_BPTC_RGB_FLOAT, PIPE_FORMAT_BPTC_RGB_UFLOAT, - PIPE_FORMAT_A8L8_UNORM, - PIPE_FORMAT_A8L8_SNORM, - PIPE_FORMAT_A8L8_SRGB, - PIPE_FORMAT_A16L16_UNORM, - PIPE_FORMAT_G8R8_UNORM, PIPE_FORMAT_G8R8_SNORM, PIPE_FORMAT_G16R16_UNORM, @@ -389,16 +425,42 @@ PIPE_FORMAT_ASTC_12x10_SRGB, PIPE_FORMAT_ASTC_12x12_SRGB, + PIPE_FORMAT_ASTC_3x3x3, + PIPE_FORMAT_ASTC_4x3x3, + PIPE_FORMAT_ASTC_4x4x3, + PIPE_FORMAT_ASTC_4x4x4, + PIPE_FORMAT_ASTC_5x4x4, + PIPE_FORMAT_ASTC_5x5x4, + PIPE_FORMAT_ASTC_5x5x5, + PIPE_FORMAT_ASTC_6x5x5, + PIPE_FORMAT_ASTC_6x6x5, + PIPE_FORMAT_ASTC_6x6x6, + + PIPE_FORMAT_ASTC_3x3x3_SRGB, + PIPE_FORMAT_ASTC_4x3x3_SRGB, + PIPE_FORMAT_ASTC_4x4x3_SRGB, + PIPE_FORMAT_ASTC_4x4x4_SRGB, + PIPE_FORMAT_ASTC_5x4x4_SRGB, + PIPE_FORMAT_ASTC_5x5x4_SRGB, + PIPE_FORMAT_ASTC_5x5x5_SRGB, + PIPE_FORMAT_ASTC_6x5x5_SRGB, + PIPE_FORMAT_ASTC_6x6x5_SRGB, + PIPE_FORMAT_ASTC_6x6x6_SRGB, + + PIPE_FORMAT_FXT1_RGB, + PIPE_FORMAT_FXT1_RGBA, + + PIPE_FORMAT_P010, PIPE_FORMAT_P016, PIPE_FORMAT_R10G10B10X2_UNORM, + PIPE_FORMAT_A1R5G5B5_UNORM, PIPE_FORMAT_A1B5G5R5_UNORM, PIPE_FORMAT_X1B5G5R5_UNORM, + PIPE_FORMAT_R5G5B5A1_UNORM, + PIPE_FORMAT_A4R4G4B4_UNORM, PIPE_FORMAT_A4B4G4R4_UNORM, - PIPE_FORMAT_R8_SRGB, - - PIPE_FORMAT_A8L8_SINT, PIPE_FORMAT_G8R8_SINT, PIPE_FORMAT_A8B8G8R8_SINT, PIPE_FORMAT_X8B8G8R8_SINT, @@ -415,7 +477,7 @@ PIPE_FORMAT_COUNT }; -#if defined(PIPE_ARCH_LITTLE_ENDIAN) +#if UTIL_ARCH_LITTLE_ENDIAN #define PIPE_FORMAT_RGBA8888_UNORM PIPE_FORMAT_R8G8B8A8_UNORM #define PIPE_FORMAT_RGBX8888_UNORM PIPE_FORMAT_R8G8B8X8_UNORM #define PIPE_FORMAT_BGRA8888_UNORM PIPE_FORMAT_B8G8R8A8_UNORM @@ -436,14 +498,13 @@ #define PIPE_FORMAT_XRGB8888_SRGB PIPE_FORMAT_X8R8G8B8_SRGB #define PIPE_FORMAT_ABGR8888_SRGB PIPE_FORMAT_A8B8G8R8_SRGB #define PIPE_FORMAT_XBGR8888_SRGB PIPE_FORMAT_X8B8G8R8_SRGB -#define PIPE_FORMAT_LA88_UNORM PIPE_FORMAT_L8A8_UNORM -#define PIPE_FORMAT_AL88_UNORM PIPE_FORMAT_A8L8_UNORM -#define PIPE_FORMAT_LA88_SNORM PIPE_FORMAT_L8A8_SNORM -#define PIPE_FORMAT_AL88_SNORM PIPE_FORMAT_A8L8_SNORM -#define PIPE_FORMAT_LA88_SRGB PIPE_FORMAT_L8A8_SRGB -#define PIPE_FORMAT_AL88_SRGB PIPE_FORMAT_A8L8_SRGB -#define PIPE_FORMAT_LA1616_UNORM PIPE_FORMAT_L16A16_UNORM -#define PIPE_FORMAT_AL1616_UNORM PIPE_FORMAT_A16L16_UNORM +#define PIPE_FORMAT_RGBA8888_USCALED PIPE_FORMAT_R8G8B8A8_USCALED +#define PIPE_FORMAT_RGBA8888_SSCALED PIPE_FORMAT_R8G8B8A8_SSCALED +#define PIPE_FORMAT_RGBA8888_UINT PIPE_FORMAT_R8G8B8A8_UINT +#define PIPE_FORMAT_BGRA8888_UINT PIPE_FORMAT_B8G8R8A8_UINT +#define PIPE_FORMAT_ARGB8888_UINT PIPE_FORMAT_A8R8G8B8_UINT +#define PIPE_FORMAT_ABGR8888_UINT PIPE_FORMAT_A8B8G8R8_UINT +#define PIPE_FORMAT_RGBA8888_SINT PIPE_FORMAT_R8G8B8A8_SINT #define PIPE_FORMAT_RG88_UNORM PIPE_FORMAT_R8G8_UNORM #define PIPE_FORMAT_GR88_UNORM PIPE_FORMAT_G8R8_UNORM #define PIPE_FORMAT_RG88_SNORM PIPE_FORMAT_R8G8_SNORM @@ -452,7 +513,7 @@ #define PIPE_FORMAT_GR1616_UNORM PIPE_FORMAT_G16R16_UNORM #define PIPE_FORMAT_RG1616_SNORM PIPE_FORMAT_R16G16_SNORM #define PIPE_FORMAT_GR1616_SNORM PIPE_FORMAT_G16R16_SNORM -#elif defined(PIPE_ARCH_BIG_ENDIAN) +#elif UTIL_ARCH_BIG_ENDIAN #define PIPE_FORMAT_ABGR8888_UNORM PIPE_FORMAT_R8G8B8A8_UNORM #define PIPE_FORMAT_XBGR8888_UNORM PIPE_FORMAT_R8G8B8X8_UNORM #define PIPE_FORMAT_ARGB8888_UNORM PIPE_FORMAT_B8G8R8A8_UNORM @@ -473,14 +534,13 @@ #define PIPE_FORMAT_BGRX8888_SRGB PIPE_FORMAT_X8R8G8B8_SRGB #define PIPE_FORMAT_RGBA8888_SRGB PIPE_FORMAT_A8B8G8R8_SRGB #define PIPE_FORMAT_RGBX8888_SRGB PIPE_FORMAT_X8B8G8R8_SRGB -#define PIPE_FORMAT_LA88_UNORM PIPE_FORMAT_A8L8_UNORM -#define PIPE_FORMAT_AL88_UNORM PIPE_FORMAT_L8A8_UNORM -#define PIPE_FORMAT_LA88_SNORM PIPE_FORMAT_A8L8_SNORM -#define PIPE_FORMAT_AL88_SNORM PIPE_FORMAT_L8A8_SNORM -#define PIPE_FORMAT_LA88_SRGB PIPE_FORMAT_A8L8_SRGB -#define PIPE_FORMAT_AL88_SRGB PIPE_FORMAT_L8A8_SRGB -#define PIPE_FORMAT_LA1616_UNORM PIPE_FORMAT_A16L16_UNORM -#define PIPE_FORMAT_AL1616_UNORM PIPE_FORMAT_L16A16_UNORM +#define PIPE_FORMAT_RGBA8888_USCALED PIPE_FORMAT_A8B8G8R8_USCALED +#define PIPE_FORMAT_RGBA8888_SSCALED PIPE_FORMAT_A8B8G8R8_SSCALED +#define PIPE_FORMAT_RGBA8888_UINT PIPE_FORMAT_A8B8G8R8_UINT +#define PIPE_FORMAT_BGRA8888_UINT PIPE_FORMAT_A8R8G8B8_UINT +#define PIPE_FORMAT_ARGB8888_UINT PIPE_FORMAT_B8G8R8A8_UINT +#define PIPE_FORMAT_ABGR8888_UINT PIPE_FORMAT_R8G8B8A8_UINT +#define PIPE_FORMAT_RGBA8888_SINT PIPE_FORMAT_A8B8G8R8_SINT #define PIPE_FORMAT_RG88_UNORM PIPE_FORMAT_G8R8_UNORM #define PIPE_FORMAT_GR88_UNORM PIPE_FORMAT_R8G8_UNORM #define PIPE_FORMAT_RG88_SNORM PIPE_FORMAT_G8R8_SNORM diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_screen.h mesa-20.0.8/src/gallium/include/pipe/p_screen.h --- mesa-19.2.8/src/gallium/include/pipe/p_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -500,6 +500,17 @@ struct pipe_resource *resource, unsigned int nrects, const struct pipe_box *rects); + + /** + * Run driver-specific NIR lowering and optimization passes. + * + * State trackers should call this before passing shaders to drivers, + * and ideally also before shader caching. + * + * \param optimize Whether the input shader hasn't been optimized and + * should be. + */ + void (*finalize_nir)(struct pipe_screen *screen, void *nir, bool optimize); }; diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_shader_tokens.h mesa-20.0.8/src/gallium/include/pipe/p_shader_tokens.h --- mesa-19.2.8/src/gallium/include/pipe/p_shader_tokens.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_shader_tokens.h 2020-06-12 01:21:17.000000000 +0000 @@ -376,11 +376,11 @@ TGSI_OPCODE_EX2 = 28, TGSI_OPCODE_LG2 = 29, TGSI_OPCODE_POW = 30, - /* gap */ + TGSI_OPCODE_DEMOTE = 31, TGSI_OPCODE_U2I64 = 32, TGSI_OPCODE_CLOCK = 33, TGSI_OPCODE_I2I64 = 34, - /* gap */ + TGSI_OPCODE_READ_HELPER = 35, TGSI_OPCODE_COS = 36, TGSI_OPCODE_DDX = 37, TGSI_OPCODE_DDY = 38, diff -Nru mesa-19.2.8/src/gallium/include/pipe/p_state.h mesa-20.0.8/src/gallium/include/pipe/p_state.h --- mesa-19.2.8/src/gallium/include/pipe/p_state.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/pipe/p_state.h 2020-06-12 01:21:17.000000000 +0000 @@ -879,11 +879,12 @@ }; /** - * Structure used as a header for serialized LLVM programs. + * Structure used as a header for serialized compute programs. */ -struct pipe_llvm_program_header +struct pipe_binary_program_header { uint32_t num_bytes; /**< Number of bytes in the LLVM bytecode program. */ + char blob[]; }; struct pipe_compute_state diff -Nru mesa-19.2.8/src/gallium/include/state_tracker/st_api.h mesa-20.0.8/src/gallium/include/state_tracker/st_api.h --- mesa-19.2.8/src/gallium/include/state_tracker/st_api.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/include/state_tracker/st_api.h 2020-06-12 01:21:17.000000000 +0000 @@ -182,6 +182,8 @@ unsigned level; unsigned layer; + /* GL internal format. */ + unsigned internalformat; }; /** @@ -228,6 +230,7 @@ bool allow_glsl_builtin_variable_redeclaration; bool allow_higher_compat_version; bool glsl_zero_init; + bool vs_position_always_invariant; bool force_glsl_abs_sqrt; bool allow_glsl_cross_stage_interpolation_mismatch; bool allow_glsl_layout_qualifier_on_function_parameters; @@ -388,7 +391,9 @@ * Flush all drawing from context to the pipe also flushes the pipe. */ void (*flush)(struct st_context_iface *stctxi, unsigned flags, - struct pipe_fence_handle **fence); + struct pipe_fence_handle **fence, + void (*notify_before_flush_cb) (void*), + void* notify_before_flush_cb_args); /** * Replace the texture image of a texture object at the specified level. diff -Nru mesa-19.2.8/src/gallium/meson.build mesa-20.0.8/src/gallium/meson.build --- mesa-19.2.8/src/gallium/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -21,6 +21,7 @@ inc_gallium_drivers = include_directories('drivers') inc_gallium_winsys = include_directories('winsys') +inc_gallium_winsys_sw = include_directories('winsys/sw') subdir('auxiliary') subdir('auxiliary/pipe-loader') @@ -35,6 +36,11 @@ else libswkmsdri = [] endif +if with_platform_windows + subdir('winsys/sw/gdi') +else + libwsgdi = null_dep +endif subdir('winsys/sw/wrapper') if with_platform_haiku subdir('winsys/sw/hgl') @@ -150,6 +156,13 @@ else driver_lima = declare_dependency() endif + +if with_gallium_zink + subdir('drivers/zink') +else + driver_zink = declare_dependency() +endif + if with_gallium_opencl # TODO: this isn't really clover specific, but ATM clover is the only # consumer @@ -199,9 +212,15 @@ subdir('state_trackers/nine') subdir('targets/d3dadapter9') endif +if with_platform_windows + subdir('state_trackers/wgl') + subdir('targets/libgl-gdi') +endif if with_tests subdir('targets/graw-null') - if with_glx == 'gallium-xlib' + if with_platform_windows + subdir('targets/graw-gdi') + elif with_glx == 'gallium-xlib' subdir('targets/graw-xlib') endif subdir('tests') diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/api/dispatch.cpp mesa-20.0.8/src/gallium/state_trackers/clover/api/dispatch.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/api/dispatch.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/api/dispatch.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include "api/dispatch.hpp" namespace clover { - const _cl_icd_dispatch _dispatch = { + const cl_icd_dispatch _dispatch = { clGetPlatformIDs, GetPlatformInfo, clGetDeviceIDs, diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/api/dispatch.hpp mesa-20.0.8/src/gallium/state_trackers/clover/api/dispatch.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/api/dispatch.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/api/dispatch.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -23,949 +23,14 @@ #ifndef API_DISPATCH_HPP #define API_DISPATCH_HPP -#define CL_TARGET_OPENCL_VERSION 220 - -#define CL_USE_DEPRECATED_OPENCL_1_0_APIS -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS -#define CL_USE_DEPRECATED_OPENCL_2_1_APIS - - #include "CL/cl.h" #include "CL/cl_ext.h" #include "CL/cl_egl.h" #include "CL/cl_gl.h" - -/// -/// OpenCL ICD vendor dispatch table. -/// -/// The entry point ordering should always be in agreement with -/// Khronos' ICD loader. -/// -struct _cl_icd_dispatch { - CL_API_ENTRY cl_int (CL_API_CALL *clGetPlatformIDs)( - cl_uint num_entries, - cl_platform_id *platforms, - cl_uint *num_platforms); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetPlatformInfo)( - cl_platform_id platform, - cl_platform_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDs)( - cl_platform_id platform, - cl_device_type device_type, - cl_uint num_entries, - cl_device_id *devices, - cl_uint *num_devices); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceInfo)( - cl_device_id device, - cl_device_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_context (CL_API_CALL *clCreateContext)( - const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret); - - CL_API_ENTRY cl_context (CL_API_CALL *clCreateContextFromType)( - const cl_context_properties *properties, - cl_device_type device_type, - void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainContext)( - cl_context context); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseContext)( - cl_context context); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetContextInfo)( - cl_context context, - cl_context_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_command_queue (CL_API_CALL *clCreateCommandQueue)( - cl_context context, - cl_device_id device, - cl_command_queue_properties properties, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainCommandQueue)( - cl_command_queue command_queue); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseCommandQueue)( - cl_command_queue command_queue); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetCommandQueueInfo)( - cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetCommandQueueProperty)( - cl_command_queue command_queue, - cl_command_queue_properties properties, - cl_bool enable, - cl_command_queue_properties *old_properties); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateBuffer)( - cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImage2D)( - cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - size_t image_width, - size_t image_height, - size_t image_row_pitch, - void *host_ptr, - cl_int *errcode_ret); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImage3D)( - cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - size_t image_width, - size_t image_height, - size_t image_depth, - size_t image_row_pitch, - size_t image_slice_pitch, - void *host_ptr, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainMemObject)( - cl_mem memobj); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseMemObject)( - cl_mem memobj); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetSupportedImageFormats)( - cl_context context, - cl_mem_flags flags, - cl_mem_object_type image_type, - cl_uint num_entries, - cl_image_format *image_formats, - cl_uint *num_image_formats); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectInfo)( - cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetImageInfo)( - cl_mem image, - cl_image_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_sampler (CL_API_CALL *clCreateSampler)( - cl_context context, - cl_bool normalized_coords, - cl_addressing_mode addressing_mode, - cl_filter_mode filter_mode, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainSampler)( - cl_sampler sampler); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseSampler)( - cl_sampler sampler); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetSamplerInfo)( - cl_sampler sampler, - cl_sampler_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithSource)( - cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret); - - CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithBinary)( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainProgram)( - cl_program program); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseProgram)( - cl_program program); - - CL_API_ENTRY cl_int (CL_API_CALL *clBuildProgram)( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void (CL_CALLBACK *pfn_notify)(cl_program, void *), - void *user_data); - - CL_API_ENTRY cl_int (CL_API_CALL *clUnloadCompiler)( - void); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetProgramInfo)( - cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetProgramBuildInfo)( - cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_kernel (CL_API_CALL *clCreateKernel)( - cl_program program, - const char *kernel_name, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clCreateKernelsInProgram)( - cl_program program, - cl_uint num_kernels, - cl_kernel *kernels, - cl_uint *num_kernels_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainKernel)( - cl_kernel kernel); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseKernel)( - cl_kernel kernel); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetKernelArg)( - cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void *arg_value); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetKernelInfo)( - cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetKernelWorkGroupInfo)( - cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clWaitForEvents)( - cl_uint num_events, - const cl_event *event_list); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetEventInfo)( - cl_event event, - cl_event_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainEvent)( - cl_event event); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseEvent)( - cl_event event); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetEventProfilingInfo)( - cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clFlush)( - cl_command_queue command_queue); - - CL_API_ENTRY cl_int (CL_API_CALL *clFinish)( - cl_command_queue command_queue); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReadBuffer)( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t cb, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueWriteBuffer)( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t cb, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueCopyBuffer)( - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t cb, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReadImage)( - cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_read, - const size_t *origin, - const size_t *region, - size_t row_pitch, - size_t slice_pitch, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueWriteImage)( - cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_write, - const size_t *origin, - const size_t *region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueCopyImage)( - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueCopyImageToBuffer)( - cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_buffer, - const size_t *src_origin, - const size_t *region, - size_t dst_offset, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueCopyBufferToImage)( - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_image, - size_t src_offset, - const size_t *dst_origin, - const size_t *region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY void *(CL_API_CALL *clEnqueueMapBuffer)( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t cb, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret); - - CL_API_ENTRY void *(CL_API_CALL *clEnqueueMapImage)( - cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t *origin, - const size_t *region, - size_t *image_row_pitch, - size_t *image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueUnmapMemObject)( - cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueNDRangeKernel)( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueTask)( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueNativeKernel)( - cl_command_queue command_queue, - void (CL_CALLBACK *user_func)(void *), - void *args, - size_t cb_args, - cl_uint num_mem_objects, - const cl_mem *mem_list, - const void **args_mem_loc, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMarker)( - cl_command_queue command_queue, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueWaitForEvents)( - cl_command_queue command_queue, - cl_uint num_events, - const cl_event *event_list); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueBarrier)( - cl_command_queue command_queue); - - CL_API_ENTRY void *(CL_API_CALL *clGetExtensionFunctionAddress)( - const char *function_name); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromGLBuffer)( - cl_context context, - cl_mem_flags flags, - cl_GLuint bufobj, - int *errcode_ret); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromGLTexture2D)( - cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int *errcode_ret); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromGLTexture3D)( - cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int *errcode_ret); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromGLRenderbuffer)( - cl_context context, - cl_mem_flags flags, - cl_GLuint renderbuffer, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetGLObjectInfo)( - cl_mem memobj, - cl_gl_object_type *gl_object_type, - cl_GLuint *gl_object_name); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetGLTextureInfo)( - cl_mem memobj, - cl_gl_texture_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireGLObjects)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem *mem_objects, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseGLObjects)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem *mem_objects, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR)( - const cl_context_properties *properties, - cl_gl_context_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - void *clGetDeviceIDsFromD3D10KHR; - void *clCreateFromD3D10BufferKHR; - void *clCreateFromD3D10Texture2DKHR; - void *clCreateFromD3D10Texture3DKHR; - void *clEnqueueAcquireD3D10ObjectsKHR; - void *clEnqueueReleaseD3D10ObjectsKHR; - - CL_API_ENTRY cl_int (CL_API_CALL *clSetEventCallback)( - cl_event event, - cl_int type, - void (CL_CALLBACK *pfn_notify)(cl_event, cl_int, void *), - void *user_data); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateSubBuffer)( - cl_mem buffer, - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void *buffer_create_info, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetMemObjectDestructorCallback)( - cl_mem memobj, - void (CL_CALLBACK *pfn_notify)(cl_mem, void *), - void *user_data); - - CL_API_ENTRY cl_event (CL_API_CALL *clCreateUserEvent)( - cl_context context, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetUserEventStatus)( - cl_event event, - cl_int status); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReadBufferRect)( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - const size_t *buffer_origin, - const size_t *host_origin, - const size_t *region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueWriteBufferRect)( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - const size_t *buffer_origin, - const size_t *host_origin, - const size_t *region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueCopyBufferRect)( - cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - size_t src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clCreateSubDevicesEXT)( - cl_device_id in_device, - const cl_device_partition_property_ext *partition_properties, - cl_uint num_entries, - cl_device_id *out_devices, - cl_uint *num_devices); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainDeviceEXT)( - cl_device_id device); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseDeviceEXT)( - cl_device_id device); - - CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromGLsyncKHR)( - cl_context context, - cl_GLsync sync, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clCreateSubDevices)( - cl_device_id in_device, - const cl_device_partition_property *partition_properties, - cl_uint num_entries, - cl_device_id *out_devices, - cl_uint *num_devices); - - CL_API_ENTRY cl_int (CL_API_CALL *clRetainDevice)( - cl_device_id device); - - CL_API_ENTRY cl_int (CL_API_CALL *clReleaseDevice)( - cl_device_id device); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImage)( - cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret); - - CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithBuiltInKernels)( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const char *kernel_names, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clCompileProgram)( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - cl_uint num_input_headers, - const cl_program *input_headers, - const char **header_include_names, - void (CL_CALLBACK *pfn_notify)(cl_program, void *), - void *user_data); - - CL_API_ENTRY cl_program (CL_API_CALL *clLinkProgram)( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - cl_uint num_input_programs, - const cl_program *input_programs, - void (CL_CALLBACK *pfn_notify)(cl_program, void *), - void *user_data, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clUnloadPlatformCompiler)( - cl_platform_id platform); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetKernelArgInfo)( - cl_kernel kernel, - cl_uint arg_indx, - cl_kernel_arg_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueFillBuffer)( - cl_command_queue command_queue, - cl_mem buffer, - const void *pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueFillImage)( - cl_command_queue command_queue, - cl_mem image, - const void *fill_color, - const size_t *origin, - const size_t *region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMigrateMemObjects)( - cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem *mem_objects, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMarkerWithWaitList)( - cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueBarrierWithWaitList)( - cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY void *(CL_API_CALL *clGetExtensionFunctionAddressForPlatform)( - cl_platform_id platform, - const char *function_name); - - CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromGLTexture)( - cl_context context, - cl_mem_flags flags, - cl_GLenum target, - cl_GLint miplevel, - cl_GLuint texture, - cl_int *errcode_ret); - - void *clGetDeviceIDsFromD3D11KHR; - void *clCreateFromD3D11BufferKHR; - void *clCreateFromD3D11Texture2DKHR; - void *clCreateFromD3D11Texture3DKHR; - void *clCreateFromDX9MediaSurfaceKHR; - void *clEnqueueAcquireD3D11ObjectsKHR; - void *clEnqueueReleaseD3D11ObjectsKHR; - void *clGetDeviceIDsFromDX9MediaAdapterKHR; - void *clEnqueueAcquireDX9MediaSurfacesKHR; - void *clEnqueueReleaseDX9MediaSurfacesKHR; - - CL_API_ENTRY void (CL_API_CALL *clCreateFromEGLImageKHR)( - cl_context context, - CLeglDisplayKHR display, - CLeglImageKHR image, - cl_mem_flags flags, - const cl_egl_image_properties_khr *properties, - cl_int *errcode_ret); - - CL_API_ENTRY void (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem *mem_objects, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY void (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem *mem_objects, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY void (CL_API_CALL *clCreateEventFromEGLSyncKHR)( - cl_context context, - CLeglSyncKHR sync, - CLeglDisplayKHR display, - cl_int *errcode_ret); - - CL_API_ENTRY cl_command_queue (CL_API_CALL *clCreateCommandQueueWithProperties)( - cl_context context, - cl_device_id device, - const cl_queue_properties *properties, - cl_int *errcode_ret); - - CL_API_ENTRY void (CL_API_CALL *clCreatePipe)( - cl_context context, - cl_mem_flags flags, - cl_uint pipe_packet_size, - cl_uint pipe_max_packets, - const cl_pipe_properties *properties, - cl_int *errcode_ret); - - CL_API_ENTRY void (CL_API_CALL *clGetPipeInfo)( - cl_mem pipe, - cl_pipe_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY void (CL_API_CALL *clSVMAlloc)( - cl_context context, - cl_svm_mem_flags flags, - size_t size, - unsigned int alignment); - - CL_API_ENTRY void (CL_API_CALL *clSVMFree)( - cl_context context, - void *svm_pointer); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMFree)( - cl_command_queue command_queue, - cl_uint num_svm_pointers, - void **svm_pointers, - void (CL_CALLBACK *pfn_free_func)(cl_command_queue, cl_uint, void **, void *), - void *user_data, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMMemcpy)( - cl_command_queue command_queue, - cl_bool blocking_copy, - void *dst_ptr, - const void *src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMMemFill)( - cl_command_queue command_queue, - void *svm_ptr, - const void *pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMMap)( - cl_command_queue command_queue, - cl_bool blocking_map, - cl_map_flags map_flags, - void *svm_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMUnmap)( - cl_command_queue command_queue, - void *svm_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY void (CL_API_CALL *clCreateSamplerWithProperties)( - cl_context context, - const cl_sampler_properties *sampler_properties, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetKernelArgSVMPointer)( - cl_kernel kernel, - cl_uint arg_index, - const void *arg_value); - - CL_API_ENTRY void (CL_API_CALL *clSetKernelExecInfo)( - cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value); - - CL_API_ENTRY void (CL_API_CALL *clGetKernelSubGroupInfoKHR)( - cl_kernel kernel, - cl_device_id device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void *input_value, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_kernel (CL_API_CALL *clCloneKernel)( - cl_kernel source_kernel, - cl_int *errcode_ret); - - CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithIL)( - cl_context context, - const void *il, - size_t length, - cl_int *errcode_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueSVMMigrateMem)( - cl_command_queue command_queue, - cl_uint num_svm_pointers, - const void **svm_pointers, - const size_t *sizes, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceAndHostTimer)( - cl_device_id device, - cl_ulong *device_timestamp, - cl_ulong *host_timestamp); - - CL_API_ENTRY cl_int (CL_API_CALL *clGetHostTimer)( - cl_device_id device, - cl_ulong *host_timestamp); - - CL_API_ENTRY void (CL_API_CALL *clGetKernelSubGroupInfo)( - cl_kernel kernel, - cl_device_id device, - cl_kernel_sub_group_info param_name, - size_t input_value_size, - const void *input_value, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetDefaultDeviceCommandQueue)( - cl_context context, - cl_device_id device, - cl_command_queue command_queue); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetProgramReleaseCallback)( - cl_program program, - void (CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data); - - CL_API_ENTRY cl_int (CL_API_CALL *clSetProgramSpecializationConstant)( - cl_program program, - cl_uint spec_id, - size_t spec_size, - const void *spec_value); -}; +#include "CL/cl_icd.h" namespace clover { - extern const _cl_icd_dispatch _dispatch; + extern const cl_icd_dispatch _dispatch; cl_int GetPlatformInfo(cl_platform_id d_platform, cl_platform_info param, diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/compiler.hpp mesa-20.0.8/src/gallium/state_trackers/clover/core/compiler.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/compiler.hpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/compiler.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,68 @@ +// +// Copyright 2019 Red Hat, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef CLOVER_CORE_COMPILER_HPP +#define CLOVER_CORE_COMPILER_HPP + +#include "core/device.hpp" +#include "core/module.hpp" +#include "llvm/invocation.hpp" +#include "nir/invocation.hpp" +#include "spirv/invocation.hpp" + +namespace clover { + namespace compiler { + static inline module + compile_program(const std::string &source, const header_map &headers, + const device &dev, const std::string &opts, + std::string &log) { + switch (dev.ir_format()) { +#ifdef HAVE_CLOVER_SPIRV + case PIPE_SHADER_IR_NIR_SERIALIZED: + return llvm::compile_to_spirv(source, headers, dev, opts, log); +#endif + case PIPE_SHADER_IR_NATIVE: + return llvm::compile_program(source, headers, dev, opts, log); + default: + unreachable("device with unsupported IR"); + throw error(CL_INVALID_VALUE); + } + } + + static inline module + link_program(const std::vector &ms, const device &dev, + const std::string &opts, std::string &log) { + switch (dev.ir_format()) { + case PIPE_SHADER_IR_NIR_SERIALIZED: + return nir::spirv_to_nir(spirv::link_program(ms, dev, opts, log), + dev, log); + case PIPE_SHADER_IR_NATIVE: + return llvm::link_program(ms, dev, opts, log); + default: + unreachable("device with unsupported IR"); + throw error(CL_INVALID_VALUE); + } + } + } +} + +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/device.cpp mesa-20.0.8/src/gallium/state_trackers/clover/core/device.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/device.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/device.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -46,12 +46,17 @@ device::device(clover::platform &platform, pipe_loader_device *ldev) : platform(platform), ldev(ldev) { pipe = pipe_loader_create_screen(ldev); - if (!pipe || !pipe->get_param(pipe, PIPE_CAP_COMPUTE) || - !supports_ir(PIPE_SHADER_IR_NATIVE)) { - if (pipe) - pipe->destroy(pipe); - throw error(CL_INVALID_DEVICE); + if (pipe && pipe->get_param(pipe, PIPE_CAP_COMPUTE)) { + if (supports_ir(PIPE_SHADER_IR_NATIVE)) + return; +#ifdef HAVE_CLOVER_SPIRV + if (supports_ir(PIPE_SHADER_IR_NIR_SERIALIZED)) + return; +#endif } + if (pipe) + pipe->destroy(pipe); + throw error(CL_INVALID_DEVICE); } device::~device() { @@ -246,7 +251,11 @@ enum pipe_shader_ir device::ir_format() const { - return PIPE_SHADER_IR_NATIVE; + if (supports_ir(PIPE_SHADER_IR_NATIVE)) + return PIPE_SHADER_IR_NATIVE; + + assert(supports_ir(PIPE_SHADER_IR_NIR_SERIALIZED)); + return PIPE_SHADER_IR_NIR_SERIALIZED; } std::string @@ -294,3 +303,8 @@ + std::string(has_doubles() ? " cl_khr_fp64" : "") + std::string(has_halves() ? " cl_khr_fp16" : ""); } + +const void * +device::get_compiler_options(enum pipe_shader_ir ir) const { + return pipe->get_compiler_options(pipe, ir, PIPE_SHADER_COMPUTE); +} diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/device.hpp mesa-20.0.8/src/gallium/state_trackers/clover/core/device.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/device.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/device.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -90,6 +90,7 @@ friend class hard_event; friend std::set supported_formats(const context &, cl_mem_object_type); + const void *get_compiler_options(enum pipe_shader_ir ir) const; clover::platform &platform; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/error.hpp mesa-20.0.8/src/gallium/state_trackers/clover/core/error.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/error.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/error.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,6 @@ #ifndef CLOVER_CORE_ERROR_HPP #define CLOVER_CORE_ERROR_HPP -#define CL_TARGET_OPENCL_VERSION 220 #include "CL/cl.h" #include diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/kernel.cpp mesa-20.0.8/src/gallium/state_trackers/clover/core/kernel.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/kernel.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/kernel.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -161,8 +161,9 @@ // Bind kernel arguments. auto &m = kern.program().build(q->device()).binary; - auto margs = find(name_equals(kern.name()), m.syms).args; - auto msec = find(type_equals(module::section::text_executable), m.secs); + auto msym = find(name_equals(kern.name()), m.syms); + auto margs = msym.args; + auto msec = find(id_equals(msym.section), m.secs); auto explicit_arg = kern._args.begin(); for (auto &marg : margs) { diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/memory.cpp mesa-20.0.8/src/gallium/state_trackers/clover/core/memory.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/memory.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/memory.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -22,7 +22,7 @@ #include "core/memory.hpp" #include "core/resource.hpp" -#include "util/u_format.h" +#include "util/format/u_format.h" using namespace clover; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/object.hpp mesa-20.0.8/src/gallium/state_trackers/clover/core/object.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/object.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/object.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,6 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 220 #include "CL/cl.h" #include "core/error.hpp" @@ -52,7 +51,7 @@ "ICD requires CL API objects to be standard layout."); } - const _cl_icd_dispatch *dispatch; + const cl_icd_dispatch *dispatch; }; struct default_tag; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/program.cpp mesa-20.0.8/src/gallium/state_trackers/clover/core/program.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/program.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/program.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -20,8 +20,8 @@ // OTHER DEALINGS IN THE SOFTWARE. // +#include "core/compiler.hpp" #include "core/program.hpp" -#include "llvm/invocation.hpp" using namespace clover; @@ -51,9 +51,8 @@ std::string log; try { - assert(dev.ir_format() == PIPE_SHADER_IR_NATIVE); - const module m = llvm::compile_program(_source, headers, dev, opts, - log); + const module m = + compiler::compile_program(_source, headers, dev, opts, log); _builds[&dev] = { m, opts, log }; } catch (...) { _builds[&dev] = { module(), opts, log }; @@ -75,8 +74,7 @@ std::string log = _builds[&dev].log; try { - assert(dev.ir_format() == PIPE_SHADER_IR_NATIVE); - const module m = llvm::link_program(ms, dev, opts, log); + const module m = compiler::link_program(ms, dev, opts, log); _builds[&dev] = { m, opts, log }; } catch (...) { _builds[&dev] = { module(), opts, log }; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/core/resource.cpp mesa-20.0.8/src/gallium/state_trackers/clover/core/resource.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/core/resource.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/core/resource.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -24,7 +24,7 @@ #include "core/memory.hpp" #include "pipe/p_screen.h" #include "util/u_sampler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" using namespace clover; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/bitcode.cpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/bitcode.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/bitcode.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/bitcode.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,8 @@ #include "util/algorithm.hpp" #include -#if HAVE_LLVM < 0x0400 +#include +#if LLVM_VERSION_MAJOR < 4 #include #else #include @@ -50,18 +51,6 @@ using namespace clover::llvm; namespace { - std::map - get_symbol_offsets(const ::llvm::Module &mod) { - std::map offsets; - unsigned i = 0; - - for (const auto &name : map(std::mem_fn(&::llvm::Function::getName), - get_kernels(mod))) - offsets[name] = i++; - - return offsets; - } - std::vector emit_code(const ::llvm::Module &mod) { ::llvm::SmallVector data; diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/common.cpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/common.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/common.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/common.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,6 @@ #include "llvm/codegen.hpp" #include "llvm/metadata.hpp" -#define CL_TARGET_OPENCL_VERSION 220 #include "CL/cl.h" #include "pipe/p_state.h" @@ -177,7 +176,7 @@ module::section make_text_section(const std::vector &code) { - const pipe_llvm_program_header header { uint32_t(code.size()) }; + const pipe_binary_program_header header { uint32_t(code.size()) }; module::section text { 0, module::section::text_executable, header.num_bytes, {} }; @@ -197,8 +196,9 @@ const clang::CompilerInstance &c) { module m; - for (const auto &name : map(std::mem_fn(&Function::getName), + for (const auto &llvm_name : map(std::mem_fn(&Function::getName), get_kernels(mod))) { + const ::std::string name(llvm_name); if (offsets.count(name)) m.syms.emplace_back(name, 0, offsets.at(name), make_kernel_args(mod, name, c)); diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/native.cpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/native.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/codegen/native.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/codegen/native.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -105,7 +105,7 @@ std::vector emit_code(::llvm::Module &mod, const target &target, - TargetMachine::CodeGenFileType ft, + compat::CodeGenFileType ft, std::string &r_log) { std::string err; auto t = ::llvm::TargetRegistry::lookupTarget(target.triple, err); @@ -128,7 +128,7 @@ mod.setDataLayout(tm->createDataLayout()); tm->Options.MCOptions.AsmVerbose = - (ft == TargetMachine::CGFT_AssemblyFile); + (ft == compat::CGFT_AssemblyFile); if (compat::add_passes_to_emit_file(*tm, pm, os, ft)) fail(r_log, build_error(), "TargetMachine can't emit this file"); @@ -145,7 +145,7 @@ const clang::CompilerInstance &c, std::string &r_log) { const auto code = emit_code(mod, target, - TargetMachine::CGFT_ObjectFile, r_log); + compat::CGFT_ObjectFile, r_log); return build_module_common(mod, code, get_symbol_offsets(code, r_log), c); } @@ -156,7 +156,7 @@ try { std::unique_ptr< ::llvm::Module> cmod { compat::clone_module(mod) }; return as_string(emit_code(*cmod, target, - TargetMachine::CGFT_AssemblyFile, log)); + compat::CGFT_AssemblyFile, log)); } catch (...) { return "Couldn't output native disassembly: " + log; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/compat.hpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/compat.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/compat.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/compat.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,8 @@ #include "util/algorithm.hpp" -#if HAVE_LLVM < 0x0400 +#include +#if LLVM_VERSION_MAJOR < 4 #include #else #include @@ -48,7 +49,7 @@ #include #include #include -#if HAVE_LLVM >= 0x0400 +#if LLVM_VERSION_MAJOR >= 4 #include #else #include @@ -60,34 +61,50 @@ #include #include -#if HAVE_LLVM >= 0x0800 +#if LLVM_VERSION_MAJOR >= 8 #include #else #include #endif +#if LLVM_VERSION_MAJOR >= 10 +#include +#endif + namespace clover { namespace llvm { namespace compat { + +#if LLVM_VERSION_MAJOR >= 10 + const auto CGFT_ObjectFile = ::llvm::CGFT_ObjectFile; + const auto CGFT_AssemblyFile = ::llvm::CGFT_AssemblyFile; + typedef ::llvm::CodeGenFileType CodeGenFileType; +#else + const auto CGFT_ObjectFile = ::llvm::TargetMachine::CGFT_ObjectFile; + const auto CGFT_AssemblyFile = + ::llvm::TargetMachine::CGFT_AssemblyFile; + typedef ::llvm::TargetMachine::CodeGenFileType CodeGenFileType; +#endif + template unsigned target_address_space(const T &target, const AS lang_as) { const auto &map = target.getAddressSpaceMap(); -#if HAVE_LLVM >= 0x0500 +#if LLVM_VERSION_MAJOR >= 5 return map[static_cast(lang_as)]; #else return map[lang_as - clang::LangAS::Offset]; #endif } -#if HAVE_LLVM >= 0x1000 +#if LLVM_VERSION_MAJOR >= 10 const clang::InputKind ik_opencl = clang::Language::OpenCL; -#elif HAVE_LLVM >= 0x0500 +#elif LLVM_VERSION_MAJOR >= 5 const clang::InputKind ik_opencl = clang::InputKind::OpenCL; #else const clang::InputKind ik_opencl = clang::IK_OpenCL; #endif -#if HAVE_LLVM >= 0x0500 +#if LLVM_VERSION_MAJOR >= 5 const clang::LangStandard::Kind lang_opencl10 = clang::LangStandard::lang_opencl10; #else const clang::LangStandard::Kind lang_opencl10 = clang::LangStandard::lang_opencl; @@ -96,7 +113,7 @@ inline void add_link_bitcode_file(clang::CodeGenOptions &opts, const std::string &path) { -#if HAVE_LLVM >= 0x0500 +#if LLVM_VERSION_MAJOR >= 5 clang::CodeGenOptions::BitcodeFileToLink F; F.Filename = path; @@ -108,7 +125,7 @@ #endif } -#if HAVE_LLVM >= 0x0600 +#if LLVM_VERSION_MAJOR >= 6 const auto default_code_model = ::llvm::None; #else const auto default_code_model = ::llvm::CodeModel::Default; @@ -116,7 +133,7 @@ template void handle_module_error(M &mod, const F &f) { -#if HAVE_LLVM >= 0x0400 +#if LLVM_VERSION_MAJOR >= 4 if (::llvm::Error err = mod.takeError()) ::llvm::handleAllErrors(std::move(err), [&](::llvm::ErrorInfoBase &eib) { f(eib.message()); @@ -130,7 +147,7 @@ template void set_diagnostic_handler(::llvm::LLVMContext &ctx, T *diagnostic_handler, void *data) { -#if HAVE_LLVM >= 0x0600 +#if LLVM_VERSION_MAJOR >= 6 ctx.setDiagnosticHandlerCallBack(diagnostic_handler, data); #else ctx.setDiagnosticHandler(diagnostic_handler, data); @@ -140,7 +157,7 @@ inline std::unique_ptr< ::llvm::Module> clone_module(const ::llvm::Module &mod) { -#if HAVE_LLVM >= 0x0700 +#if LLVM_VERSION_MAJOR >= 7 return ::llvm::CloneModule(mod); #else return ::llvm::CloneModule(&mod); @@ -150,7 +167,7 @@ template void write_bitcode_to_file(const ::llvm::Module &mod, T &os) { -#if HAVE_LLVM >= 0x0700 +#if LLVM_VERSION_MAJOR >= 7 ::llvm::WriteBitcodeToFile(mod, os); #else ::llvm::WriteBitcodeToFile(&mod, os); @@ -160,16 +177,30 @@ template bool add_passes_to_emit_file(TM &tm, PM &pm, OS &os, FT &ft) { -#if HAVE_LLVM >= 0x0700 +#if LLVM_VERSION_MAJOR >= 7 return tm.addPassesToEmitFile(pm, os, nullptr, ft); #else return tm.addPassesToEmitFile(pm, os, ft); #endif } + template inline bool + create_compiler_invocation_from_args(clang::CompilerInvocation &cinv, + T copts, + clang::DiagnosticsEngine &diag) + { +#if LLVM_VERSION_MAJOR >= 10 + return clang::CompilerInvocation::CreateFromArgs( + cinv, copts, diag); +#else + return clang::CompilerInvocation::CreateFromArgs( + cinv, copts.data(), copts.data() + copts.size(), diag); +#endif + } + template T get_abi_type(const T &arg_type, const M &mod) { -#if HAVE_LLVM >= 0x0700 +#if LLVM_VERSION_MAJOR >= 7 return arg_type; #else ::llvm::DataLayout dl(&mod); diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/invocation.cpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/invocation.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/invocation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/invocation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -30,6 +30,9 @@ #include #include #include +#ifdef HAVE_CLOVER_SPIRV +#include +#endif #include #include @@ -51,6 +54,9 @@ #include "llvm/invocation.hpp" #include "llvm/metadata.hpp" #include "llvm/util.hpp" +#ifdef HAVE_CLOVER_SPIRV +#include "spirv/invocation.hpp" +#endif #include "util/algorithm.hpp" @@ -98,6 +104,7 @@ LLVMInitializeAllTargets(); LLVMInitializeAllTargetInfos(); LLVMInitializeAllTargetMCs(); + LLVMInitializeAllAsmParsers(); LLVMInitializeAllAsmPrinters(); targets_initialized = true; } @@ -182,7 +189,7 @@ } std::unique_ptr - create_compiler_instance(const device &dev, + create_compiler_instance(const device &dev, const std::string& ir_target, const std::vector &opts, std::string &r_log) { std::unique_ptr c { new clang::CompilerInstance }; @@ -196,11 +203,11 @@ const std::vector copts = map(std::mem_fn(&std::string::c_str), opts); - const target &target = dev.ir_target(); + const target &target = ir_target; const std::string &device_clc_version = dev.device_clc_version(); - if (!clang::CompilerInvocation::CreateFromArgs( - c->getInvocation(), copts.data(), copts.data() + copts.size(), diag)) + if (!compat::create_compiler_invocation_from_args( + c->getInvocation(), copts, diag)) throw invalid_build_options_error(); diag_buffer->FlushDiagnostics(diag); @@ -235,19 +242,29 @@ compile(LLVMContext &ctx, clang::CompilerInstance &c, const std::string &name, const std::string &source, const header_map &headers, const device &dev, - const std::string &opts, std::string &r_log) { + const std::string &opts, bool use_libclc, std::string &r_log) { c.getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly; c.getHeaderSearchOpts().UseBuiltinIncludes = true; c.getHeaderSearchOpts().UseStandardSystemIncludes = true; c.getHeaderSearchOpts().ResourceDir = CLANG_RESOURCE_DIR; - // Add libclc generic search path - c.getHeaderSearchOpts().AddPath(LIBCLC_INCLUDEDIR, - clang::frontend::Angled, - false, false); + if (use_libclc) { + // Add libclc generic search path + c.getHeaderSearchOpts().AddPath(LIBCLC_INCLUDEDIR, + clang::frontend::Angled, + false, false); + + // Add libclc include + c.getPreprocessorOpts().Includes.push_back("clc/clc.h"); + } else { + // Add opencl-c generic search path + c.getHeaderSearchOpts().AddPath(CLANG_RESOURCE_DIR, + clang::frontend::Angled, + false, false); - // Add libclc include - c.getPreprocessorOpts().Includes.push_back("clc/clc.h"); + // Add opencl include + c.getPreprocessorOpts().Includes.push_back("opencl-c.h"); + } // Add definition for the OpenCL version c.getPreprocessorOpts().addMacroDef("__OPENCL_VERSION__=" + @@ -279,8 +296,9 @@ // attribute will prevent Clang from creating illegal uses of // barrier() (e.g. Moving barrier() inside a conditional that is // no executed by all threads) during its optimizaton passes. - compat::add_link_bitcode_file(c.getCodeGenOpts(), - LIBCLC_LIBEXECDIR + dev.ir_target() + ".bc"); + if (use_libclc) + compat::add_link_bitcode_file(c.getCodeGenOpts(), + LIBCLC_LIBEXECDIR + dev.ir_target() + ".bc"); // Compile the code clang::EmitLLVMOnlyAction act(&ctx); @@ -301,8 +319,10 @@ debug::log(".cl", "// Options: " + opts + '\n' + source); auto ctx = create_context(r_log); - auto c = create_compiler_instance(dev, tokenize(opts + " input.cl"), r_log); - auto mod = compile(*ctx, *c, "input.cl", source, headers, dev, opts, r_log); + auto c = create_compiler_instance(dev, dev.ir_target(), + tokenize(opts + " input.cl"), r_log); + auto mod = compile(*ctx, *c, "input.cl", source, headers, dev, opts, true, + r_log); if (has_flag(debug::llvm)) debug::log(".ll", print_module_bitcode(*mod)); @@ -357,20 +377,20 @@ throw build_error(); } - return std::move(mod); + return mod; } } module clover::llvm::link_program(const std::vector &modules, - const device &dev, - const std::string &opts, std::string &r_log) { + const device &dev, const std::string &opts, + std::string &r_log) { std::vector options = tokenize(opts + " input.cl"); const bool create_library = count("-create-library", options); erase_if(equals("-create-library"), options); auto ctx = create_context(r_log); - auto c = create_compiler_instance(dev, options, r_log); + auto c = create_compiler_instance(dev, dev.ir_target(), options, r_log); auto mod = link(*ctx, *c, modules, r_log); optimize(*mod, c->getCodeGenOpts().OptimizationLevel, !create_library); @@ -395,3 +415,51 @@ unreachable("Unsupported IR."); } } + +#ifdef HAVE_CLOVER_SPIRV +module +clover::llvm::compile_to_spirv(const std::string &source, + const header_map &headers, + const device &dev, + const std::string &opts, + std::string &r_log) { + if (has_flag(debug::clc)) + debug::log(".cl", "// Options: " + opts + '\n' + source); + + auto ctx = create_context(r_log); + const std::string target = dev.address_bits() == 32u ? + "-spir-unknown-unknown" : + "-spir64-unknown-unknown"; + auto c = create_compiler_instance(dev, target, + tokenize(opts + " input.cl"), r_log); + auto mod = compile(*ctx, *c, "input.cl", source, headers, dev, opts, false, + r_log); + + if (has_flag(debug::llvm)) + debug::log(".ll", print_module_bitcode(*mod)); + + std::string error_msg; + if (!::llvm::regularizeLlvmForSpirv(mod.get(), error_msg)) { + r_log += "Failed to regularize LLVM IR for SPIR-V: " + error_msg + ".\n"; + throw error(CL_INVALID_VALUE); + } + + std::ostringstream os; + if (!::llvm::writeSpirv(mod.get(), os, error_msg)) { + r_log += "Translation from LLVM IR to SPIR-V failed: " + error_msg + ".\n"; + throw error(CL_INVALID_VALUE); + } + + const std::string osContent = os.str(); + std::vector binary(osContent.begin(), osContent.end()); + if (binary.empty()) { + r_log += "Failed to retrieve SPIR-V binary.\n"; + throw error(CL_INVALID_VALUE); + } + + if (has_flag(debug::spirv)) + debug::log(".spvasm", spirv::print_module(binary, dev.device_version())); + + return spirv::compile_program(binary, dev, r_log); +} +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/invocation.hpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/invocation.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/invocation.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/invocation.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -40,6 +40,14 @@ const device &device, const std::string &opts, std::string &r_log); + +#ifdef HAVE_CLOVER_SPIRV + module compile_to_spirv(const std::string &source, + const header_map &headers, + const device &dev, + const std::string &opts, + std::string &r_log); +#endif } } diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/metadata.hpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/metadata.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/metadata.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/metadata.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -32,69 +32,23 @@ #include "util/algorithm.hpp" #include +#include #include #include namespace clover { namespace llvm { namespace detail { - inline std::vector - get_kernel_nodes(const ::llvm::Module &mod) { - if (const ::llvm::NamedMDNode *n = - mod.getNamedMetadata("opencl.kernels")) - return { n->op_begin(), n->op_end() }; - else - return {}; - } - - inline std::function - is_kernel_node_for(const ::llvm::Function &f) { - return [&](const ::llvm::MDNode *n) { - using ::llvm::mdconst::dyn_extract; - return &f == dyn_extract< ::llvm::Function>(n->getOperand(0)); - }; - } - inline bool is_kernel(const ::llvm::Function &f) { -#if HAVE_LLVM >= 0x0309 return f.getMetadata("kernel_arg_type"); -#else - return clover::any_of(is_kernel_node_for(f), - get_kernel_nodes(*f.getParent())); -#endif } inline iterator_range< ::llvm::MDNode::op_iterator> get_kernel_metadata_operands(const ::llvm::Function &f, const std::string &name) { -#if HAVE_LLVM >= 0x0309 - // On LLVM v3.9+ kernel argument attributes are stored as - // function metadata. const auto data_node = f.getMetadata(name); return range(data_node->op_begin(), data_node->op_end()); -#else - using ::llvm::cast; - using ::llvm::dyn_cast; - const auto kernel_node = find(is_kernel_node_for(f), - get_kernel_nodes(*f.getParent())); - - const auto data_node = cast< ::llvm::MDNode>( - find([&](const ::llvm::MDOperand &op) { - if (auto m = dyn_cast< ::llvm::MDNode>(op)) - if (m->getNumOperands()) - if (auto m_name = dyn_cast< ::llvm::MDString>( - m->getOperand(0).get())) - return m_name->getString() == name; - - return false; - }, - kernel_node->operands())); - - // Skip the first operand node which is just the metadata - // attribute name. - return range(data_node->op_begin() + 1, data_node->op_end()); -#endif } } @@ -108,7 +62,7 @@ const std::string &name) { return ::llvm::cast< ::llvm::MDString>( detail::get_kernel_metadata_operands(f, name)[arg.getArgNo()]) - ->getString(); + ->getString().str(); } /// diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/llvm/util.hpp mesa-20.0.8/src/gallium/state_trackers/clover/llvm/util.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/llvm/util.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/llvm/util.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -101,7 +101,8 @@ enum flag { clc = 1 << 0, llvm = 1 << 1, - native = 1 << 2 + native = 1 << 2, + spirv = 1 << 3, }; inline bool @@ -111,6 +112,7 @@ { "llvm", llvm, "Dump the generated LLVM IR for all kernels." }, { "native", native, "Dump kernel assembly code for targets " "specifying PIPE_SHADER_IR_NATIVE" }, + { "spirv", spirv, "Dump the generated SPIR-V for all kernels." }, DEBUG_NAMED_VALUE_END }; static const unsigned flags = diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/Makefile.sources mesa-20.0.8/src/gallium/state_trackers/clover/Makefile.sources --- mesa-19.2.8/src/gallium/state_trackers/clover/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -62,3 +62,7 @@ llvm/invocation.hpp \ llvm/metadata.hpp \ llvm/util.hpp + +SPIRV_SOURCES := \ + spirv/invocation.cpp \ + spirv/invocation.hpp diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/meson.build mesa-20.0.8/src/gallium/state_trackers/clover/meson.build --- mesa-19.2.8/src/gallium/state_trackers/clover/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -19,12 +19,32 @@ # SOFTWARE. clover_cpp_args = [] +clover_opencl_cpp_args = [ + '-DCL_TARGET_OPENCL_VERSION=220', + '-DCL_USE_DEPRECATED_OPENCL_1_0_APIS', + '-DCL_USE_DEPRECATED_OPENCL_1_1_APIS', + '-DCL_USE_DEPRECATED_OPENCL_1_2_APIS', + '-DCL_USE_DEPRECATED_OPENCL_2_0_APIS', + '-DCL_USE_DEPRECATED_OPENCL_2_1_APIS' +] +clover_spirv_cpp_args = [] clover_incs = [inc_include, inc_src, inc_gallium, inc_gallium_aux] +# the CL header files declare attributes on the CL types. Compilers warn if +# we use them as template arguments. Disable the warning as there isn't +# anything we can do about it +if cpp.has_argument('-Wno-ignored-attributes') + clover_cpp_args += '-Wno-ignored-attributes' +endif + if with_opencl_icd clover_cpp_args += '-DHAVE_CLOVER_ICD' endif +if with_opencl_spirv + clover_spirv_cpp_args += '-DHAVE_CLOVER_SPIRV' +endif + libclllvm = static_library( 'clllvm', files( @@ -40,6 +60,9 @@ ), include_directories : clover_incs, cpp_args : [ + clover_cpp_args, + clover_opencl_cpp_args, + clover_spirv_cpp_args, cpp_vis_args, '-DLIBCLC_INCLUDEDIR="@0@/"'.format(dep_clc.get_pkgconfig_variable('includedir')), '-DLIBCLC_LIBEXECDIR="@0@/"'.format(dep_clc.get_pkgconfig_variable('libexecdir')), @@ -48,7 +71,25 @@ dep_llvm.version(), 'include', )), ], - dependencies : [dep_llvm, dep_elf], + dependencies : [dep_llvm, dep_elf, dep_llvmspirvlib], + override_options : clover_cpp_std, +) + +libclspirv = static_library( + 'clspirv', + files('spirv/invocation.cpp', 'spirv/invocation.hpp'), + include_directories : clover_incs, + cpp_args : [clover_opencl_cpp_args, clover_spirv_cpp_args, cpp_vis_args], + dependencies : [dep_spirv_tools], + override_options : clover_cpp_std, +) + +libclnir = static_library( + 'clnir', + files('nir/invocation.cpp', 'nir/invocation.hpp'), + include_directories : [clover_incs, inc_mesa], + dependencies : idep_nir, + cpp_args : [clover_opencl_cpp_args, clover_spirv_cpp_args, cpp_vis_args], override_options : clover_cpp_std, ) @@ -67,6 +108,7 @@ 'api/sampler.cpp', 'api/transfer.cpp', 'api/util.hpp', + 'core/compiler.hpp', 'core/context.cpp', 'core/context.hpp', 'core/device.cpp', @@ -111,7 +153,12 @@ 'clover', [clover_files, sha1_h], include_directories : clover_incs, - cpp_args : [clover_cpp_args, cpp_vis_args], - link_with : [libclllvm], + cpp_args : [ + clover_opencl_cpp_args, + clover_spirv_cpp_args, + clover_cpp_args, + cpp_vis_args + ], + link_with : [libclllvm, libclspirv, libclnir], override_options : clover_cpp_std, ) diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/nir/invocation.cpp mesa-20.0.8/src/gallium/state_trackers/clover/nir/invocation.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/nir/invocation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/nir/invocation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,173 @@ +// +// Copyright 2019 Karol Herbst +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#include "invocation.hpp" + +#include + +#include "core/device.hpp" +#include "core/error.hpp" +#include "pipe/p_state.h" +#include "util/algorithm.hpp" +#include "util/functional.hpp" + +#include +#include +#include +#include + +using namespace clover; + +#ifdef HAVE_CLOVER_SPIRV + +// Refs and unrefs the glsl_type_singleton. +static class glsl_type_ref { +public: + glsl_type_ref() { + glsl_type_singleton_init_or_ref(); + } + + ~glsl_type_ref() { + glsl_type_singleton_decref(); + } +} glsl_type_ref; + +static const nir_shader_compiler_options * +dev_get_nir_compiler_options(const device &dev) +{ + const void *co = dev.get_compiler_options(PIPE_SHADER_IR_NIR); + return static_cast(co); +} + +module clover::nir::spirv_to_nir(const module &mod, const device &dev, + std::string &r_log) +{ + struct spirv_to_nir_options spirv_options = {}; + spirv_options.environment = NIR_SPIRV_OPENCL; + spirv_options.caps.address = true; + spirv_options.caps.float64 = true; + spirv_options.caps.int8 = true; + spirv_options.caps.int16 = true; + spirv_options.caps.int64 = true; + spirv_options.caps.kernel = true; + spirv_options.constant_as_global = true; + + module m; + // We only insert one section. + assert(mod.secs.size() == 1); + auto §ion = mod.secs[0]; + + module::resource_id section_id = 0; + for (const auto &sym : mod.syms) { + assert(sym.section == 0); + + const auto *binary = + reinterpret_cast(section.data.data()); + const uint32_t *data = reinterpret_cast(binary->blob); + const size_t num_words = binary->num_bytes / 4; + const char *name = sym.name.c_str(); + auto *compiler_options = dev_get_nir_compiler_options(dev); + + nir_shader *nir = spirv_to_nir(data, num_words, nullptr, 0, + MESA_SHADER_KERNEL, name, + &spirv_options, compiler_options); + if (!nir) { + r_log += "Translation from SPIR-V to NIR for kernel \"" + sym.name + + "\" failed.\n"; + throw build_error(); + } + + nir->info.cs.local_size_variable = true; + nir_validate_shader(nir, "clover"); + + // Calculate input offsets. + unsigned offset = 0; + nir_foreach_variable_safe(var, &nir->inputs) { + offset = align(offset, glsl_get_cl_alignment(var->type)); + var->data.driver_location = offset; + offset += glsl_get_cl_size(var->type); + } + + // Inline all functions first. + // according to the comment on nir_inline_functions + NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); + + // Pick off the single entrypoint that we want. + foreach_list_typed_safe(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + exec_node_remove(&func->node); + } + assert(exec_list_length(&nir->functions) == 1); + + nir_validate_shader(nir, "clover after function inlining"); + + NIR_PASS_V(nir, nir_lower_constant_initializers, + static_cast(~nir_var_function_temp)); + + // copy propagate to prepare for lower_explicit_io + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_opt_copy_prop_vars); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_opt_dce); + + nir_variable_mode modes = (nir_variable_mode)( + nir_var_shader_in | + nir_var_mem_global | + nir_var_mem_shared); + nir_address_format format = nir->info.cs.ptr_size == 64 ? + nir_address_format_64bit_global : nir_address_format_32bit_global; + NIR_PASS_V(nir, nir_lower_explicit_io, modes, format); + + NIR_PASS_V(nir, nir_lower_system_values); + if (compiler_options->lower_int64_options) + NIR_PASS_V(nir, nir_lower_int64, + compiler_options->lower_int64_options); + + NIR_PASS_V(nir, nir_opt_dce); + + struct blob blob; + blob_init(&blob); + nir_serialize(&blob, nir, false); + + const pipe_binary_program_header header { uint32_t(blob.size) }; + module::section text { section_id, module::section::text_executable, header.num_bytes, {} }; + text.data.insert(text.data.end(), reinterpret_cast(&header), + reinterpret_cast(&header) + sizeof(header)); + text.data.insert(text.data.end(), blob.data, blob.data + blob.size); + + m.syms.emplace_back(sym.name, section_id, 0, sym.args); + m.secs.push_back(text); + section_id++; + } + return m; +} +#else +module clover::nir::spirv_to_nir(const module &mod, const device &dev, std::string &r_log) +{ + r_log += "SPIR-V support in clover is not enabled.\n"; + throw error(CL_LINKER_NOT_AVAILABLE); +} +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/nir/invocation.hpp mesa-20.0.8/src/gallium/state_trackers/clover/nir/invocation.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/nir/invocation.hpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/nir/invocation.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,36 @@ +// +// Copyright 2019 Karol Herbst +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef CLOVER_NIR_INVOCATION_HPP +#define CLOVER_NIR_INVOCATION_HPP + +#include "core/module.hpp" + +namespace clover { + class device; + namespace nir { + // converts a given spirv module to nir + module spirv_to_nir(const module &mod, const device &dev, std::string &r_log); + } +} + +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/spirv/invocation.cpp mesa-20.0.8/src/gallium/state_trackers/clover/spirv/invocation.cpp --- mesa-19.2.8/src/gallium/state_trackers/clover/spirv/invocation.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/spirv/invocation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,740 @@ +// +// Copyright 2018 Pierre Moreau +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#include "invocation.hpp" + +#include +#include +#include +#include + +#ifdef HAVE_CLOVER_SPIRV +#include +#include +#endif + +#include "core/error.hpp" +#include "core/platform.hpp" +#include "invocation.hpp" +#include "llvm/util.hpp" +#include "pipe/p_state.h" +#include "util/algorithm.hpp" +#include "util/functional.hpp" +#include "util/u_math.h" + +#include "compiler/spirv/spirv.h" + +#define SPIRV_HEADER_WORD_SIZE 5 + +using namespace clover; + +#ifdef HAVE_CLOVER_SPIRV +namespace { + + template + T get(const char *source, size_t index) { + const uint32_t *word_ptr = reinterpret_cast(source); + return static_cast(word_ptr[index]); + } + + enum module::argument::type + convert_storage_class(SpvStorageClass storage_class, std::string &err) { + switch (storage_class) { + case SpvStorageClassFunction: + return module::argument::scalar; + case SpvStorageClassUniformConstant: + return module::argument::global; + case SpvStorageClassWorkgroup: + return module::argument::local; + case SpvStorageClassCrossWorkgroup: + return module::argument::global; + default: + err += "Invalid storage type " + std::to_string(storage_class) + "\n"; + throw build_error(); + } + } + + enum module::argument::type + convert_image_type(SpvId id, SpvDim dim, SpvAccessQualifier access, + std::string &err) { + if (dim == SpvDim2D && access == SpvAccessQualifierReadOnly) + return module::argument::image2d_rd; + else if (dim == SpvDim2D && access == SpvAccessQualifierWriteOnly) + return module::argument::image2d_wr; + else if (dim == SpvDim3D && access == SpvAccessQualifierReadOnly) + return module::argument::image3d_rd; + else if (dim == SpvDim3D && access == SpvAccessQualifierWriteOnly) + return module::argument::image3d_wr; + else { + err += "Unknown access qualifier " + std::to_string(access) + + " or dimension " + std::to_string(dim) + " for image " + + std::to_string(id) + ".\n"; + throw build_error(); + } + } + + module::section + make_text_section(const std::vector &code, + enum module::section::type section_type) { + const pipe_binary_program_header header { uint32_t(code.size()) }; + module::section text { 0, section_type, header.num_bytes, {} }; + + text.data.insert(text.data.end(), reinterpret_cast(&header), + reinterpret_cast(&header) + sizeof(header)); + text.data.insert(text.data.end(), code.begin(), code.end()); + + return text; + } + + module + create_module_from_spirv(const std::vector &source, + size_t pointer_byte_size, + std::string &err) { + const size_t length = source.size() / sizeof(uint32_t); + size_t i = SPIRV_HEADER_WORD_SIZE; // Skip header + + std::string kernel_name; + size_t kernel_nb = 0u; + std::vector args; + + module m; + + std::unordered_map kernels; + std::unordered_map types; + std::unordered_map pointer_types; + std::unordered_map constants; + std::unordered_set packed_structures; + std::unordered_map> + func_param_attr_map; + + while (i < length) { + const auto inst = &source[i * sizeof(uint32_t)]; + const auto desc_word = get(inst, 0); + const auto opcode = static_cast(desc_word & SpvOpCodeMask); + const unsigned int num_operands = desc_word >> SpvWordCountShift; + + switch (opcode) { + case SpvOpEntryPoint: + if (get(inst, 1) == SpvExecutionModelKernel) + kernels.emplace(get(inst, 2), + source.data() + (i + 3u) * sizeof(uint32_t)); + break; + + case SpvOpDecorate: { + const auto id = get(inst, 1); + const auto decoration = get(inst, 2); + if (decoration == SpvDecorationCPacked) + packed_structures.emplace(id); + else if (decoration == SpvDecorationFuncParamAttr) { + const auto attribute = + get(inst, 3u); + func_param_attr_map[id].push_back(attribute); + } + break; + } + + case SpvOpGroupDecorate: { + const auto group_id = get(inst, 1); + if (packed_structures.count(group_id)) { + for (unsigned int i = 2u; i < num_operands; ++i) + packed_structures.emplace(get(inst, i)); + } + const auto func_param_attr_iter = + func_param_attr_map.find(group_id); + if (func_param_attr_iter != func_param_attr_map.end()) { + for (unsigned int i = 2u; i < num_operands; ++i) + func_param_attr_map.emplace(get(inst, i), + func_param_attr_iter->second); + } + break; + } + + case SpvOpConstant: + // We only care about constants that represent the size of arrays. + // If they are passed as argument, they will never be more than + // 4GB-wide, and even if they did, a clover::module::argument size + // is represented by an int. + constants[get(inst, 2)] = get(inst, 3u); + break; + + case SpvOpTypeInt: // FALLTHROUGH + case SpvOpTypeFloat: { + const auto size = get(inst, 2) / 8u; + types[get(inst, 1)] = { module::argument::scalar, size, + size, size, + module::argument::zero_ext }; + break; + } + + case SpvOpTypeArray: { + const auto id = get(inst, 1); + const auto type_id = get(inst, 2); + const auto types_iter = types.find(type_id); + if (types_iter == types.end()) + break; + + const auto constant_id = get(inst, 3); + const auto constants_iter = constants.find(constant_id); + if (constants_iter == constants.end()) { + err += "Constant " + std::to_string(constant_id) + + " is missing\n"; + throw build_error(); + } + const auto elem_size = types_iter->second.size; + const auto elem_nbs = constants_iter->second; + const auto size = elem_size * elem_nbs; + types[id] = { module::argument::scalar, size, size, + types_iter->second.target_align, + module::argument::zero_ext }; + break; + } + + case SpvOpTypeStruct: { + const auto id = get(inst, 1); + const bool is_packed = packed_structures.count(id); + + unsigned struct_size = 0u; + unsigned struct_align = 1u; + for (unsigned j = 2u; j < num_operands; ++j) { + const auto type_id = get(inst, j); + const auto types_iter = types.find(type_id); + + // If a type was not found, that means it is not one of the + // types allowed as kernel arguments. And since the module has + // been validated, this means this type is not used for kernel + // arguments, and therefore can be ignored. + if (types_iter == types.end()) + break; + + const auto alignment = is_packed ? 1u + : types_iter->second.target_align; + const auto padding = (-struct_size) & (alignment - 1u); + struct_size += padding + types_iter->second.target_size; + struct_align = std::max(struct_align, alignment); + } + struct_size += (-struct_size) & (struct_align - 1u); + types[id] = { module::argument::scalar, struct_size, struct_size, + struct_align, module::argument::zero_ext }; + break; + } + + case SpvOpTypeVector: { + const auto id = get(inst, 1); + const auto type_id = get(inst, 2); + const auto types_iter = types.find(type_id); + + // If a type was not found, that means it is not one of the + // types allowed as kernel arguments. And since the module has + // been validated, this means this type is not used for kernel + // arguments, and therefore can be ignored. + if (types_iter == types.end()) + break; + + const auto elem_size = types_iter->second.size; + const auto elem_nbs = get(inst, 3); + const auto size = elem_size * elem_nbs; + types[id] = { module::argument::scalar, size, size, size, + module::argument::zero_ext }; + break; + } + + case SpvOpTypeForwardPointer: // FALLTHROUGH + case SpvOpTypePointer: { + const auto id = get(inst, 1); + const auto storage_class = get(inst, 2); + // Input means this is for a builtin variable, which can not be + // passed as an argument to a kernel. + if (storage_class == SpvStorageClassInput) + break; + types[id] = { convert_storage_class(storage_class, err), + sizeof(cl_mem), + static_cast(pointer_byte_size), + static_cast(pointer_byte_size), + module::argument::zero_ext }; + if (opcode == SpvOpTypePointer) + pointer_types[id] = get(inst, 3); + break; + } + + case SpvOpTypeSampler: + types[get(inst, 1)] = { module::argument::sampler, + sizeof(cl_sampler) }; + break; + + case SpvOpTypeImage: { + const auto id = get(inst, 1); + const auto dim = get(inst, 3); + const auto access = get(inst, 9); + types[id] = { convert_image_type(id, dim, access, err), + sizeof(cl_mem), sizeof(cl_mem), sizeof(cl_mem), + module::argument::zero_ext }; + break; + } + + case SpvOpTypePipe: // FALLTHROUGH + case SpvOpTypeQueue: { + err += "TypePipe and TypeQueue are valid SPIR-V 1.0 types, but are " + "not available in the currently supported OpenCL C version." + "\n"; + throw build_error(); + } + + case SpvOpFunction: { + const auto kernels_iter = kernels.find(get(inst, 2)); + if (kernels_iter != kernels.end()) + kernel_name = kernels_iter->second; + break; + } + + case SpvOpFunctionParameter: { + if (kernel_name.empty()) + break; + + const auto type_id = get(inst, 1); + auto arg = types.find(type_id)->second; + const auto &func_param_attr_iter = + func_param_attr_map.find(get(inst, 2)); + if (func_param_attr_iter != func_param_attr_map.end()) { + for (auto &i : func_param_attr_iter->second) { + switch (i) { + case SpvFunctionParameterAttributeSext: + arg.ext_type = module::argument::sign_ext; + break; + case SpvFunctionParameterAttributeZext: + arg.ext_type = module::argument::zero_ext; + break; + case SpvFunctionParameterAttributeByVal: { + const SpvId ptr_type_id = + pointer_types.find(type_id)->second; + arg = types.find(ptr_type_id)->second; + break; + } + default: + break; + } + } + } + args.emplace_back(arg); + break; + } + + case SpvOpFunctionEnd: + if (kernel_name.empty()) + break; + m.syms.emplace_back(kernel_name, 0, kernel_nb, args); + ++kernel_nb; + kernel_name.clear(); + args.clear(); + break; + + default: + break; + } + + i += num_operands; + } + + m.secs.push_back(make_text_section(source, + module::section::text_intermediate)); + return m; + } + + bool + check_capabilities(const device &dev, const std::vector &source, + std::string &r_log) { + const size_t length = source.size() / sizeof(uint32_t); + size_t i = SPIRV_HEADER_WORD_SIZE; // Skip header + + while (i < length) { + const auto desc_word = get(source.data(), i); + const auto opcode = static_cast(desc_word & SpvOpCodeMask); + const unsigned int num_operands = desc_word >> SpvWordCountShift; + + if (opcode != SpvOpCapability) + break; + + const auto capability = get(source.data(), i + 1u); + switch (capability) { + // Mandatory capabilities + case SpvCapabilityAddresses: + case SpvCapabilityFloat16Buffer: + case SpvCapabilityGroups: + case SpvCapabilityInt64: + case SpvCapabilityInt16: + case SpvCapabilityInt8: + case SpvCapabilityKernel: + case SpvCapabilityLinkage: + case SpvCapabilityVector16: + break; + // Optional capabilities + case SpvCapabilityImageBasic: + case SpvCapabilityLiteralSampler: + case SpvCapabilitySampled1D: + case SpvCapabilityImage1D: + case SpvCapabilitySampledBuffer: + case SpvCapabilityImageBuffer: + if (!dev.image_support()) { + r_log += "Capability 'ImageBasic' is not supported.\n"; + return false; + } + break; + case SpvCapabilityFloat64: + if (!dev.has_doubles()) { + r_log += "Capability 'Float64' is not supported.\n"; + return false; + } + break; + // Enabled through extensions + case SpvCapabilityFloat16: + if (!dev.has_halves()) { + r_log += "Capability 'Float16' is not supported.\n"; + return false; + } + break; + case SpvCapabilityInt64Atomics: + if (!dev.has_int64_atomics()) { + r_log += "Capability 'Int64Atomics' is not supported.\n"; + return false; + } + break; + default: + r_log += "Capability '" + std::to_string(capability) + + "' is not supported.\n"; + return false; + } + + i += num_operands; + } + + return true; + } + + bool + check_extensions(const device &dev, const std::vector &source, + std::string &r_log) { + const size_t length = source.size() / sizeof(uint32_t); + size_t i = SPIRV_HEADER_WORD_SIZE; // Skip header + + while (i < length) { + const auto desc_word = get(source.data(), i); + const auto opcode = static_cast(desc_word & SpvOpCodeMask); + const unsigned int num_operands = desc_word >> SpvWordCountShift; + + if (opcode == SpvOpCapability) { + i += num_operands; + continue; + } + if (opcode != SpvOpExtension) + break; + + const char *extension = source.data() + (i + 1u) * sizeof(uint32_t); + const std::string device_extensions = dev.supported_extensions(); + const std::string platform_extensions = + dev.platform.supported_extensions(); + if (device_extensions.find(extension) == std::string::npos && + platform_extensions.find(extension) == std::string::npos) { + r_log += "Extension '" + std::string(extension) + + "' is not supported.\n"; + return false; + } + + i += num_operands; + } + + return true; + } + + bool + check_memory_model(const device &dev, const std::vector &source, + std::string &r_log) { + const size_t length = source.size() / sizeof(uint32_t); + size_t i = SPIRV_HEADER_WORD_SIZE; // Skip header + + while (i < length) { + const auto desc_word = get(source.data(), i); + const auto opcode = static_cast(desc_word & SpvOpCodeMask); + const unsigned int num_operands = desc_word >> SpvWordCountShift; + + switch (opcode) { + case SpvOpMemoryModel: + switch (get(source.data(), i + 1u)) { + case SpvAddressingModelPhysical32: + return dev.address_bits() == 32; + case SpvAddressingModelPhysical64: + return dev.address_bits() == 64; + default: + unreachable("Only Physical32 and Physical64 are valid for OpenCL, and the binary was already validated"); + return false; + } + break; + default: + break; + } + + i += num_operands; + } + + return false; + } + + // Copies the input binary and convert it to the endianness of the host CPU. + std::vector + spirv_to_cpu(const std::vector &binary) + { + const uint32_t first_word = get(binary.data(), 0u); + if (first_word == SpvMagicNumber) + return binary; + + std::vector cpu_endianness_binary(binary.size()); + for (size_t i = 0; i < (binary.size() / 4u); ++i) { + const uint32_t word = get(binary.data(), i); + reinterpret_cast(cpu_endianness_binary.data())[i] = + util_bswap32(word); + } + + return cpu_endianness_binary; + } + +#ifdef HAVE_CLOVER_SPIRV + std::string + format_validator_msg(spv_message_level_t level, const char * /* source */, + const spv_position_t &position, const char *message) { + std::string level_str; + switch (level) { + case SPV_MSG_FATAL: + level_str = "Fatal"; + break; + case SPV_MSG_INTERNAL_ERROR: + level_str = "Internal error"; + break; + case SPV_MSG_ERROR: + level_str = "Error"; + break; + case SPV_MSG_WARNING: + level_str = "Warning"; + break; + case SPV_MSG_INFO: + level_str = "Info"; + break; + case SPV_MSG_DEBUG: + level_str = "Debug"; + break; + } + return "[" + level_str + "] At word No." + + std::to_string(position.index) + ": \"" + message + "\"\n"; + } + + spv_target_env + convert_opencl_str_to_target_env(const std::string &opencl_version) { + if (opencl_version == "2.2") { + return SPV_ENV_OPENCL_2_2; + } else if (opencl_version == "2.1") { + return SPV_ENV_OPENCL_2_1; + } else if (opencl_version == "2.0") { + return SPV_ENV_OPENCL_2_0; + } else if (opencl_version == "1.2" || + opencl_version == "1.1" || + opencl_version == "1.0") { + // SPIR-V is only defined for OpenCL >= 1.2, however some drivers + // might use it with OpenCL 1.0 and 1.1. + return SPV_ENV_OPENCL_1_2; + } else { + throw build_error("Invalid OpenCL version"); + } + } +#endif + +} + +module +clover::spirv::compile_program(const std::vector &binary, + const device &dev, std::string &r_log) { + std::vector source = spirv_to_cpu(binary); + + if (!is_valid_spirv(source, dev.device_version(), r_log)) + throw build_error(); + + if (!check_capabilities(dev, source, r_log)) + throw build_error(); + if (!check_extensions(dev, source, r_log)) + throw build_error(); + if (!check_memory_model(dev, source, r_log)) + throw build_error(); + + return create_module_from_spirv(source, + dev.address_bits() == 32 ? 4u : 8u, r_log); +} + +module +clover::spirv::link_program(const std::vector &modules, + const device &dev, const std::string &opts, + std::string &r_log) { + std::vector options = clover::llvm::tokenize(opts); + + bool create_library = false; + + std::string ignored_options; + for (const std::string &option : options) { + if (option == "-create-library") { + create_library = true; + } else { + ignored_options += "'" + option + "' "; + } + } + if (!ignored_options.empty()) { + r_log += "Ignoring the following link options: " + ignored_options + + "\n"; + } + + spvtools::LinkerOptions linker_options; + linker_options.SetCreateLibrary(create_library); + + module m; + + const auto section_type = create_library ? module::section::text_library : + module::section::text_executable; + + std::vector sections; + sections.reserve(modules.size()); + std::vector lengths; + lengths.reserve(modules.size()); + + auto const validator_consumer = [&r_log](spv_message_level_t level, + const char *source, + const spv_position_t &position, + const char *message) { + r_log += format_validator_msg(level, source, position, message); + }; + + for (const auto &mod : modules) { + const auto &msec = find([](const module::section &sec) { + return sec.type == module::section::text_intermediate || + sec.type == module::section::text_library; + }, mod.secs); + + const auto c_il = ((struct pipe_binary_program_header*)msec.data.data())->blob; + const auto length = msec.size; + + sections.push_back(reinterpret_cast(c_il)); + lengths.push_back(length / sizeof(uint32_t)); + } + + std::vector linked_binary; + + const std::string opencl_version = dev.device_version(); + const spv_target_env target_env = + convert_opencl_str_to_target_env(opencl_version); + + const spvtools::MessageConsumer consumer = validator_consumer; + spvtools::Context context(target_env); + context.SetMessageConsumer(std::move(consumer)); + + if (Link(context, sections.data(), lengths.data(), sections.size(), + &linked_binary, linker_options) != SPV_SUCCESS) + throw error(CL_LINK_PROGRAM_FAILURE); + + std::vector final_binary{ + reinterpret_cast(linked_binary.data()), + reinterpret_cast(linked_binary.data() + + linked_binary.size()) }; + if (!is_valid_spirv(final_binary, opencl_version, r_log)) + throw error(CL_LINK_PROGRAM_FAILURE); + + for (const auto &mod : modules) + m.syms.insert(m.syms.end(), mod.syms.begin(), mod.syms.end()); + + m.secs.emplace_back(make_text_section(final_binary, section_type)); + + return m; +} + +bool +clover::spirv::is_valid_spirv(const std::vector &binary, + const std::string &opencl_version, + std::string &r_log) { + auto const validator_consumer = + [&r_log](spv_message_level_t level, const char *source, + const spv_position_t &position, const char *message) { + r_log += format_validator_msg(level, source, position, message); + }; + + const spv_target_env target_env = + convert_opencl_str_to_target_env(opencl_version); + spvtools::SpirvTools spvTool(target_env); + spvTool.SetMessageConsumer(validator_consumer); + + return spvTool.Validate(reinterpret_cast(binary.data()), + binary.size() / 4u); +} + +std::string +clover::spirv::print_module(const std::vector &binary, + const std::string &opencl_version) { + const spv_target_env target_env = + convert_opencl_str_to_target_env(opencl_version); + spvtools::SpirvTools spvTool(target_env); + spv_context spvContext = spvContextCreate(target_env); + if (!spvContext) + return "Failed to create an spv_context for disassembling the module."; + + spv_text disassembly; + spvBinaryToText(spvContext, + reinterpret_cast(binary.data()), + binary.size() / 4u, SPV_BINARY_TO_TEXT_OPTION_NONE, + &disassembly, nullptr); + spvContextDestroy(spvContext); + + const std::string disassemblyStr = disassembly->str; + spvTextDestroy(disassembly); + + return disassemblyStr; +} + +#else +bool +clover::spirv::is_valid_spirv(const std::vector &/*binary*/, + const std::string &/*opencl_version*/, + std::string &/*r_log*/) { + return false; +} + +module +clover::spirv::compile_program(const std::vector &binary, + const device &dev, std::string &r_log) { + r_log += "SPIR-V support in clover is not enabled.\n"; + throw build_error(); +} + +module +clover::spirv::link_program(const std::vector &/*modules*/, + const device &/*dev*/, const std::string &/*opts*/, + std::string &r_log) { + r_log += "SPIR-V support in clover is not enabled.\n"; + throw error(CL_LINKER_NOT_AVAILABLE); +} + +std::string +clover::spirv::print_module(const std::vector &binary, + const std::string &opencl_version) { + return std::string(); +} +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/spirv/invocation.hpp mesa-20.0.8/src/gallium/state_trackers/clover/spirv/invocation.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/spirv/invocation.hpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/spirv/invocation.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,56 @@ +// +// Copyright 2018 Pierre Moreau +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef CLOVER_SPIRV_INVOCATION_HPP +#define CLOVER_SPIRV_INVOCATION_HPP + +#include "core/context.hpp" +#include "core/module.hpp" +#include "core/program.hpp" + +namespace clover { + namespace spirv { + // Returns whether the given binary is considered valid for the given + // OpenCL version. + // + // It uses SPIRV-Tools validator to do the validation, and potential + // warnings and errors are appended to |r_log|. + bool is_valid_spirv(const std::vector &binary, + const std::string &opencl_version, + std::string &r_log); + + // Creates a clover module out of the given SPIR-V binary. + module compile_program(const std::vector &binary, + const device &dev, std::string &r_log); + + // Combines multiple clover modules into a single one, resolving + // link dependencies between them. + module link_program(const std::vector &modules, const device &dev, + const std::string &opts, std::string &r_log); + + // Returns a textual representation of the given binary. + std::string print_module(const std::vector &binary, + const std::string &opencl_version); + } +} + +#endif diff -Nru mesa-19.2.8/src/gallium/state_trackers/clover/util/functional.hpp mesa-20.0.8/src/gallium/state_trackers/clover/util/functional.hpp --- mesa-19.2.8/src/gallium/state_trackers/clover/util/functional.hpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/clover/util/functional.hpp 2020-06-12 01:21:17.000000000 +0000 @@ -347,6 +347,21 @@ const std::string &name; }; + class id_equals { + public: + id_equals(const uint32_t id) : id(id) { + } + + template + bool + operator()(const T &x) const { + return x.id == id; + } + + private: + const uint32_t id; + }; + template class key_equals_t { public: diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri2.c mesa-20.0.8/src/gallium/state_trackers/dri/dri2.c --- mesa-19.2.8/src/gallium/state_trackers/dri/dri2.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri2.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ #include "util/disk_cache.h" #include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_debug.h" #include "state_tracker/drm_driver.h" #include "state_tracker/st_cb_bufferobjects.h" @@ -153,6 +153,12 @@ * may occur as the stvis->color_format. */ switch(format) { + case PIPE_FORMAT_R16G16B16A16_FLOAT: + depth = 64; + break; + case PIPE_FORMAT_R16G16B16X16_FLOAT: + depth = 48; + break; case PIPE_FORMAT_B10G10R10A2_UNORM: case PIPE_FORMAT_R10G10B10A2_UNORM: case PIPE_FORMAT_BGRA8888_UNORM: @@ -231,6 +237,12 @@ } switch (pf) { + case PIPE_FORMAT_R16G16B16A16_FLOAT: + image_format = __DRI_IMAGE_FORMAT_ABGR16161616F; + break; + case PIPE_FORMAT_R16G16B16X16_FLOAT: + image_format = __DRI_IMAGE_FORMAT_XBGR16161616F; + break; case PIPE_FORMAT_B5G5R5A1_UNORM: image_format = __DRI_IMAGE_FORMAT_ARGB1555; break; @@ -304,6 +316,12 @@ bind |= PIPE_BIND_SHARED; switch (format) { + case 64: + pf = PIPE_FORMAT_R16G16B16A16_FLOAT; + break; + case 48: + pf = PIPE_FORMAT_R16G16B16X16_FLOAT; + break; case 32: pf = PIPE_FORMAT_BGRA8888_UNORM; break; @@ -720,7 +738,7 @@ struct pipe_resource templ; unsigned tex_usage = 0; int i; - bool is_yuv = util_format_is_yuv(map->pipe_format); + bool use_lowered = false; if (pscreen->is_format_supported(pscreen, map->pipe_format, screen->target, 0, 0, PIPE_BIND_RENDER_TARGET)) @@ -729,17 +747,15 @@ PIPE_BIND_SAMPLER_VIEW)) tex_usage |= PIPE_BIND_SAMPLER_VIEW; - if (!tex_usage && is_yuv) { + if (!tex_usage && util_format_is_yuv(map->pipe_format)) { /* YUV format sampling can be emulated by the Mesa state tracker by * using multiple samplers of varying formats. * If no tex_usage is set and we detect a YUV format, - * test for support of the first plane's sampler format and + * test for support of all planes' sampler formats and * add sampler view usage. */ - if (pscreen->is_format_supported(pscreen, - dri2_get_pipe_format_for_dri_format(map->planes[0].dri_format), - screen->target, 0, 0, - PIPE_BIND_SAMPLER_VIEW)) + use_lowered = true; + if (dri2_yuv_dma_buf_supported(screen, map)) tex_usage |= PIPE_BIND_SAMPLER_VIEW; } @@ -757,19 +773,20 @@ templ.depth0 = 1; templ.array_size = 1; - for (i = num_handles - 1; i >= 0; i--) { + for (i = (use_lowered ? map->nplanes : num_handles) - 1; i >= 0; i--) { struct pipe_resource *tex; templ.width0 = width >> map->planes[i].width_shift; templ.height0 = height >> map->planes[i].height_shift; - if (is_yuv) + if (use_lowered) templ.format = dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format); else templ.format = map->pipe_format; assert(templ.format != PIPE_FORMAT_NONE); tex = pscreen->resource_from_handle(pscreen, - &templ, &whandle[i], PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE); + &templ, &whandle[use_lowered ? map->planes[i].buffer_index : i], + PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE); if (!tex) { pipe_resource_reference(&img->texture, NULL); FREE(img); @@ -877,8 +894,8 @@ } switch (fourcc) { - case __DRI_IMAGE_FOURCC_YUYV: - case __DRI_IMAGE_FOURCC_UYVY: + case DRM_FORMAT_YUYV: + case DRM_FORMAT_UYVY: expected_num_fds = 1; break; default: @@ -893,25 +910,23 @@ memset(whandles, 0, sizeof(whandles)); - for (i = 0; i < num_handles; i++) { - int fdnum = i >= num_fds ? 0 : i; - int index = i >= map->nplanes ? i : map->planes[i].buffer_index; - if (fds[fdnum] < 0) { + for (i = 0; i < num_fds; i++) { + if (fds[i] < 0) { err = __DRI_IMAGE_ERROR_BAD_ALLOC; goto exit; } whandles[i].type = WINSYS_HANDLE_TYPE_FD; - whandles[i].handle = (unsigned)fds[fdnum]; - whandles[i].stride = (unsigned)strides[index]; - whandles[i].offset = (unsigned)offsets[index]; + whandles[i].handle = (unsigned)fds[i]; + whandles[i].stride = (unsigned)strides[i]; + whandles[i].offset = (unsigned)offsets[i]; whandles[i].format = map->pipe_format; whandles[i].modifier = modifier; - whandles[i].plane = index; + whandles[i].plane = i; } img = dri2_create_image_from_winsys(_screen, width, height, map, - num_handles, whandles, loaderPrivate); + num_fds, whandles, loaderPrivate); if(img == NULL) { err = __DRI_IMAGE_ERROR_BAD_ALLOC; goto exit; @@ -920,6 +935,7 @@ img->dri_components = map->dri_components; img->dri_fourcc = fourcc; img->dri_format = map->dri_format; + img->imported_dmabuf = TRUE; exit: if (error) @@ -1090,10 +1106,10 @@ return false; } + usage = PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE; + if (image->use & __DRI_IMAGE_USE_BACKBUFFER) - usage = PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - else - usage = PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE; + usage |= PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; if (!pscreen->resource_get_handle(pscreen, NULL, image->texture, &whandle, usage)) @@ -1176,10 +1192,10 @@ return false; } + handle_usage = PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE; + if (image->use & __DRI_IMAGE_USE_BACKBUFFER) - handle_usage = PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - else - handle_usage = PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE; + handle_usage |= PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; if (!dri2_resource_get_param(image, param, handle_usage, &res_param)) return false; @@ -1379,7 +1395,8 @@ (pscreen->is_format_supported(pscreen, format, screen->target, 0, 0, PIPE_BIND_RENDER_TARGET) || pscreen->is_format_supported(pscreen, format, screen->target, 0, 0, - PIPE_BIND_SAMPLER_VIEW))) { + PIPE_BIND_SAMPLER_VIEW) || + dri2_yuv_dma_buf_supported(screen, map))) { pscreen->query_dmabuf_modifiers(pscreen, format, max, modifiers, external_only, count); return true; @@ -1499,11 +1516,11 @@ if (flush_flag == __BLIT_FLAG_FLUSH) { pipe->flush_resource(pipe, dst->texture); - ctx->st->flush(ctx->st, 0, NULL); + ctx->st->flush(ctx->st, 0, NULL, NULL, NULL); } else if (flush_flag == __BLIT_FLAG_FINISH) { screen = dri_screen(ctx->sPriv)->base.screen; pipe->flush_resource(pipe, dst->texture); - ctx->st->flush(ctx->st, 0, &fence); + ctx->st->flush(ctx->st, 0, &fence, NULL, NULL); (void) screen->fence_finish(screen, NULL, fence, PIPE_TIMEOUT_INFINITE); screen->fence_reference(screen, &fence, NULL); } @@ -2055,8 +2072,7 @@ if (!pscreen) goto release_pipe; - screen->default_throttle_frames = - pscreen->get_param(pscreen, PIPE_CAP_MAX_FRAMES_IN_FLIGHT); + screen->throttle = pscreen->get_param(pscreen, PIPE_CAP_THROTTLE); if (pscreen->resource_create_with_modifiers) dri2ImageExtension.createImageWithModifiers = diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_context.c mesa-20.0.8/src/gallium/state_trackers/dri/dri_context.c --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -241,7 +241,7 @@ * to avoid having to add code elsewhere to cope with flushing a * partially destroyed context. */ - ctx->st->flush(ctx->st, 0, NULL); + ctx->st->flush(ctx->st, 0, NULL, NULL, NULL); ctx->st->destroy(ctx->st); free(ctx); } diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_drawable.c mesa-20.0.8/src/gallium/state_trackers/dri/dri_drawable.c --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_drawable.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_drawable.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,15 +34,12 @@ #include "dri_drawable.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" static uint32_t drifb_ID = 0; -static void -swap_fences_unref(struct dri_drawable *draw); - static bool dri_st_framebuffer_validate(struct st_context_iface *stctx, struct st_framebuffer_iface *stfbi, @@ -179,9 +176,6 @@ drawable->screen = screen; drawable->sPriv = sPriv; drawable->dPriv = dPriv; - drawable->desired_fences = screen->default_throttle_frames; - if (drawable->desired_fences > DRI_SWAP_FENCES_MAX) - drawable->desired_fences = DRI_SWAP_FENCES_MAX; dPriv->driverPrivate = (void *)drawable; p_atomic_set(&drawable->base.stamp, 1); @@ -209,7 +203,8 @@ for (i = 0; i < ST_ATTACHMENT_COUNT; i++) pipe_resource_reference(&drawable->msaa_textures[i], NULL); - swap_fences_unref(drawable); + screen->base.screen->fence_reference(screen->base.screen, + &drawable->throttle_fence, NULL); /* Notify the st manager that this drawable is no longer valid */ stapi->destroy_drawable(stapi, &drawable->base); @@ -273,6 +268,9 @@ if (format == __DRI_TEXTURE_FORMAT_RGB) { /* only need to cover the formats recognized by dri_fill_st_visual */ switch (internal_format) { + case PIPE_FORMAT_R16G16B16A16_FLOAT: + internal_format = PIPE_FORMAT_R16G16B16X16_FLOAT; + break; case PIPE_FORMAT_B10G10R10A2_UNORM: internal_format = PIPE_FORMAT_B10G10R10X2_UNORM; break; @@ -346,75 +344,6 @@ } } - -/** - * swap_fences_pop_front - pull a fence from the throttle queue - * - * If the throttle queue is filled to the desired number of fences, - * pull fences off the queue until the number is less than the desired - * number of fences, and return the last fence pulled. - */ -static struct pipe_fence_handle * -swap_fences_pop_front(struct dri_drawable *draw) -{ - struct pipe_screen *screen = draw->screen->base.screen; - struct pipe_fence_handle *fence = NULL; - - if (draw->desired_fences == 0) - return NULL; - - if (draw->cur_fences >= draw->desired_fences) { - screen->fence_reference(screen, &fence, draw->swap_fences[draw->tail]); - screen->fence_reference(screen, &draw->swap_fences[draw->tail++], NULL); - draw->tail &= DRI_SWAP_FENCES_MASK; - --draw->cur_fences; - } - return fence; -} - - -/** - * swap_fences_push_back - push a fence onto the throttle queue - * - * push a fence onto the throttle queue and pull fences of the queue - * so that the desired number of fences are on the queue. - */ -static void -swap_fences_push_back(struct dri_drawable *draw, - struct pipe_fence_handle *fence) -{ - struct pipe_screen *screen = draw->screen->base.screen; - - if (!fence || draw->desired_fences == 0) - return; - - while(draw->cur_fences == draw->desired_fences) - swap_fences_pop_front(draw); - - draw->cur_fences++; - screen->fence_reference(screen, &draw->swap_fences[draw->head++], - fence); - draw->head &= DRI_SWAP_FENCES_MASK; -} - - -/** - * swap_fences_unref - empty the throttle queue - * - * pulls fences of the throttle queue until it is empty. - */ -static void -swap_fences_unref(struct dri_drawable *draw) -{ - struct pipe_screen *screen = draw->screen->base.screen; - - while(draw->cur_fences) { - screen->fence_reference(screen, &draw->swap_fences[draw->tail++], NULL); - draw->tail &= DRI_SWAP_FENCES_MASK; - --draw->cur_fences; - } -} - void dri_pipe_blit(struct pipe_context *pipe, struct pipe_resource *dst, @@ -475,6 +404,56 @@ pp_run(ctx->pp, src, src, zsbuf); } +struct notify_before_flush_cb_args { + struct dri_context *ctx; + struct dri_drawable *drawable; + unsigned flags; + enum __DRI2throttleReason reason; + bool swap_msaa_buffers; +}; + +static void +notify_before_flush_cb(void* _args) +{ + struct notify_before_flush_cb_args *args = (struct notify_before_flush_cb_args *) _args; + struct st_context_iface *st = args->ctx->st; + struct pipe_context *pipe = st->pipe; + + if (args->drawable->stvis.samples > 1 && + (args->reason == __DRI2_THROTTLE_SWAPBUFFER || + args->reason == __DRI2_THROTTLE_COPYSUBBUFFER)) { + /* Resolve the MSAA back buffer. */ + dri_pipe_blit(st->pipe, + args->drawable->textures[ST_ATTACHMENT_BACK_LEFT], + args->drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT]); + + if (args->reason == __DRI2_THROTTLE_SWAPBUFFER && + args->drawable->msaa_textures[ST_ATTACHMENT_FRONT_LEFT] && + args->drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT]) { + args->swap_msaa_buffers = true; + } + + /* FRONT_LEFT is resolved in drawable->flush_frontbuffer. */ + } + + dri_postprocessing(args->ctx, args->drawable, ST_ATTACHMENT_BACK_LEFT); + + if (pipe->invalidate_resource && + (args->flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) { + if (args->drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, args->drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); + if (args->drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]) + pipe->invalidate_resource(pipe, args->drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); + } + + if (args->ctx->hud) { + hud_run(args->ctx->hud, args->ctx->st->cso_context, + args->drawable->textures[ST_ATTACHMENT_BACK_LEFT]); + } + + pipe->flush_resource(pipe, args->drawable->textures[ST_ATTACHMENT_BACK_LEFT]); +} + /** * DRI2 flush extension, the flush_with_flags function. * @@ -493,7 +472,7 @@ struct dri_drawable *drawable = dri_drawable(dPriv); struct st_context_iface *st; unsigned flush_flags; - bool swap_msaa_buffers = false; + struct notify_before_flush_cb_args args = { 0 }; if (!ctx) { assert(0); @@ -515,42 +494,18 @@ flags &= ~__DRI2_FLUSH_DRAWABLE; } - /* Flush the drawable. */ if ((flags & __DRI2_FLUSH_DRAWABLE) && drawable->textures[ST_ATTACHMENT_BACK_LEFT]) { - struct pipe_context *pipe = st->pipe; - - if (drawable->stvis.samples > 1 && - reason == __DRI2_THROTTLE_SWAPBUFFER) { - /* Resolve the MSAA back buffer. */ - dri_pipe_blit(st->pipe, - drawable->textures[ST_ATTACHMENT_BACK_LEFT], - drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT]); - - if (drawable->msaa_textures[ST_ATTACHMENT_FRONT_LEFT] && - drawable->msaa_textures[ST_ATTACHMENT_BACK_LEFT]) { - swap_msaa_buffers = true; - } - - /* FRONT_LEFT is resolved in drawable->flush_frontbuffer. */ - } - - dri_postprocessing(ctx, drawable, ST_ATTACHMENT_BACK_LEFT); - - if (pipe->invalidate_resource && - (flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) { - if (drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]) - pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); - if (drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]) - pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]); - } - - if (ctx->hud) { - hud_run(ctx->hud, ctx->st->cso_context, - drawable->textures[ST_ATTACHMENT_BACK_LEFT]); - } - - pipe->flush_resource(pipe, drawable->textures[ST_ATTACHMENT_BACK_LEFT]); + /* We can't do operations on the back buffer here, because there + * may be some pending operations that will get flushed by the + * call to st->flush (eg: FLUSH_VERTICES). + * Instead we register a callback to be notified when all operations + * have been submitted but before the call to st_flush. + */ + args.ctx = ctx; + args.drawable = drawable; + args.flags = flags; + args.reason = reason; } flush_flags = 0; @@ -560,38 +515,25 @@ flush_flags |= ST_FLUSH_END_OF_FRAME; /* Flush the context and throttle if needed. */ - if (dri_screen(ctx->sPriv)->default_throttle_frames && + if (dri_screen(ctx->sPriv)->throttle && drawable && (reason == __DRI2_THROTTLE_SWAPBUFFER || reason == __DRI2_THROTTLE_FLUSHFRONT)) { - /* Throttle. - * - * This pulls a fence off the throttling queue and waits for it if the - * number of fences on the throttling queue has reached the desired - * number. - * - * Then flushes to insert a fence at the current rendering position, and - * pushes that fence on the queue. This requires that the st_context_iface - * flush method returns a fence even if there are no commands to flush. - */ - struct pipe_screen *screen = drawable->screen->base.screen; - struct pipe_fence_handle *oldest_fence, *new_fence = NULL; - st->flush(st, flush_flags, &new_fence); + struct pipe_screen *screen = drawable->screen->base.screen; + struct pipe_fence_handle *new_fence = NULL; - oldest_fence = swap_fences_pop_front(drawable); - if (oldest_fence) { - screen->fence_finish(screen, NULL, oldest_fence, PIPE_TIMEOUT_INFINITE); - screen->fence_reference(screen, &oldest_fence, NULL); - } + st->flush(st, flush_flags, &new_fence, args.ctx ? notify_before_flush_cb : NULL, &args); - if (new_fence) { - swap_fences_push_back(drawable, new_fence); - screen->fence_reference(screen, &new_fence, NULL); + /* throttle on the previous fence */ + if (drawable->throttle_fence) { + screen->fence_finish(screen, NULL, drawable->throttle_fence, PIPE_TIMEOUT_INFINITE); + screen->fence_reference(screen, &drawable->throttle_fence, NULL); } + drawable->throttle_fence = new_fence; } else if (flags & (__DRI2_FLUSH_DRAWABLE | __DRI2_FLUSH_CONTEXT)) { - st->flush(st, flush_flags, NULL); + st->flush(st, flush_flags, NULL, args.ctx ? notify_before_flush_cb : NULL, &args); } if (drawable) { @@ -602,7 +544,7 @@ * from the front buffer after SwapBuffers returns what was * in the back buffer. */ - if (swap_msaa_buffers) { + if (args.swap_msaa_buffers) { struct pipe_resource *tmp = drawable->msaa_textures[ST_ATTACHMENT_FRONT_LEFT]; diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_drawable.h mesa-20.0.8/src/gallium/state_trackers/dri/dri_drawable.h --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_drawable.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_drawable.h 2020-06-12 01:21:17.000000000 +0000 @@ -36,10 +36,6 @@ struct st_framebuffer; struct dri_context; -#define DRI_SWAP_FENCES_MAX 4 -#define DRI_SWAP_FENCES_MASK 3 -#define DRI_SWAP_FENCES_DEFAULT 1 - struct dri_drawable { struct st_framebuffer_iface base; @@ -63,11 +59,7 @@ struct pipe_resource *msaa_textures[ST_ATTACHMENT_COUNT]; unsigned int texture_mask, texture_stamp; - struct pipe_fence_handle *swap_fences[DRI_SWAP_FENCES_MAX]; - unsigned int cur_fences; - unsigned int head; - unsigned int tail; - unsigned int desired_fences; + struct pipe_fence_handle *throttle_fence; bool flushing; /* prevents recursion in dri_flush */ /* used only by DRISW */ diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_helpers.c mesa-20.0.8/src/gallium/state_trackers/dri/dri_helpers.c --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_helpers.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_helpers.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,6 +21,7 @@ */ #include +#include "drm-uapi/drm_fourcc.h" #include "util/u_memory.h" #include "pipe/p_screen.h" #include "state_tracker/st_texture.h" @@ -96,7 +97,7 @@ if (!fence) return NULL; - stapi->flush(stapi, 0, &fence->pipe_fence); + stapi->flush(stapi, 0, &fence->pipe_fence, NULL, NULL); if (!fence->pipe_fence) { FREE(fence); @@ -116,7 +117,7 @@ if (fd == -1) { /* exporting driver created fence, flush: */ - stapi->flush(stapi, ST_FLUSH_FENCE_FD, &fence->pipe_fence); + stapi->flush(stapi, ST_FLUSH_FENCE_FD, &fence->pipe_fence, NULL, NULL); } else { /* importing a foreign fence fd: */ ctx->create_fence_fd(ctx, &fence->pipe_fence, fd, PIPE_FD_TYPE_NATIVE_SYNC); @@ -380,131 +381,137 @@ } static const struct dri2_format_mapping dri2_format_table[] = { - { __DRI_IMAGE_FOURCC_ARGB2101010, __DRI_IMAGE_FORMAT_ARGB2101010, + { DRM_FORMAT_ABGR16161616F, __DRI_IMAGE_FORMAT_ABGR16161616F, + __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_R16G16B16A16_FLOAT, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR16161616F, 4 } } }, + { DRM_FORMAT_XBGR16161616F, __DRI_IMAGE_FORMAT_XBGR16161616F, + __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_R16G16B16X16_FLOAT, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR16161616F, 4 } } }, + { DRM_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ARGB2101010, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_B10G10R10A2_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_XRGB2101010, __DRI_IMAGE_FORMAT_XRGB2101010, + { DRM_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XRGB2101010, __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_B10G10R10X2_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_ABGR2101010, __DRI_IMAGE_FORMAT_ABGR2101010, + { DRM_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ABGR2101010, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_R10G10B10A2_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_XBGR2101010, __DRI_IMAGE_FORMAT_XBGR2101010, + { DRM_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XBGR2101010, __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_R10G10B10X2_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_ARGB8888, __DRI_IMAGE_FORMAT_ARGB8888, + { DRM_FORMAT_ARGB8888, __DRI_IMAGE_FORMAT_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_BGRA8888_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, - { __DRI_IMAGE_FOURCC_ABGR8888, __DRI_IMAGE_FORMAT_ABGR8888, + { DRM_FORMAT_ABGR8888, __DRI_IMAGE_FORMAT_ABGR8888, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_RGBA8888_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }, { __DRI_IMAGE_FOURCC_SARGB8888, __DRI_IMAGE_FORMAT_SARGB8, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_BGRA8888_SRGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_SARGB8, 4 } } }, - { __DRI_IMAGE_FOURCC_XRGB8888, __DRI_IMAGE_FORMAT_XRGB8888, + { DRM_FORMAT_XRGB8888, __DRI_IMAGE_FORMAT_XRGB8888, __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_BGRX8888_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB8888, 4 }, } }, - { __DRI_IMAGE_FOURCC_XBGR8888, __DRI_IMAGE_FORMAT_XBGR8888, + { DRM_FORMAT_XBGR8888, __DRI_IMAGE_FORMAT_XBGR8888, __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_RGBX8888_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR8888, 4 }, } }, - { __DRI_IMAGE_FOURCC_ARGB1555, __DRI_IMAGE_FORMAT_ARGB1555, + { DRM_FORMAT_ARGB1555, __DRI_IMAGE_FORMAT_ARGB1555, __DRI_IMAGE_COMPONENTS_RGBA, PIPE_FORMAT_B5G5R5A1_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB1555, 2 } } }, - { __DRI_IMAGE_FOURCC_RGB565, __DRI_IMAGE_FORMAT_RGB565, + { DRM_FORMAT_RGB565, __DRI_IMAGE_FORMAT_RGB565, __DRI_IMAGE_COMPONENTS_RGB, PIPE_FORMAT_B5G6R5_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_RGB565, 2 } } }, - { __DRI_IMAGE_FOURCC_R8, __DRI_IMAGE_FORMAT_R8, + { DRM_FORMAT_R8, __DRI_IMAGE_FORMAT_R8, __DRI_IMAGE_COMPONENTS_R, PIPE_FORMAT_R8_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, } }, - { __DRI_IMAGE_FOURCC_R16, __DRI_IMAGE_FORMAT_R16, + { DRM_FORMAT_R16, __DRI_IMAGE_FORMAT_R16, __DRI_IMAGE_COMPONENTS_R, PIPE_FORMAT_R16_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 1 }, } }, - { __DRI_IMAGE_FOURCC_GR88, __DRI_IMAGE_FORMAT_GR88, + { DRM_FORMAT_GR88, __DRI_IMAGE_FORMAT_GR88, __DRI_IMAGE_COMPONENTS_RG, PIPE_FORMAT_RG88_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, } }, - { __DRI_IMAGE_FOURCC_GR1616, __DRI_IMAGE_FORMAT_GR1616, + { DRM_FORMAT_GR1616, __DRI_IMAGE_FORMAT_GR1616, __DRI_IMAGE_COMPONENTS_RG, PIPE_FORMAT_RG1616_UNORM, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR1616, 2 }, } }, - { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUV410, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV411, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUV411, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV420, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUV420, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV422, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUV422, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV444, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUV444, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU410, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YVU410, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU411, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YVU411, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU420, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YVU420, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU422, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YVU422, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU444, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YVU444, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_U_V, PIPE_FORMAT_IYUV, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_NV12, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_NV12, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UV, PIPE_FORMAT_NV12, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, - { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_P010, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UV, PIPE_FORMAT_P016, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_P012, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UV, PIPE_FORMAT_P016, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_P016, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UV, PIPE_FORMAT_P016, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_NV16, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UV, PIPE_FORMAT_NV12, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, - { __DRI_IMAGE_FOURCC_AYUV, __DRI_IMAGE_FORMAT_ABGR8888, + { DRM_FORMAT_AYUV, __DRI_IMAGE_FORMAT_ABGR8888, __DRI_IMAGE_COMPONENTS_AYUV, PIPE_FORMAT_AYUV, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }, - { __DRI_IMAGE_FOURCC_XYUV8888, __DRI_IMAGE_FORMAT_XBGR8888, + { DRM_FORMAT_XYUV8888, __DRI_IMAGE_FORMAT_XBGR8888, __DRI_IMAGE_COMPONENTS_XYUV, PIPE_FORMAT_XYUV, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR8888, 4 } } }, @@ -516,11 +523,11 @@ * V into A. This lets the texture sampler interpolate the Y * components correctly when sampling from plane 0, and interpolate * U and V correctly when sampling from plane 1. */ - { __DRI_IMAGE_FOURCC_YUYV, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_YUYV, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_XUXV, PIPE_FORMAT_YUYV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, - { __DRI_IMAGE_FOURCC_UYVY, __DRI_IMAGE_FORMAT_NONE, + { DRM_FORMAT_UYVY, __DRI_IMAGE_FORMAT_NONE, __DRI_IMAGE_COMPONENTS_Y_UXVX, PIPE_FORMAT_UYVY, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } } @@ -560,6 +567,21 @@ } boolean +dri2_yuv_dma_buf_supported(struct dri_screen *screen, + const struct dri2_format_mapping *map) +{ + struct pipe_screen *pscreen = screen->base.screen; + + for (unsigned i = 0; i < map->nplanes; i++) { + if (!pscreen->is_format_supported(pscreen, + dri2_get_pipe_format_for_dri_format(map->planes[i].dri_format), + screen->target, 0, 0, PIPE_BIND_SAMPLER_VIEW)) + return false; + } + return true; +} + +boolean dri2_query_dma_buf_formats(__DRIscreen *_screen, int max, int *formats, int *count) { @@ -582,7 +604,8 @@ PIPE_BIND_RENDER_TARGET) || pscreen->is_format_supported(pscreen, map->pipe_format, screen->target, 0, 0, - PIPE_BIND_SAMPLER_VIEW)) { + PIPE_BIND_SAMPLER_VIEW) || + dri2_yuv_dma_buf_supported(screen, map)) { if (j < max) formats[j] = map->dri_fourcc; j++; diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_helpers.h mesa-20.0.8/src/gallium/state_trackers/dri/dri_helpers.h --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_helpers.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_helpers.h 2020-06-12 01:21:17.000000000 +0000 @@ -56,6 +56,9 @@ boolean dri2_query_dma_buf_formats(__DRIscreen *_screen, int max, int *formats, int *count); +boolean +dri2_yuv_dma_buf_supported(struct dri_screen *screen, + const struct dri2_format_mapping *map); __DRIimage * dri2_lookup_egl_image(struct dri_screen *screen, void *handle); diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_screen.c mesa-20.0.8/src/gallium/state_trackers/dri/dri_screen.c --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_screen.c 2020-06-12 01:21:17.000000000 +0000 @@ -43,7 +43,7 @@ #include "state_tracker/drm_driver.h" #include "util/u_debug.h" -#include "util/u_format_s3tc.h" +#include "util/format/u_format_s3tc.h" #define MSAA_VISUAL_MAX_SAMPLES 32 @@ -84,6 +84,8 @@ options->allow_higher_compat_version = driQueryOptionb(optionCache, "allow_higher_compat_version"); options->glsl_zero_init = driQueryOptionb(optionCache, "glsl_zero_init"); + options->vs_position_always_invariant = + driQueryOptionb(optionCache, "vs_position_always_invariant"); options->force_glsl_abs_sqrt = driQueryOptionb(optionCache, "force_glsl_abs_sqrt"); options->allow_glsl_cross_stage_interpolation_mismatch = @@ -129,6 +131,8 @@ MESA_FORMAT_B8G8R8A8_SRGB, MESA_FORMAT_B8G8R8X8_SRGB, MESA_FORMAT_B5G6R5_UNORM, + MESA_FORMAT_RGBA_FLOAT16, + MESA_FORMAT_RGBX_FLOAT16, /* The 32-bit RGBA format must not precede the 32-bit BGRA format. * Likewise for RGBX and BGRX. Otherwise, the GLX client and the GLX @@ -161,6 +165,8 @@ PIPE_FORMAT_BGRA8888_SRGB, PIPE_FORMAT_BGRX8888_SRGB, PIPE_FORMAT_B5G6R5_UNORM, + PIPE_FORMAT_R16G16B16A16_FLOAT, + PIPE_FORMAT_R16G16B16X16_FLOAT, PIPE_FORMAT_RGBA8888_UNORM, PIPE_FORMAT_RGBX8888_UNORM, }; @@ -174,7 +180,9 @@ struct pipe_screen *p_screen = screen->base.screen; bool pf_z16, pf_x8z24, pf_z24x8, pf_s8z24, pf_z24s8, pf_z32; bool mixed_color_depth; + bool allow_rgba_ordering; bool allow_rgb10; + bool allow_fp16; static const GLenum back_buffer_modes[] = { __DRI_ATTRIB_SWAP_NONE, __DRI_ATTRIB_SWAP_UNDEFINED, @@ -191,7 +199,10 @@ depth_buffer_factor = 1; } + allow_rgba_ordering = dri_loader_get_cap(screen, DRI_LOADER_CAP_RGBA_ORDERING); allow_rgb10 = driQueryOptionb(&screen->dev->option_cache, "allow_rgb10_configs"); + allow_fp16 = driQueryOptionb(&screen->dev->option_cache, "allow_fp16_configs"); + allow_fp16 &= dri_loader_get_cap(screen, DRI_LOADER_CAP_FP16); msaa_samples_max = (screen->st_api->feature_mask & ST_API_FEATURE_MS_VISUALS_MASK) ? MSAA_VISUAL_MAX_SAMPLES : 1; @@ -239,19 +250,18 @@ assert(ARRAY_SIZE(mesa_formats) == ARRAY_SIZE(pipe_formats)); - /* Expose only BGRA ordering if the loader doesn't support RGBA ordering. */ - unsigned num_formats; - if (dri_loader_get_cap(screen, DRI_LOADER_CAP_RGBA_ORDERING)) - num_formats = ARRAY_SIZE(mesa_formats); - else - num_formats = ARRAY_SIZE(mesa_formats) - 2; /* all - RGBA_ORDERING formats */ - /* Add configs. */ - for (format = 0; format < num_formats; format++) { + for (format = 0; format < ARRAY_SIZE(mesa_formats); format++) { __DRIconfig **new_configs = NULL; unsigned num_msaa_modes = 0; /* includes a single-sample mode */ uint8_t msaa_modes[MSAA_VISUAL_MAX_SAMPLES]; + /* Expose only BGRA ordering if the loader doesn't support RGBA ordering. */ + if (!allow_rgba_ordering && + (mesa_formats[format] == MESA_FORMAT_R8G8B8A8_UNORM || + mesa_formats[format] == MESA_FORMAT_R8G8B8X8_UNORM)) + continue; + if (!allow_rgb10 && (mesa_formats[format] == MESA_FORMAT_B10G10R10A2_UNORM || mesa_formats[format] == MESA_FORMAT_B10G10R10X2_UNORM || @@ -259,6 +269,11 @@ mesa_formats[format] == MESA_FORMAT_R10G10B10X2_UNORM)) continue; + if (!allow_fp16 && + (mesa_formats[format] == MESA_FORMAT_RGBA_FLOAT16 || + mesa_formats[format] == MESA_FORMAT_RGBX_FLOAT16)) + continue; + if (!p_screen->is_format_supported(p_screen, pipe_formats[format], PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_RENDER_TARGET | @@ -323,6 +338,17 @@ /* Deduce the color format. */ switch (mode->redMask) { + case 0: + /* Formats > 32 bpp */ + assert(mode->floatMode); + if (mode->alphaShift > -1) { + assert(mode->alphaShift == 48); + stvis->color_format = PIPE_FORMAT_R16G16B16A16_FLOAT; + } else { + stvis->color_format = PIPE_FORMAT_R16G16B16X16_FLOAT; + } + break; + case 0x3FF00000: if (mode->alphaMask) { assert(mode->alphaMask == 0xC0000000); @@ -404,7 +430,7 @@ break; } - stvis->accum_format = (mode->haveAccumBuffer) ? + stvis->accum_format = (mode->accumRedBits > 0) ? PIPE_FORMAT_R16G16B16A16_SNORM : PIPE_FORMAT_NONE; stvis->buffer_mask |= ST_ATTACHMENT_FRONT_LEFT_MASK; @@ -419,7 +445,7 @@ stvis->buffer_mask |= ST_ATTACHMENT_BACK_RIGHT_MASK; } - if (mode->haveDepthBuffer || mode->haveStencilBuffer) + if (mode->depthBits > 0 || mode->stencilBits > 0) stvis->buffer_mask |= ST_ATTACHMENT_DEPTH_STENCIL_MASK; /* let the state tracker allocate the accum buffer */ } @@ -447,6 +473,14 @@ stimg->level = img->level; stimg->layer = img->layer; + if (img->imported_dmabuf && map) { + /* Guess sized internal format for dma-bufs. Could be used + * by EXT_EGL_image_storage. + */ + mesa_format mesa_format = driImageFormatToGLFormat(map->dri_format); + stimg->internalformat = driGLFormatToSizedInternalGLFormat(mesa_format); + } + return TRUE; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/dri_screen.h mesa-20.0.8/src/gallium/state_trackers/dri/dri_screen.h --- mesa-19.2.8/src/gallium/state_trackers/dri/dri_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/dri_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -57,7 +57,7 @@ /* dri */ __DRIscreen *sPriv; - unsigned default_throttle_frames; + boolean throttle; struct st_config_options options; @@ -109,6 +109,7 @@ void *loader_private; + boolean imported_dmabuf; /** * Provided by EGL_EXT_image_dma_buf_import. */ diff -Nru mesa-19.2.8/src/gallium/state_trackers/dri/drisw.c mesa-20.0.8/src/gallium/state_trackers/dri/drisw.c --- mesa-19.2.8/src/gallium/state_trackers/dri/drisw.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/dri/drisw.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ * **************************************************************************/ -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_box.h" @@ -138,6 +138,9 @@ if (!res->screen->resource_get_handle(res->screen, NULL, res, &whandle, PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)) return FALSE; + if (loader->base.version > 5 && loader->getImageShm2) + return loader->getImageShm2(dPriv, x, y, width, height, whandle.handle, dPriv->loaderPrivate); + loader->getImageShm(dPriv, x, y, width, height, whandle.handle, dPriv->loaderPrivate); return TRUE; } @@ -249,7 +252,7 @@ if (ctx->hud) hud_run(ctx->hud, ctx->st->cso_context, ptex); - ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL); + ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL, NULL, NULL); drisw_copy_to_front(dPriv, ptex); } @@ -272,7 +275,7 @@ if (ctx->pp && drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]) pp_run(ctx->pp, ptex, ptex, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]); - ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL); + ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL, NULL, NULL); u_box_2d(x, dPriv->h - y - h, w, h, &box); drisw_present_texture(dPriv, ptex, &box); diff -Nru mesa-19.2.8/src/gallium/state_trackers/glx/xlib/glx_api.c mesa-20.0.8/src/gallium/state_trackers/glx/xlib/glx_api.c --- mesa-19.2.8/src/gallium/state_trackers/glx/xlib/glx_api.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/glx/xlib/glx_api.c 2020-06-12 01:21:17.000000000 +0000 @@ -191,6 +191,9 @@ GLint i; GLboolean comparePointers; + if (!rgbFlag) + return NULL; + if (dbFlag) { /* Check if the MESA_BACK_BUFFER env var is set */ char *backbuffer = getenv("MESA_BACK_BUFFER"); @@ -234,7 +237,6 @@ && v->mesa_visual.numAuxBuffers == numAuxBuffers && v->mesa_visual.samples == num_samples && v->ximage_flag == ximageFlag - && v->mesa_visual.rgbMode == rgbFlag && v->mesa_visual.doubleBufferMode == dbFlag && v->mesa_visual.stereoMode == stereoFlag && (v->mesa_visual.alphaBits > 0) == alphaFlag @@ -1172,20 +1174,13 @@ } -/* XXX these may have to be removed due to thread-safety issues. */ -static GLXContext MakeCurrent_PrevContext = 0; -static GLXDrawable MakeCurrent_PrevDrawable = 0; -static GLXDrawable MakeCurrent_PrevReadable = 0; -static XMesaBuffer MakeCurrent_PrevDrawBuffer = 0; -static XMesaBuffer MakeCurrent_PrevReadBuffer = 0; - - /* GLX 1.3 and later */ PUBLIC Bool glXMakeContextCurrent( Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx ) { GLXContext glxCtx = ctx; + GLXContext current = GetCurrentContext(); static boolean firsttime = 1, no_rast = 0; if (firsttime) { @@ -1193,58 +1188,43 @@ firsttime = 0; } - if (ctx && draw && read) { - XMesaBuffer drawBuffer, readBuffer; + if (ctx) { + XMesaBuffer drawBuffer = NULL, readBuffer = NULL; XMesaContext xmctx = glxCtx->xmesaContext; - /* Find the XMesaBuffer which corresponds to the GLXDrawable 'draw' */ - if (ctx == MakeCurrent_PrevContext - && draw == MakeCurrent_PrevDrawable) { - drawBuffer = MakeCurrent_PrevDrawBuffer; - } - else { + /* either both must be null, or both must be non-null */ + if (!draw != !read) + return False; + + if (draw) { + /* Find the XMesaBuffer which corresponds to 'draw' */ drawBuffer = XMesaFindBuffer( dpy, draw ); - } - if (!drawBuffer) { - /* drawable must be a new window! */ - drawBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, draw ); if (!drawBuffer) { - /* Out of memory, or context/drawable depth mismatch */ - return False; + /* drawable must be a new window! */ + drawBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, draw ); + if (!drawBuffer) { + /* Out of memory, or context/drawable depth mismatch */ + return False; + } } } - /* Find the XMesaBuffer which corresponds to the GLXDrawable 'read' */ - if (ctx == MakeCurrent_PrevContext - && read == MakeCurrent_PrevReadable) { - readBuffer = MakeCurrent_PrevReadBuffer; - } - else { + if (read) { + /* Find the XMesaBuffer which corresponds to 'read' */ readBuffer = XMesaFindBuffer( dpy, read ); - } - if (!readBuffer) { - /* drawable must be a new window! */ - readBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, read ); if (!readBuffer) { - /* Out of memory, or context/drawable depth mismatch */ - return False; + /* drawable must be a new window! */ + readBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, read ); + if (!readBuffer) { + /* Out of memory, or context/drawable depth mismatch */ + return False; + } } } - if (no_rast && - MakeCurrent_PrevContext == ctx && - MakeCurrent_PrevDrawable == draw && - MakeCurrent_PrevReadable == read && - MakeCurrent_PrevDrawBuffer == drawBuffer && - MakeCurrent_PrevReadBuffer == readBuffer) + if (no_rast && current == ctx) return True; - MakeCurrent_PrevContext = ctx; - MakeCurrent_PrevDrawable = draw; - MakeCurrent_PrevReadable = read; - MakeCurrent_PrevDrawBuffer = drawBuffer; - MakeCurrent_PrevReadBuffer = readBuffer; - /* Now make current! */ if (XMesaMakeCurrent2(xmctx, drawBuffer, readBuffer)) { ctx->currentDpy = dpy; @@ -1260,18 +1240,11 @@ else if (!ctx && !draw && !read) { /* release current context w/out assigning new one. */ XMesaMakeCurrent2( NULL, NULL, NULL ); - MakeCurrent_PrevContext = 0; - MakeCurrent_PrevDrawable = 0; - MakeCurrent_PrevReadable = 0; - MakeCurrent_PrevDrawBuffer = 0; - MakeCurrent_PrevReadBuffer = 0; SetCurrentContext(NULL); return True; } else { - /* The args must either all be non-zero or all zero. - * This is an error. - */ + /* We were given an invalid set of arguments */ return False; } } @@ -1399,7 +1372,7 @@ XMesaContext xm_src = src->xmesaContext; XMesaContext xm_dst = dst->xmesaContext; (void) dpy; - if (MakeCurrent_PrevContext == src) { + if (GetCurrentContext() == src) { glFlush(); } XMesaCopyContext(xm_src, xm_dst, mask); @@ -1427,11 +1400,6 @@ if (ctx) { GLXContext glxCtx = ctx; (void) dpy; - MakeCurrent_PrevContext = 0; - MakeCurrent_PrevDrawable = 0; - MakeCurrent_PrevReadable = 0; - MakeCurrent_PrevDrawBuffer = 0; - MakeCurrent_PrevReadBuffer = 0; XMesaDestroyContext( glxCtx->xmesaContext ); XMesaGarbageCollect(); free(glxCtx); @@ -1522,12 +1490,7 @@ case GLX_RGBA: if (fbconfig) return GLX_BAD_ATTRIBUTE; - if (xmvis->mesa_visual.rgbMode) { - *value = True; - } - else { - *value = False; - } + *value = True; return 0; case GLX_DOUBLEBUFFER: *value = (int) xmvis->mesa_visual.doubleBufferMode; @@ -1639,10 +1602,7 @@ case GLX_RENDER_TYPE_SGIX: if (!fbconfig) return GLX_BAD_ATTRIBUTE; - if (xmvis->mesa_visual.rgbMode) - *value = GLX_RGBA_BIT; - else - *value = GLX_COLOR_INDEX_BIT; + *value = GLX_RGBA_BIT; break; case GLX_X_RENDERABLE_SGIX: if (!fbconfig) @@ -2222,10 +2182,7 @@ *value = xmctx->xm_visual->visinfo->visualid; break; case GLX_RENDER_TYPE: - if (xmctx->xm_visual->mesa_visual.rgbMode) - *value = GLX_RGBA_TYPE; - else - *value = GLX_COLOR_INDEX_TYPE; + *value = GLX_RGBA_TYPE; break; case GLX_SCREEN: *value = 0; @@ -2495,7 +2452,7 @@ } -PUBLIC int +PUBLIC void glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value) { @@ -2503,7 +2460,7 @@ if (!xmbuf) { /* Generate GLXBadPbufferSGIX for bad pbuffer */ - return 0; + return; } switch (attribute) { @@ -2525,7 +2482,6 @@ default: *value = 0; } - return 0; } @@ -2654,7 +2610,7 @@ PUBLIC Status glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, - long *pTransparent) + unsigned long *pTransparent) { (void) dpy; (void) overlay; diff -Nru mesa-19.2.8/src/gallium/state_trackers/glx/xlib/xm_api.c mesa-20.0.8/src/gallium/state_trackers/glx/xlib/xm_api.c --- mesa-19.2.8/src/gallium/state_trackers/glx/xlib/xm_api.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/glx/xlib/xm_api.c 2020-06-12 01:21:17.000000000 +0000 @@ -636,15 +636,13 @@ * initializing the context's visual and buffer information. * \param v the XMesaVisual to initialize * \param b the XMesaBuffer to initialize (may be NULL) - * \param rgb_flag TRUE = RGBA mode, FALSE = color index mode * \param window the window/pixmap we're rendering into * \param cmap the colormap associated with the window/pixmap * \return GL_TRUE=success, GL_FALSE=failure */ static GLboolean initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b, - GLboolean rgb_flag, Drawable window, - Colormap cmap) + Drawable window, Colormap cmap) { assert(!b || b->xm_visual == v); @@ -652,29 +650,22 @@ v->BitsPerPixel = bits_per_pixel(v); assert(v->BitsPerPixel > 0); - if (rgb_flag == GL_FALSE) { - /* COLOR-INDEXED WINDOW: not supported*/ + /* RGB WINDOW: + * We support RGB rendering into almost any kind of visual. + */ + const int xclass = v->visualType; + if (xclass != GLX_TRUE_COLOR && xclass != GLX_DIRECT_COLOR) { + _mesa_warning(NULL, + "XMesa: RGB mode rendering not supported in given visual.\n"); return GL_FALSE; } - else { - /* RGB WINDOW: - * We support RGB rendering into almost any kind of visual. - */ - const int xclass = v->visualType; - if (xclass != GLX_TRUE_COLOR && xclass == !GLX_DIRECT_COLOR) { - _mesa_warning(NULL, - "XMesa: RGB mode rendering not supported in given visual.\n"); - return GL_FALSE; - } - v->mesa_visual.indexBits = 0; - if (v->BitsPerPixel == 32) { - /* We use XImages for all front/back buffers. If an X Window or - * X Pixmap is 32bpp, there's no guarantee that the alpha channel - * will be preserved. For XImages we're in luck. - */ - v->mesa_visual.alphaBits = 8; - } + if (v->BitsPerPixel == 32) { + /* We use XImages for all front/back buffers. If an X Window or + * X Pixmap is 32bpp, there's no guarantee that the alpha channel + * will be preserved. For XImages we're in luck. + */ + v->mesa_visual.alphaBits = 8; } /* @@ -775,6 +766,9 @@ if (!xmdpy) return NULL; + if (!rgb_flag) + return NULL; + /* For debugging only */ if (getenv("MESA_XSYNC")) { /* This makes debugging X easier. @@ -821,7 +815,7 @@ if (alpha_flag) v->mesa_visual.alphaBits = 8; - (void) initialize_visual_and_buffer( v, NULL, rgb_flag, 0, 0 ); + (void) initialize_visual_and_buffer( v, NULL, 0, 0 ); { const int xclass = v->visualType; @@ -849,7 +843,6 @@ { struct gl_config *vis = &v->mesa_visual; - vis->rgbMode = GL_TRUE; vis->doubleBufferMode = db_flag; vis->stereoMode = stereo_flag; @@ -859,7 +852,6 @@ vis->alphaBits = alpha_bits; vis->rgbBits = red_bits + green_bits + blue_bits; - vis->indexBits = 0; vis->depthBits = depth_size; vis->stencilBits = stencil_size; @@ -868,10 +860,6 @@ vis->accumBlueBits = accum_blue_size; vis->accumAlphaBits = accum_alpha_size; - vis->haveAccumBuffer = accum_red_size > 0; - vis->haveDepthBuffer = depth_size > 0; - vis->haveStencilBuffer = stencil_size > 0; - vis->numAuxBuffers = 0; vis->level = 0; vis->sampleBuffers = num_samples > 1; @@ -1115,8 +1103,7 @@ if (!b) return NULL; - if (!initialize_visual_and_buffer( v, b, v->mesa_visual.rgbMode, - (Drawable) w, cmap )) { + if (!initialize_visual_and_buffer( v, b, (Drawable) w, cmap )) { xmesa_free_buffer(b); return NULL; } @@ -1146,8 +1133,7 @@ if (!b) return NULL; - if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, - (Drawable) p, cmap)) { + if (!initialize_visual_and_buffer(v, b, (Drawable) p, cmap)) { xmesa_free_buffer(b); return NULL; } @@ -1205,8 +1191,7 @@ b->TextureFormat = format; b->TextureMipmap = mipmap; - if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, - (Drawable) p, cmap)) { + if (!initialize_visual_and_buffer(v, b, (Drawable) p, cmap)) { xmesa_free_buffer(b); return NULL; } @@ -1235,8 +1220,7 @@ if (!b) return NULL; - if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, - drawable, cmap)) { + if (!initialize_visual_and_buffer(v, b, drawable, cmap)) { xmesa_free_buffer(b); return NULL; } @@ -1274,6 +1258,9 @@ { GLuint old_width, old_height; + if (!b) + return; + if (b->type == PBUFFER) return; @@ -1303,8 +1290,9 @@ } if (c) { - if (!drawBuffer || !readBuffer) - return GL_FALSE; /* must specify buffers! */ + if (!drawBuffer != !readBuffer) { + return GL_FALSE; /* must specify zero or two buffers! */ + } if (c == old_ctx && c->xm_buffer == drawBuffer && @@ -1318,10 +1306,13 @@ c->xm_buffer = drawBuffer; c->xm_read_buffer = readBuffer; - stapi->make_current(stapi, c->st, drawBuffer->stfb, readBuffer->stfb); + stapi->make_current(stapi, c->st, + drawBuffer ? drawBuffer->stfb : NULL, + readBuffer ? readBuffer->stfb : NULL); /* Solution to Stephane Rehel's problem with glXReleaseBuffersMESA(): */ - drawBuffer->wasCurrent = GL_TRUE; + if (drawBuffer) + drawBuffer->wasCurrent = GL_TRUE; } else { /* Detach */ @@ -1367,7 +1358,7 @@ } if (xmctx && xmctx->xm_buffer == b) { - xmctx->st->flush( xmctx->st, ST_FLUSH_FRONT, NULL); + xmctx->st->flush( xmctx->st, ST_FLUSH_FRONT, NULL, NULL, NULL); } xmesa_swap_st_framebuffer(b->stfb); @@ -1382,7 +1373,7 @@ { XMesaContext xmctx = XMesaGetCurrentContext(); - xmctx->st->flush( xmctx->st, ST_FLUSH_FRONT, NULL); + xmctx->st->flush( xmctx->st, ST_FLUSH_FRONT, NULL, NULL, NULL); xmesa_copy_st_framebuffer(b->stfb, ST_ATTACHMENT_BACK_LEFT, ST_ATTACHMENT_FRONT_LEFT, @@ -1397,7 +1388,7 @@ XMesaDisplay xmdpy = xmesa_init_display(c->xm_visual->display); struct pipe_fence_handle *fence = NULL; - c->st->flush(c->st, ST_FLUSH_FRONT, &fence); + c->st->flush(c->st, ST_FLUSH_FRONT, &fence, NULL, NULL); if (fence) { xmdpy->screen->fence_finish(xmdpy->screen, NULL, fence, PIPE_TIMEOUT_INFINITE); diff -Nru mesa-19.2.8/src/gallium/state_trackers/hgl/hgl.c mesa-20.0.8/src/gallium/state_trackers/hgl/hgl.c --- mesa-19.2.8/src/gallium/state_trackers/hgl/hgl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/hgl/hgl.c 2020-06-12 01:21:17.000000000 +0000 @@ -13,7 +13,7 @@ #include "pipe/p_format.h" #include "util/u_atomic.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "state_tracker/st_gl_api.h" /* for st_gl_api_create */ diff -Nru mesa-19.2.8/src/gallium/state_trackers/hgl/hgl_context.h mesa-20.0.8/src/gallium/state_trackers/hgl/hgl_context.h --- mesa-19.2.8/src/gallium/state_trackers/hgl/hgl_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/hgl/hgl_context.h 2020-06-12 01:21:17.000000000 +0000 @@ -9,11 +9,13 @@ #define HGL_CONTEXT_H -#include "state_tracker/st_api.h" -#include "state_tracker/st_manager.h" +#include "pipe/p_format.h" #include "pipe/p_compiler.h" #include "pipe/p_screen.h" #include "postprocess/filters.h" + +#include "state_tracker/st_api.h" +#include "state_tracker/st_manager.h" #include "os/os_thread.h" #include "bitmap_wrapper.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/adapter9.c mesa-20.0.8/src/gallium/state_trackers/nine/adapter9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/adapter9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/adapter9.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ #include "nine_pipe.h" #include "nine_dump.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_dump.h" #include "pipe/p_screen.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/basetexture9.c mesa-20.0.8/src/gallium/state_trackers/nine/basetexture9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/basetexture9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/basetexture9.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ #include "nine_dump.h" #endif -#include "util/u_format.h" +#include "util/format/u_format.h" #define DBG_CHANNEL DBG_BASETEXTURE @@ -122,7 +122,7 @@ 0 : This->base.info.last_level; This->managed.lod = MIN2(LODNew, max_level); - if (This->managed.lod != old && This->bind_count && LIST_IS_EMPTY(&This->list)) + if (This->managed.lod != old && This->bind_count && list_is_empty(&This->list)) list_add(&This->list, &This->base.base.device->update_textures); return old; diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/basetexture9.h mesa-20.0.8/src/gallium/state_trackers/nine/basetexture9.h --- mesa-19.2.8/src/gallium/state_trackers/nine/basetexture9.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/basetexture9.h 2020-06-12 01:21:17.000000000 +0000 @@ -139,7 +139,7 @@ struct NineBaseTexture9 *old = *slot; if (tex) { - if ((tex->managed.dirty | tex->dirty_mip) && LIST_IS_EMPTY(&tex->list)) + if ((tex->managed.dirty | tex->dirty_mip) && list_is_empty(&tex->list)) list_add(&tex->list, &device->update_textures); tex->bind_count++; @@ -163,7 +163,7 @@ #define BASETEX_REGISTER_UPDATE(t) do { \ if (((t)->managed.dirty | ((t)->dirty_mip)) && (t)->bind_count) \ - if (LIST_IS_EMPTY(&(t)->list)) \ + if (list_is_empty(&(t)->list)) \ list_add(&(t)->list, &(t)->base.base.device->update_textures); \ } while(0) diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/buffer9.c mesa-20.0.8/src/gallium/state_trackers/nine/buffer9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/buffer9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/buffer9.c 2020-06-12 01:21:17.000000000 +0000 @@ -246,7 +246,7 @@ /* Tests on Win: READONLY doesn't wait for the upload */ if (!(Flags & D3DLOCK_READONLY)) { if (!This->managed.dirty) { - assert(LIST_IS_EMPTY(&This->managed.list)); + assert(list_is_empty(&This->managed.list)); This->managed.dirty = TRUE; This->managed.dirty_box = box; if (p_atomic_read(&This->managed.pending_upload)) diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/buffer9.h mesa-20.0.8/src/gallium/state_trackers/nine/buffer9.h --- mesa-19.2.8/src/gallium/state_trackers/nine/buffer9.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/buffer9.h 2020-06-12 01:21:17.000000000 +0000 @@ -122,7 +122,7 @@ struct NineBuffer9 *old = *slot; if (buf) { - if ((buf->managed.dirty) && LIST_IS_EMPTY(&buf->managed.list)) + if ((buf->managed.dirty) && list_is_empty(&buf->managed.list)) list_add(&buf->managed.list, &device->update_buffers); buf->bind_count++; } @@ -140,7 +140,7 @@ #define BASEBUF_REGISTER_UPDATE(b) { \ if ((b)->managed.dirty && (b)->bind_count) \ - if (LIST_IS_EMPTY(&(b)->managed.list)) \ + if (list_is_empty(&(b)->managed.list)) \ list_add(&(b)->managed.list, &(b)->base.base.device->update_buffers); \ } diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/device9.c mesa-20.0.8/src/gallium/state_trackers/nine/device9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/device9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/device9.c 2020-06-12 01:21:17.000000000 +0000 @@ -47,7 +47,7 @@ #include "util/u_math.h" #include "util/u_inlines.h" #include "util/u_hash_table.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "util/u_upload_mgr.h" #include "hud/hud_context.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/nine_debug.h mesa-20.0.8/src/gallium/state_trackers/nine/nine_debug.h --- mesa-19.2.8/src/gallium/state_trackers/nine/nine_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/nine_debug.h 2020-06-12 01:21:17.000000000 +0000 @@ -44,15 +44,15 @@ } \ } while(0) #else -#define WARN(fmt, ...) -#define WARN_ONCE(fmt, ...) +#define WARN(fmt, ...) do {} while(0) +#define WARN_ONCE(fmt, ...) do {} while(0) #endif #if defined(DEBUG) || !defined(NDEBUG) #define DBG_FLAG(flag, fmt, ...) \ _nine_debug_printf(flag, __FUNCTION__, fmt, ## __VA_ARGS__) #else -#define DBG_FLAG(flag, fmt, ...) +#define DBG_FLAG(flag, fmt, ...) do {} while(0) #endif #define DBG(fmt, ...) DBG_FLAG(DBG_CHANNEL, fmt, ## __VA_ARGS__) @@ -116,7 +116,7 @@ #define user_warn(x) \ if ((x)) { DBG_FLAG(DBG_USER, "User warning: `%s'\n", #x); } #else -#define user_warn(x) +#define user_warn(x) do {} while(0) #endif /* nonfatal assert */ diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/nine_ff.c mesa-20.0.8/src/gallium/state_trackers/nine/nine_ff.c --- mesa-19.2.8/src/gallium/state_trackers/nine/nine_ff.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/nine_ff.c 2020-06-12 01:21:17.000000000 +0000 @@ -126,7 +126,7 @@ static unsigned nine_ff_vs_key_hash(void *key) { - struct nine_ff_vs_key *vs = key; + const struct nine_ff_vs_key *vs = key; unsigned i; uint32_t hash = vs->value32[0]; for (i = 1; i < ARRAY_SIZE(vs->value32); ++i) @@ -142,7 +142,7 @@ } static unsigned nine_ff_ps_key_hash(void *key) { - struct nine_ff_ps_key *ps = key; + const struct nine_ff_ps_key *ps = key; unsigned i; uint32_t hash = ps->value32[0]; for (i = 1; i < ARRAY_SIZE(ps->value32); ++i) diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/nine_pipe.h mesa-20.0.8/src/gallium/state_trackers/nine/nine_pipe.h --- mesa-19.2.8/src/gallium/state_trackers/nine/nine_pipe.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/nine_pipe.h 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_state.h" /* pipe_box */ #include "util/macros.h" #include "util/u_rect.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "nine_helpers.h" struct cso_context; diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/nine_shader.c mesa-20.0.8/src/gallium/state_trackers/nine/nine_shader.c --- mesa-19.2.8/src/gallium/state_trackers/nine/nine_shader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/nine_shader.c 2020-06-12 01:21:17.000000000 +0000 @@ -2601,7 +2601,7 @@ struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ struct ureg_src sample; const int m = tx->insn.dst[0].idx; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); sample = ureg_DECL_sampler(ureg, m); @@ -2618,7 +2618,7 @@ struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ struct ureg_src sample; const int m = tx->insn.dst[0].idx; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); sample = ureg_DECL_sampler(ureg, m); @@ -2640,7 +2640,7 @@ struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ struct ureg_src sample; const int m = tx->insn.dst[0].idx - 1; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); @@ -2671,7 +2671,7 @@ struct ureg_src sample; struct ureg_dst tmp; const int m = tx->insn.dst[0].idx - 2; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); @@ -2712,7 +2712,7 @@ struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ struct ureg_src sample; const int m = tx->insn.dst[0].idx; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); sample = ureg_DECL_sampler(ureg, m); @@ -2730,7 +2730,7 @@ struct ureg_dst tmp; struct ureg_src sample; const int m = tx->insn.dst[0].idx; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); @@ -2752,7 +2752,7 @@ struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ struct ureg_dst tmp; const int m = tx->insn.dst[0].idx - 1; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); @@ -2784,7 +2784,7 @@ struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]); struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */ const int m = tx->insn.dst[0].idx; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); @@ -2802,7 +2802,7 @@ struct ureg_src sample; struct ureg_dst E, tmp; const int m = tx->insn.dst[0].idx - 2; - const int n = tx->insn.src[0].idx; + ASSERTED const int n = tx->insn.src[0].idx; assert(m >= 0 && m > n); tx_texcoord_alloc(tx, m); diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/vertexdeclaration9.c mesa-20.0.8/src/gallium/state_trackers/nine/vertexdeclaration9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/vertexdeclaration9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/vertexdeclaration9.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_format.h" #include "pipe/p_context.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "translate/translate.h" #define DBG_CHANNEL DBG_VERTEXDECLARATION diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/vertexshader9.c mesa-20.0.8/src/gallium/state_trackers/nine/vertexshader9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/vertexshader9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/vertexshader9.c 2020-06-12 01:21:17.000000000 +0000 @@ -143,7 +143,7 @@ while (var_so && var_so->vdecl) { if (var_so->cso) { - cso_delete_vertex_shader(This->base.device->cso_sw, var_so->cso ); + This->base.device->pipe_sw->delete_vs_state(This->base.device->pipe_sw, var_so->cso); } var_so = var_so->next; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/nine/volume9.c mesa-20.0.8/src/gallium/state_trackers/nine/volume9.c --- mesa-19.2.8/src/gallium/state_trackers/nine/volume9.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/nine/volume9.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "nine_pipe.h" #include "nine_dump.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #define DBG_CHANNEL DBG_VOLUME diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h264.c mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h264.c --- mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h264.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h264.c 2020-06-12 01:21:17.000000000 +0000 @@ -49,7 +49,7 @@ priv->EndFrame = vid_dec_h264_EndFrame; priv->Flush = vid_dec_h264_Flush; - LIST_INITHEAD(&priv->codec_data.h264.dpb_list); + list_inithead(&priv->codec_data.h264.dpb_list); priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX; priv->first_buf_in_frame = true; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h265.c mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h265.c --- mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h265.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_dec_h265.c 2020-06-12 01:21:17.000000000 +0000 @@ -668,7 +668,7 @@ *timestamp = result->timestamp; --priv->codec_data.h265.dpb_num; - LIST_DEL(&result->list); + list_del(&result->list); FREE(result); return buf; @@ -736,7 +736,7 @@ entry->timestamp = priv->timestamp; entry->poc = get_poc(priv); - LIST_ADDTAIL(&entry->list, &priv->codec_data.h265.dpb_list); + list_addtail(&entry->list, &priv->codec_data.h265.dpb_list); ++priv->codec_data.h265.dpb_num; priv->target = NULL; @@ -1002,7 +1002,7 @@ { priv->picture.base.profile = PIPE_VIDEO_PROFILE_HEVC_MAIN; - LIST_INITHEAD(&priv->codec_data.h265.dpb_list); + list_inithead(&priv->codec_data.h265.dpb_list); priv->codec_data.h265.ref_pic_set_list = (struct ref_pic_set *) CALLOC(MAX_NUM_REF_PICS, sizeof(struct ref_pic_set)); diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_enc.c mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_enc.c --- mesa-19.2.8/src/gallium/state_trackers/omx/bellagio/vid_enc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/bellagio/vid_enc.c 2020-06-12 01:21:17.000000000 +0000 @@ -235,10 +235,10 @@ priv->scale.xWidth = OMX_VID_ENC_SCALING_WIDTH_DEFAULT; priv->scale.xHeight = OMX_VID_ENC_SCALING_WIDTH_DEFAULT; - LIST_INITHEAD(&priv->free_tasks); - LIST_INITHEAD(&priv->used_tasks); - LIST_INITHEAD(&priv->b_frames); - LIST_INITHEAD(&priv->stacked_tasks); + list_inithead(&priv->free_tasks); + list_inithead(&priv->used_tasks); + list_inithead(&priv->b_frames); + list_inithead(&priv->stacked_tasks); return OMX_ErrorNone; } @@ -658,7 +658,7 @@ return OMX_ErrorInsufficientResources; } - LIST_INITHEAD(&inp->tasks); + list_inithead(&inp->tasks); FREE((*buf)->pBuffer); r = enc_AllocateBackTexture(port, &inp->resource, &inp->transfer, &(*buf)->pBuffer); @@ -687,7 +687,7 @@ return OMX_ErrorInsufficientResources; } - LIST_INITHEAD(&inp->tasks); + list_inithead(&inp->tasks); return OMX_ErrorNone; } @@ -820,16 +820,16 @@ vid_enc_PrivateType *priv = comp->pComponentPrivate; struct encode_task *task; - if (LIST_IS_EMPTY(&priv->b_frames)) + if (list_is_empty(&priv->b_frames)) return; task = LIST_ENTRY(struct encode_task, priv->b_frames.prev, list); - LIST_DEL(&task->list); + list_del(&task->list); /* promote last from to P frame */ priv->ref_idx_l0 = priv->ref_idx_l1; enc_HandleTask(port, task, PIPE_H264_ENC_PICTURE_TYPE_P); - LIST_ADDTAIL(&task->list, &inp->tasks); + list_addtail(&task->list, &inp->tasks); priv->ref_idx_l1 = priv->frame_num++; /* handle B frames */ @@ -900,20 +900,20 @@ if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { /* put frame at the tail of the queue */ - LIST_ADDTAIL(&task->list, &priv->b_frames); + list_addtail(&task->list, &priv->b_frames); } else { /* handle I or P frame */ priv->ref_idx_l0 = priv->ref_idx_l1; enc_HandleTask(port, task, picture_type); - LIST_ADDTAIL(&task->list, &priv->stacked_tasks); + list_addtail(&task->list, &priv->stacked_tasks); LIST_FOR_EACH_ENTRY(task, &priv->stacked_tasks, list) { ++stacked_num; } if (stacked_num == priv->stacked_frames_num) { struct encode_task *t; t = LIST_ENTRY(struct encode_task, priv->stacked_tasks.next, list); - LIST_DEL(&t->list); - LIST_ADDTAIL(&t->list, &inp->tasks); + list_del(&t->list); + list_addtail(&t->list, &inp->tasks); } priv->ref_idx_l1 = priv->frame_num++; @@ -928,7 +928,7 @@ enc_MoveTasks(&priv->b_frames, &inp->tasks); } - if (LIST_IS_EMPTY(&inp->tasks)) + if (list_is_empty(&inp->tasks)) return port->ReturnBufferFunction(port, buf); else return base_port_SendBufferFunction(port, buf); diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/meson.build mesa-20.0.8/src/gallium/state_trackers/omx/meson.build --- mesa-19.2.8/src/gallium/state_trackers/omx/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -65,7 +65,7 @@ libomx_st = static_library( 'omx_st', files_omx, - c_args : [c_vis_args], + c_args : [c_vis_args, '-fcommon'], include_directories : inc_st_omx, dependencies : dep_st_omx, ) diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264dprc.c mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264dprc.c --- mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264dprc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264dprc.c 2020-06-12 01:21:17.000000000 +0000 @@ -432,7 +432,7 @@ return OMX_ErrorInsufficientResources; } - LIST_INITHEAD(&priv->codec_data.h264.dpb_list); + list_inithead(&priv->codec_data.h264.dpb_list); priv->video_buffer_map = util_hash_table_create(handle_hash, handle_compare); diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264einport.c mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264einport.c --- mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264einport.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264einport.c 2020-06-12 01:21:17.000000000 +0000 @@ -112,7 +112,7 @@ return OMX_ErrorInsufficientResources; } - LIST_INITHEAD(&inp->tasks); + list_inithead(&inp->tasks); r = enc_AllocateBackTexture(ap_hdl, idx, &inp->resource, &inp->transfer, &(*buf)->pBuffer); @@ -143,7 +143,7 @@ return OMX_ErrorInsufficientResources; } - LIST_INITHEAD(&inp->tasks); + list_inithead(&inp->tasks); return OMX_ErrorNone; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264eprc.c mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264eprc.c --- mesa-19.2.8/src/gallium/state_trackers/omx/tizonia/h264eprc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/tizonia/h264eprc.c 2020-06-12 01:21:17.000000000 +0000 @@ -268,16 +268,16 @@ { struct encode_task *task; - if (LIST_IS_EMPTY(&priv->b_frames)) + if (list_is_empty(&priv->b_frames)) return; task = LIST_ENTRY(struct encode_task, priv->b_frames.prev, list); - LIST_DEL(&task->list); + list_del(&task->list); /* promote last from to P frame */ priv->ref_idx_l0 = priv->ref_idx_l1; enc_HandleTask(priv, task, PIPE_H264_ENC_PICTURE_TYPE_P); - LIST_ADDTAIL(&task->list, &inp->tasks); + list_addtail(&task->list, &inp->tasks); priv->ref_idx_l1 = priv->frame_num++; /* handle B frames */ @@ -354,20 +354,20 @@ if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { /* put frame at the tail of the queue */ - LIST_ADDTAIL(&task->list, &priv->b_frames); + list_addtail(&task->list, &priv->b_frames); } else { /* handle I or P frame */ priv->ref_idx_l0 = priv->ref_idx_l1; enc_HandleTask(priv, task, picture_type); - LIST_ADDTAIL(&task->list, &priv->stacked_tasks); + list_addtail(&task->list, &priv->stacked_tasks); LIST_FOR_EACH_ENTRY(task, &priv->stacked_tasks, list) { ++stacked_num; } if (stacked_num == priv->stacked_frames_num) { struct encode_task *t; t = LIST_ENTRY(struct encode_task, priv->stacked_tasks.next, list); - LIST_DEL(&t->list); - LIST_ADDTAIL(&t->list, &inp->tasks); + list_del(&t->list); + list_addtail(&t->list, &inp->tasks); } priv->ref_idx_l1 = priv->frame_num++; @@ -382,7 +382,7 @@ enc_MoveTasks(&priv->b_frames, &inp->tasks); } - if (LIST_IS_EMPTY(&inp->tasks)) { + if (list_is_empty(&inp->tasks)) { return h264e_buffer_emptied(priv, in_buf); } else { return h264e_manage_buffers(priv); @@ -426,10 +426,10 @@ if (!priv->t_pipe) return OMX_ErrorInsufficientResources; - LIST_INITHEAD(&priv->free_tasks); - LIST_INITHEAD(&priv->used_tasks); - LIST_INITHEAD(&priv->b_frames); - LIST_INITHEAD(&priv->stacked_tasks); + list_inithead(&priv->free_tasks); + list_inithead(&priv->used_tasks); + list_inithead(&priv->b_frames); + list_inithead(&priv->stacked_tasks); return OMX_ErrorNone; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/vid_dec_h264_common.c mesa-20.0.8/src/gallium/state_trackers/omx/vid_dec_h264_common.c --- mesa-19.2.8/src/gallium/state_trackers/omx/vid_dec_h264_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/vid_dec_h264_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -98,7 +98,7 @@ *timestamp = result->timestamp; --priv->codec_data.h264.dpb_num; - LIST_DEL(&result->list); + list_del(&result->list); FREE(result); return buf; @@ -136,7 +136,7 @@ entry->buffer = priv->target; entry->timestamp = priv->timestamp; entry->poc = MIN2(priv->picture.h264.field_order_cnt[0], priv->picture.h264.field_order_cnt[1]); - LIST_ADDTAIL(&entry->list, &priv->codec_data.h264.dpb_list); + list_addtail(&entry->list, &priv->codec_data.h264.dpb_list); ++priv->codec_data.h264.dpb_num; priv->target = NULL; priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX; diff -Nru mesa-19.2.8/src/gallium/state_trackers/omx/vid_enc_common.c mesa-20.0.8/src/gallium/state_trackers/omx/vid_enc_common.c --- mesa-19.2.8/src/gallium/state_trackers/omx/vid_enc_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/omx/vid_enc_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,7 +50,7 @@ from->next->prev = to->prev; from->prev->next = to; to->prev = from->prev; - LIST_INITHEAD(from); + list_inithead(from); } static void enc_GetPictureParamPreset(struct pipe_h264_enc_picture_desc *picture) @@ -130,7 +130,7 @@ unsigned size; #if ENABLE_ST_OMX_BELLAGIO - if (!inp || LIST_IS_EMPTY(&inp->tasks)) { + if (!inp || list_is_empty(&inp->tasks)) { input->nFilledLen = 0; /* mark buffer as empty */ enc_MoveTasks(&priv->used_tasks, &inp->tasks); return; @@ -138,8 +138,8 @@ #endif task = LIST_ENTRY(struct encode_task, inp->tasks.next, list); - LIST_DEL(&task->list); - LIST_ADDTAIL(&task->list, &priv->used_tasks); + list_del(&task->list); + list_addtail(&task->list, &priv->used_tasks); if (!task->bitstream) return; @@ -182,9 +182,9 @@ struct pipe_video_buffer templat = {}; struct encode_task *task; - if (!LIST_IS_EMPTY(&priv->free_tasks)) { + if (!list_is_empty(&priv->free_tasks)) { task = LIST_ENTRY(struct encode_task, priv->free_tasks.next, list); - LIST_DEL(&task->list); + list_del(&task->list); return task; } diff -Nru mesa-19.2.8/src/gallium/state_trackers/osmesa/meson.build mesa-20.0.8/src/gallium/state_trackers/osmesa/meson.build --- mesa-19.2.8/src/gallium/state_trackers/osmesa/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/osmesa/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2017 Intel Corporation +# Copyright © 2017-2018 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -18,10 +18,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +osmesa_st_c_args = [] +if with_platform_windows + osmesa_st_c_args += ['-DBUILD_GL32', '-DWIN32_LEAN_AND_MEAN'] + if not with_shared_glapi + osmesa_st_c_args += ['-D_GLAPI_NO_EXPORTS'] + endif +endif + libosmesa_st = static_library( 'osmesa_st', 'osmesa.c', - c_args : ['-DGALLIUM_SOFTPIPE'], + c_args : osmesa_st_c_args, include_directories : [ inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_mapi, inc_mesa, ], diff -Nru mesa-19.2.8/src/gallium/state_trackers/osmesa/osmesa.c mesa-20.0.8/src/gallium/state_trackers/osmesa/osmesa.c --- mesa-19.2.8/src/gallium/state_trackers/osmesa/osmesa.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/osmesa/osmesa.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,6 +50,7 @@ #include +#include #include "GL/osmesa.h" #include "glapi/glapi.h" /* for OSMesaGetProcAddress below */ @@ -61,7 +62,7 @@ #include "util/u_atomic.h" #include "util/u_box.h" #include "util/u_debug.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" @@ -149,6 +150,18 @@ return stapi; } +static struct st_manager *stmgr = NULL; + +static void +create_st_manager(void) +{ + stmgr = CALLOC_STRUCT(st_manager); + if (stmgr) { + stmgr->screen = osmesa_create_screen(); + stmgr->get_param = osmesa_st_get_param; + stmgr->get_egl_image = NULL; + } +} /** * Create/return a singleton st_manager object. @@ -156,24 +169,11 @@ static struct st_manager * get_st_manager(void) { - static struct st_manager *stmgr = NULL; - if (!stmgr) { - stmgr = CALLOC_STRUCT(st_manager); - if (stmgr) { - stmgr->screen = osmesa_create_screen(); - stmgr->get_param = osmesa_st_get_param; - stmgr->get_egl_image = NULL; - } - } - return stmgr; -} + static once_flag create_once_flag = ONCE_FLAG_INIT; + call_once(&create_once_flag, create_st_manager); -static inline boolean -little_endian(void) -{ - const unsigned ui = 1; - return *((const char *) &ui); + return stmgr; } @@ -191,10 +191,11 @@ switch (format) { case OSMESA_RGBA: if (type == GL_UNSIGNED_BYTE) { - if (little_endian()) - return PIPE_FORMAT_R8G8B8A8_UNORM; - else - return PIPE_FORMAT_A8B8G8R8_UNORM; +#if UTIL_ARCH_LITTLE_ENDIAN + return PIPE_FORMAT_R8G8B8A8_UNORM; +#else + return PIPE_FORMAT_A8B8G8R8_UNORM; +#endif } else if (type == GL_UNSIGNED_SHORT) { return PIPE_FORMAT_R16G16B16A16_UNORM; @@ -208,10 +209,11 @@ break; case OSMESA_BGRA: if (type == GL_UNSIGNED_BYTE) { - if (little_endian()) - return PIPE_FORMAT_B8G8R8A8_UNORM; - else - return PIPE_FORMAT_A8R8G8B8_UNORM; +#if UTIL_ARCH_LITTLE_ENDIAN + return PIPE_FORMAT_B8G8R8A8_UNORM; +#else + return PIPE_FORMAT_A8R8G8B8_UNORM; +#endif } else if (type == GL_UNSIGNED_SHORT) { return PIPE_FORMAT_R16G16B16A16_UNORM; @@ -225,10 +227,11 @@ break; case OSMESA_ARGB: if (type == GL_UNSIGNED_BYTE) { - if (little_endian()) - return PIPE_FORMAT_A8R8G8B8_UNORM; - else - return PIPE_FORMAT_B8G8R8A8_UNORM; +#if UTIL_ARCH_LITTLE_ENDIAN + return PIPE_FORMAT_A8R8G8B8_UNORM; +#else + return PIPE_FORMAT_B8G8R8A8_UNORM; +#endif } else if (type == GL_UNSIGNED_SHORT) { return PIPE_FORMAT_R16G16B16A16_UNORM; @@ -258,6 +261,8 @@ /* No gallium format for this one */ return PIPE_FORMAT_NONE; case OSMESA_RGB_565: + if (type != GL_UNSIGNED_SHORT_5_6_5) + return PIPE_FORMAT_NONE; return PIPE_FORMAT_B5G6R5_UNORM; default: ; /* fall-through */ @@ -769,10 +774,6 @@ return GL_FALSE; } - if (osmesa->format == OSMESA_RGB_565 && type != GL_UNSIGNED_SHORT_5_6_5) { - return GL_FALSE; - } - color_format = osmesa_choose_format(osmesa->format, type); if (color_format == PIPE_FORMAT_NONE) { fprintf(stderr, "OSMesaMakeCurrent(unsupported format/type)\n"); diff -Nru mesa-19.2.8/src/gallium/state_trackers/va/image.c mesa-20.0.8/src/gallium/state_trackers/va/image.c --- mesa-19.2.8/src/gallium/state_trackers/va/image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/va/image.c 2020-06-12 01:21:17.000000000 +0000 @@ -405,13 +405,20 @@ return VA_STATUS_ERROR_OPERATION_FAILED; } + if (format != surf->buffer->buffer_format) { /* support NV12 to YV12 and IYUV conversion now only */ if ((format == PIPE_FORMAT_YV12 && - surf->buffer->buffer_format == PIPE_FORMAT_NV12) || - (format == PIPE_FORMAT_IYUV && - surf->buffer->buffer_format == PIPE_FORMAT_NV12)) + surf->buffer->buffer_format == PIPE_FORMAT_NV12) || + (format == PIPE_FORMAT_IYUV && + surf->buffer->buffer_format == PIPE_FORMAT_NV12)) convert = true; + else if (format == PIPE_FORMAT_NV12 && + (surf->buffer->buffer_format == PIPE_FORMAT_P010 || + surf->buffer->buffer_format == PIPE_FORMAT_P016)) { + mtx_unlock(&drv->mutex); + return VA_STATUS_ERROR_OPERATION_FAILED; + } else { mtx_unlock(&drv->mutex); return VA_STATUS_ERROR_OPERATION_FAILED; diff -Nru mesa-19.2.8/src/gallium/state_trackers/va/picture.c mesa-20.0.8/src/gallium/state_trackers/va/picture.c --- mesa-19.2.8/src/gallium/state_trackers/va/picture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/va/picture.c 2020-06-12 01:21:17.000000000 +0000 @@ -81,6 +81,7 @@ context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM && context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM && context->target->buffer_format != PIPE_FORMAT_NV12 && + context->target->buffer_format != PIPE_FORMAT_P010 && context->target->buffer_format != PIPE_FORMAT_P016) return VA_STATUS_ERROR_UNIMPLEMENTED; diff -Nru mesa-19.2.8/src/gallium/state_trackers/va/postproc.c mesa-20.0.8/src/gallium/state_trackers/va/postproc.c --- mesa-19.2.8/src/gallium/state_trackers/va/postproc.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/va/postproc.c 2020-06-12 01:21:17.000000000 +0000 @@ -353,6 +353,7 @@ dst_region = vlVaRegionDefault(param->output_region, dst_surface, &def_dst_region); if (context->target->buffer_format != PIPE_FORMAT_NV12 && + context->target->buffer_format != PIPE_FORMAT_P010 && context->target->buffer_format != PIPE_FORMAT_P016) return vlVaPostProcCompositor(drv, context, src_region, dst_region, src, context->target, deinterlace); diff -Nru mesa-19.2.8/src/gallium/state_trackers/va/surface.c mesa-20.0.8/src/gallium/state_trackers/va/surface.c --- mesa-19.2.8/src/gallium/state_trackers/va/surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/va/surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -565,7 +565,7 @@ struct pipe_resource res_templ; struct winsys_handle whandle; struct pipe_resource *resources[VL_NUM_COMPONENTS]; - const enum pipe_format *resource_formats = NULL; + enum pipe_format resource_formats[VL_NUM_COMPONENTS]; VAStatus result; int i; @@ -584,9 +584,7 @@ if (memory_attribute->num_planes > VL_NUM_COMPONENTS) return VA_STATUS_ERROR_INVALID_PARAMETER; - resource_formats = vl_video_buffer_formats(pscreen, templat->buffer_format); - if (!resource_formats) - return VA_STATUS_ERROR_INVALID_PARAMETER; + vl_get_video_buffer_formats(pscreen, templat->buffer_format, resource_formats); memset(&res_templ, 0, sizeof(res_templ)); res_templ.target = PIPE_TEXTURE_2D; @@ -756,18 +754,28 @@ memset(&templat, 0, sizeof(templat)); - templat.buffer_format = pscreen->get_video_param( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - PIPE_VIDEO_CAP_PREFERED_FORMAT - ); - templat.interlaced = pscreen->get_video_param( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - PIPE_VIDEO_CAP_PREFERS_INTERLACED - ); + if (format == VA_RT_FORMAT_YUV420_10BPP) + { + templat.buffer_format = PIPE_FORMAT_P010; + templat.interlaced = false; + } + else + { + templat.buffer_format = pscreen->get_video_param( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERED_FORMAT + ); + templat.interlaced = pscreen->get_video_param( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERS_INTERLACED + ); + } + + if (expected_fourcc) { enum pipe_format expected_format = VaFourccToPipeFormat(expected_fourcc); diff -Nru mesa-19.2.8/src/gallium/state_trackers/va/va_private.h mesa-20.0.8/src/gallium/state_trackers/va/va_private.h --- mesa-19.2.8/src/gallium/state_trackers/va/va_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/va/va_private.h 2020-06-12 01:21:17.000000000 +0000 @@ -96,6 +96,7 @@ case VA_FOURCC('N','V','1','2'): return PIPE_FORMAT_NV12; case VA_FOURCC('P','0','1','0'): + return PIPE_FORMAT_P010; case VA_FOURCC('P','0','1','6'): return PIPE_FORMAT_P016; case VA_FOURCC('I','4','2','0'): @@ -126,6 +127,8 @@ switch (p_format) { case PIPE_FORMAT_NV12: return VA_FOURCC('N','V','1','2'); + case PIPE_FORMAT_P010: + return VA_FOURCC('P','0','1','0'); case PIPE_FORMAT_P016: return VA_FOURCC('P','0','1','6'); case PIPE_FORMAT_IYUV: diff -Nru mesa-19.2.8/src/gallium/state_trackers/vdpau/device.c mesa-20.0.8/src/gallium/state_trackers/vdpau/device.c --- mesa-19.2.8/src/gallium/state_trackers/vdpau/device.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/vdpau/device.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "util/u_memory.h" #include "util/u_debug.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_sampler.h" #include "vdpau_private.h" @@ -170,7 +170,7 @@ if (!dev) return VDP_STATUS_INVALID_HANDLE; - pqt = CALLOC(1, sizeof(vlVdpPresentationQueue)); + pqt = CALLOC(1, sizeof(vlVdpPresentationQueueTarget)); if (!pqt) return VDP_STATUS_RESOURCES; diff -Nru mesa-19.2.8/src/gallium/state_trackers/vdpau/output.c mesa-20.0.8/src/gallium/state_trackers/vdpau/output.c --- mesa-19.2.8/src/gallium/state_trackers/vdpau/output.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/vdpau/output.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #include "util/u_debug.h" #include "util/u_memory.h" #include "util/u_sampler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "vl/vl_csc.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/vdpau/query.c mesa-20.0.8/src/gallium/state_trackers/vdpau/query.c --- mesa-19.2.8/src/gallium/state_trackers/vdpau/query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/vdpau/query.c 2020-06-12 01:21:17.000000000 +0000 @@ -154,13 +154,13 @@ break; } - *is_supported &= pscreen->is_video_format_supported - ( - pscreen, - FormatYCBCRToPipe(bits_ycbcr_format), - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM - ); + if (*is_supported && + !pscreen->is_video_format_supported(pscreen, + FormatYCBCRToPipe(bits_ycbcr_format), + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM)) { + *is_supported = false; + } mtx_unlock(&dev->mutex); return VDP_STATUS_OK; diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/meson.build mesa-20.0.8/src/gallium/state_trackers/wgl/meson.build --- mesa-19.2.8/src/gallium/state_trackers/wgl/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,57 @@ +# Copyright © 2018 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +inc_wgl = include_directories('.') +_c_args_wgl = [] +if not with_shared_glapi + # prevent _glapi_* from being declared __declspec(dllimport) + _c_args_wgl += '-D_GLAPI_NO_EXPORTS' +endif + +libwgl = static_library( + 'wgl', + files( + 'stw_context.c', + 'stw_device.c', + 'stw_ext_context.c', + 'stw_ext_extensionsstring.c', + 'stw_ext_pbuffer.c', + 'stw_ext_pixelformat.c', + 'stw_ext_rendertexture.c', + 'stw_ext_swapinterval.c', + 'stw_framebuffer.c', + 'stw_getprocaddress.c', + 'stw_nopfuncs.c', + 'stw_nopfuncs.h', + 'stw_pixelformat.c', + 'stw_st.c', + 'stw_tls.c', + 'stw_wgl.c', + ), + c_args : [ + '-D_GDI32_', # prevent wgl* being declared __declspec(dllimport) + '-DBUILD_GL32', # declare gl* as __declspec(dllexport) in Mesa headers + '-DWIN32_LEAN_AND_MEAN', # http://msdn2.microsoft.com/en-us/library/6dwk3a1z.aspx + _c_args_wgl + ], + include_directories : [ + inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_mapi, inc_mesa, + ], +) diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/stw_context.c mesa-20.0.8/src/gallium/state_trackers/wgl/stw_context.c --- mesa-19.2.8/src/gallium/state_trackers/wgl/stw_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/stw_context.c 2020-06-12 01:21:17.000000000 +0000 @@ -448,10 +448,11 @@ if (old_ctx->shared) { struct pipe_fence_handle *fence = NULL; old_ctx->st->flush(old_ctx->st, - ST_FLUSH_FRONT | ST_FLUSH_WAIT, &fence); + ST_FLUSH_FRONT | ST_FLUSH_WAIT, &fence, + NULL, NULL); } else { - old_ctx->st->flush(old_ctx->st, ST_FLUSH_FRONT, NULL); + old_ctx->st->flush(old_ctx->st, ST_FLUSH_FRONT, NULL, NULL, NULL); } } } diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c mesa-20.0.8/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c --- mesa-19.2.8/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c 2020-06-12 01:21:17.000000000 +0000 @@ -42,7 +42,7 @@ #include #include "pipe/p_compiler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "stw_device.h" #include "stw_pixelformat.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/stw_framebuffer.c mesa-20.0.8/src/gallium/state_trackers/wgl/stw_framebuffer.c --- mesa-19.2.8/src/gallium/state_trackers/wgl/stw_framebuffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/stw_framebuffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -647,7 +647,7 @@ if (ctx->current_framebuffer == fb) { /* flush current context */ - ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL); + ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL, NULL, NULL); } } diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/stw_pixelformat.c mesa-20.0.8/src/gallium/state_trackers/wgl/stw_pixelformat.c --- mesa-19.2.8/src/gallium/state_trackers/wgl/stw_pixelformat.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/stw_pixelformat.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_debug.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/state_trackers/wgl/stw_wgl.c mesa-20.0.8/src/gallium/state_trackers/wgl/stw_wgl.c --- mesa-19.2.8/src/gallium/state_trackers/wgl/stw_wgl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/wgl/stw_wgl.c 2020-06-12 01:21:17.000000000 +0000 @@ -258,8 +258,10 @@ if (size != GDI_ERROR) { if (size == 0) { - glBitmap(0, 0, -gm.gmptGlyphOrigin.x, gm.gmptGlyphOrigin.y, - gm.gmCellIncX, gm.gmCellIncY, NULL); + glBitmap(0, 0, (GLfloat)-gm.gmptGlyphOrigin.x, + (GLfloat)gm.gmptGlyphOrigin.y, + (GLfloat)gm.gmCellIncX, + (GLfloat)gm.gmCellIncY, NULL); } else { buffer = realloc(buffer, size); diff -Nru mesa-19.2.8/src/gallium/state_trackers/xa/xa_tgsi.c mesa-20.0.8/src/gallium/state_trackers/xa/xa_tgsi.c --- mesa-19.2.8/src/gallium/state_trackers/xa/xa_tgsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/xa/xa_tgsi.c 2020-06-12 01:21:17.000000000 +0000 @@ -431,7 +431,7 @@ } static void -cache_destroy(struct cso_context *cso, +cache_destroy(struct pipe_context *pipe, struct cso_hash *hash, unsigned processor) { struct cso_hash_iter iter = cso_hash_first_node(hash); @@ -440,9 +440,9 @@ void *shader = (void *)cso_hash_iter_data(iter); if (processor == PIPE_SHADER_FRAGMENT) { - cso_delete_fragment_shader(cso, shader); + pipe->delete_fs_state(pipe, shader); } else if (processor == PIPE_SHADER_VERTEX) { - cso_delete_vertex_shader(cso, shader); + pipe->delete_vs_state(pipe, shader); } iter = cso_hash_erase(hash, iter); } @@ -452,8 +452,8 @@ void xa_shaders_destroy(struct xa_shaders *sc) { - cache_destroy(sc->r->cso, sc->vs_hash, PIPE_SHADER_VERTEX); - cache_destroy(sc->r->cso, sc->fs_hash, PIPE_SHADER_FRAGMENT); + cache_destroy(sc->r->pipe, sc->vs_hash, PIPE_SHADER_VERTEX); + cache_destroy(sc->r->pipe, sc->fs_hash, PIPE_SHADER_FRAGMENT); FREE(sc); } diff -Nru mesa-19.2.8/src/gallium/state_trackers/xvmc/subpicture.c mesa-20.0.8/src/gallium/state_trackers/xvmc/subpicture.c --- mesa-19.2.8/src/gallium/state_trackers/xvmc/subpicture.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/state_trackers/xvmc/subpicture.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,7 @@ #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_sampler.h" #include "util/u_surface.h" #include "util/u_rect.h" diff -Nru mesa-19.2.8/src/gallium/targets/d3dadapter9/meson.build mesa-20.0.8/src/gallium/targets/d3dadapter9/meson.build --- mesa-19.2.8/src/gallium/targets/d3dadapter9/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/d3dadapter9/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -57,11 +57,12 @@ ], c_args : [c_vis_args, gallium_nine_c_args], cpp_args : [cpp_vis_args], - link_args : [ld_args_gc_sections, gallium_nine_ld_args], + link_args : [ld_args_build_id, ld_args_gc_sections, gallium_nine_ld_args], link_depends : gallium_nine_link_depends, link_with : gallium_nine_link_with, dependencies : [ - dep_selinux, dep_libdrm, dep_llvm, dep_thread, idep_xmlconfig, idep_mesautil, + dep_selinux, dep_libdrm, dep_llvm, dep_thread, + idep_xmlconfig, idep_mesautil, idep_nir, driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, driver_i915, driver_svga, driver_iris ], diff -Nru mesa-19.2.8/src/gallium/targets/dri/Android.mk mesa-20.0.8/src/gallium/targets/dri/Android.mk --- mesa-19.2.8/src/gallium/targets/dri/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/dri/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -42,7 +42,8 @@ LOCAL_SHARED_LIBRARIES := \ libdl \ libglapi \ - libz + libz \ + liblog # If Android version >=8 MESA should static link libexpat else should dynamic link ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) @@ -54,9 +55,16 @@ endif LOCAL_STATIC_LIBRARIES += \ + libetnaviv_drm \ libfreedreno_drm \ libfreedreno_ir3 \ - libpanfrost_shared \ + libfreedreno_perfcntrs \ + libmesa_gallium \ + libpanfrost_bifrost \ + libpanfrost_decode \ + libpanfrost_encoder \ + libpanfrost_midgard \ + libpanfrost_shared ifeq ($(USE_LIBBACKTRACE),true) LOCAL_SHARED_LIBRARIES += libbacktrace @@ -74,7 +82,6 @@ libmesa_nir \ libmesa_dri_common \ libmesa_megadriver_stub \ - libmesa_gallium \ libmesa_pipe_loader \ libmesa_util \ libmesa_loader diff -Nru mesa-19.2.8/src/gallium/targets/dri/meson.build mesa-20.0.8/src/gallium/targets/dri/meson.build --- mesa-19.2.8/src/gallium/targets/dri/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/dri/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -49,7 +49,7 @@ link_args : [ld_args_build_id, ld_args_gc_sections, gallium_dri_ld_args], link_depends : gallium_dri_link_depends, link_with : [ - libmesa_gallium, libdricommon, libmegadriver_stub, libdri, libgalliumvl, + libdri, libmesa_gallium, libdricommon, libmegadriver_stub, libgalliumvl, libgallium, libglapi, libpipe_loader_static, libws_null, libwsw, libswdri, libswkmsdri, ], @@ -58,7 +58,7 @@ driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv, driver_tegra, driver_i915, driver_svga, driver_virgl, - driver_swr, driver_panfrost, driver_iris, driver_lima + driver_swr, driver_panfrost, driver_iris, driver_lima, driver_zink ], # Will be deleted during installation, see install_megadrivers.py install : true, @@ -73,6 +73,8 @@ 'ili9225_dri.so', 'ili9341_dri.so', 'imx-drm_dri.so', + 'ingenic-drm_dri.so', + 'mcde_dri.so', 'meson_dri.so', 'mi0283qt_dri.so', 'mxsfb-drm_dri.so', @@ -100,15 +102,15 @@ [with_gallium_r600, 'r600_dri.so'], [with_gallium_svga, 'vmwgfx_dri.so'], [with_gallium_virgl, 'virtio_gpu_dri.so'], - [with_gallium_lima, 'lima_dri.so']] + [with_gallium_lima, 'lima_dri.so'], + [with_gallium_zink, 'zink_dri.so']] if d[0] gallium_dri_drivers += d[1] endif endforeach meson.add_install_script( - prog_python.path(), - join_paths(meson.source_root(), 'bin/install_megadrivers.py'), + install_megadrivers_py.path(), libgallium_dri.full_path(), dri_drivers_path, gallium_dri_drivers, diff -Nru mesa-19.2.8/src/gallium/targets/dri/target.c mesa-20.0.8/src/gallium/targets/dri/target.c --- mesa-19.2.8/src/gallium/targets/dri/target.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/dri/target.c 2020-06-12 01:21:17.000000000 +0000 @@ -98,6 +98,8 @@ DEFINE_LOADER_DRM_ENTRYPOINT(ili9225) DEFINE_LOADER_DRM_ENTRYPOINT(ili9341) DEFINE_LOADER_DRM_ENTRYPOINT(imx_drm) +DEFINE_LOADER_DRM_ENTRYPOINT(ingenic_drm) +DEFINE_LOADER_DRM_ENTRYPOINT(mcde) DEFINE_LOADER_DRM_ENTRYPOINT(meson) DEFINE_LOADER_DRM_ENTRYPOINT(mi0283qt) DEFINE_LOADER_DRM_ENTRYPOINT(mxsfb_drm) @@ -113,3 +115,7 @@ #if defined(GALLIUM_LIMA) DEFINE_LOADER_DRM_ENTRYPOINT(lima) #endif + +#if defined(GALLIUM_ZINK) +DEFINE_LOADER_DRM_ENTRYPOINT(zink); +#endif diff -Nru mesa-19.2.8/src/gallium/targets/graw-gdi/meson.build mesa-20.0.8/src/gallium/targets/graw-gdi/meson.build --- mesa-19.2.8/src/gallium/targets/graw-gdi/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/graw-gdi/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,38 @@ +# Copyright © 2018-2019 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libgraw_gdi = shared_library( + 'graw', + 'graw_gdi.c', + c_args : [c_vis_args, c_msvc_compat_args], + include_directories : [ + inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_gallium_drivers, + inc_gallium_winsys_sw, + ], + link_with : [ + libgraw_util, libgallium, libwsgdi, + ], + dependencies : [ + dep_ws2_32, idep_mesautil, driver_swrast, + ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libgraw.dll +) + +libgraw = libgraw_gdi diff -Nru mesa-19.2.8/src/gallium/targets/graw-gdi/SConscript mesa-20.0.8/src/gallium/targets/graw-gdi/SConscript --- mesa-19.2.8/src/gallium/targets/graw-gdi/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/graw-gdi/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -10,8 +10,10 @@ ]) env.Prepend(LIBS = [ + compiler, mesautil, gallium, + nir, 'gdi32', 'user32', 'ws2_32', diff -Nru mesa-19.2.8/src/gallium/targets/graw-null/meson.build mesa-20.0.8/src/gallium/targets/graw-null/meson.build --- mesa-19.2.8/src/gallium/targets/graw-null/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/graw-null/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -21,15 +21,18 @@ libgraw_util = static_library( 'graw_util', ['graw_util.c'], + c_args : [c_vis_args, c_msvc_compat_args], include_directories : inc_common, ) libgraw_null = shared_library( 'graw_null', ['graw_null.c'], + c_args : [c_vis_args, c_msvc_compat_args], include_directories : inc_common, link_with : libgallium, dependencies : idep_mesautil, + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libgraw_null.dll ) libgraw = libgraw_null diff -Nru mesa-19.2.8/src/gallium/targets/graw-xlib/meson.build mesa-20.0.8/src/gallium/targets/graw-xlib/meson.build --- mesa-19.2.8/src/gallium/targets/graw-xlib/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/graw-xlib/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -21,6 +21,7 @@ libgraw_xlib = shared_library( 'graw_xlib', ['graw_xlib.c'], + c_args : [c_vis_args], include_directories : [inc_common, inc_gallium_drivers, inc_gallium_winsys], link_with : [ libgraw_util, libgallium, libws_xlib diff -Nru mesa-19.2.8/src/gallium/targets/graw-xlib/SConscript mesa-20.0.8/src/gallium/targets/graw-xlib/SConscript --- mesa-19.2.8/src/gallium/targets/graw-xlib/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/graw-xlib/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -7,8 +7,10 @@ env.Prepend(LIBS = [ ws_xlib, + compiler, mesautil, gallium, + nir, ]) env.Append(LIBS = env['X11_LIBS']) diff -Nru mesa-19.2.8/src/gallium/targets/haiku-softpipe/GalliumContext.cpp mesa-20.0.8/src/gallium/targets/haiku-softpipe/GalliumContext.cpp --- mesa-19.2.8/src/gallium/targets/haiku-softpipe/GalliumContext.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/haiku-softpipe/GalliumContext.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -243,7 +243,7 @@ return; if (fContext[contextID]->st) { - fContext[contextID]->st->flush(fContext[contextID]->st, 0, NULL); + fContext[contextID]->st->flush(fContext[contextID]->st, 0, NULL, NULL, NULL); fContext[contextID]->st->destroy(fContext[contextID]->st); } @@ -297,7 +297,7 @@ if (oldContextID > 0 && oldContextID != contextID) { fContext[oldContextID]->st->flush(fContext[oldContextID]->st, - ST_FLUSH_FRONT, NULL); + ST_FLUSH_FRONT, NULL, NULL, NULL); } // We need to lock and unlock framebuffers before accessing them @@ -333,7 +333,7 @@ ERROR("%s: context not found\n", __func__); return B_ERROR; } - context->st->flush(context->st, ST_FLUSH_FRONT, NULL); + context->st->flush(context->st, ST_FLUSH_FRONT, NULL, NULL, NULL); struct hgl_buffer* buffer = hgl_st_framebuffer(context->draw->stfbi); pipe_surface* surface = buffer->surface; diff -Nru mesa-19.2.8/src/gallium/targets/haiku-softpipe/meson.build mesa-20.0.8/src/gallium/targets/haiku-softpipe/meson.build --- mesa-19.2.8/src/gallium/targets/haiku-softpipe/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/haiku-softpipe/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -31,10 +31,10 @@ link_args : [ld_args_bsymbolic, ld_args_gc_sections], link_with : [ libglapi, libswhgl, libsthgl, libcompiler, - libmesa_gallium, libglsl, libnir, libgallium, libgl + libmesa_gallium, libglsl, libgallium, libgl ], dependencies : [ driver_swrast, cpp.find_library('be'), cpp.find_library('translation'), - cpp.find_library('network'), dep_unwind, idep_mesautil, + cpp.find_library('network'), dep_unwind, idep_mesautil, idep_nir, ] ) diff -Nru mesa-19.2.8/src/gallium/targets/libgl-gdi/libgl_gdi.c mesa-20.0.8/src/gallium/targets/libgl-gdi/libgl_gdi.c --- mesa-19.2.8/src/gallium/targets/libgl-gdi/libgl_gdi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/libgl-gdi/libgl_gdi.c 2020-06-12 01:21:17.000000000 +0000 @@ -45,13 +45,13 @@ #include "softpipe/sp_screen.h" #include "softpipe/sp_public.h" -#ifdef HAVE_LLVMPIPE +#ifdef GALLIUM_LLVMPIPE #include "llvmpipe/lp_texture.h" #include "llvmpipe/lp_screen.h" #include "llvmpipe/lp_public.h" #endif -#ifdef HAVE_SWR +#ifdef GALLIUM_SWR #include "swr/swr_public.h" #endif @@ -70,9 +70,9 @@ if(!winsys) goto no_winsys; -#ifdef HAVE_LLVMPIPE +#ifdef GALLIUM_LLVMPIPE default_driver = "llvmpipe"; -#elif HAVE_SWR +#elif GALLIUM_SWR default_driver = "swr"; #else default_driver = "softpipe"; @@ -80,14 +80,14 @@ driver = debug_get_option("GALLIUM_DRIVER", default_driver); -#ifdef HAVE_LLVMPIPE +#ifdef GALLIUM_LLVMPIPE if (strcmp(driver, "llvmpipe") == 0) { screen = llvmpipe_create_screen( winsys ); if (screen) use_llvmpipe = TRUE; } #endif -#ifdef HAVE_SWR +#ifdef GALLIUM_SWR if (strcmp(driver, "swr") == 0) { screen = swr_create_screen( winsys ); if (screen) @@ -130,7 +130,7 @@ struct sw_winsys *winsys = NULL; struct sw_displaytarget *dt = NULL; -#ifdef HAVE_LLVMPIPE +#ifdef GALLIUM_LLVMPIPE if (use_llvmpipe) { winsys = llvmpipe_screen(screen)->winsys; dt = llvmpipe_resource(res)->dt; @@ -139,7 +139,7 @@ } #endif -#ifdef HAVE_SWR +#ifdef GALLIUM_SWR if (use_swr) { swr_gdi_swap(screen, res, hDC); return; diff -Nru mesa-19.2.8/src/gallium/targets/libgl-gdi/meson.build mesa-20.0.8/src/gallium/targets/libgl-gdi/meson.build --- mesa-19.2.8/src/gallium/targets/libgl-gdi/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/libgl-gdi/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,45 @@ +# Copyright © 2018 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# DEF parser in certain versions of MinGW is busted, as does not behave as +# MSVC. mingw-w64 works fine. +if cc.get_id() == 'gcc' and host_machine.cpu_family() != 'x86_64' + ogldef = files('../../state_trackers/wgl/opengl32.mingw.def')[0] +else + ogldef = files('../../state_trackers/wgl/opengl32.def')[0] +endif + +libopengl32 = shared_library( + 'opengl32', + ['libgl_gdi.c'], + vs_module_defs : ogldef, + include_directories : [ + inc_common, inc_wgl, inc_gallium_winsys_sw, inc_gallium_drivers, + ], + link_whole : [libwgl], + link_with : [ + libgallium, libglsl, libmesa_gallium, libwsgdi, libglapi_static, libglapi + ], + dependencies : [ + dep_ws2_32, idep_nir, idep_mesautil, driver_swrast, driver_swr, + ], + name_prefix : '', # otherwise mingw will create libopengl32.dll + install : true, +) diff -Nru mesa-19.2.8/src/gallium/targets/libgl-gdi/SConscript mesa-20.0.8/src/gallium/targets/libgl-gdi/SConscript --- mesa-19.2.8/src/gallium/targets/libgl-gdi/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/libgl-gdi/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -32,11 +32,11 @@ drivers += [softpipe] if env['llvm']: - env.Append(CPPDEFINES = 'HAVE_LLVMPIPE') + env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE') drivers += [llvmpipe] if env['swr']: - env.Append(CPPDEFINES = 'HAVE_SWR') + env.Append(CPPDEFINES = 'GALLIUM_SWR') drivers += [swr] if env['gcc'] and env['machine'] != 'x86_64': diff -Nru mesa-19.2.8/src/gallium/targets/libgl-xlib/SConscript mesa-20.0.8/src/gallium/targets/libgl-xlib/SConscript --- mesa-19.2.8/src/gallium/targets/libgl-xlib/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/libgl-xlib/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -11,6 +11,7 @@ '#/src/mesa/main', '#src/gallium/state_trackers/glx/xlib', Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers + Dir('../../../mapi/glapi/gen'), # src/mapi build path for python-generated GL API files/headers ]) env.Append(CPPDEFINES = ['USE_XSHM']) diff -Nru mesa-19.2.8/src/gallium/targets/libgl-xlib/xlib.c mesa-20.0.8/src/gallium/targets/libgl-xlib/xlib.c --- mesa-19.2.8/src/gallium/targets/libgl-xlib/xlib.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/libgl-xlib/xlib.c 2020-06-12 01:21:17.000000000 +0000 @@ -128,6 +128,6 @@ /* skip normal ones */ #define _GLAPI_SKIP_NORMAL_ENTRY_POINTS -#include "glapi/glapitemp.h" +#include "glapitemp.h" #endif /* GLX_INDIRECT_RENDERING */ diff -Nru mesa-19.2.8/src/gallium/targets/omx/meson.build mesa-20.0.8/src/gallium/targets/omx/meson.build --- mesa-19.2.8/src/gallium/targets/omx/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/omx/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -32,7 +32,7 @@ libomx_gallium = shared_library( 'omx_mesa', - ['target.c', xmlpool_options_h], + 'target.c', c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [omx_link_args, ld_args_gc_sections], @@ -45,7 +45,7 @@ libpipe_loader_static, libws_null, libwsw, libswdri, libswkmsdri, ], link_depends : omx_link_depends, - dependencies : [idep_mesautil, driver_r600, driver_radeonsi, driver_nouveau], + dependencies : [idep_mesautil, idep_xmlconfig_headers, driver_r600, driver_radeonsi, driver_nouveau], install : true, install_dir : omx_drivers_path, ) diff -Nru mesa-19.2.8/src/gallium/targets/opencl/meson.build mesa-20.0.8/src/gallium/targets/opencl/meson.build --- mesa-19.2.8/src/gallium/targets/opencl/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/opencl/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -33,16 +33,16 @@ opencl_libname = with_opencl_icd ? 'MesaOpenCL' : 'OpenCL' -libopencl = shared_library( - opencl_libname, - [], - link_args : [ld_args_gc_sections, opencl_link_args], - link_depends : opencl_link_deps, - link_whole : libclover, - link_with : [libpipe_loader_dynamic, libgallium], - dependencies : [ - idep_mesautil, - dep_clock, dep_dl, dep_unwind, dep_elf, +polly_dep = null_dep +polly_isl_dep = null_dep +if dep_llvm.version().version_compare('>=10.0.0') + polly_dep = cpp.find_library('Polly', dirs : llvm_libdir, required : false) + polly_isl_dep = cpp.find_library('PollyISL', dirs : llvm_libdir, required : false) +endif + +dep_clang = cpp.find_library('clang-cpp', dirs : llvm_libdir, required : false) +if not dep_clang.found() + dep_clang = [ cpp.find_library('clangCodeGen', dirs : llvm_libdir), cpp.find_library('clangFrontendTool', dirs : llvm_libdir), cpp.find_library('clangFrontend', dirs : llvm_libdir), @@ -56,6 +56,20 @@ cpp.find_library('clangEdit', dirs : llvm_libdir), cpp.find_library('clangLex', dirs : llvm_libdir), cpp.find_library('clangBasic', dirs : llvm_libdir), + polly_dep, polly_isl_dep, + ] +endif + +libopencl = shared_library( + opencl_libname, + [], + link_args : [ld_args_gc_sections, opencl_link_args], + link_depends : opencl_link_deps, + link_whole : libclover, + link_with : [libpipe_loader_dynamic, libgallium], + dependencies : [ + idep_mesautil, + dep_clock, dep_dl, dep_unwind, dep_elf, dep_clang ], version : '@0@.0.0'.format(opencl_version), install : true, diff -Nru mesa-19.2.8/src/gallium/targets/osmesa/meson.build mesa-20.0.8/src/gallium/targets/osmesa/meson.build --- mesa-19.2.8/src/gallium/targets/osmesa/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/osmesa/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -32,12 +32,19 @@ osmesa_link_deps += files('osmesa.sym') endif +if cc.get_id() == 'gcc' and host_machine.cpu_family() != 'x86_64' + osmesa_def = 'osmesa.mingw.def' +else + osmesa_def = 'osmesa.def' +endif + libosmesa = shared_library( osmesa_lib_name, 'target.c', c_args : [c_vis_args], cpp_args : cpp_vis_args, link_args : [ld_args_gc_sections, osmesa_link_args], + vs_module_defs : osmesa_def, include_directories : [ inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_gallium_winsys, inc_gallium_drivers, @@ -48,9 +55,11 @@ libmesa_gallium, libgallium, libws_null, osmesa_link_with, ], dependencies : [ - dep_selinux, dep_thread, dep_clock, dep_unwind, + dep_ws2_32, dep_selinux, dep_thread, dep_clock, dep_unwind, driver_swrast, driver_swr, ], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll + soversion : host_machine.system() == 'windows' ? '' : '8', version : '8.0.0', install : true, ) @@ -62,3 +71,16 @@ libraries : libosmesa, libraries_private : gl_priv_libs, ) + +if with_tests + test('osmesa-render', + executable( + 'osmesa-render', + 'test-render.cpp', + include_directories : inc_common, + link_with: libosmesa, + dependencies : [idep_gtest], + ), + suite: 'gallium' + ) +endif diff -Nru mesa-19.2.8/src/gallium/targets/osmesa/test-render.cpp mesa-20.0.8/src/gallium/targets/osmesa/test-render.cpp --- mesa-19.2.8/src/gallium/targets/osmesa/test-render.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/osmesa/test-render.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include + +#include + +#include "GL/osmesa.h" + + +typedef std::array Params; + +class OSMesaRenderTestFixture : public testing::TestWithParam {}; + +std::string +name_params(const testing::TestParamInfo params) { + auto p = params.param; + std::string first, second; + switch (p[0]) { + case OSMESA_RGBA: + first = "rgba"; + break; + case OSMESA_BGRA: + first = "bgra"; + break; + case OSMESA_RGB: + first = "rgb"; + break; + case OSMESA_RGB_565: + first = "rgb_565"; + break; + case OSMESA_ARGB: + first = "argb"; + break; + } + + switch (p[1]) { + case GL_UNSIGNED_SHORT: + second = "unsigned_short"; + break; + case GL_UNSIGNED_BYTE: + second = "unsigned_byte"; + break; + case GL_FLOAT: + second = "float"; + break; + case GL_UNSIGNED_SHORT_5_6_5: + second = "unisgned_short_565"; + break; + } + + return first + "_" + second; +}; + +TEST_P(OSMesaRenderTestFixture, Render) +{ + auto params = GetParam(); + uint32_t pixel = 0; + uint32_t expected; // This should be green for the given color model + int w = 1, h = 1; + + std::unique_ptr ctx{ + OSMesaCreateContext(params[0], NULL), &OSMesaDestroyContext}; + ASSERT_TRUE(ctx); + + auto ret = OSMesaMakeCurrent(ctx.get(), &pixel, params[1], w, h); + ASSERT_EQ(ret, GL_TRUE); + + switch (params[0]) { + case OSMESA_RGBA: + case OSMESA_BGRA: + case OSMESA_RGB: + expected = 0xff << 8; + glClearColor(0, 1, 0, 0); + break; + case OSMESA_RGB_565: + expected = 0x3f << 5; + glClearColor(0, 1, 0, 0); + break; + case OSMESA_ARGB: + expected = 0xff << 24; + glClearColor(0, 0, 1, 0); + break; + } + glClear(GL_COLOR_BUFFER_BIT); + glFinish(); + + ASSERT_EQ(expected, pixel); +} + +INSTANTIATE_TEST_CASE_P( + OSMesaRenderTest, + OSMesaRenderTestFixture, + testing::Values( + Params{ OSMESA_RGBA, GL_UNSIGNED_BYTE }, + Params{ OSMESA_BGRA, GL_UNSIGNED_BYTE }, + Params{ OSMESA_ARGB, GL_UNSIGNED_BYTE } + ), + name_params +); diff -Nru mesa-19.2.8/src/gallium/targets/pipe-loader/meson.build mesa-20.0.8/src/gallium/targets/pipe-loader/meson.build --- mesa-19.2.8/src/gallium/targets/pipe-loader/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/pipe-loader/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -20,7 +20,7 @@ pipe_loader_link_args = [ld_args_gc_sections] pipe_loader_link_deps = [] -pipe_loader_link_with = [libgallium, libnir] +pipe_loader_link_with = [libgallium] pipe_loader_comp_args = [] pipe_loader_incs = [ inc_include, inc_src, inc_util, inc_gallium, inc_gallium_drivers, @@ -47,6 +47,15 @@ pipe_loader_install_dir = join_paths(get_option('libdir'), 'gallium-pipe') +_kmsro_targets = [ + driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv, + driver_panfrost, driver_lima, +] + +if with_gallium_v3d + _kmsro_targets += [idep_xmlconfig, dep_expat] +endif + pipe_loaders = [ [with_gallium_i915, 'i915', driver_i915, []], [with_gallium_nouveau, 'nouveau', driver_nouveau, []], @@ -54,6 +63,7 @@ [with_gallium_r600, 'r600', driver_r600, []], [with_gallium_radeonsi, 'radeonsi', [driver_radeonsi, idep_xmlconfig], []], [with_gallium_freedreno, 'msm', driver_freedreno, []], + [with_gallium_kmsro, 'kmsro', _kmsro_targets, []], [with_gallium_svga, 'vmwgfx', driver_svga, []], [with_gallium_softpipe, 'swrast', [driver_swrast, driver_swr], [libwsw, libws_null]], ] @@ -69,7 +79,7 @@ link_depends : pipe_loader_link_deps, include_directories : pipe_loader_incs, link_with : [pipe_loader_link_with, x[3]], - dependencies : [idep_mesautil, dep_thread, x[2]], + dependencies : [idep_mesautil, idep_nir, dep_thread, x[2]], name_prefix : '', install : true, install_dir : pipe_loader_install_dir, diff -Nru mesa-19.2.8/src/gallium/targets/pipe-loader/pipe_kmsro.c mesa-20.0.8/src/gallium/targets/pipe-loader/pipe_kmsro.c --- mesa-19.2.8/src/gallium/targets/pipe-loader/pipe_kmsro.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/pipe-loader/pipe_kmsro.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,21 @@ + +#include "target-helpers/inline_debug_helper.h" +#include "state_tracker/drm_driver.h" +#include "kmsro/drm/kmsro_drm_public.h" + +static struct pipe_screen * +create_screen(int fd, const struct pipe_screen_config *config) +{ + struct pipe_screen *screen; + + screen = kmsro_drm_screen_create(fd, config); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +} + +PUBLIC +DRM_DRIVER_DESCRIPTOR("kmsro", NULL, create_screen) diff -Nru mesa-19.2.8/src/gallium/targets/va/meson.build mesa-20.0.8/src/gallium/targets/va/meson.build --- mesa-19.2.8/src/gallium/targets/va/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/va/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ libva_gallium = shared_library( 'gallium_drv_video', - ['target.c', xmlpool_options_h], + 'target.c', c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [va_link_args, ld_args_gc_sections], @@ -47,7 +47,7 @@ ], dependencies : [ dep_libdrm, driver_r600, driver_radeonsi, driver_nouveau, - idep_mesautil, + idep_mesautil, idep_xmlconfig_headers, ], link_depends : va_link_depends, # Will be deleted during installation, see install_megadrivers.py @@ -65,8 +65,7 @@ endforeach meson.add_install_script( - prog_python.path(), - join_paths(meson.source_root(), 'bin/install_megadrivers.py'), + install_megadrivers_py.path(), libva_gallium.full_path(), va_drivers_path, va_drivers, diff -Nru mesa-19.2.8/src/gallium/targets/vdpau/meson.build mesa-20.0.8/src/gallium/targets/vdpau/meson.build --- mesa-19.2.8/src/gallium/targets/vdpau/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/vdpau/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,7 @@ libvdpau_gallium = shared_library( 'vdpau_gallium', - ['target.c', xmlpool_options_h], + 'target.c', c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [vdpau_link_args, ld_args_gc_sections], @@ -51,7 +51,7 @@ libpipe_loader_static, libws_null, libwsw, libswdri, libswkmsdri, ], dependencies : [ - idep_mesautil, + idep_mesautil, idep_xmlconfig_headers, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, ], link_depends : vdpau_link_depends, @@ -71,8 +71,7 @@ endforeach meson.add_install_script( - prog_python.path(), - join_paths(meson.source_root(), 'bin/install_megadrivers.py'), + install_megadrivers_py.path(), libvdpau_gallium.full_path(), vdpau_drivers_path, vdpau_drivers, diff -Nru mesa-19.2.8/src/gallium/targets/xa/meson.build mesa-20.0.8/src/gallium/targets/xa/meson.build --- mesa-19.2.8/src/gallium/targets/xa/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/xa/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,7 @@ libxatracker = shared_library( 'xatracker', - ['target.c', xmlpool_options_h], + 'target.c', c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xa_link_args, ld_args_gc_sections], @@ -48,7 +48,7 @@ ], link_depends : xa_link_depends, dependencies : [ - idep_mesautil, + idep_mesautil, idep_xmlconfig_headers, driver_nouveau, driver_i915, driver_svga, driver_freedreno, ], version : _xa_version, diff -Nru mesa-19.2.8/src/gallium/targets/xvmc/meson.build mesa-20.0.8/src/gallium/targets/xvmc/meson.build --- mesa-19.2.8/src/gallium/targets/xvmc/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/targets/xvmc/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ libxvmc_gallium = shared_library( 'XvMCgallium', - ['target.c', xmlpool_options_h], + 'target.c', c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xvmc_link_args, ld_args_gc_sections], @@ -45,7 +45,7 @@ libgalliumvlwinsys, libgalliumvl, libgallium, libpipe_loader_static, libws_null, libwsw, libswdri, libswkmsdri, ], - dependencies : [idep_mesautil, driver_r600, driver_nouveau], + dependencies : [idep_mesautil, idep_xmlconfig_headers, driver_r600, driver_nouveau], link_depends : xvmc_link_depends, # Will be deleted during installation, see install_megadrivers.py install : true, @@ -60,8 +60,7 @@ endforeach meson.add_install_script( - prog_python.path(), - join_paths(meson.source_root(), 'bin/install_megadrivers.py'), + install_megadrivers_py.path(), libxvmc_gallium.full_path(), xvmc_drivers_path, xvmc_drivers, diff -Nru mesa-19.2.8/src/gallium/tests/graw/graw_util.h mesa-20.0.8/src/gallium/tests/graw/graw_util.h --- mesa-19.2.8/src/gallium/tests/graw/graw_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/graw/graw_util.h 2020-06-12 01:21:17.000000000 +0000 @@ -11,7 +11,7 @@ #include "util/u_debug.h" #include "util/u_debug_image.h" #include "util/u_draw_quad.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/tests/meson.build mesa-20.0.8/src/gallium/tests/meson.build --- mesa-19.2.8/src/gallium/tests/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -18,6 +18,16 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -subdir('trivial') -subdir('unit') -subdir('graw') +if not with_platform_windows + # pipe-loader doesn't build on windows. + subdir('trivial') +endif +if with_gallium_softpipe + subdir('unit') +endif + +if host_machine.system() != 'windows' or cpp.get_id() != 'gcc' + # FIXME: This has linking errors I can't figure out with MinGW. works fine + # with MSVC, works fine with GCC on Linux. + subdir('graw') +endif diff -Nru mesa-19.2.8/src/gallium/tests/trivial/compute.c mesa-20.0.8/src/gallium/tests/trivial/compute.c --- mesa-19.2.8/src/gallium/tests/trivial/compute.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/trivial/compute.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,7 @@ #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_sampler.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "tgsi/tgsi_text.h" #include "pipe-loader/pipe_loader.h" diff -Nru mesa-19.2.8/src/gallium/tests/unit/meson.build mesa-20.0.8/src/gallium/tests/unit/meson.build --- mesa-19.2.8/src/gallium/tests/unit/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/unit/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -19,8 +19,7 @@ # SOFTWARE. foreach t : ['pipe_barrier_test', 'u_cache_test', 'u_half_test', - 'u_format_test', 'u_format_compatible_test', 'translate_test', - 'u_prim_verts_test' ] + 'translate_test', 'u_prim_verts_test'] exe = executable( t, '@0@.c'.format(t), @@ -31,6 +30,8 @@ ) # u_cache_test is slow, and translate_test fails. if not ['u_cache_test', 'translate_test'].contains(t) - test(t, exe, suite: 'gallium') + test(t, exe, suite: 'gallium', + should_fail : meson.get_cross_property('xfail', '').contains(t), + ) endif endforeach diff -Nru mesa-19.2.8/src/gallium/tests/unit/SConscript mesa-20.0.8/src/gallium/tests/unit/SConscript --- mesa-19.2.8/src/gallium/tests/unit/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/unit/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -13,8 +13,6 @@ progs = [ 'pipe_barrier_test', 'u_cache_test', - 'u_format_test', - 'u_format_compatible_test', 'u_half_test', 'translate_test' ] diff -Nru mesa-19.2.8/src/gallium/tests/unit/translate_test.c mesa-20.0.8/src/gallium/tests/unit/translate_test.c --- mesa-19.2.8/src/gallium/tests/unit/translate_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/unit/translate_test.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ #include #include "translate/translate.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_half.h" #include "util/u_cpu_detect.h" #include "rtasm/rtasm_cpu.h" diff -Nru mesa-19.2.8/src/gallium/tests/unit/u_format_compatible_test.c mesa-20.0.8/src/gallium/tests/unit/u_format_compatible_test.c --- mesa-19.2.8/src/gallium/tests/unit/u_format_compatible_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/unit/u_format_compatible_test.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ -/************************************************************************** - * - * Copyright 2009-2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include -#include - -#include "util/u_format.h" - - -static boolean -test_all(void) -{ - enum pipe_format src_format; - enum pipe_format dst_format; - - for (src_format = 1; src_format < PIPE_FORMAT_COUNT; ++src_format) { - const struct util_format_description *src_format_desc; - src_format_desc = util_format_description(src_format); - if (!src_format_desc) { - continue; - } - - for (dst_format = 1; dst_format < PIPE_FORMAT_COUNT; ++dst_format) { - const struct util_format_description *dst_format_desc; - dst_format_desc = util_format_description(dst_format); - if (!dst_format_desc) { - continue; - } - - if (dst_format == src_format) { - continue; - } - - if (util_is_format_compatible(src_format_desc, dst_format_desc)) { - printf("%s -> %s\n", src_format_desc->short_name, dst_format_desc->short_name); - } - } - } - - return TRUE; -} - - -int main(int argc, char **argv) -{ - boolean success; - - success = test_all(); - - return success ? 0 : 1; -} diff -Nru mesa-19.2.8/src/gallium/tests/unit/u_format_test.c mesa-20.0.8/src/gallium/tests/unit/u_format_test.c --- mesa-19.2.8/src/gallium/tests/unit/u_format_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/tests/unit/u_format_test.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,819 +0,0 @@ -/************************************************************************** - * - * Copyright 2009-2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include -#include -#include - -#include "util/u_half.h" -#include "util/u_format.h" -#include "util/u_format_tests.h" -#include "util/u_format_s3tc.h" - - -static boolean -compare_float(float x, float y) -{ - float error = y - x; - - if (error < 0.0f) - error = -error; - - if (error > FLT_EPSILON) { - return FALSE; - } - - return TRUE; -} - - -static void -print_packed(const struct util_format_description *format_desc, - const char *prefix, - const uint8_t *packed, - const char *suffix) -{ - unsigned i; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.bits/8; ++i) { - printf("%s%02x", sep, packed[i]); - sep = " "; - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_rgba_doubl(const struct util_format_description *format_desc, - const char *prefix, - const double unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s{%f, %f, %f, %f}", sep, unpacked[i][j][0], unpacked[i][j][1], unpacked[i][j][2], unpacked[i][j][3]); - sep = ", "; - } - sep = ",\n"; - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_rgba_float(const struct util_format_description *format_desc, - const char *prefix, - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s{%f, %f, %f, %f}", sep, unpacked[i][j][0], unpacked[i][j][1], unpacked[i][j][2], unpacked[i][j][3]); - sep = ", "; - } - sep = ",\n"; - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_rgba_8unorm(const struct util_format_description *format_desc, - const char *prefix, - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s{0x%02x, 0x%02x, 0x%02x, 0x%02x}", sep, unpacked[i][j][0], unpacked[i][j][1], unpacked[i][j][2], unpacked[i][j][3]); - sep = ", "; - } - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_z_float(const struct util_format_description *format_desc, - const char *prefix, - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s%f", sep, unpacked[i][j]); - sep = ", "; - } - sep = ",\n"; - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_z_32unorm(const struct util_format_description *format_desc, - const char *prefix, - uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s0x%08x", sep, unpacked[i][j]); - sep = ", "; - } - } - printf("%s", suffix); - fflush(stdout); -} - - -static void -print_unpacked_s_8uint(const struct util_format_description *format_desc, - const char *prefix, - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH], - const char *suffix) -{ - unsigned i, j; - const char *sep = ""; - - printf("%s", prefix); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - printf("%s0x%02x", sep, unpacked[i][j]); - sep = ", "; - } - } - printf("%s", suffix); - fflush(stdout); -} - - -static boolean -test_format_fetch_rgba_float(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4] = { { { 0 } } }; - unsigned i, j, k; - boolean success; - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - format_desc->fetch_rgba_float(unpacked[i][j], test->packed, j, i); - for (k = 0; k < 4; ++k) { - if (!compare_float(test->unpacked[i][j][k], unpacked[i][j][k])) { - success = FALSE; - } - } - } - } - - /* Ignore S3TC errors */ - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - success = TRUE; - } - - if (!success) { - print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_rgba_doubl(format_desc, " ", test->unpacked, " expected\n"); - } - - return success; -} - - -static boolean -test_format_unpack_rgba_float(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4] = { { { 0 } } }; - unsigned i, j, k; - boolean success; - - format_desc->unpack_rgba_float(&unpacked[0][0][0], sizeof unpacked[0], - test->packed, 0, - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - for (k = 0; k < 4; ++k) { - if (!compare_float(test->unpacked[i][j][k], unpacked[i][j][k])) { - success = FALSE; - } - } - } - } - - /* Ignore S3TC errors */ - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - success = TRUE; - } - - if (!success) { - print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_rgba_doubl(format_desc, " ", test->unpacked, " expected\n"); - } - - return success; -} - - -static boolean -test_format_pack_rgba_float(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4]; - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - unsigned i, j, k; - boolean success; - - if (test->format == PIPE_FORMAT_DXT1_RGBA) { - /* - * Skip S3TC as packed representation is not canonical. - * - * TODO: Do a round trip conversion. - */ - return TRUE; - } - - memset(packed, 0, sizeof packed); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - for (k = 0; k < 4; ++k) { - unpacked[i][j][k] = (float) test->unpacked[i][j][k]; - } - } - } - - format_desc->pack_rgba_float(packed, 0, - &unpacked[0][0][0], sizeof unpacked[0], - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.bits/8; ++i) { - if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i])) - success = FALSE; - } - - /* Ignore NaN */ - if (util_is_double_nan(test->unpacked[0][0][0])) - success = TRUE; - - /* Ignore S3TC errors */ - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - success = TRUE; - } - - if (!success) { - print_packed(format_desc, "FAILED: ", packed, " obtained\n"); - print_packed(format_desc, " ", test->packed, " expected\n"); - } - - return success; -} - - -static boolean -convert_float_to_8unorm(uint8_t *dst, const double *src) -{ - unsigned i; - boolean accurate = TRUE; - - for (i = 0; i < UTIL_FORMAT_MAX_UNPACKED_HEIGHT*UTIL_FORMAT_MAX_UNPACKED_WIDTH*4; ++i) { - if (src[i] < 0.0) { - accurate = FALSE; - dst[i] = 0; - } - else if (src[i] > 1.0) { - accurate = FALSE; - dst[i] = 255; - } - else { - dst[i] = src[i] * 255.0; - } - } - - return accurate; -} - - -static boolean -test_format_unpack_rgba_8unorm(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4] = { { { 0 } } }; - uint8_t expected[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4] = { { { 0 } } }; - unsigned i, j, k; - boolean success; - - format_desc->unpack_rgba_8unorm(&unpacked[0][0][0], sizeof unpacked[0], - test->packed, 0, - format_desc->block.width, format_desc->block.height); - - convert_float_to_8unorm(&expected[0][0][0], &test->unpacked[0][0][0]); - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - for (k = 0; k < 4; ++k) { - if (expected[i][j][k] != unpacked[i][j][k]) { - success = FALSE; - } - } - } - } - - /* Ignore NaN */ - if (util_is_double_nan(test->unpacked[0][0][0])) - success = TRUE; - - if (!success) { - print_unpacked_rgba_8unorm(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_rgba_8unorm(format_desc, " ", expected, " expected\n"); - } - - return success; -} - - -static boolean -test_format_pack_rgba_8unorm(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4]; - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - unsigned i; - boolean success; - - if (test->format == PIPE_FORMAT_DXT1_RGBA) { - /* - * Skip S3TC as packed representation is not canonical. - * - * TODO: Do a round trip conversion. - */ - return TRUE; - } - - if (!convert_float_to_8unorm(&unpacked[0][0][0], &test->unpacked[0][0][0])) { - /* - * Skip test cases which cannot be represented by four unorm bytes. - */ - return TRUE; - } - - memset(packed, 0, sizeof packed); - - format_desc->pack_rgba_8unorm(packed, 0, - &unpacked[0][0][0], sizeof unpacked[0], - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.bits/8; ++i) - if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i])) - success = FALSE; - - /* Ignore NaN */ - if (util_is_double_nan(test->unpacked[0][0][0])) - success = TRUE; - - /* Ignore failure cases due to unorm8 format */ - if (test->unpacked[0][0][0] > 1.0f || test->unpacked[0][0][0] < 0.0f) - success = TRUE; - - /* Multiple of 255 */ - if ((test->unpacked[0][0][0] * 255.0) != (int)(test->unpacked[0][0][0] * 255.0)) - success = TRUE; - - /* Ignore S3TC errors */ - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - success = TRUE; - } - - if (!success) { - print_packed(format_desc, "FAILED: ", packed, " obtained\n"); - print_packed(format_desc, " ", test->packed, " expected\n"); - } - - return success; -} - - -static boolean -test_format_unpack_z_float(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } }; - unsigned i, j; - boolean success; - - format_desc->unpack_z_float(&unpacked[0][0], sizeof unpacked[0], - test->packed, 0, - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - if (!compare_float(test->unpacked[i][j][0], unpacked[i][j])) { - success = FALSE; - } - } - } - - if (!success) { - print_unpacked_z_float(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_rgba_doubl(format_desc, " ", test->unpacked, " expected\n"); - } - - return success; -} - - -static boolean -test_format_pack_z_float(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH]; - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - unsigned i, j; - boolean success; - - memset(packed, 0, sizeof packed); - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - unpacked[i][j] = (float) test->unpacked[i][j][0]; - if (test->unpacked[i][j][1]) { - return TRUE; - } - } - } - - format_desc->pack_z_float(packed, 0, - &unpacked[0][0], sizeof unpacked[0], - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.bits/8; ++i) - if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i])) - success = FALSE; - - if (!success) { - print_packed(format_desc, "FAILED: ", packed, " obtained\n"); - print_packed(format_desc, " ", test->packed, " expected\n"); - } - - return success; -} - - -static boolean -test_format_unpack_z_32unorm(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } }; - uint32_t expected[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } }; - unsigned i, j; - boolean success; - - format_desc->unpack_z_32unorm(&unpacked[0][0], sizeof unpacked[0], - test->packed, 0, - format_desc->block.width, format_desc->block.height); - - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - expected[i][j] = test->unpacked[i][j][0] * 0xffffffff; - } - } - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - if (expected[i][j] != unpacked[i][j]) { - success = FALSE; - } - } - } - - if (!success) { - print_unpacked_z_32unorm(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_z_32unorm(format_desc, " ", expected, " expected\n"); - } - - return success; -} - - -static boolean -test_format_pack_z_32unorm(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH]; - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - unsigned i, j; - boolean success; - - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - unpacked[i][j] = test->unpacked[i][j][0] * 0xffffffff; - if (test->unpacked[i][j][1]) { - return TRUE; - } - } - } - - memset(packed, 0, sizeof packed); - - format_desc->pack_z_32unorm(packed, 0, - &unpacked[0][0], sizeof unpacked[0], - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.bits/8; ++i) - if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i])) - success = FALSE; - - if (!success) { - print_packed(format_desc, "FAILED: ", packed, " obtained\n"); - print_packed(format_desc, " ", test->packed, " expected\n"); - } - - return success; -} - - -static boolean -test_format_unpack_s_8uint(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } }; - uint8_t expected[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } }; - unsigned i, j; - boolean success; - - format_desc->unpack_s_8uint(&unpacked[0][0], sizeof unpacked[0], - test->packed, 0, - format_desc->block.width, format_desc->block.height); - - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - expected[i][j] = test->unpacked[i][j][1]; - } - } - - success = TRUE; - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - if (expected[i][j] != unpacked[i][j]) { - success = FALSE; - } - } - } - - if (!success) { - print_unpacked_s_8uint(format_desc, "FAILED: ", unpacked, " obtained\n"); - print_unpacked_s_8uint(format_desc, " ", expected, " expected\n"); - } - - return success; -} - - -static boolean -test_format_pack_s_8uint(const struct util_format_description *format_desc, - const struct util_format_test_case *test) -{ - uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH]; - uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; - unsigned i, j; - boolean success; - - for (i = 0; i < format_desc->block.height; ++i) { - for (j = 0; j < format_desc->block.width; ++j) { - unpacked[i][j] = test->unpacked[i][j][1]; - if (test->unpacked[i][j][0]) { - return TRUE; - } - } - } - - memset(packed, 0, sizeof packed); - - format_desc->pack_s_8uint(packed, 0, - &unpacked[0][0], sizeof unpacked[0], - format_desc->block.width, format_desc->block.height); - - success = TRUE; - for (i = 0; i < format_desc->block.bits/8; ++i) - if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i])) - success = FALSE; - - if (!success) { - print_packed(format_desc, "FAILED: ", packed, " obtained\n"); - print_packed(format_desc, " ", test->packed, " expected\n"); - } - - return success; -} - - -/* Touch-test that the unorm/snorm flags are set up right by codegen. */ -static boolean -test_format_norm_flags(const struct util_format_description *format_desc) -{ - boolean success = TRUE; - -#define FORMAT_CASE(format, unorm, snorm) \ - case format: \ - success = (format_desc->is_unorm == unorm && \ - format_desc->is_snorm == snorm); \ - break - - switch (format_desc->format) { - FORMAT_CASE(PIPE_FORMAT_R8G8B8A8_UNORM, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_R8G8B8A8_SRGB, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_R8G8B8A8_SNORM, FALSE, TRUE); - FORMAT_CASE(PIPE_FORMAT_R32_FLOAT, FALSE, FALSE); - FORMAT_CASE(PIPE_FORMAT_X8Z24_UNORM, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_S8X24_UINT, FALSE, FALSE); - FORMAT_CASE(PIPE_FORMAT_DXT1_RGB, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_ETC2_RGB8, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_ETC2_R11_SNORM, FALSE, TRUE); - FORMAT_CASE(PIPE_FORMAT_ASTC_4x4, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_BPTC_RGBA_UNORM, TRUE, FALSE); - FORMAT_CASE(PIPE_FORMAT_BPTC_RGB_FLOAT, FALSE, FALSE); - default: - success = !(format_desc->is_unorm && format_desc->is_snorm); - break; - } -#undef FORMAT_CASE - - if (!success) { - printf("FAILED: %s (unorm %s, snorm %s)\n", - format_desc->short_name, - format_desc->is_unorm ? "yes" : "no", - format_desc->is_snorm ? "yes" : "no"); - } - - return success; -} - -typedef boolean -(*test_func_t)(const struct util_format_description *format_desc, - const struct util_format_test_case *test); - - -static boolean -test_one_func(const struct util_format_description *format_desc, - test_func_t func, - const char *suffix) -{ - unsigned i; - boolean success = TRUE; - - printf("Testing util_format_%s_%s ...\n", - format_desc->short_name, suffix); - fflush(stdout); - - for (i = 0; i < util_format_nr_test_cases; ++i) { - const struct util_format_test_case *test = &util_format_test_cases[i]; - - if (test->format == format_desc->format) { - if (!func(format_desc, &util_format_test_cases[i])) { - success = FALSE; - } - } - } - - return success; -} - -static boolean -test_format_metadata(const struct util_format_description *format_desc, - boolean (*func)(const struct util_format_description *format_desc), - const char *suffix) -{ - boolean success = TRUE; - - printf("Testing util_format_%s_%s ...\n", format_desc->short_name, suffix); - fflush(stdout); - - if (!func(format_desc)) { - success = FALSE; - } - - return success; -} - -static boolean -test_all(void) -{ - enum pipe_format format; - boolean success = TRUE; - - for (format = 1; format < PIPE_FORMAT_COUNT; ++format) { - const struct util_format_description *format_desc; - - format_desc = util_format_description(format); - if (!format_desc) { - continue; - } - - assert(format_desc->block.bits <= UTIL_FORMAT_MAX_PACKED_BYTES * 8); - assert(format_desc->block.height <= UTIL_FORMAT_MAX_UNPACKED_HEIGHT); - assert(format_desc->block.width <= UTIL_FORMAT_MAX_UNPACKED_WIDTH); - -# define TEST_ONE_FUNC(name) \ - if (format_desc->name) { \ - if (!test_one_func(format_desc, &test_format_##name, #name)) { \ - success = FALSE; \ - } \ - } - -# define TEST_FORMAT_METADATA(name) \ - if (!test_format_metadata(format_desc, &test_format_##name, #name)) { \ - success = FALSE; \ - } \ - - TEST_ONE_FUNC(fetch_rgba_float); - TEST_ONE_FUNC(pack_rgba_float); - TEST_ONE_FUNC(unpack_rgba_float); - TEST_ONE_FUNC(pack_rgba_8unorm); - TEST_ONE_FUNC(unpack_rgba_8unorm); - - TEST_ONE_FUNC(unpack_z_32unorm); - TEST_ONE_FUNC(pack_z_32unorm); - TEST_ONE_FUNC(unpack_z_float); - TEST_ONE_FUNC(pack_z_float); - TEST_ONE_FUNC(unpack_s_8uint); - TEST_ONE_FUNC(pack_s_8uint); - - TEST_FORMAT_METADATA(norm_flags); - -# undef TEST_ONE_FUNC -# undef TEST_ONE_FORMAT - } - - return success; -} - - -int main(int argc, char **argv) -{ - boolean success; - - success = test_all(); - - return success ? 0 : 1; -} diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,7 @@ #include "amdgpu_cs.h" +#include "util/hash_table.h" #include "util/os_time.h" #include "util/u_hash_table.h" #include "state_tracker/drm_driver.h" @@ -35,10 +36,6 @@ #include #include -#ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID -#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) -#endif - #ifndef AMDGPU_VA_RANGE_HIGH #define AMDGPU_VA_RANGE_HIGH 0x2 #endif @@ -151,6 +148,12 @@ return ((struct amdgpu_winsys_bo*)buf)->initial_domain; } +static enum radeon_bo_flag amdgpu_bo_get_flags( + struct pb_buffer *buf) +{ + return ((struct amdgpu_winsys_bo*)buf)->flags; +} + static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo) { for (unsigned i = 0; i < bo->num_fences; ++i) @@ -164,6 +167,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + struct amdgpu_screen_winsys *sws_iter; struct amdgpu_winsys *ws = bo->ws; assert(bo->bo && "must not be called for slab entries"); @@ -176,11 +180,29 @@ if (ws->debug_all_bos) { simple_mtx_lock(&ws->global_bo_list_lock); - LIST_DEL(&bo->u.real.global_list_item); + list_del(&bo->u.real.global_list_item); ws->num_buffers--; simple_mtx_unlock(&ws->global_bo_list_lock); } + /* Close all KMS handles retrieved for other DRM file descriptions */ + simple_mtx_lock(&ws->sws_list_lock); + for (sws_iter = ws->sws_list; sws_iter; sws_iter = sws_iter->next) { + struct hash_entry *entry; + + if (!sws_iter->kms_handles) + continue; + + entry = _mesa_hash_table_search(sws_iter->kms_handles, bo); + if (entry) { + struct drm_gem_close args = { .handle = (uintptr_t)entry->data }; + + drmIoctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args); + _mesa_hash_table_remove(sws_iter->kms_handles, entry); + } + } + simple_mtx_unlock(&ws->sws_list_lock); + simple_mtx_lock(&ws->bo_export_table_lock); util_hash_table_remove(ws->bo_export_table, bo->bo); simple_mtx_unlock(&ws->bo_export_table_lock); @@ -414,7 +436,7 @@ if (ws->debug_all_bos) { simple_mtx_lock(&ws->global_bo_list_lock); - LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); + list_addtail(&bo->u.real.global_list_item, &ws->global_bo_list); ws->num_buffers++; simple_mtx_unlock(&ws->global_bo_list_lock); } @@ -495,9 +517,6 @@ request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; if (flags & RADEON_FLAG_GTT_WC) request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; - if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - ws->info.has_local_buffers) - request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID; if (ws->zero_all_vram_allocs && (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)) request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED; @@ -546,8 +565,8 @@ bo->va = va; bo->u.real.va_handle = va_handle; bo->initial_domain = initial_domain; + bo->flags = flags; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID); if (initial_domain & RADEON_DOMAIN_VRAM) ws->allocated_vram += align64(size, ws->info.gart_page_size); @@ -664,7 +683,7 @@ if (!slab->entries) goto fail_buffer; - LIST_INITHEAD(&slab->base.free); + list_inithead(&slab->base.free); base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries); @@ -692,7 +711,7 @@ assert(bo->u.slab.real->bo); } - LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); + list_addtail(&bo->u.slab.entry.head, &slab->base.free); } return &slab->base; @@ -961,7 +980,7 @@ fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r); } - while (!list_empty(&bo->u.sparse.backing)) { + while (!list_is_empty(&bo->u.sparse.backing)) { struct amdgpu_sparse_backing *dummy = NULL; sparse_free_backing_buffer(bo, container_of(bo->u.sparse.backing.next, @@ -1017,7 +1036,7 @@ if (!bo->u.sparse.commitments) goto error_alloc_commitments; - LIST_INITHEAD(&bo->u.sparse.backing); + list_inithead(&bo->u.sparse.backing); /* For simplicity, we always map a multiple of the page size. */ map_size = align64(size, RADEON_SPARSE_PAGE_SIZE); @@ -1204,6 +1223,9 @@ } } +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, struct radeon_bo_metadata *md) { @@ -1226,6 +1248,7 @@ md->u.gfx9.dcc_offset_256B = AMDGPU_TILING_GET(tiling_flags, DCC_OFFSET_256B); md->u.gfx9.dcc_pitch_max = AMDGPU_TILING_GET(tiling_flags, DCC_PITCH_MAX); md->u.gfx9.dcc_independent_64B = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B); + md->u.gfx9.scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT); } else { md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; @@ -1263,6 +1286,7 @@ tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256B); tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, md->u.gfx9.dcc_pitch_max); tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64B); + tiling_flags |= AMDGPU_TILING_SET(SCANOUT, md->u.gfx9.scanout); } else { if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ @@ -1408,9 +1432,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, - unsigned vm_alignment, - unsigned *stride, - unsigned *offset) + unsigned vm_alignment) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); struct amdgpu_winsys_bo *bo = NULL; @@ -1420,6 +1442,7 @@ amdgpu_va_handle va_handle = NULL; struct amdgpu_bo_info info = {0}; enum radeon_bo_domain initial = 0; + enum radeon_bo_flag flags = 0; int r; switch (whandle->type) { @@ -1433,11 +1456,6 @@ return NULL; } - if (stride) - *stride = whandle->stride; - if (offset) - *offset = whandle->offset; - r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result); if (r) return NULL; @@ -1485,6 +1503,10 @@ initial |= RADEON_DOMAIN_VRAM; if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT) initial |= RADEON_DOMAIN_GTT; + if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) + flags |= RADEON_FLAG_NO_CPU_ACCESS; + if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC) + flags |= RADEON_FLAG_GTT_WC; /* Initialize the structure. */ simple_mtx_init(&bo->lock, mtx_plain); @@ -1497,6 +1519,7 @@ bo->va = va; bo->u.real.va_handle = va_handle; bo->initial_domain = initial; + bo->flags = flags; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); bo->is_shared = true; @@ -1526,14 +1549,13 @@ static bool amdgpu_bo_get_handle(struct radeon_winsys *rws, struct pb_buffer *buffer, - unsigned stride, unsigned offset, - unsigned slice_size, struct winsys_handle *whandle) { struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws); struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer); struct amdgpu_winsys *ws = bo->ws; enum amdgpu_bo_handle_type type; + struct hash_entry *entry; int r; /* Don't allow exports of slab entries and sparse buffers. */ @@ -1547,6 +1569,23 @@ type = amdgpu_bo_handle_type_gem_flink_name; break; case WINSYS_HANDLE_TYPE_KMS: + if (sws->fd == ws->fd) { + whandle->handle = bo->u.real.kms_handle; + + if (bo->is_shared) + return true; + + goto hash_table_set; + } + + simple_mtx_lock(&ws->sws_list_lock); + entry = _mesa_hash_table_search(sws->kms_handles, bo); + simple_mtx_unlock(&ws->sws_list_lock); + if (entry) { + whandle->handle = (uintptr_t)entry->data; + return true; + } + /* Fall through */ case WINSYS_HANDLE_TYPE_FD: type = amdgpu_bo_handle_type_dma_buf_fd; break; @@ -1566,15 +1605,19 @@ if (r) return false; + + simple_mtx_lock(&ws->sws_list_lock); + _mesa_hash_table_insert_pre_hashed(sws->kms_handles, + bo->u.real.kms_handle, bo, + (void*)(uintptr_t)whandle->handle); + simple_mtx_unlock(&ws->sws_list_lock); } + hash_table_set: simple_mtx_lock(&ws->bo_export_table_lock); util_hash_table_set(ws->bo_export_table, bo->bo, bo); simple_mtx_unlock(&ws->bo_export_table_lock); - whandle->stride = stride; - whandle->offset = offset; - whandle->offset += slice_size * whandle->layer; bo->is_shared = true; return true; } @@ -1675,4 +1718,5 @@ ws->base.buffer_commit = amdgpu_bo_sparse_commit; ws->base.buffer_get_virtual_address = amdgpu_bo_get_va; ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain; + ws->base.buffer_get_flags = amdgpu_bo_get_flags; } diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h 2020-06-12 01:21:17.000000000 +0000 @@ -93,10 +93,10 @@ amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */ bool sparse; bool is_user_ptr; - bool is_local; uint32_t unique_id; uint64_t va; enum radeon_bo_domain initial_domain; + enum radeon_bo_flag flags; /* how many command streams is this bo referenced in? */ int num_cs_references; diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c 2020-06-12 01:21:17.000000000 +0000 @@ -349,32 +349,49 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; - uint32_t result, hangs; int r; /* Return a failure due to a GPU hang. */ - r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); - return PIPE_NO_RESET; - } + if (ctx->ws->info.drm_minor >= 24) { + uint64_t flags; - switch (result) { - case AMDGPU_CTX_GUILTY_RESET: - return PIPE_GUILTY_CONTEXT_RESET; - case AMDGPU_CTX_INNOCENT_RESET: - return PIPE_INNOCENT_CONTEXT_RESET; - case AMDGPU_CTX_UNKNOWN_RESET: - return PIPE_UNKNOWN_CONTEXT_RESET; - case AMDGPU_CTX_NO_RESET: - default: - /* Return a failure due to a rejected command submission. */ - if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) { - return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET : - PIPE_INNOCENT_CONTEXT_RESET; + r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); + return PIPE_NO_RESET; + } + + if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) { + if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY) + return PIPE_GUILTY_CONTEXT_RESET; + else + return PIPE_INNOCENT_CONTEXT_RESET; } - return PIPE_NO_RESET; + } else { + uint32_t result, hangs; + + r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); + return PIPE_NO_RESET; + } + + switch (result) { + case AMDGPU_CTX_GUILTY_RESET: + return PIPE_GUILTY_CONTEXT_RESET; + case AMDGPU_CTX_INNOCENT_RESET: + return PIPE_INNOCENT_CONTEXT_RESET; + case AMDGPU_CTX_UNKNOWN_RESET: + return PIPE_UNKNOWN_CONTEXT_RESET; + } + } + + /* Return a failure due to a rejected command submission. */ + if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) { + return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET : + PIPE_INNOCENT_CONTEXT_RESET; } + return PIPE_NO_RESET; } /* COMMAND SUBMISSION */ @@ -1159,17 +1176,20 @@ amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence); } -/* TODO: recognizing dependencies as no-ops doesn't take the parallel - * compute IB into account. The compute IB won't wait for these. - * Also, the scheduler can execute compute and SDMA IBs on any rings. - * Should we always insert dependencies? - */ static bool is_noop_fence_dependency(struct amdgpu_cs *acs, struct amdgpu_fence *fence) { struct amdgpu_cs_context *cs = acs->csc; - if (!amdgpu_fence_is_syncobj(fence) && + /* Detect no-op dependencies only when there is only 1 ring, + * because IBs on one ring are always executed one at a time. + * + * We always want no dependency between back-to-back gfx IBs, because + * we need the parallelism between IBs for good performance. + */ + if ((acs->ring_type == RING_GFX || + acs->ctx->ws->info.num_rings[acs->ring_type] == 1) && + !amdgpu_fence_is_syncobj(fence) && fence->ctx == acs->ctx && fence->fence.ip_type == cs->ib[IB_MAIN].ip_type && fence->fence.ip_instance == cs->ib[IB_MAIN].ip_instance && @@ -1378,9 +1398,6 @@ simple_mtx_lock(&ws->global_bo_list_lock); LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) { - if (bo->is_local) - continue; - list[num_handles].bo_handle = bo->u.real.kms_handle; list[num_handles].bo_priority = 0; ++num_handles; @@ -1405,10 +1422,6 @@ unsigned num_handles = 0; for (i = 0; i < cs->num_real_buffers; ++i) { struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i]; - - if (buffer->bo->is_local) - continue; - assert(buffer->u.real.priority_usage != 0); list[num_handles].bo_handle = buffer->bo->u.real.kms_handle; @@ -1663,9 +1676,6 @@ if (ws->info.chip_class <= GFX6) { while (rcs->current.cdw & 7) radeon_emit(rcs, 0xf0000000); /* NOP packet */ - } else { - while (rcs->current.cdw & 7) - radeon_emit(rcs, 0x00000000); /* NOP packet */ } break; case RING_GFX: @@ -1756,7 +1766,7 @@ /* Submit. */ util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, - amdgpu_cs_submit_ib, NULL); + amdgpu_cs_submit_ib, NULL, 0); /* The submission has been queued, unlock the fence now. */ simple_mtx_unlock(&ws->bo_fence_lock); diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ */ #include "amdgpu_winsys.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static int amdgpu_surface_sanity(const struct pipe_resource *tex) { diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -30,6 +30,8 @@ #include "amdgpu_cs.h" #include "amdgpu_public.h" +#include "util/os_file.h" +#include "util/os_misc.h" #include "util/u_cpu_detect.h" #include "util/u_hash_table.h" #include "util/hash_table.h" @@ -39,8 +41,8 @@ #include #include #include -#include "amd/common/ac_llvm_util.h" -#include "amd/common/sid.h" +#include "ac_llvm_util.h" +#include "sid.h" #ifndef AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS #define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E @@ -138,6 +140,7 @@ } pb_cache_deinit(&ws->bo_cache); util_hash_table_destroy(ws->bo_export_table); + simple_mtx_destroy(&ws->sws_list_lock); simple_mtx_destroy(&ws->global_bo_list_lock); simple_mtx_destroy(&ws->bo_export_table_lock); @@ -278,11 +281,41 @@ static bool amdgpu_winsys_unref(struct radeon_winsys *rws) { - /* radeon_winsys corresponds to amdgpu_screen_winsys, which is never - * referenced multiple times, so amdgpu_winsys_destroy always needs to be - * called. It handles reference counting for amdgpu_winsys. - */ - return true; + struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws); + struct amdgpu_winsys *aws = sws->aws; + bool ret; + + simple_mtx_lock(&aws->sws_list_lock); + + ret = pipe_reference(&sws->reference, NULL); + if (ret) { + struct amdgpu_screen_winsys **sws_iter; + struct amdgpu_winsys *aws = sws->aws; + + /* Remove this amdgpu_screen_winsys from amdgpu_winsys' list, so that + * amdgpu_winsys_create can't re-use it anymore + */ + for (sws_iter = &aws->sws_list; *sws_iter; sws_iter = &(*sws_iter)->next) { + if (*sws_iter == sws) { + *sws_iter = sws->next; + break; + } + } + } + + simple_mtx_unlock(&aws->sws_list_lock); + + if (ret && sws->kms_handles) { + struct drm_gem_close args; + + hash_table_foreach(sws->kms_handles, entry) { + args.handle = (uintptr_t)entry->data; + drmIoctl(sws->fd, DRM_IOCTL_GEM_CLOSE, &args); + } + _mesa_hash_table_destroy(sws->kms_handles, NULL); + } + + return ret; } static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws, @@ -294,6 +327,18 @@ util_cpu_caps.cores_per_L3); } +static uint32_t kms_handle_hash(const void *key) +{ + const struct amdgpu_winsys_bo *bo = key; + + return bo->u.real.kms_handle; +} + +static bool kms_handle_equals(const void *a, const void *b) +{ + return a == b; +} + PUBLIC struct radeon_winsys * amdgpu_winsys_create(int fd, const struct pipe_screen_config *config, radeon_screen_create_t screen_create) @@ -307,6 +352,7 @@ if (!ws) return NULL; + pipe_reference_init(&ws->reference, 1); ws->fd = fcntl(fd, F_DUPFD_CLOEXEC, 0); /* Look up the winsys from the dev table. */ @@ -316,7 +362,7 @@ /* Initialize the amdgpu device. This should always return the same pointer * for the same fd. */ - r = amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev); + r = amdgpu_device_initialize(ws->fd, &drm_major, &drm_minor, &dev); if (r) { fprintf(stderr, "amdgpu: amdgpu_device_initialize failed.\n"); goto fail; @@ -325,13 +371,45 @@ /* Lookup a winsys if we have already created one for this device. */ aws = util_hash_table_get(dev_tab, dev); if (aws) { - pipe_reference(NULL, &aws->reference); + struct amdgpu_screen_winsys *sws_iter; /* Release the device handle, because we don't need it anymore. * This function is returning an existing winsys instance, which * has its own device handle. */ amdgpu_device_deinitialize(dev); + + simple_mtx_lock(&aws->sws_list_lock); + for (sws_iter = aws->sws_list; sws_iter; sws_iter = sws_iter->next) { + r = os_same_file_description(sws_iter->fd, ws->fd); + + if (r == 0) { + close(ws->fd); + FREE(ws); + ws = sws_iter; + pipe_reference(NULL, &ws->reference); + simple_mtx_unlock(&aws->sws_list_lock); + goto unlock; + } else if (r < 0) { + static bool logged; + + if (!logged) { + os_log_message("amdgpu: os_same_file_description couldn't " + "determine if two DRM fds reference the same " + "file description.\n" + "If they do, bad things may happen!\n"); + logged = true; + } + } + } + simple_mtx_unlock(&aws->sws_list_lock); + + ws->kms_handles = _mesa_hash_table_create(NULL, kms_handle_hash, + kms_handle_equals); + if (!ws->kms_handles) + goto fail; + + pipe_reference(NULL, &aws->reference); } else { /* Create a new winsys. */ aws = CALLOC_STRUCT(amdgpu_winsys); @@ -339,6 +417,7 @@ goto fail; aws->dev = dev; + aws->fd = ws->fd; aws->info.drm_major = drm_major; aws->info.drm_minor = drm_minor; @@ -382,9 +461,10 @@ /* init reference */ pipe_reference_init(&aws->reference, 1); - LIST_INITHEAD(&aws->global_bo_list); + list_inithead(&aws->global_bo_list); aws->bo_export_table = util_hash_table_create(hash_pointer, compare_pointers); + (void) simple_mtx_init(&aws->sws_list_lock, mtx_plain); (void) simple_mtx_init(&aws->global_bo_list_lock, mtx_plain); (void) simple_mtx_init(&aws->bo_fence_lock, mtx_plain); (void) simple_mtx_init(&aws->bo_export_table_lock, mtx_plain); @@ -435,6 +515,12 @@ return NULL; } + simple_mtx_lock(&aws->sws_list_lock); + ws->next = aws->sws_list; + aws->sws_list = ws; + simple_mtx_unlock(&aws->sws_list_lock); + +unlock: /* We must unlock the mutex once the winsys is fully initialized, so that * other threads attempting to create the winsys from the same fd will * get a fully initialized winsys and not just half-way initialized. */ @@ -445,6 +531,8 @@ fail_alloc: FREE(aws); fail: + if (ws->kms_handles) + _mesa_hash_table_destroy(ws->kms_handles, NULL); close(ws->fd); FREE(ws); simple_mtx_unlock(&dev_tab_mutex); diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h 2020-06-12 01:21:17.000000000 +0000 @@ -42,6 +42,10 @@ struct amdgpu_winsys { struct pipe_reference reference; + + /* File descriptor which was passed to amdgpu_device_initialize */ + int fd; + struct pb_cache bo_cache; /* Each slab buffer can only contain suballocations of equal sizes, so we @@ -87,6 +91,12 @@ struct list_head global_bo_list; unsigned num_buffers; + /* Single-linked list of all structs amdgpu_screen_winsys referencing this + * struct amdgpu_winsys + */ + simple_mtx_t sws_list_lock; + struct amdgpu_screen_winsys *sws_list; + /* For returning the same amdgpu_winsys_bo instance for exported * and re-imported buffers. */ struct util_hash_table *bo_export_table; @@ -97,6 +107,13 @@ struct radeon_winsys base; struct amdgpu_winsys *aws; int fd; + struct pipe_reference reference; + struct amdgpu_screen_winsys *next; + + /* Maps a BO to its KMS handle valid for this DRM file descriptor + * Protected by amdgpu_winsys::sws_list_lock + */ + struct hash_table *kms_handles; }; static inline struct amdgpu_screen_winsys * diff -Nru mesa-19.2.8/src/gallium/winsys/amdgpu/drm/meson.build mesa-20.0.8/src/gallium/winsys/amdgpu/drm/meson.build --- mesa-19.2.8/src/gallium/winsys/amdgpu/drm/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/amdgpu/drm/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,7 @@ ), include_directories : [ inc_amd, inc_gallium, inc_gallium_aux, inc_include, inc_src, + inc_amd_common, inc_amd_common_llvm, ], c_args : [c_vis_args], cpp_args : [cpp_vis_args], diff -Nru mesa-19.2.8/src/gallium/winsys/etnaviv/drm/Android.mk mesa-20.0.8/src/gallium/winsys/etnaviv/drm/Android.mk --- mesa-19.2.8/src/gallium/winsys/etnaviv/drm/Android.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/etnaviv/drm/Android.mk 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_SHARED_LIBRARIES := libdrm_etnaviv +LOCAL_STATIC_LIBRARIES := libmesa_nir libetnaviv_drm LOCAL_MODULE := libmesa_winsys_etnaviv diff -Nru mesa-19.2.8/src/gallium/winsys/freedreno/drm/freedreno_drm_winsys.c mesa-20.0.8/src/gallium/winsys/freedreno/drm/freedreno_drm_winsys.c --- mesa-19.2.8/src/gallium/winsys/freedreno/drm/freedreno_drm_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/freedreno/drm/freedreno_drm_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_hash_table.h" diff -Nru mesa-19.2.8/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c mesa-20.0.8/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c --- mesa-19.2.8/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -3,7 +3,7 @@ #include #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/u_hash_table.h" diff -Nru mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_bo.c mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_bo.c --- mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_bo.c 2020-06-12 01:21:17.000000000 +0000 @@ -295,7 +295,7 @@ if ((va + size) == heap->start) { heap->start = va; /* Delete uppermost hole if it reaches the new top */ - if (!LIST_IS_EMPTY(&heap->holes)) { + if (!list_is_empty(&heap->holes)) { hole = container_of(heap->holes.next, hole, list); if ((hole->offset + hole->size) == va) { heap->start = hole->offset; @@ -796,7 +796,7 @@ if (!slab->entries) goto fail_buffer; - LIST_INITHEAD(&slab->base.free); + list_inithead(&slab->base.free); base_hash = __sync_fetch_and_add(&ws->next_bo_hash, slab->base.num_entries); @@ -815,7 +815,7 @@ bo->u.slab.entry.group_index = group_index; bo->u.slab.real = slab->buffer; - LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); + list_addtail(&bo->u.slab.entry.head, &slab->base.free); } return &slab->base; @@ -1134,9 +1134,7 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, - unsigned vm_alignment, - unsigned *stride, - unsigned *offset) + unsigned vm_alignment) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); struct radeon_bo *bo; @@ -1144,12 +1142,6 @@ unsigned handle; uint64_t size = 0; - if (!offset && whandle->offset != 0) { - fprintf(stderr, "attempt to import unsupported winsys offset %u\n", - whandle->offset); - return NULL; - } - /* We must maintain a list of pairs , so that we always return * the same BO for one particular handle. If we didn't do that and created * more than one BO for the same handle and then relocated them in a CS, @@ -1232,11 +1224,6 @@ done: mtx_unlock(&ws->bo_handles_mutex); - if (stride) - *stride = whandle->stride; - if (offset) - *offset = whandle->offset; - if (ws->info.r600_has_virtual_memory && !bo->va) { struct drm_radeon_gem_va va; @@ -1287,8 +1274,6 @@ static bool radeon_winsys_bo_get_handle(struct radeon_winsys *rws, struct pb_buffer *buffer, - unsigned stride, unsigned offset, - unsigned slice_size, struct winsys_handle *whandle) { struct drm_gem_flink flink; @@ -1325,10 +1310,6 @@ return false; } - whandle->stride = stride; - whandle->offset = offset; - whandle->offset += slice_size * whandle->layer; - return true; } diff -Nru mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_cs.c mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_cs.c --- mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_cs.c 2020-06-12 01:21:17.000000000 +0000 @@ -697,7 +697,7 @@ if (util_queue_is_initialized(&cs->ws->cs_queue)) { util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed, - radeon_drm_cs_emit_ioctl_oneshot, NULL); + radeon_drm_cs_emit_ioctl_oneshot, NULL, 0); if (!(flags & PIPE_FLUSH_ASYNC)) radeon_drm_cs_sync_flush(rcs); } else { diff -Nru mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_surface.c mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_surface.c --- mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ */ #include "radeon_drm_winsys.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include static unsigned cik_get_macro_tile_index(struct radeon_surf *surf) @@ -280,6 +280,72 @@ surf->cmask_size = align(slice_bytes, base_align) * num_layers; } +static void si_compute_htile(const struct radeon_info *info, + struct radeon_surf *surf, unsigned num_layers) +{ + unsigned cl_width, cl_height, width, height; + unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; + unsigned num_pipes = info->num_tile_pipes; + + surf->htile_size = 0; + + if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER) || + surf->flags & RADEON_SURF_NO_HTILE) + return; + + if (surf->u.legacy.level[0].mode == RADEON_SURF_MODE_1D && + !info->htile_cmask_support_1d_tiling) + return; + + /* Overalign HTILE on P2 configs to work around GPU hangs in + * piglit/depthstencil-render-miplevels 585. + * + * This has been confirmed to help Kabini & Stoney, where the hangs + * are always reproducible. I think I have seen the test hang + * on Carrizo too, though it was very rare there. + */ + if (info->chip_class >= GFX7 && num_pipes < 4) + num_pipes = 4; + + switch (num_pipes) { + case 1: + cl_width = 32; + cl_height = 16; + break; + case 2: + cl_width = 32; + cl_height = 32; + break; + case 4: + cl_width = 64; + cl_height = 32; + break; + case 8: + cl_width = 64; + cl_height = 64; + break; + case 16: + cl_width = 128; + cl_height = 64; + break; + default: + assert(0); + return; + } + + width = align(surf->u.legacy.level[0].nblk_x, cl_width * 8); + height = align(surf->u.legacy.level[0].nblk_y, cl_height * 8); + + slice_elements = (width * height) / (8 * 8); + slice_bytes = slice_elements * 4; + + pipe_interleave_bytes = info->pipe_interleave_bytes; + base_align = num_pipes * pipe_interleave_bytes; + + surf->htile_alignment = base_align; + surf->htile_size = num_layers * align(slice_bytes, base_align); +} + static int radeon_winsys_surface_init(struct radeon_winsys *rws, const struct pipe_resource *tex, unsigned flags, unsigned bpe, @@ -307,7 +373,7 @@ /* Compute FMASK. */ if (ws->gen == DRV_SI && tex->nr_samples >= 2 && - !(flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))) { + !(flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK | RADEON_SURF_NO_FMASK))) { /* FMASK is allocated like an ordinary texture. */ struct pipe_resource templ = *tex; struct radeon_surf fmask = {}; @@ -351,7 +417,8 @@ surf_ws->u.legacy.fmask.pitch_in_pixels = fmask.u.legacy.level[0].nblk_x; } - if (ws->gen == DRV_SI) { + if (ws->gen == DRV_SI && + (tex->nr_samples <= 1 || surf_ws->fmask_size)) { struct ac_surf_config config; /* Only these fields need to be set for the CMASK computation. */ @@ -364,6 +431,31 @@ si_compute_cmask(&ws->info, &config, surf_ws); } + + if (ws->gen == DRV_SI) { + si_compute_htile(&ws->info, surf_ws, util_num_layers(tex, 0)); + + /* Determine the memory layout of multiple allocations in one buffer. */ + surf_ws->total_size = surf_ws->surf_size; + + if (surf_ws->htile_size) { + surf_ws->htile_offset = align64(surf_ws->total_size, surf_ws->htile_alignment); + surf_ws->total_size = surf_ws->htile_offset + surf_ws->htile_size; + } + + if (surf_ws->fmask_size) { + assert(tex->nr_samples >= 2); + surf_ws->fmask_offset = align64(surf_ws->total_size, surf_ws->fmask_alignment); + surf_ws->total_size = surf_ws->fmask_offset + surf_ws->fmask_size; + } + + /* Single-sample CMASK is in a separate buffer. */ + if (surf_ws->cmask_size && tex->nr_samples >= 2) { + surf_ws->cmask_offset = align64(surf_ws->total_size, surf_ws->cmask_alignment); + surf_ws->total_size = surf_ws->cmask_offset + surf_ws->cmask_size; + } + } + return 0; } diff -Nru mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c --- mesa-19.2.8/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -303,11 +303,12 @@ ws->info.has_dedicated_vram = true; } + ws->info.num_rings[RING_GFX] = 1; /* Check for dma */ - ws->info.num_sdma_rings = 0; + ws->info.num_rings[RING_DMA] = 0; /* DMA is disabled on R700. There is IB corruption and hangs. */ if (ws->info.chip_class >= EVERGREEN && ws->info.drm_minor >= 27) { - ws->info.num_sdma_rings = 1; + ws->info.num_rings[RING_DMA] = 1; } /* Check for UVD and VCE */ @@ -316,16 +317,20 @@ if (ws->info.drm_minor >= 32) { uint32_t value = RADEON_CS_RING_UVD; if (radeon_get_drm_value(ws->fd, RADEON_INFO_RING_WORKING, - "UVD Ring working", &value)) + "UVD Ring working", &value)) { ws->info.has_hw_decode = value; + ws->info.num_rings[RING_UVD] = 1; + } value = RADEON_CS_RING_VCE; if (radeon_get_drm_value(ws->fd, RADEON_INFO_RING_WORKING, NULL, &value) && value) { if (radeon_get_drm_value(ws->fd, RADEON_INFO_VCE_FW_VERSION, - "VCE FW version", &value)) + "VCE FW version", &value)) { ws->info.vce_fw_version = value; + ws->info.num_rings[RING_VCE] = 1; + } } } @@ -588,6 +593,10 @@ ws->info.has_read_registers_query = ws->info.drm_minor >= 42; ws->info.max_alignment = 1024*1024; ws->info.has_graphics = true; + ws->info.cpdma_prefetch_writes_memory = true; + ws->info.max_wave64_per_simd = 10; + ws->info.num_physical_sgprs_per_simd = 512; + ws->info.num_physical_wave64_vgprs_per_simd = 256; ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL || strstr(debug_get_option("AMD_DEBUG", ""), "check_vm") != NULL; @@ -937,6 +946,7 @@ /* TTM aligns the BO size to the CPU page size */ ws->info.gart_page_size = sysconf(_SC_PAGESIZE); + ws->info.pte_fragment_size = 64 * 1024; /* GPUVM page size */ if (ws->num_cpus > 1 && debug_get_option_thread()) util_queue_init(&ws->cs_queue, "rcs", 8, 1, 0); diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c mesa-20.0.8/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c 2020-06-12 01:21:17.000000000 +0000 @@ -214,7 +214,7 @@ assert(!fenced_buf->fence); assert(fenced_buf->head.prev); assert(fenced_buf->head.next); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_unfenced); --fenced_mgr->num_unfenced; @@ -239,10 +239,10 @@ p_atomic_inc(&fenced_buf->base.reference.count); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_unfenced); --fenced_mgr->num_unfenced; - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->fenced); + list_addtail(&fenced_buf->head, &fenced_mgr->fenced); ++fenced_mgr->num_fenced; } @@ -268,11 +268,11 @@ assert(fenced_buf->head.prev); assert(fenced_buf->head.next); - LIST_DEL(&fenced_buf->head); + list_del(&fenced_buf->head); assert(fenced_mgr->num_fenced); --fenced_mgr->num_fenced; - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + list_addtail(&fenced_buf->head, &fenced_mgr->unfenced); ++fenced_mgr->num_unfenced; if (p_atomic_dec_zero(&fenced_buf->base.reference.count)) { @@ -756,7 +756,7 @@ assert(fenced_buf->buffer); - LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced); + list_addtail(&fenced_buf->head, &fenced_mgr->unfenced); ++fenced_mgr->num_unfenced; mtx_unlock(&fenced_mgr->mutex); @@ -835,10 +835,10 @@ fenced_mgr->provider = provider; fenced_mgr->ops = ops; - LIST_INITHEAD(&fenced_mgr->fenced); + list_inithead(&fenced_mgr->fenced); fenced_mgr->num_fenced = 0; - LIST_INITHEAD(&fenced_mgr->unfenced); + list_inithead(&fenced_mgr->unfenced); fenced_mgr->num_unfenced = 0; (void) mtx_init(&fenced_mgr->mutex, mtx_plain); diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_fence.c mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_fence.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_fence.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_fence.c 2020-06-12 01:21:17.000000000 +0000 @@ -107,7 +107,7 @@ mtx_lock(&ops->mutex); LIST_FOR_EACH_ENTRY_SAFE(fence, n, &ops->not_signaled, ops_list) - LIST_DELINIT(&fence->ops_list); + list_delinit(&fence->ops_list); mtx_unlock(&ops->mutex); } @@ -150,7 +150,7 @@ break; p_atomic_set(&fence->signalled, 1); - LIST_DELINIT(&fence->ops_list); + list_delinit(&fence->ops_list); } ops->last_signaled = signaled; ops->last_emitted = emitted; @@ -215,10 +215,10 @@ if (vmw_fence_seq_is_signaled(seqno, ops->last_signaled, seqno)) { p_atomic_set(&fence->signalled, 1); - LIST_INITHEAD(&fence->ops_list); + list_inithead(&fence->ops_list); } else { p_atomic_set(&fence->signalled, 0); - LIST_ADDTAIL(&fence->ops_list, &ops->not_signaled); + list_addtail(&fence->ops_list, &ops->not_signaled); } mtx_unlock(&ops->mutex); @@ -264,7 +264,7 @@ vmw_ioctl_fence_unref(vws, vfence->handle); mtx_lock(&ops->mutex); - LIST_DELINIT(&vfence->ops_list); + list_delinit(&vfence->ops_list); mtx_unlock(&ops->mutex); } @@ -486,7 +486,7 @@ return NULL; (void) mtx_init(&ops->mutex, mtx_plain); - LIST_INITHEAD(&ops->not_signaled); + list_inithead(&ops->not_signaled); ops->base.destroy = &vmw_fence_ops_destroy; ops->base.fence_reference = &vmw_fence_ops_fence_reference; ops->base.fence_signalled = &vmw_fence_ops_fence_signalled; diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmwgfx_drm.h mesa-20.0.8/src/gallium/winsys/svga/drm/vmwgfx_drm.h --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmwgfx_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmwgfx_drm.h 2020-06-12 01:21:17.000000000 +0000 @@ -71,6 +71,7 @@ #define DRM_VMW_CREATE_EXTENDED_CONTEXT 26 #define DRM_VMW_GB_SURFACE_CREATE_EXT 27 #define DRM_VMW_GB_SURFACE_REF_EXT 28 +#define DRM_VMW_MSG 29 /*************************************************************************/ /** @@ -1213,6 +1214,22 @@ struct drm_vmw_surface_arg req; }; +/** + * struct drm_vmw_msg_arg + * + * @send: Pointer to user-space msg string (null terminated). + * @receive: Pointer to user-space receive buffer. + * @send_only: Boolean whether this is only sending or receiving too. + * + * Argument to the DRM_VMW_MSG ioctl. + */ +struct drm_vmw_msg_arg { + __u64 send; + __u64 receive; + __s32 send_only; + __u32 receive_len; +}; + #if defined(__cplusplus) } #endif diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_msg.c mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_msg.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_msg.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_msg.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,6 +31,9 @@ #include "pipe/p_defines.h" #include "svga_winsys.h" #include "vmw_msg.h" +#include "vmwgfx_drm.h" +#include "vmw_screen.h" +#include "xf86drm.h" #define MESSAGE_STATUS_SUCCESS 0x0001 @@ -424,6 +427,7 @@ vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log) { struct rpc_channel channel; + struct vmw_winsys_screen *vws = vmw_winsys_screen(sws); char *msg; int msg_len; int ret; @@ -444,9 +448,21 @@ sprintf(msg, "log %s", log); - if (!(ret = vmw_open_channel(&channel, RPCI_PROTOCOL_NUM))) { - ret = vmw_send_msg(&channel, msg); - vmw_close_channel(&channel); + if (vws->ioctl.have_drm_2_17) { + struct drm_vmw_msg_arg msg_arg; + + memset(&msg_arg, 0, sizeof(msg_arg)); + msg_arg.send = (uint64_t) (unsigned long) (msg); + msg_arg.send_only = 1; + + ret = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_MSG, + &msg_arg, sizeof(msg_arg)); + + } else { + if (!(ret = vmw_open_channel(&channel, RPCI_PROTOCOL_NUM))) { + ret = vmw_send_msg(&channel, msg); + vmw_close_channel(&channel); + } } if (ret) diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen_dri.c mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen_dri.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen_dri.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen_dri.c 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ #include "pipe/p_compiler.h" #include "util/u_inlines.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "vmw_context.h" #include "vmw_screen.h" diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen.h mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen.h --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen.h 2020-06-12 01:21:17.000000000 +0000 @@ -78,6 +78,8 @@ boolean have_drm_2_9; uint32_t drm_execbuf_version; boolean have_drm_2_15; + boolean have_drm_2_16; + boolean have_drm_2_17; } ioctl; struct { diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c 2020-06-12 01:21:17.000000000 +0000 @@ -694,6 +694,7 @@ return NULL; } + (void) madvise(map, region->size, MADV_HUGEPAGE); region->data = map; } @@ -973,7 +974,6 @@ drmVersionPtr version; boolean drm_gb_capable; boolean have_drm_2_5; - boolean have_drm_2_16; const char *getenv_val; VMW_FUNC; @@ -990,8 +990,10 @@ (version->version_major == 2 && version->version_minor > 8); vws->ioctl.have_drm_2_15 = version->version_major > 2 || (version->version_major == 2 && version->version_minor > 14); - have_drm_2_16 = version->version_major > 2 || + vws->ioctl.have_drm_2_16 = version->version_major > 2 || (version->version_major == 2 && version->version_minor > 15); + vws->ioctl.have_drm_2_17 = version->version_major > 2 || + (version->version_major == 2 && version->version_minor > 16); vws->ioctl.drm_execbuf_version = vws->ioctl.have_drm_2_9 ? 2 : 1; @@ -1116,7 +1118,7 @@ else vws->ioctl.num_cap_3d = SVGA3D_DEVCAP_MAX; - if (have_drm_2_16) { + if (vws->ioctl.have_drm_2_16) { vws->base.have_coherent = TRUE; getenv_val = getenv("SVGA_FORCE_COHERENT"); if (getenv_val && strcmp(getenv_val, "0") != 0) diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_surface.c mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_surface.c --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_surface.c 2020-06-12 01:21:17.000000000 +0000 @@ -38,7 +38,8 @@ void * vmw_svga_winsys_surface_map(struct svga_winsys_context *swc, struct svga_winsys_surface *srf, - unsigned flags, boolean *retry) + unsigned flags, boolean *retry, + boolean *rebind) { struct vmw_svga_winsys_surface *vsrf = vmw_svga_winsys_surface(srf); void *data = NULL; @@ -47,6 +48,7 @@ struct vmw_winsys_screen *vws = vsrf->screen; *retry = FALSE; + *rebind = FALSE; assert((flags & (PIPE_TRANSFER_READ | PIPE_TRANSFER_WRITE)) != 0); mtx_lock(&vsrf->mutex); @@ -121,6 +123,12 @@ if (vsrf->buf) vmw_svga_winsys_buffer_destroy(&vws->base, vsrf->buf); vsrf->buf = vbuf; + + /* Rebind persistent maps immediately */ + if (flags & PIPE_TRANSFER_PERSISTENT) { + *rebind = TRUE; + vsrf->rebind = FALSE; + } goto out_mapped; } else vmw_svga_winsys_buffer_destroy(&vws->base, vbuf); diff -Nru mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_surface.h mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_surface.h --- mesa-19.2.8/src/gallium/winsys/svga/drm/vmw_surface.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/svga/drm/vmw_surface.h 2020-06-12 01:21:17.000000000 +0000 @@ -88,8 +88,9 @@ struct vmw_svga_winsys_surface *src); void * vmw_svga_winsys_surface_map(struct svga_winsys_context *swc, - struct svga_winsys_surface *srf, - unsigned flags, boolean *retry); + struct svga_winsys_surface *srf, + unsigned flags, boolean *retry, + boolean *rebind); void vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc, struct svga_winsys_surface *srf, diff -Nru mesa-19.2.8/src/gallium/winsys/sw/dri/dri_sw_winsys.c mesa-20.0.8/src/gallium/winsys/sw/dri/dri_sw_winsys.c --- mesa-19.2.8/src/gallium/winsys/sw/dri/dri_sw_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/dri/dri_sw_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -35,7 +35,7 @@ #include "pipe/p_format.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c mesa-20.0.8/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c --- mesa-19.2.8/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -39,7 +39,7 @@ #include "pipe/p_format.h" #include "pipe/p_context.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "state_tracker/sw_winsys.h" diff -Nru mesa-19.2.8/src/gallium/winsys/sw/gdi/meson.build mesa-20.0.8/src/gallium/winsys/sw/gdi/meson.build --- mesa-19.2.8/src/gallium/winsys/sw/gdi/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/gdi/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,27 @@ +# Copyright © 2018 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libwsgdi = static_library( + 'wsgdi', + 'gdi_sw_winsys.c', + include_directories : [ + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_gallium_drivers, + ], +) diff -Nru mesa-19.2.8/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c mesa-20.0.8/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c --- mesa-19.2.8/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #include "pipe/p_defines.h" #include "pipe/p_format.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "state_tracker/st_api.h" diff -Nru mesa-19.2.8/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c mesa-20.0.8/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c --- mesa-19.2.8/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,7 @@ #include "pipe/p_format.h" #include "pipe/p_state.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/list.h" diff -Nru mesa-19.2.8/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c mesa-20.0.8/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c --- mesa-19.2.8/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -35,7 +35,7 @@ #include "pipe/p_format.h" #include "pipe/p_context.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/common/virgl_resource_cache.c mesa-20.0.8/src/gallium/winsys/virgl/common/virgl_resource_cache.c --- mesa-19.2.8/src/gallium/winsys/virgl/common/virgl_resource_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/common/virgl_resource_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,7 @@ virgl_resource_cache_entry_release(struct virgl_resource_cache *cache, struct virgl_resource_cache_entry *entry) { - LIST_DEL(&entry->head); + list_del(&entry->head); cache->entry_release_func(entry, cache->user_data); } @@ -70,7 +70,7 @@ virgl_resource_cache_entry_release_func destroy_func, void *user_data) { - LIST_INITHEAD(&cache->resources); + list_inithead(&cache->resources); cache->timeout_usecs = timeout_usecs; cache->entry_is_busy_func = is_busy_func; cache->entry_release_func = destroy_func; @@ -91,7 +91,7 @@ entry->timeout_start = now; entry->timeout_end = entry->timeout_start + cache->timeout_usecs; - LIST_ADDTAIL(&entry->head, &cache->resources); + list_addtail(&entry->head, &cache->resources); } struct virgl_resource_cache_entry * @@ -135,7 +135,7 @@ } if (compat_entry) - LIST_DEL(&compat_entry->head); + list_del(&compat_entry->head); return compat_entry; } diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c mesa-20.0.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c --- mesa-19.2.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #include "os/os_mman.h" #include "util/os_time.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_hash_table.h" #include "util/u_inlines.h" #include "state_tracker/drm_driver.h" @@ -40,7 +40,7 @@ #include #include -#include "virtgpu_drm.h" +#include "drm-uapi/virtgpu_drm.h" #include "virgl_drm_winsys.h" #include "virgl_drm_public.h" @@ -186,12 +186,10 @@ } res->bind = bind; - res->format = format; res->res_handle = createcmd.res_handle; res->bo_handle = createcmd.bo_handle; res->size = size; - res->stride = stride; pipe_reference_init(&res->reference, 1); p_atomic_set(&res->external, false); p_atomic_set(&res->num_cs_references, 0); @@ -303,7 +301,11 @@ static struct virgl_hw_res * virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + uint32_t *plane, + uint32_t *stride, + uint32_t *plane_offset, + uint64_t *modifier) { struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws); struct drm_gem_open open_arg = {}; @@ -311,10 +313,15 @@ struct virgl_hw_res *res = NULL; uint32_t handle = whandle->handle; - if (whandle->offset != 0) { - fprintf(stderr, "attempt to import unsupported winsys offset %u\n", - whandle->offset); + if (whandle->offset != 0 && whandle->type == WINSYS_HANDLE_TYPE_SHARED) { + _debug_printf("attempt to import unsupported winsys offset %u\n", + whandle->offset); return NULL; + } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) { + *plane = whandle->plane; + *stride = whandle->stride; + *plane_offset = whandle->offset; + *modifier = whandle->modifier; } mtx_lock(&qdws->bo_handles_mutex); @@ -375,7 +382,6 @@ res->res_handle = info_arg.res_handle; res->size = info_arg.size; - res->stride = info_arg.stride; pipe_reference_init(&res->reference, 1); p_atomic_set(&res->external, true); res->num_cs_references = 0; @@ -468,10 +474,10 @@ memset(&waitcmd, 0, sizeof(waitcmd)); waitcmd.handle = res->bo_handle; - again: + ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_WAIT, &waitcmd); - if (ret == -EAGAIN) - goto again; + if (ret) + _debug_printf("waiting got error - %d, slow gpu or hang?\n", errno); p_atomic_set(&res->maybe_busy, false); } @@ -540,7 +546,7 @@ cbuf->nres * sizeof(struct virgl_hw_buf*), new_nres * sizeof(struct virgl_hw_buf*)); if (!new_ptr) { - fprintf(stderr,"failure to add relocation %d, %d\n", cbuf->cres, new_nres); + _debug_printf("failure to add relocation %d, %d\n", cbuf->cres, new_nres); return; } cbuf->res_bo = new_ptr; @@ -549,7 +555,7 @@ cbuf->nres * sizeof(uint32_t), new_nres * sizeof(uint32_t)); if (!new_ptr) { - fprintf(stderr,"failure to add hlist relocation %d, %d\n", cbuf->cres, cbuf->nres); + _debug_printf("failure to add hlist relocation %d, %d\n", cbuf->cres, cbuf->nres); return; } cbuf->res_hlist = new_ptr; @@ -735,7 +741,7 @@ ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &eb); if (ret == -1) - fprintf(stderr,"got error from kernel - expect bad rendering %d\n", errno); + _debug_printf("got error from kernel - expect bad rendering %d\n", errno); cbuf->base.cdw = 0; if (qws->supports_fences) { diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h mesa-20.0.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h --- mesa-19.2.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h 2020-06-12 01:21:17.000000000 +0000 @@ -41,10 +41,8 @@ int num_cs_references; uint32_t size; void *ptr; - uint32_t stride; struct virgl_resource_cache_entry cache_entry; - uint32_t format; uint32_t bind; uint32_t flink_name; diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/drm/virtgpu_drm.h mesa-20.0.8/src/gallium/winsys/virgl/drm/virtgpu_drm.h --- mesa-19.2.8/src/gallium/winsys/virgl/drm/virtgpu_drm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/drm/virtgpu_drm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,174 +0,0 @@ -/* - * Copyright 2013 Red Hat - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -#ifndef VIRTGPU_DRM_H -#define VIRTGPU_DRM_H - -#include -#include "drm-uapi/drm.h" - -/* Please note that modifications to all structs defined here are - * subject to backwards-compatibility constraints. - * - * Do not use pointers, use uint64_t instead for 32 bit / 64 bit user/kernel - * compatibility Keep fields aligned to their size - */ - -#define DRM_VIRTGPU_MAP 0x01 -#define DRM_VIRTGPU_EXECBUFFER 0x02 -#define DRM_VIRTGPU_GETPARAM 0x03 -#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 -#define DRM_VIRTGPU_RESOURCE_INFO 0x05 -#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 -#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 -#define DRM_VIRTGPU_WAIT 0x08 -#define DRM_VIRTGPU_GET_CAPS 0x09 - -/* - * virtgpu execbuffer flags - */ -#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 -#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 -#define VIRTGPU_EXECBUF_FLAGS (\ - VIRTGPU_EXECBUF_FENCE_FD_IN |\ - VIRTGPU_EXECBUF_FENCE_FD_OUT |\ - 0) - -struct drm_virtgpu_map { - uint64_t offset; /* use for mmap system call */ - uint32_t handle; - uint32_t pad; -}; - -struct drm_virtgpu_execbuffer { - uint32_t flags; /* for future use */ - uint32_t size; - uint64_t command; /* void* */ - uint64_t bo_handles; - uint32_t num_bo_handles; - int32_t fence_fd; -}; - -#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ -#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 - -struct drm_virtgpu_getparam { - uint64_t param; - uint64_t value; -}; - -/* NO_BO flags? NO resource flag? */ -/* resource flag for y_0_top */ -struct drm_virtgpu_resource_create { - uint32_t target; - uint32_t format; - uint32_t bind; - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t array_size; - uint32_t last_level; - uint32_t nr_samples; - uint32_t flags; - uint32_t bo_handle; /* if this is set - recreate a new resource attached to this bo ? */ - uint32_t res_handle; /* returned by kernel */ - uint32_t size; /* validate transfer in the host */ - uint32_t stride; /* validate transfer in the host */ -}; - -struct drm_virtgpu_resource_info { - uint32_t bo_handle; - uint32_t res_handle; - uint32_t size; - uint32_t stride; -}; - -struct drm_virtgpu_3d_box { - uint32_t x, y, z; - uint32_t w, h, d; -}; - -struct drm_virtgpu_3d_transfer_to_host { - uint32_t bo_handle; - struct drm_virtgpu_3d_box box; - uint32_t level; - uint32_t offset; -}; - -struct drm_virtgpu_3d_transfer_from_host { - uint32_t bo_handle; - struct drm_virtgpu_3d_box box; - uint32_t level; - uint32_t offset; -}; - -#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ -struct drm_virtgpu_3d_wait { - uint32_t handle; /* 0 is an invalid handle */ - uint32_t flags; -}; - -struct drm_virtgpu_get_caps { - uint32_t cap_set_id; - uint32_t cap_set_ver; - uint64_t addr; - uint32_t size; - uint32_t pad; -}; - -#define DRM_IOCTL_VIRTGPU_MAP \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) - -#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\ - struct drm_virtgpu_execbuffer) - -#define DRM_IOCTL_VIRTGPU_GETPARAM \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\ - struct drm_virtgpu_getparam) - -#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ - struct drm_virtgpu_resource_create) - -#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \ - struct drm_virtgpu_resource_info) - -#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ - struct drm_virtgpu_3d_transfer_from_host) - -#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ - struct drm_virtgpu_3d_transfer_to_host) - -#define DRM_IOCTL_VIRTGPU_WAIT \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ - struct drm_virtgpu_3d_wait) - -#define DRM_IOCTL_VIRTGPU_GET_CAPS \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \ - struct drm_virtgpu_get_caps) - -#endif diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c mesa-20.0.8/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c --- mesa-19.2.8/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,7 +29,7 @@ #include #include -#include +#include #include "virgl_vtest_winsys.h" #include "virgl_vtest_public.h" diff -Nru mesa-19.2.8/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c mesa-20.0.8/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c --- mesa-19.2.8/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ #include #include "util/u_surface.h" #include "util/u_memory.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/os_time.h" #include "state_tracker/sw_winsys.h" diff -Nru mesa-19.2.8/src/gbm/backends/dri/gbm_dri.c mesa-20.0.8/src/gbm/backends/dri/gbm_dri.c --- mesa-19.2.8/src/gbm/backends/dri/gbm_dri.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gbm/backends/dri/gbm_dri.c 2020-06-12 01:21:17.000000000 +0000 @@ -110,6 +110,18 @@ count, out_count, surf->dri_private); } +static unsigned +dri_get_capability(void *loaderPrivate, enum dri_loader_cap cap) +{ + /* Note: loaderPrivate is _EGLDisplay* */ + switch (cap) { + case DRI_LOADER_CAP_FP16: + return 1; + default: + return 0; + } +} + static int image_get_buffers(__DRIdrawable *driDrawable, unsigned int format, @@ -207,18 +219,20 @@ }; static const __DRIdri2LoaderExtension dri2_loader_extension = { - .base = { __DRI_DRI2_LOADER, 3 }, + .base = { __DRI_DRI2_LOADER, 4 }, .getBuffers = dri_get_buffers, .flushFrontBuffer = dri_flush_front_buffer, .getBuffersWithFormat = dri_get_buffers_with_format, + .getCapability = dri_get_capability, }; static const __DRIimageLoaderExtension image_loader_extension = { - .base = { __DRI_IMAGE_LOADER, 1 }, + .base = { __DRI_IMAGE_LOADER, 2 }, .getBuffers = image_get_buffers, .flushFrontBuffer = dri_flush_front_buffer, + .getCapability = dri_get_capability, }; static const __DRIswrastLoaderExtension swrast_loader_extension = { @@ -243,38 +257,35 @@ const char *name; int version; int offset; - int optional; + bool optional; }; static struct dri_extension_match dri_core_extensions[] = { - { __DRI2_FLUSH, 1, offsetof(struct gbm_dri_device, flush) }, - { __DRI_IMAGE, 1, offsetof(struct gbm_dri_device, image) }, - { __DRI2_FENCE, 1, offsetof(struct gbm_dri_device, fence), 1 }, - { NULL, 0, 0 } + { __DRI2_FLUSH, 1, offsetof(struct gbm_dri_device, flush), false }, + { __DRI_IMAGE, 1, offsetof(struct gbm_dri_device, image), false }, + { __DRI2_FENCE, 1, offsetof(struct gbm_dri_device, fence), true }, }; static struct dri_extension_match gbm_dri_device_extensions[] = { - { __DRI_CORE, 1, offsetof(struct gbm_dri_device, core) }, - { __DRI_DRI2, 1, offsetof(struct gbm_dri_device, dri2) }, - { NULL, 0, 0 } + { __DRI_CORE, 1, offsetof(struct gbm_dri_device, core), false }, + { __DRI_DRI2, 1, offsetof(struct gbm_dri_device, dri2), false }, }; static struct dri_extension_match gbm_swrast_device_extensions[] = { - { __DRI_CORE, 1, offsetof(struct gbm_dri_device, core), }, - { __DRI_SWRAST, 1, offsetof(struct gbm_dri_device, swrast) }, - { NULL, 0, 0 } + { __DRI_CORE, 1, offsetof(struct gbm_dri_device, core), false }, + { __DRI_SWRAST, 1, offsetof(struct gbm_dri_device, swrast), false }, }; -static int +static bool dri_bind_extensions(struct gbm_dri_device *dri, - struct dri_extension_match *matches, + struct dri_extension_match *matches, size_t num_matches, const __DRIextension **extensions) { - int i, j, ret = 0; + bool ret = true; void *field; - for (i = 0; extensions[i]; i++) { - for (j = 0; matches[j].name; j++) { + for (size_t i = 0; extensions[i]; i++) { + for (size_t j = 0; j < num_matches; j++) { if (strcmp(extensions[i]->name, matches[j].name) == 0 && extensions[i]->version >= matches[j].version) { field = ((char *) dri + matches[j].offset); @@ -283,10 +294,10 @@ } } - for (j = 0; matches[j].name; j++) { + for (size_t j = 0; j < num_matches; j++) { field = ((char *) dri + matches[j].offset); if ((*(const __DRIextension **) field == NULL) && !matches[j].optional) { - ret = -1; + ret = false; } } @@ -327,7 +338,9 @@ if (!extensions) return -1; - if (dri_bind_extensions(dri, gbm_dri_device_extensions, extensions) < 0) { + if (!dri_bind_extensions(dri, gbm_dri_device_extensions, + ARRAY_SIZE(gbm_dri_device_extensions), + extensions)) { dlclose(dri->driver); fprintf(stderr, "failed to bind extensions\n"); return -1; @@ -347,7 +360,9 @@ if (!extensions) return -1; - if (dri_bind_extensions(dri, gbm_swrast_device_extensions, extensions) < 0) { + if (!dri_bind_extensions(dri, gbm_swrast_device_extensions, + ARRAY_SIZE(gbm_swrast_device_extensions), + extensions)) { dlclose(dri->driver); fprintf(stderr, "failed to bind extensions\n"); return -1; @@ -393,7 +408,9 @@ return -1; extensions = dri->core->getExtensions(dri->screen); - if (dri_bind_extensions(dri, dri_core_extensions, extensions) < 0) { + if (!dri_bind_extensions(dri, dri_core_extensions, + ARRAY_SIZE(dri_core_extensions), + extensions)) { ret = -1; goto free_screen; } @@ -478,61 +495,83 @@ static const struct gbm_dri_visual gbm_dri_visuals_table[] = { { GBM_FORMAT_R8, __DRI_IMAGE_FORMAT_R8, - { 0x000000ff, 0x00000000, 0x00000000, 0x00000000 }, + { 0, -1, -1, -1 }, + { 8, 0, 0, 0 }, }, { GBM_FORMAT_GR88, __DRI_IMAGE_FORMAT_GR88, - { 0x000000ff, 0x0000ff00, 0x00000000, 0x00000000 }, + { 0, 8, -1, -1 }, + { 8, 8, 0, 0 }, }, { GBM_FORMAT_ARGB1555, __DRI_IMAGE_FORMAT_ARGB1555, - { 0x00007c00, 0x000003e0, 0x0000001f, 0x00008000 }, + { 10, 5, 0, 11 }, + { 5, 5, 5, 1 }, }, { GBM_FORMAT_RGB565, __DRI_IMAGE_FORMAT_RGB565, - { 0x0000f800, 0x000007e0, 0x0000001f, 0x00000000 }, + { 11, 5, 0, -1 }, + { 5, 6, 5, 0 }, }, { GBM_FORMAT_XRGB8888, __DRI_IMAGE_FORMAT_XRGB8888, - { 0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000 }, + { 16, 8, 0, -1 }, + { 8, 8, 8, 0 }, }, { GBM_FORMAT_ARGB8888, __DRI_IMAGE_FORMAT_ARGB8888, - { 0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000 }, + { 16, 8, 0, 24 }, + { 8, 8, 8, 8 }, }, { GBM_FORMAT_XBGR8888, __DRI_IMAGE_FORMAT_XBGR8888, - { 0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000 }, + { 0, 8, 16, -1 }, + { 8, 8, 8, 0 }, }, { GBM_FORMAT_ABGR8888, __DRI_IMAGE_FORMAT_ABGR8888, - { 0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000 }, + { 0, 8, 16, 24 }, + { 8, 8, 8, 8 }, }, { GBM_FORMAT_XRGB2101010, __DRI_IMAGE_FORMAT_XRGB2101010, - { 0x3ff00000, 0x000ffc00, 0x000003ff, 0x00000000 }, + { 20, 10, 0, -1 }, + { 10, 10, 10, 0 }, }, { GBM_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ARGB2101010, - { 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 }, + { 20, 10, 0, 30 }, + { 10, 10, 10, 2 }, }, { GBM_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XBGR2101010, - { 0x000003ff, 0x000ffc00, 0x3ff00000, 0x00000000 }, + { 0, 10, 20, -1 }, + { 10, 10, 10, 0 }, }, { GBM_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ABGR2101010, - { 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 }, + { 0, 10, 20, 30 }, + { 10, 10, 10, 2 }, + }, + { + GBM_FORMAT_XBGR16161616F, __DRI_IMAGE_FORMAT_XBGR16161616F, + { 0, 16, 32, -1 }, + { 16, 16, 16, 0 }, + true, + }, + { + GBM_FORMAT_ABGR16161616F, __DRI_IMAGE_FORMAT_ABGR16161616F, + { 0, 16, 32, 48 }, + { 16, 16, 16, 16 }, + true, }, }; static int gbm_format_to_dri_format(uint32_t gbm_format) { - int i; - gbm_format = gbm_format_canonicalize(gbm_format); - for (i = 0; i < ARRAY_SIZE(gbm_dri_visuals_table); i++) { + for (size_t i = 0; i < ARRAY_SIZE(gbm_dri_visuals_table); i++) { if (gbm_dri_visuals_table[i].gbm_format == gbm_format) return gbm_dri_visuals_table[i].dri_image_format; } @@ -543,9 +582,7 @@ static uint32_t gbm_dri_to_gbm_format(int dri_format) { - int i; - - for (i = 0; i < ARRAY_SIZE(gbm_dri_visuals_table); i++) { + for (size_t i = 0; i < ARRAY_SIZE(gbm_dri_visuals_table); i++) { if (gbm_dri_visuals_table[i].dri_image_format == dri_format) return gbm_dri_visuals_table[i].gbm_format; } diff -Nru mesa-19.2.8/src/gbm/backends/dri/gbm_driint.h mesa-20.0.8/src/gbm/backends/dri/gbm_driint.h --- mesa-19.2.8/src/gbm/backends/dri/gbm_driint.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gbm/backends/dri/gbm_driint.h 2020-06-12 01:21:17.000000000 +0000 @@ -44,11 +44,18 @@ uint32_t gbm_format; int dri_image_format; struct { - uint32_t red; - uint32_t green; - uint32_t blue; - uint32_t alpha; - } rgba_masks; + int red; + int green; + int blue; + int alpha; + } rgba_shifts; + struct { + unsigned int red; + unsigned int green; + unsigned int blue; + unsigned int alpha; + } rgba_sizes; + bool is_float; }; struct gbm_dri_device { diff -Nru mesa-19.2.8/src/gbm/main/gbm.c mesa-20.0.8/src/gbm/main/gbm.c --- mesa-19.2.8/src/gbm/main/gbm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gbm/main/gbm.c 2020-06-12 01:21:17.000000000 +0000 @@ -271,6 +271,9 @@ case GBM_FORMAT_RGBA1010102: case GBM_FORMAT_BGRA1010102: return 32; + case GBM_FORMAT_XBGR16161616F: + case GBM_FORMAT_ABGR16161616F: + return 64; } } diff -Nru mesa-19.2.8/src/gbm/main/gbm.h mesa-20.0.8/src/gbm/main/gbm.h --- mesa-19.2.8/src/gbm/main/gbm.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gbm/main/gbm.h 2020-06-12 01:21:17.000000000 +0000 @@ -150,6 +150,15 @@ #define GBM_FORMAT_RGBA1010102 __gbm_fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */ #define GBM_FORMAT_BGRA1010102 __gbm_fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */ +/* + * Floating point 64bpp RGB + * IEEE 754-2008 binary16 half-precision float + * [15:0] sign:exponent:mantissa 1:5:10 + */ +#define GBM_FORMAT_XBGR16161616F __gbm_fourcc_code('X', 'B', '4', 'H') /* [63:0] x:B:G:R 16:16:16:16 little endian */ + +#define GBM_FORMAT_ABGR16161616F __gbm_fourcc_code('A', 'B', '4', 'H') /* [63:0] A:B:G:R 16:16:16:16 little endian */ + /* packed YCbCr */ #define GBM_FORMAT_YUYV __gbm_fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */ #define GBM_FORMAT_YVYU __gbm_fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */ @@ -276,14 +285,16 @@ uint32_t format; }; +#define GBM_MAX_PLANES 4 + struct gbm_import_fd_modifier_data { uint32_t width; uint32_t height; uint32_t format; uint32_t num_fds; - int fds[4]; - int strides[4]; - int offsets[4]; + int fds[GBM_MAX_PLANES]; + int strides[GBM_MAX_PLANES]; + int offsets[GBM_MAX_PLANES]; uint64_t modifier; }; diff -Nru mesa-19.2.8/src/gbm/meson.build mesa-20.0.8/src/gbm/meson.build --- mesa-19.2.8/src/gbm/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gbm/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -66,14 +66,14 @@ libraries_private : '-ldl', # FIXME: autotools lists this a incomplete ) -if with_tests and prog_nm.found() +if with_symbols_check test( 'gbm-symbols-check', symbols_check, args : [ '--lib', libgbm, '--symbols-file', files('gbm-symbols.txt'), - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['gbm'], ) diff -Nru mesa-19.2.8/src/getopt/meson.build mesa-20.0.8/src/getopt/meson.build --- mesa-19.2.8/src/getopt/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/getopt/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,29 @@ +# Copyright © 2018 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libgetopt = static_library( + 'getopt', + 'getopt_long.c', +) + +idep_getopt = declare_dependency( + link_with : libgetopt, + include_directories : include_directories('.', is_system : true), +) diff -Nru mesa-19.2.8/src/glx/apple/glx_empty.c mesa-20.0.8/src/glx/apple/glx_empty.c --- mesa-19.2.8/src/glx/apple/glx_empty.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/apple/glx_empty.c 2020-06-12 01:21:17.000000000 +0000 @@ -147,7 +147,7 @@ } -_X_EXPORT int +_X_EXPORT void glXQueryGLXPbufferSGIX(Display * dpy, GLXDrawable drawable, int attribute, unsigned int *value) { @@ -155,7 +155,6 @@ (void) drawable; (void) attribute; (void) value; - return 0; } _X_EXPORT GLXDrawable diff -Nru mesa-19.2.8/src/glx/dri2_glx.c mesa-20.0.8/src/glx/dri2_glx.c --- mesa-19.2.8/src/glx/dri2_glx.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/dri2_glx.c 2020-06-12 01:21:17.000000000 +0000 @@ -599,7 +599,7 @@ flags = __DRI2_FLUSH_DRAWABLE; if (flush) flags |= __DRI2_FLUSH_CONTEXT; - dri2Flush(psc, ctx, priv, flags, __DRI2_THROTTLE_SWAPBUFFER); + dri2Flush(psc, ctx, priv, flags, __DRI2_THROTTLE_COPYSUBBUFFER); region = XFixesCreateRegion(psc->base.dpy, &xrect, 1); DRI2CopyRegion(psc->base.dpy, pdraw->xDrawable, region, @@ -1018,7 +1018,7 @@ psc = (struct dri2_screen *) pdraw->psc; - if (pdraw && psc->f && psc->f->base.version >= 3 && psc->f->invalidate) + if (psc->f && psc->f->base.version >= 3 && psc->f->invalidate) psc->f->invalidate(pdp->driDrawable); } @@ -1244,7 +1244,7 @@ psc->fd = loader_open_device(deviceName); if (psc->fd < 0) { - ErrorMessageF("failed to open drm device: %s\n", strerror(errno)); + ErrorMessageF("failed to open %s: %s\n", deviceName, strerror(errno)); goto handle_error; } diff -Nru mesa-19.2.8/src/glx/dri3_glx.c mesa-20.0.8/src/glx/dri3_glx.c --- mesa-19.2.8/src/glx/dri3_glx.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/dri3_glx.c 2020-06-12 01:21:17.000000000 +0000 @@ -1105,8 +1105,6 @@ pdp->base.destroyDisplay = dri3_destroy_display; pdp->base.createScreen = dri3_create_screen; - loader_set_logger(dri_message); - pdp->loader_extensions = loader_extensions; return &pdp->base; diff -Nru mesa-19.2.8/src/glx/dri_common.c mesa-20.0.8/src/glx/dri_common.c --- mesa-19.2.8/src/glx/dri_common.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/dri_common.c 2020-06-12 01:21:17.000000000 +0000 @@ -166,6 +166,10 @@ __ATTRIB(__DRI_ATTRIB_GREEN_MASK, greenMask), __ATTRIB(__DRI_ATTRIB_BLUE_MASK, blueMask), __ATTRIB(__DRI_ATTRIB_ALPHA_MASK, alphaMask), + __ATTRIB(__DRI_ATTRIB_RED_SHIFT, redShift), + __ATTRIB(__DRI_ATTRIB_GREEN_SHIFT, greenShift), + __ATTRIB(__DRI_ATTRIB_BLUE_SHIFT, blueShift), + __ATTRIB(__DRI_ATTRIB_ALPHA_SHIFT, alphaShift), #endif __ATTRIB(__DRI_ATTRIB_MAX_PBUFFER_WIDTH, maxPbufferWidth), __ATTRIB(__DRI_ATTRIB_MAX_PBUFFER_HEIGHT, maxPbufferHeight), diff -Nru mesa-19.2.8/src/glx/drisw_glx.c mesa-20.0.8/src/glx/drisw_glx.c --- mesa-19.2.8/src/glx/drisw_glx.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/drisw_glx.c 2020-06-12 01:21:17.000000000 +0000 @@ -31,36 +31,6 @@ #include #include -static Bool -XCreateGCs(struct drisw_drawable * pdp, - Display * dpy, XID drawable, int visualid) -{ - XGCValues gcvalues; - long visMask; - XVisualInfo visTemp; - int num_visuals; - - /* create GC's */ - pdp->gc = XCreateGC(dpy, drawable, 0, NULL); - pdp->swapgc = XCreateGC(dpy, drawable, 0, NULL); - - gcvalues.function = GXcopy; - gcvalues.graphics_exposures = False; - XChangeGC(dpy, pdp->gc, GCFunction, &gcvalues); - XChangeGC(dpy, pdp->swapgc, GCFunction, &gcvalues); - XChangeGC(dpy, pdp->swapgc, GCGraphicsExposures, &gcvalues); - - /* visual */ - visTemp.visualid = visualid; - visMask = VisualIDMask; - pdp->visinfo = XGetVisualInfo(dpy, visMask, &visTemp, &num_visuals); - - if (!pdp->visinfo || num_visuals == 0) - return False; - - return True; -} - static int xshm_error = 0; static int xshm_opcode = -1; @@ -93,8 +63,8 @@ if (!xshm_error && shmid >= 0) { pdp->shminfo.shmid = shmid; pdp->ximage = XShmCreateImage(dpy, - pdp->visinfo->visual, - pdp->visinfo->depth, + NULL, + pdp->xDepth, ZPixmap, /* format */ NULL, /* data */ &pdp->shminfo, /* shminfo */ @@ -123,8 +93,8 @@ if (pdp->ximage == NULL) { pdp->shminfo.shmid = -1; pdp->ximage = XCreateImage(dpy, - pdp->visinfo->visual, - pdp->visinfo->depth, + NULL, + pdp->xDepth, ZPixmap, 0, /* format, offset */ NULL, /* data */ 0, 0, /* width, height */ @@ -151,10 +121,7 @@ if (pdp->shminfo.shmid > 0) XShmDetach(dpy, &pdp->shminfo); - free(pdp->visinfo); - XFreeGC(dpy, pdp->gc); - XFreeGC(dpy, pdp->swapgc); } /** @@ -214,37 +181,25 @@ Display *dpy = pdraw->psc->dpy; Drawable drawable; XImage *ximage; - GC gc; + GC gc = pdp->gc; if (!pdp->ximage || shmid != pdp->shminfo.shmid) { if (!XCreateDrawable(pdp, shmid, dpy)) return; } - switch (op) { - case __DRI_SWRAST_IMAGE_OP_DRAW: - gc = pdp->gc; - break; - case __DRI_SWRAST_IMAGE_OP_SWAP: - gc = pdp->swapgc; - break; - default: - return; - } - drawable = pdraw->xDrawable; ximage = pdp->ximage; ximage->bytes_per_line = stride ? stride : bytes_per_line(w * ximage->bits_per_pixel, 32); ximage->data = data; + ximage->width = ximage->bytes_per_line / ((ximage->bits_per_pixel + 7)/ 8); + ximage->height = h; + if (pdp->shminfo.shmid >= 0) { - ximage->width = ximage->bytes_per_line / ((ximage->bits_per_pixel + 7)/ 8); - ximage->height = h; XShmPutImage(dpy, drawable, gc, ximage, srcx, srcy, x, y, w, h, False); XSync(dpy, False); } else { - ximage->width = w; - ximage->height = h; XPutImage(dpy, drawable, gc, ximage, srcx, srcy, x, y, w, h); } ximage->data = NULL; @@ -332,10 +287,10 @@ swrastGetImage2(read, x, y, w, h, 0, data, loaderPrivate); } -static void -swrastGetImageShm(__DRIdrawable * read, - int x, int y, int w, int h, - int shmid, void *loaderPrivate) +static GLboolean +swrastGetImageShm2(__DRIdrawable * read, + int x, int y, int w, int h, + int shmid, void *loaderPrivate) { struct drisw_drawable *prp = loaderPrivate; __GLXDRIdrawable *pread = &(prp->base); @@ -345,8 +300,11 @@ if (!prp->ximage || shmid != prp->shminfo.shmid) { if (!XCreateDrawable(prp, shmid, dpy)) - return; + return GL_FALSE; } + + if (prp->shminfo.shmid == -1) + return GL_FALSE; readable = pread->xDrawable; ximage = prp->ximage; @@ -356,10 +314,19 @@ ximage->bytes_per_line = bytes_per_line(w * ximage->bits_per_pixel, 32); XShmGetImage(dpy, readable, ximage, x, y, ~0L); + return GL_TRUE; +} + +static void +swrastGetImageShm(__DRIdrawable * read, + int x, int y, int w, int h, + int shmid, void *loaderPrivate) +{ + swrastGetImageShm2(read, x, y, w, h, shmid, loaderPrivate); } static const __DRIswrastLoaderExtension swrastLoaderExtension_shm = { - .base = {__DRI_SWRAST_LOADER, 5 }, + .base = {__DRI_SWRAST_LOADER, 6 }, .getDrawableInfo = swrastGetDrawableInfo, .putImage = swrastPutImage, @@ -369,6 +336,7 @@ .putImageShm = swrastPutImageShm, .getImageShm = swrastGetImageShm, .putImageShm2 = swrastPutImageShm2, + .getImageShm2 = swrastGetImageShm2, }; static const __DRIextension *loader_extensions_shm[] = { @@ -690,8 +658,8 @@ struct drisw_drawable *pdp; __GLXDRIconfigPrivate *config = (__GLXDRIconfigPrivate *) modes; struct drisw_screen *psc = (struct drisw_screen *) base; - Bool ret; const __DRIswrastExtension *swrast = psc->swrast; + Display *dpy = psc->base.dpy; pdp = calloc(1, sizeof(*pdp)); if (!pdp) @@ -700,11 +668,34 @@ pdp->base.xDrawable = xDrawable; pdp->base.drawable = drawable; pdp->base.psc = &psc->base; + pdp->config = modes; + pdp->gc = XCreateGC(dpy, xDrawable, 0, NULL); + pdp->xDepth = 0; + + /* Use the visual depth, if this fbconfig corresponds to a visual */ + if (pdp->config->visualID != 0) { + int matches = 0; + XVisualInfo *visinfo, template; + + template.visualid = pdp->config->visualID; + template.screen = pdp->config->screen; + visinfo = XGetVisualInfo(dpy, VisualIDMask | VisualScreenMask, + &template, &matches); + + if (visinfo && matches) { + pdp->xDepth = visinfo->depth; + XFree(visinfo); + } + } - ret = XCreateGCs(pdp, psc->base.dpy, xDrawable, modes->visualID); - if (!ret) { - free(pdp); - return NULL; + /* Otherwise, or if XGetVisualInfo failed, ask the server */ + if (pdp->xDepth == 0) { + Window root; + int x, y; + unsigned uw, uh, bw, depth; + + XGetGeometry(dpy, xDrawable, &root, &x, &y, &uw, &uh, &bw, &depth); + pdp->xDepth = depth; } /* Create a new drawable */ diff -Nru mesa-19.2.8/src/glx/drisw_priv.h mesa-20.0.8/src/glx/drisw_priv.h --- mesa-19.2.8/src/glx/drisw_priv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/drisw_priv.h 2020-06-12 01:21:17.000000000 +0000 @@ -62,12 +62,11 @@ __GLXDRIdrawable base; GC gc; - GC swapgc; - __DRIdrawable *driDrawable; - XVisualInfo *visinfo; + struct glx_config *config; XImage *ximage; XShmSegmentInfo shminfo; + int xDepth; }; _X_HIDDEN int diff -Nru mesa-19.2.8/src/glx/glxclient.h mesa-20.0.8/src/glx/glxclient.h --- mesa-19.2.8/src/glx/glxclient.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glxclient.h 2020-06-12 01:21:17.000000000 +0000 @@ -328,13 +328,6 @@ /*@} */ /** - * Fill newImage with the unpacked form of \c oldImage getting it - * ready for transport to the server. - */ - void (*fillImage) (struct glx_context *, GLint, GLint, GLint, GLint, GLenum, - GLenum, const GLvoid *, GLubyte *, GLubyte *); - - /** * Client side attribs. */ __GLXattributeMachine attributes; diff -Nru mesa-19.2.8/src/glx/glxconfig.c mesa-20.0.8/src/glx/glxconfig.c --- mesa-19.2.8/src/glx/glxconfig.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glxconfig.c 2020-06-12 01:21:17.000000000 +0000 @@ -218,43 +218,54 @@ _X_HIDDEN struct glx_config * glx_config_create_list(unsigned count) { - struct glx_config *c = NULL; + const size_t size = sizeof(struct glx_config); + struct glx_config *base = NULL; + struct glx_config **next; unsigned i; - if (!(c = calloc(count, sizeof(struct glx_config)))) - return NULL; - + next = &base; for (i = 0; i < count; i++) { - c[i].visualID = GLX_DONT_CARE; - c[i].visualType = GLX_DONT_CARE; - c[i].visualRating = GLX_NONE; - c[i].transparentPixel = GLX_NONE; - c[i].transparentRed = GLX_DONT_CARE; - c[i].transparentGreen = GLX_DONT_CARE; - c[i].transparentBlue = GLX_DONT_CARE; - c[i].transparentAlpha = GLX_DONT_CARE; - c[i].transparentIndex = GLX_DONT_CARE; - c[i].xRenderable = GLX_DONT_CARE; - c[i].fbconfigID = GLX_DONT_CARE; - c[i].swapMethod = GLX_SWAP_UNDEFINED_OML; - c[i].bindToTextureRgb = GLX_DONT_CARE; - c[i].bindToTextureRgba = GLX_DONT_CARE; - c[i].bindToMipmapTexture = GLX_DONT_CARE; - c[i].bindToTextureTargets = GLX_DONT_CARE; - c[i].yInverted = GLX_DONT_CARE; - c[i].sRGBCapable = GLX_DONT_CARE; + *next = calloc(1, size); + if (*next == NULL) { + glx_config_destroy_list(base); + base = NULL; + break; + } + + (*next)->visualID = GLX_DONT_CARE; + (*next)->visualType = GLX_DONT_CARE; + (*next)->visualRating = GLX_NONE; + (*next)->transparentPixel = GLX_NONE; + (*next)->transparentRed = GLX_DONT_CARE; + (*next)->transparentGreen = GLX_DONT_CARE; + (*next)->transparentBlue = GLX_DONT_CARE; + (*next)->transparentAlpha = GLX_DONT_CARE; + (*next)->transparentIndex = GLX_DONT_CARE; + (*next)->xRenderable = GLX_DONT_CARE; + (*next)->fbconfigID = GLX_DONT_CARE; + (*next)->swapMethod = GLX_SWAP_UNDEFINED_OML; + (*next)->bindToTextureRgb = GLX_DONT_CARE; + (*next)->bindToTextureRgba = GLX_DONT_CARE; + (*next)->bindToMipmapTexture = GLX_DONT_CARE; + (*next)->bindToTextureTargets = GLX_DONT_CARE; + (*next)->yInverted = GLX_DONT_CARE; + (*next)->sRGBCapable = GLX_DONT_CARE; - if (i != count -1) - c[i].next = &(c[i+1]); + next = &((*next)->next); } - return c; + return base; } _X_HIDDEN void glx_config_destroy_list(struct glx_config *configs) { - free(configs); + while (configs != NULL) { + struct glx_config *const next = configs->next; + + free(configs); + configs = next; + } } diff -Nru mesa-19.2.8/src/glx/glxconfig.h mesa-20.0.8/src/glx/glxconfig.h --- mesa-19.2.8/src/glx/glxconfig.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glxconfig.h 2020-06-12 01:21:17.000000000 +0000 @@ -38,6 +38,7 @@ GLint redBits, greenBits, blueBits, alphaBits; /* bits per comp */ GLuint redMask, greenMask, blueMask, alphaMask; + GLuint redShift, greenShift, blueShift, alphaShift; GLint rgbBits; /* total bits for rgb */ GLint indexBits; /* total bits for colorindex */ diff -Nru mesa-19.2.8/src/glx/glxcurrent.c mesa-20.0.8/src/glx/glxcurrent.c --- mesa-19.2.8/src/glx/glxcurrent.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glxcurrent.c 2020-06-12 01:21:17.000000000 +0000 @@ -245,18 +245,6 @@ __glXUnlock(); - /* The indirect vertex array state must to be initialised after we - * have setup the context, as it needs to query server attributes. - */ - if (gc && !gc->isDirect) { - __GLXattribute *state = gc->client_state_private; - if (state && state->array_state == NULL) { - glGetString(GL_EXTENSIONS); - glGetString(GL_VERSION); - __glXInitVertexArrayState(gc); - } - } - return GL_TRUE; } diff -Nru mesa-19.2.8/src/glx/glxext.c mesa-20.0.8/src/glx/glxext.c --- mesa-19.2.8/src/glx/glxext.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glxext.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,6 +50,9 @@ #include "glxextensions.h" #include "util/debug.h" +#ifndef GLX_USE_APPLEGL +#include "dri_common.h" +#endif #include #include @@ -69,7 +72,7 @@ /* Extension required boiler plate */ static const char __glXExtensionName[] = GLX_EXTENSION_NAME; - static struct glx_display *glx_displays; +static struct glx_display *glx_displays; static /* const */ char *error_list[] = { "GLXBadContext", @@ -714,7 +717,8 @@ LockDisplay(dpy); psc->configs = NULL; - if (atof(priv->serverGLXversion) >= 1.3) { + if (priv->majorVersion > 1 || + (priv->majorVersion == 1 && priv->minorVersion >= 3)) { GetReq(GLXGetFBConfigs, fb_req); fb_req->reqType = priv->majorOpcode; fb_req->glxCode = X_GLXGetFBConfigs; @@ -896,7 +900,7 @@ } XESetCloseDisplay(dpy, dpyPriv->codes->extension, __glXCloseDisplay); - XESetErrorString (dpy, dpyPriv->codes->extension,__glXErrorString); + XESetErrorString (dpy, dpyPriv->codes->extension, __glXErrorString); dpyPriv->glXDrawHash = __glxHashCreate(); @@ -906,6 +910,11 @@ dpyPriv->drawHash = __glxHashCreate(); +#ifndef GLX_USE_APPLEGL + /* Set the logger before the *CreateDisplay functions. */ + loader_set_logger(dri_message); +#endif + /* ** Initialize the direct rendering per display data and functions. ** Note: This _must_ be done before calling any other DRI routines diff -Nru mesa-19.2.8/src/glx/glx_pbuffer.c mesa-20.0.8/src/glx/glx_pbuffer.c --- mesa-19.2.8/src/glx/glx_pbuffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/glx_pbuffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -414,7 +414,7 @@ UnlockDisplay(dpy); SyncHandle(); - return 0; + return 1; } static void @@ -834,11 +834,11 @@ /** * Query an attribute of a pbuffer. */ -_GLX_PUBLIC int +_GLX_PUBLIC void glXQueryGLXPbufferSGIX(Display * dpy, GLXPbufferSGIX drawable, int attribute, unsigned int *value) { - return __glXGetDrawableAttribute(dpy, drawable, attribute, value); + __glXGetDrawableAttribute(dpy, drawable, attribute, value); } #endif diff -Nru mesa-19.2.8/src/glx/indirect_glx.c mesa-20.0.8/src/glx/indirect_glx.c --- mesa-19.2.8/src/glx/indirect_glx.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/indirect_glx.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,7 +34,7 @@ #include "glapi.h" #include "glxclient.h" - +#include "indirect.h" #include "util/debug.h" #ifndef GLX_USE_APPLEGL @@ -148,9 +148,27 @@ sent = SendMakeCurrentRequest(dpy, gc->xid, tag, draw, read, &gc->currentContextTag); - if (!IndirectAPI) - IndirectAPI = __glXNewIndirectAPI(); - _glapi_set_dispatch(IndirectAPI); + if (sent) { + if (!IndirectAPI) + IndirectAPI = __glXNewIndirectAPI(); + _glapi_set_dispatch(IndirectAPI); + + /* The indirect vertex array state must to be initialised after we + * have setup the context, as it needs to query server attributes. + * + * At the point this is called gc->currentDpy is not initialized + * nor is the thread's current context actually set. Hence the + * cleverness before the GetString calls. + */ + __GLXattribute *state = gc->client_state_private; + if (state && state->array_state == NULL) { + gc->currentDpy = gc->psc->dpy; + __glXSetCurrentContext(gc); + __indirect_glGetString(GL_EXTENSIONS); + __indirect_glGetString(GL_VERSION); + __glXInitVertexArrayState(gc); + } + } return !sent; } @@ -399,10 +417,6 @@ gc->attributes.stackPointer = &gc->attributes.stack[0]; - /* - ** PERFORMANCE NOTE: A mode dependent fill image can speed things up. - */ - gc->fillImage = __glFillImage; gc->pc = gc->buf; gc->bufEnd = gc->buf + bufSize; gc->isDirect = GL_FALSE; diff -Nru mesa-19.2.8/src/glx/renderpix.c mesa-20.0.8/src/glx/renderpix.c --- mesa-19.2.8/src/glx/renderpix.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/renderpix.c 2020-06-12 01:21:17.000000000 +0000 @@ -92,8 +92,8 @@ /* Apply pixel store unpack modes to copy data into buf */ if (src != NULL) { - (*gc->fillImage) (gc, dim, width, height, depth, format, type, - src, buf, modes); + __glFillImage(gc, dim, width, height, depth, format, type, + src, buf, modes); } else { if (dim < 3) { @@ -147,13 +147,12 @@ __GLX_PUT_LONG(20, type); pc += hdrlen; if (compsize > 0) { - (*gc->fillImage) (gc, 1, width, 1, 1, format, type, - row, pc, pixelHeaderPC); + __glFillImage(gc, 1, width, 1, 1, format, type, row, pc, + pixelHeaderPC); pc += image1len; } if (compsize2 > 0) { - (*gc->fillImage) (gc, 1, height, 1, 1, format, type, - column, pc, NULL); + __glFillImage(gc, 1, height, 1, 1, format, type, column, pc, NULL); pc += image2len; } if ((compsize == 0) && (compsize2 == 0)) { @@ -183,11 +182,11 @@ __glXSetError(gc, GL_OUT_OF_MEMORY); return; } - (*gc->fillImage) (gc, 1, width, 1, 1, format, type, row, buf, - pixelHeaderPC); + __glFillImage(gc, 1, width, 1, 1, format, type, row, buf, + pixelHeaderPC); - (*gc->fillImage) (gc, 1, height, 1, 1, format, type, column, - buf + image1len, pixelHeaderPC); + __glFillImage(gc, 1, height, 1, 1, format, type, column, + buf + image1len, pixelHeaderPC); /* Send large command */ __glXSendLargeCommand(gc, gc->pc, (GLint) (pc - gc->pc), buf, diff -Nru mesa-19.2.8/src/glx/tests/query_renderer_unittest.cpp mesa-20.0.8/src/glx/tests/query_renderer_unittest.cpp --- mesa-19.2.8/src/glx/tests/query_renderer_unittest.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/tests/query_renderer_unittest.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -171,7 +171,6 @@ E(GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA), E(GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA), E(GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA), - E(GLX_RENDERER_ID_MESA), }; for (unsigned i = 0; i < ARRAY_SIZE(invalid_attributes); i++) { @@ -322,7 +321,6 @@ E(GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA + 0x10000), E(GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA + 0x10000), E(GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA + 0x10000), - E(GLX_RENDERER_ID_MESA + 0x10000), }; for (unsigned i = 0; i < ARRAY_SIZE(invalid_attributes); i++) { diff -Nru mesa-19.2.8/src/glx/xfont.c mesa-20.0.8/src/glx/xfont.c --- mesa-19.2.8/src/glx/xfont.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/glx/xfont.c 2020-06-12 01:21:17.000000000 +0000 @@ -129,7 +129,7 @@ * Generate OpenGL-compatible bitmap. */ static void -fill_bitmap(Display * dpy, Window win, GC gc, +fill_bitmap(Display * dpy, int screen, GC gc, unsigned int width, unsigned int height, int x0, int y0, unsigned int c, GLubyte * bitmap) { @@ -138,7 +138,7 @@ Pixmap pixmap; XChar2b char2b; - pixmap = XCreatePixmap(dpy, win, 8 * width, height, 1); + pixmap = XCreatePixmap(dpy, RootWindow(dpy, screen), 8 * width, height, 1); XSetForeground(dpy, gc, 0); XFillRectangle(dpy, pixmap, gc, 0, 0, 8 * width, height); XSetForeground(dpy, gc, 1); @@ -215,17 +215,13 @@ DRI_glXUseXFont(struct glx_context *CC, Font font, int first, int count, int listbase) { Display *dpy; - Window win; + int screen; Pixmap pixmap; GC gc; XGCValues values; unsigned long valuemask; XFontStruct *fs; -#if !defined(GLX_USE_APPLEGL) - __GLXDRIdrawable *glxdraw; -#endif - GLint swapbytes, lsbfirst, rowlength; GLint skiprows, skippixels, alignment; @@ -235,13 +231,7 @@ int i; dpy = CC->currentDpy; - win = CC->currentDrawable; - -#if !defined(GLX_USE_APPLEGL) - glxdraw = GetGLXDRIDrawable(CC->currentDpy, CC->currentDrawable); - if (glxdraw) - win = glxdraw->xDrawable; -#endif + screen = CC->screen; fs = XQueryFont(dpy, font); if (!fs) { @@ -289,7 +279,7 @@ glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - pixmap = XCreatePixmap(dpy, win, 10, 10, 1); + pixmap = XCreatePixmap(dpy, RootWindow(dpy, screen), 10, 10, 1); values.foreground = BlackPixel(dpy, DefaultScreen(dpy)); values.background = WhitePixel(dpy, DefaultScreen(dpy)); values.font = fs->fid; @@ -352,7 +342,7 @@ if (valid && (bm_width > 0) && (bm_height > 0)) { memset(bm, '\0', bm_width * bm_height); - fill_bitmap(dpy, win, gc, bm_width, bm_height, x, y, c, bm); + fill_bitmap(dpy, screen, gc, bm_width, bm_height, x, y, c, bm); glBitmap(width, height, x0, y0, dx, dy, bm); #ifdef DEBUG diff -Nru mesa-19.2.8/src/gtest/meson.build mesa-20.0.8/src/gtest/meson.build --- mesa-19.2.8/src/gtest/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/gtest/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -29,4 +29,5 @@ idep_gtest = declare_dependency( link_with : libgtest, include_directories : include_directories('include', is_system : true), + dependencies : [dep_thread], ) diff -Nru mesa-19.2.8/src/imgui/meson.build mesa-20.0.8/src/imgui/meson.build --- mesa-19.2.8/src/imgui/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/imgui/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,5 @@ libimgui_core = static_library( - 'intel_imgui_core', + 'imgui_core', files('imgui.cpp', 'imgui_draw.cpp', 'imgui_widgets.cpp'), cpp_args : ['-w'], install : false diff -Nru mesa-19.2.8/src/intel/Android.genxml.mk mesa-20.0.8/src/intel/Android.genxml.mk --- mesa-19.2.8/src/intel/Android.genxml.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/Android.genxml.mk 2020-06-12 01:21:17.000000000 +0000 @@ -106,6 +106,11 @@ $(intermediates)/genxml/gen11_pack.h: $(LOCAL_PATH)/genxml/gen11.xml $(LOCAL_PATH)/genxml/gen_pack_header.py $(call header-gen) +$(intermediates)/genxml/gen12_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/genxml/gen_pack_header.py +$(intermediates)/genxml/gen12_pack.h: PRIVATE_XML := $(LOCAL_PATH)/genxml/gen12.xml +$(intermediates)/genxml/gen12_pack.h: $(LOCAL_PATH)/genxml/gen12.xml $(LOCAL_PATH)/genxml/gen_pack_header.py + $(call header-gen) + $(intermediates)/genxml/genX_xml.h: $(addprefix $(MESA_TOP)/src/intel/,$(GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py @mkdir -p $(dir $@) @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" diff -Nru mesa-19.2.8/src/intel/Android.isl.mk mesa-20.0.8/src/intel/Android.isl.mk --- mesa-19.2.8/src/intel/Android.isl.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/Android.isl.mk 2020-06-12 01:21:17.000000000 +0000 @@ -199,6 +199,25 @@ include $(BUILD_STATIC_LIBRARY) # --------------------------------------- +# Build libmesa_isl_gen12 +# --------------------------------------- + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_isl_gen12 + +LOCAL_SRC_FILES := $(ISL_GEN12_FILES) + +LOCAL_CFLAGS := -DGEN_VERSIONx10=120 + +LOCAL_C_INCLUDES := $(LIBISL_GENX_COMMON_INCLUDES) + +LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + +# --------------------------------------- # Build libmesa_isl_tiled_memcpy # --------------------------------------- @@ -268,6 +287,7 @@ libmesa_isl_gen9 \ libmesa_isl_gen10 \ libmesa_isl_gen11 \ + libmesa_isl_gen12 \ libmesa_genxml \ libmesa_isl_tiled_memcpy diff -Nru mesa-19.2.8/src/intel/Android.vulkan.mk mesa-20.0.8/src/intel/Android.vulkan.mk --- mesa-19.2.8/src/intel/Android.vulkan.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/Android.vulkan.mk 2020-06-12 01:21:17.000000000 +0000 @@ -190,6 +190,28 @@ include $(BUILD_STATIC_LIBRARY) # +# libanv for gen12 +# + +include $(CLEAR_VARS) +LOCAL_MODULE := libmesa_anv_gen12 +LOCAL_MODULE_CLASS := STATIC_LIBRARIES + +LOCAL_SRC_FILES := $(VULKAN_GEN12_FILES) +LOCAL_CFLAGS := -DGEN_VERSIONx10=120 + +LOCAL_C_INCLUDES := $(VULKAN_COMMON_INCLUDES) + +LOCAL_STATIC_LIBRARIES := $(ANV_STATIC_LIBRARIES) + +LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + + +# # libmesa_vulkan_common # @@ -283,6 +305,7 @@ libmesa_compiler \ libmesa_intel_common \ libmesa_intel_dev \ + libmesa_intel_perf \ libmesa_vulkan_common \ libmesa_vulkan_util \ libmesa_anv_gen7 \ @@ -291,6 +314,7 @@ libmesa_anv_gen9 \ libmesa_anv_gen10 \ libmesa_anv_gen11 \ + libmesa_anv_gen12 \ libmesa_intel_compiler LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog @@ -305,9 +329,5 @@ libexpat endif -ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) -LOCAL_STATIC_LIBRARIES += libgrallocusage -endif - include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff -Nru mesa-19.2.8/src/intel/blorp/blorp_blit.c mesa-20.0.8/src/intel/blorp/blorp_blit.c --- mesa-19.2.8/src/intel/blorp/blorp_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp_blit.c 2020-06-12 01:21:17.000000000 +0000 @@ -593,7 +593,7 @@ nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); nir_ssa_def *mcs = NULL; - if (tex_aux_usage == ISL_AUX_USAGE_MCS) + if (isl_aux_usage_has_mcs(tex_aux_usage)) mcs = blorp_blit_txf_ms_mcs(b, v, pos); nir_op combine_op; @@ -667,7 +667,7 @@ nir_imm_int(b, i)); texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type); - if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) { + if (i == 0 && isl_aux_usage_has_mcs(tex_aux_usage)) { /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) * suggests an optimization: * @@ -783,7 +783,7 @@ * here inside the loop after computing the pixel coordinates. */ nir_ssa_def *mcs = NULL; - if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) + if (isl_aux_usage_has_mcs(key->tex_aux_usage)) mcs = blorp_blit_txf_ms_mcs(b, v, sample_coords_int); /* Compute sample index and map the sample index to a sample number. @@ -889,8 +889,6 @@ bit_cast_color(struct nir_builder *b, nir_ssa_def *color, const struct brw_blorp_blit_prog_key *key) { - assert(key->texture_data_type == nir_type_uint); - if (key->src_format == key->dst_format) return color; @@ -899,40 +897,51 @@ const struct isl_format_layout *dst_fmtl = isl_format_get_layout(key->dst_format); - /* They must be uint formats with the same bit size */ + /* They must be formats with the same bit size */ assert(src_fmtl->bpb == dst_fmtl->bpb); - assert(src_fmtl->channels.r.type == ISL_UINT); - assert(dst_fmtl->channels.r.type == ISL_UINT); - - /* They must be in regular color formats (no luminance or alpha) */ - assert(src_fmtl->channels.r.bits > 0); - assert(dst_fmtl->channels.r.bits > 0); - - /* They must be in RGBA order (possibly with channels missing) */ - assert(src_fmtl->channels.r.start_bit == 0); - assert(dst_fmtl->channels.r.start_bit == 0); if (src_fmtl->bpb <= 32) { - const unsigned src_channels = - isl_format_get_num_channels(key->src_format); - const unsigned src_bits[4] = { - src_fmtl->channels.r.bits, - src_fmtl->channels.g.bits, - src_fmtl->channels.b.bits, - src_fmtl->channels.a.bits, - }; - const unsigned dst_channels = - isl_format_get_num_channels(key->dst_format); - const unsigned dst_bits[4] = { - dst_fmtl->channels.r.bits, - dst_fmtl->channels.g.bits, - dst_fmtl->channels.b.bits, - dst_fmtl->channels.a.bits, - }; - nir_ssa_def *packed = - nir_format_pack_uint_unmasked(b, color, src_bits, src_channels); - color = nir_format_unpack_uint(b, packed, dst_bits, dst_channels); + assert(src_fmtl->channels.r.type == ISL_UINT || + src_fmtl->channels.r.type == ISL_UNORM); + assert(dst_fmtl->channels.r.type == ISL_UINT || + dst_fmtl->channels.r.type == ISL_UNORM); + + nir_ssa_def *packed = nir_imm_int(b, 0); + for (unsigned c = 0; c < 4; c++) { + if (src_fmtl->channels_array[c].bits == 0) + continue; + + const unsigned chan_start_bit = src_fmtl->channels_array[c].start_bit; + const unsigned chan_bits = src_fmtl->channels_array[c].bits; + + nir_ssa_def *chan = nir_channel(b, color, c); + if (src_fmtl->channels_array[c].type == ISL_UNORM) + chan = nir_format_float_to_unorm(b, chan, &chan_bits); + + packed = nir_ior(b, packed, nir_shift(b, chan, chan_start_bit)); + } + + nir_ssa_def *chans[4] = { }; + for (unsigned c = 0; c < 4; c++) { + if (dst_fmtl->channels_array[c].bits == 0) { + chans[c] = nir_imm_int(b, 0); + continue; + } + + const unsigned chan_start_bit = dst_fmtl->channels_array[c].start_bit; + const unsigned chan_bits = dst_fmtl->channels_array[c].bits; + chans[c] = nir_iand(b, nir_shift(b, packed, -(int)chan_start_bit), + nir_imm_int(b, BITFIELD_MASK(chan_bits))); + + if (dst_fmtl->channels_array[c].type == ISL_UNORM) + chans[c] = nir_format_unorm_to_float(b, chans[c], &chan_bits); + } + color = nir_vec(b, chans, 4); } else { + /* This path only supports UINT formats */ + assert(src_fmtl->channels.r.type == ISL_UINT); + assert(dst_fmtl->channels.r.type == ISL_UINT); + const unsigned src_bpc = src_fmtl->channels.r.bits; const unsigned dst_bpc = dst_fmtl->channels.r.bits; @@ -1343,7 +1352,7 @@ color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type); } else { nir_ssa_def *mcs = NULL; - if (key->tex_aux_usage == ISL_AUX_USAGE_MCS) + if (isl_aux_usage_has_mcs(key->tex_aux_usage)) mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos); color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type); @@ -1490,7 +1499,7 @@ struct brw_wm_prog_key wm_key; brw_blorp_init_wm_prog_key(&wm_key); wm_key.base.tex.compressed_multisample_layout_mask = - prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS; + isl_aux_usage_has_mcs(prog_key->tex_aux_usage); wm_key.base.tex.msaa_16 = prog_key->tex_samples == 16; wm_key.multisample_fbo = prog_key->rt_samples > 1; @@ -1609,9 +1618,9 @@ info->z_offset = 0; } -static void -surf_fake_interleaved_msaa(const struct isl_device *isl_dev, - struct brw_blorp_surface_info *info) +void +blorp_surf_fake_interleaved_msaa(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info) { assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); @@ -1623,9 +1632,9 @@ info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; } -static void -surf_retile_w_to_y(const struct isl_device *isl_dev, - struct brw_blorp_surface_info *info) +void +blorp_surf_retile_w_to_y(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info) { assert(info->surf.tiling == ISL_TILING_W); @@ -1639,7 +1648,7 @@ */ if (isl_dev->info->gen > 6 && info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { - surf_fake_interleaved_msaa(isl_dev, info); + blorp_surf_fake_interleaved_msaa(isl_dev, info); } if (isl_dev->info->gen == 6) { @@ -1688,20 +1697,12 @@ return true; } -static bool -can_shrink_surfaces(const struct blorp_params *params) -{ - return - can_shrink_surface(¶ms->src) && - can_shrink_surface(¶ms->dst); -} - static unsigned get_max_surface_size(const struct gen_device_info *devinfo, - const struct blorp_params *params) + const struct brw_blorp_surface_info *surf) { const unsigned max = devinfo->gen >= 7 ? 16384 : 8192; - if (split_blorp_blit_debug && can_shrink_surfaces(params)) + if (split_blorp_blit_debug && can_shrink_surface(surf)) return max >> 4; /* A smaller restriction when debug is enabled */ else return max; @@ -1789,8 +1790,10 @@ enum blit_shrink_status { BLIT_NO_SHRINK = 0, - BLIT_WIDTH_SHRINK = 1, - BLIT_HEIGHT_SHRINK = 2, + BLIT_SRC_WIDTH_SHRINK = (1 << 0), + BLIT_DST_WIDTH_SHRINK = (1 << 1), + BLIT_SRC_HEIGHT_SHRINK = (1 << 2), + BLIT_DST_HEIGHT_SHRINK = (1 << 3), }; /* Try to blit. If the surface parameters exceed the size allowed by hardware, @@ -1881,7 +1884,7 @@ params->x1 = ALIGN(params->x1, 2) * px_size_sa.width; params->y1 = ALIGN(params->y1, 2) * px_size_sa.height; - surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst); + blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst); wm_prog_key->use_kill = true; wm_prog_key->need_dst_offset = true; @@ -1942,7 +1945,7 @@ params->y1 = ALIGN(params->y1, y_align) / 2; /* Retile the surface to Y-tiled */ - surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst); + blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst); wm_prog_key->dst_tiled_w = true; wm_prog_key->use_kill = true; @@ -1968,7 +1971,7 @@ * * TODO: what if this makes the texture size too large? */ - surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src); + blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src); wm_prog_key->src_tiled_w = true; wm_prog_key->need_src_offset = true; @@ -2090,13 +2093,17 @@ return 0; unsigned result = 0; - unsigned max_surface_size = get_max_surface_size(devinfo, params); - if (params->src.surf.logical_level0_px.width > max_surface_size || - params->dst.surf.logical_level0_px.width > max_surface_size) - result |= BLIT_WIDTH_SHRINK; - if (params->src.surf.logical_level0_px.height > max_surface_size || - params->dst.surf.logical_level0_px.height > max_surface_size) - result |= BLIT_HEIGHT_SHRINK; + unsigned max_src_surface_size = get_max_surface_size(devinfo, ¶ms->src); + if (params->src.surf.logical_level0_px.width > max_src_surface_size) + result |= BLIT_SRC_WIDTH_SHRINK; + if (params->src.surf.logical_level0_px.height > max_src_surface_size) + result |= BLIT_SRC_HEIGHT_SHRINK; + + unsigned max_dst_surface_size = get_max_surface_size(devinfo, ¶ms->dst); + if (params->dst.surf.logical_level0_px.width > max_dst_surface_size) + result |= BLIT_DST_WIDTH_SHRINK; + if (params->dst.surf.logical_level0_px.height > max_dst_surface_size) + result |= BLIT_DST_HEIGHT_SHRINK; if (result == 0) { batch->blorp->exec(batch, params); @@ -2182,23 +2189,6 @@ } static void -shrink_surfaces(const struct isl_device *dev, - struct blorp_params *params, - struct brw_blorp_blit_prog_key *wm_prog_key, - struct blt_coords *coords) -{ - /* Shrink source surface */ - shrink_surface_params(dev, ¶ms->src, &coords->x.src0, &coords->x.src1, - &coords->y.src0, &coords->y.src1); - wm_prog_key->need_src_offset = false; - - /* Shrink destination surface */ - shrink_surface_params(dev, ¶ms->dst, &coords->x.dst0, &coords->x.dst1, - &coords->y.dst0, &coords->y.dst1); - wm_prog_key->need_dst_offset = false; -} - -static void do_blorp_blit(struct blorp_batch *batch, const struct blorp_params *orig_params, struct brw_blorp_blit_prog_key *wm_prog_key, @@ -2216,33 +2206,60 @@ if (orig->y.mirror) y_scale = -y_scale; + enum blit_shrink_status shrink = BLIT_NO_SHRINK; + if (split_blorp_blit_debug) { + if (can_shrink_surface(&orig_params->src)) + shrink |= BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK; + if (can_shrink_surface(&orig_params->dst)) + shrink |= BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK; + } + bool x_done, y_done; - bool shrink = split_blorp_blit_debug && can_shrink_surfaces(orig_params); do { params = *orig_params; blit_coords = split_coords; - if (shrink) - shrink_surfaces(batch->blorp->isl_dev, ¶ms, wm_prog_key, - &blit_coords); + + if (shrink & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK)) { + shrink_surface_params(batch->blorp->isl_dev, ¶ms.src, + &blit_coords.x.src0, &blit_coords.x.src1, + &blit_coords.y.src0, &blit_coords.y.src1); + wm_prog_key->need_src_offset = false; + } + + if (shrink & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK)) { + shrink_surface_params(batch->blorp->isl_dev, ¶ms.dst, + &blit_coords.x.dst0, &blit_coords.x.dst1, + &blit_coords.y.dst0, &blit_coords.y.dst1); + wm_prog_key->need_dst_offset = false; + } + enum blit_shrink_status result = try_blorp_blit(batch, ¶ms, wm_prog_key, &blit_coords); - if (result & BLIT_WIDTH_SHRINK) { + if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK)) + assert(can_shrink_surface(&orig_params->src)); + + if (result & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK)) + assert(can_shrink_surface(&orig_params->dst)); + + if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_DST_WIDTH_SHRINK)) { w /= 2.0; assert(w >= 1.0); split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); } - if (result & BLIT_HEIGHT_SHRINK) { + if (result & (BLIT_SRC_HEIGHT_SHRINK | BLIT_DST_HEIGHT_SHRINK)) { h /= 2.0; assert(h >= 1.0); split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); } - if (result != 0) { - assert(can_shrink_surfaces(orig_params)); - shrink = true; + if (result) { + /* We may get less bits set on result than we had already, so make + * sure we remember all the ways in which a resize is required. + */ + shrink |= result; continue; } @@ -2297,17 +2314,6 @@ } } - /* ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as a render target, - * which requires shader math to render to it. Blitting Z24X8 to Z24X8 - * is fairly common though, so we'd like to avoid it. Since we don't need - * to blend depth values, we can simply pick a renderable format with the - * right number of bits-per-pixel, like 8-bit BGRA. - */ - if (dst_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS && - src_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) { - src_format = dst_format = ISL_FORMAT_B8G8R8A8_UNORM; - } - brw_blorp_surface_info_init(batch->blorp, ¶ms.src, src_surf, src_level, src_layer, src_format, false); brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, dst_surf, dst_level, @@ -2428,7 +2434,7 @@ * operation between the two bit layouts. */ static enum isl_format -get_ccs_compatible_uint_format(const struct isl_format_layout *fmtl) +get_ccs_compatible_copy_format(const struct isl_format_layout *fmtl) { switch (fmtl->format) { case ISL_FORMAT_R32G32B32A32_FLOAT: @@ -2485,9 +2491,49 @@ case ISL_FORMAT_B10G10R10A2_UNORM: case ISL_FORMAT_B10G10R10A2_UNORM_SRGB: case ISL_FORMAT_R10G10B10A2_UNORM: + case ISL_FORMAT_R10G10B10A2_UNORM_SRGB: + case ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM: case ISL_FORMAT_R10G10B10A2_UINT: return ISL_FORMAT_R10G10B10A2_UINT; + case ISL_FORMAT_R16_UNORM: + case ISL_FORMAT_R16_SNORM: + case ISL_FORMAT_R16_SINT: + case ISL_FORMAT_R16_UINT: + case ISL_FORMAT_R16_FLOAT: + return ISL_FORMAT_R16_UINT; + + case ISL_FORMAT_R8G8_UNORM: + case ISL_FORMAT_R8G8_SNORM: + case ISL_FORMAT_R8G8_SINT: + case ISL_FORMAT_R8G8_UINT: + return ISL_FORMAT_R8G8_UINT; + + case ISL_FORMAT_B5G5R5X1_UNORM: + case ISL_FORMAT_B5G5R5X1_UNORM_SRGB: + case ISL_FORMAT_B5G5R5A1_UNORM: + case ISL_FORMAT_B5G5R5A1_UNORM_SRGB: + return ISL_FORMAT_B5G5R5A1_UNORM; + + case ISL_FORMAT_A4B4G4R4_UNORM: + case ISL_FORMAT_B4G4R4A4_UNORM: + case ISL_FORMAT_B4G4R4A4_UNORM_SRGB: + return ISL_FORMAT_B4G4R4A4_UNORM; + + case ISL_FORMAT_B5G6R5_UNORM: + case ISL_FORMAT_B5G6R5_UNORM_SRGB: + return ISL_FORMAT_B5G6R5_UNORM; + + case ISL_FORMAT_A1B5G5R5_UNORM: + return ISL_FORMAT_A1B5G5R5_UNORM; + + case ISL_FORMAT_A8_UNORM: + case ISL_FORMAT_R8_UNORM: + case ISL_FORMAT_R8_SNORM: + case ISL_FORMAT_R8_SINT: + case ISL_FORMAT_R8_UINT: + return ISL_FORMAT_R8_UINT; + default: unreachable("Not a compressible format"); } @@ -2578,16 +2624,26 @@ isl_format_get_layout(params.dst.surf.format); assert(params.src.aux_usage == ISL_AUX_USAGE_NONE || + params.src.aux_usage == ISL_AUX_USAGE_HIZ || params.src.aux_usage == ISL_AUX_USAGE_MCS || + params.src.aux_usage == ISL_AUX_USAGE_MCS_CCS || params.src.aux_usage == ISL_AUX_USAGE_CCS_E); assert(params.dst.aux_usage == ISL_AUX_USAGE_NONE || params.dst.aux_usage == ISL_AUX_USAGE_MCS || + params.dst.aux_usage == ISL_AUX_USAGE_MCS_CCS || params.dst.aux_usage == ISL_AUX_USAGE_CCS_E); - if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { - params.dst.view.format = get_ccs_compatible_uint_format(dst_fmtl); + if (params.src.aux_usage == ISL_AUX_USAGE_HIZ) { + /* Depth <-> Color copies are not allowed and HiZ isn't allowed in + * destinations because we draw as color. + */ + assert(params.dst.aux_usage == ISL_AUX_USAGE_NONE); + params.src.view.format = params.src.surf.format; + params.dst.view.format = params.src.surf.format; + } else if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) { + params.dst.view.format = get_ccs_compatible_copy_format(dst_fmtl); if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { - params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); + params.src.view.format = get_ccs_compatible_copy_format(src_fmtl); } else if (src_fmtl->bpb == dst_fmtl->bpb) { params.src.view.format = params.dst.view.format; } else { @@ -2595,7 +2651,7 @@ get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); } } else if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E) { - params.src.view.format = get_ccs_compatible_uint_format(src_fmtl); + params.src.view.format = get_ccs_compatible_copy_format(src_fmtl); if (src_fmtl->bpb == dst_fmtl->bpb) { params.dst.view.format = params.src.view.format; } else { @@ -2652,9 +2708,9 @@ * because BLORP likes to treat things as if they have vec4 colors all * the time anyway. */ - if (isl_format_is_rgb(src_cast_format)) + if (isl_format_get_layout(src_cast_format)->bpb % 3 == 0) src_cast_format = isl_format_rgb_to_rgba(src_cast_format); - if (isl_format_is_rgb(dst_cast_format)) + if (isl_format_get_layout(dst_cast_format)->bpb % 3 == 0) dst_cast_format = isl_format_rgb_to_rgba(dst_cast_format); if (src_cast_format != dst_cast_format) { diff -Nru mesa-19.2.8/src/intel/blorp/blorp.c mesa-20.0.8/src/intel/blorp/blorp.c --- mesa-19.2.8/src/intel/blorp/blorp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp.c 2020-06-12 01:21:17.000000000 +0000 @@ -66,6 +66,7 @@ unsigned int level, unsigned int layer, enum isl_format format, bool is_render_target) { + memset(info, 0, sizeof(*info)); assert(level < surf->surf->levels); assert(layer < MAX2(surf->surf->logical_level0_px.depth >> level, surf->surf->logical_level0_px.array_len)); @@ -82,9 +83,6 @@ if (info->aux_usage != ISL_AUX_USAGE_NONE) { info->aux_surf = *surf->aux_surf; info->aux_addr = surf->aux_addr; - assert(level < info->aux_surf.levels); - assert(layer < MAX2(info->aux_surf.logical_level0_px.depth >> level, - info->aux_surf.logical_level0_px.array_len)); } info->clear_color = surf->clear_color; @@ -205,7 +203,7 @@ const unsigned *program = brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx, wm_key, - wm_prog_data, nir, NULL, -1, -1, -1, false, use_repclear, + wm_prog_data, nir, -1, -1, -1, false, use_repclear, NULL, NULL, NULL); return program; @@ -368,3 +366,33 @@ batch->blorp->exec(batch, ¶ms); } } + +void +blorp_hiz_stencil_op(struct blorp_batch *batch, struct blorp_surf *stencil, + uint32_t level, uint32_t start_layer, + uint32_t num_layers, enum isl_aux_op op) +{ + struct blorp_params params; + blorp_params_init(¶ms); + + params.hiz_op = op; + params.full_surface_hiz_op = true; + + for (uint32_t a = 0; a < num_layers; a++) { + const uint32_t layer = start_layer + a; + + brw_blorp_surface_info_init(batch->blorp, ¶ms.stencil, stencil, level, + layer, stencil->surf->format, true); + params.x1 = minify(params.stencil.surf.logical_level0_px.width, + params.stencil.view.base_level); + params.y1 = minify(params.stencil.surf.logical_level0_px.height, + params.stencil.view.base_level); + params.dst.surf.samples = params.stencil.surf.samples; + params.dst.surf.logical_level0_px = + params.stencil.surf.logical_level0_px; + params.dst.view = params.stencil.view; + params.num_samples = params.stencil.surf.samples; + + batch->blorp->exec(batch, ¶ms); + } +} diff -Nru mesa-19.2.8/src/intel/blorp/blorp_clear.c mesa-20.0.8/src/intel/blorp/blorp_clear.c --- mesa-19.2.8/src/intel/blorp/blorp_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp_clear.c 2020-06-12 01:21:17.000000000 +0000 @@ -34,13 +34,14 @@ #define FILE_DEBUG_FLAG DEBUG_BLORP +#pragma pack(push, 1) struct brw_blorp_const_color_prog_key { enum blorp_shader_type shader_type; /* Must be BLORP_SHADER_TYPE_CLEAR */ bool use_simd16_replicated_data; bool clear_rgb_as_red; - bool pad[3]; }; +#pragma pack(pop) static bool blorp_params_get_clear_kernel(struct blorp_batch *batch, @@ -108,10 +109,12 @@ return result; } +#pragma pack(push, 1) struct layer_offset_vs_key { enum blorp_shader_type shader_type; unsigned num_inputs; }; +#pragma pack(pop) /* In the case of doing attachment clears, we are using a surface state that * is handed to us so we can't set (and don't even know) the base array layer. @@ -232,10 +235,12 @@ x_align *= 16; - /* SKL+ line alignment requirement for Y-tiled are half those of the prior - * generations. + /* The line alignment requirement for Y-tiled is halved at SKL and again + * at TGL. */ - if (dev->info->gen >= 9) + if (dev->info->gen >= 12) + y_align *= 8; + else if (dev->info->gen >= 9) y_align *= 16; else y_align *= 32; @@ -327,15 +332,11 @@ void blorp_fast_clear(struct blorp_batch *batch, - const struct blorp_surf *surf, enum isl_format format, + const struct blorp_surf *surf, + enum isl_format format, struct isl_swizzle swizzle, uint32_t level, uint32_t start_layer, uint32_t num_layers, uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) { - /* Ensure that all layers undergoing the clear have an auxiliary buffer. */ - assert(start_layer + num_layers <= - MAX2(surf->aux_surf->logical_level0_px.depth >> level, - surf->aux_surf->logical_level0_px.array_len)); - struct blorp_params params; blorp_params_init(¶ms); params.num_layers = num_layers; @@ -358,6 +359,12 @@ start_layer, format, true); params.num_samples = params.dst.surf.samples; + /* If a swizzle was provided, we need to swizzle the clear color so that + * the hardware color format conversion will work properly. + */ + params.dst.clear_color = swizzle_color_value(params.dst.clear_color, + swizzle); + batch->blorp->exec(batch, ¶ms); } @@ -565,6 +572,107 @@ } } +static bool +blorp_clear_stencil_as_rgba(struct blorp_batch *batch, + const struct blorp_surf *surf, + uint32_t level, uint32_t start_layer, + uint32_t num_layers, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1, + uint8_t stencil_mask, uint8_t stencil_value) +{ + /* We only support separate W-tiled stencil for now */ + if (surf->surf->format != ISL_FORMAT_R8_UINT || + surf->surf->tiling != ISL_TILING_W) + return false; + + /* Stencil mask support would require piles of shader magic */ + if (stencil_mask != 0xff) + return false; + + if (surf->surf->samples > 1) { + /* Adjust x0, y0, x1, and y1 to be in units of samples */ + assert(surf->surf->msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); + struct isl_extent2d msaa_px_size_sa = + isl_get_interleaved_msaa_px_size_sa(surf->surf->samples); + + x0 *= msaa_px_size_sa.w; + y0 *= msaa_px_size_sa.h; + x1 *= msaa_px_size_sa.w; + y1 *= msaa_px_size_sa.h; + } + + /* W-tiles and Y-tiles have the same layout as far as cache lines are + * concerned: both are 8x8 cache lines laid out Y-major. The difference is + * entirely in how the data is arranged withing the cache line. W-tiling + * is 8x8 pixels in a swizzled pattern while Y-tiling is 16B by 4 rows + * regardless of image format size. As long as everything is aligned to 8, + * we can just treat the W-tiled image as Y-tiled, ignore the layout + * difference within a cache line, and blast out data. + */ + if (x0 % 8 != 0 || y0 % 8 != 0 || x1 % 8 != 0 || y1 % 8 != 0) + return false; + + struct blorp_params params; + blorp_params_init(¶ms); + + if (!blorp_params_get_clear_kernel(batch, ¶ms, true, false)) + return false; + + memset(¶ms.wm_inputs.clear_color, stencil_value, + sizeof(params.wm_inputs.clear_color)); + + /* The Sandy Bridge PRM Vol. 4 Pt. 2, section 2.11.2.1.1 has the + * following footnote to the format table: + * + * 128 BPE Formats cannot be Tiled Y when used as render targets + * + * We have to use RGBA16_UINT on SNB. + */ + enum isl_format wide_format; + if (ISL_DEV_GEN(batch->blorp->isl_dev) <= 6) { + wide_format = ISL_FORMAT_R16G16B16A16_UINT; + + /* For RGBA16_UINT, we need to mask the stencil value otherwise, we risk + * clamping giving us the wrong values + */ + for (unsigned i = 0; i < 4; i++) + params.wm_inputs.clear_color[i] &= 0xffff; + } else { + wide_format = ISL_FORMAT_R32G32B32A32_UINT; + } + + for (uint32_t a = 0; a < num_layers; a++) { + uint32_t layer = start_layer + a; + + brw_blorp_surface_info_init(batch->blorp, ¶ms.dst, surf, level, + layer, ISL_FORMAT_UNSUPPORTED, true); + + if (surf->surf->samples > 1) + blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms.dst); + + /* Make it Y-tiled */ + blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms.dst); + + unsigned wide_Bpp = + isl_format_get_layout(wide_format)->bpb / 8; + + params.dst.view.format = params.dst.surf.format = wide_format; + assert(params.dst.surf.logical_level0_px.width % wide_Bpp == 0); + params.dst.surf.logical_level0_px.width /= wide_Bpp; + assert(params.dst.tile_x_sa % wide_Bpp == 0); + params.dst.tile_x_sa /= wide_Bpp; + + params.x0 = params.dst.tile_x_sa + x0 / (wide_Bpp / 2); + params.y0 = params.dst.tile_y_sa + y0 / 2; + params.x1 = params.dst.tile_x_sa + x1 / (wide_Bpp / 2); + params.y1 = params.dst.tile_y_sa + y1 / 2; + + batch->blorp->exec(batch, ¶ms); + } + + return true; +} + void blorp_clear_depth_stencil(struct blorp_batch *batch, const struct blorp_surf *depth, @@ -575,6 +683,13 @@ bool clear_depth, float depth_value, uint8_t stencil_mask, uint8_t stencil_value) { + if (!clear_depth && blorp_clear_stencil_as_rgba(batch, stencil, level, + start_layer, num_layers, + x0, y0, x1, y1, + stencil_mask, + stencil_value)) + return; + struct blorp_params params; blorp_params_init(¶ms); @@ -649,14 +764,16 @@ } bool -blorp_can_hiz_clear_depth(uint8_t gen, enum isl_format format, - uint32_t num_samples, +blorp_can_hiz_clear_depth(const struct gen_device_info *devinfo, + const struct isl_surf *surf, + enum isl_aux_usage aux_usage, + uint32_t level, uint32_t layer, uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) { /* This function currently doesn't support any gen prior to gen8 */ - assert(gen >= 8); + assert(devinfo->gen >= 8); - if (gen == 8 && format == ISL_FORMAT_R16_UNORM) { + if (devinfo->gen == 8 && surf->format == ISL_FORMAT_R16_UNORM) { /* Apply the D16 alignment restrictions. On BDW, HiZ has an 8x4 sample * block with the following property: as the number of samples increases, * the number of pixels representable by this block decreases by a factor @@ -675,7 +792,7 @@ * Table: Pixel Dimensions in a HiZ Sample Block Pre-SKL */ const struct isl_extent2d sa_block_dim = - isl_get_interleaved_msaa_px_size_sa(num_samples); + isl_get_interleaved_msaa_px_size_sa(surf->samples); const uint8_t align_px_w = 8 / sa_block_dim.w; const uint8_t align_px_h = 4 / sa_block_dim.h; @@ -695,8 +812,56 @@ if (x0 % align_px_w || y0 % align_px_h || x1 % align_px_w || y1 % align_px_h) return false; + } else if (isl_surf_supports_hiz_ccs_wt(devinfo, surf, aux_usage)) { + /* We have to set the WM_HZ_OP::FullSurfaceDepthandStencilClear bit + * whenever we clear an uninitialized HIZ buffer (as some drivers + * currently do). However, this bit seems liable to clear 16x8 pixels in + * the ZCS on Gen12 - greater than the slice alignments for depth + * buffers. + */ + assert(surf->image_alignment_el.w % 16 != 0 || + surf->image_alignment_el.h % 8 != 0); + + /* This is the hypothesis behind some corruption that was seen with the + * amd_vertex_shader_layer-layered-depth-texture-render piglit test. + * + * From the Compressed Depth Buffers section of the Bspec, under the + * Gen12 texture performant and ZCS columns: + * + * Update with clear at either 16x8 or 8x4 granularity, based on + * fs_clr or otherwise. + * + * There are a number of ways to avoid full surface CCS clears that + * overlap other slices, but for now we choose to disable fast-clears + * when an initializing clear could hit another miplevel. + * + * NOTE: Because the CCS compresses the depth buffer and not a version + * of it that has been rearranged with different alignments (like Gen8+ + * HIZ), we have to make sure that the x0 and y0 are at least 16x8 + * aligned in the context of the entire surface. + */ + uint32_t slice_x0, slice_y0; + isl_surf_get_image_offset_el(surf, level, + surf->dim == ISL_SURF_DIM_3D ? 0 : layer, + surf->dim == ISL_SURF_DIM_3D ? layer: 0, + &slice_x0, &slice_y0); + const bool max_x1_y1 = + x1 == minify(surf->logical_level0_px.width, level) && + y1 == minify(surf->logical_level0_px.height, level); + const uint32_t haligned_x1 = ALIGN(x1, surf->image_alignment_el.w); + const uint32_t valigned_y1 = ALIGN(y1, surf->image_alignment_el.h); + const bool unaligned = (slice_x0 + x0) % 16 || (slice_y0 + y0) % 8 || + max_x1_y1 ? haligned_x1 % 16 || valigned_y1 % 8 : + x1 % 16 || y1 % 8; + const bool alignment_used = surf->levels > 1 || + surf->logical_level0_px.depth > 1 || + surf->logical_level0_px.array_len > 1; + + if (unaligned && alignment_used) + return false; } - return true; + + return isl_aux_usage_has_hiz(aux_usage); } void @@ -882,7 +1047,10 @@ assert(aux_fmtl->txc == ISL_TXC_CCS); unsigned x_scaledown, y_scaledown; - if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) { + if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 12) { + x_scaledown = aux_fmtl->bw * 8; + y_scaledown = aux_fmtl->bh * 4; + } else if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) { x_scaledown = aux_fmtl->bw * 8; y_scaledown = aux_fmtl->bh * 8; } else if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 8) { @@ -893,8 +1061,8 @@ y_scaledown = aux_fmtl->bh / 2; } params.x0 = params.y0 = 0; - params.x1 = minify(params.dst.aux_surf.logical_level0_px.width, level); - params.y1 = minify(params.dst.aux_surf.logical_level0_px.height, level); + params.x1 = minify(params.dst.surf.logical_level0_px.width, level); + params.y1 = minify(params.dst.surf.logical_level0_px.height, level); params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown; params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown; @@ -931,6 +1099,7 @@ nir_imm_int(b, 1)); } +#pragma pack(push, 1) struct blorp_mcs_partial_resolve_key { enum blorp_shader_type shader_type; @@ -938,6 +1107,7 @@ bool int_format; uint32_t num_samples; }; +#pragma pack(pop) static bool blorp_params_get_mcs_partial_resolve_kernel(struct blorp_batch *batch, diff -Nru mesa-19.2.8/src/intel/blorp/blorp_genX_exec.h mesa-20.0.8/src/intel/blorp/blorp_genX_exec.h --- mesa-19.2.8/src/intel/blorp/blorp_genX_exec.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp_genX_exec.h 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,7 @@ #include "blorp_priv.h" #include "dev/gen_device_info.h" #include "common/gen_sample_positions.h" +#include "common/gen_l3_config.h" #include "genxml/gen_macros.h" /** @@ -62,12 +63,11 @@ static void blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, const struct blorp_address *addrs, + uint32_t *sizes, unsigned num_vbs); -#if GEN_GEN >= 8 -static struct blorp_address +UNUSED static struct blorp_address blorp_get_workaround_page(struct blorp_batch *batch); -#endif static void blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries, @@ -91,9 +91,14 @@ blorp_get_surface_base_address(struct blorp_batch *batch); #endif +#if GEN_GEN >= 7 +static const struct gen_l3_config * +blorp_get_l3_config(struct blorp_batch *batch); +# else static void blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size, unsigned sf_entry_size); +#endif static void blorp_emit_pipeline(struct blorp_batch *batch, @@ -184,7 +189,8 @@ */ static void emit_urb_config(struct blorp_batch *batch, - const struct blorp_params *params) + const struct blorp_params *params, + enum gen_urb_deref_block_size *deref_block_size) { /* Once vertex fetcher has written full VUE entries with complete * header the space requirement is as follows per vertex (in bytes): @@ -206,7 +212,43 @@ const unsigned sf_entry_size = params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0; +#if GEN_GEN >= 7 + assert(sf_entry_size == 0); + const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 }; + + unsigned entries[4], start[4]; + gen_get_urb_config(batch->blorp->compiler->devinfo, + blorp_get_l3_config(batch), + false, false, entry_size, + entries, start, deref_block_size); + +#if GEN_GEN == 7 && !GEN_IS_HASWELL + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall + * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, + * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL + * needs to be sent before any combination of VS associated 3DSTATE." + */ + blorp_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = blorp_get_workaround_page(batch); + } +#endif + + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = entry_size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } +#else /* GEN_GEN < 7 */ blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size); +#endif } #if GEN_GEN >= 7 @@ -344,15 +386,15 @@ memset(vb, 0, sizeof(vb)); struct blorp_address addrs[2] = {}; - uint32_t size; - blorp_emit_vertex_data(batch, params, &addrs[0], &size); - blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size, + uint32_t sizes[2]; + blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]); + blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], sizes[0], 3 * sizeof(float)); - blorp_emit_input_varying_data(batch, params, &addrs[1], &size); - blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0); + blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]); + blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], sizes[1], 0); - blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs); + blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs); const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length); uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords); @@ -645,7 +687,8 @@ static void blorp_emit_sf_config(struct blorp_batch *batch, - const struct blorp_params *params) + const struct blorp_params *params, + enum gen_urb_deref_block_size urb_deref_block_size) { const struct brw_wm_prog_data *prog_data = params->wm_prog_data; @@ -670,7 +713,11 @@ #if GEN_GEN >= 8 - blorp_emit(batch, GENX(3DSTATE_SF), sf); + blorp_emit(batch, GENX(3DSTATE_SF), sf) { +#if GEN_GEN >= 12 + sf.DerefBlockSize = urb_deref_block_size; +#endif + } blorp_emit(batch, GENX(3DSTATE_RASTER), raster) { raster.CullMode = CULLMODE_NONE; @@ -771,13 +818,6 @@ ps.BindingTableEntryCount = 1; } - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to - * disable prefetching of binding tables on A0 and B0 steppings. - * TODO: Revisit this WA on C0 stepping. - */ - if (GEN_GEN == 11) - ps.BindingTableEntryCount = 0; - /* SAMPLER_STATE prefetching is broken on Gen11 - WA_1606682166 */ if (GEN_GEN == 11) ps.SamplerCount = 0; @@ -1218,7 +1258,8 @@ uint32_t color_calc_state_offset; uint32_t depth_stencil_state_offset; - emit_urb_config(batch, params); + enum gen_urb_deref_block_size urb_deref_block_size; + emit_urb_config(batch, params, &urb_deref_block_size); if (params->wm_prog_data) { blend_state_offset = blorp_emit_blend_state(batch, params); @@ -1252,6 +1293,12 @@ (void)depth_stencil_state_offset; #endif +#if GEN_GEN >= 12 + blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) { + /* Update empty push constants for all stages (bitmask = 11111b) */ + pc.ShaderUpdateEnable = 0x1f; + } +#else blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs); #if GEN_GEN >= 7 blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs); @@ -1259,6 +1306,7 @@ #endif blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs); blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps); +#endif if (params->src.enabled) blorp_emit_sampler_state(batch); @@ -1292,7 +1340,7 @@ clip.PerspectiveDivideDisable = true; } - blorp_emit_sf_config(batch, params); + blorp_emit_sf_config(batch, params, urb_deref_block_size); blorp_emit_ps_config(batch, params); blorp_emit_cc_viewport(batch); @@ -1355,10 +1403,13 @@ surf.dim = ISL_SURF_DIM_2D; } - /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */ + if (isl_aux_usage_has_hiz(surface->aux_usage)) { + /* BLORP doesn't render with depth so we can't use HiZ */ + assert(!is_render_target); + /* We can't reinterpret HiZ */ + assert(surface->surf.format == surface->view.format); + } enum isl_aux_usage aux_usage = surface->aux_usage; - if (aux_usage == ISL_AUX_USAGE_HIZ) - aux_usage = ISL_AUX_USAGE_NONE; isl_channel_mask_t write_disable_mask = 0; if (is_render_target && GEN_GEN <= 5) { @@ -1520,6 +1571,9 @@ */ blorp_emit(batch, GENX(PIPE_CONTROL), pipe) { pipe.StateCacheInvalidationEnable = true; +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = true; +#endif } } #endif @@ -1573,7 +1627,7 @@ params->depth.addr, 0); info.hiz_usage = params->depth.aux_usage; - if (info.hiz_usage == ISL_AUX_USAGE_HIZ) { + if (isl_aux_usage_has_hiz(info.hiz_usage)) { info.hiz_surf = ¶ms->depth.aux_surf; struct blorp_address hiz_address = params->depth.aux_addr; @@ -1601,6 +1655,7 @@ if (params->stencil.enabled) { info.stencil_surf = ¶ms->stencil.surf; + info.stencil_aux_usage = params->stencil.aux_usage; struct blorp_address stencil_address = params->stencil.addr; #if GEN_GEN == 6 /* Sandy bridge hardware does not technically support mipmapped stencil. @@ -1621,6 +1676,20 @@ } isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info); + +#if GEN_GEN >= 12 + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + */ + blorp_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = blorp_get_workaround_page(batch); + } +#endif } #if GEN_GEN >= 8 @@ -1636,11 +1705,18 @@ */ assert(params->depth.enabled || params->stencil.enabled); - /* The stencil buffer should only be enabled if a fast clear operation is - * requested. + /* The stencil buffer should only be enabled on GEN == 12, if a fast clear + * or full resolve operation is requested. On rest of the GEN, if a fast + * clear operation is requested. */ - if (params->stencil.enabled) + if (params->stencil.enabled) { +#if GEN_GEN >= 12 + assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR || + params->hiz_op == ISL_AUX_OP_FULL_RESOLVE); +#else assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR); +#endif + } /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP: * @@ -1696,7 +1772,13 @@ break; case ISL_AUX_OP_FULL_RESOLVE: assert(params->full_surface_hiz_op); - hzp.DepthBufferResolveEnable = true; + hzp.DepthBufferResolveEnable = params->depth.enabled; +#if GEN_GEN >= 12 + if (params->stencil.enabled) { + assert(params->stencil.aux_usage == ISL_AUX_USAGE_CCS_E); + hzp.StencilBufferResolveEnable = true; + } +#endif break; case ISL_AUX_OP_AMBIGUATE: assert(params->full_surface_hiz_op); @@ -1757,7 +1839,9 @@ .MemoryAddress = clear_addr); /* dw starts at dword 1, but we need to fill dwords 3 and 5 */ dw[2] = info->clear_color.u32[0]; + dw[3] = 0; dw[4] = info->clear_color.u32[1]; + dw[5] = 0; clear_addr.offset += 8; dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords, @@ -1769,20 +1853,60 @@ .MemoryAddress = clear_addr); /* dw starts at dword 1, but we need to fill dwords 3 and 5 */ dw[2] = info->clear_color.u32[2]; + dw[3] = 0; dw[4] = info->clear_color.u32[3]; + dw[5] = 0; blorp_emit(batch, GENX(PIPE_CONTROL), pipe) { pipe.StateCacheInvalidationEnable = true; pipe.TextureCacheInvalidationEnable = true; } #elif GEN_GEN >= 9 + + /* According to GEN:BUG:2201730850, in the Clear Color Programming Note + * under the Red channel, "Software shall write the converted Depth + * Clear to this dword." The only depth formats listed under the red + * channel are IEEE_FP and UNORM24_X8. These two requirements are + * incompatible with the UNORM16 depth format, so just ignore that case + * and simply perform the conversion for all depth formats. + */ + union isl_color_value fixed_color = info->clear_color; + if (GEN_GEN == 12 && isl_surf_usage_is_depth(info->surf.usage)) { + isl_color_value_pack(&info->clear_color, info->surf.format, + fixed_color.u32); + } + for (int i = 0; i < 4; i++) { blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = info->clear_color_addr; sdi.Address.offset += i * 4; - sdi.ImmediateData = info->clear_color.u32[i]; + sdi.ImmediateData = fixed_color.u32[i]; +#if GEN_GEN >= 12 + if (i == 3) + sdi.ForceWriteCompletionCheck = true; +#endif + } + } + +/* The RENDER_SURFACE_STATE::ClearColor field states that software should + * write the converted depth value 16B after the clear address: + * + * 3D Sampler will always fetch clear depth from the location 16-bytes + * above this address, where the clear depth, converted to native + * surface format by software, will be stored. + * + */ +#if GEN_GEN >= 12 + if (isl_surf_usage_is_depth(info->surf.usage)) { + blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = info->clear_color_addr; + sdi.Address.offset += 4 * 4; + sdi.ImmediateData = fixed_color.u32[0]; + sdi.ForceWriteCompletionCheck = true; } } +#endif + #elif GEN_GEN >= 7 blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = info->clear_color_addr; diff -Nru mesa-19.2.8/src/intel/blorp/blorp.h mesa-20.0.8/src/intel/blorp/blorp.h --- mesa-19.2.8/src/intel/blorp/blorp.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp.h 2020-06-12 01:21:17.000000000 +0000 @@ -165,7 +165,8 @@ void blorp_fast_clear(struct blorp_batch *batch, - const struct blorp_surf *surf, enum isl_format format, + const struct blorp_surf *surf, + enum isl_format format, struct isl_swizzle swizzle, uint32_t level, uint32_t start_layer, uint32_t num_layers, uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1); @@ -188,10 +189,11 @@ bool clear_depth, float depth_value, uint8_t stencil_mask, uint8_t stencil_value); bool -blorp_can_hiz_clear_depth(uint8_t gen, enum isl_format format, - uint32_t num_samples, - uint32_t x0, uint32_t y0, - uint32_t x1, uint32_t y1); +blorp_can_hiz_clear_depth(const struct gen_device_info *devinfo, + const struct isl_surf *surf, + enum isl_aux_usage aux_usage, + uint32_t level, uint32_t layer, + uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1); void blorp_hiz_clear_depth_stencil(struct blorp_batch *batch, const struct blorp_surf *depth, @@ -245,6 +247,10 @@ uint32_t level, uint32_t start_layer, uint32_t num_layers, enum isl_aux_op op); +void +blorp_hiz_stencil_op(struct blorp_batch *batch, struct blorp_surf *stencil, + uint32_t level, uint32_t start_layer, + uint32_t num_layers, enum isl_aux_op op); #ifdef __cplusplus } /* end extern "C" */ #endif /* __cplusplus */ diff -Nru mesa-19.2.8/src/intel/blorp/blorp_priv.h mesa-20.0.8/src/intel/blorp/blorp_priv.h --- mesa-19.2.8/src/intel/blorp/blorp_priv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/blorp/blorp_priv.h 2020-06-12 01:21:17.000000000 +0000 @@ -83,6 +83,12 @@ struct brw_blorp_surface_info *info, uint32_t *x, uint32_t *y, uint32_t *width, uint32_t *height); +void +blorp_surf_fake_interleaved_msaa(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info); +void +blorp_surf_retile_w_to_y(const struct isl_device *isl_dev, + struct brw_blorp_surface_info *info); struct brw_blorp_coord_transform diff -Nru mesa-19.2.8/src/intel/common/gen_aux_map.c mesa-20.0.8/src/intel/common/gen_aux_map.c --- mesa-19.2.8/src/intel/common/gen_aux_map.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_aux_map.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,623 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * The aux map provides a multi-level lookup of the main surface address which + * ends up providing information about the auxiliary surface data, including + * the address where the auxiliary data resides. + * + * The 48-bit VMA (GPU) address of the main surface is split to do the address + * lookup: + * + * 48 bit address of main surface + * +--------+--------+--------+------+ + * | 47:36 | 35:24 | 23:16 | 15:0 | + * | L3-idx | L2-idx | L1-idx | ... | + * +--------+--------+--------+------+ + * + * The GFX_AUX_TABLE_BASE_ADDR points to a buffer. The L3 Table Entry is + * located by indexing into this buffer as a uint64_t array using the L3-idx + * value. The 64-bit L3 entry is defined as: + * + * +-------+-------------+------+---+ + * | 63:48 | 47:15 | 14:1 | 0 | + * | ... | L2-tbl-addr | ... | V | + * +-------+-------------+------+---+ + * + * If the `V` (valid) bit is set, then the L2-tbl-addr gives the address for + * the level-2 table entries, with the lower address bits filled with zero. + * The L2 Table Entry is located by indexing into this buffer as a uint64_t + * array using the L2-idx value. The 64-bit L2 entry is similar to the L3 + * entry, except with 2 additional address bits: + * + * +-------+-------------+------+---+ + * | 63:48 | 47:13 | 12:1 | 0 | + * | ... | L1-tbl-addr | ... | V | + * +-------+-------------+------+---+ + * + * If the `V` bit is set, then the L1-tbl-addr gives the address for the + * level-1 table entries, with the lower address bits filled with zero. The L1 + * Table Entry is located by indexing into this buffer as a uint64_t array + * using the L1-idx value. The 64-bit L1 entry is defined as: + * + * +--------+------+-------+-------+-------+---------------+-----+---+ + * | 63:58 | 57 | 56:54 | 53:52 | 51:48 | 47:8 | 7:1 | 0 | + * | Format | Y/Cr | Depth | TM | ... | aux-data-addr | ... | V | + * +--------+------+-------+-------+-------+---------------+-----+---+ + * + * Where: + * - Format: See `get_format_encoding` + * - Y/Cr: 0=not-Y/Cr, 1=Y/Cr + * - (bit) Depth: See `get_bpp_encoding` + * - TM (Tile-mode): 0=Ys, 1=Y, 2=rsvd, 3=rsvd + * - aux-data-addr: VMA/GPU address for the aux-data + * - V: entry is valid + */ + +#include "gen_aux_map.h" +#include "gen_gem.h" + +#include "dev/gen_device_info.h" + +#include "drm-uapi/i915_drm.h" +#include "util/list.h" +#include "util/ralloc.h" +#include "util/u_atomic.h" +#include "main/macros.h" + +#include +#include +#include +#include + +static const bool aux_map_debug = false; + +struct aux_map_buffer { + struct list_head link; + struct gen_buffer *buffer; +}; + +struct gen_aux_map_context { + void *driver_ctx; + pthread_mutex_t mutex; + struct gen_mapped_pinned_buffer_alloc *buffer_alloc; + uint32_t num_buffers; + struct list_head buffers; + uint64_t level3_base_addr; + uint64_t *level3_map; + uint32_t tail_offset, tail_remaining; + uint32_t state_num; +}; + +static bool +add_buffer(struct gen_aux_map_context *ctx) +{ + struct aux_map_buffer *buf = ralloc(ctx, struct aux_map_buffer); + if (!buf) + return false; + + const uint32_t size = 0x100000; + buf->buffer = ctx->buffer_alloc->alloc(ctx->driver_ctx, size); + if (!buf->buffer) { + ralloc_free(buf); + return false; + } + + assert(buf->buffer->map != NULL); + + list_addtail(&buf->link, &ctx->buffers); + ctx->tail_offset = 0; + ctx->tail_remaining = size; + p_atomic_inc(&ctx->num_buffers); + + return true; +} + +static void +advance_current_pos(struct gen_aux_map_context *ctx, uint32_t size) +{ + assert(ctx->tail_remaining >= size); + ctx->tail_remaining -= size; + ctx->tail_offset += size; +} + +static bool +align_and_verify_space(struct gen_aux_map_context *ctx, uint32_t size, + uint32_t align) +{ + if (ctx->tail_remaining < size) + return false; + + struct aux_map_buffer *tail = + list_last_entry(&ctx->buffers, struct aux_map_buffer, link); + uint64_t gpu = tail->buffer->gpu + ctx->tail_offset; + uint64_t aligned = align64(gpu, align); + + if ((aligned - gpu) + size > ctx->tail_remaining) { + return false; + } else { + if (aligned - gpu > 0) + advance_current_pos(ctx, aligned - gpu); + return true; + } +} + +static void +get_current_pos(struct gen_aux_map_context *ctx, uint64_t *gpu, uint64_t **map) +{ + assert(!list_is_empty(&ctx->buffers)); + struct aux_map_buffer *tail = + list_last_entry(&ctx->buffers, struct aux_map_buffer, link); + if (gpu) + *gpu = tail->buffer->gpu + ctx->tail_offset; + if (map) + *map = (uint64_t*)((uint8_t*)tail->buffer->map + ctx->tail_offset); +} + +static bool +add_sub_table(struct gen_aux_map_context *ctx, uint32_t size, + uint32_t align, uint64_t *gpu, uint64_t **map) +{ + if (!align_and_verify_space(ctx, size, align)) { + if (!add_buffer(ctx)) + return false; + UNUSED bool aligned = align_and_verify_space(ctx, size, align); + assert(aligned); + } + get_current_pos(ctx, gpu, map); + memset(*map, 0, size); + advance_current_pos(ctx, size); + return true; +} + +uint32_t +gen_aux_map_get_state_num(struct gen_aux_map_context *ctx) +{ + return p_atomic_read(&ctx->state_num); +} + +struct gen_aux_map_context * +gen_aux_map_init(void *driver_ctx, + struct gen_mapped_pinned_buffer_alloc *buffer_alloc, + const struct gen_device_info *devinfo) +{ + struct gen_aux_map_context *ctx; + if (devinfo->gen < 12) + return NULL; + + ctx = ralloc(NULL, struct gen_aux_map_context); + if (!ctx) + return NULL; + + if (pthread_mutex_init(&ctx->mutex, NULL)) + return NULL; + + ctx->driver_ctx = driver_ctx; + ctx->buffer_alloc = buffer_alloc; + ctx->num_buffers = 0; + list_inithead(&ctx->buffers); + ctx->tail_offset = 0; + ctx->tail_remaining = 0; + ctx->state_num = 0; + + if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &ctx->level3_base_addr, + &ctx->level3_map)) { + if (aux_map_debug) + fprintf(stderr, "AUX-MAP L3: 0x%"PRIx64", map=%p\n", + ctx->level3_base_addr, ctx->level3_map); + p_atomic_inc(&ctx->state_num); + return ctx; + } else { + ralloc_free(ctx); + return NULL; + } +} + +void +gen_aux_map_finish(struct gen_aux_map_context *ctx) +{ + if (!ctx) + return; + + pthread_mutex_destroy(&ctx->mutex); + list_for_each_entry_safe(struct aux_map_buffer, buf, &ctx->buffers, link) { + ctx->buffer_alloc->free(ctx->driver_ctx, buf->buffer); + list_del(&buf->link); + p_atomic_dec(&ctx->num_buffers); + ralloc_free(buf); + } + + ralloc_free(ctx); +} + +uint64_t +gen_aux_map_get_base(struct gen_aux_map_context *ctx) +{ + /** + * This get initialized in gen_aux_map_init, and never changes, so there is + * no need to lock the mutex. + */ + return ctx->level3_base_addr; +} + +static struct aux_map_buffer * +find_buffer(struct gen_aux_map_context *ctx, uint64_t addr) +{ + list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) { + if (buf->buffer->gpu <= addr && buf->buffer->gpu_end > addr) { + return buf; + } + } + return NULL; +} + +static uint64_t * +get_u64_entry_ptr(struct gen_aux_map_context *ctx, uint64_t addr) +{ + struct aux_map_buffer *buf = find_buffer(ctx, addr); + assert(buf); + uintptr_t map_offset = addr - buf->buffer->gpu; + return (uint64_t*)((uint8_t*)buf->buffer->map + map_offset); +} + +static uint8_t +get_format_encoding(const struct isl_surf *isl_surf) +{ + switch(isl_surf->format) { + case ISL_FORMAT_R32G32B32A32_FLOAT: return 0x11; + case ISL_FORMAT_R32G32B32X32_FLOAT: return 0x11; + case ISL_FORMAT_R32G32B32A32_SINT: return 0x12; + case ISL_FORMAT_R32G32B32A32_UINT: return 0x13; + case ISL_FORMAT_R16G16B16A16_UNORM: return 0x14; + case ISL_FORMAT_R16G16B16A16_SNORM: return 0x15; + case ISL_FORMAT_R16G16B16A16_SINT: return 0x16; + case ISL_FORMAT_R16G16B16A16_UINT: return 0x17; + case ISL_FORMAT_R16G16B16A16_FLOAT: return 0x10; + case ISL_FORMAT_R16G16B16X16_FLOAT: return 0x10; + case ISL_FORMAT_R32G32_FLOAT: return 0x11; + case ISL_FORMAT_R32G32_SINT: return 0x12; + case ISL_FORMAT_R32G32_UINT: return 0x13; + case ISL_FORMAT_B8G8R8A8_UNORM: return 0xA; + case ISL_FORMAT_B8G8R8X8_UNORM: return 0xA; + case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: return 0xA; + case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: return 0xA; + case ISL_FORMAT_R10G10B10A2_UNORM: return 0x18; + case ISL_FORMAT_R10G10B10A2_UNORM_SRGB: return 0x18; + case ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM: return 0x19; + case ISL_FORMAT_R10G10B10A2_UINT: return 0x1A; + case ISL_FORMAT_R8G8B8A8_UNORM: return 0xA; + case ISL_FORMAT_R8G8B8A8_UNORM_SRGB: return 0xA; + case ISL_FORMAT_R8G8B8A8_SNORM: return 0x1B; + case ISL_FORMAT_R8G8B8A8_SINT: return 0x1C; + case ISL_FORMAT_R8G8B8A8_UINT: return 0x1D; + case ISL_FORMAT_R16G16_UNORM: return 0x14; + case ISL_FORMAT_R16G16_SNORM: return 0x15; + case ISL_FORMAT_R16G16_SINT: return 0x16; + case ISL_FORMAT_R16G16_UINT: return 0x17; + case ISL_FORMAT_R16G16_FLOAT: return 0x10; + case ISL_FORMAT_B10G10R10A2_UNORM: return 0x18; + case ISL_FORMAT_B10G10R10A2_UNORM_SRGB: return 0x18; + case ISL_FORMAT_R11G11B10_FLOAT: return 0x1E; + case ISL_FORMAT_R32_SINT: return 0x12; + case ISL_FORMAT_R32_UINT: return 0x13; + case ISL_FORMAT_R32_FLOAT: return 0x11; + case ISL_FORMAT_R24_UNORM_X8_TYPELESS: return 0x11; + case ISL_FORMAT_B5G6R5_UNORM: return 0xA; + case ISL_FORMAT_B5G6R5_UNORM_SRGB: return 0xA; + case ISL_FORMAT_B5G5R5A1_UNORM: return 0xA; + case ISL_FORMAT_B5G5R5A1_UNORM_SRGB: return 0xA; + case ISL_FORMAT_B4G4R4A4_UNORM: return 0xA; + case ISL_FORMAT_B4G4R4A4_UNORM_SRGB: return 0xA; + case ISL_FORMAT_R8G8_UNORM: return 0xA; + case ISL_FORMAT_R8G8_SNORM: return 0x1B; + case ISL_FORMAT_R8G8_SINT: return 0x1C; + case ISL_FORMAT_R8G8_UINT: return 0x1D; + case ISL_FORMAT_R16_UNORM: return 0x14; + case ISL_FORMAT_R16_SNORM: return 0x15; + case ISL_FORMAT_R16_SINT: return 0x16; + case ISL_FORMAT_R16_UINT: return 0x17; + case ISL_FORMAT_R16_FLOAT: return 0x10; + case ISL_FORMAT_B5G5R5X1_UNORM: return 0xA; + case ISL_FORMAT_B5G5R5X1_UNORM_SRGB: return 0xA; + case ISL_FORMAT_A1B5G5R5_UNORM: return 0xA; + case ISL_FORMAT_A4B4G4R4_UNORM: return 0xA; + case ISL_FORMAT_R8_UNORM: return 0xA; + case ISL_FORMAT_R8_SNORM: return 0x1B; + case ISL_FORMAT_R8_SINT: return 0x1C; + case ISL_FORMAT_R8_UINT: return 0x1D; + case ISL_FORMAT_A8_UNORM: return 0xA; + default: + unreachable("Unsupported aux-map format!"); + return 0; + } +} + +static uint8_t +get_bpp_encoding(uint16_t bpp) +{ + switch (bpp) { + case 16: return 0; + case 10: return 1; + case 12: return 2; + case 8: return 4; + case 32: return 5; + case 64: return 6; + case 128: return 7; + default: + unreachable("Unsupported bpp!"); + return 0; + } +} + +#define GEN_AUX_MAP_ENTRY_Y_TILED_BIT (0x1ull << 52) + +uint64_t +gen_aux_map_format_bits_for_isl_surf(const struct isl_surf *isl_surf) +{ + const struct isl_format_layout *fmtl = + isl_format_get_layout(isl_surf->format); + + uint16_t bpp = fmtl->bpb; + assert(fmtl->bw == 1 && fmtl->bh == 1 && fmtl->bd == 1); + if (aux_map_debug) + fprintf(stderr, "AUX-MAP entry %s, bpp=%d\n", + isl_format_get_name(isl_surf->format), bpp); + + assert(isl_tiling_is_any_y(isl_surf->tiling)); + + uint64_t format_bits = + ((uint64_t)get_format_encoding(isl_surf) << 58) | + ((uint64_t)get_bpp_encoding(bpp) << 54) | + GEN_AUX_MAP_ENTRY_Y_TILED_BIT; + + assert((format_bits & GEN_AUX_MAP_FORMAT_BITS_MASK) == format_bits); + + return format_bits; +} + +static void +get_aux_entry(struct gen_aux_map_context *ctx, uint64_t address, + uint32_t *l1_index_out, uint64_t *l1_entry_addr_out, + uint64_t **l1_entry_map_out) +{ + uint32_t l3_index = (address >> 36) & 0xfff; + uint64_t *l3_entry = &ctx->level3_map[l3_index]; + + uint64_t *l2_map; + if ((*l3_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + uint64_t l2_gpu; + if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &l2_gpu, &l2_map)) { + if (aux_map_debug) + fprintf(stderr, "AUX-MAP L3[0x%x]: 0x%"PRIx64", map=%p\n", + l3_index, l2_gpu, l2_map); + } else { + unreachable("Failed to add L2 Aux-Map Page Table!"); + } + *l3_entry = (l2_gpu & 0xffffffff8000ULL) | 1; + } else { + uint64_t l2_addr = gen_canonical_address(*l3_entry & ~0x7fffULL); + l2_map = get_u64_entry_ptr(ctx, l2_addr); + } + uint32_t l2_index = (address >> 24) & 0xfff; + uint64_t *l2_entry = &l2_map[l2_index]; + + uint64_t l1_addr, *l1_map; + if ((*l2_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + if (add_sub_table(ctx, 8 * 1024, 8 * 1024, &l1_addr, &l1_map)) { + if (aux_map_debug) + fprintf(stderr, "AUX-MAP L2[0x%x]: 0x%"PRIx64", map=%p\n", + l2_index, l1_addr, l1_map); + } else { + unreachable("Failed to add L1 Aux-Map Page Table!"); + } + *l2_entry = (l1_addr & 0xffffffffe000ULL) | 1; + } else { + l1_addr = gen_canonical_address(*l2_entry & ~0x1fffULL); + l1_map = get_u64_entry_ptr(ctx, l1_addr); + } + uint32_t l1_index = (address >> 16) & 0xff; + if (l1_index_out) + *l1_index_out = l1_index; + if (l1_entry_addr_out) + *l1_entry_addr_out = l1_addr + l1_index * sizeof(*l1_map); + if (l1_entry_map_out) + *l1_entry_map_out = &l1_map[l1_index]; +} + +static void +add_mapping(struct gen_aux_map_context *ctx, uint64_t address, + uint64_t aux_address, uint64_t format_bits, + bool *state_changed) +{ + if (aux_map_debug) + fprintf(stderr, "AUX-MAP 0x%"PRIx64" => 0x%"PRIx64"\n", address, + aux_address); + + uint32_t l1_index; + uint64_t *l1_entry; + get_aux_entry(ctx, address, &l1_index, NULL, &l1_entry); + + const uint64_t l1_data = + (aux_address & GEN_AUX_MAP_ADDRESS_MASK) | + format_bits | + GEN_AUX_MAP_ENTRY_VALID_BIT; + + const uint64_t current_l1_data = *l1_entry; + if ((current_l1_data & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + assert((aux_address & 0xffULL) == 0); + if (aux_map_debug) + fprintf(stderr, "AUX-MAP L1[0x%x] 0x%"PRIx64" -> 0x%"PRIx64"\n", + l1_index, current_l1_data, l1_data); + /** + * We use non-zero bits in 63:1 to indicate the entry had been filled + * previously. If these bits are non-zero and they don't exactly match + * what we want to program into the entry, then we must force the + * aux-map tables to be flushed. + */ + if (current_l1_data != 0 && \ + (current_l1_data | GEN_AUX_MAP_ENTRY_VALID_BIT) != l1_data) + *state_changed = true; + *l1_entry = l1_data; + } else { + if (aux_map_debug) + fprintf(stderr, "AUX-MAP L1[0x%x] is already marked valid!\n", + l1_index); + assert(*l1_entry == l1_data); + } +} + +uint64_t * +gen_aux_map_get_entry(struct gen_aux_map_context *ctx, + uint64_t address, + uint64_t *entry_address) +{ + pthread_mutex_lock(&ctx->mutex); + uint64_t *l1_entry_map; + get_aux_entry(ctx, address, NULL, entry_address, &l1_entry_map); + pthread_mutex_unlock(&ctx->mutex); + + return l1_entry_map; +} + +void +gen_aux_map_add_mapping(struct gen_aux_map_context *ctx, uint64_t address, + uint64_t aux_address, uint64_t main_size_B, + uint64_t format_bits) +{ + bool state_changed = false; + pthread_mutex_lock(&ctx->mutex); + uint64_t map_addr = address; + uint64_t dest_aux_addr = aux_address; + assert(align64(address, GEN_AUX_MAP_MAIN_PAGE_SIZE) == address); + assert(align64(aux_address, GEN_AUX_MAP_AUX_PAGE_SIZE) == aux_address); + while (map_addr - address < main_size_B) { + add_mapping(ctx, map_addr, dest_aux_addr, format_bits, &state_changed); + map_addr += GEN_AUX_MAP_MAIN_PAGE_SIZE; + dest_aux_addr += GEN_AUX_MAP_AUX_PAGE_SIZE; + } + pthread_mutex_unlock(&ctx->mutex); + if (state_changed) + p_atomic_inc(&ctx->state_num); +} + +void +gen_aux_map_add_image(struct gen_aux_map_context *ctx, + const struct isl_surf *isl_surf, uint64_t address, + uint64_t aux_address) +{ + gen_aux_map_add_mapping(ctx, address, aux_address, isl_surf->size_B, + gen_aux_map_format_bits_for_isl_surf(isl_surf)); +} + +/** + * We mark the leaf entry as invalid, but we don't attempt to cleanup the + * other levels of translation mappings. Since we attempt to re-use VMA + * ranges, hopefully this will not lead to unbounded growth of the translation + * tables. + */ +static void +remove_mapping(struct gen_aux_map_context *ctx, uint64_t address, + bool *state_changed) +{ + uint32_t l3_index = (address >> 36) & 0xfff; + uint64_t *l3_entry = &ctx->level3_map[l3_index]; + + uint64_t *l2_map; + if ((*l3_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + return; + } else { + uint64_t l2_addr = gen_canonical_address(*l3_entry & ~0x7fffULL); + l2_map = get_u64_entry_ptr(ctx, l2_addr); + } + uint32_t l2_index = (address >> 24) & 0xfff; + uint64_t *l2_entry = &l2_map[l2_index]; + + uint64_t *l1_map; + if ((*l2_entry & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + return; + } else { + uint64_t l1_addr = gen_canonical_address(*l2_entry & ~0x1fffULL); + l1_map = get_u64_entry_ptr(ctx, l1_addr); + } + uint32_t l1_index = (address >> 16) & 0xff; + uint64_t *l1_entry = &l1_map[l1_index]; + + const uint64_t current_l1_data = *l1_entry; + const uint64_t l1_data = current_l1_data & ~1ull; + + if ((current_l1_data & GEN_AUX_MAP_ENTRY_VALID_BIT) == 0) { + return; + } else { + if (aux_map_debug) + fprintf(stderr, "AUX-MAP [0x%x][0x%x][0x%x] L1 entry removed!\n", + l3_index, l2_index, l1_index); + /** + * We use non-zero bits in 63:1 to indicate the entry had been filled + * previously. In the unlikely event that these are all zero, we force a + * flush of the aux-map tables. + */ + if (unlikely(l1_data == 0)) + *state_changed = true; + *l1_entry = l1_data; + } +} + +void +gen_aux_map_unmap_range(struct gen_aux_map_context *ctx, uint64_t address, + uint64_t size) +{ + bool state_changed = false; + pthread_mutex_lock(&ctx->mutex); + if (aux_map_debug) + fprintf(stderr, "AUX-MAP remove 0x%"PRIx64"-0x%"PRIx64"\n", address, + address + size); + + uint64_t map_addr = address; + assert(align64(address, GEN_AUX_MAP_MAIN_PAGE_SIZE) == address); + while (map_addr - address < size) { + remove_mapping(ctx, map_addr, &state_changed); + map_addr += 64 * 1024; + } + pthread_mutex_unlock(&ctx->mutex); + if (state_changed) + p_atomic_inc(&ctx->state_num); +} + +uint32_t +gen_aux_map_get_num_buffers(struct gen_aux_map_context *ctx) +{ + return p_atomic_read(&ctx->num_buffers); +} + +void +gen_aux_map_fill_bos(struct gen_aux_map_context *ctx, void **driver_bos, + uint32_t max_bos) +{ + assert(p_atomic_read(&ctx->num_buffers) >= max_bos); + uint32_t i = 0; + list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) { + if (i >= max_bos) + return; + driver_bos[i++] = buf->buffer->driver_bo; + } +} diff -Nru mesa-19.2.8/src/intel/common/gen_aux_map.h mesa-20.0.8/src/intel/common/gen_aux_map.h --- mesa-19.2.8/src/intel/common/gen_aux_map.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_aux_map.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GEN_AUX_MAP_H +#define GEN_AUX_MAP_H + +#include "gen_buffer_alloc.h" + +#include "isl/isl.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Auxiliary surface mapping implementation + * + * These functions are implemented in common code shared by drivers. + */ + +struct gen_aux_map_context; +struct gen_device_info; + +#define GEN_AUX_MAP_ADDRESS_MASK 0x0000ffffffffff00ull +#define GEN_AUX_MAP_FORMAT_BITS_MASK 0xfff0000000000000ull +#define GEN_AUX_MAP_ENTRY_VALID_BIT 0x1ull +#define GEN_AUX_MAP_GEN12_CCS_SCALE 256 +#define GEN_AUX_MAP_MAIN_PAGE_SIZE (64 * 1024) +#define GEN_AUX_MAP_AUX_PAGE_SIZE \ + (GEN_AUX_MAP_MAIN_PAGE_SIZE / GEN_AUX_MAP_GEN12_CCS_SCALE) + +struct gen_aux_map_context * +gen_aux_map_init(void *driver_ctx, + struct gen_mapped_pinned_buffer_alloc *buffer_alloc, + const struct gen_device_info *devinfo); + +void +gen_aux_map_finish(struct gen_aux_map_context *ctx); + +uint32_t +gen_aux_map_get_state_num(struct gen_aux_map_context *ctx); + +/** Returns the current number of buffers used by the aux-map tables + * + * When preparing to execute a new batch, use this function to determine how + * many buffers will be required. More buffers may be added by concurrent + * accesses of the aux-map functions, but they won't be required for since + * they involve surfaces not used by this batch. + */ +uint32_t +gen_aux_map_get_num_buffers(struct gen_aux_map_context *ctx); + +/** Fill an array of exec_object2 with aux-map buffer handles + * + * The gen_aux_map_get_num_buffers call should be made, then the driver can + * make sure the `obj` array is large enough before calling this function. + */ +void +gen_aux_map_fill_bos(struct gen_aux_map_context *ctx, void **driver_bos, + uint32_t max_bos); + +uint64_t +gen_aux_map_get_base(struct gen_aux_map_context *ctx); + +uint64_t +gen_aux_map_format_bits_for_isl_surf(const struct isl_surf *isl_surf); + +uint64_t * +gen_aux_map_get_entry(struct gen_aux_map_context *ctx, + uint64_t address, + uint64_t *entry_address); + +void +gen_aux_map_add_mapping(struct gen_aux_map_context *ctx, uint64_t address, + uint64_t aux_address, uint64_t main_size_B, + uint64_t format_bits); + +void +gen_aux_map_add_image(struct gen_aux_map_context *ctx, + const struct isl_surf *isl_surf, uint64_t address, + uint64_t aux_address); + +void +gen_aux_map_unmap_range(struct gen_aux_map_context *ctx, uint64_t address, + uint64_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* GEN_AUX_MAP_H */ diff -Nru mesa-19.2.8/src/intel/common/gen_batch_decoder.c mesa-20.0.8/src/intel/common/gen_batch_decoder.c --- mesa-19.2.8/src/intel/common/gen_batch_decoder.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_batch_decoder.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,8 @@ struct gen_batch_decode_bo (*get_bo)(void *, bool, uint64_t), - unsigned (*get_state_size)(void *, uint32_t), + unsigned (*get_state_size)(void *, uint64_t, + uint64_t), void *user_data) { memset(ctx, 0, sizeof(*ctx)); @@ -110,14 +111,15 @@ static int update_count(struct gen_batch_decode_ctx *ctx, - uint32_t offset_from_dsba, + uint64_t address, + uint64_t base_address, unsigned element_dwords, unsigned guess) { unsigned size = 0; if (ctx->get_state_size) - size = ctx->get_state_size(ctx->user_data, offset_from_dsba); + size = ctx->get_state_size(ctx->user_data, address, base_address); if (size > 0) return size / (sizeof(uint32_t) * element_dwords); @@ -249,8 +251,10 @@ return; } - if (count < 0) - count = update_count(ctx, offset, 1, 8); + if (count < 0) { + count = update_count(ctx, ctx->surface_base + offset, + ctx->surface_base, 1, 8); + } if (offset % 32 != 0 || offset >= UINT16_MAX) { fprintf(ctx->fp, " invalid binding table pointer\n"); @@ -289,11 +293,13 @@ dump_samplers(struct gen_batch_decode_ctx *ctx, uint32_t offset, int count) { struct gen_group *strct = gen_spec_find_struct(ctx->spec, "SAMPLER_STATE"); + uint64_t state_addr = ctx->dynamic_base + offset; - if (count < 0) - count = update_count(ctx, offset, strct->dw_length, 4); + if (count < 0) { + count = update_count(ctx, state_addr, ctx->dynamic_base, + strct->dw_length, 4); + } - uint64_t state_addr = ctx->dynamic_base + offset; struct gen_batch_decode_bo bo = ctx_get_bo(ctx, true, state_addr); const void *state_map = bo.map; @@ -580,7 +586,52 @@ ctx_disassemble_program(ctx, ksp[1], "SIMD16 fragment shader"); if (enabled[2]) ctx_disassemble_program(ctx, ksp[2], "SIMD32 fragment shader"); - fprintf(ctx->fp, "\n"); + + if (enabled[0] || enabled[1] || enabled[2]) + fprintf(ctx->fp, "\n"); +} + +static void +decode_3dstate_constant_all(struct gen_batch_decode_ctx *ctx, const uint32_t *p) +{ + struct gen_group *inst = + gen_spec_find_instruction(ctx->spec, ctx->engine, p); + struct gen_group *body = + gen_spec_find_struct(ctx->spec, "3DSTATE_CONSTANT_ALL_DATA"); + + uint32_t read_length[4]; + struct gen_batch_decode_bo buffer[4]; + memset(buffer, 0, sizeof(buffer)); + + struct gen_field_iterator outer; + gen_field_iterator_init(&outer, inst, p, 0, false); + int idx = 0; + while (gen_field_iterator_next(&outer)) { + if (outer.struct_desc != body) + continue; + + struct gen_field_iterator iter; + gen_field_iterator_init(&iter, body, &outer.p[outer.start_bit / 32], + 0, false); + while (gen_field_iterator_next(&iter)) { + if (!strcmp(iter.name, "Pointer To Constant Buffer")) { + buffer[idx] = ctx_get_bo(ctx, true, iter.raw_value); + } else if (!strcmp(iter.name, "Constant Buffer Read Length")) { + read_length[idx] = iter.raw_value; + } + } + idx++; + } + + for (int i = 0; i < 4; i++) { + if (read_length[i] == 0 || buffer[i].map == NULL) + continue; + + unsigned size = read_length[i] * 32; + fprintf(ctx->fp, "constant buffer %d, size %u\n", i, size); + + ctx_print_buffer(ctx, buffer[i], size, 0, -1); + } } static void @@ -631,6 +682,20 @@ } static void +decode_gen6_3dstate_binding_table_pointers(struct gen_batch_decode_ctx *ctx, + const uint32_t *p) +{ + fprintf(ctx->fp, "VS Binding Table:\n"); + dump_binding_table(ctx, p[1], -1); + + fprintf(ctx->fp, "GS Binding Table:\n"); + dump_binding_table(ctx, p[2], -1); + + fprintf(ctx->fp, "PS Binding Table:\n"); + dump_binding_table(ctx, p[3], -1); +} + +static void decode_3dstate_binding_table_pointers(struct gen_batch_decode_ctx *ctx, const uint32_t *p) { @@ -706,7 +771,8 @@ state = gen_spec_find_struct(ctx->spec, struct_type); } - count = update_count(ctx, state_offset, state->dw_length, count); + count = update_count(ctx, ctx->dynamic_base + state_offset, + ctx->dynamic_base, state->dw_length, count); for (int i = 0; i < count; i++) { fprintf(ctx->fp, "%s %d\n", struct_type, i); @@ -784,12 +850,15 @@ { "3DSTATE_DS", decode_single_ksp }, { "3DSTATE_HS", decode_single_ksp }, { "3DSTATE_PS", decode_ps_kernels }, + { "3DSTATE_WM", decode_ps_kernels }, { "3DSTATE_CONSTANT_VS", decode_3dstate_constant }, { "3DSTATE_CONSTANT_GS", decode_3dstate_constant }, { "3DSTATE_CONSTANT_PS", decode_3dstate_constant }, { "3DSTATE_CONSTANT_HS", decode_3dstate_constant }, { "3DSTATE_CONSTANT_DS", decode_3dstate_constant }, + { "3DSTATE_CONSTANT_ALL", decode_3dstate_constant_all }, + { "3DSTATE_BINDING_TABLE_POINTERS", decode_gen6_3dstate_binding_table_pointers }, { "3DSTATE_BINDING_TABLE_POINTERS_VS", decode_3dstate_binding_table_pointers }, { "3DSTATE_BINDING_TABLE_POINTERS_HS", decode_3dstate_binding_table_pointers }, { "3DSTATE_BINDING_TABLE_POINTERS_DS", decode_3dstate_binding_table_pointers }, diff -Nru mesa-19.2.8/src/intel/common/gen_buffer_alloc.h mesa-20.0.8/src/intel/common/gen_buffer_alloc.h --- mesa-19.2.8/src/intel/common/gen_buffer_alloc.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_buffer_alloc.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GEN_BUFFER_ALLOC_H +#define GEN_BUFFER_ALLOC_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct gen_buffer { + uint64_t gpu; + uint64_t gpu_end; + void *map; + void *driver_bo; +}; + +struct gen_mapped_pinned_buffer_alloc { + struct gen_buffer * (*alloc)(void *driver_ctx, uint32_t size); + void (*free)(void *driver_ctx, struct gen_buffer *buffer); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* GEN_BUFFER_ALLOC_H */ diff -Nru mesa-19.2.8/src/intel/common/gen_decoder.c mesa-20.0.8/src/intel/common/gen_decoder.c --- mesa-19.2.8/src/intel/common/gen_decoder.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_decoder.c 2020-06-12 01:21:17.000000000 +0000 @@ -738,10 +738,10 @@ gen_spec_load_from_path(const struct gen_device_info *devinfo, const char *path) { - size_t len, filename_len = strlen(path) + 20; + size_t filename_len = strlen(path) + 20; char *filename = malloc(filename_len); - len = snprintf(filename, filename_len, "%s/gen%i.xml", + ASSERTED size_t len = snprintf(filename, filename_len, "%s/gen%i.xml", path, devinfo_to_gen(devinfo, false)); assert(len < filename_len); diff -Nru mesa-19.2.8/src/intel/common/gen_decoder.h mesa-20.0.8/src/intel/common/gen_decoder.h --- mesa-19.2.8/src/intel/common/gen_decoder.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_decoder.h 2020-06-12 01:21:17.000000000 +0000 @@ -231,7 +231,8 @@ */ struct gen_batch_decode_bo (*get_bo)(void *user_data, bool ppgtt, uint64_t address); unsigned (*get_state_size)(void *user_data, - uint32_t offset_from_dynamic_state_base_addr); + uint64_t address, + uint64_t base_address); void *user_data; FILE *fp; @@ -259,7 +260,8 @@ bool, uint64_t), - unsigned (*get_state_size)(void *, uint32_t), + unsigned (*get_state_size)(void *, uint64_t, + uint64_t), void *user_data); void gen_batch_decode_ctx_finish(struct gen_batch_decode_ctx *ctx); diff -Nru mesa-19.2.8/src/intel/common/gen_l3_config.c mesa-20.0.8/src/intel/common/gen_l3_config.c --- mesa-19.2.8/src/intel/common/gen_l3_config.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_l3_config.c 2020-06-12 01:21:17.000000000 +0000 @@ -146,6 +146,15 @@ {{ 0 }} }; +/** + * TGL validated L3 configurations. \sa tgl_l3_configs. + */ +static const struct gen_l3_config tgl_l3_configs[] = { + /* SLM URB ALL DC RO IS C T */ + {{ 0, 32, 88, 0, 0, 0, 0, 0 }}, + {{ 0, 16, 104, 0, 0, 0, 0, 0 }}, + {{ 0 }} +}; /** * Return a zero-terminated array of validated L3 configurations for the @@ -172,6 +181,9 @@ case 11: return icl_l3_configs; + case 12: + return tgl_l3_configs; + default: unreachable("Not implemented"); } diff -Nru mesa-19.2.8/src/intel/common/gen_l3_config.h mesa-20.0.8/src/intel/common/gen_l3_config.h --- mesa-19.2.8/src/intel/common/gen_l3_config.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_l3_config.h 2020-06-12 01:21:17.000000000 +0000 @@ -92,10 +92,17 @@ void gen_dump_l3_config(const struct gen_l3_config *cfg, FILE *fp); +enum gen_urb_deref_block_size { + GEN_URB_DEREF_BLOCK_SIZE_32 = 0, + GEN_URB_DEREF_BLOCK_SIZE_PER_POLY = 1, + GEN_URB_DEREF_BLOCK_SIZE_8 = 2, +}; + void gen_get_urb_config(const struct gen_device_info *devinfo, - unsigned push_constant_bytes, unsigned urb_size_bytes, + const struct gen_l3_config *l3_cfg, bool tess_present, bool gs_present, const unsigned entry_size[4], - unsigned entries[4], unsigned start[4]); + unsigned entries[4], unsigned start[4], + enum gen_urb_deref_block_size *deref_block_size); #endif /* GEN_L3_CONFIG_H */ diff -Nru mesa-19.2.8/src/intel/common/gen_mi_builder.h mesa-20.0.8/src/intel/common/gen_mi_builder.h --- mesa-19.2.8/src/intel/common/gen_mi_builder.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_mi_builder.h 2020-06-12 01:21:17.000000000 +0000 @@ -358,6 +358,9 @@ case GEN_MI_VALUE_TYPE_IMM: gen_mi_builder_emit(b, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = dst.addr; +#if GEN_GEN >= 12 + sdi.ForceWriteCompletionCheck = true; +#endif sdi.ImmediateData = src.imm; } break; diff -Nru mesa-19.2.8/src/intel/common/gen_urb_config.c mesa-20.0.8/src/intel/common/gen_urb_config.c --- mesa-19.2.8/src/intel/common/gen_urb_config.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/gen_urb_config.c 2020-06-12 01:21:17.000000000 +0000 @@ -59,19 +59,24 @@ */ void gen_get_urb_config(const struct gen_device_info *devinfo, - unsigned push_constant_bytes, unsigned urb_size_bytes, + const struct gen_l3_config *l3_cfg, bool tess_present, bool gs_present, const unsigned entry_size[4], - unsigned entries[4], unsigned start[4]) + unsigned entries[4], unsigned start[4], + enum gen_urb_deref_block_size *deref_block_size) { + const unsigned urb_size_kB = gen_get_l3_config_urb_size(devinfo, l3_cfg); + const unsigned push_constant_kB = + (devinfo->gen >= 8 || (devinfo->is_haswell && devinfo->gt == 3)) ? 32 : 16; + const bool active[4] = { true, tess_present, tess_present, gs_present }; /* URB allocations must be done in 8k chunks. */ - const unsigned chunk_size_bytes = 8192; + const unsigned chunk_size_kB = 8; + const unsigned chunk_size_bytes = chunk_size_kB * 1024; - const unsigned push_constant_chunks = - push_constant_bytes / chunk_size_bytes; - const unsigned urb_chunks = urb_size_bytes / chunk_size_bytes; + const unsigned push_constant_chunks = push_constant_kB / chunk_size_kB; + const unsigned urb_chunks = urb_size_kB / chunk_size_kB; /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): * @@ -205,4 +210,43 @@ start[i] = 0; } } + + if (deref_block_size) { + if (devinfo->gen >= 12) { + /* From the Gen12 BSpec: + * + * "Deref Block size depends on the last enabled shader and number + * of handles programmed for that shader + * + * 1) For GS last shader enabled cases, the deref block is + * always set to a per poly(within hardware) + * + * If the last enabled shader is VS or DS. + * + * 1) If DS is last enabled shader then if the number of DS + * handles is less than 324, need to set per poly deref. + * + * 2) If VS is last enabled shader then if the number of VS + * handles is less than 192, need to set per poly deref" + * + * The default is 32 so we assume that's the right choice if we're + * not in one of the explicit cases listed above. + */ + if (gs_present) { + *deref_block_size = GEN_URB_DEREF_BLOCK_SIZE_PER_POLY; + } else if (tess_present) { + if (entries[MESA_SHADER_TESS_EVAL] < 324) + *deref_block_size = GEN_URB_DEREF_BLOCK_SIZE_PER_POLY; + else + *deref_block_size = GEN_URB_DEREF_BLOCK_SIZE_32; + } else { + if (entries[MESA_SHADER_VERTEX] < 192) + *deref_block_size = GEN_URB_DEREF_BLOCK_SIZE_PER_POLY; + else + *deref_block_size = GEN_URB_DEREF_BLOCK_SIZE_32; + } + } else { + *deref_block_size = 0; + } + } } diff -Nru mesa-19.2.8/src/intel/common/meson.build mesa-20.0.8/src/intel/common/meson.build --- mesa-19.2.8/src/intel/common/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/common/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -21,6 +21,9 @@ # TODO: android? files_libintel_common = files( + 'gen_aux_map.c', + 'gen_aux_map.h', + 'gen_buffer_alloc.h', 'gen_clflush.h', 'gen_batch_decoder.c', 'gen_decoder.c', @@ -59,7 +62,7 @@ if install_intel_gpu_tests foreach g : [['70', 'gen7'], ['75', 'hsw'], ['80', 'gen8'], - ['90', 'gen9'], ['110', 'gen11']] + ['90', 'gen9'], ['110', 'gen11'], ['120', 'gen12']] executable( 'intel_@0@_mi_builder_test'.format(g[1]), files('tests/gen_mi_builder_test.cpp'), diff -Nru mesa-19.2.8/src/intel/compiler/brw_cfg.cpp mesa-20.0.8/src/intel/compiler/brw_cfg.cpp --- mesa-19.2.8/src/intel/compiler/brw_cfg.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_cfg.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -44,12 +44,21 @@ } static exec_node * -link(void *mem_ctx, bblock_t *block) +link(void *mem_ctx, bblock_t *block, enum bblock_link_kind kind) { - bblock_link *l = new(mem_ctx) bblock_link(block); + bblock_link *l = new(mem_ctx) bblock_link(block, kind); return &l->link; } +void +push_stack(exec_list *list, void *mem_ctx, bblock_t *block) +{ + /* The kind of the link is immaterial, but we need to provide one since + * this is (ab)using the edge data structure in order to implement a stack. + */ + list->push_tail(link(mem_ctx, block, bblock_link_logical)); +} + bblock_t::bblock_t(cfg_t *cfg) : cfg(cfg), idom(NULL), start_ip(0), end_ip(0), num(0), cycle_count(0) { @@ -59,17 +68,19 @@ } void -bblock_t::add_successor(void *mem_ctx, bblock_t *successor) +bblock_t::add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind) { - successor->parents.push_tail(::link(mem_ctx, this)); - children.push_tail(::link(mem_ctx, successor)); + successor->parents.push_tail(::link(mem_ctx, this, kind)); + children.push_tail(::link(mem_ctx, successor, kind)); } bool -bblock_t::is_predecessor_of(const bblock_t *block) const +bblock_t::is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const { foreach_list_typed_safe (bblock_link, parent, link, &block->parents) { - if (parent->block == this) { + if (parent->block == this && parent->kind <= kind) { return true; } } @@ -78,10 +89,11 @@ } bool -bblock_t::is_successor_of(const bblock_t *block) const +bblock_t::is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const { foreach_list_typed_safe (bblock_link, child, link, &block->children) { - if (child->block == this) { + if (child->block == this && child->kind <= kind) { return true; } } @@ -185,8 +197,8 @@ /* Push our information onto a stack so we can recover from * nested ifs. */ - if_stack.push_tail(link(mem_ctx, cur_if)); - else_stack.push_tail(link(mem_ctx, cur_else)); + push_stack(&if_stack, mem_ctx, cur_if); + push_stack(&else_stack, mem_ctx, cur_else); cur_if = cur; cur_else = NULL; @@ -196,7 +208,7 @@ * instructions. */ next = new_block(); - cur_if->add_successor(mem_ctx, next); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); set_next_block(&cur, next, ip); break; @@ -208,7 +220,8 @@ next = new_block(); assert(cur_if != NULL); - cur_if->add_successor(mem_ctx, next); + cur_if->add_successor(mem_ctx, next, bblock_link_logical); + cur_else->add_successor(mem_ctx, next, bblock_link_physical); set_next_block(&cur, next, ip); break; @@ -220,7 +233,7 @@ } else { cur_endif = new_block(); - cur->add_successor(mem_ctx, cur_endif); + cur->add_successor(mem_ctx, cur_endif, bblock_link_logical); set_next_block(&cur, cur_endif, ip - 1); } @@ -228,10 +241,10 @@ cur->instructions.push_tail(inst); if (cur_else) { - cur_else->add_successor(mem_ctx, cur_endif); + cur_else->add_successor(mem_ctx, cur_endif, bblock_link_logical); } else { assert(cur_if != NULL); - cur_if->add_successor(mem_ctx, cur_endif); + cur_if->add_successor(mem_ctx, cur_endif, bblock_link_logical); } assert(cur_if->end()->opcode == BRW_OPCODE_IF); @@ -246,8 +259,8 @@ /* Push our information onto a stack so we can recover from * nested loops. */ - do_stack.push_tail(link(mem_ctx, cur_do)); - while_stack.push_tail(link(mem_ctx, cur_while)); + push_stack(&do_stack, mem_ctx, cur_do); + push_stack(&while_stack, mem_ctx, cur_while); /* Set up the block just after the while. Don't know when exactly * it will start, yet. @@ -260,7 +273,7 @@ } else { cur_do = new_block(); - cur->add_successor(mem_ctx, cur_do); + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); set_next_block(&cur, cur_do, ip - 1); } @@ -294,8 +307,8 @@ * corruption. */ next = new_block(); - cur->add_successor(mem_ctx, next); - cur->add_successor(mem_ctx, cur_while); + cur->add_successor(mem_ctx, next, bblock_link_logical); + cur->add_successor(mem_ctx, cur_while, bblock_link_physical); set_next_block(&cur, next, ip); break; @@ -316,11 +329,13 @@ * loop, the top of the loop again, into a use of the variable). */ assert(cur_do != NULL); - cur->add_successor(mem_ctx, cur_do->next()); + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); next = new_block(); if (inst->predicate) - cur->add_successor(mem_ctx, next); + cur->add_successor(mem_ctx, next, bblock_link_logical); + else + cur->add_successor(mem_ctx, next, bblock_link_physical); set_next_block(&cur, next, ip); break; @@ -339,11 +354,12 @@ * See the DO case for additional explanation. */ assert(cur_do != NULL); - cur->add_successor(mem_ctx, cur_do); + cur->add_successor(mem_ctx, cur_do, bblock_link_physical); + cur->add_successor(mem_ctx, cur_while, bblock_link_logical); next = new_block(); if (inst->predicate) - cur->add_successor(mem_ctx, next); + cur->add_successor(mem_ctx, next, bblock_link_logical); set_next_block(&cur, next, ip); break; @@ -362,8 +378,11 @@ * channels, so we may skip over the divergence point at the top of * the loop to keep the CFG as unambiguous as possible. */ - cur->add_successor(mem_ctx, inst->predicate ? cur_do : - cur_do->next()); + if (inst->predicate) { + cur->add_successor(mem_ctx, cur_do, bblock_link_logical); + } else { + cur->add_successor(mem_ctx, cur_do->next(), bblock_link_logical); + } set_next_block(&cur, cur_while, ip); @@ -403,9 +422,11 @@ /* Add removed-block's successors to its predecessors' successor lists. */ foreach_list_typed (bblock_link, successor, link, &block->children) { - if (!successor->block->is_successor_of(predecessor->block)) { + if (!successor->block->is_successor_of(predecessor->block, + successor->kind)) { predecessor->block->children.push_tail(link(mem_ctx, - successor->block)); + successor->block, + successor->kind)); } } } @@ -422,9 +443,11 @@ /* Add removed-block's predecessors to its successors' predecessor lists. */ foreach_list_typed (bblock_link, predecessor, link, &block->parents) { - if (!predecessor->block->is_predecessor_of(successor->block)) { + if (!predecessor->block->is_predecessor_of(successor->block, + predecessor->kind)) { successor->block->parents.push_tail(link(mem_ctx, - predecessor->block)); + predecessor->block, + predecessor->kind)); } } } @@ -487,7 +510,8 @@ fprintf(stderr, "START B%d IDOM(none)", block->num); foreach_list_typed(bblock_link, link, link, &block->parents) { - fprintf(stderr, " <-B%d", + fprintf(stderr, " <%cB%d", + link->kind == bblock_link_logical ? '-' : '~', link->block->num); } fprintf(stderr, "\n"); @@ -495,7 +519,8 @@ block->dump(s); fprintf(stderr, "END B%d", block->num); foreach_list_typed(bblock_link, link, link, &block->children) { - fprintf(stderr, " ->B%d", + fprintf(stderr, " %c>B%d", + link->kind == bblock_link_logical ? '-' : '~', link->block->num); } fprintf(stderr, "\n"); diff -Nru mesa-19.2.8/src/intel/compiler/brw_cfg.h mesa-20.0.8/src/intel/compiler/brw_cfg.h --- mesa-19.2.8/src/intel/compiler/brw_cfg.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_cfg.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,18 +32,42 @@ struct bblock_t; +/** + * CFG edge types. + * + * A logical edge represents a potential control flow path of the original + * scalar program, while a physical edge represents a control flow path that + * may not have existed in the original program but was introduced during + * vectorization in order to implement divergent control flow of different + * shader invocations within the same SIMD thread. + * + * All logical edges in the CFG are considered to be physical edges but not + * the other way around -- I.e. the logical CFG is a subset of the physical + * one. + */ +enum bblock_link_kind { + bblock_link_logical = 0, + bblock_link_physical +}; + struct bblock_link { #ifdef __cplusplus DECLARE_RALLOC_CXX_OPERATORS(bblock_link) - bblock_link(bblock_t *block) - : block(block) + bblock_link(bblock_t *block, enum bblock_link_kind kind) + : block(block), kind(kind) { } #endif struct exec_node link; struct bblock_t *block; + + /* Type of this CFG edge. Because bblock_link_logical also implies + * bblock_link_physical, the proper way to test for membership of edge 'l' + * in CFG kind 'k' is 'l.kind <= k'. + */ + enum bblock_link_kind kind; }; struct backend_instruction; @@ -54,9 +78,12 @@ explicit bblock_t(cfg_t *cfg); - void add_successor(void *mem_ctx, bblock_t *successor); - bool is_predecessor_of(const bblock_t *block) const; - bool is_successor_of(const bblock_t *block) const; + void add_successor(void *mem_ctx, bblock_t *successor, + enum bblock_link_kind kind); + bool is_predecessor_of(const bblock_t *block, + enum bblock_link_kind kind) const; + bool is_successor_of(const bblock_t *block, + enum bblock_link_kind kind) const; bool can_combine_with(const bblock_t *that) const; void combine_with(bblock_t *that); void dump(backend_shader *s) const; diff -Nru mesa-19.2.8/src/intel/compiler/brw_compiler.c mesa-20.0.8/src/intel/compiler/brw_compiler.c --- mesa-19.2.8/src/intel/compiler/brw_compiler.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_compiler.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,6 +50,7 @@ .lower_base_vertex = true #define COMMON_SCALAR_OPTIONS \ + .lower_to_scalar = true, \ .lower_pack_half_2x16 = true, \ .lower_pack_snorm_2x16 = true, \ .lower_pack_snorm_4x8 = true, \ @@ -60,6 +61,8 @@ .lower_unpack_snorm_4x8 = true, \ .lower_unpack_unorm_2x16 = true, \ .lower_unpack_unorm_4x8 = true, \ + .lower_usub_sat64 = true, \ + .lower_hadd64 = true, \ .max_unroll_iterations = 32 static const struct nir_shader_compiler_options scalar_nir_options = { @@ -100,7 +103,8 @@ compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false); compiler->use_tcs_8_patch = - devinfo->gen >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH); + devinfo->gen >= 12 || + (devinfo->gen >= 9 && (INTEL_DEBUG & DEBUG_TCS_EIGHT_PATCH)); if (devinfo->gen >= 10) { /* We don't support vec4 mode on Cannonlake. */ @@ -137,7 +141,7 @@ nir_lower_dsub | nir_lower_ddiv; - if (!devinfo->has_64bit_types || (INTEL_DEBUG & DEBUG_SOFT64)) { + if (!devinfo->has_64bit_float || (INTEL_DEBUG & DEBUG_SOFT64)) { int64_options |= nir_lower_mov64 | nir_lower_icmp64 | nir_lower_iadd64 | @@ -185,12 +189,16 @@ */ nir_options->lower_ffma = devinfo->gen < 6; nir_options->lower_flrp32 = devinfo->gen < 6 || devinfo->gen >= 11; + nir_options->lower_fpow = devinfo->gen >= 12; nir_options->lower_rotate = devinfo->gen < 11; nir_options->lower_bitfield_reverse = devinfo->gen < 7; nir_options->lower_int64_options = int64_options; nir_options->lower_doubles_options = fp64_options; + + nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT; + compiler->glsl_compiler_options[i].NirOptions = nir_options; compiler->glsl_compiler_options[i].ClampBlockIndicesToArrayBounds = true; diff -Nru mesa-19.2.8/src/intel/compiler/brw_compiler.h mesa-20.0.8/src/intel/compiler/brw_compiler.h --- mesa-19.2.8/src/intel/compiler/brw_compiler.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_compiler.h 2020-06-12 01:21:17.000000000 +0000 @@ -83,10 +83,10 @@ uint8_t *ra_reg_to_grf; /** - * ra class for the aligned pairs we use for PLN, which doesn't + * ra class for the aligned barycentrics we use for PLN, which doesn't * appear in *classes. */ - int aligned_pairs_class; + int aligned_bary_class; } fs_reg_sets[3]; void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); @@ -119,6 +119,12 @@ * whether nir_opt_large_constants will be run. */ bool supports_shader_constants; + + /** + * Whether or not the driver wants uniform params to be compacted by the + * back-end compiler. + */ + bool compact_params; }; /** @@ -654,6 +660,9 @@ unsigned program_size; + /** Does this program pull from any UBO or other constant buffers? */ + bool has_ubo_pull; + /** * Register where the thread expects to find input data from the URB * (typically uniforms, followed by vertex or fragment attributes). @@ -769,6 +778,11 @@ */ uint32_t flat_inputs; + /** + * The FS inputs + */ + uint64_t inputs; + /* Mapping of VUE slots to interpolation modes. * Used by the Gen4-5 clip/sf/wm stages. */ @@ -780,6 +794,14 @@ * For varying slots that are not used by the FS, the value is -1. */ int urb_setup[VARYING_SLOT_MAX]; + + /** + * Cache structure into the urb_setup array above that contains the + * attribute numbers of active varyings out of urb_setup. + * The actual count is stored in urb_setup_attribs_count. + */ + uint8_t urb_setup_attribs[VARYING_SLOT_MAX]; + uint8_t urb_setup_attribs_count; }; /** Returns the SIMD width corresponding to a given KSP index @@ -1223,11 +1245,16 @@ struct brw_cs_prog_data cs; }; -#define DEFINE_PROG_DATA_DOWNCAST(stage) \ -static inline struct brw_##stage##_prog_data * \ -brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \ -{ \ - return (struct brw_##stage##_prog_data *) prog_data; \ +#define DEFINE_PROG_DATA_DOWNCAST(stage) \ +static inline struct brw_##stage##_prog_data * \ +brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \ +{ \ + return (struct brw_##stage##_prog_data *) prog_data; \ +} \ +static inline const struct brw_##stage##_prog_data * \ +brw_##stage##_prog_data_const(const struct brw_stage_prog_data *prog_data) \ +{ \ + return (const struct brw_##stage##_prog_data *) prog_data; \ } DEFINE_PROG_DATA_DOWNCAST(vue) DEFINE_PROG_DATA_DOWNCAST(vs) @@ -1319,7 +1346,6 @@ const struct brw_vue_map *input_vue_map, struct brw_tes_prog_data *prog_data, struct nir_shader *shader, - struct gl_program *prog, int shader_time_index, struct brw_compile_stats *stats, char **error_str); @@ -1383,7 +1409,6 @@ const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, struct nir_shader *shader, - struct gl_program *prog, int shader_time_index8, int shader_time_index16, int shader_time_index32, @@ -1460,7 +1485,7 @@ * to do a full test run with brw_fs_test_dispatch_packing() hooked up to * the NIR front-end before changing this assertion. */ - assert(devinfo->gen <= 11); + assert(devinfo->gen <= 12); switch (stage) { case MESA_SHADER_FRAGMENT: { diff -Nru mesa-19.2.8/src/intel/compiler/brw_disasm.c mesa-20.0.8/src/intel/compiler/brw_disasm.c --- mesa-19.2.8/src/intel/compiler/brw_disasm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_disasm.c 2020-06-12 01:21:17.000000000 +0000 @@ -92,8 +92,11 @@ static bool is_split_send(UNUSED const struct gen_device_info *devinfo, unsigned opcode) { - return opcode == BRW_OPCODE_SENDS || - opcode == BRW_OPCODE_SENDSC; + if (devinfo->gen >= 12) + return is_send(opcode); + else + return opcode == BRW_OPCODE_SENDS || + opcode == BRW_OPCODE_SENDSC; } const char *const conditional_modifier[16] = { @@ -497,6 +500,14 @@ [GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm", }; +static const char *const sync_function[16] = { + [TGL_SYNC_NOP] = "nop", + [TGL_SYNC_ALLRD] = "allrd", + [TGL_SYNC_ALLWR] = "allwr", + [TGL_SYNC_BAR] = "bar", + [TGL_SYNC_HOST] = "host", +}; + static const char *const math_saturate[2] = { [0] = "", [1] = "sat" @@ -740,7 +751,11 @@ /* These are fixed for split sends */ type = BRW_REGISTER_TYPE_UD; elem_size = 4; - if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + if (devinfo->gen >= 12) { + err |= reg(file, brw_inst_send_dst_reg_file(devinfo, inst), + brw_inst_dst_da_reg_nr(devinfo, inst)); + string(file, brw_reg_type_to_letters(type)); + } else if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { err |= reg(file, brw_inst_send_dst_reg_file(devinfo, inst), brw_inst_dst_da_reg_nr(devinfo, inst)); unsigned subreg_nr = brw_inst_dst_da16_subreg_nr(devinfo, inst); @@ -814,10 +829,15 @@ unsigned subreg_nr; enum brw_reg_type type; - if (is_align1 && brw_inst_3src_a1_dst_reg_file(devinfo, inst)) - reg_file = BRW_ARCHITECTURE_REGISTER_FILE; - else if (devinfo->gen == 6 && brw_inst_3src_a16_dst_reg_file(devinfo, inst)) + if (devinfo->gen < 10 && is_align1) + return 0; + + if (devinfo->gen == 6 && brw_inst_3src_a16_dst_reg_file(devinfo, inst)) reg_file = BRW_MESSAGE_REGISTER_FILE; + else if (devinfo->gen >= 12) + reg_file = brw_inst_3src_a1_dst_reg_file(devinfo, inst); + else if (is_align1 && brw_inst_3src_a1_dst_reg_file(devinfo, inst)) + reg_file = BRW_ARCHITECTURE_REGISTER_FILE; else reg_file = BRW_GENERAL_REGISTER_FILE; @@ -987,11 +1007,16 @@ } static enum brw_vertical_stride -vstride_from_align1_3src_vstride(enum gen10_align1_3src_vertical_stride vstride) +vstride_from_align1_3src_vstride(const struct gen_device_info *devinfo, + enum gen10_align1_3src_vertical_stride vstride) { switch (vstride) { case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0: return BRW_VERTICAL_STRIDE_0; - case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2: return BRW_VERTICAL_STRIDE_2; + case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2: + if (devinfo->gen >= 12) + return BRW_VERTICAL_STRIDE_1; + else + return BRW_VERTICAL_STRIDE_2; case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4: return BRW_VERTICAL_STRIDE_4; case BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8: return BRW_VERTICAL_STRIDE_8; default: @@ -1079,19 +1104,18 @@ bool is_scalar_region; bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1; + if (devinfo->gen < 10 && is_align1) + return 0; + if (is_align1) { - if (brw_inst_3src_a1_src0_reg_file(devinfo, inst) == - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { + if (devinfo->gen >= 12 && !brw_inst_3src_a1_src0_is_imm(devinfo, inst)) { + _file = brw_inst_3src_a1_src0_reg_file(devinfo, inst); + } else if (brw_inst_3src_a1_src0_reg_file(devinfo, inst) == + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { _file = BRW_GENERAL_REGISTER_FILE; - reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst); - subreg_nr = brw_inst_3src_a1_src0_subreg_nr(devinfo, inst); - type = brw_inst_3src_a1_src0_type(devinfo, inst); } else if (brw_inst_3src_a1_src0_type(devinfo, inst) == BRW_REGISTER_TYPE_NF) { _file = BRW_ARCHITECTURE_REGISTER_FILE; - reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst); - subreg_nr = brw_inst_3src_a1_src0_subreg_nr(devinfo, inst); - type = brw_inst_3src_a1_src0_type(devinfo, inst); } else { _file = BRW_IMMEDIATE_VALUE; uint16_t imm_val = brw_inst_3src_a1_src0_imm(devinfo, inst); @@ -1102,13 +1126,16 @@ } else if (type == BRW_REGISTER_TYPE_UW) { format(file, "0x%04xUW", imm_val); } else if (type == BRW_REGISTER_TYPE_HF) { - format(file, "%-gF", _mesa_half_to_float(imm_val)); + format(file, "0x%04xHF", imm_val); } return 0; } + reg_nr = brw_inst_3src_src0_reg_nr(devinfo, inst); + subreg_nr = brw_inst_3src_a1_src0_subreg_nr(devinfo, inst); + type = brw_inst_3src_a1_src0_type(devinfo, inst); _vert_stride = vstride_from_align1_3src_vstride( - brw_inst_3src_a1_src0_vstride(devinfo, inst)); + devinfo, brw_inst_3src_a1_src0_vstride(devinfo, inst)); _horiz_stride = hstride_from_align1_3src_hstride( brw_inst_3src_a1_src0_hstride(devinfo, inst)); _width = implied_width(_vert_stride, _horiz_stride); @@ -1163,9 +1190,14 @@ bool is_scalar_region; bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1; + if (devinfo->gen < 10 && is_align1) + return 0; + if (is_align1) { - if (brw_inst_3src_a1_src1_reg_file(devinfo, inst) == - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { + if (devinfo->gen >= 12) { + _file = brw_inst_3src_a1_src1_reg_file(devinfo, inst); + } else if (brw_inst_3src_a1_src1_reg_file(devinfo, inst) == + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { _file = BRW_GENERAL_REGISTER_FILE; } else { _file = BRW_ARCHITECTURE_REGISTER_FILE; @@ -1176,7 +1208,7 @@ type = brw_inst_3src_a1_src1_type(devinfo, inst); _vert_stride = vstride_from_align1_3src_vstride( - brw_inst_3src_a1_src1_vstride(devinfo, inst)); + devinfo, brw_inst_3src_a1_src1_vstride(devinfo, inst)); _horiz_stride = hstride_from_align1_3src_hstride( brw_inst_3src_a1_src1_hstride(devinfo, inst)); _width = implied_width(_vert_stride, _horiz_stride); @@ -1231,13 +1263,15 @@ bool is_scalar_region; bool is_align1 = brw_inst_3src_access_mode(devinfo, inst) == BRW_ALIGN_1; + if (devinfo->gen < 10 && is_align1) + return 0; + if (is_align1) { - if (brw_inst_3src_a1_src2_reg_file(devinfo, inst) == - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { + if (devinfo->gen >= 12 && !brw_inst_3src_a1_src2_is_imm(devinfo, inst)) { + _file = brw_inst_3src_a1_src2_reg_file(devinfo, inst); + } else if (brw_inst_3src_a1_src2_reg_file(devinfo, inst) == + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE) { _file = BRW_GENERAL_REGISTER_FILE; - reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst); - subreg_nr = brw_inst_3src_a1_src2_subreg_nr(devinfo, inst); - type = brw_inst_3src_a1_src2_type(devinfo, inst); } else { _file = BRW_IMMEDIATE_VALUE; uint16_t imm_val = brw_inst_3src_a1_src2_imm(devinfo, inst); @@ -1248,11 +1282,14 @@ } else if (type == BRW_REGISTER_TYPE_UW) { format(file, "0x%04xUW", imm_val); } else if (type == BRW_REGISTER_TYPE_HF) { - format(file, "%-gF", _mesa_half_to_float(imm_val)); + format(file, "0x%04xHF", imm_val); } return 0; } + reg_nr = brw_inst_3src_src2_reg_nr(devinfo, inst); + subreg_nr = brw_inst_3src_a1_src2_subreg_nr(devinfo, inst); + type = brw_inst_3src_a1_src2_type(devinfo, inst); /* FINISHME: No vertical stride on src2. Is using the hstride in place * correct? Doesn't seem like it, since there's hstride=1 but * no vstride=1. @@ -1372,12 +1409,13 @@ src_sends_da(FILE *file, const struct gen_device_info *devinfo, enum brw_reg_type type, + enum brw_reg_file _reg_file, unsigned _reg_nr, unsigned _reg_subnr) { int err = 0; - err |= reg(file, BRW_GENERAL_REGISTER_FILE, _reg_nr); + err |= reg(file, _reg_file, _reg_nr); if (err == -1) return 0; if (_reg_subnr) @@ -1406,13 +1444,34 @@ } static int +src_send_desc_ia(FILE *file, + const struct gen_device_info *devinfo, + unsigned _addr_subreg_nr) +{ + string(file, "a0"); + if (_addr_subreg_nr) + format(file, ".%d", _addr_subreg_nr); + format(file, "<0>UD"); + + return 0; +} + +static int src0(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst) { if (is_split_send(devinfo, brw_inst_opcode(devinfo, inst))) { - if (brw_inst_send_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + if (devinfo->gen >= 12) { return src_sends_da(file, devinfo, BRW_REGISTER_TYPE_UD, + brw_inst_send_src0_reg_file(devinfo, inst), + brw_inst_src0_da_reg_nr(devinfo, inst), + 0); + } else if (brw_inst_send_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) { + return src_sends_da(file, + devinfo, + BRW_REGISTER_TYPE_UD, + BRW_GENERAL_REGISTER_FILE, brw_inst_src0_da_reg_nr(devinfo, inst), brw_inst_src0_da16_subreg_nr(devinfo, inst)); } else { @@ -1481,6 +1540,7 @@ return src_sends_da(file, devinfo, BRW_REGISTER_TYPE_UD, + brw_inst_send_src1_reg_file(devinfo, inst), brw_inst_send_src1_reg_nr(devinfo, inst), 0 /* subreg_nr */); } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { @@ -1569,6 +1629,20 @@ return 0; } +static int +swsb(FILE *file, const struct gen_device_info *devinfo, const brw_inst *inst) +{ + const struct tgl_swsb swsb = tgl_swsb_decode(brw_inst_opcode(devinfo, inst), + brw_inst_swsb(devinfo, inst)); + if (swsb.regdist) + format(file, " @%d", swsb.regdist); + if (swsb.mode) + format(file, " $%d%s", swsb.sbid, + (swsb.mode & TGL_SBID_SET ? "" : + swsb.mode & TGL_SBID_DST ? ".dst" : ".src")); + return 0; +} + #ifdef DEBUG static __attribute__((__unused__)) int brw_disassemble_imm(const struct gen_device_info *devinfo, @@ -1609,8 +1683,10 @@ } err |= print_opcode(file, devinfo, opcode); - err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst), - NULL); + + if (!is_send(opcode)) + err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst), + NULL); err |= control(file, "debug control", debug_ctrl, brw_inst_debug_control(devinfo, inst), NULL); @@ -1619,6 +1695,12 @@ string(file, " "); err |= control(file, "function", math_function, brw_inst_math_function(devinfo, inst), NULL); + + } else if (opcode == BRW_OPCODE_SYNC) { + string(file, " "); + err |= control(file, "function", sync_function, + brw_inst_cond_modifier(devinfo, inst), NULL); + } else if (!is_send(opcode)) { err |= control(file, "conditional modifier", conditional_modifier, brw_inst_cond_modifier(devinfo, inst), NULL); @@ -1718,7 +1800,7 @@ pad(file, 64); if (brw_inst_send_sel_reg32_desc(devinfo, inst)) { /* show the indirect descriptor source */ - err |= src_sends_ia(file, devinfo, BRW_REGISTER_TYPE_UD, 0, 0); + err |= src_send_desc_ia(file, devinfo, 0); } else { has_imm_desc = true; imm_desc = brw_inst_send_desc(devinfo, inst); @@ -1728,11 +1810,11 @@ pad(file, 80); if (brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) { /* show the indirect descriptor source */ - err |= src_sends_ia(file, devinfo, BRW_REGISTER_TYPE_UD, 0, - brw_inst_send_ex_desc_ia_subreg_nr(devinfo, inst)); + err |= src_send_desc_ia(file, devinfo, + brw_inst_send_ex_desc_ia_subreg_nr(devinfo, inst)); } else { has_imm_ex_desc = true; - imm_ex_desc = brw_inst_send_ex_desc(devinfo, inst); + imm_ex_desc = brw_inst_sends_ex_desc(devinfo, inst); fprintf(file, "0x%08"PRIx32, imm_ex_desc); } } else { @@ -2022,9 +2104,12 @@ err |= control(file, "mask control", mask_ctrl, brw_inst_mask_control(devinfo, inst), &space); } - err |= control(file, "dependency control", dep_ctrl, - ((brw_inst_no_dd_check(devinfo, inst) << 1) | - brw_inst_no_dd_clear(devinfo, inst)), &space); + + if (devinfo->gen < 12) { + err |= control(file, "dependency control", dep_ctrl, + ((brw_inst_no_dd_check(devinfo, inst) << 1) | + brw_inst_no_dd_clear(devinfo, inst)), &space); + } if (devinfo->gen >= 6) err |= qtr_ctrl(file, devinfo, inst); @@ -2040,9 +2125,14 @@ } } + if (devinfo->gen >= 12) + err |= swsb(file, devinfo, inst); + err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space); err |= control(file, "thread control", thread_ctrl, - brw_inst_thread_control(devinfo, inst), &space); + (devinfo->gen >= 12 ? brw_inst_atomic_control(devinfo, inst) : + brw_inst_thread_control(devinfo, inst)), + &space); if (has_branch_ctrl(devinfo, opcode)) { err |= control(file, "branch ctrl", branch_ctrl, brw_inst_branch_control(devinfo, inst), &space); diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu.c mesa-20.0.8/src/intel/compiler/brw_eu.c --- mesa-19.2.8/src/intel/compiler/brw_eu.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,771 +0,0 @@ -/* - Copyright (C) Intel Corp. 2006. All Rights Reserved. - Intel funded Tungsten Graphics to - develop this 3D driver. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice (including the - next paragraph) shall be included in all copies or substantial - portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - **********************************************************************/ - /* - * Authors: - * Keith Whitwell - */ - -#include -#include - -#include "brw_eu_defines.h" -#include "brw_eu.h" -#include "brw_shader.h" -#include "dev/gen_debug.h" - -#include "util/ralloc.h" - -/* Returns a conditional modifier that negates the condition. */ -enum brw_conditional_mod -brw_negate_cmod(uint32_t cmod) -{ - switch (cmod) { - case BRW_CONDITIONAL_Z: - return BRW_CONDITIONAL_NZ; - case BRW_CONDITIONAL_NZ: - return BRW_CONDITIONAL_Z; - case BRW_CONDITIONAL_G: - return BRW_CONDITIONAL_LE; - case BRW_CONDITIONAL_GE: - return BRW_CONDITIONAL_L; - case BRW_CONDITIONAL_L: - return BRW_CONDITIONAL_GE; - case BRW_CONDITIONAL_LE: - return BRW_CONDITIONAL_G; - default: - return ~0; - } -} - -/* Returns the corresponding conditional mod for swapping src0 and - * src1 in e.g. CMP. - */ -enum brw_conditional_mod -brw_swap_cmod(uint32_t cmod) -{ - switch (cmod) { - case BRW_CONDITIONAL_Z: - case BRW_CONDITIONAL_NZ: - return cmod; - case BRW_CONDITIONAL_G: - return BRW_CONDITIONAL_L; - case BRW_CONDITIONAL_GE: - return BRW_CONDITIONAL_LE; - case BRW_CONDITIONAL_L: - return BRW_CONDITIONAL_G; - case BRW_CONDITIONAL_LE: - return BRW_CONDITIONAL_GE; - default: - return BRW_CONDITIONAL_NONE; - } -} - -/** - * Get the least significant bit offset of the i+1-th component of immediate - * type \p type. For \p i equal to the two's complement of j, return the - * offset of the j-th component starting from the end of the vector. For - * scalar register types return zero. - */ -static unsigned -imm_shift(enum brw_reg_type type, unsigned i) -{ - assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && - "Not implemented."); - - if (type == BRW_REGISTER_TYPE_VF) - return 8 * (i & 3); - else - return 0; -} - -/** - * Swizzle an arbitrary immediate \p x of the given type according to the - * permutation specified as \p swz. - */ -uint32_t -brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) -{ - if (imm_shift(type, 1)) { - const unsigned n = 32 / imm_shift(type, 1); - uint32_t y = 0; - - for (unsigned i = 0; i < n; i++) { - /* Shift the specified component all the way to the right and left to - * discard any undesired L/MSBs, then shift it right into component i. - */ - y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) - << imm_shift(type, ~0u) - >> imm_shift(type, ~0u - i); - } - - return y; - } else { - return x; - } -} - -unsigned -brw_get_default_exec_size(struct brw_codegen *p) -{ - return p->current->exec_size; -} - -unsigned -brw_get_default_group(struct brw_codegen *p) -{ - return p->current->group; -} - -unsigned -brw_get_default_access_mode(struct brw_codegen *p) -{ - return p->current->access_mode; -} - -void -brw_set_default_exec_size(struct brw_codegen *p, unsigned value) -{ - p->current->exec_size = value; -} - -void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc ) -{ - p->current->predicate = pc; -} - -void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse) -{ - p->current->pred_inv = predicate_inverse; -} - -void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg) -{ - assert(subreg < 2); - p->current->flag_subreg = reg * 2 + subreg; -} - -void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ) -{ - p->current->access_mode = access_mode; -} - -void -brw_set_default_compression_control(struct brw_codegen *p, - enum brw_compression compression_control) -{ - switch (compression_control) { - case BRW_COMPRESSION_NONE: - /* This is the "use the first set of bits of dmask/vmask/arf - * according to execsize" option. - */ - p->current->group = 0; - break; - case BRW_COMPRESSION_2NDHALF: - /* For SIMD8, this is "use the second set of 8 bits." */ - p->current->group = 8; - break; - case BRW_COMPRESSION_COMPRESSED: - /* For SIMD16 instruction compression, use the first set of 16 bits - * since we don't do SIMD32 dispatch. - */ - p->current->group = 0; - break; - default: - unreachable("not reached"); - } - - if (p->devinfo->gen <= 6) { - p->current->compressed = - (compression_control == BRW_COMPRESSION_COMPRESSED); - } -} - -/** - * Enable or disable instruction compression on the given instruction leaving - * the currently selected channel enable group untouched. - */ -void -brw_inst_set_compression(const struct gen_device_info *devinfo, - brw_inst *inst, bool on) -{ - if (devinfo->gen >= 6) { - /* No-op, the EU will figure out for us whether the instruction needs to - * be compressed. - */ - } else { - /* The channel group and compression controls are non-orthogonal, there - * are two possible representations for uncompressed instructions and we - * may need to preserve the current one to avoid changing the selected - * channel group inadvertently. - */ - if (on) - brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED); - else if (brw_inst_qtr_control(devinfo, inst) - == BRW_COMPRESSION_COMPRESSED) - brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); - } -} - -void -brw_set_default_compression(struct brw_codegen *p, bool on) -{ - p->current->compressed = on; -} - -/** - * Apply the range of channel enable signals given by - * [group, group + exec_size) to the instruction passed as argument. - */ -void -brw_inst_set_group(const struct gen_device_info *devinfo, - brw_inst *inst, unsigned group) -{ - if (devinfo->gen >= 7) { - assert(group % 4 == 0 && group < 32); - brw_inst_set_qtr_control(devinfo, inst, group / 8); - brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2); - - } else if (devinfo->gen == 6) { - assert(group % 8 == 0 && group < 32); - brw_inst_set_qtr_control(devinfo, inst, group / 8); - - } else { - assert(group % 8 == 0 && group < 16); - /* The channel group and compression controls are non-orthogonal, there - * are two possible representations for group zero and we may need to - * preserve the current one to avoid changing the selected compression - * enable inadvertently. - */ - if (group == 8) - brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF); - else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF) - brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); - } -} - -void -brw_set_default_group(struct brw_codegen *p, unsigned group) -{ - p->current->group = group; -} - -void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ) -{ - p->current->mask_control = value; -} - -void brw_set_default_saturate( struct brw_codegen *p, bool enable ) -{ - p->current->saturate = enable; -} - -void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value) -{ - p->current->acc_wr_control = value; -} - -void brw_push_insn_state( struct brw_codegen *p ) -{ - assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); - *(p->current + 1) = *p->current; - p->current++; -} - -void brw_pop_insn_state( struct brw_codegen *p ) -{ - assert(p->current != p->stack); - p->current--; -} - - -/*********************************************************************** - */ -void -brw_init_codegen(const struct gen_device_info *devinfo, - struct brw_codegen *p, void *mem_ctx) -{ - memset(p, 0, sizeof(*p)); - - p->devinfo = devinfo; - p->automatic_exec_sizes = true; - /* - * Set the initial instruction store array size to 1024, if found that - * isn't enough, then it will double the store size at brw_next_insn() - * until out of memory. - */ - p->store_size = 1024; - p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size); - p->nr_insn = 0; - p->current = p->stack; - memset(p->current, 0, sizeof(p->current[0])); - - p->mem_ctx = mem_ctx; - - /* Some defaults? - */ - brw_set_default_exec_size(p, BRW_EXECUTE_8); - brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ - brw_set_default_saturate(p, 0); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - - /* Set up control flow stack */ - p->if_stack_depth = 0; - p->if_stack_array_size = 16; - p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size); - - p->loop_stack_depth = 0; - p->loop_stack_array_size = 16; - p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); - p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); -} - - -const unsigned *brw_get_program( struct brw_codegen *p, - unsigned *sz ) -{ - *sz = p->next_insn_offset; - return (const unsigned *)p->store; -} - -bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, - const char *identifier) -{ - const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH"); - if (!read_path) { - return false; - } - - char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier); - - int fd = open(name, O_RDONLY); - ralloc_free(name); - - if (fd == -1) { - return false; - } - - struct stat sb; - if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { - close(fd); - return false; - } - - p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst); - p->nr_insn += sb.st_size / sizeof(brw_inst); - - p->next_insn_offset = start_offset + sb.st_size; - p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst); - p->store = reralloc_size(p->mem_ctx, p->store, p->next_insn_offset); - assert(p->store); - - read(fd, p->store + start_offset, sb.st_size); - close(fd); - - bool valid = brw_validate_instructions(p->devinfo, p->store, - start_offset, p->next_insn_offset, - 0); - assert(valid); - - return true; -} - -void -brw_disassemble(const struct gen_device_info *devinfo, - const void *assembly, int start, int end, FILE *out) -{ - bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0; - - for (int offset = start; offset < end;) { - const brw_inst *insn = assembly + offset; - brw_inst uncompacted; - bool compacted = brw_inst_cmpt_control(devinfo, insn); - if (0) - fprintf(out, "0x%08x: ", offset); - - if (compacted) { - brw_compact_inst *compacted = (void *)insn; - if (dump_hex) { - unsigned char * insn_ptr = ((unsigned char *)&insn[0]); - const unsigned int blank_spaces = 24; - for (int i = 0 ; i < 8; i = i + 4) { - fprintf(out, "%02x %02x %02x %02x ", - insn_ptr[i], - insn_ptr[i + 1], - insn_ptr[i + 2], - insn_ptr[i + 3]); - } - /* Make compacted instructions hex value output vertically aligned - * with uncompacted instructions hex value - */ - fprintf(out, "%*c", blank_spaces, ' '); - } - - brw_uncompact_instruction(devinfo, &uncompacted, compacted); - insn = &uncompacted; - offset += 8; - } else { - if (dump_hex) { - unsigned char * insn_ptr = ((unsigned char *)&insn[0]); - for (int i = 0 ; i < 16; i = i + 4) { - fprintf(out, "%02x %02x %02x %02x ", - insn_ptr[i], - insn_ptr[i + 1], - insn_ptr[i + 2], - insn_ptr[i + 3]); - } - } - offset += 16; - } - - brw_disassemble_inst(out, devinfo, insn, compacted); - } -} - -enum gen { - GEN4 = (1 << 0), - GEN45 = (1 << 1), - GEN5 = (1 << 2), - GEN6 = (1 << 3), - GEN7 = (1 << 4), - GEN75 = (1 << 5), - GEN8 = (1 << 6), - GEN9 = (1 << 7), - GEN10 = (1 << 8), - GEN11 = (1 << 9), - GEN_ALL = ~0 -}; - -#define GEN_LT(gen) ((gen) - 1) -#define GEN_GE(gen) (~GEN_LT(gen)) -#define GEN_LE(gen) (GEN_LT(gen) | (gen)) - -static const struct opcode_desc opcode_10_descs[] = { - { .name = "dim", .nsrc = 1, .ndst = 1, .gens = GEN75 }, - { .name = "smov", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) }, -}; - -static const struct opcode_desc opcode_35_descs[] = { - { .name = "iff", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, - { .name = "brc", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7) }, -}; - -static const struct opcode_desc opcode_38_descs[] = { - { .name = "do", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, - { .name = "case", .nsrc = 0, .ndst = 0, .gens = GEN6 }, -}; - -static const struct opcode_desc opcode_44_descs[] = { - { .name = "msave", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, - { .name = "call", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) }, -}; - -static const struct opcode_desc opcode_45_descs[] = { - { .name = "mrest", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, - { .name = "ret", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) }, -}; - -static const struct opcode_desc opcode_46_descs[] = { - { .name = "push", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) }, - { .name = "fork", .nsrc = 0, .ndst = 0, .gens = GEN6 }, - { .name = "goto", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) }, -}; - -static const struct opcode_desc opcode_descs[128] = { - [BRW_OPCODE_ILLEGAL] = { - .name = "illegal", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_MOV] = { - .name = "mov", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SEL] = { - .name = "sel", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_MOVI] = { - .name = "movi", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45), - }, - [BRW_OPCODE_NOT] = { - .name = "not", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_AND] = { - .name = "and", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_OR] = { - .name = "or", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_XOR] = { - .name = "xor", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SHR] = { - .name = "shr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SHL] = { - .name = "shl", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [10] = { - .table = opcode_10_descs, .size = ARRAY_SIZE(opcode_10_descs), - }, - /* Reserved - 11 */ - [BRW_OPCODE_ASR] = { - .name = "asr", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - /* Reserved - 13 */ - [BRW_OPCODE_ROR] = { - .name = "ror", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN11), - }, - [BRW_OPCODE_ROL] = { - .name = "rol", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN11), - }, - [BRW_OPCODE_CMP] = { - .name = "cmp", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_CMPN] = { - .name = "cmpn", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_CSEL] = { - .name = "csel", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8), - }, - [BRW_OPCODE_F32TO16] = { - .name = "f32to16", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75, - }, - [BRW_OPCODE_F16TO32] = { - .name = "f16to32", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75, - }, - /* Reserved - 21-22 */ - [BRW_OPCODE_BFREV] = { - .name = "bfrev", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_BFE] = { - .name = "bfe", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_BFI1] = { - .name = "bfi1", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_BFI2] = { - .name = "bfi2", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7), - }, - /* Reserved - 27-31 */ - [BRW_OPCODE_JMPI] = { - .name = "jmpi", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [33] = { - .name = "brd", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_IF] = { - .name = "if", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [35] = { - .table = opcode_35_descs, .size = ARRAY_SIZE(opcode_35_descs), - }, - [BRW_OPCODE_ELSE] = { - .name = "else", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_ENDIF] = { - .name = "endif", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [38] = { - .table = opcode_38_descs, .size = ARRAY_SIZE(opcode_38_descs), - }, - [BRW_OPCODE_WHILE] = { - .name = "while", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_BREAK] = { - .name = "break", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_CONTINUE] = { - .name = "cont", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_HALT] = { - .name = "halt", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, - [43] = { - .name = "calla", .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN75), - }, - [44] = { - .table = opcode_44_descs, .size = ARRAY_SIZE(opcode_44_descs), - }, - [45] = { - .table = opcode_45_descs, .size = ARRAY_SIZE(opcode_45_descs), - }, - [46] = { - .table = opcode_46_descs, .size = ARRAY_SIZE(opcode_46_descs), - }, - [47] = { - .name = "pop", .nsrc = 2, .ndst = 0, .gens = GEN_LE(GEN5), - }, - [BRW_OPCODE_WAIT] = { - .name = "wait", .nsrc = 1, .ndst = 0, .gens = GEN_ALL, - }, - [BRW_OPCODE_SEND] = { - .name = "send", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SENDC] = { - .name = "sendc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SENDS] = { - .name = "sends", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9), - }, - [BRW_OPCODE_SENDSC] = { - .name = "sendsc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9), - }, - /* Reserved 53-55 */ - [BRW_OPCODE_MATH] = { - .name = "math", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN6), - }, - /* Reserved 57-63 */ - [BRW_OPCODE_ADD] = { - .name = "add", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_MUL] = { - .name = "mul", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_AVG] = { - .name = "avg", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_FRC] = { - .name = "frc", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_RNDU] = { - .name = "rndu", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_RNDD] = { - .name = "rndd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_RNDE] = { - .name = "rnde", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_RNDZ] = { - .name = "rndz", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_MAC] = { - .name = "mac", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_MACH] = { - .name = "mach", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_LZD] = { - .name = "lzd", .nsrc = 1, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_FBH] = { - .name = "fbh", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_FBL] = { - .name = "fbl", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_CBIT] = { - .name = "cbit", .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_ADDC] = { - .name = "addc", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_SUBB] = { - .name = "subb", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7), - }, - [BRW_OPCODE_SAD2] = { - .name = "sad2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_SADA2] = { - .name = "sada2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - /* Reserved 82-83 */ - [BRW_OPCODE_DP4] = { - .name = "dp4", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_DPH] = { - .name = "dph", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_DP3] = { - .name = "dp3", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - [BRW_OPCODE_DP2] = { - .name = "dp2", .nsrc = 2, .ndst = 1, .gens = GEN_ALL, - }, - /* Reserved 88 */ - [BRW_OPCODE_LINE] = { - .name = "line", .nsrc = 2, .ndst = 1, .gens = GEN_LE(GEN10), - }, - [BRW_OPCODE_PLN] = { - .name = "pln", .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45) & GEN_LE(GEN10), - }, - [BRW_OPCODE_MAD] = { - .name = "mad", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6), - }, - [BRW_OPCODE_LRP] = { - .name = "lrp", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6) & GEN_LE(GEN10), - }, - [93] = { - .name = "madm", .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8), - }, - /* Reserved 94-124 */ - [BRW_OPCODE_NENOP] = { - .name = "nenop", .nsrc = 0, .ndst = 0, .gens = GEN45, - }, - [BRW_OPCODE_NOP] = { - .name = "nop", .nsrc = 0, .ndst = 0, .gens = GEN_ALL, - }, -}; - -static enum gen -gen_from_devinfo(const struct gen_device_info *devinfo) -{ - switch (devinfo->gen) { - case 4: return devinfo->is_g4x ? GEN45 : GEN4; - case 5: return GEN5; - case 6: return GEN6; - case 7: return devinfo->is_haswell ? GEN75 : GEN7; - case 8: return GEN8; - case 9: return GEN9; - case 10: return GEN10; - case 11: return GEN11; - default: - unreachable("not reached"); - } -} - -/* Return the matching opcode_desc for the specified opcode number and - * hardware generation, or NULL if the opcode is not supported by the device. - */ -const struct opcode_desc * -brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode) -{ - if (opcode >= ARRAY_SIZE(opcode_descs)) - return NULL; - - enum gen gen = gen_from_devinfo(devinfo); - if (opcode_descs[opcode].gens != 0) { - if ((opcode_descs[opcode].gens & gen) != 0) { - return &opcode_descs[opcode]; - } - } else if (opcode_descs[opcode].table != NULL) { - const struct opcode_desc *table = opcode_descs[opcode].table; - for (unsigned i = 0; i < opcode_descs[opcode].size; i++) { - if ((table[i].gens & gen) != 0) { - return &table[i]; - } - } - } - return NULL; -} diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu_compact.c mesa-20.0.8/src/intel/compiler/brw_eu_compact.c --- mesa-19.2.8/src/intel/compiler/brw_eu_compact.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu_compact.c 2020-06-12 01:21:17.000000000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2012 Intel Corporation + * Copyright © 2012-2018 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -70,6 +70,12 @@ * while JIP (must be negative) * * Gen 8 adds support for compacting 3-src instructions. + * + * Gen12 reduces the number of bits that available to compacted immediates from + * 13 to 12, but improves the compaction of floating-point immediates by + * allowing the high bits to be encoded (the sign, 8-bit exponent, and the + * three most significant bits of the mantissa), rather than the lowest bits of + * the mantissa. */ #include "brw_eu.h" @@ -672,6 +678,149 @@ 0b001001011001001001000, }; +static const uint32_t gen12_control_index_table[32] = { + 0b000000000000000000100, /* (16|M0) */ + 0b000000000000000000011, /* (8|M0) */ + 0b000000010000000000000, /* (W) (1|M0) */ + 0b000000010000000000100, /* (W) (16|M0) */ + 0b000000010000000000011, /* (W) (8|M0) */ + 0b010000000000000000100, /* (16|M0) (ge)f0.0 */ + 0b000000000000000100100, /* (16|M16) */ + 0b010100000000000000100, /* (16|M0) (lt)f0.0 */ + 0b000000000000000000000, /* (1|M0) */ + 0b000010000000000000100, /* (16|M0) (sat) */ + 0b000000000000000010011, /* (8|M8) */ + 0b001100000000000000100, /* (16|M0) (gt)f0.0 */ + 0b000100000000000000100, /* (16|M0) (eq)f0.0 */ + 0b000100010000000000100, /* (W) (16|M0) (eq)f0.0 */ + 0b001000000000000000100, /* (16|M0) (ne)f0.0 */ + 0b000000000000100000100, /* (f0.0) (16|M0) */ + 0b010100000000000000011, /* (8|M0) (lt)f0.0 */ + 0b000000000000110000100, /* (f1.0) (16|M0) */ + 0b000000010000000000001, /* (W) (2|M0) */ + 0b000000000000101000100, /* (f0.1) (16|M0) */ + 0b000000000000111000100, /* (f1.1) (16|M0) */ + 0b010000010000000000100, /* (W) (16|M0) (ge)f0.0 */ + 0b000000000000000100011, /* (8|M16) */ + 0b000000000000000110011, /* (8|M24) */ + 0b010100010000000000100, /* (W) (16|M0) (lt)f0.0 */ + 0b010000000000000000011, /* (8|M0) (ge)f0.0 */ + 0b000100010000000000000, /* (W) (1|M0) (eq)f0.0 */ + 0b000010000000000000011, /* (8|M0) (sat) */ + 0b010100000000010000100, /* (16|M0) (lt)f1.0 */ + 0b000100000000000000011, /* (8|M0) (eq)f0.0 */ + 0b000001000000000000011, /* (8|M0) {AccWrEn} */ + 0b000000010000000100100, /* (W) (16|M16) */ +}; + +static const uint32_t gen12_datatype_table[32] = { + 0b11010110100101010100, /* grf<1>:f grf:f grf:f */ + 0b00000110100101010100, /* grf<1>:f grf:f arf:ub */ + 0b00000010101101010100, /* grf<1>:f imm:f arf:ub */ + 0b01010110110101010100, /* grf<1>:f grf:f imm:f */ + 0b11010100100101010100, /* arf<1>:f grf:f grf:f */ + 0b11010010100101010100, /* grf<1>:f arf:f grf:f */ + 0b01010100110101010100, /* arf<1>:f grf:f imm:f */ + 0b00000000100000000000, /* arf<1>:ub arf:ub arf:ub */ + 0b11010000100101010100, /* arf<1>:f arf:f grf:f */ + 0b00101110110011001100, /* grf<1>:d grf:d imm:w */ + 0b10110110100011001100, /* grf<1>:d grf:d grf:d */ + 0b01010010110101010100, /* grf<1>:f arf:f imm:f */ + 0b10010110100001000100, /* grf<1>:ud grf:ud grf:ud */ + 0b01010000110101010100, /* arf<1>:f arf:f imm:f */ + 0b00110110110011001100, /* grf<1>:d grf:d imm:d */ + 0b00010110110001000100, /* grf<1>:ud grf:ud imm:ud */ + 0b00000111000101010100, /* grf<2>:f grf:f arf:ub */ + 0b00101100110011001100, /* arf<1>:d grf:d imm:w */ + 0b00000000100000100010, /* arf<1>:uw arf:uw arf:ub */ + 0b00000010100001000100, /* grf<1>:ud arf:ud arf:ub */ + 0b00100110110000101010, /* grf<1>:w grf:uw imm:uv */ + 0b00001110110000100010, /* grf<1>:uw grf:uw imm:uw */ + 0b10010111000001000100, /* grf<2>:ud grf:ud grf:ud */ + 0b00000110100101001100, /* grf<1>:d grf:f arf:ub */ + 0b10001100100011001100, /* arf<1>:d grf:d grf:uw */ + 0b00000110100001010100, /* grf<1>:f grf:ud arf:ub */ + 0b00101110110001001100, /* grf<1>:d grf:ud imm:w */ + 0b00000010100000100010, /* grf<1>:uw arf:uw arf:ub */ + 0b00000110100000110100, /* grf<1>:f grf:uw arf:ub */ + 0b00000110100000010100, /* grf<1>:f grf:ub arf:ub */ + 0b00000110100011010100, /* grf<1>:f grf:d arf:ub */ + 0b00000010100101010100, /* grf<1>:f arf:f arf:ub */ +}; + +static const uint16_t gen12_subreg_table[32] = { + 0b000000000000000, /* .0 .0 .0 */ + 0b100000000000000, /* .0 .0 .16 */ + 0b001000000000000, /* .0 .0 .4 */ + 0b011000000000000, /* .0 .0 .12 */ + 0b000000010000000, /* .0 .4 .0 */ + 0b010000000000000, /* .0 .0 .8 */ + 0b101000000000000, /* .0 .0 .20 */ + 0b000000000001000, /* .8 .0 .0 */ + 0b000000100000000, /* .0 .8 .0 */ + 0b110000000000000, /* .0 .0 .24 */ + 0b111000000000000, /* .0 .0 .28 */ + 0b000001000000000, /* .0 .16 .0 */ + 0b000000000000100, /* .4 .0 .0 */ + 0b000001100000000, /* .0 .24 .0 */ + 0b000001010000000, /* .0 .20 .0 */ + 0b000000110000000, /* .0 .12 .0 */ + 0b000001110000000, /* .0 .28 .0 */ + 0b000000000011100, /* .28 .0 .0 */ + 0b000000000010000, /* .16 .0 .0 */ + 0b000000000001100, /* .12 .0 .0 */ + 0b000000000011000, /* .24 .0 .0 */ + 0b000000000010100, /* .20 .0 .0 */ + 0b000000000000010, /* .2 .0 .0 */ + 0b000000101000000, /* .0 .10 .0 */ + 0b000000001000000, /* .0 .2 .0 */ + 0b000000010000100, /* .4 .4 .0 */ + 0b000000001011100, /* .28 .2 .0 */ + 0b000000001000010, /* .2 .2 .0 */ + 0b000000110001100, /* .12 .12 .0 */ + 0b000000000100000, /* .0 .1 .0 */ + 0b000000001100000, /* .0 .3 .0 */ + 0b110001100000000, /* .0 .24 .24 */ +}; + +static const uint16_t gen12_src0_index_table[16] = { + 0b010001100100, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b010001100110, /* -r<8;8,1> */ + 0b010001100101, /* (abs)r<8;8,1> */ + 0b000000000010, /* -r<0;1,0> */ + 0b001000000000, /* r<2;1,0> */ + 0b001001000000, /* r<2;4,0> */ + 0b001101000000, /* r<4;4,0> */ + 0b001000100100, /* r<2;2,1> */ + 0b001100000000, /* r<4;1,0> */ + 0b001000100110, /* -r<2;2,1> */ + 0b001101000100, /* r<4;4,1> */ + 0b010001100111, /* -(abs)r<8;8,1> */ + 0b000100000000, /* r<1;1,0> */ + 0b000000000001, /* (abs)r<0;1,0> */ + 0b111100010000, /* r[a]<1,0> */ +}; + +static const uint16_t gen12_src1_index_table[16] = { + 0b000100011001, /* r<8;8,1> */ + 0b000000000000, /* r<0;1,0> */ + 0b100100011001, /* -r<8;8,1> */ + 0b100000000000, /* -r<0;1,0> */ + 0b010100011001, /* (abs)r<8;8,1> */ + 0b100011010000, /* -r<4;4,0> */ + 0b000010000000, /* r<2;1,0> */ + 0b000010001001, /* r<2;2,1> */ + 0b100010001001, /* -r<2;2,1> */ + 0b000011010000, /* r<4;4,0> */ + 0b000011010001, /* r<4;4,1> */ + 0b000011000000, /* r<4;1,0> */ + 0b110100011001, /* -(abs)r<8;8,1> */ + 0b010000000000, /* (abs)r<0;1,0> */ + 0b110000000000, /* -(abs)r<0;1,0> */ + 0b100011010001, /* -r<4;4,1> */ +}; + /* This is actually the control index table for Cherryview (26 bits), but the * only difference from Broadwell (24 bits) is that it has two extra 0-bits at * the start. @@ -699,29 +848,150 @@ 0b0000001110010011100100111001000001111000000100000, }; +static const uint64_t gen12_3src_control_index_table[32] = { + 0b000001001010010101000000000000000100, /* (16|M0) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000000011, /* (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000011, /* (8|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000011, /* (W) (8|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000011, /* (W) (8|M0) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000010011, /* (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000010011, /* (8|M8) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000010011, /* (W) (8|M8) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000010011, /* (W) (8|M8) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000100, /* (W) (16|M0) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000000100, /* (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000000100, /* (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000000000000100100, /* (16|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000010000000000100, /* (W) (16|M0) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000000000, /* (W) (1|M0) grf<1>:f :f :f :f */ + 0b000001001010010101010000000000000011, /* (8|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001000010101000010000000110011, /* (W) (8|M24) arf<1>:f :f :f :f */ + 0b000001001000010101000010000000100011, /* (W) (8|M16) arf<1>:f :f :f :f */ + 0b000001001010010101000010000000110011, /* (W) (8|M24) grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100011, /* (W) (8|M16) grf<1>:f :f :f :f */ + 0b000001001000010101000000000000100011, /* (8|M16) arf<1>:f :f :f :f */ + 0b000001001000010101000000000000110011, /* (8|M24) arf<1>:f :f :f :f */ + 0b000001001010010101000000000000100011, /* (8|M16) grf<1>:f :f :f :f */ + 0b000001001010010101000000000000110011, /* (8|M24) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000100, /* (16|M0) (sat)arf<1>:f :f :f :f */ + 0b000001001010010101010010000000000100, /* (W) (16|M0) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000100100, /* (W) (16|M16) grf<1>:f :f :f :f */ + 0b000001001010010001000010000000000000, /* (W) (1|M0) grf<1>:ud :ud :ud :ud */ + 0b000001001000010101000000000000100100, /* (16|M16) arf<1>:f :f :f :f */ + 0b000001001010010101010000000000100100, /* (16|M16) (sat)grf<1>:f :f :f :f */ + 0b000001001010010101000010000000000010, /* (W) (4|M0) grf<1>:f :f :f :f */ + 0b000001001000010101010000000000000011, /* (8|M0) (sat)arf<1>:f :f :f :f */ +}; + +static const uint32_t gen12_3src_source_index_table[32] = { + 0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */ + 0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */ + 0b101101100001101000011, /* grf<8;1> grf<8;1> grf<1> */ + 0b100101100001101000011, /* grf<8;1> grf<8;1> grf<0> */ + 0b101100000000101000011, /* grf<8;1> grf<0;0> grf<1> */ + 0b101101100001101001011, /* -grf<8;1> grf<8;1> grf<1> */ + 0b101001100001101000011, /* grf<8;1> arf<8;1> grf<1> */ + 0b100001100001100000000, /* grf<0;0> arf<8;1> grf<0> */ + 0b101101100001100000000, /* grf<0;0> grf<8;1> grf<1> */ + 0b101101100101101000011, /* grf<8;1> grf<8;1> -grf<1> */ + 0b101101110001101000011, /* grf<8;1> -grf<8;1> grf<1> */ + 0b101100000000100000000, /* grf<0;0> grf<0;0> grf<1> */ + 0b100001100001101000011, /* grf<8;1> arf<8;1> grf<0> */ + 0b100101110001100000000, /* grf<0;0> -grf<8;1> grf<0> */ + 0b100101110001101000011, /* grf<8;1> -grf<8;1> grf<0> */ + 0b100101100001101001011, /* -grf<8;1> grf<8;1> grf<0> */ + 0b100100000000101000011, /* grf<8;1> grf<0;0> grf<0> */ + 0b100101100001100001000, /* -grf<0;0> grf<8;1> grf<0> */ + 0b100100000000100000000, /* grf<0;0> grf<0;0> grf<0> */ + 0b101101110001100000000, /* grf<0;0> -grf<8;1> grf<1> */ + 0b100101100101100000000, /* grf<0;0> grf<8;1> -grf<0> */ + 0b101001100001100000000, /* grf<0;0> arf<8;1> grf<1> */ + 0b100101100101101000011, /* grf<8;1> grf<8;1> -grf<0> */ + 0b101101100101101001011, /* -grf<8;1> grf<8;1> -grf<1> */ + 0b101001100001101001011, /* -grf<8;1> arf<8;1> grf<1> */ + 0b101101110001101001011, /* -grf<8;1> -grf<8;1> grf<1> */ + 0b101100010000101000011, /* grf<8;1> -grf<0;0> grf<1> */ + 0b101100000100101000011, /* grf<8;1> grf<0;0> -grf<1> */ + 0b101101100001100001000, /* -grf<0;0> grf<8;1> grf<1> */ + 0b101101100101100000000, /* grf<0;0> grf<8;1> -grf<1> */ + 0b100100000100101000011, /* grf<8;1> grf<0;0> -grf<0> */ + 0b101001100101101000011, /* grf<8;1> arf<8;1> -grf<1> */ +}; + +static const uint32_t gen12_3src_subreg_table[32] = { + 0b00000000000000000000, /* .0 .0 .0 .0 */ + 0b00100000000000000000, /* .0 .0 .0 .4 */ + 0b00000000000110000000, /* .0 .12 .0 .0 */ + 0b10100000000000000000, /* .0 .0 .0 .20 */ + 0b10000000001110000000, /* .0 .28 .0 .16 */ + 0b01100000000000000000, /* .0 .0 .0 .12 */ + 0b01000000000000000000, /* .0 .0 .0 .8 */ + 0b00000010000000000000, /* .0 .0 .8 .0 */ + 0b00000001000000000000, /* .0 .0 .4 .0 */ + 0b11000000000000000000, /* .0 .0 .0 .24 */ + 0b10000000000000000000, /* .0 .0 .0 .16 */ + 0b11100000000000000000, /* .0 .0 .0 .28 */ + 0b00000110000000000000, /* .0 .0 .24 .0 */ + 0b00000000000010000000, /* .0 .4 .0 .0 */ + 0b00000100000000000000, /* .0 .0 .16 .0 */ + 0b00000011000000000000, /* .0 .0 .12 .0 */ + 0b00000101000000000000, /* .0 .0 .20 .0 */ + 0b00000111000000000000, /* .0 .0 .28 .0 */ + 0b00000000000100000000, /* .0 .8 .0 .0 */ + 0b00000000001000000000, /* .0 .16 .0 .0 */ + 0b00000000001100000000, /* .0 .24 .0 .0 */ + 0b00000000001010000000, /* .0 .20 .0 .0 */ + 0b00000000001110000000, /* .0 .28 .0 .0 */ + 0b11000000001110000000, /* .0 .28 .0 .24 */ + 0b00100000000100000000, /* .0 .8 .0 .4 */ + 0b00100000000110000000, /* .0 .12 .0 .4 */ + 0b01000000000110000000, /* .0 .12 .0 .8 */ + 0b10000000001100000000, /* .0 .24 .0 .16 */ + 0b10000000001010000000, /* .0 .20 .0 .16 */ + 0b01100000000010000000, /* .0 .4 .0 .12 */ + 0b10100000001110000000, /* .0 .28 .0 .20 */ + 0b01000000000010000000, /* .0 .4 .0 .8 */ +}; + static const uint32_t *control_index_table; static const uint32_t *datatype_table; static const uint16_t *subreg_table; -static const uint16_t *src_index_table; +static const uint16_t *src0_index_table; +static const uint16_t *src1_index_table; static bool set_control_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, const brw_inst *src) { - uint32_t uncompacted = devinfo->gen >= 8 /* 17b/G45; 19b/IVB+ */ - ? (brw_inst_bits(src, 33, 31) << 16) | /* 3b */ - (brw_inst_bits(src, 23, 12) << 4) | /* 12b */ - (brw_inst_bits(src, 10, 9) << 2) | /* 2b */ - (brw_inst_bits(src, 34, 34) << 1) | /* 1b */ - (brw_inst_bits(src, 8, 8)) /* 1b */ - : (brw_inst_bits(src, 31, 31) << 16) | /* 1b */ - (brw_inst_bits(src, 23, 8)); /* 16b */ + uint32_t uncompacted; /* 17b/G45; 19b/IVB+; 21b/TGL+ */ - /* On gen7, the flag register and subregister numbers are integrated into - * the control index. - */ - if (devinfo->gen == 7) - uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */ + if (devinfo->gen >= 12) { + uncompacted = (brw_inst_bits(src, 95, 92) << 17) | /* 4b */ + (brw_inst_bits(src, 34, 34) << 16) | /* 1b */ + (brw_inst_bits(src, 33, 33) << 15) | /* 1b */ + (brw_inst_bits(src, 32, 32) << 14) | /* 1b */ + (brw_inst_bits(src, 31, 31) << 13) | /* 1b */ + (brw_inst_bits(src, 28, 28) << 12) | /* 1b */ + (brw_inst_bits(src, 27, 24) << 8) | /* 4b */ + (brw_inst_bits(src, 23, 22) << 6) | /* 2b */ + (brw_inst_bits(src, 21, 19) << 3) | /* 3b */ + (brw_inst_bits(src, 18, 16)); /* 3b */ + } else if (devinfo->gen >= 8) { + uncompacted = (brw_inst_bits(src, 33, 31) << 16) | /* 3b */ + (brw_inst_bits(src, 23, 12) << 4) | /* 12b */ + (brw_inst_bits(src, 10, 9) << 2) | /* 2b */ + (brw_inst_bits(src, 34, 34) << 1) | /* 1b */ + (brw_inst_bits(src, 8, 8)); /* 1b */ + } else { + uncompacted = (brw_inst_bits(src, 31, 31) << 16) | /* 1b */ + (brw_inst_bits(src, 23, 8)); /* 16b */ + + /* On gen7, the flag register and subregister numbers are integrated into + * the control index. + */ + if (devinfo->gen == 7) + uncompacted |= brw_inst_bits(src, 90, 89) << 17; /* 2b */ + } for (int i = 0; i < 32; i++) { if (control_index_table[i] == uncompacted) { @@ -735,14 +1005,35 @@ static bool set_datatype_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, - const brw_inst *src) + const brw_inst *src, bool is_immediate) { - uint32_t uncompacted = devinfo->gen >= 8 /* 18b/G45+; 21b/BDW+ */ - ? (brw_inst_bits(src, 63, 61) << 18) | /* 3b */ - (brw_inst_bits(src, 94, 89) << 12) | /* 6b */ - (brw_inst_bits(src, 46, 35)) /* 12b */ - : (brw_inst_bits(src, 63, 61) << 15) | /* 3b */ - (brw_inst_bits(src, 46, 32)); /* 15b */ + uint32_t uncompacted; /* 18b/G45+; 21b/BDW+; 20b/TGL+ */ + + if (devinfo->gen >= 12) { + uncompacted = (brw_inst_bits(src, 91, 88) << 15) | /* 4b */ + (brw_inst_bits(src, 66, 66) << 14) | /* 1b */ + (brw_inst_bits(src, 50, 50) << 13) | /* 1b */ + (brw_inst_bits(src, 49, 48) << 11) | /* 2b */ + (brw_inst_bits(src, 47, 47) << 10) | /* 1b */ + (brw_inst_bits(src, 46, 46) << 9) | /* 1b */ + (brw_inst_bits(src, 43, 40) << 5) | /* 4b */ + (brw_inst_bits(src, 39, 36) << 1) | /* 4b */ + (brw_inst_bits(src, 35, 35)); /* 1b */ + + /* Src1.RegFile overlaps with the immediate, so ignore it if an immediate + * is present + */ + if (!is_immediate) { + uncompacted |= brw_inst_bits(src, 98, 98) << 19; /* 1b */ + } + } else if (devinfo->gen >= 8) { + uncompacted = (brw_inst_bits(src, 63, 61) << 18) | /* 3b */ + (brw_inst_bits(src, 94, 89) << 12) | /* 6b */ + (brw_inst_bits(src, 46, 35)); /* 12b */ + } else { + uncompacted = (brw_inst_bits(src, 63, 61) << 15) | /* 3b */ + (brw_inst_bits(src, 46, 32)); /* 15b */ + } for (int i = 0; i < 32; i++) { if (datatype_table[i] == uncompacted) { @@ -758,12 +1049,21 @@ set_subreg_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, const brw_inst *src, bool is_immediate) { - uint16_t uncompacted = /* 15b */ - (brw_inst_bits(src, 52, 48) << 0) | /* 5b */ - (brw_inst_bits(src, 68, 64) << 5); /* 5b */ + uint16_t uncompacted; /* 15b */ + + if (devinfo->gen >= 12) { + uncompacted = (brw_inst_bits(src, 55, 51) << 0) | /* 5b */ + (brw_inst_bits(src, 71, 67) << 5); /* 5b */ - if (!is_immediate) - uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */ + if (!is_immediate) + uncompacted |= brw_inst_bits(src, 103, 99) << 10; /* 5b */ + } else { + uncompacted = (brw_inst_bits(src, 52, 48) << 0) | /* 5b */ + (brw_inst_bits(src, 68, 64) << 5); /* 5b */ + + if (!is_immediate) + uncompacted |= brw_inst_bits(src, 100, 96) << 10; /* 5b */ + } for (int i = 0; i < 32; i++) { if (subreg_table[i] == uncompacted) { @@ -776,12 +1076,27 @@ } static bool -get_src_index(uint16_t uncompacted, - uint16_t *compacted) +set_src0_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, const brw_inst *src) { - for (int i = 0; i < 32; i++) { - if (src_index_table[i] == uncompacted) { - *compacted = i; + uint16_t uncompacted; /* 12b */ + int table_len; + + if (devinfo->gen >= 12) { + table_len = ARRAY_SIZE(gen12_src0_index_table); + uncompacted = (brw_inst_bits(src, 87, 84) << 8) | /* 4b */ + (brw_inst_bits(src, 83, 81) << 5) | /* 3b */ + (brw_inst_bits(src, 80, 80) << 4) | /* 1b */ + (brw_inst_bits(src, 65, 64) << 2) | /* 2b */ + (brw_inst_bits(src, 45, 44)); /* 2b */ + } else { + table_len = ARRAY_SIZE(gen8_src_index_table); + uncompacted = brw_inst_bits(src, 88, 77); /* 12b */ + } + + for (int i = 0; i < table_len; i++) { + if (src0_index_table[i] == uncompacted) { + brw_compact_inst_set_src0_index(devinfo, dst, i); return true; } } @@ -790,57 +1105,153 @@ } static bool -set_src0_index(const struct gen_device_info *devinfo, - brw_compact_inst *dst, const brw_inst *src) +set_src1_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, + const brw_inst *src, bool is_immediate, unsigned imm) { - uint16_t compacted; - uint16_t uncompacted = brw_inst_bits(src, 88, 77); /* 12b */ + if (is_immediate) { + if (devinfo->gen >= 12) { + /* src1 index takes the low 4 bits of the 12-bit compacted value */ + brw_compact_inst_set_src1_index(devinfo, dst, imm & 0xf); + } else { + /* src1 index takes the high 5 bits of the 13-bit compacted value */ + brw_compact_inst_set_src1_index(devinfo, dst, imm >> 8); + } + return true; + } else { + uint16_t uncompacted; /* 12b */ + int table_len; - if (!get_src_index(uncompacted, &compacted)) - return false; + if (devinfo->gen >= 12) { + table_len = ARRAY_SIZE(gen12_src0_index_table); + uncompacted = (brw_inst_bits(src, 121, 120) << 10) | /* 2b */ + (brw_inst_bits(src, 119, 116) << 6) | /* 4b */ + (brw_inst_bits(src, 115, 113) << 3) | /* 3b */ + (brw_inst_bits(src, 112, 112) << 2) | /* 1b */ + (brw_inst_bits(src, 97, 96)); /* 2b */ + } else { + table_len = ARRAY_SIZE(gen8_src_index_table); + uncompacted = brw_inst_bits(src, 120, 109); /* 12b */ + } - brw_compact_inst_set_src0_index(devinfo, dst, compacted); + for (int i = 0; i < table_len; i++) { + if (src1_index_table[i] == uncompacted) { + brw_compact_inst_set_src1_index(devinfo, dst, i); + return true; + } + } + } - return true; + return false; } static bool -set_src1_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, - const brw_inst *src, bool is_immediate) +set_3src_control_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, const brw_inst *src) { - uint16_t compacted; + assert(devinfo->gen >= 8); - if (is_immediate) { - compacted = (brw_inst_imm_ud(devinfo, src) >> 8) & 0x1f; + if (devinfo->gen >= 12) { + uint64_t uncompacted = /* 36b/TGL+ */ + (brw_inst_bits(src, 95, 92) << 32) | /* 4b */ + (brw_inst_bits(src, 90, 88) << 29) | /* 3b */ + (brw_inst_bits(src, 82, 80) << 26) | /* 3b */ + (brw_inst_bits(src, 50, 50) << 25) | /* 1b */ + (brw_inst_bits(src, 48, 48) << 24) | /* 1b */ + (brw_inst_bits(src, 42, 40) << 21) | /* 3b */ + (brw_inst_bits(src, 39, 39) << 20) | /* 1b */ + (brw_inst_bits(src, 38, 36) << 17) | /* 3b */ + (brw_inst_bits(src, 34, 34) << 16) | /* 1b */ + (brw_inst_bits(src, 33, 33) << 15) | /* 1b */ + (brw_inst_bits(src, 32, 32) << 14) | /* 1b */ + (brw_inst_bits(src, 31, 31) << 13) | /* 1b */ + (brw_inst_bits(src, 28, 28) << 12) | /* 1b */ + (brw_inst_bits(src, 27, 24) << 8) | /* 4b */ + (brw_inst_bits(src, 23, 23) << 7) | /* 1b */ + (brw_inst_bits(src, 22, 22) << 6) | /* 1b */ + (brw_inst_bits(src, 21, 19) << 3) | /* 3b */ + (brw_inst_bits(src, 18, 16)); /* 3b */ + + for (unsigned i = 0; i < ARRAY_SIZE(gen12_3src_control_index_table); i++) { + if (gen12_3src_control_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_control_index(devinfo, dst, i); + return true; + } + } } else { - uint16_t uncompacted = brw_inst_bits(src, 120, 109); /* 12b */ + uint32_t uncompacted = /* 24b/BDW; 26b/CHV/SKL+ */ + (brw_inst_bits(src, 34, 32) << 21) | /* 3b */ + (brw_inst_bits(src, 28, 8)); /* 21b */ + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + uncompacted |= + brw_inst_bits(src, 36, 35) << 24; /* 2b */ + } - if (!get_src_index(uncompacted, &compacted)) - return false; + for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) { + if (gen8_3src_control_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_control_index(devinfo, dst, i); + return true; + } + } } - brw_compact_inst_set_src1_index(devinfo, dst, compacted); - - return true; + return false; } static bool -set_3src_control_index(const struct gen_device_info *devinfo, - brw_compact_inst *dst, const brw_inst *src) +set_3src_source_index(const struct gen_device_info *devinfo, + brw_compact_inst *dst, const brw_inst *src) { assert(devinfo->gen >= 8); - uint32_t uncompacted = /* 24b/BDW; 26b/CHV */ - (brw_inst_bits(src, 34, 32) << 21) | /* 3b */ - (brw_inst_bits(src, 28, 8)); /* 21b */ - - if (devinfo->gen >= 9 || devinfo->is_cherryview) - uncompacted |= brw_inst_bits(src, 36, 35) << 24; /* 2b */ - - for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_control_index_table); i++) { - if (gen8_3src_control_index_table[i] == uncompacted) { - brw_compact_inst_set_3src_control_index(devinfo, dst, i); - return true; + if (devinfo->gen >= 12) { + uint32_t uncompacted = /* 21b/TGL+ */ + (brw_inst_bits(src, 114, 114) << 20) | /* 1b */ + (brw_inst_bits(src, 113, 112) << 18) | /* 2b */ + (brw_inst_bits(src, 98, 98) << 17) | /* 1b */ + (brw_inst_bits(src, 97, 96) << 15) | /* 2b */ + (brw_inst_bits(src, 91, 91) << 14) | /* 1b */ + (brw_inst_bits(src, 87, 86) << 12) | /* 2b */ + (brw_inst_bits(src, 85, 84) << 10) | /* 2b */ + (brw_inst_bits(src, 83, 83) << 9) | /* 1b */ + (brw_inst_bits(src, 66, 66) << 8) | /* 1b */ + (brw_inst_bits(src, 65, 64) << 6) | /* 2b */ + (brw_inst_bits(src, 47, 47) << 5) | /* 1b */ + (brw_inst_bits(src, 46, 46) << 4) | /* 1b */ + (brw_inst_bits(src, 45, 44) << 2) | /* 2b */ + (brw_inst_bits(src, 43, 43) << 1) | /* 1b */ + (brw_inst_bits(src, 35, 35)); /* 1b */ + + for (unsigned i = 0; i < ARRAY_SIZE(gen12_3src_source_index_table); i++) { + if (gen12_3src_source_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_source_index(devinfo, dst, i); + return true; + } + } + } else { + uint64_t uncompacted = /* 46b/BDW; 49b/CHV/SKL+ */ + (brw_inst_bits(src, 83, 83) << 43) | /* 1b */ + (brw_inst_bits(src, 114, 107) << 35) | /* 8b */ + (brw_inst_bits(src, 93, 86) << 27) | /* 8b */ + (brw_inst_bits(src, 72, 65) << 19) | /* 8b */ + (brw_inst_bits(src, 55, 37)); /* 19b */ + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + uncompacted |= + (brw_inst_bits(src, 126, 125) << 47) | /* 2b */ + (brw_inst_bits(src, 105, 104) << 45) | /* 2b */ + (brw_inst_bits(src, 84, 84) << 44); /* 1b */ + } else { + uncompacted |= + (brw_inst_bits(src, 125, 125) << 45) | /* 1b */ + (brw_inst_bits(src, 104, 104) << 44); /* 1b */ + } + + for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) { + if (gen8_3src_source_index_table[i] == uncompacted) { + brw_compact_inst_set_3src_source_index(devinfo, dst, i); + return true; + } } } @@ -848,32 +1259,20 @@ } static bool -set_3src_source_index(const struct gen_device_info *devinfo, +set_3src_subreg_index(const struct gen_device_info *devinfo, brw_compact_inst *dst, const brw_inst *src) { - assert(devinfo->gen >= 8); + assert(devinfo->gen >= 12); - uint64_t uncompacted = /* 46b/BDW; 49b/CHV */ - (brw_inst_bits(src, 83, 83) << 43) | /* 1b */ - (brw_inst_bits(src, 114, 107) << 35) | /* 8b */ - (brw_inst_bits(src, 93, 86) << 27) | /* 8b */ - (brw_inst_bits(src, 72, 65) << 19) | /* 8b */ - (brw_inst_bits(src, 55, 37)); /* 19b */ - - if (devinfo->gen >= 9 || devinfo->is_cherryview) { - uncompacted |= - (brw_inst_bits(src, 126, 125) << 47) | /* 2b */ - (brw_inst_bits(src, 105, 104) << 45) | /* 2b */ - (brw_inst_bits(src, 84, 84) << 44); /* 1b */ - } else { - uncompacted |= - (brw_inst_bits(src, 125, 125) << 45) | /* 1b */ - (brw_inst_bits(src, 104, 104) << 44); /* 1b */ - } - - for (unsigned i = 0; i < ARRAY_SIZE(gen8_3src_source_index_table); i++) { - if (gen8_3src_source_index_table[i] == uncompacted) { - brw_compact_inst_set_3src_source_index(devinfo, dst, i); + uint32_t uncompacted = /* 20b/TGL+ */ + (brw_inst_bits(src, 119, 115) << 15) | /* 5b */ + (brw_inst_bits(src, 103, 99) << 10) | /* 5b */ + (brw_inst_bits(src, 71, 67) << 5) | /* 5b */ + (brw_inst_bits(src, 55, 51)); /* 5b */ + + for (unsigned i = 0; i < ARRAY_SIZE(gen12_3src_subreg_table); i++) { + if (gen12_3src_subreg_table[i] == uncompacted) { + brw_compact_inst_set_3src_subreg_index(devinfo, dst, i); return true; } } @@ -899,7 +1298,10 @@ * - Imm64[27:31] (bits 91-95 on Gen7, bit 95 on Gen8) * - UIP[31] (bit 95 on Gen8) */ - if (devinfo->gen >= 8) { + if (devinfo->gen >= 12) { + assert(!brw_inst_bits(src, 7, 7)); + return false; + } else if (devinfo->gen >= 8) { assert(!brw_inst_bits(src, 7, 7)); return brw_inst_bits(src, 95, 95) || brw_inst_bits(src, 47, 47) || @@ -920,7 +1322,9 @@ * fields of the compacted instruction. All of them seem to be reserved * bits currently. */ - if (devinfo->gen >= 9 || devinfo->is_cherryview) { + if (devinfo->gen >= 12) { + assert(!brw_inst_bits(src, 7, 7)); + } else if (devinfo->gen >= 9 || devinfo->is_cherryview) { assert(!brw_inst_bits(src, 127, 127) && !brw_inst_bits(src, 7, 7)); } else { @@ -952,7 +1356,7 @@ #define compact_a16(field) \ brw_compact_inst_set_3src_##field(devinfo, dst, brw_inst_3src_a16_##field(devinfo, src)) - compact(opcode); + compact(hw_opcode); if (!set_3src_control_index(devinfo, dst, src)) return false; @@ -960,19 +1364,31 @@ if (!set_3src_source_index(devinfo, dst, src)) return false; - compact(dst_reg_nr); - compact_a16(src0_rep_ctrl); + if (devinfo->gen >= 12) { + if (!set_3src_subreg_index(devinfo, dst, src)) + return false; + + compact(swsb); + compact(debug_control); + compact(dst_reg_nr); + compact(src0_reg_nr); + compact(src1_reg_nr); + compact(src2_reg_nr); + } else { + compact(dst_reg_nr); + compact_a16(src0_rep_ctrl); + compact(debug_control); + compact(saturate); + compact_a16(src1_rep_ctrl); + compact_a16(src2_rep_ctrl); + compact(src0_reg_nr); + compact(src1_reg_nr); + compact(src2_reg_nr); + compact_a16(src0_subreg_nr); + compact_a16(src1_subreg_nr); + compact_a16(src2_subreg_nr); + } brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true); - compact(debug_control); - compact(saturate); - compact_a16(src1_rep_ctrl); - compact_a16(src2_rep_ctrl); - compact(src0_reg_nr); - compact(src1_reg_nr); - compact(src2_reg_nr); - compact_a16(src0_subreg_nr); - compact_a16(src1_subreg_nr); - compact_a16(src2_subreg_nr); #undef compact #undef compact_a16 @@ -980,20 +1396,143 @@ return true; } -/* Compacted instructions have 12-bits for immediate sources, and a 13th bit - * that's replicated through the high 20 bits. +/* On SNB through ICL, compacted instructions have 12-bits for immediate + * sources, and a 13th bit that's replicated through the high 20 bits. * * Effectively this means we get 12-bit integers, 0.0f, and some limited uses * of packed vectors as compactable immediates. + * + * On TGL+, the high 12-bits of floating-point values (:f and :hf) are encoded + * rather than the low 12-bits. For signed integer the 12th bit is replicated, + * while for unsigned integers it is not. + * + * Returns the compacted immediate, or -1 if immediate cannot be compacted */ +static int +compact_immediate(const struct gen_device_info *devinfo, + enum brw_reg_type type, unsigned imm) +{ + if (devinfo->gen >= 12) { + /* 16-bit immediates need to be replicated through the 32-bit immediate + * field + */ + switch (type) { + case BRW_REGISTER_TYPE_W: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_HF: + if ((imm >> 16) != (imm & 0xffff)) + return -1; + break; + default: + break; + } + + switch (type) { + case BRW_REGISTER_TYPE_F: + /* We get the high 12-bits as-is; rest must be zero */ + if ((imm & 0xfffff) == 0) + return (imm >> 20) & 0xfff; + break; + case BRW_REGISTER_TYPE_HF: + /* We get the high 12-bits as-is; rest must be zero */ + if ((imm & 0xf) == 0) + return (imm >> 4) & 0xfff; + break; + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_VF: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + /* We get the low 12-bits as-is; rest must be zero */ + if ((imm & 0xfffff000) == 0) + return imm & 0xfff; + break; + case BRW_REGISTER_TYPE_UW: + /* We get the low 12-bits as-is; rest must be zero */ + if ((imm & 0xf000) == 0) + return imm & 0xfff; + break; + case BRW_REGISTER_TYPE_D: + /* We get the low 11-bits as-is; 12th is replicated */ + if (((int)imm >> 11) == 0 || ((int)imm >> 11) == -1) + return imm & 0xfff; + break; + case BRW_REGISTER_TYPE_W: + /* We get the low 11-bits as-is; 12th is replicated */ + if (((short)imm >> 11) == 0 || ((short)imm >> 11) == -1) + return imm & 0xfff; + break; + case BRW_REGISTER_TYPE_NF: + case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + return -1; + } + } else { + /* We get the low 12 bits as-is; 13th is replicated */ + if (((int)imm >> 12) == 0 || ((int)imm >> 12 == -1)) { + return imm & 0x1fff; + } + } + + return -1; +} + +static int +uncompact_immediate(const struct gen_device_info *devinfo, + enum brw_reg_type type, unsigned compact_imm) +{ + if (devinfo->gen >= 12) { + switch (type) { + case BRW_REGISTER_TYPE_F: + return compact_imm << 20; + case BRW_REGISTER_TYPE_HF: + return (compact_imm << 20) | (compact_imm << 4); + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_VF: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + return compact_imm; + case BRW_REGISTER_TYPE_UW: + /* Replicate */ + return compact_imm << 16 | compact_imm; + case BRW_REGISTER_TYPE_D: + /* Extend the 12th bit into the high 20 bits */ + return (int)(compact_imm << 20) >> 20; + case BRW_REGISTER_TYPE_W: + /* Extend the 12th bit into the high 4 bits and replicate */ + return ( (int)(compact_imm << 20) >> 4) | + ((short)(compact_imm << 4) >> 4); + case BRW_REGISTER_TYPE_NF: + case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_B: + case BRW_REGISTER_TYPE_UB: + unreachable("not reached"); + } + } else { + /* Replicate the 13th bit into the high 19 bits */ + return (int)(compact_imm << 19) >> 19; + } + + unreachable("not reached"); +} + static bool -is_compactable_immediate(unsigned imm) +has_immediate(const struct gen_device_info *devinfo, const brw_inst *inst, + enum brw_reg_type *type) { - /* We get the low 12 bits as-is. */ - imm &= ~0xfff; + if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { + *type = brw_inst_src0_type(devinfo, inst); + return *type != INVALID_REG_TYPE; + } else if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { + *type = brw_inst_src1_type(devinfo, inst); + return *type != INVALID_REG_TYPE; + } - /* We get one bit replicated through the top 20 bits. */ - return imm == 0 || imm == 0xfffff000; + return false; } /** @@ -1039,8 +1578,7 @@ (brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_DF || brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_UQ || brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_Q))) { - enum brw_reg_file file = brw_inst_src1_reg_file(devinfo, &inst); - brw_inst_set_src1_file_type(devinfo, &inst, file, BRW_REGISTER_TYPE_UD); + brw_inst_set_src1_reg_hw_type(devinfo, &inst, 0); } /* Compacted instructions only have 12-bits (plus 1 for the other 20) @@ -1053,8 +1591,12 @@ * to do this there. * * If we see a 0.0:F, change the type to VF so that it can be compacted. + * + * Compaction of floating-point immediates is improved on Gen12, thus + * removing the need for this. */ - if (brw_inst_imm_ud(devinfo, &inst) == 0x0 && + if (devinfo->gen < 12 && + brw_inst_imm_ud(devinfo, &inst) == 0x0 && brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_F && brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_F && brw_inst_dst_hstride(devinfo, &inst) == BRW_HORIZONTAL_STRIDE_1) { @@ -1064,8 +1606,12 @@ /* There are no mappings for dst:d | i:d, so if the immediate is suitable * set the types to :UD so the instruction can be compacted. + * + * FINISHME: Use dst:f | imm:f on Gen12 */ - if (is_compactable_immediate(brw_inst_imm_ud(devinfo, &inst)) && + if (devinfo->gen < 12 && + compact_immediate(devinfo, BRW_REGISTER_TYPE_D, + brw_inst_imm_ud(devinfo, &inst)) != -1 && brw_inst_cond_modifier(devinfo, &inst) == BRW_CONDITIONAL_NONE && brw_inst_src0_type(devinfo, &inst) == BRW_REGISTER_TYPE_D && brw_inst_dst_type(devinfo, &inst) == BRW_REGISTER_TYPE_D) { @@ -1107,13 +1653,20 @@ } } - bool is_immediate = - brw_inst_src0_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE || - brw_inst_src1_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE; - if (is_immediate && - (devinfo->gen < 6 || - !is_compactable_immediate(brw_inst_imm_ud(devinfo, src)))) { - return false; + enum brw_reg_type type; + bool is_immediate = has_immediate(devinfo, src, &type); + + unsigned compacted_imm = 0; + + if (is_immediate) { + /* Instructions with immediates cannot be compacted on Gen < 6 */ + if (devinfo->gen < 6) + return false; + + compacted_imm = compact_immediate(devinfo, type, + brw_inst_imm_ud(devinfo, src)); + if (compacted_imm == -1) + return false; } if (has_unmapped_bits(devinfo, src)) @@ -1123,49 +1676,61 @@ #define compact(field) \ brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src)) +#define compact_reg(field) \ + brw_compact_inst_set_##field##_reg_nr(devinfo, &temp, \ + brw_inst_##field##_da_reg_nr(devinfo, src)) - compact(opcode); + compact(hw_opcode); compact(debug_control); if (!set_control_index(devinfo, &temp, src)) return false; - if (!set_datatype_index(devinfo, &temp, src)) + if (!set_datatype_index(devinfo, &temp, src, is_immediate)) return false; if (!set_subreg_index(devinfo, &temp, src, is_immediate)) return false; + if (!set_src0_index(devinfo, &temp, src)) + return false; + if (!set_src1_index(devinfo, &temp, src, is_immediate, compacted_imm)) + return false; - if (devinfo->gen >= 6) { - compact(acc_wr_control); + if (devinfo->gen >= 12) { + compact(swsb); + compact_reg(dst); + compact_reg(src0); + + if (is_immediate) { + /* src1 reg takes the high 8 bits (of the 12-bit compacted value) */ + brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm >> 4); + } else { + compact_reg(src1); + } } else { - compact(mask_control_ex); - } + if (devinfo->gen >= 6) { + compact(acc_wr_control); + } else { + compact(mask_control_ex); + } - compact(cond_modifier); + if (devinfo->gen <= 6) + compact(flag_subreg_nr); - if (devinfo->gen <= 6) - compact(flag_subreg_nr); + compact(cond_modifier); - brw_compact_inst_set_cmpt_control(devinfo, &temp, true); + compact_reg(dst); + compact_reg(src0); - if (!set_src0_index(devinfo, &temp, src)) - return false; - if (!set_src1_index(devinfo, &temp, src, is_immediate)) - return false; - - brw_compact_inst_set_dst_reg_nr(devinfo, &temp, - brw_inst_dst_da_reg_nr(devinfo, src)); - brw_compact_inst_set_src0_reg_nr(devinfo, &temp, - brw_inst_src0_da_reg_nr(devinfo, src)); - - if (is_immediate) { - brw_compact_inst_set_src1_reg_nr(devinfo, &temp, - brw_inst_imm_ud(devinfo, src) & 0xff); - } else { - brw_compact_inst_set_src1_reg_nr(devinfo, &temp, - brw_inst_src1_da_reg_nr(devinfo, src)); + if (is_immediate) { + /* src1 reg takes the low 8 bits (of the 13-bit compacted value) */ + brw_compact_inst_set_src1_reg_nr(devinfo, &temp, compacted_imm & 0xff); + } else { + compact_reg(src1); + } } + brw_compact_inst_set_cmpt_control(devinfo, &temp, true); #undef compact +#undef compact_reg *dst = temp; @@ -1179,7 +1744,18 @@ uint32_t uncompacted = control_index_table[brw_compact_inst_control_index(devinfo, src)]; - if (devinfo->gen >= 8) { + if (devinfo->gen >= 12) { + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 17)); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 22, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + } else if (devinfo->gen >= 8) { brw_inst_set_bits(dst, 33, 31, (uncompacted >> 16)); brw_inst_set_bits(dst, 23, 12, (uncompacted >> 4) & 0xfff); brw_inst_set_bits(dst, 10, 9, (uncompacted >> 2) & 0x3); @@ -1201,7 +1777,18 @@ uint32_t uncompacted = datatype_table[brw_compact_inst_datatype_index(devinfo, src)]; - if (devinfo->gen >= 8) { + if (devinfo->gen >= 12) { + brw_inst_set_bits(dst, 98, 98, (uncompacted >> 19)); + brw_inst_set_bits(dst, 91, 88, (uncompacted >> 15) & 0xf); + brw_inst_set_bits(dst, 66, 66, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 11) & 0x3); + brw_inst_set_bits(dst, 47, 47, (uncompacted >> 10) & 0x1); + brw_inst_set_bits(dst, 46, 46, (uncompacted >> 9) & 0x1); + brw_inst_set_bits(dst, 43, 40, (uncompacted >> 5) & 0xf); + brw_inst_set_bits(dst, 39, 36, (uncompacted >> 1) & 0xf); + brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1); + } else if (devinfo->gen >= 8) { brw_inst_set_bits(dst, 63, 61, (uncompacted >> 18)); brw_inst_set_bits(dst, 94, 89, (uncompacted >> 12) & 0x3f); brw_inst_set_bits(dst, 46, 35, (uncompacted >> 0) & 0xfff); @@ -1218,9 +1805,15 @@ uint16_t uncompacted = subreg_table[brw_compact_inst_subreg_index(devinfo, src)]; - brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10)); - brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f); - brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f); + if (devinfo->gen >= 12) { + brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10)); + brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f); + } else { + brw_inst_set_bits(dst, 100, 96, (uncompacted >> 10)); + brw_inst_set_bits(dst, 68, 64, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 52, 48, (uncompacted >> 0) & 0x1f); + } } static void @@ -1228,23 +1821,33 @@ brw_compact_inst *src) { uint32_t compacted = brw_compact_inst_src0_index(devinfo, src); - uint16_t uncompacted = src_index_table[compacted]; + uint16_t uncompacted = src0_index_table[compacted]; - brw_inst_set_bits(dst, 88, 77, uncompacted); + if (devinfo->gen >= 12) { + brw_inst_set_bits(dst, 87, 84, (uncompacted >> 8)); + brw_inst_set_bits(dst, 83, 81, (uncompacted >> 5) & 0x7); + brw_inst_set_bits(dst, 80, 80, (uncompacted >> 4) & 0x1); + brw_inst_set_bits(dst, 65, 64, (uncompacted >> 2) & 0x3); + brw_inst_set_bits(dst, 45, 44, (uncompacted >> 0) & 0x3); + } else { + brw_inst_set_bits(dst, 88, 77, uncompacted); + } } static void set_uncompacted_src1(const struct gen_device_info *devinfo, brw_inst *dst, - brw_compact_inst *src, bool is_immediate) + brw_compact_inst *src) { - if (is_immediate) { - signed high5 = brw_compact_inst_src1_index(devinfo, src); - /* Replicate top bit of src1_index into high 20 bits of the immediate. */ - brw_inst_set_imm_ud(devinfo, dst, (high5 << 27) >> 19); - } else { - uint16_t uncompacted = - src_index_table[brw_compact_inst_src1_index(devinfo, src)]; + uint16_t uncompacted = + src1_index_table[brw_compact_inst_src1_index(devinfo, src)]; + if (devinfo->gen >= 12) { + brw_inst_set_bits(dst, 121, 120, (uncompacted >> 10)); + brw_inst_set_bits(dst, 119, 116, (uncompacted >> 6) & 0xf); + brw_inst_set_bits(dst, 115, 113, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 112, 112, (uncompacted >> 2) & 0x1); + brw_inst_set_bits(dst, 97, 96, (uncompacted >> 0) & 0x3); + } else { brw_inst_set_bits(dst, 120, 109, uncompacted); } } @@ -1255,14 +1858,38 @@ { assert(devinfo->gen >= 8); - uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src); - uint32_t uncompacted = gen8_3src_control_index_table[compacted]; + if (devinfo->gen >= 12) { + uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint64_t uncompacted = gen12_3src_control_index_table[compacted]; + + brw_inst_set_bits(dst, 95, 92, (uncompacted >> 32)); + brw_inst_set_bits(dst, 90, 88, (uncompacted >> 29) & 0x7); + brw_inst_set_bits(dst, 82, 80, (uncompacted >> 26) & 0x7); + brw_inst_set_bits(dst, 50, 50, (uncompacted >> 25) & 0x1); + brw_inst_set_bits(dst, 48, 48, (uncompacted >> 24) & 0x1); + brw_inst_set_bits(dst, 42, 40, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 39, 39, (uncompacted >> 20) & 0x1); + brw_inst_set_bits(dst, 38, 36, (uncompacted >> 17) & 0x7); + brw_inst_set_bits(dst, 34, 34, (uncompacted >> 16) & 0x1); + brw_inst_set_bits(dst, 33, 33, (uncompacted >> 15) & 0x1); + brw_inst_set_bits(dst, 32, 32, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 31, 31, (uncompacted >> 13) & 0x1); + brw_inst_set_bits(dst, 28, 28, (uncompacted >> 12) & 0x1); + brw_inst_set_bits(dst, 27, 24, (uncompacted >> 8) & 0xf); + brw_inst_set_bits(dst, 23, 23, (uncompacted >> 7) & 0x1); + brw_inst_set_bits(dst, 22, 22, (uncompacted >> 6) & 0x1); + brw_inst_set_bits(dst, 21, 19, (uncompacted >> 3) & 0x7); + brw_inst_set_bits(dst, 18, 16, (uncompacted >> 0) & 0x7); + } else { + uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src); + uint32_t uncompacted = gen8_3src_control_index_table[compacted]; - brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); - brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff); + brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); + brw_inst_set_bits(dst, 28, 8, (uncompacted >> 0) & 0x1fffff); - if (devinfo->gen >= 9 || devinfo->is_cherryview) - brw_inst_set_bits(dst, 36, 35, (uncompacted >> 24) & 0x3); + if (devinfo->gen >= 9 || devinfo->is_cherryview) + brw_inst_set_bits(dst, 36, 35, (uncompacted >> 24) & 0x3); + } } static void @@ -1272,25 +1899,61 @@ assert(devinfo->gen >= 8); uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src); - uint64_t uncompacted = gen8_3src_source_index_table[compacted]; - brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1); - brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff); - brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff); - brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff); - brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff); - - if (devinfo->gen >= 9 || devinfo->is_cherryview) { - brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3); - brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3); - brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1); + if (devinfo->gen >= 12) { + uint32_t uncompacted = gen12_3src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 114, 114, (uncompacted >> 20)); + brw_inst_set_bits(dst, 113, 112, (uncompacted >> 18) & 0x3); + brw_inst_set_bits(dst, 98, 98, (uncompacted >> 17) & 0x1); + brw_inst_set_bits(dst, 97, 96, (uncompacted >> 15) & 0x3); + brw_inst_set_bits(dst, 91, 91, (uncompacted >> 14) & 0x1); + brw_inst_set_bits(dst, 87, 86, (uncompacted >> 12) & 0x3); + brw_inst_set_bits(dst, 85, 84, (uncompacted >> 10) & 0x3); + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 9) & 0x1); + brw_inst_set_bits(dst, 66, 66, (uncompacted >> 8) & 0x1); + brw_inst_set_bits(dst, 65, 64, (uncompacted >> 6) & 0x3); + brw_inst_set_bits(dst, 47, 47, (uncompacted >> 5) & 0x1); + brw_inst_set_bits(dst, 46, 46, (uncompacted >> 4) & 0x1); + brw_inst_set_bits(dst, 45, 44, (uncompacted >> 2) & 0x3); + brw_inst_set_bits(dst, 43, 43, (uncompacted >> 1) & 0x1); + brw_inst_set_bits(dst, 35, 35, (uncompacted >> 0) & 0x1); } else { - brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1); - brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1); + uint64_t uncompacted = gen8_3src_source_index_table[compacted]; + + brw_inst_set_bits(dst, 83, 83, (uncompacted >> 43) & 0x1); + brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff); + brw_inst_set_bits(dst, 93, 86, (uncompacted >> 27) & 0xff); + brw_inst_set_bits(dst, 72, 65, (uncompacted >> 19) & 0xff); + brw_inst_set_bits(dst, 55, 37, (uncompacted >> 0) & 0x7ffff); + + if (devinfo->gen >= 9 || devinfo->is_cherryview) { + brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3); + brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3); + brw_inst_set_bits(dst, 84, 84, (uncompacted >> 44) & 0x1); + } else { + brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1); + brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1); + } } } static void +set_uncompacted_3src_subreg_index(const struct gen_device_info *devinfo, + brw_inst *dst, brw_compact_inst *src) +{ + assert(devinfo->gen >= 12); + + uint32_t compacted = brw_compact_inst_3src_subreg_index(devinfo, src); + uint32_t uncompacted = gen12_3src_subreg_table[compacted]; + + brw_inst_set_bits(dst, 119, 115, (uncompacted >> 15)); + brw_inst_set_bits(dst, 103, 99, (uncompacted >> 10) & 0x1f); + brw_inst_set_bits(dst, 71, 67, (uncompacted >> 5) & 0x1f); + brw_inst_set_bits(dst, 55, 51, (uncompacted >> 0) & 0x1f); +} + +static void brw_uncompact_3src_instruction(const struct gen_device_info *devinfo, brw_inst *dst, brw_compact_inst *src) { @@ -1301,24 +1964,37 @@ #define uncompact_a16(field) \ brw_inst_set_3src_a16_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src)) - uncompact(opcode); + uncompact(hw_opcode); - set_uncompacted_3src_control_index(devinfo, dst, src); - set_uncompacted_3src_source_index(devinfo, dst, src); + if (devinfo->gen >= 12) { + set_uncompacted_3src_control_index(devinfo, dst, src); + set_uncompacted_3src_source_index(devinfo, dst, src); + set_uncompacted_3src_subreg_index(devinfo, dst, src); + + uncompact(debug_control); + uncompact(swsb); + uncompact(dst_reg_nr); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + } else { + set_uncompacted_3src_control_index(devinfo, dst, src); + set_uncompacted_3src_source_index(devinfo, dst, src); - uncompact(dst_reg_nr); - uncompact_a16(src0_rep_ctrl); + uncompact(dst_reg_nr); + uncompact_a16(src0_rep_ctrl); + uncompact(debug_control); + uncompact(saturate); + uncompact_a16(src1_rep_ctrl); + uncompact_a16(src2_rep_ctrl); + uncompact(src0_reg_nr); + uncompact(src1_reg_nr); + uncompact(src2_reg_nr); + uncompact_a16(src0_subreg_nr); + uncompact_a16(src1_subreg_nr); + uncompact_a16(src2_subreg_nr); + } brw_inst_set_3src_cmpt_control(devinfo, dst, false); - uncompact(debug_control); - uncompact(saturate); - uncompact_a16(src1_rep_ctrl); - uncompact_a16(src2_rep_ctrl); - uncompact(src0_reg_nr); - uncompact(src1_reg_nr); - uncompact(src2_reg_nr); - uncompact_a16(src0_subreg_nr); - uncompact_a16(src1_subreg_nr); - uncompact_a16(src2_subreg_nr); #undef uncompact #undef uncompact_a16 @@ -1331,55 +2007,59 @@ memset(dst, 0, sizeof(*dst)); if (devinfo->gen >= 8 && - is_3src(devinfo, brw_compact_inst_3src_opcode(devinfo, src))) { + is_3src(devinfo, brw_opcode_decode( + devinfo, brw_compact_inst_3src_hw_opcode(devinfo, src)))) { brw_uncompact_3src_instruction(devinfo, dst, src); return; } #define uncompact(field) \ brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src)) +#define uncompact_reg(field) \ + brw_inst_set_##field##_da_reg_nr(devinfo, dst, \ + brw_compact_inst_##field##_reg_nr(devinfo, src)) - uncompact(opcode); + uncompact(hw_opcode); uncompact(debug_control); set_uncompacted_control(devinfo, dst, src); set_uncompacted_datatype(devinfo, dst, src); - - /* src0/1 register file fields are in the datatype table. */ - bool is_immediate = brw_inst_src0_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE || - brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE; - set_uncompacted_subreg(devinfo, dst, src); + set_uncompacted_src0(devinfo, dst, src); - if (devinfo->gen >= 6) { - uncompact(acc_wr_control); + enum brw_reg_type type; + if (has_immediate(devinfo, dst, &type)) { + unsigned imm = uncompact_immediate(devinfo, type, + brw_compact_inst_imm(devinfo, src)); + brw_inst_set_imm_ud(devinfo, dst, imm); } else { - uncompact(mask_control_ex); + set_uncompacted_src1(devinfo, dst, src); + uncompact_reg(src1); } - uncompact(cond_modifier); - - if (devinfo->gen <= 6) - uncompact(flag_subreg_nr); + if (devinfo->gen >= 12) { + uncompact(swsb); + uncompact_reg(dst); + uncompact_reg(src0); + } else { + if (devinfo->gen >= 6) { + uncompact(acc_wr_control); + } else { + uncompact(mask_control_ex); + } - set_uncompacted_src0(devinfo, dst, src); - set_uncompacted_src1(devinfo, dst, src, is_immediate); + uncompact(cond_modifier); - brw_inst_set_dst_da_reg_nr(devinfo, dst, - brw_compact_inst_dst_reg_nr(devinfo, src)); - brw_inst_set_src0_da_reg_nr(devinfo, dst, - brw_compact_inst_src0_reg_nr(devinfo, src)); + if (devinfo->gen <= 6) + uncompact(flag_subreg_nr); - if (is_immediate) { - brw_inst_set_imm_ud(devinfo, dst, - brw_inst_imm_ud(devinfo, dst) | - brw_compact_inst_src1_reg_nr(devinfo, src)); - } else { - brw_inst_set_src1_da_reg_nr(devinfo, dst, - brw_compact_inst_src1_reg_nr(devinfo, src)); + uncompact_reg(dst); + uncompact_reg(src0); } + brw_inst_set_cmpt_control(devinfo, dst, false); #undef uncompact +#undef uncompact_reg } void brw_debug_compact_uncompact(const struct gen_device_info *devinfo, @@ -1489,13 +2169,26 @@ assert(gen8_subreg_table[ARRAY_SIZE(gen8_subreg_table) - 1] != 0); assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0); assert(gen11_datatype_table[ARRAY_SIZE(gen11_datatype_table) - 1] != 0); + assert(gen12_control_index_table[ARRAY_SIZE(gen12_control_index_table) - 1] != 0); + assert(gen12_datatype_table[ARRAY_SIZE(gen12_datatype_table) - 1] != 0); + assert(gen12_subreg_table[ARRAY_SIZE(gen12_subreg_table) - 1] != 0); + assert(gen12_src0_index_table[ARRAY_SIZE(gen12_src0_index_table) - 1] != 0); + assert(gen12_src1_index_table[ARRAY_SIZE(gen12_src1_index_table) - 1] != 0); switch (devinfo->gen) { + case 12: + control_index_table = gen12_control_index_table;; + datatype_table = gen12_datatype_table; + subreg_table = gen12_subreg_table; + src0_index_table = gen12_src0_index_table; + src1_index_table = gen12_src1_index_table; + break; case 11: control_index_table = gen8_control_index_table; datatype_table = gen11_datatype_table; subreg_table = gen8_subreg_table; - src_index_table = gen8_src_index_table; + src0_index_table = gen8_src_index_table; + src1_index_table = gen8_src_index_table; break; case 10: case 9: @@ -1503,26 +2196,30 @@ control_index_table = gen8_control_index_table; datatype_table = gen8_datatype_table; subreg_table = gen8_subreg_table; - src_index_table = gen8_src_index_table; + src0_index_table = gen8_src_index_table; + src1_index_table = gen8_src_index_table; break; case 7: control_index_table = gen7_control_index_table; datatype_table = gen7_datatype_table; subreg_table = gen7_subreg_table; - src_index_table = gen7_src_index_table; + src0_index_table = gen7_src_index_table; + src1_index_table = gen7_src_index_table; break; case 6: control_index_table = gen6_control_index_table; datatype_table = gen6_datatype_table; subreg_table = gen6_subreg_table; - src_index_table = gen6_src_index_table; + src0_index_table = gen6_src_index_table; + src1_index_table = gen6_src_index_table; break; case 5: case 4: control_index_table = g45_control_index_table; datatype_table = g45_datatype_table; subreg_table = g45_subreg_table; - src_index_table = g45_src_index_table; + src0_index_table = g45_src_index_table; + src1_index_table = g45_src_index_table; break; default: unreachable("unknown generation"); @@ -1581,7 +2278,8 @@ if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){ brw_compact_inst *align = store + offset; memset(align, 0, sizeof(*align)); - brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(devinfo, BRW_OPCODE_NENOP)); brw_compact_inst_set_cmpt_control(devinfo, align, true); offset += sizeof(brw_compact_inst); compacted_count--; @@ -1685,6 +2383,9 @@ brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift); } break; + + default: + break; } } @@ -1696,7 +2397,8 @@ if (p->next_insn_offset & sizeof(brw_compact_inst)) { brw_compact_inst *align = store + offset; memset(align, 0, sizeof(*align)); - brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP); + brw_compact_inst_set_hw_opcode( + devinfo, align, brw_opcode_encode(devinfo, BRW_OPCODE_NOP)); brw_compact_inst_set_cmpt_control(devinfo, align, true); p->next_insn_offset += sizeof(brw_compact_inst); } diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu.cpp mesa-20.0.8/src/intel/compiler/brw_eu.cpp --- mesa-19.2.8/src/intel/compiler/brw_eu.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,633 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include +#include + +#include "brw_eu_defines.h" +#include "brw_eu.h" +#include "brw_shader.h" +#include "brw_gen_enum.h" +#include "dev/gen_debug.h" + +#include "util/ralloc.h" + +/* Returns a conditional modifier that negates the condition. */ +enum brw_conditional_mod +brw_negate_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + return BRW_CONDITIONAL_NZ; + case BRW_CONDITIONAL_NZ: + return BRW_CONDITIONAL_Z; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_GE; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_G; + default: + unreachable("Can't negate this cmod"); + } +} + +/* Returns the corresponding conditional mod for swapping src0 and + * src1 in e.g. CMP. + */ +enum brw_conditional_mod +brw_swap_cmod(enum brw_conditional_mod cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + case BRW_CONDITIONAL_NZ: + return cmod; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_G; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_GE; + default: + return BRW_CONDITIONAL_NONE; + } +} + +/** + * Get the least significant bit offset of the i+1-th component of immediate + * type \p type. For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. + */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. + */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + +unsigned +brw_get_default_exec_size(struct brw_codegen *p) +{ + return p->current->exec_size; +} + +unsigned +brw_get_default_group(struct brw_codegen *p) +{ + return p->current->group; +} + +unsigned +brw_get_default_access_mode(struct brw_codegen *p) +{ + return p->current->access_mode; +} + +tgl_swsb +brw_get_default_swsb(struct brw_codegen *p) +{ + return p->current->swsb; +} + +void +brw_set_default_exec_size(struct brw_codegen *p, unsigned value) +{ + p->current->exec_size = value; +} + +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc) +{ + p->current->predicate = pc; +} + +void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse) +{ + p->current->pred_inv = predicate_inverse; +} + +void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg) +{ + assert(subreg < 2); + p->current->flag_subreg = reg * 2 + subreg; +} + +void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode ) +{ + p->current->access_mode = access_mode; +} + +void +brw_set_default_compression_control(struct brw_codegen *p, + enum brw_compression compression_control) +{ + switch (compression_control) { + case BRW_COMPRESSION_NONE: + /* This is the "use the first set of bits of dmask/vmask/arf + * according to execsize" option. + */ + p->current->group = 0; + break; + case BRW_COMPRESSION_2NDHALF: + /* For SIMD8, this is "use the second set of 8 bits." */ + p->current->group = 8; + break; + case BRW_COMPRESSION_COMPRESSED: + /* For SIMD16 instruction compression, use the first set of 16 bits + * since we don't do SIMD32 dispatch. + */ + p->current->group = 0; + break; + default: + unreachable("not reached"); + } + + if (p->devinfo->gen <= 6) { + p->current->compressed = + (compression_control == BRW_COMPRESSION_COMPRESSED); + } +} + +/** + * Enable or disable instruction compression on the given instruction leaving + * the currently selected channel enable group untouched. + */ +void +brw_inst_set_compression(const struct gen_device_info *devinfo, + brw_inst *inst, bool on) +{ + if (devinfo->gen >= 6) { + /* No-op, the EU will figure out for us whether the instruction needs to + * be compressed. + */ + } else { + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for uncompressed instructions and we + * may need to preserve the current one to avoid changing the selected + * channel group inadvertently. + */ + if (on) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED); + else if (brw_inst_qtr_control(devinfo, inst) + == BRW_COMPRESSION_COMPRESSED) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_compression(struct brw_codegen *p, bool on) +{ + p->current->compressed = on; +} + +/** + * Apply the range of channel enable signals given by + * [group, group + exec_size) to the instruction passed as argument. + */ +void +brw_inst_set_group(const struct gen_device_info *devinfo, + brw_inst *inst, unsigned group) +{ + if (devinfo->gen >= 7) { + assert(group % 4 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + brw_inst_set_nib_control(devinfo, inst, (group / 4) % 2); + + } else if (devinfo->gen == 6) { + assert(group % 8 == 0 && group < 32); + brw_inst_set_qtr_control(devinfo, inst, group / 8); + + } else { + assert(group % 8 == 0 && group < 16); + /* The channel group and compression controls are non-orthogonal, there + * are two possible representations for group zero and we may need to + * preserve the current one to avoid changing the selected compression + * enable inadvertently. + */ + if (group == 8) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF); + else if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF) + brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); + } +} + +void +brw_set_default_group(struct brw_codegen *p, unsigned group) +{ + p->current->group = group; +} + +void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ) +{ + p->current->mask_control = value; +} + +void brw_set_default_saturate( struct brw_codegen *p, bool enable ) +{ + p->current->saturate = enable; +} + +void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value) +{ + p->current->acc_wr_control = value; +} + +void brw_set_default_swsb(struct brw_codegen *p, tgl_swsb value) +{ + p->current->swsb = value; +} + +void brw_push_insn_state( struct brw_codegen *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + *(p->current + 1) = *p->current; + p->current++; +} + +void brw_pop_insn_state( struct brw_codegen *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + +/*********************************************************************** + */ +void +brw_init_codegen(const struct gen_device_info *devinfo, + struct brw_codegen *p, void *mem_ctx) +{ + memset(p, 0, sizeof(*p)); + + p->devinfo = devinfo; + p->automatic_exec_sizes = true; + /* + * Set the initial instruction store array size to 1024, if found that + * isn't enough, then it will double the store size at brw_next_insn() + * until out of memory. + */ + p->store_size = 1024; + p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size); + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + p->mem_ctx = mem_ctx; + + /* Some defaults? + */ + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_default_saturate(p, 0); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + /* Set up control flow stack */ + p->if_stack_depth = 0; + p->if_stack_array_size = 16; + p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size); + + p->loop_stack_depth = 0; + p->loop_stack_array_size = 16; + p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); + p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size); +} + + +const unsigned *brw_get_program( struct brw_codegen *p, + unsigned *sz ) +{ + *sz = p->next_insn_offset; + return (const unsigned *)p->store; +} + +bool brw_try_override_assembly(struct brw_codegen *p, int start_offset, + const char *identifier) +{ + const char *read_path = getenv("INTEL_SHADER_ASM_READ_PATH"); + if (!read_path) { + return false; + } + + char *name = ralloc_asprintf(NULL, "%s/%s.bin", read_path, identifier); + + int fd = open(name, O_RDONLY); + ralloc_free(name); + + if (fd == -1) { + return false; + } + + struct stat sb; + if (fstat(fd, &sb) != 0 || (!S_ISREG(sb.st_mode))) { + close(fd); + return false; + } + + p->nr_insn -= (p->next_insn_offset - start_offset) / sizeof(brw_inst); + p->nr_insn += sb.st_size / sizeof(brw_inst); + + p->next_insn_offset = start_offset + sb.st_size; + p->store_size = (start_offset + sb.st_size) / sizeof(brw_inst); + p->store = (brw_inst *)reralloc_size(p->mem_ctx, p->store, p->next_insn_offset); + assert(p->store); + + ssize_t ret = read(fd, p->store + start_offset, sb.st_size); + close(fd); + if (ret != sb.st_size) { + return false; + } + + ASSERTED bool valid = + brw_validate_instructions(p->devinfo, p->store, + start_offset, p->next_insn_offset, + NULL); + assert(valid); + + return true; +} + +void +brw_disassemble(const struct gen_device_info *devinfo, + const void *assembly, int start, int end, FILE *out) +{ + bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0; + + for (int offset = start; offset < end;) { + const brw_inst *insn = (const brw_inst *)((char *)assembly + offset); + brw_inst uncompacted; + bool compacted = brw_inst_cmpt_control(devinfo, insn); + if (0) + fprintf(out, "0x%08x: ", offset); + + if (compacted) { + brw_compact_inst *compacted = (brw_compact_inst *)insn; + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + const unsigned int blank_spaces = 24; + for (int i = 0 ; i < 8; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 3]); + } + /* Make compacted instructions hex value output vertically aligned + * with uncompacted instructions hex value + */ + fprintf(out, "%*c", blank_spaces, ' '); + } + + brw_uncompact_instruction(devinfo, &uncompacted, compacted); + insn = &uncompacted; + offset += 8; + } else { + if (dump_hex) { + unsigned char * insn_ptr = ((unsigned char *)&insn[0]); + for (int i = 0 ; i < 16; i = i + 4) { + fprintf(out, "%02x %02x %02x %02x ", + insn_ptr[i], + insn_ptr[i + 1], + insn_ptr[i + 2], + insn_ptr[i + 3]); + } + } + offset += 16; + } + + brw_disassemble_inst(out, devinfo, insn, compacted); + } +} + +static const struct opcode_desc opcode_descs[] = { + /* IR, HW, name, nsrc, ndst, gens */ + { BRW_OPCODE_ILLEGAL, 0, "illegal", 0, 0, GEN_ALL }, + { BRW_OPCODE_SYNC, 1, "sync", 1, 0, GEN_GE(GEN12) }, + { BRW_OPCODE_MOV, 1, "mov", 1, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_MOV, 97, "mov", 1, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_SEL, 2, "sel", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_SEL, 98, "sel", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_MOVI, 3, "movi", 2, 1, GEN_GE(GEN45) & GEN_LT(GEN12) }, + { BRW_OPCODE_MOVI, 99, "movi", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_NOT, 4, "not", 1, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_NOT, 100, "not", 1, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_AND, 5, "and", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_AND, 101, "and", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_OR, 6, "or", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_OR, 102, "or", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_XOR, 7, "xor", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_XOR, 103, "xor", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_SHR, 8, "shr", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_SHR, 104, "shr", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_SHL, 9, "shl", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_SHL, 105, "shl", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_DIM, 10, "dim", 1, 1, GEN75 }, + { BRW_OPCODE_SMOV, 10, "smov", 0, 0, GEN_GE(GEN8) & GEN_LT(GEN12) }, + { BRW_OPCODE_SMOV, 106, "smov", 0, 0, GEN_GE(GEN12) }, + { BRW_OPCODE_ASR, 12, "asr", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_ASR, 108, "asr", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_ROR, 14, "ror", 2, 1, GEN11 }, + { BRW_OPCODE_ROR, 110, "ror", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_ROL, 15, "rol", 2, 1, GEN11 }, + { BRW_OPCODE_ROL, 111, "rol", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_CMP, 16, "cmp", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_CMP, 112, "cmp", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_CMPN, 17, "cmpn", 2, 1, GEN_LT(GEN12) }, + { BRW_OPCODE_CMPN, 113, "cmpn", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_CSEL, 18, "csel", 3, 1, GEN_GE(GEN8) & GEN_LT(GEN12) }, + { BRW_OPCODE_CSEL, 114, "csel", 3, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_F32TO16, 19, "f32to16", 1, 1, GEN7 | GEN75 }, + { BRW_OPCODE_F16TO32, 20, "f16to32", 1, 1, GEN7 | GEN75 }, + { BRW_OPCODE_BFREV, 23, "bfrev", 1, 1, GEN_GE(GEN7) & GEN_LT(GEN12) }, + { BRW_OPCODE_BFREV, 119, "bfrev", 1, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_BFE, 24, "bfe", 3, 1, GEN_GE(GEN7) & GEN_LT(GEN12) }, + { BRW_OPCODE_BFE, 120, "bfe", 3, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_BFI1, 25, "bfi1", 2, 1, GEN_GE(GEN7) & GEN_LT(GEN12) }, + { BRW_OPCODE_BFI1, 121, "bfi1", 2, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_BFI2, 26, "bfi2", 3, 1, GEN_GE(GEN7) & GEN_LT(GEN12) }, + { BRW_OPCODE_BFI2, 122, "bfi2", 3, 1, GEN_GE(GEN12) }, + { BRW_OPCODE_JMPI, 32, "jmpi", 0, 0, GEN_ALL }, + { BRW_OPCODE_BRD, 33, "brd", 0, 0, GEN_GE(GEN7) }, + { BRW_OPCODE_IF, 34, "if", 0, 0, GEN_ALL }, + { BRW_OPCODE_IFF, 35, "iff", 0, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_BRC, 35, "brc", 0, 0, GEN_GE(GEN7) }, + { BRW_OPCODE_ELSE, 36, "else", 0, 0, GEN_ALL }, + { BRW_OPCODE_ENDIF, 37, "endif", 0, 0, GEN_ALL }, + { BRW_OPCODE_DO, 38, "do", 0, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_CASE, 38, "case", 0, 0, GEN6 }, + { BRW_OPCODE_WHILE, 39, "while", 0, 0, GEN_ALL }, + { BRW_OPCODE_BREAK, 40, "break", 0, 0, GEN_ALL }, + { BRW_OPCODE_CONTINUE, 41, "cont", 0, 0, GEN_ALL }, + { BRW_OPCODE_HALT, 42, "halt", 0, 0, GEN_ALL }, + { BRW_OPCODE_CALLA, 43, "calla", 0, 0, GEN_GE(GEN75) }, + { BRW_OPCODE_MSAVE, 44, "msave", 0, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_CALL, 44, "call", 0, 0, GEN_GE(GEN6) }, + { BRW_OPCODE_MREST, 45, "mrest", 0, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_RET, 45, "ret", 0, 0, GEN_GE(GEN6) }, + { BRW_OPCODE_PUSH, 46, "push", 0, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_FORK, 46, "fork", 0, 0, GEN6 }, + { BRW_OPCODE_GOTO, 46, "goto", 0, 0, GEN_GE(GEN8) }, + { BRW_OPCODE_POP, 47, "pop", 2, 0, GEN_LE(GEN5) }, + { BRW_OPCODE_WAIT, 48, "wait", 1, 0, GEN_LT(GEN12) }, + { BRW_OPCODE_SEND, 49, "send", 1, 1, GEN_ALL }, + { BRW_OPCODE_SENDC, 50, "sendc", 1, 1, GEN_ALL }, + { BRW_OPCODE_SENDS, 51, "sends", 2, 1, GEN_GE(GEN9) & GEN_LT(GEN12) }, + { BRW_OPCODE_SENDSC, 52, "sendsc", 2, 1, GEN_GE(GEN9) & GEN_LT(GEN12) }, + { BRW_OPCODE_MATH, 56, "math", 2, 1, GEN_GE(GEN6) }, + { BRW_OPCODE_ADD, 64, "add", 2, 1, GEN_ALL }, + { BRW_OPCODE_MUL, 65, "mul", 2, 1, GEN_ALL }, + { BRW_OPCODE_AVG, 66, "avg", 2, 1, GEN_ALL }, + { BRW_OPCODE_FRC, 67, "frc", 1, 1, GEN_ALL }, + { BRW_OPCODE_RNDU, 68, "rndu", 1, 1, GEN_ALL }, + { BRW_OPCODE_RNDD, 69, "rndd", 1, 1, GEN_ALL }, + { BRW_OPCODE_RNDE, 70, "rnde", 1, 1, GEN_ALL }, + { BRW_OPCODE_RNDZ, 71, "rndz", 1, 1, GEN_ALL }, + { BRW_OPCODE_MAC, 72, "mac", 2, 1, GEN_ALL }, + { BRW_OPCODE_MACH, 73, "mach", 2, 1, GEN_ALL }, + { BRW_OPCODE_LZD, 74, "lzd", 1, 1, GEN_ALL }, + { BRW_OPCODE_FBH, 75, "fbh", 1, 1, GEN_GE(GEN7) }, + { BRW_OPCODE_FBL, 76, "fbl", 1, 1, GEN_GE(GEN7) }, + { BRW_OPCODE_CBIT, 77, "cbit", 1, 1, GEN_GE(GEN7) }, + { BRW_OPCODE_ADDC, 78, "addc", 2, 1, GEN_GE(GEN7) }, + { BRW_OPCODE_SUBB, 79, "subb", 2, 1, GEN_GE(GEN7) }, + { BRW_OPCODE_SAD2, 80, "sad2", 2, 1, GEN_ALL }, + { BRW_OPCODE_SADA2, 81, "sada2", 2, 1, GEN_ALL }, + { BRW_OPCODE_DP4, 84, "dp4", 2, 1, GEN_LT(GEN11) }, + { BRW_OPCODE_DPH, 85, "dph", 2, 1, GEN_LT(GEN11) }, + { BRW_OPCODE_DP3, 86, "dp3", 2, 1, GEN_LT(GEN11) }, + { BRW_OPCODE_DP2, 87, "dp2", 2, 1, GEN_LT(GEN11) }, + { BRW_OPCODE_LINE, 89, "line", 2, 1, GEN_LE(GEN10) }, + { BRW_OPCODE_PLN, 90, "pln", 2, 1, GEN_GE(GEN45) & GEN_LE(GEN10) }, + { BRW_OPCODE_MAD, 91, "mad", 3, 1, GEN_GE(GEN6) }, + { BRW_OPCODE_LRP, 92, "lrp", 3, 1, GEN_GE(GEN6) & GEN_LE(GEN10) }, + { BRW_OPCODE_MADM, 93, "madm", 3, 1, GEN_GE(GEN8) }, + { BRW_OPCODE_NENOP, 125, "nenop", 0, 0, GEN45 }, + { BRW_OPCODE_NOP, 126, "nop", 0, 0, GEN_LT(GEN12) }, + { BRW_OPCODE_NOP, 96, "nop", 0, 0, GEN_GE(GEN12) } +}; + +/** + * Look up the opcode_descs[] entry with \p key member matching \p k which is + * supported by the device specified by \p devinfo, or NULL if there is no + * matching entry. + * + * This is implemented by using an index data structure (storage for which is + * provided by the caller as \p index_gen and \p index_descs) in order to + * provide efficient constant-time look-up. + */ +static const opcode_desc * +lookup_opcode_desc(gen *index_gen, + const opcode_desc **index_descs, + unsigned index_size, + unsigned opcode_desc::*key, + const gen_device_info *devinfo, + unsigned k) +{ + if (*index_gen != gen_from_devinfo(devinfo)) { + *index_gen = gen_from_devinfo(devinfo); + + for (unsigned l = 0; l < index_size; l++) + index_descs[l] = NULL; + + for (unsigned i = 0; i < ARRAY_SIZE(opcode_descs); i++) { + if (opcode_descs[i].gens & *index_gen) { + const unsigned l = opcode_descs[i].*key; + assert(l < index_size && !index_descs[l]); + index_descs[l] = &opcode_descs[i]; + } + } + } + + if (k < index_size) + return index_descs[k]; + else + return NULL; +} + +/** + * Return the matching opcode_desc for the specified IR opcode and hardware + * generation, or NULL if the opcode is not supported by the device. + */ +const struct opcode_desc * +brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode) +{ + static __thread gen index_gen = {}; + static __thread const opcode_desc *index_descs[NUM_BRW_OPCODES]; + return lookup_opcode_desc(&index_gen, index_descs, ARRAY_SIZE(index_descs), + &opcode_desc::ir, devinfo, opcode); +} + +/** + * Return the matching opcode_desc for the specified HW opcode and hardware + * generation, or NULL if the opcode is not supported by the device. + */ +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct gen_device_info *devinfo, unsigned hw) +{ + static __thread gen index_gen = {}; + static __thread const opcode_desc *index_descs[128]; + return lookup_opcode_desc(&index_gen, index_descs, ARRAY_SIZE(index_descs), + &opcode_desc::hw, devinfo, hw); +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu_defines.h mesa-20.0.8/src/intel/compiler/brw_eu_defines.h --- mesa-19.2.8/src/intel/compiler/brw_eu_defines.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu_defines.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,6 +32,7 @@ #ifndef BRW_EU_DEFINES_H #define BRW_EU_DEFINES_H +#include #include "util/macros.h" /* The following hunk, up-to "Execution Unit" is used by both the @@ -195,103 +196,96 @@ /** @} */ enum opcode { - /* These are the actual hardware opcodes. */ - BRW_OPCODE_ILLEGAL = 0, - BRW_OPCODE_MOV = 1, - BRW_OPCODE_SEL = 2, - BRW_OPCODE_MOVI = 3, /**< G45+ */ - BRW_OPCODE_NOT = 4, - BRW_OPCODE_AND = 5, - BRW_OPCODE_OR = 6, - BRW_OPCODE_XOR = 7, - BRW_OPCODE_SHR = 8, - BRW_OPCODE_SHL = 9, - BRW_OPCODE_DIM = 10, /**< Gen7.5 only */ /* Reused */ - BRW_OPCODE_SMOV = 10, /**< Gen8+ */ /* Reused */ - /* Reserved - 11 */ - BRW_OPCODE_ASR = 12, - /* Reserved - 13 */ - BRW_OPCODE_ROR = 14, /**< Gen11+ */ - BRW_OPCODE_ROL = 15, /**< Gen11+ */ - BRW_OPCODE_CMP = 16, - BRW_OPCODE_CMPN = 17, - BRW_OPCODE_CSEL = 18, /**< Gen8+ */ - BRW_OPCODE_F32TO16 = 19, /**< Gen7 only */ - BRW_OPCODE_F16TO32 = 20, /**< Gen7 only */ - /* Reserved - 21-22 */ - BRW_OPCODE_BFREV = 23, /**< Gen7+ */ - BRW_OPCODE_BFE = 24, /**< Gen7+ */ - BRW_OPCODE_BFI1 = 25, /**< Gen7+ */ - BRW_OPCODE_BFI2 = 26, /**< Gen7+ */ - /* Reserved - 27-31 */ - BRW_OPCODE_JMPI = 32, - BRW_OPCODE_BRD = 33, /**< Gen7+ */ - BRW_OPCODE_IF = 34, - BRW_OPCODE_IFF = 35, /**< Pre-Gen6 */ /* Reused */ - BRW_OPCODE_BRC = 35, /**< Gen7+ */ /* Reused */ - BRW_OPCODE_ELSE = 36, - BRW_OPCODE_ENDIF = 37, - BRW_OPCODE_DO = 38, /**< Pre-Gen6 */ /* Reused */ - BRW_OPCODE_CASE = 38, /**< Gen6 only */ /* Reused */ - BRW_OPCODE_WHILE = 39, - BRW_OPCODE_BREAK = 40, - BRW_OPCODE_CONTINUE = 41, - BRW_OPCODE_HALT = 42, - BRW_OPCODE_CALLA = 43, /**< Gen7.5+ */ - BRW_OPCODE_MSAVE = 44, /**< Pre-Gen6 */ /* Reused */ - BRW_OPCODE_CALL = 44, /**< Gen6+ */ /* Reused */ - BRW_OPCODE_MREST = 45, /**< Pre-Gen6 */ /* Reused */ - BRW_OPCODE_RET = 45, /**< Gen6+ */ /* Reused */ - BRW_OPCODE_PUSH = 46, /**< Pre-Gen6 */ /* Reused */ - BRW_OPCODE_FORK = 46, /**< Gen6 only */ /* Reused */ - BRW_OPCODE_GOTO = 46, /**< Gen8+ */ /* Reused */ - BRW_OPCODE_POP = 47, /**< Pre-Gen6 */ - BRW_OPCODE_WAIT = 48, - BRW_OPCODE_SEND = 49, - BRW_OPCODE_SENDC = 50, - BRW_OPCODE_SENDS = 51, /**< Gen9+ */ - BRW_OPCODE_SENDSC = 52, /**< Gen9+ */ - /* Reserved 53-55 */ - BRW_OPCODE_MATH = 56, /**< Gen6+ */ - /* Reserved 57-63 */ - BRW_OPCODE_ADD = 64, - BRW_OPCODE_MUL = 65, - BRW_OPCODE_AVG = 66, - BRW_OPCODE_FRC = 67, - BRW_OPCODE_RNDU = 68, - BRW_OPCODE_RNDD = 69, - BRW_OPCODE_RNDE = 70, - BRW_OPCODE_RNDZ = 71, - BRW_OPCODE_MAC = 72, - BRW_OPCODE_MACH = 73, - BRW_OPCODE_LZD = 74, - BRW_OPCODE_FBH = 75, /**< Gen7+ */ - BRW_OPCODE_FBL = 76, /**< Gen7+ */ - BRW_OPCODE_CBIT = 77, /**< Gen7+ */ - BRW_OPCODE_ADDC = 78, /**< Gen7+ */ - BRW_OPCODE_SUBB = 79, /**< Gen7+ */ - BRW_OPCODE_SAD2 = 80, - BRW_OPCODE_SADA2 = 81, - /* Reserved 82-83 */ - BRW_OPCODE_DP4 = 84, - BRW_OPCODE_DPH = 85, - BRW_OPCODE_DP3 = 86, - BRW_OPCODE_DP2 = 87, - /* Reserved 88 */ - BRW_OPCODE_LINE = 89, - BRW_OPCODE_PLN = 90, /**< G45+ */ - BRW_OPCODE_MAD = 91, /**< Gen6+ */ - BRW_OPCODE_LRP = 92, /**< Gen6+ */ - BRW_OPCODE_MADM = 93, /**< Gen8+ */ - /* Reserved 94-124 */ - BRW_OPCODE_NENOP = 125, /**< G45 only */ - BRW_OPCODE_NOP = 126, - /* Reserved 127 */ + /* These are the actual hardware instructions. */ + BRW_OPCODE_ILLEGAL, + BRW_OPCODE_SYNC, + BRW_OPCODE_MOV, + BRW_OPCODE_SEL, + BRW_OPCODE_MOVI, /**< G45+ */ + BRW_OPCODE_NOT, + BRW_OPCODE_AND, + BRW_OPCODE_OR, + BRW_OPCODE_XOR, + BRW_OPCODE_SHR, + BRW_OPCODE_SHL, + BRW_OPCODE_DIM, /**< Gen7.5 only */ + BRW_OPCODE_SMOV, /**< Gen8+ */ + BRW_OPCODE_ASR, + BRW_OPCODE_ROR, /**< Gen11+ */ + BRW_OPCODE_ROL, /**< Gen11+ */ + BRW_OPCODE_CMP, + BRW_OPCODE_CMPN, + BRW_OPCODE_CSEL, /**< Gen8+ */ + BRW_OPCODE_F32TO16, /**< Gen7 only */ + BRW_OPCODE_F16TO32, /**< Gen7 only */ + BRW_OPCODE_BFREV, /**< Gen7+ */ + BRW_OPCODE_BFE, /**< Gen7+ */ + BRW_OPCODE_BFI1, /**< Gen7+ */ + BRW_OPCODE_BFI2, /**< Gen7+ */ + BRW_OPCODE_JMPI, + BRW_OPCODE_BRD, /**< Gen7+ */ + BRW_OPCODE_IF, + BRW_OPCODE_IFF, /**< Pre-Gen6 */ + BRW_OPCODE_BRC, /**< Gen7+ */ + BRW_OPCODE_ELSE, + BRW_OPCODE_ENDIF, + BRW_OPCODE_DO, /**< Pre-Gen6 */ + BRW_OPCODE_CASE, /**< Gen6 only */ + BRW_OPCODE_WHILE, + BRW_OPCODE_BREAK, + BRW_OPCODE_CONTINUE, + BRW_OPCODE_HALT, + BRW_OPCODE_CALLA, /**< Gen7.5+ */ + BRW_OPCODE_MSAVE, /**< Pre-Gen6 */ + BRW_OPCODE_CALL, /**< Gen6+ */ + BRW_OPCODE_MREST, /**< Pre-Gen6 */ + BRW_OPCODE_RET, /**< Gen6+ */ + BRW_OPCODE_PUSH, /**< Pre-Gen6 */ + BRW_OPCODE_FORK, /**< Gen6 only */ + BRW_OPCODE_GOTO, /**< Gen8+ */ + BRW_OPCODE_POP, /**< Pre-Gen6 */ + BRW_OPCODE_WAIT, + BRW_OPCODE_SEND, + BRW_OPCODE_SENDC, + BRW_OPCODE_SENDS, /**< Gen9+ */ + BRW_OPCODE_SENDSC, /**< Gen9+ */ + BRW_OPCODE_MATH, /**< Gen6+ */ + BRW_OPCODE_ADD, + BRW_OPCODE_MUL, + BRW_OPCODE_AVG, + BRW_OPCODE_FRC, + BRW_OPCODE_RNDU, + BRW_OPCODE_RNDD, + BRW_OPCODE_RNDE, + BRW_OPCODE_RNDZ, + BRW_OPCODE_MAC, + BRW_OPCODE_MACH, + BRW_OPCODE_LZD, + BRW_OPCODE_FBH, /**< Gen7+ */ + BRW_OPCODE_FBL, /**< Gen7+ */ + BRW_OPCODE_CBIT, /**< Gen7+ */ + BRW_OPCODE_ADDC, /**< Gen7+ */ + BRW_OPCODE_SUBB, /**< Gen7+ */ + BRW_OPCODE_SAD2, + BRW_OPCODE_SADA2, + BRW_OPCODE_DP4, + BRW_OPCODE_DPH, + BRW_OPCODE_DP3, + BRW_OPCODE_DP2, + BRW_OPCODE_LINE, + BRW_OPCODE_PLN, /**< G45+ */ + BRW_OPCODE_MAD, /**< Gen6+ */ + BRW_OPCODE_LRP, /**< Gen6+ */ + BRW_OPCODE_MADM, /**< Gen8+ */ + BRW_OPCODE_NENOP, /**< G45 only */ + BRW_OPCODE_NOP, + + NUM_BRW_OPCODES, /* These are compiler backend opcodes that get translated into other * instructions. */ - FS_OPCODE_FB_WRITE = 128, + FS_OPCODE_FB_WRITE = NUM_BRW_OPCODES, /** * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as @@ -441,6 +435,7 @@ SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, SHADER_OPCODE_RND_MODE, + SHADER_OPCODE_FLOAT_CONTROL_MODE, /** * Byte scattered write/read opcodes. @@ -451,6 +446,8 @@ */ SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, /** * Memory fence messages. @@ -465,6 +462,11 @@ */ SHADER_OPCODE_MEMORY_FENCE, + /** + * Scheduling-only fence. + */ + FS_OPCODE_SCHEDULING_FENCE, + SHADER_OPCODE_GEN4_SCRATCH_READ, SHADER_OPCODE_GEN4_SCRATCH_WRITE, SHADER_OPCODE_GEN7_SCRATCH_READ, @@ -489,6 +491,12 @@ SHADER_OPCODE_FIND_LIVE_CHANNEL, /** + * Return the current execution mask in the specified flag subregister. + * Can be CSE'ed more easily than a plain MOV from the ce0 ARF register. + */ + FS_OPCODE_LOAD_LIVE_CHANNELS, + + /** * Pick the channel from its first source register given by the index * specified as second source. Useful for variable indexing of surfaces. * @@ -740,6 +748,12 @@ */ SHADER_OPCODE_MULH, + /** Signed subtraction with saturation. */ + SHADER_OPCODE_ISUB_SAT, + + /** Unsigned subtraction with saturation. */ + SHADER_OPCODE_USUB_SAT, + /** * A MOV that uses VxH indirect addressing. * @@ -996,6 +1010,7 @@ enum PACKED gen10_align1_3src_vertical_stride { BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0 = 0, + BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1 = 1, BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2 = 1, BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4 = 2, BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8 = 3, @@ -1010,6 +1025,164 @@ }; /** + * Gen12+ SWSB SBID synchronization mode. + * + * This is represented as a bitmask including any required SBID token + * synchronization modes, used to synchronize out-of-order instructions. Only + * the strongest mode of the mask will be provided to the hardware in the SWSB + * field of an actual hardware instruction, but virtual instructions may be + * able to take into account multiple of them. + */ +enum tgl_sbid_mode { + TGL_SBID_NULL = 0, + TGL_SBID_SRC = 1, + TGL_SBID_DST = 2, + TGL_SBID_SET = 4 +}; + +#ifdef __cplusplus +/** + * Allow bitwise arithmetic of tgl_sbid_mode enums. + */ +inline tgl_sbid_mode +operator|(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) | unsigned(y)); +} + +inline tgl_sbid_mode +operator&(tgl_sbid_mode x, tgl_sbid_mode y) +{ + return tgl_sbid_mode(unsigned(x) & unsigned(y)); +} + +inline tgl_sbid_mode & +operator|=(tgl_sbid_mode &x, tgl_sbid_mode y) +{ + return x = x | y; +} + +#endif + +/** + * Logical representation of the SWSB scheduling information of a hardware + * instruction. The binary representation is slightly more compact. + */ +struct tgl_swsb { + unsigned regdist : 3; + unsigned sbid : 4; + enum tgl_sbid_mode mode : 3; +}; + +/** + * Construct a scheduling annotation with a single RegDist dependency. This + * synchronizes with the completion of the d-th previous in-order instruction. + * The index is one-based, zero causes a no-op tgl_swsb to be constructed. + */ +static inline struct tgl_swsb +tgl_swsb_regdist(unsigned d) +{ + const struct tgl_swsb swsb = { d }; + assert(swsb.regdist == d); + return swsb; +} + +/** + * Construct a scheduling annotation that synchronizes with the specified SBID + * token. + */ +static inline struct tgl_swsb +tgl_swsb_sbid(enum tgl_sbid_mode mode, unsigned sbid) +{ + const struct tgl_swsb swsb = { 0, sbid, mode }; + assert(swsb.sbid == sbid); + return swsb; +} + +/** + * Construct a no-op scheduling annotation. + */ +static inline struct tgl_swsb +tgl_swsb_null(void) +{ + return tgl_swsb_regdist(0); +} + +/** + * Return a scheduling annotation that allocates the same SBID synchronization + * token as \p swsb. In addition it will synchronize against a previous + * in-order instruction if \p regdist is non-zero. + */ +static inline struct tgl_swsb +tgl_swsb_dst_dep(struct tgl_swsb swsb, unsigned regdist) +{ + swsb.regdist = regdist; + swsb.mode = swsb.mode & TGL_SBID_SET; + return swsb; +} + +/** + * Return a scheduling annotation that synchronizes against the same SBID and + * RegDist dependencies as \p swsb, but doesn't allocate any SBID token. + */ +static inline struct tgl_swsb +tgl_swsb_src_dep(struct tgl_swsb swsb) +{ + swsb.mode = swsb.mode & (TGL_SBID_SRC | TGL_SBID_DST); + return swsb; +} + +/** + * Convert the provided tgl_swsb to the hardware's binary representation of an + * SWSB annotation. + */ +static inline uint8_t +tgl_swsb_encode(struct tgl_swsb swsb) +{ + if (!swsb.mode) { + return swsb.regdist; + } else if (swsb.regdist) { + return 0x80 | swsb.regdist << 4 | swsb.sbid; + } else { + return swsb.sbid | (swsb.mode & TGL_SBID_SET ? 0x40 : + swsb.mode & TGL_SBID_DST ? 0x20 : 0x30); + } +} + +/** + * Convert the provided binary representation of an SWSB annotation to a + * tgl_swsb. + */ +static inline struct tgl_swsb +tgl_swsb_decode(enum opcode opcode, uint8_t x) +{ + if (x & 0x80) { + const struct tgl_swsb swsb = { (x & 0x70u) >> 4, x & 0xfu, + (opcode == BRW_OPCODE_SEND || + opcode == BRW_OPCODE_SENDC || + opcode == BRW_OPCODE_MATH) ? + TGL_SBID_SET : TGL_SBID_DST }; + return swsb; + } else if ((x & 0x70) == 0x20) { + return tgl_swsb_sbid(TGL_SBID_DST, x & 0xfu); + } else if ((x & 0x70) == 0x30) { + return tgl_swsb_sbid(TGL_SBID_SRC, x & 0xfu); + } else if ((x & 0x70) == 0x40) { + return tgl_swsb_sbid(TGL_SBID_SET, x & 0xfu); + } else { + return tgl_swsb_regdist(x & 0x7u); + } +} + +enum tgl_sync_function { + TGL_SYNC_NOP = 0x0, + TGL_SYNC_ALLRD = 0x2, + TGL_SYNC_ALLWR = 0x3, + TGL_SYNC_BAR = 0xe, + TGL_SYNC_HOST = 0xf +}; + +/** * Message target: Shared Function ID for where to SEND a message. * * These are enumerated in the ISA reference under "send - Send Message". @@ -1383,6 +1556,15 @@ BRW_RND_MODE_UNSPECIFIED, /* Unspecified rounding mode */ }; +#define BRW_CR0_FP64_DENORM_PRESERVE (1 << 6) +#define BRW_CR0_FP32_DENORM_PRESERVE (1 << 7) +#define BRW_CR0_FP16_DENORM_PRESERVE (1 << 10) + +#define BRW_CR0_FP_MODE_MASK (BRW_CR0_FP64_DENORM_PRESERVE | \ + BRW_CR0_FP32_DENORM_PRESERVE | \ + BRW_CR0_FP16_DENORM_PRESERVE | \ + BRW_CR0_RND_MODE_MASK) + /* MDC_DS - Data Size Message Descriptor Control Field * Skylake PRM, Volume 2d, page 129 * diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu_emit.c mesa-20.0.8/src/intel/compiler/brw_eu_emit.c --- mesa-19.2.8/src/intel/compiler/brw_eu_emit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu_emit.c 2020-06-12 01:21:17.000000000 +0000 @@ -55,6 +55,7 @@ return; if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { + assert(devinfo->gen < 12); brw_push_insn_state(p); brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_mask_control(p, BRW_MASK_DISABLE); @@ -94,21 +95,37 @@ else if (dest.file == BRW_GENERAL_REGISTER_FILE) assert(dest.nr < 128); - /* The hardware has a restriction where if the destination is Byte, - * the instruction needs to have a stride of 2 (except for packed byte - * MOV). This seems to be required even if the destination is the NULL - * register. + /* The hardware has a restriction where a destination of size Byte with + * a stride of 1 is only allowed for a packed byte MOV. For any other + * instruction, the stride must be at least 2, even when the destination + * is the NULL register. */ if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && dest.nr == BRW_ARF_NULL && - type_sz(dest.type) == 1) { + type_sz(dest.type) == 1 && + dest.hstride == BRW_HORIZONTAL_STRIDE_1) { dest.hstride = BRW_HORIZONTAL_STRIDE_2; } gen7_convert_mrf_to_grf(p, &dest); - if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { + if (devinfo->gen >= 12 && + (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { + assert(dest.file == BRW_GENERAL_REGISTER_FILE || + dest.file == BRW_ARCHITECTURE_REGISTER_FILE); + assert(dest.address_mode == BRW_ADDRESS_DIRECT); + assert(dest.subnr == 0); + assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || + (dest.hstride == BRW_HORIZONTAL_STRIDE_1 && + dest.vstride == dest.width + 1)); + assert(!dest.negate && !dest.abs); + brw_inst_set_dst_reg_file(devinfo, inst, dest.file); + brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); + + } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { + assert(devinfo->gen < 12); assert(dest.file == BRW_GENERAL_REGISTER_FILE || dest.file == BRW_ARCHITECTURE_REGISTER_FILE); assert(dest.address_mode == BRW_ADDRESS_DIRECT); @@ -214,8 +231,21 @@ assert(reg.address_mode == BRW_ADDRESS_DIRECT); } - if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { + if (devinfo->gen >= 12 && + (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { + assert(reg.file != BRW_IMMEDIATE_VALUE); + assert(reg.address_mode == BRW_ADDRESS_DIRECT); + assert(reg.subnr == 0); + assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); + assert(!reg.negate && !reg.abs); + brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file); + brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); + + } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { assert(reg.file == BRW_GENERAL_REGISTER_FILE); assert(reg.address_mode == BRW_ADDRESS_DIRECT); assert(reg.subnr % 16 == 0); @@ -240,7 +270,7 @@ else brw_inst_set_imm_ud(devinfo, inst, reg.ud); - if (type_sz(reg.type) < 8) { + if (devinfo->gen < 12 && type_sz(reg.type) < 8) { brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE); brw_inst_set_src1_reg_hw_type(devinfo, inst, @@ -319,13 +349,17 @@ assert(reg.nr < 128); if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC || + (devinfo->gen >= 12 && + (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) { assert(reg.file == BRW_GENERAL_REGISTER_FILE || reg.file == BRW_ARCHITECTURE_REGISTER_FILE); assert(reg.address_mode == BRW_ADDRESS_DIRECT); assert(reg.subnr == 0); - assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 && - reg.vstride == reg.width + 1); + assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || + (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && + reg.vstride == reg.width + 1)); assert(!reg.negate && !reg.abs); brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr); brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file); @@ -423,8 +457,9 @@ const struct gen_device_info *devinfo = p->devinfo; assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC); - brw_inst_set_src1_file_type(devinfo, inst, - BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD); + if (devinfo->gen < 12) + brw_inst_set_src1_file_type(devinfo, inst, + BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD); brw_inst_set_send_desc(devinfo, inst, desc); if (devinfo->gen >= 9) brw_inst_set_send_ex_desc(devinfo, inst, ex_desc); @@ -583,6 +618,8 @@ brw_inst_set_compression(devinfo, insn, state->compressed); brw_inst_set_access_mode(devinfo, insn, state->access_mode); brw_inst_set_mask_control(devinfo, insn, state->mask_control); + if (devinfo->gen >= 12) + brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb)); brw_inst_set_saturate(devinfo, insn, state->saturate); brw_inst_set_pred_control(devinfo, insn, state->predicate); brw_inst_set_pred_inv(devinfo, insn, state->pred_inv); @@ -662,12 +699,17 @@ } static enum gen10_align1_3src_vertical_stride -to_3src_align1_vstride(enum brw_vertical_stride vstride) +to_3src_align1_vstride(const struct gen_device_info *devinfo, + enum brw_vertical_stride vstride) { switch (vstride) { case BRW_VERTICAL_STRIDE_0: return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0; + case BRW_VERTICAL_STRIDE_1: + assert(devinfo->gen >= 12); + return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1; case BRW_VERTICAL_STRIDE_2: + assert(devinfo->gen < 12); return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2; case BRW_VERTICAL_STRIDE_4: return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4; @@ -707,6 +749,11 @@ gen7_convert_mrf_to_grf(p, &dest); assert(dest.nr < 128); + + if (devinfo->gen >= 10) + assert(!(src0.file == BRW_IMMEDIATE_VALUE && + src2.file == BRW_IMMEDIATE_VALUE)); + assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128); assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128); assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128); @@ -719,14 +766,19 @@ assert(dest.file == BRW_GENERAL_REGISTER_FILE || dest.file == BRW_ARCHITECTURE_REGISTER_FILE); - if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) { - brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, - BRW_ALIGN1_3SRC_ACCUMULATOR); - brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); - } else { - brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE); + if (devinfo->gen >= 12) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file); brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + } else { + if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE); + brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); + } } brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8); @@ -745,27 +797,26 @@ brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type); brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type); - brw_inst_set_3src_a1_src0_vstride(devinfo, inst, - to_3src_align1_vstride(src0.vstride)); - brw_inst_set_3src_a1_src1_vstride(devinfo, inst, - to_3src_align1_vstride(src1.vstride)); - /* no vstride on src2 */ - - brw_inst_set_3src_a1_src0_hstride(devinfo, inst, - to_3src_align1_hstride(src0.hstride)); - brw_inst_set_3src_a1_src1_hstride(devinfo, inst, - to_3src_align1_hstride(src1.hstride)); - brw_inst_set_3src_a1_src2_hstride(devinfo, inst, - to_3src_align1_hstride(src2.hstride)); - - brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr); - if (src0.type == BRW_REGISTER_TYPE_NF) { - brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud); } else { - brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); + brw_inst_set_3src_a1_src0_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride)); + brw_inst_set_3src_a1_src0_hstride(devinfo, inst, + to_3src_align1_hstride(src0.hstride)); + brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr); + if (src0.type == BRW_REGISTER_TYPE_NF) { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR); + } else { + brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); + } + brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); + brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); } - brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); - brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate); + brw_inst_set_3src_a1_src1_vstride( + devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride)); + brw_inst_set_3src_a1_src1_hstride(devinfo, inst, + to_3src_align1_hstride(src1.hstride)); brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr); if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) { @@ -776,10 +827,17 @@ brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate); - brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr); - brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); - brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); - brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + if (src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud); + } else { + brw_inst_set_3src_a1_src2_hstride(devinfo, inst, + to_3src_align1_hstride(src2.hstride)); + /* no vstride on src2 */ + brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr); + brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); + brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); + brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate); + } assert(src0.file == BRW_GENERAL_REGISTER_FILE || src0.file == BRW_IMMEDIATE_VALUE || @@ -790,18 +848,35 @@ assert(src2.file == BRW_GENERAL_REGISTER_FILE || src2.file == BRW_IMMEDIATE_VALUE); - brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, - src0.file == BRW_GENERAL_REGISTER_FILE ? - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : - BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); - brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, - src1.file == BRW_GENERAL_REGISTER_FILE ? - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : - BRW_ALIGN1_3SRC_ACCUMULATOR); - brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, - src2.file == BRW_GENERAL_REGISTER_FILE ? - BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : - BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + if (devinfo->gen >= 12) { + if (src0.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file); + } + + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file); + + if (src2.file == BRW_IMMEDIATE_VALUE) { + brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1); + } else { + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file); + } + } else { + brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, + src0.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, + src1.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_ACCUMULATOR); + brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, + src2.file == BRW_GENERAL_REGISTER_FILE ? + BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE : + BRW_ALIGN1_3SRC_IMMEDIATE_VALUE); + } + } else { assert(dest.file == BRW_GENERAL_REGISTER_FILE || dest.file == BRW_MESSAGE_REGISTER_FILE); @@ -945,33 +1020,6 @@ return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ } -/* Rounding operations (other than RNDD) require two instructions - the first - * stores a rounded value (possibly the wrong way) in the dest register, but - * also sets a per-channel "increment bit" in the flag register. A predicated - * add of 1.0 fixes dest to contain the desired result. - * - * Sandybridge and later appear to round correctly without an ADD. - */ -#define ROUND(OP) \ -void brw_##OP(struct brw_codegen *p, \ - struct brw_reg dest, \ - struct brw_reg src) \ -{ \ - const struct gen_device_info *devinfo = p->devinfo; \ - brw_inst *rnd, *add; \ - rnd = next_insn(p, BRW_OPCODE_##OP); \ - brw_set_dest(p, rnd, dest); \ - brw_set_src0(p, rnd, src); \ - \ - if (devinfo->gen < 6) { \ - /* turn on round-increments */ \ - brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \ - add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \ - brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \ - } \ -} - - ALU2(SEL) ALU1(NOT) ALU2(AND) @@ -986,6 +1034,8 @@ ALU3(CSEL) ALU1(FRC) ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDZ) ALU2(MAC) ALU2(MACH) ALU1(LZD) @@ -1005,9 +1055,6 @@ ALU2(ADDC) ALU2(SUBB) -ROUND(RNDZ) -ROUND(RNDE) - brw_inst * brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) { @@ -1171,9 +1218,12 @@ } if (needs_zero_fill) { - brw_inst_set_no_dd_clear(devinfo, inst, true); + if (devinfo->gen < 12) + brw_inst_set_no_dd_clear(devinfo, inst, true); + brw_set_default_swsb(p, tgl_swsb_null()); inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0)); - brw_inst_set_no_dd_check(devinfo, inst, true); + if (devinfo->gen < 12) + brw_inst_set_no_dd_check(devinfo, inst, true); } brw_pop_insn_state(p); @@ -1219,9 +1269,11 @@ brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP); } - - - +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func) +{ + brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC); + brw_inst_set_cond_modifier(p->devinfo, insn, func); +} /*********************************************************************** * Comparisons, if/else/endif @@ -1325,7 +1377,8 @@ brw_inst_set_uip(devinfo, insn, 0); } else { brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); - brw_set_src0(p, insn, brw_imm_d(0)); + if (devinfo->gen < 12) + brw_set_src0(p, insn, brw_imm_d(0)); brw_inst_set_jip(devinfo, insn, 0); brw_inst_set_uip(devinfo, insn, 0); } @@ -1525,7 +1578,8 @@ brw_inst_set_uip(devinfo, insn, 0); } else { brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); - brw_set_src0(p, insn, brw_imm_d(0)); + if (devinfo->gen < 12) + brw_set_src0(p, insn, brw_imm_d(0)); brw_inst_set_jip(devinfo, insn, 0); brw_inst_set_uip(devinfo, insn, 0); } @@ -1678,11 +1732,11 @@ insn = next_insn(p, BRW_OPCODE_HALT); brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); - if (devinfo->gen >= 8) { - brw_set_src0(p, insn, brw_imm_d(0x0)); - } else { + if (devinfo->gen < 8) { brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ + } else if (devinfo->gen < 12) { + brw_set_src0(p, insn, brw_imm_d(0x0)); } brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); @@ -1778,7 +1832,8 @@ if (devinfo->gen >= 8) { brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); - brw_set_src0(p, insn, brw_imm_d(0)); + if (devinfo->gen < 12) + brw_set_src0(p, insn, brw_imm_d(0)); brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); } else if (devinfo->gen == 7) { brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); @@ -1998,6 +2053,7 @@ (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE : BRW_SFID_DATAPORT_WRITE); + const struct tgl_swsb swsb = brw_get_default_swsb(p); uint32_t msg_type; if (devinfo->gen >= 6) @@ -2017,11 +2073,13 @@ brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); /* set message header global offset field (reg 0, element 2) */ brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, @@ -2029,6 +2087,7 @@ brw_imm_ud(offset)); brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } { @@ -2103,6 +2162,7 @@ unsigned offset) { const struct gen_device_info *devinfo = p->devinfo; + const struct tgl_swsb swsb = brw_get_default_swsb(p); if (devinfo->gen >= 6) offset /= 16; @@ -2129,6 +2189,7 @@ { brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); brw_set_default_mask_control(p, BRW_MASK_DISABLE); @@ -2137,9 +2198,11 @@ /* set message header global offset field (reg 0, element 2) */ brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset)); brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } { @@ -2216,6 +2279,7 @@ (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE : BRW_SFID_DATAPORT_READ); const unsigned exec_size = 1 << brw_get_default_exec_size(p); + const struct tgl_swsb swsb = brw_get_default_swsb(p); /* On newer hardware, offset is in units of owords. */ if (devinfo->gen >= 6) @@ -2230,10 +2294,12 @@ brw_push_insn_state(p); brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); /* set message header global offset field (reg 0, element 2) */ brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, @@ -2241,6 +2307,8 @@ brw_imm_ud(offset)); brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); + brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); brw_inst_set_sfid(devinfo, insn, target_cache); @@ -2446,12 +2514,15 @@ struct brw_reg temp = get_element_ud(header, 3); + brw_push_insn_state(p); brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_SHL(p, temp, temp, brw_imm_ud(4)); brw_ADD(p, get_element_ud(header, 3), get_element_ud(brw_vec8_grf(0, 0), 3), temp); + brw_pop_insn_state(p); } } @@ -2525,9 +2596,10 @@ if (desc.file == BRW_IMMEDIATE_VALUE) { send = next_insn(p, BRW_OPCODE_SEND); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); brw_set_desc(p, send, desc.ud | desc_imm); - } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); brw_push_insn_state(p); @@ -2535,6 +2607,7 @@ brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_exec_size(p, BRW_EXECUTE_1); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); /* Load the indirect descriptor to an address register using OR so the * caller can specify additional descriptor bits with the desc_imm @@ -2544,12 +2617,17 @@ brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); send = next_insn(p, BRW_OPCODE_SEND); - brw_set_src1(p, send, addr); + brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); + + if (devinfo->gen >= 12) + brw_inst_set_send_sel_reg32_desc(devinfo, send, true); + else + brw_set_src1(p, send, addr); } brw_set_dest(p, send, dst); - brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); brw_inst_set_sfid(devinfo, send, sfid); brw_inst_set_eot(devinfo, send, eot); } @@ -2576,6 +2654,7 @@ if (desc.file == BRW_IMMEDIATE_VALUE) { desc.ud |= desc_imm; } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); brw_push_insn_state(p); @@ -2583,6 +2662,7 @@ brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_exec_size(p, BRW_EXECUTE_1); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); /* Load the indirect descriptor to an address register using OR so the * caller can specify additional descriptor bits with the desc_imm @@ -2592,11 +2672,15 @@ brw_pop_insn_state(p); desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } - if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + if (ex_desc.file == BRW_IMMEDIATE_VALUE && + (devinfo->gen >= 12 || (ex_desc.ud & INTEL_MASK(15, 12)) == 0)) { ex_desc.ud |= ex_desc_imm; } else { + const struct tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD); brw_push_insn_state(p); @@ -2604,6 +2688,7 @@ brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_exec_size(p, BRW_EXECUTE_1); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); /* Load the indirect extended descriptor to an address register using OR * so the caller can specify additional descriptor bits with the @@ -2615,13 +2700,25 @@ * descriptor which comes from the address register. If we don't OR * those two bits in, the external unit may get confused and hang. */ - brw_OR(p, addr, ex_desc, brw_imm_ud(ex_desc_imm | sfid | eot << 5)); + unsigned imm_part = ex_desc_imm | sfid | eot << 5; + + if (ex_desc.file == BRW_IMMEDIATE_VALUE) { + /* ex_desc bits 15:12 don't exist in the instruction encoding prior + * to Gen12, so we may have fallen back to an indirect extended + * descriptor. + */ + brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part)); + } else { + brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part)); + } brw_pop_insn_state(p); ex_desc = addr; + + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } - send = next_insn(p, BRW_OPCODE_SENDS); + send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS); brw_set_dest(p, send, dst); brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD)); brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD)); @@ -2638,7 +2735,7 @@ if (ex_desc.file == BRW_IMMEDIATE_VALUE) { brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0); - brw_inst_set_send_ex_desc(devinfo, send, ex_desc.ud); + brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud); } else { assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE); assert(ex_desc.nr == BRW_ARF_ADDRESS); @@ -2660,6 +2757,7 @@ unsigned desc_imm) { if (surface.file != BRW_IMMEDIATE_VALUE) { + const struct tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); brw_push_insn_state(p); @@ -2667,6 +2765,7 @@ brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_exec_size(p, BRW_EXECUTE_1); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); /* Mask out invalid bits from the surface index to avoid hangs e.g. when * some surface array is accessed out of bounds. @@ -2679,6 +2778,7 @@ brw_pop_insn_state(p); surface = addr; + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false); @@ -2730,6 +2830,8 @@ case BRW_OPCODE_HALT: if (depth == 0) return offset; + default: + break; } } @@ -2835,6 +2937,9 @@ assert(brw_inst_uip(devinfo, insn) != 0); assert(brw_inst_jip(devinfo, insn) != 0); break; + + default: + break; } } } @@ -3088,8 +3193,12 @@ brw_MOV(p, dst, offset(dst, 1)); } - if (stall) + if (stall) { + brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST, + brw_get_default_swsb(p).sbid)); + brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst); + } brw_pop_insn_state(p); } @@ -3170,6 +3279,7 @@ * hardware. */ brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_AND(p, vec1(dst), exec_mask, vec1(dst)); exec_mask = vec1(dst); } @@ -3313,12 +3423,15 @@ * register is above this limit. */ if (offset >= limit) { + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); offset = offset % limit; } brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + /* Use indirect addressing to fetch the specified component. */ if (type_sz(src.type) > 4 && (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { @@ -3337,6 +3450,7 @@ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), retype(brw_vec1_indirect(addr.subnr, offset), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), retype(brw_vec1_indirect(addr.subnr, offset + 4), BRW_REGISTER_TYPE_D)); @@ -3437,7 +3551,6 @@ brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false)); brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY); - brw_inst_set_gateway_notify(devinfo, inst, 1); brw_inst_set_gateway_subfuncid(devinfo, inst, BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); @@ -3466,37 +3579,35 @@ brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); } -/** - * Changes the floating point rounding mode updating the control register - * field defined at cr0.0[5-6] bits. This function supports the changes to - * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations. - * Only RTNE and RTZ rounding are enabled at nir. - */ -void -brw_rounding_mode(struct brw_codegen *p, - enum brw_rnd_mode mode) -{ - const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT; - - if (bits != BRW_CR0_RND_MODE_MASK) { - brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), - brw_imm_ud(~BRW_CR0_RND_MODE_MASK)); - brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); - - /* From the Skylake PRM, Volume 7, page 760: - * "Implementation Restriction on Register Access: When the control - * register is used as an explicit source and/or destination, hardware - * does not ensure execution pipeline coherency. Software must set the - * thread control field to ‘switch’ for an instruction that uses - * control register as an explicit operand." - */ - brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH); - } +void +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask) +{ + /* From the Skylake PRM, Volume 7, page 760: + * "Implementation Restriction on Register Access: When the control + * register is used as an explicit source and/or destination, hardware + * does not ensure execution pipeline coherency. Software must set the + * thread control field to ‘switch’ for an instruction that uses + * control register as an explicit operand." + * + * On Gen12+ this is implemented in terms of SWSB annotations instead. + */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); - if (bits) { - brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), - brw_imm_ud(bits)); - brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); + brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(~mask)); + brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); + if (p->devinfo->gen < 12) brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH); + + if (mode) { + brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), + brw_imm_ud(mode)); + brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1); + if (p->devinfo->gen < 12) + brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH); } + + if (p->devinfo->gen >= 12) + brw_SYNC(p, TGL_SYNC_NOP); } diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu.h mesa-20.0.8/src/intel/compiler/brw_eu.h --- mesa-19.2.8/src/intel/compiler/brw_eu.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu.h 2020-06-12 01:21:17.000000000 +0000 @@ -59,6 +59,9 @@ /* One of BRW_MASK_* */ unsigned mask_control:1; + /* Scheduling info for Gen12+ */ + struct tgl_swsb swsb; + bool saturate:1; /* One of BRW_ALIGN_* */ @@ -139,6 +142,7 @@ unsigned brw_get_default_exec_size(struct brw_codegen *p); unsigned brw_get_default_group(struct brw_codegen *p); unsigned brw_get_default_access_mode(struct brw_codegen *p); +struct tgl_swsb brw_get_default_swsb(struct brw_codegen *p); void brw_set_default_exec_size(struct brw_codegen *p, unsigned value); void brw_set_default_mask_control( struct brw_codegen *p, unsigned value ); void brw_set_default_saturate( struct brw_codegen *p, bool enable ); @@ -150,10 +154,11 @@ brw_inst *inst, unsigned group); void brw_set_default_group(struct brw_codegen *p, unsigned group); void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c); -void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc ); +void brw_set_default_predicate_control(struct brw_codegen *p, enum brw_predicate pc); void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse); void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg); void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value); +void brw_set_default_swsb(struct brw_codegen *p, struct tgl_swsb value); void brw_init_codegen(const struct gen_device_info *, struct brw_codegen *p, void *mem_ctx); @@ -194,9 +199,6 @@ struct brw_reg src1, \ struct brw_reg src2); -#define ROUND(OP) \ -void brw_##OP(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0); - ALU1(MOV) ALU2(SEL) ALU1(NOT) @@ -217,6 +219,8 @@ ALU2(MUL) ALU1(FRC) ALU1(RNDD) +ALU1(RNDE) +ALU1(RNDZ) ALU2(MAC) ALU2(MACH) ALU1(LZD) @@ -239,13 +243,9 @@ ALU2(SUBB) ALU2(MAC) -ROUND(RNDZ) -ROUND(RNDE) - #undef ALU1 #undef ALU2 #undef ALU3 -#undef ROUND /* Helpers for SEND instruction: @@ -693,6 +693,37 @@ } static inline uint32_t +brw_dp_dword_scattered_rw_desc(const struct gen_device_info *devinfo, + unsigned exec_size, + bool write) +{ + assert(exec_size == 8 || exec_size == 16); + + unsigned msg_type; + if (write) { + if (devinfo->gen >= 6) { + msg_type = GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } else { + msg_type = BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE; + } + } else { + if (devinfo->gen >= 7) { + msg_type = GEN7_DATAPORT_DC_DWORD_SCATTERED_READ; + } else if (devinfo->gen > 4 || devinfo->is_g4x) { + msg_type = G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } else { + msg_type = BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ; + } + } + + const unsigned msg_control = + SET_BITS(1, 1, 1) | /* Legacy SIMD Mode */ + SET_BITS(exec_size == 16, 0, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + +static inline uint32_t brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo, unsigned exec_size, /**< 0 for SIMD4x2 */ unsigned num_channels, @@ -1080,6 +1111,8 @@ void brw_WAIT(struct brw_codegen *p); +void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func); + /* Special case: there is never a destination, execution size will be * taken from src0: */ @@ -1145,8 +1178,8 @@ struct brw_reg idx); void -brw_rounding_mode(struct brw_codegen *p, - enum brw_rnd_mode mode); +brw_float_controls_mode(struct brw_codegen *p, + unsigned mode, unsigned mask); /*********************************************************************** * brw_eu_util.c: @@ -1189,8 +1222,8 @@ void brw_set_uip_jip(struct brw_codegen *p, int start_offset); -enum brw_conditional_mod brw_negate_cmod(uint32_t cmod); -enum brw_conditional_mod brw_swap_cmod(uint32_t cmod); +enum brw_conditional_mod brw_negate_cmod(enum brw_conditional_mod cmod); +enum brw_conditional_mod brw_swap_cmod(enum brw_conditional_mod cmod); /* brw_eu_compact.c */ void brw_init_compaction_tables(const struct gen_device_info *devinfo); @@ -1205,6 +1238,9 @@ brw_inst *orig, brw_inst *uncompacted); /* brw_eu_validate.c */ +bool brw_validate_instruction(const struct gen_device_info *devinfo, + const brw_inst *inst, int offset, + struct disasm_info *disasm); bool brw_validate_instructions(const struct gen_device_info *devinfo, const void *assembly, int start_offset, int end_offset, struct disasm_info *disasm); @@ -1221,32 +1257,46 @@ } struct opcode_desc { - /* The union is an implementation detail used by brw_opcode_desc() to handle - * opcodes that have been reused for different instructions across hardware - * generations. - * - * The gens field acts as a tag. If it is non-zero, name points to a string - * containing the instruction mnemonic. If it is zero, the table field is - * valid and either points to a secondary opcode_desc table with 'size' - * elements or is NULL and no such instruction exists for the opcode. - */ - union { - struct { - char *name; - int nsrc; - }; - struct { - const struct opcode_desc *table; - unsigned size; - }; - }; - int ndst; - int gens; + unsigned ir; + unsigned hw; + const char *name; + int nsrc; + int ndst; + int gens; }; const struct opcode_desc * brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode); +const struct opcode_desc * +brw_opcode_desc_from_hw(const struct gen_device_info *devinfo, unsigned hw); + +static inline unsigned +brw_opcode_encode(const struct gen_device_info *devinfo, enum opcode opcode) +{ + return brw_opcode_desc(devinfo, opcode)->hw; +} + +static inline enum opcode +brw_opcode_decode(const struct gen_device_info *devinfo, unsigned hw) +{ + const struct opcode_desc *desc = brw_opcode_desc_from_hw(devinfo, hw); + return desc ? (enum opcode)desc->ir : BRW_OPCODE_ILLEGAL; +} + +static inline void +brw_inst_set_opcode(const struct gen_device_info *devinfo, + brw_inst *inst, enum opcode opcode) +{ + brw_inst_set_hw_opcode(devinfo, inst, brw_opcode_encode(devinfo, opcode)); +} + +static inline enum opcode +brw_inst_opcode(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return brw_opcode_decode(devinfo, brw_inst_hw_opcode(devinfo, inst)); +} + static inline bool is_3src(const struct gen_device_info *devinfo, enum opcode opcode) { diff -Nru mesa-19.2.8/src/intel/compiler/brw_eu_validate.c mesa-20.0.8/src/intel/compiler/brw_eu_validate.c --- mesa-19.2.8/src/intel/compiler/brw_eu_validate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_eu_validate.c 2020-06-12 01:21:17.000000000 +0000 @@ -105,12 +105,16 @@ static bool inst_is_split_send(const struct gen_device_info *devinfo, const brw_inst *inst) { - switch (brw_inst_opcode(devinfo, inst)) { - case BRW_OPCODE_SENDS: - case BRW_OPCODE_SENDSC: - return true; - default: - return false; + if (devinfo->gen >= 12) { + return inst_is_send(devinfo, inst); + } else { + switch (brw_inst_opcode(devinfo, inst)) { + case BRW_OPCODE_SENDS: + case BRW_OPCODE_SENDSC: + return true; + default: + return false; + } } } @@ -126,10 +130,17 @@ } } +static enum brw_reg_type +inst_dst_type(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + return (devinfo->gen < 12 || !inst_is_send(devinfo, inst)) ? + brw_inst_dst_type(devinfo, inst) : BRW_REGISTER_TYPE_D; +} + static bool inst_is_raw_move(const struct gen_device_info *devinfo, const brw_inst *inst) { - unsigned dst_type = signed_type(brw_inst_dst_type(devinfo, inst)); + unsigned dst_type = signed_type(inst_dst_type(devinfo, inst)); unsigned src_type = signed_type(brw_inst_src0_type(devinfo, inst)); if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE) { @@ -159,7 +170,8 @@ static bool src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst) { - return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + return brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT && + brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; } @@ -185,12 +197,6 @@ } static bool -src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst) -{ - return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE; -} - -static bool src0_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst) { return brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 && @@ -262,6 +268,76 @@ } static struct string +invalid_values(const struct gen_device_info *devinfo, const brw_inst *inst) +{ + unsigned num_sources = num_sources_from_inst(devinfo, inst); + struct string error_msg = { .str = NULL, .len = 0 }; + + switch ((enum brw_execution_size) brw_inst_exec_size(devinfo, inst)) { + case BRW_EXECUTE_1: + case BRW_EXECUTE_2: + case BRW_EXECUTE_4: + case BRW_EXECUTE_8: + case BRW_EXECUTE_16: + case BRW_EXECUTE_32: + break; + default: + ERROR("invalid execution size"); + break; + } + + if (inst_is_send(devinfo, inst)) + return error_msg; + + if (num_sources == 3) { + /* Nothing to test: + * No 3-src instructions on Gen4-5 + * No reg file bits on Gen6-10 (align16) + * No invalid encodings on Gen10-12 (align1) + */ + } else { + if (devinfo->gen > 6) { + ERROR_IF(brw_inst_dst_reg_file(devinfo, inst) == MRF || + (num_sources > 0 && + brw_inst_src0_reg_file(devinfo, inst) == MRF) || + (num_sources > 1 && + brw_inst_src1_reg_file(devinfo, inst) == MRF), + "invalid register file encoding"); + } + } + + if (error_msg.str) + return error_msg; + + if (num_sources == 3) { + if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { + if (devinfo->gen >= 10) { + ERROR_IF(brw_inst_3src_a1_dst_type (devinfo, inst) == INVALID_REG_TYPE || + brw_inst_3src_a1_src0_type(devinfo, inst) == INVALID_REG_TYPE || + brw_inst_3src_a1_src1_type(devinfo, inst) == INVALID_REG_TYPE || + brw_inst_3src_a1_src2_type(devinfo, inst) == INVALID_REG_TYPE, + "invalid register type encoding"); + } else { + ERROR("Align1 mode not allowed on Gen < 10"); + } + } else { + ERROR_IF(brw_inst_3src_a16_dst_type(devinfo, inst) == INVALID_REG_TYPE || + brw_inst_3src_a16_src_type(devinfo, inst) == INVALID_REG_TYPE, + "invalid register type encoding"); + } + } else { + ERROR_IF(brw_inst_dst_type (devinfo, inst) == INVALID_REG_TYPE || + (num_sources > 0 && + brw_inst_src0_type(devinfo, inst) == INVALID_REG_TYPE) || + (num_sources > 1 && + brw_inst_src1_type(devinfo, inst) == INVALID_REG_TYPE), + "invalid register type encoding"); + } + + return error_msg; +} + +static struct string sources_not_null(const struct gen_device_info *devinfo, const brw_inst *inst) { @@ -280,7 +356,7 @@ if (inst_is_split_send(devinfo, inst)) return (struct string){}; - if (num_sources >= 1) + if (num_sources >= 1 && brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SYNC) ERROR_IF(src0_is_null(devinfo, inst), "src0 is null"); if (num_sources == 2) @@ -310,6 +386,8 @@ case BRW_OPCODE_MACH: case BRW_OPCODE_SADA2: return true; + default: + break; } /* FIXME: support 3-src instructions */ @@ -348,7 +426,7 @@ unsigned ex_mlen = 1; if (!brw_inst_send_sel_reg32_ex_desc(devinfo, inst)) { - const uint32_t ex_desc = brw_inst_send_ex_desc(devinfo, inst); + const uint32_t ex_desc = brw_inst_sends_ex_desc(devinfo, inst); ex_mlen = brw_message_ex_desc_ex_mlen(devinfo, ex_desc); } const unsigned src0_reg_nr = brw_inst_src0_da_reg_nr(devinfo, inst); @@ -364,7 +442,8 @@ "send must use direct addressing"); if (devinfo->gen >= 7) { - ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF"); + ERROR_IF(brw_inst_send_src0_reg_file(devinfo, inst) != BRW_GENERAL_REGISTER_FILE, + "send from non-GRF"); ERROR_IF(brw_inst_eot(devinfo, inst) && brw_inst_src0_da_reg_nr(devinfo, inst) < 112, "send with EOT must use g112-g127"); @@ -389,7 +468,7 @@ is_unsupported_inst(const struct gen_device_info *devinfo, const brw_inst *inst) { - return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL; + return brw_inst_opcode(devinfo, inst) == BRW_OPCODE_ILLEGAL; } /** @@ -447,7 +526,7 @@ /* Execution data type is independent of destination data type, except in * mixed F/HF instructions. */ - enum brw_reg_type dst_exec_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_exec_type = inst_dst_type(devinfo, inst); src0_exec_type = execution_type_for_type(brw_inst_src0_type(devinfo, inst)); if (num_sources == 1) { @@ -466,6 +545,10 @@ if (src0_exec_type == src1_exec_type) return src0_exec_type; + if (src0_exec_type == BRW_REGISTER_TYPE_NF || + src1_exec_type == BRW_REGISTER_TYPE_NF) + return BRW_REGISTER_TYPE_NF; + /* Mixed operand types where one is float is float on Gen < 6 * (and not allowed on later platforms) */ @@ -654,10 +737,10 @@ */ unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst)); - enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_type = inst_dst_type(devinfo, inst); bool dst_type_is_byte = - brw_inst_dst_type(devinfo, inst) == BRW_REGISTER_TYPE_B || - brw_inst_dst_type(devinfo, inst) == BRW_REGISTER_TYPE_UB; + inst_dst_type(devinfo, inst) == BRW_REGISTER_TYPE_B || + inst_dst_type(devinfo, inst) == BRW_REGISTER_TYPE_UB; if (dst_type_is_byte) { if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) { @@ -971,7 +1054,7 @@ unsigned offset = rowbase; for (int x = 0; x < width; x++) { - access_mask |= mask << offset; + access_mask |= mask << (offset % 64); offset += hstride * element_size; } @@ -1241,7 +1324,7 @@ unsigned offset = rowbase; for (int x = 0; x < width; x++) { - access_mask[element++] = mask << offset; + access_mask[element++] = mask << (offset % 64); offset += hstride * element_size; } @@ -1346,7 +1429,7 @@ return error_msg; unsigned stride = STRIDE(brw_inst_dst_hstride(devinfo, inst)); - enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_type = inst_dst_type(devinfo, inst); unsigned element_size = brw_reg_type_to_size(dst_type); unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst); unsigned offset = ((exec_size - 1) * stride * element_size) + subreg; @@ -1553,7 +1636,7 @@ * is that the size of the destination type is 4 bytes. */ if (devinfo->gen <= 7 && dst_regs == 2) { - enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_type = inst_dst_type(devinfo, inst); bool dst_is_packed_dword = is_packed(exec_size * stride, exec_size, stride) && brw_reg_type_to_size(dst_type) == 4; @@ -1604,7 +1687,7 @@ if (file != BRW_IMMEDIATE_VALUE) return (struct string){}; - enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_type = inst_dst_type(devinfo, inst); unsigned dst_type_size = brw_reg_type_to_size(dst_type); unsigned dst_subreg = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 ? brw_inst_dst_da1_subreg_nr(devinfo, inst) : 0; @@ -1668,7 +1751,7 @@ unsigned exec_type_size = brw_reg_type_to_size(exec_type); enum brw_reg_file dst_file = brw_inst_dst_reg_file(devinfo, inst); - enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst); + enum brw_reg_type dst_type = inst_dst_type(devinfo, inst); unsigned dst_type_size = brw_reg_type_to_size(dst_type); unsigned dst_hstride = STRIDE(brw_inst_dst_hstride(devinfo, inst)); unsigned dst_reg = brw_inst_dst_da_reg_nr(devinfo, inst); @@ -1822,6 +1905,72 @@ return error_msg; } +static struct string +instruction_restrictions(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + /* From GEN:BUG:1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." + */ + if (devinfo->gen >= 12 && + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MUL) { + enum brw_reg_type exec_type = execution_type(devinfo, inst); + const bool src0_valid = type_sz(brw_inst_src0_type(devinfo, inst)) == 4 || + brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src0_negate(devinfo, inst) || + brw_inst_src0_abs(devinfo, inst)); + const bool src1_valid = type_sz(brw_inst_src1_type(devinfo, inst)) == 4 || + brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE || + !(brw_inst_src1_negate(devinfo, inst) || + brw_inst_src1_abs(devinfo, inst)); + + ERROR_IF(!brw_reg_type_is_floating_point(exec_type) && + type_sz(exec_type) == 4 && !(src0_valid && src1_valid), + "When multiplying a DW and any lower precision integer, source " + "modifier is not supported."); + } + + return error_msg; +} + +bool +brw_validate_instruction(const struct gen_device_info *devinfo, + const brw_inst *inst, int offset, + struct disasm_info *disasm) +{ + struct string error_msg = { .str = NULL, .len = 0 }; + + if (is_unsupported_inst(devinfo, inst)) { + ERROR("Instruction not supported on this Gen"); + } else { + CHECK(invalid_values); + + if (error_msg.str == NULL) { + CHECK(sources_not_null); + CHECK(send_restrictions); + CHECK(alignment_supported); + CHECK(general_restrictions_based_on_operand_types); + CHECK(general_restrictions_on_region_parameters); + CHECK(special_restrictions_for_mixed_float_mode); + CHECK(region_alignment_rules); + CHECK(vector_immediate_restrictions); + CHECK(special_requirements_for_handling_double_precision_data_types); + CHECK(instruction_restrictions); + } + } + + if (error_msg.str && disasm) { + disasm_insert_error(disasm, offset, error_msg.str); + } + free(error_msg.str); + + return error_msg.len == 0; +} + bool brw_validate_instructions(const struct gen_device_info *devinfo, const void *assembly, int start_offset, int end_offset, @@ -1830,9 +1979,10 @@ bool valid = true; for (int src_offset = start_offset; src_offset < end_offset;) { - struct string error_msg = { .str = NULL, .len = 0 }; const brw_inst *inst = assembly + src_offset; bool is_compact = brw_inst_cmpt_control(devinfo, inst); + unsigned inst_size = is_compact ? sizeof(brw_compact_inst) + : sizeof(brw_inst); brw_inst uncompacted; if (is_compact) { @@ -1841,31 +1991,10 @@ inst = &uncompacted; } - if (is_unsupported_inst(devinfo, inst)) { - ERROR("Instruction not supported on this Gen"); - } else { - CHECK(sources_not_null); - CHECK(send_restrictions); - CHECK(alignment_supported); - CHECK(general_restrictions_based_on_operand_types); - CHECK(general_restrictions_on_region_parameters); - CHECK(special_restrictions_for_mixed_float_mode); - CHECK(region_alignment_rules); - CHECK(vector_immediate_restrictions); - CHECK(special_requirements_for_handling_double_precision_data_types); - } - - if (error_msg.str && disasm) { - disasm_insert_error(disasm, src_offset, error_msg.str); - } - valid = valid && error_msg.len == 0; - free(error_msg.str); + bool v = brw_validate_instruction(devinfo, inst, src_offset, disasm); + valid = valid && v; - if (is_compact) { - src_offset += sizeof(brw_compact_inst); - } else { - src_offset += sizeof(brw_inst); - } + src_offset += inst_size; } return valid; diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_bank_conflicts.cpp mesa-20.0.8/src/intel/compiler/brw_fs_bank_conflicts.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_bank_conflicts.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_bank_conflicts.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -567,13 +567,21 @@ constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true; } + /* Preserve the original allocation of VGRFs used by the barycentric + * source of the LINTERP instruction on Gen6, since pair-aligned + * barycentrics allow the PLN instruction to be used. + */ + if (v->devinfo->has_pln && v->devinfo->gen <= 6 && + inst->opcode == FS_OPCODE_LINTERP) + constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true; + /* The location of the Gen7 MRF hack registers is hard-coded in the * rest of the compiler back-end. Don't attempt to move them around. */ if (v->devinfo->gen >= 7) { assert(inst->dst.file != MRF); - for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { const unsigned reg = GEN7_MRF_HACK_START + inst->base_mrf + i; constrained[p.atom_of_reg(reg)] = true; } diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_builder.h mesa-20.0.8/src/intel/compiler/brw_fs_builder.h --- mesa-19.2.8/src/intel/compiler/brw_fs_builder.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_builder.h 2020-06-12 01:21:17.000000000 +0000 @@ -503,24 +503,29 @@ } } - if (cluster_size > 4) { - const fs_builder ubld = exec_all().group(4, 0); - src_reg left = component(tmp, 3); - dst_reg right = horiz_offset(tmp, 4); + for (unsigned i = 4; + i < MIN2(cluster_size, dispatch_width()); + i *= 2) { + const fs_builder ubld = exec_all().group(i, 0); + src_reg left = component(tmp, i - 1); + dst_reg right = horiz_offset(tmp, i); set_condmod(mod, ubld.emit(opcode, right, left, right)); - if (dispatch_width() > 8) { - left = component(tmp, 8 + 3); - right = horiz_offset(tmp, 8 + 4); + if (dispatch_width() > i * 2) { + left = component(tmp, i * 3 - 1); + right = horiz_offset(tmp, i * 3); set_condmod(mod, ubld.emit(opcode, right, left, right)); } - } - if (cluster_size > 8 && dispatch_width() > 8) { - const fs_builder ubld = exec_all().group(8, 0); - src_reg left = component(tmp, 7); - dst_reg right = horiz_offset(tmp, 8); - set_condmod(mod, ubld.emit(opcode, right, left, right)); + if (dispatch_width() > i * 4) { + left = component(tmp, i * 5 - 1); + right = horiz_offset(tmp, i * 5); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + + left = component(tmp, i * 7 - 1); + right = horiz_offset(tmp, i * 7); + set_condmod(mod, ubld.emit(opcode, right, left, right)); + } } } diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_cmod_propagation.cpp mesa-20.0.8/src/intel/compiler/brw_fs_cmod_propagation.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_cmod_propagation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_cmod_propagation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -326,17 +326,69 @@ } } - /* If the instruction generating inst's source also wrote the - * flag, and inst is doing a simple .nz comparison, then inst - * is redundant - the appropriate value is already in the flag - * register. Delete inst. + /* Knowing following: + * - CMP writes to flag register the result of + * applying cmod to the `src0 - src1`. + * After that it stores the same value to dst. + * Other instructions first store their result to + * dst, and then store cmod(dst) to the flag + * register. + * - inst is either CMP or MOV + * - inst->dst is null + * - inst->src[0] overlaps with scan_inst->dst + * - inst->src[1] is zero + * - scan_inst wrote to a flag register + * + * There can be three possible paths: + * + * - scan_inst is CMP: + * + * Considering that src0 is either 0x0 (false), + * or 0xffffffff (true), and src1 is 0x0: + * + * - If inst's cmod is NZ, we can always remove + * scan_inst: NZ is invariant for false and true. This + * holds even if src0 is NaN: .nz is the only cmod, + * that returns true for NaN. + * + * - .g is invariant if src0 has a UD type + * + * - .l is invariant if src0 has a D type + * + * - scan_inst and inst have the same cmod: + * + * If scan_inst is anything than CMP, it already + * wrote the appropriate value to the flag register. + * + * - else: + * + * We can change cmod of scan_inst to that of inst, + * and remove inst. It is valid as long as we make + * sure that no instruction uses the flag register + * between scan_inst and inst. */ - if (inst->conditional_mod == BRW_CONDITIONAL_NZ && - !inst->src[0].negate && + if (!inst->src[0].negate && scan_inst->flags_written()) { - inst->remove(block); - progress = true; - break; + if (scan_inst->opcode == BRW_OPCODE_CMP) { + if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) || + (inst->conditional_mod == BRW_CONDITIONAL_G && + inst->src[0].type == BRW_REGISTER_TYPE_UD) || + (inst->conditional_mod == BRW_CONDITIONAL_L && + inst->src[0].type == BRW_REGISTER_TYPE_D)) { + inst->remove(block); + progress = true; + break; + } + } else if (scan_inst->conditional_mod == inst->conditional_mod) { + inst->remove(block); + progress = true; + break; + } else if (!read_flag) { + scan_inst->conditional_mod = inst->conditional_mod; + inst->remove(block); + progress = true; + break; + } } /* The conditional mod of the CMP/CMPN instructions behaves diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_combine_constants.cpp mesa-20.0.8/src/intel/compiler/brw_fs_combine_constants.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_combine_constants.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_combine_constants.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -56,7 +56,14 @@ case BRW_OPCODE_CMP: case BRW_OPCODE_ADD: case BRW_OPCODE_MUL: - return true; + /* Only float instructions can coissue. We don't have a great + * understanding of whether or not something like float(int(a) + int(b)) + * would be considered float (based on the destination type) or integer + * (based on the source types), so we take the conservative choice of + * only promoting when both destination and source are float. + */ + return inst->dst.type == BRW_REGISTER_TYPE_F && + inst->src[0].type == BRW_REGISTER_TYPE_F; default: return false; } @@ -232,7 +239,7 @@ break; } case BRW_REGISTER_TYPE_Q: { - int64_t val = !can_do_source_mods ? src->d64 : abs(src->d64); + int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64); memcpy(out, &val, 8); break; } @@ -313,6 +320,36 @@ }; } +static bool +representable_as_hf(float f, uint16_t *hf) +{ + union fi u; + uint16_t h = _mesa_float_to_half(f); + u.f = _mesa_half_to_float(h); + + if (u.f == f) { + *hf = h; + return true; + } + + return false; +} + +static bool +represent_src_as_imm(const struct gen_device_info *devinfo, + fs_reg *src) +{ + /* TODO : consider specific platforms also */ + if (devinfo->gen == 12) { + uint16_t hf; + if (representable_as_hf(src->f, &hf)) { + *src = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF); + return true; + } + } + return false; +} + bool fs_visitor::opt_combine_constants() { @@ -336,10 +373,18 @@ if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst)) continue; + bool represented_as_imm = false; for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != IMM) continue; + if (!represented_as_imm && i == 0 && + inst->opcode == BRW_OPCODE_MAD && + represent_src_as_imm(devinfo, &inst->src[i])) { + represented_as_imm = true; + continue; + } + char data[8]; brw_reg_type type; if (!get_constant_value(devinfo, inst, i, data, &type)) diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_copy_propagation.cpp mesa-20.0.8/src/intel/compiler/brw_fs_copy_propagation.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_copy_propagation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_copy_propagation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -48,8 +48,8 @@ fs_reg dst; fs_reg src; unsigned global_idx; - uint8_t size_written; - uint8_t size_read; + unsigned size_written; + unsigned size_read; enum opcode opcode; bool saturate; }; @@ -194,7 +194,7 @@ * destinations. */ for (int i = 0; i < num_acp; i++) { - unsigned idx = acp[i]->dst.nr & (acp_table_size - 1); + unsigned idx = reg_space(acp[i]->dst) & (acp_table_size - 1); acp_table[idx].push_tail(acp[i]); } @@ -203,7 +203,7 @@ if (inst->dst.file != VGRF) continue; - unsigned idx = inst->dst.nr & (acp_table_size - 1); + unsigned idx = reg_space(inst->dst) & (acp_table_size - 1); foreach_in_list(acp_entry, entry, &acp_table[idx]) { if (regions_overlap(inst->dst, inst->size_written, entry->dst, entry->size_written)) @@ -220,16 +220,17 @@ * sources. */ for (int i = 0; i < num_acp; i++) { - unsigned idx = acp[i]->src.nr & (acp_table_size - 1); + unsigned idx = reg_space(acp[i]->src) & (acp_table_size - 1); acp_table[idx].push_tail(acp[i]); } foreach_block (block, cfg) { foreach_inst_in_block(fs_inst, inst, block) { - if (inst->dst.file != VGRF) + if (inst->dst.file != VGRF && + inst->dst.file != FIXED_GRF) continue; - unsigned idx = inst->dst.nr & (acp_table_size - 1); + unsigned idx = reg_space(inst->dst) & (acp_table_size - 1); foreach_in_list(acp_entry, entry, &acp_table[idx]) { if (regions_overlap(inst->dst, inst->size_written, entry->src, entry->size_read)) @@ -451,10 +452,24 @@ if (entry->src.file == IMM) return false; assert(entry->src.file == VGRF || entry->src.file == UNIFORM || - entry->src.file == ATTR); + entry->src.file == ATTR || entry->src.file == FIXED_GRF); + /* Avoid propagating a LOAD_PAYLOAD instruction into another if there is a + * good chance that we'll be able to eliminate the latter through register + * coalescing. If only part of the sources of the second LOAD_PAYLOAD can + * be simplified through copy propagation we would be making register + * coalescing impossible, ending up with unnecessary copies in the program. + * This is also the case for is_multi_copy_payload() copies that can only + * be coalesced when the instruction is lowered into a sequence of MOVs. + * + * Worse -- In cases where the ACP entry was the result of CSE combining + * multiple LOAD_PAYLOAD subexpressions, propagating the first LOAD_PAYLOAD + * into the second would undo the work of CSE, leading to an infinite + * optimization loop. Avoid this by detecting LOAD_PAYLOAD copies from CSE + * temporaries which should match is_coalescing_payload(). + */ if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && - inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) + (is_coalescing_payload(alloc, inst) || is_multi_copy_payload(inst))) return false; assert(entry->dst.file == VGRF); @@ -468,6 +483,21 @@ entry->dst, entry->size_written)) return false; + /* Avoid propagating a FIXED_GRF register into an EOT instruction in order + * for any register allocation restrictions to be applied. + */ + if (entry->src.file == FIXED_GRF && inst->eot) + return false; + + /* Avoid propagating odd-numbered FIXED_GRF registers into the first source + * of a LINTERP instruction on platforms where the PLN instruction has + * register alignment restrictions. + */ + if (devinfo->has_pln && devinfo->gen <= 6 && + entry->src.file == FIXED_GRF && (entry->src.nr & 1) && + inst->opcode == FS_OPCODE_LINTERP && arg == 0) + return false; + /* we can't generally copy-propagate UD negations because we * can end up accessing the resulting values as signed integers * instead. See also resolve_ud_negate() and comment in @@ -492,16 +522,30 @@ * derivatives, assume that their operands are packed so we can't * generally propagate strided regions to them. */ - if (instruction_requires_packed_data(inst) && entry->src.stride > 1) + const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 : + entry->src.stride); + if (instruction_requires_packed_data(inst) && entry_stride > 1) return false; /* Bail if the result of composing both strides would exceed the * hardware limit. */ - if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride, + if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride, devinfo)) return false; + /* Bail if the source FIXED_GRF region of the copy cannot be trivially + * composed with the source region of the instruction -- E.g. because the + * copy uses some extended stride greater than 4 not supported natively by + * the hardware as a horizontal stride, or because instruction compression + * could require us to use a vertical stride shorter than a GRF. + */ + if (entry->src.file == FIXED_GRF && + (inst->src[arg].stride > 4 || + inst->dst.component_size(inst->exec_size) > + inst->src[arg].component_size(inst->exec_size))) + return false; + /* Bail if the instruction type is larger than the execution type of the * copy, what implies that each channel is reading multiple channels of the * destination of the copy, and simply replacing the sources would give a @@ -523,7 +567,7 @@ * * Which would have different semantics. */ - if (entry->src.stride != 1 && + if (entry_stride != 1 && (inst->src[arg].stride * type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) return false; @@ -561,13 +605,42 @@ } } + /* Save the offset of inst->src[arg] relative to entry->dst for it to be + * applied later. + */ + const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; + + /* Fold the copy into the instruction consuming it. */ inst->src[arg].file = entry->src.file; inst->src[arg].nr = entry->src.nr; - inst->src[arg].stride *= entry->src.stride; - inst->saturate = inst->saturate || entry->saturate; + inst->src[arg].subnr = entry->src.subnr; + inst->src[arg].offset = entry->src.offset; - /* Compute the offset of inst->src[arg] relative to entry->dst */ - const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; + /* Compose the strides of both regions. */ + if (entry->src.file == FIXED_GRF) { + if (inst->src[arg].stride) { + const unsigned orig_width = 1 << entry->src.width; + const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) * + inst->src[arg].stride); + inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1; + inst->src[arg].hstride = cvt(inst->src[arg].stride); + inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width; + } else { + inst->src[arg].vstride = inst->src[arg].hstride = + inst->src[arg].width = 0; + } + + inst->src[arg].stride = 1; + + /* Hopefully no Align16 around here... */ + assert(entry->src.swizzle == BRW_SWIZZLE_XYZW); + inst->src[arg].swizzle = entry->src.swizzle; + } else { + inst->src[arg].stride *= entry->src.stride; + } + + /* Compose any saturate modifiers. */ + inst->saturate = inst->saturate || entry->saturate; /* Compute the first component of the copy that the instruction is * reading, and the base byte offset within that component. @@ -579,9 +652,8 @@ /* Calculate the byte offset at the origin of the copy of the given * component and suboffset. */ - inst->src[arg].offset = suboffset + - component * entry->src.stride * type_sz(entry->src.type) + - entry->src.offset; + inst->src[arg] = byte_offset(inst->src[arg], + component * entry_stride * type_sz(entry->src.type) + suboffset); if (has_source_modifiers) { if (entry->dst.type != inst->src[arg].type) { @@ -833,9 +905,12 @@ inst->src[0], inst->size_read(0))) || inst->src[0].file == ATTR || inst->src[0].file == UNIFORM || - inst->src[0].file == IMM) && + inst->src[0].file == IMM || + (inst->src[0].file == FIXED_GRF && + inst->src[0].is_contiguous())) && inst->src[0].type == inst->dst.type && - !inst->is_partial_write()); + !inst->is_partial_write()) || + is_identity_payload(FIXED_GRF, inst); } /* Walks a basic block and does copy propagation on it using the acp @@ -862,7 +937,7 @@ } /* kill the destination from the ACP */ - if (inst->dst.file == VGRF) { + if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { if (regions_overlap(entry->dst, entry->size_written, inst->dst, inst->size_written)) @@ -888,11 +963,12 @@ * operand of another instruction, add it to the ACP. */ if (can_propagate_from(inst)) { - acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); + acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; entry->src = inst->src[0]; entry->size_written = inst->size_written; - entry->size_read = inst->size_read(0); + for (unsigned i = 0; i < inst->sources; i++) + entry->size_read += inst->size_read(i); entry->opcode = inst->opcode; entry->saturate = inst->saturate; acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); @@ -904,7 +980,9 @@ assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0); const unsigned size_written = effective_width * type_sz(inst->src[i].type); - if (inst->src[i].file == VGRF) { + if (inst->src[i].file == VGRF || + (inst->src[i].file == FIXED_GRF && + inst->src[i].is_contiguous())) { acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); entry->dst = byte_offset(inst->dst, offset); entry->src = inst->src[i]; diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs.cpp mesa-20.0.8/src/intel/compiler/brw_fs.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -31,6 +31,7 @@ #include "main/macros.h" #include "brw_eu.h" #include "brw_fs.h" +#include "brw_fs_live_variables.h" #include "brw_nir.h" #include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" @@ -227,6 +228,9 @@ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: case SHADER_OPCODE_URB_READ_SIMD8: case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: return true; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: return src[1].file == VGRF; @@ -287,6 +291,44 @@ } } +bool +fs_inst::is_payload(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_FB_WRITE: + case FS_OPCODE_FB_READ: + case SHADER_OPCODE_URB_WRITE_SIMD8: + case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: + case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: + case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_SHADER_TIME_ADD: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_BARRIER: + return arg == 0; + + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + return arg == 1; + + case SHADER_OPCODE_SEND: + return arg == 2 || arg == 3; + + default: + if (is_tex()) + return arg == 0; + else + return false; + } +} + /** * Returns true if this instruction's sources and destinations cannot * safely be the same register. @@ -387,34 +429,6 @@ } bool -fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const -{ - if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) - return false; - - fs_reg reg = this->src[0]; - if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1) - return false; - - if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) - return false; - - for (int i = 0; i < this->sources; i++) { - reg.type = this->src[i].type; - if (!this->src[i].equals(reg)) - return false; - - if (i < this->header_size) { - reg.offset += REG_SIZE; - } else { - reg = horiz_offset(reg, this->exec_size); - } - } - - return true; -} - -bool fs_inst::can_do_source_mods(const struct gen_device_info *devinfo) const { if (devinfo->gen == 6 && is_math()) @@ -423,6 +437,24 @@ if (is_send_from_grf()) return false; + /* From GEN:BUG:1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." + */ + if (devinfo->gen >= 12 && (opcode == BRW_OPCODE_MUL || + opcode == BRW_OPCODE_MAD)) { + const brw_reg_type exec_type = get_exec_type(this); + const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ? + MIN2(type_sz(src[1].type), type_sz(src[2].type)) : + MIN2(type_sz(src[0].type), type_sz(src[1].type)); + + if (brw_reg_type_is_integer(exec_type) && + type_sz(exec_type) >= 4 && + type_sz(exec_type) != min_type_sz) + return false; + } + if (!backend_instruction::can_do_source_mods()) return false; @@ -505,7 +537,22 @@ bool fs_reg::is_contiguous() const { - return stride == 1; + switch (file) { + case ARF: + case FIXED_GRF: + return hstride == BRW_HORIZONTAL_STRIDE_1 && + vstride == width + hstride; + case MRF: + case VGRF: + case ATTR: + return stride == 1; + case UNIFORM: + case IMM: + case BAD_FILE: + return true; + } + + unreachable("Invalid register file"); } unsigned @@ -517,62 +564,8 @@ return MAX2(width * stride, 1) * type_sz(type); } -extern "C" int -type_size_scalar(const struct glsl_type *type, bool bindless) -{ - unsigned int size, i; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_BOOL: - return type->components(); - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - case GLSL_TYPE_FLOAT16: - return DIV_ROUND_UP(type->components(), 2); - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - return DIV_ROUND_UP(type->components(), 4); - case GLSL_TYPE_DOUBLE: - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - return type->components() * 2; - case GLSL_TYPE_ARRAY: - return type_size_scalar(type->fields.array, bindless) * type->length; - case GLSL_TYPE_STRUCT: - case GLSL_TYPE_INTERFACE: - size = 0; - for (i = 0; i < type->length; i++) { - size += type_size_scalar(type->fields.structure[i].type, bindless); - } - return size; - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: - if (bindless) - return type->components() * 2; - case GLSL_TYPE_ATOMIC_UINT: - /* Samplers, atomics, and images take up no register space, since - * they're baked in at link time. - */ - return 0; - case GLSL_TYPE_SUBROUTINE: - return 1; - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_FUNCTION: - unreachable("not reached"); - } - - return 0; -} - /** * Create a MOV to read the timestamp register. - * - * The caller is responsible for emitting the MOV. The return value is - * the destination of the MOV, with extra parameters set. */ fs_reg fs_visitor::get_timestamp(const fs_builder &bld) @@ -866,6 +859,7 @@ } case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: /* Scattered logical opcodes use the following params: * src[0] Surface coordinates * src[1] Surface operation source (ignored for reads) @@ -878,6 +872,7 @@ return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1; case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); return 1; @@ -1092,9 +1087,11 @@ opcode != BRW_OPCODE_CSEL && opcode != BRW_OPCODE_IF && opcode != BRW_OPCODE_WHILE)) || - opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || opcode == FS_OPCODE_FB_WRITE) { return flag_mask(this, 1); + } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || + opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) { + return flag_mask(this, 32); } else { return flag_mask(dst, size_written); } @@ -1106,16 +1103,16 @@ * Note that this is not the 0 or 1 implied writes in an actual gen * instruction -- the FS opcodes often generate MOVs in addition. */ -int -fs_visitor::implied_mrf_writes(fs_inst *inst) const +unsigned +fs_inst::implied_mrf_writes() const { - if (inst->mlen == 0) + if (mlen == 0) return 0; - if (inst->base_mrf == -1) + if (base_mrf == -1) return 0; - switch (inst->opcode) { + switch (opcode) { case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: @@ -1123,11 +1120,11 @@ case SHADER_OPCODE_LOG2: case SHADER_OPCODE_SIN: case SHADER_OPCODE_COS: - return 1 * dispatch_width / 8; + return 1 * exec_size / 8; case SHADER_OPCODE_POW: case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - return 2 * dispatch_width / 8; + return 2 * exec_size / 8; case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: @@ -1143,14 +1140,14 @@ return 1; case FS_OPCODE_FB_WRITE: case FS_OPCODE_REP_FB_WRITE: - return inst->src[0].file == BAD_FILE ? 0 : 2; + return src[0].file == BAD_FILE ? 0 : 2; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case SHADER_OPCODE_GEN4_SCRATCH_READ: return 1; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: - return inst->mlen; + return mlen; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: - return inst->mlen; + return mlen; default: unreachable("not reached"); } @@ -1161,7 +1158,7 @@ { int reg_width = dispatch_width / 8; return fs_reg(VGRF, - alloc.allocate(type_size_scalar(type, false) * reg_width), + alloc.allocate(glsl_count_dword_slots(type, false) * reg_width), brw_type_for_base_type(type)); } @@ -1214,7 +1211,7 @@ } else { bld.emit(FS_OPCODE_LINTERP, wpos, this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL], - interp_reg(VARYING_SLOT_POS, 2)); + component(interp_reg(VARYING_SLOT_POS, 2), 0)); } wpos = offset(wpos, bld, 1); @@ -1267,7 +1264,13 @@ { fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); - if (devinfo->gen >= 6) { + if (devinfo->gen >= 12) { + fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); + + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W); + bld.ASR(tmp, g1, brw_imm_d(15)); + bld.NOT(*reg, tmp); + } else if (devinfo->gen >= 6) { /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create * a boolean result from this (~0/true or 0/false). * @@ -1638,6 +1641,26 @@ this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length; } +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +void +brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data) +{ + /* Make sure uint8_t is sufficient */ + STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (wm_prog_data->urb_setup[attr] >= 0) { + wm_prog_data->urb_setup_attribs[index++] = attr; + } + } + wm_prog_data->urb_setup_attribs_count = index; +} + static void calculate_urb_setup(const struct gen_device_info *devinfo, const struct brw_wm_prog_key *key, @@ -1725,6 +1748,9 @@ } prog_data->num_varying_inputs = urb_next; + prog_data->inputs = nir->info.inputs_read; + + brw_compute_urb_setup_index(prog_data); } void @@ -2243,159 +2269,190 @@ return; } - struct uniform_slot_info slots[uniforms]; - memset(slots, 0, sizeof(slots)); + if (compiler->compact_params) { + struct uniform_slot_info slots[uniforms]; + memset(slots, 0, sizeof(slots)); + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + for (int i = 0 ; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - for (int i = 0 ; i < inst->sources; i++) { - if (inst->src[i].file != UNIFORM) - continue; + /* NIR tightly packs things so the uniform number might not be + * aligned (if we have a double right after a float, for + * instance). This is fine because the process of re-arranging + * them will ensure that things are properly aligned. The offset + * into that uniform, however, must be aligned. + * + * In Vulkan, we have explicit offsets but everything is crammed + * into a single "variable" so inst->src[i].nr will always be 0. + * Everything will be properly aligned relative to that one base. + */ + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); - /* NIR tightly packs things so the uniform number might not be - * aligned (if we have a double right after a float, for instance). - * This is fine because the process of re-arranging them will ensure - * that things are properly aligned. The offset into that uniform, - * however, must be aligned. - * - * In Vulkan, we have explicit offsets but everything is crammed - * into a single "variable" so inst->src[i].nr will always be 0. - * Everything will be properly aligned relative to that one base. - */ - assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); + unsigned u = inst->src[i].nr + + inst->src[i].offset / UNIFORM_SLOT_SIZE; - unsigned u = inst->src[i].nr + - inst->src[i].offset / UNIFORM_SLOT_SIZE; + if (u >= uniforms) + continue; - if (u >= uniforms) - continue; + unsigned slots_read; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); + } else { + unsigned bytes_read = inst->components_read(i) * + type_sz(inst->src[i].type); + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); + } - unsigned slots_read; - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); - } else { - unsigned bytes_read = inst->components_read(i) * - type_sz(inst->src[i].type); - slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); + assert(u + slots_read <= uniforms); + mark_uniform_slots_read(&slots[u], slots_read, + type_sz(inst->src[i].type)); } - - assert(u + slots_read <= uniforms); - mark_uniform_slots_read(&slots[u], slots_read, - type_sz(inst->src[i].type)); } - } - int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); + int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); - /* Only allow 16 registers (128 uniform components) as push constants. - * - * Just demote the end of the list. We could probably do better - * here, demoting things that are rarely used in the program first. - * - * If changing this value, note the limitation about total_regs in - * brw_curbe.c. - */ - unsigned int max_push_components = 16 * 8; - if (subgroup_id_index >= 0) - max_push_components--; /* Save a slot for the thread ID */ + /* Only allow 16 registers (128 uniform components) as push constants. + * + * Just demote the end of the list. We could probably do better + * here, demoting things that are rarely used in the program first. + * + * If changing this value, note the limitation about total_regs in + * brw_curbe.c. + */ + unsigned int max_push_components = 16 * 8; + if (subgroup_id_index >= 0) + max_push_components--; /* Save a slot for the thread ID */ - /* We push small arrays, but no bigger than 16 floats. This is big enough - * for a vec4 but hopefully not large enough to push out other stuff. We - * should probably use a better heuristic at some point. - */ - const unsigned int max_chunk_size = 16; + /* We push small arrays, but no bigger than 16 floats. This is big + * enough for a vec4 but hopefully not large enough to push out other + * stuff. We should probably use a better heuristic at some point. + */ + const unsigned int max_chunk_size = 16; - unsigned int num_push_constants = 0; - unsigned int num_pull_constants = 0; + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; - push_constant_loc = ralloc_array(mem_ctx, int, uniforms); - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - /* Default to -1 meaning no location */ - memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); - memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); + /* Default to -1 meaning no location */ + memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); - int chunk_start = -1; - struct cplx_align align; - for (unsigned u = 0; u < uniforms; u++) { - if (!slots[u].is_live) { - assert(chunk_start == -1); - continue; - } + int chunk_start = -1; + struct cplx_align align; + for (unsigned u = 0; u < uniforms; u++) { + if (!slots[u].is_live) { + assert(chunk_start == -1); + continue; + } - /* Skip subgroup_id_index to put it in the last push register. */ - if (subgroup_id_index == (int)u) - continue; + /* Skip subgroup_id_index to put it in the last push register. */ + if (subgroup_id_index == (int)u) + continue; - if (chunk_start == -1) { - chunk_start = u; - align = slots[u].align; - } else { - /* Offset into the chunk */ - unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; + if (chunk_start == -1) { + chunk_start = u; + align = slots[u].align; + } else { + /* Offset into the chunk */ + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; - /* Shift the slot alignment down by the chunk offset so it is - * comparable with the base chunk alignment. - */ - struct cplx_align slot_align = slots[u].align; - slot_align.offset = - (slot_align.offset - chunk_offset) & (align.mul - 1); + /* Shift the slot alignment down by the chunk offset so it is + * comparable with the base chunk alignment. + */ + struct cplx_align slot_align = slots[u].align; + slot_align.offset = + (slot_align.offset - chunk_offset) & (align.mul - 1); - align = cplx_align_combine(align, slot_align); - } + align = cplx_align_combine(align, slot_align); + } - /* Sanity check the alignment */ - cplx_align_assert_sane(align); + /* Sanity check the alignment */ + cplx_align_assert_sane(align); - if (slots[u].contiguous) - continue; + if (slots[u].contiguous) + continue; - /* Adjust the alignment to be in terms of slots, not bytes */ - assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); - assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); - align.mul /= UNIFORM_SLOT_SIZE; - align.offset /= UNIFORM_SLOT_SIZE; - - unsigned push_start_align = cplx_align_apply(align, num_push_constants); - unsigned chunk_size = u - chunk_start + 1; - if ((!compiler->supports_pull_constants && u < UBO_START) || - (chunk_size < max_chunk_size && - push_start_align + chunk_size <= max_push_components)) { - /* Align up the number of push constants */ - num_push_constants = push_start_align; - for (unsigned i = 0; i < chunk_size; i++) - push_constant_loc[chunk_start + i] = num_push_constants++; - } else { - /* We need to pull this one */ - num_pull_constants = cplx_align_apply(align, num_pull_constants); - for (unsigned i = 0; i < chunk_size; i++) - pull_constant_loc[chunk_start + i] = num_pull_constants++; + /* Adjust the alignment to be in terms of slots, not bytes */ + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); + align.mul /= UNIFORM_SLOT_SIZE; + align.offset /= UNIFORM_SLOT_SIZE; + + unsigned push_start_align = cplx_align_apply(align, num_push_constants); + unsigned chunk_size = u - chunk_start + 1; + if ((!compiler->supports_pull_constants && u < UBO_START) || + (chunk_size < max_chunk_size && + push_start_align + chunk_size <= max_push_components)) { + /* Align up the number of push constants */ + num_push_constants = push_start_align; + for (unsigned i = 0; i < chunk_size; i++) + push_constant_loc[chunk_start + i] = num_push_constants++; + } else { + /* We need to pull this one */ + num_pull_constants = cplx_align_apply(align, num_pull_constants); + for (unsigned i = 0; i < chunk_size; i++) + pull_constant_loc[chunk_start + i] = num_pull_constants++; + } + + /* Reset the chunk and start again */ + chunk_start = -1; } - /* Reset the chunk and start again */ - chunk_start = -1; - } + /* Add the CS local thread ID uniform at the end of the push constants */ + if (subgroup_id_index >= 0) + push_constant_loc[subgroup_id_index] = num_push_constants++; - /* Add the CS local thread ID uniform at the end of the push constants */ - if (subgroup_id_index >= 0) - push_constant_loc[subgroup_id_index] = num_push_constants++; + /* As the uniforms are going to be reordered, stash the old array and + * create two new arrays for push/pull params. + */ + uint32_t *param = stage_prog_data->param; + stage_prog_data->nr_params = num_push_constants; + if (num_push_constants) { + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, + num_push_constants); + } else { + stage_prog_data->param = NULL; + } + assert(stage_prog_data->nr_pull_params == 0); + assert(stage_prog_data->pull_param == NULL); + if (num_pull_constants > 0) { + stage_prog_data->nr_pull_params = num_pull_constants; + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, + num_pull_constants); + } - /* As the uniforms are going to be reordered, stash the old array and - * create two new arrays for push/pull params. - */ - uint32_t *param = stage_prog_data->param; - stage_prog_data->nr_params = num_push_constants; - if (num_push_constants) { - stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, - num_push_constants); + /* Up until now, the param[] array has been indexed by reg + offset + * of UNIFORM registers. Move pull constants into pull_param[] and + * condense param[] to only contain the uniforms we chose to push. + * + * NOTE: Because we are condensing the params[] array, we know that + * push_constant_loc[i] <= i and we can do it in one smooth loop without + * having to make a copy. + */ + for (unsigned int i = 0; i < uniforms; i++) { + uint32_t value = param[i]; + if (pull_constant_loc[i] != -1) { + stage_prog_data->pull_param[pull_constant_loc[i]] = value; + } else if (push_constant_loc[i] != -1) { + stage_prog_data->param[push_constant_loc[i]] = value; + } + } + ralloc_free(param); } else { - stage_prog_data->param = NULL; - } - assert(stage_prog_data->nr_pull_params == 0); - assert(stage_prog_data->pull_param == NULL); - if (num_pull_constants > 0) { - stage_prog_data->nr_pull_params = num_pull_constants; - stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, - num_pull_constants); + /* If we don't want to compact anything, just set up dummy push/pull + * arrays. All the rest of the compiler cares about are these arrays. + */ + push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); + + for (unsigned u = 0; u < uniforms; u++) + push_constant_loc[u] = u; + + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); } /* Now that we know how many regular uniforms we'll push, reduce the @@ -2411,24 +2468,6 @@ push_length += range->length; } assert(push_length <= 64); - - /* Up until now, the param[] array has been indexed by reg + offset - * of UNIFORM registers. Move pull constants into pull_param[] and - * condense param[] to only contain the uniforms we chose to push. - * - * NOTE: Because we are condensing the params[] array, we know that - * push_constant_loc[i] <= i and we can do it in one smooth loop without - * having to make a copy. - */ - for (unsigned int i = 0; i < uniforms; i++) { - uint32_t value = param[i]; - if (pull_constant_loc[i] != -1) { - stage_prog_data->pull_param[pull_constant_loc[i]] = value; - } else if (push_constant_loc[i] != -1) { - stage_prog_data->param[push_constant_loc[i]] = value; - } - } - ralloc_free(param); } bool @@ -2448,6 +2487,8 @@ *out_surf_index = prog_data->binding_table.ubo_start + range->block; *out_pull_index = (32 * range->start + src.offset) / 4; + + prog_data->has_ubo_pull = true; return true; } @@ -2457,6 +2498,8 @@ /* A regular uniform push constant */ *out_surf_index = stage_prog_data->binding_table.pull_constants_start; *out_pull_index = pull_constant_loc[location]; + + prog_data->has_ubo_pull = true; return true; } @@ -2528,7 +2571,8 @@ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { switch (inst->opcode) { case BRW_OPCODE_MOV: - if (!devinfo->has_64bit_types && + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && (inst->dst.type == BRW_REGISTER_TYPE_DF || inst->dst.type == BRW_REGISTER_TYPE_UQ || inst->dst.type == BRW_REGISTER_TYPE_Q)) { @@ -2661,7 +2705,8 @@ } break; case BRW_OPCODE_SEL: - if (!devinfo->has_64bit_types && + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && (inst->dst.type == BRW_REGISTER_TYPE_DF || inst->dst.type == BRW_REGISTER_TYPE_UQ || inst->dst.type == BRW_REGISTER_TYPE_Q)) { @@ -3062,107 +3107,6 @@ } bool -fs_visitor::opt_peephole_csel() -{ - if (devinfo->gen < 8) - return false; - - bool progress = false; - - foreach_block_reverse(block, cfg) { - int ip = block->end_ip + 1; - - foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { - ip--; - - if (inst->opcode != BRW_OPCODE_SEL || - inst->predicate != BRW_PREDICATE_NORMAL || - (inst->dst.type != BRW_REGISTER_TYPE_F && - inst->dst.type != BRW_REGISTER_TYPE_D && - inst->dst.type != BRW_REGISTER_TYPE_UD)) - continue; - - /* Because it is a 3-src instruction, CSEL cannot have an immediate - * value as a source, but we can sometimes handle zero. - */ - if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR && - inst->src[0].file != UNIFORM) || - (inst->src[1].file != VGRF && inst->src[1].file != ATTR && - inst->src[1].file != UNIFORM && !inst->src[1].is_zero())) - continue; - - foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (!scan_inst->flags_written()) - continue; - - if ((scan_inst->opcode != BRW_OPCODE_CMP && - scan_inst->opcode != BRW_OPCODE_MOV) || - scan_inst->predicate != BRW_PREDICATE_NONE || - (scan_inst->src[0].file != VGRF && - scan_inst->src[0].file != ATTR && - scan_inst->src[0].file != UNIFORM) || - scan_inst->src[0].type != BRW_REGISTER_TYPE_F) - break; - - if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero()) - break; - - const brw::fs_builder ibld(this, block, inst); - - const enum brw_conditional_mod cond = - inst->predicate_inverse - ? brw_negate_cmod(scan_inst->conditional_mod) - : scan_inst->conditional_mod; - - fs_inst *csel_inst = NULL; - - if (inst->src[1].file != IMM) { - csel_inst = ibld.CSEL(inst->dst, - inst->src[0], - inst->src[1], - scan_inst->src[0], - cond); - } else if (cond == BRW_CONDITIONAL_NZ) { - /* Consider the sequence - * - * cmp.nz.f0 null<1>F g3<8,8,1>F 0F - * (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD - * - * The sel will pick the immediate value 0 if r0 is ±0.0. - * Therefore, this sequence is equivalent: - * - * cmp.nz.f0 null<1>F g3<8,8,1>F 0F - * (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F - * - * The abs is ensures that the result is 0UD when g3 is -0.0F. - * By normal cmp-sel merging, this is also equivalent: - * - * csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F - */ - csel_inst = ibld.CSEL(inst->dst, - inst->src[0], - scan_inst->src[0], - scan_inst->src[0], - cond); - - csel_inst->src[1].abs = true; - } - - if (csel_inst != NULL) { - progress = true; - csel_inst->saturate = inst->saturate; - inst->remove(block); - } - - break; - } - } - } - - return progress; -} - -bool fs_visitor::compute_to_mrf() { bool progress = false; @@ -3464,6 +3408,8 @@ assert(mov->src[0].file == FIXED_GRF); mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); } + + lower_scoreboard(); } /** @@ -3512,7 +3458,7 @@ /* Found a SEND instruction, which will include two or fewer * implied MRF writes. We could do better here. */ - for (int i = 0; i < implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { last_mrf_move[inst->base_mrf + i] = NULL; } } @@ -3553,9 +3499,22 @@ fs_visitor::remove_extra_rounding_modes() { bool progress = false; + unsigned execution_mode = this->nir->info.float_controls_execution_mode; + + brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTNE; + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + execution_mode) + base_mode = BRW_RND_MODE_RTZ; foreach_block (block, cfg) { - brw_rnd_mode prev_mode = BRW_RND_MODE_UNSPECIFIED; + brw_rnd_mode prev_mode = base_mode; foreach_inst_in_block_safe (fs_inst, inst, block) { if (inst->opcode == SHADER_OPCODE_RND_MODE) { @@ -3824,15 +3783,23 @@ dst.nr = dst.nr & ~BRW_MRF_COMPR4; const fs_builder ibld(this, block, inst); - const fs_builder hbld = ibld.exec_all().group(8, 0); + const fs_builder ubld = ibld.exec_all(); - for (uint8_t i = 0; i < inst->header_size; i++) { - if (inst->src[i].file != BAD_FILE) { - fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); - fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); - hbld.MOV(mov_dst, mov_src); - } - dst = offset(dst, hbld, 1); + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? + 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; } if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && @@ -3913,7 +3880,10 @@ { const fs_builder ibld(this, block, inst); - if (inst->src[1].file == IMM && inst->src[1].ud < (1 << 16)) { + const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + if (inst->src[1].file == IMM && + (( ud && inst->src[1].ud <= UINT16_MAX) || + (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) { /* The MUL instruction isn't commutative. On Gen <= 6, only the low * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of * src1 are used. @@ -3926,7 +3896,6 @@ ibld.MOV(imm, inst->src[1]); ibld.MUL(inst->dst, imm, inst->src[0]); } else { - const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); ibld.MUL(inst->dst, inst->src[0], ud ? brw_imm_uw(inst->src[1].ud) : brw_imm_w(inst->src[1].d)); @@ -4164,6 +4133,17 @@ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { if (inst->opcode == BRW_OPCODE_MUL) { + /* If the instruction is already in a form that does not need lowering, + * return early. + */ + if (devinfo->gen >= 7) { + if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4) + continue; + } else { + if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4) + continue; + } + if ((inst->dst.type == BRW_REGISTER_TYPE_Q || inst->dst.type == BRW_REGISTER_TYPE_UQ) && (inst->src[0].type == BRW_REGISTER_TYPE_Q || @@ -4225,6 +4205,95 @@ return progress; } +bool +fs_visitor::lower_sub_sat() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. + */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -4244,6 +4313,38 @@ dst[i] = offset(color, bld, i); } +uint32_t +brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data) +{ + uint32_t mctl; + + if (inst->opcode == FS_OPCODE_REP_FB_WRITE) { + assert(inst->group == 0 && inst->exec_size == 16); + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; + } else if (prog_data->dual_src_blend) { + assert(inst->exec_size == 8); + + if (inst->group % 16 == 0) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; + else if (inst->group % 16 == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; + else + unreachable("Invalid dual-source FB write instruction group"); + } else { + assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16)); + + if (inst->exec_size == 16) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + else if (inst->exec_size == 8) + mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + else + unreachable("Invalid FB write execution size"); + } + + return mctl; +} + static void lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const struct brw_wm_prog_data *prog_data, @@ -4295,8 +4396,8 @@ length = 2; } else if ((devinfo->gen <= 7 && !devinfo->is_haswell && prog_data->uses_kill) || - color1.file != BAD_FILE || - key->nr_color_regions > 1) { + (devinfo->gen < 11 && + (color1.file != BAD_FILE || key->nr_color_regions > 1))) { /* From the Sandy Bridge PRM, volume 4, page 198: * * "Dispatched Pixel Enables. One bit per pixel indicating @@ -4370,6 +4471,8 @@ length++; } + bool src0_alpha_present = false; + if (src0_alpha.file != BAD_FILE) { for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { const fs_builder &ubld = bld.exec_all().group(8, i) @@ -4379,12 +4482,14 @@ setup_color_payload(ubld, key, &sources[length], tmp, 1); length++; } + src0_alpha_present = true; } else if (prog_data->replicate_alpha && inst->target != 0) { /* Handle the case when fragment shader doesn't write to draw buffer * zero. No need to call setup_color_payload() for src0_alpha because * alpha value will be undefined. */ length += bld.dispatch_width() / 8; + src0_alpha_present = true; } if (sample_mask.file != BAD_FILE) { @@ -4453,8 +4558,36 @@ payload.nr = bld.shader->alloc.allocate(regs_written(load)); load->dst = payload; - inst->src[0] = payload; - inst->resize_sources(1); + uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data); + uint32_t ex_desc = 0; + + inst->desc = + (inst->group / 16) << 11 | /* rt slot group */ + brw_dp_write_desc(devinfo, inst->target, msg_ctl, + GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, + inst->last_rt, false); + + if (devinfo->gen >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = inst->target << 12 | src0_alpha_present << 15; + + if (key->nr_color_regions == 0) + ex_desc |= 1 << 20; /* Null Render Target */ + } + + inst->opcode = SHADER_OPCODE_SEND; + inst->resize_sources(3); + inst->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; + inst->src[0] = brw_imm_ud(inst->desc); + inst->src[1] = brw_imm_ud(ex_desc); + inst->src[2] = payload; + inst->mlen = regs_written(load); + inst->ex_mlen = 0; + inst->header_size = header_size; + inst->check_tdr = true; + inst->send_has_side_effects = true; } else { /* Send from the MRF */ load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), @@ -4474,11 +4607,10 @@ inst->resize_sources(0); } inst->base_mrf = 1; + inst->opcode = FS_OPCODE_FB_WRITE; + inst->mlen = regs_written(load); + inst->header_size = header_size; } - - inst->opcode = FS_OPCODE_FB_WRITE; - inst->mlen = regs_written(load); - inst->header_size = header_size; } static void @@ -4537,7 +4669,8 @@ if (coord_components > 0 && (has_lod || shadow_c.file != BAD_FILE || (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { - for (unsigned i = coord_components; i < 3; i++) + assert(coord_components <= 3); + for (unsigned i = 0; i < 3 - coord_components; i++) bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f)); msg_end = offset(msg_end, bld, 3 - coord_components); @@ -5193,20 +5326,6 @@ } } -/** - * Initialize the header present in some typed and untyped surface - * messages. - */ -static fs_reg -emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask) -{ - fs_builder ubld = bld.exec_all().group(8, 0); - const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(dst, brw_imm_d(0)); - ubld.group(1, 0).MOV(component(dst, 7), sample_mask); - return dst; -} - static void lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5233,6 +5352,19 @@ inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL || inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL; + const bool is_surface_access = is_typed_access || + inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL || + inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL || + inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL; + + const bool is_stateless = + surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || + surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); + + const bool has_side_effects = inst->has_side_effects(); + fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : + fs_reg(brw_imm_d(0xffff)); + /* From the BDW PRM Volume 7, page 147: * * "For the Data Cache Data Port*, the header must be present for the @@ -5242,22 +5374,63 @@ * we don't attempt to implement sample masks via predication for such * messages prior to Gen9, since we have to provide a header anyway. On * Gen11+ the header has been removed so we can only use predication. + * + * For all stateless A32 messages, we also need a header */ - const unsigned header_sz = devinfo->gen < 9 && is_typed_access ? 1 : 0; - - const bool has_side_effects = inst->has_side_effects(); - fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : - fs_reg(brw_imm_d(0xffff)); + fs_reg header; + if ((devinfo->gen < 9 && is_typed_access) || is_stateless) { + fs_builder ubld = bld.exec_all().group(8, 0); + header = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.MOV(header, brw_imm_d(0)); + if (is_stateless) { + /* Both the typed and scattered byte/dword A32 messages take a buffer + * base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or + * MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d + * for more details.) This is conveniently where the HW places the + * scratch surface base address. + * + * From the SKL PRM Vol. 7 "Per-Thread Scratch Space": + * + * "When a thread becomes 'active' it is allocated a portion of + * scratch space, sized according to PerThreadScratchSpace. The + * starting location of each thread’s scratch space allocation, + * ScratchSpaceOffset, is passed in the thread payload in + * R0.5[31:10] and is specified as a 1KB-granular offset from the + * GeneralStateBaseAddress. The computation of ScratchSpaceOffset + * includes the starting address of the stage’s scratch space + * allocation, as programmed by ScratchSpaceBasePointer." + * + * The base address is passed in bits R0.5[31:10] and the bottom 10 + * bits of R0.5 are used for other things. Therefore, we have to + * mask off the bottom 10 bits so that we don't get a garbage base + * address. + */ + ubld.group(1, 0).AND(component(header, 5), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xfffffc00)); + } + if (is_surface_access) + ubld.group(1, 0).MOV(component(header, 7), sample_mask); + } + const unsigned header_sz = header.file != BAD_FILE ? 1 : 0; fs_reg payload, payload2; unsigned mlen, ex_mlen = 0; - if (devinfo->gen >= 9) { + if (devinfo->gen >= 9 && + (src.file == BAD_FILE || header.file == BAD_FILE)) { /* We have split sends on gen9 and above */ - assert(header_sz == 0); - payload = bld.move_to_vgrf(addr, addr_sz); - payload2 = bld.move_to_vgrf(src, src_sz); - mlen = addr_sz * (inst->exec_size / 8); - ex_mlen = src_sz * (inst->exec_size / 8); + if (header.file == BAD_FILE) { + payload = bld.move_to_vgrf(addr, addr_sz); + payload2 = bld.move_to_vgrf(src, src_sz); + mlen = addr_sz * (inst->exec_size / 8); + ex_mlen = src_sz * (inst->exec_size / 8); + } else { + assert(src.file == BAD_FILE); + payload = header; + payload2 = bld.move_to_vgrf(addr, addr_sz); + mlen = header_sz; + ex_mlen = addr_sz * (inst->exec_size / 8); + } } else { /* Allocate space for the payload. */ const unsigned sz = header_sz + addr_sz + src_sz; @@ -5266,8 +5439,8 @@ unsigned n = 0; /* Construct the payload. */ - if (header_sz) - components[n++] = emit_surface_header(bld, sample_mask); + if (header.file != BAD_FILE) + components[n++] = header; for (unsigned i = 0; i < addr_sz; i++) components[n++] = offset(addr, bld, i); @@ -5284,8 +5457,8 @@ /* Predicate the instruction on the sample mask if no header is * provided. */ - if (!header_sz && sample_mask.file != BAD_FILE && - sample_mask.file != IMM) { + if ((header.file == BAD_FILE || !is_surface_access) && + sample_mask.file != BAD_FILE && sample_mask.file != IMM) { const fs_builder ubld = bld.group(1, 0).exec_all(); if (inst->predicate) { assert(inst->predicate == BRW_PREDICATE_NORMAL); @@ -5315,6 +5488,13 @@ sfid = GEN7_SFID_DATAPORT_DATA_CACHE; break; + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + sfid = devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : + devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE : + BRW_DATAPORT_READ_TARGET_RENDER_CACHE; + break; + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: @@ -5368,6 +5548,18 @@ true /* write */); break; + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + assert(arg.ud == 32); /* bit_size */ + desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size, + false /* write */); + break; + + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + assert(arg.ud == 32); /* bit_size */ + desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size, + true /* write */); + break; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size, arg.ud, /* atomic_op */ @@ -5727,6 +5919,8 @@ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: @@ -6116,6 +6310,8 @@ case BRW_OPCODE_SHR: case BRW_OPCODE_SHL: case BRW_OPCODE_ASR: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: case BRW_OPCODE_CMPN: case BRW_OPCODE_CSEL: case BRW_OPCODE_F32TO16: @@ -6199,6 +6395,10 @@ return MIN2(16, inst->exec_size); } + case SHADER_OPCODE_USUB_SAT: + case SHADER_OPCODE_ISUB_SAT: + return get_fpu_lowered_simd_width(devinfo, inst); + case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: /* Integer division is limited to SIMD8 on all generations. */ @@ -6321,6 +6521,8 @@ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: return MIN2(16, inst->exec_size); case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: @@ -6664,6 +6866,87 @@ return progress; } +/** + * Transform barycentric vectors into the interleaved form expected by the PLN + * instruction and returned by the Gen7+ PI shared function. + * + * For channels 0-15 in SIMD16 mode they are expected to be laid out as + * follows in the register file: + * + * rN+0: X[0-7] + * rN+1: Y[0-7] + * rN+2: X[8-15] + * rN+3: Y[8-15] + * + * There is no need to handle SIMD32 here -- This is expected to be run after + * SIMD lowering, since SIMD lowering relies on vectors having the standard + * component layout. + */ +bool +fs_visitor::lower_barycentrics() +{ + const bool has_interleaved_layout = devinfo->has_pln || devinfo->gen >= 7; + bool progress = false; + + if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->exec_size < 16) + continue; + + const fs_builder ibld(this, block, inst); + const fs_builder ubld = ibld.exec_all().group(8, 0); + + switch (inst->opcode) { + case FS_OPCODE_LINTERP : { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); + fs_reg srcs[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) + srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), + 8 * (i / 2)); + + ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); + + inst->src[0] = tmp; + progress = true; + break; + } + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); + + for (unsigned i = 0; i < 2; i++) { + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + fs_inst *mov = ibld.at(block, inst->next).group(8, g) + .MOV(horiz_offset(offset(inst->dst, ibld, i), + 8 * g), + offset(tmp, ubld, 2 * g + i)); + mov->predicate = inst->predicate; + mov->predicate_inverse = inst->predicate_inverse; + mov->flag_subreg = inst->flag_subreg; + } + } + + inst->dst = tmp; + progress = true; + break; + } + default: + break; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + void fs_visitor::dump_instructions() { @@ -7174,12 +7457,6 @@ OPT(compact_virtual_grfs); } while (progress); - /* Do this after cmod propagation has had every possible opportunity to - * propagate results into SEL instructions. - */ - if (OPT(opt_peephole_csel)) - OPT(dead_code_eliminate); - progress = false; pass_num = 0; @@ -7189,12 +7466,16 @@ } OPT(lower_simd_width); + OPT(lower_barycentrics); /* After SIMD lowering just in case we had to unroll the EOT send. */ OPT(opt_sampler_eot); OPT(lower_logical_sends); + /* After logical SEND lowering. */ + OPT(fixup_nomask_control_flow); + if (progress) { OPT(opt_copy_propagation); /* Only run after logical send lowering because it's easier to implement @@ -7219,6 +7500,11 @@ if (OPT(lower_load_payload)) { split_virtual_grfs(); + + /* Lower 64 bit MOVs generated by payload lowering. */ + if (!devinfo->has_64bit_float && !devinfo->has_64bit_int) + OPT(opt_algebraic); + OPT(register_coalesce); OPT(lower_simd_width); OPT(compute_to_mrf); @@ -7227,6 +7513,7 @@ OPT(opt_combine_constants); OPT(lower_integer_multiplication); + OPT(lower_sub_sat); if (devinfo->gen <= 5 && OPT(lower_minmax)) { OPT(opt_cmod_propagation); @@ -7317,6 +7604,151 @@ invalidate_live_intervals(); } +/** + * Find the first instruction in the program that might start a region of + * divergent control flow due to a HALT jump. There is no + * find_halt_control_flow_region_end(), the region of divergence extends until + * the only FS_OPCODE_PLACEHOLDER_HALT in the program. + */ +static const fs_inst * +find_halt_control_flow_region_start(const fs_visitor *v) +{ + if (brw_wm_prog_data(v->prog_data)->uses_kill) { + foreach_block_and_inst(block, fs_inst, inst, v->cfg) { + if (inst->opcode == FS_OPCODE_DISCARD_JUMP || + inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) + return inst; + } + } + + return NULL; +} + +/** + * Work around the Gen12 hardware bug filed as GEN:BUG:1407528679. EU fusion + * can cause a BB to be executed with all channels disabled, which will lead + * to the execution of any NoMask instructions in it, even though any + * execution-masked instructions will be correctly shot down. This may break + * assumptions of some NoMask SEND messages whose descriptor depends on data + * generated by live invocations of the shader. + * + * This avoids the problem by predicating certain instructions on an ANY + * horizontal predicate that makes sure that their execution is omitted when + * all channels of the program are disabled. + */ +bool +fs_visitor::fixup_nomask_control_flow() +{ + if (devinfo->gen != 12) + return false; + + const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H : + dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H : + BRW_PREDICATE_ALIGN1_ANY8H; + const fs_inst *halt_start = find_halt_control_flow_region_start(this); + unsigned depth = 0; + bool progress = false; + + calculate_live_intervals(); + + /* Scan the program backwards in order to be able to easily determine + * whether the flag register is live at any point. + */ + foreach_block_reverse_safe(block, cfg) { + BITSET_WORD flag_liveout = live_intervals->block_data[block->num] + .flag_liveout[0]; + STATIC_ASSERT(ARRAY_SIZE(live_intervals->block_data[0].flag_liveout) == 1); + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + if (!inst->predicate && inst->exec_size >= 8) + flag_liveout &= ~inst->flags_written(); + + switch (inst->opcode) { + case BRW_OPCODE_DO: + case BRW_OPCODE_IF: + /* Note that this doesn't handle FS_OPCODE_DISCARD_JUMP since only + * the first one in the program closes the region of divergent + * control flow due to any HALT instructions -- Instead this is + * handled with the halt_start check below. + */ + depth--; + break; + + case BRW_OPCODE_WHILE: + case BRW_OPCODE_ENDIF: + case FS_OPCODE_PLACEHOLDER_HALT: + depth++; + break; + + default: + /* Note that the vast majority of NoMask SEND instructions in the + * program are harmless while executed in a block with all + * channels disabled, since any instructions with side effects we + * could hit here should be execution-masked. + * + * The main concern is NoMask SEND instructions where the message + * descriptor or header depends on data generated by live + * invocations of the shader (RESINFO and + * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically + * computed surface index seem to be the only examples right now + * where this could easily lead to GPU hangs). Unfortunately we + * have no straightforward way to detect that currently, so just + * predicate any NoMask SEND instructions we find under control + * flow. + * + * If this proves to have a measurable performance impact it can + * be easily extended with a whitelist of messages we know we can + * safely omit the predication for. + */ + if (depth && inst->force_writemask_all && + is_send(inst) && !inst->predicate) { + /* We need to load the execution mask into the flag register by + * using a builder with channel group matching the whole shader + * (rather than the default which is derived from the original + * instruction), in order to avoid getting a right-shifted + * value. + */ + const fs_builder ubld = fs_builder(this, block, inst) + .exec_all().group(dispatch_width, 0); + const fs_reg flag = retype(brw_flag_reg(0, 0), + BRW_REGISTER_TYPE_UD); + + /* Due to the lack of flag register allocation we need to save + * and restore the flag register if it's live. + */ + const bool save_flag = flag_liveout & + flag_mask(flag, dispatch_width / 8); + const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type); + + if (save_flag) + ubld.group(1, 0).MOV(tmp, flag); + + ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); + + set_predicate(pred, inst); + inst->flag_subreg = 0; + + if (save_flag) + ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); + + progress = true; + } + break; + } + + if (inst == halt_start) + depth--; + + flag_liveout |= inst->flags_read(devinfo); + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + void fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) { @@ -7436,6 +7868,8 @@ */ assert(prog_data->total_scratch < max_scratch_size); } + + lower_scoreboard(); } bool @@ -7698,6 +8132,8 @@ wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; wm_prog_data->num_varying_inputs = 1; + + brw_compute_urb_setup_index(wm_prog_data); } bool @@ -8075,7 +8511,6 @@ const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, nir_shader *shader, - struct gl_program *prog, int shader_time_index8, int shader_time_index16, int shader_time_index32, bool allow_spilling, bool use_rep_send, struct brw_vue_map *vue_map, @@ -8093,6 +8528,19 @@ if (devinfo->gen < 6) brw_setup_vue_interpolation(vue_map, shader, prog_data); + /* From the SKL PRM, Volume 7, "Alpha Coverage": + * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in + * hardware, regardless of the state setting for this feature." + */ + if (devinfo->gen > 6 && key->alpha_to_coverage) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. + */ + NIR_PASS_V(shader, nir_opt_constant_folding); + NIR_PASS_V(shader, brw_nir_lower_alpha_to_coverage); + } + if (!key->multisample_fbo) NIR_PASS_V(shader, demote_sample_qualifiers); NIR_PASS_V(shader, move_interpolation_to_top); @@ -8132,7 +8580,7 @@ cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; fs_visitor v8(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, prog, shader, 8, + &prog_data->base, shader, 8, shader_time_index8); if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) { if (error_str) @@ -8145,11 +8593,21 @@ prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used); } + /* Limit dispatch width to simd8 with dual source blending on gen8. + * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 + */ + if (devinfo->gen == 8 && prog_data->dual_src_blend && + !(INTEL_DEBUG & DEBUG_NO8)) { + assert(!use_rep_send); + v8.limit_dispatch_width(8, "gen8 workaround: " + "using SIMD8 when dual src blending.\n"); + } + if (v8.max_dispatch_width >= 16 && likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { /* Try a SIMD16 compile */ fs_visitor v16(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, prog, shader, 16, + &prog_data->base, shader, 16, shader_time_index16); v16.import_uniforms(&v8); if (!v16.run_fs(allow_spilling, use_rep_send)) { @@ -8169,7 +8627,7 @@ unlikely(INTEL_DEBUG & DEBUG_DO32)) { /* Try a SIMD32 compile */ fs_visitor v32(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, prog, shader, 32, + &prog_data->base, shader, 32, shader_time_index32); v32.import_uniforms(&v8); if (!v32.run_fs(allow_spilling, false)) { @@ -8379,8 +8837,10 @@ src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2]; + /* Limit max_threads to 64 for the GPGPU_WALKER command */ + const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads); unsigned min_dispatch_width = - DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); + DIV_ROUND_UP(local_workgroup_size, max_threads); min_dispatch_width = MAX2(8, min_dispatch_width); min_dispatch_width = util_next_power_of_two(min_dispatch_width); assert(min_dispatch_width <= 32); @@ -8414,7 +8874,6 @@ src_shader, 8); v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, - NULL, /* Never used in core profile */ nir8, 8, shader_time_index); if (!v8->run_cs(min_dispatch_width)) { fail_msg = v8->fail_msg; @@ -8435,7 +8894,6 @@ src_shader, 16); v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, - NULL, /* Never used in core profile */ nir16, 16, shader_time_index); if (v8) v16->import_uniforms(v8); @@ -8469,7 +8927,6 @@ src_shader, 32); v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, - NULL, /* Never used in core profile */ nir32, 32, shader_time_index); if (v8) v32->import_uniforms(v8); @@ -8550,3 +9007,11 @@ set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE)); } } + +unsigned +fs_visitor::workgroup_size() const +{ + assert(stage == MESA_SHADER_COMPUTE); + const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data); + return cs->local_size[0] * cs->local_size[1] * cs->local_size[2]; +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_cse.cpp mesa-20.0.8/src/intel/compiler/brw_fs_cse.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_cse.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_cse.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -76,6 +76,7 @@ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: case FS_OPCODE_LINTERP: case SHADER_OPCODE_FIND_LIVE_CHANNEL: + case FS_OPCODE_LOAD_LIVE_CHANNELS: case SHADER_OPCODE_BROADCAST: case SHADER_OPCODE_MOV_INDIRECT: case SHADER_OPCODE_TEX_LOGICAL: @@ -105,7 +106,7 @@ case SHADER_OPCODE_COS: return inst->mlen < 2; case SHADER_OPCODE_LOAD_PAYLOAD: - return !inst->is_copy_payload(v->alloc); + return !is_coalescing_payload(v->alloc, inst); default: return inst->is_send_from_grf() && !inst->has_side_effects() && !inst->is_volatile(); @@ -242,14 +243,13 @@ } bool -fs_visitor::opt_cse_local(bblock_t *block) +fs_visitor::opt_cse_local(bblock_t *block, int &ip) { bool progress = false; exec_list aeb; void *cse_ctx = ralloc_context(NULL); - int ip = block->start_ip; foreach_inst_in_block(fs_inst, inst, block) { /* Skip some cases. */ if (is_expression(this, inst) && !inst->is_partial_write() && @@ -318,6 +318,16 @@ } } + /* Discard jumps aren't represented in the CFG unfortunately, so we need + * to make sure that they behave as a CSE barrier, since we lack global + * dataflow information. This is particularly likely to cause problems + * with instructions dependent on the current execution mask like + * SHADER_OPCODE_FIND_LIVE_CHANNEL. + */ + if (inst->opcode == FS_OPCODE_DISCARD_JUMP || + inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) + aeb.make_empty(); + foreach_in_list_safe(aeb_entry, entry, &aeb) { /* Kill all AEB entries that write a different value to or read from * the flag register if we just wrote it. @@ -370,11 +380,12 @@ fs_visitor::opt_cse() { bool progress = false; + int ip = 0; calculate_live_intervals(); foreach_block (block, cfg) { - progress = opt_cse_local(block) || progress; + progress = opt_cse_local(block, ip) || progress; } if (progress) diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_generator.cpp mesa-20.0.8/src/intel/compiler/brw_fs_generator.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_generator.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_generator.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -85,6 +85,8 @@ const unsigned phys_width = compressed ? inst->exec_size / 2 : inst->exec_size; + const unsigned max_hw_width = 16; + /* XXX - The equation above is strictly speaking not correct on * hardware that supports unbalanced GRF writes -- On Gen9+ * each decompressed chunk of the instruction may have a @@ -97,7 +99,7 @@ brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); brw_reg = stride(brw_reg, reg->stride, 1, 0); } else { - const unsigned width = MIN2(reg_width, phys_width); + const unsigned width = MIN3(reg_width, phys_width, max_hw_width); brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); } @@ -283,7 +285,8 @@ desc, desc_imm, ex_desc, ex_desc_imm, inst->eot); if (inst->check_tdr) - brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDSC); + brw_inst_set_opcode(p->devinfo, brw_last_inst, + devinfo->gen >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); } else { brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, inst->eot); @@ -298,8 +301,6 @@ struct brw_reg implied_header, GLuint nr) { - uint32_t msg_control; - struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); if (devinfo->gen < 6) { @@ -313,30 +314,7 @@ brw_pop_insn_state(p); } - if (inst->opcode == FS_OPCODE_REP_FB_WRITE) { - assert(inst->group == 0 && inst->exec_size == 16); - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; - - } else if (prog_data->dual_src_blend) { - assert(inst->exec_size == 8); - - if (inst->group % 16 == 0) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; - else if (inst->group % 16 == 8) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; - else - unreachable("Invalid dual-source FB write instruction group"); - - } else { - assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16)); - - if (inst->exec_size == 16) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; - else if (inst->exec_size == 8) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; - else - unreachable("Invalid FB write execution size"); - } + uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); /* We assume render targets start at 0, because headerless FB write * messages set "Render Target Index" to 0. Using a different binding @@ -442,6 +420,13 @@ /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ struct brw_reg addr = vec8(brw_address_reg(0)); + /* Whether we can use destination dependency control without running the + * risk of a hang if an instruction gets shot down. + */ + const bool use_dep_ctrl = !inst->predicate && + inst->exec_size == dispatch_width; + brw_inst *insn; + /* The destination stride of an instruction (in bytes) must be greater * than or equal to the size of the rest of the instruction. Since the * address register is of type UW, we can't use a D-type instruction. @@ -474,13 +459,34 @@ * In the end, while base_offset is nice to look at in the generated * code, using it saves us 0 instructions and would require quite a bit * of case-by-case work. It's just not worth it. + * + * Due to a hardware bug some platforms (particularly Gen11+) seem to + * require the address components of all channels to be valid whether or + * not they're active, which causes issues if we use VxH addressing + * under non-uniform control-flow. We can easily work around that by + * initializing the whole address register with a pipelined NoMask MOV + * instruction. */ - brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 7) { + insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_null()); + else + brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); + } + + insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + else if (devinfo->gen >= 7) + brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); if (type_sz(reg.type) > 4 && ((devinfo->gen == 7 && !devinfo->is_haswell) || devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) || - !devinfo->has_64bit_types)) { + !devinfo->has_64bit_float)) { /* IVB has an issue (which we found empirically) where it reads two * address register components per channel for indirectly addressed * 64-bit sources. @@ -498,6 +504,7 @@ */ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); } else { @@ -586,6 +593,7 @@ src.hstride - 1)); /* Add on the register start offset */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr)); if (type_sz(src.type) > 4 && @@ -610,15 +618,19 @@ struct brw_reg gdst = suboffset(dst, group); struct brw_reg dst_d = retype(spread(gdst, 2), BRW_REGISTER_TYPE_D); + assert(dst.hstride == 1); brw_MOV(p, dst_d, retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, byte_offset(dst_d, 4), retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); } else { - brw_MOV(p, suboffset(dst, group), + brw_MOV(p, suboffset(dst, group * dst.hstride), retype(brw_VxH_indirect(0, 0), src.type)); } } + + brw_set_default_swsb(p, tgl_swsb_null()); } } @@ -679,8 +691,12 @@ 4 * inst->dst.stride, 1, 4 * inst->dst.stride), stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); - brw_inst_set_no_dd_clear(devinfo, insn, c < 3); - brw_inst_set_no_dd_check(devinfo, insn, c > 0); + if (devinfo->gen < 12) { + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + brw_set_default_swsb(p, tgl_swsb_null()); } break; @@ -700,7 +716,8 @@ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); brw_set_src0(p, send, header); - brw_set_src1(p, send, brw_imm_ud(0u)); + if (devinfo->gen < 12) + brw_set_src1(p, send, brw_imm_ud(0u)); brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); @@ -736,7 +753,8 @@ brw_set_dest(p, insn, brw_null_reg()); brw_set_src0(p, insn, payload); - brw_set_src1(p, insn, brw_imm_ud(0u)); + if (devinfo->gen < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE); @@ -765,7 +783,8 @@ brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); - brw_set_src1(p, insn, brw_imm_ud(0u)); + if (devinfo->gen < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); /* Terminate a compute shader by sending a message to the thread spawner. */ @@ -776,13 +795,16 @@ brw_inst_set_header_present(devinfo, insn, false); brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ - brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ - /* Note that even though the thread has a URB resource associated with it, - * we set the "do not dereference URB" bit, because the URB resource is - * managed by the fixed-function unit, so it will free it automatically. - */ - brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + if (devinfo->gen < 11) { + brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ + + /* Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. + */ + brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + } brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); } @@ -791,7 +813,12 @@ fs_generator::generate_barrier(fs_inst *, struct brw_reg src) { brw_barrier(p, src); - brw_WAIT(p); + if (devinfo->gen >= 12) { + brw_set_default_swsb(p, tgl_swsb_null()); + brw_SYNC(p, TGL_SYNC_BAR); + } else { + brw_WAIT(p); + } } bool @@ -820,7 +847,7 @@ */ struct brw_reg delta_x = src[0]; struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); - struct brw_reg interp = stride(src[1], 0, 1, 0); + struct brw_reg interp = src[1]; brw_inst *i[2]; /* nir_lower_interpolation() will do the lowering to MAD instructions for @@ -1125,15 +1152,18 @@ /* Set up an implied move from g0 to the MRF. */ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); } else { + const tgl_swsb swsb = brw_get_default_swsb(p); assert(inst->base_mrf != -1); struct brw_reg header_reg = brw_message_reg(inst->base_mrf); brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); /* Explicitly set up the message header by copying g0 to the MRF. */ brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_set_default_exec_size(p, BRW_EXECUTE_1); if (inst->offset) { @@ -1143,6 +1173,7 @@ } brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } } @@ -1289,6 +1320,7 @@ brw_ADD(p, byte_offset(dst, g * type_size), negate(byte_offset(src, g * type_size)), byte_offset(src, (g + 2) * type_size)); + brw_set_default_swsb(p, tgl_swsb_null()); } brw_pop_insn_state(p); } else { @@ -1353,6 +1385,7 @@ const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : MIN2(16, inst->exec_size); const unsigned block_size = 4 * lower_size / REG_SIZE; + const tgl_swsb swsb = brw_get_default_swsb(p); assert(inst->mlen != 0); brw_push_insn_state(p); @@ -1362,9 +1395,17 @@ for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { brw_set_default_group(p, inst->group + lower_size * i); + if (i > 0) { + assert(swsb.mode & TGL_SBID_SET); + brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); + } else { + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + } + brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), block_size, inst->offset + block_size * REG_SIZE * i); @@ -1442,12 +1483,14 @@ BRW_DATAPORT_READ_TARGET_DATA_CACHE)); } else { + const tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); /* a0.0 = surf_index & 0xff */ + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); brw_set_dest(p, insn_and, addr); @@ -1455,6 +1498,7 @@ brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); /* dst = send(payload, a0.0 | ) */ + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_send_indirect_message( p, GEN6_SFID_DATAPORT_CONSTANT_CACHE, retype(dst, BRW_REGISTER_TYPE_UD), @@ -1578,6 +1622,7 @@ brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); brw_inst_set_compression(devinfo, insn, lower_size > 8); + brw_set_default_swsb(p, tgl_swsb_null()); } } @@ -1612,6 +1657,7 @@ /* Now the form: * 0xhhhh0000 */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_SHL(p, dst, dst, brw_imm_ud(16u)); /* And, finally the form of packHalf2x16's output: @@ -1626,9 +1672,12 @@ struct brw_reg offset, struct brw_reg value) { + const tgl_swsb swsb = brw_get_default_swsb(p); + assert(devinfo->gen >= 7); brw_push_insn_state(p); brw_set_default_mask_control(p, true); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); assert(payload.file == BRW_GENERAL_REGISTER_FILE); struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), @@ -1650,7 +1699,9 @@ * out of this path, so we just emit the MOVs from here. */ brw_MOV(p, payload_offset, offset); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, payload_value, value); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_shader_time_add(p, payload, prog_data->binding_table.shader_time_start); brw_pop_insn_state(p); @@ -1674,8 +1725,16 @@ this->dispatch_width = dispatch_width; int start_offset = p->next_insn_offset; + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ int spill_count = 0, fill_count = 0; - int loop_count = 0; + int loop_count = 0, send_count = 0; + bool is_accum_used = false; struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); @@ -1706,6 +1765,23 @@ last_insn_offset = p->next_insn_offset; } + /* GEN:BUG:14010017096: + * + * Clear accumulator register before end of thread. + */ + if (inst->eot && is_accum_used && devinfo->gen >= 12) { + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); + last_insn_offset = p->next_insn_offset; + } + + if (!is_accum_used && !inst->eot) { + is_accum_used = inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator(); + } + if (unlikely(debug_flag)) disasm_annotate(disasm_info, inst, p->next_insn_offset); @@ -1756,6 +1832,7 @@ brw_set_default_saturate(p, inst->saturate); brw_set_default_mask_control(p, inst->force_writemask_all); brw_set_default_acc_write_control(p, inst->writes_accumulator); + brw_set_default_swsb(p, inst->sched); unsigned exec_size = inst->exec_size; if (devinfo->gen == 7 && !devinfo->is_haswell && @@ -1771,6 +1848,10 @@ assert(inst->mlen <= BRW_MAX_MSG_LENGTH); switch (inst->opcode) { + case BRW_OPCODE_SYNC: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_SYNC(p, tgl_sync_function(src[0].ud)); + break; case BRW_OPCODE_MOV: brw_MOV(p, dst, src[0]); break; @@ -1984,6 +2065,7 @@ brw_math_function(inst->opcode), inst->base_mrf, src[0], BRW_MATH_PRECISION_FULL); + send_count++; } break; case SHADER_OPCODE_INT_QUOTIENT: @@ -2001,6 +2083,7 @@ gen4_math(p, dst, brw_math_function(inst->opcode), inst->base_mrf, src[0], BRW_MATH_PRECISION_FULL); + send_count++; } break; case FS_OPCODE_LINTERP: @@ -2020,10 +2103,20 @@ case SHADER_OPCODE_SEND: generate_send(inst, dst, src[0], src[1], src[2], inst->ex_mlen > 0 ? src[3] : brw_null_reg()); + if ((inst->desc & 0xff) == BRW_BTI_STATELESS || + (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) { + if (inst->size_written) + fill_count++; + else + spill_count++; + } else { + send_count++; + } break; case SHADER_OPCODE_GET_BUFFER_SIZE: generate_get_buffer_size(inst, dst, src[0], src[1]); + send_count++; break; case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: @@ -2037,6 +2130,7 @@ case SHADER_OPCODE_SAMPLEINFO: assert(inst->src[0].file == BAD_FILE); generate_tex(inst, dst, src[1], src[2]); + send_count++; break; case FS_OPCODE_DDX_COARSE: @@ -2070,6 +2164,7 @@ case SHADER_OPCODE_URB_READ_SIMD8: case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: generate_urb_read(inst, dst, src[0]); + send_count++; break; case SHADER_OPCODE_URB_WRITE_SIMD8: @@ -2077,29 +2172,35 @@ case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: generate_urb_write(inst, src[0]); + send_count++; break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: assert(inst->force_writemask_all); generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); + send_count++; break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: assert(inst->force_writemask_all); generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]); + send_count++; break; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: generate_varying_pull_constant_load_gen4(inst, dst, src[0]); + send_count++; break; case FS_OPCODE_REP_FB_WRITE: case FS_OPCODE_FB_WRITE: generate_fb_write(inst, src[0]); + send_count++; break; case FS_OPCODE_FB_READ: generate_fb_read(inst, dst, src[0]); + send_count++; break; case FS_OPCODE_DISCARD_JUMP: @@ -2114,6 +2215,12 @@ assert(src[1].file == BRW_IMMEDIATE_VALUE); assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud); + send_count++; + break; + + case FS_OPCODE_SCHEDULING_FENCE: + if (unlikely(debug_flag)) + disasm_info->use_tail = true; break; case SHADER_OPCODE_INTERLOCK: @@ -2131,7 +2238,16 @@ brw_find_live_channel(p, dst, mask); break; } - + case FS_OPCODE_LOAD_LIVE_CHANNELS: { + assert(devinfo->gen >= 8); + assert(inst->force_writemask_all && inst->group == 0); + assert(inst->dst.file == BAD_FILE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), + BRW_REGISTER_TYPE_UD), + retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + break; + } case SHADER_OPCODE_BROADCAST: assert(inst->force_writemask_all); brw_broadcast(p, dst, src[0], src[1]); @@ -2146,6 +2262,7 @@ brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_MOV(p, dst, src[1]); brw_set_default_mask_control(p, BRW_MASK_ENABLE); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, dst, src[0]); break; @@ -2156,7 +2273,6 @@ break; case SHADER_OPCODE_CLUSTER_BROADCAST: { - assert(src[0].type == dst.type); assert(!src[0].negate && !src[0].abs); assert(src[1].file == BRW_IMMEDIATE_VALUE); assert(src[1].type == BRW_REGISTER_TYPE_UD); @@ -2193,8 +2309,10 @@ * indirect here to handle adding 4 bytes to the offset and avoid * the extra ADD to the register file. */ + assert(src[0].type == dst.type); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), subscript(strided, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), subscript(strided, BRW_REGISTER_TYPE_D, 1)); } else { @@ -2225,24 +2343,29 @@ case FS_OPCODE_INTERPOLATE_AT_SAMPLE: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE); + send_count++; break; case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); + send_count++; break; case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); + send_count++; break; case CS_OPCODE_CS_TERMINATE: generate_cs_terminate(inst, src[0]); + send_count++; break; case SHADER_OPCODE_BARRIER: generate_barrier(inst, src[0]); + send_count++; break; case BRW_OPCODE_DIM: @@ -2252,9 +2375,22 @@ brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); break; - case SHADER_OPCODE_RND_MODE: + case SHADER_OPCODE_RND_MODE: { assert(src[0].file == BRW_IMMEDIATE_VALUE); - brw_rounding_mode(p, (brw_rnd_mode) src[0].d); + /* + * Changes the floating point rounding mode updating the control + * register field defined at cr0.0[5-6] bits. + */ + enum brw_rnd_mode mode = + (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); + brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); + } + break; + + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + brw_float_controls_mode(p, src[0].d, src[1].d); break; default: @@ -2276,8 +2412,10 @@ if (inst->conditional_mod) brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); - brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); - brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + if (devinfo->gen < 12) { + brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + } } } @@ -2310,14 +2448,14 @@ fprintf(stderr, "Native code for %s (sha1 %s)\n" "SIMD%d shader: %d instructions. %d loops. %u cycles. " - "%d:%d spills:fills. " + "%d:%d spills:fills, %u sends, " "scheduled with mode %s. " "Promoted %u constants. " "Compacted %d to %d bytes (%.0f%%)\n", shader_name, sha1buf, dispatch_width, before_size / 16, loop_count, cfg->cycle_count, - spill_count, fill_count, + spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size, @@ -2335,14 +2473,14 @@ compiler->shader_debug_log(log_data, "%s SIMD%d shader: %d inst, %d loops, %u cycles, " - "%d:%d spills:fills, " + "%d:%d spills:fills, %u sends, " "scheduled with mode %s, " "Promoted %u constants, " "compacted %d to %d bytes.", _mesa_shader_stage_to_abbrev(stage), dispatch_width, before_size / 16, loop_count, cfg->cycle_count, - spill_count, fill_count, + spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size); diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs.h mesa-20.0.8/src/intel/compiler/brw_fs.h --- mesa-19.2.8/src/intel/compiler/brw_fs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs.h 2020-06-12 01:21:17.000000000 +0000 @@ -69,7 +69,6 @@ void *mem_ctx, const brw_base_prog_key *key, struct brw_stage_prog_data *prog_data, - struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, int shader_time_index, @@ -108,6 +107,7 @@ void setup_cs_payload(); bool fixup_sends_duplicate_payload(); void fixup_3src_null_dest(); + bool fixup_nomask_control_flow(); void assign_curb_setup(); void assign_urb_setup(); void convert_attr_sources_to_hw_regs(fs_inst *inst); @@ -132,7 +132,7 @@ bool opt_algebraic(); bool opt_redundant_discard_jumps(); bool opt_cse(); - bool opt_cse_local(bblock_t *block); + bool opt_cse_local(bblock_t *block, int &ip); bool opt_copy_propagation(); bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); bool try_constant_propagate(fs_inst *inst, acp_entry *entry); @@ -168,6 +168,9 @@ bool lower_integer_multiplication(); bool lower_minmax(); bool lower_simd_width(); + bool lower_barycentrics(); + bool lower_scoreboard(); + bool lower_sub_sat(); bool opt_combine_constants(); void emit_dummy_fs(); @@ -188,8 +191,8 @@ void emit_discard_jump(); void emit_fsign(const class brw::fs_builder &, const nir_alu_instr *instr, fs_reg result, fs_reg *op, unsigned fsign_src); + void emit_shader_float_controls_execution_mode(); bool opt_peephole_sel(); - bool opt_peephole_csel(); bool opt_peephole_predicated_break(); bool opt_saturate_propagation(); bool opt_cmod_propagation(); @@ -227,6 +230,9 @@ nir_intrinsic_instr *instr); fs_reg get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, nir_intrinsic_instr *instr); + fs_reg swizzle_nir_scratch_addr(const brw::fs_builder &bld, + const fs_reg &addr, + bool in_dwords); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); void nir_emit_tes_intrinsic(const brw::fs_builder &bld, @@ -299,8 +305,6 @@ fs_reg interp_reg(int location, int channel); - int implied_mrf_writes(fs_inst *inst) const; - virtual void dump_instructions(); virtual void dump_instructions(const char *name); void dump_instruction(backend_instruction *inst); @@ -312,7 +316,6 @@ struct brw_gs_compile *gs_compile; struct brw_stage_prog_data *prog_data; - struct gl_program *prog; const struct brw_vue_map *input_vue_map; @@ -341,6 +344,7 @@ int *push_constant_loc; fs_reg subgroup_id; + fs_reg scratch_base; fs_reg frag_depth; fs_reg frag_stencil; fs_reg sample_mask; @@ -409,6 +413,8 @@ void lower_mul_dword_inst(fs_inst *inst, bblock_t *block); void lower_mul_qword_inst(fs_inst *inst, bblock_t *block); void lower_mulh_inst(fs_inst *inst, bblock_t *block); + + unsigned workgroup_size() const; }; /** @@ -538,25 +544,21 @@ namespace brw { inline fs_reg fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2], - brw_reg_type type = BRW_REGISTER_TYPE_F, unsigned n = 1) + brw_reg_type type = BRW_REGISTER_TYPE_F) { if (!regs[0]) return fs_reg(); if (bld.dispatch_width() > 16) { - const fs_reg tmp = bld.vgrf(type, n); + const fs_reg tmp = bld.vgrf(type); const brw::fs_builder hbld = bld.exec_all().group(16, 0); const unsigned m = bld.dispatch_width() / hbld.dispatch_width(); - fs_reg *const components = new fs_reg[n * m]; + fs_reg *const components = new fs_reg[m]; - for (unsigned c = 0; c < n; c++) { - for (unsigned g = 0; g < m; g++) { - components[c * m + g] = - offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c); - } - } + for (unsigned g = 0; g < m; g++) + components[g] = retype(brw_vec8_grf(regs[g], 0), type); - hbld.LOAD_PAYLOAD(tmp, components, n * m, 0); + hbld.LOAD_PAYLOAD(tmp, components, m, 0); delete[] components; return tmp; @@ -566,6 +568,29 @@ } } + inline fs_reg + fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]) + { + if (!regs[0]) + return fs_reg(); + + const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + const brw::fs_builder hbld = bld.exec_all().group(8, 0); + const unsigned m = bld.dispatch_width() / hbld.dispatch_width(); + fs_reg *const components = new fs_reg[2 * m]; + + for (unsigned c = 0; c < 2; c++) { + for (unsigned g = 0; g < m; g++) + components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0), + hbld, c + 2 * (g % 2)); + } + + hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0); + + delete[] components; + return tmp; + } + bool lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i); } @@ -588,4 +613,9 @@ enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op); +uint32_t brw_fb_write_msg_control(const fs_inst *inst, + const struct brw_wm_prog_data *prog_data); + +void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data); + #endif /* BRW_FS_H */ diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_live_variables.cpp mesa-20.0.8/src/intel/compiler/brw_fs_live_variables.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_live_variables.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_live_variables.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -53,7 +53,7 @@ */ void -fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst, +fs_live_variables::setup_one_read(struct fs_block_data *bd, fs_inst *inst, int ip, const fs_reg ®) { int var = var_from_reg(reg); @@ -71,7 +71,7 @@ } void -fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, +fs_live_variables::setup_one_write(struct fs_block_data *bd, fs_inst *inst, int ip, const fs_reg ®) { int var = var_from_reg(reg); @@ -110,7 +110,7 @@ if (block->num > 0) assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); - struct block_data *bd = &block_data[block->num]; + struct fs_block_data *bd = &block_data[block->num]; foreach_inst_in_block(fs_inst, inst, block) { /* Set use[] for this instruction */ @@ -160,11 +160,11 @@ cont = false; foreach_block_reverse (block, cfg) { - struct block_data *bd = &block_data[block->num]; + struct fs_block_data *bd = &block_data[block->num]; /* Update liveout */ foreach_list_typed(bblock_link, child_link, link, &block->children) { - struct block_data *child_bd = &block_data[child_link->block->num]; + struct fs_block_data *child_bd = &block_data[child_link->block->num]; for (int i = 0; i < bitset_words; i++) { BITSET_WORD new_liveout = (child_bd->livein[i] & @@ -209,10 +209,10 @@ cont = false; foreach_block (block, cfg) { - const struct block_data *bd = &block_data[block->num]; + const struct fs_block_data *bd = &block_data[block->num]; foreach_list_typed(bblock_link, child_link, link, &block->children) { - struct block_data *child_bd = &block_data[child_link->block->num]; + struct fs_block_data *child_bd = &block_data[child_link->block->num]; for (int i = 0; i < bitset_words; i++) { const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i]; @@ -233,7 +233,7 @@ fs_live_variables::compute_start_end() { foreach_block (block, cfg) { - struct block_data *bd = &block_data[block->num]; + struct fs_block_data *bd = &block_data[block->num]; for (int w = 0; w < bitset_words; w++) { BITSET_WORD livedefin = bd->livein[w] & bd->defin[w]; @@ -282,7 +282,7 @@ end[i] = -1; } - block_data= rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + block_data = rzalloc_array(mem_ctx, struct fs_block_data, cfg->num_blocks); bitset_words = BITSET_WORDS(num_vars); for (int i = 0; i < cfg->num_blocks; i++) { diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_live_variables.h mesa-20.0.8/src/intel/compiler/brw_fs_live_variables.h --- mesa-19.2.8/src/intel/compiler/brw_fs_live_variables.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_live_variables.h 2020-06-12 01:21:17.000000000 +0000 @@ -35,7 +35,7 @@ namespace brw { -struct block_data { +struct fs_block_data { /** * Which variables are defined before being used in the block. * @@ -110,13 +110,13 @@ /** @} */ /** Per-basic-block information on live variables */ - struct block_data *block_data; + struct fs_block_data *block_data; protected: void setup_def_use(); - void setup_one_read(struct block_data *bd, fs_inst *inst, int ip, + void setup_one_read(struct fs_block_data *bd, fs_inst *inst, int ip, const fs_reg ®); - void setup_one_write(struct block_data *bd, fs_inst *inst, int ip, + void setup_one_write(struct fs_block_data *bd, fs_inst *inst, int ip, const fs_reg ®); void compute_live_variables(); void compute_start_end(); diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_nir.cpp mesa-20.0.8/src/intel/compiler/brw_fs_nir.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_nir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_nir.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -34,12 +34,15 @@ void fs_visitor::emit_nir_code() { + emit_shader_float_controls_execution_mode(); + /* emit the arrays used for inputs and outputs - load/store intrinsics will * be converted to reads/writes of these arrays */ nir_setup_outputs(); nir_setup_uniforms(); nir_emit_system_values(); + last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width; nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); } @@ -578,7 +581,24 @@ fs_reg tmp = vgrf(glsl_type::int_type); - if (devinfo->gen >= 6) { + if (devinfo->gen >= 12) { + /* Bit 15 of g1.1 is 0 if the polygon is front facing. */ + fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W)); + + /* For (gl_FrontFacing ? 1.0 : -1.0), emit: + * + * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D + * + * and negate the result for (gl_FrontFacing ? -1.0 : 1.0). + */ + bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), + g1, brw_imm_uw(0x3f80)); + + if (value1 == -1.0f) + bld.MOV(tmp, negate(tmp)); + + } else if (devinfo->gen >= 6) { /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); @@ -684,6 +704,16 @@ } } +static brw_rnd_mode +brw_rnd_mode_from_execution_mode(unsigned execution_mode) +{ + if (nir_has_any_rounding_mode_rtne(execution_mode)) + return BRW_RND_MODE_RTNE; + if (nir_has_any_rounding_mode_rtz(execution_mode)) + return BRW_RND_MODE_RTZ; + return BRW_RND_MODE_UNSPECIFIED; +} + fs_reg fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, nir_alu_instr *instr, @@ -851,6 +881,16 @@ } op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); + + /* Resolve any source modifiers. We could do slightly better on Gen8+ + * if the only source modifier is negation, but *shrug*. + */ + if (op[1].negate || op[1].abs) { + fs_reg tmp = bld.vgrf(op[1].type); + + bld.MOV(tmp, op[1]); + op[1] = tmp; + } } else { assert(!instr->dest.saturate); } @@ -987,6 +1027,8 @@ { struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; fs_inst *inst; + unsigned execution_mode = + bld.shader->nir->info.float_controls_execution_mode; fs_reg op[4]; fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest); @@ -1046,10 +1088,17 @@ case nir_op_f2f16_rtne: case nir_op_f2f16_rtz: - bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), - brw_imm_d(brw_rnd_mode_from_nir_op(instr->op))); - /* fallthrough */ - case nir_op_f2f16: + case nir_op_f2f16: { + brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED; + + if (nir_op_f2f16 == instr->op) + rnd = brw_rnd_mode_from_execution_mode(execution_mode); + else + rnd = brw_rnd_mode_from_nir_op(instr->op); + + if (BRW_RND_MODE_UNSPECIFIED != rnd) + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd)); + /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending * on the HW gen, it is a special hw opcode or just a MOV, and * brw_F32TO16 (at brw_eu_emit) would do the work to chose. @@ -1063,6 +1112,7 @@ inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; + } case nir_op_b2i8: case nir_op_b2i16: @@ -1085,7 +1135,6 @@ case nir_op_f2u64: case nir_op_i2i32: case nir_op_u2u32: - case nir_op_f2f32: case nir_op_f2i32: case nir_op_f2u32: case nir_op_i2f16: @@ -1134,6 +1183,21 @@ inst->saturate = instr->dest.saturate; break; + case nir_op_f2f32: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + + if (op[0].type == BRW_REGISTER_TYPE_HF) + assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ + + inst = bld.MOV(result, op[0]); + inst->saturate = instr->dest.saturate; + break; + case nir_op_fsign: emit_fsign(bld, instr, result, op, 0); break; @@ -1196,17 +1260,58 @@ inst->saturate = instr->dest.saturate; break; - case nir_op_iadd: case nir_op_fadd: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + /* fallthrough */ + case nir_op_iadd: inst = bld.ADD(result, op[0], op[1]); inst->saturate = instr->dest.saturate; break; + case nir_op_iadd_sat: case nir_op_uadd_sat: inst = bld.ADD(result, op[0], op[1]); inst->saturate = true; break; + case nir_op_isub_sat: + bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]); + break; + + case nir_op_usub_sat: + bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]); + break; + + case nir_op_irhadd: + case nir_op_urhadd: + assert(nir_dest_bit_size(instr->dest.dest) < 64); + inst = bld.AVG(result, op[0], op[1]); + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + assert(nir_dest_bit_size(instr->dest.dest) < 64); + fs_reg tmp = bld.vgrf(result.type); + + if (devinfo->gen >= 8) { + op[0] = resolve_source_modifiers(op[0]); + op[1] = resolve_source_modifiers(op[1]); + } + + /* AVG(x, y) - ((x ^ y) & 1) */ + bld.XOR(tmp, op[0], op[1]); + bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type)); + bld.AVG(result, op[0], op[1]); + inst = bld.ADD(result, result, tmp); + inst->src[1].negate = true; + break; + } + case nir_op_fmul: for (unsigned i = 0; i < 2; i++) { if (can_fuse_fmul_fsign(instr, i)) { @@ -1215,6 +1320,17 @@ } } + /* We emit the rounding mode after the previous fsign optimization since + * it won't result in a MUL, but will try to negate the value by other + * means. + */ + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MUL(result, op[0], op[1]); inst->saturate = instr->dest.saturate; break; @@ -1224,6 +1340,34 @@ bld.MUL(result, op[0], op[1]); break; + case nir_op_imul_32x16: + case nir_op_umul_32x16: { + const bool ud = instr->op == nir_op_umul_32x16; + + assert(nir_dest_bit_size(instr->dest.dest) == 32); + + /* Before Gen7, the order of the 32-bit source and the 16-bit source was + * swapped. The extension isn't enabled on those platforms, so don't + * pretend to support the differences. + */ + assert(devinfo->gen >= 7); + + if (op[1].file == IMM) + op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d); + else { + const enum brw_reg_type word_type = + ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W; + + op[1] = subscript(op[1], word_type, 0); + } + + const enum brw_reg_type dword_type = + ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; + + bld.MUL(result, retype(op[0], dword_type), op[1]); + break; + } + case nir_op_imul: assert(nir_dest_bit_size(instr->dest.dest) < 64); bld.MUL(result, op[0], op[1]); @@ -1521,6 +1665,12 @@ case nir_op_ftrunc: inst = bld.RNDZ(result, op[0]); + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } inst->saturate = instr->dest.saturate; break; @@ -1543,6 +1693,12 @@ break; case nir_op_fround_even: inst = bld.RNDE(result, op[0]); + if (devinfo->gen < 6) { + set_condmod(BRW_CONDITIONAL_R, inst); + set_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result, result, brw_imm_f(1.0f))); + inst = bld.MOV(result, result); /* for potential saturation */ + } inst->saturate = instr->dest.saturate; break; @@ -1600,11 +1756,18 @@ case nir_op_pack_half_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_unpack_half_2x16_split_x_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_x: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); inst->saturate = instr->dest.saturate; break; + + case nir_op_unpack_half_2x16_split_y_flush_to_zero: + assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode); + /* Fall-through */ case nir_op_unpack_half_2x16_split_y: inst = bld.emit(BRW_OPCODE_F16TO32, result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); @@ -1655,6 +1818,11 @@ break; } + case nir_op_uclz: + assert(nir_dest_bit_size(instr->dest.dest) == 32); + bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]); + break; + case nir_op_ifind_msb: { assert(nir_dest_bit_size(instr->dest.dest) < 64); @@ -1744,11 +1912,25 @@ break; case nir_op_ffma: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.MAD(result, op[2], op[1], op[0]); inst->saturate = instr->dest.saturate; break; case nir_op_flrp: + if (nir_has_any_rounding_mode_enabled(execution_mode)) { + brw_rnd_mode rnd = + brw_rnd_mode_from_execution_mode(execution_mode); + bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), + brw_imm_d(rnd)); + } + inst = bld.LRP(result, op[0], op[1], op[2]); inst->saturate = instr->dest.saturate; break; @@ -2658,7 +2840,7 @@ brw_imm_d(tcs_key->input_vertices)); break; - case nir_intrinsic_barrier: { + case nir_intrinsic_control_barrier: { if (tcs_prog_data->instances == 1) break; @@ -3070,7 +3252,15 @@ static fs_reg fetch_render_target_array_index(const fs_builder &bld) { - if (bld.shader->devinfo->gen >= 6) { + if (bld.shader->devinfo->gen >= 12) { + /* The render target array index is provided in the thread payload as + * bits 26:16 of r1.1. + */ + const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), + brw_imm_uw(0x7ff)); + return idx; + } else if (bld.shader->devinfo->gen >= 6) { /* The render target array index is provided in the thread payload as * bits 26:16 of r0.0. */ @@ -3220,44 +3410,6 @@ unreachable("Invalid location"); } -/* Annoyingly, we get the barycentrics into the shader in a layout that's - * optimized for PLN but it doesn't work nearly as well as one would like for - * manual interpolation. - */ -static void -shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data) -{ - dest.type = BRW_REGISTER_TYPE_F; - pln_data.type = BRW_REGISTER_TYPE_F; - const fs_reg dest_u = offset(dest, bld, 0); - const fs_reg dest_v = offset(dest, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(horiz_offset(dest_u, g * 8), - byte_offset(pln_data, (g * 2 + 0) * REG_SIZE)); - gbld.MOV(horiz_offset(dest_v, g * 8), - byte_offset(pln_data, (g * 2 + 1) * REG_SIZE)); - } -} - -static void -shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src) -{ - pln_data.type = BRW_REGISTER_TYPE_F; - src.type = BRW_REGISTER_TYPE_F; - const fs_reg src_u = offset(src, bld, 0); - const fs_reg src_v = offset(src, bld, 1); - - for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) { - const fs_builder gbld = bld.group(8, g); - gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE), - horiz_offset(src_u, g * 8)); - gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE), - horiz_offset(src_v, g * 8)); - } -} - void fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3428,7 +3580,7 @@ emit_discard_jump(); } - limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode."); + limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); break; } @@ -3472,8 +3624,9 @@ (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, instr->intrinsic); - - shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]); + const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), + offset(this->delta_xy[bary], bld, 1) }; + bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); break; } @@ -3481,13 +3634,12 @@ const glsl_interp_mode interpolation = (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (nir_src_is_const(instr->src[0])) { unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(msg_data), interpolation); @@ -3502,9 +3654,9 @@ .SHL(msg_data, sample_id, brw_imm_ud(4u)); emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ - msg_data, + component(msg_data, 0), interpolation); } else { /* Make a loop that sends a message to the pixel interpolater @@ -3530,7 +3682,7 @@ fs_inst *inst = emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, - tmp, + dest, fs_reg(), /* src */ component(msg_data, 0), interpolation); @@ -3542,7 +3694,6 @@ bld.emit(BRW_OPCODE_WHILE)); } } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3552,7 +3703,6 @@ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2); if (const_offset) { assert(nir_src_bit_size(instr->src[0]) == 32); unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; @@ -3560,7 +3710,7 @@ emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - tmp, + dest, fs_reg(), /* src */ brw_imm_ud(off_x | (off_y << 4)), interpolation); @@ -3597,12 +3747,11 @@ const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; emit_pixel_interpolater_send(bld, opcode, - tmp, + dest, src, brw_imm_ud(0u), interpolation); } - shuffle_from_pln_layout(bld, dest, tmp); break; } @@ -3622,25 +3771,19 @@ if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || bary_intrin == nir_intrinsic_load_barycentric_at_sample) { - /* Use the result of the PI message. Because the load_barycentric - * intrinsics return a regular vec2 and we need it in PLN layout, we - * have to do a translation. Fortunately, copy-prop cleans this up - * reliably. - */ - dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0])); + /* Use the result of the PI message. */ + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); } else { /* Use the delta_xy values computed from the payload */ enum brw_barycentric_mode bary = brw_barycentric_mode(interp_mode, bary_intrin); - dst_xy = this->delta_xy[bary]; } for (unsigned int i = 0; i < instr->num_components; i++) { fs_reg interp = - interp_reg(nir_intrinsic_base(instr), - nir_intrinsic_component(instr) + i); + component(interp_reg(nir_intrinsic_base(instr), + nir_intrinsic_component(instr) + i), 0); interp.type = BRW_REGISTER_TYPE_F; dest.type = BRW_REGISTER_TYPE_F; @@ -3661,20 +3804,6 @@ } } -static int -get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) -{ - if (nir_src_is_const(instr->src[src])) { - int64_t add_val = nir_src_as_int(instr->src[src]); - if (add_val == 1) - return BRW_AOP_INC; - else if (add_val == -1) - return BRW_AOP_DEC; - } - - return BRW_AOP_ADD; -} - void fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -3687,7 +3816,16 @@ dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { - case nir_intrinsic_barrier: + case nir_intrinsic_control_barrier: + /* The whole workgroup fits in a single HW thread, so all the + * invocations are already executed lock-step. Instead of an actual + * barrier just emit a scheduling fence, that will generate no code. + */ + if (workgroup_size() <= dispatch_width) { + bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); + break; + } + emit_barrier(); cs_prog_data->uses_barrier = true; break; @@ -3728,43 +3866,21 @@ } case nir_intrinsic_shared_atomic_add: - nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr); - break; case nir_intrinsic_shared_atomic_imin: - nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_shared_atomic_umin: - nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_shared_atomic_imax: - nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_shared_atomic_umax: - nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_shared_atomic_and: - nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_shared_atomic_or: - nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_shared_atomic_xor: - nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); - break; case nir_intrinsic_shared_atomic_exchange: - nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); - break; case nir_intrinsic_shared_atomic_comp_swap: - nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); + nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_shared_atomic_fmin: - nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr); - break; case nir_intrinsic_shared_atomic_fmax: - nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr); - break; case nir_intrinsic_shared_atomic_fcomp_swap: - nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr); + nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_load_shared: { @@ -3849,8 +3965,14 @@ { nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); switch (type_sz(type)) { + case 1: + if (type == BRW_REGISTER_TYPE_UB) { + return brw_imm_uw(value.u8); + } else { + assert(type == BRW_REGISTER_TYPE_B); + return brw_imm_w(value.i8); + } case 2: - assert(type != BRW_REGISTER_TYPE_HF); return retype(brw_imm_uw(value.u16), type); case 4: return retype(brw_imm_ud(value.u32), type); @@ -3913,17 +4035,20 @@ nir_intrinsic_instr *instr) { fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); + fs_reg surf_index = image; if (stage_prog_data->binding_table.image_start > 0) { if (image.file == BRW_IMMEDIATE_VALUE) { - image.d += stage_prog_data->binding_table.image_start; + surf_index = + brw_imm_ud(image.d + stage_prog_data->binding_table.image_start); } else { - bld.ADD(image, image, + surf_index = vgrf(glsl_type::uint_type); + bld.ADD(surf_index, image, brw_imm_d(stage_prog_data->binding_table.image_start)); } } - return bld.emit_uniformize(image); + return bld.emit_uniformize(surf_index); } fs_reg @@ -3968,6 +4093,61 @@ } } +/** + * The offsets we get from NIR act as if each SIMD channel has it's own blob + * of contiguous space. However, if we actually place each SIMD channel in + * it's own space, we end up with terrible cache performance because each SIMD + * channel accesses a different cache line even when they're all accessing the + * same byte offset. To deal with this problem, we swizzle the address using + * a simple algorithm which ensures that any time a SIMD message reads or + * writes the same address, it's all in the same cache line. We have to keep + * the bottom two bits fixed so that we can read/write up to a dword at a time + * and the individual element is contiguous. We do this by splitting the + * address as follows: + * + * 31 4-6 2 0 + * +-------------------------------+------------+----------+ + * | Hi address bits | chan index | addr low | + * +-------------------------------+------------+----------+ + * + * In other words, the bottom two address bits stay, and the top 30 get + * shifted up so that we can stick the SIMD channel index in the middle. This + * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit + * at the same logical offset, the scratch read/write instruction acts on + * continuous elements and we get good cache locality. + */ +fs_reg +fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, + const fs_reg &nir_addr, + bool in_dwords) +{ + const fs_reg &chan_index = + nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + const unsigned chan_index_bits = ffs(dispatch_width) - 1; + + fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + if (in_dwords) { + /* In this case, we know the address is aligned to a DWORD and we want + * the final address in DWORDs. + */ + bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); + bld.OR(addr, addr, chan_index); + } else { + /* This case substantially more annoying because we have to pay + * attention to those pesky two bottom bits. + */ + fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); + bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); + fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); + bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); + bld.OR(addr, addr, addr_hi); + bld.OR(addr, addr, chan_addr); + } + return addr; +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -3979,8 +4159,10 @@ case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -3989,8 +4171,10 @@ case nir_intrinsic_bindless_image_load: case nir_intrinsic_bindless_image_store: case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_max: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: case nir_intrinsic_bindless_image_atomic_and: case nir_intrinsic_bindless_image_atomic_or: case nir_intrinsic_bindless_image_atomic_xor: @@ -4002,7 +4186,6 @@ /* Get some metadata from the image intrinsic. */ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; - const GLenum format = nir_intrinsic_format(instr); fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; @@ -4010,8 +4193,10 @@ case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_atomic_add: - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_image_atomic_max: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: @@ -4047,51 +4232,11 @@ bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { - int op; unsigned num_srcs = info->num_srcs; - - switch (instr->intrinsic) { - case nir_intrinsic_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_add: + int op = brw_aop_for_nir_intrinsic(instr); + if (op == BRW_AOP_INC || op == BRW_AOP_DEC) { assert(num_srcs == 4); - - op = get_op_for_atomic_add(instr, 3); - - if (op != BRW_AOP_ADD) - num_srcs = 3; - break; - case nir_intrinsic_image_atomic_min: - case nir_intrinsic_bindless_image_atomic_min: - assert(format == GL_R32UI || format == GL_R32I); - op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN; - break; - case nir_intrinsic_image_atomic_max: - case nir_intrinsic_bindless_image_atomic_max: - assert(format == GL_R32UI || format == GL_R32I); - op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX; - break; - case nir_intrinsic_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_and: - op = BRW_AOP_AND; - break; - case nir_intrinsic_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_or: - op = BRW_AOP_OR; - break; - case nir_intrinsic_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_xor: - op = BRW_AOP_XOR; - break; - case nir_intrinsic_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_exchange: - op = BRW_AOP_MOV; - break; - case nir_intrinsic_image_atomic_comp_swap: - case nir_intrinsic_bindless_image_atomic_comp_swap: - op = BRW_AOP_CMPWR; - break; - default: - unreachable("Not reachable."); + num_srcs = 3; } srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); @@ -4188,22 +4333,42 @@ break; } + case nir_intrinsic_scoped_memory_barrier: case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_shared: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier: { bool l3_fence, slm_fence; - if (devinfo->gen >= 11) { + if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) { + nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + l3_fence = modes & (nir_var_shader_out | + nir_var_mem_ssbo | + nir_var_mem_global); + slm_fence = modes & nir_var_mem_shared; + } else { l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || instr->intrinsic == nir_intrinsic_memory_barrier || instr->intrinsic == nir_intrinsic_memory_barrier_shared; - } else { - /* Prior to gen11, we only have one kind of fence. */ - l3_fence = true; + } + + if (stage != MESA_SHADER_COMPUTE) + slm_fence = false; + + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (slm_fence && workgroup_size() <= dispatch_width) + slm_fence = false; + + /* Prior to Gen11, there's only L3 fence, so emit that instead. */ + if (slm_fence && devinfo->gen < 11) { slm_fence = false; + l3_fence = true; } /* Be conservative in Gen11+ and always stall in a fence. Since there @@ -4235,9 +4400,15 @@ ->size_written = 2 * REG_SIZE; } + if (!l3_fence && !slm_fence) + ubld.emit(FS_OPCODE_SCHEDULING_FENCE); + break; } + case nir_intrinsic_memory_barrier_tcs_patch: + break; + case nir_intrinsic_shader_clock: { /* We cannot do anything if there is an event, so ignore it for now */ const fs_reg shader_clock = get_timestamp(bld); @@ -4340,6 +4511,8 @@ for (int i = 0; i < instr->num_components; i++) VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, base_offset, i * type_sz(dest.type)); + + prog_data->has_ubo_pull = true; } else { /* Even if we are loading doubles, a pull constant load will load * a 32-bit vec4, so should only reserve vgrf space for that. If we @@ -4379,6 +4552,8 @@ } } + prog_data->has_ubo_pull = true; + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); @@ -4463,43 +4638,21 @@ break; case nir_intrinsic_global_atomic_add: - nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr); - break; case nir_intrinsic_global_atomic_imin: - nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_global_atomic_umin: - nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_global_atomic_imax: - nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_global_atomic_umax: - nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_global_atomic_and: - nir_emit_global_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_global_atomic_or: - nir_emit_global_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_global_atomic_xor: - nir_emit_global_atomic(bld, BRW_AOP_XOR, instr); - break; case nir_intrinsic_global_atomic_exchange: - nir_emit_global_atomic(bld, BRW_AOP_MOV, instr); - break; case nir_intrinsic_global_atomic_comp_swap: - nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr); + nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_global_atomic_fmin: - nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr); - break; case nir_intrinsic_global_atomic_fmax: - nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr); - break; case nir_intrinsic_global_atomic_fcomp_swap: - nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr); + nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_load_ssbo: { @@ -4593,43 +4746,21 @@ } case nir_intrinsic_ssbo_atomic_add: - nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr); - break; case nir_intrinsic_ssbo_atomic_imin: - nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); - break; case nir_intrinsic_ssbo_atomic_umin: - nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); - break; case nir_intrinsic_ssbo_atomic_imax: - nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); - break; case nir_intrinsic_ssbo_atomic_umax: - nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); - break; case nir_intrinsic_ssbo_atomic_and: - nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); - break; case nir_intrinsic_ssbo_atomic_or: - nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr); - break; case nir_intrinsic_ssbo_atomic_xor: - nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr); - break; case nir_intrinsic_ssbo_atomic_exchange: - nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr); - break; case nir_intrinsic_ssbo_atomic_comp_swap: - nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr); + nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_ssbo_atomic_fmin: - nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr); - break; case nir_intrinsic_ssbo_atomic_fmax: - nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr); - break; case nir_intrinsic_ssbo_atomic_fcomp_swap: - nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr); + nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_get_buffer_size: { @@ -4691,6 +4822,99 @@ break; } + case nir_intrinsic_load_scratch: { + assert(devinfo->gen >= 7); + + assert(nir_dest_num_components(instr->dest) == 1); + const unsigned bit_size = nir_dest_bit_size(instr->dest); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->gen >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + const fs_reg nir_addr = get_nir_src(instr->src[0]); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_bit_size(instr->dest) == 32); + + /* The offset for a DWORD scattered message is in dwords. */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_dest_bit_size(instr->dest) <= 32); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, false); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, read_result); + } + break; + } + + case nir_intrinsic_store_scratch: { + assert(devinfo->gen >= 7); + + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->gen >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + const fs_reg nir_addr = get_nir_src(instr->src[1]); + + fs_reg data = get_nir_src(instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_src_bit_size(instr->src[0]) == 32); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* The offset for a DWORD scattered message is in dwords. */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_bit_size(instr->src[0]) <= 32); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, false); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + case nir_intrinsic_load_subgroup_size: /* This should only happen for fragment shaders because every other case * is lowered in NIR so we can optimize on it. @@ -4973,10 +5197,28 @@ opcode brw_op = brw_op_for_nir_reduction_op(redop); brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + /* There are a couple of register region issues that make things + * complicated for 8-bit types: + * + * 1. Only raw moves are allowed to write to a packed 8-bit + * destination. + * 2. If we use a strided destination, the efficient way to do scan + * operations ends up using strides that are too big to encode in + * an instruction. + * + * To get around these issues, we just do all 8-bit scan operations in + * 16 bits. It's actually fewer instructions than what we'd have to do + * if we were trying to do it in native 8-bit types and the results are + * the same once we truncate to 8 bits at the end. + */ + brw_reg_type scan_type = src.type; + if (type_sz(scan_type) == 1) + scan_type = brw_reg_type_from_bit_size(16, src.type); + /* Set up a register for all of our scratching around and initialize it * to reduction operation's identity value. */ - fs_reg scan = bld.vgrf(src.type); + fs_reg scan = bld.vgrf(scan_type); bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); bld.emit_scan(brw_op, scan, cluster_size, cond_mod); @@ -5019,10 +5261,28 @@ opcode brw_op = brw_op_for_nir_reduction_op(redop); brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + /* There are a couple of register region issues that make things + * complicated for 8-bit types: + * + * 1. Only raw moves are allowed to write to a packed 8-bit + * destination. + * 2. If we use a strided destination, the efficient way to do scan + * operations ends up using strides that are too big to encode in + * an instruction. + * + * To get around these issues, we just do all 8-bit scan operations in + * 16 bits. It's actually fewer instructions than what we'd have to do + * if we were trying to do it in native 8-bit types and the results are + * the same once we truncate to 8 bits at the end. + */ + brw_reg_type scan_type = src.type; + if (type_sz(scan_type) == 1) + scan_type = brw_reg_type_from_bit_size(16, src.type); + /* Set up a register for all of our scratching around and initialize it * to reduction operation's identity value. */ - fs_reg scan = bld.vgrf(src.type); + fs_reg scan = bld.vgrf(scan_type); const fs_builder allbld = bld.exec_all(); allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); @@ -5031,7 +5291,7 @@ * shift of the contents before we can begin. To make things worse, * we can't do this with a normal stride; we have to use indirects. */ - fs_reg shifted = bld.vgrf(src.type); + fs_reg shifted = bld.vgrf(scan_type); fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], brw_imm_w(-1)); diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_reg_allocate.cpp mesa-20.0.8/src/intel/compiler/brw_fs_reg_allocate.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_reg_allocate.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_reg_allocate.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -72,6 +72,15 @@ } +/** + * Size of a register from the aligned_bary_class register class. + */ +static unsigned +aligned_bary_size(unsigned dispatch_width) +{ + return (dispatch_width == 8 ? 2 : 4); +} + static void brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) { @@ -145,10 +154,11 @@ if (devinfo->gen >= 6) ra_set_allocate_round_robin(regs); int *classes = ralloc_array(compiler, int, class_count); - int aligned_pairs_class = -1; + int aligned_bary_class = -1; /* Allocate space for q values. We allocate class_count + 1 because we - * want to leave room for the aligned pairs class if we have it. */ + * want to leave room for the aligned barycentric class if we have it. + */ unsigned int **q_values = ralloc_array(compiler, unsigned int *, class_count + 1); for (int i = 0; i < class_count + 1; ++i) @@ -158,8 +168,8 @@ * between them and the base GRF registers (and also each other). */ int reg = 0; - int pairs_base_reg = 0; - int pairs_reg_count = 0; + int aligned_bary_base_reg = 0; + int aligned_bary_reg_count = 0; for (int i = 0; i < class_count; i++) { int class_reg_count; if (devinfo->gen <= 5 && dispatch_width >= 16) { @@ -202,10 +212,10 @@ } classes[i] = ra_alloc_reg_class(regs); - /* Save this off for the aligned pair class at the end. */ - if (class_sizes[i] == 2) { - pairs_base_reg = reg; - pairs_reg_count = class_reg_count; + /* Save this off for the aligned barycentric class at the end. */ + if (class_sizes[i] == int(aligned_bary_size(dispatch_width))) { + aligned_bary_base_reg = reg; + aligned_bary_reg_count = class_reg_count; } if (devinfo->gen <= 5 && dispatch_width >= 16) { @@ -246,29 +256,33 @@ for (int reg = 0; reg < base_reg_count; reg++) ra_make_reg_conflicts_transitive(regs, reg); - /* Add a special class for aligned pairs, which we'll put delta_xy - * in on Gen <= 6 so that we can do PLN. + /* Add a special class for aligned barycentrics, which we'll put the + * first source of LINTERP on so that we can do PLN on Gen <= 6. */ - if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) { - aligned_pairs_class = ra_alloc_reg_class(regs); - - for (int i = 0; i < pairs_reg_count; i++) { - if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) { - ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i); + if (devinfo->has_pln && (devinfo->gen == 6 || + (dispatch_width == 8 && devinfo->gen <= 5))) { + aligned_bary_class = ra_alloc_reg_class(regs); + + for (int i = 0; i < aligned_bary_reg_count; i++) { + if ((ra_reg_to_grf[aligned_bary_base_reg + i] & 1) == 0) { + ra_class_add_reg(regs, aligned_bary_class, + aligned_bary_base_reg + i); } } for (int i = 0; i < class_count; i++) { - /* These are a little counter-intuitive because the pair registers - * are required to be aligned while the register they are - * potentially interferring with are not. In the case where the - * size is even, the worst-case is that the register is - * odd-aligned. In the odd-size case, it doesn't matter. + /* These are a little counter-intuitive because the barycentric + * registers are required to be aligned while the register they are + * potentially interferring with are not. In the case where the size + * is even, the worst-case is that the register is odd-aligned. In + * the odd-size case, it doesn't matter. */ - q_values[class_count][i] = class_sizes[i] / 2 + 1; - q_values[i][class_count] = class_sizes[i] + 1; + q_values[class_count][i] = class_sizes[i] / 2 + + aligned_bary_size(dispatch_width) / 2; + q_values[i][class_count] = class_sizes[i] + + aligned_bary_size(dispatch_width) - 1; } - q_values[class_count][class_count] = 1; + q_values[class_count][class_count] = aligned_bary_size(dispatch_width) - 1; } ra_set_finalize(regs, q_values); @@ -281,7 +295,7 @@ for (int i = 0; i < class_count; i++) compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf; - compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class; + compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class; } void @@ -494,7 +508,7 @@ } if (inst->mlen > 0) { - for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { mrf_used[inst->base_mrf + i] = true; } } @@ -683,12 +697,18 @@ int size = fs->alloc.sizes[vgrf]; int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1; - /* If something happened to spill, we want to push the EOT send - * register early enough in the register file that we don't - * conflict with any used MRF hack registers. - */ - if (first_mrf_hack_node >= 0) + if (first_mrf_hack_node >= 0) { + /* If something happened to spill, we want to push the EOT send + * register early enough in the register file that we don't + * conflict with any used MRF hack registers. + */ reg -= BRW_MAX_MRF(devinfo->gen) - spill_base_mrf(fs); + } else if (grf127_send_hack_node >= 0) { + /* Avoid r127 which might be unusable if the node was previously + * written by a SIMD8 SEND message with source/destination overlap. + */ + reg--; + } ra_set_node_reg(g, first_vgrf_node + vgrf, reg); } @@ -763,32 +783,34 @@ if (grf127_send_hack_node >= 0) ra_set_node_reg(g, grf127_send_hack_node, 127); + /* Specify the classes of each virtual register. */ for (unsigned i = 0; i < fs->alloc.count; i++) { unsigned size = fs->alloc.sizes[i]; - int c; assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && "Register allocation relies on split_virtual_grfs()"); - c = compiler->fs_reg_sets[rsi].classes[size - 1]; - /* Special case: on pre-GEN6 hardware that supports PLN, the - * second operand of a PLN instruction needs to be an - * even-numbered register, so we have a special register class - * wm_aligned_pairs_class to handle this case. pre-GEN6 always - * uses fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the - * second operand of a PLN instruction (since it doesn't support - * any other interpolation modes). So all we need to do is find - * that register and set it to the appropriate class. - */ - if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 && - fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF && - fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) { - c = compiler->fs_reg_sets[rsi].aligned_pairs_class; - } + ra_set_node_class(g, first_vgrf_node + i, + compiler->fs_reg_sets[rsi].classes[size - 1]); + } - ra_set_node_class(g, first_vgrf_node + i, c); + /* Special case: on pre-Gen7 hardware that supports PLN, the second operand + * of a PLN instruction needs to be an even-numbered register, so we have a + * special register class aligned_bary_class to handle this case. + */ + if (compiler->fs_reg_sets[rsi].aligned_bary_class >= 0) { + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { + if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF && + fs->alloc.sizes[inst->src[0].nr] == + aligned_bary_size(fs->dispatch_width)) { + ra_set_node_class(g, first_vgrf_node + inst->src[0].nr, + compiler->fs_reg_sets[rsi].aligned_bary_class); + } + } + } - /* Add interference based on the live range of the register */ + /* Add interference based on the live range of the register */ + for (unsigned i = 0; i < fs->alloc.count; i++) { setup_live_interference(first_vgrf_node + i, fs->virtual_grf_start[i], fs->virtual_grf_end[i]); diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_register_coalesce.cpp mesa-20.0.8/src/intel/compiler/brw_fs_register_coalesce.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_register_coalesce.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_register_coalesce.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -86,7 +86,7 @@ return false; if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { - if (!inst->is_copy_payload(v->alloc)) { + if (!is_coalescing_payload(v->alloc, inst)) { return false; } } @@ -242,13 +242,26 @@ progress = true; for (int i = 0; i < src_size; i++) { - if (mov[i]) { + if (!mov[i]) + continue; + + if (mov[i]->conditional_mod == BRW_CONDITIONAL_NONE) { mov[i]->opcode = BRW_OPCODE_NOP; - mov[i]->conditional_mod = BRW_CONDITIONAL_NONE; mov[i]->dst = reg_undef; for (int j = 0; j < mov[i]->sources; j++) { mov[i]->src[j] = reg_undef; } + } else { + /* If we have a conditional modifier, rewrite the MOV to be a + * MOV.cmod from the coalesced register. Hopefully, cmod + * propagation will clean this up and move it to the instruction + * that writes the register. If not, this keeps things correct + * while still letting us coalesce. + */ + assert(mov[i]->opcode == BRW_OPCODE_MOV); + assert(mov[i]->sources == 1); + mov[i]->src[0] = mov[i]->dst; + mov[i]->dst = retype(brw_null_reg(), mov[i]->dst.type); } } diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_scoreboard.cpp mesa-20.0.8/src/intel/compiler/brw_fs_scoreboard.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_scoreboard.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_scoreboard.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1088 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_fs_scoreboard.cpp + * + * Gen12+ hardware lacks the register scoreboard logic that used to guarantee + * data coherency between register reads and writes in previous generations. + * This lowering pass runs after register allocation in order to make up for + * it. + * + * It works by performing global dataflow analysis in order to determine the + * set of potential dependencies of every instruction in the shader, and then + * inserts any required SWSB annotations and additional SYNC instructions in + * order to guarantee data coherency. + * + * WARNING - Access of the following (rarely used) ARF registers is not + * tracked here, and require the RegDist SWSB annotation to be set + * to 1 by the generator in order to avoid data races: + * + * - sp stack pointer + * - sr0 state register + * - cr0 control register + * - ip instruction pointer + * - tm0 timestamp register + * - dbg0 debug register + * + * The following ARF registers don't need to be tracked here because data + * coherency is still provided transparently by the hardware: + * + * - f0-1 flag registers + * - n0 notification register + * - tdr0 thread dependency register + */ + +#include "brw_fs.h" +#include "brw_cfg.h" + +using namespace brw; + +namespace { + /** + * In-order instruction accounting. + * @{ + */ + + /** + * Number of in-order hardware instructions contained in this IR + * instruction. This determines the increment applied to the RegDist + * counter calculated for any ordered dependency that crosses this + * instruction. + */ + unsigned + ordered_unit(const fs_inst *inst) + { + switch (inst->opcode) { + case BRW_OPCODE_SYNC: + case BRW_OPCODE_DO: + case SHADER_OPCODE_UNDEF: + case FS_OPCODE_PLACEHOLDER_HALT: + return 0; + default: + /* Note that the following is inaccurate for virtual instructions + * that expand to more in-order instructions than assumed here, but + * that can only lead to suboptimal execution ordering, data + * coherency won't be impacted. Providing exact RegDist counts for + * each virtual instruction would allow better ALU performance, but + * it would require keeping this switch statement in perfect sync + * with the generator in order to avoid data corruption. Lesson is + * (again) don't use virtual instructions if you want optimal + * scheduling. + */ + return is_unordered(inst) ? 0 : 1; + } + } + + /** + * Type for an instruction counter that increments for in-order + * instructions only, arbitrarily denoted 'jp' throughout this lowering + * pass in order to distinguish it from the regular instruction counter. + */ + typedef int ordered_address; + + /** + * Return the number of instructions in the program. + */ + unsigned + num_instructions(const backend_shader *shader) + { + return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1; + } + + /** + * Calculate the local ordered_address instruction counter at every + * instruction of the shader for subsequent constant-time look-up. + */ + ordered_address * + ordered_inst_addresses(const fs_visitor *shader) + { + ordered_address *jps = new ordered_address[num_instructions(shader)]; + ordered_address jp = 0; + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + jps[ip] = jp; + jp += ordered_unit(inst); + ip++; + } + + return jps; + } + + /** + * Synchronization mode required for data manipulated by in-order + * instructions. + * + * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate + * enum for additional type safety. The hardware doesn't provide control + * over the synchronization mode for RegDist annotations, this is only used + * internally in this pass in order to optimize out redundant read + * dependencies where possible. + */ + enum tgl_regdist_mode { + TGL_REGDIST_NULL = 0, + TGL_REGDIST_SRC = 1, + TGL_REGDIST_DST = 2 + }; + + /** + * Allow bitwise arithmetic of tgl_regdist_mode enums. + */ + tgl_regdist_mode + operator|(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) | unsigned(y)); + } + + tgl_regdist_mode + operator&(tgl_regdist_mode x, tgl_regdist_mode y) + { + return tgl_regdist_mode(unsigned(x) & unsigned(y)); + } + + tgl_regdist_mode & + operator|=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x | y; + } + + tgl_regdist_mode & + operator&=(tgl_regdist_mode &x, tgl_regdist_mode y) + { + return x = x & y; + } + + /** @} */ + + /** + * Representation of an equivalence relation among the set of unsigned + * integers. + * + * Its initial state is the identity relation '~' such that i ~ j if and + * only if i == j for every pair of unsigned integers i and j. + */ + struct equivalence_relation { + equivalence_relation(unsigned n) : is(new unsigned[n]), n(n) + { + for (unsigned i = 0; i < n; i++) + is[i] = i; + } + + ~equivalence_relation() + { + delete[] is; + } + + /** + * Return equivalence class index of the specified element. Effectively + * this is the numeric value of an arbitrary representative from the + * equivalence class. + * + * Allows the evaluation of the equivalence relation according to the + * rule that i ~ j if and only if lookup(i) == lookup(j). + */ + unsigned + lookup(unsigned i) const + { + if (i < n && is[i] != i) + return lookup(is[i]); + else + return i; + } + + /** + * Create an array with the results of the lookup() method for + * constant-time evaluation. + */ + unsigned * + flatten() const + { + unsigned *ids = new unsigned[n]; + + for (unsigned i = 0; i < n; i++) + ids[i] = lookup(i); + + return ids; + } + + /** + * Mutate the existing equivalence relation minimally by imposing the + * additional requirement that i ~ j. + * + * The algorithm updates the internal representation recursively in + * order to guarantee transitivity while preserving the previously + * specified equivalence requirements. + */ + unsigned + link(unsigned i, unsigned j) + { + const unsigned k = lookup(i); + assign(i, k); + assign(j, k); + return k; + } + + private: + equivalence_relation(const equivalence_relation &); + + equivalence_relation & + operator=(const equivalence_relation &); + + /** + * Assign the representative of \p from to be equivalent to \p to. + * + * At the same time the data structure is partially flattened as much as + * it's possible without increasing the number of recursive calls. + */ + void + assign(unsigned from, unsigned to) + { + if (from != to) { + assert(from < n); + + if (is[from] != from) + assign(is[from], to); + + is[from] = to; + } + } + + unsigned *is; + unsigned n; + }; + + /** + * Representation of a data dependency between two instructions in the + * program. + * @{ + */ + struct dependency { + /** + * No dependency information. + */ + dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN), + unordered(TGL_SBID_NULL), id(0), + exec_all(false) {} + + /** + * Construct a dependency on the in-order instruction with the provided + * ordered_address instruction counter. + */ + dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) : + ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0), + exec_all(exec_all) {} + + /** + * Construct a dependency on the out-of-order instruction with the + * specified synchronization token. + */ + dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) : + ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id), + exec_all(exec_all) {} + + /** + * Synchronization mode of in-order dependency, or zero if no in-order + * dependency is present. + */ + tgl_regdist_mode ordered; + + /** + * Instruction counter of in-order dependency. + * + * For a dependency part of a different block in the program, this is + * relative to the specific control flow path taken between the + * dependency and the current block: It is the ordered_address such that + * the difference between it and the ordered_address of the first + * instruction of the current block is exactly the number of in-order + * instructions across that control flow path. It is not guaranteed to + * be equal to the local ordered_address of the generating instruction + * [as returned by ordered_inst_addresses()], except for block-local + * dependencies. + */ + ordered_address jp; + + /** + * Synchronization mode of unordered dependency, or zero if no unordered + * dependency is present. + */ + tgl_sbid_mode unordered; + + /** Synchronization token of out-of-order dependency. */ + unsigned id; + + /** + * Whether the dependency could be run with execution masking disabled, + * which might lead to the unwanted execution of the generating + * instruction in cases where a BB is executed with all channels + * disabled due to hardware bug GEN:BUG:1407528679. + */ + bool exec_all; + + /** + * Trivial in-order dependency that's always satisfied. + * + * Note that unlike a default-constructed dependency() which is also + * trivially satisfied, this is considered to provide dependency + * information and can be used to clear a previously pending dependency + * via shadow(). + */ + static const dependency done; + + friend bool + operator==(const dependency &dep0, const dependency &dep1) + { + return dep0.ordered == dep1.ordered && + dep0.jp == dep1.jp && + dep0.unordered == dep1.unordered && + dep0.id == dep1.id && + dep0.exec_all == dep1.exec_all; + } + + friend bool + operator!=(const dependency &dep0, const dependency &dep1) + { + return !(dep0 == dep1); + } + }; + + const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false); + + /** + * Return whether \p dep contains any dependency information. + */ + bool + is_valid(const dependency &dep) + { + return dep.ordered || dep.unordered; + } + + /** + * Combine \p dep0 and \p dep1 into a single dependency object that is only + * satisfied when both original dependencies are satisfied. This might + * involve updating the equivalence relation \p eq in order to make sure + * that both out-of-order dependencies are assigned the same hardware SBID + * as synchronization token. + */ + dependency + merge(equivalence_relation &eq, + const dependency &dep0, const dependency &dep1) + { + dependency dep; + + if (dep0.ordered || dep1.ordered) { + dep.ordered = dep0.ordered | dep1.ordered; + dep.jp = MAX2(dep0.jp, dep1.jp); + } + + if (dep0.unordered || dep1.unordered) { + dep.unordered = dep0.unordered | dep1.unordered; + dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id, + dep1.unordered ? dep1.id : dep0.id); + } + + dep.exec_all = dep0.exec_all || dep1.exec_all; + + return dep; + } + + /** + * Override dependency information of \p dep0 with that of \p dep1. + */ + dependency + shadow(const dependency &dep0, const dependency &dep1) + { + return is_valid(dep1) ? dep1 : dep0; + } + + /** + * Translate dependency information across the program. + * + * This returns a dependency on the same instruction translated to the + * ordered_address space of a different block. The correct shift for + * transporting a dependency across an edge of the CFG is the difference + * between the local ordered_address of the first instruction of the target + * block and the local ordered_address of the instruction immediately after + * the end of the origin block. + */ + dependency + transport(dependency dep, int delta) + { + if (dep.ordered && dep.jp > INT_MIN) + dep.jp += delta; + + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction reading the same register location. + */ + dependency + dependency_for_read(dependency dep) + { + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** + * Return simplified dependency removing any synchronization modes not + * applicable to an instruction \p inst writing the same register location. + */ + dependency + dependency_for_write(const fs_inst *inst, dependency dep) + { + if (!is_unordered(inst)) + dep.ordered &= TGL_REGDIST_DST; + return dep; + } + + /** @} */ + + /** + * Scoreboard representation. This keeps track of the data dependencies of + * registers with GRF granularity. + */ + class scoreboard { + public: + /** + * Look up the most current data dependency for register \p r. + */ + dependency + get(const fs_reg &r) const + { + if (const dependency *p = const_cast(this)->dep(r)) + return *p; + else + return dependency(); + } + + /** + * Specify the most current data dependency for register \p r. + */ + void + set(const fs_reg &r, const dependency &d) + { + if (dependency *p = dep(r)) + *p = d; + } + + /** + * Component-wise merge() of corresponding dependencies from two + * scoreboard objects. \sa merge(). + */ + friend scoreboard + merge(equivalence_relation &eq, + const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep); + + for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++) + sb.accum_deps[i] = merge(eq, sb0.accum_deps[i], sb1.accum_deps[i]); + + return sb; + } + + /** + * Component-wise shadow() of corresponding dependencies from two + * scoreboard objects. \sa shadow(). + */ + friend scoreboard + shadow(const scoreboard &sb0, const scoreboard &sb1) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]); + + sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep); + + for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++) + sb.accum_deps[i] = shadow(sb0.accum_deps[i], sb1.accum_deps[i]); + + return sb; + } + + /** + * Component-wise transport() of dependencies from a scoreboard + * object. \sa transport(). + */ + friend scoreboard + transport(const scoreboard &sb0, int delta) + { + scoreboard sb; + + for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++) + sb.grf_deps[i] = transport(sb0.grf_deps[i], delta); + + sb.addr_dep = transport(sb0.addr_dep, delta); + + for (unsigned i = 0; i < ARRAY_SIZE(sb.accum_deps); i++) + sb.accum_deps[i] = transport(sb0.accum_deps[i], delta); + + return sb; + } + + friend bool + operator==(const scoreboard &sb0, const scoreboard &sb1) + { + for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) { + if (sb0.grf_deps[i] != sb1.grf_deps[i]) + return false; + } + + if (sb0.addr_dep != sb1.addr_dep) + return false; + + for (unsigned i = 0; i < ARRAY_SIZE(sb0.accum_deps); i++) { + if (sb0.accum_deps[i] != sb1.accum_deps[i]) + return false; + } + + return true; + } + + friend bool + operator!=(const scoreboard &sb0, const scoreboard &sb1) + { + return !(sb0 == sb1); + } + + private: + dependency grf_deps[BRW_MAX_GRF]; + dependency addr_dep; + dependency accum_deps[10]; + + dependency * + dep(const fs_reg &r) + { + const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE : + reg_offset(r) / REG_SIZE); + + return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] : + r.file == MRF ? &grf_deps[GEN7_MRF_HACK_START + reg] : + r.file == ARF && reg >= BRW_ARF_ADDRESS && + reg < BRW_ARF_ACCUMULATOR ? &addr_dep : + r.file == ARF && reg >= BRW_ARF_ACCUMULATOR && + reg < BRW_ARF_FLAG ? &accum_deps[ + reg - BRW_ARF_ACCUMULATOR] : + NULL); + } + }; + + /** + * Dependency list handling. + * @{ + */ + struct dependency_list { + dependency_list() : deps(NULL), n(0) {} + + ~dependency_list() + { + free(deps); + } + + void + push_back(const dependency &dep) + { + deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps)); + deps[n++] = dep; + } + + unsigned + size() const + { + return n; + } + + const dependency & + operator[](unsigned i) const + { + assert(i < n); + return deps[i]; + } + + dependency & + operator[](unsigned i) + { + assert(i < n); + return deps[i]; + } + + private: + dependency_list(const dependency_list &); + dependency_list & + operator=(const dependency_list &); + + dependency *deps; + unsigned n; + }; + + /** + * Add dependency \p dep to the list of dependencies of an instruction + * \p deps. + */ + void + add_dependency(const unsigned *ids, dependency_list &deps, dependency dep) + { + if (is_valid(dep)) { + /* Translate the unordered dependency token first in order to keep + * the list minimally redundant. + */ + if (dep.unordered) + dep.id = ids[dep.id]; + + /* Try to combine the specified dependency with any existing ones. */ + for (unsigned i = 0; i < deps.size(); i++) { + /* Don't combine otherwise matching dependencies if there is an + * exec_all mismatch which would cause a SET dependency to gain an + * exec_all flag, since that would prevent it from being baked + * into the instruction we want to allocate an SBID for. + */ + if (deps[i].exec_all != dep.exec_all && + (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) && + (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET))) + continue; + + if (dep.ordered && deps[i].ordered) { + deps[i].jp = MAX2(deps[i].jp, dep.jp); + deps[i].ordered |= dep.ordered; + deps[i].exec_all |= dep.exec_all; + dep.ordered = TGL_REGDIST_NULL; + } + + if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) { + deps[i].unordered |= dep.unordered; + deps[i].exec_all |= dep.exec_all; + dep.unordered = TGL_SBID_NULL; + } + } + + /* Add it to the end of the list if necessary. */ + if (is_valid(dep)) + deps.push_back(dep); + } + } + + /** + * Construct a tgl_swsb annotation encoding any ordered dependencies from + * the dependency list \p deps of an instruction with ordered_address \p + * jp. If \p exec_all is false only dependencies known to be executed with + * channel masking applied will be considered in the calculation. + */ + tgl_swsb + ordered_dependency_swsb(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + unsigned min_dist = ~0u; + + for (unsigned i = 0; i < deps.size(); i++) { + if (deps[i].ordered && exec_all >= deps[i].exec_all) { + const unsigned dist = jp - deps[i].jp; + const unsigned max_dist = 10; + assert(jp > deps[i].jp); + if (dist <= max_dist) + min_dist = MIN3(min_dist, dist, 7); + } + } + + return { min_dist == ~0u ? 0 : min_dist }; + } + + /** + * Return whether the dependency list \p deps of an instruction with + * ordered_address \p jp has any non-trivial ordered dependencies. If \p + * exec_all is false only dependencies known to be executed with channel + * masking applied will be considered in the calculation. + */ + bool + find_ordered_dependency(const dependency_list &deps, + const ordered_address &jp, + bool exec_all) + { + return ordered_dependency_swsb(deps, jp, exec_all).regdist; + } + + /** + * Return the full tgl_sbid_mode bitset for the first unordered dependency + * on the list \p deps that matches the specified tgl_sbid_mode, or zero if + * no such dependency is present. If \p exec_all is false only + * dependencies known to be executed with channel masking applied will be + * considered in the calculation. + */ + tgl_sbid_mode + find_unordered_dependency(const dependency_list &deps, + tgl_sbid_mode unordered, + bool exec_all) + { + if (unordered) { + for (unsigned i = 0; i < deps.size(); i++) { + if ((unordered & deps[i].unordered) && + exec_all >= deps[i].exec_all) + return deps[i].unordered; + } + } + + return TGL_SBID_NULL; + } + + /** + * Return the tgl_sbid_mode bitset of an unordered dependency from the list + * \p deps that can be represented directly in the SWSB annotation of the + * instruction without additional SYNC instructions, or zero if no such + * dependency is present. + */ + tgl_sbid_mode + baked_unordered_dependency_mode(const fs_inst *inst, + const dependency_list &deps, + const ordered_address &jp) + { + const bool exec_all = inst->force_writemask_all; + const bool has_ordered = find_ordered_dependency(deps, jp, exec_all); + + if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all)) + return find_unordered_dependency(deps, TGL_SBID_SET, exec_all); + else if (has_ordered && is_unordered(inst)) + return TGL_SBID_NULL; + else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) && + (!has_ordered || !is_unordered(inst))) + return find_unordered_dependency(deps, TGL_SBID_DST, exec_all); + else if (!has_ordered) + return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all); + else + return TGL_SBID_NULL; + } + + /** @} */ + + /** + * Shader instruction dependency calculation. + * @{ + */ + + /** + * Update scoreboard object \p sb to account for the execution of + * instruction \p inst. + */ + void + update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps, + const fs_inst *inst, unsigned ip, scoreboard &sb) + { + const bool exec_all = inst->force_writemask_all; + + /* Track any source registers that may be fetched asynchronously by this + * instruction, otherwise clear the dependency in order to avoid + * subsequent redundant synchronization. + */ + for (unsigned i = 0; i < inst->sources; i++) { + const dependency rd_dep = + (inst->is_payload(i) || + inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) : + ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) : + dependency::done; + + for (unsigned j = 0; j < regs_read(inst, i); j++) + sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep); + } + + if (is_send(inst) && inst->base_mrf != -1) { + const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all); + + for (unsigned j = 0; j < inst->mlen; j++) + sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep); + } + + /* Track any destination registers of this instruction. */ + const dependency wr_dep = + is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) : + ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) : + dependency(); + + if (is_valid(wr_dep) && inst->dst.file != BAD_FILE && + !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep); + } + } + + /** + * Calculate scoreboard objects locally that represent any pending (and + * unconditionally resolved) dependencies at the end of each block of the + * program. + */ + scoreboard * + gather_block_scoreboards(const fs_visitor *shader, + const ordered_address *jps) + { + scoreboard *sbs = new scoreboard[shader->cfg->num_blocks]; + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) + update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]); + + return sbs; + } + + /** + * Propagate data dependencies globally through the control flow graph + * until a fixed point is reached. + * + * Calculates the set of dependencies potentially pending at the beginning + * of each block, and returns it as an array of scoreboard objects. + */ + scoreboard * + propagate_block_scoreboards(const fs_visitor *shader, + const ordered_address *jps, + equivalence_relation &eq) + { + const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps); + scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks]; + scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks]; + + for (bool progress = true; progress;) { + progress = false; + + foreach_block(block, shader->cfg) { + const scoreboard sb = shadow(in_sbs[block->num], + delta_sbs[block->num]); + + if (sb != out_sbs[block->num]) { + foreach_list_typed(bblock_link, child_link, link, + &block->children) { + scoreboard &in_sb = in_sbs[child_link->block->num]; + const int delta = + jps[child_link->block->start_ip] - jps[block->end_ip] + - ordered_unit(static_cast(block->end())); + + in_sb = merge(eq, in_sb, transport(sb, delta)); + } + + out_sbs[block->num] = sb; + progress = true; + } + } + } + + delete[] delta_sbs; + delete[] out_sbs; + + return in_sbs; + } + + /** + * Return the list of potential dependencies of each instruction in the + * shader based on the result of global dependency analysis. + */ + dependency_list * + gather_inst_dependencies(const fs_visitor *shader, + const ordered_address *jps) + { + equivalence_relation eq(num_instructions(shader)); + scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq); + const unsigned *ids = eq.flatten(); + dependency_list *deps = new dependency_list[num_instructions(shader)]; + unsigned ip = 0; + + foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + scoreboard &sb = sbs[block->num]; + + for (unsigned i = 0; i < inst->sources; i++) { + for (unsigned j = 0; j < regs_read(inst, i); j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(byte_offset(inst->src[i], REG_SIZE * j)))); + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->mlen; j++) + add_dependency(ids, deps[ip], dependency_for_read( + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + + if (is_unordered(inst)) + add_dependency(ids, deps[ip], + dependency(TGL_SBID_SET, ip, exec_all)); + + if (!inst->no_dd_check) { + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + add_dependency(ids, deps[ip], dependency_for_write(inst, + sb.get(byte_offset(inst->dst, REG_SIZE * j)))); + } + } + + if (is_send(inst) && inst->base_mrf != -1) { + for (unsigned j = 0; j < inst->implied_mrf_writes(); j++) + add_dependency(ids, deps[ip], dependency_for_write(inst, + sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0)))); + } + } + + update_inst_scoreboard(shader, jps, inst, ip, sb); + ip++; + } + + delete[] sbs; + delete[] ids; + + return deps; + } + + /** @} */ + + /** + * Allocate SBID tokens to track the execution of every out-of-order + * instruction of the shader. + */ + dependency_list * + allocate_inst_dependencies(const fs_visitor *shader, + const dependency_list *deps0) + { + /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in + * shaders with a large number of SEND messages. + */ + + /* Allocate an unordered dependency ID to hardware SBID translation + * table with as many entries as instructions there are in the shader, + * which is the maximum number of unordered IDs we can find in the + * program. + */ + unsigned *ids = new unsigned[num_instructions(shader)]; + for (unsigned ip = 0; ip < num_instructions(shader); ip++) + ids[ip] = ~0u; + + dependency_list *deps1 = new dependency_list[num_instructions(shader)]; + unsigned next_id = 0; + + for (unsigned ip = 0; ip < num_instructions(shader); ip++) { + for (unsigned i = 0; i < deps0[ip].size(); i++) { + const dependency &dep = deps0[ip][i]; + + if (dep.unordered && ids[dep.id] == ~0u) + ids[dep.id] = (next_id++) & 0xf; + + add_dependency(ids, deps1[ip], dep); + } + } + + delete[] ids; + + return deps1; + } + + /** + * Emit dependency information provided by \p deps into the shader, + * inserting additional SYNC instructions for dependencies that can't be + * represented directly by annotating existing instructions. + */ + void + emit_inst_dependencies(fs_visitor *shader, + const ordered_address *jps, + const dependency_list *deps) + { + unsigned ip = 0; + + foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) { + const bool exec_all = inst->force_writemask_all; + tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all); + const tgl_sbid_mode unordered_mode = + baked_unordered_dependency_mode(inst, deps[ip], jps[ip]); + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.unordered) { + if (unordered_mode == dep.unordered && + exec_all >= dep.exec_all && !swsb.mode) { + /* Bake unordered dependency into the instruction's SWSB if + * possible, except in cases where the current instruction + * isn't marked NoMask but the dependency is, since that + * might lead to data coherency issues due to + * GEN:BUG:1407528679. + */ + swsb.sbid = dep.id; + swsb.mode = dep.unordered; + } else { + /* Emit dependency into the SWSB of an extra SYNC + * instruction. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched.sbid = dep.id; + sync->sched.mode = dep.unordered; + assert(!(sync->sched.mode & TGL_SBID_SET)); + } + } + } + + for (unsigned i = 0; i < deps[ip].size(); i++) { + const dependency &dep = deps[ip][i]; + + if (dep.ordered && dep.exec_all > exec_all && + find_ordered_dependency(deps[ip], jps[ip], true)) { + /* If the current instruction is not marked NoMask but an + * ordered dependency is, perform the synchronization as a + * separate NoMask SYNC instruction in order to avoid data + * coherency issues due to GEN:BUG:1407528679. The similar + * scenario with unordered dependencies should have been + * handled above. + */ + const fs_builder ibld = fs_builder(shader, block, inst) + .exec_all().group(1, 0); + fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(), + brw_imm_ud(TGL_SYNC_NOP)); + sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true); + break; + } + } + + /* Update the IR. */ + inst->sched = swsb; + inst->no_dd_check = inst->no_dd_clear = false; + ip++; + } + } +} + +bool +fs_visitor::lower_scoreboard() +{ + if (devinfo->gen >= 12) { + const ordered_address *jps = ordered_inst_addresses(this); + const dependency_list *deps0 = gather_inst_dependencies(this, jps); + const dependency_list *deps1 = allocate_inst_dependencies(this, deps0); + emit_inst_dependencies(this, jps, deps1); + delete[] deps1; + delete[] deps0; + delete[] jps; + } + + return true; +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_fs_visitor.cpp mesa-20.0.8/src/intel/compiler/brw_fs_visitor.cpp --- mesa-19.2.8/src/intel/compiler/brw_fs_visitor.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_fs_visitor.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -122,6 +122,7 @@ wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0; memset(wm_prog_data->urb_setup, -1, sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX); + brw_compute_urb_setup_index(wm_prog_data); /* We don't have any uniforms. */ stage_prog_data->nr_params = 0; @@ -176,11 +177,11 @@ const fs_reg xstart(negate(brw_vec1_grf(1, 0))); const fs_reg ystart(negate(brw_vec1_grf(1, 1))); - if (devinfo->has_pln && dispatch_width == 16) { - for (unsigned i = 0; i < 2; i++) { - abld.half(i).ADD(half(offset(delta_xy, abld, i), 0), + if (devinfo->has_pln) { + for (unsigned i = 0; i < dispatch_width / 8; i++) { + abld.half(i).ADD(half(offset(delta_xy, abld, 0), i), half(this->pixel_x, i), xstart); - abld.half(i).ADD(half(offset(delta_xy, abld, i), 1), + abld.half(i).ADD(half(offset(delta_xy, abld, 1), i), half(this->pixel_y, i), ystart); } } else { @@ -194,12 +195,70 @@ */ this->wpos_w = vgrf(glsl_type::float_type); abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, - interp_reg(VARYING_SLOT_POS, 3)); + component(interp_reg(VARYING_SLOT_POS, 3), 0)); /* Compute the pixel 1/W value from wpos.w. */ this->pixel_w = vgrf(glsl_type::float_type); abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); } +static unsigned +brw_rnd_mode_from_nir(unsigned mode, unsigned *mask) +{ + unsigned brw_mode = 0; + *mask = 0; + + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) & + mode) { + brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT; + *mask |= BRW_CR0_RND_MODE_MASK; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) { + brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE; + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) { + brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE; + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) { + brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE; + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + } + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) + *mask |= BRW_CR0_FP16_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) + *mask |= BRW_CR0_FP32_DENORM_PRESERVE; + if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) + *mask |= BRW_CR0_FP64_DENORM_PRESERVE; + if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + *mask |= BRW_CR0_FP_MODE_MASK; + + return brw_mode; +} + +void +fs_visitor::emit_shader_float_controls_execution_mode() +{ + unsigned execution_mode = this->nir->info.float_controls_execution_mode; + if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE) + return; + + fs_builder abld = bld.annotate("shader floats control execution mode"); + unsigned mask = 0; + unsigned mode = brw_rnd_mode_from_nir(execution_mode, &mask); + abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(), + brw_imm_d(mode), brw_imm_d(mask)); +} + /** Emits the interpolation for the varying inputs. */ void fs_visitor::emit_interpolation_setup_gen6() @@ -270,8 +329,8 @@ struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data); for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { - this->delta_xy[i] = fetch_payload_reg( - bld, payload.barycentric_coord_reg[i], BRW_REGISTER_TYPE_F, 2); + this->delta_xy[i] = fetch_barycentric_reg( + bld, payload.barycentric_coord_reg[i]); } uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes & @@ -293,15 +352,17 @@ if (!(centroid_modes & (1 << i))) continue; + const fs_reg centroid_delta_xy = delta_xy[i]; const fs_reg &pixel_delta_xy = delta_xy[i - 1]; - for (unsigned q = 0; q < dispatch_width / 8; q++) { - for (unsigned c = 0; c < 2; c++) { - const unsigned idx = c + (q & 2) + (q & 1) * dispatch_width / 8; - set_predicate_inv( - BRW_PREDICATE_NORMAL, true, - bld.half(q).MOV(horiz_offset(delta_xy[i], idx * 8), - horiz_offset(pixel_delta_xy, idx * 8))); + delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + + for (unsigned c = 0; c < 2; c++) { + for (unsigned q = 0; q < dispatch_width / 8; q++) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.half(q).SEL(half(offset(delta_xy[i], bld, c), q), + half(offset(centroid_delta_xy, bld, c), q), + half(offset(pixel_delta_xy, bld, c), q))); } } } @@ -402,82 +463,6 @@ } void -fs_visitor::emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha) -{ - /* We need to compute alpha to coverage dithering manually in shader - * and replace sample mask store with the bitwise-AND of sample mask and - * alpha to coverage dithering. - * - * The following formula is used to compute final sample mask: - * m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) - * dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) | - * 0x0808 * (m & 2) | 0x0100 * (m & 1) - * sample_mask = sample_mask & dither_mask - * - * It gives a number of ones proportional to the alpha for 2, 4, 8 or 16 - * least significant bits of the result: - * 0.0000 0000000000000000 - * 0.0625 0000000100000000 - * 0.1250 0001000000010000 - * 0.1875 0001000100010000 - * 0.2500 1000100010001000 - * 0.3125 1000100110001000 - * 0.3750 1001100010011000 - * 0.4375 1001100110011000 - * 0.5000 1010101010101010 - * 0.5625 1010101110101010 - * 0.6250 1011101010111010 - * 0.6875 1011101110111010 - * 0.7500 1110111011101110 - * 0.8125 1110111111101110 - * 0.8750 1111111011111110 - * 0.9375 1111111111111110 - * 1.0000 1111111111111111 - */ - const fs_builder abld = bld.annotate("compute alpha_to_coverage & " - "sample_mask"); - - /* clamp(src0_alpha, 0.f, 1.f) */ - const fs_reg float_tmp = abld.vgrf(BRW_REGISTER_TYPE_F); - set_saturate(true, abld.MOV(float_tmp, src0_alpha)); - - /* 16.0 * clamp(src0_alpha, 0.0, 1.0) */ - abld.MUL(float_tmp, float_tmp, brw_imm_f(16.0)); - - /* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) */ - const fs_reg m = abld.vgrf(BRW_REGISTER_TYPE_UW); - abld.MOV(m, float_tmp); - - /* 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) */ - const fs_reg int_tmp_1 = abld.vgrf(BRW_REGISTER_TYPE_UW); - const fs_reg shift_const = abld.vgrf(BRW_REGISTER_TYPE_UD); - abld.MOV(shift_const, brw_imm_d(0xfea80)); - abld.AND(int_tmp_1, m, brw_imm_uw(~3)); - abld.SHR(int_tmp_1, shift_const, int_tmp_1); - abld.AND(int_tmp_1, int_tmp_1, brw_imm_uw(0xf)); - abld.MUL(int_tmp_1, int_tmp_1, brw_imm_uw(0x1111)); - - /* 0x0808 * (m & 2) */ - const fs_reg int_tmp_2 = abld.vgrf(BRW_REGISTER_TYPE_UW); - abld.AND(int_tmp_2, m, brw_imm_uw(2)); - abld.MUL(int_tmp_2, int_tmp_2, brw_imm_uw(0x0808)); - - abld.OR(int_tmp_1, int_tmp_1, int_tmp_2); - - /* 0x0100 * (m & 1) */ - const fs_reg int_tmp_3 = abld.vgrf(BRW_REGISTER_TYPE_UW); - abld.AND(int_tmp_3, m, brw_imm_uw(1)); - abld.MUL(int_tmp_3, int_tmp_3, brw_imm_uw(0x0100)); - - abld.OR(int_tmp_1, int_tmp_1, int_tmp_3); - - /* sample_mask = sample_mask & dither_mask */ - const fs_reg mask = abld.vgrf(BRW_REGISTER_TYPE_UD); - abld.AND(mask, sample_mask, int_tmp_1); - sample_mask = mask; -} - -void fs_visitor::emit_fb_writes() { assert(stage == MESA_SHADER_FRAGMENT); @@ -513,14 +498,6 @@ (key->nr_color_regions > 1 && key->alpha_to_coverage && (sample_mask.file == BAD_FILE || devinfo->gen == 6)); - /* From the SKL PRM, Volume 7, "Alpha Coverage": - * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in - * hardware, regardless of the state setting for this feature." - */ - if (devinfo->gen > 6 && key->alpha_to_coverage && - sample_mask.file != BAD_FILE && this->outputs[0].file != BAD_FILE) - emit_alpha_to_coverage_workaround(offset(this->outputs[0], bld, 3)); - for (int target = 0; target < key->nr_color_regions; target++) { /* Skip over outputs that weren't written. */ if (this->outputs[target].file == BAD_FILE) @@ -561,6 +538,23 @@ inst->last_rt = true; inst->eot = true; + + if (devinfo->gen >= 11 && devinfo->gen <= 12 && + prog_data->dual_src_blend) { + /* The dual-source RT write messages fail to release the thread + * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs. + * + * XXX - Emit an extra single-source NULL RT-write marked LastRT in + * order to release the thread dependency without disabling + * SIMD32. + * + * The dual-source RT write messages may lead to hangs with SIMD16 + * dispatch on ICL due some unknown reasons, see + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183 + */ + limit_dispatch_width(8, "Dual source blending unsupported " + "in SIMD16 and SIMD32 modes.\n"); + } } void @@ -862,6 +856,7 @@ case 10: barrier_id_mask = 0x8f000000u; break; case 11: + case 12: barrier_id_mask = 0x7f000000u; break; default: unreachable("barrier is only available on gen >= 7"); @@ -890,13 +885,12 @@ void *mem_ctx, const brw_base_prog_key *key, struct brw_stage_prog_data *prog_data, - struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, int shader_time_index, const struct brw_vue_map *input_vue_map) : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), - key(key), gs_compile(NULL), prog_data(prog_data), prog(prog), + key(key), gs_compile(NULL), prog_data(prog_data), input_vue_map(input_vue_map), dispatch_width(dispatch_width), shader_time_index(shader_time_index), @@ -914,7 +908,7 @@ : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base.base), key(&c->key.base), gs_compile(c), - prog_data(&prog_data->base.base), prog(NULL), + prog_data(&prog_data->base.base), dispatch_width(8), shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) @@ -926,7 +920,10 @@ void fs_visitor::init() { - this->key_tex = &key->tex; + if (key) + this->key_tex = &key->tex; + else + this->key_tex = NULL; this->max_dispatch_width = 32; this->prog_data = this->stage_prog_data; diff -Nru mesa-19.2.8/src/intel/compiler/brw_gen_enum.h mesa-20.0.8/src/intel/compiler/brw_gen_enum.h --- mesa-19.2.8/src/intel/compiler/brw_gen_enum.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_gen_enum.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,62 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "util/macros.h" +#include "dev/gen_device_info.h" + +enum gen { + GEN4 = (1 << 0), + GEN45 = (1 << 1), + GEN5 = (1 << 2), + GEN6 = (1 << 3), + GEN7 = (1 << 4), + GEN75 = (1 << 5), + GEN8 = (1 << 6), + GEN9 = (1 << 7), + GEN10 = (1 << 8), + GEN11 = (1 << 9), + GEN12 = (1 << 10), + GEN_ALL = ~0 +}; + +#define GEN_LT(gen) ((gen) - 1) +#define GEN_GE(gen) (~GEN_LT(gen)) +#define GEN_LE(gen) (GEN_LT(gen) | (gen)) + +static enum gen +gen_from_devinfo(const struct gen_device_info *devinfo) +{ + switch (devinfo->gen) { + case 4: return devinfo->is_g4x ? GEN45 : GEN4; + case 5: return GEN5; + case 6: return GEN6; + case 7: return devinfo->is_haswell ? GEN75 : GEN7; + case 8: return GEN8; + case 9: return GEN9; + case 10: return GEN10; + case 11: return GEN11; + case 12: return GEN12; + default: + unreachable("not reached"); + } +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_inst.h mesa-20.0.8/src/intel/compiler/brw_inst.h --- mesa-19.2.8/src/intel/compiler/brw_inst.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_inst.h 2020-06-12 01:21:17.000000000 +0000 @@ -53,30 +53,39 @@ unsigned high, unsigned low, uint64_t value); -#define FC(name, high, low, assertions) \ +#define FC(name, hi4, lo4, hi12, lo12, assertions) \ static inline void \ brw_inst_set_##name(const struct gen_device_info *devinfo, \ brw_inst *inst, uint64_t v) \ { \ assert(assertions); \ - (void) devinfo; \ - brw_inst_set_bits(inst, high, low, v); \ + if (devinfo->gen >= 12) \ + brw_inst_set_bits(inst, hi12, lo12, v); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ } \ static inline uint64_t \ brw_inst_##name(const struct gen_device_info *devinfo, \ const brw_inst *inst) \ { \ assert(assertions); \ - (void) devinfo; \ - return brw_inst_bits(inst, high, low); \ + if (devinfo->gen >= 12) \ + return brw_inst_bits(inst, hi12, lo12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ } -/* A simple macro for fields which stay in the same place on all generations. */ -#define F(name, high, low) FC(name, high, low, true) +/* A simple macro for fields which stay in the same place on all generations, + * except for Gen12! + */ +#define F(name, hi4, lo4, hi12, lo12) FC(name, hi4, lo4, hi12, lo12, true) -#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ +#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12) \ unsigned high, low; \ - if (devinfo->gen >= 8) { \ + if (devinfo->gen >= 12) { \ + high = hi12; low = lo12; \ + } else if (devinfo->gen >= 8) { \ high = hi8; low = lo8; \ } else if (devinfo->gen >= 7) { \ high = hi7; low = lo7; \ @@ -95,157 +104,277 @@ * bit locations across generations. GCC appears to combine cases where the * bits are identical, removing some of the inefficiency. */ -#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)\ +#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12) \ static inline void \ brw_inst_set_##name(const struct gen_device_info *devinfo, \ brw_inst *inst, uint64_t value) \ { \ - BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12) \ brw_inst_set_bits(inst, high, low, value); \ } \ static inline uint64_t \ brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \ { \ - BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12, lo12) \ return brw_inst_bits(inst, high, low); \ } /* A macro for fields which moved as of Gen8+. */ -#define F8(name, gen4_high, gen4_low, gen8_high, gen8_low) \ +#define F8(name, gen4_high, gen4_low, gen8_high, gen8_low, \ + gen12_high, gen12_low) \ FF(name, \ /* 4: */ gen4_high, gen4_low, \ /* 4.5: */ gen4_high, gen4_low, \ /* 5: */ gen4_high, gen4_low, \ /* 6: */ gen4_high, gen4_low, \ /* 7: */ gen4_high, gen4_low, \ - /* 8: */ gen8_high, gen8_low); + /* 8: */ gen8_high, gen8_low, \ + /* 12: */ gen12_high, gen12_low); + +/* Macro for fields that gained extra discontiguous MSBs in Gen12 (specified + * by hi12ex-lo12ex). + */ +#define FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, assertions) \ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + assert(assertions); \ + if (devinfo->gen >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + if (hi12ex != -1 && lo12ex != -1) \ + brw_inst_set_bits(inst, hi12ex, lo12ex, value >> k); \ + brw_inst_set_bits(inst, hi12, lo12, value & ((1ull << k) - 1)); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \ +{ \ + assert(assertions); \ + if (devinfo->gen >= 12) { \ + const unsigned k = hi12 - lo12 + 1; \ + return (hi12ex == -1 || lo12ex == -1 ? 0 : \ + brw_inst_bits(inst, hi12ex, lo12ex) << k) | \ + brw_inst_bits(inst, hi12, lo12); \ + } else { \ + BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +#define FD(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12) \ + FFDC(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, \ + hi7, lo7, hi8, lo8, hi12ex, lo12ex, hi12, lo12, true) + +/* Macro for fields that didn't move across generations until Gen12, and then + * gained extra discontiguous bits. + */ +#define FDC(name, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) \ + FFDC(name, hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi4, lo4, hi12ex, lo12ex, hi12, lo12, assertions) + + +/* Macro for the 2-bit register file field, which on Gen12+ is stored as the + * variable length combination of an IsImm (hi12) bit and an additional file + * (lo12) bit. + */ +#define FI(name, hi4, lo4, hi8, lo8, hi12, lo12) \ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_inst *inst, uint64_t value) \ +{ \ + if (devinfo->gen >= 12) { \ + brw_inst_set_bits(inst, hi12, hi12, value >> 1); \ + if ((value >> 1) == 0) \ + brw_inst_set_bits(inst, lo12, lo12, value & 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1); \ + brw_inst_set_bits(inst, high, low, value); \ + } \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst) \ +{ \ + if (devinfo->gen >= 12) { \ + return (brw_inst_bits(inst, hi12, hi12) << 1) | \ + (brw_inst_bits(inst, hi12, hi12) == 0 ? \ + brw_inst_bits(inst, lo12, lo12) : 1); \ + } else { \ + BOUNDS(hi4, lo4, hi4, lo4, hi4, lo4, hi4, lo4, \ + hi4, lo4, hi8, lo8, -1, -1); \ + return brw_inst_bits(inst, high, low); \ + } \ +} + +/* Macro for fields that become a constant in Gen12+ not actually represented + * in the instruction. + */ +#define FK(name, hi4, lo4, const12) \ +static inline void \ +brw_inst_set_##name(const struct gen_device_info *devinfo, \ + brw_inst *inst, uint64_t v) \ +{ \ + if (devinfo->gen >= 12) \ + assert(v == (const12)); \ + else \ + brw_inst_set_bits(inst, hi4, lo4, v); \ +} \ +static inline uint64_t \ +brw_inst_##name(const struct gen_device_info *devinfo, \ + const brw_inst *inst) \ +{ \ + if (devinfo->gen >= 12) \ + return (const12); \ + else \ + return brw_inst_bits(inst, hi4, lo4); \ +} -F(src1_vstride, 120, 117) -F(src1_width, 116, 114) -F(src1_da16_swiz_w, 115, 114) -F(src1_da16_swiz_z, 113, 112) -F(src1_hstride, 113, 112) -F(src1_address_mode, 111, 111) +F(src1_vstride, /* 4+ */ 120, 117, /* 12+ */ 119, 116) +F(src1_width, /* 4+ */ 116, 114, /* 12+ */ 115, 113) +F(src1_da16_swiz_w, /* 4+ */ 115, 114, /* 12+ */ -1, -1) +F(src1_da16_swiz_z, /* 4+ */ 113, 112, /* 12+ */ -1, -1) +F(src1_hstride, /* 4+ */ 113, 112, /* 12+ */ 97, 96) +F(src1_address_mode, /* 4+ */ 111, 111, /* 12+ */ 112, 112) /** Src1.SrcMod @{ */ -F(src1_negate, 110, 110) -F(src1_abs, 109, 109) +F(src1_negate, /* 4+ */ 110, 110, /* 12+ */ 121, 121) +F(src1_abs, /* 4+ */ 109, 109, /* 12+ */ 120, 120) /** @} */ -F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105) -F(src1_da_reg_nr, 108, 101) -F(src1_da16_subreg_nr, 100, 100) -F(src1_da1_subreg_nr, 100, 96) -F(src1_da16_swiz_y, 99, 98) -F(src1_da16_swiz_x, 97, 96) -F8(src1_reg_hw_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91) -F8(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89) -F(src0_vstride, 88, 85) -F(src0_width, 84, 82) -F(src0_da16_swiz_w, 83, 82) -F(src0_da16_swiz_z, 81, 80) -F(src0_hstride, 81, 80) -F(src0_address_mode, 79, 79) +F8(src1_ia_subreg_nr, /* 4+ */ 108, 106, /* 8+ */ 108, 105, /* 12+ */ 111, 108) +F(src1_da_reg_nr, /* 4+ */ 108, 101, /* 12+ */ 111, 104) +F(src1_da16_subreg_nr, /* 4+ */ 100, 100, /* 12+ */ -1, -1) +F(src1_da1_subreg_nr, /* 4+ */ 100, 96, /* 12+ */ 103, 99) +F(src1_da16_swiz_y, /* 4+ */ 99, 98, /* 12+ */ -1, -1) +F(src1_da16_swiz_x, /* 4+ */ 97, 96, /* 12+ */ -1, -1) +F8(src1_reg_hw_type, /* 4+ */ 46, 44, /* 8+ */ 94, 91, /* 12+ */ 91, 88) +FI(src1_reg_file, /* 4+ */ 43, 42, /* 8+ */ 90, 89, /* 12+ */ 47, 98) +F(src1_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +F(src0_vstride, /* 4+ */ 88, 85, /* 12+ */ 87, 84) +F(src0_width, /* 4+ */ 84, 82, /* 12+ */ 83, 81) +F(src0_da16_swiz_w, /* 4+ */ 83, 82, /* 12+ */ -1, -1) +F(src0_da16_swiz_z, /* 4+ */ 81, 80, /* 12+ */ -1, -1) +F(src0_hstride, /* 4+ */ 81, 80, /* 12+ */ 65, 64) +F(src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ 80, 80) /** Src0.SrcMod @{ */ -F(src0_negate, 78, 78) -F(src0_abs, 77, 77) +F(src0_negate, /* 4+ */ 78, 78, /* 12+ */ 45, 45) +F(src0_abs, /* 4+ */ 77, 77, /* 12+ */ 44, 44) /** @} */ -F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73) -F(src0_da_reg_nr, 76, 69) -F(src0_da16_subreg_nr, 68, 68) -F(src0_da1_subreg_nr, 68, 64) -F(src0_da16_swiz_y, 67, 66) -F(src0_da16_swiz_x, 65, 64) -F(dst_address_mode, 63, 63) -F(dst_hstride, 62, 61) -F8(dst_ia_subreg_nr, /* 4+ */ 60, 58, /* 8+ */ 60, 57) -F(dst_da_reg_nr, 60, 53) -F(dst_da16_subreg_nr, 52, 52) -F(dst_da1_subreg_nr, 52, 48) -F(da16_writemask, 51, 48) /* Dst.ChanEn */ -F8(src0_reg_hw_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43) -F8(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41) -F8(dst_reg_hw_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37) -F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35) -F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34) +F8(src0_ia_subreg_nr, /* 4+ */ 76, 74, /* 8+ */ 76, 73, /* 12+ */ 79, 76) +F(src0_da_reg_nr, /* 4+ */ 76, 69, /* 12+ */ 79, 72) +F(src0_da16_subreg_nr, /* 4+ */ 68, 68, /* 12+ */ -1, -1) +F(src0_da1_subreg_nr, /* 4+ */ 68, 64, /* 12+ */ 71, 67) +F(src0_da16_swiz_y, /* 4+ */ 67, 66, /* 12+ */ -1, -1) +F(src0_da16_swiz_x, /* 4+ */ 65, 64, /* 12+ */ -1, -1) +F(dst_address_mode, /* 4+ */ 63, 63, /* 12+ */ 35, 35) +F(dst_hstride, /* 4+ */ 62, 61, /* 12+ */ 49, 48) +F8(dst_ia_subreg_nr, /* 4+ */ 60, 58, /* 8+ */ 60, 57, /* 12+ */ 63, 60) +F(dst_da_reg_nr, /* 4+ */ 60, 53, /* 12+ */ 63, 56) +F(dst_da16_subreg_nr, /* 4+ */ 52, 52, /* 12+ */ -1, -1) +F(dst_da1_subreg_nr, /* 4+ */ 52, 48, /* 12+ */ 55, 51) +F(da16_writemask, /* 4+ */ 51, 48, /* 12+ */ -1, -1) /* Dst.ChanEn */ +F8(src0_reg_hw_type, /* 4+ */ 41, 39, /* 8+ */ 46, 43, /* 12+ */ 43, 40) +FI(src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 46, 66) +F(src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) +F8(dst_reg_hw_type, /* 4+ */ 36, 34, /* 8+ */ 40, 37, /* 12+ */ 39, 36) +F8(dst_reg_file, /* 4+ */ 33, 32, /* 8+ */ 36, 35, /* 12+ */ 50, 50) +F8(mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) FF(flag_reg_nr, /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, /* 7: */ 90, 90, - /* 8: */ 33, 33) -F8(flag_subreg_nr, /* 4+ */ 89, 89, /* 8+ */ 32, 32) -F(saturate, 31, 31) -F(debug_control, 30, 30) -F(cmpt_control, 29, 29) -FC(branch_control, 28, 28, devinfo->gen >= 8) -FC(acc_wr_control, 28, 28, devinfo->gen >= 6) -FC(mask_control_ex, 28, 28, devinfo->is_g4x || devinfo->gen == 5) -F(cond_modifier, 27, 24) -FC(math_function, 27, 24, devinfo->gen >= 6) -F(exec_size, 23, 21) -F(pred_inv, 20, 20) -F(pred_control, 19, 16) -F(thread_control, 15, 14) -F(qtr_control, 13, 12) + /* 8: */ 33, 33, + /* 12: */ 23, 23) +F8(flag_subreg_nr, /* 4+ */ 89, 89, /* 8+ */ 32, 32, /* 12+ */ 22, 22) +F(saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +FC(branch_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->gen >= 8) +FC(acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33, devinfo->gen >= 6) +FC(mask_control_ex, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->is_g4x || devinfo->gen == 5) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +FC(math_function, /* 4+ */ 27, 24, /* 12+ */ 95, 92, devinfo->gen >= 6) +F(exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16) +F(pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) +F(pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24) +F(thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F(qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20) FF(nib_control, /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1, /* 7: */ 47, 47, - /* 8: */ 11, 11) -F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10) -F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9) -F(access_mode, 8, 8) + /* 8: */ 11, 11, + /* 12: */ 19, 19) +F8(no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F(swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8) +FK(access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) /* Bit 7 is Reserved (for future Opcode expansion) */ -F(opcode, 6, 0) +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) /** * Three-source instructions: * @{ */ -F(3src_src2_reg_nr, 125, 118) /* same in align1 */ -F(3src_a16_src2_subreg_nr, 117, 115) /* Extra discontiguous bit on CHV? */ -F(3src_a16_src2_swizzle, 114, 107) -F(3src_a16_src2_rep_ctrl, 106, 106) -F(3src_src1_reg_nr, 104, 97) /* same in align1 */ -F(3src_a16_src1_subreg_nr, 96, 94) /* Extra discontiguous bit on CHV? */ -F(3src_a16_src1_swizzle, 93, 86) -F(3src_a16_src1_rep_ctrl, 85, 85) -F(3src_src0_reg_nr, 83, 76) /* same in align1 */ -F(3src_a16_src0_subreg_nr, 75, 73) /* Extra discontiguous bit on CHV? */ -F(3src_a16_src0_swizzle, 72, 65) -F(3src_a16_src0_rep_ctrl, 64, 64) -F(3src_dst_reg_nr, 63, 56) /* same in align1 */ -F(3src_a16_dst_subreg_nr, 55, 53) -F(3src_a16_dst_writemask, 52, 49) -F8(3src_a16_nib_ctrl, 47, 47, 11, 11) /* only exists on IVB+ */ -F8(3src_a16_dst_hw_type, 45, 44, 48, 46) /* only exists on IVB+ */ -F8(3src_a16_src_hw_type, 43, 42, 45, 43) -F8(3src_src2_negate, 41, 41, 42, 42) -F8(3src_src2_abs, 40, 40, 41, 41) -F8(3src_src1_negate, 39, 39, 40, 40) -F8(3src_src1_abs, 38, 38, 39, 39) -F8(3src_src0_negate, 37, 37, 38, 38) -F8(3src_src0_abs, 36, 36, 37, 37) -F8(3src_a16_src1_type, -1, -1, 36, 36) -F8(3src_a16_src2_type, -1, -1, 35, 35) -F8(3src_a16_flag_reg_nr, 34, 34, 33, 33) -F8(3src_a16_flag_subreg_nr, 33, 33, 32, 32) +F(3src_src2_reg_nr, /* 4+ */ 125, 118, /* 12+ */ 127, 120) /* same in align1 */ +F(3src_a16_src2_subreg_nr, /* 4+ */ 117, 115, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src2_swizzle, /* 4+ */ 114, 107, /* 12+ */ -1, -1) +F(3src_a16_src2_rep_ctrl, /* 4+ */ 106, 106, /* 12+ */ -1, -1) +F(3src_src1_reg_nr, /* 4+ */ 104, 97, /* 12+ */ 111, 104) /* same in align1 */ +F(3src_a16_src1_subreg_nr, /* 4+ */ 96, 94, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src1_swizzle, /* 4+ */ 93, 86, /* 12+ */ -1, -1) +F(3src_a16_src1_rep_ctrl, /* 4+ */ 85, 85, /* 12+ */ -1, -1) +F(3src_src0_reg_nr, /* 4+ */ 83, 76, /* 12+ */ 79, 72) /* same in align1 */ +F(3src_a16_src0_subreg_nr, /* 4+ */ 75, 73, /* 12+ */ -1, -1) /* Extra discontiguous bit on CHV? */ +F(3src_a16_src0_swizzle, /* 4+ */ 72, 65, /* 12+ */ -1, -1) +F(3src_a16_src0_rep_ctrl, /* 4+ */ 64, 64, /* 12+ */ -1, -1) +F(3src_dst_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) /* same in align1 */ +F(3src_a16_dst_subreg_nr, /* 4+ */ 55, 53, /* 12+ */ -1, -1) +F(3src_a16_dst_writemask, /* 4+ */ 52, 49, /* 12+ */ -1, -1) +F8(3src_a16_nib_ctrl, /* 4+ */ 47, 47, /* 8+ */ 11, 11, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_dst_hw_type, /* 4+ */ 45, 44, /* 8+ */ 48, 46, /* 12+ */ -1, -1) /* only exists on IVB+ */ +F8(3src_a16_src_hw_type, /* 4+ */ 43, 42, /* 8+ */ 45, 43, /* 12+ */ -1, -1) +F8(3src_src2_negate, /* 4+ */ 41, 41, /* 8+ */ 42, 42, /* 12+ */ 85, 85) +F8(3src_src2_abs, /* 4+ */ 40, 40, /* 8+ */ 41, 41, /* 12+ */ 84, 84) +F8(3src_src1_negate, /* 4+ */ 39, 39, /* 8+ */ 40, 40, /* 12+ */ 87, 87) +F8(3src_src1_abs, /* 4+ */ 38, 38, /* 8+ */ 39, 39, /* 12+ */ 86, 86) +F8(3src_src0_negate, /* 4+ */ 37, 37, /* 8+ */ 38, 38, /* 12+ */ 45, 45) +F8(3src_src0_abs, /* 4+ */ 36, 36, /* 8+ */ 37, 37, /* 12+ */ 44, 44) +F8(3src_a16_src1_type, /* 4+ */ -1, -1, /* 8+ */ 36, 36, /* 12+ */ -1, -1) +F8(3src_a16_src2_type, /* 4+ */ -1, -1, /* 8+ */ 35, 35, /* 12+ */ -1, -1) +F8(3src_a16_flag_reg_nr, /* 4+ */ 34, 34, /* 8+ */ 33, 33, /* 12+ */ -1, -1) +F8(3src_a16_flag_subreg_nr, /* 4+ */ 33, 33, /* 8+ */ 32, 32, /* 12+ */ -1, -1) FF(3src_a16_dst_reg_file, /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1, /* 6: */ 32, 32, - /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1) -F(3src_saturate, 31, 31) -F(3src_debug_control, 30, 30) -F(3src_cmpt_control, 29, 29) -F(3src_acc_wr_control, 28, 28) -F(3src_cond_modifier, 27, 24) -F(3src_exec_size, 23, 21) -F(3src_pred_inv, 20, 20) -F(3src_pred_control, 19, 16) -F(3src_thread_control, 15, 14) -F(3src_qtr_control, 13, 12) -F8(3src_no_dd_check, 11, 11, 10, 10) -F8(3src_no_dd_clear, 10, 10, 9, 9) -F8(3src_mask_control, 9, 9, 34, 34) -F(3src_access_mode, 8, 8) + /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1, + /* 12: */ -1, -1) +F(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ 34, 34) +F(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 30, 30) +F(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) +F(3src_acc_wr_control, /* 4+ */ 28, 28, /* 12+ */ 33, 33) +F(3src_cond_modifier, /* 4+ */ 27, 24, /* 12+ */ 95, 92) +F(3src_exec_size, /* 4+ */ 23, 21, /* 12+ */ 18, 16) +F(3src_pred_inv, /* 4+ */ 20, 20, /* 12+ */ 28, 28) +F(3src_pred_control, /* 4+ */ 19, 16, /* 12+ */ 27, 24) +F(3src_thread_control, /* 4+ */ 15, 14, /* 12+ */ -1, -1) +F(3src_atomic_control, /* 4+ */ -1, -1, /* 12+ */ 32, 32) +F(3src_qtr_control, /* 4+ */ 13, 12, /* 12+ */ 21, 20) +F8(3src_no_dd_check, /* 4+ */ 11, 11, /* 8+ */ 10, 10, /* 12+ */ -1, -1) +F8(3src_no_dd_clear, /* 4+ */ 10, 10, /* 8+ */ 9, 9, /* 12+ */ -1, -1) +F8(3src_mask_control, /* 4+ */ 9, 9, /* 8+ */ 34, 34, /* 12+ */ 31, 31) +FK(3src_access_mode, /* 4+ */ 8, 8, /* 12+ */ BRW_ALIGN_1) +F(3src_swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8) /* Bit 7 is Reserved (for future Opcode expansion) */ -F(3src_opcode, 6, 0) +F(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) /** @} */ #define REG_TYPE(reg) \ @@ -275,34 +404,38 @@ */ /* Reserved 127:126 */ /* src2_reg_nr same in align16 */ -FC(3src_a1_src2_subreg_nr, 117, 113, devinfo->gen >= 10) -FC(3src_a1_src2_hstride, 112, 111, devinfo->gen >= 10) +FC(3src_a1_src2_subreg_nr, /* 4+ */ 117, 113, /* 12+ */ 119, 115, devinfo->gen >= 10) +FC(3src_a1_src2_hstride, /* 4+ */ 112, 111, /* 12+ */ 113, 112, devinfo->gen >= 10) /* Reserved 110:109. src2 vstride is an implied parameter */ -FC(3src_a1_src2_hw_type, 108, 106, devinfo->gen >= 10) +FC(3src_a1_src2_hw_type, /* 4+ */ 108, 106, /* 12+ */ 82, 80, devinfo->gen >= 10) /* Reserved 105 */ /* src1_reg_nr same in align16 */ -FC(3src_a1_src1_subreg_nr, 96, 92, devinfo->gen >= 10) -FC(3src_a1_src1_hstride, 91, 90, devinfo->gen >= 10) -FC(3src_a1_src1_vstride, 89, 88, devinfo->gen >= 10) -FC(3src_a1_src1_hw_type, 87, 85, devinfo->gen >= 10) +FC(3src_a1_src1_subreg_nr, /* 4+ */ 96, 92, /* 12+ */ 103, 99, devinfo->gen >= 10) +FC(3src_a1_src1_hstride, /* 4+ */ 91, 90, /* 12+ */ 97, 96, devinfo->gen >= 10) +FDC(3src_a1_src1_vstride, /* 4+ */ 89, 88, /* 12+ */ 91, 91, 83, 83, devinfo->gen >= 10) +FC(3src_a1_src1_hw_type, /* 4+ */ 87, 85, /* 12+ */ 90, 88, devinfo->gen >= 10) /* Reserved 84 */ /* src0_reg_nr same in align16 */ -FC(3src_a1_src0_subreg_nr, 75, 71, devinfo->gen >= 10) -FC(3src_a1_src0_hstride, 70, 69, devinfo->gen >= 10) -FC(3src_a1_src0_vstride, 68, 67, devinfo->gen >= 10) -FC(3src_a1_src0_hw_type, 66, 64, devinfo->gen >= 10) +FC(3src_a1_src0_subreg_nr, /* 4+ */ 75, 71, /* 12+ */ 71, 67, devinfo->gen >= 10) +FC(3src_a1_src0_hstride, /* 4+ */ 70, 69, /* 12+ */ 65, 64, devinfo->gen >= 10) +FDC(3src_a1_src0_vstride, /* 4+ */ 68, 67, /* 12+ */ 43, 43, 35, 35, devinfo->gen >= 10) +FC(3src_a1_src0_hw_type, /* 4+ */ 66, 64, /* 12+ */ 42, 40, devinfo->gen >= 10) /* dst_reg_nr same in align16 */ -FC(3src_a1_dst_subreg_nr, 55, 54, devinfo->gen >= 10) -FC(3src_a1_special_acc, 55, 52, devinfo->gen >= 10) /* aliases dst_subreg_nr */ +FC(3src_a1_dst_subreg_nr, /* 4+ */ 55, 54, /* 12+ */ 55, 54, devinfo->gen >= 10) +FC(3src_a1_special_acc, /* 4+ */ 55, 52, /* 12+ */ 54, 51, devinfo->gen >= 10) /* aliases dst_subreg_nr */ /* Reserved 51:50 */ -FC(3src_a1_dst_hstride, 49, 49, devinfo->gen >= 10) -FC(3src_a1_dst_hw_type, 48, 46, devinfo->gen >= 10) -FC(3src_a1_src2_reg_file, 45, 45, devinfo->gen >= 10) -FC(3src_a1_src1_reg_file, 44, 44, devinfo->gen >= 10) -FC(3src_a1_src0_reg_file, 43, 43, devinfo->gen >= 10) +FC(3src_a1_dst_hstride, /* 4+ */ 49, 49, /* 12+ */ 48, 48, devinfo->gen >= 10) +FC(3src_a1_dst_hw_type, /* 4+ */ 48, 46, /* 12+ */ 38, 36, devinfo->gen >= 10) +FI(3src_a1_src2_reg_file, /* 4+ */ -1, -1, /* 8+ */ 45, 45, /* 12+ */ 47, 114) +FC(3src_a1_src1_reg_file, /* 4+ */ 44, 44, /* 12+ */ 98, 98, devinfo->gen >= 10) +FI(3src_a1_src0_reg_file, /* 4+ */ -1, -1, /* 8+ */ 43, 43, /* 12+ */ 46, 66) + +F(3src_a1_src2_is_imm, /* 4+ */ -1, -1, /* 12+ */ 47, 47) +F(3src_a1_src0_is_imm, /* 4+ */ -1, -1, /* 12+ */ 46, 46) + /* Source Modifier fields same in align16 */ -FC(3src_a1_dst_reg_file, 36, 36, devinfo->gen >= 10) -FC(3src_a1_exec_type, 35, 35, devinfo->gen >= 10) +FC(3src_a1_dst_reg_file, /* 4+ */ 36, 36, /* 12+ */ 50, 50, devinfo->gen >= 10) +FC(3src_a1_exec_type, /* 4+ */ 35, 35, /* 12+ */ 39, 39, devinfo->gen >= 10) /* Fields below this same in align16 */ /** @} */ @@ -349,7 +482,10 @@ const brw_inst *insn) { assert(devinfo->gen >= 10); - return brw_inst_bits(insn, 82, 67); + if (devinfo->gen >= 12) + return brw_inst_bits(insn, 79, 64); + else + return brw_inst_bits(insn, 82, 67); } static inline uint16_t @@ -357,7 +493,10 @@ const brw_inst *insn) { assert(devinfo->gen >= 10); - return brw_inst_bits(insn, 124, 109); + if (devinfo->gen >= 12) + return brw_inst_bits(insn, 127, 112); + else + return brw_inst_bits(insn, 124, 109); } static inline void @@ -365,7 +504,10 @@ brw_inst *insn, uint16_t value) { assert(devinfo->gen >= 10); - brw_inst_set_bits(insn, 82, 67, value); + if (devinfo->gen >= 12) + brw_inst_set_bits(insn, 79, 64, value); + else + brw_inst_set_bits(insn, 82, 67, value); } static inline void @@ -373,7 +515,10 @@ brw_inst *insn, uint16_t value) { assert(devinfo->gen >= 10); - brw_inst_set_bits(insn, 124, 109, value); + if (devinfo->gen >= 12) + brw_inst_set_bits(insn, 127, 112, value); + else + brw_inst_set_bits(insn, 124, 109, value); } /** @} */ @@ -387,6 +532,9 @@ { assert(devinfo->gen >= 6); + if (devinfo->gen >= 12) + brw_inst_set_src1_is_imm(devinfo, inst, 1); + if (devinfo->gen >= 8) { brw_inst_set_bits(inst, 95, 64, (uint32_t)value); } else { @@ -414,6 +562,9 @@ { assert(devinfo->gen >= 6); + if (devinfo->gen >= 12) + brw_inst_set_src0_is_imm(devinfo, inst, 1); + if (devinfo->gen >= 8) { brw_inst_set_bits(inst, 127, 96, (uint32_t)value); } else { @@ -454,24 +605,30 @@ FJ(gen6_jump_count, 63, 48, devinfo->gen == 6) FJ(gen4_jump_count, 111, 96, devinfo->gen < 6) -FC(gen4_pop_count, 115, 112, devinfo->gen < 6) +FC(gen4_pop_count, /* 4+ */ 115, 112, /* 12+ */ -1, -1, devinfo->gen < 6) /** @} */ /** * SEND instructions: * @{ */ -FC(send_ex_desc_ia_subreg_nr, 82, 80, devinfo->gen >= 9) -FC(send_src0_address_mode, 79, 79, devinfo->gen >= 9) -FC(send_sel_reg32_desc, 77, 77, devinfo->gen >= 9) -FC(send_sel_reg32_ex_desc, 61, 61, devinfo->gen >= 9) -FC(send_src1_reg_nr, 51, 44, devinfo->gen >= 9) -FC(send_src1_reg_file, 36, 36, devinfo->gen >= 9) -FC(send_dst_reg_file, 35, 35, devinfo->gen >= 9) +FC(send_ex_desc_ia_subreg_nr, /* 4+ */ 82, 80, /* 12+ */ 42, 40, devinfo->gen >= 9) +FC(send_src0_address_mode, /* 4+ */ 79, 79, /* 12+ */ -1, -1, devinfo->gen >= 9) +FC(send_sel_reg32_desc, /* 4+ */ 77, 77, /* 12+ */ 48, 48, devinfo->gen >= 9) +FC(send_sel_reg32_ex_desc, /* 4+ */ 61, 61, /* 12+ */ 49, 49, devinfo->gen >= 9) +F8(send_src0_reg_file, /* 4+ */ 38, 37, /* 8+ */ 42, 41, /* 12+ */ 66, 66) +FC(send_src1_reg_nr, /* 4+ */ 51, 44, /* 12+ */ 111, 104, devinfo->gen >= 9) +FC(send_src1_reg_file, /* 4+ */ 36, 36, /* 12+ */ 98, 98, devinfo->gen >= 9) +FC(send_dst_reg_file, /* 4+ */ 35, 35, /* 12+ */ 50, 50, devinfo->gen >= 9) /** @} */ /* Message descriptor bits */ #define MD(x) ((x) + 96) +#define MD12(x) ((x) >= 30 ? (x) - 30 + 122 : \ + (x) >= 25 ? (x) - 25 + 67 : \ + (x) >= 20 ? (x) - 20 + 51 : \ + (x) >= 11 ? (x) - 11 + 113 : \ + (x) - 0 + 81) /** * Set the SEND(C) message descriptor immediate. @@ -486,7 +643,13 @@ brw_inst_set_send_desc(const struct gen_device_info *devinfo, brw_inst *inst, uint32_t value) { - if (devinfo->gen >= 9) { + if (devinfo->gen >= 12) { + brw_inst_set_bits(inst, 123, 122, GET_BITS(value, 31, 30)); + brw_inst_set_bits(inst, 71, 67, GET_BITS(value, 29, 25)); + brw_inst_set_bits(inst, 55, 51, GET_BITS(value, 24, 20)); + brw_inst_set_bits(inst, 121, 113, GET_BITS(value, 19, 11)); + brw_inst_set_bits(inst, 91, 81, GET_BITS(value, 10, 0)); + } else if (devinfo->gen >= 9) { brw_inst_set_bits(inst, 126, 96, value); assert(value >> 31 == 0); } else if (devinfo->gen >= 5) { @@ -506,12 +669,19 @@ static inline uint32_t brw_inst_send_desc(const struct gen_device_info *devinfo, const brw_inst *inst) { - if (devinfo->gen >= 9) + if (devinfo->gen >= 12) { + return (brw_inst_bits(inst, 123, 122) << 30 | + brw_inst_bits(inst, 71, 67) << 25 | + brw_inst_bits(inst, 55, 51) << 20 | + brw_inst_bits(inst, 121, 113) << 11 | + brw_inst_bits(inst, 91, 81)); + } else if (devinfo->gen >= 9) { return brw_inst_bits(inst, 126, 96); - else if (devinfo->gen >= 5) + } else if (devinfo->gen >= 5) { return brw_inst_bits(inst, 124, 96); - else + } else { return brw_inst_bits(inst, 119, 96); + } } /** @@ -527,17 +697,39 @@ brw_inst_set_send_ex_desc(const struct gen_device_info *devinfo, brw_inst *inst, uint32_t value) { - assert(devinfo->gen >= 9); - if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC) { + if (devinfo->gen >= 12) { + brw_inst_set_bits(inst, 127, 124, GET_BITS(value, 31, 28)); + brw_inst_set_bits(inst, 97, 96, GET_BITS(value, 27, 26)); + brw_inst_set_bits(inst, 65, 64, GET_BITS(value, 25, 24)); + brw_inst_set_bits(inst, 47, 35, GET_BITS(value, 23, 11)); + brw_inst_set_bits(inst, 103, 99, GET_BITS(value, 10, 6)); + assert(GET_BITS(value, 5, 0) == 0); + } else { + assert(devinfo->gen >= 9); brw_inst_set_bits(inst, 94, 91, GET_BITS(value, 31, 28)); brw_inst_set_bits(inst, 88, 85, GET_BITS(value, 27, 24)); brw_inst_set_bits(inst, 83, 80, GET_BITS(value, 23, 20)); brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 19, 16)); assert(GET_BITS(value, 15, 0) == 0); + } +} + +/** + * Set the SENDS(C) message extended descriptor immediate. + * + * This doesn't include the SFID nor the EOT field that were considered to be + * part of the extended message descriptor by some versions of the BSpec, + * because they are present in the instruction even if the extended message + * descriptor is provided indirectly in a register, so we want to specify them + * separately. + */ +static inline void +brw_inst_set_sends_ex_desc(const struct gen_device_info *devinfo, + brw_inst *inst, uint32_t value) +{ + if (devinfo->gen >= 12) { + brw_inst_set_send_ex_desc(devinfo, inst, value); } else { - assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC); brw_inst_set_bits(inst, 95, 80, GET_BITS(value, 31, 16)); assert(GET_BITS(value, 15, 10) == 0); brw_inst_set_bits(inst, 67, 64, GET_BITS(value, 9, 6)); @@ -554,16 +746,33 @@ brw_inst_send_ex_desc(const struct gen_device_info *devinfo, const brw_inst *inst) { - assert(devinfo->gen >= 9); - if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC) { + if (devinfo->gen >= 12) { + return (brw_inst_bits(inst, 127, 124) << 28 | + brw_inst_bits(inst, 97, 96) << 26 | + brw_inst_bits(inst, 65, 64) << 24 | + brw_inst_bits(inst, 47, 35) << 11 | + brw_inst_bits(inst, 103, 99) << 6); + } else { + assert(devinfo->gen >= 9); return (brw_inst_bits(inst, 94, 91) << 28 | brw_inst_bits(inst, 88, 85) << 24 | brw_inst_bits(inst, 83, 80) << 20 | brw_inst_bits(inst, 67, 64) << 16); + } +} + +/** + * Get the SENDS(C) message extended descriptor immediate. + * + * \sa brw_inst_set_send_ex_desc(). + */ +static inline uint32_t +brw_inst_sends_ex_desc(const struct gen_device_info *devinfo, + const brw_inst *inst) +{ + if (devinfo->gen >= 12) { + return brw_inst_send_ex_desc(devinfo, inst); } else { - assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || - brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC); return (brw_inst_bits(inst, 95, 80) << 16 | brw_inst_bits(inst, 67, 64) << 6); } @@ -573,53 +782,68 @@ * Fields for SEND messages: * @{ */ -F(eot, 127, 127) +F(eot, /* 4+ */ 127, 127, /* 12+ */ 34, 34) FF(mlen, /* 4: */ 119, 116, /* 4.5: */ 119, 116, /* 5: */ 124, 121, /* 6: */ 124, 121, /* 7: */ 124, 121, - /* 8: */ 124, 121); + /* 8: */ 124, 121, + /* 12: */ MD12(28), MD12(25)); FF(rlen, /* 4: */ 115, 112, /* 4.5: */ 115, 112, /* 5: */ 120, 116, /* 6: */ 120, 116, /* 7: */ 120, 116, - /* 8: */ 120, 116); + /* 8: */ 120, 116, + /* 12: */ MD12(24), MD12(20)); FF(header_present, /* 4: doesn't exist */ -1, -1, -1, -1, /* 5: */ 115, 115, /* 6: */ 115, 115, /* 7: */ 115, 115, - /* 8: */ 115, 115) -F(gateway_notify, MD(16), MD(15)) -FF(function_control, + /* 8: */ 115, 115, + /* 12: */ MD12(19), MD12(19)) +F(gateway_notify, /* 4+ */ MD(16), MD(15), /* 12+ */ -1, -1) +FD(function_control, /* 4: */ 111, 96, /* 4.5: */ 111, 96, /* 5: */ 114, 96, /* 6: */ 114, 96, /* 7: */ 114, 96, - /* 8: */ 114, 96) + /* 8: */ 114, 96, + /* 12: */ MD12(18), MD12(11), MD12(10), MD12(0)) FF(gateway_subfuncid, /* 4: */ MD(1), MD(0), /* 4.5: */ MD(1), MD(0), /* 5: */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */ /* 6: */ MD(2), MD(0), /* 7: */ MD(2), MD(0), - /* 8: */ MD(2), MD(0)) + /* 8: */ MD(2), MD(0), + /* 12: */ MD12(2), MD12(0)) FF(sfid, /* 4: */ 123, 120, /* called msg_target */ /* 4.5 */ 123, 120, /* 5: */ 95, 92, /* 6: */ 27, 24, /* 7: */ 27, 24, - /* 8: */ 27, 24) + /* 8: */ 27, 24, + /* 12: */ 95, 92) FF(null_rt, /* 4-7: */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - /* 8: */ 80, 80) /* actually only Gen11+ */ -FC(base_mrf, 27, 24, devinfo->gen < 6); + /* 8: */ 80, 80, + /* 12: */ 44, 44) /* actually only Gen11+ */ +FC(base_mrf, /* 4+ */ 27, 24, /* 12+ */ -1, -1, devinfo->gen < 6); +FF(send_rta_index, + /* 4: */ -1, -1, + /* 4.5 */ -1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ -1, -1, + /* 8: */ -1, -1, + /* 12: */ 38, 36) /** @} */ /** @@ -629,43 +853,47 @@ FF(urb_per_slot_offset, /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1, /* 7: */ MD(16), MD(16), - /* 8: */ MD(17), MD(17)) -FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8) -FC(urb_complete, MD(15), MD(15), devinfo->gen < 8) -FC(urb_used, MD(14), MD(14), devinfo->gen < 7) -FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7) + /* 8: */ MD(17), MD(17), + /* 12: */ MD12(17), MD12(17)) +FC(urb_channel_mask_present, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->gen >= 8) +FC(urb_complete, /* 4+ */ MD(15), MD(15), /* 12+ */ -1, -1, devinfo->gen < 8) +FC(urb_used, /* 4+ */ MD(14), MD(14), /* 12+ */ -1, -1, devinfo->gen < 7) +FC(urb_allocate, /* 4+ */ MD(13), MD(13), /* 12+ */ -1, -1, devinfo->gen < 7) FF(urb_swizzle_control, /* 4: */ MD(11), MD(10), /* 4.5: */ MD(11), MD(10), /* 5: */ MD(11), MD(10), /* 6: */ MD(11), MD(10), /* 7: */ MD(14), MD(14), - /* 8: */ MD(15), MD(15)) -FF(urb_global_offset, + /* 8: */ MD(15), MD(15), + /* 12: */ -1, -1) +FD(urb_global_offset, /* 4: */ MD( 9), MD(4), /* 4.5: */ MD( 9), MD(4), /* 5: */ MD( 9), MD(4), /* 6: */ MD( 9), MD(4), /* 7: */ MD(13), MD(3), - /* 8: */ MD(14), MD(4)) + /* 8: */ MD(14), MD(4), + /* 12: */ MD12(14), MD12(11), MD12(10), MD12(4)) FF(urb_opcode, /* 4: */ MD( 3), MD(0), /* 4.5: */ MD( 3), MD(0), /* 5: */ MD( 3), MD(0), /* 6: */ MD( 3), MD(0), /* 7: */ MD( 2), MD(0), - /* 8: */ MD( 3), MD(0)) + /* 8: */ MD( 3), MD(0), + /* 12: */ MD12(3), MD12(0)) /** @} */ /** * Gen4-5 math messages: * @{ */ -FC(math_msg_data_type, MD(7), MD(7), devinfo->gen < 6) -FC(math_msg_saturate, MD(6), MD(6), devinfo->gen < 6) -FC(math_msg_precision, MD(5), MD(5), devinfo->gen < 6) -FC(math_msg_signed_int, MD(4), MD(4), devinfo->gen < 6) -FC(math_msg_function, MD(3), MD(0), devinfo->gen < 6) +FC(math_msg_data_type, /* 4+ */ MD(7), MD(7), /* 12+ */ -1, -1, devinfo->gen < 6) +FC(math_msg_saturate, /* 4+ */ MD(6), MD(6), /* 12+ */ -1, -1, devinfo->gen < 6) +FC(math_msg_precision, /* 4+ */ MD(5), MD(5), /* 12+ */ -1, -1, devinfo->gen < 6) +FC(math_msg_signed_int, /* 4+ */ MD(4), MD(4), /* 12+ */ -1, -1, devinfo->gen < 6) +FC(math_msg_function, /* 4+ */ MD(3), MD(0), /* 12+ */ -1, -1, devinfo->gen < 6) /** @} */ /** @@ -677,24 +905,33 @@ /* 5: */ MD(17), MD(16), /* 6: */ MD(17), MD(16), /* 7: */ MD(18), MD(17), - /* 8: */ MD(18), MD(17)) + /* 8: */ MD(18), MD(17), + /* 12: */ MD12(18), MD12(17)) FF(sampler_msg_type, /* 4: */ MD(15), MD(14), /* 4.5: */ MD(15), MD(12), /* 5: */ MD(15), MD(12), /* 6: */ MD(15), MD(12), /* 7: */ MD(16), MD(12), - /* 8: */ MD(16), MD(12)) -FC(sampler_return_format, MD(13), MD(12), devinfo->gen == 4 && !devinfo->is_g4x) -F(sampler, MD(11), MD(8)) -F(binding_table_index, MD( 7), MD(0)) /* also used by other messages */ + /* 8: */ MD(16), MD(12), + /* 12: */ MD12(16), MD12(12)) +FC(sampler_return_format, /* 4+ */ MD(13), MD(12), /* 12+ */ -1, -1, devinfo->gen == 4 && !devinfo->is_g4x) +FD(sampler, + /* 4: */ MD(11), MD(8), + /* 4.5: */ MD(11), MD(8), + /* 5: */ MD(11), MD(8), + /* 6: */ MD(11), MD(8), + /* 7: */ MD(11), MD(8), + /* 8: */ MD(11), MD(8), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(8)) +F(binding_table_index, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) /* also used by other messages */ /** @} */ /** * Data port message function control bits: * @{ */ -FC(dp_category, MD(18), MD(18), devinfo->gen >= 7) +FC(dp_category, /* 4+ */ MD(18), MD(18), /* 12+ */ MD12(18), MD12(18), devinfo->gen >= 7) /* Gen4-5 store fields in different bits for read/write messages. */ FF(dp_read_msg_type, @@ -703,36 +940,41 @@ /* 5: */ MD(13), MD(11), /* 6: */ MD(16), MD(13), /* 7: */ MD(17), MD(14), - /* 8: */ MD(17), MD(14)) + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14)) FF(dp_write_msg_type, /* 4: */ MD(14), MD(12), /* 4.5: */ MD(14), MD(12), /* 5: */ MD(14), MD(12), /* 6: */ MD(16), MD(13), /* 7: */ MD(17), MD(14), - /* 8: */ MD(17), MD(14)) -FF(dp_read_msg_control, + /* 8: */ MD(17), MD(14), + /* 12: */ MD12(17), MD12(14)) +FD(dp_read_msg_control, /* 4: */ MD(11), MD( 8), /* 4.5: */ MD(10), MD( 8), /* 5: */ MD(10), MD( 8), /* 6: */ MD(12), MD( 8), /* 7: */ MD(13), MD( 8), - /* 8: */ MD(13), MD( 8)) -FF(dp_write_msg_control, + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FD(dp_write_msg_control, /* 4: */ MD(11), MD( 8), /* 4.5: */ MD(11), MD( 8), /* 5: */ MD(11), MD( 8), /* 6: */ MD(12), MD( 8), /* 7: */ MD(13), MD( 8), - /* 8: */ MD(13), MD( 8)) -FC(dp_read_target_cache, MD(15), MD(14), devinfo->gen < 6); + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) +FC(dp_read_target_cache, /* 4+ */ MD(15), MD(14), /* 12+ */ -1, -1, devinfo->gen < 6); FF(dp_write_commit, /* 4: */ MD(15), MD(15), /* 4.5: */ MD(15), MD(15), /* 5: */ MD(15), MD(15), /* 6: */ MD(17), MD(17), - /* 7+: does not exist */ -1, -1, -1, -1) + /* 7+: does not exist */ -1, -1, -1, -1, + /* 12: */ -1, -1) /* Gen6+ use the same bit locations for everything. */ FF(dp_msg_type, @@ -740,24 +982,33 @@ -1, -1, -1, -1, -1, -1, /* 6: */ MD(16), MD(13), /* 7: */ MD(17), MD(14), - /* 8: */ MD(18), MD(14)) -FF(dp_msg_control, + /* 8: */ MD(18), MD(14), + /* 12: */ MD12(18), MD12(14)) +FD(dp_msg_control, /* 4: */ MD(11), MD( 8), /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1, /* 6: */ MD(12), MD( 8), /* 7: */ MD(13), MD( 8), - /* 8: */ MD(13), MD( 8)) + /* 8: */ MD(13), MD( 8), + /* 12: */ MD12(13), MD12(11), MD12(10), MD12(8)) /** @} */ /** * Scratch message bits (Gen7+): * @{ */ -FC(scratch_read_write, MD(17), MD(17), devinfo->gen >= 7) /* 0 = read, 1 = write */ -FC(scratch_type, MD(16), MD(16), devinfo->gen >= 7) /* 0 = OWord, 1 = DWord */ -FC(scratch_invalidate_after_read, MD(15), MD(15), devinfo->gen >= 7) -FC(scratch_block_size, MD(13), MD(12), devinfo->gen >= 7) -FC(scratch_addr_offset, MD(11), MD( 0), devinfo->gen >= 7) +FC(scratch_read_write, /* 4+ */ MD(17), MD(17), /* 12+ */ MD12(17), MD12(17), devinfo->gen >= 7) /* 0 = read, 1 = write */ +FC(scratch_type, /* 4+ */ MD(16), MD(16), /* 12+ */ -1, -1, devinfo->gen >= 7) /* 0 = OWord, 1 = DWord */ +FC(scratch_invalidate_after_read, /* 4+ */ MD(15), MD(15), /* 12+ */ MD12(15), MD12(15), devinfo->gen >= 7) +FC(scratch_block_size, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12), devinfo->gen >= 7) +FD(scratch_addr_offset, + /* 4: */ -1, -1, + /* 4.5: */ -1, -1, + /* 5: */ -1, -1, + /* 6: */ -1, -1, + /* 7: */ MD(11), MD(0), + /* 8: */ MD(11), MD(0), + /* 12: */ MD12(11), MD12(11), MD12(10), MD12(0)) /** @} */ /** @@ -770,29 +1021,30 @@ /* 5: */ MD(11), MD(11), /* 6: */ MD(12), MD(12), /* 7: */ MD(12), MD(12), - /* 8: */ MD(12), MD(12)) -FC(rt_slot_group, MD(11), MD(11), devinfo->gen >= 6) -F(rt_message_type, MD(10), MD( 8)) + /* 8: */ MD(12), MD(12), + /* 12: */ MD12(12), MD12(12)) +FC(rt_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11), devinfo->gen >= 6) +F(rt_message_type, /* 4+ */ MD(10), MD( 8), /* 12+ */ MD12(10), MD12(8)) /** @} */ /** * Thread Spawn message function control bits: * @{ */ -F(ts_resource_select, MD( 4), MD( 4)) -F(ts_request_type, MD( 1), MD( 1)) -F(ts_opcode, MD( 0), MD( 0)) +FC(ts_resource_select, /* 4+ */ MD( 4), MD( 4), /* 12+ */ -1, -1, devinfo->gen < 11) +FC(ts_request_type, /* 4+ */ MD( 1), MD( 1), /* 12+ */ -1, -1, devinfo->gen < 11) +F(ts_opcode, /* 4+ */ MD( 0), MD( 0), /* 12+ */ MD12(0), MD12(0)) /** @} */ /** * Pixel Interpolator message function control bits: * @{ */ -F(pi_simd_mode, MD(16), MD(16)) -F(pi_nopersp, MD(14), MD(14)) -F(pi_message_type, MD(13), MD(12)) -F(pi_slot_group, MD(11), MD(11)) -F(pi_message_data, MD(7), MD(0)) +F(pi_simd_mode, /* 4+ */ MD(16), MD(16), /* 12+ */ MD12(16), MD12(16)) +F(pi_nopersp, /* 4+ */ MD(14), MD(14), /* 12+ */ MD12(14), MD12(14)) +F(pi_message_type, /* 4+ */ MD(13), MD(12), /* 12+ */ MD12(13), MD12(12)) +F(pi_slot_group, /* 4+ */ MD(11), MD(11), /* 12+ */ MD12(11), MD12(11)) +F(pi_message_data, /* 4+ */ MD(7), MD(0), /* 12+ */ MD12(7), MD12(0)) /** @} */ /** @@ -884,7 +1136,13 @@ } dt; (void) devinfo; dt.d = value; - brw_inst_set_bits(insn, 127, 64, dt.u); + + if (devinfo->gen >= 12) { + brw_inst_set_bits(insn, 95, 64, dt.u >> 32); + brw_inst_set_bits(insn, 127, 96, dt.u & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, dt.u); + } } static inline void @@ -892,7 +1150,12 @@ brw_inst *insn, uint64_t value) { (void) devinfo; - brw_inst_set_bits(insn, 127, 64, value); + if (devinfo->gen >= 12) { + brw_inst_set_bits(insn, 95, 64, value >> 32); + brw_inst_set_bits(insn, 127, 96, value & 0xFFFFFFFF); + } else { + brw_inst_set_bits(insn, 127, 64, value); + } } /** @} */ @@ -927,14 +1190,17 @@ /* The AddrImm fields are split into two discontiguous sections on Gen8+ */ -#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ +#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low, \ + g12_high, g12_low) \ static inline void \ brw_inst_set_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \ brw_inst *inst, \ unsigned value) \ { \ assert((value & ~0x3ff) == 0); \ - if (devinfo->gen >= 8) { \ + if (devinfo->gen >= 12) { \ + brw_inst_set_bits(inst, g12_high, g12_low, value); \ + } else if (devinfo->gen >= 8) { \ brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff); \ brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9); \ } else { \ @@ -945,7 +1211,9 @@ brw_inst_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \ const brw_inst *inst) \ { \ - if (devinfo->gen >= 8) { \ + if (devinfo->gen >= 12) { \ + return brw_inst_bits(inst, g12_high, g12_low); \ + } else if (devinfo->gen >= 8) { \ return brw_inst_bits(inst, g8_high, g8_low) | \ (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ } else { \ @@ -953,17 +1221,18 @@ } \ } -/* AddrImm[9:0] for Align1 Indirect Addressing */ -/* -Gen 4- ----Gen8---- */ -BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96) -BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64) -BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48) +/* AddrImm[9:0] for Align1 Indirect Addressing */ +/* -Gen 4- ----Gen8---- -Gen12- */ +BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96, 107, 98) +BRW_IA1_ADDR_IMM(src0, 73, 64, 95, 72, 64, 75, 66) +BRW_IA1_ADDR_IMM(dst, 57, 48, 47, 56, 48, 59, 50) #define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \ static inline void \ brw_inst_set_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \ brw_inst *inst, unsigned value) \ { \ + assert(devinfo->gen < 12); \ assert((value & ~0x3ff) == 0); \ if (devinfo->gen >= 8) { \ assert(GET_BITS(value, 3, 0) == 0); \ @@ -977,6 +1246,7 @@ brw_inst_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \ const brw_inst *inst) \ { \ + assert(devinfo->gen < 12); \ if (devinfo->gen >= 8) { \ return (brw_inst_bits(inst, g8_high, g8_low) << 4) | \ (brw_inst_bits(inst, g8_nine, g8_nine) << 9); \ @@ -1003,6 +1273,8 @@ static inline uint64_t brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low) { + assert(high < 128); + assert(high >= low); /* We assume the field doesn't cross 64-bit boundaries. */ const unsigned word = high / 64; assert(word == low / 64); @@ -1023,6 +1295,8 @@ static inline void brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value) { + assert(high < 128); + assert(high >= low); const unsigned word = high / 64; assert(word == low / 64); @@ -1080,65 +1354,87 @@ inst->data = (inst->data & ~mask) | (value << low); } -#define FC(name, high, low, assertions) \ +#define FC(name, high, low, gen12_high, gen12_low, assertions) \ static inline void \ brw_compact_inst_set_##name(const struct gen_device_info *devinfo, \ brw_compact_inst *inst, unsigned v) \ { \ assert(assertions); \ - (void) devinfo; \ - brw_compact_inst_set_bits(inst, high, low, v); \ + if (devinfo->gen >= 12) \ + brw_compact_inst_set_bits(inst, gen12_high, gen12_low, v); \ + else \ + brw_compact_inst_set_bits(inst, high, low, v); \ } \ static inline unsigned \ brw_compact_inst_##name(const struct gen_device_info *devinfo, \ const brw_compact_inst *inst) \ { \ assert(assertions); \ - (void) devinfo; \ - return brw_compact_inst_bits(inst, high, low); \ -} + if (devinfo->gen >= 12) \ + return brw_compact_inst_bits(inst, gen12_high, gen12_low); \ + else \ + return brw_compact_inst_bits(inst, high, low); \ +} + +/* A simple macro for fields which stay in the same place on all generations + * except for Gen12. + */ +#define F(name, high, low, gen12_high, gen12_low) \ + FC(name, high, low, gen12_high, gen12_low, true) + +F(src1_reg_nr, /* 4+ */ 63, 56, /* 12+ */ 63, 56) +F(src0_reg_nr, /* 4+ */ 55, 48, /* 12+ */ 47, 40) +F(dst_reg_nr, /* 4+ */ 47, 40, /* 12+ */ 23, 16) +F(src1_index, /* 4+ */ 39, 35, /* 12+ */ 55, 52) +F(src0_index, /* 4+ */ 34, 30, /* 12+ */ 51, 48) +F(cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29) /* Same location as brw_inst */ +FC(flag_subreg_nr, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->gen <= 6) +F(cond_modifier, /* 4+ */ 27, 24, /* 12+ */ -1, -1) /* Same location as brw_inst */ +FC(acc_wr_control, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->gen >= 6) +FC(mask_control_ex, /* 4+ */ 23, 23, /* 12+ */ -1, -1, devinfo->is_g4x || devinfo->gen == 5) +F(subreg_index, /* 4+ */ 22, 18, /* 12+ */ 39, 35) +F(datatype_index, /* 4+ */ 17, 13, /* 12+ */ 34, 30) +F(control_index, /* 4+ */ 12, 8, /* 12+ */ 28, 24) +FC(swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8, devinfo->gen >= 12) +F(debug_control, /* 4+ */ 7, 7, /* 12+ */ 7, 7) +F(hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0) /* Same location as brw_inst */ -/* A simple macro for fields which stay in the same place on all generations. */ -#define F(name, high, low) FC(name, high, low, true) - -F(src1_reg_nr, 63, 56) -F(src0_reg_nr, 55, 48) -F(dst_reg_nr, 47, 40) -F(src1_index, 39, 35) -F(src0_index, 34, 30) -F(cmpt_control, 29, 29) /* Same location as brw_inst */ -FC(flag_subreg_nr, 28, 28, devinfo->gen <= 6) -F(cond_modifier, 27, 24) /* Same location as brw_inst */ -FC(acc_wr_control, 23, 23, devinfo->gen >= 6) -FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5) -F(subreg_index, 22, 18) -F(datatype_index, 17, 13) -F(control_index, 12, 8) -F(debug_control, 7, 7) -F(opcode, 6, 0) /* Same location as brw_inst */ +static inline unsigned +brw_compact_inst_imm(const struct gen_device_info *devinfo, + const brw_compact_inst *inst) +{ + if (devinfo->gen >= 12) { + return brw_compact_inst_bits(inst, 63, 52); + } else { + return (brw_compact_inst_bits(inst, 39, 35) << 8) | + (brw_compact_inst_bits(inst, 63, 56)); + } +} /** * (Gen8+) Compacted three-source instructions: * @{ */ -FC(3src_src2_reg_nr, 63, 57, devinfo->gen >= 8) -FC(3src_src1_reg_nr, 56, 50, devinfo->gen >= 8) -FC(3src_src0_reg_nr, 49, 43, devinfo->gen >= 8) -FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8) -FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8) -FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8) -FC(3src_src2_rep_ctrl, 33, 33, devinfo->gen >= 8) -FC(3src_src1_rep_ctrl, 32, 32, devinfo->gen >= 8) -FC(3src_saturate, 31, 31, devinfo->gen >= 8) -FC(3src_debug_control, 30, 30, devinfo->gen >= 8) -FC(3src_cmpt_control, 29, 29, devinfo->gen >= 8) -FC(3src_src0_rep_ctrl, 28, 28, devinfo->gen >= 8) +FC(3src_src2_reg_nr, /* 4+ */ 63, 57, /* 12+ */ 55, 48, devinfo->gen >= 8) +FC(3src_src1_reg_nr, /* 4+ */ 56, 50, /* 12+ */ 63, 56, devinfo->gen >= 8) +FC(3src_src0_reg_nr, /* 4+ */ 49, 43, /* 12+ */ 47, 40, devinfo->gen >= 8) +FC(3src_src2_subreg_nr, /* 4+ */ 42, 40, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_src1_subreg_nr, /* 4+ */ 39, 37, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_src0_subreg_nr, /* 4+ */ 36, 34, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_src2_rep_ctrl, /* 4+ */ 33, 33, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_src1_rep_ctrl, /* 4+ */ 32, 32, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_saturate, /* 4+ */ 31, 31, /* 12+ */ -1, -1, devinfo->gen >= 8) +FC(3src_debug_control, /* 4+ */ 30, 30, /* 12+ */ 7, 7, devinfo->gen >= 8) +FC(3src_cmpt_control, /* 4+ */ 29, 29, /* 12+ */ 29, 29, devinfo->gen >= 8) +FC(3src_src0_rep_ctrl, /* 4+ */ 28, 28, /* 12+ */ -1, -1, devinfo->gen >= 8) /* Reserved */ -FC(3src_dst_reg_nr, 18, 12, devinfo->gen >= 8) -FC(3src_source_index, 11, 10, devinfo->gen >= 8) -FC(3src_control_index, 9, 8, devinfo->gen >= 8) +FC(3src_dst_reg_nr, /* 4+ */ 18, 12, /* 12+ */ 23, 16, devinfo->gen >= 8) +FC(3src_source_index, /* 4+ */ 11, 10, /* 12+ */ 34, 30, devinfo->gen >= 8) +FC(3src_subreg_index, /* 4+ */ -1, -1, /* 12+ */ 39, 35, devinfo->gen >= 12) +FC(3src_control_index, /* 4+ */ 9, 8, /* 12+ */ 28, 24, devinfo->gen >= 8) +FC(3src_swsb, /* 4+ */ -1, -1, /* 12+ */ 15, 8, devinfo->gen >= 8) /* Bit 7 is Reserved (for future Opcode expansion) */ -FC(3src_opcode, 6, 0, devinfo->gen >= 8) +FC(3src_hw_opcode, /* 4+ */ 6, 0, /* 12+ */ 6, 0, devinfo->gen >= 8) /** @} */ #undef F diff -Nru mesa-19.2.8/src/intel/compiler/brw_ir_fs.h mesa-20.0.8/src/intel/compiler/brw_ir_fs.h --- mesa-19.2.8/src/intel/compiler/brw_ir_fs.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_ir_fs.h 2020-06-12 01:21:17.000000000 +0000 @@ -348,14 +348,15 @@ void resize_sources(uint8_t num_sources); bool is_send_from_grf() const; + bool is_payload(unsigned arg) const; bool is_partial_write() const; - bool is_copy_payload(const brw::simple_allocator &grf_alloc) const; unsigned components_read(unsigned i) const; unsigned size_read(int arg) const; bool can_do_source_mods(const struct gen_device_info *devinfo) const; bool can_do_cmod(); bool can_change_types() const; bool has_source_and_destination_hazard() const; + unsigned implied_mrf_writes() const; /** * Return whether \p arg is a control source of a virtual instruction which @@ -383,6 +384,8 @@ bool last_rt:1; bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */ + + tgl_swsb sched; /**< Scheduling info. */ }; /** @@ -514,6 +517,12 @@ return type_sz(get_exec_type(inst)); } +static inline bool +is_send(const fs_inst *inst) +{ + return inst->mlen || inst->is_send_from_grf(); +} + /** * Return whether the instruction isn't an ALU instruction and cannot be * assumed to complete in-order. @@ -521,7 +530,7 @@ static inline bool is_unordered(const fs_inst *inst) { - return inst->mlen || inst->is_send_from_grf() || inst->is_math(); + return is_send(inst) || inst->is_math(); } /** @@ -560,4 +569,103 @@ return false; } +/** + * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from + * the specified register file into a VGRF. + * + * This implies identity register regions without any source-destination + * overlap, but otherwise has no implications on the location of sources and + * destination in the register file: Gathering any number of portions from + * multiple virtual registers in any order is allowed. + */ +inline bool +is_copy_payload(brw_reg_file file, const fs_inst *inst) +{ + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD || + inst->is_partial_write() || inst->saturate || + inst->dst.file != VGRF) + return false; + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file != file || + inst->src[i].abs || inst->src[i].negate) + return false; + + if (!inst->src[i].is_contiguous()) + return false; + + if (regions_overlap(inst->dst, inst->size_written, + inst->src[i], inst->size_read(i))) + return false; + } + + return true; +} + +/** + * Like is_copy_payload(), but the instruction is required to copy a single + * contiguous block of registers from the given register file into the + * destination without any reordering. + */ +inline bool +is_identity_payload(brw_reg_file file, const fs_inst *inst) { + if (is_copy_payload(file, inst)) { + fs_reg reg = inst->src[0]; + + for (unsigned i = 0; i < inst->sources; i++) { + reg.type = inst->src[i].type; + if (!inst->src[i].equals(reg)) + return false; + + reg = byte_offset(reg, inst->size_read(i)); + } + + return true; + } else { + return false; + } +} + +/** + * Like is_copy_payload(), but the instruction is required to source data from + * at least two disjoint VGRFs. + * + * This doesn't necessarily rule out the elimination of this instruction + * through register coalescing, but due to limitations of the register + * coalesce pass it might be impossible to do so directly until a later stage, + * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV + * instructions. + */ +inline bool +is_multi_copy_payload(const fs_inst *inst) { + if (is_copy_payload(VGRF, inst)) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].nr != inst->src[0].nr) + return true; + } + } + + return false; +} + +/** + * Like is_identity_payload(), but the instruction is required to copy the + * whole contents of a single VGRF into the destination. + * + * This means that there is a good chance that the instruction will be + * eliminated through register coalescing, but it's neither a necessary nor a + * sufficient condition for that to happen -- E.g. consider the case where + * source and destination registers diverge due to other instructions in the + * program overwriting part of their contents, which isn't something we can + * predict up front based on a cheap strictly local test of the copy + * instruction. + */ +inline bool +is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst) +{ + return is_identity_payload(VGRF, inst) && + inst->src[0].offset == 0 && + alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written; +} + #endif diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_analyze_ubo_ranges.c mesa-20.0.8/src/intel/compiler/brw_nir_analyze_ubo_ranges.c --- mesa-19.2.8/src/intel/compiler/brw_nir_analyze_ubo_ranges.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_analyze_ubo_ranges.c 2020-06-12 01:21:17.000000000 +0000 @@ -129,8 +129,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir.c mesa-20.0.8/src/intel/compiler/brw_nir.c --- mesa-19.2.8/src/intel/compiler/brw_nir.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir.c 2020-06-12 01:21:17.000000000 +0000 @@ -518,7 +518,7 @@ OPT(nir_opt_combine_stores, nir_var_all); if (is_scalar) { - OPT(nir_lower_alu_to_scalar, NULL); + OPT(nir_lower_alu_to_scalar, NULL, NULL); } OPT(nir_copy_prop); @@ -553,7 +553,7 @@ (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, + OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation, compiler->devinfo->gen >= 6); OPT(nir_opt_intrinsics); @@ -654,17 +654,20 @@ const bool is_scalar = compiler->scalar_stage[nir->info.stage]; if (is_scalar) { - OPT(nir_lower_alu_to_scalar, NULL); + OPT(nir_lower_alu_to_scalar, NULL, NULL); } if (nir->info.stage == MESA_SHADER_GEOMETRY) - OPT(nir_lower_gs_intrinsics); + OPT(nir_lower_gs_intrinsics, false); /* See also brw_nir_trig_workarounds.py */ if (compiler->precise_trig && !(devinfo->gen >= 10 || devinfo->is_kabylake)) OPT(brw_nir_apply_trig_workarounds); + if (devinfo->gen >= 12) + OPT(brw_nir_clamp_image_1d_2d_array_sizes); + static const nir_lower_tex_options tex_options = { .lower_txp = ~0, .lower_txf_offset = true, @@ -690,13 +693,6 @@ OPT(nir_lower_doubles, softfp64, nir->options->lower_doubles_options); OPT(nir_lower_int64, nir->options->lower_int64_options); - /* This needs to be run after the first optimization pass but before we - * lower indirect derefs away - */ - if (compiler->supports_shader_constants) { - OPT(nir_opt_large_constants, NULL, 32); - } - OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler); if (is_scalar) { @@ -706,6 +702,13 @@ /* Lower a bunch of stuff */ OPT(nir_lower_var_copies); + /* This needs to be run after the first optimization pass but before we + * lower indirect derefs away + */ + if (compiler->supports_shader_constants) { + OPT(nir_opt_large_constants, NULL, 32); + } + OPT(nir_lower_system_values); const nir_lower_subgroups_options subgroups_options = { @@ -713,11 +716,35 @@ .lower_to_scalar = true, .lower_vote_trivial = !is_scalar, .lower_shuffle = true, + .lower_quad_broadcast_dynamic = true, }; OPT(nir_lower_subgroups, &subgroups_options); OPT(nir_lower_clip_cull_distance_arrays); + if ((devinfo->gen >= 8 || devinfo->is_haswell) && is_scalar) { + /* TODO: Yes, we could in theory do this on gen6 and earlier. However, + * that would require plumbing through support for these indirect + * scratch read/write messages with message registers and that's just a + * pain. Also, the primary benefit of this is for compute shaders which + * won't run on gen6 and earlier anyway. + * + * On gen7 and earlier the scratch space size is limited to 12kB. + * By enabling this optimization we may easily exceed this limit without + * having any fallback. + * + * The threshold of 128B was chosen semi-arbitrarily. The idea is that + * 128B per channel on a SIMD8 program is 32 registers or 25% of the + * register file. Any array that large is likely to cause pressure + * issues. Also, this value is sufficiently high that the benchmarks + * known to suffer from large temporary array issues are helped but + * nothing else in shader-db is hurt except for maybe that one kerbal + * space program shader. + */ + OPT(nir_lower_vars_to_scratch, nir_var_function_temp, 128, + glsl_get_natural_size_align_bytes); + } + nir_variable_mode indirect_mask = brw_nir_no_indirect_mask(compiler, nir->info.stage); OPT(nir_lower_indirect_derefs, indirect_mask); @@ -814,7 +841,7 @@ UNUSED bool progress; /* Written by OPT */ - OPT(brw_nir_lower_mem_access_bit_sizes); + OPT(brw_nir_lower_mem_access_bit_sizes, devinfo); do { progress = false; @@ -871,7 +898,7 @@ OPT(brw_nir_lower_conversions); if (is_scalar) - OPT(nir_lower_alu_to_scalar, NULL); + OPT(nir_lower_alu_to_scalar, NULL, NULL); OPT(nir_lower_to_source_mods, nir_lower_all_source_mods); OPT(nir_copy_prop); OPT(nir_opt_dce); @@ -1090,6 +1117,72 @@ } } +uint32_t +brw_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic) +{ + switch (atomic->intrinsic) { +#define AOP_CASE(atom) \ + case nir_intrinsic_image_atomic_##atom: \ + case nir_intrinsic_bindless_image_atomic_##atom: \ + case nir_intrinsic_ssbo_atomic_##atom: \ + case nir_intrinsic_shared_atomic_##atom: \ + case nir_intrinsic_global_atomic_##atom + + AOP_CASE(add): { + unsigned src_idx; + switch (atomic->intrinsic) { + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_add: + src_idx = 3; + break; + case nir_intrinsic_ssbo_atomic_add: + src_idx = 2; + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_global_atomic_add: + src_idx = 1; + break; + default: + unreachable("Invalid add atomic opcode"); + } + + if (nir_src_is_const(atomic->src[src_idx])) { + int64_t add_val = nir_src_as_int(atomic->src[src_idx]); + if (add_val == 1) + return BRW_AOP_INC; + else if (add_val == -1) + return BRW_AOP_DEC; + } + return BRW_AOP_ADD; + } + + AOP_CASE(imin): return BRW_AOP_IMIN; + AOP_CASE(umin): return BRW_AOP_UMIN; + AOP_CASE(imax): return BRW_AOP_IMAX; + AOP_CASE(umax): return BRW_AOP_UMAX; + AOP_CASE(and): return BRW_AOP_AND; + AOP_CASE(or): return BRW_AOP_OR; + AOP_CASE(xor): return BRW_AOP_XOR; + AOP_CASE(exchange): return BRW_AOP_MOV; + AOP_CASE(comp_swap): return BRW_AOP_CMPWR; + +#undef AOP_CASE +#define AOP_CASE(atom) \ + case nir_intrinsic_ssbo_atomic_##atom: \ + case nir_intrinsic_shared_atomic_##atom: \ + case nir_intrinsic_global_atomic_##atom + + AOP_CASE(fmin): return BRW_AOP_FMIN; + AOP_CASE(fmax): return BRW_AOP_FMAX; + AOP_CASE(fcomp_swap): return BRW_AOP_FCMPWR; + +#undef AOP_CASE + + default: + unreachable("Unsupported NIR atomic intrinsic"); + } +} + enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type) { diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_clamp_image_1d_2d_array_sizes.c mesa-20.0.8/src/intel/compiler/brw_nir_clamp_image_1d_2d_array_sizes.c --- mesa-19.2.8/src/intel/compiler/brw_nir_clamp_image_1d_2d_array_sizes.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_clamp_image_1d_2d_array_sizes.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,139 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "brw_nir.h" + +/** + * GEN:BUG:1806565034: + * + * Gen12+ allows to set RENDER_SURFACE_STATE::SurfaceArray to 1 only if + * array_len > 1. Setting RENDER_SURFACE_STATE::SurfaceArray to 0 results in + * the HW RESINFO message to report an array size of 0 which breaks texture + * array size queries. + * + * This NIR pass works around this by patching the array size with a + * MAX(array_size, 1) for array textures. + */ + +bool +brw_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader) +{ + bool progress = false; + nir_builder b; + + nir_foreach_function(func, shader) { + bool function_progress = false; + + if (!func->impl) + continue; + + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + nir_ssa_def *image_size = NULL; + + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_image_size: + case nir_intrinsic_bindless_image_size: + if (!nir_intrinsic_image_array(intr)) + break; + + image_size = &intr->dest.ssa; + break; + + case nir_intrinsic_image_deref_size: { + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + + assert(glsl_type_is_image(deref->type)); + + if (!glsl_sampler_type_is_array(deref->type)) + break; + + image_size = &intr->dest.ssa; + break; + } + + default: + break; + } + break; + } + + case nir_instr_type_tex: { + nir_tex_instr *tex_instr = nir_instr_as_tex(instr); + if (tex_instr->op != nir_texop_txs) + break; + + if (!tex_instr->is_array) + break; + + image_size = &tex_instr->dest.ssa; + break; + } + + default: + break; + } + + if (!image_size) + continue; + + b.cursor = nir_after_instr(instr); + + nir_ssa_def *components[4]; + for (int i = 0; i < image_size->num_components; i++) { + if (i == (image_size->num_components - 1)) { + components[i] = nir_imax(&b, nir_channel(&b, image_size, i), + nir_imm_int(&b, 1)); + } else { + components[i] = nir_channel(&b, image_size, i); + } + } + nir_ssa_def *image_size_replacement = + nir_vec(&b, components, image_size->num_components); + + b.cursor = nir_after_instr(instr); + + nir_ssa_def_rewrite_uses_after(image_size, + nir_src_for_ssa(image_size_replacement), + image_size_replacement->parent_instr); + + function_progress = true; + } + } + + if (function_progress) { + nir_metadata_preserve(func->impl, nir_metadata_block_index | + nir_metadata_dominance); + progress = function_progress; + } + } + + return progress; +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir.h mesa-20.0.8/src/intel/compiler/brw_nir.h --- mesa-19.2.8/src/intel/compiler/brw_nir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir.h 2020-06-12 01:21:17.000000000 +0000 @@ -32,14 +32,13 @@ extern "C" { #endif -int type_size_scalar(const struct glsl_type *type, bool bindless); int type_size_vec4(const struct glsl_type *type, bool bindless); int type_size_dvec4(const struct glsl_type *type, bool bindless); static inline int type_size_scalar_bytes(const struct glsl_type *type, bool bindless) { - return type_size_scalar(type, bindless) * 4; + return glsl_count_dword_slots(type, bindless) * 4; } static inline int @@ -102,6 +101,7 @@ bool brw_nir_lower_cs_intrinsics(nir_shader *nir, unsigned dispatch_width); +void brw_nir_lower_alpha_to_coverage(nir_shader *shader); void brw_nir_lower_legacy_clipping(nir_shader *nir, int nr_userclip_plane_consts, struct brw_stage_prog_data *prog_data); @@ -127,12 +127,15 @@ void brw_nir_rewrite_bindless_image_intrinsic(nir_intrinsic_instr *intrin, nir_ssa_def *handle); -bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader); +bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader, + const struct gen_device_info *devinfo); void brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, bool is_scalar); +bool brw_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader); + bool brw_nir_apply_attribute_workarounds(nir_shader *nir, const uint8_t *attrib_wa_flags); @@ -147,6 +150,7 @@ bool is_scalar); enum brw_conditional_mod brw_cmod_for_nir_comparison(nir_op op); +uint32_t brw_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic); enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type); diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_lower_alpha_to_coverage.c mesa-20.0.8/src/intel/compiler/brw_nir_lower_alpha_to_coverage.c --- mesa-19.2.8/src/intel/compiler/brw_nir_lower_alpha_to_coverage.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_lower_alpha_to_coverage.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,169 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "brw_nir.h" + +/** + * We need to compute alpha to coverage dithering manually in shader + * and replace sample mask store with the bitwise-AND of sample mask and + * alpha to coverage dithering. + * + * The following formula is used to compute final sample mask: + * m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) + * dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) | + * 0x0808 * (m & 2) | 0x0100 * (m & 1) + * sample_mask = sample_mask & dither_mask + * + * It gives a number of ones proportional to the alpha for 2, 4, 8 or 16 + * least significant bits of the result: + * 0.0000 0000000000000000 + * 0.0625 0000000100000000 + * 0.1250 0001000000010000 + * 0.1875 0001000100010000 + * 0.2500 1000100010001000 + * 0.3125 1000100110001000 + * 0.3750 1001100010011000 + * 0.4375 1001100110011000 + * 0.5000 1010101010101010 + * 0.5625 1010101110101010 + * 0.6250 1011101010111010 + * 0.6875 1011101110111010 + * 0.7500 1110111011101110 + * 0.8125 1110111111101110 + * 0.8750 1111111011111110 + * 0.9375 1111111111111110 + * 1.0000 1111111111111111 + */ +static nir_ssa_def * +build_dither_mask(nir_builder b, nir_intrinsic_instr *store_instr) +{ + nir_ssa_def *alpha = + nir_channel(&b, nir_ssa_for_src(&b, store_instr->src[0], 4), 3); + + nir_ssa_def *m = + nir_f2i32(&b, nir_fmul_imm(&b, nir_fsat(&b, alpha), 16.0)); + + nir_ssa_def *part_a = + nir_iand(&b, + nir_imm_int(&b, 0xf), + nir_ushr(&b, + nir_imm_int(&b, 0xfea80), + nir_iand(&b, m, nir_imm_int(&b, ~3)))); + + nir_ssa_def *part_b = nir_iand(&b, m, nir_imm_int(&b, 2)); + + nir_ssa_def *part_c = nir_iand(&b, m, nir_imm_int(&b, 1)); + + return nir_ior(&b, + nir_imul_imm(&b, part_a, 0x1111), + nir_ior(&b, + nir_imul_imm(&b, part_b, 0x0808), + nir_imul_imm(&b, part_c, 0x0100))); +} + +void +brw_nir_lower_alpha_to_coverage(nir_shader *shader) +{ + assert(shader->info.stage == MESA_SHADER_FRAGMENT); + + /* Bail out early if we don't have gl_SampleMask */ + bool is_sample_mask = false; + nir_foreach_variable(var, &shader->outputs) { + if (var->data.location == FRAG_RESULT_SAMPLE_MASK) { + is_sample_mask = true; + break; + } + } + + if (!is_sample_mask) + return; + + nir_foreach_function(function, shader) { + nir_function_impl *impl = function->impl; + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_intrinsic_instr *sample_mask_instr = NULL; + nir_intrinsic_instr *store_instr = NULL; + + nir_foreach_instr_safe(instr, block) { + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_variable *out = NULL; + + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + nir_foreach_variable(var, &shader->outputs) { + int drvloc = var->data.driver_location; + if (nir_intrinsic_base(intr) == drvloc) { + out = var; + break; + } + } + + if (out->data.mode != nir_var_shader_out) + continue; + + /* save gl_SampleMask instruction pointer */ + if (out->data.location == FRAG_RESULT_SAMPLE_MASK) { + assert(!sample_mask_instr); + sample_mask_instr = intr; + } + + /* save out_color[0] instruction pointer */ + if ((out->data.location == FRAG_RESULT_COLOR || + out->data.location == FRAG_RESULT_DATA0)) { + nir_src *offset_src = nir_get_io_offset_src(intr); + if (nir_src_is_const(*offset_src) && nir_src_as_uint(*offset_src) == 0) { + assert(!store_instr); + store_instr = intr; + } + } + break; + default: + continue; + } + } + } + + if (sample_mask_instr && store_instr) { + b.cursor = nir_before_instr(&store_instr->instr); + nir_ssa_def *dither_mask = build_dither_mask(b, store_instr); + + /* Combine dither_mask and reorder gl_SampleMask store instruction + * after render target 0 store instruction. + */ + nir_instr_remove(&sample_mask_instr->instr); + dither_mask = nir_iand(&b, sample_mask_instr->src[0].ssa, dither_mask); + nir_instr_insert_after(&store_instr->instr, &sample_mask_instr->instr); + nir_instr_rewrite_src(&sample_mask_instr->instr, + &sample_mask_instr->src[0], + nir_src_for_ssa(dither_mask)); + } + } + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } +} diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_lower_image_load_store.c mesa-20.0.8/src/intel/compiler/brw_nir_lower_image_load_store.c --- mesa-19.2.8/src/intel/compiler/brw_nir_lower_image_load_store.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_lower_image_load_store.c 2020-06-12 01:21:17.000000000 +0000 @@ -583,7 +583,7 @@ /* For write-only surfaces, we trust that the hardware can just do the * conversion for us. */ - if (var->data.image.access & ACCESS_NON_READABLE) + if (var->data.access & ACCESS_NON_READABLE) return false; const enum isl_format image_fmt = @@ -696,7 +696,7 @@ /* For write-only images, we have an actual image surface so we fall back * and let the back-end emit a TXS for this. */ - if (var->data.image.access & ACCESS_NON_READABLE) + if (var->data.access & ACCESS_NON_READABLE) return false; /* If we have a matching typed format, then we have an actual image surface @@ -763,8 +763,10 @@ break; case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c mesa-20.0.8/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c --- mesa-19.2.8/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c 2020-06-12 01:21:17.000000000 +0000 @@ -74,10 +74,15 @@ } static bool -lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin) +lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, + const struct gen_device_info *devinfo) { + const bool needs_scalar = + intrin->intrinsic == nir_intrinsic_load_scratch; + assert(intrin->dest.is_ssa); - if (intrin->dest.ssa.bit_size == 32) + if (intrin->dest.ssa.bit_size == 32 && + (!needs_scalar || intrin->num_components == 1)) return false; const unsigned bit_size = intrin->dest.ssa.bit_size; @@ -85,8 +90,7 @@ const unsigned bytes_read = num_components * (bit_size / 8); const unsigned align = nir_intrinsic_align(intrin); - nir_ssa_def *result[NIR_MAX_VEC_COMPONENTS] = { NULL, }; - + nir_ssa_def *result; nir_src *offset_src = nir_get_io_offset_src(intrin); if (bit_size < 32 && nir_src_is_const(*offset_src)) { /* The offset is constant so we can use a 32-bit load and just shift it @@ -102,21 +106,12 @@ nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset, load_comps32, 32, 4); - nir_ssa_def *unpacked[3]; - for (unsigned i = 0; i < load_comps32; i++) - unpacked[i] = nir_unpack_bits(b, nir_channel(b, load, i), bit_size); - - assert(load_offset % (bit_size / 8) == 0); - const unsigned divisor = 32 / bit_size; - - for (unsigned i = 0; i < num_components; i++) { - unsigned load_i = i + load_offset / (bit_size / 8); - result[i] = nir_channel(b, unpacked[load_i / divisor], - load_i % divisor); - } + result = nir_extract_bits(b, &load, 1, load_offset * 8, + num_components, bit_size); } else { /* Otherwise, we have to break it into smaller loads */ - unsigned res_idx = 0; + nir_ssa_def *loads[8]; + unsigned num_loads = 0; int load_offset = 0; while (load_offset < bytes_read) { const unsigned bytes_left = bytes_read - load_offset; @@ -128,34 +123,35 @@ } else { assert(load_offset % 4 == 0); load_bit_size = 32; - load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4); + load_comps = needs_scalar ? 1 : + DIV_ROUND_UP(MIN2(bytes_left, 16), 4); } - nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, load_offset, - load_comps, load_bit_size, - align); - - nir_ssa_def *unpacked = nir_bitcast_vector(b, load, bit_size); - for (unsigned i = 0; i < unpacked->num_components; i++) { - if (res_idx < num_components) - result[res_idx++] = nir_channel(b, unpacked, i); - } + loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset, + load_comps, load_bit_size, + align); load_offset += load_comps * (load_bit_size / 8); } + assert(num_loads <= ARRAY_SIZE(loads)); + result = nir_extract_bits(b, loads, num_loads, 0, + num_components, bit_size); } - nir_ssa_def *vec_result = nir_vec(b, result, num_components); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, - nir_src_for_ssa(vec_result)); + nir_src_for_ssa(result)); nir_instr_remove(&intrin->instr); return true; } static bool -lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin) +lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, + const struct gen_device_info *devinfo) { + const bool needs_scalar = + intrin->intrinsic == nir_intrinsic_store_scratch; + assert(intrin->src[0].is_ssa); nir_ssa_def *value = intrin->src[0].ssa; @@ -171,7 +167,9 @@ assert(writemask < (1 << num_components)); if ((value->bit_size <= 32 && num_components == 1) || - (value->bit_size == 32 && writemask == (1 << num_components) - 1)) + (value->bit_size == 32 && + writemask == (1 << num_components) - 1 && + !needs_scalar)) return false; nir_src *offset_src = nir_get_io_offset_src(intrin); @@ -192,7 +190,6 @@ while (BITSET_FFS(mask) != 0) { const int start = BITSET_FFS(mask) - 1; - assert(start % byte_size == 0); int end; for (end = start + 1; end < bytes_written; end++) { @@ -210,7 +207,7 @@ if (chunk_bytes >= 4 && is_dword_aligned) { store_align = MAX2(align, 4); store_bit_size = 32; - store_comps = MIN2(chunk_bytes, 16) / 4; + store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4; } else { store_align = align; store_comps = 1; @@ -219,19 +216,10 @@ if (store_bit_size == 24) store_bit_size = 16; } - const unsigned store_bytes = store_comps * (store_bit_size / 8); - assert(store_bytes % byte_size == 0); - const unsigned store_first_src_comp = start / byte_size; - const unsigned store_src_comps = store_bytes / byte_size; - assert(store_first_src_comp + store_src_comps <= num_components); - - unsigned src_swiz[4] = { 0, }; - for (unsigned i = 0; i < store_src_comps; i++) - src_swiz[i] = store_first_src_comp + i; - nir_ssa_def *store_value = - nir_swizzle(b, value, src_swiz, store_src_comps); - nir_ssa_def *packed = nir_bitcast_vector(b, store_value, store_bit_size); + + nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8, + store_comps, store_bit_size); dup_mem_intrinsic(b, intrin, packed, start, store_comps, store_bit_size, store_align); @@ -245,7 +233,8 @@ } static bool -lower_mem_access_bit_sizes_impl(nir_function_impl *impl) +lower_mem_access_bit_sizes_impl(nir_function_impl *impl, + const struct gen_device_info *devinfo) { bool progress = false; @@ -264,14 +253,16 @@ case nir_intrinsic_load_global: case nir_intrinsic_load_ssbo: case nir_intrinsic_load_shared: - if (lower_mem_load_bit_size(&b, intrin)) + case nir_intrinsic_load_scratch: + if (lower_mem_load_bit_size(&b, intrin, devinfo)) progress = true; break; case nir_intrinsic_store_global: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: - if (lower_mem_store_bit_size(&b, intrin)) + case nir_intrinsic_store_scratch: + if (lower_mem_store_bit_size(&b, intrin, devinfo)) progress = true; break; @@ -304,14 +295,21 @@ * all nir load/store intrinsics into a series of either 8 or 32-bit * load/store intrinsics with a number of components that we can directly * handle in hardware and with a trivial write-mask. + * + * For scratch access, additional consideration has to be made due to the way + * that we swizzle the memory addresses to achieve decent cache locality. In + * particular, even though untyped surface read/write messages exist and work, + * we can't use them to load multiple components in a single SEND. For more + * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr. */ bool -brw_nir_lower_mem_access_bit_sizes(nir_shader *shader) +brw_nir_lower_mem_access_bit_sizes(nir_shader *shader, + const struct gen_device_info *devinfo) { bool progress = false; nir_foreach_function(func, shader) { - if (func->impl && lower_mem_access_bit_sizes_impl(func->impl)) + if (func->impl && lower_mem_access_bit_sizes_impl(func->impl, devinfo)) progress = true; } diff -Nru mesa-19.2.8/src/intel/compiler/brw_nir_opt_peephole_ffma.c mesa-20.0.8/src/intel/compiler/brw_nir_opt_peephole_ffma.c --- mesa-19.2.8/src/intel/compiler/brw_nir_opt_peephole_ffma.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_nir_opt_peephole_ffma.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,7 +36,7 @@ static inline bool are_all_uses_fadd(nir_ssa_def *def) { - if (!list_empty(&def->if_uses)) + if (!list_is_empty(&def->if_uses)) return false; nir_foreach_use(use_src, def) { @@ -153,7 +153,7 @@ nir_instr_as_load_const (srcs[i].src.ssa->parent_instr); if (list_is_singular(&load_const->def.uses) && - list_empty(&load_const->def.if_uses)) { + list_is_empty(&load_const->def.if_uses)) { return true; } } @@ -256,7 +256,7 @@ nir_src_for_ssa(&ffma->dest.dest.ssa)); nir_builder_instr_insert(b, &ffma->instr); - assert(list_empty(&add->dest.dest.ssa.uses)); + assert(list_is_empty(&add->dest.dest.ssa.uses)); nir_instr_remove(&add->instr); progress = true; diff -Nru mesa-19.2.8/src/intel/compiler/brw_packed_float.c mesa-20.0.8/src/intel/compiler/brw_packed_float.c --- mesa-19.2.8/src/intel/compiler/brw_packed_float.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_packed_float.c 2020-06-12 01:21:17.000000000 +0000 @@ -63,7 +63,7 @@ /* ±0.0f is special cased. */ if (vf == 0x00 || vf == 0x80) { - fu.u = vf << 24; + fu.u = (unsigned)vf << 24; return fu.f; } diff -Nru mesa-19.2.8/src/intel/compiler/brw_predicated_break.cpp mesa-20.0.8/src/intel/compiler/brw_predicated_break.cpp --- mesa-19.2.8/src/intel/compiler/brw_predicated_break.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_predicated_break.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -100,13 +100,15 @@ if (!earlier_block->ends_with_control_flow()) { earlier_block->children.make_empty(); - earlier_block->add_successor(s->cfg->mem_ctx, jump_block); + earlier_block->add_successor(s->cfg->mem_ctx, jump_block, + bblock_link_logical); } if (!later_block->starts_with_control_flow()) { later_block->parents.make_empty(); } - jump_block->add_successor(s->cfg->mem_ctx, later_block); + jump_block->add_successor(s->cfg->mem_ctx, later_block, + bblock_link_logical); if (earlier_block->can_combine_with(jump_block)) { earlier_block->combine_with(jump_block); diff -Nru mesa-19.2.8/src/intel/compiler/brw_reg.h mesa-20.0.8/src/intel/compiler/brw_reg.h --- mesa-19.2.8/src/intel/compiler/brw_reg.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_reg.h 2020-06-12 01:21:17.000000000 +0000 @@ -318,6 +318,7 @@ case BRW_REGISTER_TYPE_UQ: case BRW_REGISTER_TYPE_Q: case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_NF: return 8; case BRW_REGISTER_TYPE_UD: case BRW_REGISTER_TYPE_D: diff -Nru mesa-19.2.8/src/intel/compiler/brw_reg_type.c mesa-20.0.8/src/intel/compiler/brw_reg_type.c --- mesa-19.2.8/src/intel/compiler/brw_reg_type.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_reg_type.c 2020-06-12 01:21:17.000000000 +0000 @@ -84,12 +84,58 @@ GEN11_HW_IMM_TYPE_VF = 11, }; +#define GEN12_HW_REG_TYPE_UINT(n) (n) +#define GEN12_HW_REG_TYPE_SINT(n) (0x4 | (n)) +#define GEN12_HW_REG_TYPE_FLOAT(n) (0x8 | (n)) + static const struct hw_type { enum hw_reg_type reg_type; enum hw_imm_type imm_type; } gen4_hw_type[] = { [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + [BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F }, + [BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF }, + + [BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D }, + [BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD }, + [BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W }, + [BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW }, + [BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID }, + [BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID }, + [BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V }, +}, gen6_hw_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + + [BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F }, + [BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF }, + + [BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D }, + [BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD }, + [BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W }, + [BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW }, + [BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID }, + [BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID }, + [BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V }, + [BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV }, +}, gen7_hw_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + + [BRW_REGISTER_TYPE_DF] = { GEN7_HW_REG_TYPE_DF, INVALID }, + [BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F }, + [BRW_REGISTER_TYPE_VF] = { INVALID, BRW_HW_IMM_TYPE_VF }, + + [BRW_REGISTER_TYPE_D] = { BRW_HW_REG_TYPE_D, BRW_HW_IMM_TYPE_D }, + [BRW_REGISTER_TYPE_UD] = { BRW_HW_REG_TYPE_UD, BRW_HW_IMM_TYPE_UD }, + [BRW_REGISTER_TYPE_W] = { BRW_HW_REG_TYPE_W, BRW_HW_IMM_TYPE_W }, + [BRW_REGISTER_TYPE_UW] = { BRW_HW_REG_TYPE_UW, BRW_HW_IMM_TYPE_UW }, + [BRW_REGISTER_TYPE_B] = { BRW_HW_REG_TYPE_B, INVALID }, + [BRW_REGISTER_TYPE_UB] = { BRW_HW_REG_TYPE_UB, INVALID }, + [BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V }, + [BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV }, +}, gen8_hw_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + [BRW_REGISTER_TYPE_DF] = { GEN7_HW_REG_TYPE_DF, GEN8_HW_IMM_TYPE_DF }, [BRW_REGISTER_TYPE_F] = { BRW_HW_REG_TYPE_F, BRW_HW_IMM_TYPE_F }, [BRW_REGISTER_TYPE_HF] = { GEN8_HW_REG_TYPE_HF, GEN8_HW_IMM_TYPE_HF }, @@ -106,14 +152,13 @@ [BRW_REGISTER_TYPE_V] = { INVALID, BRW_HW_IMM_TYPE_V }, [BRW_REGISTER_TYPE_UV] = { INVALID, BRW_HW_IMM_TYPE_UV }, }, gen11_hw_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + [BRW_REGISTER_TYPE_NF] = { GEN11_HW_REG_TYPE_NF, INVALID }, - [BRW_REGISTER_TYPE_DF] = { GEN11_HW_REG_TYPE_DF, GEN11_HW_IMM_TYPE_DF }, [BRW_REGISTER_TYPE_F] = { GEN11_HW_REG_TYPE_F, GEN11_HW_IMM_TYPE_F }, [BRW_REGISTER_TYPE_HF] = { GEN11_HW_REG_TYPE_HF, GEN11_HW_IMM_TYPE_HF }, [BRW_REGISTER_TYPE_VF] = { INVALID, GEN11_HW_IMM_TYPE_VF }, - [BRW_REGISTER_TYPE_Q] = { GEN11_HW_REG_TYPE_Q, GEN11_HW_IMM_TYPE_Q }, - [BRW_REGISTER_TYPE_UQ] = { GEN11_HW_REG_TYPE_UQ, GEN11_HW_IMM_TYPE_UQ }, [BRW_REGISTER_TYPE_D] = { GEN11_HW_REG_TYPE_D, GEN11_HW_IMM_TYPE_D }, [BRW_REGISTER_TYPE_UD] = { GEN11_HW_REG_TYPE_UD, GEN11_HW_IMM_TYPE_UD }, [BRW_REGISTER_TYPE_W] = { GEN11_HW_REG_TYPE_W, GEN11_HW_IMM_TYPE_W }, @@ -122,6 +167,21 @@ [BRW_REGISTER_TYPE_UB] = { GEN11_HW_REG_TYPE_UB, INVALID }, [BRW_REGISTER_TYPE_V] = { INVALID, GEN11_HW_IMM_TYPE_V }, [BRW_REGISTER_TYPE_UV] = { INVALID, GEN11_HW_IMM_TYPE_UV }, +}, gen12_hw_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID, INVALID }, + + [BRW_REGISTER_TYPE_F] = { GEN12_HW_REG_TYPE_FLOAT(2), GEN12_HW_REG_TYPE_FLOAT(2) }, + [BRW_REGISTER_TYPE_HF] = { GEN12_HW_REG_TYPE_FLOAT(1), GEN12_HW_REG_TYPE_FLOAT(1) }, + [BRW_REGISTER_TYPE_VF] = { INVALID, GEN12_HW_REG_TYPE_FLOAT(0) }, + + [BRW_REGISTER_TYPE_D] = { GEN12_HW_REG_TYPE_SINT(2), GEN12_HW_REG_TYPE_SINT(2) }, + [BRW_REGISTER_TYPE_UD] = { GEN12_HW_REG_TYPE_UINT(2), GEN12_HW_REG_TYPE_UINT(2) }, + [BRW_REGISTER_TYPE_W] = { GEN12_HW_REG_TYPE_SINT(1), GEN12_HW_REG_TYPE_SINT(1) }, + [BRW_REGISTER_TYPE_UW] = { GEN12_HW_REG_TYPE_UINT(1), GEN12_HW_REG_TYPE_UINT(1) }, + [BRW_REGISTER_TYPE_B] = { GEN12_HW_REG_TYPE_SINT(0), INVALID }, + [BRW_REGISTER_TYPE_UB] = { GEN12_HW_REG_TYPE_UINT(0), INVALID }, + [BRW_REGISTER_TYPE_V] = { INVALID, GEN12_HW_REG_TYPE_SINT(0) }, + [BRW_REGISTER_TYPE_UV] = { INVALID, GEN12_HW_REG_TYPE_UINT(0) }, }; /* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so @@ -160,7 +220,18 @@ static const struct hw_3src_type { enum hw_3src_reg_type reg_type; enum gen10_align1_3src_exec_type exec_type; -} gen7_hw_3src_type[] = { +} gen6_hw_3src_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, + + [BRW_REGISTER_TYPE_F] = { GEN7_3SRC_TYPE_F }, +}, gen7_hw_3src_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, + + [BRW_REGISTER_TYPE_F] = { GEN7_3SRC_TYPE_F }, + [BRW_REGISTER_TYPE_D] = { GEN7_3SRC_TYPE_D }, + [BRW_REGISTER_TYPE_UD] = { GEN7_3SRC_TYPE_UD }, + [BRW_REGISTER_TYPE_DF] = { GEN7_3SRC_TYPE_DF }, +}, gen8_hw_3src_type[] = { [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, [BRW_REGISTER_TYPE_F] = { GEN7_3SRC_TYPE_F }, @@ -172,7 +243,6 @@ #define E(x) BRW_ALIGN1_3SRC_EXEC_TYPE_##x [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, - [BRW_REGISTER_TYPE_NF] = { GEN11_ALIGN1_3SRC_REG_TYPE_NF, E(FLOAT) }, [BRW_REGISTER_TYPE_DF] = { GEN10_ALIGN1_3SRC_REG_TYPE_DF, E(FLOAT) }, [BRW_REGISTER_TYPE_F] = { GEN10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) }, [BRW_REGISTER_TYPE_HF] = { GEN10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) }, @@ -183,6 +253,31 @@ [BRW_REGISTER_TYPE_UW] = { GEN10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) }, [BRW_REGISTER_TYPE_B] = { GEN10_ALIGN1_3SRC_REG_TYPE_B, E(INT) }, [BRW_REGISTER_TYPE_UB] = { GEN10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) }, +}, gen11_hw_3src_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, + + [BRW_REGISTER_TYPE_NF] = { GEN11_ALIGN1_3SRC_REG_TYPE_NF, E(FLOAT) }, + [BRW_REGISTER_TYPE_F] = { GEN10_ALIGN1_3SRC_REG_TYPE_F, E(FLOAT) }, + [BRW_REGISTER_TYPE_HF] = { GEN10_ALIGN1_3SRC_REG_TYPE_HF, E(FLOAT) }, + + [BRW_REGISTER_TYPE_D] = { GEN10_ALIGN1_3SRC_REG_TYPE_D, E(INT) }, + [BRW_REGISTER_TYPE_UD] = { GEN10_ALIGN1_3SRC_REG_TYPE_UD, E(INT) }, + [BRW_REGISTER_TYPE_W] = { GEN10_ALIGN1_3SRC_REG_TYPE_W, E(INT) }, + [BRW_REGISTER_TYPE_UW] = { GEN10_ALIGN1_3SRC_REG_TYPE_UW, E(INT) }, + [BRW_REGISTER_TYPE_B] = { GEN10_ALIGN1_3SRC_REG_TYPE_B, E(INT) }, + [BRW_REGISTER_TYPE_UB] = { GEN10_ALIGN1_3SRC_REG_TYPE_UB, E(INT) }, +}, gen12_hw_3src_type[] = { + [0 ... BRW_REGISTER_TYPE_LAST] = { INVALID }, + + [BRW_REGISTER_TYPE_F] = { GEN12_HW_REG_TYPE_UINT(2), E(FLOAT), }, + [BRW_REGISTER_TYPE_HF] = { GEN12_HW_REG_TYPE_UINT(1), E(FLOAT), }, + + [BRW_REGISTER_TYPE_D] = { GEN12_HW_REG_TYPE_SINT(2), E(INT), }, + [BRW_REGISTER_TYPE_UD] = { GEN12_HW_REG_TYPE_UINT(2), E(INT), }, + [BRW_REGISTER_TYPE_W] = { GEN12_HW_REG_TYPE_SINT(1), E(INT), }, + [BRW_REGISTER_TYPE_UW] = { GEN12_HW_REG_TYPE_UINT(1), E(INT), }, + [BRW_REGISTER_TYPE_B] = { GEN12_HW_REG_TYPE_SINT(0), E(INT), }, + [BRW_REGISTER_TYPE_UB] = { GEN12_HW_REG_TYPE_UINT(0), E(INT), }, #undef E }; @@ -198,17 +293,26 @@ { const struct hw_type *table; - if (devinfo->gen >= 11) { + if (devinfo->gen >= 12) { + assert(type < ARRAY_SIZE(gen12_hw_type)); + table = gen12_hw_type; + } else if (devinfo->gen >= 11) { assert(type < ARRAY_SIZE(gen11_hw_type)); table = gen11_hw_type; + } else if (devinfo->gen >= 8) { + assert(type < ARRAY_SIZE(gen8_hw_type)); + table = gen8_hw_type; + } else if (devinfo->gen >= 7) { + assert(type < ARRAY_SIZE(gen7_hw_type)); + table = gen7_hw_type; + } else if (devinfo->gen >= 6) { + assert(type < ARRAY_SIZE(gen6_hw_type)); + table = gen6_hw_type; } else { assert(type < ARRAY_SIZE(gen4_hw_type)); table = gen4_hw_type; } - assert(devinfo->has_64bit_types || brw_reg_type_to_size(type) < 8 || - type == BRW_REGISTER_TYPE_NF); - if (file == BRW_IMMEDIATE_VALUE) { assert(table[type].imm_type != (enum hw_imm_type)INVALID); return table[type].imm_type; @@ -229,8 +333,16 @@ { const struct hw_type *table; - if (devinfo->gen >= 11) { + if (devinfo->gen >= 12) { + table = gen12_hw_type; + } else if (devinfo->gen >= 11) { table = gen11_hw_type; + } else if (devinfo->gen >= 8) { + table = gen8_hw_type; + } else if (devinfo->gen >= 7) { + table = gen7_hw_type; + } else if (devinfo->gen >= 6) { + table = gen6_hw_type; } else { table = gen4_hw_type; } @@ -248,7 +360,7 @@ } } } - unreachable("not reached"); + return INVALID_REG_TYPE; } /** @@ -259,10 +371,21 @@ brw_reg_type_to_a16_hw_3src_type(const struct gen_device_info *devinfo, enum brw_reg_type type) { - assert(type < ARRAY_SIZE(gen7_hw_3src_type)); - assert(devinfo->gen >= 8 || type != BRW_REGISTER_TYPE_HF); - assert(gen7_hw_3src_type[type].reg_type != (enum hw_3src_reg_type)INVALID); - return gen7_hw_3src_type[type].reg_type; + const struct hw_3src_type *table; + + if (devinfo->gen >= 8) { + assert(type < ARRAY_SIZE(gen8_hw_3src_type)); + table = gen8_hw_3src_type; + } else if (devinfo->gen >= 7) { + assert(type < ARRAY_SIZE(gen7_hw_3src_type)); + table = gen7_hw_3src_type; + } else { + assert(type < ARRAY_SIZE(gen6_hw_3src_type)); + table = gen6_hw_3src_type; + } + + assert(table[type].reg_type != (enum hw_3src_reg_type)INVALID); + return table[type].reg_type; } /** @@ -273,9 +396,16 @@ brw_reg_type_to_a1_hw_3src_type(const struct gen_device_info *devinfo, enum brw_reg_type type) { - assert(type < ARRAY_SIZE(gen10_hw_3src_align1_type)); - assert(gen10_hw_3src_align1_type[type].reg_type != (enum hw_3src_reg_type)INVALID); - return gen10_hw_3src_align1_type[type].reg_type; + if (devinfo->gen >= 12) { + assert(type < ARRAY_SIZE(gen12_hw_3src_type)); + return gen12_hw_3src_type[type].reg_type; + } else if (devinfo->gen >= 11) { + assert(type < ARRAY_SIZE(gen11_hw_3src_type)); + return gen11_hw_3src_type[type].reg_type; + } else { + assert(type < ARRAY_SIZE(gen10_hw_3src_align1_type)); + return gen10_hw_3src_align1_type[type].reg_type; + } } /** @@ -286,13 +416,22 @@ brw_a16_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo, unsigned hw_type) { - assert(devinfo->gen >= 8 || hw_type != GEN8_3SRC_TYPE_HF); + const struct hw_3src_type *table = NULL; + + if (devinfo->gen >= 8) { + table = gen8_hw_3src_type; + } else if (devinfo->gen >= 7) { + table = gen7_hw_3src_type; + } else if (devinfo->gen >= 6) { + table = gen6_hw_3src_type; + } + for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) { - if (gen7_hw_3src_type[i].reg_type == hw_type) { + if (table[i].reg_type == hw_type) { return i; } } - unreachable("not reached"); + return INVALID_REG_TYPE; } /** @@ -303,13 +442,17 @@ brw_a1_hw_3src_type_to_reg_type(const struct gen_device_info *devinfo, unsigned hw_type, unsigned exec_type) { + const struct hw_3src_type *table = (devinfo->gen >= 12 ? gen12_hw_3src_type : + devinfo->gen >= 11 ? gen11_hw_3src_type : + gen10_hw_3src_align1_type); + for (enum brw_reg_type i = 0; i <= BRW_REGISTER_TYPE_LAST; i++) { - if (gen10_hw_3src_align1_type[i].reg_type == hw_type && - gen10_hw_3src_align1_type[i].exec_type == exec_type) { + if (table[i].reg_type == hw_type && + table[i].exec_type == exec_type) { return i; } } - unreachable("not reached"); + return INVALID_REG_TYPE; } /** @@ -336,6 +479,9 @@ [BRW_REGISTER_TYPE_V] = 2, [BRW_REGISTER_TYPE_UV] = 2, }; + if (type >= ARRAY_SIZE(type_size)) + return -1; + return type_size[type]; } @@ -366,6 +512,9 @@ [BRW_REGISTER_TYPE_V] = "V", [BRW_REGISTER_TYPE_UV] = "UV", }; + if (type >= ARRAY_SIZE(letters)) + return "INVALID"; + assert(type < ARRAY_SIZE(letters)); return letters[type]; } diff -Nru mesa-19.2.8/src/intel/compiler/brw_reg_type.h mesa-20.0.8/src/intel/compiler/brw_reg_type.h --- mesa-19.2.8/src/intel/compiler/brw_reg_type.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_reg_type.h 2020-06-12 01:21:17.000000000 +0000 @@ -100,6 +100,9 @@ } } +#define INVALID_REG_TYPE ((enum brw_reg_type)-1) +#define INVALID_HW_REG_TYPE ((unsigned)-1) + unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo, enum brw_reg_file file, enum brw_reg_type type); diff -Nru mesa-19.2.8/src/intel/compiler/brw_schedule_instructions.cpp mesa-20.0.8/src/intel/compiler/brw_schedule_instructions.cpp --- mesa-19.2.8/src/intel/compiler/brw_schedule_instructions.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_schedule_instructions.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -412,6 +412,11 @@ latency = 14000; break; + case GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE: + /* completely fabricated number */ + latency = 600; + break; + default: unreachable("Unknown render cache message"); } @@ -419,6 +424,8 @@ case GEN7_SFID_DATAPORT_DATA_CACHE: switch ((inst->desc >> 14) & 0x1f) { + case GEN7_DATAPORT_DC_DWORD_SCATTERED_READ: + case GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE: case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ: case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE: /* We have no data for this but assume it's roughly the same as @@ -1183,7 +1190,7 @@ } if (inst->mlen > 0 && inst->base_mrf != -1) { - for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { add_dep(last_mrf_write[inst->base_mrf + i], n); last_mrf_write[inst->base_mrf + i] = n; } @@ -1306,7 +1313,7 @@ } if (inst->mlen > 0 && inst->base_mrf != -1) { - for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { last_mrf_write[inst->base_mrf + i] = n; } } diff -Nru mesa-19.2.8/src/intel/compiler/brw_shader.cpp mesa-20.0.8/src/intel/compiler/brw_shader.cpp --- mesa-19.2.8/src/intel/compiler/brw_shader.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_shader.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -164,7 +164,7 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) { switch (op) { - case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP: + case 0 ... NUM_BRW_OPCODES - 1: /* The DO instruction doesn't exist on Gen6+, but we use it to mark the * start of a loop in the IR. */ @@ -323,6 +323,8 @@ return "typed_surface_write_logical"; case SHADER_OPCODE_MEMORY_FENCE: return "memory_fence"; + case FS_OPCODE_SCHEDULING_FENCE: + return "scheduling_fence"; case SHADER_OPCODE_INTERLOCK: /* For an interlock we actually issue a memory fence via sendc. */ return "interlock"; @@ -331,6 +333,10 @@ return "byte_scattered_read_logical"; case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: return "byte_scattered_write_logical"; + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: + return "dword_scattered_read_logical"; + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: + return "dword_scattered_write_logical"; case SHADER_OPCODE_LOAD_PAYLOAD: return "load_payload"; @@ -358,6 +364,9 @@ case SHADER_OPCODE_FIND_LIVE_CHANNEL: return "find_live_channel"; + case FS_OPCODE_LOAD_LIVE_CHANNELS: + return "load_live_channels"; + case SHADER_OPCODE_BROADCAST: return "broadcast"; case SHADER_OPCODE_SHUFFLE: @@ -487,6 +496,10 @@ return "barrier"; case SHADER_OPCODE_MULH: return "mulh"; + case SHADER_OPCODE_ISUB_SAT: + return "isub_sat"; + case SHADER_OPCODE_USUB_SAT: + return "usub_sat"; case SHADER_OPCODE_MOV_INDIRECT: return "mov_indirect"; @@ -519,6 +532,8 @@ case SHADER_OPCODE_RND_MODE: return "rnd_mode"; + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + return "float_control_mode"; } unreachable("not reached"); @@ -1040,6 +1055,7 @@ case SHADER_OPCODE_SEND: return send_has_side_effects; + case BRW_OPCODE_SYNC: case VEC4_OPCODE_UNTYPED_ATOMIC: case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: @@ -1052,6 +1068,7 @@ case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_MEMORY_FENCE: @@ -1067,6 +1084,8 @@ case TCS_OPCODE_URB_WRITE: case TCS_OPCODE_RELEASE_INPUT: case SHADER_OPCODE_RND_MODE: + case SHADER_OPCODE_FLOAT_CONTROL_MODE: + case FS_OPCODE_SCHEDULING_FENCE: return true; default: return eot; @@ -1084,6 +1103,7 @@ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_URB_READ_SIMD8: @@ -1233,7 +1253,6 @@ const struct brw_vue_map *input_vue_map, struct brw_tes_prog_data *prog_data, nir_shader *nir, - struct gl_program *prog, int shader_time_index, struct brw_compile_stats *stats, char **error_str) @@ -1324,7 +1343,7 @@ if (is_scalar) { fs_visitor v(compiler, log_data, mem_ctx, &key->base, - &prog_data->base.base, NULL, nir, 8, + &prog_data->base.base, nir, 8, shader_time_index, input_vue_map); if (!v.run_tes()) { if (error_str) diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4.cpp mesa-20.0.8/src/intel/compiler/brw_vec4.cpp --- mesa-19.2.8/src/intel/compiler/brw_vec4.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -633,6 +633,9 @@ void vec4_visitor::pack_uniform_registers() { + if (!compiler->compact_params) + return; + uint8_t chans_used[this->uniforms]; int new_loc[this->uniforms]; int new_chan[this->uniforms]; @@ -2964,7 +2967,6 @@ fs_visitor v(compiler, log_data, mem_ctx, &key->base, &prog_data->base.base, - NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ shader, 8, shader_time_index); if (!v.run_vs()) { if (error_str) diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4_generator.cpp mesa-20.0.8/src/intel/compiler/brw_vec4_generator.cpp --- mesa-19.2.8/src/intel/compiler/brw_vec4_generator.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4_generator.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -270,6 +270,17 @@ break; } + /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type + * is set as part of the message descriptor. On gen4, the PRM seems to + * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on + * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is + * gone from the message descriptor entirely and you just get UINT32 all + * the time regasrdless. Since we can really only do non-UINT32 on gen4, + * just stomp it to UINT32 all the time. + */ + if (inst->opcode == SHADER_OPCODE_TXS) + return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; + uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || inst->opcode == SHADER_OPCODE_TG4_OFFSET) ? prog_data->base.binding_table.gather_texture_start @@ -1208,7 +1219,7 @@ /* If the instruction is predicated, we'll predicate the send, not * the header setup. */ - brw_set_default_predicate_control(p, false); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); gen6_resolve_implied_move(p, &header, inst->base_mrf); @@ -1505,8 +1516,15 @@ bool debug_flag = INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->info.stage); struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ int spill_count = 0, fill_count = 0; - int loop_count = 0; + int loop_count = 0, send_count = 0; foreach_block_and_inst (block, vec4_instruction, inst, cfg) { struct brw_reg src[3], dst; @@ -1746,6 +1764,7 @@ generate_math_gen6(p, inst, dst, src[0], brw_null_reg()); } else { generate_math1_gen4(p, inst, dst, src[0]); + send_count++; } break; @@ -1759,6 +1778,7 @@ generate_math_gen6(p, inst, dst, src[0], src[1]); } else { generate_math2_gen4(p, inst, dst, src[0], src[1]); + send_count++; } break; @@ -1775,14 +1795,17 @@ case SHADER_OPCODE_SAMPLEINFO: generate_tex(p, prog_data, nir->info.stage, inst, dst, src[0], src[1], src[2]); + send_count++; break; case SHADER_OPCODE_GET_BUFFER_SIZE: generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]); + send_count++; break; case VS_OPCODE_URB_WRITE: generate_vs_urb_write(p, inst); + send_count++; break; case SHADER_OPCODE_GEN4_SCRATCH_READ: @@ -1797,10 +1820,12 @@ case VS_OPCODE_PULL_CONSTANT_LOAD: generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]); + send_count++; break; case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]); + send_count++; break; case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: @@ -1809,14 +1834,17 @@ case GS_OPCODE_URB_WRITE: generate_gs_urb_write(p, inst); + send_count++; break; case GS_OPCODE_URB_WRITE_ALLOCATE: generate_gs_urb_write_allocate(p, inst); + send_count++; break; case GS_OPCODE_SVB_WRITE: generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]); + send_count++; break; case GS_OPCODE_SVB_SET_DST_INDEX: @@ -1825,6 +1853,7 @@ case GS_OPCODE_THREAD_END: generate_gs_thread_end(p, inst); + send_count++; break; case GS_OPCODE_SET_WRITE_OFFSET: @@ -1837,6 +1866,7 @@ case GS_OPCODE_FF_SYNC: generate_gs_ff_sync(p, inst, dst, src[0], src[1]); + send_count++; break; case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: @@ -1866,12 +1896,14 @@ case SHADER_OPCODE_SHADER_TIME_ADD: brw_shader_time_add(p, src[0], prog_data->base.binding_table.shader_time_start); + send_count++; break; case VEC4_OPCODE_UNTYPED_ATOMIC: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, !inst->dst.is_null(), inst->header_size); + send_count++; break; case VEC4_OPCODE_UNTYPED_SURFACE_READ: @@ -1879,16 +1911,19 @@ assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, src[2].ud); + send_count++; break; case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_write(p, src[0], src[1], inst->mlen, src[2].ud, inst->header_size); + send_count++; break; case SHADER_OPCODE_MEMORY_FENCE: brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0); + send_count++; break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: { @@ -2068,10 +2103,12 @@ case TCS_OPCODE_URB_WRITE: generate_tcs_urb_write(p, inst, src[0]); + send_count++; break; case VEC4_OPCODE_URB_READ: generate_vec4_urb_read(p, inst, dst, src[0]); + send_count++; break; case TCS_OPCODE_SET_INPUT_URB_OFFSETS: @@ -2113,15 +2150,18 @@ case TCS_OPCODE_RELEASE_INPUT: generate_tcs_release_input(p, dst, src[0], src[1]); + send_count++; break; case TCS_OPCODE_THREAD_END: generate_tcs_thread_end(p, inst); + send_count++; break; case SHADER_OPCODE_BARRIER: brw_barrier(p, src[0]); brw_WAIT(p); + send_count++; break; case SHADER_OPCODE_MOV_INDIRECT: @@ -2188,9 +2228,9 @@ sha1buf); fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d " - "spills:fills. Compacted %d to %d bytes (%.0f%%)\n", + "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n", stage_abbrev, before_size / 16, loop_count, cfg->cycle_count, - spill_count, fill_count, before_size, after_size, + spill_count, fill_count, send_count, before_size, after_size, 100.0f * (before_size - after_size) / before_size); /* overriding the shader makes disasm_info invalid */ @@ -2205,10 +2245,11 @@ compiler->shader_debug_log(log_data, "%s vec4 shader: %d inst, %d loops, %u cycles, " - "%d:%d spills:fills, compacted %d to %d bytes.", + "%d:%d spills:fills, %u sends, " + "compacted %d to %d bytes.", stage_abbrev, before_size / 16, loop_count, cfg->cycle_count, spill_count, - fill_count, before_size, after_size); + fill_count, send_count, before_size, after_size); if (stats) { stats->dispatch_width = 0; stats->instructions = before_size / 16; diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4_live_variables.cpp mesa-20.0.8/src/intel/compiler/brw_vec4_live_variables.cpp --- mesa-19.2.8/src/intel/compiler/brw_vec4_live_variables.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4_live_variables.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -71,7 +71,7 @@ assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); foreach_inst_in_block(vec4_instruction, inst, block) { - struct block_data *bd = &block_data[block->num]; + struct vec4_block_data *bd = &block_data[block->num]; /* Set use[] for this instruction */ for (unsigned int i = 0; i < 3; i++) { @@ -137,11 +137,11 @@ cont = false; foreach_block_reverse (block, cfg) { - struct block_data *bd = &block_data[block->num]; + struct vec4_block_data *bd = &block_data[block->num]; /* Update liveout */ foreach_list_typed(bblock_link, child_link, link, &block->children) { - struct block_data *child_bd = &block_data[child_link->block->num]; + struct vec4_block_data *child_bd = &block_data[child_link->block->num]; for (int i = 0; i < bitset_words; i++) { BITSET_WORD new_liveout = (child_bd->livein[i] & @@ -187,7 +187,7 @@ mem_ctx = ralloc_context(NULL); num_vars = alloc.total_size * 8; - block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + block_data = rzalloc_array(mem_ctx, struct vec4_block_data, cfg->num_blocks); bitset_words = BITSET_WORDS(num_vars); for (int i = 0; i < cfg->num_blocks; i++) { @@ -288,7 +288,7 @@ this->live_intervals = new(mem_ctx) vec4_live_variables(alloc, cfg); foreach_block (block, cfg) { - struct block_data *bd = &live_intervals->block_data[block->num]; + struct vec4_block_data *bd = &live_intervals->block_data[block->num]; for (int i = 0; i < live_intervals->num_vars; i++) { if (BITSET_TEST(bd->livein, i)) { diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4_live_variables.h mesa-20.0.8/src/intel/compiler/brw_vec4_live_variables.h --- mesa-19.2.8/src/intel/compiler/brw_vec4_live_variables.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4_live_variables.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,7 +33,7 @@ namespace brw { -struct block_data { +struct vec4_block_data { /** * Which variables are defined before being used in the block. * @@ -70,7 +70,7 @@ int bitset_words; /** Per-basic-block information on live variables */ - struct block_data *block_data; + struct vec4_block_data *block_data; protected: void setup_def_use(); diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4_nir.cpp mesa-20.0.8/src/intel/compiler/brw_vec4_nir.cpp --- mesa-19.2.8/src/intel/compiler/brw_vec4_nir.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4_nir.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -548,46 +548,17 @@ break; } - case nir_intrinsic_ssbo_atomic_add: { - int op = BRW_AOP_ADD; - - if (nir_src_is_const(instr->src[2])) { - int add_val = nir_src_as_int(instr->src[2]); - if (add_val == 1) - op = BRW_AOP_INC; - else if (add_val == -1) - op = BRW_AOP_DEC; - } - - nir_emit_ssbo_atomic(op, instr); - break; - } + case nir_intrinsic_ssbo_atomic_add: case nir_intrinsic_ssbo_atomic_imin: - nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); - break; case nir_intrinsic_ssbo_atomic_umin: - nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); - break; case nir_intrinsic_ssbo_atomic_imax: - nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); - break; case nir_intrinsic_ssbo_atomic_umax: - nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); - break; case nir_intrinsic_ssbo_atomic_and: - nir_emit_ssbo_atomic(BRW_AOP_AND, instr); - break; case nir_intrinsic_ssbo_atomic_or: - nir_emit_ssbo_atomic(BRW_AOP_OR, instr); - break; case nir_intrinsic_ssbo_atomic_xor: - nir_emit_ssbo_atomic(BRW_AOP_XOR, instr); - break; case nir_intrinsic_ssbo_atomic_exchange: - nir_emit_ssbo_atomic(BRW_AOP_MOV, instr); - break; case nir_intrinsic_ssbo_atomic_comp_swap: - nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr); + nir_emit_ssbo_atomic(brw_aop_for_nir_intrinsic(instr), instr); break; case nir_intrinsic_load_vertex_id: @@ -656,6 +627,8 @@ case nir_intrinsic_load_ubo: { src_reg surf_index; + prog_data->base.has_ubo_pull = true; + dest = get_nir_dest(instr->dest); if (nir_src_is_const(instr->src[0])) { @@ -727,7 +700,8 @@ break; } - case nir_intrinsic_memory_barrier: { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_scoped_memory_barrier: { const vec4_builder bld = vec4_builder(this).at_end().annotate(current_annotation, base_ir); const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); @@ -992,7 +966,7 @@ case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: { int first_comp = -1; - int d; + int d = 0; for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { if (nir_alu_instr_channel_used(instr, idx, i)) { @@ -1007,6 +981,8 @@ } } + assert(first_comp >= 0); + if (op[idx].abs) d = MAX2(-d, d); @@ -1054,7 +1030,8 @@ } else { uint8_t vf_values[4] = { 0, 0, 0, 0 }; - for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { + for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) { + if (op[idx].abs) f[i] = fabs(f[i]); @@ -1134,6 +1111,18 @@ } } +static bool +const_src_fits_in_16_bits(const nir_src &src, brw_reg_type type) +{ + assert(nir_src_is_const(src)); + if (type_is_unsigned_int(type)) { + return nir_src_comp_as_uint(src, 0) <= UINT16_MAX; + } else { + const int64_t c = nir_src_comp_as_int(src, 0); + return c <= INT16_MAX && c >= INT16_MIN; + } +} + void vec4_visitor::nir_emit_alu(nir_alu_instr *instr) { @@ -1242,14 +1231,14 @@ */ if (nir_src_is_const(instr->src[0].src) && nir_alu_instr_src_read_mask(instr, 0) == 1 && - nir_src_comp_as_uint(instr->src[0].src, 0) < (1 << 16)) { + const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) { if (devinfo->gen < 7) emit(MUL(dst, op[0], op[1])); else emit(MUL(dst, op[1], op[0])); } else if (nir_src_is_const(instr->src[1].src) && nir_alu_instr_src_read_mask(instr, 1) == 1 && - nir_src_comp_as_uint(instr->src[1].src, 0) < (1 << 16)) { + const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) { if (devinfo->gen < 7) emit(MUL(dst, op[1], op[0])); else @@ -1391,6 +1380,12 @@ case nir_op_ftrunc: inst = emit(RNDZ(dst, op[0])); + if (devinfo->gen < 6) { + inst->conditional_mod = BRW_CONDITIONAL_R; + inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ + } inst->saturate = instr->dest.saturate; break; @@ -1421,6 +1416,12 @@ case nir_op_fround_even: inst = emit(RNDE(dst, op[0])); + if (devinfo->gen < 6) { + inst->conditional_mod = BRW_CONDITIONAL_R; + inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ + } inst->saturate = instr->dest.saturate; break; diff -Nru mesa-19.2.8/src/intel/compiler/brw_vec4_tcs.cpp mesa-20.0.8/src/intel/compiler/brw_vec4_tcs.cpp --- mesa-19.2.8/src/intel/compiler/brw_vec4_tcs.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/brw_vec4_tcs.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -308,13 +308,16 @@ break; } - case nir_intrinsic_barrier: { + case nir_intrinsic_control_barrier: { dst_reg header = dst_reg(this, glsl_type::uvec4_type); emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); break; } + case nir_intrinsic_memory_barrier_tcs_patch: + break; + default: vec4_visitor::nir_emit_intrinsic(instr); } @@ -360,12 +363,13 @@ nir->info.system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID); if (compiler->use_tcs_8_patch && - nir->info.tess.tcs_vertices_out <= 16 && - 2 + has_primitive_id + key->input_vertices <= 31) { - /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode. First, - * the "Instance" field limits the number of output vertices to [1, 16]. - * Secondly, the "Dispatch GRF Start Register for URB Data" field is - * limited to [0, 31] - which imposes a limit on the input vertices. + nir->info.tess.tcs_vertices_out <= (devinfo->gen >= 12 ? 32 : 16) && + 2 + has_primitive_id + key->input_vertices <= (devinfo->gen >= 12 ? 63 : 31)) { + /* 3DSTATE_HS imposes two constraints on using 8_PATCH mode. First, the + * "Instance" field limits the number of output vertices to [1, 16] on + * gen11 and below, or [1, 32] on gen12 and above. Secondly, the + * "Dispatch GRF Start Register for URB Data" field is limited to [0, + * 31] - which imposes a limit on the input vertices. */ vue_prog_data->dispatch_mode = DISPATCH_MODE_TCS_8_PATCH; prog_data->instances = nir->info.tess.tcs_vertices_out; @@ -426,7 +430,7 @@ if (is_scalar) { fs_visitor v(compiler, log_data, mem_ctx, &key->base, - &prog_data->base.base, NULL, nir, 8, + &prog_data->base.base, nir, 8, shader_time_index, &input_vue_map); if (!v.run_tcs()) { if (error_str) diff -Nru mesa-19.2.8/src/intel/compiler/meson.build mesa-20.0.8/src/intel/compiler/meson.build --- mesa-19.2.8/src/intel/compiler/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ 'brw_disasm.c', 'brw_disasm_info.c', 'brw_disasm_info.h', - 'brw_eu.c', + 'brw_eu.cpp', 'brw_eu_compact.c', 'brw_eu_defines.h', 'brw_eu_emit.c', @@ -62,6 +62,7 @@ 'brw_fs_reg_allocate.cpp', 'brw_fs_register_coalesce.cpp', 'brw_fs_saturate_propagation.cpp', + 'brw_fs_scoreboard.cpp', 'brw_fs_sel_peephole.cpp', 'brw_fs_validate.cpp', 'brw_fs_visitor.cpp', @@ -77,10 +78,12 @@ 'brw_nir_attribute_workarounds.c', 'brw_nir_lower_conversions.c', 'brw_nir_lower_cs_intrinsics.c', + 'brw_nir_lower_alpha_to_coverage.c', 'brw_nir_lower_image_load_store.c', 'brw_nir_lower_mem_access_bit_sizes.c', 'brw_nir_opt_peephole_ffma.c', 'brw_nir_tcs_workarounds.c', + 'brw_nir_clamp_image_1d_2d_array_sizes.c', 'brw_packed_float.c', 'brw_predicated_break.cpp', 'brw_reg.h', @@ -147,7 +150,7 @@ 'fs_saturate_propagation', 'vf_float_conversions', 'vec4_register_coalesce', 'vec4_copy_propagation', 'vec4_cmod_propagation', 'vec4_dead_code_eliminate', - 'eu_compact', 'eu_validate'] + 'eu_compact', 'eu_validate', 'fs_scoreboard'] test( t, executable( diff -Nru mesa-19.2.8/src/intel/compiler/test_eu_compact.cpp mesa-20.0.8/src/intel/compiler/test_eu_compact.cpp --- mesa-19.2.8/src/intel/compiler/test_eu_compact.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_eu_compact.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -26,6 +26,7 @@ #include #include "util/ralloc.h" #include "brw_eu.h" +#include "brw_gen_enum.h" static bool test_compact_instruction(struct brw_codegen *p, brw_inst src) @@ -74,7 +75,7 @@ } if (devinfo->gen == 8 && !devinfo->is_cherryview && - is_3src(devinfo, (opcode)brw_inst_opcode(devinfo, inst))) { + is_3src(devinfo, brw_inst_opcode(devinfo, inst))) { brw_inst_set_bits(inst, 105, 105, 0); brw_inst_set_bits(inst, 84, 84, 0); brw_inst_set_bits(inst, 36, 35, 0); @@ -92,7 +93,7 @@ if (bit == 29) return true; - if (is_3src(devinfo, (opcode)brw_inst_opcode(devinfo, src))) { + if (is_3src(devinfo, brw_inst_opcode(devinfo, src))) { if (devinfo->gen >= 9 || devinfo->is_cherryview) { if (bit == 127) return true; @@ -159,6 +160,9 @@ clear_pad_bits(p->devinfo, &instr); + if (!brw_validate_instruction(p->devinfo, &instr, 0, NULL)) + continue; + if (!test_compact_instruction(p, instr)) { printf(" twiddled bits for fuzzing %d, %d\n", bit0, bit1); return false; @@ -243,7 +247,7 @@ struct brw_reg g2 = brw_vec8_grf(2, 0); brw_push_insn_state(p); - brw_set_default_predicate_control(p, true); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); brw_MOV(p, g0, g2); brw_pop_insn_state(p); } @@ -259,7 +263,7 @@ struct brw_reg g2 = brw_vec8_grf(2, 0); brw_push_insn_state(p); - brw_set_default_predicate_control(p, true); + brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL); brw_inst *mov = brw_MOV(p, g0, g2); brw_inst_set_flag_subreg_nr(p->devinfo, mov, 1); brw_pop_insn_state(p); @@ -267,16 +271,17 @@ struct { void (*func)(struct brw_codegen *p); + int gens; } tests[] = { - { gen_MOV_GRF_GRF }, - { gen_ADD_GRF_GRF_GRF }, - { gen_ADD_GRF_GRF_IMM }, - { gen_ADD_GRF_GRF_IMM_d }, - { gen_ADD_MRF_GRF_GRF }, - { gen_ADD_vec1_GRF_GRF_GRF }, - { gen_PLN_MRF_GRF_GRF }, - { gen_f0_0_MOV_GRF_GRF }, - { gen_f0_1_MOV_GRF_GRF }, + { gen_MOV_GRF_GRF, GEN_ALL }, + { gen_ADD_GRF_GRF_GRF, GEN_ALL }, + { gen_ADD_GRF_GRF_IMM, GEN_ALL }, + { gen_ADD_GRF_GRF_IMM_d, GEN_ALL }, + { gen_ADD_MRF_GRF_GRF, GEN_LE(GEN6) }, + { gen_ADD_vec1_GRF_GRF_GRF, GEN_ALL }, + { gen_PLN_MRF_GRF_GRF, GEN_LE(GEN6) }, + { gen_f0_0_MOV_GRF_GRF, GEN_ALL }, + { gen_f0_1_MOV_GRF_GRF, GEN_ALL }, }; static bool @@ -286,7 +291,14 @@ bool fail = false; for (unsigned i = 0; i < ARRAY_SIZE(tests); i++) { + if ((tests[i].gens & gen_from_devinfo(devinfo)) == 0) + continue; + for (int align_16 = 0; align_16 <= 1; align_16++) { + /* Align16 support is not present on Gen11+ */ + if (devinfo->gen >= 11 && align_16) + continue; + struct brw_codegen *p = rzalloc(NULL, struct brw_codegen); brw_init_codegen(devinfo, p, p); @@ -322,7 +334,7 @@ struct gen_device_info *devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); bool fail = false; - for (devinfo->gen = 5; devinfo->gen <= 9; devinfo->gen++) { + for (devinfo->gen = 5; devinfo->gen <= 12; devinfo->gen++) { fail |= run_tests(devinfo); } diff -Nru mesa-19.2.8/src/intel/compiler/test_eu_validate.cpp mesa-20.0.8/src/intel/compiler/test_eu_validate.cpp --- mesa-19.2.8/src/intel/compiler/test_eu_validate.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_eu_validate.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -23,6 +23,8 @@ #include #include "brw_eu.h" +#include "brw_eu_defines.h" +#include "util/bitset.h" #include "util/ralloc.h" static const struct gen_info { @@ -46,6 +48,7 @@ { "whl", }, { "cnl", }, { "icl", }, + { "tgl", }, }; class validation_test: public ::testing::TestWithParam { @@ -178,7 +181,7 @@ * reserved on Gen 7 * "goto" on Gen8+ */ - brw_next_insn(p, 46); + brw_next_insn(p, brw_opcode_decode(&devinfo, 46)); if (devinfo.gen == 7) { EXPECT_FALSE(validate(p)); @@ -187,6 +190,397 @@ } } +TEST_P(validation_test, invalid_exec_size_encoding) +{ + const struct { + enum brw_execution_size exec_size; + bool expected_result; + } test_case[] = { + { BRW_EXECUTE_1, true }, + { BRW_EXECUTE_2, true }, + { BRW_EXECUTE_4, true }, + { BRW_EXECUTE_8, true }, + { BRW_EXECUTE_16, true }, + { BRW_EXECUTE_32, true }, + + { (enum brw_execution_size)((int)BRW_EXECUTE_32 + 1), false }, + { (enum brw_execution_size)((int)BRW_EXECUTE_32 + 2), false }, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(test_case); i++) { + brw_MOV(p, g0, g0); + + brw_inst_set_exec_size(&devinfo, last_inst, test_case[i].exec_size); + brw_inst_set_src0_file_type(&devinfo, last_inst, BRW_GENERAL_REGISTER_FILE, BRW_REGISTER_TYPE_W); + brw_inst_set_dst_file_type(&devinfo, last_inst, BRW_GENERAL_REGISTER_FILE, BRW_REGISTER_TYPE_W); + + if (test_case[i].exec_size == BRW_EXECUTE_1) { + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0); + } else { + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_2); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_2); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + } + + EXPECT_EQ(test_case[i].expected_result, validate(p)); + + clear_instructions(p); + } +} + +TEST_P(validation_test, invalid_file_encoding) +{ + /* Register file on Gen12 is only one bit */ + if (devinfo.gen >= 12) + return; + + brw_MOV(p, g0, g0); + brw_inst_set_dst_file_type(&devinfo, last_inst, BRW_MESSAGE_REGISTER_FILE, BRW_REGISTER_TYPE_F); + + if (devinfo.gen > 6) { + EXPECT_FALSE(validate(p)); + } else { + EXPECT_TRUE(validate(p)); + } + + clear_instructions(p); + + if (devinfo.gen < 6) { + gen4_math(p, g0, BRW_MATH_FUNCTION_SIN, 0, g0, BRW_MATH_PRECISION_FULL); + } else { + gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null); + } + brw_inst_set_src0_file_type(&devinfo, last_inst, BRW_MESSAGE_REGISTER_FILE, BRW_REGISTER_TYPE_F); + + if (devinfo.gen > 6) { + EXPECT_FALSE(validate(p)); + } else { + EXPECT_TRUE(validate(p)); + } +} + +TEST_P(validation_test, invalid_type_encoding) +{ + enum brw_reg_file files[2] = { + BRW_GENERAL_REGISTER_FILE, + BRW_IMMEDIATE_VALUE, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(files); i++) { + const enum brw_reg_file file = files[i]; + const int num_bits = devinfo.gen >= 8 ? 4 : 3; + const int num_encodings = 1 << num_bits; + + /* The data types are encoded into bits to be used in hardware + * instructions, so keep a record in a bitset the invalid patterns so + * they can be verified to be invalid when used. + */ + BITSET_DECLARE(invalid_encodings, num_encodings); + + const struct { + enum brw_reg_type type; + bool expected_result; + } test_case[] = { + { BRW_REGISTER_TYPE_NF, devinfo.gen == 11 && file != IMM }, + { BRW_REGISTER_TYPE_DF, devinfo.has_64bit_float && (devinfo.gen >= 8 || file != IMM) }, + { BRW_REGISTER_TYPE_F, true }, + { BRW_REGISTER_TYPE_HF, devinfo.gen >= 8 }, + { BRW_REGISTER_TYPE_VF, file == IMM }, + { BRW_REGISTER_TYPE_Q, devinfo.has_64bit_int }, + { BRW_REGISTER_TYPE_UQ, devinfo.has_64bit_int }, + { BRW_REGISTER_TYPE_D, true }, + { BRW_REGISTER_TYPE_UD, true }, + { BRW_REGISTER_TYPE_W, true }, + { BRW_REGISTER_TYPE_UW, true }, + { BRW_REGISTER_TYPE_B, file == FIXED_GRF }, + { BRW_REGISTER_TYPE_UB, file == FIXED_GRF }, + { BRW_REGISTER_TYPE_V, file == IMM }, + { BRW_REGISTER_TYPE_UV, devinfo.gen >= 6 && file == IMM }, + }; + + /* Initially assume all hardware encodings are invalid */ + BITSET_ONES(invalid_encodings); + + brw_set_default_exec_size(p, BRW_EXECUTE_4); + + for (unsigned i = 0; i < ARRAY_SIZE(test_case); i++) { + if (test_case[i].expected_result) { + unsigned hw_type = brw_reg_type_to_hw_type(&devinfo, file, test_case[i].type); + if (hw_type != INVALID_REG_TYPE) { + /* ... and remove valid encodings from the set */ + assert(BITSET_TEST(invalid_encodings, hw_type)); + BITSET_CLEAR(invalid_encodings, hw_type); + } + + if (file == FIXED_GRF) { + struct brw_reg g = retype(g0, test_case[i].type); + brw_MOV(p, g, g); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + } else { + enum brw_reg_type t; + + switch (test_case[i].type) { + case BRW_REGISTER_TYPE_V: + t = BRW_REGISTER_TYPE_W; + break; + case BRW_REGISTER_TYPE_UV: + t = BRW_REGISTER_TYPE_UW; + break; + case BRW_REGISTER_TYPE_VF: + t = BRW_REGISTER_TYPE_F; + break; + default: + t = test_case[i].type; + break; + } + + struct brw_reg g = retype(g0, t); + brw_MOV(p, g, retype(brw_imm_w(0), test_case[i].type)); + } + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + } + } + + /* The remaining encodings in invalid_encodings do not have a mapping + * from BRW_REGISTER_TYPE_* and must be invalid. Verify that invalid + * encodings are rejected by the validator. + */ + int e; + BITSET_FOREACH_SET(e, invalid_encodings, num_encodings) { + if (file == FIXED_GRF) { + brw_MOV(p, g0, g0); + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4); + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4); + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1); + } else { + brw_MOV(p, g0, brw_imm_w(0)); + } + brw_inst_set_dst_reg_hw_type(&devinfo, last_inst, e); + brw_inst_set_src0_reg_hw_type(&devinfo, last_inst, e); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + } + } +} + +TEST_P(validation_test, invalid_type_encoding_3src_a16) +{ + /* 3-src instructions in align16 mode only supported on Gen6-10 */ + if (devinfo.gen < 6 || devinfo.gen > 10) + return; + + const int num_bits = devinfo.gen >= 8 ? 3 : 2; + const int num_encodings = 1 << num_bits; + + /* The data types are encoded into bits to be used in hardware + * instructions, so keep a record in a bitset the invalid patterns so + * they can be verified to be invalid when used. + */ + BITSET_DECLARE(invalid_encodings, num_encodings); + + const struct { + enum brw_reg_type type; + bool expected_result; + } test_case[] = { + { BRW_REGISTER_TYPE_DF, devinfo.gen >= 7 }, + { BRW_REGISTER_TYPE_F, true }, + { BRW_REGISTER_TYPE_HF, devinfo.gen >= 8 }, + { BRW_REGISTER_TYPE_D, devinfo.gen >= 7 }, + { BRW_REGISTER_TYPE_UD, devinfo.gen >= 7 }, + }; + + /* Initially assume all hardware encodings are invalid */ + BITSET_ONES(invalid_encodings); + + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + + for (unsigned i = 0; i < ARRAY_SIZE(test_case); i++) { + if (test_case[i].expected_result) { + unsigned hw_type = brw_reg_type_to_a16_hw_3src_type(&devinfo, test_case[i].type); + if (hw_type != INVALID_HW_REG_TYPE) { + /* ... and remove valid encodings from the set */ + assert(BITSET_TEST(invalid_encodings, hw_type)); + BITSET_CLEAR(invalid_encodings, hw_type); + } + + struct brw_reg g = retype(g0, test_case[i].type); + if (!brw_reg_type_is_integer(test_case[i].type)) { + brw_MAD(p, g, g, g, g); + } else { + brw_BFE(p, g, g, g, g); + } + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + } + } + + /* The remaining encodings in invalid_encodings do not have a mapping + * from BRW_REGISTER_TYPE_* and must be invalid. Verify that invalid + * encodings are rejected by the validator. + */ + int e; + BITSET_FOREACH_SET(e, invalid_encodings, num_encodings) { + for (unsigned i = 0; i < 2; i++) { + if (i == 0) { + brw_MAD(p, g0, g0, g0, g0); + } else { + brw_BFE(p, g0, g0, g0, g0); + } + + brw_inst_set_3src_a16_dst_hw_type(&devinfo, last_inst, e); + brw_inst_set_3src_a16_src_hw_type(&devinfo, last_inst, e); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + + if (devinfo.gen == 6) + break; + } + } +} + +TEST_P(validation_test, invalid_type_encoding_3src_a1) +{ + /* 3-src instructions in align1 mode only supported on Gen10+ */ + if (devinfo.gen < 10) + return; + + const int num_bits = 3 + 1 /* for exec_type */; + const int num_encodings = 1 << num_bits; + + /* The data types are encoded into bits to be used in hardware + * instructions, so keep a record in a bitset the invalid patterns so + * they can be verified to be invalid when used. + */ + BITSET_DECLARE(invalid_encodings, num_encodings); + + const struct { + enum brw_reg_type type; + unsigned exec_type; + bool expected_result; + } test_case[] = { +#define E(x) ((unsigned)BRW_ALIGN1_3SRC_EXEC_TYPE_##x) + { BRW_REGISTER_TYPE_NF, E(FLOAT), devinfo.gen == 11 }, + { BRW_REGISTER_TYPE_DF, E(FLOAT), devinfo.has_64bit_float }, + { BRW_REGISTER_TYPE_F, E(FLOAT), true }, + { BRW_REGISTER_TYPE_HF, E(FLOAT), true }, + { BRW_REGISTER_TYPE_D, E(INT), true }, + { BRW_REGISTER_TYPE_UD, E(INT), true }, + { BRW_REGISTER_TYPE_W, E(INT), true }, + { BRW_REGISTER_TYPE_UW, E(INT), true }, + + /* There are no ternary instructions that can operate on B-type sources + * on Gen11-12. Src1/Src2 cannot be B-typed either. + */ + { BRW_REGISTER_TYPE_B, E(INT), devinfo.gen == 10 }, + { BRW_REGISTER_TYPE_UB, E(INT), devinfo.gen == 10 }, + }; + + /* Initially assume all hardware encodings are invalid */ + BITSET_ONES(invalid_encodings); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_exec_size(p, BRW_EXECUTE_4); + + for (unsigned i = 0; i < ARRAY_SIZE(test_case); i++) { + if (test_case[i].expected_result) { + unsigned hw_type = brw_reg_type_to_a1_hw_3src_type(&devinfo, test_case[i].type); + unsigned hw_exec_type = hw_type | (test_case[i].exec_type << 3); + if (hw_type != INVALID_HW_REG_TYPE) { + /* ... and remove valid encodings from the set */ + assert(BITSET_TEST(invalid_encodings, hw_exec_type)); + BITSET_CLEAR(invalid_encodings, hw_exec_type); + } + + struct brw_reg g = retype(g0, test_case[i].type); + if (!brw_reg_type_is_integer(test_case[i].type)) { + brw_MAD(p, g, g, g, g); + } else { + brw_BFE(p, g, g, g, g); + } + + EXPECT_TRUE(validate(p)); + + clear_instructions(p); + } + } + + /* The remaining encodings in invalid_encodings do not have a mapping + * from BRW_REGISTER_TYPE_* and must be invalid. Verify that invalid + * encodings are rejected by the validator. + */ + int e; + BITSET_FOREACH_SET(e, invalid_encodings, num_encodings) { + const unsigned hw_type = e & 0x7; + const unsigned exec_type = e >> 3; + + for (unsigned i = 0; i < 2; i++) { + if (i == 0) { + brw_MAD(p, g0, g0, g0, g0); + brw_inst_set_3src_a1_exec_type(&devinfo, last_inst, BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT); + } else { + brw_CSEL(p, g0, g0, g0, g0); + brw_inst_set_3src_cond_modifier(&devinfo, last_inst, BRW_CONDITIONAL_NZ); + brw_inst_set_3src_a1_exec_type(&devinfo, last_inst, BRW_ALIGN1_3SRC_EXEC_TYPE_INT); + } + + brw_inst_set_3src_a1_exec_type(&devinfo, last_inst, exec_type); + brw_inst_set_3src_a1_dst_hw_type (&devinfo, last_inst, hw_type); + brw_inst_set_3src_a1_src0_hw_type(&devinfo, last_inst, hw_type); + brw_inst_set_3src_a1_src1_hw_type(&devinfo, last_inst, hw_type); + brw_inst_set_3src_a1_src2_hw_type(&devinfo, last_inst, hw_type); + + EXPECT_FALSE(validate(p)); + + clear_instructions(p); + } + } +} + +TEST_P(validation_test, 3src_inst_access_mode) +{ + /* 3-src instructions only supported on Gen6+ */ + if (devinfo.gen < 6) + return; + + /* No access mode bit on Gen12+ */ + if (devinfo.gen >= 12) + return; + + const struct { + unsigned mode; + bool expected_result; + } test_case[] = { + { BRW_ALIGN_1, devinfo.gen >= 10 }, + { BRW_ALIGN_16, devinfo.gen <= 10 }, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(test_case); i++) { + if (devinfo.gen < 10) + brw_set_default_access_mode(p, BRW_ALIGN_16); + + brw_MAD(p, g0, g0, g0, g0); + brw_inst_set_access_mode(&devinfo, last_inst, test_case[i].mode); + + EXPECT_EQ(test_case[i].expected_result, validate(p)); + + clear_instructions(p); + } +} + /* When the Execution Data Type is wider than the destination data type, the * destination must [...] specify a HorzStride equal to the ratio in sizes of * the two data types. @@ -450,7 +844,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(vstride); i++) { brw_ADD(p, g0, g0, g0); brw_inst_set_src0_vstride(&devinfo, last_inst, vstride[i].vstride); @@ -459,7 +853,7 @@ clear_instructions(p); } - for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(vstride); i++) { brw_ADD(p, g0, g0, g0); brw_inst_set_src1_vstride(&devinfo, last_inst, vstride[i].vstride); @@ -794,7 +1188,7 @@ { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_D , 0, 0, 0, false }, }; - for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { brw_MOV(p, retype(g0, move[i].dst_type), retype(g0, move[i].src_type)); brw_inst_set_src0_negate(&devinfo, last_inst, move[i].neg); brw_inst_set_src0_abs(&devinfo, last_inst, move[i].abs); @@ -891,8 +1285,14 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && type_sz(inst[i].src_type) == 8) + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + inst[i].src_type == BRW_REGISTER_TYPE_DF) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; brw_MOV(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src_type)); @@ -987,11 +1387,18 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && - (type_sz(inst[i].src_type) == 8 || type_sz(inst[i].dst_type) == 8)) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + (inst[i].dst_type == BRW_REGISTER_TYPE_DF || + inst[i].src_type == BRW_REGISTER_TYPE_DF)) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].dst_type == BRW_REGISTER_TYPE_Q || + inst[i].dst_type == BRW_REGISTER_TYPE_UQ || + inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; - } brw_MOV(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src_type)); @@ -1067,7 +1474,7 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1121,7 +1528,7 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1188,7 +1595,7 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1264,7 +1671,7 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { if (inst[i].opcode == BRW_OPCODE_MAC) { brw_MAC(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), @@ -1331,7 +1738,7 @@ if (devinfo.gen < 9) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { gen6_math(p, retype(g0, inst[i].dst_type), BRW_MATH_FUNCTION_POW, retype(g0, inst[i].src0_type), @@ -1406,7 +1813,7 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1477,7 +1884,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1528,7 +1935,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1579,7 +1986,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { brw_ADD(p, retype(g0, inst[i].dst_type), retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type), retype(g0, inst[i].src1_type)); @@ -1634,7 +2041,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { gen6_math(p, retype(g0, inst[i].dst_type), BRW_MATH_FUNCTION_POW, retype(g0, inst[i].src0_type), @@ -1671,7 +2078,7 @@ { BRW_REGISTER_TYPE_W, BRW_REGISTER_TYPE_UV, 1, BRW_EXECUTE_8, false }, }; - for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { /* UV type is Gen6+ */ if (devinfo.gen < 6 && move[i].src_type == BRW_REGISTER_TYPE_UV) @@ -1713,7 +2120,7 @@ { BRW_REGISTER_TYPE_B, BRW_REGISTER_TYPE_UV, BRW_HORIZONTAL_STRIDE_2, true }, }; - for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { /* UV type is Gen6+ */ if (devinfo.gen < 6 && move[i].src_type == BRW_REGISTER_TYPE_UV) @@ -1869,9 +2276,21 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && - (type_sz(inst[i].dst_type) == 8 || type_sz(inst[i].src_type) == 8)) + /* NoDDChk/NoDDClr does not exist on Gen12+ */ + if (devinfo.gen >= 12) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + (inst[i].dst_type == BRW_REGISTER_TYPE_DF || + inst[i].src_type == BRW_REGISTER_TYPE_DF)) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].dst_type == BRW_REGISTER_TYPE_Q || + inst[i].dst_type == BRW_REGISTER_TYPE_UQ || + inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; if (inst[i].opcode == BRW_OPCODE_MOV) { @@ -1993,9 +2412,17 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && - (type_sz(inst[i].dst_type) == 8 || type_sz(inst[i].src_type) == 8)) + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + (inst[i].dst_type == BRW_REGISTER_TYPE_DF || + inst[i].src_type == BRW_REGISTER_TYPE_DF)) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].dst_type == BRW_REGISTER_TYPE_Q || + inst[i].dst_type == BRW_REGISTER_TYPE_UQ || + inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; if (inst[i].opcode == BRW_OPCODE_MOV) { @@ -2133,9 +2560,17 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && - (type_sz(inst[i].dst_type) == 8 || type_sz(inst[i].src_type) == 8)) + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + (inst[i].dst_type == BRW_REGISTER_TYPE_DF || + inst[i].src_type == BRW_REGISTER_TYPE_DF)) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].dst_type == BRW_REGISTER_TYPE_Q || + inst[i].dst_type == BRW_REGISTER_TYPE_UQ || + inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; if (inst[i].opcode == BRW_OPCODE_MOV) { @@ -2166,7 +2601,7 @@ clear_instructions(p); } - if (!devinfo.has_64bit_types) + if (!devinfo.has_64bit_float) return; /* MAC implicitly reads the accumulator */ @@ -2236,7 +2671,7 @@ brw_set_default_access_mode(p, BRW_ALIGN_16); - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { if (inst[i].opcode == BRW_OPCODE_MOV) { brw_MOV(p, retype(g0, inst[i].dst_type), retype(g0, inst[i].src_type)); @@ -2338,9 +2773,21 @@ if (devinfo.gen < 8) return; - for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) { - if (!devinfo.has_64bit_types && - (type_sz(inst[i].dst_type) == 8 || type_sz(inst[i].src_type) == 8)) + /* NoDDChk/NoDDClr does not exist on Gen12+ */ + if (devinfo.gen >= 12) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(inst); i++) { + if (!devinfo.has_64bit_float && + (inst[i].dst_type == BRW_REGISTER_TYPE_DF || + inst[i].src_type == BRW_REGISTER_TYPE_DF)) + continue; + + if (!devinfo.has_64bit_int && + (inst[i].dst_type == BRW_REGISTER_TYPE_Q || + inst[i].dst_type == BRW_REGISTER_TYPE_UQ || + inst[i].src_type == BRW_REGISTER_TYPE_Q || + inst[i].src_type == BRW_REGISTER_TYPE_UQ)) continue; if (inst[i].opcode == BRW_OPCODE_MOV) { diff -Nru mesa-19.2.8/src/intel/compiler/test_fs_cmod_propagation.cpp mesa-20.0.8/src/intel/compiler/test_fs_cmod_propagation.cpp --- mesa-19.2.8/src/intel/compiler/test_fs_cmod_propagation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_fs_cmod_propagation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -57,8 +57,7 @@ struct brw_wm_prog_data *prog_data, nir_shader *shader) : fs_visitor(compiler, NULL, NULL, NULL, - &prog_data->base, (struct gl_program *) NULL, - shader, 8, -1) {} + &prog_data->base, shader, 8, -1) {} }; @@ -647,6 +646,281 @@ EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); } +TEST_F(cmod_propagation_test, cmp_cmpnz) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + * 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f + * + * = After = + * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_cmpg) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_G); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + * 1: cmp.g.f0.0(8) null:F, vgrf0:F, 0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_G, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, plnnz_cmpnz) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0)); + + set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero)); + bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_NZ); + + /* = Before = + * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + * 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f + * + * = After = + * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, plnnz_cmpz) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0)); + + set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero)); + bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_Z); + + /* = Before = + * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f + * 1: cmp.z.f0.0(8) null:F, vgrf0:F, 0f + * + * = After = + * 0: pln.z.f0.0(8) vgrf0:F, vgrf1:F, 0f + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, plnnz_sel_cmpz) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::float_type); + fs_reg dst1 = v->vgrf(glsl_type::float_type); + fs_reg src0 = v->vgrf(glsl_type::float_type); + fs_reg zero(brw_imm_f(0)); + + set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero)); + set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst1, src0, zero)); + bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_Z); + + /* = Before = + * 0: pln.nz.f0.0(8) vgrf0:F, vgrf2:F, 0f + * 1: (+f0.0) sel(8) vgrf1:F, vgrf2:F, 0f + * 2: cmp.z.f0.0(8) null:F, vgrf0:F, 0f + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(2, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 2)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_cmpg_D) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + fs_reg zero(brw_imm_d(0)); + fs_reg one(brw_imm_d(1)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_d(), dst0, zero, BRW_CONDITIONAL_G); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d + * 1: cmp.g.f0.0(8) null:D, vgrf0:D, 0d + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_G, instruction(block0, 1)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_cmpg_UD) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::uint_type); + fs_reg src0 = v->vgrf(glsl_type::uint_type); + fs_reg zero(brw_imm_ud(0)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_ud(), dst0, zero, BRW_CONDITIONAL_G); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u + * 1: cmp.g.f0.0(8) null:UD, vgrf0:UD, 0u + * + * = After = + * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_cmpl_D) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + fs_reg zero(brw_imm_d(0)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_d(), dst0, zero, BRW_CONDITIONAL_L); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d + * 1: cmp.l.f0.0(8) null:D, vgrf0:D, 0d + * + * = After = + * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_TRUE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(0, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); +} + +TEST_F(cmod_propagation_test, cmp_cmpl_UD) +{ + const fs_builder &bld = v->bld; + + fs_reg dst0 = v->vgrf(glsl_type::uint_type); + fs_reg src0 = v->vgrf(glsl_type::uint_type); + fs_reg zero(brw_imm_ud(0)); + + bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ); + bld.CMP(bld.null_reg_ud(), dst0, zero, BRW_CONDITIONAL_L); + + /* = Before = + * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u + * 1: cmp.l.f0.0(8) null:UD, vgrf0:UD, 0u + * + * = After = + * (no changes) + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod); +} + TEST_F(cmod_propagation_test, andz_one) { const fs_builder &bld = v->bld; diff -Nru mesa-19.2.8/src/intel/compiler/test_fs_copy_propagation.cpp mesa-20.0.8/src/intel/compiler/test_fs_copy_propagation.cpp --- mesa-19.2.8/src/intel/compiler/test_fs_copy_propagation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_fs_copy_propagation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -47,8 +47,7 @@ struct brw_wm_prog_data *prog_data, nir_shader *shader) : fs_visitor(compiler, NULL, NULL, NULL, - &prog_data->base, (struct gl_program *) NULL, - shader, 8, -1) {} + &prog_data->base, shader, 8, -1) {} }; diff -Nru mesa-19.2.8/src/intel/compiler/test_fs_saturate_propagation.cpp mesa-20.0.8/src/intel/compiler/test_fs_saturate_propagation.cpp --- mesa-19.2.8/src/intel/compiler/test_fs_saturate_propagation.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_fs_saturate_propagation.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -47,8 +47,7 @@ struct brw_wm_prog_data *prog_data, nir_shader *shader) : fs_visitor(compiler, NULL, NULL, NULL, - &prog_data->base, (struct gl_program *) NULL, - shader, 16, -1) {} + &prog_data->base, shader, 16, -1) {} }; diff -Nru mesa-19.2.8/src/intel/compiler/test_fs_scoreboard.cpp mesa-20.0.8/src/intel/compiler/test_fs_scoreboard.cpp --- mesa-19.2.8/src/intel/compiler/test_fs_scoreboard.cpp 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/compiler/test_fs_scoreboard.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,863 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "brw_fs.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class scoreboard_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct brw_wm_prog_data *prog_data; + struct gl_shader_program *shader_prog; + fs_visitor *v; +}; + +void scoreboard_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + prog_data = ralloc(NULL, struct brw_wm_prog_data); + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL); + + v = new fs_visitor(compiler, NULL, NULL, NULL, &prog_data->base, shader, 8, -1); + + devinfo->gen = 12; +} + +static fs_inst * +instruction(bblock_t *block, int num) +{ + fs_inst *inst = (fs_inst *)block->start(); + for (int i = 0; i < num; i++) { + inst = (fs_inst *)inst->next; + } + return inst; +} + +static void +lower_scoreboard(fs_visitor *v) +{ + const bool print = getenv("TEST_DEBUG"); + + if (print) { + fprintf(stderr, "= Before =\n"); + v->cfg->dump(v); + } + + v->lower_scoreboard(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->cfg->dump(v); + } +} + +fs_inst * +emit_SEND(const fs_builder &bld, const fs_reg &dst, + const fs_reg &desc, const fs_reg &payload) +{ + fs_inst *inst = bld.emit(SHADER_OPCODE_SEND, dst, desc, desc, payload); + inst->mlen = 1; + return inst; +} + +bool operator ==(const tgl_swsb &a, const tgl_swsb &b) +{ + return a.mode == b.mode && + a.regdist == b.regdist && + (a.mode == TGL_SBID_NULL || a.sbid == b.sbid); +} + +std::ostream &operator<<(std::ostream &os, const tgl_swsb &swsb) { + if (swsb.regdist) + os << "@" << swsb.regdist; + + if (swsb.mode) { + if (swsb.regdist) + os << " "; + os << "$" << swsb.sbid; + if (swsb.mode & TGL_SBID_DST) + os << ".dst"; + if (swsb.mode & TGL_SBID_SRC) + os << ".src"; + } + + return os; +} + +TEST_F(scoreboard_test, RAW_inorder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + fs_reg y = v->vgrf(glsl_type::int_type); + bld.ADD( x, g[1], g[2]); + bld.MUL( y, g[3], g[4]); + bld.AND(g[5], x, y); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_regdist(1)); +} + +TEST_F(scoreboard_test, RAW_inorder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.ADD( x, g[1], g[2]); + bld.MUL( g[3], g[4], g[5]); + emit_SEND(bld, g[6], g[7], x); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, + (tgl_swsb { .regdist = 2, .sbid = 0, .mode = TGL_SBID_SET })); +} + +TEST_F(scoreboard_test, RAW_outoforder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + fs_reg y = v->vgrf(glsl_type::int_type); + emit_SEND(bld, x, g[1], g[2]); + bld.MUL( y, g[3], g[4]); + bld.AND( g[5], x, y); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, + (tgl_swsb { .regdist = 1, .sbid = 0, .mode = TGL_SBID_DST })); +} + +TEST_F(scoreboard_test, RAW_outoforder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + /* The second SEND depends on the first, and would need to refer to two + * SBIDs. Since it is not possible we expect a SYNC instruction to be + * added. + */ + fs_reg x = v->vgrf(glsl_type::int_type); + emit_SEND(bld, x, g[1], g[2]); + emit_SEND(bld, g[3], x, g[4])->sfid++; + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + + fs_inst *sync = instruction(block0, 1); + EXPECT_EQ(sync->opcode, BRW_OPCODE_SYNC); + EXPECT_EQ(sync->sched, tgl_swsb_sbid(TGL_SBID_DST, 0)); + + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_sbid(TGL_SBID_SET, 1)); +} + +TEST_F(scoreboard_test, WAR_inorder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.ADD(g[1], x, g[2]); + bld.MUL(g[3], g[4], g[5]); + bld.AND( x, g[6], g[7]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_null()); +} + +TEST_F(scoreboard_test, WAR_inorder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.ADD( g[1], x, g[2]); + bld.MUL( g[3], g[4], g[5]); + emit_SEND(bld, x, g[6], g[7]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, + (tgl_swsb { .regdist = 2, .sbid = 0, .mode = TGL_SBID_SET })); +} + +TEST_F(scoreboard_test, WAR_outoforder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + emit_SEND(bld, g[1], g[2], x); + bld.MUL( g[4], g[5], g[6]); + bld.AND( x, g[7], g[8]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_sbid(TGL_SBID_SRC, 0)); +} + +TEST_F(scoreboard_test, WAR_outoforder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + emit_SEND(bld, g[1], g[2], x); + emit_SEND(bld, x, g[3], g[4])->sfid++; + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + + fs_inst *sync = instruction(block0, 1); + EXPECT_EQ(sync->opcode, BRW_OPCODE_SYNC); + EXPECT_EQ(sync->sched, tgl_swsb_sbid(TGL_SBID_SRC, 0)); + + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_sbid(TGL_SBID_SET, 1)); +} + +TEST_F(scoreboard_test, WAW_inorder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.ADD( x, g[1], g[2]); + bld.MUL(g[3], g[4], g[5]); + bld.AND( x, g[6], g[7]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + + /* NOTE: We only need this RegDist if a long instruction is followed by a + * short one. The pass is currently conservative about this and adding the + * annotation. + */ + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, WAW_inorder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.ADD( x, g[1], g[2]); + bld.MUL( g[3], g[4], g[5]); + emit_SEND(bld, x, g[6], g[7]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, + (tgl_swsb { .regdist = 2, .sbid = 0, .mode = TGL_SBID_SET })); +} + +TEST_F(scoreboard_test, WAW_outoforder_inorder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + emit_SEND(bld, x, g[1], g[2]); + bld.MUL( g[3], g[4], g[5]); + bld.AND( x, g[6], g[7]); + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + EXPECT_EQ(instruction(block0, 1)->sched, tgl_swsb_null()); + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_sbid(TGL_SBID_DST, 0)); +} + +TEST_F(scoreboard_test, WAW_outoforder_outoforder) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + emit_SEND(bld, x, g[1], g[2]); + emit_SEND(bld, x, g[3], g[4])->sfid++; + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(1, block0->end_ip); + + lower_scoreboard(v); + ASSERT_EQ(0, block0->start_ip); + ASSERT_EQ(2, block0->end_ip); + + EXPECT_EQ(instruction(block0, 0)->sched, tgl_swsb_sbid(TGL_SBID_SET, 0)); + + fs_inst *sync = instruction(block0, 1); + EXPECT_EQ(sync->opcode, BRW_OPCODE_SYNC); + EXPECT_EQ(sync->sched, tgl_swsb_sbid(TGL_SBID_DST, 0)); + + EXPECT_EQ(instruction(block0, 2)->sched, tgl_swsb_sbid(TGL_SBID_SET, 1)); +} + + +TEST_F(scoreboard_test, loop1) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_DO); + + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_WHILE)->predicate = BRW_PREDICATE_NORMAL; + + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[2]; + fs_inst *add = instruction(body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(1)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 0); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(1)); +} + +TEST_F(scoreboard_test, loop2) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + + bld.emit(BRW_OPCODE_DO); + + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_WHILE)->predicate = BRW_PREDICATE_NORMAL; + + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + /* Now the write in ADD has the tightest RegDist for both ADD and MUL. */ + + bblock_t *body = v->cfg->blocks[2]; + fs_inst *add = instruction(body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 0); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, loop3) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_DO); + + /* For the ADD in the loop body this extra distance will always apply. */ + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.XOR(g[6], g[1], g[2]); + + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_WHILE)->predicate = BRW_PREDICATE_NORMAL; + + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[2]; + fs_inst *add = instruction(body, 4); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(5)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 0); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(1)); +} + + +TEST_F(scoreboard_test, conditional1) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[1]; + fs_inst *add = instruction(body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[2]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, conditional2) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[1]; + fs_inst *add = instruction(body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(5)); + + bblock_t *last_block = v->cfg->blocks[2]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, conditional3) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.ADD( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[1]; + fs_inst *add = instruction(body, 3); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(5)); + + bblock_t *last_block = v->cfg->blocks[2]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, conditional4) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *body = v->cfg->blocks[1]; + fs_inst *add = instruction(body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[2]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(3)); +} + +TEST_F(scoreboard_test, conditional5) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_ELSE); + + bld.ROL( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *then_body = v->cfg->blocks[1]; + fs_inst *add = instruction(then_body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(2)); + + bblock_t *else_body = v->cfg->blocks[2]; + fs_inst *rol = instruction(else_body, 0); + EXPECT_EQ(rol->opcode, BRW_OPCODE_ROL); + EXPECT_EQ(rol->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, conditional6) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_ELSE); + + bld.XOR(g[6], g[1], g[2]); + bld.XOR(g[7], g[1], g[2]); + bld.XOR(g[8], g[1], g[2]); + bld.XOR(g[9], g[1], g[2]); + bld.ROL( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *then_body = v->cfg->blocks[1]; + fs_inst *add = instruction(then_body, 3); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(5)); + + bblock_t *else_body = v->cfg->blocks[2]; + fs_inst *rol = instruction(else_body, 4); + EXPECT_EQ(rol->opcode, BRW_OPCODE_ROL); + EXPECT_EQ(rol->sched, tgl_swsb_regdist(6)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} + +TEST_F(scoreboard_test, conditional7) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.emit(BRW_OPCODE_ELSE); + + bld.ROL( x, g[1], g[2]); + bld.XOR(g[6], g[1], g[2]); + bld.XOR(g[7], g[1], g[2]); + bld.XOR(g[8], g[1], g[2]); + bld.XOR(g[9], g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *then_body = v->cfg->blocks[1]; + fs_inst *add = instruction(then_body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(2)); + + bblock_t *else_body = v->cfg->blocks[2]; + fs_inst *rol = instruction(else_body, 0); + EXPECT_EQ(rol->opcode, BRW_OPCODE_ROL); + EXPECT_EQ(rol->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(6)); +} + +TEST_F(scoreboard_test, conditional8) +{ + const fs_builder &bld = v->bld; + fs_reg g[16]; + for (unsigned i = 0; i < ARRAY_SIZE(g); i++) + g[i] = v->vgrf(glsl_type::int_type); + + fs_reg x = v->vgrf(glsl_type::int_type); + bld.XOR( x, g[1], g[2]); + bld.XOR(g[3], g[1], g[2]); + bld.XOR(g[4], g[1], g[2]); + bld.XOR(g[5], g[1], g[2]); + bld.XOR(g[6], g[1], g[2]); + bld.XOR(g[7], g[1], g[2]); + bld.emit(BRW_OPCODE_IF); + + bld.ADD( x, g[1], g[2]); + bld.emit(BRW_OPCODE_ELSE); + + bld.ROL( x, g[1], g[2]); + + bld.emit(BRW_OPCODE_ENDIF); + bld.MUL( x, g[1], g[2]); + + v->calculate_cfg(); + lower_scoreboard(v); + + bblock_t *then_body = v->cfg->blocks[1]; + fs_inst *add = instruction(then_body, 0); + EXPECT_EQ(add->opcode, BRW_OPCODE_ADD); + EXPECT_EQ(add->sched, tgl_swsb_regdist(7)); + + /* Note that the ROL will have RegDist 2 and not 7, illustrating the + * physical CFG edge between the then-block and the else-block. + */ + bblock_t *else_body = v->cfg->blocks[2]; + fs_inst *rol = instruction(else_body, 0); + EXPECT_EQ(rol->opcode, BRW_OPCODE_ROL); + EXPECT_EQ(rol->sched, tgl_swsb_regdist(2)); + + bblock_t *last_block = v->cfg->blocks[3]; + fs_inst *mul = instruction(last_block, 1); + EXPECT_EQ(mul->opcode, BRW_OPCODE_MUL); + EXPECT_EQ(mul->sched, tgl_swsb_regdist(2)); +} diff -Nru mesa-19.2.8/src/intel/dev/gen_debug.c mesa-20.0.8/src/intel/dev/gen_debug.c --- mesa-19.2.8/src/intel/dev/gen_debug.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/dev/gen_debug.c 2020-06-12 01:21:17.000000000 +0000 @@ -89,6 +89,7 @@ { "tcs8", DEBUG_TCS_EIGHT_PATCH }, { "bt", DEBUG_BT }, { "pc", DEBUG_PIPE_CONTROL }, + { "nofc", DEBUG_NO_FAST_CLEAR }, { NULL, 0 } }; diff -Nru mesa-19.2.8/src/intel/dev/gen_debug.h mesa-20.0.8/src/intel/dev/gen_debug.h --- mesa-19.2.8/src/intel/dev/gen_debug.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/dev/gen_debug.h 2020-06-12 01:21:17.000000000 +0000 @@ -87,6 +87,7 @@ #define DEBUG_TCS_EIGHT_PATCH (1ull << 43) #define DEBUG_BT (1ull << 44) #define DEBUG_PIPE_CONTROL (1ull << 45) +#define DEBUG_NO_FAST_CLEAR (1ull << 46) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME diff -Nru mesa-19.2.8/src/intel/dev/gen_device_info.c mesa-20.0.8/src/intel/dev/gen_device_info.c --- mesa-19.2.8/src/intel/dev/gen_device_info.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/dev/gen_device_info.c 2020-06-12 01:21:17.000000000 +0000 @@ -66,6 +66,9 @@ { "cml", 0x9b41 }, { "cnl", 0x5a52 }, { "icl", 0x8a52 }, + { "ehl", 0x4500 }, + { "jsl", 0x4E71 }, + { "tgl", 0x9a49 }, }; for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) { @@ -216,7 +219,7 @@ .must_use_separate_stencil = true, \ .has_llc = true, \ .has_pln = true, \ - .has_64bit_types = true, \ + .has_64bit_float = true, \ .has_surface_tile_offset = true, \ .timestamp_frequency = 12500000 @@ -414,7 +417,8 @@ .has_sample_with_hiz = false, \ .has_pln = true, \ .has_integer_dword_mul = true, \ - .has_64bit_types = true, \ + .has_64bit_float = true, \ + .has_64bit_int = true, \ .supports_simd16_3src = true, \ .has_surface_tile_offset = true, \ .num_thread_per_eu = 7, \ @@ -430,7 +434,7 @@ .is_broadwell = true, .num_slices = 1, .num_subslices = { 2, }, - .num_eu_per_subslice = 8, + .num_eu_per_subslice = 6, .l3_banks = 2, .max_cs_threads = 42, .urb = { @@ -443,7 +447,8 @@ [MESA_SHADER_VERTEX] = 2560, [MESA_SHADER_TESS_CTRL] = 504, [MESA_SHADER_TESS_EVAL] = 1536, - [MESA_SHADER_GEOMETRY] = 960, + /* Reduced from 960, seems to be similar to the bug on Gen9 GT1. */ + [MESA_SHADER_GEOMETRY] = 690, }, }, .simulator_id = 11, @@ -850,28 +855,28 @@ .num_eu_per_subslice = 8, \ .l3_banks = _l3 -static const struct gen_device_info gen_device_info_cnl_2x8 = { +static const struct gen_device_info gen_device_info_cnl_gt0_5 = { /* GT0.5 */ GEN10_FEATURES(1, 1, subslices(2), 2), .is_cannonlake = true, .simulator_id = 15, }; -static const struct gen_device_info gen_device_info_cnl_3x8 = { +static const struct gen_device_info gen_device_info_cnl_gt1 = { /* GT1 */ GEN10_FEATURES(1, 1, subslices(3), 3), .is_cannonlake = true, .simulator_id = 15, }; -static const struct gen_device_info gen_device_info_cnl_4x8 = { +static const struct gen_device_info gen_device_info_cnl_gt1_5 = { /* GT 1.5 */ GEN10_FEATURES(1, 2, subslices(2, 2), 6), .is_cannonlake = true, .simulator_id = 15, }; -static const struct gen_device_info gen_device_info_cnl_5x8 = { +static const struct gen_device_info gen_device_info_cnl_gt2 = { /* GT2 */ GEN10_FEATURES(2, 2, subslices(3, 2), 6), .is_cannonlake = true, @@ -890,7 +895,8 @@ #define GEN11_FEATURES(_gt, _slices, _subslices, _l3) \ GEN8_FEATURES, \ GEN11_HW_INFO, \ - .has_64bit_types = false, \ + .has_64bit_float = false, \ + .has_64bit_int = false, \ .has_integer_dword_mul = false, \ .has_sample_with_hiz = false, \ .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \ @@ -909,7 +915,7 @@ [MESA_SHADER_GEOMETRY] = 1032, \ } -static const struct gen_device_info gen_device_info_icl_8x8 = { +static const struct gen_device_info gen_device_info_icl_gt2 = { GEN11_FEATURES(2, 1, subslices(8), 8), .urb = { .size = 1024, @@ -918,7 +924,7 @@ .simulator_id = 19, }; -static const struct gen_device_info gen_device_info_icl_6x8 = { +static const struct gen_device_info gen_device_info_icl_gt1_5 = { GEN11_FEATURES(1, 1, subslices(6), 6), .urb = { .size = 768, @@ -927,7 +933,7 @@ .simulator_id = 19, }; -static const struct gen_device_info gen_device_info_icl_4x8 = { +static const struct gen_device_info gen_device_info_icl_gt1 = { GEN11_FEATURES(1, 1, subslices(4), 6), .urb = { .size = 768, @@ -936,7 +942,7 @@ .simulator_id = 19, }; -static const struct gen_device_info gen_device_info_icl_1x8 = { +static const struct gen_device_info gen_device_info_icl_gt0_5 = { GEN11_FEATURES(1, 1, subslices(1), 6), .urb = { .size = 768, @@ -945,8 +951,9 @@ .simulator_id = 19, }; -static const struct gen_device_info gen_device_info_ehl_4x8 = { +static const struct gen_device_info gen_device_info_ehl_7 = { GEN11_FEATURES(1, 1, subslices(4), 4), + .is_elkhartlake = true, .urb = { .size = 512, .min_entries = { @@ -964,10 +971,30 @@ .simulator_id = 28, }; -/* FIXME: Verfiy below entries when more information is available for this SKU. - */ -static const struct gen_device_info gen_device_info_ehl_4x4 = { +static const struct gen_device_info gen_device_info_ehl_6 = { + GEN11_FEATURES(1, 1, subslices(4), 4), + .is_elkhartlake = true, + .urb = { + .size = 512, + .min_entries = { + [MESA_SHADER_VERTEX] = 64, + [MESA_SHADER_TESS_EVAL] = 34, + }, + .max_entries = { + [MESA_SHADER_VERTEX] = 2384, + [MESA_SHADER_TESS_CTRL] = 1032, + [MESA_SHADER_TESS_EVAL] = 2384, + [MESA_SHADER_GEOMETRY] = 1032, + }, + }, + .disable_ccs_repack = true, + .num_eu_per_subslice = 6, + .simulator_id = 28, +}; + +static const struct gen_device_info gen_device_info_ehl_5 = { GEN11_FEATURES(1, 1, subslices(4), 4), + .is_elkhartlake = true, .urb = { .size = 512, .min_entries = { @@ -986,10 +1013,9 @@ .simulator_id = 28, }; -/* FIXME: Verfiy below entries when more information is available for this SKU. - */ -static const struct gen_device_info gen_device_info_ehl_2x4 = { +static const struct gen_device_info gen_device_info_ehl_4 = { GEN11_FEATURES(1, 1, subslices(2), 4), + .is_elkhartlake = true, .urb = { .size = 512, .min_entries = { @@ -1008,6 +1034,54 @@ .simulator_id = 28, }; +#define GEN12_URB_MIN_MAX_ENTRIES \ + .min_entries = { \ + [MESA_SHADER_VERTEX] = 64, \ + [MESA_SHADER_TESS_EVAL] = 34, \ + }, \ + .max_entries = { \ + [MESA_SHADER_VERTEX] = 3576, \ + [MESA_SHADER_TESS_CTRL] = 1548, \ + [MESA_SHADER_TESS_EVAL] = 3576, \ + [MESA_SHADER_GEOMETRY] = 1548, \ + } + +#define GEN12_HW_INFO \ + .gen = 12, \ + .has_pln = false, \ + .has_sample_with_hiz = false, \ + .has_aux_map = true, \ + .max_vs_threads = 546, \ + .max_gs_threads = 336, \ + .max_tcs_threads = 336, \ + .max_tes_threads = 546, \ + .max_cs_threads = 112, /* threads per DSS */ \ + .urb = { \ + GEN12_URB_MIN_MAX_ENTRIES, \ + } + +#define GEN12_FEATURES(_gt, _slices, _dual_subslices, _l3) \ + GEN8_FEATURES, \ + GEN12_HW_INFO, \ + .has_64bit_float = false, \ + .has_64bit_int = false, \ + .has_integer_dword_mul = false, \ + .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \ + .simulator_id = 22, \ + .urb.size = (_gt) == 1 ? 512 : 1024, \ + .num_subslices = _dual_subslices, \ + .num_eu_per_subslice = 16 + +#define dual_subslices(args...) { args, } + +static const struct gen_device_info gen_device_info_tgl_gt1 = { + GEN12_FEATURES(1, 1, dual_subslices(2), 8), +}; + +static const struct gen_device_info gen_device_info_tgl_gt2 = { + GEN12_FEATURES(2, 1, dual_subslices(6), 8), +}; + static void gen_device_info_set_eu_mask(struct gen_device_info *devinfo, unsigned slice, @@ -1127,6 +1201,17 @@ } } + if (devinfo->gen == 12 && devinfo->num_slices == 1) { + if (n_subslices >= 6) { + assert(n_subslices == 6); + devinfo->l3_banks = 8; + } else if (n_subslices > 2) { + devinfo->l3_banks = 6; + } else { + devinfo->l3_banks = 4; + } + } + uint32_t eu_mask_len = topology->eu_stride * topology->max_subslices * topology->max_slices; assert(sizeof(devinfo->eu_masks) >= eu_mask_len); @@ -1223,9 +1308,10 @@ { switch (pci_id) { #undef CHIPSET -#define CHIPSET(id, family, name) \ +#define CHIPSET(id, family, fam_str, name) \ case id: *devinfo = gen_device_info_##family; break; #include "pci_ids/i965_pci_ids.h" +#include "pci_ids/iris_pci_ids.h" default: fprintf(stderr, "Driver does not support the 0x%x PCI ID.\n", pci_id); return false; @@ -1255,11 +1341,13 @@ * 4; /* effective subslices per slice */ break; case 11: + case 12: devinfo->max_wm_threads = 128 /* threads-per-PSD */ * devinfo->num_slices * 8; /* subslices per slice */ break; default: + assert(devinfo->gen < 9); break; } @@ -1274,8 +1362,9 @@ { switch (devid) { #undef CHIPSET -#define CHIPSET(id, family, name) case id: return name; +#define CHIPSET(id, family, fam_str, name) case id: return name " (" fam_str ")"; break; #include "pci_ids/i965_pci_ids.h" +#include "pci_ids/iris_pci_ids.h" default: return NULL; } @@ -1369,7 +1458,7 @@ return false; if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision)) - return false; + devinfo->revision = 0; if (!query_topology(devinfo, fd)) { if (devinfo->gen >= 10) { diff -Nru mesa-19.2.8/src/intel/dev/gen_device_info.h mesa-20.0.8/src/intel/dev/gen_device_info.h --- mesa-19.2.8/src/intel/dev/gen_device_info.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/dev/gen_device_info.h 2020-06-12 01:21:17.000000000 +0000 @@ -62,6 +62,7 @@ bool is_geminilake; bool is_coffeelake; bool is_cannonlake; + bool is_elkhartlake; bool has_hiz_and_separate_stencil; bool must_use_separate_stencil; @@ -69,13 +70,15 @@ bool has_llc; bool has_pln; - bool has_64bit_types; + bool has_64bit_float; + bool has_64bit_int; bool has_integer_dword_mul; bool has_compr4; bool has_surface_tile_offset; bool supports_simd16_3src; bool has_resource_streamer; bool disable_ccs_repack; + bool has_aux_map; /** * \name Intel hardware quirks diff -Nru mesa-19.2.8/src/intel/dev/gen_device_info_test.c mesa-20.0.8/src/intel/dev/gen_device_info_test.c --- mesa-19.2.8/src/intel/dev/gen_device_info_test.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/dev/gen_device_info_test.c 2020-06-12 01:21:17.000000000 +0000 @@ -13,8 +13,9 @@ const char *name; } chipsets[] = { #undef CHIPSET -#define CHIPSET(id, family, str_name) { .pci_id = id, .name = str_name, }, +#define CHIPSET(id, family, family_str, str_name) { .pci_id = id, .name = str_name, }, #include "pci_ids/i965_pci_ids.h" +#include "pci_ids/iris_pci_ids.h" }; for (uint32_t i = 0; i < ARRAY_SIZE(chipsets); i++) { diff -Nru mesa-19.2.8/src/intel/genxml/gen10.xml mesa-20.0.8/src/intel/genxml/gen10.xml --- mesa-19.2.8/src/intel/genxml/gen10.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen10.xml 2020-06-12 01:21:17.000000000 +0000 @@ -719,7 +719,6 @@ - @@ -1101,7 +1100,7 @@ - + @@ -6793,6 +6792,24 @@ + + + + + + + + + + + + + + + + + + @@ -6815,6 +6832,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen11.xml mesa-20.0.8/src/intel/genxml/gen11.xml --- mesa-19.2.8/src/intel/genxml/gen11.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen11.xml 2020-06-12 01:21:17.000000000 +0000 @@ -706,7 +706,7 @@ - + @@ -735,7 +735,6 @@ - @@ -1131,10 +1130,18 @@ + + + + + + + + - + @@ -1250,21 +1257,19 @@ - - - - - - - - - - - - + + + + + + + + + + + @@ -2499,14 +2504,6 @@ - - - - - - - - @@ -6240,6 +6237,7 @@ + @@ -6916,6 +6914,11 @@ + + + + + @@ -6994,7 +6997,6 @@ - @@ -7003,6 +7005,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -7025,6 +7052,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen12.xml mesa-20.0.8/src/intel/genxml/gen12.xml --- mesa-19.2.8/src/intel/genxml/gen12.xml 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen12.xml 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,7312 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen6.xml mesa-20.0.8/src/intel/genxml/gen6.xml --- mesa-19.2.8/src/intel/genxml/gen6.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen6.xml 2020-06-12 01:21:17.000000000 +0000 @@ -2890,6 +2890,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen75.xml mesa-20.0.8/src/intel/genxml/gen75.xml --- mesa-19.2.8/src/intel/genxml/gen75.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen75.xml 2020-06-12 01:21:17.000000000 +0000 @@ -668,7 +668,7 @@ - + @@ -4260,6 +4260,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen7.xml mesa-20.0.8/src/intel/genxml/gen7.xml --- mesa-19.2.8/src/intel/genxml/gen7.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen7.xml 2020-06-12 01:21:17.000000000 +0000 @@ -563,7 +563,7 @@ - + @@ -3846,6 +3846,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen8.xml mesa-20.0.8/src/intel/genxml/gen8.xml --- mesa-19.2.8/src/intel/genxml/gen8.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen8.xml 2020-06-12 01:21:17.000000000 +0000 @@ -751,7 +751,7 @@ - + @@ -4757,6 +4757,24 @@ + + + + + + + + + + + + + + + + + + @@ -4792,6 +4810,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen9.xml mesa-20.0.8/src/intel/genxml/gen9.xml --- mesa-19.2.8/src/intel/genxml/gen9.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen9.xml 2020-06-12 01:21:17.000000000 +0000 @@ -1033,7 +1033,7 @@ - + @@ -6477,23 +6477,6 @@ - - - - - - - - - - - - - - - - - @@ -6576,6 +6559,23 @@ + + + + + + + + + + + + + + + + + @@ -6621,6 +6621,24 @@ + + + + + + + + + + + + + + + + + + @@ -6656,6 +6674,11 @@ + + + + + diff -Nru mesa-19.2.8/src/intel/genxml/gen_bits_header.py mesa-20.0.8/src/intel/genxml/gen_bits_header.py --- mesa-19.2.8/src/intel/genxml/gen_bits_header.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen_bits_header.py 2020-06-12 01:21:17.000000000 +0000 @@ -80,6 +80,7 @@ ${item.token_name}_${prop}(const struct gen_device_info *devinfo) { switch (devinfo->gen) { + case 12: return ${item.get_prop(prop, 12)}; case 11: return ${item.get_prop(prop, 11)}; case 10: return ${item.get_prop(prop, 10)}; case 9: return ${item.get_prop(prop, 9)}; @@ -134,7 +135,7 @@ alphanum_nono = re.compile(r'[ /\[\]()\-:.,=>#&*"+\\]+') def to_alphanum(name): global alphanum_nono - return alphanum_nono.sub('', name).replace('α', 'alpha') + return alphanum_nono.sub('', name) def safe_name(name): name = to_alphanum(name) @@ -182,12 +183,13 @@ self.length_by_gen[gen] = xml_attrs['length'] def get_field(self, field_name, create=False): - if field_name not in self.fields: + key = to_alphanum(field_name) + if key not in self.fields: if create: - self.fields[field_name] = Field(self, field_name) + self.fields[key] = Field(self, field_name) else: return None - return self.fields[field_name] + return self.fields[key] def has_prop(self, prop): if prop == 'length': diff -Nru mesa-19.2.8/src/intel/genxml/gen_macros.h mesa-20.0.8/src/intel/genxml/gen_macros.h --- mesa-19.2.8/src/intel/genxml/gen_macros.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen_macros.h 2020-06-12 01:21:17.000000000 +0000 @@ -91,6 +91,9 @@ #elif (GEN_VERSIONx10 == 110) # define GENX(X) GEN11_##X # define genX(x) gen11_##x +#elif (GEN_VERSIONx10 == 120) +# define GENX(X) GEN12_##X +# define genX(x) gen12_##x #else # error "Need to add prefixing macros for this gen" #endif diff -Nru mesa-19.2.8/src/intel/genxml/gen_pack_header.py mesa-20.0.8/src/intel/genxml/gen_pack_header.py --- mesa-19.2.8/src/intel/genxml/gen_pack_header.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen_pack_header.py 2020-06-12 01:21:17.000000000 +0000 @@ -69,13 +69,13 @@ uint32_t dw; }; -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_mbo(uint32_t start, uint32_t end) { return (~0ull >> (64 - (end - start + 1))) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_uint(uint64_t v, uint32_t start, NDEBUG_UNUSED uint32_t end) { __gen_validate_value(v); @@ -91,7 +91,7 @@ return v << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_sint(int64_t v, uint32_t start, uint32_t end) { const int width = end - start + 1; @@ -111,7 +111,7 @@ return (v & mask) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_offset(uint64_t v, NDEBUG_UNUSED uint32_t start, NDEBUG_UNUSED uint32_t end) { __gen_validate_value(v); @@ -124,14 +124,14 @@ return v; } -static inline uint32_t +static inline __attribute__((always_inline)) uint32_t __gen_float(float v) { __gen_validate_value(v); return ((union __gen_value) { .f = (v) }).dw; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_sfixed(float v, uint32_t start, uint32_t end, uint32_t fract_bits) { __gen_validate_value(v); @@ -150,7 +150,7 @@ return (int_val & mask) << start; } -static inline uint64_t +static inline __attribute__((always_inline)) uint64_t __gen_ufixed(float v, uint32_t start, NDEBUG_UNUSED uint32_t end, uint32_t fract_bits) { __gen_validate_value(v); @@ -197,7 +197,6 @@ '=': '', '>': '', '#': '', - 'α': 'alpha', '&': '', '*': '', '"': '', @@ -619,7 +618,7 @@ def emit_pack_function(self, name, group): name = self.gen_prefix(name) print(textwrap.dedent("""\ - static inline void + static inline __attribute__((always_inline)) void %s_pack(__attribute__((unused)) __gen_user_data *data, %s__attribute__((unused)) void * restrict dst, %s__attribute__((unused)) const struct %s * restrict values) diff -Nru mesa-19.2.8/src/intel/genxml/gen_sort_tags.py mesa-20.0.8/src/intel/genxml/gen_sort_tags.py --- mesa-19.2.8/src/intel/genxml/gen_sort_tags.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/gen_sort_tags.py 2020-06-12 01:21:17.000000000 +0000 @@ -167,7 +167,7 @@ for r in registers: r[:] = sorted(r.getchildren(), key=get_start) - genxml[:] = enums + sorted_structs.values() + instructions + registers + genxml[:] = enums + list(sorted_structs.values()) + instructions + registers print('') print_node(sys.stdout, 0, genxml) diff -Nru mesa-19.2.8/src/intel/genxml/genX_pack.h mesa-20.0.8/src/intel/genxml/genX_pack.h --- mesa-19.2.8/src/intel/genxml/genX_pack.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/genX_pack.h 2020-06-12 01:21:17.000000000 +0000 @@ -48,6 +48,8 @@ # include "genxml/gen10_pack.h" #elif (GEN_VERSIONx10 == 110) # include "genxml/gen11_pack.h" +#elif (GEN_VERSIONx10 == 120) +# include "genxml/gen12_pack.h" #else # error "Need to add a pack header include for this gen" #endif diff -Nru mesa-19.2.8/src/intel/genxml/meson.build mesa-20.0.8/src/intel/genxml/meson.build --- mesa-19.2.8/src/intel/genxml/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/genxml/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -29,6 +29,7 @@ 'gen9.xml', 'gen10.xml', 'gen11.xml', + 'gen12.xml', ] genX_xml_h = custom_target( diff -Nru mesa-19.2.8/src/intel/isl/isl.c mesa-20.0.8/src/intel/isl/isl.c --- mesa-19.2.8/src/intel/isl/isl.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,6 +33,7 @@ #include "isl_gen7.h" #include "isl_gen8.h" #include "isl_gen9.h" +#include "isl_gen12.h" #include "isl_priv.h" void @@ -94,6 +95,56 @@ fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buf); } +static void +isl_device_setup_mocs(struct isl_device *dev) +{ + if (dev->info->gen >= 12) { + /* TODO: Set PTE to MOCS 61 when the kernel is ready */ + /* TC=1/LLC Only, LeCC=1/Uncacheable, LRUM=0, L3CC=1/Uncacheable */ + dev->mocs.external = 3 << 1; + /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ + dev->mocs.internal = 2 << 1; + } else if (dev->info->gen >= 9) { + /* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */ + dev->mocs.external = 1 << 1; + /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ + dev->mocs.internal = 2 << 1; + } else if (dev->info->gen >= 8) { + /* MEMORY_OBJECT_CONTROL_STATE: + * .MemoryTypeLLCeLLCCacheabilityControl = UCwithFenceifcoherentcycle, + * .TargetCache = L3DefertoPATforLLCeLLCselection, + * .AgeforQUADLRU = 0 + */ + dev->mocs.external = 0x18; + /* MEMORY_OBJECT_CONTROL_STATE: + * .MemoryTypeLLCeLLCCacheabilityControl = WB, + * .TargetCache = L3DefertoPATforLLCeLLCselection, + * .AgeforQUADLRU = 0 + */ + dev->mocs.internal = 0x78; + } else if (dev->info->gen >= 7) { + if (dev->info->is_haswell) { + /* MEMORY_OBJECT_CONTROL_STATE: + * .LLCeLLCCacheabilityControlLLCCC = 0, + * .L3CacheabilityControlL3CC = 1, + */ + dev->mocs.internal = 1; + dev->mocs.external = 1; + } else { + /* MEMORY_OBJECT_CONTROL_STATE: + * .GraphicsDataTypeGFDT = 0, + * .LLCCacheabilityControlLLCCC = 0, + * .L3CacheabilityControlL3CC = 1, + */ + dev->mocs.internal = 1; + dev->mocs.external = 1; + } + } else { + dev->mocs.internal = 0; + dev->mocs.external = 0; + } +} + void isl_device_init(struct isl_device *dev, const struct gen_device_info *info, @@ -171,6 +222,8 @@ dev->ds.stencil_offset = 0; dev->ds.hiz_offset = 0; } + + isl_device_setup_mocs(dev); } /** @@ -310,6 +363,29 @@ phys_B = isl_extent2d(128, 32); break; + case ISL_TILING_GEN12_CCS: + /* From the Bspec, Gen Graphics > Gen12 > Memory Data Formats > Memory + * Compression > Memory Compression - Gen12: + * + * 4 bits of auxiliary plane data are required for 2 cachelines of + * main surface data. This results in a single cacheline of auxiliary + * plane data mapping to 4 4K pages of main surface data for the 4K + * pages (tile Y ) and 1 64K Tile Ys page. + * + * The Y-tiled pairing bit of 9 shown in the table below that Bspec + * section expresses that the 2 cachelines of main surface data are + * horizontally adjacent. + * + * TODO: Handle Ys, Yf and their pairing bits. + * + * Therefore, each CCS cacheline represents a 512Bx32 row area and each + * element represents a 32Bx4 row area. + */ + assert(format_bpb == 4); + logical_el = isl_extent2d(16, 8); + phys_B = isl_extent2d(64, 1); + break; + default: unreachable("not reached"); } /* end switch */ @@ -383,15 +459,19 @@ if (info->usage & ISL_SURF_USAGE_HIZ_BIT) { assert(info->format == ISL_FORMAT_HIZ); assert(tiling_flags == ISL_TILING_HIZ_BIT); - *tiling = ISL_TILING_HIZ; + *tiling = isl_tiling_flag_to_enum(tiling_flags); return true; } /* CCS surfaces always use the CCS tiling */ if (info->usage & ISL_SURF_USAGE_CCS_BIT) { assert(isl_format_get_layout(info->format)->txc == ISL_TXC_CCS); - assert(tiling_flags == ISL_TILING_CCS_BIT); - *tiling = ISL_TILING_CCS; + UNUSED bool ivb_ccs = ISL_DEV_GEN(dev) < 12 && + tiling_flags == ISL_TILING_CCS_BIT; + UNUSED bool tgl_ccs = ISL_DEV_GEN(dev) >= 12 && + tiling_flags == ISL_TILING_GEN12_CCS_BIT; + assert(ivb_ccs != tgl_ccs); + *tiling = isl_tiling_flag_to_enum(tiling_flags); return true; } @@ -628,16 +708,25 @@ if (ISL_DEV_GEN(dev) == 6) { /* HiZ surfaces on Sandy Bridge are packed tightly. */ *image_align_el = isl_extent3d(1, 1, 1); - } else { + } else if (ISL_DEV_GEN(dev) < 12) { /* On gen7+, HiZ surfaces are always aligned to 16x8 pixels in the * primary surface which works out to 2x2 HiZ elments. */ *image_align_el = isl_extent3d(2, 2, 1); + } else { + /* On gen12+, HiZ surfaces are always aligned to 16x16 pixels in the + * primary surface which works out to 2x4 HiZ elments. + * TODO: Verify + */ + *image_align_el = isl_extent3d(2, 4, 1); } return; } - if (ISL_DEV_GEN(dev) >= 9) { + if (ISL_DEV_GEN(dev) >= 12) { + isl_gen12_choose_image_alignment_el(dev, info, tiling, dim_layout, + msaa_layout, image_align_el); + } else if (ISL_DEV_GEN(dev) >= 9) { isl_gen9_choose_image_alignment_el(dev, info, tiling, dim_layout, msaa_layout, image_align_el); } else if (ISL_DEV_GEN(dev) >= 8) { @@ -915,7 +1004,8 @@ assert(pitch_sa_rows % fmtl->bh == 0); uint32_t pitch_el_rows = pitch_sa_rows / fmtl->bh; - if (ISL_DEV_GEN(dev) >= 9 && fmtl->txc == ISL_TXC_CCS) { + if (ISL_DEV_GEN(dev) >= 9 && ISL_DEV_GEN(dev) <= 11 && + fmtl->txc == ISL_TXC_CCS) { /* * From the Sky Lake PRM Vol 7, "MCS Buffer for Render Target(s)" (p. 632): * @@ -933,6 +1023,8 @@ * The first restriction is already handled by isl_choose_image_alignment_el * but the second restriction, which is an extension of the first, only * applies to qpitch and must be applied here. + * + * The second restriction disappears on Gen12. */ assert(fmtl->bh == 4); pitch_el_rows = isl_align(pitch_el_rows, 256 / 4); @@ -1269,11 +1361,31 @@ } static uint32_t -isl_calc_row_pitch_alignment(const struct isl_surf_init_info *surf_info, +isl_calc_row_pitch_alignment(const struct isl_device *dev, + const struct isl_surf_init_info *surf_info, const struct isl_tile_info *tile_info) { - if (tile_info->tiling != ISL_TILING_LINEAR) + if (tile_info->tiling != ISL_TILING_LINEAR) { + /* According to BSpec: 44930, Gen12's CCS-compressed surface pitches must + * be 512B-aligned. CCS is only support on Y tilings. + * + * Only consider 512B alignment when : + * - AUX is not explicitly disabled + * - the caller has specified no pitch + * + * isl_surf_get_ccs_surf() will check that the main surface alignment + * matches CCS expectations. + */ + if (ISL_DEV_GEN(dev) >= 12 && + isl_format_supports_ccs_e(dev->info, surf_info->format) && + tile_info->tiling != ISL_TILING_X && + !(surf_info->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT) && + surf_info->row_pitch_B == 0) { + return isl_align(tile_info->phys_extent_B.width, 512); + } + return tile_info->phys_extent_B.width; + } /* From the Broadwel PRM >> Volume 2d: Command Reference: Structures >> * RENDER_SURFACE_STATE Surface Pitch (p349): @@ -1291,16 +1403,27 @@ */ const struct isl_format_layout *fmtl = isl_format_get_layout(surf_info->format); const uint32_t bs = fmtl->bpb / 8; + uint32_t alignment; if (surf_info->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) { if (isl_format_is_yuv(surf_info->format)) { - return 2 * bs; + alignment = 2 * bs; } else { - return bs; + alignment = bs; } + } else { + alignment = 1; } - return 1; + /* From the Broadwell PRM >> Volume 2c: Command Reference: Registers >> + * PRI_STRIDE Stride (p1254): + * + * "When using linear memory, this must be at least 64 byte aligned." + */ + if (surf_info->usage & ISL_SURF_USAGE_DISPLAY_BIT) + alignment = isl_align(alignment, 64); + + return alignment; } static uint32_t @@ -1331,8 +1454,12 @@ isl_align_div(phys_total_el->w * tile_el_scale, tile_info->logical_extent_el.width); - assert(alignment_B == tile_info->phys_extent_B.width); - return total_w_tl * tile_info->phys_extent_B.width; + /* In some cases the alignment of the pitch might be > to the tile size + * (for example Gen12 CCS requires 512B alignment while the tile's width + * can be 128B), so align the row pitch to the alignment. + */ + assert(alignment_B >= tile_info->phys_extent_B.width); + return isl_align(total_w_tl * tile_info->phys_extent_B.width, alignment_B); } static uint32_t @@ -1376,24 +1503,23 @@ uint32_t *out_row_pitch_B) { uint32_t alignment_B = - isl_calc_row_pitch_alignment(surf_info, tile_info); + isl_calc_row_pitch_alignment(dev, surf_info, tile_info); const uint32_t min_row_pitch_B = isl_calc_min_row_pitch(dev, surf_info, tile_info, phys_total_el, alignment_B); - uint32_t row_pitch_B = min_row_pitch_B; - if (surf_info->row_pitch_B != 0) { - row_pitch_B = surf_info->row_pitch_B; - - if (row_pitch_B < min_row_pitch_B) + if (surf_info->row_pitch_B < min_row_pitch_B) return false; - if (row_pitch_B % alignment_B != 0) + if (surf_info->row_pitch_B % alignment_B != 0) return false; } + const uint32_t row_pitch_B = + surf_info->row_pitch_B != 0 ? surf_info->row_pitch_B : min_row_pitch_B; + const uint32_t row_pitch_tl = row_pitch_B / tile_info->phys_extent_B.width; if (row_pitch_B == 0) @@ -1532,6 +1658,19 @@ tile_info.phys_extent_B.height; assert(isl_is_pow2(info->min_alignment_B) && isl_is_pow2(tile_size_B)); base_alignment_B = MAX(info->min_alignment_B, tile_size_B); + + /* The diagram in the Bspec section Memory Compression - Gen12, shows + * that the CCS is indexed in 256B chunks. However, the + * PLANE_AUX_DIST::Auxiliary Surface Distance field is in units of 4K + * pages. We currently don't assign the usage field like we do for main + * surfaces, so just use 4K for now. + */ + if (tiling == ISL_TILING_GEN12_CCS) + base_alignment_B = MAX(base_alignment_B, 4096); + } + + if (ISL_DEV_GEN(dev) >= 12) { + base_alignment_B = MAX(base_alignment_B, 64 * 1024); } if (ISL_DEV_GEN(dev) < 9) { @@ -1601,6 +1740,9 @@ { assert(ISL_DEV_GEN(dev) >= 5 && ISL_DEV_USE_SEPARATE_STENCIL(dev)); + if (!isl_surf_usage_is_depth(surf->usage)) + return false; + /* HiZ only works with Y-tiled depth buffers */ if (!isl_tiling_is_any_y(surf->tiling)) return false; @@ -1705,16 +1847,19 @@ const struct isl_surf *surf, struct isl_surf *mcs_surf) { + /* It must be multisampled with an array layout */ + if (surf->msaa_layout != ISL_MSAA_LAYOUT_ARRAY) + return false; + + if (mcs_surf->size_B > 0) + return false; + /* The following are true of all multisampled surfaces */ assert(surf->samples > 1); assert(surf->dim == ISL_SURF_DIM_2D); assert(surf->levels == 1); assert(surf->logical_level0_px.depth == 1); - /* It must be multisampled with an array layout */ - if (surf->msaa_layout != ISL_MSAA_LAYOUT_ARRAY) - return false; - /* From the Ivy Bridge PRM, Vol4 Part1 p77 ("MCS Enable"): * * This field must be set to 0 for all SINT MSRTs when all RT channels @@ -1765,10 +1910,24 @@ bool isl_surf_get_ccs_surf(const struct isl_device *dev, const struct isl_surf *surf, - struct isl_surf *ccs_surf, + struct isl_surf *aux_surf, + struct isl_surf *extra_aux_surf, uint32_t row_pitch_B) { - assert(surf->samples == 1 && surf->msaa_layout == ISL_MSAA_LAYOUT_NONE); + assert(aux_surf); + + /* An uninitialized surface is needed to get a CCS surface. */ + if (aux_surf->size_B > 0 && + (extra_aux_surf == NULL || extra_aux_surf->size_B > 0)) { + return false; + } + + /* A surface can't have two CCS surfaces. */ + if (aux_surf->usage & ISL_SURF_USAGE_CCS_BIT) + return false; + + if (ISL_DEV_GEN(dev) < 12 && surf->samples > 1) + return false; /* CCS support does not exist prior to Gen7 */ if (ISL_DEV_GEN(dev) <= 6) @@ -1777,6 +1936,23 @@ if (surf->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT) return false; + /* Allow CCS for single-sampled stencil buffers Gen12+. */ + if (isl_surf_usage_is_stencil(surf->usage) && + (ISL_DEV_GEN(dev) < 12 || surf->samples > 1)) + return false; + + /* [TGL+] CCS can only be added to a non-D16-formatted depth buffer if it + * has HiZ. If not for GEN:BUG:1406512483 "deprecate compression enable + * states", D16 would be supported. Supporting D16 requires being able to + * specify that the control surface is present and simultaneously disabling + * compression. The above bug makes it so that it's not possible to specify + * this configuration. + */ + if (isl_surf_usage_is_depth(surf->usage) && (aux_surf->size_B == 0 || + ISL_DEV_GEN(dev) < 12 || surf->format == ISL_FORMAT_R16_UNORM)) { + return false; + } + /* The PRM doesn't say this explicitly, but fast-clears don't appear to * work for 3D textures until gen9 where the layout of 3D textures changes * to match 2D array textures. @@ -1801,9 +1977,48 @@ (surf->levels > 1 || surf->logical_level0_px.array_len > 1)) return false; + /* On Gen12, 8BPP surfaces cannot be compressed if any level is not + * 32Bx4row-aligned. For now, just reject the cases where alignment + * matters. + */ + if (ISL_DEV_GEN(dev) >= 12 && + isl_format_get_layout(surf->format)->bpb == 8 && surf->levels >= 3) { + isl_finishme("%s:%s: CCS for 8BPP textures with 3+ miplevels is " + "disabled, but support for more levels is possible.", + __FILE__, __func__); + return false; + } + + /* On Gen12, all CCS-compressed surface pitches must be multiples of 512B. + */ + if (ISL_DEV_GEN(dev) >= 12 && surf->row_pitch_B % 512 != 0) + return false; + if (isl_format_is_compressed(surf->format)) return false; + /* According to GEN:BUG:1406738321, 3D textures need a blit to a new + * surface in order to perform a resolve. For now, just disable CCS. + */ + if (ISL_DEV_GEN(dev) >= 12 && surf->dim == ISL_SURF_DIM_3D) { + isl_finishme("%s:%s: CCS for 3D textures is disabled, but a workaround" + " is available.", __FILE__, __func__); + return false; + } + + /* GEN:BUG:1207137018 + * + * TODO: implement following workaround currently covered by the restriction + * above. If following conditions are met: + * + * - RENDER_SURFACE_STATE.Surface Type == 3D + * - RENDER_SURFACE_STATE.Auxiliary Surface Mode != AUX_NONE + * - RENDER_SURFACE_STATE.Tiled ResourceMode is TYF or TYS + * + * Set the value of RENDER_SURFACE_STATE.Mip Tail Start LOD to a mip that + * larger than those present in the surface (i.e. 15) + */ + /* TODO: More conditions where it can fail. */ /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render @@ -1820,7 +2035,31 @@ * TiledY/TileYs/TileYf non-MSRTs only. */ enum isl_format ccs_format; - if (ISL_DEV_GEN(dev) >= 9) { + if (ISL_DEV_GEN(dev) >= 12) { + /* TODO: Handle the other tiling formats */ + if (surf->tiling != ISL_TILING_Y0) + return false; + + /* BSpec 44930: + * + * Linear CCS is only allowed for Untyped Buffers but only via HDC + * Data-Port messages. + * + * We probably want to limit linear CCS to storage usage and check that + * the shaders actually use only untyped messages. + */ + assert(surf->tiling != ISL_TILING_LINEAR); + + switch (isl_format_get_layout(surf->format)->bpb) { + case 8: ccs_format = ISL_FORMAT_GEN12_CCS_8BPP_Y0; break; + case 16: ccs_format = ISL_FORMAT_GEN12_CCS_16BPP_Y0; break; + case 32: ccs_format = ISL_FORMAT_GEN12_CCS_32BPP_Y0; break; + case 64: ccs_format = ISL_FORMAT_GEN12_CCS_64BPP_Y0; break; + case 128: ccs_format = ISL_FORMAT_GEN12_CCS_128BPP_Y0; break; + default: + return false; + } + } else if (ISL_DEV_GEN(dev) >= 9) { if (!isl_tiling_is_any_y(surf->tiling)) return false; @@ -1851,18 +2090,41 @@ return false; } - return isl_surf_init(dev, ccs_surf, - .dim = surf->dim, - .format = ccs_format, - .width = surf->logical_level0_px.width, - .height = surf->logical_level0_px.height, - .depth = surf->logical_level0_px.depth, - .levels = surf->levels, - .array_len = surf->logical_level0_px.array_len, - .samples = 1, - .row_pitch_B = row_pitch_B, - .usage = ISL_SURF_USAGE_CCS_BIT, - .tiling_flags = ISL_TILING_CCS_BIT); + if (ISL_DEV_GEN(dev) >= 12) { + /* On Gen12, the CCS is a scaled-down version of the main surface. We + * model this as the CCS compressing a 2D-view of the entire surface. + */ + struct isl_surf *ccs_surf = + aux_surf->size_B > 0 ? extra_aux_surf : aux_surf; + const bool ok = + isl_surf_init(dev, ccs_surf, + .dim = ISL_SURF_DIM_2D, + .format = ccs_format, + .width = isl_surf_get_row_pitch_el(surf), + .height = surf->size_B / surf->row_pitch_B, + .depth = 1, + .levels = 1, + .array_len = 1, + .samples = 1, + .row_pitch_B = row_pitch_B, + .usage = ISL_SURF_USAGE_CCS_BIT, + .tiling_flags = ISL_TILING_GEN12_CCS_BIT); + assert(!ok || ccs_surf->size_B == surf->size_B / 256); + return ok; + } else { + return isl_surf_init(dev, aux_surf, + .dim = surf->dim, + .format = ccs_format, + .width = surf->logical_level0_px.width, + .height = surf->logical_level0_px.height, + .depth = surf->logical_level0_px.depth, + .levels = surf->levels, + .array_len = surf->logical_level0_px.array_len, + .samples = 1, + .row_pitch_B = row_pitch_B, + .usage = ISL_SURF_USAGE_CCS_BIT, + .tiling_flags = ISL_TILING_CCS_BIT); + } } #define isl_genX_call(dev, func, ...) \ @@ -1900,6 +2162,9 @@ case 11: \ isl_gen11_##func(__VA_ARGS__); \ break; \ + case 12: \ + isl_gen12_##func(__VA_ARGS__); \ + break; \ default: \ assert(!"Unknown hardware generation"); \ } @@ -1934,7 +2199,7 @@ isl_buffer_fill_state_s(const struct isl_device *dev, void *state, const struct isl_buffer_fill_state_info *restrict info) { - isl_genX_call(dev, buffer_fill_state_s, state, info); + isl_genX_call(dev, buffer_fill_state_s, dev, state, info); } void @@ -2285,6 +2550,56 @@ } void +isl_surf_get_image_range_B_tile(const struct isl_surf *surf, + uint32_t level, + uint32_t logical_array_layer, + uint32_t logical_z_offset_px, + uint32_t *start_tile_B, + uint32_t *end_tile_B) +{ + uint32_t start_x_offset_el, start_y_offset_el; + isl_surf_get_image_offset_el(surf, level, logical_array_layer, + logical_z_offset_px, + &start_x_offset_el, + &start_y_offset_el); + + /* Compute the size of the subimage in surface elements */ + const uint32_t subimage_w_sa = isl_minify(surf->phys_level0_sa.w, level); + const uint32_t subimage_h_sa = isl_minify(surf->phys_level0_sa.h, level); + const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format); + const uint32_t subimage_w_el = isl_align_div_npot(subimage_w_sa, fmtl->bw); + const uint32_t subimage_h_el = isl_align_div_npot(subimage_h_sa, fmtl->bh); + + /* Find the last pixel */ + uint32_t end_x_offset_el = start_x_offset_el + subimage_w_el - 1; + uint32_t end_y_offset_el = start_y_offset_el + subimage_h_el - 1; + + UNUSED uint32_t x_offset_el, y_offset_el; + isl_tiling_get_intratile_offset_el(surf->tiling, fmtl->bpb, + surf->row_pitch_B, + start_x_offset_el, + start_y_offset_el, + start_tile_B, + &x_offset_el, + &y_offset_el); + + isl_tiling_get_intratile_offset_el(surf->tiling, fmtl->bpb, + surf->row_pitch_B, + end_x_offset_el, + end_y_offset_el, + end_tile_B, + &x_offset_el, + &y_offset_el); + + /* We want the range we return to be exclusive but the tile containing the + * last pixel (what we just calculated) is inclusive. Add one. + */ + (*end_tile_B)++; + + assert(*end_tile_B <= surf->size_B); +} + +void isl_surf_get_image_surf(const struct isl_device *dev, const struct isl_surf *surf, uint32_t level, @@ -2419,6 +2734,16 @@ } bool +isl_surf_supports_hiz_ccs_wt(const struct gen_device_info *dev, + const struct isl_surf *surf, + enum isl_aux_usage aux_usage) +{ + return aux_usage == ISL_AUX_USAGE_HIZ_CCS && + surf->samples == 1 && + surf->usage & ISL_SURF_USAGE_TEXTURE_BIT; +} + +bool isl_swizzle_supports_rendering(const struct gen_device_info *devinfo, struct isl_swizzle swizzle) { diff -Nru mesa-19.2.8/src/intel/isl/isl_drm.c mesa-20.0.8/src/intel/isl/isl_drm.c --- mesa-19.2.8/src/intel/isl/isl_drm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_drm.c 2020-06-12 01:21:17.000000000 +0000 @@ -41,13 +41,14 @@ return I915_TILING_X; case ISL_TILING_Y0: + case ISL_TILING_HIZ: + case ISL_TILING_CCS: return I915_TILING_Y; case ISL_TILING_W: case ISL_TILING_Yf: case ISL_TILING_Ys: - case ISL_TILING_HIZ: - case ISL_TILING_CCS: + case ISL_TILING_GEN12_CCS: return I915_TILING_NONE; } diff -Nru mesa-19.2.8/src/intel/isl/isl_emit_depth_stencil.c mesa-20.0.8/src/intel/isl/isl_emit_depth_stencil.c --- mesa-19.2.8/src/intel/isl/isl_emit_depth_stencil.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_emit_depth_stencil.c 2020-06-12 01:21:17.000000000 +0000 @@ -71,11 +71,15 @@ db.SurfaceFormat = isl_surf_get_depth_format(dev, info->depth_surf); db.Width = info->depth_surf->logical_level0_px.width - 1; db.Height = info->depth_surf->logical_level0_px.height - 1; + if (db.SurfaceType == SURFTYPE_3D) + db.Depth = info->depth_surf->logical_level0_px.depth - 1; } else if (info->stencil_surf) { db.SurfaceType = isl_to_gen_ds_surftype[info->stencil_surf->dim]; db.SurfaceFormat = D32_FLOAT; db.Width = info->stencil_surf->logical_level0_px.width - 1; db.Height = info->stencil_surf->logical_level0_px.height - 1; + if (db.SurfaceType == SURFTYPE_3D) + db.Depth = info->stencil_surf->logical_level0_px.depth - 1; } else { db.SurfaceType = SURFTYPE_NULL; db.SurfaceFormat = D32_FLOAT; @@ -83,9 +87,23 @@ if (info->depth_surf || info->stencil_surf) { /* These are based entirely on the view */ - db.Depth = db.RenderTargetViewExtent = info->view->array_len - 1; + db.RenderTargetViewExtent = info->view->array_len - 1; db.LOD = info->view->base_level; db.MinimumArrayElement = info->view->base_array_layer; + + /* From the Haswell PRM docs for 3DSTATE_DEPTH_BUFFER::Depth + * + * "This field specifies the total number of levels for a volume + * texture or the number of array elements allowed to be accessed + * starting at the Minimum Array Element for arrayed surfaces. If the + * volume texture is MIP-mapped, this field specifies the depth of + * the base MIP level." + * + * For 3D surfaces, we set it to the correct depth above. For non-3D + * surfaces, this is the same as RenderTargetViewExtent. + */ + if (db.SurfaceType != SURFTYPE_3D) + db.Depth = db.RenderTargetViewExtent; } if (info->depth_surf) { @@ -109,6 +127,11 @@ db.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(info->depth_surf) >> 2; #endif + +#if GEN_GEN >= 12 + db.ControlSurfaceEnable = db.DepthBufferCompressionEnable = + info->hiz_usage == ISL_AUX_USAGE_HIZ_CCS; +#endif } #if GEN_GEN == 5 || GEN_GEN == 6 @@ -130,10 +153,21 @@ #endif if (info->stencil_surf) { -#if GEN_GEN >= 7 +#if GEN_GEN >= 7 && GEN_GEN < 12 db.StencilWriteEnable = true; #endif -#if GEN_GEN >= 8 || GEN_IS_HASWELL +#if GEN_GEN >= 12 + sb.StencilWriteEnable = true; + sb.SurfaceType = SURFTYPE_2D; + sb.Width = info->stencil_surf->logical_level0_px.width - 1; + sb.Height = info->stencil_surf->logical_level0_px.height - 1; + sb.Depth = sb.RenderTargetViewExtent = info->view->array_len - 1; + sb.SurfLOD = info->view->base_level; + sb.MinimumArrayElement = info->view->base_array_layer; + sb.StencilCompressionEnable = + info->stencil_aux_usage == ISL_AUX_USAGE_CCS_E; + sb.ControlSurfaceEnable = sb.StencilCompressionEnable; +#elif GEN_GEN >= 8 || GEN_IS_HASWELL sb.StencilBufferEnable = true; #endif sb.SurfaceBaseAddress = info->stencil_address; @@ -145,6 +179,19 @@ sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(info->stencil_surf) >> 2; #endif + } else { +#if GEN_GEN >= 12 + sb.SurfaceType = SURFTYPE_NULL; + + /* The docs seem to indicate that if surf-type is null, then we may need + * to match the depth-buffer value for `Depth`. It may be a + * documentation bug, since the other fields don't require this. + * + * TODO: Confirm documentation and remove seeting of `Depth` if not + * required. + */ + sb.Depth = db.Depth; +#endif } #if GEN_GEN >= 6 @@ -156,13 +203,22 @@ }; assert(info->hiz_usage == ISL_AUX_USAGE_NONE || - info->hiz_usage == ISL_AUX_USAGE_HIZ); - if (info->hiz_usage == ISL_AUX_USAGE_HIZ) { + info->hiz_usage == ISL_AUX_USAGE_HIZ || + info->hiz_usage == ISL_AUX_USAGE_HIZ_CCS); + if (info->hiz_usage == ISL_AUX_USAGE_HIZ || + info->hiz_usage == ISL_AUX_USAGE_HIZ_CCS) { + assert(GEN_GEN >= 12 || info->hiz_usage == ISL_AUX_USAGE_HIZ); db.HierarchicalDepthBufferEnable = true; hiz.SurfaceBaseAddress = info->hiz_address; hiz.MOCS = info->mocs; hiz.SurfacePitch = info->hiz_surf->row_pitch_B - 1; +#if GEN_GEN >= 12 + hiz.HierarchicalDepthBufferWriteThruEnable = + isl_surf_supports_hiz_ccs_wt(dev->info, info->depth_surf, + info->hiz_usage); +#endif + #if GEN_GEN >= 8 /* From the SKL PRM Vol2a: * diff -Nru mesa-19.2.8/src/intel/isl/isl_format.c mesa-20.0.8/src/intel/isl/isl_format.c --- mesa-19.2.8/src/intel/isl/isl_format.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_format.c 2020-06-12 01:21:17.000000000 +0000 @@ -147,7 +147,7 @@ SF( Y, Y, x, x, Y, Y, x, x, x, x, x, 100, B8G8R8A8_UNORM_SRGB) /* smpl filt shad CK RT AB VB SO color TW TR ccs_e */ SF( Y, Y, x, x, Y, Y, Y, x, 60, 70, x, 100, R10G10B10A2_UNORM) - SF( Y, Y, x, x, x, x, x, x, 60, x, x, x, R10G10B10A2_UNORM_SRGB) + SF( Y, Y, x, x, x, x, x, x, 60, x, x, 120, R10G10B10A2_UNORM_SRGB) SF( Y, x, x, x, Y, x, Y, x, x, 70, x, 100, R10G10B10A2_UINT) SF( Y, Y, x, x, x, x, Y, x, x, x, x, x, R10G10B10_SNORM_A2_UNORM) SF( Y, Y, x, x, Y, Y, Y, x, 60, 70, 110, 90, R8G8B8A8_UNORM) @@ -163,10 +163,11 @@ SF( Y, Y, x, x, Y, Y, 75, x, 60, 70, x, 100, B10G10R10A2_UNORM) SF( Y, Y, x, x, Y, Y, x, x, 60, x, x, 100, B10G10R10A2_UNORM_SRGB) SF( Y, Y, x, x, Y, Y, Y, x, x, 70, x, 100, R11G11B10_FLOAT) + SF(120, 120, x, x, 120, 120, x, x, x, x, x, 120, R10G10B10_FLOAT_A2_UNORM) SF( Y, x, x, x, Y, x, Y, Y, x, 70, 70, 90, R32_SINT) SF( Y, x, x, x, Y, x, Y, Y, x, 70, 70, 90, R32_UINT) SF( Y, 50, Y, x, Y, Y, Y, Y, x, 70, 70, 90, R32_FLOAT) - SF( Y, 50, Y, x, x, x, x, x, x, x, x, x, R24_UNORM_X8_TYPELESS) + SF( Y, 50, Y, x, x, x, x, x, x, x, x, 120, R24_UNORM_X8_TYPELESS) SF( Y, x, x, x, x, x, x, x, x, x, x, x, X24_TYPELESS_G8_UINT) SF( Y, Y, x, x, x, x, x, x, x, x, x, x, L16A16_UNORM) SF( Y, 50, Y, x, x, x, x, x, x, x, x, x, I24X8_UNORM) @@ -192,21 +193,21 @@ SF( x, x, x, x, x, x, Y, x, x, x, x, x, R16G16_USCALED) SF( x, x, x, x, x, x, Y, x, x, x, x, x, R32_SSCALED) SF( x, x, x, x, x, x, Y, x, x, x, x, x, R32_USCALED) - SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, x, B5G6R5_UNORM) - SF( Y, Y, x, x, Y, Y, x, x, x, x, x, x, B5G6R5_UNORM_SRGB) - SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, x, B5G5R5A1_UNORM) - SF( Y, Y, x, x, Y, Y, x, x, x, x, x, x, B5G5R5A1_UNORM_SRGB) - SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, x, B4G4R4A4_UNORM) - SF( Y, Y, x, x, Y, Y, x, x, x, x, x, x, B4G4R4A4_UNORM_SRGB) - SF( Y, Y, x, x, Y, Y, Y, x, x, 70, 110, x, R8G8_UNORM) - SF( Y, Y, x, Y, Y, 60, Y, x, x, 70, 110, x, R8G8_SNORM) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, x, R8G8_SINT) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, x, R8G8_UINT) - SF( Y, Y, Y, x, Y, 45, Y, x, 70, 70, 110, x, R16_UNORM) - SF( Y, Y, x, x, Y, 60, Y, x, x, 70, 110, x, R16_SNORM) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, x, R16_SINT) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, x, R16_UINT) - SF( Y, Y, x, x, Y, Y, Y, x, x, 70, 90, x, R16_FLOAT) + SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, 120, B5G6R5_UNORM) + SF( Y, Y, x, x, Y, Y, x, x, x, x, x, 120, B5G6R5_UNORM_SRGB) + SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, 120, B5G5R5A1_UNORM) + SF( Y, Y, x, x, Y, Y, x, x, x, x, x, 120, B5G5R5A1_UNORM_SRGB) + SF( Y, Y, x, Y, Y, Y, x, x, x, 70, x, 120, B4G4R4A4_UNORM) + SF( Y, Y, x, x, Y, Y, x, x, x, x, x, 120, B4G4R4A4_UNORM_SRGB) + SF( Y, Y, x, x, Y, Y, Y, x, x, 70, 110, 120, R8G8_UNORM) + SF( Y, Y, x, Y, Y, 60, Y, x, x, 70, 110, 120, R8G8_SNORM) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, 120, R8G8_SINT) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, 120, R8G8_UINT) + SF( Y, Y, Y, x, Y, 45, Y, x, 70, 70, 110, 120, R16_UNORM) + SF( Y, Y, x, x, Y, 60, Y, x, x, 70, 110, 120, R16_SNORM) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, 120, R16_SINT) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, 120, R16_UINT) + SF( Y, Y, x, x, Y, Y, Y, x, x, 70, 90, 120, R16_FLOAT) SF( 50, 50, x, x, x, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE0) SF( 50, 50, x, x, x, x, x, x, x, x, x, x, A8P8_UNORM_PALETTE1) SF( Y, Y, Y, x, x, x, x, x, x, x, x, x, I16_UNORM) @@ -218,8 +219,8 @@ SF( Y, Y, Y, x, x, x, x, x, x, x, x, x, A16_FLOAT) SF( 45, 45, x, x, x, x, x, x, x, x, x, x, L8A8_UNORM_SRGB) SF( Y, Y, x, Y, x, x, x, x, x, x, x, x, R5G5_SNORM_B6_UNORM) - SF( x, x, x, x, Y, Y, x, x, x, 70, x, x, B5G5R5X1_UNORM) - SF( x, x, x, x, Y, Y, x, x, x, x, x, x, B5G5R5X1_UNORM_SRGB) + SF( x, x, x, x, Y, Y, x, x, x, 70, x, 120, B5G5R5X1_UNORM) + SF( x, x, x, x, Y, Y, x, x, x, x, x, 120, B5G5R5X1_UNORM_SRGB) SF( x, x, x, x, x, x, Y, x, x, x, x, x, R8G8_SSCALED) SF( x, x, x, x, x, x, Y, x, x, x, x, x, R8G8_USCALED) /* smpl filt shad CK RT AB VB SO color TW TR ccs_e */ @@ -227,19 +228,19 @@ SF( x, x, x, x, x, x, Y, x, x, x, x, x, R16_USCALED) SF( 50, 50, x, x, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE0) SF( 50, 50, x, x, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE1) - SF( x, x, x, x, x, x, x, x, x, x, x, x, A1B5G5R5_UNORM) + SF(120, 120, x, x, 120, 120, x, x, x, x, x, 120, A1B5G5R5_UNORM) /* According to the PRM, A4B4G4R4_UNORM isn't supported until Sky Lake * but empirical testing indicates that at least sampling works just fine * on Broadwell. */ - SF( 80, 80, x, x, 90, x, x, x, x, x, x, x, A4B4G4R4_UNORM) + SF( 80, 80, x, x, 90, 120, x, x, x, x, x, 120, A4B4G4R4_UNORM) SF( 90, x, x, x, x, x, x, x, x, x, x, x, L8A8_UINT) SF( 90, x, x, x, x, x, x, x, x, x, x, x, L8A8_SINT) - SF( Y, Y, x, 45, Y, Y, Y, x, x, 70, 110, x, R8_UNORM) - SF( Y, Y, x, x, Y, 60, Y, x, x, 70, 110, x, R8_SNORM) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, x, R8_SINT) - SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, x, R8_UINT) - SF( Y, Y, x, Y, Y, Y, x, x, x, 70, 110, x, A8_UNORM) + SF( Y, Y, x, 45, Y, Y, Y, x, x, 70, 110, 120, R8_UNORM) + SF( Y, Y, x, x, Y, 60, Y, x, x, 70, 110, 120, R8_SNORM) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 90, 120, R8_SINT) + SF( Y, x, x, x, Y, x, Y, x, x, 70, 75, 120, R8_UINT) + SF( Y, Y, x, Y, Y, Y, x, x, x, 70, 110, 120, A8_UNORM) SF( Y, Y, x, x, x, x, x, x, x, x, x, x, I8_UNORM) SF( Y, Y, x, Y, x, x, x, x, x, x, x, x, L8_UNORM) SF( Y, Y, x, x, x, x, x, x, x, x, x, x, P4A4_UNORM_PALETTE0) @@ -532,8 +533,10 @@ isl_format_supports_ccs_d(const struct gen_device_info *devinfo, enum isl_format format) { - /* Fast clears were first added on Ivy Bridge */ - if (devinfo->gen < 7) + /* Clear-only compression was first added on Ivy Bridge and was last + * implemented on Ice lake (see BSpec: 43862). + */ + if (devinfo->gen < 7 || devinfo->gen > 11) return false; if (!isl_format_supports_rendering(devinfo, format)) @@ -559,13 +562,9 @@ /* For simplicity, only report that a format supports CCS_E if blorp can * perform bit-for-bit copies with an image of that format while compressed. - * This allows ISL users to avoid having to resolve the image before - * performing such a copy. We may want to change this behavior in the - * future. - * - * R11G11B10_FLOAT has no equivalent UINT format. Given how blorp_copy - * currently works, bit-for-bit copy operations are not possible without an - * intermediate resolve. + * Unfortunately, R11G11B10_FLOAT is in a compression class of its own and + * there is no way to copy to/from it which doesn't potentially loose data + * if one of the bit patterns being copied isn't valid finite floats. */ if (format == ISL_FORMAT_R11G11B10_FLOAT) return false; diff -Nru mesa-19.2.8/src/intel/isl/isl_format_layout.csv mesa-20.0.8/src/intel/isl/isl_format_layout.csv --- mesa-19.2.8/src/intel/isl/isl_format_layout.csv 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_format_layout.csv 2020-06-12 01:21:17.000000000 +0000 @@ -128,6 +128,7 @@ B10G10R10A2_UNORM , 32, 1, 1, 1, un10, un10, un10, un2, , , , bgra, linear, B10G10R10A2_UNORM_SRGB , 32, 1, 1, 1, un10, un10, un10, un2, , , , bgra, srgb, R11G11B10_FLOAT , 32, 1, 1, 1, uf11, uf11, uf10, , , , , rgb, linear, +R10G10B10_FLOAT_A2_UNORM , 32, 1, 1, 1, uf10, uf10, uf10, un2, , , , rgba, linear, R32_SINT , 32, 1, 1, 1, si32, , , , , , , r, linear, R32_UINT , 32, 1, 1, 1, ui32, , , , , , , r, linear, R32_FLOAT , 32, 1, 1, 1, sf32, , , , , , , r, linear, @@ -342,3 +343,8 @@ GEN9_CCS_32BPP , 2, 8, 4, 1, , , , , , , , , , ccs GEN9_CCS_64BPP , 2, 4, 4, 1, , , , , , , , , , ccs GEN9_CCS_128BPP , 2, 2, 4, 1, , , , , , , , , , ccs +GEN12_CCS_8BPP_Y0 , 4, 32, 4, 1, , , , , , , , , , ccs +GEN12_CCS_16BPP_Y0 , 4, 16, 4, 1, , , , , , , , , , ccs +GEN12_CCS_32BPP_Y0 , 4, 8, 4, 1, , , , , , , , , , ccs +GEN12_CCS_64BPP_Y0 , 4, 4, 4, 1, , , , , , , , , , ccs +GEN12_CCS_128BPP_Y0 , 4, 2, 4, 1, , , , , , , , , , ccs diff -Nru mesa-19.2.8/src/intel/isl/isl_gen12.c mesa-20.0.8/src/intel/isl/isl_gen12.c --- mesa-19.2.8/src/intel/isl/isl_gen12.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_gen12.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "isl_gen9.h" +#include "isl_gen12.h" +#include "isl_priv.h" + +void +isl_gen12_choose_image_alignment_el(const struct isl_device *dev, + const struct isl_surf_init_info *restrict info, + enum isl_tiling tiling, + enum isl_dim_layout dim_layout, + enum isl_msaa_layout msaa_layout, + struct isl_extent3d *image_align_el) +{ + /* Handled by isl_choose_image_alignment_el */ + assert(info->format != ISL_FORMAT_HIZ); + + const struct isl_format_layout *fmtl = isl_format_get_layout(info->format); + if (fmtl->txc == ISL_TXC_CCS) { + /* This CCS compresses a 2D-view of the entire surface. */ + assert(info->levels == 1 && info->array_len == 1 && info->depth == 1); + *image_align_el = isl_extent3d(1, 1, 1); + return; + } + + if (isl_surf_usage_is_depth(info->usage)) { + /* The alignment parameters for depth buffers are summarized in the + * following table: + * + * Surface Format | MSAA | Align Width | Align Height + * -----------------+-------------+-------------+-------------- + * D16_UNORM | 1x, 4x, 16x | 8 | 8 + * ----------------+-------------+-------------+-------------- + * D16_UNORM | 2x, 8x | 16 | 4 + * ----------------+-------------+-------------+-------------- + * other | any | 8 | 4 + * -----------------+-------------+-------------+-------------- + */ + assert(isl_is_pow2(info->samples)); + *image_align_el = + info->format != ISL_FORMAT_R16_UNORM ? + isl_extent3d(8, 4, 1) : + (info->samples == 2 || info->samples == 8 ? + isl_extent3d(16, 4, 1) : isl_extent3d(8, 8, 1)); + } else if (isl_surf_usage_is_stencil(info->usage)) { + *image_align_el = isl_extent3d(16, 8, 1); + } else { + isl_gen9_choose_image_alignment_el(dev, info, tiling, dim_layout, + msaa_layout, image_align_el); + } +} diff -Nru mesa-19.2.8/src/intel/isl/isl_gen12.h mesa-20.0.8/src/intel/isl/isl_gen12.h --- mesa-19.2.8/src/intel/isl/isl_gen12.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_gen12.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ISL_GEN12_H +#define ISL_GEN12_H + +#include "isl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void +isl_gen12_choose_image_alignment_el(const struct isl_device *dev, + const struct isl_surf_init_info *restrict info, + enum isl_tiling tiling, + enum isl_dim_layout dim_layout, + enum isl_msaa_layout msaa_layout, + struct isl_extent3d *image_align_el); + +#ifdef __cplusplus +} +#endif + +#endif /* ISL_GEN12_H */ diff -Nru mesa-19.2.8/src/intel/isl/isl_gen7.c mesa-20.0.8/src/intel/isl/isl_gen7.c --- mesa-19.2.8/src/intel/isl/isl_gen7.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_gen7.c 2020-06-12 01:21:17.000000000 +0000 @@ -213,11 +213,14 @@ *flags &= ISL_TILING_ANY_Y_MASK; } - /* Separate stencil requires W tiling, and W tiling requires separate - * stencil. - */ if (isl_surf_usage_is_stencil(info->usage)) { - *flags &= ISL_TILING_W_BIT; + if (ISL_DEV_GEN(dev) >= 12) { + /* Stencil requires Y. */ + *flags &= ISL_TILING_ANY_Y_MASK; + } else { + /* Stencil requires W. */ + *flags &= ISL_TILING_W_BIT; + } } else { *flags &= ~ISL_TILING_W_BIT; } @@ -248,9 +251,19 @@ } if (info->usage & ISL_SURF_USAGE_DISPLAY_BIT) { - /* Before Skylake, the display engine does not accept Y */ - /* FINISHME[SKL]: Y tiling for display surfaces */ - *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT); + if (ISL_DEV_GEN(dev) >= 12) { + *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT | + ISL_TILING_Y0_BIT); + } else if (ISL_DEV_GEN(dev) >= 9) { + /* Note we let Yf even though it was cleared above. This is just for + * completeness. + */ + *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT | + ISL_TILING_Y0_BIT | ISL_TILING_Yf_BIT); + } else { + /* Before Skylake, the display engine does not accept Y */ + *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT); + } } if (info->samples > 1) { diff -Nru mesa-19.2.8/src/intel/isl/isl_gen8.c mesa-20.0.8/src/intel/isl/isl_gen8.c --- mesa-19.2.8/src/intel/isl/isl_gen8.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_gen8.c 2020-06-12 01:21:17.000000000 +0000 @@ -151,7 +151,13 @@ */ const uint32_t valign = 4; - bool needs_halign16 = false; + /* XXX(chadv): I believe the hardware requires each image to be + * cache-aligned. If that's true, then defaulting to halign=4 is wrong for + * many formats. Depending on the format's block size, we may need to + * increase halign to 8. + */ + uint32_t halign = 4; + if (!(info->usage & ISL_SURF_USAGE_DISABLE_AUX_BIT)) { /* From the Broadwell PRM, Volume 2d "Command Reference: Structures", * RENDER_SURFACE_STATE Surface Horizontal Alignment, p326: @@ -163,15 +169,22 @@ * or CCS_E. Depth buffers, including those that own an auxiliary HiZ * surface, are handled above and do not require HALIGN_16. */ - needs_halign16 = true; + assert(halign <= 16); + halign = 16; } - /* XXX(chadv): I believe the hardware requires each image to be - * cache-aligned. If that's true, then defaulting to halign=4 is wrong for - * many formats. Depending on the format's block size, we may need to - * increase halign to 8. - */ - const uint32_t halign = needs_halign16 ? 16 : 4; + if (ISL_DEV_GEN(dev) >= 11 && isl_tiling_is_any_y(tiling) && + fmtl->bpb == 32 && info->samples == 1) { + /* GEN_BUG_1406667188: Pixel Corruption in subspan combining (8x4 + * combining) scenarios if halign=4. + * + * See RENDER_SURFACE_STATE in Ice Lake h/w spec: + * + * "For surface format = 32 bpp, num_multisamples = 1 , MIpcount > 0 + * and surface walk = TiledY, HALIGN must be programmed to 8" + */ + halign = MAX(halign, 8); + } *image_align_el = isl_extent3d(halign, valign, 1); } diff -Nru mesa-19.2.8/src/intel/isl/isl_genX_priv.h mesa-20.0.8/src/intel/isl/isl_genX_priv.h --- mesa-19.2.8/src/intel/isl/isl_genX_priv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_genX_priv.h 2020-06-12 01:21:17.000000000 +0000 @@ -37,7 +37,7 @@ const struct isl_surf_fill_state_info *restrict info); void -isl_genX(buffer_fill_state_s)(void *state, +isl_genX(buffer_fill_state_s)(const struct isl_device *dev, void *state, const struct isl_buffer_fill_state_info *restrict info); void diff -Nru mesa-19.2.8/src/intel/isl/isl.h mesa-20.0.8/src/intel/isl/isl.h --- mesa-19.2.8/src/intel/isl/isl.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl.h 2020-06-12 01:21:17.000000000 +0000 @@ -167,6 +167,7 @@ ISL_FORMAT_B10G10R10A2_UNORM = 209, ISL_FORMAT_B10G10R10A2_UNORM_SRGB = 210, ISL_FORMAT_R11G11B10_FLOAT = 211, + ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM = 213, ISL_FORMAT_R32_SINT = 214, ISL_FORMAT_R32_UINT = 215, ISL_FORMAT_R32_FLOAT = 216, @@ -388,6 +389,11 @@ ISL_FORMAT_GEN9_CCS_32BPP, ISL_FORMAT_GEN9_CCS_64BPP, ISL_FORMAT_GEN9_CCS_128BPP, + ISL_FORMAT_GEN12_CCS_8BPP_Y0, + ISL_FORMAT_GEN12_CCS_16BPP_Y0, + ISL_FORMAT_GEN12_CCS_32BPP_Y0, + ISL_FORMAT_GEN12_CCS_64BPP_Y0, + ISL_FORMAT_GEN12_CCS_128BPP_Y0, /* An upper bound on the supported format enumerations */ ISL_NUM_FORMATS, @@ -464,6 +470,7 @@ ISL_TILING_Ys, /**< Standard 64K tiling. The 's' means "sixty-four". */ ISL_TILING_HIZ, /**< Tiling format for HiZ surfaces */ ISL_TILING_CCS, /**< Tiling format for CCS surfaces */ + ISL_TILING_GEN12_CCS, /**< Tiling format for Gen12 CCS surfaces */ }; /** @@ -479,6 +486,7 @@ #define ISL_TILING_Ys_BIT (1u << ISL_TILING_Ys) #define ISL_TILING_HIZ_BIT (1u << ISL_TILING_HIZ) #define ISL_TILING_CCS_BIT (1u << ISL_TILING_CCS) +#define ISL_TILING_GEN12_CCS_BIT (1u << ISL_TILING_GEN12_CCS) #define ISL_TILING_ANY_MASK (~0u) #define ISL_TILING_NON_LINEAR_MASK (~ISL_TILING_LINEAR_BIT) @@ -601,6 +609,21 @@ * @invariant isl_surf::samples == 1 */ ISL_AUX_USAGE_CCS_E, + + /** The auxiliary surface provides full lossless media color compression + * + * @invariant isl_surf::samples == 1 + */ + ISL_AUX_USAGE_MC, + + /** The auxiliary surface is a HiZ surface and CCS is also enabled */ + ISL_AUX_USAGE_HIZ_CCS, + + /** The auxiliary surface is an MCS and CCS is also enabled + * + * @invariant isl_surf::samples > 1 + */ + ISL_AUX_USAGE_MCS_CCS, }; /** @@ -992,6 +1015,11 @@ uint8_t stencil_offset; uint8_t hiz_offset; } ds; + + struct { + uint32_t internal; + uint32_t external; + } mocs; }; struct isl_extent2d { @@ -1422,6 +1450,11 @@ * The depth clear value */ float depth_clear_value; + + /** + * Track stencil aux usage for Gen >= 12 + */ + enum isl_aux_usage stencil_aux_usage; }; extern const struct isl_format_layout isl_format_layouts[]; @@ -1620,6 +1653,13 @@ isl_has_matching_typed_storage_image_format(const struct gen_device_info *devinfo, enum isl_format fmt); +static inline enum isl_tiling +isl_tiling_flag_to_enum(isl_tiling_flags_t flag) +{ + assert(__builtin_popcount(flag) == 1); + return (enum isl_tiling) (__builtin_ffs(flag) - 1); +} + static inline bool isl_tiling_is_any_y(enum isl_tiling tiling) { @@ -1638,6 +1678,44 @@ enum isl_tiling isl_tiling_from_i915_tiling(uint32_t tiling); +static inline bool +isl_aux_usage_has_hiz(enum isl_aux_usage usage) +{ + return usage == ISL_AUX_USAGE_HIZ || + usage == ISL_AUX_USAGE_HIZ_CCS; +} + +static inline bool +isl_aux_usage_has_mcs(enum isl_aux_usage usage) +{ + return usage == ISL_AUX_USAGE_MCS || + usage == ISL_AUX_USAGE_MCS_CCS; +} + +static inline bool +isl_aux_usage_has_ccs(enum isl_aux_usage usage) +{ + return usage == ISL_AUX_USAGE_CCS_D || + usage == ISL_AUX_USAGE_CCS_E || + usage == ISL_AUX_USAGE_MC || + usage == ISL_AUX_USAGE_HIZ_CCS || + usage == ISL_AUX_USAGE_MCS_CCS; +} + +static inline bool +isl_aux_state_has_valid_primary(enum isl_aux_state state) +{ + return state == ISL_AUX_STATE_RESOLVED || + state == ISL_AUX_STATE_PASS_THROUGH || + state == ISL_AUX_STATE_AUX_INVALID; +} + +static inline bool +isl_aux_state_has_valid_aux(enum isl_aux_state state) +{ + return state != ISL_AUX_STATE_AUX_INVALID; +} + const struct isl_drm_modifier_info * ATTRIBUTE_CONST isl_drm_modifier_get_info(uint64_t modifier); @@ -1818,7 +1896,8 @@ bool isl_surf_get_ccs_surf(const struct isl_device *dev, const struct isl_surf *surf, - struct isl_surf *ccs_surf, + struct isl_surf *aux_surf, + struct isl_surf *extra_aux_surf, uint32_t row_pitch_B /**< Ignored if 0 */); #define isl_surf_fill_state(dev, state, ...) \ @@ -2026,6 +2105,27 @@ uint32_t *y_offset_sa); /** + * Calculate the range in bytes occupied by a subimage, to the nearest tile. + * + * The range returned will be the smallest memory range in which the give + * subimage fits, rounded to even tiles. Intel images do not usually have a + * direct subimage -> range mapping so the range returned may contain data + * from other sub-images. The returned range is a half-open interval where + * all of the addresses within the subimage are < end_tile_B. + * + * @invariant level < surface levels + * @invariant logical_array_layer < logical array length of surface + * @invariant logical_z_offset_px < logical depth of surface at level + */ +void +isl_surf_get_image_range_B_tile(const struct isl_surf *surf, + uint32_t level, + uint32_t logical_array_layer, + uint32_t logical_z_offset_px, + uint32_t *start_tile_B, + uint32_t *end_tile_B); + +/** * Create an isl_surf that represents a particular subimage in the surface. * * The newly created surface will have a single miplevel and array slice. The @@ -2108,6 +2208,14 @@ const struct isl_surf *surf); /** + * @brief determines if a surface supports writing through HIZ to the CCS. + */ +bool +isl_surf_supports_hiz_ccs_wt(const struct gen_device_info *dev, + const struct isl_surf *surf, + enum isl_aux_usage aux_usage); + +/** * @brief performs a copy from linear to tiled surface * */ diff -Nru mesa-19.2.8/src/intel/isl/isl_priv.h mesa-20.0.8/src/intel/isl/isl_priv.h --- mesa-19.2.8/src/intel/isl/isl_priv.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_priv.h 2020-06-12 01:21:17.000000000 +0000 @@ -232,6 +232,9 @@ # define genX(x) gen11_##x # include "isl_genX_priv.h" # undef genX +# define genX(x) gen12_##x +# include "isl_genX_priv.h" +# undef genX #endif #endif /* ISL_PRIV_H */ diff -Nru mesa-19.2.8/src/intel/isl/isl_surface_state.c mesa-20.0.8/src/intel/isl/isl_surface_state.c --- mesa-19.2.8/src/intel/isl/isl_surface_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/isl_surface_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -72,7 +72,9 @@ [ISL_TILING_Y0] = YMAJOR, [ISL_TILING_Yf] = YMAJOR, [ISL_TILING_Ys] = YMAJOR, +#if GEN_GEN <= 11 [ISL_TILING_W] = WMAJOR, +#endif }; #endif @@ -84,7 +86,14 @@ }; #endif -#if GEN_GEN >= 9 +#if GEN_GEN >= 12 +static const uint32_t isl_to_gen_aux_mode[] = { + [ISL_AUX_USAGE_NONE] = AUX_NONE, + [ISL_AUX_USAGE_MCS] = AUX_CCS_E, + [ISL_AUX_USAGE_CCS_E] = AUX_CCS_E, + [ISL_AUX_USAGE_MCS_CCS] = AUX_MCS_LCE, +}; +#elif GEN_GEN >= 9 static const uint32_t isl_to_gen_aux_mode[] = { [ISL_AUX_USAGE_NONE] = AUX_NONE, [ISL_AUX_USAGE_HIZ] = AUX_HIZ, @@ -272,6 +281,11 @@ s.SurfaceFormat = info->view->format; +#if GEN_GEN >= 12 + s.DepthStencilResource = + isl_surf_usage_is_depth_or_stencil(info->surf->usage); +#endif + #if GEN_GEN <= 5 s.ColorBufferComponentWriteDisables = info->write_disables; #else @@ -388,7 +402,11 @@ unreachable("bad SurfaceType"); } -#if GEN_GEN >= 7 +#if GEN_GEN >= 12 + /* GEN:BUG:1806565034: Only set SurfaceArray if arrayed surface is > 1. */ + s.SurfaceArray = info->surf->dim != ISL_SURF_DIM_3D && + info->view->array_len > 1; +#elif GEN_GEN >= 7 s.SurfaceArray = info->surf->dim != ISL_SURF_DIM_3D; #endif @@ -441,6 +459,7 @@ #endif #if GEN_GEN >= 8 + assert(GEN_GEN < 12 || info->surf->tiling != ISL_TILING_W); s.TileMode = isl_to_gen_tiling[info->surf->tiling]; #else s.TiledSurface = info->surf->tiling != ISL_TILING_LINEAR, @@ -535,30 +554,41 @@ #endif #if GEN_GEN >= 7 - if (info->aux_surf && info->aux_usage != ISL_AUX_USAGE_NONE) { + if (info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Check valid aux usages per-gen */ + if (GEN_GEN >= 12) { + assert(info->aux_usage == ISL_AUX_USAGE_MCS || + info->aux_usage == ISL_AUX_USAGE_CCS_E || + info->aux_usage == ISL_AUX_USAGE_MCS_CCS); + } else if (GEN_GEN >= 9) { + assert(info->aux_usage == ISL_AUX_USAGE_HIZ || + info->aux_usage == ISL_AUX_USAGE_MCS || + info->aux_usage == ISL_AUX_USAGE_CCS_D || + info->aux_usage == ISL_AUX_USAGE_CCS_E); + } else if (GEN_GEN >= 8) { + assert(info->aux_usage == ISL_AUX_USAGE_HIZ || + info->aux_usage == ISL_AUX_USAGE_MCS || + info->aux_usage == ISL_AUX_USAGE_CCS_D); + } else if (GEN_GEN >= 7) { + assert(info->aux_usage == ISL_AUX_USAGE_MCS || + info->aux_usage == ISL_AUX_USAGE_CCS_D); + } + + if (GEN_GEN >= 12) { + /* We don't need an auxiliary surface for CCS on gen12+ */ + assert (info->aux_usage == ISL_AUX_USAGE_CCS_E || + info->aux_usage == ISL_AUX_USAGE_MC || info->aux_surf); + } else { + /* We must have an auxiliary surface */ + assert(info->aux_surf); + } + /* The docs don't appear to say anything whatsoever about compression * and the data port. Testing seems to indicate that the data port * completely ignores the AuxiliarySurfaceMode field. */ assert(!(info->view->usage & ISL_SURF_USAGE_STORAGE_BIT)); - struct isl_tile_info tile_info; - isl_surf_get_tile_info(info->aux_surf, &tile_info); - uint32_t pitch_in_tiles = - info->aux_surf->row_pitch_B / tile_info.phys_extent_B.width; - - s.AuxiliarySurfaceBaseAddress = info->aux_address; - s.AuxiliarySurfacePitch = pitch_in_tiles - 1; - -#if GEN_GEN >= 8 - assert(GEN_GEN >= 9 || info->aux_usage != ISL_AUX_USAGE_CCS_E); - /* Auxiliary surfaces in ISL have compressed formats but the hardware - * doesn't expect our definition of the compression, it expects qpitch - * in units of samples on the main surface. - */ - s.AuxiliarySurfaceQPitch = - isl_surf_get_array_pitch_sa_rows(info->aux_surf) >> 2; - if (info->aux_usage == ISL_AUX_USAGE_HIZ) { /* The number of samples must be 1 */ assert(info->surf->samples == 1); @@ -578,16 +608,43 @@ } } +#if GEN_GEN >= 8 s.AuxiliarySurfaceMode = isl_to_gen_aux_mode[info->aux_usage]; #else - assert(info->aux_usage == ISL_AUX_USAGE_MCS || - info->aux_usage == ISL_AUX_USAGE_CCS_D); s.MCSEnable = true; #endif } -#endif + + /* The auxiliary buffer info is filled when it's useable by the HW. On + * gen12 and above, CCS is controlled by the aux table and not the + * auxiliary surface information in SURFACE_STATE. + */ + if (info->aux_usage != ISL_AUX_USAGE_NONE && + ((info->aux_usage != ISL_AUX_USAGE_MC && + info->aux_usage != ISL_AUX_USAGE_CCS_E) || GEN_GEN <= 11)) { + + assert(info->aux_surf != NULL); + + struct isl_tile_info tile_info; + isl_surf_get_tile_info(info->aux_surf, &tile_info); + uint32_t pitch_in_tiles = + info->aux_surf->row_pitch_B / tile_info.phys_extent_B.width; + + s.AuxiliarySurfaceBaseAddress = info->aux_address; + s.AuxiliarySurfacePitch = pitch_in_tiles - 1; #if GEN_GEN >= 8 + /* Auxiliary surfaces in ISL have compressed formats but the hardware + * doesn't expect our definition of the compression, it expects qpitch + * in units of samples on the main surface. + */ + s.AuxiliarySurfaceQPitch = + isl_surf_get_array_pitch_sa_rows(info->aux_surf) >> 2; +#endif + } +#endif + +#if GEN_GEN >= 8 && GEN_GEN < 11 /* From the CHV PRM, Volume 2d, page 321 (RENDER_SURFACE_STATE dword 0 * bit 9 "Sampler L2 Bypass Mode Disable" Programming Notes): * @@ -648,7 +705,9 @@ } #endif -#if GEN_GEN >= 9 +#if GEN_GEN >= 12 + assert(info->use_clear_address); +#elif GEN_GEN >= 9 if (!info->use_clear_address) { s.RedClearColor = info->clear_color.u32[0]; s.GreenClearColor = info->clear_color.u32[1]; @@ -685,7 +744,7 @@ } void -isl_genX(buffer_fill_state_s)(void *state, +isl_genX(buffer_fill_state_s)(const struct isl_device *dev, void *state, const struct isl_buffer_fill_state_info *restrict info) { uint64_t buffer_size = info->size_B; @@ -751,6 +810,25 @@ s.Depth = ((num_elements - 1) >> 20) & 0x7f; #endif + if (GEN_GEN == 12 && dev->info->revision == 0) { + /* TGL-LP A0 has a HW bug (fixed in later HW) which causes buffer + * textures with very close base addresses (delta < 64B) to corrupt each + * other. We can sort-of work around this by making small buffer + * textures 1D textures instead. This doesn't fix the problem for large + * buffer textures but the liklihood of large, overlapping, and very + * close buffer textures is fairly low and the point is to hack around + * the bug so we can run apps and tests. + */ + if (info->format != ISL_FORMAT_RAW && + info->stride_B == isl_format_get_layout(info->format)->bpb / 8 && + num_elements <= (1 << 14)) { + s.SurfaceType = SURFTYPE_1D; + s.Width = num_elements - 1; + s.Height = 0; + s.Depth = 0; + } + } + s.SurfacePitch = info->stride_B - 1; #if GEN_GEN >= 6 @@ -792,11 +870,11 @@ /* We previously had this format set to B8G8R8A8_UNORM but ran into * hangs on IVB. R32_UINT seems to work for everybody. * - * https://gitlab.freedesktop.org/mesa/mesa/issues/1872 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1872 */ .SurfaceFormat = ISL_FORMAT_R32_UINT, #if GEN_GEN >= 7 - .SurfaceArray = size.depth > 0, + .SurfaceArray = size.depth > 1, #endif #if GEN_GEN >= 8 .TileMode = YMAJOR, @@ -804,6 +882,19 @@ .TiledSurface = true, .TileWalk = TILEWALK_YMAJOR, #endif +#if GEN_GEN == 7 + /* According to PRMs: "Volume 4 Part 1: Subsystem and Cores – Shared + * Functions" + * + * RENDER_SURFACE_STATE::Surface Vertical Alignment + * + * "This field must be set to VALIGN_4 for all tiled Y Render Target + * surfaces." + * + * Affect IVB, HSW. + */ + .SurfaceVerticalAlignment = VALIGN_4, +#endif .Width = size.width - 1, .Height = size.height - 1, .Depth = size.depth - 1, diff -Nru mesa-19.2.8/src/intel/isl/meson.build mesa-20.0.8/src/intel/isl/meson.build --- mesa-19.2.8/src/intel/isl/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/isl/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -48,10 +48,16 @@ 'isl_gen9.h', ) +isl_gen12_files = files( + 'isl_gen12.c', + 'isl_gen12.h', +) + isl_gen_libs = [] foreach g : [['40', isl_gen4_files], ['50', []], ['60', isl_gen6_files], ['70', isl_gen7_files], ['75', []], ['80', isl_gen8_files], - ['90', isl_gen9_files], ['100', []], ['110', []]] + ['90', isl_gen9_files], ['100', []], ['110', []], + ['120', isl_gen12_files]] _gen = g[0] isl_gen_libs += static_library( 'isl_gen@0@'.format(_gen), diff -Nru mesa-19.2.8/src/intel/Makefile.sources mesa-20.0.8/src/intel/Makefile.sources --- mesa-19.2.8/src/intel/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/Makefile.sources 2020-06-12 01:21:17.000000000 +0000 @@ -8,6 +8,9 @@ blorp/blorp_priv.h COMMON_FILES = \ + common/gen_aux_map.c \ + common/gen_aux_map.h \ + common/gen_buffer_alloc.h \ common/gen_clflush.h \ common/gen_batch_decoder.c \ common/gen_decoder.c \ @@ -43,7 +46,7 @@ compiler/brw_disasm.c \ compiler/brw_disasm_info.c \ compiler/brw_disasm_info.h \ - compiler/brw_eu.c \ + compiler/brw_eu.cpp \ compiler/brw_eu_compact.c \ compiler/brw_eu_defines.h \ compiler/brw_eu_emit.c \ @@ -68,6 +71,7 @@ compiler/brw_fs_reg_allocate.cpp \ compiler/brw_fs_register_coalesce.cpp \ compiler/brw_fs_saturate_propagation.cpp \ + compiler/brw_fs_scoreboard.cpp \ compiler/brw_fs_sel_peephole.cpp \ compiler/brw_fs_validate.cpp \ compiler/brw_fs_visitor.cpp \ @@ -80,7 +84,9 @@ compiler/brw_nir.c \ compiler/brw_nir_analyze_boolean_resolves.c \ compiler/brw_nir_analyze_ubo_ranges.c \ + compiler/brw_nir_clamp_image_1d_2d_array_sizes.c \ compiler/brw_nir_attribute_workarounds.c \ + compiler/brw_nir_lower_alpha_to_coverage.c \ compiler/brw_nir_lower_conversions.c \ compiler/brw_nir_lower_cs_intrinsics.c \ compiler/brw_nir_lower_image_load_store.c \ @@ -143,7 +149,8 @@ genxml/gen8.xml \ genxml/gen9.xml \ genxml/gen10.xml \ - genxml/gen11.xml + genxml/gen11.xml \ + genxml/gen12.xml GENXML_GENERATED_PACK_FILES = \ genxml/gen4_pack.h \ @@ -155,7 +162,8 @@ genxml/gen8_pack.h \ genxml/gen9_pack.h \ genxml/gen10_pack.h \ - genxml/gen11_pack.h + genxml/gen11_pack.h \ + genxml/gen12_pack.h GENXML_GENERATED_FILES = \ $(GENXML_GENERATED_PACK_FILES) \ @@ -217,6 +225,12 @@ isl/isl_emit_depth_stencil.c \ isl/isl_surface_state.c +ISL_GEN12_FILES = \ + isl/isl_gen12.c \ + isl/isl_gen12.h \ + isl/isl_emit_depth_stencil.c \ + isl/isl_surface_state.c + ISL_GENERATED_FILES = \ isl/isl_format_layout.c @@ -245,10 +259,11 @@ vulkan/anv_nir.h \ vulkan/anv_nir_add_base_work_group_id.c \ vulkan/anv_nir_apply_pipeline_layout.c \ + vulkan/anv_nir_compute_push_layout.c \ vulkan/anv_nir_lower_multiview.c \ - vulkan/anv_nir_lower_push_constants.c \ vulkan/anv_nir_lower_ycbcr_textures.c \ vulkan/anv_pass.c \ + vulkan/anv_perf.c \ vulkan/anv_pipeline.c \ vulkan/anv_pipeline_cache.c \ vulkan/anv_private.h \ @@ -316,6 +331,10 @@ vulkan/gen8_cmd_buffer.c \ $(VULKAN_GENX_FILES) +VULKAN_GEN12_FILES := \ + vulkan/gen8_cmd_buffer.c \ + $(VULKAN_GENX_FILES) + GEN_PERF_XML_FILES = \ perf/oa-hsw.xml \ perf/oa-bdw.xml \ @@ -330,13 +349,18 @@ perf/oa-cflgt2.xml \ perf/oa-cflgt3.xml \ perf/oa-cnl.xml \ - perf/oa-icl.xml + perf/oa-icl.xml \ + perf/oa-lkf.xml \ + perf/oa-tgl.xml GEN_PERF_FILES = \ perf/gen_perf.c \ perf/gen_perf.h \ + perf/gen_perf_mdapi.c \ perf/gen_perf_mdapi.h \ - perf/gen_perf_mdapi.c + perf/gen_perf_private.h \ + perf/gen_perf_query.h \ + perf/gen_perf_query.c GEN_PERF_GENERATED_FILES = \ perf/gen_perf_metrics.c \ diff -Nru mesa-19.2.8/src/intel/meson.build mesa-20.0.8/src/intel/meson.build --- mesa-19.2.8/src/intel/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ subdir('common') subdir('compiler') subdir('perf') -if with_tools.contains('intel') or with_tools.contains('intel-ui') +if with_intel_tools subdir('tools') endif if with_intel_vk diff -Nru mesa-19.2.8/src/intel/perf/gen_perf.c mesa-20.0.8/src/intel/perf/gen_perf.c --- mesa-19.2.8/src/intel/perf/gen_perf.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,363 +29,48 @@ #include #include +#ifndef HAVE_DIRENT_D_TYPE +#include // PATH_MAX +#endif + #include #include "common/gen_gem.h" -#include "gen_perf.h" -#include "perf/gen_perf_mdapi.h" -#include "perf/gen_perf_metrics.h" #include "dev/gen_debug.h" #include "dev/gen_device_info.h" + +#include "perf/gen_perf.h" +#include "perf/gen_perf_regs.h" +#include "perf/gen_perf_mdapi.h" +#include "perf/gen_perf_metrics.h" +#include "perf/gen_perf_private.h" + #include "util/bitscan.h" +#include "util/mesa-sha1.h" #include "util/u_math.h" #define FILE_DEBUG_FLAG DEBUG_PERFMON -#define MI_RPC_BO_SIZE 4096 -#define MI_FREQ_START_OFFSET_BYTES (3072) -#define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2) -#define MI_FREQ_END_OFFSET_BYTES (3076) - -#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) - -#define GEN7_RPSTAT1 0xA01C -#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 -#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) -#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 -#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) - -#define GEN9_RPSTAT0 0xA01C -#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 -#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) -#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 -#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) - -#define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280 -#define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8) -#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 -#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8) - -#define MAP_READ (1 << 0) -#define MAP_WRITE (1 << 1) #define OA_REPORT_INVALID_CTX_ID (0xffffffff) -/** - * Periodic OA samples are read() into these buffer structures via the - * i915 perf kernel interface and appended to the - * perf_ctx->sample_buffers linked list. When we process the - * results of an OA metrics query we need to consider all the periodic - * samples between the Begin and End MI_REPORT_PERF_COUNT command - * markers. - * - * 'Periodic' is a simplification as there are other automatic reports - * written by the hardware also buffered here. - * - * Considering three queries, A, B and C: - * - * Time ----> - * ________________A_________________ - * | | - * | ________B_________ _____C___________ - * | | | | | | - * - * And an illustration of sample buffers read over this time frame: - * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ] - * - * These nodes may hold samples for query A: - * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ] - * - * These nodes may hold samples for query B: - * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ] - * - * These nodes may hold samples for query C: - * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ] - * - * The illustration assumes we have an even distribution of periodic - * samples so all nodes have the same size plotted against time: - * - * Note, to simplify code, the list is never empty. - * - * With overlapping queries we can see that periodic OA reports may - * relate to multiple queries and care needs to be take to keep - * track of sample buffers until there are no queries that might - * depend on their contents. - * - * We use a node ref counting system where a reference ensures that a - * node and all following nodes can't be freed/recycled until the - * reference drops to zero. - * - * E.g. with a ref of one here: - * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] - * - * These nodes could be freed or recycled ("reaped"): - * [ 0 ][ 0 ] - * - * These must be preserved until the leading ref drops to zero: - * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] - * - * When a query starts we take a reference on the current tail of - * the list, knowing that no already-buffered samples can possibly - * relate to the newly-started query. A pointer to this node is - * also saved in the query object's ->oa.samples_head. - * - * E.g. starting query A while there are two nodes in .sample_buffers: - * ________________A________ - * | - * - * [ 0 ][ 1 ] - * ^_______ Add a reference and store pointer to node in - * A->oa.samples_head - * - * Moving forward to when the B query starts with no new buffer nodes: - * (for reference, i915 perf reads() are only done when queries finish) - * ________________A_______ - * | ________B___ - * | | - * - * [ 0 ][ 2 ] - * ^_______ Add a reference and store pointer to - * node in B->oa.samples_head - * - * Once a query is finished, after an OA query has become 'Ready', - * once the End OA report has landed and after we we have processed - * all the intermediate periodic samples then we drop the - * ->oa.samples_head reference we took at the start. - * - * So when the B query has finished we have: - * ________________A________ - * | ______B___________ - * | | | - * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ] - * ^_______ Drop B->oa.samples_head reference - * - * We still can't free these due to the A->oa.samples_head ref: - * [ 1 ][ 0 ][ 0 ][ 0 ] - * - * When the A query finishes: (note there's a new ref for C's samples_head) - * ________________A_________________ - * | | - * | _____C_________ - * | | | - * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ] - * ^_______ Drop A->oa.samples_head reference - * - * And we can now reap these nodes up to the C->oa.samples_head: - * [ X ][ X ][ X ][ X ] - * keeping -> [ 1 ][ 0 ][ 0 ] - * - * We reap old sample buffers each time we finish processing an OA - * query by iterating the sample_buffers list from the head until we - * find a referenced node and stop. - * - * Reaped buffers move to a perfquery.free_sample_buffers list and - * when we come to read() we first look to recycle a buffer from the - * free_sample_buffers list before allocating a new buffer. - */ -struct oa_sample_buf { - struct exec_node link; - int refcount; - int len; - uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10]; - uint32_t last_timestamp; -}; - -/** - * gen representation of a performance query object. - * - * NB: We want to keep this structure relatively lean considering that - * applications may expect to allocate enough objects to be able to - * query around all draw calls in a frame. - */ -struct gen_perf_query_object -{ - const struct gen_perf_query_info *queryinfo; - - /* See query->kind to know which state below is in use... */ - union { - struct { - - /** - * BO containing OA counter snapshots at query Begin/End time. - */ - void *bo; - - /** - * Address of mapped of @bo - */ - void *map; - - /** - * The MI_REPORT_PERF_COUNT command lets us specify a unique - * ID that will be reflected in the resulting OA report - * that's written by the GPU. This is the ID we're expecting - * in the begin report and the the end report should be - * @begin_report_id + 1. - */ - int begin_report_id; - - /** - * Reference the head of the brw->perfquery.sample_buffers - * list at the time that the query started (so we only need - * to look at nodes after this point when looking for samples - * related to this query) - * - * (See struct brw_oa_sample_buf description for more details) - */ - struct exec_node *samples_head; - - /** - * false while in the unaccumulated_elements list, and set to - * true when the final, end MI_RPC snapshot has been - * accumulated. - */ - bool results_accumulated; - - /** - * Frequency of the GT at begin and end of the query. - */ - uint64_t gt_frequency[2]; - - /** - * Accumulated OA results between begin and end of the query. - */ - struct gen_perf_query_result result; - } oa; - - struct { - /** - * BO containing starting and ending snapshots for the - * statistics counters. - */ - void *bo; - } pipeline_stats; - }; -}; - -struct gen_perf_context { - struct gen_perf_config *perf; - - void * ctx; /* driver context (eg, brw_context) */ - void * bufmgr; - const struct gen_device_info *devinfo; - - uint32_t hw_ctx; - int drm_fd; - - /* The i915 perf stream we open to setup + enable the OA counters */ - int oa_stream_fd; - - /* An i915 perf stream fd gives exclusive access to the OA unit that will - * report counter snapshots for a specific counter set/profile in a - * specific layout/format so we can only start OA queries that are - * compatible with the currently open fd... - */ - int current_oa_metrics_set_id; - int current_oa_format; - - /* List of buffers containing OA reports */ - struct exec_list sample_buffers; - - /* Cached list of empty sample buffers */ - struct exec_list free_sample_buffers; - - int n_active_oa_queries; - int n_active_pipeline_stats_queries; - - /* The number of queries depending on running OA counters which - * extends beyond brw_end_perf_query() since we need to wait until - * the last MI_RPC command has parsed by the GPU. - * - * Accurate accounting is important here as emitting an - * MI_REPORT_PERF_COUNT command while the OA unit is disabled will - * effectively hang the gpu. - */ - int n_oa_users; - - /* To help catch an spurious problem with the hardware or perf - * forwarding samples, we emit each MI_REPORT_PERF_COUNT command - * with a unique ID that we can explicitly check for... - */ - int next_query_start_report_id; - - /** - * An array of queries whose results haven't yet been assembled - * based on the data in buffer objects. - * - * These may be active, or have already ended. However, the - * results have not been requested. - */ - struct gen_perf_query_object **unaccumulated; - int unaccumulated_elements; - int unaccumulated_array_size; - - /* The total number of query objects so we can relinquish - * our exclusive access to perf if the application deletes - * all of its objects. (NB: We only disable perf while - * there are no active queries) - */ - int n_query_instances; -}; - -const struct gen_perf_query_info* -gen_perf_query_info(const struct gen_perf_query_object *query) -{ - return query->queryinfo; -} - -struct gen_perf_context * -gen_perf_new_context(void *parent) -{ - struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); - if (! ctx) - fprintf(stderr, "%s: failed to alloc context\n", __func__); - return ctx; -} - -struct gen_perf_config * -gen_perf_config(struct gen_perf_context *ctx) +static inline uint64_t to_user_pointer(void *ptr) { - return ctx->perf; -} - -struct gen_perf_query_object * -gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) -{ - const struct gen_perf_query_info *query = - &perf_ctx->perf->queries[query_index]; - struct gen_perf_query_object *obj = - calloc(1, sizeof(struct gen_perf_query_object)); - - if (!obj) - return NULL; - - obj->queryinfo = query; - - perf_ctx->n_query_instances++; - return obj; + return (uintptr_t) ptr; } -int -gen_perf_active_queries(struct gen_perf_context *perf_ctx, - const struct gen_perf_query_info *query) +static bool +is_dir_or_link(const struct dirent *entry, const char *parent_dir) { - assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); - - switch (query->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - return perf_ctx->n_active_oa_queries; - break; - - case GEN_PERF_QUERY_TYPE_PIPELINE: - return perf_ctx->n_active_pipeline_stats_queries; - break; - - default: - unreachable("Unknown query type"); - break; - } +#ifdef HAVE_DIRENT_D_TYPE + return entry->d_type == DT_DIR || entry->d_type == DT_LNK; +#else + struct stat st; + char path[PATH_MAX + 1]; + snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name); + lstat(path, &st); + return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode); +#endif } static bool @@ -427,8 +112,7 @@ } while ((drm_entry = readdir(drmdir))) { - if ((drm_entry->d_type == DT_DIR || - drm_entry->d_type == DT_LNK) && + if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) && strncmp(drm_entry->d_name, "card", 4) == 0) { len = snprintf(perf->sysfs_dev_dir, @@ -489,32 +173,13 @@ return read_file_uint64(buf, value); } -static inline struct gen_perf_query_info * -append_query_info(struct gen_perf_config *perf, int max_counters) -{ - struct gen_perf_query_info *query; - - perf->queries = reralloc(perf, perf->queries, - struct gen_perf_query_info, - ++perf->n_queries); - query = &perf->queries[perf->n_queries - 1]; - memset(query, 0, sizeof(*query)); - - if (max_counters > 0) { - query->max_counters = max_counters; - query->counters = - rzalloc_array(perf, struct gen_perf_query_counter, max_counters); - } - - return query; -} - static void register_oa_config(struct gen_perf_config *perf, const struct gen_perf_query_info *query, uint64_t config_id) { - struct gen_perf_query_info *registered_query = append_query_info(perf, 0); + struct gen_perf_query_info *registered_query = + gen_perf_append_query_info(perf, 0); *registered_query = *query; registered_query->oa_metrics_set_id = config_id; @@ -544,9 +209,7 @@ while ((metric_entry = readdir(metricsdir))) { struct hash_entry *entry; - - if ((metric_entry->d_type != DT_DIR && - metric_entry->d_type != DT_LNK) || + if (!is_dir_or_link(metric_entry, buf) || metric_entry->d_name[0] == '.') continue; @@ -555,15 +218,7 @@ metric_entry->d_name); if (entry) { uint64_t id; - - len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id", - perf->sysfs_dev_dir, metric_entry->d_name); - if (len < 0 || len >= sizeof(buf)) { - DBG("Failed to concatenate path to sysfs metric id file\n"); - continue; - } - - if (!read_file_uint64(buf, &id)) { + if (!gen_perf_load_metric_id(perf, metric_entry->d_name, &id)) { DBG("Failed to read metric set id from %s: %m", buf); continue; } @@ -585,48 +240,106 @@ &invalid_config_id) < 0 && errno == ENOENT; } +static int +i915_query_items(struct gen_perf_config *perf, int fd, + struct drm_i915_query_item *items, uint32_t n_items) +{ + struct drm_i915_query q = { + .num_items = n_items, + .items_ptr = to_user_pointer(items), + }; + return gen_ioctl(fd, DRM_IOCTL_I915_QUERY, &q); +} + +static bool +i915_query_perf_config_supported(struct gen_perf_config *perf, int fd) +{ + struct drm_i915_query_item item = { + .query_id = DRM_I915_QUERY_PERF_CONFIG, + .flags = DRM_I915_QUERY_PERF_CONFIG_LIST, + }; + + return i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0; +} + static bool -load_metric_id(struct gen_perf_config *perf, const char *guid, - uint64_t *metric_id) +i915_query_perf_config_data(struct gen_perf_config *perf, + int fd, const char *guid, + struct drm_i915_perf_oa_config *config) +{ + struct { + struct drm_i915_query_perf_config query; + struct drm_i915_perf_oa_config config; + } item_data; + struct drm_i915_query_item item = { + .query_id = DRM_I915_QUERY_PERF_CONFIG, + .flags = DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, + .data_ptr = to_user_pointer(&item_data), + .length = sizeof(item_data), + }; + + memset(&item_data, 0, sizeof(item_data)); + memcpy(item_data.query.uuid, guid, sizeof(item_data.query.uuid)); + memcpy(&item_data.config, config, sizeof(item_data.config)); + + if (!(i915_query_items(perf, fd, &item, 1) == 0 && item.length > 0)) + return false; + + memcpy(config, &item_data.config, sizeof(item_data.config)); + + return true; +} + +bool +gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, + const char *guid, + uint64_t *metric_id) { char config_path[280]; snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id", - perf->sysfs_dev_dir, guid); + perf_cfg->sysfs_dev_dir, guid); /* Don't recreate already loaded configs. */ return read_file_uint64(config_path, metric_id); } +static uint64_t +i915_add_config(struct gen_perf_config *perf, int fd, + const struct gen_perf_registers *config, + const char *guid) +{ + struct drm_i915_perf_oa_config i915_config = { 0, }; + + memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); + + i915_config.n_mux_regs = config->n_mux_regs; + i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs); + + i915_config.n_boolean_regs = config->n_b_counter_regs; + i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs); + + i915_config.n_flex_regs = config->n_flex_regs; + i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs); + + int ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config); + return ret > 0 ? ret : 0; +} + static void init_oa_configs(struct gen_perf_config *perf, int fd) { hash_table_foreach(perf->oa_metrics_table, entry) { const struct gen_perf_query_info *query = entry->data; - struct drm_i915_perf_oa_config config; uint64_t config_id; - int ret; - if (load_metric_id(perf, query->guid, &config_id)) { + if (gen_perf_load_metric_id(perf, query->guid, &config_id)) { DBG("metric set: %s (already loaded)\n", query->guid); register_oa_config(perf, query, config_id); continue; } - memset(&config, 0, sizeof(config)); - - memcpy(config.uuid, query->guid, sizeof(config.uuid)); - - config.n_mux_regs = query->n_mux_regs; - config.mux_regs_ptr = (uintptr_t) query->mux_regs; - - config.n_boolean_regs = query->n_b_counter_regs; - config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs; - - config.n_flex_regs = query->n_flex_regs; - config.flex_regs_ptr = (uintptr_t) query->flex_regs; - - ret = gen_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config); + int ret = i915_add_config(perf, fd, &query->config, query->guid); if (ret < 0) { DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n", query->name, query->guid, strerror(errno)); @@ -732,127 +445,120 @@ } if (devinfo->is_cannonlake) return gen_oa_register_queries_cnl; - if (devinfo->gen == 11) + if (devinfo->gen == 11) { + if (devinfo->is_elkhartlake) + return gen_oa_register_queries_lkf; return gen_oa_register_queries_icl; + } + if (devinfo->gen == 12) + return gen_oa_register_queries_tgl; return NULL; } -static inline void -add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, - uint32_t numerator, uint32_t denominator, - const char *name, const char *description) -{ - struct gen_perf_query_counter *counter; - - assert(query->n_counters < query->max_counters); - - counter = &query->counters[query->n_counters]; - counter->name = name; - counter->desc = description; - counter->type = GEN_PERF_COUNTER_TYPE_RAW; - counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; - counter->offset = sizeof(uint64_t) * query->n_counters; - counter->pipeline_stat.reg = reg; - counter->pipeline_stat.numerator = numerator; - counter->pipeline_stat.denominator = denominator; - - query->n_counters++; -} - -static inline void -add_basic_stat_reg(struct gen_perf_query_info *query, - uint32_t reg, const char *name) -{ - add_stat_reg(query, reg, 1, 1, name, name); -} - static void load_pipeline_statistic_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) + const struct gen_device_info *devinfo) { struct gen_perf_query_info *query = - append_query_info(perf_cfg, MAX_STAT_COUNTERS); + gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; query->name = "Pipeline Statistics Registers"; - add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); + gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); if (devinfo->gen == 6) { - add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, - "SO_PRIM_STORAGE_NEEDED", - "N geometry shader stream-out primitives (total)"); - add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, - "SO_NUM_PRIMS_WRITTEN", - "N geometry shader stream-out primitives (written)"); + gen_perf_query_add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1, + "SO_PRIM_STORAGE_NEEDED", + "N geometry shader stream-out primitives (total)"); + gen_perf_query_add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1, + "SO_NUM_PRIMS_WRITTEN", + "N geometry shader stream-out primitives (written)"); } else { - add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 0)", - "N stream-out (stream 0) primitives (total)"); - add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 1)", - "N stream-out (stream 1) primitives (total)"); - add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 2)", - "N stream-out (stream 2) primitives (total)"); - add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, - "SO_PRIM_STORAGE_NEEDED (Stream 3)", - "N stream-out (stream 3) primitives (total)"); - add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 0)", - "N stream-out (stream 0) primitives (written)"); - add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 1)", - "N stream-out (stream 1) primitives (written)"); - add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 2)", - "N stream-out (stream 2) primitives (written)"); - add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, - "SO_NUM_PRIMS_WRITTEN (Stream 3)", - "N stream-out (stream 3) primitives (written)"); - } - - add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - - add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - - add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); + gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 0)", + "N stream-out (stream 0) primitives (total)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 1)", + "N stream-out (stream 1) primitives (total)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 2)", + "N stream-out (stream 2) primitives (total)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, + "SO_PRIM_STORAGE_NEEDED (Stream 3)", + "N stream-out (stream 3) primitives (total)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 0)", + "N stream-out (stream 0) primitives (written)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 1)", + "N stream-out (stream 1) primitives (written)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 2)", + "N stream-out (stream 2) primitives (written)"); + gen_perf_query_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, + "SO_NUM_PRIMS_WRITTEN (Stream 3)", + "N stream-out (stream 3) primitives (written)"); + } + + gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + + gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + + gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); if (devinfo->is_haswell || devinfo->gen == 8) { - add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); + gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); } else { - add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); + gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); } - add_basic_stat_reg(query, PS_DEPTH_COUNT, - "N z-pass fragments"); + gen_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT, + "N z-pass fragments"); if (devinfo->gen >= 7) { - add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); + gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); } query->data_size = sizeof(uint64_t) * query->n_counters; } +static int +i915_perf_version(int drm_fd) +{ + int tmp; + drm_i915_getparam_t gp = { + .param = I915_PARAM_PERF_REVISION, + .value = &tmp, + }; + + int ret = gen_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp); + + /* Return 0 if this getparam is not supported, the first version supported + * is 1. + */ + return ret < 0 ? 0 : tmp; +} + static bool load_oa_metrics(struct gen_perf_config *perf, int fd, const struct gen_device_info *devinfo) @@ -861,6 +567,9 @@ bool i915_perf_oa_available = false; struct stat sb; + perf->i915_query_supported = i915_query_perf_config_supported(perf, fd); + perf->i915_perf_version = i915_perf_version(fd); + /* The existence of this sysctl parameter implies the kernel supports * the i915 perf interface. */ @@ -888,7 +597,7 @@ return false; perf->oa_metrics_table = - _mesa_hash_table_create(perf, _mesa_key_hash_string, + _mesa_hash_table_create(perf, _mesa_hash_string, _mesa_key_string_equal); /* Index all the metric sets mesa knows about before looking to see what @@ -905,6 +614,87 @@ return true; } +struct gen_perf_registers * +gen_perf_load_configuration(struct gen_perf_config *perf_cfg, int fd, const char *guid) +{ + if (!perf_cfg->i915_query_supported) + return NULL; + + struct drm_i915_perf_oa_config i915_config = { 0, }; + if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) + return NULL; + + struct gen_perf_registers *config = rzalloc(NULL, struct gen_perf_registers); + config->n_flex_regs = i915_config.n_flex_regs; + config->flex_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_flex_regs); + config->n_mux_regs = i915_config.n_mux_regs; + config->mux_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_mux_regs); + config->n_b_counter_regs = i915_config.n_boolean_regs; + config->b_counter_regs = rzalloc_array(config, struct gen_perf_query_register_prog, config->n_b_counter_regs); + + /* + * struct gen_perf_query_register_prog maps exactly to the tuple of + * (register offset, register value) returned by the i915. + */ + i915_config.flex_regs_ptr = to_user_pointer(config->flex_regs); + i915_config.mux_regs_ptr = to_user_pointer(config->mux_regs); + i915_config.boolean_regs_ptr = to_user_pointer(config->b_counter_regs); + if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { + ralloc_free(config); + return NULL; + } + + return config; +} + +uint64_t +gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, + const struct gen_perf_registers *config, + const char *guid) +{ + if (guid) + return i915_add_config(perf_cfg, fd, config, guid); + + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + + if (config->flex_regs) { + _mesa_sha1_update(&sha1_ctx, config->flex_regs, + sizeof(config->flex_regs[0]) * + config->n_flex_regs); + } + if (config->mux_regs) { + _mesa_sha1_update(&sha1_ctx, config->mux_regs, + sizeof(config->mux_regs[0]) * + config->n_mux_regs); + } + if (config->b_counter_regs) { + _mesa_sha1_update(&sha1_ctx, config->b_counter_regs, + sizeof(config->b_counter_regs[0]) * + config->n_b_counter_regs); + } + + uint8_t hash[20]; + _mesa_sha1_final(&sha1_ctx, hash); + + char formatted_hash[41]; + _mesa_sha1_format(formatted_hash, hash); + + char generated_guid[37]; + snprintf(generated_guid, sizeof(generated_guid), + "%.8s-%.4s-%.4s-%.4s-%.12s", + &formatted_hash[0], &formatted_hash[8], + &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4], + &formatted_hash[8 + 4 + 4 + 4]); + + /* Check if already present. */ + uint64_t id; + if (gen_perf_load_metric_id(perf_cfg, generated_guid, &id)) + return id; + + return i915_add_config(perf_cfg, fd, config, generated_guid); +} + /* Accumulate 32bits OA counters */ static inline void accumulate_uint32(const uint32_t *report0, @@ -966,11 +756,11 @@ *unslice_freq_hz = unslice_freq * 16666667ULL; } -static void -query_result_read_frequencies(struct gen_perf_query_result *result, - const struct gen_device_info *devinfo, - const uint32_t *start, - const uint32_t *end) +void +gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, + const struct gen_device_info *devinfo, + const uint32_t *start, + const uint32_t *end) { /* Slice/Unslice frequency is only available in the OA reports when the * "Disable OA reports due to clock ratio change" field in @@ -991,17 +781,19 @@ &result->unslice_frequency[1]); } -static void -query_result_accumulate(struct gen_perf_query_result *result, - const struct gen_perf_query_info *query, - const uint32_t *start, - const uint32_t *end) +void +gen_perf_query_result_accumulate(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint32_t *start, + const uint32_t *end) { int i, idx = 0; if (result->hw_id == OA_REPORT_INVALID_CTX_ID && start[2] != OA_REPORT_INVALID_CTX_ID) result->hw_id = start[2]; + if (result->reports_accumulated == 0) + result->begin_timestamp = start[1]; result->reports_accumulated++; switch (query->oa_format) { @@ -1035,1458 +827,20 @@ } -static void -query_result_clear(struct gen_perf_query_result *result) +void +gen_perf_query_result_clear(struct gen_perf_query_result *result) { memset(result, 0, sizeof(*result)); result->hw_id = OA_REPORT_INVALID_CTX_ID; /* invalid */ } -static void -register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo) -{ - if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) - return; - - struct gen_perf_query_info *query = - append_query_info(perf_cfg, MAX_STAT_COUNTERS); - - query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; - query->name = "Intel_Raw_Pipeline_Statistics_Query"; - - /* The order has to match mdapi_pipeline_metrics. */ - add_basic_stat_reg(query, IA_VERTICES_COUNT, - "N vertices submitted"); - add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, - "N primitives submitted"); - add_basic_stat_reg(query, VS_INVOCATION_COUNT, - "N vertex shader invocations"); - add_basic_stat_reg(query, GS_INVOCATION_COUNT, - "N geometry shader invocations"); - add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, - "N geometry shader primitives emitted"); - add_basic_stat_reg(query, CL_INVOCATION_COUNT, - "N primitives entering clipping"); - add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, - "N primitives leaving clipping"); - if (devinfo->is_haswell || devinfo->gen == 8) { - add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, - "N fragment shader invocations", - "N fragment shader invocations"); - } else { - add_basic_stat_reg(query, PS_INVOCATION_COUNT, - "N fragment shader invocations"); - } - add_basic_stat_reg(query, HS_INVOCATION_COUNT, - "N TCS shader invocations"); - add_basic_stat_reg(query, DS_INVOCATION_COUNT, - "N TES shader invocations"); - if (devinfo->gen >= 7) { - add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "N compute shader invocations"); - } - - if (devinfo->gen >= 10) { - /* Reuse existing CS invocation register until we can expose this new - * one. - */ - add_basic_stat_reg(query, CS_INVOCATION_COUNT, - "Reserved1"); - } - - query->data_size = sizeof(uint64_t) * query->n_counters; -} - -static void -fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, - const char *name, - uint32_t data_offset, - uint32_t data_size, - enum gen_perf_counter_data_type data_type) -{ - struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; - - assert(query->n_counters <= query->max_counters); - - counter->name = name; - counter->desc = "Raw counter value"; - counter->type = GEN_PERF_COUNTER_TYPE_RAW; - counter->data_type = data_type; - counter->offset = data_offset; - - query->n_counters++; - - assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); -} - -#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ - fill_mdapi_perf_query_counter(query, #field_name, \ - (uint8_t *) &struct_name.field_name - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) -#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ - fill_mdapi_perf_query_counter(query, \ - ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ - (uint8_t *) &struct_name.field_name[idx] - \ - (uint8_t *) &struct_name, \ - sizeof(struct_name.field_name[0]), \ - GEN_PERF_COUNTER_DATA_TYPE_##type_name) - -static void -register_mdapi_oa_query(const struct gen_device_info *devinfo, - struct gen_perf_config *perf) -{ - struct gen_perf_query_info *query = NULL; - - /* MDAPI requires different structures for pretty much every generation - * (right now we have definitions for gen 7 to 11). - */ - if (!(devinfo->gen >= 7 && devinfo->gen <= 11)) - return; - - switch (devinfo->gen) { - case 7: { - query = append_query_info(perf, 1 + 45 + 16 + 7); - query->oa_format = I915_OA_FORMAT_A45_B8_C8; - - struct gen7_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, ACounters, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NOACounters, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 8: { - query = append_query_info(perf, 2 + 36 + 16 + 16); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gen8_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - break; - } - case 9: - case 10: - case 11: { - query = append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); - query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; - - struct gen9_mdapi_metrics metric_data; - query->data_size = sizeof(metric_data); - - MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); - for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, OaCntr, i, UINT64); - } - for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, NoaCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); - for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { - MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, - metric_data, UserCntr, i, UINT64); - } - MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); - MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); - break; - } - default: - unreachable("Unsupported gen"); - break; - } - - query->kind = GEN_PERF_QUERY_TYPE_RAW; - query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; - query->guid = GEN_PERF_QUERY_GUID_MDAPI; - - { - /* Accumulation buffer offsets copied from an actual query... */ - const struct gen_perf_query_info *copy_query = - &perf->queries[0]; - - query->gpu_time_offset = copy_query->gpu_time_offset; - query->gpu_clock_offset = copy_query->gpu_clock_offset; - query->a_offset = copy_query->a_offset; - query->b_offset = copy_query->b_offset; - query->c_offset = copy_query->c_offset; - } -} - -static uint64_t -get_metric_id(struct gen_perf_config *perf, - const struct gen_perf_query_info *query) -{ - /* These queries are know not to ever change, their config ID has been - * loaded upon the first query creation. No need to look them up again. - */ - if (query->kind == GEN_PERF_QUERY_TYPE_OA) - return query->oa_metrics_set_id; - - assert(query->kind == GEN_PERF_QUERY_TYPE_RAW); - - /* Raw queries can be reprogrammed up by an external application/library. - * When a raw query is used for the first time it's id is set to a value != - * 0. When it stops being used the id returns to 0. No need to reload the - * ID when it's already loaded. - */ - if (query->oa_metrics_set_id != 0) { - DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n", - query->name, query->guid, query->oa_metrics_set_id); - return query->oa_metrics_set_id; - } - - struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query; - if (!load_metric_id(perf, query->guid, - &raw_query->oa_metrics_set_id)) { - DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid); - raw_query->oa_metrics_set_id = 1ULL; - } else { - DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n", - query->name, query->guid, query->oa_metrics_set_id); - } - return query->oa_metrics_set_id; -} - -static struct oa_sample_buf * -get_free_sample_buf(struct gen_perf_context *perf_ctx) -{ - struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); - struct oa_sample_buf *buf; - - if (node) - buf = exec_node_data(struct oa_sample_buf, node, link); - else { - buf = ralloc_size(perf_ctx->perf, sizeof(*buf)); - - exec_node_init(&buf->link); - buf->refcount = 0; - } - buf->len = 0; - - return buf; -} - -static void -reap_old_sample_buffers(struct gen_perf_context *perf_ctx) -{ - struct exec_node *tail_node = - exec_list_get_tail(&perf_ctx->sample_buffers); - struct oa_sample_buf *tail_buf = - exec_node_data(struct oa_sample_buf, tail_node, link); - - /* Remove all old, unreferenced sample buffers walking forward from - * the head of the list, except always leave at least one node in - * the list so we always have a node to reference when we Begin - * a new query. - */ - foreach_list_typed_safe(struct oa_sample_buf, buf, link, - &perf_ctx->sample_buffers) - { - if (buf->refcount == 0 && buf != tail_buf) { - exec_node_remove(&buf->link); - exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link); - } else - return; - } -} - -static void -free_sample_bufs(struct gen_perf_context *perf_ctx) -{ - foreach_list_typed_safe(struct oa_sample_buf, buf, link, - &perf_ctx->free_sample_buffers) - ralloc_free(buf); - - exec_list_make_empty(&perf_ctx->free_sample_buffers); -} - -/******************************************************************************/ - -/** - * Emit MI_STORE_REGISTER_MEM commands to capture all of the - * pipeline statistics for the performance query object. - */ -static void -snapshot_statistics_registers(void *context, - struct gen_perf_config *perf, - struct gen_perf_query_object *obj, - uint32_t offset_in_bytes) -{ - const struct gen_perf_query_info *query = obj->queryinfo; - const int n_counters = query->n_counters; - - for (int i = 0; i < n_counters; i++) { - const struct gen_perf_query_counter *counter = &query->counters[i]; - - assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); - - perf->vtbl.store_register_mem64(context, obj->pipeline_stats.bo, - counter->pipeline_stat.reg, - offset_in_bytes + i * sizeof(uint64_t)); - } -} - -static void -gen_perf_close(struct gen_perf_context *perfquery, - const struct gen_perf_query_info *query) -{ - if (perfquery->oa_stream_fd != -1) { - close(perfquery->oa_stream_fd); - perfquery->oa_stream_fd = -1; - } - if (query->kind == GEN_PERF_QUERY_TYPE_RAW) { - struct gen_perf_query_info *raw_query = - (struct gen_perf_query_info *) query; - raw_query->oa_metrics_set_id = 0; - } -} - -static bool -gen_perf_open(struct gen_perf_context *perf_ctx, - int metrics_set_id, - int report_format, - int period_exponent, - int drm_fd, - uint32_t ctx_id) -{ - uint64_t properties[] = { - /* Single context sampling */ - DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id, - - /* Include OA reports in samples */ - DRM_I915_PERF_PROP_SAMPLE_OA, true, - - /* OA unit configuration */ - DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id, - DRM_I915_PERF_PROP_OA_FORMAT, report_format, - DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, - }; - struct drm_i915_perf_open_param param = { - .flags = I915_PERF_FLAG_FD_CLOEXEC | - I915_PERF_FLAG_FD_NONBLOCK | - I915_PERF_FLAG_DISABLED, - .num_properties = ARRAY_SIZE(properties) / 2, - .properties_ptr = (uintptr_t) properties, - }; - int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); - if (fd == -1) { - DBG("Error opening gen perf OA stream: %m\n"); - return false; - } - - perf_ctx->oa_stream_fd = fd; - - perf_ctx->current_oa_metrics_set_id = metrics_set_id; - perf_ctx->current_oa_format = report_format; - - return true; -} - -static bool -inc_n_users(struct gen_perf_context *perf_ctx) -{ - if (perf_ctx->n_oa_users == 0 && - gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) - { - return false; - } - ++perf_ctx->n_oa_users; - - return true; -} - -static void -dec_n_users(struct gen_perf_context *perf_ctx) -{ - /* Disabling the i915 perf stream will effectively disable the OA - * counters. Note it's important to be sure there are no outstanding - * MI_RPC commands at this point since they could stall the CS - * indefinitely once OACONTROL is disabled. - */ - --perf_ctx->n_oa_users; - if (perf_ctx->n_oa_users == 0 && - gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0) - { - DBG("WARNING: Error disabling gen perf stream: %m\n"); - } -} - -void -gen_perf_init_metrics(struct gen_perf_config *perf_cfg, - const struct gen_device_info *devinfo, - int drm_fd) +void +gen_perf_init_metrics(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo, + int drm_fd) { load_pipeline_statistic_metrics(perf_cfg, devinfo); - register_mdapi_statistic_query(perf_cfg, devinfo); + gen_perf_register_mdapi_statistic_query(perf_cfg, devinfo); if (load_oa_metrics(perf_cfg, drm_fd, devinfo)) - register_mdapi_oa_query(devinfo, perf_cfg); -} - -void -gen_perf_init_context(struct gen_perf_context *perf_ctx, - struct gen_perf_config *perf_cfg, - void * ctx, /* driver context (eg, brw_context) */ - void * bufmgr, /* eg brw_bufmgr */ - const struct gen_device_info *devinfo, - uint32_t hw_ctx, - int drm_fd) -{ - perf_ctx->perf = perf_cfg; - perf_ctx->ctx = ctx; - perf_ctx->bufmgr = bufmgr; - perf_ctx->drm_fd = drm_fd; - perf_ctx->hw_ctx = hw_ctx; - perf_ctx->devinfo = devinfo; - - perf_ctx->unaccumulated = - ralloc_array(ctx, struct gen_perf_query_object *, 2); - perf_ctx->unaccumulated_elements = 0; - perf_ctx->unaccumulated_array_size = 2; - - exec_list_make_empty(&perf_ctx->sample_buffers); - exec_list_make_empty(&perf_ctx->free_sample_buffers); - - /* It's convenient to guarantee that this linked list of sample - * buffers is never empty so we add an empty head so when we - * Begin an OA query we can always take a reference on a buffer - * in this list. - */ - struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); - exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); - - perf_ctx->oa_stream_fd = -1; - perf_ctx->next_query_start_report_id = 1000; -} - -/** - * Add a query to the global list of "unaccumulated queries." - * - * Queries are tracked here until all the associated OA reports have - * been accumulated via accumulate_oa_reports() after the end - * MI_REPORT_PERF_COUNT has landed in query->oa.bo. - */ -static void -add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *obj) -{ - if (perf_ctx->unaccumulated_elements >= - perf_ctx->unaccumulated_array_size) - { - perf_ctx->unaccumulated_array_size *= 1.5; - perf_ctx->unaccumulated = - reralloc(perf_ctx->ctx, perf_ctx->unaccumulated, - struct gen_perf_query_object *, - perf_ctx->unaccumulated_array_size); - } - - perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj; -} - -bool -gen_perf_begin_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - const struct gen_perf_query_info *queryinfo = query->queryinfo; - - /* XXX: We have to consider that the command parser unit that parses batch - * buffer commands and is used to capture begin/end counter snapshots isn't - * implicitly synchronized with what's currently running across other GPU - * units (such as the EUs running shaders) that the performance counters are - * associated with. - * - * The intention of performance queries is to measure the work associated - * with commands between the begin/end delimiters and so for that to be the - * case we need to explicitly synchronize the parsing of commands to capture - * Begin/End counter snapshots with what's running across other parts of the - * GPU. - * - * When the command parser reaches a Begin marker it effectively needs to - * drain everything currently running on the GPU until the hardware is idle - * before capturing the first snapshot of counters - otherwise the results - * would also be measuring the effects of earlier commands. - * - * When the command parser reaches an End marker it needs to stall until - * everything currently running on the GPU has finished before capturing the - * end snapshot - otherwise the results won't be a complete representation - * of the work. - * - * Theoretically there could be opportunities to minimize how much of the - * GPU pipeline is drained, or that we stall for, when we know what specific - * units the performance counters being queried relate to but we don't - * currently attempt to be clever here. - * - * Note: with our current simple approach here then for back-to-back queries - * we will redundantly emit duplicate commands to synchronize the command - * streamer with the rest of the GPU pipeline, but we assume that in HW the - * second synchronization is effectively a NOOP. - * - * N.B. The final results are based on deltas of counters between (inside) - * Begin/End markers so even though the total wall clock time of the - * workload is stretched by larger pipeline bubbles the bubbles themselves - * are generally invisible to the query results. Whether that's a good or a - * bad thing depends on the use case. For a lower real-time impact while - * capturing metrics then periodic sampling may be a better choice than - * INTEL_performance_query. - * - * - * This is our Begin synchronization point to drain current work on the - * GPU before we capture our first counter snapshot... - */ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); - - switch (queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: { - - /* Opening an i915 perf stream implies exclusive access to the OA unit - * which will generate counter reports for a specific counter set with a - * specific layout/format so we can't begin any OA based queries that - * require a different counter set or format unless we get an opportunity - * to close the stream and open a new one... - */ - uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); - - if (perf_ctx->oa_stream_fd != -1 && - perf_ctx->current_oa_metrics_set_id != metric_id) { - - if (perf_ctx->n_oa_users != 0) { - DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n", - perf_ctx->current_oa_metrics_set_id, metric_id); - return false; - } else - gen_perf_close(perf_ctx, queryinfo); - } - - /* If the OA counters aren't already on, enable them. */ - if (perf_ctx->oa_stream_fd == -1) { - const struct gen_device_info *devinfo = perf_ctx->devinfo; - - /* The period_exponent gives a sampling period as follows: - * sample_period = timestamp_period * 2^(period_exponent + 1) - * - * The timestamps increments every 80ns (HSW), ~52ns (GEN9LP) or - * ~83ns (GEN8/9). - * - * The counter overflow period is derived from the EuActive counter - * which reads a counter that increments by the number of clock - * cycles multiplied by the number of EUs. It can be calculated as: - * - * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2) - * - * (E.g. 40 EUs @ 1GHz = ~53ms) - * - * We select a sampling period inferior to that overflow period to - * ensure we cannot see more than 1 counter overflow, otherwise we - * could loose information. - */ - - int a_counter_in_bits = 32; - if (devinfo->gen >= 8) - a_counter_in_bits = 40; - - uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus * - /* drop 1GHz freq to have units in nanoseconds */ - 2); - - DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n", - overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus); - - int period_exponent = 0; - uint64_t prev_sample_period, next_sample_period; - for (int e = 0; e < 30; e++) { - prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency; - next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency; - - /* Take the previous sampling period, lower than the overflow - * period. - */ - if (prev_sample_period < overflow_period && - next_sample_period > overflow_period) - period_exponent = e + 1; - } - - if (period_exponent == 0) { - DBG("WARNING: enable to find a sampling exponent\n"); - return false; - } - - DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent, - prev_sample_period / 1000000ul); - - if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format, - period_exponent, perf_ctx->drm_fd, - perf_ctx->hw_ctx)) - return false; - } else { - assert(perf_ctx->current_oa_metrics_set_id == metric_id && - perf_ctx->current_oa_format == queryinfo->oa_format); - } - - if (!inc_n_users(perf_ctx)) { - DBG("WARNING: Error enabling i915 perf stream: %m\n"); - return false; - } - - if (query->oa.bo) { - perf_cfg->vtbl.bo_unreference(query->oa.bo); - query->oa.bo = NULL; - } - - query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, - "perf. query OA MI_RPC bo", - MI_RPC_BO_SIZE); -#ifdef DEBUG - /* Pre-filling the BO helps debug whether writes landed. */ - void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE); - memset(map, 0x80, MI_RPC_BO_SIZE); - perf_cfg->vtbl.bo_unmap(query->oa.bo); -#endif - - query->oa.begin_report_id = perf_ctx->next_query_start_report_id; - perf_ctx->next_query_start_report_id += 2; - - /* We flush the batchbuffer here to minimize the chances that MI_RPC - * delimiting commands end up in different batchbuffers. If that's the - * case, the measurement will include the time it takes for the kernel - * scheduler to load a new request into the hardware. This is manifested in - * tools like frameretrace by spikes in the "GPU Core Clocks" counter. - */ - perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); - - /* Take a starting OA counter snapshot. */ - perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0, - query->oa.begin_report_id); - perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, - MI_FREQ_START_OFFSET_BYTES); - - ++perf_ctx->n_active_oa_queries; - - /* No already-buffered samples can possibly be associated with this query - * so create a marker within the list of sample buffers enabling us to - * easily ignore earlier samples when processing this query after - * completion. - */ - assert(!exec_list_is_empty(&perf_ctx->sample_buffers)); - query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers); - - struct oa_sample_buf *buf = - exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); - - /* This reference will ensure that future/following sample - * buffers (that may relate to this query) can't be freed until - * this drops to zero. - */ - buf->refcount++; - - query_result_clear(&query->oa.result); - query->oa.results_accumulated = false; - - add_to_unaccumulated_query_list(perf_ctx, query); - break; - } - - case GEN_PERF_QUERY_TYPE_PIPELINE: - if (query->pipeline_stats.bo) { - perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); - query->pipeline_stats.bo = NULL; - } - - query->pipeline_stats.bo = - perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, - "perf. query pipeline stats bo", - STATS_BO_SIZE); - - /* Take starting snapshots. */ - snapshot_statistics_registers(perf_ctx->ctx , perf_cfg, query, 0); - - ++perf_ctx->n_active_pipeline_stats_queries; - break; - - default: - unreachable("Unknown query type"); - break; - } - - return true; -} - -void -gen_perf_end_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - - /* Ensure that the work associated with the queried commands will have - * finished before taking our query end counter readings. - * - * For more details see comment in brw_begin_perf_query for - * corresponding flush. - */ - perf_cfg->vtbl.emit_mi_flush(perf_ctx->ctx); - - switch (query->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - - /* NB: It's possible that the query will have already been marked - * as 'accumulated' if an error was seen while reading samples - * from perf. In this case we mustn't try and emit a closing - * MI_RPC command in case the OA unit has already been disabled - */ - if (!query->oa.results_accumulated) { - /* Take an ending OA counter snapshot. */ - perf_cfg->vtbl.capture_frequency_stat_register(perf_ctx->ctx, query->oa.bo, - MI_FREQ_END_OFFSET_BYTES); - perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, - MI_RPC_BO_END_OFFSET_BYTES, - query->oa.begin_report_id + 1); - } - - --perf_ctx->n_active_oa_queries; - - /* NB: even though the query has now ended, it can't be accumulated - * until the end MI_REPORT_PERF_COUNT snapshot has been written - * to query->oa.bo - */ - break; - - case GEN_PERF_QUERY_TYPE_PIPELINE: - snapshot_statistics_registers(perf_ctx->ctx, perf_cfg, query, - STATS_BO_END_OFFSET_BYTES); - --perf_ctx->n_active_pipeline_stats_queries; - break; - - default: - unreachable("Unknown query type"); - break; - } -} - -enum OaReadStatus { - OA_READ_STATUS_ERROR, - OA_READ_STATUS_UNFINISHED, - OA_READ_STATUS_FINISHED, -}; - -static enum OaReadStatus -read_oa_samples_until(struct gen_perf_context *perf_ctx, - uint32_t start_timestamp, - uint32_t end_timestamp) -{ - struct exec_node *tail_node = - exec_list_get_tail(&perf_ctx->sample_buffers); - struct oa_sample_buf *tail_buf = - exec_node_data(struct oa_sample_buf, tail_node, link); - uint32_t last_timestamp = - tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp; - - while (1) { - struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); - uint32_t offset; - int len; - - while ((len = read(perf_ctx->oa_stream_fd, buf->buf, - sizeof(buf->buf))) < 0 && errno == EINTR) - ; - - if (len <= 0) { - exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link); - - if (len < 0) { - if (errno == EAGAIN) { - return ((last_timestamp - start_timestamp) < INT32_MAX && - (last_timestamp - start_timestamp) >= - (end_timestamp - start_timestamp)) ? - OA_READ_STATUS_FINISHED : - OA_READ_STATUS_UNFINISHED; - } else { - DBG("Error reading i915 perf samples: %m\n"); - } - } else - DBG("Spurious EOF reading i915 perf samples\n"); - - return OA_READ_STATUS_ERROR; - } - - buf->len = len; - exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link); - - /* Go through the reports and update the last timestamp. */ - offset = 0; - while (offset < buf->len) { - const struct drm_i915_perf_record_header *header = - (const struct drm_i915_perf_record_header *) &buf->buf[offset]; - uint32_t *report = (uint32_t *) (header + 1); - - if (header->type == DRM_I915_PERF_RECORD_SAMPLE) - last_timestamp = report[1]; - - offset += header->size; - } - - buf->last_timestamp = last_timestamp; - } - - unreachable("not reached"); - return OA_READ_STATUS_ERROR; -} - -/** - * Try to read all the reports until either the delimiting timestamp - * or an error arises. - */ -static bool -read_oa_samples_for_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - void *current_batch) -{ - uint32_t *start; - uint32_t *last; - uint32_t *end; - struct gen_perf_config *perf_cfg = perf_ctx->perf; - - /* We need the MI_REPORT_PERF_COUNT to land before we can start - * accumulate. */ - assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && - !perf_cfg->vtbl.bo_busy(query->oa.bo)); - - /* Map the BO once here and let accumulate_oa_reports() unmap - * it. */ - if (query->oa.map == NULL) - query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ); - - start = last = query->oa.map; - end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; - - if (start[0] != query->oa.begin_report_id) { - DBG("Spurious start report id=%"PRIu32"\n", start[0]); - return true; - } - if (end[0] != (query->oa.begin_report_id + 1)) { - DBG("Spurious end report id=%"PRIu32"\n", end[0]); - return true; - } - - /* Read the reports until the end timestamp. */ - switch (read_oa_samples_until(perf_ctx, start[1], end[1])) { - case OA_READ_STATUS_ERROR: - /* Fallthrough and let accumulate_oa_reports() deal with the - * error. */ - case OA_READ_STATUS_FINISHED: - return true; - case OA_READ_STATUS_UNFINISHED: - return false; - } - - unreachable("invalid read status"); - return false; -} - -void -gen_perf_wait_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - void *current_batch) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - struct brw_bo *bo = NULL; - - switch (query->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - bo = query->oa.bo; - break; - - case GEN_PERF_QUERY_TYPE_PIPELINE: - bo = query->pipeline_stats.bo; - break; - - default: - unreachable("Unknown query type"); - break; - } - - if (bo == NULL) - return; - - /* If the current batch references our results bo then we need to - * flush first... - */ - if (perf_cfg->vtbl.batch_references(current_batch, bo)) - perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); - - perf_cfg->vtbl.bo_wait_rendering(bo); - - /* Due to a race condition between the OA unit signaling report - * availability and the report actually being written into memory, - * we need to wait for all the reports to come in before we can - * read them. - */ - if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA || - query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) { - while (!read_oa_samples_for_query(perf_ctx, query, current_batch)) - ; - } -} - -bool -gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - void *current_batch) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - - switch (query->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - return (query->oa.results_accumulated || - (query->oa.bo && - !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && - !perf_cfg->vtbl.bo_busy(query->oa.bo) && - read_oa_samples_for_query(perf_ctx, query, current_batch))); - case GEN_PERF_QUERY_TYPE_PIPELINE: - return (query->pipeline_stats.bo && - !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) && - !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo)); - - default: - unreachable("Unknown query type"); - break; - } - - return false; -} - -/** - * Remove a query from the global list of unaccumulated queries once - * after successfully accumulating the OA reports associated with the - * query in accumulate_oa_reports() or when discarding unwanted query - * results. - */ -static void -drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query) -{ - for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) { - if (perf_ctx->unaccumulated[i] == query) { - int last_elt = --perf_ctx->unaccumulated_elements; - - if (i == last_elt) - perf_ctx->unaccumulated[i] = NULL; - else { - perf_ctx->unaccumulated[i] = - perf_ctx->unaccumulated[last_elt]; - } - - break; - } - } - - /* Drop our samples_head reference so that associated periodic - * sample data buffers can potentially be reaped if they aren't - * referenced by any other queries... - */ - - struct oa_sample_buf *buf = - exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); - - assert(buf->refcount > 0); - buf->refcount--; - - query->oa.samples_head = NULL; - - reap_old_sample_buffers(perf_ctx); -} - -/* In general if we see anything spurious while accumulating results, - * we don't try and continue accumulating the current query, hoping - * for the best, we scrap anything outstanding, and then hope for the - * best with new queries. - */ -static void -discard_all_queries(struct gen_perf_context *perf_ctx) -{ - while (perf_ctx->unaccumulated_elements) { - struct gen_perf_query_object *query = perf_ctx->unaccumulated[0]; - - query->oa.results_accumulated = true; - drop_from_unaccumulated_query_list(perf_ctx, query); - - dec_n_users(perf_ctx); - } -} - -/* Looks for the validity bit of context ID (dword 2) of an OA report. */ -static bool -oa_report_ctx_id_valid(const struct gen_device_info *devinfo, - const uint32_t *report) -{ - assert(devinfo->gen >= 8); - if (devinfo->gen == 8) - return (report[0] & (1 << 25)) != 0; - return (report[0] & (1 << 16)) != 0; -} - -/** - * Accumulate raw OA counter values based on deltas between pairs of - * OA reports. - * - * Accumulation starts from the first report captured via - * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the - * last MI_RPC report requested by brw_end_perf_query(). Between these - * two reports there may also some number of periodically sampled OA - * reports collected via the i915 perf interface - depending on the - * duration of the query. - * - * These periodic snapshots help to ensure we handle counter overflow - * correctly by being frequent enough to ensure we don't miss multiple - * overflows of a counter between snapshots. For Gen8+ the i915 perf - * snapshots provide the extra context-switch reports that let us - * subtract out the progress of counters associated with other - * contexts running on the system. - */ -static void -accumulate_oa_reports(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query) -{ - const struct gen_device_info *devinfo = perf_ctx->devinfo; - uint32_t *start; - uint32_t *last; - uint32_t *end; - struct exec_node *first_samples_node; - bool last_report_ctx_match = true; - int out_duration = 0; - - assert(query->oa.map != NULL); - - start = last = query->oa.map; - end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; - - if (start[0] != query->oa.begin_report_id) { - DBG("Spurious start report id=%"PRIu32"\n", start[0]); - goto error; - } - if (end[0] != (query->oa.begin_report_id + 1)) { - DBG("Spurious end report id=%"PRIu32"\n", end[0]); - goto error; - } - - /* See if we have any periodic reports to accumulate too... */ - - /* N.B. The oa.samples_head was set when the query began and - * pointed to the tail of the perf_ctx->sample_buffers list at - * the time the query started. Since the buffer existed before the - * first MI_REPORT_PERF_COUNT command was emitted we therefore know - * that no data in this particular node's buffer can possibly be - * associated with the query - so skip ahead one... - */ - first_samples_node = query->oa.samples_head->next; - - foreach_list_typed_from(struct oa_sample_buf, buf, link, - &perf_ctx->sample_buffers, - first_samples_node) - { - int offset = 0; - - while (offset < buf->len) { - const struct drm_i915_perf_record_header *header = - (const struct drm_i915_perf_record_header *)(buf->buf + offset); - - assert(header->size != 0); - assert(header->size <= buf->len); - - offset += header->size; - - switch (header->type) { - case DRM_I915_PERF_RECORD_SAMPLE: { - uint32_t *report = (uint32_t *)(header + 1); - bool report_ctx_match = true; - bool add = true; - - /* Ignore reports that come before the start marker. - * (Note: takes care to allow overflow of 32bit timestamps) - */ - if (gen_device_info_timebase_scale(devinfo, - report[1] - start[1]) > 5000000000) { - continue; - } - - /* Ignore reports that come after the end marker. - * (Note: takes care to allow overflow of 32bit timestamps) - */ - if (gen_device_info_timebase_scale(devinfo, - report[1] - end[1]) <= 5000000000) { - goto end; - } - - /* For Gen8+ since the counters continue while other - * contexts are running we need to discount any unrelated - * deltas. The hardware automatically generates a report - * on context switch which gives us a new reference point - * to continuing adding deltas from. - * - * For Haswell we can rely on the HW to stop the progress - * of OA counters while any other context is acctive. - */ - if (devinfo->gen >= 8) { - /* Consider that the current report matches our context only if - * the report says the report ID is valid. - */ - report_ctx_match = oa_report_ctx_id_valid(devinfo, report) && - report[2] == start[2]; - if (report_ctx_match) - out_duration = 0; - else - out_duration++; - - /* Only add the delta between if the last report - * was clearly identified as our context, or if we have at most - * 1 report without a matching ID. - * - * The OA unit will sometimes label reports with an invalid - * context ID when i915 rewrites the execlist submit register - * with the same context as the one currently running. This - * happens when i915 wants to notify the HW of ringbuffer tail - * register update. We have to consider this report as part of - * our context as the 3d pipeline behind the OACS unit is still - * processing the operations started at the previous execlist - * submission. - */ - add = last_report_ctx_match && out_duration < 2; - } - - if (add) { - query_result_accumulate(&query->oa.result, query->queryinfo, - last, report); - } - - last = report; - last_report_ctx_match = report_ctx_match; - - break; - } - - case DRM_I915_PERF_RECORD_OA_BUFFER_LOST: - DBG("i915 perf: OA error: all reports lost\n"); - goto error; - case DRM_I915_PERF_RECORD_OA_REPORT_LOST: - DBG("i915 perf: OA report lost\n"); - break; - } - } - } - -end: - - query_result_accumulate(&query->oa.result, query->queryinfo, - last, end); - - query->oa.results_accumulated = true; - drop_from_unaccumulated_query_list(perf_ctx, query); - dec_n_users(perf_ctx); - - return; - -error: - - discard_all_queries(perf_ctx); -} - -void -gen_perf_delete_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - - /* We can assume that the frontend waits for a query to complete - * before ever calling into here, so we don't have to worry about - * deleting an in-flight query object. - */ - switch (query->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - if (query->oa.bo) { - if (!query->oa.results_accumulated) { - drop_from_unaccumulated_query_list(perf_ctx, query); - dec_n_users(perf_ctx); - } - - perf_cfg->vtbl.bo_unreference(query->oa.bo); - query->oa.bo = NULL; - } - - query->oa.results_accumulated = false; - break; - - case GEN_PERF_QUERY_TYPE_PIPELINE: - if (query->pipeline_stats.bo) { - perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); - query->pipeline_stats.bo = NULL; - } - break; - - default: - unreachable("Unknown query type"); - break; - } - - /* As an indication that the INTEL_performance_query extension is no - * longer in use, it's a good time to free our cache of sample - * buffers and close any current i915-perf stream. - */ - if (--perf_ctx->n_query_instances == 0) { - free_sample_bufs(perf_ctx); - gen_perf_close(perf_ctx, query->queryinfo); - } - - free(query); -} - -#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) - -static void -read_gt_frequency(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *obj) -{ - const struct gen_device_info *devinfo = perf_ctx->devinfo; - uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)), - end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES)); - - switch (devinfo->gen) { - case 7: - case 8: - obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; - obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; - break; - case 9: - case 10: - case 11: - obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; - obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; - break; - default: - unreachable("unexpected gen"); - } - - /* Put the numbers into Hz. */ - obj->oa.gt_frequency[0] *= 1000000ULL; - obj->oa.gt_frequency[1] *= 1000000ULL; -} - -static int -get_oa_counter_data(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - size_t data_size, - uint8_t *data) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - const struct gen_perf_query_info *queryinfo = query->queryinfo; - int n_counters = queryinfo->n_counters; - int written = 0; - - for (int i = 0; i < n_counters; i++) { - const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; - uint64_t *out_uint64; - float *out_float; - size_t counter_size = gen_perf_query_counter_get_size(counter); - - if (counter_size) { - switch (counter->data_type) { - case GEN_PERF_COUNTER_DATA_TYPE_UINT64: - out_uint64 = (uint64_t *)(data + counter->offset); - *out_uint64 = - counter->oa_counter_read_uint64(perf_cfg, queryinfo, - query->oa.result.accumulator); - break; - case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: - out_float = (float *)(data + counter->offset); - *out_float = - counter->oa_counter_read_float(perf_cfg, queryinfo, - query->oa.result.accumulator); - break; - default: - /* So far we aren't using uint32, double or bool32... */ - unreachable("unexpected counter data type"); - } - written = counter->offset + counter_size; - } - } - - return written; -} - -static int -get_pipeline_stats_data(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - size_t data_size, - uint8_t *data) - -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - const struct gen_perf_query_info *queryinfo = query->queryinfo; - int n_counters = queryinfo->n_counters; - uint8_t *p = data; - - uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ); - uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t)); - - for (int i = 0; i < n_counters; i++) { - const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; - uint64_t value = end[i] - start[i]; - - if (counter->pipeline_stat.numerator != - counter->pipeline_stat.denominator) { - value *= counter->pipeline_stat.numerator; - value /= counter->pipeline_stat.denominator; - } - - *((uint64_t *)p) = value; - p += 8; - } - - perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo); - - return p - data; -} - -void -gen_perf_get_query_data(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - int data_size, - unsigned *data, - unsigned *bytes_written) -{ - struct gen_perf_config *perf_cfg = perf_ctx->perf; - int written = 0; - - switch (query->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - if (!query->oa.results_accumulated) { - read_gt_frequency(perf_ctx, query); - uint32_t *begin_report = query->oa.map; - uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; - query_result_read_frequencies(&query->oa.result, - perf_ctx->devinfo, - begin_report, - end_report); - accumulate_oa_reports(perf_ctx, query); - assert(query->oa.results_accumulated); - - perf_cfg->vtbl.bo_unmap(query->oa.bo); - query->oa.map = NULL; - } - if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) { - written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data); - } else { - const struct gen_device_info *devinfo = perf_ctx->devinfo; - - written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size, - devinfo, &query->oa.result, - query->oa.gt_frequency[0], - query->oa.gt_frequency[1]); - } - break; - - case GEN_PERF_QUERY_TYPE_PIPELINE: - written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data); - break; - - default: - unreachable("Unknown query type"); - break; - } - - if (bytes_written) - *bytes_written = written; -} - -void -gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) -{ - DBG("Queries: (Open queries = %d, OA users = %d)\n", - perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); -} - -void -gen_perf_dump_query(struct gen_perf_context *ctx, - struct gen_perf_query_object *obj, - void *current_batch) -{ - switch (obj->queryinfo->kind) { - case GEN_PERF_QUERY_TYPE_OA: - case GEN_PERF_QUERY_TYPE_RAW: - DBG("BO: %-4s OA data: %-10s %-15s\n", - obj->oa.bo ? "yes," : "no,", - gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", - obj->oa.results_accumulated ? "accumulated" : "not accumulated"); - break; - case GEN_PERF_QUERY_TYPE_PIPELINE: - DBG("BO: %-4s\n", - obj->pipeline_stats.bo ? "yes" : "no"); - break; - default: - unreachable("Unknown query type"); - break; - } + gen_perf_register_mdapi_oa_query(perf_cfg, devinfo); } diff -Nru mesa-19.2.8/src/intel/perf/gen_perf.h mesa-20.0.8/src/intel/perf/gen_perf.h --- mesa-19.2.8/src/intel/perf/gen_perf.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,18 +43,6 @@ struct gen_perf_config; struct gen_perf_query_info; -#define GEN7_RPSTAT1 0xA01C -#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 -#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) -#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 -#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) - -#define GEN9_RPSTAT0 0xA01C -#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 -#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) -#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 -#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) - enum gen_perf_counter_type { GEN_PERF_COUNTER_TYPE_EVENT, GEN_PERF_COUNTER_TYPE_DURATION_NORM, @@ -87,19 +75,6 @@ */ #define MAX_OA_REPORT_COUNTERS 62 -#define IA_VERTICES_COUNT 0x2310 -#define IA_PRIMITIVES_COUNT 0x2318 -#define VS_INVOCATION_COUNT 0x2320 -#define HS_INVOCATION_COUNT 0x2300 -#define DS_INVOCATION_COUNT 0x2308 -#define GS_INVOCATION_COUNT 0x2328 -#define GS_PRIMITIVES_COUNT 0x2330 -#define CL_INVOCATION_COUNT 0x2338 -#define CL_PRIMITIVES_COUNT 0x2340 -#define PS_INVOCATION_COUNT 0x2348 -#define CS_INVOCATION_COUNT 0x2290 -#define PS_DEPTH_COUNT 0x2350 - /* * When currently allocate only one page for pipeline statistics queries. Here * we derived the maximum number of counters for that amount. @@ -138,6 +113,16 @@ * query. */ uint64_t unslice_frequency[2]; + + /** + * Timestamp of the query. + */ + uint64_t begin_timestamp; + + /** + * Whether the query was interrupted by another workload (aka preemption). + */ + bool query_disjoint; }; struct gen_perf_query_counter { @@ -164,6 +149,18 @@ uint32_t val; }; +/* Register programming for a given query */ +struct gen_perf_registers { + struct gen_perf_query_register_prog *flex_regs; + uint32_t n_flex_regs; + + struct gen_perf_query_register_prog *mux_regs; + uint32_t n_mux_regs; + + struct gen_perf_query_register_prog *b_counter_regs; + uint32_t n_b_counter_regs; +}; + struct gen_perf_query_info { enum gen_perf_query_type { GEN_PERF_QUERY_TYPE_OA, @@ -188,18 +185,15 @@ int b_offset; int c_offset; - /* Register programming for a given query */ - struct gen_perf_query_register_prog *flex_regs; - uint32_t n_flex_regs; - - struct gen_perf_query_register_prog *mux_regs; - uint32_t n_mux_regs; - - struct gen_perf_query_register_prog *b_counter_regs; - uint32_t n_b_counter_regs; + struct gen_perf_registers config; }; struct gen_perf_config { + bool i915_query_supported; + + /* Version of the i915-perf subsystem, refer to i915_drm.h. */ + int i915_perf_version; + struct gen_perf_query_info *queries; int n_queries; @@ -238,41 +232,56 @@ bool (*batch_references)(void *batch, void *bo); void (*bo_wait_rendering)(void *bo); int (*bo_busy)(void *bo); - void (*emit_mi_flush)(void *ctx); + void (*emit_stall_at_pixel_scoreboard)(void *ctx); void (*emit_mi_report_perf_count)(void *ctx, void *bo, uint32_t offset_in_bytes, uint32_t report_id); void (*batchbuffer_flush)(void *ctx, const char *file, int line); - void (*capture_frequency_stat_register)(void *ctx, void *bo, - uint32_t bo_offset); - void (*store_register_mem64)(void *ctx, void *bo, uint32_t reg, uint32_t offset); + void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset); } vtbl; }; -struct gen_perf_query_object; -const struct gen_perf_query_info* gen_perf_query_info(const struct gen_perf_query_object *); - -struct gen_perf_context; -struct gen_perf_context *gen_perf_new_context(void *parent); - void gen_perf_init_metrics(struct gen_perf_config *perf_cfg, const struct gen_device_info *devinfo, int drm_fd); -void gen_perf_init_context(struct gen_perf_context *perf_ctx, - struct gen_perf_config *perf_cfg, - void * ctx, /* driver context (eg, brw_context) */ - void * bufmgr, /* eg brw_bufmgr */ - const struct gen_device_info *devinfo, - uint32_t hw_ctx, - int drm_fd); -struct gen_perf_config *gen_perf_config(struct gen_perf_context *ctx); +/** Query i915 for a metric id using guid. + */ +bool gen_perf_load_metric_id(struct gen_perf_config *perf_cfg, + const char *guid, + uint64_t *metric_id); + +/** Load a configuation's content from i915 using a guid. + */ +struct gen_perf_registers *gen_perf_load_configuration(struct gen_perf_config *perf_cfg, + int fd, const char *guid); + +/** Store a configuration into i915 using guid and return a new metric id. + * + * If guid is NULL, then a generated one will be provided by hashing the + * content of the configuration. + */ +uint64_t gen_perf_store_configuration(struct gen_perf_config *perf_cfg, int fd, + const struct gen_perf_registers *config, + const char *guid); -int gen_perf_active_queries(struct gen_perf_context *perf_ctx, - const struct gen_perf_query_info *query); +/** Read the slice/unslice frequency from 2 OA reports and store then into + * result. + */ +void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result, + const struct gen_device_info *devinfo, + const uint32_t *start, + const uint32_t *end); +/** Accumulate the delta between 2 OA reports into result for a given query. + */ +void gen_perf_query_result_accumulate(struct gen_perf_query_result *result, + const struct gen_perf_query_info *query, + const uint32_t *start, + const uint32_t *end); +void gen_perf_query_result_clear(struct gen_perf_query_result *result); static inline size_t gen_perf_query_counter_get_size(const struct gen_perf_query_counter *counter) @@ -300,31 +309,4 @@ return perf; } -struct gen_perf_query_object * -gen_perf_new_query(struct gen_perf_context *, unsigned query_index); - - -bool gen_perf_begin_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query); -void gen_perf_end_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query); -void gen_perf_wait_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - void *current_batch); -bool gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - void *current_batch); -void gen_perf_delete_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query); -void gen_perf_get_query_data(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *query, - int data_size, - unsigned *data, - unsigned *bytes_written); - -void gen_perf_dump_query_count(struct gen_perf_context *perf_ctx); -void gen_perf_dump_query(struct gen_perf_context *perf_ctx, - struct gen_perf_query_object *obj, - void *current_batch); - #endif /* GEN_PERF_H */ diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_mdapi.c mesa-20.0.8/src/intel/perf/gen_perf_mdapi.c --- mesa-19.2.8/src/intel/perf/gen_perf_mdapi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_mdapi.c 2020-06-12 01:21:17.000000000 +0000 @@ -23,9 +23,14 @@ #include "gen_perf.h" #include "gen_perf_mdapi.h" +#include "gen_perf_private.h" +#include "gen_perf_regs.h" #include "dev/gen_device_info.h" +#include + + int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size, const struct gen_device_info *devinfo, @@ -54,6 +59,7 @@ gen_device_info_timebase_scale(devinfo, result->accumulator[0]); mdapi_data->CoreFrequency = freq_end; mdapi_data->CoreFrequencyChanged = freq_end != freq_start; + mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } case 8: { @@ -73,6 +79,8 @@ mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); + mdapi_data->BeginTimestamp = + gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; mdapi_data->CoreFrequency = freq_end; mdapi_data->CoreFrequencyChanged = freq_end != freq_start; @@ -80,11 +88,13 @@ (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; + mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } case 9: case 10: - case 11: { + case 11: + case 12:{ struct gen9_mdapi_metrics *mdapi_data = (struct gen9_mdapi_metrics *) data; if (data_size < sizeof(*mdapi_data)) @@ -101,6 +111,8 @@ mdapi_data->ReportsCount = result->reports_accumulated; mdapi_data->TotalTime = gen_device_info_timebase_scale(devinfo, result->accumulator[0]); + mdapi_data->BeginTimestamp = + gen_device_info_timebase_scale(devinfo, result->begin_timestamp); mdapi_data->GPUTicks = result->accumulator[1]; mdapi_data->CoreFrequency = freq_end; mdapi_data->CoreFrequencyChanged = freq_end != freq_start; @@ -108,9 +120,241 @@ (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL; mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL; + mdapi_data->SplitOccured = result->query_disjoint; return sizeof(*mdapi_data); } default: unreachable("unexpected gen"); } } + +void +gen_perf_register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo) +{ + if (!(devinfo->gen >= 7 && devinfo->gen <= 12)) + return; + + struct gen_perf_query_info *query = + gen_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); + + query->kind = GEN_PERF_QUERY_TYPE_PIPELINE; + query->name = "Intel_Raw_Pipeline_Statistics_Query"; + + /* The order has to match mdapi_pipeline_metrics. */ + gen_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, + "N vertices submitted"); + gen_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, + "N primitives submitted"); + gen_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, + "N vertex shader invocations"); + gen_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, + "N geometry shader invocations"); + gen_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, + "N geometry shader primitives emitted"); + gen_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, + "N primitives entering clipping"); + gen_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, + "N primitives leaving clipping"); + if (devinfo->is_haswell || devinfo->gen == 8) { + gen_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, + "N fragment shader invocations", + "N fragment shader invocations"); + } else { + gen_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, + "N fragment shader invocations"); + } + gen_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, + "N TCS shader invocations"); + gen_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, + "N TES shader invocations"); + if (devinfo->gen >= 7) { + gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "N compute shader invocations"); + } + + if (devinfo->gen >= 10) { + /* Reuse existing CS invocation register until we can expose this new + * one. + */ + gen_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, + "Reserved1"); + } + + query->data_size = sizeof(uint64_t) * query->n_counters; +} + +static void +fill_mdapi_perf_query_counter(struct gen_perf_query_info *query, + const char *name, + uint32_t data_offset, + uint32_t data_size, + enum gen_perf_counter_data_type data_type) +{ + struct gen_perf_query_counter *counter = &query->counters[query->n_counters]; + + assert(query->n_counters <= query->max_counters); + + counter->name = name; + counter->desc = "Raw counter value"; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = data_type; + counter->offset = data_offset; + + query->n_counters++; + + assert(counter->offset + gen_perf_query_counter_get_size(counter) <= query->data_size); +} + +#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \ + fill_mdapi_perf_query_counter(query, #field_name, \ + (uint8_t *) &struct_name.field_name - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) +#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \ + fill_mdapi_perf_query_counter(query, \ + ralloc_asprintf(ctx, "%s%i", #field_name, idx), \ + (uint8_t *) &struct_name.field_name[idx] - \ + (uint8_t *) &struct_name, \ + sizeof(struct_name.field_name[0]), \ + GEN_PERF_COUNTER_DATA_TYPE_##type_name) + +void +gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf, + const struct gen_device_info *devinfo) +{ + struct gen_perf_query_info *query = NULL; + + /* MDAPI requires different structures for pretty much every generation + * (right now we have definitions for gen 7 to 12). + */ + if (!(devinfo->gen >= 7 && devinfo->gen <= 12)) + return; + + switch (devinfo->gen) { + case 7: { + query = gen_perf_append_query_info(perf, 1 + 45 + 16 + 7); + query->oa_format = I915_OA_FORMAT_A45_B8_C8; + + struct gen7_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, ACounters, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NOACounters, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + break; + } + case 8: { + query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen8_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + break; + } + case 9: + case 10: + case 11: + case 12: { + query = gen_perf_append_query_info(perf, 2 + 36 + 16 + 16 + 16 + 2); + query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8; + + struct gen9_mdapi_metrics metric_data; + query->data_size = sizeof(metric_data); + + MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64); + for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, OaCntr, i, UINT64); + } + for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, NoaCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32); + for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) { + MDAPI_QUERY_ADD_ARRAY_COUNTER(perf->queries, query, + metric_data, UserCntr, i, UINT64); + } + MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32); + MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32); + break; + } + default: + unreachable("Unsupported gen"); + break; + } + + query->kind = GEN_PERF_QUERY_TYPE_RAW; + query->name = "Intel_Raw_Hardware_Counters_Set_0_Query"; + query->guid = GEN_PERF_QUERY_GUID_MDAPI; + + { + /* Accumulation buffer offsets copied from an actual query... */ + const struct gen_perf_query_info *copy_query = + &perf->queries[0]; + + query->gpu_time_offset = copy_query->gpu_time_offset; + query->gpu_clock_offset = copy_query->gpu_clock_offset; + query->a_offset = copy_query->a_offset; + query->b_offset = copy_query->b_offset; + query->c_offset = copy_query->c_offset; + } +} diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_mdapi.h mesa-20.0.8/src/intel/perf/gen_perf_mdapi.h --- mesa-19.2.8/src/intel/perf/gen_perf_mdapi.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_mdapi.h 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,8 @@ #include -struct gen_device_info; +#include "dev/gen_device_info.h" + struct gen_perf_query_result; /* Guid has to matches with MDAPI's. */ @@ -132,4 +133,66 @@ const struct gen_perf_query_result *result, uint64_t freq_start, uint64_t freq_end); +static inline void gen_perf_query_mdapi_write_perfcntr(void *data, uint32_t data_size, + const struct gen_device_info *devinfo, + const uint64_t *begin_perf_cntrs, + const uint64_t *end_perf_cntrs) +{ + /* Only bits 0:43 of the 64bit registers contains the value. */ + const uint64_t mask = (1ull << 44) - 1; + + switch (devinfo->gen) { + case 8: { + if (data_size < sizeof(struct gen8_mdapi_metrics)) + return; + struct gen8_mdapi_metrics *mdapi_data = data; + mdapi_data->PerfCounter1 = + (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask); + mdapi_data->PerfCounter2 = + (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask); + break; + } + case 9: + case 10: + case 11: { + if (data_size < sizeof(struct gen9_mdapi_metrics)) + return; + struct gen9_mdapi_metrics *mdapi_data = data; + mdapi_data->PerfCounter1 = + (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask); + mdapi_data->PerfCounter2 = + (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask); + break; + } + default: + break; + } +} + +static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size, + const struct gen_device_info *devinfo, + uint64_t value) +{ + switch (devinfo->gen) { + case 8: { + if (data_size < sizeof(struct gen8_mdapi_metrics)) + return; + struct gen8_mdapi_metrics *mdapi_data = data; + mdapi_data->MarkerUser = value; + break; + } + case 9: + case 10: + case 11: { + if (data_size < sizeof(struct gen9_mdapi_metrics)) + return; + struct gen9_mdapi_metrics *mdapi_data = data; + mdapi_data->MarkerUser = value; + break; + } + default: + break; + } +} + #endif /* GEN_PERF_MDAPI_H */ diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_private.h mesa-20.0.8/src/intel/perf/gen_perf_private.h --- mesa-19.2.8/src/intel/perf/gen_perf_private.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_private.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,84 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GEN_PERF_PRIVATE_H +#define GEN_PERF_PRIVATE_H + +#include "gen_perf.h" + +static inline void +gen_perf_query_add_stat_reg(struct gen_perf_query_info *query, uint32_t reg, + uint32_t numerator, uint32_t denominator, + const char *name, const char *description) +{ + struct gen_perf_query_counter *counter; + + assert(query->n_counters < query->max_counters); + + counter = &query->counters[query->n_counters]; + counter->name = name; + counter->desc = description; + counter->type = GEN_PERF_COUNTER_TYPE_RAW; + counter->data_type = GEN_PERF_COUNTER_DATA_TYPE_UINT64; + counter->offset = sizeof(uint64_t) * query->n_counters; + counter->pipeline_stat.reg = reg; + counter->pipeline_stat.numerator = numerator; + counter->pipeline_stat.denominator = denominator; + + query->n_counters++; +} + +static inline void +gen_perf_query_add_basic_stat_reg(struct gen_perf_query_info *query, + uint32_t reg, const char *name) +{ + gen_perf_query_add_stat_reg(query, reg, 1, 1, name, name); +} + +static inline struct gen_perf_query_info * +gen_perf_append_query_info(struct gen_perf_config *perf, int max_counters) +{ + struct gen_perf_query_info *query; + + perf->queries = reralloc(perf, perf->queries, + struct gen_perf_query_info, + ++perf->n_queries); + query = &perf->queries[perf->n_queries - 1]; + memset(query, 0, sizeof(*query)); + + if (max_counters > 0) { + query->max_counters = max_counters; + query->counters = + rzalloc_array(perf, struct gen_perf_query_counter, max_counters); + } + + return query; +} + +void gen_perf_register_mdapi_statistic_query(struct gen_perf_config *perf_cfg, + const struct gen_device_info *devinfo); +void gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf, + const struct gen_device_info *devinfo); + + +#endif /* GEN_PERF_PRIVATE_H */ diff -Nru mesa-19.2.8/src/intel/perf/gen_perf.py mesa-20.0.8/src/intel/perf/gen_perf.py --- mesa-19.2.8/src/intel/perf/gen_perf.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf.py 2020-06-12 01:21:17.000000000 +0000 @@ -181,7 +181,10 @@ hw_vars["$EuSubslicesTotalCount"] = "perf->sys_vars.n_eu_sub_slices" hw_vars["$EuThreadsCount"] = "perf->sys_vars.eu_threads_count" hw_vars["$SliceMask"] = "perf->sys_vars.slice_mask" +# subslice_mask is interchangeable with subslice/dual-subslice since Gen12+ +# only has dual subslices which can be assimilated with 16EUs subslices. hw_vars["$SubsliceMask"] = "perf->sys_vars.subslice_mask" +hw_vars["$DualSubsliceMask"] = "perf->sys_vars.subslice_mask" hw_vars["$GpuTimestampFrequency"] = "perf->sys_vars.timestamp_frequency" hw_vars["$GpuMinFrequency"] = "perf->sys_vars.gt_min_freq" hw_vars["$GpuMaxFrequency"] = "perf->sys_vars.gt_max_freq" @@ -419,7 +422,7 @@ c_indent(3) for register in register_config.findall('register'): - c("query->%s[query->n_%s++] = (struct gen_perf_query_register_prog) { .reg = %s, .val = %s };" % + c("query->config.%s[query->config.n_%s++] = (struct gen_perf_query_register_prog) { .reg = %s, .val = %s };" % (t, t, register.get('address'), register.get('value'))) if availability: @@ -692,9 +695,13 @@ .c_offset = 46, """)) + c(".config = {") + c_indent(3) for reg_type, reg_length in register_lengths.items(): c(".{0} = {1}_{2}_{3},".format(reg_type, gen.chipset, set.underscore_name, reg_type)) c(".n_{0} = 0, /* Determined at runtime */".format(reg_type)) + c_outdent(3) + c("},") c_outdent(3) c("};\n") diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_query.c mesa-20.0.8/src/intel/perf/gen_perf_query.c --- mesa-19.2.8/src/intel/perf/gen_perf_query.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,1585 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "common/gen_gem.h" + +#include "dev/gen_debug.h" +#include "dev/gen_device_info.h" + +#include "perf/gen_perf.h" +#include "perf/gen_perf_mdapi.h" +#include "perf/gen_perf_query.h" +#include "perf/gen_perf_regs.h" + +#include "drm-uapi/i915_drm.h" + +#include "util/u_math.h" + +#define FILE_DEBUG_FLAG DEBUG_PERFMON +#define MI_RPC_BO_SIZE 4096 +#define MI_FREQ_START_OFFSET_BYTES (3072) +#define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2) +#define MI_FREQ_END_OFFSET_BYTES (3076) + +#define MAP_READ (1 << 0) +#define MAP_WRITE (1 << 1) + +/** + * Periodic OA samples are read() into these buffer structures via the + * i915 perf kernel interface and appended to the + * perf_ctx->sample_buffers linked list. When we process the + * results of an OA metrics query we need to consider all the periodic + * samples between the Begin and End MI_REPORT_PERF_COUNT command + * markers. + * + * 'Periodic' is a simplification as there are other automatic reports + * written by the hardware also buffered here. + * + * Considering three queries, A, B and C: + * + * Time ----> + * ________________A_________________ + * | | + * | ________B_________ _____C___________ + * | | | | | | + * + * And an illustration of sample buffers read over this time frame: + * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ] + * + * These nodes may hold samples for query A: + * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ] + * + * These nodes may hold samples for query B: + * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ] + * + * These nodes may hold samples for query C: + * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ] + * + * The illustration assumes we have an even distribution of periodic + * samples so all nodes have the same size plotted against time: + * + * Note, to simplify code, the list is never empty. + * + * With overlapping queries we can see that periodic OA reports may + * relate to multiple queries and care needs to be take to keep + * track of sample buffers until there are no queries that might + * depend on their contents. + * + * We use a node ref counting system where a reference ensures that a + * node and all following nodes can't be freed/recycled until the + * reference drops to zero. + * + * E.g. with a ref of one here: + * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * These nodes could be freed or recycled ("reaped"): + * [ 0 ][ 0 ] + * + * These must be preserved until the leading ref drops to zero: + * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ] + * + * When a query starts we take a reference on the current tail of + * the list, knowing that no already-buffered samples can possibly + * relate to the newly-started query. A pointer to this node is + * also saved in the query object's ->oa.samples_head. + * + * E.g. starting query A while there are two nodes in .sample_buffers: + * ________________A________ + * | + * + * [ 0 ][ 1 ] + * ^_______ Add a reference and store pointer to node in + * A->oa.samples_head + * + * Moving forward to when the B query starts with no new buffer nodes: + * (for reference, i915 perf reads() are only done when queries finish) + * ________________A_______ + * | ________B___ + * | | + * + * [ 0 ][ 2 ] + * ^_______ Add a reference and store pointer to + * node in B->oa.samples_head + * + * Once a query is finished, after an OA query has become 'Ready', + * once the End OA report has landed and after we we have processed + * all the intermediate periodic samples then we drop the + * ->oa.samples_head reference we took at the start. + * + * So when the B query has finished we have: + * ________________A________ + * | ______B___________ + * | | | + * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ] + * ^_______ Drop B->oa.samples_head reference + * + * We still can't free these due to the A->oa.samples_head ref: + * [ 1 ][ 0 ][ 0 ][ 0 ] + * + * When the A query finishes: (note there's a new ref for C's samples_head) + * ________________A_________________ + * | | + * | _____C_________ + * | | | + * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ] + * ^_______ Drop A->oa.samples_head reference + * + * And we can now reap these nodes up to the C->oa.samples_head: + * [ X ][ X ][ X ][ X ] + * keeping -> [ 1 ][ 0 ][ 0 ] + * + * We reap old sample buffers each time we finish processing an OA + * query by iterating the sample_buffers list from the head until we + * find a referenced node and stop. + * + * Reaped buffers move to a perfquery.free_sample_buffers list and + * when we come to read() we first look to recycle a buffer from the + * free_sample_buffers list before allocating a new buffer. + */ +struct oa_sample_buf { + struct exec_node link; + int refcount; + int len; + uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10]; + uint32_t last_timestamp; +}; + +/** + * gen representation of a performance query object. + * + * NB: We want to keep this structure relatively lean considering that + * applications may expect to allocate enough objects to be able to + * query around all draw calls in a frame. + */ +struct gen_perf_query_object +{ + const struct gen_perf_query_info *queryinfo; + + /* See query->kind to know which state below is in use... */ + union { + struct { + + /** + * BO containing OA counter snapshots at query Begin/End time. + */ + void *bo; + + /** + * Address of mapped of @bo + */ + void *map; + + /** + * The MI_REPORT_PERF_COUNT command lets us specify a unique + * ID that will be reflected in the resulting OA report + * that's written by the GPU. This is the ID we're expecting + * in the begin report and the the end report should be + * @begin_report_id + 1. + */ + int begin_report_id; + + /** + * Reference the head of the brw->perfquery.sample_buffers + * list at the time that the query started (so we only need + * to look at nodes after this point when looking for samples + * related to this query) + * + * (See struct brw_oa_sample_buf description for more details) + */ + struct exec_node *samples_head; + + /** + * false while in the unaccumulated_elements list, and set to + * true when the final, end MI_RPC snapshot has been + * accumulated. + */ + bool results_accumulated; + + /** + * Frequency of the GT at begin and end of the query. + */ + uint64_t gt_frequency[2]; + + /** + * Accumulated OA results between begin and end of the query. + */ + struct gen_perf_query_result result; + } oa; + + struct { + /** + * BO containing starting and ending snapshots for the + * statistics counters. + */ + void *bo; + } pipeline_stats; + }; +}; + +struct gen_perf_context { + struct gen_perf_config *perf; + + void * ctx; /* driver context (eg, brw_context) */ + void * bufmgr; + const struct gen_device_info *devinfo; + + uint32_t hw_ctx; + int drm_fd; + + /* The i915 perf stream we open to setup + enable the OA counters */ + int oa_stream_fd; + + /* An i915 perf stream fd gives exclusive access to the OA unit that will + * report counter snapshots for a specific counter set/profile in a + * specific layout/format so we can only start OA queries that are + * compatible with the currently open fd... + */ + int current_oa_metrics_set_id; + int current_oa_format; + + /* List of buffers containing OA reports */ + struct exec_list sample_buffers; + + /* Cached list of empty sample buffers */ + struct exec_list free_sample_buffers; + + int n_active_oa_queries; + int n_active_pipeline_stats_queries; + + /* The number of queries depending on running OA counters which + * extends beyond brw_end_perf_query() since we need to wait until + * the last MI_RPC command has parsed by the GPU. + * + * Accurate accounting is important here as emitting an + * MI_REPORT_PERF_COUNT command while the OA unit is disabled will + * effectively hang the gpu. + */ + int n_oa_users; + + /* To help catch an spurious problem with the hardware or perf + * forwarding samples, we emit each MI_REPORT_PERF_COUNT command + * with a unique ID that we can explicitly check for... + */ + int next_query_start_report_id; + + /** + * An array of queries whose results haven't yet been assembled + * based on the data in buffer objects. + * + * These may be active, or have already ended. However, the + * results have not been requested. + */ + struct gen_perf_query_object **unaccumulated; + int unaccumulated_elements; + int unaccumulated_array_size; + + /* The total number of query objects so we can relinquish + * our exclusive access to perf if the application deletes + * all of its objects. (NB: We only disable perf while + * there are no active queries) + */ + int n_query_instances; +}; + +static bool +inc_n_users(struct gen_perf_context *perf_ctx) +{ + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0) + { + return false; + } + ++perf_ctx->n_oa_users; + + return true; +} + +static void +dec_n_users(struct gen_perf_context *perf_ctx) +{ + /* Disabling the i915 perf stream will effectively disable the OA + * counters. Note it's important to be sure there are no outstanding + * MI_RPC commands at this point since they could stall the CS + * indefinitely once OACONTROL is disabled. + */ + --perf_ctx->n_oa_users; + if (perf_ctx->n_oa_users == 0 && + gen_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0) + { + DBG("WARNING: Error disabling gen perf stream: %m\n"); + } +} + +static void +gen_perf_close(struct gen_perf_context *perfquery, + const struct gen_perf_query_info *query) +{ + if (perfquery->oa_stream_fd != -1) { + close(perfquery->oa_stream_fd); + perfquery->oa_stream_fd = -1; + } + if (query->kind == GEN_PERF_QUERY_TYPE_RAW) { + struct gen_perf_query_info *raw_query = + (struct gen_perf_query_info *) query; + raw_query->oa_metrics_set_id = 0; + } +} + +static bool +gen_perf_open(struct gen_perf_context *perf_ctx, + int metrics_set_id, + int report_format, + int period_exponent, + int drm_fd, + uint32_t ctx_id) +{ + uint64_t properties[] = { + /* Single context sampling */ + DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id, + + /* Include OA reports in samples */ + DRM_I915_PERF_PROP_SAMPLE_OA, true, + + /* OA unit configuration */ + DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id, + DRM_I915_PERF_PROP_OA_FORMAT, report_format, + DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, + }; + struct drm_i915_perf_open_param param = { + .flags = I915_PERF_FLAG_FD_CLOEXEC | + I915_PERF_FLAG_FD_NONBLOCK | + I915_PERF_FLAG_DISABLED, + .num_properties = ARRAY_SIZE(properties) / 2, + .properties_ptr = (uintptr_t) properties, + }; + int fd = gen_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + if (fd == -1) { + DBG("Error opening gen perf OA stream: %m\n"); + return false; + } + + perf_ctx->oa_stream_fd = fd; + + perf_ctx->current_oa_metrics_set_id = metrics_set_id; + perf_ctx->current_oa_format = report_format; + + return true; +} + +static uint64_t +get_metric_id(struct gen_perf_config *perf, + const struct gen_perf_query_info *query) +{ + /* These queries are know not to ever change, their config ID has been + * loaded upon the first query creation. No need to look them up again. + */ + if (query->kind == GEN_PERF_QUERY_TYPE_OA) + return query->oa_metrics_set_id; + + assert(query->kind == GEN_PERF_QUERY_TYPE_RAW); + + /* Raw queries can be reprogrammed up by an external application/library. + * When a raw query is used for the first time it's id is set to a value != + * 0. When it stops being used the id returns to 0. No need to reload the + * ID when it's already loaded. + */ + if (query->oa_metrics_set_id != 0) { + DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + return query->oa_metrics_set_id; + } + + struct gen_perf_query_info *raw_query = (struct gen_perf_query_info *)query; + if (!gen_perf_load_metric_id(perf, query->guid, + &raw_query->oa_metrics_set_id)) { + DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid); + raw_query->oa_metrics_set_id = 1ULL; + } else { + DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n", + query->name, query->guid, query->oa_metrics_set_id); + } + return query->oa_metrics_set_id; +} + +static struct oa_sample_buf * +get_free_sample_buf(struct gen_perf_context *perf_ctx) +{ + struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers); + struct oa_sample_buf *buf; + + if (node) + buf = exec_node_data(struct oa_sample_buf, node, link); + else { + buf = ralloc_size(perf_ctx->perf, sizeof(*buf)); + + exec_node_init(&buf->link); + buf->refcount = 0; + } + buf->len = 0; + + return buf; +} + +static void +reap_old_sample_buffers(struct gen_perf_context *perf_ctx) +{ + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + + /* Remove all old, unreferenced sample buffers walking forward from + * the head of the list, except always leave at least one node in + * the list so we always have a node to reference when we Begin + * a new query. + */ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->sample_buffers) + { + if (buf->refcount == 0 && buf != tail_buf) { + exec_node_remove(&buf->link); + exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link); + } else + return; + } +} + +static void +free_sample_bufs(struct gen_perf_context *perf_ctx) +{ + foreach_list_typed_safe(struct oa_sample_buf, buf, link, + &perf_ctx->free_sample_buffers) + ralloc_free(buf); + + exec_list_make_empty(&perf_ctx->free_sample_buffers); +} + + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *perf_ctx, unsigned query_index) +{ + const struct gen_perf_query_info *query = + &perf_ctx->perf->queries[query_index]; + struct gen_perf_query_object *obj = + calloc(1, sizeof(struct gen_perf_query_object)); + + if (!obj) + return NULL; + + obj->queryinfo = query; + + perf_ctx->n_query_instances++; + return obj; +} + +int +gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query) +{ + assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0); + + switch (query->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return perf_ctx->n_active_oa_queries; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + return perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } +} + +const struct gen_perf_query_info* +gen_perf_query_info(const struct gen_perf_query_object *query) +{ + return query->queryinfo; +} + +struct gen_perf_context * +gen_perf_new_context(void *parent) +{ + struct gen_perf_context *ctx = rzalloc(parent, struct gen_perf_context); + if (! ctx) + fprintf(stderr, "%s: failed to alloc context\n", __func__); + return ctx; +} + +struct gen_perf_config * +gen_perf_config(struct gen_perf_context *ctx) +{ + return ctx->perf; +} + +void +gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd) +{ + perf_ctx->perf = perf_cfg; + perf_ctx->ctx = ctx; + perf_ctx->bufmgr = bufmgr; + perf_ctx->drm_fd = drm_fd; + perf_ctx->hw_ctx = hw_ctx; + perf_ctx->devinfo = devinfo; + + perf_ctx->unaccumulated = + ralloc_array(ctx, struct gen_perf_query_object *, 2); + perf_ctx->unaccumulated_elements = 0; + perf_ctx->unaccumulated_array_size = 2; + + exec_list_make_empty(&perf_ctx->sample_buffers); + exec_list_make_empty(&perf_ctx->free_sample_buffers); + + /* It's convenient to guarantee that this linked list of sample + * buffers is never empty so we add an empty head so when we + * Begin an OA query we can always take a reference on a buffer + * in this list. + */ + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + exec_list_push_head(&perf_ctx->sample_buffers, &buf->link); + + perf_ctx->oa_stream_fd = -1; + perf_ctx->next_query_start_report_id = 1000; +} + +/** + * Add a query to the global list of "unaccumulated queries." + * + * Queries are tracked here until all the associated OA reports have + * been accumulated via accumulate_oa_reports() after the end + * MI_REPORT_PERF_COUNT has landed in query->oa.bo. + */ +static void +add_to_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) +{ + if (perf_ctx->unaccumulated_elements >= + perf_ctx->unaccumulated_array_size) + { + perf_ctx->unaccumulated_array_size *= 1.5; + perf_ctx->unaccumulated = + reralloc(perf_ctx->ctx, perf_ctx->unaccumulated, + struct gen_perf_query_object *, + perf_ctx->unaccumulated_array_size); + } + + perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj; +} + +/** + * Emit MI_STORE_REGISTER_MEM commands to capture all of the + * pipeline statistics for the performance query object. + */ +static void +snapshot_statistics_registers(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + uint32_t offset_in_bytes) +{ + struct gen_perf_config *perf = ctx->perf; + const struct gen_perf_query_info *query = obj->queryinfo; + const int n_counters = query->n_counters; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &query->counters[i]; + + assert(counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); + + perf->vtbl.store_register_mem(ctx->ctx, obj->pipeline_stats.bo, + counter->pipeline_stat.reg, 8, + offset_in_bytes + i * sizeof(uint64_t)); + } +} + +static void +snapshot_freq_register(struct gen_perf_context *ctx, + struct gen_perf_query_object *query, + uint32_t bo_offset) +{ + struct gen_perf_config *perf = ctx->perf; + const struct gen_device_info *devinfo = ctx->devinfo; + + if (devinfo->gen == 8 && !devinfo->is_cherryview) + perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN7_RPSTAT1, 4, bo_offset); + else if (devinfo->gen >= 9) + perf->vtbl.store_register_mem(ctx->ctx, query->oa.bo, GEN9_RPSTAT0, 4, bo_offset); +} + +bool +gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + + /* XXX: We have to consider that the command parser unit that parses batch + * buffer commands and is used to capture begin/end counter snapshots isn't + * implicitly synchronized with what's currently running across other GPU + * units (such as the EUs running shaders) that the performance counters are + * associated with. + * + * The intention of performance queries is to measure the work associated + * with commands between the begin/end delimiters and so for that to be the + * case we need to explicitly synchronize the parsing of commands to capture + * Begin/End counter snapshots with what's running across other parts of the + * GPU. + * + * When the command parser reaches a Begin marker it effectively needs to + * drain everything currently running on the GPU until the hardware is idle + * before capturing the first snapshot of counters - otherwise the results + * would also be measuring the effects of earlier commands. + * + * When the command parser reaches an End marker it needs to stall until + * everything currently running on the GPU has finished before capturing the + * end snapshot - otherwise the results won't be a complete representation + * of the work. + * + * To achieve this, we stall the pipeline at pixel scoreboard (prevent any + * additional work to be processed by the pipeline until all pixels of the + * previous draw has be completed). + * + * N.B. The final results are based on deltas of counters between (inside) + * Begin/End markers so even though the total wall clock time of the + * workload is stretched by larger pipeline bubbles the bubbles themselves + * are generally invisible to the query results. Whether that's a good or a + * bad thing depends on the use case. For a lower real-time impact while + * capturing metrics then periodic sampling may be a better choice than + * INTEL_performance_query. + * + * + * This is our Begin synchronization point to drain current work on the + * GPU before we capture our first counter snapshot... + */ + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); + + switch (queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: { + + /* Opening an i915 perf stream implies exclusive access to the OA unit + * which will generate counter reports for a specific counter set with a + * specific layout/format so we can't begin any OA based queries that + * require a different counter set or format unless we get an opportunity + * to close the stream and open a new one... + */ + uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo); + + if (perf_ctx->oa_stream_fd != -1 && + perf_ctx->current_oa_metrics_set_id != metric_id) { + + if (perf_ctx->n_oa_users != 0) { + DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n", + perf_ctx->current_oa_metrics_set_id, metric_id); + return false; + } else + gen_perf_close(perf_ctx, queryinfo); + } + + /* If the OA counters aren't already on, enable them. */ + if (perf_ctx->oa_stream_fd == -1) { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + /* The period_exponent gives a sampling period as follows: + * sample_period = timestamp_period * 2^(period_exponent + 1) + * + * The timestamps increments every 80ns (HSW), ~52ns (GEN9LP) or + * ~83ns (GEN8/9). + * + * The counter overflow period is derived from the EuActive counter + * which reads a counter that increments by the number of clock + * cycles multiplied by the number of EUs. It can be calculated as: + * + * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2) + * + * (E.g. 40 EUs @ 1GHz = ~53ms) + * + * We select a sampling period inferior to that overflow period to + * ensure we cannot see more than 1 counter overflow, otherwise we + * could loose information. + */ + + int a_counter_in_bits = 32; + if (devinfo->gen >= 8) + a_counter_in_bits = 40; + + uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus * + /* drop 1GHz freq to have units in nanoseconds */ + 2); + + DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n", + overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus); + + int period_exponent = 0; + uint64_t prev_sample_period, next_sample_period; + for (int e = 0; e < 30; e++) { + prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency; + next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency; + + /* Take the previous sampling period, lower than the overflow + * period. + */ + if (prev_sample_period < overflow_period && + next_sample_period > overflow_period) + period_exponent = e + 1; + } + + if (period_exponent == 0) { + DBG("WARNING: enable to find a sampling exponent\n"); + return false; + } + + DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent, + prev_sample_period / 1000000ul); + + if (!gen_perf_open(perf_ctx, metric_id, queryinfo->oa_format, + period_exponent, perf_ctx->drm_fd, + perf_ctx->hw_ctx)) + return false; + } else { + assert(perf_ctx->current_oa_metrics_set_id == metric_id && + perf_ctx->current_oa_format == queryinfo->oa_format); + } + + if (!inc_n_users(perf_ctx)) { + DBG("WARNING: Error enabling i915 perf stream: %m\n"); + return false; + } + + if (query->oa.bo) { + perf_cfg->vtbl.bo_unreference(query->oa.bo); + query->oa.bo = NULL; + } + + query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query OA MI_RPC bo", + MI_RPC_BO_SIZE); +#ifdef DEBUG + /* Pre-filling the BO helps debug whether writes landed. */ + void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE); + memset(map, 0x80, MI_RPC_BO_SIZE); + perf_cfg->vtbl.bo_unmap(query->oa.bo); +#endif + + query->oa.begin_report_id = perf_ctx->next_query_start_report_id; + perf_ctx->next_query_start_report_id += 2; + + /* Take a starting OA counter snapshot. */ + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, 0, + query->oa.begin_report_id); + snapshot_freq_register(perf_ctx, query, MI_FREQ_START_OFFSET_BYTES); + + ++perf_ctx->n_active_oa_queries; + + /* No already-buffered samples can possibly be associated with this query + * so create a marker within the list of sample buffers enabling us to + * easily ignore earlier samples when processing this query after + * completion. + */ + assert(!exec_list_is_empty(&perf_ctx->sample_buffers)); + query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers); + + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); + + /* This reference will ensure that future/following sample + * buffers (that may relate to this query) can't be freed until + * this drops to zero. + */ + buf->refcount++; + + gen_perf_query_result_clear(&query->oa.result); + query->oa.results_accumulated = false; + + add_to_unaccumulated_query_list(perf_ctx, query); + break; + } + + case GEN_PERF_QUERY_TYPE_PIPELINE: + if (query->pipeline_stats.bo) { + perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); + query->pipeline_stats.bo = NULL; + } + + query->pipeline_stats.bo = + perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr, + "perf. query pipeline stats bo", + STATS_BO_SIZE); + + /* Take starting snapshots. */ + snapshot_statistics_registers(perf_ctx, query, 0); + + ++perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } + + return true; +} + +void +gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* Ensure that the work associated with the queried commands will have + * finished before taking our query end counter readings. + * + * For more details see comment in brw_begin_perf_query for + * corresponding flush. + */ + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx); + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + + /* NB: It's possible that the query will have already been marked + * as 'accumulated' if an error was seen while reading samples + * from perf. In this case we mustn't try and emit a closing + * MI_RPC command in case the OA unit has already been disabled + */ + if (!query->oa.results_accumulated) { + /* Take an ending OA counter snapshot. */ + snapshot_freq_register(perf_ctx, query, MI_FREQ_END_OFFSET_BYTES); + perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo, + MI_RPC_BO_END_OFFSET_BYTES, + query->oa.begin_report_id + 1); + } + + --perf_ctx->n_active_oa_queries; + + /* NB: even though the query has now ended, it can't be accumulated + * until the end MI_REPORT_PERF_COUNT snapshot has been written + * to query->oa.bo + */ + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + snapshot_statistics_registers(perf_ctx, query, + STATS_BO_END_OFFSET_BYTES); + --perf_ctx->n_active_pipeline_stats_queries; + break; + + default: + unreachable("Unknown query type"); + break; + } +} + +enum OaReadStatus { + OA_READ_STATUS_ERROR, + OA_READ_STATUS_UNFINISHED, + OA_READ_STATUS_FINISHED, +}; + +static enum OaReadStatus +read_oa_samples_until(struct gen_perf_context *perf_ctx, + uint32_t start_timestamp, + uint32_t end_timestamp) +{ + struct exec_node *tail_node = + exec_list_get_tail(&perf_ctx->sample_buffers); + struct oa_sample_buf *tail_buf = + exec_node_data(struct oa_sample_buf, tail_node, link); + uint32_t last_timestamp = + tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp; + + while (1) { + struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx); + uint32_t offset; + int len; + + while ((len = read(perf_ctx->oa_stream_fd, buf->buf, + sizeof(buf->buf))) < 0 && errno == EINTR) + ; + + if (len <= 0) { + exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link); + + if (len < 0) { + if (errno == EAGAIN) { + return ((last_timestamp - start_timestamp) < INT32_MAX && + (last_timestamp - start_timestamp) >= + (end_timestamp - start_timestamp)) ? + OA_READ_STATUS_FINISHED : + OA_READ_STATUS_UNFINISHED; + } else { + DBG("Error reading i915 perf samples: %m\n"); + } + } else + DBG("Spurious EOF reading i915 perf samples\n"); + + return OA_READ_STATUS_ERROR; + } + + buf->len = len; + exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link); + + /* Go through the reports and update the last timestamp. */ + offset = 0; + while (offset < buf->len) { + const struct drm_i915_perf_record_header *header = + (const struct drm_i915_perf_record_header *) &buf->buf[offset]; + uint32_t *report = (uint32_t *) (header + 1); + + if (header->type == DRM_I915_PERF_RECORD_SAMPLE) + last_timestamp = report[1]; + + offset += header->size; + } + + buf->last_timestamp = last_timestamp; + } + + unreachable("not reached"); + return OA_READ_STATUS_ERROR; +} + +/** + * Try to read all the reports until either the delimiting timestamp + * or an error arises. + */ +static bool +read_oa_samples_for_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) +{ + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* We need the MI_REPORT_PERF_COUNT to land before we can start + * accumulate. */ + assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo)); + + /* Map the BO once here and let accumulate_oa_reports() unmap + * it. */ + if (query->oa.map == NULL) + query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + return true; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + return true; + } + + /* Read the reports until the end timestamp. */ + switch (read_oa_samples_until(perf_ctx, start[1], end[1])) { + case OA_READ_STATUS_ERROR: + /* Fallthrough and let accumulate_oa_reports() deal with the + * error. */ + case OA_READ_STATUS_FINISHED: + return true; + case OA_READ_STATUS_UNFINISHED: + return false; + } + + unreachable("invalid read status"); + return false; +} + +void +gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + struct brw_bo *bo = NULL; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + bo = query->oa.bo; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + bo = query->pipeline_stats.bo; + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bo == NULL) + return; + + /* If the current batch references our results bo then we need to + * flush first... + */ + if (perf_cfg->vtbl.batch_references(current_batch, bo)) + perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__); + + perf_cfg->vtbl.bo_wait_rendering(bo); + + /* Due to a race condition between the OA unit signaling report + * availability and the report actually being written into memory, + * we need to wait for all the reports to come in before we can + * read them. + */ + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA || + query->queryinfo->kind == GEN_PERF_QUERY_TYPE_RAW) { + while (!read_oa_samples_for_query(perf_ctx, query, current_batch)) + ; + } +} + +bool +gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + return (query->oa.results_accumulated || + (query->oa.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) && + !perf_cfg->vtbl.bo_busy(query->oa.bo) && + read_oa_samples_for_query(perf_ctx, query, current_batch))); + case GEN_PERF_QUERY_TYPE_PIPELINE: + return (query->pipeline_stats.bo && + !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) && + !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo)); + + default: + unreachable("Unknown query type"); + break; + } + + return false; +} + +/** + * Remove a query from the global list of unaccumulated queries once + * after successfully accumulating the OA reports associated with the + * query in accumulate_oa_reports() or when discarding unwanted query + * results. + */ +static void +drop_from_unaccumulated_query_list(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) { + if (perf_ctx->unaccumulated[i] == query) { + int last_elt = --perf_ctx->unaccumulated_elements; + + if (i == last_elt) + perf_ctx->unaccumulated[i] = NULL; + else { + perf_ctx->unaccumulated[i] = + perf_ctx->unaccumulated[last_elt]; + } + + break; + } + } + + /* Drop our samples_head reference so that associated periodic + * sample data buffers can potentially be reaped if they aren't + * referenced by any other queries... + */ + + struct oa_sample_buf *buf = + exec_node_data(struct oa_sample_buf, query->oa.samples_head, link); + + assert(buf->refcount > 0); + buf->refcount--; + + query->oa.samples_head = NULL; + + reap_old_sample_buffers(perf_ctx); +} + +/* In general if we see anything spurious while accumulating results, + * we don't try and continue accumulating the current query, hoping + * for the best, we scrap anything outstanding, and then hope for the + * best with new queries. + */ +static void +discard_all_queries(struct gen_perf_context *perf_ctx) +{ + while (perf_ctx->unaccumulated_elements) { + struct gen_perf_query_object *query = perf_ctx->unaccumulated[0]; + + query->oa.results_accumulated = true; + drop_from_unaccumulated_query_list(perf_ctx, query); + + dec_n_users(perf_ctx); + } +} + +/* Looks for the validity bit of context ID (dword 2) of an OA report. */ +static bool +oa_report_ctx_id_valid(const struct gen_device_info *devinfo, + const uint32_t *report) +{ + assert(devinfo->gen >= 8); + if (devinfo->gen == 8) + return (report[0] & (1 << 25)) != 0; + return (report[0] & (1 << 16)) != 0; +} + +/** + * Accumulate raw OA counter values based on deltas between pairs of + * OA reports. + * + * Accumulation starts from the first report captured via + * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the + * last MI_RPC report requested by brw_end_perf_query(). Between these + * two reports there may also some number of periodically sampled OA + * reports collected via the i915 perf interface - depending on the + * duration of the query. + * + * These periodic snapshots help to ensure we handle counter overflow + * correctly by being frequent enough to ensure we don't miss multiple + * overflows of a counter between snapshots. For Gen8+ the i915 perf + * snapshots provide the extra context-switch reports that let us + * subtract out the progress of counters associated with other + * contexts running on the system. + */ +static void +accumulate_oa_reports(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t *start; + uint32_t *last; + uint32_t *end; + struct exec_node *first_samples_node; + bool last_report_ctx_match = true; + int out_duration = 0; + + assert(query->oa.map != NULL); + + start = last = query->oa.map; + end = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + + if (start[0] != query->oa.begin_report_id) { + DBG("Spurious start report id=%"PRIu32"\n", start[0]); + goto error; + } + if (end[0] != (query->oa.begin_report_id + 1)) { + DBG("Spurious end report id=%"PRIu32"\n", end[0]); + goto error; + } + + /* On Gen12+ OA reports are sourced from per context counters, so we don't + * ever have to look at the global OA buffer. Yey \o/ + */ + if (perf_ctx->devinfo->gen >= 12) { + last = start; + goto end; + } + + /* See if we have any periodic reports to accumulate too... */ + + /* N.B. The oa.samples_head was set when the query began and + * pointed to the tail of the perf_ctx->sample_buffers list at + * the time the query started. Since the buffer existed before the + * first MI_REPORT_PERF_COUNT command was emitted we therefore know + * that no data in this particular node's buffer can possibly be + * associated with the query - so skip ahead one... + */ + first_samples_node = query->oa.samples_head->next; + + foreach_list_typed_from(struct oa_sample_buf, buf, link, + &perf_ctx->sample_buffers, + first_samples_node) + { + int offset = 0; + + while (offset < buf->len) { + const struct drm_i915_perf_record_header *header = + (const struct drm_i915_perf_record_header *)(buf->buf + offset); + + assert(header->size != 0); + assert(header->size <= buf->len); + + offset += header->size; + + switch (header->type) { + case DRM_I915_PERF_RECORD_SAMPLE: { + uint32_t *report = (uint32_t *)(header + 1); + bool report_ctx_match = true; + bool add = true; + + /* Ignore reports that come before the start marker. + * (Note: takes care to allow overflow of 32bit timestamps) + */ + if (gen_device_info_timebase_scale(devinfo, + report[1] - start[1]) > 5000000000) { + continue; + } + + /* Ignore reports that come after the end marker. + * (Note: takes care to allow overflow of 32bit timestamps) + */ + if (gen_device_info_timebase_scale(devinfo, + report[1] - end[1]) <= 5000000000) { + goto end; + } + + /* For Gen8+ since the counters continue while other + * contexts are running we need to discount any unrelated + * deltas. The hardware automatically generates a report + * on context switch which gives us a new reference point + * to continuing adding deltas from. + * + * For Haswell we can rely on the HW to stop the progress + * of OA counters while any other context is acctive. + */ + if (devinfo->gen >= 8) { + /* Consider that the current report matches our context only if + * the report says the report ID is valid. + */ + report_ctx_match = oa_report_ctx_id_valid(devinfo, report) && + report[2] == start[2]; + if (report_ctx_match) + out_duration = 0; + else + out_duration++; + + /* Only add the delta between if the last report + * was clearly identified as our context, or if we have at most + * 1 report without a matching ID. + * + * The OA unit will sometimes label reports with an invalid + * context ID when i915 rewrites the execlist submit register + * with the same context as the one currently running. This + * happens when i915 wants to notify the HW of ringbuffer tail + * register update. We have to consider this report as part of + * our context as the 3d pipeline behind the OACS unit is still + * processing the operations started at the previous execlist + * submission. + */ + add = last_report_ctx_match && out_duration < 2; + } + + if (add) { + gen_perf_query_result_accumulate(&query->oa.result, + query->queryinfo, + last, report); + } else { + /* We're not adding the delta because we've identified it's not + * for the context we filter for. We can consider that the + * query was split. + */ + query->oa.result.query_disjoint = true; + } + + last = report; + last_report_ctx_match = report_ctx_match; + + break; + } + + case DRM_I915_PERF_RECORD_OA_BUFFER_LOST: + DBG("i915 perf: OA error: all reports lost\n"); + goto error; + case DRM_I915_PERF_RECORD_OA_REPORT_LOST: + DBG("i915 perf: OA report lost\n"); + break; + } + } + } + +end: + + gen_perf_query_result_accumulate(&query->oa.result, query->queryinfo, + last, end); + + query->oa.results_accumulated = true; + drop_from_unaccumulated_query_list(perf_ctx, query); + dec_n_users(perf_ctx); + + return; + +error: + + discard_all_queries(perf_ctx); +} + +void +gen_perf_delete_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + + /* We can assume that the frontend waits for a query to complete + * before ever calling into here, so we don't have to worry about + * deleting an in-flight query object. + */ + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (query->oa.bo) { + if (!query->oa.results_accumulated) { + drop_from_unaccumulated_query_list(perf_ctx, query); + dec_n_users(perf_ctx); + } + + perf_cfg->vtbl.bo_unreference(query->oa.bo); + query->oa.bo = NULL; + } + + query->oa.results_accumulated = false; + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + if (query->pipeline_stats.bo) { + perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo); + query->pipeline_stats.bo = NULL; + } + break; + + default: + unreachable("Unknown query type"); + break; + } + + /* As an indication that the INTEL_performance_query extension is no + * longer in use, it's a good time to free our cache of sample + * buffers and close any current i915-perf stream. + */ + if (--perf_ctx->n_query_instances == 0) { + free_sample_bufs(perf_ctx); + gen_perf_close(perf_ctx, query->queryinfo); + } + + free(query); +} + +#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) + +static void +read_gt_frequency(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj) +{ + const struct gen_device_info *devinfo = perf_ctx->devinfo; + uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)), + end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES)); + + switch (devinfo->gen) { + case 7: + case 8: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL; + break; + case 9: + case 10: + case 11: + case 12: + obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; + break; + default: + unreachable("unexpected gen"); + } + + /* Put the numbers into Hz. */ + obj->oa.gt_frequency[0] *= 1000000ULL; + obj->oa.gt_frequency[1] *= 1000000ULL; +} + +static int +get_oa_counter_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + int written = 0; + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t *out_uint64; + float *out_float; + size_t counter_size = gen_perf_query_counter_get_size(counter); + + if (counter_size) { + switch (counter->data_type) { + case GEN_PERF_COUNTER_DATA_TYPE_UINT64: + out_uint64 = (uint64_t *)(data + counter->offset); + *out_uint64 = + counter->oa_counter_read_uint64(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: + out_float = (float *)(data + counter->offset); + *out_float = + counter->oa_counter_read_float(perf_cfg, queryinfo, + query->oa.result.accumulator); + break; + default: + /* So far we aren't using uint32, double or bool32... */ + unreachable("unexpected counter data type"); + } + written = counter->offset + counter_size; + } + } + + return written; +} + +static int +get_pipeline_stats_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + size_t data_size, + uint8_t *data) + +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + const struct gen_perf_query_info *queryinfo = query->queryinfo; + int n_counters = queryinfo->n_counters; + uint8_t *p = data; + + uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ); + uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t)); + + for (int i = 0; i < n_counters; i++) { + const struct gen_perf_query_counter *counter = &queryinfo->counters[i]; + uint64_t value = end[i] - start[i]; + + if (counter->pipeline_stat.numerator != + counter->pipeline_stat.denominator) { + value *= counter->pipeline_stat.numerator; + value /= counter->pipeline_stat.denominator; + } + + *((uint64_t *)p) = value; + p += 8; + } + + perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo); + + return p - data; +} + +void +gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written) +{ + struct gen_perf_config *perf_cfg = perf_ctx->perf; + int written = 0; + + switch (query->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + if (!query->oa.results_accumulated) { + read_gt_frequency(perf_ctx, query); + uint32_t *begin_report = query->oa.map; + uint32_t *end_report = query->oa.map + MI_RPC_BO_END_OFFSET_BYTES; + gen_perf_query_result_read_frequencies(&query->oa.result, + perf_ctx->devinfo, + begin_report, + end_report); + accumulate_oa_reports(perf_ctx, query); + assert(query->oa.results_accumulated); + + perf_cfg->vtbl.bo_unmap(query->oa.bo); + query->oa.map = NULL; + } + if (query->queryinfo->kind == GEN_PERF_QUERY_TYPE_OA) { + written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data); + } else { + const struct gen_device_info *devinfo = perf_ctx->devinfo; + + written = gen_perf_query_result_write_mdapi((uint8_t *)data, data_size, + devinfo, &query->oa.result, + query->oa.gt_frequency[0], + query->oa.gt_frequency[1]); + } + break; + + case GEN_PERF_QUERY_TYPE_PIPELINE: + written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data); + break; + + default: + unreachable("Unknown query type"); + break; + } + + if (bytes_written) + *bytes_written = written; +} + +void +gen_perf_dump_query_count(struct gen_perf_context *perf_ctx) +{ + DBG("Queries: (Open queries = %d, OA users = %d)\n", + perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users); +} + +void +gen_perf_dump_query(struct gen_perf_context *ctx, + struct gen_perf_query_object *obj, + void *current_batch) +{ + switch (obj->queryinfo->kind) { + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + DBG("BO: %-4s OA data: %-10s %-15s\n", + obj->oa.bo ? "yes," : "no,", + gen_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,", + obj->oa.results_accumulated ? "accumulated" : "not accumulated"); + break; + case GEN_PERF_QUERY_TYPE_PIPELINE: + DBG("BO: %-4s\n", + obj->pipeline_stats.bo ? "yes" : "no"); + break; + default: + unreachable("Unknown query type"); + break; + } +} diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_query.h mesa-20.0.8/src/intel/perf/gen_perf_query.h --- mesa-19.2.8/src/intel/perf/gen_perf_query.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_query.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,88 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GEN_PERF_QUERY_H +#define GEN_PERF_QUERY_H + +#include + +struct gen_device_info; + +struct gen_perf_config; +struct gen_perf_context; +struct gen_perf_query_object; + +struct gen_perf_context *gen_perf_new_context(void *parent); + +void gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd); + +const struct gen_perf_query_info* gen_perf_query_info(const struct gen_perf_query_object *); + + +void gen_perf_init_context(struct gen_perf_context *perf_ctx, + struct gen_perf_config *perf_cfg, + void * ctx, /* driver context (eg, brw_context) */ + void * bufmgr, /* eg brw_bufmgr */ + const struct gen_device_info *devinfo, + uint32_t hw_ctx, + int drm_fd); + +struct gen_perf_config *gen_perf_config(struct gen_perf_context *ctx); + +int gen_perf_active_queries(struct gen_perf_context *perf_ctx, + const struct gen_perf_query_info *query); + +struct gen_perf_query_object * +gen_perf_new_query(struct gen_perf_context *, unsigned query_index); + + +bool gen_perf_begin_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_end_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_wait_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +bool gen_perf_is_query_ready(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + void *current_batch); +void gen_perf_delete_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query); +void gen_perf_get_query_data(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *query, + int data_size, + unsigned *data, + unsigned *bytes_written); + +void gen_perf_dump_query_count(struct gen_perf_context *perf_ctx); +void gen_perf_dump_query(struct gen_perf_context *perf_ctx, + struct gen_perf_query_object *obj, + void *current_batch); + +#endif /* GEN_PERF_QUERY_H */ diff -Nru mesa-19.2.8/src/intel/perf/gen_perf_regs.h mesa-20.0.8/src/intel/perf/gen_perf_regs.h --- mesa-19.2.8/src/intel/perf/gen_perf_regs.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/gen_perf_regs.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,62 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GEN_PERF_REGS_H +#define GEN_PERF_REGS_H + +#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) + +/* GT core frequency counters */ +#define GEN7_RPSTAT1 0xA01C +#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 +#define GEN7_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7) +#define GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT 0 +#define GEN7_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0) + +#define GEN9_RPSTAT0 0xA01C +#define GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT 23 +#define GEN9_RPSTAT0_CURR_GT_FREQ_MASK INTEL_MASK(31, 23) +#define GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT 0 +#define GEN9_RPSTAT0_PREV_GT_FREQ_MASK INTEL_MASK(8, 0) + +/* Pipeline statistic counters */ +#define IA_VERTICES_COUNT 0x2310 +#define IA_PRIMITIVES_COUNT 0x2318 +#define VS_INVOCATION_COUNT 0x2320 +#define HS_INVOCATION_COUNT 0x2300 +#define DS_INVOCATION_COUNT 0x2308 +#define GS_INVOCATION_COUNT 0x2328 +#define GS_PRIMITIVES_COUNT 0x2330 +#define CL_INVOCATION_COUNT 0x2338 +#define CL_PRIMITIVES_COUNT 0x2340 +#define PS_INVOCATION_COUNT 0x2348 +#define CS_INVOCATION_COUNT 0x2290 +#define PS_DEPTH_COUNT 0x2350 + +/* Stream-out counters */ +#define GEN6_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_SO_PRIM_STORAGE_NEEDED(n) (0x5240 + (n) * 8) +#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8) + +#endif /* GEN_PERF_REGS_H */ diff -Nru mesa-19.2.8/src/intel/perf/meson.build mesa-20.0.8/src/intel/perf/meson.build --- mesa-19.2.8/src/intel/perf/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -6,7 +6,8 @@ 'cflgt2', 'cflgt3', 'bxt', 'glk', 'cnl', - 'icl', + 'icl', 'lkf', + 'tgl', ] gen_hw_metrics_xml_files = [] @@ -16,6 +17,7 @@ gen_perf_sources = [ 'gen_perf.c', + 'gen_perf_query.c', 'gen_perf_mdapi.c', ] diff -Nru mesa-19.2.8/src/intel/perf/oa-icl.xml mesa-20.0.8/src/intel/perf/oa-icl.xml --- mesa-19.2.8/src/intel/perf/oa-icl.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/oa-icl.xml 2020-06-12 01:21:17.000000000 +0000 @@ -1,9 +1,9 @@ - + @@ -518,7 +518,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -651,7 +651,7 @@ @@ -861,7 +861,7 @@ description="The total number of typed atomic accesses via Data Port." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSlicesTotalCount UMUL" - equation="B 4 READ B 5 READ FADD 2 FDIV $EuSubslicesTotalCount FMUL" + equation="B 4 READ B 5 READ UADD 2 UDIV $EuSubslicesTotalCount UMUL" underscore_name="typed_atomics" units="events" symbol_name="TypedAtomics" @@ -942,7 +942,7 @@ description="The total number of GPU memory bytes read from GTI." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL" - equation="64 B 1 READ B 3 READ UADD B 7 READ UADD B 6 READ UADD UMUL" + equation="64 B 1 READ B 3 READ UADD B 6 READ UADD B 7 READ UADD UMUL" underscore_name="gti_read_throughput" units="bytes" symbol_name="GtiReadThroughput" @@ -1118,7 +1118,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -1284,17 +1284,17 @@ - + - + - + - + - + @@ -1765,7 +1765,7 @@ @@ -2084,19 +2084,6 @@ mdapi_hw_unit_type="gpu" mdapi_group="EU Array/Barrier" /> - - - - - - - - + + - + - - + + @@ -3172,7 +3141,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -3834,7 +3803,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -4566,7 +4535,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -5285,7 +5254,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -7839,7 +7808,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -8652,7 +8621,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -9497,7 +9466,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -10330,7 +10299,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" @@ -11162,7 +11131,7 @@ description="The total number of GPU memory bytes transferred between shaders and L3 caches w/o URB." data_type="uint64" max_equation="$GpuCoreClocks 64 UMUL $EuSubslicesTotalCount UMUL" - equation="A 30 READ A 31 READ UADD A 32 READ UADD 64 UMUL" + equation="$ShaderMemoryAccesses 64 UMUL" underscore_name="l3_shader_throughput" units="bytes" symbol_name="L3ShaderThroughput" diff -Nru mesa-19.2.8/src/intel/perf/oa-lkf.xml mesa-20.0.8/src/intel/perf/oa-lkf.xml --- mesa-19.2.8/src/intel/perf/oa-lkf.xml 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/oa-lkf.xml 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,11802 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/intel/perf/oa-tgl.xml mesa-20.0.8/src/intel/perf/oa-tgl.xml --- mesa-19.2.8/src/intel/perf/oa-tgl.xml 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/perf/oa-tgl.xml 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,8597 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/intel/tools/aubinator_viewer_decoder.cpp mesa-20.0.8/src/intel/tools/aubinator_viewer_decoder.cpp --- mesa-19.2.8/src/intel/tools/aubinator_viewer_decoder.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/aubinator_viewer_decoder.cpp 2020-06-12 01:21:17.000000000 +0000 @@ -955,9 +955,9 @@ } if (strcmp(inst_name, "MI_BATCH_BUFFER_START") == 0) { - uint64_t next_batch_addr; + uint64_t next_batch_addr = 0xd0d0d0d0; bool ppgtt = false; - bool second_level; + bool second_level = false; struct gen_field_iterator iter; gen_field_iterator_init(&iter, inst, p, 0, false); while (gen_field_iterator_next(&iter)) { diff -Nru mesa-19.2.8/src/intel/tools/aub_mem.c mesa-20.0.8/src/intel/tools/aub_mem.c --- mesa-19.2.8/src/intel/tools/aub_mem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/aub_mem.c 2020-06-12 01:21:17.000000000 +0000 @@ -87,9 +87,9 @@ cmp_uint64(uint64_t a, uint64_t b) { if (a < b) - return -1; - if (a > b) return 1; + if (a > b) + return -1; return 0; } @@ -109,7 +109,7 @@ if (!node || (cmp = cmp_ggtt_entry(node, &virt_addr))) { struct ggtt_entry *new_entry = calloc(1, sizeof(*new_entry)); new_entry->virt_addr = virt_addr; - rb_tree_insert_at(&mem->ggtt, node, &new_entry->node, cmp > 0); + rb_tree_insert_at(&mem->ggtt, node, &new_entry->node, cmp < 0); node = &new_entry->node; } @@ -153,7 +153,7 @@ mem->mem_fd, new_mem->fd_offset); assert(new_mem->data != MAP_FAILED); - rb_tree_insert_at(&mem->mem, node, &new_mem->node, cmp > 0); + rb_tree_insert_at(&mem->mem, node, &new_mem->node, cmp < 0); node = &new_mem->node; } diff -Nru mesa-19.2.8/src/intel/tools/aub_write.c mesa-20.0.8/src/intel/tools/aub_write.c --- mesa-19.2.8/src/intel/tools/aub_write.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/aub_write.c 2020-06-12 01:21:17.000000000 +0000 @@ -179,14 +179,17 @@ aub_write_header(aub, app_name); aub->phys_addrs_allocator = 0; + aub->ggtt_addrs_allocator = 0; aub->pml4.phys_addr = aub->phys_addrs_allocator++ << 12; - mem_trace_memory_write_header_out(aub, 0, + mem_trace_memory_write_header_out(aub, aub->ggtt_addrs_allocator++, GEN8_PTE_SIZE, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY, "GGTT PT"); dword_out(aub, 1); dword_out(aub, 0); + + aub->next_context_handle = 1; } void @@ -393,6 +396,69 @@ }, }; +static void +aub_map_ggtt(struct aub_file *aub, uint64_t virt_addr, uint64_t size) +{ + /* Makes the code below a bit simpler. In practice all of the write we + * receive from error2aub are page aligned. + */ + assert(virt_addr % 4096 == 0); + assert((aub->phys_addrs_allocator + size) < (1UL << 32)); + + /* GGTT PT */ + uint32_t ggtt_ptes = DIV_ROUND_UP(size, 4096); + uint64_t phys_addr = aub->phys_addrs_allocator << 12; + aub->phys_addrs_allocator += ggtt_ptes; + + if (aub->verbose_log_file) { + fprintf(aub->verbose_log_file, + " Mapping GGTT address: 0x%" PRIx64 ", size: %" PRIu64" phys_addr=0x%" PRIx64 " entries=%u\n", + virt_addr, size, phys_addr, ggtt_ptes); + } + + mem_trace_memory_write_header_out(aub, + (virt_addr >> 12) * GEN8_PTE_SIZE, + ggtt_ptes * GEN8_PTE_SIZE, + AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY, + "GGTT PT"); + for (uint32_t i = 0; i < ggtt_ptes; i++) { + dword_out(aub, 1 + phys_addr + i * 4096); + dword_out(aub, 0); + } +} + +void +aub_write_ggtt(struct aub_file *aub, uint64_t virt_addr, uint64_t size, const void *data) +{ + /* Default setup assumes a 1 to 1 mapping between physical and virtual GGTT + * addresses. This is somewhat incompatible with the aub_write_ggtt() + * function. In practice it doesn't matter as the GGTT writes are used to + * replace the default setup and we've taken care to setup the PML4 as the + * top of the GGTT. + */ + assert(!aub->has_default_setup); + + aub_map_ggtt(aub, virt_addr, size); + + /* We write the GGTT buffer through the GGTT aub command rather than the + * PHYSICAL aub command. This is because the Gen9 simulator seems to have 2 + * different set of memory pools for GGTT and physical (probably someone + * didn't really understand the concept?). + */ + static const char null_block[8 * 4096]; + for (uint64_t offset = 0; offset < size; offset += 4096) { + uint32_t block_size = min(4096, size - offset); + + mem_trace_memory_write_header_out(aub, virt_addr + offset, block_size, + AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, + "GGTT buffer"); + data_out(aub, (char *) data + offset, block_size); + + /* Pad to a multiple of 4 bytes. */ + data_out(aub, null_block, -block_size & 3); + } +} + static const struct engine * engine_from_engine_class(enum drm_i915_gem_engine_class engine_class) { @@ -432,8 +498,38 @@ gen10_contexts[engine_class](params, data, size); } +static uint64_t +alloc_ggtt_address(struct aub_file *aub, uint64_t size) +{ + uint32_t ggtt_ptes = DIV_ROUND_UP(size, 4096); + uint64_t addr = aub->ggtt_addrs_allocator << 12; + + aub->ggtt_addrs_allocator += ggtt_ptes; + aub_map_ggtt(aub, addr, size); + + return addr; +} + +static void +write_hwsp(struct aub_file *aub, + enum drm_i915_gem_engine_class engine_class) +{ + uint32_t reg = 0; + switch (engine_class) { + case I915_ENGINE_CLASS_RENDER: reg = HWS_PGA_RCSUNIT; break; + case I915_ENGINE_CLASS_COPY: reg = HWS_PGA_BCSUNIT; break; + case I915_ENGINE_CLASS_VIDEO: reg = HWS_PGA_VCSUNIT0; break; + default: + unreachable("unknown ring"); + } + + register_write_out(aub, reg, aub->engine_setup[engine_class].hwsp_addr); +} + static uint32_t write_engine_execlist_setup(struct aub_file *aub, + uint32_t ctx_id, + struct aub_hw_context *hw_ctx, enum drm_i915_gem_engine_class engine_class) { const struct engine *cs = engine_from_engine_class(engine_class); @@ -442,39 +538,26 @@ get_context_init(&aub->devinfo, NULL, engine_class, NULL, &context_size); /* GGTT PT */ - uint64_t phys_addr = aub->phys_addrs_allocator << 12; uint32_t total_size = RING_SIZE + PPHWSP_SIZE + context_size; - uint32_t ggtt_ptes = DIV_ROUND_UP(total_size, 4096); char name[80]; + uint64_t ggtt_addr = alloc_ggtt_address(aub, total_size); - aub->phys_addrs_allocator += ggtt_ptes; - - snprintf(name, sizeof(name), "%s GGTT PT", cs->name); - mem_trace_memory_write_header_out(aub, - sizeof(uint64_t) * (phys_addr >> 12), - ggtt_ptes * GEN8_PTE_SIZE, - AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY, - name); - for (uint32_t i = 0; i < ggtt_ptes; i++) { - dword_out(aub, 1 + 0x1000 * i + phys_addr); - dword_out(aub, 0); - } + snprintf(name, sizeof(name), "%s (ctx id: %d) GGTT PT", cs->name, ctx_id); /* RING */ - aub->engine_setup[engine_class].ring_addr = phys_addr; + hw_ctx->ring_addr = ggtt_addr; snprintf(name, sizeof(name), "%s RING", cs->name); - mem_trace_memory_write_header_out(aub, phys_addr, RING_SIZE, + mem_trace_memory_write_header_out(aub, ggtt_addr, RING_SIZE, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, name); for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) dword_out(aub, 0); - phys_addr += RING_SIZE; + ggtt_addr += RING_SIZE; /* PPHWSP */ - aub->engine_setup[engine_class].pphwsp_addr = phys_addr; - aub->engine_setup[engine_class].descriptor = cs->hw_class | phys_addr | CONTEXT_FLAGS; + hw_ctx->pphwsp_addr = ggtt_addr; snprintf(name, sizeof(name), "%s PPHWSP", cs->name); - mem_trace_memory_write_header_out(aub, phys_addr, + mem_trace_memory_write_header_out(aub, ggtt_addr, PPHWSP_SIZE + context_size, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, name); @@ -483,7 +566,7 @@ /* CONTEXT */ struct gen_context_parameters params = { - .ring_addr = aub->engine_setup[engine_class].ring_addr, + .ring_addr = hw_ctx->ring_addr, .ring_size = RING_SIZE, .pml4_addr = aub->pml4.phys_addr, }; @@ -492,20 +575,14 @@ data_out(aub, context_data, context_size); free(context_data); + hw_ctx->initialized = true; + return total_size; } static void write_execlists_default_setup(struct aub_file *aub) { - write_engine_execlist_setup(aub, I915_ENGINE_CLASS_RENDER); - write_engine_execlist_setup(aub, I915_ENGINE_CLASS_COPY); - write_engine_execlist_setup(aub, I915_ENGINE_CLASS_VIDEO); - - register_write_out(aub, HWS_PGA_RCSUNIT, aub->engine_setup[I915_ENGINE_CLASS_RENDER].pphwsp_addr); - register_write_out(aub, HWS_PGA_VCSUNIT0, aub->engine_setup[I915_ENGINE_CLASS_VIDEO].pphwsp_addr); - register_write_out(aub, HWS_PGA_BCSUNIT, aub->engine_setup[I915_ENGINE_CLASS_COPY].pphwsp_addr); - register_write_out(aub, GFX_MODE_RCSUNIT, 0x80008000 /* execlist enable */); register_write_out(aub, GFX_MODE_VCSUNIT0, 0x80008000 /* execlist enable */); register_write_out(aub, GFX_MODE_BCSUNIT, 0x80008000 /* execlist enable */); @@ -547,67 +624,62 @@ aub->has_default_setup = true; } -void -aub_write_ggtt(struct aub_file *aub, uint64_t virt_addr, uint64_t size, const void *data) +static struct aub_context * +aub_context_new(struct aub_file *aub, uint32_t new_id) { - if (aub->verbose_log_file) { - fprintf(aub->verbose_log_file, - " Writting GGTT address: 0x%" PRIx64 ", size: %" PRIu64"\n", - virt_addr, size); - } + assert(aub->num_contexts < MAX_CONTEXT_COUNT); - /* Default setup assumes a 1 to 1 mapping between physical and virtual GGTT - * addresses. This is somewhat incompatible with the aub_write_ggtt() - * function. In practice it doesn't matter as the GGTT writes are used to - * replace the default setup and we've taken care to setup the PML4 as the - * top of the GGTT. - */ - assert(!aub->has_default_setup); + struct aub_context *ctx = &aub->contexts[aub->num_contexts++]; + memset(ctx, 0, sizeof(*ctx)); + ctx->id = new_id; - /* Makes the code below a bit simpler. In practice all of the write we - * receive from error2aub are page aligned. - */ - assert(virt_addr % 4096 == 0); - assert((aub->phys_addrs_allocator + size) < (1UL << 32)); + return ctx; +} - /* GGTT PT */ - uint32_t ggtt_ptes = DIV_ROUND_UP(size, 4096); - uint64_t phys_addr = aub->phys_addrs_allocator << 12; - aub->phys_addrs_allocator += ggtt_ptes; +uint32_t +aub_write_context_create(struct aub_file *aub, uint32_t *ctx_id) +{ + uint32_t new_id = ctx_id ? *ctx_id : aub->next_context_handle; - if (aub->verbose_log_file) { - fprintf(aub->verbose_log_file, - " Writting GGTT address: 0x%" PRIx64 ", size: %" PRIu64" phys_addr=0x%" PRIx64 " entries=%u\n", - virt_addr, size, phys_addr, ggtt_ptes); - } + aub_context_new(aub, new_id); - mem_trace_memory_write_header_out(aub, - (virt_addr >> 12) * GEN8_PTE_SIZE, - ggtt_ptes * GEN8_PTE_SIZE, - AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY, - "GGTT PT"); - for (uint32_t i = 0; i < ggtt_ptes; i++) { - dword_out(aub, 1 + phys_addr + i * 4096); - dword_out(aub, 0); + if (!ctx_id) + aub->next_context_handle++; + + return new_id; +} + +static struct aub_context * +aub_context_find(struct aub_file *aub, uint32_t id) +{ + for (int i = 0; i < aub->num_contexts; i++) { + if (aub->contexts[i].id == id) + return &aub->contexts[i]; } - /* We write the GGTT buffer through the GGTT aub command rather than the - * PHYSICAL aub command. This is because the Gen9 simulator seems to have 2 - * different set of memory pools for GGTT and physical (probably someone - * didn't really understand the concept?). - */ - static const char null_block[8 * 4096]; - for (uint64_t offset = 0; offset < size; offset += 4096) { - uint32_t block_size = min(4096, size - offset); + return NULL; +} - mem_trace_memory_write_header_out(aub, virt_addr + offset, block_size, - AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, - "GGTT buffer"); - data_out(aub, (char *) data + offset, block_size); +static struct aub_hw_context * +aub_write_ensure_context(struct aub_file *aub, uint32_t ctx_id, + enum drm_i915_gem_engine_class engine_class) +{ + struct aub_context *ctx = aub_context_find(aub, ctx_id); + assert(ctx != NULL); - /* Pad to a multiple of 4 bytes. */ - data_out(aub, null_block, -block_size & 3); - } + struct aub_hw_context *hw_ctx = &ctx->hw_contexts[engine_class]; + if (!hw_ctx->initialized) + write_engine_execlist_setup(aub, ctx->id, hw_ctx, engine_class); + + return hw_ctx; +} + +static uint64_t +get_context_descriptor(struct aub_file *aub, + const struct engine *cs, + struct aub_hw_context *hw_ctx) +{ + return cs->hw_class | hw_ctx->pphwsp_addr | CONTEXT_FLAGS; } /** @@ -658,10 +730,11 @@ static void aub_dump_ring_buffer_execlist(struct aub_file *aub, + struct aub_hw_context *hw_ctx, const struct engine *cs, uint64_t batch_offset) { - mem_trace_memory_write_header_out(aub, aub->engine_setup[cs->engine_class].ring_addr, 16, + mem_trace_memory_write_header_out(aub, hw_ctx->ring_addr, 16, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, "RING MI_BATCH_BUFFER_START user"); dword_out(aub, AUB_MI_BATCH_BUFFER_START | MI_BATCH_NON_SECURE_I965 | (3 - 2)); @@ -669,11 +742,11 @@ dword_out(aub, batch_offset >> 32); dword_out(aub, 0 /* MI_NOOP */); - mem_trace_memory_write_header_out(aub, aub->engine_setup[cs->engine_class].ring_addr + 8192 + 20, 4, + mem_trace_memory_write_header_out(aub, hw_ctx->ring_addr + 8192 + 20, 4, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, "RING BUFFER HEAD"); dword_out(aub, 0); /* RING_BUFFER_HEAD */ - mem_trace_memory_write_header_out(aub, aub->engine_setup[cs->engine_class].ring_addr + 8192 + 28, 4, + mem_trace_memory_write_header_out(aub, hw_ctx->ring_addr + 8192 + 28, 4, AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT, "RING BUFFER TAIL"); dword_out(aub, 16); /* RING_BUFFER_TAIL */ @@ -747,15 +820,32 @@ data_out(aub, ringbuffer, ring_count * 4); } +static void +aub_write_ensure_hwsp(struct aub_file *aub, + enum drm_i915_gem_engine_class engine_class) +{ + uint64_t *hwsp_addr = &aub->engine_setup[engine_class].hwsp_addr; + + if (*hwsp_addr != 0) + return; + + *hwsp_addr = alloc_ggtt_address(aub, 4096); + write_hwsp(aub, engine_class); +} + void -aub_write_exec(struct aub_file *aub, uint64_t batch_addr, +aub_write_exec(struct aub_file *aub, uint32_t ctx_id, uint64_t batch_addr, uint64_t offset, enum drm_i915_gem_engine_class engine_class) { const struct engine *cs = engine_from_engine_class(engine_class); if (aub_use_execlists(aub)) { - aub_dump_ring_buffer_execlist(aub, cs, batch_addr); - aub_dump_execlist(aub, cs, aub->engine_setup[engine_class].descriptor); + struct aub_hw_context *hw_ctx = + aub_write_ensure_context(aub, ctx_id, engine_class); + uint64_t descriptor = get_context_descriptor(aub, cs, hw_ctx); + aub_write_ensure_hwsp(aub, engine_class); + aub_dump_ring_buffer_execlist(aub, hw_ctx, cs, batch_addr); + aub_dump_execlist(aub, cs, descriptor); } else { /* Dump ring buffer */ aub_dump_ring_buffer_legacy(aub, batch_addr, offset, engine_class); diff -Nru mesa-19.2.8/src/intel/tools/aub_write.h mesa-20.0.8/src/intel/tools/aub_write.h --- mesa-19.2.8/src/intel/tools/aub_write.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/aub_write.h 2020-06-12 01:21:17.000000000 +0000 @@ -37,11 +37,25 @@ extern "C" { #endif +#define MAX_CONTEXT_COUNT 64 + struct aub_ppgtt_table { uint64_t phys_addr; struct aub_ppgtt_table *subtables[512]; }; +struct aub_hw_context { + bool initialized; + uint64_t ring_addr; + uint64_t pphwsp_addr; +}; + +/* GEM context, as seen from userspace */ +struct aub_context { + uint32_t id; + struct aub_hw_context hw_contexts[I915_ENGINE_CLASS_VIDEO + 1]; +}; + struct aub_file { FILE *file; @@ -57,12 +71,16 @@ struct aub_ppgtt_table pml4; uint64_t phys_addrs_allocator; + uint64_t ggtt_addrs_allocator; struct { - uint64_t ring_addr; - uint64_t pphwsp_addr; - uint64_t descriptor; + uint64_t hwsp_addr; } engine_setup[I915_ENGINE_CLASS_VIDEO_ENHANCE + 1]; + + struct aub_context contexts[MAX_CONTEXT_COUNT]; + int num_contexts; + + uint32_t next_context_handle; }; void aub_file_init(struct aub_file *aub, FILE *file, FILE *debug, uint16_t pci_id, const char *app_name); @@ -91,11 +109,13 @@ void aub_write_trace_block(struct aub_file *aub, uint32_t type, void *virtual, uint32_t size, uint64_t gtt_offset); -void aub_write_exec(struct aub_file *aub, uint64_t batch_addr, +void aub_write_exec(struct aub_file *aub, uint32_t ctx_id, uint64_t batch_addr, uint64_t offset, enum drm_i915_gem_engine_class engine_class); void aub_write_context_execlists(struct aub_file *aub, uint64_t context_addr, enum drm_i915_gem_engine_class engine_class); +uint32_t aub_write_context_create(struct aub_file *aub, uint32_t *ctx_id); + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/intel/tools/error2aub.c mesa-20.0.8/src/intel/tools/error2aub.c --- mesa-19.2.8/src/intel/tools/error2aub.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/error2aub.c 2020-06-12 01:21:17.000000000 +0000 @@ -292,6 +292,7 @@ int active_engine_instance = -1; enum address_space active_gtt = PPGTT; + enum address_space default_gtt = PPGTT; struct { struct { @@ -321,8 +322,7 @@ NULL, pci_id, "error_state"); if (verbose) aub.verbose_log_file = stdout; - fail_if(!aub_use_execlists(&aub), - "%s currently only works on gen8+\n", argv[0]); + default_gtt = active_gtt = aub_use_execlists(&aub) ? PPGTT : GGTT; continue; } @@ -350,7 +350,7 @@ char *ring = line + strlen(active_start); engine_from_name(ring, &active_engine_class, &active_engine_instance); - active_gtt = PPGTT; + active_gtt = default_gtt; char *count = strchr(ring, '['); fail_if(!count || sscanf(count, "[%d]:", &num_ring_bos) < 1, @@ -369,7 +369,6 @@ if (num_ring_bos > 0) { unsigned hi, lo, size; if (sscanf(line, " %x_%x %d", &hi, &lo, &size) == 3) { - assert(aub_use_execlists(&aub)); struct bo *bo_entry = find_or_create(&bo_list, ((uint64_t)hi) << 32 | lo, active_gtt, active_engine_class, @@ -408,8 +407,8 @@ enum bo_type type; enum address_space gtt; } bo_types[] = { - { "gtt_offset", BO_TYPE_BATCH, PPGTT }, - { "user", BO_TYPE_USER, PPGTT }, + { "gtt_offset", BO_TYPE_BATCH, default_gtt }, + { "user", BO_TYPE_USER, default_gtt }, { "HW context", BO_TYPE_CONTEXT, GGTT }, { "ringbuffer", BO_TYPE_RINGBUFFER, GGTT }, { "HW Status", BO_TYPE_STATUS, GGTT }, @@ -461,18 +460,25 @@ list_for_each_entry(struct bo, bo_entry, &bo_list, link) { switch (bo_entry->type) { case BO_TYPE_BATCH: - aub_map_ppgtt(&aub, bo_entry->addr, bo_entry->size); - aub_write_trace_block(&aub, AUB_TRACE_TYPE_BATCH, - bo_entry->data, bo_entry->size, bo_entry->addr); + if (bo_entry->gtt == PPGTT) { + aub_map_ppgtt(&aub, bo_entry->addr, bo_entry->size); + aub_write_trace_block(&aub, AUB_TRACE_TYPE_BATCH, + bo_entry->data, bo_entry->size, bo_entry->addr); + } else + aub_write_ggtt(&aub, bo_entry->addr, bo_entry->size, bo_entry->data); break; case BO_TYPE_USER: - aub_map_ppgtt(&aub, bo_entry->addr, bo_entry->size); - aub_write_trace_block(&aub, AUB_TRACE_TYPE_NOTYPE, - bo_entry->data, bo_entry->size, bo_entry->addr); + if (bo_entry->gtt == PPGTT) { + aub_map_ppgtt(&aub, bo_entry->addr, bo_entry->size); + aub_write_trace_block(&aub, AUB_TRACE_TYPE_NOTYPE, + bo_entry->data, bo_entry->size, bo_entry->addr); + } else + aub_write_ggtt(&aub, bo_entry->addr, bo_entry->size, bo_entry->data); break; case BO_TYPE_CONTEXT: if (bo_entry->engine_class == batch_bo->engine_class && - bo_entry->engine_instance == batch_bo->engine_instance) { + bo_entry->engine_instance == batch_bo->engine_instance && + aub_use_execlists(&aub)) { hwsp_bo = bo_entry; uint32_t *context = (uint32_t *) (bo_entry->data + 4096 /* GuC */ + 4096 /* HWSP */); @@ -531,8 +537,15 @@ } } - fail_if(!hwsp_bo, "Failed to find Context buffer.\n"); - aub_write_context_execlists(&aub, hwsp_bo->addr + 4096 /* skip GuC page */, hwsp_bo->engine_class); + if (aub_use_execlists(&aub)) { + fail_if(!hwsp_bo, "Failed to find Context buffer.\n"); + aub_write_context_execlists(&aub, hwsp_bo->addr + 4096 /* skip GuC page */, hwsp_bo->engine_class); + } else { + /* Use context id 0 -- if we are not using execlists it doesn't matter + * anyway + */ + aub_write_exec(&aub, 0, batch_bo->addr, 0, I915_ENGINE_CLASS_RENDER); + } /* Cleanup */ list_for_each_entry_safe(struct bo, bo_entry, &bo_list, link) { diff -Nru mesa-19.2.8/src/intel/tools/i965_gram.y mesa-20.0.8/src/intel/tools/i965_gram.y --- mesa-19.2.8/src/intel/tools/i965_gram.y 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/i965_gram.y 2020-06-12 01:21:17.000000000 +0000 @@ -962,7 +962,7 @@ if (brw_inst_send_sel_reg32_ex_desc(p->devinfo, brw_last_inst)) { brw_inst_set_send_ex_desc_ia_subreg_nr(p->devinfo, brw_last_inst, $5.subnr); } else { - brw_inst_set_send_ex_desc(p->devinfo, brw_last_inst, $8); + brw_inst_set_sends_ex_desc(p->devinfo, brw_last_inst, $8); } brw_inst_set_bits(brw_last_inst, 127, 96, $7); @@ -988,7 +988,7 @@ brw_set_src1(p, brw_last_inst, $6); brw_inst_set_send_sel_reg32_desc(p->devinfo, brw_last_inst, 1); - brw_inst_set_send_ex_desc(p->devinfo, brw_last_inst, $8); + brw_inst_set_sends_ex_desc(p->devinfo, brw_last_inst, $8); brw_inst_set_sfid(p->devinfo, brw_last_inst, $9); brw_inst_set_eot(p->devinfo, brw_last_inst, $10.end_of_thread); diff -Nru mesa-19.2.8/src/intel/tools/intel_dump_gpu.c mesa-20.0.8/src/intel/tools/intel_dump_gpu.c --- mesa-19.2.8/src/intel/tools/intel_dump_gpu.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/tools/intel_dump_gpu.c 2020-06-12 01:21:17.000000000 +0000 @@ -291,7 +291,9 @@ free(data); } - aub_write_exec(&aub_file, + uint32_t ctx_id = execbuffer2->rsvd1; + + aub_write_exec(&aub_file, ctx_id, batch_bo->offset + execbuffer2->batch_start_offset, offset, engine_class_from_ring_flag(ring_flag)); @@ -336,6 +338,23 @@ bo->map = NULL; } +static uint32_t +dump_create_context(int fd, uint32_t *ctx_id) +{ + if (!aub_file.file) { + aub_file_init(&aub_file, output_file, + verbose == 2 ? stdout : NULL, + device, program_invocation_short_name); + aub_write_default_setup(&aub_file); + + if (verbose) + printf("[running, output file %s, chipset id 0x%04x, gen %d]\n", + output_filename, device, devinfo.gen); + } + + return aub_write_context_create(&aub_file, ctx_id); +} + __attribute__ ((visibility ("default"))) int close(int fd) { @@ -458,6 +477,36 @@ return libc_ioctl(fd, request, argp); } + case DRM_IOCTL_I915_GEM_CONTEXT_CREATE: { + uint32_t *ctx_id = NULL; + struct drm_i915_gem_context_create *create = argp; + ret = 0; + if (!device_override) { + ret = libc_ioctl(fd, request, argp); + ctx_id = &create->ctx_id; + } + + if (ret == 0) + create->ctx_id = dump_create_context(fd, ctx_id); + + return ret; + } + + case DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT: { + uint32_t *ctx_id = NULL; + struct drm_i915_gem_context_create_ext *create = argp; + ret = 0; + if (!device_override) { + ret = libc_ioctl(fd, request, argp); + ctx_id = &create->ctx_id; + } + + if (ret == 0) + create->ctx_id = dump_create_context(fd, ctx_id); + + return ret; + } + case DRM_IOCTL_I915_GEM_CREATE: { struct drm_i915_gem_create *create = argp; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_allocator.c mesa-20.0.8/src/intel/vulkan/anv_allocator.c --- mesa-19.2.8/src/intel/vulkan/anv_allocator.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_allocator.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,8 +29,7 @@ #include "anv_private.h" -#include "util/hash_table.h" -#include "util/simple_mtx.h" +#include "common/gen_aux_map.h" #include "util/anon_file.h" #ifdef HAVE_VALGRIND @@ -110,11 +109,8 @@ struct anv_mmap_cleanup { void *map; size_t size; - uint32_t gem_handle; }; -#define ANV_MMAP_CLEANUP_INIT ((struct anv_mmap_cleanup){0}) - static inline uint32_t ilog2_round_up(uint32_t value) { @@ -212,7 +208,7 @@ map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, table->fd, 0); if (map == MAP_FAILED) { - return vk_errorf(table->device->instance, table->device, + return vk_errorf(table->device, table->device, VK_ERROR_OUT_OF_HOST_MEMORY, "mmap failed: %m"); } @@ -361,57 +357,6 @@ return NULL; } -/* All pointers in the ptr_free_list are assumed to be page-aligned. This - * means that the bottom 12 bits should all be zero. - */ -#define PFL_COUNT(x) ((uintptr_t)(x) & 0xfff) -#define PFL_PTR(x) ((void *)((uintptr_t)(x) & ~(uintptr_t)0xfff)) -#define PFL_PACK(ptr, count) ({ \ - (void *)(((uintptr_t)(ptr) & ~(uintptr_t)0xfff) | ((count) & 0xfff)); \ -}) - -static bool -anv_ptr_free_list_pop(void **list, void **elem) -{ - void *current = *list; - while (PFL_PTR(current) != NULL) { - void **next_ptr = PFL_PTR(current); - void *new_ptr = VG_NOACCESS_READ(next_ptr); - unsigned new_count = PFL_COUNT(current) + 1; - void *new = PFL_PACK(new_ptr, new_count); - void *old = __sync_val_compare_and_swap(list, current, new); - if (old == current) { - *elem = PFL_PTR(current); - return true; - } - current = old; - } - - return false; -} - -static void -anv_ptr_free_list_push(void **list, void *elem) -{ - void *old, *current; - void **next_ptr = elem; - - /* The pointer-based free list requires that the pointer be - * page-aligned. This is because we use the bottom 12 bits of the - * pointer to store a counter to solve the ABA concurrency problem. - */ - assert(((uintptr_t)elem & 0xfff) == 0); - - old = *list; - do { - current = old; - VG_NOACCESS_WRITE(next_ptr, PFL_PTR(current)); - unsigned new_count = PFL_COUNT(current) + 1; - void *new = PFL_PACK(elem, new_count); - old = __sync_val_compare_and_swap(list, current, new); - } while (old != current); -} - static VkResult anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t center_bo_offset, uint32_t size); @@ -420,25 +365,22 @@ anv_block_pool_init(struct anv_block_pool *pool, struct anv_device *device, uint64_t start_address, - uint32_t initial_size, - uint64_t bo_flags) + uint32_t initial_size) { VkResult result; pool->device = device; - pool->bo_flags = bo_flags; + pool->use_softpin = device->physical->use_softpin; pool->nbos = 0; pool->size = 0; pool->center_bo_offset = 0; pool->start_address = gen_canonical_address(start_address); pool->map = NULL; - /* This pointer will always point to the first BO in the list */ - pool->bo = &pool->bos[0]; - - anv_bo_init(pool->bo, 0, 0); - - if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) { + if (pool->use_softpin) { + pool->bo = NULL; + pool->fd = -1; + } else { /* Just make it 2GB up-front. The Linux kernel won't actually back it * with pages until we either map and fault on one of them or we use * userptr and send a chunk of it off to the GPU. @@ -446,8 +388,13 @@ pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool"); if (pool->fd == -1) return vk_error(VK_ERROR_INITIALIZATION_FAILED); - } else { - pool->fd = -1; + + pool->wrapper_bo = (struct anv_bo) { + .refcount = 1, + .offset = -1, + .is_wrapper = true, + }; + pool->bo = &pool->wrapper_bo; } if (!u_vector_init(&pool->mmap_cleanups, @@ -476,7 +423,7 @@ fail_mmap_cleanups: u_vector_finish(&pool->mmap_cleanups); fail_fd: - if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) + if (pool->fd >= 0) close(pool->fd); return result; @@ -485,21 +432,18 @@ void anv_block_pool_finish(struct anv_block_pool *pool) { - struct anv_mmap_cleanup *cleanup; - const bool use_softpin = !!(pool->bo_flags & EXEC_OBJECT_PINNED); - - u_vector_foreach(cleanup, &pool->mmap_cleanups) { - if (use_softpin) - anv_gem_munmap(cleanup->map, cleanup->size); - else - munmap(cleanup->map, cleanup->size); - - if (cleanup->gem_handle) - anv_gem_close(pool->device, cleanup->gem_handle); + anv_block_pool_foreach_bo(bo, pool) { + if (bo->map) + anv_gem_munmap(bo->map, bo->size); + anv_gem_close(pool->device, bo->gem_handle); } + struct anv_mmap_cleanup *cleanup; + u_vector_foreach(cleanup, &pool->mmap_cleanups) + munmap(cleanup->map, cleanup->size); u_vector_finish(&pool->mmap_cleanups); - if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) + + if (pool->fd >= 0) close(pool->fd); } @@ -507,80 +451,17 @@ anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t center_bo_offset, uint32_t size) { - void *map; - uint32_t gem_handle; - struct anv_mmap_cleanup *cleanup; - const bool use_softpin = !!(pool->bo_flags & EXEC_OBJECT_PINNED); - /* Assert that we only ever grow the pool */ assert(center_bo_offset >= pool->back_state.end); assert(size - center_bo_offset >= pool->state.end); /* Assert that we don't go outside the bounds of the memfd */ assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER); - assert(use_softpin || + assert(pool->use_softpin || size - center_bo_offset <= BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER); - cleanup = u_vector_add(&pool->mmap_cleanups); - if (!cleanup) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - *cleanup = ANV_MMAP_CLEANUP_INIT; - - uint32_t newbo_size = size - pool->size; - if (use_softpin) { - gem_handle = anv_gem_create(pool->device, newbo_size); - map = anv_gem_mmap(pool->device, gem_handle, 0, newbo_size, 0); - if (map == MAP_FAILED) { - anv_gem_close(pool->device, gem_handle); - return vk_errorf(pool->device->instance, pool->device, - VK_ERROR_MEMORY_MAP_FAILED, "gem mmap failed: %m"); - } - assert(center_bo_offset == 0); - } else { - /* Just leak the old map until we destroy the pool. We can't munmap it - * without races or imposing locking on the block allocate fast path. On - * the whole the leaked maps adds up to less than the size of the - * current map. MAP_POPULATE seems like the right thing to do, but we - * should try to get some numbers. - */ - map = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, pool->fd, - BLOCK_POOL_MEMFD_CENTER - center_bo_offset); - if (map == MAP_FAILED) - return vk_errorf(pool->device->instance, pool->device, - VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m"); - - /* Now that we mapped the new memory, we can write the new - * center_bo_offset back into pool and update pool->map. */ - pool->center_bo_offset = center_bo_offset; - pool->map = map + center_bo_offset; - gem_handle = anv_gem_userptr(pool->device, map, size); - if (gem_handle == 0) { - munmap(map, size); - return vk_errorf(pool->device->instance, pool->device, - VK_ERROR_TOO_MANY_OBJECTS, "userptr failed: %m"); - } - } - - cleanup->map = map; - cleanup->size = use_softpin ? newbo_size : size; - cleanup->gem_handle = gem_handle; - - /* Regular objects are created I915_CACHING_CACHED on LLC platforms and - * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are - * always created as I915_CACHING_CACHED, which on non-LLC means - * snooped. - * - * On platforms that support softpin, we are not going to use userptr - * anymore, but we still want to rely on the snooped states. So make sure - * everything is set to I915_CACHING_CACHED. - */ - if (!pool->device->info.has_llc) - anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_CACHED); - - /* For block pool BOs we have to be a bit careful about where we place them + /* For state pool BOs we have to be a bit careful about where we place them * in the GTT. There are two documented workarounds for state base address * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset * which state that those two base addresses do not support 48-bit @@ -603,65 +484,78 @@ * BO to some particular location of our choosing, but that's significantly * more work than just not setting a flag. So, we explicitly DO NOT set * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the - * hard work for us. + * hard work for us. When using softpin, we're in control and the fixed + * addresses we choose are fine for base addresses. */ - struct anv_bo *bo; - uint32_t bo_size; - uint64_t bo_offset; + enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE; + if (!pool->use_softpin) + bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS; + + if (pool->use_softpin) { + uint32_t new_bo_size = size - pool->size; + struct anv_bo *new_bo; + assert(center_bo_offset == 0); + VkResult result = anv_device_alloc_bo(pool->device, new_bo_size, + bo_alloc_flags | + ANV_BO_ALLOC_FIXED_ADDRESS | + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + pool->start_address + pool->size, + &new_bo); + if (result != VK_SUCCESS) + return result; - assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS); + pool->bos[pool->nbos++] = new_bo; - if (use_softpin) { - /* With softpin, we add a new BO to the pool, and set its offset to right - * where the previous BO ends (the end of the pool). - */ - bo = &pool->bos[pool->nbos++]; - bo_size = newbo_size; - bo_offset = pool->start_address + pool->size; + /* This pointer will always point to the first BO in the list */ + pool->bo = pool->bos[0]; } else { - /* Without softpin, we just need one BO, and we already have a pointer to - * it. Simply "allocate" it from our array if we didn't do it before. - * The offset doesn't matter since we are not pinning the BO anyway. + /* Just leak the old map until we destroy the pool. We can't munmap it + * without races or imposing locking on the block allocate fast path. On + * the whole the leaked maps adds up to less than the size of the + * current map. MAP_POPULATE seems like the right thing to do, but we + * should try to get some numbers. */ - if (pool->nbos == 0) - pool->nbos++; - bo = pool->bo; - bo_size = size; - bo_offset = 0; - } - - anv_bo_init(bo, gem_handle, bo_size); - bo->offset = bo_offset; - bo->flags = pool->bo_flags; - bo->map = map; - pool->size = size; - - return VK_SUCCESS; -} + void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, pool->fd, + BLOCK_POOL_MEMFD_CENTER - center_bo_offset); + if (map == MAP_FAILED) + return vk_errorf(pool->device, pool->device, + VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m"); -static struct anv_bo * -anv_block_pool_get_bo(struct anv_block_pool *pool, int32_t *offset) -{ - struct anv_bo *bo, *bo_found = NULL; - int32_t cur_offset = 0; + struct anv_bo *new_bo; + VkResult result = anv_device_import_bo_from_host_ptr(pool->device, + map, size, + bo_alloc_flags, + 0 /* client_address */, + &new_bo); + if (result != VK_SUCCESS) { + munmap(map, size); + return result; + } - assert(offset); + struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups); + if (!cleanup) { + munmap(map, size); + anv_device_release_bo(pool->device, new_bo); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + cleanup->map = map; + cleanup->size = size; - if (!(pool->bo_flags & EXEC_OBJECT_PINNED)) - return pool->bo; + /* Now that we mapped the new memory, we can write the new + * center_bo_offset back into pool and update pool->map. */ + pool->center_bo_offset = center_bo_offset; + pool->map = map + center_bo_offset; - anv_block_pool_foreach_bo(bo, pool) { - if (*offset < cur_offset + bo->size) { - bo_found = bo; - break; - } - cur_offset += bo->size; + pool->bos[pool->nbos++] = new_bo; + pool->wrapper_bo.map = new_bo; } - assert(bo_found != NULL); - *offset -= cur_offset; + assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS); + pool->size = size; - return bo_found; + return VK_SUCCESS; } /** Returns current memory map of the block pool. @@ -671,11 +565,23 @@ * rather than the start of the block pool BO map. */ void* -anv_block_pool_map(struct anv_block_pool *pool, int32_t offset) +anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size) { - if (pool->bo_flags & EXEC_OBJECT_PINNED) { - struct anv_bo *bo = anv_block_pool_get_bo(pool, &offset); - return bo->map + offset; + if (pool->use_softpin) { + struct anv_bo *bo = NULL; + int32_t bo_offset = 0; + anv_block_pool_foreach_bo(iter_bo, pool) { + if (offset < bo_offset + iter_bo->size) { + bo = iter_bo; + break; + } + bo_offset += iter_bo->size; + } + assert(bo != NULL); + assert(offset >= bo_offset); + assert((offset - bo_offset) + size <= bo->size); + + return bo->map + (offset - bo_offset); } else { return pool->map + offset; } @@ -706,7 +612,8 @@ * the pool and a 4K CPU page. */ static uint32_t -anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state) +anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state, + uint32_t contiguous_size) { VkResult result = VK_SUCCESS; @@ -737,12 +644,24 @@ */ assert(old_size > 0); + const uint32_t old_back = pool->center_bo_offset; + const uint32_t old_front = old_size - pool->center_bo_offset; + /* The back_used and front_used may actually be smaller than the actual * requirement because they are based on the next pointers which are * updated prior to calling this function. */ - uint32_t back_required = MAX2(back_used, pool->center_bo_offset); - uint32_t front_required = MAX2(front_used, old_size - pool->center_bo_offset); + uint32_t back_required = MAX2(back_used, old_back); + uint32_t front_required = MAX2(front_used, old_front); + + if (pool->use_softpin) { + /* With softpin, the pool is made up of a bunch of buffers with separate + * maps. Make sure we have enough contiguous space that we can get a + * properly contiguous map for the next chunk. + */ + assert(old_back == 0); + front_required = MAX2(front_required, old_front + contiguous_size); + } if (back_used * 2 <= back_required && front_used * 2 <= front_required) { /* If we're in this case then this isn't the firsta allocation and we @@ -793,8 +712,6 @@ result = anv_block_pool_expand_range(pool, center_bo_offset, size); - pool->bo->flags = pool->bo_flags; - done: pthread_mutex_unlock(&pool->device->mutex); @@ -830,7 +747,7 @@ if (state.next + block_size <= state.end) { return state.next; } else if (state.next <= state.end) { - if (pool->bo_flags & EXEC_OBJECT_PINNED && state.next < state.end) { + if (pool->use_softpin && state.next < state.end) { /* We need to grow the block pool, but still have some leftover * space that can't be used by that particular allocation. So we * add that as a "padding", and return it. @@ -853,7 +770,7 @@ */ new.next = state.next + block_size; do { - new.end = anv_block_pool_grow(pool, pool_state); + new.end = anv_block_pool_grow(pool, pool_state, block_size); } while (new.end < new.next); old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64); @@ -907,13 +824,11 @@ anv_state_pool_init(struct anv_state_pool *pool, struct anv_device *device, uint64_t start_address, - uint32_t block_size, - uint64_t bo_flags) + uint32_t block_size) { VkResult result = anv_block_pool_init(&pool->block_pool, device, start_address, - block_size * 16, - bo_flags); + block_size * 16); if (result != VK_SUCCESS) return result; @@ -1028,7 +943,9 @@ st_idx + i); state_i->alloc_size = block_size; state_i->offset = chunk_offset + block_size * i; - state_i->map = anv_block_pool_map(&pool->block_pool, state_i->offset); + state_i->map = anv_block_pool_map(&pool->block_pool, + state_i->offset, + state_i->alloc_size); } uint32_t block_bucket = anv_state_pool_get_bucket(block_size); @@ -1169,7 +1086,7 @@ state = anv_state_table_get(&pool->table, idx); state->offset = offset; state->alloc_size = alloc_size; - state->map = anv_block_pool_map(&pool->block_pool, offset); + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); if (padding > 0) { uint32_t return_offset = offset - padding; @@ -1213,7 +1130,7 @@ state = anv_state_table_get(&pool->table, idx); state->offset = offset; state->alloc_size = alloc_size; - state->map = anv_block_pool_map(&pool->block_pool, state->offset); + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); done: VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size)); @@ -1309,9 +1226,10 @@ uint32_t offset = align_u32(stream->next, alignment); if (offset + size > stream->block.alloc_size) { + uint32_t min_block_size = size + sizeof(struct anv_state_stream_block); uint32_t block_size = stream->block_size; - if (block_size < size) - block_size = round_to_power_of_two(size); + if (block_size < min_block_size) + block_size = round_to_power_of_two(min_block_size); stream->block = anv_state_pool_alloc_no_vg(stream->state_pool, block_size, PAGE_SIZE); @@ -1358,18 +1276,15 @@ return state; } -struct bo_pool_bo_link { - struct bo_pool_bo_link *next; - struct anv_bo bo; -}; - void -anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, - uint64_t bo_flags) +anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device) { pool->device = device; - pool->bo_flags = bo_flags; - memset(pool->free_list, 0, sizeof(pool->free_list)); + for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { + util_sparse_array_free_list_init(&pool->free_list[i], + &device->bo_cache.bo_map, 0, + offsetof(struct anv_bo, free_index)); + } VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); } @@ -1378,14 +1293,15 @@ anv_bo_pool_finish(struct anv_bo_pool *pool) { for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { - struct bo_pool_bo_link *link = PFL_PTR(pool->free_list[i]); - while (link != NULL) { - struct bo_pool_bo_link link_copy = VG_NOACCESS_READ(link); - - anv_gem_munmap(link_copy.bo.map, link_copy.bo.size); - anv_vma_free(pool->device, &link_copy.bo); - anv_gem_close(pool->device, link_copy.bo.gem_handle); - link = link_copy.next; + while (1) { + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[i]); + if (bo == NULL) + break; + + /* anv_device_release_bo is going to "free" it */ + VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1)); + anv_device_release_bo(pool->device, bo); } } @@ -1393,80 +1309,55 @@ } VkResult -anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo, uint32_t size) +anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out) { - VkResult result; - const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size); const unsigned pow2_size = 1 << size_log2; const unsigned bucket = size_log2 - 12; assert(bucket < ARRAY_SIZE(pool->free_list)); - void *next_free_void; - if (anv_ptr_free_list_pop(&pool->free_list[bucket], &next_free_void)) { - struct bo_pool_bo_link *next_free = next_free_void; - *bo = VG_NOACCESS_READ(&next_free->bo); - assert(bo->gem_handle); - assert(bo->map == next_free); - assert(size <= bo->size); - + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]); + if (bo != NULL) { VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); - + *bo_out = bo; return VK_SUCCESS; } - struct anv_bo new_bo; - - result = anv_bo_init_new(&new_bo, pool->device, pow2_size); + VkResult result = anv_device_alloc_bo(pool->device, + pow2_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_CAPTURE, + 0 /* explicit_address */, + &bo); if (result != VK_SUCCESS) return result; - new_bo.flags = pool->bo_flags; - - if (!anv_vma_alloc(pool->device, &new_bo)) - return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); - - assert(new_bo.size == pow2_size); - - new_bo.map = anv_gem_mmap(pool->device, new_bo.gem_handle, 0, pow2_size, 0); - if (new_bo.map == MAP_FAILED) { - anv_gem_close(pool->device, new_bo.gem_handle); - anv_vma_free(pool->device, &new_bo); - return vk_error(VK_ERROR_MEMORY_MAP_FAILED); - } - - /* We are removing the state flushes, so lets make sure that these buffers - * are cached/snooped. - */ - if (!pool->device->info.has_llc) { - anv_gem_set_caching(pool->device, new_bo.gem_handle, - I915_CACHING_CACHED); - } - - *bo = new_bo; - + /* We want it to look like it came from this pool */ + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); + *bo_out = bo; + return VK_SUCCESS; } void -anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo_in) +anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo) { - /* Make a copy in case the anv_bo happens to be storred in the BO */ - struct anv_bo bo = *bo_in; - - VG(VALGRIND_MEMPOOL_FREE(pool, bo.map)); + VG(VALGRIND_MEMPOOL_FREE(pool, bo->map)); - struct bo_pool_bo_link *link = bo.map; - VG_NOACCESS_WRITE(&link->bo, bo); - - assert(util_is_power_of_two_or_zero(bo.size)); - const unsigned size_log2 = ilog2_round_up(bo.size); + assert(util_is_power_of_two_or_zero(bo->size)); + const unsigned size_log2 = ilog2_round_up(bo->size); const unsigned bucket = size_log2 - 12; assert(bucket < ARRAY_SIZE(pool->free_list)); - anv_ptr_free_list_push(&pool->free_list[bucket], link); + assert(util_sparse_array_get(&pool->device->bo_cache.bo_map, + bo->gem_handle) == bo); + util_sparse_array_free_list_push(&pool->free_list[bucket], + &bo->gem_handle, 1); } // Scratch pool @@ -1482,11 +1373,8 @@ { for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { for (unsigned i = 0; i < 16; i++) { - struct anv_scratch_bo *bo = &pool->bos[i][s]; - if (bo->exists > 0) { - anv_vma_free(device, &bo->bo); - anv_gem_close(device, bo->bo.gem_handle); - } + if (pool->bos[i][s] != NULL) + anv_device_release_bo(device, pool->bos[i][s]); } } } @@ -1501,28 +1389,27 @@ unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); assert(scratch_size_log2 < 16); - struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage]; + struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]); - /* We can use "exists" to shortcut and ignore the critical section */ - if (bo->exists) - return &bo->bo; + if (bo != NULL) + return bo; - pthread_mutex_lock(&device->mutex); + const struct gen_device_info *devinfo = &device->info; - __sync_synchronize(); - if (bo->exists) { - pthread_mutex_unlock(&device->mutex); - return &bo->bo; - } - - const struct anv_physical_device *physical_device = - &device->instance->physicalDevice; - const struct gen_device_info *devinfo = &physical_device->info; + unsigned subslices = MAX2(device->physical->subslice_total, 1); - const unsigned subslices = MAX2(physical_device->subslice_total, 1); + /* For, Gen11+, scratch space allocation is based on the number of threads + * in the base configuration. */ + if (devinfo->gen >= 12) + subslices = devinfo->num_subslices[0]; + else if (devinfo->gen == 11) + subslices = 8; unsigned scratch_ids_per_subslice; - if (devinfo->gen >= 11) { + if (devinfo->gen >= 12) { + /* Same as ICL below, but with 16 EUs. */ + scratch_ids_per_subslice = 16 * 8; + } else if (devinfo->gen == 11) { /* The MEDIA_VFE_STATE docs say: * * "Starting with this configuration, the Maximum Number of @@ -1572,8 +1459,6 @@ uint32_t size = per_thread_scratch * max_threads[stage]; - anv_bo_init_new(&bo->bo, device, size); - /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they * are still relative to the general state base address. When we emit * STATE_BASE_ADDRESS, we set general state base address to 0 and the size @@ -1591,40 +1476,30 @@ * * so nothing will ever touch the top page. */ - assert(!(bo->bo.flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)); - - if (device->instance->physicalDevice.has_exec_async) - bo->bo.flags |= EXEC_OBJECT_ASYNC; - - if (device->instance->physicalDevice.use_softpin) - bo->bo.flags |= EXEC_OBJECT_PINNED; - - anv_vma_alloc(device, &bo->bo); - - /* Set the exists last because it may be read by other threads */ - __sync_synchronize(); - bo->exists = true; - - pthread_mutex_unlock(&device->mutex); + VkResult result = anv_device_alloc_bo(device, size, + ANV_BO_ALLOC_32BIT_ADDRESS, + 0 /* explicit_address */, + &bo); + if (result != VK_SUCCESS) + return NULL; /* TODO */ - return &bo->bo; + struct anv_bo *current_bo = + p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo); + if (current_bo) { + anv_device_release_bo(device, bo); + return current_bo; + } else { + return bo; + } } -struct anv_cached_bo { - struct anv_bo bo; - - uint32_t refcount; -}; - VkResult anv_bo_cache_init(struct anv_bo_cache *cache) { - cache->bo_map = _mesa_pointer_hash_table_create(NULL); - if (!cache->bo_map) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024); if (pthread_mutex_init(&cache->mutex, NULL)) { - _mesa_hash_table_destroy(cache->bo_map, NULL); + util_sparse_array_finish(&cache->bo_map); return vk_errorf(NULL, NULL, VK_ERROR_OUT_OF_HOST_MEMORY, "pthread_mutex_init failed: %m"); } @@ -1635,101 +1510,188 @@ void anv_bo_cache_finish(struct anv_bo_cache *cache) { - _mesa_hash_table_destroy(cache->bo_map, NULL); + util_sparse_array_finish(&cache->bo_map); pthread_mutex_destroy(&cache->mutex); } -static struct anv_cached_bo * -anv_bo_cache_lookup_locked(struct anv_bo_cache *cache, uint32_t gem_handle) +#define ANV_BO_CACHE_SUPPORTED_FLAGS \ + (EXEC_OBJECT_WRITE | \ + EXEC_OBJECT_ASYNC | \ + EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \ + EXEC_OBJECT_PINNED | \ + EXEC_OBJECT_CAPTURE) + +static uint32_t +anv_bo_alloc_flags_to_bo_flags(struct anv_device *device, + enum anv_bo_alloc_flags alloc_flags) { - struct hash_entry *entry = - _mesa_hash_table_search(cache->bo_map, - (const void *)(uintptr_t)gem_handle); - if (!entry) - return NULL; + struct anv_physical_device *pdevice = device->physical; - struct anv_cached_bo *bo = (struct anv_cached_bo *)entry->data; - assert(bo->bo.gem_handle == gem_handle); + uint64_t bo_flags = 0; + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) && + pdevice->supports_48bit_addresses) + bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - return bo; -} + if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture) + bo_flags |= EXEC_OBJECT_CAPTURE; -UNUSED static struct anv_bo * -anv_bo_cache_lookup(struct anv_bo_cache *cache, uint32_t gem_handle) -{ - pthread_mutex_lock(&cache->mutex); + if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) { + assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC); + bo_flags |= EXEC_OBJECT_WRITE; + } - struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle); + if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async) + bo_flags |= EXEC_OBJECT_ASYNC; - pthread_mutex_unlock(&cache->mutex); + if (pdevice->use_softpin) + bo_flags |= EXEC_OBJECT_PINNED; - return bo ? &bo->bo : NULL; + return bo_flags; } -#define ANV_BO_CACHE_SUPPORTED_FLAGS \ - (EXEC_OBJECT_WRITE | \ - EXEC_OBJECT_ASYNC | \ - EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \ - EXEC_OBJECT_PINNED | \ - ANV_BO_EXTERNAL) +static uint32_t +anv_device_get_bo_align(struct anv_device *device, + enum anv_bo_alloc_flags alloc_flags) +{ + /* Gen12 CCS surface addresses need to be 64K aligned. */ + if (device->info.gen >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) + return 64 * 1024; + + return 4096; +} VkResult -anv_bo_cache_alloc(struct anv_device *device, - struct anv_bo_cache *cache, - uint64_t size, uint64_t bo_flags, - struct anv_bo **bo_out) +anv_device_alloc_bo(struct anv_device *device, + uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo_out) { - assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + if (!device->physical->has_implicit_ccs) + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)); - struct anv_cached_bo *bo = - vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!bo) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - bo->refcount = 1; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); /* The kernel is going to give us whole pages anyway */ size = align_u64(size, 4096); - VkResult result = anv_bo_init_new(&bo->bo, device, size); - if (result != VK_SUCCESS) { - vk_free(&device->alloc, bo); - return result; + const uint32_t align = anv_device_get_bo_align(device, alloc_flags); + + uint64_t ccs_size = 0; + if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) { + /* Align the size up to the next multiple of 64K so we don't have any + * AUX-TT entries pointing from a 64K page to itself. + */ + size = align_u64(size, 64 * 1024); + + /* See anv_bo::_ccs_size */ + ccs_size = align_u64(DIV_ROUND_UP(size, GEN_AUX_MAP_GEN12_CCS_SCALE), 4096); } - bo->bo.flags = bo_flags; + uint32_t gem_handle = anv_gem_create(device, size + ccs_size); + if (gem_handle == 0) + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + + struct anv_bo new_bo = { + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + ._ccs_size = ccs_size, + .flags = bo_flags, + .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL), + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + .has_implicit_ccs = ccs_size > 0, + }; - if (!anv_vma_alloc(device, &bo->bo)) { - anv_gem_close(device, bo->bo.gem_handle); - vk_free(&device->alloc, bo); - return vk_errorf(device->instance, NULL, - VK_ERROR_OUT_OF_DEVICE_MEMORY, - "failed to allocate virtual address for BO"); + if (alloc_flags & ANV_BO_ALLOC_MAPPED) { + new_bo.map = anv_gem_mmap(device, new_bo.gem_handle, 0, size, 0); + if (new_bo.map == MAP_FAILED) { + anv_gem_close(device, new_bo.gem_handle); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } } - assert(bo->bo.gem_handle); + if (alloc_flags & ANV_BO_ALLOC_SNOOPED) { + assert(alloc_flags & ANV_BO_ALLOC_MAPPED); + /* We don't want to change these defaults if it's going to be shared + * with another process. + */ + assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL)); - pthread_mutex_lock(&cache->mutex); + /* Regular objects are created I915_CACHING_CACHED on LLC platforms and + * I915_CACHING_NONE on non-LLC platforms. For many internal state + * objects, we'd rather take the snooping overhead than risk forgetting + * a CLFLUSH somewhere. Userptr objects are always created as + * I915_CACHING_CACHED, which on non-LLC means snooped so there's no + * need to do this there. + */ + if (!device->info.has_llc) { + anv_gem_set_caching(device, new_bo.gem_handle, + I915_CACHING_CACHED); + } + } - _mesa_hash_table_insert(cache->bo_map, - (void *)(uintptr_t)bo->bo.gem_handle, bo); + if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) { + new_bo.has_fixed_address = true; + new_bo.offset = explicit_address; + } else if (new_bo.flags & EXEC_OBJECT_PINNED) { + new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size, + align, alloc_flags, explicit_address); + if (new_bo.offset == 0) { + if (new_bo.map) + anv_gem_munmap(new_bo.map, size); + anv_gem_close(device, new_bo.gem_handle); + return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to allocate virtual address for BO"); + } + } else { + assert(!new_bo.has_client_visible_address); + } - pthread_mutex_unlock(&cache->mutex); + if (new_bo._ccs_size > 0) { + assert(device->info.has_aux_map); + gen_aux_map_add_mapping(device->aux_map_ctx, + gen_canonical_address(new_bo.offset), + gen_canonical_address(new_bo.offset + new_bo.size), + new_bo.size, 0 /* format_bits */); + } + + assert(new_bo.gem_handle); + + /* If we just got this gem_handle from anv_bo_init_new then we know no one + * else is touching this BO at the moment so we don't need to lock here. + */ + struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle); + *bo = new_bo; - *bo_out = &bo->bo; + *bo_out = bo; return VK_SUCCESS; } VkResult -anv_bo_cache_import_host_ptr(struct anv_device *device, - struct anv_bo_cache *cache, - void *host_ptr, uint32_t size, - uint64_t bo_flags, struct anv_bo **bo_out) -{ +anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + /* We can't do implicit CCS with an aux table on shared memory */ + if (!device->physical->has_implicit_ccs || device->info.has_aux_map) + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); - assert((bo_flags & ANV_BO_EXTERNAL) == 0); uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size); if (!gem_handle) @@ -1737,59 +1699,94 @@ pthread_mutex_lock(&cache->mutex); - struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle); - if (bo) { + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { /* VK_EXT_external_memory_host doesn't require handling importing the * same pointer twice at the same time, but we don't get in the way. If * kernel gives us the same gem_handle, only succeed if the flags match. */ - if (bo_flags != bo->bo.flags) { + assert(bo->gem_handle == gem_handle); + if (bo_flags != bo->flags) { pthread_mutex_unlock(&cache->mutex); - return vk_errorf(device->instance, NULL, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, "same host pointer imported two different ways"); } - __sync_fetch_and_add(&bo->refcount, 1); - } else { - bo = vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!bo) { - anv_gem_close(device, gem_handle); + + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { pthread_mutex_unlock(&cache->mutex); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); } - bo->refcount = 1; - - anv_bo_init(&bo->bo, gem_handle, size); - bo->bo.flags = bo_flags; - - if (!anv_vma_alloc(device, &bo->bo)) { - anv_gem_close(device, bo->bo.gem_handle); + if (client_address && client_address != gen_48b_address(bo->offset)) { pthread_mutex_unlock(&cache->mutex); - vk_free(&device->alloc, bo); - return vk_errorf(device->instance, NULL, - VK_ERROR_OUT_OF_DEVICE_MEMORY, - "failed to allocate virtual address for BO"); + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); } - _mesa_hash_table_insert(cache->bo_map, (void *)(uintptr_t)gem_handle, bo); + __sync_fetch_and_add(&bo->refcount, 1); + } else { + struct anv_bo new_bo = { + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .map = host_ptr, + .flags = bo_flags, + .is_external = true, + .from_host_ptr = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + assert(client_address == gen_48b_address(client_address)); + if (new_bo.flags & EXEC_OBJECT_PINNED) { + assert(new_bo._ccs_size == 0); + new_bo.offset = anv_vma_alloc(device, new_bo.size, + anv_device_get_bo_align(device, + alloc_flags), + alloc_flags, client_address); + if (new_bo.offset == 0) { + anv_gem_close(device, new_bo.gem_handle); + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to allocate virtual address for BO"); + } + } else { + assert(!new_bo.has_client_visible_address); + } + + *bo = new_bo; } pthread_mutex_unlock(&cache->mutex); - *bo_out = &bo->bo; + *bo_out = bo; return VK_SUCCESS; } VkResult -anv_bo_cache_import(struct anv_device *device, - struct anv_bo_cache *cache, - int fd, uint64_t bo_flags, - struct anv_bo **bo_out) -{ +anv_device_import_bo(struct anv_device *device, + int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + /* We can't do implicit CCS with an aux table on shared memory */ + if (!device->physical->has_implicit_ccs || device->info.has_aux_map) + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); - assert(bo_flags & ANV_BO_EXTERNAL); pthread_mutex_lock(&cache->mutex); @@ -1799,29 +1796,29 @@ return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); } - struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle); - if (bo) { + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { /* We have to be careful how we combine flags so that it makes sense. * Really, though, if we get to this case and it actually matters, the * client has imported a BO twice in different ways and they get what * they have coming. */ - uint64_t new_flags = ANV_BO_EXTERNAL; - new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_WRITE; - new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_ASYNC; - new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_PINNED; + uint64_t new_flags = 0; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE; /* It's theoretically possible for a BO to get imported such that it's * both pinned and not pinned. The only way this can happen is if it * gets imported as both a semaphore and a memory object and that would * be an application error. Just fail out in that case. */ - if ((bo->bo.flags & EXEC_OBJECT_PINNED) != + if ((bo->flags & EXEC_OBJECT_PINNED) != (bo_flags & EXEC_OBJECT_PINNED)) { pthread_mutex_unlock(&cache->mutex); - return vk_errorf(device->instance, NULL, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, "The same BO was imported two different ways"); } @@ -1833,15 +1830,29 @@ * app is actually that stupid. */ if ((new_flags & EXEC_OBJECT_PINNED) && - (bo->bo.flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) != + (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) != (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) { pthread_mutex_unlock(&cache->mutex); - return vk_errorf(device->instance, NULL, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, "The same BO was imported on two different heaps"); } - bo->bo.flags = new_flags; + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); + } + + if (client_address && client_address != gen_48b_address(bo->offset)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); + } + + bo->flags = new_flags; __sync_fetch_and_add(&bo->refcount, 1); } else { @@ -1852,52 +1863,56 @@ return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); } - bo = vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!bo) { - anv_gem_close(device, gem_handle); - pthread_mutex_unlock(&cache->mutex); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } - - bo->refcount = 1; - - anv_bo_init(&bo->bo, gem_handle, size); - bo->bo.flags = bo_flags; - - if (!anv_vma_alloc(device, &bo->bo)) { - anv_gem_close(device, bo->bo.gem_handle); - pthread_mutex_unlock(&cache->mutex); - vk_free(&device->alloc, bo); - return vk_errorf(device->instance, NULL, - VK_ERROR_OUT_OF_DEVICE_MEMORY, - "failed to allocate virtual address for BO"); + struct anv_bo new_bo = { + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .flags = bo_flags, + .is_external = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + assert(client_address == gen_48b_address(client_address)); + if (new_bo.flags & EXEC_OBJECT_PINNED) { + assert(new_bo._ccs_size == 0); + new_bo.offset = anv_vma_alloc(device, new_bo.size, + anv_device_get_bo_align(device, + alloc_flags), + alloc_flags, client_address); + if (new_bo.offset == 0) { + anv_gem_close(device, new_bo.gem_handle); + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to allocate virtual address for BO"); + } + } else { + assert(!new_bo.has_client_visible_address); } - _mesa_hash_table_insert(cache->bo_map, (void *)(uintptr_t)gem_handle, bo); + *bo = new_bo; } pthread_mutex_unlock(&cache->mutex); - *bo_out = &bo->bo; + *bo_out = bo; return VK_SUCCESS; } VkResult -anv_bo_cache_export(struct anv_device *device, - struct anv_bo_cache *cache, - struct anv_bo *bo_in, int *fd_out) +anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out) { - assert(anv_bo_cache_lookup(cache, bo_in->gem_handle) == bo_in); - struct anv_cached_bo *bo = (struct anv_cached_bo *)bo_in; + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); /* This BO must have been flagged external in order for us to be able * to export it. This is done based on external options passed into * anv_AllocateMemory. */ - assert(bo->bo.flags & ANV_BO_EXTERNAL); + assert(bo->is_external); - int fd = anv_gem_handle_to_fd(device, bo->bo.gem_handle); + int fd = anv_gem_handle_to_fd(device, bo->gem_handle); if (fd < 0) return vk_error(VK_ERROR_TOO_MANY_OBJECTS); @@ -1925,12 +1940,11 @@ } void -anv_bo_cache_release(struct anv_device *device, - struct anv_bo_cache *cache, - struct anv_bo *bo_in) +anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo) { - assert(anv_bo_cache_lookup(cache, bo_in->gem_handle) == bo_in); - struct anv_cached_bo *bo = (struct anv_cached_bo *)bo_in; + struct anv_bo_cache *cache = &device->bo_cache; + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); /* Try to decrement the counter but don't go below one. If this succeeds * then the refcount has been decremented and we are not the last @@ -1951,19 +1965,35 @@ pthread_mutex_unlock(&cache->mutex); return; } + assert(bo->refcount == 0); - struct hash_entry *entry = - _mesa_hash_table_search(cache->bo_map, - (const void *)(uintptr_t)bo->bo.gem_handle); - assert(entry); - _mesa_hash_table_remove(cache->bo_map, entry); + if (bo->map && !bo->from_host_ptr) + anv_gem_munmap(bo->map, bo->size); - if (bo->bo.map) - anv_gem_munmap(bo->bo.map, bo->bo.size); + if (bo->_ccs_size > 0) { + assert(device->physical->has_implicit_ccs); + assert(device->info.has_aux_map); + assert(bo->has_implicit_ccs); + gen_aux_map_unmap_range(device->aux_map_ctx, + gen_canonical_address(bo->offset), + bo->size); + } - anv_vma_free(device, &bo->bo); + if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address) + anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size); - anv_gem_close(device, bo->bo.gem_handle); + uint32_t gem_handle = bo->gem_handle; + + /* Memset the BO just in case. The refcount being zero should be enough to + * prevent someone from assuming the data is valid but it's safer to just + * stomp to zero just in case. We explicitly do this *before* we close the + * GEM handle to ensure that if anyone allocates something and gets the + * same GEM handle, the memset has already happen and won't stomp all over + * any data they may write in this BO. + */ + memset(bo, 0, sizeof(*bo)); + + anv_gem_close(device, gem_handle); /* Don't unlock until we've actually closed the BO. The whole point of * the BO cache is to ensure that we correctly handle races with creating @@ -1971,6 +2001,4 @@ * again between mutex unlock and closing the GEM handle. */ pthread_mutex_unlock(&cache->mutex); - - vk_free(&device->alloc, bo); } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_android.c mesa-20.0.8/src/intel/vulkan/anv_android.c --- mesa-19.2.8/src/intel/vulkan/anv_android.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_android.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,6 @@ #if ANDROID_API_LEVEL >= 26 #include -#include #endif #include @@ -192,7 +191,6 @@ VkAndroidHardwareBufferPropertiesANDROID *pProperties) { ANV_FROM_HANDLE(anv_device, dev, device_h); - struct anv_physical_device *pdevice = &dev->instance->physicalDevice; VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop = vk_find_struct(pProperties->pNext, @@ -214,7 +212,7 @@ return VK_ERROR_INVALID_EXTERNAL_HANDLE; /* All memory types. */ - uint32_t memory_types = (1ull << pdevice->memory.type_count) - 1; + uint32_t memory_types = (1ull << dev->physical->memory.type_count) - 1; pProperties->allocationSize = lseek(dma_buf, 0, SEEK_END); pProperties->memoryTypeBits = memory_types; @@ -307,15 +305,10 @@ if (dma_buf < 0) return VK_ERROR_INVALID_EXTERNAL_HANDLE; - uint64_t bo_flags = ANV_BO_EXTERNAL; - if (device->instance->physicalDevice.supports_48bit_addresses) - bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - if (device->instance->physicalDevice.use_softpin) - bo_flags |= EXEC_OBJECT_PINNED; - - VkResult result = anv_bo_cache_import(device, &device->bo_cache, - dma_buf, bo_flags, &mem->bo); - assert(VK_SUCCESS); + VkResult result = anv_device_import_bo(device, dma_buf, 0, + 0 /* client_address */, + &mem->bo); + assert(result == VK_SUCCESS); /* "If the vkAllocateMemory command succeeds, the implementation must * acquire a reference to the imported hardware buffer, which it must @@ -394,14 +387,14 @@ anv_image_from_external( VkDevice device_h, const VkImageCreateInfo *base_info, - const struct VkExternalMemoryImageCreateInfo *create_info, + const VkExternalMemoryImageCreateInfo *create_info, const VkAllocationCallbacks *alloc, VkImage *out_image_h) { #if ANDROID_API_LEVEL >= 26 ANV_FROM_HANDLE(anv_device, device, device_h); - const struct VkExternalFormatANDROID *ext_info = + const VkExternalFormatANDROID *ext_info = vk_find_struct_const(base_info->pNext, EXTERNAL_FORMAT_ANDROID); if (ext_info && ext_info->externalFormat != 0) { @@ -451,8 +444,7 @@ }; if (gralloc_info->handle->numFds != 1) { - return vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "VkNativeBufferANDROID::handle::numFds is %d, " "expected 1", gralloc_info->handle->numFds); } @@ -463,15 +455,22 @@ */ int dma_buf = gralloc_info->handle->data[0]; - uint64_t bo_flags = ANV_BO_EXTERNAL; - if (device->instance->physicalDevice.supports_48bit_addresses) - bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - if (device->instance->physicalDevice.use_softpin) - bo_flags |= EXEC_OBJECT_PINNED; - - result = anv_bo_cache_import(device, &device->bo_cache, dma_buf, bo_flags, &bo); + /* We need to set the WRITE flag on window system buffers so that GEM will + * know we're writing to them and synchronize uses on other rings (for + * example, if the display server uses the blitter ring). + * + * If this function fails and if the imported bo was resident in the cache, + * we should avoid updating the bo's flags. Therefore, we defer updating + * the flags until success is certain. + * + */ + result = anv_device_import_bo(device, dma_buf, + ANV_BO_ALLOC_IMPLICIT_SYNC | + ANV_BO_ALLOC_IMPLICIT_WRITE, + 0 /* client_address */, + &bo); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, device, result, + return vk_errorf(device, device, result, "failed to import dma-buf from VkNativeBufferANDROID"); } @@ -487,14 +486,12 @@ anv_info.isl_tiling_flags = ISL_TILING_Y0_BIT; break; case -1: - result = vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + result = vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "DRM_IOCTL_I915_GEM_GET_TILING failed for " "VkNativeBufferANDROID"); goto fail_tiling; default: - result = vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + result = vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "DRM_IOCTL_I915_GEM_GET_TILING returned unknown " "tiling %d for VkNativeBufferANDROID", i915_tiling); goto fail_tiling; @@ -515,8 +512,7 @@ goto fail_create; if (bo->size < image->size) { - result = vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + result = vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "dma-buf from VkNativeBufferANDROID is too small for " "VkImage: %"PRIu64"B < %"PRIu64"B", bo->size, image->size); @@ -529,18 +525,6 @@ image->planes[0].address.bo = bo; image->planes[0].bo_is_owned = true; - /* We need to set the WRITE flag on window system buffers so that GEM will - * know we're writing to them and synchronize uses on other rings (for - * example, if the display server uses the blitter ring). - * - * If this function fails and if the imported bo was resident in the cache, - * we should avoid updating the bo's flags. Therefore, we defer updating - * the flags until success is certain. - * - */ - bo->flags &= ~EXEC_OBJECT_ASYNC; - bo->flags |= EXEC_OBJECT_WRITE; - /* Don't clobber the out-parameter until success is certain. */ *out_image_h = image_h; @@ -550,18 +534,17 @@ anv_DestroyImage(device_h, image_h, alloc); fail_create: fail_tiling: - anv_bo_cache_release(device, &device->bo_cache, bo); + anv_device_release_bo(device, bo); return result; } -VkResult +static VkResult format_supported_with_usage(VkDevice device_h, VkFormat format, VkImageUsageFlags imageUsage) { ANV_FROM_HANDLE(anv_device, device, device_h); - struct anv_physical_device *phys_dev = &device->instance->physicalDevice; - VkPhysicalDevice phys_dev_h = anv_physical_device_to_handle(phys_dev); + VkPhysicalDevice phys_dev_h = anv_physical_device_to_handle(device->physical); VkResult result; const VkPhysicalDeviceImageFormatInfo2 image_format_info = { @@ -580,7 +563,7 @@ result = anv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h, &image_format_info, &image_format_props); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, device, result, + return vk_errorf(device, device, result, "anv_GetPhysicalDeviceImageFormatProperties2 failed " "inside %s", __func__); } @@ -619,7 +602,7 @@ * gralloc swapchains. */ if (imageUsage != 0) { - return vk_errorf(device->instance, device, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device, device, VK_ERROR_FORMAT_NOT_SUPPORTED, "unsupported VkImageUsageFlags(0x%x) for gralloc " "swapchain", imageUsage); } @@ -650,7 +633,6 @@ return VK_SUCCESS; } - #if ANDROID_API_LEVEL >= 26 VkResult anv_GetSwapchainGrallocUsage2ANDROID( VkDevice device_h, @@ -676,8 +658,23 @@ if (result != VK_SUCCESS) return result; - android_convertGralloc0To1Usage(grallocUsage, grallocProducerUsage, - grallocConsumerUsage); + /* Setup gralloc1 usage flags from gralloc0 flags. */ + + if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_CLIENT_TARGET; + } + + if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + } + + if (grallocUsage & (GRALLOC_USAGE_HW_FB | + GRALLOC_USAGE_HW_COMPOSER | + GRALLOC_USAGE_EXTERNAL_DISP)) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + } return VK_SUCCESS; } @@ -689,9 +686,6 @@ VkImageUsageFlags imageUsage, int* grallocUsage) { - ANV_FROM_HANDLE(anv_device, device, device_h); - struct anv_physical_device *phys_dev = &device->instance->physicalDevice; - VkPhysicalDevice phys_dev_h = anv_physical_device_to_handle(phys_dev); VkResult result; *grallocUsage = 0; @@ -724,7 +718,7 @@ * VkFence. */ if (sync_wait(nativeFenceFd, /*timeout*/ -1) < 0) { - result = vk_errorf(device->instance, device, VK_ERROR_DEVICE_LOST, + result = vk_errorf(device, device, VK_ERROR_DEVICE_LOST, "%s: failed to wait on nativeFenceFd=%d", __func__, nativeFenceFd); } @@ -770,7 +764,7 @@ result = anv_QueueSubmit(anv_queue_to_handle(&device->queue), 1, &submit, fence_h); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, device, result, + return vk_errorf(device, device, result, "anv_QueueSubmit failed inside %s", __func__); } } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_android.h mesa-20.0.8/src/intel/vulkan/anv_android.h --- mesa-19.2.8/src/intel/vulkan/anv_android.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_android.h 2020-06-12 01:21:17.000000000 +0000 @@ -40,7 +40,7 @@ VkResult anv_image_from_external(VkDevice device_h, const VkImageCreateInfo *base_info, - const struct VkExternalMemoryImageCreateInfo *create_info, + const VkExternalMemoryImageCreateInfo *create_info, const VkAllocationCallbacks *alloc, VkImage *out_image_h); diff -Nru mesa-19.2.8/src/intel/vulkan/anv_android_stubs.c mesa-20.0.8/src/intel/vulkan/anv_android_stubs.c --- mesa-19.2.8/src/intel/vulkan/anv_android_stubs.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_android_stubs.c 2020-06-12 01:21:17.000000000 +0000 @@ -59,7 +59,7 @@ VkResult anv_image_from_external(VkDevice device_h, const VkImageCreateInfo *base_info, - const struct VkExternalMemoryImageCreateInfo *create_info, + const VkExternalMemoryImageCreateInfo *create_info, const VkAllocationCallbacks *alloc, VkImage *out_image_h) { diff -Nru mesa-19.2.8/src/intel/vulkan/anv_batch_chain.c mesa-20.0.8/src/intel/vulkan/anv_batch_chain.c --- mesa-19.2.8/src/intel/vulkan/anv_batch_chain.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_batch_chain.c 2020-06-12 01:21:17.000000000 +0000 @@ -46,61 +46,59 @@ * Functions related to anv_reloc_list *-----------------------------------------------------------------------*/ +VkResult +anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + memset(list, 0, sizeof(*list)); + return VK_SUCCESS; +} + static VkResult anv_reloc_list_init_clone(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, const struct anv_reloc_list *other_list) { - if (other_list) { - list->num_relocs = other_list->num_relocs; - list->array_length = other_list->array_length; - } else { - list->num_relocs = 0; - list->array_length = 256; - } + list->num_relocs = other_list->num_relocs; + list->array_length = other_list->array_length; - list->relocs = - vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (list->relocs == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - list->reloc_bos = - vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (list->reloc_bos == NULL) { - vk_free(alloc, list->relocs); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } - - list->deps = _mesa_pointer_set_create(NULL); + if (list->num_relocs > 0) { + list->relocs = + vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->relocs == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - if (!list->deps) { - vk_free(alloc, list->relocs); - vk_free(alloc, list->reloc_bos); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = + vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->reloc_bos == NULL) { + vk_free(alloc, list->relocs); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } - if (other_list) { memcpy(list->relocs, other_list->relocs, list->array_length * sizeof(*list->relocs)); memcpy(list->reloc_bos, other_list->reloc_bos, list->array_length * sizeof(*list->reloc_bos)); - set_foreach(other_list->deps, entry) { - _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key); - } + } else { + list->relocs = NULL; + list->reloc_bos = NULL; } - return VK_SUCCESS; -} + list->dep_words = other_list->dep_words; -VkResult -anv_reloc_list_init(struct anv_reloc_list *list, - const VkAllocationCallbacks *alloc) -{ - return anv_reloc_list_init_clone(list, alloc, NULL); + if (list->dep_words > 0) { + list->deps = + vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + memcpy(list->deps, other_list->deps, + list->dep_words * sizeof(BITSET_WORD)); + } else { + list->deps = NULL; + } + + return VK_SUCCESS; } void @@ -109,7 +107,7 @@ { vk_free(alloc, list->relocs); vk_free(alloc, list->reloc_bos); - _mesa_set_destroy(list->deps, NULL); + vk_free(alloc, list->deps); } static VkResult @@ -120,48 +118,79 @@ if (list->num_relocs + num_additional_relocs <= list->array_length) return VK_SUCCESS; - size_t new_length = list->array_length * 2; + size_t new_length = MAX2(16, list->array_length * 2); while (new_length < list->num_relocs + num_additional_relocs) new_length *= 2; struct drm_i915_gem_relocation_entry *new_relocs = - vk_alloc(alloc, new_length * sizeof(*list->relocs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_realloc(alloc, list->relocs, + new_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (new_relocs == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->relocs = new_relocs; struct anv_bo **new_reloc_bos = - vk_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (new_reloc_bos == NULL) { - vk_free(alloc, new_relocs); + vk_realloc(alloc, list->reloc_bos, + new_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_reloc_bos == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } + list->reloc_bos = new_reloc_bos; - memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs)); - memcpy(new_reloc_bos, list->reloc_bos, - list->num_relocs * sizeof(*list->reloc_bos)); + list->array_length = new_length; - vk_free(alloc, list->relocs); - vk_free(alloc, list->reloc_bos); + return VK_SUCCESS; +} - list->array_length = new_length; - list->relocs = new_relocs; - list->reloc_bos = new_reloc_bos; +static VkResult +anv_reloc_list_grow_deps(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t min_num_words) +{ + if (min_num_words <= list->dep_words) + return VK_SUCCESS; + + uint32_t new_length = MAX2(32, list->dep_words * 2); + while (new_length < min_num_words) + new_length *= 2; + + BITSET_WORD *new_deps = + vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_deps == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + list->deps = new_deps; + + /* Zero out the new data */ + memset(list->deps + list->dep_words, 0, + (new_length - list->dep_words) * sizeof(BITSET_WORD)); + list->dep_words = new_length; return VK_SUCCESS; } +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + VkResult anv_reloc_list_add(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, - uint32_t offset, struct anv_bo *target_bo, uint32_t delta) + uint32_t offset, struct anv_bo *target_bo, uint32_t delta, + uint64_t *address_u64_out) { struct drm_i915_gem_relocation_entry *entry; int index; - if (target_bo->flags & EXEC_OBJECT_PINNED) { - _mesa_set_add(list->deps, target_bo); + struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); + uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); + if (address_u64_out) + *address_u64_out = target_bo_offset + delta; + + if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED) { + assert(!target_bo->is_wrapper); + uint32_t idx = unwrapped_target_bo->gem_handle; + anv_reloc_list_grow_deps(list, alloc, (idx / BITSET_WORDBITS) + 1); + BITSET_SET(list->deps, unwrapped_target_bo->gem_handle); return VK_SUCCESS; } @@ -173,10 +202,10 @@ index = list->num_relocs++; list->reloc_bos[index] = target_bo; entry = &list->relocs[index]; - entry->target_handle = target_bo->gem_handle; + entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */ entry->delta = delta; entry->offset = offset; - entry->presumed_offset = target_bo->offset; + entry->presumed_offset = target_bo_offset; entry->read_domains = 0; entry->write_domain = 0; VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); @@ -184,6 +213,14 @@ return VK_SUCCESS; } +static void +anv_reloc_list_clear(struct anv_reloc_list *list) +{ + list->num_relocs = 0; + if (list->dep_words > 0) + memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); +} + static VkResult anv_reloc_list_append(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, @@ -193,20 +230,22 @@ if (result != VK_SUCCESS) return result; - memcpy(&list->relocs[list->num_relocs], &other->relocs[0], - other->num_relocs * sizeof(other->relocs[0])); - memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], - other->num_relocs * sizeof(other->reloc_bos[0])); - - for (uint32_t i = 0; i < other->num_relocs; i++) - list->relocs[i + list->num_relocs].offset += offset; + if (other->num_relocs > 0) { + memcpy(&list->relocs[list->num_relocs], &other->relocs[0], + other->num_relocs * sizeof(other->relocs[0])); + memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], + other->num_relocs * sizeof(other->reloc_bos[0])); - list->num_relocs += other->num_relocs; + for (uint32_t i = 0; i < other->num_relocs; i++) + list->relocs[i + list->num_relocs].offset += offset; - set_foreach(other->deps, entry) { - _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key); + list->num_relocs += other->num_relocs; } + anv_reloc_list_grow_deps(list, alloc, other->dep_words); + for (uint32_t w = 0; w < other->dep_words; w++) + list->deps[w] |= other->deps[w]; + return VK_SUCCESS; } @@ -237,14 +276,16 @@ anv_batch_emit_reloc(struct anv_batch *batch, void *location, struct anv_bo *bo, uint32_t delta) { + uint64_t address_u64 = 0; VkResult result = anv_reloc_list_add(batch->relocs, batch->alloc, - location - batch->start, bo, delta); + location - batch->start, bo, delta, + &address_u64); if (result != VK_SUCCESS) { anv_batch_set_error(batch, result); return 0; } - return bo->offset + delta; + return address_u64; } void @@ -294,8 +335,8 @@ if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - ANV_CMD_BUFFER_BATCH_SIZE); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + ANV_CMD_BUFFER_BATCH_SIZE, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -308,7 +349,7 @@ return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -327,8 +368,8 @@ if (bbo == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo, - other_bbo->bo.size); + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + other_bbo->bo->size, &bbo->bo); if (result != VK_SUCCESS) goto fail_alloc; @@ -338,14 +379,13 @@ goto fail_bo_alloc; bbo->length = other_bbo->length; - memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length); - + memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); *bbo_out = bbo; return VK_SUCCESS; fail_bo_alloc: - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); fail_alloc: vk_free(&cmd_buffer->pool->alloc, bbo); @@ -356,27 +396,26 @@ anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->next = batch->start = bbo->bo.map; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + batch->next = batch->start = bbo->bo->map; + batch->end = bbo->bo->map + bbo->bo->size - batch_padding; batch->relocs = &bbo->relocs; - bbo->relocs.num_relocs = 0; - _mesa_set_clear(bbo->relocs.deps, NULL); + anv_reloc_list_clear(&bbo->relocs); } static void anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, size_t batch_padding) { - batch->start = bbo->bo.map; - batch->next = bbo->bo.map + bbo->length; - batch->end = bbo->bo.map + bbo->bo.size - batch_padding; + batch->start = bbo->bo->map; + batch->next = bbo->bo->map + bbo->length; + batch->end = bbo->bo->map + bbo->bo->size - batch_padding; batch->relocs = &bbo->relocs; } static void anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); } @@ -386,25 +425,25 @@ struct anv_batch *batch, size_t aditional, size_t batch_padding) { - assert(batch->start == bbo->bo.map); + assert(batch->start == bbo->bo->map); bbo->length = batch->next - batch->start; - size_t new_size = bbo->bo.size; + size_t new_size = bbo->bo->size; while (new_size <= bbo->length + aditional + batch_padding) new_size *= 2; - if (new_size == bbo->bo.size) + if (new_size == bbo->bo->size) return VK_SUCCESS; - struct anv_bo new_bo; + struct anv_bo *new_bo; VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, - &new_bo, new_size); + new_size, &new_bo); if (result != VK_SUCCESS) return result; - memcpy(new_bo.map, bbo->bo.map, bbo->length); + memcpy(new_bo->map, bbo->bo->map, bbo->length); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); bbo->bo = new_bo; anv_batch_bo_continue(bbo, batch, batch_padding); @@ -420,24 +459,24 @@ { const uint32_t bb_start_offset = prev_bbo->length - GEN8_MI_BATCH_BUFFER_START_length * 4; - ASSERTED const uint32_t *bb_start = prev_bbo->bo.map + bb_start_offset; + ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; /* Make sure we're looking at a MI_BATCH_BUFFER_START */ assert(((*bb_start >> 29) & 0x07) == 0); assert(((*bb_start >> 23) & 0x3f) == 49); - if (cmd_buffer->device->instance->physicalDevice.use_softpin) { - assert(prev_bbo->bo.flags & EXEC_OBJECT_PINNED); - assert(next_bbo->bo.flags & EXEC_OBJECT_PINNED); + if (cmd_buffer->device->physical->use_softpin) { + assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED); + assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED); write_reloc(cmd_buffer->device, - prev_bbo->bo.map + bb_start_offset + 4, - next_bbo->bo.offset + next_bbo_offset, true); + prev_bbo->bo->map + bb_start_offset + 4, + next_bbo->bo->offset + next_bbo_offset, true); } else { uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); - prev_bbo->relocs.reloc_bos[reloc_idx] = &next_bbo->bo; + prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; /* Use a bogus presumed offset to force a relocation */ @@ -450,7 +489,7 @@ struct anv_cmd_buffer *cmd_buffer) { anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc); - anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); vk_free(&cmd_buffer->pool->alloc, bbo); } @@ -549,9 +588,9 @@ * chaining command, let's set it back where it should go. */ batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(batch->end == current_bbo->bo.map + current_bbo->bo.size); + assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); - emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0); + emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); anv_batch_bo_finish(current_bbo, batch); } @@ -670,22 +709,20 @@ uint32_t entries, uint32_t *state_offset) { struct anv_device *device = cmd_buffer->device; - struct anv_state_pool *state_pool = &device->surface_state_pool; struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); - struct anv_state state; - state.alloc_size = align_u32(entries * 4, 32); + uint32_t bt_size = align_u32(entries * 4, 32); - if (cmd_buffer->bt_next + state.alloc_size > state_pool->block_size) + struct anv_state state = cmd_buffer->bt_next; + if (bt_size > state.alloc_size) return (struct anv_state) { 0 }; - state.offset = cmd_buffer->bt_next; - state.map = anv_block_pool_map(&anv_binding_table_pool(device)->block_pool, - bt_block->offset + state.offset); - - cmd_buffer->bt_next += state.alloc_size; + state.alloc_size = bt_size; + cmd_buffer->bt_next.offset += bt_size; + cmd_buffer->bt_next.map += bt_size; + cmd_buffer->bt_next.alloc_size -= bt_size; - if (device->instance->physicalDevice.use_softpin) { + if (device->physical->use_softpin) { assert(bt_block->offset >= 0); *state_offset = device->surface_state_pool.block_pool.start_address - device->binding_table_pool.block_pool.start_address - bt_block->offset; @@ -723,7 +760,12 @@ } *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); - cmd_buffer->bt_next = 0; + + /* The bt_next state is a rolling state (we update it as we suballocate + * from it) which is relative to the start of the binding table block. + */ + cmd_buffer->bt_next = *bt_block; + cmd_buffer->bt_next.offset = 0; return VK_SUCCESS; } @@ -815,13 +857,13 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) { /* Delete all but the first batch bo */ - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); list_del(&bbo->link); anv_batch_bo_destroy(bbo, cmd_buffer); } - assert(!list_empty(&cmd_buffer->batch_bos)); + assert(!list_is_empty(&cmd_buffer->batch_bos)); anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), &cmd_buffer->batch, @@ -832,10 +874,10 @@ anv_binding_table_pool_free(cmd_buffer->device, *bt_block); } assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); - cmd_buffer->bt_next = 0; + cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states); + cmd_buffer->bt_next.offset = 0; - cmd_buffer->surface_relocs.num_relocs = 0; - _mesa_set_clear(cmd_buffer->surface_relocs.deps, NULL); + anv_reloc_list_clear(&cmd_buffer->surface_relocs); cmd_buffer->last_ss_pool_center = 0; /* Reset the list of seen buffers */ @@ -859,7 +901,7 @@ * with our BATCH_BUFFER_END in another BO. */ cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END, bbe); @@ -900,11 +942,11 @@ * chaining command, let's set it back where it should go. */ cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4; - assert(cmd_buffer->batch.start == batch_bo->bo.map); - assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size); + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); - emit_batch_buffer_start(cmd_buffer, &batch_bo->bo, 0); - assert(cmd_buffer->batch.start == batch_bo->bo.map); + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + assert(cmd_buffer->batch.start == batch_bo->bo->map); } else { cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; } @@ -950,10 +992,10 @@ struct anv_batch_bo *last_bbo = list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); - emit_batch_buffer_start(primary, &first_bbo->bo, 0); + emit_batch_buffer_start(primary, first_bbo->bo, 0); struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); - assert(primary->batch.start == this_bbo->bo.map); + assert(primary->batch.start == this_bbo->bo->map); uint32_t offset = primary->batch.next - primary->batch.start; /* Make the tail of the secondary point back to right after the @@ -1007,10 +1049,8 @@ bool has_relocs; - uint32_t fence_count; - uint32_t fence_array_length; - struct drm_i915_gem_exec_fence * fences; - struct anv_syncobj ** syncobjs; + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; }; static void @@ -1020,39 +1060,30 @@ } static void -anv_execbuf_finish(struct anv_execbuf *exec, - const VkAllocationCallbacks *alloc) +anv_execbuf_finish(struct anv_execbuf *exec) { - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); - vk_free(alloc, exec->fences); - vk_free(alloc, exec->syncobjs); -} - -static int -_compare_bo_handles(const void *_bo1, const void *_bo2) -{ - struct anv_bo * const *bo1 = _bo1; - struct anv_bo * const *bo2 = _bo2; - - return (*bo1)->gem_handle - (*bo2)->gem_handle; + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); } static VkResult -anv_execbuf_add_bo_set(struct anv_execbuf *exec, - struct set *deps, - uint32_t extra_flags, - const VkAllocationCallbacks *alloc); +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); static VkResult -anv_execbuf_add_bo(struct anv_execbuf *exec, +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, struct anv_bo *bo, struct anv_reloc_list *relocs, - uint32_t extra_flags, - const VkAllocationCallbacks *alloc) + uint32_t extra_flags) { struct drm_i915_gem_exec_object2 *obj = NULL; + bo = anv_bo_unwrap(bo); + if (bo->index < exec->bo_count && exec->bos[bo->index] == bo) obj = &exec->objects[bo->index]; @@ -1064,16 +1095,14 @@ uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; struct drm_i915_gem_exec_object2 *new_objects = - vk_alloc(alloc, new_len * sizeof(*new_objects), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); if (new_objects == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); struct anv_bo **new_bos = - vk_alloc(alloc, new_len * sizeof(*new_bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); if (new_bos == NULL) { - vk_free(alloc, new_objects); + vk_free(exec->alloc, new_objects); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } @@ -1084,8 +1113,8 @@ exec->bo_count * sizeof(*new_bos)); } - vk_free(alloc, exec->objects); - vk_free(alloc, exec->bos); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); exec->objects = new_objects; exec->bos = new_bos; @@ -1103,11 +1132,16 @@ obj->relocs_ptr = 0; obj->alignment = 0; obj->offset = bo->offset; - obj->flags = (bo->flags & ~ANV_BO_FLAG_MASK) | extra_flags; + obj->flags = bo->flags | extra_flags; obj->rsvd1 = 0; obj->rsvd2 = 0; } + if (extra_flags & EXEC_OBJECT_WRITE) { + obj->flags |= EXEC_OBJECT_WRITE; + obj->flags &= ~EXEC_OBJECT_ASYNC; + } + if (relocs != NULL) { assert(obj->relocation_count == 0); @@ -1125,15 +1159,15 @@ /* A quick sanity check on relocations */ assert(relocs->relocs[i].offset < bo->size); - result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, - extra_flags, alloc); - + result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], + NULL, extra_flags); if (result != VK_SUCCESS) return result; } } - return anv_execbuf_add_bo_set(exec, relocs->deps, extra_flags, alloc); + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); } return VK_SUCCESS; @@ -1141,66 +1175,26 @@ /* Add BO dependencies to execbuf */ static VkResult -anv_execbuf_add_bo_set(struct anv_execbuf *exec, - struct set *deps, - uint32_t extra_flags, - const VkAllocationCallbacks *alloc) -{ - if (!deps || deps->entries <= 0) - return VK_SUCCESS; - - const uint32_t entries = deps->entries; - struct anv_bo **bos = - vk_alloc(alloc, entries * sizeof(*bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (bos == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - struct anv_bo **bo = bos; - set_foreach(deps, entry) { - *bo++ = (void *)entry->key; - } - - qsort(bos, entries, sizeof(struct anv_bo*), _compare_bo_handles); - - VkResult result = VK_SUCCESS; - for (bo = bos; bo < bos + entries; bo++) { - result = anv_execbuf_add_bo(exec, *bo, NULL, extra_flags, alloc); - if (result != VK_SUCCESS) - break; - } - - vk_free(alloc, bos); - - return result; -} - -static VkResult -anv_execbuf_add_syncobj(struct anv_execbuf *exec, - uint32_t handle, uint32_t flags, - const VkAllocationCallbacks *alloc) -{ - assert(flags != 0); - - if (exec->fence_count >= exec->fence_array_length) { - uint32_t new_len = MAX2(exec->fence_array_length * 2, 64); - - exec->fences = vk_realloc(alloc, exec->fences, - new_len * sizeof(*exec->fences), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (exec->fences == NULL) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - - exec->fence_array_length = new_len; +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) +{ + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } } - exec->fences[exec->fence_count] = (struct drm_i915_gem_exec_fence) { - .handle = handle, - .flags = flags, - }; - - exec->fence_count++; - return VK_SUCCESS; } @@ -1209,7 +1203,7 @@ struct anv_reloc_list *list) { for (size_t i = 0; i < list->num_relocs; i++) - list->relocs[i].target_handle = list->reloc_bos[i]->index; + list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index; } static void @@ -1236,6 +1230,7 @@ struct anv_reloc_list *relocs, uint32_t last_pool_center_bo_offset) { + assert(!from_bo->is_wrapper); assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; @@ -1274,8 +1269,10 @@ struct anv_bo *bo, bool always_relocate) { + bo = anv_bo_unwrap(bo); + for (size_t i = 0; i < list->num_relocs; i++) { - struct anv_bo *target_bo = list->reloc_bos[i]; + struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); if (list->relocs[i].presumed_offset == target_bo->offset && !always_relocate) continue; @@ -1344,6 +1341,7 @@ * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. */ for (uint32_t i = 0; i < exec->bo_count; i++) { + assert(!exec->bos[i]->is_wrapper); if (exec->bos[i]->offset == (uint64_t)-1) return false; } @@ -1353,8 +1351,10 @@ * what address is actually written in the surface state object at any * given time. The only option is to always relocate them. */ + struct anv_bo *surface_state_bo = + anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, - cmd_buffer->device->surface_state_pool.block_pool.bo, + surface_state_bo, true /* always relocate surface states */); /* Since we own all of the batch buffers, we know what values are stored @@ -1364,7 +1364,7 @@ struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { anv_reloc_list_apply(cmd_buffer->device, - &(*bbo)->relocs, &(*bbo)->bo, false); + &(*bbo)->relocs, (*bbo)->bo, false); } for (uint32_t i = 0; i < exec->bo_count; i++) @@ -1384,23 +1384,23 @@ adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, cmd_buffer->last_ss_pool_center); VkResult result; - struct anv_bo *bo; - if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + if (cmd_buffer->device->physical->use_softpin) { anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { - result = anv_execbuf_add_bo(execbuf, bo, NULL, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); if (result != VK_SUCCESS) return result; } /* Add surface dependencies (BOs) to the execbuf */ - anv_execbuf_add_bo_set(execbuf, cmd_buffer->surface_relocs.deps, 0, - &cmd_buffer->device->alloc); + anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); /* Add the BOs for all memory objects */ list_for_each_entry(struct anv_device_memory, mem, &cmd_buffer->device->memory_objects, link) { - result = anv_execbuf_add_bo(execbuf, mem->bo, NULL, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + mem->bo, NULL, 0); if (result != VK_SUCCESS) return result; } @@ -1408,24 +1408,24 @@ struct anv_block_pool *pool; pool = &cmd_buffer->device->dynamic_state_pool.block_pool; anv_block_pool_foreach_bo(bo, pool) { - result = anv_execbuf_add_bo(execbuf, bo, NULL, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); if (result != VK_SUCCESS) return result; } pool = &cmd_buffer->device->instruction_state_pool.block_pool; anv_block_pool_foreach_bo(bo, pool) { - result = anv_execbuf_add_bo(execbuf, bo, NULL, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); if (result != VK_SUCCESS) return result; } pool = &cmd_buffer->device->binding_table_pool.block_pool; anv_block_pool_foreach_bo(bo, pool) { - result = anv_execbuf_add_bo(execbuf, bo, NULL, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + bo, NULL, 0); if (result != VK_SUCCESS) return result; } @@ -1435,9 +1435,9 @@ * buffer. We have to add the surface state BO manually because it has * relocations of its own that we need to be sure are processsed. */ - result = anv_execbuf_add_bo(execbuf, ss_pool->block_pool.bo, - &cmd_buffer->surface_relocs, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + ss_pool->block_pool.bo, + &cmd_buffer->surface_relocs, 0); if (result != VK_SUCCESS) return result; } @@ -1447,11 +1447,11 @@ */ struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { - adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs, + adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, cmd_buffer->last_ss_pool_center); - result = anv_execbuf_add_bo(execbuf, &(*bbo)->bo, &(*bbo)->relocs, 0, - &cmd_buffer->device->alloc); + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); if (result != VK_SUCCESS) return result; } @@ -1470,24 +1470,24 @@ * corresponding to the first batch_bo in the chain with the last * element in the list. */ - if (first_batch_bo->bo.index != execbuf->bo_count - 1) { - uint32_t idx = first_batch_bo->bo.index; + if (first_batch_bo->bo->index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->index; uint32_t last_idx = execbuf->bo_count - 1; struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; - assert(execbuf->bos[idx] == &first_batch_bo->bo); + assert(execbuf->bos[idx] == first_batch_bo->bo); execbuf->objects[idx] = execbuf->objects[last_idx]; execbuf->bos[idx] = execbuf->bos[last_idx]; execbuf->bos[idx]->index = idx; execbuf->objects[last_idx] = tmp_obj; - execbuf->bos[last_idx] = &first_batch_bo->bo; - first_batch_bo->bo.index = last_idx; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->index = last_idx; } /* If we are pinning our BOs, we shouldn't have to relocate anything */ - if (cmd_buffer->device->instance->physicalDevice.use_softpin) + if (cmd_buffer->device->physical->use_softpin) assert(!execbuf->has_relocs); /* Now we go through and fixup all of the relocation lists to point to @@ -1505,7 +1505,7 @@ __builtin_ia32_mfence(); u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE) - __builtin_ia32_clflush((*bbo)->bo.map + i); + __builtin_ia32_clflush((*bbo)->bo->map + i); } } @@ -1569,8 +1569,9 @@ static VkResult setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device) { - VkResult result = anv_execbuf_add_bo(execbuf, &device->trivial_batch_bo, - NULL, 0, &device->alloc); + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); if (result != VK_SUCCESS) return result; @@ -1587,245 +1588,128 @@ return VK_SUCCESS; } +/* We lock around execbuf for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a different + * center offset but we re-use the same anv_bo struct when we do so. If + * this happens in the middle of setting up an execbuf, we could end up + * with our list of BOs out of sync with our list of gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. While the client is supposed to syncronize around + * QueueSubmit, this would be extremely difficult to debug if it ever came + * up in the wild due to a broken app. It's better to play it safe and + * just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to ensure + * this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ VkResult -anv_cmd_buffer_execbuf(struct anv_device *device, - struct anv_cmd_buffer *cmd_buffer, - const VkSemaphore *in_semaphores, - uint32_t num_in_semaphores, - const VkSemaphore *out_semaphores, - uint32_t num_out_semaphores, - VkFence _fence) +anv_queue_execbuf_locked(struct anv_queue *queue, + struct anv_queue_submit *submit) { - ANV_FROM_HANDLE(anv_fence, fence, _fence); - UNUSED struct anv_physical_device *pdevice = &device->instance->physicalDevice; - + struct anv_device *device = queue->device; struct anv_execbuf execbuf; anv_execbuf_init(&execbuf); + execbuf.alloc = submit->alloc; + execbuf.alloc_scope = submit->alloc_scope; - int in_fence = -1; - VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < num_in_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); - struct anv_semaphore_impl *impl = - semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? - &semaphore->temporary : &semaphore->permanent; - - switch (impl->type) { - case ANV_SEMAPHORE_TYPE_BO: - assert(!pdevice->has_syncobj); - result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL, - 0, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_SEMAPHORE_TYPE_SYNC_FILE: - assert(!pdevice->has_syncobj); - if (in_fence == -1) { - in_fence = impl->fd; - if (in_fence == -1) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - impl->fd = -1; - } else { - int merge = anv_gem_sync_file_merge(device, in_fence, impl->fd); - if (merge == -1) - return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); - - close(impl->fd); - close(in_fence); - impl->fd = -1; - in_fence = merge; - } - break; + VkResult result; - case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_WAIT, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; + for (uint32_t i = 0; i < submit->fence_bo_count; i++) { + int signaled; + struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled); - default: - break; - } + result = anv_execbuf_add_bo(device, &execbuf, bo, NULL, + signaled ? EXEC_OBJECT_WRITE : 0); + if (result != VK_SUCCESS) + goto error; } - bool need_out_fence = false; - for (uint32_t i = 0; i < num_out_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); - - /* Under most circumstances, out fences won't be temporary. However, - * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." - * - * The spec says nothing whatsoever about signal operations on - * temporarily imported semaphores so it appears they are allowed. - * There are also CTS tests that require this to work. - */ - struct anv_semaphore_impl *impl = - semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? - &semaphore->temporary : &semaphore->permanent; - - switch (impl->type) { - case ANV_SEMAPHORE_TYPE_BO: - assert(!pdevice->has_syncobj); - result = anv_execbuf_add_bo(&execbuf, impl->bo, NULL, - EXEC_OBJECT_WRITE, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_SEMAPHORE_TYPE_SYNC_FILE: - assert(!pdevice->has_syncobj); - need_out_fence = true; - break; - - case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_SIGNAL, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; + if (submit->cmd_buffer) { + result = setup_execbuf_for_cmd_buffer(&execbuf, submit->cmd_buffer); + } else if (submit->simple_bo) { + result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; - default: - break; - } + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = submit->simple_bo_size, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + } else { + result = setup_empty_execbuf(&execbuf, queue->device); } - if (fence) { - /* Under most circumstances, out fences won't be temporary. However, - * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." - * - * The spec says nothing whatsoever about signal operations on - * temporarily imported semaphores so it appears they are allowed. - * There are also CTS tests that require this to work. - */ - struct anv_fence_impl *impl = - fence->temporary.type != ANV_FENCE_TYPE_NONE ? - &fence->temporary : &fence->permanent; - - switch (impl->type) { - case ANV_FENCE_TYPE_BO: - assert(!pdevice->has_syncobj_wait); - result = anv_execbuf_add_bo(&execbuf, &impl->bo.bo, NULL, - EXEC_OBJECT_WRITE, &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - case ANV_FENCE_TYPE_SYNCOBJ: - result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj, - I915_EXEC_FENCE_SIGNAL, - &device->alloc); - if (result != VK_SUCCESS) - return result; - break; - - default: - unreachable("Invalid fence type"); - } - } + if (result != VK_SUCCESS) + goto error; - if (cmd_buffer) { - if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { - struct anv_batch_bo **bo = u_vector_tail(&cmd_buffer->seen_bbos); - - device->cmd_buffer_being_decoded = cmd_buffer; - gen_print_batch(&device->decoder_ctx, (*bo)->bo.map, - (*bo)->bo.size, (*bo)->bo.offset, false); + if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { + if (submit->cmd_buffer) { + struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos); + + device->cmd_buffer_being_decoded = submit->cmd_buffer; + gen_print_batch(&device->decoder_ctx, (*bo)->bo->map, + (*bo)->bo->size, (*bo)->bo->offset, false); device->cmd_buffer_being_decoded = NULL; + } else if (submit->simple_bo) { + gen_print_batch(&device->decoder_ctx, submit->simple_bo->map, + submit->simple_bo->size, submit->simple_bo->offset, false); + } else { + gen_print_batch(&device->decoder_ctx, + device->trivial_batch_bo->map, + device->trivial_batch_bo->size, + device->trivial_batch_bo->offset, false); } - - result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer); - } else { - result = setup_empty_execbuf(&execbuf, device); } - if (result != VK_SUCCESS) - return result; - - if (execbuf.fence_count > 0) { - assert(device->instance->physicalDevice.has_syncobj); + if (submit->fence_count > 0) { + assert(device->physical->has_syncobj); execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; - execbuf.execbuf.num_cliprects = execbuf.fence_count; - execbuf.execbuf.cliprects_ptr = (uintptr_t) execbuf.fences; + execbuf.execbuf.num_cliprects = submit->fence_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences; } - if (in_fence != -1) { + if (submit->in_fence != -1) { execbuf.execbuf.flags |= I915_EXEC_FENCE_IN; - execbuf.execbuf.rsvd2 |= (uint32_t)in_fence; + execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence; } - if (need_out_fence) + if (submit->need_out_fence) execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT; - result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos); - - /* Execbuf does not consume the in_fence. It's our job to close it. */ - if (in_fence != -1) - close(in_fence); - - for (uint32_t i = 0; i < num_in_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); - /* From the Vulkan 1.0.53 spec: - * - * "If the import is temporary, the implementation must restore the - * semaphore to its prior permanent state after submitting the next - * semaphore wait operation." - * - * This has to happen after the execbuf in case we close any syncobjs in - * the process. - */ - anv_semaphore_reset_temporary(device, semaphore); - } + int ret = queue->device->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = anv_queue_set_lost(queue, "execbuf2 failed: %m"); - if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) { - assert(!pdevice->has_syncobj_wait); - /* BO fences can't be shared, so they can't be temporary. */ - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); - - /* Once the execbuf has returned, we need to set the fence state to - * SUBMITTED. We can't do this before calling execbuf because - * anv_GetFenceStatus does take the global device lock before checking - * fence->state. - * - * We set the fence state to SUBMITTED regardless of whether or not the - * execbuf succeeds because we need to ensure that vkWaitForFences() and - * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or - * VK_SUCCESS) in a finite amount of time even if execbuf fails. - */ - fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED; + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; } - if (result == VK_SUCCESS && need_out_fence) { - assert(!pdevice->has_syncobj_wait); - int out_fence = execbuf.execbuf.rsvd2 >> 32; - for (uint32_t i = 0; i < num_out_semaphores; i++) { - ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); - /* Out fences can't have temporary state because that would imply - * that we imported a sync file and are trying to signal it. - */ - assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE); - struct anv_semaphore_impl *impl = &semaphore->permanent; + if (result == VK_SUCCESS && submit->need_out_fence) + submit->out_fence = execbuf.execbuf.rsvd2 >> 32; - if (impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE) { - assert(impl->fd == -1); - impl->fd = dup(out_fence); - } - } - close(out_fence); - } + error: + pthread_cond_broadcast(&device->queue_submit); - anv_execbuf_finish(&execbuf, &device->alloc); + anv_execbuf_finish(&execbuf); return result; } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_blorp.c mesa-20.0.8/src/intel/vulkan/anv_blorp.c --- mesa-19.2.8/src/intel/vulkan/anv_blorp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_blorp.c 2020-06-12 01:21:17.000000000 +0000 @@ -94,7 +94,7 @@ anv_device_init_blorp(struct anv_device *device) { blorp_init(&device->blorp, device, &device->isl_dev); - device->blorp.compiler = device->instance->physicalDevice.compiler; + device->blorp.compiler = device->physical->compiler; device->blorp.lookup_shader = lookup_blorp_shader; device->blorp.upload_shader = upload_blorp_shader; switch (device->info.gen) { @@ -117,6 +117,9 @@ case 11: device->blorp.exec = gen11_blorp_exec; break; + case 12: + device->blorp.exec = gen12_blorp_exec; + break; default: unreachable("Unknown hardware generation"); } @@ -196,14 +199,18 @@ get_blorp_surf_for_anv_image(const struct anv_device *device, const struct anv_image *image, VkImageAspectFlags aspect, + VkImageUsageFlags usage, VkImageLayout layout, enum isl_aux_usage aux_usage, struct blorp_surf *blorp_surf) { uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) - aux_usage = anv_layout_to_aux_usage(&device->info, image, aspect, layout); + if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) { + assert(usage != 0); + aux_usage = anv_layout_to_aux_usage(&device->info, image, + aspect, usage, layout); + } const struct anv_surface *surface = &image->planes[plane].surface; *blorp_surf = (struct blorp_surf) { @@ -235,16 +242,23 @@ const struct anv_address clear_color_addr = anv_image_get_clear_color_addr(device, image, aspect); blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); - } else if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT - && device->info.gen >= 10) { - /* Vulkan always clears to 1.0. On gen < 10, we set that directly in - * the state packet. For gen >= 10, must provide the clear value in a - * buffer. We have a single global buffer that stores the 1.0 value. - */ - const struct anv_address clear_color_addr = (struct anv_address) { - .bo = (struct anv_bo *)&device->hiz_clear_bo - }; - blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); + } else if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) { + if (device->info.gen >= 10) { + /* Vulkan always clears to 1.0. On gen < 10, we set that directly + * in the state packet. For gen >= 10, must provide the clear + * value in a buffer. We have a single global buffer that stores + * the 1.0 value. + */ + const struct anv_address clear_color_addr = (struct anv_address) { + .bo = device->hiz_clear_bo, + }; + blorp_surf->clear_color_addr = + anv_to_blorp_address(clear_color_addr); + } else { + blorp_surf->clear_color = (union isl_color_value) { + .f32 = { ANV_HZ_FC_VAL }, + }; + } } } } @@ -329,10 +343,12 @@ struct blorp_surf src_surf, dst_surf; get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, srcImageLayout, ISL_AUX_USAGE_NONE, &src_surf); get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, dstImageLayout, ISL_AUX_USAGE_NONE, &dst_surf); anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, @@ -365,9 +381,11 @@ } else { struct blorp_surf src_surf, dst_surf; get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, src_mask, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, srcImageLayout, ISL_AUX_USAGE_NONE, &src_surf); get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, dst_mask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, dstImageLayout, ISL_AUX_USAGE_NONE, &dst_surf); anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, dst_mask, @@ -400,6 +418,24 @@ blorp_batch_finish(&batch); } +static enum isl_format +isl_format_for_size(unsigned size_B) +{ + /* Prefer 32-bit per component formats for CmdFillBuffer */ + switch (size_B) { + case 1: return ISL_FORMAT_R8_UINT; + case 2: return ISL_FORMAT_R16_UINT; + case 3: return ISL_FORMAT_R8G8B8_UINT; + case 4: return ISL_FORMAT_R32_UINT; + case 6: return ISL_FORMAT_R16G16B16_UINT; + case 8: return ISL_FORMAT_R32G32_UINT; + case 12: return ISL_FORMAT_R32G32B32_UINT; + case 16: return ISL_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Unknown format size"); + } +} + static void copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer, struct anv_buffer *anv_buffer, @@ -433,6 +469,9 @@ const VkImageAspectFlags aspect = pRegions[r].imageSubresource.aspectMask; get_blorp_surf_for_anv_image(cmd_buffer->device, anv_image, aspect, + buffer_to_image ? + VK_IMAGE_USAGE_TRANSFER_DST_BIT : + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, image_layout, ISL_AUX_USAGE_NONE, &image.surf); image.offset = @@ -447,38 +486,63 @@ anv_get_layerCount(anv_image, &pRegions[r].imageSubresource); } - const enum isl_format buffer_format = + const enum isl_format linear_format = anv_get_isl_format(&cmd_buffer->device->info, anv_image->vk_format, aspect, VK_IMAGE_TILING_LINEAR); + const struct isl_format_layout *linear_fmtl = + isl_format_get_layout(linear_format); - const VkExtent3D bufferImageExtent = { - .width = pRegions[r].bufferRowLength ? - pRegions[r].bufferRowLength : extent.width, - .height = pRegions[r].bufferImageHeight ? - pRegions[r].bufferImageHeight : extent.height, - }; - - const struct isl_format_layout *buffer_fmtl = - isl_format_get_layout(buffer_format); + const uint32_t buffer_row_length = + pRegions[r].bufferRowLength ? + pRegions[r].bufferRowLength : extent.width; + + const uint32_t buffer_image_height = + pRegions[r].bufferImageHeight ? + pRegions[r].bufferImageHeight : extent.height; const uint32_t buffer_row_pitch = - DIV_ROUND_UP(bufferImageExtent.width, buffer_fmtl->bw) * - (buffer_fmtl->bpb / 8); + DIV_ROUND_UP(buffer_row_length, linear_fmtl->bw) * + (linear_fmtl->bpb / 8); const uint32_t buffer_layer_stride = - DIV_ROUND_UP(bufferImageExtent.height, buffer_fmtl->bh) * + DIV_ROUND_UP(buffer_image_height, linear_fmtl->bh) * buffer_row_pitch; + /* Some formats have additional restrictions which may cause ISL to + * fail to create a surface for us. Some examples include: + * + * 1. ASTC formats are not allowed to be LINEAR and must be tiled + * 2. YCbCr formats have to have 2-pixel aligned strides + * + * To avoid these issues, we always bind the buffer as if it's a + * "normal" format like RGBA32_UINT. Since we're using blorp_copy, + * the format doesn't matter as long as it has the right bpb. + */ + const VkExtent2D buffer_extent = { + .width = DIV_ROUND_UP(extent.width, linear_fmtl->bw), + .height = DIV_ROUND_UP(extent.height, linear_fmtl->bh), + }; + const enum isl_format buffer_format = + isl_format_for_size(linear_fmtl->bpb / 8); + struct isl_surf buffer_isl_surf; get_blorp_surf_for_anv_buffer(cmd_buffer->device, anv_buffer, pRegions[r].bufferOffset, - extent.width, extent.height, + buffer_extent.width, buffer_extent.height, buffer_row_pitch, buffer_format, &buffer.surf, &buffer_isl_surf); bool dst_has_shadow = false; struct blorp_surf dst_shadow_surf; if (&image == dst) { + /* In this case, the source is the buffer and, since blorp takes its + * copy dimensions in terms of the source format, we have to use the + * scaled down version for compressed textures because the source + * format is an RGB format. + */ + extent.width = buffer_extent.width; + extent.height = buffer_extent.height; + anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image, aspect, dst->surf.aux_usage, dst->level, @@ -610,9 +674,11 @@ anv_foreach_image_aspect_bit(aspect_bit, src_image, src_res->aspectMask) { get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, srcImageLayout, ISL_AUX_USAGE_NONE, &src); get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, dstImageLayout, ISL_AUX_USAGE_NONE, &dst); struct anv_format_plane src_format = @@ -688,18 +754,6 @@ blorp_batch_finish(&batch); } -static enum isl_format -isl_format_for_size(unsigned size_B) -{ - switch (size_B) { - case 4: return ISL_FORMAT_R32_UINT; - case 8: return ISL_FORMAT_R32G32_UINT; - case 16: return ISL_FORMAT_R32G32B32A32_UINT; - default: - unreachable("Not a power-of-two format size"); - } -} - /** * Returns the greatest common divisor of a and b that is a power of two. */ @@ -792,7 +846,7 @@ struct blorp_address src = { .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = tmp_data.offset, - .mocs = cmd_buffer->device->default_mocs, + .mocs = cmd_buffer->device->isl_dev.mocs.internal, }; struct blorp_address dst = { .buffer = dst_buffer->address.bo, @@ -927,6 +981,7 @@ struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, pRanges[r].aspectMask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, imageLayout, ISL_AUX_USAGE_NONE, &surf); struct anv_format_plane src_format = @@ -980,6 +1035,7 @@ if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, imageLayout, ISL_AUX_USAGE_NONE, &depth); } else { memset(&depth, 0, sizeof(depth)); @@ -989,6 +1045,7 @@ if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_STENCIL_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, imageLayout, ISL_AUX_USAGE_NONE, &stencil); has_stencil_shadow = @@ -1294,7 +1351,7 @@ struct blorp_surf src_surf, dst_surf; get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, aspect, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, src_aux_usage, &src_surf); if (src_aux_usage == ISL_AUX_USAGE_MCS) { src_surf.clear_color_addr = anv_to_blorp_address( @@ -1302,7 +1359,7 @@ VK_IMAGE_ASPECT_COLOR_BIT)); } get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, aspect, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, dst_aux_usage, &dst_surf); anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, aspect, dst_aux_usage, @@ -1364,10 +1421,14 @@ pRegions[r].srcSubresource.aspectMask) { enum isl_aux_usage src_aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, src_image, - (1 << aspect_bit), srcImageLayout); + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + srcImageLayout); enum isl_aux_usage dst_aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_image, - (1 << aspect_bit), dstImageLayout); + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dstImageLayout); anv_image_msaa_resolve(cmd_buffer, src_image, src_aux_usage, @@ -1388,17 +1449,6 @@ } } -static enum isl_aux_usage -fast_clear_aux_usage(const struct anv_image *image, - VkImageAspectFlagBits aspect) -{ - uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) - return ISL_AUX_USAGE_CCS_D; - else - return image->planes[plane].aux_usage; -} - void anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, @@ -1421,6 +1471,7 @@ struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, VK_IMAGE_LAYOUT_GENERAL, ISL_AUX_USAGE_NONE, &surf); assert(surf.aux_usage == ISL_AUX_USAGE_NONE); @@ -1476,7 +1527,7 @@ struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, aux_usage, &surf); anv_cmd_buffer_mark_image_written(cmd_buffer, image, aspect, aux_usage, level, base_layer, layer_count); @@ -1511,7 +1562,7 @@ if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_DEPTH_BIT, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, depth_aux_usage, &depth); depth.clear_color.f32[0] = ANV_HZ_FC_VAL; } @@ -1520,10 +1571,17 @@ if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_STENCIL_BIT, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, ISL_AUX_USAGE_NONE, &stencil); } + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the depth + * cache before rendering to it. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + blorp_clear_depth_stencil(&batch, &depth, &stencil, level, base_layer, layer_count, area.offset.x, area.offset.y, @@ -1534,6 +1592,13 @@ (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) ? 0xff : 0, stencil_value); + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the render + * cache before someone starts trying to do stencil on it. + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; + struct blorp_surf stencil_shadow; if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image, @@ -1572,7 +1637,7 @@ struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_DEPTH_BIT, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, ISL_AUX_USAGE_HIZ, &surf); surf.clear_color.f32[0] = ANV_HZ_FC_VAL; @@ -1601,7 +1666,7 @@ anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level)); get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_DEPTH_BIT, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, ISL_AUX_USAGE_HIZ, &depth); depth.clear_color.f32[0] = ANV_HZ_FC_VAL; } @@ -1610,7 +1675,7 @@ if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { get_blorp_surf_for_anv_image(cmd_buffer->device, image, VK_IMAGE_ASPECT_STENCIL_BIT, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, ISL_AUX_USAGE_NONE, &stencil); } @@ -1667,7 +1732,7 @@ void anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, - enum isl_format format, + enum isl_format format, struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op mcs_op, union isl_color_value *clear_value, @@ -1682,26 +1747,20 @@ struct blorp_batch batch; blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, - predicate ? BLORP_BATCH_PREDICATE_ENABLE : 0); + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, ISL_AUX_USAGE_MCS, &surf); /* Blorp will store the clear color for us if we provide the clear color * address and we are doing a fast clear. So we save the clear value into - * the blorp surface. However, in some situations we want to do a fast clear - * without changing the clear value stored in the state buffer. For those - * cases, we set the clear color address pointer to NULL, so blorp will not - * try to store a garbage color. + * the blorp surface. */ - if (mcs_op == ISL_AUX_OP_FAST_CLEAR) { - if (clear_value) - surf.clear_color = *clear_value; - else - surf.clear_color_addr.buffer = NULL; - } + if (clear_value) + surf.clear_color = *clear_value; /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": * @@ -1719,11 +1778,11 @@ * that it is completed before any additional drawing occurs. */ cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; switch (mcs_op) { case ISL_AUX_OP_FAST_CLEAR: - blorp_fast_clear(&batch, &surf, format, + blorp_fast_clear(&batch, &surf, format, swizzle, 0, base_layer, layer_count, 0, 0, image->extent.width, image->extent.height); break; @@ -1738,7 +1797,7 @@ } cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; blorp_batch_finish(&batch); } @@ -1746,7 +1805,7 @@ void anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, - enum isl_format format, + enum isl_format format, struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t level, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op ccs_op, union isl_color_value *clear_value, @@ -1768,27 +1827,21 @@ struct blorp_batch batch; blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, - predicate ? BLORP_BATCH_PREDICATE_ENABLE : 0); + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); struct blorp_surf surf; get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, - ANV_IMAGE_LAYOUT_EXPLICIT_AUX, - fast_clear_aux_usage(image, aspect), + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &surf); /* Blorp will store the clear color for us if we provide the clear color * address and we are doing a fast clear. So we save the clear value into - * the blorp surface. However, in some situations we want to do a fast clear - * without changing the clear value stored in the state buffer. For those - * cases, we set the clear color address pointer to NULL, so blorp will not - * try to store a garbage color. + * the blorp surface. */ - if (ccs_op == ISL_AUX_OP_FAST_CLEAR) { - if (clear_value) - surf.clear_color = *clear_value; - else - surf.clear_color_addr.buffer = NULL; - } + if (clear_value) + surf.clear_color = *clear_value; /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": * @@ -1806,11 +1859,11 @@ * that it is completed before any additional drawing occurs. */ cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; switch (ccs_op) { case ISL_AUX_OP_FAST_CLEAR: - blorp_fast_clear(&batch, &surf, format, + blorp_fast_clear(&batch, &surf, format, swizzle, level, base_layer, layer_count, 0, 0, level_width, level_height); break; @@ -1830,7 +1883,7 @@ } cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; blorp_batch_finish(&batch); } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_cmd_buffer.c mesa-20.0.8/src/intel/vulkan/anv_cmd_buffer.c --- mesa-19.2.8/src/intel/vulkan/anv_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_cmd_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -345,6 +345,9 @@ case 11: \ gen11_##func(__VA_ARGS__); \ break; \ + case 12: \ + gen12_##func(__VA_ARGS__); \ + break; \ default: \ assert(!"Unknown hardware generation"); \ } @@ -380,6 +383,34 @@ cmd_buffer); } +static bool +mem_update(void *dst, const void *src, size_t size) +{ + if (memcmp(dst, src, size) == 0) + return false; + + memcpy(dst, src, size); + return true; +} + +static void +set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_pipeline_bind_map *map) +{ + if (mem_update(cmd_buffer->state.surface_sha1s[stage], + map->surface_sha1, sizeof(map->surface_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + if (mem_update(cmd_buffer->state.sampler_sha1s[stage], + map->sampler_sha1, sizeof(map->sampler_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + if (mem_update(cmd_buffer->state.push_sha1s[stage], + map->push_sha1, sizeof(map->push_sha1))) + cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); +} + void anv_CmdBindPipeline( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -389,19 +420,30 @@ ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); switch (pipelineBindPoint) { - case VK_PIPELINE_BIND_POINT_COMPUTE: + case VK_PIPELINE_BIND_POINT_COMPUTE: { + if (cmd_buffer->state.compute.base.pipeline == pipeline) + return; + cmd_buffer->state.compute.base.pipeline = pipeline; cmd_buffer->state.compute.pipeline_dirty = true; - cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; - cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map; + set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, bind_map); break; + } case VK_PIPELINE_BIND_POINT_GRAPHICS: + if (cmd_buffer->state.gfx.base.pipeline == pipeline) + return; + cmd_buffer->state.gfx.base.pipeline = pipeline; cmd_buffer->state.gfx.vb_dirty |= pipeline->vb_used; cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; - cmd_buffer->state.push_constants_dirty |= pipeline->active_stages; - cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; + + anv_foreach_stage(stage, pipeline->active_stages) { + set_dirty_for_bind_map(cmd_buffer, stage, + &pipeline->shaders[stage]->bind_map); + } /* Apply the dynamic state from the pipeline */ cmd_buffer->state.gfx.dirty |= @@ -572,56 +614,65 @@ struct anv_descriptor_set_layout *set_layout = layout->set[set_index].layout; - struct anv_cmd_pipeline_state *pipe_state; + VkShaderStageFlags stages = set_layout->shader_stages & + (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? + VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_ALL_GRAPHICS); + + VkShaderStageFlags dirty_stages = 0; if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - pipe_state = &cmd_buffer->state.compute.base; + if (cmd_buffer->state.compute.base.descriptors[set_index] != set) { + cmd_buffer->state.compute.base.descriptors[set_index] = set; + dirty_stages |= stages; + } } else { assert(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS); - pipe_state = &cmd_buffer->state.gfx.base; + if (cmd_buffer->state.gfx.base.descriptors[set_index] != set) { + cmd_buffer->state.gfx.base.descriptors[set_index] = set; + dirty_stages |= stages; + } } - pipe_state->descriptors[set_index] = set; + + /* If it's a push descriptor set, we have to flag things as dirty + * regardless of whether or not the CPU-side data structure changed as we + * may have edited in-place. + */ + if (set->pool == NULL) + dirty_stages |= stages; if (dynamic_offsets) { if (set_layout->dynamic_offset_count > 0) { uint32_t dynamic_offset_start = layout->set[set_index].dynamic_offset_start; - /* Assert that everything is in range */ - assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); - assert(dynamic_offset_start + set_layout->dynamic_offset_count <= - ARRAY_SIZE(pipe_state->dynamic_offsets)); - - typed_memcpy(&pipe_state->dynamic_offsets[dynamic_offset_start], - *dynamic_offsets, set_layout->dynamic_offset_count); + anv_foreach_stage(stage, stages) { + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t *push_offsets = + &push->dynamic_offsets[dynamic_offset_start]; + + /* Assert that everything is in range */ + assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); + assert(dynamic_offset_start + set_layout->dynamic_offset_count <= + ARRAY_SIZE(push->dynamic_offsets)); + + unsigned mask = set_layout->stage_dynamic_offsets[stage]; + STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= sizeof(mask) * 8); + while (mask) { + int i = u_bit_scan(&mask); + if (push_offsets[i] != (*dynamic_offsets)[i]) { + push_offsets[i] = (*dynamic_offsets)[i]; + dirty_stages |= mesa_to_vk_shader_stage(stage); + } + } + } *dynamic_offsets += set_layout->dynamic_offset_count; *dynamic_offset_count -= set_layout->dynamic_offset_count; - - if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - cmd_buffer->state.push_constants_dirty |= - VK_SHADER_STAGE_COMPUTE_BIT; - } else { - cmd_buffer->state.push_constants_dirty |= - VK_SHADER_STAGE_ALL_GRAPHICS; - } } } - if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; - } else { - assert(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS); - cmd_buffer->state.descriptors_dirty |= - set_layout->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; - } - - /* Pipeline layout objects are required to live at least while any command - * buffers that use them are in recording state. We need to grab a reference - * to the pipeline layout being bound here so we can compute correct dynamic - * offsets for VK_DESCRIPTOR_TYPE_*_DYNAMIC in dynamic_offset_for_binding() - * when we record draw commands that come after this. - */ - pipe_state->layout = layout; + cmd_buffer->state.descriptors_dirty |= dirty_stages; + cmd_buffer->state.push_constants_dirty |= dirty_stages; } void anv_CmdBindDescriptorSets( @@ -748,71 +799,18 @@ return state; } -static uint32_t -anv_push_constant_value(const struct anv_cmd_pipeline_state *state, - const struct anv_push_constants *data, uint32_t param) -{ - if (BRW_PARAM_IS_BUILTIN(param)) { - switch (param) { - case BRW_PARAM_BUILTIN_ZERO: - return 0; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X: - return data->base_work_group_id[0]; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y: - return data->base_work_group_id[1]; - case BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z: - return data->base_work_group_id[2]; - default: - unreachable("Invalid param builtin"); - } - } else if (ANV_PARAM_IS_PUSH(param)) { - uint32_t offset = ANV_PARAM_PUSH_OFFSET(param); - assert(offset % sizeof(uint32_t) == 0); - if (offset < sizeof(data->client_data)) - return *(uint32_t *)((uint8_t *)data + offset); - else - return 0; - } else if (ANV_PARAM_IS_DYN_OFFSET(param)) { - unsigned idx = ANV_PARAM_DYN_OFFSET_IDX(param); - assert(idx < MAX_DYNAMIC_BUFFERS); - return state->dynamic_offsets[idx]; - } - - assert(!"Invalid param"); - return 0; -} - struct anv_state anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer, gl_shader_stage stage) { - struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.gfx.base; - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; - - /* If we don't have this stage, bail. */ - if (!anv_pipeline_has_stage(pipeline, stage)) - return (struct anv_state) { .offset = 0 }; - struct anv_push_constants *data = &cmd_buffer->state.push_constants[stage]; - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; - - /* If we don't actually have any push constants, bail. */ - if (prog_data == NULL || prog_data->nr_params == 0) - return (struct anv_state) { .offset = 0 }; struct anv_state state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, - prog_data->nr_params * sizeof(float), + sizeof(struct anv_push_constants), 32 /* bottom 5 bits MBZ */); - - /* Walk through the param array and fill the buffer with data */ - uint32_t *u32_map = state.map; - for (unsigned i = 0; i < prog_data->nr_params; i++) { - u32_map[i] = anv_push_constant_value(pipeline_state, data, - prog_data->param[i]); - } + memcpy(state.map, data, sizeof(struct anv_push_constants)); return state; } @@ -820,14 +818,13 @@ struct anv_state anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) { - struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.compute.base; struct anv_push_constants *data = &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + const struct anv_push_range *range = + &pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map.push_ranges[0]; - /* If we don't actually have any push constants, bail. */ if (cs_prog_data->push.total.size == 0) return (struct anv_state) { .offset = 0 }; @@ -840,33 +837,25 @@ aligned_total_push_constants_size, push_constant_alignment); - /* Walk through the param array and fill the buffer with data */ - uint32_t *u32_map = state.map; + void *dst = state.map; + const void *src = (char *)data + (range->start * 32); if (cs_prog_data->push.cross_thread.size > 0) { - for (unsigned i = 0; - i < cs_prog_data->push.cross_thread.dwords; - i++) { - assert(prog_data->param[i] != BRW_PARAM_BUILTIN_SUBGROUP_ID); - u32_map[i] = anv_push_constant_value(pipeline_state, data, - prog_data->param[i]); - } + memcpy(dst, src, cs_prog_data->push.cross_thread.size); + dst += cs_prog_data->push.cross_thread.size; + src += cs_prog_data->push.cross_thread.size; } if (cs_prog_data->push.per_thread.size > 0) { for (unsigned t = 0; t < cs_prog_data->threads; t++) { - unsigned dst = - 8 * (cs_prog_data->push.per_thread.regs * t + - cs_prog_data->push.cross_thread.regs); - unsigned src = cs_prog_data->push.cross_thread.dwords; - for ( ; src < prog_data->nr_params; src++, dst++) { - if (prog_data->param[src] == BRW_PARAM_BUILTIN_SUBGROUP_ID) { - u32_map[dst] = t; - } else { - u32_map[dst] = anv_push_constant_value(pipeline_state, data, - prog_data->param[src]); - } - } + memcpy(dst, src, cs_prog_data->push.per_thread.size); + + uint32_t *subgroup_id = dst + + offsetof(struct anv_push_constants, cs.subgroup_id) - + (range->start * 32 + cs_prog_data->push.cross_thread.size); + *subgroup_id = t; + + dst += cs_prog_data->push.per_thread.size; } } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_descriptor_set.c mesa-20.0.8/src/intel/vulkan/anv_descriptor_set.c --- mesa-19.2.8/src/intel/vulkan/anv_descriptor_set.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_descriptor_set.c 2020-06-12 01:21:17.000000000 +0000 @@ -245,8 +245,7 @@ VkDescriptorSetLayoutSupport* pSupport) { ANV_FROM_HANDLE(anv_device, device, _device); - const struct anv_physical_device *pdevice = - &device->instance->physicalDevice; + const struct anv_physical_device *pdevice = device->physical; uint32_t surface_count[MESA_SHADER_STAGES] = { 0, }; bool needs_descriptor_buffer = false; @@ -427,7 +426,7 @@ } set_layout->binding[b].data = - anv_descriptor_data_for_type(&device->instance->physicalDevice, + anv_descriptor_data_for_type(device->physical, binding->descriptorType); set_layout->binding[b].array_size = binding->descriptorCount; set_layout->binding[b].descriptor_index = set_layout->size; @@ -469,7 +468,15 @@ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: set_layout->binding[b].dynamic_offset_index = dynamic_offset_count; + anv_foreach_stage(s, binding->stageFlags) { + STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= + sizeof(set_layout->stage_dynamic_offsets[s]) * 8); + set_layout->stage_dynamic_offsets[s] |= + BITFIELD_RANGE(set_layout->binding[b].dynamic_offset_index, + binding->descriptorCount); + } dynamic_offset_count += binding->descriptorCount; + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); break; default: @@ -603,6 +610,7 @@ dynamic_offset_count += set_layout->binding[b].array_size; } } + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); struct mesa_sha1 ctx; _mesa_sha1_init(&ctx); @@ -674,7 +682,7 @@ uint32_t descriptor_bo_size = 0; for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) { enum anv_descriptor_data desc_data = - anv_descriptor_data_for_type(&device->instance->physicalDevice, + anv_descriptor_data_for_type(device->physical, pCreateInfo->pPoolSizes[i].type); if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW) @@ -733,30 +741,20 @@ pool->free_list = EMPTY; if (descriptor_bo_size > 0) { - VkResult result = anv_bo_init_new(&pool->bo, device, descriptor_bo_size); + VkResult result = anv_device_alloc_bo(device, + descriptor_bo_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); if (result != VK_SUCCESS) { vk_free2(&device->alloc, pAllocator, pool); return result; } - anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED); - - pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, - descriptor_bo_size, 0); - if (pool->bo.map == NULL) { - anv_gem_close(device, pool->bo.gem_handle); - vk_free2(&device->alloc, pAllocator, pool); - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } - - if (device->instance->physicalDevice.use_softpin) { - pool->bo.flags |= EXEC_OBJECT_PINNED; - anv_vma_alloc(device, &pool->bo); - } - util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size); } else { - pool->bo.size = 0; + pool->bo = NULL; } anv_state_stream_init(&pool->surface_state_stream, @@ -786,12 +784,8 @@ anv_descriptor_set_layout_unref(device, set->layout); } - if (pool->bo.size) { - anv_gem_munmap(pool->bo.map, pool->bo.size); - anv_vma_free(device, &pool->bo); - anv_gem_close(device, pool->bo.gem_handle); - util_vma_heap_finish(&pool->bo_heap); - } + if (pool->bo) + anv_device_release_bo(device, pool->bo); anv_state_stream_finish(&pool->surface_state_stream); vk_free2(&device->alloc, pAllocator, pool); @@ -814,9 +808,9 @@ pool->next = 0; pool->free_list = EMPTY; - if (pool->bo.size) { + if (pool->bo) { util_vma_heap_finish(&pool->bo_heap); - util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo.size); + util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo->size); } anv_state_stream_finish(&pool->surface_state_stream); @@ -947,13 +941,13 @@ pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX); set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET; set->desc_mem.alloc_size = set_buffer_size; - set->desc_mem.map = pool->bo.map + set->desc_mem.offset; + set->desc_mem.map = pool->bo->map + set->desc_mem.offset; set->desc_surface_state = anv_descriptor_pool_alloc_state(pool); anv_fill_buffer_surface_state(device, set->desc_surface_state, ISL_FORMAT_R32G32B32A32_FLOAT, (struct anv_address) { - .bo = &pool->bo, + .bo = pool->bo, .offset = set->desc_mem.offset, }, layout->descriptor_buffer_size, 1); @@ -990,7 +984,7 @@ * will always write in the immutable sampler regardless of what * is in the sampler parameter. */ - struct VkDescriptorImageInfo info = { }; + VkDescriptorImageInfo info = { }; anv_descriptor_set_write_image_view(device, set, &info, VK_DESCRIPTOR_TYPE_SAMPLER, b, i); @@ -1347,11 +1341,11 @@ element * anv_descriptor_size(bind_layout); if (bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE) { - struct anv_address_range_descriptor desc = { + struct anv_address_range_descriptor desc_data = { .address = anv_address_physical(bind_addr), .range = bind_range, }; - memcpy(desc_map, &desc, sizeof(desc)); + memcpy(desc_map, &desc_data, sizeof(desc_data)); } } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_device.c mesa-20.0.8/src/intel/vulkan/anv_device.c --- mesa-19.2.8/src/intel/vulkan/anv_device.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_device.c 2020-06-12 01:21:17.000000000 +0000 @@ -42,6 +42,7 @@ #include "util/xmlpool.h" #include "git_sha1.h" #include "vk_util.h" +#include "common/gen_aux_map.h" #include "common/gen_defines.h" #include "compiler/glsl_types.h" @@ -53,6 +54,11 @@ DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) DRI_CONF_VK_X11_STRICT_IMAGE_COUNT("false") DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG + DRI_CONF_ALWAYS_FLUSH_CACHE("false") + DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST("false") + DRI_CONF_SECTION_END DRI_CONF_END; /* This is probably far to big but it reflects the max size used for messages @@ -65,8 +71,9 @@ { char str[MAX_DEBUG_MESSAGE_LENGTH]; struct anv_device *device = (struct anv_device *)data; + struct anv_instance *instance = device->physical->instance; - if (list_empty(&device->instance->debug_report_callbacks.callbacks)) + if (list_is_empty(&instance->debug_report_callbacks.callbacks)) return; va_list args; @@ -74,7 +81,7 @@ (void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args); va_end(args); - vk_debug_report(&device->instance->debug_report_callbacks, + vk_debug_report(&instance->debug_report_callbacks, VK_DEBUG_REPORT_DEBUG_BIT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, 0, 0, 0, "anv", str); @@ -121,25 +128,29 @@ static VkResult anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) { - uint64_t gtt_size; if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, - >t_size) == -1) { + &device->gtt_size) == -1) { /* If, for whatever reason, we can't actually get the GTT size from the * kernel (too old?) fall back to the aperture size. */ anv_perf_warn(NULL, NULL, "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m"); - if (anv_gem_get_aperture(fd, >t_size) == -1) { - return vk_errorf(NULL, NULL, VK_ERROR_INITIALIZATION_FAILED, - "failed to get aperture size: %m"); + if (anv_gem_get_aperture(fd, &device->gtt_size) == -1) { + return vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "failed to get aperture size: %m"); } } + /* We only allow 48-bit addresses with softpin because knowing the actual + * address is required for the vertex cache flush workaround. + */ device->supports_48bit_addresses = (device->info.gen >= 8) && - gtt_size > (4ULL << 30 /* GiB */); + device->has_softpin && + device->gtt_size > (4ULL << 30 /* GiB */); - uint64_t heap_size = anv_compute_heap_size(fd, gtt_size); + uint64_t heap_size = anv_compute_heap_size(fd, device->gtt_size); if (heap_size > (2ull << 30) && !device->supports_48bit_addresses) { /* When running with an overridden PCI ID, we may get a GTT size from @@ -153,69 +164,14 @@ heap_size = 2ull << 30; } - if (heap_size <= 3ull * (1ull << 30)) { - /* In this case, everything fits nicely into the 32-bit address space, - * so there's no need for supporting 48bit addresses on client-allocated - * memory objects. - */ - device->memory.heap_count = 1; - device->memory.heaps[0] = (struct anv_memory_heap) { - .vma_start = LOW_HEAP_MIN_ADDRESS, - .vma_size = LOW_HEAP_SIZE, - .size = heap_size, - .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, - .supports_48bit_addresses = false, - }; - } else { - /* Not everything will fit nicely into a 32-bit address space. In this - * case we need a 64-bit heap. Advertise a small 32-bit heap and a - * larger 48-bit heap. If we're in this case, then we have a total heap - * size larger than 3GiB which most likely means they have 8 GiB of - * video memory and so carving off 1 GiB for the 32-bit heap should be - * reasonable. - */ - const uint64_t heap_size_32bit = 1ull << 30; - const uint64_t heap_size_48bit = heap_size - heap_size_32bit; - - assert(device->supports_48bit_addresses); - - device->memory.heap_count = 2; - device->memory.heaps[0] = (struct anv_memory_heap) { - .vma_start = HIGH_HEAP_MIN_ADDRESS, - /* Leave the last 4GiB out of the high vma range, so that no state - * base address + size can overflow 48 bits. For more information see - * the comment about Wa32bitGeneralStateOffset in anv_allocator.c - */ - .vma_size = gtt_size - (1ull << 32) - HIGH_HEAP_MIN_ADDRESS, - .size = heap_size_48bit, - .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, - .supports_48bit_addresses = true, - }; - device->memory.heaps[1] = (struct anv_memory_heap) { - .vma_start = LOW_HEAP_MIN_ADDRESS, - .vma_size = LOW_HEAP_SIZE, - .size = heap_size_32bit, - .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, - .supports_48bit_addresses = false, - }; - } + device->memory.heap_count = 1; + device->memory.heaps[0] = (struct anv_memory_heap) { + .size = heap_size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + }; uint32_t type_count = 0; for (uint32_t heap = 0; heap < device->memory.heap_count; heap++) { - uint32_t valid_buffer_usage = ~0; - - /* There appears to be a hardware issue in the VF cache where it only - * considers the bottom 32 bits of memory addresses. If you happen to - * have two vertex buffers which get placed exactly 4 GiB apart and use - * them in back-to-back draw calls, you can get collisions. In order to - * solve this problem, we require vertex and index buffers be bound to - * memory allocated out of the 32-bit heap. - */ - if (device->memory.heaps[heap].supports_48bit_addresses) { - valid_buffer_usage &= ~(VK_BUFFER_USAGE_INDEX_BUFFER_BIT | - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT); - } - if (device->info.has_llc) { /* Big core GPUs share LLC with the CPU and thus one memory type can be * both cached and coherent at the same time. @@ -226,7 +182,6 @@ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, .heapIndex = heap, - .valid_buffer_usage = valid_buffer_usage, }; } else { /* The spec requires that we expose a host-visible, coherent memory @@ -239,14 +194,12 @@ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, .heapIndex = heap, - .valid_buffer_usage = valid_buffer_usage, }; device->memory.types[type_count++] = (struct anv_memory_type) { .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, .heapIndex = heap, - .valid_buffer_usage = valid_buffer_usage, }; } } @@ -261,16 +214,16 @@ const struct build_id_note *note = build_id_find_nhdr_for_addr(anv_physical_device_init_uuids); if (!note) { - return vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "Failed to find build-id"); + return vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); } unsigned build_id_len = build_id_length(note); if (build_id_len < 20) { - return vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "build-id too short. It needs to be a SHA"); + return vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); } memcpy(device->driver_build_sha1, build_id_data(note), 20); @@ -284,8 +237,8 @@ */ _mesa_sha1_init(&sha1_ctx); _mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len); - _mesa_sha1_update(&sha1_ctx, &device->chipset_id, - sizeof(device->chipset_id)); + _mesa_sha1_update(&sha1_ctx, &device->info.chipset_id, + sizeof(device->info.chipset_id)); _mesa_sha1_update(&sha1_ctx, &device->always_use_bindless, sizeof(device->always_use_bindless)); _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access, @@ -311,8 +264,8 @@ * some bits of ISL info to ensure that this is safe. */ _mesa_sha1_init(&sha1_ctx); - _mesa_sha1_update(&sha1_ctx, &device->chipset_id, - sizeof(device->chipset_id)); + _mesa_sha1_update(&sha1_ctx, &device->info.chipset_id, + sizeof(device->info.chipset_id)); _mesa_sha1_update(&sha1_ctx, &device->isl_dev.has_bit6_swizzling, sizeof(device->isl_dev.has_bit6_swizzling)); _mesa_sha1_final(&sha1_ctx, sha1); @@ -327,7 +280,7 @@ #ifdef ENABLE_SHADER_CACHE char renderer[10]; ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x", - device->chipset_id); + device->info.chipset_id); assert(len == sizeof(renderer) - 2); char timestamp[41]; @@ -376,9 +329,9 @@ } static VkResult -anv_physical_device_init(struct anv_physical_device *device, - struct anv_instance *instance, - drmDevicePtr drm_device) +anv_physical_device_try_create(struct anv_instance *instance, + drmDevicePtr drm_device, + struct anv_physical_device **device_out) { const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY]; const char *path = drm_device->nodes[DRM_NODE_RENDER]; @@ -392,19 +345,48 @@ if (fd < 0) return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER); + struct gen_device_info devinfo; + if (!gen_get_device_info_from_fd(fd, &devinfo)) { + result = vk_error(VK_ERROR_INCOMPATIBLE_DRIVER); + goto fail_fd; + } + + const char *device_name = gen_get_device_name(devinfo.chipset_id); + + if (devinfo.is_haswell) { + intel_logw("Haswell Vulkan support is incomplete"); + } else if (devinfo.gen == 7 && !devinfo.is_baytrail) { + intel_logw("Ivy Bridge Vulkan support is incomplete"); + } else if (devinfo.gen == 7 && devinfo.is_baytrail) { + intel_logw("Bay Trail Vulkan support is incomplete"); + } else if (devinfo.gen >= 8 && devinfo.gen <= 11) { + /* Gen8-11 fully supported */ + } else if (devinfo.gen == 12) { + intel_logw("Vulkan is not yet fully supported on gen12"); + } else { + result = vk_errorfi(instance, NULL, VK_ERROR_INCOMPATIBLE_DRIVER, + "Vulkan not yet supported on %s", device_name); + goto fail_fd; + } + + struct anv_physical_device *device = + vk_alloc(&instance->alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (device == NULL) { + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; device->instance = instance; assert(strlen(path) < ARRAY_SIZE(device->path)); snprintf(device->path, ARRAY_SIZE(device->path), "%s", path); - if (!gen_get_device_info_from_fd(fd, &device->info)) { - result = vk_error(VK_ERROR_INCOMPATIBLE_DRIVER); - goto fail; - } - device->chipset_id = device->info.chipset_id; - device->no_hw = device->info.no_hw; + device->info = devinfo; + device->name = device_name; + device->no_hw = device->info.no_hw; if (getenv("INTEL_NO_HW") != NULL) device->no_hw = true; @@ -413,61 +395,41 @@ device->pci_info.device = drm_device->businfo.pci->dev; device->pci_info.function = drm_device->businfo.pci->func; - device->name = gen_get_device_name(device->chipset_id); - - if (device->info.is_haswell) { - intel_logw("Haswell Vulkan support is incomplete"); - } else if (device->info.gen == 7 && !device->info.is_baytrail) { - intel_logw("Ivy Bridge Vulkan support is incomplete"); - } else if (device->info.gen == 7 && device->info.is_baytrail) { - intel_logw("Bay Trail Vulkan support is incomplete"); - } else if (device->info.gen >= 8 && device->info.gen <= 11) { - /* Gen8-11 fully supported */ - } else { - result = vk_errorf(device->instance, device, - VK_ERROR_INCOMPATIBLE_DRIVER, - "Vulkan not yet supported on %s", device->name); - goto fail; - } - device->cmd_parser_version = -1; if (device->info.gen == 7) { device->cmd_parser_version = anv_gem_get_param(fd, I915_PARAM_CMD_PARSER_VERSION); if (device->cmd_parser_version == -1) { - result = vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "failed to get command parser version"); - goto fail; + result = vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "failed to get command parser version"); + goto fail_alloc; } } if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) { - result = vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "kernel missing gem wait"); - goto fail; + result = vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "kernel missing gem wait"); + goto fail_alloc; } if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) { - result = vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "kernel missing execbuf2"); - goto fail; + result = vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "kernel missing execbuf2"); + goto fail_alloc; } if (!device->info.has_llc && anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) { - result = vk_errorf(device->instance, device, - VK_ERROR_INITIALIZATION_FAILED, - "kernel missing wc mmap"); - goto fail; + result = vk_errorfi(device->instance, NULL, + VK_ERROR_INITIALIZATION_FAILED, + "kernel missing wc mmap"); + goto fail_alloc; } - result = anv_physical_device_init_heaps(device, fd); - if (result != VK_SUCCESS) - goto fail; - + device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN); device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC); device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE); device->has_exec_fence = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE); @@ -476,8 +438,12 @@ anv_gem_supports_syncobj_wait(fd); device->has_context_priority = anv_gem_has_context_priority(fd); - device->use_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN) - && device->supports_48bit_addresses; + result = anv_physical_device_init_heaps(device, fd); + if (result != VK_SUCCESS) + goto fail_alloc; + + device->use_softpin = device->has_softpin && + device->supports_48bit_addresses; device->has_context_isolation = anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION); @@ -504,8 +470,13 @@ */ device->has_bindless_samplers = device->info.gen >= 8; + device->has_implicit_ccs = device->info.has_aux_map; + device->has_mem_available = get_available_system_memory() != 0; + device->always_flush_cache = + driQueryOptionb(&instance->dri_options, "always_flush_cache"); + /* Starting with Gen10, the timestamp frequency of the command streamer may * vary from one part to another. We can query the value from the kernel. */ @@ -549,7 +520,7 @@ device->compiler = brw_compiler_create(NULL, &device->info); if (device->compiler == NULL) { result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail; + goto fail_alloc; } device->compiler->shader_debug_log = compiler_debug_log; device->compiler->shader_perf_log = compiler_perf_log; @@ -557,6 +528,7 @@ device->compiler->constant_buffer_0_is_relative = device->info.gen < 8 || !device->has_context_isolation; device->compiler->supports_shader_constants = true; + device->compiler->compact_params = false; /* Broadwell PRM says: * @@ -576,7 +548,7 @@ result = anv_physical_device_init_uuids(device); if (result != VK_SUCCESS) - goto fail; + goto fail_compiler; anv_physical_device_init_disk_cache(device); @@ -595,11 +567,10 @@ device->master_fd = master_fd; result = anv_init_wsi(device); - if (result != VK_SUCCESS) { - ralloc_free(device->compiler); - anv_physical_device_free_disk_cache(device); - goto fail; - } + if (result != VK_SUCCESS) + goto fail_disk_cache; + + device->perf = anv_get_perf(&device->info, fd); anv_physical_device_get_supported_extensions(device, &device->supported_extensions); @@ -607,9 +578,17 @@ device->local_fd = fd; + *device_out = device; + return VK_SUCCESS; -fail: +fail_disk_cache: + anv_physical_device_free_disk_cache(device); +fail_compiler: + ralloc_free(device->compiler); +fail_alloc: + vk_free(&instance->alloc, device); +fail_fd: close(fd); if (master_fd != -1) close(master_fd); @@ -617,14 +596,16 @@ } static void -anv_physical_device_finish(struct anv_physical_device *device) +anv_physical_device_destroy(struct anv_physical_device *device) { anv_finish_wsi(device); anv_physical_device_free_disk_cache(device); ralloc_free(device->compiler); + ralloc_free(device->perf); close(device->local_fd); if (device->master_fd >= 0) close(device->master_fd); + vk_free(&device->instance->alloc, device); } static void * @@ -747,6 +728,19 @@ } } + for (unsigned i = 0; i < ARRAY_SIZE(instance->physical_device_dispatch.entrypoints); i++) { + /* Vulkan requires that entrypoints for extensions which have not been + * enabled must not be advertised. + */ + if (!anv_physical_device_entrypoint_is_enabled(i, instance->app_info.api_version, + &instance->enabled_extensions)) { + instance->physical_device_dispatch.entrypoints[i] = NULL; + } else { + instance->physical_device_dispatch.entrypoints[i] = + anv_physical_device_dispatch_table.entrypoints[i]; + } + } + for (unsigned i = 0; i < ARRAY_SIZE(instance->device_dispatch.entrypoints); i++) { /* Vulkan requires that entrypoints for extensions which have not been * enabled must not be advertised. @@ -760,7 +754,8 @@ } } - instance->physicalDeviceCount = -1; + instance->physical_devices_enumerated = false; + list_inithead(&instance->physical_devices); result = vk_debug_report_instance_init(&instance->debug_report_callbacks); if (result != VK_SUCCESS) { @@ -795,11 +790,9 @@ if (!instance) return; - if (instance->physicalDeviceCount > 0) { - /* We support at most one physical device. */ - assert(instance->physicalDeviceCount == 1); - anv_physical_device_finish(&instance->physicalDevice); - } + list_for_each_entry_safe(struct anv_physical_device, pdevice, + &instance->physical_devices, link) + anv_physical_device_destroy(pdevice); vk_free(&instance->alloc, (char *)instance->app_info.app_name); vk_free(&instance->alloc, (char *)instance->app_info.engine_name); @@ -817,51 +810,49 @@ } static VkResult -anv_enumerate_devices(struct anv_instance *instance) +anv_enumerate_physical_devices(struct anv_instance *instance) { + if (instance->physical_devices_enumerated) + return VK_SUCCESS; + + instance->physical_devices_enumerated = true; + /* TODO: Check for more devices ? */ drmDevicePtr devices[8]; - VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER; int max_devices; - instance->physicalDeviceCount = 0; - max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices)); if (max_devices < 1) - return VK_ERROR_INCOMPATIBLE_DRIVER; + return VK_SUCCESS; + VkResult result = VK_SUCCESS; for (unsigned i = 0; i < (unsigned)max_devices; i++) { if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER && devices[i]->bustype == DRM_BUS_PCI && devices[i]->deviceinfo.pci->vendor_id == 0x8086) { - result = anv_physical_device_init(&instance->physicalDevice, - instance, devices[i]); - if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + struct anv_physical_device *pdevice; + result = anv_physical_device_try_create(instance, devices[i], + &pdevice); + /* Incompatible DRM device, skip. */ + if (result == VK_ERROR_INCOMPATIBLE_DRIVER) { + result = VK_SUCCESS; + continue; + } + + /* Error creating the physical device, report the error. */ + if (result != VK_SUCCESS) break; + + list_addtail(&pdevice->link, &instance->physical_devices); } } drmFreeDevices(devices, max_devices); - if (result == VK_SUCCESS) - instance->physicalDeviceCount = 1; - + /* If we successfully enumerated any devices, call it success */ return result; } -static VkResult -anv_instance_ensure_physical_device(struct anv_instance *instance) -{ - if (instance->physicalDeviceCount < 0) { - VkResult result = anv_enumerate_devices(instance); - if (result != VK_SUCCESS && - result != VK_ERROR_INCOMPATIBLE_DRIVER) - return result; - } - - return VK_SUCCESS; -} - VkResult anv_EnumeratePhysicalDevices( VkInstance _instance, uint32_t* pPhysicalDeviceCount, @@ -870,16 +861,15 @@ ANV_FROM_HANDLE(anv_instance, instance, _instance); VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount); - VkResult result = anv_instance_ensure_physical_device(instance); + VkResult result = anv_enumerate_physical_devices(instance); if (result != VK_SUCCESS) return result; - if (instance->physicalDeviceCount == 0) - return VK_SUCCESS; - - assert(instance->physicalDeviceCount == 1); - vk_outarray_append(&out, i) { - *i = anv_physical_device_to_handle(&instance->physicalDevice); + list_for_each_entry(struct anv_physical_device, pdevice, + &instance->physical_devices, link) { + vk_outarray_append(&out, i) { + *i = anv_physical_device_to_handle(pdevice); + } } return vk_outarray_status(&out); @@ -894,24 +884,21 @@ VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties, pPhysicalDeviceGroupCount); - VkResult result = anv_instance_ensure_physical_device(instance); + VkResult result = anv_enumerate_physical_devices(instance); if (result != VK_SUCCESS) return result; - if (instance->physicalDeviceCount == 0) - return VK_SUCCESS; - - assert(instance->physicalDeviceCount == 1); - - vk_outarray_append(&out, p) { - p->physicalDeviceCount = 1; - memset(p->physicalDevices, 0, sizeof(p->physicalDevices)); - p->physicalDevices[0] = - anv_physical_device_to_handle(&instance->physicalDevice); - p->subsetAllocation = false; + list_for_each_entry(struct anv_physical_device, pdevice, + &instance->physical_devices, link) { + vk_outarray_append(&out, p) { + p->physicalDeviceCount = 1; + memset(p->physicalDevices, 0, sizeof(p->physicalDevices)); + p->physicalDevices[0] = anv_physical_device_to_handle(pdevice); + p->subsetAllocation = false; - vk_foreach_struct(ext, p->pNext) - anv_debug_ignored_stype(ext->sType); + vk_foreach_struct(ext, p->pNext) + anv_debug_ignored_stype(ext->sType); + } } return vk_outarray_status(&out); @@ -938,7 +925,7 @@ .depthClamp = true, .depthBiasClamp = true, .fillModeNonSolid = true, - .depthBounds = false, + .depthBounds = pdevice->info.gen >= 12, .wideLines = true, .largePoints = true, .alphaToOne = true, @@ -964,9 +951,9 @@ .shaderClipDistance = true, .shaderCullDistance = true, .shaderFloat64 = pdevice->info.gen >= 8 && - pdevice->info.has_64bit_types, + pdevice->info.has_64bit_float, .shaderInt64 = pdevice->info.gen >= 8 && - pdevice->info.has_64bit_types, + pdevice->info.has_64bit_int, .shaderInt16 = pdevice->info.gen >= 8, .shaderResourceMinLod = pdevice->info.gen >= 9, .variableMultisampleRate = true, @@ -988,6 +975,86 @@ pFeatures->depthBounds = true; } +static void +anv_get_physical_device_features_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES); + + f->storageBuffer16BitAccess = pdevice->info.gen >= 8; + f->uniformAndStorageBuffer16BitAccess = pdevice->info.gen >= 8; + f->storagePushConstant16 = pdevice->info.gen >= 8; + f->storageInputOutput16 = false; + f->multiview = true; + f->multiviewGeometryShader = true; + f->multiviewTessellationShader = true; + f->variablePointersStorageBuffer = true; + f->variablePointers = true; + f->protectedMemory = false; + f->samplerYcbcrConversion = true; + f->shaderDrawParameters = true; +} + +static void +anv_get_physical_device_features_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES); + + f->samplerMirrorClampToEdge = true; + f->drawIndirectCount = true; + f->storageBuffer8BitAccess = pdevice->info.gen >= 8; + f->uniformAndStorageBuffer8BitAccess = pdevice->info.gen >= 8; + f->storagePushConstant8 = pdevice->info.gen >= 8; + f->shaderBufferInt64Atomics = pdevice->info.gen >= 9 && + pdevice->use_softpin; + f->shaderSharedInt64Atomics = false; + f->shaderFloat16 = pdevice->info.gen >= 8; + f->shaderInt8 = pdevice->info.gen >= 8; + + bool descIndexing = pdevice->has_a64_buffer_access && + pdevice->has_bindless_images; + f->descriptorIndexing = descIndexing; + f->shaderInputAttachmentArrayDynamicIndexing = false; + f->shaderUniformTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderStorageTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderUniformBufferArrayNonUniformIndexing = false; + f->shaderSampledImageArrayNonUniformIndexing = descIndexing; + f->shaderStorageBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageImageArrayNonUniformIndexing = descIndexing; + f->shaderInputAttachmentArrayNonUniformIndexing = false; + f->shaderUniformTexelBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageTexelBufferArrayNonUniformIndexing = descIndexing; + f->descriptorBindingUniformBufferUpdateAfterBind = false; + f->descriptorBindingSampledImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUniformTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingStorageTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUpdateUnusedWhilePending = descIndexing; + f->descriptorBindingPartiallyBound = descIndexing; + f->descriptorBindingVariableDescriptorCount = false; + f->runtimeDescriptorArray = descIndexing; + + f->samplerFilterMinmax = pdevice->info.gen >= 9; + f->scalarBlockLayout = true; + f->imagelessFramebuffer = true; + f->uniformBufferStandardLayout = true; + f->shaderSubgroupExtendedTypes = true; + f->separateDepthStencilLayouts = true; + f->hostQueryReset = true; + f->timelineSemaphore = true; + f->bufferDeviceAddress = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressCaptureReplay = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressMultiDevice = false; + f->vulkanMemoryModel = true; + f->vulkanMemoryModelDeviceScope = true; + f->vulkanMemoryModelAvailabilityVisibilityChains = true; + f->shaderOutputViewportIndex = true; + f->shaderOutputLayer = true; + f->subgroupBroadcastDynamicId = true; +} + void anv_GetPhysicalDeviceFeatures2( VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2* pFeatures) @@ -995,24 +1062,38 @@ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); anv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); + VkPhysicalDeviceVulkan11Features core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + }; + anv_get_physical_device_features_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Features core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + }; + anv_get_physical_device_features_1_2(pdevice, &core_1_2); + +#define CORE_FEATURE(major, minor, feature) \ + features->feature = core_##major##_##minor.feature + + vk_foreach_struct(ext, pFeatures->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: { VkPhysicalDevice8BitStorageFeaturesKHR *features = (VkPhysicalDevice8BitStorageFeaturesKHR *)ext; - features->storageBuffer8BitAccess = pdevice->info.gen >= 8; - features->uniformAndStorageBuffer8BitAccess = pdevice->info.gen >= 8; - features->storagePushConstant8 = pdevice->info.gen >= 8; + CORE_FEATURE(1, 2, storageBuffer8BitAccess); + CORE_FEATURE(1, 2, uniformAndStorageBuffer8BitAccess); + CORE_FEATURE(1, 2, storagePushConstant8); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { VkPhysicalDevice16BitStorageFeatures *features = (VkPhysicalDevice16BitStorageFeatures *)ext; - features->storageBuffer16BitAccess = pdevice->info.gen >= 8; - features->uniformAndStorageBuffer16BitAccess = pdevice->info.gen >= 8; - features->storagePushConstant16 = pdevice->info.gen >= 8; - features->storageInputOutput16 = false; + CORE_FEATURE(1, 1, storageBuffer16BitAccess); + CORE_FEATURE(1, 1, uniformAndStorageBuffer16BitAccess); + CORE_FEATURE(1, 1, storagePushConstant16); + CORE_FEATURE(1, 1, storageInputOutput16); break; } @@ -1024,6 +1105,14 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR: { + VkPhysicalDeviceBufferDeviceAddressFeaturesKHR *features = (void *)ext; + CORE_FEATURE(1, 2, bufferDeviceAddress); + CORE_FEATURE(1, 2, bufferDeviceAddressCaptureReplay); + CORE_FEATURE(1, 2, bufferDeviceAddressMultiDevice); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: { VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features = (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext; @@ -1051,8 +1140,8 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: { VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext; - features->shaderFloat16 = pdevice->info.gen >= 8; - features->shaderInt8 = pdevice->info.gen >= 8; + CORE_FEATURE(1, 2, shaderFloat16); + CORE_FEATURE(1, 2, shaderInt8); break; } @@ -1068,33 +1157,33 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT: { VkPhysicalDeviceHostQueryResetFeaturesEXT *features = (VkPhysicalDeviceHostQueryResetFeaturesEXT *)ext; - features->hostQueryReset = true; + CORE_FEATURE(1, 2, hostQueryReset); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: { VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features = (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *)ext; - features->shaderInputAttachmentArrayDynamicIndexing = false; - features->shaderUniformTexelBufferArrayDynamicIndexing = true; - features->shaderStorageTexelBufferArrayDynamicIndexing = true; - features->shaderUniformBufferArrayNonUniformIndexing = false; - features->shaderSampledImageArrayNonUniformIndexing = true; - features->shaderStorageBufferArrayNonUniformIndexing = true; - features->shaderStorageImageArrayNonUniformIndexing = true; - features->shaderInputAttachmentArrayNonUniformIndexing = false; - features->shaderUniformTexelBufferArrayNonUniformIndexing = true; - features->shaderStorageTexelBufferArrayNonUniformIndexing = true; - features->descriptorBindingUniformBufferUpdateAfterBind = false; - features->descriptorBindingSampledImageUpdateAfterBind = true; - features->descriptorBindingStorageImageUpdateAfterBind = true; - features->descriptorBindingStorageBufferUpdateAfterBind = true; - features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; - features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; - features->descriptorBindingUpdateUnusedWhilePending = true; - features->descriptorBindingPartiallyBound = true; - features->descriptorBindingVariableDescriptorCount = false; - features->runtimeDescriptorArray = true; + CORE_FEATURE(1, 2, shaderInputAttachmentArrayDynamicIndexing); + CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayDynamicIndexing); + CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayDynamicIndexing); + CORE_FEATURE(1, 2, shaderUniformBufferArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderSampledImageArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderStorageBufferArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderStorageImageArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderInputAttachmentArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayNonUniformIndexing); + CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayNonUniformIndexing); + CORE_FEATURE(1, 2, descriptorBindingUniformBufferUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingSampledImageUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingStorageImageUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingStorageBufferUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingUniformTexelBufferUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingStorageTexelBufferUpdateAfterBind); + CORE_FEATURE(1, 2, descriptorBindingUpdateUnusedWhilePending); + CORE_FEATURE(1, 2, descriptorBindingPartiallyBound); + CORE_FEATURE(1, 2, descriptorBindingVariableDescriptorCount); + CORE_FEATURE(1, 2, runtimeDescriptorArray); break; } @@ -1118,7 +1207,15 @@ (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext; features->rectangularLines = true; features->bresenhamLines = true; - features->smoothLines = true; + /* Support for Smooth lines with MSAA was removed on gen11. From the + * BSpec section "Multisample ModesState" table for "AA Line Support + * Requirements": + * + * GEN10:BUG:######## NUM_MULTISAMPLES == 1 + * + * Fortunately, this isn't a case most people care about. + */ + features->smoothLines = pdevice->info.gen < 10; features->stippledRectangularLines = false; features->stippledBresenhamLines = true; features->stippledSmoothLines = false; @@ -1128,16 +1225,16 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { VkPhysicalDeviceMultiviewFeatures *features = (VkPhysicalDeviceMultiviewFeatures *)ext; - features->multiview = true; - features->multiviewGeometryShader = true; - features->multiviewTessellationShader = true; + CORE_FEATURE(1, 1, multiview); + CORE_FEATURE(1, 1, multiviewGeometryShader); + CORE_FEATURE(1, 1, multiviewTessellationShader); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR: { VkPhysicalDeviceImagelessFramebufferFeaturesKHR *features = (VkPhysicalDeviceImagelessFramebufferFeaturesKHR *)ext; - features->imagelessFramebuffer = true; + CORE_FEATURE(1, 2, imagelessFramebuffer); break; } @@ -1150,29 +1247,35 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { VkPhysicalDeviceProtectedMemoryFeatures *features = (void *)ext; - features->protectedMemory = false; + CORE_FEATURE(1, 1, protectedMemory); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext; - features->samplerYcbcrConversion = true; + CORE_FEATURE(1, 1, samplerYcbcrConversion); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT: { VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *features = (VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *)ext; - features->scalarBlockLayout = true; + CORE_FEATURE(1, 2, scalarBlockLayout); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR: { + VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *features = + (VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *)ext; + CORE_FEATURE(1, 2, separateDepthStencilLayouts); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR: { VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *features = (void *)ext; - features->shaderBufferInt64Atomics = - pdevice->info.gen >= 9 && pdevice->use_softpin; - features->shaderSharedInt64Atomics = VK_FALSE; + CORE_FEATURE(1, 2, shaderBufferInt64Atomics); + CORE_FEATURE(1, 2, shaderSharedInt64Atomics); break; } @@ -1182,9 +1285,24 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: { + VkPhysicalDeviceShaderClockFeaturesKHR *features = + (VkPhysicalDeviceShaderClockFeaturesKHR *)ext; + features->shaderSubgroupClock = true; + features->shaderDeviceClock = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *)ext; - features->shaderDrawParameters = true; + CORE_FEATURE(1, 1, shaderDrawParameters); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES_KHR: { + VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *features = + (VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *)ext; + CORE_FEATURE(1, 2, shaderSubgroupExtendedTypes); break; } @@ -1203,10 +1321,17 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR: { + VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features = + (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext; + CORE_FEATURE(1, 2, timelineSemaphore); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { VkPhysicalDeviceVariablePointersFeatures *features = (void *)ext; - features->variablePointersStorageBuffer = true; - features->variablePointers = true; + CORE_FEATURE(1, 1, variablePointersStorageBuffer); + CORE_FEATURE(1, 1, variablePointers); break; } @@ -1221,7 +1346,7 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: { VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features = (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext; - features->uniformBufferStandardLayout = true; + CORE_FEATURE(1, 2, uniformBufferStandardLayout); break; } @@ -1233,6 +1358,22 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: + anv_get_physical_device_features_1_1(pdevice, (void *)ext); + break; + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES: + anv_get_physical_device_features_1_2(pdevice, (void *)ext); + break; + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR: { + VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *features = (void *)ext; + CORE_FEATURE(1, 2, vulkanMemoryModel); + CORE_FEATURE(1, 2, vulkanMemoryModelDeviceScope); + CORE_FEATURE(1, 2, vulkanMemoryModelAvailabilityVisibilityChains); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT: { VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *features = (VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *)ext; @@ -1245,6 +1386,8 @@ break; } } + +#undef CORE_FEATURE } #define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64 @@ -1279,7 +1422,8 @@ pdevice->has_bindless_images && pdevice->has_a64_buffer_access ? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1; - const uint32_t max_workgroup_size = 32 * devinfo->max_cs_threads; + /* Limit max_threads to 64 for the GPGPU_WALKER command */ + const uint32_t max_workgroup_size = 32 * MIN2(64, devinfo->max_cs_threads); VkSampleCountFlags sample_counts = isl_device_get_sample_counts(&pdevice->isl_dev); @@ -1380,7 +1524,7 @@ .framebufferNoAttachmentsSampleCounts = sample_counts, .maxColorAttachments = MAX_RTS, .sampledImageColorSampleCounts = sample_counts, - .sampledImageIntegerSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .sampledImageIntegerSampleCounts = sample_counts, .sampledImageDepthSampleCounts = sample_counts, .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, @@ -1410,7 +1554,7 @@ .apiVersion = anv_physical_device_api_version(pdevice), .driverVersion = vk_get_driver_version(), .vendorID = 0x8086, - .deviceID = pdevice->chipset_id, + .deviceID = pdevice->info.chipset_id, .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, .limits = limits, .sparseProperties = {0}, /* Broadwell doesn't do sparse. */ @@ -1422,6 +1566,164 @@ pdevice->pipeline_cache_uuid, VK_UUID_SIZE); } +static void +anv_get_physical_device_properties_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES); + + memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + memset(p->deviceLUID, 0, VK_LUID_SIZE); + p->deviceNodeMask = 0; + p->deviceLUIDValid = false; + + p->subgroupSize = BRW_SUBGROUP_SIZE; + VkShaderStageFlags scalar_stages = 0; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { + if (pdevice->compiler->scalar_stage[stage]) + scalar_stages |= mesa_to_vk_shader_stage(stage); + } + p->subgroupSupportedStages = scalar_stages; + p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + if (pdevice->info.gen >= 8) { + /* TODO: There's no technical reason why these can't be made to + * work on gen7 but they don't at the moment so it's best to leave + * the feature disabled than enabled and broken. + */ + p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT; + } + p->subgroupQuadOperationsInAllStages = pdevice->info.gen >= 8; + + p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; + p->maxMultiviewViewCount = 16; + p->maxMultiviewInstanceIndex = UINT32_MAX / 16; + p->protectedNoFault = false; + /* This value doesn't matter for us today as our per-stage descriptors are + * the real limit. + */ + p->maxPerSetDescriptors = 1024; + p->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; +} + +static void +anv_get_physical_device_properties_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES); + + p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR; + memset(p->driverName, 0, sizeof(p->driverName)); + snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR, + "Intel open-source Mesa driver"); + memset(p->driverInfo, 0, sizeof(p->driverInfo)); + snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + p->conformanceVersion = (VkConformanceVersionKHR) { + .major = 1, + .minor = 2, + .subminor = 0, + .patch = 0, + }; + + p->denormBehaviorIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL_KHR; + p->roundingModeIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR; + + /* Broadwell does not support HF denorms and there are restrictions + * other gens. According to Kabylake's PRM: + * + * "math - Extended Math Function + * [...] + * Restriction : Half-float denorms are always retained." + */ + p->shaderDenormFlushToZeroFloat16 = false; + p->shaderDenormPreserveFloat16 = pdevice->info.gen > 8; + p->shaderRoundingModeRTEFloat16 = true; + p->shaderRoundingModeRTZFloat16 = true; + p->shaderSignedZeroInfNanPreserveFloat16 = true; + + p->shaderDenormFlushToZeroFloat32 = true; + p->shaderDenormPreserveFloat32 = true; + p->shaderRoundingModeRTEFloat32 = true; + p->shaderRoundingModeRTZFloat32 = true; + p->shaderSignedZeroInfNanPreserveFloat32 = true; + + p->shaderDenormFlushToZeroFloat64 = true; + p->shaderDenormPreserveFloat64 = true; + p->shaderRoundingModeRTEFloat64 = true; + p->shaderRoundingModeRTZFloat64 = true; + p->shaderSignedZeroInfNanPreserveFloat64 = true; + + /* It's a bit hard to exactly map our implementation to the limits + * described here. The bindless surface handle in the extended + * message descriptors is 20 bits and it's an index into the table of + * RENDER_SURFACE_STATE structs that starts at bindless surface base + * address. Given that most things consume two surface states per + * view (general/sampled for textures and write-only/read-write for + * images), we claim 2^19 things. + * + * For SSBOs, we just use A64 messages so there is no real limit + * there beyond the limit on the total size of a descriptor set. + */ + const unsigned max_bindless_views = 1 << 19; + p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views; + p->shaderUniformBufferArrayNonUniformIndexingNative = false; + p->shaderSampledImageArrayNonUniformIndexingNative = false; + p->shaderStorageBufferArrayNonUniformIndexingNative = true; + p->shaderStorageImageArrayNonUniformIndexingNative = false; + p->shaderInputAttachmentArrayNonUniformIndexingNative = false; + p->robustBufferAccessUpdateAfterBind = true; + p->quadDivergentImplicitLod = false; + p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS; + p->maxPerStageUpdateAfterBindResources = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS; + + /* We support all of the depth resolve modes */ + p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | + VK_RESOLVE_MODE_AVERAGE_BIT_KHR | + VK_RESOLVE_MODE_MIN_BIT_KHR | + VK_RESOLVE_MODE_MAX_BIT_KHR; + /* Average doesn't make sense for stencil so we don't support that */ + p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR; + if (pdevice->info.gen >= 8) { + /* The advanced stencil resolve modes currently require stencil + * sampling be supported by the hardware. + */ + p->supportedStencilResolveModes |= VK_RESOLVE_MODE_MIN_BIT_KHR | + VK_RESOLVE_MODE_MAX_BIT_KHR; + } + p->independentResolveNone = true; + p->independentResolve = true; + + p->filterMinmaxSingleComponentFormats = pdevice->info.gen >= 9; + p->filterMinmaxImageComponentMapping = pdevice->info.gen >= 9; + + p->maxTimelineSemaphoreValueDifference = UINT64_MAX; + + p->framebufferIntegerColorSampleCounts = + isl_device_get_sample_counts(&pdevice->isl_dev); +} + void anv_GetPhysicalDeviceProperties2( VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2* pProperties) @@ -1430,96 +1732,71 @@ anv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + VkPhysicalDeviceVulkan11Properties core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES, + }; + anv_get_physical_device_properties_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Properties core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES, + }; + anv_get_physical_device_properties_1_2(pdevice, &core_1_2); + +#define CORE_RENAMED_PROPERTY(major, minor, ext_property, core_property) \ + memcpy(&properties->ext_property, &core_##major##_##minor.core_property, \ + sizeof(core_##major##_##minor.core_property)) + +#define CORE_PROPERTY(major, minor, property) \ + CORE_RENAMED_PROPERTY(major, minor, property, property) + vk_foreach_struct(ext, pProperties->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR: { - VkPhysicalDeviceDepthStencilResolvePropertiesKHR *props = + VkPhysicalDeviceDepthStencilResolvePropertiesKHR *properties = (VkPhysicalDeviceDepthStencilResolvePropertiesKHR *)ext; - - /* We support all of the depth resolve modes */ - props->supportedDepthResolveModes = - VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR | - VK_RESOLVE_MODE_AVERAGE_BIT_KHR | - VK_RESOLVE_MODE_MIN_BIT_KHR | - VK_RESOLVE_MODE_MAX_BIT_KHR; - - /* Average doesn't make sense for stencil so we don't support that */ - props->supportedStencilResolveModes = - VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR; - if (pdevice->info.gen >= 8) { - /* The advanced stencil resolve modes currently require stencil - * sampling be supported by the hardware. - */ - props->supportedStencilResolveModes |= - VK_RESOLVE_MODE_MIN_BIT_KHR | - VK_RESOLVE_MODE_MAX_BIT_KHR; - } - - props->independentResolveNone = true; - props->independentResolve = true; + CORE_PROPERTY(1, 2, supportedDepthResolveModes); + CORE_PROPERTY(1, 2, supportedStencilResolveModes); + CORE_PROPERTY(1, 2, independentResolveNone); + CORE_PROPERTY(1, 2, independentResolve); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: { - VkPhysicalDeviceDescriptorIndexingPropertiesEXT *props = + VkPhysicalDeviceDescriptorIndexingPropertiesEXT *properties = (VkPhysicalDeviceDescriptorIndexingPropertiesEXT *)ext; - - /* It's a bit hard to exactly map our implementation to the limits - * described here. The bindless surface handle in the extended - * message descriptors is 20 bits and it's an index into the table of - * RENDER_SURFACE_STATE structs that starts at bindless surface base - * address. Given that most things consume two surface states per - * view (general/sampled for textures and write-only/read-write for - * images), we claim 2^19 things. - * - * For SSBOs, we just use A64 messages so there is no real limit - * there beyond the limit on the total size of a descriptor set. - */ - const unsigned max_bindless_views = 1 << 19; - - props->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views; - props->shaderUniformBufferArrayNonUniformIndexingNative = false; - props->shaderSampledImageArrayNonUniformIndexingNative = false; - props->shaderStorageBufferArrayNonUniformIndexingNative = true; - props->shaderStorageImageArrayNonUniformIndexingNative = false; - props->shaderInputAttachmentArrayNonUniformIndexingNative = false; - props->robustBufferAccessUpdateAfterBind = true; - props->quadDivergentImplicitLod = false; - props->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views; - props->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; - props->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX; - props->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views; - props->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views; - props->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS; - props->maxPerStageUpdateAfterBindResources = UINT32_MAX; - props->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views; - props->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; - props->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; - props->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX; - props->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; - props->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views; - props->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views; - props->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS; + CORE_PROPERTY(1, 2, maxUpdateAfterBindDescriptorsInAllPools); + CORE_PROPERTY(1, 2, shaderUniformBufferArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderSampledImageArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderStorageBufferArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderStorageImageArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, shaderInputAttachmentArrayNonUniformIndexingNative); + CORE_PROPERTY(1, 2, robustBufferAccessUpdateAfterBind); + CORE_PROPERTY(1, 2, quadDivergentImplicitLod); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSamplers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindUniformBuffers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageBuffers); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSampledImages); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageImages); + CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindInputAttachments); + CORE_PROPERTY(1, 2, maxPerStageUpdateAfterBindResources); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSamplers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffersDynamic); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffers); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffersDynamic); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSampledImages); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageImages); + CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindInputAttachments); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR: { - VkPhysicalDeviceDriverPropertiesKHR *driver_props = + VkPhysicalDeviceDriverPropertiesKHR *properties = (VkPhysicalDeviceDriverPropertiesKHR *) ext; - - driver_props->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR; - snprintf(driver_props->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR, - "Intel open-source Mesa driver"); - - snprintf(driver_props->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR, - "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); - - driver_props->conformanceVersion = (VkConformanceVersionKHR) { - .major = 1, - .minor = 1, - .subminor = 2, - .patch = 0, - }; + CORE_PROPERTY(1, 2, driverID); + CORE_PROPERTY(1, 2, driverName); + CORE_PROPERTY(1, 2, driverInfo); + CORE_PROPERTY(1, 2, conformanceVersion); break; } @@ -1532,12 +1809,12 @@ } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { - VkPhysicalDeviceIDProperties *id_props = + VkPhysicalDeviceIDProperties *properties = (VkPhysicalDeviceIDProperties *)ext; - memcpy(id_props->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); - memcpy(id_props->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); - /* The LUID is for Windows. */ - id_props->deviceLUIDValid = false; + CORE_PROPERTY(1, 1, deviceUUID); + CORE_PROPERTY(1, 1, driverUUID); + CORE_PROPERTY(1, 1, deviceLUID); + CORE_PROPERTY(1, 1, deviceLUIDValid); break; } @@ -1576,21 +1853,21 @@ } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { - VkPhysicalDeviceMaintenance3Properties *props = + VkPhysicalDeviceMaintenance3Properties *properties = (VkPhysicalDeviceMaintenance3Properties *)ext; /* This value doesn't matter for us today as our per-stage * descriptors are the real limit. */ - props->maxPerSetDescriptors = 1024; - props->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; + CORE_PROPERTY(1, 1, maxPerSetDescriptors); + CORE_PROPERTY(1, 1, maxMemoryAllocationSize); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { VkPhysicalDeviceMultiviewProperties *properties = (VkPhysicalDeviceMultiviewProperties *)ext; - properties->maxMultiviewViewCount = 16; - properties->maxMultiviewInstanceIndex = UINT32_MAX / 16; + CORE_PROPERTY(1, 1, maxMultiviewViewCount); + CORE_PROPERTY(1, 1, maxMultiviewInstanceIndex); break; } @@ -1607,7 +1884,7 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { VkPhysicalDevicePointClippingProperties *properties = (VkPhysicalDevicePointClippingProperties *) ext; - properties->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; + CORE_PROPERTY(1, 1, pointClippingBehavior); break; } @@ -1622,16 +1899,15 @@ #pragma GCC diagnostic pop case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { - VkPhysicalDeviceProtectedMemoryProperties *props = + VkPhysicalDeviceProtectedMemoryProperties *properties = (VkPhysicalDeviceProtectedMemoryProperties *)ext; - props->protectedNoFault = false; + CORE_PROPERTY(1, 1, protectedNoFault); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { VkPhysicalDevicePushDescriptorPropertiesKHR *properties = (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext; - properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; break; } @@ -1639,39 +1915,20 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: { VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *properties = (VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *)ext; - properties->filterMinmaxImageComponentMapping = pdevice->info.gen >= 9; - properties->filterMinmaxSingleComponentFormats = pdevice->info.gen >= 9; + CORE_PROPERTY(1, 2, filterMinmaxImageComponentMapping); + CORE_PROPERTY(1, 2, filterMinmaxSingleComponentFormats); break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { VkPhysicalDeviceSubgroupProperties *properties = (void *)ext; - - properties->subgroupSize = BRW_SUBGROUP_SIZE; - - VkShaderStageFlags scalar_stages = 0; - for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { - if (pdevice->compiler->scalar_stage[stage]) - scalar_stages |= mesa_to_vk_shader_stage(stage); - } - properties->supportedStages = scalar_stages; - - properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | - VK_SUBGROUP_FEATURE_VOTE_BIT | - VK_SUBGROUP_FEATURE_BALLOT_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | - VK_SUBGROUP_FEATURE_QUAD_BIT; - if (pdevice->info.gen >= 8) { - /* TODO: There's no technical reason why these can't be made to - * work on gen7 but they don't at the moment so it's best to leave - * the feature disabled than enabled and broken. - */ - properties->supportedOperations |= - VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | - VK_SUBGROUP_FEATURE_CLUSTERED_BIT; - } - properties->quadOperationsInAllStages = pdevice->info.gen >= 8; + CORE_PROPERTY(1, 1, subgroupSize); + CORE_RENAMED_PROPERTY(1, 1, supportedStages, + subgroupSupportedStages); + CORE_RENAMED_PROPERTY(1, 1, supportedOperations, + subgroupSupportedOperations); + CORE_RENAMED_PROPERTY(1, 1, quadOperationsInAllStages, + subgroupQuadOperationsInAllStages); break; } @@ -1685,6 +1942,27 @@ props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR : { + VkPhysicalDeviceFloatControlsPropertiesKHR *properties = (void *)ext; + CORE_PROPERTY(1, 2, denormBehaviorIndependence); + CORE_PROPERTY(1, 2, roundingModeIndependence); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat16); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat16); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat16); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat16); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat16); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat32); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat32); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat32); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat32); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat32); + CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat64); + CORE_PROPERTY(1, 2, shaderDenormPreserveFloat64); + CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat64); + CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat64); + CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat64); + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: { VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *props = @@ -1716,6 +1994,13 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR: { + VkPhysicalDeviceTimelineSemaphorePropertiesKHR *properties = + (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext; + CORE_PROPERTY(1, 2, maxTimelineSemaphoreValueDifference); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { VkPhysicalDeviceTransformFeedbackPropertiesEXT *props = (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; @@ -1741,11 +2026,22 @@ break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES: + anv_get_physical_device_properties_1_1(pdevice, (void *)ext); + break; + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES: + anv_get_physical_device_properties_1_2(pdevice, (void *)ext); + break; + default: anv_debug_ignored_stype(ext->sType); break; } } + +#undef CORE_RENAMED_PROPERTY +#undef CORE_PROPERTY } /* We support exactly one queue family. */ @@ -1928,6 +2224,10 @@ if (idx >= 0) return instance->dispatch.entrypoints[idx]; + idx = anv_get_physical_device_entrypoint_index(pName); + if (idx >= 0) + return instance->physical_device_dispatch.entrypoints[idx]; + idx = anv_get_device_entrypoint_index(pName); if (idx >= 0) return instance->device_dispatch.entrypoints[idx]; @@ -1967,6 +2267,31 @@ return device->dispatch.entrypoints[idx]; } +/* With version 4+ of the loader interface the ICD should expose + * vk_icdGetPhysicalDeviceProcAddr() + */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName); + +PFN_vkVoidFunction vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + + if (!pName || !instance) + return NULL; + + int idx = anv_get_physical_device_entrypoint_index(pName); + if (idx < 0) + return NULL; + + return instance->physical_device_dispatch.entrypoints[idx]; +} + + VkResult anv_CreateDebugReportCallbackEXT(VkInstance _instance, const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, @@ -2004,19 +2329,6 @@ object, location, messageCode, pLayerPrefix, pMessage); } -static void -anv_queue_init(struct anv_device *device, struct anv_queue *queue) -{ - queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC; - queue->device = device; - queue->flags = 0; -} - -static void -anv_queue_finish(struct anv_queue *queue) -{ -} - static struct anv_state anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p) { @@ -2088,35 +2400,29 @@ } } -static void +static VkResult anv_device_init_trivial_batch(struct anv_device *device) { - anv_bo_init_new(&device->trivial_batch_bo, device, 4096); - - if (device->instance->physicalDevice.has_exec_async) - device->trivial_batch_bo.flags |= EXEC_OBJECT_ASYNC; - - if (device->instance->physicalDevice.use_softpin) - device->trivial_batch_bo.flags |= EXEC_OBJECT_PINNED; - - anv_vma_alloc(device, &device->trivial_batch_bo); - - void *map = anv_gem_mmap(device, device->trivial_batch_bo.gem_handle, - 0, 4096, 0); + VkResult result = anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->trivial_batch_bo); + if (result != VK_SUCCESS) + return result; struct anv_batch batch = { - .start = map, - .next = map, - .end = map + 4096, + .start = device->trivial_batch_bo->map, + .next = device->trivial_batch_bo->map, + .end = device->trivial_batch_bo->map + 4096, }; anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe); anv_batch_emit(&batch, GEN7_MI_NOOP, noop); if (!device->info.has_llc) - gen_clflush_range(map, batch.next - map); + gen_clflush_range(batch.start, batch.next - batch.start); - anv_gem_munmap(map, device->trivial_batch_bo.size); + return VK_SUCCESS; } VkResult anv_EnumerateDeviceExtensionProperties( @@ -2142,8 +2448,13 @@ static void anv_device_init_dispatch(struct anv_device *device) { + const struct anv_instance *instance = device->physical->instance; + const struct anv_device_dispatch_table *genX_table; switch (device->info.gen) { + case 12: + genX_table = &gen12_device_dispatch_table; + break; case 11: genX_table = &gen11_device_dispatch_table; break; @@ -2170,8 +2481,8 @@ /* Vulkan requires that entrypoints for extensions which have not been * enabled must not be advertised. */ - if (!anv_device_entrypoint_is_enabled(i, device->instance->app_info.api_version, - &device->instance->enabled_extensions, + if (!anv_device_entrypoint_is_enabled(i, instance->app_info.api_version, + &instance->enabled_extensions, &device->enabled_extensions)) { device->dispatch.entrypoints[i] = NULL; } else if (genX_table->entrypoints[i]) { @@ -2200,27 +2511,25 @@ } } -static void +static VkResult anv_device_init_hiz_clear_value_bo(struct anv_device *device) { - anv_bo_init_new(&device->hiz_clear_bo, device, 4096); - - if (device->instance->physicalDevice.has_exec_async) - device->hiz_clear_bo.flags |= EXEC_OBJECT_ASYNC; - - if (device->instance->physicalDevice.use_softpin) - device->hiz_clear_bo.flags |= EXEC_OBJECT_PINNED; - - anv_vma_alloc(device, &device->hiz_clear_bo); - - uint32_t *map = anv_gem_mmap(device, device->hiz_clear_bo.gem_handle, - 0, 4096, 0); + VkResult result = anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->hiz_clear_bo); + if (result != VK_SUCCESS) + return result; union isl_color_value hiz_clear = { .u32 = { 0, } }; hiz_clear.f32[0] = ANV_HZ_FC_VAL; - memcpy(map, hiz_clear.u32, sizeof(hiz_clear.u32)); - anv_gem_munmap(map, device->hiz_clear_bo.size); + memcpy(device->hiz_clear_bo->map, hiz_clear.u32, sizeof(hiz_clear.u32)); + + if (!device->info.has_llc) + gen_clflush_range(device->hiz_clear_bo->map, sizeof(hiz_clear.u32)); + + return VK_SUCCESS; } static bool @@ -2228,14 +2537,13 @@ struct anv_block_pool *pool, uint64_t address) { - for (uint32_t i = 0; i < pool->nbos; i++) { - uint64_t bo_address = pool->bos[i].offset & (~0ull >> 16); - uint32_t bo_size = pool->bos[i].size; - if (address >= bo_address && address < (bo_address + bo_size)) { + anv_block_pool_foreach_bo(bo, pool) { + uint64_t bo_address = gen_48b_address(bo->offset); + if (address >= bo_address && address < (bo_address + bo->size)) { *ret = (struct gen_batch_decode_bo) { .addr = bo_address, - .size = bo_size, - .map = pool->bos[i].map, + .size = bo->size, + .map = bo->map, }; return true; } @@ -2268,13 +2576,13 @@ u_vector_foreach(bo, &device->cmd_buffer_being_decoded->seen_bbos) { /* The decoder zeroes out the top 16 bits, so we need to as well */ - uint64_t bo_address = (*bo)->bo.offset & (~0ull >> 16); + uint64_t bo_address = (*bo)->bo->offset & (~0ull >> 16); - if (address >= bo_address && address < bo_address + (*bo)->bo.size) { + if (address >= bo_address && address < bo_address + (*bo)->bo->size) { return (struct gen_batch_decode_bo) { .addr = bo_address, - .size = (*bo)->bo.size, - .map = (*bo)->bo.map, + .size = (*bo)->bo->size, + .map = (*bo)->bo->map, }; } } @@ -2282,6 +2590,64 @@ return (struct gen_batch_decode_bo) { }; } +struct gen_aux_map_buffer { + struct gen_buffer base; + struct anv_state state; +}; + +static struct gen_buffer * +gen_aux_map_buffer_alloc(void *driver_ctx, uint32_t size) +{ + struct gen_aux_map_buffer *buf = malloc(sizeof(struct gen_aux_map_buffer)); + if (!buf) + return NULL; + + struct anv_device *device = (struct anv_device*)driver_ctx; + assert(device->physical->supports_48bit_addresses && + device->physical->use_softpin); + + struct anv_state_pool *pool = &device->dynamic_state_pool; + buf->state = anv_state_pool_alloc(pool, size, size); + + buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset; + buf->base.gpu_end = buf->base.gpu + buf->state.alloc_size; + buf->base.map = buf->state.map; + buf->base.driver_bo = &buf->state; + return &buf->base; +} + +static void +gen_aux_map_buffer_free(void *driver_ctx, struct gen_buffer *buffer) +{ + struct gen_aux_map_buffer *buf = (struct gen_aux_map_buffer*)buffer; + struct anv_device *device = (struct anv_device*)driver_ctx; + struct anv_state_pool *pool = &device->dynamic_state_pool; + anv_state_pool_free(pool, buf->state); + free(buf); +} + +static struct gen_mapped_pinned_buffer_alloc aux_map_allocator = { + .alloc = gen_aux_map_buffer_alloc, + .free = gen_aux_map_buffer_free, +}; + +static VkResult +check_physical_device_features(VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceFeatures *features) +{ + VkPhysicalDeviceFeatures supported_features; + anv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); + VkBool32 *supported_feature = (VkBool32 *)&supported_features; + VkBool32 *enabled_feature = (VkBool32 *)features; + unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); + for (uint32_t i = 0; i < num_features; i++) { + if (enabled_feature[i] && !supported_feature[i]) + return vk_error(VK_ERROR_FEATURE_NOT_PRESENT); + } + + return VK_SUCCESS; +} + VkResult anv_CreateDevice( VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, @@ -2313,15 +2679,34 @@ } /* Check enabled features */ + bool robust_buffer_access = false; if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - anv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(VK_ERROR_FEATURE_NOT_PRESENT); + result = check_physical_device_features(physicalDevice, + pCreateInfo->pEnabledFeatures); + if (result != VK_SUCCESS) + return result; + + if (pCreateInfo->pEnabledFeatures->robustBufferAccess) + robust_buffer_access = true; + } + + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: { + const VkPhysicalDeviceFeatures2 *features = (const void *)ext; + result = check_physical_device_features(physicalDevice, + &features->features); + if (result != VK_SUCCESS) + return result; + + if (features->features.robustBufferAccess) + robust_buffer_access = true; + break; + } + + default: + /* Don't warn */ + break; } } @@ -2363,8 +2748,7 @@ } device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; - device->instance = physical_device->instance; - device->chipset_id = physical_device->chipset_id; + device->physical = physical_device; device->no_hw = physical_device->no_hw; device->_lost = false; @@ -2386,23 +2770,30 @@ goto fail_fd; } + result = anv_queue_init(device, &device->queue); + if (result != VK_SUCCESS) + goto fail_context_id; + if (physical_device->use_softpin) { if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) { result = vk_error(VK_ERROR_INITIALIZATION_FAILED); - goto fail_context_id; + goto fail_queue; } /* keep the page with address zero out of the allocator */ - struct anv_memory_heap *low_heap = - &physical_device->memory.heaps[physical_device->memory.heap_count - 1]; - util_vma_heap_init(&device->vma_lo, low_heap->vma_start, low_heap->vma_size); - device->vma_lo_available = low_heap->size; - - struct anv_memory_heap *high_heap = - &physical_device->memory.heaps[0]; - util_vma_heap_init(&device->vma_hi, high_heap->vma_start, high_heap->vma_size); - device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 : - high_heap->size; + util_vma_heap_init(&device->vma_lo, + LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); + + util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS, + CLIENT_VISIBLE_HEAP_SIZE); + + /* Leave the last 4GiB out of the high vma range, so that no state + * base address + size can overflow 48 bits. For more information see + * the comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, + physical_device->gtt_size - (1ull << 32) - + HIGH_HEAP_MIN_ADDRESS); } list_inithead(&device->memory_objects); @@ -2432,15 +2823,14 @@ */ device->can_chain_batches = device->info.gen >= 8; - device->robust_buffer_access = pCreateInfo->pEnabledFeatures && - pCreateInfo->pEnabledFeatures->robustBufferAccess; + device->robust_buffer_access = robust_buffer_access; device->enabled_extensions = enabled_extensions; anv_device_init_dispatch(device); if (pthread_mutex_init(&device->mutex, NULL) != 0) { result = vk_error(VK_ERROR_INITIALIZATION_FAILED); - goto fail_context_id; + goto fail_queue; } pthread_condattr_t condattr; @@ -2460,70 +2850,59 @@ } pthread_condattr_destroy(&condattr); - uint64_t bo_flags = - (physical_device->supports_48bit_addresses ? EXEC_OBJECT_SUPPORTS_48B_ADDRESS : 0) | - (physical_device->has_exec_async ? EXEC_OBJECT_ASYNC : 0) | - (physical_device->has_exec_capture ? EXEC_OBJECT_CAPTURE : 0) | - (physical_device->use_softpin ? EXEC_OBJECT_PINNED : 0); - - anv_bo_pool_init(&device->batch_bo_pool, device, bo_flags); - result = anv_bo_cache_init(&device->bo_cache); if (result != VK_SUCCESS) - goto fail_batch_bo_pool; + goto fail_queue_cond; - if (!physical_device->use_softpin) - bo_flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + anv_bo_pool_init(&device->batch_bo_pool, device); result = anv_state_pool_init(&device->dynamic_state_pool, device, - DYNAMIC_STATE_POOL_MIN_ADDRESS, - 16384, - bo_flags); + DYNAMIC_STATE_POOL_MIN_ADDRESS, 16384); if (result != VK_SUCCESS) - goto fail_bo_cache; + goto fail_batch_bo_pool; result = anv_state_pool_init(&device->instruction_state_pool, device, - INSTRUCTION_STATE_POOL_MIN_ADDRESS, - 16384, - bo_flags); + INSTRUCTION_STATE_POOL_MIN_ADDRESS, 16384); if (result != VK_SUCCESS) goto fail_dynamic_state_pool; result = anv_state_pool_init(&device->surface_state_pool, device, - SURFACE_STATE_POOL_MIN_ADDRESS, - 4096, - bo_flags); + SURFACE_STATE_POOL_MIN_ADDRESS, 4096); if (result != VK_SUCCESS) goto fail_instruction_state_pool; if (physical_device->use_softpin) { result = anv_state_pool_init(&device->binding_table_pool, device, - BINDING_TABLE_POOL_MIN_ADDRESS, - 4096, - bo_flags); + BINDING_TABLE_POOL_MIN_ADDRESS, 4096); if (result != VK_SUCCESS) goto fail_surface_state_pool; } - result = anv_bo_init_new(&device->workaround_bo, device, 4096); - if (result != VK_SUCCESS) - goto fail_binding_table_pool; + if (device->info.gen >= 12) { + device->aux_map_ctx = gen_aux_map_init(device, &aux_map_allocator, + &physical_device->info); + if (!device->aux_map_ctx) + goto fail_binding_table_pool; + } - if (physical_device->use_softpin) - device->workaround_bo.flags |= EXEC_OBJECT_PINNED; + result = anv_device_alloc_bo(device, 4096, 0 /* flags */, + 0 /* explicit_address */, + &device->workaround_bo); + if (result != VK_SUCCESS) + goto fail_surface_aux_map_pool; - if (!anv_vma_alloc(device, &device->workaround_bo)) + result = anv_device_init_trivial_batch(device); + if (result != VK_SUCCESS) goto fail_workaround_bo; - anv_device_init_trivial_batch(device); - - if (device->info.gen >= 10) - anv_device_init_hiz_clear_value_bo(device); + if (device->info.gen >= 10) { + result = anv_device_init_hiz_clear_value_bo(device); + if (result != VK_SUCCESS) + goto fail_trivial_batch_bo; + } anv_scratch_pool_init(device, &device->scratch_pool); - anv_queue_init(device, &device->queue); - switch (device->info.gen) { case 7: if (!device->info.is_haswell) @@ -2543,6 +2922,9 @@ case 11: result = gen11_init_device_state(device); break; + case 12: + result = gen12_init_device_state(device); + break; default: /* Shouldn't get here as we don't create physical devices for any other * gens. */ @@ -2557,15 +2939,24 @@ anv_device_init_border_colors(device); + anv_device_perf_init(device); + *pDevice = anv_device_to_handle(device); return VK_SUCCESS; fail_workaround_bo: - anv_queue_finish(&device->queue); anv_scratch_pool_finish(device, &device->scratch_pool); - anv_gem_munmap(device->workaround_bo.map, device->workaround_bo.size); - anv_gem_close(device, device->workaround_bo.gem_handle); + if (device->info.gen >= 10) + anv_device_release_bo(device, device->hiz_clear_bo); + anv_device_release_bo(device, device->workaround_bo); + fail_trivial_batch_bo: + anv_device_release_bo(device, device->trivial_batch_bo); + fail_surface_aux_map_pool: + if (device->info.gen >= 12) { + gen_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } fail_binding_table_pool: if (physical_device->use_softpin) anv_state_pool_finish(&device->binding_table_pool); @@ -2575,18 +2966,21 @@ anv_state_pool_finish(&device->instruction_state_pool); fail_dynamic_state_pool: anv_state_pool_finish(&device->dynamic_state_pool); - fail_bo_cache: - anv_bo_cache_finish(&device->bo_cache); fail_batch_bo_pool: anv_bo_pool_finish(&device->batch_bo_pool); + anv_bo_cache_finish(&device->bo_cache); + fail_queue_cond: pthread_cond_destroy(&device->queue_submit); fail_mutex: pthread_mutex_destroy(&device->mutex); fail_vmas: if (physical_device->use_softpin) { util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); util_vma_heap_finish(&device->vma_lo); } + fail_queue: + anv_queue_finish(&device->queue); fail_context_id: anv_gem_destroy_context(device, device->context_id); fail_fd: @@ -2602,13 +2996,10 @@ const VkAllocationCallbacks* pAllocator) { ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *physical_device; if (!device) return; - physical_device = &device->instance->physicalDevice; - anv_device_finish_blorp(device); anv_pipeline_cache_finish(&device->default_pipeline_cache); @@ -2625,27 +3016,29 @@ anv_scratch_pool_finish(device, &device->scratch_pool); - anv_gem_munmap(device->workaround_bo.map, device->workaround_bo.size); - anv_vma_free(device, &device->workaround_bo); - anv_gem_close(device, device->workaround_bo.gem_handle); - - anv_vma_free(device, &device->trivial_batch_bo); - anv_gem_close(device, device->trivial_batch_bo.gem_handle); + anv_device_release_bo(device, device->workaround_bo); + anv_device_release_bo(device, device->trivial_batch_bo); if (device->info.gen >= 10) - anv_gem_close(device, device->hiz_clear_bo.gem_handle); + anv_device_release_bo(device, device->hiz_clear_bo); - if (physical_device->use_softpin) + if (device->info.gen >= 12) { + gen_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } + + if (device->physical->use_softpin) anv_state_pool_finish(&device->binding_table_pool); anv_state_pool_finish(&device->surface_state_pool); anv_state_pool_finish(&device->instruction_state_pool); anv_state_pool_finish(&device->dynamic_state_pool); - anv_bo_cache_finish(&device->bo_cache); - anv_bo_pool_finish(&device->batch_bo_pool); - if (physical_device->use_softpin) { + anv_bo_cache_finish(&device->bo_cache); + + if (device->physical->use_softpin) { util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); util_vma_heap_finish(&device->vma_lo); } @@ -2695,11 +3088,15 @@ uint32_t queueIndex, VkQueue* pQueue) { - ANV_FROM_HANDLE(anv_device, device, _device); - - assert(queueIndex == 0); + const VkDeviceQueueInfo2 info = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_INFO_2, + .pNext = NULL, + .flags = 0, + .queueFamilyIndex = queueNodeIndex, + .queueIndex = queueIndex, + }; - *pQueue = anv_queue_to_handle(&device->queue); + anv_GetDeviceQueue2(_device, &info, pQueue); } void anv_GetDeviceQueue2( @@ -2725,10 +3122,32 @@ VkResult err; va_list ap; - device->_lost = true; + p_atomic_inc(&device->_lost); va_start(ap, msg); - err = __vk_errorv(device->instance, device, + err = __vk_errorv(device->physical->instance, device, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, + VK_ERROR_DEVICE_LOST, file, line, msg, ap); + va_end(ap); + + if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false)) + abort(); + + return err; +} + +VkResult +_anv_queue_set_lost(struct anv_queue *queue, + const char *file, int line, + const char *msg, ...) +{ + VkResult err; + va_list ap; + + p_atomic_inc(&queue->device->_lost); + + va_start(ap, msg); + err = __vk_errorv(queue->device->physical->instance, queue->device, VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, VK_ERROR_DEVICE_LOST, file, line, msg, ap); va_end(ap); @@ -2813,94 +3232,71 @@ VkDevice _device) { ANV_FROM_HANDLE(anv_device, device, _device); + if (anv_device_is_lost(device)) return VK_ERROR_DEVICE_LOST; - struct anv_batch batch; - - uint32_t cmds[8]; - batch.start = batch.next = cmds; - batch.end = (void *) cmds + sizeof(cmds); - - anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe); - anv_batch_emit(&batch, GEN7_MI_NOOP, noop); - - return anv_device_submit_simple_batch(device, &batch); + return anv_queue_submit_simple_batch(&device->queue, NULL); } -bool -anv_vma_alloc(struct anv_device *device, struct anv_bo *bo) +uint64_t +anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address) { - if (!(bo->flags & EXEC_OBJECT_PINNED)) - return true; - pthread_mutex_lock(&device->vma_mutex); - bo->offset = 0; + uint64_t addr = 0; - if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS && - device->vma_hi_available >= bo->size) { - uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096); - if (addr) { - bo->offset = gen_canonical_address(addr); - assert(addr == gen_48b_address(bo->offset)); - device->vma_hi_available -= bo->size; + if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) { + if (client_address) { + if (util_vma_heap_alloc_addr(&device->vma_cva, + client_address, size)) { + addr = client_address; + } + } else { + addr = util_vma_heap_alloc(&device->vma_cva, size, align); } + /* We don't want to fall back to other heaps */ + goto done; } - if (bo->offset == 0 && device->vma_lo_available >= bo->size) { - uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096); - if (addr) { - bo->offset = gen_canonical_address(addr); - assert(addr == gen_48b_address(bo->offset)); - device->vma_lo_available -= bo->size; - } - } + assert(client_address == 0); + + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)) + addr = util_vma_heap_alloc(&device->vma_hi, size, align); + + if (addr == 0) + addr = util_vma_heap_alloc(&device->vma_lo, size, align); +done: pthread_mutex_unlock(&device->vma_mutex); - return bo->offset != 0; + assert(addr == gen_48b_address(addr)); + return gen_canonical_address(addr); } void -anv_vma_free(struct anv_device *device, struct anv_bo *bo) +anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size) { - if (!(bo->flags & EXEC_OBJECT_PINNED)) - return; - - const uint64_t addr_48b = gen_48b_address(bo->offset); + const uint64_t addr_48b = gen_48b_address(address); pthread_mutex_lock(&device->vma_mutex); if (addr_48b >= LOW_HEAP_MIN_ADDRESS && addr_48b <= LOW_HEAP_MAX_ADDRESS) { - util_vma_heap_free(&device->vma_lo, addr_48b, bo->size); - device->vma_lo_available += bo->size; + util_vma_heap_free(&device->vma_lo, addr_48b, size); + } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS && + addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_cva, addr_48b, size); } else { - ASSERTED const struct anv_physical_device *physical_device = - &device->instance->physicalDevice; - assert(addr_48b >= physical_device->memory.heaps[0].vma_start && - addr_48b < (physical_device->memory.heaps[0].vma_start + - physical_device->memory.heaps[0].vma_size)); - util_vma_heap_free(&device->vma_hi, addr_48b, bo->size); - device->vma_hi_available += bo->size; + assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS); + util_vma_heap_free(&device->vma_hi, addr_48b, size); } pthread_mutex_unlock(&device->vma_mutex); - - bo->offset = 0; -} - -VkResult -anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size) -{ - uint32_t gem_handle = anv_gem_create(device, size); - if (!gem_handle) - return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); - - anv_bo_init(bo, gem_handle, size); - - return VK_SUCCESS; } VkResult anv_AllocateMemory( @@ -2910,7 +3306,7 @@ VkDeviceMemory* pMem) { ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; + struct anv_physical_device *pdevice = device->physical; struct anv_device_memory *mem; VkResult result = VK_SUCCESS; @@ -2919,10 +3315,22 @@ /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */ assert(pAllocateInfo->allocationSize > 0); - if (pAllocateInfo->allocationSize > MAX_MEMORY_ALLOCATION_SIZE) - return VK_ERROR_OUT_OF_DEVICE_MEMORY; + VkDeviceSize aligned_alloc_size = + align_u64(pAllocateInfo->allocationSize, 4096); + + if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE) + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + + assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count); + struct anv_memory_type *mem_type = + &pdevice->memory.types[pAllocateInfo->memoryTypeIndex]; + assert(mem_type->heapIndex < pdevice->memory.heap_count); + struct anv_memory_heap *mem_heap = + &pdevice->memory.heaps[mem_type->heapIndex]; - /* FINISHME: Fail if allocation request exceeds heap size. */ + uint64_t mem_heap_used = p_atomic_read(&mem_heap->used); + if (mem_heap_used + aligned_alloc_size > mem_heap->size) + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -2930,35 +3338,83 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count); - mem->type = &pdevice->memory.types[pAllocateInfo->memoryTypeIndex]; + mem->type = mem_type; mem->map = NULL; mem->map_size = 0; mem->ahw = NULL; mem->host_ptr = NULL; - uint64_t bo_flags = 0; + enum anv_bo_alloc_flags alloc_flags = 0; - assert(mem->type->heapIndex < pdevice->memory.heap_count); - if (pdevice->memory.heaps[mem->type->heapIndex].supports_48bit_addresses) - bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - - const struct wsi_memory_allocate_info *wsi_info = - vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA); - if (wsi_info && wsi_info->implicit_sync) { - /* We need to set the WRITE flag on window system buffers so that GEM - * will know we're writing to them and synchronize uses on other rings - * (eg if the display server uses the blitter ring). - */ - bo_flags |= EXEC_OBJECT_WRITE; - } else if (pdevice->has_exec_async) { - bo_flags |= EXEC_OBJECT_ASYNC; - } + const VkExportMemoryAllocateInfo *export_info = NULL; + const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL; + const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL; + const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL; + VkMemoryAllocateFlags vk_flags = 0; + uint64_t client_address = 0; + + vk_foreach_struct_const(ext, pAllocateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO: + export_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID: + ahw_import_info = (void *)ext; + break; - if (pdevice->use_softpin) - bo_flags |= EXEC_OBJECT_PINNED; + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR: + fd_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT: + host_ptr_info = (void *)ext; + break; - const VkExportMemoryAllocateInfo *export_info = - vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: { + const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext; + vk_flags = flags_info->flags; + break; + } + + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: + dedicated_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR: { + const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *addr_info = + (const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *)ext; + client_address = addr_info->opaqueCaptureAddress; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + + /* By default, we want all VkDeviceMemory objects to support CCS */ + if (device->physical->has_implicit_ccs) + alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS; + + if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR) + alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS; + + if ((export_info && export_info->handleTypes) || + (fd_info && fd_info->handleType) || + (host_ptr_info && host_ptr_info->handleType)) { + /* Anything imported or exported is EXTERNAL */ + alloc_flags |= ANV_BO_ALLOC_EXTERNAL; + + /* We can't have implicit CCS on external memory with an AUX-table. + * Doing so would require us to sync the aux tables across processes + * which is impractical. + */ + if (device->info.has_aux_map) + alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS; + } /* Check if we need to support Android HW buffer export. If so, * create AHardwareBuffer and import memory from it. @@ -2968,11 +3424,6 @@ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) android_export = true; - /* Android memory import. */ - const struct VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = - vk_find_struct_const(pAllocateInfo->pNext, - IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID); - if (ahw_import_info) { result = anv_import_ahw_memory(_device, mem, ahw_import_info); if (result != VK_SUCCESS) @@ -2984,7 +3435,7 @@ if (result != VK_SUCCESS) goto fail; - const struct VkImportAndroidHardwareBufferInfoANDROID import_info = { + const VkImportAndroidHardwareBufferInfoANDROID import_info = { .buffer = mem->ahw, }; result = anv_import_ahw_memory(_device, mem, &import_info); @@ -2994,9 +3445,6 @@ goto success; } - const VkImportMemoryFdInfoKHR *fd_info = - vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); - /* The Vulkan spec permits handleType to be 0, in which case the struct is * ignored. */ @@ -3007,14 +3455,11 @@ fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - result = anv_bo_cache_import(device, &device->bo_cache, fd_info->fd, - bo_flags | ANV_BO_EXTERNAL, &mem->bo); + result = anv_device_import_bo(device, fd_info->fd, alloc_flags, + client_address, &mem->bo); if (result != VK_SUCCESS) goto fail; - VkDeviceSize aligned_alloc_size = - align_u64(pAllocateInfo->allocationSize, 4096); - /* For security purposes, we reject importing the bo if it's smaller * than the requested allocation size. This prevents a malicious client * from passing a buffer to a trusted client, lying about the size, and @@ -3024,13 +3469,12 @@ * this sort of attack but only if it can trust the buffer size. */ if (mem->bo->size < aligned_alloc_size) { - result = vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + result = vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "aligned allocationSize too large for " "VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: " "%"PRIu64"B > %"PRIu64"B", aligned_alloc_size, mem->bo->size); - anv_bo_cache_release(device, &device->bo_cache, mem->bo); + anv_device_release_bo(device, mem->bo); goto fail; } @@ -3047,9 +3491,6 @@ goto success; } - const VkImportMemoryHostPointerInfoEXT *host_ptr_info = - vk_find_struct_const(pAllocateInfo->pNext, - IMPORT_MEMORY_HOST_POINTER_INFO_EXT); if (host_ptr_info && host_ptr_info->handleType) { if (host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) { @@ -3060,10 +3501,12 @@ assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT); - result = anv_bo_cache_import_host_ptr( - device, &device->bo_cache, host_ptr_info->pHostPointer, - pAllocateInfo->allocationSize, bo_flags, &mem->bo); - + result = anv_device_import_bo_from_host_ptr(device, + host_ptr_info->pHostPointer, + pAllocateInfo->allocationSize, + alloc_flags, + client_address, + &mem->bo); if (result != VK_SUCCESS) goto fail; @@ -3073,17 +3516,11 @@ /* Regular allocate (not importing memory). */ - if (export_info && export_info->handleTypes) - bo_flags |= ANV_BO_EXTERNAL; - - result = anv_bo_cache_alloc(device, &device->bo_cache, - pAllocateInfo->allocationSize, bo_flags, - &mem->bo); + result = anv_device_alloc_bo(device, pAllocateInfo->allocationSize, + alloc_flags, client_address, &mem->bo); if (result != VK_SUCCESS) goto fail; - const VkMemoryDedicatedAllocateInfo *dedicated_info = - vk_find_struct_const(pAllocateInfo->pNext, MEMORY_DEDICATED_ALLOCATE_INFO); if (dedicated_info && dedicated_info->image != VK_NULL_HANDLE) { ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); @@ -3097,9 +3534,8 @@ image->planes[0].surface.isl.row_pitch_B, i915_tiling); if (ret) { - anv_bo_cache_release(device, &device->bo_cache, mem->bo); - result = vk_errorf(device->instance, NULL, - VK_ERROR_OUT_OF_DEVICE_MEMORY, + anv_device_release_bo(device, mem->bo); + result = vk_errorf(device, device, VK_ERROR_OUT_OF_DEVICE_MEMORY, "failed to set BO tiling: %m"); goto fail; } @@ -3107,15 +3543,21 @@ } success: + mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); + if (mem_heap_used > mem_heap->size) { + p_atomic_add(&mem_heap->used, -mem->bo->size); + anv_device_release_bo(device, mem->bo); + result = vk_errorf(device, device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + goto fail; + } + pthread_mutex_lock(&device->mutex); list_addtail(&mem->link, &device->memory_objects); pthread_mutex_unlock(&device->mutex); *pMem = anv_device_memory_to_handle(mem); - p_atomic_add(&pdevice->memory.heaps[mem->type->heapIndex].used, - mem->bo->size); - return VK_SUCCESS; fail: @@ -3137,7 +3579,7 @@ assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - return anv_bo_cache_export(dev, &dev->bo_cache, mem->bo, pFd); + return anv_device_export_bo(dev, mem->bo, pFd); } VkResult anv_GetMemoryFdPropertiesKHR( @@ -3147,13 +3589,12 @@ VkMemoryFdPropertiesKHR* pMemoryFdProperties) { ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; switch (handleType) { case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: /* dma-buf can be imported as any memory type */ pMemoryFdProperties->memoryTypeBits = - (1 << pdevice->memory.type_count) - 1; + (1 << device->physical->memory.type_count) - 1; return VK_SUCCESS; default: @@ -3180,15 +3621,13 @@ VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT); switch (handleType) { - case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: { - struct anv_physical_device *pdevice = &device->instance->physicalDevice; - + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: /* Host memory can be imported as any memory type. */ pMemoryHostPointerProperties->memoryTypeBits = - (1ull << pdevice->memory.type_count) - 1; + (1ull << device->physical->memory.type_count) - 1; return VK_SUCCESS; - } + default: return VK_ERROR_INVALID_EXTERNAL_HANDLE; } @@ -3201,7 +3640,6 @@ { ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_device_memory, mem, _mem); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; if (mem == NULL) return; @@ -3213,10 +3651,10 @@ if (mem->map) anv_UnmapMemory(_device, _mem); - p_atomic_add(&pdevice->memory.heaps[mem->type->heapIndex].used, + p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used, -mem->bo->size); - anv_bo_cache_release(device, &device->bo_cache, mem->bo); + anv_device_release_bo(device, mem->bo); #if defined(ANDROID) && ANDROID_API_LEVEL >= 26 if (mem->ahw) @@ -3366,7 +3804,6 @@ { ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; /* The Vulkan spec (git aaed022) says: * @@ -3375,12 +3812,7 @@ * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties * structure for the physical device is supported. */ - uint32_t memory_types = 0; - for (uint32_t i = 0; i < pdevice->memory.type_count; i++) { - uint32_t valid_usage = pdevice->memory.types[i].valid_buffer_usage; - if ((valid_usage & buffer->usage) == buffer->usage) - memory_types |= (1u << i); - } + uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1; /* Base alignment requirement of a cache line */ uint32_t alignment = 16; @@ -3436,7 +3868,6 @@ { ANV_FROM_HANDLE(anv_image, image, _image); ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; /* The Vulkan spec (git aaed022) says: * @@ -3447,7 +3878,7 @@ * * All types are currently supported for images. */ - uint32_t memory_types = (1ull << pdevice->memory.type_count) - 1; + uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1; /* We must have image allocated or imported at this point. According to the * specification, external images must have been bound to memory before @@ -3474,7 +3905,6 @@ vk_foreach_struct_const(ext, pInfo->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: { - struct anv_physical_device *pdevice = &device->instance->physicalDevice; const VkImagePlaneMemoryRequirementsInfo *plane_reqs = (const VkImagePlaneMemoryRequirementsInfo *) ext; uint32_t plane = anv_image_aspect_to_plane(image->aspects, @@ -3493,7 +3923,7 @@ * All types are currently supported for images. */ pMemoryRequirements->memoryRequirements.memoryTypeBits = - (1ull << pdevice->memory.type_count) - 1; + (1ull << device->physical->memory.type_count) - 1; /* We must have image allocated or imported at this point. According to the * specification, external images must have been bound to memory before @@ -3574,7 +4004,6 @@ assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO); if (mem) { - assert((buffer->usage & mem->type->valid_buffer_usage) == buffer->usage); buffer->address = (struct anv_address) { .bo = mem->bo, .offset = pBindInfo->memoryOffset, @@ -3737,6 +4166,14 @@ ANV_FROM_HANDLE(anv_device, device, _device); struct anv_buffer *buffer; + /* Don't allow creating buffers bigger than our address space. The real + * issue here is that we may align up the buffer size and we don't want + * doing so to cause roll-over. However, no one has any business + * allocating a buffer larger than our GTT size. + */ + if (pCreateInfo->size > device->physical->gtt_size) + return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO); buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8, @@ -3767,17 +4204,37 @@ vk_free2(&device->alloc, pAllocator, buffer); } -VkDeviceAddress anv_GetBufferDeviceAddressEXT( +VkDeviceAddress anv_GetBufferDeviceAddress( VkDevice device, - const VkBufferDeviceAddressInfoEXT* pInfo) + const VkBufferDeviceAddressInfoKHR* pInfo) { ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer); + assert(!anv_address_is_null(buffer->address)); assert(buffer->address.bo->flags & EXEC_OBJECT_PINNED); return anv_address_physical(buffer->address); } +uint64_t anv_GetBufferOpaqueCaptureAddress( + VkDevice device, + const VkBufferDeviceAddressInfoKHR* pInfo) +{ + return 0; +} + +uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory); + + assert(memory->bo->flags & EXEC_OBJECT_PINNED); + assert(memory->bo->has_client_visible_address); + + return gen_48b_address(memory->bo->offset); +} + void anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state, enum isl_format format, @@ -3786,7 +4243,7 @@ { isl_buffer_fill_state(&device->isl_dev, state.map, .address = anv_address_physical(address), - .mocs = device->default_mocs, + .mocs = device->isl_dev.mocs.internal, .size_B = range, .format = format, .swizzle = ISL_SWIZZLE_IDENTITY, @@ -3843,7 +4300,6 @@ } framebuffer->attachment_count = pCreateInfo->attachmentCount; } else { - assert(device->enabled_extensions.KHR_imageless_framebuffer); framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (framebuffer == NULL) @@ -4041,7 +4497,10 @@ * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, * because the loader no longer does so. + * + * - Loader interface v4 differs from v3 in: + * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). */ - *pSupportedVersion = MIN2(*pSupportedVersion, 3u); + *pSupportedVersion = MIN2(*pSupportedVersion, 4u); return VK_SUCCESS; } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_entrypoints_gen.py mesa-20.0.8/src/intel/vulkan/anv_entrypoints_gen.py --- mesa-19.2.8/src/intel/vulkan/anv_entrypoints_gen.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_entrypoints_gen.py 2020-06-12 01:21:17.000000000 +0000 @@ -45,6 +45,7 @@ 'gen9', 'gen10', 'gen11', + 'gen12', ] TEMPLATE_H = Template("""\ @@ -69,6 +70,25 @@ }; }; +struct anv_physical_device_dispatch_table { + union { + void *entrypoints[${len(physical_device_entrypoints)}]; + struct { + % for e in physical_device_entrypoints: + % if e.guard is not None: +#ifdef ${e.guard} + PFN_${e.name} ${e.name}; +#else + void *${e.name}; +# endif + % else: + PFN_${e.name} ${e.name}; + % endif + % endfor + }; + }; +}; + struct anv_device_dispatch_table { union { void *entrypoints[${len(device_entrypoints)}]; @@ -90,11 +110,14 @@ extern const struct anv_instance_dispatch_table anv_instance_dispatch_table; %for layer in LAYERS: +extern const struct anv_physical_device_dispatch_table ${layer}_physical_device_dispatch_table; +%endfor +%for layer in LAYERS: extern const struct anv_device_dispatch_table ${layer}_device_dispatch_table; %endfor % for e in instance_entrypoints: - % if e.alias: + % if e.alias and e.alias.enabled: <% continue %> % endif % if e.guard is not None: @@ -106,7 +129,7 @@ % endif % endfor -% for e in device_entrypoints: +% for e in physical_device_entrypoints: % if e.alias: <% continue %> % endif @@ -120,6 +143,21 @@ #endif // ${e.guard} % endif % endfor + +% for e in device_entrypoints: + % if e.alias and e.alias.enabled: + <% continue %> + % endif + % if e.guard is not None: +#ifdef ${e.guard} + % endif + % for layer in LAYERS: + ${e.return_type} ${e.prefixed_name(layer)}(${e.decl_params()}); + % endfor + % if e.guard is not None: +#endif // ${e.guard} + % endif +% endfor """, output_encoding='utf-8') TEMPLATE_C = Template(u"""\ @@ -150,6 +188,8 @@ #include "anv_private.h" +#include "util/macros.h" + struct string_map_entry { uint32_t name; uint32_t hash; @@ -216,9 +256,20 @@ return -1; } + +static const char * +${prefix}_entry_name(int num) +{ + for (int i = 0; i < ARRAY_SIZE(${prefix}_string_map_entries); i++) { + if (${prefix}_string_map_entries[i].num == num) + return &${prefix}_strings[${prefix}_string_map_entries[i].name]; + } + return NULL; +} ${strmap(instance_strmap, 'instance')} +${strmap(physical_device_strmap, 'physical_device')} ${strmap(device_strmap, 'device')} /* Weak aliases for all potential implementations. These will resolve to @@ -227,7 +278,7 @@ */ % for e in instance_entrypoints: - % if e.alias: + % if e.alias and e.alias.enabled: <% continue %> % endif % if e.guard is not None: @@ -251,9 +302,35 @@ % endfor }; +% for e in physical_device_entrypoints: + % if e.alias and e.alias.enabled: + <% continue %> + % endif + % if e.guard is not None: +#ifdef ${e.guard} + % endif + ${e.return_type} ${e.prefixed_name('anv')}(${e.decl_params()}) __attribute__ ((weak)); + % if e.guard is not None: +#endif // ${e.guard} + % endif +% endfor + +const struct anv_physical_device_dispatch_table anv_physical_device_dispatch_table = { +% for e in physical_device_entrypoints: + % if e.guard is not None: +#ifdef ${e.guard} + % endif + .${e.name} = ${e.prefixed_name('anv')}, + % if e.guard is not None: +#endif // ${e.guard} + % endif +% endfor +}; + + % for layer in LAYERS: % for e in device_entrypoints: - % if e.alias: + % if e.alias and e.alias.enabled: <% continue %> % endif % if e.guard is not None: @@ -338,6 +415,40 @@ * If device is NULL, all device extensions are considered enabled. */ bool +anv_physical_device_entrypoint_is_enabled(int index, uint32_t core_version, + const struct anv_instance_extension_table *instance) +{ + switch (index) { +% for e in physical_device_entrypoints: + case ${e.num}: + /* ${e.name} */ + % if e.core_version: + return ${e.core_version.c_vk_version()} <= core_version; + % elif e.extensions: + % for ext in e.extensions: + % if ext.type == 'instance': + if (instance->${ext.name[3:]}) return true; + % else: + /* All device extensions are considered enabled at the instance level */ + return true; + % endif + % endfor + return false; + % else: + return true; + % endif +% endfor + default: + return false; + } +} + +/** Return true if the core version or extension in which the given entrypoint + * is defined is enabled. + * + * If device is NULL, all device extensions are considered enabled. + */ +bool anv_device_entrypoint_is_enabled(int index, uint32_t core_version, const struct anv_instance_extension_table *instance, const struct anv_device_extension_table *device) @@ -373,16 +484,43 @@ } int +anv_get_physical_device_entrypoint_index(const char *name) +{ + return physical_device_string_map_lookup(name); +} + +int anv_get_device_entrypoint_index(const char *name) { return device_string_map_lookup(name); } +const char * +anv_get_instance_entry_name(int index) +{ + return instance_entry_name(index); +} + +const char * +anv_get_physical_device_entry_name(int index) +{ + return physical_device_entry_name(index); +} + +const char * +anv_get_device_entry_name(int index) +{ + return device_entry_name(index); +} + static void * __attribute__ ((noinline)) anv_resolve_device_entrypoint(const struct gen_device_info *devinfo, uint32_t index) { const struct anv_device_dispatch_table *genX_table; switch (devinfo->gen) { + case 12: + genX_table = &gen12_device_dispatch_table; + break; case 11: genX_table = &gen11_device_dispatch_table; break; @@ -418,6 +556,10 @@ if (idx >= 0) return anv_instance_dispatch_table.entrypoints[idx]; + idx = anv_get_physical_device_entrypoint_index(name); + if (idx >= 0) + return anv_physical_device_dispatch_table.entrypoints[idx]; + idx = anv_get_device_entrypoint_index(name); if (idx >= 0) return anv_resolve_device_entrypoint(devinfo, idx); @@ -495,6 +637,10 @@ self.core_version = None self.extensions = [] + def prefixed_name(self, prefix): + assert self.name.startswith('vk') + return prefix + '_' + self.name[2:] + class Entrypoint(EntrypointBase): def __init__(self, name, return_type, params, guard=None): super(Entrypoint, self).__init__(name) @@ -502,13 +648,12 @@ self.params = params self.guard = guard + def is_physical_device_entrypoint(self): + return self.params[0].type in ('VkPhysicalDevice', ) + def is_device_entrypoint(self): return self.params[0].type in ('VkDevice', 'VkCommandBuffer', 'VkQueue') - def prefixed_name(self, prefix): - assert self.name.startswith('vk') - return prefix + '_' + self.name[2:] - def decl_params(self): return ', '.join(p.decl for p in self.params) @@ -520,11 +665,30 @@ super(EntrypointAlias, self).__init__(name) self.alias = entrypoint + def is_physical_device_entrypoint(self): + return self.alias.is_physical_device_entrypoint() + def is_device_entrypoint(self): return self.alias.is_device_entrypoint() def prefixed_name(self, prefix): - return self.alias.prefixed_name(prefix) + if self.alias.enabled: + return self.alias.prefixed_name(prefix) + return super(EntrypointAlias, self).prefixed_name(prefix) + + @property + def params(self): + return self.alias.params + + @property + def return_type(self): + return self.alias.return_type + + def decl_params(self): + return self.alias.decl_params() + + def call_params(self): + return self.alias.call_params() def get_entrypoints(doc, entrypoints_to_defines): """Extract the entry points from the registry.""" @@ -629,10 +793,13 @@ ])) device_entrypoints = [] + physical_device_entrypoints = [] instance_entrypoints = [] for e in entrypoints: if e.is_device_entrypoint(): device_entrypoints.append(e) + elif e.is_physical_device_entrypoint(): + physical_device_entrypoints.append(e) else: instance_entrypoints.append(e) @@ -642,6 +809,12 @@ e.num = num device_strmap.bake() + physical_device_strmap = StringIntMap() + for num, e in enumerate(physical_device_entrypoints): + physical_device_strmap.add_string(e.name, num) + e.num = num + physical_device_strmap.bake() + instance_strmap = StringIntMap() for num, e in enumerate(instance_entrypoints): instance_strmap.add_string(e.name, num) @@ -653,14 +826,17 @@ try: with open(os.path.join(args.outdir, 'anv_entrypoints.h'), 'wb') as f: f.write(TEMPLATE_H.render(instance_entrypoints=instance_entrypoints, + physical_device_entrypoints=physical_device_entrypoints, device_entrypoints=device_entrypoints, LAYERS=LAYERS, filename=os.path.basename(__file__))) with open(os.path.join(args.outdir, 'anv_entrypoints.c'), 'wb') as f: f.write(TEMPLATE_C.render(instance_entrypoints=instance_entrypoints, + physical_device_entrypoints=physical_device_entrypoints, device_entrypoints=device_entrypoints, LAYERS=LAYERS, instance_strmap=instance_strmap, + physical_device_strmap=physical_device_strmap, device_strmap=device_strmap, filename=os.path.basename(__file__))) except Exception: diff -Nru mesa-19.2.8/src/intel/vulkan/anv_extensions_gen.py mesa-20.0.8/src/intel/vulkan/anv_extensions_gen.py --- mesa-19.2.8/src/intel/vulkan/anv_extensions_gen.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_extensions_gen.py 2020-06-12 01:21:17.000000000 +0000 @@ -62,6 +62,8 @@ #include "stdbool.h" +#include "perf/gen_perf.h" + #define ANV_INSTANCE_EXTENSION_COUNT ${len(instance_extensions)} extern const VkExtensionProperties anv_instance_extensions[]; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_extensions.py mesa-20.0.8/src/intel/vulkan/anv_extensions.py --- mesa-19.2.8/src/intel/vulkan/anv_extensions.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_extensions.py 2020-06-12 01:21:17.000000000 +0000 @@ -45,7 +45,7 @@ self.version = version self.enable = _bool_to_c_expr(enable) -API_PATCH_VERSION = 102 +API_PATCH_VERSION = 131 # Supported API versions. Each one is the maximum patch version for the given # version. Version come in increasing order and each version is available if @@ -53,10 +53,8 @@ # available. API_VERSIONS = [ ApiVersion('1.0', True), - - # DRM_IOCTL_SYNCOBJ_WAIT is required for VK_KHR_external_fence which is a - # required core feature in Vulkan 1.1 - ApiVersion('1.1', 'device->has_syncobj_wait'), + ApiVersion('1.1', True), + ApiVersion('1.2', True), ] MAX_API_VERSION = None # Computed later @@ -70,6 +68,8 @@ Extension('VK_KHR_8bit_storage', 1, 'device->info.gen >= 8'), Extension('VK_KHR_16bit_storage', 1, 'device->info.gen >= 8'), Extension('VK_KHR_bind_memory2', 1, True), + Extension('VK_KHR_buffer_device_address', 1, + 'device->has_a64_buffer_access && device->info.gen < 12'), Extension('VK_KHR_create_renderpass2', 1, True), Extension('VK_KHR_dedicated_allocation', 1, True), Extension('VK_KHR_depth_stencil_resolve', 1, True), @@ -106,21 +106,30 @@ Extension('VK_KHR_relaxed_block_layout', 1, True), Extension('VK_KHR_sampler_mirror_clamp_to_edge', 1, True), Extension('VK_KHR_sampler_ycbcr_conversion', 1, True), + Extension('VK_KHR_separate_depth_stencil_layouts', 1, True), Extension('VK_KHR_shader_atomic_int64', 1, 'device->info.gen >= 9 && device->use_softpin'), + Extension('VK_KHR_shader_clock', 1, True), Extension('VK_KHR_shader_draw_parameters', 1, True), Extension('VK_KHR_shader_float16_int8', 1, 'device->info.gen >= 8'), + Extension('VK_KHR_shader_float_controls', 1, 'device->info.gen >= 8'), + Extension('VK_KHR_shader_subgroup_extended_types', 1, 'device->info.gen >= 8'), + Extension('VK_KHR_spirv_1_4', 1, True), Extension('VK_KHR_storage_buffer_storage_class', 1, True), Extension('VK_KHR_surface', 25, 'ANV_HAS_SURFACE'), Extension('VK_KHR_surface_protected_capabilities', 1, 'ANV_HAS_SURFACE'), Extension('VK_KHR_swapchain', 70, 'ANV_HAS_SURFACE'), + Extension('VK_KHR_swapchain_mutable_format', 1, 'ANV_HAS_SURFACE'), + Extension('VK_KHR_timeline_semaphore', 1, True), Extension('VK_KHR_uniform_buffer_standard_layout', 1, True), Extension('VK_KHR_variable_pointers', 1, True), + Extension('VK_KHR_vulkan_memory_model', 3, True), Extension('VK_KHR_wayland_surface', 6, 'VK_USE_PLATFORM_WAYLAND_KHR'), Extension('VK_KHR_xcb_surface', 6, 'VK_USE_PLATFORM_XCB_KHR'), Extension('VK_KHR_xlib_surface', 6, 'VK_USE_PLATFORM_XLIB_KHR'), Extension('VK_EXT_acquire_xlib_display', 1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'), - Extension('VK_EXT_buffer_device_address', 1, 'device->has_a64_buffer_access'), + Extension('VK_EXT_buffer_device_address', 1, + 'device->has_a64_buffer_access && device->info.gen < 12'), Extension('VK_EXT_calibrated_timestamps', 1, True), Extension('VK_EXT_conditional_rendering', 1, 'device->info.gen >= 8 || device->info.is_haswell'), Extension('VK_EXT_debug_report', 8, True), @@ -136,6 +145,7 @@ Extension('VK_EXT_global_priority', 1, 'device->has_context_priority'), Extension('VK_EXT_host_query_reset', 1, True), + Extension('VK_EXT_image_drm_format_modifier', 1, False), Extension('VK_EXT_index_type_uint8', 1, True), Extension('VK_EXT_inline_uniform_block', 1, True), Extension('VK_EXT_line_rasterization', 1, True), @@ -149,6 +159,8 @@ Extension('VK_EXT_separate_stencil_usage', 1, True), Extension('VK_EXT_shader_demote_to_helper_invocation', 1, True), Extension('VK_EXT_shader_stencil_export', 1, 'device->info.gen >= 9'), + Extension('VK_EXT_shader_subgroup_ballot', 1, True), + Extension('VK_EXT_shader_subgroup_vote', 1, True), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_subgroup_size_control', 2, True), Extension('VK_EXT_texel_buffer_alignment', 1, True), @@ -159,6 +171,9 @@ Extension('VK_ANDROID_native_buffer', 7, 'ANDROID'), Extension('VK_GOOGLE_decorate_string', 1, True), Extension('VK_GOOGLE_hlsl_functionality1', 1, True), + Extension('VK_GOOGLE_user_type', 1, True), + Extension('VK_INTEL_performance_query', 1, 'device->perf && device->perf->i915_perf_version >= 3'), + Extension('VK_INTEL_shader_integer_functions2', 1, 'device->info.gen >= 8'), Extension('VK_NV_compute_shader_derivatives', 1, True), ] diff -Nru mesa-19.2.8/src/intel/vulkan/anv_formats.c mesa-20.0.8/src/intel/vulkan/anv_formats.c --- mesa-19.2.8/src/intel/vulkan/anv_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_formats.c 2020-06-12 01:21:17.000000000 +0000 @@ -475,6 +475,18 @@ (isl_layout->bpb == 24 || isl_layout->bpb == 48)) return unsupported; + if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* No non-power-of-two fourcc formats exist */ + if (!util_is_power_of_two_or_zero(isl_layout->bpb)) + return unsupported; + + if (vk_format_is_depth_or_stencil(vk_format)) + return unsupported; + + if (isl_format_is_compressed(plane_format.isl_format)) + return unsupported; + } + if (tiling == VK_IMAGE_TILING_OPTIMAL && !util_is_power_of_two_or_zero(isl_layout->bpb)) { /* Tiled formats *must* be power-of-two because we need up upload @@ -544,7 +556,7 @@ return 0; struct anv_format_plane base_plane_format = plane_format; - if (vk_tiling == VK_IMAGE_TILING_OPTIMAL) { + if (vk_tiling != VK_IMAGE_TILING_LINEAR) { base_plane_format = anv_get_format_plane(devinfo, vk_format, VK_IMAGE_ASPECT_COLOR_BIT, VK_IMAGE_TILING_LINEAR); @@ -705,11 +717,12 @@ static void get_wsi_format_modifier_properties_list(const struct anv_physical_device *physical_device, VkFormat vk_format, - struct wsi_format_modifier_properties_list *list) + VkDrmFormatModifierPropertiesListEXT *list) { const struct anv_format *anv_format = anv_get_format(vk_format); - VK_OUTARRAY_MAKE(out, list->modifier_properties, &list->modifier_count); + VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); /* This is a simplified list where all the modifiers are available */ assert(vk_format == VK_FORMAT_B8G8R8_SRGB || @@ -733,12 +746,17 @@ anv_format->planes[0].isl_format)) continue; + /* Gen12's CCS layout changes compared to Gen9-11. */ + if (mod_info->modifier == I915_FORMAT_MOD_Y_TILED_CCS && + physical_device->info.gen >= 12) + continue; + vk_outarray_append(&out, mod_props) { - mod_props->modifier = modifiers[i]; + mod_props->drmFormatModifier = modifiers[i]; if (isl_drm_modifier_has_aux(modifiers[i])) - mod_props->modifier_plane_count = 2; + mod_props->drmFormatModifierPlaneCount = 2; else - mod_props->modifier_plane_count = anv_format->n_planes; + mod_props->drmFormatModifierPlaneCount = anv_format->n_planes; } } } @@ -776,7 +794,7 @@ vk_foreach_struct(ext, pFormatProperties->pNext) { /* Use unsigned since some cases are not in the VkStructureType enum. */ switch ((unsigned)ext->sType) { - case VK_STRUCTURE_TYPE_WSI_FORMAT_MODIFIER_PROPERTIES_LIST_MESA: + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: get_wsi_format_modifier_properties_list(physical_device, format, (void *)ext); break; @@ -903,6 +921,40 @@ */ } + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *modifier_info = + vk_find_struct_const(info->pNext, + PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT); + + /* Modifiers are only supported on simple 2D images */ + if (info->type != VK_IMAGE_TYPE_2D) + goto unsupported; + maxArraySize = 1; + maxMipLevels = 1; + assert(sampleCounts == VK_SAMPLE_COUNT_1_BIT); + + /* Modifiers are not yet supported for YCbCr */ + const struct anv_format *format = anv_get_format(info->format); + if (format->n_planes > 1) + goto unsupported; + + const struct isl_drm_modifier_info *isl_mod_info = + isl_drm_modifier_get_info(modifier_info->drmFormatModifier); + if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E) { + /* If we have a CCS modifier, ensure that the format supports CCS + * and, if VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT is set, all of the + * formats in the format list are CCS compatible. + */ + const VkImageFormatListCreateInfoKHR *fmt_list = + vk_find_struct_const(info->pNext, + IMAGE_FORMAT_LIST_CREATE_INFO_KHR); + if (!anv_formats_ccs_e_compatible(devinfo, info->flags, + info->format, info->tiling, + fmt_list)) + goto unsupported; + } + } + /* From the bspec section entitled "Surface Layout and Tiling", * pre-gen9 has a 2 GB limitation of the size in bytes, * gen9 and gen10 have a 256 GB limitation and gen11+ @@ -1021,7 +1073,7 @@ const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; VkExternalImageFormatProperties *external_props = NULL; VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; - struct VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; + VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; VkResult result; /* Extract input structs */ @@ -1030,6 +1082,9 @@ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: external_info = (const void *) s; break; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT: + /* anv_get_image_format_properties will handle this */ + break; case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT: /* Ignore but don't warn */ break; @@ -1105,10 +1160,10 @@ * vkGetPhysicalDeviceImageFormatProperties2 returns * VK_ERROR_FORMAT_NOT_SUPPORTED. */ - result = vk_errorf(physical_device->instance, physical_device, - VK_ERROR_FORMAT_NOT_SUPPORTED, - "unsupported VkExternalMemoryTypeFlagBits 0x%x", - external_info->handleType); + result = vk_errorfi(physical_device->instance, physical_device, + VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkExternalMemoryTypeFlagBits 0x%x", + external_info->handleType); goto fail; } } @@ -1216,7 +1271,7 @@ /* Search for VkExternalFormatANDROID and resolve the format. */ struct anv_format *ext_format = NULL; - const struct VkExternalFormatANDROID *ext_info = + const VkExternalFormatANDROID *ext_info = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID); uint64_t format = ext_info ? ext_info->externalFormat : 0; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_genX.h mesa-20.0.8/src/intel/vulkan/anv_genX.h --- mesa-19.2.8/src/intel/vulkan/anv_genX.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_genX.h 2020-06-12 01:21:17.000000000 +0000 @@ -44,6 +44,14 @@ void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size); +void genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used); + void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, unsigned width, unsigned height, unsigned scale); @@ -76,7 +84,8 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct gen_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4]); + const unsigned entry_size[4], + enum gen_urb_deref_block_size *deref_block_size); void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, struct anv_address dst, struct anv_address src, diff -Nru mesa-19.2.8/src/intel/vulkan/anv_image.c mesa-20.0.8/src/intel/vulkan/anv_image.c --- mesa-19.2.8/src/intel/vulkan/anv_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_image.c 2020-06-12 01:21:17.000000000 +0000 @@ -109,6 +109,9 @@ case VK_IMAGE_TILING_LINEAR: flags = ISL_TILING_LINEAR_BIT; break; + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + assert(isl_mod_info); + flags = 1 << isl_mod_info->tiling; } if (anv_info->isl_tiling_flags) @@ -117,9 +120,6 @@ if (legacy_scanout) flags &= ISL_TILING_LINEAR_BIT | ISL_TILING_X_BIT; - if (isl_mod_info) - flags &= 1 << isl_mod_info->tiling; - assert(flags); return flags; @@ -157,19 +157,21 @@ } -static bool -all_formats_ccs_e_compatible(const struct gen_device_info *devinfo, - const VkImageFormatListCreateInfoKHR *fmt_list, - struct anv_image *image) +bool +anv_formats_ccs_e_compatible(const struct gen_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, + VkImageTiling vk_tiling, + const VkImageFormatListCreateInfoKHR *fmt_list) { enum isl_format format = - anv_get_isl_format(devinfo, image->vk_format, - VK_IMAGE_ASPECT_COLOR_BIT, image->tiling); + anv_get_isl_format(devinfo, vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling); if (!isl_format_supports_ccs_e(devinfo, format)) return false; - if (!(image->create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) return true; if (!fmt_list || fmt_list->viewFormatCount == 0) @@ -178,7 +180,7 @@ for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { enum isl_format view_format = anv_get_isl_format(devinfo, fmt_list->pViewFormats[i], - VK_IMAGE_ASPECT_COLOR_BIT, image->tiling); + VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling); if (!isl_formats_are_ccs_e_compatible(devinfo, format, view_format)) return false; @@ -247,7 +249,7 @@ const struct anv_device *device) { assert(image && device); - assert(image->planes[plane].aux_surface.isl.size_B > 0 && + assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); /* Compressed images must be tiled and therefore everything should be 4K @@ -283,6 +285,15 @@ } } + /* Add some padding to make sure the fast clear color state buffer starts at + * a 4K alignment. We believe that 256B might be enough, but due to lack of + * testing we will leave this as 4K for now. + */ + image->planes[plane].size = align_u64(image->planes[plane].size, 4096); + image->size = align_u64(image->size, 4096); + + assert(image->planes[plane].offset % 4096 == 0); + image->planes[plane].fast_clear_state_offset = image->planes[plane].offset + image->planes[plane].size; @@ -295,7 +306,7 @@ * image's memory requirements (that is, the image's size and alignment). */ static VkResult -make_surface(const struct anv_device *dev, +make_surface(struct anv_device *dev, struct anv_image *image, uint32_t stride, isl_tiling_flags_t tiling_flags, @@ -411,22 +422,22 @@ if (!(image->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) { /* It will never be used as an attachment, HiZ is pointless. */ } else if (dev->info.gen == 7) { - anv_perf_warn(dev->instance, image, "Implement gen7 HiZ"); + anv_perf_warn(dev, image, "Implement gen7 HiZ"); } else if (image->levels > 1) { - anv_perf_warn(dev->instance, image, "Enable multi-LOD HiZ"); + anv_perf_warn(dev, image, "Enable multi-LOD HiZ"); } else if (image->array_size > 1) { - anv_perf_warn(dev->instance, image, + anv_perf_warn(dev, image, "Implement multi-arrayLayer HiZ clears and resolves"); } else if (dev->info.gen == 8 && image->samples > 1) { - anv_perf_warn(dev->instance, image, "Enable gen8 multisampled HiZ"); + anv_perf_warn(dev, image, "Enable gen8 multisampled HiZ"); } else if (!unlikely(INTEL_DEBUG & DEBUG_NO_HIZ)) { assert(image->planes[plane].aux_surface.isl.size_B == 0); ok = isl_surf_get_hiz_surf(&dev->isl_dev, &image->planes[plane].surface.isl, &image->planes[plane].aux_surface.isl); assert(ok); - add_surface(image, &image->planes[plane].aux_surface, plane); image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ; + add_surface(image, &image->planes[plane].aux_surface, plane); } } else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->samples == 1) { /* TODO: Disallow compression with : @@ -449,7 +460,8 @@ assert(image->planes[plane].aux_surface.isl.size_B == 0); ok = isl_surf_get_ccs_surf(&dev->isl_dev, &image->planes[plane].surface.isl, - &image->planes[plane].aux_surface.isl, 0); + &image->planes[plane].aux_surface.isl, + NULL, 0); if (ok) { /* Disable CCS when it is not useful (i.e., when you can't render @@ -461,7 +473,7 @@ * image, we currently don't have things hooked up to get it * working. */ - anv_perf_warn(dev->instance, image, + anv_perf_warn(dev, image, "This image format doesn't support rendering. " "Not allocating an CCS buffer."); image->planes[plane].aux_surface.isl.size_B = 0; @@ -479,9 +491,19 @@ if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT) && image->ccs_e_compatible) { image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E; + } else if (dev->info.gen >= 12) { + anv_perf_warn(dev, image, + "The CCS_D aux mode is not yet handled on " + "Gen12+. Not allocating a CCS buffer."); + image->planes[plane].aux_surface.isl.size_B = 0; + return VK_SUCCESS; + } else { + image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D; } - add_surface(image, &image->planes[plane].aux_surface, plane); + if (!dev->physical->has_implicit_ccs) + add_surface(image, &image->planes[plane].aux_surface, plane); + add_aux_state_tracking_buffer(image, plane, dev); } } @@ -492,9 +514,9 @@ &image->planes[plane].surface.isl, &image->planes[plane].aux_surface.isl); if (ok) { + image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS; add_surface(image, &image->planes[plane].aux_surface, plane); add_aux_state_tracking_buffer(image, plane, dev); - image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS; } } @@ -510,7 +532,7 @@ image->planes[plane].surface.isl.size_B)) <= (image->planes[plane].offset + image->planes[plane].size)); - if (image->planes[plane].aux_surface.isl.size_B) { + if (image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE) { /* assert(image->planes[plane].fast_clear_state_offset == */ /* (image->planes[plane].aux_surface.offset + image->planes[plane].aux_surface.isl.size_B)); */ assert(image->planes[plane].fast_clear_state_offset < @@ -569,10 +591,14 @@ const struct wsi_image_create_info *wsi_info = vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA); - if (wsi_info && wsi_info->modifier_count > 0) { - isl_mod_info = choose_drm_format_mod(&device->instance->physicalDevice, - wsi_info->modifier_count, - wsi_info->modifiers); + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + isl_mod_info = choose_drm_format_mod(device->physical, + mod_info->drmFormatModifierCount, + mod_info->pDrmFormatModifiers); assert(isl_mod_info); } @@ -636,7 +662,8 @@ IMAGE_FORMAT_LIST_CREATE_INFO_KHR); image->ccs_e_compatible = - all_formats_ccs_e_compatible(&device->info, fmt_list, image); + anv_formats_ccs_e_compatible(&device->info, image->create_flags, + image->vk_format, image->tiling, fmt_list); uint32_t b; for_each_bit(b, image->aspects) { @@ -704,13 +731,13 @@ local_create_info.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; /* If the image has a particular modifier, specify that modifier. */ - struct wsi_image_create_info local_wsi_info = { - .sType = VK_STRUCTURE_TYPE_WSI_IMAGE_CREATE_INFO_MESA, - .modifier_count = 1, - .modifiers = &swapchain_image->drm_format_mod, + VkImageDrmFormatModifierListCreateInfoEXT local_modifier_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, + .drmFormatModifierCount = 1, + .pDrmFormatModifiers = &swapchain_image->drm_format_mod, }; if (swapchain_image->drm_format_mod != DRM_FORMAT_MOD_INVALID) - __vk_append_struct(&local_create_info, &local_wsi_info); + __vk_append_struct(&local_create_info, &local_modifier_info); return anv_image_create(device, &(struct anv_image_create_info) { @@ -727,7 +754,7 @@ const VkAllocationCallbacks *pAllocator, VkImage *pImage) { - const struct VkExternalMemoryImageCreateInfo *create_info = + const VkExternalMemoryImageCreateInfo *create_info = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); if (create_info && (create_info->handleTypes & @@ -736,7 +763,7 @@ pAllocator, pImage); bool use_external_format = false; - const struct VkExternalFormatANDROID *ext_format = + const VkExternalFormatANDROID *ext_format = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID); /* "If externalFormat is zero, the effect is as if the @@ -780,8 +807,7 @@ for (uint32_t p = 0; p < image->n_planes; ++p) { if (image->planes[p].bo_is_owned) { assert(image->planes[p].address.bo != NULL); - anv_bo_cache_release(device, &device->bo_cache, - image->planes[p].address.bo); + anv_device_release_bo(device, image->planes[p].address.bo); } } @@ -805,6 +831,12 @@ .bo = memory->bo, .offset = memory_offset, }; + + /* If we're on a platform that uses implicit CCS and our buffer does not + * have any implicit CCS data, disable compression on that image. + */ + if (device->physical->has_implicit_ccs && !memory->bo->has_implicit_ccs) + image->planes[plane].aux_usage = ISL_AUX_USAGE_NONE; } /* We are binding AHardwareBuffer. Get a description, resolve the @@ -1020,11 +1052,25 @@ } } +VkResult anv_GetImageDrmFormatModifierPropertiesEXT( + VkDevice device, + VkImage _image, + VkImageDrmFormatModifierPropertiesEXT* pProperties) +{ + ANV_FROM_HANDLE(anv_image, image, _image); + + assert(pProperties->sType == + VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT); + + pProperties->drmFormatModifier = image->drm_format_mod; + + return VK_SUCCESS; +} + /** - * This function determines the optimal buffer to use for a given - * VkImageLayout and other pieces of information needed to make that - * determination. This does not determine the optimal buffer to use - * during a resolve operation. + * This function returns the assumed isl_aux_state for a given VkImageLayout. + * Because Vulkan image layouts don't map directly to isl_aux_state enums, the + * returned enum is the assumed worst case. * * @param devinfo The device information of the Intel GPU. * @param image The image that may contain a collection of buffers. @@ -1033,8 +1079,8 @@ * * @return The primary buffer that should be used for the given layout. */ -enum isl_aux_usage -anv_layout_to_aux_usage(const struct gen_device_info * const devinfo, +enum isl_aux_state +anv_layout_to_aux_state(const struct gen_device_info * const devinfo, const struct anv_image * const image, const VkImageAspectFlagBits aspect, const VkImageLayout layout) @@ -1054,21 +1100,17 @@ uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - /* If there is no auxiliary surface allocated, we must use the one and only - * main buffer. - */ - if (image->planes[plane].aux_surface.isl.size_B == 0) - return ISL_AUX_USAGE_NONE; + /* If we don't have an aux buffer then aux state makes no sense */ + assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE); /* All images that use an auxiliary surface are required to be tiled. */ - assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); + assert(image->planes[plane].surface.isl.tiling != ISL_TILING_LINEAR); /* Stencil has no aux */ assert(aspect != VK_IMAGE_ASPECT_STENCIL_BIT); switch (layout) { - - /* Invalid Layouts */ + /* Invalid layouts */ case VK_IMAGE_LAYOUT_RANGE_SIZE: case VK_IMAGE_LAYOUT_MAX_ENUM: unreachable("Invalid image layout."); @@ -1081,42 +1123,45 @@ */ case VK_IMAGE_LAYOUT_UNDEFINED: case VK_IMAGE_LAYOUT_PREINITIALIZED: - return ISL_AUX_USAGE_NONE; - + return ISL_AUX_STATE_AUX_INVALID; - /* Transfer Layouts - */ + /* Transfer layouts */ case VK_IMAGE_LAYOUT_GENERAL: case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: - case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { /* This buffer could be a depth buffer used in a transfer operation. * BLORP currently doesn't use HiZ for transfer operations so we must * use the main buffer for this layout. TODO: Enable HiZ in BLORP. */ assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_HIZ); - return ISL_AUX_USAGE_NONE; + return ISL_AUX_STATE_AUX_INVALID; + } else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) { + return ISL_AUX_STATE_PASS_THROUGH; } else { - assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - return image->planes[plane].aux_usage; + return ISL_AUX_STATE_COMPRESSED_CLEAR; } - - /* Sampling Layouts */ + /* Sampling layouts */ + case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL_KHR: case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL: case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL: assert((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0); /* Fall-through */ + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { if (anv_can_sample_with_hiz(devinfo, image)) - return ISL_AUX_USAGE_HIZ; + return ISL_AUX_STATE_COMPRESSED_CLEAR; else - return ISL_AUX_USAGE_NONE; + return ISL_AUX_STATE_RESOLVED; + } else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) { + return ISL_AUX_STATE_PASS_THROUGH; } else { - return image->planes[plane].aux_usage; + return ISL_AUX_STATE_COMPRESSED_CLEAR; } + case VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL_KHR: + return ISL_AUX_STATE_RESOLVED; case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: { assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); @@ -1130,25 +1175,36 @@ */ const struct isl_drm_modifier_info *mod_info = isl_drm_modifier_get_info(image->drm_format_mod); - return mod_info ? mod_info->aux_usage : ISL_AUX_USAGE_NONE; + if (mod_info && mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + assert(mod_info->aux_usage == ISL_AUX_USAGE_CCS_E); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E); + /* We do not yet support any modifiers which support clear color so + * we just always return COMPRESSED_NO_CLEAR. One day, this will + * change. + */ + assert(!mod_info->supports_clear_color); + return ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + } else { + return ISL_AUX_STATE_PASS_THROUGH; + } } - - /* Rendering Layouts */ + /* Rendering layouts */ case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: assert(aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) { - assert(image->samples == 1); - return ISL_AUX_USAGE_CCS_D; + /* fall-through */ + case VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR: + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) { + return ISL_AUX_STATE_PARTIAL_CLEAR; } else { - assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_D); - return image->planes[plane].aux_usage; + return ISL_AUX_STATE_COMPRESSED_CLEAR; } + case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL_KHR: case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL: assert(aspect == VK_IMAGE_ASPECT_DEPTH_BIT); - return ISL_AUX_USAGE_HIZ; + return ISL_AUX_STATE_COMPRESSED_CLEAR; case VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR: unreachable("VK_KHR_shared_presentable_image is unsupported"); @@ -1160,12 +1216,129 @@ unreachable("VK_NV_shading_rate_image is unsupported"); } - /* If the layout isn't recognized in the exhaustive switch above, the - * VkImageLayout value is not defined in vulkan.h. - */ unreachable("layout is not a VkImageLayout enumeration member."); } +ASSERTED static bool +vk_image_layout_is_read_only(VkImageLayout layout, + VkImageAspectFlagBits aspect) +{ + assert(util_bitcount(aspect) == 1); + + switch (layout) { + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_PREINITIALIZED: + return true; /* These are only used for layout transitions */ + + case VK_IMAGE_LAYOUT_GENERAL: + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + case VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR: + case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL_KHR: + case VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR: + return false; + + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL: + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: + case VK_IMAGE_LAYOUT_SHADING_RATE_OPTIMAL_NV: + case VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT: + case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL_KHR: + case VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL_KHR: + return true; + + case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL: + return aspect == VK_IMAGE_ASPECT_DEPTH_BIT; + + case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL: + return aspect == VK_IMAGE_ASPECT_STENCIL_BIT; + + case VK_IMAGE_LAYOUT_RANGE_SIZE: + case VK_IMAGE_LAYOUT_MAX_ENUM: + unreachable("Invalid image layout."); + } + + unreachable("Invalid image layout."); +} + +/** + * This function determines the optimal buffer to use for a given + * VkImageLayout and other pieces of information needed to make that + * determination. This does not determine the optimal buffer to use + * during a resolve operation. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. + * @param layout The current layout of the image aspect(s). + * + * @return The primary buffer that should be used for the given layout. + */ +enum isl_aux_usage +anv_layout_to_aux_usage(const struct gen_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageUsageFlagBits usage, + const VkImageLayout layout) +{ + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + + /* If there is no auxiliary surface allocated, we must use the one and only + * main buffer. + */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return ISL_AUX_USAGE_NONE; + + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); + + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); + + case ISL_AUX_STATE_PARTIAL_CLEAR: + assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D); + assert(image->samples == 1); + return ISL_AUX_USAGE_CCS_D; + + case ISL_AUX_STATE_COMPRESSED_CLEAR: + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + return ISL_AUX_USAGE_HIZ; + } else { + assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE); + return image->planes[plane].aux_usage; + } + + case ISL_AUX_STATE_RESOLVED: + /* We can only use RESOLVED in read-only layouts because any write will + * either land us in AUX_INVALID or COMPRESSED_NO_CLEAR. We can do + * writes in PASS_THROUGH without destroying it so that is allowed. + */ + assert(vk_image_layout_is_read_only(layout, aspect)); + assert(util_is_power_of_two_or_zero(usage)); + if (usage == VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + /* If we have valid HiZ data and are using the image as a read-only + * depth/stencil attachment, we should enable HiZ so that we can get + * faster depth testing. + */ + return ISL_AUX_USAGE_HIZ; + } else { + return ISL_AUX_USAGE_NONE; + } + + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ISL_AUX_USAGE_NONE; + } + + unreachable("Invalid isl_aux_state"); +} + /** * This function returns the level of unresolved fast-clear support of the * given image in the given VkImageLayout. @@ -1173,6 +1346,7 @@ * @param devinfo The device information of the Intel GPU. * @param image The image that may contain a collection of buffers. * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. * @param layout The current layout of the image aspect(s). */ enum anv_fast_clear_type @@ -1181,69 +1355,58 @@ const VkImageAspectFlagBits aspect, const VkImageLayout layout) { - /* The aspect must be exactly one of the image aspects. */ - assert(util_bitcount(aspect) == 1 && (aspect & image->aspects)); + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return ANV_FAST_CLEAR_NONE; uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); /* If there is no auxiliary surface allocated, there are no fast-clears */ - if (image->planes[plane].aux_surface.isl.size_B == 0) + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) return ANV_FAST_CLEAR_NONE; - /* All images that use an auxiliary surface are required to be tiled. */ - assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); - - /* Stencil has no aux */ - assert(aspect != VK_IMAGE_ASPECT_STENCIL_BIT); - - if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { - /* For depth images (with HiZ), the layout supports fast-clears if and - * only if it supports HiZ. However, we only support fast-clears to the - * default depth value. - */ - enum isl_aux_usage aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, layout); - return aux_usage == ISL_AUX_USAGE_HIZ ? - ANV_FAST_CLEAR_DEFAULT_VALUE : ANV_FAST_CLEAR_NONE; - } - - assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); - /* We don't support MSAA fast-clears on Ivybridge or Bay Trail because they * lack the MI ALU which we need to determine the predicates. */ if (devinfo->gen == 7 && !devinfo->is_haswell && image->samples > 1) return ANV_FAST_CLEAR_NONE; - switch (layout) { - case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: - return ANV_FAST_CLEAR_ANY; + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); - case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: { - assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); -#ifndef NDEBUG - /* We do not yet support any modifiers which support clear color so we - * just always return NONE. One day, this will change. - */ - const struct isl_drm_modifier_info *mod_info = - isl_drm_modifier_get_info(image->drm_format_mod); - assert(!mod_info || !mod_info->supports_clear_color); -#endif - return ANV_FAST_CLEAR_NONE; - } + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); - default: - /* If the image has MCS or CCS_E enabled all the time then we can use - * fast-clear as long as the clear color is the default value of zero - * since this is the default value we program into every surface state - * used for texturing. - */ - if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS || - image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) + case ISL_AUX_STATE_PARTIAL_CLEAR: + case ISL_AUX_STATE_COMPRESSED_CLEAR: + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + return ANV_FAST_CLEAR_DEFAULT_VALUE; + } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + /* When we're in a render pass we have the clear color data from the + * VkRenderPassBeginInfo and we can use arbitrary clear colors. They + * must get partially resolved before we leave the render pass. + */ + return ANV_FAST_CLEAR_ANY; + } else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS || + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + /* If the image has MCS or CCS_E enabled all the time then we can use + * fast-clear as long as the clear color is the default value of zero + * since this is the default value we program into every surface + * state used for texturing. + */ return ANV_FAST_CLEAR_DEFAULT_VALUE; - else + } else { return ANV_FAST_CLEAR_NONE; + } + + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + case ISL_AUX_STATE_RESOLVED: + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ANV_FAST_CLEAR_NONE; } + + unreachable("Invalid isl_aux_state"); } @@ -1429,7 +1592,7 @@ if (device->info.gen >= 10 && aux_usage != ISL_AUX_USAGE_NONE) { if (aux_usage == ISL_AUX_USAGE_HIZ) { clear_address = (struct anv_address) { - .bo = &device->hiz_clear_bo, + .bo = device->hiz_clear_bo, .offset = 0, }; } else { @@ -1537,7 +1700,7 @@ /* Check if a conversion info was passed. */ const struct anv_format *conv_format = NULL; - const struct VkSamplerYcbcrConversionInfo *conv_info = + const VkSamplerYcbcrConversionInfo *conv_info = vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); /* If image has an external format, the pNext chain must contain an instance of @@ -1670,9 +1833,11 @@ enum isl_aux_usage general_aux_usage = anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, VK_IMAGE_LAYOUT_GENERAL); enum isl_aux_usage optimal_aux_usage = anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, diff -Nru mesa-19.2.8/src/intel/vulkan/anv_intel.c mesa-20.0.8/src/intel/vulkan/anv_intel.c --- mesa-19.2.8/src/intel/vulkan/anv_intel.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_intel.c 2020-06-12 01:21:17.000000000 +0000 @@ -72,26 +72,21 @@ image = anv_image_from_handle(image_h); - uint64_t bo_flags = ANV_BO_EXTERNAL; - if (device->instance->physicalDevice.supports_48bit_addresses) - bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; - if (device->instance->physicalDevice.use_softpin) - bo_flags |= EXEC_OBJECT_PINNED; - - result = anv_bo_cache_import(device, &device->bo_cache, - pCreateInfo->fd, bo_flags, &mem->bo); + result = anv_device_import_bo(device, pCreateInfo->fd, + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* address */, + &mem->bo); if (result != VK_SUCCESS) goto fail_import; VkDeviceSize aligned_image_size = align_u64(image->size, 4096); if (mem->bo->size < aligned_image_size) { - result = vk_errorf(device->instance, device, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + result = vk_errorf(device, device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "dma-buf too small for image in " "vkCreateDmaBufImageINTEL: %"PRIu64"B < %"PRIu64"B", mem->bo->size, aligned_image_size); - anv_bo_cache_release(device, &device->bo_cache, mem->bo); + anv_device_release_bo(device, mem->bo); goto fail_import; } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir_add_base_work_group_id.c mesa-20.0.8/src/intel/vulkan/anv_nir_add_base_work_group_id.c --- mesa-19.2.8/src/intel/vulkan/anv_nir_add_base_work_group_id.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir_add_base_work_group_id.c 2020-06-12 01:21:17.000000000 +0000 @@ -26,13 +26,11 @@ #include "compiler/brw_compiler.h" bool -anv_nir_add_base_work_group_id(nir_shader *shader, - struct brw_cs_prog_data *prog_data) +anv_nir_add_base_work_group_id(nir_shader *shader) { assert(shader->info.stage == MESA_SHADER_COMPUTE); nir_builder b; - int base_id_offset = -1; bool progress = false; nir_foreach_function(function, shader) { if (!function->impl) @@ -51,27 +49,14 @@ b.cursor = nir_after_instr(&load_id->instr); - if (base_id_offset < 0) { - /* If we don't have a set of BASE_WORK_GROUP_ID params, - * add them. - */ - assert(shader->num_uniforms == prog_data->base.nr_params * 4); - uint32_t *param = - brw_stage_prog_data_add_params(&prog_data->base, 3); - param[0] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X; - param[1] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y; - param[2] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z; - - base_id_offset = shader->num_uniforms; - shader->num_uniforms += 12; - } - nir_intrinsic_instr *load_base = - nir_intrinsic_instr_create(shader, nir_intrinsic_load_uniform); + nir_intrinsic_instr_create(shader, nir_intrinsic_load_push_constant); load_base->num_components = 3; load_base->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); nir_ssa_dest_init(&load_base->instr, &load_base->dest, 3, 32, NULL); - nir_intrinsic_set_base(load_base, base_id_offset); + nir_intrinsic_set_base(load_base, + offsetof(struct anv_push_constants, + cs.base_work_group_id)); nir_intrinsic_set_range(load_base, 3 * sizeof(uint32_t)); nir_builder_instr_insert(&b, &load_base->instr); diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir_apply_pipeline_layout.c mesa-20.0.8/src/intel/vulkan/anv_nir_apply_pipeline_layout.c --- mesa-19.2.8/src/intel/vulkan/anv_nir_apply_pipeline_layout.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir_apply_pipeline_layout.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,6 +25,7 @@ #include "program/prog_parameter.h" #include "nir/nir_builder.h" #include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" #include "util/set.h" /* Sampler tables don't actually have a maximum size but we pick one just so @@ -39,16 +40,15 @@ nir_shader *shader; nir_builder builder; - struct anv_pipeline_layout *layout; + const struct anv_pipeline_layout *layout; bool add_bounds_checks; nir_address_format ssbo_addr_format; /* Place to flag lowered instructions so we don't lower them twice */ struct set *lowered_instrs; - int dynamic_offset_uniform_start; - bool uses_constants; + bool has_dynamic_buffers; uint8_t constants_offset; struct { bool desc_buffer_used; @@ -114,8 +114,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -562,7 +564,7 @@ if (!state->add_bounds_checks) desc = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); - if (state->dynamic_offset_uniform_start >= 0) { + if (state->has_dynamic_buffers) { /* This shader has dynamic offsets and we have no way of knowing * (save from the dynamic offset base index) if this buffer has a * dynamic offset. @@ -596,8 +598,10 @@ } nir_intrinsic_instr *dyn_load = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); - nir_intrinsic_set_base(dyn_load, state->dynamic_offset_uniform_start); + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_push_constant); + nir_intrinsic_set_base(dyn_load, offsetof(struct anv_push_constants, + dynamic_offsets)); nir_intrinsic_set_range(dyn_load, MAX_DYNAMIC_BUFFERS * 4); dyn_load->src[0] = nir_src_for_ssa(nir_imul_imm(b, dyn_offset_idx, 4)); dyn_load->num_components = 1; @@ -748,7 +752,7 @@ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc)); } else if (binding_offset > MAX_BINDING_TABLE_SIZE) { const bool write_only = - (var->data.image.access & ACCESS_NON_READABLE) != 0; + (var->data.access & ACCESS_NON_READABLE) != 0; nir_ssa_def *desc = build_descriptor_load(deref, 0, 2, 32, state); nir_ssa_def *handle = nir_channel(b, desc, write_only ? 1 : 0); @@ -780,6 +784,11 @@ b->cursor = nir_before_instr(&intrin->instr); + /* Any constant-offset load_constant instructions should have been removed + * by constant folding. + */ + assert(!nir_src_is_const(intrin->src[0])); + nir_ssa_def *index = nir_imm_int(b, state->constants_offset); nir_ssa_def *offset = nir_iadd(b, nir_ssa_for_src(b, intrin->src[0], 1), nir_imm_int(b, nir_intrinsic_base(intrin))); @@ -1043,8 +1052,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -1096,9 +1107,8 @@ void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, bool robust_buffer_access, - struct anv_pipeline_layout *layout, + const struct anv_pipeline_layout *layout, nir_shader *shader, - struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map) { void *mem_ctx = ralloc_context(NULL); @@ -1110,7 +1120,6 @@ .add_bounds_checks = robust_buffer_access, .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), .lowered_instrs = _mesa_pointer_set_create(mem_ctx), - .dynamic_offset_uniform_start = -1, }; for (unsigned s = 0; s < layout->num_sets; s++) { @@ -1133,7 +1142,7 @@ map->surface_to_descriptor[map->surface_count] = (struct anv_pipeline_binding) { .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, - .binding = s, + .index = s, }; state.set[s].desc_offset = map->surface_count; map->surface_count++; @@ -1162,12 +1171,12 @@ rzalloc_array(mem_ctx, struct binding_info, used_binding_count); used_binding_count = 0; for (uint32_t set = 0; set < layout->num_sets; set++) { - struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + const struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; for (unsigned b = 0; b < set_layout->binding_count; b++) { if (state.set[set].use_count[b] == 0) continue; - struct anv_descriptor_set_binding_layout *binding = + const struct anv_descriptor_set_binding_layout *binding = &layout->set[set].layout->binding[b]; /* Do a fixed-point calculation to generate a score based on the @@ -1200,18 +1209,16 @@ qsort(infos, used_binding_count, sizeof(struct binding_info), compare_binding_infos); - bool have_dynamic_buffers = false; - for (unsigned i = 0; i < used_binding_count; i++) { unsigned set = infos[i].set, b = infos[i].binding; - struct anv_descriptor_set_binding_layout *binding = + const struct anv_descriptor_set_binding_layout *binding = &layout->set[set].layout->binding[b]; - if (binding->dynamic_offset_index >= 0) - have_dynamic_buffers = true; - const uint32_t array_size = binding->array_size; + if (binding->dynamic_offset_index >= 0) + state.has_dynamic_buffers = true; + if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || anv_descriptor_requires_bindless(pdevice, binding, false)) { @@ -1222,16 +1229,28 @@ state.set[set].surface_offsets[b] = BINDLESS_OFFSET; } else { state.set[set].surface_offsets[b] = map->surface_count; - struct anv_sampler **samplers = binding->immutable_samplers; - for (unsigned i = 0; i < binding->array_size; i++) { - uint8_t planes = samplers ? samplers[i]->n_planes : 1; - for (uint8_t p = 0; p < planes; p++) { + if (binding->dynamic_offset_index < 0) { + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } else { + for (unsigned i = 0; i < binding->array_size; i++) { map->surface_to_descriptor[map->surface_count++] = (struct anv_pipeline_binding) { .set = set, - .binding = b, - .index = i, - .plane = p, + .index = binding->descriptor_index + i, + .dynamic_offset_index = + layout->set[set].dynamic_offset_start + + binding->dynamic_offset_index + i, }; } } @@ -1260,8 +1279,7 @@ map->sampler_to_descriptor[map->sampler_count++] = (struct anv_pipeline_binding) { .set = set, - .binding = b, - .index = i, + .index = binding->descriptor_index + i, .plane = p, }; } @@ -1270,16 +1288,6 @@ } } - if (have_dynamic_buffers) { - state.dynamic_offset_uniform_start = shader->num_uniforms; - uint32_t *param = brw_stage_prog_data_add_params(prog_data, - MAX_DYNAMIC_BUFFERS); - for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) - param[i] = ANV_PARAM_DYN_OFFSET(i); - shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 4; - assert(shader->num_uniforms == prog_data->nr_params * 4); - } - nir_foreach_variable(var, &shader->uniforms) { const struct glsl_type *glsl_type = glsl_without_array(var->type); @@ -1290,8 +1298,9 @@ const uint32_t set = var->data.descriptor_set; const uint32_t binding = var->data.binding; - const uint32_t array_size = - layout->set[set].layout->binding[binding].array_size; + const struct anv_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + const uint32_t array_size = bind_layout->array_size; if (state.set[set].use_count[binding] == 0) continue; @@ -1303,15 +1312,15 @@ &map->surface_to_descriptor[state.set[set].surface_offsets[binding]]; for (unsigned i = 0; i < array_size; i++) { assert(pipe_binding[i].set == set); - assert(pipe_binding[i].binding == binding); - assert(pipe_binding[i].index == i); + assert(pipe_binding[i].index == bind_layout->descriptor_index + i); if (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS) pipe_binding[i].input_attachment_index = var->data.index + i; + /* NOTE: This is a uint8_t so we really do need to != 0 here */ pipe_binding[i].write_only = - (var->data.image.access & ACCESS_NON_READABLE) != 0; + (var->data.access & ACCESS_NON_READABLE) != 0; } } @@ -1319,6 +1328,8 @@ if (!function->impl) continue; + nir_builder_init(&state.builder, function->impl); + /* Before we do the normal lowering, we look for any SSBO operations * that we can lower to the BTI model and lower them up-front. The BTI * model can perform better than the A64 model for a couple reasons: @@ -1351,7 +1362,6 @@ */ lower_direct_buffer_access(function->impl, &state); - nir_builder_init(&state.builder, function->impl); nir_foreach_block(block, function->impl) apply_pipeline_layout_block(block, &state); nir_metadata_preserve(function->impl, nir_metadata_block_index | @@ -1359,4 +1369,15 @@ } ralloc_free(mem_ctx); + + /* Now that we're done computing the surface and sampler portions of the + * bind map, hash them. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. + */ + _mesa_sha1_compute(map->surface_to_descriptor, + map->surface_count * sizeof(struct anv_pipeline_binding), + map->surface_sha1); + _mesa_sha1_compute(map->sampler_to_descriptor, + map->sampler_count * sizeof(struct anv_pipeline_binding), + map->sampler_sha1); } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir_compute_push_layout.c mesa-20.0.8/src/intel/vulkan/anv_nir_compute_push_layout.c --- mesa-19.2.8/src/intel/vulkan/anv_nir_compute_push_layout.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir_compute_push_layout.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,195 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" + +void +anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx) +{ + memset(map->push_ranges, 0, sizeof(map->push_ranges)); + + unsigned push_start = UINT_MAX, push_end = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_push_constant) + continue; + + unsigned base = nir_intrinsic_base(intrin); + unsigned range = nir_intrinsic_range(intrin); + push_start = MIN2(push_start, base); + push_end = MAX2(push_end, base + range); + } + } + } + + const bool has_push_intrinsic = push_start <= push_end; + + if (nir->info.stage == MESA_SHADER_COMPUTE) { + /* For compute shaders, we always have to have the subgroup ID. The + * back-end compiler will "helpfully" add it for us in the last push + * constant slot. Yes, there is an off-by-one error here but that's + * because the back-end will add it so we want to claim the number of + * push constants one dword less than the full amount including + * gl_SubgroupId. + */ + assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id)); + push_end = offsetof(struct anv_push_constants, cs.subgroup_id); + } + + /* Align push_start down to a 32B boundary and make it no larger than + * push_end (no push constants is indicated by push_start = UINT_MAX). + */ + push_start = MIN2(push_start, push_end); + push_start &= ~31u; + + if (has_push_intrinsic) { + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_push_constant) + continue; + + intrin->intrinsic = nir_intrinsic_load_uniform; + nir_intrinsic_set_base(intrin, + nir_intrinsic_base(intrin) - + push_start); + } + } + } + } + + /* For vec4 our push data size needs to be aligned to a vec4 and for + * scalar, it needs to be aligned to a DWORD. + */ + const unsigned align = + pdevice->compiler->scalar_stage[nir->info.stage] ? 4 : 16; + nir->num_uniforms = ALIGN(push_end - push_start, align); + prog_data->nr_params = nir->num_uniforms / 4; + prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + + struct anv_push_range push_constant_range = { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + .start = push_start / 32, + .length = DIV_ROUND_UP(push_end - push_start, 32), + }; + + if ((pdevice->info.gen >= 8 || pdevice->info.is_haswell) && + nir->info.stage != MESA_SHADER_COMPUTE) { + brw_nir_analyze_ubo_ranges(pdevice->compiler, nir, NULL, + prog_data->ubo_ranges); + + /* We can push at most 64 registers worth of data. The back-end + * compiler would do this fixup for us but we'd like to calculate + * the push constant layout ourselves. + */ + unsigned total_push_regs = push_constant_range.length; + for (unsigned i = 0; i < 4; i++) { + if (total_push_regs + prog_data->ubo_ranges[i].length > 64) + prog_data->ubo_ranges[i].length = 64 - total_push_regs; + total_push_regs += prog_data->ubo_ranges[i].length; + } + assert(total_push_regs <= 64); + + int n = 0; + + if (push_constant_range.length > 0) + map->push_ranges[n++] = push_constant_range; + + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; + if (ubo_range->length == 0) + continue; + + const struct anv_pipeline_binding *binding = + &map->surface_to_descriptor[ubo_range->block]; + + map->push_ranges[n++] = (struct anv_push_range) { + .set = binding->set, + .index = binding->index, + .dynamic_offset_index = binding->dynamic_offset_index, + .start = ubo_range->start, + .length = ubo_range->length, + }; + } + } else { + /* For Ivy Bridge, the push constants packets have a different + * rule that would require us to iterate in the other direction + * and possibly mess around with dynamic state base address. + * Don't bother; just emit regular push constants at n = 0. + * + * In the compute case, we don't have multiple push ranges so it's + * better to just provide one in push_ranges[0]. + */ + map->push_ranges[0] = push_constant_range; + } + + /* Now that we're done computing the push constant portion of the + * bind map, hash it. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. + */ + _mesa_sha1_compute(map->push_ranges, + sizeof(map->push_ranges), + map->push_sha1); +} + +void +anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map) +{ +#ifndef NDEBUG + unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8); + for (unsigned i = 0; i < 4; i++) + prog_data_push_size += prog_data->ubo_ranges[i].length; + + unsigned bind_map_push_size = 0; + for (unsigned i = 0; i < 4; i++) + bind_map_push_size += map->push_ranges[i].length; + + /* We could go through everything again but it should be enough to assert + * that they push the same number of registers. This should alert us if + * the back-end compiler decides to re-arrange stuff or shrink a range. + */ + assert(prog_data_push_size == bind_map_push_size); +#endif +} diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir.h mesa-20.0.8/src/intel/vulkan/anv_nir.h --- mesa-19.2.8/src/intel/vulkan/anv_nir.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,12 +31,10 @@ extern "C" { #endif -void anv_nir_lower_push_constants(nir_shader *shader); - bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask); bool anv_nir_lower_ycbcr_textures(nir_shader *shader, - struct anv_pipeline_layout *layout); + const struct anv_pipeline_layout *layout); static inline nir_address_format anv_nir_ssbo_addr_format(const struct anv_physical_device *pdevice, @@ -54,13 +52,20 @@ void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, bool robust_buffer_access, - struct anv_pipeline_layout *layout, + const struct anv_pipeline_layout *layout, nir_shader *shader, - struct brw_stage_prog_data *prog_data, struct anv_pipeline_bind_map *map); -bool anv_nir_add_base_work_group_id(nir_shader *shader, - struct brw_cs_prog_data *prog_data); +void anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx); + +void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map); + +bool anv_nir_add_base_work_group_id(nir_shader *shader); #ifdef __cplusplus } diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir_lower_push_constants.c mesa-20.0.8/src/intel/vulkan/anv_nir_lower_push_constants.c --- mesa-19.2.8/src/intel/vulkan/anv_nir_lower_push_constants.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir_lower_push_constants.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "anv_nir.h" - -void -anv_nir_lower_push_constants(nir_shader *shader) -{ - nir_foreach_function(function, shader) { - if (!function->impl) - continue; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - /* TODO: Handle indirect push constants */ - if (intrin->intrinsic != nir_intrinsic_load_push_constant) - continue; - - /* We just turn them into uniform loads */ - intrin->intrinsic = nir_intrinsic_load_uniform; - } - } - } -} diff -Nru mesa-19.2.8/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c mesa-20.0.8/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c --- mesa-19.2.8/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c 2020-06-12 01:21:17.000000000 +0000 @@ -320,7 +320,7 @@ } static bool -try_lower_tex_ycbcr(struct anv_pipeline_layout *layout, +try_lower_tex_ycbcr(const struct anv_pipeline_layout *layout, nir_builder *builder, nir_tex_instr *tex) { @@ -448,7 +448,7 @@ bool anv_nir_lower_ycbcr_textures(nir_shader *shader, - struct anv_pipeline_layout *layout) + const struct anv_pipeline_layout *layout) { bool progress = false; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_pass.c mesa-20.0.8/src/intel/vulkan/anv_pass.c --- mesa-19.2.8/src/intel/vulkan/anv_pass.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_pass.c 2020-06-12 01:21:17.000000000 +0000 @@ -269,6 +269,9 @@ .stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp, .initial_layout = pCreateInfo->pAttachments[i].initialLayout, .final_layout = pCreateInfo->pAttachments[i].finalLayout, + + .stencil_initial_layout = pCreateInfo->pAttachments[i].initialLayout, + .stencil_final_layout = pCreateInfo->pAttachments[i].finalLayout, }; } @@ -288,9 +291,10 @@ for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { subpass->input_attachments[j] = (struct anv_subpass_attachment) { - .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, - .attachment = desc->pInputAttachments[j].attachment, - .layout = desc->pInputAttachments[j].layout, + .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, + .attachment = desc->pInputAttachments[j].attachment, + .layout = desc->pInputAttachments[j].layout, + .stencil_layout = desc->pInputAttachments[j].layout, }; } } @@ -325,16 +329,17 @@ subpass->depth_stencil_attachment = subpass_attachments++; *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) { - .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, - .attachment = desc->pDepthStencilAttachment->attachment, - .layout = desc->pDepthStencilAttachment->layout, + .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + .attachment = desc->pDepthStencilAttachment->attachment, + .layout = desc->pDepthStencilAttachment->layout, + .stencil_layout = desc->pDepthStencilAttachment->layout, }; } } for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) { /* Convert to a Dependency2KHR */ - struct VkSubpassDependency2KHR dep2 = { + VkSubpassDependency2 dep2 = { .srcSubpass = pCreateInfo->pDependencies[i].srcSubpass, .dstSubpass = pCreateInfo->pDependencies[i].dstSubpass, .srcStageMask = pCreateInfo->pDependencies[i].srcStageMask, @@ -387,7 +392,7 @@ (ds_resolve && ds_resolve->pDepthStencilResolveAttachment); } -VkResult anv_CreateRenderPass2KHR( +VkResult anv_CreateRenderPass2( VkDevice _device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, @@ -430,6 +435,10 @@ pass->subpass_flushes = subpass_flushes; for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { + const VkAttachmentDescriptionStencilLayoutKHR *stencil_layout = + vk_find_struct_const(pCreateInfo->pAttachments[i].pNext, + ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR); + pass->attachments[i] = (struct anv_render_pass_attachment) { .format = pCreateInfo->pAttachments[i].format, .samples = pCreateInfo->pAttachments[i].samples, @@ -438,6 +447,13 @@ .stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp, .initial_layout = pCreateInfo->pAttachments[i].initialLayout, .final_layout = pCreateInfo->pAttachments[i].finalLayout, + + .stencil_initial_layout = (stencil_layout ? + stencil_layout->stencilInitialLayout : + pCreateInfo->pAttachments[i].initialLayout), + .stencil_final_layout = (stencil_layout ? + stencil_layout->stencilFinalLayout : + pCreateInfo->pAttachments[i].finalLayout), }; } @@ -456,10 +472,17 @@ subpass_attachments += desc->inputAttachmentCount; for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { + const VkAttachmentReferenceStencilLayoutKHR *stencil_layout = + vk_find_struct_const(desc->pInputAttachments[j].pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + subpass->input_attachments[j] = (struct anv_subpass_attachment) { - .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, - .attachment = desc->pInputAttachments[j].attachment, - .layout = desc->pInputAttachments[j].layout, + .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, + .attachment = desc->pInputAttachments[j].attachment, + .layout = desc->pInputAttachments[j].layout, + .stencil_layout = (stencil_layout ? + stencil_layout->stencilLayout : + desc->pInputAttachments[j].layout), }; } } @@ -493,10 +516,17 @@ if (desc->pDepthStencilAttachment) { subpass->depth_stencil_attachment = subpass_attachments++; + const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment = + vk_find_struct_const(desc->pDepthStencilAttachment->pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) { - .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, - .attachment = desc->pDepthStencilAttachment->attachment, - .layout = desc->pDepthStencilAttachment->layout, + .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + .attachment = desc->pDepthStencilAttachment->attachment, + .layout = desc->pDepthStencilAttachment->layout, + .stencil_layout = stencil_attachment ? + stencil_attachment->stencilLayout : + desc->pDepthStencilAttachment->layout, }; } @@ -507,10 +537,17 @@ if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) { subpass->ds_resolve_attachment = subpass_attachments++; + const VkAttachmentReferenceStencilLayoutKHR *stencil_resolve_attachment = + vk_find_struct_const(ds_resolve->pDepthStencilResolveAttachment->pNext, + ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR); + *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) { - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment, - .layout = ds_resolve->pDepthStencilResolveAttachment->layout, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment, + .layout = ds_resolve->pDepthStencilResolveAttachment->layout, + .stencil_layout = stencil_resolve_attachment ? + stencil_resolve_attachment->stencilLayout : + ds_resolve->pDepthStencilResolveAttachment->layout, }; subpass->depth_resolve_mode = ds_resolve->depthResolveMode; subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_perf.c mesa-20.0.8/src/intel/vulkan/anv_perf.c --- mesa-19.2.8/src/intel/vulkan/anv_perf.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_perf.c 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,218 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include "anv_private.h" + +#include "perf/gen_perf.h" +#include "perf/gen_perf_mdapi.h" + +struct gen_perf_config * +anv_get_perf(const struct gen_device_info *devinfo, int fd) +{ + struct gen_perf_config *perf = gen_perf_new(NULL); + + gen_perf_init_metrics(perf, devinfo, fd); + + /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in + * perf revision 2. + */ + if (perf->i915_perf_version < 3) + goto err; + + return perf; + + err: + ralloc_free(perf); + return NULL; +} + +void +anv_device_perf_init(struct anv_device *device) +{ + device->perf_fd = -1; +} + +static int +anv_device_perf_open(struct anv_device *device, uint64_t metric_id) +{ + uint64_t properties[DRM_I915_PERF_PROP_MAX * 2]; + struct drm_i915_perf_open_param param; + int p = 0, stream_fd; + + properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA; + properties[p++] = true; + + properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET; + properties[p++] = metric_id; + + properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT; + properties[p++] = device->info.gen >= 8 ? + I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8; + + properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT; + properties[p++] = 31; /* slowest sampling period */ + + properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE; + properties[p++] = device->context_id; + + properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION; + properties[p++] = true; + + memset(¶m, 0, sizeof(param)); + param.flags = 0; + param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK; + param.properties_ptr = (uintptr_t)properties; + param.num_properties = p / 2; + + stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + return stream_fd; +} + +VkResult anv_InitializePerformanceApiINTEL( + VkDevice _device, + const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + /* Not much to do here */ + return VK_SUCCESS; +} + +VkResult anv_GetPerformanceParameterINTEL( + VkDevice _device, + VkPerformanceParameterTypeINTEL parameter, + VkPerformanceValueINTEL* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + VkResult result = VK_SUCCESS; + switch (parameter) { + case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL; + pValue->data.valueBool = VK_TRUE; + break; + + case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL; + pValue->data.value32 = 25; + break; + + default: + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; + } + + return result; +} + +VkResult anv_CmdSetPerformanceMarkerINTEL( + VkCommandBuffer commandBuffer, + const VkPerformanceMarkerInfoINTEL* pMarkerInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->intel_perf_marker = pMarkerInfo->marker; + + return VK_SUCCESS; +} + +VkResult anv_AcquirePerformanceConfigurationINTEL( + VkDevice _device, + const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo, + VkPerformanceConfigurationINTEL* pConfiguration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + struct gen_perf_registers *perf_config = + gen_perf_load_configuration(device->physical->perf, device->fd, + GEN_PERF_QUERY_GUID_MDAPI); + if (!perf_config) + return VK_INCOMPLETE; + + int ret = gen_perf_store_configuration(device->physical->perf, device->fd, + perf_config, NULL /* guid */); + if (ret < 0) { + ralloc_free(perf_config); + return VK_INCOMPLETE; + } + + *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) ret; + + return VK_SUCCESS; +} + +VkResult anv_ReleasePerformanceConfigurationINTEL( + VkDevice _device, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + uint64_t config = (uint64_t) _configuration; + + gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config); + + return VK_SUCCESS; +} + +VkResult anv_QueueSetPerformanceConfigurationINTEL( + VkQueue _queue, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + struct anv_device *device = queue->device; + uint64_t configuration = (uint64_t) _configuration; + + if (device->perf_fd < 0) { + device->perf_fd = anv_device_perf_open(device, configuration); + if (device->perf_fd < 0) + return VK_ERROR_INITIALIZATION_FAILED; + } else { + int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) _configuration); + if (ret < 0) + return anv_device_set_lost(device, "i915-perf config failed: %m"); + } + + return VK_SUCCESS; +} + +void anv_UninitializePerformanceApiINTEL( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (device->perf_fd >= 0) { + close(device->perf_fd); + device->perf_fd = -1; + } +} diff -Nru mesa-19.2.8/src/intel/vulkan/anv_pipeline.c mesa-20.0.8/src/intel/vulkan/anv_pipeline.c --- mesa-19.2.8/src/intel/vulkan/anv_pipeline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_pipeline.c 2020-06-12 01:21:17.000000000 +0000 @@ -107,6 +107,8 @@ const char *message) { struct anv_spirv_debug_data *debug_data = private_data; + struct anv_instance *instance = debug_data->device->physical->instance; + static const VkDebugReportFlagsEXT vk_flags[] = { [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT, [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT, @@ -116,7 +118,7 @@ snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long) spirv_offset, message); - vk_debug_report(&debug_data->device->instance->debug_report_callbacks, + vk_debug_report(&instance->debug_report_callbacks, vk_flags[level], VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT, (uint64_t) (uintptr_t) debug_data->module, @@ -134,8 +136,7 @@ gl_shader_stage stage, const VkSpecializationInfo *spec_info) { - const struct anv_physical_device *pdevice = - &device->instance->physicalDevice; + const struct anv_physical_device *pdevice = device->physical; const struct brw_compiler *compiler = pdevice->compiler; const nir_shader_compiler_options *nir_options = compiler->glsl_compiler_options[stage].NirOptions; @@ -155,10 +156,23 @@ assert(data + entry.size <= spec_info->pData + spec_info->dataSize); spec_entries[i].id = spec_info->pMapEntries[i].constantID; - if (spec_info->dataSize == 8) + switch (entry.size) { + case 8: spec_entries[i].data64 = *(const uint64_t *)data; - else + break; + case 4: spec_entries[i].data32 = *(const uint32_t *)data; + break; + case 2: + spec_entries[i].data32 = *(const uint16_t *)data; + break; + case 1: + spec_entries[i].data32 = *(const uint8_t *)data; + break; + default: + assert(!"Invalid spec constant size"); + break; + } } } @@ -168,6 +182,7 @@ }; struct spirv_to_nir_options spirv_options = { .frag_coord_is_sysval = true, + .use_scoped_memory_barrier = true, .caps = { .demote_to_helper_invocation = true, .derivative_group = true, @@ -186,11 +201,14 @@ .int16 = pdevice->info.gen >= 8, .int64 = pdevice->info.gen >= 8, .int64_atomics = pdevice->info.gen >= 9 && pdevice->use_softpin, + .integer_functions2 = pdevice->info.gen >= 8, .min_lod = true, .multiview = true, .physical_storage_buffer_address = pdevice->has_a64_buffer_access, .post_depth_coverage = pdevice->info.gen >= 9, .runtime_descriptor_array = true, + .float_controls = pdevice->info.gen >= 8, + .shader_clock = true, .shader_viewport_index_layer = true, .stencil_export = pdevice->info.gen >= 9, .storage_8bit = pdevice->info.gen >= 8, @@ -204,6 +222,8 @@ .tessellation = true, .transform_feedback = pdevice->info.gen >= 8, .variable_pointers = true, + .vk_memory_model = true, + .vk_memory_model_device_scope = true, }, .ubo_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = @@ -443,7 +463,7 @@ key->color_outputs_valid |= (1 << i); } - key->nr_color_regions = util_bitcount(key->color_outputs_valid); + key->nr_color_regions = subpass->color_count; /* To reduce possible shader recompilations we would need to know if * there is a SampleMask output variable to compute if we should emit @@ -619,7 +639,7 @@ struct anv_pipeline_stage *stage) { const struct brw_compiler *compiler = - pipeline->device->instance->physicalDevice.compiler; + pipeline->device->physical->compiler; const nir_shader_compiler_options *nir_options = compiler->glsl_compiler_options[stage->stage].NirOptions; nir_shader *nir; @@ -653,8 +673,7 @@ struct anv_pipeline_stage *stage, struct anv_pipeline_layout *layout) { - const struct anv_physical_device *pdevice = - &pipeline->device->instance->physicalDevice; + const struct anv_physical_device *pdevice = pipeline->device->physical; const struct brw_compiler *compiler = pdevice->compiler; struct brw_stage_prog_data *prog_data = &stage->prog_data.base; @@ -667,72 +686,38 @@ NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, layout); - NIR_PASS_V(nir, anv_nir_lower_push_constants); - if (nir->info.stage != MESA_SHADER_COMPUTE) NIR_PASS_V(nir, anv_nir_lower_multiview, pipeline->subpass->view_mask); nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - if (nir->num_uniforms > 0) { - assert(prog_data->nr_params == 0); - - /* If the shader uses any push constants at all, we'll just give - * them the maximum possible number - */ - assert(nir->num_uniforms <= MAX_PUSH_CONSTANTS_SIZE); - nir->num_uniforms = MAX_PUSH_CONSTANTS_SIZE; - prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float); - prog_data->param = ralloc_array(mem_ctx, uint32_t, prog_data->nr_params); - - /* We now set the param values to be offsets into a - * anv_push_constant_data structure. Since the compiler doesn't - * actually dereference any of the gl_constant_value pointers in the - * params array, it doesn't really matter what we put here. - */ - struct anv_push_constants *null_data = NULL; - /* Fill out the push constants section of the param array */ - for (unsigned i = 0; i < MAX_PUSH_CONSTANTS_SIZE / sizeof(float); i++) { - prog_data->param[i] = ANV_PARAM_PUSH( - (uintptr_t)&null_data->client_data[i * sizeof(float)]); - } - } - - if (nir->info.num_ssbos > 0 || nir->info.num_images > 0) - pipeline->needs_data_cache = true; - NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo); NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global, nir_address_format_64bit_global); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ - if (layout) { - anv_nir_apply_pipeline_layout(pdevice, - pipeline->device->robust_buffer_access, - layout, nir, prog_data, - &stage->bind_map); - - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, - nir_address_format_32bit_index_offset); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, - anv_nir_ssbo_addr_format(pdevice, - pipeline->device->robust_buffer_access)); + anv_nir_apply_pipeline_layout(pdevice, + pipeline->device->robust_buffer_access, + layout, nir, &stage->bind_map); + + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, + nir_address_format_32bit_index_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, + anv_nir_ssbo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); - NIR_PASS_V(nir, nir_opt_constant_folding); - - /* We don't support non-uniform UBOs and non-uniform SSBO access is - * handled naturally by falling back to A64 messages. - */ - NIR_PASS_V(nir, nir_lower_non_uniform_access, - nir_lower_non_uniform_texture_access | - nir_lower_non_uniform_image_access); - } + NIR_PASS_V(nir, nir_opt_constant_folding); - if (nir->info.stage != MESA_SHADER_COMPUTE) - brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + /* We don't support non-uniform UBOs and non-uniform SSBO access is + * handled naturally by falling back to A64 messages. + */ + NIR_PASS_V(nir, nir_lower_non_uniform_access, + nir_lower_non_uniform_texture_access | + nir_lower_non_uniform_image_access); - assert(nir->num_uniforms == prog_data->nr_params * 4); + anv_nir_compute_push_layout(pdevice, nir, prog_data, + &stage->bind_map, mem_ctx); stage->nir = nir; } @@ -878,7 +863,7 @@ &tes_stage->key.tes, &tcs_stage->prog_data.tcs.base.vue_map, &tes_stage->prog_data.tes, - tes_stage->nir, NULL, -1, + tes_stage->nir, -1, tes_stage->stats, NULL); } @@ -915,115 +900,86 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler, struct anv_pipeline_stage *stage) { - unsigned num_rts = 0; - const int max_rt = FRAG_RESULT_DATA7 - FRAG_RESULT_DATA0 + 1; - struct anv_pipeline_binding rt_bindings[max_rt]; - nir_function_impl *impl = nir_shader_get_entrypoint(stage->nir); - int rt_to_bindings[max_rt]; - memset(rt_to_bindings, -1, sizeof(rt_to_bindings)); - bool rt_used[max_rt]; - memset(rt_used, 0, sizeof(rt_used)); - - /* Flag used render targets */ - nir_foreach_variable_safe(var, &stage->nir->outputs) { - if (var->data.location < FRAG_RESULT_DATA0) - continue; - - const unsigned rt = var->data.location - FRAG_RESULT_DATA0; - /* Out-of-bounds */ - if (rt >= MAX_RTS) - continue; - - const unsigned array_len = - glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1; - assert(rt + array_len <= max_rt); - - /* Unused */ - if (!(stage->key.wm.color_outputs_valid & BITFIELD_RANGE(rt, array_len))) { - /* If this is the RT at location 0 and we have alpha to coverage - * enabled we will have to create a null RT for it, so mark it as - * used. - */ - if (rt > 0 || !stage->key.wm.alpha_to_coverage) - continue; + unsigned num_rt_bindings; + struct anv_pipeline_binding rt_bindings[MAX_RTS]; + if (stage->key.wm.nr_color_regions > 0) { + assert(stage->key.wm.nr_color_regions <= MAX_RTS); + for (unsigned rt = 0; rt < stage->key.wm.nr_color_regions; rt++) { + if (stage->key.wm.color_outputs_valid & BITFIELD_BIT(rt)) { + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = rt, + }; + } else { + /* Setup a null render target */ + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + } } - - for (unsigned i = 0; i < array_len; i++) - rt_used[rt + i] = true; + num_rt_bindings = stage->key.wm.nr_color_regions; + } else { + /* Setup a null render target */ + rt_bindings[0] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + num_rt_bindings = 1; } - /* Set new, compacted, location */ - for (unsigned i = 0; i < max_rt; i++) { - if (!rt_used[i]) - continue; - - rt_to_bindings[i] = num_rts; - - if (stage->key.wm.color_outputs_valid & (1 << i)) { - rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) { - .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, - .index = i, - }; - } else { - /* Setup a null render target */ - rt_bindings[rt_to_bindings[i]] = (struct anv_pipeline_binding) { - .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, - .index = UINT32_MAX, - }; - } - - num_rts++; - } + assert(num_rt_bindings <= MAX_RTS); + assert(stage->bind_map.surface_count == 0); + typed_memcpy(stage->bind_map.surface_to_descriptor, + rt_bindings, num_rt_bindings); + stage->bind_map.surface_count += num_rt_bindings; + /* Now that we've set up the color attachments, we can go through and + * eliminate any shader outputs that map to VK_ATTACHMENT_UNUSED in the + * hopes that dead code can clean them up in this and any earlier shader + * stages. + */ + nir_function_impl *impl = nir_shader_get_entrypoint(stage->nir); bool deleted_output = false; nir_foreach_variable_safe(var, &stage->nir->outputs) { + /* TODO: We don't delete depth/stencil writes. We probably could if the + * subpass doesn't have a depth/stencil attachment. + */ if (var->data.location < FRAG_RESULT_DATA0) continue; const unsigned rt = var->data.location - FRAG_RESULT_DATA0; - if (rt >= MAX_RTS || !rt_used[rt]) { - /* Unused or out-of-bounds, throw it away, unless it is the first - * RT and we have alpha to coverage enabled. - */ + /* If this is the RT at location 0 and we have alpha to coverage + * enabled we still need that write because it will affect the coverage + * mask even if it's never written to a color target. + */ + if (rt == 0 && stage->key.wm.alpha_to_coverage) + continue; + + const unsigned array_len = + glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1; + assert(rt + array_len <= MAX_RTS); + + if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid & + BITFIELD_RANGE(rt, array_len))) { deleted_output = true; var->data.mode = nir_var_function_temp; exec_node_remove(&var->node); exec_list_push_tail(&impl->locals, &var->node); - continue; } - - /* Give it the new location */ - assert(rt_to_bindings[rt] != -1); - var->data.location = rt_to_bindings[rt] + FRAG_RESULT_DATA0; } if (deleted_output) nir_fixup_deref_modes(stage->nir); - if (num_rts == 0) { - /* If we have no render targets, we need a null render target */ - rt_bindings[0] = (struct anv_pipeline_binding) { - .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, - .binding = 0, - .index = UINT32_MAX, - }; - num_rts = 1; - } - - /* Now that we've determined the actual number of render targets, adjust - * the key accordingly. + /* We stored the number of subpass color attachments in nr_color_regions + * when calculating the key for caching. Now that we've computed the bind + * map, we can reduce this to the actual max before we go into the back-end + * compiler. */ - stage->key.wm.nr_color_regions = num_rts; - stage->key.wm.color_outputs_valid = (1 << num_rts) - 1; - - assert(num_rts <= max_rt); - assert(stage->bind_map.surface_count == 0); - typed_memcpy(stage->bind_map.surface_to_descriptor, - rt_bindings, num_rts); - stage->bind_map.surface_count += num_rts; + stage->key.wm.nr_color_regions = + util_last_bit(stage->key.wm.color_outputs_valid); } static void @@ -1043,7 +999,7 @@ fs_stage->code = brw_compile_fs(compiler, device, mem_ctx, &fs_stage->key.wm, &fs_stage->prog_data.wm, - fs_stage->nir, NULL, -1, -1, -1, + fs_stage->nir, -1, -1, -1, true, false, NULL, fs_stage->stats, NULL); @@ -1051,8 +1007,10 @@ (uint32_t)fs_stage->prog_data.wm.dispatch_16 + (uint32_t)fs_stage->prog_data.wm.dispatch_32; - if (fs_stage->key.wm.nr_color_regions == 0 && + if (fs_stage->key.wm.color_outputs_valid == 0 && !fs_stage->prog_data.wm.has_side_effects && + !fs_stage->prog_data.wm.uses_omask && + !fs_stage->key.wm.alpha_to_coverage && !fs_stage->prog_data.wm.uses_kill && fs_stage->prog_data.wm.computed_depth_mode == BRW_PSCDEPTH_OFF && !fs_stage->prog_data.wm.computed_stencil) { @@ -1071,6 +1029,26 @@ struct brw_compile_stats *stats, uint32_t code_offset) { + char *nir = NULL; + if (stage->nir && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + char *stream_data = NULL; + size_t stream_size = 0; + FILE *stream = open_memstream(&stream_data, &stream_size); + + nir_print_shader(stage->nir, stream); + + fclose(stream); + + /* Copy it to a ralloc'd thing */ + nir = ralloc_size(pipeline->mem_ctx, stream_size + 1); + memcpy(nir, stream_data, stream_size); + nir[stream_size] = 0; + + free(stream_data); + } + char *disasm = NULL; if (stage->code && (pipeline->flags & @@ -1100,6 +1078,7 @@ (struct anv_pipeline_executable) { .stage = stage->stage, .stats = *stats, + .nir = nir, .disasm = disasm, }; } @@ -1146,8 +1125,7 @@ }; int64_t pipeline_start = os_time_get_nano(); - const struct brw_compiler *compiler = - pipeline->device->instance->physicalDevice.compiler; + const struct brw_compiler *compiler = pipeline->device->physical->compiler; struct anv_pipeline_stage stages[MESA_SHADER_STAGES] = {}; pipeline->active_stages = 0; @@ -1187,12 +1165,15 @@ case MESA_SHADER_GEOMETRY: populate_gs_prog_key(devinfo, sinfo->flags, &stages[stage].key.gs); break; - case MESA_SHADER_FRAGMENT: + case MESA_SHADER_FRAGMENT: { + const bool raster_enabled = + !info->pRasterizationState->rasterizerDiscardEnable; populate_wm_prog_key(devinfo, sinfo->flags, pipeline->subpass, - info->pMultisampleState, + raster_enabled ? info->pMultisampleState : NULL, &stages[stage].key.wm); break; + } default: unreachable("Invalid graphics shader stage"); } @@ -1270,7 +1251,7 @@ */ assert(found < __builtin_popcount(pipeline->active_stages)); - vk_debug_report(&pipeline->device->instance->debug_report_callbacks, + vk_debug_report(&pipeline->device->physical->instance->debug_report_callbacks, VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT, @@ -1397,6 +1378,9 @@ goto fail; } + anv_nir_validate_push_layout(&stages[s].prog_data.base, + &stages[s].bind_map); + struct anv_shader_bin *bin = anv_device_upload_kernel(pipeline->device, cache, &stages[s].cache_key, @@ -1492,8 +1476,7 @@ }; int64_t pipeline_start = os_time_get_nano(); - const struct brw_compiler *compiler = - pipeline->device->instance->physicalDevice.compiler; + const struct brw_compiler *compiler = pipeline->device->physical->compiler; struct anv_pipeline_stage stage = { .stage = MESA_SHADER_COMPUTE, @@ -1558,10 +1541,9 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } - anv_pipeline_lower_nir(pipeline, mem_ctx, &stage, layout); + NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id); - NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id, - &stage.prog_data.cs); + anv_pipeline_lower_nir(pipeline, mem_ctx, &stage, layout); NIR_PASS_V(stage.nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_type_info); @@ -1577,6 +1559,14 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } + anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map); + + if (!stage.prog_data.cs.uses_num_work_groups) { + assert(stage.bind_map.surface_to_descriptor[0].set == + ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS); + stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL; + } + const unsigned code_size = stage.prog_data.base.program_size; bin = anv_device_upload_kernel(pipeline->device, cache, &stage.cache_key, sizeof(stage.cache_key), @@ -1843,7 +1833,7 @@ const struct gen_device_info *devinfo = &pipeline->device->info; const struct gen_l3_weights w = - gen_get_default_l3_weights(devinfo, pipeline->needs_data_cache, needs_slm); + gen_get_default_l3_weights(devinfo, true, needs_slm); pipeline->urb.l3_config = gen_get_l3_config(devinfo, w); pipeline->urb.total_size = @@ -1883,9 +1873,10 @@ pipeline->mem_ctx = ralloc_context(NULL); pipeline->flags = pCreateInfo->flags; + assert(pCreateInfo->pRasterizationState); + copy_non_dynamic_state(pipeline, pCreateInfo); - pipeline->depth_clamp_enable = pCreateInfo->pRasterizationState && - pCreateInfo->pRasterizationState->depthClampEnable; + pipeline->depth_clamp_enable = pCreateInfo->pRasterizationState->depthClampEnable; /* Previously we enabled depth clipping when !depthClampEnable. * DepthClipStateCreateInfo now makes depth clipping explicit so if the @@ -1897,10 +1888,10 @@ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT); pipeline->depth_clip_enable = clip_info ? clip_info->depthClipEnable : !pipeline->depth_clamp_enable; - pipeline->sample_shading_enable = pCreateInfo->pMultisampleState && - pCreateInfo->pMultisampleState->sampleShadingEnable; - - pipeline->needs_data_cache = false; + pipeline->sample_shading_enable = + !pCreateInfo->pRasterizationState->rasterizerDiscardEnable && + pCreateInfo->pMultisampleState && + pCreateInfo->pMultisampleState->sampleShadingEnable; /* When we free the pipeline, we detect stages based on the NULL status * of various prog_data pointers. Make them NULL by default. @@ -2121,7 +2112,7 @@ "Number of bytes of workgroup shared memory used by this " "compute shader including any padding."); stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = prog_data->total_scratch; + stat->value.u64 = brw_cs_prog_data_const(prog_data)->slm_size; } } @@ -2164,6 +2155,17 @@ const struct anv_pipeline_executable *exe = &pipeline->executables[pExecutableInfo->executableIndex]; + if (exe->nir) { + vk_outarray_append(&out, ir) { + WRITE_STR(ir->name, "Final NIR"); + WRITE_STR(ir->description, + "Final NIR before going into the back-end compiler"); + + if (!write_ir_text(ir, exe->nir)) + incomplete_text = true; + } + } + if (exe->disasm) { vk_outarray_append(&out, ir) { WRITE_STR(ir->name, "GEN Assembly"); diff -Nru mesa-19.2.8/src/intel/vulkan/anv_pipeline_cache.c mesa-20.0.8/src/intel/vulkan/anv_pipeline_cache.c --- mesa-19.2.8/src/intel/vulkan/anv_pipeline_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_pipeline_cache.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,7 @@ * IN THE SOFTWARE. */ -#include "compiler/blob.h" +#include "util/blob.h" #include "util/hash_table.h" #include "util/debug.h" #include "util/disk_cache.h" @@ -161,6 +161,12 @@ blob_write_uint32(blob, 0); } + blob_write_bytes(blob, shader->bind_map.surface_sha1, + sizeof(shader->bind_map.surface_sha1)); + blob_write_bytes(blob, shader->bind_map.sampler_sha1, + sizeof(shader->bind_map.sampler_sha1)); + blob_write_bytes(blob, shader->bind_map.push_sha1, + sizeof(shader->bind_map.push_sha1)); blob_write_uint32(blob, shader->bind_map.surface_count); blob_write_uint32(blob, shader->bind_map.sampler_count); blob_write_bytes(blob, shader->bind_map.surface_to_descriptor, @@ -169,6 +175,8 @@ blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor, shader->bind_map.sampler_count * sizeof(*shader->bind_map.sampler_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.push_ranges, + sizeof(shader->bind_map.push_ranges)); return !blob->out_of_memory; } @@ -204,6 +212,9 @@ xfb_info = blob_read_bytes(blob, xfb_size); struct anv_pipeline_bind_map bind_map; + blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1)); + blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1)); + blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1)); bind_map.surface_count = blob_read_uint32(blob); bind_map.sampler_count = blob_read_uint32(blob); bind_map.surface_to_descriptor = (void *) @@ -212,6 +223,7 @@ bind_map.sampler_to_descriptor = (void *) blob_read_bytes(blob, bind_map.sampler_count * sizeof(*bind_map.sampler_to_descriptor)); + blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges)); if (blob->overrun) return NULL; @@ -453,7 +465,7 @@ const void *data, size_t size) { struct anv_device *device = cache->device; - struct anv_physical_device *pdevice = &device->instance->physicalDevice; + struct anv_physical_device *pdevice = device->physical; if (cache->cache == NULL) return; @@ -473,7 +485,7 @@ return; if (header.vendor_id != 0x8086) return; - if (header.device_id != device->chipset_id) + if (header.device_id != device->info.chipset_id) return; if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0) return; @@ -506,7 +518,7 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); anv_pipeline_cache_init(cache, device, - device->instance->pipeline_cache_enabled); + device->physical->instance->pipeline_cache_enabled); if (pCreateInfo->initialDataSize > 0) anv_pipeline_cache_load(cache, @@ -542,7 +554,6 @@ { ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; struct blob blob; if (pData) { @@ -555,9 +566,9 @@ .header_size = sizeof(struct cache_header), .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, .vendor_id = 0x8086, - .device_id = device->chipset_id, + .device_id = device->info.chipset_id, }; - memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE); + memcpy(header.uuid, device->physical->pipeline_cache_uuid, VK_UUID_SIZE); blob_write_bytes(&blob, &header, sizeof(header)); uint32_t count = 0; @@ -644,8 +655,8 @@ } #ifdef ENABLE_SHADER_CACHE - struct disk_cache *disk_cache = device->instance->physicalDevice.disk_cache; - if (disk_cache && device->instance->pipeline_cache_enabled) { + struct disk_cache *disk_cache = device->physical->disk_cache; + if (disk_cache && device->physical->instance->pipeline_cache_enabled) { cache_key cache_key; disk_cache_compute_key(disk_cache, key_data, key_size, cache_key); @@ -705,7 +716,7 @@ return NULL; #ifdef ENABLE_SHADER_CACHE - struct disk_cache *disk_cache = device->instance->physicalDevice.disk_cache; + struct disk_cache *disk_cache = device->physical->disk_cache; if (disk_cache) { struct blob binary; blob_init(&binary); @@ -779,7 +790,7 @@ struct blob blob; blob_init(&blob); - nir_serialize(&blob, nir); + nir_serialize(&blob, nir, false); if (blob.out_of_memory) { blob_finish(&blob); return; diff -Nru mesa-19.2.8/src/intel/vulkan/anv_private.h mesa-20.0.8/src/intel/vulkan/anv_private.h --- mesa-19.2.8/src/intel/vulkan/anv_private.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_private.h 2020-06-12 01:21:17.000000000 +0000 @@ -40,19 +40,21 @@ #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) #endif #else -#define VG(x) +#define VG(x) ((void)0) #endif #include "common/gen_clflush.h" #include "common/gen_decoder.h" #include "common/gen_gem.h" +#include "common/gen_l3_config.h" #include "dev/gen_device_info.h" #include "blorp/blorp.h" #include "compiler/brw_compiler.h" +#include "util/bitset.h" #include "util/macros.h" #include "util/hash_table.h" #include "util/list.h" -#include "util/set.h" +#include "util/sparse_array.h" #include "util/u_atomic.h" #include "util/u_vector.h" #include "util/u_math.h" @@ -68,12 +70,14 @@ typedef uint32_t xcb_visualid_t; typedef uint32_t xcb_window_t; +struct anv_batch; struct anv_buffer; struct anv_buffer_view; struct anv_image_view; struct anv_instance; -struct gen_l3_config; +struct gen_aux_map_context; +struct gen_perf_config; #include #include @@ -88,6 +92,8 @@ #include "common/intel_log.h" #include "wsi_common.h" +#define NSEC_PER_SEC 1000000000ull + /* anv Virtual Memory Layout * ========================= * @@ -121,7 +127,9 @@ #define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL #define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ #define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL -#define HIGH_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ +#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ +#define CLIENT_VISIBLE_HEAP_MAX_ADDRESS 0x0002bfffffffULL +#define HIGH_HEAP_MIN_ADDRESS 0x0002c0000000ULL /* 11 GiB */ #define LOW_HEAP_SIZE \ (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) @@ -133,6 +141,8 @@ (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1) #define INSTRUCTION_STATE_POOL_SIZE \ (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1) +#define CLIENT_VISIBLE_HEAP_SIZE \ + (CLIENT_VISIBLE_HEAP_MAX_ADDRESS - CLIENT_VISIBLE_HEAP_MIN_ADDRESS + 1) /* Allowing different clear colors requires us to perform a depth resolve at * the end of certain render passes. This is because while slow clears store @@ -202,6 +212,15 @@ */ #define ANV_PREDICATE_RESULT_REG 0x2678 /* MI_ALU_REG15 */ +/* For gen12 we set the streamout buffers using 4 separate commands + * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout + * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of + * 3DSTATE_SO_BUFFER apart from the SOBufferIndex field, so for now we use the + * 3DSTATE_SO_BUFFER command, but change the 3DCommandSubOpcode. + * SO_BUFFER_INDEX_0_CMD is actually the 3DCommandSubOpcode for + * 3DSTATE_SO_BUFFER_INDEX_0. + */ +#define SO_BUFFER_INDEX_0_CMD 0x60 #define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) static inline uint32_t @@ -218,10 +237,16 @@ } static inline uint64_t -align_u64(uint64_t v, uint64_t a) +align_down_u64(uint64_t v, uint64_t a) { assert(a != 0 && a == (a & -a)); - return (v + a - 1) & ~(a - 1); + return v & ~(a - 1); +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + return align_down_u64(v + a - 1, a); } static inline int32_t @@ -285,6 +310,20 @@ }; } +static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags) +{ + uintptr_t mask = (1ull << bits) - 1; + *flags = ptr & mask; + return (void *) (ptr & ~mask); +} + +static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags) +{ + uintptr_t value = (uintptr_t) ptr; + uintptr_t mask = (1ull << bits) - 1; + return value | (mask & flags); +} + #define for_each_bit(b, dword) \ for (uint32_t __dword = (dword); \ (b) = __builtin_ffs(__dword) - 1, __dword; \ @@ -408,21 +447,23 @@ VkResult __vk_errorf(struct anv_instance *instance, const void *object, VkDebugReportObjectTypeEXT type, VkResult error, - const char *file, int line, const char *format, ...); + const char *file, int line, const char *format, ...) + anv_printflike(7, 8); #ifdef DEBUG #define vk_error(error) __vk_errorf(NULL, NULL,\ VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT,\ error, __FILE__, __LINE__, NULL) -#define vk_errorv(instance, obj, error, format, args)\ - __vk_errorv(instance, obj, REPORT_OBJECT_TYPE(obj), error,\ - __FILE__, __LINE__, format, args) -#define vk_errorf(instance, obj, error, format, ...)\ +#define vk_errorfi(instance, obj, error, format, ...)\ __vk_errorf(instance, obj, REPORT_OBJECT_TYPE(obj), error,\ __FILE__, __LINE__, format, ## __VA_ARGS__) +#define vk_errorf(device, obj, error, format, ...)\ + vk_errorfi(anv_device_instance_or_null(device),\ + obj, error, format, ## __VA_ARGS__) #else #define vk_error(error) error -#define vk_errorf(instance, obj, error, format, ...) error +#define vk_errorfi(instance, obj, error, format, ...) error +#define vk_errorf(device, obj, error, format, ...) error #endif /** @@ -443,7 +484,7 @@ #define anv_debug_ignored_stype(sType) \ intel_logd("%s: ignored VkStructureType %u\n", __func__, (sType)) -void __anv_perf_warn(struct anv_instance *instance, const void *object, +void __anv_perf_warn(struct anv_device *device, const void *object, VkDebugReportObjectTypeEXT type, const char *file, int line, const char *format, ...) anv_printflike(6, 7); @@ -584,41 +625,100 @@ return anv_multialloc_alloc(ma, alloc ? alloc : parent_alloc, scope); } -/* Extra ANV-defined BO flags which won't be passed to the kernel */ -#define ANV_BO_EXTERNAL (1ull << 31) -#define ANV_BO_FLAG_MASK (1ull << 31) - struct anv_bo { uint32_t gem_handle; + uint32_t refcount; + /* Index into the current validation list. This is used by the * validation list building alrogithm to track which buffers are already * in the validation list so that we can ensure uniqueness. */ uint32_t index; + /* Index for use with util_sparse_array_free_list */ + uint32_t free_index; + /* Last known offset. This value is provided by the kernel when we * execbuf and is used as the presumed offset for the next bunch of * relocations. */ uint64_t offset; + /** Size of the buffer not including implicit aux */ uint64_t size; + + /* Map for internally mapped BOs. + * + * If ANV_BO_WRAPPER is set in flags, map points to the wrapped BO. + */ void *map; + /** Size of the implicit CCS range at the end of the buffer + * + * On Gen12, CCS data is always a direct 1/256 scale-down. A single 64K + * page of main surface data maps to a 256B chunk of CCS data and that + * mapping is provided on TGL-LP by the AUX table which maps virtual memory + * addresses in the main surface to virtual memory addresses for CCS data. + * + * Because we can't change these maps around easily and because Vulkan + * allows two VkImages to be bound to overlapping memory regions (as long + * as the app is careful), it's not feasible to make this mapping part of + * the image. (On Gen11 and earlier, the mapping was provided via + * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.) + * Instead, we attach the CCS data directly to the buffer object and setup + * the AUX table mapping at BO creation time. + * + * This field is for internal tracking use by the BO allocator only and + * should not be touched by other parts of the code. If something wants to + * know if a BO has implicit CCS data, it should instead look at the + * has_implicit_ccs boolean below. + * + * This data is not included in maps of this buffer. + */ + uint32_t _ccs_size; + /** Flags to pass to the kernel through drm_i915_exec_object2::flags */ uint32_t flags; + + /** True if this BO may be shared with other processes */ + bool is_external:1; + + /** True if this BO is a wrapper + * + * When set to true, none of the fields in this BO are meaningful except + * for anv_bo::is_wrapper and anv_bo::map which points to the actual BO. + * See also anv_bo_unwrap(). Wrapper BOs are not allowed when use_softpin + * is set in the physical device. + */ + bool is_wrapper:1; + + /** See also ANV_BO_ALLOC_FIXED_ADDRESS */ + bool has_fixed_address:1; + + /** True if this BO wraps a host pointer */ + bool from_host_ptr:1; + + /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */ + bool has_client_visible_address:1; + + /** True if this BO has implicit CCS data attached to it */ + bool has_implicit_ccs:1; }; -static inline void -anv_bo_init(struct anv_bo *bo, uint32_t gem_handle, uint64_t size) +static inline struct anv_bo * +anv_bo_ref(struct anv_bo *bo) +{ + p_atomic_inc(&bo->refcount); + return bo; +} + +static inline struct anv_bo * +anv_bo_unwrap(struct anv_bo *bo) { - bo->gem_handle = gem_handle; - bo->index = 0; - bo->offset = -1; - bo->size = size; - bo->map = NULL; - bo->flags = 0; + while (bo->is_wrapper) + bo = bo->map; + return bo; } /* Represents a lock-free linked list of "free" things. This is used by @@ -632,7 +732,10 @@ /* A simple count that is incremented every time the head changes. */ uint32_t count; }; - uint64_t u64; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); }; #define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { UINT32_MAX, 0 } }) @@ -643,21 +746,32 @@ uint32_t next; uint32_t end; }; - uint64_t u64; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); }; }; #define anv_block_pool_foreach_bo(bo, pool) \ - for (bo = (pool)->bos; bo != &(pool)->bos[(pool)->nbos]; bo++) + for (struct anv_bo **_pp_bo = (pool)->bos, *bo; \ + _pp_bo != &(pool)->bos[(pool)->nbos] && (bo = *_pp_bo, true); \ + _pp_bo++) #define ANV_MAX_BLOCK_POOL_BOS 20 struct anv_block_pool { struct anv_device *device; + bool use_softpin; - uint64_t bo_flags; + /* Wrapper BO for use in relocation lists. This BO is simply a wrapper + * around the actual BO so that we grow the pool after the wrapper BO has + * been put in a relocation list. This is only used in the non-softpin + * case. + */ + struct anv_bo wrapper_bo; - struct anv_bo bos[ANV_MAX_BLOCK_POOL_BOS]; + struct anv_bo *bos[ANV_MAX_BLOCK_POOL_BOS]; struct anv_bo *bo; uint32_t nbos; @@ -786,20 +900,19 @@ VkResult anv_block_pool_init(struct anv_block_pool *pool, struct anv_device *device, uint64_t start_address, - uint32_t initial_size, - uint64_t bo_flags); + uint32_t initial_size); void anv_block_pool_finish(struct anv_block_pool *pool); int32_t anv_block_pool_alloc(struct anv_block_pool *pool, uint32_t block_size, uint32_t *padding); int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool, uint32_t block_size); -void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset); +void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t +size); VkResult anv_state_pool_init(struct anv_state_pool *pool, struct anv_device *device, uint64_t start_address, - uint32_t block_size, - uint64_t bo_flags); + uint32_t block_size); void anv_state_pool_finish(struct anv_state_pool *pool); struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t state_size, uint32_t alignment); @@ -837,26 +950,18 @@ struct anv_bo_pool { struct anv_device *device; - uint64_t bo_flags; - - void *free_list[16]; + struct util_sparse_array_free_list free_list[16]; }; -void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, - uint64_t bo_flags); +void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device); void anv_bo_pool_finish(struct anv_bo_pool *pool); -VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo, - uint32_t size); -void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo); - -struct anv_scratch_bo { - bool exists; - struct anv_bo bo; -}; +VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out); +void anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo); struct anv_scratch_pool { /* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */ - struct anv_scratch_bo bos[16][MESA_SHADER_STAGES]; + struct anv_bo *bos[16][MESA_SHADER_STAGES]; }; void anv_scratch_pool_init(struct anv_device *device, @@ -870,38 +975,17 @@ /** Implements a BO cache that ensures a 1-1 mapping of GEM BOs to anv_bos */ struct anv_bo_cache { - struct hash_table *bo_map; + struct util_sparse_array bo_map; pthread_mutex_t mutex; }; VkResult anv_bo_cache_init(struct anv_bo_cache *cache); void anv_bo_cache_finish(struct anv_bo_cache *cache); -VkResult anv_bo_cache_alloc(struct anv_device *device, - struct anv_bo_cache *cache, - uint64_t size, uint64_t bo_flags, - struct anv_bo **bo); -VkResult anv_bo_cache_import_host_ptr(struct anv_device *device, - struct anv_bo_cache *cache, - void *host_ptr, uint32_t size, - uint64_t bo_flags, struct anv_bo **bo_out); -VkResult anv_bo_cache_import(struct anv_device *device, - struct anv_bo_cache *cache, - int fd, uint64_t bo_flags, - struct anv_bo **bo); -VkResult anv_bo_cache_export(struct anv_device *device, - struct anv_bo_cache *cache, - struct anv_bo *bo_in, int *fd_out); -void anv_bo_cache_release(struct anv_device *device, - struct anv_bo_cache *cache, - struct anv_bo *bo); struct anv_memory_type { /* Standard bits passed on to the client */ VkMemoryPropertyFlags propertyFlags; uint32_t heapIndex; - - /* Driver-internal book-keeping */ - VkBufferUsageFlags valid_buffer_usage; }; struct anv_memory_heap { @@ -910,17 +994,16 @@ VkMemoryHeapFlags flags; /* Driver-internal book-keeping */ - uint64_t vma_start; - uint64_t vma_size; - bool supports_48bit_addresses; VkDeviceSize used; }; struct anv_physical_device { VK_LOADER_DATA _loader_data; + /* Link in anv_instance::physical_devices */ + struct list_head link; + struct anv_instance * instance; - uint32_t chipset_id; bool no_hw; char path[20]; const char * name; @@ -942,16 +1025,20 @@ bool supports_48bit_addresses; struct brw_compiler * compiler; struct isl_device isl_dev; + struct gen_perf_config * perf; int cmd_parser_version; + bool has_softpin; bool has_exec_async; bool has_exec_capture; bool has_exec_fence; bool has_syncobj; bool has_syncobj_wait; bool has_context_priority; - bool use_softpin; bool has_context_isolation; bool has_mem_available; + uint64_t gtt_size; + + bool use_softpin; bool always_use_bindless; /** True if we can access buffers using A64 messages */ @@ -961,6 +1048,15 @@ /** True if we can use bindless access for samplers */ bool has_bindless_samplers; + /** True if this device has implicit AUX + * + * If true, CCS is handled as an implicit attachment to the BO rather than + * as an explicitly bound surface. + */ + bool has_implicit_ccs; + + bool always_flush_cache; + struct anv_device_extension_table supported_extensions; uint32_t eu_total; @@ -1002,10 +1098,11 @@ struct anv_instance_extension_table enabled_extensions; struct anv_instance_dispatch_table dispatch; + struct anv_physical_device_dispatch_table physical_device_dispatch; struct anv_device_dispatch_table device_dispatch; - int physicalDeviceCount; - struct anv_physical_device physicalDevice; + bool physical_devices_enumerated; + struct list_head physical_devices; bool pipeline_cache_enabled; @@ -1022,11 +1119,63 @@ bool anv_physical_device_extension_supported(struct anv_physical_device *dev, const char *name); +struct anv_queue_submit { + struct anv_cmd_buffer * cmd_buffer; + + uint32_t fence_count; + uint32_t fence_array_length; + struct drm_i915_gem_exec_fence * fences; + + uint32_t temporary_semaphore_count; + uint32_t temporary_semaphore_array_length; + struct anv_semaphore_impl * temporary_semaphores; + + /* Semaphores to be signaled with a SYNC_FD. */ + struct anv_semaphore ** sync_fd_semaphores; + uint32_t sync_fd_semaphore_count; + uint32_t sync_fd_semaphore_array_length; + + /* Allocated only with non shareable timelines. */ + struct anv_timeline ** wait_timelines; + uint32_t wait_timeline_count; + uint32_t wait_timeline_array_length; + uint64_t * wait_timeline_values; + + struct anv_timeline ** signal_timelines; + uint32_t signal_timeline_count; + uint32_t signal_timeline_array_length; + uint64_t * signal_timeline_values; + + int in_fence; + bool need_out_fence; + int out_fence; + + uint32_t fence_bo_count; + uint32_t fence_bo_array_length; + /* An array of struct anv_bo pointers with lower bit used as a flag to + * signal we will wait on that BO (see anv_(un)pack_ptr). + */ + uintptr_t * fence_bos; + + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; + + struct anv_bo * simple_bo; + uint32_t simple_bo_size; + + struct list_head link; +}; + struct anv_queue { VK_LOADER_DATA _loader_data; struct anv_device * device; + /* + * A list of struct anv_queue_submit to be submitted to i915. + */ + struct list_head queued_submits; + VkDeviceQueueCreateFlags flags; }; @@ -1104,8 +1253,7 @@ VkAllocationCallbacks alloc; - struct anv_instance * instance; - uint32_t chipset_id; + struct anv_physical_device * physical; bool no_hw; struct gen_device_info info; struct isl_device isl_dev; @@ -1118,9 +1266,8 @@ pthread_mutex_t vma_mutex; struct util_vma_heap vma_lo; + struct util_vma_heap vma_cva; struct util_vma_heap vma_hi; - uint64_t vma_lo_available; - uint64_t vma_hi_available; /** List of all anv_device_memory objects */ struct list_head memory_objects; @@ -1134,9 +1281,9 @@ struct anv_state_pool binding_table_pool; struct anv_state_pool surface_state_pool; - struct anv_bo workaround_bo; - struct anv_bo trivial_batch_bo; - struct anv_bo hiz_clear_bo; + struct anv_bo * workaround_bo; + struct anv_bo * trivial_batch_bo; + struct anv_bo * hiz_clear_bo; struct anv_pipeline_cache default_pipeline_cache; struct blorp_context blorp; @@ -1149,12 +1296,9 @@ struct anv_scratch_pool scratch_pool; - uint32_t default_mocs; - uint32_t external_mocs; - pthread_mutex_t mutex; pthread_cond_t queue_submit; - bool _lost; + int _lost; struct gen_batch_decode_ctx decoder_ctx; /* @@ -1162,12 +1306,23 @@ * the cmd_buffer's list. */ struct anv_cmd_buffer *cmd_buffer_being_decoded; + + int perf_fd; /* -1 if no opened */ + uint64_t perf_metric; /* 0 if unset */ + + struct gen_aux_map_context *aux_map_ctx; }; +static inline struct anv_instance * +anv_device_instance_or_null(const struct anv_device *device) +{ + return device ? device->physical->instance : NULL; +} + static inline struct anv_state_pool * anv_binding_table_pool(struct anv_device *device) { - if (device->instance->physicalDevice.use_softpin) + if (device->physical->use_softpin) return &device->binding_table_pool; else return &device->surface_state_pool; @@ -1175,7 +1330,7 @@ static inline struct anv_state anv_binding_table_pool_alloc(struct anv_device *device) { - if (device->instance->physicalDevice.use_softpin) + if (device->physical->use_softpin) return anv_state_pool_alloc(&device->binding_table_pool, device->binding_table_pool.block_size, 0); else @@ -1190,35 +1345,120 @@ static inline uint32_t anv_mocs_for_bo(const struct anv_device *device, const struct anv_bo *bo) { - if (bo->flags & ANV_BO_EXTERNAL) - return device->external_mocs; + if (bo->is_external) + return device->isl_dev.mocs.external; else - return device->default_mocs; + return device->isl_dev.mocs.internal; } void anv_device_init_blorp(struct anv_device *device); void anv_device_finish_blorp(struct anv_device *device); +void _anv_device_set_all_queue_lost(struct anv_device *device); VkResult _anv_device_set_lost(struct anv_device *device, const char *file, int line, - const char *msg, ...); + const char *msg, ...) + anv_printflike(4, 5); +VkResult _anv_queue_set_lost(struct anv_queue *queue, + const char *file, int line, + const char *msg, ...) + anv_printflike(4, 5); #define anv_device_set_lost(dev, ...) \ _anv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__) +#define anv_queue_set_lost(queue, ...) \ + _anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) static inline bool anv_device_is_lost(struct anv_device *device) { - return unlikely(device->_lost); + return unlikely(p_atomic_read(&device->_lost)); } -VkResult anv_device_execbuf(struct anv_device *device, - struct drm_i915_gem_execbuffer2 *execbuf, - struct anv_bo **execbuf_bos); VkResult anv_device_query_status(struct anv_device *device); + + +enum anv_bo_alloc_flags { + /** Specifies that the BO must have a 32-bit address + * + * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS. + */ + ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0), + + /** Specifies that the BO may be shared externally */ + ANV_BO_ALLOC_EXTERNAL = (1 << 1), + + /** Specifies that the BO should be mapped */ + ANV_BO_ALLOC_MAPPED = (1 << 2), + + /** Specifies that the BO should be snooped so we get coherency */ + ANV_BO_ALLOC_SNOOPED = (1 << 3), + + /** Specifies that the BO should be captured in error states */ + ANV_BO_ALLOC_CAPTURE = (1 << 4), + + /** Specifies that the BO will have an address assigned by the caller + * + * Such BOs do not exist in any VMA heap. + */ + ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5), + + /** Enables implicit synchronization on the BO + * + * This is the opposite of EXEC_OBJECT_ASYNC. + */ + ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6), + + /** Enables implicit synchronization on the BO + * + * This is equivalent to EXEC_OBJECT_WRITE. + */ + ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7), + + /** Has an address which is visible to the client */ + ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8), + + /** This buffer has implicit CCS data attached to it */ + ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9), +}; + +VkResult anv_device_alloc_bo(struct anv_device *device, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo); +VkResult anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out); +VkResult anv_device_import_bo(struct anv_device *device, int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo); +VkResult anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out); +void anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo); + +static inline struct anv_bo * +anv_device_lookup_bo(struct anv_device *device, uint32_t gem_handle) +{ + return util_sparse_array_get(&device->bo_cache.bo_map, gem_handle); +} + VkResult anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo); VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo, int64_t timeout); +VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue); +void anv_queue_finish(struct anv_queue *queue); + +VkResult anv_queue_execbuf_locked(struct anv_queue *queue, struct anv_queue_submit *submit); +VkResult anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch); + +uint64_t anv_gettime_ns(void); +uint64_t anv_get_absolute_timeout(uint64_t timeout); + void* anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags); void anv_gem_munmap(void *p, uint64_t size); @@ -1266,17 +1506,20 @@ uint32_t *handles, uint32_t num_handles, int64_t abs_timeout_ns, bool wait_all); -bool anv_vma_alloc(struct anv_device *device, struct anv_bo *bo); -void anv_vma_free(struct anv_device *device, struct anv_bo *bo); - -VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size); +uint64_t anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address); +void anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size); struct anv_reloc_list { uint32_t num_relocs; uint32_t array_length; struct drm_i915_gem_relocation_entry * relocs; struct anv_bo ** reloc_bos; - struct set * deps; + uint32_t dep_words; + BITSET_WORD * deps; }; VkResult anv_reloc_list_init(struct anv_reloc_list *list, @@ -1287,13 +1530,13 @@ VkResult anv_reloc_list_add(struct anv_reloc_list *list, const VkAllocationCallbacks *alloc, uint32_t offset, struct anv_bo *target_bo, - uint32_t delta); + uint32_t delta, uint64_t *address_u64_out); struct anv_batch_bo { /* Link in the anv_cmd_buffer.owned_batch_bos list */ struct list_head link; - struct anv_bo bo; + struct anv_bo * bo; /* Bytes actually consumed in this batch BO */ uint32_t length; @@ -1330,8 +1573,6 @@ void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other); uint64_t anv_batch_emit_reloc(struct anv_batch *batch, void *location, struct anv_bo *bo, uint32_t offset); -VkResult anv_device_submit_simple_batch(struct anv_device *device, - struct anv_batch *batch); static inline VkResult anv_batch_set_error(struct anv_batch *batch, VkResult error) @@ -1465,51 +1706,6 @@ _dst = NULL; \ })) -/* MEMORY_OBJECT_CONTROL_STATE: - * .GraphicsDataTypeGFDT = 0, - * .LLCCacheabilityControlLLCCC = 0, - * .L3CacheabilityControlL3CC = 1, - */ -#define GEN7_MOCS 1 - -/* MEMORY_OBJECT_CONTROL_STATE: - * .LLCeLLCCacheabilityControlLLCCC = 0, - * .L3CacheabilityControlL3CC = 1, - */ -#define GEN75_MOCS 1 - -/* MEMORY_OBJECT_CONTROL_STATE: - * .MemoryTypeLLCeLLCCacheabilityControl = WB, - * .TargetCache = L3DefertoPATforLLCeLLCselection, - * .AgeforQUADLRU = 0 - */ -#define GEN8_MOCS 0x78 - -/* MEMORY_OBJECT_CONTROL_STATE: - * .MemoryTypeLLCeLLCCacheabilityControl = UCwithFenceifcoherentcycle, - * .TargetCache = L3DefertoPATforLLCeLLCselection, - * .AgeforQUADLRU = 0 - */ -#define GEN8_EXTERNAL_MOCS 0x18 - -/* Skylake: MOCS is now an index into an array of 62 different caching - * configurations programmed by the kernel. - */ - -/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_MOCS (2 << 1) - -/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_EXTERNAL_MOCS (1 << 1) - -/* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */ -#define GEN10_MOCS GEN9_MOCS -#define GEN10_EXTERNAL_MOCS GEN9_EXTERNAL_MOCS - -/* Ice Lake MOCS defines are duplicates of Skylake MOCS defines. */ -#define GEN11_MOCS GEN9_MOCS -#define GEN11_EXTERNAL_MOCS GEN9_EXTERNAL_MOCS - struct anv_device_memory { struct list_head link; @@ -1682,6 +1878,9 @@ /* Number of dynamic offsets used by this descriptor set */ uint16_t dynamic_offset_count; + /* For each shader stage, which offsets apply to that stage */ + uint16_t stage_dynamic_offsets[MESA_SHADER_STAGES]; + /* Size of the descriptor buffer for this descriptor set */ uint32_t descriptor_buffer_size; @@ -1777,7 +1976,7 @@ uint32_t next; uint32_t free_list; - struct anv_bo bo; + struct anv_bo *bo; struct util_vma_heap bo_heap; struct anv_state_stream surface_state_stream; @@ -1885,32 +2084,63 @@ struct anv_descriptor_pool *pool, struct anv_descriptor_set *set); +#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5) +#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4) #define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3) #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2) #define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1) #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX struct anv_pipeline_binding { - /* The descriptor set this surface corresponds to. The special value of - * ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS indicates that the offset refers - * to a color attachment and not a regular descriptor. + /** Index in the descriptor set + * + * This is a flattened index; the descriptor set layout is already taken + * into account. + */ + uint32_t index; + + /** The descriptor set this surface corresponds to. + * + * The special ANV_DESCRIPTOR_SET_* values above indicates that this + * binding is not a normal descriptor set but something else. */ uint8_t set; - /* Binding in the descriptor set */ - uint32_t binding; + union { + /** Plane in the binding index for images */ + uint8_t plane; + + /** Input attachment index (relative to the subpass) */ + uint8_t input_attachment_index; - /* Index in the binding */ + /** Dynamic offset index (for dynamic UBOs and SSBOs) */ + uint8_t dynamic_offset_index; + }; + + /** For a storage image, whether it is write-only */ + uint8_t write_only; + + /** Pad to 64 bits so that there are no holes and we can safely memcmp + * assuming POD zero-initialization. + */ + uint8_t pad; +}; + +struct anv_push_range { + /** Index in the descriptor set */ uint32_t index; - /* Plane in the binding index */ - uint8_t plane; + /** Descriptor set index */ + uint8_t set; + + /** Dynamic offset index (for dynamic UBOs) */ + uint8_t dynamic_offset_index; - /* Input attachment index (relative to the subpass) */ - uint8_t input_attachment_index; + /** Start offset in units of 32B */ + uint8_t start; - /* For a storage image, whether it is write-only */ - bool write_only; + /** Range in units of 32B */ + uint8_t length; }; struct anv_pipeline_layout { @@ -2015,18 +2245,20 @@ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT = (1 << 3), ANV_PIPE_VF_CACHE_INVALIDATE_BIT = (1 << 4), ANV_PIPE_DATA_CACHE_FLUSH_BIT = (1 << 5), + ANV_PIPE_TILE_CACHE_FLUSH_BIT = (1 << 6), ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT = (1 << 10), ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11), ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12), ANV_PIPE_DEPTH_STALL_BIT = (1 << 13), ANV_PIPE_CS_STALL_BIT = (1 << 20), + ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21), /* This bit does not exist directly in PIPE_CONTROL. Instead it means that * a flush has happened but not a CS stall. The next time we do any sort * of invalidation we need to insert a CS stall at that time. Otherwise, * we would have to CS stall on every flush which could be bad. */ - ANV_PIPE_NEEDS_CS_STALL_BIT = (1 << 21), + ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22), /* This bit does not exist directly in PIPE_CONTROL. It means that render * target operations related to transfer commands with VkBuffer as @@ -2034,13 +2266,26 @@ * streamer might need to be aware of this to trigger the appropriate stall * before they can proceed with the copy. */ - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 22), + ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23), + + /* This bit does not exist directly in PIPE_CONTROL. It means that Gen12 + * AUX-TT data has changed and we need to invalidate AUX-TT data. This is + * done by writing the AUX-TT register. + */ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24), + + /* This bit does not exist directly in PIPE_CONTROL. It means that a + * PIPE_CONTROL with a post-sync operation will follow. This is used to + * implement a workaround for Gen9. + */ + ANV_PIPE_POST_SYNC_BIT = (1 << 25), }; #define ANV_PIPE_FLUSH_BITS ( \ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \ ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \ + ANV_PIPE_TILE_CACHE_FLUSH_BIT) #define ANV_PIPE_STALL_BITS ( \ ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \ @@ -2053,7 +2298,8 @@ ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \ ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \ - ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT) + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) static inline enum anv_pipe_bits anv_pipe_flush_bits_for_access_flags(VkAccessFlags flags) @@ -2214,20 +2460,30 @@ VkDeviceSize size; }; -#define ANV_PARAM_PUSH(offset) ((1 << 16) | (uint32_t)(offset)) -#define ANV_PARAM_IS_PUSH(param) ((uint32_t)(param) >> 16 == 1) -#define ANV_PARAM_PUSH_OFFSET(param) ((param) & 0xffff) - -#define ANV_PARAM_DYN_OFFSET(offset) ((2 << 16) | (uint32_t)(offset)) -#define ANV_PARAM_IS_DYN_OFFSET(param) ((uint32_t)(param) >> 16 == 2) -#define ANV_PARAM_DYN_OFFSET_IDX(param) ((param) & 0xffff) - struct anv_push_constants { - /* Push constant data provided by the client through vkPushConstants */ + /** Push constant data provided by the client through vkPushConstants */ uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE]; - /* Used for vkCmdDispatchBase */ - uint32_t base_work_group_id[3]; + /** Dynamic offsets for dynamic UBOs and SSBOs */ + uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; + + struct { + /** Base workgroup ID + * + * Used for vkCmdDispatchBase. + */ + uint32_t base_work_group_id[3]; + + /** Subgroup ID + * + * This is never set by software but is implicitly filled out when + * uploading the push constants for compute shaders. + */ + uint32_t subgroup_id; + + /** Pad out to a multiple of 32 bytes */ + uint32_t pad[4]; + } cs; }; struct anv_dynamic_state { @@ -2317,6 +2573,7 @@ struct anv_surface_state input; VkImageLayout current_layout; + VkImageLayout current_stencil_layout; VkImageAspectFlags pending_clear_aspects; VkImageAspectFlags pending_load_aspects; bool fast_clear; @@ -2334,6 +2591,27 @@ struct anv_image_view * image_view; }; +/** State tracking for vertex buffer flushes + * + * On Gen8-9, the VF cache only considers the bottom 32 bits of memory + * addresses. If you happen to have two vertex buffers which get placed + * exactly 4 GiB apart and use them in back-to-back draw calls, you can get + * collisions. In order to solve this problem, we track vertex address ranges + * which are live in the cache and invalidate the cache if one ever exceeds 32 + * bits. + */ +struct anv_vb_cache_range { + /* Virtual address at which the live vertex buffer cache range starts for + * this vertex buffer index. + */ + uint64_t start; + + /* Virtual address of the byte after where vertex buffer cache range ends. + * This is exclusive such that end - start is the size of the range. + */ + uint64_t end; +}; + /** State tracking for particular pipeline bind point * * This struct is the base struct for anv_cmd_graphics_state and @@ -2344,11 +2622,8 @@ */ struct anv_cmd_pipeline_state { struct anv_pipeline *pipeline; - struct anv_pipeline_layout *layout; struct anv_descriptor_set *descriptors[MAX_SETS]; - uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; - struct anv_push_descriptor_set *push_descriptors[MAX_SETS]; }; @@ -2365,6 +2640,11 @@ anv_cmd_dirty_mask_t dirty; uint32_t vb_dirty; + struct anv_vb_cache_range ib_bound_range; + struct anv_vb_cache_range ib_dirty_range; + struct anv_vb_cache_range vb_bound_ranges[33]; + struct anv_vb_cache_range vb_dirty_ranges[33]; + struct anv_dynamic_state dynamic; struct { @@ -2394,6 +2674,7 @@ /* PIPELINE_SELECT.PipelineSelection */ uint32_t current_pipeline; const struct gen_l3_config * current_l3_config; + uint32_t last_aux_map_state; struct anv_cmd_graphics_state gfx; struct anv_cmd_compute_state compute; @@ -2415,6 +2696,10 @@ struct anv_state binding_tables[MESA_SHADER_STAGES]; struct anv_state samplers[MESA_SHADER_STAGES]; + unsigned char sampler_sha1s[MESA_SHADER_STAGES][20]; + unsigned char surface_sha1s[MESA_SHADER_STAGES][20]; + unsigned char push_sha1s[MESA_SHADER_STAGES][20]; + /** * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top * of any command buffer it is disabled by disabling it in EndCommandBuffer @@ -2501,7 +2786,7 @@ * initialized by anv_cmd_buffer_init_batch_bo_chain() */ struct u_vector bt_block_states; - uint32_t bt_next; + struct anv_state bt_next; struct anv_reloc_list surface_relocs; /** Last seen surface state block pool center bo offset */ @@ -2518,6 +2803,9 @@ VkCommandBufferLevel level; struct anv_cmd_state state; + + /* Set by SetPerformanceMarkerINTEL, written into queries by CmdBeginQuery */ + uint64_t intel_perf_marker; }; VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); @@ -2527,11 +2815,13 @@ void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_cmd_buffer *secondary); void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer); -VkResult anv_cmd_buffer_execbuf(struct anv_device *device, +VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue, struct anv_cmd_buffer *cmd_buffer, const VkSemaphore *in_semaphores, + const uint64_t *in_wait_values, uint32_t num_in_semaphores, const VkSemaphore *out_semaphores, + const uint64_t *out_signal_values, uint32_t num_out_semaphores, VkFence fence); @@ -2591,6 +2881,7 @@ enum anv_fence_type { ANV_FENCE_TYPE_NONE = 0, ANV_FENCE_TYPE_BO, + ANV_FENCE_TYPE_WSI_BO, ANV_FENCE_TYPE_SYNCOBJ, ANV_FENCE_TYPE_WSI, }; @@ -2621,7 +2912,7 @@ * will say it's idle in this case. */ struct { - struct anv_bo bo; + struct anv_bo *bo; enum anv_bo_fence_state state; } bo; @@ -2650,6 +2941,9 @@ struct anv_fence_impl temporary; }; +void anv_fence_reset_temporary(struct anv_device *device, + struct anv_fence *fence); + struct anv_event { uint64_t semaphore; struct anv_state state; @@ -2659,18 +2953,46 @@ ANV_SEMAPHORE_TYPE_NONE = 0, ANV_SEMAPHORE_TYPE_DUMMY, ANV_SEMAPHORE_TYPE_BO, + ANV_SEMAPHORE_TYPE_WSI_BO, ANV_SEMAPHORE_TYPE_SYNC_FILE, ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ, + ANV_SEMAPHORE_TYPE_TIMELINE, +}; + +struct anv_timeline_point { + struct list_head link; + + uint64_t serial; + + /* Number of waiter on this point, when > 0 the point should not be garbage + * collected. + */ + int waiting; + + /* BO used for synchronization. */ + struct anv_bo *bo; +}; + +struct anv_timeline { + pthread_mutex_t mutex; + pthread_cond_t cond; + + uint64_t highest_past; + uint64_t highest_pending; + + struct list_head points; + struct list_head free_points; }; struct anv_semaphore_impl { enum anv_semaphore_type type; union { - /* A BO representing this semaphore when type == ANV_SEMAPHORE_TYPE_BO. - * This BO will be added to the object list on any execbuf2 calls for - * which this semaphore is used as a wait or signal fence. When used as - * a signal fence, the EXEC_OBJECT_WRITE flag will be set. + /* A BO representing this semaphore when type == ANV_SEMAPHORE_TYPE_BO + * or type == ANV_SEMAPHORE_TYPE_WSI_BO. This BO will be added to the + * object list on any execbuf2 calls for which this semaphore is used as + * a wait or signal fence. When used as a signal fence or when type == + * ANV_SEMAPHORE_TYPE_WSI_BO, the EXEC_OBJECT_WRITE flag will be set. */ struct anv_bo *bo; @@ -2685,10 +3007,18 @@ * import so we don't need to bother with a userspace cache. */ uint32_t syncobj; + + /* Non shareable timeline semaphore + * + * Used when kernel don't have support for timeline semaphores. + */ + struct anv_timeline timeline; }; }; struct anv_semaphore { + uint32_t refcount; + /* Permanent semaphore state. Every semaphore has some form of permanent * state (type != ANV_SEMAPHORE_TYPE_NONE). This may be a BO to fence on * (for cross-process semaphores0 or it could just be a dummy for use @@ -2737,11 +3067,17 @@ __tmp &= ~(1 << (stage))) struct anv_pipeline_bind_map { + unsigned char surface_sha1[20]; + unsigned char sampler_sha1[20]; + unsigned char push_sha1[20]; + uint32_t surface_count; uint32_t sampler_count; struct anv_pipeline_binding * surface_to_descriptor; struct anv_pipeline_binding * sampler_to_descriptor; + + struct anv_push_range push_ranges[4]; }; struct anv_shader_bin_key { @@ -2808,6 +3144,7 @@ struct brw_compile_stats stats; + char *nir; char *disasm; }; @@ -2824,8 +3161,6 @@ VkPipelineCreateFlags flags; struct anv_subpass * subpass; - bool needs_data_cache; - struct anv_shader_bin * shaders[MESA_SHADER_STAGES]; uint32_t num_executables; @@ -2861,6 +3196,7 @@ bool depth_clip_enable; bool sample_shading_enable; bool kill_pixel; + bool depth_bounds_test_enable; struct { uint32_t sf[7]; @@ -3019,6 +3355,12 @@ return anv_get_format_plane(devinfo, vk_format, aspect, tiling).isl_format; } +bool anv_formats_ccs_e_compatible(const struct gen_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, + VkImageTiling vk_tiling, + const VkImageFormatListCreateInfoKHR *fmt_list); + static inline struct isl_swizzle anv_swizzle_for_render(struct isl_swizzle swizzle) { @@ -3156,11 +3498,9 @@ struct anv_surface shadow_surface; /** - * For color images, this is the aux usage for this image when not used - * as a color attachment. - * - * For depth/stencil images, this is set to ISL_AUX_USAGE_HIZ if the - * image has a HiZ buffer. + * The base aux usage for this image. For color images, this can be + * either CCS_E or CCS_D depending on whether or not we can reliably + * leave CCS on all the time. */ enum isl_aux_usage aux_usage; @@ -3200,8 +3540,13 @@ VkImageAspectFlagBits aspect) { uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - return image->planes[plane].aux_surface.isl.size_B > 0 ? - image->planes[plane].aux_surface.isl.levels : 0; + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return 0; + + /* The Gen12 CCS aux surface is represented with only one level. */ + return image->planes[plane].aux_surface.isl.tiling == ISL_TILING_GEN12_CCS ? + image->planes[plane].surface.isl.levels : + image->planes[plane].aux_surface.isl.levels; } /* Returns the number of auxiliary buffer layers attached to an image. */ @@ -3222,8 +3567,15 @@ return 0; } else { uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); - return MAX2(image->planes[plane].aux_surface.isl.logical_level0_px.array_len, - image->planes[plane].aux_surface.isl.logical_level0_px.depth >> miplevel); + + /* The Gen12 CCS aux surface is represented with only one layer. */ + const struct isl_extent4d *aux_logical_level0_px = + image->planes[plane].aux_surface.isl.tiling == ISL_TILING_GEN12_CCS ? + &image->planes[plane].surface.isl.logical_level0_px : + &image->planes[plane].aux_surface.isl.logical_level0_px; + + return MAX2(aux_logical_level0_px->array_len, + aux_logical_level0_px->depth >> miplevel); } } @@ -3301,6 +3653,15 @@ return image->samples == 1; } +static inline bool +anv_image_plane_uses_aux_map(const struct anv_device *device, + const struct anv_image *image, + uint32_t plane) +{ + return device->info.has_aux_map && + isl_aux_usage_has_ccs(image->planes[plane].aux_usage); +} + void anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, @@ -3357,7 +3718,7 @@ void anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, - enum isl_format format, + enum isl_format format, struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op mcs_op, union isl_color_value *clear_value, @@ -3365,7 +3726,7 @@ void anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, - enum isl_format format, + enum isl_format format, struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t level, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op ccs_op, union isl_color_value *clear_value, @@ -3378,10 +3739,17 @@ uint32_t base_level, uint32_t level_count, uint32_t base_layer, uint32_t layer_count); +enum isl_aux_state +anv_layout_to_aux_state(const struct gen_device_info * const devinfo, + const struct anv_image *image, + const VkImageAspectFlagBits aspect, + const VkImageLayout layout); + enum isl_aux_usage anv_layout_to_aux_usage(const struct gen_device_info * const devinfo, const struct anv_image *image, const VkImageAspectFlagBits aspect, + const VkImageUsageFlagBits usage, const VkImageLayout layout); enum anv_fast_clear_type @@ -3514,9 +3882,9 @@ enum isl_format anv_isl_format_for_descriptor_type(VkDescriptorType type); -static inline struct VkExtent3D +static inline VkExtent3D anv_sanitize_image_extent(const VkImageType imageType, - const struct VkExtent3D imageExtent) + const VkExtent3D imageExtent) { switch (imageType) { case VK_IMAGE_TYPE_1D: @@ -3530,9 +3898,9 @@ } } -static inline struct VkOffset3D +static inline VkOffset3D anv_sanitize_image_offset(const VkImageType imageType, - const struct VkOffset3D imageOffset) + const VkOffset3D imageOffset) { switch (imageType) { case VK_IMAGE_TYPE_1D: @@ -3613,6 +3981,9 @@ VkImageUsageFlagBits usage; uint32_t attachment; VkImageLayout layout; + + /* Used only with attachment containing stencil data. */ + VkImageLayout stencil_layout; }; struct anv_subpass { @@ -3663,6 +4034,9 @@ VkImageLayout final_layout; VkImageLayout first_subpass_layout; + VkImageLayout stencil_initial_layout; + VkImageLayout stencil_final_layout; + /* The subpass id in which the attachment will be used last. */ uint32_t last_subpass_idx; }; @@ -3685,16 +4059,23 @@ uint32_t stride; /** Number of slots in this query pool */ uint32_t slots; - struct anv_bo bo; + struct anv_bo * bo; }; int anv_get_instance_entrypoint_index(const char *name); int anv_get_device_entrypoint_index(const char *name); +int anv_get_physical_device_entrypoint_index(const char *name); + +const char *anv_get_instance_entry_name(int index); +const char *anv_get_physical_device_entry_name(int index); +const char *anv_get_device_entry_name(int index); bool anv_instance_entrypoint_is_enabled(int index, uint32_t core_version, const struct anv_instance_extension_table *instance); - +bool +anv_physical_device_entrypoint_is_enabled(int index, uint32_t core_version, + const struct anv_instance_extension_table *instance); bool anv_device_entrypoint_is_enabled(int index, uint32_t core_version, const struct anv_instance_extension_table *instance, @@ -3732,6 +4113,9 @@ return subpass_id; } +struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd); +void anv_device_perf_init(struct anv_device *device); + #define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType) \ \ static inline struct __anv_type * \ @@ -3815,6 +4199,9 @@ # define genX(x) gen11_##x # include "anv_genX.h" # undef genX +# define genX(x) gen12_##x +# include "anv_genX.h" +# undef genX #endif #endif /* ANV_PRIVATE_H */ diff -Nru mesa-19.2.8/src/intel/vulkan/anv_queue.c mesa-20.0.8/src/intel/vulkan/anv_queue.c --- mesa-19.2.8/src/intel/vulkan/anv_queue.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_queue.c 2020-06-12 01:21:17.000000000 +0000 @@ -25,6 +25,7 @@ * This file implements VkQueue, VkFence, and VkSemaphore */ +#include #include #include @@ -33,83 +34,907 @@ #include "genxml/gen7_pack.h" +uint64_t anv_gettime_ns(void) +{ + struct timespec current; + clock_gettime(CLOCK_MONOTONIC, ¤t); + return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; +} + +uint64_t anv_get_absolute_timeout(uint64_t timeout) +{ + if (timeout == 0) + return 0; + uint64_t current_time = anv_gettime_ns(); + uint64_t max_timeout = (uint64_t) INT64_MAX - current_time; + + timeout = MIN2(max_timeout, timeout); + + return (current_time + timeout); +} + +static int64_t anv_get_relative_timeout(uint64_t abs_timeout) +{ + uint64_t now = anv_gettime_ns(); + + /* We don't want negative timeouts. + * + * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is + * supposed to block indefinitely timeouts < 0. Unfortunately, + * this was broken for a couple of kernel releases. Since there's + * no way to know whether or not the kernel we're using is one of + * the broken ones, the best we can do is to clamp the timeout to + * INT64_MAX. This limits the maximum timeout from 584 years to + * 292 years - likely not a big deal. + */ + if (abs_timeout < now) + return 0; + + uint64_t rel_timeout = abs_timeout - now; + if (rel_timeout > (uint64_t) INT64_MAX) + rel_timeout = INT64_MAX; + + return rel_timeout; +} + +static struct anv_semaphore *anv_semaphore_ref(struct anv_semaphore *semaphore); +static void anv_semaphore_unref(struct anv_device *device, struct anv_semaphore *semaphore); +static void anv_semaphore_impl_cleanup(struct anv_device *device, + struct anv_semaphore_impl *impl); + +static void +anv_queue_submit_free(struct anv_device *device, + struct anv_queue_submit *submit) +{ + const VkAllocationCallbacks *alloc = submit->alloc; + + for (uint32_t i = 0; i < submit->temporary_semaphore_count; i++) + anv_semaphore_impl_cleanup(device, &submit->temporary_semaphores[i]); + for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) + anv_semaphore_unref(device, submit->sync_fd_semaphores[i]); + /* Execbuf does not consume the in_fence. It's our job to close it. */ + if (submit->in_fence != -1) + close(submit->in_fence); + if (submit->out_fence != -1) + close(submit->out_fence); + vk_free(alloc, submit->fences); + vk_free(alloc, submit->temporary_semaphores); + vk_free(alloc, submit->wait_timelines); + vk_free(alloc, submit->wait_timeline_values); + vk_free(alloc, submit->signal_timelines); + vk_free(alloc, submit->signal_timeline_values); + vk_free(alloc, submit->fence_bos); + vk_free(alloc, submit); +} + +static bool +anv_queue_submit_ready_locked(struct anv_queue_submit *submit) +{ + for (uint32_t i = 0; i < submit->wait_timeline_count; i++) { + if (submit->wait_timeline_values[i] > submit->wait_timelines[i]->highest_pending) + return false; + } + + return true; +} + +static VkResult +anv_timeline_init(struct anv_device *device, + struct anv_timeline *timeline, + uint64_t initial_value) +{ + timeline->highest_past = + timeline->highest_pending = initial_value; + list_inithead(&timeline->points); + list_inithead(&timeline->free_points); + + return VK_SUCCESS; +} + +static void +anv_timeline_finish(struct anv_device *device, + struct anv_timeline *timeline) +{ + list_for_each_entry_safe(struct anv_timeline_point, point, + &timeline->free_points, link) { + list_del(&point->link); + anv_device_release_bo(device, point->bo); + vk_free(&device->alloc, point); + } + list_for_each_entry_safe(struct anv_timeline_point, point, + &timeline->points, link) { + list_del(&point->link); + anv_device_release_bo(device, point->bo); + vk_free(&device->alloc, point); + } +} + +static VkResult +anv_timeline_add_point_locked(struct anv_device *device, + struct anv_timeline *timeline, + uint64_t value, + struct anv_timeline_point **point) +{ + VkResult result = VK_SUCCESS; + + if (list_is_empty(&timeline->free_points)) { + *point = + vk_zalloc(&device->alloc, sizeof(**point), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!(*point)) + result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + if (result == VK_SUCCESS) { + result = anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &(*point)->bo); + if (result != VK_SUCCESS) + vk_free(&device->alloc, *point); + } + } else { + *point = list_first_entry(&timeline->free_points, + struct anv_timeline_point, link); + list_del(&(*point)->link); + } + + if (result == VK_SUCCESS) { + (*point)->serial = value; + list_addtail(&(*point)->link, &timeline->points); + } + + return result; +} + +static VkResult +anv_timeline_gc_locked(struct anv_device *device, + struct anv_timeline *timeline) +{ + list_for_each_entry_safe(struct anv_timeline_point, point, + &timeline->points, link) { + /* timeline->higest_pending is only incremented once submission has + * happened. If this point has a greater serial, it means the point + * hasn't been submitted yet. + */ + if (point->serial > timeline->highest_pending) + return VK_SUCCESS; + + /* If someone is waiting on this time point, consider it busy and don't + * try to recycle it. There's a slim possibility that it's no longer + * busy by the time we look at it but we would be recycling it out from + * under a waiter and that can lead to weird races. + * + * We walk the list in-order so if this time point is still busy so is + * every following time point + */ + assert(point->waiting >= 0); + if (point->waiting) + return VK_SUCCESS; + + /* Garbage collect any signaled point. */ + VkResult result = anv_device_bo_busy(device, point->bo); + if (result == VK_NOT_READY) { + /* We walk the list in-order so if this time point is still busy so + * is every following time point + */ + return VK_SUCCESS; + } else if (result != VK_SUCCESS) { + return result; + } + + assert(timeline->highest_past < point->serial); + timeline->highest_past = point->serial; + + list_del(&point->link); + list_add(&point->link, &timeline->free_points); + } + + return VK_SUCCESS; +} + +static VkResult anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit, + struct anv_bo *bo, + bool signal); + +static VkResult +anv_queue_submit_timeline_locked(struct anv_queue *queue, + struct anv_queue_submit *submit) +{ + VkResult result; + + for (uint32_t i = 0; i < submit->wait_timeline_count; i++) { + struct anv_timeline *timeline = submit->wait_timelines[i]; + uint64_t wait_value = submit->wait_timeline_values[i]; + + if (timeline->highest_past >= wait_value) + continue; + + list_for_each_entry(struct anv_timeline_point, point, &timeline->points, link) { + if (point->serial < wait_value) + continue; + result = anv_queue_submit_add_fence_bo(submit, point->bo, false); + if (result != VK_SUCCESS) + return result; + break; + } + } + for (uint32_t i = 0; i < submit->signal_timeline_count; i++) { + struct anv_timeline *timeline = submit->signal_timelines[i]; + uint64_t signal_value = submit->signal_timeline_values[i]; + struct anv_timeline_point *point; + + result = anv_timeline_add_point_locked(queue->device, timeline, + signal_value, &point); + if (result != VK_SUCCESS) + return result; + + result = anv_queue_submit_add_fence_bo(submit, point->bo, true); + if (result != VK_SUCCESS) + return result; + } + + result = anv_queue_execbuf_locked(queue, submit); + + if (result == VK_SUCCESS) { + /* Update the pending values in the timeline objects. */ + for (uint32_t i = 0; i < submit->signal_timeline_count; i++) { + struct anv_timeline *timeline = submit->signal_timelines[i]; + uint64_t signal_value = submit->signal_timeline_values[i]; + + assert(signal_value > timeline->highest_pending); + timeline->highest_pending = signal_value; + } + + /* Update signaled semaphores backed by syncfd. */ + for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) { + struct anv_semaphore *semaphore = submit->sync_fd_semaphores[i]; + /* Out fences can't have temporary state because that would imply + * that we imported a sync file and are trying to signal it. + */ + assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE); + struct anv_semaphore_impl *impl = &semaphore->permanent; + + assert(impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE); + impl->fd = dup(submit->out_fence); + } + } else { + /* Unblock any waiter by signaling the points, the application will get + * a device lost error code. + */ + for (uint32_t i = 0; i < submit->signal_timeline_count; i++) { + struct anv_timeline *timeline = submit->signal_timelines[i]; + uint64_t signal_value = submit->signal_timeline_values[i]; + + assert(signal_value > timeline->highest_pending); + timeline->highest_past = timeline->highest_pending = signal_value; + } + } + + return result; +} + +static VkResult +anv_queue_submit_deferred_locked(struct anv_queue *queue, uint32_t *advance) +{ + VkResult result = VK_SUCCESS; + + /* Go through all the queued submissions and submit then until we find one + * that's waiting on a point that hasn't materialized yet. + */ + list_for_each_entry_safe(struct anv_queue_submit, submit, + &queue->queued_submits, link) { + if (!anv_queue_submit_ready_locked(submit)) + break; + + (*advance)++; + list_del(&submit->link); + + result = anv_queue_submit_timeline_locked(queue, submit); + + anv_queue_submit_free(queue->device, submit); + + if (result != VK_SUCCESS) + break; + } + + return result; +} + +static VkResult +anv_device_submit_deferred_locked(struct anv_device *device) +{ + uint32_t advance = 0; + return anv_queue_submit_deferred_locked(&device->queue, &advance); +} + +static VkResult +_anv_queue_submit(struct anv_queue *queue, struct anv_queue_submit **_submit, + bool flush_queue) +{ + struct anv_queue_submit *submit = *_submit; + + /* Wait before signal behavior means we might keep alive the + * anv_queue_submit object a bit longer, so transfer the ownership to the + * anv_queue. + */ + *_submit = NULL; + + pthread_mutex_lock(&queue->device->mutex); + list_addtail(&submit->link, &queue->queued_submits); + VkResult result = anv_device_submit_deferred_locked(queue->device); + if (flush_queue) { + while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) { + int ret = pthread_cond_wait(&queue->device->queue_submit, + &queue->device->mutex); + if (ret != 0) { + result = anv_device_set_lost(queue->device, "wait timeout"); + break; + } + + result = anv_device_submit_deferred_locked(queue->device); + } + } + pthread_mutex_unlock(&queue->device->mutex); + return result; +} + VkResult -anv_device_execbuf(struct anv_device *device, - struct drm_i915_gem_execbuffer2 *execbuf, - struct anv_bo **execbuf_bos) -{ - int ret = device->no_hw ? 0 : anv_gem_execbuffer(device, execbuf); - if (ret != 0) { - /* We don't know the real error. */ - return anv_device_set_lost(device, "execbuf2 failed: %m"); - } - - struct drm_i915_gem_exec_object2 *objects = - (void *)(uintptr_t)execbuf->buffers_ptr; - for (uint32_t k = 0; k < execbuf->buffer_count; k++) { - if (execbuf_bos[k]->flags & EXEC_OBJECT_PINNED) - assert(execbuf_bos[k]->offset == objects[k].offset); - execbuf_bos[k]->offset = objects[k].offset; +anv_queue_init(struct anv_device *device, struct anv_queue *queue) +{ + queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + queue->device = device; + queue->flags = 0; + + list_inithead(&queue->queued_submits); + + return VK_SUCCESS; +} + +void +anv_queue_finish(struct anv_queue *queue) +{ +} + +static VkResult +anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit, + struct anv_bo *bo, + bool signal) +{ + if (submit->fence_bo_count >= submit->fence_bo_array_length) { + uint32_t new_len = MAX2(submit->fence_bo_array_length * 2, 64); + + submit->fence_bos = + vk_realloc(submit->alloc, + submit->fence_bos, new_len * sizeof(*submit->fence_bos), + 8, submit->alloc_scope); + if (submit->fence_bos == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->fence_bo_array_length = new_len; + } + + /* Take advantage that anv_bo are allocated at 8 byte alignement so we can + * use the lowest bit to store whether this is a BO we need to signal. + */ + submit->fence_bos[submit->fence_bo_count++] = anv_pack_ptr(bo, 1, signal); + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_syncobj(struct anv_queue_submit* submit, + struct anv_device *device, + uint32_t handle, uint32_t flags) +{ + assert(flags != 0); + + if (submit->fence_count >= submit->fence_array_length) { + uint32_t new_len = MAX2(submit->fence_array_length * 2, 64); + + submit->fences = + vk_realloc(submit->alloc, + submit->fences, new_len * sizeof(*submit->fences), + 8, submit->alloc_scope); + if (submit->fences == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->fence_array_length = new_len; + } + + submit->fences[submit->fence_count++] = (struct drm_i915_gem_exec_fence) { + .handle = handle, + .flags = flags, + }; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_sync_fd_fence(struct anv_queue_submit *submit, + struct anv_semaphore *semaphore) +{ + if (submit->sync_fd_semaphore_count >= submit->sync_fd_semaphore_array_length) { + uint32_t new_len = MAX2(submit->sync_fd_semaphore_array_length * 2, 64); + struct anv_semaphore **new_semaphores = + vk_realloc(submit->alloc, submit->sync_fd_semaphores, + new_len * sizeof(*submit->sync_fd_semaphores), 8, + submit->alloc_scope); + if (new_semaphores == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->sync_fd_semaphores = new_semaphores; } + submit->sync_fd_semaphores[submit->sync_fd_semaphore_count++] = + anv_semaphore_ref(semaphore); + submit->need_out_fence = true; + return VK_SUCCESS; } +static VkResult +anv_queue_submit_add_timeline_wait(struct anv_queue_submit* submit, + struct anv_device *device, + struct anv_timeline *timeline, + uint64_t value) +{ + if (submit->wait_timeline_count >= submit->wait_timeline_array_length) { + uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64); + + submit->wait_timelines = + vk_realloc(submit->alloc, + submit->wait_timelines, new_len * sizeof(*submit->wait_timelines), + 8, submit->alloc_scope); + if (submit->wait_timelines == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_values = + vk_realloc(submit->alloc, + submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values), + 8, submit->alloc_scope); + if (submit->wait_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_array_length = new_len; + } + + submit->wait_timelines[submit->wait_timeline_count] = timeline; + submit->wait_timeline_values[submit->wait_timeline_count] = value; + + submit->wait_timeline_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit, + struct anv_device *device, + struct anv_timeline *timeline, + uint64_t value) +{ + assert(timeline->highest_pending < value); + + if (submit->signal_timeline_count >= submit->signal_timeline_array_length) { + uint32_t new_len = MAX2(submit->signal_timeline_array_length * 2, 64); + + submit->signal_timelines = + vk_realloc(submit->alloc, + submit->signal_timelines, new_len * sizeof(*submit->signal_timelines), + 8, submit->alloc_scope); + if (submit->signal_timelines == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->signal_timeline_values = + vk_realloc(submit->alloc, + submit->signal_timeline_values, new_len * sizeof(*submit->signal_timeline_values), + 8, submit->alloc_scope); + if (submit->signal_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->signal_timeline_array_length = new_len; + } + + submit->signal_timelines[submit->signal_timeline_count] = timeline; + submit->signal_timeline_values[submit->signal_timeline_count] = value; + + submit->signal_timeline_count++; + + return VK_SUCCESS; +} + +static struct anv_queue_submit * +anv_queue_submit_alloc(struct anv_device *device) +{ + const VkAllocationCallbacks *alloc = &device->alloc; + VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; + + struct anv_queue_submit *submit = vk_zalloc(alloc, sizeof(*submit), 8, alloc_scope); + if (!submit) + return NULL; + + submit->alloc = alloc; + submit->alloc_scope = alloc_scope; + submit->in_fence = -1; + submit->out_fence = -1; + + return submit; +} + VkResult -anv_device_submit_simple_batch(struct anv_device *device, - struct anv_batch *batch) +anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch) { - struct drm_i915_gem_execbuffer2 execbuf; - struct drm_i915_gem_exec_object2 exec2_objects[1]; - struct anv_bo bo, *exec_bos[1]; - VkResult result = VK_SUCCESS; - uint32_t size; + struct anv_device *device = queue->device; + struct anv_queue_submit *submit = anv_queue_submit_alloc(device); + if (!submit) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + bool has_syncobj_wait = device->physical->has_syncobj_wait; + VkResult result; + uint32_t syncobj; + struct anv_bo *batch_bo, *sync_bo; + + if (has_syncobj_wait) { + syncobj = anv_gem_syncobj_create(device, 0); + if (!syncobj) { + result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto err_free_submit; + } + + result = anv_queue_submit_add_syncobj(submit, device, syncobj, + I915_EXEC_FENCE_SIGNAL); + } else { + result = anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &sync_bo); + if (result != VK_SUCCESS) + goto err_free_submit; + + result = anv_queue_submit_add_fence_bo(submit, sync_bo, true /* signal */); + } - /* Kernel driver requires 8 byte aligned batch length */ - size = align_u32(batch->next - batch->start, 8); - result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size); if (result != VK_SUCCESS) - return result; + goto err_destroy_sync_primitive; + + if (batch) { + uint32_t size = align_u32(batch->next - batch->start, 8); + result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &batch_bo); + if (result != VK_SUCCESS) + goto err_destroy_sync_primitive; + + memcpy(batch_bo->map, batch->start, size); + if (!device->info.has_llc) + gen_flush_range(batch_bo->map, size); + + submit->simple_bo = batch_bo; + submit->simple_bo_size = size; + } + + result = _anv_queue_submit(queue, &submit, true); + + if (result == VK_SUCCESS) { + if (has_syncobj_wait) { + if (anv_gem_syncobj_wait(device, &syncobj, 1, + anv_get_absolute_timeout(INT64_MAX), true)) + result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m"); + anv_gem_syncobj_destroy(device, syncobj); + } else { + result = anv_device_wait(device, sync_bo, + anv_get_relative_timeout(INT64_MAX)); + anv_device_release_bo(device, sync_bo); + } + } + + if (batch) + anv_bo_pool_free(&device->batch_bo_pool, batch_bo); + + if (submit) + anv_queue_submit_free(device, submit); + + return result; + + err_destroy_sync_primitive: + if (has_syncobj_wait) + anv_gem_syncobj_destroy(device, syncobj); + else + anv_device_release_bo(device, sync_bo); + err_free_submit: + if (submit) + anv_queue_submit_free(device, submit); + + return result; +} + +/* Transfer ownership of temporary semaphores from the VkSemaphore object to + * the anv_queue_submit object. Those temporary semaphores are then freed in + * anv_queue_submit_free() once the driver is finished with them. + */ +static VkResult +maybe_transfer_temporary_semaphore(struct anv_queue_submit *submit, + struct anv_semaphore *semaphore, + struct anv_semaphore_impl **out_impl) +{ + struct anv_semaphore_impl *impl = &semaphore->temporary; + + if (impl->type == ANV_SEMAPHORE_TYPE_NONE) { + *out_impl = &semaphore->permanent; + return VK_SUCCESS; + } + + /* BO backed timeline semaphores cannot be temporary. */ + assert(impl->type != ANV_SEMAPHORE_TYPE_TIMELINE); + + /* + * There is a requirement to reset semaphore to their permanent state after + * submission. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * In the case we defer the actual submission to a thread because of the + * wait-before-submit behavior required for timeline semaphores, we need to + * make copies of the temporary syncobj to ensure they stay alive until we + * do the actual execbuffer ioctl. + */ + if (submit->temporary_semaphore_count >= submit->temporary_semaphore_array_length) { + uint32_t new_len = MAX2(submit->temporary_semaphore_array_length * 2, 8); + /* Make sure that if the realloc fails, we still have the old semaphore + * array around to properly clean things up on failure. + */ + struct anv_semaphore_impl *new_array = + vk_realloc(submit->alloc, + submit->temporary_semaphores, + new_len * sizeof(*submit->temporary_semaphores), + 8, submit->alloc_scope); + if (new_array == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->temporary_semaphores = new_array; + submit->temporary_semaphore_array_length = new_len; + } + + /* Copy anv_semaphore_impl into anv_queue_submit. */ + submit->temporary_semaphores[submit->temporary_semaphore_count++] = *impl; + *out_impl = &submit->temporary_semaphores[submit->temporary_semaphore_count - 1]; + + /* Clear the incoming semaphore */ + impl->type = ANV_SEMAPHORE_TYPE_NONE; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit(struct anv_queue *queue, + struct anv_cmd_buffer *cmd_buffer, + const VkSemaphore *in_semaphores, + const uint64_t *in_values, + uint32_t num_in_semaphores, + const VkSemaphore *out_semaphores, + const uint64_t *out_values, + uint32_t num_out_semaphores, + struct anv_bo *wsi_signal_bo, + VkFence _fence) +{ + ANV_FROM_HANDLE(anv_fence, fence, _fence); + struct anv_device *device = queue->device; + UNUSED struct anv_physical_device *pdevice = device->physical; + struct anv_queue_submit *submit = anv_queue_submit_alloc(device); + if (!submit) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - memcpy(bo.map, batch->start, size); - if (!device->info.has_llc) - gen_flush_range(bo.map, size); - - exec_bos[0] = &bo; - exec2_objects[0].handle = bo.gem_handle; - exec2_objects[0].relocation_count = 0; - exec2_objects[0].relocs_ptr = 0; - exec2_objects[0].alignment = 0; - exec2_objects[0].offset = bo.offset; - exec2_objects[0].flags = bo.flags; - exec2_objects[0].rsvd1 = 0; - exec2_objects[0].rsvd2 = 0; - - execbuf.buffers_ptr = (uintptr_t) exec2_objects; - execbuf.buffer_count = 1; - execbuf.batch_start_offset = 0; - execbuf.batch_len = size; - execbuf.cliprects_ptr = 0; - execbuf.num_cliprects = 0; - execbuf.DR1 = 0; - execbuf.DR4 = 0; - - execbuf.flags = - I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER; - execbuf.rsvd1 = device->context_id; - execbuf.rsvd2 = 0; + submit->cmd_buffer = cmd_buffer; - if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) - gen_print_batch(&device->decoder_ctx, bo.map, bo.size, bo.offset, false); + VkResult result = VK_SUCCESS; + + for (uint32_t i = 0; i < num_in_semaphores; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); + struct anv_semaphore_impl *impl; + + result = maybe_transfer_temporary_semaphore(submit, semaphore, &impl); + if (result != VK_SUCCESS) + goto error; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_BO: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_fence_bo(submit, impl->bo, false /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_WSI_BO: + /* When using a window-system buffer as a semaphore, always enable + * EXEC_OBJECT_WRITE. This gives us a WaR hazard with the display or + * compositor's read of the buffer and enforces that we don't start + * rendering until they are finished. This is exactly the + * synchronization we want with vkAcquireNextImage. + */ + result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_SYNC_FILE: + assert(!pdevice->has_syncobj); + if (submit->in_fence == -1) { + submit->in_fence = impl->fd; + if (submit->in_fence == -1) { + result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto error; + } + impl->fd = -1; + } else { + int merge = anv_gem_sync_file_merge(device, submit->in_fence, impl->fd); + if (merge == -1) { + result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto error; + } + close(impl->fd); + close(submit->in_fence); + impl->fd = -1; + submit->in_fence = merge; + } + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { + result = anv_queue_submit_add_syncobj(submit, device, + impl->syncobj, + I915_EXEC_FENCE_WAIT); + if (result != VK_SUCCESS) + goto error; + break; + } + + case ANV_SEMAPHORE_TYPE_TIMELINE: + result = anv_queue_submit_add_timeline_wait(submit, device, + &impl->timeline, + in_values ? in_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + + default: + break; + } + } + + for (uint32_t i = 0; i < num_out_semaphores; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); + + /* Under most circumstances, out fences won't be temporary. However, + * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * The spec says nothing whatsoever about signal operations on + * temporarily imported semaphores so it appears they are allowed. + * There are also CTS tests that require this to work. + */ + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_BO: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_SYNC_FILE: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_sync_fd_fence(submit, semaphore); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL); + if (result != VK_SUCCESS) + goto error; + break; + } + + case ANV_SEMAPHORE_TYPE_TIMELINE: + result = anv_queue_submit_add_timeline_signal(submit, device, + &impl->timeline, + out_values ? out_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; - result = anv_device_execbuf(device, &execbuf, exec_bos); + default: + break; + } + } + + if (wsi_signal_bo) { + result = anv_queue_submit_add_fence_bo(submit, wsi_signal_bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + } + + if (fence) { + /* Under most circumstances, out fences won't be temporary. However, + * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * The spec says nothing whatsoever about signal operations on + * temporarily imported semaphores so it appears they are allowed. + * There are also CTS tests that require this to work. + */ + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + + switch (impl->type) { + case ANV_FENCE_TYPE_BO: + result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_FENCE_TYPE_SYNCOBJ: { + /* + * For the same reason we reset the signaled binary syncobj above, + * also reset the fence's syncobj so that they don't contain a + * signaled dma-fence. + */ + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL); + if (result != VK_SUCCESS) + goto error; + break; + } + + default: + unreachable("Invalid fence type"); + } + } + + result = _anv_queue_submit(queue, &submit, false); if (result != VK_SUCCESS) - goto fail; + goto error; - result = anv_device_wait(device, &bo, INT64_MAX); + if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) { + /* If we have permanent BO fence, the only type of temporary possible + * would be BO_WSI (because BO fences are not shareable). The Vulkan spec + * also requires that the fence passed to vkQueueSubmit() be : + * + * * unsignaled + * * not be associated with any other queue command that has not yet + * completed execution on that queue + * + * So the only acceptable type for the temporary is NONE. + */ + assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); + + /* Once the execbuf has returned, we need to set the fence state to + * SUBMITTED. We can't do this before calling execbuf because + * anv_GetFenceStatus does take the global device lock before checking + * fence->state. + * + * We set the fence state to SUBMITTED regardless of whether or not the + * execbuf succeeds because we need to ensure that vkWaitForFences() and + * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or + * VK_SUCCESS) in a finite amount of time even if execbuf fails. + */ + fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED; + } - fail: - anv_bo_pool_free(&device->batch_bo_pool, &bo); + error: + if (submit) + anv_queue_submit_free(device, submit); return result; } @@ -121,7 +946,6 @@ VkFence fence) { ANV_FROM_HANDLE(anv_queue, queue, _queue); - struct anv_device *device = queue->device; /* Query for device status prior to submitting. Technically, we don't need * to do this. However, if we have a client that's submitting piles of @@ -130,44 +954,18 @@ * the kernel to kick us or we'll have to wait until the client waits on a * fence before we actually know whether or not we've hung. */ - VkResult result = anv_device_query_status(device); + VkResult result = anv_device_query_status(queue->device); if (result != VK_SUCCESS) return result; - /* We lock around QueueSubmit for three main reasons: - * - * 1) When a block pool is resized, we create a new gem handle with a - * different size and, in the case of surface states, possibly a - * different center offset but we re-use the same anv_bo struct when - * we do so. If this happens in the middle of setting up an execbuf, - * we could end up with our list of BOs out of sync with our list of - * gem handles. - * - * 2) The algorithm we use for building the list of unique buffers isn't - * thread-safe. While the client is supposed to syncronize around - * QueueSubmit, this would be extremely difficult to debug if it ever - * came up in the wild due to a broken app. It's better to play it - * safe and just lock around QueueSubmit. - * - * 3) The anv_cmd_buffer_execbuf function may perform relocations in - * userspace. Due to the fact that the surface state buffer is shared - * between batches, we can't afford to have that happen from multiple - * threads at the same time. Even though the user is supposed to - * ensure this doesn't happen, we play it safe as in (2) above. - * - * Since the only other things that ever take the device lock such as block - * pool resize only rarely happen, this will almost never be contended so - * taking a lock isn't really an expensive operation in this case. - */ - pthread_mutex_lock(&device->mutex); - if (fence && submitCount == 0) { /* If we don't have any command buffers, we need to submit a dummy * batch to give GEM something to wait on. We could, potentially, * come up with something more efficient but this shouldn't be a * common case. */ - result = anv_cmd_buffer_execbuf(device, NULL, NULL, 0, NULL, 0, fence); + result = anv_queue_submit(queue, NULL, NULL, NULL, 0, NULL, NULL, 0, + NULL, fence); goto out; } @@ -175,18 +973,38 @@ /* Fence for this submit. NULL for all but the last one */ VkFence submit_fence = (i == submitCount - 1) ? fence : VK_NULL_HANDLE; + const struct wsi_memory_signal_submit_info *mem_signal_info = + vk_find_struct_const(pSubmits[i].pNext, + WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA); + struct anv_bo *wsi_signal_bo = + mem_signal_info && mem_signal_info->memory != VK_NULL_HANDLE ? + anv_device_memory_from_handle(mem_signal_info->memory)->bo : NULL; + + const VkTimelineSemaphoreSubmitInfoKHR *timeline_info = + vk_find_struct_const(pSubmits[i].pNext, + TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR); + const uint64_t *wait_values = + timeline_info && timeline_info->waitSemaphoreValueCount ? + timeline_info->pWaitSemaphoreValues : NULL; + const uint64_t *signal_values = + timeline_info && timeline_info->signalSemaphoreValueCount ? + timeline_info->pSignalSemaphoreValues : NULL; + if (pSubmits[i].commandBufferCount == 0) { /* If we don't have any command buffers, we need to submit a dummy * batch to give GEM something to wait on. We could, potentially, * come up with something more efficient but this shouldn't be a * common case. */ - result = anv_cmd_buffer_execbuf(device, NULL, - pSubmits[i].pWaitSemaphores, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pSignalSemaphores, - pSubmits[i].signalSemaphoreCount, - submit_fence); + result = anv_queue_submit(queue, NULL, + pSubmits[i].pWaitSemaphores, + wait_values, + pSubmits[i].waitSemaphoreCount, + pSubmits[i].pSignalSemaphores, + signal_values, + pSubmits[i].signalSemaphoreCount, + wsi_signal_bo, + submit_fence); if (result != VK_SUCCESS) goto out; @@ -205,32 +1023,33 @@ submit_fence : VK_NULL_HANDLE; const VkSemaphore *in_semaphores = NULL, *out_semaphores = NULL; + const uint64_t *in_values = NULL, *out_values = NULL; uint32_t num_in_semaphores = 0, num_out_semaphores = 0; if (j == 0) { /* Only the first batch gets the in semaphores */ in_semaphores = pSubmits[i].pWaitSemaphores; + in_values = wait_values; num_in_semaphores = pSubmits[i].waitSemaphoreCount; } if (j == pSubmits[i].commandBufferCount - 1) { /* Only the last batch gets the out semaphores */ out_semaphores = pSubmits[i].pSignalSemaphores; + out_values = signal_values; num_out_semaphores = pSubmits[i].signalSemaphoreCount; } - result = anv_cmd_buffer_execbuf(device, cmd_buffer, - in_semaphores, num_in_semaphores, - out_semaphores, num_out_semaphores, - execbuf_fence); + result = anv_queue_submit(queue, cmd_buffer, + in_semaphores, in_values, num_in_semaphores, + out_semaphores, out_values, num_out_semaphores, + wsi_signal_bo, execbuf_fence); if (result != VK_SUCCESS) goto out; } } - pthread_cond_broadcast(&device->queue_submit); - out: - if (result != VK_SUCCESS) { + if (result != VK_SUCCESS && result != VK_ERROR_DEVICE_LOST) { /* In the case that something has gone wrong we may end up with an * inconsistent state from which it may not be trivial to recover. * For example, we might have computed address relocations and @@ -242,12 +1061,14 @@ * anyway (such us being out of memory) and return * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to * submit the same job again to this device. + * + * We skip doing this on VK_ERROR_DEVICE_LOST because + * anv_device_set_lost() would have been called already by a callee of + * anv_queue_submit(). */ - result = anv_device_set_lost(device, "vkQueueSubmit() failed"); + result = anv_device_set_lost(queue->device, "vkQueueSubmit() failed"); } - pthread_mutex_unlock(&device->mutex); - return result; } @@ -256,7 +1077,10 @@ { ANV_FROM_HANDLE(anv_queue, queue, _queue); - return anv_DeviceWaitIdle(anv_device_to_handle(queue->device)); + if (anv_device_is_lost(queue->device)) + return VK_ERROR_DEVICE_LOST; + + return anv_queue_submit_simple_batch(queue, NULL); } VkResult anv_CreateFence( @@ -275,7 +1099,7 @@ if (fence == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - if (device->instance->physicalDevice.has_syncobj_wait) { + if (device->physical->has_syncobj_wait) { fence->permanent.type = ANV_FENCE_TYPE_SYNCOBJ; uint32_t create_flags = 0; @@ -288,8 +1112,8 @@ } else { fence->permanent.type = ANV_FENCE_TYPE_BO; - VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, - &fence->permanent.bo.bo, 4096); + VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, 4096, + &fence->permanent.bo.bo); if (result != VK_SUCCESS) return result; @@ -315,7 +1139,11 @@ break; case ANV_FENCE_TYPE_BO: - anv_bo_pool_free(&device->batch_bo_pool, &impl->bo.bo); + anv_bo_pool_free(&device->batch_bo_pool, impl->bo.bo); + break; + + case ANV_FENCE_TYPE_WSI_BO: + anv_device_release_bo(device, impl->bo.bo); break; case ANV_FENCE_TYPE_SYNCOBJ: @@ -333,6 +1161,16 @@ impl->type = ANV_FENCE_TYPE_NONE; } +void +anv_fence_reset_temporary(struct anv_device *device, + struct anv_fence *fence) +{ + if (fence->temporary.type == ANV_FENCE_TYPE_NONE) + return; + + anv_fence_impl_cleanup(device, &fence->temporary); +} + void anv_DestroyFence( VkDevice _device, VkFence _fence, @@ -367,8 +1205,7 @@ * first restored. The remaining operations described therefore * operate on the restored payload. */ - if (fence->temporary.type != ANV_FENCE_TYPE_NONE) - anv_fence_impl_cleanup(device, &fence->temporary); + anv_fence_reset_temporary(device, fence); struct anv_fence_impl *impl = &fence->permanent; @@ -405,8 +1242,7 @@ switch (impl->type) { case ANV_FENCE_TYPE_BO: - /* BO fences don't support import/export */ - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); + case ANV_FENCE_TYPE_WSI_BO: switch (impl->bo.state) { case ANV_BO_FENCE_STATE_RESET: /* If it hasn't even been sent off to the GPU yet, it's not ready */ @@ -417,7 +1253,7 @@ return VK_SUCCESS; case ANV_BO_FENCE_STATE_SUBMITTED: { - VkResult result = anv_device_bo_busy(device, &impl->bo.bo); + VkResult result = anv_device_bo_busy(device, impl->bo.bo); if (result == VK_SUCCESS) { impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED; return VK_SUCCESS; @@ -448,53 +1284,6 @@ } } -#define NSEC_PER_SEC 1000000000 -#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1) - -static uint64_t -gettime_ns(void) -{ - struct timespec current; - clock_gettime(CLOCK_MONOTONIC, ¤t); - return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; -} - -static uint64_t anv_get_absolute_timeout(uint64_t timeout) -{ - if (timeout == 0) - return 0; - uint64_t current_time = gettime_ns(); - uint64_t max_timeout = (uint64_t) INT64_MAX - current_time; - - timeout = MIN2(max_timeout, timeout); - - return (current_time + timeout); -} - -static int64_t anv_get_relative_timeout(uint64_t abs_timeout) -{ - uint64_t now = gettime_ns(); - - /* We don't want negative timeouts. - * - * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is - * supposed to block indefinitely timeouts < 0. Unfortunately, - * this was broken for a couple of kernel releases. Since there's - * no way to know whether or not the kernel we're using is one of - * the broken ones, the best we can do is to clamp the timeout to - * INT64_MAX. This limits the maximum timeout from 584 years to - * 292 years - likely not a big deal. - */ - if (abs_timeout < now) - return 0; - - uint64_t rel_timeout = abs_timeout - now; - if (rel_timeout > (uint64_t) INT64_MAX) - rel_timeout = INT64_MAX; - - return rel_timeout; -} - static VkResult anv_wait_for_syncobj_fences(struct anv_device *device, uint32_t fenceCount, @@ -528,7 +1317,7 @@ do { ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount, abs_timeout_ns, waitAll); - } while (ret == -1 && errno == ETIME && gettime_ns() < abs_timeout_ns); + } while (ret == -1 && errno == ETIME && anv_gettime_ns() < abs_timeout_ns); vk_free(&device->alloc, syncobjs); @@ -559,13 +1348,11 @@ for (uint32_t i = 0; i < fenceCount; i++) { ANV_FROM_HANDLE(anv_fence, fence, pFences[i]); - /* This function assumes that all fences are BO fences and that they - * have no temporary state. Since BO fences will never be exported, - * this should be a safe assumption. - */ - assert(fence->permanent.type == ANV_FENCE_TYPE_BO); - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); - struct anv_fence_impl *impl = &fence->permanent; + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + assert(impl->type == ANV_FENCE_TYPE_BO || + impl->type == ANV_FENCE_TYPE_WSI_BO); switch (impl->bo.state) { case ANV_BO_FENCE_STATE_RESET: @@ -591,7 +1378,7 @@ /* These are the fences we really care about. Go ahead and wait * on it until we hit a timeout. */ - result = anv_device_wait(device, &impl->bo.bo, + result = anv_device_wait(device, impl->bo.bo, anv_get_relative_timeout(abs_timeout_ns)); switch (result) { case VK_SUCCESS: @@ -640,7 +1427,7 @@ ret = pthread_cond_timedwait(&device->queue_submit, &device->mutex, &abstime); assert(ret != EINVAL); - if (gettime_ns() >= abs_timeout_ns) { + if (anv_gettime_ns() >= abs_timeout_ns) { pthread_mutex_unlock(&device->mutex); result = VK_TIMEOUT; goto done; @@ -660,12 +1447,9 @@ static VkResult anv_wait_for_wsi_fence(struct anv_device *device, - const VkFence _fence, + struct anv_fence_impl *impl, uint64_t abs_timeout) { - ANV_FROM_HANDLE(anv_fence, fence, _fence); - struct anv_fence_impl *impl = &fence->permanent; - return impl->fence_wsi->wait(impl->fence_wsi, abs_timeout); } @@ -687,6 +1471,7 @@ switch (impl->type) { case ANV_FENCE_TYPE_BO: + case ANV_FENCE_TYPE_WSI_BO: result = anv_wait_for_bo_fences(device, 1, &pFences[i], true, abs_timeout); break; @@ -695,7 +1480,7 @@ true, abs_timeout); break; case ANV_FENCE_TYPE_WSI: - result = anv_wait_for_wsi_fence(device, pFences[i], abs_timeout); + result = anv_wait_for_wsi_fence(device, impl, abs_timeout); break; case ANV_FENCE_TYPE_NONE: result = VK_SUCCESS; @@ -710,7 +1495,7 @@ if (anv_wait_for_fences(device, 1, &pFences[i], true, 0) == VK_SUCCESS) return VK_SUCCESS; } - } while (gettime_ns() < abs_timeout); + } while (anv_gettime_ns() < abs_timeout); result = VK_TIMEOUT; } return result; @@ -736,7 +1521,8 @@ struct anv_fence_impl *impl = fence->temporary.type != ANV_FENCE_TYPE_NONE ? &fence->temporary : &fence->permanent; - if (impl->type != ANV_FENCE_TYPE_BO) + if (impl->type != ANV_FENCE_TYPE_BO && + impl->type != ANV_FENCE_TYPE_WSI_BO) return false; } return true; @@ -838,8 +1624,7 @@ if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) { anv_gem_syncobj_destroy(device, new_impl.syncobj); - return vk_errorf(device->instance, NULL, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, "syncobj sync file import failed: %m"); } break; @@ -923,6 +1708,57 @@ // Queue semaphore functions +static VkSemaphoreTypeKHR +get_semaphore_type(const void *pNext, uint64_t *initial_value) +{ + const VkSemaphoreTypeCreateInfoKHR *type_info = + vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR); + + if (!type_info) + return VK_SEMAPHORE_TYPE_BINARY_KHR; + + if (initial_value) + *initial_value = type_info->initialValue; + return type_info->semaphoreType; +} + +static VkResult +binary_semaphore_create(struct anv_device *device, + struct anv_semaphore_impl *impl, + bool exportable) +{ + if (device->physical->has_syncobj) { + impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; + impl->syncobj = anv_gem_syncobj_create(device, 0); + if (!impl->syncobj) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + return VK_SUCCESS; + } else { + impl->type = ANV_SEMAPHORE_TYPE_BO; + VkResult result = + anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &impl->bo); + /* If we're going to use this as a fence, we need to *not* have the + * EXEC_OBJECT_ASYNC bit set. + */ + assert(!(impl->bo->flags & EXEC_OBJECT_ASYNC)); + return result; + } +} + +static VkResult +timeline_semaphore_create(struct anv_device *device, + struct anv_semaphore_impl *impl, + uint64_t initial_value) +{ + impl->type = ANV_SEMAPHORE_TYPE_TIMELINE; + anv_timeline_init(device, &impl->timeline, initial_value); + return VK_SUCCESS; +} + VkResult anv_CreateSemaphore( VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo, @@ -934,25 +1770,43 @@ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO); - semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + uint64_t timeline_value = 0; + VkSemaphoreTypeKHR sem_type = get_semaphore_type(pCreateInfo->pNext, &timeline_value); + + semaphore = vk_alloc(&device->alloc, sizeof(*semaphore), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (semaphore == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + p_atomic_set(&semaphore->refcount, 1); + const VkExportSemaphoreCreateInfo *export = vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); VkExternalSemaphoreHandleTypeFlags handleTypes = export ? export->handleTypes : 0; + VkResult result; if (handleTypes == 0) { - /* The DRM execbuffer ioctl always execute in-oder so long as you stay - * on the same ring. Since we don't expose the blit engine as a DMA - * queue, a dummy no-op semaphore is a perfectly valid implementation. - */ - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY; + if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR) + result = binary_semaphore_create(device, &semaphore->permanent, false); + else + result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value); + if (result != VK_SUCCESS) { + vk_free2(&device->alloc, pAllocator, semaphore); + return result; + } } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); - if (device->instance->physicalDevice.has_syncobj) { + assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR); + result = binary_semaphore_create(device, &semaphore->permanent, true); + if (result != VK_SUCCESS) { + vk_free2(&device->alloc, pAllocator, semaphore); + return result; + } + } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { + assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT); + assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR); + if (device->physical->has_syncobj) { semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; semaphore->permanent.syncobj = anv_gem_syncobj_create(device, 0); if (!semaphore->permanent.syncobj) { @@ -960,26 +1814,6 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } } else { - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_BO; - VkResult result = anv_bo_cache_alloc(device, &device->bo_cache, - 4096, ANV_BO_EXTERNAL, - &semaphore->permanent.bo); - if (result != VK_SUCCESS) { - vk_free2(&device->alloc, pAllocator, semaphore); - return result; - } - - /* If we're going to use this as a fence, we need to *not* have the - * EXEC_OBJECT_ASYNC bit set. - */ - assert(!(semaphore->permanent.bo->flags & EXEC_OBJECT_ASYNC)); - } - } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT); - if (device->instance->physicalDevice.has_syncobj) { - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; - semaphore->permanent.syncobj = anv_gem_syncobj_create(device, 0); - } else { semaphore->permanent.type = ANV_SEMAPHORE_TYPE_SYNC_FILE; semaphore->permanent.fd = -1; } @@ -1007,11 +1841,17 @@ break; case ANV_SEMAPHORE_TYPE_BO: - anv_bo_cache_release(device, &device->bo_cache, impl->bo); + case ANV_SEMAPHORE_TYPE_WSI_BO: + anv_device_release_bo(device, impl->bo); break; case ANV_SEMAPHORE_TYPE_SYNC_FILE: - close(impl->fd); + if (impl->fd >= 0) + close(impl->fd); + break; + + case ANV_SEMAPHORE_TYPE_TIMELINE: + anv_timeline_finish(device, &impl->timeline); break; case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: @@ -1035,6 +1875,25 @@ anv_semaphore_impl_cleanup(device, &semaphore->temporary); } +static struct anv_semaphore * +anv_semaphore_ref(struct anv_semaphore *semaphore) +{ + assert(semaphore->refcount); + p_atomic_inc(&semaphore->refcount); + return semaphore; +} + +static void +anv_semaphore_unref(struct anv_device *device, struct anv_semaphore *semaphore) +{ + if (!p_atomic_dec_zero(&semaphore->refcount)) + return; + + anv_semaphore_impl_cleanup(device, &semaphore->temporary); + anv_semaphore_impl_cleanup(device, &semaphore->permanent); + vk_free(&device->alloc, semaphore); +} + void anv_DestroySemaphore( VkDevice _device, VkSemaphore _semaphore, @@ -1046,10 +1905,7 @@ if (semaphore == NULL) return; - anv_semaphore_impl_cleanup(device, &semaphore->temporary); - anv_semaphore_impl_cleanup(device, &semaphore->permanent); - - vk_free2(&device->alloc, pAllocator, semaphore); + anv_semaphore_unref(device, semaphore); } void anv_GetPhysicalDeviceExternalSemaphoreProperties( @@ -1059,8 +1915,14 @@ { ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice); + VkSemaphoreTypeKHR sem_type = + get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL); + switch (pExternalSemaphoreInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: + /* Timeline semaphores are not exportable. */ + if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) + break; pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; pExternalSemaphoreProperties->compatibleHandleTypes = @@ -1071,17 +1933,18 @@ return; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - if (device->has_exec_fence) { - pExternalSemaphoreProperties->exportFromImportedHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalSemaphoreProperties->compatibleHandleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; - pExternalSemaphoreProperties->externalSemaphoreFeatures = - VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT | - VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT; - return; - } - break; + if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) + break; + if (!device->has_exec_fence) + break; + pExternalSemaphoreProperties->exportFromImportedHandleTypes = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; + pExternalSemaphoreProperties->compatibleHandleTypes = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT; + pExternalSemaphoreProperties->externalSemaphoreFeatures = + VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT; + return; default: break; @@ -1106,7 +1969,7 @@ switch (pImportSemaphoreFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - if (device->instance->physicalDevice.has_syncobj) { + if (device->physical->has_syncobj) { new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd); @@ -1115,14 +1978,16 @@ } else { new_impl.type = ANV_SEMAPHORE_TYPE_BO; - VkResult result = anv_bo_cache_import(device, &device->bo_cache, - fd, ANV_BO_EXTERNAL, - &new_impl.bo); + VkResult result = anv_device_import_bo(device, fd, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* client_address */, + &new_impl.bo); if (result != VK_SUCCESS) return result; if (new_impl.bo->size < 4096) { - anv_bo_cache_release(device, &device->bo_cache, new_impl.bo); + anv_device_release_bo(device, new_impl.bo); return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); } @@ -1145,7 +2010,7 @@ break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: - if (device->instance->physicalDevice.has_syncobj) { + if (device->physical->has_syncobj) { new_impl = (struct anv_semaphore_impl) { .type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ, .syncobj = anv_gem_syncobj_create(device, 0), @@ -1154,8 +2019,7 @@ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) { anv_gem_syncobj_destroy(device, new_impl.syncobj); - return vk_errorf(device->instance, NULL, - VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE, "syncobj sync file import failed: %m"); } /* Ownership of the FD is transfered to Anv. Since we don't need it @@ -1204,25 +2068,20 @@ switch (impl->type) { case ANV_SEMAPHORE_TYPE_BO: - result = anv_bo_cache_export(device, &device->bo_cache, impl->bo, pFd); + result = anv_device_export_bo(device, impl->bo, pFd); if (result != VK_SUCCESS) return result; break; - case ANV_SEMAPHORE_TYPE_SYNC_FILE: - /* There are two reasons why this could happen: - * - * 1) The user is trying to export without submitting something that - * signals the semaphore. If this is the case, it's their bug so - * what we return here doesn't matter. - * - * 2) The kernel didn't give us a file descriptor. The most likely - * reason for this is running out of file descriptors. + case ANV_SEMAPHORE_TYPE_SYNC_FILE: { + /* There's a potential race here with vkQueueSubmit if you are trying + * to export a semaphore Fd while the queue submit is still happening. + * This can happen if we see all dependencies get resolved via timeline + * semaphore waits completing before the execbuf completes and we + * process the resulting out fence. To work around this, take a lock + * around grabbing the fd. */ - if (impl->fd < 0) - return vk_error(VK_ERROR_TOO_MANY_OBJECTS); - - *pFd = impl->fd; + pthread_mutex_lock(&device->mutex); /* From the Vulkan 1.0.53 spec: * @@ -1234,8 +2093,26 @@ * considered to have been waited on and no longer has a sync file * attached. */ + int fd = impl->fd; impl->fd = -1; + + pthread_mutex_unlock(&device->mutex); + + /* There are two reasons why this could happen: + * + * 1) The user is trying to export without submitting something that + * signals the semaphore. If this is the case, it's their bug so + * what we return here doesn't matter. + * + * 2) The kernel didn't give us a file descriptor. The most likely + * reason for this is running out of file descriptors. + */ + if (fd < 0) + return vk_error(VK_ERROR_TOO_MANY_OBJECTS); + + *pFd = fd; return VK_SUCCESS; + } case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) @@ -1265,3 +2142,222 @@ return VK_SUCCESS; } + +VkResult anv_GetSemaphoreCounterValue( + VkDevice _device, + VkSemaphore _semaphore, + uint64_t* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore); + + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_TIMELINE: { + pthread_mutex_lock(&device->mutex); + *pValue = impl->timeline.highest_past; + pthread_mutex_unlock(&device->mutex); + return VK_SUCCESS; + } + + default: + unreachable("Invalid semaphore type"); + } +} + +static VkResult +anv_timeline_wait_locked(struct anv_device *device, + struct anv_timeline *timeline, + uint64_t serial, uint64_t abs_timeout_ns) +{ + /* Wait on the queue_submit condition variable until the timeline has a + * time point pending that's at least as high as serial. + */ + while (timeline->highest_pending < serial) { + struct timespec abstime = { + .tv_sec = abs_timeout_ns / NSEC_PER_SEC, + .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, + }; + + int ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); + assert(ret != EINVAL); + if (anv_gettime_ns() >= abs_timeout_ns && + timeline->highest_pending < serial) + return VK_TIMEOUT; + } + + while (1) { + VkResult result = anv_timeline_gc_locked(device, timeline); + if (result != VK_SUCCESS) + return result; + + if (timeline->highest_past >= serial) + return VK_SUCCESS; + + /* If we got here, our earliest time point has a busy BO */ + struct anv_timeline_point *point = + list_first_entry(&timeline->points, + struct anv_timeline_point, link); + + /* Drop the lock while we wait. */ + point->waiting++; + pthread_mutex_unlock(&device->mutex); + + result = anv_device_wait(device, point->bo, + anv_get_relative_timeout(abs_timeout_ns)); + + /* Pick the mutex back up */ + pthread_mutex_lock(&device->mutex); + point->waiting--; + + /* This covers both VK_TIMEOUT and VK_ERROR_DEVICE_LOST */ + if (result != VK_SUCCESS) + return result; + } +} + +static VkResult +anv_timelines_wait(struct anv_device *device, + struct anv_timeline **timelines, + const uint64_t *serials, + uint32_t n_timelines, + bool wait_all, + uint64_t abs_timeout_ns) +{ + if (!wait_all && n_timelines > 1) { + pthread_mutex_lock(&device->mutex); + + while (1) { + VkResult result; + for (uint32_t i = 0; i < n_timelines; i++) { + result = + anv_timeline_wait_locked(device, timelines[i], serials[i], 0); + if (result != VK_TIMEOUT) + break; + } + + if (result != VK_TIMEOUT || + anv_gettime_ns() >= abs_timeout_ns) { + pthread_mutex_unlock(&device->mutex); + return result; + } + + /* If none of them are ready do a short wait so we don't completely + * spin while holding the lock. The 10us is completely arbitrary. + */ + uint64_t abs_short_wait_ns = + anv_get_absolute_timeout( + MIN2((anv_gettime_ns() - abs_timeout_ns) / 10, 10 * 1000)); + struct timespec abstime = { + .tv_sec = abs_short_wait_ns / NSEC_PER_SEC, + .tv_nsec = abs_short_wait_ns % NSEC_PER_SEC, + }; + ASSERTED int ret; + ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); + assert(ret != EINVAL); + } + } else { + VkResult result = VK_SUCCESS; + pthread_mutex_lock(&device->mutex); + for (uint32_t i = 0; i < n_timelines; i++) { + result = + anv_timeline_wait_locked(device, timelines[i], + serials[i], abs_timeout_ns); + if (result != VK_SUCCESS) + break; + } + pthread_mutex_unlock(&device->mutex); + return result; + } +} + +VkResult anv_WaitSemaphores( + VkDevice _device, + const VkSemaphoreWaitInfoKHR* pWaitInfo, + uint64_t timeout) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + struct anv_timeline **timelines = + vk_alloc(&device->alloc, + pWaitInfo->semaphoreCount * sizeof(*timelines), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!timelines) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + uint64_t *values = vk_alloc(&device->alloc, + pWaitInfo->semaphoreCount * sizeof(*values), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!values) { + vk_free(&device->alloc, timelines); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + + uint32_t handle_count = 0; + for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE); + + if (pWaitInfo->pValues[i] == 0) + continue; + + timelines[handle_count] = &impl->timeline; + values[handle_count] = pWaitInfo->pValues[i]; + handle_count++; + } + + VkResult result = VK_SUCCESS; + if (handle_count > 0) { + result = anv_timelines_wait(device, timelines, values, handle_count, + !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR), + anv_get_absolute_timeout(timeout)); + } + + vk_free(&device->alloc, timelines); + vk_free(&device->alloc, values); + + return result; +} + +VkResult anv_SignalSemaphore( + VkDevice _device, + const VkSemaphoreSignalInfoKHR* pSignalInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_semaphore, semaphore, pSignalInfo->semaphore); + + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_TIMELINE: { + pthread_mutex_lock(&device->mutex); + + VkResult result = anv_timeline_gc_locked(device, &impl->timeline); + + assert(pSignalInfo->value > impl->timeline.highest_pending); + + impl->timeline.highest_pending = impl->timeline.highest_past = pSignalInfo->value; + + if (result == VK_SUCCESS) + result = anv_device_submit_deferred_locked(device); + + pthread_cond_broadcast(&device->queue_submit); + pthread_mutex_unlock(&device->mutex); + return result; + } + + default: + unreachable("Invalid semaphore type"); + } +} diff -Nru mesa-19.2.8/src/intel/vulkan/anv_util.c mesa-20.0.8/src/intel/vulkan/anv_util.c --- mesa-19.2.8/src/intel/vulkan/anv_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_util.c 2020-06-12 01:21:17.000000000 +0000 @@ -50,7 +50,7 @@ } void anv_printflike(6, 7) -__anv_perf_warn(struct anv_instance *instance, const void *object, +__anv_perf_warn(struct anv_device *device, const void *object, VkDebugReportObjectTypeEXT type, const char *file, int line, const char *format, ...) { @@ -64,7 +64,7 @@ snprintf(report, sizeof(report), "%s: %s", file, buffer); - vk_debug_report(&instance->debug_report_callbacks, + vk_debug_report(&device->physical->instance->debug_report_callbacks, VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT, type, (uint64_t) (uintptr_t) object, diff -Nru mesa-19.2.8/src/intel/vulkan/anv_wsi.c mesa-20.0.8/src/intel/vulkan/anv_wsi.c --- mesa-19.2.8/src/intel/vulkan/anv_wsi.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_wsi.c 2020-06-12 01:21:17.000000000 +0000 @@ -33,11 +33,46 @@ return anv_lookup_entrypoint(&physical_device->info, pName); } -static uint64_t -anv_wsi_image_get_modifier(VkImage _image) +static void +anv_wsi_signal_semaphore_for_memory(VkDevice _device, + VkSemaphore _semaphore, + VkDeviceMemory _memory) { - ANV_FROM_HANDLE(anv_image, image, _image); - return image->drm_format_mod; + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore); + ANV_FROM_HANDLE(anv_device_memory, memory, _memory); + + /* Put a BO semaphore with the image BO in the temporary. For BO binary + * semaphores, we always set EXEC_OBJECT_WRITE so this creates a WaR + * hazard with the display engine's read to ensure that no one writes to + * the image before the read is complete. + */ + anv_semaphore_reset_temporary(device, semaphore); + + struct anv_semaphore_impl *impl = &semaphore->temporary; + impl->type = ANV_SEMAPHORE_TYPE_WSI_BO; + impl->bo = anv_bo_ref(memory->bo); +} + +static void +anv_wsi_signal_fence_for_memory(VkDevice _device, + VkFence _fence, + VkDeviceMemory _memory) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_fence, fence, _fence); + ANV_FROM_HANDLE(anv_device_memory, memory, _memory); + + /* Put a BO fence with the image BO in the temporary. For BO fences, we + * always just wait until the BO isn't busy and reads from the BO should + * count as busy. + */ + anv_fence_reset_temporary(device, fence); + + struct anv_fence_impl *impl = &fence->temporary; + impl->type = ANV_FENCE_TYPE_WSI_BO; + impl->bo.bo = anv_bo_ref(memory->bo); + impl->bo.state = ANV_BO_FENCE_STATE_SUBMITTED; } VkResult @@ -55,7 +90,10 @@ return result; physical_device->wsi_device.supports_modifiers = true; - physical_device->wsi_device.image_get_modifier = anv_wsi_image_get_modifier; + physical_device->wsi_device.signal_semaphore_for_memory = + anv_wsi_signal_semaphore_for_memory; + physical_device->wsi_device.signal_fence_for_memory = + anv_wsi_signal_fence_for_memory; return VK_SUCCESS; } @@ -175,7 +213,7 @@ VkSwapchainKHR* pSwapchain) { ANV_FROM_HANDLE(anv_device, device, _device); - struct wsi_device *wsi_device = &device->instance->physicalDevice.wsi_device; + struct wsi_device *wsi_device = &device->physical->wsi_device; const VkAllocationCallbacks *alloc; if (pAllocator) @@ -240,38 +278,9 @@ uint32_t* pImageIndex) { ANV_FROM_HANDLE(anv_device, device, _device); - struct anv_physical_device *pdevice = &device->instance->physicalDevice; - - VkResult result = wsi_common_acquire_next_image2(&pdevice->wsi_device, - _device, - pAcquireInfo, - pImageIndex); - - /* Thanks to implicit sync, the image is ready immediately. However, we - * should wait for the current GPU state to finish. Regardless of the - * result of the presentation, we need to signal the semaphore & fence. - */ - - if (pAcquireInfo->semaphore != VK_NULL_HANDLE) { - /* Put a dummy semaphore in temporary, this is the fastest way to avoid - * any kind of work yet still provide some kind of synchronization. This - * only works because the Mesa WSI code always returns an image - * immediately if available. - */ - ANV_FROM_HANDLE(anv_semaphore, semaphore, pAcquireInfo->semaphore); - anv_semaphore_reset_temporary(device, semaphore); - - struct anv_semaphore_impl *impl = &semaphore->temporary; - - impl->type = ANV_SEMAPHORE_TYPE_DUMMY; - } - - if (pAcquireInfo->fence != VK_NULL_HANDLE) { - result = anv_QueueSubmit(anv_queue_to_handle(&device->queue), - 0, NULL, pAcquireInfo->fence); - } - return result; + return wsi_common_acquire_next_image2(&device->physical->wsi_device, + _device, pAcquireInfo, pImageIndex); } VkResult anv_QueuePresentKHR( @@ -279,10 +288,8 @@ const VkPresentInfoKHR* pPresentInfo) { ANV_FROM_HANDLE(anv_queue, queue, _queue); - struct anv_physical_device *pdevice = - &queue->device->instance->physicalDevice; - return wsi_common_queue_present(&pdevice->wsi_device, + return wsi_common_queue_present(&queue->device->physical->wsi_device, anv_device_to_handle(queue->device), _queue, 0, pPresentInfo); diff -Nru mesa-19.2.8/src/intel/vulkan/anv_wsi_display.c mesa-20.0.8/src/intel/vulkan/anv_wsi_display.c --- mesa-19.2.8/src/intel/vulkan/anv_wsi_display.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/anv_wsi_display.c 2020-06-12 01:21:17.000000000 +0000 @@ -241,7 +241,7 @@ ANV_FROM_HANDLE(anv_device, device, _device); return wsi_display_power_control( - _device, &device->instance->physicalDevice.wsi_device, + _device, &device->physical->wsi_device, display, display_power_info); } @@ -255,7 +255,7 @@ struct anv_fence *fence; VkResult ret; - fence = vk_zalloc2(&device->instance->alloc, allocator, sizeof (*fence), 8, + fence = vk_zalloc2(&device->alloc, allocator, sizeof (*fence), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!fence) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -263,14 +263,14 @@ fence->permanent.type = ANV_FENCE_TYPE_WSI; ret = wsi_register_device_event(_device, - &device->instance->physicalDevice.wsi_device, + &device->physical->wsi_device, device_event_info, allocator, &fence->permanent.fence_wsi); if (ret == VK_SUCCESS) *_fence = anv_fence_to_handle(fence); else - vk_free2(&device->instance->alloc, allocator, fence); + vk_free2(&device->alloc, allocator, fence); return ret; } @@ -293,7 +293,7 @@ fence->permanent.type = ANV_FENCE_TYPE_WSI; ret = wsi_register_display_event( - _device, &device->instance->physicalDevice.wsi_device, + _device, &device->physical->wsi_device, display, display_event_info, allocator, &(fence->permanent.fence_wsi)); if (ret == VK_SUCCESS) @@ -312,6 +312,6 @@ ANV_FROM_HANDLE(anv_device, device, _device); return wsi_get_swapchain_counter( - _device, &device->instance->physicalDevice.wsi_device, + _device, &device->physical->wsi_device, swapchain, flag_bits, value); } diff -Nru mesa-19.2.8/src/intel/vulkan/gen8_cmd_buffer.c mesa-20.0.8/src/intel/vulkan/gen8_cmd_buffer.c --- mesa-19.2.8/src/intel/vulkan/gen8_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/gen8_cmd_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -140,6 +140,14 @@ pc.DepthCacheFlushEnable = true; pc.CommandStreamerStallEnable = true; pc.RenderTargetCacheFlushEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; + + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif } #if GEN_GEN == 9 @@ -179,6 +187,9 @@ pc.DepthStallEnable = true; pc.DepthCacheFlushEnable = true; pc.RenderTargetCacheFlushEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif } } @@ -542,6 +553,19 @@ } #endif +#if GEN_GEN >= 12 + if(cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + db.DepthBoundsTestValueModifyDisable = false; + db.DepthBoundsTestEnableModifyDisable = false; + db.DepthBoundsTestEnable = pipeline->depth_bounds_test_enable; + db.DepthBoundsTestMinValue = d->depth_bounds.min; + db.DepthBoundsTestMaxValue = d->depth_bounds.max; + } + } +#endif + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) { anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { ls.LineStipplePattern = d->line_stipple.pattern; diff -Nru mesa-19.2.8/src/intel/vulkan/genX_blorp_exec.c mesa-20.0.8/src/intel/vulkan/genX_blorp_exec.c --- mesa-19.2.8/src/intel/vulkan/genX_blorp_exec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_blorp_exec.c 2020-06-12 01:21:17.000000000 +0000 @@ -57,17 +57,17 @@ struct blorp_address address, uint32_t delta) { struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + uint64_t address_u64 = 0; VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, - ss_offset, address.buffer, address.offset + delta); + ss_offset, address.buffer, address.offset + delta, + &address_u64); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); void *dest = anv_block_pool_map( - &cmd_buffer->device->surface_state_pool.block_pool, ss_offset); - uint64_t val = ((struct anv_bo*)address.buffer)->offset + address.offset + - delta; - write_reloc(cmd_buffer->device, dest, val, false); + &cmd_buffer->device->surface_state_pool.block_pool, ss_offset, 8); + write_reloc(cmd_buffer->device, dest, address_u64, false); } static uint64_t @@ -139,26 +139,13 @@ struct blorp_address *addr) { struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; - - /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: - * - * "The VF cache needs to be invalidated before binding and then using - * Vertex Buffers that overlap with any previously bound Vertex Buffer - * (at a 64B granularity) since the last invalidation. A VF cache - * invalidate is performed by setting the "VF Cache Invalidation Enable" - * bit in PIPE_CONTROL." - * - * This restriction first appears in the Skylake PRM but the internal docs - * also list it as being an issue on Broadwell. In order to avoid this - * problem, we align all vertex buffer allocations to 64 bytes. - */ struct anv_state vb_state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64); *addr = (struct blorp_address) { .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = vb_state.offset, - .mocs = cmd_buffer->device->default_mocs, + .mocs = cmd_buffer->device->isl_dev.mocs.internal, }; return vb_state.map; @@ -167,24 +154,39 @@ static void blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, const struct blorp_address *addrs, + uint32_t *sizes, unsigned num_vbs) { - /* anv forces all vertex buffers into the low 4GB so there are never any - * transitions that require a VF invalidation. + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + for (unsigned i = 0; i < num_vbs; i++) { + struct anv_address anv_addr = { + .bo = addrs[i].buffer, + .offset = addrs[i].offset, + }; + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + i, anv_addr, sizes[i]); + } + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't + * really matter for blorp because we never call apply_pipe_flushes after + * this point. */ + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + (1 << num_vbs) - 1); } -#if GEN_GEN >= 8 -static struct blorp_address +UNUSED static struct blorp_address blorp_get_workaround_page(struct blorp_batch *batch) { struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; return (struct blorp_address) { - .buffer = &cmd_buffer->device->workaround_bo, + .buffer = cmd_buffer->device->workaround_bo, }; } -#endif static void blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) @@ -193,22 +195,11 @@ */ } -static void -blorp_emit_urb_config(struct blorp_batch *batch, - unsigned vs_entry_size, unsigned sf_entry_size) +static const struct gen_l3_config * +blorp_get_l3_config(struct blorp_batch *batch) { - struct anv_device *device = batch->blorp->driver_ctx; struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; - - assert(sf_entry_size == 0); - - const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 }; - - genX(emit_urb_setup)(device, &cmd_buffer->batch, - cmd_buffer->state.current_l3_config, - VK_SHADER_STAGE_VERTEX_BIT | - VK_SHADER_STAGE_FRAGMENT_BIT, - entry_size); + return cmd_buffer->state.current_l3_config; } void @@ -264,6 +255,20 @@ blorp_exec(batch, params); +#if GEN_GEN >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Taget Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT; +#endif + cmd_buffer->state.gfx.vb_dirty = ~0; cmd_buffer->state.gfx.dirty = ~0; cmd_buffer->state.push_constants_dirty = ~0; diff -Nru mesa-19.2.8/src/intel/vulkan/genX_cmd_buffer.c mesa-20.0.8/src/intel/vulkan/genX_cmd_buffer.c --- mesa-19.2.8/src/intel/vulkan/genX_cmd_buffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_cmd_buffer.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,6 +29,7 @@ #include "vk_util.h" #include "util/fast_idiv_by_const.h" +#include "common/gen_aux_map.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" @@ -39,6 +40,9 @@ #define __gen_address_offset anv_address_add #include "common/gen_mi_builder.h" +static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline); + static void emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) { @@ -52,6 +56,8 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { struct anv_device *device = cmd_buffer->device; + UNUSED const struct gen_device_info *devinfo = &device->info; + uint32_t mocs = device->isl_dev.mocs.internal; /* If we are emitting a new state base address we probably need to re-emit * binding tables. @@ -69,32 +75,56 @@ pc.DCFlushEnable = true; pc.RenderTargetCacheFlushEnable = true; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif +#if GEN_GEN == 12 + /* GEN:BUG:1606662791: + * + * Software must program PIPE_CONTROL command with "HDC Pipeline + * Flush" prior to programming of the below two non-pipeline state : + * * STATE_BASE_ADDRESS + * * 3DSTATE_BINDING_TABLE_POOL_ALLOC + */ + if (devinfo->revision == 0 /* A0 */) + pc.HDCPipelineFlushEnable = true; +#endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode. + */ + uint32_t gen12_wa_pipeline = cmd_buffer->state.current_pipeline; + genX(flush_pipeline_select_3d)(cmd_buffer); +#endif + anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.GeneralStateMOCS = GENX(MOCS); + sba.GeneralStateMOCS = mocs; sba.GeneralStateBaseAddressModifyEnable = true; - sba.StatelessDataPortAccessMOCS = GENX(MOCS); + sba.StatelessDataPortAccessMOCS = mocs; sba.SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer); - sba.SurfaceStateMOCS = GENX(MOCS); + sba.SurfaceStateMOCS = mocs; sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddress = (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; - sba.DynamicStateMOCS = GENX(MOCS); + sba.DynamicStateMOCS = mocs; sba.DynamicStateBaseAddressModifyEnable = true; sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; - sba.IndirectObjectMOCS = GENX(MOCS); + sba.IndirectObjectMOCS = mocs; sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddress = (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; - sba.InstructionMOCS = GENX(MOCS); + sba.InstructionMOCS = mocs; sba.InstructionBaseAddressModifyEnable = true; # if (GEN_GEN >= 8) @@ -102,13 +132,21 @@ * these fields. However, since we will be growing the BO's live, we * just set them all to the maximum. */ - sba.GeneralStateBufferSize = 0xfffff; + sba.GeneralStateBufferSize = 0xfffff; + sba.IndirectObjectBufferSize = 0xfffff; + if (device->physical->use_softpin) { + /* With softpin, we use fixed addresses so we actually know how big + * our base addresses are. + */ + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + } else { + sba.DynamicStateBufferSize = 0xfffff; + sba.InstructionBufferSize = 0xfffff; + } sba.GeneralStateBufferSizeModifyEnable = true; - sba.DynamicStateBufferSize = 0xfffff; - sba.DynamicStateBufferSizeModifyEnable = true; - sba.IndirectObjectBufferSize = 0xfffff; sba.IndirectObjectBufferSizeModifyEnable = true; - sba.InstructionBufferSize = 0xfffff; + sba.DynamicStateBufferSizeModifyEnable = true; sba.InstructionBuffersizeModifyEnable = true; # else /* On gen7, we have upper bounds instead. According to the docs, @@ -129,7 +167,7 @@ sba.InstructionAccessUpperBoundModifyEnable = true; # endif # if (GEN_GEN >= 9) - if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + if (cmd_buffer->device->physical->use_softpin) { sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { .bo = device->surface_state_pool.block_pool.bo, .offset = 0, @@ -139,17 +177,26 @@ sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS; sba.BindlessSurfaceStateSize = 0; } - sba.BindlessSurfaceStateMOCS = GENX(MOCS); + sba.BindlessSurfaceStateMOCS = mocs; sba.BindlessSurfaceStateBaseAddressModifyEnable = true; # endif # if (GEN_GEN >= 10) sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; - sba.BindlessSamplerStateMOCS = GENX(MOCS); + sba.BindlessSamplerStateMOCS = mocs; sba.BindlessSamplerStateBaseAddressModifyEnable = true; sba.BindlessSamplerStateBufferSize = 0; # endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Put the pipeline back into its current mode. + */ + if (gen12_wa_pipeline != UINT32_MAX) + genX(flush_pipeline_select)(cmd_buffer, gen12_wa_pipeline); +#endif + /* After re-setting the surface state base address, we have to do some * cache flusing so that the sampler engine will pick up the new * SURFACE_STATE objects and binding tables. From the Broadwell PRM, @@ -203,7 +250,7 @@ VkResult result = anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.offset + isl_dev->ss.addr_offset, - addr.bo, addr.offset); + addr.bo, addr.offset, NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -222,7 +269,9 @@ anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.aux_addr_offset, - state.aux_address.bo, state.aux_address.offset); + state.aux_address.bo, + state.aux_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -233,7 +282,9 @@ &cmd_buffer->pool->alloc, state.state.offset + isl_dev->ss.clear_color_state_offset, - state.clear_address.bo, state.clear_address.offset); + state.clear_address.bo, + state.clear_address.offset, + NULL); if (result != VK_SUCCESS) anv_batch_set_error(&cmd_buffer->batch, result); } @@ -265,6 +316,7 @@ att_state->aux_usage = anv_layout_to_aux_usage(&device->info, iview->image, VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); /* If we don't have aux, then we should have returned early in the layer @@ -287,7 +339,8 @@ * In other words, we can only sample from a fast-cleared image if it * also supports color compression. */ - if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format)) { + if (isl_format_supports_ccs_e(&device->info, iview->planes[0].isl.format) && + isl_format_supports_ccs_d(&device->info, iview->planes[0].isl.format)) { att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D; /* While fast-clear resolves and partial resolves are fairly cheap in the @@ -298,7 +351,7 @@ */ if (cmd_state->pass->attachments[att].first_subpass_layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Not temporarily enabling CCS_E."); } } else { @@ -362,13 +415,13 @@ if (att_state->fast_clear && (iview->planes[0].isl.base_level > 0 || iview->planes[0].isl.base_array_layer > 0)) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Rendering with multi-lod or multi-layer framebuffer " "with LOAD_OP_LOAD and baseMipLevel > 0 or " "baseArrayLayer > 0. Not fast clearing."); att_state->fast_clear = false; } else if (att_state->fast_clear && cmd_state->framebuffer->layers > 1) { - anv_perf_warn(device->instance, iview->image, + anv_perf_warn(device, iview->image, "Rendering to a multi-layer framebuffer with " "LOAD_OP_CLEAR. Only fast-clearing the first slice"); } @@ -394,6 +447,11 @@ att_state->aux_usage = ISL_AUX_USAGE_NONE; att_state->input_aux_usage = ISL_AUX_USAGE_NONE; + /* This is unused for depth/stencil but valgrind complains if it + * isn't initialized + */ + att_state->clear_color_is_zero_one = false; + if (GEN_GEN == 7) { /* We don't do any HiZ or depth fast-clears on gen7 yet */ att_state->fast_clear = false; @@ -416,13 +474,13 @@ const enum isl_aux_usage first_subpass_aux_usage = anv_layout_to_aux_usage(&device->info, iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, pass_att->first_subpass_layout); - if (first_subpass_aux_usage != ISL_AUX_USAGE_HIZ) - return; - - if (!blorp_can_hiz_clear_depth(GEN_GEN, - iview->planes[0].isl.format, - iview->image->samples, + if (!blorp_can_hiz_clear_depth(&device->info, + &iview->image->planes[0].surface.isl, + first_subpass_aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, render_area.offset.x, render_area.offset.y, render_area.offset.x + @@ -470,34 +528,52 @@ VkImageLayout initial_layout, VkImageLayout final_layout) { - const bool hiz_enabled = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, initial_layout); - const bool enable_hiz = ISL_AUX_USAGE_HIZ == - anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, final_layout); - - enum isl_aux_op hiz_op; - if (hiz_enabled && !enable_hiz) { - hiz_op = ISL_AUX_OP_FULL_RESOLVE; - } else if (!hiz_enabled && enable_hiz) { - hiz_op = ISL_AUX_OP_AMBIGUATE; - } else { - assert(hiz_enabled == enable_hiz); - /* If the same buffer will be used, no resolves are necessary. */ - hiz_op = ISL_AUX_OP_NONE; - } + uint32_t depth_plane = + anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT); + if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE) + return; - if (hiz_op != ISL_AUX_OP_NONE) + const enum isl_aux_state initial_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + initial_layout); + const enum isl_aux_state final_state = + anv_layout_to_aux_state(&cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + final_layout); + + const bool initial_depth_valid = + isl_aux_state_has_valid_primary(initial_state); + const bool initial_hiz_valid = + isl_aux_state_has_valid_aux(initial_state); + const bool final_needs_depth = + isl_aux_state_has_valid_primary(final_state); + const bool final_needs_hiz = + isl_aux_state_has_valid_aux(final_state); + + /* Getting into the pass-through state for Depth is tricky and involves + * both a resolve and an ambiguate. We don't handle that state right now + * as anv_layout_to_aux_state never returns it. + */ + assert(final_state != ISL_AUX_STATE_PASS_THROUGH); + + if (final_needs_depth && !initial_depth_valid) { + assert(initial_hiz_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 0, 1, ISL_AUX_OP_FULL_RESOLVE); + } else if (final_needs_hiz && !initial_hiz_valid) { + assert(initial_depth_valid); anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - 0, 0, 1, hiz_op); + 0, 0, 1, ISL_AUX_OP_AMBIGUATE); + } } static inline bool vk_image_layout_stencil_write_optimal(VkImageLayout layout) { return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || - layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL; + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR; } /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless @@ -518,18 +594,19 @@ /* On gen7, we have to store a texturable version of the stencil buffer in * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and - * forth at strategic points. Stencil writes are only allowed in three + * forth at strategic points. Stencil writes are only allowed in following * layouts: * * - VK_IMAGE_LAYOUT_GENERAL * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR * * For general, we have no nice opportunity to transition so we do the copy - * to the shadow unconditionally at the end of the subpass. For transfer - * destinations, we can update it as part of the transfer op. For the - * other two, we delay the copy until a transition into some other layout. + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. */ if (image->planes[plane].shadow_surface.isl.size_B > 0 && vk_image_layout_stencil_write_optimal(initial_layout) && @@ -725,6 +802,7 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, enum isl_format format, + struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t level, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -746,17 +824,18 @@ * to do a partial resolve on a CCS_D surface. */ if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && - image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) resolve_op = ISL_AUX_OP_FULL_RESOLVE; - anv_image_ccs_op(cmd_buffer, image, format, aspect, level, - array_layer, 1, resolve_op, NULL, true); + anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect, + level, array_layer, 1, resolve_op, NULL, true); } static void anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, enum isl_format format, + struct isl_swizzle swizzle, VkImageAspectFlagBits aspect, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -770,7 +849,7 @@ aspect, 0, array_layer, resolve_op, fast_clear_supported); - anv_image_mcs_op(cmd_buffer, image, format, aspect, + anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect, array_layer, 1, resolve_op, NULL, true); #else unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); @@ -921,6 +1000,105 @@ } } +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +#if GEN_GEN == 12 +static void +anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect); + + uint64_t base_address = + anv_address_physical(image->planes[plane].address); + + const struct isl_surf *isl_surf = &image->planes[plane].surface.isl; + uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf); + + /* We're about to live-update the AUX-TT. We really don't want anyone else + * trying to read it while we're doing this. We could probably get away + * with not having this stall in some cases if we were really careful but + * it's better to play it safe. Full stall the GPU. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0; + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t logical_array_layer, logical_z_offset_px; + if (image->type == VK_IMAGE_TYPE_3D) { + logical_array_layer = 0; + + /* If the given miplevel does not have this layer, then any higher + * miplevels won't either because miplevels only get smaller the + * higher the LOD. + */ + assert(layer < image->extent.depth); + if (layer >= anv_minify(image->extent.depth, level)) + break; + logical_z_offset_px = layer; + } else { + assert(layer < image->array_size); + logical_array_layer = layer; + logical_z_offset_px = 0; + } + + uint32_t slice_start_offset_B, slice_end_offset_B; + isl_surf_get_image_range_B_tile(isl_surf, level, + logical_array_layer, + logical_z_offset_px, + &slice_start_offset_B, + &slice_end_offset_B); + + start_offset_B = MIN2(start_offset_B, slice_start_offset_B); + end_offset_B = MAX2(end_offset_B, slice_end_offset_B); + } + + /* Aux operates 64K at a time */ + start_offset_B = align_down_u64(start_offset_B, 64 * 1024); + end_offset_B = align_u64(end_offset_B, 64 * 1024); + + for (uint64_t offset = start_offset_B; + offset < end_offset_B; offset += 64 * 1024) { + uint64_t address = base_address + offset; + + uint64_t aux_entry_addr64, *aux_entry_map; + aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx, + address, &aux_entry_addr64); + + assert(cmd_buffer->device->physical->use_softpin); + struct anv_address aux_entry_address = { + .bo = NULL, + .offset = aux_entry_addr64, + }; + + const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map); + uint64_t new_aux_entry = + (old_aux_entry & GEN_AUX_MAP_ADDRESS_MASK) | format_bits; + + if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) + new_aux_entry |= GEN_AUX_MAP_ENTRY_VALID_BIT; + + gen_mi_store(&b, gen_mi_mem64(aux_entry_address), + gen_mi_imm(new_aux_entry)); + } + } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; +} +#endif /* GEN_GEN == 12 */ + /** * @brief Transitions a color buffer from one layout to another. * @@ -941,7 +1119,8 @@ VkImageLayout initial_layout, VkImageLayout final_layout) { - const struct gen_device_info *devinfo = &cmd_buffer->device->info; + struct anv_device *device = cmd_buffer->device; + const struct gen_device_info *devinfo = &device->info; /* Validate the inputs. */ assert(cmd_buffer); assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); @@ -986,10 +1165,20 @@ if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) return; - assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); + assert(image->planes[plane].surface.isl.tiling != ISL_TILING_LINEAR); if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { +#if GEN_GEN == 12 + if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, aspect, + base_level, level_count, + base_layer, layer_count); + } +#else + assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); +#endif + /* A subresource in the undefined layout may have been aliased and * populated with any arrangement of bits. Therefore, we must initialize * the related aux buffer and clear buffer entry with desirable values. @@ -1046,6 +1235,7 @@ anv_image_ccs_op(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, level, base_layer, level_layer_count, ISL_AUX_OP_AMBIGUATE, NULL, false); @@ -1057,7 +1247,7 @@ } } else { if (image->samples == 4 || image->samples == 16) { - anv_perf_warn(cmd_buffer->device->instance, image, + anv_perf_warn(cmd_buffer->device, image, "Doing a potentially unnecessary fast-clear to " "define an MCS buffer."); } @@ -1065,6 +1255,7 @@ assert(base_level == 0 && level_count == 1); anv_image_mcs_op(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, base_layer, layer_count, ISL_AUX_OP_FAST_CLEAR, NULL, false); } @@ -1072,9 +1263,9 @@ } const enum isl_aux_usage initial_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, initial_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); const enum isl_aux_usage final_aux_usage = - anv_layout_to_aux_usage(devinfo, image, aspect, final_layout); + anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); /* The current code assumes that there is no mixing of CCS_E and CCS_D. * We can handle transitions between CCS_D/E to and from NONE. What we @@ -1128,7 +1319,7 @@ * we do any more rendering or clearing. */ cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; for (uint32_t l = 0; l < level_count; l++) { uint32_t level = base_level + l; @@ -1144,6 +1335,7 @@ if (image->samples == 1) { anv_cmd_predicated_ccs_resolve(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, level, array_layer, resolve_op, final_fast_clear); } else { @@ -1157,6 +1349,7 @@ anv_cmd_predicated_mcs_resolve(cmd_buffer, image, image->planes[plane].surface.isl.format, + ISL_SWIZZLE_IDENTITY, aspect, array_layer, resolve_op, final_fast_clear); } @@ -1164,7 +1357,7 @@ } cmd_buffer->state.pending_pipe_bits |= - ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_END_OF_PIPE_SYNC_BIT; } /** @@ -1285,6 +1478,7 @@ } state->attachments[i].current_layout = att->initial_layout; + state->attachments[i].current_stencil_layout = att->stencil_initial_layout; state->attachments[i].pending_clear_aspects = clear_aspects; state->attachments[i].pending_load_aspects = load_aspects; if (clear_aspects) @@ -1380,9 +1574,18 @@ * executing anything. The chances are fairly high that they will use * blorp at least once per primary command buffer so it shouldn't be * wasted. + * + * There is also a workaround on gen8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gen8_vb_flush).) */ - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) - cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + + /* Re-emit the aux table register in every command buffer. This way we're + * ensured that we have the table even if this command buffer doesn't + * initialize any images. + */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT; /* We send an "Indirect State Pointers Disable" packet at * EndCommandBuffer, so all push contant packets are ignored during a @@ -1419,7 +1622,9 @@ enum isl_aux_usage aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image, - VK_IMAGE_ASPECT_DEPTH_BIT, layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); cmd_buffer->state.hiz_enabled = aux_usage == ISL_AUX_USAGE_HIZ; } @@ -1587,6 +1792,14 @@ anv_cmd_buffer_add_secondary(primary, secondary); } + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GEN_GEN >= 8 && GEN_GEN <= 9) { + primary->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } + /* The secondary may have selected a different pipeline (3D or compute) and * may have changed the current L3$ configuration. Reset our tracking * variables to invalid values to ensure that we re-emit these in the case @@ -1627,7 +1840,7 @@ gen_dump_l3_config(cfg, stderr); } - const bool has_slm = cfg->n[GEN_L3P_SLM]; + UNUSED const bool has_slm = cfg->n[GEN_L3P_SLM]; /* According to the hardware docs, the L3 partitioning can only be changed * while the pipeline is completely drained and the caches are flushed, @@ -1674,9 +1887,19 @@ assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); +#if GEN_GEN >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + uint32_t l3cr; - anv_pack_struct(&l3cr, GENX(L3CNTLREG), + anv_pack_struct(&l3cr, L3_ALLOCATION_REG, +#if GEN_GEN < 11 .SLMEnable = has_slm, +#endif #if GEN_GEN == 11 /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set * in L3CNTLREG register. The default setting of the bit is not the @@ -1691,7 +1914,7 @@ .AllAllocation = cfg->n[GEN_L3P_ALL]); /* Set up the L3 partitioning. */ - emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr); + emit_lri(&cmd_buffer->batch, L3_ALLOCATION_REG_num, l3cr); #else @@ -1753,7 +1976,7 @@ emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3); #if GEN_IS_HASWELL - if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) { + if (cmd_buffer->device->physical->cmd_parser_version >= 4) { /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep * them disabled to avoid crashing the system hard. */ @@ -1778,33 +2001,167 @@ { enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; - /* Flushes are pipelined while invalidations are handled immediately. - * Therefore, if we're flushing anything then we need to schedule a stall - * before any invalidations can happen. + if (cmd_buffer->device->physical->always_flush_cache) + bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + + /* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + * + * In other words, flushes are pipelined while invalidations are handled + * immediately. Therefore, if we're flushing anything then we need to + * schedule an end-of-pipe sync before any invalidations can happen. */ if (bits & ANV_PIPE_FLUSH_BITS) - bits |= ANV_PIPE_NEEDS_CS_STALL_BIT; + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; - /* If we're going to do an invalidate and we have a pending CS stall that - * has yet to be resolved, we do the CS stall now. + + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." + */ + if (GEN_GEN == 12 && ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + /* If we're going to do an invalidate and we have a pending end-of-pipe + * sync that has yet to be resolved, we do the end-of-pipe sync now. */ if ((bits & ANV_PIPE_INVALIDATE_BITS) && - (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) { - bits |= ANV_PIPE_CS_STALL_BIT; - bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT; + (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { + bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + } + + if (GEN_GEN >= 12 && + ((bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) || + (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT))) { + /* From the PIPE_CONTROL instruction table, bit 28 (Tile Cache Flush + * Enable): + * + * Unified Cache (Tile Cache Disabled): + * + * When the Color and Depth (Z) streams are enabled to be cached in + * the DC space of L2, Software must use "Render Target Cache Flush + * Enable" and "Depth Cache Flush Enable" along with "Tile Cache + * Flush" for getting the color and depth (Z) write data to be + * globally observable. In this mode of operation it is not required + * to set "CS Stall" upon setting "Tile Cache Flush" bit. + */ + bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + } + + /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which + * invalidates the instruction cache + */ + if (GEN_GEN == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)) + bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + if ((GEN_GEN >= 8 && GEN_GEN <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + + /* Project: SKL / Argument: LRI Post Sync Operation [23] + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "LRI + * Post Sync Operation" in GPGPU mode of operation (i.e when + * PIPELINE_SELECT command is set to GPGPU mode of operation)." + * + * The same text exists a few rows below for Post Sync Op. + */ + if (bits & ANV_PIPE_POST_SYNC_BIT) { + if (GEN_GEN == 9 && cmd_buffer->state.current_pipeline == GPGPU) + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_POST_SYNC_BIT; } - if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; +#endif pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; pipe.RenderTargetCacheFlushEnable = bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ +#if GEN_GEN >= 12 + pipe.DepthStallEnable = + pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); +#else pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; +#endif + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a + * synchronization point is to write a value out to memory. An + * immediate value (included with the synchronization command) may + * be written." + * + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be + * read back in to the render engine in coherent manner, then the + * render engine has to wait for the fence completion before + * accessing the flushed data. This can be achieved by following + * means on various products: PIPE_CONTROL command with CS Stall + * and the required write caches flushed with Post-Sync-Operation + * as Write Immediate Data. + * + * Example: + * - Workload-1 (3D/GPGPU/MEDIA) + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + */ + if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) { + pipe.CommandStreamerStallEnable = true; + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + /* * According to the Broadwell documentation, any PIPE_CONTROL with the * "Command Streamer Stall" bit set must also have another bit set, @@ -1820,9 +2177,13 @@ * I chose "Stall at Pixel Scoreboard" since that's what we use in * mesa and it seems to work fine. The choice is fairly arbitrary. */ - if ((bits & ANV_PIPE_CS_STALL_BIT) && - !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT | - ANV_PIPE_STALL_AT_SCOREBOARD_BIT))) + if (pipe.CommandStreamerStallEnable && + !pipe.RenderTargetCacheFlushEnable && + !pipe.DepthCacheFlushEnable && + !pipe.StallAtPixelScoreboard && + !pipe.PostSyncOperation && + !pipe.DepthStallEnable && + !pipe.DCFlushEnable) pipe.StallAtPixelScoreboard = true; } @@ -1832,7 +2193,51 @@ if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); - bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); + if (GEN_IS_HASWELL) { + /* Haswell needs addition work-arounds: + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Option 1: + * PIPE_CONTROL command with the CS Stall and the required write + * caches flushed with Post-SyncOperation as Write Immediate Data + * followed by eight dummy MI_STORE_DATA_IMM (write to scratch + * spce) commands. + * + * Example: + * - Workload-1 + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + * + * Unfortunately, both the PRMs and the internal docs are a bit + * out-of-date in this regard. What the windows driver does (and + * this appears to actually work) is to emit a register read from the + * memory address written by the pipe control above. + * + * What register we load into doesn't matter. We choose an indirect + * rendering register because we know it always exists and it's one + * of the first registers the command parser allows us to write. If + * you don't have command parser support in your kernel (pre-4.2), + * this will get turned into MI_NOOP and you won't get the + * workaround. Unfortunately, there's just not much we can do in + * that case. This register is perfectly safe to write since we + * always re-load all of the indirect draw registers right before + * 3DPRIMITIVE when needed anyway. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = 0x243C; /* GEN7_3DPRIM_START_INSTANCE */ + lrm.MemoryAddress = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + } + + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT); } if (bits & ANV_PIPE_INVALIDATE_BITS) { @@ -1870,9 +2275,19 @@ if (GEN_GEN == 9 && pipe.VFCacheInvalidationEnable) { pipe.PostSyncOperation = WriteImmediateData; pipe.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } + +#if GEN_GEN == 12 + if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && + cmd_buffer->device->info.has_aux_map) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); + lri.DataDWord = 1; } } +#endif bits &= ~ANV_PIPE_INVALIDATE_BITS; } @@ -2029,34 +2444,6 @@ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; } -static const struct anv_descriptor * -anv_descriptor_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - const uint32_t offset = - set->layout->binding[binding->binding].descriptor_index; - return &set->descriptors[offset + binding->index]; -} - -static uint32_t -dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state, - const struct anv_pipeline_binding *binding) -{ - assert(binding->set < MAX_SETS); - const struct anv_descriptor_set *set = - pipe_state->descriptors[binding->set]; - - uint32_t dynamic_offset_idx = - pipe_state->layout->set[binding->set].dynamic_offset_start + - set->layout->binding[binding->binding].dynamic_offset_index + - binding->index; - - return pipe_state->dynamic_offsets[dynamic_offset_idx]; -} - static struct anv_address anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer, struct anv_descriptor_set *set) @@ -2064,7 +2451,7 @@ if (set->pool) { /* This is a normal descriptor set */ return (struct anv_address) { - .bo = &set->pool->bo, + .bo = set->pool->bo, .offset = set->desc_mem.offset, }; } else { @@ -2125,17 +2512,21 @@ * softpin then we always keep all user-allocated memory objects resident. */ const bool need_client_mem_relocs = - !cmd_buffer->device->instance->physicalDevice.use_softpin; + !cmd_buffer->device->physical->use_softpin; for (uint32_t s = 0; s < map->surface_count; s++) { struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; struct anv_state surface_state; - if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) { + switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: /* Color attachment binding */ assert(stage == MESA_SHADER_FRAGMENT); - assert(binding->binding == 0); if (binding->index < subpass->color_count) { const unsigned att = subpass->color_attachments[binding->index].attachment; @@ -2156,8 +2547,9 @@ } bt_map[s] = surface_state.offset + state_offset; - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2176,12 +2568,12 @@ bt_map[s] = surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, surface_state, constant_data); - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS) { + break; + } + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { /* This is always the first binding for compute shaders */ assert(stage == MESA_SHADER_COMPUTE && s == 0); - if (!get_cs_prog_data(pipeline)->uses_num_work_groups) - continue; struct anv_state surface_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer); @@ -2197,47 +2589,35 @@ add_surface_reloc(cmd_buffer, surface_state, cmd_buffer->state.compute.num_workgroups); } - continue; - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { + break; + } + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { /* This is a descriptor set buffer so the set index is actually * given by binding->binding. (Yes, that's confusing.) */ struct anv_descriptor_set *set = - pipe_state->descriptors[binding->binding]; + pipe_state->descriptors[binding->index]; assert(set->desc_mem.alloc_size); assert(set->desc_surface_state.alloc_size); bt_map[s] = set->desc_surface_state.offset + state_offset; add_surface_reloc(cmd_buffer, set->desc_surface_state, anv_descriptor_set_address(cmd_buffer, set)); - continue; + break; } - const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); - - switch (desc->type) { - case VK_DESCRIPTOR_TYPE_SAMPLER: - /* Nothing for us to do here */ - continue; + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; + + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { - struct anv_surface_state sstate = - (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? - desc->image_view->planes[binding->plane].general_sampler_surface_state : - desc->image_view->planes[binding->plane].optimal_sampler_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) - add_surface_state_relocs(cmd_buffer, sstate); - break; - } - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - assert(stage == MESA_SHADER_FRAGMENT); - if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { - /* For depth and stencil input attachments, we treat it like any - * old texture that a user may have bound. - */ + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: { struct anv_surface_state sstate = (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? desc->image_view->planes[binding->plane].general_sampler_surface_state : @@ -2246,104 +2626,106 @@ assert(surface_state.alloc_size); if (need_client_mem_relocs) add_surface_state_relocs(cmd_buffer, sstate); - } else { - /* For color input attachments, we create the surface state at - * vkBeginRenderPass time so that we can include aux and clear - * color information. - */ - assert(binding->input_attachment_index < subpass->input_count); - const unsigned subpass_att = binding->input_attachment_index; - const unsigned att = subpass->input_attachments[subpass_att].attachment; - surface_state = cmd_buffer->state.attachments[att].input.state; + break; } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - struct anv_surface_state sstate = (binding->write_only) - ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state - : desc->image_view->planes[binding->plane].storage_surface_state; - surface_state = sstate.state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) - add_surface_state_relocs(cmd_buffer, sstate); - break; - } + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + assert(stage == MESA_SHADER_FRAGMENT); + if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) { + /* For depth and stencil input attachments, we treat it like any + * old texture that a user may have bound. + */ + assert(desc->image_view->n_planes == 1); + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + desc->image_view->planes[0].general_sampler_surface_state : + desc->image_view->planes[0].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + /* For color input attachments, we create the surface state at + * vkBeginRenderPass time so that we can include aux and clear + * color information. + */ + assert(binding->input_attachment_index < subpass->input_count); + const unsigned subpass_att = binding->input_attachment_index; + const unsigned att = subpass->input_attachments[subpass_att].attachment; + surface_state = cmd_buffer->state.attachments[att].input.state; + } + break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - surface_state = desc->buffer_view->surface_state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) { - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + struct anv_surface_state sstate = (binding->write_only) + ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + break; } - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - /* Compute the offset within the buffer */ - uint32_t dynamic_offset = - dynamic_offset_for_binding(pipe_state, binding); - uint64_t offset = desc->offset + dynamic_offset; - /* Clamp to the buffer size */ - offset = MIN2(offset, desc->buffer->size); - /* Clamp the range to the buffer size */ - uint32_t range = MIN2(desc->range, desc->buffer->size - offset); - - struct anv_address address = - anv_address_add(desc->buffer->address, offset); - - surface_state = - anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); - enum isl_format format = - anv_isl_format_for_descriptor_type(desc->type); + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; - anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, - format, address, range, 1); - if (need_client_mem_relocs) - add_surface_reloc(cmd_buffer, surface_state, address); - break; - } + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Compute the offset within the buffer */ + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(desc->type); - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - surface_state = (binding->write_only) - ? desc->buffer_view->writeonly_storage_surface_state - : desc->buffer_view->storage_surface_state; - assert(surface_state.alloc_size); - if (need_client_mem_relocs) { - add_surface_reloc(cmd_buffer, surface_state, - desc->buffer_view->address); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + break; } - break; - - default: - assert(!"Invalid descriptor type"); - continue; - } - bt_map[s] = surface_state.offset + state_offset; - } + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + surface_state = (binding->write_only) + ? desc->buffer_view->writeonly_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + break; -#if GEN_GEN >= 11 - /* The PIPE_CONTROL command description says: - * - * "Whenever a Binding Table Index (BTI) used by a Render Taget Message - * points to a different RENDER_SURFACE_STATE, SW must issue a Render - * Target Cache Flush by enabling this bit. When render target flush - * is set due to new association of BTI, PS Scoreboard Stall bit must - * be set in this packet." - * - * FINISHME: Currently we shuffle around the surface states in the binding - * table based on if they are getting used or not. So, we've to do below - * pipe control flush for every binding table upload. Make changes so - * that we do it only when we modify render target surface states. - */ - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.RenderTargetCacheFlushEnable = true; - pc.StallAtPixelScoreboard = true; + default: + assert(!"Invalid descriptor type"); + continue; + } + bt_map[s] = surface_state.offset + state_offset; + break; + } + } } -#endif return VK_SUCCESS; } @@ -2378,7 +2760,7 @@ for (uint32_t s = 0; s < map->sampler_count; s++) { struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; const struct anv_descriptor *desc = - anv_descriptor_for_binding(pipe_state, binding); + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) @@ -2400,10 +2782,9 @@ } static uint32_t -flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) +flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer, + struct anv_pipeline *pipeline) { - struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; - VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty & pipeline->active_stages; @@ -2496,10 +2877,153 @@ } } +#if GEN_GEN >= 8 || GEN_IS_HASWELL +static struct anv_address +get_push_range_address(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + const struct anv_push_range *range) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) + */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + return anv_descriptor_set_address(cmd_buffer, set); + break; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + struct anv_state state = + anv_cmd_buffer_push_constants(cmd_buffer, stage); + return (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + break; + } + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + return desc->buffer_view->address; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + struct anv_push_constants *push = + &cmd_buffer->state.push_constants[stage]; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + return anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); + } + } + } +} +#endif + static void -cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, - VkShaderStageFlags dirty_stages) +cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, unsigned buffer_count) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_pipeline *pipeline = gfx_state->base.pipeline; + + static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + [MESA_SHADER_COMPUTE] = 0, + }; + + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { + c._3DCommandSubOpcode = push_constant_opcodes[stage]; + + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + +#if GEN_GEN >= 12 + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; +#endif + +#if GEN_GEN >= 8 || GEN_IS_HASWELL + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + assert(buffer_count <= 4); + const unsigned shift = 4 - buffer_count; + for (unsigned i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + + /* At this point we only have non-empty ranges */ + assert(range->length > 0); + + /* For Ivy Bridge, make sure we only set the first range (actual + * push constants) + */ + assert((GEN_GEN >= 8 || GEN_IS_HASWELL) || i == 0); + + const struct anv_address addr = + get_push_range_address(cmd_buffer, stage, range); + c.ConstantBody.ReadLength[i + shift] = range->length; + c.ConstantBody.Buffer[i + shift] = + anv_address_add(addr, range->start * 32); + } +#else + /* For Ivy Bridge, push constants are relative to dynamic state + * base address and we only ever push actual push constants. + */ + if (bind_map->push_ranges[0].length > 0) { + assert(bind_map->push_ranges[0].set == + ANV_DESCRIPTOR_SET_PUSH_CONSTANTS); + struct anv_state state = + anv_cmd_buffer_push_constants(cmd_buffer, stage); + c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length; + c.ConstantBody.Buffer[0].bo = NULL; + c.ConstantBody.Buffer[0].offset = state.offset; + } + assert(bind_map->push_ranges[1].length == 0); + assert(bind_map->push_ranges[2].length == 0); + assert(bind_map->push_ranges[3].length == 0); +#endif + } + } +} + +#if GEN_GEN >= 12 +static void +cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer, + uint32_t shader_mask, uint32_t count) { + if (count == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.ShaderUpdateEnable = shader_mask; + c.MOCS = cmd_buffer->device->isl_dev.mocs.internal; + } + return; + } + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; const struct anv_pipeline *pipeline = gfx_state->base.pipeline; @@ -2512,115 +3036,97 @@ [MESA_SHADER_COMPUTE] = 0, }; - VkShaderStageFlags flushed = 0; - - anv_foreach_stage(stage, dirty_stages) { - assert(stage < ARRAY_SIZE(push_constant_opcodes)); - assert(push_constant_opcodes[stage] > 0); - - anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { - c._3DCommandSubOpcode = push_constant_opcodes[stage]; - - if (anv_pipeline_has_stage(pipeline, stage)) { -#if GEN_GEN >= 8 || GEN_IS_HASWELL - const struct brw_stage_prog_data *prog_data = - pipeline->shaders[stage]->prog_data; - const struct anv_pipeline_bind_map *bind_map = - &pipeline->shaders[stage]->bind_map; + gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + assert(push_constant_opcodes[stage] > 0); + + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + uint32_t *dw; + const uint32_t buffers = (1 << count) - 1; + const uint32_t num_dwords = 2 + 2 * count; + + dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = shader_mask, + .PointerBufferMask = buffers, + .MOCS = cmd_buffer->device->isl_dev.mocs.internal); + + for (int i = 0; i < count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + const struct anv_address addr = + get_push_range_address(cmd_buffer, stage, range); + + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + &cmd_buffer->batch, dw + 2 + i * 2, + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = anv_address_add(addr, range->start * 32), + .ConstantBufferReadLength = range->length, + }); + } +} +#endif - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. - */ - int n = 3; +static void +cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + VkShaderStageFlags flushed = 0; + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_pipeline *pipeline = gfx_state->base.pipeline; - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; - if (range->length == 0) - continue; - - const unsigned surface = - prog_data->binding_table.ubo_start + range->block; - - assert(surface <= bind_map->surface_count); - const struct anv_pipeline_binding *binding = - &bind_map->surface_to_descriptor[surface]; - - struct anv_address addr; - if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) { - addr = (struct anv_address) { - .bo = pipeline->device->dynamic_state_pool.block_pool.bo, - .offset = pipeline->shaders[stage]->constant_data.offset, - }; - } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) { - /* This is a descriptor set buffer so the set index is - * actually given by binding->binding. (Yes, that's - * confusing.) - */ - struct anv_descriptor_set *set = - gfx_state->base.descriptors[binding->binding]; - addr = anv_descriptor_set_address(cmd_buffer, set); - } else { - const struct anv_descriptor *desc = - anv_descriptor_for_binding(&gfx_state->base, binding); +#if GEN_GEN >= 12 + uint32_t nobuffer_stages = 0; +#endif - if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - addr = desc->buffer_view->address; - } else { - assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); - - uint32_t dynamic_offset = - dynamic_offset_for_binding(&gfx_state->base, binding); - addr = anv_address_add(desc->buffer->address, - desc->offset + dynamic_offset); - } - } + anv_foreach_stage(stage, dirty_stages) { + unsigned buffer_count = 0; + flushed |= mesa_to_vk_shader_stage(stage); + uint32_t max_push_range = 0; - c.ConstantBody.Buffer[n] = - anv_address_add(addr, range->start * 32); - c.ConstantBody.ReadLength[n] = range->length; - n--; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length > 0) { + buffer_count++; + if (GEN_GEN >= 12 && range->length > max_push_range) + max_push_range = range->length; } + } + } - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); - - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[n] = (struct anv_address) { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = state.offset, - }; - c.ConstantBody.ReadLength[n] = - DIV_ROUND_UP(state.alloc_size, 32); - } -#else - /* For Ivy Bridge, the push constants packets have a different - * rule that would require us to iterate in the other direction - * and possibly mess around with dynamic state base address. - * Don't bother; just emit regular push constants at n = 0. - */ - struct anv_state state = - anv_cmd_buffer_push_constants(cmd_buffer, stage); +#if GEN_GEN >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet. + */ + if (buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } - if (state.alloc_size > 0) { - c.ConstantBody.Buffer[0].offset = state.offset, - c.ConstantBody.ReadLength[0] = - DIV_ROUND_UP(state.alloc_size, 32); - } -#endif - } + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. + */ + if (max_push_range < 32) { + cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, + buffer_count); + continue; } +#endif - flushed |= mesa_to_vk_shader_stage(stage); + cmd_buffer_emit_push_constant(cmd_buffer, stage, buffer_count); } +#if GEN_GEN >= 12 + if (nobuffer_stages) + cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, 0); +#endif + cmd_buffer->state.push_constants_dirty &= ~flushed; } @@ -2630,10 +3136,6 @@ struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; uint32_t *p; - uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; - if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) - vb_emit |= pipeline->vb_used; - assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); @@ -2642,6 +3144,16 @@ genX(flush_pipeline_select_3d)(cmd_buffer); + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) + vb_emit |= pipeline->vb_used; + if (vb_emit) { const uint32_t num_buffers = __builtin_popcount(vb_emit); const uint32_t num_dwords = 1 + num_buffers * 4; @@ -2673,6 +3185,12 @@ #endif }; +#if GEN_GEN >= 8 && GEN_GEN <= 9 + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); i++; } @@ -2688,11 +3206,16 @@ for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GEN_GEN < 12 sob.SOBufferIndex = idx; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; +#endif if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { sob.SOBufferEnable = true; - sob.MOCS = cmd_buffer->device->default_mocs, + sob.MOCS = cmd_buffer->device->isl_dev.mocs.internal, sob.StreamOffsetWriteEnable = false; sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, xfb->offset); @@ -2711,11 +3234,6 @@ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); - /* The exact descriptor layout is pulled from the pipeline, so we need - * to re-emit binding tables on every pipeline change. - */ - cmd_buffer->state.descriptors_dirty |= pipeline->active_stages; - /* If the pipeline changed, we may need to re-allocate push constant * space in the URB. */ @@ -2739,7 +3257,7 @@ pc.DepthStallEnable = true; pc.PostSyncOperation = WriteImmediateData; pc.Address = - (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; } } #endif @@ -2757,7 +3275,7 @@ */ uint32_t dirty = 0; if (cmd_buffer->state.descriptors_dirty) - dirty = flush_descriptor_sets(cmd_buffer); + dirty = flush_descriptor_sets(cmd_buffer, pipeline); if (dirty || cmd_buffer->state.push_constants_dirty) { /* Because we're pushing UBOs, we have to push whenever either @@ -2785,8 +3303,6 @@ gen7_cmd_buffer_emit_scissor(cmd_buffer); genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); - - genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } static void @@ -2802,7 +3318,8 @@ .VertexBufferIndex = index, .AddressModifyEnable = true, .BufferPitch = 0, - .MOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo), + .MOCS = addr.bo ? anv_mocs_for_bo(cmd_buffer->device, addr.bo) : 0, + .NullVertexBuffer = size == 0, #if (GEN_GEN >= 8) .BufferStartingAddress = addr, .BufferSize = size @@ -2811,31 +3328,38 @@ .EndAddress = anv_address_add(addr, size), #endif }); + + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, + index, addr, size); } static void emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr) { - emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX); + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX); } static void emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, uint32_t base_vertex, uint32_t base_instance) { - struct anv_state id_state = - anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + } else { + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); - ((uint32_t *)id_state.map)[0] = base_vertex; - ((uint32_t *)id_state.map)[1] = base_instance; + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; - struct anv_address addr = { - .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, - .offset = id_state.offset, - }; + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = id_state.offset, + }; - emit_base_vertex_instance_bo(cmd_buffer, addr); + emit_base_vertex_instance_bo(cmd_buffer, addr); + } } static void @@ -2854,6 +3378,25 @@ emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); } +static void +update_dirty_vbs_for_gen8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + void genX(CmdDraw)( VkCommandBuffer commandBuffer, uint32_t vertexCount, @@ -2879,6 +3422,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ @@ -2894,6 +3442,8 @@ prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); } void genX(CmdDrawIndexed)( @@ -2922,6 +3472,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ @@ -2937,6 +3492,8 @@ prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = vertexOffset; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); } /* Auto-Draw / Indirect Registers */ @@ -2976,6 +3533,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + /* Our implementation of VK_KHR_multiview uses instancing to draw the * different views. We need to multiply instanceCount by the view count. */ @@ -3004,6 +3566,8 @@ prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; } + + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); #endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ } @@ -3074,6 +3638,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, false); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3083,6 +3652,8 @@ prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } @@ -3117,6 +3688,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3126,6 +3702,8 @@ prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3220,7 +3798,7 @@ } #endif -void genX(CmdDrawIndirectCountKHR)( +void genX(CmdDrawIndirectCount)( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -3266,6 +3844,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, false); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3275,11 +3858,13 @@ prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, SEQUENTIAL); + offset += stride; } } -void genX(CmdDrawIndexedIndirectCountKHR)( +void genX(CmdDrawIndexedIndirectCount)( VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, @@ -3326,6 +3911,11 @@ if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, i); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + load_indirect_parameters(cmd_buffer, draw, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { @@ -3335,6 +3925,8 @@ prim.PrimitiveTopologyType = pipeline->topology; } + update_dirty_vbs_for_gen8_vb_flush(cmd_buffer, RANDOM); + offset += stride; } } @@ -3440,67 +4032,10 @@ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; } -static VkResult -flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) -{ - struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - struct anv_state surfaces = { 0, }, samplers = { 0, }; - VkResult result; - - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); - - result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); - if (result != VK_SUCCESS) - return result; - - /* Re-emit state base addresses so we get the new surface state base - * address before we start emitting binding tables etc. - */ - genX(cmd_buffer_emit_state_base_address)(cmd_buffer); - - result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; - } - } - - result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); - if (result != VK_SUCCESS) { - anv_batch_set_error(&cmd_buffer->batch, result); - return result; - } - - uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; - struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { - .BindingTablePointer = surfaces.offset, - .SamplerStatePointer = samplers.offset, - }; - GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); - - struct anv_state state = - anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, - pipeline->interface_descriptor_data, - GENX(INTERFACE_DESCRIPTOR_DATA_length), - 64); - - uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); - anv_batch_emit(&cmd_buffer->batch, - GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { - mid.InterfaceDescriptorTotalLength = size; - mid.InterfaceDescriptorDataStartAddress = state.offset; - } - - return VK_SUCCESS; -} - void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) { struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline; - VkResult result; assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); @@ -3508,6 +4043,12 @@ genX(flush_pipeline_select_gpgpu)(cmd_buffer); + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + if (cmd_buffer->state.compute.pipeline_dirty) { /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: * @@ -3521,16 +4062,38 @@ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || cmd_buffer->state.compute.pipeline_dirty) { - /* FIXME: figure out descriptors for gen7 */ - result = flush_compute_descriptor_set(cmd_buffer); - if (result != VK_SUCCESS) - return; + flush_descriptor_sets(cmd_buffer, pipeline); + + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); + + struct anv_state state = + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); - cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + anv_batch_emit(&cmd_buffer->batch, + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = state.offset; + } } if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { @@ -3559,8 +4122,8 @@ int required_version, const char *function) { - if (device->instance->physicalDevice.cmd_parser_version < required_version) { - return vk_errorf(device->instance, device->instance, + if (device->physical->cmd_parser_version < required_version) { + return vk_errorf(device, device->physical, VK_ERROR_FEATURE_NOT_PRESENT, "cmd parser version %d is required for %s", required_version, function); @@ -3582,12 +4145,12 @@ struct anv_push_constants *push = &cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE]; - if (push->base_work_group_id[0] != baseGroupX || - push->base_work_group_id[1] != baseGroupY || - push->base_work_group_id[2] != baseGroupZ) { - push->base_work_group_id[0] = baseGroupX; - push->base_work_group_id[1] = baseGroupY; - push->base_work_group_id[2] = baseGroupZ; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] = baseGroupZ; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } @@ -3632,6 +4195,9 @@ .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, .offset = state.offset, }; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; } genX(cmd_buffer_flush_compute_state)(cmd_buffer); @@ -3682,9 +4248,13 @@ return; #endif - if (prog_data->uses_num_work_groups) + if (prog_data->uses_num_work_groups) { cmd_buffer->state.compute.num_workgroups = addr; + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + genX(cmd_buffer_flush_compute_state)(cmd_buffer); struct gen_mi_builder b; @@ -3796,7 +4366,7 @@ * really know why. */ const uint32_t subslices = - MAX2(cmd_buffer->device->instance->physicalDevice.subslice_total, 1); + MAX2(cmd_buffer->device->physical->subslice_total, 1); anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1; @@ -3829,6 +4399,14 @@ pc.DCFlushEnable = true; pc.PostSyncOperation = NoWrite; pc.CommandStreamerStallEnable = true; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; + + /* GEN:BUG:1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be + * set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { @@ -3837,6 +4415,9 @@ pc.StateCacheInvalidationEnable = true; pc.InstructionCacheInvalidateEnable = true; pc.PostSyncOperation = NoWrite; +#if GEN_GEN >= 12 + pc.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { @@ -3903,12 +4484,129 @@ } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthCacheFlushEnable = true; +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = true; +#endif } anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { pipe.DepthStallEnable = true; } } +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gen8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. + */ +void +genX(cmd_buffer_set_binding_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return; + } + + assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED)); + bound->start = gen_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + if (dirty->end - dirty->start > (1ull << 32)) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GEN_GEN < 8 || GEN_GEN > 9 || + !cmd_buffer->device->physical->use_softpin) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + /** * Update the pixel hashing modes that determine the balancing of PS threads * across subslices and slices. @@ -4075,6 +4773,23 @@ isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); + if (GEN_GEN >= 12) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = + (struct anv_address) { cmd_buffer->device->workaround_bo, 0 }; + } + } cmd_buffer->state.hiz_enabled = info.hiz_usage == ISL_AUX_USAGE_HIZ; } @@ -4185,6 +4900,9 @@ target_layout = subpass->attachments[i].layout; } + VkImageLayout target_stencil_layout = + subpass->attachments[i].stencil_layout; + uint32_t base_layer, layer_count; if (image->type == VK_IMAGE_TYPE_3D) { base_layer = 0; @@ -4208,16 +4926,20 @@ att_state->current_layout, target_layout); att_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, image, - VK_IMAGE_ASPECT_DEPTH_BIT, target_layout); + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + target_layout); } if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { transition_stencil_buffer(cmd_buffer, image, iview->planes[0].isl.base_level, 1, base_layer, layer_count, - att_state->current_layout, target_layout); + att_state->current_stencil_layout, + target_stencil_layout); } att_state->current_layout = target_layout; + att_state->current_stencil_layout = target_stencil_layout; if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) { assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); @@ -4240,6 +4962,7 @@ if (iview->image->samples == 1) { anv_image_ccs_op(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, @@ -4247,6 +4970,7 @@ } else { anv_image_mcs_op(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, @@ -4369,7 +5093,7 @@ if (GEN_GEN < 10 && (att_state->pending_load_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && - image->planes[0].aux_surface.isl.size_B > 0 && + image->planes[0].aux_usage != ISL_AUX_USAGE_NONE && iview->planes[0].isl.base_level == 0 && iview->planes[0].isl.base_array_layer == 0) { if (att_state->aux_usage != ISL_AUX_USAGE_NONE) { @@ -4442,6 +5166,20 @@ } cmd_buffer_emit_depth_stencil(cmd_buffer); + +#if GEN_GEN >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Taget Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT; +#endif } static enum blorp_filter @@ -4559,26 +5297,27 @@ const VkRect2D render_area = cmd_buffer->state.render_area; + struct anv_attachment_state *src_state = + &cmd_state->attachments[src_att]; + struct anv_attachment_state *dst_state = + &cmd_state->attachments[dst_att]; + if ((src_iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { - struct anv_attachment_state *src_state = - &cmd_state->attachments[src_att]; - struct anv_attachment_state *dst_state = - &cmd_state->attachments[dst_att]; - /* MSAA resolves sample from the source attachment. Transition the * depth attachment first to get rid of any HiZ that we may not be * able to handle. */ transition_depth_buffer(cmd_buffer, src_iview->image, src_state->current_layout, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); src_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - src_state->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; /* MSAA resolves write to the resolve attachment as if it were any * other transfer op. Transition the resolve attachment accordingly. @@ -4601,6 +5340,7 @@ dst_state->aux_usage = anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; @@ -4625,6 +5365,9 @@ if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) { + src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE; enum isl_aux_usage dst_aux_usage = ISL_AUX_USAGE_NONE; @@ -4650,18 +5393,19 @@ #if GEN_GEN == 7 /* On gen7, we have to store a texturable version of the stencil buffer in * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and - * forth at strategic points. Stencil writes are only allowed in three + * forth at strategic points. Stencil writes are only allowed in following * layouts: * * - VK_IMAGE_LAYOUT_GENERAL * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR * * For general, we have no nice opportunity to transition so we do the copy - * to the shadow unconditionally at the end of the subpass. For transfer - * destinations, we can update it as part of the transfer op. For the - * other two, we delay the copy until a transition into some other layout. + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. */ if (subpass->depth_stencil_attachment) { uint32_t a = subpass->depth_stencil_attachment->attachment; @@ -4676,7 +5420,7 @@ VK_IMAGE_ASPECT_STENCIL_BIT); if (image->planes[plane].shadow_surface.isl.size_B > 0 && - att_state->current_layout == VK_IMAGE_LAYOUT_GENERAL) { + att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) { assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); anv_image_copy_to_shadow(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, @@ -4714,7 +5458,7 @@ * SRGB view & a UNORM image). */ if (fast_clear_type != ANV_FAST_CLEAR_NONE) { - anv_perf_warn(cmd_buffer->device->instance, iview, + anv_perf_warn(cmd_buffer->device, iview, "Doing a partial resolve to get rid of clear color at the " "end of a renderpass due to an image/view format mismatch"); @@ -4733,6 +5477,7 @@ if (image->samples == 1) { anv_cmd_predicated_ccs_resolve(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, iview->planes[0].isl.base_level, array_layer, @@ -4741,6 +5486,7 @@ } else { anv_cmd_predicated_mcs_resolve(cmd_buffer, image, iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, VK_IMAGE_ASPECT_COLOR_BIT, base_layer, ISL_AUX_OP_PARTIAL_RESOLVE, @@ -4753,6 +5499,8 @@ /* Transition the image into the final layout for this render pass */ VkImageLayout target_layout = cmd_state->pass->attachments[a].final_layout; + VkImageLayout target_stencil_layout = + cmd_state->pass->attachments[a].stencil_final_layout; uint32_t base_layer, layer_count; if (image->type == VK_IMAGE_TYPE_3D) { @@ -4781,7 +5529,8 @@ transition_stencil_buffer(cmd_buffer, image, iview->planes[0].isl.base_level, 1, base_layer, layer_count, - att_state->current_layout, target_layout); + att_state->current_stencil_layout, + target_stencil_layout); } } @@ -4820,7 +5569,7 @@ cmd_buffer_begin_subpass(cmd_buffer, 0); } -void genX(CmdBeginRenderPass2KHR)( +void genX(CmdBeginRenderPass2)( VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBeginInfo, const VkSubpassBeginInfoKHR* pSubpassBeginInfo) @@ -4845,7 +5594,7 @@ cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); } -void genX(CmdNextSubpass2KHR)( +void genX(CmdNextSubpass2)( VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo) @@ -4877,7 +5626,7 @@ cmd_buffer->state.subpass = NULL; } -void genX(CmdEndRenderPass2KHR)( +void genX(CmdEndRenderPass2)( VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo) { @@ -4983,6 +5732,9 @@ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_event, event, _event); + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { pc.StallAtPixelScoreboard = true; @@ -5007,6 +5759,9 @@ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_event, event, _event); + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { pc.StallAtPixelScoreboard = true; @@ -5062,3 +5817,57 @@ bufferMemoryBarrierCount, pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers); } + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { + uint32_t dw; + +#if GEN_GEN >= 9 + anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw); +#else + anv_pack_struct(&dw, GENX(INSTPM), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw); +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. */ + + return VK_SUCCESS; +} diff -Nru mesa-19.2.8/src/intel/vulkan/genX_gpu_memcpy.c mesa-20.0.8/src/intel/vulkan/genX_gpu_memcpy.c --- mesa-19.2.8/src/intel/vulkan/genX_gpu_memcpy.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_gpu_memcpy.c 2020-06-12 01:21:17.000000000 +0000 @@ -78,6 +78,7 @@ genX(cmd_buffer_config_l3)(cmd_buffer, cfg); } + genX(cmd_buffer_set_binding_for_gen8_vb_flush)(cmd_buffer, 32, src, size); genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); genX(flush_pipeline_select_3d)(cmd_buffer); @@ -112,6 +113,13 @@ }); #if GEN_GEN >= 8 + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } +#endif + +#if GEN_GEN >= 8 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), sgvs); #endif @@ -146,10 +154,15 @@ genX(emit_urb_setup)(cmd_buffer->device, &cmd_buffer->batch, cmd_buffer->state.current_l3_config, - VK_SHADER_STAGE_VERTEX_BIT, entry_size); + VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GEN_GEN < 12 sob.SOBufferIndex = 0; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD; +#endif sob.MOCS = anv_mocs_for_bo(cmd_buffer->device, dst.bo), sob.SurfaceBaseAddress = dst; @@ -224,5 +237,8 @@ prim.BaseVertexLocation = 0; } + genX(cmd_buffer_update_dirty_vbs_for_gen8_vb_flush)(cmd_buffer, SEQUENTIAL, + 1ull << 32); + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; } diff -Nru mesa-19.2.8/src/intel/vulkan/genX_pipeline.c mesa-20.0.8/src/intel/vulkan/genX_pipeline.c --- mesa-19.2.8/src/intel/vulkan/genX_pipeline.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_pipeline.c 2020-06-12 01:21:17.000000000 +0000 @@ -259,25 +259,18 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct gen_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4]) + const unsigned entry_size[4], + enum gen_urb_deref_block_size *deref_block_size) { const struct gen_device_info *devinfo = &device->info; -#if GEN_IS_HASWELL - const unsigned push_constant_kb = devinfo->gt == 3 ? 32 : 16; -#else - const unsigned push_constant_kb = GEN_GEN >= 8 ? 32 : 16; -#endif - - const unsigned urb_size_kb = gen_get_l3_config_urb_size(devinfo, l3_config); unsigned entries[4]; unsigned start[4]; - gen_get_urb_config(devinfo, - 1024 * push_constant_kb, 1024 * urb_size_kb, + gen_get_urb_config(devinfo, l3_config, active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start); + entry_size, entries, start, deref_block_size); #if GEN_GEN == 7 && !GEN_IS_HASWELL /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: @@ -291,7 +284,7 @@ anv_batch_emit(batch, GEN7_PIPE_CONTROL, pc) { pc.DepthStallEnable = true; pc.PostSyncOperation = WriteImmediateData; - pc.Address = (struct anv_address) { &device->workaround_bo, 0 }; + pc.Address = (struct anv_address) { device->workaround_bo, 0 }; } #endif @@ -306,7 +299,8 @@ } static void -emit_urb_setup(struct anv_pipeline *pipeline) +emit_urb_setup(struct anv_pipeline *pipeline, + enum gen_urb_deref_block_size *deref_block_size) { unsigned entry_size[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { @@ -319,7 +313,8 @@ genX(emit_urb_setup)(pipeline->device, &pipeline->batch, pipeline->urb.l3_config, - pipeline->active_stages, entry_size); + pipeline->active_stages, entry_size, + deref_block_size); } static void @@ -360,8 +355,10 @@ # define swiz sbe #endif - /* Skip the VUE header and position slots by default */ - unsigned urb_entry_read_offset = 1; + int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs, + fs_input_map); + assert(first_slot % 2 == 0); + unsigned urb_entry_read_offset = first_slot / 2; int max_source_attr = 0; for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) { int input_index = wm_prog_data->urb_setup[attr]; @@ -371,7 +368,6 @@ /* gl_Viewport and gl_Layer are stored in the VUE header */ if (attr == VARYING_SLOT_VIEWPORT || attr == VARYING_SLOT_LAYER) { - urb_entry_read_offset = 0; continue; } @@ -382,9 +378,6 @@ const int slot = fs_input_map->varying_to_slot[attr]; - if (input_index >= 16) - continue; - if (slot == -1) { /* This attribute does not exist in the VUE--that means that the * vertex shader did not write to it. It could be that it's a @@ -398,15 +391,24 @@ swiz.Attribute[input_index].ComponentOverrideY = true; swiz.Attribute[input_index].ComponentOverrideZ = true; swiz.Attribute[input_index].ComponentOverrideW = true; - } else { - /* We have to subtract two slots to accout for the URB entry output - * read offset in the VS and GS stages. - */ - const int source_attr = slot - 2 * urb_entry_read_offset; - assert(source_attr >= 0 && source_attr < 32); - max_source_attr = MAX2(max_source_attr, source_attr); - swiz.Attribute[input_index].SourceAttribute = source_attr; + continue; } + + /* We have to subtract two slots to accout for the URB entry output + * read offset in the VS and GS stages. + */ + const int source_attr = slot - 2 * urb_entry_read_offset; + assert(source_attr >= 0 && source_attr < 32); + max_source_attr = MAX2(max_source_attr, source_attr); + /* The hardware can only do overrides on 16 overrides at a time, and the + * other up to 16 have to be lined up so that the input index = the + * output index. We'll need to do some tweaking to make sure that's the + * case. + */ + if (input_index < 16) + swiz.Attribute[input_index].SourceAttribute = source_attr; + else + assert(source_attr == input_index); } sbe.VertexURBEntryReadOffset = urb_entry_read_offset; @@ -477,12 +479,6 @@ const VkPipelineInputAssemblyStateCreateInfo *ia_info, const VkPipelineRasterizationStateCreateInfo *rs_info) { - /* Points always override everything. This saves us from having to handle - * rs_info->polygonMode in all of the line cases below. - */ - if (rs_info->polygonMode == VK_POLYGON_MODE_POINT) - return VK_POLYGON_MODE_POINT; - if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { switch (get_gs_prog_data(pipeline)->output_topology) { case _3DPRIM_POINTLIST: @@ -579,7 +575,8 @@ const VkPipelineMultisampleStateCreateInfo *ms_info, const VkPipelineRasterizationLineStateCreateInfoEXT *line_info, const struct anv_render_pass *pass, - const struct anv_subpass *subpass) + const struct anv_subpass *subpass, + enum gen_urb_deref_block_size urb_deref_block_size) { struct GENX(3DSTATE_SF) sf = { GENX(3DSTATE_SF_header), @@ -597,6 +594,10 @@ sf.LineStippleEnable = line_info && line_info->stippledLineEnable; #endif +#if GEN_GEN >= 12 + sf.DerefBlockSize = urb_deref_block_size; +#endif + const struct brw_vue_prog_data *last_vue_prog_data = anv_pipeline_get_last_vue_prog_data(pipeline); @@ -1005,6 +1006,7 @@ pipeline->stencil_test_enable = false; pipeline->writes_depth = false; pipeline->depth_test_enable = false; + pipeline->depth_bounds_test_enable = false; memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw)); return; } @@ -1021,8 +1023,7 @@ pipeline->stencil_test_enable = info.stencilTestEnable; pipeline->writes_depth = info.depthWriteEnable; pipeline->depth_test_enable = info.depthTestEnable; - - /* VkBool32 depthBoundsTestEnable; // optional (depth_bounds_test) */ + pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable; #if GEN_GEN <= 7 struct GENX(DEPTH_STENCIL_STATE) depth_stencil = { @@ -1117,7 +1118,6 @@ continue; } - assert(binding->binding == 0); const VkPipelineColorBlendAttachmentState *a = &info->pAttachments[binding->index]; @@ -1170,7 +1170,7 @@ is_dual_src_blend_factor(a->dstColorBlendFactor) || is_dual_src_blend_factor(a->srcAlphaBlendFactor) || is_dual_src_blend_factor(a->dstAlphaBlendFactor))) { - vk_debug_report(&device->instance->debug_report_callbacks, + vk_debug_report(&device->physical->instance->debug_report_callbacks, VK_DEBUG_REPORT_WARNING_BIT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, (uint64_t)(uintptr_t)device, @@ -1415,11 +1415,23 @@ next_offset[buffer] = output->offset + __builtin_popcount(component_mask) * 4; - so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { - .OutputBufferSlot = buffer, - .RegisterIndex = vue_map->varying_to_slot[varying], - .ComponentMask = component_mask, - }; + const int slot = vue_map->varying_to_slot[varying]; + if (slot < 0) { + /* This can happen if the shader never writes to the varying. + * Insert a hole instead of actual varying data. + */ + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .HoleFlag = true, + .OutputBufferSlot = buffer, + .ComponentMask = component_mask, + }; + } else { + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .OutputBufferSlot = buffer, + .RegisterIndex = slot, + .ComponentMask = component_mask, + }; + } } int max_decls = 0; @@ -1523,11 +1535,7 @@ * programming 0xB000[30] to '1'. */ vs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(vs_bin); - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to - * disable prefetching of binding tables on A0 and B0 steppings. - * TODO: Revisit this WA on newer steppings. - */ - vs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(vs_bin); + vs.BindingTableEntryCount = get_binding_table_entry_count(vs_bin); vs.FloatingPointMode = IEEE754; vs.IllegalOpcodeExceptionEnable = false; vs.SoftwareExceptionEnable = false; @@ -1599,8 +1607,18 @@ hs.KernelStartPointer = tcs_bin->kernel.offset; /* WA_1606682166 */ hs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tcs_bin); - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */ - hs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(tcs_bin); + hs.BindingTableEntryCount = get_binding_table_entry_count(tcs_bin); + +#if GEN_GEN >= 12 + /* GEN:BUG:1604578095: + * + * Hang occurs when the number of max threads is less than 2 times + * the number of instance count. The number of max threads must be + * more than 2 times the number of instance count. + */ + assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); +#endif + hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; hs.IncludeVertexHandles = true; hs.InstanceCount = tcs_prog_data->instances - 1; @@ -1608,7 +1626,12 @@ hs.VertexURBEntryReadLength = 0; hs.VertexURBEntryReadOffset = 0; hs.DispatchGRFStartRegisterForURBData = - tcs_prog_data->base.base.dispatch_grf_start_reg; + tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f; +#if GEN_GEN >= 12 + hs.DispatchGRFStartRegisterForURBData5 = + tcs_prog_data->base.base.dispatch_grf_start_reg >> 5; +#endif + hs.PerThreadScratchSpace = get_scratch_space(tcs_bin); hs.ScratchSpaceBasePointer = @@ -1655,8 +1678,7 @@ ds.KernelStartPointer = tes_bin->kernel.offset; /* WA_1606682166 */ ds.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tes_bin); - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */ - ds.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(tes_bin); + ds.BindingTableEntryCount = get_binding_table_entry_count(tes_bin); ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; ds.ComputeWCoordinateEnable = @@ -1714,8 +1736,7 @@ gs.VectorMaskEnable = false; /* WA_1606682166 */ gs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(gs_bin); - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */ - gs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(gs_bin); + gs.BindingTableEntryCount = get_binding_table_entry_count(gs_bin); gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; @@ -1950,8 +1971,7 @@ ps.VectorMaskEnable = GEN_GEN >= 8; /* WA_1606682166 */ ps.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(fs_bin); - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */ - ps.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(fs_bin); + ps.BindingTableEntryCount = get_binding_table_entry_count(fs_bin); ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || wm_prog_data->base.ubo_ranges[0].length; ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ? @@ -1993,8 +2013,7 @@ #if GEN_GEN >= 8 static void emit_3dstate_ps_extra(struct anv_pipeline *pipeline, - struct anv_subpass *subpass, - const VkPipelineColorBlendStateCreateInfo *blend) + struct anv_subpass *subpass) { const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); @@ -2104,7 +2123,7 @@ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO); /* Use the default pipeline cache if none is specified */ - if (cache == NULL && device->instance->pipeline_cache_enabled) + if (cache == NULL && device->physical->instance->pipeline_cache_enabled) cache = &device->default_pipeline_cache; pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, @@ -2119,28 +2138,46 @@ return result; } + /* If rasterization is not enabled, various CreateInfo structs must be + * ignored. + */ + const bool raster_enabled = + !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; + + const VkPipelineViewportStateCreateInfo *vp_info = + raster_enabled ? pCreateInfo->pViewportState : NULL; + + const VkPipelineMultisampleStateCreateInfo *ms_info = + raster_enabled ? pCreateInfo->pMultisampleState : NULL; + + const VkPipelineDepthStencilStateCreateInfo *ds_info = + raster_enabled ? pCreateInfo->pDepthStencilState : NULL; + + const VkPipelineColorBlendStateCreateInfo *cb_info = + raster_enabled ? pCreateInfo->pColorBlendState : NULL; + const VkPipelineRasterizationLineStateCreateInfoEXT *line_info = vk_find_struct_const(pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); + enum gen_urb_deref_block_size urb_deref_block_size; + emit_urb_setup(pipeline, &urb_deref_block_size); + assert(pCreateInfo->pVertexInputState); emit_vertex_input(pipeline, pCreateInfo->pVertexInputState); assert(pCreateInfo->pRasterizationState); emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState, pCreateInfo->pRasterizationState, - pCreateInfo->pMultisampleState, - line_info, pass, subpass); - emit_ms_state(pipeline, pCreateInfo->pMultisampleState); - emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass); - emit_cb_state(pipeline, pCreateInfo->pColorBlendState, - pCreateInfo->pMultisampleState); - compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass); - - emit_urb_setup(pipeline); + ms_info, line_info, pass, subpass, + urb_deref_block_size); + emit_ms_state(pipeline, ms_info); + emit_ds_state(pipeline, ds_info, pass, subpass); + emit_cb_state(pipeline, cb_info, ms_info); + compute_kill_pixel(pipeline, ms_info, subpass); emit_3dstate_clip(pipeline, pCreateInfo->pInputAssemblyState, - pCreateInfo->pViewportState, + vp_info, pCreateInfo->pRasterizationState); emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState); @@ -2170,12 +2207,10 @@ emit_3dstate_wm(pipeline, subpass, pCreateInfo->pInputAssemblyState, pCreateInfo->pRasterizationState, - pCreateInfo->pColorBlendState, - pCreateInfo->pMultisampleState, line_info); - emit_3dstate_ps(pipeline, pCreateInfo->pColorBlendState, - pCreateInfo->pMultisampleState); + cb_info, ms_info, line_info); + emit_3dstate_ps(pipeline, cb_info, ms_info); #if GEN_GEN >= 8 - emit_3dstate_ps_extra(pipeline, subpass, pCreateInfo->pColorBlendState); + emit_3dstate_ps_extra(pipeline, subpass); emit_3dstate_vf_topology(pipeline); #endif emit_3dstate_vf_statistics(pipeline); @@ -2194,16 +2229,14 @@ VkPipeline* pPipeline) { ANV_FROM_HANDLE(anv_device, device, _device); - const struct anv_physical_device *physical_device = - &device->instance->physicalDevice; - const struct gen_device_info *devinfo = &physical_device->info; + const struct gen_device_info *devinfo = &device->info; struct anv_pipeline *pipeline; VkResult result; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); /* Use the default pipeline cache if none is specified */ - if (cache == NULL && device->instance->pipeline_cache_enabled) + if (cache == NULL && device->physical->instance->pipeline_cache_enabled) cache = &device->default_pipeline_cache; pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, @@ -2215,12 +2248,15 @@ pipeline->blend_state.map = NULL; - result = anv_reloc_list_init(&pipeline->batch_relocs, - pAllocator ? pAllocator : &device->alloc); + const VkAllocationCallbacks *alloc = + pAllocator ? pAllocator : &device->alloc; + + result = anv_reloc_list_init(&pipeline->batch_relocs, alloc); if (result != VK_SUCCESS) { vk_free2(&device->alloc, pAllocator, pipeline); return result; } + pipeline->batch.alloc = alloc; pipeline->batch.next = pipeline->batch.start = pipeline->batch_data; pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data); pipeline->batch.relocs = &pipeline->batch_relocs; @@ -2235,8 +2271,6 @@ memset(pipeline->shaders, 0, sizeof(pipeline->shaders)); pipeline->num_executables = 0; - pipeline->needs_data_cache = false; - assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT); pipeline->active_stages |= VK_SHADER_STAGE_COMPUTE_BIT; ANV_FROM_HANDLE(anv_shader_module, module, pCreateInfo->stage.module); @@ -2266,7 +2300,7 @@ ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + cs_prog_data->push.cross_thread.regs, 2); - const uint32_t subslices = MAX2(physical_device->subslice_total, 1); + const uint32_t subslices = MAX2(device->physical->subslice_total, 1); const struct anv_shader_bin *cs_bin = pipeline->shaders[MESA_SHADER_COMPUTE]; @@ -2318,12 +2352,10 @@ .KernelStartPointer = cs_bin->kernel.offset, /* WA_1606682166 */ .SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(cs_bin), - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable - * - * We add 1 because the CS indirect parameters buffer isn't accounted + /* We add 1 because the CS indirect parameters buffer isn't accounted * for in bind_map.surface_count. */ - .BindingTableEntryCount = GEN_GEN == 11 ? 0 : 1 + MIN2(cs_bin->bind_map.surface_count, 30), + .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = encode_slm_size(GEN_GEN, cs_prog_data->base.total_shared), @@ -2336,6 +2368,18 @@ .CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs, #endif +#if GEN_GEN >= 12 + /* TODO: Check if we are missing workarounds and enable mid-thread + * preemption. + * + * We still have issues with mid-thread preemption (it was already + * disabled by the kernel on gen11, due to missing workarounds). It's + * possible that we are just missing some workarounds, and could enable + * it later, but for now let's disable it to fix a GPU in compute in Car + * Chase (and possibly more). + */ + .ThreadPreemptionDisable = true, +#endif .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads, }; diff -Nru mesa-19.2.8/src/intel/vulkan/genX_query.c mesa-20.0.8/src/intel/vulkan/genX_query.c --- mesa-19.2.8/src/intel/vulkan/genX_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_query.c 2020-06-12 01:21:17.000000000 +0000 @@ -37,6 +37,10 @@ #define __gen_get_batch_dwords anv_batch_emit_dwords #define __gen_address_offset anv_address_add #include "common/gen_mi_builder.h" +#include "perf/gen_perf.h" +#include "perf/gen_perf_mdapi.h" + +#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t)) VkResult genX(CreateQueryPool)( VkDevice _device, @@ -45,16 +49,21 @@ VkQueryPool* pQueryPool) { ANV_FROM_HANDLE(anv_device, device, _device); - const struct anv_physical_device *pdevice = &device->instance->physicalDevice; + const struct anv_physical_device *pdevice = device->physical; struct anv_query_pool *pool; VkResult result; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); /* Query pool slots are made up of some number of 64-bit values packed - * tightly together. The first 64-bit value is always the "available" bit - * which is 0 when the query is unavailable and 1 when it is available. - * The 64-bit values that follow are determined by the type of query. + * tightly together. For most query types have the first 64-bit value is + * the "available" bit which is 0 when the query is unavailable and 1 when + * it is available. The 64-bit values that follow are determined by the + * type of query. + * + * For performance queries, we have a requirement to align OA reports at + * 64bytes so we put those first and have the "available" bit behind + * together with some other counters. */ uint32_t uint64s_per_slot = 1; @@ -84,6 +93,10 @@ */ uint64s_per_slot += 4; break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + uint64s_per_slot = 72; /* 576 bytes, see layout below */ + break; + } default: assert(!"Invalid query type"); } @@ -98,31 +111,24 @@ pool->stride = uint64s_per_slot * sizeof(uint64_t); pool->slots = pCreateInfo->queryCount; - uint64_t size = pool->slots * pool->stride; - result = anv_bo_init_new(&pool->bo, device, size); - if (result != VK_SUCCESS) - goto fail; - + uint32_t bo_flags = 0; if (pdevice->supports_48bit_addresses) - pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; if (pdevice->use_softpin) - pool->bo.flags |= EXEC_OBJECT_PINNED; + bo_flags |= EXEC_OBJECT_PINNED; if (pdevice->has_exec_async) - pool->bo.flags |= EXEC_OBJECT_ASYNC; + bo_flags |= EXEC_OBJECT_ASYNC; - anv_vma_alloc(device, &pool->bo); - - /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC - * platforms, this does nothing. On non-LLC platforms, this means snooping - * which comes at a slight cost. However, the buffers aren't big, won't be - * written frequently, and trying to handle the flushing manually without - * doing too much flushing is extremely painful. - */ - anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED); - - pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0); + uint64_t size = pool->slots * pool->stride; + result = anv_device_alloc_bo(device, size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); + if (result != VK_SUCCESS) + goto fail; *pQueryPool = anv_query_pool_to_handle(pool); @@ -145,9 +151,7 @@ if (!pool) return; - anv_gem_munmap(pool->bo.map, pool->bo.size); - anv_vma_free(device, &pool->bo); - anv_gem_close(device, pool->bo.gem_handle); + anv_device_release_bo(device, pool->bo); vk_free2(&device->alloc, pAllocator, pool); } @@ -155,11 +159,59 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query) { return (struct anv_address) { - .bo = &pool->bo, + .bo = pool->bo, .offset = query * pool->stride, }; } +/** + * VK_INTEL_performance_query layout (576 bytes) : + * + * ------------------------------ + * | availability (8b) | + * |----------------------------| + * | marker (8b) | + * |----------------------------| + * | begin RPSTAT register (4b) | + * |----------------------------| + * | end RPSTAT register (4b) | + * |----------------------------| + * | begin perfcntr 1 & 2 (16b) | + * |----------------------------| + * | end perfcntr 1 & 2 (16b) | + * |----------------------------| + * | Unused (8b) | + * |----------------------------| + * | begin MI_RPC (256b) | + * |----------------------------| + * | end MI_RPC (256b) | + * ------------------------------ + */ + +static uint32_t +intel_perf_marker_offset(void) +{ + return 8; +} + +static uint32_t +intel_perf_rpstart_offset(bool end) +{ + return 16 + (end ? sizeof(uint32_t) : 0); +} + +static uint32_t +intel_perf_counter(bool end) +{ + return 24 + (end ? (2 * sizeof(uint64_t)) : 0); +} + +static uint32_t +intel_perf_mi_rpc_offset(bool end) +{ + return 64 + (end ? 256 : 0); +} + static void cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, uint32_t value_index, uint64_t result) @@ -173,48 +225,33 @@ } } +static void * +query_slot(struct anv_query_pool *pool, uint32_t query) +{ + return pool->bo->map + query * pool->stride; +} + static bool -query_is_available(uint64_t *slot) +query_is_available(struct anv_query_pool *pool, uint32_t query) { - return *(volatile uint64_t *)slot; + return *(volatile uint64_t *)query_slot(pool, query); } static VkResult wait_for_available(struct anv_device *device, - struct anv_query_pool *pool, uint64_t *slot) + struct anv_query_pool *pool, uint32_t query) { - while (true) { - if (query_is_available(slot)) - return VK_SUCCESS; + uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC); - int ret = anv_gem_busy(device, pool->bo.gem_handle); - if (ret == 1) { - /* The BO is still busy, keep waiting. */ - continue; - } else if (ret == -1) { - /* We don't know the real error. */ - return anv_device_set_lost(device, "gem wait failed: %m"); - } else { - assert(ret == 0); - /* The BO is no longer busy. */ - if (query_is_available(slot)) { - return VK_SUCCESS; - } else { - VkResult status = anv_device_query_status(device); - if (status != VK_SUCCESS) - return status; - - /* If we haven't seen availability yet, then we never will. This - * can only happen if we have a client error where they call - * GetQueryPoolResults on a query that they haven't submitted to - * the GPU yet. The spec allows us to do anything in this case, - * but returning VK_SUCCESS doesn't seem right and we shouldn't - * just keep spinning. - */ - return VK_NOT_READY; - } - } + while (anv_gettime_ns() < abs_timeout) { + if (query_is_available(pool, query)) + return VK_SUCCESS; + VkResult status = anv_device_query_status(device); + if (status != VK_SUCCESS) + return status; } + + return anv_device_set_lost(device, "query timeout"); } VkResult genX(GetQueryPoolResults)( @@ -233,7 +270,8 @@ assert(pool->type == VK_QUERY_TYPE_OCCLUSION || pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || pool->type == VK_QUERY_TYPE_TIMESTAMP || - pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT); + pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); if (anv_device_is_lost(device)) return VK_ERROR_DEVICE_LOST; @@ -245,13 +283,10 @@ VkResult status = VK_SUCCESS; for (uint32_t i = 0; i < queryCount; i++) { - uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride; - - /* Availability is always at the start of the slot */ - bool available = slot[0]; + bool available = query_is_available(pool, firstQuery + i); if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { - status = wait_for_available(device, pool, slot); + status = wait_for_available(device, pool, firstQuery + i); if (status != VK_SUCCESS) return status; @@ -271,13 +306,25 @@ uint32_t idx = 0; switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - if (write_results) - cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); + case VK_QUERY_TYPE_OCCLUSION: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) { + /* From the Vulkan 1.2.132 spec: + * + * "If VK_QUERY_RESULT_PARTIAL_BIT is set, + * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status + * is unavailable, an intermediate result value between zero and + * the final result value is written to pData for that query." + */ + uint64_t result = available ? slot[2] - slot[1] : 0; + cpu_write_query_result(pData, flags, idx, result); + } idx++; break; + } case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint64_t *slot = query_slot(pool, firstQuery + i); uint32_t statistics = pool->pipeline_statistics; while (statistics) { uint32_t stat = u_bit_scan(&statistics); @@ -297,7 +344,8 @@ break; } - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); idx++; @@ -305,12 +353,54 @@ cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); idx++; break; + } - case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_TIMESTAMP: { + uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) cpu_write_query_result(pData, flags, idx, slot[1]); idx++; break; + } + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + if (!write_results) + break; + const void *query_data = query_slot(pool, firstQuery + i); + const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false); + const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true); + const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false); + const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true); + struct gen_perf_query_result result; + struct gen_perf_query_info metric = { + .oa_format = (GEN_GEN >= 8 ? + I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8), + }; + uint32_t core_freq[2]; +#if GEN_GEN < 9 + core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL; + core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL; +#else + core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL; + core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL; +#endif + gen_perf_query_result_clear(&result); + gen_perf_query_result_accumulate(&result, &metric, + oa_begin, oa_end); + gen_perf_query_result_read_frequencies(&result, &device->info, + oa_begin, oa_end); + gen_perf_query_result_write_mdapi(pData, stride, + &device->info, + &result, + core_freq[0], core_freq[1]); + gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info, + query_data + intel_perf_counter(false), + query_data + intel_perf_counter(true)); + const uint64_t *marker = query_data + intel_perf_marker_offset(); + gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); + break; + } default: unreachable("invalid pool type"); @@ -334,6 +424,9 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WritePSDepthCount; @@ -358,6 +451,9 @@ struct anv_address addr, bool available) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteImmediateData; @@ -406,6 +502,15 @@ } break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); + emit_query_mi_availability(b, slot_addr, true); + } + break; + default: unreachable("Unsupported query type"); } @@ -440,12 +545,21 @@ break; } + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); + break; + } + default: unreachable("Unsupported query type"); } } -void genX(ResetQueryPoolEXT)( +void genX(ResetQueryPool)( VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, @@ -454,7 +568,7 @@ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); for (uint32_t i = 0; i < queryCount; i++) { - uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride; + uint64_t *slot = query_slot(pool, firstQuery + i); *slot = 0; } } @@ -550,6 +664,37 @@ emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = + anv_address_add(query_addr, intel_perf_mi_rpc_offset(false)); + } +#if GEN_GEN < 9 + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(false))), + gen_mi_reg32(GENX(RPSTAT1_num))); +#else + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(false))), + gen_mi_reg32(GENX(RPSTAT0_num))); +#endif +#if GEN_GEN >= 8 && GEN_GEN <= 11 + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, + intel_perf_counter(false))), + gen_mi_reg64(GENX(PERFCNT1_num))); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, + intel_perf_counter(false) + 8)), + gen_mi_reg64(GENX(PERFCNT2_num))); +#endif + break; + } + default: unreachable(""); } @@ -611,6 +756,43 @@ emit_query_mi_availability(&b, query_addr, true); break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + uint32_t marker_offset = intel_perf_marker_offset(); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)), + gen_mi_imm(cmd_buffer->intel_perf_marker)); +#if GEN_GEN >= 8 && GEN_GEN <= 11 + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))), + gen_mi_reg64(GENX(PERFCNT1_num))); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)), + gen_mi_reg64(GENX(PERFCNT2_num))); +#endif +#if GEN_GEN < 9 + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(true))), + gen_mi_reg32(GENX(RPSTAT1_num))); +#else + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(true))), + gen_mi_reg32(GENX(RPSTAT0_num))); +#endif + /* Position the last OA snapshot at the beginning of the query so that + * we can tell whether it's ready. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = anv_address_add(query_addr, + intel_perf_mi_rpc_offset(true)); + rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */ + } + emit_query_mi_availability(&b, query_addr, true); + break; + } + default: unreachable(""); } @@ -656,6 +838,9 @@ default: /* Everything else is bottom-of-pipe */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteTimestamp; @@ -687,6 +872,45 @@ #if GEN_GEN > 7 || GEN_IS_HASWELL +#if GEN_GEN >= 8 || GEN_IS_HASWELL + +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_RESULT 0x2418 + +/** + * Writes the results of a query to dst_addr is the value at poll_addr is equal + * to the reference value. + */ +static void +gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer, + struct gen_mi_builder *b, + struct anv_address poll_addr, + struct anv_address dst_addr, + uint64_t ref_value, + VkQueryResultFlags flags, + uint32_t value_index, + struct gen_mi_value query_result) +{ + gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr)); + gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value)); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + if (flags & VK_QUERY_RESULT_64_BIT) { + struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8); + gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result); + } else { + struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4); + gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result); + } +} + +#endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */ + static void gpu_write_query_result(struct gen_mi_builder *b, struct anv_address dst_addr, @@ -763,7 +987,22 @@ switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: result = compute_query_result(&b, anv_address_add(query_addr, 8)); +#if GEN_GEN >= 8 || GEN_IS_HASWELL + /* Like in the case of vkGetQueryPoolResults, if the query is + * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, + * conservatively write 0 as the query result. If the + * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value. + */ + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 1 /* available */, flags, idx, result); + if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 0 /* unavailable */, flags, idx, gen_mi_imm(0)); + } + idx++; +#else /* GEN_GEN < 8 && !GEN_IS_HASWELL */ gpu_write_query_result(&b, dest_addr, flags, idx++, result); +#endif break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { diff -Nru mesa-19.2.8/src/intel/vulkan/genX_state.c mesa-20.0.8/src/intel/vulkan/genX_state.c --- mesa-19.2.8/src/intel/vulkan/genX_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/genX_state.c 2020-06-12 01:21:17.000000000 +0000 @@ -29,65 +29,13 @@ #include "anv_private.h" +#include "common/gen_aux_map.h" #include "common/gen_sample_positions.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" #include "vk_util.h" -#if GEN_GEN == 10 -/** - * From Gen10 Workarounds page in h/w specs: - * WaSampleOffsetIZ: - * "Prior to the 3DSTATE_SAMPLE_PATTERN driver must ensure there are no - * markers in the pipeline by programming a PIPE_CONTROL with stall." - */ -static void -gen10_emit_wa_cs_stall_flush(struct anv_batch *batch) -{ - - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } -} - -/** - * From Gen10 Workarounds page in h/w specs: - * WaSampleOffsetIZ:_cs_stall_flush - * "When 3DSTATE_SAMPLE_PATTERN is programmed, driver must then issue an - * MI_LOAD_REGISTER_IMM command to an offset between 0x7000 and 0x7FFF(SVL) - * after the command to ensure the state has been delivered prior to any - * command causing a marker in the pipeline." - */ -static void -gen10_emit_wa_lri_to_cache_mode_zero(struct anv_batch *batch) -{ - /* Before changing the value of CACHE_MODE_0 register, GFX pipeline must - * be idle; i.e., full flush is required. - */ - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { - pc.DepthCacheFlushEnable = true; - pc.DCFlushEnable = true; - pc.RenderTargetCacheFlushEnable = true; - pc.InstructionCacheInvalidateEnable = true; - pc.StateCacheInvalidationEnable = true; - pc.TextureCacheInvalidationEnable = true; - pc.VFCacheInvalidationEnable = true; - pc.ConstantCacheInvalidationEnable =true; - } - - /* Write to CACHE_MODE_0 (0x7000) */ - uint32_t cache_mode_0 = 0; - anv_pack_struct(&cache_mode_0, GENX(CACHE_MODE_0)); - - anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { - lri.RegisterOffset = GENX(CACHE_MODE_0_num); - lri.DataDWord = cache_mode_0; - } -} -#endif - static void genX(emit_slice_hashing_state)(struct anv_device *device, struct anv_batch *batch) @@ -164,13 +112,6 @@ VkResult genX(init_device_state)(struct anv_device *device) { - device->default_mocs = GENX(MOCS); -#if GEN_GEN >= 8 - device->external_mocs = GENX(EXTERNAL_MOCS); -#else - device->external_mocs = device->default_mocs; -#endif - struct anv_batch batch; uint32_t cmds[64]; @@ -212,10 +153,6 @@ #if GEN_GEN >= 8 anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck); -#if GEN_GEN == 10 - gen10_emit_wa_cs_stall_flush(&batch); -#endif - /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and * VkPhysicalDeviceFeatures::standardSampleLocations. */ @@ -240,10 +177,6 @@ anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp); #endif -#if GEN_GEN == 10 - gen10_emit_wa_lri_to_cache_mode_zero(&batch); -#endif - #if GEN_GEN == 11 /* The default behavior of bit 5 "Headerless Message for Pre-emptable * Contexts" in SAMPLER MODE register is set to 0, which means @@ -273,16 +206,16 @@ lri.DataDWord = half_slice_chicken7; } - /* WaEnableStateCacheRedirectToCS:icl */ - uint32_t slice_common_eco_chicken1; - anv_pack_struct(&slice_common_eco_chicken1, - GENX(SLICE_COMMON_ECO_CHICKEN1), - .StateCacheRedirectToCSSectionEnable = true, - .StateCacheRedirectToCSSectionEnableMask = true); + uint32_t tccntlreg; + anv_pack_struct(&tccntlreg, GENX(TCCNTLREG), + .L3DataPartialWriteMergingEnable = true, + .ColorZPartialWriteMergingEnable = true, + .URBPartialWriteMergingEnable = true, + .TCDisable = true); anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { - lri.RegisterOffset = GENX(SLICE_COMMON_ECO_CHICKEN1_num); - lri.DataDWord = slice_common_eco_chicken1; + lri.RegisterOffset = GENX(TCCNTLREG_num); + lri.DataDWord = tccntlreg; } #endif @@ -304,6 +237,35 @@ lri.DataDWord = cache_mode_0; } } + + /* an unknown issue is causing vs push constants to become + * corrupted during object-level preemption. For now, restrict + * to command buffer level preemption to avoid rendering + * corruption. + */ + uint32_t cs_chicken1; + anv_pack_struct(&cs_chicken1, + GENX(CS_CHICKEN1), + .ReplayMode = MidcmdbufferPreemption, + .ReplayModeMask = true); + + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CS_CHICKEN1_num); + lri.DataDWord = cs_chicken1; + } +#endif + +#if GEN_GEN == 12 + uint64_t aux_base_addr = gen_aux_map_get_base(device->aux_map_ctx); + assert(aux_base_addr % (32 * 1024) == 0); + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num); + lri.DataDWord = aux_base_addr & 0xffffffff; + } + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4; + lri.DataDWord = aux_base_addr >> 32; + } #endif /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so @@ -311,8 +273,7 @@ * * This is only safe on kernels with context isolation support. */ - if (GEN_GEN >= 8 && - device->instance->physicalDevice.has_context_isolation) { + if (GEN_GEN >= 8 && device->physical->has_context_isolation) { UNUSED uint32_t tmp_reg; #if GEN_GEN >= 9 anv_pack_struct(&tmp_reg, GENX(CS_DEBUG_MODE2), @@ -337,7 +298,7 @@ assert(batch.next <= batch.end); - return anv_device_submit_simple_batch(device, &batch); + return anv_queue_submit_simple_batch(&device->queue, &batch); } static uint32_t @@ -409,8 +370,6 @@ VkSampler* pSampler) { ANV_FROM_HANDLE(anv_device, device, _device); - const struct anv_physical_device *pdevice = - &device->instance->physicalDevice; struct anv_sampler *sampler; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO); @@ -453,9 +412,9 @@ break; } #if GEN_GEN >= 9 - case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT: { - struct VkSamplerReductionModeCreateInfoEXT *sampler_reduction = - (struct VkSamplerReductionModeCreateInfoEXT *) ext; + case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: { + VkSamplerReductionModeCreateInfo *sampler_reduction = + (VkSamplerReductionModeCreateInfo *) ext; sampler_reduction_mode = vk_to_gen_sampler_reduction_mode[sampler_reduction->reductionMode]; enable_sampler_reduction = true; @@ -468,7 +427,7 @@ } } - if (pdevice->has_bindless_samplers) { + if (device->physical->has_bindless_samplers) { /* If we have bindless, allocate enough samplers. We allocate 32 bytes * for each sampler instead of 16 bytes because we want all bindless * samplers to be 32-byte aligned so we don't have to use indirect @@ -513,13 +472,16 @@ .MagModeFilter = vk_to_gen_tex_filter(mag_filter, pCreateInfo->anisotropyEnable), .MinModeFilter = vk_to_gen_tex_filter(min_filter, pCreateInfo->anisotropyEnable), .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996), - .AnisotropicAlgorithm = EWAApproximation, + .AnisotropicAlgorithm = + pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY, .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14), .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14), .ChromaKeyEnable = 0, .ChromaKeyIndex = 0, .ChromaKeyMode = 0, - .ShadowFunction = vk_to_gen_shadow_compare_op[pCreateInfo->compareOp], + .ShadowFunction = + vk_to_gen_shadow_compare_op[pCreateInfo->compareEnable ? + pCreateInfo->compareOp : VK_COMPARE_OP_NEVER], .CubeSurfaceControlMode = OVERRIDE, .BorderColorPointer = border_color_offset, diff -Nru mesa-19.2.8/src/intel/vulkan/meson.build mesa-20.0.8/src/intel/vulkan/meson.build --- mesa-19.2.8/src/intel/vulkan/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -79,7 +79,8 @@ ) foreach g : [['70', ['gen7_cmd_buffer.c']], ['75', ['gen7_cmd_buffer.c']], ['80', ['gen8_cmd_buffer.c']], ['90', ['gen8_cmd_buffer.c']], - ['100', ['gen8_cmd_buffer.c']], ['110', ['gen8_cmd_buffer.c']]] + ['100', ['gen8_cmd_buffer.c']], ['110', ['gen8_cmd_buffer.c']], + ['120', ['gen8_cmd_buffer.c']]] _gen = g[0] libanv_gen_libs += static_library( 'anv_gen@0@'.format(_gen), @@ -113,10 +114,11 @@ 'anv_nir.h', 'anv_nir_add_base_work_group_id.c', 'anv_nir_apply_pipeline_layout.c', + 'anv_nir_compute_push_layout.c', 'anv_nir_lower_multiview.c', - 'anv_nir_lower_push_constants.c', 'anv_nir_lower_ycbcr_textures.c', 'anv_pass.c', + 'anv_perf.c', 'anv_pipeline.c', 'anv_pipeline_cache.c', 'anv_private.h', @@ -131,6 +133,7 @@ dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, + idep_xmlconfig_headers, ] anv_flags = [ c_vis_args, @@ -193,6 +196,7 @@ link_whole : [libanv_common, libanv_gen_libs], link_with : [ libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi, + libintel_perf, ], dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, @@ -203,6 +207,19 @@ install : true, ) +if with_symbols_check + test( + 'anv symbols check', + symbols_check, + args : [ + '--lib', libvulkan_intel, + '--symbols-file', vulkan_icd_symbols, + symbols_check_args, + ], + suite : ['intel'], + ) +endif + if with_tests libvulkan_intel_test = static_library( 'vulkan_intel_test', @@ -213,7 +230,7 @@ link_whole : libanv_common, link_with : [ libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev, - libisl, libblorp, libvulkan_wsi, + libisl, libblorp, libvulkan_wsi, libintel_perf, ], dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, diff -Nru mesa-19.2.8/src/intel/vulkan/tests/block_pool_grow_first.c mesa-20.0.8/src/intel/vulkan/tests/block_pool_grow_first.c --- mesa-19.2.8/src/intel/vulkan/tests/block_pool_grow_first.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/block_pool_grow_first.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,19 +21,16 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include "anv_private.h" +#include "test_common.h" int main(int argc, char **argv) { - struct anv_instance instance = { - .physicalDevice = { - .use_softpin = true, - }, + struct anv_physical_device physical_device = { + .use_softpin = true, }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_block_pool pool; @@ -43,24 +40,26 @@ const uint32_t block_size = 16 * 1024; const uint32_t initial_size = block_size / 2; - anv_block_pool_init(&pool, &device, 4096, initial_size, EXEC_OBJECT_PINNED); - assert(pool.size == initial_size); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache); + anv_block_pool_init(&pool, &device, 4096, initial_size); + ASSERT(pool.size == initial_size); uint32_t padding; int32_t offset = anv_block_pool_alloc(&pool, block_size, &padding); /* Pool will have grown at least space to fit the new allocation. */ - assert(pool.size > initial_size); - assert(pool.size >= initial_size + block_size); + ASSERT(pool.size > initial_size); + ASSERT(pool.size >= initial_size + block_size); /* The whole initial size is considered padding and the allocation should be * right next to it. */ - assert(padding == initial_size); - assert(offset == initial_size); + ASSERT(padding == initial_size); + ASSERT(offset == initial_size); /* Use the memory to ensure it is valid. */ - void *map = anv_block_pool_map(&pool, offset); + void *map = anv_block_pool_map(&pool, offset, block_size); memset(map, 22, block_size); anv_block_pool_finish(&pool); diff -Nru mesa-19.2.8/src/intel/vulkan/tests/block_pool_no_free.c mesa-20.0.8/src/intel/vulkan/tests/block_pool_no_free.c --- mesa-19.2.8/src/intel/vulkan/tests/block_pool_no_free.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/block_pool_no_free.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,11 +21,10 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include #include "anv_private.h" +#include "test_common.h" #define NUM_THREADS 16 #define BLOCKS_PER_THREAD 1024 @@ -49,26 +48,26 @@ for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { block = anv_block_pool_alloc(job->pool, block_size, NULL); - data = anv_block_pool_map(job->pool, block); + data = anv_block_pool_map(job->pool, block, block_size); *data = block; - assert(block >= 0); + ASSERT(block >= 0); job->blocks[i] = block; block = anv_block_pool_alloc_back(job->pool, block_size); - data = anv_block_pool_map(job->pool, block); + data = anv_block_pool_map(job->pool, block, block_size); *data = block; - assert(block < 0); + ASSERT(block < 0); job->back_blocks[i] = -block; } for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { block = job->blocks[i]; - data = anv_block_pool_map(job->pool, block); - assert(*data == block); + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); block = -job->back_blocks[i]; - data = anv_block_pool_map(job->pool, block); - assert(*data == block); + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); } return NULL; @@ -102,7 +101,7 @@ break; /* That next element had better be higher than the previous highest */ - assert(blocks[min_thread_idx][next[min_thread_idx]] > highest); + ASSERT(blocks[min_thread_idx][next[min_thread_idx]] > highest); highest = blocks[min_thread_idx][next[min_thread_idx]]; next[min_thread_idx]++; @@ -111,14 +110,15 @@ static void run_test() { - struct anv_instance instance = { }; + struct anv_physical_device physical_device = { }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_block_pool pool; pthread_mutex_init(&device.mutex, NULL); - anv_block_pool_init(&pool, &device, 4096, 4096, 0); + anv_bo_cache_init(&device.bo_cache); + anv_block_pool_init(&pool, &device, 4096, 4096); for (unsigned i = 0; i < NUM_THREADS; i++) { jobs[i].pool = &pool; diff -Nru mesa-19.2.8/src/intel/vulkan/tests/state_pool.c mesa-20.0.8/src/intel/vulkan/tests/state_pool.c --- mesa-19.2.8/src/intel/vulkan/tests/state_pool.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/state_pool.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,11 +21,10 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include #include "anv_private.h" +#include "test_common.h" #define NUM_THREADS 8 #define STATES_PER_THREAD_LOG2 10 @@ -36,16 +35,17 @@ int main(int argc, char **argv) { - struct anv_instance instance = { }; + struct anv_physical_device physical_device = { }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_state_pool state_pool; pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache); for (unsigned i = 0; i < NUM_RUNS; i++) { - anv_state_pool_init(&state_pool, &device, 4096, 256, 0); + anv_state_pool_init(&state_pool, &device, 4096, 256); /* Grab one so a zero offset is impossible */ anv_state_pool_alloc(&state_pool, 16, 16); diff -Nru mesa-19.2.8/src/intel/vulkan/tests/state_pool_free_list_only.c mesa-20.0.8/src/intel/vulkan/tests/state_pool_free_list_only.c --- mesa-19.2.8/src/intel/vulkan/tests/state_pool_free_list_only.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/state_pool_free_list_only.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,11 +21,10 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include #include "anv_private.h" +#include "test_common.h" #define NUM_THREADS 8 #define STATES_PER_THREAD_LOG2 12 @@ -35,14 +34,15 @@ int main(int argc, char **argv) { - struct anv_instance instance = { }; + struct anv_physical_device physical_device = { }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_state_pool state_pool; pthread_mutex_init(&device.mutex, NULL); - anv_state_pool_init(&state_pool, &device, 4096, 4096, 0); + anv_bo_cache_init(&device.bo_cache); + anv_state_pool_init(&state_pool, &device, 4096, 4096); /* Grab one so a zero offset is impossible */ anv_state_pool_alloc(&state_pool, 16, 16); @@ -54,7 +54,7 @@ struct anv_state states[NUM_THREADS * STATES_PER_THREAD]; for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) { states[i] = anv_state_pool_alloc(&state_pool, 16, 16); - assert(states[i].offset != 0); + ASSERT(states[i].offset != 0); } for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) diff -Nru mesa-19.2.8/src/intel/vulkan/tests/state_pool_no_free.c mesa-20.0.8/src/intel/vulkan/tests/state_pool_no_free.c --- mesa-19.2.8/src/intel/vulkan/tests/state_pool_no_free.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/state_pool_no_free.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,11 +21,10 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include #include "anv_private.h" +#include "test_common.h" #define NUM_THREADS 16 #define STATES_PER_THREAD 1024 @@ -56,14 +55,15 @@ static void run_test() { - struct anv_instance instance = { }; + struct anv_physical_device physical_device = { }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_state_pool state_pool; pthread_mutex_init(&device.mutex, NULL); - anv_state_pool_init(&state_pool, &device, 4096, 64, 0); + anv_bo_cache_init(&device.bo_cache); + anv_state_pool_init(&state_pool, &device, 4096, 64); pthread_barrier_init(&barrier, NULL, NUM_THREADS); @@ -102,7 +102,7 @@ break; /* That next element had better be higher than the previous highest */ - assert(jobs[max_thread_idx].offsets[next[max_thread_idx]] > highest); + ASSERT(jobs[max_thread_idx].offsets[next[max_thread_idx]] > highest); highest = jobs[max_thread_idx].offsets[next[max_thread_idx]]; next[max_thread_idx]++; diff -Nru mesa-19.2.8/src/intel/vulkan/tests/state_pool_padding.c mesa-20.0.8/src/intel/vulkan/tests/state_pool_padding.c --- mesa-19.2.8/src/intel/vulkan/tests/state_pool_padding.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/state_pool_padding.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,23 +21,22 @@ * IN THE SOFTWARE. */ -#undef NDEBUG - #include "anv_private.h" +#include "test_common.h" int main(int argc, char **argv) { - struct anv_instance instance = { - .physicalDevice = { - .use_softpin = true, - }, + struct anv_physical_device physical_device = { + .use_softpin = true, }; struct anv_device device = { - .instance = &instance, + .physical = &physical_device, }; struct anv_state_pool state_pool; - anv_state_pool_init(&state_pool, &device, 4096, 4096, EXEC_OBJECT_PINNED); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache); + anv_state_pool_init(&state_pool, &device, 4096, 4096); /* Get the size of the underlying block_pool */ struct anv_block_pool *bp = &state_pool.block_pool; @@ -50,30 +49,30 @@ struct anv_state state = anv_state_pool_alloc(&state_pool, pool_size, 16); /* The pool must have grown */ - assert(bp->size > pool_size); + ASSERT(bp->size > pool_size); /* And the state must have been allocated at the end of the original size */ - assert(state.offset == pool_size); + ASSERT(state.offset == pool_size); /* A new allocation that fits into the returned empty space should have an * offset within the original pool size */ state = anv_state_pool_alloc(&state_pool, 4096, 16); - assert(state.offset + state.alloc_size <= pool_size); + ASSERT(state.offset + state.alloc_size <= pool_size); /* We should be able to allocate pool->block_size'd chunks in the returned area */ int left_chunks = pool_size / 4096 - 2; for (int i = 0; i < left_chunks; i++) { state = anv_state_pool_alloc(&state_pool, 4096, 16); - assert(state.offset + state.alloc_size <= pool_size); + ASSERT(state.offset + state.alloc_size <= pool_size); } /* Now the next chunk to be allocated should make the pool grow again */ pool_size = bp->size; state = anv_state_pool_alloc(&state_pool, 4096, 16); - assert(bp->size > pool_size); - assert(state.offset == pool_size); + ASSERT(bp->size > pool_size); + ASSERT(state.offset == pool_size); anv_state_pool_finish(&state_pool); } diff -Nru mesa-19.2.8/src/intel/vulkan/tests/state_pool_test_helper.h mesa-20.0.8/src/intel/vulkan/tests/state_pool_test_helper.h --- mesa-19.2.8/src/intel/vulkan/tests/state_pool_test_helper.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/state_pool_test_helper.h 2020-06-12 01:21:17.000000000 +0000 @@ -46,7 +46,7 @@ for (unsigned i = 0; i < chunk_size; i++) { states[i] = anv_state_pool_alloc(job->pool, 16, 16); memset(states[i].map, 139, 16); - assert(states[i].offset != 0); + ASSERT(states[i].offset != 0); } for (unsigned i = 0; i < chunk_size; i++) diff -Nru mesa-19.2.8/src/intel/vulkan/tests/test_common.h mesa-20.0.8/src/intel/vulkan/tests/test_common.h --- mesa-19.2.8/src/intel/vulkan/tests/test_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/intel/vulkan/tests/test_common.h 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,34 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#define ASSERT(cond) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \ + __FILE__, __LINE__, # cond); \ + abort(); \ + } \ + } while (false) diff -Nru mesa-19.2.8/src/loader/loader.c mesa-20.0.8/src/loader/loader.c --- mesa-19.2.8/src/loader/loader.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/loader.c 2020-06-12 01:21:17.000000000 +0000 @@ -36,6 +36,7 @@ #include #include #include +#include #include #ifdef MAJOR_IN_MKDEV #include @@ -55,9 +56,16 @@ #endif #endif +#include "util/macros.h" + #define __IS_LOADER #include "pci_id_driver_map.h" +/* For systems like Hurd */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + static void default_logger(int level, const char *fmt, ...) { if (level <= _LOADER_WARNING) { @@ -106,6 +114,16 @@ #endif } +bool +is_kernel_i915(int fd) +{ + char *kernel_driver = loader_get_kernel_driver_name(fd); + bool is_i915 = kernel_driver && strcmp(kernel_driver, "i915") == 0; + + free(kernel_driver); + return is_i915; +} + #if defined(HAVE_LIBDRM) int loader_open_render_node(const char *name) @@ -385,27 +403,27 @@ #if defined(HAVE_LIBDRM) -static int +static bool drm_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id) { drmDevicePtr device; - int ret; + bool ret; if (drmGetDevice2(fd, 0, &device) == 0) { if (device->bustype == DRM_BUS_PCI) { *vendor_id = device->deviceinfo.pci->vendor_id; *chip_id = device->deviceinfo.pci->device_id; - ret = 1; + ret = true; } else { log_(_LOADER_DEBUG, "MESA-LOADER: device is not located on the PCI bus\n"); - ret = 0; + ret = false; } drmFreeDevice(&device); } else { log_(_LOADER_WARNING, "MESA-LOADER: failed to retrieve device information\n"); - ret = 0; + ret = false; } return ret; @@ -413,14 +431,13 @@ #endif -int +bool loader_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id) { #if HAVE_LIBDRM - if (drm_get_pci_id_for_fd(fd, vendor_id, chip_id)) - return 1; + return drm_get_pci_id_for_fd(fd, vendor_id, chip_id); #endif - return 0; + return false; } char * @@ -465,7 +482,7 @@ return driver; } - for (i = 0; driver_map[i].driver; i++) { + for (i = 0; i < ARRAY_SIZE(driver_map); i++) { if (vendor_id != driver_map[i].vendor_id) continue; diff -Nru mesa-19.2.8/src/loader/loader_dri3_helper.c mesa-20.0.8/src/loader/loader_dri3_helper.c --- mesa-19.2.8/src/loader/loader_dri3_helper.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/loader_dri3_helper.c 2020-06-12 01:21:17.000000000 +0000 @@ -763,7 +763,7 @@ if (flush) flags |= __DRI2_FLUSH_CONTEXT; - loader_dri3_flush(draw, flags, __DRI2_THROTTLE_SWAPBUFFER); + loader_dri3_flush(draw, flags, __DRI2_THROTTLE_COPYSUBBUFFER); back = dri3_find_back_alloc(draw); if (!back) @@ -817,7 +817,7 @@ xcb_drawable_t dest, xcb_drawable_t src) { - loader_dri3_flush(draw, __DRI2_FLUSH_DRAWABLE, 0); + loader_dri3_flush(draw, __DRI2_FLUSH_DRAWABLE, __DRI2_THROTTLE_COPYSUBBUFFER); dri3_fence_reset(draw->conn, dri3_fake_front_buffer(draw)); dri3_copy_area(draw->conn, @@ -1119,7 +1119,11 @@ case __DRI_IMAGE_FORMAT_ABGR2101010: case __DRI_IMAGE_FORMAT_SARGB8: case __DRI_IMAGE_FORMAT_SABGR8: + case __DRI_IMAGE_FORMAT_SXRGB8: return 4; + case __DRI_IMAGE_FORMAT_XBGR16161616F: + case __DRI_IMAGE_FORMAT_ABGR16161616F: + return 8; case __DRI_IMAGE_FORMAT_NONE: default: return 0; @@ -1157,27 +1161,30 @@ } /* the DRIimage createImage function takes __DRI_IMAGE_FORMAT codes, while - * the createImageFromFds call takes __DRI_IMAGE_FOURCC codes. To avoid + * the createImageFromFds call takes DRM_FORMAT codes. To avoid * complete confusion, just deal in __DRI_IMAGE_FORMAT codes for now and - * translate to __DRI_IMAGE_FOURCC codes in the call to createImageFromFds + * translate to DRM_FORMAT codes in the call to createImageFromFds */ static int image_format_to_fourcc(int format) { - /* Convert from __DRI_IMAGE_FORMAT to __DRI_IMAGE_FOURCC (sigh) */ + /* Convert from __DRI_IMAGE_FORMAT to DRM_FORMAT (sigh) */ switch (format) { case __DRI_IMAGE_FORMAT_SARGB8: return __DRI_IMAGE_FOURCC_SARGB8888; case __DRI_IMAGE_FORMAT_SABGR8: return __DRI_IMAGE_FOURCC_SABGR8888; - case __DRI_IMAGE_FORMAT_RGB565: return __DRI_IMAGE_FOURCC_RGB565; - case __DRI_IMAGE_FORMAT_XRGB8888: return __DRI_IMAGE_FOURCC_XRGB8888; - case __DRI_IMAGE_FORMAT_ARGB8888: return __DRI_IMAGE_FOURCC_ARGB8888; - case __DRI_IMAGE_FORMAT_ABGR8888: return __DRI_IMAGE_FOURCC_ABGR8888; - case __DRI_IMAGE_FORMAT_XBGR8888: return __DRI_IMAGE_FOURCC_XBGR8888; - case __DRI_IMAGE_FORMAT_XRGB2101010: return __DRI_IMAGE_FOURCC_XRGB2101010; - case __DRI_IMAGE_FORMAT_ARGB2101010: return __DRI_IMAGE_FOURCC_ARGB2101010; - case __DRI_IMAGE_FORMAT_XBGR2101010: return __DRI_IMAGE_FOURCC_XBGR2101010; - case __DRI_IMAGE_FORMAT_ABGR2101010: return __DRI_IMAGE_FOURCC_ABGR2101010; + case __DRI_IMAGE_FORMAT_SXRGB8: return __DRI_IMAGE_FOURCC_SXRGB8888; + case __DRI_IMAGE_FORMAT_RGB565: return DRM_FORMAT_RGB565; + case __DRI_IMAGE_FORMAT_XRGB8888: return DRM_FORMAT_XRGB8888; + case __DRI_IMAGE_FORMAT_ARGB8888: return DRM_FORMAT_ARGB8888; + case __DRI_IMAGE_FORMAT_ABGR8888: return DRM_FORMAT_ABGR8888; + case __DRI_IMAGE_FORMAT_XBGR8888: return DRM_FORMAT_XBGR8888; + case __DRI_IMAGE_FORMAT_XRGB2101010: return DRM_FORMAT_XRGB2101010; + case __DRI_IMAGE_FORMAT_ARGB2101010: return DRM_FORMAT_ARGB2101010; + case __DRI_IMAGE_FORMAT_XBGR2101010: return DRM_FORMAT_XBGR2101010; + case __DRI_IMAGE_FORMAT_ABGR2101010: return DRM_FORMAT_ABGR2101010; + case __DRI_IMAGE_FORMAT_XBGR16161616F: return DRM_FORMAT_XBGR16161616F; + case __DRI_IMAGE_FORMAT_ABGR16161616F: return DRM_FORMAT_ABGR16161616F; } return 0; } @@ -1387,6 +1394,8 @@ image = pixmap_buffer; } + buffer_fds[i] = -1; + ret = draw->ext->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &buffer_fds[i]); ret &= draw->ext->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, @@ -1459,7 +1468,8 @@ no_buffer_attrib: do { - close(buffer_fds[i]); + if (buffer_fds[i] != -1) + close(buffer_fds[i]); } while (--i >= 0); draw->ext->image->destroyImage(pixmap_buffer); no_linear_buffer: @@ -1836,7 +1846,9 @@ if (!loader_dri3_blit_image(draw, new_buffer->image, buffer->image, - 0, 0, draw->width, draw->height, + 0, 0, + MIN2(buffer->width, new_buffer->width), + MIN2(buffer->height, new_buffer->height), 0, 0, 0) && !buffer->linear_buffer) { dri3_fence_reset(draw->conn, new_buffer); diff -Nru mesa-19.2.8/src/loader/loader.h mesa-20.0.8/src/loader/loader.h --- mesa-19.2.8/src/loader/loader.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/loader.h 2020-06-12 01:21:17.000000000 +0000 @@ -43,7 +43,7 @@ int loader_open_render_node(const char *name); -int +bool loader_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id); char * diff -Nru mesa-19.2.8/src/loader/meson.build mesa-20.0.8/src/loader/meson.build --- mesa-19.2.8/src/loader/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -23,7 +23,7 @@ if with_platform_x11 and with_dri3 libloader_dri3_helper = static_library( 'loader_dri3_helper', - ['loader_dri3_helper.c', 'loader_dri3_helper.h'], + 'loader_dri3_helper.c', c_args : c_vis_args, include_directories : [inc_include, inc_src], dependencies : [ @@ -35,14 +35,20 @@ libloader_dri3_helper = [] endif +loader_c_args = [ + c_vis_args, '-DUSE_DRICONF', + '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path), +] + +if with_gallium_iris and get_option('prefer-iris') + loader_c_args += ['-DPREFER_IRIS'] +endif + libloader = static_library( 'loader', - ['loader.c', 'loader.h', 'pci_id_driver_map.c', 'pci_id_driver_map.h', - xmlpool_options_h], - c_args : [c_vis_args, '-DUSE_DRICONF', - '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path), -], + ['loader.c', 'pci_id_driver_map.c'], + c_args : loader_c_args, include_directories : [inc_include, inc_src, inc_util], - dependencies : [dep_libdrm, dep_thread], + dependencies : [dep_libdrm, dep_thread, idep_xmlconfig_headers], build_by_default : false, ) diff -Nru mesa-19.2.8/src/loader/pci_id_driver_map.c mesa-20.0.8/src/loader/pci_id_driver_map.c --- mesa-19.2.8/src/loader/pci_id_driver_map.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/pci_id_driver_map.c 2020-06-12 01:21:17.000000000 +0000 @@ -21,7 +21,9 @@ * SOFTWARE. */ -int is_nouveau_vieux(int fd); +#include + +bool is_nouveau_vieux(int fd); #ifdef HAVE_LIBDRM @@ -42,7 +44,7 @@ return gp.value; } -int +bool is_nouveau_vieux(int fd) { int chipset = nouveau_chipset(fd); @@ -52,6 +54,6 @@ #else -int is_nouveau_vieux(int fd) { return 0; } +bool is_nouveau_vieux(int fd) { return false; } #endif diff -Nru mesa-19.2.8/src/loader/pci_id_driver_map.h mesa-20.0.8/src/loader/pci_id_driver_map.h --- mesa-19.2.8/src/loader/pci_id_driver_map.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/loader/pci_id_driver_map.h 2020-06-12 01:21:17.000000000 +0000 @@ -1,12 +1,9 @@ #ifndef _PCI_ID_DRIVER_MAP_H_ #define _PCI_ID_DRIVER_MAP_H_ +#include #include -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) -#endif - #ifndef __IS_LOADER # error "Only include from loader.c" #endif @@ -18,19 +15,11 @@ }; static const int i965_chip_ids[] = { -#define CHIPSET(chip, family, name) chip, +#define CHIPSET(chip, family, family_str, name) chip, #include "pci_ids/i965_pci_ids.h" #undef CHIPSET }; -static const int iris_chip_ids[] = { -#define CHIPSET(chip, family, name) chip, -#define IRIS 1 -#include "pci_ids/i965_pci_ids.h" -#undef IRIS -#undef CHIPSET -}; - static const int r100_chip_ids[] = { #define CHIPSET(chip, name, family) chip, #include "pci_ids/radeon_pci_ids.h" @@ -55,12 +44,6 @@ #undef CHIPSET }; -static const int radeonsi_chip_ids[] = { -#define CHIPSET(chip, family) chip, -#include "pci_ids/radeonsi_pci_ids.h" -#undef CHIPSET -}; - static const int virtio_gpu_chip_ids[] = { #define CHIPSET(chip, name, family) chip, #include "pci_ids/virtio_gpu_pci_ids.h" @@ -73,28 +56,28 @@ #undef CHIPSET }; -int is_nouveau_vieux(int fd); +bool is_nouveau_vieux(int fd); +bool is_kernel_i915(int fd); static const struct { int vendor_id; const char *driver; const int *chip_ids; int num_chips_ids; - int (*predicate)(int fd); + bool (*predicate)(int fd); } driver_map[] = { { 0x8086, "i915", i915_chip_ids, ARRAY_SIZE(i915_chip_ids) }, { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) }, - { 0x8086, "iris", iris_chip_ids, ARRAY_SIZE(iris_chip_ids) }, + { 0x8086, "iris", NULL, -1, is_kernel_i915 }, { 0x1002, "radeon", r100_chip_ids, ARRAY_SIZE(r100_chip_ids) }, { 0x1002, "r200", r200_chip_ids, ARRAY_SIZE(r200_chip_ids) }, { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) }, { 0x1002, "r600", r600_chip_ids, ARRAY_SIZE(r600_chip_ids) }, - { 0x1002, "radeonsi", radeonsi_chip_ids, ARRAY_SIZE(radeonsi_chip_ids) }, + { 0x1002, "radeonsi", NULL, -1 }, { 0x10de, "nouveau_vieux", NULL, -1, is_nouveau_vieux }, { 0x10de, "nouveau", NULL, -1, }, { 0x1af4, "virtio_gpu", virtio_gpu_chip_ids, ARRAY_SIZE(virtio_gpu_chip_ids) }, { 0x15ad, "vmwgfx", vmwgfx_chip_ids, ARRAY_SIZE(vmwgfx_chip_ids) }, - { 0x0000, NULL, NULL, 0 }, }; #endif /* _PCI_ID_DRIVER_MAP_H_ */ diff -Nru mesa-19.2.8/src/mapi/entry.c mesa-20.0.8/src/mapi/entry.c --- mesa-19.2.8/src/mapi/entry.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/entry.c 2020-06-12 01:21:17.000000000 +0000 @@ -53,7 +53,7 @@ # endif #elif defined(USE_X86_64_ASM) && defined(__GNUC__) && defined(USE_ELF_TLS) # include "entry_x86-64_tls.h" -#elif defined(USE_PPC64LE_ASM) && defined(__GNUC__) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#elif defined(USE_PPC64LE_ASM) && defined(__GNUC__) && UTIL_ARCH_LITTLE_ENDIAN # ifdef USE_ELF_TLS # include "entry_ppc64le_tls.h" # else diff -Nru mesa-19.2.8/src/mapi/entry_x86_tls.h mesa-20.0.8/src/mapi/entry_x86_tls.h --- mesa-19.2.8/src/mapi/entry_x86_tls.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/entry_x86_tls.h 2020-06-12 01:21:17.000000000 +0000 @@ -33,6 +33,8 @@ #define HIDDEN #endif +#define X86_ENTRY_SIZE 32 + __asm__(".text"); __asm__("x86_current_tls:\n\t" @@ -56,9 +58,13 @@ ".balign 16\n" \ func ":" -#define STUB_ASM_CODE(slot) \ - "call x86_current_tls\n\t" \ - "movl %gs:(%eax), %eax\n\t" \ +#define STUB_ASM_CODE(slot) \ + "call 1f\n" \ + "1:\n\t" \ + "popl %eax\n\t" \ + "addl $_GLOBAL_OFFSET_TABLE_+[.-1b], %eax\n\t" \ + "movl " ENTRY_CURRENT_TABLE "@GOTNTPOFF(%eax), %eax\n\t" \ + "movl %gs:(%eax), %eax\n\t" \ "jmp *(4 * " slot ")(%eax)" #define MAPI_TMP_STUB_ASM_GCC @@ -80,27 +86,25 @@ extern char x86_entry_start[] HIDDEN; extern char x86_entry_end[] HIDDEN; +static inline mapi_func +entry_generate_or_patch(int, char *, size_t); + void entry_patch_public(void) { #ifndef GLX_X86_READONLY_TEXT - char patch[8] = { - 0x65, 0xa1, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0, %eax */ - 0x90, 0x90 /* nop's */ - }; char *entry; - - *((unsigned long *) (patch + 2)) = x86_current_tls(); - - for (entry = x86_entry_start; entry < x86_entry_end; entry += 16) - memcpy(entry, patch, sizeof(patch)); + int slot = 0; + for (entry = x86_entry_start; entry < x86_entry_end; + entry += X86_ENTRY_SIZE, ++slot) + entry_generate_or_patch(slot, entry, X86_ENTRY_SIZE); #endif } mapi_func entry_get_public(int slot) { - return (mapi_func) (x86_entry_start + slot * 16); + return (mapi_func) (x86_entry_start + slot * X86_ENTRY_SIZE); } void @@ -110,19 +114,21 @@ *((unsigned long *) (code + 8)) = slot * sizeof(mapi_func); } -mapi_func -entry_generate(int slot) +static inline mapi_func +entry_generate_or_patch(int slot, char *code, size_t size) { const char code_templ[16] = { 0x65, 0xa1, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0, %eax */ 0xff, 0xa0, 0x34, 0x12, 0x00, 0x00, /* jmp *0x1234(%eax) */ 0x90, 0x90, 0x90, 0x90 /* nop's */ }; - char *code; mapi_func entry; - code = u_execmem_alloc(sizeof(code_templ)); - if (!code) + if (code == NULL) { + size = sizeof(code_templ); + code = u_execmem_alloc(size); + } + if (!code || size < sizeof(code_templ)) return NULL; memcpy(code, code_templ, sizeof(code_templ)); @@ -134,4 +140,10 @@ return entry; } +mapi_func +entry_generate(int slot) +{ + return entry_generate_or_patch(slot, NULL, 0); +} + #endif /* MAPI_MODE_BRIDGE */ diff -Nru mesa-19.2.8/src/mapi/entry_x86_tsd.h mesa-20.0.8/src/mapi/entry_x86_tsd.h --- mesa-19.2.8/src/mapi/entry_x86_tsd.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/entry_x86_tsd.h 2020-06-12 01:21:17.000000000 +0000 @@ -31,7 +31,7 @@ #define HIDDEN #endif -#define X86_ENTRY_SIZE 32 +#define X86_ENTRY_SIZE 64 __asm__(".text\n" ".balign 32\n" @@ -44,12 +44,19 @@ func ":" #define STUB_ASM_CODE(slot) \ - "movl " ENTRY_CURRENT_TABLE ", %eax\n\t" \ + "call 1f\n\t" \ + "1:\n\t" \ + "popl %ecx\n\t" \ + "addl $_GLOBAL_OFFSET_TABLE_+[.-1b], %ecx\n\t" \ + "movl " ENTRY_CURRENT_TABLE "@GOT(%ecx), %eax\n\t" \ + "mov (%eax), %eax\n\t" \ "testl %eax, %eax\n\t" \ - "je 1f\n\t" \ - "jmp *(4 * " slot ")(%eax)\n" \ + "jne 1f\n\t" \ + "push %ebx\n\t" \ + "movl %ecx, %ebx\n\t" \ + "call " ENTRY_CURRENT_TABLE_GET "@PLT\n\t" \ + "popl %ebx\n\t" \ "1:\n\t" \ - "call " ENTRY_CURRENT_TABLE_GET "\n\t" \ "jmp *(4 * " slot ")(%eax)" #define MAPI_TMP_STUB_ASM_GCC diff -Nru mesa-19.2.8/src/mapi/es1api/meson.build mesa-20.0.8/src/mapi/es1api/meson.build --- mesa-19.2.8/src/mapi/es1api/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/es1api/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -27,11 +27,19 @@ capture : true, ) +_es1_c_args = [] +if with_platform_windows + _es1_c_args += ['-D_GDI32_', '-DBUILD_GL32'] +endif + libglesv1_cm = shared_library( 'GLESv1_CM' + get_option('gles-lib-suffix'), ['../entry.c', es1_glapi_mapi_tmp_h], c_args : [ - c_msvc_compat_args, c_vis_args, '-DMAPI_MODE_BRIDGE', + c_msvc_compat_args, + c_vis_args, + _es1_c_args, + '-DMAPI_MODE_BRIDGE', '-DMAPI_ABI_HEADER="@0@"'.format(es1_glapi_mapi_tmp_h.full_path()), gcc_lto_quirk, ], @@ -39,7 +47,9 @@ include_directories : [inc_src, inc_include, inc_mapi], link_with : libglapi, dependencies : [dep_thread, dep_libdrm, dep_m, dep_dl], + soversion : host_machine.system() == 'windows' ? '' : '1', version : '1.1.0', + name_prefix : 'lib', install : true, ) @@ -52,14 +62,14 @@ libraries_private : gl_priv_libs, ) -if with_tests and prog_nm.found() +if with_symbols_check test( 'es1-ABI-check', symbols_check, args : [ '--lib', libglesv1_cm, '--symbols-file', files('gles1-symbols.txt'), - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['mapi'], ) diff -Nru mesa-19.2.8/src/mapi/es2api/meson.build mesa-20.0.8/src/mapi/es2api/meson.build --- mesa-19.2.8/src/mapi/es2api/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/es2api/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -27,11 +27,19 @@ capture : true, ) +_es2_c_args = [] +if with_platform_windows + _es2_c_args += ['-D_GDI32_', '-DBUILD_GL32'] +endif + libgles2 = shared_library( 'GLESv2' + get_option('gles-lib-suffix'), ['../entry.c', es2_glapi_mapi_tmp_h], c_args : [ - c_msvc_compat_args, c_vis_args, '-DMAPI_MODE_BRIDGE', + c_msvc_compat_args, + c_vis_args, + _es2_c_args, + '-DMAPI_MODE_BRIDGE', '-DMAPI_ABI_HEADER="@0@"'.format(es2_glapi_mapi_tmp_h.full_path()), gcc_lto_quirk, ], @@ -39,7 +47,9 @@ include_directories : [inc_src, inc_include, inc_mapi], link_with : libglapi, dependencies : [dep_thread, dep_libdrm, dep_m, dep_dl], + soversion : host_machine.system() == 'windows' ? '' : '2', version : '2.0.0', + name_prefix : 'lib', install : true, ) @@ -52,14 +62,14 @@ libraries_private : gl_priv_libs, ) -if with_tests and prog_nm.found() +if with_symbols_check test( 'es2-ABI-check', symbols_check, args : [ '--lib', libgles2, '--symbols-file', files('gles2-symbols.txt'), - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['mapi'], ) diff -Nru mesa-19.2.8/src/mapi/glapi/gen/apiexec.py mesa-20.0.8/src/mapi/glapi/gen/apiexec.py --- mesa-19.2.8/src/mapi/glapi/gen/apiexec.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/apiexec.py 2020-06-12 01:21:17.000000000 +0000 @@ -148,7 +148,11 @@ # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments. Mesa can expose the # extension with OpenGL 3.0. "FramebufferParameteri": exec_info(compatibility=30, core=31, es2=31), - "GetFramebufferParameteri": exec_info(compatibility=30, core=31, es2=31), + "GetFramebufferParameteriv": exec_info(compatibility=30, core=31, es2=31), + + # OpenGL 4.3 / GL_MESA_framebuffer_flip_y. + "FramebufferParameteriMESA": exec_info(core=31, es2=30), + "GetFramebufferParameterivMESA": exec_info(core=31, es2=30), # OpenGL 4.5 / GL_ARB_direct_state_access. Mesa can expose the extension # with core profile. @@ -250,44 +254,43 @@ "GetQueryBufferObjecti64v": exec_info(compatibility=31, core=31), "GetQueryBufferObjectui64v": exec_info(compatibility=31, core=31), - # GL_ARB_gpu_shader_int64 - nominally requires OpenGL 4.0, and Mesa - # only supports 4.0 in core profile. - "Uniform1i64ARB": exec_info(core=31), - "Uniform2i64ARB": exec_info(core=31), - "Uniform3i64ARB": exec_info(core=31), - "Uniform4i64ARB": exec_info(core=31), - "Uniform1i64vARB": exec_info(core=31), - "Uniform2i64vARB": exec_info(core=31), - "Uniform3i64vARB": exec_info(core=31), - "Uniform4i64vARB": exec_info(core=31), - "Uniform1ui64ARB": exec_info(core=31), - "Uniform2ui64ARB": exec_info(core=31), - "Uniform3ui64ARB": exec_info(core=31), - "Uniform4ui64ARB": exec_info(core=31), - "Uniform1ui64vARB": exec_info(core=31), - "Uniform2ui64vARB": exec_info(core=31), - "Uniform3ui64vARB": exec_info(core=31), - "Uniform4ui64vARB": exec_info(core=31), - "GetUniformi64vARB": exec_info(core=31), - "GetUniformui64vARB": exec_info(core=31), - "GetnUniformi64vARB": exec_info(core=31), - "GetnUniformui64vARB": exec_info(core=31), - "ProgramUniform1i64ARB": exec_info(core=31), - "ProgramUniform2i64ARB": exec_info(core=31), - "ProgramUniform3i64ARB": exec_info(core=31), - "ProgramUniform4i64ARB": exec_info(core=31), - "ProgramUniform1i64vARB": exec_info(core=31), - "ProgramUniform2i64vARB": exec_info(core=31), - "ProgramUniform3i64vARB": exec_info(core=31), - "ProgramUniform4i64vARB": exec_info(core=31), - "ProgramUniform1ui64ARB": exec_info(core=31), - "ProgramUniform2ui64ARB": exec_info(core=31), - "ProgramUniform3ui64ARB": exec_info(core=31), - "ProgramUniform4ui64ARB": exec_info(core=31), - "ProgramUniform1ui64vARB": exec_info(core=31), - "ProgramUniform2ui64vARB": exec_info(core=31), - "ProgramUniform3ui64vARB": exec_info(core=31), - "ProgramUniform4ui64vARB": exec_info(core=31), + # GL_ARB_gpu_shader_int64 - nominally requires OpenGL 4.0 + "Uniform1i64ARB": exec_info(compatibility=40, core=31), + "Uniform2i64ARB": exec_info(compatibility=40, core=31), + "Uniform3i64ARB": exec_info(compatibility=40, core=31), + "Uniform4i64ARB": exec_info(compatibility=40, core=31), + "Uniform1i64vARB": exec_info(compatibility=40, core=31), + "Uniform2i64vARB": exec_info(compatibility=40, core=31), + "Uniform3i64vARB": exec_info(compatibility=40, core=31), + "Uniform4i64vARB": exec_info(compatibility=40, core=31), + "Uniform1ui64ARB": exec_info(compatibility=40, core=31), + "Uniform2ui64ARB": exec_info(compatibility=40, core=31), + "Uniform3ui64ARB": exec_info(compatibility=40, core=31), + "Uniform4ui64ARB": exec_info(compatibility=40, core=31), + "Uniform1ui64vARB": exec_info(compatibility=40, core=31), + "Uniform2ui64vARB": exec_info(compatibility=40, core=31), + "Uniform3ui64vARB": exec_info(compatibility=40, core=31), + "Uniform4ui64vARB": exec_info(compatibility=40, core=31), + "GetUniformi64vARB": exec_info(compatibility=40, core=31), + "GetUniformui64vARB": exec_info(compatibility=40, core=31), + "GetnUniformi64vARB": exec_info(compatibility=40, core=31), + "GetnUniformui64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform1i64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform2i64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform3i64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform4i64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform1i64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform2i64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform3i64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform4i64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform1ui64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform2ui64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform3ui64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform4ui64ARB": exec_info(compatibility=40, core=31), + "ProgramUniform1ui64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform2ui64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform3ui64vARB": exec_info(compatibility=40, core=31), + "ProgramUniform4ui64vARB": exec_info(compatibility=40, core=31), # GL_ARB_bindless_texture "GetVertexAttribLui64vARB": exec_info(compatibility=30, core=31), diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_clear_buffer_object.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_clear_buffer_object.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_clear_buffer_object.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_clear_buffer_object.xml 2020-06-12 01:21:17.000000000 +0000 @@ -26,7 +26,7 @@ - + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml 2020-06-12 01:21:17.000000000 +0000 @@ -27,6 +27,18 @@ + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml 2020-06-12 01:21:17.000000000 +0000 @@ -124,6 +124,137 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_instanced_arrays.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_instanced_arrays.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_instanced_arrays.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_instanced_arrays.xml 2020-06-12 01:21:17.000000000 +0000 @@ -15,6 +15,12 @@ + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_multi_bind.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_multi_bind.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_multi_bind.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_multi_bind.xml 2020-06-12 01:21:17.000000000 +0000 @@ -11,42 +11,42 @@ - + - - - + + + - + - + - + - - - + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_shading_language_include.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_shading_language_include.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_shading_language_include.xml 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_shading_language_include.xml 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_sparse_buffer.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_sparse_buffer.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_sparse_buffer.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_sparse_buffer.xml 2020-06-12 01:21:17.000000000 +0000 @@ -12,8 +12,14 @@ - - + + + + + + + + @@ -21,7 +27,7 @@ - + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_texture_buffer_range.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_texture_buffer_range.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_texture_buffer_range.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_texture_buffer_range.xml 2020-06-12 01:21:17.000000000 +0000 @@ -17,6 +17,15 @@ + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml 2020-06-12 01:21:17.000000000 +0000 @@ -26,6 +26,27 @@ + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml 2020-06-12 01:21:17.000000000 +0000 @@ -64,6 +64,16 @@ + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml mesa-20.0.8/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml --- mesa-19.2.8/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml 2020-06-12 01:21:17.000000000 +0000 @@ -46,6 +46,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/es_EXT.xml mesa-20.0.8/src/mapi/glapi/gen/es_EXT.xml --- mesa-19.2.8/src/mapi/glapi/gen/es_EXT.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/es_EXT.xml 2020-06-12 01:21:17.000000000 +0000 @@ -864,6 +864,12 @@ + + + + + + @@ -1499,4 +1505,19 @@ + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/EXT_direct_state_access.xml mesa-20.0.8/src/mapi/glapi/gen/EXT_direct_state_access.xml --- mesa-19.2.8/src/mapi/glapi/gen/EXT_direct_state_access.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/EXT_direct_state_access.xml 2020-06-12 01:21:17.000000000 +0000 @@ -102,6 +102,14 @@ + + + + + + + + @@ -895,6 +903,11 @@ + + + @@ -987,5 +1000,359 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/EXT_EGL_image_storage.xml mesa-20.0.8/src/mapi/glapi/gen/EXT_EGL_image_storage.xml --- mesa-19.2.8/src/mapi/glapi/gen/EXT_EGL_image_storage.xml 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/EXT_EGL_image_storage.xml 2020-06-12 01:21:17.000000000 +0000 @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/gl_API.xml mesa-20.0.8/src/mapi/glapi/gen/gl_API.xml --- mesa-19.2.8/src/mapi/glapi/gen/gl_API.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/gl_API.xml 2020-06-12 01:21:17.000000000 +0000 @@ -8059,7 +8059,8 @@ - + + @@ -10932,12 +10933,12 @@ - + - + @@ -10984,12 +10985,12 @@ - + - + @@ -13259,6 +13260,7 @@ + diff -Nru mesa-19.2.8/src/mapi/glapi/gen/gl_XML.py mesa-20.0.8/src/mapi/glapi/gen/gl_XML.py --- mesa-19.2.8/src/mapi/glapi/gen/gl_XML.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/gl_XML.py 2020-06-12 01:21:17.000000000 +0000 @@ -565,7 +565,14 @@ def size_string(self, use_parens = 1): - s = self.size() + base_size_str = "" + + count = self.get_element_count() + if count: + base_size_str = "%d * " % count + + base_size_str += "sizeof(%s)" % ( self.get_base_type_string() ) + if self.counter or self.count_parameter_list: list = [ "compsize" ] @@ -574,8 +581,8 @@ elif self.counter: list = [ self.counter ] - if s > 1: - list.append( str(s) ) + if self.size() > 1: + list.append( base_size_str ) if len(list) > 1 and use_parens : return "safe_mul(%s)" % ", ".join(list) @@ -585,7 +592,7 @@ elif self.is_image(): return "compsize" else: - return str(s) + return base_size_str def format_string(self): @@ -706,7 +713,7 @@ parameters = [] return_type = "void" - for child in element.getchildren(): + for child in element: if child.tag == "return": return_type = child.get( "type", "void" ) elif child.tag == "param": @@ -736,7 +743,7 @@ if param.is_image(): self.images.append( param ) - if element.getchildren(): + if list(element): self.initialized = 1 self.entry_point_parameters[name] = parameters else: @@ -866,7 +873,7 @@ def process_OpenGLAPI(self, file_name, element): - for child in element.getchildren(): + for child in element: if child.tag == "category": self.process_category( child ) elif child.tag == "OpenGLAPI": @@ -886,7 +893,7 @@ [cat_type, key] = classify_category(cat_name, cat_number) self.categories[cat_type][key] = [cat_name, cat_number] - for child in cat.getchildren(): + for child in cat: if child.tag == "function": func_name = real_function_name( child ) diff -Nru mesa-19.2.8/src/mapi/glapi/gen/glX_proto_send.py mesa-20.0.8/src/mapi/glapi/gen/glX_proto_send.py --- mesa-19.2.8/src/mapi/glapi/gen/glX_proto_send.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/glX_proto_send.py 2020-06-12 01:21:17.000000000 +0000 @@ -573,7 +573,7 @@ condition = 'compsize > 0' print('if (%s) {' % (condition)) - print(' gc->fillImage(gc, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr)) + print(' __glFillImage(gc, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr)) print('} else {') print(' (void) memcpy( %s, default_pixel_store_%uD, default_pixel_store_%uD_size );' % (pixHeaderPtr, dim, dim)) print('}') diff -Nru mesa-19.2.8/src/mapi/glapi/gen/glX_XML.py mesa-20.0.8/src/mapi/glapi/gen/glX_XML.py --- mesa-19.2.8/src/mapi/glapi/gen/glX_XML.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/glX_XML.py 2020-06-12 01:21:17.000000000 +0000 @@ -48,7 +48,7 @@ self.functions = {} - for child in element.getchildren(): + for child in element: if child.tag == "size": n = child.get( "name" ) c = child.get( "count" ) @@ -130,7 +130,7 @@ self.counter_list.append(param.counter) - for child in element.getchildren(): + for child in element: if child.tag == "glx": rop = child.get( 'rop' ) sop = child.get( 'sop' ) diff -Nru mesa-19.2.8/src/mapi/glapi/gen/SConscript mesa-20.0.8/src/mapi/glapi/gen/SConscript --- mesa-19.2.8/src/mapi/glapi/gen/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -28,7 +28,7 @@ ) env.CodeGenerate( - target = '../../../mapi/glapi/glapitemp.h', + target = '../../../mapi/glapi/gen/glapitemp.h', script = 'gl_apitemp.py', source = sources, command = python_cmd + ' $SCRIPT -f $SOURCE > $TARGET' diff -Nru mesa-19.2.8/src/mapi/glapi/gen/static_data.py mesa-20.0.8/src/mapi/glapi/gen/static_data.py --- mesa-19.2.8/src/mapi/glapi/gen/static_data.py 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/gen/static_data.py 2020-06-12 01:21:17.000000000 +0000 @@ -1567,6 +1567,79 @@ "GetCompressedMultiTexImageEXT": 1531, "GetMultiTexLevelParameterivEXT": 1532, "GetMultiTexLevelParameterfvEXT": 1533, + "FramebufferParameteriMESA": 1534, + "GetFramebufferParameterivMESA": 1535, + "NamedRenderbufferStorageEXT": 1536, + "GetNamedRenderbufferParameterivEXT": 1537, + "ClientAttribDefaultEXT": 1538, + "PushClientAttribDefaultEXT": 1539, + "NamedProgramStringEXT": 1540, + "GetNamedProgramStringEXT": 1541, + "NamedProgramLocalParameter4fEXT": 1542, + "NamedProgramLocalParameter4fvEXT": 1543, + "GetNamedProgramLocalParameterfvEXT": 1544, + "NamedProgramLocalParameter4dEXT": 1545, + "NamedProgramLocalParameter4dvEXT": 1546, + "GetNamedProgramLocalParameterdvEXT": 1547, + "GetNamedProgramivEXT": 1548, + "TextureBufferEXT": 1549, + "MultiTexBufferEXT": 1550, + "TextureParameterIivEXT": 1551, + "TextureParameterIuivEXT": 1552, + "GetTextureParameterIivEXT": 1553, + "GetTextureParameterIuivEXT": 1554, + "MultiTexParameterIivEXT": 1555, + "MultiTexParameterIuivEXT": 1556, + "GetMultiTexParameterIivEXT": 1557, + "GetMultiTexParameterIuivEXT": 1558, + "NamedProgramLocalParameters4fvEXT": 1559, + "GenerateTextureMipmapEXT": 1560, + "GenerateMultiTexMipmapEXT": 1561, + "NamedRenderbufferStorageMultisampleEXT": 1562, + "NamedCopyBufferSubDataEXT": 1563, + "VertexArrayVertexOffsetEXT": 1564, + "VertexArrayColorOffsetEXT": 1565, + "VertexArrayEdgeFlagOffsetEXT": 1566, + "VertexArrayIndexOffsetEXT": 1567, + "VertexArrayNormalOffsetEXT": 1568, + "VertexArrayTexCoordOffsetEXT": 1569, + "VertexArrayMultiTexCoordOffsetEXT": 1570, + "VertexArrayFogCoordOffsetEXT": 1571, + "VertexArraySecondaryColorOffsetEXT": 1572, + "VertexArrayVertexAttribOffsetEXT": 1573, + "VertexArrayVertexAttribIOffsetEXT": 1574, + "EnableVertexArrayEXT": 1575, + "DisableVertexArrayEXT": 1576, + "EnableVertexArrayAttribEXT": 1577, + "DisableVertexArrayAttribEXT": 1578, + "GetVertexArrayIntegervEXT": 1579, + "GetVertexArrayPointervEXT": 1580, + "GetVertexArrayIntegeri_vEXT": 1581, + "GetVertexArrayPointeri_vEXT": 1582, + "ClearNamedBufferDataEXT": 1583, + "ClearNamedBufferSubDataEXT": 1584, + "NamedFramebufferParameteriEXT": 1585, + "GetNamedFramebufferParameterivEXT": 1586, + "VertexArrayVertexAttribLOffsetEXT": 1587, + "VertexArrayVertexAttribDivisorEXT": 1588, + "TextureBufferRangeEXT": 1589, + "TextureStorage2DMultisampleEXT": 1590, + "TextureStorage3DMultisampleEXT": 1591, + "VertexArrayBindVertexBufferEXT": 1592, + "VertexArrayVertexAttribFormatEXT": 1593, + "VertexArrayVertexAttribIFormatEXT": 1594, + "VertexArrayVertexAttribLFormatEXT": 1595, + "VertexArrayVertexAttribBindingEXT": 1596, + "VertexArrayVertexBindingDivisorEXT": 1597, + "NamedBufferPageCommitmentEXT": 1598, + "NamedStringARB": 1599, + "DeleteNamedStringARB": 1600, + "CompileShaderIncludeARB": 1601, + "IsNamedStringARB": 1602, + "GetNamedStringARB": 1603, + "GetNamedStringivARB": 1604, + "EGLImageTargetTexStorageEXT" : 1605, + "EGLImageTargetTextureStorageEXT" : 1606, } functions = [ diff -Nru mesa-19.2.8/src/mapi/glapi/glapi_dispatch.c mesa-20.0.8/src/mapi/glapi/glapi_dispatch.c --- mesa-19.2.8/src/mapi/glapi/glapi_dispatch.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/glapi_dispatch.c 2020-06-12 01:21:17.000000000 +0000 @@ -171,6 +171,6 @@ # endif #endif -#include "glapi/glapitemp.h" +#include "glapitemp.h" #endif /* USE_X86_ASM */ diff -Nru mesa-19.2.8/src/mapi/glapi/glapi.h mesa-20.0.8/src/mapi/glapi/glapi.h --- mesa-19.2.8/src/mapi/glapi/glapi.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/glapi.h 2020-06-12 01:21:17.000000000 +0000 @@ -104,7 +104,7 @@ #endif /* defined (USE_ELF_TLS) */ -void +_GLAPI_EXPORT void _glapi_destroy_multithread(void); diff -Nru mesa-19.2.8/src/mapi/glapi/meson.build mesa-20.0.8/src/mapi/glapi/meson.build --- mesa-19.2.8/src/mapi/glapi/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/meson.build 2020-06-12 01:21:17.000000000 +0000 @@ -25,7 +25,7 @@ static_glapi_files = [] static_glapi_args = [] -if ['apple', 'windows'].contains(with_dri_platform) +if with_dri and ['apple', 'windows'].contains(with_dri_platform) static_glapi_files += [glapi_gentable_c, glapitable_h] endif @@ -46,8 +46,19 @@ '-DMAPI_ABI_HEADER="@0@"'.format(glapi_mapi_tmp_h.full_path()), gcc_lto_quirk, ] + if with_platform_windows + static_glapi_args += ['-D_GDI32_', '-DBUILD_GL32'] + endif else static_glapi_args += '-DMAPI_MODE_UTIL' + if with_platform_windows + static_glapi_args += ['-D_GDI32_', '-DBUILD_GL32', '-DKHRONOS_DLL_EXPORTS'] + if with_shared_glapi + static_glapi_args += '-D_GLAPI_DLL_EXPORTS' + else + static_glapi_args += '-D_GLAPI_NO_EXPORTS' + endif + endif static_glapi_files += files( 'glapi_dispatch.c', 'glapi_entrypoint.c', @@ -79,7 +90,9 @@ build_by_default : false, ) -if with_any_opengl and not with_shared_glapi and with_tests +# TODO: this test doesn't compile on windows with mingw or msvc due to +# undefined symbols from libglapi_static, but that should be fixable. +if with_any_opengl and not with_shared_glapi and with_tests and not with_platform_windows test( 'glapi_static_check_table', executable( diff -Nru mesa-19.2.8/src/mapi/glapi/registry/gl.xml mesa-20.0.8/src/mapi/glapi/registry/gl.xml --- mesa-19.2.8/src/mapi/glapi/registry/gl.xml 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/registry/gl.xml 2020-06-12 01:21:18.000000000 +0000 @@ -1,7 +1,7 @@ -Copyright (c) 2013-2018 The Khronos Group Inc. +Copyright (c) 2013-2019 The Khronos Group Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -79,9 +79,650 @@ typedef void ( *GLVULKANPROCNV)(void); - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -220,23 +861,22 @@ - + + + + + - - - - - - - - + + + @@ -391,6 +1031,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -408,7 +1080,14 @@ - + + + + + + + + @@ -1420,8 +2099,6 @@ - - @@ -1546,11 +2223,7 @@ - - - - - + @@ -1567,9 +2240,6 @@ - - - @@ -1847,6 +2517,7 @@ + @@ -1914,7 +2585,6 @@ - @@ -1926,6 +2596,20 @@ + + + + + + + + + + + + + + @@ -1961,6 +2645,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2191,6 +2952,22 @@ + + + + + + + + + + + + + + + + @@ -2494,6 +3271,8 @@ + + @@ -2517,6 +3296,18 @@ + + + + + + + + + + + + @@ -2644,11 +3435,15 @@ - + + + + + @@ -2699,6 +3494,10 @@ + + + + @@ -2860,6 +3659,20 @@ + + + + + + + + + + + + + + @@ -2954,17 +3767,21 @@ - + + + + + + - - + @@ -3003,7 +3820,6 @@ - @@ -3187,24 +4003,6 @@ - - - - - - - - - - - - - - - - - - @@ -3219,6 +4017,87 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -3517,6 +4396,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -3552,8 +4607,17 @@ - - + + + + + + + + + + + @@ -3583,7 +4647,7 @@ - + @@ -3600,15 +4664,6 @@ - - - - - - - - - @@ -3679,6 +4734,18 @@ + + + + + + + + + + + + @@ -10156,7 +11223,10 @@ - + + + + @@ -10177,7 +11247,8 @@ - + + @@ -10541,6 +11612,48 @@ GLuint offset + GLuint glAsyncCopyBufferSubDataNVX + GLsizei waitSemaphoreCount + const GLuint *waitSemaphoreArray + const GLuint64 *fenceValueArray + GLuint readGpu + GLbitfield writeGpuMask + GLuint readBuffer + GLuint writeBuffer + GLintptr readOffset + GLintptr writeOffset + GLsizeiptr size + GLsizei signalSemaphoreCount + const GLuint *signalSemaphoreArray + const GLuint64 *signalValueArray + + + GLuint glAsyncCopyImageSubDataNVX + GLsizei waitSemaphoreCount + const GLuint *waitSemaphoreArray + const GLuint64 *waitValueArray + GLuint srcGpu + GLbitfield dstGpuMask + GLuint srcName + GLenum srcTarget + GLint srcLevel + GLint srcX + GLint srcY + GLint srcZ + GLuint dstName + GLenum dstTarget + GLint dstLevel + GLint dstX + GLint dstY + GLint dstZ + GLsizei srcWidth + GLsizei srcHeight + GLsizei srcDepth + GLsizei signalSemaphoreCount + const GLuint *signalSemaphoreArray + const GLuint64 *signalValueArray + + void glAsyncMarkerSGIX GLuint marker @@ -10563,12 +11676,12 @@ void glBeginConditionalRender GLuint id - GLenum mode + GLenum mode void glBeginConditionalRenderNV GLuint id - GLenum mode + GLenum mode @@ -11433,14 +12546,14 @@ GLenum target GLsizeiptr size const void *data - GLbitfield flags + GLbitfield flags void glBufferStorageEXT GLenum target GLsizeiptr size const void *data - GLbitfield flags + GLbitfield flags @@ -11449,7 +12562,7 @@ GLintptr offset GLsizeiptr size GLeglClientBufferEXT clientBuffer - GLbitfield flags + GLbitfield flags void glBufferStorageMemEXT @@ -11557,7 +12670,7 @@ void glClearBufferSubData - GLenum target + GLenum target GLenum internalformat GLintptr offset GLsizeiptr size @@ -11805,6 +12918,12 @@ GLbitfield mask + void glClientWaitSemaphoreui64NVX + GLsizei fenceObjectCount + const GLuint *semaphoreArray + const GLuint64 *fenceValueArray + + GLenum glClientWaitSync GLsync sync GLbitfield flags @@ -13086,13 +14205,13 @@ void glCopyImageSubData GLuint srcName - GLenum srcTarget + GLenum srcTarget GLint srcLevel GLint srcX GLint srcY GLint srcZ GLuint dstName - GLenum dstTarget + GLenum dstTarget GLint dstLevel GLint dstX GLint dstY @@ -13553,6 +14672,9 @@ GLuint *pipelines + GLuint glCreateProgressFenceNVX + + void glCreateQueries GLenum target GLsizei n @@ -14328,19 +15450,19 @@ void glDrawBuffers GLsizei n - const GLenum *bufs + const GLenum *bufs void glDrawBuffersARB GLsizei n - const GLenum *bufs + const GLenum *bufs void glDrawBuffersATI GLsizei n - const GLenum *bufs + const GLenum *bufs @@ -14501,7 +15623,7 @@ void glDrawElementsInstancedBaseVertexBaseInstance GLenum mode GLsizei count - GLenum type + GLenum type const void *indices GLsizei instancecount GLint basevertex @@ -14511,7 +15633,7 @@ void glDrawElementsInstancedBaseVertexBaseInstanceEXT GLenum mode GLsizei count - GLenum type + GLenum type const void *indices GLsizei instancecount GLint basevertex @@ -16145,7 +17267,7 @@ GLsizei bufSize GLsizei *length GLint *size - GLenum *type + GLenum *type GLchar *name @@ -16155,7 +17277,7 @@ GLsizei maxLength GLsizei *length GLint *size - GLenum *type + GLenum *type GLcharARB *name @@ -16619,20 +17741,20 @@ void glGetDoubleIndexedvEXT - GLenum target + GLenum target GLuint index GLdouble *data void glGetDoublei_v - GLenum target + GLenum target GLuint index GLdouble *data void glGetDoublei_vEXT - GLenum pname + GLenum pname GLuint index GLdouble *params @@ -16697,34 +17819,34 @@ void glGetFloatIndexedvEXT - GLenum target + GLenum target GLuint index GLfloat *data void glGetFloati_v - GLenum target + GLenum target GLuint index GLfloat *data void glGetFloati_vEXT - GLenum pname + GLenum pname GLuint index GLfloat *params void glGetFloati_vNV - GLenum target + GLenum target GLuint index GLfloat *data void glGetFloati_vOES - GLenum target + GLenum target GLuint index GLfloat *data @@ -16946,7 +18068,7 @@ void glGetInteger64i_v - GLenum target + GLenum target GLuint index GLint64 *data @@ -16962,6 +18084,12 @@ + void glGetInteger64vEXT + GLenum pname + GLint64 *data + + + void glGetIntegerIndexedvEXT GLenum target GLuint index @@ -16971,13 +18099,13 @@ void glGetIntegeri_v - GLenum target + GLenum target GLuint index GLint *data void glGetIntegeri_vEXT - GLenum target + GLenum target GLuint index GLint *data @@ -17533,7 +18661,7 @@ void glGetObjectLabel - GLenum identifier + GLenum identifier GLuint name GLsizei bufSize GLsizei *length @@ -17809,27 +18937,27 @@ void glGetPixelTransformParameterfvEXT - GLenum target + GLenum target GLenum pname GLfloat *params void glGetPixelTransformParameterivEXT - GLenum target + GLenum target GLenum pname GLint *params void glGetPointerIndexedvEXT - GLenum target + GLenum target GLuint index void **data void glGetPointeri_vEXT - GLenum pname + GLenum pname GLuint index void **params @@ -18045,7 +19173,7 @@ GLenum programInterface GLuint index GLsizei propCount - const GLenum *props + const GLenum *props GLsizei bufSize GLsizei *length GLint *params @@ -18126,7 +19254,7 @@ void glGetQueryIndexediv - GLenum target + GLenum target GLuint index GLenum pname GLint *params @@ -18244,53 +19372,53 @@ void glGetSamplerParameterIiv GLuint sampler - GLenum pname + GLenum pname GLint *params void glGetSamplerParameterIivEXT GLuint sampler - GLenum pname + GLenum pname GLint *params void glGetSamplerParameterIivOES GLuint sampler - GLenum pname + GLenum pname GLint *params void glGetSamplerParameterIuiv GLuint sampler - GLenum pname + GLenum pname GLuint *params void glGetSamplerParameterIuivEXT GLuint sampler - GLenum pname + GLenum pname GLuint *params void glGetSamplerParameterIuivOES GLuint sampler - GLenum pname + GLenum pname GLuint *params void glGetSamplerParameterfv GLuint sampler - GLenum pname + GLenum pname GLfloat *params void glGetSamplerParameteriv GLuint sampler - GLenum pname + GLenum pname GLint *params @@ -18768,7 +19896,7 @@ GLsizei bufSize GLsizei *length GLsizei *size - GLenum *type + GLenum *type GLchar *name @@ -18779,7 +19907,7 @@ GLsizei bufSize GLsizei *length GLsizei *size - GLenum *type + GLenum *type GLchar *name @@ -19835,7 +20963,7 @@ void glInvalidateFramebuffer GLenum target GLsizei numAttachments - const GLenum *attachments + const GLenum *attachments void glInvalidateNamedFramebufferData @@ -19855,7 +20983,7 @@ void glInvalidateSubFramebuffer - GLenum target + GLenum target GLsizei numAttachments const GLenum *attachments GLint x @@ -20575,7 +21703,7 @@ GLenum target GLintptr offset GLsizeiptr length - GLbitfield access + GLbitfield access @@ -20583,7 +21711,7 @@ GLenum target GLintptr offset GLsizeiptr length - GLbitfield access + GLbitfield access @@ -20661,14 +21789,14 @@ GLuint buffer GLintptr offset GLsizeiptr length - GLbitfield access + GLbitfield access void *glMapNamedBufferRangeEXT GLuint buffer GLintptr offset GLsizeiptr length - GLbitfield access + GLbitfield access void *glMapObjectBufferATI @@ -21228,7 +22356,7 @@ void glMultiDrawElementsIndirectCount GLenum mode - GLenum type + GLenum type const void *indirect GLintptr drawcount GLsizei maxdrawcount @@ -21296,7 +22424,7 @@ void glMultiTexBufferEXT GLenum texunit GLenum target - GLenum internalformat + GLenum internalformat GLuint buffer @@ -22269,6 +23397,27 @@ GLuint *params + void glMulticastScissorArrayvNVX + GLuint gpu + GLuint first + GLsizei count + const GLint *v + + + void glMulticastViewportArrayvNVX + GLuint gpu + GLuint first + GLsizei count + const GLfloat *v + + + void glMulticastViewportPositionWScaleNVX + GLuint gpu + GLuint index + GLfloat xcoeff + GLfloat ycoeff + + void glMulticastWaitSyncNV GLuint signalGpu GLbitfield waitGpuMask @@ -22312,7 +23461,7 @@ GLuint buffer GLsizeiptr size const void *data - GLbitfield flags + GLbitfield flags void glNamedBufferStorageExternalEXT @@ -22320,14 +23469,14 @@ GLintptr offset GLsizeiptr size GLeglClientBufferEXT clientBuffer - GLbitfield flags + GLbitfield flags void glNamedBufferStorageEXT GLuint buffer GLsizeiptr size const void *data - GLbitfield flags + GLbitfield flags @@ -25875,65 +27024,65 @@ void glSamplerParameterIiv GLuint sampler - GLenum pname + GLenum pname const GLint *param void glSamplerParameterIivEXT GLuint sampler - GLenum pname + GLenum pname const GLint *param void glSamplerParameterIivOES GLuint sampler - GLenum pname + GLenum pname const GLint *param void glSamplerParameterIuiv GLuint sampler - GLenum pname + GLenum pname const GLuint *param void glSamplerParameterIuivEXT GLuint sampler - GLenum pname + GLenum pname const GLuint *param void glSamplerParameterIuivOES GLuint sampler - GLenum pname + GLenum pname const GLuint *param void glSamplerParameterf GLuint sampler - GLenum pname + GLenum pname GLfloat param void glSamplerParameterfv GLuint sampler - GLenum pname + GLenum pname const GLfloat *param void glSamplerParameteri GLuint sampler - GLenum pname + GLenum pname GLint param void glSamplerParameteriv GLuint sampler - GLenum pname + GLenum pname const GLint *param @@ -26476,6 +27625,13 @@ const GLenum *dstLayouts + void glSignalSemaphoreui64NVX + GLuint signalGpu + GLsizei fenceObjectCount + const GLuint *semaphoreArray + const GLuint64 *fenceValueArray + + void glSpecializeShader GLuint shader const GLchar *pEntryPoint @@ -28581,7 +29737,7 @@ GLuint program GLsizei count const GLchar *const*varyings - GLenum bufferMode + GLenum bufferMode @@ -29518,6 +30674,10 @@ GLenum preserve + void glUploadGpuMaskNVX + GLbitfield mask + + void glUseProgram GLuint program @@ -29933,7 +31093,7 @@ GLuint vaobj GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset @@ -29941,7 +31101,7 @@ GLuint vaobj GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset @@ -30063,7 +31223,7 @@ GLuint vaobj GLuint attribindex GLint size - GLenum type + GLenum type GLboolean normalized GLuint relativeoffset @@ -30072,7 +31232,7 @@ GLuint vaobj GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset @@ -30081,7 +31241,7 @@ GLuint buffer GLuint index GLint size - GLenum type + GLenum type GLsizei stride GLintptr offset @@ -30090,7 +31250,7 @@ GLuint vaobj GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset @@ -30099,7 +31259,7 @@ GLuint buffer GLuint index GLint size - GLenum type + GLenum type GLsizei stride GLintptr offset @@ -30939,7 +32099,7 @@ void glVertexAttribFormat GLuint attribindex GLint size - GLenum type + GLenum type GLboolean normalized GLuint relativeoffset @@ -30947,7 +32107,7 @@ void glVertexAttribFormatNV GLuint index GLint size - GLenum type + GLenum type GLboolean normalized GLsizei stride @@ -31215,14 +32375,14 @@ void glVertexAttribIFormat GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset void glVertexAttribIFormatNV GLuint index GLint size - GLenum type + GLenum type GLsizei stride @@ -31448,14 +32608,14 @@ void glVertexAttribLFormat GLuint attribindex GLint size - GLenum type + GLenum type GLuint relativeoffset void glVertexAttribLFormatNV GLuint index GLint size - GLenum type + GLenum type GLsizei stride @@ -32101,6 +33261,13 @@ const GLenum *srcLayouts + void glWaitSemaphoreui64NVX + GLuint waitGpu + GLsizei fenceObjectCount + const GLuint *semaphoreArray + const GLuint64 *fenceValueArray + + void glWaitSync GLsync sync GLbitfield flags @@ -32575,6 +33742,18 @@ void glSignalVkFenceNV GLuint64 vkFence + + void glFramebufferParameteriMESA + GLenum target + GLenum pname + GLint param + + + void glGetFramebufferParameterivMESA + GLenum target + GLenum pname + GLint *params + @@ -41626,7 +42805,7 @@ - + @@ -42996,6 +44175,7 @@ + @@ -43697,6 +44877,7 @@ + @@ -44273,6 +45454,9 @@ + + + @@ -45195,6 +46379,7 @@ + @@ -46143,6 +47328,22 @@ + + + + + + + + + + + + + + + + @@ -46228,6 +47429,8 @@ + + @@ -46798,7 +48001,7 @@ - + @@ -47083,6 +48286,25 @@ + + + + + + + + + + + + + + + + + + + @@ -47105,7 +48327,7 @@ - + @@ -47145,14 +48367,8 @@ - - - - - - @@ -47160,6 +48376,14 @@ + + + + + + + + @@ -47658,7 +48882,7 @@ - + @@ -47698,7 +48922,7 @@ - + @@ -47741,7 +48965,12 @@ - + + + + + + @@ -47750,7 +48979,7 @@ - + @@ -50179,5 +51408,6 @@ + diff -Nru mesa-19.2.8/src/mapi/glapi/SConscript mesa-20.0.8/src/mapi/glapi/SConscript --- mesa-19.2.8/src/mapi/glapi/SConscript 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/glapi/SConscript 2020-06-12 01:21:17.000000000 +0000 @@ -27,7 +27,7 @@ '#/src/mapi', '#/src/mesa', Dir('.'), # src/mapi/glapi build path - Dir('..'), # src/mapi build path + Dir('gen'), # src/mapi/glapi/gen build path ]) glapi_sources = [ diff -Nru mesa-19.2.8/src/mapi/meson.build mesa-20.0.8/src/mapi/meson.build --- mesa-19.2.8/src/mapi/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2017-2019 Intel Corporation +# Copyright © 2017 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -35,31 +35,11 @@ else libglapi = [] endif -if with_gles1 - if not with_glvnd +if not with_glvnd + if with_gles1 subdir('es1api') - elif not glvnd_has_headers_and_pc_files - pkg.generate( - name : 'glesv1_cm', - filebase : 'glesv1_cm', - description : 'Mesa OpenGL ES 1.1 CM library', - version : meson.project_version(), - libraries : '-L${libdir} -lGLESv1_CM', - libraries_private : gl_priv_libs, - ) endif -endif -if with_gles2 - if not with_glvnd + if with_gles2 subdir('es2api') - elif not glvnd_has_headers_and_pc_files - pkg.generate( - name : 'glesv2', - filebase : 'glesv2', - description : 'Mesa OpenGL ES 2.0 library', - version : meson.project_version(), - libraries : '-L${libdir} -lGLESv2', - libraries_private : gl_priv_libs, - ) endif endif diff -Nru mesa-19.2.8/src/mapi/shared-glapi/glapi-symbols.txt mesa-20.0.8/src/mapi/shared-glapi/glapi-symbols.txt --- mesa-19.2.8/src/mapi/shared-glapi/glapi-symbols.txt 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/shared-glapi/glapi-symbols.txt 2020-06-12 01:21:18.000000000 +0000 @@ -2,6 +2,7 @@ _glapi_Dispatch _glapi_add_dispatch _glapi_check_multithread +_glapi_destroy_multithread _glapi_get_context _glapi_get_dispatch _glapi_get_dispatch_table_size diff -Nru mesa-19.2.8/src/mapi/shared-glapi/meson.build mesa-20.0.8/src/mapi/shared-glapi/meson.build --- mesa-19.2.8/src/mapi/shared-glapi/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/shared-glapi/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -36,18 +36,28 @@ capture : true, ) +_glapi_c_args = [] +if with_platform_windows + _glapi_c_args += ['-D_GLAPI_DLL_EXPORTS'] +endif + libglapi = shared_library( 'glapi', [files_mapi_glapi, files_mapi_util, shared_glapi_mapi_tmp_h], c_args : [ - c_msvc_compat_args, c_vis_args, '-DMAPI_MODE_GLAPI', + _glapi_c_args, + c_msvc_compat_args, + c_vis_args, + '-DMAPI_MODE_GLAPI', '-DMAPI_ABI_HEADER="@0@"'.format(shared_glapi_mapi_tmp_h.full_path()), gcc_lto_quirk, ], link_args : [ld_args_gc_sections], include_directories : [inc_src, inc_include, inc_mapi], dependencies : [dep_thread, dep_selinux], + soversion : host_machine.system() == 'windows' ? '' : '0', version : '0.0.0', + name_prefix : 'lib', install : true, ) @@ -64,14 +74,14 @@ ), suite : ['mapi'], ) - if prog_nm.found() + if with_symbols_check test( 'shared-glapi symbols check', symbols_check, args : [ '--lib', libglapi, '--symbols-file', files('glapi-symbols.txt'), - '--nm', prog_nm.path(), + symbols_check_args, ], suite : ['mapi'], ) diff -Nru mesa-19.2.8/src/mapi/u_execmem.c mesa-20.0.8/src/mapi/u_execmem.c --- mesa-19.2.8/src/mapi/u_execmem.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mapi/u_execmem.c 2020-06-12 01:21:18.000000000 +0000 @@ -46,7 +46,7 @@ static unsigned char *exec_mem = (unsigned char *)0; -#if defined(__linux__) || defined(__OpenBSD__) || defined(_NetBSD__) || defined(__sun) || defined(__HAIKU__) +#if defined(__linux__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__sun) || defined(__HAIKU__) #include #include diff -Nru mesa-19.2.8/src/mesa/Android.libmesa_dricore.mk mesa-20.0.8/src/mesa/Android.libmesa_dricore.mk --- mesa-19.2.8/src/mesa/Android.libmesa_dricore.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/Android.libmesa_dricore.mk 2020-06-12 01:21:18.000000000 +0000 @@ -39,11 +39,9 @@ LOCAL_SRC_FILES := \ $(MESA_FILES) -ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 -endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := \ diff -Nru mesa-19.2.8/src/mesa/Android.libmesa_st_mesa.mk mesa-20.0.8/src/mesa/Android.libmesa_st_mesa.mk --- mesa-19.2.8/src/mesa/Android.libmesa_st_mesa.mk 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/Android.libmesa_st_mesa.mk 2020-06-12 01:21:18.000000000 +0000 @@ -42,11 +42,9 @@ $(MESA_GEN_GLSL_H) \ $(MESA_GEN_NIR_H) -ifeq ($(strip $(MESA_ENABLE_ASM)),true) ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES += $(X86_FILES) endif # x86 -endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_WHOLE_STATIC_LIBRARIES := \ diff -Nru mesa-19.2.8/src/mesa/drivers/common/meta.c mesa-20.0.8/src/mesa/drivers/common/meta.c --- mesa-19.2.8/src/mesa/drivers/common/meta.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/common/meta.c 2020-06-12 01:21:18.000000000 +0000 @@ -110,6 +110,10 @@ struct decompress_state *decompress); static void meta_drawpix_cleanup(struct gl_context *ctx, struct drawpix_state *drawpix); +static void meta_drawtex_cleanup(struct gl_context *ctx, + struct drawtex_state *drawtex); +static void meta_bitmap_cleanup(struct gl_context *ctx, + struct bitmap_state *bitmap); void _mesa_meta_framebuffer_texture_image(struct gl_context *ctx, @@ -429,6 +433,9 @@ cleanup_temp_texture(ctx, &ctx->Meta->TempTex); meta_decompress_cleanup(ctx, &ctx->Meta->Decompress); meta_drawpix_cleanup(ctx, &ctx->Meta->DrawPix); + meta_drawtex_cleanup(ctx, &ctx->Meta->DrawTex); + meta_bitmap_cleanup(ctx, &ctx->Meta->Bitmap); + if (old_context) _mesa_make_current(old_context, old_context->WinSysDrawBuffer, old_context->WinSysReadBuffer); else @@ -1970,6 +1977,30 @@ } } +static void +meta_drawtex_cleanup(struct gl_context *ctx, struct drawtex_state *drawtex) +{ + if (drawtex->VAO != 0) { + _mesa_DeleteVertexArrays(1, &drawtex->VAO); + drawtex->VAO = 0; + + _mesa_reference_buffer_object(ctx, &drawtex->buf_obj, NULL); + } +} + +static void +meta_bitmap_cleanup(struct gl_context *ctx, struct bitmap_state *bitmap) +{ + if (bitmap->VAO != 0) { + _mesa_DeleteVertexArrays(1, &bitmap->VAO); + bitmap->VAO = 0; + + _mesa_reference_buffer_object(ctx, &bitmap->buf_obj, NULL); + + cleanup_temp_texture(ctx, &bitmap->Tex); + } +} + /** * When the glDrawPixels() image size is greater than the max rectangle * texture size we use this function to break the glDrawPixels() image @@ -2993,6 +3024,7 @@ } _mesa_reference_sampler_object(ctx, &decompress->samp_obj, NULL); + _mesa_meta_blit_shader_table_cleanup(ctx, &decompress->shaders); memset(decompress, 0, sizeof(*decompress)); } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/common/dri_util.c mesa-20.0.8/src/mesa/drivers/dri/common/dri_util.c --- mesa-19.2.8/src/mesa/drivers/dri/common/dri_util.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/common/dri_util.c 2020-06-12 01:21:18.000000000 +0000 @@ -42,6 +42,7 @@ #include #include "dri_util.h" #include "utils.h" +#include "util/u_endian.h" #include "util/xmlpool.h" #include "main/mtypes.h" #include "main/framebuffer.h" @@ -874,87 +875,127 @@ static const struct { uint32_t image_format; mesa_format mesa_format; + GLenum internal_format; } format_mapping[] = { { - .image_format = __DRI_IMAGE_FORMAT_RGB565, - .mesa_format = MESA_FORMAT_B5G6R5_UNORM, + .image_format = __DRI_IMAGE_FORMAT_RGB565, + .mesa_format = MESA_FORMAT_B5G6R5_UNORM, + .internal_format = GL_RGB565, }, { - .image_format = __DRI_IMAGE_FORMAT_ARGB1555, - .mesa_format = MESA_FORMAT_B5G5R5A1_UNORM, + .image_format = __DRI_IMAGE_FORMAT_ARGB1555, + .mesa_format = MESA_FORMAT_B5G5R5A1_UNORM, + .internal_format = GL_RGB5_A1, }, { - .image_format = __DRI_IMAGE_FORMAT_XRGB8888, - .mesa_format = MESA_FORMAT_B8G8R8X8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_XRGB8888, + .mesa_format = MESA_FORMAT_B8G8R8X8_UNORM, + .internal_format = GL_RGBA8, }, { - .image_format = __DRI_IMAGE_FORMAT_ARGB2101010, - .mesa_format = MESA_FORMAT_B10G10R10A2_UNORM, + .image_format = __DRI_IMAGE_FORMAT_ABGR16161616F, + .mesa_format = MESA_FORMAT_RGBA_FLOAT16, + .internal_format = GL_RGBA16F, }, { - .image_format = __DRI_IMAGE_FORMAT_XRGB2101010, - .mesa_format = MESA_FORMAT_B10G10R10X2_UNORM, + .image_format = __DRI_IMAGE_FORMAT_XBGR16161616F, + .mesa_format = MESA_FORMAT_RGBX_FLOAT16, + .internal_format = GL_RGBA16F, }, { - .image_format = __DRI_IMAGE_FORMAT_ABGR2101010, - .mesa_format = MESA_FORMAT_R10G10B10A2_UNORM, + .image_format = __DRI_IMAGE_FORMAT_ARGB2101010, + .mesa_format = MESA_FORMAT_B10G10R10A2_UNORM, + .internal_format = GL_RGB10_A2, }, { - .image_format = __DRI_IMAGE_FORMAT_XBGR2101010, - .mesa_format = MESA_FORMAT_R10G10B10X2_UNORM, + .image_format = __DRI_IMAGE_FORMAT_XRGB2101010, + .mesa_format = MESA_FORMAT_B10G10R10X2_UNORM, + .internal_format = GL_RGB10_A2, }, { - .image_format = __DRI_IMAGE_FORMAT_ARGB8888, - .mesa_format = MESA_FORMAT_B8G8R8A8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_ABGR2101010, + .mesa_format = MESA_FORMAT_R10G10B10A2_UNORM, + .internal_format = GL_RGB10_A2, }, { - .image_format = __DRI_IMAGE_FORMAT_ABGR8888, - .mesa_format = MESA_FORMAT_R8G8B8A8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_XBGR2101010, + .mesa_format = MESA_FORMAT_R10G10B10X2_UNORM, + .internal_format = GL_RGB10_A2, }, { - .image_format = __DRI_IMAGE_FORMAT_XBGR8888, - .mesa_format = MESA_FORMAT_R8G8B8X8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_ARGB8888, + .mesa_format = MESA_FORMAT_B8G8R8A8_UNORM, + .internal_format = GL_RGBA8, }, { - .image_format = __DRI_IMAGE_FORMAT_R8, - .mesa_format = MESA_FORMAT_R_UNORM8, + .image_format = __DRI_IMAGE_FORMAT_ABGR8888, + .mesa_format = MESA_FORMAT_R8G8B8A8_UNORM, + .internal_format = GL_RGBA8, }, { - .image_format = __DRI_IMAGE_FORMAT_R8, - .mesa_format = MESA_FORMAT_L_UNORM8, + .image_format = __DRI_IMAGE_FORMAT_XBGR8888, + .mesa_format = MESA_FORMAT_R8G8B8X8_UNORM, + .internal_format = GL_RGB8, }, { - .image_format = __DRI_IMAGE_FORMAT_GR88, - .mesa_format = MESA_FORMAT_R8G8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_R8, + .mesa_format = MESA_FORMAT_R_UNORM8, + .internal_format = GL_R8, }, { - .image_format = __DRI_IMAGE_FORMAT_GR88, - .mesa_format = MESA_FORMAT_L8A8_UNORM, + .image_format = __DRI_IMAGE_FORMAT_R8, + .mesa_format = MESA_FORMAT_L_UNORM8, + .internal_format = GL_R8, }, +#if UTIL_ARCH_LITTLE_ENDIAN { - .image_format = __DRI_IMAGE_FORMAT_SABGR8, - .mesa_format = MESA_FORMAT_R8G8B8A8_SRGB, + .image_format = __DRI_IMAGE_FORMAT_GR88, + .mesa_format = MESA_FORMAT_RG_UNORM8, + .internal_format = GL_RG8, }, { - .image_format = __DRI_IMAGE_FORMAT_SARGB8, - .mesa_format = MESA_FORMAT_B8G8R8A8_SRGB, + .image_format = __DRI_IMAGE_FORMAT_GR88, + .mesa_format = MESA_FORMAT_LA_UNORM8, + .internal_format = GL_RG8, }, +#endif { - .image_format = __DRI_IMAGE_FORMAT_R16, - .mesa_format = MESA_FORMAT_R_UNORM16, + .image_format = __DRI_IMAGE_FORMAT_SABGR8, + .mesa_format = MESA_FORMAT_R8G8B8A8_SRGB, + .internal_format = GL_SRGB8_ALPHA8, }, { - .image_format = __DRI_IMAGE_FORMAT_R16, - .mesa_format = MESA_FORMAT_L_UNORM16, + .image_format = __DRI_IMAGE_FORMAT_SARGB8, + .mesa_format = MESA_FORMAT_B8G8R8A8_SRGB, + .internal_format = GL_SRGB8_ALPHA8, }, { - .image_format = __DRI_IMAGE_FORMAT_GR1616, - .mesa_format = MESA_FORMAT_R16G16_UNORM, + .image_format = __DRI_IMAGE_FORMAT_SXRGB8, + .mesa_format = MESA_FORMAT_B8G8R8X8_SRGB, + .internal_format = GL_SRGB8_ALPHA8, }, { - .image_format = __DRI_IMAGE_FORMAT_GR1616, - .mesa_format = MESA_FORMAT_L16A16_UNORM, + .image_format = __DRI_IMAGE_FORMAT_R16, + .mesa_format = MESA_FORMAT_R_UNORM16, + .internal_format = GL_R16, }, + { + .image_format = __DRI_IMAGE_FORMAT_R16, + .mesa_format = MESA_FORMAT_L_UNORM16, + .internal_format = GL_R16, + }, +#if UTIL_ARCH_LITTLE_ENDIAN + { + .image_format = __DRI_IMAGE_FORMAT_GR1616, + .mesa_format = MESA_FORMAT_RG_UNORM16, + .internal_format = GL_RG16, + }, + { + .image_format = __DRI_IMAGE_FORMAT_GR1616, + .mesa_format = MESA_FORMAT_LA_UNORM16, + .internal_format = GL_RG16, + }, +#endif }; uint32_t @@ -967,6 +1008,16 @@ return __DRI_IMAGE_FORMAT_NONE; } +uint32_t +driGLFormatToSizedInternalGLFormat(mesa_format format) +{ + for (size_t i = 0; i < ARRAY_SIZE(format_mapping); i++) + if (format_mapping[i].mesa_format == format) + return format_mapping[i].internal_format; + + return GL_NONE; +} + mesa_format driImageFormatToGLFormat(uint32_t image_format) { diff -Nru mesa-19.2.8/src/mesa/drivers/dri/common/dri_util.h mesa-20.0.8/src/mesa/drivers/dri/common/dri_util.h --- mesa-19.2.8/src/mesa/drivers/dri/common/dri_util.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/common/dri_util.h 2020-06-12 01:21:18.000000000 +0000 @@ -320,6 +320,9 @@ extern uint32_t driGLFormatToImageFormat(mesa_format format); +extern uint32_t +driGLFormatToSizedInternalGLFormat(mesa_format format); + extern mesa_format driImageFormatToGLFormat(uint32_t image_format); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/common/meson.build mesa-20.0.8/src/mesa/drivers/dri/common/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/common/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/common/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -22,10 +22,10 @@ libdricommon = static_library( 'dricommon', - ['utils.c', 'utils.h', 'dri_util.c', 'dri_util.h', xmlpool_options_h], + files('utils.c', 'utils.h', 'dri_util.c', 'dri_util.h'), include_directories : [inc_common, inc_util], c_args : c_vis_args, - dependencies : dep_libdrm, + dependencies : [dep_libdrm, idep_xmlconfig_headers], build_by_default : false, ) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/common/utils.c mesa-20.0.8/src/mesa/drivers/dri/common/utils.c --- mesa-19.2.8/src/mesa/drivers/dri/common/utils.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/common/utils.c 2020-06-12 01:21:18.000000000 +0000 @@ -181,28 +181,47 @@ GLboolean enable_accum, GLboolean color_depth_match, GLboolean mutable_render_buffer) { - static const uint32_t masks_table[][4] = { + static const struct { + uint32_t masks[4]; + int shifts[4]; + } format_table[] = { /* MESA_FORMAT_B5G6R5_UNORM */ - { 0x0000F800, 0x000007E0, 0x0000001F, 0x00000000 }, + {{ 0x0000F800, 0x000007E0, 0x0000001F, 0x00000000 }, + { 11, 5, 0, -1 }}, /* MESA_FORMAT_B8G8R8X8_UNORM */ - { 0x00FF0000, 0x0000FF00, 0x000000FF, 0x00000000 }, + {{ 0x00FF0000, 0x0000FF00, 0x000000FF, 0x00000000 }, + { 16, 8, 0, -1 }}, /* MESA_FORMAT_B8G8R8A8_UNORM */ - { 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 }, + {{ 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 }, + { 16, 8, 0, 24 }}, /* MESA_FORMAT_B10G10R10X2_UNORM */ - { 0x3FF00000, 0x000FFC00, 0x000003FF, 0x00000000 }, + {{ 0x3FF00000, 0x000FFC00, 0x000003FF, 0x00000000 }, + { 20, 10, 0, -1 }}, /* MESA_FORMAT_B10G10R10A2_UNORM */ - { 0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000 }, + {{ 0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000 }, + { 20, 10, 0, 30 }}, /* MESA_FORMAT_R8G8B8A8_UNORM */ - { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 }, + {{ 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 }, + { 0, 8, 16, 24 }}, /* MESA_FORMAT_R8G8B8X8_UNORM */ - { 0x000000FF, 0x0000FF00, 0x00FF0000, 0x00000000 }, + {{ 0x000000FF, 0x0000FF00, 0x00FF0000, 0x00000000 }, + { 0, 8, 16, -1 }}, /* MESA_FORMAT_R10G10B10X2_UNORM */ - { 0x000003FF, 0x000FFC00, 0x3FF00000, 0x00000000 }, + {{ 0x000003FF, 0x000FFC00, 0x3FF00000, 0x00000000 }, + { 0, 10, 20, -1 }}, /* MESA_FORMAT_R10G10B10A2_UNORM */ - { 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000 }, + {{ 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000 }, + { 0, 10, 20, 30 }}, + /* MESA_FORMAT_RGBX_FLOAT16 */ + {{ 0, 0, 0, 0}, + { 0, 16, 32, -1 }}, + /* MESA_FORMAT_RGBA_FLOAT16 */ + {{ 0, 0, 0, 0}, + { 0, 16, 32, 48 }}, }; const uint32_t * masks; + const int * shifts; __DRIconfig **configs, **c; struct gl_config *modes; unsigned i, j, k, h; @@ -213,37 +232,55 @@ int blue_bits; int alpha_bits; bool is_srgb; + bool is_float; switch (format) { case MESA_FORMAT_B5G6R5_UNORM: - masks = masks_table[0]; + masks = format_table[0].masks; + shifts = format_table[0].shifts; break; case MESA_FORMAT_B8G8R8X8_UNORM: case MESA_FORMAT_B8G8R8X8_SRGB: - masks = masks_table[1]; + masks = format_table[1].masks; + shifts = format_table[1].shifts; break; case MESA_FORMAT_B8G8R8A8_UNORM: case MESA_FORMAT_B8G8R8A8_SRGB: - masks = masks_table[2]; + masks = format_table[2].masks; + shifts = format_table[2].shifts; break; case MESA_FORMAT_R8G8B8A8_UNORM: case MESA_FORMAT_R8G8B8A8_SRGB: - masks = masks_table[5]; + masks = format_table[5].masks; + shifts = format_table[5].shifts; break; case MESA_FORMAT_R8G8B8X8_UNORM: - masks = masks_table[6]; + masks = format_table[6].masks; + shifts = format_table[6].shifts; break; case MESA_FORMAT_B10G10R10X2_UNORM: - masks = masks_table[3]; + masks = format_table[3].masks; + shifts = format_table[3].shifts; break; case MESA_FORMAT_B10G10R10A2_UNORM: - masks = masks_table[4]; + masks = format_table[4].masks; + shifts = format_table[4].shifts; + break; + case MESA_FORMAT_RGBX_FLOAT16: + masks = format_table[9].masks; + shifts = format_table[9].shifts; + break; + case MESA_FORMAT_RGBA_FLOAT16: + masks = format_table[10].masks; + shifts = format_table[10].shifts; break; case MESA_FORMAT_R10G10B10X2_UNORM: - masks = masks_table[7]; + masks = format_table[7].masks; + shifts = format_table[7].shifts; break; case MESA_FORMAT_R10G10B10A2_UNORM: - masks = masks_table[8]; + masks = format_table[8].masks; + shifts = format_table[8].shifts; break; default: fprintf(stderr, "[%s:%u] Unknown framebuffer type %s (%d).\n", @@ -257,6 +294,7 @@ blue_bits = _mesa_get_format_bits(format, GL_BLUE_BITS); alpha_bits = _mesa_get_format_bits(format, GL_ALPHA_BITS); is_srgb = _mesa_is_format_srgb(format); + is_float = _mesa_get_format_datatype(format) == GL_FLOAT; num_modes = num_depth_stencil_bits * num_db_modes * num_accum_bits * num_msaa_modes; configs = calloc(num_modes + 1, sizeof *configs); @@ -286,6 +324,7 @@ c++; memset(modes, 0, sizeof *modes); + modes->floatMode = is_float; modes->redBits = red_bits; modes->greenBits = green_bits; modes->blueBits = blue_bits; @@ -294,6 +333,10 @@ modes->greenMask = masks[1]; modes->blueMask = masks[2]; modes->alphaMask = masks[3]; + modes->redShift = shifts[0]; + modes->greenShift = shifts[1]; + modes->blueShift = shifts[2]; + modes->alphaShift = shifts[3]; modes->rgbBits = modes->redBits + modes->greenBits + modes->blueBits + modes->alphaBits; @@ -312,7 +355,6 @@ modes->transparentBlue = GLX_DONT_CARE; modes->transparentAlpha = GLX_DONT_CARE; modes->transparentIndex = GLX_DONT_CARE; - modes->rgbMode = GL_TRUE; if (db_modes[i] == __DRI_ATTRIB_SWAP_NONE) { modes->doubleBufferMode = GL_FALSE; @@ -326,14 +368,6 @@ modes->samples = msaa_samples[h]; modes->sampleBuffers = modes->samples ? 1 : 0; - - modes->haveAccumBuffer = ((modes->accumRedBits + - modes->accumGreenBits + - modes->accumBlueBits + - modes->accumAlphaBits) > 0); - modes->haveDepthBuffer = (modes->depthBits > 0); - modes->haveStencilBuffer = (modes->stencilBits > 0); - modes->bindToTextureRgb = GL_TRUE; modes->bindToTextureRgba = GL_TRUE; modes->bindToMipmapTexture = GL_FALSE; @@ -414,9 +448,13 @@ __ATTRIB(__DRI_ATTRIB_TRANSPARENT_BLUE_VALUE, transparentBlue), __ATTRIB(__DRI_ATTRIB_TRANSPARENT_ALPHA_VALUE, transparentAlpha), __ATTRIB(__DRI_ATTRIB_RED_MASK, redMask), + __ATTRIB(__DRI_ATTRIB_RED_SHIFT, redShift), __ATTRIB(__DRI_ATTRIB_GREEN_MASK, greenMask), + __ATTRIB(__DRI_ATTRIB_GREEN_SHIFT, greenShift), __ATTRIB(__DRI_ATTRIB_BLUE_MASK, blueMask), + __ATTRIB(__DRI_ATTRIB_BLUE_SHIFT, blueShift), __ATTRIB(__DRI_ATTRIB_ALPHA_MASK, alphaMask), + __ATTRIB(__DRI_ATTRIB_ALPHA_SHIFT, alphaShift), __ATTRIB(__DRI_ATTRIB_MAX_PBUFFER_WIDTH, maxPbufferWidth), __ATTRIB(__DRI_ATTRIB_MAX_PBUFFER_HEIGHT, maxPbufferHeight), __ATTRIB(__DRI_ATTRIB_MAX_PBUFFER_PIXELS, maxPbufferPixels), @@ -451,6 +489,8 @@ case __DRI_ATTRIB_RENDER_TYPE: /* no support for color index mode */ *value = __DRI_ATTRIB_RGBA_BIT; + if (config->modes.floatMode) + *value |= __DRI_ATTRIB_FLOAT_BIT; break; case __DRI_ATTRIB_CONFIG_CAVEAT: if (config->modes.visualRating == GLX_NON_CONFORMANT_CONFIG) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/i830_texstate.c mesa-20.0.8/src/mesa/drivers/dri/i915/i830_texstate.c --- mesa-19.2.8/src/mesa/drivers/dri/i915/i830_texstate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/i830_texstate.c 2020-06-12 01:21:18.000000000 +0000 @@ -47,7 +47,7 @@ return MAPSURF_8BIT | MT_8BIT_L8; case MESA_FORMAT_I_UNORM8: return MAPSURF_8BIT | MT_8BIT_I8; - case MESA_FORMAT_L8A8_UNORM: + case MESA_FORMAT_LA_UNORM8: return MAPSURF_16BIT | MT_16BIT_AY88; case MESA_FORMAT_B5G6R5_UNORM: return MAPSURF_16BIT | MT_16BIT_RGB565; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/i915_context.c mesa-20.0.8/src/mesa/drivers/dri/i915/i915_context.c --- mesa-19.2.8/src/mesa/drivers/dri/i915/i915_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/i915_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -118,7 +118,7 @@ if (intel->gen == 3) ctx->TextureFormatSupported[MESA_FORMAT_A_UNORM8] = true; ctx->TextureFormatSupported[MESA_FORMAT_I_UNORM8] = true; - ctx->TextureFormatSupported[MESA_FORMAT_L8A8_UNORM] = true; + ctx->TextureFormatSupported[MESA_FORMAT_LA_UNORM8] = true; /* Depth and stencil */ if (intel->gen == 3) { diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/i915_texstate.c mesa-20.0.8/src/mesa/drivers/dri/i915/i915_texstate.c --- mesa-19.2.8/src/mesa/drivers/dri/i915/i915_texstate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/i915_texstate.c 2020-06-12 01:21:18.000000000 +0000 @@ -48,7 +48,7 @@ return MAPSURF_8BIT | MT_8BIT_I8; case MESA_FORMAT_A_UNORM8: return MAPSURF_8BIT | MT_8BIT_A8; - case MESA_FORMAT_L8A8_UNORM: + case MESA_FORMAT_LA_UNORM8: return MAPSURF_16BIT | MT_16BIT_AY88; case MESA_FORMAT_B5G6R5_UNORM: return MAPSURF_16BIT | MT_16BIT_RGB565; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/intel_context.c mesa-20.0.8/src/mesa/drivers/dri/i915/intel_context.c --- mesa-19.2.8/src/mesa/drivers/dri/i915/intel_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/intel_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -599,7 +599,7 @@ driDestroyOptionCache(&intel->optionCache); /* free the Mesa context */ - _mesa_free_context_data(&intel->ctx, true); + _mesa_free_context_data(&intel->ctx); _math_matrix_dtr(&intel->ViewportMatrix); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/intel_screen.c mesa-20.0.8/src/mesa/drivers/dri/i915/intel_screen.c --- mesa-19.2.8/src/mesa/drivers/dri/i915/intel_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/intel_screen.c 2020-06-12 01:21:18.000000000 +0000 @@ -28,6 +28,7 @@ #include #include #include +#include "drm-uapi/drm_fourcc.h" #include "main/glheader.h" #include "main/context.h" #include "main/framebuffer.h" @@ -178,45 +179,45 @@ }; static struct intel_image_format intel_image_formats[] = { - { __DRI_IMAGE_FOURCC_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, { __DRI_IMAGE_FOURCC_SARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_SARGB8, 4 } } }, - { __DRI_IMAGE_FOURCC_XRGB8888, __DRI_IMAGE_COMPONENTS_RGB, 1, + { DRM_FORMAT_XRGB8888, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB8888, 4 }, } }, - { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_NV12, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_NV12, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, - { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, @@ -228,10 +229,10 @@ * V into A. This lets the texture sampler interpolate the Y * components correctly when sampling from plane 0, and interpolate * U and V correctly when sampling from plane 1. */ - { __DRI_IMAGE_FOURCC_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2, + { DRM_FORMAT_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, - { __DRI_IMAGE_FOURCC_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2, + { DRM_FORMAT_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } } }; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i915/meson.build mesa-20.0.8/src/mesa/drivers/dri/i915/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/i915/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i915/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -86,9 +86,9 @@ libi915 = static_library( 'i915', - [files_i915, xmlpool_options_h], + files_i915, include_directories : [inc_common, inc_dri_common, inc_util], c_args : [c_vis_args, no_override_init_args], cpp_args : [cpp_vis_args], - dependencies : [dep_libdrm, dep_libdrm_intel], + dependencies : [dep_libdrm, dep_libdrm_intel, idep_xmlconfig_headers], ) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_blorp.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_blorp.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_blorp.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_blorp.c 2020-06-12 01:21:18.000000000 +0000 @@ -299,6 +299,12 @@ dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1, mirror_x, mirror_y); + if (src_format == MESA_FORMAT_NONE) + src_format = src_mt->format; + + if (dst_format == MESA_FORMAT_NONE) + dst_format = dst_mt->format; + if (!decode_srgb) src_format = _mesa_get_srgb_format_linear(src_format); @@ -389,7 +395,7 @@ /* We do format workarounds for some depth formats so we can't reliably * sample with HiZ. One of these days, we should fix that. */ - if (src_aux_usage == ISL_AUX_USAGE_HIZ) + if (src_aux_usage == ISL_AUX_USAGE_HIZ && src_mt->format != src_format) src_aux_usage = ISL_AUX_USAGE_NONE; const bool src_clear_supported = src_aux_usage != ISL_AUX_USAGE_NONE && src_mt->format == src_format; @@ -457,6 +463,15 @@ bool src_clear_supported, dst_clear_supported; switch (src_mt->aux_usage) { + case ISL_AUX_USAGE_HIZ: + if (intel_miptree_sample_with_hiz(brw, src_mt)) { + src_aux_usage = src_mt->aux_usage; + src_clear_supported = true; + } else { + src_aux_usage = ISL_AUX_USAGE_NONE; + src_clear_supported = false; + } + break; case ISL_AUX_USAGE_MCS: case ISL_AUX_USAGE_CCS_E: src_aux_usage = src_mt->aux_usage; @@ -1211,6 +1226,9 @@ bool can_fast_clear = !partial_clear; + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + can_fast_clear = false; + bool color_write_disable[4] = { false, false, false, false }; if (set_write_disables(irb, GET_COLORMASK(ctx->Color.ColorMask, buf), color_write_disable)) @@ -1289,6 +1307,7 @@ struct blorp_batch batch; blorp_batch_init(&brw->blorp, &batch, brw, 0); blorp_fast_clear(&batch, &surf, isl_format_srgb_to_linear(isl_format), + ISL_SWIZZLE_IDENTITY, level, irb->mt_layer, num_layers, x0, y0, x1, y1); blorp_batch_finish(&batch); @@ -1405,7 +1424,7 @@ if (x0 == x1 || y0 == y1) return; - uint32_t level, start_layer, num_layers; + uint32_t level = 0, start_layer = 0, num_layers; struct blorp_surf depth_surf, stencil_surf; struct intel_mipmap_tree *depth_mt = NULL; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_bufmgr.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_bufmgr.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_bufmgr.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_bufmgr.c 2020-06-12 01:21:18.000000000 +0000 @@ -58,6 +58,7 @@ #include "util/macros.h" #include "util/hash_table.h" #include "util/list.h" +#include "util/os_file.h" #include "util/u_dynarray.h" #include "util/vma.h" #include "brw_bufmgr.h" @@ -74,6 +75,20 @@ #define VG(x) #endif +/* Bufmgr is not aware of brw_context. */ +#undef WARN_ONCE +#define WARN_ONCE(cond, fmt...) do { \ + if (unlikely(cond)) { \ + static bool _warned = false; \ + if (!_warned) { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, fmt); \ + _warned = true; \ + } \ + } \ +} while (0) + + /* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is * leaked. All because it does not call VG(cli_free) from its @@ -135,7 +150,21 @@ struct util_dynarray vma_list[BRW_MEMZONE_COUNT]; }; +struct bo_export { + /** File descriptor associated with a handle export. */ + int drm_fd; + + /** GEM handle in drm_fd */ + uint32_t gem_handle; + + struct list_head link; +}; + struct brw_bufmgr { + uint32_t refcount; + + struct list_head link; + int fd; mtx_t lock; @@ -157,6 +186,12 @@ uint64_t initial_kflags; }; +static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP; +static struct list_head global_bufmgr_list = { + .next = &global_bufmgr_list, + .prev = &global_bufmgr_list, +}; + static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode, uint32_t stride); @@ -166,18 +201,6 @@ enum brw_memory_zone memzone, uint64_t size, uint64_t alignment); -static uint32_t -key_hash_uint(const void *key) -{ - return _mesa_hash_data(key, 4); -} - -static bool -key_uint_equal(const void *a, const void *b) -{ - return *((unsigned *) a) == *((unsigned *) b); -} - static struct brw_bo * hash_find_bo(struct hash_table *ht, unsigned int key) { @@ -486,6 +509,18 @@ } static struct brw_bo * +bo_calloc(void) +{ + struct brw_bo *bo = calloc(1, sizeof(*bo)); + if (!bo) + return NULL; + + list_inithead(&bo->exports); + + return bo; +} + +static struct brw_bo * bo_alloc_internal(struct brw_bufmgr *bufmgr, const char *name, uint64_t size, @@ -531,7 +566,7 @@ /* Get a buffer out of the cache if available */ retry: alloc_from_cache = false; - if (bucket != NULL && !list_empty(&bucket->head)) { + if (bucket != NULL && !list_is_empty(&bucket->head)) { if (busy && !zeroed) { /* Allocate new render-target BOs from the tail (MRU) * of the list, as it will likely be hot in the GPU @@ -558,6 +593,7 @@ } if (alloc_from_cache) { + assert(list_is_empty(&bo->exports)); if (!brw_bo_madvise(bo, I915_MADV_WILLNEED)) { bo_free(bo); brw_bo_cache_purge_bucket(bufmgr, bucket); @@ -590,7 +626,7 @@ bo->gtt_offset = 0ull; } } else { - bo = calloc(1, sizeof(*bo)); + bo = bo_calloc(); if (!bo) goto err; @@ -761,11 +797,12 @@ */ bo = hash_find_bo(bufmgr->handle_table, open_arg.handle); if (bo) { + assert(list_is_empty(&bo->exports)); brw_bo_reference(bo); goto out; } - bo = calloc(1, sizeof(*bo)); + bo = bo_calloc(); if (!bo) goto out; @@ -835,6 +872,8 @@ entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); _mesa_hash_table_remove(bufmgr->handle_table, entry); + } else { + assert(list_is_empty(&bo->exports)); } /* Close this object */ @@ -884,6 +923,14 @@ DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name); + list_for_each_entry_safe(struct bo_export, export, &bo->exports, link) { + struct drm_gem_close close = { .handle = export->gem_handle }; + gen_ioctl(export->drm_fd, DRM_IOCTL_GEM_CLOSE, &close); + + list_del(&export->link); + free(export); + } + bucket = bucket_for_size(bufmgr, bo->size); /* Put the buffer into our internal cache for reuse if we can. */ if (bufmgr->bo_reuse && bo->reusable && bucket != NULL && @@ -1291,8 +1338,19 @@ } void -brw_bufmgr_destroy(struct brw_bufmgr *bufmgr) +brw_bufmgr_unref(struct brw_bufmgr *bufmgr) { + mtx_lock(&global_bufmgr_list_mutex); + if (p_atomic_dec_zero(&bufmgr->refcount)) { + list_del(&bufmgr->link); + } else { + bufmgr = NULL; + } + mtx_unlock(&global_bufmgr_list_mutex); + + if (!bufmgr) + return; + mtx_destroy(&bufmgr->lock); /* Free any cached buffer objects we were going to reuse */ @@ -1321,6 +1379,9 @@ } } + close(bufmgr->fd); + bufmgr->fd = -1; + free(bufmgr); } @@ -1389,11 +1450,12 @@ */ bo = hash_find_bo(bufmgr->handle_table, handle); if (bo) { + assert(list_is_empty(&bo->exports)); brw_bo_reference(bo); goto out; } - bo = calloc(1, sizeof(*bo)); + bo = bo_calloc(); if (!bo) goto out; @@ -1528,17 +1590,68 @@ return 0; } -/** - * Enables unlimited caching of buffer objects for reuse. - * - * This is potentially very memory expensive, as the cache at each bucket - * size is only bounded by how many buffers of that size we've managed to have - * in flight at once. - */ -void -brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr) +int +brw_bo_export_gem_handle_for_device(struct brw_bo *bo, int drm_fd, + uint32_t *out_handle) { - bufmgr->bo_reuse = true; + struct brw_bufmgr *bufmgr = bo->bufmgr; + + /* Only add the new GEM handle to the list of export if it belongs to a + * different GEM device. Otherwise we might close the same buffer multiple + * times. + */ + int ret = os_same_file_description(drm_fd, bufmgr->fd); + WARN_ONCE(ret < 0, + "Kernel has no file descriptor comparison support: %s\n", + strerror(errno)); + if (ret == 0) { + *out_handle = brw_bo_export_gem_handle(bo); + return 0; + } + + struct bo_export *export = calloc(1, sizeof(*export)); + if (!export) + return -ENOMEM; + + export->drm_fd = drm_fd; + + int dmabuf_fd = -1; + int err = brw_bo_gem_export_to_prime(bo, &dmabuf_fd); + if (err) { + free(export); + return err; + } + + mtx_lock(&bufmgr->lock); + err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle); + close(dmabuf_fd); + if (err) { + mtx_unlock(&bufmgr->lock); + free(export); + return err; + } + + bool found = false; + list_for_each_entry(struct bo_export, iter, &bo->exports, link) { + if (iter->drm_fd != drm_fd) + continue; + /* Here we assume that for a given DRM fd, we'll always get back the + * same GEM handle for a given buffer. + */ + assert(iter->gem_handle == export->gem_handle); + free(export); + export = iter; + found = true; + break; + } + if (!found) + list_addtail(&export->link, &bo->exports); + + mtx_unlock(&bufmgr->lock); + + *out_handle = export->gem_handle; + + return 0; } static void @@ -1676,14 +1789,21 @@ return bufmgr->initial_kflags & EXEC_OBJECT_PINNED; } +static struct brw_bufmgr * +brw_bufmgr_ref(struct brw_bufmgr *bufmgr) +{ + p_atomic_inc(&bufmgr->refcount); + return bufmgr; +} + /** * Initializes the GEM buffer manager, which uses the kernel to allocate, map, * and manage map buffer objections. * * \param fd File descriptor of the opened DRM device. */ -struct brw_bufmgr * -brw_bufmgr_init(struct gen_device_info *devinfo, int fd) +static struct brw_bufmgr * +brw_bufmgr_create(struct gen_device_info *devinfo, int fd, bool bo_reuse) { struct brw_bufmgr *bufmgr; @@ -1700,9 +1820,16 @@ * Don't do this! Ensure that each library/bufmgr has its own device * fd so that its namespace does not clash with another. */ - bufmgr->fd = fd; + bufmgr->fd = dup(fd); + if (bufmgr->fd < 0) { + free(bufmgr); + return NULL; + } + + p_atomic_set(&bufmgr->refcount, 1); if (mtx_init(&bufmgr->lock, mtx_plain) != 0) { + close(bufmgr->fd); free(bufmgr); return NULL; } @@ -1713,6 +1840,7 @@ bufmgr->has_llc = devinfo->has_llc; bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0; + bufmgr->bo_reuse = bo_reuse; const uint64_t _4GB = 4ull << 30; @@ -1743,6 +1871,7 @@ * might actually mean requiring 4.14. */ fprintf(stderr, "i965 requires softpin (Kernel 4.5) on Gen10+."); + close(bufmgr->fd); free(bufmgr); return NULL; } @@ -1751,9 +1880,47 @@ init_cache_buckets(bufmgr); bufmgr->name_table = - _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); bufmgr->handle_table = - _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); + + return bufmgr; +} + +struct brw_bufmgr * +brw_bufmgr_get_for_fd(struct gen_device_info *devinfo, int fd, bool bo_reuse) +{ + struct stat st; + + if (fstat(fd, &st)) + return NULL; + + struct brw_bufmgr *bufmgr = NULL; + + mtx_lock(&global_bufmgr_list_mutex); + list_for_each_entry(struct brw_bufmgr, iter_bufmgr, &global_bufmgr_list, link) { + struct stat iter_st; + if (fstat(iter_bufmgr->fd, &iter_st)) + continue; + + if (st.st_rdev == iter_st.st_rdev) { + assert(iter_bufmgr->bo_reuse == bo_reuse); + bufmgr = brw_bufmgr_ref(iter_bufmgr); + goto unlock; + } + } + + bufmgr = brw_bufmgr_create(devinfo, fd, bo_reuse); + list_addtail(&bufmgr->link, &global_bufmgr_list); + + unlock: + mtx_unlock(&global_bufmgr_list_mutex); return bufmgr; } + +int +brw_bufmgr_get_fd(struct brw_bufmgr *bufmgr) +{ + return bufmgr->fd; +} diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_bufmgr.h mesa-20.0.8/src/mesa/drivers/dri/i965/brw_bufmgr.h --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_bufmgr.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_bufmgr.h 2020-06-12 01:21:18.000000000 +0000 @@ -39,6 +39,7 @@ #include #include +#include "c11/threads.h" #include "util/u_atomic.h" #include "util/list.h" @@ -180,6 +181,13 @@ struct list_head head; /** + * List of GEM handle exports of this buffer (bo_export). + * + * Hold bufmgr->lock when using this list. + */ + struct list_head exports; + + /** * Boolean of whether this buffer can be re-used */ bool reusable; @@ -300,9 +308,9 @@ void brw_bo_wait_rendering(struct brw_bo *bo); /** - * Tears down the buffer manager instance. + * Unref a buffer manager instance. */ -void brw_bufmgr_destroy(struct brw_bufmgr *bufmgr); +void brw_bufmgr_unref(struct brw_bufmgr *bufmgr); /** * Get the current tiling (and resulting swizzling) mode for the bo. @@ -343,11 +351,12 @@ int brw_bo_madvise(struct brw_bo *bo, int madv); /* drm_bacon_bufmgr_gem.c */ -struct brw_bufmgr *brw_bufmgr_init(struct gen_device_info *devinfo, int fd); +struct brw_bufmgr *brw_bufmgr_get_for_fd(struct gen_device_info *devinfo, int fd, + bool bo_reuse); + struct brw_bo *brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr, const char *name, unsigned int handle); -void brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr); int brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns); @@ -359,6 +368,8 @@ void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id); +int brw_bufmgr_get_fd(struct brw_bufmgr *bufmgr); + int brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd); struct brw_bo *brw_bo_gem_create_from_prime(struct brw_bufmgr *bufmgr, int prime_fd); @@ -369,6 +380,18 @@ uint32_t brw_bo_export_gem_handle(struct brw_bo *bo); +/** + * Exports a bo as a GEM handle into a given DRM file descriptor + * \param bo Buffer to export + * \param drm_fd File descriptor where the new handle is created + * \param out_handle Pointer to store the new handle + * + * Returns 0 if the buffer was successfully exported, a non zero error code + * otherwise. + */ +int brw_bo_export_gem_handle_for_device(struct brw_bo *bo, int drm_fd, + uint32_t *out_handle); + int brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset, uint64_t *result); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_clear.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_clear.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_clear.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_clear.c 2020-06-12 01:21:18.000000000 +0000 @@ -108,6 +108,9 @@ struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH]; const struct gen_device_info *devinfo = &brw->screen->devinfo; + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + if (devinfo->gen < 6) return false; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_context.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_context.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -103,32 +103,22 @@ const char * brw_get_renderer_string(const struct intel_screen *screen) { - const char *chipset; - static char buffer[128]; - char *bsw = NULL; - - switch (screen->deviceID) { -#undef CHIPSET -#define CHIPSET(id, symbol, str) case id: chipset = str; break; -#include "pci_ids/i965_pci_ids.h" - default: - chipset = "Unknown Intel Chipset"; - break; - } + static char buf[128]; + const char *name = gen_get_device_name(screen->deviceID); + + if (!name) + name = "Intel Unknown"; + + snprintf(buf, sizeof(buf), "Mesa DRI %s", name); /* Braswell branding is funny, so we have to fix it up here */ if (screen->deviceID == 0x22B1) { - bsw = strdup(chipset); - char *needle = strstr(bsw, "XXX"); - if (needle) { + char *needle = strstr(buf, "XXX"); + if (needle) memcpy(needle, get_bsw_model(screen), 3); - chipset = bsw; - } } - (void) driGetRendererString(buffer, chipset, 0); - free(bsw); - return buffer; + return buf; } static const GLubyte * @@ -421,6 +411,7 @@ ctx->Const.SpirVCapabilities.tessellation = true; ctx->Const.SpirVCapabilities.transform_feedback = devinfo->gen >= 7; ctx->Const.SpirVCapabilities.variable_pointers = true; + ctx->Const.SpirVCapabilities.integer_functions2 = devinfo->gen >= 8; } static void @@ -846,15 +837,6 @@ brw->driContext->driScreenPriv->myNum, "i965", NULL, NULL, 0); - int bo_reuse_mode = driQueryOptioni(options, "bo_reuse"); - switch (bo_reuse_mode) { - case DRI_CONF_BO_REUSE_DISABLED: - break; - case DRI_CONF_BO_REUSE_ALL: - brw_bufmgr_enable_reuse(brw->bufmgr); - break; - } - if (INTEL_DEBUG & DEBUG_NO_HIZ) { brw->has_hiz = false; /* On gen6, you can only do separate stencil with HIZ. */ @@ -1236,7 +1218,7 @@ driDestroyOptionCache(&brw->optionCache); /* free the Mesa context */ - _mesa_free_context_data(&brw->ctx, true); + _mesa_free_context_data(&brw->ctx); ralloc_free(brw); driContextPriv->driverPrivate = NULL; @@ -1541,8 +1523,10 @@ * that will happen next will probably dirty the front buffer. So * mark it as dirty here. */ - if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer)) + if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer) && + ctx->DrawBuffer != _mesa_get_incomplete_framebuffer()) { brw->front_buffer_dirty = true; + } if (brw->is_shared_buffer_bound) { /* Subsequent rendering will probably dirty the shared buffer. */ diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_context.h mesa-20.0.8/src/mesa/drivers/dri/i965/brw_context.h --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_context.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_context.h 2020-06-12 01:21:18.000000000 +0000 @@ -51,6 +51,7 @@ #include "intel_screen.h" #include "intel_tex_obj.h" #include "perf/gen_perf.h" +#include "perf/gen_perf_query.h" #ifdef __cplusplus extern "C" { @@ -1504,6 +1505,9 @@ /* gen8_multisample_state.c */ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw); +/* gen7_l3_state.c */ +void brw_emit_l3_state(struct brw_context *brw); + /* gen7_urb.c */ void gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_disk_cache.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_disk_cache.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_disk_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_disk_cache.c 2020-06-12 01:21:18.000000000 +0000 @@ -21,10 +21,10 @@ * IN THE SOFTWARE. */ -#include "compiler/blob.h" #include "compiler/glsl/ir_uniform.h" #include "compiler/glsl/shader_cache.h" #include "main/mtypes.h" +#include "util/blob.h" #include "util/build_id.h" #include "util/debug.h" #include "util/disk_cache.h" diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_link.cpp mesa-20.0.8/src/mesa/drivers/dri/i965/brw_link.cpp --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_link.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_link.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -269,15 +269,11 @@ /* SPIR-V programs use a NIR linker */ if (shProg->data->spirv) { - if (!gl_nir_link_uniform_blocks(ctx, shProg)) { + static const gl_nir_linker_options opts = { + .fill_parameters = false, + }; + if (!gl_nir_link_spirv(ctx, shProg, &opts)) return GL_FALSE; - } - - if (!gl_nir_link_uniforms(ctx, shProg)) - return GL_FALSE; - - gl_nir_link_assign_atomic_counter_resources(ctx, shProg); - gl_nir_link_assign_xfb_resources(ctx, shProg); } for (stage = 0; stage < ARRAY_SIZE(shProg->_LinkedShaders); stage++) { @@ -336,8 +332,7 @@ brw_shader_gather_info(prog->nir, prog); NIR_PASS_V(prog->nir, gl_nir_lower_atomics, shProg, false); - NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo, - prog->nir->info.num_abos); + NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo); nir_sweep(prog->nir); @@ -388,9 +383,9 @@ /* SPIR-V programs build its resource list from linked NIR shaders. */ if (!shProg->data->spirv) - build_program_resource_list(ctx, shProg); + build_program_resource_list(ctx, shProg, false); else - nir_build_program_resource_list(ctx, shProg); + nir_build_program_resource_list(ctx, shProg, true); for (stage = 0; stage < ARRAY_SIZE(shProg->_LinkedShaders); stage++) { struct gl_linked_shader *shader = shProg->_LinkedShaders[stage]; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp mesa-20.0.8/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -333,8 +333,10 @@ case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_min: - case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: case nir_intrinsic_image_deref_atomic_and: case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: @@ -405,7 +407,8 @@ nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_lower_clip_vs(nir, (1 << nr_userclip_plane_consts) - 1, true); + nir_lower_clip_vs(nir, (1 << nr_userclip_plane_consts) - 1, true, false, + NULL); nir_lower_io_to_temporaries(nir, impl, true, false); nir_lower_global_vars_to_local(nir); nir_lower_vars_to_ssa(nir); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_performance_query.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_performance_query.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_performance_query.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_performance_query.c 2020-06-12 01:21:18.000000000 +0000 @@ -73,7 +73,9 @@ #include "intel_batchbuffer.h" #include "perf/gen_perf.h" +#include "perf/gen_perf_regs.h" #include "perf/gen_perf_mdapi.h" +#include "perf/gen_perf_query.h" #define FILE_DEBUG_FLAG DEBUG_PERFMON @@ -222,21 +224,6 @@ /******************************************************************************/ -static void -capture_frequency_stat_register(struct brw_context *brw, - struct brw_bo *bo, - uint32_t bo_offset) -{ - const struct gen_device_info *devinfo = &brw->screen->devinfo; - - if (devinfo->gen >= 7 && devinfo->gen <= 8 && - !devinfo->is_baytrail && !devinfo->is_cherryview) { - brw_store_register_mem32(brw, bo, GEN7_RPSTAT1, bo_offset); - } else if (devinfo->gen >= 9) { - brw_store_register_mem32(brw, bo, GEN9_RPSTAT0, bo_offset); - } -} - /** * Driver hook for glBeginPerfQueryINTEL(). */ @@ -458,9 +445,29 @@ _intel_batchbuffer_flush_fence(ctx, -1, NULL, file, line); } -typedef void (*capture_frequency_stat_register_t)(void *, void *, uint32_t ); -typedef void (*store_register_mem64_t)(void *ctx, void *bo, - uint32_t reg, uint32_t offset); +static void +brw_oa_emit_stall_at_pixel_scoreboard(void *c) +{ + struct brw_context *brw = c; + brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_STALL_AT_SCOREBOARD); +} + +static void +brw_perf_store_register(struct brw_context *brw, struct brw_bo *bo, + uint32_t reg, uint32_t reg_size, + uint32_t offset) +{ + if (reg_size == 8) { + brw_store_register_mem64(brw, bo, reg, offset); + } else { + assert(reg_size == 4); + brw_store_register_mem32(brw, bo, reg, offset); + } +} + +typedef void (*store_register_mem_t)(void *ctx, void *bo, + uint32_t reg, uint32_t reg_size, + uint32_t offset); typedef bool (*batch_references_t)(void *batch, void *bo); typedef void (*bo_wait_rendering_t)(void *bo); typedef int (*bo_busy_t)(void *bo); @@ -477,7 +484,7 @@ if (perf_cfg) return perf_cfg->n_queries; - if (!oa_metrics_kernel_support(brw->screen->driScrnPriv->fd, devinfo)) + if (!oa_metrics_kernel_support(brw->screen->fd, devinfo)) return 0; perf_cfg = gen_perf_new(ctx); @@ -486,21 +493,20 @@ perf_cfg->vtbl.bo_unreference = (bo_unreference_t)brw_bo_unreference; perf_cfg->vtbl.bo_map = (bo_map_t)brw_bo_map; perf_cfg->vtbl.bo_unmap = (bo_unmap_t)brw_bo_unmap; - perf_cfg->vtbl.emit_mi_flush = (emit_mi_flush_t)brw_emit_mi_flush; + perf_cfg->vtbl.emit_stall_at_pixel_scoreboard = + (emit_mi_flush_t)brw_oa_emit_stall_at_pixel_scoreboard; perf_cfg->vtbl.emit_mi_report_perf_count = (emit_mi_report_t)brw_oa_emit_mi_report_perf_count; perf_cfg->vtbl.batchbuffer_flush = brw_oa_batchbuffer_flush; - perf_cfg->vtbl.capture_frequency_stat_register = - (capture_frequency_stat_register_t) capture_frequency_stat_register; - perf_cfg->vtbl.store_register_mem64 = - (store_register_mem64_t) brw_store_register_mem64; + perf_cfg->vtbl.store_register_mem = + (store_register_mem_t) brw_perf_store_register; perf_cfg->vtbl.batch_references = (batch_references_t)brw_batch_references; perf_cfg->vtbl.bo_wait_rendering = (bo_wait_rendering_t)brw_bo_wait_rendering; perf_cfg->vtbl.bo_busy = (bo_busy_t)brw_bo_busy; gen_perf_init_context(perf_ctx, perf_cfg, brw, brw->bufmgr, devinfo, - brw->hw_ctx, brw->screen->driScrnPriv->fd); - gen_perf_init_metrics(perf_cfg, devinfo, brw->screen->driScrnPriv->fd); + brw->hw_ctx, brw->screen->fd); + gen_perf_init_metrics(perf_cfg, devinfo, brw->screen->fd); return perf_cfg->n_queries; } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program_binary.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program_binary.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program_binary.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program_binary.c 2020-06-12 01:21:18.000000000 +0000 @@ -132,7 +132,7 @@ blob_write_uint32(writer, NIR_PART); intptr_t size_offset = blob_reserve_uint32(writer); size_t nir_start = writer->size; - nir_serialize(writer, prog->nir); + nir_serialize(writer, prog->nir, false); blob_overwrite_uint32(writer, size_offset, writer->size - nir_start); } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program.c 2020-06-12 01:21:18.000000000 +0000 @@ -95,6 +95,14 @@ nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options); } else { nir = glsl_to_nir(ctx, shader_prog, stage, options); + + /* Remap the locations to slots so those requiring two slots will + * occupy two locations. For instance, if we have in the IR code a + * dvec3 attr0 in location 0 and vec4 attr1 in location 1, in NIR attr0 + * will use locations/slots 0 and 1, and attr1 will use location/slot 2 + */ + if (nir->info.stage == MESA_SHADER_VERTEX) + nir_remap_dual_slot_attributes(nir, &prog->DualSlotInputs); } assert (nir); @@ -459,8 +467,13 @@ * brw->screen->subslice_total is the TOTAL number of subslices * and we wish to view that there are 4 subslices per slice * instead of the actual number of subslices per slice. + * + * For, ICL, scratch space allocation is based on the number of threads + * in the base configuration. */ - if (devinfo->gen >= 9 && devinfo->gen < 11) + if (devinfo->gen == 11) + subslices = 8; + else if (devinfo->gen >= 9 && devinfo->gen < 11) subslices = 4 * brw->screen->devinfo.num_slices; unsigned scratch_ids_per_subslice; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program_cache.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program_cache.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_program_cache.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_program_cache.c 2020-06-12 01:21:18.000000000 +0000 @@ -37,7 +37,7 @@ * data) in return. Objects in the cache may not have relocations * (pointers to other BOs) in them. * - * The inner workings are a simple hash table based on a CRC of the + * The inner workings are a simple hash table based on a FNV-1a of the * key data. * * Replacement is not implemented. Instead, when the cache gets too @@ -96,17 +96,9 @@ static GLuint hash_key(struct brw_cache_item *item) { - GLuint *ikey = (GLuint *)item->key; - GLuint hash = item->cache_id, i; - - assert(item->key_size % 4 == 0); - - /* I'm sure this can be improved on: - */ - for (i = 0; i < item->key_size/4; i++) { - hash ^= ikey[i]; - hash = (hash << 5) | (hash >> 27); - } + uint32_t hash = _mesa_fnv32_1a_offset_bias; + hash = _mesa_fnv32_1a_accumulate(hash, item->cache_id); + hash = _mesa_fnv32_1a_accumulate_block(hash, item->key, item->key_size); return hash; } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_reset.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_reset.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_reset.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_reset.c 2020-06-12 01:21:18.000000000 +0000 @@ -35,7 +35,6 @@ brw_get_graphics_reset_status(struct gl_context *ctx) { struct brw_context *brw = brw_context(ctx); - __DRIscreen *dri_screen = brw->screen->driScrnPriv; struct drm_i915_reset_stats stats = { .ctx_id = brw->hw_ctx }; /* If hardware contexts are not being used (or @@ -51,7 +50,7 @@ if (brw->reset_count != 0) return GL_NO_ERROR; - if (drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats) != 0) + if (drmIoctl(brw->screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats) != 0) return GL_NO_ERROR; /* A reset was observed while a batch from this context was executing. @@ -77,10 +76,9 @@ void brw_check_for_reset(struct brw_context *brw) { - __DRIscreen *dri_screen = brw->screen->driScrnPriv; struct drm_i915_reset_stats stats = { .ctx_id = brw->hw_ctx }; - if (drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats) != 0) + if (drmIoctl(brw->screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats) != 0) return; if (stats.batch_active > 0 || stats.batch_pending > 0) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_state_upload.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_state_upload.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_state_upload.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_state_upload.c 2020-06-12 01:21:18.000000000 +0000 @@ -189,11 +189,6 @@ */ brw_load_register_imm32(brw, GEN8_L3CNTLREG, GEN8_L3CNTLREG_EDBC_NO_HANG); - - /* WaEnableStateCacheRedirectToCS:icl */ - brw_load_register_imm32(brw, SLICE_COMMON_ECO_CHICKEN1, - GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE | - REG_MASK(GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE)); } /* hardware specification recommends disabling repacking for diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_surface_formats.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_surface_formats.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_surface_formats.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_surface_formats.c 2020-06-12 01:21:18.000000000 +0000 @@ -47,8 +47,8 @@ [MESA_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM, [MESA_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM, [MESA_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM, - [MESA_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM, - [MESA_FORMAT_L16A16_UNORM] = ISL_FORMAT_L16A16_UNORM, + [MESA_FORMAT_LA_UNORM8] = ISL_FORMAT_L8A8_UNORM, + [MESA_FORMAT_LA_UNORM16] = ISL_FORMAT_L16A16_UNORM, [MESA_FORMAT_A_UNORM8] = ISL_FORMAT_A8_UNORM, [MESA_FORMAT_A_UNORM16] = ISL_FORMAT_A16_UNORM, [MESA_FORMAT_L_UNORM8] = ISL_FORMAT_L8_UNORM, @@ -58,9 +58,9 @@ [MESA_FORMAT_YCBCR_REV] = ISL_FORMAT_YCRCB_NORMAL, [MESA_FORMAT_YCBCR] = ISL_FORMAT_YCRCB_SWAPUVY, [MESA_FORMAT_R_UNORM8] = ISL_FORMAT_R8_UNORM, - [MESA_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM, + [MESA_FORMAT_RG_UNORM8] = ISL_FORMAT_R8G8_UNORM, [MESA_FORMAT_R_UNORM16] = ISL_FORMAT_R16_UNORM, - [MESA_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM, + [MESA_FORMAT_RG_UNORM16] = ISL_FORMAT_R16G16_UNORM, [MESA_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM, [MESA_FORMAT_S_UINT8] = ISL_FORMAT_R8_UINT, @@ -69,7 +69,7 @@ [MESA_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB, [MESA_FORMAT_R_SRGB8] = ISL_FORMAT_L8_UNORM_SRGB, [MESA_FORMAT_L_SRGB8] = ISL_FORMAT_L8_UNORM_SRGB, - [MESA_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB, + [MESA_FORMAT_LA_SRGB8] = ISL_FORMAT_L8A8_UNORM_SRGB, [MESA_FORMAT_SRGB_DXT1] = ISL_FORMAT_BC1_UNORM_SRGB, [MESA_FORMAT_SRGBA_DXT1] = ISL_FORMAT_BC1_UNORM_SRGB, [MESA_FORMAT_SRGBA_DXT3] = ISL_FORMAT_BC2_UNORM_SRGB, @@ -125,10 +125,10 @@ [MESA_FORMAT_RGBA_UINT32] = ISL_FORMAT_R32G32B32A32_UINT, [MESA_FORMAT_R_SNORM8] = ISL_FORMAT_R8_SNORM, - [MESA_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM, + [MESA_FORMAT_RG_SNORM8] = ISL_FORMAT_R8G8_SNORM, [MESA_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM, [MESA_FORMAT_R_SNORM16] = ISL_FORMAT_R16_SNORM, - [MESA_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM, + [MESA_FORMAT_RG_SNORM16] = ISL_FORMAT_R16G16_SNORM, [MESA_FORMAT_RGB_SNORM16] = ISL_FORMAT_R16G16B16_SNORM, [MESA_FORMAT_RGBA_SNORM16] = ISL_FORMAT_R16G16B16A16_SNORM, [MESA_FORMAT_RGBA_UNORM16] = ISL_FORMAT_R16G16B16A16_UNORM, @@ -217,6 +217,8 @@ gen += 5; for (format = MESA_FORMAT_NONE + 1; format < MESA_FORMAT_COUNT; format++) { + if (!_mesa_get_format_name(format)) + continue; enum isl_format texture, render; bool is_integer = _mesa_is_format_integer_color(format); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_tes.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_tes.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_tes.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_tes.c 2020-06-12 01:21:18.000000000 +0000 @@ -76,7 +76,7 @@ char *error_str; const unsigned *program = brw_compile_tes(compiler, brw, mem_ctx, key, &input_vue_map, &prog_data, - nir, &tep->program, st_index, NULL, &error_str); + nir, st_index, NULL, &error_str); if (program == NULL) { tep->program.sh.data->LinkStatus = LINKING_FAILURE; ralloc_strcat(&tep->program.sh.data->InfoLog, error_str); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_wm.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_wm.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_wm.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_wm.c 2020-06-12 01:21:18.000000000 +0000 @@ -122,7 +122,7 @@ char *error_str = NULL; program = brw_compile_fs(brw->screen->compiler, brw, mem_ctx, key, &prog_data, nir, - &fp->program, st_index8, st_index16, st_index32, + st_index8, st_index16, st_index32, true, false, vue_map, NULL, &error_str); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/brw_wm_surface_state.c mesa-20.0.8/src/mesa/drivers/dri/i965/brw_wm_surface_state.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/brw_wm_surface_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/brw_wm_surface_state.c 2020-06-12 01:21:18.000000000 +0000 @@ -1364,33 +1364,39 @@ prog->info.num_abos == 0)) return; - uint32_t *ubo_surf_offsets = - &stage_state->surf_offset[prog_data->binding_table.ubo_start]; - - for (int i = 0; i < prog->info.num_ubos; i++) { - struct gl_buffer_binding *binding = - &ctx->UniformBufferBindings[prog->sh.UniformBlocks[i]->Binding]; - upload_buffer_surface(brw, binding, &ubo_surf_offsets[i], - ISL_FORMAT_R32G32B32A32_FLOAT, 0); - } - - uint32_t *abo_surf_offsets = - &stage_state->surf_offset[prog_data->binding_table.ssbo_start]; - uint32_t *ssbo_surf_offsets = abo_surf_offsets + prog->info.num_abos; - - for (int i = 0; i < prog->info.num_abos; i++) { - struct gl_buffer_binding *binding = - &ctx->AtomicBufferBindings[prog->sh.AtomicBuffers[i]->Binding]; - upload_buffer_surface(brw, binding, &abo_surf_offsets[i], - ISL_FORMAT_RAW, RELOC_WRITE); + if (prog->info.num_ubos) { + assert(prog_data->binding_table.ubo_start < BRW_MAX_SURFACES); + uint32_t *ubo_surf_offsets = + &stage_state->surf_offset[prog_data->binding_table.ubo_start]; + + for (int i = 0; i < prog->info.num_ubos; i++) { + struct gl_buffer_binding *binding = + &ctx->UniformBufferBindings[prog->sh.UniformBlocks[i]->Binding]; + upload_buffer_surface(brw, binding, &ubo_surf_offsets[i], + ISL_FORMAT_R32G32B32A32_FLOAT, 0); + } } - for (int i = 0; i < prog->info.num_ssbos; i++) { - struct gl_buffer_binding *binding = - &ctx->ShaderStorageBufferBindings[prog->sh.ShaderStorageBlocks[i]->Binding]; - - upload_buffer_surface(brw, binding, &ssbo_surf_offsets[i], - ISL_FORMAT_RAW, RELOC_WRITE); + if (prog->info.num_ssbos || prog->info.num_abos) { + assert(prog_data->binding_table.ssbo_start < BRW_MAX_SURFACES); + uint32_t *ssbo_surf_offsets = + &stage_state->surf_offset[prog_data->binding_table.ssbo_start]; + uint32_t *abo_surf_offsets = ssbo_surf_offsets + prog->info.num_ssbos; + + for (int i = 0; i < prog->info.num_abos; i++) { + struct gl_buffer_binding *binding = + &ctx->AtomicBufferBindings[prog->sh.AtomicBuffers[i]->Binding]; + upload_buffer_surface(brw, binding, &abo_surf_offsets[i], + ISL_FORMAT_RAW, RELOC_WRITE); + } + + for (int i = 0; i < prog->info.num_ssbos; i++) { + struct gl_buffer_binding *binding = + &ctx->ShaderStorageBufferBindings[prog->sh.ShaderStorageBlocks[i]->Binding]; + + upload_buffer_surface(brw, binding, &ssbo_surf_offsets[i], + ISL_FORMAT_RAW, RELOC_WRITE); + } } stage_state->push_constants_dirty = true; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/gen4_blorp_exec.h mesa-20.0.8/src/mesa/drivers/dri/i965/gen4_blorp_exec.h --- mesa-19.2.8/src/mesa/drivers/dri/i965/gen4_blorp_exec.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/gen4_blorp_exec.h 2020-06-12 01:21:18.000000000 +0000 @@ -178,7 +178,7 @@ assert(batch->blorp->driver_ctx == batch->driver_batch); struct brw_context *brw = batch->driver_batch; - emit_urb_config(batch, params); + emit_urb_config(batch, params, NULL); blorp_emit(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) { pp.PointertoVSState = blorp_emit_vs_state(batch); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/gen6_queryobj.c mesa-20.0.8/src/mesa/drivers/dri/i965/gen6_queryobj.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/gen6_queryobj.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/gen6_queryobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -36,7 +36,7 @@ #include "brw_context.h" #include "brw_defines.h" #include "brw_state.h" -#include "perf/gen_perf.h" +#include "perf/gen_perf_regs.h" #include "intel_batchbuffer.h" #include "intel_buffer_objects.h" diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/gen7_l3_state.c mesa-20.0.8/src/mesa/drivers/dri/i965/gen7_l3_state.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/gen7_l3_state.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/gen7_l3_state.c 2020-06-12 01:21:18.000000000 +0000 @@ -118,7 +118,8 @@ if (devinfo->gen >= 8) { assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); - const unsigned imm_data = ((has_slm ? GEN8_L3CNTLREG_SLM_ENABLE : 0) | + const unsigned imm_data = ( + (devinfo->gen < 11 && has_slm ? GEN8_L3CNTLREG_SLM_ENABLE : 0) | (devinfo->gen == 11 ? GEN11_L3CNTLREG_USE_FULL_WAYS : 0) | SET_FIELD(cfg->n[GEN_L3P_URB], GEN8_L3CNTLREG_URB_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_RO], GEN8_L3CNTLREG_RO_ALLOC) | @@ -211,8 +212,8 @@ } } -static void -emit_l3_state(struct brw_context *brw) +void +brw_emit_l3_state(struct brw_context *brw) { const struct gen_l3_weights w = get_pipeline_state_l3_weights(brw); const float dw = gen_diff_l3_weights(w, gen_get_l3_config_weights(brw->l3.config)); @@ -260,7 +261,7 @@ BRW_NEW_TES_PROG_DATA | BRW_NEW_VS_PROG_DATA, }, - .emit = emit_l3_state + .emit = brw_emit_l3_state }; /** diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/gen7_urb.c mesa-20.0.8/src/mesa/drivers/dri/i965/gen7_urb.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/gen7_urb.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/gen7_urb.c 2020-06-12 01:21:18.000000000 +0000 @@ -208,8 +208,6 @@ bool gs_present, bool tess_present) { const struct gen_device_info *devinfo = &brw->screen->devinfo; - const int push_size_kB = - (devinfo->gen >= 8 || (devinfo->is_haswell && devinfo->gt == 3)) ? 32 : 16; /* BRW_NEW_{VS,TCS,TES,GS}_PROG_DATA */ struct brw_vue_prog_data *prog_data[4] = { @@ -249,8 +247,9 @@ unsigned entries[4]; unsigned start[4]; - gen_get_urb_config(devinfo, 1024 * push_size_kB, 1024 * brw->urb.size, - tess_present, gs_present, entry_size, entries, start); + gen_get_urb_config(devinfo, brw->l3.config, + tess_present, gs_present, entry_size, + entries, start, NULL); if (devinfo->gen == 7 && !devinfo->is_haswell && !devinfo->is_baytrail) gen7_emit_vs_workaround_flush(brw); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/genX_blorp_exec.c mesa-20.0.8/src/mesa/drivers/dri/i965/genX_blorp_exec.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/genX_blorp_exec.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/genX_blorp_exec.c 2020-06-12 01:21:18.000000000 +0000 @@ -207,6 +207,7 @@ static void blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, const struct blorp_address *addrs, + UNUSED uint32_t *sizes, unsigned num_vbs) { #if GEN_GEN >= 8 && GEN_GEN < 11 @@ -230,8 +231,7 @@ #endif } -#if GEN_GEN >= 8 -static struct blorp_address +UNUSED static struct blorp_address blorp_get_workaround_page(struct blorp_batch *batch) { assert(batch->blorp->driver_ctx == batch->driver_batch); @@ -241,7 +241,6 @@ .buffer = brw->workaround_bo, }; } -#endif static void blorp_flush_range(UNUSED struct blorp_batch *batch, UNUSED void *start, @@ -252,6 +251,16 @@ */ } +#if GEN_GEN >= 7 +static const struct gen_l3_config * +blorp_get_l3_config(struct blorp_batch *batch) +{ + assert(batch->blorp->driver_ctx == batch->driver_batch); + struct brw_context *brw = batch->driver_batch; + + return brw->l3.config; +} +#else /* GEN_GEN < 7 */ static void blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size, @@ -260,18 +269,14 @@ assert(batch->blorp->driver_ctx == batch->driver_batch); struct brw_context *brw = batch->driver_batch; -#if GEN_GEN >= 7 - if (brw->urb.vsize >= vs_entry_size) - return; - - gen7_upload_urb(brw, vs_entry_size, false, false); -#elif GEN_GEN == 6 +#if GEN_GEN == 6 gen6_upload_urb(brw, vs_entry_size, false, 0); #else /* We calculate it now and emit later. */ brw_calculate_urb_fence(brw, 0, vs_entry_size, sf_entry_size); #endif } +#endif void genX(blorp_exec)(struct blorp_batch *batch, @@ -316,6 +321,7 @@ brw_cache_flush_for_depth(brw, params->stencil.addr.buffer); brw_select_pipeline(brw, BRW_RENDER_PIPELINE); + brw_emit_l3_state(brw); retry: intel_batchbuffer_require_space(brw, 1400); @@ -385,6 +391,12 @@ brw->no_depth_or_stencil = !params->depth.enabled && !params->stencil.enabled; brw->ib.index_size = -1; + brw->urb.vsize = 0; + brw->urb.gs_present = false; + brw->urb.gsize = 0; + brw->urb.tess_present = false; + brw->urb.hsize = 0; + brw->urb.dsize = 0; if (params->dst.enabled) { brw_render_cache_add_bo(brw, params->dst.addr.buffer, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/genX_state_upload.c mesa-20.0.8/src/mesa/drivers/dri/i965/genX_state_upload.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/genX_state_upload.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/genX_state_upload.c 2020-06-12 01:21:18.000000000 +0000 @@ -1099,11 +1099,11 @@ */ bool drawing_points = brw_is_drawing_points(brw); - for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) { + for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { + uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; int input_index = wm_prog_data->urb_setup[attr]; - if (input_index < 0) - continue; + assert(0 <= input_index); /* _NEW_POINT */ bool point_sprite = false; @@ -2101,13 +2101,7 @@ GEN_GEN == 11 ? \ 0 : \ DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \ - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to \ - * disable prefetching of binding tables in A0 and B0 steppings. \ - * TODO: Revisit this WA on C0 stepping. \ - */ \ pkt.BindingTableEntryCount = \ - GEN_GEN == 11 ? \ - 0 : \ stage_prog_data->binding_table.size_bytes / 4; \ pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \ \ @@ -3877,13 +3871,7 @@ 0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); /* BRW_NEW_FS_PROG_DATA */ - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable - * prefetching of binding tables in A0 and B0 steppings. - * TODO: Revisit this workaround on C0 stepping. - */ - ps.BindingTableEntryCount = GEN_GEN == 11 ? - 0 : - prog_data->base.binding_table.size_bytes / 4; + ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4; if (prog_data->base.use_alt_mode) ps.FloatingPointMode = Alternate; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_batchbuffer.c mesa-20.0.8/src/mesa/drivers/dri/i965/intel_batchbuffer.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_batchbuffer.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_batchbuffer.c 2020-06-12 01:21:18.000000000 +0000 @@ -104,12 +104,13 @@ } static unsigned -decode_get_state_size(void *v_brw, uint32_t offset_from_dsba) +decode_get_state_size(void *v_brw, uint64_t address, uint64_t base_address) { struct brw_context *brw = v_brw; struct intel_batchbuffer *batch = &brw->batch; - unsigned size = (uintptr_t) _mesa_hash_table_u64_search( - batch->state_batch_sizes, offset_from_dsba); + unsigned size = (uintptr_t) + _mesa_hash_table_u64_search(batch->state_batch_sizes, + address - base_address); return size; } @@ -502,11 +503,17 @@ new_bo->refcount = bo->refcount; bo->refcount = 1; + assert(list_is_empty(&bo->exports)); + assert(list_is_empty(&new_bo->exports)); + struct brw_bo tmp; memcpy(&tmp, bo, sizeof(struct brw_bo)); memcpy(bo, new_bo, sizeof(struct brw_bo)); memcpy(new_bo, &tmp, sizeof(struct brw_bo)); + list_inithead(&bo->exports); + list_inithead(&new_bo->exports); + grow->partial_bo = new_bo; /* the one reference of the OLD bo */ grow->partial_bytes = existing_bytes; } @@ -672,8 +679,7 @@ } if (brw->need_flush_throttle) { - __DRIscreen *dri_screen = brw->screen->driScrnPriv; - drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE); + drmCommandNone(brw->screen->fd, DRM_I915_GEM_THROTTLE); brw->need_flush_throttle = false; } } @@ -738,7 +744,6 @@ static int submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd) { - __DRIscreen *dri_screen = brw->screen->driScrnPriv; struct intel_batchbuffer *batch = &brw->batch; int ret = 0; @@ -805,7 +810,7 @@ batch->exec_bos[index] = tmp_bo; } - ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx, + ret = execbuffer(brw->screen->fd, batch, brw->hw_ctx, 4 * USED_BATCH(*batch), in_fence_fd, out_fence_fd, flags); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_extensions.c mesa-20.0.8/src/mesa/drivers/dri/i965/intel_extensions.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_extensions.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_extensions.c 2020-06-12 01:21:18.000000000 +0000 @@ -97,6 +97,7 @@ ctx->Extensions.EXT_blend_func_separate = true; ctx->Extensions.EXT_blend_minmax = true; ctx->Extensions.EXT_draw_buffers2 = true; + ctx->Extensions.EXT_EGL_image_storage = true; ctx->Extensions.EXT_float_blend = true; ctx->Extensions.EXT_framebuffer_sRGB = true; ctx->Extensions.EXT_gpu_program_parameters = true; @@ -111,6 +112,7 @@ ctx->Extensions.EXT_texture_env_dot3 = true; ctx->Extensions.EXT_texture_filter_anisotropic = true; ctx->Extensions.EXT_texture_integer = true; + ctx->Extensions.EXT_texture_norm16 = true; ctx->Extensions.EXT_texture_shared_exponent = true; ctx->Extensions.EXT_texture_snorm = true; ctx->Extensions.EXT_texture_sRGB = true; @@ -143,7 +145,7 @@ ctx->Extensions.OES_texture_half_float_linear = true; if (devinfo->gen >= 8) - ctx->Const.GLSLVersion = 450; + ctx->Const.GLSLVersion = 460; else if (devinfo->is_haswell && can_do_pipelined_register_writes(brw->screen)) ctx->Const.GLSLVersion = 450; else if (devinfo->gen >= 7 && can_do_pipelined_register_writes(brw->screen)) @@ -201,6 +203,7 @@ ctx->Extensions.ARB_texture_gather = true; ctx->Extensions.ARB_texture_multisample = true; ctx->Extensions.ARB_uniform_buffer_object = true; + ctx->Extensions.EXT_gpu_shader4 = true; ctx->Extensions.EXT_texture_shadow_lod = true; if (ctx->API != API_OPENGL_COMPAT || @@ -275,6 +278,9 @@ ctx->Extensions.ARB_indirect_parameters = true; } } + + ctx->Extensions.ARB_gl_spirv = true; + ctx->Extensions.ARB_spirv_extensions = true; } if (devinfo->gen >= 8 || devinfo->is_haswell) { @@ -315,6 +321,11 @@ /* requires ARB_gpu_shader_int64 */ ctx->Extensions.ARB_shader_ballot = true; ctx->Extensions.ARB_ES3_2_compatibility = true; + + /* Currently only implemented in the scalar backend, so only enable for + * Gen8+. Eventually Gen6+ could be supported. + */ + ctx->Extensions.INTEL_shader_integer_functions2 = true; } if (devinfo->gen >= 9) { @@ -369,4 +380,6 @@ ctx->Extensions.EXT_texture_compression_s3tc = true; ctx->Extensions.EXT_texture_compression_s3tc_srgb = true; ctx->Extensions.ANGLE_texture_compression_dxt = true; + + ctx->Extensions.EXT_demote_to_helper_invocation = true; } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_image.h mesa-20.0.8/src/mesa/drivers/dri/i965/intel_image.h --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_image.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_image.h 2020-06-12 01:21:18.000000000 +0000 @@ -89,6 +89,7 @@ GLuint tile_x; GLuint tile_y; bool has_depthstencil; + bool imported_dmabuf; /** Offset of the auxiliary compression surface in the bo. */ uint32_t aux_offset; diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_mipmap_tree.c mesa-20.0.8/src/mesa/drivers/dri/i965/intel_mipmap_tree.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_mipmap_tree.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_mipmap_tree.c 2020-06-12 01:21:18.000000000 +0000 @@ -219,9 +219,9 @@ case MESA_FORMAT_ETC2_SIGNED_R11_EAC: return MESA_FORMAT_R_SNORM16; case MESA_FORMAT_ETC2_RG11_EAC: - return MESA_FORMAT_R16G16_UNORM; + return MESA_FORMAT_RG_UNORM16; case MESA_FORMAT_ETC2_SIGNED_RG11_EAC: - return MESA_FORMAT_R16G16_SNORM; + return MESA_FORMAT_RG_SNORM16; default: /* Non ETC1 / ETC2 format */ return format; @@ -712,7 +712,7 @@ struct intel_mipmap_tree *mt, enum isl_aux_state initial_state) { - struct isl_surf temp_ccs_surf; + struct isl_surf temp_ccs_surf = {0,}; /* CCS is only supported for very simple miptrees */ assert(image->aux_offset != 0 && image->aux_pitch != 0); @@ -727,7 +727,7 @@ /* We shouldn't already have a CCS */ assert(!mt->aux_buf); - if (!isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &temp_ccs_surf, + if (!isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &temp_ccs_surf, NULL, image->aux_pitch)) return false; @@ -1576,7 +1576,7 @@ /* Get the aux buf allocation parameters for this miptree. */ enum isl_aux_state initial_state; uint8_t memset_value; - struct isl_surf aux_surf; + struct isl_surf aux_surf = {0,}; bool aux_surf_ok = false; switch (mt->aux_usage) { @@ -1624,8 +1624,11 @@ initial_state = ISL_AUX_STATE_PASS_THROUGH; memset_value = 0; aux_surf_ok = - isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &aux_surf, 0); + isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &aux_surf, NULL, 0); break; + + default: + unreachable("Invalid aux usage"); } /* We should have a valid aux_surf. */ diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_screen.c mesa-20.0.8/src/mesa/drivers/dri/i965/intel_screen.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_screen.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_screen.c 2020-06-12 01:21:18.000000000 +0000 @@ -98,8 +98,10 @@ DRI_CONF_SECTION_MISCELLANEOUS DRI_CONF_GLSL_ZERO_INIT("false") + DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false") DRI_CONF_ALLOW_RGB10_CONFIGS("false") DRI_CONF_ALLOW_RGB565_CONFIGS("true") + DRI_CONF_ALLOW_FP16_CONFIGS("false") DRI_CONF_SECTION_END DRI_CONF_END }; @@ -189,125 +191,134 @@ }; static const struct intel_image_format intel_image_formats[] = { - { __DRI_IMAGE_FOURCC_ARGB2101010, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ABGR16161616F, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR16161616F, 8 } } }, + + { DRM_FORMAT_XBGR16161616F, __DRI_IMAGE_COMPONENTS_RGB, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR16161616F, 8 } } }, + + { DRM_FORMAT_ARGB2101010, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_XRGB2101010, __DRI_IMAGE_COMPONENTS_RGB, 1, + { DRM_FORMAT_XRGB2101010, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_ABGR2101010, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ABGR2101010, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_XBGR2101010, __DRI_IMAGE_COMPONENTS_RGB, 1, + { DRM_FORMAT_XBGR2101010, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR2101010, 4 } } }, - { __DRI_IMAGE_FOURCC_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, - { __DRI_IMAGE_FOURCC_ABGR8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ABGR8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }, { __DRI_IMAGE_FOURCC_SARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_SARGB8, 4 } } }, - { __DRI_IMAGE_FOURCC_XRGB8888, __DRI_IMAGE_COMPONENTS_RGB, 1, + { __DRI_IMAGE_FOURCC_SXRGB8888, __DRI_IMAGE_COMPONENTS_RGB, 1, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_SXRGB8, 4 } } }, + + { DRM_FORMAT_XRGB8888, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB8888, 4 }, } }, - { __DRI_IMAGE_FOURCC_XBGR8888, __DRI_IMAGE_COMPONENTS_RGB, 1, + { DRM_FORMAT_XBGR8888, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR8888, 4 }, } }, - { __DRI_IMAGE_FOURCC_ARGB1555, __DRI_IMAGE_COMPONENTS_RGBA, 1, + { DRM_FORMAT_ARGB1555, __DRI_IMAGE_COMPONENTS_RGBA, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB1555, 2 } } }, - { __DRI_IMAGE_FOURCC_RGB565, __DRI_IMAGE_COMPONENTS_RGB, 1, + { DRM_FORMAT_RGB565, __DRI_IMAGE_COMPONENTS_RGB, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_RGB565, 2 } } }, - { __DRI_IMAGE_FOURCC_R8, __DRI_IMAGE_COMPONENTS_R, 1, + { DRM_FORMAT_R8, __DRI_IMAGE_COMPONENTS_R, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, } }, - { __DRI_IMAGE_FOURCC_R16, __DRI_IMAGE_COMPONENTS_R, 1, + { DRM_FORMAT_R16, __DRI_IMAGE_COMPONENTS_R, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 1 }, } }, - { __DRI_IMAGE_FOURCC_GR88, __DRI_IMAGE_COMPONENTS_RG, 1, + { DRM_FORMAT_GR88, __DRI_IMAGE_COMPONENTS_RG, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, } }, - { __DRI_IMAGE_FOURCC_GR1616, __DRI_IMAGE_COMPONENTS_RG, 1, + { DRM_FORMAT_GR1616, __DRI_IMAGE_COMPONENTS_RG, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR1616, 2 }, } }, - { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YUV444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YUV444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YVU410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YVU411, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 2, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YVU420, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YVU422, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_YVU444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, + { DRM_FORMAT_YVU444, __DRI_IMAGE_COMPONENTS_Y_U_V, 3, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 2, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 } } }, - { __DRI_IMAGE_FOURCC_NV12, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_NV12, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, - { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, - { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { DRM_FORMAT_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, - { __DRI_IMAGE_FOURCC_AYUV, __DRI_IMAGE_COMPONENTS_AYUV, 1, + { DRM_FORMAT_AYUV, __DRI_IMAGE_COMPONENTS_AYUV, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } }, - { __DRI_IMAGE_FOURCC_XYUV8888, __DRI_IMAGE_COMPONENTS_XYUV, 1, + { DRM_FORMAT_XYUV8888, __DRI_IMAGE_COMPONENTS_XYUV, 1, { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR8888, 4 } } }, /* For YUYV and UYVY buffers, we set up two overlapping DRI images @@ -318,10 +329,10 @@ * V into A. This lets the texture sampler interpolate the Y * components correctly when sampling from plane 0, and interpolate * U and V correctly when sampling from plane 1. */ - { __DRI_IMAGE_FOURCC_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2, + { DRM_FORMAT_YUYV, __DRI_IMAGE_COMPONENTS_Y_XUXV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } }, - { __DRI_IMAGE_FOURCC_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2, + { DRM_FORMAT_UYVY, __DRI_IMAGE_COMPONENTS_Y_UXVX, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, { 0, 1, 0, __DRI_IMAGE_FORMAT_ABGR8888, 4 } } } }; @@ -747,7 +758,9 @@ .samples = 1, .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT | ISL_SURF_USAGE_TEXTURE_BIT | - ISL_SURF_USAGE_STORAGE_BIT, + ISL_SURF_USAGE_STORAGE_BIT | + ((use & __DRI_IMAGE_USE_SCANOUT) ? + ISL_SURF_USAGE_DISPLAY_BIT : 0), .tiling_flags = (1 << mod_info->tiling)); assert(ok); if (!ok) { @@ -755,9 +768,9 @@ return NULL; } - struct isl_surf aux_surf; + struct isl_surf aux_surf = {0,}; if (mod_info->aux_usage == ISL_AUX_USAGE_CCS_E) { - ok = isl_surf_get_ccs_surf(&screen->isl_dev, &surf, &aux_surf, 0); + ok = isl_surf_get_ccs_surf(&screen->isl_dev, &surf, &aux_surf, NULL, 0); if (!ok) { free(image); return NULL; @@ -888,9 +901,16 @@ case __DRI_IMAGE_ATTRIB_STRIDE: *value = image->pitch; return true; - case __DRI_IMAGE_ATTRIB_HANDLE: - *value = brw_bo_export_gem_handle(image->bo); + case __DRI_IMAGE_ATTRIB_HANDLE: { + __DRIscreen *dri_screen = image->screen->driScrnPriv; + uint32_t handle; + if (brw_bo_export_gem_handle_for_device(image->bo, + dri_screen->fd, + &handle)) + return false; + *value = handle; return true; + } case __DRI_IMAGE_ATTRIB_NAME: return !brw_bo_flink(image->bo, (uint32_t *) value); case __DRI_IMAGE_ATTRIB_FORMAT: @@ -967,6 +987,7 @@ return NULL; brw_bo_reference(orig_image->bo); + image->screen = orig_image->screen; image->bo = orig_image->bo; image->internal_format = orig_image->internal_format; image->planar_format = orig_image->planar_format; @@ -1176,8 +1197,8 @@ return NULL; } - struct isl_surf aux_surf; - ok = isl_surf_get_ccs_surf(&screen->isl_dev, &surf, &aux_surf, + struct isl_surf aux_surf = {0,}; + ok = isl_surf_get_ccs_surf(&screen->isl_dev, &surf, &aux_surf, NULL, image->aux_pitch); if (!ok) { brw_bo_unreference(image->bo); @@ -1264,6 +1285,7 @@ image->sample_range = sample_range; image->horizontal_siting = horizontal_siting; image->vertical_siting = vertical_siting; + image->imported_dmabuf = true; *error = __DRI_IMAGE_ERROR_SUCCESS; return image; @@ -1324,12 +1346,13 @@ int num_formats = 0, i; for (i = 0; i < ARRAY_SIZE(intel_image_formats); i++) { - /* These two formats are valid DRI formats but do not exist in - * drm_fourcc.h in the Linux kernel. We don't want to accidentally - * advertise them through the EGL layer. + /* These formats are valid DRI formats but do not exist in drm_fourcc.h + * in the Linux kernel. We don't want to accidentally advertise them + * them through the EGL layer. */ if (intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SARGB8888 || - intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SABGR8888) + intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SABGR8888 || + intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SXRGB8888) continue; if (!intel_image_format_is_supported(&screen->devinfo, @@ -1661,7 +1684,7 @@ gp.param = param; gp.value = value; - if (drmIoctl(screen->driScrnPriv->fd, DRM_IOCTL_I915_GETPARAM, &gp) == -1) { + if (drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp) == -1) { ret = -errno; if (ret != -EINVAL) _mesa_warning(NULL, "drm_i915_getparam: %d", ret); @@ -1693,7 +1716,7 @@ { struct intel_screen *screen = sPriv->driverPrivate; - brw_bufmgr_destroy(screen->bufmgr); + brw_bufmgr_unref(screen->bufmgr); driDestroyOptionInfo(&screen->optionCache); disk_cache_destroy(screen->disk_cache); @@ -1736,7 +1759,11 @@ fb->Visual.samples = num_samples; } - if (mesaVis->redBits == 10 && mesaVis->alphaBits > 0) { + if (mesaVis->redBits == 16 && mesaVis->alphaBits > 0 && mesaVis->floatMode) { + rgbFormat = MESA_FORMAT_RGBA_FLOAT16; + } else if (mesaVis->redBits == 16 && mesaVis->floatMode) { + rgbFormat = MESA_FORMAT_RGBX_FLOAT16; + } else if (mesaVis->redBits == 10 && mesaVis->alphaBits > 0) { rgbFormat = mesaVis->redMask == 0x3ff00000 ? MESA_FORMAT_B10G10R10A2_UNORM : MESA_FORMAT_R10G10B10A2_UNORM; } else if (mesaVis->redBits == 10) { @@ -1745,12 +1772,14 @@ } else if (mesaVis->redBits == 5) { rgbFormat = mesaVis->redMask == 0x1f ? MESA_FORMAT_R5G6B5_UNORM : MESA_FORMAT_B5G6R5_UNORM; + } else if (mesaVis->alphaBits == 0) { + rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8X8_SRGB + : MESA_FORMAT_B8G8R8X8_SRGB; + fb->Visual.sRGBCapable = true; } else if (mesaVis->sRGBCapable) { rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB : MESA_FORMAT_B8G8R8A8_SRGB; - } else if (mesaVis->alphaBits == 0) { - rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8X8_UNORM - : MESA_FORMAT_B8G8R8X8_UNORM; + fb->Visual.sRGBCapable = true; } else { rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB : MESA_FORMAT_B8G8R8A8_SRGB; @@ -1895,12 +1924,23 @@ if (getenv("INTEL_NO_HW") != NULL) screen->no_hw = true; - screen->bufmgr = brw_bufmgr_init(&screen->devinfo, dri_screen->fd); + bool bo_reuse = false; + int bo_reuse_mode = driQueryOptioni(&screen->optionCache, "bo_reuse"); + switch (bo_reuse_mode) { + case DRI_CONF_BO_REUSE_DISABLED: + break; + case DRI_CONF_BO_REUSE_ALL: + bo_reuse = true; + break; + } + + screen->bufmgr = brw_bufmgr_get_for_fd(&screen->devinfo, dri_screen->fd, bo_reuse); if (screen->bufmgr == NULL) { fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n", __func__, __LINE__); return false; } + screen->fd = brw_bufmgr_get_fd(screen->bufmgr); if (!intel_get_boolean(screen, I915_PARAM_HAS_EXEC_NO_RELOC)) { fprintf(stderr, "[%s: %u] Kernel 3.9 required.\n", __func__, __LINE__); @@ -2067,8 +2107,7 @@ /* Don't bother with error checking - if the execbuf fails, the * value won't be written and we'll just report that there's no access. */ - __DRIscreen *dri_screen = screen->driScrnPriv; - drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf); + drmIoctl(screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf); /* Check whether the value got written. */ void *results_map = brw_bo_map(NULL, results, MAP_READ); @@ -2150,6 +2189,45 @@ return 0; } +static bool +intel_allowed_format(__DRIscreen *dri_screen, mesa_format format) +{ + struct intel_screen *screen = dri_screen->driverPrivate; + + /* Expose only BGRA ordering if the loader doesn't support RGBA ordering. */ + bool allow_rgba_ordering = intel_loader_get_cap(dri_screen, DRI_LOADER_CAP_RGBA_ORDERING); + if (!allow_rgba_ordering && + (format == MESA_FORMAT_R8G8B8A8_UNORM || + format == MESA_FORMAT_R8G8B8X8_UNORM || + format == MESA_FORMAT_R8G8B8A8_SRGB)) + return false; + + /* Shall we expose 10 bpc formats? */ + bool allow_rgb10_configs = driQueryOptionb(&screen->optionCache, + "allow_rgb10_configs"); + if (!allow_rgb10_configs && + (format == MESA_FORMAT_B10G10R10A2_UNORM || + format == MESA_FORMAT_B10G10R10X2_UNORM)) + return false; + + /* Shall we expose 565 formats? */ + bool allow_rgb565_configs = driQueryOptionb(&screen->optionCache, + "allow_rgb565_configs"); + if (!allow_rgb565_configs && format == MESA_FORMAT_B5G6R5_UNORM) + return false; + + /* Shall we expose fp16 formats? */ + bool allow_fp16_configs = driQueryOptionb(&screen->optionCache, + "allow_fp16_configs"); + allow_fp16_configs &= intel_loader_get_cap(dri_screen, DRI_LOADER_CAP_FP16); + if (!allow_fp16_configs && + (format == MESA_FORMAT_RGBA_FLOAT16 || + format == MESA_FORMAT_RGBX_FLOAT16)) + return false; + + return true; +} + static __DRIconfig** intel_screen_make_configs(__DRIscreen *dri_screen) { @@ -2159,11 +2237,15 @@ MESA_FORMAT_B8G8R8X8_UNORM, MESA_FORMAT_B8G8R8A8_SRGB, + MESA_FORMAT_B8G8R8X8_SRGB, /* For 10 bpc, 30 bit depth framebuffers. */ MESA_FORMAT_B10G10R10A2_UNORM, MESA_FORMAT_B10G10R10X2_UNORM, + MESA_FORMAT_RGBA_FLOAT16, + MESA_FORMAT_RGBX_FLOAT16, + /* The 32-bit RGBA format must not precede the 32-bit BGRA format. * Likewise for RGBX and BGRX. Otherwise, the GLX client and the GLX * server may disagree on which format the GLXFBConfig represents, @@ -2200,33 +2282,16 @@ uint8_t depth_bits[4], stencil_bits[4]; __DRIconfig **configs = NULL; - /* Expose only BGRA ordering if the loader doesn't support RGBA ordering. */ - unsigned num_formats; - if (intel_loader_get_cap(dri_screen, DRI_LOADER_CAP_RGBA_ORDERING)) - num_formats = ARRAY_SIZE(formats); - else - num_formats = ARRAY_SIZE(formats) - 3; /* all - RGBA_ORDERING formats */ - - /* Shall we expose 10 bpc formats? */ - bool allow_rgb10_configs = driQueryOptionb(&screen->optionCache, - "allow_rgb10_configs"); - /* Shall we expose 565 formats? */ - bool allow_rgb565_configs = driQueryOptionb(&screen->optionCache, - "allow_rgb565_configs"); + unsigned num_formats = ARRAY_SIZE(formats); /* Generate singlesample configs, each without accumulation buffer * and with EGL_MUTABLE_RENDER_BUFFER_BIT_KHR. */ for (unsigned i = 0; i < num_formats; i++) { __DRIconfig **new_configs; - int num_depth_stencil_bits = 2; - - if (!allow_rgb10_configs && - (formats[i] == MESA_FORMAT_B10G10R10A2_UNORM || - formats[i] == MESA_FORMAT_B10G10R10X2_UNORM)) - continue; + int num_depth_stencil_bits = 1; - if (!allow_rgb565_configs && formats[i] == MESA_FORMAT_B5G6R5_UNORM) + if (!intel_allowed_format(dri_screen, formats[i])) continue; /* Starting with DRI2 protocol version 1.1 we can request a depth/stencil @@ -2237,16 +2302,20 @@ stencil_bits[0] = 0; if (formats[i] == MESA_FORMAT_B5G6R5_UNORM) { - depth_bits[1] = 16; - stencil_bits[1] = 0; + if (devinfo->gen >= 8) { + depth_bits[num_depth_stencil_bits] = 16; + stencil_bits[num_depth_stencil_bits] = 0; + num_depth_stencil_bits++; + } if (devinfo->gen >= 6) { - depth_bits[2] = 24; - stencil_bits[2] = 8; - num_depth_stencil_bits = 3; + depth_bits[num_depth_stencil_bits] = 24; + stencil_bits[num_depth_stencil_bits] = 8; + num_depth_stencil_bits++; } } else { - depth_bits[1] = 24; - stencil_bits[1] = 8; + depth_bits[num_depth_stencil_bits] = 24; + stencil_bits[num_depth_stencil_bits] = 8; + num_depth_stencil_bits++; } new_configs = driCreateConfigs(formats[i], @@ -2266,17 +2335,20 @@ for (unsigned i = 0; i < num_formats; i++) { __DRIconfig **new_configs; - if (!allow_rgb10_configs && - (formats[i] == MESA_FORMAT_B10G10R10A2_UNORM || - formats[i] == MESA_FORMAT_B10G10R10X2_UNORM)) - continue; - - if (!allow_rgb565_configs && formats[i] == MESA_FORMAT_B5G6R5_UNORM) + if (!intel_allowed_format(dri_screen, formats[i])) continue; if (formats[i] == MESA_FORMAT_B5G6R5_UNORM) { - depth_bits[0] = 16; - stencil_bits[0] = 0; + if (devinfo->gen >= 8) { + depth_bits[0] = 16; + stencil_bits[0] = 0; + } else if (devinfo->gen >= 6) { + depth_bits[0] = 24; + stencil_bits[0] = 8; + } else { + depth_bits[0] = 0; + stencil_bits[0] = 0; + } } else { depth_bits[0] = 24; stencil_bits[0] = 8; @@ -2307,12 +2379,7 @@ if (devinfo->gen < 6) break; - if (!allow_rgb10_configs && - (formats[i] == MESA_FORMAT_B10G10R10A2_UNORM || - formats[i] == MESA_FORMAT_B10G10R10X2_UNORM)) - continue; - - if (!allow_rgb565_configs && formats[i] == MESA_FORMAT_B5G6R5_UNORM) + if (!intel_allowed_format(dri_screen, formats[i])) continue; __DRIconfig **new_configs; @@ -2323,7 +2390,7 @@ depth_bits[0] = 0; stencil_bits[0] = 0; - if (formats[i] == MESA_FORMAT_B5G6R5_UNORM) { + if (formats[i] == MESA_FORMAT_B5G6R5_UNORM && devinfo->gen >= 8) { depth_bits[1] = 16; stencil_bits[1] = 0; } else { @@ -2380,7 +2447,7 @@ case 10: case 9: case 8: - dri_screen->max_gl_core_version = 45; + dri_screen->max_gl_core_version = 46; dri_screen->max_gl_compat_version = 30; dri_screen->max_gl_es1_version = 11; dri_screen->max_gl_es2_version = has_astc ? 32 : 31; @@ -2501,6 +2568,11 @@ screen->deviceID = devinfo->chipset_id; screen->no_hw = devinfo->no_hw; + if (devinfo->gen >= 12) { + fprintf(stderr, "gen12 and newer are not supported on i965\n"); + return NULL; + } + if (!intel_init_bufmgr(screen)) return NULL; @@ -2546,7 +2618,7 @@ screen->max_gtt_map_object_size = gtt_size / 4; } - screen->aperture_threshold = get_aperture_size(dri_screen->fd) * 3 / 4; + screen->aperture_threshold = get_aperture_size(screen->fd) * 3 / 4; screen->hw_has_swizzling = intel_detect_swizzling(screen); screen->hw_has_timestamp = intel_detect_timestamp(screen); @@ -2735,7 +2807,7 @@ struct drm_i915_reset_stats stats; memset(&stats, 0, sizeof(stats)); - const int ret = drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); + const int ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); screen->has_context_reset_notification = (ret != -1 || errno != EINVAL); @@ -2755,7 +2827,10 @@ screen->compiler->constant_buffer_0_is_relative = devinfo->gen < 8 || !(screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION); + screen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].PositionAlwaysInvariant = driQueryOptionb(&screen->optionCache, "vs_position_always_invariant"); + screen->compiler->supports_pull_constants = true; + screen->compiler->compact_params = true; screen->has_exec_fence = intel_get_boolean(screen, I915_PARAM_HAS_EXEC_FENCE); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_screen.h mesa-20.0.8/src/mesa/drivers/dri/i965/intel_screen.h --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_screen.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_screen.h 2020-06-12 01:21:18.000000000 +0000 @@ -56,6 +56,9 @@ /** Bytes of aperture usage beyond which execbuf is likely to fail. */ uint64_t aperture_threshold; + /** DRM fd associated with this screen. Not owned by this object. Do not close. */ + int fd; + bool no_hw; bool hw_has_swizzling; bool has_exec_fence; /**< I915_PARAM_HAS_EXEC_FENCE */ diff -Nru mesa-19.2.8/src/mesa/drivers/dri/i965/intel_tex_image.c mesa-20.0.8/src/mesa/drivers/dri/i965/intel_tex_image.c --- mesa-19.2.8/src/mesa/drivers/dri/i965/intel_tex_image.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/i965/intel_tex_image.c 2020-06-12 01:21:18.000000000 +0000 @@ -325,7 +325,8 @@ if (pixels == NULL && !_mesa_is_bufferobj(packing->BufferObj)) return; - bool tex_busy = mt && brw_bo_busy(mt->bo); + bool tex_busy = mt && + (brw_batch_references(&brw->batch, mt->bo) || brw_bo_busy(mt->bo)); if (_mesa_is_bufferobj(packing->BufferObj) || tex_busy || mt->aux_usage == ISL_AUX_USAGE_CCS_E) { @@ -602,10 +603,11 @@ } static void -intel_image_target_texture_2d(struct gl_context *ctx, GLenum target, - struct gl_texture_object *texObj, - struct gl_texture_image *texImage, - GLeglImageOES image_handle) +intel_image_target_texture(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj, + struct gl_texture_image *texImage, + GLeglImageOES image_handle, + bool storage) { struct brw_context *brw = brw_context(ctx); struct intel_mipmap_tree *mt; @@ -637,17 +639,59 @@ image->internal_format != 0 ? image->internal_format : _mesa_get_format_base_format(mt->format); - /* Setup a sized internal format for MESA_FORMAT_R10G10B10[X2|A2]_UNORM. */ + /* Fix the internal format when _mesa_get_format_base_format(mt->format) + * isn't a valid one for that particular format. + */ if (brw->mesa_format_supports_render[image->format]) { if (image->format == MESA_FORMAT_R10G10B10A2_UNORM || - image->format == MESA_FORMAT_R10G10B10X2_UNORM) + image->format == MESA_FORMAT_R10G10B10X2_UNORM || + image->format == MESA_FORMAT_B10G10R10A2_UNORM || + image->format == MESA_FORMAT_B10G10R10X2_UNORM) internal_format = GL_RGB10_A2; } + /* Guess sized internal format for dma-bufs, as specified by + * EXT_EGL_image_storage. + */ + if (storage && target == GL_TEXTURE_2D && image->imported_dmabuf) { + internal_format = driGLFormatToSizedInternalGLFormat(image->format); + if (internal_format == GL_NONE) { + _mesa_error(ctx, GL_INVALID_OPERATION, __func__); + return; + } + } + intel_set_texture_image_mt(brw, texImage, internal_format, mt->format, mt); intel_miptree_release(&mt); } +static void +intel_image_target_texture_2d(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj, + struct gl_texture_image *texImage, + GLeglImageOES image_handle) +{ + intel_image_target_texture(ctx, target, texObj, texImage, image_handle, + false); +} + +static void +intel_image_target_tex_storage(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj, + struct gl_texture_image *texImage, + GLeglImageOES image_handle) +{ + struct intel_texture_object *intel_texobj = intel_texture_object(texObj); + intel_image_target_texture(ctx, target, texObj, texImage, image_handle, + true); + + /* The miptree is in a validated state, so no need to check later. */ + intel_texobj->needs_validate = false; + intel_texobj->validated_first_level = 0; + intel_texobj->validated_last_level = 0; + intel_texobj->_Format = texImage->TexFormat; +} + static bool intel_gettexsubimage_blorp(struct brw_context *brw, struct gl_texture_image *tex_image, @@ -940,6 +984,7 @@ functions->TexSubImage = intelTexSubImage; functions->CompressedTexSubImage = intelCompressedTexSubImage; functions->EGLImageTargetTexture2D = intel_image_target_texture_2d; + functions->EGLImageTargetTexStorage = intel_image_target_tex_storage; functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image; functions->GetTexSubImage = intel_get_tex_sub_image; } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/meson.build mesa-20.0.8/src/mesa/drivers/dri/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -74,8 +74,7 @@ ) meson.add_install_script( - prog_python.path(), - join_paths(meson.source_root(), 'bin/install_megadrivers.py'), + install_megadrivers_py.path(), libmesa_dri_drivers.full_path(), dri_drivers_path, _dri_link, @@ -87,7 +86,7 @@ if with_dri dri_req_private = [] if dep_libdrm.found() - dri_req_private = ['libdrm >= ' + dep_libdrm.version()] + dri_req_private = ['libdrm >= ' + _drm_ver] endif pkg.generate( diff -Nru mesa-19.2.8/src/mesa/drivers/dri/nouveau/meson.build mesa-20.0.8/src/mesa/drivers/dri/nouveau/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/nouveau/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/nouveau/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -81,9 +81,9 @@ libnouveau_vieux = static_library( 'nouveau_vieux', - [files_nouveau_vieux, xmlpool_options_h], + files_nouveau_vieux, include_directories : [inc_common, inc_dri_common, inc_util], c_args : [c_vis_args], cpp_args : [cpp_vis_args], - dependencies : [dep_libdrm, dep_libdrm_nouveau], + dependencies : [dep_libdrm, dep_libdrm_nouveau, idep_xmlconfig_headers], ) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/nouveau/nouveau_context.c mesa-20.0.8/src/mesa/drivers/dri/nouveau/nouveau_context.c --- mesa-19.2.8/src/mesa/drivers/dri/nouveau/nouveau_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/nouveau/nouveau_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -217,7 +217,7 @@ nouveau_object_del(&nctx->hw.chan); nouveau_scratch_destroy(ctx); - _mesa_free_context_data(ctx, true); + _mesa_free_context_data(ctx); } void @@ -250,11 +250,11 @@ attachments[i++] = __DRI_BUFFER_FRONT_LEFT; if (fb->Visual.doubleBufferMode) attachments[i++] = __DRI_BUFFER_BACK_LEFT; - if (fb->Visual.haveDepthBuffer && fb->Visual.haveStencilBuffer) + if (fb->Visual.depthBits > 0 && fb->Visual.stencilBits > 0) attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL; - else if (fb->Visual.haveDepthBuffer) + else if (fb->Visual.depthBits > 0) attachments[i++] = __DRI_BUFFER_DEPTH; - else if (fb->Visual.haveStencilBuffer) + else if (fb->Visual.stencilBits > 0) attachments[i++] = __DRI_BUFFER_STENCIL; buffers = screen->dri2.loader->getBuffers(draw, &draw->w, &draw->h, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/nouveau/nv04_surface.c mesa-20.0.8/src/mesa/drivers/dri/nouveau/nv04_surface.c --- mesa-19.2.8/src/mesa/drivers/dri/nouveau/nv04_surface.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/nouveau/nv04_surface.c 2020-06-12 01:21:18.000000000 +0000 @@ -50,8 +50,7 @@ case MESA_FORMAT_B5G5R5A1_UNORM: case MESA_FORMAT_A1B5G5R5_UNORM: case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: + case MESA_FORMAT_LA_UNORM8: case MESA_FORMAT_YCBCR: case MESA_FORMAT_YCBCR_REV: case MESA_FORMAT_Z_UNORM16: @@ -89,8 +88,7 @@ case MESA_FORMAT_B5G5R5A1_UNORM: case MESA_FORMAT_A1B5G5R5_UNORM: case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: + case MESA_FORMAT_LA_UNORM8: case MESA_FORMAT_YCBCR: case MESA_FORMAT_YCBCR_REV: case MESA_FORMAT_Z_UNORM16: @@ -128,8 +126,7 @@ case MESA_FORMAT_B5G5R5A1_UNORM: case MESA_FORMAT_A1B5G5R5_UNORM: case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: + case MESA_FORMAT_LA_UNORM8: case MESA_FORMAT_YCBCR: case MESA_FORMAT_YCBCR_REV: case MESA_FORMAT_Z_UNORM16: @@ -167,8 +164,7 @@ case MESA_FORMAT_B5G5R5A1_UNORM: case MESA_FORMAT_A1B5G5R5_UNORM: case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: + case MESA_FORMAT_LA_UNORM8: case MESA_FORMAT_YCBCR: case MESA_FORMAT_YCBCR_REV: case MESA_FORMAT_Z_UNORM16: diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/meson.build mesa-20.0.8/src/mesa/drivers/dri/r200/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/r200/meson.build 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -78,11 +78,11 @@ libr200 = static_library( 'r200', - [files_r200, xmlpool_options_h], + files_r200, include_directories : [ inc_common, inc_dri_common, inc_util, include_directories('server'), ], c_args : [c_vis_args, '-DRADEON_R200'], cpp_args : [cpp_vis_args], - dependencies : [dep_libdrm, dep_libdrm_radeon], + dependencies : [dep_libdrm, dep_libdrm_radeon, idep_xmlconfig_headers], ) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_blit.c mesa-20.0.8/src/mesa/drivers/dri/r200/r200_blit.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_blit.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_blit.c 2020-06-12 01:21:18.000000000 +0000 @@ -42,41 +42,29 @@ unsigned r200_check_blit(mesa_format mesa_format, uint32_t dst_pitch) { /* XXX others? */ - if (_mesa_little_endian()) { - switch (mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - case MESA_FORMAT_B8G8R8X8_UNORM: - case MESA_FORMAT_B5G6R5_UNORM: - case MESA_FORMAT_B4G4R4A4_UNORM: - case MESA_FORMAT_B5G5R5A1_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: - /* swizzled - probably can't happen with the disabled Choose8888TexFormat code */ - case MESA_FORMAT_A8B8G8R8_UNORM: - case MESA_FORMAT_R8G8B8A8_UNORM: - break; - default: - return 0; - } - } - else { - switch (mesa_format) { - case MESA_FORMAT_A8R8G8B8_UNORM: - case MESA_FORMAT_X8R8G8B8_UNORM: - case MESA_FORMAT_R5G6B5_UNORM: - case MESA_FORMAT_A4R4G4B4_UNORM: - case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: - /* swizzled - probably can't happen with the disabled Choose8888TexFormat code */ - case MESA_FORMAT_R8G8B8A8_UNORM: - case MESA_FORMAT_A8B8G8R8_UNORM: - break; - default: - return 0; - } + switch (mesa_format) { +#if UTIL_ARCH_LITTLE_ENDIAN + case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_B5G5R5A1_UNORM: +#else + case MESA_FORMAT_A8R8G8B8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: +#endif + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: + /* swizzled - probably can't happen with the disabled Choose8888TexFormat code */ + case MESA_FORMAT_A8B8G8R8_UNORM: + case MESA_FORMAT_R8G8B8A8_UNORM: + break; + default: + return 0; } /* Rendering to small buffer doesn't work. @@ -133,12 +121,11 @@ assert(height <= 2048); assert(offset % 32 == 0); - if (_mesa_little_endian()) { - txformat |= tx_table_le[src_mesa_format].format; - } - else { - txformat |= tx_table_be[src_mesa_format].format; - } +#if UTIL_ARCH_LITTLE_ENDIAN + txformat |= tx_table_le[src_mesa_format].format; +#else + txformat |= tx_table_be[src_mesa_format].format; +#endif if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE) offset |= R200_TXO_MACRO_TILE; @@ -183,8 +170,11 @@ break; case MESA_FORMAT_A8B8G8R8_UNORM: case MESA_FORMAT_R8G8B8A8_UNORM: - if ((dst_mesa_format == MESA_FORMAT_A8B8G8R8_UNORM && _mesa_little_endian()) || - (dst_mesa_format == MESA_FORMAT_R8G8B8A8_UNORM && !_mesa_little_endian())) { +#if UTIL_ARCH_LITTLE_ENDIAN + if (dst_mesa_format == MESA_FORMAT_A8B8G8R8_UNORM) { +#else + if (dst_mesa_format == MESA_FORMAT_R8G8B8A8_UNORM) { +#endif BEGIN_BATCH(10); OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE)); @@ -302,7 +292,7 @@ OUT_BATCH_REGVAL(R200_PP_TXPITCH_0, pitch * _mesa_get_format_bytes(src_mesa_format) - 32); OUT_BATCH_REGSEQ(R200_PP_TXOFFSET_0, 1); - OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); + OUT_BATCH_RELOC(bo, offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); END_BATCH(); } @@ -367,9 +357,9 @@ OUT_BATCH_REGVAL(RADEON_RB3D_CNTL, dst_format); OUT_BATCH_REGSEQ(RADEON_RB3D_COLOROFFSET, 1); - OUT_BATCH_RELOC(offset, bo, offset, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(bo, offset, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH_REGSEQ(RADEON_RB3D_COLORPITCH, 1); - OUT_BATCH_RELOC(dst_pitch, bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); END_BATCH(); } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_cmdbuf.c mesa-20.0.8/src/mesa/drivers/dri/r200/r200_cmdbuf.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_cmdbuf.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_cmdbuf.c 2020-06-12 01:21:18.000000000 +0000 @@ -232,7 +232,7 @@ OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, 2); OUT_BATCH(1); OUT_BATCH(vertex_size | (vertex_size << 8)); - OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0); + OUT_BATCH_RELOC(bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0); END_BATCH(); } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_context.c mesa-20.0.8/src/mesa/drivers/dri/r200/r200_context.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_context.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -149,7 +149,7 @@ BEGIN_BATCH(4); OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZPASS_ADDR, 0)); - OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0); + OUT_BATCH_RELOC(query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0); END_BATCH(); query->curr_offset += sizeof(uint32_t); assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_state_init.c mesa-20.0.8/src/mesa/drivers/dri/r200/r200_state_init.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_state_init.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_state_init.c 2020-06-12 01:21:18.000000000 +0000 @@ -496,7 +496,7 @@ if (drb) { OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0)); - OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0)); OUT_BATCH(zbpitch); @@ -511,10 +511,10 @@ if (rrb) { OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0)); - OUT_BATCH_RELOC(rrb->draw_offset, rrb->bo, rrb->draw_offset, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(rrb->bo, rrb->draw_offset, 0, RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0)); - OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0); } if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) { @@ -581,11 +581,11 @@ if (dwords > atom->cmd_size) { OUT_BATCH(CP_PACKET0(R200_PP_TXOFFSET_0 + (24 * i), 0)); if (t->mt && !t->image_override) { - OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0, + OUT_BATCH_RELOC(t->mt->bo, t->tile_bits, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } else { if (t->bo) - OUT_BATCH_RELOC(t->tile_bits, t->bo, 0, + OUT_BATCH_RELOC(t->bo, t->tile_bits, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } } @@ -610,7 +610,7 @@ lvl = &t->mt->levels[0]; for (j = 1; j <= 5; j++) { OUT_BATCH(CP_PACKET0(R200_PP_CUBIC_OFFSET_F1_0 + (24*i) + (4 * (j-1)), 0)); - OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset, + OUT_BATCH_RELOC(t->mt->bo, lvl->faces[j].offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_tex.h mesa-20.0.8/src/mesa/drivers/dri/r200/r200_tex.h --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_tex.h 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_tex.h 2020-06-12 01:21:18.000000000 +0000 @@ -73,10 +73,9 @@ [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 }, + [ MESA_FORMAT_LA_UNORM8 ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB }, [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB }, @@ -103,10 +102,9 @@ [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 }, + [ MESA_FORMAT_LA_UNORM8 ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB }, [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB }, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/r200_texstate.c mesa-20.0.8/src/mesa/drivers/dri/r200/r200_texstate.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/r200_texstate.c 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/r200_texstate.c 2020-06-12 01:21:18.000000000 +0000 @@ -691,29 +691,23 @@ pitch_val = rb->pitch; switch (rb->cpp) { case 4: - if (texture_format == __DRI_TEXTURE_FORMAT_RGB) { + if (texture_format == __DRI_TEXTURE_FORMAT_RGB) texFormat = MESA_FORMAT_BGR_UNORM8; - t->pp_txformat = tx_table_le[MESA_FORMAT_BGR_UNORM8].format; - } - else { + else texFormat = MESA_FORMAT_B8G8R8A8_UNORM; - t->pp_txformat = tx_table_le[MESA_FORMAT_B8G8R8A8_UNORM].format; - } - t->pp_txfilter |= tx_table_le[MESA_FORMAT_B8G8R8A8_UNORM].filter; break; case 3: default: texFormat = MESA_FORMAT_BGR_UNORM8; - t->pp_txformat = tx_table_le[MESA_FORMAT_BGR_UNORM8].format; - t->pp_txfilter |= tx_table_le[MESA_FORMAT_BGR_UNORM8].filter; break; case 2: texFormat = MESA_FORMAT_B5G6R5_UNORM; - t->pp_txformat = tx_table_le[MESA_FORMAT_B5G6R5_UNORM].format; - t->pp_txfilter |= tx_table_le[MESA_FORMAT_B5G6R5_UNORM].filter; break; } + t->pp_txformat = tx_table_le[texFormat].format; + t->pp_txfilter |= tx_table_le[texFormat].filter; + _mesa_init_teximage_fields(&radeon->glCtx, texImage, rb->base.Base.Width, rb->base.Base.Height, 1, 0, @@ -1314,8 +1308,11 @@ if (!t->image_override) { if (VALID_FORMAT(firstImage->TexFormat)) { - const struct tx_table *table = _mesa_little_endian() ? tx_table_le : - tx_table_be; +#if UTIL_ARCH_LITTLE_ENDIAN + const struct tx_table *table = tx_table_le; +#else + const struct tx_table *table = tx_table_be; +#endif t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK | R200_TXFORMAT_ALPHA_IN_MAP); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_cmdbuf.h mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_cmdbuf.h --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_cmdbuf.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_cmdbuf.h 2020-06-12 01:21:18.000000000 +0000 @@ -44,7 +44,7 @@ /** * Write a relocated dword to the command buffer. */ -#define OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags) \ +#define OUT_BATCH_RELOC(bo, offset, rd, wd, flags) \ do { \ int __offset = (offset); \ if (0 && __offset) { \ diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_common_context.c mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_common_context.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_common_context.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_common_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -270,7 +270,7 @@ /* free atom list */ /* free the Mesa context data */ - _mesa_free_context_data(&radeon->glCtx, true); + _mesa_free_context_data(&radeon->glCtx); /* free the option cache */ driDestroyOptionCache(&radeon->optionCache); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_debug.c mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_debug.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_debug.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_debug.c 2020-06-12 01:21:18.000000000 +0000 @@ -57,13 +57,17 @@ {NULL, 0} }; -radeon_debug_type_t radeon_enabled_debug_types; +#if defined(RADEON_R200) +radeon_debug_type_t r200_enabled_debug_types; +#elif defined(RADEON_R100) +radeon_debug_type_t r100_enabled_debug_types; +#endif void radeon_init_debug(void) { - radeon_enabled_debug_types = parse_debug_string(getenv("RADEON_DEBUG"), debug_control); + RADEON_DEBUG = parse_debug_string(getenv("RADEON_DEBUG"), debug_control); - radeon_enabled_debug_types |= RADEON_GENERAL; + RADEON_DEBUG |= RADEON_GENERAL; } void _radeon_debug_add_indent(void) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_debug.h mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_debug.h --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_debug.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_debug.h 2020-06-12 01:21:18.000000000 +0000 @@ -81,18 +81,24 @@ char indent[RADEON_MAX_INDENT]; }; -extern radeon_debug_type_t radeon_enabled_debug_types; - /** * Compabibility layer for old debug code **/ -#define RADEON_DEBUG radeon_enabled_debug_types +#if defined(RADEON_R200) +extern radeon_debug_type_t r200_enabled_debug_types; +#define RADEON_DEBUG r200_enabled_debug_types +#elif defined(RADEON_R100) +extern radeon_debug_type_t r100_enabled_debug_types; +#define RADEON_DEBUG r100_enabled_debug_types +#else +#error "Neither RADEON_R100 nor RADEON_R200 are defined." +#endif static inline int radeon_is_debug_enabled(const radeon_debug_type_t type, const radeon_debug_level_t level) { return RADEON_DEBUG_LEVEL >= level - && (type & radeon_enabled_debug_types); + && (type & RADEON_DEBUG); } extern void _radeon_print(const radeon_debug_type_t type, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_screen.c mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_screen.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_screen.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_screen.c 2020-06-12 01:21:18.000000000 +0000 @@ -696,11 +696,26 @@ _mesa_initialize_window_framebuffer(&rfb->base, mesaVis); if (mesaVis->redBits == 5) - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B5G6R5_UNORM : MESA_FORMAT_R5G6B5_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B5G6R5_UNORM; +#else + MESA_FORMAT_R5G6B5_UNORM; +#endif else if (mesaVis->alphaBits == 0) - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B8G8R8X8_UNORM : MESA_FORMAT_X8R8G8B8_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B8G8R8X8_UNORM; +#else + MESA_FORMAT_X8R8G8B8_UNORM; +#endif else - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B8G8R8A8_UNORM : MESA_FORMAT_A8R8G8B8_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B8G8R8A8_UNORM; +#else + MESA_FORMAT_A8R8G8B8_UNORM; +#endif /* front color renderbuffer */ rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_texture.c mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_texture.c --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_texture.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_texture.c 2020-06-12 01:21:18.000000000 +0000 @@ -359,7 +359,7 @@ #if defined(RADEON_R200) /* r200: can't use a8 format since interpreting hw I8 as a8 would result in wrong rgb values (same as alpha value instead of 0). */ - return _radeon_texformat_al88; + return MESA_FORMAT_LA_UNORM8; #else return MESA_FORMAT_A_UNORM8; #endif @@ -381,7 +381,7 @@ case GL_LUMINANCE12_ALPHA12: case GL_LUMINANCE16_ALPHA16: case GL_COMPRESSED_LUMINANCE_ALPHA: - return _radeon_texformat_al88; + return MESA_FORMAT_LA_UNORM8; case GL_INTENSITY: case GL_INTENSITY4: @@ -464,7 +464,7 @@ case GL_SLUMINANCE_ALPHA: case GL_SLUMINANCE8_ALPHA8: case GL_COMPRESSED_SLUMINANCE_ALPHA: - return MESA_FORMAT_L8A8_SRGB; + return MESA_FORMAT_LA_SRGB8; case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT: return MESA_FORMAT_SRGB_DXT1; @@ -596,29 +596,25 @@ mesa_format _radeon_texformat_rgb565 = MESA_FORMAT_NONE; mesa_format _radeon_texformat_argb4444 = MESA_FORMAT_NONE; mesa_format _radeon_texformat_argb1555 = MESA_FORMAT_NONE; -mesa_format _radeon_texformat_al88 = MESA_FORMAT_NONE; /*@}*/ static void radeonInitTextureFormats(void) { - if (_mesa_little_endian()) { - _radeon_texformat_rgba8888 = MESA_FORMAT_A8B8G8R8_UNORM; - _radeon_texformat_argb8888 = MESA_FORMAT_B8G8R8A8_UNORM; - _radeon_texformat_rgb565 = MESA_FORMAT_B5G6R5_UNORM; - _radeon_texformat_argb4444 = MESA_FORMAT_B4G4R4A4_UNORM; - _radeon_texformat_argb1555 = MESA_FORMAT_B5G5R5A1_UNORM; - _radeon_texformat_al88 = MESA_FORMAT_L8A8_UNORM; - } - else { - _radeon_texformat_rgba8888 = MESA_FORMAT_R8G8B8A8_UNORM; - _radeon_texformat_argb8888 = MESA_FORMAT_A8R8G8B8_UNORM; - _radeon_texformat_rgb565 = MESA_FORMAT_R5G6B5_UNORM; - _radeon_texformat_argb4444 = MESA_FORMAT_A4R4G4B4_UNORM; - _radeon_texformat_argb1555 = MESA_FORMAT_A1R5G5B5_UNORM; - _radeon_texformat_al88 = MESA_FORMAT_A8L8_UNORM; - } +#if UTIL_ARCH_LITTLE_ENDIAN + _radeon_texformat_rgba8888 = MESA_FORMAT_A8B8G8R8_UNORM; + _radeon_texformat_argb8888 = MESA_FORMAT_B8G8R8A8_UNORM; + _radeon_texformat_rgb565 = MESA_FORMAT_B5G6R5_UNORM; + _radeon_texformat_argb4444 = MESA_FORMAT_B4G4R4A4_UNORM; + _radeon_texformat_argb1555 = MESA_FORMAT_B5G5R5A1_UNORM; +#else + _radeon_texformat_rgba8888 = MESA_FORMAT_R8G8B8A8_UNORM; + _radeon_texformat_argb8888 = MESA_FORMAT_A8R8G8B8_UNORM; + _radeon_texformat_rgb565 = MESA_FORMAT_R5G6B5_UNORM; + _radeon_texformat_argb4444 = MESA_FORMAT_A4R4G4B4_UNORM; + _radeon_texformat_argb1555 = MESA_FORMAT_A1R5G5B5_UNORM; +#endif } void diff -Nru mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_texture.h mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_texture.h --- mesa-19.2.8/src/mesa/drivers/dri/r200/radeon_texture.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/r200/radeon_texture.h 2020-06-12 01:21:18.000000000 +0000 @@ -38,7 +38,6 @@ extern mesa_format _radeon_texformat_rgb565; extern mesa_format _radeon_texformat_argb4444; extern mesa_format _radeon_texformat_argb1555; -extern mesa_format _radeon_texformat_al88; extern void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/meson.build mesa-20.0.8/src/mesa/drivers/dri/radeon/meson.build --- mesa-19.2.8/src/mesa/drivers/dri/radeon/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -74,11 +74,11 @@ libr100 = static_library( 'r100', - [files_r100, xmlpool_options_h], + files_r100, include_directories : [ inc_common, inc_dri_common, inc_util, include_directories('server'), ], c_args : [c_vis_args, '-DRADEON_R100'], cpp_args : [cpp_vis_args], - dependencies : [dep_libdrm, dep_libdrm_radeon], + dependencies : [dep_libdrm, dep_libdrm_radeon, idep_xmlconfig_headers], ) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_blit.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_blit.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_blit.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_blit.c 2020-06-12 01:21:18.000000000 +0000 @@ -42,35 +42,26 @@ unsigned r100_check_blit(mesa_format mesa_format, uint32_t dst_pitch) { /* XXX others? */ - if (_mesa_little_endian()) { - switch (mesa_format) { - case MESA_FORMAT_B8G8R8A8_UNORM: - case MESA_FORMAT_B8G8R8X8_UNORM: - case MESA_FORMAT_B5G6R5_UNORM: - case MESA_FORMAT_B4G4R4A4_UNORM: - case MESA_FORMAT_B5G5R5A1_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: - break; - default: - return 0; - } - } - else { - switch (mesa_format) { - case MESA_FORMAT_A8R8G8B8_UNORM: - case MESA_FORMAT_X8R8G8B8_UNORM: - case MESA_FORMAT_R5G6B5_UNORM: - case MESA_FORMAT_A4R4G4B4_UNORM: - case MESA_FORMAT_A1R5G5B5_UNORM: - case MESA_FORMAT_A_UNORM8: - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_I_UNORM8: - break; - default: - return 0; - } + switch (mesa_format) { +#if UTIL_ARCH_LITTLE_ENDIAN + case MESA_FORMAT_B8G8R8A8_UNORM: + case MESA_FORMAT_B8G8R8X8_UNORM: + case MESA_FORMAT_B5G6R5_UNORM: + case MESA_FORMAT_B4G4R4A4_UNORM: + case MESA_FORMAT_B5G5R5A1_UNORM: +#else + case MESA_FORMAT_A8R8G8B8_UNORM: + case MESA_FORMAT_X8R8G8B8_UNORM: + case MESA_FORMAT_R5G6B5_UNORM: + case MESA_FORMAT_A4R4G4B4_UNORM: + case MESA_FORMAT_A1R5G5B5_UNORM: +#endif + case MESA_FORMAT_A_UNORM8: + case MESA_FORMAT_L_UNORM8: + case MESA_FORMAT_I_UNORM8: + break; + default: + return 0; } /* Rendering to small buffer doesn't work. @@ -153,7 +144,7 @@ OUT_BATCH_REGVAL(RADEON_PP_TEX_PITCH_0, pitch * _mesa_get_format_bytes(mesa_format) - 32); OUT_BATCH_REGSEQ(RADEON_PP_TXOFFSET_0, 1); - OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); + OUT_BATCH_RELOC(bo, offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); END_BATCH(); } @@ -215,9 +206,9 @@ OUT_BATCH_REGVAL(RADEON_RB3D_CNTL, dst_format); OUT_BATCH_REGSEQ(RADEON_RB3D_COLOROFFSET, 1); - OUT_BATCH_RELOC(offset, bo, offset, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(bo, offset, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH_REGSEQ(RADEON_RB3D_COLORPITCH, 1); - OUT_BATCH_RELOC(dst_pitch, bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0); END_BATCH(); } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h 2020-06-12 01:21:18.000000000 +0000 @@ -44,7 +44,7 @@ /** * Write a relocated dword to the command buffer. */ -#define OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags) \ +#define OUT_BATCH_RELOC(bo, offset, rd, wd, flags) \ do { \ int __offset = (offset); \ if (0 && __offset) { \ diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_common_context.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_common_context.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_common_context.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_common_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -270,7 +270,7 @@ /* free atom list */ /* free the Mesa context data */ - _mesa_free_context_data(&radeon->glCtx, true); + _mesa_free_context_data(&radeon->glCtx); /* free the option cache */ driDestroyOptionCache(&radeon->optionCache); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_context.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_context.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_context.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -114,7 +114,7 @@ BEGIN_BATCH(4); OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZPASS_ADDR, 0)); - OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0); + OUT_BATCH_RELOC(query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0); END_BATCH(); query->curr_offset += sizeof(uint32_t); assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_debug.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_debug.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_debug.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_debug.c 2020-06-12 01:21:18.000000000 +0000 @@ -57,13 +57,17 @@ {NULL, 0} }; -radeon_debug_type_t radeon_enabled_debug_types; +#if defined(RADEON_R200) +radeon_debug_type_t r200_enabled_debug_types; +#elif defined(RADEON_R100) +radeon_debug_type_t r100_enabled_debug_types; +#endif void radeon_init_debug(void) { - radeon_enabled_debug_types = parse_debug_string(getenv("RADEON_DEBUG"), debug_control); + RADEON_DEBUG = parse_debug_string(getenv("RADEON_DEBUG"), debug_control); - radeon_enabled_debug_types |= RADEON_GENERAL; + RADEON_DEBUG |= RADEON_GENERAL; } void _radeon_debug_add_indent(void) diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_debug.h mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_debug.h --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_debug.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_debug.h 2020-06-12 01:21:18.000000000 +0000 @@ -81,18 +81,24 @@ char indent[RADEON_MAX_INDENT]; }; -extern radeon_debug_type_t radeon_enabled_debug_types; - /** * Compabibility layer for old debug code **/ -#define RADEON_DEBUG radeon_enabled_debug_types +#if defined(RADEON_R200) +extern radeon_debug_type_t r200_enabled_debug_types; +#define RADEON_DEBUG r200_enabled_debug_types +#elif defined(RADEON_R100) +extern radeon_debug_type_t r100_enabled_debug_types; +#define RADEON_DEBUG r100_enabled_debug_types +#else +#error "Neither RADEON_R100 nor RADEON_R200 are defined." +#endif static inline int radeon_is_debug_enabled(const radeon_debug_type_t type, const radeon_debug_level_t level) { return RADEON_DEBUG_LEVEL >= level - && (type & radeon_enabled_debug_types); + && (type & RADEON_DEBUG); } extern void _radeon_print(const radeon_debug_type_t type, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_ioctl.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_ioctl.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_ioctl.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_ioctl.c 2020-06-12 01:21:18.000000000 +0000 @@ -290,7 +290,7 @@ OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, 2); OUT_BATCH(1); OUT_BATCH(vertex_size | (vertex_size << 8)); - OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0); + OUT_BATCH_RELOC(bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0); END_BATCH(); #endif diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_screen.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_screen.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_screen.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_screen.c 2020-06-12 01:21:18.000000000 +0000 @@ -696,11 +696,26 @@ _mesa_initialize_window_framebuffer(&rfb->base, mesaVis); if (mesaVis->redBits == 5) - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B5G6R5_UNORM : MESA_FORMAT_R5G6B5_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B5G6R5_UNORM; +#else + MESA_FORMAT_R5G6B5_UNORM; +#endif else if (mesaVis->alphaBits == 0) - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B8G8R8X8_UNORM : MESA_FORMAT_X8R8G8B8_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B8G8R8X8_UNORM; +#else + MESA_FORMAT_X8R8G8B8_UNORM; +#endif else - rgbFormat = _mesa_little_endian() ? MESA_FORMAT_B8G8R8A8_UNORM : MESA_FORMAT_A8R8G8B8_UNORM; + rgbFormat = +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_B8G8R8A8_UNORM; +#else + MESA_FORMAT_A8R8G8B8_UNORM; +#endif /* front color renderbuffer */ rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv); diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_state_init.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_state_init.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_state_init.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_state_init.c 2020-06-12 01:21:18.000000000 +0000 @@ -378,7 +378,7 @@ if (drb) { OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0)); - OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0)); OUT_BATCH(zbpitch); @@ -392,10 +392,10 @@ if (rrb) { OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0)); - OUT_BATCH_RELOC(rrb->draw_offset, rrb->bo, rrb->draw_offset, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(rrb->bo, rrb->draw_offset, 0, RADEON_GEM_DOMAIN_VRAM, 0); OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0)); - OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_BATCH_RELOC(rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0); } // if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) { @@ -447,7 +447,7 @@ lvl = &t->mt->levels[0]; for (j = 0; j < 5; j++) { OUT_BATCH(CP_PACKET0(base_reg + (4 * j), 0)); - OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset, + OUT_BATCH_RELOC(t->mt->bo, lvl->faces[j].offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } END_BATCH(); @@ -485,15 +485,16 @@ if (ctx->Texture.Unit[i]._Current && ctx->Texture.Unit[i]._Current->Target == GL_TEXTURE_CUBE_MAP) { lvl = &t->mt->levels[t->minLod]; - OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset, + OUT_BATCH_RELOC(t->mt->bo, lvl->faces[5].offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } else { - OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t), + OUT_BATCH_RELOC(t->mt->bo, + get_base_teximage_offset(t) | t->tile_bits, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } } else { if (t->bo) - OUT_BATCH_RELOC(t->tile_bits, t->bo, 0, + OUT_BATCH_RELOC(t->bo, t->tile_bits, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); } } diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_tex.h mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_tex.h --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_tex.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_tex.h 2020-06-12 01:21:18.000000000 +0000 @@ -72,8 +72,7 @@ [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, - [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, + [ MESA_FORMAT_LA_UNORM8 ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_I8, 0 }, [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texstate.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texstate.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texstate.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texstate.c 2020-06-12 01:21:18.000000000 +0000 @@ -597,29 +597,23 @@ t->override_offset = 0; switch (rb->cpp) { case 4: - if (texture_format == __DRI_TEXTURE_FORMAT_RGB) { - t->pp_txformat = tx_table[MESA_FORMAT_BGR_UNORM8].format; + if (texture_format == __DRI_TEXTURE_FORMAT_RGB) texFormat = MESA_FORMAT_BGR_UNORM8; - } - else { - t->pp_txformat = tx_table[MESA_FORMAT_B8G8R8A8_UNORM].format; + else texFormat = MESA_FORMAT_B8G8R8A8_UNORM; - } - t->pp_txfilter |= tx_table[MESA_FORMAT_B8G8R8A8_UNORM].filter; break; case 3: default: texFormat = MESA_FORMAT_BGR_UNORM8; - t->pp_txformat = tx_table[MESA_FORMAT_BGR_UNORM8].format; - t->pp_txfilter |= tx_table[MESA_FORMAT_BGR_UNORM8].filter; break; case 2: texFormat = MESA_FORMAT_B5G6R5_UNORM; - t->pp_txformat = tx_table[MESA_FORMAT_B5G6R5_UNORM].format; - t->pp_txfilter |= tx_table[MESA_FORMAT_B5G6R5_UNORM].filter; break; } + t->pp_txformat = tx_table[texFormat].format; + t->pp_txfilter |= tx_table[texFormat].filter; + _mesa_init_teximage_fields(&radeon->glCtx, texImage, rb->base.Base.Width, rb->base.Base.Height, 1, 0, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texture.c mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texture.c --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texture.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texture.c 2020-06-12 01:21:18.000000000 +0000 @@ -359,7 +359,7 @@ #if defined(RADEON_R200) /* r200: can't use a8 format since interpreting hw I8 as a8 would result in wrong rgb values (same as alpha value instead of 0). */ - return _radeon_texformat_al88; + return MESA_FORMAT_LA_UNORM8; #else return MESA_FORMAT_A_UNORM8; #endif @@ -381,7 +381,7 @@ case GL_LUMINANCE12_ALPHA12: case GL_LUMINANCE16_ALPHA16: case GL_COMPRESSED_LUMINANCE_ALPHA: - return _radeon_texformat_al88; + return MESA_FORMAT_LA_UNORM8; case GL_INTENSITY: case GL_INTENSITY4: @@ -464,7 +464,7 @@ case GL_SLUMINANCE_ALPHA: case GL_SLUMINANCE8_ALPHA8: case GL_COMPRESSED_SLUMINANCE_ALPHA: - return MESA_FORMAT_L8A8_SRGB; + return MESA_FORMAT_LA_SRGB8; case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT: return MESA_FORMAT_SRGB_DXT1; @@ -596,29 +596,25 @@ mesa_format _radeon_texformat_rgb565 = MESA_FORMAT_NONE; mesa_format _radeon_texformat_argb4444 = MESA_FORMAT_NONE; mesa_format _radeon_texformat_argb1555 = MESA_FORMAT_NONE; -mesa_format _radeon_texformat_al88 = MESA_FORMAT_NONE; /*@}*/ static void radeonInitTextureFormats(void) { - if (_mesa_little_endian()) { - _radeon_texformat_rgba8888 = MESA_FORMAT_A8B8G8R8_UNORM; - _radeon_texformat_argb8888 = MESA_FORMAT_B8G8R8A8_UNORM; - _radeon_texformat_rgb565 = MESA_FORMAT_B5G6R5_UNORM; - _radeon_texformat_argb4444 = MESA_FORMAT_B4G4R4A4_UNORM; - _radeon_texformat_argb1555 = MESA_FORMAT_B5G5R5A1_UNORM; - _radeon_texformat_al88 = MESA_FORMAT_L8A8_UNORM; - } - else { - _radeon_texformat_rgba8888 = MESA_FORMAT_R8G8B8A8_UNORM; - _radeon_texformat_argb8888 = MESA_FORMAT_A8R8G8B8_UNORM; - _radeon_texformat_rgb565 = MESA_FORMAT_R5G6B5_UNORM; - _radeon_texformat_argb4444 = MESA_FORMAT_A4R4G4B4_UNORM; - _radeon_texformat_argb1555 = MESA_FORMAT_A1R5G5B5_UNORM; - _radeon_texformat_al88 = MESA_FORMAT_A8L8_UNORM; - } +#if UTIL_ARCH_LITTLE_ENDIAN + _radeon_texformat_rgba8888 = MESA_FORMAT_A8B8G8R8_UNORM; + _radeon_texformat_argb8888 = MESA_FORMAT_B8G8R8A8_UNORM; + _radeon_texformat_rgb565 = MESA_FORMAT_B5G6R5_UNORM; + _radeon_texformat_argb4444 = MESA_FORMAT_B4G4R4A4_UNORM; + _radeon_texformat_argb1555 = MESA_FORMAT_B5G5R5A1_UNORM; +#else + _radeon_texformat_rgba8888 = MESA_FORMAT_R8G8B8A8_UNORM; + _radeon_texformat_argb8888 = MESA_FORMAT_A8R8G8B8_UNORM; + _radeon_texformat_rgb565 = MESA_FORMAT_R5G6B5_UNORM; + _radeon_texformat_argb4444 = MESA_FORMAT_A4R4G4B4_UNORM; + _radeon_texformat_argb1555 = MESA_FORMAT_A1R5G5B5_UNORM; +#endif } void diff -Nru mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texture.h mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texture.h --- mesa-19.2.8/src/mesa/drivers/dri/radeon/radeon_texture.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/radeon/radeon_texture.h 2020-06-12 01:21:18.000000000 +0000 @@ -38,7 +38,6 @@ extern mesa_format _radeon_texformat_rgb565; extern mesa_format _radeon_texformat_argb4444; extern mesa_format _radeon_texformat_argb1555; -extern mesa_format _radeon_texformat_al88; extern void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride, diff -Nru mesa-19.2.8/src/mesa/drivers/dri/swrast/swrast.c mesa-20.0.8/src/mesa/drivers/dri/swrast/swrast.c --- mesa-19.2.8/src/mesa/drivers/dri/swrast/swrast.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/dri/swrast/swrast.c 2020-06-12 01:21:18.000000000 +0000 @@ -584,9 +584,9 @@ /* add software renderbuffers */ _swrast_add_soft_renderbuffers(fb, GL_FALSE, /* color */ - visual->haveDepthBuffer, - visual->haveStencilBuffer, - visual->haveAccumBuffer, + visual->depthBits > 0, + visual->stencilBits > 0, + visual->accumRedBits > 0, GL_FALSE, /* alpha */ GL_FALSE /* aux bufs */); diff -Nru mesa-19.2.8/src/mesa/drivers/osmesa/meson.build mesa-20.0.8/src/mesa/drivers/osmesa/meson.build --- mesa-19.2.8/src/mesa/drivers/osmesa/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/osmesa/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -36,6 +36,8 @@ link_whole : libglapi_static, link_with : [libmesa_classic, osmesa_link_with], dependencies : [dep_thread, dep_selinux], + name_prefix : host_machine.system() == 'windows' ? '' : 'lib', # otherwise mingw will create libosmesa.dll + soversion : host_machine.system() == 'windows' ? '' : '8', version : '8.0.0', install : true, ) diff -Nru mesa-19.2.8/src/mesa/drivers/osmesa/osmesa.c mesa-20.0.8/src/mesa/drivers/osmesa/osmesa.c --- mesa-19.2.8/src/mesa/drivers/osmesa/osmesa.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/osmesa/osmesa.c 2020-06-12 01:21:18.000000000 +0000 @@ -455,10 +455,11 @@ */ if (osmesa->format == OSMESA_RGBA) { if (osmesa->DataType == GL_UNSIGNED_BYTE) { - if (_mesa_little_endian()) +#if UTIL_ARCH_LITTLE_ENDIAN rb->Format = MESA_FORMAT_R8G8B8A8_UNORM; - else +#else rb->Format = MESA_FORMAT_A8B8G8R8_UNORM; +#endif } else if (osmesa->DataType == GL_UNSIGNED_SHORT) { rb->Format = MESA_FORMAT_RGBA_UNORM16; @@ -469,10 +470,11 @@ } else if (osmesa->format == OSMESA_BGRA) { if (osmesa->DataType == GL_UNSIGNED_BYTE) { - if (_mesa_little_endian()) +#if UTIL_ARCH_LITTLE_ENDIAN rb->Format = MESA_FORMAT_B8G8R8A8_UNORM; - else +#else rb->Format = MESA_FORMAT_A8R8G8B8_UNORM; +#endif } else if (osmesa->DataType == GL_UNSIGNED_SHORT) { _mesa_warning(ctx, "Unsupported OSMesa format BGRA/GLushort"); @@ -485,10 +487,11 @@ } else if (osmesa->format == OSMESA_ARGB) { if (osmesa->DataType == GL_UNSIGNED_BYTE) { - if (_mesa_little_endian()) +#if UTIL_ARCH_LITTLE_ENDIAN rb->Format = MESA_FORMAT_A8R8G8B8_UNORM; - else +#else rb->Format = MESA_FORMAT_B8G8R8A8_UNORM; +#endif } else if (osmesa->DataType == GL_UNSIGNED_SHORT) { _mesa_warning(ctx, "Unsupported OSMesa format ARGB/GLushort"); @@ -854,7 +857,7 @@ osmesa->gl_buffer = _mesa_create_framebuffer(osmesa->gl_visual); if (!osmesa->gl_buffer) { _mesa_destroy_visual( osmesa->gl_visual ); - _mesa_free_context_data(&osmesa->mesa, true); + _mesa_free_context_data(&osmesa->mesa); free(osmesa); return NULL; } @@ -864,9 +867,9 @@ */ _swrast_add_soft_renderbuffers(osmesa->gl_buffer, GL_FALSE, /* color */ - osmesa->gl_visual->haveDepthBuffer, - osmesa->gl_visual->haveStencilBuffer, - osmesa->gl_visual->haveAccumBuffer, + osmesa->gl_visual->depthBits > 0, + osmesa->gl_visual->stencilBits > 0, + osmesa->gl_visual->accumRedBits > 0, GL_FALSE, /* alpha */ GL_FALSE /* aux */ ); @@ -891,7 +894,7 @@ !_tnl_CreateContext( ctx ) || !_swsetup_CreateContext( ctx )) { _mesa_destroy_visual(osmesa->gl_visual); - _mesa_free_context_data(ctx, true); + _mesa_free_context_data(ctx); free(osmesa); return NULL; } @@ -919,7 +922,7 @@ if (ctx->Version < version_major * 10 + version_minor) { _mesa_destroy_visual(osmesa->gl_visual); - _mesa_free_context_data(ctx, true); + _mesa_free_context_data(ctx); free(osmesa); return NULL; } @@ -955,7 +958,7 @@ _mesa_destroy_visual( osmesa->gl_visual ); _mesa_reference_framebuffer( &osmesa->gl_buffer, NULL ); - _mesa_free_context_data(&osmesa->mesa, true); + _mesa_free_context_data(&osmesa->mesa); free( osmesa ); } } diff -Nru mesa-19.2.8/src/mesa/drivers/osmesa/SConscript mesa-20.0.8/src/mesa/drivers/osmesa/SConscript --- mesa-19.2.8/src/mesa/drivers/osmesa/SConscript 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/osmesa/SConscript 2020-06-12 01:21:18.000000000 +0000 @@ -8,6 +8,7 @@ '#src/mesa', Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers Dir('../../../mapi/glapi'), # src/mapi/glapi build path + Dir('../../../mapi/glapi/gen'), # src/mapi/glapi build path ]) env.Prepend(LIBS = [ diff -Nru mesa-19.2.8/src/mesa/drivers/x11/fakeglx.c mesa-20.0.8/src/mesa/drivers/x11/fakeglx.c --- mesa-19.2.8/src/mesa/drivers/x11/fakeglx.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/x11/fakeglx.c 2020-06-12 01:21:18.000000000 +0000 @@ -1551,12 +1551,7 @@ case GLX_RGBA: if (fbconfig) return GLX_BAD_ATTRIBUTE; - if (xmvis->mesa_visual.rgbMode) { - *value = True; - } - else { - *value = False; - } + *value = True; return 0; case GLX_DOUBLEBUFFER: *value = (int) xmvis->mesa_visual.doubleBufferMode; @@ -1618,12 +1613,7 @@ } else if (xmvis->mesa_visual.level>0) { /* overlay */ - if (xmvis->mesa_visual.rgbMode) { - *value = GLX_TRANSPARENT_RGB_EXT; - } - else { - *value = GLX_TRANSPARENT_INDEX_EXT; - } + *value = GLX_TRANSPARENT_RGB_EXT; } else if (xmvis->mesa_visual.level<0) { /* underlay */ @@ -1691,10 +1681,8 @@ return GLX_BAD_ATTRIBUTE; if (xmvis->mesa_visual.floatMode) *value = GLX_RGBA_FLOAT_BIT_ARB; - else if (xmvis->mesa_visual.rgbMode) - *value = GLX_RGBA_BIT; else - *value = GLX_COLOR_INDEX_BIT; + *value = GLX_RGBA_BIT; break; case GLX_X_RENDERABLE_SGIX: if (!fbconfig) @@ -2536,14 +2524,14 @@ } -static int +static void Fake_glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value) { const XMesaBuffer xmbuf = XMesaFindBuffer(dpy, pbuf); if (!xmbuf) { /* Generate GLXBadPbufferSGIX for bad pbuffer */ - return 0; + return; } switch (attribute) { @@ -2565,7 +2553,6 @@ default: *value = 0; } - return 0; } @@ -2687,7 +2674,7 @@ /*** GLX_SUN_get_transparent_index ***/ static Status -Fake_glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent) +Fake_glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, unsigned long *pTransparent) { (void) dpy; (void) overlay; diff -Nru mesa-19.2.8/src/mesa/drivers/x11/glxapi.c mesa-20.0.8/src/mesa/drivers/x11/glxapi.c --- mesa-19.2.8/src/mesa/drivers/x11/glxapi.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/x11/glxapi.c 2020-06-12 01:21:18.000000000 +0000 @@ -69,7 +69,7 @@ /* skip normal ones */ #define _GLAPI_SKIP_NORMAL_ENTRY_POINTS -#include "glapi/glapitemp.h" +#include "glapitemp.h" #endif /* GLX_INDIRECT_RENDERING */ @@ -791,14 +791,14 @@ t->DestroyGLXPbufferSGIX(dpy, pbuf); } -int PUBLIC +void PUBLIC glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value) { struct _glxapi_table *t; GET_DISPATCH(dpy, t); if (!t) - return 0; - return t->QueryGLXPbufferSGIX(dpy, pbuf, attribute, value); + return; + t->QueryGLXPbufferSGIX(dpy, pbuf, attribute, value); } void PUBLIC @@ -909,7 +909,7 @@ /*** GLX_SUN_get_transparent_index ***/ Status PUBLIC -glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent) +glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, unsigned long *pTransparent) { struct _glxapi_table *t; GET_DISPATCH(dpy, t); diff -Nru mesa-19.2.8/src/mesa/drivers/x11/glxapi.h mesa-20.0.8/src/mesa/drivers/x11/glxapi.h --- mesa-19.2.8/src/mesa/drivers/x11/glxapi.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/x11/glxapi.h 2020-06-12 01:21:18.000000000 +0000 @@ -143,7 +143,7 @@ /*** GLX_SGIX_pbuffer ***/ GLXPbufferSGIX (*CreateGLXPbufferSGIX)(Display *, GLXFBConfigSGIX, unsigned int, unsigned int, int *); void (*DestroyGLXPbufferSGIX)(Display *, GLXPbufferSGIX); - int (*QueryGLXPbufferSGIX)(Display *, GLXPbufferSGIX, int, unsigned int *); + void (*QueryGLXPbufferSGIX)(Display *, GLXPbufferSGIX, int, unsigned int *); void (*SelectEventSGIX)(Display *, GLXDrawable, unsigned long); void (*GetSelectedEventSGIX)(Display *, GLXDrawable, unsigned long *); @@ -165,7 +165,7 @@ #endif /*** GLX_SUN_get_transparent_index ***/ - Status (*GetTransparentIndexSUN)(Display *, Window, Window, long *); + Status (*GetTransparentIndexSUN)(Display *, Window, Window, unsigned long *); /*** GLX_MESA_copy_sub_buffer ***/ void (*CopySubBufferMESA)(Display *dpy, GLXDrawable drawable, int x, int y, int width, int height); diff -Nru mesa-19.2.8/src/mesa/drivers/x11/SConscript mesa-20.0.8/src/mesa/drivers/x11/SConscript --- mesa-19.2.8/src/mesa/drivers/x11/SConscript 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/x11/SConscript 2020-06-12 01:21:18.000000000 +0000 @@ -8,6 +8,7 @@ '#/src/mesa', '#/src/mesa/main', Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers + Dir('../../../mapi/glapi/gen'), # src/mapi/glapi/gen build path for python-generated GL API files/headers ]) env.Append(CPPDEFINES = ['USE_XSHM']) diff -Nru mesa-19.2.8/src/mesa/drivers/x11/xm_api.c mesa-20.0.8/src/mesa/drivers/x11/xm_api.c --- mesa-19.2.8/src/mesa/drivers/x11/xm_api.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/drivers/x11/xm_api.c 2020-06-12 01:21:18.000000000 +0000 @@ -338,9 +338,9 @@ */ _swrast_add_soft_renderbuffers(&b->mesa_buffer, GL_FALSE, /* color */ - vis->mesa_visual.haveDepthBuffer, - vis->mesa_visual.haveStencilBuffer, - vis->mesa_visual.haveAccumBuffer, + vis->mesa_visual.depthBits > 0, + vis->mesa_visual.stencilBits > 0, + vis->mesa_visual.accumRedBits > 0, GL_FALSE, /* software alpha buffer */ vis->mesa_visual.numAuxBuffers > 0 ); @@ -582,7 +582,6 @@ _mesa_warning(NULL, "XMesa: RGB mode rendering not supported in given visual.\n"); return GL_FALSE; } - v->mesa_visual.indexBits = 0; if (getenv("MESA_NO_DITHER")) { v->dithered_pf = v->undithered_pf; @@ -945,7 +944,7 @@ !_vbo_CreateContext( mesaCtx ) || !_tnl_CreateContext( mesaCtx ) || !_swsetup_CreateContext( mesaCtx )) { - _mesa_free_context_data(&c->mesa, true); + _mesa_free_context_data(&c->mesa); free(c); return NULL; } @@ -982,7 +981,7 @@ _swrast_DestroyContext( mesaCtx ); _tnl_DestroyContext( mesaCtx ); _vbo_DestroyContext( mesaCtx ); - _mesa_free_context_data(mesaCtx, true); + _mesa_free_context_data(mesaCtx); free( c ); } diff -Nru mesa-19.2.8/src/mesa/main/accum.c mesa-20.0.8/src/mesa/main/accum.c --- mesa-19.2.8/src/mesa/main/accum.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/accum.c 2020-06-12 01:21:18.000000000 +0000 @@ -468,7 +468,7 @@ return; } - if (ctx->DrawBuffer->Visual.haveAccumBuffer == 0) { + if (ctx->DrawBuffer->Visual.accumRedBits == 0) { _mesa_error(ctx, GL_INVALID_OPERATION, "glAccum(no accum buffer)"); return; } diff -Nru mesa-19.2.8/src/mesa/main/api_loopback.c mesa-20.0.8/src/mesa/main/api_loopback.c --- mesa-19.2.8/src/mesa/main/api_loopback.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/api_loopback.c 2020-06-12 01:21:18.000000000 +0000 @@ -631,10 +631,7 @@ void GLAPIENTRY _mesa_Vertex3dv( const GLdouble *v ) { - if (v[2] == 0.0) - VERTEX2( (GLfloat) v[0], (GLfloat) v[1] ); - else - VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] ); + VERTEX3( (GLfloat) v[0], (GLfloat) v[1], (GLfloat) v[2] ); } void GLAPIENTRY diff -Nru mesa-19.2.8/src/mesa/main/arbprogram.c mesa-20.0.8/src/mesa/main/arbprogram.c --- mesa-19.2.8/src/mesa/main/arbprogram.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/arbprogram.c 2020-06-12 01:21:18.000000000 +0000 @@ -59,6 +59,40 @@ ctx->NewDriverState |= new_driver_state; } +static struct gl_program* +lookup_or_create_program(GLuint id, GLenum target, const char* caller) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_program* newProg; + + if (id == 0) { + /* Bind a default program */ + if (target == GL_VERTEX_PROGRAM_ARB) + newProg = ctx->Shared->DefaultVertexProgram; + else + newProg = ctx->Shared->DefaultFragmentProgram; + } + else { + /* Bind a user program */ + newProg = _mesa_lookup_program(ctx, id); + if (!newProg || newProg == &_mesa_DummyProgram) { + /* allocate a new program now */ + newProg = ctx->Driver.NewProgram(ctx, target, id, true); + if (!newProg) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", caller); + return NULL; + } + _mesa_HashInsert(ctx->Shared->Programs, id, newProg); + } + else if (newProg->Target != target) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(target mismatch)", caller); + return NULL; + } + } + return newProg; +} + /** * Bind a program (make it current) * \note Called from the GL API dispatcher by both glBindProgramNV @@ -88,32 +122,9 @@ * NOTE: binding to a non-existant program is not an error. * That's supposed to be caught in glBegin. */ - if (id == 0) { - /* Bind a default program */ - newProg = NULL; - if (target == GL_VERTEX_PROGRAM_ARB) - newProg = ctx->Shared->DefaultVertexProgram; - else - newProg = ctx->Shared->DefaultFragmentProgram; - } - else { - /* Bind a user program */ - newProg = _mesa_lookup_program(ctx, id); - if (!newProg || newProg == &_mesa_DummyProgram) { - /* allocate a new program now */ - newProg = ctx->Driver.NewProgram(ctx, target, id, true); - if (!newProg) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "glBindProgramARB"); - return; - } - _mesa_HashInsert(ctx->Shared->Programs, id, newProg); - } - else if (newProg->Target != target) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glBindProgramARB(target mismatch)"); - return; - } - } + newProg = lookup_or_create_program(id, target, "glBindProgram"); + if (!newProg) + return; /** All error checking is complete now **/ @@ -259,27 +270,35 @@ return GL_FALSE; } -static GLboolean -get_local_param_pointer(struct gl_context *ctx, const char *func, - GLenum target, GLuint index, GLfloat **param) +static struct gl_program* +get_current_program(struct gl_context* ctx, GLenum target, const char* caller) { - struct gl_program *prog; - GLuint maxParams; - if (target == GL_VERTEX_PROGRAM_ARB && ctx->Extensions.ARB_vertex_program) { - prog = ctx->VertexProgram.Current; - maxParams = ctx->Const.Program[MESA_SHADER_VERTEX].MaxLocalParams; + return ctx->VertexProgram.Current; } else if (target == GL_FRAGMENT_PROGRAM_ARB && ctx->Extensions.ARB_fragment_program) { - prog = ctx->FragmentProgram.Current; - maxParams = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxLocalParams; + return ctx->FragmentProgram.Current; } else { _mesa_error(ctx, GL_INVALID_ENUM, - "%s(target)", func); - return GL_FALSE; + "%s(target)", caller); + return NULL; + } +} + +static GLboolean +get_local_param_pointer(struct gl_context *ctx, const char *func, + struct gl_program* prog, GLenum target, + GLuint index, GLfloat **param) +{ + GLuint maxParams; + + if (target == GL_VERTEX_PROGRAM_ARB) { + maxParams = ctx->Const.Program[MESA_SHADER_VERTEX].MaxLocalParams; + } else { + maxParams = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxLocalParams; } if (index >= maxParams) { @@ -326,11 +345,10 @@ } } -void GLAPIENTRY -_mesa_ProgramStringARB(GLenum target, GLenum format, GLsizei len, +static void +set_program_string(struct gl_program *prog, GLenum target, GLenum format, GLsizei len, const GLvoid *string) { - struct gl_program *prog; bool failed; GET_CURRENT_CONTEXT(ctx); @@ -363,12 +381,10 @@ #endif /* ENABLE_SHADER_CACHE */ if (target == GL_VERTEX_PROGRAM_ARB && ctx->Extensions.ARB_vertex_program) { - prog = ctx->VertexProgram.Current; _mesa_parse_arb_vertex_program(ctx, target, string, len, prog); } else if (target == GL_FRAGMENT_PROGRAM_ARB && ctx->Extensions.ARB_fragment_program) { - prog = ctx->FragmentProgram.Current; _mesa_parse_arb_fragment_program(ctx, target, string, len, prog); } else { @@ -432,6 +448,36 @@ } } +void GLAPIENTRY +_mesa_ProgramStringARB(GLenum target, GLenum format, GLsizei len, + const GLvoid *string) +{ + GET_CURRENT_CONTEXT(ctx); + if (target == GL_VERTEX_PROGRAM_ARB && ctx->Extensions.ARB_vertex_program) { + set_program_string(ctx->VertexProgram.Current, target, format, len, string); + } + else if (target == GL_FRAGMENT_PROGRAM_ARB + && ctx->Extensions.ARB_fragment_program) { + set_program_string(ctx->FragmentProgram.Current, target, format, len, string); + } + else { + _mesa_error(ctx, GL_INVALID_ENUM, "glProgramStringARB(target)"); + return; + } +} + +void GLAPIENTRY +_mesa_NamedProgramStringEXT(GLuint program, GLenum target, GLenum format, GLsizei len, + const GLvoid *string) +{ + struct gl_program* prog = lookup_or_create_program(program, target, "glNamedProgramStringEXT"); + + if (!prog) { + return; + } + set_program_string(prog, target, format, len, string); +} + /** * Set a program env parameter register. @@ -576,11 +622,40 @@ { GET_CURRENT_CONTEXT(ctx); GLfloat *param; + struct gl_program* prog = get_current_program(ctx, target, "glProgramLocalParameterARB"); + if (!prog) { + return; + } flush_vertices_for_program_constants(ctx, target); if (get_local_param_pointer(ctx, "glProgramLocalParameterARB", - target, index, ¶m)) { + prog, target, index, ¶m)) { + assert(index < MAX_PROGRAM_LOCAL_PARAMS); + ASSIGN_4V(param, x, y, z, w); + } +} + +void GLAPIENTRY +_mesa_NamedProgramLocalParameter4fEXT(GLuint program, GLenum target, GLuint index, + GLfloat x, GLfloat y, GLfloat z, GLfloat w) +{ + GET_CURRENT_CONTEXT(ctx); + GLfloat *param; + struct gl_program* prog = lookup_or_create_program(program, target, + "glNamedProgramLocalParameter4fEXT"); + + if (!prog) { + return; + } + + if ((target == GL_VERTEX_PROGRAM_ARB && prog == ctx->VertexProgram.Current) || + (target == GL_FRAGMENT_PROGRAM_ARB && prog == ctx->FragmentProgram.Current)) { + flush_vertices_for_program_constants(ctx, target); + } + + if (get_local_param_pointer(ctx, "glNamedProgramLocalParameter4fEXT", + prog, target, index, ¶m)) { assert(index < MAX_PROGRAM_LOCAL_PARAMS); ASSIGN_4V(param, x, y, z, w); } @@ -597,27 +672,36 @@ void GLAPIENTRY -_mesa_ProgramLocalParameters4fvEXT(GLenum target, GLuint index, GLsizei count, - const GLfloat *params) +_mesa_NamedProgramLocalParameter4fvEXT(GLuint program, GLenum target, GLuint index, + const GLfloat *params) +{ + _mesa_NamedProgramLocalParameter4fEXT(program, target, index, params[0], + params[1], params[2], params[3]); +} + + +static void +program_local_parameters4fv(struct gl_program* prog, GLuint index, GLsizei count, + const GLfloat *params, const char* caller) { GET_CURRENT_CONTEXT(ctx); GLfloat *dest; - - flush_vertices_for_program_constants(ctx, target); + flush_vertices_for_program_constants(ctx, prog->Target); if (count <= 0) { - _mesa_error(ctx, GL_INVALID_VALUE, "glProgramLocalParameters4fv(count)"); + _mesa_error(ctx, GL_INVALID_VALUE, "%s(count)", caller); } - if (get_local_param_pointer(ctx, "glProgramLocalParameters4fvEXT", - target, index, &dest)) { - GLuint maxParams = target == GL_FRAGMENT_PROGRAM_ARB ? + if (get_local_param_pointer(ctx, caller, + prog, prog->Target, index, &dest)) { + GLuint maxParams = prog->Target == GL_FRAGMENT_PROGRAM_ARB ? ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxLocalParams : ctx->Const.Program[MESA_SHADER_VERTEX].MaxLocalParams; if ((index + count) > maxParams) { _mesa_error(ctx, GL_INVALID_VALUE, - "glProgramLocalParameters4fvEXT(index + count)"); + "%s(index + count)", + caller); return; } @@ -627,6 +711,37 @@ void GLAPIENTRY +_mesa_ProgramLocalParameters4fvEXT(GLenum target, GLuint index, GLsizei count, + const GLfloat *params) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = get_current_program(ctx, target, + "glProgramLocalParameters4fv"); + if (!prog) { + return; + } + + program_local_parameters4fv(prog, index, count, params, + "glProgramLocalParameters4fv"); +} + +void GLAPIENTRY +_mesa_NamedProgramLocalParameters4fvEXT(GLuint program, GLenum target, GLuint index, + GLsizei count, const GLfloat *params) +{ + struct gl_program* prog = + lookup_or_create_program(program, target, + "glNamedProgramLocalParameters4fvEXT"); + if (!prog) { + return; + } + + program_local_parameters4fv(prog, index, count, params, + "glNamedProgramLocalParameters4fvEXT"); +} + + +void GLAPIENTRY _mesa_ProgramLocalParameter4dARB(GLenum target, GLuint index, GLdouble x, GLdouble y, GLdouble z, GLdouble w) @@ -637,6 +752,16 @@ void GLAPIENTRY +_mesa_NamedProgramLocalParameter4dEXT(GLuint program, GLenum target, GLuint index, + GLdouble x, GLdouble y, + GLdouble z, GLdouble w) +{ + _mesa_NamedProgramLocalParameter4fEXT(program, target, index, (GLfloat) x, (GLfloat) y, + (GLfloat) z, (GLfloat) w); +} + + +void GLAPIENTRY _mesa_ProgramLocalParameter4dvARB(GLenum target, GLuint index, const GLdouble *params) { @@ -647,14 +772,47 @@ void GLAPIENTRY +_mesa_NamedProgramLocalParameter4dvEXT(GLuint program, GLenum target, GLuint index, + const GLdouble *params) +{ + _mesa_NamedProgramLocalParameter4fEXT(program, target, index, + (GLfloat) params[0], (GLfloat) params[1], + (GLfloat) params[2], (GLfloat) params[3]); +} + + +void GLAPIENTRY _mesa_GetProgramLocalParameterfvARB(GLenum target, GLuint index, GLfloat *params) { GLfloat *param; GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = get_current_program(ctx, target, "glGetProgramLocalParameterfvARB"); + if (!prog) { + return; + } if (get_local_param_pointer(ctx, "glProgramLocalParameters4fvEXT", - target, index, ¶m)) { + prog, target, index, ¶m)) { + COPY_4V(params, param); + } +} + + +void GLAPIENTRY +_mesa_GetNamedProgramLocalParameterfvEXT(GLuint program, GLenum target, GLuint index, + GLfloat *params) +{ + GLfloat *param; + GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = lookup_or_create_program(program, target, + "glGetNamedProgramLocalParameterfvEXT"); + if (!prog) { + return; + } + + if (get_local_param_pointer(ctx, "glGetNamedProgramLocalParameterfvEXT", + prog, target, index, ¶m)) { COPY_4V(params, param); } } @@ -666,34 +824,50 @@ { GLfloat *param; GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = get_current_program(ctx, target, "glGetProgramLocalParameterdvARB"); + if (!prog) { + return; + } if (get_local_param_pointer(ctx, "glProgramLocalParameters4fvEXT", - target, index, ¶m)) { + prog, target, index, ¶m)) { COPY_4V(params, param); } } void GLAPIENTRY -_mesa_GetProgramivARB(GLenum target, GLenum pname, GLint *params) +_mesa_GetNamedProgramLocalParameterdvEXT(GLuint program, GLenum target, GLuint index, + GLdouble *params) +{ + GLfloat *param; + GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = lookup_or_create_program(program, target, + "glGetNamedProgramLocalParameterdvEXT"); + if (!prog) { + return; + } + + if (get_local_param_pointer(ctx, "glGetNamedProgramLocalParameterdvEXT", + prog, target, index, ¶m)) { + COPY_4V(params, param); + } +} + + +static void +get_program_iv(struct gl_program *prog, GLenum target, GLenum pname, + GLint *params) { const struct gl_program_constants *limits; - struct gl_program *prog; + GET_CURRENT_CONTEXT(ctx); - if (target == GL_VERTEX_PROGRAM_ARB - && ctx->Extensions.ARB_vertex_program) { - prog = ctx->VertexProgram.Current; + if (target == GL_VERTEX_PROGRAM_ARB) { limits = &ctx->Const.Program[MESA_SHADER_VERTEX]; } - else if (target == GL_FRAGMENT_PROGRAM_ARB - && ctx->Extensions.ARB_fragment_program) { - prog = ctx->FragmentProgram.Current; - limits = &ctx->Const.Program[MESA_SHADER_FRAGMENT]; - } else { - _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramivARB(target)"); - return; + limits = &ctx->Const.Program[MESA_SHADER_FRAGMENT]; } assert(prog); @@ -857,6 +1031,36 @@ void GLAPIENTRY +_mesa_GetProgramivARB(GLenum target, GLenum pname, GLint *params) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = get_current_program(ctx, target, + "glGetProgramivARB"); + if (!prog) { + return; + } + get_program_iv(prog, target, pname, params); +} + +void GLAPIENTRY +_mesa_GetNamedProgramivEXT(GLuint program, GLenum target, GLenum pname, + GLint *params) +{ + struct gl_program* prog; + if (pname == GL_PROGRAM_BINDING_ARB) { + _mesa_GetProgramivARB(target, pname, params); + return; + } + prog = lookup_or_create_program(program, target, + "glGetNamedProgramivEXT"); + if (!prog) { + return; + } + get_program_iv(prog, target, pname, params); +} + + +void GLAPIENTRY _mesa_GetProgramStringARB(GLenum target, GLenum pname, GLvoid *string) { const struct gl_program *prog; @@ -881,6 +1085,28 @@ return; } + if (prog->String) + memcpy(dst, prog->String, strlen((char *) prog->String)); + else + *dst = '\0'; +} + + +void GLAPIENTRY +_mesa_GetNamedProgramStringEXT(GLuint program, GLenum target, + GLenum pname, GLvoid *string) { + char *dst = (char *) string; + GET_CURRENT_CONTEXT(ctx); + struct gl_program* prog = lookup_or_create_program(program, target, + "glGetNamedProgramStringEXT"); + if (!prog) + return; + + if (pname != GL_PROGRAM_STRING_ARB) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGetNamedProgramStringEXT(pname)"); + return; + } + if (prog->String) memcpy(dst, prog->String, strlen((char *) prog->String)); else diff -Nru mesa-19.2.8/src/mesa/main/arbprogram.h mesa-20.0.8/src/mesa/main/arbprogram.h --- mesa-19.2.8/src/mesa/main/arbprogram.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/arbprogram.h 2020-06-12 01:21:18.000000000 +0000 @@ -48,6 +48,9 @@ _mesa_ProgramStringARB(GLenum target, GLenum format, GLsizei len, const GLvoid *string); +extern void GLAPIENTRY +_mesa_NamedProgramStringEXT(GLuint program, GLenum target, GLenum format, + GLsizei len, const GLvoid *string); extern void GLAPIENTRY _mesa_ProgramEnvParameter4dARB(GLenum target, GLuint index, @@ -91,14 +94,36 @@ extern void GLAPIENTRY +_mesa_NamedProgramLocalParameter4fvEXT(GLuint program, GLenum target, + GLuint index, const GLfloat *params); + +extern void GLAPIENTRY +_mesa_NamedProgramLocalParameter4dEXT(GLuint program, GLenum target, + GLuint index, GLdouble x, GLdouble y, + GLdouble z, GLdouble w); + +extern void GLAPIENTRY +_mesa_NamedProgramLocalParameter4dvEXT(GLuint program, GLenum target, + GLuint index, const GLdouble *params); + + +extern void GLAPIENTRY +_mesa_NamedProgramLocalParameter4fEXT(GLuint program, GLenum target, + GLuint index, GLfloat x, GLfloat y, + GLfloat z, GLfloat w); + + +extern void GLAPIENTRY _mesa_ProgramLocalParameter4fvARB(GLenum target, GLuint index, const GLfloat *params); - extern void GLAPIENTRY _mesa_ProgramLocalParameters4fvEXT(GLenum target, GLuint index, GLsizei count, const GLfloat *params); +extern void GLAPIENTRY +_mesa_NamedProgramLocalParameters4fvEXT(GLuint program, GLenum target, GLuint index, + GLsizei count, const GLfloat *params); extern void GLAPIENTRY _mesa_GetProgramEnvParameterdvARB(GLenum target, GLuint index, @@ -114,18 +139,30 @@ _mesa_GetProgramLocalParameterdvARB(GLenum target, GLuint index, GLdouble *params); +extern void GLAPIENTRY +_mesa_GetNamedProgramLocalParameterdvEXT(GLuint program, GLenum target, + GLuint index, GLdouble *params); extern void GLAPIENTRY -_mesa_GetProgramLocalParameterfvARB(GLenum target, GLuint index, +_mesa_GetProgramLocalParameterfvARB(GLenum target, GLuint index, GLfloat *params); +extern void GLAPIENTRY +_mesa_GetNamedProgramLocalParameterfvEXT(GLuint program, GLenum target, + GLuint index, GLfloat *params); extern void GLAPIENTRY _mesa_GetProgramivARB(GLenum target, GLenum pname, GLint *params); +extern void GLAPIENTRY +_mesa_GetNamedProgramivEXT(GLuint program, GLenum target, GLenum pname, + GLint *params); extern void GLAPIENTRY _mesa_GetProgramStringARB(GLenum target, GLenum pname, GLvoid *string); +extern void GLAPIENTRY +_mesa_GetNamedProgramStringEXT(GLuint program, GLenum target, + GLenum pname, GLvoid *string); #endif diff -Nru mesa-19.2.8/src/mesa/main/arrayobj.c mesa-20.0.8/src/mesa/main/arrayobj.c --- mesa-19.2.8/src/mesa/main/arrayobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/arrayobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -229,12 +229,17 @@ /** * Looks up the array object for the given ID. * - * Unlike _mesa_lookup_vao, this function generates a GL_INVALID_OPERATION + * While _mesa_lookup_vao doesn't generate an error if the object does not + * exist, this function comes in two variants. + * If is_ext_dsa is false, this function generates a GL_INVALID_OPERATION * error if the array object does not exist. It also returns the default * array object when ctx is a compatibility profile context and id is zero. + * If is_ext_dsa is true, 0 is not a valid name. If the name exists but + * the object has never been bound, it is initialized. */ struct gl_vertex_array_object * -_mesa_lookup_vao_err(struct gl_context *ctx, GLuint id, const char *caller) +_mesa_lookup_vao_err(struct gl_context *ctx, GLuint id, + bool is_ext_dsa, const char *caller) { /* The ARB_direct_state_access specification says: * @@ -243,10 +248,11 @@ * the name of the vertex array object." */ if (id == 0) { - if (ctx->API == API_OPENGL_CORE) { + if (is_ext_dsa || ctx->API == API_OPENGL_CORE) { _mesa_error(ctx, GL_INVALID_OPERATION, - "%s(zero is not valid vaobj name in a core profile " - "context)", caller); + "%s(zero is not valid vaobj name%s)", + caller, + is_ext_dsa ? "" : " in a core profile context"); return NULL; } @@ -267,12 +273,23 @@ * [compatibility profile: zero or] the name of an existing * vertex array object." */ - if (!vao || !vao->EverBound) { + if (!vao || (!is_ext_dsa && !vao->EverBound)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-existent vaobj=%u)", caller, id); return NULL; } + /* The EXT_direct_state_access specification says: + * + * "If the vertex array object named by the vaobj parameter has not + * been previously bound but has been generated (without subsequent + * deletion) by GenVertexArrays, the GL first creates a new state + * vector in the same manner as when BindVertexArray creates a new + * vertex array object." + */ + if (vao && is_ext_dsa && !vao->EverBound) + vao->EverBound = true; + _mesa_reference_vao(ctx, &ctx->Array.LastLookedUpVAO, vao); } @@ -1273,7 +1290,7 @@ * VertexArrayElementBuffer if is not [compatibility profile: * zero or] the name of an existing vertex array object." */ - vao =_mesa_lookup_vao_err(ctx, vaobj, "glVertexArrayElementBuffer"); + vao =_mesa_lookup_vao_err(ctx, vaobj, false, "glVertexArrayElementBuffer"); if (!vao) return; } else { @@ -1333,7 +1350,7 @@ * [compatibility profile: zero or] the name of an existing * vertex array object." */ - vao =_mesa_lookup_vao_err(ctx, vaobj, "glGetVertexArrayiv"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glGetVertexArrayiv"); if (!vao) return; diff -Nru mesa-19.2.8/src/mesa/main/arrayobj.h mesa-20.0.8/src/mesa/main/arrayobj.h --- mesa-19.2.8/src/mesa/main/arrayobj.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/arrayobj.h 2020-06-12 01:21:18.000000000 +0000 @@ -50,7 +50,8 @@ _mesa_lookup_vao(struct gl_context *ctx, GLuint id); extern struct gl_vertex_array_object * -_mesa_lookup_vao_err(struct gl_context *ctx, GLuint id, const char *caller); +_mesa_lookup_vao_err(struct gl_context *ctx, GLuint id, + bool is_ext_dsa, const char *caller); extern struct gl_vertex_array_object * _mesa_new_vao(struct gl_context *ctx, GLuint name); diff -Nru mesa-19.2.8/src/mesa/main/attrib.c mesa-20.0.8/src/mesa/main/attrib.c --- mesa-19.2.8/src/mesa/main/attrib.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/attrib.c 2020-06-12 01:21:18.000000000 +0000 @@ -43,6 +43,7 @@ #include "macros.h" #include "matrix.h" #include "multisample.h" +#include "pixelstore.h" #include "points.h" #include "polygon.h" #include "shared.h" @@ -1852,6 +1853,80 @@ } } +void GLAPIENTRY +_mesa_ClientAttribDefaultEXT( GLbitfield mask ) +{ + if (mask & GL_CLIENT_PIXEL_STORE_BIT) { + _mesa_PixelStorei(GL_UNPACK_SWAP_BYTES, GL_FALSE); + _mesa_PixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE); + _mesa_PixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0); + _mesa_PixelStorei(GL_UNPACK_SKIP_IMAGES, 0); + _mesa_PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + _mesa_PixelStorei(GL_UNPACK_SKIP_ROWS, 0); + _mesa_PixelStorei(GL_UNPACK_SKIP_PIXELS, 0); + _mesa_PixelStorei(GL_UNPACK_ALIGNMENT, 4); + _mesa_PixelStorei(GL_PACK_SWAP_BYTES, GL_FALSE); + _mesa_PixelStorei(GL_PACK_LSB_FIRST, GL_FALSE); + _mesa_PixelStorei(GL_PACK_IMAGE_HEIGHT, 0); + _mesa_PixelStorei(GL_PACK_SKIP_IMAGES, 0); + _mesa_PixelStorei(GL_PACK_ROW_LENGTH, 0); + _mesa_PixelStorei(GL_PACK_SKIP_ROWS, 0); + _mesa_PixelStorei(GL_PACK_SKIP_PIXELS, 0); + _mesa_PixelStorei(GL_PACK_ALIGNMENT, 4); + + _mesa_BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + _mesa_BindBuffer(GL_PIXEL_PACK_BUFFER, 0); + } + if (mask & GL_CLIENT_VERTEX_ARRAY_BIT) { + GET_CURRENT_CONTEXT(ctx); + int i; + + _mesa_BindBuffer(GL_ARRAY_BUFFER, 0); + _mesa_BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + + _mesa_DisableClientState(GL_EDGE_FLAG_ARRAY); + _mesa_EdgeFlagPointer(0, 0); + + _mesa_DisableClientState(GL_INDEX_ARRAY); + _mesa_IndexPointer(GL_FLOAT, 0, 0); + + _mesa_DisableClientState(GL_SECONDARY_COLOR_ARRAY); + _mesa_SecondaryColorPointer(4, GL_FLOAT, 0, 0); + + _mesa_DisableClientState(GL_FOG_COORD_ARRAY); + _mesa_FogCoordPointer(GL_FLOAT, 0, 0); + + for (i = 0; i < ctx->Const.MaxTextureCoordUnits; i++) { + _mesa_ClientActiveTexture(GL_TEXTURE0 + i); + _mesa_DisableClientState(GL_TEXTURE_COORD_ARRAY); + _mesa_TexCoordPointer(4, GL_FLOAT, 0, 0); + } + + _mesa_DisableClientState(GL_COLOR_ARRAY); + _mesa_ColorPointer(4, GL_FLOAT, 0, 0); + + _mesa_DisableClientState(GL_NORMAL_ARRAY); + _mesa_NormalPointer(GL_FLOAT, 0, 0); + + _mesa_DisableClientState(GL_VERTEX_ARRAY); + _mesa_VertexPointer(4, GL_FLOAT, 0, 0); + + for (i = 0; i < ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs; i++) { + _mesa_DisableVertexAttribArray(i); + _mesa_VertexAttribPointer(i, 4, GL_FLOAT, GL_FALSE, 0, 0); + } + + _mesa_ClientActiveTexture(GL_TEXTURE0); + } +} + +void GLAPIENTRY +_mesa_PushClientAttribDefaultEXT( GLbitfield mask ) +{ + _mesa_PushClientAttrib(mask); + _mesa_ClientAttribDefaultEXT(mask); +} + /** * Free any attribute state data that might be attached to the context. diff -Nru mesa-19.2.8/src/mesa/main/attrib.h mesa-20.0.8/src/mesa/main/attrib.h --- mesa-19.2.8/src/mesa/main/attrib.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/attrib.h 2020-06-12 01:21:18.000000000 +0000 @@ -43,6 +43,12 @@ extern void GLAPIENTRY _mesa_PopClientAttrib( void ); +extern void GLAPIENTRY +_mesa_ClientAttribDefaultEXT( GLbitfield mask ); + +extern void GLAPIENTRY +_mesa_PushClientAttribDefaultEXT( GLbitfield mask ); + extern void _mesa_init_attrib( struct gl_context *ctx ); diff -Nru mesa-19.2.8/src/mesa/main/bufferobj.c mesa-20.0.8/src/mesa/main/bufferobj.c --- mesa-19.2.8/src/mesa/main/bufferobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/bufferobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -2574,6 +2574,22 @@ void GLAPIENTRY +_mesa_ClearNamedBufferDataEXT(GLuint buffer, GLenum internalformat, + GLenum format, GLenum type, const GLvoid *data) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer); + if (!_mesa_handle_bind_buffer_gen(ctx, buffer, + &bufObj, "glClearNamedBufferDataEXT")) + return; + + clear_buffer_sub_data_error(ctx, bufObj, internalformat, 0, bufObj->Size, + format, type, data, "glClearNamedBufferDataEXT", + false); +} + + +void GLAPIENTRY _mesa_ClearBufferSubData_no_error(GLenum target, GLenum internalformat, GLintptr offset, GLsizeiptr size, GLenum format, GLenum type, @@ -2641,6 +2657,23 @@ true); } +void GLAPIENTRY +_mesa_ClearNamedBufferSubDataEXT(GLuint buffer, GLenum internalformat, + GLintptr offset, GLsizeiptr size, + GLenum format, GLenum type, + const GLvoid *data) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_buffer_object *bufObj = _mesa_lookup_bufferobj(ctx, buffer); + if (!_mesa_handle_bind_buffer_gen(ctx, buffer, + &bufObj, "glClearNamedBufferSubDataEXT")) + return; + + clear_buffer_sub_data_error(ctx, bufObj, internalformat, offset, size, + format, type, data, "glClearNamedBufferSubDataEXT", + true); +} + static GLboolean unmap_buffer(struct gl_context *ctx, struct gl_buffer_object *bufObj) { @@ -3091,6 +3124,30 @@ } void GLAPIENTRY +_mesa_NamedCopyBufferSubDataEXT(GLuint readBuffer, GLuint writeBuffer, + GLintptr readOffset, GLintptr writeOffset, + GLsizeiptr size) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_buffer_object *src, *dst; + + src = _mesa_lookup_bufferobj(ctx, readBuffer); + if (!_mesa_handle_bind_buffer_gen(ctx, readBuffer, + &src, + "glNamedCopyBufferSubDataEXT")) + return; + + dst = _mesa_lookup_bufferobj(ctx, writeBuffer); + if (!_mesa_handle_bind_buffer_gen(ctx, writeBuffer, + &dst, + "glNamedCopyBufferSubDataEXT")) + return; + + copy_buffer_sub_data(ctx, src, dst, readOffset, writeOffset, size, + "glNamedCopyBufferSubDataEXT"); +} + +void GLAPIENTRY _mesa_CopyNamedBufferSubData_no_error(GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size) @@ -4897,3 +4954,23 @@ buffer_page_commitment(ctx, bufferObj, offset, size, commit, "glNamedBufferPageCommitmentARB"); } + +void GLAPIENTRY +_mesa_NamedBufferPageCommitmentEXT(GLuint buffer, GLintptr offset, + GLsizeiptr size, GLboolean commit) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_buffer_object *bufferObj; + + /* Use NamedBuffer* functions logic from EXT_direct_state_access */ + if (buffer != 0) { + bufferObj = _mesa_lookup_bufferobj(ctx, buffer); + if (!_mesa_handle_bind_buffer_gen(ctx, buffer, &bufferObj, + "glNamedBufferPageCommitmentEXT")) + return; + } else { + bufferObj = ctx->Shared->NullBufferObj; + } + buffer_page_commitment(ctx, bufferObj, offset, size, commit, + "glNamedBufferPageCommitmentEXT"); +} diff -Nru mesa-19.2.8/src/mesa/main/bufferobj.h mesa-20.0.8/src/mesa/main/bufferobj.h --- mesa-19.2.8/src/mesa/main/bufferobj.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/bufferobj.h 2020-06-12 01:21:18.000000000 +0000 @@ -278,6 +278,11 @@ const GLvoid *data); void GLAPIENTRY +_mesa_ClearNamedBufferDataEXT(GLuint buffer, GLenum internalformat, + GLenum format, GLenum type, + const GLvoid *data); + +void GLAPIENTRY _mesa_ClearBufferSubData_no_error(GLenum target, GLenum internalformat, GLintptr offset, GLsizeiptr size, GLenum format, GLenum type, @@ -301,6 +306,12 @@ GLenum format, GLenum type, const GLvoid *data); +void GLAPIENTRY +_mesa_ClearNamedBufferSubDataEXT(GLuint buffer, GLenum internalformat, + GLintptr offset, GLsizeiptr size, + GLenum format, GLenum type, + const GLvoid *data); + GLboolean GLAPIENTRY _mesa_UnmapBuffer_no_error(GLenum target); GLboolean GLAPIENTRY @@ -346,6 +357,11 @@ GLsizeiptr size); void GLAPIENTRY +_mesa_NamedCopyBufferSubDataEXT(GLuint readBuffer, GLuint writeBuffer, + GLintptr readOffset, GLintptr writeOffset, + GLsizeiptr size); + +void GLAPIENTRY _mesa_CopyNamedBufferSubData_no_error(GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); @@ -441,4 +457,8 @@ _mesa_NamedBufferPageCommitmentARB(GLuint buffer, GLintptr offset, GLsizeiptr size, GLboolean commit); +void GLAPIENTRY +_mesa_NamedBufferPageCommitmentEXT(GLuint buffer, GLintptr offset, + GLsizeiptr size, GLboolean commit); + #endif diff -Nru mesa-19.2.8/src/mesa/main/buffers.c mesa-20.0.8/src/mesa/main/buffers.c --- mesa-19.2.8/src/mesa/main/buffers.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/buffers.c 2020-06-12 01:21:18.000000000 +0000 @@ -85,8 +85,9 @@ return mask; } -static GLenum -back_to_front_if_single_buffered(const struct gl_context *ctx, GLenum buffer) +GLenum +_mesa_back_to_front_if_single_buffered(const struct gl_framebuffer *fb, + GLenum buffer) { /* If the front buffer is the only buffer, GL_BACK and all other flags * that include BACK select the front buffer for drawing. There are @@ -110,7 +111,7 @@ * but they are front buffers from the Mesa point of view, * because they are always single buffered. */ - if (!ctx->DrawBuffer->Visual.doubleBufferMode) { + if (!fb->Visual.doubleBufferMode) { switch (buffer) { case GL_BACK: buffer = GL_FRONT; @@ -135,7 +136,7 @@ static GLbitfield draw_buffer_enum_to_bitmask(const struct gl_context *ctx, GLenum buffer) { - buffer = back_to_front_if_single_buffered(ctx, buffer); + buffer = _mesa_back_to_front_if_single_buffered(ctx->DrawBuffer, buffer); switch (buffer) { case GL_NONE: @@ -200,7 +201,7 @@ static gl_buffer_index read_buffer_enum_to_index(const struct gl_context *ctx, GLenum buffer) { - buffer = back_to_front_if_single_buffered(ctx, buffer); + buffer = _mesa_back_to_front_if_single_buffered(ctx->ReadBuffer, buffer); switch (buffer) { case GL_FRONT: diff -Nru mesa-19.2.8/src/mesa/main/buffers.h mesa-20.0.8/src/mesa/main/buffers.h --- mesa-19.2.8/src/mesa/main/buffers.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/buffers.h 2020-06-12 01:21:18.000000000 +0000 @@ -78,6 +78,10 @@ extern void _mesa_update_draw_buffers(struct gl_context *ctx); +extern GLenum +_mesa_back_to_front_if_single_buffered(const struct gl_framebuffer *fb, + GLenum buffer); + void GLAPIENTRY _mesa_ReadBuffer_no_error(GLenum mode); diff -Nru mesa-19.2.8/src/mesa/main/clear.c mesa-20.0.8/src/mesa/main/clear.c --- mesa-19.2.8/src/mesa/main/clear.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/clear.c 2020-06-12 01:21:18.000000000 +0000 @@ -203,17 +203,17 @@ } if ((mask & GL_DEPTH_BUFFER_BIT) - && ctx->DrawBuffer->Visual.haveDepthBuffer) { + && ctx->DrawBuffer->Visual.depthBits > 0) { bufferMask |= BUFFER_BIT_DEPTH; } if ((mask & GL_STENCIL_BUFFER_BIT) - && ctx->DrawBuffer->Visual.haveStencilBuffer) { + && ctx->DrawBuffer->Visual.stencilBits > 0) { bufferMask |= BUFFER_BIT_STENCIL; } if ((mask & GL_ACCUM_BUFFER_BIT) - && ctx->DrawBuffer->Visual.haveAccumBuffer) { + && ctx->DrawBuffer->Visual.accumRedBits > 0) { bufferMask |= BUFFER_BIT_ACCUM; } diff -Nru mesa-19.2.8/src/mesa/main/compute.c mesa-20.0.8/src/mesa/main/compute.c --- mesa-19.2.8/src/mesa/main/compute.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/compute.c 2020-06-12 01:21:18.000000000 +0000 @@ -103,8 +103,6 @@ const GLuint *num_groups, const GLuint *group_size) { - GLuint total_invocations = 1; - if (!check_valid_to_compute(ctx, "glDispatchComputeGroupSizeARB")) return GL_FALSE; @@ -153,8 +151,6 @@ "glDispatchComputeGroupSizeARB(group_size_%c)", 'x' + i); return GL_FALSE; } - - total_invocations *= group_size[i]; } /* The ARB_compute_variable_group_size spec says: @@ -165,11 +161,19 @@ * for compute shaders with variable group size * (MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB)." */ + uint64_t total_invocations = group_size[0] * group_size[1]; + if (total_invocations <= UINT32_MAX) { + /* Only bother multiplying the third value if total still fits in + * 32-bit, since MaxComputeVariableGroupInvocations is also 32-bit. + */ + total_invocations *= group_size[2]; + } if (total_invocations > ctx->Const.MaxComputeVariableGroupInvocations) { _mesa_error(ctx, GL_INVALID_VALUE, "glDispatchComputeGroupSizeARB(product of local_sizes " "exceeds MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB " - "(%d > %d))", total_invocations, + "(%u * %u * %u > %u))", + group_size[0], group_size[1], group_size[2], ctx->Const.MaxComputeVariableGroupInvocations); return GL_FALSE; } diff -Nru mesa-19.2.8/src/mesa/main/config.h mesa-20.0.8/src/mesa/main/config.h --- mesa-19.2.8/src/mesa/main/config.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/config.h 2020-06-12 01:21:18.000000000 +0000 @@ -31,6 +31,7 @@ #ifndef MESA_CONFIG_H_INCLUDED #define MESA_CONFIG_H_INCLUDED +#include "compiler/shader_enums.h" /** * \name OpenGL implementation limits @@ -224,12 +225,6 @@ /*@}*/ -/** For GL_ARB_draw_buffers */ -/*@{*/ -#define MAX_DRAW_BUFFERS 8 -/*@}*/ - - /** For GL_EXT_framebuffer_object */ /*@{*/ #define MAX_COLOR_ATTACHMENTS 8 diff -Nru mesa-19.2.8/src/mesa/main/conservativeraster.c mesa-20.0.8/src/mesa/main/conservativeraster.c --- mesa-19.2.8/src/mesa/main/conservativeraster.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/conservativeraster.c 2020-06-12 01:21:18.000000000 +0000 @@ -59,6 +59,11 @@ _mesa_error(ctx, GL_INVALID_VALUE, "%s(param=%g)", func, param); return; } + + FLUSH_VERTICES(ctx, 0); + ctx->NewDriverState |= + ctx->DriverFlags.NewNvConservativeRasterizationParams; + ctx->ConservativeRasterDilate = CLAMP(param, ctx->Const.ConservativeRasterDilateRange[0], @@ -74,6 +79,11 @@ "%s(pname=%s)", func, _mesa_enum_to_string(param)); return; } + + FLUSH_VERTICES(ctx, 0); + ctx->NewDriverState |= + ctx->DriverFlags.NewNvConservativeRasterizationParams; + ctx->ConservativeRasterMode = param; break; default: @@ -81,10 +91,6 @@ break; } - FLUSH_VERTICES(ctx, 0); - ctx->NewDriverState |= - ctx->DriverFlags.NewNvConservativeRasterizationParams; - return; invalid_pname_enum: if (!no_error) diff -Nru mesa-19.2.8/src/mesa/main/context.c mesa-20.0.8/src/mesa/main/context.c --- mesa-19.2.8/src/mesa/main/context.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/context.c 2020-06-12 01:21:18.000000000 +0000 @@ -149,6 +149,7 @@ #endif #include "compiler/glsl_types.h" +#include "compiler/glsl/builtin_functions.h" #include "compiler/glsl/glsl_parser_extras.h" #include @@ -204,7 +205,6 @@ * \param stencilBits requested minimum bits per stencil buffer value * \param accumRedBits, accumGreenBits, accumBlueBits, accumAlphaBits number * of bits per color component in accum buffer. - * \param indexBits number of bits per pixel if \p rgbFlag is GL_FALSE * \param redBits number of bits per color component in frame buffer for RGB(A) * mode. We always use 8 in core Mesa though. * \param greenBits same as above. @@ -287,7 +287,6 @@ assert(accumBlueBits >= 0); assert(accumAlphaBits >= 0); - vis->rgbMode = GL_TRUE; vis->doubleBufferMode = dbFlag; vis->stereoMode = stereoFlag; @@ -297,7 +296,6 @@ vis->alphaBits = alphaBits; vis->rgbBits = redBits + greenBits + blueBits; - vis->indexBits = 0; vis->depthBits = depthBits; vis->stencilBits = stencilBits; @@ -306,10 +304,6 @@ vis->accumBlueBits = accumBlueBits; vis->accumAlphaBits = accumAlphaBits; - vis->haveAccumBuffer = accumRedBits > 0; - vis->haveDepthBuffer = depthBits > 0; - vis->haveStencilBuffer = stencilBits > 0; - vis->numAuxBuffers = 0; vis->level = 0; vis->sampleBuffers = numSamples > 0 ? 1 : 0; @@ -360,7 +354,7 @@ static void one_time_fini(void) { - _mesa_destroy_shader_compiler(); + glsl_type_singleton_decref(); _mesa_locale_fini(); } @@ -408,6 +402,11 @@ _mesa_debug(ctx, "Mesa " PACKAGE_VERSION " DEBUG build" MESA_GIT_SHA1 "\n"); } #endif + + /* Take a glsl type reference for the duration of libGL's life to avoid + * unecessary creation/destruction of glsl types. + */ + glsl_type_singleton_init_or_ref(); } /* per-API one-time init */ @@ -627,6 +626,8 @@ consts->GLSLVersion = api == API_OPENGL_CORE ? 130 : 120; consts->GLSLVersionCompat = consts->GLSLVersion; + consts->GLSLLowerConstArrays = true; + /* Assume that if GLSL 1.30+ (or GLSL ES 3.00+) is supported that * gl_VertexID is implemented using a native hardware register with OpenGL * semantics. @@ -1205,8 +1206,6 @@ /* misc one-time initializations */ one_time_init(ctx); - _mesa_init_shader_compiler_types(); - /* Plug in driver functions and context pointer here. * This is important because when we call alloc_shared_state() below * we'll call ctx->Driver.NewTextureObject() to create the default @@ -1319,7 +1318,7 @@ * \sa _mesa_initialize_context() and init_attrib_groups(). */ void -_mesa_free_context_data(struct gl_context *ctx, bool destroy_compiler_types) +_mesa_free_context_data(struct gl_context *ctx) { if (!_mesa_get_current_context()){ /* No current context, but we may need one in order to delete @@ -1394,9 +1393,6 @@ free(ctx->VersionString); - if (destroy_compiler_types) - _mesa_destroy_shader_compiler_types(); - ralloc_free(ctx->SoftFP64); /* unbind the context if it's currently bound */ @@ -1404,6 +1400,12 @@ _mesa_make_current(NULL, NULL, NULL); } + /* Do this after unbinding context to ensure any thread is finished. */ + if (ctx->shader_builtin_ref) { + _mesa_glsl_builtin_functions_decref(); + ctx->shader_builtin_ref = false; + } + free(ctx->Const.SpirVExtensions); } @@ -1419,7 +1421,7 @@ _mesa_destroy_context( struct gl_context *ctx ) { if (ctx) { - _mesa_free_context_data(ctx, true); + _mesa_free_context_data(ctx); free( (void *) ctx ); } } @@ -1557,9 +1559,12 @@ ctxvis->foo != bufvis->foo) \ return GL_FALSE - check_component(redMask); - check_component(greenMask); - check_component(blueMask); + check_component(redShift); + check_component(greenShift); + check_component(blueShift); + check_component(redBits); + check_component(greenBits); + check_component(blueBits); check_component(depthBits); check_component(stencilBits); diff -Nru mesa-19.2.8/src/mesa/main/context.h mesa-20.0.8/src/mesa/main/context.h --- mesa-19.2.8/src/mesa/main/context.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/context.h 2020-06-12 01:21:18.000000000 +0000 @@ -115,7 +115,7 @@ const struct dd_function_table *driverFunctions); extern void -_mesa_free_context_data(struct gl_context *ctx, bool destroy_compiler_types); +_mesa_free_context_data(struct gl_context *ctx); extern void _mesa_destroy_context( struct gl_context *ctx ); diff -Nru mesa-19.2.8/src/mesa/main/dd.h mesa-20.0.8/src/mesa/main/dd.h --- mesa-19.2.8/src/mesa/main/dd.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/dd.h 2020-06-12 01:21:18.000000000 +0000 @@ -975,6 +975,13 @@ void *image_handle); /** + * \name GL_EXT_EGL_image_storage interface + */ + void (*EGLImageTargetTexStorage)(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj, + struct gl_texture_image *texImage, + GLeglImageOES image_handle); + /** * \name GL_EXT_transform_feedback interface */ struct gl_transform_feedback_object * diff -Nru mesa-19.2.8/src/mesa/main/debug.c mesa-20.0.8/src/mesa/main/debug.c --- mesa-19.2.8/src/mesa/main/debug.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/debug.c 2020-06-12 01:21:18.000000000 +0000 @@ -625,8 +625,7 @@ case MESA_FORMAT_I_UNORM8: c = 1; break; - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: + case MESA_FORMAT_LA_UNORM8: c = 2; break; case MESA_FORMAT_BGR_UNORM8: diff -Nru mesa-19.2.8/src/mesa/main/dlist.c mesa-20.0.8/src/mesa/main/dlist.c --- mesa-19.2.8/src/mesa/main/dlist.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/dlist.c 2020-06-12 01:21:18.000000000 +0000 @@ -393,6 +393,40 @@ OPCODE_UNIFORM_MATRIX34D, OPCODE_UNIFORM_MATRIX43D, + /* GL_ARB_gpu_shader_int64 */ + OPCODE_UNIFORM_1I64, + OPCODE_UNIFORM_2I64, + OPCODE_UNIFORM_3I64, + OPCODE_UNIFORM_4I64, + OPCODE_UNIFORM_1I64V, + OPCODE_UNIFORM_2I64V, + OPCODE_UNIFORM_3I64V, + OPCODE_UNIFORM_4I64V, + OPCODE_UNIFORM_1UI64, + OPCODE_UNIFORM_2UI64, + OPCODE_UNIFORM_3UI64, + OPCODE_UNIFORM_4UI64, + OPCODE_UNIFORM_1UI64V, + OPCODE_UNIFORM_2UI64V, + OPCODE_UNIFORM_3UI64V, + OPCODE_UNIFORM_4UI64V, + OPCODE_PROGRAM_UNIFORM_1I64, + OPCODE_PROGRAM_UNIFORM_2I64, + OPCODE_PROGRAM_UNIFORM_3I64, + OPCODE_PROGRAM_UNIFORM_4I64, + OPCODE_PROGRAM_UNIFORM_1I64V, + OPCODE_PROGRAM_UNIFORM_2I64V, + OPCODE_PROGRAM_UNIFORM_3I64V, + OPCODE_PROGRAM_UNIFORM_4I64V, + OPCODE_PROGRAM_UNIFORM_1UI64, + OPCODE_PROGRAM_UNIFORM_2UI64, + OPCODE_PROGRAM_UNIFORM_3UI64, + OPCODE_PROGRAM_UNIFORM_4UI64, + OPCODE_PROGRAM_UNIFORM_1UI64V, + OPCODE_PROGRAM_UNIFORM_2UI64V, + OPCODE_PROGRAM_UNIFORM_3UI64V, + OPCODE_PROGRAM_UNIFORM_4UI64V, + /* OpenGL 4.0 / GL_ARB_tessellation_shader */ OPCODE_PATCH_PARAMETER_I, OPCODE_PATCH_PARAMETER_FV_INNER, @@ -570,6 +604,8 @@ OPCODE_MATRIX_POP, OPCODE_TEXTUREPARAMETER_F, OPCODE_TEXTUREPARAMETER_I, + OPCODE_TEXTUREPARAMETER_II, + OPCODE_TEXTUREPARAMETER_IUI, OPCODE_TEXTURE_IMAGE1D, OPCODE_TEXTURE_IMAGE2D, OPCODE_TEXTURE_IMAGE3D, @@ -584,6 +620,8 @@ OPCODE_BIND_MULTITEXTURE, OPCODE_MULTITEXPARAMETER_F, OPCODE_MULTITEXPARAMETER_I, + OPCODE_MULTITEXPARAMETER_II, + OPCODE_MULTITEXPARAMETER_IUI, OPCODE_MULTITEX_IMAGE1D, OPCODE_MULTITEX_IMAGE2D, OPCODE_MULTITEX_IMAGE3D, @@ -608,6 +646,8 @@ OPCODE_COMPRESSED_MULTITEX_SUB_IMAGE_1D, OPCODE_COMPRESSED_MULTITEX_SUB_IMAGE_2D, OPCODE_COMPRESSED_MULTITEX_SUB_IMAGE_3D, + OPCODE_NAMED_PROGRAM_STRING, + OPCODE_NAMED_PROGRAM_LOCAL_PARAMETER, /* The following three are meta instructions */ OPCODE_ERROR, /* raise compiled-in error */ @@ -719,6 +759,11 @@ GLuint uint32[2]; }; +union int64_pair +{ + GLint64 int64; + GLint int32[2]; +}; #define ASSIGN_DOUBLE_TO_NODES(n, idx, value) \ do { \ @@ -728,6 +773,21 @@ n[idx+1].ui = tmp.uint32[1]; \ } while (0) +#define ASSIGN_UINT64_TO_NODES(n, idx, value) \ + do { \ + union uint64_pair tmp; \ + tmp.uint64 = value; \ + n[idx].ui = tmp.uint32[0]; \ + n[idx+1].ui = tmp.uint32[1]; \ + } while (0) + +#define ASSIGN_INT64_TO_NODES(n, idx, value) \ + do { \ + union int64_pair tmp; \ + tmp.int64 = value; \ + n[idx].i = tmp.int32[0]; \ + n[idx+1].i = tmp.int32[1]; \ + } while (0) /** * How many nodes to allocate at a time. Note that bulk vertex data @@ -941,9 +1001,14 @@ goto out_of_memory; } - _mesa_init_teximage_fields(ctx, atlas->texImage, - atlas->texWidth, atlas->texHeight, 1, 0, - GL_ALPHA, MESA_FORMAT_A_UNORM8); + if (ctx->Const.BitmapUsesRed) + _mesa_init_teximage_fields(ctx, atlas->texImage, + atlas->texWidth, atlas->texHeight, 1, 0, + GL_RED, MESA_FORMAT_R_UNORM8); + else + _mesa_init_teximage_fields(ctx, atlas->texImage, + atlas->texWidth, atlas->texHeight, 1, 0, + GL_ALPHA, MESA_FORMAT_A_UNORM8); /* alloc image storage */ if (!ctx->Driver.AllocTextureImageBuffer(ctx, atlas->texImage)) { @@ -1172,6 +1237,14 @@ case OPCODE_UNIFORM_2UIV: case OPCODE_UNIFORM_3UIV: case OPCODE_UNIFORM_4UIV: + case OPCODE_UNIFORM_1I64V: + case OPCODE_UNIFORM_2I64V: + case OPCODE_UNIFORM_3I64V: + case OPCODE_UNIFORM_4I64V: + case OPCODE_UNIFORM_1UI64V: + case OPCODE_UNIFORM_2UI64V: + case OPCODE_UNIFORM_3UI64V: + case OPCODE_UNIFORM_4UI64V: free(get_pointer(&n[3])); break; case OPCODE_UNIFORM_MATRIX22: @@ -1210,6 +1283,14 @@ case OPCODE_PROGRAM_UNIFORM_2UIV: case OPCODE_PROGRAM_UNIFORM_3UIV: case OPCODE_PROGRAM_UNIFORM_4UIV: + case OPCODE_PROGRAM_UNIFORM_1I64V: + case OPCODE_PROGRAM_UNIFORM_2I64V: + case OPCODE_PROGRAM_UNIFORM_3I64V: + case OPCODE_PROGRAM_UNIFORM_4I64V: + case OPCODE_PROGRAM_UNIFORM_1UI64V: + case OPCODE_PROGRAM_UNIFORM_2UI64V: + case OPCODE_PROGRAM_UNIFORM_3UI64V: + case OPCODE_PROGRAM_UNIFORM_4UI64V: free(get_pointer(&n[4])); break; case OPCODE_PROGRAM_UNIFORM_MATRIX22F: @@ -1284,7 +1365,9 @@ case OPCODE_COMPRESSED_MULTITEX_IMAGE_3D: free(get_pointer(&n[10])); break; - + case OPCODE_NAMED_PROGRAM_STRING: + free(get_pointer(&n[5])); + break; case OPCODE_CONTINUE: n = (Node *) get_pointer(&n[1]); free(block); @@ -7830,629 +7913,1215 @@ } } - static void GLAPIENTRY -save_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program) +save_Uniform1i64ARB(GLint location, GLint64 x) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_USE_PROGRAM_STAGES, 3); + n = alloc_instruction(ctx, OPCODE_UNIFORM_1I64, 3); if (n) { - n[1].ui = pipeline; - n[2].ui = stages; - n[3].ui = program; + n[1].i = location; + ASSIGN_INT64_TO_NODES(n, 2, x); } if (ctx->ExecuteFlag) { - CALL_UseProgramStages(ctx->Exec, (pipeline, stages, program)); + CALL_Uniform1i64ARB(ctx->Exec, (location, x)); } } static void GLAPIENTRY -save_ProgramUniform1f(GLuint program, GLint location, GLfloat x) +save_Uniform2i64ARB(GLint location, GLint64 x, GLint64 y) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1F, 3); + n = alloc_instruction(ctx, OPCODE_UNIFORM_2I64, 5); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].f = x; + n[1].i = location; + ASSIGN_INT64_TO_NODES(n, 2, x); + ASSIGN_INT64_TO_NODES(n, 4, y); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1f(ctx->Exec, (program, location, x)); + CALL_Uniform2i64ARB(ctx->Exec, (location, x, y)); } } static void GLAPIENTRY -save_ProgramUniform2f(GLuint program, GLint location, GLfloat x, GLfloat y) +save_Uniform3i64ARB(GLint location, GLint64 x, GLint64 y, GLint64 z) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2F, 4); + n = alloc_instruction(ctx, OPCODE_UNIFORM_3I64, 7); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].f = x; - n[4].f = y; + n[1].i = location; + ASSIGN_INT64_TO_NODES(n, 2, x); + ASSIGN_INT64_TO_NODES(n, 4, y); + ASSIGN_INT64_TO_NODES(n, 6, z); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2f(ctx->Exec, (program, location, x, y)); + CALL_Uniform3i64ARB(ctx->Exec, (location, x, y, z)); } } static void GLAPIENTRY -save_ProgramUniform3f(GLuint program, GLint location, - GLfloat x, GLfloat y, GLfloat z) +save_Uniform4i64ARB(GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3F, 5); + n = alloc_instruction(ctx, OPCODE_UNIFORM_4I64, 9); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].f = x; - n[4].f = y; - n[5].f = z; + n[1].i = location; + ASSIGN_INT64_TO_NODES(n, 2, x); + ASSIGN_INT64_TO_NODES(n, 4, y); + ASSIGN_INT64_TO_NODES(n, 6, z); + ASSIGN_INT64_TO_NODES(n, 8, w); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3f(ctx->Exec, (program, location, x, y, z)); + CALL_Uniform4i64ARB(ctx->Exec, (location, x, y, z, w)); } } static void GLAPIENTRY -save_ProgramUniform4f(GLuint program, GLint location, - GLfloat x, GLfloat y, GLfloat z, GLfloat w) +save_Uniform1i64vARB(GLint location, GLsizei count, const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4F, 6); + n = alloc_instruction(ctx, OPCODE_UNIFORM_1I64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].f = x; - n[4].f = y; - n[5].f = z; - n[6].f = w; + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 1 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4f(ctx->Exec, (program, location, x, y, z, w)); + CALL_Uniform1i64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform1fv(GLuint program, GLint location, GLsizei count, - const GLfloat *v) +save_Uniform2i64vARB(GLint location, GLsizei count, const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1FV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_2I64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLfloat))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 2 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1fv(ctx->Exec, (program, location, count, v)); + CALL_Uniform2i64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform2fv(GLuint program, GLint location, GLsizei count, - const GLfloat *v) +save_Uniform3i64vARB(GLint location, GLsizei count, const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2FV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_3I64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLfloat))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 3 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2fv(ctx->Exec, (program, location, count, v)); + CALL_Uniform3i64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform3fv(GLuint program, GLint location, GLsizei count, - const GLfloat *v) +save_Uniform4i64vARB(GLint location, GLsizei count, const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3FV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_4I64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLfloat))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 4 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3fv(ctx->Exec, (program, location, count, v)); + CALL_Uniform4i64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform4fv(GLuint program, GLint location, GLsizei count, - const GLfloat *v) +save_Uniform1ui64ARB(GLint location, GLuint64 x) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4FV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_1UI64, 3); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLfloat))); + n[1].i = location; + ASSIGN_UINT64_TO_NODES(n, 2, x); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4fv(ctx->Exec, (program, location, count, v)); + CALL_Uniform1ui64ARB(ctx->Exec, (location, x)); } } static void GLAPIENTRY -save_ProgramUniform1d(GLuint program, GLint location, GLdouble x) +save_Uniform2ui64ARB(GLint location, GLuint64 x, GLuint64 y) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1D, 4); + n = alloc_instruction(ctx, OPCODE_UNIFORM_2UI64, 5); if (n) { - n[1].ui = program; - n[2].i = location; - ASSIGN_DOUBLE_TO_NODES(n, 3, x); + n[1].i = location; + ASSIGN_UINT64_TO_NODES(n, 2, x); + ASSIGN_UINT64_TO_NODES(n, 4, y); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1d(ctx->Exec, (program, location, x)); + CALL_Uniform2ui64ARB(ctx->Exec, (location, x, y)); } } static void GLAPIENTRY -save_ProgramUniform2d(GLuint program, GLint location, GLdouble x, GLdouble y) +save_Uniform3ui64ARB(GLint location, GLuint64 x, GLuint64 y, GLuint64 z) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2D, 6); + n = alloc_instruction(ctx, OPCODE_UNIFORM_3UI64, 7); if (n) { - n[1].ui = program; - n[2].i = location; - ASSIGN_DOUBLE_TO_NODES(n, 3, x); - ASSIGN_DOUBLE_TO_NODES(n, 5, y); + n[1].i = location; + ASSIGN_UINT64_TO_NODES(n, 2, x); + ASSIGN_UINT64_TO_NODES(n, 4, y); + ASSIGN_UINT64_TO_NODES(n, 6, z); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2d(ctx->Exec, (program, location, x, y)); + CALL_Uniform3ui64ARB(ctx->Exec, (location, x, y, z)); } } static void GLAPIENTRY -save_ProgramUniform3d(GLuint program, GLint location, - GLdouble x, GLdouble y, GLdouble z) +save_Uniform4ui64ARB(GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3D, 8); + n = alloc_instruction(ctx, OPCODE_UNIFORM_4UI64, 9); if (n) { - n[1].ui = program; - n[2].i = location; - ASSIGN_DOUBLE_TO_NODES(n, 3, x); - ASSIGN_DOUBLE_TO_NODES(n, 5, y); - ASSIGN_DOUBLE_TO_NODES(n, 7, z); + n[1].i = location; + ASSIGN_UINT64_TO_NODES(n, 2, x); + ASSIGN_UINT64_TO_NODES(n, 4, y); + ASSIGN_UINT64_TO_NODES(n, 6, z); + ASSIGN_UINT64_TO_NODES(n, 8, w); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3d(ctx->Exec, (program, location, x, y, z)); + CALL_Uniform4ui64ARB(ctx->Exec, (location, x, y, z, w)); } } static void GLAPIENTRY -save_ProgramUniform4d(GLuint program, GLint location, - GLdouble x, GLdouble y, GLdouble z, GLdouble w) +save_Uniform1ui64vARB(GLint location, GLsizei count, const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4D, 10); + n = alloc_instruction(ctx, OPCODE_UNIFORM_1UI64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - ASSIGN_DOUBLE_TO_NODES(n, 3, x); - ASSIGN_DOUBLE_TO_NODES(n, 5, y); - ASSIGN_DOUBLE_TO_NODES(n, 7, z); - ASSIGN_DOUBLE_TO_NODES(n, 9, w); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 1 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4d(ctx->Exec, (program, location, x, y, z, w)); + CALL_Uniform1ui64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform1dv(GLuint program, GLint location, GLsizei count, - const GLdouble *v) +save_Uniform2ui64vARB(GLint location, GLsizei count, const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1DV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_2UI64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLdouble))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 2 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1dv(ctx->Exec, (program, location, count, v)); + CALL_Uniform2ui64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform2dv(GLuint program, GLint location, GLsizei count, - const GLdouble *v) +save_Uniform3ui64vARB(GLint location, GLsizei count, const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2DV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_3UI64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLdouble))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 3 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2dv(ctx->Exec, (program, location, count, v)); + CALL_Uniform3ui64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform3dv(GLuint program, GLint location, GLsizei count, - const GLdouble *v) +save_Uniform4ui64vARB(GLint location, GLsizei count, const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3DV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_UNIFORM_4UI64V, 2 + POINTER_DWORDS); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLdouble))); + n[1].i = location; + n[2].i = count; + save_pointer(&n[3], memdup(v, count * 4 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3dv(ctx->Exec, (program, location, count, v)); + CALL_Uniform4ui64vARB(ctx->Exec, (location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform4dv(GLuint program, GLint location, GLsizei count, - const GLdouble *v) +save_ProgramUniform1i64ARB(GLuint program, GLint location, GLint64 x) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4DV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1I64, 4); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLdouble))); + ASSIGN_INT64_TO_NODES(n, 3, x); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4dv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform1i64ARB(ctx->Exec, (program, location, x)); } } static void GLAPIENTRY -save_ProgramUniform1i(GLuint program, GLint location, GLint x) +save_ProgramUniform2i64ARB(GLuint program, GLint location, GLint64 x, + GLint64 y) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1I, 3); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2I64, 6); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = x; + ASSIGN_INT64_TO_NODES(n, 3, x); + ASSIGN_INT64_TO_NODES(n, 5, y); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1i(ctx->Exec, (program, location, x)); + CALL_ProgramUniform2i64ARB(ctx->Exec, (program, location, x, y)); } } static void GLAPIENTRY -save_ProgramUniform2i(GLuint program, GLint location, GLint x, GLint y) +save_ProgramUniform3i64ARB(GLuint program, GLint location, GLint64 x, + GLint64 y, GLint64 z) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2I, 4); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3I64, 8); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = x; - n[4].i = y; + ASSIGN_INT64_TO_NODES(n, 3, x); + ASSIGN_INT64_TO_NODES(n, 5, y); + ASSIGN_INT64_TO_NODES(n, 7, z); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2i(ctx->Exec, (program, location, x, y)); + CALL_ProgramUniform3i64ARB(ctx->Exec, (program, location, x, y, z)); } } static void GLAPIENTRY -save_ProgramUniform3i(GLuint program, GLint location, - GLint x, GLint y, GLint z) +save_ProgramUniform4i64ARB(GLuint program, GLint location, GLint64 x, + GLint64 y, GLint64 z, GLint64 w) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3I, 5); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4I64, 10); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = x; - n[4].i = y; - n[5].i = z; + ASSIGN_INT64_TO_NODES(n, 3, x); + ASSIGN_INT64_TO_NODES(n, 5, y); + ASSIGN_INT64_TO_NODES(n, 7, z); + ASSIGN_INT64_TO_NODES(n, 9, w); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3i(ctx->Exec, (program, location, x, y, z)); + CALL_ProgramUniform4i64ARB(ctx->Exec, (program, location, x, y, z, w)); } } static void GLAPIENTRY -save_ProgramUniform4i(GLuint program, GLint location, - GLint x, GLint y, GLint z, GLint w) +save_ProgramUniform1i64vARB(GLuint program, GLint location, GLsizei count, + const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4I, 6); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1I64V, 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = x; - n[4].i = y; - n[5].i = z; - n[6].i = w; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4i(ctx->Exec, (program, location, x, y, z, w)); + CALL_ProgramUniform1i64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform1iv(GLuint program, GLint location, GLsizei count, - const GLint *v) +save_ProgramUniform2i64vARB(GLuint program, GLint location, GLsizei count, + const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1IV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2I64V, 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1iv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform2i64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform2iv(GLuint program, GLint location, GLsizei count, - const GLint *v) +save_ProgramUniform3i64vARB(GLuint program, GLint location, GLsizei count, + const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2IV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3I64V, 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2iv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform3i64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform3iv(GLuint program, GLint location, GLsizei count, - const GLint *v) +save_ProgramUniform4i64vARB(GLuint program, GLint location, GLsizei count, + const GLint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3IV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4I64V, 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3iv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform4i64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform4iv(GLuint program, GLint location, GLsizei count, - const GLint *v) +save_ProgramUniform1ui64ARB(GLuint program, GLint location, GLuint64 x) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4IV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UI64, 4); if (n) { n[1].ui = program; n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLint))); + ASSIGN_UINT64_TO_NODES(n, 3, x); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4iv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform1ui64ARB(ctx->Exec, (program, location, x)); } } static void GLAPIENTRY -save_ProgramUniform1ui(GLuint program, GLint location, GLuint x) +save_ProgramUniform2ui64ARB(GLuint program, GLint location, GLuint64 x, + GLuint64 y) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UI, 3); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UI64, 6); if (n) { n[1].ui = program; n[2].i = location; - n[3].ui = x; + ASSIGN_UINT64_TO_NODES(n, 3, x); + ASSIGN_UINT64_TO_NODES(n, 5, y); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1ui(ctx->Exec, (program, location, x)); + CALL_ProgramUniform2ui64ARB(ctx->Exec, (program, location, x, y)); } } static void GLAPIENTRY -save_ProgramUniform2ui(GLuint program, GLint location, GLuint x, GLuint y) +save_ProgramUniform3ui64ARB(GLuint program, GLint location, GLuint64 x, + GLuint64 y, GLuint64 z) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UI, 4); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UI64, 8); if (n) { n[1].ui = program; n[2].i = location; - n[3].ui = x; - n[4].ui = y; + ASSIGN_UINT64_TO_NODES(n, 3, x); + ASSIGN_UINT64_TO_NODES(n, 5, y); + ASSIGN_UINT64_TO_NODES(n, 7, z); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2ui(ctx->Exec, (program, location, x, y)); + CALL_ProgramUniform3ui64ARB(ctx->Exec, (program, location, x, y, z)); } } static void GLAPIENTRY -save_ProgramUniform3ui(GLuint program, GLint location, - GLuint x, GLuint y, GLuint z) +save_ProgramUniform4ui64ARB(GLuint program, GLint location, GLuint64 x, + GLuint64 y, GLuint64 z, GLuint64 w) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UI, 5); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UI64, 10); if (n) { n[1].ui = program; n[2].i = location; - n[3].ui = x; - n[4].ui = y; - n[5].ui = z; + ASSIGN_UINT64_TO_NODES(n, 3, x); + ASSIGN_UINT64_TO_NODES(n, 5, y); + ASSIGN_UINT64_TO_NODES(n, 7, z); + ASSIGN_UINT64_TO_NODES(n, 9, w); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3ui(ctx->Exec, (program, location, x, y, z)); + CALL_ProgramUniform4i64ARB(ctx->Exec, (program, location, x, y, z, w)); } } static void GLAPIENTRY -save_ProgramUniform4ui(GLuint program, GLint location, - GLuint x, GLuint y, GLuint z, GLuint w) +save_ProgramUniform1ui64vARB(GLuint program, GLint location, GLsizei count, + const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UI, 6); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UI64V, + 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; - n[3].ui = x; - n[4].ui = y; - n[5].ui = z; - n[6].ui = w; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4ui(ctx->Exec, (program, location, x, y, z, w)); + CALL_ProgramUniform1ui64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform1uiv(GLuint program, GLint location, GLsizei count, - const GLuint *v) +save_ProgramUniform2ui64vARB(GLuint program, GLint location, GLsizei count, + const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UIV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UI64V, + 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform1uiv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform2ui64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform2uiv(GLuint program, GLint location, GLsizei count, - const GLuint *v) +save_ProgramUniform3ui64vARB(GLuint program, GLint location, GLsizei count, + const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UIV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UI64V, + 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLuint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform2uiv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform3ui64vARB(ctx->Exec, (program, location, count, v)); } } static void GLAPIENTRY -save_ProgramUniform3uiv(GLuint program, GLint location, GLsizei count, - const GLuint *v) +save_ProgramUniform4ui64vARB(GLuint program, GLint location, GLsizei count, + const GLuint64 *v) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UIV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UI64V, + 3 + POINTER_DWORDS); if (n) { n[1].ui = program; n[2].i = location; n[3].i = count; - save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLuint))); + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint64))); } if (ctx->ExecuteFlag) { - CALL_ProgramUniform3uiv(ctx->Exec, (program, location, count, v)); + CALL_ProgramUniform4ui64vARB(ctx->Exec, (program, location, count, v)); } } + static void GLAPIENTRY -save_ProgramUniform4uiv(GLuint program, GLint location, GLsizei count, - const GLuint *v) +save_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program) { GET_CURRENT_CONTEXT(ctx); Node *n; ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); - n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UIV, 3 + POINTER_DWORDS); + n = alloc_instruction(ctx, OPCODE_USE_PROGRAM_STAGES, 3); if (n) { - n[1].ui = program; - n[2].i = location; - n[3].i = count; - save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLuint))); + n[1].ui = pipeline; + n[2].ui = stages; + n[3].ui = program; } if (ctx->ExecuteFlag) { - CALL_ProgramUniform4uiv(ctx->Exec, (program, location, count, v)); + CALL_UseProgramStages(ctx->Exec, (pipeline, stages, program)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1f(GLuint program, GLint location, GLfloat x) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1F, 3); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].f = x; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1f(ctx->Exec, (program, location, x)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2f(GLuint program, GLint location, GLfloat x, GLfloat y) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2F, 4); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].f = x; + n[4].f = y; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2f(ctx->Exec, (program, location, x, y)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3f(GLuint program, GLint location, + GLfloat x, GLfloat y, GLfloat z) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3F, 5); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].f = x; + n[4].f = y; + n[5].f = z; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3f(ctx->Exec, (program, location, x, y, z)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4f(GLuint program, GLint location, + GLfloat x, GLfloat y, GLfloat z, GLfloat w) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4F, 6); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].f = x; + n[4].f = y; + n[5].f = z; + n[6].f = w; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4f(ctx->Exec, (program, location, x, y, z, w)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1fv(GLuint program, GLint location, GLsizei count, + const GLfloat *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1FV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLfloat))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1fv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2fv(GLuint program, GLint location, GLsizei count, + const GLfloat *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2FV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLfloat))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2fv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3fv(GLuint program, GLint location, GLsizei count, + const GLfloat *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3FV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLfloat))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3fv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4fv(GLuint program, GLint location, GLsizei count, + const GLfloat *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4FV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLfloat))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4fv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1d(GLuint program, GLint location, GLdouble x) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1D, 4); + if (n) { + n[1].ui = program; + n[2].i = location; + ASSIGN_DOUBLE_TO_NODES(n, 3, x); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1d(ctx->Exec, (program, location, x)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2d(GLuint program, GLint location, GLdouble x, GLdouble y) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2D, 6); + if (n) { + n[1].ui = program; + n[2].i = location; + ASSIGN_DOUBLE_TO_NODES(n, 3, x); + ASSIGN_DOUBLE_TO_NODES(n, 5, y); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2d(ctx->Exec, (program, location, x, y)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3d(GLuint program, GLint location, + GLdouble x, GLdouble y, GLdouble z) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3D, 8); + if (n) { + n[1].ui = program; + n[2].i = location; + ASSIGN_DOUBLE_TO_NODES(n, 3, x); + ASSIGN_DOUBLE_TO_NODES(n, 5, y); + ASSIGN_DOUBLE_TO_NODES(n, 7, z); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3d(ctx->Exec, (program, location, x, y, z)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4d(GLuint program, GLint location, + GLdouble x, GLdouble y, GLdouble z, GLdouble w) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4D, 10); + if (n) { + n[1].ui = program; + n[2].i = location; + ASSIGN_DOUBLE_TO_NODES(n, 3, x); + ASSIGN_DOUBLE_TO_NODES(n, 5, y); + ASSIGN_DOUBLE_TO_NODES(n, 7, z); + ASSIGN_DOUBLE_TO_NODES(n, 9, w); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4d(ctx->Exec, (program, location, x, y, z, w)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1dv(GLuint program, GLint location, GLsizei count, + const GLdouble *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1DV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLdouble))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1dv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2dv(GLuint program, GLint location, GLsizei count, + const GLdouble *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2DV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLdouble))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2dv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3dv(GLuint program, GLint location, GLsizei count, + const GLdouble *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3DV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLdouble))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3dv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4dv(GLuint program, GLint location, GLsizei count, + const GLdouble *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4DV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLdouble))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4dv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1i(GLuint program, GLint location, GLint x) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1I, 3); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = x; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1i(ctx->Exec, (program, location, x)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2i(GLuint program, GLint location, GLint x, GLint y) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2I, 4); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = x; + n[4].i = y; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2i(ctx->Exec, (program, location, x, y)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3i(GLuint program, GLint location, + GLint x, GLint y, GLint z) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3I, 5); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = x; + n[4].i = y; + n[5].i = z; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3i(ctx->Exec, (program, location, x, y, z)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4i(GLuint program, GLint location, + GLint x, GLint y, GLint z, GLint w) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4I, 6); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = x; + n[4].i = y; + n[5].i = z; + n[6].i = w; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4i(ctx->Exec, (program, location, x, y, z, w)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1iv(GLuint program, GLint location, GLsizei count, + const GLint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1IV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1iv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2iv(GLuint program, GLint location, GLsizei count, + const GLint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2IV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2iv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3iv(GLuint program, GLint location, GLsizei count, + const GLint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3IV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3iv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4iv(GLuint program, GLint location, GLsizei count, + const GLint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4IV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4iv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1ui(GLuint program, GLint location, GLuint x) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UI, 3); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].ui = x; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1ui(ctx->Exec, (program, location, x)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2ui(GLuint program, GLint location, GLuint x, GLuint y) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UI, 4); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].ui = x; + n[4].ui = y; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2ui(ctx->Exec, (program, location, x, y)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3ui(GLuint program, GLint location, + GLuint x, GLuint y, GLuint z) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UI, 5); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].ui = x; + n[4].ui = y; + n[5].ui = z; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3ui(ctx->Exec, (program, location, x, y, z)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4ui(GLuint program, GLint location, + GLuint x, GLuint y, GLuint z, GLuint w) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UI, 6); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].ui = x; + n[4].ui = y; + n[5].ui = z; + n[6].ui = w; + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4ui(ctx->Exec, (program, location, x, y, z, w)); + } +} + +static void GLAPIENTRY +save_ProgramUniform1uiv(GLuint program, GLint location, GLsizei count, + const GLuint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1UIV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLuint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform1uiv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform2uiv(GLuint program, GLint location, GLsizei count, + const GLuint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2UIV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLuint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform2uiv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform3uiv(GLuint program, GLint location, GLsizei count, + const GLuint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3UIV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLuint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform3uiv(ctx->Exec, (program, location, count, v)); + } +} + +static void GLAPIENTRY +save_ProgramUniform4uiv(GLuint program, GLint location, GLsizei count, + const GLuint *v) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4UIV, 3 + POINTER_DWORDS); + if (n) { + n[1].ui = program; + n[2].i = location; + n[3].i = count; + save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLuint))); + } + if (ctx->ExecuteFlag) { + CALL_ProgramUniform4uiv(ctx->Exec, (program, location, count, v)); } } @@ -9608,6 +10277,49 @@ } static void GLAPIENTRY +save_TextureParameterIivEXT(GLuint texture, GLenum target, GLenum pname, const GLint* params) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_TEXTUREPARAMETER_II, 7); + if (n) { + n[1].ui = texture; + n[2].e = target; + n[3].e = pname; + n[4].i = params[0]; + n[5].i = params[1]; + n[6].i = params[2]; + n[7].i = params[3]; + } + if (ctx->ExecuteFlag) { + CALL_TextureParameterIivEXT(ctx->Exec, (texture, target, pname, params)); + } +} + +static void GLAPIENTRY +save_TextureParameterIuivEXT(GLuint texture, GLenum target, GLenum pname, const GLuint* params) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_TEXTUREPARAMETER_IUI, 7); + if (n) { + n[1].ui = texture; + n[2].e = target; + n[3].e = pname; + n[4].ui = params[0]; + n[5].ui = params[1]; + n[6].ui = params[2]; + n[7].ui = params[3]; + } + if (ctx->ExecuteFlag) { + CALL_TextureParameterIuivEXT(ctx->Exec, (texture, target, pname, params)); + } +} + + +static void GLAPIENTRY save_TextureImage1DEXT(GLuint texture, GLenum target, GLint level, GLint components, GLsizei width, GLint border, @@ -10029,6 +10741,48 @@ } static void GLAPIENTRY +save_MultiTexParameterIivEXT(GLenum texunit, GLenum target, GLenum pname, const GLint *params) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_MULTITEXPARAMETER_II, 7); + if (n) { + n[1].e = texunit; + n[2].e = target; + n[3].e = pname; + n[4].i = params[0]; + n[5].i = params[1]; + n[6].i = params[2]; + n[7].i = params[3]; + } + if (ctx->ExecuteFlag) { + CALL_MultiTexParameterIivEXT(ctx->Exec, (texunit, target, pname, params)); + } +} + +static void GLAPIENTRY +save_MultiTexParameterIuivEXT(GLenum texunit, GLenum target, GLenum pname, const GLuint *params) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_MULTITEXPARAMETER_IUI, 7); + if (n) { + n[1].e = texunit; + n[2].e = target; + n[3].e = pname; + n[4].ui = params[0]; + n[5].ui = params[1]; + n[6].ui = params[2]; + n[7].ui = params[3]; + } + if (ctx->ExecuteFlag) { + CALL_MultiTexParameterIuivEXT(ctx->Exec, (texunit, target, pname, params)); + } +} + +static void GLAPIENTRY save_MultiTexParameteriEXT(GLenum texunit, GLenum target, GLenum pname, GLint param) { GLint fparam[4]; @@ -10900,6 +11654,87 @@ } +static void GLAPIENTRY +save_NamedProgramStringEXT(GLuint program, GLenum target, GLenum format, GLsizei len, + const GLvoid * string) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + + n = alloc_instruction(ctx, OPCODE_NAMED_PROGRAM_STRING, 4 + POINTER_DWORDS); + if (n) { + GLubyte *programCopy = malloc(len); + if (!programCopy) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "glNamedProgramStringEXT"); + return; + } + memcpy(programCopy, string, len); + n[1].ui = program; + n[2].e = target; + n[3].e = format; + n[4].i = len; + save_pointer(&n[5], programCopy); + } + if (ctx->ExecuteFlag) { + CALL_NamedProgramStringEXT(ctx->Exec, (program, target, format, len, string)); + } +} + + +static void GLAPIENTRY +save_NamedProgramLocalParameter4fEXT(GLuint program, GLenum target, GLuint index, + GLfloat x, GLfloat y, GLfloat z, GLfloat w) +{ + GET_CURRENT_CONTEXT(ctx); + Node *n; + ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx); + n = alloc_instruction(ctx, OPCODE_NAMED_PROGRAM_LOCAL_PARAMETER, 7); + if (n) { + n[1].ui = program; + n[2].e = target; + n[3].ui = index; + n[4].f = x; + n[5].f = y; + n[6].f = z; + n[7].f = w; + } + if (ctx->ExecuteFlag) { + CALL_NamedProgramLocalParameter4fEXT(ctx->Exec, (program, target, index, x, y, z, w)); + } +} + + +static void GLAPIENTRY +save_NamedProgramLocalParameter4fvEXT(GLuint program, GLenum target, GLuint index, + const GLfloat *params) +{ + save_NamedProgramLocalParameter4fEXT(program, target, index, params[0], + params[1], params[2], params[3]); +} + + +static void GLAPIENTRY +save_NamedProgramLocalParameter4dEXT(GLuint program, GLenum target, GLuint index, + GLdouble x, GLdouble y, + GLdouble z, GLdouble w) +{ + save_NamedProgramLocalParameter4fEXT(program, target, index, (GLfloat) x, + (GLfloat) y, (GLfloat) z, (GLfloat) w); +} + + +static void GLAPIENTRY +save_NamedProgramLocalParameter4dvEXT(GLuint program, GLenum target, GLuint index, + const GLdouble *params) +{ + save_NamedProgramLocalParameter4fEXT(program, target, index, (GLfloat) params[0], + (GLfloat) params[1], (GLfloat) params[2], + (GLfloat) params[3]); +} + + /** * Save an error-generating command into display list. * @@ -11920,6 +12755,294 @@ (n[1].i, n[2].i, n[3].b, get_pointer(&n[4]))); break; + case OPCODE_UNIFORM_1I64: { + union int64_pair x; + + x.int32[0] = n[2].i; + x.int32[1] = n[3].i; + + CALL_Uniform1i64ARB(ctx->Exec, (n[1].i, x.int64)); + break; + } + case OPCODE_UNIFORM_2I64: { + union int64_pair x; + union int64_pair y; + + x.int32[0] = n[2].i; + x.int32[1] = n[3].i; + y.int32[0] = n[4].i; + y.int32[1] = n[5].i; + + CALL_Uniform2i64ARB(ctx->Exec, (n[1].i, x.int64, y.int64)); + break; + } + case OPCODE_UNIFORM_3I64: { + union int64_pair x; + union int64_pair y; + union int64_pair z; + + x.int32[0] = n[2].i; + x.int32[1] = n[3].i; + y.int32[0] = n[4].i; + y.int32[1] = n[5].i; + z.int32[0] = n[6].i; + z.int32[1] = n[7].i; + + + CALL_Uniform3i64ARB(ctx->Exec, (n[1].i, x.int64, y.int64, z.int64)); + break; + } + case OPCODE_UNIFORM_4I64: { + union int64_pair x; + union int64_pair y; + union int64_pair z; + union int64_pair w; + + x.int32[0] = n[2].i; + x.int32[1] = n[3].i; + y.int32[0] = n[4].i; + y.int32[1] = n[5].i; + z.int32[0] = n[6].i; + z.int32[1] = n[7].i; + w.int32[0] = n[8].i; + w.int32[1] = n[9].i; + + CALL_Uniform4i64ARB(ctx->Exec, (n[1].i, x.int64, y.int64, z.int64, w.int64)); + break; + } + case OPCODE_UNIFORM_1I64V: + CALL_Uniform1i64vARB(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_2I64V: + CALL_Uniform2i64vARB(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_3I64V: + CALL_Uniform3i64vARB(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_4I64V: + CALL_Uniform4i64vARB(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_1UI64: { + union uint64_pair x; + + x.uint32[0] = n[2].ui; + x.uint32[1] = n[3].ui; + + CALL_Uniform1ui64ARB(ctx->Exec, (n[1].i, x.uint64)); + break; + } + case OPCODE_UNIFORM_2UI64: { + union uint64_pair x; + union uint64_pair y; + + x.uint32[0] = n[2].ui; + x.uint32[1] = n[3].ui; + y.uint32[0] = n[4].ui; + y.uint32[1] = n[5].ui; + + CALL_Uniform2ui64ARB(ctx->Exec, (n[1].i, x.uint64, y.uint64)); + break; + } + case OPCODE_UNIFORM_3UI64: { + union uint64_pair x; + union uint64_pair y; + union uint64_pair z; + + x.uint32[0] = n[2].ui; + x.uint32[1] = n[3].ui; + y.uint32[0] = n[4].ui; + y.uint32[1] = n[5].ui; + z.uint32[0] = n[6].ui; + z.uint32[1] = n[7].ui; + + + CALL_Uniform3ui64ARB(ctx->Exec, (n[1].i, x.uint64, y.uint64, + z.uint64)); + break; + } + case OPCODE_UNIFORM_4UI64: { + union uint64_pair x; + union uint64_pair y; + union uint64_pair z; + union uint64_pair w; + + x.uint32[0] = n[2].ui; + x.uint32[1] = n[3].ui; + y.uint32[0] = n[4].ui; + y.uint32[1] = n[5].ui; + z.uint32[0] = n[6].ui; + z.uint32[1] = n[7].ui; + w.uint32[0] = n[8].ui; + w.uint32[1] = n[9].ui; + + CALL_Uniform4ui64ARB(ctx->Exec, (n[1].i, x.uint64, y.uint64, + z.uint64, w.uint64)); + break; + } + case OPCODE_UNIFORM_1UI64V: + CALL_Uniform1ui64vARB(ctx->Exec, (n[1].i, n[2].i, + get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_2UI64V: + CALL_Uniform2ui64vARB(ctx->Exec, (n[1].i, n[2].i, + get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_3UI64V: + CALL_Uniform3ui64vARB(ctx->Exec, (n[1].i, n[2].i, + get_pointer(&n[3]))); + break; + case OPCODE_UNIFORM_4UI64V: + CALL_Uniform4ui64vARB(ctx->Exec, (n[1].i, n[2].i, + get_pointer(&n[3]))); + break; + + case OPCODE_PROGRAM_UNIFORM_1I64: { + union int64_pair x; + + x.int32[0] = n[3].i; + x.int32[1] = n[4].i; + + CALL_ProgramUniform1i64ARB(ctx->Exec, (n[1].ui, n[2].i, x.int64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_2I64: { + union int64_pair x; + union int64_pair y; + + x.int32[0] = n[3].i; + x.int32[1] = n[4].i; + y.int32[0] = n[5].i; + y.int32[1] = n[6].i; + + CALL_ProgramUniform2i64ARB(ctx->Exec, (n[1].ui, n[2].i, x.int64, + y.int64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_3I64: { + union int64_pair x; + union int64_pair y; + union int64_pair z; + + x.int32[0] = n[3].i; + x.int32[1] = n[4].i; + y.int32[0] = n[5].i; + y.int32[1] = n[6].i; + z.int32[0] = n[7].i; + z.int32[1] = n[8].i; + + CALL_ProgramUniform3i64ARB(ctx->Exec, (n[1].ui, n[2].i, x.int64, + y.int64, z.int64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_4I64: { + union int64_pair x; + union int64_pair y; + union int64_pair z; + union int64_pair w; + + x.int32[0] = n[3].i; + x.int32[1] = n[4].i; + y.int32[0] = n[5].i; + y.int32[1] = n[6].i; + z.int32[0] = n[7].i; + z.int32[1] = n[8].i; + w.int32[0] = n[9].i; + w.int32[1] = n[10].i; + + CALL_ProgramUniform4i64ARB(ctx->Exec, (n[1].ui, n[2].i, x.int64, + y.int64, z.int64, w.int64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_1I64V: + CALL_ProgramUniform1i64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_2I64V: + CALL_ProgramUniform2i64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_3I64V: + CALL_ProgramUniform3i64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_4I64V: + CALL_ProgramUniform4i64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_1UI64: { + union uint64_pair x; + + x.uint32[0] = n[3].ui; + x.uint32[1] = n[4].ui; + + CALL_ProgramUniform1i64ARB(ctx->Exec, (n[1].ui, n[2].i, x.uint64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_2UI64: { + union uint64_pair x; + union uint64_pair y; + + x.uint32[0] = n[3].ui; + x.uint32[1] = n[4].ui; + y.uint32[0] = n[5].ui; + y.uint32[1] = n[6].ui; + + CALL_ProgramUniform2ui64ARB(ctx->Exec, (n[1].ui, n[2].i, x.uint64, + y.uint64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_3UI64: { + union uint64_pair x; + union uint64_pair y; + union uint64_pair z; + + x.uint32[0] = n[3].ui; + x.uint32[1] = n[4].ui; + y.uint32[0] = n[5].ui; + y.uint32[1] = n[6].ui; + z.uint32[0] = n[7].ui; + z.uint32[1] = n[8].ui; + + CALL_ProgramUniform3ui64ARB(ctx->Exec, (n[1].ui, n[2].i, x.uint64, + y.uint64, z.uint64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_4UI64: { + union uint64_pair x; + union uint64_pair y; + union uint64_pair z; + union uint64_pair w; + + x.uint32[0] = n[3].ui; + x.uint32[1] = n[4].ui; + y.uint32[0] = n[5].ui; + y.uint32[1] = n[6].ui; + z.uint32[0] = n[7].ui; + z.uint32[1] = n[8].ui; + w.uint32[0] = n[9].ui; + w.uint32[1] = n[10].ui; + + CALL_ProgramUniform4ui64ARB(ctx->Exec, (n[1].ui, n[2].i, x.uint64, + y.uint64, z.uint64, w.uint64)); + break; + } + case OPCODE_PROGRAM_UNIFORM_1UI64V: + CALL_ProgramUniform1ui64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_2UI64V: + CALL_ProgramUniform2ui64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_3UI64V: + CALL_ProgramUniform3ui64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_PROGRAM_UNIFORM_4UI64V: + CALL_ProgramUniform4ui64vARB(ctx->Exec, (n[1].ui, n[2].i, n[3].i, + get_pointer(&n[4]))); + break; + case OPCODE_USE_PROGRAM_STAGES: CALL_UseProgramStages(ctx->Exec, (n[1].ui, n[2].ui, n[3].ui)); break; @@ -12485,6 +13608,26 @@ CALL_TextureParameterivEXT(ctx->Exec, (n[1].ui, n[2].e, n[3].e, params)); } break; + case OPCODE_TEXTUREPARAMETER_II: + { + GLint params[4]; + params[0] = n[4].i; + params[1] = n[5].i; + params[2] = n[6].i; + params[3] = n[7].i; + CALL_TextureParameterIivEXT(ctx->Exec, (n[1].ui, n[2].e, n[3].e, params)); + } + break; + case OPCODE_TEXTUREPARAMETER_IUI: + { + GLuint params[4]; + params[0] = n[4].ui; + params[1] = n[5].ui; + params[2] = n[6].ui; + params[3] = n[7].ui; + CALL_TextureParameterIuivEXT(ctx->Exec, (n[1].ui, n[2].e, n[3].e, params)); + } + break; case OPCODE_TEXTURE_IMAGE1D: { const struct gl_pixelstore_attrib save = ctx->Unpack; @@ -12618,6 +13761,26 @@ CALL_MultiTexParameterivEXT(ctx->Exec, (n[1].e, n[2].e, n[3].e, params)); } break; + case OPCODE_MULTITEXPARAMETER_II: + { + GLint params[4]; + params[0] = n[4].i; + params[1] = n[5].i; + params[2] = n[6].i; + params[3] = n[7].i; + CALL_MultiTexParameterIivEXT(ctx->Exec, (n[1].e, n[2].e, n[3].e, params)); + } + break; + case OPCODE_MULTITEXPARAMETER_IUI: + { + GLuint params[4]; + params[0] = n[4].ui; + params[1] = n[5].ui; + params[2] = n[6].ui; + params[3] = n[7].ui; + CALL_MultiTexParameterIuivEXT(ctx->Exec, (n[1].e, n[2].e, n[3].e, params)); + } + break; case OPCODE_MULTITEX_IMAGE1D: { const struct gl_pixelstore_attrib save = ctx->Unpack; @@ -12810,6 +13973,16 @@ n[9].i, n[10].e, n[11].i, get_pointer(&n[12]))); break; + case OPCODE_NAMED_PROGRAM_STRING: + CALL_NamedProgramStringEXT(ctx->Exec, + (n[1].ui, n[2].e, n[3].e, n[4].i, + get_pointer(&n[5]))); + break; + case OPCODE_NAMED_PROGRAM_LOCAL_PARAMETER: + CALL_NamedProgramLocalParameter4fEXT(ctx->Exec, + (n[1].ui, n[2].e, n[3].ui, n[4].f, + n[5].f, n[6].f, n[7].f)); + break; case OPCODE_CONTINUE: n = (Node *) get_pointer(&n[1]); @@ -13648,6 +14821,41 @@ SET_UniformMatrix3x4dv(table, save_UniformMatrix3x4dv); SET_UniformMatrix4x3dv(table, save_UniformMatrix4x3dv); + /* GL_ARB_gpu_shader_int64 */ + SET_Uniform1i64ARB(table, save_Uniform1i64ARB); + SET_Uniform2i64ARB(table, save_Uniform2i64ARB); + SET_Uniform3i64ARB(table, save_Uniform3i64ARB); + SET_Uniform4i64ARB(table, save_Uniform4i64ARB); + SET_Uniform1i64vARB(table, save_Uniform1i64vARB); + SET_Uniform2i64vARB(table, save_Uniform2i64vARB); + SET_Uniform3i64vARB(table, save_Uniform3i64vARB); + SET_Uniform4i64vARB(table, save_Uniform4i64vARB); + SET_Uniform1ui64ARB(table, save_Uniform1ui64ARB); + SET_Uniform2ui64ARB(table, save_Uniform2ui64ARB); + SET_Uniform3ui64ARB(table, save_Uniform3ui64ARB); + SET_Uniform4ui64ARB(table, save_Uniform4ui64ARB); + SET_Uniform1ui64vARB(table, save_Uniform1ui64vARB); + SET_Uniform2ui64vARB(table, save_Uniform2ui64vARB); + SET_Uniform3ui64vARB(table, save_Uniform3ui64vARB); + SET_Uniform4ui64vARB(table, save_Uniform4ui64vARB); + + SET_ProgramUniform1i64ARB(table, save_ProgramUniform1i64ARB); + SET_ProgramUniform2i64ARB(table, save_ProgramUniform2i64ARB); + SET_ProgramUniform3i64ARB(table, save_ProgramUniform3i64ARB); + SET_ProgramUniform4i64ARB(table, save_ProgramUniform4i64ARB); + SET_ProgramUniform1i64vARB(table, save_ProgramUniform1i64vARB); + SET_ProgramUniform2i64vARB(table, save_ProgramUniform2i64vARB); + SET_ProgramUniform3i64vARB(table, save_ProgramUniform3i64vARB); + SET_ProgramUniform4i64vARB(table, save_ProgramUniform4i64vARB); + SET_ProgramUniform1ui64ARB(table, save_ProgramUniform1ui64ARB); + SET_ProgramUniform2ui64ARB(table, save_ProgramUniform2ui64ARB); + SET_ProgramUniform3ui64ARB(table, save_ProgramUniform3ui64ARB); + SET_ProgramUniform4ui64ARB(table, save_ProgramUniform4ui64ARB); + SET_ProgramUniform1ui64vARB(table, save_ProgramUniform1ui64vARB); + SET_ProgramUniform2ui64vARB(table, save_ProgramUniform2ui64vARB); + SET_ProgramUniform3ui64vARB(table, save_ProgramUniform3ui64vARB); + SET_ProgramUniform4ui64vARB(table, save_ProgramUniform4ui64vARB); + /* These are: */ SET_BeginTransformFeedback(table, save_BeginTransformFeedback); SET_EndTransformFeedback(table, save_EndTransformFeedback); @@ -13806,6 +15014,8 @@ SET_TextureParameterivEXT(table, save_TextureParameterivEXT); SET_TextureParameterfEXT(table, save_TextureParameterfEXT); SET_TextureParameterfvEXT(table, save_TextureParameterfvEXT); + SET_TextureParameterIivEXT(table, save_TextureParameterIivEXT); + SET_TextureParameterIuivEXT(table, save_TextureParameterIuivEXT); SET_TextureImage1DEXT(table, save_TextureImage1DEXT); SET_TextureImage2DEXT(table, save_TextureImage2DEXT); SET_TextureImage3DEXT(table, save_TextureImage3DEXT); @@ -13820,6 +15030,8 @@ SET_BindMultiTextureEXT(table, save_BindMultiTextureEXT); SET_MultiTexParameteriEXT(table, save_MultiTexParameteriEXT); SET_MultiTexParameterivEXT(table, save_MultiTexParameterivEXT); + SET_MultiTexParameterIivEXT(table, save_MultiTexParameterIivEXT); + SET_MultiTexParameterIuivEXT(table, save_MultiTexParameterIuivEXT); SET_MultiTexParameterfEXT(table, save_MultiTexParameterfEXT); SET_MultiTexParameterfvEXT(table, save_MultiTexParameterfvEXT); SET_MultiTexImage1DEXT(table, save_MultiTexImage1DEXT); @@ -13849,6 +15061,11 @@ SET_CompressedMultiTexSubImage1DEXT(table, save_CompressedMultiTexSubImage1DEXT); SET_CompressedMultiTexSubImage2DEXT(table, save_CompressedMultiTexSubImage2DEXT); SET_CompressedMultiTexSubImage3DEXT(table, save_CompressedMultiTexSubImage3DEXT); + SET_NamedProgramStringEXT(table, save_NamedProgramStringEXT); + SET_NamedProgramLocalParameter4dEXT(table, save_NamedProgramLocalParameter4dEXT); + SET_NamedProgramLocalParameter4dvEXT(table, save_NamedProgramLocalParameter4dvEXT); + SET_NamedProgramLocalParameter4fEXT(table, save_NamedProgramLocalParameter4fEXT); + SET_NamedProgramLocalParameter4fvEXT(table, save_NamedProgramLocalParameter4fvEXT); } diff -Nru mesa-19.2.8/src/mesa/main/draw.c mesa-20.0.8/src/mesa/main/draw.c --- mesa-19.2.8/src/mesa/main/draw.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/draw.c 2020-06-12 01:21:18.000000000 +0000 @@ -1228,8 +1228,8 @@ ib.ptr = (void *) min_index_ptr; for (i = 0; i < primcount; i++) { - prim[i].begin = (i == 0); - prim[i].end = (i == primcount - 1); + prim[i].begin = 1; + prim[i].end = 1; prim[i].pad = 0; prim[i].mode = mode; prim[i].start = diff -Nru mesa-19.2.8/src/mesa/main/enable.c mesa-20.0.8/src/mesa/main/enable.c --- mesa-19.2.8/src/mesa/main/enable.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/enable.c 2020-06-12 01:21:18.000000000 +0000 @@ -58,12 +58,13 @@ * Helper to enable/disable VAO client-side state. */ static void -vao_state(struct gl_context *ctx, gl_vert_attrib attr, GLboolean state) +vao_state(struct gl_context *ctx, struct gl_vertex_array_object* vao, + gl_vert_attrib attr, GLboolean state) { if (state) - _mesa_enable_vertex_array_attrib(ctx, ctx->Array.VAO, attr); + _mesa_enable_vertex_array_attrib(ctx, vao, attr); else - _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attr); + _mesa_disable_vertex_array_attrib(ctx, vao, attr); } @@ -71,38 +72,39 @@ * Helper to enable/disable client-side state. */ static void -client_state(struct gl_context *ctx, GLenum cap, GLboolean state) +client_state(struct gl_context *ctx, struct gl_vertex_array_object* vao, + GLenum cap, GLboolean state) { switch (cap) { case GL_VERTEX_ARRAY: - vao_state(ctx, VERT_ATTRIB_POS, state); + vao_state(ctx, vao, VERT_ATTRIB_POS, state); break; case GL_NORMAL_ARRAY: - vao_state(ctx, VERT_ATTRIB_NORMAL, state); + vao_state(ctx, vao, VERT_ATTRIB_NORMAL, state); break; case GL_COLOR_ARRAY: - vao_state(ctx, VERT_ATTRIB_COLOR0, state); + vao_state(ctx, vao, VERT_ATTRIB_COLOR0, state); break; case GL_INDEX_ARRAY: - vao_state(ctx, VERT_ATTRIB_COLOR_INDEX, state); + vao_state(ctx, vao, VERT_ATTRIB_COLOR_INDEX, state); break; case GL_TEXTURE_COORD_ARRAY: - vao_state(ctx, VERT_ATTRIB_TEX(ctx->Array.ActiveTexture), state); + vao_state(ctx, vao, VERT_ATTRIB_TEX(ctx->Array.ActiveTexture), state); break; case GL_EDGE_FLAG_ARRAY: - vao_state(ctx, VERT_ATTRIB_EDGEFLAG, state); + vao_state(ctx, vao, VERT_ATTRIB_EDGEFLAG, state); break; case GL_FOG_COORDINATE_ARRAY_EXT: - vao_state(ctx, VERT_ATTRIB_FOG, state); + vao_state(ctx, vao, VERT_ATTRIB_FOG, state); break; case GL_SECONDARY_COLOR_ARRAY_EXT: - vao_state(ctx, VERT_ATTRIB_COLOR1, state); + vao_state(ctx, vao, VERT_ATTRIB_COLOR1, state); break; case GL_POINT_SIZE_ARRAY_OES: FLUSH_VERTICES(ctx, _NEW_PROGRAM); ctx->VertexProgram.PointSizeEnabled = state; - vao_state(ctx, VERT_ATTRIB_POINT_SIZE, state); + vao_state(ctx, vao, VERT_ATTRIB_POINT_SIZE, state); break; /* GL_NV_primitive_restart */ @@ -140,7 +142,8 @@ * - DisableClientStateiEXT */ static void -client_state_i(struct gl_context *ctx, GLenum cap, GLuint index, GLboolean state) +client_state_i(struct gl_context *ctx, struct gl_vertex_array_object* vao, + GLenum cap, GLuint index, GLboolean state) { int saved_active; @@ -160,7 +163,7 @@ saved_active = ctx->Array.ActiveTexture; _mesa_ClientActiveTexture(GL_TEXTURE0 + index); - client_state(ctx, cap, state); + client_state(ctx, vao, cap, state); _mesa_ClientActiveTexture(GL_TEXTURE0 + saved_active); } @@ -176,7 +179,38 @@ _mesa_EnableClientState( GLenum cap ) { GET_CURRENT_CONTEXT(ctx); - client_state( ctx, cap, GL_TRUE ); + client_state( ctx, ctx->Array.VAO, cap, GL_TRUE ); +} + + +void GLAPIENTRY +_mesa_EnableVertexArrayEXT( GLuint vaobj, GLenum cap ) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao = _mesa_lookup_vao_err(ctx, vaobj, + true, + "glEnableVertexArrayEXT"); + if (!vao) + return; + + /* The EXT_direct_state_access spec says: + * "Additionally EnableVertexArrayEXT and DisableVertexArrayEXT accept + * the tokens TEXTURE0 through TEXTUREn where n is less than the + * implementation-dependent limit of MAX_TEXTURE_COORDS. For these + * GL_TEXTUREi tokens, EnableVertexArrayEXT and DisableVertexArrayEXT + * act identically to EnableVertexArrayEXT(vaobj, TEXTURE_COORD_ARRAY) + * or DisableVertexArrayEXT(vaobj, TEXTURE_COORD_ARRAY) respectively + * as if the active client texture is set to texture coordinate set i + * based on the token TEXTUREi indicated by array." + */ + if (GL_TEXTURE0 <= cap && cap < GL_TEXTURE0 + ctx->Const.MaxTextureCoordUnits) { + GLuint saved_active = ctx->Array.ActiveTexture; + _mesa_ClientActiveTexture(cap); + client_state(ctx, vao, GL_TEXTURE_COORD_ARRAY, GL_TRUE); + _mesa_ClientActiveTexture(GL_TEXTURE0 + saved_active); + } else { + client_state(ctx, vao, cap, GL_TRUE); + } } @@ -184,7 +218,7 @@ _mesa_EnableClientStateiEXT( GLenum cap, GLuint index ) { GET_CURRENT_CONTEXT(ctx); - client_state_i(ctx, cap, index, GL_TRUE); + client_state_i(ctx, ctx->Array.VAO, cap, index, GL_TRUE); } @@ -199,14 +233,44 @@ _mesa_DisableClientState( GLenum cap ) { GET_CURRENT_CONTEXT(ctx); - client_state( ctx, cap, GL_FALSE ); + client_state( ctx, ctx->Array.VAO, cap, GL_FALSE ); +} + +void GLAPIENTRY +_mesa_DisableVertexArrayEXT( GLuint vaobj, GLenum cap ) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao = _mesa_lookup_vao_err(ctx, vaobj, + true, + "glDisableVertexArrayEXT"); + if (!vao) + return; + + /* The EXT_direct_state_access spec says: + * "Additionally EnableVertexArrayEXT and DisableVertexArrayEXT accept + * the tokens TEXTURE0 through TEXTUREn where n is less than the + * implementation-dependent limit of MAX_TEXTURE_COORDS. For these + * GL_TEXTUREi tokens, EnableVertexArrayEXT and DisableVertexArrayEXT + * act identically to EnableVertexArrayEXT(vaobj, TEXTURE_COORD_ARRAY) + * or DisableVertexArrayEXT(vaobj, TEXTURE_COORD_ARRAY) respectively + * as if the active client texture is set to texture coordinate set i + * based on the token TEXTUREi indicated by array." + */ + if (GL_TEXTURE0 <= cap && cap < GL_TEXTURE0 + ctx->Const.MaxTextureCoordUnits) { + GLuint saved_active = ctx->Array.ActiveTexture; + _mesa_ClientActiveTexture(cap); + client_state(ctx, vao, GL_TEXTURE_COORD_ARRAY, GL_FALSE); + _mesa_ClientActiveTexture(GL_TEXTURE0 + saved_active); + } else { + client_state(ctx, vao, cap, GL_FALSE); + } } void GLAPIENTRY _mesa_DisableClientStateiEXT( GLenum cap, GLuint index ) { GET_CURRENT_CONTEXT(ctx); - client_state_i(ctx, cap, index, GL_FALSE); + client_state_i(ctx, ctx->Array.VAO, cap, index, GL_FALSE); } #define CHECK_EXTENSION(EXTNAME) \ @@ -847,7 +911,7 @@ case GL_TEXTURE_COORD_ARRAY: if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES) goto invalid_enum_error; - client_state( ctx, cap, state ); + client_state( ctx, ctx->Array.VAO, cap, state ); return; case GL_INDEX_ARRAY: case GL_EDGE_FLAG_ARRAY: @@ -855,12 +919,12 @@ case GL_SECONDARY_COLOR_ARRAY_EXT: if (ctx->API != API_OPENGL_COMPAT) goto invalid_enum_error; - client_state( ctx, cap, state ); + client_state( ctx, ctx->Array.VAO, cap, state ); return; case GL_POINT_SIZE_ARRAY_OES: if (ctx->API != API_OPENGLES) goto invalid_enum_error; - client_state( ctx, cap, state ); + client_state( ctx, ctx->Array.VAO, cap, state ); return; /* GL_ARB_texture_cube_map */ diff -Nru mesa-19.2.8/src/mesa/main/enable.h mesa-20.0.8/src/mesa/main/enable.h --- mesa-19.2.8/src/mesa/main/enable.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/enable.h 2020-06-12 01:21:18.000000000 +0000 @@ -68,11 +68,17 @@ _mesa_EnableClientStateiEXT( GLenum cap, GLuint index ); extern void GLAPIENTRY +_mesa_EnableVertexArrayEXT( GLuint vaobj, GLenum cap ); + +extern void GLAPIENTRY _mesa_DisableClientState( GLenum cap ); extern void GLAPIENTRY _mesa_DisableClientStateiEXT( GLenum cap, GLuint index ); +extern void GLAPIENTRY +_mesa_DisableVertexArrayEXT( GLuint vaobj, GLenum cap ); + extern void _mesa_set_multisample(struct gl_context *ctx, GLboolean state); diff -Nru mesa-19.2.8/src/mesa/main/execmem.c mesa-20.0.8/src/mesa/main/execmem.c --- mesa-19.2.8/src/mesa/main/execmem.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/execmem.c 2020-06-12 01:21:18.000000000 +0000 @@ -37,7 +37,7 @@ #include "c11/threads.h" -#if defined(__linux__) || defined(__OpenBSD__) || defined(_NetBSD__) || defined(__sun) || defined(__HAIKU__) +#if defined(__linux__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__sun) || defined(__HAIKU__) /* * Allocate a large block of memory which can hold code then dole it out @@ -46,7 +46,7 @@ #include #include -#include "mm.h" +#include "util/u_mm.h" #ifdef MESA_SELINUX #include @@ -78,11 +78,11 @@ #endif if (!exec_heap) - exec_heap = mmInit( 0, EXEC_HEAP_SIZE ); + exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE ); if (!exec_mem) exec_mem = mmap(NULL, EXEC_HEAP_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); return (exec_mem != MAP_FAILED); } @@ -101,7 +101,7 @@ if (exec_heap) { size = (size + 31) & ~31; - block = mmAllocMem( exec_heap, size, 32, 0 ); + block = u_mmAllocMem(exec_heap, size, 5, 0); } if (block) @@ -122,10 +122,10 @@ mtx_lock(&exec_mutex); if (exec_heap) { - struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem); + struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem); if (block) - mmFreeMem(block); + u_mmFreeMem(block); } mtx_unlock(&exec_mutex); diff -Nru mesa-19.2.8/src/mesa/main/extensions_table.h mesa-20.0.8/src/mesa/main/extensions_table.h --- mesa-19.2.8/src/mesa/main/extensions_table.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/extensions_table.h 2020-06-12 01:21:18.000000000 +0000 @@ -81,10 +81,10 @@ EXT(ARB_framebuffer_sRGB , EXT_framebuffer_sRGB , GLL, GLC, x , x , 1998) EXT(ARB_get_program_binary , dummy_true , GLL, GLC, x , x , 2010) EXT(ARB_get_texture_sub_image , dummy_true , GLL, GLC, x , x , 2014) -EXT(ARB_gl_spirv , ARB_gl_spirv , x, GLC, x , x , 2016) +EXT(ARB_gl_spirv , ARB_gl_spirv , GLL, GLC, x , x , 2016) EXT(ARB_gpu_shader5 , ARB_gpu_shader5 , GLL, GLC, x , x , 2010) EXT(ARB_gpu_shader_fp64 , ARB_gpu_shader_fp64 , 32, GLC, x , x , 2010) -EXT(ARB_gpu_shader_int64 , ARB_gpu_shader_int64 , x , GLC, x , x , 2015) +EXT(ARB_gpu_shader_int64 , ARB_gpu_shader_int64 , 40, GLC, x , x , 2015) EXT(ARB_half_float_pixel , dummy_true , GLL, GLC, x , x , 2003) EXT(ARB_half_float_vertex , ARB_half_float_vertex , GLL, GLC, x , x , 2008) EXT(ARB_indirect_parameters , ARB_indirect_parameters , GLL, GLC, x , x , 2013) @@ -137,10 +137,11 @@ EXT(ARB_shader_viewport_layer_array , ARB_shader_viewport_layer_array , GLL, GLC, x , x , 2015) EXT(ARB_shading_language_100 , dummy_true , GLL, x , x , x , 2003) EXT(ARB_shading_language_420pack , ARB_shading_language_420pack , GLL, GLC, x , x , 2011) +EXT(ARB_shading_language_include , dummy_true , GLL, GLC, x , x , 2013) EXT(ARB_shading_language_packing , ARB_shading_language_packing , GLL, GLC, x , x , 2011) EXT(ARB_shadow , ARB_shadow , GLL, x , x , x , 2001) EXT(ARB_sparse_buffer , ARB_sparse_buffer , GLL, GLC, x , x , 2014) -EXT(ARB_spirv_extensions , ARB_spirv_extensions , x, GLC, x , x , 2016) +EXT(ARB_spirv_extensions , ARB_spirv_extensions , GLL, GLC, x , x , 2016) EXT(ARB_stencil_texturing , ARB_stencil_texturing , GLL, GLC, x , x , 2012) EXT(ARB_sync , ARB_sync , GLL, GLC, x , x , 2003) EXT(ARB_tessellation_shader , ARB_tessellation_shader , GLL, GLC, x , x , 2009) @@ -204,6 +205,8 @@ EXT(ATI_texture_float , ARB_texture_float , GLL, GLC, x , x , 2002) EXT(ATI_texture_mirror_once , ATI_texture_mirror_once , GLL, GLC, x , x , 2006) +EXT(EXT_EGL_image_storage , EXT_EGL_image_storage , GLL, GLC , x , 30, 2018) +EXT(EXT_EGL_sync , dummy_true , GLL, GLC, x , x , 2019) EXT(EXT_abgr , dummy_true , GLL, GLC, x , x , 1995) EXT(EXT_base_instance , ARB_base_instance , x , x , x , 30, 2014) EXT(EXT_bgra , dummy_true , GLL, x , x , x , 1995) @@ -221,8 +224,10 @@ EXT(EXT_compressed_ETC1_RGB8_sub_texture , OES_compressed_ETC1_RGB8_texture , x , x , ES1, ES2, 2014) EXT(EXT_copy_image , OES_copy_image , x , x , x , 30, 2014) EXT(EXT_copy_texture , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_demote_to_helper_invocation , EXT_demote_to_helper_invocation , GLL, GLC, ES1, ES2, 2019) EXT(EXT_depth_bounds_test , EXT_depth_bounds_test , GLL, GLC, x , x , 2002) EXT(EXT_depth_clamp , ARB_depth_clamp , x , x , x , ES2, 2019) +EXT(EXT_direct_state_access , dummy_true , GLL, x , x , x , 2010) EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009) EXT(EXT_disjoint_timer_query , EXT_disjoint_timer_query , x , x , x , ES2, 2016) EXT(EXT_draw_buffers , dummy_true , x , x , x , ES2, 2012) @@ -308,7 +313,7 @@ EXT(EXT_texture_integer , EXT_texture_integer , GLL, GLC, x , x , 2006) EXT(EXT_texture_lod_bias , dummy_true , GLL, x , ES1, x , 1999) EXT(EXT_texture_mirror_clamp , EXT_texture_mirror_clamp , GLL, GLC, x , x , 2004) -EXT(EXT_texture_norm16 , dummy_true , x , x , x , 31, 2014) +EXT(EXT_texture_norm16 , EXT_texture_norm16 , x , x , x , 31, 2014) EXT(EXT_texture_object , dummy_true , GLL, x , x , x , 1995) EXT(EXT_texture_query_lod , ARB_texture_query_lod , x , x , x , 30, 2019) EXT(EXT_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2004) @@ -341,6 +346,7 @@ EXT(INTEL_conservative_rasterization , INTEL_conservative_rasterization , x , GLC, x , 31, 2013) EXT(INTEL_performance_query , INTEL_performance_query , GLL, GLC, x , ES2, 2013) EXT(INTEL_shader_atomic_float_minmax , INTEL_shader_atomic_float_minmax , GLL, GLC, x , x , 2018) +EXT(INTEL_shader_integer_functions2 , INTEL_shader_integer_functions2 , GLL, GLC, x , x , 2018) EXT(KHR_blend_equation_advanced , KHR_blend_equation_advanced , GLL, GLC, x , ES2, 2014) EXT(KHR_blend_equation_advanced_coherent , KHR_blend_equation_advanced_coherent , GLL, GLC, x , ES2, 2014) @@ -354,7 +360,7 @@ EXT(KHR_texture_compression_astc_ldr , KHR_texture_compression_astc_ldr , GLL, GLC, x , ES2, 2012) EXT(KHR_texture_compression_astc_sliced_3d , KHR_texture_compression_astc_sliced_3d , GLL, GLC, x , ES2, 2015) -EXT(MESA_framebuffer_flip_y , MESA_framebuffer_flip_y , 43, 43, x , 31, 2018) +EXT(MESA_framebuffer_flip_y , MESA_framebuffer_flip_y , 43, 43, x , 30, 2018) EXT(MESA_pack_invert , MESA_pack_invert , GLL, GLC, x , x , 2002) EXT(MESA_shader_integer_functions , MESA_shader_integer_functions , GLL, GLC, x , 30, 2016) EXT(MESA_texture_signed_rgba , EXT_texture_snorm , GLL, GLC, x , x , 2009) diff -Nru mesa-19.2.8/src/mesa/main/fbobject.c mesa-20.0.8/src/mesa/main/fbobject.c --- mesa-19.2.8/src/mesa/main/fbobject.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/fbobject.c 2020-06-12 01:21:18.000000000 +0000 @@ -174,21 +174,16 @@ /* Name exists but buffer is not initialized */ if (fb == &DummyFramebuffer) { fb = ctx->Driver.NewFramebuffer(ctx, id); - _mesa_HashLockMutex(ctx->Shared->FrameBuffers); _mesa_HashInsert(ctx->Shared->FrameBuffers, id, fb); - _mesa_HashUnlockMutex(ctx->Shared->BufferObjects); } /* Name doesn't exist */ else if (!fb) { - _mesa_HashLockMutex(ctx->Shared->FrameBuffers); fb = ctx->Driver.NewFramebuffer(ctx, id); if (!fb) { - _mesa_HashUnlockMutex(ctx->Shared->FrameBuffers); _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", func); return NULL; } - _mesa_HashInsertLocked(ctx->Shared->BufferObjects, id, fb); - _mesa_HashUnlockMutex(ctx->Shared->BufferObjects); + _mesa_HashInsert(ctx->Shared->FrameBuffers, id, fb); } return fb; } @@ -326,22 +321,24 @@ { assert(_mesa_is_winsys_fbo(fb)); + attachment = _mesa_back_to_front_if_single_buffered(fb, attachment); + if (_mesa_is_gles3(ctx)) { - assert(attachment == GL_BACK || - attachment == GL_DEPTH || - attachment == GL_STENCIL); switch (attachment) { case GL_BACK: /* Since there is no stereo rendering in ES 3.0, only return the * LEFT bits. */ - if (ctx->DrawBuffer->Visual.doubleBufferMode) - return &fb->Attachment[BUFFER_BACK_LEFT]; + return &fb->Attachment[BUFFER_BACK_LEFT]; + case GL_FRONT: + /* We might get this if back_to_front triggers above */ return &fb->Attachment[BUFFER_FRONT_LEFT]; case GL_DEPTH: return &fb->Attachment[BUFFER_DEPTH]; case GL_STENCIL: return &fb->Attachment[BUFFER_STENCIL]; + default: + unreachable("invalid attachment"); } } @@ -1625,18 +1622,46 @@ _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname); } +static bool +validate_framebuffer_parameter_extensions(GLenum pname, const char *func) +{ + GET_CURRENT_CONTEXT(ctx); + + if (!ctx->Extensions.ARB_framebuffer_no_attachments && + !ctx->Extensions.ARB_sample_locations && + !ctx->Extensions.MESA_framebuffer_flip_y) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s not supported " + "(none of ARB_framebuffer_no_attachments," + " ARB_sample_locations, or" + " MESA_framebuffer_flip_y extensions are available)", + func); + return false; + } + + /* + * If only the MESA_framebuffer_flip_y extension is enabled + * pname can only be GL_FRAMEBUFFER_FLIP_Y_MESA + */ + if (ctx->Extensions.MESA_framebuffer_flip_y && + pname != GL_FRAMEBUFFER_FLIP_Y_MESA && + !(ctx->Extensions.ARB_framebuffer_no_attachments || + ctx->Extensions.ARB_sample_locations)) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname); + return false; + } + + return true; +} + void GLAPIENTRY _mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param) { GET_CURRENT_CONTEXT(ctx); struct gl_framebuffer *fb; - if (!ctx->Extensions.ARB_framebuffer_no_attachments && - !ctx->Extensions.ARB_sample_locations) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glFramebufferParameteriv not supported " - "(neither ARB_framebuffer_no_attachments nor ARB_sample_locations" - " is available)"); + if (!validate_framebuffer_parameter_extensions(pname, + "glFramebufferParameteri")) { return; } @@ -1650,6 +1675,12 @@ framebuffer_parameteri(ctx, fb, pname, param, "glFramebufferParameteri"); } +void GLAPIENTRY +_mesa_FramebufferParameteriMESA(GLenum target, GLenum pname, GLint param) +{ + _mesa_FramebufferParameteri(target, pname, param); +} + static bool validate_get_framebuffer_parameteriv_pname(struct gl_context *ctx, struct gl_framebuffer *fb, @@ -1779,12 +1810,8 @@ GET_CURRENT_CONTEXT(ctx); struct gl_framebuffer *fb; - if (!ctx->Extensions.ARB_framebuffer_no_attachments && - !ctx->Extensions.ARB_sample_locations) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glGetFramebufferParameteriv not supported " - "(neither ARB_framebuffer_no_attachments nor ARB_sample_locations" - " is available)"); + if (!validate_framebuffer_parameter_extensions(pname, + "glGetFramebufferParameteriv")) { return; } @@ -1799,6 +1826,11 @@ "glGetFramebufferParameteriv"); } +void GLAPIENTRY +_mesa_GetFramebufferParameterivMESA(GLenum target, GLenum pname, GLint *params) +{ + _mesa_GetFramebufferParameteriv(target, pname, params); +} /** * Remove the specified renderbuffer or texture from any attachment point in @@ -2671,6 +2703,22 @@ } void GLAPIENTRY +_mesa_NamedRenderbufferStorageEXT(GLuint renderbuffer, GLenum internalformat, + GLsizei width, GLsizei height) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, renderbuffer); + if (!rb || rb == &DummyRenderbuffer) { + _mesa_HashLockMutex(ctx->Shared->RenderBuffers); + rb = allocate_renderbuffer_locked(ctx, renderbuffer, "glNamedRenderbufferStorageEXT"); + _mesa_HashUnlockMutex(ctx->Shared->RenderBuffers); + } + renderbuffer_storage(ctx, rb, internalformat, width, height, NO_SAMPLES, + 0, "glNamedRenderbufferStorageEXT"); +} + + +void GLAPIENTRY _mesa_NamedRenderbufferStorageMultisample(GLuint renderbuffer, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height) @@ -2682,6 +2730,25 @@ void GLAPIENTRY +_mesa_NamedRenderbufferStorageMultisampleEXT(GLuint renderbuffer, GLsizei samples, + GLenum internalformat, + GLsizei width, GLsizei height) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, renderbuffer); + if (!rb || rb == &DummyRenderbuffer) { + _mesa_HashLockMutex(ctx->Shared->RenderBuffers); + rb = allocate_renderbuffer_locked(ctx, renderbuffer, + "glNamedRenderbufferStorageMultisampleEXT"); + _mesa_HashUnlockMutex(ctx->Shared->RenderBuffers); + } + renderbuffer_storage(ctx, rb, internalformat, width, height, + samples, samples, + "glNamedRenderbufferStorageMultisample"); +} + + +void GLAPIENTRY _mesa_NamedRenderbufferStorageMultisampleAdvancedAMD( GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height) @@ -2780,6 +2847,24 @@ } +void GLAPIENTRY +_mesa_GetNamedRenderbufferParameterivEXT(GLuint renderbuffer, GLenum pname, + GLint *params) +{ + GET_CURRENT_CONTEXT(ctx); + + struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, renderbuffer); + if (!rb || rb == &DummyRenderbuffer) { + _mesa_HashLockMutex(ctx->Shared->RenderBuffers); + rb = allocate_renderbuffer_locked(ctx, renderbuffer, "glGetNamedRenderbufferParameterivEXT"); + _mesa_HashUnlockMutex(ctx->Shared->RenderBuffers); + } + + get_render_buffer_parameteriv(ctx, rb, pname, params, + "glGetNamedRenderbufferParameterivEXT"); +} + + GLboolean GLAPIENTRY _mesa_IsFramebuffer(GLuint framebuffer) { @@ -4643,6 +4728,63 @@ } +/* Helper function for ARB_framebuffer_no_attachments functions interacting with EXT_direct_state_access */ +static struct gl_framebuffer * +lookup_named_framebuffer_ext_dsa(struct gl_context *ctx, GLuint framebuffer, const char* caller) +{ + struct gl_framebuffer *fb = NULL; + + if (framebuffer) { + /* The ARB_framebuffer_no_attachments spec says: + * + * "The error INVALID_VALUE is generated if is not + * a name returned by GenFramebuffers. If a framebuffer object + * named does not yet exist, it will be created." + * + * This is different from the EXT_direct_state_access spec which says: + * + * "If the framebuffer object named by the framebuffer parameter has not + * been previously bound or has been deleted since the last binding, + * the GL first creates a new state vector in the same manner as when + * BindFramebuffer creates a new framebuffer object" + * + * So first we verify that the name exists. + */ + fb = _mesa_lookup_framebuffer(ctx, framebuffer); + if (!fb) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(frameBuffer)", caller); + return NULL; + } + /* Then, make sure it's initialized */ + if (fb == &DummyFramebuffer) { + fb = ctx->Driver.NewFramebuffer(ctx, framebuffer); + _mesa_HashInsert(ctx->Shared->FrameBuffers, framebuffer, fb); + } + } + else + fb = ctx->WinSysDrawBuffer; + + return fb; +} + + +void GLAPIENTRY +_mesa_NamedFramebufferParameteriEXT(GLuint framebuffer, GLenum pname, + GLint param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_framebuffer *fb = + lookup_named_framebuffer_ext_dsa(ctx, framebuffer, + "glNamedFramebufferParameteriEXT"); + + if (!fb) + return; + + framebuffer_parameteri(ctx, fb, pname, param, + "glNamedFramebufferParameteriEXT"); +} + + void GLAPIENTRY _mesa_GetFramebufferParameterivEXT(GLuint framebuffer, GLenum pname, GLint *param) @@ -4708,6 +4850,23 @@ } +void GLAPIENTRY +_mesa_GetNamedFramebufferParameterivEXT(GLuint framebuffer, GLenum pname, + GLint *param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_framebuffer *fb = + lookup_named_framebuffer_ext_dsa(ctx, framebuffer, + "glGetNamedFramebufferParameterivEXT"); + + if (!fb) + return; + + get_framebuffer_parameteriv(ctx, fb, pname, param, + "glGetNamedFramebufferParameterivEXT"); +} + + static void invalidate_framebuffer_storage(struct gl_context *ctx, struct gl_framebuffer *fb, @@ -4904,9 +5063,10 @@ GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT); bool has_both = false; for (int j = 0; j < numAttachments; j++) { - if (attachments[j] == other_format) + if (attachments[j] == other_format) { has_both = true; - break; + break; + } } if (fb->Attachment[BUFFER_DEPTH].Renderbuffer != diff -Nru mesa-19.2.8/src/mesa/main/fbobject.h mesa-20.0.8/src/mesa/main/fbobject.h --- mesa-19.2.8/src/mesa/main/fbobject.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/fbobject.h 2020-06-12 01:21:18.000000000 +0000 @@ -190,6 +190,9 @@ extern void GLAPIENTRY _mesa_NamedRenderbufferStorage(GLuint renderbuffer, GLenum internalformat, GLsizei width, GLsizei height); +extern void GLAPIENTRY +_mesa_NamedRenderbufferStorageEXT(GLuint renderbuffer, GLenum internalformat, + GLsizei width, GLsizei height); extern void GLAPIENTRY _mesa_NamedRenderbufferStorageMultisample(GLuint renderbuffer, GLsizei samples, @@ -197,6 +200,11 @@ GLsizei width, GLsizei height); extern void GLAPIENTRY +_mesa_NamedRenderbufferStorageMultisampleEXT(GLuint renderbuffer, GLsizei samples, + GLenum internalformat, + GLsizei width, GLsizei height); + +extern void GLAPIENTRY _mesa_NamedRenderbufferStorageMultisampleAdvancedAMD( GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height); @@ -361,6 +369,14 @@ GLint param); extern void GLAPIENTRY +_mesa_NamedFramebufferParameteriEXT(GLuint framebuffer, GLenum pname, + GLint param); + +extern void GLAPIENTRY +_mesa_GetNamedRenderbufferParameterivEXT(GLuint renderbuffer, GLenum pname, + GLint *params); + +extern void GLAPIENTRY _mesa_GetFramebufferParameterivEXT(GLuint framebuffer, GLenum pname, GLint *param); @@ -368,6 +384,10 @@ _mesa_GetNamedFramebufferParameteriv(GLuint framebuffer, GLenum pname, GLint *param); +extern void GLAPIENTRY +_mesa_GetNamedFramebufferParameterivEXT(GLuint framebuffer, GLenum pname, + GLint *param); + void GLAPIENTRY _mesa_InvalidateSubFramebuffer_no_error(GLenum target, GLsizei numAttachments, const GLenum *attachments, GLint x, @@ -405,9 +425,15 @@ _mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param); extern void GLAPIENTRY +_mesa_FramebufferParameteriMESA(GLenum target, GLenum pname, GLint param); + +extern void GLAPIENTRY _mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params); extern void GLAPIENTRY +_mesa_GetFramebufferParameterivMESA(GLenum target, GLenum pname, GLint *params); + +extern void GLAPIENTRY _mesa_FramebufferSampleLocationsfvARB(GLenum target, GLuint start, GLsizei count, const GLfloat *v); diff -Nru mesa-19.2.8/src/mesa/main/format_fallback.py mesa-20.0.8/src/mesa/main/format_fallback.py --- mesa-19.2.8/src/mesa/main/format_fallback.py 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/format_fallback.py 2020-06-12 01:21:18.000000000 +0000 @@ -85,6 +85,20 @@ yield rgbx_name, rgba_name +def get_intensity_to_red_map(formats): + names = set(fmt.name for fmt in formats) + + for fmt in formats: + if str(fmt.swizzle) != 'xxxx': + continue + + i_name = fmt.name + r_name = i_name.replace("_I_", "_R_") + + assert r_name in names + + yield i_name, r_name + TEMPLATE = Template(COPYRIGHT + """ #include "formats.h" #include "util/macros.h" @@ -129,6 +143,23 @@ } /** + * For an intensity format, return the corresponding red format. For other + * formats, return the format as-is. + */ +mesa_format +_mesa_get_intensity_format_red(mesa_format format) +{ + switch (format) { +%for i, r in intensity_to_red_map: + case ${i}: + return ${r}; +%endfor + default: + return format; + } +} + +/** * If the format has an alpha channel, and there exists a non-alpha * variant of the format with an identical bit layout, then return * the non-alpha format. Otherwise return the original format. @@ -164,6 +195,7 @@ template_env = { 'unorm_to_srgb_map': list(get_unorm_to_srgb_map(formats)), 'rgbx_to_rgba_map': list(get_rgbx_to_rgba_map(formats)), + 'intensity_to_red_map': list(get_intensity_to_red_map(formats)), } with open(pargs.out, 'w') as f: diff -Nru mesa-19.2.8/src/mesa/main/format_info.py mesa-20.0.8/src/mesa/main/format_info.py --- mesa-19.2.8/src/mesa/main/format_info.py 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/format_info.py 2020-06-12 01:21:18.000000000 +0000 @@ -176,9 +176,13 @@ def format_channel_bits(fmat, tuple_list): return ['.%s = %s' % (field, str(get_channel_bits(fmat, name))) for (field, name) in tuple_list] +bf_map = { + "GL_DEPTH_COMPONENT" : "MESA_ARRAY_FORMAT_BASE_FORMAT_DEPTH", + "GL_STENCIL_INDEX" : "MESA_ARRAY_FORMAT_BASE_FORMAT_STENCIL", +} for fmat in formats: - print(' {') + print(' [{0}] = {{'.format(fmat.name)) print(' .Name = {0},'.format(fmat.name)) print(' .StrName = "{0}",'.format(fmat.name)) print(' .Layout = {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper())) @@ -200,6 +204,7 @@ chan = fmat.array_element() norm = chan.norm or chan.type == parser.FLOAT print(' .ArrayFormat = MESA_ARRAY_FORMAT({0}),'.format(', '.join([ + bf_map.get(get_gl_base_format(fmat), "MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS"), str(chan.size // 8), str(int(chan.sign)), str(int(chan.type == parser.FLOAT)), diff -Nru mesa-19.2.8/src/mesa/main/formats.c mesa-20.0.8/src/mesa/main/formats.c --- mesa-19.2.8/src/mesa/main/formats.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/formats.c 2020-06-12 01:21:18.000000000 +0000 @@ -85,6 +85,13 @@ { const struct mesa_format_info *info = &format_info[format]; STATIC_ASSERT(ARRAY_SIZE(format_info) == MESA_FORMAT_COUNT); + + /* The MESA_FORMAT_* enums are sparse, don't return a format info + * for empty entries. + */ + if (info->Name == MESA_FORMAT_NONE && format != MESA_FORMAT_NONE) + return NULL; + assert(info->Name == format); return info; } @@ -95,6 +102,8 @@ _mesa_get_format_name(mesa_format format) { const struct mesa_format_info *info = _mesa_get_format_info(format); + if (!info) + return NULL; return info->StrName; } @@ -232,6 +241,15 @@ uint8_t swizzle[4]; int num_channels; + switch (_mesa_array_format_get_base_format(format)) { + case MESA_ARRAY_FORMAT_BASE_FORMAT_DEPTH: + return GL_DEPTH_COMPONENT; + case MESA_ARRAY_FORMAT_BASE_FORMAT_STENCIL: + return GL_STENCIL_INDEX; + case MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS: + break; + } + _mesa_array_format_get_swizzle(format, swizzle); num_channels = _mesa_array_format_get_num_channels(format); @@ -416,10 +434,11 @@ _mesa_format_to_array_format(mesa_format format) { const struct mesa_format_info *info = _mesa_get_format_info(format); - if (info->ArrayFormat && !_mesa_little_endian() && - info->Layout == MESA_FORMAT_LAYOUT_PACKED) +#if UTIL_ARCH_BIG_ENDIAN + if (info->ArrayFormat && info->Layout == MESA_FORMAT_LAYOUT_PACKED) return _mesa_array_format_flip_channels(info->ArrayFormat); else +#endif return info->ArrayFormat; } @@ -455,23 +474,21 @@ for (f = 1; f < MESA_FORMAT_COUNT; ++f) { info = _mesa_get_format_info(f); - if (!info->ArrayFormat) + if (!info || !info->ArrayFormat) continue; - if (_mesa_little_endian()) { - array_format = info->ArrayFormat; - } else { - array_format = _mesa_array_format_flip_channels(info->ArrayFormat); - } - - /* This can happen and does for some of the BGR formats. Let's take - * the first one in the list. + /* All sRGB formats should have an equivalent UNORM format, and that's + * the one we want in the table. */ - if (_mesa_hash_table_search_pre_hashed(format_array_format_table, - array_format, - (void *)(intptr_t)array_format)) + if (_mesa_is_format_srgb(f)) continue; +#if UTIL_ARCH_LITTLE_ENDIAN + array_format = info->ArrayFormat; +#else + array_format = _mesa_array_format_flip_channels(info->ArrayFormat); +#endif + _mesa_hash_table_insert_pre_hashed(format_array_format_table, array_format, (void *)(intptr_t)array_format, @@ -701,17 +718,17 @@ case MESA_FORMAT_R_RGTC1_SNORM: return MESA_FORMAT_R_SNORM8; case MESA_FORMAT_RG_RGTC2_UNORM: - return MESA_FORMAT_R8G8_UNORM; + return MESA_FORMAT_RG_UNORM8; case MESA_FORMAT_RG_RGTC2_SNORM: - return MESA_FORMAT_R8G8_SNORM; + return MESA_FORMAT_RG_SNORM8; case MESA_FORMAT_L_LATC1_UNORM: return MESA_FORMAT_L_UNORM8; case MESA_FORMAT_L_LATC1_SNORM: return MESA_FORMAT_L_SNORM8; case MESA_FORMAT_LA_LATC2_UNORM: - return MESA_FORMAT_L8A8_UNORM; + return MESA_FORMAT_LA_UNORM8; case MESA_FORMAT_LA_LATC2_SNORM: - return MESA_FORMAT_L8A8_SNORM; + return MESA_FORMAT_LA_SNORM8; case MESA_FORMAT_ETC1_RGB8: case MESA_FORMAT_ETC2_RGB8: case MESA_FORMAT_ETC2_SRGB8: @@ -729,7 +746,7 @@ return MESA_FORMAT_R_UNORM16; case MESA_FORMAT_ETC2_RG11_EAC: case MESA_FORMAT_ETC2_SIGNED_RG11_EAC: - return MESA_FORMAT_R16G16_UNORM; + return MESA_FORMAT_RG_UNORM16; case MESA_FORMAT_BPTC_RGBA_UNORM: case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM: return MESA_FORMAT_A8B8G8R8_UNORM; @@ -937,18 +954,14 @@ *comps = 2; return; - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_A8L8_UNORM: - case MESA_FORMAT_R8G8_UNORM: - case MESA_FORMAT_G8R8_UNORM: + case MESA_FORMAT_LA_UNORM8: + case MESA_FORMAT_RG_UNORM8: *datatype = GL_UNSIGNED_BYTE; *comps = 2; return; - case MESA_FORMAT_L16A16_UNORM: - case MESA_FORMAT_A16L16_UNORM: - case MESA_FORMAT_R16G16_UNORM: - case MESA_FORMAT_G16R16_UNORM: + case MESA_FORMAT_LA_UNORM16: + case MESA_FORMAT_RG_UNORM16: *datatype = GL_UNSIGNED_SHORT; *comps = 2; return; @@ -1061,9 +1074,8 @@ *datatype = GL_BYTE; *comps = 1; return; - case MESA_FORMAT_R8G8_SNORM: - case MESA_FORMAT_L8A8_SNORM: - case MESA_FORMAT_A8L8_SNORM: + case MESA_FORMAT_RG_SNORM8: + case MESA_FORMAT_LA_SNORM8: *datatype = GL_BYTE; *comps = 2; return; @@ -1086,7 +1098,7 @@ *datatype = GL_SHORT; *comps = 1; return; - case MESA_FORMAT_R16G16_SNORM: + case MESA_FORMAT_RG_SNORM16: case MESA_FORMAT_LA_SNORM16: *datatype = GL_SHORT; *comps = 2; @@ -1116,8 +1128,7 @@ *datatype = GL_UNSIGNED_BYTE; *comps = 1; return; - case MESA_FORMAT_L8A8_SRGB: - case MESA_FORMAT_A8L8_SRGB: + case MESA_FORMAT_LA_SRGB8: *datatype = GL_UNSIGNED_BYTE; *comps = 2; return; @@ -1291,10 +1302,6 @@ *datatype = GL_UNSIGNED_BYTE; *comps = 3; return; - case MESA_FORMAT_RGBA_UINT8: - *datatype = GL_UNSIGNED_BYTE; - *comps = 4; - return; case MESA_FORMAT_R_UINT16: *datatype = GL_UNSIGNED_SHORT; *comps = 1; @@ -1400,16 +1407,6 @@ *comps = 4; return; - case MESA_FORMAT_G8R8_SNORM: - *datatype = GL_BYTE; - *comps = 2; - return; - - case MESA_FORMAT_G16R16_SNORM: - *datatype = GL_SHORT; - *comps = 2; - return; - case MESA_FORMAT_B8G8R8X8_SRGB: case MESA_FORMAT_X8R8G8B8_SRGB: *datatype = GL_UNSIGNED_BYTE; @@ -1419,15 +1416,17 @@ case MESA_FORMAT_COUNT: assert(0); return; - default: + default: { + const char *name = _mesa_get_format_name(format); /* Warn if any formats are not handled */ _mesa_problem(NULL, "bad format %s in _mesa_uncompressed_format_to_type_and_comps", - _mesa_get_format_name(format)); + name ? name : "???"); assert(format == MESA_FORMAT_NONE || _mesa_is_format_compressed(format)); *datatype = 0; *comps = 1; } + } } /** @@ -1442,655 +1441,37 @@ * \return true if the formats match, false otherwise. */ bool -_mesa_format_matches_format_and_type(mesa_format mesa_format, +_mesa_format_matches_format_and_type(mesa_format mformat, GLenum format, GLenum type, bool swapBytes, GLenum *error) { - const bool littleEndian = _mesa_little_endian(); if (error) *error = GL_NO_ERROR; - /* Note: When reading a GL format/type combination, the format lists channel - * assignments from most significant channel in the type to least - * significant. A type with _REV indicates that the assignments are - * swapped, so they are listed from least significant to most significant. - * - * Compressed formats will fall through and return false. - * - * For sanity, please keep this switch statement ordered the same as the - * enums in formats.h. - */ - - switch (mesa_format) { - - case MESA_FORMAT_NONE: - case MESA_FORMAT_COUNT: - return false; - - case MESA_FORMAT_A8B8G8R8_UNORM: - case MESA_FORMAT_A8B8G8R8_SRGB: - if (format == GL_RGBA && type == GL_UNSIGNED_INT_8_8_8_8 && !swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_INT_8_8_8_8_REV && swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_BYTE && !littleEndian) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_INT_8_8_8_8_REV - && !swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_INT_8_8_8_8 - && swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_BYTE && littleEndian) - return true; - - return false; - - case MESA_FORMAT_R8G8B8A8_UNORM: - case MESA_FORMAT_R8G8B8A8_SRGB: - if (format == GL_RGBA && type == GL_UNSIGNED_INT_8_8_8_8_REV && - !swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_INT_8_8_8_8 && swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_BYTE && littleEndian) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_INT_8_8_8_8 && - !swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_INT_8_8_8_8_REV && - swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_BYTE && !littleEndian) - return true; - - return false; - - case MESA_FORMAT_B8G8R8A8_UNORM: - case MESA_FORMAT_B8G8R8A8_SRGB: - if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV && - !swapBytes) - return true; - - if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8 && swapBytes) - return true; - - if (format == GL_BGRA && type == GL_UNSIGNED_BYTE && littleEndian) - return true; - - return false; - - case MESA_FORMAT_A8R8G8B8_UNORM: - case MESA_FORMAT_A8R8G8B8_SRGB: - if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8 && !swapBytes) - return true; - - if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV && - swapBytes) - return true; - - if (format == GL_BGRA && type == GL_UNSIGNED_BYTE && !littleEndian) - return true; - - return false; - - case MESA_FORMAT_X8B8G8R8_UNORM: - case MESA_FORMAT_R8G8B8X8_UNORM: - return false; - - case MESA_FORMAT_B8G8R8X8_UNORM: - case MESA_FORMAT_X8R8G8B8_UNORM: - return false; - - case MESA_FORMAT_BGR_UNORM8: - case MESA_FORMAT_BGR_SRGB8: - return format == GL_BGR && type == GL_UNSIGNED_BYTE && littleEndian; - - case MESA_FORMAT_RGB_UNORM8: - return format == GL_RGB && type == GL_UNSIGNED_BYTE && littleEndian; - - case MESA_FORMAT_B5G6R5_UNORM: - return ((format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) || - (format == GL_BGR && type == GL_UNSIGNED_SHORT_5_6_5_REV)) && - !swapBytes; - - case MESA_FORMAT_R5G6B5_UNORM: - return ((format == GL_BGR && type == GL_UNSIGNED_SHORT_5_6_5) || - (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5_REV)) && - !swapBytes; - - case MESA_FORMAT_B4G4R4A4_UNORM: - return format == GL_BGRA && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && - !swapBytes; - - case MESA_FORMAT_A4R4G4B4_UNORM: - return false; - - case MESA_FORMAT_A1B5G5R5_UNORM: - return format == GL_RGBA && type == GL_UNSIGNED_SHORT_5_5_5_1 && - !swapBytes; - - case MESA_FORMAT_X1B5G5R5_UNORM: - return format == GL_RGB && type == GL_UNSIGNED_SHORT_5_5_5_1 && - !swapBytes; - - case MESA_FORMAT_B5G5R5A1_UNORM: - return format == GL_BGRA && type == GL_UNSIGNED_SHORT_1_5_5_5_REV && - !swapBytes; - - case MESA_FORMAT_A1R5G5B5_UNORM: - return format == GL_BGRA && type == GL_UNSIGNED_SHORT_5_5_5_1 && - !swapBytes; - - case MESA_FORMAT_L4A4_UNORM: - return false; - case MESA_FORMAT_L8A8_UNORM: - case MESA_FORMAT_L8A8_SRGB: - return format == GL_LUMINANCE_ALPHA && type == GL_UNSIGNED_BYTE && littleEndian; - case MESA_FORMAT_A8L8_UNORM: - case MESA_FORMAT_A8L8_SRGB: - return false; - - case MESA_FORMAT_L16A16_UNORM: - return format == GL_LUMINANCE_ALPHA && type == GL_UNSIGNED_SHORT && littleEndian && !swapBytes; - case MESA_FORMAT_A16L16_UNORM: - return false; - - case MESA_FORMAT_B2G3R3_UNORM: - return format == GL_RGB && type == GL_UNSIGNED_BYTE_3_3_2; - - case MESA_FORMAT_R3G3B2_UNORM: - return format == GL_RGB && type == GL_UNSIGNED_BYTE_2_3_3_REV; - - case MESA_FORMAT_A4B4G4R4_UNORM: - if (format == GL_RGBA && type == GL_UNSIGNED_SHORT_4_4_4_4 && !swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && !swapBytes) - return true; - - return false; - - case MESA_FORMAT_R4G4B4A4_UNORM: - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_SHORT_4_4_4_4 && !swapBytes) - return true; - - if (format == GL_ABGR_EXT && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && !swapBytes) - return true; - - if (format == GL_RGBA && type == GL_UNSIGNED_SHORT_4_4_4_4 && swapBytes) - return true; - - return false; - - case MESA_FORMAT_R5G5B5A1_UNORM: - return format == GL_RGBA && type == GL_UNSIGNED_SHORT_1_5_5_5_REV; - - case MESA_FORMAT_A2B10G10R10_UNORM: - return format == GL_RGBA && type == GL_UNSIGNED_INT_10_10_10_2; - - case MESA_FORMAT_A2B10G10R10_UINT: - return format == GL_RGBA_INTEGER_EXT && type == GL_UNSIGNED_INT_10_10_10_2; - - case MESA_FORMAT_A2R10G10B10_UNORM: - return format == GL_BGRA && type == GL_UNSIGNED_INT_10_10_10_2; - - case MESA_FORMAT_A2R10G10B10_UINT: - return format == GL_BGRA_INTEGER_EXT && type == GL_UNSIGNED_INT_10_10_10_2; - - case MESA_FORMAT_A_UNORM8: - return format == GL_ALPHA && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_A_UNORM16: - return format == GL_ALPHA && type == GL_UNSIGNED_SHORT && !swapBytes; - case MESA_FORMAT_L_UNORM8: - case MESA_FORMAT_L_SRGB8: - return format == GL_LUMINANCE && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_L_UNORM16: - return format == GL_LUMINANCE && type == GL_UNSIGNED_SHORT && !swapBytes; - case MESA_FORMAT_I_UNORM8: - return format == GL_RED && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_I_UNORM16: - return format == GL_RED && type == GL_UNSIGNED_SHORT && !swapBytes; - - case MESA_FORMAT_YCBCR: - return format == GL_YCBCR_MESA && - ((type == GL_UNSIGNED_SHORT_8_8_MESA && littleEndian != swapBytes) || - (type == GL_UNSIGNED_SHORT_8_8_REV_MESA && littleEndian == swapBytes)); - case MESA_FORMAT_YCBCR_REV: - return format == GL_YCBCR_MESA && - ((type == GL_UNSIGNED_SHORT_8_8_MESA && littleEndian == swapBytes) || - (type == GL_UNSIGNED_SHORT_8_8_REV_MESA && littleEndian != swapBytes)); - - case MESA_FORMAT_R_UNORM8: - case MESA_FORMAT_R_SRGB8: - return format == GL_RED && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_R8G8_UNORM: - return format == GL_RG && type == GL_UNSIGNED_BYTE && littleEndian; - case MESA_FORMAT_G8R8_UNORM: - return false; - - case MESA_FORMAT_R_UNORM16: - return format == GL_RED && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_R16G16_UNORM: - return format == GL_RG && type == GL_UNSIGNED_SHORT && littleEndian && - !swapBytes; - case MESA_FORMAT_G16R16_UNORM: - return false; - - case MESA_FORMAT_B10G10R10A2_UNORM: - return format == GL_BGRA && type == GL_UNSIGNED_INT_2_10_10_10_REV && - !swapBytes; - - case MESA_FORMAT_S8_UINT_Z24_UNORM: - return format == GL_DEPTH_STENCIL && type == GL_UNSIGNED_INT_24_8 && - !swapBytes; - case MESA_FORMAT_X8_UINT_Z24_UNORM: - case MESA_FORMAT_Z24_UNORM_S8_UINT: - return false; - - case MESA_FORMAT_Z_UNORM16: - return format == GL_DEPTH_COMPONENT && type == GL_UNSIGNED_SHORT && - !swapBytes; - - case MESA_FORMAT_Z24_UNORM_X8_UINT: - return false; - - case MESA_FORMAT_Z_UNORM32: - return format == GL_DEPTH_COMPONENT && type == GL_UNSIGNED_INT && - !swapBytes; - - case MESA_FORMAT_S_UINT8: - return format == GL_STENCIL_INDEX && type == GL_UNSIGNED_BYTE; - - case MESA_FORMAT_RGBA_FLOAT32: - return format == GL_RGBA && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_RGBA_FLOAT16: - return format == GL_RGBA && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_RGB_FLOAT32: - return format == GL_RGB && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_RGB_FLOAT16: - return format == GL_RGB && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_A_FLOAT32: - return format == GL_ALPHA && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_A_FLOAT16: - return format == GL_ALPHA && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_L_FLOAT32: - return format == GL_LUMINANCE && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_L_FLOAT16: - return format == GL_LUMINANCE && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_LA_FLOAT32: - return format == GL_LUMINANCE_ALPHA && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_LA_FLOAT16: - return format == GL_LUMINANCE_ALPHA && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_I_FLOAT32: - return format == GL_RED && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_I_FLOAT16: - return format == GL_RED && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_R_FLOAT32: - return format == GL_RED && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_R_FLOAT16: - return format == GL_RED && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_RG_FLOAT32: - return format == GL_RG && type == GL_FLOAT && !swapBytes; - case MESA_FORMAT_RG_FLOAT16: - return format == GL_RG && type == GL_HALF_FLOAT && !swapBytes; - - case MESA_FORMAT_A_UINT8: - return format == GL_ALPHA_INTEGER && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_A_UINT16: - return format == GL_ALPHA_INTEGER && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_A_UINT32: - return format == GL_ALPHA_INTEGER && type == GL_UNSIGNED_INT && - !swapBytes; - case MESA_FORMAT_A_SINT8: - return format == GL_ALPHA_INTEGER && type == GL_BYTE; - case MESA_FORMAT_A_SINT16: - return format == GL_ALPHA_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_A_SINT32: - return format == GL_ALPHA_INTEGER && type == GL_INT && !swapBytes; - - case MESA_FORMAT_I_UINT8: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_I_UINT16: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_SHORT && !swapBytes; - case MESA_FORMAT_I_UINT32: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_INT && !swapBytes; - case MESA_FORMAT_I_SINT8: - return format == GL_RED_INTEGER && type == GL_BYTE; - case MESA_FORMAT_I_SINT16: - return format == GL_RED_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_I_SINT32: - return format == GL_RED_INTEGER && type == GL_INT && !swapBytes; - - case MESA_FORMAT_L_UINT8: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_L_UINT16: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_L_UINT32: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_UNSIGNED_INT && - !swapBytes; - case MESA_FORMAT_L_SINT8: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_BYTE; - case MESA_FORMAT_L_SINT16: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_SHORT && - !swapBytes; - case MESA_FORMAT_L_SINT32: - return format == GL_LUMINANCE_INTEGER_EXT && type == GL_INT && !swapBytes; - - case MESA_FORMAT_LA_UINT8: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && - type == GL_UNSIGNED_BYTE && !swapBytes; - case MESA_FORMAT_LA_UINT16: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && - type == GL_UNSIGNED_SHORT && !swapBytes; - case MESA_FORMAT_LA_UINT32: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && - type == GL_UNSIGNED_INT && !swapBytes; - case MESA_FORMAT_LA_SINT8: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && type == GL_BYTE && - !swapBytes; - case MESA_FORMAT_LA_SINT16: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && type == GL_SHORT && - !swapBytes; - case MESA_FORMAT_LA_SINT32: - return format == GL_LUMINANCE_ALPHA_INTEGER_EXT && type == GL_INT && - !swapBytes; - - case MESA_FORMAT_R_SINT8: - return format == GL_RED_INTEGER && type == GL_BYTE; - case MESA_FORMAT_RG_SINT8: - return format == GL_RG_INTEGER && type == GL_BYTE && !swapBytes; - case MESA_FORMAT_RGB_SINT8: - return format == GL_RGB_INTEGER && type == GL_BYTE && !swapBytes; - case MESA_FORMAT_RGBA_SINT8: - return format == GL_RGBA_INTEGER && type == GL_BYTE && !swapBytes; - case MESA_FORMAT_R_SINT16: - return format == GL_RED_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_RG_SINT16: - return format == GL_RG_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_RGB_SINT16: - return format == GL_RGB_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_RGBA_SINT16: - return format == GL_RGBA_INTEGER && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_R_SINT32: - return format == GL_RED_INTEGER && type == GL_INT && !swapBytes; - case MESA_FORMAT_RG_SINT32: - return format == GL_RG_INTEGER && type == GL_INT && !swapBytes; - case MESA_FORMAT_RGB_SINT32: - return format == GL_RGB_INTEGER && type == GL_INT && !swapBytes; - case MESA_FORMAT_RGBA_SINT32: - return format == GL_RGBA_INTEGER && type == GL_INT && !swapBytes; - - case MESA_FORMAT_R_UINT8: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_BYTE; - case MESA_FORMAT_RG_UINT8: - return format == GL_RG_INTEGER && type == GL_UNSIGNED_BYTE && !swapBytes; - case MESA_FORMAT_RGB_UINT8: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_BYTE && !swapBytes; - case MESA_FORMAT_RGBA_UINT8: - return format == GL_RGBA_INTEGER && type == GL_UNSIGNED_BYTE && - !swapBytes; - case MESA_FORMAT_R_UINT16: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_RG_UINT16: - return format == GL_RG_INTEGER && type == GL_UNSIGNED_SHORT && !swapBytes; - case MESA_FORMAT_RGB_UINT16: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_RGBA_UINT16: - return format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT && - !swapBytes; - case MESA_FORMAT_R_UINT32: - return format == GL_RED_INTEGER && type == GL_UNSIGNED_INT && !swapBytes; - case MESA_FORMAT_RG_UINT32: - return format == GL_RG_INTEGER && type == GL_UNSIGNED_INT && !swapBytes; - case MESA_FORMAT_RGB_UINT32: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_INT && !swapBytes; - case MESA_FORMAT_RGBA_UINT32: - return format == GL_RGBA_INTEGER && type == GL_UNSIGNED_INT && !swapBytes; - - case MESA_FORMAT_R_SNORM8: - return format == GL_RED && type == GL_BYTE; - case MESA_FORMAT_R8G8_SNORM: - return format == GL_RG && type == GL_BYTE && littleEndian && - !swapBytes; - case MESA_FORMAT_X8B8G8R8_SNORM: - return false; - - case MESA_FORMAT_A8B8G8R8_SNORM: - if (format == GL_RGBA && type == GL_BYTE && !littleEndian) - return true; - - if (format == GL_ABGR_EXT && type == GL_BYTE && littleEndian) - return true; - - return false; - - case MESA_FORMAT_R8G8B8A8_SNORM: - if (format == GL_RGBA && type == GL_BYTE && littleEndian) - return true; - - if (format == GL_ABGR_EXT && type == GL_BYTE && !littleEndian) - return true; - - return false; - - case MESA_FORMAT_R_SNORM16: - return format == GL_RED && type == GL_SHORT && - !swapBytes; - case MESA_FORMAT_R16G16_SNORM: - return format == GL_RG && type == GL_SHORT && littleEndian && !swapBytes; - case MESA_FORMAT_RGB_SNORM16: - return format == GL_RGB && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_RGBA_SNORM16: - return format == GL_RGBA && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_RGBA_UNORM16: - return format == GL_RGBA && type == GL_UNSIGNED_SHORT && - !swapBytes; - - case MESA_FORMAT_A_SNORM8: - return format == GL_ALPHA && type == GL_BYTE; - case MESA_FORMAT_L_SNORM8: - return format == GL_LUMINANCE && type == GL_BYTE; - case MESA_FORMAT_L8A8_SNORM: - return format == GL_LUMINANCE_ALPHA && type == GL_BYTE && - littleEndian && !swapBytes; - case MESA_FORMAT_A8L8_SNORM: - return format == GL_LUMINANCE_ALPHA && type == GL_BYTE && - !littleEndian && !swapBytes; - case MESA_FORMAT_I_SNORM8: - return format == GL_RED && type == GL_BYTE; - case MESA_FORMAT_A_SNORM16: - return format == GL_ALPHA && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_L_SNORM16: - return format == GL_LUMINANCE && type == GL_SHORT && !swapBytes; - case MESA_FORMAT_LA_SNORM16: - return format == GL_LUMINANCE_ALPHA && type == GL_SHORT && - littleEndian && !swapBytes; - case MESA_FORMAT_I_SNORM16: - return format == GL_RED && type == GL_SHORT && littleEndian && - !swapBytes; - - case MESA_FORMAT_B10G10R10A2_UINT: - return (format == GL_BGRA_INTEGER_EXT && - type == GL_UNSIGNED_INT_2_10_10_10_REV && - !swapBytes); - - case MESA_FORMAT_R10G10B10A2_UINT: - return (format == GL_RGBA_INTEGER_EXT && - type == GL_UNSIGNED_INT_2_10_10_10_REV && - !swapBytes); - - case MESA_FORMAT_B5G6R5_UINT: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_SHORT_5_6_5; - - case MESA_FORMAT_R5G6B5_UINT: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_SHORT_5_6_5_REV; - - case MESA_FORMAT_B2G3R3_UINT: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_BYTE_3_3_2; - - case MESA_FORMAT_R3G3B2_UINT: - return format == GL_RGB_INTEGER && type == GL_UNSIGNED_BYTE_2_3_3_REV; - - case MESA_FORMAT_A4B4G4R4_UINT: - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_4_4_4_4 && !swapBytes) - return true; - - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && swapBytes) - return true; - return false; - - case MESA_FORMAT_R4G4B4A4_UINT: - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && !swapBytes) - return true; - - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_4_4_4_4 && swapBytes) - return true; - - return false; - - case MESA_FORMAT_B4G4R4A4_UINT: - return format == GL_BGRA_INTEGER && type == GL_UNSIGNED_SHORT_4_4_4_4_REV && - !swapBytes; - - case MESA_FORMAT_A4R4G4B4_UINT: - return false; - - case MESA_FORMAT_A1B5G5R5_UINT: - return format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_5_5_5_1 && - !swapBytes; - - case MESA_FORMAT_B5G5R5A1_UINT: - return format == GL_BGRA_INTEGER && type == GL_UNSIGNED_SHORT_1_5_5_5_REV && - !swapBytes; - - case MESA_FORMAT_A1R5G5B5_UINT: - return format == GL_BGRA_INTEGER && type == GL_UNSIGNED_SHORT_5_5_5_1 && - !swapBytes; - - case MESA_FORMAT_R5G5B5A1_UINT: - return format == GL_RGBA_INTEGER && type == GL_UNSIGNED_SHORT_1_5_5_5_REV; - - case MESA_FORMAT_A8B8G8R8_UINT: - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8 && !swapBytes) - return true; - - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8_REV && swapBytes) - return true; - return false; - - case MESA_FORMAT_A8R8G8B8_UINT: - if (format == GL_BGRA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8 && - !swapBytes) - return true; - - if (format == GL_BGRA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8_REV && - swapBytes) - return true; - - return false; - - case MESA_FORMAT_R8G8B8A8_UINT: - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8_REV && - !swapBytes) - return true; - - if (format == GL_RGBA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8 && swapBytes) - return true; - + if (_mesa_is_format_compressed(mformat)) { + if (error) + *error = GL_INVALID_ENUM; return false; + } - case MESA_FORMAT_B8G8R8A8_UINT: - if (format == GL_BGRA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8_REV && - !swapBytes) - return true; - - if (format == GL_BGRA_INTEGER && type == GL_UNSIGNED_INT_8_8_8_8 && swapBytes) - return true; - + if (swapBytes && !_mesa_swap_bytes_in_type_enum(&type)) return false; - case MESA_FORMAT_R9G9B9E5_FLOAT: - return format == GL_RGB && type == GL_UNSIGNED_INT_5_9_9_9_REV && - !swapBytes; - - case MESA_FORMAT_R11G11B10_FLOAT: - return format == GL_RGB && type == GL_UNSIGNED_INT_10F_11F_11F_REV && - !swapBytes; - - case MESA_FORMAT_Z_FLOAT32: - return format == GL_DEPTH_COMPONENT && type == GL_FLOAT && !swapBytes; + /* format/type don't include srgb and should match regardless of it. */ + mformat = _mesa_get_srgb_format_linear(mformat); - case MESA_FORMAT_Z32_FLOAT_S8X24_UINT: - return format == GL_DEPTH_STENCIL && - type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV && !swapBytes; + /* intensity formats are uploaded with GL_RED, and we want to find + * memcpy matches for them. + */ + mformat = _mesa_get_intensity_format_red(mformat); - case MESA_FORMAT_B4G4R4X4_UNORM: - case MESA_FORMAT_B5G5R5X1_UNORM: - case MESA_FORMAT_R8G8B8X8_SNORM: - case MESA_FORMAT_R8G8B8X8_SRGB: - case MESA_FORMAT_X8B8G8R8_SRGB: - case MESA_FORMAT_RGBX_UINT8: - case MESA_FORMAT_RGBX_SINT8: - case MESA_FORMAT_B10G10R10X2_UNORM: - case MESA_FORMAT_RGBX_UNORM16: - case MESA_FORMAT_RGBX_SNORM16: - case MESA_FORMAT_RGBX_FLOAT16: - case MESA_FORMAT_RGBX_UINT16: - case MESA_FORMAT_RGBX_SINT16: - case MESA_FORMAT_RGBX_FLOAT32: - case MESA_FORMAT_RGBX_UINT32: - case MESA_FORMAT_RGBX_SINT32: + if (format == GL_COLOR_INDEX) return false; - case MESA_FORMAT_R10G10B10X2_UNORM: - return format == GL_RGB && type == GL_UNSIGNED_INT_2_10_10_10_REV && - !swapBytes; - case MESA_FORMAT_R10G10B10A2_UNORM: - return format == GL_RGBA && type == GL_UNSIGNED_INT_2_10_10_10_REV && - !swapBytes; - - case MESA_FORMAT_G8R8_SNORM: - return format == GL_RG && type == GL_BYTE && !littleEndian && - !swapBytes; - - case MESA_FORMAT_G16R16_SNORM: - return format == GL_RG && type == GL_SHORT && !littleEndian && - !swapBytes; + mesa_format other_format = _mesa_format_from_format_and_type(format, type); + if (_mesa_format_is_mesa_array_format(other_format)) + other_format = _mesa_format_from_array_format(other_format); - case MESA_FORMAT_B8G8R8X8_SRGB: - case MESA_FORMAT_X8R8G8B8_SRGB: - return false; - default: - assert(_mesa_is_format_compressed(mesa_format)); - if (error) - *error = GL_INVALID_ENUM; - } - return false; + return other_format == mformat; } diff -Nru mesa-19.2.8/src/mesa/main/formats.csv mesa-20.0.8/src/mesa/main/formats.csv --- mesa-19.2.8/src/mesa/main/formats.csv 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/formats.csv 2020-06-12 01:21:18.000000000 +0000 @@ -60,8 +60,6 @@ MESA_FORMAT_B8G8R8X8_UNORM , packed, 1, 1, 1, un8 , un8 , un8 , x8 , zyx1, rgb MESA_FORMAT_A8R8G8B8_UNORM , packed, 1, 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb MESA_FORMAT_X8R8G8B8_UNORM , packed, 1, 1, 1, x8 , un8 , un8 , un8 , yzw1, rgb -MESA_FORMAT_L16A16_UNORM , packed, 1, 1, 1, un16, un16, , , xxxy, rgb -MESA_FORMAT_A16L16_UNORM , packed, 1, 1, 1, un16, un16, , , yyyx, rgb MESA_FORMAT_B5G6R5_UNORM , packed, 1, 1, 1, un5 , un6 , un5 , , zyx1, rgb MESA_FORMAT_R5G6B5_UNORM , packed, 1, 1, 1, un5 , un6 , un5 , , xyz1, rgb MESA_FORMAT_B4G4R4A4_UNORM , packed, 1, 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb @@ -72,15 +70,9 @@ MESA_FORMAT_B5G5R5A1_UNORM , packed, 1, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb MESA_FORMAT_B5G5R5X1_UNORM , packed, 1, 1, 1, un5 , un5 , un5 , x1 , zyx1, rgb MESA_FORMAT_A1R5G5B5_UNORM , packed, 1, 1, 1, un1 , un5 , un5 , un5 , yzwx, rgb -MESA_FORMAT_L8A8_UNORM , packed, 1, 1, 1, un8 , un8 , , , xxxy, rgb -MESA_FORMAT_A8L8_UNORM , packed, 1, 1, 1, un8 , un8 , , , yyyx, rgb -MESA_FORMAT_R8G8_UNORM , packed, 1, 1, 1, un8 , un8 , , , xy01, rgb -MESA_FORMAT_G8R8_UNORM , packed, 1, 1, 1, un8 , un8 , , , yx01, rgb MESA_FORMAT_L4A4_UNORM , packed, 1, 1, 1, un4 , un4 , , , xxxy, rgb MESA_FORMAT_B2G3R3_UNORM , packed, 1, 1, 1, un2 , un3 , un3 , , zyx1, rgb -MESA_FORMAT_R16G16_UNORM , packed, 1, 1, 1, un16, un16, , , xy01, rgb -MESA_FORMAT_G16R16_UNORM , packed, 1, 1, 1, un16, un16, , , yx01, rgb MESA_FORMAT_B10G10R10A2_UNORM , packed, 1, 1, 1, un10, un10, un10, un2 , zyxw, rgb MESA_FORMAT_B10G10R10X2_UNORM , packed, 1, 1, 1, un10, un10, un10, x2 , zyx1, rgb MESA_FORMAT_R10G10B10A2_UNORM , packed, 1, 1, 1, un10, un10, un10, un2 , xyzw, rgb @@ -106,10 +98,14 @@ MESA_FORMAT_A_UNORM16 , array , 1, 1, 1, un16, , , , 000x, rgb MESA_FORMAT_L_UNORM8 , array , 1, 1, 1, un8 , , , , xxx1, rgb MESA_FORMAT_L_UNORM16 , array , 1, 1, 1, un16, , , , xxx1, rgb +MESA_FORMAT_LA_UNORM8 , array , 1, 1, 1, un8 , un8 , , , xxxy, rgb +MESA_FORMAT_LA_UNORM16 , array , 1, 1, 1, un16, un16, , , xxxy, rgb MESA_FORMAT_I_UNORM8 , array , 1, 1, 1, un8 , , , , xxxx, rgb MESA_FORMAT_I_UNORM16 , array , 1, 1, 1, un16, , , , xxxx, rgb MESA_FORMAT_R_UNORM8 , array , 1, 1, 1, un8 , , , , x001, rgb MESA_FORMAT_R_UNORM16 , array , 1, 1, 1, un16, , , , x001, rgb +MESA_FORMAT_RG_UNORM8 , array , 1, 1, 1, un8 , un8 , , , xy01, rgb +MESA_FORMAT_RG_UNORM16 , array , 1, 1, 1, un16, un16, , , xy01, rgb MESA_FORMAT_BGR_UNORM8 , array , 1, 1, 1, un8 , un8 , un8 , , zyx1, rgb MESA_FORMAT_RGB_UNORM8 , array , 1, 1, 1, un8 , un8 , un8 , , xyz1, rgb MESA_FORMAT_RGBA_UNORM16 , array , 1, 1, 1, un16, un16, un16, un16, xyzw, rgb @@ -124,12 +120,6 @@ MESA_FORMAT_X8B8G8R8_SNORM , packed, 1, 1, 1, x8 , sn8 , sn8 , sn8 , wzy1, rgb MESA_FORMAT_R8G8B8A8_SNORM , packed, 1, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb MESA_FORMAT_R8G8B8X8_SNORM , packed, 1, 1, 1, sn8 , sn8 , sn8 , x8 , xyz1, rgb -MESA_FORMAT_R16G16_SNORM , packed, 1, 1, 1, sn16, sn16, , , xy01, rgb -MESA_FORMAT_G16R16_SNORM , packed, 1, 1, 1, sn16, sn16, , , yx01, rgb -MESA_FORMAT_R8G8_SNORM , packed, 1, 1, 1, sn8 , sn8 , , , xy01, rgb -MESA_FORMAT_G8R8_SNORM , packed, 1, 1, 1, sn8 , sn8 , , , yx01, rgb -MESA_FORMAT_L8A8_SNORM , packed, 1, 1, 1, sn8 , sn8 , , , xxxy, rgb -MESA_FORMAT_A8L8_SNORM , packed, 1, 1, 1, sn8 , sn8 , , , yyyx, rgb # Array signed/normalized formats MESA_FORMAT_A_SNORM8 , array , 1, 1, 1, sn8 , , , , 000x, rgb @@ -140,7 +130,10 @@ MESA_FORMAT_I_SNORM16 , array , 1, 1, 1, sn16, , , , xxxx, rgb MESA_FORMAT_R_SNORM8 , array , 1, 1, 1, sn8 , , , , x001, rgb MESA_FORMAT_R_SNORM16 , array , 1, 1, 1, sn16, , , , x001, rgb +MESA_FORMAT_LA_SNORM8 , array , 1, 1, 1, sn8 , sn8 , , , xxxy, rgb MESA_FORMAT_LA_SNORM16 , array , 1, 1, 1, sn16, sn16, , , xxxy, rgb +MESA_FORMAT_RG_SNORM8 , array , 1, 1, 1, sn8 , sn8 , , , xy01, rgb +MESA_FORMAT_RG_SNORM16 , array , 1, 1, 1, sn16, sn16, , , xy01, rgb MESA_FORMAT_RGB_SNORM16 , array , 1, 1, 1, sn16, sn16, sn16, , xyz1, rgb MESA_FORMAT_RGBA_SNORM16 , array , 1, 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb MESA_FORMAT_RGBX_SNORM16 , array , 1, 1, 1, sn16, sn16, sn16, x16 , xyz1, rgb @@ -154,12 +147,11 @@ MESA_FORMAT_R8G8B8A8_SRGB , packed, 1, 1, 1, un8 , un8 , un8 , un8 , xyzw, srgb MESA_FORMAT_R8G8B8X8_SRGB , packed, 1, 1, 1, un8 , un8 , un8 , x8 , xyz1, srgb MESA_FORMAT_X8B8G8R8_SRGB , packed, 1, 1, 1, x8 , un8 , un8 , un8 , wzy1, srgb -MESA_FORMAT_L8A8_SRGB , packed, 1, 1, 1, un8 , un8 , , , xxxy, srgb -MESA_FORMAT_A8L8_SRGB , packed, 1, 1, 1, un8 , un8 , , , yyyx, srgb # Array sRGB formats MESA_FORMAT_R_SRGB8 , array , 1, 1, 1, un8 , , , , x001, srgb MESA_FORMAT_L_SRGB8 , array , 1, 1, 1, un8 , , , , xxx1, srgb +MESA_FORMAT_LA_SRGB8 , array , 1, 1, 1, un8 , un8 , , , xxxy, srgb MESA_FORMAT_BGR_SRGB8 , array , 1, 1, 1, un8 , un8 , un8 , , zyx1, srgb # Packed float formats @@ -260,7 +252,6 @@ MESA_FORMAT_RGB_SINT16 , array , 1, 1, 1, s16 , s16 , s16 , , xyz1, rgb MESA_FORMAT_RGB_SINT32 , array , 1, 1, 1, s32 , s32 , s32 , , xyz1, rgb -MESA_FORMAT_RGBA_UINT8 , array , 1, 1, 1, u8 , u8 , u8 , u8 , xyzw, rgb MESA_FORMAT_RGBA_UINT16 , array , 1, 1, 1, u16 , u16 , u16 , u16 , xyzw, rgb MESA_FORMAT_RGBA_UINT32 , array , 1, 1, 1, u32 , u32 , u32 , u32 , xyzw, rgb MESA_FORMAT_RGBA_SINT8 , array , 1, 1, 1, s8 , s8 , s8 , s8 , xyzw, rgb diff -Nru mesa-19.2.8/src/mesa/main/formats.h mesa-20.0.8/src/mesa/main/formats.h --- mesa-19.2.8/src/mesa/main/formats.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/formats.h 2020-06-12 01:21:18.000000000 +0000 @@ -36,6 +36,8 @@ #include #include #include +#include "gallium/include/pipe/p_format.h" +#include "util/u_endian.h" #ifdef __cplusplus extern "C" { @@ -109,6 +111,12 @@ MESA_ARRAY_FORMAT_TYPE_FLOAT = 0xe, }; +enum mesa_array_format_base_format { + MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS = 0x0, + MESA_ARRAY_FORMAT_BASE_FORMAT_DEPTH = 0x1, + MESA_ARRAY_FORMAT_BASE_FORMAT_STENCIL = 0x2, +}; + /** * An enum useful to encode/decode information stored in a mesa_array_format */ @@ -124,11 +132,12 @@ MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK = 0x03800, MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK = 0x1c000, MESA_ARRAY_FORMAT_SWIZZLE_W_MASK = 0xe0000, + MESA_ARRAY_FORMAT_BASE_FORMAT_MASK = 0x300000, MESA_ARRAY_FORMAT_BIT = 0x80000000 }; -#define MESA_ARRAY_FORMAT(SIZE, SIGNED, IS_FLOAT, NORM, NUM_CHANS, \ - SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W) ( \ +#define MESA_ARRAY_FORMAT(BASE_FORMAT, SIZE, SIGNED, IS_FLOAT, NORM, NUM_CHANS, \ + SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W) ( \ (((SIZE >> 1) ) & MESA_ARRAY_FORMAT_TYPE_SIZE_MASK) | \ (((SIGNED) << 2 ) & MESA_ARRAY_FORMAT_TYPE_IS_SIGNED) | \ (((IS_FLOAT) << 3 ) & MESA_ARRAY_FORMAT_TYPE_IS_FLOAT) | \ @@ -138,6 +147,7 @@ (((SWIZZLE_Y) << 11) & MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK) | \ (((SWIZZLE_Z) << 14) & MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK) | \ (((SWIZZLE_W) << 17) & MESA_ARRAY_FORMAT_SWIZZLE_W_MASK) | \ + (((BASE_FORMAT) << 20) & MESA_ARRAY_FORMAT_BASE_FORMAT_MASK) | \ MESA_ARRAY_FORMAT_BIT) /** @@ -161,6 +171,13 @@ return (f & MESA_ARRAY_FORMAT_TYPE_NORMALIZED) !=0; } +static inline enum mesa_array_format_base_format +_mesa_array_format_get_base_format(mesa_array_format f) +{ + return (enum mesa_array_format_base_format) + ((f & MESA_ARRAY_FORMAT_BASE_FORMAT_MASK) >> 20); +} + static inline enum mesa_array_format_datatype _mesa_array_format_get_datatype(mesa_array_format f) { @@ -221,11 +238,10 @@ } /** - * Mesa texture/renderbuffer image formats. + * Mesa texture/renderbuffer image formats. These are just other names of the + * gallium p_format.h formats. */ -typedef enum -{ - MESA_FORMAT_NONE = 0, +typedef enum pipe_format mesa_format; /** * \name Basic hardware formats @@ -332,341 +348,280 @@ * */ - /* Packed unorm formats */ /* msb <------ TEXEL BITS -----------> lsb */ - /* ---- ---- ---- ---- ---- ---- ---- ---- */ - MESA_FORMAT_A8B8G8R8_UNORM, /* RRRR RRRR GGGG GGGG BBBB BBBB AAAA AAAA */ - MESA_FORMAT_X8B8G8R8_UNORM, /* RRRR RRRR GGGG GGGG BBBB BBBB xxxx xxxx */ - MESA_FORMAT_R8G8B8A8_UNORM, /* AAAA AAAA BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_R8G8B8X8_UNORM, /* xxxx xxxx BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_B8G8R8A8_UNORM, /* AAAA AAAA RRRR RRRR GGGG GGGG BBBB BBBB */ - MESA_FORMAT_B8G8R8X8_UNORM, /* xxxx xxxx RRRR RRRR GGGG GGGG BBBB BBBB */ - MESA_FORMAT_A8R8G8B8_UNORM, /* BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA */ - MESA_FORMAT_X8R8G8B8_UNORM, /* BBBB BBBB GGGG GGGG RRRR RRRR xxxx xxxx */ - MESA_FORMAT_L16A16_UNORM, /* AAAA AAAA AAAA AAAA LLLL LLLL LLLL LLLL */ - MESA_FORMAT_A16L16_UNORM, /* LLLL LLLL LLLL LLLL AAAA AAAA AAAA AAAA */ - MESA_FORMAT_B5G6R5_UNORM, /* RRRR RGGG GGGB BBBB */ - MESA_FORMAT_R5G6B5_UNORM, /* BBBB BGGG GGGR RRRR */ - MESA_FORMAT_B4G4R4A4_UNORM, /* AAAA RRRR GGGG BBBB */ - MESA_FORMAT_B4G4R4X4_UNORM, /* xxxx RRRR GGGG BBBB */ - MESA_FORMAT_A4R4G4B4_UNORM, /* BBBB GGGG RRRR AAAA */ - MESA_FORMAT_A1B5G5R5_UNORM, /* RRRR RGGG GGBB BBBA */ - MESA_FORMAT_X1B5G5R5_UNORM, /* BBBB BGGG GGRR RRRX */ - MESA_FORMAT_B5G5R5A1_UNORM, /* ARRR RRGG GGGB BBBB */ - MESA_FORMAT_B5G5R5X1_UNORM, /* xRRR RRGG GGGB BBBB */ - MESA_FORMAT_A1R5G5B5_UNORM, /* BBBB BGGG GGRR RRRA */ - MESA_FORMAT_L8A8_UNORM, /* AAAA AAAA LLLL LLLL */ - MESA_FORMAT_A8L8_UNORM, /* LLLL LLLL AAAA AAAA */ - MESA_FORMAT_R8G8_UNORM, /* GGGG GGGG RRRR RRRR */ - MESA_FORMAT_G8R8_UNORM, /* RRRR RRRR GGGG GGGG */ - MESA_FORMAT_L4A4_UNORM, /* AAAA LLLL */ - MESA_FORMAT_B2G3R3_UNORM, /* RRRG GGBB */ - - MESA_FORMAT_R16G16_UNORM, /* GGGG GGGG GGGG GGGG RRRR RRRR RRRR RRRR */ - MESA_FORMAT_G16R16_UNORM, /* RRRR RRRR RRRR RRRR GGGG GGGG GGGG GGGG */ - MESA_FORMAT_B10G10R10A2_UNORM,/* AARR RRRR RRRR GGGG GGGG GGBB BBBB BBBB */ - MESA_FORMAT_B10G10R10X2_UNORM,/* xxRR RRRR RRRR GGGG GGGG GGBB BBBB BBBB */ - MESA_FORMAT_R10G10B10A2_UNORM,/* AABB BBBB BBBB GGGG GGGG GGRR RRRR RRRR */ - MESA_FORMAT_R10G10B10X2_UNORM,/* xxBB BBBB BBBB GGGG GGGG GGRR RRRR RRRR */ - - MESA_FORMAT_S8_UINT_Z24_UNORM,/* ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ SSSS SSSS */ - MESA_FORMAT_X8_UINT_Z24_UNORM,/* ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ xxxx xxxx */ - MESA_FORMAT_Z24_UNORM_S8_UINT,/* SSSS SSSS ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ */ - MESA_FORMAT_Z24_UNORM_X8_UINT,/* xxxx xxxx ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ ZZZZ */ - - /* Other formats */ - MESA_FORMAT_R3G3B2_UNORM, /* BBGG GRRR */ - MESA_FORMAT_A4B4G4R4_UNORM, /* RRRR GGGG BBBB AAAA */ - MESA_FORMAT_R4G4B4A4_UNORM, /* AAAA BBBB GGGG RRRR */ - MESA_FORMAT_R5G5B5A1_UNORM, /* ABBB BBGG GGGR RRRR */ - MESA_FORMAT_A2B10G10R10_UNORM,/* RRRR RRRR RRGG GGGG GGGG BBBB BBBB BBAA */ - MESA_FORMAT_A2R10G10B10_UNORM,/* BBBB BBBB BBGG GGGG GGGG RRRR RRRR RRAA */ - - MESA_FORMAT_YCBCR, /* YYYY YYYY UorV UorV */ - MESA_FORMAT_YCBCR_REV, /* UorV UorV YYYY YYYY */ - - /* Array unorm formats */ - MESA_FORMAT_A_UNORM8, /* ubyte[i] = A */ - MESA_FORMAT_A_UNORM16, /* ushort[i] = A */ - MESA_FORMAT_L_UNORM8, /* ubyte[i] = L */ - MESA_FORMAT_L_UNORM16, /* ushort[i] = L */ - MESA_FORMAT_I_UNORM8, /* ubyte[i] = I */ - MESA_FORMAT_I_UNORM16, /* ushort[i] = I */ - MESA_FORMAT_R_UNORM8, /* ubyte[i] = R */ - MESA_FORMAT_R_UNORM16, /* ushort[i] = R */ - MESA_FORMAT_BGR_UNORM8, /* ubyte[i*3] = B, [i*3+1] = G, [i*3+2] = R */ - MESA_FORMAT_RGB_UNORM8, /* ubyte[i*3] = R, [i*3+1] = G, [i*3+2] = B */ - MESA_FORMAT_RGBA_UNORM16, /* ushort[i] = R, [1] = G, [2] = B, [3] = A */ - MESA_FORMAT_RGBX_UNORM16, - - MESA_FORMAT_Z_UNORM16, /* ushort[i] = Z */ - MESA_FORMAT_Z_UNORM32, /* uint[i] = Z */ - MESA_FORMAT_S_UINT8, /* ubyte[i] = S */ - - /* Packed signed/normalized formats */ - /* msb <------ TEXEL BITS -----------> lsb */ - /* ---- ---- ---- ---- ---- ---- ---- ---- */ - MESA_FORMAT_A8B8G8R8_SNORM, /* RRRR RRRR GGGG GGGG BBBB BBBB AAAA AAAA */ - MESA_FORMAT_X8B8G8R8_SNORM, /* RRRR RRRR GGGG GGGG BBBB BBBB xxxx xxxx */ - MESA_FORMAT_R8G8B8A8_SNORM, /* AAAA AAAA BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_R8G8B8X8_SNORM, /* xxxx xxxx BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_R16G16_SNORM, /* GGGG GGGG GGGG GGGG RRRR RRRR RRRR RRRR */ - MESA_FORMAT_G16R16_SNORM, /* RRRR RRRR RRRR RRRR GGGG GGGG GGGG GGGG */ - MESA_FORMAT_R8G8_SNORM, /* GGGG GGGG RRRR RRRR */ - MESA_FORMAT_G8R8_SNORM, /* RRRR RRRR GGGG GGGG */ - MESA_FORMAT_L8A8_SNORM, /* AAAA AAAA LLLL LLLL */ - MESA_FORMAT_A8L8_SNORM, /* LLLL LLLL AAAA AAAA */ - - /* Array signed/normalized formats */ - MESA_FORMAT_A_SNORM8, /* byte[i] = A */ - MESA_FORMAT_A_SNORM16, /* short[i] = A */ - MESA_FORMAT_L_SNORM8, /* byte[i] = L */ - MESA_FORMAT_L_SNORM16, /* short[i] = L */ - MESA_FORMAT_I_SNORM8, /* byte[i] = I */ - MESA_FORMAT_I_SNORM16, /* short[i] = I */ - MESA_FORMAT_R_SNORM8, /* byte[i] = R */ - MESA_FORMAT_R_SNORM16, /* short[i] = R */ - MESA_FORMAT_LA_SNORM16, /* short[i * 2] = L, [i * 2 + 1] = A */ - MESA_FORMAT_RGB_SNORM16, /* short[i*3] = R, [i*3+1] = G, [i*3+2] = B */ - MESA_FORMAT_RGBA_SNORM16, /* ... */ - MESA_FORMAT_RGBX_SNORM16, /* ... */ - - /* Packed sRGB formats */ - MESA_FORMAT_A8B8G8R8_SRGB, /* RRRR RRRR GGGG GGGG BBBB BBBB AAAA AAAA */ - MESA_FORMAT_B8G8R8A8_SRGB, /* AAAA AAAA RRRR RRRR GGGG GGGG BBBB BBBB */ - MESA_FORMAT_A8R8G8B8_SRGB, /* BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA */ - MESA_FORMAT_B8G8R8X8_SRGB, /* xxxx xxxx RRRR RRRR GGGG GGGG BBBB BBBB */ - MESA_FORMAT_X8R8G8B8_SRGB, /* BBBB BBBB GGGG GGGG RRRR RRRR xxxx xxxx */ - MESA_FORMAT_R8G8B8A8_SRGB, /* AAAA AAAA BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_R8G8B8X8_SRGB, /* xxxx xxxx BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_X8B8G8R8_SRGB, /* RRRR RRRR GGGG GGGG BBBB BBBB xxxx xxxx */ - MESA_FORMAT_L8A8_SRGB, /* AAAA AAAA LLLL LLLL */ - MESA_FORMAT_A8L8_SRGB, /* LLLL LLLL AAAA AAAA */ - MESA_FORMAT_R_SRGB8, /* RRRR RRRR */ - - /* Array sRGB formats */ - MESA_FORMAT_L_SRGB8, /* ubyte[i] = L */ - MESA_FORMAT_BGR_SRGB8, /* ubyte[i*3] = B, [i*3+1] = G, [i*3+2] = R */ - - /* Packed float formats */ - MESA_FORMAT_R9G9B9E5_FLOAT, - MESA_FORMAT_R11G11B10_FLOAT, /* BBBB BBBB BBGG GGGG GGGG GRRR RRRR RRRR */ - MESA_FORMAT_Z32_FLOAT_S8X24_UINT, /* (float, x24s8) */ - - /* Array float formats */ - MESA_FORMAT_A_FLOAT16, - MESA_FORMAT_A_FLOAT32, - MESA_FORMAT_L_FLOAT16, - MESA_FORMAT_L_FLOAT32, - MESA_FORMAT_LA_FLOAT16, - MESA_FORMAT_LA_FLOAT32, - MESA_FORMAT_I_FLOAT16, - MESA_FORMAT_I_FLOAT32, - MESA_FORMAT_R_FLOAT16, - MESA_FORMAT_R_FLOAT32, - MESA_FORMAT_RG_FLOAT16, - MESA_FORMAT_RG_FLOAT32, - MESA_FORMAT_RGB_FLOAT16, - MESA_FORMAT_RGB_FLOAT32, - MESA_FORMAT_RGBA_FLOAT16, - MESA_FORMAT_RGBA_FLOAT32, /* float[0] = R, [1] = G, [2] = B, [3] = A */ - MESA_FORMAT_RGBX_FLOAT16, - MESA_FORMAT_RGBX_FLOAT32, - MESA_FORMAT_Z_FLOAT32, - - /* Packed signed/unsigned non-normalized integer formats */ - - MESA_FORMAT_A8B8G8R8_UINT, /* RRRR RRRR GGGG GGGG BBBB BBBB AAAA AAAA */ - MESA_FORMAT_A8R8G8B8_UINT, /* BBBB BBBB GGGG GGGG RRRR RRRR AAAA AAAA */ - MESA_FORMAT_R8G8B8A8_UINT, /* AAAA AAAA BBBB BBBB GGGG GGGG RRRR RRRR */ - MESA_FORMAT_B8G8R8A8_UINT, /* AAAA AAAA RRRR RRRR GGGG GGGG BBBB BBBB */ - MESA_FORMAT_B10G10R10A2_UINT, /* AARR RRRR RRRR GGGG GGGG GGBB BBBB BBBB */ - MESA_FORMAT_R10G10B10A2_UINT, /* AABB BBBB BBBB GGGG GGGG GGRR RRRR RRRR */ - MESA_FORMAT_A2B10G10R10_UINT, /* RRRR RRRR RRGG GGGG GGGG BBBB BBBB BBAA */ - MESA_FORMAT_A2R10G10B10_UINT, /* BBBB BBBB BBGG GGGG GGGG RRRR RRRR RRAA */ - MESA_FORMAT_B5G6R5_UINT, /* RRRR RGGG GGGB BBBB */ - MESA_FORMAT_R5G6B5_UINT, /* BBBB BGGG GGGR RRRR */ - MESA_FORMAT_B2G3R3_UINT, /* RRRG GGBB */ - MESA_FORMAT_R3G3B2_UINT, /* BBGG GRRR */ - MESA_FORMAT_A4B4G4R4_UINT, /* RRRR GGGG BBBB AAAA */ - MESA_FORMAT_R4G4B4A4_UINT, /* AAAA BBBB GGGG RRRR */ - MESA_FORMAT_B4G4R4A4_UINT, /* AAAA RRRR GGGG BBBB */ - MESA_FORMAT_A4R4G4B4_UINT, /* BBBB GGGG RRRR AAAA */ - MESA_FORMAT_A1B5G5R5_UINT, /* RRRR RGGG GGBB BBBA */ - MESA_FORMAT_B5G5R5A1_UINT, /* ARRR RRGG GGGB BBBB */ - MESA_FORMAT_A1R5G5B5_UINT, /* BBBB BGGG GGRR RRRA */ - MESA_FORMAT_R5G5B5A1_UINT, /* ABBB BBGG GGGR RRRR */ - - /* Array signed/unsigned non-normalized integer formats */ - MESA_FORMAT_A_UINT8, - MESA_FORMAT_A_UINT16, - MESA_FORMAT_A_UINT32, - MESA_FORMAT_A_SINT8, - MESA_FORMAT_A_SINT16, - MESA_FORMAT_A_SINT32, - - MESA_FORMAT_I_UINT8, - MESA_FORMAT_I_UINT16, - MESA_FORMAT_I_UINT32, - MESA_FORMAT_I_SINT8, - MESA_FORMAT_I_SINT16, - MESA_FORMAT_I_SINT32, - - MESA_FORMAT_L_UINT8, - MESA_FORMAT_L_UINT16, - MESA_FORMAT_L_UINT32, - MESA_FORMAT_L_SINT8, - MESA_FORMAT_L_SINT16, - MESA_FORMAT_L_SINT32, - - MESA_FORMAT_LA_UINT8, - MESA_FORMAT_LA_UINT16, - MESA_FORMAT_LA_UINT32, - MESA_FORMAT_LA_SINT8, - MESA_FORMAT_LA_SINT16, - MESA_FORMAT_LA_SINT32, - - MESA_FORMAT_R_UINT8, - MESA_FORMAT_R_UINT16, - MESA_FORMAT_R_UINT32, - MESA_FORMAT_R_SINT8, - MESA_FORMAT_R_SINT16, - MESA_FORMAT_R_SINT32, - - MESA_FORMAT_RG_UINT8, - MESA_FORMAT_RG_UINT16, - MESA_FORMAT_RG_UINT32, - MESA_FORMAT_RG_SINT8, - MESA_FORMAT_RG_SINT16, - MESA_FORMAT_RG_SINT32, - - MESA_FORMAT_RGB_UINT8, - MESA_FORMAT_RGB_UINT16, - MESA_FORMAT_RGB_UINT32, - MESA_FORMAT_RGB_SINT8, - MESA_FORMAT_RGB_SINT16, - MESA_FORMAT_RGB_SINT32, - - MESA_FORMAT_RGBA_UINT8, - MESA_FORMAT_RGBA_UINT16, - MESA_FORMAT_RGBA_UINT32, - MESA_FORMAT_RGBA_SINT8, - MESA_FORMAT_RGBA_SINT16, - MESA_FORMAT_RGBA_SINT32, - - MESA_FORMAT_RGBX_UINT8, - MESA_FORMAT_RGBX_UINT16, - MESA_FORMAT_RGBX_UINT32, - MESA_FORMAT_RGBX_SINT8, - MESA_FORMAT_RGBX_SINT16, - MESA_FORMAT_RGBX_SINT32, - - /* DXT compressed formats */ - MESA_FORMAT_RGB_DXT1, - MESA_FORMAT_RGBA_DXT1, - MESA_FORMAT_RGBA_DXT3, - MESA_FORMAT_RGBA_DXT5, - - /* DXT sRGB compressed formats */ - MESA_FORMAT_SRGB_DXT1, - MESA_FORMAT_SRGBA_DXT1, - MESA_FORMAT_SRGBA_DXT3, - MESA_FORMAT_SRGBA_DXT5, - - /* FXT1 compressed formats */ - MESA_FORMAT_RGB_FXT1, - MESA_FORMAT_RGBA_FXT1, - - /* RGTC compressed formats */ - MESA_FORMAT_R_RGTC1_UNORM, - MESA_FORMAT_R_RGTC1_SNORM, - MESA_FORMAT_RG_RGTC2_UNORM, - MESA_FORMAT_RG_RGTC2_SNORM, - - /* LATC1/2 compressed formats */ - MESA_FORMAT_L_LATC1_UNORM, - MESA_FORMAT_L_LATC1_SNORM, - MESA_FORMAT_LA_LATC2_UNORM, - MESA_FORMAT_LA_LATC2_SNORM, - - /* ETC1/2 compressed formats */ - MESA_FORMAT_ETC1_RGB8, - MESA_FORMAT_ETC2_RGB8, - MESA_FORMAT_ETC2_SRGB8, - MESA_FORMAT_ETC2_RGBA8_EAC, - MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC, - MESA_FORMAT_ETC2_R11_EAC, - MESA_FORMAT_ETC2_RG11_EAC, - MESA_FORMAT_ETC2_SIGNED_R11_EAC, - MESA_FORMAT_ETC2_SIGNED_RG11_EAC, - MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1, - MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1, - - /* BPTC compressed formats */ - MESA_FORMAT_BPTC_RGBA_UNORM, - MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM, - MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT, - MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT, - - /* ASTC compressed formats */ - MESA_FORMAT_RGBA_ASTC_4x4, - MESA_FORMAT_RGBA_ASTC_5x4, - MESA_FORMAT_RGBA_ASTC_5x5, - MESA_FORMAT_RGBA_ASTC_6x5, - MESA_FORMAT_RGBA_ASTC_6x6, - MESA_FORMAT_RGBA_ASTC_8x5, - MESA_FORMAT_RGBA_ASTC_8x6, - MESA_FORMAT_RGBA_ASTC_8x8, - MESA_FORMAT_RGBA_ASTC_10x5, - MESA_FORMAT_RGBA_ASTC_10x6, - MESA_FORMAT_RGBA_ASTC_10x8, - MESA_FORMAT_RGBA_ASTC_10x10, - MESA_FORMAT_RGBA_ASTC_12x10, - MESA_FORMAT_RGBA_ASTC_12x12, - - MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12, - - MESA_FORMAT_RGBA_ASTC_3x3x3, - MESA_FORMAT_RGBA_ASTC_4x3x3, - MESA_FORMAT_RGBA_ASTC_4x4x3, - MESA_FORMAT_RGBA_ASTC_4x4x4, - MESA_FORMAT_RGBA_ASTC_5x4x4, - MESA_FORMAT_RGBA_ASTC_5x5x4, - MESA_FORMAT_RGBA_ASTC_5x5x5, - MESA_FORMAT_RGBA_ASTC_6x5x5, - MESA_FORMAT_RGBA_ASTC_6x6x5, - MESA_FORMAT_RGBA_ASTC_6x6x6, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_3x3x3, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x3x3, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4x3, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4x4, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4x4, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5x4, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6x5, - MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6x6, - - /* ATC compressed formats */ - MESA_FORMAT_ATC_RGB, - MESA_FORMAT_ATC_RGBA_EXPLICIT, - MESA_FORMAT_ATC_RGBA_INTERPOLATED, - - MESA_FORMAT_COUNT -} mesa_format; - +#define MESA_FORMAT_NONE PIPE_FORMAT_NONE +#define MESA_FORMAT_A8B8G8R8_UNORM PIPE_FORMAT_ABGR8888_UNORM +#define MESA_FORMAT_X8B8G8R8_UNORM PIPE_FORMAT_XBGR8888_UNORM +#define MESA_FORMAT_R8G8B8A8_UNORM PIPE_FORMAT_RGBA8888_UNORM +#define MESA_FORMAT_R8G8B8X8_UNORM PIPE_FORMAT_RGBX8888_UNORM +#define MESA_FORMAT_B8G8R8A8_UNORM PIPE_FORMAT_BGRA8888_UNORM +#define MESA_FORMAT_B8G8R8X8_UNORM PIPE_FORMAT_BGRX8888_UNORM +#define MESA_FORMAT_A8R8G8B8_UNORM PIPE_FORMAT_ARGB8888_UNORM +#define MESA_FORMAT_X8R8G8B8_UNORM PIPE_FORMAT_XRGB8888_UNORM +#define MESA_FORMAT_B5G6R5_UNORM PIPE_FORMAT_B5G6R5_UNORM +#define MESA_FORMAT_R5G6B5_UNORM PIPE_FORMAT_R5G6B5_UNORM +#define MESA_FORMAT_B4G4R4A4_UNORM PIPE_FORMAT_B4G4R4A4_UNORM +#define MESA_FORMAT_B4G4R4X4_UNORM PIPE_FORMAT_B4G4R4X4_UNORM +#define MESA_FORMAT_A4R4G4B4_UNORM PIPE_FORMAT_A4R4G4B4_UNORM +#define MESA_FORMAT_A1B5G5R5_UNORM PIPE_FORMAT_A1B5G5R5_UNORM +#define MESA_FORMAT_X1B5G5R5_UNORM PIPE_FORMAT_X1B5G5R5_UNORM +#define MESA_FORMAT_B5G5R5A1_UNORM PIPE_FORMAT_B5G5R5A1_UNORM +#define MESA_FORMAT_B5G5R5X1_UNORM PIPE_FORMAT_B5G5R5X1_UNORM +#define MESA_FORMAT_A1R5G5B5_UNORM PIPE_FORMAT_A1R5G5B5_UNORM +#define MESA_FORMAT_L4A4_UNORM PIPE_FORMAT_L4A4_UNORM +#define MESA_FORMAT_B2G3R3_UNORM PIPE_FORMAT_B2G3R3_UNORM +#define MESA_FORMAT_B10G10R10A2_UNORM PIPE_FORMAT_B10G10R10A2_UNORM +#define MESA_FORMAT_B10G10R10X2_UNORM PIPE_FORMAT_B10G10R10X2_UNORM +#define MESA_FORMAT_R10G10B10A2_UNORM PIPE_FORMAT_R10G10B10A2_UNORM +#define MESA_FORMAT_R10G10B10X2_UNORM PIPE_FORMAT_R10G10B10X2_UNORM +#define MESA_FORMAT_S8_UINT_Z24_UNORM PIPE_FORMAT_S8_UINT_Z24_UNORM +#define MESA_FORMAT_X8_UINT_Z24_UNORM PIPE_FORMAT_X8Z24_UNORM +#define MESA_FORMAT_Z24_UNORM_S8_UINT PIPE_FORMAT_Z24_UNORM_S8_UINT +#define MESA_FORMAT_Z24_UNORM_X8_UINT PIPE_FORMAT_Z24X8_UNORM +#define MESA_FORMAT_R3G3B2_UNORM PIPE_FORMAT_R3G3B2_UNORM +#define MESA_FORMAT_A4B4G4R4_UNORM PIPE_FORMAT_A4B4G4R4_UNORM +#define MESA_FORMAT_R4G4B4A4_UNORM PIPE_FORMAT_R4G4B4A4_UNORM +#define MESA_FORMAT_R5G5B5A1_UNORM PIPE_FORMAT_R5G5B5A1_UNORM +#define MESA_FORMAT_A2B10G10R10_UNORM PIPE_FORMAT_A2B10G10R10_UNORM +#define MESA_FORMAT_A2R10G10B10_UNORM PIPE_FORMAT_A2R10G10B10_UNORM +#define MESA_FORMAT_YCBCR PIPE_FORMAT_UYVY +#define MESA_FORMAT_YCBCR_REV PIPE_FORMAT_YUYV +#define MESA_FORMAT_A_UNORM8 PIPE_FORMAT_A8_UNORM +#define MESA_FORMAT_A_UNORM16 PIPE_FORMAT_A16_UNORM +#define MESA_FORMAT_L_UNORM8 PIPE_FORMAT_L8_UNORM +#define MESA_FORMAT_L_UNORM16 PIPE_FORMAT_L16_UNORM +#define MESA_FORMAT_LA_UNORM8 PIPE_FORMAT_L8A8_UNORM +#define MESA_FORMAT_LA_UNORM16 PIPE_FORMAT_L16A16_UNORM +#define MESA_FORMAT_I_UNORM8 PIPE_FORMAT_I8_UNORM +#define MESA_FORMAT_I_UNORM16 PIPE_FORMAT_I16_UNORM +#define MESA_FORMAT_R_UNORM8 PIPE_FORMAT_R8_UNORM +#define MESA_FORMAT_R_UNORM16 PIPE_FORMAT_R16_UNORM +#define MESA_FORMAT_RG_UNORM8 PIPE_FORMAT_R8G8_UNORM +#define MESA_FORMAT_RG_UNORM16 PIPE_FORMAT_R16G16_UNORM +#define MESA_FORMAT_BGR_UNORM8 PIPE_FORMAT_B8G8R8_UNORM +#define MESA_FORMAT_RGB_UNORM8 PIPE_FORMAT_R8G8B8_UNORM +#define MESA_FORMAT_RGBA_UNORM16 PIPE_FORMAT_R16G16B16A16_UNORM +#define MESA_FORMAT_RGBX_UNORM16 PIPE_FORMAT_R16G16B16X16_UNORM +#define MESA_FORMAT_Z_UNORM16 PIPE_FORMAT_Z16_UNORM +#define MESA_FORMAT_Z_UNORM32 PIPE_FORMAT_Z32_UNORM +#define MESA_FORMAT_S_UINT8 PIPE_FORMAT_S8_UINT +#define MESA_FORMAT_A8B8G8R8_SNORM PIPE_FORMAT_ABGR8888_SNORM +#define MESA_FORMAT_X8B8G8R8_SNORM PIPE_FORMAT_XBGR8888_SNORM +#define MESA_FORMAT_R8G8B8A8_SNORM PIPE_FORMAT_RGBA8888_SNORM +#define MESA_FORMAT_R8G8B8X8_SNORM PIPE_FORMAT_RGBX8888_SNORM +#define MESA_FORMAT_A_SNORM8 PIPE_FORMAT_A8_SNORM +#define MESA_FORMAT_A_SNORM16 PIPE_FORMAT_A16_SNORM +#define MESA_FORMAT_L_SNORM8 PIPE_FORMAT_L8_SNORM +#define MESA_FORMAT_L_SNORM16 PIPE_FORMAT_L16_SNORM +#define MESA_FORMAT_I_SNORM8 PIPE_FORMAT_I8_SNORM +#define MESA_FORMAT_I_SNORM16 PIPE_FORMAT_I16_SNORM +#define MESA_FORMAT_R_SNORM8 PIPE_FORMAT_R8_SNORM +#define MESA_FORMAT_R_SNORM16 PIPE_FORMAT_R16_SNORM +#define MESA_FORMAT_LA_SNORM8 PIPE_FORMAT_L8A8_SNORM +#define MESA_FORMAT_LA_SNORM16 PIPE_FORMAT_L16A16_SNORM +#define MESA_FORMAT_RG_SNORM8 PIPE_FORMAT_R8G8_SNORM +#define MESA_FORMAT_RG_SNORM16 PIPE_FORMAT_R16G16_SNORM +#define MESA_FORMAT_RGB_SNORM16 PIPE_FORMAT_R16G16B16_SNORM +#define MESA_FORMAT_RGBA_SNORM16 PIPE_FORMAT_R16G16B16A16_SNORM +#define MESA_FORMAT_RGBX_SNORM16 PIPE_FORMAT_R16G16B16X16_SNORM +#define MESA_FORMAT_A8B8G8R8_SRGB PIPE_FORMAT_ABGR8888_SRGB +#define MESA_FORMAT_B8G8R8A8_SRGB PIPE_FORMAT_BGRA8888_SRGB +#define MESA_FORMAT_A8R8G8B8_SRGB PIPE_FORMAT_ARGB8888_SRGB +#define MESA_FORMAT_B8G8R8X8_SRGB PIPE_FORMAT_BGRX8888_SRGB +#define MESA_FORMAT_X8R8G8B8_SRGB PIPE_FORMAT_XRGB8888_SRGB +#define MESA_FORMAT_R8G8B8A8_SRGB PIPE_FORMAT_RGBA8888_SRGB +#define MESA_FORMAT_R8G8B8X8_SRGB PIPE_FORMAT_RGBX8888_SRGB +#define MESA_FORMAT_X8B8G8R8_SRGB PIPE_FORMAT_XBGR8888_SRGB +#define MESA_FORMAT_R_SRGB8 PIPE_FORMAT_R8_SRGB +#define MESA_FORMAT_L_SRGB8 PIPE_FORMAT_L8_SRGB +#define MESA_FORMAT_LA_SRGB8 PIPE_FORMAT_L8A8_SRGB +#define MESA_FORMAT_BGR_SRGB8 PIPE_FORMAT_R8G8B8_SRGB +#define MESA_FORMAT_R9G9B9E5_FLOAT PIPE_FORMAT_R9G9B9E5_FLOAT +#define MESA_FORMAT_R11G11B10_FLOAT PIPE_FORMAT_R11G11B10_FLOAT +#define MESA_FORMAT_Z32_FLOAT_S8X24_UINT PIPE_FORMAT_Z32_FLOAT_S8X24_UINT +#define MESA_FORMAT_A_FLOAT16 PIPE_FORMAT_A16_FLOAT +#define MESA_FORMAT_A_FLOAT32 PIPE_FORMAT_A32_FLOAT +#define MESA_FORMAT_L_FLOAT16 PIPE_FORMAT_L16_FLOAT +#define MESA_FORMAT_L_FLOAT32 PIPE_FORMAT_L32_FLOAT +#define MESA_FORMAT_LA_FLOAT16 PIPE_FORMAT_L16A16_FLOAT +#define MESA_FORMAT_LA_FLOAT32 PIPE_FORMAT_L32A32_FLOAT +#define MESA_FORMAT_I_FLOAT16 PIPE_FORMAT_I16_FLOAT +#define MESA_FORMAT_I_FLOAT32 PIPE_FORMAT_I32_FLOAT +#define MESA_FORMAT_R_FLOAT16 PIPE_FORMAT_R16_FLOAT +#define MESA_FORMAT_R_FLOAT32 PIPE_FORMAT_R32_FLOAT +#define MESA_FORMAT_RG_FLOAT16 PIPE_FORMAT_R16G16_FLOAT +#define MESA_FORMAT_RG_FLOAT32 PIPE_FORMAT_R32G32_FLOAT +#define MESA_FORMAT_RGB_FLOAT16 PIPE_FORMAT_R16G16B16_FLOAT +#define MESA_FORMAT_RGB_FLOAT32 PIPE_FORMAT_R32G32B32_FLOAT +#define MESA_FORMAT_RGBA_FLOAT16 PIPE_FORMAT_R16G16B16A16_FLOAT +#define MESA_FORMAT_RGBA_FLOAT32 PIPE_FORMAT_R32G32B32A32_FLOAT +#define MESA_FORMAT_RGBX_FLOAT16 PIPE_FORMAT_R16G16B16X16_FLOAT +#define MESA_FORMAT_RGBX_FLOAT32 PIPE_FORMAT_R32G32B32X32_FLOAT +#define MESA_FORMAT_Z_FLOAT32 PIPE_FORMAT_Z32_FLOAT +#define MESA_FORMAT_A8B8G8R8_UINT PIPE_FORMAT_ABGR8888_UINT +#define MESA_FORMAT_A8R8G8B8_UINT PIPE_FORMAT_ARGB8888_UINT +#define MESA_FORMAT_R8G8B8A8_UINT PIPE_FORMAT_RGBA8888_UINT +#define MESA_FORMAT_B8G8R8A8_UINT PIPE_FORMAT_BGRA8888_UINT +#define MESA_FORMAT_B10G10R10A2_UINT PIPE_FORMAT_B10G10R10A2_UINT +#define MESA_FORMAT_R10G10B10A2_UINT PIPE_FORMAT_R10G10B10A2_UINT +#define MESA_FORMAT_A2B10G10R10_UINT PIPE_FORMAT_A2B10G10R10_UINT +#define MESA_FORMAT_A2R10G10B10_UINT PIPE_FORMAT_A2R10G10B10_UINT +#define MESA_FORMAT_B5G6R5_UINT PIPE_FORMAT_B5G6R5_UINT +#define MESA_FORMAT_R5G6B5_UINT PIPE_FORMAT_R5G6B5_UINT +#define MESA_FORMAT_B2G3R3_UINT PIPE_FORMAT_B2G3R3_UINT +#define MESA_FORMAT_R3G3B2_UINT PIPE_FORMAT_R3G3B2_UINT +#define MESA_FORMAT_A4B4G4R4_UINT PIPE_FORMAT_A4B4G4R4_UINT +#define MESA_FORMAT_R4G4B4A4_UINT PIPE_FORMAT_R4G4B4A4_UINT +#define MESA_FORMAT_B4G4R4A4_UINT PIPE_FORMAT_B4G4R4A4_UINT +#define MESA_FORMAT_A4R4G4B4_UINT PIPE_FORMAT_A4R4G4B4_UINT +#define MESA_FORMAT_A1B5G5R5_UINT PIPE_FORMAT_A1B5G5R5_UINT +#define MESA_FORMAT_B5G5R5A1_UINT PIPE_FORMAT_B5G5R5A1_UINT +#define MESA_FORMAT_A1R5G5B5_UINT PIPE_FORMAT_A1R5G5B5_UINT +#define MESA_FORMAT_R5G5B5A1_UINT PIPE_FORMAT_R5G5B5A1_UINT +#define MESA_FORMAT_A_UINT8 PIPE_FORMAT_A8_UINT +#define MESA_FORMAT_A_UINT16 PIPE_FORMAT_A16_UINT +#define MESA_FORMAT_A_UINT32 PIPE_FORMAT_A32_UINT +#define MESA_FORMAT_A_SINT8 PIPE_FORMAT_A8_SINT +#define MESA_FORMAT_A_SINT16 PIPE_FORMAT_A16_SINT +#define MESA_FORMAT_A_SINT32 PIPE_FORMAT_A32_SINT +#define MESA_FORMAT_I_UINT8 PIPE_FORMAT_I8_UINT +#define MESA_FORMAT_I_UINT16 PIPE_FORMAT_I16_UINT +#define MESA_FORMAT_I_UINT32 PIPE_FORMAT_I32_UINT +#define MESA_FORMAT_I_SINT8 PIPE_FORMAT_I8_SINT +#define MESA_FORMAT_I_SINT16 PIPE_FORMAT_I16_SINT +#define MESA_FORMAT_I_SINT32 PIPE_FORMAT_I32_SINT +#define MESA_FORMAT_L_UINT8 PIPE_FORMAT_L8_UINT +#define MESA_FORMAT_L_UINT16 PIPE_FORMAT_L16_UINT +#define MESA_FORMAT_L_UINT32 PIPE_FORMAT_L32_UINT +#define MESA_FORMAT_L_SINT8 PIPE_FORMAT_L8_SINT +#define MESA_FORMAT_L_SINT16 PIPE_FORMAT_L16_SINT +#define MESA_FORMAT_L_SINT32 PIPE_FORMAT_L32_SINT +#define MESA_FORMAT_LA_UINT8 PIPE_FORMAT_L8A8_UINT +#define MESA_FORMAT_LA_UINT16 PIPE_FORMAT_L16A16_UINT +#define MESA_FORMAT_LA_UINT32 PIPE_FORMAT_L32A32_UINT +#define MESA_FORMAT_LA_SINT8 PIPE_FORMAT_L8A8_SINT +#define MESA_FORMAT_LA_SINT16 PIPE_FORMAT_L16A16_SINT +#define MESA_FORMAT_LA_SINT32 PIPE_FORMAT_L32A32_SINT +#define MESA_FORMAT_R_UINT8 PIPE_FORMAT_R8_UINT +#define MESA_FORMAT_R_UINT16 PIPE_FORMAT_R16_UINT +#define MESA_FORMAT_R_UINT32 PIPE_FORMAT_R32_UINT +#define MESA_FORMAT_R_SINT8 PIPE_FORMAT_R8_SINT +#define MESA_FORMAT_R_SINT16 PIPE_FORMAT_R16_SINT +#define MESA_FORMAT_R_SINT32 PIPE_FORMAT_R32_SINT +#define MESA_FORMAT_RG_UINT8 PIPE_FORMAT_R8G8_UINT +#define MESA_FORMAT_RG_UINT16 PIPE_FORMAT_R16G16_UINT +#define MESA_FORMAT_RG_UINT32 PIPE_FORMAT_R32G32_UINT +#define MESA_FORMAT_RG_SINT8 PIPE_FORMAT_R8G8_SINT +#define MESA_FORMAT_RG_SINT16 PIPE_FORMAT_R16G16_SINT +#define MESA_FORMAT_RG_SINT32 PIPE_FORMAT_R32G32_SINT +#define MESA_FORMAT_RGB_UINT8 PIPE_FORMAT_R8G8B8_UINT +#define MESA_FORMAT_RGB_UINT16 PIPE_FORMAT_R16G16B16_UINT +#define MESA_FORMAT_RGB_UINT32 PIPE_FORMAT_R32G32B32_UINT +#define MESA_FORMAT_RGB_SINT8 PIPE_FORMAT_R8G8B8_SINT +#define MESA_FORMAT_RGB_SINT16 PIPE_FORMAT_R16G16B16_SINT +#define MESA_FORMAT_RGB_SINT32 PIPE_FORMAT_R32G32B32_SINT +#define MESA_FORMAT_RGBA_UINT16 PIPE_FORMAT_R16G16B16A16_UINT +#define MESA_FORMAT_RGBA_UINT32 PIPE_FORMAT_R32G32B32A32_UINT +#define MESA_FORMAT_RGBA_SINT8 PIPE_FORMAT_R8G8B8A8_SINT +#define MESA_FORMAT_RGBA_SINT16 PIPE_FORMAT_R16G16B16A16_SINT +#define MESA_FORMAT_RGBA_SINT32 PIPE_FORMAT_R32G32B32A32_SINT +#define MESA_FORMAT_RGBX_UINT8 PIPE_FORMAT_R8G8B8X8_UINT +#define MESA_FORMAT_RGBX_UINT16 PIPE_FORMAT_R16G16B16X16_UINT +#define MESA_FORMAT_RGBX_UINT32 PIPE_FORMAT_R32G32B32X32_UINT +#define MESA_FORMAT_RGBX_SINT8 PIPE_FORMAT_R8G8B8X8_SINT +#define MESA_FORMAT_RGBX_SINT16 PIPE_FORMAT_R16G16B16X16_SINT +#define MESA_FORMAT_RGBX_SINT32 PIPE_FORMAT_R32G32B32X32_SINT +#define MESA_FORMAT_RGB_DXT1 PIPE_FORMAT_DXT1_RGB +#define MESA_FORMAT_RGBA_DXT1 PIPE_FORMAT_DXT1_RGBA +#define MESA_FORMAT_RGBA_DXT3 PIPE_FORMAT_DXT3_RGBA +#define MESA_FORMAT_RGBA_DXT5 PIPE_FORMAT_DXT5_RGBA +#define MESA_FORMAT_SRGB_DXT1 PIPE_FORMAT_DXT1_SRGB +#define MESA_FORMAT_SRGBA_DXT1 PIPE_FORMAT_DXT1_SRGBA +#define MESA_FORMAT_SRGBA_DXT3 PIPE_FORMAT_DXT3_SRGBA +#define MESA_FORMAT_SRGBA_DXT5 PIPE_FORMAT_DXT5_SRGBA +#define MESA_FORMAT_RGB_FXT1 PIPE_FORMAT_FXT1_RGB +#define MESA_FORMAT_RGBA_FXT1 PIPE_FORMAT_FXT1_RGBA +#define MESA_FORMAT_R_RGTC1_UNORM PIPE_FORMAT_RGTC1_UNORM +#define MESA_FORMAT_R_RGTC1_SNORM PIPE_FORMAT_RGTC1_SNORM +#define MESA_FORMAT_RG_RGTC2_UNORM PIPE_FORMAT_RGTC2_UNORM +#define MESA_FORMAT_RG_RGTC2_SNORM PIPE_FORMAT_RGTC2_SNORM +#define MESA_FORMAT_L_LATC1_UNORM PIPE_FORMAT_LATC1_UNORM +#define MESA_FORMAT_L_LATC1_SNORM PIPE_FORMAT_LATC1_SNORM +#define MESA_FORMAT_LA_LATC2_UNORM PIPE_FORMAT_LATC2_UNORM +#define MESA_FORMAT_LA_LATC2_SNORM PIPE_FORMAT_LATC2_SNORM +#define MESA_FORMAT_ETC1_RGB8 PIPE_FORMAT_ETC1_RGB8 +#define MESA_FORMAT_ETC2_RGB8 PIPE_FORMAT_ETC2_RGB8 +#define MESA_FORMAT_ETC2_SRGB8 PIPE_FORMAT_ETC2_SRGB8 +#define MESA_FORMAT_ETC2_RGBA8_EAC PIPE_FORMAT_ETC2_RGBA8 +#define MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC PIPE_FORMAT_ETC2_SRGBA8 +#define MESA_FORMAT_ETC2_R11_EAC PIPE_FORMAT_ETC2_R11_UNORM +#define MESA_FORMAT_ETC2_RG11_EAC PIPE_FORMAT_ETC2_RG11_UNORM +#define MESA_FORMAT_ETC2_SIGNED_R11_EAC PIPE_FORMAT_ETC2_R11_SNORM +#define MESA_FORMAT_ETC2_SIGNED_RG11_EAC PIPE_FORMAT_ETC2_RG11_SNORM +#define MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1 PIPE_FORMAT_ETC2_RGB8A1 +#define MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1 PIPE_FORMAT_ETC2_SRGB8A1 +#define MESA_FORMAT_BPTC_RGBA_UNORM PIPE_FORMAT_BPTC_RGBA_UNORM +#define MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM PIPE_FORMAT_BPTC_SRGBA +#define MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT PIPE_FORMAT_BPTC_RGB_FLOAT +#define MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT PIPE_FORMAT_BPTC_RGB_UFLOAT +#define MESA_FORMAT_RGBA_ASTC_4x4 PIPE_FORMAT_ASTC_4x4 +#define MESA_FORMAT_RGBA_ASTC_5x4 PIPE_FORMAT_ASTC_5x4 +#define MESA_FORMAT_RGBA_ASTC_5x5 PIPE_FORMAT_ASTC_5x5 +#define MESA_FORMAT_RGBA_ASTC_6x5 PIPE_FORMAT_ASTC_6x5 +#define MESA_FORMAT_RGBA_ASTC_6x6 PIPE_FORMAT_ASTC_6x6 +#define MESA_FORMAT_RGBA_ASTC_8x5 PIPE_FORMAT_ASTC_8x5 +#define MESA_FORMAT_RGBA_ASTC_8x6 PIPE_FORMAT_ASTC_8x6 +#define MESA_FORMAT_RGBA_ASTC_8x8 PIPE_FORMAT_ASTC_8x8 +#define MESA_FORMAT_RGBA_ASTC_10x5 PIPE_FORMAT_ASTC_10x5 +#define MESA_FORMAT_RGBA_ASTC_10x6 PIPE_FORMAT_ASTC_10x6 +#define MESA_FORMAT_RGBA_ASTC_10x8 PIPE_FORMAT_ASTC_10x8 +#define MESA_FORMAT_RGBA_ASTC_10x10 PIPE_FORMAT_ASTC_10x10 +#define MESA_FORMAT_RGBA_ASTC_12x10 PIPE_FORMAT_ASTC_12x10 +#define MESA_FORMAT_RGBA_ASTC_12x12 PIPE_FORMAT_ASTC_12x12 +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4 PIPE_FORMAT_ASTC_4x4_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4 PIPE_FORMAT_ASTC_5x4_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5 PIPE_FORMAT_ASTC_5x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5 PIPE_FORMAT_ASTC_6x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6 PIPE_FORMAT_ASTC_6x6_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5 PIPE_FORMAT_ASTC_8x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6 PIPE_FORMAT_ASTC_8x6_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8 PIPE_FORMAT_ASTC_8x8_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5 PIPE_FORMAT_ASTC_10x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6 PIPE_FORMAT_ASTC_10x6_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8 PIPE_FORMAT_ASTC_10x8_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10 PIPE_FORMAT_ASTC_10x10_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10 PIPE_FORMAT_ASTC_12x10_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12 PIPE_FORMAT_ASTC_12x12_SRGB +#define MESA_FORMAT_RGBA_ASTC_3x3x3 PIPE_FORMAT_ASTC_3x3x3 +#define MESA_FORMAT_RGBA_ASTC_4x3x3 PIPE_FORMAT_ASTC_4x3x3 +#define MESA_FORMAT_RGBA_ASTC_4x4x3 PIPE_FORMAT_ASTC_4x4x3 +#define MESA_FORMAT_RGBA_ASTC_4x4x4 PIPE_FORMAT_ASTC_4x4x4 +#define MESA_FORMAT_RGBA_ASTC_5x4x4 PIPE_FORMAT_ASTC_5x4x4 +#define MESA_FORMAT_RGBA_ASTC_5x5x4 PIPE_FORMAT_ASTC_5x5x4 +#define MESA_FORMAT_RGBA_ASTC_5x5x5 PIPE_FORMAT_ASTC_5x5x5 +#define MESA_FORMAT_RGBA_ASTC_6x5x5 PIPE_FORMAT_ASTC_6x5x5 +#define MESA_FORMAT_RGBA_ASTC_6x6x5 PIPE_FORMAT_ASTC_6x6x5 +#define MESA_FORMAT_RGBA_ASTC_6x6x6 PIPE_FORMAT_ASTC_6x6x6 +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_3x3x3 PIPE_FORMAT_ASTC_3x3x3_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x3x3 PIPE_FORMAT_ASTC_4x3x3_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4x3 PIPE_FORMAT_ASTC_4x4x3_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4x4 PIPE_FORMAT_ASTC_4x4x4_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4x4 PIPE_FORMAT_ASTC_5x4x4_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5x4 PIPE_FORMAT_ASTC_5x5x4_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5x5 PIPE_FORMAT_ASTC_5x5x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5x5 PIPE_FORMAT_ASTC_6x5x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6x5 PIPE_FORMAT_ASTC_6x6x5_SRGB +#define MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6x6 PIPE_FORMAT_ASTC_6x6x6_SRGB +#define MESA_FORMAT_ATC_RGB PIPE_FORMAT_ATC_RGB +#define MESA_FORMAT_ATC_RGBA_EXPLICIT PIPE_FORMAT_ATC_RGBA_EXPLICIT +#define MESA_FORMAT_ATC_RGBA_INTERPOLATED PIPE_FORMAT_ATC_RGBA_INTERPOLATED +#define MESA_FORMAT_COUNT PIPE_FORMAT_COUNT + +/* Packed to array format adapters */ +#if UTIL_ARCH_LITTLE_ENDIAN +#define MESA_FORMAT_RGBA_UINT8 MESA_FORMAT_R8G8B8A8_UINT +#else +#define MESA_FORMAT_RGBA_UINT8 MESA_FORMAT_A8B8G8R8_UINT +#endif extern const char * _mesa_get_format_name(mesa_format format); @@ -764,6 +719,9 @@ _mesa_get_linear_format_srgb(mesa_format format); extern mesa_format +_mesa_get_intensity_format_red(mesa_format format); + +extern mesa_format _mesa_get_uncompressed_format(mesa_format format); extern unsigned int diff -Nru mesa-19.2.8/src/mesa/main/format_utils.c mesa-20.0.8/src/mesa/main/format_utils.c --- mesa-19.2.8/src/mesa/main/format_utils.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/format_utils.c 2020-06-12 01:21:18.000000000 +0000 @@ -29,16 +29,20 @@ #include "format_unpack.h" const mesa_array_format RGBA32_FLOAT = - MESA_ARRAY_FORMAT(4, 1, 1, 1, 4, 0, 1, 2, 3); + MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS, + 4, 1, 1, 1, 4, 0, 1, 2, 3); const mesa_array_format RGBA8_UBYTE = - MESA_ARRAY_FORMAT(1, 0, 0, 1, 4, 0, 1, 2, 3); + MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS, + 1, 0, 0, 1, 4, 0, 1, 2, 3); const mesa_array_format RGBA32_UINT = - MESA_ARRAY_FORMAT(4, 0, 0, 0, 4, 0, 1, 2, 3); + MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS, + 4, 0, 0, 0, 4, 0, 1, 2, 3); const mesa_array_format RGBA32_INT = - MESA_ARRAY_FORMAT(4, 1, 0, 0, 4, 0, 1, 2, 3); + MESA_ARRAY_FORMAT(MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS, + 4, 1, 0, 0, 4, 0, 1, 2, 3); static void invert_swizzle(uint8_t dst[4], const uint8_t src[4]) @@ -648,8 +652,10 @@ } static const uint8_t map_identity[7] = { 0, 1, 2, 3, 4, 5, 6 }; +#if UTIL_ARCH_BIG_ENDIAN static const uint8_t map_3210[7] = { 3, 2, 1, 0, 4, 5, 6 }; static const uint8_t map_1032[7] = { 1, 0, 3, 2, 4, 5, 6 }; +#endif /** * Describes a format as an array format, if possible @@ -700,10 +706,18 @@ endian = map_identity; break; case 2: - endian = _mesa_little_endian() ? map_identity : map_1032; +#if UTIL_ARCH_LITTLE_ENDIAN + endian = map_identity; +#else + endian = map_1032; +#endif break; case 4: - endian = _mesa_little_endian() ? map_identity : map_3210; +#if UTIL_ARCH_LITTLE_ENDIAN + endian = map_identity; +#else + endian = map_3210; +#endif break; default: endian = map_identity; @@ -721,7 +735,11 @@ endian = map_identity; break; case 2: - endian = _mesa_little_endian() ? map_identity : map_1032; +#if UTIL_ARCH_LITTLE_ENDIAN + endian = map_identity; +#else + endian = map_1032; +#endif break; default: endian = map_identity; diff -Nru mesa-19.2.8/src/mesa/main/framebuffer.c mesa-20.0.8/src/mesa/main/framebuffer.c --- mesa-19.2.8/src/mesa/main/framebuffer.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/framebuffer.c 2020-06-12 01:21:18.000000000 +0000 @@ -435,7 +435,6 @@ struct gl_framebuffer *fb) { memset(&fb->Visual, 0, sizeof(fb->Visual)); - fb->Visual.rgbMode = GL_TRUE; /* assume this */ /* find first RGB renderbuffer */ for (unsigned i = 0; i < BUFFER_COUNT; i++) { @@ -482,7 +481,6 @@ const struct gl_renderbuffer *rb = fb->Attachment[BUFFER_DEPTH].Renderbuffer; const mesa_format fmt = rb->Format; - fb->Visual.haveDepthBuffer = GL_TRUE; fb->Visual.depthBits = _mesa_get_format_bits(fmt, GL_DEPTH_BITS); } @@ -490,7 +488,6 @@ const struct gl_renderbuffer *rb = fb->Attachment[BUFFER_STENCIL].Renderbuffer; const mesa_format fmt = rb->Format; - fb->Visual.haveStencilBuffer = GL_TRUE; fb->Visual.stencilBits = _mesa_get_format_bits(fmt, GL_STENCIL_BITS); } @@ -498,7 +495,6 @@ const struct gl_renderbuffer *rb = fb->Attachment[BUFFER_ACCUM].Renderbuffer; const mesa_format fmt = rb->Format; - fb->Visual.haveAccumBuffer = GL_TRUE; fb->Visual.accumRedBits = _mesa_get_format_bits(fmt, GL_RED_BITS); fb->Visual.accumGreenBits = _mesa_get_format_bits(fmt, GL_GREEN_BITS); fb->Visual.accumBlueBits = _mesa_get_format_bits(fmt, GL_BLUE_BITS); @@ -856,8 +852,7 @@ return GL_RGB; case MESA_FORMAT_RG_FLOAT32: case MESA_FORMAT_RG_FLOAT16: - case MESA_FORMAT_R8G8_UNORM: - case MESA_FORMAT_R8G8_SNORM: + case MESA_FORMAT_RG_UNORM8: return GL_RG; case MESA_FORMAT_RG_SINT32: case MESA_FORMAT_RG_UINT32: diff -Nru mesa-19.2.8/src/mesa/main/genmipmap.c mesa-20.0.8/src/mesa/main/genmipmap.c --- mesa-19.2.8/src/mesa/main/genmipmap.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/genmipmap.c 2020-06-12 01:21:18.000000000 +0000 @@ -106,14 +106,14 @@ /** * Implements glGenerateMipmap and glGenerateTextureMipmap. * Generates all the mipmap levels below the base level. + * Error-checking is done only if caller is not NULL. */ static ALWAYS_INLINE void generate_texture_mipmap(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum target, - bool dsa, bool no_error) + const char* caller) { struct gl_texture_image *srcImage; - const char *suffix = dsa ? "Texture" : ""; FLUSH_VERTICES(ctx, 0); @@ -122,21 +122,21 @@ return; } - if (!no_error && texObj->Target == GL_TEXTURE_CUBE_MAP && + if (caller && texObj->Target == GL_TEXTURE_CUBE_MAP && !_mesa_cube_complete(texObj)) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glGenerate%sMipmap(incomplete cube map)", suffix); + "%s(incomplete cube map)", caller); return; } _mesa_lock_texture(ctx, texObj); srcImage = _mesa_select_tex_image(texObj, target, texObj->BaseLevel); - if (!no_error) { + if (caller) { if (!srcImage) { _mesa_unlock_texture(ctx, texObj); _mesa_error(ctx, GL_INVALID_OPERATION, - "glGenerate%sMipmap(zero size base image)", suffix); + "%s(zero size base image)", caller); return; } @@ -144,7 +144,7 @@ srcImage->InternalFormat)) { _mesa_unlock_texture(ctx, texObj); _mesa_error(ctx, GL_INVALID_OPERATION, - "glGenerate%sMipmap(invalid internal format %s)", suffix, + "%s(invalid internal format %s)", caller, _mesa_enum_to_string(srcImage->InternalFormat)); return; } @@ -168,22 +168,6 @@ _mesa_unlock_texture(ctx, texObj); } -static void -generate_texture_mipmap_error(struct gl_context *ctx, - struct gl_texture_object *texObj, GLenum target, - bool dsa) -{ - generate_texture_mipmap(ctx, texObj, target, dsa, false); -} - -static void -generate_texture_mipmap_no_error(struct gl_context *ctx, - struct gl_texture_object *texObj, - GLenum target, bool dsa) -{ - generate_texture_mipmap(ctx, texObj, target, dsa, true); -} - /** * Generate all the mipmap levels below the base level. * Note: this GL function would be more useful if one could specify a @@ -195,7 +179,7 @@ GET_CURRENT_CONTEXT(ctx); struct gl_texture_object *texObj = _mesa_get_current_tex_object(ctx, target); - generate_texture_mipmap_no_error(ctx, texObj, target, false); + generate_texture_mipmap(ctx, texObj, target, NULL); } void GLAPIENTRY @@ -214,7 +198,7 @@ if (!texObj) return; - generate_texture_mipmap_error(ctx, texObj, target, false); + generate_texture_mipmap(ctx, texObj, target, "glGenerateMipmap"); } /** @@ -226,24 +210,60 @@ GET_CURRENT_CONTEXT(ctx); struct gl_texture_object *texObj = _mesa_lookup_texture(ctx, texture); - generate_texture_mipmap_no_error(ctx, texObj, texObj->Target, true); + generate_texture_mipmap(ctx, texObj, texObj->Target, NULL); } -void GLAPIENTRY -_mesa_GenerateTextureMipmap(GLuint texture) +static void +validate_params_and_generate_mipmap(struct gl_texture_object *texObj, const char* caller) { - struct gl_texture_object *texObj; GET_CURRENT_CONTEXT(ctx); - texObj = _mesa_lookup_texture_err(ctx, texture, "glGenerateTextureMipmap"); if (!texObj) return; if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, texObj->Target)) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateTextureMipmap(target=%s)", + _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)", + caller, _mesa_enum_to_string(texObj->Target)); return; } - generate_texture_mipmap_error(ctx, texObj, texObj->Target, true); + generate_texture_mipmap(ctx, texObj, texObj->Target, caller); +} + +void GLAPIENTRY +_mesa_GenerateTextureMipmap(GLuint texture) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_texture_err(ctx, texture, "glGenerateTextureMipmap"); + validate_params_and_generate_mipmap(texObj, "glGenerateTextureMipmap"); +} + +void GLAPIENTRY +_mesa_GenerateTextureMipmapEXT(GLuint texture, GLenum target) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, + false, true, + "glGenerateTextureMipmapEXT"); + validate_params_and_generate_mipmap(texObj, + "glGenerateTextureMipmapEXT"); +} + +void GLAPIENTRY +_mesa_GenerateMultiTexMipmapEXT(GLenum texunit, GLenum target) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glGenerateMultiTexMipmapEXT"); + validate_params_and_generate_mipmap(texObj, + "glGenerateMultiTexMipmapEXT"); } diff -Nru mesa-19.2.8/src/mesa/main/genmipmap.h mesa-20.0.8/src/mesa/main/genmipmap.h --- mesa-19.2.8/src/mesa/main/genmipmap.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/genmipmap.h 2020-06-12 01:21:18.000000000 +0000 @@ -47,4 +47,10 @@ extern void GLAPIENTRY _mesa_GenerateTextureMipmap(GLuint texture); +extern void GLAPIENTRY +_mesa_GenerateTextureMipmapEXT(GLuint texture, GLenum target); + +extern void GLAPIENTRY +_mesa_GenerateMultiTexMipmapEXT(GLenum texunit, GLenum target); + #endif /* GENMIPMAP_H */ diff -Nru mesa-19.2.8/src/mesa/main/get.h mesa-20.0.8/src/mesa/main/get.h --- mesa-19.2.8/src/mesa/main/get.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/get.h 2020-06-12 01:21:18.000000000 +0000 @@ -93,4 +93,10 @@ extern GLenum GLAPIENTRY _mesa_GetGraphicsResetStatusARB( void ); +struct gl_vertex_array_object; + +extern void +_get_vao_pointerv(GLenum pname, struct gl_vertex_array_object* vao, + GLvoid **params, const char* callerstr); + #endif diff -Nru mesa-19.2.8/src/mesa/main/get_hash_params.py mesa-20.0.8/src/mesa/main/get_hash_params.py --- mesa-19.2.8/src/mesa/main/get_hash_params.py 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/get_hash_params.py 2020-06-12 01:21:18.000000000 +0000 @@ -714,7 +714,7 @@ [ "FOG_INDEX", "CONTEXT_FLOAT(Fog.Index), NO_EXTRA" ], [ "GREEN_BIAS", "CONTEXT_FLOAT(Pixel.GreenBias), NO_EXTRA" ], [ "GREEN_SCALE", "CONTEXT_FLOAT(Pixel.GreenScale), NO_EXTRA" ], - [ "INDEX_BITS", "BUFFER_INT(Visual.indexBits), extra_new_buffers" ], + [ "INDEX_BITS", "CONST(0), NO_EXTRA" ], [ "INDEX_CLEAR_VALUE", "CONTEXT_INT(Color.ClearIndex), NO_EXTRA" ], [ "INDEX_MODE", "CONST(0) , NO_EXTRA" ], [ "INDEX_OFFSET", "CONTEXT_INT(Pixel.IndexOffset), NO_EXTRA" ], diff -Nru mesa-19.2.8/src/mesa/main/getstring.c mesa-20.0.8/src/mesa/main/getstring.c --- mesa-19.2.8/src/mesa/main/getstring.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/getstring.c 2020-06-12 01:21:18.000000000 +0000 @@ -230,29 +230,12 @@ } - -/** - * Return pointer-valued state, such as a vertex array pointer. - * - * \param pname names state to be queried - * \param params returns the pointer value - * - * \sa glGetPointerv(). - * - * Tries to get the specified pointer via dd_function_table::GetPointerv, - * otherwise gets the specified pointer from the current context. - */ -void GLAPIENTRY -_mesa_GetPointerv( GLenum pname, GLvoid **params ) +void +_get_vao_pointerv(GLenum pname, struct gl_vertex_array_object* vao, + GLvoid **params, const char* callerstr ) { GET_CURRENT_CONTEXT(ctx); const GLuint clientUnit = ctx->Array.ActiveTexture; - const char *callerstr; - - if (_mesa_is_desktop_gl(ctx)) - callerstr = "glGetPointerv"; - else - callerstr = "glGetPointervKHR"; if (!params) return; @@ -264,42 +247,42 @@ case GL_VERTEX_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POS].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_POS].Ptr; break; case GL_NORMAL_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_NORMAL].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_NORMAL].Ptr; break; case GL_COLOR_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_COLOR0].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_COLOR0].Ptr; break; case GL_SECONDARY_COLOR_ARRAY_POINTER_EXT: if (ctx->API != API_OPENGL_COMPAT) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_COLOR1].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_COLOR1].Ptr; break; case GL_FOG_COORDINATE_ARRAY_POINTER_EXT: if (ctx->API != API_OPENGL_COMPAT) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_FOG].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_FOG].Ptr; break; case GL_INDEX_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Ptr; break; case GL_TEXTURE_COORD_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT && ctx->API != API_OPENGLES) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_TEX(clientUnit)].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_TEX(clientUnit)].Ptr; break; case GL_EDGE_FLAG_ARRAY_POINTER: if (ctx->API != API_OPENGL_COMPAT) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_EDGEFLAG].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_EDGEFLAG].Ptr; break; case GL_FEEDBACK_BUFFER_POINTER: if (ctx->API != API_OPENGL_COMPAT) @@ -314,7 +297,7 @@ case GL_POINT_SIZE_ARRAY_POINTER_OES: if (ctx->API != API_OPENGLES) goto invalid_pname; - *params = (GLvoid *) ctx->Array.VAO->VertexAttrib[VERT_ATTRIB_POINT_SIZE].Ptr; + *params = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_POINT_SIZE].Ptr; break; case GL_DEBUG_CALLBACK_FUNCTION_ARB: case GL_DEBUG_CALLBACK_USER_PARAM_ARB: @@ -332,6 +315,35 @@ } +/** + * Return pointer-valued state, such as a vertex array pointer. + * + * \param pname names state to be queried + * \param params returns the pointer value + * + * \sa glGetPointerv(). + * + * Tries to get the specified pointer via dd_function_table::GetPointerv, + * otherwise gets the specified pointer from the current context. + */ +void GLAPIENTRY +_mesa_GetPointerv( GLenum pname, GLvoid **params ) +{ + GET_CURRENT_CONTEXT(ctx); + const char *callerstr; + + if (_mesa_is_desktop_gl(ctx)) + callerstr = "glGetPointerv"; + else + callerstr = "glGetPointervKHR"; + + if (!params) + return; + + _get_vao_pointerv(pname, ctx->Array.VAO, params, callerstr); +} + + void GLAPIENTRY _mesa_GetPointerIndexedvEXT( GLenum pname, GLuint index, GLvoid **params ) { diff -Nru mesa-19.2.8/src/mesa/main/glformats.c mesa-20.0.8/src/mesa/main/glformats.c --- mesa-19.2.8/src/mesa/main/glformats.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/glformats.c 2020-06-12 01:21:18.000000000 +0000 @@ -3501,7 +3501,43 @@ case GL_INTENSITY: set_swizzle(swizzle, 0, 0, 0, 0); return true; + case GL_DEPTH_COMPONENT: + set_swizzle(swizzle, 0, 6, 6, 6); + return true; + case GL_STENCIL_INDEX: + set_swizzle(swizzle, 6, 0, 6, 6); + return true; + default: + return false; + } +} + +bool +_mesa_swap_bytes_in_type_enum(GLenum *type) +{ + switch (*type) { + case GL_UNSIGNED_INT_8_8_8_8: + *type = GL_UNSIGNED_INT_8_8_8_8_REV; + return true; + case GL_UNSIGNED_INT_8_8_8_8_REV: + *type = GL_UNSIGNED_INT_8_8_8_8; + return true; + case GL_UNSIGNED_SHORT_8_8_MESA: + *type = GL_UNSIGNED_SHORT_8_8_REV_MESA; + return true; + case GL_UNSIGNED_SHORT_8_8_REV_MESA: + *type = GL_UNSIGNED_SHORT_8_8_MESA; + return true; + case GL_BYTE: + case GL_UNSIGNED_BYTE: + /* format/types that are arrays of 8-bit values are unaffected by + * swapBytes. + */ + return true; default: + /* swapping bytes on 4444, 1555, or >8 bit per channel types etc. will + * never match a Mesa format. + */ return false; } } @@ -3530,6 +3566,9 @@ bool normalized = false, is_float = false, is_signed = false; int num_channels = 0, type_size = 0; + if (format == GL_COLOR_INDEX) + return MESA_FORMAT_NONE; + /* Extract array format type information from the OpenGL data type */ switch (type) { case GL_UNSIGNED_BYTE: @@ -3577,10 +3616,24 @@ * create the array format */ if (is_array_format) { - normalized = !_mesa_is_enum_format_integer(format); + enum mesa_array_format_base_format bf; + switch (format) { + case GL_DEPTH_COMPONENT: + bf = MESA_ARRAY_FORMAT_BASE_FORMAT_DEPTH; + break; + case GL_STENCIL_INDEX: + bf = MESA_ARRAY_FORMAT_BASE_FORMAT_STENCIL; + break; + default: + bf = MESA_ARRAY_FORMAT_BASE_FORMAT_RGBA_VARIANTS; + break; + } + + normalized = !(_mesa_is_enum_format_integer(format) || + format == GL_STENCIL_INDEX); num_channels = _mesa_components_in_format(format); - return MESA_ARRAY_FORMAT(type_size, is_signed, is_float, + return MESA_ARRAY_FORMAT(bf, type_size, is_signed, is_float, normalized, num_channels, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); } @@ -3737,7 +3790,9 @@ break; case GL_UNSIGNED_INT_24_8: if (format == GL_DEPTH_STENCIL) - return MESA_FORMAT_Z24_UNORM_S8_UINT; + return MESA_FORMAT_S8_UINT_Z24_UNORM; + else if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_X8_UINT_Z24_UNORM; break; case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: if (format == GL_DEPTH_STENCIL) @@ -3747,6 +3802,10 @@ break; } + fprintf(stderr, "Unsupported format/type: %s/%s\n", + _mesa_enum_to_string(format), + _mesa_enum_to_string(type)); + /* If we got here it means that we could not find a Mesa format that * matches the GL format/type provided. We may need to add a new Mesa * format in that case. diff -Nru mesa-19.2.8/src/mesa/main/glformats.h mesa-20.0.8/src/mesa/main/glformats.h --- mesa-19.2.8/src/mesa/main/glformats.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/glformats.h 2020-06-12 01:21:18.000000000 +0000 @@ -147,6 +147,9 @@ extern uint32_t _mesa_format_from_format_and_type(GLenum format, GLenum type); +bool +_mesa_swap_bytes_in_type_enum(GLenum *type); + extern uint32_t _mesa_tex_format_from_format_and_type(const struct gl_context *ctx, GLenum gl_format, GLenum type); diff -Nru mesa-19.2.8/src/mesa/main/glheader.h mesa-20.0.8/src/mesa/main/glheader.h --- mesa-19.2.8/src/mesa/main/glheader.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/glheader.h 2020-06-12 01:21:18.000000000 +0000 @@ -128,10 +128,6 @@ #define GL_HALF_FLOAT_OES 0x8D61 #endif -#ifndef GL_MESA_framebuffer_flip_y -#define GL_FRAMEBUFFER_FLIP_Y_MESA 0x8BBB -#endif - /* There is no formal spec for the following extension. */ #ifndef GL_ATI_texture_compression_3dc #define GL_ATI_texture_compression_3dc 1 diff -Nru mesa-19.2.8/src/mesa/main/glthread.c mesa-20.0.8/src/mesa/main/glthread.c --- mesa-19.2.8/src/mesa/main/glthread.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/glthread.c 2020-06-12 01:21:18.000000000 +0000 @@ -99,7 +99,7 @@ struct util_queue_fence fence; util_queue_fence_init(&fence); util_queue_add_job(&glthread->queue, ctx, &fence, - glthread_thread_initialization, NULL); + glthread_thread_initialization, NULL, 0); util_queue_fence_wait(&fence); util_queue_fence_destroy(&fence); } @@ -167,7 +167,7 @@ p_atomic_add(&glthread->stats.num_offloaded_items, next->used); util_queue_add_job(&glthread->queue, next, &next->fence, - glthread_unmarshal_batch, NULL); + glthread_unmarshal_batch, NULL, 0); glthread->last = glthread->next; glthread->next = (glthread->next + 1) % MARSHAL_MAX_BATCHES; } diff -Nru mesa-19.2.8/src/mesa/main/imports.c mesa-20.0.8/src/mesa/main/imports.c --- mesa-19.2.8/src/mesa/main/imports.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/imports.c 2020-06-12 01:21:18.000000000 +0000 @@ -45,7 +45,6 @@ #include #include #include "c99_math.h" -#include "util/rounding.h" /* for _mesa_roundeven */ #include "imports.h" #include "context.h" #include "version.h" diff -Nru mesa-19.2.8/src/mesa/main/imports.h mesa-20.0.8/src/mesa/main/imports.h --- mesa-19.2.8/src/mesa/main/imports.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/imports.h 2020-06-12 01:21:18.000000000 +0000 @@ -285,18 +285,6 @@ } -/** - * Return 1 if this is a little endian machine, 0 if big endian. - */ -static inline GLboolean -_mesa_little_endian(void) -{ - const GLuint ui = 1; /* intentionally not static */ - return *((const GLubyte *) &ui); -} - - - /********************************************************************** * Functions */ @@ -321,7 +309,7 @@ _mesa_vsnprintf(char *str, size_t size, const char *fmt, va_list arg); -#if defined(_WIN32) && !defined(strtok_r) +#if defined(_WIN32) && !defined(HAVE_STRTOK_R) #define strtok_r strtok_s #endif diff -Nru mesa-19.2.8/src/mesa/main/mm.c mesa-20.0.8/src/mesa/main/mm.c --- mesa-19.2.8/src/mesa/main/mm.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/mm.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,282 +0,0 @@ -/* - * GLX Hardware Device Driver common code - * Copyright (C) 1999 Wittawat Yamwong - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE - * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include -#include -#include - -#include "mm.h" - - -void -mmDumpMemInfo(const struct mem_block *heap) -{ - fprintf(stderr, "Memory heap %p:\n", (void *)heap); - if (heap == 0) { - fprintf(stderr, " heap == 0\n"); - } else { - const struct mem_block *p; - - for(p = heap->next; p != heap; p = p->next) { - fprintf(stderr, " Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size, - p->free ? 'F':'.', - p->reserved ? 'R':'.'); - } - - fprintf(stderr, "\nFree list:\n"); - - for(p = heap->next_free; p != heap; p = p->next_free) { - fprintf(stderr, " FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size, - p->free ? 'F':'.', - p->reserved ? 'R':'.'); - } - - } - fprintf(stderr, "End of memory blocks\n"); -} - -struct mem_block * -mmInit(unsigned ofs, unsigned size) -{ - struct mem_block *heap, *block; - - if (!size) - return NULL; - - heap = calloc(1, sizeof(struct mem_block)); - if (!heap) - return NULL; - - block = calloc(1, sizeof(struct mem_block)); - if (!block) { - free(heap); - return NULL; - } - - heap->next = block; - heap->prev = block; - heap->next_free = block; - heap->prev_free = block; - - block->heap = heap; - block->next = heap; - block->prev = heap; - block->next_free = heap; - block->prev_free = heap; - - block->ofs = ofs; - block->size = size; - block->free = 1; - - return heap; -} - - -static struct mem_block * -SliceBlock(struct mem_block *p, - unsigned startofs, unsigned size, - unsigned reserved, unsigned alignment) -{ - struct mem_block *newblock; - - /* break left [p, newblock, p->next], then p = newblock */ - if (startofs > p->ofs) { - newblock = calloc(1, sizeof(struct mem_block)); - if (!newblock) - return NULL; - newblock->ofs = startofs; - newblock->size = p->size - (startofs - p->ofs); - newblock->free = 1; - newblock->heap = p->heap; - - newblock->next = p->next; - newblock->prev = p; - p->next->prev = newblock; - p->next = newblock; - - newblock->next_free = p->next_free; - newblock->prev_free = p; - p->next_free->prev_free = newblock; - p->next_free = newblock; - - p->size -= newblock->size; - p = newblock; - } - - /* break right, also [p, newblock, p->next] */ - if (size < p->size) { - newblock = calloc(1, sizeof(struct mem_block)); - if (!newblock) - return NULL; - newblock->ofs = startofs + size; - newblock->size = p->size - size; - newblock->free = 1; - newblock->heap = p->heap; - - newblock->next = p->next; - newblock->prev = p; - p->next->prev = newblock; - p->next = newblock; - - newblock->next_free = p->next_free; - newblock->prev_free = p; - p->next_free->prev_free = newblock; - p->next_free = newblock; - - p->size = size; - } - - /* p = middle block */ - p->free = 0; - - /* Remove p from the free list: - */ - p->next_free->prev_free = p->prev_free; - p->prev_free->next_free = p->next_free; - - p->next_free = 0; - p->prev_free = 0; - - p->reserved = reserved; - return p; -} - - -struct mem_block * -mmAllocMem(struct mem_block *heap, unsigned size, unsigned align2, unsigned startSearch) -{ - struct mem_block *p; - const unsigned mask = (1 << align2)-1; - unsigned startofs = 0; - unsigned endofs; - - if (!heap || !size) - return NULL; - - for (p = heap->next_free; p != heap; p = p->next_free) { - assert(p->free); - - startofs = (p->ofs + mask) & ~mask; - if ( startofs < startSearch ) { - startofs = startSearch; - } - endofs = startofs+size; - if (endofs <= (p->ofs+p->size)) - break; - } - - if (p == heap) - return NULL; - - assert(p->free); - p = SliceBlock(p,startofs,size,0,mask+1); - - return p; -} - - -struct mem_block * -mmFindBlock(struct mem_block *heap, unsigned start) -{ - struct mem_block *p; - - for (p = heap->next; p != heap; p = p->next) { - if (p->ofs == start) - return p; - } - - return NULL; -} - - -static inline int -Join2Blocks(struct mem_block *p) -{ - /* XXX there should be some assertions here */ - - /* NOTE: heap->free == 0 */ - - if (p->free && p->next->free) { - struct mem_block *q = p->next; - - assert(p->ofs + p->size == q->ofs); - p->size += q->size; - - p->next = q->next; - q->next->prev = p; - - q->next_free->prev_free = q->prev_free; - q->prev_free->next_free = q->next_free; - - free(q); - return 1; - } - return 0; -} - -int -mmFreeMem(struct mem_block *b) -{ - if (!b) - return 0; - - if (b->free) { - fprintf(stderr, "block already free\n"); - return -1; - } - if (b->reserved) { - fprintf(stderr, "block is reserved\n"); - return -1; - } - - b->free = 1; - b->next_free = b->heap->next_free; - b->prev_free = b->heap; - b->next_free->prev_free = b; - b->prev_free->next_free = b; - - Join2Blocks(b); - if (b->prev != b->heap) - Join2Blocks(b->prev); - - return 0; -} - - -void -mmDestroy(struct mem_block *heap) -{ - struct mem_block *p; - - if (!heap) - return; - - for (p = heap->next; p != heap; ) { - struct mem_block *next = p->next; - free(p); - p = next; - } - - free(heap); -} diff -Nru mesa-19.2.8/src/mesa/main/mm.h mesa-20.0.8/src/mesa/main/mm.h --- mesa-19.2.8/src/mesa/main/mm.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/mm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ -/* - * GLX Hardware Device Driver common code - * Copyright (C) 1999 Wittawat Yamwong - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE - * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * Memory manager code. Primarily used by device drivers to manage texture - * heaps, etc. - */ - - -#ifndef MM_H -#define MM_H - - -struct mem_block { - struct mem_block *next, *prev; - struct mem_block *next_free, *prev_free; - struct mem_block *heap; - unsigned ofs; - unsigned size; - unsigned free:1; - unsigned reserved:1; -}; - - - -/** - * input: total size in bytes - * return: a heap pointer if OK, NULL if error - */ -extern struct mem_block *mmInit(unsigned ofs, unsigned size); - -/** - * Allocate 'size' bytes with 2^align2 bytes alignment, - * restrict the search to free memory after 'startSearch' - * depth and back buffers should be in different 4mb banks - * to get better page hits if possible - * input: size = size of block - * align2 = 2^align2 bytes alignment - * startSearch = linear offset from start of heap to begin search - * return: pointer to the allocated block, 0 if error - */ -extern struct mem_block *mmAllocMem(struct mem_block *heap, unsigned size, - unsigned align2, unsigned startSearch); - -/** - * Free block starts at offset - * input: pointer to a block - * return: 0 if OK, -1 if error - */ -extern int mmFreeMem(struct mem_block *b); - -/** - * Free block starts at offset - * input: pointer to a heap, start offset - * return: pointer to a block - */ -extern struct mem_block *mmFindBlock(struct mem_block *heap, unsigned start); - -/** - * destroy MM - */ -extern void mmDestroy(struct mem_block *mmInit); - -/** - * For debuging purpose. - */ -extern void mmDumpMemInfo(const struct mem_block *mmInit); - -#endif diff -Nru mesa-19.2.8/src/mesa/main/mtypes.h mesa-20.0.8/src/mesa/main/mtypes.h --- mesa-19.2.8/src/mesa/main/mtypes.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/mtypes.h 2020-06-12 01:21:18.000000000 +0000 @@ -77,6 +77,7 @@ struct gl_program_parameter_list; struct gl_shader_spirv_data; struct set; +struct shader_includes; struct vbo_context; /*@}*/ @@ -159,19 +160,14 @@ */ struct gl_config { - GLboolean rgbMode; GLboolean floatMode; GLuint doubleBufferMode; GLuint stereoMode; - GLboolean haveAccumBuffer; - GLboolean haveDepthBuffer; - GLboolean haveStencilBuffer; - GLint redBits, greenBits, blueBits, alphaBits; /* bits per comp */ GLuint redMask, greenMask, blueMask, alphaMask; + GLint redShift, greenShift, blueShift, alphaShift; GLint rgbBits; /* total bits for rgb */ - GLint indexBits; /* total bits for colorindex */ GLint accumRedBits, accumGreenBits, accumBlueBits, accumAlphaBits; GLint depthBits; @@ -2921,6 +2917,9 @@ */ union gl_constant_value *UniformDataDefaults; + /** Hash for quick search by name. */ + struct hash_table_u64 *ProgramResourceHash; + GLboolean Validated; /** List of all active resources after linking. */ @@ -3198,6 +3197,9 @@ /** Clamp UBO and SSBO block indices so they don't go out-of-bounds. */ GLboolean ClampBlockIndicesToArrayBounds; + /** (driconf) Force gl_Position to be considered invariant */ + GLboolean PositionAlwaysInvariant; + const struct nir_shader_compiler_options *NirOptions; }; @@ -3327,6 +3329,13 @@ struct hash_table_u64 *ImageHandles; mtx_t HandlesMutex; /**< For texture/image handles safety */ + /* GL_ARB_shading_language_include */ + struct shader_includes *ShaderIncludes; + /* glCompileShaderInclude expects ShaderIncludes not to change while it is + * in progress. + */ + mtx_t ShaderIncludeMutex; + /** * Some context in this share group was affected by a GPU reset * @@ -3928,6 +3937,11 @@ bool GLSLOptimizeConservatively; /** + * Whether to call lower_const_arrays_to_uniforms() during linking. + */ + bool GLSLLowerConstArrays; + + /** * True if gl_TessLevelInner/Outer[] in the TES should be inputs * (otherwise, they're system values). */ @@ -4130,6 +4144,12 @@ /** Is the drivers uniform storage packed or padded to 16 bytes. */ bool PackedDriverUniformStorage; + /** Does the driver make use of the NIR based GLSL linker */ + bool UseNIRGLSLLinker; + + /** Wether or not glBitmap uses red textures rather than alpha */ + bool BitmapUsesRed; + /** GL_ARB_gl_spirv */ struct spirv_supported_capabilities SpirVCapabilities; @@ -4271,9 +4291,11 @@ GLboolean EXT_blend_equation_separate; GLboolean EXT_blend_func_separate; GLboolean EXT_blend_minmax; + GLboolean EXT_demote_to_helper_invocation; GLboolean EXT_depth_bounds_test; GLboolean EXT_disjoint_timer_query; GLboolean EXT_draw_buffers2; + GLboolean EXT_EGL_image_storage; GLboolean EXT_float_blend; GLboolean EXT_framebuffer_multisample; GLboolean EXT_framebuffer_multisample_blit_scaled; @@ -4305,6 +4327,7 @@ GLboolean EXT_texture_filter_anisotropic; GLboolean EXT_texture_integer; GLboolean EXT_texture_mirror_clamp; + GLboolean EXT_texture_norm16; GLboolean EXT_texture_shadow_lod; GLboolean EXT_texture_shared_exponent; GLboolean EXT_texture_snorm; @@ -4345,6 +4368,7 @@ GLboolean INTEL_conservative_rasterization; GLboolean INTEL_performance_query; GLboolean INTEL_shader_atomic_float_minmax; + GLboolean INTEL_shader_integer_functions2; GLboolean KHR_blend_equation_advanced; GLboolean KHR_blend_equation_advanced_coherent; GLboolean KHR_robustness; @@ -5174,6 +5198,8 @@ struct hash_table_u64 *ResidentTextureHandles; struct hash_table_u64 *ResidentImageHandles; /*@}*/ + + bool shader_builtin_ref; }; /** diff -Nru mesa-19.2.8/src/mesa/main/performance_monitor.c mesa-20.0.8/src/mesa/main/performance_monitor.c --- mesa-19.2.8/src/mesa/main/performance_monitor.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/performance_monitor.c 2020-06-12 01:21:18.000000000 +0000 @@ -568,9 +568,8 @@ for (group = 0; group < ctx->PerfMonitor.NumGroups; group++) { const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[group]; - BITSET_WORD tmp; - BITSET_FOREACH_SET(counter, tmp, m->ActiveCounters[group], g->NumCounters) { + BITSET_FOREACH_SET(counter, m->ActiveCounters[group], g->NumCounters) { const struct gl_perf_monitor_counter *c = &g->Counters[counter]; size += sizeof(uint32_t); /* Group ID */ diff -Nru mesa-19.2.8/src/mesa/main/program_binary.c mesa-20.0.8/src/mesa/main/program_binary.c --- mesa-19.2.8/src/mesa/main/program_binary.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/program_binary.c 2020-06-12 01:21:18.000000000 +0000 @@ -29,12 +29,12 @@ */ -#include "compiler/blob.h" #include "compiler/glsl/serialize.h" #include "main/errors.h" #include "main/mtypes.h" #include "main/shaderapi.h" #include "util/bitscan.h" +#include "util/blob.h" #include "util/crc32.h" #include "program_binary.h" #include "program/prog_parameter.h" diff -Nru mesa-19.2.8/src/mesa/main/queryobj.c mesa-20.0.8/src/mesa/main/queryobj.c --- mesa-19.2.8/src/mesa/main/queryobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/queryobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -663,11 +663,21 @@ * is not CURRENT_QUERY_EXT." * * Same rule is present also in ES 3.2 spec. + * + * EXT_disjoint_timer_query extends this with GL_QUERY_COUNTER_BITS. */ - if (_mesa_is_gles(ctx) && pname != GL_CURRENT_QUERY) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryivEXT(%s)", - _mesa_enum_to_string(pname)); - return; + if (_mesa_is_gles(ctx)) { + switch (pname) { + case GL_CURRENT_QUERY: + break; + case GL_QUERY_COUNTER_BITS: + if (_mesa_has_EXT_disjoint_timer_query(ctx)) + break; + /* fallthrough */ + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetQueryivEXT(%s)", + _mesa_enum_to_string(pname)); + } } if (target == GL_TIMESTAMP) { diff -Nru mesa-19.2.8/src/mesa/main/shaderapi.c mesa-20.0.8/src/mesa/main/shaderapi.c --- mesa-19.2.8/src/mesa/main/shaderapi.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shaderapi.c 2020-06-12 01:21:18.000000000 +0000 @@ -35,6 +35,7 @@ #include #include #include + #include "main/glheader.h" #include "main/context.h" #include "main/enums.h" @@ -48,6 +49,7 @@ #include "main/state.h" #include "main/transformfeedback.h" #include "main/uniforms.h" +#include "compiler/glsl/builtin_functions.h" #include "compiler/glsl/glsl_parser_extras.h" #include "compiler/glsl/ir.h" #include "compiler/glsl/ir_uniform.h" @@ -60,6 +62,7 @@ #include "util/mesa-sha1.h" #include "util/crc32.h" #include "util/os_file.h" +#include "util/simple_list.h" /** * Return mask of GLSL_x flags by examining the MESA_GLSL env var. @@ -832,8 +835,10 @@ } return; case GL_GEOMETRY_SHADER_INVOCATIONS: - if (!has_gs || !ctx->Extensions.ARB_gpu_shader5) + if (!has_gs || + (_mesa_is_desktop_gl(ctx) && !ctx->Extensions.ARB_gpu_shader5)) { break; + } if (check_gs_query(ctx, shProg)) { *params = shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]-> Program->info.gs.invocations; @@ -1155,6 +1160,14 @@ #endif } +static void +ensure_builtin_types(struct gl_context *ctx) +{ + if (!ctx->shader_builtin_ref) { + _mesa_glsl_builtin_functions_init_or_ref(); + ctx->shader_builtin_ref = true; + } +} /** * Compile a shader. @@ -1189,6 +1202,8 @@ _mesa_log("%s\n", sh->Source); } + ensure_builtin_types(ctx); + /* this call will set the shader->CompileStatus field to indicate if * compilation was successful. */ @@ -1266,6 +1281,8 @@ } } + ensure_builtin_types(ctx); + FLUSH_VERTICES(ctx, 0); _mesa_glsl_link_shader(ctx, shProg); @@ -2245,7 +2262,12 @@ void GLAPIENTRY _mesa_ReleaseShaderCompiler(void) { - _mesa_destroy_shader_compiler_caches(); + GET_CURRENT_CONTEXT(ctx); + + if (ctx->shader_builtin_ref) { + _mesa_glsl_builtin_functions_decref(); + ctx->shader_builtin_ref = false; + } } @@ -2638,6 +2660,7 @@ _mesa_PatchParameteri_no_error(GLenum pname, GLint value) { GET_CURRENT_CONTEXT(ctx); + FLUSH_VERTICES(ctx, 0); ctx->TessCtrlProgram.patch_vertices = value; } @@ -2662,6 +2685,7 @@ return; } + FLUSH_VERTICES(ctx, 0); ctx->TessCtrlProgram.patch_vertices = value; } @@ -3118,6 +3142,533 @@ } } +/* This is simple list entry that will be used to hold a list of string + * tokens of a parsed shader include path. + */ +struct sh_incl_path_entry +{ + struct sh_incl_path_entry *next; + struct sh_incl_path_entry *prev; + + char *path; +}; + +/* Nodes of the shader include tree */ +struct sh_incl_path_ht_entry +{ + struct hash_table *path; + char *shader_source; +}; + +struct shader_includes { + /* Array to hold include paths given to glCompileShaderIncludeARB() */ + struct sh_incl_path_entry **include_paths; + size_t num_include_paths; + size_t relative_path_cursor; + + /* Root hash table holding the shader include tree */ + struct hash_table *shader_include_tree; +}; + +void +_mesa_init_shader_includes(struct gl_shared_state *shared) +{ + shared->ShaderIncludes = calloc(1, sizeof(struct shader_includes)); + shared->ShaderIncludes->shader_include_tree = + _mesa_hash_table_create(NULL, _mesa_hash_string, + _mesa_key_string_equal); +} + +size_t +_mesa_get_shader_include_cursor(struct gl_shared_state *shared) +{ + return shared->ShaderIncludes->relative_path_cursor; +} + +void +_mesa_set_shader_include_cursor(struct gl_shared_state *shared, size_t cursor) +{ + shared->ShaderIncludes->relative_path_cursor = cursor; +} + +static void +destroy_shader_include(struct hash_entry *entry) +{ + struct sh_incl_path_ht_entry *sh_incl_ht_entry = + (struct sh_incl_path_ht_entry *) entry->data; + + _mesa_hash_table_destroy(sh_incl_ht_entry->path, destroy_shader_include); + free(sh_incl_ht_entry->shader_source); + free(sh_incl_ht_entry); +} + +void +_mesa_destroy_shader_includes(struct gl_shared_state *shared) +{ + _mesa_hash_table_destroy(shared->ShaderIncludes->shader_include_tree, + destroy_shader_include); + free(shared->ShaderIncludes); +} + +static bool +valid_path_format(const char *str, bool relative_path) +{ + int i = 0; + + if (!str[i] || (!relative_path && str[i] != '/')) + return false; + + i++; + + while (str[i]) { + const char c = str[i++]; + if (('A' <= c && c <= 'Z') || + ('a' <= c && c <= 'z') || + ('0' <= c && c <= '9')) + continue; + + if (c == '/') { + if (str[i - 2] == '/') + return false; + + continue; + } + + if (strchr("^. _+*%[](){}|&~=!:;,?-", c) == NULL) + return false; + } + + if (str[i - 1] == '/') + return false; + + return true; +} + + +static bool +validate_and_tokenise_sh_incl(struct gl_context *ctx, + void *mem_ctx, + struct sh_incl_path_entry **path_list, + char *full_path, bool error_check) +{ + bool relative_path = ctx->Shared->ShaderIncludes->num_include_paths; + + if (!valid_path_format(full_path, relative_path)) { + if (error_check) { + _mesa_error(ctx, GL_INVALID_VALUE, + "glNamedStringARB(invalid name %s)", full_path); + } + return false; + } + + char *save_ptr = NULL; + char *path_str = strtok_r(full_path, "/", &save_ptr); + + *path_list = rzalloc(mem_ctx, struct sh_incl_path_entry); + + make_empty_list(*path_list); + + while (path_str != NULL) { + if (strlen(path_str) == 0) { + if (error_check) { + _mesa_error(ctx, GL_INVALID_VALUE, + "glNamedStringARB(invalid name %s)", full_path); + } + + return false; + } + + if (strcmp(path_str, ".") == 0) { + /* Do nothing */ + } else if (strcmp(path_str, "..") == 0) { + struct sh_incl_path_entry *last = last_elem(*path_list); + remove_from_list(last); + } else { + struct sh_incl_path_entry *path = + rzalloc(mem_ctx, struct sh_incl_path_entry); + + path->path = strdup(path_str); + insert_at_tail(*path_list, path); + } + + path_str = strtok_r(NULL, "/", &save_ptr); + } + + return true; +} + +static struct sh_incl_path_ht_entry * +lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check) +{ + void *mem_ctx = ralloc_context(NULL); + struct sh_incl_path_entry *path_list; + + if (!validate_and_tokenise_sh_incl(ctx, mem_ctx, &path_list, path, + error_check)) { + ralloc_free(mem_ctx); + return NULL; + } + + struct sh_incl_path_ht_entry *sh_incl_ht_entry = NULL; + struct hash_table *path_ht = + ctx->Shared->ShaderIncludes->shader_include_tree; + + size_t count = ctx->Shared->ShaderIncludes->num_include_paths; + bool relative_path = path[0] != '/'; + + size_t i = ctx->Shared->ShaderIncludes->relative_path_cursor; + bool use_cursor = ctx->Shared->ShaderIncludes->relative_path_cursor; + + do { + struct sh_incl_path_entry *entry; + + if (relative_path) { +next_relative_path: + { + struct sh_incl_path_entry *rel_path_list = + ctx->Shared->ShaderIncludes->include_paths[i]; + foreach(entry, rel_path_list) { + struct hash_entry *ht_entry = + _mesa_hash_table_search(path_ht, entry->path); + + if (!ht_entry) { + /* Reset search path and skip to the next include path */ + path_ht = ctx->Shared->ShaderIncludes->shader_include_tree; + sh_incl_ht_entry = NULL; + if (use_cursor) { + i = 0; + use_cursor = false; + + goto next_relative_path; + } + i++; + if (i < count) + goto next_relative_path; + else + break; + } else { + sh_incl_ht_entry = + (struct sh_incl_path_ht_entry *) ht_entry->data; + } + + path_ht = sh_incl_ht_entry->path; + } + } + } + + foreach(entry, path_list) { + struct hash_entry *ht_entry = + _mesa_hash_table_search(path_ht, entry->path); + + if (!ht_entry) { + /* Reset search path and skip to the next include path */ + path_ht = ctx->Shared->ShaderIncludes->shader_include_tree; + sh_incl_ht_entry = NULL; + if (use_cursor) { + i = 0; + use_cursor = false; + + break; + } + i++; + break; + } else { + + sh_incl_ht_entry = + (struct sh_incl_path_ht_entry *) ht_entry->data; + } + + path_ht = sh_incl_ht_entry->path; + } + + if (i < count && + (sh_incl_ht_entry == NULL || !sh_incl_ht_entry->shader_source)) + continue; + + /* If we get here then we have found a matching path or exahusted our + * relative search paths. + */ + ctx->Shared->ShaderIncludes->relative_path_cursor = i; + break; + } while (i < count); + + ralloc_free(mem_ctx); + + return sh_incl_ht_entry; +} + +const char * +_mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check) +{ + struct sh_incl_path_ht_entry *shader_include = + lookup_shader_include(ctx, path, error_check); + + return shader_include ? shader_include->shader_source : NULL; +} + +static char * +copy_string(struct gl_context *ctx, const char *str, int str_len, + const char *caller) +{ + if (!str) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(NULL string)", caller); + return NULL; + } + + char *cp; + if (str_len == -1) + cp = strdup(str); + else { + cp = calloc(sizeof(char), str_len + 1); + memcpy(cp, str, str_len); + } + + return cp; +} + +GLvoid GLAPIENTRY +_mesa_NamedStringARB(GLenum type, GLint namelen, const GLchar *name, + GLint stringlen, const GLchar *string) +{ + GET_CURRENT_CONTEXT(ctx); + const char *caller = "glNamedStringARB"; + + if (type != GL_SHADER_INCLUDE_ARB) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid type)", caller); + return; + } + + char *name_cp = copy_string(ctx, name, namelen, caller); + char *string_cp = copy_string(ctx, string, stringlen, caller); + if (!name_cp || !string_cp) { + free(string_cp); + free(name_cp); + return; + } + + void *mem_ctx = ralloc_context(NULL); + struct sh_incl_path_entry *path_list; + + if (!validate_and_tokenise_sh_incl(ctx, mem_ctx, &path_list, name_cp, + true)) { + free(string_cp); + free(name_cp); + ralloc_free(mem_ctx); + return; + } + + mtx_lock(&ctx->Shared->ShaderIncludeMutex); + + struct hash_table *path_ht = + ctx->Shared->ShaderIncludes->shader_include_tree; + + struct sh_incl_path_entry *entry; + foreach(entry, path_list) { + struct hash_entry *ht_entry = + _mesa_hash_table_search(path_ht, entry->path); + + struct sh_incl_path_ht_entry *sh_incl_ht_entry; + if (!ht_entry) { + sh_incl_ht_entry = calloc(1, sizeof(struct sh_incl_path_ht_entry)); + sh_incl_ht_entry->path = + _mesa_hash_table_create(NULL, _mesa_hash_string, + _mesa_key_string_equal); + _mesa_hash_table_insert(path_ht, entry->path, sh_incl_ht_entry); + } else { + sh_incl_ht_entry = (struct sh_incl_path_ht_entry *) ht_entry->data; + } + + path_ht = sh_incl_ht_entry->path; + + if (last_elem(path_list) == entry) { + free(sh_incl_ht_entry->shader_source); + sh_incl_ht_entry->shader_source = string_cp; + } + } + + mtx_unlock(&ctx->Shared->ShaderIncludeMutex); + + free(name_cp); + ralloc_free(mem_ctx); +} + +GLvoid GLAPIENTRY +_mesa_DeleteNamedStringARB(GLint namelen, const GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + const char *caller = "glDeleteNamedStringARB"; + + char *name_cp = copy_string(ctx, name, namelen, caller); + if (!name_cp) + return; + + struct sh_incl_path_ht_entry *shader_include = + lookup_shader_include(ctx, name_cp, true); + + if (!shader_include) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(no string associated with path %s)", caller, name_cp); + free(name_cp); + return; + } + + mtx_lock(&ctx->Shared->ShaderIncludeMutex); + + free(shader_include->shader_source); + shader_include->shader_source = NULL; + + mtx_unlock(&ctx->Shared->ShaderIncludeMutex); + + free(name_cp); +} + +GLvoid GLAPIENTRY +_mesa_CompileShaderIncludeARB(GLuint shader, GLsizei count, + const GLchar* const *path, const GLint *length) +{ + GET_CURRENT_CONTEXT(ctx); + const char *caller = "glCompileShaderIncludeARB"; + + if (count > 0 && path == NULL) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(count > 0 && path == NULL)", + caller); + return; + } + + void *mem_ctx = ralloc_context(NULL); + + mtx_lock(&ctx->Shared->ShaderIncludeMutex); + + ctx->Shared->ShaderIncludes->include_paths = + ralloc_array_size(mem_ctx, sizeof(struct sh_incl_path_entry *), count); + + for (size_t i = 0; i < count; i++) { + char *path_cp = copy_string(ctx, path[i], length ? length[i] : -1, + caller); + if (!path_cp) { + goto exit; + } + + struct sh_incl_path_entry *path_list; + + if (!validate_and_tokenise_sh_incl(ctx, mem_ctx, &path_list, path_cp, + true)) { + free(path_cp); + goto exit; + } + + ctx->Shared->ShaderIncludes->include_paths[i] = path_list; + + free(path_cp); + } + + /* We must set this *after* all calls to validate_and_tokenise_sh_incl() + * are done as we use this to decide if we need to check the start of the + * path for a '/' + */ + ctx->Shared->ShaderIncludes->num_include_paths = count; + + struct gl_shader *sh = _mesa_lookup_shader(ctx, shader); + if (!sh) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(shader)", caller); + goto exit; + } + + _mesa_compile_shader(ctx, sh); + +exit: + ctx->Shared->ShaderIncludes->num_include_paths = 0; + ctx->Shared->ShaderIncludes->relative_path_cursor = 0; + ctx->Shared->ShaderIncludes->include_paths = NULL; + + mtx_unlock(&ctx->Shared->ShaderIncludeMutex); + + ralloc_free(mem_ctx); +} + +GLboolean GLAPIENTRY +_mesa_IsNamedStringARB(GLint namelen, const GLchar *name) +{ + GET_CURRENT_CONTEXT(ctx); + + if (!name) + return false; + + char *name_cp = copy_string(ctx, name, namelen, ""); + + const char *source = _mesa_lookup_shader_include(ctx, name_cp, false); + free(name_cp); + + if (!source) + return false; + + return true; +} + +GLvoid GLAPIENTRY +_mesa_GetNamedStringARB(GLint namelen, const GLchar *name, GLsizei bufSize, + GLint *stringlen, GLchar *string) +{ + GET_CURRENT_CONTEXT(ctx); + const char *caller = "glGetNamedStringARB"; + + char *name_cp = copy_string(ctx, name, namelen, caller); + if (!name_cp) + return; + + const char *source = _mesa_lookup_shader_include(ctx, name_cp, true); + if (!source) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(no string associated with path %s)", caller, name_cp); + free(name_cp); + return; + } + + size_t size = MIN2(strlen(source), bufSize - 1); + memcpy(string, source, size); + string[size] = '\0'; + + *stringlen = size; + + free(name_cp); +} + +GLvoid GLAPIENTRY +_mesa_GetNamedStringivARB(GLint namelen, const GLchar *name, + GLenum pname, GLint *params) +{ + GET_CURRENT_CONTEXT(ctx); + const char *caller = "glGetNamedStringivARB"; + + char *name_cp = copy_string(ctx, name, namelen, caller); + if (!name_cp) + return; + + const char *source = _mesa_lookup_shader_include(ctx, name_cp, true); + if (!source) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(no string associated with path %s)", caller, name_cp); + free(name_cp); + return; + } + + switch (pname) { + case GL_NAMED_STRING_LENGTH_ARB: + *params = strlen(source) + 1; + break; + case GL_NAMED_STRING_TYPE_ARB: + *params = GL_SHADER_INCLUDE_ARB; + break; + default: + _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname)", caller); + break; + } + + free(name_cp); +} + static int find_compat_subroutine(struct gl_program *p, const struct glsl_type *type) { diff -Nru mesa-19.2.8/src/mesa/main/shaderapi.h mesa-20.0.8/src/mesa/main/shaderapi.h --- mesa-19.2.8/src/mesa/main/shaderapi.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shaderapi.h 2020-06-12 01:21:18.000000000 +0000 @@ -332,6 +332,9 @@ GLsizei bufSize, GLsizei *length, GLint *params); +extern void +_mesa_create_program_resource_hash(struct gl_shader_program *shProg); + /* GL_ARB_tessellation_shader */ void GLAPIENTRY _mesa_PatchParameteri_no_error(GLenum pname, GLint value); @@ -381,12 +384,50 @@ _mesa_GetProgramStageiv(GLuint program, GLenum shadertype, GLenum pname, GLint *values); +extern GLvoid GLAPIENTRY +_mesa_NamedStringARB(GLenum type, GLint namelen, const GLchar *name, + GLint stringlen, const GLchar *string); + +extern GLvoid GLAPIENTRY +_mesa_DeleteNamedStringARB(GLint namelen, const GLchar *name); + +extern GLvoid GLAPIENTRY +_mesa_CompileShaderIncludeARB(GLuint shader, GLsizei count, + const GLchar* const *path, const GLint *length); + +extern GLboolean GLAPIENTRY +_mesa_IsNamedStringARB(GLint namelen, const GLchar *name); + +extern GLvoid GLAPIENTRY +_mesa_GetNamedStringARB(GLint namelen, const GLchar *name, GLsizei bufSize, + GLint *stringlen, GLchar *string); + +extern GLvoid GLAPIENTRY +_mesa_GetNamedStringivARB(GLint namelen, const GLchar *name, + GLenum pname, GLint *params); + GLcharARB * _mesa_read_shader_source(const gl_shader_stage stage, const char *source); void _mesa_dump_shader_source(const gl_shader_stage stage, const char *source); +void +_mesa_init_shader_includes(struct gl_shared_state *shared); + +size_t +_mesa_get_shader_include_cursor(struct gl_shared_state *shared); + +void +_mesa_set_shader_include_cursor(struct gl_shared_state *shared, size_t cusor); + +void +_mesa_destroy_shader_includes(struct gl_shared_state *shared); + +const char * +_mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check); + #ifdef __cplusplus } #endif diff -Nru mesa-19.2.8/src/mesa/main/shaderimage.c mesa-20.0.8/src/mesa/main/shaderimage.c --- mesa-19.2.8/src/mesa/main/shaderimage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shaderimage.c 2020-06-12 01:21:18.000000000 +0000 @@ -45,18 +45,10 @@ */ #ifdef MESA_BIG_ENDIAN # define MESA_FORMAT_RGBA_8 MESA_FORMAT_A8B8G8R8_UNORM -# define MESA_FORMAT_RG_16 MESA_FORMAT_G16R16_UNORM -# define MESA_FORMAT_RG_8 MESA_FORMAT_G8R8_UNORM # define MESA_FORMAT_SIGNED_RGBA_8 MESA_FORMAT_A8B8G8R8_SNORM -# define MESA_FORMAT_SIGNED_RG_16 MESA_FORMAT_G16R16_SNORM -# define MESA_FORMAT_SIGNED_RG_8 MESA_FORMAT_G8R8_SNORM #else # define MESA_FORMAT_RGBA_8 MESA_FORMAT_R8G8B8A8_UNORM -# define MESA_FORMAT_RG_16 MESA_FORMAT_R16G16_UNORM -# define MESA_FORMAT_RG_8 MESA_FORMAT_R8G8_UNORM # define MESA_FORMAT_SIGNED_RGBA_8 MESA_FORMAT_R8G8B8A8_SNORM -# define MESA_FORMAT_SIGNED_RG_16 MESA_FORMAT_R16G16_SNORM -# define MESA_FORMAT_SIGNED_RG_8 MESA_FORMAT_R8G8_SNORM #endif mesa_format @@ -151,10 +143,10 @@ return MESA_FORMAT_RGBA_8; case GL_RG16: - return MESA_FORMAT_RG_16; + return MESA_FORMAT_RG_UNORM16; case GL_RG8: - return MESA_FORMAT_RG_8; + return MESA_FORMAT_RG_UNORM8; case GL_R16: return MESA_FORMAT_R_UNORM16; @@ -169,10 +161,10 @@ return MESA_FORMAT_SIGNED_RGBA_8; case GL_RG16_SNORM: - return MESA_FORMAT_SIGNED_RG_16; + return MESA_FORMAT_RG_SNORM16; case GL_RG8_SNORM: - return MESA_FORMAT_SIGNED_RG_8; + return MESA_FORMAT_RG_SNORM8; case GL_R16_SNORM: return MESA_FORMAT_R_SNORM16; @@ -297,10 +289,10 @@ case MESA_FORMAT_RGBA_8: return IMAGE_FORMAT_CLASS_4X8; - case MESA_FORMAT_RG_16: + case MESA_FORMAT_RG_UNORM16: return IMAGE_FORMAT_CLASS_2X16; - case MESA_FORMAT_RG_8: + case MESA_FORMAT_RG_UNORM8: return IMAGE_FORMAT_CLASS_2X8; case MESA_FORMAT_R_UNORM16: @@ -315,10 +307,10 @@ case MESA_FORMAT_SIGNED_RGBA_8: return IMAGE_FORMAT_CLASS_4X8; - case MESA_FORMAT_SIGNED_RG_16: + case MESA_FORMAT_RG_SNORM16: return IMAGE_FORMAT_CLASS_2X16; - case MESA_FORMAT_SIGNED_RG_8: + case MESA_FORMAT_RG_SNORM8: return IMAGE_FORMAT_CLASS_2X8; case MESA_FORMAT_R_SNORM16: @@ -662,9 +654,13 @@ * However note that issue 7 of the GL_OES_texture_buffer spec * recognizes that there is no way to create immutable buffer textures, * so those are excluded from this requirement. + * + * Additionally, issue 10 of the OES_EGL_image_external_essl3 spec + * states that glBindImageTexture must accept external textures. */ if (_mesa_is_gles(ctx) && !texObj->Immutable && - texObj->Target != GL_TEXTURE_BUFFER) { + texObj->Target != GL_TEXTURE_BUFFER && + texObj->Target != GL_TEXTURE_EXTERNAL_OES) { _mesa_error(ctx, GL_INVALID_OPERATION, "glBindImageTexture(!immutable)"); return; diff -Nru mesa-19.2.8/src/mesa/main/shaderobj.c mesa-20.0.8/src/mesa/main/shaderobj.c --- mesa-19.2.8/src/mesa/main/shaderobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shaderobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -344,6 +344,11 @@ shProg->UniformHash = NULL; } + if (shProg->data && shProg->data->ProgramResourceHash) { + _mesa_hash_table_u64_destroy(shProg->data->ProgramResourceHash, NULL); + shProg->data->ProgramResourceHash = NULL; + } + _mesa_reference_shader_program_data(ctx, &shProg->data, NULL); } diff -Nru mesa-19.2.8/src/mesa/main/shader_query.cpp mesa-20.0.8/src/mesa/main/shader_query.cpp --- mesa-19.2.8/src/mesa/main/shader_query.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shader_query.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -37,7 +37,7 @@ #include "compiler/glsl/ir.h" #include "compiler/glsl/program.h" #include "compiler/glsl/string_to_uint_map.h" - +#include "util/mesa-sha1.h" static GLint program_resource_location(struct gl_program_resource *res, @@ -461,7 +461,7 @@ case GL_TESS_EVALUATION_SUBROUTINE: return RESOURCE_SUB(res)->name; default: - assert(!"support for resource type not implemented"); + break; } return NULL; } @@ -527,6 +527,51 @@ return true; } +static uint32_t +compute_resource_key(GLenum programInterface, const char *name) +{ + struct mesa_sha1 ctx; + unsigned char sha1[20]; + + _mesa_sha1_init(&ctx); + _mesa_sha1_update(&ctx, &programInterface, sizeof(programInterface)); + _mesa_sha1_update(&ctx, name, strlen(name)); + _mesa_sha1_final(&ctx, sha1); + + return _mesa_hash_data(sha1, sizeof(sha1)); +} + +static struct gl_program_resource * +search_resource_hash(struct gl_shader_program *shProg, + GLenum programInterface, const char *name, + unsigned *array_index) +{ + const char *base_name_end; + long index = parse_program_resource_name(name, &base_name_end); + char *name_copy; + + /* If dealing with array, we need to get the basename. */ + if (index >= 0) { + name_copy = (char *) malloc(base_name_end - name + 1); + memcpy(name_copy, name, base_name_end - name); + name_copy[base_name_end - name] = '\0'; + } else { + name_copy = (char*) name; + } + + uint32_t key = compute_resource_key(programInterface, name_copy); + struct gl_program_resource *res = (struct gl_program_resource *) + _mesa_hash_table_u64_search(shProg->data->ProgramResourceHash, key); + + if (name_copy != name) + free(name_copy); + + if (res && array_index) + *array_index = index >= 0 ? index : 0; + + return res; +} + /* Find a program resource with specific name in given interface. */ struct gl_program_resource * @@ -534,9 +579,20 @@ GLenum programInterface, const char *name, unsigned *array_index) { - struct gl_program_resource *res = shProg->data->ProgramResourceList; - for (unsigned i = 0; i < shProg->data->NumProgramResourceList; - i++, res++) { + struct gl_program_resource *res = NULL; + + if (name == NULL) + return NULL; + + /* If we have a name, try the ProgramResourceHash first. */ + if (shProg->data->ProgramResourceHash) + res = search_resource_hash(shProg, programInterface, name, array_index); + + if (res) + return res; + + res = shProg->data->ProgramResourceList; + for (unsigned i = 0; i < shProg->data->NumProgramResourceList; i++, res++) { if (res->Type != programInterface) continue; @@ -1850,3 +1906,23 @@ } return true; } + +extern "C" void +_mesa_create_program_resource_hash(struct gl_shader_program *shProg) +{ + /* Rebuild resource hash. */ + if (shProg->data->ProgramResourceHash) + _mesa_hash_table_u64_destroy(shProg->data->ProgramResourceHash, NULL); + + shProg->data->ProgramResourceHash = _mesa_hash_table_u64_create(shProg); + + struct gl_program_resource *res = shProg->data->ProgramResourceList; + for (unsigned i = 0; i < shProg->data->NumProgramResourceList; i++, res++) { + const char *name = _mesa_program_resource_name(res); + if (name) { + uint32_t key = compute_resource_key(res->Type, name); + _mesa_hash_table_u64_insert(shProg->data->ProgramResourceHash, key, + res); + } + } +} diff -Nru mesa-19.2.8/src/mesa/main/shared.c mesa-20.0.8/src/mesa/main/shared.c --- mesa-19.2.8/src/mesa/main/shared.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/shared.c 2020-06-12 01:21:18.000000000 +0000 @@ -91,6 +91,10 @@ /* GL_ARB_bindless_texture */ _mesa_init_shared_handles(shared); + /* ARB_shading_language_include */ + _mesa_init_shader_includes(shared); + mtx_init(&shared->ShaderIncludeMutex, mtx_plain); + /* Allocate the default buffer object */ shared->NullBufferObj = ctx->Driver.NewBufferObject(ctx, 0); if (!shared->NullBufferObj) @@ -441,6 +445,10 @@ _mesa_free_shared_handles(shared); + /* ARB_shading_language_include */ + _mesa_destroy_shader_includes(shared); + mtx_destroy(&shared->ShaderIncludeMutex); + if (shared->MemoryObjects) { _mesa_HashDeleteAll(shared->MemoryObjects, delete_memory_object_cb, ctx); _mesa_DeleteHashTable(shared->MemoryObjects); diff -Nru mesa-19.2.8/src/mesa/main/spirv_extensions.c mesa-20.0.8/src/mesa/main/spirv_extensions.c --- mesa-19.2.8/src/mesa/main/spirv_extensions.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/spirv_extensions.c 2020-06-12 01:21:18.000000000 +0000 @@ -103,8 +103,11 @@ ext->supported[SPV_KHR_shader_draw_parameters] = cap->draw_parameters; ext->supported[SPV_KHR_multiview] = cap->multiview; + ext->supported[SPV_KHR_storage_buffer_storage_class] = true; ext->supported[SPV_KHR_variable_pointers] = cap->variable_pointers; ext->supported[SPV_AMD_gcn_shader] = cap->amd_gcn_shader; + ext->supported[SPV_KHR_shader_ballot] = cap->subgroup_ballot; + ext->supported[SPV_KHR_subgroup_vote] = cap->subgroup_vote; for (unsigned i = 0; i < SPV_EXTENSIONS_COUNT; i++) { if (ext->supported[i]) diff -Nru mesa-19.2.8/src/mesa/main/tests/dispatch_sanity.cpp mesa-20.0.8/src/mesa/main/tests/dispatch_sanity.cpp --- mesa-19.2.8/src/mesa/main/tests/dispatch_sanity.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/tests/dispatch_sanity.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -546,6 +546,7 @@ /* GL_ARB_instanced_arrays */ { "glVertexAttribDivisorARB", 31, -1 }, + { "glVertexArrayVertexAttribDivisorEXT", 31, -1 }, /* GL_NV_texture_barrier */ { "glTextureBarrierNV", 31, -1 }, @@ -558,6 +559,10 @@ { "glEGLImageTargetRenderbufferStorageOES", 31, -1 }, { "glEGLImageTargetTexture2DOES", 31, -1 }, + /* EXT_EGL_image_storage */ + { "glEGLImageTargetTexStorageEXT", 31, -1 }, + { "glEGLImageTargetTextureStorageEXT", 31, -1 }, + /* GL 3.2 */ { "glGetInteger64i_v", 32, -1 }, { "glGetBufferParameteri64v", 32, -1 }, @@ -616,6 +621,7 @@ { "glVertexAttribL4dv", 41, -1 }, { "glVertexAttribLPointer", 41, -1 }, { "glGetVertexAttribLdv", 41, -1 }, + { "glVertexArrayVertexAttribLOffsetEXT", 41, -1 }, /* GL 4.3 */ { "glIsRenderbuffer", 43, -1 }, @@ -673,12 +679,6 @@ { "glBlendFunciARB", 43, -1 }, { "glBlendFuncSeparateiARB", 43, -1 }, { "glMinSampleShadingARB", 43, -1 }, // XXX: Add to xml -// { "glNamedStringARB", 43, -1 }, // XXX: Add to xml -// { "glDeleteNamedStringARB", 43, -1 }, // XXX: Add to xml -// { "glCompileShaderIncludeARB", 43, -1 }, // XXX: Add to xml -// { "glIsNamedStringARB", 43, -1 }, // XXX: Add to xml -// { "glGetNamedStringARB", 43, -1 }, // XXX: Add to xml -// { "glGetNamedStringivARB", 43, -1 }, // XXX: Add to xml { "glBindFragDataLocationIndexed", 43, -1 }, { "glGetFragDataIndex", 43, -1 }, { "glGenSamplers", 43, -1 }, @@ -857,8 +857,8 @@ { "glTextureStorage3DEXT", 43, -1 }, { "glClearBufferData", 43, -1 }, { "glClearBufferSubData", 43, -1 }, -// { "glClearNamedBufferDataEXT", 43, -1 }, // XXX: Add to xml -// { "glClearNamedBufferSubDataEXT", 43, -1 }, // XXX: Add to xml + { "glClearNamedBufferDataEXT", 43, -1 }, + { "glClearNamedBufferSubDataEXT", 43, -1 }, { "glCopyImageSubData", 43, -1 }, { "glTextureView", 43, -1 }, { "glBindVertexBuffer", 43, -1 }, @@ -867,15 +867,16 @@ { "glVertexAttribLFormat", 43, -1 }, { "glVertexAttribBinding", 43, -1 }, { "glVertexBindingDivisor", 43, -1 }, -// { "glVertexArrayBindVertexBufferEXT", 43, -1 }, // XXX: Add to xml -// { "glVertexArrayVertexAttribFormatEXT", 43, -1 }, // XXX: Add to xml -// { "glVertexArrayVertexAttribIFormatEXT", 43, -1 }, // XXX: Add to xml -// { "glVertexArrayVertexAttribBindingEXT", 43, -1 }, // XXX: Add to xml -// { "glVertexArrayVertexBindingDivisorEXT", 43, -1 }, // XXX: Add to xml + { "glVertexArrayBindVertexBufferEXT", 43, -1 }, + { "glVertexArrayVertexAttribFormatEXT", 43, -1 }, + { "glVertexArrayVertexAttribIFormatEXT", 43, -1 }, + { "glVertexArrayVertexAttribLFormatEXT", 43, -1 }, + { "glVertexArrayVertexAttribBindingEXT", 43, -1 }, + { "glVertexArrayVertexBindingDivisorEXT", 43, -1 }, { "glFramebufferParameteri", 43, -1 }, { "glGetFramebufferParameteriv", 43, -1 }, -// { "glNamedFramebufferParameteriEXT", 43, -1 }, // XXX: Add to xml -// { "glGetNamedFramebufferParameterivEXT", 43, -1 }, // XXX: Add to xml + { "glNamedFramebufferParameteriEXT", 43, -1 }, + { "glGetNamedFramebufferParameterivEXT", 43, -1 }, // { "glGetInternalformati64v", 43, -1 }, // XXX: Add to xml { "glInvalidateTexSubImage", 43, -1 }, { "glInvalidateTexImage", 43, -1 }, @@ -892,11 +893,11 @@ { "glGetProgramResourceLocation", 43, -1 }, { "glGetProgramResourceLocationIndex", 43, -1 }, { "glShaderStorageBlockBinding", 43, -1 }, -// { "glTextureBufferRangeEXT", 43, -1 }, // XXX: Add to xml + { "glTextureBufferRangeEXT", 43, -1 }, { "glTexStorage2DMultisample", 43, -1 }, { "glTexStorage3DMultisample", 43, -1 }, -// { "glTextureStorage2DMultisampleEXT", 43, -1 }, // XXX: Add to xml -// { "glTextureStorage3DMultisampleEXT", 43, -1 }, // XXX: Add to xml + { "glTextureStorage2DMultisampleEXT", 43, -1 }, + { "glTextureStorage3DMultisampleEXT", 43, -1 }, { "glViewportArrayv", 43, -1 }, { "glViewportIndexedf", 43, -1 }, @@ -1031,8 +1032,8 @@ { "glMatrixPushEXT", 10, -1 }, { "glMatrixPopEXT", 10, -1 }, /* GL_EXT_direct_state_access - GL 1.1 */ - //{ "glClientAttribDefaultEXT", 11, -1 }, - //{ "glPushClientAttribDefaultEXT", 11, -1 }, + { "glClientAttribDefaultEXT", 11, -1 }, + { "glPushClientAttribDefaultEXT", 11, -1 }, { "glTextureParameteriEXT", 11, -1 }, { "glTextureParameterivEXT", 11, -1 }, { "glTextureParameterfEXT", 11, -1 }, @@ -1096,15 +1097,15 @@ { "glDisableClientStateIndexedEXT", 12, -1 }, { "glGetPointerIndexedvEXT", 12, -1 }, /* GL_EXT_direct_state_access - ARB_vertex_program */ - //{ "glNamedProgramStringEXT", 10, -1 }, - //{ "glNamedProgramLocalParameter4dEXT", 10, -1 }, - //{ "glNamedProgramLocalParameter4dvEXT", 10, -1 }, - //{ "glNamedProgramLocalParameter4fEXT", 10, -1 }, - //{ "glNamedProgramLocalParameter4fvEXT", 10, -1 }, - //{ "glGetNamedProgramLocalParameter4dvEXT", 10, -1 }, - //{ "glGetNamedProgramLocalParameter4fvEXT", 10, -1 }, - //{ "glGetNamedProgramivEXT", 10, -1 }, - //{ "glGetNamedProgramStringEXT", 10, -1 }, + { "glNamedProgramStringEXT", 10, -1 }, + { "glNamedProgramLocalParameter4dEXT", 10, -1 }, + { "glNamedProgramLocalParameter4dvEXT", 10, -1 }, + { "glNamedProgramLocalParameter4fEXT", 10, -1 }, + { "glNamedProgramLocalParameter4fvEXT", 10, -1 }, + { "glGetNamedProgramLocalParameterdvEXT", 10, -1 }, + { "glGetNamedProgramLocalParameterfvEXT", 10, -1 }, + { "glGetNamedProgramivEXT", 10, -1 }, + { "glGetNamedProgramStringEXT", 10, -1 }, /* GL_EXT_direct_state_access - GL 1.3 */ { "glCompressedTextureImage1DEXT", 13, -1 }, { "glCompressedTextureImage2DEXT", 13, -1 }, @@ -1133,93 +1134,61 @@ { "glGetNamedBufferPointervEXT", 15, -1 }, { "glGetNamedBufferSubDataEXT", 15, -1 }, /* GL_EXT_direct_state_access - GL 2.0 */ - //{ "glProgramUniform1iEXT", 20, -1 }, - //{ "glProgramUniform1ivEXT", 20, -1 }, - //{ "glProgramUniform1fEXT", 20, -1 }, - //{ "glProgramUniform1fvEXT", 20, -1 }, - //{ "glProgramUniform2iEXT", 20, -1 }, - //{ "glProgramUniform2ivEXT", 20, -1 }, - //{ "glProgramUniform2fEXT", 20, -1 }, - //{ "glProgramUniform2fvEXT", 20, -1 }, - //{ "glProgramUniform3iEXT", 20, -1 }, - //{ "glProgramUniform3ivEXT", 20, -1 }, - //{ "glProgramUniform3fEXT", 20, -1 }, - //{ "glProgramUniform3fvEXT", 20, -1 }, - //{ "glProgramUniform4iEXT", 20, -1 }, - //{ "glProgramUniform4ivEXT", 20, -1 }, - //{ "glProgramUniform4fEXT", 20, -1 }, - //{ "glProgramUniform4fvEXT", 20, -1 }, - //{ "glProgramUniformMatrix2fvEXT", 20, -1 }, - //{ "glProgramUniformMatrix3fvEXT", 20, -1 }, - //{ "glProgramUniformMatrix4fvEXT", 20, -1 }, + /* Added glProgramUniform*EXT functions are aliases */ /* GL_EXT_direct_state_access - GL 2.1 */ - //{ "glProgramUniformMatrix2x3fvEXT", 21, -1 }, - //{ "glProgramUniformMatrix3x2fvEXT", 21, -1 }, - //{ "glProgramUniformMatrix2x4fvEXT", 21, -1 }, - //{ "glProgramUniformMatrix4x2fvEXT", 21, -1 }, - //{ "glProgramUniformMatrix3x4fvEXT", 21, -1 }, - //{ "glProgramUniformMatrix4x3fvEXT", 21, -1 }, + /* Added glProgramUniformMAtrix*EXT functions are aliases */ /* GL_EXT_direct_state_access - EXT_texture_buffer_object */ - //{ "glTextureBufferEXT", 10, -1 }, - //{ "glMultiTexBufferEXT", 10, -1 }, + { "glTextureBufferEXT", 10, -1 }, + { "glMultiTexBufferEXT", 10, -1 }, /* GL_EXT_direct_state_access - EXT_texture_integer */ - //{ "glTextureParameterIivEXT", 10, -1 }, - //{ "glTextureParameterIuivEXT", 10, -1 }, - //{ "glGetTextureParameterIivEXT", 10, -1 }, - //{ "glGetTextureParameterIuivEXT", 10, -1 }, - //{ "glMultiTexParameterIivEXT", 10, -1 }, - //{ "glMultiTexParameterIuivEXT", 10, -1 }, - //{ "glGetMultiTexParameterIivEXT", 10, -1 }, - //{ "glGetMultiTexParameterIuivEXT", 10, -1 }, + { "glTextureParameterIivEXT", 10, -1 }, + { "glTextureParameterIuivEXT", 10, -1 }, + { "glGetTextureParameterIivEXT", 10, -1 }, + { "glGetTextureParameterIuivEXT", 10, -1 }, + { "glMultiTexParameterIivEXT", 10, -1 }, + { "glMultiTexParameterIuivEXT", 10, -1 }, + { "glGetMultiTexParameterIivEXT", 10, -1 }, + { "glGetMultiTexParameterIuivEXT", 10, -1 }, /* GL_EXT_direct_state_access - EXT_gpu_shader4 */ - //{ "glProgramUniform1uiEXT", 10, -1 }, - //{ "glProgramUniform1uivEXT", 10, -1 }, - //{ "glProgramUniform2uiEXT", 10, -1 }, - //{ "glProgramUniform2uivEXT", 10, -1 }, - //{ "glProgramUniform3uiEXT", 10, -1 }, - //{ "glProgramUniform3uivEXT", 10, -1 }, - //{ "glProgramUniform4uiEXT", 10, -1 }, - //{ "glProgramUniform4uivEXT", 10, -1 }, + /* Added glProgramUniform*u*EXT functions are aliases */ /* GL_EXT_direct_state_access - EXT_gpu_program_parameters */ - //{ "glNamedProgramLocalParameters4fvEXT", 10, -1 }, + { "glNamedProgramLocalParameters4fvEXT", 10, -1 }, /* GL_EXT_direct_state_access - GL 3.0 */ - //{ "glGetFloati_vEXT", 30, -1 }, - //{ "glGetDoublei_vEXT", 30, -1 }, - //{ "glNamedRenderbufferStorageEXT", 30, -1 }, - //{ "glGetNamedRenderbufferParameterivEXT", 30, -1 }, - //{ "glNamedRenderbufferStorageMultisampleEXT", 30, -1 }, + { "glNamedRenderbufferStorageEXT", 30, -1 }, + { "glGetNamedRenderbufferParameterivEXT", 30, -1 }, + { "glNamedRenderbufferStorageMultisampleEXT", 30, -1 }, { "glCheckNamedFramebufferStatusEXT", 30, -1 }, { "glNamedFramebufferTexture1DEXT", 30, -1 }, { "glNamedFramebufferTexture2DEXT", 30, -1 }, { "glNamedFramebufferTexture3DEXT", 30, -1 }, { "glNamedFramebufferRenderbufferEXT", 30, -1 }, { "glGetNamedFramebufferAttachmentParameterivEXT", 30, -1 }, - //{ "glGenerateTextureMipmapEXT", 30, -1 }, - //{ "glGenerateMultiTexMipmapEXT", 30, -1 }, + { "glGenerateTextureMipmapEXT", 30, -1 }, + { "glGenerateMultiTexMipmapEXT", 30, -1 }, { "glFramebufferDrawBufferEXT", 30, -1 }, { "glFramebufferDrawBuffersEXT", 30, -1 }, { "glFramebufferReadBufferEXT", 30, -1 }, { "glGetFramebufferParameterivEXT", 30, -1 }, - //{ "glNamedCopyBufferSubDataEXT", 30, -1 }, - //{ "glVertexArrayVertexOffsetEXT", 30, -1 }, - //{ "glVertexArrayColorOffsetEXT", 30, -1 }, - //{ "glVertexArrayEdgeFlagOffsetEXT", 30, -1 }, - //{ "glVertexArrayIndexOffsetEXT", 30, -1 }, - //{ "glVertexArrayNormalOffsetEXT", 30, -1 }, - //{ "glVertexArrayTexCoordOffsetEXT", 30, -1 }, - //{ "glVertexArrayMultiTexCoordOffsetEXT", 30, -1 }, - //{ "glVertexArrayFogCoordOffsetEXT", 30, -1 }, - //{ "glVertexArraySecondColorOffsetEXT", 30, -1 }, - //{ "glVertexArrayVertexAttribOffsetEXT", 30, -1 }, - //{ "glVertexArrayVertexAttribIOffsetEXT", 30, -1 }, - //{ "glEnableVertexArrayEXT", 30, -1 }, - //{ "glDisableVertexArrayEXT", 30, -1 }, - //{ "glEnableVertexArrayAttribEXT", 30, -1 }, - //{ "glDisableVertexArrayAttribEXT", 30, -1 }, - //{ "glGetVertexArrayIntegervEXT", 30, -1 }, - //{ "glGetVertexArrayPointervEXT", 30, -1 }, - //{ "glGetVertexArrayIntegeri_vEXT", 30, -1 }, - //{ "glGetVertexArrayPointeri_vEXT", 30, -1 }, + { "glNamedCopyBufferSubDataEXT", 30, -1 }, + { "glVertexArrayVertexOffsetEXT", 30, -1 }, + { "glVertexArrayColorOffsetEXT", 30, -1 }, + { "glVertexArrayEdgeFlagOffsetEXT", 30, -1 }, + { "glVertexArrayIndexOffsetEXT", 30, -1 }, + { "glVertexArrayNormalOffsetEXT", 30, -1 }, + { "glVertexArrayTexCoordOffsetEXT", 30, -1 }, + { "glVertexArrayMultiTexCoordOffsetEXT", 30, -1 }, + { "glVertexArrayFogCoordOffsetEXT", 30, -1 }, + { "glVertexArraySecondaryColorOffsetEXT", 30, -1 }, + { "glVertexArrayVertexAttribOffsetEXT", 30, -1 }, + { "glVertexArrayVertexAttribIOffsetEXT", 30, -1 }, + { "glEnableVertexArrayEXT", 30, -1 }, + { "glDisableVertexArrayEXT", 30, -1 }, + { "glEnableVertexArrayAttribEXT", 30, -1 }, + { "glDisableVertexArrayAttribEXT", 30, -1 }, + { "glGetVertexArrayIntegervEXT", 30, -1 }, + { "glGetVertexArrayPointervEXT", 30, -1 }, + { "glGetVertexArrayIntegeri_vEXT", 30, -1 }, + { "glGetVertexArrayPointeri_vEXT", 30, -1 }, { "glMapNamedBufferRangeEXT", 30, -1 }, { "glFlushMappedNamedBufferRangeEXT", 30, -1 }, @@ -1237,6 +1206,14 @@ { "glBindImageTextures", 44, -1 }, { "glBindVertexBuffers", 44, -1 }, + /* GL_ARB_shading_language_include */ + { "glNamedStringARB", 20, -1 }, + { "glDeleteNamedStringARB", 20, -1 }, + { "glCompileShaderIncludeARB", 20, -1 }, + { "glIsNamedStringARB", 20, -1 }, + { "glGetNamedStringARB", 20, -1 }, + { "glGetNamedStringivARB", 20, -1 }, + /* GL_KHR_debug/GL_ARB_debug_output */ { "glPushDebugGroup", 11, -1 }, { "glPopDebugGroup", 11, -1 }, @@ -1328,6 +1305,7 @@ /* GL_ARB_sparse_buffer */ { "glBufferPageCommitmentARB", 43, -1 }, { "glNamedBufferPageCommitmentARB", 43, -1 }, + { "glNamedBufferPageCommitmentEXT", 43, -1 }, /* GL_ARB_bindless_texture */ { "glGetTextureHandleARB", 40, -1 }, @@ -1412,6 +1390,48 @@ /* GL_EXT_shader_image_load_store */ { "glBindImageTextureEXT", 30, -1 }, + /* GL_MESA_framebuffer_flip_y */ + { "glFramebufferParameteriMESA", 43, -1 }, + { "glGetFramebufferParameterivMESA", 43, -1 }, + + /* GL_ARB_gpu_shader_int64 */ + { "glUniform1i64ARB", 40, -1 }, + { "glUniform2i64ARB", 40, -1 }, + { "glUniform3i64ARB", 40, -1 }, + { "glUniform4i64ARB", 40, -1 }, + { "glUniform1ui64ARB", 40, -1 }, + { "glUniform2ui64ARB", 40, -1 }, + { "glUniform3ui64ARB", 40, -1 }, + { "glUniform4ui64ARB", 40, -1 }, + { "glUniform1i64vARB", 40, -1 }, + { "glUniform2i64vARB", 40, -1 }, + { "glUniform3i64vARB", 40, -1 }, + { "glUniform4i64vARB", 40, -1 }, + { "glUniform1ui64vARB", 40, -1 }, + { "glUniform2ui64vARB", 40, -1 }, + { "glUniform3ui64vARB", 40, -1 }, + { "glUniform4ui64vARB", 40, -1 }, + { "glGetUniformi64vARB", 40, -1 }, + { "glGetUniformui64vARB", 40, -1 }, + { "glGetnUniformi64vARB", 40, -1 }, + { "glGetnUniformui64vARB", 40, -1 }, + { "glProgramUniform1i64ARB", 40, -1 }, + { "glProgramUniform2i64ARB", 40, -1 }, + { "glProgramUniform3i64ARB", 40, -1 }, + { "glProgramUniform4i64ARB", 40, -1 }, + { "glProgramUniform1ui64ARB", 40, -1 }, + { "glProgramUniform2ui64ARB", 40, -1 }, + { "glProgramUniform3ui64ARB", 40, -1 }, + { "glProgramUniform4ui64ARB", 40, -1 }, + { "glProgramUniform1i64vARB", 40, -1 }, + { "glProgramUniform2i64vARB", 40, -1 }, + { "glProgramUniform3i64vARB", 40, -1 }, + { "glProgramUniform4i64vARB", 40, -1 }, + { "glProgramUniform1ui64vARB", 40, -1 }, + { "glProgramUniform2ui64vARB", 40, -1 }, + { "glProgramUniform3ui64vARB", 40, -1 }, + { "glProgramUniform4ui64vARB", 40, -1 }, + { NULL, 0, -1 } }; @@ -1898,44 +1918,6 @@ /* GL_ARB_ES3_2_compatibility */ { "glPrimitiveBoundingBoxARB", 45, -1 }, - /* GL_ARB_gpu_shader_int64 */ - { "glUniform1i64ARB", 45, -1 }, - { "glUniform2i64ARB", 45, -1 }, - { "glUniform3i64ARB", 45, -1 }, - { "glUniform4i64ARB", 45, -1 }, - { "glUniform1ui64ARB", 45, -1 }, - { "glUniform2ui64ARB", 45, -1 }, - { "glUniform3ui64ARB", 45, -1 }, - { "glUniform4ui64ARB", 45, -1 }, - { "glUniform1i64vARB", 45, -1 }, - { "glUniform2i64vARB", 45, -1 }, - { "glUniform3i64vARB", 45, -1 }, - { "glUniform4i64vARB", 45, -1 }, - { "glUniform1ui64vARB", 45, -1 }, - { "glUniform2ui64vARB", 45, -1 }, - { "glUniform3ui64vARB", 45, -1 }, - { "glUniform4ui64vARB", 45, -1 }, - { "glGetUniformi64vARB", 45, -1 }, - { "glGetUniformui64vARB", 45, -1 }, - { "glGetnUniformi64vARB", 45, -1 }, - { "glGetnUniformui64vARB", 45, -1 }, - { "glProgramUniform1i64ARB", 45, -1 }, - { "glProgramUniform2i64ARB", 45, -1 }, - { "glProgramUniform3i64ARB", 45, -1 }, - { "glProgramUniform4i64ARB", 45, -1 }, - { "glProgramUniform1ui64ARB", 45, -1 }, - { "glProgramUniform2ui64ARB", 45, -1 }, - { "glProgramUniform3ui64ARB", 45, -1 }, - { "glProgramUniform4ui64ARB", 45, -1 }, - { "glProgramUniform1i64vARB", 45, -1 }, - { "glProgramUniform2i64vARB", 45, -1 }, - { "glProgramUniform3i64vARB", 45, -1 }, - { "glProgramUniform4i64vARB", 45, -1 }, - { "glProgramUniform1ui64vARB", 45, -1 }, - { "glProgramUniform2ui64vARB", 45, -1 }, - { "glProgramUniform3ui64vARB", 45, -1 }, - { "glProgramUniform4ui64vARB", 45, -1 }, - /* GL_ARB_gl_spirv */ { "glSpecializeShaderARB", 45, -1 }, @@ -2440,6 +2422,7 @@ { "glGetQueryObjecti64vEXT", 20, -1 }, { "glGetQueryObjectui64vEXT", 20, -1 }, { "glQueryCounterEXT", 20, -1 }, + { "glGetInteger64vEXT", 20, -1 }, /* GL_EXT_shader_framebuffer_fetch_non_coherent */ { "glFramebufferFetchBarrierEXT", 20, -1 }, @@ -2523,7 +2506,8 @@ // { "glGetBufferPointerv", 30, -1 }, { "glGetFragDataLocation", 30, -1 }, { "glGetInteger64i_v", 30, -1 }, - { "glGetInteger64v", 30, -1 }, + // We check for the aliased -EXT version in GLES 2 + //{ "glGetInteger64v", 30, -1 }, { "glGetIntegeri_v", 30, -1 }, { "glGetInternalformativ", 30, -1 }, { "glGetInternalformati64v", 30, -1 }, @@ -2653,6 +2637,13 @@ { "glRenderbufferStorageMultisampleAdvancedAMD", 11, -1 }, { "glNamedRenderbufferStorageMultisampleAdvancedAMD", 11, -1 }, + /* GL_MESA_framebuffer_flip_y */ + { "glFramebufferParameteriMESA", 30, -1 }, + { "glGetFramebufferParameterivMESA", 30, -1 }, + + /* EXT_EGL_image_storage */ + { "glEGLImageTargetTexStorageEXT", 30, -1 }, + { NULL, 0, -1 } }; diff -Nru mesa-19.2.8/src/mesa/main/tests/mesa_formats.cpp mesa-20.0.8/src/mesa/main/tests/mesa_formats.cpp --- mesa-19.2.8/src/mesa/main/tests/mesa_formats.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/tests/mesa_formats.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -45,23 +45,19 @@ mesa_format f = (mesa_format) fi; SCOPED_TRACE(_mesa_get_format_name(f)); + if (!_mesa_get_format_name(f)) + continue; + /* This function will emit a problem/warning if the format is * not handled. */ if (!_mesa_is_format_compressed(f)) { GLenum datatype = 0; - GLenum error = 0; GLuint comps = 0; /* If the datatype is zero, the format was not handled */ _mesa_uncompressed_format_to_type_and_comps(f, &datatype, &comps); EXPECT_NE(datatype, (GLenum)0); - - /* If the error isn't NO_ERROR, the format was not handled. - * Use an arbitrary GLenum format. */ - _mesa_format_matches_format_and_type(f, GL_RG, datatype, - GL_FALSE, &error); - EXPECT_EQ((GLenum)GL_NO_ERROR, error); } } @@ -75,6 +71,9 @@ for (int fi = 0; fi < MESA_FORMAT_COUNT; ++fi) { mesa_format f = (mesa_format) fi; SCOPED_TRACE(_mesa_get_format_name(f)); + if (!_mesa_get_format_name(f)) + continue; + GLenum datatype = _mesa_get_format_datatype(f); GLint r = _mesa_get_format_bits(f, GL_RED_BITS); GLint g = _mesa_get_format_bits(f, GL_GREEN_BITS); @@ -137,3 +136,51 @@ } } + +TEST(MesaFormatsTest, IntensityToRed) +{ + EXPECT_EQ(_mesa_get_intensity_format_red(MESA_FORMAT_I_UNORM8), + MESA_FORMAT_R_UNORM8); + EXPECT_EQ(_mesa_get_intensity_format_red(MESA_FORMAT_I_SINT32), + MESA_FORMAT_R_SINT32); + EXPECT_EQ(_mesa_get_intensity_format_red(MESA_FORMAT_R8G8B8A8_UNORM), + MESA_FORMAT_R8G8B8A8_UNORM); +} + +static mesa_format fffat_wrap(GLenum format, GLenum type) +{ + uint32_t f = _mesa_format_from_format_and_type(format, type); + if (_mesa_format_is_mesa_array_format(f)) + f = _mesa_format_from_array_format((mesa_array_format)f); + return (mesa_format)f; +} + +TEST(MesaFormatsTest, FormatFromFormatAndType) +{ + EXPECT_EQ(fffat_wrap(GL_RGBA, GL_SHORT), + MESA_FORMAT_RGBA_SNORM16); + EXPECT_EQ(fffat_wrap(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT), + MESA_FORMAT_Z_UNORM16); + EXPECT_EQ(fffat_wrap(GL_STENCIL_INDEX, GL_UNSIGNED_BYTE), + MESA_FORMAT_S_UINT8); + + /* Should return an array format, but not a proper MESA_FORMAT. */ + EXPECT_TRUE(_mesa_format_is_mesa_array_format(_mesa_format_from_format_and_type(GL_DEPTH_COMPONENT, + GL_BYTE))); +} + +TEST(MesaFormatsTest, FormatMatchesFormatAndType) +{ + EXPECT_TRUE(_mesa_format_matches_format_and_type(MESA_FORMAT_RGBA_UNORM16, + GL_RGBA, + GL_UNSIGNED_SHORT, false, + NULL)); + EXPECT_TRUE(_mesa_format_matches_format_and_type(MESA_FORMAT_S_UINT8, + GL_STENCIL_INDEX, + GL_UNSIGNED_BYTE, false, + NULL)); + EXPECT_TRUE(_mesa_format_matches_format_and_type(MESA_FORMAT_Z_UNORM16, + GL_DEPTH_COMPONENT, + GL_UNSIGNED_SHORT, false, + NULL)); +} diff -Nru mesa-19.2.8/src/mesa/main/texcompress_bptc.c mesa-20.0.8/src/mesa/main/texcompress_bptc.c --- mesa-19.2.8/src/mesa/main/texcompress_bptc.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_bptc.c 2020-06-12 01:21:18.000000000 +0000 @@ -142,8 +142,11 @@ tempImageSlices[0] = (GLubyte *) tempImage; _mesa_texstore(ctx, dims, baseInternalFormat, - _mesa_little_endian() ? MESA_FORMAT_R8G8B8A8_UNORM - : MESA_FORMAT_A8B8G8R8_UNORM, +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_R8G8B8A8_UNORM, +#else + MESA_FORMAT_A8B8G8R8_UNORM, +#endif rgbaRowStride, tempImageSlices, srcWidth, srcHeight, srcDepth, srcFormat, srcType, srcAddr, diff -Nru mesa-19.2.8/src/mesa/main/texcompress_bptc_tmp.h mesa-20.0.8/src/mesa/main/texcompress_bptc_tmp.h --- mesa-19.2.8/src/mesa/main/texcompress_bptc_tmp.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_bptc_tmp.h 2020-06-12 01:21:18.000000000 +0000 @@ -826,11 +826,10 @@ sign_extend(int32_t value, int n_bits) { - if ((value & (1 << (n_bits - 1)))) { - value |= (~(int32_t) 0) << n_bits; - } + assert(n_bits > 0 && n_bits < 32); - return value; + const unsigned n = 32 - n_bits; + return (int32_t)((uint32_t)value << n) >> n; } static int diff -Nru mesa-19.2.8/src/mesa/main/texcompress_etc.c mesa-20.0.8/src/mesa/main/texcompress_etc.c --- mesa-19.2.8/src/mesa/main/texcompress_etc.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_etc.c 2020-06-12 01:21:18.000000000 +0000 @@ -548,6 +548,8 @@ if (punchthrough_alpha) dst[3] = 255; } + else + unreachable("unhandled block mode"); } static void diff -Nru mesa-19.2.8/src/mesa/main/texcompress_fxt1.c mesa-20.0.8/src/mesa/main/texcompress_fxt1.c --- mesa-19.2.8/src/mesa/main/texcompress_fxt1.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_fxt1.c 2020-06-12 01:21:18.000000000 +0000 @@ -132,8 +132,11 @@ tempImageSlices[0] = (GLubyte *) tempImage; _mesa_texstore(ctx, dims, baseInternalFormat, - _mesa_little_endian() ? MESA_FORMAT_R8G8B8A8_UNORM - : MESA_FORMAT_A8B8G8R8_UNORM, +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_R8G8B8A8_UNORM, +#else + MESA_FORMAT_A8B8G8R8_UNORM, +#endif rgbaRowStride, tempImageSlices, srcWidth, srcHeight, srcDepth, srcFormat, srcType, srcAddr, diff -Nru mesa-19.2.8/src/mesa/main/texcompress_rgtc.c mesa-20.0.8/src/mesa/main/texcompress_rgtc.c --- mesa-19.2.8/src/mesa/main/texcompress_rgtc.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_rgtc.c 2020-06-12 01:21:18.000000000 +0000 @@ -197,11 +197,9 @@ dstFormat == MESA_FORMAT_LA_LATC2_UNORM); if (baseInternalFormat == GL_RG) - tempFormat = _mesa_little_endian() ? MESA_FORMAT_R8G8_UNORM - : MESA_FORMAT_G8R8_UNORM; + tempFormat = MESA_FORMAT_RG_UNORM8; else - tempFormat = _mesa_little_endian() ? MESA_FORMAT_L8A8_UNORM - : MESA_FORMAT_A8L8_UNORM; + tempFormat = MESA_FORMAT_LA_UNORM8; rgRowStride = 2 * srcWidth * sizeof(GLubyte); tempImage = malloc(srcWidth * srcHeight * 2 * sizeof(GLubyte)); diff -Nru mesa-19.2.8/src/mesa/main/texcompress_s3tc.c mesa-20.0.8/src/mesa/main/texcompress_s3tc.c --- mesa-19.2.8/src/mesa/main/texcompress_s3tc.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_s3tc.c 2020-06-12 01:21:18.000000000 +0000 @@ -121,8 +121,11 @@ tempImageSlices[0] = (GLubyte *) tempImage; _mesa_texstore(ctx, dims, baseInternalFormat, - _mesa_little_endian() ? MESA_FORMAT_R8G8B8A8_UNORM - : MESA_FORMAT_A8B8G8R8_UNORM, +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_R8G8B8A8_UNORM, +#else + MESA_FORMAT_A8B8G8R8_UNORM, +#endif rgbaRowStride, tempImageSlices, srcWidth, srcHeight, srcDepth, srcFormat, srcType, srcAddr, @@ -174,8 +177,11 @@ tempImageSlices[0] = (GLubyte *) tempImage; _mesa_texstore(ctx, dims, baseInternalFormat, - _mesa_little_endian() ? MESA_FORMAT_R8G8B8A8_UNORM - : MESA_FORMAT_A8B8G8R8_UNORM, +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_R8G8B8A8_UNORM, +#else + MESA_FORMAT_A8B8G8R8_UNORM, +#endif rgbaRowStride, tempImageSlices, srcWidth, srcHeight, srcDepth, srcFormat, srcType, srcAddr, @@ -226,8 +232,11 @@ tempImageSlices[0] = (GLubyte *) tempImage; _mesa_texstore(ctx, dims, baseInternalFormat, - _mesa_little_endian() ? MESA_FORMAT_R8G8B8A8_UNORM - : MESA_FORMAT_A8B8G8R8_UNORM, +#if UTIL_ARCH_LITTLE_ENDIAN + MESA_FORMAT_R8G8B8A8_UNORM, +#else + MESA_FORMAT_A8B8G8R8_UNORM, +#endif rgbaRowStride, tempImageSlices, srcWidth, srcHeight, srcDepth, srcFormat, srcType, srcAddr, diff -Nru mesa-19.2.8/src/mesa/main/texcompress_s3tc_tmp.h mesa-20.0.8/src/mesa/main/texcompress_s3tc_tmp.h --- mesa-19.2.8/src/mesa/main/texcompress_s3tc_tmp.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texcompress_s3tc_tmp.h 2020-06-12 01:21:18.000000000 +0000 @@ -59,7 +59,7 @@ const GLushort color0 = img_block_src[0] | (img_block_src[1] << 8); const GLushort color1 = img_block_src[2] | (img_block_src[3] << 8); const GLuint bits = img_block_src[4] | (img_block_src[5] << 8) | - (img_block_src[6] << 16) | (img_block_src[7] << 24); + (img_block_src[6] << 16) | ((GLuint)img_block_src[7] << 24); /* What about big/little endian? */ GLubyte bit_pos = 2 * (j * 4 + i) ; GLubyte code = (GLubyte) ((bits >> bit_pos) & 3); @@ -430,7 +430,7 @@ } } testerror += pixerrorbest; - bits |= enc << (2 * (j * 4 + i)); + bits |= (uint32_t)enc << (2 * (j * 4 + i)); } } /* some hw might disagree but actually decoding should always use 4-color encoding @@ -470,7 +470,7 @@ } } testerror2 += pixerrorbest; - bits2 |= enc << (2 * (j * 4 + i)); + bits2 |= (uint32_t)enc << (2 * (j * 4 + i)); } } } else { diff -Nru mesa-19.2.8/src/mesa/main/texenv.c mesa-20.0.8/src/mesa/main/texenv.c --- mesa-19.2.8/src/mesa/main/texenv.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texenv.c 2020-06-12 01:21:18.000000000 +0000 @@ -491,16 +491,17 @@ if (iparam0 == GL_TRUE) { if (ctx->Point.CoordReplace & (1u << texunit)) return; + FLUSH_VERTICES(ctx, _NEW_POINT); ctx->Point.CoordReplace |= (1u << texunit); } else if (iparam0 == GL_FALSE) { if (~(ctx->Point.CoordReplace) & (1u << texunit)) return; + FLUSH_VERTICES(ctx, _NEW_POINT); ctx->Point.CoordReplace &= ~(1u << texunit); } else { _mesa_error( ctx, GL_INVALID_VALUE, "glTexEnv(param=0x%x)", iparam0); return; } - FLUSH_VERTICES(ctx, _NEW_POINT); } else { _mesa_error( ctx, GL_INVALID_ENUM, "glTexEnv(pname=0x%x)", pname ); diff -Nru mesa-19.2.8/src/mesa/main/texformat.c mesa-20.0.8/src/mesa/main/texformat.c --- mesa-19.2.8/src/mesa/main/texformat.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texformat.c 2020-06-12 01:21:18.000000000 +0000 @@ -162,14 +162,14 @@ case GL_ALPHA4: case GL_ALPHA8: RETURN_IF_SUPPORTED(MESA_FORMAT_A_UNORM8); - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case GL_ALPHA12: case GL_ALPHA16: RETURN_IF_SUPPORTED(MESA_FORMAT_A_UNORM16); RETURN_IF_SUPPORTED(MESA_FORMAT_A_UNORM8); - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; /* Luminance formats */ @@ -189,21 +189,21 @@ /* Luminance/Alpha formats */ case GL_LUMINANCE4_ALPHA4: RETURN_IF_SUPPORTED(MESA_FORMAT_L4A4_UNORM); - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case 2: case GL_LUMINANCE_ALPHA: case GL_LUMINANCE6_ALPHA2: case GL_LUMINANCE8_ALPHA8: - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case GL_LUMINANCE12_ALPHA4: case GL_LUMINANCE12_ALPHA12: case GL_LUMINANCE16_ALPHA16: - RETURN_IF_SUPPORTED(MESA_FORMAT_L16A16_UNORM); - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM16); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case GL_INTENSITY: @@ -233,13 +233,13 @@ case GL_COMPRESSED_ALPHA_ARB: RETURN_IF_SUPPORTED(MESA_FORMAT_A_UNORM8); - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case GL_COMPRESSED_LUMINANCE_ARB: RETURN_IF_SUPPORTED(MESA_FORMAT_L_UNORM8); break; case GL_COMPRESSED_LUMINANCE_ALPHA_ARB: - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_UNORM8); break; case GL_COMPRESSED_INTENSITY_ARB: RETURN_IF_SUPPORTED(MESA_FORMAT_I_UNORM8); @@ -379,7 +379,7 @@ break; case GL_RG_SNORM: case GL_RG8_SNORM: - RETURN_IF_SUPPORTED(MESA_FORMAT_R8G8_SNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_RG_SNORM8); break; case GL_RGB_SNORM: case GL_RGB8_SNORM: @@ -409,8 +409,7 @@ break; case GL_LUMINANCE_ALPHA_SNORM: case GL_LUMINANCE8_ALPHA8_SNORM: - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_SNORM); - RETURN_IF_SUPPORTED(MESA_FORMAT_A8L8_SNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_SNORM8); RETURN_IF_SUPPORTED(MESA_FORMAT_A8B8G8R8_SNORM); RETURN_IF_SUPPORTED(MESA_FORMAT_R8G8B8A8_SNORM); break; @@ -424,7 +423,7 @@ RETURN_IF_SUPPORTED(MESA_FORMAT_R_SNORM16); break; case GL_RG16_SNORM: - RETURN_IF_SUPPORTED(MESA_FORMAT_R16G16_SNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_RG_SNORM16); break; case GL_RGB16_SNORM: RETURN_IF_SUPPORTED(MESA_FORMAT_RGB_SNORM16); @@ -488,8 +487,7 @@ break; case GL_SLUMINANCE_ALPHA_EXT: case GL_SLUMINANCE8_ALPHA8_EXT: - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_SRGB); - RETURN_IF_SUPPORTED(MESA_FORMAT_A8L8_SRGB); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_SRGB8); RETURN_IF_SUPPORTED(MESA_FORMAT_B8G8R8A8_SRGB); RETURN_IF_SUPPORTED(MESA_FORMAT_A8R8G8B8_SRGB); break; @@ -499,8 +497,7 @@ RETURN_IF_SUPPORTED(MESA_FORMAT_A8R8G8B8_SRGB); break; case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT: - RETURN_IF_SUPPORTED(MESA_FORMAT_L8A8_SRGB); - RETURN_IF_SUPPORTED(MESA_FORMAT_A8L8_SRGB); + RETURN_IF_SUPPORTED(MESA_FORMAT_LA_SRGB8); RETURN_IF_SUPPORTED(MESA_FORMAT_B8G8R8A8_SRGB); RETURN_IF_SUPPORTED(MESA_FORMAT_A8R8G8B8_SRGB); break; @@ -680,17 +677,17 @@ case GL_RG: case GL_RG8: - RETURN_IF_SUPPORTED(MESA_FORMAT_R8G8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_RG_UNORM8); break; case GL_COMPRESSED_RG: if (target != GL_TEXTURE_1D && target != GL_TEXTURE_1D_ARRAY) RETURN_IF_SUPPORTED(MESA_FORMAT_RG_RGTC2_UNORM); - RETURN_IF_SUPPORTED(MESA_FORMAT_R8G8_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_RG_UNORM8); break; case GL_RG16: - RETURN_IF_SUPPORTED(MESA_FORMAT_R16G16_UNORM); + RETURN_IF_SUPPORTED(MESA_FORMAT_RG_UNORM16); break; case GL_R16F: diff -Nru mesa-19.2.8/src/mesa/main/texgetimage.c mesa-20.0.8/src/mesa/main/texgetimage.c --- mesa-19.2.8/src/mesa/main/texgetimage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texgetimage.c 2020-06-12 01:21:18.000000000 +0000 @@ -1969,7 +1969,7 @@ } -void APIENTRY +void GLAPIENTRY _mesa_GetCompressedTextureSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, diff -Nru mesa-19.2.8/src/mesa/main/teximage.c mesa-20.0.8/src/mesa/main/teximage.c --- mesa-19.2.8/src/mesa/main/teximage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/teximage.c 2020-06-12 01:21:18.000000000 +0000 @@ -2205,6 +2205,15 @@ return GL_TRUE; } + if (!texture_formats_agree(texImage->InternalFormat, format)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(incompatible internalFormat = %s, format = %s)", + callerName, + _mesa_enum_to_string(texImage->InternalFormat), + _mesa_enum_to_string(format)); + return GL_TRUE; + } + GLenum internalFormat = _mesa_is_gles(ctx) ? oes_float_internal_format(ctx, texImage->InternalFormat, type) : texImage->InternalFormat; @@ -3373,23 +3382,28 @@ width, height, depth, border, format, type, 0, pixels); } - -void GLAPIENTRY -_mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image) +/* + * Helper used by __mesa_EGLImageTargetTexture2DOES and + * _mesa_EGLImageTargetTexStorageEXT. + */ +static void +egl_image_target_texture(struct gl_context *ctx, + struct gl_texture_object *texObj, GLenum target, + GLeglImageOES image, bool tex_storage, + const char *caller) { - struct gl_texture_object *texObj; struct gl_texture_image *texImage; bool valid_target; - GET_CURRENT_CONTEXT(ctx); FLUSH_VERTICES(ctx, 0); switch (target) { case GL_TEXTURE_2D: - valid_target = ctx->Extensions.OES_EGL_image; + valid_target = _mesa_has_OES_EGL_image(ctx) || + (tex_storage && _mesa_has_EXT_EGL_image_storage(ctx)); break; case GL_TEXTURE_EXTERNAL_OES: valid_target = - _mesa_is_gles(ctx) ? ctx->Extensions.OES_EGL_image_external : false; + _mesa_is_gles(ctx) ? _mesa_has_OES_EGL_image_external(ctx) : false; break; default: valid_target = false; @@ -3397,47 +3411,141 @@ } if (!valid_target) { - _mesa_error(ctx, GL_INVALID_ENUM, - "glEGLImageTargetTexture2D(target=%d)", target); + _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%d)", caller, target); return; } if (!image) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glEGLImageTargetTexture2D(image=%p)", image); + _mesa_error(ctx, GL_INVALID_VALUE, "%s(image=%p)", caller, image); return; } if (ctx->NewState & _NEW_PIXEL) _mesa_update_state(ctx); - texObj = _mesa_get_current_tex_object(ctx, target); - if (!texObj) - return; - _mesa_lock_texture(ctx, texObj); if (texObj->Immutable) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glEGLImageTargetTexture2D(texture is immutable)"); + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(texture is immutable)", caller); _mesa_unlock_texture(ctx, texObj); return; } texImage = _mesa_get_tex_image(ctx, texObj, target, 0); if (!texImage) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "glEGLImageTargetTexture2D"); + _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", caller); } else { ctx->Driver.FreeTextureImageBuffer(ctx, texImage); - ctx->Driver.EGLImageTargetTexture2D(ctx, target, - texObj, texImage, image); + if (tex_storage) { + ctx->Driver.EGLImageTargetTexStorage(ctx, target, texObj, texImage, + image); + } else { + ctx->Driver.EGLImageTargetTexture2D(ctx, target, texObj, texImage, + image); + } _mesa_dirty_texobj(ctx, texObj); } + + if (tex_storage) + _mesa_set_texture_view_state(ctx, texObj, target, 1); + _mesa_unlock_texture(ctx, texObj); } +void GLAPIENTRY +_mesa_EGLImageTargetTexture2DOES(GLenum target, GLeglImageOES image) +{ + struct gl_texture_object *texObj; + const char *func = "glEGLImageTargetTexture2D"; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_current_tex_object(ctx, target); + if (!texObj) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%d)", func, target); + return; + } + + egl_image_target_texture(ctx, texObj, target, image, false, func); +} + +static void +egl_image_target_texture_storage(struct gl_context *ctx, + struct gl_texture_object *texObj, GLenum target, + GLeglImageOES image, const GLint *attrib_list, + const char *caller) +{ + /* + * EXT_EGL_image_storage: + * + * " must be NULL or a pointer to the value GL_NONE." + */ + if (attrib_list && attrib_list[0] != GL_NONE) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(image=%p)", caller, image); + return; + } + + switch (target) { + case GL_TEXTURE_2D: + case GL_TEXTURE_EXTERNAL_OES: + break; + default: + /* + * The EXT_EGL_image_storage spec allows for many other targets besides + * GL_TEXTURE_2D and GL_TEXTURE_EXTERNAL_OES, however these are complicated + * to implement. + */ + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported target=%d)", + caller, target); + return; + } + + egl_image_target_texture(ctx, texObj, target, image, true, caller); +} + + +void GLAPIENTRY +_mesa_EGLImageTargetTexStorageEXT(GLenum target, GLeglImageOES image, + const GLint *attrib_list) +{ + struct gl_texture_object *texObj; + const char *func = "glEGLImageTargetTexStorageEXT"; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_current_tex_object(ctx, target); + if (!texObj) { + _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%d)", func, target); + return; + } + + egl_image_target_texture_storage(ctx, texObj, target, image, attrib_list, + func); +} + +void GLAPIENTRY +_mesa_EGLImageTargetTextureStorageEXT(GLuint texture, GLeglImageOES image, + const GLint *attrib_list) +{ + struct gl_texture_object *texObj; + const char *func = "glEGLImageTargetTextureStorageEXT"; + GET_CURRENT_CONTEXT(ctx); + + if (!(_mesa_is_desktop_gl(ctx) && ctx->Version >= 45) && + !_mesa_has_ARB_direct_state_access(ctx) && + !_mesa_has_EXT_direct_state_access(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "direct access not supported"); + return; + } + + texObj = _mesa_lookup_texture_err(ctx, texture, func); + if (!texObj) + return; + + egl_image_target_texture_storage(ctx, texObj, texObj->Target, image, + attrib_list, func); +} + /** * Helper that implements the glTexSubImage1/2/3D() * and glTextureSubImage1/2/3D() functions. @@ -5997,9 +6105,9 @@ case GL_LUMINANCE32UI_EXT: return MESA_FORMAT_L_UINT32; case GL_LUMINANCE8_ALPHA8: - return MESA_FORMAT_L8A8_UNORM; + return MESA_FORMAT_LA_UNORM8; case GL_LUMINANCE16_ALPHA16: - return MESA_FORMAT_L16A16_UNORM; + return MESA_FORMAT_LA_UNORM16; case GL_LUMINANCE_ALPHA16F_ARB: return MESA_FORMAT_LA_FLOAT16; case GL_LUMINANCE_ALPHA32F_ARB: @@ -6080,11 +6188,11 @@ return MESA_FORMAT_RGBA_UINT32; case GL_RG8: - return MESA_FORMAT_R8G8_UNORM; + return MESA_FORMAT_RG_UNORM8; case GL_RG16: if (_mesa_is_gles(ctx) && !_mesa_has_EXT_texture_norm16(ctx)) return MESA_FORMAT_NONE; - return MESA_FORMAT_R16G16_UNORM; + return MESA_FORMAT_RG_UNORM16; case GL_RG16F: return MESA_FORMAT_RG_FLOAT16; case GL_RG32F: @@ -6254,10 +6362,10 @@ */ static bool check_texture_buffer_target(struct gl_context *ctx, GLenum target, - const char *caller) + const char *caller, bool dsa) { if (target != GL_TEXTURE_BUFFER_ARB) { - _mesa_error(ctx, GL_INVALID_ENUM, + _mesa_error(ctx, dsa ? GL_INVALID_OPERATION : GL_INVALID_ENUM, "%s(texture target is not GL_TEXTURE_BUFFER)", caller); return false; } @@ -6327,7 +6435,7 @@ /* Need to catch a bad target before it gets to * _mesa_get_current_tex_object. */ - if (!check_texture_buffer_target(ctx, target, "glTexBuffer")) + if (!check_texture_buffer_target(ctx, target, "glTexBuffer", false)) return; if (buffer) { @@ -6359,7 +6467,7 @@ /* Need to catch a bad target before it gets to * _mesa_get_current_tex_object. */ - if (!check_texture_buffer_target(ctx, target, "glTexBufferRange")) + if (!check_texture_buffer_target(ctx, target, "glTexBufferRange", false)) return; if (buffer) { @@ -6392,6 +6500,52 @@ offset, size, "glTexBufferRange"); } + +/** GL_ARB_texture_buffer_range + GL_EXT_direct_state_access */ +void GLAPIENTRY +_mesa_TextureBufferRangeEXT(GLuint texture, GLenum target, GLenum internalFormat, + GLuint buffer, GLintptr offset, GLsizeiptr size) +{ + struct gl_texture_object *texObj; + struct gl_buffer_object *bufObj; + + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureBufferRangeEXT"); + if (!texObj) + return; + + if (!check_texture_buffer_target(ctx, target, "glTextureBufferRangeEXT", true)) + return; + + if (buffer) { + bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, "glTextureBufferRangeEXT"); + if (!bufObj) + return; + + if (!check_texture_buffer_range(ctx, bufObj, offset, size, + "glTextureBufferRangeEXT")) + return; + + } else { + /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer + * Textures (PDF page 254): + * "If buffer is zero, then any buffer object attached to the buffer + * texture is detached, the values offset and size are ignored and + * the state for offset and size for the buffer texture are reset to + * zero." + */ + offset = 0; + size = 0; + bufObj = NULL; + } + + texture_buffer_range(ctx, texObj, internalFormat, bufObj, + offset, size, "glTextureBufferRangeEXT"); +} + + void GLAPIENTRY _mesa_TextureBuffer(GLuint texture, GLenum internalFormat, GLuint buffer) { @@ -6412,7 +6566,7 @@ if (!texObj) return; - if (!check_texture_buffer_target(ctx, texObj->Target, "glTextureBuffer")) + if (!check_texture_buffer_target(ctx, texObj->Target, "glTextureBuffer", true)) return; texture_buffer_range(ctx, texObj, internalFormat, @@ -6420,6 +6574,65 @@ } void GLAPIENTRY +_mesa_TextureBufferEXT(GLuint texture, GLenum target, + GLenum internalFormat, GLuint buffer) +{ + struct gl_texture_object *texObj; + struct gl_buffer_object *bufObj; + + GET_CURRENT_CONTEXT(ctx); + + if (buffer) { + bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, "glTextureBuffer"); + if (!bufObj) + return; + } else + bufObj = NULL; + + /* Get the texture object by Name. */ + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, + false, true, + "glTextureBufferEXT"); + + if (!texObj || + !check_texture_buffer_target(ctx, texObj->Target, "glTextureBufferEXT", true)) + return; + + texture_buffer_range(ctx, texObj, internalFormat, + bufObj, 0, buffer ? -1 : 0, "glTextureBufferEXT"); +} + +void GLAPIENTRY +_mesa_MultiTexBufferEXT(GLenum texunit, GLenum target, + GLenum internalFormat, GLuint buffer) +{ + struct gl_texture_object *texObj; + struct gl_buffer_object *bufObj; + + GET_CURRENT_CONTEXT(ctx); + + if (buffer) { + bufObj = _mesa_lookup_bufferobj_err(ctx, buffer, "glMultiTexBufferEXT"); + if (!bufObj) + return; + } else + bufObj = NULL; + + /* Get the texture object */ + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glMultiTexBufferEXT"); + + if (!texObj || + !check_texture_buffer_target(ctx, texObj->Target, "glMultiTexBufferEXT", false)) + return; + + texture_buffer_range(ctx, texObj, internalFormat, + bufObj, 0, buffer ? -1 : 0, "glMultiTexBufferEXT"); +} + +void GLAPIENTRY _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer, GLintptr offset, GLsizeiptr size) { @@ -6457,7 +6670,7 @@ return; if (!check_texture_buffer_target(ctx, texObj->Target, - "glTextureBufferRange")) + "glTextureBufferRange", true)) return; texture_buffer_range(ctx, texObj, internalFormat, @@ -6816,6 +7029,52 @@ "glTextureStorage3DMultisample"); } +void GLAPIENTRY +_mesa_TextureStorage2DMultisampleEXT(GLuint texture, GLenum target, GLsizei samples, + GLenum internalformat, GLsizei width, + GLsizei height, + GLboolean fixedsamplelocations) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = lookup_texture_ext_dsa(ctx, target, texture, + "glTextureStorage2DMultisampleEXT"); + if (!texObj) + return; + + if (!valid_texstorage_ms_parameters(width, height, 1, 2)) + return; + + texture_image_multisample(ctx, 2, texObj, NULL, texObj->Target, + samples, internalformat, width, height, 1, + fixedsamplelocations, GL_TRUE, 0, + "glTextureStorage2DMultisampleEXT"); +} + +void GLAPIENTRY +_mesa_TextureStorage3DMultisampleEXT(GLuint texture, GLenum target, GLsizei samples, + GLenum internalformat, GLsizei width, + GLsizei height, GLsizei depth, + GLboolean fixedsamplelocations) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = lookup_texture_ext_dsa(ctx, target, texture, + "glTextureStorage3DMultisampleEXT"); + if (!texObj) + return; + + if (!valid_texstorage_ms_parameters(width, height, depth, 3)) + return; + + texture_image_multisample(ctx, 3, texObj, NULL, texObj->Target, samples, + internalformat, width, height, depth, + fixedsamplelocations, GL_TRUE, 0, + "glTextureStorage3DMultisampleEXT"); +} + void _mesa_texture_storage_ms_memory(struct gl_context *ctx, GLuint dims, struct gl_texture_object *texObj, diff -Nru mesa-19.2.8/src/mesa/main/teximage.h mesa-20.0.8/src/mesa/main/teximage.h --- mesa-19.2.8/src/mesa/main/teximage.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/teximage.h 2020-06-12 01:21:18.000000000 +0000 @@ -330,6 +330,12 @@ extern void GLAPIENTRY _mesa_EGLImageTargetTexture2DOES( GLenum target, GLeglImageOES image ); +extern void GLAPIENTRY +_mesa_EGLImageTargetTexStorageEXT(GLenum target, GLeglImageOES image, + const GLint *attrib_list); +extern void GLAPIENTRY +_mesa_EGLImageTargetTextureStorageEXT(GLuint texture, GLeglImageOES image, + const GLint *attrib_list); void GLAPIENTRY _mesa_TexSubImage1D_no_error(GLenum target, GLint level, GLint xoffset, GLsizei width, @@ -785,9 +791,21 @@ GLintptr offset, GLsizeiptr size); extern void GLAPIENTRY +_mesa_TextureBufferRangeEXT(GLuint texture, GLenum target, GLenum internalFormat, + GLuint buffer, GLintptr offset, GLsizeiptr size); + +extern void GLAPIENTRY _mesa_TextureBuffer(GLuint texture, GLenum internalFormat, GLuint buffer); extern void GLAPIENTRY +_mesa_TextureBufferEXT(GLuint texture, GLenum target, GLenum internalFormat, + GLuint buffer); + +extern void GLAPIENTRY +_mesa_MultiTexBufferEXT(GLenum texunit, GLenum target, GLenum internalFormat, + GLuint buffer); + +extern void GLAPIENTRY _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer, GLintptr offset, GLsizeiptr size); @@ -825,6 +843,18 @@ GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations); + +extern void GLAPIENTRY +_mesa_TextureStorage2DMultisampleEXT(GLuint texture, GLenum target, GLsizei samples, + GLenum internalformat, GLsizei width, + GLsizei height, GLboolean fixedsamplelocations); + +extern void GLAPIENTRY +_mesa_TextureStorage3DMultisampleEXT(GLuint texture, GLenum target, GLsizei samples, + GLenum internalformat, GLsizei width, + GLsizei height, GLsizei depth, + GLboolean fixedsamplelocations); + /*@}*/ #ifdef __cplusplus diff -Nru mesa-19.2.8/src/mesa/main/texobj.c mesa-20.0.8/src/mesa/main/texobj.c --- mesa-19.2.8/src/mesa/main/texobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -757,7 +757,7 @@ /* Adjust max level for views: the data store may have more levels than * the view exposes. */ - t->_MaxLevel = MIN2(t->_MaxLevel, t->NumLevels - 1); + t->_MaxLevel = MAX2(MIN2(t->_MaxLevel, t->NumLevels - 1), 0); } /* Compute _MaxLambda = q - p in the spec used during mipmapping */ diff -Nru mesa-19.2.8/src/mesa/main/texparam.c mesa-20.0.8/src/mesa/main/texparam.c --- mesa-19.2.8/src/mesa/main/texparam.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texparam.c 2020-06-12 01:21:18.000000000 +0000 @@ -1395,6 +1395,38 @@ } void GLAPIENTRY +_mesa_TextureParameterIivEXT(GLuint texture, GLenum target, GLenum pname, + const GLint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureParameterIivEXT"); + if (!texObj) + return; + + _mesa_texture_parameterIiv(ctx, texObj, pname, params, true); +} + +void GLAPIENTRY +_mesa_MultiTexParameterIivEXT(GLenum texunit, GLenum target, GLenum pname, + const GLint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glMultiTexParameterIivEXT"); + if (!texObj) + return; + + _mesa_texture_parameterIiv(ctx, texObj, pname, params, true); +} + +void GLAPIENTRY _mesa_TextureParameterIuiv(GLuint texture, GLenum pname, const GLuint *params) { struct gl_texture_object *texObj; @@ -1407,6 +1439,38 @@ _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true); } +void GLAPIENTRY +_mesa_TextureParameterIuivEXT(GLuint texture, GLenum target, GLenum pname, + const GLuint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureParameterIuivEXT"); + if (!texObj) + return; + + _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true); +} + +void GLAPIENTRY +_mesa_MultiTexParameterIuivEXT(GLenum texunit, GLenum target, GLenum pname, + const GLuint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glMultiTexParameterIuivEXT"); + if (!texObj) + return; + + _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true); +} + GLboolean _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, bool dsa) @@ -2739,6 +2803,37 @@ get_tex_parameterIiv(ctx, texObj, pname, params, true); } +void GLAPIENTRY +_mesa_GetTextureParameterIivEXT(GLuint texture, GLenum target, GLenum pname, GLint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glGetTextureParameterIivEXT"); + if (!texObj) + return; + + + get_tex_parameterIiv(ctx, texObj, pname, params, true); +} + +void GLAPIENTRY +_mesa_GetMultiTexParameterIivEXT(GLenum texunit, GLenum target, GLenum pname, + GLint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glGetMultiTexParameterIiv"); + if (!texObj) + return; + + get_tex_parameterIiv(ctx, texObj, pname, params, true); +} void GLAPIENTRY _mesa_GetTextureParameterIuiv(GLuint texture, GLenum pname, GLuint *params) @@ -2750,5 +2845,37 @@ if (!texObj) return; + get_tex_parameterIiv(ctx, texObj, pname, (GLint *) params, true); +} + +void GLAPIENTRY +_mesa_GetTextureParameterIuivEXT(GLuint texture, GLenum target, GLenum pname, + GLuint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glGetTextureParameterIuvEXT"); + if (!texObj) + return; + + get_tex_parameterIiv(ctx, texObj, pname, (GLint *) params, true); +} + +void GLAPIENTRY +_mesa_GetMultiTexParameterIuivEXT(GLenum texunit, GLenum target, GLenum pname, + GLuint *params) +{ + struct gl_texture_object *texObj; + GET_CURRENT_CONTEXT(ctx); + + texObj = _mesa_get_texobj_by_target_and_texunit(ctx, target, + texunit - GL_TEXTURE0, + true, + "glGetMultiTexParameterIuiv"); + if (!texObj) + return; + get_tex_parameterIiv(ctx, texObj, pname, (GLint *) params, true); } diff -Nru mesa-19.2.8/src/mesa/main/texparam.h mesa-20.0.8/src/mesa/main/texparam.h --- mesa-19.2.8/src/mesa/main/texparam.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texparam.h 2020-06-12 01:21:18.000000000 +0000 @@ -144,8 +144,20 @@ _mesa_GetTextureParameterIiv(GLuint texture, GLenum pname, GLint *params); extern void GLAPIENTRY +_mesa_GetTextureParameterIivEXT(GLuint texture, GLenum target, GLenum pname, GLint *params); + +extern void GLAPIENTRY +_mesa_GetMultiTexParameterIivEXT(GLenum texunit, GLenum target, GLenum pname, GLint *params); + +extern void GLAPIENTRY _mesa_GetTextureParameterIuiv(GLuint texture, GLenum pname, GLuint *params); +extern void GLAPIENTRY +_mesa_GetTextureParameterIuivEXT(GLuint texture, GLenum target, GLenum pname, GLuint *params); + +extern void GLAPIENTRY +_mesa_GetMultiTexParameterIuivEXT(GLenum texunit, GLenum target, GLenum pname, GLuint *params); + extern void GLAPIENTRY _mesa_TexParameterfv( GLenum target, GLenum pname, const GLfloat *params ); @@ -193,9 +205,21 @@ _mesa_TextureParameterIiv(GLuint texture, GLenum pname, const GLint *params); extern void GLAPIENTRY +_mesa_TextureParameterIivEXT(GLuint texture, GLenum target, GLenum pname, const GLint *params); + +extern void GLAPIENTRY +_mesa_MultiTexParameterIivEXT(GLenum texunit, GLenum target, GLenum pname, const GLint *params); + +extern void GLAPIENTRY _mesa_TextureParameterIuiv(GLuint texture, GLenum pname, const GLuint *params); extern void GLAPIENTRY +_mesa_TextureParameterIuivEXT(GLuint texture, GLenum target, GLenum pname, const GLuint *params); + +extern void GLAPIENTRY +_mesa_MultiTexParameterIuivEXT(GLenum texunit, GLenum target, GLenum pname, const GLuint *params); + +extern void GLAPIENTRY _mesa_MultiTexParameterfEXT(GLenum texunit, GLenum target, GLenum pname, GLfloat param); extern void GLAPIENTRY diff -Nru mesa-19.2.8/src/mesa/main/texstorage.c mesa-20.0.8/src/mesa/main/texstorage.c --- mesa-19.2.8/src/mesa/main/texstorage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texstorage.c 2020-06-12 01:21:18.000000000 +0000 @@ -738,29 +738,20 @@ } -/* - * Note: we don't support GL_EXT_direct_state_access and the spec says - * we don't need the following functions. However, glew checks for the - * presence of all six functions and will say that GL_ARB_texture_storage - * is not supported if these functions are missing. - */ - - void GLAPIENTRY _mesa_TextureStorage1DEXT(GLuint texture, GLenum target, GLsizei levels, GLenum internalformat, GLsizei width) { GET_CURRENT_CONTEXT(ctx); - - (void) texture; - (void) target; - (void) levels; - (void) internalformat; - (void) width; - - _mesa_error(ctx, GL_INVALID_OPERATION, - "glTextureStorage1DEXT not supported"); + /* 'texture' must always be initialized, even if the call to + * glTextureStorage1DEXT will generate an error. + */ + if (!_mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureStorage1DEXT")) + return; + texturestorage_error(1, texture, levels, internalformat, width, 1, 1, + "glTextureStorage1DEXT"); } @@ -770,16 +761,14 @@ GLsizei width, GLsizei height) { GET_CURRENT_CONTEXT(ctx); - - (void) texture; - (void) target; - (void) levels; - (void) internalformat; - (void) width; - (void) height; - - _mesa_error(ctx, GL_INVALID_OPERATION, - "glTextureStorage2DEXT not supported"); + /* 'texture' must always be initialized, even if the call to + * glTextureStorage2DEXT will generate an error. + */ + if (!_mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureStorage2DEXT")) + return; + texturestorage_error(2, texture, levels, internalformat, width, height, 1, + "glTextureStorage2DEXT"); } @@ -789,17 +778,14 @@ GLsizei width, GLsizei height, GLsizei depth) { GET_CURRENT_CONTEXT(ctx); - - (void) texture; - (void) target; - (void) levels; - (void) internalformat; - (void) width; - (void) height; - (void) depth; - - _mesa_error(ctx, GL_INVALID_OPERATION, - "glTextureStorage3DEXT not supported"); + /* 'texture' must always be initialized, even if the call to + * glTextureStorage3DEXT will generate an error. + */ + if (!_mesa_lookup_or_create_texture(ctx, target, texture, false, true, + "glTextureStorage3DEXT")) + return; + texturestorage_error(3, texture, levels, internalformat, width, height, depth, + "glTextureStorage3DEXT"); } diff -Nru mesa-19.2.8/src/mesa/main/texstore.c mesa-20.0.8/src/mesa/main/texstore.c --- mesa-19.2.8/src/mesa/main/texstore.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/texstore.c 2020-06-12 01:21:18.000000000 +0000 @@ -280,8 +280,6 @@ static GLboolean _mesa_texstore_ycbcr(TEXSTORE_PARAMS) { - const GLboolean littleEndian = _mesa_little_endian(); - (void) ctx; (void) dims; (void) baseInternalFormat; assert((dstFormat == MESA_FORMAT_YCBCR) || @@ -305,7 +303,7 @@ if (srcPacking->SwapBytes ^ (srcType == GL_UNSIGNED_SHORT_8_8_REV_MESA) ^ (dstFormat == MESA_FORMAT_YCBCR_REV) ^ - !littleEndian) { + !UTIL_ARCH_LITTLE_ENDIAN) { GLint img, row; for (img = 0; img < srcDepth; img++) { GLubyte *dstRow = dstSlices[img]; @@ -538,7 +536,7 @@ GLint img, row; const GLint srcRowStride = _mesa_image_row_stride(srcPacking, srcWidth, srcFormat, srcType) - / sizeof(uint64_t); + / sizeof(int32_t); assert(dstFormat == MESA_FORMAT_Z32_FLOAT_S8X24_UINT); assert(srcFormat == GL_DEPTH_STENCIL || @@ -551,8 +549,8 @@ /* In case we only upload depth we need to preserve the stencil */ for (img = 0; img < srcDepth; img++) { uint64_t *dstRow = (uint64_t *) dstSlices[img]; - const uint64_t *src - = (const uint64_t *) _mesa_image_address(dims, srcPacking, srcAddr, + const int32_t *src + = (const int32_t *) _mesa_image_address(dims, srcPacking, srcAddr, srcWidth, srcHeight, srcFormat, srcType, img, 0, 0); diff -Nru mesa-19.2.8/src/mesa/main/varray.c mesa-20.0.8/src/mesa/main/varray.c --- mesa-19.2.8/src/mesa/main/varray.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/varray.c 2020-06-12 01:21:18.000000000 +0000 @@ -40,6 +40,7 @@ #include "mtypes.h" #include "varray.h" #include "arrayobj.h" +#include "get.h" #include "main/dispatch.h" @@ -484,6 +485,8 @@ * Do error checking for glVertex/Color/TexCoord/...Pointer functions. * * \param func name of calling function used for error reporting + * \param vao the vao to update + * \param obj the bound buffer object * \param attrib the attribute array index to update * \param legalTypes bitmask of *_BIT above indicating legal datatypes * \param sizeMin min allowable size value @@ -498,14 +501,14 @@ */ static void validate_array(struct gl_context *ctx, const char *func, + struct gl_vertex_array_object *vao, + struct gl_buffer_object *obj, GLuint attrib, GLbitfield legalTypesMask, GLint sizeMin, GLint sizeMax, GLint size, GLenum type, GLsizei stride, GLboolean normalized, GLboolean integer, GLboolean doubles, const GLvoid *ptr) { - struct gl_vertex_array_object *vao = ctx->Array.VAO; - /* Page 407 (page 423 of the PDF) of the OpenGL 3.0 spec says: * * "Client vertex arrays - all vertex array attribute pointers must @@ -547,7 +550,7 @@ * 2.9.6), and the pointer argument is not NULL." */ if (ptr != NULL && vao != ctx->Array.DefaultVAO && - !_mesa_is_bufferobj(ctx->Array.ArrayBufferObj)) { + !_mesa_is_bufferobj(obj)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(non-VBO array)", func); return; } @@ -556,15 +559,16 @@ static bool validate_array_and_format(struct gl_context *ctx, const char *func, + struct gl_vertex_array_object *vao, + struct gl_buffer_object *obj, GLuint attrib, GLbitfield legalTypes, GLint sizeMin, GLint sizeMax, GLint size, GLenum type, GLsizei stride, GLboolean normalized, GLboolean integer, - GLboolean doubles, GLenum format, const GLvoid *ptr, - struct gl_vertex_array_object *vao) + GLboolean doubles, GLenum format, const GLvoid *ptr) { - validate_array(ctx, func, attrib, legalTypes, sizeMin, sizeMax, size, - type, stride, normalized, integer, doubles, ptr); + validate_array(ctx, func, vao, obj, attrib, legalTypes, sizeMin, sizeMax, + size, type, stride, normalized, integer, doubles, ptr); return validate_array_format(ctx, func, vao, attrib, legalTypes, sizeMin, sizeMax, size, type, normalized, integer, @@ -575,6 +579,8 @@ /** * Update state for glVertex/Color/TexCoord/...Pointer functions. * + * \param vao the vao to update + * \param obj the bound buffer object * \param attrib the attribute array index to update * \param format Either GL_RGBA or GL_BGRA. * \param sizeMax max allowable size value (may also be BGRA_OR_4) @@ -588,14 +594,14 @@ */ static void update_array(struct gl_context *ctx, + struct gl_vertex_array_object *vao, + struct gl_buffer_object *obj, GLuint attrib, GLenum format, GLint sizeMax, GLint size, GLenum type, GLsizei stride, GLboolean normalized, GLboolean integer, GLboolean doubles, const GLvoid *ptr) { - struct gl_vertex_array_object *vao = ctx->Array.VAO; - _mesa_update_array_format(ctx, vao, attrib, size, type, format, normalized, integer, doubles, 0); @@ -616,17 +622,50 @@ GLsizei effectiveStride = stride != 0 ? stride : array->Format._ElementSize; _mesa_bind_vertex_buffer(ctx, vao, attrib, - ctx->Array.ArrayBufferObj, (GLintptr) ptr, + obj, (GLintptr) ptr, effectiveStride); } + +/* Helper function for all EXT_direct_state_access glVertexArray* functions */ +static bool +_lookup_vao_and_vbo_dsa(struct gl_context *ctx, + GLuint vaobj, GLuint buffer, + GLintptr offset, + struct gl_vertex_array_object** vao, + struct gl_buffer_object** vbo, + const char* caller) +{ + *vao = _mesa_lookup_vao_err(ctx, vaobj, true, caller); + if (!(*vao)) + return false; + + if (buffer != 0) { + *vbo = _mesa_lookup_bufferobj(ctx, buffer); + if (!_mesa_handle_bind_buffer_gen(ctx, buffer, vbo, caller)) + return false; + + if (offset < 0) { + _mesa_error(ctx, GL_INVALID_VALUE, + "%s(negative offset with non-0 buffer)", caller); + return false; + } + } else { + *vbo = ctx->Shared->NullBufferObj; + } + + return true; +} + + void GLAPIENTRY _mesa_VertexPointer_no_error(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_POS, GL_RGBA, 4, size, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_POS, GL_RGBA, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -644,23 +683,61 @@ UNSIGNED_INT_2_10_10_10_REV_BIT | INT_2_10_10_10_REV_BIT); - if (!validate_array_and_format(ctx, "glVertexPointer", VERT_ATTRIB_POS, - legalTypes, 2, 4, size, type, stride, - GL_FALSE, GL_FALSE, GL_FALSE, format, - ptr, ctx->Array.VAO)) + if (!validate_array_and_format(ctx, "glVertexPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_POS, legalTypes, 2, 4, size, + type, stride, GL_FALSE, GL_FALSE, GL_FALSE, + format, ptr)) return; - update_array(ctx, VERT_ATTRIB_POS, format, 4, size, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_POS, format, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayVertexOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + GLbitfield legalTypes = (ctx->API == API_OPENGLES) + ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT) + : (SHORT_BIT | INT_BIT | FLOAT_BIT | + DOUBLE_BIT | HALF_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayVertexOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayVertexOffsetEXT", + vao, vbo, + VERT_ATTRIB_POS, legalTypes, 2, 4, size, + type, stride, GL_FALSE, GL_FALSE, GL_FALSE, + format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_POS, format, 4, size, type, stride, + GL_FALSE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_NormalPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr ) { GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_NORMAL, GL_RGBA, 3, 3, type, stride, GL_TRUE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_NORMAL, GL_RGBA, 3, 3, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } @@ -679,24 +756,62 @@ INT_2_10_10_10_REV_BIT); if (!validate_array_and_format(ctx, "glNormalPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_NORMAL, legalTypes, 3, 3, 3, type, stride, GL_TRUE, GL_FALSE, - GL_FALSE, format, ptr, ctx->Array.VAO)) + GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_NORMAL, format, 3, 3, type, stride, GL_TRUE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_NORMAL, format, 3, 3, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayNormalOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = (ctx->API == API_OPENGLES) + ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT) + : (BYTE_BIT | SHORT_BIT | INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glNormalPointer")) + return; + + if (!validate_array_and_format(ctx, "glNormalPointer", + vao, vbo, + VERT_ATTRIB_NORMAL, legalTypes, 3, 3, 3, + type, stride, GL_TRUE, GL_FALSE, + GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_NORMAL, format, 3, 3, type, stride, GL_TRUE, + GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_ColorPointer_no_error(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); GLenum format = get_array_format(ctx, BGRA_OR_4, &size); - update_array(ctx, VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } @@ -718,23 +833,63 @@ INT_2_10_10_10_REV_BIT); if (!validate_array_and_format(ctx, "glColorPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_COLOR0, legalTypes, sizeMin, BGRA_OR_4, size, type, stride, GL_TRUE, - GL_FALSE, GL_FALSE, format, ptr, - ctx->Array.VAO)) + GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayColorOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + const GLint sizeMin = (ctx->API == API_OPENGLES) ? 4 : 3; + + GLenum format = get_array_format(ctx, BGRA_OR_4, &size); + const GLbitfield legalTypes = (ctx->API == API_OPENGLES) + ? (UNSIGNED_BYTE_BIT | HALF_BIT | FLOAT_BIT | FIXED_ES_BIT) + : (BYTE_BIT | UNSIGNED_BYTE_BIT | + SHORT_BIT | UNSIGNED_SHORT_BIT | + INT_BIT | UNSIGNED_INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayColorOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayColorOffsetEXT", + vao, vbo, + VERT_ATTRIB_COLOR0, legalTypes, sizeMin, + BGRA_OR_4, size, type, stride, GL_TRUE, + GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size, + type, stride, GL_TRUE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_FogCoordPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_FOG, GL_RGBA, 1, 1, type, stride, GL_FALSE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_FOG, GL_RGBA, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -748,22 +903,55 @@ const GLbitfield legalTypes = (HALF_BIT | FLOAT_BIT | DOUBLE_BIT); if (!validate_array_and_format(ctx, "glFogCoordPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_FOG, legalTypes, 1, 1, 1, type, stride, GL_FALSE, GL_FALSE, - GL_FALSE, format, ptr, ctx->Array.VAO)) + GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_FOG, format, 1, 1, type, stride, GL_FALSE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_FOG, format, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayFogCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = (HALF_BIT | FLOAT_BIT | DOUBLE_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayFogCoordOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayFogCoordOffsetEXT", + vao, vbo, + VERT_ATTRIB_FOG, legalTypes, 1, 1, 1, + type, stride, GL_FALSE, GL_FALSE, + GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_FOG, format, 1, 1, type, stride, GL_FALSE, + GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_IndexPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_COLOR_INDEX, GL_RGBA, 1, 1, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR_INDEX, GL_RGBA, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -778,25 +966,58 @@ FLOAT_BIT | DOUBLE_BIT); if (!validate_array_and_format(ctx, "glIndexPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_COLOR_INDEX, legalTypes, 1, 1, 1, type, stride, - GL_FALSE, GL_FALSE, GL_FALSE, format, - ptr, ctx->Array.VAO)) + GL_FALSE, GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_COLOR_INDEX, format, 1, 1, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR_INDEX, format, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayIndexOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = (UNSIGNED_BYTE_BIT | SHORT_BIT | INT_BIT | + FLOAT_BIT | DOUBLE_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayIndexOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayIndexOffsetEXT", + vao, vbo, + VERT_ATTRIB_COLOR_INDEX, + legalTypes, 1, 1, 1, type, stride, + GL_FALSE, GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_COLOR_INDEX, format, 1, 1, type, stride, + GL_FALSE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_SecondaryColorPointer_no_error(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); GLenum format = get_array_format(ctx, BGRA_OR_4, &size); - update_array(ctx, VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } @@ -816,25 +1037,62 @@ INT_2_10_10_10_REV_BIT); if (!validate_array_and_format(ctx, "glSecondaryColorPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_COLOR1, legalTypes, 3, BGRA_OR_4, size, type, stride, - GL_TRUE, GL_FALSE, GL_FALSE, format, ptr, - ctx->Array.VAO)) + GL_TRUE, GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type, stride, GL_TRUE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArraySecondaryColorOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + + GLenum format = get_array_format(ctx, BGRA_OR_4, &size); + const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT | + SHORT_BIT | UNSIGNED_SHORT_BIT | + INT_BIT | UNSIGNED_INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArraySecondaryColorOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArraySecondaryColorOffsetEXT", + vao, vbo, + VERT_ATTRIB_COLOR1, legalTypes, 3, + BGRA_OR_4, size, type, stride, + GL_TRUE, GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type, + stride, GL_TRUE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_TexCoordPointer_no_error(GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); const GLuint unit = ctx->Array.ActiveTexture; - update_array(ctx, VERT_ATTRIB_TEX(unit), GL_RGBA, 4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_TEX(unit), GL_RGBA, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -856,25 +1114,108 @@ INT_2_10_10_10_REV_BIT); if (!validate_array_and_format(ctx, "glTexCoordPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_TEX(unit), legalTypes, sizeMin, 4, size, type, stride, - GL_FALSE, GL_FALSE, GL_FALSE, format, ptr, - ctx->Array.VAO)) + GL_FALSE, GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_TEX(unit), format, 4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_TEX(unit), format, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayTexCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + const GLint sizeMin = (ctx->API == API_OPENGLES) ? 2 : 1; + const GLuint unit = ctx->Array.ActiveTexture; + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = (ctx->API == API_OPENGLES) + ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT) + : (SHORT_BIT | INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayTexCoordOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayTexCoordOffsetEXT", + vao, vbo, + VERT_ATTRIB_TEX(unit), legalTypes, + sizeMin, 4, size, type, stride, + GL_FALSE, GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_TEX(unit), format, 4, size, type, + stride, GL_FALSE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY +_mesa_VertexArrayMultiTexCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLenum texunit, + GLint size, GLenum type, GLsizei stride, + GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + const GLint sizeMin = (ctx->API == API_OPENGLES) ? 2 : 1; + const GLuint unit = texunit - GL_TEXTURE0; + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = (ctx->API == API_OPENGLES) + ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT) + : (SHORT_BIT | INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT); + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayMultiTexCoordOffsetEXT")) + return; + + if (unit >= ctx->Const.MaxCombinedTextureImageUnits) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glVertexArrayMultiTexCoordOffsetEXT(texunit=%d)", + texunit); + return; + } + + if (!validate_array_and_format(ctx, "glVertexArrayMultiTexCoordOffsetEXT", + vao, vbo, + VERT_ATTRIB_TEX(unit), legalTypes, + sizeMin, 4, size, type, stride, + GL_FALSE, GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_TEX(unit), format, 4, size, type, + stride, GL_FALSE, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_EdgeFlagPointer_no_error(GLsizei stride, const GLvoid *ptr) { /* this is the same type that glEdgeFlag uses */ const GLboolean integer = GL_FALSE; GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_EDGEFLAG, GL_RGBA, 1, 1, GL_UNSIGNED_BYTE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_EDGEFLAG, GL_RGBA, 1, 1, GL_UNSIGNED_BYTE, stride, GL_FALSE, integer, GL_FALSE, ptr); } @@ -890,24 +1231,58 @@ const GLbitfield legalTypes = UNSIGNED_BYTE_BIT; if (!validate_array_and_format(ctx, "glEdgeFlagPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_EDGEFLAG, legalTypes, 1, 1, 1, GL_UNSIGNED_BYTE, stride, - GL_FALSE, integer, GL_FALSE, format, ptr, - ctx->Array.VAO)) + GL_FALSE, integer, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_EDGEFLAG, format, 1, 1, GL_UNSIGNED_BYTE, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_EDGEFLAG, format, 1, 1, GL_UNSIGNED_BYTE, stride, GL_FALSE, integer, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayEdgeFlagOffsetEXT(GLuint vaobj, GLuint buffer, GLsizei stride, + GLintptr offset) +{ + /* this is the same type that glEdgeFlag uses */ + const GLboolean integer = GL_FALSE; + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + const GLbitfield legalTypes = UNSIGNED_BYTE_BIT; + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayEdgeFlagOffsetEXT")) + return; + + if (!validate_array_and_format(ctx, "glVertexArrayEdgeFlagOffsetEXT", + vao, vbo, + VERT_ATTRIB_EDGEFLAG, legalTypes, + 1, 1, 1, GL_UNSIGNED_BYTE, stride, + GL_FALSE, integer, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_EDGEFLAG, format, 1, 1, GL_UNSIGNED_BYTE, + stride, GL_FALSE, integer, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY _mesa_PointSizePointerOES_no_error(GLenum type, GLsizei stride, const GLvoid *ptr) { GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_POINT_SIZE, GL_RGBA, 1, 1, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_POINT_SIZE, GL_RGBA, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -927,12 +1302,14 @@ const GLbitfield legalTypes = (FLOAT_BIT | FIXED_ES_BIT); if (!validate_array_and_format(ctx, "glPointSizePointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_POINT_SIZE, legalTypes, 1, 1, 1, type, stride, GL_FALSE, GL_FALSE, - GL_FALSE, format, ptr, ctx->Array.VAO)) + GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_POINT_SIZE, format, 1, 1, type, stride, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_POINT_SIZE, format, 1, 1, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -945,7 +1322,8 @@ GET_CURRENT_CONTEXT(ctx); GLenum format = get_array_format(ctx, BGRA_OR_4, &size); - update_array(ctx, VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4, size, type, stride, normalized, GL_FALSE, GL_FALSE, ptr); } @@ -978,18 +1356,95 @@ UNSIGNED_INT_10F_11F_11F_REV_BIT); if (!validate_array_and_format(ctx, "glVertexAttribPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_GENERIC(index), legalTypes, 1, BGRA_OR_4, size, type, stride, - normalized, GL_FALSE, GL_FALSE, format, - ptr, ctx->Array.VAO)) + normalized, GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4, size, type, stride, normalized, GL_FALSE, GL_FALSE, ptr); } void GLAPIENTRY +_mesa_VertexArrayVertexAttribOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLboolean normalized, + GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + GLenum format = get_array_format(ctx, BGRA_OR_4, &size); + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayVertexAttribOffsetEXT")) + return; + + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, "glVertexArrayVertexAttribOffsetEXT(idx)"); + return; + } + + const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT | + SHORT_BIT | UNSIGNED_SHORT_BIT | + INT_BIT | UNSIGNED_INT_BIT | + HALF_BIT | FLOAT_BIT | DOUBLE_BIT | + FIXED_ES_BIT | FIXED_GL_BIT | + UNSIGNED_INT_2_10_10_10_REV_BIT | + INT_2_10_10_10_REV_BIT | + UNSIGNED_INT_10F_11F_11F_REV_BIT); + + if (!validate_array_and_format(ctx, "glVertexArrayVertexAttribOffsetEXT", + vao, vbo, + VERT_ATTRIB_GENERIC(index), legalTypes, + 1, BGRA_OR_4, size, type, stride, + normalized, GL_FALSE, GL_FALSE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_GENERIC(index), format, BGRA_OR_4, + size, type, stride, normalized, GL_FALSE, GL_FALSE, (void*) offset); +} + + +void GLAPIENTRY +_mesa_VertexArrayVertexAttribLOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLsizei stride, GLintptr offset) +{ + GET_CURRENT_CONTEXT(ctx); + GLenum format = GL_RGBA; + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayVertexAttribLOffsetEXT")) + return; + + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, "glVertexArrayVertexAttribLOffsetEXT(idx)"); + return; + } + + const GLbitfield legalTypes = DOUBLE_BIT; + + if (!validate_array_and_format(ctx, "glVertexArrayVertexAttribLOffsetEXT", + vao, vbo, + VERT_ATTRIB_GENERIC(index), legalTypes, + 1, 4, size, type, stride, + GL_FALSE, GL_FALSE, GL_TRUE, format, (void*) offset)) + return; + + update_array(ctx, vao, vbo, + VERT_ATTRIB_GENERIC(index), format, 4, + size, type, stride, GL_FALSE, GL_FALSE, GL_TRUE, (void*) offset); +} + + +void GLAPIENTRY _mesa_VertexAttribIPointer_no_error(GLuint index, GLint size, GLenum type, GLsizei stride, const GLvoid *ptr) { @@ -997,7 +1452,8 @@ const GLboolean integer = GL_TRUE; GET_CURRENT_CONTEXT(ctx); - update_array(ctx, VERT_ATTRIB_GENERIC(index), GL_RGBA, 4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), GL_RGBA, 4, size, type, stride, normalized, integer, GL_FALSE, ptr); } @@ -1009,16 +1465,67 @@ * (position, normal, color, fog, texcoord, etc). */ void GLAPIENTRY -_mesa_VertexAttribIPointer(GLuint index, GLint size, GLenum type, - GLsizei stride, const GLvoid *ptr) +_mesa_VertexAttribIPointer(GLuint index, GLint size, GLenum type, + GLsizei stride, const GLvoid *ptr) +{ + const GLboolean normalized = GL_FALSE; + const GLboolean integer = GL_TRUE; + GET_CURRENT_CONTEXT(ctx); + + GLenum format = GL_RGBA; + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribIPointer(index)"); + return; + } + + const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT | + SHORT_BIT | UNSIGNED_SHORT_BIT | + INT_BIT | UNSIGNED_INT_BIT); + + if (!validate_array_and_format(ctx, "glVertexAttribIPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), legalTypes, + 1, 4, size, type, stride, + normalized, integer, GL_FALSE, format, ptr)) + return; + + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), format, 4, size, type, + stride, normalized, integer, GL_FALSE, ptr); +} + + +void GLAPIENTRY +_mesa_VertexAttribLPointer_no_error(GLuint index, GLint size, GLenum type, + GLsizei stride, const GLvoid *ptr) +{ + GET_CURRENT_CONTEXT(ctx); + + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), GL_RGBA, 4, size, type, + stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr); +} + + +void GLAPIENTRY +_mesa_VertexArrayVertexAttribIOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLsizei stride, GLintptr offset) { const GLboolean normalized = GL_FALSE; const GLboolean integer = GL_TRUE; GET_CURRENT_CONTEXT(ctx); - GLenum format = GL_RGBA; + + struct gl_vertex_array_object* vao; + struct gl_buffer_object* vbo; + + if (!_lookup_vao_and_vbo_dsa(ctx, vaobj, buffer, offset, + &vao, &vbo, + "glVertexArrayVertexAttribIOffsetEXT")) + return; + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { - _mesa_error(ctx, GL_INVALID_VALUE, "glVertexAttribIPointer(index)"); + _mesa_error(ctx, GL_INVALID_VALUE, "glVertexArrayVertexAttribIOffsetEXT(index)"); return; } @@ -1026,26 +1533,16 @@ SHORT_BIT | UNSIGNED_SHORT_BIT | INT_BIT | UNSIGNED_INT_BIT); - if (!validate_array_and_format(ctx, "glVertexAttribIPointer", + if (!validate_array_and_format(ctx, "glVertexArrayVertexAttribIOffsetEXT", + vao, vbo, VERT_ATTRIB_GENERIC(index), legalTypes, 1, 4, size, type, stride, - normalized, integer, GL_FALSE, format, - ptr, ctx->Array.VAO)) + normalized, integer, GL_FALSE, format, (void*) offset)) return; - update_array(ctx, VERT_ATTRIB_GENERIC(index), format, 4, size, type, - stride, normalized, integer, GL_FALSE, ptr); -} - - -void GLAPIENTRY -_mesa_VertexAttribLPointer_no_error(GLuint index, GLint size, GLenum type, - GLsizei stride, const GLvoid *ptr) -{ - GET_CURRENT_CONTEXT(ctx); - - update_array(ctx, VERT_ATTRIB_GENERIC(index), GL_RGBA, 4, size, type, - stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr); + update_array(ctx, vao, vbo, + VERT_ATTRIB_GENERIC(index), format, 4, size, type, + stride, normalized, integer, GL_FALSE, (void*) offset); } @@ -1064,13 +1561,14 @@ const GLbitfield legalTypes = DOUBLE_BIT; if (!validate_array_and_format(ctx, "glVertexAttribLPointer", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_GENERIC(index), legalTypes, 1, 4, size, type, stride, - GL_FALSE, GL_FALSE, GL_TRUE, format, - ptr, ctx->Array.VAO)) + GL_FALSE, GL_FALSE, GL_TRUE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_GENERIC(index), format, 4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_GENERIC(index), format, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_TRUE, ptr); } @@ -1142,13 +1640,26 @@ * [compatibility profile: zero or] the name of an existing vertex * array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glEnableVertexArrayAttrib"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glEnableVertexArrayAttrib"); if (!vao) return; enable_vertex_array_attrib(ctx, vao, index, "glEnableVertexArrayAttrib"); } +void GLAPIENTRY +_mesa_EnableVertexArrayAttribEXT(GLuint vaobj, GLuint index) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao = _mesa_lookup_vao_err(ctx, vaobj, + true, + "glEnableVertexArrayAttribEXT"); + if (!vao) + return; + + enable_vertex_array_attrib(ctx, vao, index, "glEnableVertexArrayAttribEXT"); +} + void GLAPIENTRY _mesa_EnableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index) @@ -1218,7 +1729,26 @@ * [compatibility profile: zero or] the name of an existing vertex * array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glDisableVertexArrayAttrib"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glDisableVertexArrayAttrib"); + if (!vao) + return; + + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, "glDisableVertexArrayAttrib(index)"); + return; + } + + const gl_vert_attrib attrib = VERT_ATTRIB_GENERIC(index); + _mesa_disable_vertex_array_attrib(ctx, vao, attrib); +} + +void GLAPIENTRY +_mesa_DisableVertexArrayAttribEXT(GLuint vaobj, GLuint index) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao = _mesa_lookup_vao_err(ctx, vaobj, + true, + "glEnableVertexArrayAttribEXT"); if (!vao) return; @@ -1525,7 +2055,7 @@ * [compatibility profile: zero or] the name of an existing * vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glGetVertexArrayIndexediv"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glGetVertexArrayIndexediv"); if (!vao) return; @@ -1588,7 +2118,7 @@ * [compatibility profile: zero or] the name of an existing * vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glGetVertexArrayIndexed64iv"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glGetVertexArrayIndexed64iv"); if (!vao) return; @@ -1688,13 +2218,14 @@ INT_2_10_10_10_REV_BIT); if (!validate_array_and_format(ctx, "glMultiTexCoordPointerEXT", + ctx->Array.VAO, ctx->Array.ArrayBufferObj, VERT_ATTRIB_TEX(unit), legalTypes, sizeMin, 4, size, type, stride, - GL_FALSE, GL_FALSE, GL_FALSE, format, ptr, - ctx->Array.VAO)) + GL_FALSE, GL_FALSE, GL_FALSE, format, ptr)) return; - update_array(ctx, VERT_ATTRIB_TEX(unit), format, 4, size, type, + update_array(ctx, ctx->Array.VAO, ctx->Array.ArrayBufferObj, + VERT_ATTRIB_TEX(unit), format, 4, size, type, stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr); } @@ -2027,6 +2558,55 @@ } +void GLAPIENTRY +_mesa_VertexArrayVertexAttribDivisorEXT(GLuint vaobj, GLuint index, GLuint divisor) +{ + GET_CURRENT_CONTEXT(ctx); + + const gl_vert_attrib genericIndex = VERT_ATTRIB_GENERIC(index); + struct gl_vertex_array_object * vao; + /* The ARB_instanced_arrays spec says: + * + * "The vertex array object named by vaobj must + * be generated by GenVertexArrays (and not since deleted); + * otherwise an INVALID_OPERATION error is generated." + */ + vao = _mesa_lookup_vao_err(ctx, vaobj, + false, + "glVertexArrayVertexAttribDivisorEXT"); + if (!vao) + return; + + if (!ctx->Extensions.ARB_instanced_arrays) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glVertexArrayVertexAttribDivisorEXT()"); + return; + } + + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, + "glVertexArrayVertexAttribDivisorEXT(index = %u)", index); + return; + } + + assert(genericIndex < ARRAY_SIZE(vao->VertexAttrib)); + + /* The ARB_vertex_attrib_binding spec says: + * + * "The command + * + * void VertexAttribDivisor(uint index, uint divisor); + * + * is equivalent to (assuming no errors are generated): + * + * VertexAttribBinding(index, index); + * VertexBindingDivisor(index, divisor);" + */ + _mesa_vertex_attrib_binding(ctx, vao, genericIndex, genericIndex); + vertex_binding_divisor(ctx, vao, genericIndex, divisor); +} + + + static ALWAYS_INLINE void vertex_array_vertex_buffer(struct gl_context *ctx, struct gl_vertex_array_object *vao, @@ -2186,7 +2766,7 @@ * if is not [compatibility profile: zero or] the name of an * existing vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glVertexArrayVertexBuffer"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glVertexArrayVertexBuffer"); if (!vao) return; @@ -2195,6 +2775,21 @@ } +void GLAPIENTRY +_mesa_VertexArrayBindVertexBufferEXT(GLuint vaobj, GLuint bindingIndex, GLuint buffer, + GLintptr offset, GLsizei stride) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object *vao; + vao = _mesa_lookup_vao_err(ctx, vaobj, true, "glVertexArrayBindVertexBufferEXT"); + if (!vao) + return; + + vertex_array_vertex_buffer_err(ctx, vao, bindingIndex, buffer, offset, + stride, "glVertexArrayBindVertexBufferEXT"); +} + + static ALWAYS_INLINE void vertex_array_vertex_buffers(struct gl_context *ctx, struct gl_vertex_array_object *vao, @@ -2392,7 +2987,7 @@ * if is not [compatibility profile: zero or] the name of an * existing vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glVertexArrayVertexBuffers"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glVertexArrayVertexBuffers"); if (!vao) return; @@ -2496,8 +3091,8 @@ static void -vertex_array_attrib_format(GLuint vaobj, GLuint attribIndex, GLint size, - GLenum type, GLboolean normalized, +vertex_array_attrib_format(GLuint vaobj, bool isExtDsa, GLuint attribIndex, + GLint size, GLenum type, GLboolean normalized, GLboolean integer, GLboolean doubles, GLbitfield legalTypes, GLsizei sizeMax, GLuint relativeOffset, const char *func) @@ -2514,13 +3109,7 @@ if (!vao) return; } else { - /* The ARB_direct_state_access spec says: - * - * "An INVALID_OPERATION error is generated by - * VertexArrayAttrib*Format if is not [compatibility profile: - * zero or] the name of an existing vertex array object." - */ - vao = _mesa_lookup_vao_err(ctx, vaobj, func); + vao = _mesa_lookup_vao_err(ctx, vaobj, isExtDsa, func); if (!vao) return; @@ -2556,7 +3145,7 @@ GLenum type, GLboolean normalized, GLuint relativeOffset) { - vertex_array_attrib_format(vaobj, attribIndex, size, type, normalized, + vertex_array_attrib_format(vaobj, false, attribIndex, size, type, normalized, GL_FALSE, GL_FALSE, ATTRIB_FORMAT_TYPES_MASK, BGRA_OR_4, relativeOffset, "glVertexArrayAttribFormat"); @@ -2564,11 +3153,23 @@ void GLAPIENTRY +_mesa_VertexArrayVertexAttribFormatEXT(GLuint vaobj, GLuint attribIndex, GLint size, + GLenum type, GLboolean normalized, + GLuint relativeOffset) +{ + vertex_array_attrib_format(vaobj, true, attribIndex, size, type, normalized, + GL_FALSE, GL_FALSE, ATTRIB_FORMAT_TYPES_MASK, + BGRA_OR_4, relativeOffset, + "glVertexArrayVertexAttribFormatEXT"); +} + + +void GLAPIENTRY _mesa_VertexArrayAttribIFormat(GLuint vaobj, GLuint attribIndex, GLint size, GLenum type, GLuint relativeOffset) { - vertex_array_attrib_format(vaobj, attribIndex, size, type, GL_FALSE, + vertex_array_attrib_format(vaobj, false, attribIndex, size, type, GL_FALSE, GL_TRUE, GL_FALSE, ATTRIB_IFORMAT_TYPES_MASK, 4, relativeOffset, "glVertexArrayAttribIFormat"); @@ -2576,17 +3177,41 @@ void GLAPIENTRY +_mesa_VertexArrayVertexAttribIFormatEXT(GLuint vaobj, GLuint attribIndex, + GLint size, GLenum type, + GLuint relativeOffset) +{ + vertex_array_attrib_format(vaobj, true, attribIndex, size, type, GL_FALSE, + GL_TRUE, GL_FALSE, ATTRIB_IFORMAT_TYPES_MASK, + 4, relativeOffset, + "glVertexArrayVertexAttribIFormatEXT"); +} + + +void GLAPIENTRY _mesa_VertexArrayAttribLFormat(GLuint vaobj, GLuint attribIndex, GLint size, GLenum type, GLuint relativeOffset) { - vertex_array_attrib_format(vaobj, attribIndex, size, type, GL_FALSE, + vertex_array_attrib_format(vaobj, false, attribIndex, size, type, GL_FALSE, GL_FALSE, GL_TRUE, ATTRIB_LFORMAT_TYPES_MASK, 4, relativeOffset, "glVertexArrayAttribLFormat"); } +void GLAPIENTRY +_mesa_VertexArrayVertexAttribLFormatEXT(GLuint vaobj, GLuint attribIndex, + GLint size, GLenum type, + GLuint relativeOffset) +{ + vertex_array_attrib_format(vaobj, true, attribIndex, size, type, GL_FALSE, + GL_FALSE, GL_TRUE, ATTRIB_LFORMAT_TYPES_MASK, + 4, relativeOffset, + "glVertexArrayVertexAttribLFormatEXT"); +} + + static void vertex_array_attrib_binding(struct gl_context *ctx, struct gl_vertex_array_object *vao, @@ -2684,7 +3309,7 @@ * if is not [compatibility profile: zero or] the name of an * existing vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glVertexArrayAttribBinding"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glVertexArrayAttribBinding"); if (!vao) return; @@ -2693,6 +3318,20 @@ } +void GLAPIENTRY +_mesa_VertexArrayVertexAttribBindingEXT(GLuint vaobj, GLuint attribIndex, GLuint bindingIndex) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object *vao; + vao = _mesa_lookup_vao_err(ctx, vaobj, true, "glVertexArrayVertexAttribBindingEXT"); + if (!vao) + return; + + vertex_array_attrib_binding(ctx, vao, attribIndex, bindingIndex, + "glVertexArrayVertexAttribBindingEXT"); +} + + static void vertex_array_binding_divisor(struct gl_context *ctx, struct gl_vertex_array_object *vao, @@ -2779,7 +3418,7 @@ * if is not [compatibility profile: zero or] the name of an * existing vertex array object." */ - vao = _mesa_lookup_vao_err(ctx, vaobj, "glVertexArrayBindingDivisor"); + vao = _mesa_lookup_vao_err(ctx, vaobj, false, "glVertexArrayBindingDivisor"); if (!vao) return; @@ -2788,6 +3427,28 @@ } +void GLAPIENTRY +_mesa_VertexArrayVertexBindingDivisorEXT(GLuint vaobj, GLuint bindingIndex, + GLuint divisor) +{ + struct gl_vertex_array_object *vao; + GET_CURRENT_CONTEXT(ctx); + + /* The ARB_direct_state_access specification says: + * + * "An INVALID_OPERATION error is generated by VertexArrayBindingDivisor + * if is not [compatibility profile: zero or] the name of an + * existing vertex array object." + */ + vao = _mesa_lookup_vao_err(ctx, vaobj, true, "glVertexArrayVertexBindingDivisorEXT"); + if (!vao) + return; + + vertex_array_binding_divisor(ctx, vao, bindingIndex, divisor, + "glVertexArrayVertexBindingDivisorEXT"); +} + + void _mesa_copy_vertex_attrib_array(struct gl_context *ctx, struct gl_array_attributes *dst, @@ -2884,3 +3545,266 @@ _mesa_HashDeleteAll(ctx->Array.Objects, delete_arrayobj_cb, ctx); _mesa_DeleteHashTable(ctx->Array.Objects); } + +void GLAPIENTRY +_mesa_GetVertexArrayIntegervEXT(GLuint vaobj, GLenum pname, GLint *param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao; + void* ptr; + + vao = _mesa_lookup_vao_err(ctx, vaobj, true, + "glGetVertexArrayIntegervEXT"); + if (!vao) + return; + + /* The EXT_direct_state_access spec says: + * + * "For GetVertexArrayIntegervEXT, pname must be one of the "Get value" tokens + * in tables 6.6, 6.7, 6.8, and 6.9 that use GetIntegerv, IsEnabled, or + * GetPointerv for their "Get command" (so excluding the VERTEX_ATTRIB_* + * tokens)." + */ + switch (pname) { + /* Tokens using GetIntegerv */ + case GL_CLIENT_ACTIVE_TEXTURE: + *param = GL_TEXTURE0_ARB + ctx->Array.ActiveTexture; + break; + case GL_VERTEX_ARRAY_SIZE: + *param = vao->VertexAttrib[VERT_ATTRIB_POS].Format.Size; + break; + case GL_VERTEX_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_POS].Format.Type; + break; + case GL_VERTEX_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_POS].Stride; + break; + case GL_VERTEX_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_POS].BufferObj->Name; + break; + case GL_COLOR_ARRAY_SIZE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR0].Format.Size; + break; + case GL_COLOR_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR0].Format.Type; + break; + case GL_COLOR_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR0].Stride; + break; + case GL_COLOR_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_COLOR0].BufferObj->Name; + break; + case GL_EDGE_FLAG_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_EDGEFLAG].Stride; + break; + case GL_EDGE_FLAG_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_EDGEFLAG].BufferObj->Name; + break; + case GL_INDEX_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Format.Type; + break; + case GL_INDEX_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR_INDEX].Stride; + break; + case GL_INDEX_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_COLOR_INDEX].BufferObj->Name; + break; + case GL_NORMAL_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_NORMAL].Format.Type; + break; + case GL_NORMAL_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_NORMAL].Stride; + break; + case GL_NORMAL_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_NORMAL].BufferObj->Name; + break; + case GL_TEXTURE_COORD_ARRAY_SIZE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].Format.Size; + break; + case GL_TEXTURE_COORD_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].Format.Type; + break; + case GL_TEXTURE_COORD_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].Stride; + break; + case GL_TEXTURE_COORD_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_TEX(ctx->Array.ActiveTexture)].BufferObj->Name; + break; + case GL_FOG_COORD_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_FOG].Format.Type; + break; + case GL_FOG_COORD_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_FOG].Stride; + break; + case GL_FOG_COORD_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_FOG].BufferObj->Name; + break; + case GL_SECONDARY_COLOR_ARRAY_SIZE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR1].Format.Size; + break; + case GL_SECONDARY_COLOR_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR1].Format.Type; + break; + case GL_SECONDARY_COLOR_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_COLOR1].Stride; + break; + case GL_SECONDARY_COLOR_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_COLOR1].BufferObj->Name; + break; + + /* Tokens using IsEnabled */ + case GL_VERTEX_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_POS); + break; + case GL_COLOR_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_COLOR0); + break; + case GL_EDGE_FLAG_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_EDGEFLAG); + break; + case GL_INDEX_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_COLOR_INDEX); + break; + case GL_NORMAL_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_NORMAL); + break; + case GL_TEXTURE_COORD_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_TEX(ctx->Array.ActiveTexture)); + break; + case GL_FOG_COORD_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_FOG); + break; + case GL_SECONDARY_COLOR_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_COLOR1); + break; + + /* Tokens using GetPointerv */ + case GL_VERTEX_ARRAY_POINTER: + case GL_COLOR_ARRAY_POINTER: + case GL_EDGE_FLAG_ARRAY_POINTER: + case GL_INDEX_ARRAY_POINTER: + case GL_NORMAL_ARRAY_POINTER: + case GL_TEXTURE_COORD_ARRAY_POINTER: + case GL_FOG_COORD_ARRAY_POINTER: + case GL_SECONDARY_COLOR_ARRAY_POINTER: + _get_vao_pointerv(pname, vao, &ptr, "glGetVertexArrayIntegervEXT"); + *param = (int) ((intptr_t) ptr & 0xFFFFFFFF); + break; + + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetVertexArrayIntegervEXT(pname)"); + } +} + +void GLAPIENTRY +_mesa_GetVertexArrayPointervEXT(GLuint vaobj, GLenum pname, GLvoid** param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao; + + vao = _mesa_lookup_vao_err(ctx, vaobj, true, + "glGetVertexArrayPointervEXT"); + if (!vao) + return; + + /* The EXT_direct_state_access spec says: + * + * "For GetVertexArrayPointervEXT, pname must be a *_ARRAY_POINTER token from + * tables 6.6, 6.7, and 6.8 excluding VERTEX_ATTRIB_ARRAY_POINT." + */ + switch (pname) { + case GL_VERTEX_ARRAY_POINTER: + case GL_COLOR_ARRAY_POINTER: + case GL_EDGE_FLAG_ARRAY_POINTER: + case GL_INDEX_ARRAY_POINTER: + case GL_NORMAL_ARRAY_POINTER: + case GL_TEXTURE_COORD_ARRAY_POINTER: + case GL_FOG_COORD_ARRAY_POINTER: + case GL_SECONDARY_COLOR_ARRAY_POINTER: + break; + + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetVertexArrayPointervEXT(pname)"); + return; + } + + /* pname has been validated, we can now use the helper function */ + _get_vao_pointerv(pname, vao, param, "glGetVertexArrayPointervEXT"); +} + +void GLAPIENTRY +_mesa_GetVertexArrayIntegeri_vEXT(GLuint vaobj, GLuint index, GLenum pname, GLint *param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao; + + vao = _mesa_lookup_vao_err(ctx, vaobj, true, + "glGetVertexArrayIntegeri_vEXT"); + if (!vao) + return; + + + /* The EXT_direct_state_access spec says: + * + * "For GetVertexArrayIntegeri_vEXT, pname must be one of the + * "Get value" tokens in tables 6.8 and 6.9 that use GetVertexAttribiv + * or GetVertexAttribPointerv (so allowing only the VERTEX_ATTRIB_* + * tokens) or a token of the form TEXTURE_COORD_ARRAY (the enable) or + * TEXTURE_COORD_ARRAY_*; index identifies the vertex attribute + * array to query or texture coordinate set index respectively." + */ + + switch (pname) { + case GL_TEXTURE_COORD_ARRAY: + *param = !!(vao->Enabled & VERT_BIT_TEX(index)); + break; + case GL_TEXTURE_COORD_ARRAY_SIZE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(index)].Format.Size; + break; + case GL_TEXTURE_COORD_ARRAY_TYPE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(index)].Format.Type; + break; + case GL_TEXTURE_COORD_ARRAY_STRIDE: + *param = vao->VertexAttrib[VERT_ATTRIB_TEX(index)].Stride; + break; + case GL_TEXTURE_COORD_ARRAY_BUFFER_BINDING: + *param = vao->BufferBinding[VERT_ATTRIB_TEX(index)].BufferObj->Name; + break; + default: + *param = get_vertex_array_attrib(ctx, vao, index, pname, "glGetVertexArrayIntegeri_vEXT"); + } +} + +void GLAPIENTRY +_mesa_GetVertexArrayPointeri_vEXT(GLuint vaobj, GLuint index, GLenum pname, GLvoid** param) +{ + GET_CURRENT_CONTEXT(ctx); + struct gl_vertex_array_object* vao; + + vao = _mesa_lookup_vao_err(ctx, vaobj, true, + "glGetVertexArrayPointeri_vEXT"); + if (!vao) + return; + + if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs) { + _mesa_error(ctx, GL_INVALID_VALUE, "glGetVertexArrayPointeri_vEXT(index)"); + return; + } + + /* The EXT_direct_state_access spec says: + * + * "For GetVertexArrayPointeri_vEXT, pname must be VERTEX_ATTRIB_ARRAY_POINTER + * or TEXTURE_COORD_ARRAY_POINTER with the index parameter indicating the vertex + * attribute or texture coordindate set index." + */ + switch(pname) { + case GL_VERTEX_ATTRIB_ARRAY_POINTER: + *param = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_GENERIC(index)].Ptr; + break; + case GL_TEXTURE_COORD_ARRAY_POINTER: + *param = (GLvoid *) vao->VertexAttrib[VERT_ATTRIB_TEX(index)].Ptr; + break; + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetVertexArrayPointeri_vEXT(pname)"); + } +} diff -Nru mesa-19.2.8/src/mesa/main/varray.h mesa-20.0.8/src/mesa/main/varray.h --- mesa-19.2.8/src/mesa/main/varray.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/varray.h 2020-06-12 01:21:18.000000000 +0000 @@ -238,6 +238,8 @@ extern void GLAPIENTRY _mesa_EnableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index); +extern void GLAPIENTRY +_mesa_EnableVertexArrayAttribEXT( GLuint vaobj, GLuint index ); extern void GLAPIENTRY @@ -253,6 +255,8 @@ extern void GLAPIENTRY _mesa_DisableVertexArrayAttrib_no_error(GLuint vaobj, GLuint index); +extern void GLAPIENTRY +_mesa_DisableVertexArrayAttribEXT( GLuint vaobj, GLuint index ); extern void GLAPIENTRY _mesa_GetVertexAttribdv(GLuint index, GLenum pname, GLdouble *params); @@ -313,6 +317,8 @@ _mesa_VertexAttribDivisor_no_error(GLuint index, GLuint divisor); extern void GLAPIENTRY _mesa_VertexAttribDivisor(GLuint index, GLuint divisor); +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribDivisorEXT(GLuint vaobj, GLuint index, GLuint divisor); static inline unsigned _mesa_primitive_restart_index(const struct gl_context *ctx, @@ -349,6 +355,10 @@ _mesa_VertexArrayVertexBuffer(GLuint vaobj, GLuint bindingIndex, GLuint buffer, GLintptr offset, GLsizei stride); +extern void GLAPIENTRY +_mesa_VertexArrayBindVertexBufferEXT(GLuint vaobj, GLuint bindingIndex, GLuint buffer, + GLintptr offset, GLsizei stride); + void GLAPIENTRY _mesa_BindVertexBuffers_no_error(GLuint first, GLsizei count, const GLuint *buffers, const GLintptr *offsets, @@ -379,6 +389,11 @@ GLuint relativeOffset); extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribFormatEXT(GLuint vaobj, GLuint attribIndex, GLint size, + GLenum type, GLboolean normalized, + GLuint relativeOffset); + +extern void GLAPIENTRY _mesa_VertexAttribIFormat(GLuint attribIndex, GLint size, GLenum type, GLuint relativeOffset); @@ -388,6 +403,11 @@ GLuint relativeOffset); extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribIFormatEXT(GLuint vaobj, GLuint attribIndex, + GLint size, GLenum type, + GLuint relativeOffset); + +extern void GLAPIENTRY _mesa_VertexAttribLFormat(GLuint attribIndex, GLint size, GLenum type, GLuint relativeOffset); @@ -396,6 +416,11 @@ GLint size, GLenum type, GLuint relativeOffset); +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribLFormatEXT(GLuint vaobj, GLuint attribIndex, + GLint size, GLenum type, + GLuint relativeOffset); + void GLAPIENTRY _mesa_VertexAttribBinding_no_error(GLuint attribIndex, GLuint bindingIndex); @@ -410,6 +435,10 @@ _mesa_VertexArrayAttribBinding(GLuint vaobj, GLuint attribIndex, GLuint bindingIndex); +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribBindingEXT(GLuint vaobj, GLuint attribIndex, + GLuint bindingIndex); + void GLAPIENTRY _mesa_VertexBindingDivisor_no_error(GLuint bindingIndex, GLuint divisor); @@ -424,6 +453,10 @@ _mesa_VertexArrayBindingDivisor(GLuint vaobj, GLuint bindingIndex, GLuint divisor); +extern void GLAPIENTRY +_mesa_VertexArrayVertexBindingDivisorEXT(GLuint vaobj, GLuint bindingIndex, + GLuint divisor); + extern void _mesa_copy_vertex_attrib_array(struct gl_context *ctx, @@ -444,4 +477,66 @@ extern void _mesa_free_varray_data(struct gl_context *ctx); +extern void GLAPIENTRY +_mesa_VertexArrayVertexOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayColorOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayEdgeFlagOffsetEXT(GLuint vaobj, GLuint buffer, GLsizei stride, + GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayIndexOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayNormalOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayTexCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayMultiTexCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLenum texunit, + GLint size, GLenum type, GLsizei stride, + GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayFogCoordOffsetEXT(GLuint vaobj, GLuint buffer, GLenum type, + GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArraySecondaryColorOffsetEXT(GLuint vaobj, GLuint buffer, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLboolean normalized, + GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribIOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_VertexArrayVertexAttribLOffsetEXT(GLuint vaobj, GLuint buffer, GLuint index, GLint size, + GLenum type, GLsizei stride, GLintptr offset); + +extern void GLAPIENTRY +_mesa_GetVertexArrayIntegervEXT(GLuint vaobj, GLenum pname, GLint *param); + +extern void GLAPIENTRY +_mesa_GetVertexArrayPointervEXT(GLuint vaobj, GLenum pname, GLvoid** param); + +extern void GLAPIENTRY +_mesa_GetVertexArrayIntegeri_vEXT(GLuint vaobj, GLuint index, GLenum pname, GLint *param); + +extern void GLAPIENTRY +_mesa_GetVertexArrayPointeri_vEXT(GLuint vaobj, GLuint index, GLenum pname, GLvoid** param); + #endif diff -Nru mesa-19.2.8/src/mesa/main/version.c mesa-20.0.8/src/mesa/main/version.c --- mesa-19.2.8/src/mesa/main/version.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/main/version.c 2020-06-12 01:21:18.000000000 +0000 @@ -384,8 +384,8 @@ extensions->NV_texture_barrier); const bool ver_4_6 = (ver_4_5 && consts->GLSLVersion >= 460 && - /* extensions->ARB_gl_spirv */ 0 && - /* extensions->ARB_spirv_extensions */ 0 && + extensions->ARB_gl_spirv && + extensions->ARB_spirv_extensions && extensions->ARB_indirect_parameters && extensions->ARB_pipeline_statistics_query && extensions->ARB_polygon_offset_clamp && diff -Nru mesa-19.2.8/src/mesa/Makefile.sources mesa-20.0.8/src/mesa/Makefile.sources --- mesa-19.2.8/src/mesa/Makefile.sources 2019-12-18 19:04:21.000000000 +0000 +++ mesa-20.0.8/src/mesa/Makefile.sources 2020-06-12 01:21:18.000000000 +0000 @@ -149,8 +149,6 @@ main/matrix.h \ main/mipmap.c \ main/mipmap.h \ - main/mm.c \ - main/mm.h \ main/menums.h \ main/mtypes.h \ main/multisample.c \ @@ -485,6 +483,8 @@ state_tracker/st_cb_msaa.h \ state_tracker/st_cb_perfmon.c \ state_tracker/st_cb_perfmon.h \ + state_tracker/st_cb_perfquery.c \ + state_tracker/st_cb_perfquery.h \ state_tracker/st_cb_program.c \ state_tracker/st_cb_program.h \ state_tracker/st_cb_queryobj.c \ @@ -534,8 +534,6 @@ state_tracker/st_glsl_to_tgsi_private.h \ state_tracker/st_glsl_to_tgsi_temprename.cpp \ state_tracker/st_glsl_to_tgsi_temprename.h \ - state_tracker/st_glsl_types.cpp \ - state_tracker/st_glsl_types.h \ state_tracker/st_manager.c \ state_tracker/st_manager.h \ state_tracker/st_mesa_to_tgsi.c \ diff -Nru mesa-19.2.8/src/mesa/math/m_debug.h mesa-20.0.8/src/mesa/math/m_debug.h --- mesa-19.2.8/src/mesa/math/m_debug.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/math/m_debug.h 2020-06-12 01:21:18.000000000 +0000 @@ -33,10 +33,4 @@ extern void _math_test_all_normal_transform_functions( char *description ); extern void _math_test_all_cliptest_functions( char *description ); -/* Deprecated? - */ -extern void _math_test_all_vertex_functions( char *description ); - -extern char *mesa_profile; - #endif diff -Nru mesa-19.2.8/src/mesa/meson.build mesa-20.0.8/src/mesa/meson.build --- mesa-19.2.8/src/mesa/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -189,8 +189,6 @@ 'main/matrix.h', 'main/mipmap.c', 'main/mipmap.h', - 'main/mm.c', - 'main/mm.h', 'main/menums.h', 'main/mtypes.h', 'main/multisample.c', @@ -529,6 +527,8 @@ 'state_tracker/st_cb_msaa.h', 'state_tracker/st_cb_perfmon.c', 'state_tracker/st_cb_perfmon.h', + 'state_tracker/st_cb_perfquery.c', + 'state_tracker/st_cb_perfquery.h', 'state_tracker/st_cb_program.c', 'state_tracker/st_cb_program.h', 'state_tracker/st_cb_queryobj.c', @@ -578,8 +578,6 @@ 'state_tracker/st_glsl_to_tgsi_private.h', 'state_tracker/st_glsl_to_tgsi_temprename.cpp', 'state_tracker/st_glsl_to_tgsi_temprename.h', - 'state_tracker/st_glsl_types.cpp', - 'state_tracker/st_glsl_types.h', 'state_tracker/st_manager.c', 'state_tracker/st_manager.h', 'state_tracker/st_mesa_to_tgsi.c', @@ -697,6 +695,10 @@ main_remap_helper_h, sha1_h, ] +files_libmesa_gallium += [ + ir_expression_operation_h, + sha1_h, +] if with_sse41 libmesa_sse41 = static_library( @@ -709,24 +711,46 @@ libmesa_sse41 = [] endif +_mesa_windows_args = [] +if with_platform_windows + _mesa_windows_args += [ + '-D_GDI32_', # prevent gl* being declared __declspec(dllimport) in MS headers + '-DBUILD_GL32' # declare gl* as __declspec(dllexport) in Mesa headers + ] + if not with_shared_glapi + # prevent _glapi_* from being declared __declspec(dllimport) + _mesa_windows_args += '-D_GLAPI_NO_EXPORTS' + endif +endif + +libmesa_common = static_library( + 'mesa_common', + files_libmesa_common, + c_args : [c_vis_args, c_msvc_compat_args, _mesa_windows_args], + cpp_args : [cpp_vis_args, cpp_msvc_compat_args, _mesa_windows_args], + include_directories : [inc_common, inc_libmesa_asm, include_directories('main')], + dependencies : idep_nir_headers, + build_by_default : false, +) + libmesa_classic = static_library( 'mesa_classic', - [files_libmesa_common, files_libmesa_classic], + files_libmesa_classic, c_args : [c_vis_args, c_msvc_compat_args], cpp_args : [cpp_vis_args, cpp_msvc_compat_args], include_directories : [inc_common, inc_libmesa_asm, include_directories('main')], - link_with : [libglsl, libmesa_sse41], + link_with : [libmesa_common, libglsl, libmesa_sse41], dependencies : idep_nir_headers, build_by_default : false, ) libmesa_gallium = static_library( 'mesa_gallium', - [files_libmesa_common, files_libmesa_gallium], - c_args : [c_vis_args, c_msvc_compat_args], - cpp_args : [cpp_vis_args, cpp_msvc_compat_args], + files_libmesa_gallium, + c_args : [c_vis_args, c_msvc_compat_args, _mesa_windows_args], + cpp_args : [cpp_vis_args, cpp_msvc_compat_args, _mesa_windows_args], include_directories : [inc_common, inc_libmesa_asm, include_directories('main')], - link_with : [libglsl, libmesa_sse41], + link_with : [libmesa_common, libglsl, libmesa_sse41], dependencies : [idep_nir_headers, dep_vdpau], build_by_default : false, ) diff -Nru mesa-19.2.8/src/mesa/program/ir_to_mesa.cpp mesa-20.0.8/src/mesa/program/ir_to_mesa.cpp --- mesa-19.2.8/src/mesa/program/ir_to_mesa.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/ir_to_mesa.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -252,6 +252,7 @@ virtual void visit(ir_call *); virtual void visit(ir_return *); virtual void visit(ir_discard *); + virtual void visit(ir_demote *); virtual void visit(ir_texture *); virtual void visit(ir_if *); virtual void visit(ir_emit_vertex *); @@ -499,83 +500,9 @@ } static int -storage_type_size(const struct glsl_type *type, bool bindless) -{ - unsigned int i; - int size; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_FLOAT16: - case GLSL_TYPE_BOOL: - if (type->is_matrix()) { - return type->matrix_columns; - } else { - /* Regardless of size of vector, it gets a vec4. This is bad - * packing for things like floats, but otherwise arrays become a - * mess. Hopefully a later pass over the code can pack scalars - * down if appropriate. - */ - return 1; - } - break; - case GLSL_TYPE_DOUBLE: - if (type->is_matrix()) { - if (type->vector_elements > 2) - return type->matrix_columns * 2; - else - return type->matrix_columns; - } else { - if (type->vector_elements > 2) - return 2; - else - return 1; - } - break; - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - if (type->vector_elements > 2) - return 2; - else - return 1; - case GLSL_TYPE_ARRAY: - assert(type->length > 0); - return storage_type_size(type->fields.array, bindless) * type->length; - case GLSL_TYPE_STRUCT: - size = 0; - for (i = 0; i < type->length; i++) { - size += storage_type_size(type->fields.structure[i].type, bindless); - } - return size; - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: - if (!bindless) - return 0; - /* fall through */ - case GLSL_TYPE_SUBROUTINE: - return 1; - case GLSL_TYPE_ATOMIC_UINT: - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_INTERFACE: - case GLSL_TYPE_FUNCTION: - assert(!"Invalid type in type_size"); - break; - } - - return 0; -} - -static int type_size(const struct glsl_type *type) { - return storage_type_size(type, false); + return type->count_vec4_slots(false, false); } /** @@ -1369,6 +1296,12 @@ case ir_binop_ldexp: case ir_binop_carry: case ir_binop_borrow: + case ir_binop_abs_sub: + case ir_binop_add_sat: + case ir_binop_sub_sat: + case ir_binop_avg: + case ir_binop_avg_round: + case ir_binop_mul_32x16: case ir_binop_imul_high: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: @@ -1411,6 +1344,9 @@ case ir_unop_unpack_sampler_2x32: case ir_unop_pack_image_2x32: case ir_unop_unpack_image_2x32: + case ir_unop_atan: + case ir_binop_atan2: + case ir_unop_clz: assert(!"not supported"); break; @@ -2202,6 +2138,12 @@ } void +ir_to_mesa_visitor::visit(ir_demote *ir) +{ + assert(!"demote statement unsupported"); +} + +void ir_to_mesa_visitor::visit(ir_if *ir) { ir_to_mesa_instruction *if_inst; @@ -2390,7 +2332,7 @@ add_uniform_to_shader(struct gl_context *ctx, struct gl_shader_program *shader_program, struct gl_program_parameter_list *params) - : ctx(ctx), params(params), idx(-1) + : ctx(ctx), shader_program(shader_program), params(params), idx(-1) { /* empty */ } @@ -2411,6 +2353,7 @@ bool last_field); struct gl_context *ctx; + struct gl_shader_program *shader_program; struct gl_program_parameter_list *params; int idx; ir_variable *var; @@ -2472,6 +2415,21 @@ */ if (this->idx < 0) this->idx = index; + + /* Each Parameter will hold the index to the backing uniform storage. + * This avoids relying on names to match parameters and uniform + * storages later when associating uniform storage. + */ + unsigned location; + const bool found = + shader_program->UniformHash->get(location, params->Parameters[index].Name); + assert(found); + + for (unsigned i = 0; i < num_params; i++) { + struct gl_program_parameter *param = ¶ms->Parameters[index + i]; + param->UniformStorageIndex = location; + param->MainUniformStorageIndex = params->Parameters[this->idx].UniformStorageIndex; + } } /** @@ -2520,13 +2478,7 @@ if (params->Parameters[i].Type != PROGRAM_UNIFORM) continue; - unsigned location; - const bool found = - shader_program->UniformHash->get(location, params->Parameters[i].Name); - assert(found); - - if (!found) - continue; + unsigned location = params->Parameters[i].UniformStorageIndex; struct gl_uniform_storage *storage = &shader_program->data->UniformStorage[location]; @@ -3108,7 +3060,7 @@ } } - build_program_resource_list(ctx, prog); + build_program_resource_list(ctx, prog, false); return prog->data->LinkStatus; } @@ -3168,6 +3120,9 @@ prog->data->LinkStatus = LINKING_FAILURE; } + if (prog->data->LinkStatus != LINKING_FAILURE) + _mesa_create_program_resource_hash(prog); + /* Return early if we are loading the shader from on-disk cache */ if (prog->data->LinkStatus == LINKING_SKIPPED) return; diff -Nru mesa-19.2.8/src/mesa/program/prog_parameter.h mesa-20.0.8/src/mesa/program/prog_parameter.h --- mesa-19.2.8/src/mesa/program/prog_parameter.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/prog_parameter.h 2020-06-12 01:21:18.000000000 +0000 @@ -91,8 +91,16 @@ struct gl_program_parameter { const char *Name; /**< Null-terminated string */ - gl_register_file Type:16; /**< PROGRAM_CONSTANT or STATE_VAR */ + gl_register_file Type:5; /**< PROGRAM_CONSTANT or STATE_VAR */ + + /** + * We need to keep track of whether the param is padded for use in the + * shader cache. + */ + bool Padded:1; + GLenum16 DataType; /**< GL_FLOAT, GL_FLOAT_VEC2, etc */ + /** * Number of components (1..4), or more. * If the number of components is greater than 4, @@ -106,10 +114,15 @@ gl_state_index16 StateIndexes[STATE_LENGTH]; /** - * We need to keep track of whether the param is padded for use in the - * shader cache. + * Index of this parameter's uniform storage. + */ + uint32_t UniformStorageIndex; + + /** + * Index of the first uniform storage that is associated with the same + * variable as this parameter. */ - bool Padded; + uint32_t MainUniformStorageIndex; }; diff -Nru mesa-19.2.8/src/mesa/program/prog_statevars.c mesa-20.0.8/src/mesa/program/prog_statevars.c --- mesa-19.2.8/src/mesa/program/prog_statevars.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/prog_statevars.c 2020-06-12 01:21:18.000000000 +0000 @@ -602,6 +602,17 @@ ctx->Color.BlendEnabled, ctx->Color._AdvancedBlendMode); return; + case STATE_ALPHA_REF: + value[0] = ctx->Color.AlphaRefUnclamped; + return; + + case STATE_CLIP_INTERNAL: + { + const GLuint plane = (GLuint) state[2]; + COPY_4V(value, ctx->Transform._ClipUserPlane[plane]); + } + return; + /* XXX: make sure new tokens added here are also handled in the * _mesa_program_state_flags() switch, below. */ @@ -713,6 +724,12 @@ case STATE_ADVANCED_BLENDING_MODE: return _NEW_COLOR; + case STATE_ALPHA_REF: + return _NEW_COLOR; + + case STATE_CLIP_INTERNAL: + return _NEW_TRANSFORM | _NEW_PROJECTION; + default: /* unknown state indexes are silently ignored and * no flag set, since it is handled by the driver. @@ -919,6 +936,12 @@ case STATE_ADVANCED_BLENDING_MODE: append(dst, "AdvancedBlendingMode"); break; + case STATE_ALPHA_REF: + append(dst, "alphaRef"); + break; + case STATE_CLIP_INTERNAL: + append(dst, "clipInternal"); + break; default: /* probably STATE_INTERNAL_DRIVER+i (driver private state) */ append(dst, "driverState"); diff -Nru mesa-19.2.8/src/mesa/program/prog_statevars.h mesa-20.0.8/src/mesa/program/prog_statevars.h --- mesa-19.2.8/src/mesa/program/prog_statevars.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/prog_statevars.h 2020-06-12 01:21:18.000000000 +0000 @@ -129,6 +129,8 @@ * currently active advanced blending equation, or zero if disabled. */ STATE_ADVANCED_BLENDING_MODE, + STATE_ALPHA_REF, /* alpha-test reference value */ + STATE_CLIP_INTERNAL, /* similar to STATE_CLIPPLANE, but in clip-space */ STATE_INTERNAL_DRIVER /* first available state index for drivers (must be last) */ } gl_state_index; diff -Nru mesa-19.2.8/src/mesa/program/prog_to_nir.c mesa-20.0.8/src/mesa/program/prog_to_nir.c --- mesa-19.2.8/src/mesa/program/prog_to_nir.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/prog_to_nir.c 2020-06-12 01:21:18.000000000 +0000 @@ -456,7 +456,10 @@ static void ptn_kil(nir_builder *b, nir_ssa_def **src) { + /* flt must be exact, because NaN shouldn't discard. (apps rely on this) */ + b->exact = true; nir_ssa_def *cmp = nir_bany(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))); + b->exact = false; nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); @@ -838,8 +841,9 @@ src = nir_channel(b, src, 2); } if (c->prog->Target == GL_VERTEX_PROGRAM_ARB && - var->data.location == VARYING_SLOT_FOGC) { - /* result.fogcoord is a single component value */ + (var->data.location == VARYING_SLOT_FOGC || + var->data.location == VARYING_SLOT_PSIZ)) { + /* result.{fogcoord,psiz} is a single component value */ src = nir_channel(b, src, 0); } unsigned num_components = glsl_get_vector_elements(var->type); @@ -926,7 +930,8 @@ nir_variable *var = rzalloc(shader, nir_variable); if ((c->prog->Target == GL_FRAGMENT_PROGRAM_ARB && i == FRAG_RESULT_DEPTH) || - (c->prog->Target == GL_VERTEX_PROGRAM_ARB && i == VARYING_SLOT_FOGC)) + (c->prog->Target == GL_VERTEX_PROGRAM_ARB && i == VARYING_SLOT_FOGC) || + (c->prog->Target == GL_VERTEX_PROGRAM_ARB && i == VARYING_SLOT_PSIZ)) var->type = glsl_float_type(); else var->type = glsl_vec4_type(); diff -Nru mesa-19.2.8/src/mesa/program/symbol_table.c mesa-20.0.8/src/mesa/program/symbol_table.c --- mesa-19.2.8/src/mesa/program/symbol_table.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/program/symbol_table.c 2020-06-12 01:21:18.000000000 +0000 @@ -292,7 +292,7 @@ struct _mesa_symbol_table *table = calloc(1, sizeof(*table)); if (table != NULL) { - table->ht = _mesa_hash_table_create(NULL, _mesa_key_hash_string, + table->ht = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); _mesa_symbol_table_push_scope(table); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atifs_to_tgsi.c mesa-20.0.8/src/mesa/state_tracker/st_atifs_to_tgsi.c --- mesa-19.2.8/src/mesa/state_tracker/st_atifs_to_tgsi.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atifs_to_tgsi.c 2020-06-12 01:21:18.000000000 +0000 @@ -541,7 +541,7 @@ st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog) { /* we know this is st_fragment_program, because of st_new_ati_fs() */ - struct st_fragment_program *stfp = (struct st_fragment_program *) prog; + struct st_program *stfp = (struct st_program *) prog; struct ati_fragment_shader *atifs = stfp->ati_fs; unsigned pass, i, r, optype, arg; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_array.c mesa-20.0.8/src/mesa/state_tracker/st_atom_array.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_array.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_array.c 2020-06-12 01:21:18.000000000 +0000 @@ -364,36 +364,19 @@ } } -static void -set_vertex_attribs(struct st_context *st, - struct pipe_vertex_buffer *vbuffers, - unsigned num_vbuffers, - struct pipe_vertex_element *velements, - unsigned num_velements) -{ - struct cso_context *cso = st->cso_context; - - cso_set_vertex_buffers(cso, 0, num_vbuffers, vbuffers); - if (st->last_num_vbuffers > num_vbuffers) { - /* Unbind remaining buffers, if any. */ - cso_set_vertex_buffers(cso, num_vbuffers, - st->last_num_vbuffers - num_vbuffers, NULL); - } - st->last_num_vbuffers = num_vbuffers; - cso_set_vertex_elements(cso, num_velements, velements); -} - void st_setup_arrays(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, - struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers) + struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers, + bool *has_user_vertex_buffers) { struct gl_context *ctx = st->ctx; const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO; const GLbitfield inputs_read = vp_variant->vert_attrib_mask; const ubyte *input_to_index = vp->input_to_index; + bool uses_user_vertex_buffers = false; /* Process attribute array data. */ GLbitfield mask = inputs_read & _mesa_draw_array_bits(ctx); @@ -429,6 +412,7 @@ vbuffer[bufidx].is_user_buffer = true; vbuffer[bufidx].buffer_offset = 0; + uses_user_vertex_buffers = true; if (!binding->InstanceDivisor) st->draw_needs_minmax_index = true; } @@ -451,12 +435,13 @@ input_to_index[attr]); } } + *has_user_vertex_buffers = uses_user_vertex_buffers; } void st_setup_current(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers) { @@ -466,8 +451,6 @@ /* Process values that should have better been uniforms in the application */ GLbitfield curmask = inputs_read & _mesa_draw_current_bits(ctx); if (curmask) { - /* vertex program validation must be done before this */ - const struct st_vertex_program *vp = st->vp; const ubyte *input_to_index = vp->input_to_index; /* For each attribute, upload the maximum possible size. */ GLubyte data[VERT_ATTRIB_MAX * sizeof(GLdouble) * 4]; @@ -518,7 +501,7 @@ void st_setup_current_user(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers) { @@ -550,19 +533,21 @@ { /* vertex program validation must be done before this */ /* _NEW_PROGRAM, ST_NEW_VS_STATE */ - const struct st_vertex_program *vp = st->vp; - const struct st_vp_variant *vp_variant = st->vp_variant; + const struct st_vertex_program *vp = (struct st_vertex_program *)st->vp; + const struct st_common_variant *vp_variant = st->vp_variant; struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS]; unsigned num_vbuffers = 0, first_upload_vbuffer; struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS]; unsigned num_velements; + bool uses_user_vertex_buffers; st->draw_needs_minmax_index = false; /* ST_NEW_VERTEX_ARRAYS alias ctx->DriverFlags.NewArray */ /* Setup arrays */ - st_setup_arrays(st, vp, vp_variant, velements, vbuffer, &num_vbuffers); + st_setup_arrays(st, vp, vp_variant, velements, vbuffer, &num_vbuffers, + &uses_user_vertex_buffers); /* _NEW_CURRENT_ATTRIB */ /* Setup current uploads */ @@ -570,8 +555,18 @@ st_setup_current(st, vp, vp_variant, velements, vbuffer, &num_vbuffers); /* Set the array into cso */ - num_velements = vp_variant->num_inputs; - set_vertex_attribs(st, vbuffer, num_vbuffers, velements, num_velements); + num_velements = vp->num_inputs + vp_variant->key.passthrough_edgeflags; + + /* Set vertex buffers and elements. */ + struct cso_context *cso = st->cso_context; + unsigned unbind_trailing_vbuffers = + st->last_num_vbuffers > num_vbuffers ? + st->last_num_vbuffers - num_vbuffers : 0; + cso_set_vertex_buffers_and_elements(cso, num_velements, velements, + num_vbuffers, + unbind_trailing_vbuffers, + vbuffer, uses_user_vertex_buffers); + st->last_num_vbuffers = num_vbuffers; /* Unreference uploaded buffer resources. */ for (unsigned i = first_upload_vbuffer; i < num_vbuffers; ++i) { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_atomicbuf.c mesa-20.0.8/src/mesa/state_tracker/st_atom_atomicbuf.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_atomicbuf.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_atomicbuf.c 2020-06-12 01:21:18.000000000 +0000 @@ -66,13 +66,19 @@ static void st_bind_atomics(struct st_context *st, struct gl_program *prog, - enum pipe_shader_type shader_type) + gl_shader_stage stage) { unsigned i; + enum pipe_shader_type shader_type = pipe_shader_type_from_mesa(stage); if (!prog || !st->pipe->set_shader_buffers || st->has_hw_atomics) return; + /* For !has_hw_atomics, the atomic counters have been rewritten to be above + * the SSBOs used by the program. + */ + unsigned buffer_base = prog->info.num_ssbos; + unsigned used_bindings = 0; for (i = 0; i < prog->sh.data->NumAtomicBuffers; i++) { struct gl_active_atomic_buffer *atomic = &prog->sh.data->AtomicBuffers[i]; @@ -81,8 +87,10 @@ st_binding_to_sb(&st->ctx->AtomicBufferBindings[atomic->Binding], &sb); st->pipe->set_shader_buffers(st->pipe, shader_type, - atomic->Binding, 1, &sb, 0x1); + buffer_base + atomic->Binding, 1, &sb, 0x1); + used_bindings = MAX2(atomic->Binding + 1, used_bindings); } + st->last_used_atomic_bindings[shader_type] = used_bindings; } void @@ -91,7 +99,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; - st_bind_atomics(st, prog, PIPE_SHADER_VERTEX); + st_bind_atomics(st, prog, MESA_SHADER_VERTEX); } void @@ -100,7 +108,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT]; - st_bind_atomics(st, prog, PIPE_SHADER_FRAGMENT); + st_bind_atomics(st, prog, MESA_SHADER_FRAGMENT); } void @@ -109,7 +117,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]; - st_bind_atomics(st, prog, PIPE_SHADER_GEOMETRY); + st_bind_atomics(st, prog, MESA_SHADER_GEOMETRY); } void @@ -118,7 +126,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]; - st_bind_atomics(st, prog, PIPE_SHADER_TESS_CTRL); + st_bind_atomics(st, prog, MESA_SHADER_TESS_CTRL); } void @@ -127,7 +135,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; - st_bind_atomics(st, prog, PIPE_SHADER_TESS_EVAL); + st_bind_atomics(st, prog, MESA_SHADER_TESS_EVAL); } void @@ -140,7 +148,7 @@ struct gl_program *prog = st->ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE]; - st_bind_atomics(st, prog, PIPE_SHADER_COMPUTE); + st_bind_atomics(st, prog, MESA_SHADER_COMPUTE); } void diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_blend.c mesa-20.0.8/src/mesa/state_tracker/st_atom_blend.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_blend.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_blend.c 2020-06-12 01:21:18.000000000 +0000 @@ -223,7 +223,7 @@ !blend->rt[i].colormask) continue; - if (ctx->Extensions.ARB_draw_buffers_blend) + if (ctx->Extensions.ARB_draw_buffers_blend) j = i; blend->rt[i].blend_enable = 1; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom.c mesa-20.0.8/src/mesa/state_tracker/st_atom.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom.c 2020-06-12 01:21:18.000000000 +0000 @@ -67,11 +67,11 @@ static void check_program_state( struct st_context *st ) { struct gl_context *ctx = st->ctx; - struct st_vertex_program *old_vp = st->vp; - struct st_common_program *old_tcp = st->tcp; - struct st_common_program *old_tep = st->tep; - struct st_common_program *old_gp = st->gp; - struct st_fragment_program *old_fp = st->fp; + struct st_program *old_vp = st->vp; + struct st_program *old_tcp = st->tcp; + struct st_program *old_tep = st->tep; + struct st_program *old_gp = st->gp; + struct st_program *old_fp = st->fp; struct gl_program *new_vp = ctx->VertexProgram._Current; struct gl_program *new_tcp = ctx->TessCtrlProgram._Current; @@ -84,39 +84,39 @@ /* Flag states used by both new and old shaders to unbind shader resources * properly when transitioning to shaders that don't use them. */ - if (unlikely(new_vp != &old_vp->Base)) { + if (unlikely(new_vp != (old_vp ? &old_vp->Base : NULL))) { if (old_vp) dirty |= old_vp->affected_states; if (new_vp) - dirty |= ST_NEW_VERTEX_PROGRAM(st, st_vertex_program(new_vp)); + dirty |= ST_NEW_VERTEX_PROGRAM(st, st_program(new_vp)); } if (unlikely(new_tcp != &old_tcp->Base)) { if (old_tcp) dirty |= old_tcp->affected_states; if (new_tcp) - dirty |= st_common_program(new_tcp)->affected_states; + dirty |= st_program(new_tcp)->affected_states; } if (unlikely(new_tep != &old_tep->Base)) { if (old_tep) dirty |= old_tep->affected_states; if (new_tep) - dirty |= st_common_program(new_tep)->affected_states; + dirty |= st_program(new_tep)->affected_states; } if (unlikely(new_gp != &old_gp->Base)) { if (old_gp) dirty |= old_gp->affected_states; if (new_gp) - dirty |= st_common_program(new_gp)->affected_states; + dirty |= st_program(new_gp)->affected_states; } if (unlikely(new_fp != &old_fp->Base)) { if (old_fp) dirty |= old_fp->affected_states; if (new_fp) - dirty |= st_fragment_program(new_fp)->affected_states; + dirty |= st_program(new_fp)->affected_states; } /* Find out the number of viewports. This determines how many scissors @@ -153,7 +153,7 @@ if (vertdata_edgeflags != st->vertdata_edgeflags) { st->vertdata_edgeflags = vertdata_edgeflags; if (vp) - st->dirty |= ST_NEW_VERTEX_PROGRAM(st, st_vertex_program(vp)); + st->dirty |= ST_NEW_VERTEX_PROGRAM(st, st_program(vp)); } edgeflag_culls_prims = edgeflags_enabled && !vertdata_edgeflags && @@ -219,14 +219,14 @@ break; case ST_PIPELINE_COMPUTE: { - struct st_compute_program *old_cp = st->cp; + struct st_program *old_cp = st->cp; struct gl_program *new_cp = ctx->ComputeProgram._Current; if (new_cp != &old_cp->Base) { if (old_cp) st->dirty |= old_cp->affected_states; assert(new_cp); - st->dirty |= st_compute_program(new_cp)->affected_states; + st->dirty |= st_program(new_cp)->affected_states; } st->compute_shader_may_be_dirty = false; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_constbuf.c mesa-20.0.8/src/mesa/state_tracker/st_atom_constbuf.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_constbuf.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_constbuf.c 2020-06-12 01:21:18.000000000 +0000 @@ -155,7 +155,7 @@ void st_update_gs_constants(struct st_context *st) { - struct st_common_program *gp = st->gp; + struct st_program *gp = st->gp; if (gp) st_upload_constants(st, &gp->Base); @@ -166,7 +166,7 @@ void st_update_tcs_constants(struct st_context *st) { - struct st_common_program *tcp = st->tcp; + struct st_program *tcp = st->tcp; if (tcp) st_upload_constants(st, &tcp->Base); @@ -177,7 +177,7 @@ void st_update_tes_constants(struct st_context *st) { - struct st_common_program *tep = st->tep; + struct st_program *tep = st->tep; if (tep) st_upload_constants(st, &tep->Base); @@ -188,7 +188,7 @@ void st_update_cs_constants(struct st_context *st) { - struct st_compute_program *cp = st->cp; + struct st_program *cp = st->cp; if (cp) st_upload_constants(st, &cp->Base); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_framebuffer.c mesa-20.0.8/src/mesa/state_tracker/st_atom_framebuffer.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_framebuffer.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_framebuffer.c 2020-06-12 01:21:18.000000000 +0000 @@ -43,7 +43,7 @@ #include "cso_cache/cso_context.h" #include "util/u_math.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_framebuffer.h" #include "main/framebuffer.h" diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom.h mesa-20.0.8/src/mesa/state_tracker/st_atom.h --- mesa-19.2.8/src/mesa/state_tracker/st_atom.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom.h 2020-06-12 01:21:18.000000000 +0000 @@ -38,7 +38,7 @@ struct st_context; struct st_vertex_program; -struct st_vp_variant; +struct st_common_variant; struct pipe_vertex_buffer; struct pipe_vertex_element; @@ -61,21 +61,22 @@ void st_setup_arrays(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, - struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers); + struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers, + bool *has_user_vertex_buffers); void st_setup_current(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers); void st_setup_current_user(struct st_context *st, const struct st_vertex_program *vp, - const struct st_vp_variant *vp_variant, + const struct st_common_variant *vp_variant, struct pipe_vertex_element *velements, struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers); @@ -110,7 +111,7 @@ ST_NEW_SAMPLE_STATE | \ ST_NEW_SAMPLE_SHADING) -#define ST_NEW_VERTEX_PROGRAM(st, p) (p->affected_states | \ +#define ST_NEW_VERTEX_PROGRAM(st, p) ((p)->affected_states | \ (st_user_clip_planes_enabled(st->ctx) ? \ ST_NEW_CLIP_STATE : 0)) diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_list.h mesa-20.0.8/src/mesa/state_tracker/st_atom_list.h --- mesa-19.2.8/src/mesa/state_tracker/st_atom_list.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_list.h 2020-06-12 01:21:18.000000000 +0000 @@ -57,6 +57,9 @@ ST_STATE(ST_NEW_FS_ATOMICS, st_bind_fs_atomics) ST_STATE(ST_NEW_GS_ATOMICS, st_bind_gs_atomics) +/* SSBOs depend on the _atomics having been updated first in the + * !has_hw_atomics case. + */ ST_STATE(ST_NEW_VS_SSBOS, st_bind_vs_ssbos) ST_STATE(ST_NEW_TCS_SSBOS, st_bind_tcs_ssbos) ST_STATE(ST_NEW_TES_SSBOS, st_bind_tes_ssbos) diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_rasterizer.c mesa-20.0.8/src/mesa/state_tracker/st_atom_rasterizer.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_rasterizer.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_rasterizer.c 2020-06-12 01:21:18.000000000 +0000 @@ -61,13 +61,11 @@ } } - void st_update_rasterizer(struct st_context *st) { struct gl_context *ctx = st->ctx; struct pipe_rasterizer_state *raster = &st->state.rasterizer; - const struct gl_program *vertProg = ctx->VertexProgram._Current; const struct gl_program *fragProg = ctx->FragmentProgram._Current; memset(raster, 0, sizeof(*raster)); @@ -96,13 +94,15 @@ /* _NEW_LIGHT */ - raster->flatshade = ctx->Light.ShadeModel == GL_FLAT; + raster->flatshade = !st->lower_flatshade && + ctx->Light.ShadeModel == GL_FLAT; raster->flatshade_first = ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION_EXT; /* _NEW_LIGHT | _NEW_PROGRAM */ - raster->light_twoside = _mesa_vertex_program_two_side_enabled(ctx); + if (!st->lower_two_sided_color) + raster->light_twoside = _mesa_vertex_program_two_side_enabled(ctx); /*_NEW_LIGHT | _NEW_BUFFERS */ raster->clamp_vertex_color = !st->clamp_vert_color_in_shader && @@ -198,34 +198,7 @@ /* ST_NEW_VERTEX_PROGRAM */ - if (vertProg) { - if (vertProg->Id == 0) { - if (vertProg->info.outputs_written & - BITFIELD64_BIT(VARYING_SLOT_PSIZ)) { - /* generated program which emits point size */ - raster->point_size_per_vertex = TRUE; - } - } - else if (ctx->API != API_OPENGLES2) { - /* PointSizeEnabled is always set in ES2 contexts */ - raster->point_size_per_vertex = ctx->VertexProgram.PointSizeEnabled; - } - else { - /* ST_NEW_TESSEVAL_PROGRAM | ST_NEW_GEOMETRY_PROGRAM */ - /* We have to check the last bound stage and see if it writes psize */ - struct gl_program *last = NULL; - if (ctx->GeometryProgram._Current) - last = ctx->GeometryProgram._Current; - else if (ctx->TessEvalProgram._Current) - last = ctx->TessEvalProgram._Current; - else if (ctx->VertexProgram._Current) - last = ctx->VertexProgram._Current; - if (last) - raster->point_size_per_vertex = - !!(last->info.outputs_written & - BITFIELD64_BIT(VARYING_SLOT_PSIZ)); - } - } + raster->point_size_per_vertex = st_point_size_per_vertex(ctx); if (!raster->point_size_per_vertex) { /* clamp size now */ raster->point_size = CLAMP(ctx->Point.Size, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_sampler.c mesa-20.0.8/src/mesa/state_tracker/st_atom_sampler.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_sampler.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_sampler.c 2020-06-12 01:21:18.000000000 +0000 @@ -50,7 +50,7 @@ #include "cso_cache/cso_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" /** @@ -304,11 +304,13 @@ st_get_texture_object(st->ctx, prog, unit); struct pipe_sampler_state *sampler = samplers + unit; - if (!stObj) + /* if resource format matches then YUV wasn't lowered */ + if (!stObj || st_get_view_format(stObj) == stObj->pt->format) continue; switch (st_get_view_format(stObj)) { case PIPE_FORMAT_NV12: + case PIPE_FORMAT_P010: case PIPE_FORMAT_P016: case PIPE_FORMAT_YUYV: case PIPE_FORMAT_UYVY: @@ -344,7 +346,9 @@ update_shader_samplers(st, PIPE_SHADER_VERTEX, - ctx->VertexProgram._Current, NULL, NULL); + ctx->VertexProgram._Current, + st->state.vert_samplers, + &st->state.num_vert_samplers); } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_shader.c mesa-20.0.8/src/mesa/state_tracker/st_atom_shader.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_shader.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_shader.c 2020-06-12 01:21:18.000000000 +0000 @@ -38,6 +38,7 @@ #include "main/imports.h" #include "main/mtypes.h" #include "main/framebuffer.h" +#include "main/state.h" #include "main/texobj.h" #include "main/texstate.h" #include "program/program.h" @@ -52,6 +53,7 @@ #include "st_atom.h" #include "st_program.h" #include "st_texture.h" +#include "st_util.h" static unsigned @@ -96,10 +98,10 @@ void st_update_fp( struct st_context *st ) { - struct st_fragment_program *stfp; + struct st_program *stfp; assert(st->ctx->FragmentProgram._Current); - stfp = st_fragment_program(st->ctx->FragmentProgram._Current); + stfp = st_program(st->ctx->FragmentProgram._Current); assert(stfp->Base.Target == GL_FRAGMENT_PROGRAM_ARB); void *shader; @@ -108,8 +110,8 @@ !stfp->ati_fs && /* ATI_fragment_shader always has multiple variants */ !stfp->Base.ExternalSamplersUsed && /* external samplers need variants */ stfp->variants && - !stfp->variants->key.drawpixels && - !stfp->variants->key.bitmap) { + !st_fp_variant(stfp->variants)->key.drawpixels && + !st_fp_variant(stfp->variants)->key.bitmap) { shader = stfp->variants->driver_shader; } else { struct st_fp_variant_key key; @@ -119,6 +121,18 @@ key.st = st->has_shareable_shaders ? NULL : st; + key.lower_flatshade = st->lower_flatshade && + st->ctx->Light.ShadeModel == GL_FLAT; + + /* _NEW_COLOR */ + key.lower_alpha_func = COMPARE_FUNC_NEVER; + if (st->lower_alpha_test && _mesa_is_alpha_test_enabled(st->ctx)) + key.lower_alpha_func = st->ctx->Color.AlphaFunc; + + /* _NEW_LIGHT | _NEW_PROGRAM */ + key.lower_two_sided_color = st->lower_two_sided_color && + _mesa_vertex_program_two_side_enabled(st->ctx); + /* _NEW_FRAG_CLAMP */ key.clamp_color = st->clamp_frag_color_in_shader && st->ctx->Color._ClampFragmentColor; @@ -146,10 +160,10 @@ key.external = st_get_external_sampler_key(st, &stfp->Base); - shader = st_get_fp_variant(st, stfp, &key)->driver_shader; + shader = st_get_fp_variant(st, stfp, &key)->base.driver_shader; } - st_reference_fragprog(st, &st->fp, stfp); + st_reference_prog(st, &st->fp, stfp); cso_set_fragment_shader_handle(st->cso_context, shader); } @@ -162,21 +176,22 @@ void st_update_vp( struct st_context *st ) { - struct st_vertex_program *stvp; + struct st_program *stvp; /* find active shader and params -- Should be covered by * ST_NEW_VERTEX_PROGRAM */ assert(st->ctx->VertexProgram._Current); - stvp = st_vertex_program(st->ctx->VertexProgram._Current); + stvp = st_program(st->ctx->VertexProgram._Current); assert(stvp->Base.Target == GL_VERTEX_PROGRAM_ARB); if (st->shader_has_one_variant[MESA_SHADER_VERTEX] && stvp->variants && - stvp->variants->key.passthrough_edgeflags == st->vertdata_edgeflags) { - st->vp_variant = stvp->variants; + st_common_variant(stvp->variants)->key.passthrough_edgeflags == st->vertdata_edgeflags && + !st_common_variant(stvp->variants)->key.is_draw_shader) { + st->vp_variant = st_common_variant(stvp->variants); } else { - struct st_vp_variant_key key; + struct st_common_variant_key key; memset(&key, 0, sizeof(key)); @@ -208,34 +223,42 @@ key.clip_negative_one_to_one = st->ctx->Transform.ClipDepthMode == GL_NEGATIVE_ONE_TO_ONE; + /* _NEW_POINT */ + key.lower_point_size = st->lower_point_size && + !st_point_size_per_vertex(st->ctx); + + /* _NEW_TRANSFORM */ + if (st->lower_ucp && st_user_clip_planes_enabled(st->ctx)) + key.lower_ucp = st->ctx->Transform.ClipPlanesEnabled; + st->vp_variant = st_get_vp_variant(st, stvp, &key); } - st_reference_vertprog(st, &st->vp, stvp); + st_reference_prog(st, &st->vp, stvp); cso_set_vertex_shader_handle(st->cso_context, - st->vp_variant->driver_shader); + st->vp_variant->base.driver_shader); } static void * st_update_common_program(struct st_context *st, struct gl_program *prog, - unsigned pipe_shader, struct st_common_program **dst) + unsigned pipe_shader, struct st_program **dst) { - struct st_common_program *stp; + struct st_program *stp; if (!prog) { st_reference_prog(st, dst, NULL); return NULL; } - stp = st_common_program(prog); + stp = st_program(prog); st_reference_prog(st, dst, stp); if (st->shader_has_one_variant[prog->info.stage] && stp->variants) return stp->variants->driver_shader; - struct st_basic_variant_key key; + struct st_common_variant_key key; /* use memset, not an initializer to be sure all memory is zeroed */ memset(&key, 0, sizeof(key)); @@ -264,7 +287,7 @@ } - return st_get_basic_variant(st, pipe_shader, stp, &key)->driver_shader; + return st_get_common_variant(st, stp, &key)->driver_shader; } @@ -299,29 +322,10 @@ void -st_update_cp( struct st_context *st ) +st_update_cp(struct st_context *st) { - struct st_compute_program *stcp; - - if (!st->ctx->ComputeProgram._Current) { - cso_set_compute_shader_handle(st->cso_context, NULL); - st_reference_compprog(st, &st->cp, NULL); - return; - } - - stcp = st_compute_program(st->ctx->ComputeProgram._Current); - assert(stcp->Base.Target == GL_COMPUTE_PROGRAM_NV); - - void *shader; - - if (st->shader_has_one_variant[MESA_SHADER_COMPUTE] && stcp->variants) { - shader = stcp->variants->driver_shader; - } else { - shader = st_get_cp_variant(st, &stcp->tgsi, - &stcp->variants)->driver_shader; - } - - st_reference_compprog(st, &st->cp, stcp); - + void *shader = st_update_common_program(st, + st->ctx->ComputeProgram._Current, + PIPE_SHADER_COMPUTE, &st->cp); cso_set_compute_shader_handle(st->cso_context, shader); } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_stipple.c mesa-20.0.8/src/mesa/state_tracker/st_atom_stipple.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_stipple.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_stipple.c 2020-06-12 01:21:18.000000000 +0000 @@ -74,7 +74,7 @@ memcpy(st->state.poly_stipple, ctx->PolygonStipple, sz); - if (_mesa_is_user_fbo(ctx->DrawBuffer)) { + if (!ctx->DrawBuffer->FlipY) { memcpy(newStipple.stipple, ctx->PolygonStipple, sizeof(newStipple.stipple)); } else { invert_stipple(newStipple.stipple, ctx->PolygonStipple, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_storagebuf.c mesa-20.0.8/src/mesa/state_tracker/st_atom_storagebuf.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_storagebuf.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_storagebuf.c 2020-06-12 01:21:18.000000000 +0000 @@ -46,15 +46,9 @@ { unsigned i; struct pipe_shader_buffer buffers[MAX_SHADER_STORAGE_BUFFERS]; - struct gl_program_constants *c; - int buffer_base; if (!prog || !st->pipe->set_shader_buffers) return; - c = &st->ctx->Const.Program[prog->info.stage]; - - buffer_base = st->has_hw_atomics ? 0 : c->MaxAtomicBuffers; - for (i = 0; i < prog->info.num_ssbos; i++) { struct gl_buffer_binding *binding; struct st_buffer_object *st_obj; @@ -81,16 +75,22 @@ sb->buffer_size = 0; } } - st->pipe->set_shader_buffers(st->pipe, shader_type, buffer_base, + st->pipe->set_shader_buffers(st->pipe, shader_type, 0, prog->info.num_ssbos, buffers, prog->sh.ShaderStorageBlocksWriteAccess); - /* clear out any stale shader buffers */ - if (prog->info.num_ssbos < c->MaxShaderStorageBlocks) + + /* Clear out any stale shader buffers (or lowered atomic counters). */ + int num_ssbos = prog->info.num_ssbos; + if (!st->has_hw_atomics) + num_ssbos += st->last_used_atomic_bindings[shader_type]; + if (st->last_num_ssbos[shader_type] > num_ssbos) { st->pipe->set_shader_buffers( st->pipe, shader_type, - buffer_base + prog->info.num_ssbos, - c->MaxShaderStorageBlocks - prog->info.num_ssbos, + num_ssbos, + st->last_num_ssbos[shader_type] - num_ssbos, NULL, 0); + st->last_num_ssbos[shader_type] = num_ssbos; + } } void st_bind_vs_ssbos(struct st_context *st) diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_atom_texture.c mesa-20.0.8/src/mesa/state_tracker/st_atom_texture.c --- mesa-19.2.8/src/mesa/state_tracker/st_atom_texture.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_atom_texture.c 2020-06-12 01:21:18.000000000 +0000 @@ -47,7 +47,7 @@ #include "st_format.h" #include "st_cb_texture.h" #include "pipe/p_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "cso_cache/cso_context.h" @@ -182,6 +182,10 @@ /* use original view as template: */ tmpl = *sampler_views[unit]; + /* if resource format matches then YUV wasn't lowered */ + if (st_get_view_format(stObj) == stObj->pt->format) + continue; + switch (st_get_view_format(stObj)) { case PIPE_FORMAT_NV12: /* we need one additional R8G8 view: */ @@ -191,6 +195,7 @@ sampler_views[extra] = st->pipe->create_sampler_view(st->pipe, stObj->pt->next, &tmpl); break; + case PIPE_FORMAT_P010: case PIPE_FORMAT_P016: /* we need one additional R16G16 view: */ tmpl.format = PIPE_FORMAT_RG1616_UNORM; @@ -262,8 +267,10 @@ const struct gl_context *ctx = st->ctx; if (ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits > 0) { - update_textures_local(st, PIPE_SHADER_VERTEX, - ctx->VertexProgram._Current); + update_textures(st, + PIPE_SHADER_VERTEX, + ctx->VertexProgram._Current, + st->state.vert_sampler_views); } } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap.c mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap.c 2020-06-12 01:21:18.000000000 +0000 @@ -212,7 +212,7 @@ cso_set_rasterizer(cso, &st->bitmap.rasterizer); /* fragment shader state: TEX lookup program */ - cso_set_fragment_shader_handle(cso, fpv->driver_shader); + cso_set_fragment_shader_handle(cso, fpv->base.driver_shader); /* vertex shader state: position + texcoord pass-through */ cso_set_vertex_shader_handle(cso, st->passthrough_vs); @@ -564,21 +564,16 @@ st->bitmap.rasterizer.depth_clip_far = 1; /* find a usable texture format */ - if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM, + if (screen->is_format_supported(screen, PIPE_FORMAT_R8_UNORM, st->internal_target, 0, 0, PIPE_BIND_SAMPLER_VIEW)) { - st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM; + st->bitmap.tex_format = PIPE_FORMAT_R8_UNORM; } else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM, st->internal_target, 0, 0, PIPE_BIND_SAMPLER_VIEW)) { st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM; } - else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM, - st->internal_target, 0, 0, - PIPE_BIND_SAMPLER_VIEW)) { - st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM; - } else { /* XXX support more formats */ assert(0); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap.h mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap.h --- mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap.h 2020-06-12 01:21:18.000000000 +0000 @@ -35,7 +35,7 @@ struct dd_function_table; struct st_context; struct gl_program; -struct st_fragment_program; +struct st_program; extern void st_init_bitmap_functions(struct dd_function_table *functions); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap_shader.c mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap_shader.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_bitmap_shader.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_bitmap_shader.c 2020-06-12 01:21:18.000000000 +0000 @@ -29,7 +29,6 @@ #include "st_cb_bitmap.h" #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_scan.h" -#include "tgsi/tgsi_dump.h" #include "util/u_debug.h" struct tgsi_bitmap_transform { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_blit.c mesa-20.0.8/src/mesa/state_tracker/st_cb_blit.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_blit.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_blit.c 2020-06-12 01:21:18.000000000 +0000 @@ -43,7 +43,7 @@ #include "st_scissor.h" #include "st_util.h" -#include "util/u_format.h" +#include "util/format/u_format.h" static void st_BlitFramebuffer(struct gl_context *ctx, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_bufferobjects.c mesa-20.0.8/src/mesa/state_tracker/st_cb_bufferobjects.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_bufferobjects.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_bufferobjects.c 2020-06-12 01:21:18.000000000 +0000 @@ -245,6 +245,11 @@ } } else { + /* These are often read by the CPU, so enable CPU caches. */ + if (target == GL_PIXEL_PACK_BUFFER || + target == GL_PIXEL_UNPACK_BUFFER) + return PIPE_USAGE_STAGING; + /* BufferData */ switch (usage) { case GL_DYNAMIC_DRAW: @@ -252,14 +257,7 @@ return PIPE_USAGE_DYNAMIC; case GL_STREAM_DRAW: case GL_STREAM_COPY: - /* XXX: Remove this test and fall-through when we have PBO unpacking - * acceleration. Right now, PBO unpacking is done by the CPU, so we - * have to make sure CPU reads are fast. - */ - if (target != GL_PIXEL_UNPACK_BUFFER_ARB) { - return PIPE_USAGE_STREAM; - } - /* fall through */ + return PIPE_USAGE_STREAM; case GL_STATIC_READ: case GL_DYNAMIC_READ: case GL_STREAM_READ: @@ -291,6 +289,15 @@ struct st_memory_object *st_mem_obj = st_memory_object(memObj); bool is_mapped = _mesa_bufferobj_mapped(obj, MAP_USER); + if (size > UINT32_MAX || offset > UINT32_MAX) { + /* pipe_resource.width0 is 32 bits only and increasing it + * to 64 bits doesn't make much sense since hw support + * for > 4GB resources is limited. + */ + st_obj->Base.Size = 0; + return GL_FALSE; + } + if (target != GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD && size && st_obj->buffer && st_obj->Base.Size == size && diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_clear.c mesa-20.0.8/src/mesa/state_tracker/st_cb_clear.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_clear.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_clear.c 2020-06-12 01:21:18.000000000 +0000 @@ -56,7 +56,7 @@ #include "pipe/p_shader_tokens.h" #include "pipe/p_state.h" #include "pipe/p_defines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_simple_shaders.h" @@ -85,19 +85,19 @@ st_destroy_clear(struct st_context *st) { if (st->clear.fs) { - cso_delete_fragment_shader(st->cso_context, st->clear.fs); + st->pipe->delete_fs_state(st->pipe, st->clear.fs); st->clear.fs = NULL; } if (st->clear.vs) { - cso_delete_vertex_shader(st->cso_context, st->clear.vs); + st->pipe->delete_vs_state(st->pipe, st->clear.vs); st->clear.vs = NULL; } if (st->clear.vs_layered) { - cso_delete_vertex_shader(st->cso_context, st->clear.vs_layered); + st->pipe->delete_vs_state(st->pipe, st->clear.vs_layered); st->clear.vs_layered = NULL; } if (st->clear.gs_layered) { - cso_delete_geometry_shader(st->cso_context, st->clear.gs_layered); + st->pipe->delete_gs_state(st->pipe, st->clear.gs_layered); st->clear.gs_layered = NULL; } } @@ -267,7 +267,7 @@ CSO_BIT_STREAM_OUTPUTS | CSO_BIT_VERTEX_ELEMENTS | CSO_BIT_AUX_VERTEX_BUFFER_SLOT | - CSO_BIT_PAUSE_QUERIES | + (st->active_queries ? CSO_BIT_PAUSE_QUERIES : 0) | CSO_BITS_ALL_SHADERS)); /* blend state: RGBA masking */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_copyimage.c mesa-20.0.8/src/mesa/state_tracker/st_cb_copyimage.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_copyimage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_copyimage.c 2020-06-12 01:21:18.000000000 +0000 @@ -30,7 +30,7 @@ #include "state_tracker/st_util.h" #include "util/u_box.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_drawpixels.c mesa-20.0.8/src/mesa/state_tracker/st_cb_drawpixels.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_drawpixels.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_drawpixels.c 2020-06-12 01:21:18.000000000 +0000 @@ -71,7 +71,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "tgsi/tgsi_ureg.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_simple_shaders.h" @@ -111,10 +111,11 @@ static nir_ssa_def * sample_via_nir(nir_builder *b, nir_variable *texcoord, - const char *name, int sampler) + const char *name, int sampler, enum glsl_base_type base_type, + nir_alu_type alu_type) { const struct glsl_type *sampler2D = - glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, false, GLSL_TYPE_FLOAT); + glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, false, base_type); nir_variable *var = nir_variable_create(b->shader, nir_var_uniform, sampler2D, name); @@ -127,7 +128,7 @@ tex->op = nir_texop_tex; tex->sampler_dim = GLSL_SAMPLER_DIM_2D; tex->coord_components = 2; - tex->dest_type = nir_type_float; + tex->dest_type = alu_type; tex->src[0].src_type = nir_tex_src_texture_deref; tex->src[0].src = nir_src_for_ssa(&deref->dest.ssa); tex->src[1].src_type = nir_tex_src_sampler_deref; @@ -163,7 +164,8 @@ nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(), "gl_FragDepth"); out->data.location = FRAG_RESULT_DEPTH; - nir_ssa_def *depth = sample_via_nir(&b, texcoord, "depth", 0); + nir_ssa_def *depth = sample_via_nir(&b, texcoord, "depth", 0, + GLSL_TYPE_FLOAT, nir_type_float); nir_store_var(&b, out, depth, 0x1); /* Also copy color */ @@ -184,7 +186,8 @@ nir_variable_create(b.shader, nir_var_shader_out, glsl_uint_type(), "gl_FragStencilRefARB"); out->data.location = FRAG_RESULT_STENCIL; - nir_ssa_def *stencil = sample_via_nir(&b, texcoord, "stencil", 1); + nir_ssa_def *stencil = sample_via_nir(&b, texcoord, "stencil", 1, + GLSL_TYPE_UINT, nir_type_uint); nir_store_var(&b, out, stencil, 0x1); } @@ -642,7 +645,8 @@ pipeFormat = st_choose_format(st, intFormat, format, type, st->internal_target, 0, 0, - PIPE_BIND_SAMPLER_VIEW, FALSE); + PIPE_BIND_SAMPLER_VIEW, + false, false); assert(pipeFormat != PIPE_FORMAT_NONE); } @@ -721,12 +725,12 @@ ctx->_ImageTransferState = imageTransferStateSave; } - _mesa_unmap_pbo_source(ctx, unpack); - #if USE_DRAWPIXELS_CACHE cache_drawpixels_image(st, width, height, format, type, unpack, pixels, pt); #endif + _mesa_unmap_pbo_source(ctx, unpack); + return pt; } @@ -1362,7 +1366,7 @@ fpv = (format != GL_COLOR_INDEX) ? get_color_fp_variant(st) : get_color_index_fp_variant(st); - driver_fp = fpv->driver_shader; + driver_fp = fpv->base.driver_shader; if (ctx->Pixel.MapColorFlag && format != GL_COLOR_INDEX) { pipe_sampler_view_reference(&sv[1], @@ -1740,7 +1744,7 @@ rbRead = st_get_color_read_renderbuffer(ctx); - driver_fp = fpv->driver_shader; + driver_fp = fpv->base.driver_shader; if (ctx->Pixel.MapColorFlag) { pipe_sampler_view_reference(&sv[1], @@ -1772,7 +1776,7 @@ if (type == GL_DEPTH) { srcFormat = st_choose_format(st, GL_DEPTH_COMPONENT, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } else { assert(type == GL_COLOR); @@ -1780,27 +1784,27 @@ if (util_format_is_float(srcFormat)) { srcFormat = st_choose_format(st, GL_RGBA32F, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } else if (util_format_is_pure_sint(srcFormat)) { srcFormat = st_choose_format(st, GL_RGBA32I, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } else if (util_format_is_pure_uint(srcFormat)) { srcFormat = st_choose_format(st, GL_RGBA32UI, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } else if (util_format_is_snorm(srcFormat)) { srcFormat = st_choose_format(st, GL_RGBA16_SNORM, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } else { srcFormat = st_choose_format(st, GL_RGBA, GL_NONE, GL_NONE, st->internal_target, 0, 0, - srcBind, FALSE); + srcBind, false, false); } } @@ -1911,12 +1915,11 @@ for (i = 0; i < ARRAY_SIZE(st->drawpix.zs_shaders); i++) { if (st->drawpix.zs_shaders[i]) - cso_delete_fragment_shader(st->cso_context, - st->drawpix.zs_shaders[i]); + st->pipe->delete_fs_state(st->pipe, st->drawpix.zs_shaders[i]); } if (st->passthrough_vs) - cso_delete_vertex_shader(st->cso_context, st->passthrough_vs); + st->pipe->delete_vs_state(st->pipe, st->passthrough_vs); /* Free cache data */ for (i = 0; i < ARRAY_SIZE(st->drawpix_cache.entries); i++) { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_drawtex.c mesa-20.0.8/src/mesa/state_tracker/st_cb_drawtex.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_drawtex.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_drawtex.c 2020-06-12 01:21:18.000000000 +0000 @@ -360,7 +360,7 @@ { GLuint i; for (i = 0; i < NumCachedShaders; i++) { - cso_delete_vertex_shader(st->cso_context, CachedShaders[i].handle); + st->pipe->delete_vs_state(st->pipe, CachedShaders[i].handle); } NumCachedShaders = 0; } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_eglimage.c mesa-20.0.8/src/mesa/state_tracker/st_cb_eglimage.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_eglimage.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_eglimage.c 2020-06-12 01:21:18.000000000 +0000 @@ -29,7 +29,7 @@ #include "main/texobj.h" #include "main/teximage.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "st_cb_eglimage.h" #include "st_cb_fbo.h" #include "st_context.h" @@ -42,11 +42,12 @@ static bool is_format_supported(struct pipe_screen *screen, enum pipe_format format, unsigned nr_samples, unsigned nr_storage_samples, - unsigned usage) + unsigned usage, bool *native_supported) { bool supported = screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, nr_samples, nr_storage_samples, usage); + *native_supported = supported; /* for sampling, some formats can be emulated.. it doesn't matter that * the surface will have a format that the driver can't cope with because @@ -115,7 +116,8 @@ */ static bool st_get_egl_image(struct gl_context *ctx, GLeglImageOES image_handle, - unsigned usage, const char *error, struct st_egl_image *out) + unsigned usage, const char *error, struct st_egl_image *out, + bool *native_supported) { struct st_context *st = st_context(ctx); struct pipe_screen *screen = st->pipe->screen; @@ -133,7 +135,8 @@ } if (!is_format_supported(screen, out->format, out->texture->nr_samples, - out->texture->nr_storage_samples, usage)) { + out->texture->nr_storage_samples, usage, + native_supported)) { /* unable to specify a texture object using the specified EGL image */ pipe_resource_reference(&out->texture, NULL); _mesa_error(ctx, GL_INVALID_OPERATION, "%s(format not supported)", error); @@ -180,10 +183,11 @@ { struct st_renderbuffer *strb = st_renderbuffer(rb); struct st_egl_image stimg; + bool native_supported; if (st_get_egl_image(ctx, image_handle, PIPE_BIND_RENDER_TARGET, "glEGLImageTargetRenderbufferStorage", - &stimg)) { + &stimg, &native_supported)) { struct pipe_context *pipe = st_context(ctx)->pipe; struct pipe_surface *ps, surf_tmpl; @@ -211,7 +215,9 @@ st_bind_egl_image(struct gl_context *ctx, struct gl_texture_object *texObj, struct gl_texture_image *texImage, - struct st_egl_image *stimg) + struct st_egl_image *stimg, + bool tex_storage, + bool native_supported) { struct st_context *st = st_context(ctx); struct st_texture_object *stObj; @@ -238,41 +244,60 @@ /* TODO RequiredTextureImageUnits should probably be reset back * to 1 somewhere if different texture is bound?? */ - switch (stimg->format) { - case PIPE_FORMAT_NV12: - texFormat = MESA_FORMAT_R_UNORM8; - texObj->RequiredTextureImageUnits = 2; - break; - case PIPE_FORMAT_P016: - texFormat = MESA_FORMAT_R_UNORM16; - texObj->RequiredTextureImageUnits = 2; - break; - case PIPE_FORMAT_IYUV: - texFormat = MESA_FORMAT_R_UNORM8; - texObj->RequiredTextureImageUnits = 3; - break; - case PIPE_FORMAT_YUYV: - case PIPE_FORMAT_UYVY: - texFormat = MESA_FORMAT_R8G8_UNORM; - texObj->RequiredTextureImageUnits = 2; - break; - case PIPE_FORMAT_AYUV: - texFormat = MESA_FORMAT_R8G8B8A8_UNORM; - internalFormat = GL_RGBA; - texObj->RequiredTextureImageUnits = 1; - break; - case PIPE_FORMAT_XYUV: - texFormat = MESA_FORMAT_R8G8B8X8_UNORM; - texObj->RequiredTextureImageUnits = 1; - break; - default: + if (!native_supported) { + switch (stimg->format) { + case PIPE_FORMAT_NV12: + texFormat = MESA_FORMAT_R_UNORM8; + texObj->RequiredTextureImageUnits = 2; + break; + case PIPE_FORMAT_P016: + texFormat = MESA_FORMAT_R_UNORM16; + texObj->RequiredTextureImageUnits = 2; + break; + case PIPE_FORMAT_IYUV: + texFormat = MESA_FORMAT_R_UNORM8; + texObj->RequiredTextureImageUnits = 3; + break; + case PIPE_FORMAT_YUYV: + case PIPE_FORMAT_UYVY: + texFormat = MESA_FORMAT_RG_UNORM8; + texObj->RequiredTextureImageUnits = 2; + break; + case PIPE_FORMAT_AYUV: + texFormat = MESA_FORMAT_R8G8B8A8_UNORM; + internalFormat = GL_RGBA; + texObj->RequiredTextureImageUnits = 1; + break; + case PIPE_FORMAT_XYUV: + texFormat = MESA_FORMAT_R8G8B8X8_UNORM; + texObj->RequiredTextureImageUnits = 1; + break; + default: + unreachable("unexpected emulated format"); + break; + } + } else { texFormat = st_pipe_format_to_mesa_format(stimg->format); - break; + /* Use previously derived internalformat as specified by + * EXT_EGL_image_storage. + */ + if (tex_storage && texObj->Target == GL_TEXTURE_2D + && stimg->internalformat) { + internalFormat = stimg->internalformat; + if (internalFormat == GL_NONE) { + _mesa_error(ctx, GL_INVALID_OPERATION, __func__); + return; + } + } } assert(texFormat != MESA_FORMAT_NONE); - _mesa_init_teximage_fields(ctx, texImage, - stimg->texture->width0, stimg->texture->height0, + + /* Minify texture size based on level set on the EGLImage. */ + uint32_t width = u_minify(stimg->texture->width0, stimg->level); + uint32_t height = u_minify(stimg->texture->height0, stimg->level); + + _mesa_init_teximage_fields(ctx, texImage, width, height, 1, 0, internalFormat, texFormat); pipe_resource_reference(&stObj->pt, stimg->texture); @@ -295,12 +320,32 @@ GLeglImageOES image_handle) { struct st_egl_image stimg; + bool native_supported; + + if (!st_get_egl_image(ctx, image_handle, PIPE_BIND_SAMPLER_VIEW, + "glEGLImageTargetTexture2D", &stimg, + &native_supported)) + return; + + st_bind_egl_image(ctx, texObj, texImage, &stimg, false, native_supported); + pipe_resource_reference(&stimg.texture, NULL); +} + +static void +st_egl_image_target_tex_storage(struct gl_context *ctx, GLenum target, + struct gl_texture_object *texObj, + struct gl_texture_image *texImage, + GLeglImageOES image_handle) +{ + struct st_egl_image stimg; + bool native_supported; if (!st_get_egl_image(ctx, image_handle, PIPE_BIND_SAMPLER_VIEW, - "glEGLImageTargetTexture2D", &stimg)) + "glEGLImageTargetTexture2D", &stimg, + &native_supported)) return; - st_bind_egl_image(ctx, texObj, texImage, &stimg); + st_bind_egl_image(ctx, texObj, texImage, &stimg, true, native_supported); pipe_resource_reference(&stimg.texture, NULL); } @@ -308,5 +353,6 @@ st_init_eglimage_functions(struct dd_function_table *functions) { functions->EGLImageTargetTexture2D = st_egl_image_target_texture_2d; + functions->EGLImageTargetTexStorage = st_egl_image_target_tex_storage; functions->EGLImageTargetRenderbufferStorage = st_egl_image_target_renderbuffer_storage; } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_fbo.c mesa-20.0.8/src/mesa/state_tracker/st_cb_fbo.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_fbo.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_fbo.c 2020-06-12 01:21:18.000000000 +0000 @@ -56,7 +56,7 @@ #include "st_util.h" #include "st_manager.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" @@ -355,6 +355,7 @@ case PIPE_FORMAT_R8G8B8X8_UNORM: case PIPE_FORMAT_B8G8R8X8_UNORM: case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: strb->Base.InternalFormat = GL_RGB8; break; case PIPE_FORMAT_R8G8B8A8_SRGB: @@ -400,6 +401,9 @@ case PIPE_FORMAT_R16G16B16A16_UNORM: strb->Base.InternalFormat = GL_RGBA16; break; + case PIPE_FORMAT_R16G16B16_UNORM: + strb->Base.InternalFormat = GL_RGB16; + break; case PIPE_FORMAT_R8_UNORM: strb->Base.InternalFormat = GL_R8; break; @@ -416,6 +420,7 @@ strb->Base.InternalFormat = GL_RGBA32F; break; case PIPE_FORMAT_R32G32B32X32_FLOAT: + case PIPE_FORMAT_R32G32B32_FLOAT: strb->Base.InternalFormat = GL_RGB32F; break; case PIPE_FORMAT_R16G16B16A16_FLOAT: @@ -778,7 +783,7 @@ struct st_context *st = st_context(ctx); struct pipe_resource *prsc; - if (!att->Renderbuffer) + if (!att->Renderbuffer || !att->Complete) return; prsc = st_renderbuffer(att->Renderbuffer)->surface->texture; @@ -863,13 +868,10 @@ struct st_context *st = st_context(ctx); struct st_renderbuffer *strb = st_renderbuffer(rb); struct pipe_context *pipe = st->pipe; - const GLboolean invert = rb->Name == 0; + const GLboolean invert = flip_y; GLuint y2; GLubyte *map; - /* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */ - assert((rb->Name == 0) == flip_y); - if (strb->software) { /* software-allocated renderbuffer (probably an accum buffer) */ if (strb->data) { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_feedback.c mesa-20.0.8/src/mesa/state_tracker/st_cb_feedback.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_feedback.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_feedback.c 2020-06-12 01:21:18.000000000 +0000 @@ -84,6 +84,7 @@ const struct vertex_header *v) { const struct st_context *st = st_context(ctx); + struct st_vertex_program *stvp = (struct st_vertex_program *)st->vp; GLfloat win[4]; const GLfloat *color, *texcoord; GLuint slot; @@ -101,13 +102,13 @@ * color and texcoord attribs to use here. */ - slot = st->vp->result_to_output[VARYING_SLOT_COL0]; + slot = stvp->result_to_output[VARYING_SLOT_COL0]; if (slot != ~0U) color = v->data[slot]; else color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; - slot = st->vp->result_to_output[VARYING_SLOT_TEX0]; + slot = stvp->result_to_output[VARYING_SLOT_TEX0]; if (slot != ~0U) texcoord = v->data[slot]; else @@ -303,7 +304,7 @@ ctx->Driver.Draw = st_feedback_draw_vbo; /* need to generate/use a vertex program that emits pos/color/tex */ if (vp) - st->dirty |= ST_NEW_VERTEX_PROGRAM(st, st_vertex_program(vp)); + st->dirty |= ST_NEW_VERTEX_PROGRAM(st, st_program(vp)); } } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_flush.c mesa-20.0.8/src/mesa/state_tracker/st_cb_flush.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_flush.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_flush.c 2020-06-12 01:21:18.000000000 +0000 @@ -136,6 +136,18 @@ } +static void +st_device_reset_callback(void *data, enum pipe_reset_status status) +{ + struct st_context *st = data; + + assert(status != PIPE_NO_RESET); + + st->reset_status = status; + _mesa_set_context_lost_dispatch(st->ctx); +} + + /** * Query information about GPU resets observed by this context * @@ -152,24 +164,14 @@ st->reset_status = PIPE_NO_RESET; } else { status = st->pipe->get_device_reset_status(st->pipe); + if (status != PIPE_NO_RESET) + st_device_reset_callback(st, status); } return gl_reset_status_from_pipe_reset_status(status); } -static void -st_device_reset_callback(void *data, enum pipe_reset_status status) -{ - struct st_context *st = data; - - assert(status != PIPE_NO_RESET); - - st->reset_status = status; - _mesa_set_context_lost_dispatch(st->ctx); -} - - void st_install_device_reset_callback(struct st_context *st) { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_memoryobjects.c mesa-20.0.8/src/mesa/state_tracker/st_cb_memoryobjects.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_memoryobjects.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_memoryobjects.c 2020-06-12 01:21:18.000000000 +0000 @@ -35,6 +35,10 @@ #include "pipe/p_context.h" #include "pipe/p_screen.h" +#ifdef HAVE_LIBDRM +#include "drm-uapi/drm_fourcc.h" +#endif + static struct gl_memory_object * st_memoryobj_alloc(struct gl_context *ctx, GLuint name) { @@ -64,13 +68,13 @@ struct st_context *st = st_context(ctx); struct pipe_context *pipe = st->pipe; struct pipe_screen *screen = pipe->screen; - struct winsys_handle whandle; - - whandle.type = WINSYS_HANDLE_TYPE_FD; - whandle.handle = fd; - whandle.offset = 0; - whandle.layer = 0; - whandle.stride = 0; + struct winsys_handle whandle = { + .type = WINSYS_HANDLE_TYPE_FD, + .handle = fd, +#ifdef HAVE_LIBDRM + .modifier = DRM_FORMAT_MOD_INVALID, +#endif + }; st_obj->memory = screen->memobj_create_from_handle(screen, &whandle, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_perfmon.c mesa-20.0.8/src/mesa/state_tracker/st_cb_perfmon.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_perfmon.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_perfmon.c 2020-06-12 01:21:18.000000000 +0000 @@ -88,9 +88,8 @@ for (gid = 0; gid < ctx->PerfMonitor.NumGroups; gid++) { const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[gid]; const struct st_perf_monitor_group *stg = &st->perfmon[gid]; - BITSET_WORD tmp; - BITSET_FOREACH_SET(cid, tmp, m->ActiveCounters[gid], g->NumCounters) { + BITSET_FOREACH_SET(cid, m->ActiveCounters[gid], g->NumCounters) { const struct st_perf_monitor_counter *stc = &stg->counters[cid]; struct st_perf_counter_object *cntr = &stm->active_counters[stm->num_active_counters]; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_perfquery.c mesa-20.0.8/src/mesa/state_tracker/st_cb_perfquery.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_perfquery.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_perfquery.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,232 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * Intel Performance query interface to gallium. + */ + +#include "st_debug.h" +#include "st_context.h" +#include "st_cb_bitmap.h" +#include "st_cb_perfquery.h" +#include "st_util.h" + +#include "util/bitset.h" + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_memory.h" + +bool +st_have_perfquery(struct st_context *st) +{ + struct pipe_context *pipe = st->pipe; + + return pipe->init_intel_perf_query_info && pipe->get_intel_perf_query_info && + pipe->get_intel_perf_query_counter_info && + pipe->new_intel_perf_query_obj && pipe->begin_intel_perf_query && + pipe->end_intel_perf_query && pipe->delete_intel_perf_query && + pipe->wait_intel_perf_query && pipe->is_intel_perf_query_ready && + pipe->get_intel_perf_query_data; +} + +static unsigned +st_InitPerfQueryInfo(struct gl_context *ctx) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + return pipe->init_intel_perf_query_info(pipe); +} + +static void +st_GetPerfQueryInfo(struct gl_context *ctx, + unsigned query_index, + const char **name, + GLuint *data_size, + GLuint *n_counters, + GLuint *n_active) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + pipe->get_intel_perf_query_info(pipe, query_index, name, data_size, + n_counters, n_active); +} + +static uint32_t +pipe_counter_type_enum_to_gl_type(enum pipe_perf_counter_type type) +{ + switch (type) { + case PIPE_PERF_COUNTER_TYPE_EVENT: return GL_PERFQUERY_COUNTER_EVENT_INTEL; + case PIPE_PERF_COUNTER_TYPE_DURATION_NORM: return GL_PERFQUERY_COUNTER_DURATION_NORM_INTEL; + case PIPE_PERF_COUNTER_TYPE_DURATION_RAW: return GL_PERFQUERY_COUNTER_DURATION_RAW_INTEL; + case PIPE_PERF_COUNTER_TYPE_THROUGHPUT: return GL_PERFQUERY_COUNTER_THROUGHPUT_INTEL; + case PIPE_PERF_COUNTER_TYPE_RAW: return GL_PERFQUERY_COUNTER_RAW_INTEL; + case PIPE_PERF_COUNTER_TYPE_TIMESTAMP: return GL_PERFQUERY_COUNTER_TIMESTAMP_INTEL; + default: + unreachable("Unknown counter type"); + } +} + +static uint32_t +pipe_counter_data_type_to_gl_type(enum pipe_perf_counter_data_type type) +{ + switch (type) { + case PIPE_PERF_COUNTER_DATA_TYPE_BOOL32: return GL_PERFQUERY_COUNTER_DATA_BOOL32_INTEL; + case PIPE_PERF_COUNTER_DATA_TYPE_UINT32: return GL_PERFQUERY_COUNTER_DATA_UINT32_INTEL; + case PIPE_PERF_COUNTER_DATA_TYPE_UINT64: return GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL; + case PIPE_PERF_COUNTER_DATA_TYPE_FLOAT: return GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL; + case PIPE_PERF_COUNTER_DATA_TYPE_DOUBLE: return GL_PERFQUERY_COUNTER_DATA_DOUBLE_INTEL; + default: + unreachable("Unknown counter data type"); + } +} + +static void +st_GetPerfCounterInfo(struct gl_context *ctx, + unsigned query_index, + unsigned counter_index, + const char **name, + const char **desc, + GLuint *offset, + GLuint *data_size, + GLuint *type_enum, + GLuint *data_type_enum, + GLuint64 *raw_max) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + uint32_t pipe_type_enum; + uint32_t pipe_data_type_enum; + + pipe->get_intel_perf_query_counter_info(pipe, query_index, counter_index, + name, desc, offset, data_size, + &pipe_type_enum, &pipe_data_type_enum, raw_max); + *type_enum = pipe_counter_type_enum_to_gl_type(pipe_type_enum); + *data_type_enum = pipe_counter_data_type_to_gl_type(pipe_data_type_enum); +} + +static void +st_DeletePerfQuery(struct gl_context *ctx, struct gl_perf_query_object *o) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + /* We can assume that the frontend waits for a query to complete + * before ever calling into here, so we don't have to worry about + * deleting an in-flight query object. + */ + assert(!o->Active); + assert(!o->Used || o->Ready); + + pipe->delete_intel_perf_query(pipe, (struct pipe_query *)o); +} + +static bool +st_BeginPerfQuery(struct gl_context *ctx, struct gl_perf_query_object *o) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + /* We can assume the frontend hides mistaken attempts to Begin a + * query object multiple times before its End. Similarly if an + * application reuses a query object before results have arrived + * the frontend will wait for prior results so we don't need + * to support abandoning in-flight results. + */ + assert(!o->Active); + assert(!o->Used || o->Ready); /* no in-flight query to worry about */ + + pipe->begin_intel_perf_query(pipe, (struct pipe_query *)o); + + return true; +} + +static void +st_EndPerfQuery(struct gl_context *ctx, struct gl_perf_query_object *o) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + pipe->end_intel_perf_query(pipe, (struct pipe_query *)o); +} + +static void +st_WaitPerfQuery(struct gl_context *ctx, struct gl_perf_query_object *o) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + assert(!o->Ready); + + pipe->wait_intel_perf_query(pipe, (struct pipe_query *)o); +} + +static bool +st_IsPerfQueryReady(struct gl_context *ctx, struct gl_perf_query_object *o) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + if (o->Ready) + return true; + + return pipe->is_intel_perf_query_ready(pipe, (struct pipe_query *)o); +} + +static void +st_GetPerfQueryData(struct gl_context *ctx, + struct gl_perf_query_object *o, + GLsizei data_size, + GLuint *data, + GLuint *bytes_written) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + + assert(st_IsPerfQueryReady(ctx, o)); + + /* We expect that the frontend only calls this hook when it knows + * that results are available. + */ + assert(o->Ready); + + pipe->get_intel_perf_query_data(pipe, (struct pipe_query *)o, data_size, data, + bytes_written); +} + +static struct gl_perf_query_object * +st_NewPerfQueryObject(struct gl_context *ctx, unsigned query_index) +{ + struct pipe_context *pipe = st_context(ctx)->pipe; + struct pipe_query *q; + + q = pipe->new_intel_perf_query_obj(pipe, query_index); + + return (struct gl_perf_query_object *)q; +} + +void st_init_perfquery_functions(struct dd_function_table *functions) +{ + functions->InitPerfQueryInfo = st_InitPerfQueryInfo; + functions->GetPerfQueryInfo = st_GetPerfQueryInfo; + functions->GetPerfCounterInfo = st_GetPerfCounterInfo; + functions->NewPerfQueryObject = st_NewPerfQueryObject; + functions->DeletePerfQuery = st_DeletePerfQuery; + functions->BeginPerfQuery = st_BeginPerfQuery; + functions->EndPerfQuery = st_EndPerfQuery; + functions->WaitPerfQuery = st_WaitPerfQuery; + functions->IsPerfQueryReady = st_IsPerfQueryReady; + functions->GetPerfQueryData = st_GetPerfQueryData; +} diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_perfquery.h mesa-20.0.8/src/mesa/state_tracker/st_cb_perfquery.h --- mesa-19.2.8/src/mesa/state_tracker/st_cb_perfquery.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_perfquery.h 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,32 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef ST_CB_PERFQUERY_H +#define ST_CB_PERFQUERY_H + +bool +st_have_perfquery(struct st_context *st); + +extern void +st_init_perfquery_functions(struct dd_function_table *functions); + +#endif diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_program.c mesa-20.0.8/src/mesa/state_tracker/st_cb_program.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_program.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_program.c 2020-06-12 01:21:18.000000000 +0000 @@ -60,25 +60,15 @@ { switch (target) { case GL_VERTEX_PROGRAM_ARB: { - struct st_vertex_program *prog = rzalloc(NULL, - struct st_vertex_program); - return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm); - } - case GL_FRAGMENT_PROGRAM_ARB: { - struct st_fragment_program *prog = rzalloc(NULL, - struct st_fragment_program); - return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm); + struct st_vertex_program *prog = rzalloc(NULL, struct st_vertex_program); + return _mesa_init_gl_program(&prog->Base.Base, target, id, is_arb_asm); } case GL_TESS_CONTROL_PROGRAM_NV: case GL_TESS_EVALUATION_PROGRAM_NV: - case GL_GEOMETRY_PROGRAM_NV: { - struct st_common_program *prog = rzalloc(NULL, - struct st_common_program); - return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm); - } + case GL_GEOMETRY_PROGRAM_NV: + case GL_FRAGMENT_PROGRAM_ARB: case GL_COMPUTE_PROGRAM_NV: { - struct st_compute_program *prog = rzalloc(NULL, - struct st_compute_program); + struct st_program *prog = rzalloc(NULL, struct st_program); return _mesa_init_gl_program(&prog->Base, target, id, is_arb_asm); } default: @@ -95,61 +85,17 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog) { struct st_context *st = st_context(ctx); + struct st_program *stp = st_program(prog); - switch( prog->Target ) { - case GL_VERTEX_PROGRAM_ARB: - { - struct st_vertex_program *stvp = (struct st_vertex_program *) prog; - st_release_vp_variants( st, stvp ); - - if (stvp->glsl_to_tgsi) - free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi); - } - break; - case GL_TESS_CONTROL_PROGRAM_NV: - case GL_TESS_EVALUATION_PROGRAM_NV: - case GL_GEOMETRY_PROGRAM_NV: - { - struct st_common_program *p = st_common_program(prog); - - st_release_basic_variants(st, p->Base.Target, &p->variants, - &p->tgsi); - - if (p->glsl_to_tgsi) - free_glsl_to_tgsi_visitor(p->glsl_to_tgsi); - } - break; - case GL_FRAGMENT_PROGRAM_ARB: - { - struct st_fragment_program *stfp = - (struct st_fragment_program *) prog; - - st_release_fp_variants(st, stfp); - - if (stfp->glsl_to_tgsi) - free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi); - } - break; - case GL_COMPUTE_PROGRAM_NV: - { - struct st_compute_program *stcp = - (struct st_compute_program *) prog; + st_release_variants(st, stp); - st_release_cp_variants(st, stcp); - - if (stcp->glsl_to_tgsi) - free_glsl_to_tgsi_visitor(stcp->glsl_to_tgsi); - } - break; - default: - assert(0); /* problem */ - } + if (stp->glsl_to_tgsi) + free_glsl_to_tgsi_visitor(stp->glsl_to_tgsi); /* delete base class */ _mesa_delete_program( ctx, prog ); } - /** * Called via ctx->Driver.ProgramStringNotify() * Called when the program's text/code is changed. We have to free @@ -161,95 +107,33 @@ struct gl_program *prog ) { struct st_context *st = st_context(ctx); - gl_shader_stage stage = _mesa_program_enum_to_shader_stage(target); + struct st_program *stp = (struct st_program *) prog; - if (target == GL_FRAGMENT_PROGRAM_ARB) { - struct st_fragment_program *stfp = (struct st_fragment_program *) prog; + /* GLSL-to-NIR should not end up here. */ + assert(!stp->shader_program); - st_release_fp_variants(st, stfp); - if (!st_translate_fragment_program(st, stfp)) - return false; + st_release_variants(st, stp); - if (st->fp == stfp) - st->dirty |= stfp->affected_states; - } - else if (target == GL_GEOMETRY_PROGRAM_NV) { - struct st_common_program *stgp = st_common_program(prog); + if (target == GL_FRAGMENT_PROGRAM_ARB || + target == GL_FRAGMENT_SHADER_ATI) { + if (target == GL_FRAGMENT_SHADER_ATI) { + assert(stp->ati_fs); + assert(stp->ati_fs->Program == prog); - st_release_basic_variants(st, stgp->Base.Target, &stgp->variants, - &stgp->tgsi); - if (!st_translate_geometry_program(st, stgp)) - return false; - - if (st->gp == stgp) - st->dirty |= stgp->affected_states; - } - else if (target == GL_VERTEX_PROGRAM_ARB) { - struct st_vertex_program *stvp = (struct st_vertex_program *) prog; + st_init_atifs_prog(ctx, prog); + } - st_release_vp_variants(st, stvp); - if (!st_translate_vertex_program(st, stvp)) + if (!st_translate_fragment_program(st, stp)) return false; - - if (st->vp == stvp) - st->dirty |= ST_NEW_VERTEX_PROGRAM(st, stvp); - } - else if (target == GL_TESS_CONTROL_PROGRAM_NV) { - struct st_common_program *sttcp = - st_common_program(prog); - - st_release_basic_variants(st, sttcp->Base.Target, &sttcp->variants, - &sttcp->tgsi); - if (!st_translate_tessctrl_program(st, sttcp)) + } else if (target == GL_VERTEX_PROGRAM_ARB) { + if (!st_translate_vertex_program(st, stp)) return false; - - if (st->tcp == sttcp) - st->dirty |= sttcp->affected_states; - } - else if (target == GL_TESS_EVALUATION_PROGRAM_NV) { - struct st_common_program *sttep = - st_common_program(prog); - - st_release_basic_variants(st, sttep->Base.Target, &sttep->variants, - &sttep->tgsi); - if (!st_translate_tesseval_program(st, sttep)) + } else { + if (!st_translate_common_program(st, stp)) return false; - - if (st->tep == sttep) - st->dirty |= sttep->affected_states; } - else if (target == GL_COMPUTE_PROGRAM_NV) { - struct st_compute_program *stcp = - (struct st_compute_program *) prog; - - st_release_cp_variants(st, stcp); - if (!st_translate_compute_program(st, stcp)) - return false; - - if (st->cp == stcp) - st->dirty |= stcp->affected_states; - } - else if (target == GL_FRAGMENT_SHADER_ATI) { - assert(prog); - - struct st_fragment_program *stfp = (struct st_fragment_program *) prog; - assert(stfp->ati_fs); - assert(stfp->ati_fs->Program == prog); - - st_init_atifs_prog(ctx, prog); - - st_release_fp_variants(st, stfp); - if (!st_translate_fragment_program(st, stfp)) - return false; - - if (st->fp == stfp) - st->dirty |= stfp->affected_states; - } - - if (ST_DEBUG & DEBUG_PRECOMPILE || - st->shader_has_one_variant[stage]) - st_precompile_shader_variant(st, prog); + st_finalize_program(st, prog); return GL_TRUE; } @@ -262,7 +146,7 @@ { struct gl_program *prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, curProg->Id, true); - struct st_fragment_program *stfp = (struct st_fragment_program *)prog; + struct st_program *stfp = (struct st_program *)prog; stfp->ati_fs = curProg; return prog; } @@ -292,26 +176,8 @@ if (!linked || !linked->Program) continue; - switch (i) { - case MESA_SHADER_VERTEX: - if (st_vertex_program(linked->Program)->variants) - sh = st_vertex_program(linked->Program)->variants->driver_shader; - break; - case MESA_SHADER_FRAGMENT: - if (st_fragment_program(linked->Program)->variants) - sh = st_fragment_program(linked->Program)->variants->driver_shader; - break; - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: - if (st_common_program(linked->Program)->variants) - sh = st_common_program(linked->Program)->variants->driver_shader; - break; - case MESA_SHADER_COMPUTE: - if (st_compute_program(linked->Program)->variants) - sh = st_compute_program(linked->Program)->variants->driver_shader; - break; - } + if (st_program(linked->Program)->variants) + sh = st_program(linked->Program)->variants->driver_shader; unsigned type = pipe_shader_type_from_mesa(i); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_queryobj.c mesa-20.0.8/src/mesa/state_tracker/st_cb_queryobj.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_queryobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_queryobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -221,6 +221,9 @@ return; } + if (stq->type != PIPE_QUERY_TIMESTAMP) + st->active_queries++; + assert(stq->type == type); } @@ -228,7 +231,8 @@ static void st_EndQuery(struct gl_context *ctx, struct gl_query_object *q) { - struct pipe_context *pipe = st_context(ctx)->pipe; + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; struct st_query_object *stq = st_query_object(q); bool ret = false; @@ -248,6 +252,9 @@ _mesa_error(ctx, GL_OUT_OF_MEMORY, "glEndQuery"); return; } + + if (stq->type != PIPE_QUERY_TIMESTAMP) + st->active_queries--; } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_rasterpos.c mesa-20.0.8/src/mesa/state_tracker/st_cb_rasterpos.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_rasterpos.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_rasterpos.c 2020-06-12 01:21:18.000000000 +0000 @@ -140,7 +140,8 @@ struct gl_context *ctx = rs->ctx; struct st_context *st = st_context(ctx); const GLfloat height = (GLfloat) ctx->DrawBuffer->Height; - const ubyte *outputMapping = st->vp->result_to_output; + struct st_vertex_program *stvp = (struct st_vertex_program *)st->vp; + const ubyte *outputMapping = stvp->result_to_output; const GLfloat *pos; GLuint i; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_readpixels.c mesa-20.0.8/src/mesa/state_tracker/st_cb_readpixels.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_readpixels.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_readpixels.c 2020-06-12 01:21:18.000000000 +0000 @@ -33,7 +33,7 @@ #include "main/enums.h" #include "main/framebuffer.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "cso_cache/cso_context.h" #include "st_cb_fbo.h" @@ -141,7 +141,7 @@ CSO_BIT_RASTERIZER | CSO_BIT_DEPTH_STENCIL_ALPHA | CSO_BIT_STREAM_OUTPUTS | - CSO_BIT_PAUSE_QUERIES | + (st->active_queries ? CSO_BIT_PAUSE_QUERIES : 0) | CSO_BIT_SAMPLE_MASK | CSO_BIT_MIN_SAMPLES | CSO_BIT_RENDER_CONDITION | diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_syncobj.c mesa-20.0.8/src/mesa/state_tracker/st_cb_syncobj.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_syncobj.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_syncobj.c 2020-06-12 01:21:18.000000000 +0000 @@ -41,7 +41,7 @@ struct gl_sync_object b; struct pipe_fence_handle *fence; - mtx_t mutex; /**< protects "fence" */ + simple_mtx_t mutex; /**< protects "fence" */ }; @@ -49,7 +49,7 @@ { struct st_sync_object *so = CALLOC_STRUCT(st_sync_object); - mtx_init(&so->mutex, mtx_plain); + simple_mtx_init(&so->mutex, mtx_plain); return &so->b; } @@ -60,7 +60,7 @@ struct st_sync_object *so = (struct st_sync_object*)obj; screen->fence_reference(screen, &so->fence, NULL); - mtx_destroy(&so->mutex); + simple_mtx_destroy(&so->mutex); free(so->b.Label); free(so); } @@ -74,7 +74,8 @@ assert(condition == GL_SYNC_GPU_COMMANDS_COMPLETE && flags == 0); assert(so->fence == NULL); - pipe->flush(pipe, &so->fence, PIPE_FLUSH_DEFERRED); + /* Deferred flush are only allowed when there's a single context. See issue 1430 */ + pipe->flush(pipe, &so->fence, ctx->Shared->RefCount == 1 ? PIPE_FLUSH_DEFERRED : 0); } static void st_client_wait_sync(struct gl_context *ctx, @@ -87,9 +88,9 @@ struct pipe_fence_handle *fence = NULL; /* If the fence doesn't exist, assume it's signalled. */ - mtx_lock(&so->mutex); + simple_mtx_lock(&so->mutex); if (!so->fence) { - mtx_unlock(&so->mutex); + simple_mtx_unlock(&so->mutex); so->b.StatusFlag = GL_TRUE; return; } @@ -98,7 +99,7 @@ * fence_finish unlocked. */ screen->fence_reference(screen, &fence, so->fence); - mtx_unlock(&so->mutex); + simple_mtx_unlock(&so->mutex); /* Section 4.1.2 of OpenGL 4.5 (Compatibility Profile) says: * [...] if ClientWaitSync is called and all of the following are true: @@ -113,9 +114,9 @@ * forget to set it. */ if (screen->fence_finish(screen, pipe, fence, timeout)) { - mtx_lock(&so->mutex); + simple_mtx_lock(&so->mutex); screen->fence_reference(screen, &so->fence, NULL); - mtx_unlock(&so->mutex); + simple_mtx_unlock(&so->mutex); so->b.StatusFlag = GL_TRUE; } screen->fence_reference(screen, &fence, NULL); @@ -141,16 +142,16 @@ return; /* If the fence doesn't exist, assume it's signalled. */ - mtx_lock(&so->mutex); + simple_mtx_lock(&so->mutex); if (!so->fence) { - mtx_unlock(&so->mutex); + simple_mtx_unlock(&so->mutex); so->b.StatusFlag = GL_TRUE; return; } /* We need a local copy of the fence pointer. */ screen->fence_reference(screen, &fence, so->fence); - mtx_unlock(&so->mutex); + simple_mtx_unlock(&so->mutex); pipe->fence_server_sync(pipe, fence); screen->fence_reference(screen, &fence, NULL); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_cb_texture.c mesa-20.0.8/src/mesa/state_tracker/st_cb_texture.c --- mesa-19.2.8/src/mesa/state_tracker/st_cb_texture.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_cb_texture.c 2020-06-12 01:21:18.000000000 +0000 @@ -70,7 +70,7 @@ #include "util/u_upload_mgr.h" #include "pipe/p_shader_tokens.h" #include "util/u_tile.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_surface.h" #include "util/u_sampler.h" #include "util/u_math.h" @@ -157,6 +157,9 @@ if (!obj) return NULL; + obj->level_override = -1; + obj->layer_override = -1; + /* Pre-allocate a sampler views container to save a branch in the * fast path. */ @@ -223,6 +226,18 @@ st_texture_release_all_sampler_views(st, stObj); } +bool +st_astc_format_fallback(const struct st_context *st, mesa_format format) +{ + if (!_mesa_is_format_astc_2d(format)) + return false; + + if (format == MESA_FORMAT_RGBA_ASTC_5x5 || + format == MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5) + return !st->has_astc_5x5_ldr; + + return !st->has_astc_2d_ldr; +} bool st_compressed_format_fallback(struct st_context *st, mesa_format format) @@ -233,8 +248,8 @@ if (_mesa_is_format_etc2(format)) return !st->has_etc2; - if (_mesa_is_format_astc_2d(format)) - return !st->has_astc_2d_ldr; + if (st_astc_format_fallback(st, format)) + return true; return false; } @@ -510,6 +525,17 @@ if (stImage->base.Level > 0 || stObj->base.GenerateMipmap) return TRUE; + /* If the application has explicitly called glTextureParameter to set + * GL_TEXTURE_MAX_LEVEL, such that (max - base) > 0, then they're trying + * to communicate that they will have multiple miplevels. + * + * Core Mesa will initialize MaxLevel to value much larger than + * MAX_TEXTURE_LEVELS, so we check that to see if it's been set at all. + */ + if (stObj->base.MaxLevel < MAX_TEXTURE_LEVELS && + stObj->base.MaxLevel - stObj->base.BaseLevel > 0) + return TRUE; + if (stImage->base._BaseFormat == GL_DEPTH_COMPONENT || stImage->base._BaseFormat == GL_DEPTH_STENCIL_EXT) /* depth/stencil textures are seldom mipmapped */ @@ -754,8 +780,8 @@ assert(!st_texture_image(texImage)->pt); _mesa_clear_texture_object(ctx, texObj, texImage); - stObj->layer_override = 0; - stObj->level_override = 0; + stObj->layer_override = -1; + stObj->level_override = -1; pipe_resource_reference(&stObj->pt, NULL); /* oops, need to init this image again */ @@ -1230,7 +1256,7 @@ CSO_BIT_DEPTH_STENCIL_ALPHA | CSO_BIT_RASTERIZER | CSO_BIT_STREAM_OUTPUTS | - CSO_BIT_PAUSE_QUERIES | + (st->active_queries ? CSO_BIT_PAUSE_QUERIES : 0) | CSO_BIT_SAMPLE_MASK | CSO_BIT_MIN_SAMPLES | CSO_BIT_RENDER_CONDITION | @@ -1485,7 +1511,7 @@ } util_throttle_memory_usage(pipe, &st->throttle, - width * height * depth * + (uint64_t) width * height * depth * util_format_get_blocksize(dst->format)); u_box_3d(xoffset, yoffset, zoffset + dstz, width, height, depth, &box); @@ -1594,7 +1620,7 @@ } util_throttle_memory_usage(pipe, &st->throttle, - width * height * depth * + (uint64_t) width * height * depth * util_format_get_blocksize(src_templ.format)); throttled = true; @@ -1690,7 +1716,7 @@ fallback: if (!throttled) { util_throttle_memory_usage(pipe, &st->throttle, - width * height * depth * + (uint64_t) width * height * depth * _mesa_get_format_bytes(texImage->TexFormat)); } _mesa_store_texsubimage(ctx, dims, texImage, xoffset, yoffset, zoffset, @@ -2043,6 +2069,8 @@ case PIPE_FORMAT_ASTC_12x10: case PIPE_FORMAT_ASTC_12x12: case PIPE_FORMAT_BPTC_RGBA_UNORM: + case PIPE_FORMAT_FXT1_RGB: + case PIPE_FORMAT_FXT1_RGBA: dst_glformat = GL_RGBA8; break; case PIPE_FORMAT_RGTC1_SNORM: @@ -2087,7 +2115,8 @@ } dst_format = st_choose_format(st, dst_glformat, format, type, - pipe_target, 0, 0, bind, FALSE); + pipe_target, 0, 0, bind, + false, false); if (dst_format == PIPE_FORMAT_NONE) { /* unable to get an rgba format!?! */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_context.c mesa-20.0.8/src/mesa/state_tracker/st_context.c --- mesa-19.2.8/src/mesa/state_tracker/st_context.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_context.c 2020-06-12 01:21:18.000000000 +0000 @@ -57,6 +57,7 @@ #include "st_cb_memoryobjects.h" #include "st_cb_msaa.h" #include "st_cb_perfmon.h" +#include "st_cb_perfquery.h" #include "st_cb_program.h" #include "st_cb_queryobj.h" #include "st_cb_readpixels.h" @@ -137,18 +138,18 @@ static uint64_t st_get_active_states(struct gl_context *ctx) { - struct st_vertex_program *vp = - st_vertex_program(ctx->VertexProgram._Current); - struct st_common_program *tcp = - st_common_program(ctx->TessCtrlProgram._Current); - struct st_common_program *tep = - st_common_program(ctx->TessEvalProgram._Current); - struct st_common_program *gp = - st_common_program(ctx->GeometryProgram._Current); - struct st_fragment_program *fp = - st_fragment_program(ctx->FragmentProgram._Current); - struct st_compute_program *cp = - st_compute_program(ctx->ComputeProgram._Current); + struct st_program *vp = + st_program(ctx->VertexProgram._Current); + struct st_program *tcp = + st_program(ctx->TessCtrlProgram._Current); + struct st_program *tep = + st_program(ctx->TessEvalProgram._Current); + struct st_program *gp = + st_program(ctx->GeometryProgram._Current); + struct st_program *fp = + st_program(ctx->FragmentProgram._Current); + struct st_program *cp = + st_program(ctx->ComputeProgram._Current); uint64_t active_shader_states = 0; if (vp) @@ -227,6 +228,10 @@ _NEW_POINT)) st->dirty |= ST_NEW_RASTERIZER; + if ((new_state & _NEW_LIGHT) && + (st->lower_flatshade || st->lower_two_sided_color)) + st->dirty |= ST_NEW_FS_STATE; + if (new_state & _NEW_PROJECTION && st_user_clip_planes_enabled(ctx)) st->dirty |= ST_NEW_CLIP_STATE; @@ -302,9 +307,9 @@ /* We need a mutex since this function may be called from one thread * while free_zombie_resource_views() is called from another. */ - mtx_lock(&st->zombie_sampler_views.mutex); - LIST_ADDTAIL(&entry->node, &st->zombie_sampler_views.list.node); - mtx_unlock(&st->zombie_sampler_views.mutex); + simple_mtx_lock(&st->zombie_sampler_views.mutex); + list_addtail(&entry->node, &st->zombie_sampler_views.list.node); + simple_mtx_unlock(&st->zombie_sampler_views.mutex); } @@ -335,9 +340,9 @@ /* We need a mutex since this function may be called from one thread * while free_zombie_shaders() is called from another. */ - mtx_lock(&st->zombie_shaders.mutex); - LIST_ADDTAIL(&entry->node, &st->zombie_shaders.list.node); - mtx_unlock(&st->zombie_shaders.mutex); + simple_mtx_lock(&st->zombie_shaders.mutex); + list_addtail(&entry->node, &st->zombie_shaders.list.node); + simple_mtx_unlock(&st->zombie_shaders.mutex); } @@ -349,15 +354,15 @@ { struct st_zombie_sampler_view_node *entry, *next; - if (LIST_IS_EMPTY(&st->zombie_sampler_views.list.node)) { + if (list_is_empty(&st->zombie_sampler_views.list.node)) { return; } - mtx_lock(&st->zombie_sampler_views.mutex); + simple_mtx_lock(&st->zombie_sampler_views.mutex); LIST_FOR_EACH_ENTRY_SAFE(entry, next, &st->zombie_sampler_views.list.node, node) { - LIST_DEL(&entry->node); // remove this entry from the list + list_del(&entry->node); // remove this entry from the list assert(entry->view->context == st->pipe); pipe_sampler_view_reference(&entry->view, NULL); @@ -365,9 +370,9 @@ free(entry); } - assert(LIST_IS_EMPTY(&st->zombie_sampler_views.list.node)); + assert(list_is_empty(&st->zombie_sampler_views.list.node)); - mtx_unlock(&st->zombie_sampler_views.mutex); + simple_mtx_unlock(&st->zombie_sampler_views.mutex); } @@ -379,34 +384,40 @@ { struct st_zombie_shader_node *entry, *next; - if (LIST_IS_EMPTY(&st->zombie_shaders.list.node)) { + if (list_is_empty(&st->zombie_shaders.list.node)) { return; } - mtx_lock(&st->zombie_shaders.mutex); + simple_mtx_lock(&st->zombie_shaders.mutex); LIST_FOR_EACH_ENTRY_SAFE(entry, next, &st->zombie_shaders.list.node, node) { - LIST_DEL(&entry->node); // remove this entry from the list + list_del(&entry->node); // remove this entry from the list switch (entry->type) { case PIPE_SHADER_VERTEX: - cso_delete_vertex_shader(st->cso_context, entry->shader); + st->pipe->bind_vs_state(st->pipe, NULL); + st->pipe->delete_vs_state(st->pipe, entry->shader); break; case PIPE_SHADER_FRAGMENT: - cso_delete_fragment_shader(st->cso_context, entry->shader); + st->pipe->bind_fs_state(st->pipe, NULL); + st->pipe->delete_fs_state(st->pipe, entry->shader); break; case PIPE_SHADER_GEOMETRY: - cso_delete_geometry_shader(st->cso_context, entry->shader); + st->pipe->bind_gs_state(st->pipe, NULL); + st->pipe->delete_gs_state(st->pipe, entry->shader); break; case PIPE_SHADER_TESS_CTRL: - cso_delete_tessctrl_shader(st->cso_context, entry->shader); + st->pipe->bind_tcs_state(st->pipe, NULL); + st->pipe->delete_tcs_state(st->pipe, entry->shader); break; case PIPE_SHADER_TESS_EVAL: - cso_delete_tesseval_shader(st->cso_context, entry->shader); + st->pipe->bind_tes_state(st->pipe, NULL); + st->pipe->delete_tes_state(st->pipe, entry->shader); break; case PIPE_SHADER_COMPUTE: - cso_delete_compute_shader(st->cso_context, entry->shader); + st->pipe->bind_compute_state(st->pipe, NULL); + st->pipe->delete_compute_state(st->pipe, entry->shader); break; default: unreachable("invalid shader type in free_zombie_shaders()"); @@ -414,9 +425,9 @@ free(entry); } - assert(LIST_IS_EMPTY(&st->zombie_shaders.list.node)); + assert(list_is_empty(&st->zombie_shaders.list.node)); - mtx_unlock(&st->zombie_shaders.mutex); + simple_mtx_unlock(&st->zombie_shaders.mutex); } @@ -449,6 +460,7 @@ st_destroy_bound_image_handles(st); for (i = 0; i < ARRAY_SIZE(st->state.frag_sampler_views); i++) { + pipe_sampler_view_reference(&st->state.vert_sampler_views[i], NULL); pipe_sampler_view_reference(&st->state.frag_sampler_views[i], NULL); } @@ -496,7 +508,12 @@ f->NewFramebufferSRGB = ST_NEW_FB_STATE; f->NewScissorRect = ST_NEW_SCISSOR; f->NewScissorTest = ST_NEW_SCISSOR | ST_NEW_RASTERIZER; - f->NewAlphaTest = ST_NEW_DSA; + + if (st->lower_alpha_test) + f->NewAlphaTest = ST_NEW_FS_STATE; + else + f->NewAlphaTest = ST_NEW_DSA; + f->NewBlend = ST_NEW_BLEND; f->NewBlendColor = ST_NEW_BLEND_COLOR; f->NewColorMask = ST_NEW_BLEND; @@ -520,7 +537,6 @@ f->NewClipControl = ST_NEW_VIEWPORT | ST_NEW_RASTERIZER; f->NewClipPlane = ST_NEW_CLIP_STATE; - f->NewClipPlaneEnable = ST_NEW_RASTERIZER; if (st->clamp_frag_depth_in_shader) { f->NewClipControl |= ST_NEW_VS_STATE | ST_NEW_GS_STATE | @@ -532,6 +548,11 @@ f->NewDepthClamp = ST_NEW_RASTERIZER; } + if (st->lower_ucp) + f->NewClipPlaneEnable = ST_NEW_VS_STATE; + else + f->NewClipPlaneEnable = ST_NEW_RASTERIZER; + f->NewLineState = ST_NEW_RASTERIZER; f->NewPolygonState = ST_NEW_RASTERIZER; f->NewPolygonStipple = ST_NEW_POLY_STIPPLE; @@ -571,9 +592,9 @@ * profile, so that u_vbuf is bypassed completely if there is nothing else * to do. */ - unsigned vbuf_flags = - ctx->API == API_OPENGL_CORE ? U_VBUF_FLAG_NO_USER_VBOS : 0; - st->cso_context = cso_create_context(pipe, vbuf_flags); + unsigned cso_flags = + ctx->API == API_OPENGL_CORE ? CSO_NO_USER_VERTEX_BUFFERS : 0; + st->cso_context = cso_create_context(pipe, cso_flags); st_init_atoms(st); st_init_clear(st); @@ -621,6 +642,11 @@ ctx->Const.PackedDriverUniformStorage = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS); + ctx->Const.BitmapUsesRed = + screen->is_format_supported(screen, PIPE_FORMAT_R8_UNORM, + PIPE_TEXTURE_2D, 0, 0, + PIPE_BIND_SAMPLER_VIEW); + st->has_stencil_export = screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT); st->has_etc1 = screen->is_format_supported(screen, PIPE_FORMAT_ETC1_RGB8, @@ -632,6 +658,9 @@ st->has_astc_2d_ldr = screen->is_format_supported(screen, PIPE_FORMAT_ASTC_4x4_SRGB, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW); + st->has_astc_5x5_ldr = + screen->is_format_supported(screen, PIPE_FORMAT_ASTC_5x5_SRGB, + PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW); st->prefer_blit_based_texture_transfer = screen->get_param(screen, PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER); st->force_persample_in_shader = @@ -659,6 +688,17 @@ screen->get_param(screen, PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND); st->has_signed_vertex_buffer_offset = screen->get_param(screen, PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET); + st->lower_flatshade = + !screen->get_param(screen, PIPE_CAP_FLATSHADE); + st->lower_alpha_test = + !screen->get_param(screen, PIPE_CAP_ALPHA_TEST); + st->lower_point_size = + !screen->get_param(screen, PIPE_CAP_POINT_SIZE_FIXED); + st->lower_two_sided_color = + !screen->get_param(screen, PIPE_CAP_TWO_SIDED_COLOR); + st->lower_ucp = + !screen->get_param(screen, PIPE_CAP_CLIP_PLANES); + st->allow_st_finalize_nir_twice = screen->finalize_nir != NULL; st->has_hw_atomics = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, @@ -674,10 +714,26 @@ st_init_extensions(pipe->screen, &ctx->Const, &ctx->Extensions, &st->options, ctx->API); + /* FIXME: add support for geometry and tessellation shaders for + * lower_point_size + */ + assert(!ctx->Extensions.OES_geometry_shader || !st->lower_point_size); + assert(!ctx->Extensions.ARB_tessellation_shader || !st->lower_point_size); + + /* FIXME: add support for geometry and tessellation shaders for + * lower_ucp + */ + assert(!ctx->Extensions.OES_geometry_shader || !st->lower_ucp); + assert(!ctx->Extensions.ARB_tessellation_shader || !st->lower_ucp); + if (st_have_perfmon(st)) { ctx->Extensions.AMD_performance_monitor = GL_TRUE; } + if (st_have_perfquery(st)) { + ctx->Extensions.INTEL_performance_query = GL_TRUE; + } + /* Enable shader-based fallbacks for ARB_color_buffer_float if needed. */ if (screen->get_param(screen, PIPE_CAP_VERTEX_COLOR_UNCLAMPED)) { if (!screen->get_param(screen, PIPE_CAP_VERTEX_COLOR_CLAMPED)) { @@ -713,6 +769,13 @@ ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoSat = !screen->get_param(screen, PIPE_CAP_VERTEX_SHADER_SATURATE); + ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].PositionAlwaysInvariant = options->vs_position_always_invariant; + + enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir) + screen->get_shader_param(screen, PIPE_SHADER_VERTEX, + PIPE_SHADER_CAP_PREFERRED_IR); + ctx->Const.UseNIRGLSLLinker = preferred_ir == PIPE_SHADER_IR_NIR; + if (ctx->Const.GLSLVersion < 400) { for (i = 0; i < MESA_SHADER_STAGES; i++) ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true; @@ -722,13 +785,18 @@ st->shader_has_one_variant[MESA_SHADER_VERTEX] = st->has_shareable_shaders && !st->clamp_frag_depth_in_shader && - !st->clamp_vert_color_in_shader; + !st->clamp_vert_color_in_shader && + !st->lower_point_size && + !st->lower_ucp; st->shader_has_one_variant[MESA_SHADER_FRAGMENT] = st->has_shareable_shaders && + !st->lower_flatshade && + !st->lower_alpha_test && !st->clamp_frag_color_in_shader && !st->clamp_frag_depth_in_shader && - !st->force_persample_in_shader; + !st->force_persample_in_shader && + !st->lower_two_sided_color; st->shader_has_one_variant[MESA_SHADER_TESS_CTRL] = st->has_shareable_shaders; st->shader_has_one_variant[MESA_SHADER_TESS_EVAL] = @@ -759,12 +827,12 @@ st_init_driver_flags(st); /* Initialize context's winsys buffers list */ - LIST_INITHEAD(&st->winsys_buffers); + list_inithead(&st->winsys_buffers); - LIST_INITHEAD(&st->zombie_sampler_views.list.node); - mtx_init(&st->zombie_sampler_views.mutex, mtx_plain); - LIST_INITHEAD(&st->zombie_shaders.list.node); - mtx_init(&st->zombie_shaders.mutex, mtx_plain); + list_inithead(&st->zombie_sampler_views.list.node); + simple_mtx_init(&st->zombie_sampler_views.mutex, mtx_plain); + list_inithead(&st->zombie_shaders.list.node); + simple_mtx_init(&st->zombie_shaders.mutex, mtx_plain); return st; } @@ -837,6 +905,7 @@ st_init_memoryobject_functions(functions); st_init_msaa_functions(functions); st_init_perfmon_functions(functions); + st_init_perfquery_functions(functions); st_init_program_functions(functions); st_init_query_functions(functions); st_init_cond_render_functions(functions); @@ -914,8 +983,7 @@ st_debug_init(); - if (pipe->screen->get_disk_shader_cache && - !(ST_DEBUG & DEBUG_TGSI)) + if (pipe->screen->get_disk_shader_cache) ctx->Cache = pipe->screen->get_disk_shader_cache(pipe->screen); /* XXX: need a capability bit in gallium to query if the pipe @@ -1005,15 +1073,15 @@ st_context_free_zombie_objects(st); - mtx_destroy(&st->zombie_sampler_views.mutex); - mtx_destroy(&st->zombie_shaders.mutex); + simple_mtx_destroy(&st->zombie_sampler_views.mutex); + simple_mtx_destroy(&st->zombie_shaders.mutex); - st_reference_fragprog(st, &st->fp, NULL); - st_reference_prog(st, &st->gp, NULL); - st_reference_vertprog(st, &st->vp, NULL); - st_reference_prog(st, &st->tcp, NULL); - st_reference_prog(st, &st->tep, NULL); - st_reference_compprog(st, &st->cp, NULL); + st_release_program(st, &st->fp); + st_release_program(st, &st->gp); + st_release_program(st, &st->vp); + st_release_program(st, &st->tcp); + st_release_program(st, &st->tep); + st_release_program(st, &st->cp); /* release framebuffer in the winsys buffers list */ LIST_FOR_EACH_ENTRY_SAFE_REV(stfb, next, &st->winsys_buffers, head) { @@ -1029,18 +1097,13 @@ st_destroy_program_variants(st); - _mesa_free_context_data(ctx, false); + _mesa_free_context_data(ctx); /* This will free the st_context too, so 'st' must not be accessed * afterwards. */ st_destroy_context_priv(st, true); st = NULL; - /* This must be called after st_destroy_context_priv() to avoid a race - * condition between any shader compiler threads and context destruction. - */ - _mesa_destroy_shader_compiler_types(); - free(ctx); if (save_ctx == ctx) { diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_context.h mesa-20.0.8/src/mesa/state_tracker/st_context.h --- mesa-19.2.8/src/mesa/state_tracker/st_context.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_context.h 2020-06-12 01:21:18.000000000 +0000 @@ -50,7 +50,7 @@ struct draw_stage; struct gen_mipmap_state; struct st_context; -struct st_fragment_program; +struct st_program; struct st_perf_monitor_group; struct u_upload_mgr; @@ -137,6 +137,7 @@ boolean has_etc1; boolean has_etc2; boolean has_astc_2d_ldr; + boolean has_astc_5x5_ldr; boolean prefer_blit_based_texture_transfer; boolean force_persample_in_shader; boolean has_shareable_shaders; @@ -147,6 +148,22 @@ boolean needs_rgb_dst_alpha_override; boolean can_bind_const_buffer_as_vertex; boolean has_signed_vertex_buffer_offset; + boolean lower_flatshade; + boolean lower_alpha_test; + boolean lower_point_size; + boolean lower_two_sided_color; + boolean lower_ucp; + + /* There are consequences for drivers wanting to call st_finalize_nir + * twice, once before shader caching and once after lowering for shader + * variants. If shader variants use lowering passes that are not ready + * for that, things can blow up. + * + * If this is true, st_finalize_nir and pipe_screen::finalize_nir will be + * called before the result is stored in the shader cache. If lowering for + * shader variants is invoked, the functions will be called again. + */ + boolean allow_st_finalize_nir_twice; /** * If a shader can be created when we get its source. @@ -172,8 +189,11 @@ struct pipe_blend_state blend; struct pipe_depth_stencil_alpha_state depth_stencil; struct pipe_rasterizer_state rasterizer; + struct pipe_sampler_state vert_samplers[PIPE_MAX_SAMPLERS]; struct pipe_sampler_state frag_samplers[PIPE_MAX_SAMPLERS]; + GLuint num_vert_samplers; GLuint num_frag_samplers; + struct pipe_sampler_view *vert_sampler_views[PIPE_MAX_SAMPLERS]; struct pipe_sampler_view *frag_sampler_views[PIPE_MAX_SAMPLERS]; GLuint num_sampler_views[PIPE_SHADER_TYPES]; struct pipe_clip_state clip; @@ -222,14 +242,25 @@ GLboolean vertdata_edgeflags; GLboolean edgeflag_culls_prims; - struct st_vertex_program *vp; /**< Currently bound vertex program */ - struct st_fragment_program *fp; /**< Currently bound fragment program */ - struct st_common_program *gp; /**< Currently bound geometry program */ - struct st_common_program *tcp; /**< Currently bound tess control program */ - struct st_common_program *tep; /**< Currently bound tess eval program */ - struct st_compute_program *cp; /**< Currently bound compute program */ + /** + * The number of currently active queries (excluding timer queries). + * This is used to know if we need to pause any queries for meta ops. + */ + unsigned active_queries; + + union { + struct { + struct st_program *vp; /**< Currently bound vertex program */ + struct st_program *tcp; /**< Currently bound tess control program */ + struct st_program *tep; /**< Currently bound tess eval program */ + struct st_program *gp; /**< Currently bound geometry program */ + struct st_program *fp; /**< Currently bound fragment program */ + struct st_program *cp; /**< Currently bound compute program */ + }; + struct gl_program *current_program[MESA_SHADER_STAGES]; + }; - struct st_vp_variant *vp_variant; + struct st_common_variant *vp_variant; struct { struct pipe_resource *pixelmap_texture; @@ -306,6 +337,9 @@ /* The number of vertex buffers from the last call of validate_arrays. */ unsigned last_num_vbuffers; + unsigned last_used_atomic_bindings[PIPE_SHADER_TYPES]; + unsigned last_num_ssbos[PIPE_SHADER_TYPES]; + int32_t draw_stamp; int32_t read_stamp; @@ -331,12 +365,12 @@ struct { struct st_zombie_sampler_view_node list; - mtx_t mutex; + simple_mtx_t mutex; } zombie_sampler_views; struct { struct st_zombie_shader_node list; - mtx_t mutex; + simple_mtx_t mutex; } zombie_shaders; }; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_debug.c mesa-20.0.8/src/mesa/state_tracker/st_debug.c --- mesa-19.2.8/src/mesa/state_tracker/st_debug.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_debug.c 2020-06-12 01:21:18.000000000 +0000 @@ -32,7 +32,6 @@ #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" -#include "tgsi/tgsi_dump.h" #include "cso_cache/cso_cache.h" @@ -46,7 +45,8 @@ static const struct debug_named_value st_debug_flags[] = { { "mesa", DEBUG_MESA, NULL }, - { "tgsi", DEBUG_TGSI, NULL }, + { "tgsi", DEBUG_PRINT_IR, NULL }, + { "nir", DEBUG_PRINT_IR, NULL }, { "constants",DEBUG_CONSTANTS, NULL }, { "pipe", DEBUG_PIPE, NULL }, { "tex", DEBUG_TEX, NULL }, @@ -72,37 +72,6 @@ } - -/** - * Print current state. May be called from inside gdb to see currently - * bound vertex/fragment shaders and associated constants. - */ -void -st_print_current(void) -{ - GET_CURRENT_CONTEXT(ctx); - struct st_context *st = st_context(ctx); - -#if 0 - int i; - - printf("Vertex Transform Inputs:\n"); - for (i = 0; i < st->vp->state.num_inputs; i++) { - printf(" Slot %d: VERT_ATTRIB_%d\n", i, st->vp->index_to_input[i]); - } -#endif - - if (st->vp->variants) - tgsi_dump( st->vp->variants[0].tgsi.tokens, 0 ); - if (st->vp->Base.Parameters) - _mesa_print_parameter_list(st->vp->Base.Parameters); - - tgsi_dump(st->fp->tgsi.tokens, 0); - if (st->fp->Base.Parameters) - _mesa_print_parameter_list(st->fp->Base.Parameters); -} - - /** * Installed as pipe_debug_callback when GL_DEBUG_OUTPUT is enabled. */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_debug.h mesa-20.0.8/src/mesa/state_tracker/st_debug.h --- mesa-19.2.8/src/mesa/state_tracker/st_debug.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_debug.h 2020-06-12 01:21:18.000000000 +0000 @@ -34,12 +34,8 @@ struct st_context; -extern void -st_print_current(void); - - #define DEBUG_MESA 0x1 -#define DEBUG_TGSI 0x2 +#define DEBUG_PRINT_IR 0x2 #define DEBUG_CONSTANTS 0x4 #define DEBUG_PIPE 0x8 #define DEBUG_TEX 0x10 diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_draw.c mesa-20.0.8/src/mesa/state_tracker/st_draw.c --- mesa-19.2.8/src/mesa/state_tracker/st_draw.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_draw.c 2020-06-12 01:21:18.000000000 +0000 @@ -61,7 +61,7 @@ #include "pipe/p_defines.h" #include "util/u_cpu_detect.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_prim.h" #include "util/u_draw.h" #include "util/u_upload_mgr.h" diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_draw_feedback.c mesa-20.0.8/src/mesa/state_tracker/st_draw_feedback.c --- mesa-19.2.8/src/mesa/state_tracker/st_draw_feedback.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_draw_feedback.c 2020-06-12 01:21:18.000000000 +0000 @@ -45,6 +45,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "util/u_draw.h" +#include "util/format/u_format.h" #include "draw/draw_private.h" #include "draw/draw_context.h" @@ -107,8 +108,7 @@ struct pipe_context *pipe = st->pipe; struct draw_context *draw = st_get_draw_context(st); const struct st_vertex_program *vp; - struct st_vp_variant *vp_variant; - const struct pipe_shader_state *vs; + struct st_common_variant *vp_variant; struct pipe_vertex_buffer vbuffers[PIPE_MAX_SHADER_INPUTS]; unsigned num_vbuffers = 0; struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS]; @@ -137,13 +137,13 @@ vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index, nr_prims); /* must get these after state validation! */ - vp = st->vp; - vp_variant = st->vp_variant; - vs = &vp_variant->tgsi; + struct st_common_variant_key key; + /* We have to use memcpy to make sure that all bits are copied. */ + memcpy(&key, &st->vp_variant->key, sizeof(key)); + key.is_draw_shader = true; - if (!vp_variant->draw_shader) { - vp_variant->draw_shader = draw_create_vertex_shader(draw, vs); - } + vp = (struct st_vertex_program *)st->vp; + vp_variant = st_get_vp_variant(st, st->vp, &key); /* * Set up the draw module's state. @@ -155,12 +155,14 @@ draw_set_viewport_states(draw, 0, 1, &st->state.viewport[0]); draw_set_clip_state(draw, &st->state.clip); draw_set_rasterizer_state(draw, &st->state.rasterizer, NULL); - draw_bind_vertex_shader(draw, vp_variant->draw_shader); + draw_bind_vertex_shader(draw, vp_variant->base.driver_shader); set_feedback_vertex_format(ctx); /* Must setup these after state validation! */ /* Setup arrays */ - st_setup_arrays(st, vp, vp_variant, velements, vbuffers, &num_vbuffers); + bool uses_user_vertex_buffers; + st_setup_arrays(st, vp, vp_variant, velements, vbuffers, &num_vbuffers, + &uses_user_vertex_buffers); /* Setup current values as userspace arrays */ st_setup_current_user(st, vp, vp_variant, velements, vbuffers, &num_vbuffers); @@ -220,11 +222,203 @@ info.has_user_indices = false; } - /* set the constant buffer */ - draw_set_mapped_constant_buffer(st->draw, PIPE_SHADER_VERTEX, 0, + /* set constant buffers */ + draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, st->state.constants[PIPE_SHADER_VERTEX].ptr, st->state.constants[PIPE_SHADER_VERTEX].size); + const struct gl_program *prog = &vp->Base.Base; + struct pipe_transfer *ubo_transfer[PIPE_MAX_CONSTANT_BUFFERS] = {0}; + assert(prog->info.num_ubos <= ARRAY_SIZE(ubo_transfer)); + + for (unsigned i = 0; i < prog->info.num_ubos; i++) { + struct gl_buffer_binding *binding = + &st->ctx->UniformBufferBindings[prog->sh.UniformBlocks[i]->Binding]; + struct st_buffer_object *st_obj = st_buffer_object(binding->BufferObject); + struct pipe_resource *buf = st_obj->buffer; + + if (!buf) + continue; + + unsigned offset = binding->Offset; + unsigned size = buf->width0 - offset; + + /* AutomaticSize is FALSE if the buffer was set with BindBufferRange. + * Take the minimum just to be sure. + */ + if (!binding->AutomaticSize) + size = MIN2(size, (unsigned) binding->Size); + + void *ptr = pipe_buffer_map_range(pipe, buf, offset, size, + PIPE_TRANSFER_READ, &ubo_transfer[i]); + + draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 1 + i, ptr, + size); + } + + /* shader buffers */ + /* TODO: atomic counter buffers */ + struct pipe_transfer *ssbo_transfer[PIPE_MAX_SHADER_BUFFERS] = {0}; + + for (unsigned i = 0; i < prog->info.num_ssbos; i++) { + struct gl_buffer_binding *binding = + &st->ctx->ShaderStorageBufferBindings[ + prog->sh.ShaderStorageBlocks[i]->Binding]; + struct st_buffer_object *st_obj = st_buffer_object(binding->BufferObject); + struct pipe_resource *buf = st_obj->buffer; + + if (!buf) + continue; + + unsigned offset = binding->Offset; + unsigned size = buf->width0 - binding->Offset; + + /* AutomaticSize is FALSE if the buffer was set with BindBufferRange. + * Take the minimum just to be sure. + */ + if (!binding->AutomaticSize) + size = MIN2(size, (unsigned) binding->Size); + + void *ptr = pipe_buffer_map_range(pipe, buf, offset, size, + PIPE_TRANSFER_READ, &ssbo_transfer[i]); + + draw_set_mapped_shader_buffer(draw, PIPE_SHADER_VERTEX, + i, ptr, size); + } + + /* samplers */ + struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; + for (unsigned i = 0; i < st->state.num_vert_samplers; i++) + samplers[i] = &st->state.vert_samplers[i]; + + draw_set_samplers(draw, PIPE_SHADER_VERTEX, samplers, + st->state.num_vert_samplers); + + /* sampler views */ + draw_set_sampler_views(draw, PIPE_SHADER_VERTEX, + st->state.vert_sampler_views, + st->state.num_sampler_views[PIPE_SHADER_VERTEX]); + + struct pipe_transfer *sv_transfer[PIPE_MAX_SAMPLERS][PIPE_MAX_TEXTURE_LEVELS]; + + for (unsigned i = 0; i < st->state.num_sampler_views[PIPE_SHADER_VERTEX]; i++) { + struct pipe_sampler_view *view = st->state.vert_sampler_views[i]; + if (!view) + continue; + + struct pipe_resource *res = view->texture; + unsigned width0 = res->width0; + unsigned num_layers = res->depth0; + unsigned first_level = 0; + unsigned last_level = 0; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t mip_offset[PIPE_MAX_TEXTURE_LEVELS]; + uintptr_t mip_addr[PIPE_MAX_TEXTURE_LEVELS]; + uintptr_t base_addr; + + if (res->target != PIPE_BUFFER) { + first_level = view->u.tex.first_level; + last_level = view->u.tex.last_level; + num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1; + base_addr = UINTPTR_MAX; + + for (unsigned j = first_level; j <= last_level; j++) { + unsigned map_layers = res->target == PIPE_TEXTURE_3D ? + util_num_layers(res, j) : num_layers; + + sv_transfer[i][j] = NULL; + mip_addr[j] = (uintptr_t) + pipe_transfer_map_3d(pipe, res, j, + PIPE_TRANSFER_READ, 0, 0, + view->u.tex.first_layer, + u_minify(res->width0, j), + u_minify(res->height0, j), + map_layers, &sv_transfer[i][j]); + row_stride[j] = sv_transfer[i][j]->stride; + img_stride[j] = sv_transfer[i][j]->layer_stride; + + /* Get the minimum address, because the draw module takes only + * 1 address for the whole texture + uint32 offsets for mip levels, + * so we need to convert mapped resource pointers into that scheme. + */ + base_addr = MIN2(base_addr, mip_addr[j]); + } + for (unsigned j = first_level; j <= last_level; j++) { + /* TODO: The draw module should accept pointers for mipmap levels + * instead of offsets. This is unlikely to work on 64-bit archs. + */ + assert(mip_addr[j] - base_addr <= UINT32_MAX); + mip_offset[j] = mip_addr[j] - base_addr; + } + } else { + width0 = view->u.buf.size / util_format_get_blocksize(view->format); + + /* probably don't really need to fill that out */ + mip_offset[0] = 0; + row_stride[0] = 0; + img_stride[0] = 0; + + sv_transfer[i][0] = NULL; + base_addr = (uintptr_t) + pipe_buffer_map_range(pipe, res, view->u.buf.offset, + view->u.buf.size, + PIPE_TRANSFER_READ, + &sv_transfer[i][0]); + } + + draw_set_mapped_texture(draw, PIPE_SHADER_VERTEX, i, width0, + res->height0, num_layers, first_level, + last_level, (void*)base_addr, row_stride, + img_stride, mip_offset); + } + + /* shader images */ + struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES]; + struct pipe_transfer *img_transfer[PIPE_MAX_SHADER_IMAGES] = {0}; + + for (unsigned i = 0; i < prog->info.num_images; i++) { + struct pipe_image_view *img = &images[i]; + + st_convert_image_from_unit(st, img, prog->sh.ImageUnits[i], + prog->sh.ImageAccess[i]); + + struct pipe_resource *res = img->resource; + if (!res) + continue; + + unsigned width, height, num_layers, row_stride, img_stride; + void *addr; + + if (res->target != PIPE_BUFFER) { + width = u_minify(res->width0, img->u.tex.level); + height = u_minify(res->height0, img->u.tex.level); + num_layers = img->u.tex.last_layer - img->u.tex.first_layer + 1; + + addr = pipe_transfer_map_3d(pipe, res, img->u.tex.level, + PIPE_TRANSFER_READ, 0, 0, + img->u.tex.first_layer, + width, height, num_layers, + &img_transfer[i]); + row_stride = img_transfer[i]->stride; + img_stride = img_transfer[i]->layer_stride; + } else { + width = img->u.buf.size / util_format_get_blocksize(img->format); + + /* probably don't really need to fill that out */ + row_stride = 0; + img_stride = 0; + height = num_layers = 1; + + addr = pipe_buffer_map_range(pipe, res, img->u.buf.offset, + img->u.buf.size, PIPE_TRANSFER_READ, + &img_transfer[i]); + } + + draw_set_mapped_image(draw, PIPE_SHADER_VERTEX, i, width, height, + num_layers, addr, row_stride, img_stride); + } + draw_set_images(draw, PIPE_SHADER_VERTEX, images, prog->info.num_images); /* draw here */ for (i = 0; i < nr_prims; i++) { @@ -247,6 +441,48 @@ draw_vbo(draw, &info); } + /* unmap images */ + for (unsigned i = 0; i < prog->info.num_images; i++) { + if (img_transfer[i]) { + draw_set_mapped_image(draw, PIPE_SHADER_VERTEX, i, 0, 0, 0, NULL, 0, 0); + pipe_transfer_unmap(pipe, img_transfer[i]); + } + } + + /* unmap sampler views */ + for (unsigned i = 0; i < st->state.num_sampler_views[PIPE_SHADER_VERTEX]; i++) { + struct pipe_sampler_view *view = st->state.vert_sampler_views[i]; + + if (view) { + if (view->texture->target != PIPE_BUFFER) { + for (unsigned j = view->u.tex.first_level; + j <= view->u.tex.last_level; j++) { + pipe_transfer_unmap(pipe, sv_transfer[i][j]); + } + } else { + pipe_transfer_unmap(pipe, sv_transfer[i][0]); + } + } + } + + draw_set_samplers(draw, PIPE_SHADER_VERTEX, NULL, 0); + draw_set_sampler_views(draw, PIPE_SHADER_VERTEX, NULL, 0); + + for (unsigned i = 0; i < prog->info.num_ssbos; i++) { + if (ssbo_transfer[i]) { + draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 1 + i, + NULL, 0); + pipe_buffer_unmap(pipe, ssbo_transfer[i]); + } + } + + for (unsigned i = 0; i < prog->info.num_ubos; i++) { + if (ubo_transfer[i]) { + draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 1 + i, + NULL, 0); + pipe_buffer_unmap(pipe, ubo_transfer[i]); + } + } /* * unmap vertex/index buffers diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_extensions.c mesa-20.0.8/src/mesa/state_tracker/st_extensions.c --- mesa-19.2.8/src/mesa/state_tracker/st_extensions.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_extensions.c 2020-06-12 01:21:18.000000000 +0000 @@ -31,6 +31,7 @@ #include "main/imports.h" #include "main/context.h" #include "main/macros.h" +#include "main/spirv_extensions.h" #include "main/version.h" #include "pipe/p_context.h" @@ -131,11 +132,8 @@ c->MaxPointSizeAA = _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_POINT_WIDTH_AA)); - /* these are not queryable. Note that GL basically mandates a 1.0 minimum - * for non-aa sizes, but we can go down to 0.0 for aa points. - */ c->MinPointSize = 1.0f; - c->MinPointSizeAA = 0.0f; + c->MinPointSizeAA = 1.0f; c->MaxTextureMaxAnisotropy = _maxf(2.0f, @@ -165,7 +163,10 @@ struct gl_program_constants *pc; const nir_shader_compiler_options *nir_options = NULL; - if (screen->get_compiler_options) { + bool prefer_nir = PIPE_SHADER_IR_NIR == + screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_PREFERRED_IR); + + if (screen->get_compiler_options && prefer_nir) { nir_options = (const nir_shader_compiler_options *) screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sh); } @@ -326,9 +327,6 @@ if (!screen->get_param(screen, PIPE_CAP_NIR_COMPACT_ARRAYS)) options->LowerCombinedClipCullDistance = true; - bool prefer_nir = PIPE_SHADER_IR_NIR == - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_PREFERRED_IR); - /* NIR can do the lowering on our behalf and we'll get better results * because it can actually optimize SSBO access. */ @@ -344,6 +342,8 @@ c->GLSLOptimizeConservatively = screen->get_param(screen, PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY); + c->GLSLLowerConstArrays = + screen->get_param(screen, PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF); c->GLSLTessLevelsAsInputs = screen->get_param(screen, PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS); c->LowerTessLevel = @@ -704,8 +704,10 @@ { o(ARB_draw_buffers_blend), PIPE_CAP_INDEP_BLEND_FUNC }, { o(ARB_draw_indirect), PIPE_CAP_DRAW_INDIRECT }, { o(ARB_draw_instanced), PIPE_CAP_TGSI_INSTANCEID }, + { o(ARB_fragment_program_shadow), PIPE_CAP_TEXTURE_SHADOW_MAP }, { o(ARB_framebuffer_object), PIPE_CAP_MIXED_FRAMEBUFFER_SIZES }, { o(ARB_gpu_shader_int64), PIPE_CAP_INT64 }, + { o(ARB_gl_spirv), PIPE_CAP_GL_SPIRV }, { o(ARB_indirect_parameters), PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS }, { o(ARB_instanced_arrays), PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR }, { o(ARB_occlusion_query), PIPE_CAP_OCCLUSION_QUERY }, @@ -729,7 +731,9 @@ { o(ARB_shader_stencil_export), PIPE_CAP_SHADER_STENCIL_EXPORT }, { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS }, { o(ARB_shader_texture_lod), PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD }, + { o(ARB_shadow), PIPE_CAP_TEXTURE_SHADOW_MAP }, { o(ARB_sparse_buffer), PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE }, + { o(ARB_spirv_extensions), PIPE_CAP_GL_SPIRV }, { o(ARB_texture_buffer_object), PIPE_CAP_TEXTURE_BUFFER_OBJECTS }, { o(ARB_texture_cube_map_array), PIPE_CAP_CUBE_MAP_ARRAY }, { o(ARB_texture_gather), PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS }, @@ -745,6 +749,7 @@ { o(ARB_fragment_shader_interlock), PIPE_CAP_FRAGMENT_SHADER_INTERLOCK }, { o(EXT_blend_equation_separate), PIPE_CAP_BLEND_EQUATION_SEPARATE }, + { o(EXT_demote_to_helper_invocation), PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION }, { o(EXT_depth_bounds_test), PIPE_CAP_DEPTH_BOUNDS_TEST }, { o(EXT_disjoint_timer_query), PIPE_CAP_QUERY_TIMESTAMP }, { o(EXT_draw_buffers2), PIPE_CAP_INDEP_BLEND_ENABLE }, @@ -817,6 +822,11 @@ { PIPE_FORMAT_R8_UNORM, PIPE_FORMAT_R8G8_UNORM } }, + { { o(EXT_texture_norm16) }, + { PIPE_FORMAT_R16_UNORM, + PIPE_FORMAT_R16G16_UNORM, + PIPE_FORMAT_R16G16B16A16_UNORM } }, + { { o(EXT_render_snorm) }, { PIPE_FORMAT_R8_SNORM, PIPE_FORMAT_R8G8_SNORM, @@ -872,6 +882,10 @@ PIPE_FORMAT_BPTC_RGB_FLOAT, PIPE_FORMAT_BPTC_RGB_UFLOAT } }, + { { o(TDFX_texture_compression_FXT1) }, + { PIPE_FORMAT_FXT1_RGB, + PIPE_FORMAT_FXT1_RGBA } }, + { { o(KHR_texture_compression_astc_ldr), o(KHR_texture_compression_astc_sliced_3d) }, { PIPE_FORMAT_ASTC_4x4, @@ -992,13 +1006,11 @@ extensions->ARB_explicit_uniform_location = GL_TRUE; extensions->ARB_fragment_coord_conventions = GL_TRUE; extensions->ARB_fragment_program = GL_TRUE; - extensions->ARB_fragment_program_shadow = GL_TRUE; extensions->ARB_fragment_shader = GL_TRUE; extensions->ARB_half_float_vertex = GL_TRUE; extensions->ARB_internalformat_query = GL_TRUE; extensions->ARB_internalformat_query2 = GL_TRUE; extensions->ARB_map_buffer_range = GL_TRUE; - extensions->ARB_shadow = GL_TRUE; extensions->ARB_sync = GL_TRUE; extensions->ARB_texture_border_clamp = GL_TRUE; extensions->ARB_texture_cube_map = GL_TRUE; @@ -1011,6 +1023,7 @@ extensions->EXT_blend_color = GL_TRUE; extensions->EXT_blend_func_separate = GL_TRUE; extensions->EXT_blend_minmax = GL_TRUE; + extensions->EXT_EGL_image_storage = GL_TRUE; extensions->EXT_gpu_program_parameters = GL_TRUE; extensions->EXT_pixel_buffer_object = GL_TRUE; extensions->EXT_point_parameters = GL_TRUE; @@ -1021,6 +1034,7 @@ extensions->ATI_fragment_shader = GL_TRUE; extensions->ATI_texture_env_combine3 = GL_TRUE; + extensions->MESA_framebuffer_flip_y = GL_TRUE; extensions->MESA_pack_invert = GL_TRUE; extensions->NV_fog_distance = GL_TRUE; @@ -1136,6 +1150,11 @@ extensions->EXT_shader_integer_mix = GL_TRUE; extensions->ARB_arrays_of_arrays = GL_TRUE; extensions->MESA_shader_integer_functions = GL_TRUE; + + if (screen->get_param(screen, PIPE_CAP_OPENCL_INTEGER_FUNCTIONS) && + screen->get_param(screen, PIPE_CAP_INTEGER_MULTIPLY_32X16)) { + extensions->INTEL_shader_integer_functions2 = GL_TRUE; + } } else { /* Optional integer support for GLSL 1.2. */ if (screen->get_shader_param(screen, PIPE_SHADER_VERTEX, @@ -1170,7 +1189,7 @@ * invocations of a geometry shader. There is no separate cap for that, so * we check the GLSLVersion. */ - if (GLSLVersion >= 400 && + if ((GLSLVersion >= 400 || ESSLVersion >= 310) && screen->get_shader_param(screen, PIPE_SHADER_GEOMETRY, PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) { extensions->OES_geometry_shader = GL_TRUE; @@ -1443,7 +1462,6 @@ */ if (GLSLVersion >= 130 && extensions->ARB_uniform_buffer_object && - extensions->ARB_shader_bit_encoding && extensions->NV_primitive_restart && screen->get_shader_param(screen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS) >= 16 && @@ -1654,4 +1672,23 @@ pre_snap_triangles && pre_snap_points_lines; } } + + if (extensions->ARB_gl_spirv) { + struct spirv_supported_capabilities *spirv_caps = &consts->SpirVCapabilities; + + spirv_caps->atomic_storage = extensions->ARB_shader_atomic_counters; + spirv_caps->draw_parameters = extensions->ARB_shader_draw_parameters; + spirv_caps->float64 = extensions->ARB_gpu_shader_fp64; + spirv_caps->geometry_streams = extensions->ARB_gpu_shader5; + spirv_caps->image_write_without_format = extensions->ARB_shader_image_load_store; + spirv_caps->int64 = extensions->ARB_gpu_shader_int64; + spirv_caps->tessellation = extensions->ARB_tessellation_shader; + spirv_caps->transform_feedback = extensions->ARB_transform_feedback3; + spirv_caps->variable_pointers = + screen->get_param(screen, PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS); + spirv_caps->integer_functions2 = extensions->INTEL_shader_integer_functions2; + + consts->SpirVExtensions = CALLOC_STRUCT(spirv_supported_extensions); + _mesa_fill_supported_spirv_extensions(consts->SpirVExtensions, spirv_caps); + } } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_format.c mesa-20.0.8/src/mesa/state_tracker/st_format.c --- mesa-19.2.8/src/mesa/state_tracker/st_format.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_format.c 2020-06-12 01:21:18.000000000 +0000 @@ -48,7 +48,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "st_cb_texture.h" #include "st_context.h" #include "st_format.h" @@ -63,555 +63,57 @@ mesa_format mesaFormat) { struct pipe_screen *screen = st->pipe->screen; - bool has_bgra_srgb = screen->is_format_supported(screen, - PIPE_FORMAT_B8G8R8A8_SRGB, - PIPE_TEXTURE_2D, 0, 0, - PIPE_BIND_SAMPLER_VIEW); - - switch (mesaFormat) { - case MESA_FORMAT_A8B8G8R8_UNORM: - return PIPE_FORMAT_ABGR8888_UNORM; - case MESA_FORMAT_R8G8B8A8_UNORM: - return PIPE_FORMAT_RGBA8888_UNORM; - case MESA_FORMAT_B8G8R8A8_UNORM: - return PIPE_FORMAT_BGRA8888_UNORM; - case MESA_FORMAT_A8R8G8B8_UNORM: - return PIPE_FORMAT_ARGB8888_UNORM; - case MESA_FORMAT_X8B8G8R8_UNORM: - return PIPE_FORMAT_XBGR8888_UNORM; - case MESA_FORMAT_R8G8B8X8_UNORM: - return PIPE_FORMAT_RGBX8888_UNORM; - case MESA_FORMAT_B8G8R8X8_UNORM: - return PIPE_FORMAT_BGRX8888_UNORM; - case MESA_FORMAT_X8R8G8B8_UNORM: - return PIPE_FORMAT_XRGB8888_UNORM; - case MESA_FORMAT_B5G5R5A1_UNORM: - return PIPE_FORMAT_B5G5R5A1_UNORM; - case MESA_FORMAT_A1B5G5R5_UNORM: - return PIPE_FORMAT_A1B5G5R5_UNORM; - case MESA_FORMAT_B4G4R4A4_UNORM: - return PIPE_FORMAT_B4G4R4A4_UNORM; - case MESA_FORMAT_A4B4G4R4_UNORM: - return PIPE_FORMAT_A4B4G4R4_UNORM; - case MESA_FORMAT_B5G6R5_UNORM: - return PIPE_FORMAT_B5G6R5_UNORM; - case MESA_FORMAT_B2G3R3_UNORM: - return PIPE_FORMAT_B2G3R3_UNORM; - case MESA_FORMAT_B10G10R10A2_UNORM: - return PIPE_FORMAT_B10G10R10A2_UNORM; - case MESA_FORMAT_R10G10B10A2_UNORM: - return PIPE_FORMAT_R10G10B10A2_UNORM; - case MESA_FORMAT_R10G10B10X2_UNORM: - return PIPE_FORMAT_R10G10B10X2_UNORM; - case MESA_FORMAT_L4A4_UNORM: - return PIPE_FORMAT_L4A4_UNORM; - case MESA_FORMAT_L8A8_UNORM: - return PIPE_FORMAT_LA88_UNORM; - case MESA_FORMAT_A8L8_UNORM: - return PIPE_FORMAT_AL88_UNORM; - case MESA_FORMAT_L16A16_UNORM: - return PIPE_FORMAT_LA1616_UNORM; - case MESA_FORMAT_A16L16_UNORM: - return PIPE_FORMAT_AL1616_UNORM; - case MESA_FORMAT_A_UNORM8: - return PIPE_FORMAT_A8_UNORM; - case MESA_FORMAT_A_UNORM16: - return PIPE_FORMAT_A16_UNORM; - case MESA_FORMAT_L_UNORM8: - return PIPE_FORMAT_L8_UNORM; - case MESA_FORMAT_L_UNORM16: - return PIPE_FORMAT_L16_UNORM; - case MESA_FORMAT_I_UNORM8: - return PIPE_FORMAT_I8_UNORM; - case MESA_FORMAT_I_UNORM16: - return PIPE_FORMAT_I16_UNORM; - case MESA_FORMAT_Z_UNORM16: - return PIPE_FORMAT_Z16_UNORM; - case MESA_FORMAT_Z_UNORM32: - return PIPE_FORMAT_Z32_UNORM; - case MESA_FORMAT_S8_UINT_Z24_UNORM: - return PIPE_FORMAT_S8_UINT_Z24_UNORM; - case MESA_FORMAT_Z24_UNORM_S8_UINT: - return PIPE_FORMAT_Z24_UNORM_S8_UINT; - case MESA_FORMAT_X8_UINT_Z24_UNORM: - return PIPE_FORMAT_X8Z24_UNORM; - case MESA_FORMAT_Z24_UNORM_X8_UINT: - return PIPE_FORMAT_Z24X8_UNORM; - case MESA_FORMAT_S_UINT8: - return PIPE_FORMAT_S8_UINT; - case MESA_FORMAT_Z_FLOAT32: - return PIPE_FORMAT_Z32_FLOAT; - case MESA_FORMAT_Z32_FLOAT_S8X24_UINT: - return PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; - case MESA_FORMAT_YCBCR: - return PIPE_FORMAT_UYVY; - case MESA_FORMAT_YCBCR_REV: - return PIPE_FORMAT_YUYV; - case MESA_FORMAT_RGB_DXT1: - return PIPE_FORMAT_DXT1_RGB; - case MESA_FORMAT_RGBA_DXT1: - return PIPE_FORMAT_DXT1_RGBA; - case MESA_FORMAT_RGBA_DXT3: - return PIPE_FORMAT_DXT3_RGBA; - case MESA_FORMAT_RGBA_DXT5: - return PIPE_FORMAT_DXT5_RGBA; - case MESA_FORMAT_SRGB_DXT1: - return PIPE_FORMAT_DXT1_SRGB; - case MESA_FORMAT_SRGBA_DXT1: - return PIPE_FORMAT_DXT1_SRGBA; - case MESA_FORMAT_SRGBA_DXT3: - return PIPE_FORMAT_DXT3_SRGBA; - case MESA_FORMAT_SRGBA_DXT5: - return PIPE_FORMAT_DXT5_SRGBA; - case MESA_FORMAT_L8A8_SRGB: - return PIPE_FORMAT_LA88_SRGB; - case MESA_FORMAT_A8L8_SRGB: - return PIPE_FORMAT_AL88_SRGB; - case MESA_FORMAT_L_SRGB8: - return PIPE_FORMAT_L8_SRGB; - case MESA_FORMAT_R_SRGB8: - return PIPE_FORMAT_R8_SRGB; - case MESA_FORMAT_BGR_SRGB8: - return PIPE_FORMAT_R8G8B8_SRGB; - case MESA_FORMAT_A8B8G8R8_SRGB: - return PIPE_FORMAT_ABGR8888_SRGB; - case MESA_FORMAT_R8G8B8A8_SRGB: - return PIPE_FORMAT_RGBA8888_SRGB; - case MESA_FORMAT_B8G8R8A8_SRGB: - return PIPE_FORMAT_BGRA8888_SRGB; - case MESA_FORMAT_A8R8G8B8_SRGB: - return PIPE_FORMAT_ARGB8888_SRGB; - case MESA_FORMAT_RGBA_FLOAT32: - return PIPE_FORMAT_R32G32B32A32_FLOAT; - case MESA_FORMAT_RGBA_FLOAT16: - return PIPE_FORMAT_R16G16B16A16_FLOAT; - case MESA_FORMAT_RGB_FLOAT32: - return PIPE_FORMAT_R32G32B32_FLOAT; - case MESA_FORMAT_RGB_FLOAT16: - return PIPE_FORMAT_R16G16B16_FLOAT; - case MESA_FORMAT_LA_FLOAT32: - return PIPE_FORMAT_L32A32_FLOAT; - case MESA_FORMAT_LA_FLOAT16: - return PIPE_FORMAT_L16A16_FLOAT; - case MESA_FORMAT_L_FLOAT32: - return PIPE_FORMAT_L32_FLOAT; - case MESA_FORMAT_L_FLOAT16: - return PIPE_FORMAT_L16_FLOAT; - case MESA_FORMAT_A_FLOAT32: - return PIPE_FORMAT_A32_FLOAT; - case MESA_FORMAT_A_FLOAT16: - return PIPE_FORMAT_A16_FLOAT; - case MESA_FORMAT_I_FLOAT32: - return PIPE_FORMAT_I32_FLOAT; - case MESA_FORMAT_I_FLOAT16: - return PIPE_FORMAT_I16_FLOAT; - case MESA_FORMAT_R_FLOAT32: - return PIPE_FORMAT_R32_FLOAT; - case MESA_FORMAT_R_FLOAT16: - return PIPE_FORMAT_R16_FLOAT; - case MESA_FORMAT_RG_FLOAT32: - return PIPE_FORMAT_R32G32_FLOAT; - case MESA_FORMAT_RG_FLOAT16: - return PIPE_FORMAT_R16G16_FLOAT; - - case MESA_FORMAT_R_UNORM8: - return PIPE_FORMAT_R8_UNORM; - case MESA_FORMAT_R_UNORM16: - return PIPE_FORMAT_R16_UNORM; - case MESA_FORMAT_R8G8_UNORM: - return PIPE_FORMAT_RG88_UNORM; - case MESA_FORMAT_G8R8_UNORM: - return PIPE_FORMAT_GR88_UNORM; - case MESA_FORMAT_R16G16_UNORM: - return PIPE_FORMAT_RG1616_UNORM; - case MESA_FORMAT_G16R16_UNORM: - return PIPE_FORMAT_GR1616_UNORM; - case MESA_FORMAT_RGBA_UNORM16: - return PIPE_FORMAT_R16G16B16A16_UNORM; - - /* signed int formats */ - case MESA_FORMAT_A_UINT8: - return PIPE_FORMAT_A8_UINT; - case MESA_FORMAT_A_UINT16: - return PIPE_FORMAT_A16_UINT; - case MESA_FORMAT_A_UINT32: - return PIPE_FORMAT_A32_UINT; - - case MESA_FORMAT_A_SINT8: - return PIPE_FORMAT_A8_SINT; - case MESA_FORMAT_A_SINT16: - return PIPE_FORMAT_A16_SINT; - case MESA_FORMAT_A_SINT32: - return PIPE_FORMAT_A32_SINT; - - case MESA_FORMAT_I_UINT8: - return PIPE_FORMAT_I8_UINT; - case MESA_FORMAT_I_UINT16: - return PIPE_FORMAT_I16_UINT; - case MESA_FORMAT_I_UINT32: - return PIPE_FORMAT_I32_UINT; - - case MESA_FORMAT_I_SINT8: - return PIPE_FORMAT_I8_SINT; - case MESA_FORMAT_I_SINT16: - return PIPE_FORMAT_I16_SINT; - case MESA_FORMAT_I_SINT32: - return PIPE_FORMAT_I32_SINT; - - case MESA_FORMAT_L_UINT8: - return PIPE_FORMAT_L8_UINT; - case MESA_FORMAT_L_UINT16: - return PIPE_FORMAT_L16_UINT; - case MESA_FORMAT_L_UINT32: - return PIPE_FORMAT_L32_UINT; - - case MESA_FORMAT_L_SINT8: - return PIPE_FORMAT_L8_SINT; - case MESA_FORMAT_L_SINT16: - return PIPE_FORMAT_L16_SINT; - case MESA_FORMAT_L_SINT32: - return PIPE_FORMAT_L32_SINT; - - case MESA_FORMAT_LA_UINT8: - return PIPE_FORMAT_L8A8_UINT; - case MESA_FORMAT_LA_UINT16: - return PIPE_FORMAT_L16A16_UINT; - case MESA_FORMAT_LA_UINT32: - return PIPE_FORMAT_L32A32_UINT; - - case MESA_FORMAT_LA_SINT8: - return PIPE_FORMAT_L8A8_SINT; - case MESA_FORMAT_LA_SINT16: - return PIPE_FORMAT_L16A16_SINT; - case MESA_FORMAT_LA_SINT32: - return PIPE_FORMAT_L32A32_SINT; - - case MESA_FORMAT_R_SINT8: - return PIPE_FORMAT_R8_SINT; - case MESA_FORMAT_RG_SINT8: - return PIPE_FORMAT_R8G8_SINT; - case MESA_FORMAT_RGB_SINT8: - return PIPE_FORMAT_R8G8B8_SINT; - case MESA_FORMAT_RGBA_SINT8: - return PIPE_FORMAT_R8G8B8A8_SINT; - case MESA_FORMAT_R_SINT16: - return PIPE_FORMAT_R16_SINT; - case MESA_FORMAT_RG_SINT16: - return PIPE_FORMAT_R16G16_SINT; - case MESA_FORMAT_RGB_SINT16: - return PIPE_FORMAT_R16G16B16_SINT; - case MESA_FORMAT_RGBA_SINT16: - return PIPE_FORMAT_R16G16B16A16_SINT; - case MESA_FORMAT_R_SINT32: - return PIPE_FORMAT_R32_SINT; - case MESA_FORMAT_RG_SINT32: - return PIPE_FORMAT_R32G32_SINT; - case MESA_FORMAT_RGB_SINT32: - return PIPE_FORMAT_R32G32B32_SINT; - case MESA_FORMAT_RGBA_SINT32: - return PIPE_FORMAT_R32G32B32A32_SINT; - - /* unsigned int formats */ - case MESA_FORMAT_R_UINT8: - return PIPE_FORMAT_R8_UINT; - case MESA_FORMAT_RG_UINT8: - return PIPE_FORMAT_R8G8_UINT; - case MESA_FORMAT_RGB_UINT8: - return PIPE_FORMAT_R8G8B8_UINT; - case MESA_FORMAT_RGBA_UINT8: - return PIPE_FORMAT_R8G8B8A8_UINT; - case MESA_FORMAT_R_UINT16: - return PIPE_FORMAT_R16_UINT; - case MESA_FORMAT_RG_UINT16: - return PIPE_FORMAT_R16G16_UINT; - case MESA_FORMAT_RGB_UINT16: - return PIPE_FORMAT_R16G16B16_UINT; - case MESA_FORMAT_RGBA_UINT16: - return PIPE_FORMAT_R16G16B16A16_UINT; - case MESA_FORMAT_R_UINT32: - return PIPE_FORMAT_R32_UINT; - case MESA_FORMAT_RG_UINT32: - return PIPE_FORMAT_R32G32_UINT; - case MESA_FORMAT_RGB_UINT32: - return PIPE_FORMAT_R32G32B32_UINT; - case MESA_FORMAT_RGBA_UINT32: - return PIPE_FORMAT_R32G32B32A32_UINT; - - case MESA_FORMAT_R_RGTC1_UNORM: - return PIPE_FORMAT_RGTC1_UNORM; - case MESA_FORMAT_R_RGTC1_SNORM: - return PIPE_FORMAT_RGTC1_SNORM; - case MESA_FORMAT_RG_RGTC2_UNORM: - return PIPE_FORMAT_RGTC2_UNORM; - case MESA_FORMAT_RG_RGTC2_SNORM: - return PIPE_FORMAT_RGTC2_SNORM; - - case MESA_FORMAT_L_LATC1_UNORM: - return PIPE_FORMAT_LATC1_UNORM; - case MESA_FORMAT_L_LATC1_SNORM: - return PIPE_FORMAT_LATC1_SNORM; - case MESA_FORMAT_LA_LATC2_UNORM: - return PIPE_FORMAT_LATC2_UNORM; - case MESA_FORMAT_LA_LATC2_SNORM: - return PIPE_FORMAT_LATC2_SNORM; /* The destination RGBA format mustn't be changed, because it's also - * a destination format of the unpack/decompression function. */ - case MESA_FORMAT_ETC1_RGB8: - return st->has_etc1 ? PIPE_FORMAT_ETC1_RGB8 : PIPE_FORMAT_R8G8B8A8_UNORM; - - case MESA_FORMAT_BPTC_RGBA_UNORM: - return PIPE_FORMAT_BPTC_RGBA_UNORM; - case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM: - return PIPE_FORMAT_BPTC_SRGBA; - case MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT: - return PIPE_FORMAT_BPTC_RGB_FLOAT; - case MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT: - return PIPE_FORMAT_BPTC_RGB_UFLOAT; - - /* signed normalized formats */ - case MESA_FORMAT_R_SNORM8: - return PIPE_FORMAT_R8_SNORM; - case MESA_FORMAT_R8G8_SNORM: - return PIPE_FORMAT_RG88_SNORM; - case MESA_FORMAT_G8R8_SNORM: - return PIPE_FORMAT_GR88_SNORM; - case MESA_FORMAT_R8G8B8A8_SNORM: - return PIPE_FORMAT_RGBA8888_SNORM; - case MESA_FORMAT_A8B8G8R8_SNORM: - return PIPE_FORMAT_ABGR8888_SNORM; - - case MESA_FORMAT_A_SNORM8: - return PIPE_FORMAT_A8_SNORM; - case MESA_FORMAT_L_SNORM8: - return PIPE_FORMAT_L8_SNORM; - case MESA_FORMAT_L8A8_SNORM: - return PIPE_FORMAT_LA88_SNORM; - case MESA_FORMAT_A8L8_SNORM: - return PIPE_FORMAT_AL88_SNORM; - case MESA_FORMAT_I_SNORM8: - return PIPE_FORMAT_I8_SNORM; - - case MESA_FORMAT_R_SNORM16: - return PIPE_FORMAT_R16_SNORM; - case MESA_FORMAT_R16G16_SNORM: - return PIPE_FORMAT_RG1616_SNORM; - case MESA_FORMAT_G16R16_SNORM: - return PIPE_FORMAT_GR1616_SNORM; - case MESA_FORMAT_RGBA_SNORM16: - return PIPE_FORMAT_R16G16B16A16_SNORM; - - case MESA_FORMAT_A_SNORM16: - return PIPE_FORMAT_A16_SNORM; - case MESA_FORMAT_L_SNORM16: - return PIPE_FORMAT_L16_SNORM; - case MESA_FORMAT_LA_SNORM16: - return PIPE_FORMAT_L16A16_SNORM; - case MESA_FORMAT_I_SNORM16: - return PIPE_FORMAT_I16_SNORM; - - case MESA_FORMAT_R9G9B9E5_FLOAT: - return PIPE_FORMAT_R9G9B9E5_FLOAT; - case MESA_FORMAT_R11G11B10_FLOAT: - return PIPE_FORMAT_R11G11B10_FLOAT; - case MESA_FORMAT_B10G10R10A2_UINT: - return PIPE_FORMAT_B10G10R10A2_UINT; - case MESA_FORMAT_R10G10B10A2_UINT: - return PIPE_FORMAT_R10G10B10A2_UINT; - - case MESA_FORMAT_B4G4R4X4_UNORM: - return PIPE_FORMAT_B4G4R4X4_UNORM; - case MESA_FORMAT_B5G5R5X1_UNORM: - return PIPE_FORMAT_B5G5R5X1_UNORM; - case MESA_FORMAT_X1B5G5R5_UNORM: - return PIPE_FORMAT_X1B5G5R5_UNORM; - case MESA_FORMAT_R8G8B8X8_SNORM: - return PIPE_FORMAT_RGBX8888_SNORM; - case MESA_FORMAT_X8B8G8R8_SNORM: - return PIPE_FORMAT_XBGR8888_SNORM; - case MESA_FORMAT_R8G8B8X8_SRGB: - return PIPE_FORMAT_RGBX8888_SRGB; - case MESA_FORMAT_X8B8G8R8_SRGB: - return PIPE_FORMAT_XBGR8888_SRGB; - case MESA_FORMAT_RGBX_UINT8: - return PIPE_FORMAT_R8G8B8X8_UINT; - case MESA_FORMAT_RGBX_SINT8: - return PIPE_FORMAT_R8G8B8X8_SINT; - case MESA_FORMAT_B10G10R10X2_UNORM: - return PIPE_FORMAT_B10G10R10X2_UNORM; - case MESA_FORMAT_RGBX_UNORM16: - return PIPE_FORMAT_R16G16B16X16_UNORM; - case MESA_FORMAT_RGBX_SNORM16: - return PIPE_FORMAT_R16G16B16X16_SNORM; - case MESA_FORMAT_RGBX_FLOAT16: - return PIPE_FORMAT_R16G16B16X16_FLOAT; - case MESA_FORMAT_RGBX_UINT16: - return PIPE_FORMAT_R16G16B16X16_UINT; - case MESA_FORMAT_RGBX_SINT16: - return PIPE_FORMAT_R16G16B16X16_SINT; - case MESA_FORMAT_RGBX_FLOAT32: - return PIPE_FORMAT_R32G32B32X32_FLOAT; - case MESA_FORMAT_RGBX_UINT32: - return PIPE_FORMAT_R32G32B32X32_UINT; - case MESA_FORMAT_RGBX_SINT32: - return PIPE_FORMAT_R32G32B32X32_SINT; - - case MESA_FORMAT_B8G8R8X8_SRGB: - return PIPE_FORMAT_BGRX8888_SRGB; - case MESA_FORMAT_X8R8G8B8_SRGB: - return PIPE_FORMAT_XRGB8888_SRGB; + * a destination format of the unpack/decompression function. + */ + if (mesaFormat == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1) + return PIPE_FORMAT_R8G8B8A8_UNORM; /* ETC2 formats are emulated as uncompressed ones. * The destination formats mustn't be changed, because they are also - * destination formats of the unpack/decompression function. */ - case MESA_FORMAT_ETC2_RGB8: - return st->has_etc2 ? PIPE_FORMAT_ETC2_RGB8 : PIPE_FORMAT_R8G8B8A8_UNORM; - case MESA_FORMAT_ETC2_SRGB8: - return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8 : - has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; - case MESA_FORMAT_ETC2_RGBA8_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_RGBA8 : PIPE_FORMAT_R8G8B8A8_UNORM; - case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGBA8 : - has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; - case MESA_FORMAT_ETC2_R11_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_R11_UNORM : PIPE_FORMAT_R16_UNORM; - case MESA_FORMAT_ETC2_RG11_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_RG11_UNORM : PIPE_FORMAT_R16G16_UNORM; - case MESA_FORMAT_ETC2_SIGNED_R11_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_R11_SNORM : PIPE_FORMAT_R16_SNORM; - case MESA_FORMAT_ETC2_SIGNED_RG11_EAC: - return st->has_etc2 ? PIPE_FORMAT_ETC2_RG11_SNORM : PIPE_FORMAT_R16G16_SNORM; - case MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1: - return st->has_etc2 ? PIPE_FORMAT_ETC2_RGB8A1 : PIPE_FORMAT_R8G8B8A8_UNORM; - case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1: - return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8A1 : - has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; + * destination formats of the unpack/decompression function. + */ + if (_mesa_is_format_etc2(mesaFormat) && !st->has_etc2) { + bool has_bgra_srgb = screen->is_format_supported(screen, + PIPE_FORMAT_B8G8R8A8_SRGB, + PIPE_TEXTURE_2D, 0, 0, + PIPE_BIND_SAMPLER_VIEW); - case MESA_FORMAT_RGBA_ASTC_4x4: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_4x4; - case MESA_FORMAT_RGBA_ASTC_5x4: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_5x4; - case MESA_FORMAT_RGBA_ASTC_5x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_5x5; - case MESA_FORMAT_RGBA_ASTC_6x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_6x5; - case MESA_FORMAT_RGBA_ASTC_6x6: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_6x6; - case MESA_FORMAT_RGBA_ASTC_8x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_8x5; - case MESA_FORMAT_RGBA_ASTC_8x6: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_8x6; - case MESA_FORMAT_RGBA_ASTC_8x8: - if (!st->has_astc_2d_ldr) + switch (mesaFormat) { + case MESA_FORMAT_ETC2_RGB8: return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_8x8; - case MESA_FORMAT_RGBA_ASTC_10x5: - if (!st->has_astc_2d_ldr) + case MESA_FORMAT_ETC2_SRGB8: + return has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; + case MESA_FORMAT_ETC2_RGBA8_EAC: return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_10x5; - case MESA_FORMAT_RGBA_ASTC_10x6: - if (!st->has_astc_2d_ldr) + case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC: + return has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; + case MESA_FORMAT_ETC2_R11_EAC: + return PIPE_FORMAT_R16_UNORM; + case MESA_FORMAT_ETC2_RG11_EAC: + return PIPE_FORMAT_R16G16_UNORM; + case MESA_FORMAT_ETC2_SIGNED_R11_EAC: + return PIPE_FORMAT_R16_SNORM; + case MESA_FORMAT_ETC2_SIGNED_RG11_EAC: + return PIPE_FORMAT_R16G16_SNORM; + case MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1: return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_10x6; - case MESA_FORMAT_RGBA_ASTC_10x8: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_10x8; - case MESA_FORMAT_RGBA_ASTC_10x10: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_10x10; - case MESA_FORMAT_RGBA_ASTC_12x10: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_12x10; - case MESA_FORMAT_RGBA_ASTC_12x12: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_UNORM; - return PIPE_FORMAT_ASTC_12x12; + case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1: + return has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB; + default: + unreachable("Unknown ETC2 format"); + } + } - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_4x4_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_5x4_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_5x5_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_6x5_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6: - if (!st->has_astc_2d_ldr) + if (st_astc_format_fallback(st, mesaFormat)) { + if (_mesa_is_format_srgb(mesaFormat)) return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_6x6_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_8x5_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_8x6_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_8x8_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_10x5_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_10x6_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_10x8_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_10x10_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_12x10_SRGB; - case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12: - if (!st->has_astc_2d_ldr) - return PIPE_FORMAT_R8G8B8A8_SRGB; - return PIPE_FORMAT_ASTC_12x12_SRGB; - - case MESA_FORMAT_ATC_RGB: - return PIPE_FORMAT_ATC_RGB; - case MESA_FORMAT_ATC_RGBA_EXPLICIT: - return PIPE_FORMAT_ATC_RGBA_EXPLICIT; - case MESA_FORMAT_ATC_RGBA_INTERPOLATED: - return PIPE_FORMAT_ATC_RGBA_INTERPOLATED; - - default: - return PIPE_FORMAT_NONE; + else + return PIPE_FORMAT_R8G8B8A8_UNORM; } + + return mesaFormat; } @@ -621,490 +123,10 @@ mesa_format st_pipe_format_to_mesa_format(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_ABGR8888_UNORM: - return MESA_FORMAT_A8B8G8R8_UNORM; - case PIPE_FORMAT_RGBA8888_UNORM: - return MESA_FORMAT_R8G8B8A8_UNORM; - case PIPE_FORMAT_BGRA8888_UNORM: - return MESA_FORMAT_B8G8R8A8_UNORM; - case PIPE_FORMAT_ARGB8888_UNORM: - return MESA_FORMAT_A8R8G8B8_UNORM; - case PIPE_FORMAT_XBGR8888_UNORM: - return MESA_FORMAT_X8B8G8R8_UNORM; - case PIPE_FORMAT_RGBX8888_UNORM: - return MESA_FORMAT_R8G8B8X8_UNORM; - case PIPE_FORMAT_BGRX8888_UNORM: - return MESA_FORMAT_B8G8R8X8_UNORM; - case PIPE_FORMAT_XRGB8888_UNORM: - return MESA_FORMAT_X8R8G8B8_UNORM; - case PIPE_FORMAT_B5G5R5A1_UNORM: - return MESA_FORMAT_B5G5R5A1_UNORM; - case PIPE_FORMAT_A1B5G5R5_UNORM: - return MESA_FORMAT_A1B5G5R5_UNORM; - case PIPE_FORMAT_B4G4R4A4_UNORM: - return MESA_FORMAT_B4G4R4A4_UNORM; - case PIPE_FORMAT_A4B4G4R4_UNORM: - return MESA_FORMAT_A4B4G4R4_UNORM; - case PIPE_FORMAT_B5G6R5_UNORM: - return MESA_FORMAT_B5G6R5_UNORM; - case PIPE_FORMAT_B2G3R3_UNORM: - return MESA_FORMAT_B2G3R3_UNORM; - case PIPE_FORMAT_B10G10R10A2_UNORM: - return MESA_FORMAT_B10G10R10A2_UNORM; - case PIPE_FORMAT_R10G10B10A2_UNORM: - return MESA_FORMAT_R10G10B10A2_UNORM; - case PIPE_FORMAT_R10G10B10X2_UNORM: - return MESA_FORMAT_R10G10B10X2_UNORM; - case PIPE_FORMAT_L4A4_UNORM: - return MESA_FORMAT_L4A4_UNORM; - case PIPE_FORMAT_LA88_UNORM: - return MESA_FORMAT_L8A8_UNORM; - case PIPE_FORMAT_AL88_UNORM: - return MESA_FORMAT_A8L8_UNORM; - case PIPE_FORMAT_LA1616_UNORM: - return MESA_FORMAT_L16A16_UNORM; - case PIPE_FORMAT_AL1616_UNORM: - return MESA_FORMAT_A16L16_UNORM; - case PIPE_FORMAT_A8_UNORM: - return MESA_FORMAT_A_UNORM8; - case PIPE_FORMAT_A16_UNORM: - return MESA_FORMAT_A_UNORM16; - case PIPE_FORMAT_L8_UNORM: - return MESA_FORMAT_L_UNORM8; - case PIPE_FORMAT_L16_UNORM: - return MESA_FORMAT_L_UNORM16; - case PIPE_FORMAT_I8_UNORM: - return MESA_FORMAT_I_UNORM8; - case PIPE_FORMAT_I16_UNORM: - return MESA_FORMAT_I_UNORM16; - case PIPE_FORMAT_S8_UINT: - return MESA_FORMAT_S_UINT8; - - case PIPE_FORMAT_R16G16B16A16_UNORM: - return MESA_FORMAT_RGBA_UNORM16; - - case PIPE_FORMAT_Z16_UNORM: - return MESA_FORMAT_Z_UNORM16; - case PIPE_FORMAT_Z32_UNORM: - return MESA_FORMAT_Z_UNORM32; - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return MESA_FORMAT_S8_UINT_Z24_UNORM; - case PIPE_FORMAT_X8Z24_UNORM: - return MESA_FORMAT_X8_UINT_Z24_UNORM; - case PIPE_FORMAT_Z24X8_UNORM: - return MESA_FORMAT_Z24_UNORM_X8_UINT; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return MESA_FORMAT_Z24_UNORM_S8_UINT; - case PIPE_FORMAT_Z32_FLOAT: - return MESA_FORMAT_Z_FLOAT32; - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return MESA_FORMAT_Z32_FLOAT_S8X24_UINT; - - case PIPE_FORMAT_UYVY: - return MESA_FORMAT_YCBCR; - case PIPE_FORMAT_YUYV: - return MESA_FORMAT_YCBCR_REV; - - case PIPE_FORMAT_DXT1_RGB: - return MESA_FORMAT_RGB_DXT1; - case PIPE_FORMAT_DXT1_RGBA: - return MESA_FORMAT_RGBA_DXT1; - case PIPE_FORMAT_DXT3_RGBA: - return MESA_FORMAT_RGBA_DXT3; - case PIPE_FORMAT_DXT5_RGBA: - return MESA_FORMAT_RGBA_DXT5; - case PIPE_FORMAT_DXT1_SRGB: - return MESA_FORMAT_SRGB_DXT1; - case PIPE_FORMAT_DXT1_SRGBA: - return MESA_FORMAT_SRGBA_DXT1; - case PIPE_FORMAT_DXT3_SRGBA: - return MESA_FORMAT_SRGBA_DXT3; - case PIPE_FORMAT_DXT5_SRGBA: - return MESA_FORMAT_SRGBA_DXT5; - case PIPE_FORMAT_LA88_SRGB: - return MESA_FORMAT_L8A8_SRGB; - case PIPE_FORMAT_AL88_SRGB: - return MESA_FORMAT_A8L8_SRGB; - case PIPE_FORMAT_L8_SRGB: - return MESA_FORMAT_L_SRGB8; - case PIPE_FORMAT_R8_SRGB: - return MESA_FORMAT_R_SRGB8; - case PIPE_FORMAT_R8G8B8_SRGB: - return MESA_FORMAT_BGR_SRGB8; - case PIPE_FORMAT_ABGR8888_SRGB: - return MESA_FORMAT_A8B8G8R8_SRGB; - case PIPE_FORMAT_RGBA8888_SRGB: - return MESA_FORMAT_R8G8B8A8_SRGB; - case PIPE_FORMAT_BGRA8888_SRGB: - return MESA_FORMAT_B8G8R8A8_SRGB; - case PIPE_FORMAT_ARGB8888_SRGB: - return MESA_FORMAT_A8R8G8B8_SRGB; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return MESA_FORMAT_RGBA_FLOAT32; - case PIPE_FORMAT_R16G16B16A16_FLOAT: - return MESA_FORMAT_RGBA_FLOAT16; - case PIPE_FORMAT_R32G32B32_FLOAT: - return MESA_FORMAT_RGB_FLOAT32; - case PIPE_FORMAT_R16G16B16_FLOAT: - return MESA_FORMAT_RGB_FLOAT16; - case PIPE_FORMAT_L32A32_FLOAT: - return MESA_FORMAT_LA_FLOAT32; - case PIPE_FORMAT_L16A16_FLOAT: - return MESA_FORMAT_LA_FLOAT16; - case PIPE_FORMAT_L32_FLOAT: - return MESA_FORMAT_L_FLOAT32; - case PIPE_FORMAT_L16_FLOAT: - return MESA_FORMAT_L_FLOAT16; - case PIPE_FORMAT_A32_FLOAT: - return MESA_FORMAT_A_FLOAT32; - case PIPE_FORMAT_A16_FLOAT: - return MESA_FORMAT_A_FLOAT16; - case PIPE_FORMAT_I32_FLOAT: - return MESA_FORMAT_I_FLOAT32; - case PIPE_FORMAT_I16_FLOAT: - return MESA_FORMAT_I_FLOAT16; - case PIPE_FORMAT_R32_FLOAT: - return MESA_FORMAT_R_FLOAT32; - case PIPE_FORMAT_R16_FLOAT: - return MESA_FORMAT_R_FLOAT16; - case PIPE_FORMAT_R32G32_FLOAT: - return MESA_FORMAT_RG_FLOAT32; - case PIPE_FORMAT_R16G16_FLOAT: - return MESA_FORMAT_RG_FLOAT16; - - case PIPE_FORMAT_R8_UNORM: - return MESA_FORMAT_R_UNORM8; - case PIPE_FORMAT_R16_UNORM: - return MESA_FORMAT_R_UNORM16; - case PIPE_FORMAT_RG88_UNORM: - return MESA_FORMAT_R8G8_UNORM; - case PIPE_FORMAT_GR88_UNORM: - return MESA_FORMAT_G8R8_UNORM; - case PIPE_FORMAT_RG1616_UNORM: - return MESA_FORMAT_R16G16_UNORM; - case PIPE_FORMAT_GR1616_UNORM: - return MESA_FORMAT_G16R16_UNORM; - - case PIPE_FORMAT_A8_UINT: - return MESA_FORMAT_A_UINT8; - case PIPE_FORMAT_A16_UINT: - return MESA_FORMAT_A_UINT16; - case PIPE_FORMAT_A32_UINT: - return MESA_FORMAT_A_UINT32; - case PIPE_FORMAT_A8_SINT: - return MESA_FORMAT_A_SINT8; - case PIPE_FORMAT_A16_SINT: - return MESA_FORMAT_A_SINT16; - case PIPE_FORMAT_A32_SINT: - return MESA_FORMAT_A_SINT32; - - case PIPE_FORMAT_I8_UINT: - return MESA_FORMAT_I_UINT8; - case PIPE_FORMAT_I16_UINT: - return MESA_FORMAT_I_UINT16; - case PIPE_FORMAT_I32_UINT: - return MESA_FORMAT_I_UINT32; - case PIPE_FORMAT_I8_SINT: - return MESA_FORMAT_I_SINT8; - case PIPE_FORMAT_I16_SINT: - return MESA_FORMAT_I_SINT16; - case PIPE_FORMAT_I32_SINT: - return MESA_FORMAT_I_SINT32; - - case PIPE_FORMAT_L8_UINT: - return MESA_FORMAT_L_UINT8; - case PIPE_FORMAT_L16_UINT: - return MESA_FORMAT_L_UINT16; - case PIPE_FORMAT_L32_UINT: - return MESA_FORMAT_L_UINT32; - case PIPE_FORMAT_L8_SINT: - return MESA_FORMAT_L_SINT8; - case PIPE_FORMAT_L16_SINT: - return MESA_FORMAT_L_SINT16; - case PIPE_FORMAT_L32_SINT: - return MESA_FORMAT_L_SINT32; - - case PIPE_FORMAT_L8A8_UINT: - return MESA_FORMAT_LA_UINT8; - case PIPE_FORMAT_L16A16_UINT: - return MESA_FORMAT_LA_UINT16; - case PIPE_FORMAT_L32A32_UINT: - return MESA_FORMAT_LA_UINT32; - case PIPE_FORMAT_L8A8_SINT: - return MESA_FORMAT_LA_SINT8; - case PIPE_FORMAT_L16A16_SINT: - return MESA_FORMAT_LA_SINT16; - case PIPE_FORMAT_L32A32_SINT: - return MESA_FORMAT_LA_SINT32; - - case PIPE_FORMAT_R8_SINT: - return MESA_FORMAT_R_SINT8; - case PIPE_FORMAT_R8G8_SINT: - return MESA_FORMAT_RG_SINT8; - case PIPE_FORMAT_R8G8B8_SINT: - return MESA_FORMAT_RGB_SINT8; - case PIPE_FORMAT_R8G8B8A8_SINT: - return MESA_FORMAT_RGBA_SINT8; - - case PIPE_FORMAT_R16_SINT: - return MESA_FORMAT_R_SINT16; - case PIPE_FORMAT_R16G16_SINT: - return MESA_FORMAT_RG_SINT16; - case PIPE_FORMAT_R16G16B16_SINT: - return MESA_FORMAT_RGB_SINT16; - case PIPE_FORMAT_R16G16B16A16_SINT: - return MESA_FORMAT_RGBA_SINT16; - - case PIPE_FORMAT_R32_SINT: - return MESA_FORMAT_R_SINT32; - case PIPE_FORMAT_R32G32_SINT: - return MESA_FORMAT_RG_SINT32; - case PIPE_FORMAT_R32G32B32_SINT: - return MESA_FORMAT_RGB_SINT32; - case PIPE_FORMAT_R32G32B32A32_SINT: - return MESA_FORMAT_RGBA_SINT32; - - /* unsigned int formats */ - case PIPE_FORMAT_R8_UINT: - return MESA_FORMAT_R_UINT8; - case PIPE_FORMAT_R8G8_UINT: - return MESA_FORMAT_RG_UINT8; - case PIPE_FORMAT_R8G8B8_UINT: - return MESA_FORMAT_RGB_UINT8; - case PIPE_FORMAT_R8G8B8A8_UINT: - return MESA_FORMAT_RGBA_UINT8; - - case PIPE_FORMAT_R16_UINT: - return MESA_FORMAT_R_UINT16; - case PIPE_FORMAT_R16G16_UINT: - return MESA_FORMAT_RG_UINT16; - case PIPE_FORMAT_R16G16B16_UINT: - return MESA_FORMAT_RGB_UINT16; - case PIPE_FORMAT_R16G16B16A16_UINT: - return MESA_FORMAT_RGBA_UINT16; - - case PIPE_FORMAT_R32_UINT: - return MESA_FORMAT_R_UINT32; - case PIPE_FORMAT_R32G32_UINT: - return MESA_FORMAT_RG_UINT32; - case PIPE_FORMAT_R32G32B32_UINT: - return MESA_FORMAT_RGB_UINT32; - case PIPE_FORMAT_R32G32B32A32_UINT: - return MESA_FORMAT_RGBA_UINT32; - - case PIPE_FORMAT_RGTC1_UNORM: - return MESA_FORMAT_R_RGTC1_UNORM; - case PIPE_FORMAT_RGTC1_SNORM: - return MESA_FORMAT_R_RGTC1_SNORM; - case PIPE_FORMAT_RGTC2_UNORM: - return MESA_FORMAT_RG_RGTC2_UNORM; - case PIPE_FORMAT_RGTC2_SNORM: - return MESA_FORMAT_RG_RGTC2_SNORM; - - case PIPE_FORMAT_LATC1_UNORM: - return MESA_FORMAT_L_LATC1_UNORM; - case PIPE_FORMAT_LATC1_SNORM: - return MESA_FORMAT_L_LATC1_SNORM; - case PIPE_FORMAT_LATC2_UNORM: - return MESA_FORMAT_LA_LATC2_UNORM; - case PIPE_FORMAT_LATC2_SNORM: - return MESA_FORMAT_LA_LATC2_SNORM; - - case PIPE_FORMAT_ETC1_RGB8: - return MESA_FORMAT_ETC1_RGB8; - - case PIPE_FORMAT_BPTC_RGBA_UNORM: - return MESA_FORMAT_BPTC_RGBA_UNORM; - case PIPE_FORMAT_BPTC_SRGBA: - return MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM; - case PIPE_FORMAT_BPTC_RGB_FLOAT: - return MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT; - case PIPE_FORMAT_BPTC_RGB_UFLOAT: - return MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT; - - /* signed normalized formats */ - case PIPE_FORMAT_R8_SNORM: - return MESA_FORMAT_R_SNORM8; - case PIPE_FORMAT_RG88_SNORM: - return MESA_FORMAT_R8G8_SNORM; - case PIPE_FORMAT_GR88_SNORM: - return MESA_FORMAT_G8R8_SNORM; - case PIPE_FORMAT_RGBA8888_SNORM: - return MESA_FORMAT_R8G8B8A8_SNORM; - case PIPE_FORMAT_ABGR8888_SNORM: - return MESA_FORMAT_A8B8G8R8_SNORM; - - case PIPE_FORMAT_A8_SNORM: - return MESA_FORMAT_A_SNORM8; - case PIPE_FORMAT_L8_SNORM: - return MESA_FORMAT_L_SNORM8; - case PIPE_FORMAT_LA88_SNORM: - return MESA_FORMAT_L8A8_SNORM; - case PIPE_FORMAT_AL88_SNORM: - return MESA_FORMAT_A8L8_SNORM; - case PIPE_FORMAT_I8_SNORM: - return MESA_FORMAT_I_SNORM8; - - case PIPE_FORMAT_R16_SNORM: - return MESA_FORMAT_R_SNORM16; - case PIPE_FORMAT_RG1616_SNORM: - return MESA_FORMAT_R16G16_SNORM; - case PIPE_FORMAT_GR1616_SNORM: - return MESA_FORMAT_G16R16_SNORM; - case PIPE_FORMAT_R16G16B16A16_SNORM: - return MESA_FORMAT_RGBA_SNORM16; - - case PIPE_FORMAT_A16_SNORM: - return MESA_FORMAT_A_SNORM16; - case PIPE_FORMAT_L16_SNORM: - return MESA_FORMAT_L_SNORM16; - case PIPE_FORMAT_L16A16_SNORM: - return MESA_FORMAT_LA_SNORM16; - case PIPE_FORMAT_I16_SNORM: - return MESA_FORMAT_I_SNORM16; - - case PIPE_FORMAT_R9G9B9E5_FLOAT: - return MESA_FORMAT_R9G9B9E5_FLOAT; - case PIPE_FORMAT_R11G11B10_FLOAT: - return MESA_FORMAT_R11G11B10_FLOAT; - - case PIPE_FORMAT_B10G10R10A2_UINT: - return MESA_FORMAT_B10G10R10A2_UINT; - case PIPE_FORMAT_R10G10B10A2_UINT: - return MESA_FORMAT_R10G10B10A2_UINT; - - case PIPE_FORMAT_B4G4R4X4_UNORM: - return MESA_FORMAT_B4G4R4X4_UNORM; - case PIPE_FORMAT_B5G5R5X1_UNORM: - return MESA_FORMAT_B5G5R5X1_UNORM; - case PIPE_FORMAT_X1B5G5R5_UNORM: - return MESA_FORMAT_X1B5G5R5_UNORM; - case PIPE_FORMAT_RGBX8888_SNORM: - return MESA_FORMAT_R8G8B8X8_SNORM; - case PIPE_FORMAT_XBGR8888_SNORM: - return MESA_FORMAT_X8B8G8R8_SNORM; - case PIPE_FORMAT_RGBX8888_SRGB: - return MESA_FORMAT_R8G8B8X8_SRGB; - case PIPE_FORMAT_XBGR8888_SRGB: - return MESA_FORMAT_X8B8G8R8_SRGB; - case PIPE_FORMAT_R8G8B8X8_UINT: - return MESA_FORMAT_RGBX_UINT8; - case PIPE_FORMAT_R8G8B8X8_SINT: - return MESA_FORMAT_RGBX_SINT8; - case PIPE_FORMAT_B10G10R10X2_UNORM: - return MESA_FORMAT_B10G10R10X2_UNORM; - case PIPE_FORMAT_R16G16B16X16_UNORM: - return MESA_FORMAT_RGBX_UNORM16; - case PIPE_FORMAT_R16G16B16X16_SNORM: - return MESA_FORMAT_RGBX_SNORM16; - case PIPE_FORMAT_R16G16B16X16_FLOAT: - return MESA_FORMAT_RGBX_FLOAT16; - case PIPE_FORMAT_R16G16B16X16_UINT: - return MESA_FORMAT_RGBX_UINT16; - case PIPE_FORMAT_R16G16B16X16_SINT: - return MESA_FORMAT_RGBX_SINT16; - case PIPE_FORMAT_R32G32B32X32_FLOAT: - return MESA_FORMAT_RGBX_FLOAT32; - case PIPE_FORMAT_R32G32B32X32_UINT: - return MESA_FORMAT_RGBX_UINT32; - case PIPE_FORMAT_R32G32B32X32_SINT: - return MESA_FORMAT_RGBX_SINT32; - - case PIPE_FORMAT_BGRX8888_SRGB: - return MESA_FORMAT_B8G8R8X8_SRGB; - case PIPE_FORMAT_XRGB8888_SRGB: - return MESA_FORMAT_X8R8G8B8_SRGB; - - case PIPE_FORMAT_ETC2_RGB8: - return MESA_FORMAT_ETC2_RGB8; - case PIPE_FORMAT_ETC2_SRGB8: - return MESA_FORMAT_ETC2_SRGB8; - case PIPE_FORMAT_ETC2_RGB8A1: - return MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1; - case PIPE_FORMAT_ETC2_SRGB8A1: - return MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1; - case PIPE_FORMAT_ETC2_RGBA8: - return MESA_FORMAT_ETC2_RGBA8_EAC; - case PIPE_FORMAT_ETC2_SRGBA8: - return MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC; - case PIPE_FORMAT_ETC2_R11_UNORM: - return MESA_FORMAT_ETC2_R11_EAC; - case PIPE_FORMAT_ETC2_R11_SNORM: - return MESA_FORMAT_ETC2_SIGNED_R11_EAC; - case PIPE_FORMAT_ETC2_RG11_UNORM: - return MESA_FORMAT_ETC2_RG11_EAC; - case PIPE_FORMAT_ETC2_RG11_SNORM: - return MESA_FORMAT_ETC2_SIGNED_RG11_EAC; - - case PIPE_FORMAT_ASTC_4x4: - return MESA_FORMAT_RGBA_ASTC_4x4; - case PIPE_FORMAT_ASTC_5x4: - return MESA_FORMAT_RGBA_ASTC_5x4; - case PIPE_FORMAT_ASTC_5x5: - return MESA_FORMAT_RGBA_ASTC_5x5; - case PIPE_FORMAT_ASTC_6x5: - return MESA_FORMAT_RGBA_ASTC_6x5; - case PIPE_FORMAT_ASTC_6x6: - return MESA_FORMAT_RGBA_ASTC_6x6; - case PIPE_FORMAT_ASTC_8x5: - return MESA_FORMAT_RGBA_ASTC_8x5; - case PIPE_FORMAT_ASTC_8x6: - return MESA_FORMAT_RGBA_ASTC_8x6; - case PIPE_FORMAT_ASTC_8x8: - return MESA_FORMAT_RGBA_ASTC_8x8; - case PIPE_FORMAT_ASTC_10x5: - return MESA_FORMAT_RGBA_ASTC_10x5; - case PIPE_FORMAT_ASTC_10x6: - return MESA_FORMAT_RGBA_ASTC_10x6; - case PIPE_FORMAT_ASTC_10x8: - return MESA_FORMAT_RGBA_ASTC_10x8; - case PIPE_FORMAT_ASTC_10x10: - return MESA_FORMAT_RGBA_ASTC_10x10; - case PIPE_FORMAT_ASTC_12x10: - return MESA_FORMAT_RGBA_ASTC_12x10; - case PIPE_FORMAT_ASTC_12x12: - return MESA_FORMAT_RGBA_ASTC_12x12; - - case PIPE_FORMAT_ASTC_4x4_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4; - case PIPE_FORMAT_ASTC_5x4_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4; - case PIPE_FORMAT_ASTC_5x5_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5; - case PIPE_FORMAT_ASTC_6x5_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5; - case PIPE_FORMAT_ASTC_6x6_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6; - case PIPE_FORMAT_ASTC_8x5_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5; - case PIPE_FORMAT_ASTC_8x6_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6; - case PIPE_FORMAT_ASTC_8x8_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8; - case PIPE_FORMAT_ASTC_10x5_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5; - case PIPE_FORMAT_ASTC_10x6_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6; - case PIPE_FORMAT_ASTC_10x8_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8; - case PIPE_FORMAT_ASTC_10x10_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10; - case PIPE_FORMAT_ASTC_12x10_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10; - case PIPE_FORMAT_ASTC_12x12_SRGB: - return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12; - - case PIPE_FORMAT_ATC_RGB: - return MESA_FORMAT_ATC_RGB; - case PIPE_FORMAT_ATC_RGBA_EXPLICIT: - return MESA_FORMAT_ATC_RGBA_EXPLICIT; - case PIPE_FORMAT_ATC_RGBA_INTERPOLATED: - return MESA_FORMAT_ATC_RGBA_INTERPOLATED; - - default: + mesa_format mf = format; + if (!_mesa_get_format_name(mf)) return MESA_FORMAT_NONE; - } + return mf; } /** @@ -1310,16 +332,14 @@ { PIPE_FORMAT_DXT5_RGBA, 0 } }, -#if 0 { { GL_COMPRESSED_RGB_FXT1_3DFX, 0 }, - { PIPE_FORMAT_RGB_FXT1, 0 } + { PIPE_FORMAT_FXT1_RGB, 0 } }, { { GL_COMPRESSED_RGBA_FXT1_3DFX, 0 }, - { PIPE_FORMAT_RGBA_FXT1, 0 } + { PIPE_FORMAT_FXT1_RGBA, 0 } }, -#endif /* Depth formats */ { @@ -2056,78 +1076,6 @@ return PIPE_FORMAT_NONE; } - -struct exact_format_mapping -{ - GLenum format; - GLenum type; - enum pipe_format pformat; -}; - -static const struct exact_format_mapping rgba8888_tbl[] = -{ - { GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_ABGR8888_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_ABGR8888_UNORM }, - { GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_RGBA8888_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_RGBA8888_UNORM }, - { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_ARGB8888_UNORM }, - { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_BGRA8888_UNORM }, - { GL_RGBA, GL_UNSIGNED_BYTE, PIPE_FORMAT_R8G8B8A8_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_BYTE, PIPE_FORMAT_A8B8G8R8_UNORM }, - { GL_BGRA, GL_UNSIGNED_BYTE, PIPE_FORMAT_B8G8R8A8_UNORM }, - { 0, 0, 0 } -}; - -static const struct exact_format_mapping rgbx8888_tbl[] = -{ - { GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_XBGR8888_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_XBGR8888_UNORM }, - { GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_RGBX8888_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_RGBX8888_UNORM }, - { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, PIPE_FORMAT_XRGB8888_UNORM }, - { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, PIPE_FORMAT_BGRX8888_UNORM }, - { GL_RGBA, GL_UNSIGNED_BYTE, PIPE_FORMAT_R8G8B8X8_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_BYTE, PIPE_FORMAT_X8B8G8R8_UNORM }, - { GL_BGRA, GL_UNSIGNED_BYTE, PIPE_FORMAT_B8G8R8X8_UNORM }, - { 0, 0, 0 } -}; - - -/** - * For unsized/base internal formats, we may choose a convenient effective - * internal format for {format, type}. If one exists, return that, otherwise - * return PIPE_FORMAT_NONE. - */ -static enum pipe_format -find_exact_format(GLint internalFormat, GLenum format, GLenum type) -{ - uint i; - const struct exact_format_mapping* tbl; - - if (format == GL_NONE || type == GL_NONE) - return PIPE_FORMAT_NONE; - - switch (internalFormat) { - case 4: - case GL_RGBA: - tbl = rgba8888_tbl; - break; - case 3: - case GL_RGB: - tbl = rgbx8888_tbl; - break; - default: - return PIPE_FORMAT_NONE; - } - - for (i = 0; tbl[i].format; i++) - if (tbl[i].format == format && tbl[i].type == type) - return tbl[i].pformat; - - return PIPE_FORMAT_NONE; -} - - /** * Given an OpenGL internalFormat value for a texture or surface, return * the best matching PIPE_FORMAT_x, or PIPE_FORMAT_NONE if there's no match. @@ -2150,7 +1098,7 @@ GLenum format, GLenum type, enum pipe_texture_target target, unsigned sample_count, unsigned storage_sample_count, - unsigned bindings, boolean allow_dxt) + unsigned bindings, bool swap_bytes, bool allow_dxt) { struct pipe_screen *screen = st->pipe->screen; unsigned i; @@ -2163,12 +1111,23 @@ return PIPE_FORMAT_NONE; } - /* search for exact matches */ - pf = find_exact_format(internalFormat, format, type); - if (pf != PIPE_FORMAT_NONE && - screen->is_format_supported(screen, pf, target, sample_count, - storage_sample_count, bindings)) { - goto success; + /* If we have an unsized internalFormat, and the driver supports a format + * that exactly matches format/type such that we can just memcpy, pick that + * (unless the format wouldn't still be unorm, which is the expectation for + * unsized formats). + */ + if (_mesa_is_enum_format_unsized(internalFormat) && format != 0 && + _mesa_is_type_unsigned(type)) { + pf = st_choose_matching_format(st, bindings, format, type, + swap_bytes); + + if (pf != PIPE_FORMAT_NONE && + screen->is_format_supported(screen, pf, target, sample_count, + storage_sample_count, bindings) && + _mesa_get_format_base_format(st_pipe_format_to_mesa_format(pf)) == + internalFormat) { + goto success; + } } /* For an unsized GL_RGB but a 2_10_10_10 type, try to pick one of the @@ -2184,6 +1143,13 @@ internalFormat = GL_RGB10_A2; } + if (type == GL_UNSIGNED_SHORT_5_5_5_1) { + if (internalFormat == GL_RGB) + internalFormat = GL_RGB5; + else if (internalFormat == GL_RGBA) + internalFormat = GL_RGB5_A1; + } + /* search table for internalFormat */ for (i = 0; i < ARRAY_SIZE(format_map); i++) { const struct format_mapping *mapping = &format_map[i]; @@ -2232,7 +1198,8 @@ bindings = PIPE_BIND_RENDER_TARGET; return st_choose_format(st, internalFormat, GL_NONE, GL_NONE, PIPE_TEXTURE_2D, sample_count, - storage_sample_count, bindings, FALSE); + storage_sample_count, bindings, + false, false); } @@ -2248,33 +1215,20 @@ GLenum format, GLenum type, GLboolean swapBytes) { struct pipe_screen *screen = st->pipe->screen; - mesa_format mesa_format; - for (mesa_format = 1; mesa_format < MESA_FORMAT_COUNT; mesa_format++) { - if (_mesa_is_format_srgb(mesa_format)) { - continue; - } - if (_mesa_get_format_bits(mesa_format, GL_TEXTURE_INTENSITY_SIZE) > 0) { - /* If `format` is GL_RED/GL_RED_INTEGER, then we might match some - * intensity formats, which we don't want. - */ - continue; - } + if (swapBytes && !_mesa_swap_bytes_in_type_enum(&type)) + return PIPE_FORMAT_NONE; - if (_mesa_format_matches_format_and_type(mesa_format, format, type, - swapBytes, NULL)) { - enum pipe_format format = - st_mesa_format_to_pipe_format(st, mesa_format); - - if (format && - screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, - 0, 0, bind)) { - return format; - } - /* It's unlikely to find 2 matching Mesa formats. */ - break; - } + mesa_format mesa_format = _mesa_format_from_format_and_type(format, type); + if (_mesa_format_is_mesa_array_format(mesa_format)) + mesa_format = _mesa_format_from_array_format(mesa_format); + if (mesa_format != MESA_FORMAT_NONE) { + enum pipe_format format = st_mesa_format_to_pipe_format(st, mesa_format); + if (format != PIPE_FORMAT_NONE && + screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, 0, bind)) + return format; } + return PIPE_FORMAT_NONE; } @@ -2366,13 +1320,14 @@ } pFormat = st_choose_format(st, internalFormat, format, type, - pTarget, 0, 0, bindings, GL_TRUE); + pTarget, 0, 0, bindings, + ctx->Unpack.SwapBytes, true); if (pFormat == PIPE_FORMAT_NONE && !is_renderbuffer) { /* try choosing format again, this time without render target bindings */ pFormat = st_choose_format(st, internalFormat, format, type, pTarget, 0, 0, PIPE_BIND_SAMPLER_VIEW, - GL_TRUE); + ctx->Unpack.SwapBytes, true); } if (pFormat == PIPE_FORMAT_NONE) { @@ -2429,7 +1384,8 @@ /* Set sample counts in descending order. */ for (i = 16; i > 1; i--) { format = st_choose_format(st, internalFormat, GL_NONE, GL_NONE, - PIPE_TEXTURE_2D, i, i, bind, FALSE); + PIPE_TEXTURE_2D, i, i, bind, + false, false); if (format != PIPE_FORMAT_NONE) { samples[num_sample_counts++] = i; @@ -2489,7 +1445,8 @@ GL_NONE, GL_NONE, PIPE_TEXTURE_2D, 0, 0, - bindings, FALSE); + bindings, + false, false); if (pformat) params[0] = internalFormat; break; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_format.h mesa-20.0.8/src/mesa/state_tracker/st_format.h --- mesa-19.2.8/src/mesa/state_tracker/st_format.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_format.h 2020-06-12 01:21:18.000000000 +0000 @@ -55,7 +55,7 @@ GLenum format, GLenum type, enum pipe_texture_target target, unsigned sample_count, unsigned storage_sample_count, - unsigned bindings, boolean allow_dxt); + unsigned bindings, bool swap_bytes, bool allow_dxt); extern enum pipe_format st_choose_renderbuffer_format(struct st_context *st, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_gen_mipmap.c mesa-20.0.8/src/mesa/state_tracker/st_gen_mipmap.c --- mesa-19.2.8/src/mesa/state_tracker/st_gen_mipmap.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_gen_mipmap.c 2020-06-12 01:21:18.000000000 +0000 @@ -34,7 +34,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_gen_mipmap.h" #include "st_debug.h" @@ -56,13 +56,16 @@ struct st_context *st = st_context(ctx); struct st_texture_object *stObj = st_texture_object(texObj); struct pipe_resource *pt = st_get_texobj_resource(texObj); - const uint baseLevel = texObj->BaseLevel; + uint baseLevel = texObj->BaseLevel; enum pipe_format format; uint lastLevel, first_layer, last_layer; if (!pt) return; + if (texObj->Immutable) + baseLevel += texObj->MinLevel; + /* not sure if this ultimately actually should work, but we're not supporting multisampled textures yet. */ assert(pt->nr_samples < 2); @@ -70,6 +73,9 @@ /* find expected last mipmap level to generate*/ lastLevel = _mesa_compute_num_levels(ctx, texObj, target) - 1; + if (texObj->Immutable) + lastLevel += texObj->MinLevel; + if (lastLevel == 0) return; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_ir.cpp mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_ir.cpp --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_ir.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_ir.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -59,6 +59,12 @@ assert(prog->data->LinkStatus); + /* Skip the GLSL steps when using SPIR-V. */ + if (prog->data->spirv) { + assert(use_nir); + return st_link_nir(ctx, prog); + } + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i] == NULL) continue; @@ -161,7 +167,7 @@ validate_ir_tree(ir); } - build_program_resource_list(ctx, prog); + build_program_resource_list(ctx, prog, use_nir); if (use_nir) return st_link_nir(ctx, prog); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_nir.cpp mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_nir.cpp --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_nir.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_nir.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -31,20 +31,23 @@ #include "program/prog_statevars.h" #include "program/prog_parameter.h" #include "program/ir_to_mesa.h" +#include "main/context.h" #include "main/mtypes.h" #include "main/errors.h" +#include "main/glspirv.h" #include "main/shaderapi.h" #include "main/uniforms.h" #include "main/shaderobj.h" #include "st_context.h" -#include "st_glsl_types.h" #include "st_program.h" +#include "st_shader_cache.h" #include "compiler/nir/nir.h" #include "compiler/glsl_types.h" #include "compiler/glsl/glsl_to_nir.h" #include "compiler/glsl/gl_nir.h" +#include "compiler/glsl/gl_nir_linker.h" #include "compiler/glsl/ir.h" #include "compiler/glsl/ir_optimization.h" #include "compiler/glsl/string_to_uint_map.h" @@ -80,22 +83,23 @@ * (This isn't the case with, for ex, FS inputs, which only need to agree * on varying-slot w/ the VS outputs) */ -static void -st_nir_assign_vs_in_locations(nir_shader *nir) +void +st_nir_assign_vs_in_locations(struct nir_shader *nir) { - nir->num_inputs = 0; + if (nir->info.stage != MESA_SHADER_VERTEX) + return; + + bool removed_inputs = false; + + nir->num_inputs = util_bitcount64(nir->info.inputs_read); nir_foreach_variable_safe(var, &nir->inputs) { /* NIR already assigns dual-slot inputs to two locations so all we have * to do is compact everything down. */ - if (var->data.location == VERT_ATTRIB_EDGEFLAG) { - /* bit of a hack, mirroring st_translate_vertex_program */ - var->data.driver_location = util_bitcount64(nir->info.inputs_read); - } else if (nir->info.inputs_read & BITFIELD64_BIT(var->data.location)) { + if (nir->info.inputs_read & BITFIELD64_BIT(var->data.location)) { var->data.driver_location = util_bitcount64(nir->info.inputs_read & BITFIELD64_MASK(var->data.location)); - nir->num_inputs++; } else { /* Move unused input variables to the globals list (with no * initialization), to avoid confusing drivers looking through the @@ -105,15 +109,32 @@ exec_node_remove(&var->node); var->data.mode = nir_var_shader_temp; exec_list_push_tail(&nir->globals, &var->node); + removed_inputs = true; } } + + /* Re-lower global vars, to deal with any dead VS inputs. */ + if (removed_inputs) + NIR_PASS_V(nir, nir_lower_global_vars_to_local); } static int -st_nir_lookup_parameter_index(const struct gl_program_parameter_list *params, - const char *name) +st_nir_lookup_parameter_index(struct gl_program *prog, nir_variable *var) { - int loc = _mesa_lookup_parameter_index(params, name); + struct gl_program_parameter_list *params = prog->Parameters; + + /* Lookup the first parameter that the uniform storage that match the + * variable location. + */ + for (unsigned i = 0; i < params->NumParameters; i++) { + int index = params->Parameters[i].MainUniformStorageIndex; + if (index == var->data.location) + return i; + } + + /* TODO: Handle this fallback for SPIR-V. We need this for GLSL e.g. in + * dEQP-GLES2.functional.uniform_api.random.3 + */ /* is there a better way to do this? If we have something like: * @@ -142,19 +163,18 @@ * needs to work backwards to get base var loc from the param-list * which already has them separated out. */ - if (loc < 0) { - int namelen = strlen(name); + if (!prog->sh.data->spirv) { + int namelen = strlen(var->name); for (unsigned i = 0; i < params->NumParameters; i++) { struct gl_program_parameter *p = ¶ms->Parameters[i]; - if ((strncmp(p->Name, name, namelen) == 0) && + if ((strncmp(p->Name, var->name, namelen) == 0) && ((p->Name[namelen] == '.') || (p->Name[namelen] == '['))) { - loc = i; - break; + return i; } } } - return loc; + return -1; } static void @@ -184,7 +204,7 @@ loc = imageidx; imageidx += type_size(uniform->type); } - } else if (strncmp(uniform->name, "gl_", 3) == 0) { + } else if (uniform->state_slots) { const gl_state_index16 *const stateTokens = uniform->state_slots[0].tokens; /* This state reference has already been setup by ir_to_mesa, but we'll * get the same index back here. @@ -205,7 +225,7 @@ loc = _mesa_add_state_reference(prog->Parameters, stateTokens); } } else { - loc = st_nir_lookup_parameter_index(prog->Parameters, uniform->name); + loc = st_nir_lookup_parameter_index(prog, uniform); /* We need to check that loc is not -1 here before accessing the * array. It can be negative for example when we have a struct that @@ -221,24 +241,30 @@ } void -st_nir_opts(nir_shader *nir, bool scalar) +st_nir_opts(nir_shader *nir) { bool progress; - unsigned lower_flrp = - (nir->options->lower_flrp16 ? 16 : 0) | - (nir->options->lower_flrp32 ? 32 : 0) | - (nir->options->lower_flrp64 ? 64 : 0); do { progress = false; NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + /* Linking deals with unused inputs/outputs, but here we can remove + * things local to the shader in the hopes that we can cleanup other + * things. This pass will also remove variables with only stores, so we + * might be able to make progress after it. + */ + NIR_PASS(progress, nir, nir_remove_dead_variables, + (nir_variable_mode)(nir_var_function_temp | + nir_var_shader_temp | + nir_var_mem_shared)); NIR_PASS(progress, nir, nir_opt_copy_prop_vars); NIR_PASS(progress, nir, nir_opt_dead_write_vars); - if (scalar) { - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + if (nir->options->lower_to_scalar) { + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS_V(nir, nir_lower_phis_to_scalar); } @@ -260,27 +286,32 @@ NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - - NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, - lower_flrp, - false /* always_precise */, - nir->options->lower_ffma); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; + if (!nir->info.flrp_lowered) { + unsigned lower_flrp = + (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + + if (lower_flrp) { + bool lower_flrp_progress = false; + + NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, + lower_flrp, + false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, + nir_opt_constant_folding); + progress = true; + } } /* Nothing should rematerialize any flrps, so we only need to do this * lowering once. */ - lower_flrp = 0; + nir->info.flrp_lowered = true; } - NIR_PASS(progress, nir, gl_nir_opt_access); - NIR_PASS(progress, nir, nir_opt_undef); NIR_PASS(progress, nir, nir_opt_conditional_discard); if (nir->options->max_unroll_iterations) { @@ -289,28 +320,32 @@ } while (progress); } +static void +shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size * (length == 3 ? 4 : length); +} + /* First third of converting glsl_to_nir.. this leaves things in a pre- * nir_lower_io state, so that shader variants can more easily insert/ * replace variables, etc. */ -static nir_shader * -st_glsl_to_nir(struct st_context *st, struct gl_program *prog, - struct gl_shader_program *shader_program, - gl_shader_stage stage) +static void +st_nir_preprocess(struct st_context *st, struct gl_program *prog, + struct gl_shader_program *shader_program, + gl_shader_stage stage) { + struct pipe_screen *screen = st->pipe->screen; const nir_shader_compiler_options *options = st->ctx->Const.ShaderCompilerOptions[prog->info.stage].NirOptions; - enum pipe_shader_type type = pipe_shader_type_from_mesa(stage); - struct pipe_screen *screen = st->pipe->screen; - bool is_scalar = screen->get_shader_param(screen, type, PIPE_SHADER_CAP_SCALAR_ISA); assert(options); - bool lower_64bit = - options->lower_int64_options || options->lower_doubles_options; - - if (prog->nir) - return prog->nir; - - nir_shader *nir = glsl_to_nir(st->ctx, shader_program, stage, options); + nir_shader *nir = prog->nir; /* Set the next shader stage hint for VS and TES. */ if (!nir->info.separate_shader && @@ -333,9 +368,16 @@ st->ctx->SoftFP64 = glsl_float64_funcs_to_nir(st->ctx, options); } - nir_variable_mode mask = - (nir_variable_mode) (nir_var_shader_in | nir_var_shader_out); - nir_remove_dead_variables(nir, mask); + /* ES has strict SSO validation rules for shader IO matching so we can't + * remove dead IO until the resource list has been built. Here we skip + * removing them until later. This will potentially make the IO lowering + * calls below do a little extra work but should otherwise have no impact. + */ + if (!_mesa_is_gles(st->ctx) || !nir->info.separate_shader) { + nir_variable_mode mask = + (nir_variable_mode) (nir_var_shader_in | nir_var_shader_out); + nir_remove_dead_variables(nir, mask); + } if (options->lower_all_io_to_temps || nir->info.stage == MESA_SHADER_VERTEX || @@ -343,7 +385,8 @@ NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + } else if (nir->info.stage == MESA_SHADER_FRAGMENT || + !screen->get_param(screen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS)) { NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, false); @@ -353,34 +396,24 @@ NIR_PASS_V(nir, nir_split_var_copies); NIR_PASS_V(nir, nir_lower_var_copies); - if (is_scalar) { - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL); + if (options->lower_to_scalar) { + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); } /* before buffers and vars_to_ssa */ NIR_PASS_V(nir, gl_nir_lower_bindless_images); - st_nir_opts(nir, is_scalar); - - NIR_PASS_V(nir, gl_nir_lower_buffers, shader_program); - /* Do a round of constant folding to clean up address calculations */ - NIR_PASS_V(nir, nir_opt_constant_folding); - - if (lower_64bit) { - bool lowered_64bit_ops = false; - if (options->lower_doubles_options) { - NIR_PASS(lowered_64bit_ops, nir, nir_lower_doubles, - st->ctx->SoftFP64, options->lower_doubles_options); - } - if (options->lower_int64_options) { - NIR_PASS(lowered_64bit_ops, nir, nir_lower_int64, - options->lower_int64_options); - } - if (lowered_64bit_ops) - st_nir_opts(nir, is_scalar); + /* TODO: Change GLSL to not lower shared memory. */ + if (prog->nir->info.stage == MESA_SHADER_COMPUTE && + shader_program->data->spirv) { + NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_type_info); + NIR_PASS_V(prog->nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); } - return nir; + /* Do a round of constant folding to clean up address calculations */ + NIR_PASS_V(nir, nir_opt_constant_folding); } /* Second third of converting glsl_to_nir. This creates uniforms, gathers @@ -400,10 +433,8 @@ * get sent to the shader. */ nir_foreach_variable(var, &nir->uniforms) { - if (strncmp(var->name, "gl_", 3) == 0) { - const nir_state_slot *const slots = var->state_slots; - assert(var->state_slots != NULL); - + const nir_state_slot *const slots = var->state_slots; + if (slots != NULL) { const struct glsl_type *type = glsl_without_array(var->type); for (unsigned int i = 0; i < var->num_state_slots; i++) { unsigned comps; @@ -442,104 +473,52 @@ st_set_prog_affected_state_flags(prog); - NIR_PASS_V(nir, st_nir_lower_builtin); + /* None of the builtins being lowered here can be produced by SPIR-V. See + * _mesa_builtin_uniform_desc. + */ + if (!shader_program->data->spirv) + NIR_PASS_V(nir, st_nir_lower_builtin); + NIR_PASS_V(nir, gl_nir_lower_atomics, shader_program, true); NIR_PASS_V(nir, nir_opt_intrinsics); - nir_variable_mode mask = nir_var_function_temp; - nir_remove_dead_variables(nir, mask); - - if (st->ctx->_Shader->Flags & GLSL_DUMP) { - _mesa_log("\n"); - _mesa_log("NIR IR for linked %s program %d:\n", - _mesa_shader_stage_to_string(prog->info.stage), - shader_program->Name); - nir_print_shader(nir, _mesa_get_log_file()); - _mesa_log("\n\n"); - } -} + /* Lower 64-bit ops. */ + if (nir->options->lower_int64_options || + nir->options->lower_doubles_options) { + bool lowered_64bit_ops = false; + if (nir->options->lower_doubles_options) { + NIR_PASS(lowered_64bit_ops, nir, nir_lower_doubles, + st->ctx->SoftFP64, nir->options->lower_doubles_options); + } + if (nir->options->lower_int64_options) { + NIR_PASS(lowered_64bit_ops, nir, nir_lower_int64, + nir->options->lower_int64_options); + } -static void -set_st_program(struct gl_program *prog, - struct gl_shader_program *shader_program, - nir_shader *nir) -{ - struct st_vertex_program *stvp; - struct st_common_program *stp; - struct st_fragment_program *stfp; - struct st_compute_program *stcp; - - switch (prog->info.stage) { - case MESA_SHADER_VERTEX: - stvp = (struct st_vertex_program *)prog; - stvp->shader_program = shader_program; - stvp->tgsi.type = PIPE_SHADER_IR_NIR; - stvp->tgsi.ir.nir = nir; - break; - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - stp = (struct st_common_program *)prog; - stp->shader_program = shader_program; - stp->tgsi.type = PIPE_SHADER_IR_NIR; - stp->tgsi.ir.nir = nir; - break; - case MESA_SHADER_FRAGMENT: - stfp = (struct st_fragment_program *)prog; - stfp->shader_program = shader_program; - stfp->tgsi.type = PIPE_SHADER_IR_NIR; - stfp->tgsi.ir.nir = nir; - break; - case MESA_SHADER_COMPUTE: - stcp = (struct st_compute_program *)prog; - stcp->shader_program = shader_program; - stcp->tgsi.ir_type = PIPE_SHADER_IR_NIR; - stcp->tgsi.prog = nir; - break; - default: - unreachable("unknown shader stage"); + if (lowered_64bit_ops) + st_nir_opts(nir); } -} - -static void -st_nir_get_mesa_program(struct gl_context *ctx, - struct gl_shader_program *shader_program, - struct gl_linked_shader *shader) -{ - struct st_context *st = st_context(ctx); - struct pipe_screen *pscreen = ctx->st->pipe->screen; - struct gl_program *prog; - validate_ir_tree(shader->ir); - - prog = shader->Program; + nir_variable_mode mask = (nir_variable_mode) + (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp ); + nir_remove_dead_variables(nir, mask); - prog->Parameters = _mesa_new_parameter_list(); + if (!st->has_hw_atomics) + NIR_PASS_V(nir, nir_lower_atomics_to_ssbo); - _mesa_copy_linked_program_data(shader_program, shader); - _mesa_generate_parameters_list_for_uniforms(ctx, shader_program, shader, - prog->Parameters); + st_finalize_nir_before_variants(nir); - /* Remove reads from output registers. */ - if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS)) - lower_output_reads(shader->Stage, shader->ir); + if (st->allow_st_finalize_nir_twice) + st_finalize_nir(st, prog, shader_program, nir, true); - if (ctx->_Shader->Flags & GLSL_DUMP) { + if (st->ctx->_Shader->Flags & GLSL_DUMP) { _mesa_log("\n"); - _mesa_log("GLSL IR for linked %s program %d:\n", - _mesa_shader_stage_to_string(shader->Stage), + _mesa_log("NIR IR for linked %s program %d:\n", + _mesa_shader_stage_to_string(prog->info.stage), shader_program->Name); - _mesa_print_ir(_mesa_get_log_file(), shader->ir, NULL); + nir_print_shader(nir, _mesa_get_log_file()); _mesa_log("\n\n"); } - - prog->ExternalSamplersUsed = gl_external_samplers(prog); - _mesa_update_shader_textures_used(shader_program, prog); - - nir_shader *nir = st_glsl_to_nir(st, prog, shader_program, shader->Stage); - - set_st_program(prog, shader_program, nir); - prog->nir = nir; } static void @@ -565,53 +544,37 @@ } static void -st_nir_link_shaders(nir_shader **producer, nir_shader **consumer, bool scalar) +st_nir_link_shaders(nir_shader *producer, nir_shader *consumer) { - if (scalar) { - NIR_PASS_V(*producer, nir_lower_io_to_scalar_early, nir_var_shader_out); - NIR_PASS_V(*consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); - } - - nir_lower_io_arrays_to_elements(*producer, *consumer); - - st_nir_opts(*producer, scalar); - st_nir_opts(*consumer, scalar); - - if (nir_link_opt_varyings(*producer, *consumer)) - st_nir_opts(*consumer, scalar); - - NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out); - NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in); - - if (nir_remove_unused_varyings(*producer, *consumer)) { - NIR_PASS_V(*producer, nir_lower_global_vars_to_local); - NIR_PASS_V(*consumer, nir_lower_global_vars_to_local); - - /* The backend might not be able to handle indirects on - * temporaries so we need to lower indirects on any of the - * varyings we have demoted here. - * - * TODO: radeonsi shouldn't need to do this, however LLVM isn't - * currently smart enough to handle indirects without causing excess - * spilling causing the gpu to hang. - * - * See the following thread for more details of the problem: - * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html - */ - nir_variable_mode indirect_mask = nir_var_function_temp; + if (producer->options->lower_to_scalar) { + NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out); + NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); + } + + nir_lower_io_arrays_to_elements(producer, consumer); + + st_nir_opts(producer); + st_nir_opts(consumer); + + if (nir_link_opt_varyings(producer, consumer)) + st_nir_opts(consumer); - NIR_PASS_V(*producer, nir_lower_indirect_derefs, indirect_mask); - NIR_PASS_V(*consumer, nir_lower_indirect_derefs, indirect_mask); + NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out); + NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in); - st_nir_opts(*producer, scalar); - st_nir_opts(*consumer, scalar); + if (nir_remove_unused_varyings(producer, consumer)) { + NIR_PASS_V(producer, nir_lower_global_vars_to_local); + NIR_PASS_V(consumer, nir_lower_global_vars_to_local); + + st_nir_opts(producer); + st_nir_opts(consumer); - /* Lowering indirects can cause varying to become unused. + /* Optimizations can cause varyings to become unused. * nir_compact_varyings() depends on all dead varyings being removed so * we need to call nir_remove_dead_variables() again here. */ - NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out); - NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in); + NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out); + NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in); } } @@ -677,53 +640,124 @@ struct gl_shader_program *shader_program) { struct st_context *st = st_context(ctx); - struct pipe_screen *screen = st->pipe->screen; - bool is_scalar[MESA_SHADER_STAGES]; + struct gl_linked_shader *linked_shader[MESA_SHADER_STAGES]; + unsigned num_shaders = 0; - unsigned last_stage = 0; for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - struct gl_linked_shader *shader = shader_program->_LinkedShaders[i]; - if (shader == NULL) - continue; + if (shader_program->_LinkedShaders[i]) + linked_shader[num_shaders++] = shader_program->_LinkedShaders[i]; + } - /* Determine scalar property of each shader stage */ - enum pipe_shader_type type = pipe_shader_type_from_mesa(shader->Stage); - is_scalar[i] = screen->get_shader_param(screen, type, - PIPE_SHADER_CAP_SCALAR_ISA); + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_linked_shader *shader = linked_shader[i]; + const nir_shader_compiler_options *options = + st->ctx->Const.ShaderCompilerOptions[shader->Stage].NirOptions; + struct gl_program *prog = shader->Program; + struct st_program *stp = (struct st_program *)prog; - st_nir_get_mesa_program(ctx, shader_program, shader); - last_stage = i; + _mesa_copy_linked_program_data(shader_program, shader); + + assert(!prog->nir); + stp->shader_program = shader_program; + stp->state.type = PIPE_SHADER_IR_NIR; + + if (shader_program->data->spirv) { + prog->Parameters = _mesa_new_parameter_list(); + /* Parameters will be filled during NIR linking. */ + + prog->nir = _mesa_spirv_to_nir(ctx, shader_program, shader->Stage, options); + } else { + validate_ir_tree(shader->ir); + + prog->Parameters = _mesa_new_parameter_list(); + _mesa_generate_parameters_list_for_uniforms(ctx, shader_program, shader, + prog->Parameters); + + if (ctx->_Shader->Flags & GLSL_DUMP) { + _mesa_log("\n"); + _mesa_log("GLSL IR for linked %s program %d:\n", + _mesa_shader_stage_to_string(shader->Stage), + shader_program->Name); + _mesa_print_ir(_mesa_get_log_file(), shader->ir, NULL); + _mesa_log("\n\n"); + } + + prog->ExternalSamplersUsed = gl_external_samplers(prog); + _mesa_update_shader_textures_used(shader_program, prog); + + prog->nir = glsl_to_nir(st->ctx, shader_program, shader->Stage, options); + st_nir_preprocess(st, prog, shader_program, shader->Stage); + } - if (is_scalar[i]) { + if (options->lower_to_scalar) { NIR_PASS_V(shader->Program->nir, nir_lower_load_const_to_scalar); } } + st_lower_patch_vertices_in(shader_program); + + /* For SPIR-V, we have to perform the NIR linking before applying + * st_nir_preprocess. + */ + if (shader_program->data->spirv) { + static const gl_nir_linker_options opts = { + true /*fill_parameters */ + }; + if (!gl_nir_link_spirv(ctx, shader_program, &opts)) + return GL_FALSE; + + nir_build_program_resource_list(ctx, shader_program, true); + + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_linked_shader *shader = linked_shader[i]; + struct gl_program *prog = shader->Program; + + prog->ExternalSamplersUsed = gl_external_samplers(prog); + _mesa_update_shader_textures_used(shader_program, prog); + st_nir_preprocess(st, prog, shader_program, shader->Stage); + } + } + /* Linking the stages in the opposite order (from fragment to vertex) * ensures that inter-shader outputs written to in an earlier stage * are eliminated if they are (transitively) not used in a later * stage. */ - int next = last_stage; - for (int i = next - 1; i >= 0; i--) { - struct gl_linked_shader *shader = shader_program->_LinkedShaders[i]; - if (shader == NULL) - continue; + for (int i = num_shaders - 2; i >= 0; i--) { + st_nir_link_shaders(linked_shader[i]->Program->nir, + linked_shader[i + 1]->Program->nir); + } + /* Linking shaders also optimizes them. Separate shaders, compute shaders + * and shaders with a fixed-func VS or FS that don't need linking are + * optimized here. + */ + if (num_shaders == 1) + st_nir_opts(linked_shader[0]->Program->nir); - st_nir_link_shaders(&shader->Program->nir, - &shader_program->_LinkedShaders[next]->Program->nir, - is_scalar[i]); - next = i; - } + if (!shader_program->data->spirv) { + if (!gl_nir_link_glsl(ctx, shader_program)) + return GL_FALSE; - int prev = -1; - for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - struct gl_linked_shader *shader = shader_program->_LinkedShaders[i]; - if (shader == NULL) - continue; + nir_build_program_resource_list(ctx, shader_program, false); + } + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_linked_shader *shader = linked_shader[i]; nir_shader *nir = shader->Program->nir; + /* This needs to run after the initial pass of nir_lower_vars_to_ssa, so + * that the buffer indices are constants in nir where they where + * constants in GLSL. */ + NIR_PASS_V(nir, gl_nir_lower_buffers, shader_program); + + /* Remap the locations to slots so those requiring two slots will occupy + * two locations. For instance, if we have in the IR code a dvec3 attr0 in + * location 0 and vec4 attr1 in location 1, in NIR attr0 will use + * locations/slots 0 and 1, and attr1 will use location/slot 2 + */ + if (nir->info.stage == MESA_SHADER_VERTEX && !shader_program->data->spirv) + nir_remap_dual_slot_attributes(nir, &shader->Program->DualSlotInputs); + NIR_PASS_V(nir, st_nir_lower_wpos_ytransform, shader->Program, st->pipe->screen); @@ -732,7 +766,7 @@ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); shader->Program->info = nir->info; - if (i == MESA_SHADER_VERTEX) { + if (shader->Stage == MESA_SHADER_VERTEX) { /* NIR expands dual-slot inputs out to two locations. We need to * compact things back down GL-style single-slot inputs to avoid * confusing the state tracker. @@ -742,9 +776,8 @@ shader->Program->DualSlotInputs); } - if (prev != -1) { - struct gl_program *prev_shader = - shader_program->_LinkedShaders[prev]->Program; + if (i >= 1) { + struct gl_program *prev_shader = linked_shader[i - 1]->Program; /* We can't use nir_compact_varyings with transform feedback, since * the pipe_stream_output->output_register field is based on the @@ -752,33 +785,53 @@ */ if (!(prev_shader->sh.LinkedTransformFeedback && prev_shader->sh.LinkedTransformFeedback->NumVarying > 0)) - nir_compact_varyings(shader_program->_LinkedShaders[prev]->Program->nir, - nir, ctx->API != API_OPENGL_COMPAT); + nir_compact_varyings(prev_shader->nir, + nir, ctx->API != API_OPENGL_COMPAT); - if (ctx->Const.ShaderCompilerOptions[i].NirOptions->vectorize_io) + if (ctx->Const.ShaderCompilerOptions[shader->Stage].NirOptions->vectorize_io) st_nir_vectorize_io(prev_shader->nir, nir); } - prev = i; } - st_lower_patch_vertices_in(shader_program); + struct shader_info *prev_info = NULL; - for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { - struct gl_linked_shader *shader = shader_program->_LinkedShaders[i]; - if (shader == NULL) - continue; + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_linked_shader *shader = linked_shader[i]; + struct shader_info *info = &shader->Program->nir->info; - st_glsl_to_nir_post_opts(st, shader->Program, shader_program); + if (prev_info && + ctx->Const.ShaderCompilerOptions[shader->Stage].NirOptions->unify_interfaces) { + prev_info->outputs_written |= info->inputs_read & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + info->inputs_read |= prev_info->outputs_written & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); - assert(shader->Program); - if (!ctx->Driver.ProgramStringNotify(ctx, - _mesa_shader_stage_to_program(i), - shader->Program)) { - _mesa_reference_program(ctx, &shader->Program, NULL); - return false; + prev_info->patch_outputs_written |= info->patch_inputs_read; + info->patch_inputs_read |= prev_info->patch_outputs_written; } + prev_info = info; + } + + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_linked_shader *shader = linked_shader[i]; + struct gl_program *prog = shader->Program; + struct st_program *stp = st_program(prog); + st_glsl_to_nir_post_opts(st, prog, shader_program); + + /* Initialize st_vertex_program members. */ + if (shader->Stage == MESA_SHADER_VERTEX) + st_prepare_vertex_program(stp); - nir_sweep(shader->Program->nir); + /* Get pipe_stream_output_info. */ + if (shader->Stage == MESA_SHADER_VERTEX || + shader->Stage == MESA_SHADER_TESS_EVAL || + shader->Stage == MESA_SHADER_GEOMETRY) + st_translate_stream_output_info(prog); + + st_store_ir_in_disk_cache(st, prog, true); + + st_release_variants(st, stp); + st_finalize_program(st, prog); /* The GLSL IR won't be needed anymore. */ ralloc_free(shader->ir); @@ -792,11 +845,6 @@ st_nir_assign_varying_locations(struct st_context *st, nir_shader *nir) { if (nir->info.stage == MESA_SHADER_VERTEX) { - /* Needs special handling so drvloc matches the vbo state: */ - st_nir_assign_vs_in_locations(nir); - /* Re-lower global vars, to deal with any dead VS inputs. */ - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - nir_assign_io_var_locations(&nir->outputs, &nir->num_outputs, nir->info.stage); @@ -844,49 +892,58 @@ } } +static int +st_packed_uniforms_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_dword_slots(type, bindless); +} + +static int +st_unpacked_uniforms_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_vec4_slots(type, false, bindless); +} + +void +st_nir_lower_uniforms(struct st_context *st, nir_shader *nir) +{ + if (st->ctx->Const.PackedDriverUniformStorage) { + NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, + st_packed_uniforms_type_size, + (nir_lower_io_options)0); + NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 4); + } else { + NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, + st_unpacked_uniforms_type_size, + (nir_lower_io_options)0); + } +} + /* Last third of preparing nir from glsl, which happens after shader * variant lowering. */ void st_finalize_nir(struct st_context *st, struct gl_program *prog, - struct gl_shader_program *shader_program, nir_shader *nir) + struct gl_shader_program *shader_program, + nir_shader *nir, bool finalize_by_driver) { struct pipe_screen *screen = st->pipe->screen; - const nir_shader_compiler_options *options = - st->ctx->Const.ShaderCompilerOptions[prog->info.stage].NirOptions; NIR_PASS_V(nir, nir_split_var_copies); NIR_PASS_V(nir, nir_lower_var_copies); - if (options->lower_all_io_to_temps || - options->lower_all_io_to_elements || - nir->info.stage == MESA_SHADER_VERTEX || - nir->info.stage == MESA_SHADER_GEOMETRY) { - NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true); - } st_nir_assign_varying_locations(st, nir); - - NIR_PASS_V(nir, nir_lower_atomics_to_ssbo, - st->ctx->Const.Program[nir->info.stage].MaxAtomicBuffers); - st_nir_assign_uniform_locations(st->ctx, prog, &nir->uniforms); /* Set num_uniforms in number of attribute slots (vec4s) */ nir->num_uniforms = DIV_ROUND_UP(prog->Parameters->NumParameterValues, 4); - if (st->ctx->Const.PackedDriverUniformStorage) { - NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, st_glsl_type_dword_size, - (nir_lower_io_options)0); - NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 4); - } else { - NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, st_glsl_uniforms_type_size, - (nir_lower_io_options)0); - } - + st_nir_lower_uniforms(st, nir); st_nir_lower_samplers(screen, nir, shader_program, prog); + + if (finalize_by_driver && screen->finalize_nir) + screen->finalize_nir(screen, nir, false); } } /* extern "C" */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi.cpp mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi.cpp --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 2020-06-12 01:21:18.000000000 +0000 @@ -49,7 +49,6 @@ #include "tgsi/tgsi_info.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "st_glsl_types.h" #include "st_program.h" #include "st_mesa_to_tgsi.h" #include "st_format.h" @@ -246,6 +245,7 @@ bool has_tex_txf_lz; bool precise; bool need_uarl; + bool tg4_component_in_swizzle; variable_storage *find_variable_storage(ir_variable *var); @@ -284,6 +284,7 @@ virtual void visit(ir_call *); virtual void visit(ir_return *); virtual void visit(ir_discard *); + virtual void visit(ir_demote *); virtual void visit(ir_texture *); virtual void visit(ir_if *); virtual void visit(ir_emit_vertex *); @@ -2236,11 +2237,9 @@ case ir_unop_get_buffer_size: { ir_constant *const_offset = ir->operands[0]->as_constant(); - int buf_base = ctx->st->has_hw_atomics - ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers; st_src_reg buffer( PROGRAM_BUFFER, - buf_base + (const_offset ? const_offset->value.u[0] : 0), + const_offset ? const_offset->value.u[0] : 0, GLSL_TYPE_UINT); if (!const_offset) { buffer.reladdr = ralloc(mem_ctx, st_src_reg); @@ -2387,6 +2386,15 @@ case ir_binop_carry: case ir_binop_borrow: case ir_unop_ssbo_unsized_array_length: + case ir_unop_atan: + case ir_binop_atan2: + case ir_unop_clz: + case ir_binop_add_sat: + case ir_binop_sub_sat: + case ir_binop_abs_sub: + case ir_binop_avg: + case ir_binop_avg_round: + case ir_binop_mul_32x16: /* This operation is not supported, or should have already been handled. */ assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()"); @@ -2762,12 +2770,12 @@ if (handle_bound_deref(ir->as_dereference())) return; - /* We only need the logic provided by st_glsl_storage_type_size() + /* We only need the logic provided by count_vec4_slots() * for arrays of structs. Indirect sampler and image indexing is handled * elsewhere. */ int element_size = ir->type->without_array()->is_struct() ? - st_glsl_storage_type_size(ir->type, var->data.bindless) : + ir->type->count_vec4_slots(false, var->data.bindless) : type_size(ir->type); index = ir->array_index->constant_expression_value(ralloc_parent(ir)); @@ -2872,7 +2880,7 @@ if (i == (unsigned) ir->field_idx) break; const glsl_type *member_type = struct_type->fields.structure[i].type; - offset += st_glsl_storage_type_size(member_type, var->data.bindless); + offset += member_type->count_vec4_slots(false, var->data.bindless); } /* If the type is smaller than a vec4, replicate the last channel out. */ @@ -3446,7 +3454,9 @@ resource = buffer; } else { - st_src_reg buffer(PROGRAM_BUFFER, location->data.binding, + st_src_reg buffer(PROGRAM_BUFFER, + prog->info.num_ssbos + + location->data.binding, GLSL_TYPE_ATOMIC_UINT); if (offset.file != PROGRAM_UNDEFINED) { @@ -3534,11 +3544,9 @@ ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue(); ir_constant *const_block = block->as_constant(); - int buf_base = st_context(ctx)->has_hw_atomics - ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers; st_src_reg buffer( PROGRAM_BUFFER, - buf_base + (const_block ? const_block->value.u[0] : 0), + const_block ? const_block->value.u[0] : 0, GLSL_TYPE_UINT); if (!const_block) { @@ -4106,6 +4114,10 @@ visit_generic_intrinsic(ir, TGSI_OPCODE_READ_INVOC); return; + case ir_intrinsic_helper_invocation: + visit_generic_intrinsic(ir, TGSI_OPCODE_READ_HELPER); + return; + case ir_intrinsic_invalid: case ir_intrinsic_generic_load: case ir_intrinsic_generic_store: @@ -4560,7 +4572,20 @@ if (is_cube_array && ir->shadow_comparator) { inst = emit_asm(ir, opcode, result_dst, coord, cube_sc); } else { - inst = emit_asm(ir, opcode, result_dst, coord, component); + if (this->tg4_component_in_swizzle) { + inst = emit_asm(ir, opcode, result_dst, coord); + int idx = 0; + foreach_in_list(immediate_storage, entry, &this->immediates) { + if (component.index == idx) { + gl_constant_value value = entry->values[component.swizzle]; + inst->gather_component = value.i; + break; + } + idx++; + } + } else { + inst = emit_asm(ir, opcode, result_dst, coord, component); + } } } else inst = emit_asm(ir, opcode, result_dst, coord); @@ -4627,6 +4652,12 @@ } void +glsl_to_tgsi_visitor::visit(ir_demote *ir) +{ + emit_asm(ir, TGSI_OPCODE_DEMOTE); +} + +void glsl_to_tgsi_visitor::visit(ir_if *ir) { enum tgsi_opcode if_opcode; @@ -4706,6 +4737,7 @@ prog = NULL; precise = 0; need_uarl = false; + tg4_component_in_swizzle = false; shader_program = NULL; shader = NULL; options = NULL; @@ -5763,6 +5795,7 @@ enum pipe_shader_type procType; /**< PIPE_SHADER_VERTEX/FRAGMENT */ bool need_uarl; + bool tg4_component_in_swizzle; }; /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */ @@ -5846,10 +5879,10 @@ case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX: case SYSTEM_VALUE_GLOBAL_INVOCATION_ID: case SYSTEM_VALUE_VERTEX_CNT: - case SYSTEM_VALUE_BARYCENTRIC_PIXEL: - case SYSTEM_VALUE_BARYCENTRIC_SAMPLE: - case SYSTEM_VALUE_BARYCENTRIC_CENTROID: - case SYSTEM_VALUE_BARYCENTRIC_SIZE: + case SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL: + case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE: + case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID: + case SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE: default: assert(!"Unexpected SYSTEM_VALUE_ enum"); return TGSI_SEMANTIC_COUNT; @@ -6220,6 +6253,8 @@ case TGSI_OPCODE_SAMP2HND: if (inst->resource.file == PROGRAM_SAMPLER) { src[num_src] = t->samplers[inst->resource.index]; + if (t->tg4_component_in_swizzle && inst->op == TGSI_OPCODE_TG4) + src[num_src].SwizzleX = inst->gather_component; } else { /* Bindless samplers. */ src[num_src] = translate_src(t, &inst->resource); @@ -6684,6 +6719,7 @@ t->procType = procType; t->need_uarl = !screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS); + t->tg4_component_in_swizzle = screen->get_param(screen, PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE); t->inputMapping = inputMapping; t->outputMapping = outputMapping; t->ureg = ureg; @@ -7022,8 +7058,10 @@ if (!st_context(ctx)->has_hw_atomics) { for (i = 0; i < prog->info.num_abos; i++) { - unsigned index = prog->sh.AtomicBuffers[i]->Binding; - assert(index < frag_const->MaxAtomicBuffers); + unsigned index = (prog->info.num_ssbos + + prog->sh.AtomicBuffers[i]->Binding); + assert(prog->sh.AtomicBuffers[i]->Binding < + frag_const->MaxAtomicBuffers); t->buffers[index] = ureg_DECL_buffer(ureg, index, true); } } else { @@ -7038,11 +7076,7 @@ assert(prog->info.num_ssbos <= frag_const->MaxShaderStorageBlocks); for (i = 0; i < prog->info.num_ssbos; i++) { - unsigned index = i; - if (!st_context(ctx)->has_hw_atomics) - index += frag_const->MaxAtomicBuffers; - - t->buffers[index] = ureg_DECL_buffer(ureg, index, false); + t->buffers[i] = ureg_DECL_buffer(ureg, i, false); } } @@ -7137,6 +7171,7 @@ PIPE_CAP_TGSI_TEX_TXF_LZ); v->need_uarl = !pscreen->get_param(pscreen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS); + v->tg4_component_in_swizzle = pscreen->get_param(pscreen, PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE); v->variables = _mesa_hash_table_create(v->mem_ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); skip_merge_registers = @@ -7275,34 +7310,7 @@ return NULL; } - struct st_vertex_program *stvp; - struct st_fragment_program *stfp; - struct st_common_program *stp; - struct st_compute_program *stcp; - - switch (shader->Stage) { - case MESA_SHADER_VERTEX: - stvp = (struct st_vertex_program *)prog; - stvp->glsl_to_tgsi = v; - break; - case MESA_SHADER_FRAGMENT: - stfp = (struct st_fragment_program *)prog; - stfp->glsl_to_tgsi = v; - break; - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: - stp = st_common_program(prog); - stp->glsl_to_tgsi = v; - break; - case MESA_SHADER_COMPUTE: - stcp = (struct st_compute_program *)prog; - stcp->glsl_to_tgsi = v; - break; - default: - assert(!"should not be reached"); - return NULL; - } + st_program(prog)->glsl_to_tgsi = v; PRINT_STATS(v->print_stats()); @@ -7424,35 +7432,3 @@ return GL_TRUE; } - -extern "C" { - -void -st_translate_stream_output_info(struct gl_transform_feedback_info *info, - const ubyte outputMapping[], - struct pipe_stream_output_info *so) -{ - unsigned i; - - if (!info) { - so->num_outputs = 0; - return; - } - - for (i = 0; i < info->NumOutputs; i++) { - so->output[i].register_index = - outputMapping[info->Outputs[i].OutputRegister]; - so->output[i].start_component = info->Outputs[i].ComponentOffset; - so->output[i].num_components = info->Outputs[i].NumComponents; - so->output[i].output_buffer = info->Outputs[i].OutputBuffer; - so->output[i].dst_offset = info->Outputs[i].DstOffset; - so->output[i].stream = info->Outputs[i].StreamId; - } - - for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { - so->stride[i] = info->Buffers[i].Stride; - } - so->num_outputs = info->NumOutputs; -} - -} /* extern "C" */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi.h mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi.h --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi.h 2020-06-12 01:21:18.000000000 +0000 @@ -61,11 +61,6 @@ GLboolean st_link_tgsi(struct gl_context *ctx, struct gl_shader_program *prog); -void -st_translate_stream_output_info(struct gl_transform_feedback_info *info, - const ubyte outputMapping[], - struct pipe_stream_output_info *so); - enum tgsi_semantic _mesa_sysval_to_semantic(unsigned sysval); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi_private.h mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi_private.h --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_to_tgsi_private.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_to_tgsi_private.h 2020-06-12 01:21:18.000000000 +0000 @@ -147,6 +147,7 @@ unsigned dead_mask:4; /**< Used in dead code elimination */ unsigned buffer_access:3; /**< bitmask of TGSI_MEMORY_x bits */ unsigned read_only:1; + unsigned gather_component:2; /* 0, 1, 2, 3 */ const struct tgsi_opcode_info *info; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_types.cpp mesa-20.0.8/src/mesa/state_tracker/st_glsl_types.cpp --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_types.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_types.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,172 +0,0 @@ -/* - * Copyright (C) 2005-2007 Brian Paul All Rights Reserved. - * Copyright (C) 2008 VMware, Inc. All Rights Reserved. - * Copyright © 2010 Intel Corporation - * Copyright © 2011 Bryan Cain - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#include "st_glsl_types.h" - -/** - * Returns the number of places to offset the uniform index, given the type of - * a struct member. We use this because samplers and images have backing - * storeage only when they are bindless. - */ -int -st_glsl_storage_type_size(const struct glsl_type *type, bool is_bindless) -{ - unsigned int i; - int size; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_BOOL: - if (type->is_matrix()) { - return type->matrix_columns; - } else { - /* Regardless of size of vector, it gets a vec4. This is bad - * packing for things like floats, but otherwise arrays become a - * mess. Hopefully a later pass over the code can pack scalars - * down if appropriate. - */ - return 1; - } - break; - case GLSL_TYPE_DOUBLE: - if (type->is_matrix()) { - if (type->vector_elements <= 2) - return type->matrix_columns; - else - return type->matrix_columns * 2; - } else { - /* For doubles if we have a double or dvec2 they fit in one - * vec4, else they need 2 vec4s. - */ - if (type->vector_elements <= 2) - return 1; - else - return 2; - } - break; - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - if (type->vector_elements <= 2) - return 1; - else - return 2; - case GLSL_TYPE_ARRAY: - assert(type->length > 0); - return st_glsl_storage_type_size(type->fields.array, is_bindless) * - type->length; - case GLSL_TYPE_STRUCT: - size = 0; - for (i = 0; i < type->length; i++) { - size += st_glsl_storage_type_size(type->fields.structure[i].type, - is_bindless); - } - return size; - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: - if (!is_bindless) - return 0; - /* fall through */ - case GLSL_TYPE_SUBROUTINE: - return 1; - case GLSL_TYPE_ATOMIC_UINT: - case GLSL_TYPE_INTERFACE: - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_FUNCTION: - case GLSL_TYPE_FLOAT16: - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - assert(!"Invalid type in type_size"); - break; - } - return 0; -} - -int -st_glsl_type_dword_size(const struct glsl_type *type, bool bindless) -{ - unsigned int size, i; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_BOOL: - return type->components(); - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - case GLSL_TYPE_FLOAT16: - return DIV_ROUND_UP(type->components(), 2); - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - return DIV_ROUND_UP(type->components(), 4); - case GLSL_TYPE_IMAGE: - case GLSL_TYPE_SAMPLER: - if (!bindless) - return 0; - case GLSL_TYPE_DOUBLE: - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - return type->components() * 2; - case GLSL_TYPE_ARRAY: - return st_glsl_type_dword_size(type->fields.array, bindless) * - type->length; - case GLSL_TYPE_STRUCT: - size = 0; - for (i = 0; i < type->length; i++) { - size += st_glsl_type_dword_size(type->fields.structure[i].type, - bindless); - } - return size; - case GLSL_TYPE_ATOMIC_UINT: - return 0; - case GLSL_TYPE_SUBROUTINE: - return 1; - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_INTERFACE: - case GLSL_TYPE_FUNCTION: - default: - unreachable("invalid type in st_glsl_type_dword_size()"); - } - - return 0; -} - -/** - * Returns the type size of uniforms when !PIPE_CAP_PACKED_UNIFORMS -- each - * value or array element is aligned to a vec4 offset and expanded out to a - * vec4. - */ -int -st_glsl_uniforms_type_size(const struct glsl_type *type, bool bindless) -{ - return st_glsl_storage_type_size(type, bindless); -} diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_glsl_types.h mesa-20.0.8/src/mesa/state_tracker/st_glsl_types.h --- mesa-19.2.8/src/mesa/state_tracker/st_glsl_types.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_glsl_types.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,47 +0,0 @@ -/* - * Copyright (C) 2005-2007 Brian Paul All Rights Reserved. - * Copyright (C) 2008 VMware, Inc. All Rights Reserved. - * Copyright © 2010 Intel Corporation - * Copyright © 2011 Bryan Cain - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -#ifndef __ST_GLSL_TYPES_H__ -#define __ST_GLSL_TYPES_H__ - -#include "compiler/glsl_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int st_glsl_storage_type_size(const struct glsl_type *type, - bool is_bindless); - -int st_glsl_uniforms_type_size(const struct glsl_type *type, bool bindless); - -int st_glsl_type_dword_size(const struct glsl_type *type, bool bindless); - -#ifdef __cplusplus -} -#endif - -#endif /* __ST_GLSL_TYPES_H__ */ diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_manager.c mesa-20.0.8/src/mesa/state_tracker/st_manager.c --- mesa-19.2.8/src/mesa/state_tracker/st_manager.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_manager.c 2020-06-12 01:21:18.000000000 +0000 @@ -55,7 +55,7 @@ #include "pipe/p_context.h" #include "pipe/p_screen.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_helpers.h" #include "util/u_pointer.h" #include "util/u_inlines.h" @@ -67,7 +67,7 @@ struct st_manager_private { struct hash_table *stfbi_ht; /* framebuffer iface objects hash table */ - mtx_t st_mutex; + simple_mtx_t st_mutex; }; @@ -380,8 +380,6 @@ mode->stereoMode = GL_TRUE; if (visual->color_format != PIPE_FORMAT_NONE) { - mode->rgbMode = GL_TRUE; - mode->redBits = util_format_get_component_bits(visual->color_format, UTIL_FORMAT_COLORSPACE_RGB, 0); @@ -407,14 +405,9 @@ mode->stencilBits = util_format_get_component_bits(visual->depth_stencil_format, UTIL_FORMAT_COLORSPACE_ZS, 1); - - mode->haveDepthBuffer = mode->depthBits > 0; - mode->haveStencilBuffer = mode->stencilBits > 0; } if (visual->accum_format != PIPE_FORMAT_NONE) { - mode->haveAccumBuffer = GL_TRUE; - mode->accumRedBits = util_format_get_component_bits(visual->accum_format, UTIL_FORMAT_COLORSPACE_RGB, 0); @@ -527,7 +520,7 @@ st_framebuffer_reference(struct st_framebuffer **ptr, struct st_framebuffer *stfb) { - struct gl_framebuffer *fb = &stfb->Base; + struct gl_framebuffer *fb = stfb ? &stfb->Base : NULL; _mesa_reference_framebuffer((struct gl_framebuffer **) ptr, fb); } @@ -557,9 +550,9 @@ assert(smPriv); assert(smPriv->stfbi_ht); - mtx_lock(&smPriv->st_mutex); + simple_mtx_lock(&smPriv->st_mutex); entry = _mesa_hash_table_search(smPriv->stfbi_ht, stfbi); - mtx_unlock(&smPriv->st_mutex); + simple_mtx_unlock(&smPriv->st_mutex); return entry != NULL; } @@ -576,9 +569,9 @@ assert(smPriv); assert(smPriv->stfbi_ht); - mtx_lock(&smPriv->st_mutex); + simple_mtx_lock(&smPriv->st_mutex); entry = _mesa_hash_table_insert(smPriv->stfbi_ht, stfbi, stfbi); - mtx_unlock(&smPriv->st_mutex); + simple_mtx_unlock(&smPriv->st_mutex); return entry != NULL; } @@ -595,7 +588,7 @@ if (!smPriv || !smPriv->stfbi_ht) return; - mtx_lock(&smPriv->st_mutex); + simple_mtx_lock(&smPriv->st_mutex); entry = _mesa_hash_table_search(smPriv->stfbi_ht, stfbi); if (!entry) goto unlock; @@ -603,7 +596,7 @@ _mesa_hash_table_remove(smPriv->stfbi_ht, entry); unlock: - mtx_unlock(&smPriv->st_mutex); + simple_mtx_unlock(&smPriv->st_mutex); } @@ -647,7 +640,7 @@ * deleted. */ if (!st_framebuffer_iface_lookup(smapi, stfbi)) { - LIST_DEL(&stfb->head); + list_del(&stfb->head); st_framebuffer_reference(&stfb, NULL); } } @@ -656,7 +649,9 @@ static void st_context_flush(struct st_context_iface *stctxi, unsigned flags, - struct pipe_fence_handle **fence) + struct pipe_fence_handle **fence, + void (*before_flush_cb) (void*), + void* args) { struct st_context *st = (struct st_context *) stctxi; unsigned pipe_flags = 0; @@ -668,6 +663,9 @@ FLUSH_VERTICES(st->ctx, 0); FLUSH_CURRENT(st->ctx, 0); + /* Notify the caller that we're ready to flush */ + if (before_flush_cb) + before_flush_cb(args); st_flush(st, fence, pipe_flags); if ((flags & ST_FLUSH_WAIT) && fence && *fence) { @@ -848,7 +846,7 @@ if (smPriv && smPriv->stfbi_ht) { _mesa_hash_table_destroy(smPriv->stfbi_ht, NULL); - mtx_destroy(&smPriv->st_mutex); + simple_mtx_destroy(&smPriv->st_mutex); free(smPriv); smapi->st_manager_private = NULL; } @@ -898,7 +896,7 @@ struct st_manager_private *smPriv; smPriv = CALLOC_STRUCT(st_manager_private); - mtx_init(&smPriv->st_mutex, mtx_plain); + simple_mtx_init(&smPriv->st_mutex, mtx_plain); smPriv->stfbi_ht = _mesa_hash_table_create(NULL, st_framebuffer_iface_hash, st_framebuffer_iface_equal); @@ -1043,7 +1041,7 @@ } /* add to the context's winsys buffers list */ - LIST_ADD(&cur->head, &st->winsys_buffers); + list_add(&cur->head, &st->winsys_buffers); st_framebuffer_reference(&stfb, cur); } @@ -1271,8 +1269,9 @@ st_init_limits(screen, &consts, &extensions); st_init_extensions(screen, &consts, &extensions, options, api); - - return _mesa_get_version(&extensions, &consts, api); + version = _mesa_get_version(&extensions, &consts, api); + free(consts.SpirVExtensions); + return version; } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_mesa_to_tgsi.c mesa-20.0.8/src/mesa/state_tracker/st_mesa_to_tgsi.c --- mesa-19.2.8/src/mesa/state_tracker/st_mesa_to_tgsi.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_mesa_to_tgsi.c 2020-06-12 01:21:18.000000000 +0000 @@ -140,8 +140,12 @@ return t->constants[index]; case PROGRAM_INPUT: - assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs)); - return t->inputs[t->inputMapping[index]]; + if (t->inputMapping[index] < ARRAY_SIZE(t->inputs)) + return t->inputs[t->inputMapping[index]]; + else { + assert(t->procType == PIPE_SHADER_VERTEX); + return ureg_DECL_constant(t->ureg, 0); + } case PROGRAM_OUTPUT: assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs)); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_nir_builtins.c mesa-20.0.8/src/mesa/state_tracker/st_nir_builtins.c --- mesa-19.2.8/src/mesa/state_tracker/st_nir_builtins.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_nir_builtins.c 2020-06-12 01:21:18.000000000 +0000 @@ -21,7 +21,6 @@ */ #include "tgsi/tgsi_from_mesa.h" -#include "st_glsl_types.h" #include "st_nir.h" #include "compiler/nir/nir_builder.h" @@ -33,9 +32,6 @@ { struct pipe_context *pipe = st->pipe; struct pipe_screen *screen = pipe->screen; - enum pipe_shader_type p_stage = pipe_shader_type_from_mesa(nir->info.stage); - bool is_scalar = - screen->get_shader_param(screen, p_stage, PIPE_SHADER_CAP_SCALAR_ISA); nir->info.name = ralloc_strdup(nir, name); nir->info.separate_shader = true; @@ -47,7 +43,7 @@ NIR_PASS_V(nir, nir_lower_var_copies); NIR_PASS_V(nir, nir_lower_system_values); - if (is_scalar) { + if (nir->options->lower_to_scalar) { nir_variable_mode mask = (nir->info.stage > MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | (nir->info.stage < MESA_SHADER_FRAGMENT ? nir_var_shader_out : 0); @@ -55,22 +51,18 @@ NIR_PASS_V(nir, nir_lower_io_to_scalar_early, mask); } - st_nir_opts(nir, is_scalar); - nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + st_nir_assign_vs_in_locations(nir); st_nir_assign_varying_locations(st, nir); st_nir_lower_samplers(screen, nir, NULL, NULL); + st_nir_lower_uniforms(st, nir); - if (st->ctx->Const.PackedDriverUniformStorage) { - NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, st_glsl_type_dword_size, - (nir_lower_io_options)0); - NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 4); - } else { - NIR_PASS_V(nir, nir_lower_io, nir_var_uniform, st_glsl_uniforms_type_size, - (nir_lower_io_options)0); - } + if (screen->finalize_nir) + screen->finalize_nir(screen, nir, true); + else + st_nir_opts(nir); struct pipe_shader_state state = { .type = PIPE_SHADER_IR_NIR, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_nir.h mesa-20.0.8/src/mesa/state_tracker/st_nir.h --- mesa-19.2.8/src/mesa/state_tracker/st_nir.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_nir.h 2020-06-12 01:21:18.000000000 +0000 @@ -43,20 +43,22 @@ void st_finalize_nir(struct st_context *st, struct gl_program *prog, struct gl_shader_program *shader_program, - struct nir_shader *nir); + struct nir_shader *nir, bool finalize_by_driver); -void st_nir_opts(struct nir_shader *nir, bool is_scalar); +void st_nir_opts(struct nir_shader *nir); bool st_link_nir(struct gl_context *ctx, struct gl_shader_program *shader_program); +void st_nir_assign_vs_in_locations(struct nir_shader *nir); void st_nir_assign_varying_locations(struct st_context *st, struct nir_shader *nir); void st_nir_lower_samplers(struct pipe_screen *screen, struct nir_shader *nir, struct gl_shader_program *shader_program, struct gl_program *prog); +void st_nir_lower_uniforms(struct st_context *st, struct nir_shader *nir); struct pipe_shader_state * st_nir_finish_builtin_shader(struct st_context *st, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_nir_lower_builtin.c mesa-20.0.8/src/mesa/state_tracker/st_nir_lower_builtin.c --- mesa-19.2.8/src/mesa/state_tracker/st_nir_lower_builtin.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_nir_lower_builtin.c 2020-06-12 01:21:18.000000000 +0000 @@ -138,7 +138,7 @@ nir_variable_create(shader, nir_var_uniform, glsl_vec4_type(), name); var->num_state_slots = 1; - var->state_slots = ralloc_array(var, nir_state_slot, 1); + var->state_slots = rzalloc_array(var, nir_state_slot, 1); memcpy(var->state_slots[0].tokens, tokens, sizeof(var->state_slots[0].tokens)); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_nir_lower_tex_src_plane.c mesa-20.0.8/src/mesa/state_tracker/st_nir_lower_tex_src_plane.c --- mesa-19.2.8/src/mesa/state_tracker/st_nir_lower_tex_src_plane.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_nir_lower_tex_src_plane.c 2020-06-12 01:21:18.000000000 +0000 @@ -66,7 +66,7 @@ find_sampler(state, orig_binding); char *name; - asprintf(&name, "%s:%s", orig_sampler->name, ext); + UNUSED int r = asprintf(&name, "%s:%s", orig_sampler->name, ext); new_sampler = nir_variable_create(state->shader, nir_var_uniform, samplerExternalOES, name); free(name); diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_pbo.c mesa-20.0.8/src/mesa/state_tracker/st_pbo.c --- mesa-19.2.8/src/mesa/state_tracker/st_pbo.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_pbo.c 2020-06-12 01:21:18.000000000 +0000 @@ -38,7 +38,7 @@ #include "pipe/p_screen.h" #include "cso_cache/cso_context.h" #include "tgsi/tgsi_ureg.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "util/u_upload_mgr.h" @@ -546,7 +546,7 @@ nir_variable_create(b.shader, nir_var_uniform, glsl_image_type(GLSL_SAMPLER_DIM_BUF, false, GLSL_TYPE_FLOAT), "img"); - img_var->data.image.access = ACCESS_NON_READABLE; + img_var->data.access = ACCESS_NON_READABLE; img_var->data.explicit_binding = true; img_var->data.binding = 0; nir_deref_instr *img_deref = nir_build_deref_var(&b, img_var); @@ -557,6 +557,7 @@ nir_src_for_ssa(nir_vec4(&b, pbo_addr, zero, zero, zero)); intrin->src[2] = nir_src_for_ssa(zero); intrin->src[3] = nir_src_for_ssa(result); + intrin->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); intrin->num_components = 4; nir_builder_instr_insert(&b, &intrin->instr); } else { @@ -828,7 +829,7 @@ for (i = 0; i < ARRAY_SIZE(st->pbo.upload_fs); ++i) { if (st->pbo.upload_fs[i]) { - cso_delete_fragment_shader(st->cso_context, st->pbo.upload_fs[i]); + st->pipe->delete_fs_state(st->pipe, st->pbo.upload_fs[i]); st->pbo.upload_fs[i] = NULL; } } @@ -836,19 +837,19 @@ for (i = 0; i < ARRAY_SIZE(st->pbo.download_fs); ++i) { for (unsigned j = 0; j < ARRAY_SIZE(st->pbo.download_fs[0]); ++j) { if (st->pbo.download_fs[i][j]) { - cso_delete_fragment_shader(st->cso_context, st->pbo.download_fs[i][j]); + st->pipe->delete_fs_state(st->pipe, st->pbo.download_fs[i][j]); st->pbo.download_fs[i][j] = NULL; } } } if (st->pbo.gs) { - cso_delete_geometry_shader(st->cso_context, st->pbo.gs); + st->pipe->delete_gs_state(st->pipe, st->pbo.gs); st->pbo.gs = NULL; } if (st->pbo.vs) { - cso_delete_vertex_shader(st->cso_context, st->pbo.vs); + st->pipe->delete_vs_state(st->pipe, st->pbo.vs); st->pbo.vs = NULL; } } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_program.c mesa-20.0.8/src/mesa/state_tracker/st_program.c --- mesa-19.2.8/src/mesa/state_tracker/st_program.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_program.c 2020-06-12 01:21:18.000000000 +0000 @@ -41,6 +41,7 @@ #include "program/programopt.h" #include "compiler/nir/nir.h" +#include "draw/draw_context.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -62,9 +63,12 @@ #include "st_atifs_to_tgsi.h" #include "st_nir.h" #include "st_shader_cache.h" +#include "st_util.h" #include "cso_cache/cso_context.h" +static void +destroy_program_variants(struct st_context *st, struct gl_program *target); static void set_affected_state_flags(uint64_t *states, @@ -106,7 +110,7 @@ switch (prog->info.stage) { case MESA_SHADER_VERTEX: - states = &((struct st_vertex_program*)prog)->affected_states; + states = &((struct st_program*)prog)->affected_states; *states = ST_NEW_VS_STATE | ST_NEW_RASTERIZER | @@ -123,7 +127,7 @@ break; case MESA_SHADER_TESS_CTRL: - states = &(st_common_program(prog))->affected_states; + states = &(st_program(prog))->affected_states; *states = ST_NEW_TCS_STATE; @@ -138,7 +142,7 @@ break; case MESA_SHADER_TESS_EVAL: - states = &(st_common_program(prog))->affected_states; + states = &(st_program(prog))->affected_states; *states = ST_NEW_TES_STATE | ST_NEW_RASTERIZER; @@ -154,7 +158,7 @@ break; case MESA_SHADER_GEOMETRY: - states = &(st_common_program(prog))->affected_states; + states = &(st_program(prog))->affected_states; *states = ST_NEW_GS_STATE | ST_NEW_RASTERIZER; @@ -170,7 +174,7 @@ break; case MESA_SHADER_FRAGMENT: - states = &((struct st_fragment_program*)prog)->affected_states; + states = &((struct st_program*)prog)->affected_states; /* gl_FragCoord and glDrawPixels always use constants. */ *states = ST_NEW_FS_STATE | @@ -188,7 +192,7 @@ break; case MESA_SHADER_COMPUTE: - states = &((struct st_compute_program*)prog)->affected_states; + states = &((struct st_program*)prog)->affected_states; *states = ST_NEW_CS_STATE; @@ -207,135 +211,41 @@ } } -static void -delete_ir(struct pipe_shader_state *ir) -{ - if (ir->tokens) - ureg_free_tokens(ir->tokens); - - /* Note: Any setup of ->ir.nir that has had pipe->create_*_state called on - * it has resulted in the driver taking ownership of the NIR. Those - * callers should be NULLing out the nir field in any pipe_shader_state - * that might have this called in order to indicate that. - * - * GLSL IR and ARB programs will have set gl_program->nir to the same - * shader as ir->ir.nir, so it will be freed by _mesa_delete_program(). - */ -} /** - * Delete a vertex program variant. Note the caller must unlink - * the variant from the linked list. + * Delete a shader variant. Note the caller must unlink the variant from + * the linked list. */ static void -delete_vp_variant(struct st_context *st, struct st_vp_variant *vpv) -{ - if (vpv->driver_shader) { - if (st->has_shareable_shaders || vpv->key.st == st) { - cso_delete_vertex_shader(st->cso_context, vpv->driver_shader); - } else { - st_save_zombie_shader(vpv->key.st, PIPE_SHADER_VERTEX, - vpv->driver_shader); - } - } - - if (vpv->draw_shader) - draw_delete_vertex_shader( st->draw, vpv->draw_shader ); - - delete_ir(&vpv->tgsi); - - free( vpv ); -} - - - -/** - * Clean out any old compilations: - */ -void -st_release_vp_variants( struct st_context *st, - struct st_vertex_program *stvp ) -{ - struct st_vp_variant *vpv; - - for (vpv = stvp->variants; vpv; ) { - struct st_vp_variant *next = vpv->next; - delete_vp_variant(st, vpv); - vpv = next; - } - - stvp->variants = NULL; - - delete_ir(&stvp->tgsi); -} - - - -/** - * Delete a fragment program variant. Note the caller must unlink - * the variant from the linked list. - */ -static void -delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv) -{ - if (fpv->driver_shader) { - if (st->has_shareable_shaders || fpv->key.st == st) { - cso_delete_fragment_shader(st->cso_context, fpv->driver_shader); - } else { - st_save_zombie_shader(fpv->key.st, PIPE_SHADER_FRAGMENT, - fpv->driver_shader); - } - } - - free(fpv); -} - - -/** - * Free all variants of a fragment program. - */ -void -st_release_fp_variants(struct st_context *st, struct st_fragment_program *stfp) -{ - struct st_fp_variant *fpv; - - for (fpv = stfp->variants; fpv; ) { - struct st_fp_variant *next = fpv->next; - delete_fp_variant(st, fpv); - fpv = next; - } - - stfp->variants = NULL; - - delete_ir(&stfp->tgsi); -} - - -/** - * Delete a basic program variant. Note the caller must unlink - * the variant from the linked list. - */ -static void -delete_basic_variant(struct st_context *st, struct st_basic_variant *v, - GLenum target) +delete_variant(struct st_context *st, struct st_variant *v, GLenum target) { if (v->driver_shader) { - if (st->has_shareable_shaders || v->key.st == st) { + if (target == GL_VERTEX_PROGRAM_ARB && + ((struct st_common_variant*)v)->key.is_draw_shader) { + /* Draw shader. */ + draw_delete_vertex_shader(st->draw, v->driver_shader); + } else if (st->has_shareable_shaders || v->st == st) { /* The shader's context matches the calling context, or we * don't care. */ switch (target) { + case GL_VERTEX_PROGRAM_ARB: + st->pipe->delete_vs_state(st->pipe, v->driver_shader); + break; case GL_TESS_CONTROL_PROGRAM_NV: - cso_delete_tessctrl_shader(st->cso_context, v->driver_shader); + st->pipe->delete_tcs_state(st->pipe, v->driver_shader); break; case GL_TESS_EVALUATION_PROGRAM_NV: - cso_delete_tesseval_shader(st->cso_context, v->driver_shader); + st->pipe->delete_tes_state(st->pipe, v->driver_shader); break; case GL_GEOMETRY_PROGRAM_NV: - cso_delete_geometry_shader(st->cso_context, v->driver_shader); + st->pipe->delete_gs_state(st->pipe, v->driver_shader); + break; + case GL_FRAGMENT_PROGRAM_ARB: + st->pipe->delete_fs_state(st->pipe, v->driver_shader); break; case GL_COMPUTE_PROGRAM_NV: - cso_delete_compute_shader(st->cso_context, v->driver_shader); + st->pipe->delete_compute_state(st->pipe, v->driver_shader); break; default: unreachable("bad shader type in delete_basic_variant"); @@ -344,82 +254,117 @@ /* We can't delete a shader with a context different from the one * that created it. Add it to the creating context's zombie list. */ - enum pipe_shader_type type; - switch (target) { - case GL_TESS_CONTROL_PROGRAM_NV: - type = PIPE_SHADER_TESS_CTRL; - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - type = PIPE_SHADER_TESS_EVAL; - break; - case GL_GEOMETRY_PROGRAM_NV: - type = PIPE_SHADER_GEOMETRY; - break; - default: - unreachable(""); - } - st_save_zombie_shader(v->key.st, type, v->driver_shader); + enum pipe_shader_type type = + pipe_shader_type_from_mesa(_mesa_program_enum_to_shader_stage(target)); + + st_save_zombie_shader(v->st, type, v->driver_shader); } } free(v); } +static void +st_unbind_program(struct st_context *st, struct st_program *p) +{ + /* Unbind the shader in cso_context and re-bind in st/mesa. */ + switch (p->Base.info.stage) { + case MESA_SHADER_VERTEX: + cso_set_vertex_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_VS_STATE; + break; + case MESA_SHADER_TESS_CTRL: + cso_set_tessctrl_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_TCS_STATE; + break; + case MESA_SHADER_TESS_EVAL: + cso_set_tesseval_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_TES_STATE; + break; + case MESA_SHADER_GEOMETRY: + cso_set_geometry_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_GS_STATE; + break; + case MESA_SHADER_FRAGMENT: + cso_set_fragment_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_FS_STATE; + break; + case MESA_SHADER_COMPUTE: + cso_set_compute_shader_handle(st->cso_context, NULL); + st->dirty |= ST_NEW_CS_STATE; + break; + default: + unreachable("invalid shader type"); + } +} /** * Free all basic program variants. */ void -st_release_basic_variants(struct st_context *st, GLenum target, - struct st_basic_variant **variants, - struct pipe_shader_state *tgsi) -{ - struct st_basic_variant *v; - - for (v = *variants; v; ) { - struct st_basic_variant *next = v->next; - delete_basic_variant(st, v, target); +st_release_variants(struct st_context *st, struct st_program *p) +{ + struct st_variant *v; + + /* If we are releasing shaders, re-bind them, because we don't + * know which shaders are bound in the driver. + */ + if (p->variants) + st_unbind_program(st, p); + + for (v = p->variants; v; ) { + struct st_variant *next = v->next; + delete_variant(st, v, p->Base.Target); v = next; } - *variants = NULL; + p->variants = NULL; - delete_ir(tgsi); -} + if (p->state.tokens) { + ureg_free_tokens(p->state.tokens); + p->state.tokens = NULL; + } + /* Note: Any setup of ->ir.nir that has had pipe->create_*_state called on + * it has resulted in the driver taking ownership of the NIR. Those + * callers should be NULLing out the nir field in any pipe_shader_state + * that might have this called in order to indicate that. + * + * GLSL IR and ARB programs will have set gl_program->nir to the same + * shader as ir->ir.nir, so it will be freed by _mesa_delete_program(). + */ +} /** - * Free all variants of a compute program. + * Free all basic program variants and unref program. */ void -st_release_cp_variants(struct st_context *st, struct st_compute_program *stcp) +st_release_program(struct st_context *st, struct st_program **p) { - struct st_basic_variant **variants = &stcp->variants; - struct st_basic_variant *v; + if (!*p) + return; - for (v = *variants; v; ) { - struct st_basic_variant *next = v->next; - delete_basic_variant(st, v, stcp->Base.Target); - v = next; - } + destroy_program_variants(st, &((*p)->Base)); + st_reference_prog(st, p, NULL); +} - *variants = NULL; +void +st_finalize_nir_before_variants(struct nir_shader *nir) +{ + NIR_PASS_V(nir, nir_opt_access); - if (stcp->tgsi.prog) { - switch (stcp->tgsi.ir_type) { - case PIPE_SHADER_IR_TGSI: - ureg_free_tokens(stcp->tgsi.prog); - stcp->tgsi.prog = NULL; - break; - case PIPE_SHADER_IR_NIR: - /* pipe driver took ownership of prog */ - break; - case PIPE_SHADER_IR_NATIVE: - /* ??? */ - stcp->tgsi.prog = NULL; - break; - } + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + if (nir->options->lower_all_io_to_temps || + nir->options->lower_all_io_to_elements || + nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_GEOMETRY) { + NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true); } + + st_nir_assign_vs_in_locations(nir); } /** @@ -429,11 +374,7 @@ st_translate_prog_to_nir(struct st_context *st, struct gl_program *prog, gl_shader_stage stage) { - enum pipe_shader_type p_stage = pipe_shader_type_from_mesa(stage); - const bool is_scalar = - st->pipe->screen->get_shader_param(st->pipe->screen, p_stage, - PIPE_SHADER_CAP_SCALAR_ISA); - + struct pipe_screen *screen = st->pipe->screen; const struct gl_shader_compiler_options *options = &st->ctx->Const.ShaderCompilerOptions[stage]; @@ -442,129 +383,209 @@ NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */ nir_validate_shader(nir, "after st/ptn lower_regs_to_ssa"); - NIR_PASS_V(nir, st_nir_lower_wpos_ytransform, prog, st->pipe->screen); + NIR_PASS_V(nir, st_nir_lower_wpos_ytransform, prog, screen); NIR_PASS_V(nir, nir_lower_system_values); /* Optimise NIR */ NIR_PASS_V(nir, nir_opt_constant_folding); - st_nir_opts(nir, is_scalar); - nir_validate_shader(nir, "after st/ptn NIR opts"); + st_nir_opts(nir); + st_finalize_nir_before_variants(nir); + + if (st->allow_st_finalize_nir_twice) + st_finalize_nir(st, prog, NULL, nir, true); + + nir_validate_shader(nir, "after st/glsl finalize_nir"); return nir; } -/** - * Translate a vertex program. - */ -bool -st_translate_vertex_program(struct st_context *st, - struct st_vertex_program *stvp) +void +st_prepare_vertex_program(struct st_program *stp) { - struct ureg_program *ureg; - enum pipe_error error; - unsigned num_outputs = 0; - unsigned attr; - ubyte output_semantic_name[VARYING_SLOT_MAX] = {0}; - ubyte output_semantic_index[VARYING_SLOT_MAX] = {0}; + struct st_vertex_program *stvp = (struct st_vertex_program *)stp; stvp->num_inputs = 0; memset(stvp->input_to_index, ~0, sizeof(stvp->input_to_index)); + memset(stvp->result_to_output, ~0, sizeof(stvp->result_to_output)); - if (stvp->Base.arb.IsPositionInvariant) - _mesa_insert_mvp_code(st->ctx, &stvp->Base); - - /* - * Determine number of inputs, the mappings between VERT_ATTRIB_x + /* Determine number of inputs, the mappings between VERT_ATTRIB_x * and TGSI generic input indexes, plus input attrib semantic info. */ - for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) { - if ((stvp->Base.info.inputs_read & BITFIELD64_BIT(attr)) != 0) { + for (unsigned attr = 0; attr < VERT_ATTRIB_MAX; attr++) { + if ((stp->Base.info.inputs_read & BITFIELD64_BIT(attr)) != 0) { stvp->input_to_index[attr] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = attr; stvp->num_inputs++; - if ((stvp->Base.DualSlotInputs & BITFIELD64_BIT(attr)) != 0) { + + if ((stp->Base.DualSlotInputs & BITFIELD64_BIT(attr)) != 0) { /* add placeholder for second part of a double attribute */ stvp->index_to_input[stvp->num_inputs] = ST_DOUBLE_ATTRIB_PLACEHOLDER; stvp->num_inputs++; } } } - /* bit of a hack, presetup potentially unused edgeflag input */ + /* pre-setup potentially unused edgeflag input */ stvp->input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = VERT_ATTRIB_EDGEFLAG; - /* Compute mapping of vertex program outputs to slots. - */ - for (attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if ((stvp->Base.info.outputs_written & BITFIELD64_BIT(attr)) == 0) { - stvp->result_to_output[attr] = ~0; - } - else { - unsigned slot = num_outputs++; + /* Compute mapping of vertex program outputs to slots. */ + unsigned num_outputs = 0; + for (unsigned attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (stp->Base.info.outputs_written & BITFIELD64_BIT(attr)) + stvp->result_to_output[attr] = num_outputs++; + } + /* pre-setup potentially unused edgeflag output */ + stvp->result_to_output[VARYING_SLOT_EDGE] = num_outputs; +} - stvp->result_to_output[attr] = slot; +void +st_translate_stream_output_info(struct gl_program *prog) +{ + struct gl_transform_feedback_info *info = prog->sh.LinkedTransformFeedback; + if (!info) + return; - unsigned semantic_name, semantic_index; - tgsi_get_gl_varying_semantic(attr, st->needs_texcoord_semantic, - &semantic_name, &semantic_index); - output_semantic_name[slot] = semantic_name; - output_semantic_index[slot] = semantic_index; - } + /* Determine the (default) output register mapping for each output. */ + unsigned num_outputs = 0; + ubyte output_mapping[VARYING_SLOT_TESS_MAX]; + memset(output_mapping, 0, sizeof(output_mapping)); + + for (unsigned attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (prog->info.outputs_written & BITFIELD64_BIT(attr)) + output_mapping[attr] = num_outputs++; + } + + /* Translate stream output info. */ + struct pipe_stream_output_info *so_info = + &((struct st_program*)prog)->state.stream_output; + + for (unsigned i = 0; i < info->NumOutputs; i++) { + so_info->output[i].register_index = + output_mapping[info->Outputs[i].OutputRegister]; + so_info->output[i].start_component = info->Outputs[i].ComponentOffset; + so_info->output[i].num_components = info->Outputs[i].NumComponents; + so_info->output[i].output_buffer = info->Outputs[i].OutputBuffer; + so_info->output[i].dst_offset = info->Outputs[i].DstOffset; + so_info->output[i].stream = info->Outputs[i].StreamId; } - /* similar hack to above, presetup potentially unused edgeflag output */ - stvp->result_to_output[VARYING_SLOT_EDGE] = num_outputs; - output_semantic_name[num_outputs] = TGSI_SEMANTIC_EDGEFLAG; - output_semantic_index[num_outputs] = 0; + + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + so_info->stride[i] = info->Buffers[i].Stride; + } + so_info->num_outputs = info->NumOutputs; +} + +/** + * Translate a vertex program. + */ +bool +st_translate_vertex_program(struct st_context *st, + struct st_program *stp) +{ + struct ureg_program *ureg; + enum pipe_error error; + unsigned num_outputs = 0; + unsigned attr; + ubyte output_semantic_name[VARYING_SLOT_MAX] = {0}; + ubyte output_semantic_index[VARYING_SLOT_MAX] = {0}; + + if (stp->Base.arb.IsPositionInvariant) + _mesa_insert_mvp_code(st->ctx, &stp->Base); /* ARB_vp: */ - if (!stvp->glsl_to_tgsi && !stvp->shader_program) { - _mesa_remove_output_reads(&stvp->Base, PROGRAM_OUTPUT); + if (!stp->glsl_to_tgsi) { + _mesa_remove_output_reads(&stp->Base, PROGRAM_OUTPUT); /* This determines which states will be updated when the assembly * shader is bound. */ - stvp->affected_states = ST_NEW_VS_STATE | + stp->affected_states = ST_NEW_VS_STATE | ST_NEW_RASTERIZER | ST_NEW_VERTEX_ARRAYS; - if (stvp->Base.Parameters->NumParameters) - stvp->affected_states |= ST_NEW_VS_CONSTANTS; + if (stp->Base.Parameters->NumParameters) + stp->affected_states |= ST_NEW_VS_CONSTANTS; + + /* Translate to NIR if preferred. */ + if (st->pipe->screen->get_shader_param(st->pipe->screen, + PIPE_SHADER_VERTEX, + PIPE_SHADER_CAP_PREFERRED_IR)) { + assert(!stp->glsl_to_tgsi); + + if (stp->Base.nir) + ralloc_free(stp->Base.nir); + + stp->state.type = PIPE_SHADER_IR_NIR; + stp->Base.nir = st_translate_prog_to_nir(st, &stp->Base, + MESA_SHADER_VERTEX); + + /* We must update stp->Base.info after translation and before + * st_prepare_vertex_program is called, because inputs_read + * may become outdated after NIR optimization passes. + * + * For ffvp/ARB_vp inputs_read is populated based + * on declared attributes without taking their usage into + * consideration. When creating shader variants we expect + * that their inputs_read would match the base ones for + * input mapping to work properly. + */ + nir_shader_gather_info(stp->Base.nir, + nir_shader_get_entrypoint(stp->Base.nir)); + st_nir_assign_vs_in_locations(stp->Base.nir); + stp->Base.info = stp->Base.nir->info; - /* No samplers are allowed in ARB_vp. */ + /* For st_draw_feedback, we need to generate TGSI too if draw doesn't + * use LLVM. + */ + if (draw_has_llvm()) { + st_prepare_vertex_program(stp); + return true; + } + } } - if (stvp->shader_program) { - st_translate_stream_output_info(stvp->Base.sh.LinkedTransformFeedback, - stvp->result_to_output, - &stvp->tgsi.stream_output); + st_prepare_vertex_program(stp); - st_store_ir_in_disk_cache(st, &stvp->Base, true); - return true; + /* Get semantic names and indices. */ + for (attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (stp->Base.info.outputs_written & BITFIELD64_BIT(attr)) { + unsigned slot = num_outputs++; + unsigned semantic_name, semantic_index; + tgsi_get_gl_varying_semantic(attr, st->needs_texcoord_semantic, + &semantic_name, &semantic_index); + output_semantic_name[slot] = semantic_name; + output_semantic_index[slot] = semantic_index; + } } + /* pre-setup potentially unused edgeflag output */ + output_semantic_name[num_outputs] = TGSI_SEMANTIC_EDGEFLAG; + output_semantic_index[num_outputs] = 0; ureg = ureg_create_with_screen(PIPE_SHADER_VERTEX, st->pipe->screen); if (ureg == NULL) return false; - if (stvp->Base.info.clip_distance_array_size) + if (stp->Base.info.clip_distance_array_size) ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED, - stvp->Base.info.clip_distance_array_size); - if (stvp->Base.info.cull_distance_array_size) + stp->Base.info.clip_distance_array_size); + if (stp->Base.info.cull_distance_array_size) ureg_property(ureg, TGSI_PROPERTY_NUM_CULLDIST_ENABLED, - stvp->Base.info.cull_distance_array_size); + stp->Base.info.cull_distance_array_size); if (ST_DEBUG & DEBUG_MESA) { - _mesa_print_program(&stvp->Base); - _mesa_print_program_parameters(st->ctx, &stvp->Base); + _mesa_print_program(&stp->Base); + _mesa_print_program_parameters(st->ctx, &stp->Base); debug_printf("\n"); } - if (stvp->glsl_to_tgsi) { + struct st_vertex_program *stvp = (struct st_vertex_program *)stp; + + if (stp->glsl_to_tgsi) { error = st_translate_program(st->ctx, PIPE_SHADER_VERTEX, ureg, - stvp->glsl_to_tgsi, - &stvp->Base, + stp->glsl_to_tgsi, + &stp->Base, /* inputs */ stvp->num_inputs, stvp->input_to_index, @@ -578,16 +599,14 @@ output_semantic_name, output_semantic_index); - st_translate_stream_output_info(stvp->Base.sh.LinkedTransformFeedback, - stvp->result_to_output, - &stvp->tgsi.stream_output); + st_translate_stream_output_info(&stp->Base); - free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi); + free_glsl_to_tgsi_visitor(stp->glsl_to_tgsi); } else error = st_translate_mesa_program(st->ctx, PIPE_SHADER_VERTEX, ureg, - &stvp->Base, + &stp->Base, /* inputs */ stvp->num_inputs, stvp->input_to_index, @@ -602,81 +621,114 @@ if (error) { debug_printf("%s: failed to translate Mesa program:\n", __func__); - _mesa_print_program(&stvp->Base); + _mesa_print_program(&stp->Base); debug_assert(0); return false; } - stvp->tgsi.tokens = ureg_get_tokens(ureg, &stvp->num_tgsi_tokens); + stp->state.tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); - if (stvp->glsl_to_tgsi) { - stvp->glsl_to_tgsi = NULL; - st_store_ir_in_disk_cache(st, &stvp->Base, false); - } - - bool use_nir = PIPE_SHADER_IR_NIR == - st->pipe->screen->get_shader_param(st->pipe->screen, PIPE_SHADER_VERTEX, - PIPE_SHADER_CAP_PREFERRED_IR); - - if (use_nir) { - nir_shader *nir = - st_translate_prog_to_nir(st, &stvp->Base, MESA_SHADER_VERTEX); - - if (stvp->tgsi.ir.nir) - ralloc_free(stvp->tgsi.ir.nir); - stvp->tgsi.type = PIPE_SHADER_IR_NIR; - stvp->tgsi.ir.nir = nir; - stvp->Base.nir = nir; - return true; + if (stp->glsl_to_tgsi) { + stp->glsl_to_tgsi = NULL; + st_store_ir_in_disk_cache(st, &stp->Base, false); } - return stvp->tgsi.tokens != NULL; + return stp->state.tokens != NULL; } static const gl_state_index16 depth_range_state[STATE_LENGTH] = { STATE_DEPTH_RANGE }; -static struct st_vp_variant * +static struct st_common_variant * st_create_vp_variant(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key) + struct st_program *stvp, + const struct st_common_variant_key *key) { - struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); + struct st_common_variant *vpv = CALLOC_STRUCT(st_common_variant); struct pipe_context *pipe = st->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_shader_state state = {0}; + + static const gl_state_index16 point_size_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_POINT_SIZE_CLAMPED, 0 }; struct gl_program_parameter_list *params = stvp->Base.Parameters; vpv->key = *key; - vpv->tgsi.stream_output = stvp->tgsi.stream_output; - vpv->num_inputs = stvp->num_inputs; - /* When generating a NIR program, we usually don't have TGSI tokens. - * However, we do create them for ARB_vertex_program / fixed-function VS - * programs which we may need to use with the draw module for legacy - * feedback/select emulation. If they exist, copy them. - */ - if (stvp->tgsi.tokens) - vpv->tgsi.tokens = tgsi_dup_tokens(stvp->tgsi.tokens); + state.stream_output = stvp->state.stream_output; - if (stvp->tgsi.type == PIPE_SHADER_IR_NIR) { - vpv->tgsi.type = PIPE_SHADER_IR_NIR; - vpv->tgsi.ir.nir = nir_shader_clone(NULL, stvp->tgsi.ir.nir); - if (key->clamp_color) - NIR_PASS_V(vpv->tgsi.ir.nir, nir_lower_clamp_color_outputs); + if (stvp->state.type == PIPE_SHADER_IR_NIR && + (!key->is_draw_shader || draw_has_llvm())) { + bool finalize = false; + + state.type = PIPE_SHADER_IR_NIR; + state.ir.nir = nir_shader_clone(NULL, stvp->Base.nir); + if (key->clamp_color) { + NIR_PASS_V(state.ir.nir, nir_lower_clamp_color_outputs); + finalize = true; + } if (key->passthrough_edgeflags) { - NIR_PASS_V(vpv->tgsi.ir.nir, nir_lower_passthrough_edgeflags); - vpv->num_inputs++; + NIR_PASS_V(state.ir.nir, nir_lower_passthrough_edgeflags); + finalize = true; + } + + if (key->lower_point_size) { + _mesa_add_state_reference(params, point_size_state); + NIR_PASS_V(state.ir.nir, nir_lower_point_size_mov, + point_size_state); + finalize = true; + } + + if (key->lower_ucp) { + bool can_compact = screen->get_param(screen, + PIPE_CAP_NIR_COMPACT_ARRAYS); + + bool use_eye = st->ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX] != NULL; + gl_state_index16 clipplane_state[MAX_CLIP_PLANES][STATE_LENGTH]; + for (int i = 0; i < MAX_CLIP_PLANES; ++i) { + if (use_eye) { + clipplane_state[i][0] = STATE_CLIPPLANE; + clipplane_state[i][1] = i; + } else { + clipplane_state[i][0] = STATE_INTERNAL; + clipplane_state[i][1] = STATE_CLIP_INTERNAL; + clipplane_state[i][2] = i; + } + _mesa_add_state_reference(params, clipplane_state[i]); + } + + NIR_PASS_V(state.ir.nir, nir_lower_clip_vs, key->lower_ucp, + true, can_compact, clipplane_state); + NIR_PASS_V(state.ir.nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(state.ir.nir), true, false); + NIR_PASS_V(state.ir.nir, nir_lower_global_vars_to_local); + finalize = true; + } + + if (finalize || !st->allow_st_finalize_nir_twice) { + st_finalize_nir(st, &stvp->Base, stvp->shader_program, state.ir.nir, + true); + + /* Some of the lowering above may have introduced new varyings */ + nir_shader_gather_info(state.ir.nir, + nir_shader_get_entrypoint(state.ir.nir)); } - st_finalize_nir(st, &stvp->Base, stvp->shader_program, - vpv->tgsi.ir.nir); + if (ST_DEBUG & DEBUG_PRINT_IR) + nir_print_shader(state.ir.nir, stderr); + + if (key->is_draw_shader) + vpv->base.driver_shader = draw_create_vertex_shader(st->draw, &state); + else + vpv->base.driver_shader = pipe->create_vs_state(pipe, &state); - vpv->driver_shader = pipe->create_vs_state(pipe, &vpv->tgsi); - /* driver takes ownership of IR: */ - vpv->tgsi.ir.nir = NULL; return vpv; } + state.type = PIPE_SHADER_IR_TGSI; + state.tokens = tgsi_dup_tokens(stvp->state.tokens); + /* Emulate features. */ if (key->clamp_color || key->passthrough_edgeflags) { const struct tgsi_token *tokens; @@ -684,16 +736,14 @@ (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | (key->passthrough_edgeflags ? TGSI_EMU_PASSTHROUGH_EDGEFLAG : 0); - tokens = tgsi_emulate(vpv->tgsi.tokens, flags); + tokens = tgsi_emulate(state.tokens, flags); if (tokens) { - tgsi_free_tokens(vpv->tgsi.tokens); - vpv->tgsi.tokens = tokens; - - if (key->passthrough_edgeflags) - vpv->num_inputs++; - } else + tgsi_free_tokens(state.tokens); + state.tokens = tokens; + } else { fprintf(stderr, "mesa: cannot emulate deprecated features\n"); + } } if (key->lower_depth_clamp) { @@ -701,19 +751,25 @@ _mesa_add_state_reference(params, depth_range_state); const struct tgsi_token *tokens; - tokens = st_tgsi_lower_depth_clamp(vpv->tgsi.tokens, depth_range_const, + tokens = st_tgsi_lower_depth_clamp(state.tokens, depth_range_const, key->clip_negative_one_to_one); - if (tokens != vpv->tgsi.tokens) - tgsi_free_tokens(vpv->tgsi.tokens); - vpv->tgsi.tokens = tokens; + if (tokens != state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } - if (ST_DEBUG & DEBUG_TGSI) { - tgsi_dump(vpv->tgsi.tokens, 0); - debug_printf("\n"); + if (ST_DEBUG & DEBUG_PRINT_IR) + tgsi_dump(state.tokens, 0); + + if (key->is_draw_shader) + vpv->base.driver_shader = draw_create_vertex_shader(st->draw, &state); + else + vpv->base.driver_shader = pipe->create_vs_state(pipe, &state); + + if (state.tokens) { + tgsi_free_tokens(state.tokens); } - vpv->driver_shader = pipe->create_vs_state(pipe, &vpv->tgsi); return vpv; } @@ -721,15 +777,17 @@ /** * Find/create a vertex program variant. */ -struct st_vp_variant * +struct st_common_variant * st_get_vp_variant(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key) + struct st_program *stp, + const struct st_common_variant_key *key) { - struct st_vp_variant *vpv; + struct st_vertex_program *stvp = (struct st_vertex_program *)stp; + struct st_common_variant *vpv; /* Search for existing variant */ - for (vpv = stvp->variants; vpv; vpv = vpv->next) { + for (vpv = st_common_variant(stp->variants); vpv; + vpv = st_common_variant(vpv->base.next)) { if (memcmp(&vpv->key, key, sizeof(*key)) == 0) { break; } @@ -737,18 +795,21 @@ if (!vpv) { /* create now */ - vpv = st_create_vp_variant(st, stvp, key); + vpv = st_create_vp_variant(st, stp, key); if (vpv) { - for (unsigned index = 0; index < vpv->num_inputs; ++index) { - unsigned attr = stvp->index_to_input[index]; - if (attr == ST_DOUBLE_ATTRIB_PLACEHOLDER) - continue; - vpv->vert_attrib_mask |= 1u << attr; - } + vpv->base.st = key->st; + + unsigned num_inputs = stvp->num_inputs + key->passthrough_edgeflags; + for (unsigned index = 0; index < num_inputs; ++index) { + unsigned attr = stvp->index_to_input[index]; + if (attr == ST_DOUBLE_ATTRIB_PLACEHOLDER) + continue; + vpv->vert_attrib_mask |= 1u << attr; + } /* insert into list */ - vpv->next = stvp->variants; - stvp->variants = vpv; + vpv->base.next = stp->variants; + stp->variants = &vpv->base; } } @@ -761,36 +822,10 @@ */ bool st_translate_fragment_program(struct st_context *st, - struct st_fragment_program *stfp) + struct st_program *stfp) { - /* We have already compiled to NIR so just return */ - if (stfp->shader_program) { - st_store_ir_in_disk_cache(st, &stfp->Base, true); - return true; - } - - ubyte outputMapping[2 * FRAG_RESULT_MAX]; - ubyte inputMapping[VARYING_SLOT_MAX]; - ubyte inputSlotToAttr[VARYING_SLOT_MAX]; - ubyte interpMode[PIPE_MAX_SHADER_INPUTS]; /* XXX size? */ - GLuint attr; - GLbitfield64 inputsRead; - struct ureg_program *ureg; - - GLboolean write_all = GL_FALSE; - - ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; - ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; - uint fs_num_inputs = 0; - - ubyte fs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; - ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; - uint fs_num_outputs = 0; - - memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); - /* Non-GLSL programs: */ - if (!stfp->glsl_to_tgsi && !stfp->shader_program) { + if (!stfp->glsl_to_tgsi) { _mesa_remove_output_reads(&stfp->Base, PROGRAM_OUTPUT); if (st->ctx->Const.GLSLFragCoordIsSysVal) _mesa_program_fragment_position_to_sysval(&stfp->Base); @@ -814,25 +849,42 @@ stfp->affected_states |= ST_NEW_FS_SAMPLER_VIEWS | ST_NEW_FS_SAMPLERS; } + + /* Translate to NIR. */ + if (!stfp->ati_fs && + st->pipe->screen->get_shader_param(st->pipe->screen, + PIPE_SHADER_FRAGMENT, + PIPE_SHADER_CAP_PREFERRED_IR)) { + nir_shader *nir = + st_translate_prog_to_nir(st, &stfp->Base, MESA_SHADER_FRAGMENT); + + if (stfp->Base.nir) + ralloc_free(stfp->Base.nir); + stfp->state.type = PIPE_SHADER_IR_NIR; + stfp->Base.nir = nir; + return true; + } } + ubyte outputMapping[2 * FRAG_RESULT_MAX]; + ubyte inputMapping[VARYING_SLOT_MAX]; + ubyte inputSlotToAttr[VARYING_SLOT_MAX]; + ubyte interpMode[PIPE_MAX_SHADER_INPUTS]; /* XXX size? */ + GLuint attr; + GLbitfield64 inputsRead; + struct ureg_program *ureg; + + GLboolean write_all = GL_FALSE; + + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + uint fs_num_inputs = 0; - bool use_nir = PIPE_SHADER_IR_NIR == - st->pipe->screen->get_shader_param(st->pipe->screen, - PIPE_SHADER_FRAGMENT, - PIPE_SHADER_CAP_PREFERRED_IR); - - if (use_nir && !stfp->ati_fs) { - nir_shader *nir = - st_translate_prog_to_nir(st, &stfp->Base, MESA_SHADER_FRAGMENT); - - if (stfp->tgsi.ir.nir) - ralloc_free(stfp->tgsi.ir.nir); - stfp->tgsi.type = PIPE_SHADER_IR_NIR; - stfp->tgsi.ir.nir = nir; - stfp->Base.nir = nir; - return true; - } + ubyte fs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + uint fs_num_outputs = 0; + + memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); /* * Convert Mesa program inputs to TGSI input register semantics. @@ -1129,7 +1181,7 @@ fs_output_semantic_name, fs_output_semantic_index); - stfp->tgsi.tokens = ureg_get_tokens(ureg, &stfp->num_tgsi_tokens); + stfp->state.tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); if (stfp->glsl_to_tgsi) { @@ -1137,17 +1189,17 @@ st_store_ir_in_disk_cache(st, &stfp->Base, false); } - return stfp->tgsi.tokens != NULL; + return stfp->state.tokens != NULL; } static struct st_fp_variant * st_create_fp_variant(struct st_context *st, - struct st_fragment_program *stfp, + struct st_program *stfp, const struct st_fp_variant_key *key) { struct pipe_context *pipe = st->pipe; struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); - struct pipe_shader_state tgsi = {0}; + struct pipe_shader_state state = {0}; struct gl_program_parameter_list *params = stfp->Base.Parameters; static const gl_state_index16 texcoord_state[STATE_LENGTH] = { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 }; @@ -1155,21 +1207,45 @@ { STATE_INTERNAL, STATE_PT_SCALE }; static const gl_state_index16 bias_state[STATE_LENGTH] = { STATE_INTERNAL, STATE_PT_BIAS }; + static const gl_state_index16 alpha_ref_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_ALPHA_REF }; if (!variant) return NULL; - if (stfp->tgsi.type == PIPE_SHADER_IR_NIR) { - tgsi.type = PIPE_SHADER_IR_NIR; - tgsi.ir.nir = nir_shader_clone(NULL, stfp->tgsi.ir.nir); + if (stfp->state.type == PIPE_SHADER_IR_NIR) { + bool finalize = false; + + state.type = PIPE_SHADER_IR_NIR; + state.ir.nir = nir_shader_clone(NULL, stfp->Base.nir); + + if (key->clamp_color) { + NIR_PASS_V(state.ir.nir, nir_lower_clamp_color_outputs); + finalize = true; + } + + if (key->lower_flatshade) { + NIR_PASS_V(state.ir.nir, nir_lower_flatshade); + finalize = true; + } + + if (key->lower_alpha_func != COMPARE_FUNC_NEVER) { + _mesa_add_state_reference(params, alpha_ref_state); + NIR_PASS_V(state.ir.nir, nir_lower_alpha_test, key->lower_alpha_func, + false, alpha_ref_state); + finalize = true; + } - if (key->clamp_color) - NIR_PASS_V(tgsi.ir.nir, nir_lower_clamp_color_outputs); + if (key->lower_two_sided_color) { + NIR_PASS_V(state.ir.nir, nir_lower_two_sided_color); + finalize = true; + } if (key->persample_shading) { - nir_shader *shader = tgsi.ir.nir; + nir_shader *shader = state.ir.nir; nir_foreach_variable(var, &shader->inputs) var->data.sample = true; + finalize = true; } assert(!(key->bitmap && key->drawpixels)); @@ -1180,9 +1256,10 @@ variant->bitmap_sampler = ffs(~stfp->Base.SamplersUsed) - 1; options.sampler = variant->bitmap_sampler; - options.swizzle_xxxx = (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM); + options.swizzle_xxxx = st->bitmap.tex_format == PIPE_FORMAT_R8_UNORM; - NIR_PASS_V(tgsi.ir.nir, nir_lower_bitmap, &options); + NIR_PASS_V(state.ir.nir, nir_lower_bitmap, &options); + finalize = true; } /* glDrawPixels (color only) */ @@ -1215,12 +1292,17 @@ memcpy(options.texcoord_state_tokens, texcoord_state, sizeof(options.texcoord_state_tokens)); - NIR_PASS_V(tgsi.ir.nir, nir_lower_drawpixels, &options); + NIR_PASS_V(state.ir.nir, nir_lower_drawpixels, &options); + finalize = true; } if (unlikely(key->external.lower_nv12 || key->external.lower_iyuv || key->external.lower_xy_uxvx || key->external.lower_yx_xuxv || key->external.lower_ayuv || key->external.lower_xyuv)) { + + st_nir_lower_samplers(pipe->screen, state.ir.nir, + stfp->shader_program, &stfp->Base); + nir_lower_tex_options options = {0}; options.lower_y_uv_external = key->external.lower_nv12; options.lower_y_u_v_external = key->external.lower_iyuv; @@ -1228,41 +1310,56 @@ options.lower_yx_xuxv_external = key->external.lower_yx_xuxv; options.lower_ayuv_external = key->external.lower_ayuv; options.lower_xyuv_external = key->external.lower_xyuv; - NIR_PASS_V(tgsi.ir.nir, nir_lower_tex, &options); + NIR_PASS_V(state.ir.nir, nir_lower_tex, &options); + finalize = true; } - st_finalize_nir(st, &stfp->Base, stfp->shader_program, tgsi.ir.nir); + if (finalize || !st->allow_st_finalize_nir_twice) { + st_finalize_nir(st, &stfp->Base, stfp->shader_program, state.ir.nir, + false); + } + /* This pass needs to happen *after* nir_lower_sampler */ if (unlikely(key->external.lower_nv12 || key->external.lower_iyuv || - key->external.lower_xy_uxvx || key->external.lower_yx_xuxv)) { - /* This pass needs to happen *after* nir_lower_sampler */ - NIR_PASS_V(tgsi.ir.nir, st_nir_lower_tex_src_plane, + key->external.lower_xy_uxvx || key->external.lower_yx_xuxv || + key->external.lower_ayuv || key->external.lower_xyuv)) { + NIR_PASS_V(state.ir.nir, st_nir_lower_tex_src_plane, ~stfp->Base.SamplersUsed, key->external.lower_nv12 || key->external.lower_xy_uxvx || key->external.lower_yx_xuxv, key->external.lower_iyuv); + finalize = true; + } + + if (finalize || !st->allow_st_finalize_nir_twice) { + /* Some of the lowering above may have introduced new varyings */ + nir_shader_gather_info(state.ir.nir, + nir_shader_get_entrypoint(state.ir.nir)); + + struct pipe_screen *screen = pipe->screen; + if (screen->finalize_nir) + screen->finalize_nir(screen, state.ir.nir, false); } - /* Some of the lowering above may have introduced new varyings */ - nir_shader_gather_info(tgsi.ir.nir, - nir_shader_get_entrypoint(tgsi.ir.nir)); + if (ST_DEBUG & DEBUG_PRINT_IR) + nir_print_shader(state.ir.nir, stderr); - variant->driver_shader = pipe->create_fs_state(pipe, &tgsi); + variant->base.driver_shader = pipe->create_fs_state(pipe, &state); variant->key = *key; return variant; } - tgsi.tokens = stfp->tgsi.tokens; + state.tokens = stfp->state.tokens; assert(!(key->bitmap && key->drawpixels)); /* Fix texture targets and add fog for ATI_fs */ if (stfp->ati_fs) { - const struct tgsi_token *tokens = st_fixup_atifs(tgsi.tokens, key); + const struct tgsi_token *tokens = st_fixup_atifs(state.tokens, key); if (tokens) - tgsi.tokens = tokens; + state.tokens = tokens; else fprintf(stderr, "mesa: cannot post-process ATI_fs\n"); } @@ -1274,12 +1371,12 @@ (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | (key->persample_shading ? TGSI_EMU_FORCE_PERSAMPLE_INTERP : 0); - tokens = tgsi_emulate(tgsi.tokens, flags); + tokens = tgsi_emulate(state.tokens, flags); if (tokens) { - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); - tgsi.tokens = tokens; + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } else fprintf(stderr, "mesa: cannot emulate deprecated features\n"); } @@ -1290,17 +1387,17 @@ variant->bitmap_sampler = ffs(~stfp->Base.SamplersUsed) - 1; - tokens = st_get_bitmap_shader(tgsi.tokens, + tokens = st_get_bitmap_shader(state.tokens, st->internal_target, variant->bitmap_sampler, st->needs_texcoord_semantic, st->bitmap.tex_format == - PIPE_FORMAT_L8_UNORM); + PIPE_FORMAT_R8_UNORM); if (tokens) { - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); - tgsi.tokens = tokens; + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } else fprintf(stderr, "mesa: cannot create a shader for glBitmap\n"); } @@ -1327,7 +1424,7 @@ texcoord_const = _mesa_add_state_reference(params, texcoord_state); - tokens = st_get_drawpix_shader(tgsi.tokens, + tokens = st_get_drawpix_shader(state.tokens, st->needs_texcoord_semantic, key->scaleAndBias, scale_const, bias_const, key->pixelMaps, @@ -1336,9 +1433,9 @@ texcoord_const, st->internal_target); if (tokens) { - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); - tgsi.tokens = tokens; + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } else fprintf(stderr, "mesa: cannot create a shader for glDrawPixels\n"); } @@ -1350,16 +1447,16 @@ /* samplers inserted would conflict, but this should be unpossible: */ assert(!(key->bitmap || key->drawpixels)); - tokens = st_tgsi_lower_yuv(tgsi.tokens, + tokens = st_tgsi_lower_yuv(state.tokens, ~stfp->Base.SamplersUsed, key->external.lower_nv12 || key->external.lower_xy_uxvx || key->external.lower_yx_xuxv, key->external.lower_iyuv); if (tokens) { - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); - tgsi.tokens = tokens; + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } else { fprintf(stderr, "mesa: cannot create a shader for samplerExternalOES\n"); } @@ -1369,23 +1466,21 @@ unsigned depth_range_const = _mesa_add_state_reference(params, depth_range_state); const struct tgsi_token *tokens; - tokens = st_tgsi_lower_depth_clamp_fs(tgsi.tokens, depth_range_const); - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); - tgsi.tokens = tokens; + tokens = st_tgsi_lower_depth_clamp_fs(state.tokens, depth_range_const); + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); + state.tokens = tokens; } - if (ST_DEBUG & DEBUG_TGSI) { - tgsi_dump(tgsi.tokens, 0); - debug_printf("\n"); - } + if (ST_DEBUG & DEBUG_PRINT_IR) + tgsi_dump(state.tokens, 0); /* fill in variant */ - variant->driver_shader = pipe->create_fs_state(pipe, &tgsi); + variant->base.driver_shader = pipe->create_fs_state(pipe, &state); variant->key = *key; - if (tgsi.tokens != stfp->tgsi.tokens) - tgsi_free_tokens(tgsi.tokens); + if (state.tokens != stfp->state.tokens) + tgsi_free_tokens(state.tokens); return variant; } @@ -1394,13 +1489,14 @@ */ struct st_fp_variant * st_get_fp_variant(struct st_context *st, - struct st_fragment_program *stfp, + struct st_program *stfp, const struct st_fp_variant_key *key) { struct st_fp_variant *fpv; /* Search for existing variant */ - for (fpv = stfp->variants; fpv; fpv = fpv->next) { + for (fpv = st_fp_variant(stfp->variants); fpv; + fpv = st_fp_variant(fpv->base.next)) { if (memcmp(&fpv->key, key, sizeof(*key)) == 0) { break; } @@ -1410,6 +1506,8 @@ /* create new */ fpv = st_create_fp_variant(st, stfp, key); if (fpv) { + fpv->base.st = key->st; + if (key->bitmap || key->drawpixels) { /* Regular variants should always come before the * bitmap & drawpixels variants, (unless there @@ -1418,16 +1516,16 @@ * shader_has_one_variant is set. */ if (!stfp->variants) { - stfp->variants = fpv; + stfp->variants = &fpv->base; } else { /* insert into list after the first one */ - fpv->next = stfp->variants->next; - stfp->variants->next = fpv; + fpv->base.next = stfp->variants->next; + stfp->variants->next = &fpv->base; } } else { /* insert into list */ - fpv->next = stfp->variants; - stfp->variants = fpv; + fpv->base.next = stfp->variants; + stfp->variants = &fpv->base; } } } @@ -1435,19 +1533,65 @@ return fpv; } - /** * Translate a program. This is common code for geometry and tessellation * shaders. */ -static void -st_translate_program_common(struct st_context *st, - struct gl_program *prog, - struct glsl_to_tgsi_visitor *glsl_to_tgsi, - struct ureg_program *ureg, - unsigned tgsi_processor, - struct pipe_shader_state *out_state) +bool +st_translate_common_program(struct st_context *st, + struct st_program *stp) { + struct gl_program *prog = &stp->Base; + enum pipe_shader_type stage = + pipe_shader_type_from_mesa(stp->Base.info.stage); + struct ureg_program *ureg = ureg_create_with_screen(stage, st->pipe->screen); + + if (ureg == NULL) + return false; + + switch (stage) { + case PIPE_SHADER_TESS_CTRL: + ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, + stp->Base.info.tess.tcs_vertices_out); + break; + + case PIPE_SHADER_TESS_EVAL: + if (stp->Base.info.tess.primitive_mode == GL_ISOLINES) + ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES); + else + ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, + stp->Base.info.tess.primitive_mode); + + STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == + PIPE_TESS_SPACING_FRACTIONAL_ODD); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == + PIPE_TESS_SPACING_FRACTIONAL_EVEN); + + ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, + (stp->Base.info.tess.spacing + 1) % 3); + + ureg_property(ureg, TGSI_PROPERTY_TES_VERTEX_ORDER_CW, + !stp->Base.info.tess.ccw); + ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, + stp->Base.info.tess.point_mode); + break; + + case PIPE_SHADER_GEOMETRY: + ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, + stp->Base.info.gs.input_primitive); + ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, + stp->Base.info.gs.output_primitive); + ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, + stp->Base.info.gs.vertices_out); + ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, + stp->Base.info.gs.invocations); + break; + + default: + break; + } + ubyte inputSlotToAttr[VARYING_SLOT_TESS_MAX]; ubyte inputMapping[VARYING_SLOT_TESS_MAX]; ubyte outputMapping[VARYING_SLOT_TESS_MAX]; @@ -1466,7 +1610,7 @@ memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr)); memset(inputMapping, 0, sizeof(inputMapping)); memset(outputMapping, 0, sizeof(outputMapping)); - memset(out_state, 0, sizeof(*out_state)); + memset(&stp->state, 0, sizeof(stp->state)); if (prog->info.clip_distance_array_size) ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED, @@ -1544,9 +1688,9 @@ } st_translate_program(st->ctx, - tgsi_processor, + stage, ureg, - glsl_to_tgsi, + stp->glsl_to_tgsi, prog, /* inputs */ num_inputs, @@ -1561,105 +1705,19 @@ output_semantic_name, output_semantic_index); - if (tgsi_processor == PIPE_SHADER_COMPUTE) { - struct st_compute_program *stcp = (struct st_compute_program *) prog; - out_state->tokens = ureg_get_tokens(ureg, &stcp->num_tgsi_tokens); - stcp->tgsi.prog = out_state->tokens; - } else { - struct st_common_program *stcp = (struct st_common_program *) prog; - out_state->tokens = ureg_get_tokens(ureg, &stcp->num_tgsi_tokens); - } + stp->state.tokens = ureg_get_tokens(ureg, NULL); + ureg_destroy(ureg); - st_translate_stream_output_info(prog->sh.LinkedTransformFeedback, - outputMapping, - &out_state->stream_output); + st_translate_stream_output_info(prog); st_store_ir_in_disk_cache(st, prog, false); - if ((ST_DEBUG & DEBUG_TGSI) && (ST_DEBUG & DEBUG_MESA)) { + if (ST_DEBUG & DEBUG_PRINT_IR && ST_DEBUG & DEBUG_MESA) _mesa_print_program(prog); - debug_printf("\n"); - } - - if (ST_DEBUG & DEBUG_TGSI) { - tgsi_dump(out_state->tokens, 0); - debug_printf("\n"); - } -} - -/** - * Update stream-output info for GS/TCS/TES. Normally this is done in - * st_translate_program_common() but that is not called for glsl_to_nir - * case. - */ -static void -st_translate_program_stream_output(struct gl_program *prog, - struct pipe_stream_output_info *stream_output) -{ - if (!prog->sh.LinkedTransformFeedback) - return; - - ubyte outputMapping[VARYING_SLOT_TESS_MAX]; - GLuint attr; - uint num_outputs = 0; - - memset(outputMapping, 0, sizeof(outputMapping)); - - /* - * Determine number of outputs, the (default) output register - * mapping and the semantic information for each output. - */ - for (attr = 0; attr < VARYING_SLOT_MAX; attr++) { - if (prog->info.outputs_written & BITFIELD64_BIT(attr)) { - GLuint slot = num_outputs++; - - outputMapping[attr] = slot; - } - } - - st_translate_stream_output_info(prog->sh.LinkedTransformFeedback, - outputMapping, - stream_output); -} - -/** - * Translate a geometry program to create a new variant. - */ -bool -st_translate_geometry_program(struct st_context *st, - struct st_common_program *stgp) -{ - struct ureg_program *ureg; - - /* We have already compiled to NIR so just return */ - if (stgp->shader_program) { - /* No variants */ - st_finalize_nir(st, &stgp->Base, stgp->shader_program, - stgp->tgsi.ir.nir); - st_translate_program_stream_output(&stgp->Base, &stgp->tgsi.stream_output); - st_store_ir_in_disk_cache(st, &stgp->Base, true); - return true; - } - - ureg = ureg_create_with_screen(PIPE_SHADER_GEOMETRY, st->pipe->screen); - if (ureg == NULL) - return false; - ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, - stgp->Base.info.gs.input_primitive); - ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, - stgp->Base.info.gs.output_primitive); - ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, - stgp->Base.info.gs.vertices_out); - ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, - stgp->Base.info.gs.invocations); - - st_translate_program_common(st, &stgp->Base, stgp->glsl_to_tgsi, ureg, - PIPE_SHADER_GEOMETRY, &stgp->tgsi); - - free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi); - stgp->glsl_to_tgsi = NULL; + free_glsl_to_tgsi_visitor(stp->glsl_to_tgsi); + stp->glsl_to_tgsi = NULL; return true; } @@ -1667,37 +1725,46 @@ /** * Get/create a basic program variant. */ -struct st_basic_variant * -st_get_basic_variant(struct st_context *st, - unsigned pipe_shader, - struct st_common_program *prog, - const struct st_basic_variant_key *key) +struct st_variant * +st_get_common_variant(struct st_context *st, + struct st_program *prog, + const struct st_common_variant_key *key) { struct pipe_context *pipe = st->pipe; - struct st_basic_variant *v; - struct pipe_shader_state tgsi = {0}; + struct st_variant *v; + struct pipe_shader_state state = {0}; /* Search for existing variant */ for (v = prog->variants; v; v = v->next) { - if (memcmp(&v->key, key, sizeof(*key)) == 0) { + if (memcmp(&st_common_variant(v)->key, key, sizeof(*key)) == 0) break; - } } if (!v) { /* create new */ - v = CALLOC_STRUCT(st_basic_variant); + v = (struct st_variant*)CALLOC_STRUCT(st_common_variant); if (v) { + if (prog->state.type == PIPE_SHADER_IR_NIR) { + bool finalize = false; + + state.type = PIPE_SHADER_IR_NIR; + state.ir.nir = nir_shader_clone(NULL, prog->Base.nir); - if (prog->tgsi.type == PIPE_SHADER_IR_NIR) { - tgsi.type = PIPE_SHADER_IR_NIR; - tgsi.ir.nir = nir_shader_clone(NULL, prog->tgsi.ir.nir); + if (key->clamp_color) { + NIR_PASS_V(state.ir.nir, nir_lower_clamp_color_outputs); + finalize = true; + } + + state.stream_output = prog->state.stream_output; - if (key->clamp_color) - NIR_PASS_V(tgsi.ir.nir, nir_lower_clamp_color_outputs); + if (finalize || !st->allow_st_finalize_nir_twice) { + st_finalize_nir(st, &prog->Base, prog->shader_program, + state.ir.nir, true); + } - tgsi.stream_output = prog->tgsi.stream_output; - } else { + if (ST_DEBUG & DEBUG_PRINT_IR) + nir_print_shader(state.ir.nir, stderr); + } else { if (key->lower_depth_clamp) { struct gl_program_parameter_list *params = prog->Base.Parameters; @@ -1706,36 +1773,52 @@ const struct tgsi_token *tokens; tokens = - st_tgsi_lower_depth_clamp(prog->tgsi.tokens, + st_tgsi_lower_depth_clamp(prog->state.tokens, depth_range_const, key->clip_negative_one_to_one); - if (tokens != prog->tgsi.tokens) - tgsi_free_tokens(prog->tgsi.tokens); + if (tokens != prog->state.tokens) + tgsi_free_tokens(prog->state.tokens); - prog->tgsi.tokens = tokens; - prog->num_tgsi_tokens = tgsi_num_tokens(tokens); + prog->state.tokens = tokens; } - tgsi = prog->tgsi; + state = prog->state; + + if (ST_DEBUG & DEBUG_PRINT_IR) + tgsi_dump(state.tokens, 0); } /* fill in new variant */ - switch (pipe_shader) { - case PIPE_SHADER_TESS_CTRL: - v->driver_shader = pipe->create_tcs_state(pipe, &tgsi); - break; - case PIPE_SHADER_TESS_EVAL: - v->driver_shader = pipe->create_tes_state(pipe, &tgsi); - break; - case PIPE_SHADER_GEOMETRY: - v->driver_shader = pipe->create_gs_state(pipe, &tgsi); + switch (prog->Base.info.stage) { + case MESA_SHADER_TESS_CTRL: + v->driver_shader = pipe->create_tcs_state(pipe, &state); + break; + case MESA_SHADER_TESS_EVAL: + v->driver_shader = pipe->create_tes_state(pipe, &state); + break; + case MESA_SHADER_GEOMETRY: + v->driver_shader = pipe->create_gs_state(pipe, &state); + break; + case MESA_SHADER_COMPUTE: { + struct pipe_compute_state cs = {0}; + cs.ir_type = state.type; + cs.req_local_mem = prog->Base.info.cs.shared_size; + + if (state.type == PIPE_SHADER_IR_NIR) + cs.prog = state.ir.nir; + else + cs.prog = state.tokens; + + v->driver_shader = pipe->create_compute_state(pipe, &cs); break; + } default: assert(!"unhandled shader type"); free(v); return NULL; } - v->key = *key; + st_common_variant(v)->key = *key; + v->st = key->st; /* insert into list */ v->next = prog->variants; @@ -1748,174 +1831,6 @@ /** - * Translate a tessellation control program to create a new variant. - */ -bool -st_translate_tessctrl_program(struct st_context *st, - struct st_common_program *sttcp) -{ - struct ureg_program *ureg; - - /* We have already compiled to NIR so just return */ - if (sttcp->shader_program) { - /* No variants */ - st_finalize_nir(st, &sttcp->Base, sttcp->shader_program, - sttcp->tgsi.ir.nir); - st_store_ir_in_disk_cache(st, &sttcp->Base, true); - return true; - } - - ureg = ureg_create_with_screen(PIPE_SHADER_TESS_CTRL, st->pipe->screen); - if (ureg == NULL) - return false; - - ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, - sttcp->Base.info.tess.tcs_vertices_out); - - st_translate_program_common(st, &sttcp->Base, sttcp->glsl_to_tgsi, ureg, - PIPE_SHADER_TESS_CTRL, &sttcp->tgsi); - - free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi); - sttcp->glsl_to_tgsi = NULL; - return true; -} - - -/** - * Translate a tessellation evaluation program to create a new variant. - */ -bool -st_translate_tesseval_program(struct st_context *st, - struct st_common_program *sttep) -{ - struct ureg_program *ureg; - - /* We have already compiled to NIR so just return */ - if (sttep->shader_program) { - /* No variants */ - st_finalize_nir(st, &sttep->Base, sttep->shader_program, - sttep->tgsi.ir.nir); - st_translate_program_stream_output(&sttep->Base, &sttep->tgsi.stream_output); - st_store_ir_in_disk_cache(st, &sttep->Base, true); - return true; - } - - ureg = ureg_create_with_screen(PIPE_SHADER_TESS_EVAL, st->pipe->screen); - if (ureg == NULL) - return false; - - if (sttep->Base.info.tess.primitive_mode == GL_ISOLINES) - ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES); - else - ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, - sttep->Base.info.tess.primitive_mode); - - STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_ODD); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_EVEN); - - ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, - (sttep->Base.info.tess.spacing + 1) % 3); - - ureg_property(ureg, TGSI_PROPERTY_TES_VERTEX_ORDER_CW, - !sttep->Base.info.tess.ccw); - ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, - sttep->Base.info.tess.point_mode); - - st_translate_program_common(st, &sttep->Base, sttep->glsl_to_tgsi, - ureg, PIPE_SHADER_TESS_EVAL, &sttep->tgsi); - - free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi); - sttep->glsl_to_tgsi = NULL; - return true; -} - - -/** - * Translate a compute program to create a new variant. - */ -bool -st_translate_compute_program(struct st_context *st, - struct st_compute_program *stcp) -{ - struct ureg_program *ureg; - struct pipe_shader_state prog; - - stcp->tgsi.req_local_mem = stcp->Base.info.cs.shared_size; - - if (stcp->shader_program) { - /* no compute variants: */ - st_finalize_nir(st, &stcp->Base, stcp->shader_program, - (struct nir_shader *) stcp->tgsi.prog); - st_store_ir_in_disk_cache(st, &stcp->Base, true); - return true; - } - - ureg = ureg_create_with_screen(PIPE_SHADER_COMPUTE, st->pipe->screen); - if (ureg == NULL) - return false; - - st_translate_program_common(st, &stcp->Base, stcp->glsl_to_tgsi, ureg, - PIPE_SHADER_COMPUTE, &prog); - - stcp->tgsi.ir_type = PIPE_SHADER_IR_TGSI; - stcp->tgsi.req_private_mem = 0; - stcp->tgsi.req_input_mem = 0; - - free_glsl_to_tgsi_visitor(stcp->glsl_to_tgsi); - stcp->glsl_to_tgsi = NULL; - return true; -} - - -/** - * Get/create compute program variant. - */ -struct st_basic_variant * -st_get_cp_variant(struct st_context *st, - struct pipe_compute_state *tgsi, - struct st_basic_variant **variants) -{ - struct pipe_context *pipe = st->pipe; - struct st_basic_variant *v; - struct st_basic_variant_key key; - - /* use memset, not an initializer to be sure all memory is zeroed */ - memset(&key, 0, sizeof(key)); - - key.st = st->has_shareable_shaders ? NULL : st; - - /* Search for existing variant */ - for (v = *variants; v; v = v->next) { - if (memcmp(&v->key, &key, sizeof(key)) == 0) { - break; - } - } - - if (!v) { - /* create new */ - v = CALLOC_STRUCT(st_basic_variant); - if (v) { - /* fill in new variant */ - struct pipe_compute_state cs = *tgsi; - if (tgsi->ir_type == PIPE_SHADER_IR_NIR) - cs.prog = nir_shader_clone(NULL, tgsi->prog); - v->driver_shader = pipe->create_compute_state(pipe, &cs); - v->key = key; - - /* insert into list */ - v->next = *variants; - *variants = v; - } - } - - return v; -} - - -/** * Vert/Geom/Frag programs have per-context variants. Free all the * variants attached to the given program which match the given context. */ @@ -1925,78 +1840,27 @@ if (!target || target == &_mesa_DummyProgram) return; - switch (target->Target) { - case GL_VERTEX_PROGRAM_ARB: - { - struct st_vertex_program *stvp = (struct st_vertex_program *) target; - struct st_vp_variant *vpv, **prevPtr = &stvp->variants; - - for (vpv = stvp->variants; vpv; ) { - struct st_vp_variant *next = vpv->next; - if (vpv->key.st == st) { - /* unlink from list */ - *prevPtr = next; - /* destroy this variant */ - delete_vp_variant(st, vpv); - } - else { - prevPtr = &vpv->next; - } - vpv = next; - } - } - break; - case GL_FRAGMENT_PROGRAM_ARB: - { - struct st_fragment_program *stfp = - (struct st_fragment_program *) target; - struct st_fp_variant *fpv, **prevPtr = &stfp->variants; - - for (fpv = stfp->variants; fpv; ) { - struct st_fp_variant *next = fpv->next; - if (fpv->key.st == st) { - /* unlink from list */ - *prevPtr = next; - /* destroy this variant */ - delete_fp_variant(st, fpv); - } - else { - prevPtr = &fpv->next; - } - fpv = next; + struct st_program *p = st_program(target); + struct st_variant *v, **prevPtr = &p->variants; + bool unbound = false; + + for (v = p->variants; v; ) { + struct st_variant *next = v->next; + if (v->st == st) { + if (!unbound) { + st_unbind_program(st, p); + unbound = true; } + + /* unlink from list */ + *prevPtr = next; + /* destroy this variant */ + delete_variant(st, v, target->Target); } - break; - case GL_GEOMETRY_PROGRAM_NV: - case GL_TESS_CONTROL_PROGRAM_NV: - case GL_TESS_EVALUATION_PROGRAM_NV: - case GL_COMPUTE_PROGRAM_NV: - { - struct st_common_program *p = st_common_program(target); - struct st_compute_program *cp = (struct st_compute_program*)target; - struct st_basic_variant **variants = - target->Target == GL_COMPUTE_PROGRAM_NV ? &cp->variants : - &p->variants; - struct st_basic_variant *v, **prevPtr = variants; - - for (v = *variants; v; ) { - struct st_basic_variant *next = v->next; - if (v->key.st == st) { - /* unlink from list */ - *prevPtr = next; - /* destroy this variant */ - delete_basic_variant(st, v, target->Target); - } - else { - prevPtr = &v->next; - } - v = next; - } + else { + prevPtr = &v->next; } - break; - default: - _mesa_problem(NULL, "Unexpected program target 0x%x in " - "destroy_program_variants_cb()", target->Target); + v = next; } } @@ -2074,39 +1938,16 @@ /** - * For debugging, print/dump the current vertex program. - */ -void -st_print_current_vertex_program(void) -{ - GET_CURRENT_CONTEXT(ctx); - - if (ctx->VertexProgram._Current) { - struct st_vertex_program *stvp = - (struct st_vertex_program *) ctx->VertexProgram._Current; - struct st_vp_variant *stv; - - debug_printf("Vertex program %u\n", stvp->Base.Id); - - for (stv = stvp->variants; stv; stv = stv->next) { - debug_printf("variant %p\n", stv); - tgsi_dump(stv->tgsi.tokens, 0); - } - } -} - - -/** * Compile one shader variant. */ -void +static void st_precompile_shader_variant(struct st_context *st, struct gl_program *prog) { switch (prog->Target) { case GL_VERTEX_PROGRAM_ARB: { - struct st_vertex_program *p = (struct st_vertex_program *)prog; - struct st_vp_variant_key key; + struct st_program *p = (struct st_program *)prog; + struct st_common_variant_key key; memset(&key, 0, sizeof(key)); @@ -2115,57 +1956,51 @@ break; } - case GL_TESS_CONTROL_PROGRAM_NV: { - struct st_common_program *p = st_common_program(prog); - struct st_basic_variant_key key; + case GL_FRAGMENT_PROGRAM_ARB: { + struct st_program *p = (struct st_program *)prog; + struct st_fp_variant_key key; memset(&key, 0, sizeof(key)); key.st = st->has_shareable_shaders ? NULL : st; - st_get_basic_variant(st, PIPE_SHADER_TESS_CTRL, p, &key); + st_get_fp_variant(st, p, &key); break; } - case GL_TESS_EVALUATION_PROGRAM_NV: { - struct st_common_program *p = st_common_program(prog); - struct st_basic_variant_key key; + case GL_TESS_CONTROL_PROGRAM_NV: + case GL_TESS_EVALUATION_PROGRAM_NV: + case GL_GEOMETRY_PROGRAM_NV: + case GL_COMPUTE_PROGRAM_NV: { + struct st_program *p = st_program(prog); + struct st_common_variant_key key; memset(&key, 0, sizeof(key)); key.st = st->has_shareable_shaders ? NULL : st; - st_get_basic_variant(st, PIPE_SHADER_TESS_EVAL, p, &key); + st_get_common_variant(st, p, &key); break; } - case GL_GEOMETRY_PROGRAM_NV: { - struct st_common_program *p = st_common_program(prog); - struct st_basic_variant_key key; - - memset(&key, 0, sizeof(key)); - - key.st = st->has_shareable_shaders ? NULL : st; - st_get_basic_variant(st, PIPE_SHADER_GEOMETRY, p, &key); - break; + default: + assert(0); } +} - case GL_FRAGMENT_PROGRAM_ARB: { - struct st_fragment_program *p = (struct st_fragment_program *)prog; - struct st_fp_variant_key key; - - memset(&key, 0, sizeof(key)); - - key.st = st->has_shareable_shaders ? NULL : st; - st_get_fp_variant(st, p, &key); - break; +void +st_finalize_program(struct st_context *st, struct gl_program *prog) +{ + if (st->current_program[prog->info.stage] == prog) { + if (prog->info.stage == MESA_SHADER_VERTEX) + st->dirty |= ST_NEW_VERTEX_PROGRAM(st, (struct st_program *)prog); + else + st->dirty |= ((struct st_program *)prog)->affected_states; } - case GL_COMPUTE_PROGRAM_NV: { - struct st_compute_program *p = (struct st_compute_program *)prog; - st_get_cp_variant(st, &p->tgsi, &p->variants); - break; - } + if (prog->nir) + nir_sweep(prog->nir); - default: - assert(0); - } + /* Create Gallium shaders now instead of on demand. */ + if (ST_DEBUG & DEBUG_PRECOMPILE || + st->shader_has_one_variant[prog->info.stage]) + st_precompile_shader_variant(st, prog); } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_program.h mesa-20.0.8/src/mesa/state_tracker/st_program.h --- mesa-19.2.8/src/mesa/state_tracker/st_program.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_program.h 2020-06-12 01:21:18.000000000 +0000 @@ -71,9 +71,15 @@ unsigned unit = u_bit_scan(&mask); struct st_texture_object *stObj = st_get_texture_object(st->ctx, prog, unit); + enum pipe_format format = st_get_view_format(stObj); - switch (st_get_view_format(stObj)) { + /* if resource format matches then YUV wasn't lowered */ + if (format == stObj->pt->format) + continue; + + switch (format) { case PIPE_FORMAT_NV12: + case PIPE_FORMAT_P010: case PIPE_FORMAT_P016: key.lower_nv12 |= (1 << unit); break; @@ -93,7 +99,8 @@ key.lower_xyuv |= (1 << unit); break; default: - printf("unhandled %u\n", st_get_view_format(stObj)); + printf("mesa: st_get_external_sampler_key: unhandled pipe format %u\n", + format); break; } } @@ -126,60 +133,53 @@ /** for ARB_depth_clamp */ GLuint lower_depth_clamp:1; + /** for OpenGL 1.0 on modern hardware */ + GLuint lower_two_sided_color:1; + + GLuint lower_flatshade:1; + unsigned lower_alpha_func:3; + /** needed for ATI_fragment_shader */ char texture_targets[MAX_NUM_FRAGMENT_REGISTERS_ATI]; struct st_external_sampler_key external; }; +/** + * Base class for shader variants. + */ +struct st_variant +{ + /** next in linked list */ + struct st_variant *next; + + /** st_context from the shader key */ + struct st_context *st; + + void *driver_shader; +}; /** * Variant of a fragment program. */ struct st_fp_variant { + struct st_variant base; + /** Parameters which generated this version of fragment program */ struct st_fp_variant_key key; - /** Driver's compiled shader */ - void *driver_shader; - /** For glBitmap variants */ uint bitmap_sampler; /** For glDrawPixels variants */ unsigned drawpix_sampler; unsigned pixelmap_sampler; - - /** next in linked list */ - struct st_fp_variant *next; }; -/** - * Derived from Mesa gl_program: - */ -struct st_fragment_program -{ - struct gl_program Base; - struct pipe_shader_state tgsi; - struct glsl_to_tgsi_visitor* glsl_to_tgsi; - struct ati_fragment_shader *ati_fs; - uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */ - - /* used when bypassing glsl_to_tgsi: */ - struct gl_shader_program *shader_program; - - struct st_fp_variant *variants; - - /* Used by the shader cache and ARB_get_program_binary */ - unsigned num_tgsi_tokens; -}; - - - -/** Vertex program variant key */ -struct st_vp_variant_key +/** Shader key shared by other shaders */ +struct st_common_variant_key { struct st_context *st; /**< variants are per-context */ bool passthrough_edgeflags; @@ -190,216 +190,96 @@ /** both for ARB_depth_clamp */ bool lower_depth_clamp; bool clip_negative_one_to_one; -}; + /** lower glPointSize to gl_PointSize */ + boolean lower_point_size; -/** - * This represents a vertex program, especially translated to match - * the inputs of a particular fragment shader. - */ -struct st_vp_variant -{ - /* Parameters which generated this translated version of a vertex - * shader: - */ - struct st_vp_variant_key key; + /* for user-defined clip-planes */ + uint8_t lower_ucp; - /** - * TGSI tokens (to later generate a 'draw' module shader for - * selection/feedback/rasterpos) + /* Whether st_variant::driver_shader is for the draw module, + * not for the driver. */ - struct pipe_shader_state tgsi; - - /** Driver's compiled shader */ - void *driver_shader; - - /** For using our private draw module (glRasterPos) */ - struct draw_vertex_shader *draw_shader; - - /** Next in linked list */ - struct st_vp_variant *next; - - /** similar to that in st_vertex_program, but with edgeflags info too */ - GLuint num_inputs; - - /** Bitfield of VERT_BIT_* bits of mesa vertex processing inputs */ - GLbitfield vert_attrib_mask; + bool is_draw_shader; }; /** - * Derived from Mesa gl_program: + * Common shader variant. */ -struct st_vertex_program +struct st_common_variant { - struct gl_program Base; /**< The Mesa vertex program */ - struct pipe_shader_state tgsi; - struct glsl_to_tgsi_visitor* glsl_to_tgsi; - uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */ - - /* used when bypassing glsl_to_tgsi: */ - struct gl_shader_program *shader_program; + struct st_variant base; - /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */ - ubyte index_to_input[PIPE_MAX_ATTRIBS]; - ubyte num_inputs; - /** Reverse mapping of the above */ - ubyte input_to_index[VERT_ATTRIB_MAX]; - - /** Maps VARYING_SLOT_x to slot */ - ubyte result_to_output[VARYING_SLOT_MAX]; - - /** List of translated variants of this vertex program. - */ - struct st_vp_variant *variants; - - /** SHA1 hash of linked tgsi shader program, used for on-disk cache */ - unsigned char sha1[20]; - - /* Used by the shader cache and ARB_get_program_binary */ - unsigned num_tgsi_tokens; -}; - - - -/** Key shared by all shaders except VP, FP */ -struct st_basic_variant_key -{ - struct st_context *st; /**< variants are per-context */ - - /** For compat profile */ - bool clamp_color; - - /** both for ARB_depth_clamp */ - bool lower_depth_clamp; - bool clip_negative_one_to_one; - -}; - - -/** - * Geometry program variant. - */ -struct st_basic_variant -{ /* Parameters which generated this variant. */ - struct st_basic_variant_key key; + struct st_common_variant_key key; - void *driver_shader; - - struct st_basic_variant *next; + /* Bitfield of VERT_BIT_* bits matching vertex shader inputs, + * but not include the high part of doubles. + */ + GLbitfield vert_attrib_mask; }; /** * Derived from Mesa gl_program: */ -struct st_common_program +struct st_program { struct gl_program Base; - struct pipe_shader_state tgsi; - struct glsl_to_tgsi_visitor* glsl_to_tgsi; - uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */ - - /* used when bypassing glsl_to_tgsi: */ - struct gl_shader_program *shader_program; - - struct st_basic_variant *variants; - - /** SHA1 hash of linked tgsi shader program, used for on-disk cache */ - unsigned char sha1[20]; - - /* Used by the shader cache and ARB_get_program_binary */ - unsigned num_tgsi_tokens; -}; - - -/** - * Derived from Mesa gl_program: - */ -struct st_compute_program -{ - struct gl_program Base; /**< The Mesa compute program */ - struct pipe_compute_state tgsi; + struct pipe_shader_state state; struct glsl_to_tgsi_visitor* glsl_to_tgsi; + struct ati_fragment_shader *ati_fs; uint64_t affected_states; /**< ST_NEW_* flags to mark dirty when binding */ /* used when bypassing glsl_to_tgsi: */ struct gl_shader_program *shader_program; - struct st_basic_variant *variants; - - /** SHA1 hash of linked tgsi shader program, used for on-disk cache */ - unsigned char sha1[20]; - - /* Used by the shader cache and ARB_get_program_binary */ - unsigned num_tgsi_tokens; + struct st_variant *variants; }; -static inline struct st_fragment_program * -st_fragment_program( struct gl_program *fp ) +struct st_vertex_program { - return (struct st_fragment_program *)fp; -} - + struct st_program Base; -static inline struct st_vertex_program * -st_vertex_program( struct gl_program *vp ) -{ - return (struct st_vertex_program *)vp; -} + /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */ + ubyte index_to_input[PIPE_MAX_ATTRIBS]; + ubyte num_inputs; + /** Reverse mapping of the above */ + ubyte input_to_index[VERT_ATTRIB_MAX]; -static inline struct st_common_program * -st_common_program( struct gl_program *gp ) -{ - return (struct st_common_program *)gp; -} + /** Maps VARYING_SLOT_x to slot */ + ubyte result_to_output[VARYING_SLOT_MAX]; +}; -static inline struct st_compute_program * -st_compute_program( struct gl_program *cp ) -{ - return (struct st_compute_program *)cp; -} -static inline void -st_reference_vertprog(struct st_context *st, - struct st_vertex_program **ptr, - struct st_vertex_program *prog) +static inline struct st_program * +st_program( struct gl_program *cp ) { - _mesa_reference_program(st->ctx, - (struct gl_program **) ptr, - (struct gl_program *) prog); + return (struct st_program *)cp; } static inline void -st_reference_fragprog(struct st_context *st, - struct st_fragment_program **ptr, - struct st_fragment_program *prog) +st_reference_prog(struct st_context *st, + struct st_program **ptr, + struct st_program *prog) { _mesa_reference_program(st->ctx, (struct gl_program **) ptr, (struct gl_program *) prog); } -static inline void -st_reference_prog(struct st_context *st, - struct st_common_program **ptr, - struct st_common_program *prog) +static inline struct st_common_variant * +st_common_variant(struct st_variant *v) { - _mesa_reference_program(st->ctx, - (struct gl_program **) ptr, - (struct gl_program *) prog); + return (struct st_common_variant*)v; } -static inline void -st_reference_compprog(struct st_context *st, - struct st_compute_program **ptr, - struct st_compute_program *prog) +static inline struct st_fp_variant * +st_fp_variant(struct st_variant *v) { - _mesa_reference_program(st->ctx, - (struct gl_program **) ptr, - (struct gl_program *) prog); + return (struct st_fp_variant*)v; } /** @@ -415,78 +295,54 @@ extern void st_set_prog_affected_state_flags(struct gl_program *prog); -extern struct st_vp_variant * +extern struct st_common_variant * st_get_vp_variant(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key); + struct st_program *stvp, + const struct st_common_variant_key *key); extern struct st_fp_variant * st_get_fp_variant(struct st_context *st, - struct st_fragment_program *stfp, + struct st_program *stfp, const struct st_fp_variant_key *key); -extern struct st_basic_variant * -st_get_cp_variant(struct st_context *st, - struct pipe_compute_state *tgsi, - struct st_basic_variant **variants); - -extern struct st_basic_variant * -st_get_basic_variant(struct st_context *st, - unsigned pipe_shader, - struct st_common_program *p, - const struct st_basic_variant_key *key); +extern struct st_variant * +st_get_common_variant(struct st_context *st, + struct st_program *p, + const struct st_common_variant_key *key); extern void -st_release_vp_variants( struct st_context *st, - struct st_vertex_program *stvp ); +st_release_variants(struct st_context *st, struct st_program *p); extern void -st_release_fp_variants( struct st_context *st, - struct st_fragment_program *stfp ); +st_release_program(struct st_context *st, struct st_program **p); extern void -st_release_cp_variants(struct st_context *st, - struct st_compute_program *stcp); +st_destroy_program_variants(struct st_context *st); extern void -st_release_basic_variants(struct st_context *st, GLenum target, - struct st_basic_variant **variants, - struct pipe_shader_state *tgsi); +st_finalize_nir_before_variants(struct nir_shader *nir); extern void -st_destroy_program_variants(struct st_context *st); +st_prepare_vertex_program(struct st_program *stvp); + +extern void +st_translate_stream_output_info(struct gl_program *prog); extern bool st_translate_vertex_program(struct st_context *st, - struct st_vertex_program *stvp); + struct st_program *stvp); extern bool st_translate_fragment_program(struct st_context *st, - struct st_fragment_program *stfp); - -extern bool -st_translate_geometry_program(struct st_context *st, - struct st_common_program *stgp); - -extern bool -st_translate_tessctrl_program(struct st_context *st, - struct st_common_program *sttcp); + struct st_program *stfp); extern bool -st_translate_tesseval_program(struct st_context *st, - struct st_common_program *sttep); - -extern bool -st_translate_compute_program(struct st_context *st, - struct st_compute_program *stcp); - -extern void -st_print_current_vertex_program(void); +st_translate_common_program(struct st_context *st, + struct st_program *stp); extern void -st_precompile_shader_variant(struct st_context *st, - struct gl_program *prog); +st_finalize_program(struct st_context *st, struct gl_program *prog); #ifdef __cplusplus } diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_sampler_view.c mesa-20.0.8/src/mesa/state_tracker/st_sampler_view.c --- mesa-19.2.8/src/mesa/state_tracker/st_sampler_view.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_sampler_view.c 2020-06-12 01:21:18.000000000 +0000 @@ -24,7 +24,7 @@ */ #include "pipe/p_context.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_inlines.h" #include "main/context.h" @@ -489,12 +489,17 @@ if (srgb_skip_decode) format = util_format_linear(format); + /* if resource format matches then YUV wasn't lowered */ + if (format == stObj->pt->format) + return format; + /* Use R8_UNORM for video formats */ switch (format) { case PIPE_FORMAT_NV12: case PIPE_FORMAT_IYUV: format = PIPE_FORMAT_R8_UNORM; break; + case PIPE_FORMAT_P010: case PIPE_FORMAT_P016: format = PIPE_FORMAT_R16_UNORM; break; @@ -527,13 +532,13 @@ templ.format = format; - if (stObj->level_override) { + if (stObj->level_override >= 0) { templ.u.tex.first_level = templ.u.tex.last_level = stObj->level_override; } else { templ.u.tex.first_level = stObj->base.MinLevel + stObj->base.BaseLevel; templ.u.tex.last_level = last_level(stObj); } - if (stObj->layer_override) { + if (stObj->layer_override >= 0) { templ.u.tex.first_layer = templ.u.tex.last_layer = stObj->layer_override; } else { templ.u.tex.first_layer = stObj->base.MinLayer; @@ -578,12 +583,12 @@ assert(!check_sampler_swizzle(st, stObj, view, glsl130_or_later)); assert(get_sampler_view_format(st, stObj, srgb_skip_decode) == view->format); assert(gl_target_to_pipe(stObj->base.Target) == view->target); - assert(stObj->level_override || + assert(stObj->level_override >= 0 || stObj->base.MinLevel + stObj->base.BaseLevel == view->u.tex.first_level); - assert(stObj->level_override || last_level(stObj) == view->u.tex.last_level); - assert(stObj->layer_override || stObj->base.MinLayer == view->u.tex.first_layer); - assert(stObj->layer_override || last_layer(stObj) == view->u.tex.last_layer); - assert(!stObj->layer_override || + assert(stObj->level_override >= 0 || last_level(stObj) == view->u.tex.last_level); + assert(stObj->layer_override >= 0 || stObj->base.MinLayer == view->u.tex.first_layer); + assert(stObj->layer_override >= 0 || last_layer(stObj) == view->u.tex.last_layer); + assert(stObj->layer_override < 0 || (stObj->layer_override == view->u.tex.first_layer && stObj->layer_override == view->u.tex.last_layer)); return view; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_shader_cache.c mesa-20.0.8/src/mesa/state_tracker/st_shader_cache.c --- mesa-19.2.8/src/mesa/state_tracker/st_shader_cache.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_shader_cache.c 2020-06-12 01:21:18.000000000 +0000 @@ -31,6 +31,7 @@ #include "compiler/nir/nir_serialize.h" #include "pipe/p_shader_tokens.h" #include "program/ir_to_mesa.h" +#include "tgsi/tgsi_parse.h" #include "util/u_memory.h" void @@ -41,10 +42,15 @@ static void write_stream_out_to_cache(struct blob *blob, - struct pipe_shader_state *tgsi) + struct pipe_shader_state *state) { - blob_write_bytes(blob, &tgsi->stream_output, - sizeof(tgsi->stream_output)); + blob_write_uint32(blob, state->stream_output.num_outputs); + if (state->stream_output.num_outputs) { + blob_write_bytes(blob, &state->stream_output.stride, + sizeof(state->stream_output.stride)); + blob_write_bytes(blob, &state->stream_output.output, + sizeof(state->stream_output.output)); + } } static void @@ -57,8 +63,10 @@ static void write_tgsi_to_cache(struct blob *blob, const struct tgsi_token *tokens, - struct gl_program *prog, unsigned num_tokens) + struct gl_program *prog) { + unsigned num_tokens = tgsi_num_tokens(tokens); + blob_write_uint32(blob, num_tokens); blob_write_bytes(blob, tokens, num_tokens * sizeof(struct tgsi_token)); copy_blob_to_driver_cache_blob(blob, prog); @@ -67,7 +75,7 @@ static void write_nir_to_cache(struct blob *blob, struct gl_program *prog) { - nir_serialize(blob, prog->nir); + nir_serialize(blob, prog->nir, false); copy_blob_to_driver_cache_blob(blob, prog); } @@ -78,12 +86,12 @@ if (prog->driver_cache_blob) return; + struct st_program *stp = (struct st_program *)prog; struct blob blob; blob_init(&blob); - switch (prog->info.stage) { - case MESA_SHADER_VERTEX: { - struct st_vertex_program *stvp = (struct st_vertex_program *) prog; + if (prog->info.stage == MESA_SHADER_VERTEX) { + struct st_vertex_program *stvp = (struct st_vertex_program *)stp; blob_write_uint32(&blob, stvp->num_inputs); blob_write_bytes(&blob, stvp->index_to_input, @@ -92,59 +100,23 @@ sizeof(stvp->input_to_index)); blob_write_bytes(&blob, stvp->result_to_output, sizeof(stvp->result_to_output)); - - write_stream_out_to_cache(&blob, &stvp->tgsi); - - if (nir) - write_nir_to_cache(&blob, prog); - else - write_tgsi_to_cache(&blob, stvp->tgsi.tokens, prog, - stvp->num_tgsi_tokens); - break; - } - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: { - struct st_common_program *stcp = (struct st_common_program *) prog; - - write_stream_out_to_cache(&blob, &stcp->tgsi); - - if (nir) - write_nir_to_cache(&blob, prog); - else - write_tgsi_to_cache(&blob, stcp->tgsi.tokens, prog, - stcp->num_tgsi_tokens); - break; - } - case MESA_SHADER_FRAGMENT: { - struct st_fragment_program *stfp = (struct st_fragment_program *) prog; - - if (nir) - write_nir_to_cache(&blob, prog); - else - write_tgsi_to_cache(&blob, stfp->tgsi.tokens, prog, - stfp->num_tgsi_tokens); - break; } - case MESA_SHADER_COMPUTE: { - struct st_compute_program *stcp = (struct st_compute_program *) prog; - if (nir) - write_nir_to_cache(&blob, prog); - else - write_tgsi_to_cache(&blob, stcp->tgsi.prog, prog, - stcp->num_tgsi_tokens); - break; - } - default: - unreachable("Unsupported stage"); - } + if (prog->info.stage == MESA_SHADER_VERTEX || + prog->info.stage == MESA_SHADER_TESS_EVAL || + prog->info.stage == MESA_SHADER_GEOMETRY) + write_stream_out_to_cache(&blob, &stp->state); + + if (nir) + write_nir_to_cache(&blob, prog); + else + write_tgsi_to_cache(&blob, stp->state.tokens, prog); blob_finish(&blob); } /** - * Store tgsi and any other required state in on-disk shader cache. + * Store TGSI or NIR and any other required state in on-disk shader cache. */ void st_store_ir_in_disk_cache(struct st_context *st, struct gl_program *prog, @@ -170,19 +142,24 @@ static void read_stream_out_from_cache(struct blob_reader *blob_reader, - struct pipe_shader_state *tgsi) + struct pipe_shader_state *state) { - blob_copy_bytes(blob_reader, (uint8_t *) &tgsi->stream_output, - sizeof(tgsi->stream_output)); + memset(&state->stream_output, 0, sizeof(state->stream_output)); + state->stream_output.num_outputs = blob_read_uint32(blob_reader); + if (state->stream_output.num_outputs) { + blob_copy_bytes(blob_reader, &state->stream_output.stride, + sizeof(state->stream_output.stride)); + blob_copy_bytes(blob_reader, &state->stream_output.output, + sizeof(state->stream_output.output)); + } } static void read_tgsi_from_cache(struct blob_reader *blob_reader, - const struct tgsi_token **tokens, - unsigned *num_tokens) + const struct tgsi_token **tokens) { - *num_tokens = blob_read_uint32(blob_reader); - unsigned tokens_size = *num_tokens * sizeof(struct tgsi_token); + unsigned num_tokens = blob_read_uint32(blob_reader); + unsigned tokens_size = num_tokens * sizeof(struct tgsi_token); *tokens = (const struct tgsi_token*) MALLOC(tokens_size); blob_copy_bytes(blob_reader, (uint8_t *) *tokens, tokens_size); } @@ -198,17 +175,19 @@ const struct nir_shader_compiler_options *options = ctx->Const.ShaderCompilerOptions[prog->info.stage].NirOptions; + st_set_prog_affected_state_flags(prog); + _mesa_associate_uniform_storage(ctx, shProg, prog); + assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0); + struct st_program *stp = st_program(prog); struct blob_reader blob_reader; blob_reader_init(&blob_reader, buffer, size); - switch (prog->info.stage) { - case MESA_SHADER_VERTEX: { - struct st_vertex_program *stvp = (struct st_vertex_program *) prog; - - st_release_vp_variants(st, stvp); + st_release_variants(st, stp); + if (prog->info.stage == MESA_SHADER_VERTEX) { + struct st_vertex_program *stvp = (struct st_vertex_program *)stp; stvp->num_inputs = blob_read_uint32(&blob_reader); blob_copy_bytes(&blob_reader, (uint8_t *) stvp->index_to_input, sizeof(stvp->index_to_input)); @@ -216,140 +195,19 @@ sizeof(stvp->input_to_index)); blob_copy_bytes(&blob_reader, (uint8_t *) stvp->result_to_output, sizeof(stvp->result_to_output)); - - read_stream_out_from_cache(&blob_reader, &stvp->tgsi); - - if (nir) { - stvp->tgsi.type = PIPE_SHADER_IR_NIR; - stvp->shader_program = shProg; - stvp->tgsi.ir.nir = nir_deserialize(NULL, options, &blob_reader); - prog->nir = stvp->tgsi.ir.nir; - } else { - read_tgsi_from_cache(&blob_reader, &stvp->tgsi.tokens, - &stvp->num_tgsi_tokens); - } - - if (st->vp == stvp) - st->dirty |= ST_NEW_VERTEX_PROGRAM(st, stvp); - - break; } - case MESA_SHADER_TESS_CTRL: { - struct st_common_program *sttcp = st_common_program(prog); - - st_release_basic_variants(st, sttcp->Base.Target, - &sttcp->variants, &sttcp->tgsi); - - read_stream_out_from_cache(&blob_reader, &sttcp->tgsi); - if (nir) { - sttcp->tgsi.type = PIPE_SHADER_IR_NIR; - sttcp->shader_program = shProg; - sttcp->tgsi.ir.nir = nir_deserialize(NULL, options, &blob_reader); - prog->nir = sttcp->tgsi.ir.nir; - } else { - read_tgsi_from_cache(&blob_reader, &sttcp->tgsi.tokens, - &sttcp->num_tgsi_tokens); - } - - if (st->tcp == sttcp) - st->dirty |= sttcp->affected_states; - - break; - } - case MESA_SHADER_TESS_EVAL: { - struct st_common_program *sttep = st_common_program(prog); - - st_release_basic_variants(st, sttep->Base.Target, - &sttep->variants, &sttep->tgsi); - - read_stream_out_from_cache(&blob_reader, &sttep->tgsi); - - if (nir) { - sttep->tgsi.type = PIPE_SHADER_IR_NIR; - sttep->shader_program = shProg; - sttep->tgsi.ir.nir = nir_deserialize(NULL, options, &blob_reader); - prog->nir = sttep->tgsi.ir.nir; - } else { - read_tgsi_from_cache(&blob_reader, &sttep->tgsi.tokens, - &sttep->num_tgsi_tokens); - } - - if (st->tep == sttep) - st->dirty |= sttep->affected_states; - - break; - } - case MESA_SHADER_GEOMETRY: { - struct st_common_program *stgp = st_common_program(prog); - - st_release_basic_variants(st, stgp->Base.Target, &stgp->variants, - &stgp->tgsi); - - read_stream_out_from_cache(&blob_reader, &stgp->tgsi); - - if (nir) { - stgp->tgsi.type = PIPE_SHADER_IR_NIR; - stgp->shader_program = shProg; - stgp->tgsi.ir.nir = nir_deserialize(NULL, options, &blob_reader); - prog->nir = stgp->tgsi.ir.nir; - } else { - read_tgsi_from_cache(&blob_reader, &stgp->tgsi.tokens, - &stgp->num_tgsi_tokens); - } - - if (st->gp == stgp) - st->dirty |= stgp->affected_states; - - break; - } - case MESA_SHADER_FRAGMENT: { - struct st_fragment_program *stfp = (struct st_fragment_program *) prog; - - st_release_fp_variants(st, stfp); - - if (nir) { - stfp->tgsi.type = PIPE_SHADER_IR_NIR; - stfp->shader_program = shProg; - stfp->tgsi.ir.nir = nir_deserialize(NULL, options, &blob_reader); - prog->nir = stfp->tgsi.ir.nir; - } else { - read_tgsi_from_cache(&blob_reader, &stfp->tgsi.tokens, - &stfp->num_tgsi_tokens); - } - - if (st->fp == stfp) - st->dirty |= stfp->affected_states; - - break; - } - case MESA_SHADER_COMPUTE: { - struct st_compute_program *stcp = (struct st_compute_program *) prog; - - st_release_cp_variants(st, stcp); - - if (nir) { - stcp->tgsi.ir_type = PIPE_SHADER_IR_NIR; - stcp->shader_program = shProg; - stcp->tgsi.prog = nir_deserialize(NULL, options, &blob_reader); - prog->nir = (nir_shader *) stcp->tgsi.prog; - } else { - read_tgsi_from_cache(&blob_reader, - (const struct tgsi_token**) &stcp->tgsi.prog, - &stcp->num_tgsi_tokens); - } - - stcp->tgsi.req_local_mem = stcp->Base.info.cs.shared_size; - stcp->tgsi.req_private_mem = 0; - stcp->tgsi.req_input_mem = 0; - - if (st->cp == stcp) - st->dirty |= stcp->affected_states; - - break; - } - default: - unreachable("Unsupported stage"); + if (prog->info.stage == MESA_SHADER_VERTEX || + prog->info.stage == MESA_SHADER_TESS_EVAL || + prog->info.stage == MESA_SHADER_GEOMETRY) + read_stream_out_from_cache(&blob_reader, &stp->state); + + if (nir) { + stp->state.type = PIPE_SHADER_IR_NIR; + stp->shader_program = shProg; + prog->nir = nir_deserialize(NULL, options, &blob_reader); + } else { + read_tgsi_from_cache(&blob_reader, &stp->state.tokens); } /* Make sure we don't try to read more data than we wrote. This should @@ -365,13 +223,7 @@ } } - st_set_prog_affected_state_flags(prog); - _mesa_associate_uniform_storage(ctx, shProg, prog); - - /* Create Gallium shaders now instead of on demand. */ - if (ST_DEBUG & DEBUG_PRECOMPILE || - st->shader_has_one_variant[prog->info.stage]) - st_precompile_shader_variant(st, prog); + st_finalize_program(st, prog); } bool @@ -383,7 +235,7 @@ return false; /* If we didn't load the GLSL metadata from cache then we could not have - * loaded the tgsi either. + * loaded TGSI or NIR either. */ if (prog->data->LinkStatus != LINKING_SKIPPED) return false; diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_shader_cache.h mesa-20.0.8/src/mesa/state_tracker/st_shader_cache.h --- mesa-19.2.8/src/mesa/state_tracker/st_shader_cache.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_shader_cache.h 2020-06-12 01:21:18.000000000 +0000 @@ -25,9 +25,9 @@ #define ST_SHADER_CACHE_H #include "st_context.h" -#include "compiler/blob.h" #include "main/mtypes.h" #include "pipe/p_state.h" +#include "util/blob.h" #include "util/disk_cache.h" #include "util/mesa-sha1.h" diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_texture.c mesa-20.0.8/src/mesa/state_tracker/st_texture.c --- mesa-19.2.8/src/mesa/state_tracker/st_texture.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_texture.c 2020-06-12 01:21:18.000000000 +0000 @@ -37,7 +37,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "util/u_inlines.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include "util/u_rect.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -416,7 +416,7 @@ /* find an RGBA texture format */ format = st_choose_format(st, GL_RGBA, GL_NONE, GL_NONE, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW, - FALSE); + false, false); /* create texture for color map/table */ pt = st_texture_create(st, PIPE_TEXTURE_2D, format, 0, diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_texture.h mesa-20.0.8/src/mesa/state_tracker/st_texture.h --- mesa-19.2.8/src/mesa/state_tracker/st_texture.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_texture.h 2020-06-12 01:21:18.000000000 +0000 @@ -164,15 +164,15 @@ */ enum pipe_format surface_format; - /* When non-zero, samplers should use this level instead of the level + /* When non-negative, samplers should use this level instead of the level * range specified by the GL state. * * This is used for EGL images, which may correspond to a single level out * of an imported pipe_resources with multiple mip levels. */ - uint level_override; + int level_override; - /* When non-zero, samplers should use this layer instead of the one + /* When non-negative, samplers should use this layer instead of the one * specified by the GL state. * * This is used for EGL images and VDPAU interop, where imported @@ -180,7 +180,7 @@ * with different fields in the case of VDPAU) even though the GL state * describes one non-array texture per field. */ - uint layer_override; + int layer_override; /** * Set when the texture images of this texture object might not all be in @@ -323,6 +323,9 @@ st_destroy_bound_image_handles(struct st_context *st); bool +st_astc_format_fallback(const struct st_context *st, mesa_format format); + +bool st_compressed_format_fallback(struct st_context *st, mesa_format format); void diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_tgsi_lower_yuv.c mesa-20.0.8/src/mesa/state_tracker/st_tgsi_lower_yuv.c --- mesa-19.2.8/src/mesa/state_tracker/st_tgsi_lower_yuv.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_tgsi_lower_yuv.c 2020-06-12 01:21:18.000000000 +0000 @@ -26,7 +26,6 @@ #include "st_tgsi_lower_yuv.h" #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_scan.h" -#include "tgsi/tgsi_dump.h" #include "util/u_debug.h" #include "util/bitscan.h" @@ -192,10 +191,10 @@ */ /* ITU-R BT.601 conversion */ - emit_immed(tctx, 0, 1.164, 0.000, 1.596, 0.0); - emit_immed(tctx, 1, 1.164, -0.392, -0.813, 0.0); - emit_immed(tctx, 2, 1.164, 2.017, 0.000, 0.0); - emit_immed(tctx, 3, 0.0625, 0.500, 0.500, 1.0); + emit_immed(tctx, 0, 1.164f, 0.000f, 1.596f, 0.0f); + emit_immed(tctx, 1, 1.164f, -0.392f, -0.813f, 0.0f); + emit_immed(tctx, 2, 1.164f, 2.017f, 0.000f, 0.0f); + emit_immed(tctx, 3, 0.0625f, 0.500f, 0.500f, 1.0f); /* * Declare extra samplers / sampler-views: diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_util.h mesa-20.0.8/src/mesa/state_tracker/st_util.h --- mesa-19.2.8/src/mesa/state_tracker/st_util.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_util.h 2020-06-12 01:21:18.000000000 +0000 @@ -71,7 +71,7 @@ static inline GLuint st_fb_orientation(const struct gl_framebuffer *fb) { - if (fb && _mesa_is_winsys_fbo(fb)) { + if (fb && fb->FlipY) { /* Drawing into a window (on-screen buffer). * * Negate Y scale to flip image vertically. @@ -101,6 +101,39 @@ ctx->Transform.ClipPlanesEnabled; } +static inline bool +st_point_size_per_vertex(struct gl_context *ctx) +{ + const struct gl_program *vertProg = ctx->VertexProgram._Current; + if (vertProg) { + if (vertProg->Id == 0) { + if (vertProg->info.outputs_written & + BITFIELD64_BIT(VARYING_SLOT_PSIZ)) { + /* generated program which emits point size */ + return true; + } + } + else if (ctx->API != API_OPENGLES2) { + /* PointSizeEnabled is always set in ES2 contexts */ + return ctx->VertexProgram.PointSizeEnabled; + } + else { + /* ST_NEW_TESSEVAL_PROGRAM | ST_NEW_GEOMETRY_PROGRAM */ + /* We have to check the last bound stage and see if it writes psize */ + struct gl_program *last = NULL; + if (ctx->GeometryProgram._Current) + last = ctx->GeometryProgram._Current; + else if (ctx->TessEvalProgram._Current) + last = ctx->TessEvalProgram._Current; + else if (ctx->VertexProgram._Current) + last = ctx->VertexProgram._Current; + if (last) + return !!(last->info.outputs_written & + BITFIELD64_BIT(VARYING_SLOT_PSIZ)); + } + } + return false; +} /** clear-alloc a struct-sized object, with casting */ #define ST_CALLOC_STRUCT(T) (struct T *) calloc(1, sizeof(struct T)) diff -Nru mesa-19.2.8/src/mesa/state_tracker/st_vdpau.c mesa-20.0.8/src/mesa/state_tracker/st_vdpau.c --- mesa-19.2.8/src/mesa/state_tracker/st_vdpau.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/st_vdpau.c 2020-06-12 01:21:18.000000000 +0000 @@ -191,7 +191,7 @@ struct pipe_resource *res; mesa_format texFormat; - uint layer_override = 0; + int layer_override = -1; if (output) { res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface); @@ -248,7 +248,7 @@ pipe_resource_reference(&stImage->pt, res); stObj->surface_format = res->format; - stObj->level_override = 0; + stObj->level_override = -1; stObj->layer_override = layer_override; _mesa_dirty_texobj(ctx, texObj); @@ -269,8 +269,8 @@ st_texture_release_all_sampler_views(st, stObj); pipe_resource_reference(&stImage->pt, NULL); - stObj->level_override = 0; - stObj->layer_override = 0; + stObj->level_override = -1; + stObj->layer_override = -1; _mesa_dirty_texobj(ctx, texObj); diff -Nru mesa-19.2.8/src/mesa/state_tracker/tests/st_format.c mesa-20.0.8/src/mesa/state_tracker/tests/st_format.c --- mesa-19.2.8/src/mesa/state_tracker/tests/st_format.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/state_tracker/tests/st_format.c 2020-06-12 01:21:18.000000000 +0000 @@ -32,7 +32,7 @@ #include "state_tracker/st_context.h" #include "state_tracker/st_format.h" #include "state_tracker/st_texture.h" -#include "util/u_format.h" +#include "util/format/u_format.h" #include static bool @@ -56,6 +56,10 @@ }; struct st_context local_st = { .pipe = &pctx, + .has_etc1 = true, + .has_etc2 = true, + .has_astc_2d_ldr = true, + .has_astc_5x5_ldr = true, }; struct st_context *st = &local_st; @@ -63,10 +67,12 @@ /* test all Mesa formats */ for (i = 1; i < MESA_FORMAT_COUNT; i++) { + if (!_mesa_get_format_name(i)) + continue; + enum pipe_format pf; - if (st_compressed_format_fallback(st, i)) - continue; + assert(!st_compressed_format_fallback(st, i)); pf = st_mesa_format_to_pipe_format(st, i); if (pf != PIPE_FORMAT_NONE) { @@ -77,24 +83,28 @@ _mesa_get_format_name(mf)); return 1; } - } - } - /* Test all Gallium formats */ - for (i = 1; i < PIPE_FORMAT_COUNT; i++) { - mesa_format mf = st_pipe_format_to_mesa_format(i); - if (st_compressed_format_fallback(st, mf)) - continue; + const struct util_format_description *desc = util_format_description(i); - if (mf != MESA_FORMAT_NONE) { - enum pipe_format pf = - st_mesa_format_to_pipe_format(st, mf); - if (pf != i) { - fprintf(stderr, "Round-tripping %s -> %s -> %s failed\n", - util_format_short_name(i), - _mesa_get_format_name(pf), - util_format_short_name(pf)); - return 1; + /* Make sure that gallium and Mesa agree on whether the format is an + * array format. + */ + if (desc->nr_channels > 1) { + bool mesa_array = (_mesa_get_format_layout(mf) == + MESA_FORMAT_LAYOUT_ARRAY); + bool gallium_array = desc->is_array && !desc->is_bitmask; + /* We should probably be checking equality here, but we have some + * UINT and SINT types that are array formats in Mesa but not in + * gallium. + */ + if (gallium_array && !mesa_array) { + fprintf(stderr, "%s is %sarray, %s is %sarray\n", + util_format_short_name(i), + gallium_array ? "" : "not ", + _mesa_get_format_name(mf), + mesa_array ? "" : "not "); + return 1; + } } } } diff -Nru mesa-19.2.8/src/mesa/swrast/s_renderbuffer.c mesa-20.0.8/src/mesa/swrast/s_renderbuffer.c --- mesa-19.2.8/src/mesa/swrast/s_renderbuffer.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/swrast/s_renderbuffer.c 2020-06-12 01:21:18.000000000 +0000 @@ -76,14 +76,13 @@ case GL_RGBA4: case GL_RGB5_A1: case GL_RGBA8: -#if 1 case GL_RGB10_A2: case GL_RGBA12: +#if UTIL_ARCH_LITTLE_ENDIAN + rb->Format = MESA_FORMAT_R8G8B8A8_UNORM; +#else + rb->Format = MESA_FORMAT_A8B8G8R8_UNORM; #endif - if (_mesa_little_endian()) - rb->Format = MESA_FORMAT_R8G8B8A8_UNORM; - else - rb->Format = MESA_FORMAT_A8B8G8R8_UNORM; break; case GL_RGBA16: case GL_RGBA16_SNORM: diff -Nru mesa-19.2.8/src/mesa/swrast/s_texfetch.c mesa-20.0.8/src/mesa/swrast/s_texfetch.c --- mesa-19.2.8/src/mesa/swrast/s_texfetch.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/swrast/s_texfetch.c 2020-06-12 01:21:18.000000000 +0000 @@ -134,8 +134,6 @@ FETCH_FUNCS(B8G8R8X8_UNORM), FETCH_FUNCS(A8R8G8B8_UNORM), FETCH_FUNCS(X8R8G8B8_UNORM), - FETCH_FUNCS(L16A16_UNORM), - FETCH_FUNCS(A16L16_UNORM), FETCH_FUNCS(B5G6R5_UNORM), FETCH_FUNCS(R5G6B5_UNORM), FETCH_FUNCS(B4G4R4A4_UNORM), @@ -143,14 +141,8 @@ FETCH_FUNCS(A1B5G5R5_UNORM), FETCH_FUNCS(B5G5R5A1_UNORM), FETCH_FUNCS(A1R5G5B5_UNORM), - FETCH_FUNCS(L8A8_UNORM), - FETCH_FUNCS(A8L8_UNORM), - FETCH_FUNCS(R8G8_UNORM), - FETCH_FUNCS(G8R8_UNORM), FETCH_FUNCS(L4A4_UNORM), FETCH_FUNCS(B2G3R3_UNORM), - FETCH_FUNCS(R16G16_UNORM), - FETCH_FUNCS(G16R16_UNORM), FETCH_FUNCS(B10G10R10A2_UNORM), FETCH_FUNCS(R10G10B10A2_UNORM), @@ -175,10 +167,14 @@ FETCH_FUNCS(A_UNORM16), FETCH_FUNCS(L_UNORM8), FETCH_FUNCS(L_UNORM16), + FETCH_FUNCS(LA_UNORM8), + FETCH_FUNCS(LA_UNORM16), FETCH_FUNCS(I_UNORM8), FETCH_FUNCS(I_UNORM16), FETCH_FUNCS(R_UNORM8), FETCH_FUNCS(R_UNORM16), + FETCH_FUNCS(RG_UNORM8), + FETCH_FUNCS(RG_UNORM16), FETCH_FUNCS(BGR_UNORM8), FETCH_FUNCS(RGB_UNORM8), FETCH_FUNCS(RGBA_UNORM16), @@ -190,21 +186,20 @@ FETCH_FUNCS(A8B8G8R8_SNORM), FETCH_FUNCS(X8B8G8R8_SNORM), FETCH_FUNCS(R8G8B8A8_SNORM), - FETCH_FUNCS(R16G16_SNORM), - FETCH_FUNCS(R8G8_SNORM), - FETCH_FUNCS(L8A8_SNORM), - FETCH_FUNCS(A8L8_SNORM), /* Array signed/normalized formats */ FETCH_FUNCS(A_SNORM8), FETCH_FUNCS(A_SNORM16), FETCH_FUNCS(L_SNORM8), FETCH_FUNCS(L_SNORM16), + FETCH_FUNCS(LA_SNORM8), + FETCH_FUNCS(LA_SNORM16), FETCH_FUNCS(I_SNORM8), FETCH_FUNCS(I_SNORM16), FETCH_FUNCS(R_SNORM8), FETCH_FUNCS(R_SNORM16), - FETCH_FUNCS(LA_SNORM16), + FETCH_FUNCS(RG_SNORM8), + FETCH_FUNCS(RG_SNORM16), FETCH_FUNCS(RGB_SNORM16), FETCH_FUNCS(RGBA_SNORM16), @@ -215,12 +210,11 @@ FETCH_FUNCS(R8G8B8A8_SRGB), FETCH_FUNCS(R8G8B8X8_SRGB), FETCH_FUNCS(X8B8G8R8_SRGB), - FETCH_FUNCS(L8A8_SRGB), - FETCH_FUNCS(A8L8_SRGB), /* Array sRGB formats */ FETCH_FUNCS(R_SRGB8), FETCH_FUNCS(L_SRGB8), + FETCH_FUNCS(LA_SRGB8), FETCH_FUNCS(BGR_SRGB8), /* Packed float formats */ @@ -256,7 +250,6 @@ /* Packed signed/unsigned non-normalized integer formats */ /* Array signed/unsigned non-normalized integer formats */ - FETCH_FUNCS(RGBA_UINT8), FETCH_FUNCS(RGBA_UINT16), FETCH_FUNCS(RGBA_UINT32), FETCH_FUNCS(RGBA_SINT8), diff -Nru mesa-19.2.8/src/mesa/swrast/s_texfetch_tmp.h mesa-20.0.8/src/mesa/swrast/s_texfetch_tmp.h --- mesa-19.2.8/src/mesa/swrast/s_texfetch_tmp.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/swrast/s_texfetch_tmp.h 2020-06-12 01:21:18.000000000 +0000 @@ -128,22 +128,18 @@ FETCH_RGBA(A1R5G5B5_UNORM, GLushort, 1) FETCH_RGBA(B10G10R10A2_UNORM, GLuint, 1) FETCH_RGBA(R10G10B10A2_UNORM, GLuint, 1) -FETCH_RGBA(R8G8_UNORM, GLushort, 1) -FETCH_RGBA(G8R8_UNORM, GLushort, 1) +FETCH_RGBA(RG_UNORM8, GLubyte, 2) FETCH_RGBA(L4A4_UNORM, GLubyte, 1) -FETCH_RGBA(L8A8_UNORM, GLushort, 1) FETCH_RGBA(R_UNORM8, GLubyte, 1) FETCH_RGBA(R_UNORM16, GLushort, 1) -FETCH_RGBA(A8L8_UNORM, GLushort, 1) -FETCH_RGBA(R16G16_UNORM, GLuint, 1) -FETCH_RGBA(G16R16_UNORM, GLuint, 1) -FETCH_RGBA(L16A16_UNORM, GLuint, 1) -FETCH_RGBA(A16L16_UNORM, GLuint, 1) +FETCH_RGBA(LA_UNORM8, GLubyte, 2) +FETCH_RGBA(RG_UNORM16, GLushort, 2) FETCH_RGBA(B2G3R3_UNORM, GLubyte, 1) FETCH_RGBA(A_UNORM8, GLubyte, 1) FETCH_RGBA(A_UNORM16, GLushort, 1) FETCH_RGBA(L_UNORM8, GLubyte, 1) FETCH_RGBA(L_UNORM16, GLushort, 1) +FETCH_RGBA(LA_UNORM16, GLushort, 2) FETCH_RGBA(I_UNORM8, GLubyte, 1) FETCH_RGBA(I_UNORM16, GLushort, 1) FETCH_RGBA(BGR_SRGB8, GLubyte, 3) @@ -155,21 +151,18 @@ FETCH_RGBA(X8B8G8R8_SRGB, GLuint, 1) FETCH_RGBA(R_SRGB8, GLubyte, 1) FETCH_RGBA(L_SRGB8, GLubyte, 1) -FETCH_RGBA(L8A8_SRGB, GLushort, 1) -FETCH_RGBA(A8L8_SRGB, GLushort, 2) +FETCH_RGBA(LA_SRGB8, GLubyte, 2) FETCH_RGBA(RGBA_SINT8, GLbyte, 4) FETCH_RGBA(RGBA_SINT16, GLshort, 4) FETCH_RGBA(RGBA_SINT32, GLint, 4) -FETCH_RGBA(RGBA_UINT8, GLubyte, 4) FETCH_RGBA(RGBA_UINT16, GLushort, 4) FETCH_RGBA(RGBA_UINT32, GLuint, 4) FETCH_RGBA(R_SNORM8, GLbyte, 1) FETCH_RGBA(A_SNORM8, GLbyte, 1) FETCH_RGBA(L_SNORM8, GLbyte, 1) FETCH_RGBA(I_SNORM8, GLbyte, 1) -FETCH_RGBA(R8G8_SNORM, GLshort, 1) -FETCH_RGBA(L8A8_SNORM, GLshort, 1) -FETCH_RGBA(A8L8_SNORM, GLshort, 1) +FETCH_RGBA(LA_SNORM8, GLbyte, 2) +FETCH_RGBA(RG_SNORM8, GLbyte, 2) FETCH_RGBA(X8B8G8R8_SNORM, GLint, 1) FETCH_RGBA(A8B8G8R8_SNORM, GLint, 1) FETCH_RGBA(R8G8B8A8_SNORM, GLint, 1) @@ -177,7 +170,7 @@ FETCH_RGBA(A_SNORM16, GLshort, 1) FETCH_RGBA(L_SNORM16, GLshort, 1) FETCH_RGBA(I_SNORM16, GLshort, 1) -FETCH_RGBA(R16G16_SNORM, GLshort, 2) +FETCH_RGBA(RG_SNORM16, GLshort, 2) FETCH_RGBA(LA_SNORM16, GLshort, 2) FETCH_RGBA(RGB_SNORM16, GLshort, 3) FETCH_RGBA(RGBA_SNORM16, GLshort, 4) diff -Nru mesa-19.2.8/src/mesa/swrast/s_triangle.c mesa-20.0.8/src/mesa/swrast/s_triangle.c --- mesa-19.2.8/src/mesa/swrast/s_triangle.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/swrast/s_triangle.c 2020-06-12 01:21:18.000000000 +0000 @@ -1108,7 +1108,7 @@ #if CHAN_BITS != 8 USE(general_triangle); #else - if (format == MESA_FORMAT_A8B8G8R8_UNORM && !_mesa_little_endian()) { + if (format == MESA_FORMAT_A8B8G8R8_UNORM && !UTIL_ARCH_LITTLE_ENDIAN) { /* We only handle RGBA8888 correctly on little endian * in the optimized code above. */ diff -Nru mesa-19.2.8/src/mesa/x86/common_x86_features.h mesa-20.0.8/src/mesa/x86/common_x86_features.h --- mesa-19.2.8/src/mesa/x86/common_x86_features.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/mesa/x86/common_x86_features.h 2020-06-12 01:21:18.000000000 +0000 @@ -67,13 +67,13 @@ #define cpu_has_mmxext (_mesa_x86_cpu_features & X86_FEATURE_MMXEXT) -#ifdef __SSE__ +#if defined(__SSE__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) || defined(_M_X64) #define cpu_has_xmm 1 #else #define cpu_has_xmm (_mesa_x86_cpu_features & X86_FEATURE_XMM) #endif -#ifdef __SSE2__ +#if defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64) #define cpu_has_xmm2 1 #else #define cpu_has_xmm2 (_mesa_x86_cpu_features & X86_FEATURE_XMM2) diff -Nru mesa-19.2.8/src/meson.build mesa-20.0.8/src/meson.build --- mesa-19.2.8/src/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -26,6 +26,7 @@ inc_gallium = include_directories('gallium/include') inc_gallium_aux = include_directories('gallium/auxiliary') inc_amd_common = include_directories('amd/common') +inc_amd_common_llvm = include_directories('amd/llvm') libglsl_util = static_library( 'glsl_util', @@ -47,6 +48,11 @@ ) subdir('gtest') +if cc.get_id() == 'msvc' + subdir('getopt') +else + idep_getopt = null_dep +endif subdir('util') subdir('mapi') # TODO: opengl @@ -54,7 +60,9 @@ if with_tools.contains('drm-shim') subdir('drm-shim') endif -subdir('imgui') +if with_imgui + subdir('imgui') +endif if with_platform_wayland subdir('egl/wayland/wayland-drm') endif @@ -106,22 +114,12 @@ # This must be after at least mesa, glx, and gallium, since libgl will be # defined in one of those subdirs depending on the glx provider. -if with_glx != 'disabled' and (not with_glvnd or not glvnd_has_headers_and_pc_files) - # If using glvnd the pkg-config header should not point to GL_mesa, it should - # point to GL. glvnd is only available on unix like platforms so adding -l - # should be safe here - # TODO: in the glvnd case glvnd itself should really be providing this. - if not glvnd_has_headers_and_pc_files - _gl = '-L${libdir} -lGL' - else - _gl = libgl - endif - +if with_glx != 'disabled' and not with_glvnd pkg.generate( name : 'gl', description : 'Mesa OpenGL Library', version : meson.project_version(), - libraries : _gl, + libraries : libgl, libraries_private : gl_priv_libs, requires_private : gl_priv_reqs, variables : ['glx_tls=yes'], diff -Nru mesa-19.2.8/src/panfrost/Android.bifrost.mk mesa-20.0.8/src/panfrost/Android.bifrost.mk --- mesa-19.2.8/src/panfrost/Android.bifrost.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.bifrost.mk 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,50 @@ +# Copyright © 2019 Collabora Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +include $(CLEAR_VARS) + +LOCAL_MODULE := libpanfrost_bifrost + +LOCAL_SRC_FILES := \ + $(bifrost_FILES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/include \ + $(MESA_TOP)/src/compiler/nir/ \ + $(MESA_TOP)/src/gallium/auxiliary/ \ + $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/mapi/ \ + $(MESA_TOP)/src/mesa/ \ + $(MESA_TOP)/src/panfrost/bifrost/ \ + $(MESA_TOP)/src/panfrost/include/ + +LOCAL_STATIC_LIBRARIES := \ + libmesa_glsl \ + libmesa_nir \ + libmesa_st_mesa + +LOCAL_GENERATED_SOURCES := \ + $(MESA_GEN_GLSL_H) + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/panfrost/bifrost/ \ + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) \ No newline at end of file diff -Nru mesa-19.2.8/src/panfrost/Android.encoder.mk mesa-20.0.8/src/panfrost/Android.encoder.mk --- mesa-19.2.8/src/panfrost/Android.encoder.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.encoder.mk 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,42 @@ +# Copyright © 2019 Collabora Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +include $(CLEAR_VARS) + +LOCAL_MODULE := libpanfrost_encoder + +LOCAL_SRC_FILES := \ + $(encoder_FILES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/gallium/auxiliary/ \ + $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/panfrost/encoder/ \ + $(MESA_TOP)/src/panfrost/include/ + +LOCAL_STATIC_LIBRARIES := \ + +LOCAL_GENERATED_SOURCES := \ + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/panfrost/encoder/ \ + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) \ No newline at end of file diff -Nru mesa-19.2.8/src/panfrost/Android.midgard.mk mesa-20.0.8/src/panfrost/Android.midgard.mk --- mesa-19.2.8/src/panfrost/Android.midgard.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.midgard.mk 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,61 @@ +# Copyright © 2019 Collabora Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +include $(CLEAR_VARS) + +LOCAL_MODULE := libpanfrost_midgard +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +intermediates := $(call local-generated-sources-dir) + +LOCAL_SRC_FILES := \ + $(midgard_FILES) + +LOCAL_GENERATED_SOURCES := \ + $(MESA_GEN_GLSL_H) \ + $(intermediates)/midgard_nir_algebraic.c + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/include \ + $(MESA_TOP)/src/compiler/nir/ \ + $(MESA_TOP)/src/gallium/auxiliary/ \ + $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/mapi/ \ + $(MESA_TOP)/src/mesa/ \ + $(MESA_TOP)/src/panfrost/include/ \ + $(MESA_TOP)/src/panfrost/midgard/ + +LOCAL_STATIC_LIBRARIES := \ + libmesa_glsl \ + libmesa_nir \ + libmesa_st_mesa + +midgard_nir_algebraic_gen := $(LOCAL_PATH)/midgard/midgard_nir_algebraic.py +midgard_nir_algebraic_deps := \ + $(MESA_TOP)/src/compiler/nir/ + +$(intermediates)/midgard_nir_algebraic.c: $(midgard_nir_algebraic_deps) + @mkdir -p $(dir $@) + $(hide) $(MESA_PYTHON2) $(midgard_nir_algebraic_gen) -p $< > $@ + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/panfrost/midgard/ \ + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) \ No newline at end of file diff -Nru mesa-19.2.8/src/panfrost/Android.mk mesa-20.0.8/src/panfrost/Android.mk --- mesa-19.2.8/src/panfrost/Android.mk 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.mk 2020-06-12 01:21:18.000000000 +0000 @@ -25,4 +25,8 @@ LOCAL_PATH := $(call my-dir) include $(LOCAL_PATH)/Makefile.sources +include $(LOCAL_PATH)/Android.bifrost.mk +include $(LOCAL_PATH)/Android.encoder.mk +include $(LOCAL_PATH)/Android.midgard.mk include $(LOCAL_PATH)/Android.shared.mk +include $(LOCAL_PATH)/Android.pandecode.mk diff -Nru mesa-19.2.8/src/panfrost/Android.pandecode.mk mesa-20.0.8/src/panfrost/Android.pandecode.mk --- mesa-19.2.8/src/panfrost/Android.pandecode.mk 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.pandecode.mk 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,44 @@ +# Copyright © 2019 Collabora Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +include $(CLEAR_VARS) + +LOCAL_MODULE := libpanfrost_decode + +LOCAL_SRC_FILES := \ + $(pandecode_FILES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/gallium/auxiliary/ \ + $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/panfrost/include/ \ + $(MESA_TOP)/src/panfrost/encoder/ \ + $(MESA_TOP)/src/panfrost/pandecode/ + +LOCAL_STATIC_LIBRARIES := \ + libpanfrost_encoder + +LOCAL_GENERATED_SOURCES := \ + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/panfrost/pandecode/ \ + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) \ No newline at end of file diff -Nru mesa-19.2.8/src/panfrost/Android.shared.mk mesa-20.0.8/src/panfrost/Android.shared.mk --- mesa-19.2.8/src/panfrost/Android.shared.mk 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Android.shared.mk 2020-06-12 01:21:18.000000000 +0000 @@ -33,7 +33,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary/ \ - $(MESA_TOP)/src/gallium/include/ \ + $(MESA_TOP)/src/gallium/include/ LOCAL_STATIC_LIBRARIES := \ diff -Nru mesa-19.2.8/src/panfrost/bifrost/bifrost_compile.c mesa-20.0.8/src/panfrost/bifrost/bifrost_compile.c --- mesa-19.2.8/src/panfrost/bifrost/bifrost_compile.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/bifrost_compile.c 2020-06-12 01:21:18.000000000 +0000 @@ -57,7 +57,7 @@ NIR_PASS(progress, nir, nir_opt_constant_folding); NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL); + NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, nir, nir_opt_if, true); } while (progress); @@ -117,7 +117,7 @@ nir_ssa_def def = instr->def; float *v = ralloc_array(NULL, float, 1); - nir_const_load_to_arr(v, instr, f32); + nir_const_value_to_array(v, instr->value, instr->def.num_components, f32); _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); } @@ -1041,11 +1041,11 @@ } } while (progress); - schedule_program(ctx); + bifrost_schedule_program(ctx); #ifdef BI_DEBUG nir_print_shader(nir, stdout); - disassemble_bifrost(program->compiled.data, program->compiled.size, false); + disassemble_bifrost(stdout, program->compiled.data, program->compiled.size, false); #endif return 0; } diff -Nru mesa-19.2.8/src/panfrost/bifrost/bifrost_compile.h mesa-20.0.8/src/panfrost/bifrost/bifrost_compile.h --- mesa-19.2.8/src/panfrost/bifrost/bifrost_compile.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/bifrost_compile.h 2020-06-12 01:21:18.000000000 +0000 @@ -49,6 +49,7 @@ .lower_isign = true, .lower_fsign = true, .lower_ffract = true, + .lower_fdph = true, .lower_pack_half_2x16 = true, .lower_pack_unorm_2x16 = true, .lower_pack_snorm_2x16 = true, @@ -63,7 +64,6 @@ .lower_extract_word = true, .lower_all_io_to_temps = true, .lower_all_io_to_elements = true, - .vertex_id_zero_based = true, }; #endif diff -Nru mesa-19.2.8/src/panfrost/bifrost/bifrost_sched.c mesa-20.0.8/src/panfrost/bifrost/bifrost_sched.c --- mesa-19.2.8/src/panfrost/bifrost/bifrost_sched.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/bifrost_sched.c 2020-06-12 01:21:18.000000000 +0000 @@ -374,7 +374,7 @@ } -void schedule_program(compiler_context *ctx) +void bifrost_schedule_program(compiler_context *ctx) { // XXX: we should move instructions together before RA that can feed in to each other and be scheduled in the same clause allocate_registers(ctx); diff -Nru mesa-19.2.8/src/panfrost/bifrost/bifrost_sched.h mesa-20.0.8/src/panfrost/bifrost/bifrost_sched.h --- mesa-19.2.8/src/panfrost/bifrost/bifrost_sched.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/bifrost_sched.h 2020-06-12 01:21:18.000000000 +0000 @@ -24,6 +24,6 @@ #define bifrost_ra_h #include "compiler_defines.h" -void schedule_program(compiler_context *ctx); +void bifrost_schedule_program(compiler_context *ctx); #endif /* bifrost_ra_h */ diff -Nru mesa-19.2.8/src/panfrost/bifrost/cmdline.c mesa-20.0.8/src/panfrost/bifrost/cmdline.c --- mesa-19.2.8/src/panfrost/bifrost/cmdline.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/cmdline.c 2020-06-12 01:21:18.000000000 +0000 @@ -59,7 +59,7 @@ NIR_PASS_V(nir[i], nir_split_var_copies); NIR_PASS_V(nir[i], nir_lower_var_copies); - NIR_PASS_V(nir[i], nir_lower_alu_to_scalar, NULL); + NIR_PASS_V(nir[i], nir_lower_alu_to_scalar, NULL, NULL); /* before buffers and vars_to_ssa */ NIR_PASS_V(nir[i], gl_nir_lower_bindless_images); @@ -77,17 +77,17 @@ assert(fp); fseek(fp, 0, SEEK_END); - int filesize = ftell(fp); + unsigned filesize = ftell(fp); rewind(fp); unsigned char *code = malloc(filesize); - int res = fread(code, 1, filesize, fp); + unsigned res = fread(code, 1, filesize, fp); if (res != filesize) { printf("Couldn't read full file\n"); } fclose(fp); - disassemble_bifrost(code, filesize, false); + disassemble_bifrost(stdout, code, filesize, false); free(code); } diff -Nru mesa-19.2.8/src/panfrost/bifrost/disassemble.c mesa-20.0.8/src/panfrost/bifrost/disassemble.c --- mesa-19.2.8/src/panfrost/bifrost/disassemble.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/disassemble.c 2020-06-12 01:21:18.000000000 +0000 @@ -187,74 +187,75 @@ BR_SIZE_ZERO = 7, }; -void dump_header(struct bifrost_header header, bool verbose); -void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, +void dump_header(FILE *fp, struct bifrost_header header, bool verbose); +void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr, + struct bifrost_regs next_regs, uint64_t *consts, unsigned data_reg, unsigned offset, bool verbose); -bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose); +bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose); -void dump_header(struct bifrost_header header, bool verbose) +void dump_header(FILE *fp, struct bifrost_header header, bool verbose) { if (header.clause_type != 0) { - printf("id(%du) ", header.scoreboard_index); + fprintf(fp, "id(%du) ", header.scoreboard_index); } if (header.scoreboard_deps != 0) { - printf("next-wait("); + fprintf(fp, "next-wait("); bool first = true; for (unsigned i = 0; i < 8; i++) { if (header.scoreboard_deps & (1 << i)) { if (!first) { - printf(", "); + fprintf(fp, ", "); } - printf("%d", i); + fprintf(fp, "%d", i); first = false; } } - printf(") "); + fprintf(fp, ") "); } if (header.datareg_writebarrier) - printf("data-reg-barrier "); + fprintf(fp, "data-reg-barrier "); if (!header.no_end_of_shader) - printf("eos "); + fprintf(fp, "eos "); if (!header.back_to_back) { - printf("nbb "); + fprintf(fp, "nbb "); if (header.branch_cond) - printf("branch-cond "); + fprintf(fp, "branch-cond "); else - printf("branch-uncond "); + fprintf(fp, "branch-uncond "); } if (header.elide_writes) - printf("we "); + fprintf(fp, "we "); if (header.suppress_inf) - printf("suppress-inf "); + fprintf(fp, "suppress-inf "); if (header.suppress_nan) - printf("suppress-nan "); + fprintf(fp, "suppress-nan "); if (header.unk0) - printf("unk0 "); + fprintf(fp, "unk0 "); if (header.unk1) - printf("unk1 "); + fprintf(fp, "unk1 "); if (header.unk2) - printf("unk2 "); + fprintf(fp, "unk2 "); if (header.unk3) - printf("unk3 "); + fprintf(fp, "unk3 "); if (header.unk4) - printf("unk4 "); + fprintf(fp, "unk4 "); - printf("\n"); + fprintf(fp, "\n"); if (verbose) { - printf("# clause type %d, next clause type %d\n", + fprintf(fp, "# clause type %d, next clause type %d\n", header.clause_type, header.next_clause_type); } } -static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) +static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs) { struct bifrost_reg_ctrl decoded = {}; unsigned ctrl; @@ -309,7 +310,7 @@ decoded.add_write_unit = REG_WRITE_TWO; break; default: - printf("# unknown reg ctrl %d\n", ctrl); + fprintf(fp, "# unknown reg ctrl %d\n", ctrl); } return decoded; @@ -330,43 +331,43 @@ } } -static void dump_regs(struct bifrost_regs srcs) +static void dump_regs(FILE *fp, struct bifrost_regs srcs) { - struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs); - printf("# "); + struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs); + fprintf(fp, "# "); if (ctrl.read_reg0) - printf("port 0: R%d ", get_reg0(srcs)); + fprintf(fp, "port 0: R%d ", get_reg0(srcs)); if (ctrl.read_reg1) - printf("port 1: R%d ", get_reg1(srcs)); + fprintf(fp, "port 1: R%d ", get_reg1(srcs)); if (ctrl.fma_write_unit == REG_WRITE_TWO) - printf("port 2: R%d (write FMA) ", srcs.reg2); + fprintf(fp, "port 2: R%d (write FMA) ", srcs.reg2); else if (ctrl.add_write_unit == REG_WRITE_TWO) - printf("port 2: R%d (write ADD) ", srcs.reg2); + fprintf(fp, "port 2: R%d (write ADD) ", srcs.reg2); if (ctrl.fma_write_unit == REG_WRITE_THREE) - printf("port 3: R%d (write FMA) ", srcs.reg3); + fprintf(fp, "port 3: R%d (write FMA) ", srcs.reg3); else if (ctrl.add_write_unit == REG_WRITE_THREE) - printf("port 3: R%d (write ADD) ", srcs.reg3); + fprintf(fp, "port 3: R%d (write ADD) ", srcs.reg3); else if (ctrl.read_reg3) - printf("port 3: R%d (read) ", srcs.reg3); + fprintf(fp, "port 3: R%d (read) ", srcs.reg3); if (srcs.uniform_const) { if (srcs.uniform_const & 0x80) { - printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2); + fprintf(fp, "uniform: U%d", (srcs.uniform_const & 0x7f) * 2); } } - printf("\n"); + fprintf(fp, "\n"); } -static void dump_const_imm(uint32_t imm) +static void dump_const_imm(FILE *fp, uint32_t imm) { union { float f; uint32_t i; } fi; fi.i = imm; - printf("0x%08x /* %f */", imm, fi.f); + fprintf(fp, "0x%08x /* %f */", imm, fi.f); } static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs) @@ -399,27 +400,27 @@ return imm | low_bits; } -static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32) +static void dump_uniform_const_src(FILE *fp, struct bifrost_regs srcs, uint64_t *consts, bool high32) { if (srcs.uniform_const & 0x80) { unsigned uniform = (srcs.uniform_const & 0x7f) * 2; - printf("U%d", uniform + (high32 ? 1 : 0)); + fprintf(fp, "U%d", uniform + (high32 ? 1 : 0)); } else if (srcs.uniform_const >= 0x20) { uint64_t imm = get_const(consts, srcs); if (high32) - dump_const_imm(imm >> 32); + dump_const_imm(fp, imm >> 32); else - dump_const_imm(imm); + dump_const_imm(fp, imm); } else { switch (srcs.uniform_const) { case 0: - printf("0"); + fprintf(fp, "0"); break; case 5: - printf("atest-data"); + fprintf(fp, "atest-data"); break; case 6: - printf("sample-ptr"); + fprintf(fp, "sample-ptr"); break; case 8: case 9: @@ -429,73 +430,73 @@ case 13: case 14: case 15: - printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); + fprintf(fp, "blend-descriptor%u", (unsigned) srcs.uniform_const - 8); break; default: - printf("unkConst%u", (unsigned) srcs.uniform_const); + fprintf(fp, "unkConst%u", (unsigned) srcs.uniform_const); break; } if (high32) - printf(".y"); + fprintf(fp, ".y"); else - printf(".x"); + fprintf(fp, ".x"); } } -static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) +static void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) { switch (src) { case 0: - printf("R%d", get_reg0(srcs)); + fprintf(fp, "R%d", get_reg0(srcs)); break; case 1: - printf("R%d", get_reg1(srcs)); + fprintf(fp, "R%d", get_reg1(srcs)); break; case 2: - printf("R%d", srcs.reg3); + fprintf(fp, "R%d", srcs.reg3); break; case 3: if (isFMA) - printf("0"); + fprintf(fp, "0"); else - printf("T"); // i.e. the output of FMA this cycle + fprintf(fp, "T"); // i.e. the output of FMA this cycle break; case 4: - dump_uniform_const_src(srcs, consts, false); + dump_uniform_const_src(fp, srcs, consts, false); break; case 5: - dump_uniform_const_src(srcs, consts, true); + dump_uniform_const_src(fp, srcs, consts, true); break; case 6: - printf("T0"); + fprintf(fp, "T0"); break; case 7: - printf("T1"); + fprintf(fp, "T1"); break; } } -static void dump_output_mod(unsigned mod) +static void dump_output_mod(FILE *fp, unsigned mod) { switch (mod) { case 0: break; case 1: - printf(".clamp_0_inf"); + fprintf(fp, ".clamp_0_inf"); break; // max(out, 0) case 2: - printf(".clamp_m1_1"); + fprintf(fp, ".clamp_m1_1"); break; // clamp(out, -1, 1) case 3: - printf(".clamp_0_1"); + fprintf(fp, ".clamp_0_1"); break; // clamp(out, 0, 1) default: break; } } -static void dump_minmax_mode(unsigned mod) +static void dump_minmax_mode(FILE *fp, unsigned mod) { switch (mod) { case 0: @@ -509,7 +510,7 @@ * "greater"/"lesser" NaN is always returned, first by checking the * sign and then the mantissa bits. */ - printf(".nan_wins"); + fprintf(fp, ".nan_wins"); break; case 2: /* For max, implement src0 > src1 ? src0 : src1 @@ -520,20 +521,20 @@ * return false for NaN's. As a result, this mode is *not* * commutative. */ - printf(".src1_wins"); + fprintf(fp, ".src1_wins"); break; case 3: /* For max, implement src0 < src1 ? src1 : src0 * For min, implement src0 > src1 ? src1 : src0 */ - printf(".src0_wins"); + fprintf(fp, ".src0_wins"); break; default: break; } } -static void dump_round_mode(unsigned mod) +static void dump_round_mode(FILE *fp, unsigned mod) { switch (mod) { case 0: @@ -541,15 +542,15 @@ break; case 1: /* roundTowardPositive in the IEEE spec. */ - printf(".round_pos"); + fprintf(fp, ".round_pos"); break; case 2: /* roundTowardNegative in the IEEE spec. */ - printf(".round_neg"); + fprintf(fp, ".round_neg"); break; case 3: /* roundTowardZero in the IEEE spec. */ - printf(".round_zero"); + fprintf(fp, ".round_zero"); break; default: break; @@ -838,121 +839,121 @@ return info; } -static void dump_fcmp(unsigned op) +static void dump_fcmp(FILE *fp, unsigned op) { switch (op) { case 0: - printf(".OEQ"); + fprintf(fp, ".OEQ"); break; case 1: - printf(".OGT"); + fprintf(fp, ".OGT"); break; case 2: - printf(".OGE"); + fprintf(fp, ".OGE"); break; case 3: - printf(".UNE"); + fprintf(fp, ".UNE"); break; case 4: - printf(".OLT"); + fprintf(fp, ".OLT"); break; case 5: - printf(".OLE"); + fprintf(fp, ".OLE"); break; default: - printf(".unk%d", op); + fprintf(fp, ".unk%d", op); break; } } -static void dump_16swizzle(unsigned swiz) +static void dump_16swizzle(FILE *fp, unsigned swiz) { if (swiz == 2) return; - printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]); + fprintf(fp, ".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]); } -static void dump_fma_expand_src0(unsigned ctrl) +static void dump_fma_expand_src0(FILE *fp, unsigned ctrl) { switch (ctrl) { case 3: case 4: case 6: - printf(".x"); + fprintf(fp, ".x"); break; case 5: case 7: - printf(".y"); + fprintf(fp, ".y"); break; case 0: case 1: case 2: break; default: - printf(".unk"); + fprintf(fp, ".unk"); break; } } -static void dump_fma_expand_src1(unsigned ctrl) +static void dump_fma_expand_src1(FILE *fp, unsigned ctrl) { switch (ctrl) { case 1: case 3: - printf(".x"); + fprintf(fp, ".x"); break; case 2: case 4: case 5: - printf(".y"); + fprintf(fp, ".y"); break; case 0: case 6: case 7: break; default: - printf(".unk"); + fprintf(fp, ".unk"); break; } } -static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose) +static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose) { if (verbose) { - printf("# FMA: %016" PRIx64 "\n", word); + fprintf(fp, "# FMA: %016" PRIx64 "\n", word); } struct bifrost_fma_inst FMA; memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst)); struct fma_op_info info = find_fma_op_info(FMA.op); - printf("%s", info.name); + fprintf(fp, "%s", info.name); if (info.src_type == FMA_FADD || info.src_type == FMA_FMINMAX || info.src_type == FMA_FMA || info.src_type == FMA_FADD16 || info.src_type == FMA_FMINMAX16 || info.src_type == FMA_FMA16) { - dump_output_mod(bits(FMA.op, 12, 14)); + dump_output_mod(fp, bits(FMA.op, 12, 14)); switch (info.src_type) { case FMA_FADD: case FMA_FMA: case FMA_FADD16: case FMA_FMA16: - dump_round_mode(bits(FMA.op, 10, 12)); + dump_round_mode(fp, bits(FMA.op, 10, 12)); break; case FMA_FMINMAX: case FMA_FMINMAX16: - dump_minmax_mode(bits(FMA.op, 10, 12)); + dump_minmax_mode(fp, bits(FMA.op, 10, 12)); break; default: assert(0); } } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) { - dump_fcmp(bits(FMA.op, 10, 13)); + dump_fcmp(fp, bits(FMA.op, 10, 13)); if (info.src_type == FMA_FCMP) - printf(".f32"); + fprintf(fp, ".f32"); else - printf(".v2f16"); + fprintf(fp, ".v2f16"); } else if (info.src_type == FMA_FMA_MSCALE) { if (FMA.op & (1 << 11)) { switch ((FMA.op >> 9) & 0x3) { @@ -965,190 +966,190 @@ * presumably to make sure that the same exact nan is * returned for 1/nan. */ - printf(".rcp_mode"); + fprintf(fp, ".rcp_mode"); break; case 3: /* Similar to the above, but src0 always wins when multiplying * 0 by infinity. */ - printf(".sqrt_mode"); + fprintf(fp, ".sqrt_mode"); break; default: - printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); + fprintf(fp, ".unk%d_mode", (int) (FMA.op >> 9) & 0x3); } } else { - dump_output_mod(bits(FMA.op, 9, 11)); + dump_output_mod(fp, bits(FMA.op, 9, 11)); } } - printf(" "); + fprintf(fp, " "); - struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs); if (next_ctrl.fma_write_unit != REG_WRITE_NONE) { - printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs)); + fprintf(fp, "{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs)); } else { - printf("T0, "); + fprintf(fp, "T0, "); } switch (info.src_type) { case FMA_ONE_SRC: - dump_src(FMA.src0, regs, consts, true); + dump_src(fp, FMA.src0, regs, consts, true); break; case FMA_TWO_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); + dump_src(fp, FMA.src0, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); break; case FMA_FADD: case FMA_FMINMAX: if (FMA.op & 0x10) - printf("-"); + fprintf(fp, "-"); if (FMA.op & 0x200) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.src0, regs, consts, true); + dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7); if (FMA.op & 0x200) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (FMA.op & 0x8) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7); if (FMA.op & 0x8) - printf(")"); + fprintf(fp, ")"); break; case FMA_FADD16: case FMA_FMINMAX16: { bool abs1 = FMA.op & 0x8; bool abs2 = (FMA.op & 0x7) < FMA.src0; if (FMA.op & 0x10) - printf("-"); + fprintf(fp, "-"); if (abs1 || abs2) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_16swizzle((FMA.op >> 6) & 0x3); + fprintf(fp, "abs("); + dump_src(fp, FMA.src0, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 6) & 0x3); if (abs1 || abs2) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (abs1 && abs2) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); + fprintf(fp, "abs("); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 8) & 0x3); if (abs1 && abs2) - printf(")"); + fprintf(fp, ")"); break; } case FMA_FCMP: if (FMA.op & 0x200) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.src0, regs, consts, true); + dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7); if (FMA.op & 0x200) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (FMA.op & 0x8) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7); if (FMA.op & 0x8) - printf(")"); + fprintf(fp, ")"); break; case FMA_FCMP16: - dump_src(FMA.src0, regs, consts, true); + dump_src(fp, FMA.src0, regs, consts, true); // Note: this is kinda a guess, I haven't seen the blob set this to // anything other than the identity, but it matches FMA_TWO_SRCFmod16 - dump_16swizzle((FMA.op >> 6) & 0x3); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); + dump_16swizzle(fp, (FMA.op >> 6) & 0x3); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 8) & 0x3); break; case FMA_SHIFT_ADD64: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - printf("shift:%u", (FMA.op >> 3) & 0x7); + dump_src(fp, FMA.src0, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + fprintf(fp, ", "); + fprintf(fp, "shift:%u", (FMA.op >> 3) & 0x7); break; case FMA_THREE_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + dump_src(fp, FMA.src0, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); break; case FMA_FMA: if (FMA.op & (1 << 14)) - printf("-"); + fprintf(fp, "-"); if (FMA.op & (1 << 9)) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.src0, regs, consts, true); + dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7); if (FMA.op & (1 << 9)) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & (1 << 16)) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); + fprintf(fp, "abs("); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7); if (FMA.op & (1 << 16)) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & (1 << 15)) - printf("-"); + fprintf(fp, "-"); if (FMA.op & (1 << 17)) - printf("abs("); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + fprintf(fp, "abs("); + dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); if (FMA.op & (1 << 17)) - printf(")"); + fprintf(fp, ")"); break; case FMA_FMA16: if (FMA.op & (1 << 14)) - printf("-"); - dump_src(FMA.src0, regs, consts, true); - dump_16swizzle((FMA.op >> 6) & 0x3); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); - printf(", "); + fprintf(fp, "-"); + dump_src(fp, FMA.src0, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 6) & 0x3); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 8) & 0x3); + fprintf(fp, ", "); if (FMA.op & (1 << 15)) - printf("-"); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 16) & 0x3); + fprintf(fp, "-"); + dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); + dump_16swizzle(fp, (FMA.op >> 16) & 0x3); break; case FMA_FOUR_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + dump_src(fp, FMA.src0, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, (FMA.op >> 6) & 0x7, regs, consts, true); break; case FMA_FMA_MSCALE: if (FMA.op & (1 << 12)) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); + fprintf(fp, "abs("); + dump_src(fp, FMA.src0, regs, consts, true); if (FMA.op & (1 << 12)) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (FMA.op & (1 << 13)) - printf("-"); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); + fprintf(fp, "-"); + dump_src(fp, FMA.op & 0x7, regs, consts, true); + fprintf(fp, ", "); if (FMA.op & (1 << 14)) - printf("-"); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + fprintf(fp, "-"); + dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, (FMA.op >> 6) & 0x7, regs, consts, true); break; } - printf("\n"); + fprintf(fp, "\n"); } static const struct add_op_info add_op_infos[] = { @@ -1429,77 +1430,78 @@ return info; } -static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, +static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, + struct bifrost_regs next_regs, uint64_t *consts, unsigned data_reg, unsigned offset, bool verbose) { if (verbose) { - printf("# ADD: %016" PRIx64 "\n", word); + fprintf(fp, "# ADD: %016" PRIx64 "\n", word); } struct bifrost_add_inst ADD; memcpy((char *) &ADD, (char *) &word, sizeof(ADD)); struct add_op_info info = find_add_op_info(ADD.op); - printf("%s", info.name); + fprintf(fp, "%s", info.name); // float16 seems like it doesn't support output modifiers if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) { // output modifiers - dump_output_mod(bits(ADD.op, 8, 10)); + dump_output_mod(fp, bits(ADD.op, 8, 10)); if (info.src_type == ADD_FADD) - dump_round_mode(bits(ADD.op, 10, 12)); + dump_round_mode(fp, bits(ADD.op, 10, 12)); else - dump_minmax_mode(bits(ADD.op, 10, 12)); + dump_minmax_mode(fp, bits(ADD.op, 10, 12)); } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) { - dump_fcmp(bits(ADD.op, 3, 6)); + dump_fcmp(fp, bits(ADD.op, 3, 6)); if (info.src_type == ADD_FCMP) - printf(".f32"); + fprintf(fp, ".f32"); else - printf(".v2f16"); + fprintf(fp, ".v2f16"); } else if (info.src_type == ADD_FADDMscale) { switch ((ADD.op >> 6) & 0x7) { case 0: break; // causes GPU hangs on G71 case 1: - printf(".invalid"); + fprintf(fp, ".invalid"); break; // Same as usual outmod value. case 2: - printf(".clamp_0_1"); + fprintf(fp, ".clamp_0_1"); break; // If src0 is infinite or NaN, flush it to zero so that the other // source is passed through unmodified. case 3: - printf(".flush_src0_inf_nan"); + fprintf(fp, ".flush_src0_inf_nan"); break; // Vice versa. case 4: - printf(".flush_src1_inf_nan"); + fprintf(fp, ".flush_src1_inf_nan"); break; // Every other case seems to behave the same as the above? default: - printf(".unk%d", (ADD.op >> 6) & 0x7); + fprintf(fp, ".unk%d", (ADD.op >> 6) & 0x7); break; } } else if (info.src_type == ADD_VARYING_INTERP) { if (ADD.op & 0x200) - printf(".reuse"); + fprintf(fp, ".reuse"); if (ADD.op & 0x400) - printf(".flat"); + fprintf(fp, ".flat"); switch ((ADD.op >> 7) & 0x3) { case 0: - printf(".per_frag"); + fprintf(fp, ".per_frag"); break; case 1: - printf(".centroid"); + fprintf(fp, ".centroid"); break; case 2: break; case 3: - printf(".explicit"); + fprintf(fp, ".explicit"); break; } - printf(".v%d", ((ADD.op >> 5) & 0x3) + 1); + fprintf(fp, ".v%d", ((ADD.op >> 5) & 0x3) + 1); } else if (info.src_type == ADD_BRANCH) { enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f); if (branchCode == BR_ALWAYS) { @@ -1526,87 +1528,87 @@ switch (cond) { case BR_COND_LT: if (portSwapped) - printf(".LT.u"); + fprintf(fp, ".LT.u"); else - printf(".LT.i"); + fprintf(fp, ".LT.i"); break; case BR_COND_LE: if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { - printf(".UNE.f"); + fprintf(fp, ".UNE.f"); } else { if (portSwapped) - printf(".LE.u"); + fprintf(fp, ".LE.u"); else - printf(".LE.i"); + fprintf(fp, ".LE.i"); } break; case BR_COND_GT: if (portSwapped) - printf(".GT.u"); + fprintf(fp, ".GT.u"); else - printf(".GT.i"); + fprintf(fp, ".GT.i"); break; case BR_COND_GE: if (portSwapped) - printf(".GE.u"); + fprintf(fp, ".GE.u"); else - printf(".GE.i"); + fprintf(fp, ".GE.i"); break; case BR_COND_EQ: if (portSwapped) - printf(".NE.i"); + fprintf(fp, ".NE.i"); else - printf(".EQ.i"); + fprintf(fp, ".EQ.i"); break; case BR_COND_OEQ: if (portSwapped) - printf(".UNE.f"); + fprintf(fp, ".UNE.f"); else - printf(".OEQ.f"); + fprintf(fp, ".OEQ.f"); break; case BR_COND_OGT: if (portSwapped) - printf(".OGT.unk.f"); + fprintf(fp, ".OGT.unk.f"); else - printf(".OGT.f"); + fprintf(fp, ".OGT.f"); break; case BR_COND_OLT: if (portSwapped) - printf(".OLT.unk.f"); + fprintf(fp, ".OLT.unk.f"); else - printf(".OLT.f"); + fprintf(fp, ".OLT.f"); break; } switch (size) { case BR_SIZE_32: case BR_SIZE_32_AND_16X: case BR_SIZE_32_AND_16Y: - printf("32"); + fprintf(fp, "32"); break; case BR_SIZE_16XX: case BR_SIZE_16YY: case BR_SIZE_16YX0: case BR_SIZE_16YX1: - printf("16"); + fprintf(fp, "16"); break; case BR_SIZE_ZERO: { unsigned ctrl = (ADD.op >> 1) & 0x3; if (ctrl == 0) - printf("32.Z"); + fprintf(fp, "32.Z"); else - printf("16.Z"); + fprintf(fp, "16.Z"); break; } } } } - printf(" "); + fprintf(fp, " "); - struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs); + struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs); if (next_ctrl.add_write_unit != REG_WRITE_NONE) { - printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs)); + fprintf(fp, "{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs)); } else { - printf("T1, "); + fprintf(fp, "T1, "); } switch (info.src_type) { @@ -1616,10 +1618,10 @@ // same instruction. This re-uses the encoding that normally means // "disabled", where the low 4 bits are ignored. Perhaps the extra // 0x8 or'd in indicates this is happening. - printf("location:%d, ", regs.uniform_const & 0x7); + fprintf(fp, "location:%d, ", regs.uniform_const & 0x7); // fallthrough case ADD_ONE_SRC: - dump_src(ADD.src0, regs, consts, false); + dump_src(fp, ADD.src0, regs, consts, false); break; case ADD_TEX: case ADD_TEX_COMPACT: { @@ -1632,7 +1634,7 @@ bool unknown = (ADD.op & 0x40); // TODO: figure out if the unknown bit is ever 0 if (!unknown) - printf("unknown "); + fprintf(fp, "unknown "); } else { uint64_t constVal = get_const(consts, regs); uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal; @@ -1643,11 +1645,11 @@ if (ctrl.result_type == 9) { struct bifrost_dual_tex_ctrl dualCtrl; memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); - printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", + fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", dualCtrl.tex_index0, dualCtrl.sampler_index0, dualCtrl.tex_index1, dualCtrl.sampler_index1); if (dualCtrl.unk0 != 3) - printf("unk:%d ", dualCtrl.unk0); + fprintf(fp, "unk:%d ", dualCtrl.unk0); dualTex = true; } else { if (ctrl.no_merge_index) { @@ -1657,7 +1659,7 @@ tex_index = sampler_index = ctrl.tex_index; unsigned unk = ctrl.sampler_index >> 2; if (unk != 3) - printf("unk:%d ", unk); + fprintf(fp, "unk:%d ", unk); if (ctrl.sampler_index & 1) tex_index = -1; if (ctrl.sampler_index & 2) @@ -1665,80 +1667,80 @@ } if (ctrl.unk0 != 3) - printf("unk0:%d ", ctrl.unk0); + fprintf(fp, "unk0:%d ", ctrl.unk0); if (ctrl.unk1) - printf("unk1 "); + fprintf(fp, "unk1 "); if (ctrl.unk2 != 0xf) - printf("unk2:%x ", ctrl.unk2); + fprintf(fp, "unk2:%x ", ctrl.unk2); switch (ctrl.result_type) { case 0x4: - printf("f32 "); + fprintf(fp, "f32 "); break; case 0xe: - printf("i32 "); + fprintf(fp, "i32 "); break; case 0xf: - printf("u32 "); + fprintf(fp, "u32 "); break; default: - printf("unktype(%x) ", ctrl.result_type); + fprintf(fp, "unktype(%x) ", ctrl.result_type); } switch (ctrl.tex_type) { case 0: - printf("cube "); + fprintf(fp, "cube "); break; case 1: - printf("buffer "); + fprintf(fp, "buffer "); break; case 2: - printf("2D "); + fprintf(fp, "2D "); break; case 3: - printf("3D "); + fprintf(fp, "3D "); break; } if (ctrl.is_shadow) - printf("shadow "); + fprintf(fp, "shadow "); if (ctrl.is_array) - printf("array "); + fprintf(fp, "array "); if (!ctrl.filter) { if (ctrl.calc_gradients) { int comp = (controlBits >> 20) & 0x3; - printf("txg comp:%d ", comp); + fprintf(fp, "txg comp:%d ", comp); } else { - printf("txf "); + fprintf(fp, "txf "); } } else { if (!ctrl.not_supply_lod) { if (ctrl.compute_lod) - printf("lod_bias "); + fprintf(fp, "lod_bias "); else - printf("lod "); + fprintf(fp, "lod "); } if (!ctrl.calc_gradients) - printf("grad "); + fprintf(fp, "grad "); } if (ctrl.texel_offset) - printf("offset "); + fprintf(fp, "offset "); } } if (!dualTex) { if (tex_index == -1) - printf("tex:indirect "); + fprintf(fp, "tex:indirect "); else - printf("tex:%d ", tex_index); + fprintf(fp, "tex:%d ", tex_index); if (sampler_index == -1) - printf("samp:indirect "); + fprintf(fp, "samp:indirect "); else - printf("samp:%d ", sampler_index); + fprintf(fp, "samp:%d ", sampler_index); } break; } @@ -1746,222 +1748,222 @@ unsigned addr = ADD.op & 0x1f; if (addr < 0b10100) { // direct addr - printf("%d", addr); + fprintf(fp, "%d", addr); } else if (addr < 0b11000) { if (addr == 22) - printf("fragw"); + fprintf(fp, "fragw"); else if (addr == 23) - printf("fragz"); + fprintf(fp, "fragz"); else - printf("unk%d", addr); + fprintf(fp, "unk%d", addr); } else { - dump_src(ADD.op & 0x7, regs, consts, false); + dump_src(fp, ADD.op & 0x7, regs, consts, false); } - printf(", "); - dump_src(ADD.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, ADD.src0, regs, consts, false); break; } case ADD_VARYING_ADDRESS: { - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - printf(", "); + dump_src(fp, ADD.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + fprintf(fp, ", "); unsigned location = (ADD.op >> 3) & 0x1f; if (location < 16) { - printf("location:%d", location); + fprintf(fp, "location:%d", location); } else if (location == 20) { - printf("location:%u", (uint32_t) get_const(consts, regs)); + fprintf(fp, "location:%u", (uint32_t) get_const(consts, regs)); } else if (location == 21) { - printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); + fprintf(fp, "location:%u", (uint32_t) (get_const(consts, regs) >> 32)); } else { - printf("location:%d(unk)", location); + fprintf(fp, "location:%d(unk)", location); } break; } case ADD_LOAD_ATTR: - printf("location:%d, ", (ADD.op >> 3) & 0xf); + fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0xf); case ADD_TWO_SRC: - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); + dump_src(fp, ADD.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, ADD.op & 0x7, regs, consts, false); break; case ADD_THREE_SRC: - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - printf(", "); - dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + dump_src(fp, ADD.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false); break; case ADD_FADD: case ADD_FMINMAX: if (ADD.op & 0x10) - printf("-"); + fprintf(fp, "-"); if (ADD.op & 0x1000) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); + fprintf(fp, "abs("); + dump_src(fp, ADD.src0, regs, consts, false); switch ((ADD.op >> 6) & 0x3) { case 3: - printf(".x"); + fprintf(fp, ".x"); break; default: break; } if (ADD.op & 0x1000) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (ADD.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (ADD.op & 0x8) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); + fprintf(fp, "abs("); + dump_src(fp, ADD.op & 0x7, regs, consts, false); switch ((ADD.op >> 6) & 0x3) { case 1: case 3: - printf(".x"); + fprintf(fp, ".x"); break; case 2: - printf(".y"); + fprintf(fp, ".y"); break; case 0: break; default: - printf(".unk"); + fprintf(fp, ".unk"); break; } if (ADD.op & 0x8) - printf(")"); + fprintf(fp, ")"); break; case ADD_FADD16: if (ADD.op & 0x10) - printf("-"); + fprintf(fp, "-"); if (ADD.op & 0x1000) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); + fprintf(fp, "abs("); + dump_src(fp, ADD.src0, regs, consts, false); if (ADD.op & 0x1000) - printf(")"); - dump_16swizzle((ADD.op >> 6) & 0x3); - printf(", "); + fprintf(fp, ")"); + dump_16swizzle(fp, (ADD.op >> 6) & 0x3); + fprintf(fp, ", "); if (ADD.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (ADD.op & 0x8) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); + fprintf(fp, "abs("); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + dump_16swizzle(fp, (ADD.op >> 8) & 0x3); if (ADD.op & 0x8) - printf(")"); + fprintf(fp, ")"); break; case ADD_FMINMAX16: { bool abs1 = ADD.op & 0x8; bool abs2 = (ADD.op & 0x7) < ADD.src0; if (ADD.op & 0x10) - printf("-"); + fprintf(fp, "-"); if (abs1 || abs2) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); - dump_16swizzle((ADD.op >> 6) & 0x3); + fprintf(fp, "abs("); + dump_src(fp, ADD.src0, regs, consts, false); + dump_16swizzle(fp, (ADD.op >> 6) & 0x3); if (abs1 || abs2) - printf(")"); - printf(", "); + fprintf(fp, ")"); + fprintf(fp, ", "); if (ADD.op & 0x20) - printf("-"); + fprintf(fp, "-"); if (abs1 && abs2) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); + fprintf(fp, "abs("); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + dump_16swizzle(fp, (ADD.op >> 8) & 0x3); if (abs1 && abs2) - printf(")"); + fprintf(fp, ")"); break; } case ADD_FADDMscale: { if (ADD.op & 0x400) - printf("-"); + fprintf(fp, "-"); if (ADD.op & 0x200) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); + fprintf(fp, "abs("); + dump_src(fp, ADD.src0, regs, consts, false); if (ADD.op & 0x200) - printf(")"); + fprintf(fp, ")"); - printf(", "); + fprintf(fp, ", "); if (ADD.op & 0x800) - printf("-"); - dump_src(ADD.op & 0x7, regs, consts, false); + fprintf(fp, "-"); + dump_src(fp, ADD.op & 0x7, regs, consts, false); - printf(", "); + fprintf(fp, ", "); - dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false); break; } case ADD_FCMP: if (ADD.op & 0x400) { - printf("-"); + fprintf(fp, "-"); } if (ADD.op & 0x100) { - printf("abs("); + fprintf(fp, "abs("); } - dump_src(ADD.src0, regs, consts, false); + dump_src(fp, ADD.src0, regs, consts, false); switch ((ADD.op >> 6) & 0x3) { case 3: - printf(".x"); + fprintf(fp, ".x"); break; default: break; } if (ADD.op & 0x100) { - printf(")"); + fprintf(fp, ")"); } - printf(", "); + fprintf(fp, ", "); if (ADD.op & 0x200) { - printf("abs("); + fprintf(fp, "abs("); } - dump_src(ADD.op & 0x7, regs, consts, false); + dump_src(fp, ADD.op & 0x7, regs, consts, false); switch ((ADD.op >> 6) & 0x3) { case 1: case 3: - printf(".x"); + fprintf(fp, ".x"); break; case 2: - printf(".y"); + fprintf(fp, ".y"); break; case 0: break; default: - printf(".unk"); + fprintf(fp, ".unk"); break; } if (ADD.op & 0x200) { - printf(")"); + fprintf(fp, ")"); } break; case ADD_FCMP16: - dump_src(ADD.src0, regs, consts, false); - dump_16swizzle((ADD.op >> 6) & 0x3); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); + dump_src(fp, ADD.src0, regs, consts, false); + dump_16swizzle(fp, (ADD.op >> 6) & 0x3); + fprintf(fp, ", "); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + dump_16swizzle(fp, (ADD.op >> 8) & 0x3); break; case ADD_BRANCH: { enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); if (code != BR_ALWAYS) { - dump_src(ADD.src0, regs, consts, false); + dump_src(fp, ADD.src0, regs, consts, false); switch (size) { case BR_SIZE_16XX: - printf(".x"); + fprintf(fp, ".x"); break; case BR_SIZE_16YY: case BR_SIZE_16YX0: case BR_SIZE_16YX1: - printf(".y"); + fprintf(fp, ".y"); break; case BR_SIZE_ZERO: { unsigned ctrl = (ADD.op >> 1) & 0x3; switch (ctrl) { case 1: - printf(".y"); + fprintf(fp, ".y"); break; case 2: - printf(".x"); + fprintf(fp, ".x"); break; default: break; @@ -1970,25 +1972,25 @@ default: break; } - printf(", "); + fprintf(fp, ", "); } if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { - dump_src(ADD.op & 0x7, regs, consts, false); + dump_src(fp, ADD.op & 0x7, regs, consts, false); switch (size) { case BR_SIZE_16XX: case BR_SIZE_16YX0: case BR_SIZE_16YX1: case BR_SIZE_32_AND_16X: - printf(".x"); + fprintf(fp, ".x"); break; case BR_SIZE_16YY: case BR_SIZE_32_AND_16Y: - printf(".y"); + fprintf(fp, ".y"); break; default: break; } - printf(", "); + fprintf(fp, ", "); } // I haven't had the chance to test if this actually specifies the // branch offset, since I couldn't get it to produce values other @@ -2020,33 +2022,34 @@ // Note: the offset is in bytes, relative to the beginning of the // current clause, so a zero offset would be a loop back to the // same clause (annoyingly different from Midgard). - printf("clause_%d", offset + branch_offset); + fprintf(fp, "clause_%d", offset + branch_offset); } else { - dump_src(offsetSrc, regs, consts, false); + dump_src(fp, offsetSrc, regs, consts, false); } } } if (info.has_data_reg) { - printf(", R%d", data_reg); + fprintf(fp, ", R%d", data_reg); } - printf("\n"); + fprintf(fp, "\n"); } -void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, +void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr, + struct bifrost_regs next_regs, uint64_t *consts, unsigned data_reg, unsigned offset, bool verbose) { struct bifrost_regs regs; memcpy((char *) ®s, (char *) &instr->reg_bits, sizeof(regs)); if (verbose) { - printf("# regs: %016" PRIx64 "\n", instr->reg_bits); - dump_regs(regs); + fprintf(fp, "# regs: %016" PRIx64 "\n", instr->reg_bits); + dump_regs(fp, regs); } - dump_fma(instr->fma_bits, regs, next_regs, consts, verbose); - dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); + dump_fma(fp, instr->fma_bits, regs, next_regs, consts, verbose); + dump_add(fp, instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); } -bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) +bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose) { // State for a decoded clause struct bifrost_alu_inst instrs[8] = {}; @@ -2059,10 +2062,10 @@ unsigned i; for (i = 0; ; i++, words += 4) { if (verbose) { - printf("# "); + fprintf(fp, "# "); for (int j = 0; j < 4; j++) - printf("%08x ", words[3 - j]); // low bit on the right - printf("\n"); + fprintf(fp, "%08x ", words[3 - j]); // low bit on the right + fprintf(fp, "\n"); } unsigned tag = bits(words[0], 0, 8); @@ -2081,7 +2084,7 @@ bool stop = tag & 0x40; if (verbose) { - printf("# tag: 0x%02x\n", tag); + fprintf(fp, "# tag: 0x%02x\n", tag); } if (tag & 0x80) { unsigned idx = stop ? 5 : 2; @@ -2137,7 +2140,7 @@ done = stop; break; default: - printf("unknown tag bits 0x%02x\n", tag); + fprintf(fp, "unknown tag bits 0x%02x\n", tag); } break; case 0x2: @@ -2177,7 +2180,7 @@ // share a buffer in the decoder, but we only care about // the position in the constant stream; the total number of // instructions is redundant. - unsigned const_idx = 7; + unsigned const_idx = 0; switch (pos) { case 0: case 1: @@ -2204,10 +2207,13 @@ const_idx = 4; break; default: - printf("# unknown pos 0x%x\n", pos); + fprintf(fp, "# unknown pos 0x%x\n", pos); + break; } + if (num_consts < const_idx + 2) num_consts = const_idx + 2; + consts[const_idx] = const0; consts[const_idx + 1] = const1; done = stop; @@ -2225,16 +2231,16 @@ *size = i + 1; if (verbose) { - printf("# header: %012" PRIx64 "\n", header_bits); + fprintf(fp, "# header: %012" PRIx64 "\n", header_bits); } struct bifrost_header header; memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header)); - dump_header(header, verbose); + dump_header(fp, header, verbose); if (!header.no_end_of_shader) stopbit = true; - printf("{\n"); + fprintf(fp, "{\n"); for (i = 0; i < num_instrs; i++) { struct bifrost_regs next_regs; if (i + 1 == num_instrs) { @@ -2245,20 +2251,20 @@ sizeof(next_regs)); } - dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose); + dump_instr(fp, &instrs[i], next_regs, consts, header.datareg, offset, verbose); } - printf("}\n"); + fprintf(fp, "}\n"); if (verbose) { for (unsigned i = 0; i < num_consts; i++) { - printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff); - printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32); + fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff); + fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32); } } return stopbit; } -void disassemble_bifrost(uint8_t *code, size_t size, bool verbose) +void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose) { uint32_t *words = (uint32_t *) code; uint32_t *words_end = words + (size / 4); @@ -2270,9 +2276,9 @@ uint32_t zero[4] = {}; if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0) break; - printf("clause_%d:\n", offset); + fprintf(fp, "clause_%d:\n", offset); unsigned size; - if (dump_clause(words, &size, offset, verbose) == true) { + if (dump_clause(fp, words, &size, offset, verbose) == true) { break; } words += size * 4; diff -Nru mesa-19.2.8/src/panfrost/bifrost/disassemble.h mesa-20.0.8/src/panfrost/bifrost/disassemble.h --- mesa-19.2.8/src/panfrost/bifrost/disassemble.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/bifrost/disassemble.h 2020-06-12 01:21:18.000000000 +0000 @@ -26,4 +26,6 @@ #include #include #include -void disassemble_bifrost(uint8_t *code, size_t size, bool verbose); +#include + +void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose); diff -Nru mesa-19.2.8/src/panfrost/encoder/meson.build mesa-20.0.8/src/panfrost/encoder/meson.build --- mesa-19.2.8/src/panfrost/encoder/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,41 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_encoder_files = files( + 'pan_encoder.h', + + 'pan_attributes.c', + 'pan_invocation.c', + 'pan_sampler.c', + 'pan_tiler.c', + 'pan_scratch.c', + 'pan_props.c', +) + +libpanfrost_encoder = static_library( + 'panfrost_encoder', + [libpanfrost_encoder_files], + include_directories : [inc_common, inc_panfrost_hw], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + dependencies: [dep_libdrm], + build_by_default : false, +) diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_attributes.c mesa-20.0.8/src/panfrost/encoder/pan_attributes.c --- mesa-19.2.8/src/panfrost/encoder/pan_attributes.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_attributes.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "util/u_math.h" +#include "panfrost-job.h" +#include "pan_encoder.h" + +/* This file handles attribute descriptors (mali_attr_meta). The + * bulk of the complexity is from instancing. See mali_job for + * notes on how this works. But basically, for small vertex + * counts, we have a lookup table, and for large vertex counts, + * we look at the high bits as a heuristic. This has to match + * exactly how the hardware calculates this (which is why the + * algorithm is so weird) or else instancing will break. */ + +/* Given an odd number (of the form 2k + 1), compute k */ +#define ODD(odd) ((odd - 1) >> 1) + +static unsigned +panfrost_small_padded_vertex_count(unsigned idx) +{ + if (idx == 11 || idx == 13 || idx == 15 || idx == 19) + return idx + 1; + else + return idx; +} + +static unsigned +panfrost_large_padded_vertex_count(uint32_t vertex_count) +{ + /* First, we have to find the highest set one */ + unsigned highest = 32 - __builtin_clz(vertex_count); + + /* Using that, we mask out the highest 4-bits */ + unsigned n = highest - 4; + unsigned nibble = (vertex_count >> n) & 0xF; + + /* Great, we have the nibble. Now we can just try possibilities. Note + * that we don't care about the bottom most bit in most cases, and we + * know the top bit must be 1 */ + + unsigned middle_two = (nibble >> 1) & 0x3; + + switch (middle_two) { + case 0b00: + if (nibble & 1) + return (1 << n) * 9; + else + return (1 << (n + 1)) * 5; + case 0b01: + return (1 << (n + 2)) * 3; + case 0b10: + return (1 << (n + 1)) * 7; + case 0b11: + return (1 << (n + 4)); + default: + return 0; /* unreachable */ + } +} + +unsigned +panfrost_padded_vertex_count(unsigned vertex_count) +{ + if (vertex_count < 20) + return panfrost_small_padded_vertex_count(vertex_count); + else + return panfrost_large_padded_vertex_count(vertex_count); +} + +/* The much, much more irritating case -- instancing is enabled. See + * panfrost_job.h for notes on how this works */ + +static unsigned +panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags) +{ + /* We have a NPOT divisor. Here's the fun one (multipling by + * the inverse and shifting) */ + + /* floor(log2(d)) */ + unsigned shift = util_logbase2(hw_divisor); + + /* m = ceil(2^(32 + shift) / d) */ + uint64_t shift_hi = 32 + shift; + uint64_t t = 1ll << shift_hi; + double t_f = t; + double hw_divisor_d = hw_divisor; + double m_f = ceil(t_f / hw_divisor_d); + unsigned m = m_f; + + /* Default case */ + uint32_t magic_divisor = m; + + /* e = 2^(shift + 32) % d */ + uint64_t e = t % hw_divisor; + + /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob + * seems to use a different condition */ + if (e <= (1ll << shift)) { + magic_divisor = m - 1; + *extra_flags = 1; + } + + /* Top flag implicitly set */ + assert(magic_divisor & (1u << 31)); + magic_divisor &= ~(1u << 31); + *o_shift = shift; + + return magic_divisor; +} + +unsigned +panfrost_vertex_instanced( + unsigned padded_count, + unsigned instance_shift, unsigned instance_odd, + unsigned divisor, + union mali_attr *attrs) +{ + /* Depending if there is an instance divisor or not, packing varies. + * When there is a divisor, the hardware-level divisor is actually the + * product of the instance divisor and the padded count */ + + unsigned hw_divisor = padded_count * divisor; + + if (divisor == 0) { + /* Per-vertex attributes use the MODULO mode. First, compute + * the modulus */ + + attrs->elements |= MALI_ATTR_MODULO; + attrs->shift = instance_shift; + attrs->extra_flags = instance_odd; + + return 1; + } else if (util_is_power_of_two_or_zero(hw_divisor)) { + /* If there is a divisor but the hardware divisor works out to + * a power of two (not terribly exceptional), we can use an + * easy path (just shifting) */ + + attrs->elements |= MALI_ATTR_POT_DIVIDE; + attrs->shift = __builtin_ctz(hw_divisor); + + return 1; + } else { + unsigned shift = 0, extra_flags = 0; + + attrs[1].magic_divisor = + panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); + + /* Upload to two different slots */ + + attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE; + attrs[0].shift = shift; + attrs[0].extra_flags = extra_flags; + + attrs[1].unk = 0x20; + attrs[1].zero = 0; + attrs[1].divisor = divisor; + + return 2; + } +} + +/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding, + * but the idea is the same */ + +void +panfrost_vertex_id( + unsigned padded_count, + union mali_attr *attr) +{ + /* We factor the padded count as shift/odd and that's it */ + + attr->elements = MALI_ATTR_VERTEXID; + attr->shift = __builtin_ctz(padded_count); + attr->extra_flags = padded_count >> (attr->shift + 1); + attr->stride = attr->size = 0; +} + +void +panfrost_instance_id( + unsigned padded_count, + union mali_attr *attr) +{ + attr->elements = MALI_ATTR_INSTANCEID; + attr->stride = attr->extra_flags = attr->size = 0; + + /* POT records have just a shift directly with an off-by-one for + * unclear reasons. NPOT records have a magic divisor smushed into the + * stride field (which is unused for these special records) */ + + if (util_is_power_of_two_or_zero(padded_count)) { + attr->shift = __builtin_ctz(padded_count) - 1; + } else { + unsigned shift = 0, flags = 0; + + attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags); + attr->shift = shift; + attr->extra_flags = flags; + } +} + diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_encoder.h mesa-20.0.8/src/panfrost/encoder/pan_encoder.h --- mesa-19.2.8/src/panfrost/encoder/pan_encoder.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_encoder.h 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#ifndef __PAN_ENCODER_H +#define __PAN_ENCODER_H + +#include +#include "panfrost-job.h" + +/* Invocation packing */ + +void +panfrost_pack_work_groups_compute( + struct mali_vertex_tiler_prefix *out, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z, + bool quirk_graphics); + +void +panfrost_pack_work_groups_fused( + struct mali_vertex_tiler_prefix *vertex, + struct mali_vertex_tiler_prefix *tiler, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z); + +/* Tiler structure size computation */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); + +unsigned +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy); + +/* Stack sizes */ + +unsigned +panfrost_get_stack_shift(unsigned stack_size); + +unsigned +panfrost_get_total_stack_size( + unsigned stack_shift, + unsigned threads_per_core, + unsigned core_count); + +/* Property queries */ + + +unsigned panfrost_query_gpu_version(int fd); +unsigned panfrost_query_core_count(int fd); +unsigned panfrost_query_thread_tls_alloc(int fd); + +const char * panfrost_model_name(unsigned gpu_id); + +/* Attributes / instancing */ + +unsigned +panfrost_padded_vertex_count(unsigned vertex_count); + +unsigned +panfrost_vertex_instanced( + unsigned padded_count, + unsigned instance_shift, unsigned instance_odd, + unsigned divisor, + union mali_attr *attrs); + +void panfrost_vertex_id(unsigned padded_count, union mali_attr *attr); +void panfrost_instance_id(unsigned padded_count, union mali_attr *attr); + +/* Samplers */ + +enum mali_func +panfrost_flip_compare_func(enum mali_func f); + +#endif diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_invocation.c mesa-20.0.8/src/panfrost/encoder/pan_invocation.c --- mesa-19.2.8/src/panfrost/encoder/pan_invocation.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_invocation.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + * + */ + +#include +#include "util/u_math.h" +#include "pan_encoder.h" + +/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex + * shaders, it turns out, are invoked with the same mechanism, with the triplet + * (1, vertex_count, instance_count). + * + * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet. + * + * Unfortunately, the packing for these triplet into the + * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The + * routines here exist to pack this */ + +void +panfrost_pack_work_groups_compute( + struct mali_vertex_tiler_prefix *out, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z, + bool quirk_graphics) +{ + uint32_t packed = 0; + + /* The values needing packing, in order, and the corresponding shifts. + * Indicies into shift are off-by-one to make the logic easier */ + + unsigned shifts[7] = { 0 }; + + unsigned values[6] = { + MALI_POSITIVE(size_x), + MALI_POSITIVE(size_y), + MALI_POSITIVE(size_z), + MALI_POSITIVE(num_x), + MALI_POSITIVE(num_y), + MALI_POSITIVE(num_z), + }; + + for (unsigned i = 0; i < 6; ++i) { + /* OR it in, shifting as required */ + packed |= (values[i] << shifts[i]); + + /* How many bits did we use? */ + unsigned bit_count = util_logbase2_ceil(values[i] + 1); + + /* Set the next shift accordingly */ + shifts[i + 1] = shifts[i] + bit_count; + } + + /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift + * = 32. This doesn't appear to matter to the hardware, but it's good + * to be bit-identical. */ + + if (quirk_graphics && (num_z <= 1)) + shifts[5] = 32; + + /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2, + * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL + * compute, it seems it might *always* be 2, but this is suspicious and + * needs further investigation. (I'm probably just using GL wrong). */ + + unsigned shift_2 = shifts[3]; + + if (quirk_graphics) + shift_2 = MAX2(shift_2, 2); + + /* Pack them in */ + uint32_t packed_shifts = + (shifts[1] << 0) | + (shifts[2] << 5) | + (shifts[3] << 10) | + (shifts[4] << 16) | + (shifts[5] << 22) | + (shift_2 << 28); + + /* Upload the packed bitfields */ + out->invocation_count = packed; + out->invocation_shifts = packed_shifts; + + /* TODO: Compute workgroups_x_shift_3 */ + out->workgroups_x_shift_3 = shift_2; +} + +/* Packs vertex/tiler descriptors simultaneously */ +void +panfrost_pack_work_groups_fused( + struct mali_vertex_tiler_prefix *vertex, + struct mali_vertex_tiler_prefix *tiler, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z) +{ + panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true); + + /* Copy results over */ + tiler->invocation_count = vertex->invocation_count; + tiler->invocation_shifts = vertex->invocation_shifts; + + /* Set special fields for each */ + vertex->workgroups_x_shift_3 = 5; + tiler->workgroups_x_shift_3 = 6; +} + diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_props.c mesa-20.0.8/src/panfrost/encoder/pan_props.c --- mesa-19.2.8/src/panfrost/encoder/pan_props.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_props.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include + +#include "util/u_math.h" +#include "util/macros.h" +#include "drm-uapi/panfrost_drm.h" +#include "pan_encoder.h" + +/* Abstraction over the raw drm_panfrost_get_param ioctl for fetching + * information about devices */ + +static __u64 +panfrost_query_raw( + int fd, + enum drm_panfrost_param param, + bool required, + unsigned default_value) +{ + struct drm_panfrost_get_param get_param = {0,}; + ASSERTED int ret; + + get_param.param = DRM_PANFROST_PARAM_GPU_PROD_ID; + ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); + + if (ret) { + assert(!required); + return default_value; + } + + return get_param.value; +} + +unsigned +panfrost_query_gpu_version(int fd) +{ + return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); +} + +unsigned +panfrost_query_core_count(int fd) +{ + /* On older kernels, worst-case to 16 cores */ + + unsigned mask = panfrost_query_raw(fd, + DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); + + return util_bitcount(mask); +} + +unsigned +panfrost_query_thread_tls_alloc(int fd) +{ + /* On older kernels, we worst-case to 1024 threads, the architectural + * maximum for Midgard */ + + return panfrost_query_raw(fd, + DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 1024); +} + +/* Given a GPU ID like 0x860, return a prettified model name */ + +const char * +panfrost_model_name(unsigned gpu_id) +{ + switch (gpu_id) { + case 0x600: return "Mali T600 (Panfrost)"; + case 0x620: return "Mali T620 (Panfrost)"; + case 0x720: return "Mali T720 (Panfrost)"; + case 0x820: return "Mali T820 (Panfrost)"; + case 0x830: return "Mali T830 (Panfrost)"; + case 0x750: return "Mali T760 (Panfrost)"; + case 0x860: return "Mali T860 (Panfrost)"; + case 0x880: return "Mali T880 (Panfrost)"; + default: + unreachable("Invalid GPU ID"); + } +} diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_sampler.c mesa-20.0.8/src/panfrost/encoder/pan_sampler.c --- mesa-19.2.8/src/panfrost/encoder/pan_sampler.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_sampler.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "pan_encoder.h" + +/* Sampler comparison functions are flipped in OpenGL from the hardware, so we + * need to be able to flip accordingly */ + +enum mali_func +panfrost_flip_compare_func(enum mali_func f) +{ + switch (f) { + case MALI_FUNC_LESS: + return MALI_FUNC_GREATER; + case MALI_FUNC_GREATER: + return MALI_FUNC_LESS; + case MALI_FUNC_LEQUAL: + return MALI_FUNC_GEQUAL; + case MALI_FUNC_GEQUAL: + return MALI_FUNC_LEQUAL; + default: + return f; + } +} diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_scratch.c mesa-20.0.8/src/panfrost/encoder/pan_scratch.c --- mesa-19.2.8/src/panfrost/encoder/pan_scratch.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_scratch.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include "util/u_math.h" +#include "pan_encoder.h" + +/* Midgard has a small register file, so shaders with high register pressure + * need to spill from the register file onto the stack. In addition to + * spilling, it is desireable to allocate temporary arrays on the stack (for + * instance because the register file does not support indirect access but the + * stack does). + * + * The stack is located in "Thread Local Storage", sometimes abbreviated TLS in + * the kernel source code. Thread local storage is allocated per-thread, + * per-core, so threads executing concurrently do not interfere with each + * other's stacks. On modern kernels, we may query + * DRM_PANFROST_PARAM_THREAD_TLS_ALLOC for the number of threads per core we + * must allocate for, and DRM_PANFROST_PARAM_SHADER_PRESENT for a bitmask of + * shader cores (so take a popcount of that mask for the number of shader + * cores). On older kernels that do not support querying these values, + * following kbase, we may use the worst-case value of 1024 threads for + * THREAD_TLS_ALLOC, and the worst-case value of 16 cores for Midgard per the + * "shader core count" column of the implementations table in + * https://en.wikipedia.org/wiki/Mali_%28GPU% [citation needed] + * + * Within a particular thread, there is stack allocated. If it is present, its + * size is a power-of-two, and it is at least 256 bytes. Stack is allocated + * with the framebuffer descriptor used for all shaders within a frame (note + * that they don't execute concurrently so it's fine). So, consider the maximum + * stack size used by any shader within a job, and then compute (where npot + * denotes the next power of two): + * + * allocated = npot(max(size, 256)) * (# of threads/core) * (# of cores) + * + * The size of Thread Local Storage is signaled to the GPU in a dedicated + * log_stack_size field. Since stack sizes are powers of two, it follows that + * stack_size is logarithmic. Consider some sample values: + * + * stack size | log_stack_size + * --------------------------- + * 256 | 4 + * 512 | 5 + * 1024 | 6 + * + * Noting that log2(256) = 8, we have the relation: + * + * stack_size <= 2^(log_stack_size + 4) + * + * Given the constraints about powers-of-two and the minimum of 256, we thus + * derive a formula for log_stack_size in terms of stack size (s): + * + * log_stack_size = ceil(log2(max(s, 256))) - 4 + * + * There are other valid characterisations of this formula, of course, but this + * is computationally simple, so good enough for our purposes. + */ + +/* Computes log_stack_size = ceil(log2(max(s, 256))) - 4 */ + +unsigned +panfrost_get_stack_shift(unsigned stack_size) +{ + return util_logbase2_ceil(MAX2(stack_size, 256)) - 4; +} + +/* Computes the aligned stack size given the shift and thread count */ + +unsigned +panfrost_get_total_stack_size( + unsigned stack_shift, + unsigned threads_per_core, + unsigned core_count) +{ + unsigned stack_size = 1 << (stack_shift + 4); + return stack_size * threads_per_core * core_count; +} diff -Nru mesa-19.2.8/src/panfrost/encoder/pan_tiler.c mesa-20.0.8/src/panfrost/encoder/pan_tiler.c --- mesa-19.2.8/src/panfrost/encoder/pan_tiler.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/encoder/pan_tiler.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,373 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include "util/u_math.h" +#include "util/macros.h" +#include "pan_encoder.h" + +/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. + * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. + * Then, a fixed-function hardware block (the tiler) consumes the gl_Position + * results. For each triangle specified, it marks each containing tile as + * containing that triangle. This set of "triangles per tile" form the "polygon + * list". Finally, the rasterization unit consumes the polygon list to invoke + * the fragment shader. + * + * In practice, it's a bit more complicated than this. On Midgard chips with an + * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical + * tile size, but Midgard features "hierarchical tiling", where power-of-two + * multiples of the base tile size can be used: hierarchy level 0 (16x16), + * level 1 (32x32), level 2 (64x64), per public information about Midgard's + * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice + * 128x128 is the largest usually used (though higher modes are enabled). The + * idea behind hierarchical tiling is to use low tiling levels for small + * triangles and high levels for large triangles, to minimize memory bandwidth + * and repeated fragment shader invocations (the former issue inherent to + * immediate-mode rendering and the latter common in traditional tilers). + * + * The tiler itself works by reading varyings in and writing a polygon list + * out. Unfortunately (for us), both of these buffers are managed in main + * memory; although they ideally will be cached, it is the drivers' + * responsibility to allocate these buffers. Varying buffer allocation is + * handled elsewhere, as it is not tiler specific; the real issue is allocating + * the polygon list. + * + * This is hard, because from the driver's perspective, we have no information + * about what geometry will actually look like on screen; that information is + * only gained from running the vertex shader. (Theoretically, we could run the + * vertex shaders in software as a prepass, or in hardware with transform + * feedback as a prepass, but either idea is ludicrous on so many levels). + * + * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list + * into three distinct pieces. First, the driver statically determines which + * tile hierarchy levels to use (more on that later). At this point, we know the + * framebuffer dimensions and all the possible tilings of the framebuffer, so + * we know exactly how many tiles exist across all hierarchy levels. The first + * piece of the polygon list is the header, which is exactly 8 bytes per tile, + * plus padding and a small 64-byte prologue. (If that doesn't remind you of + * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is + * the polygon list body, which seems to contain 512 bytes per tile, again + * across every level of the hierarchy. These two parts form the polygon list + * buffer. This buffer has a statically determinable size, approximately equal + * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus + * alignment / minimum restrictions / etc. + * + * The third piece is the easy one (for us): the tiler heap. In essence, the + * tiler heap is a gigantic slab that's as big as could possibly be necessary + * in the worst case imaginable. Just... a gigantic allocation that we give a + * start and end pointer to. What's the catch? The tiler heap is lazily + * allocated; that is, a huge amount of memory is _reserved_, but only a tiny + * bit is actually allocated upfront. The GPU just keeps using the + * unallocated-but-reserved portions as it goes along, generating page faults + * if it goes beyond the allocation, and then the kernel is instructed to + * expand the allocation on page fault (known in the vendor kernel as growable + * memory). This is quite a bit of bookkeeping of its own, but that task is + * pushed to kernel space and we can mostly ignore it here, just remembering to + * set the GROWABLE flag so the kernel actually uses this path rather than + * allocating a gigantic amount up front and burning a hole in RAM. + * + * As far as determining which hierarchy levels to use, the simple answer is + * that right now, we don't. In the tiler configuration fields (consistent from + * the earliest Midgard's SFBD through the latest Bifrost traces we have), + * there is a hierarchy_mask field, controlling which levels (tile sizes) are + * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to + * big tiles and small polygons to small tiles -- would be realized here as + * well. As long as there are polygons at all needing tiling, we always have to + * have big tiles available, in case there are big polygons. But we don't + * necessarily need small tiles available. Ideally, when there are small + * polygons, small tiles are enabled (to avoid waste from putting small + * triangles in the big tiles); when there are not, small tiles are disabled to + * avoid enabling more levels than necessary, which potentially costs in memory + * bandwidth / power / tiler performance. + * + * Of course, the driver has to figure this out statically. When tile + * hiearchies are actually established, this occurs by the tiler in + * fixed-function hardware, after the vertex shaders have run and there is + * sufficient information to figure out the size of triangles. The driver has + * no such luxury, again barring insane hacks like additionally running the + * vertex shaders in software or in hardware via transform feedback. Thus, for + * the driver, we need a heuristic approach. + * + * There are lots of heuristics to guess triangle size statically you could + * imagine, but one approach shines as particularly simple-stupid: assume all + * on-screen triangles are equal size and spread equidistantly throughout the + * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with + * it, then we see: + * + * Triangle Area = (Screen Area / # of triangles) + * = (Width * Height) / (# of triangles) + * + * Or if you prefer, we can also make a third CRAZY assumption that we only draw + * right triangles with edges parallel/perpendicular to the sides of the screen + * with no overdraw, forming a triangle grid across the screen: + * + * |--w--| + * _____ | + * | /| /| | + * |/_|/_| h + * | /| /| | + * |/_|/_| | + * + * Then you can use some middle school geometry and algebra to work out the + * triangle dimensions. I started working on this, but realised I didn't need + * to to make my point, but couldn't bare to erase that ASCII art. Anyway. + * + * POINT IS, by considering the ratio of screen area and triangle count, we can + * estimate the triangle size. For a small size, use small bins; for a large + * size, use large bins. Intuitively, this metric makes sense: when there are + * few triangles on a large screen, you're probably compositing a UI and + * therefore the triangles are large; when there are a lot of triangles on a + * small screen, you're probably rendering a 3D mesh and therefore the + * triangles are tiny. (Or better said -- there will be tiny triangles, even if + * there are also large triangles. There have to be unless you expect crazy + * overdraw. Generally, it's better to allow more small bin sizes than + * necessary than not allow enough.) + * + * From this heuristic (or whatever), we determine the minimum allowable tile + * size, and we use that to decide the hierarchy masking, selecting from the + * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice). + * + * Once we have that mask and the framebuffer dimensions, we can compute the + * size of the statically-sized polygon list structures, allocate them, and go! + * + * ----- + * + * On T720, T820, and T830, there is no support for hierarchical tiling. + * Instead, the hardware allows the driver to select the tile size dynamically + * on a per-framebuffer basis, including allowing rectangular/non-square tiles. + * Rules for tile size selection are as follows: + * + * - Dimensions must be powers-of-two. + * - The smallest tile is 16x16. + * - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix) + * - There must be no more than 64 tiles in either dimension. + * + * Within these constraints, the driver is free to pick a tile size according + * to some heuristic, similar to units with an advanced tiling unit. + * + * To pick a size without any heuristics, we may satisfy the constraints by + * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size + * constraint, consider: + * + * # of tiles < 64 + * ceil (fb / tile) < 64 + * (fb / tile) <= (64 - 1) + * tile <= fb / (64 - 1) <= next_power_of_two(fb / (64 - 1)) + * + * Hence we clamp up to align_pot(fb / (64 - 1)). + + * Extending to use a selection heuristic left for future work. + * + * Once the tile size (w, h) is chosen, we compute the hierarchy "mask": + * + * hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16) + * + * Of course with no hierarchical tiling, this is not a mask; it's just a field + * specifying the tile size. But I digress. + * + * We also compute the polgon list sizes (with framebuffer size W, H) as: + * + * full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h) + * offset = 8 * ceil(W / w) * ceil(H / h) + * + * It further appears necessary to round down offset to the nearest 0x200. + * Possibly we would also round down full_size to the nearest 0x200 but + * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's + * nothing to do. + */ + +/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */ + +#define MIN_TILE_SIZE 16 +#define MAX_TILE_SIZE 4096 + +/* Constants as shifts for easier power-of-two iteration */ + +#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE) +#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE) + +/* The hierarchy has a 64-byte prologue */ +#define PROLOGUE_SIZE 0x40 + +/* For each tile (across all hierarchy levels), there is 8 bytes of header */ +#define HEADER_BYTES_PER_TILE 0x8 + +/* Likewise, each tile per level has 512 bytes of body */ +#define FULL_BYTES_PER_TILE 0x200 + +/* If the width-x-height framebuffer is divided into tile_size-x-tile_size + * tiles, how many tiles are there? Rounding up in each direction. For the + * special case of tile_size=16, this aligns with the usual Midgard count. + * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum, + * because those care about the stride (not just the overall count) and only at + * a a fixed-tile size (not any of a number of power-of-twos) */ + +static unsigned +pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height) +{ + unsigned aligned_width = ALIGN_POT(width, tile_width); + unsigned aligned_height = ALIGN_POT(height, tile_height); + + unsigned tile_count_x = aligned_width / tile_width; + unsigned tile_count_y = aligned_height / tile_height; + + return tile_count_x * tile_count_y; +} + +/* For `masked_count` of the smallest tile sizes masked out, computes how the + * size of the polygon list header. We iterate the tile sizes (16x16 through + * 2048x2048). For each tile size, we figure out how many tiles there are at + * this hierarchy level and therefore many bytes this level is, leaving us with + * a byte count for each level. We then just sum up the byte counts across the + * levels to find a byte count for all levels. */ + +static unsigned +panfrost_hierarchy_size( + unsigned width, + unsigned height, + unsigned mask, + unsigned bytes_per_tile) +{ + unsigned size = PROLOGUE_SIZE; + + /* Iterate hierarchy levels */ + + for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { + /* Check if this level is enabled */ + if (!(mask & (1 << b))) + continue; + + /* Shift from a level to a tile size */ + unsigned tile_size = (1 << b) * MIN_TILE_SIZE; + + unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); + unsigned level_count = bytes_per_tile * tile_count; + + size += level_count; + } + + /* This size will be used as an offset, so ensure it's aligned */ + return ALIGN_POT(size, 0x200); +} + +/* Implement the formula: + * + * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h) + * + * rounding down the answer to the nearest 0x200. This is used to compute both + * header and body sizes for GPUs without hierarchical tiling. Essentially, + * computing a single hierarchy level, since there isn't any hierarchy! + */ + +static unsigned +panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile) +{ + /* First, extract the tile dimensions */ + + unsigned tw = (1 << (dim & 0b111)) * 8; + unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; + + /* tile_count is ceil(W/w) * ceil(H/h) */ + unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; + + /* Round down and add offset */ + return 0x200 + ((raw / 0x200) * 0x200); +} + +/* Given a hierarchy mask and a framebuffer size, compute the header size */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +{ + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); +} + +/* The combined header/body is sized similarly (but it is significantly + * larger), except that it can be empty when the tiler disabled, rather than + * getting clamped to a minimum size. + */ + +unsigned +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +{ + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); +} + +/* On GPUs without hierarchical tiling, we choose a tile size directly and + * stuff it into the field otherwise known as hierarchy mask (not a mask). */ + +static unsigned +panfrost_choose_tile_size( + unsigned width, unsigned height, unsigned vertex_count) +{ + /* Figure out the ideal tile size. Eventually a heuristic should be + * used for this */ + + unsigned best_w = 16; + unsigned best_h = 16; + + /* Clamp so there are less than 64 tiles in each direction */ + + best_w = MAX2(best_w, util_next_power_of_two(width / 63)); + best_h = MAX2(best_h, util_next_power_of_two(height / 63)); + + /* We have our ideal tile size, so encode */ + + unsigned exp_w = util_logbase2(best_w / 16); + unsigned exp_h = util_logbase2(best_h / 16); + + return exp_w | (exp_h << 6); +} + +/* In the future, a heuristic to choose a tiler hierarchy mask would go here. + * At the moment, we just default to 0xFF, which enables all possible hierarchy + * levels. Overall this yields good performance but presumably incurs a cost in + * memory bandwidth / power consumption / etc, at least on smaller scenes that + * don't really need all the smaller levels enabled */ + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy) +{ + /* If there is no geometry, we don't bother enabling anything */ + + if (!vertex_count) + return 0x00; + + if (!hierarchy) + return panfrost_choose_tile_size(width, height, vertex_count); + + /* Otherwise, default everything on. TODO: Proper tests */ + + return 0xFF; +} diff -Nru mesa-19.2.8/src/panfrost/include/panfrost-job.h mesa-20.0.8/src/panfrost/include/panfrost-job.h --- mesa-19.2.8/src/panfrost/include/panfrost-job.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/include/panfrost-job.h 2020-06-12 01:21:18.000000000 +0000 @@ -29,20 +29,13 @@ #define __PANFROST_JOB_H__ #include +#include #include -#define MALI_SHORT_PTR_BITS (sizeof(u64)*8) - -#define MALI_FBD_HIERARCHY_WEIGHTS 8 - -#define MALI_PAYLOAD_SIZE 256 - -typedef u32 mali_jd_core_req; - enum mali_job_type { JOB_NOT_STARTED = 0, JOB_TYPE_NULL = 1, - JOB_TYPE_SET_VALUE = 2, + JOB_TYPE_WRITE_VALUE = 2, JOB_TYPE_CACHE_FLUSH = 3, JOB_TYPE_COMPUTE = 4, JOB_TYPE_VERTEX = 5, @@ -70,7 +63,6 @@ /* Applies to tiler_gl_enables */ - #define MALI_OCCLUSION_QUERY (1 << 3) #define MALI_OCCLUSION_PRECISE (1 << 4) @@ -78,20 +70,12 @@ * In OpenGL, this would corresponds to glFrontFace(GL_CW). Mesa and the blob * disagree about how to do viewport flipping, so the blob actually sets this * for GL_CW but then has a negative viewport stride */ + #define MALI_FRONT_CCW_TOP (1 << 5) #define MALI_CULL_FACE_FRONT (1 << 6) #define MALI_CULL_FACE_BACK (1 << 7) -/* TODO: Might this actually be a finer bitfield? */ -#define MALI_DEPTH_STENCIL_ENABLE 0x6400 - -#define DS_ENABLE(field) \ - (field == MALI_DEPTH_STENCIL_ENABLE) \ - ? "MALI_DEPTH_STENCIL_ENABLE" \ - : (field == 0) ? "0" \ - : "0 /* XXX: Unknown, check hexdump */" - /* Used in stencil and depth tests */ enum mali_func { @@ -105,19 +89,6 @@ MALI_FUNC_ALWAYS = 7 }; -/* Same OpenGL, but mixed up. Why? Because forget me, that's why! */ - -enum mali_alt_func { - MALI_ALT_FUNC_NEVER = 0, - MALI_ALT_FUNC_GREATER = 1, - MALI_ALT_FUNC_EQUAL = 2, - MALI_ALT_FUNC_GEQUAL = 3, - MALI_ALT_FUNC_LESS = 4, - MALI_ALT_FUNC_NOTEQUAL = 5, - MALI_ALT_FUNC_LEQUAL = 6, - MALI_ALT_FUNC_ALWAYS = 7 -}; - /* Flags apply to unknown2_3? */ #define MALI_HAS_MSAA (1 << 0) @@ -131,7 +102,7 @@ #define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7) #define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7) -#define MALI_DEPTH_TEST (1 << 11) +#define MALI_DEPTH_WRITEMASK (1 << 11) /* Next flags to unknown2_4 */ #define MALI_STENCIL_TEST (1 << 0) @@ -258,6 +229,9 @@ /* The top 3 bits specify how the bits of each component are interpreted. */ +/* e.g. ETC2_RGB8 */ +#define MALI_FORMAT_COMPRESSED (0 << 5) + /* e.g. R11F_G11F_B10F */ #define MALI_FORMAT_SPECIAL (2 << 5) @@ -302,6 +276,16 @@ #define MALI_CHANNEL_FLOAT 7 enum mali_format { + MALI_ETC2_RGB8 = MALI_FORMAT_COMPRESSED | 0x1, + MALI_ETC2_R11_UNORM = MALI_FORMAT_COMPRESSED | 0x2, + MALI_ETC2_RGBA8 = MALI_FORMAT_COMPRESSED | 0x3, + MALI_ETC2_RG11_UNORM = MALI_FORMAT_COMPRESSED | 0x4, + MALI_ETC2_R11_SNORM = MALI_FORMAT_COMPRESSED | 0x11, + MALI_ETC2_RG11_SNORM = MALI_FORMAT_COMPRESSED | 0x12, + MALI_ETC2_RGB8A1 = MALI_FORMAT_COMPRESSED | 0x13, + MALI_ASTC_SRGB_SUPP = MALI_FORMAT_COMPRESSED | 0x16, + MALI_ASTC_HDR_SUPP = MALI_FORMAT_COMPRESSED | 0x17, + MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0, MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2, MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3, @@ -434,6 +418,14 @@ }; }; +/* We need to load the tilebuffer to blend (i.e. the destination factor is not + * ZERO) */ + +#define MALI_BLEND_LOAD_TIB (0x1) + +/* A blend shader is used to blend this render target */ +#define MALI_BLEND_MRT_SHADER (0x2) + /* On MRT Midgard systems (using an MFBD), each render target gets its own * blend descriptor */ @@ -445,8 +437,7 @@ struct midgard_blend_rt { /* Flags base value of 0x200 to enable the render target. * OR with 0x1 for blending (anything other than REPLACE). - * OR with 0x2 for programmable blending with 0-2 registers - * OR with 0x3 for programmable blending with 2+ registers + * OR with 0x2 for programmable blending * OR with MALI_BLEND_SRGB for implicit sRGB */ @@ -545,9 +536,7 @@ unsigned uniform_buffer_count : 4; unsigned flags : 12; - /* Whole number of uniform registers used, times two; - * whole number of work registers used (no scale). - */ + /* vec4 units */ unsigned work_count : 5; unsigned uniform_count : 5; unsigned unknown2 : 6; @@ -645,11 +634,7 @@ u16 job_index; u16 job_dependency_index_1; u16 job_dependency_index_2; - - union { - u64 next_job_64; - u32 next_job_32; - }; + u64 next_job; } __attribute__((packed)); /* These concern exception_status */ @@ -667,15 +652,17 @@ MALI_EXCEPTION_ACCESS_WRITE = 3 }; -struct mali_payload_set_value { - u64 out; - u64 unknown; -} __attribute__((packed)); +/* Details about write_value from panfrost igt tests which use it as a generic + * dword write primitive */ + +#define MALI_WRITE_VALUE_ZERO 3 -/* Special attributes have a fixed index */ -#define MALI_SPECIAL_ATTRIBUTE_BASE 16 -#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0) -#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1) +struct mali_payload_write_value { + u64 address; + u32 value_descriptor; + u32 reserved; + u64 immediate; +} __attribute__((packed)); /* * Mali Attributes @@ -804,9 +791,8 @@ * let shift=extra_flags=0. Stride is set to the image format's bytes-per-pixel * (*NOT the row stride*). Size is set to the size of the image itself. * - * Special internal varyings (including gl_FrontFacing) are handled vai - * MALI_ATTR_INTERNAL, which has all fields set to zero and uses a special - * elements pseudo-pointer. + * Special internal attribtues and varyings (gl_VertexID, gl_FrontFacing, etc) + * use particular fixed addresses with modified structures. */ enum mali_attr_mode { @@ -816,19 +802,26 @@ MALI_ATTR_MODULO = 3, MALI_ATTR_NPOT_DIVIDE = 4, MALI_ATTR_IMAGE = 5, - MALI_ATTR_INTERNAL = 6 }; -/* Pseudo-address for gl_FrontFacing */ +/* Pseudo-address for gl_VertexID, gl_FragCoord, gl_FrontFacing */ -#define MALI_VARYING_FRONT_FACING (0x20) +#define MALI_ATTR_VERTEXID (0x22) +#define MALI_ATTR_INSTANCEID (0x24) +#define MALI_VARYING_FRAG_COORD (0x25) +#define MALI_VARYING_FRONT_FACING (0x26) /* This magic "pseudo-address" is used as `elements` to implement * gl_PointCoord. When read from a fragment shader, it generates a point * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces * require an affine transformation in the shader. */ -#define MALI_VARYING_POINT_COORD (0x60) +#define MALI_VARYING_POINT_COORD (0x61) + +/* Used for comparison to check if an address is special. Mostly a guess, but + * it doesn't really matter. */ + +#define MALI_RECORD_SPECIAL (0x100) union mali_attr { /* This is used for actual attributes. */ @@ -868,14 +861,14 @@ int32_t src_offset; } __attribute__((packed)); -enum mali_fbd_type { - MALI_SFBD = 0, - MALI_MFBD = 1, -}; - -#define FBD_TYPE (1) #define FBD_MASK (~0x3f) +/* MFBD, rather than SFBD */ +#define MALI_MFBD (0x1) + +/* ORed into an MFBD address to specify the fbx section is included */ +#define MALI_MFBD_TAG_EXTRA (0x2) + struct mali_uniform_buffer_meta { /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16 * bytes. This gives a maximum of 2^14 bytes, which just so happens to @@ -901,7 +894,16 @@ #define MALI_DRAW_INDEXED_UINT8 (0x10) #define MALI_DRAW_INDEXED_UINT16 (0x20) #define MALI_DRAW_INDEXED_UINT32 (0x30) +#define MALI_DRAW_INDEXED_SIZE (0x30) +#define MALI_DRAW_INDEXED_SHIFT (4) + #define MALI_DRAW_VARYING_SIZE (0x100) + +/* Set to use first vertex as the provoking vertex for flatshading. Clear to + * use the last vertex. This is the default in DX and VK, but not in GL. */ + +#define MALI_DRAW_FLATSHADE_FIRST (0x800) + #define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000) struct mali_vertex_tiler_prefix { @@ -930,13 +932,16 @@ */ u32 invocation_count; - u32 size_y_shift : 5; - u32 size_z_shift : 5; - u32 workgroups_x_shift : 6; - u32 workgroups_y_shift : 6; - u32 workgroups_z_shift : 6; - /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */ - u32 workgroups_x_shift_2 : 4; + /* Bitfield for shifts: + * + * size_y_shift : 5 + * size_z_shift : 5 + * workgroups_x_shift : 6 + * workgroups_y_shift : 6 + * workgroups_z_shift : 6 + * workgroups_x_shift_2 : 4 + */ + u32 invocation_shifts; u32 draw_mode : 4; u32 unknown_draw : 22; @@ -1079,8 +1084,7 @@ u64 sampler_descriptor; u64 uniforms; - u8 flags : 4; - u64 _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */ + u64 shader; u64 attributes; /* struct attribute_buffer[] */ u64 attribute_meta; /* attribute_meta[] */ u64 varyings; /* struct attr */ @@ -1148,17 +1152,17 @@ #define MALI_POSITIVE(dim) (dim - 1) -/* Opposite of MALI_POSITIVE, found in the depth_units field */ - -#define MALI_NEGATIVE(dim) (dim + 1) - -/* Used with wrapping. Incomplete (this is a 4-bit field...) */ +/* Used with wrapping. Unclear what top bit conveys */ enum mali_wrap_mode { - MALI_WRAP_REPEAT = 0x8, - MALI_WRAP_CLAMP_TO_EDGE = 0x9, - MALI_WRAP_CLAMP_TO_BORDER = 0xB, - MALI_WRAP_MIRRORED_REPEAT = 0xC + MALI_WRAP_REPEAT = 0x8 | 0x0, + MALI_WRAP_CLAMP_TO_EDGE = 0x8 | 0x1, + MALI_WRAP_CLAMP = 0x8 | 0x2, + MALI_WRAP_CLAMP_TO_BORDER = 0x8 | 0x3, + MALI_WRAP_MIRRORED_REPEAT = 0x8 | 0x4 | 0x0, + MALI_WRAP_MIRRORED_CLAMP_TO_EDGE = 0x8 | 0x4 | 0x1, + MALI_WRAP_MIRRORED_CLAMP = 0x8 | 0x4 | 0x2, + MALI_WRAP_MIRRORED_CLAMP_TO_BORDER = 0x8 | 0x4 | 0x3, }; /* Shared across both command stream and Midgard, and even with Bifrost */ @@ -1179,10 +1183,21 @@ /* For each pointer, there is an address and optionally also a stride */ #define MAX_ELEMENTS (2) -/* Corresponds to the type passed to glTexImage2D and so forth */ +/* It's not known why there are 4-bits allocated -- this enum is almost + * certainly incomplete */ -/* Flags for usage2 */ -#define MALI_TEX_MANUAL_STRIDE (0x20) +enum mali_texture_layout { + /* For a Z/S texture, this is linear */ + MALI_TEXTURE_TILED = 0x1, + + /* Z/S textures cannot be tiled */ + MALI_TEXTURE_LINEAR = 0x2, + + /* 16x16 sparse */ + MALI_TEXTURE_AFBC = 0xC +}; + +/* Corresponds to the type passed to glTexImage2D and so forth */ struct mali_texture_format { unsigned swizzle : 12; @@ -1192,8 +1207,15 @@ unsigned unknown1 : 1; enum mali_texture_type type : 2; + enum mali_texture_layout layout : 4; + + /* Always set */ + unsigned unknown2 : 1; + + /* Set to allow packing an explicit stride */ + unsigned manual_stride : 1; - unsigned usage2 : 8; + unsigned zero : 2; } __attribute__((packed)); struct mali_texture_descriptor { @@ -1210,7 +1232,7 @@ uint8_t unknown3A; /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */ - uint8_t nr_mipmap_levels; + uint8_t levels; /* Swizzling is a single 32-bit word, broken up here for convenience. * Here, swizzling refers to the ES 3.0 texture parameters for channel @@ -1223,8 +1245,6 @@ uint32_t unknown5; uint32_t unknown6; uint32_t unknown7; - - mali_ptr payload[MAX_MIP_LEVELS * MAX_CUBE_FACES * MAX_ELEMENTS]; } __attribute__((packed)); /* filter_mode */ @@ -1247,33 +1267,36 @@ #define DECODE_FIXED_16(x) ((float) (x / 256.0)) -static inline uint16_t -FIXED_16(float x) +static inline int16_t +FIXED_16(float x, bool allow_negative) { /* Clamp inputs, accounting for float error */ float max_lod = (32.0 - (1.0 / 512.0)); + float min_lod = allow_negative ? -max_lod : 0.0; - x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 0.0 : x)); + x = ((x > max_lod) ? max_lod : ((x < min_lod) ? min_lod : x)); return (int) (x * 256.0); } struct mali_sampler_descriptor { - uint32_t filter_mode; - - /* Fixed point. Upper 8-bits is before the decimal point, although it - * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x * - * 256)) */ + uint16_t filter_mode; - uint16_t min_lod; - uint16_t max_lod; + /* Fixed point, signed. + * Upper 7 bits before the decimal point, although it caps [0-31]. + * Lower 8 bits after the decimal point: int(round(x * 256)) */ + + int16_t lod_bias; + int16_t min_lod; + int16_t max_lod; - /* All one word in reality, but packed a bit */ + /* All one word in reality, but packed a bit. Comparisons are flipped + * from OpenGL. */ enum mali_wrap_mode wrap_s : 4; enum mali_wrap_mode wrap_t : 4; enum mali_wrap_mode wrap_r : 4; - enum mali_alt_func compare_func : 3; + enum mali_func compare_func : 3; /* No effect on 2D textures. For cubemaps, set for ES3 and clear for * ES2, controlling seamless cubemapping */ @@ -1328,11 +1351,6 @@ #define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK) #define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16) -#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK)) - -/* No known flags yet, but just in case...? */ - -#define MALI_TILE_NO_FLAG (0) /* Helpers to generate tile coordinates based on the boundary coordinates in * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these @@ -1357,9 +1375,10 @@ /* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is * configured for 4x. With MSAA_8, it is configured for 8x. */ -#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3) -#define MALI_FRAMEBUFFER_MSAA_A (1 << 4) -#define MALI_FRAMEBUFFER_MSAA_B (1 << 23) +#define MALI_SFBD_FORMAT_MSAA_8 (1 << 3) +#define MALI_SFBD_FORMAT_MSAA_A (1 << 4) +#define MALI_SFBD_FORMAT_MSAA_B (1 << 4) +#define MALI_SFBD_FORMAT_SRGB (1 << 5) /* Fast/slow based on whether all three buffers are cleared at once */ @@ -1371,6 +1390,20 @@ * within the larget framebuffer descriptor). Analogous to * bifrost_tiler_heap_meta and bifrost_tiler_meta*/ +/* See pan_tiler.c for derivation */ +#define MALI_HIERARCHY_MASK ((1 << 9) - 1) + +/* Flag disabling the tiler for clear-only jobs, with + hierarchical tiling */ +#define MALI_TILER_DISABLED (1 << 12) + +/* Flag selecting userspace-generated polygon list, for clear-only jobs without + * hierarhical tiling. */ +#define MALI_TILER_USER 0xFFF + +/* Absent any geometry, the minimum size of the polygon list header */ +#define MALI_TILER_MINIMUM_HEADER_SIZE 0x200 + struct midgard_tiler_descriptor { /* Size of the entire polygon list; see pan_tiler.c for the * computation. It's based on hierarchical tiling */ @@ -1381,7 +1414,9 @@ * flagged here is less known. We do that (tiler_hierarchy_mask & 0x1ff) * specifies a mask of hierarchy weights, which explains some of the * performance mysteries around setting it. We also see the bottom bit - * of tiler_flags set in the kernel, but no comment why. */ + * of tiler_flags set in the kernel, but no comment why. + * + * hierarchy_mask can have the TILER_DISABLED flag */ u16 hierarchy_mask; u16 flags; @@ -1401,17 +1436,41 @@ u32 weights[8]; }; +enum mali_block_format { + MALI_BLOCK_TILED = 0x0, + MALI_BLOCK_UNKNOWN = 0x1, + MALI_BLOCK_LINEAR = 0x2, + MALI_BLOCK_AFBC = 0x3, +}; + +struct mali_sfbd_format { + /* 0x1 */ + unsigned unk1 : 6; + + /* mali_channel_swizzle */ + unsigned swizzle : 12; + + /* MALI_POSITIVE */ + unsigned nr_channels : 2; + + /* 0x4 */ + unsigned unk2 : 6; + + enum mali_block_format block : 2; + + /* 0xb */ + unsigned unk3 : 4; +}; + struct mali_single_framebuffer { u32 unknown1; u32 unknown2; - u64 unknown_address_0; + mali_ptr scratchpad; + u64 zero1; u64 zero0; - /* Exact format is ironically not known, since EGL is finnicky with the - * blob. MSAA, colourspace, etc are configured here. */ - - u32 format; + struct mali_sfbd_format format; u32 clear_flags; u32 zero2; @@ -1422,7 +1481,10 @@ u16 width; u16 height; - u32 zero3[8]; + u32 zero3[4]; + mali_ptr checksum; + u32 checksum_stride; + u32 zero5; /* By default, the framebuffer is upside down from OpenGL's * perspective. Set framebuffer to the end and negate the stride to @@ -1440,10 +1502,14 @@ * disabled. */ mali_ptr depth_buffer; // not SAME_VA - u64 depth_buffer_enable; + u32 depth_stride_zero : 4; + u32 depth_stride : 28; + u32 zero7; mali_ptr stencil_buffer; // not SAME_VA - u64 stencil_buffer_enable; + u32 stencil_stride_zero : 4; + u32 stencil_stride : 28; + u32 zero8; u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware u32 clear_color_2; // always equal, but unclear function? @@ -1478,13 +1544,6 @@ #define MALI_MFBD_FORMAT_MSAA (1 << 1) #define MALI_MFBD_FORMAT_SRGB (1 << 2) -enum mali_mfbd_block_format { - MALI_MFBD_BLOCK_TILED = 0x0, - MALI_MFBD_BLOCK_UNKNOWN = 0x1, - MALI_MFBD_BLOCK_LINEAR = 0x2, - MALI_MFBD_BLOCK_AFBC = 0x3, -}; - struct mali_rt_format { unsigned unk1 : 32; unsigned unk2 : 3; @@ -1492,7 +1551,7 @@ unsigned nr_channels : 2; /* MALI_POSITIVE */ unsigned unk3 : 5; - enum mali_mfbd_block_format block : 2; + enum mali_block_format block : 2; unsigned flags : 4; unsigned swizzle : 12; @@ -1514,27 +1573,21 @@ u64 zero1; - union { - struct { - /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled, - * there is an extra metadata buffer that contains 16 bytes per tile. - * The framebuffer needs to be the same size as before, since we don't - * know ahead of time how much space it will take up. The - * framebuffer_stride is set to 0, since the data isn't stored linearly - * anymore. - */ - - mali_ptr metadata; - u32 stride; // stride in units of tiles - u32 unk; // = 0x20000 - } afbc; - - struct { - /* Heck if I know */ - u64 unk; - mali_ptr pointer; - } chunknown; - }; + struct { + /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled, + * there is an extra metadata buffer that contains 16 bytes per tile. + * The framebuffer needs to be the same size as before, since we don't + * know ahead of time how much space it will take up. The + * framebuffer_stride is set to 0, since the data isn't stored linearly + * anymore. + * + * When AFBC is disabled, these fields are zero. + */ + + mali_ptr metadata; + u32 stride; // stride in units of tiles + u32 unk; // = 0x20000 + } afbc; mali_ptr framebuffer; @@ -1557,11 +1610,10 @@ * - TODO: Anything else? */ -/* Flags field: note, these are guesses */ +/* flags_hi */ +#define MALI_EXTRA_PRESENT (0x10) -#define MALI_EXTRA_PRESENT (0x400) -#define MALI_EXTRA_AFBC (0x20) -#define MALI_EXTRA_AFBC_ZS (0x10) +/* flags_lo */ #define MALI_EXTRA_ZS (0x4) struct bifrost_fb_extra { @@ -1569,7 +1621,9 @@ /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */ u32 checksum_stride; - u32 flags; + unsigned flags_lo : 4; + enum mali_block_format zs_block : 2; + unsigned flags_hi : 26; union { /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */ @@ -1613,7 +1667,8 @@ #define MALI_MFBD_EXTRA (1 << 13) struct bifrost_framebuffer { - u32 unk0; // = 0x10 + u32 stack_shift : 4; + u32 unk0 : 28; u32 unknown2; // = 0x1f, same as SFBD mali_ptr scratchpad; diff -Nru mesa-19.2.8/src/panfrost/include/panfrost-quirks.h mesa-20.0.8/src/panfrost/include/panfrost-quirks.h --- mesa-19.2.8/src/panfrost/include/panfrost-quirks.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/include/panfrost-quirks.h 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __PANFROST_QUIRKS_H +#define __PANFROST_QUIRKS_H + +/* Model-specific quirks requiring workarounds/etc. Quirks may be errata + * requiring a workaround, or features. We're trying to be quirk-positive + * here; quirky is the best! */ + +/* Whether the GPU lacks the capability for hierarchical tiling, without an + * "Advanced Tiling Unit", instead requiring a single bin size for the entire + * framebuffer be selected by the driver */ + +#define MIDGARD_NO_HIER_TILING (1 << 0) + +/* Whether this GPU lacks native multiple render target support and accordingly + * needs SFBDs instead, with complex lowering with ES3 */ + +#define MIDGARD_SFBD (1 << 1) + +static inline unsigned +panfrost_get_quirks(unsigned gpu_id) +{ + switch (gpu_id) { + case 0x600: + case 0x620: + return MIDGARD_SFBD; + + case 0x720: + return MIDGARD_SFBD | MIDGARD_NO_HIER_TILING; + + case 0x820: + case 0x830: + return MIDGARD_NO_HIER_TILING; + + case 0x750: + case 0x860: + case 0x880: + return 0; + + default: + unreachable("Invalid Midgard GPU ID"); + } +} + +#endif diff -Nru mesa-19.2.8/src/panfrost/Makefile.sources mesa-20.0.8/src/panfrost/Makefile.sources --- mesa-19.2.8/src/panfrost/Makefile.sources 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/Makefile.sources 2020-06-12 01:21:18.000000000 +0000 @@ -1,3 +1,67 @@ +bifrost_FILES := \ + bifrost/bifrost_compile.c \ + bifrost/bifrost_compile.h \ + bifrost/bifrost.h \ + bifrost/bifrost_ops.h \ + bifrost/bifrost_opts.c \ + bifrost/bifrost_opts.h \ + bifrost/bifrost_print.c \ + bifrost/bifrost_print.h \ + bifrost/bifrost_sched.c \ + bifrost/bifrost_sched.h \ + bifrost/cmdline.c \ + bifrost/compiler_defines.h \ + bifrost/disassemble.c \ + bifrost/disassemble.h + +encoder_FILES := \ + encoder/pan_attributes.c \ + encoder/pan_encoder.h \ + encoder/pan_invocation.c \ + encoder/pan_props.c \ + encoder/pan_sampler.c \ + encoder/pan_tiler.c \ + encoder/pan_scratch.c + +midgard_FILES := \ + midgard/compiler.h \ + midgard/disassemble.c \ + midgard/disassemble.h \ + midgard/helpers.h \ + midgard/midgard_compile.c \ + midgard/midgard_compile.h \ + midgard/midgard_derivatives.c \ + midgard/midgard_emit.c \ + midgard/midgard.h \ + midgard/midgard_liveness.c \ + midgard/midgard_nir.h \ + midgard/midgard_ops.c \ + midgard/midgard_ops.h \ + midgard/midgard_opt_copy_prop.c \ + midgard/midgard_opt_dce.c \ + midgard/midgard_opt_float.c \ + midgard/midgard_opt_invert.c \ + midgard/midgard_opt_perspective.c \ + midgard/midgard-parse.h \ + midgard/midgard_print.c \ + midgard/midgard_ra.c \ + midgard/midgard_ra_pipeline.c \ + midgard/midgard_schedule.c \ + midgard/midgard_errata_lod.c \ + midgard/mir.c \ + midgard/mir_promote_uniforms.c \ + midgard/mir_squeeze.c \ + midgard/nir_undef_to_zero.c \ + midgard/lcra.c + shared_FILES := \ shared/pan_tiling.c \ shared/pan_tiling.h + +pandecode_FILES := \ + pandecode/common.c \ + pandecode/decode.c \ + pandecode/decode.h \ + pandecode/pan_pretty_print.c \ + pandecode/pan_pretty_print.h \ + pandecode/public.h diff -Nru mesa-19.2.8/src/panfrost/meson.build mesa-20.0.8/src/panfrost/meson.build --- mesa-19.2.8/src/panfrost/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -24,13 +24,14 @@ ]) inc_panfrost = include_directories([ - '.', 'include', 'shared', 'midgard', 'bifrost' + '.', 'include', 'shared', 'midgard', 'bifrost', 'encoder' ]) subdir('shared') subdir('midgard') subdir('bifrost') subdir('pandecode') +subdir('encoder') files_bifrost = files( 'bifrost/cmdline.c', @@ -53,5 +54,5 @@ libglsl_standalone, libpanfrost_bifrost ], - build_by_default : true + build_by_default : with_tools.contains('panfrost') ) diff -Nru mesa-19.2.8/src/panfrost/midgard/compiler.h mesa-20.0.8/src/panfrost/midgard/compiler.h --- mesa-19.2.8/src/panfrost/midgard/compiler.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/compiler.h 2020-06-12 01:21:18.000000000 +0000 @@ -27,6 +27,8 @@ #include "midgard.h" #include "helpers.h" #include "midgard_compile.h" +#include "midgard_ops.h" +#include "lcra.h" #include "util/hash_table.h" #include "util/u_dynarray.h" @@ -67,16 +69,6 @@ }; } midgard_branch; -/* Instruction arguments represented as block-local SSA indices, rather than - * registers. Negative values mean unused. */ - -typedef struct { - int src[3]; - int dest; - - bool inline_constant; -} ssa_args; - /* Generic in-memory data type repesenting a single logical instruction, rather * than a single instruction group. This is the preferred form for code gen. * Multiple midgard_insturctions will later be combined during scheduling, @@ -87,14 +79,22 @@ * emitted before the register allocation pass. */ +#define MIR_SRC_COUNT 4 +#define MIR_VEC_COMPONENTS 16 + typedef struct midgard_instruction { /* Must be first for casting */ struct list_head link; unsigned type; /* ALU, load/store, texture */ - /* If the register allocator has not run yet... */ - ssa_args ssa_args; + /* Instruction arguments represented as block-local SSA + * indices, rather than registers. ~0 means unused. */ + unsigned src[MIR_SRC_COUNT]; + unsigned dest; + + /* vec16 swizzle, unpacked, per source */ + unsigned swizzle[MIR_SRC_COUNT][MIR_VEC_COMPONENTS]; /* Special fields for an ALU instruction */ midgard_reg_info registers; @@ -102,20 +102,15 @@ /* I.e. (1 << alu_bit) */ int unit; - /* When emitting bundle, should this instruction have a break forced - * before it? Used for r31 writes which are valid only within a single - * bundle and *need* to happen as early as possible... this is a hack, - * TODO remove when we have a scheduler */ - bool precede_break; - bool has_constants; - float constants[4]; + midgard_constants constants; uint16_t inline_constant; bool has_blend_constant; + bool has_inline_constant; bool compact_branch; bool writeout; - bool prepacked_branch; + bool last_writeout; /* Kind of a hack, but hint against aggressive DCE */ bool dont_eliminate; @@ -127,19 +122,33 @@ uint16_t mask; /* For ALU ops only: set to true to invert (bitwise NOT) the - * destination of an integer-out op. Not imeplemented in hardware but + * destination of an integer-out op. Not implemented in hardware but * allows more optimizations */ bool invert; /* Hint for the register allocator not to spill the destination written - * from this instruction (because it is a spill/unspill node itself) */ + * from this instruction (because it is a spill/unspill node itself). + * Bitmask of spilled classes */ - bool no_spill; + unsigned no_spill; /* Generic hint for intra-pass use */ bool hint; + /* During scheduling, the backwards dependency graph + * (DAG). nr_dependencies is the number of unscheduled + * instructions that must still be scheduled after + * (before) this instruction. dependents are which + * instructions need to be scheduled before (after) this + * instruction. */ + + unsigned nr_dependencies; + BITSET_WORD *dependents; + + /* For load/store ops.. force 64-bit destination */ + bool load_64; + union { midgard_load_store_word load_store; midgard_vector_alu alu; @@ -187,39 +196,42 @@ /* In liveness analysis, these are live masks (per-component) for * indices for the block. Scalar compilers have the luxury of using - * simple bit fields, but for us, liveness is a vector idea. We use - * 8-bit to allow finegrained tracking up to vec8. If you're - * implementing vec16 on Panfrost... I'm sorry. */ - uint8_t *live_in; - uint8_t *live_out; + * simple bit fields, but for us, liveness is a vector idea. */ + uint16_t *live_in; + uint16_t *live_out; + + /* Indicates this is a fixed-function fragment epilogue block */ + bool epilogue; } midgard_block; typedef struct midgard_bundle { /* Tag for the overall bundle */ int tag; - /* Instructions contained by the bundle */ + /* Instructions contained by the bundle. instruction_count <= 6 (vmul, + * sadd, vadd, smul, vlut, branch) */ int instruction_count; - midgard_instruction *instructions[5]; + midgard_instruction *instructions[6]; /* Bundle-wide ALU configuration */ int padding; int control; bool has_embedded_constants; - float constants[4]; + midgard_constants constants; bool has_blend_constant; + bool last_writeout; } midgard_bundle; typedef struct compiler_context { nir_shader *nir; gl_shader_stage stage; - /* The screen we correspond to */ - struct midgard_screen *screen; - /* Is internally a blend shader? Depends on stage == FRAGMENT */ bool is_blend; + /* Render target number for a keyed blend shader. Depends on is_blend */ + unsigned blend_rt; + /* Tracking for blend constant patching */ int blend_constant_offset; @@ -272,9 +284,6 @@ * Decrease when a tex op is removed. */ int texture_op_count; - /* Mapping of texture register -> SSA index for unaliasing */ - int texture_index[2]; - /* The number of uniforms allowable for the fast path */ int uniform_cutoff; @@ -284,20 +293,34 @@ /* Alpha ref value passed in */ float alpha_ref; + unsigned quadword_count; + /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ unsigned sysvals[MAX_SYSVAL_COUNT]; unsigned sysval_count; struct hash_table_u64 *sysval_to_id; + + /* Bitmask of valid metadata */ + unsigned metadata; + + /* Model-specific quirk set */ + uint32_t quirks; + + /* Writeout instructions for each render target */ + midgard_instruction *writeout_branch[4]; } compiler_context; +/* Per-block live_in/live_out */ +#define MIDGARD_METADATA_LIVENESS (1 << 0) + /* Helpers for manipulating the above structures (forming the driver IR) */ /* Append instruction to end of current block */ static inline midgard_instruction * -mir_upload_ins(struct midgard_instruction ins) +mir_upload_ins(struct compiler_context *ctx, struct midgard_instruction ins) { - midgard_instruction *heap = malloc(sizeof(ins)); + midgard_instruction *heap = ralloc(ctx, struct midgard_instruction); memcpy(heap, &ins, sizeof(ins)); return heap; } @@ -305,15 +328,17 @@ static inline midgard_instruction * emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) { - midgard_instruction *u = mir_upload_ins(ins); + midgard_instruction *u = mir_upload_ins(ctx, ins); list_addtail(&u->link, &ctx->current_block->instructions); return u; } static inline struct midgard_instruction * -mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins) +mir_insert_instruction_before(struct compiler_context *ctx, + struct midgard_instruction *tag, + struct midgard_instruction ins) { - struct midgard_instruction *u = mir_upload_ins(ins); + struct midgard_instruction *u = mir_upload_ins(ctx, ins); list_addtail(&u->link, &tag->link); return u; } @@ -368,6 +393,17 @@ #define mir_foreach_bundle_in_block(block, v) \ util_dynarray_foreach(&block->bundles, midgard_bundle, v) +#define mir_foreach_bundle_in_block_rev(block, v) \ + util_dynarray_foreach_reverse(&block->bundles, midgard_bundle, v) + +#define mir_foreach_instr_in_block_scheduled_rev(block, v) \ + midgard_instruction* v; \ + signed i = 0; \ + mir_foreach_bundle_in_block_rev(block, _bundle) \ + for (i = (_bundle->instruction_count - 1), v = _bundle->instructions[i]; \ + i >= 0; \ + --i, v = (i >= 0) ? _bundle->instructions[i] : NULL) \ + #define mir_foreach_instr_global(ctx, v) \ mir_foreach_block(ctx, v_block) \ mir_foreach_instr_in_block(v_block, v) @@ -396,7 +432,7 @@ v = (struct midgard_block *) (_entry_##v ? _entry_##v->key : NULL)) #define mir_foreach_src(ins, v) \ - for (unsigned v = 0; v < ARRAY_SIZE(ins->ssa_args.src); ++v) + for (unsigned v = 0; v < ARRAY_SIZE(ins->src); ++v) static inline midgard_instruction * mir_last_in_block(struct midgard_block *block) @@ -421,8 +457,9 @@ midgard_block *last = list_last_entry(&ctx->blocks, struct midgard_block, link); - /* The last block must be empty (the exit block) */ - assert(list_empty(&last->instructions)); + /* The last block must be empty logically but contains branch writeout + * for fragment shaders */ + assert(last->nr_successors == 0); return last; @@ -431,7 +468,7 @@ static inline bool mir_is_alu_bundle(midgard_bundle *bundle) { - return IS_ALU(bundle->tag); + return midgard_word_types[bundle->tag] == midgard_word_type_alu; } /* Registers/SSA are distinguish in the backend by the bottom-most bit */ @@ -485,17 +522,24 @@ void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new); void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new); void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new); -void mir_rewrite_index_dst_tag(compiler_context *ctx, unsigned old, unsigned new, unsigned tag); void mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, unsigned new); void mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, unsigned new); -void mir_rewrite_index_src_tag(compiler_context *ctx, unsigned old, unsigned new, unsigned tag); -void mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned swizzle); +void mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned *swizzle); bool mir_single_use(compiler_context *ctx, unsigned value); bool mir_special_index(compiler_context *ctx, unsigned idx); unsigned mir_use_count(compiler_context *ctx, unsigned value); bool mir_is_written_before(compiler_context *ctx, midgard_instruction *ins, unsigned node); -unsigned mir_mask_of_read_components(midgard_instruction *ins, unsigned node); -unsigned mir_ubo_shift(midgard_load_store_op op); +uint16_t mir_bytemask_of_read_components(midgard_instruction *ins, unsigned node); +midgard_reg_mode mir_typesize(midgard_instruction *ins); +midgard_reg_mode mir_srcsize(midgard_instruction *ins, unsigned i); +unsigned mir_bytes_for_mode(midgard_reg_mode mode); +midgard_reg_mode mir_mode_for_destsize(unsigned size); +uint16_t mir_from_bytemask(uint16_t bytemask, midgard_reg_mode mode); +uint16_t mir_to_bytemask(midgard_reg_mode mode, unsigned mask); +uint16_t mir_bytemask(midgard_instruction *ins); +uint16_t mir_round_bytemask_up(uint16_t mask, midgard_reg_mode mode); +void mir_set_bytemask(midgard_instruction *ins, uint16_t bytemask); +unsigned mir_upper_override(midgard_instruction *ins); /* MIR printing */ @@ -505,56 +549,99 @@ void mir_print_shader(compiler_context *ctx); bool mir_nontrivial_source2_mod(midgard_instruction *ins); bool mir_nontrivial_source2_mod_simple(midgard_instruction *ins); -bool mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask); bool mir_nontrivial_outmod(midgard_instruction *ins); -/* MIR goodies */ - -static const midgard_vector_alu_src blank_alu_src = { - .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), -}; - -static const midgard_vector_alu_src blank_alu_src_xxxx = { - .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X), -}; - -static const midgard_scalar_alu_src blank_scalar_alu_src = { - .full = true -}; - -/* Used for encoding the unused source of 1-op instructions */ -static const midgard_vector_alu_src zero_alu_src = { 0 }; +void mir_insert_instruction_before_scheduled(compiler_context *ctx, midgard_block *block, midgard_instruction *tag, midgard_instruction ins); +void mir_insert_instruction_after_scheduled(compiler_context *ctx, midgard_block *block, midgard_instruction *tag, midgard_instruction ins); +void mir_flip(midgard_instruction *ins); +void mir_compute_temp_count(compiler_context *ctx); /* 'Intrinsic' move for aliasing */ static inline midgard_instruction -v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest) +v_mov(unsigned src, unsigned dest) { midgard_instruction ins = { .type = TAG_ALU_4, .mask = 0xF, - .ssa_args = { - .src = { SSA_UNUSED_1, src, -1 }, - .dest = dest, - }, + .src = { ~0, src, ~0, ~0 }, + .swizzle = SWIZZLE_IDENTITY, + .dest = dest, .alu = { .op = midgard_alu_op_imov, .reg_mode = midgard_reg_mode_32, .dest_override = midgard_dest_override_none, - .outmod = midgard_outmod_int_wrap, - .src1 = vector_alu_srco_unsigned(zero_alu_src), - .src2 = vector_alu_srco_unsigned(mod) + .outmod = midgard_outmod_int_wrap }, }; return ins; } +/* Broad types of register classes so we can handle special + * registers */ + +#define REG_CLASS_WORK 0 +#define REG_CLASS_LDST 1 +#define REG_CLASS_TEXR 3 +#define REG_CLASS_TEXW 4 + +/* Like a move, but to thread local storage! */ + +static inline midgard_instruction +v_load_store_scratch( + unsigned srcdest, + unsigned index, + bool is_store, + unsigned mask) +{ + /* We index by 32-bit vec4s */ + unsigned byte = (index * 4 * 4); + + midgard_instruction ins = { + .type = TAG_LOAD_STORE_4, + .mask = mask, + .dest = ~0, + .src = { ~0, ~0, ~0, ~0 }, + .swizzle = SWIZZLE_IDENTITY_4, + .load_store = { + .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4, + + /* For register spilling - to thread local storage */ + .arg_1 = 0xEA, + .arg_2 = 0x1E, + }, + + /* If we spill an unspill, RA goes into an infinite loop */ + .no_spill = (1 << REG_CLASS_WORK) + }; + + ins.constants.u32[0] = byte; + + if (is_store) { + ins.src[0] = srcdest; + + /* Ensure we are tightly swizzled so liveness analysis is + * correct */ + + for (unsigned i = 0; i < 4; ++i) { + if (!(mask & (1 << i))) + ins.swizzle[0][i] = COMPONENT_X; + } + } else + ins.dest = srcdest; + + return ins; +} + static inline bool mir_has_arg(midgard_instruction *ins, unsigned arg) { - for (unsigned i = 0; i < ARRAY_SIZE(ins->ssa_args.src); ++i) { - if (ins->ssa_args.src[i] == arg) + if (!ins) + return false; + + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) { + if (ins->src[i] == arg) return true; } @@ -563,42 +650,18 @@ /* Scheduling */ -void schedule_program(compiler_context *ctx); - -/* Register allocation */ - -struct ra_graph; - -/* Broad types of register classes so we can handle special - * registers */ - -#define NR_REG_CLASSES 5 - -#define REG_CLASS_WORK 0 -#define REG_CLASS_LDST 1 -#define REG_CLASS_LDST27 2 -#define REG_CLASS_TEXR 3 -#define REG_CLASS_TEXW 4 +void midgard_schedule_program(compiler_context *ctx); +void mir_ra(compiler_context *ctx); +void mir_squeeze_index(compiler_context *ctx); void mir_lower_special_reads(compiler_context *ctx); -struct ra_graph* allocate_registers(compiler_context *ctx, bool *spilled); -void install_registers(compiler_context *ctx, struct ra_graph *g); +void mir_liveness_ins_update(uint16_t *live, midgard_instruction *ins, unsigned max); +void mir_compute_liveness(compiler_context *ctx); +void mir_invalidate_liveness(compiler_context *ctx); bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); -bool mir_has_multiple_writes(compiler_context *ctx, int src); void mir_create_pipeline_registers(compiler_context *ctx); - -void -midgard_promote_uniforms(compiler_context *ctx, unsigned promoted_count); - -midgard_instruction * -emit_ubo_read( - compiler_context *ctx, - nir_instr *instr, - unsigned dest, - unsigned offset, - nir_src *indirect_offset, - unsigned index); +void midgard_promote_uniforms(compiler_context *ctx); void emit_sysval_read(compiler_context *ctx, nir_instr *instr, signed dest_override, unsigned nr_components); @@ -609,7 +672,7 @@ void midgard_lower_derivatives(compiler_context *ctx, midgard_block *block); -bool mir_op_computes_derivatives(unsigned op); +bool mir_op_computes_derivatives(gl_shader_stage stage, unsigned op); /* Final emission */ @@ -619,13 +682,10 @@ struct util_dynarray *emission, int next_tag); -/* NIR stuff. TODO: Move? Share? Something? */ - bool nir_undef_to_zero(nir_shader *shader); -void -nir_clamp_psiz(nir_shader *shader, float min_size, float max_size); +void midgard_nir_lod_errata(nir_shader *shader); /* Optimizations */ @@ -634,11 +694,14 @@ bool midgard_opt_varying_projection(compiler_context *ctx, midgard_block *block); bool midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block); bool midgard_opt_dead_move_eliminate(compiler_context *ctx, midgard_block *block); -void midgard_opt_post_move_eliminate(compiler_context *ctx, midgard_block *block, struct ra_graph *g); void midgard_lower_invert(compiler_context *ctx, midgard_block *block); bool midgard_opt_not_propagate(compiler_context *ctx, midgard_block *block); bool midgard_opt_fuse_src_invert(compiler_context *ctx, midgard_block *block); bool midgard_opt_fuse_dest_invert(compiler_context *ctx, midgard_block *block); +bool midgard_opt_csel_invert(compiler_context *ctx, midgard_block *block); +bool midgard_opt_promote_fmov(compiler_context *ctx, midgard_block *block); +bool midgard_opt_drop_cmp_invert(compiler_context *ctx, midgard_block *block); +bool midgard_opt_invert_branch(compiler_context *ctx, midgard_block *block); #endif diff -Nru mesa-19.2.8/src/panfrost/midgard/cppwrap.cpp mesa-20.0.8/src/panfrost/midgard/cppwrap.cpp --- mesa-19.2.8/src/panfrost/midgard/cppwrap.cpp 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/cppwrap.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -struct exec_list; - -bool do_mat_op_to_vec(struct exec_list *instructions); - -extern "C" { - bool c_do_mat_op_to_vec(struct exec_list *instructions) { - return do_mat_op_to_vec(instructions); - } -}; diff -Nru mesa-19.2.8/src/panfrost/midgard/disassemble.c mesa-20.0.8/src/panfrost/midgard/disassemble.c --- mesa-19.2.8/src/panfrost/midgard/disassemble.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/disassemble.c 2020-06-12 01:21:18.000000000 +0000 @@ -26,74 +26,77 @@ #include #include +#include #include #include #include #include #include "midgard.h" -#include "midgard-parse.h" #include "midgard_ops.h" +#include "midgard_quirks.h" #include "disassemble.h" #include "helpers.h" +#include "util/bitscan.h" #include "util/half_float.h" #include "util/u_math.h" -#define DEFINE_CASE(define, str) case define: { printf(str); break; } +#define DEFINE_CASE(define, str) case define: { fprintf(fp, str); break; } +static unsigned *midg_tags; static bool is_instruction_int = false; /* Stats */ -static unsigned nr_ins = 0; +static struct midgard_disasm_stats midg_stats; /* Prints a short form of the tag for branching, the minimum needed to be * legible and unambiguous */ static void -print_tag_short(unsigned tag) +print_tag_short(FILE *fp, unsigned tag) { switch (midgard_word_types[tag]) { case midgard_word_type_texture: - printf("tex/%X", tag); + fprintf(fp, "tex/%X", tag); break; case midgard_word_type_load_store: - printf("ldst"); + fprintf(fp, "ldst"); break; case midgard_word_type_alu: - printf("alu%d/%X", midgard_word_size[tag], tag); + fprintf(fp, "alu%u/%X", midgard_word_size[tag], tag); break; default: - printf("%s%X", (tag > 0) ? "" : "unk", tag); + fprintf(fp, "%s%X", (tag > 0) ? "" : "unk", tag); break; } } static void -print_alu_opcode(midgard_alu_op op) +print_alu_opcode(FILE *fp, midgard_alu_op op) { bool int_op = false; if (alu_opcode_props[op].name) { - printf("%s", alu_opcode_props[op].name); + fprintf(fp, "%s", alu_opcode_props[op].name); int_op = midgard_is_integer_op(op); } else - printf("alu_op_%02X", op); + fprintf(fp, "alu_op_%02X", op); /* For constant analysis */ is_instruction_int = int_op; } static void -print_ld_st_opcode(midgard_load_store_op op) +print_ld_st_opcode(FILE *fp, midgard_load_store_op op) { - if (load_store_opcode_names[op]) - printf("%s", load_store_opcode_names[op]); + if (load_store_opcode_props[op].name) + fprintf(fp, "%s", load_store_opcode_props[op].name); else - printf("ldst_op_%02X", op); + fprintf(fp, "ldst_op_%02X", op); } static bool is_embedded_constant_half = false; @@ -114,8 +117,14 @@ } } +/* For static analysis to ensure all registers are written at least once before + * use along the source code path (TODO: does this break done for complex CF?) + */ + +uint16_t midg_ever_written = 0; + static void -print_reg(unsigned reg, unsigned bits) +print_reg(FILE *fp, unsigned reg, unsigned bits) { /* Perform basic static analysis for expanding constants correctly */ @@ -124,12 +133,33 @@ is_embedded_constant_half = (bits < 32); } + unsigned uniform_reg = 23 - reg; + bool is_uniform = false; + + /* For r8-r15, it could be a work or uniform. We distinguish based on + * the fact work registers are ALWAYS written before use, but uniform + * registers are NEVER written before use. */ + + if ((reg >= 8 && reg < 16) && !(midg_ever_written & (1 << reg))) + is_uniform = true; + + /* r16-r23 are always uniform */ + + if (reg >= 16 && reg <= 23) + is_uniform = true; + + /* Update the uniform count appropriately */ + + if (is_uniform) + midg_stats.uniform_count = + MAX2(uniform_reg + 1, midg_stats.uniform_count); + char prefix = prefix_for_bits(bits); if (prefix) - putchar(prefix); + fputc(prefix, fp); - printf("r%u", reg); + fprintf(fp, "r%u", reg); } static char *outmod_names_float[4] = { @@ -154,103 +184,103 @@ }; static void -print_outmod(unsigned outmod, bool is_int) +print_outmod(FILE *fp, unsigned outmod, bool is_int) { - printf("%s", is_int ? outmod_names_int[outmod] : + fprintf(fp, "%s", is_int ? outmod_names_int[outmod] : outmod_names_float[outmod]); } static void -print_quad_word(uint32_t *words, unsigned tabs) +print_quad_word(FILE *fp, uint32_t *words, unsigned tabs) { unsigned i; for (i = 0; i < 4; i++) - printf("0x%08X%s ", words[i], i == 3 ? "" : ","); + fprintf(fp, "0x%08X%s ", words[i], i == 3 ? "" : ","); - printf("\n"); + fprintf(fp, "\n"); } static const char components[16] = "xyzwefghijklmnop"; /* Helper to print 4 chars of a swizzle */ static void -print_swizzle_helper(unsigned swizzle, bool upper) +print_swizzle_helper(FILE *fp, unsigned swizzle, bool upper) { for (unsigned i = 0; i < 4; ++i) { unsigned c = (swizzle >> (i * 2)) & 3; c += upper*4; - printf("%c", components[c]); + fprintf(fp, "%c", components[c]); } } /* Helper to print 8 chars of a swizzle, duplicating over */ static void -print_swizzle_helper_8(unsigned swizzle, bool upper) +print_swizzle_helper_8(FILE *fp, unsigned swizzle, bool upper) { for (unsigned i = 0; i < 4; ++i) { unsigned c = (swizzle >> (i * 2)) & 3; c *= 2; c += upper*8; - printf("%c%c", components[c], components[c+1]); + fprintf(fp, "%c%c", components[c], components[c+1]); } } static void -print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low, +print_swizzle_vec16(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low, midgard_dest_override override) { - printf("."); + fprintf(fp, "."); if (override == midgard_dest_override_upper) { if (rep_high) - printf(" /* rep_high */ "); + fprintf(fp, " /* rep_high */ "); if (rep_low) - printf(" /* rep_low */ "); + fprintf(fp, " /* rep_low */ "); if (!rep_high && rep_low) - print_swizzle_helper_8(swizzle, true); + print_swizzle_helper_8(fp, swizzle, true); else - print_swizzle_helper_8(swizzle, false); + print_swizzle_helper_8(fp, swizzle, false); } else { - print_swizzle_helper_8(swizzle, rep_high & 1); - print_swizzle_helper_8(swizzle, !rep_low & 1); + print_swizzle_helper_8(fp, swizzle, rep_high & 1); + print_swizzle_helper_8(fp, swizzle, !(rep_low & 1)); } } static void -print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low) +print_swizzle_vec8(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low) { - printf("."); + fprintf(fp, "."); - print_swizzle_helper(swizzle, rep_high & 1); - print_swizzle_helper(swizzle, !rep_low & 1); + print_swizzle_helper(fp, swizzle, rep_high & 1); + print_swizzle_helper(fp, swizzle, !(rep_low & 1)); } static void -print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low) +print_swizzle_vec4(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low) { if (rep_high) - printf(" /* rep_high */ "); + fprintf(fp, " /* rep_high */ "); if (rep_low) - printf(" /* rep_low */ "); + fprintf(fp, " /* rep_low */ "); if (swizzle == 0xE4) return; /* xyzw */ - printf("."); - print_swizzle_helper(swizzle, 0); + fprintf(fp, "."); + print_swizzle_helper(fp, swizzle, 0); } static void -print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low) +print_swizzle_vec2(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low) { if (rep_high) - printf(" /* rep_high */ "); + fprintf(fp, " /* rep_high */ "); if (rep_low) - printf(" /* rep_low */ "); + fprintf(fp, " /* rep_low */ "); if (swizzle == 0xE4) return; /* XY */ - printf("."); + fprintf(fp, "."); for (unsigned i = 0; i < 4; i += 2) { unsigned a = (swizzle >> (i * 2)) & 3; @@ -260,13 +290,13 @@ * it ambiguous */ if (a & 0x1) - printf("[%c%c]", components[a], components[b]); + fprintf(fp, "[%c%c]", components[a], components[b]); else if (a == b) - printf("%c", components[a >> 1]); + fprintf(fp, "%c", components[a >> 1]); else if (b == (a + 1)) - printf("%c", "XY"[a >> 1]); + fprintf(fp, "%c", "XY"[a >> 1]); else - printf("[%c%c]", components[a], components[b]); + fprintf(fp, "[%c%c]", components[a], components[b]); } } @@ -300,7 +330,70 @@ } static void -print_vector_src(unsigned src_binary, +print_scalar_constant(FILE *fp, unsigned src_binary, + const midgard_constants *consts, + midgard_scalar_alu *alu) +{ + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + unsigned mod = 0; + + if (!midgard_is_integer_op(alu->op)) { + if (src->abs) + mod |= MIDGARD_FLOAT_MOD_ABS; + if (src->negate) + mod |= MIDGARD_FLOAT_MOD_NEG; + } else { + mod = midgard_int_normal; + } + + fprintf(fp, "#"); + mir_print_constant_component(fp, consts, src->component, + src->full ? + midgard_reg_mode_32 : midgard_reg_mode_16, + false, mod, alu->op); +} + +static void +print_vector_constants(FILE *fp, unsigned src_binary, + const midgard_constants *consts, + midgard_vector_alu *alu) +{ + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + unsigned bits = bits_for_mode_halved(alu->reg_mode, src->half); + unsigned max_comp = MIN2((sizeof(*consts) * 8) / bits, 8); + unsigned comp_mask, num_comp = 0; + + assert(consts); + + comp_mask = effective_writemask(alu, condense_writemask(alu->mask, bits)); + num_comp = util_bitcount(comp_mask); + + fprintf(fp, "#"); + if (num_comp > 1) + fprintf(fp, "vec%d(", num_comp); + + bool first = true; + + for (unsigned i = 0; i < max_comp; ++i) { + if (!(comp_mask & (1 << i))) continue; + + unsigned c = (src->swizzle >> (i * 2)) & 3; + + if (first) + first = false; + else + fprintf(fp, ", "); + + mir_print_constant_component(fp, consts, c, alu->reg_mode, + src->half, src->mod, alu->op); + } + + if (num_comp > 1) + fprintf(fp, ")"); +} + +static void +print_vector_src(FILE *fp, unsigned src_binary, midgard_reg_mode mode, unsigned reg, midgard_dest_override override, bool is_int) { @@ -311,36 +404,48 @@ midgard_int_mod int_mod = src->mod; if (is_int) { - printf("%s", srcmod_names_int[int_mod]); + fprintf(fp, "%s", srcmod_names_int[int_mod]); } else { if (src->mod & MIDGARD_FLOAT_MOD_NEG) - printf("-"); + fprintf(fp, "-"); if (src->mod & MIDGARD_FLOAT_MOD_ABS) - printf("abs("); + fprintf(fp, "abs("); } //register unsigned bits = bits_for_mode_halved(mode, src->half); - print_reg(reg, bits); + print_reg(fp, reg, bits); //swizzle - if (bits == 16) - print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low); - else if (bits == 8) - print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override); + if (bits == 16) { + /* When the mode of the instruction is itself 16-bit, + * rep_low/high work more or less as expected. But if the mode + * is 32-bit and we're stepping down, you only have vec4 and + * the meaning shifts to rep_low as higher-half and rep_high is + * never seen. TODO: are other modes similar? */ + + if (mode == midgard_reg_mode_32) { + fprintf(fp, "."); + print_swizzle_helper(fp, src->swizzle, src->rep_low); + assert(!src->rep_high); + } else { + print_swizzle_vec8(fp, src->swizzle, src->rep_high, src->rep_low); + } + } else if (bits == 8) + print_swizzle_vec16(fp, src->swizzle, src->rep_high, src->rep_low, override); else if (bits == 32) - print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low); + print_swizzle_vec4(fp, src->swizzle, src->rep_high, src->rep_low); else if (bits == 64) - print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low); + print_swizzle_vec2(fp, src->swizzle, src->rep_high, src->rep_low); /* Since we wrapped with a function-looking thing */ if (is_int && int_mod == midgard_int_shift) - printf(") << %d", bits); + fprintf(fp, ") << %u", bits); else if ((is_int && (int_mod != midgard_int_normal)) || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS)) - printf(")"); + fprintf(fp, ")"); } static uint16_t @@ -354,16 +459,28 @@ } static void -print_immediate(uint16_t imm) +print_immediate(FILE *fp, uint16_t imm) { if (is_instruction_int) - printf("#%d", imm); + fprintf(fp, "#%u", imm); else - printf("#%g", _mesa_half_to_float(imm)); + fprintf(fp, "#%g", _mesa_half_to_float(imm)); } -static unsigned -print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override) +static void +update_dest(unsigned reg) +{ + /* We should record writes as marking this as a work register. Store + * the max register in work_count; we'll add one at the end */ + + if (reg < 16) { + midg_stats.work_count = MAX2(reg, midg_stats.work_count); + midg_ever_written |= (1 << reg); + } +} + +static void +print_dest(FILE *fp, unsigned reg, midgard_reg_mode mode, midgard_dest_override override) { /* Depending on the mode and override, we determine the type of * destination addressed. Absent an override, we address just the @@ -374,30 +491,20 @@ if (override != midgard_dest_override_none) bits /= 2; - print_reg(reg, bits); - - return bits; + update_dest(reg); + print_reg(fp, reg, bits); } static void -print_mask_vec16(uint8_t mask, midgard_dest_override override) +print_mask_vec16(FILE *fp, uint8_t mask, midgard_dest_override override) { - printf("."); + fprintf(fp, "."); - if (override == midgard_dest_override_none) { - for (unsigned i = 0; i < 8; i++) { - if (mask & (1 << i)) - printf("%c%c", - components[i*2 + 0], - components[i*2 + 1]); - } - } else { - bool upper = (override == midgard_dest_override_upper); - - for (unsigned i = 0; i < 8; i++) { - if (mask & (1 << i)) - printf("%c", components[i + (upper ? 8 : 0)]); - } + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + fprintf(fp, "%c%c", + components[i*2 + 0], + components[i*2 + 1]); } } @@ -409,40 +516,43 @@ * the mask to make it obvious what happened */ static void -print_mask(uint8_t mask, unsigned bits, midgard_dest_override override) +print_mask(FILE *fp, uint8_t mask, unsigned bits, midgard_dest_override override) { - if (bits < 16) { - /* Shouldn't happen but with junk / out-of-spec shaders it - * would cause an infinite loop */ - - printf("/* XXX: bits = %d */", bits); - return; - } - if (bits == 8) { - print_mask_vec16(mask, override); + print_mask_vec16(fp, mask, override); return; } /* Skip 'complete' masks */ - if (bits >= 32 && mask == 0xFF) return; + if (override == midgard_dest_override_none) { + if (bits >= 32 && mask == 0xFF) return; - if (bits == 16) { - if (mask == 0x0F) - return; - else if (mask == 0xF0) { - printf("'"); - return; + if (bits == 16) { + if (mask == 0x0F) + return; + else if (mask == 0xF0) { + fprintf(fp, "'"); + return; + } } } - printf("."); + fprintf(fp, "."); unsigned skip = (bits / 16); bool uppercase = bits > 32; bool tripped = false; + /* To apply an upper destination override, we "shift" the alphabet. + * E.g. with an upper override on 32-bit, instead of xyzw, print efgh. + * For upper 16-bit, instead of xyzwefgh, print ijklmnop */ + + const char *alphabet = components; + + if (override == midgard_dest_override_upper) + alphabet += (128 / bits); + for (unsigned i = 0; i < 8; i += skip) { bool a = (mask & (1 << i)) != 0; @@ -452,39 +562,45 @@ } if (a) { - char c = components[i / skip]; + char c = alphabet[i / skip]; if (uppercase) c = toupper(c); - printf("%c", c); + fprintf(fp, "%c", c); } } if (tripped) - printf(" /* %X */", mask); + fprintf(fp, " /* %X */", mask); } /* Prints the 4-bit masks found in texture and load/store ops, as opposed to - * the 8-bit masks found in (vector) ALU ops */ + * the 8-bit masks found in (vector) ALU ops. Supports texture-style 16-bit + * mode as well, but not load/store-style 16-bit mode. */ static void -print_mask_4(unsigned mask) +print_mask_4(FILE *fp, unsigned mask, bool upper) { - if (mask == 0xF) return; + if (mask == 0xF) { + if (upper) + fprintf(fp, "'"); - printf("."); + return; + } + + fprintf(fp, "."); for (unsigned i = 0; i < 4; ++i) { bool a = (mask & (1 << i)) != 0; if (a) - printf("%c", components[i]); + fprintf(fp, "%c", components[i + (upper ? 4 : 0)]); } } static void -print_vector_field(const char *name, uint16_t *words, uint16_t reg_word, - unsigned tabs) +print_vector_field(FILE *fp, const char *name, uint16_t *words, uint16_t reg_word, + const midgard_constants *consts, unsigned tabs) { midgard_reg_info *reg_info = (midgard_reg_info *)®_word; midgard_vector_alu *alu_field = (midgard_vector_alu *) words; @@ -493,83 +609,76 @@ /* For now, prefix instruction names with their unit, until we * understand how this works on a deeper level */ - printf("%s.", name); + fprintf(fp, "%s.", name); - print_alu_opcode(alu_field->op); + print_alu_opcode(fp, alu_field->op); /* Postfix with the size to disambiguate if necessary */ char postfix = prefix_for_bits(bits_for_mode(mode)); bool size_ambiguous = override != midgard_dest_override_none; if (size_ambiguous) - printf("%c", postfix ? postfix : 'r'); + fprintf(fp, "%c", postfix ? postfix : 'r'); /* Print the outmod, if there is one */ - print_outmod(alu_field->outmod, + print_outmod(fp, alu_field->outmod, midgard_is_integer_out_op(alu_field->op)); - printf(" "); + fprintf(fp, " "); /* Mask denoting status of 8-lanes */ uint8_t mask = alu_field->mask; /* First, print the destination */ - unsigned dest_size = - print_dest(reg_info->out_reg, mode, alu_field->dest_override); - - /* Apply the destination override to the mask */ - - if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) { - if (override == midgard_dest_override_lower) - mask &= 0x0F; - else if (override == midgard_dest_override_upper) - mask &= 0xF0; - } else if (mode == midgard_reg_mode_16 - && override == midgard_dest_override_lower) { - /* stub */ - } + print_dest(fp, reg_info->out_reg, mode, alu_field->dest_override); if (override != midgard_dest_override_none) { bool modeable = (mode != midgard_reg_mode_8); bool known = override != 0x3; /* Unused value */ if (!(modeable && known)) - printf("/* do%d */ ", override); + fprintf(fp, "/* do%u */ ", override); } - print_mask(mask, dest_size, override); + print_mask(fp, mask, bits_for_mode(mode), override); - printf(", "); + fprintf(fp, ", "); bool is_int = midgard_is_integer_op(alu_field->op); - print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int); - printf(", "); + if (reg_info->src1_reg == 26) + print_vector_constants(fp, alu_field->src1, consts, alu_field); + else + print_vector_src(fp, alu_field->src1, mode, reg_info->src1_reg, override, is_int); + + fprintf(fp, ", "); if (reg_info->src2_imm) { uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); - print_immediate(imm); + print_immediate(fp, imm); + } else if (reg_info->src2_reg == 26) { + print_vector_constants(fp, alu_field->src2, consts, alu_field); } else { - print_vector_src(alu_field->src2, mode, + print_vector_src(fp, alu_field->src2, mode, reg_info->src2_reg, override, is_int); } - nr_ins++; - printf("\n"); + midg_stats.instruction_count++; + fprintf(fp, "\n"); } static void -print_scalar_src(unsigned src_binary, unsigned reg) +print_scalar_src(FILE *fp, unsigned src_binary, unsigned reg) { midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; if (src->negate) - printf("-"); + fprintf(fp, "-"); if (src->abs) - printf("abs("); + fprintf(fp, "abs("); - print_reg(reg, src->full ? 32 : 16); + print_reg(fp, reg, src->full ? 32 : 16); unsigned c = src->component; @@ -578,10 +687,10 @@ c >>= 1; } - printf(".%c", components[c]); + fprintf(fp, ".%c", components[c]); if (src->abs) - printf(")"); + fprintf(fp, ")"); } @@ -598,23 +707,24 @@ } static void -print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word, - unsigned tabs) +print_scalar_field(FILE *fp, const char *name, uint16_t *words, uint16_t reg_word, + const midgard_constants *consts, unsigned tabs) { midgard_reg_info *reg_info = (midgard_reg_info *)®_word; midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words; if (alu_field->unknown) - printf("scalar ALU unknown bit set\n"); + fprintf(fp, "scalar ALU unknown bit set\n"); - printf("%s.", name); - print_alu_opcode(alu_field->op); - print_outmod(alu_field->outmod, + fprintf(fp, "%s.", name); + print_alu_opcode(fp, alu_field->op); + print_outmod(fp, alu_field->outmod, midgard_is_integer_out_op(alu_field->op)); - printf(" "); + fprintf(fp, " "); bool full = alu_field->output_full; - print_reg(reg_info->out_reg, full ? 32 : 16); + update_dest(reg_info->out_reg); + print_reg(fp, reg_info->out_reg, full ? 32 : 16); unsigned c = alu_field->output_component; if (full) { @@ -622,101 +732,107 @@ c >>= 1; } - printf(".%c, ", components[c]); + fprintf(fp, ".%c, ", components[c]); - print_scalar_src(alu_field->src1, reg_info->src1_reg); + if (reg_info->src1_reg == 26) + print_scalar_constant(fp, alu_field->src1, consts, alu_field); + else + print_scalar_src(fp, alu_field->src1, reg_info->src1_reg); - printf(", "); + fprintf(fp, ", "); if (reg_info->src2_imm) { uint16_t imm = decode_scalar_imm(reg_info->src2_reg, alu_field->src2); - print_immediate(imm); + print_immediate(fp, imm); + } else if (reg_info->src2_reg == 26) { + print_scalar_constant(fp, alu_field->src2, consts, alu_field); } else - print_scalar_src(alu_field->src2, reg_info->src2_reg); + print_scalar_src(fp, alu_field->src2, reg_info->src2_reg); - nr_ins++; - printf("\n"); + midg_stats.instruction_count++; + fprintf(fp, "\n"); } static void -print_branch_op(int op) +print_branch_op(FILE *fp, unsigned op) { switch (op) { case midgard_jmp_writeout_op_branch_uncond: - printf("uncond."); + fprintf(fp, "uncond."); break; case midgard_jmp_writeout_op_branch_cond: - printf("cond."); + fprintf(fp, "cond."); break; case midgard_jmp_writeout_op_writeout: - printf("write."); + fprintf(fp, "write."); break; case midgard_jmp_writeout_op_tilebuffer_pending: - printf("tilebuffer."); + fprintf(fp, "tilebuffer."); break; case midgard_jmp_writeout_op_discard: - printf("discard."); + fprintf(fp, "discard."); break; default: - printf("unk%d.", op); + fprintf(fp, "unk%u.", op); break; } } static void -print_branch_cond(int cond) +print_branch_cond(FILE *fp, int cond) { switch (cond) { case midgard_condition_write0: - printf("write0"); + fprintf(fp, "write0"); break; case midgard_condition_false: - printf("false"); + fprintf(fp, "false"); break; case midgard_condition_true: - printf("true"); + fprintf(fp, "true"); break; case midgard_condition_always: - printf("always"); + fprintf(fp, "always"); break; default: - printf("unk%X", cond); + fprintf(fp, "unk%X", cond); break; } } -static void -print_compact_branch_writeout_field(uint16_t word) +static bool +print_compact_branch_writeout_field(FILE *fp, uint16_t word) { midgard_jmp_writeout_op op = word & 0x7; + midg_stats.instruction_count++; switch (op) { case midgard_jmp_writeout_op_branch_uncond: { midgard_branch_uncond br_uncond; memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond)); - printf("br.uncond "); + fprintf(fp, "br.uncond "); if (br_uncond.unknown != 1) - printf("unknown:%d, ", br_uncond.unknown); + fprintf(fp, "unknown:%u, ", br_uncond.unknown); if (br_uncond.offset >= 0) - printf("+"); + fprintf(fp, "+"); - printf("%d -> ", br_uncond.offset); - print_tag_short(br_uncond.dest_tag); - printf("\n"); + fprintf(fp, "%d -> ", br_uncond.offset); + print_tag_short(fp, br_uncond.dest_tag); + fprintf(fp, "\n"); - break; + return br_uncond.offset >= 0; } case midgard_jmp_writeout_op_branch_cond: @@ -726,36 +842,36 @@ midgard_branch_cond br_cond; memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond)); - printf("br."); + fprintf(fp, "br."); - print_branch_op(br_cond.op); - print_branch_cond(br_cond.cond); + print_branch_op(fp, br_cond.op); + print_branch_cond(fp, br_cond.cond); - printf(" "); + fprintf(fp, " "); if (br_cond.offset >= 0) - printf("+"); + fprintf(fp, "+"); - printf("%d -> ", br_cond.offset); - print_tag_short(br_cond.dest_tag); - printf("\n"); + fprintf(fp, "%d -> ", br_cond.offset); + print_tag_short(fp, br_cond.dest_tag); + fprintf(fp, "\n"); - break; + return br_cond.offset >= 0; } } - nr_ins++; + return false; } -static void -print_extended_branch_writeout_field(uint8_t *words) +static bool +print_extended_branch_writeout_field(FILE *fp, uint8_t *words, unsigned next) { midgard_branch_extended br; memcpy((char *) &br, (char *) words, sizeof(br)); - printf("brx."); + fprintf(fp, "brx."); - print_branch_op(br.op); + print_branch_op(fp, br.op); /* Condition codes are a LUT in the general case, but simply repeated 8 times for single-channel conditions.. Check this. */ @@ -766,23 +882,36 @@ } if (single_channel) - print_branch_cond(br.cond & 0x3); + print_branch_cond(fp, br.cond & 0x3); else - printf("lut%X", br.cond); + fprintf(fp, "lut%X", br.cond); if (br.unknown) - printf(".unknown%d", br.unknown); + fprintf(fp, ".unknown%u", br.unknown); - printf(" "); + fprintf(fp, " "); if (br.offset >= 0) - printf("+"); + fprintf(fp, "+"); + + fprintf(fp, "%d -> ", br.offset); + print_tag_short(fp, br.dest_tag); + fprintf(fp, "\n"); - printf("%d -> ", br.offset); - print_tag_short(br.dest_tag); - printf("\n"); + unsigned I = next + br.offset * 4; - nr_ins++; + if (midg_tags[I] && midg_tags[I] != br.dest_tag) { + fprintf(fp, "\t/* XXX TAG ERROR: jumping to "); + print_tag_short(fp, br.dest_tag); + fprintf(fp, " but tagged "); + print_tag_short(fp, midg_tags[I]); + fprintf(fp, " */\n"); + } + + midg_tags[I] = br.dest_tag; + + midg_stats.instruction_count++; + return br.offset >= 0; } static unsigned @@ -808,134 +937,109 @@ return ret; } -static float -float_bitcast(uint32_t integer) -{ - union { - uint32_t i; - float f; - } v; - - v.i = integer; - return v.f; -} - -static void -print_alu_word(uint32_t *words, unsigned num_quad_words, - unsigned tabs) +static bool +print_alu_word(FILE *fp, uint32_t *words, unsigned num_quad_words, + unsigned tabs, unsigned next) { uint32_t control_word = words[0]; uint16_t *beginning_ptr = (uint16_t *)(words + 1); unsigned num_fields = num_alu_fields_enabled(control_word); uint16_t *word_ptr = beginning_ptr + num_fields; unsigned num_words = 2 + num_fields; + const midgard_constants *consts = NULL; + bool branch_forward = false; + + if ((control_word >> 17) & 1) + num_words += 3; + + if ((control_word >> 19) & 1) + num_words += 2; + + if ((control_word >> 21) & 1) + num_words += 3; + + if ((control_word >> 23) & 1) + num_words += 2; + + if ((control_word >> 25) & 1) + num_words += 3; + + if ((control_word >> 26) & 1) + num_words += 1; + + if ((control_word >> 27) & 1) + num_words += 3; + + if (num_quad_words > (num_words + 7) / 8) { + assert(num_quad_words == (num_words + 15) / 8); + //Assume that the extra quadword is constants + consts = (midgard_constants *)(words + (4 * num_quad_words - 4)); + } if ((control_word >> 16) & 1) - printf("unknown bit 16 enabled\n"); + fprintf(fp, "unknown bit 16 enabled\n"); if ((control_word >> 17) & 1) { - print_vector_field("vmul", word_ptr, *beginning_ptr, tabs); + print_vector_field(fp, "vmul", word_ptr, *beginning_ptr, consts, tabs); beginning_ptr += 1; word_ptr += 3; - num_words += 3; } if ((control_word >> 18) & 1) - printf("unknown bit 18 enabled\n"); + fprintf(fp, "unknown bit 18 enabled\n"); if ((control_word >> 19) & 1) { - print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs); + print_scalar_field(fp, "sadd", word_ptr, *beginning_ptr, consts, tabs); beginning_ptr += 1; word_ptr += 2; - num_words += 2; } if ((control_word >> 20) & 1) - printf("unknown bit 20 enabled\n"); + fprintf(fp, "unknown bit 20 enabled\n"); if ((control_word >> 21) & 1) { - print_vector_field("vadd", word_ptr, *beginning_ptr, tabs); + print_vector_field(fp, "vadd", word_ptr, *beginning_ptr, consts, tabs); beginning_ptr += 1; word_ptr += 3; - num_words += 3; } if ((control_word >> 22) & 1) - printf("unknown bit 22 enabled\n"); + fprintf(fp, "unknown bit 22 enabled\n"); if ((control_word >> 23) & 1) { - print_scalar_field("smul", word_ptr, *beginning_ptr, tabs); + print_scalar_field(fp, "smul", word_ptr, *beginning_ptr, consts, tabs); beginning_ptr += 1; word_ptr += 2; - num_words += 2; } if ((control_word >> 24) & 1) - printf("unknown bit 24 enabled\n"); + fprintf(fp, "unknown bit 24 enabled\n"); if ((control_word >> 25) & 1) { - print_vector_field("lut", word_ptr, *beginning_ptr, tabs); - beginning_ptr += 1; + print_vector_field(fp, "lut", word_ptr, *beginning_ptr, consts, tabs); word_ptr += 3; - num_words += 3; } if ((control_word >> 26) & 1) { - print_compact_branch_writeout_field(*word_ptr); + branch_forward |= print_compact_branch_writeout_field(fp, *word_ptr); word_ptr += 1; - num_words += 1; } if ((control_word >> 27) & 1) { - print_extended_branch_writeout_field((uint8_t *) word_ptr); + branch_forward |= print_extended_branch_writeout_field(fp, (uint8_t *) word_ptr, next); word_ptr += 3; - num_words += 3; } - if (num_quad_words > (num_words + 7) / 8) { - assert(num_quad_words == (num_words + 15) / 8); - //Assume that the extra quadword is constants - void *consts = words + (4 * num_quad_words - 4); - - if (is_embedded_constant_int) { - if (is_embedded_constant_half) { - int16_t *sconsts = (int16_t *) consts; - printf("sconstants %d, %d, %d, %d\n", - sconsts[0], - sconsts[1], - sconsts[2], - sconsts[3]); - } else { - int32_t *iconsts = (int32_t *) consts; - printf("iconstants %d, %d, %d, %d\n", - iconsts[0], - iconsts[1], - iconsts[2], - iconsts[3]); - } - } else { - if (is_embedded_constant_half) { - uint16_t *hconsts = (uint16_t *) consts; - printf("hconstants %g, %g, %g, %g\n", - _mesa_half_to_float(hconsts[0]), - _mesa_half_to_float(hconsts[1]), - _mesa_half_to_float(hconsts[2]), - _mesa_half_to_float(hconsts[3])); - } else { - uint32_t *fconsts = (uint32_t *) consts; - printf("fconstants %g, %g, %g, %g\n", - float_bitcast(fconsts[0]), - float_bitcast(fconsts[1]), - float_bitcast(fconsts[2]), - float_bitcast(fconsts[3])); - } + if (consts) + fprintf(fp, "uconstants 0x%X, 0x%X, 0x%X, 0x%X\n", + consts->u32[0], consts->u32[1], + consts->u32[2], consts->u32[3]); - } - } + return branch_forward; } static void -print_varying_parameters(midgard_load_store_word *word) +print_varying_parameters(FILE *fp, midgard_load_store_word *word) { midgard_varying_parameter param; unsigned v = word->varying_parameters; @@ -944,29 +1048,29 @@ if (param.is_varying) { /* If a varying, there are qualifiers */ if (param.flat) - printf(".flat"); + fprintf(fp, ".flat"); if (param.interpolation != midgard_interp_default) { if (param.interpolation == midgard_interp_centroid) - printf(".centroid"); + fprintf(fp, ".centroid"); else - printf(".interp%d", param.interpolation); + fprintf(fp, ".interp%d", param.interpolation); } if (param.modifier != midgard_varying_mod_none) { if (param.modifier == midgard_varying_mod_perspective_w) - printf(".perspectivew"); + fprintf(fp, ".perspectivew"); else if (param.modifier == midgard_varying_mod_perspective_z) - printf(".perspectivez"); + fprintf(fp, ".perspectivez"); else - printf(".mod%d", param.modifier); + fprintf(fp, ".mod%d", param.modifier); } } else if (param.flat || param.interpolation || param.modifier) { - printf(" /* is_varying not set but varying metadata attached */"); + fprintf(fp, " /* is_varying not set but varying metadata attached */"); } if (param.zero0 || param.zero1 || param.zero2) - printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2); + fprintf(fp, " /* zero tripped, %u %u %u */ ", param.zero0, param.zero1, param.zero2); } static bool @@ -987,8 +1091,22 @@ return false; } +static bool +is_op_attribute(unsigned op) +{ + switch (op) { + case midgard_op_ld_attr_16: + case midgard_op_ld_attr_32: + case midgard_op_ld_attr_32i: + case midgard_op_ld_attr_32u: + return true; + } + + return false; +} + static void -print_load_store_arg(uint8_t arg, unsigned index) +print_load_store_arg(FILE *fp, uint8_t arg, unsigned index) { /* Try to interpret as a register */ midgard_ldst_register_select sel; @@ -998,41 +1116,64 @@ * interpret it. But if it's zero, we get it. */ if (sel.unknown) { - printf("0x%02X", arg); + fprintf(fp, "0x%02X", arg); return; } unsigned reg = REGISTER_LDST_BASE + sel.select; char comp = components[sel.component]; - printf("r%d.%c", reg, comp); + fprintf(fp, "r%u.%c", reg, comp); /* Only print a shift if it's non-zero. Shifts only make sense for the * second index. For the first, we're not sure what it means yet */ if (index == 1) { if (sel.shift) - printf(" << %d", sel.shift); + fprintf(fp, " << %u", sel.shift); } else { - printf(" /* %X */", sel.shift); + fprintf(fp, " /* %X */", sel.shift); } } static void -print_load_store_instr(uint64_t data, +update_stats(signed *stat, unsigned address) +{ + if (*stat >= 0) + *stat = MAX2(*stat, address + 1); +} + +static void +print_load_store_instr(FILE *fp, uint64_t data, unsigned tabs) { midgard_load_store_word *word = (midgard_load_store_word *) &data; - print_ld_st_opcode(word->op); + print_ld_st_opcode(fp, word->op); + + unsigned address = word->address; - if (is_op_varying(word->op)) - print_varying_parameters(word); + if (is_op_varying(word->op)) { + print_varying_parameters(fp, word); + + /* Do some analysis: check if direct cacess */ + + if ((word->arg_2 == 0x1E) && midg_stats.varying_count >= 0) + update_stats(&midg_stats.varying_count, address); + else + midg_stats.varying_count = -16; + } else if (is_op_attribute(word->op)) { + if ((word->arg_2 == 0x1E) && midg_stats.attribute_count >= 0) + update_stats(&midg_stats.attribute_count, address); + else + midg_stats.attribute_count = -16; + } - printf(" r%d", word->reg); - print_mask_4(word->mask); + fprintf(fp, " r%u", word->reg + (OP_IS_STORE(word->op) ? 26 : 0)); + print_mask_4(fp, word->mask, false); - int address = word->address; + if (!OP_IS_STORE(word->op)) + update_dest(word->reg); bool is_ubo = OP_IS_UBO_READ(word->op); @@ -1046,66 +1187,68 @@ address = (hi << 3) | lo; } - printf(", %d", address); + fprintf(fp, ", %u", address); - print_swizzle_vec4(word->swizzle, false, false); + print_swizzle_vec4(fp, word->swizzle, false, false); - printf(", "); + fprintf(fp, ", "); - if (is_ubo) - printf("ubo%d", word->arg_1); - else - print_load_store_arg(word->arg_1, 0); + if (is_ubo) { + fprintf(fp, "ubo%u", word->arg_1); + update_stats(&midg_stats.uniform_buffer_count, word->arg_1); + } else + print_load_store_arg(fp, word->arg_1, 0); - printf(", "); - print_load_store_arg(word->arg_2, 1); - printf(" /* %X */\n", word->varying_parameters); + fprintf(fp, ", "); + print_load_store_arg(fp, word->arg_2, 1); + fprintf(fp, " /* %X */\n", word->varying_parameters); - nr_ins++; + midg_stats.instruction_count++; } static void -print_load_store_word(uint32_t *word, unsigned tabs) +print_load_store_word(FILE *fp, uint32_t *word, unsigned tabs) { midgard_load_store *load_store = (midgard_load_store *) word; if (load_store->word1 != 3) { - print_load_store_instr(load_store->word1, tabs); + print_load_store_instr(fp, load_store->word1, tabs); } if (load_store->word2 != 3) { - print_load_store_instr(load_store->word2, tabs); + print_load_store_instr(fp, load_store->word2, tabs); } } static void -print_texture_reg(bool full, bool select, bool upper) +print_texture_reg_select(FILE *fp, uint8_t u, unsigned base) { - if (full) - printf("r%d", REG_TEX_BASE + select); - else - printf("hr%d", (REG_TEX_BASE + select) * 2 + upper); + midgard_tex_register_select sel; + memcpy(&sel, &u, sizeof(u)); - if (full && upper) - printf("// error: out full / upper mutually exclusive\n"); + if (!sel.full) + fprintf(fp, "h"); -} + fprintf(fp, "r%u", base + sel.select); -static void -print_texture_reg_triple(unsigned triple) -{ - bool full = triple & 1; - bool select = triple & 2; - bool upper = triple & 4; + unsigned component = sel.component; + + /* Use the upper half in half-reg mode */ + if (sel.upper) { + assert(!sel.full); + component += 4; + } + + fprintf(fp, ".%c", components[component]); - print_texture_reg(full, select, upper); + assert(sel.zero == 0); } static void -print_texture_format(int format) +print_texture_format(FILE *fp, int format) { /* Act like a modifier */ - printf("."); + fprintf(fp, "."); switch (format) { DEFINE_CASE(MALI_TEX_1D, "1d"); @@ -1118,21 +1261,37 @@ } } +static bool +midgard_op_has_helpers(unsigned op, bool gather) +{ + if (gather) + return true; + + switch (op) { + case TEXTURE_OP_NORMAL: + case TEXTURE_OP_DFDX: + case TEXTURE_OP_DFDY: + return true; + default: + return false; + } +} + static void -print_texture_op(unsigned op, bool gather) +print_texture_op(FILE *fp, unsigned op, bool gather) { /* Act like a bare name, like ESSL functions */ if (gather) { - printf("textureGather"); + fprintf(fp, "textureGather"); unsigned component = op >> 4; unsigned bottom = op & 0xF; if (bottom != 0x2) - printf("_unk%d", bottom); + fprintf(fp, "_unk%u", bottom); - printf(".%c", components[component]); + fprintf(fp, ".%c", components[component]); return; } @@ -1140,11 +1299,12 @@ DEFINE_CASE(TEXTURE_OP_NORMAL, "texture"); DEFINE_CASE(TEXTURE_OP_LOD, "textureLod"); DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch"); + DEFINE_CASE(TEXTURE_OP_BARRIER, "barrier"); DEFINE_CASE(TEXTURE_OP_DFDX, "dFdx"); DEFINE_CASE(TEXTURE_OP_DFDY, "dFdy"); default: - printf("tex_%X", op); + fprintf(fp, "tex_%X", op); break; } } @@ -1171,51 +1331,119 @@ } +static void +print_texture_barrier(FILE *fp, uint32_t *word) +{ + midgard_texture_barrier_word *barrier = (midgard_texture_barrier_word *) word; + + if (barrier->type != 0x4) + fprintf(fp, "/* barrier tag %X != 0x4 */ ", barrier->type); + + if (!barrier->cont) + fprintf(fp, "/* cont missing? */"); + + if (!barrier->last) + fprintf(fp, "/* last missing? */"); + + if (barrier->zero1) + fprintf(fp, "/* zero1 = 0x%X */ ", barrier->zero1); + + if (barrier->zero2) + fprintf(fp, "/* zero2 = 0x%X */ ", barrier->zero2); + + if (barrier->zero3) + fprintf(fp, "/* zero3 = 0x%X */ ", barrier->zero3); + + if (barrier->zero4) + fprintf(fp, "/* zero4 = 0x%X */ ", barrier->zero4); + + if (barrier->zero5) + fprintf(fp, "/* zero4 = 0x%" PRIx64 " */ ", barrier->zero5); + + fprintf(fp, " 0x%X\n", barrier->unknown4); +} + #undef DEFINE_CASE static void -print_texture_word(uint32_t *word, unsigned tabs) +print_texture_word(FILE *fp, uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned out_reg_base) { midgard_texture_word *texture = (midgard_texture_word *) word; + midg_stats.helper_invocations |= + midgard_op_has_helpers(texture->op, texture->is_gather); + /* Broad category of texture operation in question */ - print_texture_op(texture->op, texture->is_gather); + print_texture_op(fp, texture->op, texture->is_gather); - /* Specific format in question */ - print_texture_format(texture->format); + /* Barriers use a dramatically different code path */ + if (texture->op == TEXTURE_OP_BARRIER) { + print_texture_barrier(fp, word); + return; + } else if (texture->type == 0x4) + fprintf (fp, "/* nonbarrier had tag 0x4 */ "); - assert(texture->zero == 0); + /* Specific format in question */ + print_texture_format(fp, texture->format); /* Instruction "modifiers" parallel the ALU instructions. */ if (texture->shadow) - printf(".shadow"); + fprintf(fp, ".shadow"); if (texture->cont) - printf(".cont"); + fprintf(fp, ".cont"); if (texture->last) - printf(".last"); + fprintf(fp, ".last"); /* Output modifiers are always interpreted floatly */ - print_outmod(texture->outmod, false); + print_outmod(fp, texture->outmod, false); - printf(" "); + fprintf(fp, " %sr%u", texture->out_full ? "" : "h", + out_reg_base + texture->out_reg_select); + print_mask_4(fp, texture->mask, texture->out_upper); + assert(!(texture->out_full && texture->out_upper)); + fprintf(fp, ", "); + + /* Depending on whether we read from textures directly or indirectly, + * we may be able to update our analysis */ + + if (texture->texture_register) { + fprintf(fp, "texture["); + print_texture_reg_select(fp, texture->texture_handle, in_reg_base); + fprintf(fp, "], "); - print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper); - print_mask_4(texture->mask); - printf(", "); - - printf("texture%d, ", texture->texture_handle); + /* Indirect, tut tut */ + midg_stats.texture_count = -16; + } else { + fprintf(fp, "texture%u, ", texture->texture_handle); + update_stats(&midg_stats.texture_count, texture->texture_handle); + } /* Print the type, GL style */ - printf("%c", sampler_type_name(texture->sampler_type)); - printf("sampler%d", texture->sampler_handle); - print_swizzle_vec4(texture->swizzle, false, false); - printf(", "); + fprintf(fp, "%csampler", sampler_type_name(texture->sampler_type)); + + if (texture->sampler_register) { + fprintf(fp, "["); + print_texture_reg_select(fp, texture->sampler_handle, in_reg_base); + fprintf(fp, "]"); + + midg_stats.sampler_count = -16; + } else { + fprintf(fp, "%u", texture->sampler_handle); + update_stats(&midg_stats.sampler_count, texture->sampler_handle); + } - print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper); - print_swizzle_vec4(texture->in_reg_swizzle, false, false); + print_swizzle_vec4(fp, texture->swizzle, false, false); + fprintf(fp, ", %sr%u", texture->in_reg_full ? "" : "h", in_reg_base + texture->in_reg_select); + assert(!(texture->in_reg_full && texture->in_reg_upper)); + + /* TODO: integrate with swizzle */ + if (texture->in_reg_upper) + fprintf(fp, "'"); + + print_swizzle_vec4(fp, texture->in_reg_swizzle, false, false); /* There is *always* an offset attached. Of * course, that offset is just immediate #0 for a @@ -1228,73 +1456,62 @@ * fields become register triplets */ if (texture->offset_register) { - printf(" + "); - print_texture_reg_triple(texture->offset_x); - - /* The less questions you ask, the better. */ + fprintf(fp, " + "); - unsigned swizzle_lo, swizzle_hi; - unsigned orig_y = texture->offset_y; - unsigned orig_z = texture->offset_z; + bool full = texture->offset & 1; + bool select = texture->offset & 2; + bool upper = texture->offset & 4; - memcpy(&swizzle_lo, &orig_y, sizeof(unsigned)); - memcpy(&swizzle_hi, &orig_z, sizeof(unsigned)); + fprintf(fp, "%sr%u", full ? "" : "h", in_reg_base + select); + assert(!(texture->out_full && texture->out_upper)); - /* Duplicate hi swizzle over */ - assert(swizzle_hi < 4); - swizzle_hi = (swizzle_hi << 2) | swizzle_hi; + /* TODO: integrate with swizzle */ + if (upper) + fprintf(fp, "'"); - unsigned swiz = (swizzle_lo << 4) | swizzle_hi; - unsigned reversed = util_bitreverse(swiz) >> 24; - print_swizzle_vec4(reversed, false, false); + print_swizzle_vec4(fp, texture->offset >> 3, false, false); - printf(", "); - } else if (texture->offset_x || texture->offset_y || texture->offset_z) { + fprintf(fp, ", "); + } else if (texture->offset) { /* Only select ops allow negative immediate offsets, verify */ - bool neg_x = texture->offset_x < 0; - bool neg_y = texture->offset_y < 0; - bool neg_z = texture->offset_z < 0; + signed offset_x = (texture->offset & 0xF); + signed offset_y = ((texture->offset >> 4) & 0xF); + signed offset_z = ((texture->offset >> 8) & 0xF); + + bool neg_x = offset_x < 0; + bool neg_y = offset_y < 0; + bool neg_z = offset_z < 0; bool any_neg = neg_x || neg_y || neg_z; if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH) - printf("/* invalid negative */ "); + fprintf(fp, "/* invalid negative */ "); /* Regardless, just print the immediate offset */ - printf(" + <%d, %d, %d>, ", - texture->offset_x, - texture->offset_y, - texture->offset_z); + fprintf(fp, " + <%d, %d, %d>, ", offset_x, offset_y, offset_z); } else { - printf(", "); + fprintf(fp, ", "); } char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '='; if (texture->lod_register) { - midgard_tex_register_select sel; - uint8_t raw = texture->bias; - memcpy(&sel, &raw, sizeof(raw)); - - printf("lod %c ", lod_operand); - print_texture_reg(sel.full, sel.select, sel.upper); - printf(".%c, ", components[sel.component]); + fprintf(fp, "lod %c ", lod_operand); + print_texture_reg_select(fp, texture->bias, in_reg_base); + fprintf(fp, ", "); if (texture->bias_int) - printf(" /* bias_int = 0x%X */", texture->bias_int); - - if (sel.zero) - printf(" /* sel.zero = 0x%X */", sel.zero); + fprintf(fp, " /* bias_int = 0x%X */", texture->bias_int); } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) { /* For texel fetch, the int LOD is in the fractional place and * there is no fraction / possibility of bias. We *always* have * an explicit LOD, even if it's zero. */ if (texture->bias_int) - printf(" /* bias_int = 0x%X */ ", texture->bias_int); + fprintf(fp, " /* bias_int = 0x%X */ ", texture->bias_int); - printf("lod = %d, ", texture->bias); + fprintf(fp, "lod = %u, ", texture->bias); } else if (texture->bias || texture->bias_int) { signed bias_int = texture->bias_int; float bias_frac = texture->bias / 256.0f; @@ -1304,10 +1521,10 @@ char sign = (bias >= 0.0) ? '+' : '-'; char operand = is_bias ? sign : '='; - printf("lod %c %f, ", operand, fabsf(bias)); + fprintf(fp, "lod %c %f, ", operand, fabsf(bias)); } - printf("\n"); + fprintf(fp, "\n"); /* While not zero in general, for these simple instructions the * following unknowns are zero, so we don't include them */ @@ -1315,45 +1532,57 @@ if (texture->unknown4 || texture->unknownA || texture->unknown8) { - printf("// unknown4 = 0x%x\n", texture->unknown4); - printf("// unknownA = 0x%x\n", texture->unknownA); - printf("// unknown8 = 0x%x\n", texture->unknown8); + fprintf(fp, "// unknown4 = 0x%x\n", texture->unknown4); + fprintf(fp, "// unknownA = 0x%x\n", texture->unknownA); + fprintf(fp, "// unknown8 = 0x%x\n", texture->unknown8); } - nr_ins++; + midg_stats.instruction_count++; } -void -disassemble_midgard(uint8_t *code, size_t size, bool stats, unsigned nr_registers, const char *prefix) +struct midgard_disasm_stats +disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, gl_shader_stage stage) { uint32_t *words = (uint32_t *) code; unsigned num_words = size / 4; int tabs = 0; - bool prefetch_flag = false; + bool branch_forward = false; int last_next_tag = -1; unsigned i = 0; + midg_tags = calloc(sizeof(midg_tags[0]), num_words); + /* Stats for shader-db */ - unsigned nr_bundles = 0; - unsigned nr_quadwords = 0; - nr_ins = 0; + memset(&midg_stats, 0, sizeof(midg_stats)); + midg_ever_written = 0; while (i < num_words) { unsigned tag = words[i] & 0xF; unsigned next_tag = (words[i] >> 4) & 0xF; + fprintf(fp, "\t%X -> %X\n", tag, next_tag); unsigned num_quad_words = midgard_word_size[tag]; + if (midg_tags[i] && midg_tags[i] != tag) { + fprintf(fp, "\t/* XXX: TAG ERROR branch, got "); + print_tag_short(fp, tag); + fprintf(fp, " expected "); + print_tag_short(fp, midg_tags[i]); + fprintf(fp, " */\n"); + } + + midg_tags[i] = tag; + /* Check the tag */ if (last_next_tag > 1) { if (last_next_tag != tag) { - printf("/* TAG ERROR got "); - print_tag_short(tag); - printf(" expected "); - print_tag_short(last_next_tag); - printf(" */ "); + fprintf(fp, "\t/* XXX: TAG ERROR sequence, got "); + print_tag_short(fp, tag); + fprintf(fp, " expected "); + print_tag_short(fp, last_next_tag); + fprintf(fp, " */\n"); } } else { /* TODO: Check ALU case */ @@ -1362,16 +1591,22 @@ last_next_tag = next_tag; switch (midgard_word_types[tag]) { - case midgard_word_type_texture: - print_texture_word(&words[i], tabs); + case midgard_word_type_texture: { + bool interpipe_aliasing = + midgard_get_quirks(gpu_id) & MIDGARD_INTERPIPE_REG_ALIASING; + + print_texture_word(fp, &words[i], tabs, + interpipe_aliasing ? 0 : REG_TEX_BASE, + interpipe_aliasing ? REGISTER_LDST_BASE : REG_TEX_BASE); break; + } case midgard_word_type_load_store: - print_load_store_word(&words[i], tabs); + print_load_store_word(fp, &words[i], tabs); break; case midgard_word_type_alu: - print_alu_word(&words[i], num_quad_words, tabs); + branch_forward = print_alu_word(fp, &words[i], num_quad_words, tabs, i + 4*num_quad_words); /* Reset word static analysis state */ is_embedded_constant_half = false; @@ -1380,48 +1615,36 @@ break; default: - printf("Unknown word type %u:\n", words[i] & 0xF); + fprintf(fp, "Unknown word type %u:\n", words[i] & 0xF); num_quad_words = 1; - print_quad_word(&words[i], tabs); - printf("\n"); + print_quad_word(fp, &words[i], tabs); + fprintf(fp, "\n"); break; } - if (prefetch_flag && midgard_word_types[tag] == midgard_word_type_alu) - break; - - printf("\n"); - - unsigned next = (words[i] & 0xF0) >> 4; + /* We are parsing per bundle anyway. Add before we start + * breaking out so we don't miss the final bundle. */ - /* We are parsing per bundle anyway */ - nr_bundles++; - nr_quadwords += num_quad_words; + midg_stats.bundle_count++; + midg_stats.quadword_count += num_quad_words; - /* Break based on instruction prefetch flag */ + fprintf(fp, "\n"); - if (i < num_words && next == 1) { - prefetch_flag = true; + unsigned next = (words[i] & 0xF0) >> 4; - if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu) - break; - } + if (i < num_words && next == 1 && !branch_forward) + break; i += 4 * num_quad_words; } - if (stats) { - unsigned nr_threads = - (nr_registers <= 4) ? 4 : - (nr_registers <= 8) ? 2 : - 1; - - printf("%s" - "%u inst, %u bundles, %u quadwords, " - "%u registers, %u threads, 0 loops\n", - prefix, - nr_ins, nr_bundles, nr_quadwords, - nr_registers, nr_threads); + free(midg_tags); - } + /* We computed work_count as max_work_registers, so add one to get the + * count. If no work registers are written, you still have one work + * reported, which is exactly what the hardware expects */ + + midg_stats.work_count++; + + return midg_stats; } diff -Nru mesa-19.2.8/src/panfrost/midgard/disassemble.h mesa-20.0.8/src/panfrost/midgard/disassemble.h --- mesa-19.2.8/src/panfrost/midgard/disassemble.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/disassemble.h 2020-06-12 01:21:18.000000000 +0000 @@ -1,2 +1,27 @@ #include -void disassemble_midgard(uint8_t *code, size_t size, bool stats, unsigned regs, const char *prefix); +#include "compiler/shader_enums.h" + +struct midgard_disasm_stats { + /* Counts gleaned from disassembly, or negative if the field cannot be + * inferred, for instance due to indirect access. If negative, the abs + * is the upper limit for the count. */ + + signed texture_count; + signed sampler_count; + signed attribute_count; + signed varying_count; + signed uniform_count; + signed uniform_buffer_count; + signed work_count; + + /* These are pseudometrics for shader-db */ + unsigned instruction_count; + unsigned bundle_count; + unsigned quadword_count; + + /* Should we enable helper invocations? */ + bool helper_invocations; +}; + +struct midgard_disasm_stats +disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, gl_shader_stage stage); diff -Nru mesa-19.2.8/src/panfrost/midgard/helpers.h mesa-20.0.8/src/panfrost/midgard/helpers.h --- mesa-19.2.8/src/panfrost/midgard/helpers.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/helpers.h 2020-06-12 01:21:18.000000000 +0000 @@ -23,6 +23,7 @@ #define __MDG_HELPERS_H #include "util/macros.h" +#include #include #define OP_IS_LOAD_VARY_F(op) (\ @@ -30,26 +31,6 @@ op == midgard_op_ld_vary_32 \ ) -#define OP_IS_STORE_VARY(op) (\ - op == midgard_op_st_vary_16 || \ - op == midgard_op_st_vary_32 || \ - op == midgard_op_st_vary_32u || \ - op == midgard_op_st_vary_32i \ - ) - -#define OP_IS_STORE_R26(op) (\ - OP_IS_STORE_VARY(op) || \ - op == midgard_op_st_char || \ - op == midgard_op_st_char2 || \ - op == midgard_op_st_char4 || \ - op == midgard_op_st_short4 || \ - op == midgard_op_st_int4 \ - ) - -#define OP_IS_STORE(op) (\ - OP_IS_STORE_R26(op) \ - ) - #define OP_IS_PROJECTION(op) ( \ op == midgard_op_ldst_perspective_division_z || \ op == midgard_op_ldst_perspective_division_w \ @@ -73,10 +54,14 @@ op == midgard_op_ld_ubo_int4 \ ) +#define OP_IS_CSEL_V(op) ( \ + op == midgard_alu_op_icsel_v || \ + op == midgard_alu_op_fcsel_v \ + ) + #define OP_IS_CSEL(op) ( \ + OP_IS_CSEL_V(op) || \ op == midgard_alu_op_icsel || \ - op == midgard_alu_op_icsel_v || \ - op == midgard_alu_op_fcsel_v || \ op == midgard_alu_op_fcsel \ ) @@ -85,6 +70,19 @@ op == TEXTURE_OP_DFDY \ ) +#define OP_IS_UNSIGNED_CMP(op) ( \ + op == midgard_alu_op_ult || \ + op == midgard_alu_op_ule \ + ) + +#define OP_IS_INTEGER_CMP(op) ( \ + op == midgard_alu_op_ieq || \ + op == midgard_alu_op_ine || \ + op == midgard_alu_op_ilt || \ + op == midgard_alu_op_ile || \ + OP_IS_UNSIGNED_CMP(op) \ + ) + /* ALU control words are single bit fields with a lot of space */ #define ALU_ENAB_VEC_MUL (1 << 17) @@ -139,28 +137,7 @@ #define TAG_ALU_12 0xA #define TAG_ALU_16 0xB -static inline int -quadword_size(int tag) -{ - switch (tag) { - case TAG_ALU_4: - case TAG_LOAD_STORE_4: - case TAG_TEXTURE_4: - case TAG_TEXTURE_4_VTX: - return 1; - case TAG_ALU_8: - return 2; - case TAG_ALU_12: - return 3; - case TAG_ALU_16: - return 4; - default: - unreachable("Unknown tag"); - } -} - -#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ - tag == TAG_ALU_12 || tag == TAG_ALU_16) +#define IS_ALU(tag) (tag >= TAG_ALU_4) /* Special register aliases */ @@ -175,52 +152,30 @@ #define REGISTER_TEXTURE_BASE 28 #define REGISTER_SELECT 31 -/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline - * constant. UNUSED_1 encoded as REGISTER_UNUSED */ - -#define SSA_UNUSED_0 0 -#define SSA_UNUSED_1 -2 +/* SSA helper aliases to mimic the registers. */ #define SSA_FIXED_SHIFT 24 #define SSA_FIXED_REGISTER(reg) (((1 + (reg)) << SSA_FIXED_SHIFT) | 1) #define SSA_REG_FROM_FIXED(reg) ((((reg) & ~1) >> SSA_FIXED_SHIFT) - 1) #define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) -/* Swizzle support */ - -#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0)) -#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3]) #define COMPONENT_X 0x0 #define COMPONENT_Y 0x1 #define COMPONENT_Z 0x2 #define COMPONENT_W 0x3 -#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X) -#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X) -#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X) -#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) -#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z) -#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z) -#define SWIZZLE_XXXY SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_Y) -#define SWIZZLE_ZZZW SWIZZLE(COMPONENT_Z, COMPONENT_Z, COMPONENT_Z, COMPONENT_W) -#define SWIZZLE_ZWWW SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_W, COMPONENT_W) -#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W) +#define SWIZZLE_IDENTITY { \ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, \ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } \ +} -static inline unsigned -swizzle_of(unsigned comp) -{ - switch (comp) { - case 1: - return SWIZZLE_XXXX; - case 2: - return SWIZZLE_XYXX; - case 3: - return SWIZZLE_XYZX; - case 4: - return SWIZZLE_XYZW; - default: - unreachable("Invalid component count"); - } +#define SWIZZLE_IDENTITY_4 { \ + { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ + { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ + { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ + { 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, \ } static inline unsigned @@ -229,7 +184,6 @@ return (1 << nr_comp) - 1; } - /* See ISA notes */ #define LDST_NOP (3) @@ -261,24 +215,68 @@ unsigned props; }; +/* For load/store */ + +struct mir_ldst_op_props { + const char *name; + unsigned props; +}; + +/* Lower 2-bits are a midgard_reg_mode */ +#define GET_LDST_SIZE(c) (c & 3) + +/* Store (so the primary register is a source, not a destination */ +#define LDST_STORE (1 << 2) + +/* Mask has special meaning and should not be manipulated directly */ +#define LDST_SPECIAL_MASK (1 << 3) + +/* Non-store operation has side effects and should not be eliminated even if + * its mask is 0 */ +#define LDST_SIDE_FX (1 << 4) + /* This file is common, so don't define the tables themselves. #include * midgard_op.h if you need that, or edit midgard_ops.c directly */ -/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format, - * which is used for 32-bit vector units */ +/* Duplicate bits to convert a per-component to duplicated 8-bit format, + * which is used for vector units */ static inline unsigned -expand_writemask_32(unsigned mask) +expand_writemask(unsigned mask, unsigned channels) { unsigned o = 0; + unsigned factor = 8 / channels; + unsigned expanded = (1 << factor) - 1; - for (int i = 0; i < 4; ++i) + for (unsigned i = 0; i < channels; ++i) if (mask & (1 << i)) - o |= (3 << (2 * i)); + o |= (expanded << (factor * i)); return o; } +/* Tansform an expanded writemask (duplicated 8-bit format) into its condensed + * form (one bit per component) */ + +static inline unsigned +condense_writemask(unsigned expanded_mask, + unsigned bits_per_component) +{ + if (bits_per_component == 8) + unreachable("XXX TODO: sort out how 8-bit constant encoding works"); + + unsigned slots_per_component = bits_per_component / 16; + unsigned max_comp = (16 * 8) / bits_per_component; + unsigned condensed_mask = 0; + + for (unsigned i = 0; i < max_comp; i++) { + if (expanded_mask & (1 << (i * slots_per_component))) + condensed_mask |= (1 << i); + } + + return condensed_mask; +} + /* Coerce structs to integer */ static inline unsigned @@ -297,44 +295,26 @@ return s; } -/* Composes two swizzles */ -static inline unsigned -pan_compose_swizzle(unsigned left, unsigned right) +static inline void +mir_compose_swizzle(unsigned *left, unsigned *right, unsigned *final_out) { - unsigned out = 0; - - for (unsigned c = 0; c < 4; ++c) { - unsigned s = (left >> (2*c)) & 0x3; - unsigned q = (right >> (2*s)) & 0x3; + unsigned out[16]; - out |= (q << (2*c)); - } + for (unsigned c = 0; c < 16; ++c) + out[c] = right[left[c]]; - return out; -} - -/* Applies a swizzle to an ALU source */ - -static inline unsigned -vector_alu_apply_swizzle(unsigned src, unsigned swizzle) -{ - midgard_vector_alu_src s = - vector_alu_from_unsigned(src); - - s.swizzle = pan_compose_swizzle(s.swizzle, swizzle); - - return vector_alu_srco_unsigned(s); + memcpy(final_out, out, sizeof(out)); } /* Checks for an xyzw.. swizzle, given a mask */ static inline bool -mir_is_simple_swizzle(unsigned swizzle, unsigned mask) +mir_is_simple_swizzle(unsigned *swizzle, unsigned mask) { for (unsigned i = 0; i < 16; ++i) { if (!(mask & (1 << i))) continue; - if (((swizzle >> (2 * i)) & 0x3) != i) + if (swizzle[i] != i) return false; } @@ -359,41 +339,15 @@ return packed; } -/* Unpacks a load/store argument */ - -static inline midgard_ldst_register_select -midgard_ldst_select(uint8_t u) -{ - midgard_ldst_register_select sel; - memcpy(&sel, &u, sizeof(u)); - return sel; -} - -static inline uint8_t -midgard_ldst_pack(midgard_ldst_register_select sel) -{ - uint8_t packed; - memcpy(&packed, &sel, sizeof(packed)); - return packed; -} - -/* Gets a swizzle like yyyy and returns y */ - -static inline unsigned -swizzle_to_component(unsigned swizzle) +static inline bool +midgard_is_branch_unit(unsigned unit) { - unsigned c = swizzle & 3; - assert(((swizzle >> 2) & 3) == c); - assert(((swizzle >> 4) & 3) == c); - assert(((swizzle >> 6) & 3) == c); - return c; + return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); } - -static inline unsigned -component_to_swizzle(unsigned c) -{ - return SWIZZLE(c, c, c, c); -} +void +mir_print_constant_component(FILE *fp, const midgard_constants *consts, + unsigned c, midgard_reg_mode reg_mode, bool half, + unsigned mod, midgard_alu_op op); #endif diff -Nru mesa-19.2.8/src/panfrost/midgard/lcra.c mesa-20.0.8/src/panfrost/midgard/lcra.c --- mesa-19.2.8/src/panfrost/midgard/lcra.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/lcra.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#include +#include +#include +#include +#include +#include "util/macros.h" +#include "util/u_math.h" +#include "lcra.h" + +/* This module is the reference implementation of "Linearly Constrained + * Register Allocation". The paper is available in PDF form + * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX + * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md) + */ + +struct lcra_state * +lcra_alloc_equations( + unsigned node_count, + unsigned min_alignment, unsigned max_alignment, + unsigned bound, unsigned class_count) +{ + struct lcra_state *l = calloc(1, sizeof(*l)); + + l->node_count = node_count; + l->class_count = class_count; + l->bound = bound; + + l->alignment = calloc(sizeof(l->alignment[0]), node_count); + l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); + l->modulus = calloc(sizeof(l->modulus[0]), node_count); + l->class = calloc(sizeof(l->class[0]), node_count); + l->class_start = calloc(sizeof(l->class_start[0]), class_count); + l->class_disjoint = calloc(sizeof(l->class_disjoint[0]), class_count * class_count); + l->class_size = calloc(sizeof(l->class_size[0]), class_count); + l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); + l->solutions = calloc(sizeof(l->solutions[0]), node_count); + + memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); + + return l; +} + +void +lcra_free(struct lcra_state *l) +{ + if (!l) + return; + + free(l->alignment); + free(l->linear); + free(l->modulus); + free(l->class); + free(l->class_start); + free(l->class_disjoint); + free(l->class_size); + free(l->spill_cost); + free(l->solutions); + + free(l); +} + +void +lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2) +{ + l->alignment[node] = align_log2 + 1; +} + +void +lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2) +{ + l->class_disjoint[(c1 * l->class_count) + c2] = true; + l->class_disjoint[(c2 * l->class_count) + c1] = true; +} + +void +lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len) +{ + if (node < l->node_count && l->alignment[node]) + l->modulus[node] = DIV_ROUND_UP(l->bound - len + 1, 1 << (l->alignment[node] - 1)); +} + +void +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) +{ + if (i == j) + return; + + if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) + return; + + uint32_t constraint_fw = 0; + uint32_t constraint_bw = 0; + + for (unsigned D = 0; D < 16; ++D) { + if (cmask_i & (cmask_j << D)) { + constraint_bw |= (1 << (15 + D)); + constraint_fw |= (1 << (15 - D)); + } + + if (cmask_i & (cmask_j >> D)) { + constraint_fw |= (1 << (15 + D)); + constraint_bw |= (1 << (15 - D)); + } + } + + l->linear[j * l->node_count + i] |= constraint_fw; + l->linear[i * l->node_count + j] |= constraint_bw; +} + +static bool +lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) +{ + unsigned *row = &l->linear[i * l->node_count]; + signed constant = solutions[i]; + + for (unsigned j = 0; j < l->node_count; ++j) { + if (solutions[j] == ~0) continue; + + signed lhs = solutions[j] - constant; + + if (lhs < -15 || lhs > 15) + continue; + + if (row[j] & (1 << (lhs + 15))) + return false; + } + + return true; +} + +bool +lcra_solve(struct lcra_state *l) +{ + for (unsigned step = 0; step < l->node_count; ++step) { + if (l->solutions[step] != ~0) continue; + if (l->alignment[step] == 0) continue; + + unsigned _class = l->class[step]; + unsigned class_start = l->class_start[_class]; + + unsigned shift = l->alignment[step] - 1; + + unsigned P = l->bound >> shift; + unsigned Q = l->modulus[step]; + unsigned r_max = l->class_size[_class]; + unsigned k_max = r_max >> shift; + unsigned m_max = k_max / P; + bool succ = false; + + for (unsigned m = 0; m < m_max; ++m) { + for (unsigned n = 0; n < Q; ++n) { + l->solutions[step] = ((m * P + n) << shift) + class_start; + succ = lcra_test_linear(l, l->solutions, step); + + if (succ) break; + } + + if (succ) break; + } + + /* Out of registers - prepare to spill */ + if (!succ) { + l->spill_class = l->class[step]; + return false; + } + } + + return true; +} + +/* Register spilling is implemented with a cost-benefit system. Costs are set + * by the user. Benefits are calculated from the constraints. */ + +void +lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost) +{ + if (node < l->node_count) + l->spill_cost[node] = cost; +} + +/* Count along the lower triangle */ + +static unsigned +lcra_count_constraints(struct lcra_state *l, unsigned i) +{ + unsigned count = 0; + unsigned *constraints = &l->linear[i * l->node_count]; + + for (unsigned j = 0; j < i; ++j) + count += util_bitcount(constraints[j]); + + return count; +} + +signed +lcra_get_best_spill_node(struct lcra_state *l) +{ + float best_benefit = -1.0; + signed best_node = -1; + + for (unsigned i = 0; i < l->node_count; ++i) { + /* Find spillable nodes */ + if (l->class[i] != l->spill_class) continue; + if (l->spill_cost[i] < 0) continue; + + /* Adapted from Chaitin's heuristic */ + float constraints = lcra_count_constraints(l, i); + float cost = (l->spill_cost[i] + 1); + float benefit = constraints / cost; + + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } + + return best_node; +} diff -Nru mesa-19.2.8/src/panfrost/midgard/lcra.h mesa-20.0.8/src/panfrost/midgard/lcra.h --- mesa-19.2.8/src/panfrost/midgard/lcra.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/lcra.h 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#ifndef __LCRA_H +#define __LCRA_H + +#include +#include + +struct lcra_state { + unsigned node_count; + + /* Word boundary where vectors can't cross */ + unsigned bound; + + /* Alignment for node in log2(bytes)+1. Since alignment must be + * non-negative power-of-two, the elements are strictly positive + * integers. Zero is the sentinel for a missing node */ + unsigned *alignment; + + /* Linear constraints imposed. Nested array sized upfront, organized as + * linear[node_left][node_right]. That is, calculate indices as: + * + * Each element is itself a bit field denoting whether (c_j - c_i) bias + * is present or not, including negative biases. + * + * Note for Midgard, there are 16 components so the bias is in range + * [-15, 15] so encoded by 32-bit field. */ + + uint32_t *linear; + + /* Per node max modulus constraints */ + uint8_t *modulus; + + /* Classes allow nodes to be partitioned with a starting register. + * Classes cannot interfere; that is, they are true partitions in the + * usual sense of the word. class_count is the number of classes. + * class[] is indexed by a node to get the mapped class. class_start is + * biased to all solutions in the class. */ + + unsigned class_count; + unsigned *class; + unsigned *class_start; + unsigned *class_size; + bool *class_disjoint; + + /* Before solving, forced registers; after solving, solutions. */ + unsigned *solutions; + + /* For register spilling, the costs to spill nodes (as set by the user) + * are in spill_cost[], negative if a node is unspillable. Internally, + * spill_class specifies which class to spill (whichever class failed + * to allocate) */ + + signed *spill_cost; + unsigned spill_class; +}; + +struct lcra_state * +lcra_alloc_equations( + unsigned node_count, + unsigned min_alignment, unsigned max_alignment, + unsigned bound, unsigned class_count); + +void +lcra_free(struct lcra_state *l); + +void +lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); + +void +lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2); + +void +lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); + +void +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j); + +bool +lcra_solve(struct lcra_state *l); + +void +lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); + +signed +lcra_get_best_spill_node(struct lcra_state *l); + +#endif diff -Nru mesa-19.2.8/src/panfrost/midgard/meson.build mesa-20.0.8/src/panfrost/midgard/meson.build --- mesa-19.2.8/src/panfrost/midgard/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -31,12 +31,16 @@ 'midgard_liveness.c', 'midgard_ops.c', 'mir_promote_uniforms.c', + 'mir_squeeze.c', 'midgard_opt_copy_prop.c', 'midgard_opt_dce.c', 'midgard_opt_invert.c', + 'midgard_opt_float.c', 'midgard_opt_perspective.c', - 'cppwrap.cpp', + 'midgard_errata_lod.c', + 'nir_undef_to_zero.c', 'disassemble.c', + 'lcra.c' ) midgard_nir_algebraic_c = custom_target( diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_compile.c mesa-20.0.8/src/panfrost/midgard/midgard_compile.c --- mesa-19.2.8/src/panfrost/midgard/midgard_compile.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_compile.c 2020-06-12 01:21:18.000000000 +0000 @@ -48,6 +48,7 @@ #include "midgard_ops.h" #include "helpers.h" #include "compiler.h" +#include "midgard_quirks.h" #include "disassemble.h" @@ -68,11 +69,18 @@ do { if (midgard_debug & MIDGARD_DBG_MSGS) \ fprintf(stderr, "%s:%d: "fmt, \ __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) - -static bool -midgard_is_branch_unit(unsigned unit) +static midgard_block * +create_empty_block(compiler_context *ctx) { - return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); + midgard_block *blk = rzalloc(ctx, midgard_block); + + blk->predecessors = _mesa_set_create(blk, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + blk->source_id = ctx->block_source_count++; + + return blk; } static void @@ -94,6 +102,19 @@ _mesa_set_add(successor->predecessors, block); } +static void +schedule_barrier(compiler_context *ctx) +{ + midgard_block *temp = ctx->after_block; + ctx->after_block = create_empty_block(ctx); + ctx->block_count++; + list_addtail(&ctx->after_block->link, &ctx->blocks); + list_inithead(&ctx->after_block->instructions); + midgard_block_add_successor(ctx->current_block, ctx->after_block); + ctx->current_block = ctx->after_block; + ctx->after_block = temp; +} + /* Helpers to generate midgard_instruction's using macro magic, since every * driver seems to do it that way */ @@ -104,21 +125,19 @@ midgard_instruction i = { \ .type = TAG_LOAD_STORE_4, \ .mask = 0xF, \ - .ssa_args = { \ - .dest = -1, \ - .src = { -1, -1, -1 }, \ - }, \ + .dest = ~0, \ + .src = { ~0, ~0, ~0, ~0 }, \ + .swizzle = SWIZZLE_IDENTITY_4, \ .load_store = { \ .op = midgard_op_##name, \ - .swizzle = SWIZZLE_XYZW, \ .address = address \ } \ }; \ \ if (store) \ - i.ssa_args.src[0] = ssa; \ + i.src[0] = ssa; \ else \ - i.ssa_args.dest = ssa; \ + i.dest = ssa; \ \ return i; \ } @@ -133,14 +152,12 @@ vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, bool half, bool sext) { - if (!src) return blank_alu_src; - - /* Figure out how many components there are so we can adjust the - * swizzle. Specifically we want to broadcast the last channel so - * things like ball2/3 work + /* Figure out how many components there are so we can adjust. + * Specifically we want to broadcast the last channel so things like + * ball2/3 work. */ - if (broadcast_count) { + if (broadcast_count && src) { uint8_t last_component = src->swizzle[broadcast_count - 1]; for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { @@ -151,8 +168,7 @@ midgard_vector_alu_src alu_src = { .rep_low = 0, .rep_high = 0, - .half = half, - .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle) + .half = half }; if (is_int) { @@ -167,9 +183,11 @@ } /* These should have been lowered away */ - assert(!(src->abs || src->negate)); + if (src) + assert(!(src->abs || src->negate)); } else { - alu_src.mod = (src->abs << 0) | (src->negate << 1); + if (src) + alu_src.mod = (src->abs << 0) | (src->negate << 1); } return alu_src; @@ -187,44 +205,13 @@ M_LOAD(ld_ubo_int4); M_LOAD(ld_int4); M_STORE(st_int4); -M_LOAD(ld_color_buffer_8); +M_LOAD(ld_color_buffer_32u); //M_STORE(st_vary_16); M_STORE(st_vary_32); M_LOAD(ld_cubemap_coords); M_LOAD(ld_compute_id); static midgard_instruction -v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond) -{ - midgard_branch_cond branch = { - .op = op, - .dest_tag = tag, - .offset = offset, - .cond = cond - }; - - uint16_t compact; - memcpy(&compact, &branch, sizeof(branch)); - - midgard_instruction ins = { - .type = TAG_ALU_4, - .unit = ALU_ENAB_BR_COMPACT, - .prepacked_branch = true, - .compact_branch = true, - .br_compact = compact, - .ssa_args = { - .dest = -1, - .src = { -1, -1, -1 }, - } - }; - - if (op == midgard_jmp_writeout_op_writeout) - ins.writeout = true; - - return ins; -} - -static midgard_instruction v_branch(bool conditional, bool invert) { midgard_instruction ins = { @@ -235,10 +222,8 @@ .conditional = conditional, .invert_conditional = invert }, - .ssa_args = { - .dest = -1, - .src = { -1, -1, -1 }, - } + .dest = ~0, + .src = { ~0, ~0, ~0, ~0 }, }; return ins; @@ -325,6 +310,17 @@ } static int +midgard_sysval_for_sampler(nir_intrinsic_instr *instr) +{ + /* TODO: indirect samplers !!! */ + nir_src index = instr->src[0]; + assert(nir_src_is_const(index)); + uint32_t uindex = nir_src_as_uint(index); + + return PAN_SYSVAL(SAMPLER, uindex); +} + +static int midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) { switch (instr->intrinsic) { @@ -337,8 +333,10 @@ case nir_intrinsic_load_ssbo: case nir_intrinsic_store_ssbo: return midgard_sysval_for_ssbo(instr); + case nir_intrinsic_load_sampler_lod_parameters_pan: + return midgard_sysval_for_sampler(instr); default: - return -1; + return ~0; } } @@ -451,7 +449,7 @@ /* Flushes undefined values to zero */ static void -optimise_nir(nir_shader *nir) +optimise_nir(nir_shader *nir, unsigned quirks) { bool progress; unsigned lower_flrp = @@ -460,16 +458,28 @@ (nir->options->lower_flrp64 ? 64 : 0); NIR_PASS(progress, nir, nir_lower_regs_to_ssa); - NIR_PASS(progress, nir, midgard_nir_lower_fdot2); - NIR_PASS(progress, nir, nir_lower_idiv); + NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast); nir_lower_tex_options lower_tex_options = { .lower_txs_lod = true, - .lower_txp = ~0 + .lower_txp = ~0, + .lower_tex_without_implicit_lod = + (quirks & MIDGARD_EXPLICIT_LOD), + + /* TODO: we have native gradient.. */ + .lower_txd = true, }; NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); + /* Must lower fdot2 after tex is lowered */ + NIR_PASS(progress, nir, midgard_nir_lower_fdot2); + + /* T720 is broken. */ + + if (quirks & MIDGARD_BROKEN_LOD) + NIR_PASS_V(nir, midgard_nir_lod_errata); + do { progress = false; @@ -477,6 +487,7 @@ NIR_PASS(progress, nir, nir_lower_vars_to_ssa); NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); @@ -561,11 +572,33 @@ { nir_ssa_def def = instr->def; - float *v = rzalloc_array(NULL, float, 4); - nir_const_load_to_arr(v, instr, f32); + midgard_constants *consts = rzalloc(NULL, midgard_constants); + + assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8); + +#define RAW_CONST_COPY(bits) \ + nir_const_value_to_array(consts->u##bits, instr->value, \ + instr->def.num_components, u##bits) + + switch (instr->def.bit_size) { + case 64: + RAW_CONST_COPY(64); + break; + case 32: + RAW_CONST_COPY(32); + break; + case 16: + RAW_CONST_COPY(16); + break; + case 8: + RAW_CONST_COPY(8); + break; + default: + unreachable("Invalid bit_size for load_const instruction\n"); + } /* Shifted for SSA, +1 for off-by-one */ - _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, v); + _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts); } /* Normally constants are embedded implicitly, but for I/O and such we have to @@ -577,7 +610,7 @@ void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1); if (constant_value) { - midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, to); + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to); attach_constants(ctx, &ins, constant_value, node + 1); emit_mir_instruction(ctx, ins); } @@ -596,87 +629,6 @@ return false; } -/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the - * output of a conditional test) into that register */ - -static void -emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component) -{ - int condition = nir_src_index(ctx, src); - - /* Source to swizzle the desired component into w */ - - const midgard_vector_alu_src alu_src = { - .swizzle = SWIZZLE(component, component, component, component), - }; - - /* There is no boolean move instruction. Instead, we simulate a move by - * ANDing the condition with itself to get it into r31.w */ - - midgard_instruction ins = { - .type = TAG_ALU_4, - - /* We need to set the conditional as close as possible */ - .precede_break = true, - .unit = for_branch ? UNIT_SMUL : UNIT_SADD, - .mask = 1 << COMPONENT_W, - - .ssa_args = { - .src = { condition, condition, -1 }, - .dest = SSA_FIXED_REGISTER(31), - }, - - .alu = { - .op = midgard_alu_op_iand, - .outmod = midgard_outmod_int_wrap, - .reg_mode = midgard_reg_mode_32, - .dest_override = midgard_dest_override_none, - .src1 = vector_alu_srco_unsigned(alu_src), - .src2 = vector_alu_srco_unsigned(alu_src) - }, - }; - - emit_mir_instruction(ctx, ins); -} - -/* Or, for mixed conditions (with csel_v), here's a vector version using all of - * r31 instead */ - -static void -emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp) -{ - int condition = nir_src_index(ctx, &src->src); - - /* Source to swizzle the desired component into w */ - - const midgard_vector_alu_src alu_src = { - .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle), - }; - - /* There is no boolean move instruction. Instead, we simulate a move by - * ANDing the condition with itself to get it into r31.w */ - - midgard_instruction ins = { - .type = TAG_ALU_4, - .precede_break = true, - .mask = mask_of(nr_comp), - .ssa_args = { - .src = { condition, condition, -1 }, - .dest = SSA_FIXED_REGISTER(31), - }, - .alu = { - .op = midgard_alu_op_iand, - .outmod = midgard_outmod_int_wrap, - .reg_mode = midgard_reg_mode_32, - .dest_override = midgard_dest_override_none, - .src1 = vector_alu_srco_unsigned(alu_src), - .src2 = vector_alu_srco_unsigned(alu_src) - }, - }; - - emit_mir_instruction(ctx, ins); -} - #define ALU_CASE(nir, _op) \ case nir_op_##nir: \ op = midgard_alu_op_##_op; \ @@ -834,6 +786,11 @@ ALU_CASE(fexp2, fexp2); ALU_CASE(flog2, flog2); + ALU_CASE(f2i64, f2i_rtz); + ALU_CASE(f2u64, f2u_rtz); + ALU_CASE(i2f64, i2f_rtz); + ALU_CASE(u2f64, u2f_rtz); + ALU_CASE(f2i32, f2i_rtz); ALU_CASE(f2u32, f2u_rtz); ALU_CASE(i2f32, i2f_rtz); @@ -887,14 +844,24 @@ case nir_op_i2i8: case nir_op_i2i16: case nir_op_i2i32: + case nir_op_i2i64: /* If we end up upscale, we'll need a sign-extend on the * operand (the second argument) */ sext_2 = true; + /* fallthrough */ case nir_op_u2u8: case nir_op_u2u16: - case nir_op_u2u32: { - op = midgard_alu_op_imov; + case nir_op_u2u32: + case nir_op_u2u64: + case nir_op_f2f16: + case nir_op_f2f32: + case nir_op_f2f64: { + if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 || + instr->op == nir_op_f2f64) + op = midgard_alu_op_fmov; + else + op = midgard_alu_op_imov; if (dst_bitsize == (src_bitsize * 2)) { /* Converting up */ @@ -910,24 +877,6 @@ break; } - case nir_op_f2f16: { - assert(src_bitsize == 32); - - op = midgard_alu_op_fmov; - dest_override = midgard_dest_override_lower; - break; - } - - case nir_op_f2f32: { - assert(src_bitsize == 16); - - op = midgard_alu_op_fmov; - half_2 = true; - reg_mode++; - break; - } - - /* For greater-or-equal, we lower to less-or-equal and flip the * arguments */ @@ -963,21 +912,16 @@ bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; - /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */ - nr_inputs = 2; - - /* Emit the condition into r31 */ - - if (mixed) - emit_condition_mixed(ctx, &instr->src[0], nr_components); - else - emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]); - /* The condition is the first argument; move the other * arguments up one to be a binary instruction for - * Midgard */ + * Midgard with the condition last */ + + nir_alu_src temp = instr->src[2]; + + instr->src[2] = instr->src[0]; + instr->src[0] = instr->src[1]; + instr->src[1] = temp; - memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src)); break; } @@ -1021,26 +965,27 @@ * needs it, or else we may segfault. */ unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]); - unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0; + unsigned src1 = nr_inputs >= 2 ? nir_alu_src_index(ctx, &instr->src[1]) : ~0; + unsigned src2 = nr_inputs == 3 ? nir_alu_src_index(ctx, &instr->src[2]) : ~0; + assert(nr_inputs <= 3); /* Rather than use the instruction generation helpers, we do it * ourselves here to avoid the mess */ midgard_instruction ins = { .type = TAG_ALU_4, - .ssa_args = { - .src = { - quirk_flipped_r24 ? SSA_UNUSED_1 : src0, - quirk_flipped_r24 ? src0 : src1, - -1 - }, - .dest = dest, - } + .src = { + quirk_flipped_r24 ? ~0 : src0, + quirk_flipped_r24 ? src0 : src1, + src2, + ~0 + }, + .dest = dest, }; - nir_alu_src *nirmods[2] = { NULL }; + nir_alu_src *nirmods[3] = { NULL }; - if (nr_inputs == 2) { + if (nr_inputs >= 2) { nirmods[0] = &instr->src[0]; nirmods[1] = &instr->src[1]; } else if (nr_inputs == 1) { @@ -1049,6 +994,9 @@ assert(0); } + if (nr_inputs == 3) + nirmods[2] = &instr->src[2]; + /* These were lowered to a move, so apply the corresponding mod */ if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { @@ -1080,6 +1028,24 @@ if (!is_ssa) ins.mask &= instr->dest.write_mask; + for (unsigned m = 0; m < 3; ++m) { + if (!nirmods[m]) + continue; + + for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) + ins.swizzle[m][c] = nirmods[m]->swizzle[c]; + + /* Replicate. TODO: remove when vec16 lands */ + for (unsigned c = NIR_MAX_VEC_COMPONENTS; c < MIR_VEC_COMPONENTS; ++c) + ins.swizzle[m][c] = nirmods[m]->swizzle[NIR_MAX_VEC_COMPONENTS - 1]; + } + + if (nr_inputs == 3) { + /* Conditions can't have mods */ + assert(!nirmods[2]->abs); + assert(!nirmods[2]->negate); + } + ins.alu = alu; /* Late fixup for emulated instructions */ @@ -1090,26 +1056,26 @@ * inline, since we're 32-bit, not 16-bit like the inline * constants) */ - ins.ssa_args.inline_constant = false; - ins.ssa_args.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_inline_constant = false; + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); ins.has_constants = true; - if (instr->op == nir_op_b2f32) { - ins.constants[0] = 1.0f; - } else { - /* Type pun it into place */ - uint32_t one = 0x1; - memcpy(&ins.constants[0], &one, sizeof(uint32_t)); - } + if (instr->op == nir_op_b2f32) + ins.constants.f32[0] = 1.0f; + else + ins.constants.i32[0] = 1; - ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; } else if (nr_inputs == 1 && !quirk_flipped_r24) { /* Lots of instructions need a 0 plonked in */ - ins.ssa_args.inline_constant = false; - ins.ssa_args.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_inline_constant = false; + ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); ins.has_constants = true; - ins.constants[0] = 0.0f; - ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + ins.constants.u32[0] = 0; + + for (unsigned c = 0; c < 16; ++c) + ins.swizzle[1][c] = 0; } else if (instr->op == nir_op_inot) { ins.invert = true; } @@ -1119,8 +1085,6 @@ * instructions can only operate as if they were scalars. Lower * them here by changing the component. */ - uint8_t original_swizzle[4]; - memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle)); unsigned orig_mask = ins.mask; for (int i = 0; i < nr_components; ++i) { @@ -1133,10 +1097,9 @@ if (!ins.mask) continue; - for (int j = 0; j < 4; ++j) - nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */ + for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) + ins.swizzle[0][j] = nirmods[0]->swizzle[i]; /* Pull from the correct component */ - ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false)); emit_mir_instruction(ctx, ins); } } else { @@ -1146,44 +1109,53 @@ #undef ALU_CASE -static unsigned -mir_mask_for_intr(nir_instr *instr, bool is_read) +static void +mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned nir_mask = 0; + unsigned dsize = 0; - if (is_read) - return mask_of(nir_intrinsic_dest_components(intr)); - else - return nir_intrinsic_write_mask(intr); + if (is_read) { + nir_mask = mask_of(nir_intrinsic_dest_components(intr)); + dsize = nir_dest_bit_size(intr->dest); + } else { + nir_mask = nir_intrinsic_write_mask(intr); + dsize = 32; + } + + /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ + unsigned bytemask = mir_to_bytemask(mir_mode_for_destsize(dsize), nir_mask); + mir_set_bytemask(ins, bytemask); + + if (dsize == 64) + ins->load_64 = true; } /* Uniforms and UBOs use a shared code path, as uniforms are just (slightly * optimized) versions of UBO #0 */ -midgard_instruction * +static midgard_instruction * emit_ubo_read( compiler_context *ctx, nir_instr *instr, unsigned dest, unsigned offset, nir_src *indirect_offset, + unsigned indirect_shift, unsigned index) { /* TODO: half-floats */ - midgard_instruction ins = m_ld_ubo_int4(dest, offset); - - assert((offset & 0xF) == 0); - offset /= 16; + midgard_instruction ins = m_ld_ubo_int4(dest, 0); + ins.constants.u32[0] = offset; - /* TODO: Don't split */ - ins.load_store.varying_parameters = (offset & 7) << 7; - ins.load_store.address = offset >> 3; - ins.mask = mir_mask_for_intr(instr, true); + if (instr->type == nir_instr_type_intrinsic) + mir_set_intr_mask(instr, &ins, true); if (indirect_offset) { - ins.ssa_args.src[1] = nir_src_index(ctx, indirect_offset); - ins.load_store.arg_2 = 0x80; + ins.src[2] = nir_src_index(ctx, indirect_offset); + ins.load_store.arg_2 = (indirect_shift << 5); } else { ins.load_store.arg_2 = 0x1E; } @@ -1220,16 +1192,9 @@ unsigned addr = make_compiler_temp(ctx); emit_sysval_read(ctx, instr, addr, 2); - /* The source array is a bit of a leaky abstraction for SSBOs. - * Nevertheless, for loads: - * - * src[0] = arg_1 - * src[1] = arg_2 - * src[2] = unused + /* The source array: * - * Whereas for stores: - * - * src[0] = value + * src[0] = store ? value : unused * src[1] = arg_1 * src[2] = arg_2 * @@ -1237,19 +1202,18 @@ * arg_2 = the offset. */ - ins.ssa_args.src[is_read ? 0 : 1] = addr; + ins.src[1] = addr; /* TODO: What is this? It looks superficially like a shift << 5, but - * arg_1 doesn't take a shift Should it be E0 or A0? */ - if (indirect_offset) - ins.load_store.arg_1 |= 0xE0; + * arg_1 doesn't take a shift Should it be E0 or A0? We also need the + * indirect offset. */ - /* We also need to emit the indirect offset */ - - if (indirect_offset) - ins.ssa_args.src[is_read ? 1 : 2] = nir_src_index(ctx, indirect_offset); - else + if (indirect_offset) { + ins.load_store.arg_1 |= 0xE0; + ins.src[2] = nir_src_index(ctx, indirect_offset); + } else { ins.load_store.arg_2 = 0x7E; + } /* TODO: Bounds check */ @@ -1257,7 +1221,7 @@ ins.load_store.varying_parameters = (offset & 0x1FF) << 1; ins.load_store.address = (offset >> 9); - ins.mask = mir_mask_for_intr(instr, is_read); + mir_set_intr_mask(instr, &ins, is_read); emit_mir_instruction(ctx, ins); } @@ -1267,19 +1231,21 @@ compiler_context *ctx, unsigned dest, unsigned offset, unsigned nr_comp, unsigned component, - nir_src *indirect_offset, nir_alu_type type) + nir_src *indirect_offset, nir_alu_type type, bool flat) { /* XXX: Half-floats? */ /* TODO: swizzle, mask */ midgard_instruction ins = m_ld_vary_32(dest, offset); ins.mask = mask_of(nr_comp); - ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component); + + for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) + ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W); midgard_varying_parameter p = { .is_varying = 1, .interpolation = midgard_interp_default, - .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0 + .flat = flat, }; unsigned u; @@ -1287,7 +1253,7 @@ ins.load_store.varying_parameters = u; if (indirect_offset) - ins.ssa_args.src[1] = nir_src_index(ctx, indirect_offset); + ins.src[2] = nir_src_index(ctx, indirect_offset); else ins.load_store.arg_2 = 0x1E; @@ -1313,6 +1279,37 @@ emit_mir_instruction(ctx, ins); } +static void +emit_attr_read( + compiler_context *ctx, + unsigned dest, unsigned offset, + unsigned nr_comp, nir_alu_type t) +{ + midgard_instruction ins = m_ld_attr_32(dest, offset); + ins.load_store.arg_1 = 0x1E; + ins.load_store.arg_2 = 0x1E; + ins.mask = mask_of(nr_comp); + + /* Use the type appropriate load */ + switch (t) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_attr_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_attr_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_attr_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); +} + void emit_sysval_read(compiler_context *ctx, nir_instr *instr, signed dest_override, unsigned nr_components) @@ -1331,7 +1328,7 @@ /* Emit the read itself -- this is never indirect */ midgard_instruction *ins = - emit_ubo_read(ctx, instr, dest, uniform * 16, NULL, 0); + emit_ubo_read(ctx, instr, dest, uniform * 16, NULL, 0, 0); ins->mask = mask_of(nr_components); } @@ -1349,62 +1346,32 @@ } } -/* Emit store for a fragment shader, which is encoded via a fancy branch. TODO: - * Handle MRT here */ - static void emit_fragment_store(compiler_context *ctx, unsigned src, unsigned rt) { - /* First, move in whatever we're outputting */ - midgard_instruction move = v_mov(src, blank_alu_src, SSA_FIXED_REGISTER(0)); - if (rt != 0) { - /* Force a tight schedule. TODO: Make the scheduler MRT aware */ - move.unit = UNIT_VMUL; - move.precede_break = true; - move.dont_eliminate = true; - } - - emit_mir_instruction(ctx, move); - - /* If we're doing MRT, we need to specify the render target */ - - midgard_instruction rt_move = { - .ssa_args = { - .dest = -1 - } - }; + emit_explicit_constant(ctx, src, src); - if (rt != 0) { - /* We'll write to r1.z */ - rt_move = v_mov(-1, blank_alu_src, SSA_FIXED_REGISTER(1)); - rt_move.mask = 1 << COMPONENT_Z; - rt_move.unit = UNIT_SADD; - - /* r1.z = (rt * 0x100) */ - rt_move.ssa_args.inline_constant = true; - rt_move.inline_constant = (rt * 0x100); - - /* r1 */ - ctx->work_registers = MAX2(ctx->work_registers, 1); + struct midgard_instruction ins = + v_branch(false, false); - /* Do the write */ - emit_mir_instruction(ctx, rt_move); - } + ins.writeout = true; - /* Next, generate the branch. For R render targets in the writeout, the - * i'th render target jumps to pseudo-offset [2(R-1) + i] */ + /* Add dependencies */ + ins.src[0] = src; + ins.constants.u32[0] = rt * 0x100; - unsigned offset = (2 * (ctx->nir->num_outputs - 1)) + rt; + /* Emit the branch */ + midgard_instruction *br = emit_mir_instruction(ctx, ins); + schedule_barrier(ctx); - struct midgard_instruction ins = - v_alu_br_compact_cond(midgard_jmp_writeout_op_writeout, TAG_ALU_4, offset, midgard_condition_always); + assert(rt < ARRAY_SIZE(ctx->writeout_branch)); + assert(!ctx->writeout_branch[rt]); + ctx->writeout_branch[rt] = br; - /* Add dependencies */ - ins.ssa_args.src[0] = move.ssa_args.dest; - ins.ssa_args.src[1] = rt_move.ssa_args.dest; + /* Push our current location = current block count - 1 = where we'll + * jump to. Maybe a bit too clever for my own good */ - /* Emit the branch */ - emit_mir_instruction(ctx, ins); + br->branch.target_block = ctx->block_count - 1; } static void @@ -1416,6 +1383,27 @@ ins.load_store.arg_1 = compute_builtin_arg(instr->intrinsic); emit_mir_instruction(ctx, ins); } + +static unsigned +vertex_builtin_arg(nir_op op) +{ + switch (op) { + case nir_intrinsic_load_vertex_id: + return PAN_VERTEX_ID; + case nir_intrinsic_load_instance_id: + return PAN_INSTANCE_ID; + default: + unreachable("Invalid vertex builtin"); + } +} + +static void +emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned reg = nir_dest_index(ctx, &instr->dest); + emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); +} + static void emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) { @@ -1423,30 +1411,38 @@ switch (instr->intrinsic) { case nir_intrinsic_discard_if: - emit_condition(ctx, &instr->src[0], true, COMPONENT_X); - - /* fallthrough */ - case nir_intrinsic_discard: { bool conditional = instr->intrinsic == nir_intrinsic_discard_if; struct midgard_instruction discard = v_branch(conditional, false); discard.branch.target_type = TARGET_DISCARD; + + if (conditional) + discard.src[0] = nir_src_index(ctx, &instr->src[0]); + emit_mir_instruction(ctx, discard); + schedule_barrier(ctx); + break; } case nir_intrinsic_load_uniform: case nir_intrinsic_load_ubo: case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_input: { + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: { bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo; + bool is_flat = instr->intrinsic == nir_intrinsic_load_input; + bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input; /* Get the base type of the intrinsic */ /* TODO: Infer type? Does it matter? */ nir_alu_type t = - (is_ubo || is_ssbo) ? nir_type_uint : nir_intrinsic_type(instr); + (is_ubo || is_ssbo) ? nir_type_uint : + (is_interp) ? nir_type_float : + nir_intrinsic_type(instr); + t = nir_alu_type_get_base_type(t); if (!(is_ubo || is_ssbo)) { @@ -1464,27 +1460,20 @@ offset += nir_src_as_uint(*src_offset); /* We may need to apply a fractional offset */ - int component = instr->intrinsic == nir_intrinsic_load_input ? + int component = (is_flat || is_interp) ? nir_intrinsic_component(instr) : 0; reg = nir_dest_index(ctx, &instr->dest); if (is_uniform && !ctx->is_blend) { - emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysval_count + offset) * 16, indirect_offset, 0); + emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysval_count + offset) * 16, indirect_offset, 4, 0); } else if (is_ubo) { nir_src index = instr->src[0]; - /* We don't yet support indirect UBOs. For indirect - * block numbers (if that's possible), we don't know - * enough about the hardware yet. For indirect sources, - * we know what we need but we need to add some NIR - * support for lowering correctly with respect to - * 128-bit reads */ - + /* TODO: Is indirect block number possible? */ assert(nir_src_is_const(index)); - assert(nir_src_is_const(*src_offset)); uint32_t uindex = nir_src_as_uint(index) + 1; - emit_ubo_read(ctx, &instr->instr, reg, offset, NULL, uindex); + emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex); } else if (is_ssbo) { nir_src index = instr->src[0]; assert(nir_src_is_const(index)); @@ -1492,37 +1481,16 @@ emit_ssbo_access(ctx, &instr->instr, true, reg, offset, indirect_offset, uindex); } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { - emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t); + emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t, is_flat); } else if (ctx->is_blend) { /* For blend shaders, load the input color, which is * preloaded to r0 */ - midgard_instruction move = v_mov(SSA_FIXED_REGISTER(0), blank_alu_src, reg); + midgard_instruction move = v_mov(SSA_FIXED_REGISTER(0), reg); emit_mir_instruction(ctx, move); - } else if (ctx->stage == MESA_SHADER_VERTEX) { - midgard_instruction ins = m_ld_attr_32(reg, offset); - ins.load_store.arg_1 = 0x1E; - ins.load_store.arg_2 = 0x1E; - ins.mask = mask_of(nr_comp); - - /* Use the type appropriate load */ - switch (t) { - case nir_type_uint: - case nir_type_bool: - ins.load_store.op = midgard_op_ld_attr_32u; - break; - case nir_type_int: - ins.load_store.op = midgard_op_ld_attr_32i; - break; - case nir_type_float: - ins.load_store.op = midgard_op_ld_attr_32; - break; - default: - unreachable("Attempted to load unknown type"); - break; - } - - emit_mir_instruction(ctx, ins); + schedule_barrier(ctx); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + emit_attr_read(ctx, reg, offset, nr_comp, t); } else { DBG("Unknown load\n"); assert(0); @@ -1531,14 +1499,38 @@ break; } + /* Artefact of load_interpolated_input. TODO: other barycentric modes */ + case nir_intrinsic_load_barycentric_pixel: + break; + /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ case nir_intrinsic_load_raw_output_pan: + case nir_intrinsic_load_output_u8_as_fp16_pan: reg = nir_dest_index(ctx, &instr->dest); assert(ctx->is_blend); - midgard_instruction ins = m_ld_color_buffer_8(reg, 0); - emit_mir_instruction(ctx, ins); + /* T720 and below use different blend opcodes with slightly + * different semantics than T760 and up */ + + midgard_instruction ld = m_ld_color_buffer_32u(reg, 0); + bool old_blend = ctx->quirks & MIDGARD_OLD_BLEND; + + if (instr->intrinsic == nir_intrinsic_load_output_u8_as_fp16_pan) { + ld.load_store.op = old_blend ? + midgard_op_ld_color_buffer_u8_as_fp16_old : + midgard_op_ld_color_buffer_u8_as_fp16; + + if (old_blend) { + ld.load_store.address = 1; + ld.load_store.arg_2 = 0x1E; + } + + for (unsigned c = 2; c < 16; ++c) + ld.swizzle[0][c] = 0; + } + + emit_mir_instruction(ctx, ld); break; case nir_intrinsic_load_blend_const_color_rgba: { @@ -1548,7 +1540,7 @@ /* Blend constants are embedded directly in the shader and * patched in, so we use some magic routing */ - midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg); + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), reg); ins.has_constants = true; ins.has_blend_constant = true; emit_mir_instruction(ctx, ins); @@ -1563,7 +1555,6 @@ reg = nir_src_index(ctx, &instr->src[0]); if (ctx->stage == MESA_SHADER_FRAGMENT) { - /* Determine number of render targets */ emit_fragment_store(ctx, reg, offset); } else if (ctx->stage == MESA_SHADER_VERTEX) { /* We should have been vectorized, though we don't @@ -1576,13 +1567,44 @@ emit_explicit_constant(ctx, reg, reg); - unsigned component = nir_intrinsic_component(instr); + unsigned dst_component = nir_intrinsic_component(instr); unsigned nr_comp = nir_src_num_components(instr->src[0]); midgard_instruction st = m_st_vary_32(reg, offset); st.load_store.arg_1 = 0x9E; st.load_store.arg_2 = 0x1E; - st.load_store.swizzle = swizzle_of(nr_comp) << (2*component); + + switch (nir_alu_type_get_base_type(nir_intrinsic_type(instr))) { + case nir_type_uint: + case nir_type_bool: + st.load_store.op = midgard_op_st_vary_32u; + break; + case nir_type_int: + st.load_store.op = midgard_op_st_vary_32i; + break; + case nir_type_float: + st.load_store.op = midgard_op_st_vary_32; + break; + default: + unreachable("Attempted to store unknown type"); + break; + } + + /* nir_intrinsic_component(store_intr) encodes the + * destination component start. Source component offset + * adjustment is taken care of in + * install_registers_instr(), when offset_swizzle() is + * called. + */ + unsigned src_component = COMPONENT_X; + + assert(nr_comp > 0); + for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) { + st.swizzle[0][i] = src_component; + if (i >= dst_component && i < dst_component + nr_comp - 1) + src_component++; + } + emit_mir_instruction(ctx, st); } else { DBG("Unknown store\n"); @@ -1595,7 +1617,26 @@ case nir_intrinsic_store_raw_output_pan: assert (ctx->stage == MESA_SHADER_FRAGMENT); reg = nir_src_index(ctx, &instr->src[0]); - emit_fragment_store(ctx, reg, 0); + + if (ctx->quirks & MIDGARD_OLD_BLEND) { + /* Suppose reg = qr0.xyzw. That means 4 8-bit ---> 1 32-bit. So + * reg = r0.x. We want to splatter. So we can do a 32-bit move + * of: + * + * imov r0.xyzw, r0.xxxx + */ + + unsigned expanded = make_compiler_temp(ctx); + + midgard_instruction splatter = v_mov(reg, expanded); + + for (unsigned c = 0; c < 16; ++c) + splatter.swizzle[1][c] = 0; + + emit_mir_instruction(ctx, splatter); + emit_fragment_store(ctx, expanded, ctx->blend_rt); + } else + emit_fragment_store(ctx, reg, ctx->blend_rt); break; @@ -1613,21 +1654,11 @@ emit_ssbo_access(ctx, &instr->instr, false, reg, offset, indirect_offset, uindex); break; - case nir_intrinsic_load_alpha_ref_float: - assert(instr->dest.is_ssa); - - float ref_value = ctx->alpha_ref; - - /* See emit_load_const */ - float *v = ralloc_array(NULL, float, 4); - memcpy(v, &ref_value, sizeof(float)); - _mesa_hash_table_u64_insert(ctx->ssa_constants, (instr->dest.ssa.index << 1) + 1, v); - break; - case nir_intrinsic_load_viewport_scale: case nir_intrinsic_load_viewport_offset: case nir_intrinsic_load_num_work_groups: - emit_sysval_read(ctx, &instr->instr, -1, 3); + case nir_intrinsic_load_sampler_lod_parameters_pan: + emit_sysval_read(ctx, &instr->instr, ~0, 3); break; case nir_intrinsic_load_work_group_id: @@ -1635,6 +1666,11 @@ emit_compute_builtin(ctx, instr); break; + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_instance_id: + emit_vertex_builtin(ctx, instr); + break; + default: printf ("Unhandled intrinsic\n"); assert(0); @@ -1731,57 +1767,47 @@ midgard_instruction ins = { .type = TAG_TEXTURE_4, .mask = 0xF, - .ssa_args = { - .dest = nir_dest_index(ctx, &instr->dest), - .src = { -1, -1, -1 }, - }, + .dest = nir_dest_index(ctx, &instr->dest), + .src = { ~0, ~0, ~0, ~0 }, + .swizzle = SWIZZLE_IDENTITY_4, .texture = { .op = midgard_texop, .format = midgard_tex_format(instr->sampler_dim), .texture_handle = texture_index, .sampler_handle = sampler_index, - .swizzle = SWIZZLE_XYZW, - .in_reg_swizzle = SWIZZLE_XYZW, /* TODO: half */ .in_reg_full = 1, .out_full = 1, .sampler_type = midgard_sampler_type(instr->dest_type), + .shadow = instr->is_shadow, } }; + /* We may need a temporary for the coordinate */ + + bool needs_temp_coord = + (midgard_texop == TEXTURE_OP_TEXEL_FETCH) || + (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) || + (instr->is_shadow); + + unsigned coords = needs_temp_coord ? make_compiler_temp_reg(ctx) : 0; + for (unsigned i = 0; i < instr->num_srcs; ++i) { int index = nir_src_index(ctx, &instr->src[i].src); - midgard_vector_alu_src alu_src = blank_alu_src; unsigned nr_components = nir_src_num_components(instr->src[i].src); switch (instr->src[i].src_type) { case nir_tex_src_coord: { emit_explicit_constant(ctx, index, index); - /* Texelfetch coordinates uses all four elements - * (xyz/index) regardless of texture dimensionality, - * which means it's necessary to zero the unused - * components to keep everything happy */ - - if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { - unsigned old_index = index; - - index = make_compiler_temp(ctx); + unsigned coord_mask = mask_of(instr->coord_components); - /* mov index, old_index */ - midgard_instruction mov = v_mov(old_index, blank_alu_src, index); - mov.mask = 0x3; - emit_mir_instruction(ctx, mov); + bool flip_zw = (instr->sampler_dim == GLSL_SAMPLER_DIM_2D) && (coord_mask & (1 << COMPONENT_Z)); - /* mov index.zw, #0 */ - mov = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), - blank_alu_src, index); - mov.has_constants = true; - mov.mask = (1 << COMPONENT_Z) | (1 << COMPONENT_W); - emit_mir_instruction(ctx, mov); - } + if (flip_zw) + coord_mask ^= ((1 << COMPONENT_Z) | (1 << COMPONENT_W)); if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { /* texelFetch is undefined on samplerCube */ @@ -1791,30 +1817,67 @@ * select the face and copy the xy into the * texture register */ - unsigned temp = make_compiler_temp(ctx); - midgard_instruction ld = m_ld_cubemap_coords(temp, 0); - ld.ssa_args.src[0] = index; + midgard_instruction ld = m_ld_cubemap_coords(coords, 0); + ld.src[1] = index; ld.mask = 0x3; /* xy */ ld.load_store.arg_1 = 0x20; - ld.load_store.swizzle = alu_src.swizzle; + ld.swizzle[1][3] = COMPONENT_X; emit_mir_instruction(ctx, ld); - ins.ssa_args.src[0] = temp; - ins.texture.in_reg_swizzle = SWIZZLE_XYXX; + /* xyzw -> xyxx */ + ins.swizzle[1][2] = instr->is_shadow ? COMPONENT_Z : COMPONENT_X; + ins.swizzle[1][3] = COMPONENT_X; + } else if (needs_temp_coord) { + /* mov coord_temp, coords */ + midgard_instruction mov = v_mov(index, coords); + mov.mask = coord_mask; + + if (flip_zw) + mov.swizzle[1][COMPONENT_W] = COMPONENT_Z; + + emit_mir_instruction(ctx, mov); } else { - ins.ssa_args.src[0] = index; + coords = index; + } + + ins.src[1] = coords; + + /* Texelfetch coordinates uses all four elements + * (xyz/index) regardless of texture dimensionality, + * which means it's necessary to zero the unused + * components to keep everything happy */ + + if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { + /* mov index.zw, #0, or generalized */ + midgard_instruction mov = + v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), coords); + mov.has_constants = true; + mov.mask = coord_mask ^ 0xF; + emit_mir_instruction(ctx, mov); } if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D) { - /* Array component in w but NIR wants it in z */ - if (nr_components == 3) - ins.texture.in_reg_swizzle = SWIZZLE_XYZZ; - else if (nr_components == 2) - ins.texture.in_reg_swizzle = SWIZZLE_XYXX; - else + /* Array component in w but NIR wants it in z, + * but if we have a temp coord we already fixed + * that up */ + + if (nr_components == 3) { + ins.swizzle[1][2] = COMPONENT_Z; + ins.swizzle[1][3] = needs_temp_coord ? COMPONENT_W : COMPONENT_Z; + } else if (nr_components == 2) { + ins.swizzle[1][2] = + instr->is_shadow ? COMPONENT_Z : COMPONENT_X; + ins.swizzle[1][3] = COMPONENT_X; + } else unreachable("Invalid texture 2D components"); } + if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { + /* We zeroed */ + ins.swizzle[1][2] = COMPONENT_Z; + ins.swizzle[1][3] = COMPONENT_W; + } + break; } @@ -1827,12 +1890,41 @@ break; ins.texture.lod_register = true; - ins.ssa_args.src[1] = index; + ins.src[2] = index; + + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + ins.swizzle[2][c] = COMPONENT_X; + emit_explicit_constant(ctx, index, index); break; }; + case nir_tex_src_offset: { + ins.texture.offset_register = true; + ins.src[3] = index; + + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + ins.swizzle[3][c] = (c > COMPONENT_Z) ? 0 : c; + + emit_explicit_constant(ctx, index, index); + break; + }; + + case nir_tex_src_comparator: { + unsigned comp = COMPONENT_Z; + + /* mov coord_temp.foo, coords */ + midgard_instruction mov = v_mov(index, coords); + mov.mask = 1 << comp; + + for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) + mov.swizzle[1][i] = COMPONENT_X; + + emit_mir_instruction(ctx, mov); + break; + } + default: unreachable("Unknown texture source type\n"); } @@ -1847,14 +1939,6 @@ static void emit_tex(compiler_context *ctx, nir_tex_instr *instr) { - /* Fixup op, since only textureLod is permitted in VS but NIR can give - * generic tex in some cases (which confuses the hardware) */ - - bool is_vertex = ctx->stage == MESA_SHADER_VERTEX; - - if (is_vertex && instr->op == nir_texop_tex) - instr->op = nir_texop_txl; - switch (instr->op) { case nir_texop_tex: case nir_texop_txb: @@ -1867,7 +1951,7 @@ emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); break; case nir_texop_txs: - emit_sysval_read(ctx, &instr->instr, -1, 4); + emit_sysval_read(ctx, &instr->instr, ~0, 4); break; default: unreachable("Unhanlded texture op"); @@ -1931,29 +2015,30 @@ /* ALU instructions can inline or embed constants, which decreases register * pressure and saves space. */ -#define CONDITIONAL_ATTACH(src) { \ - void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \ +#define CONDITIONAL_ATTACH(idx) { \ + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \ \ if (entry) { \ - attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \ - alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ + attach_constants(ctx, alu, entry, alu->src[idx] + 1); \ + alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ } \ } static void -inline_alu_constants(compiler_context *ctx) +inline_alu_constants(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr(ctx, alu) { + mir_foreach_instr_in_block(block, alu) { /* Other instructions cannot inline constants */ if (alu->type != TAG_ALU_4) continue; + if (alu->compact_branch) continue; /* If there is already a constant here, we can do nothing */ if (alu->has_constants) continue; - CONDITIONAL_ATTACH(src[0]); + CONDITIONAL_ATTACH(0); if (!alu->has_constants) { - CONDITIONAL_ATTACH(src[1]) + CONDITIONAL_ATTACH(1) } else if (!alu->inline_constant) { /* Corner case: _two_ vec4 constants, for instance with a * csel. For this case, we can only use a constant @@ -1965,21 +2050,18 @@ * to the destination register. */ - void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src[1] + 1); - unsigned scratch = alu->ssa_args.dest; + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); + unsigned scratch = alu->dest; if (entry) { - midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch); - attach_constants(ctx, &ins, entry, alu->ssa_args.src[1] + 1); - - /* Force a break XXX Defer r31 writes */ - ins.unit = UNIT_VLUT; + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); + attach_constants(ctx, &ins, entry, alu->src[1] + 1); /* Set the source */ - alu->ssa_args.src[1] = scratch; + alu->src[1] = scratch; /* Inject us -before- the last instruction which set r31 */ - mir_insert_instruction_before(mir_prev_op(alu), ins); + mir_insert_instruction_before(ctx, mir_prev_op(alu), ins); } } } @@ -2026,12 +2108,11 @@ * sometimes a performance boost */ static void -embedded_to_inline_constant(compiler_context *ctx) +embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) { - mir_foreach_instr(ctx, ins) { + mir_foreach_instr_in_block(block, ins) { if (!ins->has_constants) continue; - - if (ins->ssa_args.inline_constant) continue; + if (ins->has_inline_constant) continue; /* Blend constants must not be inlined by definition */ if (ins->has_blend_constant) continue; @@ -2049,7 +2130,7 @@ int op = ins->alu.op; - if (ins->ssa_args.src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { bool flip = alu_opcode_props[op].props & OP_COMMUTES; switch (op) { @@ -2070,22 +2151,11 @@ break; } - if (flip) { - /* Flip the SSA numbers */ - ins->ssa_args.src[0] = ins->ssa_args.src[1]; - ins->ssa_args.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); - - /* And flip the modifiers */ - - unsigned src_temp; - - src_temp = ins->alu.src2; - ins->alu.src2 = ins->alu.src1; - ins->alu.src1 = src_temp; - } + if (flip) + mir_flip(ins); } - if (ins->ssa_args.src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { /* Extract the source information */ midgard_vector_alu_src *src; @@ -2093,21 +2163,24 @@ midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; src = m; - /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */ - int component = src->swizzle & 3; + /* Component is from the swizzle. Take a nonzero component */ + assert(ins->mask); + unsigned first_comp = ffs(ins->mask) - 1; + unsigned component = ins->swizzle[1][first_comp]; /* Scale constant appropriately, if we can legally */ uint16_t scaled_constant = 0; - if (midgard_is_integer_op(op) || is_16) { - unsigned int *iconstants = (unsigned int *) ins->constants; - scaled_constant = (uint16_t) iconstants[component]; + if (is_16) { + scaled_constant = ins->constants.u16[component]; + } else if (midgard_is_integer_op(op)) { + scaled_constant = ins->constants.u32[component]; /* Constant overflow after resize */ - if (scaled_constant != iconstants[component]) + if (scaled_constant != ins->constants.u32[component]) continue; } else { - float original = (float) ins->constants[component]; + float original = ins->constants.f32[component]; scaled_constant = _mesa_float_to_half(original); /* Check for loss of precision. If this is @@ -2130,22 +2203,23 @@ continue; } - /* Make sure that the constant is not itself a - * vector by checking if all accessed values - * (by the swizzle) are the same. */ + /* Make sure that the constant is not itself a vector + * by checking if all accessed values are the same. */ - uint32_t *cons = (uint32_t *) ins->constants; - uint32_t value = cons[component]; + const midgard_constants *cons = &ins->constants; + uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; bool is_vector = false; unsigned mask = effective_writemask(&ins->alu, ins->mask); - for (int c = 1; c < 4; ++c) { + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { /* We only care if this component is actually used */ if (!(mask & (1 << c))) continue; - uint32_t test = cons[(src->swizzle >> (2 * c)) & 3]; + uint32_t test = is_16 ? + cons->u16[ins->swizzle[1][c]] : + cons->u32[ins->swizzle[1][c]]; if (test != value) { is_vector = true; @@ -2158,8 +2232,8 @@ /* Get rid of the embedded constant */ ins->has_constants = false; - ins->ssa_args.src[1] = -1; - ins->ssa_args.inline_constant = true; + ins->src[1] = ~0; + ins->has_inline_constant = true; ins->inline_constant = scaled_constant; } } @@ -2176,19 +2250,8 @@ mir_foreach_instr_in_block_safe(block, ins) { if (!midgard_is_branch_unit(ins->unit)) continue; - /* We ignore prepacked branches since the fragment epilogue is - * just generally special */ - if (ins->prepacked_branch) continue; - - /* Discards are similarly special and may not correspond to the - * end of a block */ - - if (ins->branch.target_type == TARGET_DISCARD) continue; - - if (branched) { - /* We already branched, so this is dead */ + if (branched) mir_remove_instruction(ins); - } branched = true; } @@ -2224,9 +2287,8 @@ if (ins->alu.outmod != midgard_outmod_pos) continue; /* TODO: Registers? */ - unsigned src = ins->ssa_args.src[1]; + unsigned src = ins->src[1]; if (src & IS_REG) continue; - assert(!mir_has_multiple_writes(ctx, src)); /* There might be a source modifier, too */ if (mir_nontrivial_source2_mod(ins)) continue; @@ -2234,7 +2296,7 @@ /* Backpropagate the modifier */ mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { if (v->type != TAG_ALU_4) continue; - if (v->ssa_args.dest != src) continue; + if (v->dest != src) continue; /* Can we even take a float outmod? */ if (midgard_is_integer_out_op(v->alu.op)) continue; @@ -2256,25 +2318,20 @@ return progress; } -static void -emit_fragment_epilogue(compiler_context *ctx) -{ - /* Just emit the last chunk with the branch */ - EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); -} - -static midgard_block * -create_empty_block(compiler_context *ctx) +static unsigned +emit_fragment_epilogue(compiler_context *ctx, unsigned rt) { - midgard_block *blk = rzalloc(ctx, midgard_block); - - blk->predecessors = _mesa_set_create(blk, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + /* Loop to ourselves */ - blk->source_id = ctx->block_source_count++; + struct midgard_instruction ins = v_branch(false, false); + ins.writeout = true; + ins.branch.target_block = ctx->block_count - 1; + ins.constants.u32[0] = rt * 0x100; + emit_mir_instruction(ctx, ins); - return blk; + ctx->current_block->epilogue = true; + schedule_barrier(ctx); + return ins.branch.target_block; } static midgard_block * @@ -2291,9 +2348,6 @@ this_block->is_scheduled = false; ++ctx->block_count; - ctx->texture_index[0] = -1; - ctx->texture_index[1] = -1; - /* Set up current block */ list_inithead(&this_block->instructions); ctx->current_block = this_block; @@ -2303,20 +2357,6 @@ ++ctx->instruction_count; } - inline_alu_constants(ctx); - embedded_to_inline_constant(ctx); - - /* Append fragment shader epilogue (value writeout) */ - if (ctx->stage == MESA_SHADER_FRAGMENT) { - if (block == nir_impl_last_block(ctx->func->impl)) { - emit_fragment_epilogue(ctx); - } - } - - /* Allow the next control flow to access us retroactively, for - * branching etc */ - ctx->current_block = this_block; - return this_block; } @@ -2327,13 +2367,10 @@ { midgard_block *before_block = ctx->current_block; - /* Conditional branches expect the condition in r31.w; emit a move for - * that in the _previous_ block (which is the current block). */ - emit_condition(ctx, &nif->condition, true, COMPONENT_X); - /* Speculatively emit the branch, but we can't fill it in until later */ EMIT(branch, true, true); midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); + then_branch->src[0] = nir_src_index(ctx, &nif->condition); /* Emit the two subblocks. */ midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); @@ -2412,7 +2449,6 @@ mir_foreach_instr_in_block(block, ins) { if (ins->type != TAG_ALU_4) continue; if (!ins->compact_branch) continue; - if (ins->prepacked_branch) continue; /* We found a branch -- check the type to see if we need to do anything */ if (ins->branch.target_type != TARGET_BREAK) continue; @@ -2483,10 +2519,10 @@ unsigned first_tag = 0; mir_foreach_block_from(ctx, initial_block, v) { - midgard_bundle *initial_bundle = - util_dynarray_element(&v->bundles, midgard_bundle, 0); + if (v->quadword_count) { + midgard_bundle *initial_bundle = + util_dynarray_element(&v->bundles, midgard_bundle, 0); - if (initial_bundle) { first_tag = initial_bundle->tag; break; } @@ -2495,8 +2531,90 @@ return first_tag; } +static unsigned +pan_format_from_nir_base(nir_alu_type base) +{ + switch (base) { + case nir_type_int: + return MALI_FORMAT_SINT; + case nir_type_uint: + case nir_type_bool: + return MALI_FORMAT_UINT; + case nir_type_float: + return MALI_CHANNEL_FLOAT; + default: + unreachable("Invalid base"); + } +} + +static unsigned +pan_format_from_nir_size(nir_alu_type base, unsigned size) +{ + if (base == nir_type_float) { + switch (size) { + case 16: return MALI_FORMAT_SINT; + case 32: return MALI_FORMAT_UNORM; + default: + unreachable("Invalid float size for format"); + } + } else { + switch (size) { + case 1: + case 8: return MALI_CHANNEL_8; + case 16: return MALI_CHANNEL_16; + case 32: return MALI_CHANNEL_32; + default: + unreachable("Invalid int size for format"); + } + } +} + +static enum mali_format +pan_format_from_glsl(const struct glsl_type *type) +{ + enum glsl_base_type glsl_base = glsl_get_base_type(glsl_without_array(type)); + nir_alu_type t = nir_get_nir_type_for_glsl_base_type(glsl_base); + + unsigned base = nir_alu_type_get_base_type(t); + unsigned size = nir_alu_type_get_type_size(t); + + return pan_format_from_nir_base(base) | + pan_format_from_nir_size(base, size) | + MALI_NR_CHANNELS(4); +} + +/* For each fragment writeout instruction, generate a writeout loop to + * associate with it */ + +static void +mir_add_writeout_loops(compiler_context *ctx) +{ + for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) { + midgard_instruction *br = ctx->writeout_branch[rt]; + if (!br) continue; + + unsigned popped = br->branch.target_block; + midgard_block_add_successor(mir_get_block(ctx, popped - 1), ctx->current_block); + br->branch.target_block = emit_fragment_epilogue(ctx, rt); + + /* If we have more RTs, we'll need to restore back after our + * loop terminates */ + + if ((rt + 1) < ARRAY_SIZE(ctx->writeout_branch) && ctx->writeout_branch[rt + 1]) { + midgard_instruction uncond = v_branch(false, false); + uncond.branch.target_block = popped; + emit_mir_instruction(ctx, uncond); + midgard_block_add_successor(ctx->current_block, mir_get_block(ctx, popped)); + schedule_barrier(ctx); + } else { + /* We're last, so we can terminate here */ + br->last_writeout = true; + } + } +} + int -midgard_compile_shader_nir(struct midgard_screen *screen, nir_shader *nir, midgard_program *program, bool is_blend) +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb) { struct util_dynarray *compiled = &program->compiled; @@ -2506,10 +2624,11 @@ compiler_context *ctx = rzalloc(NULL, compiler_context); ctx->nir = nir; - ctx->screen = screen; ctx->stage = nir->info.stage; ctx->is_blend = is_blend; ctx->alpha_ref = program->alpha_ref; + ctx->blend_rt = blend_rt; + ctx->quirks = midgard_get_quirks(gpu_id); /* Start off with a safe cutoff, allowing usage of all 16 work * registers. Later, we'll promote uniform reads to uniform registers @@ -2534,6 +2653,7 @@ for (int c = 0; c < sz; ++c) { program->varyings[loc + c] = var->data.location + c; + program->varying_type[loc + c] = pan_format_from_glsl(var->type); max_varying = MAX2(max_varying, loc + c); } } @@ -2546,7 +2666,7 @@ if (ctx->stage == MESA_SHADER_VERTEX) { NIR_PASS_V(nir, nir_lower_viewport_transform); - NIR_PASS_V(nir, nir_clamp_psiz, 1.0, 1024.0); + NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0); } NIR_PASS_V(nir, nir_lower_var_copies); @@ -2561,7 +2681,7 @@ /* Optimisation passes */ - optimise_nir(nir); + optimise_nir(nir, ctx->quirks); if (midgard_debug & MIDGARD_DBG_SHADERS) { nir_print_shader(nir, stdout); @@ -2585,21 +2705,18 @@ ctx->func = func; emit_cf_list(ctx, &func->impl->body); - - /* Emit empty exit block with successor */ - - struct midgard_block *semi_end = ctx->current_block; - - struct midgard_block *end = - emit_block(ctx, func->impl->end_block); - - midgard_block_add_successor(semi_end, end); - break; /* TODO: Multi-function shaders */ } util_dynarray_init(compiled, NULL); + /* Per-block lowering before opts */ + + mir_foreach_block(ctx, block) { + inline_alu_constants(ctx, block); + midgard_opt_promote_fmov(ctx, block); + embedded_to_inline_constant(ctx, block); + } /* MIR-level optimizations */ bool progress = false; @@ -2616,6 +2733,9 @@ progress |= midgard_opt_not_propagate(ctx, block); progress |= midgard_opt_fuse_src_invert(ctx, block); progress |= midgard_opt_fuse_dest_invert(ctx, block); + progress |= midgard_opt_csel_invert(ctx, block); + progress |= midgard_opt_drop_cmp_invert(ctx, block); + progress |= midgard_opt_invert_branch(ctx, block); } } while (progress); @@ -2636,8 +2756,12 @@ assert(!ins->invert); } + if (ctx->stage == MESA_SHADER_FRAGMENT) + mir_add_writeout_loops(ctx); + /* Schedule! */ - schedule_program(ctx); + midgard_schedule_program(ctx); + mir_ra(ctx); /* Now that all the bundles are scheduled and we can calculate block * sizes, emit actual branch instructions rather than placeholders */ @@ -2651,13 +2775,12 @@ if (!midgard_is_branch_unit(ins->unit)) continue; - if (ins->prepacked_branch) continue; - /* Parse some basic branch info */ bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; bool is_conditional = ins->branch.conditional; bool is_inverted = ins->branch.invert_conditional; bool is_discard = ins->branch.target_type == TARGET_DISCARD; + bool is_writeout = ins->writeout; /* Determine the block we're jumping to */ int target_number = ins->branch.target_block; @@ -2707,6 +2830,7 @@ midgard_jmp_writeout_op op = is_discard ? midgard_jmp_writeout_op_discard : + is_writeout ? midgard_jmp_writeout_op_writeout : (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond : midgard_jmp_writeout_op_branch_cond; @@ -2771,22 +2895,14 @@ /* Midgard prefetches instruction types, so during emission we * need to lookahead. Unless this is the last instruction, in - * which we return 1. Or if this is the second to last and the - * last is an ALU, then it's also 1... */ + * which we return 1. */ mir_foreach_block(ctx, block) { mir_foreach_bundle_in_block(block, bundle) { int lookahead = 1; - if (current_bundle + 1 < bundle_count) { - uint8_t next = source_order_bundles[current_bundle + 1]->tag; - - if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) { - lookahead = 1; - } else { - lookahead = next; - } - } + if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) + lookahead = source_order_bundles[current_bundle + 1]->tag; emit_binary_bundle(ctx, bundle, compiled, lookahead); ++current_bundle; @@ -2809,22 +2925,19 @@ program->tls_size = ctx->tls_size; if (midgard_debug & MIDGARD_DBG_SHADERS) - disassemble_midgard(program->compiled.data, program->compiled.size, false, 0, ""); + disassemble_midgard(stdout, program->compiled.data, program->compiled.size, gpu_id, ctx->stage); - if (midgard_debug & MIDGARD_DBG_SHADERDB) { - unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0; + if (midgard_debug & MIDGARD_DBG_SHADERDB || shaderdb) { + unsigned nr_bundles = 0, nr_ins = 0; /* Count instructions and bundles */ - mir_foreach_instr_global(ctx, ins) { - nr_ins++; - } - mir_foreach_block(ctx, block) { nr_bundles += util_dynarray_num_elements( &block->bundles, midgard_bundle); - nr_quadwords += block->quadword_count; + mir_foreach_bundle_in_block(block, bun) + nr_ins += bun->instruction_count; } /* Calculate thread count. There are certain cutoffs by @@ -2842,10 +2955,10 @@ fprintf(stderr, "shader%d - %s shader: " "%u inst, %u bundles, %u quadwords, " "%u registers, %u threads, %u loops, " - "%d:%d spills:fills\n", + "%u:%u spills:fills\n", SHADER_DB_COUNT++, gl_shader_stage_name(ctx->stage), - nr_ins, nr_bundles, nr_quadwords, + nr_ins, nr_bundles, ctx->quadword_count, nr_registers, nr_threads, ctx->loop_count, ctx->spills, ctx->fills); diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_compile.h mesa-20.0.8/src/panfrost/midgard/midgard_compile.h --- mesa-19.2.8/src/panfrost/midgard/midgard_compile.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_compile.h 2020-06-12 01:21:18.000000000 +0000 @@ -26,27 +26,7 @@ #include "compiler/nir/nir.h" #include "util/u_dynarray.h" -#include "util/register_allocate.h" - -/* To be shoved inside panfrost_screen for the Gallium driver, or somewhere - * else for Vulkan/standalone. The single compiler "screen" to be shared across - * all shader compiles, used to store complex initialization (for instance, - * related to register allocation) */ - -struct midgard_screen { - /* Precomputed register allocation sets for varying numbers of work - * registers. The zeroeth entry corresponds to 8 work registers. The - * eighth entry corresponds to 16 work registers. NULL if this set has - * not been allocated yet. */ - - struct ra_regs *regs[9]; - - /* Work register classes corresponds to the above register sets. 20 per - * set for 4 classes per work/ldst/ldst27/texr/texw. TODO: Unify with - * compiler.h */ - - unsigned reg_classes[9][4 * 5]; -}; +#include "panfrost-job.h" /* Define the general compiler entry point */ @@ -68,7 +48,8 @@ PAN_SYSVAL_TEXTURE_SIZE = 3, PAN_SYSVAL_SSBO = 4, PAN_SYSVAL_NUM_WORK_GROUPS = 5, -} pan_sysval; + PAN_SYSVAL_SAMPLER = 7, +}; #define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) @@ -77,6 +58,15 @@ #define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) #define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) +/* Special attribute slots for vertex builtins. Sort of arbitrary but let's be + * consistent with the blob so we can compare traces easier. */ + +enum { + PAN_VERTEX_ID = 16, + PAN_INSTANCE_ID = 17, + PAN_MAX_ATTRIBUTE +}; + typedef struct { int work_register_count; int uniform_count; @@ -89,6 +79,7 @@ unsigned sysvals[MAX_SYSVAL_COUNT]; unsigned varyings[32]; + enum mali_format varying_type[32]; /* Boolean properties of the program */ bool writes_point_size; @@ -111,7 +102,7 @@ } midgard_program; int -midgard_compile_shader_nir(struct midgard_screen *screen, nir_shader *nir, midgard_program *program, bool is_blend); +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend, unsigned blend_rt, unsigned gpu_id, bool shaderdb); /* NIR options are shared between the standalone compiler and the online * compiler. Defining it here is the simplest, though maybe not the Right @@ -130,6 +121,7 @@ .lower_isign = true, .lower_fpow = true, .lower_find_lsb = true, + .lower_fdph = true, .lower_wpos_pntc = true, @@ -137,14 +129,27 @@ * eventually */ .lower_fsign = true, - .vertex_id_zero_based = true, .lower_extract_byte = true, .lower_extract_word = true, .lower_rotate = true, + .lower_pack_half_2x16 = true, + .lower_pack_half_2x16_split = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_pack_snorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_half_2x16_split = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_snorm_4x8 = true, + .lower_doubles_options = nir_lower_dmod, .vectorize_io = true, + .use_interpolated_input_intrinsics = true }; #endif diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_derivatives.c mesa-20.0.8/src/panfrost/midgard/midgard_derivatives.c --- mesa-19.2.8/src/panfrost/midgard/midgard_derivatives.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_derivatives.c 2020-06-12 01:21:18.000000000 +0000 @@ -72,12 +72,19 @@ * implicitly */ bool -mir_op_computes_derivatives(unsigned op) +mir_op_computes_derivatives(gl_shader_stage stage, unsigned op) { + /* Only fragment shaders may compute derivatives, but the sense of + * "normal" changes in vertex shaders on certain GPUs */ + + if (op == TEXTURE_OP_NORMAL && stage != MESA_SHADER_FRAGMENT) + return false; + switch (op) { case TEXTURE_OP_NORMAL: case TEXTURE_OP_DFDX: case TEXTURE_OP_DFDY: + assert(stage == MESA_SHADER_FRAGMENT); return true; default: return false; @@ -94,22 +101,20 @@ midgard_instruction ins = { .type = TAG_TEXTURE_4, .mask = mask_of(nr_components), - .ssa_args = { - .dest = nir_dest_index(ctx, &instr->dest.dest), - .src = { nir_alu_src_index(ctx, &instr->src[0]), -1, -1 }, - }, + .dest = nir_dest_index(ctx, &instr->dest.dest), + .src = { nir_alu_src_index(ctx, &instr->src[0]), ~0, ~0, ~0 }, .texture = { .op = mir_derivative_op(instr->op), .format = MALI_TEX_2D, - .swizzle = SWIZZLE_XYXX, - .in_reg_swizzle = SWIZZLE_XYXX, - .in_reg_full = 1, .out_full = 1, .sampler_type = MALI_SAMPLER_FLOAT, } }; + ins.swizzle[0][2] = ins.swizzle[0][3] = COMPONENT_X; + ins.swizzle[1][2] = ins.swizzle[1][3] = COMPONENT_X; + if (!instr->dest.dest.is_ssa) ins.mask &= instr->dest.write_mask; @@ -144,13 +149,14 @@ dup.mask &= 0b1100; /* Fixup swizzles */ - assert(ins->texture.swizzle == SWIZZLE_XYXX); - assert(ins->texture.in_reg_swizzle == SWIZZLE_XYXX); - dup.texture.swizzle = SWIZZLE_XXXY; - dup.texture.in_reg_swizzle = SWIZZLE_ZWWW; + dup.swizzle[0][0] = dup.swizzle[0][1] = dup.swizzle[0][2] = COMPONENT_X; + dup.swizzle[0][3] = COMPONENT_Y; + + dup.swizzle[1][0] = COMPONENT_Z; + dup.swizzle[1][1] = dup.swizzle[1][2] = dup.swizzle[1][3] = COMPONENT_W; /* Insert the new instruction */ - mir_insert_instruction_before(mir_next_op(ins), dup); + mir_insert_instruction_before(ctx, mir_next_op(ins), dup); /* TODO: Set .cont/.last automatically via dataflow analysis */ ctx->texture_op_count++; @@ -159,6 +165,6 @@ * rewrite to use a register */ unsigned new = make_compiler_temp_reg(ctx); - mir_rewrite_index(ctx, ins->ssa_args.dest, new); + mir_rewrite_index(ctx, ins->dest, new); } } diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_emit.c mesa-20.0.8/src/panfrost/midgard/midgard_emit.c --- mesa-19.2.8/src/panfrost/midgard/midgard_emit.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_emit.c 2020-06-12 01:21:18.000000000 +0000 @@ -42,16 +42,13 @@ static unsigned vector_to_scalar_source(unsigned u, bool is_int, bool is_full, - unsigned masked_component) + unsigned component) { midgard_vector_alu_src v; memcpy(&v, &u, sizeof(v)); /* TODO: Integers */ - unsigned component = (v.swizzle >> (2*masked_component)) & 3; - bool upper = false; /* TODO */ - midgard_scalar_alu_src s = { 0 }; if (is_full) { @@ -70,7 +67,7 @@ if (s.full) s.component = component << 1; else - s.component = component + (upper << 2); + s.component = component; if (is_int) { /* TODO */ @@ -90,15 +87,15 @@ { bool is_int = midgard_is_integer_op(v.op); bool is_full = v.reg_mode == midgard_reg_mode_32; - bool is_inline_constant = ins->ssa_args.inline_constant; + bool is_inline_constant = ins->has_inline_constant; unsigned comp = component_from_mask(ins->mask); /* The output component is from the mask */ midgard_scalar_alu s = { .op = v.op, - .src1 = vector_to_scalar_source(v.src1, is_int, is_full, comp), - .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full, comp) : 0, + .src1 = vector_to_scalar_source(v.src1, is_int, is_full, ins->swizzle[0][comp]), + .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full, ins->swizzle[1][comp]) : 0, .unknown = 0, .outmod = v.outmod, .output_full = is_full, @@ -114,7 +111,7 @@ /* Inline constant is passed along rather than trying to extract it * from v */ - if (ins->ssa_args.inline_constant) { + if (ins->has_inline_constant) { uint16_t imm = 0; int lower_11 = ins->inline_constant & ((1 << 12) - 1); imm |= (lower_11 >> 9) & 3; @@ -128,6 +125,192 @@ return s; } +/* 64-bit swizzles are super easy since there are 2 components of 2 components + * in an 8-bit field ... lots of duplication to go around! + * + * Swizzles of 32-bit vectors accessed from 64-bit instructions are a little + * funny -- pack them *as if* they were native 64-bit, using rep_* flags to + * flag upper. For instance, xy would become 64-bit XY but that's just xyzw + * native. Likewise, zz would become 64-bit XX with rep* so it would be xyxy + * with rep. Pretty nifty, huh? */ + +static unsigned +mir_pack_swizzle_64(unsigned *swizzle, unsigned max_component) +{ + unsigned packed = 0; + + for (unsigned i = 0; i < 2; ++i) { + assert(swizzle[i] <= max_component); + + unsigned a = (swizzle[i] & 1) ? + (COMPONENT_W << 2) | COMPONENT_Z : + (COMPONENT_Y << 2) | COMPONENT_X; + + packed |= a << (i * 4); + } + + return packed; +} + +static void +mir_pack_mask_alu(midgard_instruction *ins) +{ + unsigned effective = ins->mask; + + /* If we have a destination override, we need to figure out whether to + * override to the lower or upper half, shifting the effective mask in + * the latter, so AAAA.... becomes AAAA */ + + unsigned upper_shift = mir_upper_override(ins); + + if (upper_shift) { + effective >>= upper_shift; + ins->alu.dest_override = midgard_dest_override_upper; + } + + if (ins->alu.reg_mode == midgard_reg_mode_32) + ins->alu.mask = expand_writemask(effective, 4); + else if (ins->alu.reg_mode == midgard_reg_mode_64) + ins->alu.mask = expand_writemask(effective, 2); + else + ins->alu.mask = effective; +} + +static void +mir_pack_swizzle_alu(midgard_instruction *ins) +{ + midgard_vector_alu_src src[] = { + vector_alu_from_unsigned(ins->alu.src1), + vector_alu_from_unsigned(ins->alu.src2) + }; + + for (unsigned i = 0; i < 2; ++i) { + unsigned packed = 0; + + if (ins->alu.reg_mode == midgard_reg_mode_64) { + midgard_reg_mode mode = mir_srcsize(ins, i); + unsigned components = 16 / mir_bytes_for_mode(mode); + + packed = mir_pack_swizzle_64(ins->swizzle[i], components); + + if (mode == midgard_reg_mode_32) { + src[i].rep_low |= (ins->swizzle[i][0] >= COMPONENT_Z); + src[i].rep_high |= (ins->swizzle[i][1] >= COMPONENT_Z); + } else if (mode < midgard_reg_mode_32) { + unreachable("Cannot encode 8/16 swizzle in 64-bit"); + } + } else { + /* For 32-bit, swizzle packing is stupid-simple. For 16-bit, + * the strategy is to check whether the nibble we're on is + * upper or lower. We need all components to be on the same + * "side"; that much is enforced by the ISA and should have + * been lowered. TODO: 8-bit packing. TODO: vec8 */ + + unsigned first = ins->mask ? ffs(ins->mask) - 1 : 0; + bool upper = ins->swizzle[i][first] > 3; + + if (upper && ins->mask) + assert(mir_srcsize(ins, i) <= midgard_reg_mode_16); + + for (unsigned c = 0; c < 4; ++c) { + unsigned v = ins->swizzle[i][c]; + + bool t_upper = v > 3; + + /* Ensure we're doing something sane */ + + if (ins->mask & (1 << c)) { + assert(t_upper == upper); + assert(v <= 7); + } + + /* Use the non upper part */ + v &= 0x3; + + packed |= v << (2 * c); + } + + src[i].rep_high = upper; + } + + src[i].swizzle = packed; + } + + ins->alu.src1 = vector_alu_srco_unsigned(src[0]); + + if (!ins->has_inline_constant) + ins->alu.src2 = vector_alu_srco_unsigned(src[1]); +} + +static void +mir_pack_swizzle_ldst(midgard_instruction *ins) +{ + /* TODO: non-32-bit, non-vec4 */ + for (unsigned c = 0; c < 4; ++c) { + unsigned v = ins->swizzle[0][c]; + + /* Check vec4 */ + assert(v <= 3); + + ins->load_store.swizzle |= v << (2 * c); + } + + /* TODO: arg_1/2 */ +} + +static void +mir_pack_swizzle_tex(midgard_instruction *ins) +{ + for (unsigned i = 0; i < 2; ++i) { + unsigned packed = 0; + + for (unsigned c = 0; c < 4; ++c) { + unsigned v = ins->swizzle[i][c]; + + /* Check vec4 */ + assert(v <= 3); + + packed |= v << (2 * c); + } + + if (i == 0) + ins->texture.swizzle = packed; + else + ins->texture.in_reg_swizzle = packed; + } + + /* TODO: bias component */ +} + +/* Load store masks are 4-bits. Load/store ops pack for that. vec4 is the + * natural mask width; vec8 is constrained to be in pairs, vec2 is duplicated. TODO: 8-bit? + */ + +static void +mir_pack_ldst_mask(midgard_instruction *ins) +{ + midgard_reg_mode mode = mir_typesize(ins); + unsigned packed = ins->mask; + + if (mode == midgard_reg_mode_64) { + packed = ((ins->mask & 0x2) ? (0x8 | 0x4) : 0) | + ((ins->mask & 0x1) ? (0x2 | 0x1) : 0); + } else if (mode == midgard_reg_mode_16) { + packed = 0; + + for (unsigned i = 0; i < 4; ++i) { + /* Make sure we're duplicated */ + bool u = (ins->mask & (1 << (2*i + 0))) != 0; + bool v = (ins->mask & (1 << (2*i + 1))) != 0; + assert(u == v); + + packed |= (u << i); + } + } + + ins->load_store.mask = packed; +} + static void emit_alu_bundle(compiler_context *ctx, midgard_bundle *bundle, @@ -142,7 +325,7 @@ midgard_instruction *ins = bundle->instructions[i]; /* Check if this instruction has registers */ - if (ins->compact_branch || ins->prepacked_branch) continue; + if (ins->compact_branch) continue; /* Otherwise, just emit the registers */ uint16_t reg_word = 0; @@ -162,11 +345,8 @@ midgard_scalar_alu scalarized; if (ins->unit & UNITS_ANY_VECTOR) { - if (ins->alu.reg_mode == midgard_reg_mode_32) - ins->alu.mask = expand_writemask_32(ins->mask); - else - ins->alu.mask = ins->mask; - + mir_pack_mask_alu(ins); + mir_pack_swizzle_alu(ins); size = sizeof(midgard_vector_alu); source = &ins->alu; } else if (ins->unit == ALU_ENAB_BR_COMPACT) { @@ -189,12 +369,21 @@ /* Tack on constants */ - if (bundle->has_embedded_constants) { - util_dynarray_append(emission, float, bundle->constants[0]); - util_dynarray_append(emission, float, bundle->constants[1]); - util_dynarray_append(emission, float, bundle->constants[2]); - util_dynarray_append(emission, float, bundle->constants[3]); - } + if (bundle->has_embedded_constants) + util_dynarray_append(emission, midgard_constants, bundle->constants); +} + +/* Shift applied to the immediate used as an offset. Probably this is papering + * over some other semantic distinction else well, but it unifies things in the + * compiler so I don't mind. */ + +static unsigned +mir_ldst_imm_shift(midgard_load_store_op op) +{ + if (OP_IS_UBO_READ(op)) + return 3; + else + return 1; } /* After everything is scheduled, emit whole bundles at a time */ @@ -212,6 +401,10 @@ case TAG_ALU_8: case TAG_ALU_12: case TAG_ALU_16: + case TAG_ALU_4 + 4: + case TAG_ALU_8 + 4: + case TAG_ALU_12 + 4: + case TAG_ALU_16 + 4: emit_alu_bundle(ctx, bundle, emission, lookahead); break; @@ -223,8 +416,20 @@ /* Copy masks */ for (unsigned i = 0; i < bundle->instruction_count; ++i) { - bundle->instructions[i]->load_store.mask = - bundle->instructions[i]->mask; + mir_pack_ldst_mask(bundle->instructions[i]); + + mir_pack_swizzle_ldst(bundle->instructions[i]); + + /* Apply a constant offset */ + unsigned offset = bundle->instructions[i]->constants.u32[0]; + + if (offset) { + unsigned shift = mir_ldst_imm_shift(bundle->instructions[i]->load_store.op); + unsigned upper_shift = 10 - shift; + + bundle->instructions[i]->load_store.varying_parameters |= (offset & ((1 << upper_shift) - 1)) << shift; + bundle->instructions[i]->load_store.address |= (offset >> upper_shift); + } } memcpy(¤t64, &bundle->instructions[0]->load_store, sizeof(current64)); @@ -255,10 +460,11 @@ ins->texture.type = bundle->tag; ins->texture.next_type = next_tag; ins->texture.mask = ins->mask; + mir_pack_swizzle_tex(ins); ctx->texture_op_count--; - if (mir_op_computes_derivatives(ins->texture.op)) { + if (mir_op_computes_derivatives(ctx->stage, ins->texture.op)) { bool continues = ctx->texture_op_count > 0; /* Control flow complicates helper invocation diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_errata_lod.c mesa-20.0.8/src/panfrost/midgard/midgard_errata_lod.c --- mesa-19.2.8/src/panfrost/midgard/midgard_errata_lod.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_errata_lod.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" + +void midgard_nir_lod_errata(nir_shader *shader); + +/* Workarounds errata pertaining to early Midgard chips where the settings for + * min_lod/max_lod/lod_bias are ignored in the sampler descriptor when + * texturing with a textureLod instruction. The workaround is to load these + * constants in as system values and perform the bias/clamp in the shader. + */ + +static void +mir_lod_errata_body(nir_builder *b, nir_tex_instr *tex) +{ + /* The errata only applies to textureLod ("TEXGRD") */ + if (tex->op != nir_texop_txl) + return; + + /* Let's grab the sampler parameters */ + nir_intrinsic_instr *l = nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_sampler_lod_parameters_pan); + l->num_components = 3; + nir_ssa_dest_init(&l->instr, &l->dest, 3, 32, NULL); + + /* TODO: Indirect samplers, separate sampler objects XXX */ + nir_src idx = nir_src_for_ssa(nir_imm_int(b, tex->texture_index)); + nir_src_copy(&l->src[0], &idx, l); + + nir_builder_instr_insert(b, &l->instr); + nir_ssa_def *params = &l->dest.ssa; + + /* Extract the individual components */ + nir_ssa_def *min_lod = nir_channel(b, params, 0); + nir_ssa_def *max_lod = nir_channel(b, params, 1); + nir_ssa_def *lod_bias = nir_channel(b, params, 2); + + /* Rewrite the LOD with bias/clamps. Order sensitive. */ + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type != nir_tex_src_lod) + continue; + + nir_ssa_def *lod = nir_ssa_for_src(b, tex->src[i].src, 1); + + nir_ssa_def *biased = nir_fadd(b, lod, lod_bias); + nir_ssa_def *clamped = nir_fmin(b, + nir_fmax(b, biased, min_lod), max_lod); + + nir_instr_rewrite_src(&tex->instr, &tex->src[i].src, + nir_src_for_ssa(clamped)); + } +} + +void +midgard_nir_lod_errata(nir_shader *shader) +{ + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_tex) continue; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + b.cursor = nir_before_instr(instr); + mir_lod_errata_body(&b, tex); + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); + + } +} diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard.h mesa-20.0.8/src/panfrost/midgard/midgard.h --- mesa-19.2.8/src/panfrost/midgard/midgard.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard.h 2020-06-12 01:21:18.000000000 +0000 @@ -391,6 +391,12 @@ typedef enum { midgard_op_ld_st_noop = 0x03, + /* Unpack a colour from a native format to fp16 */ + midgard_op_unpack_colour = 0x05, + + /* Packs a colour from fp16 to a native format */ + midgard_op_pack_colour = 0x09, + /* Unclear why this is on the L/S unit, but moves fp32 cube map * coordinates in r27 to its cube map texture coordinate destination * (e.g r29). */ @@ -409,16 +415,26 @@ /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */ midgard_op_atomic_add = 0x40, + midgard_op_atomic_add64 = 0x41, + midgard_op_atomic_and = 0x44, + midgard_op_atomic_and64 = 0x45, midgard_op_atomic_or = 0x48, + midgard_op_atomic_or64 = 0x49, midgard_op_atomic_xor = 0x4C, + midgard_op_atomic_xor64 = 0x4D, midgard_op_atomic_imin = 0x50, + midgard_op_atomic_imin64 = 0x51, midgard_op_atomic_umin = 0x54, + midgard_op_atomic_umin64 = 0x55, midgard_op_atomic_imax = 0x58, + midgard_op_atomic_imax64 = 0x59, midgard_op_atomic_umax = 0x5C, + midgard_op_atomic_umax64 = 0x5D, midgard_op_atomic_xchg = 0x60, + midgard_op_atomic_xchg64 = 0x61, /* Used for compute shader's __global arguments, __local variables (or * for register spilling) */ @@ -438,7 +454,9 @@ midgard_op_ld_vary_16 = 0x99, midgard_op_ld_vary_32u = 0x9A, midgard_op_ld_vary_32i = 0x9B, - midgard_op_ld_color_buffer_16 = 0x9D, + + /* Old version of midgard_op_ld_color_buffer_u8_as_fp16, for T720 */ + midgard_op_ld_color_buffer_u8_as_fp16_old = 0x9D, /* The distinction between these ops is the alignment requirement / * accompanying shift. Thus, the offset to ld_ubo_int4 is in 16-byte @@ -454,7 +472,9 @@ midgard_op_ld_ubo_short4 = 0xAC, midgard_op_ld_ubo_int4 = 0xB0, - midgard_op_ld_color_buffer_8 = 0xBA, + /* New-style blending ops. Works on T760/T860 */ + midgard_op_ld_color_buffer_u8_as_fp16 = 0xB9, + midgard_op_ld_color_buffer_32u = 0xBA, midgard_op_st_char = 0xC0, midgard_op_st_char2 = 0xC4, /* short */ @@ -592,6 +612,9 @@ #define TEXTURE_OP_LOD 0x12 /* textureLod */ #define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */ +/* Implements barrier() */ +#define TEXTURE_OP_BARRIER 0x0B + /* Computes horizontal and vertical derivatives respectively. Use with a float * sampler and a "2D" texture. Leave texture/sampler IDs as zero; they ought * to be ignored. Only works for fp32 on 64-bit at a time, so derivatives of a @@ -628,7 +651,13 @@ unsigned last : 1; enum mali_texture_type format : 2; - unsigned zero : 2; + + /* Are sampler_handle/texture_handler respectively set by registers? If + * true, the lower 8-bits of the respective field is a register word. + * If false, they are an immediate */ + + unsigned sampler_register : 1; + unsigned texture_register : 1; /* Is a register used to specify the * LOD/bias/offset? If set, use the `bias` field as @@ -671,16 +700,30 @@ /* In immediate mode, each offset field is an immediate range [0, 7]. * * In register mode, offset_x becomes a register full / select / upper - * triplet and a vec3 swizzle is splattered across offset_y/offset_z in - * a genuinely bizarre way. + * triplet followed by a vec3 swizzle is splattered across + * offset_y/offset_z in a genuinely bizarre way. * * For texel fetches in immediate mode, the range is the full [-8, 7], * but for normal texturing the top bit must be zero and a register - * used instead. It's not clear where this limitation is from. */ + * used instead. It's not clear where this limitation is from. + * + * union { + * struct { + * signed offset_x : 4; + * signed offset_y : 4; + * signed offset_z : 4; + * } immediate; + * struct { + * bool full : 1; + * bool select : 1; + * bool upper : 1; + * unsigned swizzle : 8; + * unsigned zero : 1; + * } register; + * } + */ - signed offset_x : 4; - signed offset_y : 4; - signed offset_z : 4; + unsigned offset : 12; /* In immediate bias mode, for a normal texture op, this is * texture bias, computed as int(2^8 * frac(biasf)), with @@ -693,9 +736,54 @@ unsigned bias : 8; signed bias_int : 8; + /* If sampler/texture_register is set, the bottom 8-bits are + * midgard_tex_register_select and the top 8-bits are zero. If they are + * clear, they are immediate texture indices */ + unsigned sampler_handle : 16; unsigned texture_handle : 16; } midgard_texture_word; +/* Technically barriers are texture instructions but it's less work to add them + * as an explicitly zeroed special case, since most fields are forced to go to + * zero */ + +typedef struct +__attribute__((__packed__)) +{ + unsigned type : 4; + unsigned next_type : 4; + + /* op = TEXTURE_OP_BARRIER */ + unsigned op : 6; + unsigned zero1 : 2; + + /* Since helper invocations don't make any sense, these are forced to one */ + unsigned cont : 1; + unsigned last : 1; + unsigned zero2 : 14; + + unsigned zero3 : 24; + unsigned unknown4 : 1; + unsigned zero4 : 7; + + uint64_t zero5; +} midgard_texture_barrier_word; + +typedef union midgard_constants { + double f64[2]; + uint64_t u64[2]; + int64_t i64[2]; + float f32[4]; + uint32_t u32[4]; + int32_t i32[4]; + uint16_t f16[8]; + uint16_t u16[8]; + int16_t i16[8]; + uint8_t u8[16]; + int8_t i8[16]; +} +midgard_constants; + #endif diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_liveness.c mesa-20.0.8/src/panfrost/midgard/midgard_liveness.c --- mesa-19.2.8/src/panfrost/midgard/midgard_liveness.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_liveness.c 2020-06-12 01:21:18.000000000 +0000 @@ -21,88 +21,195 @@ * SOFTWARE. */ -/* mir_is_live_after performs liveness analysis on the MIR, used primarily - * as part of register allocation. TODO: Algorithmic improvements for - * compiler performance (this is the worst algorithm possible -- see - * backlog with Connor on IRC) */ - #include "compiler.h" +#include "util/u_memory.h" -/* Determine if a variable is live in the successors of a block */ -static bool -is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src) +/* Routines for liveness analysis. Liveness is tracked per byte per node. Per + * byte granularity is necessary for proper handling of int8 */ + +static void +liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - for (unsigned i = 0; i < bl->nr_successors; ++i) { - midgard_block *succ = bl->successors[i]; + if (node >= max) + return; - /* If we already visited, the value we're seeking - * isn't down this path (or we would have short - * circuited */ + live[node] |= mask; +} - if (succ->visited) continue; +static void +liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask) +{ + if (node >= max) + return; - /* Otherwise (it's visited *now*), check the block */ + live[node] &= ~mask; +} - succ->visited = true; +static bool +liveness_get(uint16_t *live, unsigned node, uint16_t max) { + if (node >= max) + return false; - /* Within this block, check if it's overwritten first */ - unsigned overwritten_mask = 0; + return live[node]; +} - mir_foreach_instr_in_block(succ, ins) { - /* Did we read any components that we haven't overwritten yet? */ - if (mir_mask_of_read_components(ins, src) & ~overwritten_mask) - return true; +/* Updates live_in for a single instruction */ - /* If written-before-use, we're gone */ +void +mir_liveness_ins_update(uint16_t *live, midgard_instruction *ins, unsigned max) +{ + /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ - if (ins->ssa_args.dest == src) - overwritten_mask |= ins->mask; - } + liveness_kill(live, ins->dest, max, mir_bytemask(ins)); - /* ...and also, check *its* successors */ - if (is_live_after_successors(ctx, succ, src)) - return true; + mir_foreach_src(ins, src) { + unsigned node = ins->src[src]; + unsigned bytemask = mir_bytemask_of_read_components(ins, node); + liveness_gen(live, node, max, bytemask); } +} - /* Welp. We're really not live. */ +/* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ - return false; +static void +liveness_block_live_out(compiler_context *ctx, midgard_block *blk) +{ + mir_foreach_successor(blk, succ) { + for (unsigned i = 0; i < ctx->temp_count; ++i) + blk->live_out[i] |= succ->live_in[i]; + } } -bool -mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) +/* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, + * we compute live_out from live_in. The intrablock pass is linear-time. It + * returns whether progress was made. */ + +static bool +liveness_block_update(compiler_context *ctx, midgard_block *blk) { - /* Check the rest of the block for liveness */ + bool progress = false; - mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { - if (mir_has_arg(ins, src)) - return true; + liveness_block_live_out(ctx, blk); + + uint16_t *live = ralloc_array(ctx, uint16_t, ctx->temp_count); + memcpy(live, blk->live_out, ctx->temp_count * sizeof(uint16_t)); + + mir_foreach_instr_in_block_rev(blk, ins) + mir_liveness_ins_update(live, ins, ctx->temp_count); + + /* To figure out progress, diff live_in */ + + for (unsigned i = 0; (i < ctx->temp_count) && !progress; ++i) + progress |= (blk->live_in[i] != live[i]); + + ralloc_free(blk->live_in); + blk->live_in = live; + + return progress; +} + +/* Globally, liveness analysis uses a fixed-point algorithm based on a + * worklist. We initialize a work list with the exit block. We iterate the work + * list to compute live_in from live_out for each block on the work list, + * adding the predecessors of the block to the work list if we made progress. + */ + +void +mir_compute_liveness(compiler_context *ctx) +{ + /* If we already have fresh liveness, nothing to do */ + if (ctx->metadata & MIDGARD_METADATA_LIVENESS) + return; + + mir_compute_temp_count(ctx); + + /* List of midgard_block */ + struct set *work_list = _mesa_set_create(ctx, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + /* Allocate */ + + mir_foreach_block(ctx, block) { + block->live_in = rzalloc_array(ctx, uint16_t, ctx->temp_count); + block->live_out = rzalloc_array(ctx, uint16_t, ctx->temp_count); } - /* Check the rest of the blocks for liveness recursively */ + /* Initialize the work list with the exit block */ + struct set_entry *cur; + + midgard_block *exit = mir_exit_block(ctx); + cur = _mesa_set_add(work_list, exit); + + /* Iterate the work list */ + + do { + /* Pop off a block */ + midgard_block *blk = (struct midgard_block *) cur->key; + _mesa_set_remove(work_list, cur); - bool succ = is_live_after_successors(ctx, block, src); + /* Update its liveness information */ + bool progress = liveness_block_update(ctx, blk); + + /* If we made progress, we need to process the predecessors */ + + if (progress || !blk->visited) { + mir_foreach_predecessor(blk, pred) + _mesa_set_add(work_list, pred); + } + + blk->visited = true; + } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + + /* Liveness is now valid */ + ctx->metadata |= MIDGARD_METADATA_LIVENESS; mir_foreach_block(ctx, block) { block->visited = false; } - - return succ; } -/* Just a quick check -- is it written more than once? (I.e. are we definitely - * not SSA?) */ +/* Once liveness data is no longer valid, call this */ + +void +mir_invalidate_liveness(compiler_context *ctx) +{ + /* If we didn't already compute liveness, there's nothing to do */ + if (!(ctx->metadata & MIDGARD_METADATA_LIVENESS)) + return; + + /* It's now invalid regardless */ + ctx->metadata &= ~MIDGARD_METADATA_LIVENESS; + + mir_foreach_block(ctx, block) { + if (block->live_in) + ralloc_free(block->live_in); + + if (block->live_out) + ralloc_free(block->live_out); + + block->live_in = NULL; + block->live_out = NULL; + } +} bool -mir_has_multiple_writes(compiler_context *ctx, int dest) +mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) { - unsigned write_count = 0; + mir_compute_liveness(ctx); - mir_foreach_instr_global(ctx, ins) { - if (ins->ssa_args.dest == dest) - write_count++; + /* Check whether we're live in the successors */ + + if (liveness_get(block->live_out, src, ctx->temp_count)) + return true; + + /* Check the rest of the block for liveness */ + + mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { + if (mir_has_arg(ins, src)) + return true; } - return write_count > 1; + return false; } diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_nir_algebraic.py mesa-20.0.8/src/panfrost/midgard/midgard_nir_algebraic.py --- mesa-19.2.8/src/panfrost/midgard/midgard_nir_algebraic.py 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_nir_algebraic.py 2020-06-12 01:21:18.000000000 +0000 @@ -48,21 +48,40 @@ # Midgard is able to type convert down by only one "step" per instruction; if # NIR wants more than one step, we need to break up into multiple instructions -converts = [ - (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))), - (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))), +converts = [] - (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))), - (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))), - - (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))), - (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))), - - # Totally redundant - (('~f2f16', ('f2f32', 'a@16')), a), - - (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))), -] +for op in ('u2u', 'i2i', 'f2f', 'i2f', 'u2f', 'f2i', 'f2u'): + srcsz_max = 64 + dstsz_max = 64 + # 8 bit float doesn't exist + srcsz_min = 8 if op[0] != 'f' else 16 + dstsz_min = 8 if op[2] != 'f' else 16 + dstsz = dstsz_min + # Iterate over all possible destination and source sizes + while dstsz <= dstsz_max: + srcsz = srcsz_min + while srcsz <= srcsz_max: + # Size converter lowering is only needed if src and dst sizes are + # spaced by a factor > 2. + # Type converter lowering is needed as soon as src_size != dst_size + if srcsz != dstsz and ((srcsz * 2 != dstsz and srcsz != dstsz * 2) or op[0] != op[2]): + cursz = srcsz + rule = a + # When converting down we first do the type conversion followed + # by one or more size conversions. When converting up, we do + # the type conversion at the end. This way we don't have to + # deal with the fact that f2f8 doesn't exists. + sizeconvop = op[0] + '2' + op[0] if srcsz < dstsz else op[2] + '2' + op[2] + if srcsz > dstsz and op[0] != op[2]: + rule = (op + str(int(cursz)), rule) + while cursz != dstsz: + cursz = cursz / 2 if dstsz < srcsz else cursz * 2 + rule = (sizeconvop + str(int(cursz)), rule) + if srcsz < dstsz and op[0] != op[2]: + rule = (op + str(int(cursz)), rule) + converts += [((op + str(int(dstsz)), 'a@' + str(int(srcsz))), rule)] + srcsz *= 2 + dstsz *= 2 # Midgard scales fsin/fcos arguments by pi. # Pass must be run only once, after the main loop diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_ops.c mesa-20.0.8/src/panfrost/midgard/midgard_ops.c --- mesa-19.2.8/src/panfrost/midgard/midgard_ops.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_ops.c 2020-06-12 01:21:18.000000000 +0000 @@ -54,6 +54,9 @@ [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD}, [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD}, [midgard_alu_op_fceil] = {"fceil", UNITS_ADD}, + + /* Multiplies the X/Y components of the first arg and adds the second + * arg. Like other LUTs, it must be scalarized. */ [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT}, /* Though they output a scalar, they need to run on a vector unit @@ -169,60 +172,126 @@ [midgard_alu_op_freduce] = {"freduce", 0}, }; -const char *load_store_opcode_names[256] = { - [midgard_op_ld_cubemap_coords] = "ld_cubemap_coords", - [midgard_op_ld_compute_id] = "ld_compute_id", - [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z", - [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w", - - [midgard_op_atomic_add] = "atomic_add", - [midgard_op_atomic_and] = "atomic_and", - [midgard_op_atomic_or] = "atomic_or", - [midgard_op_atomic_xor] = "atomic_xor", - [midgard_op_atomic_imin] = "atomic_imin", - [midgard_op_atomic_umin] = "atomic_umin", - [midgard_op_atomic_imax] = "atomic_imax", - [midgard_op_atomic_umax] = "atomic_umax", - [midgard_op_atomic_xchg] = "atomic_xchg", - - [midgard_op_ld_char] = "ld_char", - [midgard_op_ld_char2] = "ld_char2", - [midgard_op_ld_short] = "ld_short", - [midgard_op_ld_char4] = "ld_char4", - [midgard_op_ld_short4] = "ld_short4", - [midgard_op_ld_int4] = "ld_int4", - - [midgard_op_ld_attr_32] = "ld_attr_32", - [midgard_op_ld_attr_16] = "ld_attr_16", - [midgard_op_ld_attr_32i] = "ld_attr_32i", - [midgard_op_ld_attr_32u] = "ld_attr_32u", - - [midgard_op_ld_vary_32] = "ld_vary_32", - [midgard_op_ld_vary_16] = "ld_vary_16", - [midgard_op_ld_vary_32i] = "ld_vary_32i", - [midgard_op_ld_vary_32u] = "ld_vary_32u", - - [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8", - [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16", - - [midgard_op_ld_ubo_char] = "ld_ubo_char", - [midgard_op_ld_ubo_char2] = "ld_ubo_char2", - [midgard_op_ld_ubo_char4] = "ld_ubo_char4", - [midgard_op_ld_ubo_short4] = "ld_ubo_short4", - [midgard_op_ld_ubo_int4] = "ld_ubo_int4", - - [midgard_op_st_char] = "st_char", - [midgard_op_st_char2] = "st_char2", - [midgard_op_st_char4] = "st_char4", - [midgard_op_st_short4] = "st_short4", - [midgard_op_st_int4] = "st_int4", - - [midgard_op_st_vary_32] = "st_vary_32", - [midgard_op_st_vary_16] = "st_vary_16", - [midgard_op_st_vary_32i] = "st_vary_32i", - [midgard_op_st_vary_32u] = "st_vary_32u", - - [midgard_op_st_image_f] = "st_image_f", - [midgard_op_st_image_ui] = "st_image_ui", - [midgard_op_st_image_i] = "st_image_i", +/* Define shorthands */ + +#define M8 midgard_reg_mode_8 +#define M16 midgard_reg_mode_16 +#define M32 midgard_reg_mode_32 +#define M64 midgard_reg_mode_64 + +struct mir_ldst_op_props load_store_opcode_props[256] = { + [midgard_op_unpack_colour] = {"unpack_colour", M32}, + [midgard_op_pack_colour] = {"pack_colour", M32}, + [midgard_op_ld_cubemap_coords] = {"ld_cubemap_coords", M32}, + [midgard_op_ld_compute_id] = {"ld_compute_id", M32}, + [midgard_op_ldst_perspective_division_z] = {"ldst_perspective_division_z", M32}, + [midgard_op_ldst_perspective_division_w] = {"ldst_perspective_division_w", M32}, + + [midgard_op_atomic_add] = {"atomic_add", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_and] = {"atomic_and", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_or] = {"atomic_or", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_xor] = {"atomic_xor", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_imin] = {"atomic_imin", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_umin] = {"atomic_umin", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_imax] = {"atomic_imax", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_umax] = {"atomic_umax", M32 | LDST_SIDE_FX}, + [midgard_op_atomic_xchg] = {"atomic_xchg", M32 | LDST_SIDE_FX}, + + [midgard_op_atomic_add64] = {"atomic_add64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_and64] = {"atomic_and64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_or64] = {"atomic_or64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_xor64] = {"atomic_xor64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_imin64] = {"atomic_imin64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_umin64] = {"atomic_umin64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_imax64] = {"atomic_imax64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_umax64] = {"atomic_umax64", M64 | LDST_SIDE_FX}, + [midgard_op_atomic_xchg64] = {"atomic_xchg64", M64 | LDST_SIDE_FX}, + + [midgard_op_ld_char] = {"ld_char", M32}, + [midgard_op_ld_char2] = {"ld_char2", M16}, + [midgard_op_ld_short] = {"ld_short", M32}, + [midgard_op_ld_char4] = {"ld_char4", M32}, + [midgard_op_ld_short4] = {"ld_short4", M32}, + [midgard_op_ld_int4] = {"ld_int4", M32}, + + [midgard_op_ld_attr_32] = {"ld_attr_32", M32}, + [midgard_op_ld_attr_32i] = {"ld_attr_32i", M32}, + [midgard_op_ld_attr_32u] = {"ld_attr_32u", M32}, + [midgard_op_ld_attr_16] = {"ld_attr_16", M32}, + + [midgard_op_ld_vary_32] = {"ld_vary_32", M32}, + [midgard_op_ld_vary_16] = {"ld_vary_16", M32}, + [midgard_op_ld_vary_32i] = {"ld_vary_32i", M32}, + [midgard_op_ld_vary_32u] = {"ld_vary_32u", M32}, + + [midgard_op_ld_color_buffer_32u] = {"ld_color_buffer_32u", M32 | LDST_SPECIAL_MASK}, + [midgard_op_ld_color_buffer_u8_as_fp16] = {"ld_color_buffer_u8_as_fp16", M16 | LDST_SPECIAL_MASK}, + [midgard_op_ld_color_buffer_u8_as_fp16_old] = {"ld_color_buffer_u8_as_fp16_old", M16 | LDST_SPECIAL_MASK}, + + [midgard_op_ld_ubo_char] = {"ld_ubo_char", M32}, + [midgard_op_ld_ubo_char2] = {"ld_ubo_char2", M16}, + [midgard_op_ld_ubo_char4] = {"ld_ubo_char4", M32}, + [midgard_op_ld_ubo_short4] = {"ld_ubo_short4", M32}, + [midgard_op_ld_ubo_int4] = {"ld_ubo_int4", M32}, + + [midgard_op_st_char] = {"st_char", M32 | LDST_STORE}, + [midgard_op_st_char2] = {"st_char2", M16 | LDST_STORE}, + [midgard_op_st_char4] = {"st_char4", M32 | LDST_STORE}, + [midgard_op_st_short4] = {"st_short4", M32 | LDST_STORE}, + [midgard_op_st_int4] = {"st_int4", M32 | LDST_STORE}, + + [midgard_op_st_vary_32] = {"st_vary_32", M32 | LDST_STORE}, + [midgard_op_st_vary_32i] = {"st_vary_32i", M32 | LDST_STORE}, + [midgard_op_st_vary_32u] = {"st_vary_32u", M32 | LDST_STORE}, + [midgard_op_st_vary_16] = {"st_vary_16", M16 | LDST_STORE}, + + [midgard_op_st_image_f] = {"st_image_f", M32 | LDST_STORE}, + [midgard_op_st_image_ui] = {"st_image_ui", M32 | LDST_STORE}, + [midgard_op_st_image_i] = {"st_image_i", M32 | LDST_STORE}, }; + +#undef M8 +#undef M16 +#undef M32 +#undef M64 + +midgard_word_type midgard_word_types[16] = { + midgard_word_type_unknown, /* 0x0 */ + midgard_word_type_unknown, /* 0x1 */ + midgard_word_type_texture, /* 0x2 */ + midgard_word_type_texture, /* 0x3 */ + midgard_word_type_texture, /* 0x4 */ + midgard_word_type_load_store, /* 0x5 */ + midgard_word_type_unknown, /* 0x6 */ + midgard_word_type_unknown, /* 0x7 */ + midgard_word_type_alu, /* 0x8 */ + midgard_word_type_alu, /* 0x9 */ + midgard_word_type_alu, /* 0xA */ + midgard_word_type_alu, /* 0xB */ + midgard_word_type_alu, /* 0xC */ + midgard_word_type_alu, /* 0xD */ + midgard_word_type_alu, /* 0xE */ + midgard_word_type_alu, /* 0xF */ +}; + +unsigned midgard_word_size[16] = { + 0, /* 0x0 */ + 0, /* 0x1 */ + 1, /* 0x2 */ + 1, /* 0x3 */ + 1, /* 0x4 */ + 1, /* 0x5 */ + 0, /* 0x6 */ + 0, /* 0x7 */ + 1, /* 0x8 */ + 2, /* 0x9 */ + 3, /* 0xA */ + 4, /* 0xB */ + 1, /* 0xC */ + 2, /* 0xD */ + 3, /* 0xE */ + 4, /* 0xF */ +}; + + + diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_ops.h mesa-20.0.8/src/panfrost/midgard/midgard_ops.h --- mesa-19.2.8/src/panfrost/midgard/midgard_ops.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_ops.h 2020-06-12 01:21:18.000000000 +0000 @@ -19,12 +19,19 @@ * THE SOFTWARE. */ +#ifndef __MIDGARD_OPS +#define __MIDGARD_OPS + #include "helpers.h" /* Forward declare */ extern struct mir_op_props alu_opcode_props[256]; -extern const char *load_store_opcode_names[256]; +extern struct mir_ldst_op_props load_store_opcode_props[256]; +extern midgard_word_type midgard_word_types[16]; +extern unsigned midgard_word_size[16]; + +#define OP_IS_STORE(op) (load_store_opcode_props[op].props & LDST_STORE) /* Is this opcode that of an integer (regardless of signedness)? Instruction * names authoritatively determine types */ @@ -70,4 +77,4 @@ return existing_mask; }; - +#endif diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_opt_copy_prop.c mesa-20.0.8/src/panfrost/midgard/midgard_opt_copy_prop.c --- mesa-19.2.8/src/panfrost/midgard/midgard_opt_copy_prop.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_opt_copy_prop.c 2020-06-12 01:21:18.000000000 +0000 @@ -25,6 +25,41 @@ #include "compiler.h" #include "midgard_ops.h" +/* Special case for copypropagating the results of vectors */ + +static bool +midgard_opt_copy_prop_reg(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_MOVE(ins->alu.op)) continue; + + unsigned from = ins->src[1]; + unsigned to = ins->dest; + + if (!(to & IS_REG)) continue; + if (from & IS_REG) continue; + + if (ins->has_inline_constant) continue; + if (ins->has_constants) continue; + if (mir_nontrivial_source2_mod(ins)) continue; + if (mir_nontrivial_outmod(ins)) continue; + if (!mir_single_use(ctx, from)) continue; + + /* Ensure mask is continguous from 0 */ + if (!(ins->mask & (1 << COMPONENT_X))) continue; + if (ins->mask & (ins->mask + 1)) continue; + + mir_rewrite_index_dst(ctx, from, ins->dest); + mir_remove_instruction(ins); + progress |= true; + } + + return progress; +} + bool midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) { @@ -34,8 +69,8 @@ if (ins->type != TAG_ALU_4) continue; if (!OP_IS_MOVE(ins->alu.op)) continue; - unsigned from = ins->ssa_args.src[1]; - unsigned to = ins->ssa_args.dest; + unsigned from = ins->src[1]; + unsigned to = ins->dest; /* We only work on pure SSA */ @@ -45,7 +80,7 @@ if (from & IS_REG) continue; /* Constant propagation is not handled here, either */ - if (ins->ssa_args.inline_constant) continue; + if (ins->has_inline_constant) continue; if (ins->has_constants) continue; /* Modifier propagation is not handled here */ @@ -53,26 +88,29 @@ if (mir_nontrivial_outmod(ins)) continue; /* Shortened arguments (bias for textures, extra load/store - * arguments, etc.) do not get a swizzlw, only a start - * component and even that is restricted. */ + * arguments, etc.) do not get a swizzle, only a start + * component and even that is restricted. Fragment writeout + * doesn't even get that much */ bool skip = false; mir_foreach_instr_global(ctx, q) { bool is_tex = q->type == TAG_TEXTURE_4; bool is_ldst = q->type == TAG_LOAD_STORE_4; + bool is_branch = q->compact_branch; - if (!(is_tex || is_ldst)) continue; + if (!(is_tex || is_ldst || is_branch)) continue; - /* For textures, we get one real swizzle. For stores, - * we also get one. For loads, we get none. */ + /* For textures, we get a real swizzle for the + * coordinate and the content. For stores, we get one. + * For loads, we get none. */ unsigned start = - is_tex ? 1 : + is_tex ? 2 : OP_IS_STORE(q->load_store.op) ? 1 : 0; mir_foreach_src(q, s) { - if ((s >= start) && q->ssa_args.src[s] == to) { + if ((s >= start) && q->src[s] == to) { skip = true; break; } @@ -83,13 +121,10 @@ continue; /* We're clear -- rewrite, composing the swizzle */ - midgard_vector_alu_src src2 = - vector_alu_from_unsigned(ins->alu.src2); - - mir_rewrite_index_src_swizzle(ctx, to, from, src2.swizzle); + mir_rewrite_index_src_swizzle(ctx, to, from, ins->swizzle[1]); mir_remove_instruction(ins); progress |= true; } - return progress; + return progress | midgard_opt_copy_prop_reg(ctx, block); } diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_opt_dce.c mesa-20.0.8/src/panfrost/midgard/midgard_opt_dce.c --- mesa-19.2.8/src/panfrost/midgard/midgard_opt_dce.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_opt_dce.c 2020-06-12 01:21:18.000000000 +0000 @@ -23,25 +23,76 @@ */ #include "compiler.h" +#include "util/u_memory.h" +#include "midgard_ops.h" -/* Basic dead code elimination on the MIR itself */ +/* SIMD-aware dead code elimination. Perform liveness analysis step-by-step, + * removing dead components. If an instruction ends up with a zero mask, the + * instruction in total is dead and should be removed. */ + +static bool +can_cull_mask(compiler_context *ctx, midgard_instruction *ins) +{ + if (ins->dest >= ctx->temp_count) + return false; + + if (ins->type == TAG_LOAD_STORE_4) + if (load_store_opcode_props[ins->load_store.op].props & LDST_SPECIAL_MASK) + return false; + + return true; +} + +static bool +can_dce(midgard_instruction *ins) +{ + if (ins->mask) + return false; + + if (ins->compact_branch) + return false; + + if (ins->type == TAG_LOAD_STORE_4) + if (load_store_opcode_props[ins->load_store.op].props & LDST_SIDE_FX) + return false; + + return true; +} bool midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block) { bool progress = false; - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_ALU_4) continue; - if (ins->compact_branch) continue; + mir_invalidate_liveness(ctx); + mir_compute_liveness(ctx); + + uint16_t *live = mem_dup(block->live_out, ctx->temp_count * sizeof(uint16_t)); - if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; - if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue; + mir_foreach_instr_in_block_rev(block, ins) { + if (can_cull_mask(ctx, ins)) { + midgard_reg_mode mode = mir_typesize(ins); + unsigned oldmask = ins->mask; + + unsigned rounded = mir_round_bytemask_up(live[ins->dest], mode); + unsigned cmask = mir_from_bytemask(rounded, mode); + + ins->mask &= cmask; + progress |= (ins->mask != oldmask); + } - mir_remove_instruction(ins); - progress = true; + mir_liveness_ins_update(live, ins, ctx->temp_count); } + mir_foreach_instr_in_block_safe(block, ins) { + if (can_dce(ins)) { + mir_remove_instruction(ins); + progress = true; + } + } + + free(live); + return progress; } @@ -64,11 +115,11 @@ mir_foreach_instr_in_block_from(block, q, mir_next_op(ins)) { /* Check if used */ - if (mir_has_arg(q, ins->ssa_args.dest)) + if (mir_has_arg(q, ins->dest)) break; /* Check if overwritten */ - if (q->ssa_args.dest == ins->ssa_args.dest) { + if (q->dest == ins->dest) { /* Special case to vec4; component tracking is * harder */ @@ -85,51 +136,3 @@ return progress; } - -/* An even further special case - to be run after RA runs but before - * scheduling, eliminating moves that end up being useless even though they - * appeared meaningful in the SSA. Part #2 of register coalescing. */ - -void -midgard_opt_post_move_eliminate(compiler_context *ctx, midgard_block *block, struct ra_graph *g) -{ - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_ALU_4) continue; - if (ins->compact_branch) continue; - if (!OP_IS_MOVE(ins->alu.op)) continue; - if (ins->dont_eliminate) continue; - - /* Check we're to the same place post-RA */ - unsigned iA = ins->ssa_args.dest; - unsigned iB = ins->ssa_args.src[1]; - - if ((iA < 0) || (iB < 0)) continue; - - unsigned A = iA >= SSA_FIXED_MINIMUM ? - SSA_REG_FROM_FIXED(iA) : - ra_get_node_reg(g, iA); - - unsigned B = iB >= SSA_FIXED_MINIMUM ? - SSA_REG_FROM_FIXED(iB) : - ra_get_node_reg(g, iB); - - if (A != B) continue; - - /* Check we're in the work zone. TODO: promoted - * uniforms? */ - if (A >= 16) continue; - - /* Ensure there aren't side effects */ - if (mir_nontrivial_source2_mod(ins)) continue; - if (mir_nontrivial_outmod(ins)) continue; - if (ins->mask != 0xF) continue; - - /* We do need to rewrite to facilitate pipelining/scheduling */ - mir_rewrite_index(ctx, ins->ssa_args.src[1], ins->ssa_args.dest); - - /* We're good to go */ - mir_remove_instruction(ins); - - } - -} diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_opt_float.c mesa-20.0.8/src/panfrost/midgard/midgard_opt_float.c --- mesa-19.2.8/src/panfrost/midgard/midgard_opt_float.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_opt_float.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#include "compiler.h" +#include "midgard_ops.h" +#include + +/* Could a 32-bit value represent exactly a 32-bit floating point? */ + +static bool +mir_constant_float(uint32_t u) +{ + /* Cast */ + float f = 0; + memcpy(&f, &u, sizeof(u)); + + /* TODO: What exactly is the condition? */ + return !(isnan(f) || isinf(f)); +} + +/* Promotes imov with a constant to fmov where the constant is exactly + * representible as a float */ + +bool +midgard_opt_promote_fmov(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->alu.op != midgard_alu_op_imov) continue; + if (ins->has_inline_constant) continue; + if (!ins->has_constants) continue; + if (mir_nontrivial_source2_mod_simple(ins)) continue; + if (mir_nontrivial_outmod(ins)) continue; + if (ins->alu.reg_mode != midgard_reg_mode_32) continue; + + /* We found an imov with a constant. Check the constants */ + bool ok = true; + + for (unsigned i = 0; i < ARRAY_SIZE(ins->constants.u32); ++i) + ok &= mir_constant_float(ins->constants.u32[i]); + + if (!ok) + continue; + + /* Rewrite to fmov */ + ins->alu.op = midgard_alu_op_fmov; + ins->alu.outmod = 0; + + /* Clear the int mod */ + midgard_vector_alu_src u = vector_alu_from_unsigned(ins->alu.src2); + u.mod = 0; + ins->alu.src2 = vector_alu_srco_unsigned(u); + + progress |= true; + } + + return progress; +} diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_opt_invert.c mesa-20.0.8/src/panfrost/midgard/midgard_opt_invert.c --- mesa-19.2.8/src/panfrost/midgard/midgard_opt_invert.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_opt_invert.c 2020-06-12 01:21:18.000000000 +0000 @@ -40,25 +40,22 @@ midgard_instruction not = { .type = TAG_ALU_4, .mask = ins->mask, - .ssa_args = { - .src = { temp, -1, -1 }, - .dest = ins->ssa_args.dest, - .inline_constant = true - }, + .src = { temp, ~0, ~0, ~0 }, + .swizzle = SWIZZLE_IDENTITY, + .dest = ins->dest, + .has_inline_constant = true, .alu = { .op = midgard_alu_op_inor, /* TODO: i16 */ .reg_mode = midgard_reg_mode_32, .dest_override = midgard_dest_override_none, - .outmod = midgard_outmod_int_wrap, - .src1 = vector_alu_srco_unsigned(blank_alu_src), - .src2 = vector_alu_srco_unsigned(zero_alu_src) + .outmod = midgard_outmod_int_wrap }, }; - ins->ssa_args.dest = temp; + ins->dest = temp; ins->invert = false; - mir_insert_instruction_before(mir_next_op(ins), not); + mir_insert_instruction_before(ctx, mir_next_op(ins), not); } } @@ -74,15 +71,15 @@ if (ins->alu.op != midgard_alu_op_imov) continue; if (!ins->invert) continue; if (mir_nontrivial_source2_mod_simple(ins)) continue; - if (ins->ssa_args.src[1] & IS_REG) continue; + if (ins->src[1] & IS_REG) continue; /* Is it beneficial to propagate? */ - if (!mir_single_use(ctx, ins->ssa_args.src[1])) continue; + if (!mir_single_use(ctx, ins->src[1])) continue; /* We found an imov.not, propagate the invert back */ mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { - if (v->ssa_args.dest != ins->ssa_args.src[1]) continue; + if (v->dest != ins->src[1]) continue; if (v->type != TAG_ALU_4) break; v->invert = !v->invert; @@ -195,10 +192,13 @@ static bool mir_strip_inverted(compiler_context *ctx, unsigned node) { + if (node >= SSA_FIXED_MINIMUM) + return false; + /* Strips and returns the invert off a node */ mir_foreach_instr_global(ctx, ins) { if (ins->compact_branch) continue; - if (ins->ssa_args.dest != node) continue; + if (ins->dest != node) continue; bool status = ins->invert; ins->invert = false; @@ -208,6 +208,12 @@ unreachable("Invalid node stripped"); } +static bool +is_ssa_or_constant(unsigned node) +{ + return !(node & IS_REG) || (node == SSA_FIXED_REGISTER(26)); +} + bool midgard_opt_fuse_src_invert(compiler_context *ctx, midgard_block *block) { @@ -219,18 +225,18 @@ if (!mir_is_bitwise(ins)) continue; if (ins->invert) continue; - if (ins->ssa_args.src[0] & IS_REG) continue; - if (ins->ssa_args.src[1] & IS_REG) continue; - if (!mir_single_use(ctx, ins->ssa_args.src[0])) continue; - if (!ins->ssa_args.inline_constant && !mir_single_use(ctx, ins->ssa_args.src[1])) continue; + if (!is_ssa_or_constant(ins->src[0])) continue; + if (!is_ssa_or_constant(ins->src[1])) continue; + if (!mir_single_use(ctx, ins->src[0])) continue; + if (!ins->has_inline_constant && !mir_single_use(ctx, ins->src[1])) continue; - bool not_a = mir_strip_inverted(ctx, ins->ssa_args.src[0]); + bool not_a = mir_strip_inverted(ctx, ins->src[0]); bool not_b = - ins->ssa_args.inline_constant ? false : - mir_strip_inverted(ctx, ins->ssa_args.src[1]); + ins->has_inline_constant ? false : + mir_strip_inverted(ctx, ins->src[1]); /* Edge case: if src0 == src1, it'll've been stripped */ - if ((ins->ssa_args.src[0] == ins->ssa_args.src[1]) && !ins->ssa_args.inline_constant) + if ((ins->src[0] == ins->src[1]) && !ins->has_inline_constant) not_b = not_a; progress |= (not_a || not_b); @@ -248,16 +254,13 @@ if (both) { ins->alu.op = mir_demorgan_op(ins->alu.op); - } else if (right || (left && !ins->ssa_args.inline_constant)) { - if (left) { - /* Commute */ - unsigned temp = ins->ssa_args.src[0]; - ins->ssa_args.src[0] = ins->ssa_args.src[1]; - ins->ssa_args.src[1] = temp; - } + } else if (right || (left && !ins->has_inline_constant)) { + /* Commute arguments */ + if (left) + mir_flip(ins); ins->alu.op = mir_notright_op(ins->alu.op); - } else if (left && ins->ssa_args.inline_constant) { + } else if (left && ins->has_inline_constant) { /* Some special transformations: * * ~A & c = ~(~(~A) | (~c)) = ~(A | ~c) = inor(A, ~c) @@ -270,4 +273,129 @@ } return progress; +} + +/* Optimizes a .not away when used as the source of a conditional select: + * + * csel(a, b, c) = { b if a, c if !a } + * csel(!a, b, c) = { b if !a, c if !(!a) } = { c if a, b if !a } = csel(a, c, b) + * csel(!a, b, c) = csel(a, c, b) + */ + +bool +midgard_opt_csel_invert(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_CSEL(ins->alu.op)) continue; + if (!mir_single_use(ctx, ins->src[2])) continue; + if (!mir_strip_inverted(ctx, ins->src[2])) continue; + + mir_flip(ins); + progress |= true; + } + + return progress; +} + + +static bool +mir_is_inverted(compiler_context *ctx, unsigned node) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->compact_branch) continue; + if (ins->dest != node) continue; + + return ins->invert; + } + + unreachable("Invalid node passed"); +} + + + +/* Optimizes comparisions which invert both arguments + * + * + * ieq(not(a), not(b)) = ieq(a, b) + * ine(not(a), not(b)) = ine(a, b) + * + * This does apply for ilt and ile if we flip the argument order: + * Proofs below provided by Alyssa Rosenzweig + * + * not(x) = −(x+1) + * + * ( not(A) <= not(B) ) <=> ( −(A+1) <= −(B+1) ) + * <=> ( A+1 >= B+1) + * <=> ( B <= A ) + * + * On unsigned comparisons (ult / ule) we can perform the same optimization + * with the additional restriction that the source registers must + * have the same size. + * + * TODO: We may not need them to be of the same size, if we can + * prove that they are the same after sext/zext + * + * not(x) = 2n−x−1 + * + * ( not(A) <= not(B) ) <=> ( 2n−A−1 <= 2n−B−1 ) + * <=> ( −A <= −B ) + * <=> ( B <= A ) + */ +bool +midgard_opt_drop_cmp_invert(compiler_context *ctx, midgard_block *block) +{ + + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_INTEGER_CMP(ins->alu.op)) continue; + + if ((ins->src[0] & IS_REG) || (ins->src[1] & IS_REG)) continue; + if (!mir_single_use(ctx, ins->src[0]) || !mir_single_use(ctx, ins->src[1])) continue; + + bool a_inverted = mir_is_inverted(ctx, ins->src[0]); + bool b_inverted = mir_is_inverted(ctx, ins->src[1]); + + if (!a_inverted || !b_inverted) continue; + if (OP_IS_UNSIGNED_CMP(ins->alu.op) && mir_srcsize(ins, 0) != mir_srcsize(ins, 1)) continue; + + + mir_strip_inverted(ctx, ins->src[0]); + mir_strip_inverted(ctx, ins->src[1]); + + if (ins->alu.op != midgard_alu_op_ieq && ins->alu.op != midgard_alu_op_ine) + mir_flip(ins); + + progress |= true; + } + + return progress; +} + +/* Optimizes branches with inverted arguments by inverting the + * branch condition instead of the argument condition. + */ +bool +midgard_opt_invert_branch(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!midgard_is_branch_unit(ins->unit)) continue; + if (!ins->branch.conditional) continue; + if (ins->src[0] & IS_REG) continue; + + if (mir_strip_inverted(ctx, ins->src[0])) { + ins->branch.invert_conditional = !ins->branch.invert_conditional; + + progress |= true; + } + } + + return progress; } diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_opt_perspective.c mesa-20.0.8/src/panfrost/midgard/midgard_opt_perspective.c --- mesa-19.2.8/src/panfrost/midgard/midgard_opt_perspective.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_opt_perspective.c 2020-06-12 01:21:18.000000000 +0000 @@ -37,6 +37,16 @@ #include "compiler.h" +static bool +is_swizzle_0(unsigned *swizzle) +{ + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) + if (swizzle[c]) + return false; + + return true; +} + bool midgard_opt_combine_projection(compiler_context *ctx, midgard_block *block) { @@ -51,18 +61,12 @@ /* Check the swizzles */ - midgard_vector_alu_src src1 = - vector_alu_from_unsigned(ins->alu.src1); - - midgard_vector_alu_src src2 = - vector_alu_from_unsigned(ins->alu.src2); - - if (!mir_is_simple_swizzle(src1.swizzle, ins->mask)) continue; - if (src2.swizzle != SWIZZLE_XXXX) continue; + if (!mir_is_simple_swizzle(ins->swizzle[0], ins->mask)) continue; + if (!is_swizzle_0(ins->swizzle[1])) continue; /* Awesome, we're the right form. Now check where src2 is from */ - unsigned frcp = ins->ssa_args.src[1]; - unsigned to = ins->ssa_args.dest; + unsigned frcp = ins->src[1]; + unsigned to = ins->dest; if (frcp & IS_REG) continue; if (to & IS_REG) continue; @@ -72,13 +76,10 @@ unsigned frcp_from = 0; mir_foreach_instr_in_block_safe(block, sub) { - if (sub->ssa_args.dest != frcp) continue; - - midgard_vector_alu_src s = - vector_alu_from_unsigned(sub->alu.src1); + if (sub->dest != frcp) continue; - frcp_component = s.swizzle & 3; - frcp_from = sub->ssa_args.src[0]; + frcp_component = sub->swizzle[0][0]; + frcp_from = sub->src[0]; frcp_found = (sub->type == TAG_ALU_4) && @@ -98,7 +99,7 @@ if (mir_use_count(ctx, frcp_from) > 2) continue; mir_foreach_instr_in_block_safe(block, v) { - if (v->ssa_args.dest != frcp_from) continue; + if (v->dest != frcp_from) continue; if (v->type != TAG_LOAD_STORE_4) break; if (!OP_IS_LOAD_VARY_F(v->load_store.op)) break; @@ -114,20 +115,18 @@ midgard_instruction accel = { .type = TAG_LOAD_STORE_4, .mask = ins->mask, - .ssa_args = { - .dest = to, - .src = { frcp_from, -1, -1 }, - }, + .dest = to, + .src = { frcp_from, ~0, ~0, ~0 }, + .swizzle = SWIZZLE_IDENTITY_4, .load_store = { .op = frcp_component == COMPONENT_W ? midgard_op_ldst_perspective_division_w : midgard_op_ldst_perspective_division_z, - .swizzle = SWIZZLE_XYZW, .arg_1 = 0x20 } }; - mir_insert_instruction_before(ins, accel); + mir_insert_instruction_before(ctx, ins, accel); mir_remove_instruction(ins); progress |= true; @@ -146,8 +145,8 @@ if (ins->type != TAG_LOAD_STORE_4) continue; if (!OP_IS_PROJECTION(ins->load_store.op)) continue; - unsigned vary = ins->ssa_args.src[0]; - unsigned to = ins->ssa_args.dest; + unsigned vary = ins->src[0]; + unsigned to = ins->dest; if (vary & IS_REG) continue; if (to & IS_REG) continue; @@ -158,7 +157,7 @@ bool rewritten = false; mir_foreach_instr_in_block_safe(block, v) { - if (v->ssa_args.dest != vary) continue; + if (v->dest != vary) continue; if (v->type != TAG_LOAD_STORE_4) break; if (!OP_IS_LOAD_VARY_F(v->load_store.op)) break; @@ -184,7 +183,7 @@ v->load_store.varying_parameters = param; /* Use the new destination */ - v->ssa_args.dest = to; + v->dest = to; rewritten = true; break; diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard-parse.h mesa-20.0.8/src/panfrost/midgard/midgard-parse.h --- mesa-19.2.8/src/panfrost/midgard/midgard-parse.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard-parse.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ -/* Author(s): - * Connor Abbott - * Alyssa Rosenzweig - * - * Copyright (c) 2013 Connor Abbott (connor@abbott.cx) - * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#ifndef __midgard_parse_h__ -#define __midgard_parse_h__ - -/* Additional metadata for parsing Midgard binaries, not needed for compilation */ - -static midgard_word_type midgard_word_types[16] = { - midgard_word_type_unknown, /* 0x0 */ - midgard_word_type_unknown, /* 0x1 */ - midgard_word_type_texture, /* 0x2 */ - midgard_word_type_texture, /* 0x3 */ - midgard_word_type_unknown, /* 0x4 */ - midgard_word_type_load_store, /* 0x5 */ - midgard_word_type_unknown, /* 0x6 */ - midgard_word_type_unknown, /* 0x7 */ - midgard_word_type_alu, /* 0x8 */ - midgard_word_type_alu, /* 0x9 */ - midgard_word_type_alu, /* 0xA */ - midgard_word_type_alu, /* 0xB */ - midgard_word_type_alu, /* 0xC */ - midgard_word_type_alu, /* 0xD */ - midgard_word_type_alu, /* 0xE */ - midgard_word_type_alu, /* 0xF */ -}; - -static unsigned midgard_word_size[16] = { - 0, /* 0x0 */ - 0, /* 0x1 */ - 1, /* 0x2 */ - 1, /* 0x3 */ - 0, /* 0x4 */ - 1, /* 0x5 */ - 0, /* 0x6 */ - 0, /* 0x7 */ - 1, /* 0x8 */ - 2, /* 0x9 */ - 3, /* 0xA */ - 4, /* 0xB */ - 1, /* 0xC */ - 2, /* 0xD */ - 3, /* 0xE */ - 4, /* 0xF */ -}; - -#endif diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_print.c mesa-20.0.8/src/panfrost/midgard/midgard_print.c --- mesa-19.2.8/src/panfrost/midgard/midgard_print.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_print.c 2020-06-12 01:21:18.000000000 +0000 @@ -21,6 +21,10 @@ * SOFTWARE. */ +#include + +#include "util/bitscan.h" +#include "util/half_float.h" #include "compiler.h" #include "helpers.h" #include "midgard_ops.h" @@ -34,7 +38,7 @@ static void mir_print_index(int source) { - if (source < 0) { + if (source == ~0) { printf("_"); return; } @@ -66,6 +70,15 @@ } } +static void +mir_print_swizzle(unsigned *swizzle) +{ + printf("."); + + for (unsigned i = 0; i < 16; ++i) + putchar(components[swizzle[i]]); +} + static const char * mir_get_unit(unsigned unit) { @@ -90,10 +103,201 @@ } void +mir_print_constant_component(FILE *fp, const midgard_constants *consts, unsigned c, + midgard_reg_mode reg_mode, bool half, + unsigned mod, midgard_alu_op op) +{ + bool is_sint = false, is_uint = false, is_hex = false; + const char *opname = alu_opcode_props[op].name; + + /* Add a sentinel name to prevent crashing */ + if (!opname) + opname = "unknown"; + + if (opname[0] == 'u') { + /* If the opcode starts with a 'u' we are sure we deal with an + * unsigned int operation + */ + is_uint = true; + } else if (opname[0] == 'i') { + /* Bit ops are easier to follow when the constant is printed in + * hexadecimal. Other operations starting with a 'i' are + * considered to operate on signed integers. That might not + * be true for all of them, but it's good enough for traces. + */ + if (op >= midgard_alu_op_iand && + op <= midgard_alu_op_ibitcount8) + is_hex = true; + else + is_sint = true; + } + + if (half) + reg_mode--; + + switch (reg_mode) { + case midgard_reg_mode_64: + if (is_sint) { + fprintf(fp, "%"PRIi64, consts->i64[c]); + } else if (is_uint) { + fprintf(fp, "%"PRIu64, consts->u64[c]); + } else if (is_hex) { + fprintf(fp, "0x%"PRIX64, consts->u64[c]); + } else { + double v = consts->f64[c]; + + if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabs(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + + printf("%g", v); + } + break; + + case midgard_reg_mode_32: + if (is_sint) { + int64_t v; + + if (half && mod == midgard_int_zero_extend) + v = consts->u32[c]; + else if (half && mod == midgard_int_shift) + v = (uint64_t)consts->u32[c] << 32; + else + v = consts->i32[c]; + + fprintf(fp, "%"PRIi64, v); + } else if (is_uint || is_hex) { + uint64_t v; + + if (half && mod == midgard_int_shift) + v = (uint64_t)consts->u32[c] << 32; + else + v = consts->u32[c]; + + fprintf(fp, is_uint ? "%"PRIu64 : "0x%"PRIX64, v); + } else { + float v = consts->f32[c]; + + if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabsf(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + + fprintf(fp, "%g", v); + } + break; + + case midgard_reg_mode_16: + if (is_sint) { + int32_t v; + + if (half && mod == midgard_int_zero_extend) + v = consts->u16[c]; + else if (half && mod == midgard_int_shift) + v = (uint32_t)consts->u16[c] << 16; + else + v = consts->i16[c]; + + fprintf(fp, "%d", v); + } else if (is_uint || is_hex) { + uint32_t v; + + if (half && mod == midgard_int_shift) + v = (uint32_t)consts->u16[c] << 16; + else + v = consts->u16[c]; + + fprintf(fp, is_uint ? "%u" : "0x%X", v); + } else { + float v = _mesa_half_to_float(consts->f16[c]); + + if (mod & MIDGARD_FLOAT_MOD_ABS) v = fabsf(v); + if (mod & MIDGARD_FLOAT_MOD_NEG) v = -v; + + fprintf(fp, "%g", v); + } + break; + + case midgard_reg_mode_8: + unreachable("XXX TODO: sort out how 8-bit constant encoding works"); + break; + } +} + +static void +mir_print_embedded_constant(midgard_instruction *ins, unsigned src_idx) +{ + unsigned type_size = mir_bytes_for_mode(ins->alu.reg_mode); + midgard_vector_alu_src src; + + assert(src_idx <= 1); + if (src_idx == 0) + src = vector_alu_from_unsigned(ins->alu.src1); + else + src = vector_alu_from_unsigned(ins->alu.src2); + + unsigned *swizzle = ins->swizzle[src_idx]; + unsigned comp_mask = effective_writemask(&ins->alu, ins->mask); + unsigned num_comp = util_bitcount(comp_mask); + unsigned max_comp = 16 / type_size; + bool first = true; + + printf("#"); + + if (num_comp > 1) + printf("vec%d(", num_comp); + + for (unsigned comp = 0; comp < max_comp; comp++) { + if (!(comp_mask & (1 << comp))) + continue; + + if (first) + first = false; + else + printf(", "); + + mir_print_constant_component(stdout, &ins->constants, + swizzle[comp], ins->alu.reg_mode, + src.half, src.mod, ins->alu.op); + } + + if (num_comp > 1) + printf(")"); +} + +void mir_print_instruction(midgard_instruction *ins) { printf("\t"); + if (midgard_is_branch_unit(ins->unit)) { + const char *branch_target_names[] = { + "goto", "break", "continue", "discard" + }; + + printf("%s.", mir_get_unit(ins->unit)); + if (ins->branch.target_type == TARGET_DISCARD) + printf("discard."); + else if (ins->writeout) + printf("write."); + else if (ins->unit == ALU_ENAB_BR_COMPACT && + !ins->branch.conditional) + printf("uncond."); + else + printf("cond."); + + if (!ins->branch.conditional) + printf("always"); + else if (ins->branch.invert_conditional) + printf("false"); + else + printf("true"); + + if (ins->branch.target_type != TARGET_DISCARD) + printf(" %s -> block(%d)\n", + branch_target_names[ins->branch.target_type], + ins->branch.target_block); + + return; + } + switch (ins->type) { case TAG_ALU_4: { midgard_alu_op op = ins->alu.op; @@ -108,7 +312,7 @@ case TAG_LOAD_STORE_4: { midgard_load_store_op op = ins->load_store.op; - const char *name = load_store_opcode_names[op]; + const char *name = load_store_opcode_props[op].name; assert(name); printf("%s", name); @@ -124,32 +328,43 @@ assert(0); } - if (ins->invert) + if (ins->invert || (ins->compact_branch && ins->branch.invert_conditional)) printf(".not"); - ssa_args *args = &ins->ssa_args; - printf(" "); - mir_print_index(args->dest); + mir_print_index(ins->dest); if (ins->mask != 0xF) mir_print_mask(ins->mask); printf(", "); - mir_print_index(args->src[0]); + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + if (ins->src[0] == r_constant) + mir_print_embedded_constant(ins, 0); + else { + mir_print_index(ins->src[0]); + mir_print_swizzle(ins->swizzle[0]); + } printf(", "); - if (args->inline_constant) + if (ins->has_inline_constant) printf("#%d", ins->inline_constant); - else - mir_print_index(args->src[1]); + else if (ins->src[1] == r_constant) + mir_print_embedded_constant(ins, 1); + else { + mir_print_index(ins->src[1]); + mir_print_swizzle(ins->swizzle[1]); + } printf(", "); - mir_print_index(args->src[2]); + mir_print_index(ins->src[2]); + mir_print_swizzle(ins->swizzle[2]); - if (ins->has_constants) - printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]); + printf(", "); + mir_print_index(ins->src[3]); + mir_print_swizzle(ins->swizzle[3]); if (ins->no_spill) printf(" /* no spill */"); @@ -162,10 +377,19 @@ void mir_print_block(midgard_block *block) { - printf("block%d: {\n", block->source_id); + printf("block%u: {\n", block->source_id); - mir_foreach_instr_in_block(block, ins) { - mir_print_instruction(ins); + if (block->is_scheduled) { + mir_foreach_bundle_in_block(block, bundle) { + for (unsigned i = 0; i < bundle->instruction_count; ++i) + mir_print_instruction(bundle->instructions[i]); + + printf("\n"); + } + } else { + mir_foreach_instr_in_block(block, ins) { + mir_print_instruction(ins); + } } printf("}"); @@ -173,14 +397,14 @@ if (block->nr_successors) { printf(" -> "); for (unsigned i = 0; i < block->nr_successors; ++i) { - printf("block%d%s", block->successors[i]->source_id, + printf("block%u%s", block->successors[i]->source_id, (i + 1) != block->nr_successors ? ", " : ""); } } printf(" from { "); mir_foreach_predecessor(block, pred) - printf("block%d ", pred->source_id); + printf("block%u ", pred->source_id); printf("}"); printf("\n\n"); @@ -193,16 +417,3 @@ mir_print_block(block); } } - -void -mir_print_bundle(midgard_bundle *bundle) -{ - printf("[\n"); - - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *ins = bundle->instructions[i]; - mir_print_instruction(ins); - } - - printf("]\n"); -} diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_quirks.h mesa-20.0.8/src/panfrost/midgard/midgard_quirks.h --- mesa-19.2.8/src/panfrost/midgard/midgard_quirks.h 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_quirks.h 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MDG_QUIRKS_H +#define __MDG_QUIRKS_H + +/* Model-specific quirks requiring compiler workarounds/etc. Quirks + * may be errata requiring a workaround, or features. We're trying to be + * quirk-positive here; quirky is the best! */ + +/* Whether an explicit LOD is required via textureLod in a vertex shader. If + * set, vertex texturing will *always* textureLod. If unset, normal texture ops + * may be emitted in a vertex shader */ + +#define MIDGARD_EXPLICIT_LOD (1 << 0) + +/* Whether output texture registers (normally r28/r29) overlap with work + * registers r0/r1 and input texture registers (also normally r28/r29) overlap + * with load/store registers r26/r27. This constrains register allocation + * considerably but is a space-saving measure on small Midgards. It's worth + * noting if you try to access r28/r29, it may still work, but you'll mess up + * the interference. Corresponds to BASE_HW_FEATURE_INTERPIPE_REG_ALIASING in + * kbase. */ + +#define MIDGARD_INTERPIPE_REG_ALIASING (1 << 1) + +/* Whether we should use old-style blend opcodes */ + +#define MIDGARD_OLD_BLEND (1 << 2) + +/* Errata causing the LOD clamps and bias in the sampler descriptor to be + * ignored. This errata affects the command stream but uses a compiler + * workaround (applying the clamps/bias manually in the shader. Corresponds in + * BASE_HW_ISSUE_10471 in kbase, described as "TEXGRD doesn't honor Sampler + * Descriptor LOD clamps nor bias". (I'm assuming TEXGRD is what we call + * textureLod) */ + +#define MIDGARD_BROKEN_LOD (1 << 3) + +/* Don't use upper ALU tags for writeout (if you do, you'll get a + * INSTR_INVALID_ENC). It's not clear to me what these tags are for. */ + +#define MIDGARD_NO_UPPER_ALU (1 << 4) + +static inline unsigned +midgard_get_quirks(unsigned gpu_id) +{ + switch (gpu_id) { + case 0x600: + case 0x620: + return MIDGARD_OLD_BLEND | + MIDGARD_BROKEN_LOD | + MIDGARD_NO_UPPER_ALU; + + case 0x720: + return MIDGARD_INTERPIPE_REG_ALIASING | + MIDGARD_OLD_BLEND | + MIDGARD_BROKEN_LOD | + MIDGARD_NO_UPPER_ALU; + + case 0x820: + case 0x830: + return MIDGARD_INTERPIPE_REG_ALIASING; + + case 0x750: + return MIDGARD_EXPLICIT_LOD | + MIDGARD_NO_UPPER_ALU; + + case 0x860: + case 0x880: + return MIDGARD_EXPLICIT_LOD; + + default: + unreachable("Invalid Midgard GPU ID"); + } +} + +#endif diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_ra.c mesa-20.0.8/src/panfrost/midgard/midgard_ra.c --- mesa-19.2.8/src/panfrost/midgard/midgard_ra.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_ra.c 2020-06-12 01:21:18.000000000 +0000 @@ -24,111 +24,54 @@ #include "compiler.h" #include "midgard_ops.h" -#include "util/register_allocate.h" #include "util/u_math.h" #include "util/u_memory.h" - -/* For work registers, we can subdivide in various ways. So we create - * classes for the various sizes and conflict accordingly, keeping in - * mind that physical registers are divided along 128-bit boundaries. - * The important part is that 128-bit boundaries are not crossed. - * - * For each 128-bit register, we can subdivide to 32-bits 10 ways - * - * vec4: xyzw - * vec3: xyz, yzw - * vec2: xy, yz, zw, - * vec1: x, y, z, w - * - * For each 64-bit register, we can subdivide similarly to 16-bit - * (TODO: half-float RA, not that we support fp16 yet) - */ - -#define WORK_STRIDE 10 - -/* We have overlapping register classes for special registers, handled via - * shadows */ - -#define SHADOW_R28 18 -#define SHADOW_R29 19 - -/* Prepacked masks/swizzles for virtual register types */ -static unsigned reg_type_to_mask[WORK_STRIDE] = { - 0xF, /* xyzw */ - 0x7, 0x7 << 1, /* xyz */ - 0x3, 0x3 << 1, 0x3 << 2, /* xy */ - 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ -}; - -static unsigned reg_type_to_swizzle[WORK_STRIDE] = { - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), - - SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), - SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), -}; +#include "lcra.h" +#include "midgard_quirks.h" struct phys_reg { + /* Physical register: 0-31 */ unsigned reg; - unsigned mask; - unsigned swizzle; -}; -/* Given the mask/swizzle of both the register and the original source, - * compose to find the actual mask/swizzle to give the hardware */ + /* Byte offset into the physical register: 0-15 */ + unsigned offset; -static unsigned -compose_writemask(unsigned mask, struct phys_reg reg) -{ - /* Note: the reg mask is guaranteed to be contiguous. So we shift - * into the X place, compose via a simple AND, and shift back */ + /* Number of bytes in a component of this register */ + unsigned size; +}; - unsigned shift = __builtin_ctz(reg.mask); - return ((reg.mask >> shift) & mask) << shift; -} +/* Shift up by reg_offset and horizontally by dst_offset. */ -static unsigned -compose_swizzle(unsigned swizzle, unsigned mask, - struct phys_reg reg, struct phys_reg dst) +static void +offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcsize, unsigned dst_offset) { - unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + unsigned out[MIR_VEC_COMPONENTS]; - /* Based on the register mask, we need to adjust over. E.g if we're - * writing to yz, a base swizzle of xy__ becomes _xy_. Save the - * original first component (x). But to prevent duplicate shifting - * (only applies to ALU -- mask param is set to xyzw out on L/S to - * prevent changes), we have to account for the shift inherent to the - * original writemask */ + signed reg_comp = reg_offset / srcsize; + signed dst_comp = dst_offset / srcsize; - unsigned rep = out & 0x3; - unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); - unsigned shifted = out << (2*shift); + unsigned max_component = (16 / srcsize) - 1; - /* ..but we fill in the gaps so it appears to replicate */ + assert(reg_comp * srcsize == reg_offset); + assert(dst_comp * srcsize == dst_offset); - for (unsigned s = 0; s < shift; ++s) - shifted |= rep << (2*s); + for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) { + signed comp = MAX2(c - dst_comp, 0); + out[c] = MIN2(swizzle[comp] + reg_comp, max_component); + } - return shifted; + memcpy(swizzle, out, sizeof(out)); } /* Helper to return the default phys_reg for a given register */ static struct phys_reg -default_phys_reg(int reg) +default_phys_reg(int reg, midgard_reg_mode size) { struct phys_reg r = { .reg = reg, - .mask = 0xF, /* xyzw */ - .swizzle = 0xE4 /* xyzw */ + .offset = 0, + .size = mir_bytes_for_mode(size) }; return r; @@ -138,194 +81,37 @@ * register corresponds to */ static struct phys_reg -index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg) +index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, midgard_reg_mode size) { /* Check for special cases */ - if (reg >= SSA_FIXED_MINIMUM) - return default_phys_reg(SSA_REG_FROM_FIXED(reg)); - else if ((reg < 0) || !g) - return default_phys_reg(REGISTER_UNUSED); - - /* Special cases aside, we pick the underlying register */ - int virt = ra_get_node_reg(g, reg); - - /* Divide out the register and classification */ - int phys = virt / WORK_STRIDE; - int type = virt % WORK_STRIDE; - - /* Apply shadow registers */ - - if (phys >= SHADOW_R28 && phys <= SHADOW_R29) - phys += 28 - SHADOW_R28; + if (reg == ~0) + return default_phys_reg(REGISTER_UNUSED, size); + else if (reg >= SSA_FIXED_MINIMUM) + return default_phys_reg(SSA_REG_FROM_FIXED(reg), size); + else if (!l) + return default_phys_reg(REGISTER_UNUSED, size); struct phys_reg r = { - .reg = phys, - .mask = reg_type_to_mask[type], - .swizzle = reg_type_to_swizzle[type] + .reg = l->solutions[reg] / 16, + .offset = l->solutions[reg] & 0xF, + .size = mir_bytes_for_mode(size) }; /* Report that we actually use this register, and return it */ - if (phys < 16) - ctx->work_registers = MAX2(ctx->work_registers, phys); + if (r.reg < 16) + ctx->work_registers = MAX2(ctx->work_registers, r.reg); return r; } -/* This routine creates a register set. Should be called infrequently since - * it's slow and can be cached. For legibility, variables are named in terms of - * work registers, although it is also used to create the register set for - * special register allocation */ - -static void -add_shadow_conflicts (struct ra_regs *regs, unsigned base, unsigned shadow) -{ - for (unsigned a = 0; a < WORK_STRIDE; ++a) { - unsigned reg_a = (WORK_STRIDE * base) + a; - - for (unsigned b = 0; b < WORK_STRIDE; ++b) { - unsigned reg_b = (WORK_STRIDE * shadow) + b; - - ra_add_reg_conflict(regs, reg_a, reg_b); - ra_add_reg_conflict(regs, reg_b, reg_a); - } - } -} - -static struct ra_regs * -create_register_set(unsigned work_count, unsigned *classes) -{ - int virtual_count = 32 * WORK_STRIDE; - - /* First, initialize the RA */ - struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true); - - for (unsigned c = 0; c < NR_REG_CLASSES; ++c) { - int work_vec4 = ra_alloc_reg_class(regs); - int work_vec3 = ra_alloc_reg_class(regs); - int work_vec2 = ra_alloc_reg_class(regs); - int work_vec1 = ra_alloc_reg_class(regs); - - classes[4*c + 0] = work_vec1; - classes[4*c + 1] = work_vec2; - classes[4*c + 2] = work_vec3; - classes[4*c + 3] = work_vec4; - - /* Special register classes have other register counts */ - unsigned count = - (c == REG_CLASS_WORK) ? work_count : 2; - - unsigned first_reg = - (c == REG_CLASS_LDST) ? 26 : - (c == REG_CLASS_TEXR) ? 28 : - (c == REG_CLASS_TEXW) ? SHADOW_R28 : - 0; - - /* Add the full set of work registers */ - for (unsigned i = first_reg; i < (first_reg + count); ++i) { - int base = WORK_STRIDE * i; - - /* Build a full set of subdivisions */ - ra_class_add_reg(regs, work_vec4, base); - ra_class_add_reg(regs, work_vec3, base + 1); - ra_class_add_reg(regs, work_vec3, base + 2); - ra_class_add_reg(regs, work_vec2, base + 3); - ra_class_add_reg(regs, work_vec2, base + 4); - ra_class_add_reg(regs, work_vec2, base + 5); - ra_class_add_reg(regs, work_vec1, base + 6); - ra_class_add_reg(regs, work_vec1, base + 7); - ra_class_add_reg(regs, work_vec1, base + 8); - ra_class_add_reg(regs, work_vec1, base + 9); - - for (unsigned a = 0; a < 10; ++a) { - unsigned mask1 = reg_type_to_mask[a]; - - for (unsigned b = 0; b < 10; ++b) { - unsigned mask2 = reg_type_to_mask[b]; - - if (mask1 & mask2) - ra_add_reg_conflict(regs, - base + a, base + b); - } - } - } - } - - - /* We have duplicate classes */ - add_shadow_conflicts(regs, 28, SHADOW_R28); - add_shadow_conflicts(regs, 29, SHADOW_R29); - - /* We're done setting up */ - ra_set_finalize(regs, NULL); - - return regs; -} - -/* This routine gets a precomputed register set off the screen if it's able, or - * otherwise it computes one on the fly */ - -static struct ra_regs * -get_register_set(struct midgard_screen *screen, unsigned work_count, unsigned **classes) -{ - /* Bounds check */ - assert(work_count >= 8); - assert(work_count <= 16); - - /* Compute index */ - unsigned index = work_count - 8; - - /* Find the reg set */ - struct ra_regs *cached = screen->regs[index]; - - if (cached) { - assert(screen->reg_classes[index]); - *classes = screen->reg_classes[index]; - return cached; - } - - /* Otherwise, create one */ - struct ra_regs *created = create_register_set(work_count, screen->reg_classes[index]); - - /* Cache it and use it */ - screen->regs[index] = created; - - *classes = screen->reg_classes[index]; - return created; -} - -/* Assign a (special) class, ensuring that it is compatible with whatever class - * was already set */ - static void set_class(unsigned *classes, unsigned node, unsigned class) { - /* Check that we're even a node */ - if ((node < 0) || (node >= SSA_FIXED_MINIMUM)) - return; - - /* First 4 are work, next 4 are load/store.. */ - unsigned current_class = classes[node] >> 2; - - /* Nothing to do */ - if (class == current_class) - return; - - /* If we're changing, we haven't assigned a special class */ - assert(current_class == REG_CLASS_WORK); - - classes[node] &= 0x3; - classes[node] |= (class << 2); -} - -static void -force_vec4(unsigned *classes, unsigned node) -{ - if ((node < 0) || (node >= SSA_FIXED_MINIMUM)) - return; - - /* Force vec4 = 3 */ - classes[node] |= 0x3; + if (node < SSA_FIXED_MINIMUM && class != classes[node]) { + assert(classes[node] == REG_CLASS_WORK); + classes[node] = class; + } } /* Special register classes impose special constraints on who can read their @@ -335,12 +121,10 @@ check_read_class(unsigned *classes, unsigned tag, unsigned node) { /* Non-nodes are implicitly ok */ - if ((node < 0) || (node >= SSA_FIXED_MINIMUM)) + if (node >= SSA_FIXED_MINIMUM) return true; - unsigned current_class = classes[node] >> 2; - - switch (current_class) { + switch (classes[node]) { case REG_CLASS_LDST: return (tag == TAG_LOAD_STORE_4); case REG_CLASS_TEXR: @@ -348,7 +132,7 @@ case REG_CLASS_TEXW: return (tag != TAG_LOAD_STORE_4); case REG_CLASS_WORK: - return (tag == TAG_ALU_4); + return IS_ALU(tag); default: unreachable("Invalid class"); } @@ -358,19 +142,17 @@ check_write_class(unsigned *classes, unsigned tag, unsigned node) { /* Non-nodes are implicitly ok */ - if ((node < 0) || (node >= SSA_FIXED_MINIMUM)) + if (node >= SSA_FIXED_MINIMUM) return true; - unsigned current_class = classes[node] >> 2; - - switch (current_class) { + switch (classes[node]) { case REG_CLASS_TEXR: return true; case REG_CLASS_TEXW: return (tag == TAG_TEXTURE_4); case REG_CLASS_LDST: case REG_CLASS_WORK: - return (tag == TAG_ALU_4) || (tag == TAG_LOAD_STORE_4); + return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4); default: unreachable("Invalid class"); } @@ -383,7 +165,7 @@ static void mark_node_class (unsigned *bitfield, unsigned node) { - if ((node >= 0) && (node < SSA_FIXED_MINIMUM)) + if (node < SSA_FIXED_MINIMUM) BITSET_SET(bitfield, node); } @@ -392,10 +174,12 @@ { size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD); - /* Bitfields for the various types of registers we could have */ + /* Bitfields for the various types of registers we could have. aluw can + * be written by either ALU or load/store */ unsigned *alur = calloc(sz, 1); unsigned *aluw = calloc(sz, 1); + unsigned *brar = calloc(sz, 1); unsigned *ldst = calloc(sz, 1); unsigned *texr = calloc(sz, 1); unsigned *texw = calloc(sz, 1); @@ -405,22 +189,28 @@ mir_foreach_instr_global(ctx, ins) { switch (ins->type) { case TAG_ALU_4: - mark_node_class(aluw, ins->ssa_args.dest); - mark_node_class(alur, ins->ssa_args.src[0]); - mark_node_class(alur, ins->ssa_args.src[1]); + mark_node_class(aluw, ins->dest); + mark_node_class(alur, ins->src[0]); + mark_node_class(alur, ins->src[1]); + mark_node_class(alur, ins->src[2]); + + if (ins->compact_branch && ins->writeout) + mark_node_class(brar, ins->src[0]); + break; case TAG_LOAD_STORE_4: - mark_node_class(ldst, ins->ssa_args.src[0]); - mark_node_class(ldst, ins->ssa_args.src[1]); - mark_node_class(ldst, ins->ssa_args.src[2]); + mark_node_class(aluw, ins->dest); + mark_node_class(ldst, ins->src[0]); + mark_node_class(ldst, ins->src[1]); + mark_node_class(ldst, ins->src[2]); break; case TAG_TEXTURE_4: - mark_node_class(texr, ins->ssa_args.src[0]); - mark_node_class(texr, ins->ssa_args.src[1]); - mark_node_class(texr, ins->ssa_args.src[2]); - mark_node_class(texw, ins->ssa_args.dest); + mark_node_class(texr, ins->src[0]); + mark_node_class(texr, ins->src[1]); + mark_node_class(texr, ins->src[2]); + mark_node_class(texw, ins->dest); break; } } @@ -438,6 +228,7 @@ for (unsigned i = 0; i < ctx->temp_count; ++i) { bool is_alur = BITSET_TEST(alur, i); bool is_aluw = BITSET_TEST(aluw, i); + bool is_brar = BITSET_TEST(brar, i); bool is_ldst = BITSET_TEST(ldst, i); bool is_texr = BITSET_TEST(texr, i); bool is_texw = BITSET_TEST(texw, i); @@ -452,7 +243,8 @@ (is_alur && (is_ldst || is_texr)) || (is_ldst && (is_alur || is_texr || is_texw)) || (is_texr && (is_alur || is_ldst || is_texw)) || - (is_texw && (is_aluw || is_ldst || is_texr)); + (is_texw && (is_aluw || is_ldst || is_texr)) || + (is_brar && is_texw); if (!collision) continue; @@ -460,8 +252,8 @@ /* Use the index as-is as the work copy. Emit copies for * special uses */ - unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4 }; - bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw }; + unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, TAG_ALU_4}; + bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw, is_brar }; for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) { if (!collisions[j]) continue; @@ -476,8 +268,7 @@ unsigned idx = spill_idx++; midgard_instruction m = hazard_write ? - v_mov(idx, blank_alu_src, i) : - v_mov(i, blank_alu_src, idx); + v_mov(idx, i) : v_mov(i, idx); /* Insert move before each read/write, depending on the * hazard we're trying to account for */ @@ -487,7 +278,7 @@ continue; if (hazard_write) { - if (pre_use->ssa_args.dest != i) + if (pre_use->dest != i) continue; } else { if (!mir_has_arg(pre_use, i)) @@ -497,13 +288,13 @@ if (hazard_write) { midgard_instruction *use = mir_next_op(pre_use); assert(use); - mir_insert_instruction_before(use, m); + mir_insert_instruction_before(ctx, use, m); mir_rewrite_index_dst_single(pre_use, i, idx); } else { idx = spill_idx++; - m = v_mov(i, blank_alu_src, idx); - m.mask = mir_mask_of_read_components(pre_use, i); - mir_insert_instruction_before(pre_use, m); + m = v_mov(i, idx); + m.mask = mir_from_bytemask(mir_bytemask_of_read_components(pre_use, i), midgard_reg_mode_32); + mir_insert_instruction_before(ctx, pre_use, m); mir_rewrite_index_src_single(pre_use, i, idx); } } @@ -512,204 +303,181 @@ free(alur); free(aluw); + free(brar); free(ldst); free(texr); free(texw); } -/* Routines for liveness analysis */ +/* We register allocate after scheduling, so we need to ensure instructions + * executing in parallel within a segment of a bundle don't clobber each + * other's registers. This is mostly a non-issue thanks to scheduling, but + * there are edge cases. In particular, after a register is written in a + * segment, it interferes with anything reading. */ static void -liveness_gen(uint8_t *live, unsigned node, unsigned max, unsigned mask) -{ - if ((node < 0) || (node >= max)) - return; - - live[node] |= mask; -} - -static void -liveness_kill(uint8_t *live, unsigned node, unsigned max, unsigned mask) -{ - if ((node < 0) || (node >= max)) - return; - - live[node] &= ~mask; -} - -/* Updates live_in for a single instruction */ +mir_compute_segment_interference( + compiler_context *ctx, + struct lcra_state *l, + midgard_bundle *bun, + unsigned pivot, + unsigned i) +{ + for (unsigned j = pivot; j < i; ++j) { + mir_foreach_src(bun->instructions[j], s) { + if (bun->instructions[j]->src[s] >= ctx->temp_count) + continue; -static void -liveness_ins_update(uint8_t *live, midgard_instruction *ins, unsigned max) -{ - /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ + for (unsigned q = pivot; q < i; ++q) { + if (bun->instructions[q]->dest >= ctx->temp_count) + continue; - liveness_kill(live, ins->ssa_args.dest, max, ins->mask); + /* See dEQP-GLES2.functional.shaders.return.output_write_in_func_dynamic_fragment */ - mir_foreach_src(ins, src) { - unsigned node = ins->ssa_args.src[src]; - unsigned mask = mir_mask_of_read_components(ins, node); + if (q >= j) { + if (!(bun->instructions[j]->unit == UNIT_SMUL && bun->instructions[q]->unit == UNIT_VLUT)) + continue; + } - liveness_gen(live, node, max, mask); + unsigned mask = mir_bytemask(bun->instructions[q]); + unsigned rmask = mir_bytemask_of_read_components(bun->instructions[j], bun->instructions[j]->src[s]); + lcra_add_node_interference(l, bun->instructions[q]->dest, mask, bun->instructions[j]->src[s], rmask); + } + } } } -/* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ - static void -liveness_block_live_out(compiler_context *ctx, midgard_block *blk) -{ - mir_foreach_successor(blk, succ) { - for (unsigned i = 0; i < ctx->temp_count; ++i) - blk->live_out[i] |= succ->live_in[i]; - } -} - -/* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, - * we compute live_out from live_in. The intrablock pass is linear-time. It - * returns whether progress was made. */ - -static bool -liveness_block_update(compiler_context *ctx, midgard_block *blk) +mir_compute_bundle_interference( + compiler_context *ctx, + struct lcra_state *l, + midgard_bundle *bun) { - bool progress = false; - - liveness_block_live_out(ctx, blk); - - uint8_t *live = mem_dup(blk->live_out, ctx->temp_count); - - mir_foreach_instr_in_block_rev(blk, ins) - liveness_ins_update(live, ins, ctx->temp_count); + if (!IS_ALU(bun->tag)) + return; - /* To figure out progress, diff live_in */ + bool old = bun->instructions[0]->unit >= UNIT_VADD; + unsigned pivot = 0; - for (unsigned i = 0; (i < ctx->temp_count) && !progress; ++i) - progress |= (blk->live_in[i] != live[i]); + for (unsigned i = 1; i < bun->instruction_count; ++i) { + bool new = bun->instructions[i]->unit >= UNIT_VADD; - free(blk->live_in); - blk->live_in = live; + if (old != new) { + mir_compute_segment_interference(ctx, l, bun, 0, i); + pivot = i; + break; + } + } - return progress; + mir_compute_segment_interference(ctx, l, bun, pivot, bun->instruction_count); } -/* Globally, liveness analysis uses a fixed-point algorithm based on a - * worklist. We initialize a work list with the exit block. We iterate the work - * list to compute live_in from live_out for each block on the work list, - * adding the predecessors of the block to the work list if we made progress. - */ - static void -mir_compute_liveness( +mir_compute_interference( compiler_context *ctx, - struct ra_graph *g) + struct lcra_state *l) { - /* List of midgard_block */ - struct set *work_list; + /* First, we need liveness information to be computed per block */ + mir_compute_liveness(ctx); - work_list = _mesa_set_create(ctx, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + /* We need to force r1.w live throughout a blend shader */ - /* Allocate */ + if (ctx->is_blend) { + unsigned r1w = ~0; - mir_foreach_block(ctx, block) { - block->live_in = calloc(ctx->temp_count, 1); - block->live_out = calloc(ctx->temp_count, 1); - } - - /* Initialize the work list with the exit block */ - struct set_entry *cur; - - midgard_block *exit = mir_exit_block(ctx); - cur = _mesa_set_add(work_list, exit); - - /* Iterate the work list */ - - do { - /* Pop off a block */ - midgard_block *blk = (struct midgard_block *) cur->key; - _mesa_set_remove(work_list, cur); - - /* Update its liveness information */ - bool progress = liveness_block_update(ctx, blk); + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block_rev(block, ins) { + if (ins->writeout) + r1w = ins->src[2]; + } - /* If we made progress, we need to process the predecessors */ + if (r1w != ~0) + break; + } - if (progress || (blk == exit)) { - mir_foreach_predecessor(blk, pred) - _mesa_set_add(work_list, pred); + mir_foreach_instr_global(ctx, ins) { + if (ins->dest < ctx->temp_count) + lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF); } - } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + } /* Now that every block has live_in/live_out computed, we can determine * interference by walking each block linearly. Take live_out at the * end of each block and walk the block backwards. */ mir_foreach_block(ctx, blk) { - uint8_t *live = calloc(ctx->temp_count, 1); - - mir_foreach_successor(blk, succ) { - for (unsigned i = 0; i < ctx->temp_count; ++i) - live[i] |= succ->live_in[i]; - } + uint16_t *live = mem_dup(blk->live_out, ctx->temp_count * sizeof(uint16_t)); mir_foreach_instr_in_block_rev(blk, ins) { /* Mark all registers live after the instruction as * interfering with the destination */ - unsigned dest = ins->ssa_args.dest; + unsigned dest = ins->dest; - if (dest >= 0 && dest < ctx->temp_count) { + if (dest < ctx->temp_count) { for (unsigned i = 0; i < ctx->temp_count; ++i) - if (live[i]) - ra_add_node_interference(g, dest, i); + if (live[i]) { + unsigned mask = mir_bytemask(ins); + lcra_add_node_interference(l, dest, mask, i, live[i]); + } } /* Update live_in */ - liveness_ins_update(live, ins, ctx->temp_count); + mir_liveness_ins_update(live, ins, ctx->temp_count); } - } - mir_foreach_block(ctx, blk) { - free(blk->live_in); - free(blk->live_out); + mir_foreach_bundle_in_block(blk, bun) + mir_compute_bundle_interference(ctx, l, bun); + + free(live); } } /* This routine performs the actual register allocation. It should be succeeded * by install_registers */ -struct ra_graph * +static struct lcra_state * allocate_registers(compiler_context *ctx, bool *spilled) { /* The number of vec4 work registers available depends on when the * uniforms start, so compute that first */ int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); - unsigned *classes = NULL; - struct ra_regs *regs = get_register_set(ctx->screen, work_count, &classes); - - assert(regs != NULL); - assert(classes != NULL); /* No register allocation to do with no SSA */ if (!ctx->temp_count) return NULL; - /* Let's actually do register allocation */ - int nodes = ctx->temp_count; - struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); - - /* Register class (as known to the Mesa register allocator) is actually - * the product of both semantic class (work, load/store, texture..) and - * size (vec2/vec3..). First, we'll go through and determine the - * minimum size needed to hold values */ + struct lcra_state *l = lcra_alloc_equations(ctx->temp_count, 1, 8, 16, 5); + + /* Starts of classes, in bytes */ + l->class_start[REG_CLASS_WORK] = 16 * 0; + l->class_start[REG_CLASS_LDST] = 16 * 26; + l->class_start[REG_CLASS_TEXR] = 16 * 28; + l->class_start[REG_CLASS_TEXW] = 16 * 28; + + l->class_size[REG_CLASS_WORK] = 16 * work_count; + l->class_size[REG_CLASS_LDST] = 16 * 2; + l->class_size[REG_CLASS_TEXR] = 16 * 2; + l->class_size[REG_CLASS_TEXW] = 16 * 2; + + lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW); + + /* To save space on T*20, we don't have real texture registers. + * Instead, tex inputs reuse the load/store pipeline registers, and + * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes, + * noting that this handles interferences and sizes correctly. */ + + if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) { + l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST]; + l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK]; + } unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count); mir_foreach_instr_global(ctx, ins) { - if (ins->ssa_args.dest < 0) continue; - if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + if (ins->dest >= SSA_FIXED_MINIMUM) continue; /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */ int class = util_logbase2(ins->mask); @@ -717,10 +485,33 @@ /* Use the largest class if there's ambiguity, this * handles partial writes */ - int dest = ins->ssa_args.dest; + int dest = ins->dest; found_class[dest] = MAX2(found_class[dest], class); + + /* XXX: Ensure swizzles align the right way with more LCRA constraints? */ + if (ins->type == TAG_ALU_4 && ins->alu.reg_mode != midgard_reg_mode_32) + min_alignment[dest] = 3; /* (1 << 3) = 8 */ + + if (ins->type == TAG_LOAD_STORE_4 && ins->load_64) + min_alignment[dest] = 3; + + /* We don't have a swizzle for the conditional and we don't + * want to muck with the conditional itself, so just force + * alignment for now */ + + if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->alu.op)) + min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */ + } + for (unsigned i = 0; i < ctx->temp_count; ++i) { + lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2); + lcra_restrict_range(l, i, (found_class[i] + 1) * 4); + } + + free(found_class); + free(min_alignment); + /* Next, we'll determine semantic class. We default to zero (work). * But, if we're used with a special operation, that will force us to a * particular class. Each node must be assigned to exactly one class; a @@ -732,53 +523,61 @@ /* Check if this operation imposes any classes */ if (ins->type == TAG_LOAD_STORE_4) { - bool force_vec4_only = OP_IS_VEC4_ONLY(ins->load_store.op); - - set_class(found_class, ins->ssa_args.src[0], REG_CLASS_LDST); - set_class(found_class, ins->ssa_args.src[1], REG_CLASS_LDST); - set_class(found_class, ins->ssa_args.src[2], REG_CLASS_LDST); - - if (force_vec4_only) { - force_vec4(found_class, ins->ssa_args.dest); - force_vec4(found_class, ins->ssa_args.src[0]); - force_vec4(found_class, ins->ssa_args.src[1]); - force_vec4(found_class, ins->ssa_args.src[2]); + set_class(l->class, ins->src[0], REG_CLASS_LDST); + set_class(l->class, ins->src[1], REG_CLASS_LDST); + set_class(l->class, ins->src[2], REG_CLASS_LDST); + + if (OP_IS_VEC4_ONLY(ins->load_store.op)) { + lcra_restrict_range(l, ins->dest, 16); + lcra_restrict_range(l, ins->src[0], 16); + lcra_restrict_range(l, ins->src[1], 16); + lcra_restrict_range(l, ins->src[2], 16); } } else if (ins->type == TAG_TEXTURE_4) { - set_class(found_class, ins->ssa_args.dest, REG_CLASS_TEXW); - set_class(found_class, ins->ssa_args.src[0], REG_CLASS_TEXR); - set_class(found_class, ins->ssa_args.src[1], REG_CLASS_TEXR); - set_class(found_class, ins->ssa_args.src[2], REG_CLASS_TEXR); + set_class(l->class, ins->dest, REG_CLASS_TEXW); + set_class(l->class, ins->src[0], REG_CLASS_TEXR); + set_class(l->class, ins->src[1], REG_CLASS_TEXR); + set_class(l->class, ins->src[2], REG_CLASS_TEXR); + set_class(l->class, ins->src[3], REG_CLASS_TEXR); + + /* Texture offsets need to be aligned to vec4, since + * the swizzle for x is forced to x in hardware, while + * the other components are free. TODO: Relax to 8 for + * half-registers if that ever occurs. */ + + //lcra_restrict_range(l, ins->src[3], 16); } } /* Check that the semantics of the class are respected */ mir_foreach_instr_global(ctx, ins) { - assert(check_write_class(found_class, ins->type, ins->ssa_args.dest)); - assert(check_read_class(found_class, ins->type, ins->ssa_args.src[0])); - assert(check_read_class(found_class, ins->type, ins->ssa_args.src[1])); - assert(check_read_class(found_class, ins->type, ins->ssa_args.src[2])); + assert(check_write_class(l->class, ins->type, ins->dest)); + assert(check_read_class(l->class, ins->type, ins->src[0])); + assert(check_read_class(l->class, ins->type, ins->src[1])); + assert(check_read_class(l->class, ins->type, ins->src[2])); } - for (unsigned i = 0; i < ctx->temp_count; ++i) { - unsigned class = found_class[i]; - ra_set_node_class(g, i, classes[class]); - } + /* Mark writeout to r0, render target to r1.z, unknown to r1.w */ + mir_foreach_instr_global(ctx, ins) { + if (!(ins->compact_branch && ins->writeout)) continue; - mir_compute_liveness(ctx, g); + if (ins->src[0] < ctx->temp_count) + l->solutions[ins->src[0]] = 0; - if (!ra_allocate(g)) { - *spilled = true; - } else { - *spilled = false; - } + if (ins->src[1] < ctx->temp_count) + l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4; - /* Whether we were successful or not, report the graph so we can - * compute spill nodes */ + if (ins->src[2] < ctx->temp_count) + l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_W * 4; + } + + mir_compute_interference(ctx, l); - return g; + *spilled = !lcra_solve(l); + return l; } + /* Once registers have been decided via register allocation * (allocate_registers), we need to rewrite the MIR to use registers instead of * indices */ @@ -786,34 +585,34 @@ static void install_registers_instr( compiler_context *ctx, - struct ra_graph *g, + struct lcra_state *l, midgard_instruction *ins) { - ssa_args args = ins->ssa_args; - switch (ins->type) { - case TAG_ALU_4: { - struct phys_reg src1 = index_to_reg(ctx, g, args.src[0]); - struct phys_reg src2 = index_to_reg(ctx, g, args.src[1]); - struct phys_reg dest = index_to_reg(ctx, g, args.dest); - - unsigned uncomposed_mask = ins->mask; - ins->mask = compose_writemask(uncomposed_mask, dest); - - /* Adjust the dest mask if necessary. Mostly this is a no-op - * but it matters for dot products */ - dest.mask = effective_writemask(&ins->alu, ins->mask); - - midgard_vector_alu_src mod1 = - vector_alu_from_unsigned(ins->alu.src1); - mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); - ins->alu.src1 = vector_alu_srco_unsigned(mod1); + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: { + if (ins->compact_branch) + return; + + struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], mir_srcsize(ins, 0)); + struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], mir_srcsize(ins, 1)); + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, mir_typesize(ins)); + + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); + + unsigned dest_offset = + GET_CHANNEL_COUNT(alu_opcode_props[ins->alu.op].props) ? 0 : + dest.offset; + + offset_swizzle(ins->swizzle[0], src1.offset, src1.size, dest_offset); ins->registers.src1_reg = src1.reg; - ins->registers.src2_imm = args.inline_constant; + ins->registers.src2_imm = ins->has_inline_constant; - if (args.inline_constant) { + if (ins->has_inline_constant) { /* Encode inline 16-bit constant. See disassembler for * where the algorithm is from */ @@ -827,8 +626,7 @@ } else { midgard_vector_alu_src mod2 = vector_alu_from_unsigned(ins->alu.src2); - mod2.swizzle = compose_swizzle( - mod2.swizzle, uncomposed_mask, src2, dest); + offset_swizzle(ins->swizzle[1], src2.offset, src2.size, dest_offset); ins->alu.src2 = vector_alu_srco_unsigned(mod2); ins->registers.src2_reg = src2.reg; @@ -846,57 +644,35 @@ bool encodes_src = OP_IS_STORE(ins->load_store.op); if (encodes_src) { - struct phys_reg src = index_to_reg(ctx, g, args.src[0]); + struct phys_reg src = index_to_reg(ctx, l, ins->src[0], mir_srcsize(ins, 0)); assert(src.reg == 26 || src.reg == 27); ins->load_store.reg = src.reg - 26; - - unsigned shift = __builtin_ctz(src.mask); - unsigned adjusted_mask = src.mask >> shift; - assert(((adjusted_mask + 1) & adjusted_mask) == 0); - - unsigned new_swizzle = 0; - for (unsigned q = 0; q < 4; ++q) { - unsigned c = (ins->load_store.swizzle >> (2*q)) & 3; - new_swizzle |= (c + shift) << (2*q); - } - - ins->load_store.swizzle = compose_swizzle( - new_swizzle, src.mask, - default_phys_reg(0), src); + offset_swizzle(ins->swizzle[0], src.offset, src.size, 0); } else { - unsigned r = encodes_src ? - args.src[0] : args.dest; - - struct phys_reg src = index_to_reg(ctx, g, r); - - ins->load_store.reg = src.reg; + struct phys_reg dst = index_to_reg(ctx, l, ins->dest, mir_typesize(ins)); - ins->load_store.swizzle = compose_swizzle( - ins->load_store.swizzle, 0xF, - default_phys_reg(0), src); - - ins->mask = compose_writemask( - ins->mask, src); + ins->load_store.reg = dst.reg; + offset_swizzle(ins->swizzle[0], 0, 4, dst.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset); } /* We also follow up by actual arguments */ - int src2 = - encodes_src ? args.src[1] : args.src[0]; - - int src3 = - encodes_src ? args.src[2] : args.src[1]; + unsigned src2 = ins->src[1]; + unsigned src3 = ins->src[2]; - if (src2 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src2); - unsigned component = __builtin_ctz(src.mask); + if (src2 != ~0) { + struct phys_reg src = index_to_reg(ctx, l, src2, mir_srcsize(ins, 1)); + unsigned component = src.offset / src.size; + assert(component * src.size == src.offset); ins->load_store.arg_1 |= midgard_ldst_reg(src.reg, component); } - if (src3 >= 0) { - struct phys_reg src = index_to_reg(ctx, g, src3); - unsigned component = __builtin_ctz(src.mask); + if (src3 != ~0) { + struct phys_reg src = index_to_reg(ctx, l, src3, mir_srcsize(ins, 2)); + unsigned component = src.offset / src.size; + assert(component * src.size == src.offset); ins->load_store.arg_2 |= midgard_ldst_reg(src.reg, component); } @@ -905,35 +681,31 @@ case TAG_TEXTURE_4: { /* Grab RA results */ - struct phys_reg dest = index_to_reg(ctx, g, args.dest); - struct phys_reg coord = index_to_reg(ctx, g, args.src[0]); - struct phys_reg lod = index_to_reg(ctx, g, args.src[1]); - - assert(dest.reg == 28 || dest.reg == 29); - assert(coord.reg == 28 || coord.reg == 29); + struct phys_reg dest = index_to_reg(ctx, l, ins->dest, mir_typesize(ins)); + struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], mir_srcsize(ins, 1)); + struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], mir_srcsize(ins, 2)); + struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], mir_srcsize(ins, 2)); /* First, install the texture coordinate */ ins->texture.in_reg_full = 1; ins->texture.in_reg_upper = 0; - ins->texture.in_reg_select = coord.reg - 28; - ins->texture.in_reg_swizzle = - compose_swizzle(ins->texture.in_reg_swizzle, 0xF, coord, dest); + ins->texture.in_reg_select = coord.reg & 1; + offset_swizzle(ins->swizzle[1], coord.offset, coord.size, 0); /* Next, install the destination */ ins->texture.out_full = 1; ins->texture.out_upper = 0; - ins->texture.out_reg_select = dest.reg - 28; - ins->texture.swizzle = - compose_swizzle(ins->texture.swizzle, dest.mask, dest, dest); - ins->mask = - compose_writemask(ins->mask, dest); + ins->texture.out_reg_select = dest.reg & 1; + offset_swizzle(ins->swizzle[0], 0, 4, dest.offset); + mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset); /* If there is a register LOD/bias, use it */ - if (args.src[1] > -1) { + if (ins->src[2] != ~0) { + assert(!(lod.offset & 3)); midgard_tex_register_select sel = { - .select = lod.reg, + .select = lod.reg & 1, .full = 1, - .component = lod.swizzle & 3, + .component = lod.offset / 4 }; uint8_t packed; @@ -941,6 +713,24 @@ ins->texture.bias = packed; } + /* If there is an offset register, install it */ + if (ins->src[3] != ~0) { + unsigned x = offset.offset / 4; + unsigned y = x + 1; + unsigned z = x + 2; + + /* Check range, TODO: half-registers */ + assert(z < 4); + + ins->texture.offset = + (1) | /* full */ + (offset.reg & 1) << 1 | /* select */ + (0 << 2) | /* upper */ + (x << 3) | /* swizzle */ + (y << 5) | /* swizzle */ + (z << 7); /* swizzle */ + } + break; } @@ -949,13 +739,210 @@ } } -void -install_registers(compiler_context *ctx, struct ra_graph *g) +static void +install_registers(compiler_context *ctx, struct lcra_state *l) { + mir_foreach_instr_global(ctx, ins) + install_registers_instr(ctx, l, ins); +} + + +/* If register allocation fails, find the best spill node */ + +static signed +mir_choose_spill_node( + compiler_context *ctx, + struct lcra_state *l) +{ + /* We can't spill a previously spilled value or an unspill */ + + mir_foreach_instr_global(ctx, ins) { + if (ins->no_spill & (1 << l->spill_class)) { + lcra_set_node_spill_cost(l, ins->dest, -1); + + if (l->spill_class != REG_CLASS_WORK) { + mir_foreach_src(ins, s) + lcra_set_node_spill_cost(l, ins->src[s], -1); + } + } + } + + return lcra_get_best_spill_node(l); +} + +/* Once we've chosen a spill node, spill it */ + +static void +mir_spill_register( + compiler_context *ctx, + unsigned spill_node, + unsigned spill_class, + unsigned *spill_count) +{ + unsigned spill_index = ctx->temp_count; + + /* We have a spill node, so check the class. Work registers + * legitimately spill to TLS, but special registers just spill to work + * registers */ + + bool is_special = spill_class != REG_CLASS_WORK; + bool is_special_w = spill_class == REG_CLASS_TEXW; + + /* Allocate TLS slot (maybe) */ + unsigned spill_slot = !is_special ? (*spill_count)++ : 0; + + /* For TLS, replace all stores to the spilled node. For + * special reads, just keep as-is; the class will be demoted + * implicitly. For special writes, spill to a work register */ + + if (!is_special || is_special_w) { + if (is_special_w) + spill_slot = spill_index++; + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->dest != spill_node) continue; + + midgard_instruction st; + + if (is_special_w) { + st = v_mov(spill_node, spill_slot); + st.no_spill |= (1 << spill_class); + } else { + ins->dest = spill_index++; + ins->no_spill |= (1 << spill_class); + st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask); + } + + /* Hint: don't rewrite this node */ + st.hint = true; + + mir_insert_instruction_after_scheduled(ctx, block, ins, st); + + if (!is_special) + ctx->spills++; + } + } + } + + /* For special reads, figure out how many bytes we need */ + unsigned read_bytemask = 0; + + mir_foreach_instr_global_safe(ctx, ins) { + read_bytemask |= mir_bytemask_of_read_components(ins, spill_node); + } + + /* Insert a load from TLS before the first consecutive + * use of the node, rewriting to use spilled indices to + * break up the live range. Or, for special, insert a + * move. Ironically the latter *increases* register + * pressure, but the two uses of the spilling mechanism + * are somewhat orthogonal. (special spilling is to use + * work registers to back special registers; TLS + * spilling is to use memory to back work registers) */ + mir_foreach_block(ctx, block) { mir_foreach_instr_in_block(block, ins) { - install_registers_instr(ctx, g, ins); + /* We can't rewrite the moves used to spill in the + * first place. These moves are hinted. */ + if (ins->hint) continue; + + /* If we don't use the spilled value, nothing to do */ + if (!mir_has_arg(ins, spill_node)) continue; + + unsigned index = 0; + + if (!is_special_w) { + index = ++spill_index; + + midgard_instruction *before = ins; + midgard_instruction st; + + if (is_special) { + /* Move */ + st = v_mov(spill_node, index); + st.no_spill |= (1 << spill_class); + } else { + /* TLS load */ + st = v_load_store_scratch(index, spill_slot, false, 0xF); + } + + /* Mask the load based on the component count + * actually needed to prevent RA loops */ + + st.mask = mir_from_bytemask(read_bytemask, midgard_reg_mode_32); + + mir_insert_instruction_before_scheduled(ctx, block, before, st); + } else { + /* Special writes already have their move spilled in */ + index = spill_slot; + } + + + /* Rewrite to use */ + mir_rewrite_index_src_single(ins, spill_node, index); + + if (!is_special) + ctx->fills++; + } + } + + /* Reset hints */ + + mir_foreach_instr_global(ctx, ins) { + ins->hint = false; + } +} + +/* Run register allocation in a loop, spilling until we succeed */ + +void +mir_ra(compiler_context *ctx) +{ + struct lcra_state *l = NULL; + bool spilled = false; + int iter_count = 1000; /* max iterations */ + + /* Number of 128-bit slots in memory we've spilled into */ + unsigned spill_count = 0; + + + mir_create_pipeline_registers(ctx); + + do { + if (spilled) { + signed spill_node = mir_choose_spill_node(ctx, l); + + if (spill_node == -1) { + fprintf(stderr, "ERROR: Failed to choose spill node\n"); + return; + } + + mir_spill_register(ctx, spill_node, l->spill_class, &spill_count); + } + + mir_squeeze_index(ctx); + mir_invalidate_liveness(ctx); + + if (l) { + lcra_free(l); + l = NULL; } + + l = allocate_registers(ctx, &spilled); + } while(spilled && ((iter_count--) > 0)); + + if (iter_count <= 0) { + fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); + assert(0); } + /* Report spilling information. spill_count is in 128-bit slots (vec4 x + * fp32), but tls_size is in bytes, so multiply by 16 */ + + ctx->tls_size = spill_count * 16; + + install_registers(ctx, l); + + lcra_free(l); } diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_ra_pipeline.c mesa-20.0.8/src/panfrost/midgard/midgard_ra_pipeline.c --- mesa-19.2.8/src/panfrost/midgard/midgard_ra_pipeline.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_ra_pipeline.c 2020-06-12 01:21:18.000000000 +0000 @@ -46,29 +46,35 @@ unsigned pipeline_count) { midgard_instruction *ins = bundle->instructions[i]; - unsigned dest = ins->ssa_args.dest; /* We could be pipelining a register, so we need to make sure that all * of the components read in this bundle are written in this bundle, * and that no components are written before this bundle */ - unsigned node = ins->ssa_args.dest; + unsigned node = ins->dest; unsigned read_mask = 0; - /* Analyze the bundle for a read mask */ + /* Analyze the bundle for a per-byte read mask */ - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *q = bundle->instructions[i]; - read_mask |= mir_mask_of_read_components(q, node); + for (unsigned j = 0; j < bundle->instruction_count; ++j) { + midgard_instruction *q = bundle->instructions[j]; + read_mask |= mir_bytemask_of_read_components(q, node); + + /* The fragment colour can't be pipelined (well, it is + * pipelined in r0, but this is a delicate dance with + * scheduling and RA, not for us to worry about) */ + + if (q->compact_branch && q->writeout && mir_has_arg(q, node)) + return false; } /* Now analyze for a write mask */ - for (unsigned i = 0; i < bundle->instruction_count; ++i) { - midgard_instruction *q = bundle->instructions[i]; - if (q->ssa_args.dest != node) continue; + for (unsigned j = 0; j < bundle->instruction_count; ++j) { + midgard_instruction *q = bundle->instructions[j]; + if (q->dest != node) continue; /* Remove the written mask from the read requirements */ - read_mask &= ~q->mask; + read_mask &= ~mir_bytemask(q); } /* Check for leftovers */ @@ -87,12 +93,12 @@ midgard_instruction *end = bundle->instructions[ bundle->instruction_count - 1]; - if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest)) + if (mir_is_live_after(ctx, block, end, ins->dest)) return false; /* We're only live in this bundle -- pipeline! */ - mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count)); + mir_rewrite_index(ctx, node, SSA_FIXED_REGISTER(24 + pipeline_count)); return true; } @@ -100,6 +106,8 @@ void mir_create_pipeline_registers(compiler_context *ctx) { + mir_invalidate_liveness(ctx); + mir_foreach_block(ctx, block) { mir_foreach_bundle_in_block(block, bundle) { if (!mir_is_alu_bundle(bundle)) continue; diff -Nru mesa-19.2.8/src/panfrost/midgard/midgard_schedule.c mesa-20.0.8/src/panfrost/midgard/midgard_schedule.c --- mesa-19.2.8/src/panfrost/midgard/midgard_schedule.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/midgard_schedule.c 2020-06-12 01:21:18.000000000 +0000 @@ -23,941 +23,1123 @@ #include "compiler.h" #include "midgard_ops.h" +#include "midgard_quirks.h" #include "util/u_memory.h" -#include "util/register_allocate.h" -/* Create a mask of accessed components from a swizzle to figure out vector - * dependencies */ +/* Scheduling for Midgard is complicated, to say the least. ALU instructions + * must be grouped into VLIW bundles according to following model: + * + * [VMUL] [SADD] + * [VADD] [SMUL] [VLUT] + * + * A given instruction can execute on some subset of the units (or a few can + * execute on all). Instructions can be either vector or scalar; only scalar + * instructions can execute on SADD/SMUL units. Units on a given line execute + * in parallel. Subsequent lines execute separately and can pass results + * directly via pipeline registers r24/r25, bypassing the register file. + * + * A bundle can optionally have 128-bits of embedded constants, shared across + * all of the instructions within a bundle. + * + * Instructions consuming conditionals (branches and conditional selects) + * require their condition to be written into the conditional register (r31) + * within the same bundle they are consumed. + * + * Fragment writeout requires its argument to be written in full within the + * same bundle as the branch, with no hanging dependencies. + * + * Load/store instructions are also in bundles of simply two instructions, and + * texture instructions have no bundling. + * + * ------------------------------------------------------------------------- + * + */ -static unsigned -swizzle_to_access_mask(unsigned swizzle) +/* We create the dependency graph with per-byte granularity */ + +#define BYTE_COUNT 16 + +static void +add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child) { - unsigned component_mask = 0; + for (unsigned i = 0; i < BYTE_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; - for (int i = 0; i < 4; ++i) { - unsigned c = (swizzle >> (2 * i)) & 3; - component_mask |= (1 << c); - } + struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i]; - return component_mask; -} + util_dynarray_foreach(parents, unsigned, parent) { + BITSET_WORD *dependents = instructions[*parent]->dependents; -/* Does the mask cover more than a scalar? */ + /* Already have the dependency */ + if (BITSET_TEST(dependents, child)) + continue; -static bool -is_single_component_mask(unsigned mask) + BITSET_SET(dependents, child); + instructions[child]->nr_dependencies++; + } + } +} + +static void +mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent) { - int components = 0; + for (unsigned i = 0; i < BYTE_COUNT; ++i) { + if (!(mask & (1 << i))) + continue; - for (int c = 0; c < 8; ++c) { - if (mask & (1 << c)) - components++; + util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent); } - - return components == 1; } -/* Checks for an SSA data hazard between two adjacent instructions, keeping in - * mind that we are a vector architecture and we can write to different - * components simultaneously */ - -static bool -can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +static void +mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count) { - /* Writeout has its own rules anyway */ - if (first->compact_branch || second->compact_branch) - return true; + size_t sz = node_count * BYTE_COUNT; - /* Each instruction reads some registers and writes to a register. See - * where the first writes */ + struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz); + struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz); - int source = first->ssa_args.dest; - int source_mask = first->mask; + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_init(&last_read[i], NULL); + util_dynarray_init(&last_write[i], NULL); + } + + /* Initialize dependency graph */ + for (unsigned i = 0; i < count; ++i) { + instructions[i]->dependents = + calloc(BITSET_WORDS(count), sizeof(BITSET_WORD)); + + instructions[i]->nr_dependencies = 0; + } - /* As long as the second doesn't read from the first, we're okay */ - for (unsigned i = 0; i < ARRAY_SIZE(second->ssa_args.src); ++i) { - if (second->ssa_args.src[i] != source) + /* Populate dependency graph */ + for (signed i = count - 1; i >= 0; --i) { + if (instructions[i]->compact_branch) continue; - if (first->type != TAG_ALU_4) - return false; + unsigned dest = instructions[i]->dest; + unsigned mask = mir_bytemask(instructions[i]); - /* Figure out which components we just read from */ + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; - int q = (i == 0) ? second->alu.src1 : second->alu.src2; - midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + if (src < node_count) { + unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); + add_dependency(last_write, src, readmask, instructions, i); + } + } - /* Check if there are components in common, and fail if so */ - if (swizzle_to_access_mask(m->swizzle) & source_mask) - return false; + if (dest < node_count) { + add_dependency(last_read, dest, mask, instructions, i); + add_dependency(last_write, dest, mask, instructions, i); + mark_access(last_write, dest, mask, i); + } + + mir_foreach_src((*instructions), s) { + unsigned src = instructions[i]->src[s]; + + if (src < node_count) { + unsigned readmask = mir_bytemask_of_read_components(instructions[i], src); + mark_access(last_read, src, readmask, i); + } + } } - /* Otherwise, it's safe in that regard. Another data hazard is both - * writing to the same place, of course */ + /* If there is a branch, all instructions depend on it, as interblock + * execution must be purely in-order */ - if (second->ssa_args.dest == source) { - /* ...but only if the components overlap */ + if (instructions[count - 1]->compact_branch) { + BITSET_WORD *dependents = instructions[count - 1]->dependents; - if (second->mask & source_mask) - return false; + for (signed i = count - 2; i >= 0; --i) { + if (BITSET_TEST(dependents, i)) + continue; + + BITSET_SET(dependents, i); + instructions[i]->nr_dependencies++; + } } - /* ...That's it */ - return true; + /* Free the intermediate structures */ + for (unsigned i = 0; i < sz; ++i) { + util_dynarray_fini(&last_read[i]); + util_dynarray_fini(&last_write[i]); + } + + free(last_read); + free(last_write); } +/* Does the mask cover more than a scalar? */ + static bool -midgard_has_hazard( - midgard_instruction **segment, unsigned segment_size, - midgard_instruction *ains) +is_single_component_mask(unsigned mask) { - for (int s = 0; s < segment_size; ++s) - if (!can_run_concurrent_ssa(segment[s], ains)) - return true; - - return false; + int components = 0; + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + components++; + } + return components == 1; } -/* Fragment writeout (of r0) is allowed when: - * - * - All components of r0 are written in the bundle - * - No components of r0 are written in VLUT - * - Non-pipelined dependencies of r0 are not written in the bundle - * - * This function checks if these requirements are satisfied given the content - * of a scheduled bundle. - */ +/* Helpers for scheudling */ static bool -can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count) +mir_is_scalar(midgard_instruction *ains) { - /* First scan for which components of r0 are written out. Initially - * none are written */ + /* Do we try to use it as a vector op? */ + if (!is_single_component_mask(ains->mask)) + return false; - uint8_t r0_written_mask = 0x0; + /* Otherwise, check mode hazards */ + bool could_scalar = true; - /* Simultaneously we scan for the set of dependencies */ - BITSET_WORD *dependencies = calloc(sizeof(BITSET_WORD), BITSET_WORDS(node_count)); + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; + could_scalar &= ains->alu.dest_override == midgard_dest_override_none; - for (unsigned i = 0; i < count; ++i) { - midgard_instruction *ins = bundle[i]; + if (ains->alu.reg_mode == midgard_reg_mode_16) { + /* If we're running in 16-bit mode, we + * can't have any 8-bit sources on the + * scalar unit (since the scalar unit + * doesn't understand 8-bit) */ - if (ins->ssa_args.dest != SSA_FIXED_REGISTER(0)) - continue; + midgard_vector_alu_src s1 = + vector_alu_from_unsigned(ains->alu.src1); - /* Record written out mask */ - r0_written_mask |= ins->mask; + could_scalar &= !s1.half; - /* Record dependencies, but only if they won't become pipeline - * registers. We know we can't be live after this, because - * we're writeout at the very end of the shader. So check if - * they were written before us. */ + midgard_vector_alu_src s2 = + vector_alu_from_unsigned(ains->alu.src2); - unsigned src0 = ins->ssa_args.src[0]; - unsigned src1 = ins->ssa_args.src[1]; + could_scalar &= !s2.half; + } - if (!mir_is_written_before(ctx, bundle[0], src0)) - src0 = -1; + return could_scalar; +} - if (!mir_is_written_before(ctx, bundle[0], src1)) - src1 = -1; +/* How many bytes does this ALU instruction add to the bundle? */ - if ((src0 > 0) && (src0 < node_count)) - BITSET_SET(dependencies, src0); +static unsigned +bytes_for_instruction(midgard_instruction *ains) +{ + if (ains->unit & UNITS_ANY_VECTOR) + return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); + else if (ains->unit == ALU_ENAB_BRANCH) + return sizeof(midgard_branch_extended); + else if (ains->compact_branch) + return sizeof(ains->br_compact); + else + return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu); +} - if ((src1 > 0) && (src1 < node_count)) - BITSET_SET(dependencies, src1); +/* We would like to flatten the linked list of midgard_instructions in a bundle + * to an array of pointers on the heap for easy indexing */ - /* Requirement 2 */ - if (ins->unit == UNIT_VLUT) - return false; - } +static midgard_instruction ** +flatten_mir(midgard_block *block, unsigned *len) +{ + *len = list_length(&block->instructions); - /* Requirement 1 */ - if ((r0_written_mask & 0xF) != 0xF) - return false; + if (!(*len)) + return NULL; - /* Requirement 3 */ + midgard_instruction **instructions = + calloc(sizeof(midgard_instruction *), *len); - for (unsigned i = 0; i < count; ++i) { - unsigned dest = bundle[i]->ssa_args.dest; + unsigned i = 0; - if (dest < node_count && BITSET_TEST(dependencies, dest)) - return false; - } + mir_foreach_instr_in_block(block, ins) + instructions[i++] = ins; - /* Otherwise, we're good to go */ - return true; + return instructions; } -/* Schedules, but does not emit, a single basic block. After scheduling, the - * final tag and size of the block are known, which are necessary for branching - * */ +/* The worklist is the set of instructions that can be scheduled now; that is, + * the set of instructions with no remaining dependencies */ -static midgard_bundle -schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) +static void +mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count) { - int instructions_emitted = 0, packed_idx = 0; - midgard_bundle bundle = { 0 }; + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->nr_dependencies == 0) + BITSET_SET(worklist, i); + } +} - midgard_instruction *scheduled[5] = { NULL }; +/* Update the worklist after an instruction terminates. Remove its edges from + * the graph and if that causes any node to have no dependencies, add it to the + * worklist */ - uint8_t tag = ins->type; +static void +mir_update_worklist( + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, midgard_instruction *done) +{ + /* Sanity check: if no instruction terminated, there is nothing to do. + * If the instruction that terminated had dependencies, that makes no + * sense and means we messed up the worklist. Finally, as the purpose + * of this routine is to update dependents, we abort early if there are + * no dependents defined. */ - /* Default to the instruction's tag */ - bundle.tag = tag; + if (!done) + return; - switch (ins->type) { - case TAG_ALU_4: { - uint32_t control = 0; - size_t bytes_emitted = sizeof(control); + assert(done->nr_dependencies == 0); - /* TODO: Constant combining */ - int index = 0, last_unit = 0; + if (!done->dependents) + return; - /* Previous instructions, for the purpose of parallelism */ - midgard_instruction *segment[4] = {0}; - int segment_size = 0; + /* We have an instruction with dependents. Iterate each dependent to + * remove one dependency (`done`), adding dependents to the worklist + * where possible. */ - instructions_emitted = -1; - midgard_instruction *pins = ins; + unsigned i; + BITSET_FOREACH_SET(i, done->dependents, count) { + assert(instructions[i]->nr_dependencies); - unsigned constant_count = 0; + if (!(--instructions[i]->nr_dependencies)) + BITSET_SET(worklist, i); + } - for (;;) { - midgard_instruction *ains = pins; + free(done->dependents); +} - /* Advance instruction pointer */ - if (index) { - ains = mir_next_op(pins); - pins = ains; - } +/* While scheduling, we need to choose instructions satisfying certain + * criteria. As we schedule backwards, we choose the *last* instruction in the + * worklist to simulate in-order scheduling. Chosen instructions must satisfy a + * given predicate. */ + +struct midgard_predicate { + /* TAG or ~0 for dont-care */ + unsigned tag; + + /* True if we want to pop off the chosen instruction */ + bool destructive; + + /* For ALU, choose only this unit */ + unsigned unit; + + /* State for bundle constants. constants is the actual constants + * for the bundle. constant_count is the number of bytes (up to + * 16) currently in use for constants. When picking in destructive + * mode, the constants array will be updated, and the instruction + * will be adjusted to index into the constants array */ + + midgard_constants *constants; + unsigned constant_mask; + bool blend_constant; + + /* Exclude this destination (if not ~0) */ + unsigned exclude; + + /* Don't schedule instructions consuming conditionals (since we already + * scheduled one). Excludes conditional branches and csel */ + bool no_cond; + + /* Require a minimal mask and (if nonzero) given destination. Used for + * writeout optimizations */ + + unsigned mask; + unsigned dest; +}; - /* Out-of-work condition */ - if ((struct list_head *) ains == &block->instructions) - break; +/* For an instruction that can fit, adjust it to fit and update the constants + * array, in destructive mode. Returns whether the fitting was successful. */ - /* Ensure that the chain can continue */ - if (ains->type != TAG_ALU_4) break; +static bool +mir_adjust_constants(midgard_instruction *ins, + struct midgard_predicate *pred, + bool destructive) +{ + /* Blend constants dominate */ + if (ins->has_blend_constant) { + if (pred->constant_mask) + return false; + else if (destructive) { + pred->blend_constant = true; + pred->constant_mask = 0xffff; + return true; + } + } - /* If there's already something in the bundle and we - * have weird scheduler constraints, break now */ - if (ains->precede_break && index) break; - - /* According to the presentation "The ARM - * Mali-T880 Mobile GPU" from HotChips 27, - * there are two pipeline stages. Branching - * position determined experimentally. Lines - * are executed in parallel: - * - * [ VMUL ] [ SADD ] - * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] - * - * Verify that there are no ordering dependencies here. - * - * TODO: Allow for parallelism!!! - */ + /* No constant, nothing to adjust */ + if (!ins->has_constants) + return true; - /* Pick a unit for it if it doesn't force a particular unit */ + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + midgard_reg_mode reg_mode = ins->alu.reg_mode; - int unit = ains->unit; + midgard_vector_alu_src const_src = { }; - if (!unit) { - int op = ains->alu.op; - int units = alu_opcode_props[op].props; - - bool scalarable = units & UNITS_SCALAR; - bool could_scalar = is_single_component_mask(ains->mask); - - /* Only 16/32-bit can run on a scalar unit */ - could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; - could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; - could_scalar &= ains->alu.dest_override == midgard_dest_override_none; - - if (ains->alu.reg_mode == midgard_reg_mode_16) { - /* If we're running in 16-bit mode, we - * can't have any 8-bit sources on the - * scalar unit (since the scalar unit - * doesn't understand 8-bit) */ - - midgard_vector_alu_src s1 = - vector_alu_from_unsigned(ains->alu.src1); - - could_scalar &= !s1.half; - - midgard_vector_alu_src s2 = - vector_alu_from_unsigned(ains->alu.src2); - - could_scalar &= !s2.half; - } - - bool scalar = could_scalar && scalarable; - - /* TODO: Check ahead-of-time for other scalar - * hazards that otherwise get aborted out */ - - if (scalar) - assert(units & UNITS_SCALAR); - - if (!scalar) { - if (last_unit >= UNIT_VADD) { - if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) - unit = UNIT_VMUL; - else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) - unit = UNIT_VADD; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } else { - if (last_unit >= UNIT_VADD) { - if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } else { - if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL)) - unit = UNIT_VMUL; - else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) - unit = UNIT_SADD; - else if (units & UNIT_VADD) - unit = UNIT_VADD; - else if (units & UNIT_SMUL) - unit = UNIT_SMUL; - else if (units & UNIT_VLUT) - unit = UNIT_VLUT; - else - break; - } - } + if (ins->src[0] == r_constant) + const_src = vector_alu_from_unsigned(ins->alu.src1); + else if (ins->src[1] == r_constant) + const_src = vector_alu_from_unsigned(ins->alu.src2); + + unsigned type_size = mir_bytes_for_mode(reg_mode); + + /* If the ALU is converting up we need to divide type_size by 2 */ + if (const_src.half) + type_size /= 2; + + unsigned max_comp = 16 / type_size; + unsigned comp_mask = mir_from_bytemask(mir_bytemask_of_read_components(ins, r_constant), + reg_mode); + unsigned type_mask = (1 << type_size) - 1; + unsigned bundle_constant_mask = pred->constant_mask; + unsigned comp_mapping[16] = { }; + uint8_t bundle_constants[16]; + + memcpy(bundle_constants, pred->constants, 16); + + /* Let's try to find a place for each active component of the constant + * register. + */ + for (unsigned comp = 0; comp < max_comp; comp++) { + if (!(comp_mask & (1 << comp))) + continue; - assert(unit & units); - } + uint8_t *constantp = ins->constants.u8 + (type_size * comp); + unsigned best_reuse_bytes = 0; + signed best_place = -1; + unsigned i, j; - /* Late unit check, this time for encoding (not parallelism) */ - if (unit <= last_unit) break; + for (i = 0; i < 16; i += type_size) { + unsigned reuse_bytes = 0; - /* Clear the segment */ - if (last_unit < UNIT_VADD && unit >= UNIT_VADD) - segment_size = 0; + for (j = 0; j < type_size; j++) { + if (!(bundle_constant_mask & (1 << (i + j)))) + continue; + if (constantp[j] != bundle_constants[i + j]) + break; - if (midgard_has_hazard(segment, segment_size, ains)) + reuse_bytes++; + } + + /* Select the place where existing bytes can be + * reused so we leave empty slots to others + */ + if (j == type_size && + (reuse_bytes > best_reuse_bytes || best_place < 0)) { + best_reuse_bytes = reuse_bytes; + best_place = i; break; + } + } - /* We're good to go -- emit the instruction */ - ains->unit = unit; + /* This component couldn't fit in the remaining constant slot, + * no need check the remaining components, bail out now + */ + if (best_place < 0) + return false; - segment[segment_size++] = ains; + memcpy(&bundle_constants[i], constantp, type_size); + bundle_constant_mask |= type_mask << best_place; + comp_mapping[comp] = best_place / type_size; + } - /* We try to reuse constants if possible, by adjusting - * the swizzle */ + /* If non-destructive, we're done */ + if (!destructive) + return true; - if (ains->has_blend_constant) { - /* Everything conflicts with the blend constant */ - if (bundle.has_embedded_constants) - break; + /* Otherwise update the constant_mask and constant values */ + pred->constant_mask = bundle_constant_mask; + memcpy(pred->constants, bundle_constants, 16); + + /* Use comp_mapping as a swizzle */ + mir_foreach_src(ins, s) { + if (ins->src[s] == r_constant) + mir_compose_swizzle(ins->swizzle[s], comp_mapping, ins->swizzle[s]); + } - bundle.has_blend_constant = 1; - bundle.has_embedded_constants = 1; - } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { - /* TODO: DRY with the analysis pass */ + return true; +} - if (bundle.has_blend_constant) - break; +static midgard_instruction * +mir_choose_instruction( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned count, + struct midgard_predicate *predicate) +{ + /* Parse the predicate */ + unsigned tag = predicate->tag; + bool alu = tag == TAG_ALU_4; + unsigned unit = predicate->unit; + bool branch = alu && (unit == ALU_ENAB_BR_COMPACT); + bool scalar = (unit != ~0) && (unit & UNITS_SCALAR); + bool no_cond = predicate->no_cond; - if (constant_count) - break; + unsigned mask = predicate->mask; + unsigned dest = predicate->dest; + bool needs_dest = mask & 0xF; - /* TODO: Fix packing XXX */ - uint16_t *bundles = (uint16_t *) bundle.constants; - uint32_t *constants = (uint32_t *) ains->constants; - - /* Copy them wholesale */ - for (unsigned i = 0; i < 4; ++i) - bundles[i] = constants[i]; - - bundle.has_embedded_constants = true; - constant_count = 4; - } else if (ains->has_constants) { - /* By definition, blend constants conflict with - * everything, so if there are already - * constants we break the bundle *now* */ + /* Iterate to find the best instruction satisfying the predicate */ + unsigned i; - if (bundle.has_blend_constant) - break; + signed best_index = -1; + bool best_conditional = false; - /* For anything but blend constants, we can do - * proper analysis, however */ + /* Enforce a simple metric limiting distance to keep down register + * pressure. TOOD: replace with liveness tracking for much better + * results */ - /* TODO: Mask by which are used */ - uint32_t *constants = (uint32_t *) ains->constants; - uint32_t *bundles = (uint32_t *) bundle.constants; - - uint32_t indices[4] = { 0 }; - bool break_bundle = false; - - for (unsigned i = 0; i < 4; ++i) { - uint32_t cons = constants[i]; - bool constant_found = false; - - /* Search for the constant */ - for (unsigned j = 0; j < constant_count; ++j) { - if (bundles[j] != cons) - continue; - - /* We found it, reuse */ - indices[i] = j; - constant_found = true; - break; - } - - if (constant_found) - continue; - - /* We didn't find it, so allocate it */ - unsigned idx = constant_count++; - - if (idx >= 4) { - /* Uh-oh, out of space */ - break_bundle = true; - break; - } - - /* We have space, copy it in! */ - bundles[idx] = cons; - indices[i] = idx; - } + unsigned max_active = 0; + unsigned max_distance = 6; - if (break_bundle) - break; + BITSET_FOREACH_SET(i, worklist, count) { + max_active = MAX2(max_active, i); + } - /* Cool, we have it in. So use indices as a - * swizzle */ + BITSET_FOREACH_SET(i, worklist, count) { + if ((max_active - i) >= max_distance) + continue; - unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); - unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + if (tag != ~0 && instructions[i]->type != tag) + continue; - if (ains->ssa_args.src[0] == r_constant) - ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); + if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude) + continue; - if (ains->ssa_args.src[1] == r_constant) - ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); + if (alu && !branch && !(alu_opcode_props[instructions[i]->alu.op].props & unit)) + continue; - bundle.has_embedded_constants = true; - } + if (branch && !instructions[i]->compact_branch) + continue; - if (ains->unit & UNITS_ANY_VECTOR) { - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_vector_alu); - } else if (ains->compact_branch) { - /* All of r0 has to be written out along with - * the branch writeout */ - - if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) { - /* We only work on full moves - * at the beginning. We could - * probably do better */ - if (index != 0) - break; - - /* Inject a move */ - midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); - ins.unit = UNIT_VMUL; - control |= ins.unit; - - /* TODO don't leak */ - midgard_instruction *move = - mem_dup(&ins, sizeof(midgard_instruction)); - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_vector_alu); - bundle.instructions[packed_idx++] = move; - } - - if (ains->unit == ALU_ENAB_BRANCH) { - bytes_emitted += sizeof(midgard_branch_extended); - } else { - bytes_emitted += sizeof(ains->br_compact); - } - } else { - bytes_emitted += sizeof(midgard_reg_info); - bytes_emitted += sizeof(midgard_scalar_alu); - } + if (alu && scalar && !mir_is_scalar(instructions[i])) + continue; - /* Defer marking until after writing to allow for break */ - scheduled[index] = ains; - control |= ains->unit; - last_unit = ains->unit; - ++instructions_emitted; - ++index; - } + if (alu && !mir_adjust_constants(instructions[i], predicate, false)) + continue; - int padding = 0; + if (needs_dest && instructions[i]->dest != dest) + continue; - /* Pad ALU op to nearest word */ + if (mask && ((~instructions[i]->mask) & mask)) + continue; - if (bytes_emitted & 15) { - padding = 16 - (bytes_emitted & 15); - bytes_emitted += padding; - } + bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->alu.op); + conditional |= (branch && instructions[i]->branch.conditional); - /* Constants must always be quadwords */ - if (bundle.has_embedded_constants) - bytes_emitted += 16; + if (conditional && no_cond) + continue; - /* Size ALU instruction for tag */ - bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; - bundle.padding = padding; - bundle.control = bundle.tag | control; + /* Simulate in-order scheduling */ + if ((signed) i < best_index) + continue; - break; + best_index = i; + best_conditional = conditional; } - case TAG_LOAD_STORE_4: { - /* Load store instructions have two words at once. If - * we only have one queued up, we need to NOP pad. - * Otherwise, we store both in succession to save space - * and cycles -- letting them go in parallel -- skip - * the next. The usefulness of this optimisation is - * greatly dependent on the quality of the instruction - * scheduler. - */ - midgard_instruction *next_op = mir_next_op(ins); + /* Did we find anything? */ - if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { - /* TODO: Concurrency check */ - instructions_emitted++; - } + if (best_index < 0) + return NULL; - break; - } + /* If we found something, remove it from the worklist */ + assert(best_index < count); - case TAG_TEXTURE_4: { - /* Which tag we use depends on the shader stage */ - bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; - bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; - break; - } + if (predicate->destructive) { + BITSET_CLEAR(worklist, best_index); - default: - unreachable("Unknown tag"); - break; + if (alu) + mir_adjust_constants(instructions[best_index], predicate, true); + + /* Once we schedule a conditional, we can't again */ + predicate->no_cond |= best_conditional; } - /* Copy the instructions into the bundle */ - bundle.instruction_count = instructions_emitted + 1 + packed_idx; + return instructions[best_index]; +} + +/* Still, we don't choose instructions in a vacuum. We need a way to choose the + * best bundle type (ALU, load/store, texture). Nondestructive. */ - midgard_instruction *uins = ins; - for (; packed_idx < bundle.instruction_count; ++packed_idx) { - bundle.instructions[packed_idx] = uins; - uins = mir_next_op(uins); - } +static unsigned +mir_choose_bundle( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned count) +{ + /* At the moment, our algorithm is very simple - use the bundle of the + * best instruction, regardless of what else could be scheduled + * alongside it. This is not optimal but it works okay for in-order */ + + struct midgard_predicate predicate = { + .tag = ~0, + .destructive = false, + .exclude = ~0 + }; - *skip = instructions_emitted; + midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate); - return bundle; + if (chosen) + return chosen->type; + else + return ~0; } -/* Schedule a single block by iterating its instruction to create bundles. - * While we go, tally about the bundle sizes to compute the block size. */ - +/* We want to choose an ALU instruction filling a given unit */ static void -schedule_block(compiler_context *ctx, midgard_block *block) +mir_choose_alu(midgard_instruction **slot, + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len, + struct midgard_predicate *predicate, + unsigned unit) { - util_dynarray_init(&block->bundles, NULL); + /* Did we already schedule to this slot? */ + if ((*slot) != NULL) + return; + + /* Try to schedule something, if not */ + predicate->unit = unit; + *slot = mir_choose_instruction(instructions, worklist, len, predicate); + + /* Store unit upon scheduling */ + if (*slot && !((*slot)->compact_branch)) + (*slot)->unit = unit; +} - block->quadword_count = 0; +/* When we are scheduling a branch/csel, we need the consumed condition in the + * same block as a pipeline register. There are two options to enable this: + * + * - Move the conditional into the bundle. Preferred, but only works if the + * conditional is used only once and is from this block. + * - Copy the conditional. + * + * We search for the conditional. If it's in this block, single-use, and + * without embedded constants, we schedule it immediately. Otherwise, we + * schedule a move for it. + * + * mir_comparison_mobile is a helper to find the moveable condition. + */ + +static unsigned +mir_comparison_mobile( + compiler_context *ctx, + midgard_instruction **instructions, + struct midgard_predicate *predicate, + unsigned count, + unsigned cond) +{ + if (!mir_single_use(ctx, cond)) + return ~0; - mir_foreach_instr_in_block(block, ins) { - int skip; - midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); - util_dynarray_append(&block->bundles, midgard_bundle, bundle); + unsigned ret = ~0; - if (bundle.has_blend_constant) { - /* TODO: Multiblock? */ - int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; - ctx->blend_constant_offset = quadwords_within_block * 0x10; - } + for (unsigned i = 0; i < count; ++i) { + if (instructions[i]->dest != cond) + continue; + + /* Must fit in an ALU bundle */ + if (instructions[i]->type != TAG_ALU_4) + return ~0; + + /* If it would itself require a condition, that's recursive */ + if (OP_IS_CSEL(instructions[i]->alu.op)) + return ~0; + + /* We'll need to rewrite to .w but that doesn't work for vector + * ops that don't replicate (ball/bany), so bail there */ + + if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props)) + return ~0; - while(skip--) - ins = mir_next_op(ins); + /* Ensure it will fit with constants */ - block->quadword_count += quadword_size(bundle.tag); + if (!mir_adjust_constants(instructions[i], predicate, false)) + return ~0; + + /* Ensure it is written only once */ + + if (ret != ~0) + return ~0; + else + ret = i; } - block->is_scheduled = true; + /* Inject constants now that we are sure we want to */ + if (ret != ~0) + mir_adjust_constants(instructions[ret], predicate, true); + + return ret; } -/* The following passes reorder MIR instructions to enable better scheduling */ +/* Using the information about the moveable conditional itself, we either pop + * that condition off the worklist for use now, or create a move to + * artificially schedule instead as a fallback */ -static void -midgard_pair_load_store(compiler_context *ctx, midgard_block *block) +static midgard_instruction * +mir_schedule_comparison( + compiler_context *ctx, + midgard_instruction **instructions, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + unsigned cond, bool vector, unsigned *swizzle, + midgard_instruction *user) { - mir_foreach_instr_in_block_safe(block, ins) { - if (ins->type != TAG_LOAD_STORE_4) continue; + /* TODO: swizzle when scheduling */ + unsigned comp_i = + (!vector && (swizzle[0] == 0)) ? + mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0; - /* We've found a load/store op. Check if next is also load/store. */ - midgard_instruction *next_op = mir_next_op(ins); - if (&next_op->link != &block->instructions) { - if (next_op->type == TAG_LOAD_STORE_4) { - /* If so, we're done since we're a pair */ - ins = mir_next_op(ins); - continue; - } + /* If we can, schedule the condition immediately */ + if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) { + assert(comp_i < count); + BITSET_CLEAR(worklist, comp_i); + return instructions[comp_i]; + } - /* Maximum search distance to pair, to avoid register pressure disasters */ - int search_distance = 8; + /* Otherwise, we insert a move */ - /* Otherwise, we have an orphaned load/store -- search for another load */ - mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { - /* Terminate search if necessary */ - if (!(search_distance--)) break; + midgard_instruction mov = v_mov(cond, cond); + mov.mask = vector ? 0xF : 0x1; + memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1])); - if (c->type != TAG_LOAD_STORE_4) continue; + return mir_insert_instruction_before(ctx, user, mov); +} - /* We can only reorder if there are no sources */ +/* Most generally, we need instructions writing to r31 in the appropriate + * components */ - bool deps = false; +static midgard_instruction * +mir_schedule_condition(compiler_context *ctx, + struct midgard_predicate *predicate, + BITSET_WORD *worklist, unsigned count, + midgard_instruction **instructions, + midgard_instruction *last) +{ + /* For a branch, the condition is the only argument; for csel, third */ + bool branch = last->compact_branch; + unsigned condition_index = branch ? 0 : 2; - for (unsigned s = 0; s < ARRAY_SIZE(ins->ssa_args.src); ++s) - deps |= (c->ssa_args.src[s] != -1); + /* csel_v is vector; otherwise, conditions are scalar */ + bool vector = !branch && OP_IS_CSEL_V(last->alu.op); - if (deps) - continue; + /* Grab the conditional instruction */ - /* We found one! Move it up to pair and remove it from the old location */ + midgard_instruction *cond = mir_schedule_comparison( + ctx, instructions, predicate, worklist, count, last->src[condition_index], + vector, last->swizzle[2], last); - mir_insert_instruction_before(ins, *c); - mir_remove_instruction(c); + /* We have exclusive reign over this (possibly move) conditional + * instruction. We can rewrite into a pipeline conditional register */ - break; - } + predicate->exclude = cond->dest; + cond->dest = SSA_FIXED_REGISTER(31); + + if (!vector) { + cond->mask = (1 << COMPONENT_W); + + mir_foreach_src(cond, s) { + if (cond->src[s] == ~0) + continue; + + for (unsigned q = 0; q < 4; ++q) + cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q]; } } + + /* Schedule the unit: csel is always in the latter pipeline, so a csel + * condition must be in the former pipeline stage (vmul/sadd), + * depending on scalar/vector of the instruction itself. A branch must + * be written from the latter pipeline stage and a branch condition is + * always scalar, so it is always in smul (exception: ball/bany, which + * will be vadd) */ + + if (branch) + cond->unit = UNIT_SMUL; + else + cond->unit = vector ? UNIT_VMUL : UNIT_SADD; + + return cond; } -/* When we're 'squeezing down' the values in the IR, we maintain a hash - * as such */ +/* Schedules a single bundle of the given type */ -static unsigned -find_or_allocate_temp(compiler_context *ctx, unsigned hash) +static midgard_bundle +mir_schedule_texture( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) { - if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) - return hash; - - unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( - ctx->hash_to_temp, hash + 1); + struct midgard_predicate predicate = { + .tag = TAG_TEXTURE_4, + .destructive = true, + .exclude = ~0 + }; - if (temp) - return temp - 1; + midgard_instruction *ins = + mir_choose_instruction(instructions, worklist, len, &predicate); - /* If no temp is find, allocate one */ - temp = ctx->temp_count++; - ctx->max_hash = MAX2(ctx->max_hash, hash); + mir_update_worklist(worklist, len, instructions, ins); - _mesa_hash_table_u64_insert(ctx->hash_to_temp, - hash + 1, (void *) ((uintptr_t) temp + 1)); + struct midgard_bundle out = { + .tag = TAG_TEXTURE_4, + .instruction_count = 1, + .instructions = { ins } + }; - return temp; + return out; } -/* Reassigns numbering to get rid of gaps in the indices */ - -static void -mir_squeeze_index(compiler_context *ctx) +static midgard_bundle +mir_schedule_ldst( + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) { - /* Reset */ - ctx->temp_count = 0; - /* TODO don't leak old hash_to_temp */ - ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); - - mir_foreach_instr_global(ctx, ins) { - ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); - - for (unsigned i = 0; i < ARRAY_SIZE(ins->ssa_args.src); ++i) - ins->ssa_args.src[i] = find_or_allocate_temp(ctx, ins->ssa_args.src[i]); - } -} + struct midgard_predicate predicate = { + .tag = TAG_LOAD_STORE_4, + .destructive = true, + .exclude = ~0 + }; + + /* Try to pick two load/store ops. Second not gauranteed to exist */ -static midgard_instruction -v_load_store_scratch( - unsigned srcdest, - unsigned index, - bool is_store, - unsigned mask) -{ - /* We index by 32-bit vec4s */ - unsigned byte = (index * 4 * 4); - - midgard_instruction ins = { - .type = TAG_LOAD_STORE_4, - .mask = mask, - .ssa_args = { - .dest = -1, - .src = { -1, -1, -1 }, - }, - .load_store = { - .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4, - .swizzle = SWIZZLE_XYZW, - - /* For register spilling - to thread local storage */ - .arg_1 = 0xEA, - .arg_2 = 0x1E, - - /* Splattered across, TODO combine logically */ - .varying_parameters = (byte & 0x1FF) << 1, - .address = (byte >> 9) - }, + midgard_instruction *ins = + mir_choose_instruction(instructions, worklist, len, &predicate); - /* If we spill an unspill, RA goes into an infinite loop */ - .no_spill = true + midgard_instruction *pair = + mir_choose_instruction(instructions, worklist, len, &predicate); + + struct midgard_bundle out = { + .tag = TAG_LOAD_STORE_4, + .instruction_count = pair ? 2 : 1, + .instructions = { ins, pair } }; - if (is_store) { - /* r0 = r26, r1 = r27 */ - assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27)); - ins.ssa_args.src[0] = srcdest; - } else { - ins.ssa_args.dest = srcdest; - } + /* We have to update the worklist atomically, since the two + * instructions run concurrently (TODO: verify it's not pipelined) */ - return ins; -} + mir_update_worklist(worklist, len, instructions, ins); + mir_update_worklist(worklist, len, instructions, pair); -/* If register allocation fails, find the best spill node and spill it to fix - * whatever the issue was. This spill node could be a work register (spilling - * to thread local storage), but it could also simply be a special register - * that needs to spill to become a work register. */ + return out; +} -static void mir_spill_register( +static midgard_bundle +mir_schedule_alu( compiler_context *ctx, - struct ra_graph *g, - unsigned *spill_count) + midgard_instruction **instructions, + BITSET_WORD *worklist, unsigned len) { - unsigned spill_index = ctx->temp_count; + struct midgard_bundle bundle = {}; - /* Our first step is to calculate spill cost to figure out the best - * spill node. All nodes are equal in spill cost, but we can't spill - * nodes written to from an unspill */ + unsigned bytes_emitted = sizeof(bundle.control); - for (unsigned i = 0; i < ctx->temp_count; ++i) { - ra_set_node_spill_cost(g, i, 1.0); - } + struct midgard_predicate predicate = { + .tag = TAG_ALU_4, + .destructive = true, + .exclude = ~0, + .constants = &bundle.constants + }; - mir_foreach_instr_global(ctx, ins) { - if (ins->no_spill && - ins->ssa_args.dest >= 0 && - ins->ssa_args.dest < ctx->temp_count) - ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0); - } + midgard_instruction *vmul = NULL; + midgard_instruction *vadd = NULL; + midgard_instruction *vlut = NULL; + midgard_instruction *smul = NULL; + midgard_instruction *sadd = NULL; + midgard_instruction *branch = NULL; + + mir_choose_alu(&branch, instructions, worklist, len, &predicate, ALU_ENAB_BR_COMPACT); + mir_update_worklist(worklist, len, instructions, branch); + bool writeout = branch && branch->writeout; - int spill_node = ra_get_best_spill_node(g); + if (branch && branch->branch.conditional) { + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch); - if (spill_node < 0) { - mir_print_shader(ctx); - assert(0); + if (cond->unit == UNIT_VADD) + vadd = cond; + else if (cond->unit == UNIT_SMUL) + smul = cond; + else + unreachable("Bad condition"); } - /* We have a spill node, so check the class. Work registers - * legitimately spill to TLS, but special registers just spill to work - * registers */ + mir_choose_alu(&smul, instructions, worklist, len, &predicate, UNIT_SMUL); - unsigned class = ra_get_node_class(g, spill_node); - bool is_special = (class >> 2) != REG_CLASS_WORK; - bool is_special_w = (class >> 2) == REG_CLASS_TEXW; + if (!writeout) + mir_choose_alu(&vlut, instructions, worklist, len, &predicate, UNIT_VLUT); - /* Allocate TLS slot (maybe) */ - unsigned spill_slot = !is_special ? (*spill_count)++ : 0; + if (writeout) { + /* Propagate up */ + bundle.last_writeout = branch->last_writeout; - /* For TLS, replace all stores to the spilled node. For - * special reads, just keep as-is; the class will be demoted - * implicitly. For special writes, spill to a work register */ + midgard_instruction add = v_mov(~0, make_compiler_temp(ctx)); - if (!is_special || is_special_w) { - if (is_special_w) - spill_slot = spill_index++; + if (!ctx->is_blend) { + add.alu.op = midgard_alu_op_iadd; + add.src[0] = SSA_FIXED_REGISTER(31); - mir_foreach_instr_global_safe(ctx, ins) { - if (ins->ssa_args.dest != spill_node) continue; + for (unsigned c = 0; c < 16; ++c) + add.swizzle[0][c] = COMPONENT_X; - midgard_instruction st; + add.has_inline_constant = true; + add.inline_constant = 0; + } else { + add.src[1] = SSA_FIXED_REGISTER(1); - if (is_special_w) { - st = v_mov(spill_node, blank_alu_src, spill_slot); - st.no_spill = true; - } else { - ins->ssa_args.dest = SSA_FIXED_REGISTER(26); - st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask); - } + for (unsigned c = 0; c < 16; ++c) + add.swizzle[1][c] = COMPONENT_W; + } - /* Hint: don't rewrite this node */ - st.hint = true; + vadd = mem_dup(&add, sizeof(midgard_instruction)); - mir_insert_instruction_before(mir_next_op(ins), st); + vadd->unit = UNIT_VADD; + vadd->mask = 0x1; + branch->src[2] = add.dest; + } - if (!is_special) - ctx->spills++; - } + mir_choose_alu(&vadd, instructions, worklist, len, &predicate, UNIT_VADD); + + mir_update_worklist(worklist, len, instructions, vlut); + mir_update_worklist(worklist, len, instructions, vadd); + mir_update_worklist(worklist, len, instructions, smul); + + bool vadd_csel = vadd && OP_IS_CSEL(vadd->alu.op); + bool smul_csel = smul && OP_IS_CSEL(smul->alu.op); + + if (vadd_csel || smul_csel) { + midgard_instruction *ins = vadd_csel ? vadd : smul; + midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins); + + if (cond->unit == UNIT_VMUL) + vmul = cond; + else if (cond->unit == UNIT_SADD) + sadd = cond; + else + unreachable("Bad condition"); } - /* For special reads, figure out how many components we need */ - unsigned read_mask = 0; + /* If we have a render target reference, schedule a move for it */ - mir_foreach_instr_global_safe(ctx, ins) { - read_mask |= mir_mask_of_read_components(ins, spill_node); + if (branch && branch->writeout && (branch->constants.u32[0] || ctx->is_blend)) { + midgard_instruction mov = v_mov(~0, make_compiler_temp(ctx)); + sadd = mem_dup(&mov, sizeof(midgard_instruction)); + sadd->unit = UNIT_SADD; + sadd->mask = 0x1; + sadd->has_inline_constant = true; + sadd->inline_constant = branch->constants.u32[0]; + branch->src[1] = mov.dest; + /* TODO: Don't leak */ } - /* Insert a load from TLS before the first consecutive - * use of the node, rewriting to use spilled indices to - * break up the live range. Or, for special, insert a - * move. Ironically the latter *increases* register - * pressure, but the two uses of the spilling mechanism - * are somewhat orthogonal. (special spilling is to use - * work registers to back special registers; TLS - * spilling is to use memory to back work registers) */ + /* Stage 2, let's schedule sadd before vmul for writeout */ + mir_choose_alu(&sadd, instructions, worklist, len, &predicate, UNIT_SADD); - mir_foreach_block(ctx, block) { - bool consecutive_skip = false; - unsigned consecutive_index = 0; + /* Check if writeout reads its own register */ - mir_foreach_instr_in_block(block, ins) { - /* We can't rewrite the moves used to spill in the - * first place. These moves are hinted. */ - if (ins->hint) continue; + if (branch && branch->writeout) { + midgard_instruction *stages[] = { sadd, vadd, smul }; + unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0]; + unsigned writeout_mask = 0x0; + bool bad_writeout = false; - if (!mir_has_arg(ins, spill_node)) { - consecutive_skip = false; + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (!stages[i]) continue; - } - if (consecutive_skip) { - /* Rewrite */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); + if (stages[i]->dest != src) continue; - } - if (!is_special_w) { - consecutive_index = ++spill_index; + writeout_mask |= stages[i]->mask; + bad_writeout |= mir_has_arg(stages[i], branch->src[0]); + } - midgard_instruction *before = ins; + /* It's possible we'll be able to schedule something into vmul + * to fill r0. Let's peak into the future, trying to schedule + * vmul specially that way. */ + + if (!bad_writeout && writeout_mask != 0xF) { + predicate.unit = UNIT_VMUL; + predicate.dest = src; + predicate.mask = writeout_mask ^ 0xF; + + struct midgard_instruction *peaked = + mir_choose_instruction(instructions, worklist, len, &predicate); + + if (peaked) { + vmul = peaked; + vmul->unit = UNIT_VMUL; + writeout_mask |= predicate.mask; + assert(writeout_mask == 0xF); + } + + /* Cleanup */ + predicate.dest = predicate.mask = 0; + } - /* For a csel, go back one more not to break up the bundle */ - if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op)) - before = mir_prev_op(before); - - midgard_instruction st; - - if (is_special) { - /* Move */ - st = v_mov(spill_node, blank_alu_src, consecutive_index); - st.no_spill = true; - } else { - /* TLS load */ - st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF); - } - - /* Mask the load based on the component count - * actually needed to prvent RA loops */ - - st.mask = read_mask; - - mir_insert_instruction_before(before, st); - // consecutive_skip = true; - } else { - /* Special writes already have their move spilled in */ - consecutive_index = spill_slot; + /* Finally, add a move if necessary */ + if (bad_writeout || writeout_mask != 0xF) { + unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx); + midgard_instruction mov = v_mov(src, temp); + vmul = mem_dup(&mov, sizeof(midgard_instruction)); + vmul->unit = UNIT_VMUL; + vmul->mask = 0xF ^ writeout_mask; + /* TODO: Don't leak */ + + /* Rewrite to use our temp */ + + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) + mir_rewrite_index_dst_single(stages[i], src, temp); } + mir_rewrite_index_src_single(branch, src, temp); + } + } + + mir_choose_alu(&vmul, instructions, worklist, len, &predicate, UNIT_VMUL); + + mir_update_worklist(worklist, len, instructions, vmul); + mir_update_worklist(worklist, len, instructions, sadd); + + bundle.has_blend_constant = predicate.blend_constant; + bundle.has_embedded_constants = predicate.constant_mask != 0; + + unsigned padding = 0; - /* Rewrite to use */ - mir_rewrite_index_src_single(ins, spill_node, consecutive_index); + /* Now that we have finished scheduling, build up the bundle */ + midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch }; - if (!is_special) - ctx->fills++; + for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) { + if (stages[i]) { + bundle.control |= stages[i]->unit; + bytes_emitted += bytes_for_instruction(stages[i]); + bundle.instructions[bundle.instruction_count++] = stages[i]; } } - /* Reset hints */ + /* Pad ALU op to nearest word */ - mir_foreach_instr_global(ctx, ins) { - ins->hint = false; + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; } + + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + + /* MRT capable GPUs use a special writeout procedure */ + if (writeout && !(ctx->quirks & MIDGARD_NO_UPPER_ALU)) + bundle.tag += 4; + + bundle.padding = padding; + bundle.control |= bundle.tag; + + return bundle; } -void -schedule_program(compiler_context *ctx) +/* Schedule a single block by iterating its instruction to create bundles. + * While we go, tally about the bundle sizes to compute the block size. */ + + +static void +schedule_block(compiler_context *ctx, midgard_block *block) { - struct ra_graph *g = NULL; - bool spilled = false; - int iter_count = 1000; /* max iterations */ + /* Copy list to dynamic array */ + unsigned len = 0; + midgard_instruction **instructions = flatten_mir(block, &len); + + if (!len) + return; + + /* Calculate dependencies and initial worklist */ + unsigned node_count = ctx->temp_count + 1; + mir_create_dependency_graph(instructions, len, node_count); + + /* Allocate the worklist */ + size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD); + BITSET_WORD *worklist = calloc(sz, 1); + mir_initialize_worklist(worklist, instructions, len); - /* Number of 128-bit slots in memory we've spilled into */ - unsigned spill_count = 0; + struct util_dynarray bundles; + util_dynarray_init(&bundles, NULL); - midgard_promote_uniforms(ctx, 16); + block->quadword_count = 0; + unsigned blend_offset = 0; - mir_foreach_block(ctx, block) { - midgard_pair_load_store(ctx, block); - } + for (;;) { + unsigned tag = mir_choose_bundle(instructions, worklist, len); + midgard_bundle bundle; - /* Must be lowered right before RA */ - mir_squeeze_index(ctx); - mir_lower_special_reads(ctx); + if (tag == TAG_TEXTURE_4) + bundle = mir_schedule_texture(instructions, worklist, len); + else if (tag == TAG_LOAD_STORE_4) + bundle = mir_schedule_ldst(instructions, worklist, len); + else if (tag == TAG_ALU_4) + bundle = mir_schedule_alu(ctx, instructions, worklist, len); + else + break; - /* Lowering can introduce some dead moves */ + util_dynarray_append(&bundles, midgard_bundle, bundle); - mir_foreach_block(ctx, block) { - midgard_opt_dead_move_eliminate(ctx, block); + if (bundle.has_blend_constant) + blend_offset = block->quadword_count; + + block->quadword_count += midgard_word_size[bundle.tag]; } - do { - if (spilled) - mir_spill_register(ctx, g, &spill_count); + /* We emitted bundles backwards; copy into the block in reverse-order */ - mir_squeeze_index(ctx); + util_dynarray_init(&block->bundles, block); + util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) { + util_dynarray_append(&block->bundles, midgard_bundle, *bundle); + } + util_dynarray_fini(&bundles); - g = NULL; - g = allocate_registers(ctx, &spilled); - } while(spilled && ((iter_count--) > 0)); + /* Blend constant was backwards as well. blend_offset if set is + * strictly positive, as an offset of zero would imply constants before + * any instructions which is invalid in Midgard. TODO: blend constants + * are broken if you spill since then quadword_count becomes invalid + * XXX */ - /* We can simplify a bit after RA */ + if (blend_offset) + ctx->blend_constant_offset = ((ctx->quadword_count + block->quadword_count) - blend_offset - 1) * 0x10; - mir_foreach_block(ctx, block) { - midgard_opt_post_move_eliminate(ctx, block, g); - } + block->is_scheduled = true; + ctx->quadword_count += block->quadword_count; - /* After RA finishes, we schedule all at once */ + /* Reorder instructions to match bundled. First remove existing + * instructions and then recreate the list */ - mir_foreach_block(ctx, block) { - schedule_block(ctx, block); + mir_foreach_instr_in_block_safe(block, ins) { + list_del(&ins->link); } - /* Finally, we create pipeline registers as a peephole pass after - * scheduling. This isn't totally optimal, since there are cases where - * the usage of pipeline registers can eliminate spills, but it does - * save some power */ + mir_foreach_instr_in_block_scheduled_rev(block, ins) { + list_add(&ins->link, &block->instructions); + } - mir_create_pipeline_registers(ctx); + free(instructions); /* Allocated by flatten_mir() */ + free(worklist); +} - if (iter_count <= 0) { - fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n"); - assert(0); - } +void +midgard_schedule_program(compiler_context *ctx) +{ + midgard_promote_uniforms(ctx); - /* Report spilling information. spill_count is in 128-bit slots (vec4 x - * fp32), but tls_size is in bytes, so multiply by 16 */ + /* Must be lowered right before scheduling */ + mir_squeeze_index(ctx); + mir_lower_special_reads(ctx); + mir_squeeze_index(ctx); + + /* Lowering can introduce some dead moves */ - ctx->tls_size = spill_count * 16; + mir_foreach_block(ctx, block) { + midgard_opt_dead_move_eliminate(ctx, block); + schedule_block(ctx, block); + } - install_registers(ctx, g); } diff -Nru mesa-19.2.8/src/panfrost/midgard/mir.c mesa-20.0.8/src/panfrost/midgard/mir.c --- mesa-19.2.8/src/panfrost/midgard/mir.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/mir.c 2020-06-12 01:21:18.000000000 +0000 @@ -26,131 +26,33 @@ void mir_rewrite_index_src_single(midgard_instruction *ins, unsigned old, unsigned new) { - for (unsigned i = 0; i < ARRAY_SIZE(ins->ssa_args.src); ++i) { - if (ins->ssa_args.src[i] == old) - ins->ssa_args.src[i] = new; + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) { + if (ins->src[i] == old) + ins->src[i] = new; } } void mir_rewrite_index_dst_single(midgard_instruction *ins, unsigned old, unsigned new) { - if (ins->ssa_args.dest == old) - ins->ssa_args.dest = new; + if (ins->dest == old) + ins->dest = new; } -static unsigned -mir_get_swizzle(midgard_instruction *ins, unsigned idx) -{ - if (ins->type == TAG_ALU_4) { - unsigned b = (idx == 0) ? ins->alu.src1 : ins->alu.src2; - - midgard_vector_alu_src s = - vector_alu_from_unsigned(b); - - return s.swizzle; - } else if (ins->type == TAG_LOAD_STORE_4) { - /* Main swizzle of a load is on the destination */ - if (!OP_IS_STORE(ins->load_store.op)) - idx++; - - switch (idx) { - case 0: - return ins->load_store.swizzle; - case 1: - case 2: { - uint8_t raw = - (idx == 2) ? ins->load_store.arg_2 : ins->load_store.arg_1; - - return component_to_swizzle(midgard_ldst_select(raw).component); - } - default: - unreachable("Unknown load/store source"); - } - } else if (ins->type == TAG_TEXTURE_4) { - switch (idx) { - case 0: - return ins->texture.in_reg_swizzle; - case 1: - /* Swizzle on bias doesn't make sense */ - return 0; - default: - unreachable("Unknown texture source"); - } - } else { - unreachable("Unknown type"); - } -} - -static void -mir_set_swizzle(midgard_instruction *ins, unsigned idx, unsigned new) +static midgard_vector_alu_src +mir_get_alu_src(midgard_instruction *ins, unsigned idx) { - if (ins->type == TAG_ALU_4) { - unsigned b = (idx == 0) ? ins->alu.src1 : ins->alu.src2; - - midgard_vector_alu_src s = - vector_alu_from_unsigned(b); - - s.swizzle = new; - unsigned pack = vector_alu_srco_unsigned(s); - - if (idx == 0) - ins->alu.src1 = pack; - else - ins->alu.src2 = pack; - } else if (ins->type == TAG_LOAD_STORE_4) { - /* Main swizzle of a load is on the destination */ - if (!OP_IS_STORE(ins->load_store.op)) - idx++; - - switch (idx) { - case 0: - ins->load_store.swizzle = new; - break; - case 1: - case 2: { - uint8_t raw = - (idx == 2) ? ins->load_store.arg_2 : ins->load_store.arg_1; - - midgard_ldst_register_select sel - = midgard_ldst_select(raw); - sel.component = swizzle_to_component(new); - uint8_t packed = midgard_ldst_pack(sel); - - if (idx == 2) - ins->load_store.arg_2 = packed; - else - ins->load_store.arg_1 = packed; - - break; - } - default: - assert(new == 0); - break; - } - } else if (ins->type == TAG_TEXTURE_4) { - switch (idx) { - case 0: - ins->texture.in_reg_swizzle = new; - break; - default: - assert(new == 0); - break; - } - } else { - unreachable("Unknown type"); - } + unsigned b = (idx == 0) ? ins->alu.src1 : ins->alu.src2; + return vector_alu_from_unsigned(b); } static void -mir_rewrite_index_src_single_swizzle(midgard_instruction *ins, unsigned old, unsigned new, unsigned swizzle) +mir_rewrite_index_src_single_swizzle(midgard_instruction *ins, unsigned old, unsigned new, unsigned *swizzle) { - for (unsigned i = 0; i < ARRAY_SIZE(ins->ssa_args.src); ++i) { - if (ins->ssa_args.src[i] != old) continue; + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) { + if (ins->src[i] != old) continue; - ins->ssa_args.src[i] = new; - - mir_set_swizzle(ins, i, - pan_compose_swizzle(mir_get_swizzle(ins, i), swizzle)); + ins->src[i] = new; + mir_compose_swizzle(ins->swizzle[i], swizzle, ins->swizzle[i]); } } @@ -163,7 +65,7 @@ } void -mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned swizzle) +mir_rewrite_index_src_swizzle(compiler_context *ctx, unsigned old, unsigned new, unsigned *swizzle) { mir_foreach_instr_global(ctx, ins) { mir_rewrite_index_src_single_swizzle(ins, old, new, swizzle); @@ -171,19 +73,6 @@ } void -mir_rewrite_index_src_tag(compiler_context *ctx, unsigned old, unsigned new, unsigned tag) -{ - mir_foreach_instr_global(ctx, ins) { - if (ins->type != tag) - continue; - - mir_rewrite_index_src_single(ins, old, new); - } -} - - - -void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new) { mir_foreach_instr_global(ctx, ins) { @@ -192,20 +81,6 @@ } void -mir_rewrite_index_dst_tag(compiler_context *ctx, unsigned old, unsigned new, unsigned tag) -{ - mir_foreach_instr_global(ctx, ins) { - if (ins->type != tag) - continue; - - if (ins->ssa_args.dest == old) - ins->ssa_args.dest = new; - } -} - - - -void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new) { mir_rewrite_index_src(ctx, old, new); @@ -231,6 +106,10 @@ bool mir_single_use(compiler_context *ctx, unsigned value) { + /* We can replicate constants in places so who cares */ + if (value == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) + return true; + return mir_use_count(ctx, value) <= 1; } @@ -243,18 +122,17 @@ return src.mod; } -bool -mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask) +static bool +mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask, unsigned *swizzle) { if (mir_nontrivial_raw_mod(src, is_int)) return true; /* size-conversion */ if (src.half) return true; - /* swizzle */ - for (unsigned c = 0; c < 4; ++c) { + for (unsigned c = 0; c < 16; ++c) { if (!(mask & (1 << c))) continue; - if (((src.swizzle >> (2*c)) & 3) != c) return true; + if (swizzle[c] != c) return true; } return false; @@ -268,7 +146,7 @@ midgard_vector_alu_src src2 = vector_alu_from_unsigned(ins->alu.src2); - return mir_nontrivial_mod(src2, is_int, ins->mask); + return mir_nontrivial_mod(src2, is_int, ins->mask, ins->swizzle[1]); } bool @@ -311,8 +189,9 @@ mir_foreach_instr_global(ctx, ins) { bool is_ldst = ins->type == TAG_LOAD_STORE_4; bool is_tex = ins->type == TAG_TEXTURE_4; + bool is_writeout = ins->compact_branch && ins->writeout; - if (!(is_ldst || is_tex)) + if (!(is_ldst || is_tex || is_writeout)) continue; if (mir_has_arg(ins, idx)) @@ -327,20 +206,244 @@ bool mir_is_written_before(compiler_context *ctx, midgard_instruction *ins, unsigned node) { - if ((node < 0) || (node >= SSA_FIXED_MINIMUM)) + if (node >= SSA_FIXED_MINIMUM) return true; mir_foreach_instr_global(ctx, q) { if (q == ins) break; - if (q->ssa_args.dest == node) + if (q->dest == node) return true; } return false; } +/* Grabs the type size. */ + +midgard_reg_mode +mir_typesize(midgard_instruction *ins) +{ + if (ins->compact_branch) + return midgard_reg_mode_32; + + /* TODO: Type sizes for texture */ + if (ins->type == TAG_TEXTURE_4) + return midgard_reg_mode_32; + + if (ins->type == TAG_LOAD_STORE_4) + return GET_LDST_SIZE(load_store_opcode_props[ins->load_store.op].props); + + if (ins->type == TAG_ALU_4) { + midgard_reg_mode mode = ins->alu.reg_mode; + + /* If we have an override, step down by half */ + if (ins->alu.dest_override != midgard_dest_override_none) { + assert(mode > midgard_reg_mode_8); + mode--; + } + + return mode; + } + + unreachable("Invalid instruction type"); +} + +/* Grabs the size of a source */ + +midgard_reg_mode +mir_srcsize(midgard_instruction *ins, unsigned i) +{ + /* TODO: 16-bit textures/ldst */ + if (ins->type == TAG_TEXTURE_4 || ins->type == TAG_LOAD_STORE_4) + return midgard_reg_mode_32; + + /* TODO: 16-bit branches */ + if (ins->compact_branch) + return midgard_reg_mode_32; + + if (i >= 2) { + /* TODO: 16-bit conditions, ffma */ + return midgard_reg_mode_32; + } + + /* Default to type of the instruction */ + + midgard_reg_mode mode = ins->alu.reg_mode; + + /* If we have a half modifier, step down by half */ + + if ((mir_get_alu_src(ins, i)).half) { + assert(mode > midgard_reg_mode_8); + mode--; + } + + return mode; +} + +midgard_reg_mode +mir_mode_for_destsize(unsigned size) +{ + switch (size) { + case 8: + return midgard_reg_mode_8; + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("Unknown destination size"); + } +} + + +/* Converts per-component mask to a byte mask */ + +uint16_t +mir_to_bytemask(midgard_reg_mode mode, unsigned mask) +{ + switch (mode) { + case midgard_reg_mode_8: + return mask; + + case midgard_reg_mode_16: { + unsigned space = + (mask & 0x1) | + ((mask & 0x2) << (2 - 1)) | + ((mask & 0x4) << (4 - 2)) | + ((mask & 0x8) << (6 - 3)) | + ((mask & 0x10) << (8 - 4)) | + ((mask & 0x20) << (10 - 5)) | + ((mask & 0x40) << (12 - 6)) | + ((mask & 0x80) << (14 - 7)); + + return space | (space << 1); + } + + case midgard_reg_mode_32: { + unsigned space = + (mask & 0x1) | + ((mask & 0x2) << (4 - 1)) | + ((mask & 0x4) << (8 - 2)) | + ((mask & 0x8) << (12 - 3)); + + return space | (space << 1) | (space << 2) | (space << 3); + } + + case midgard_reg_mode_64: { + unsigned A = (mask & 0x1) ? 0xFF : 0x00; + unsigned B = (mask & 0x2) ? 0xFF : 0x00; + return A | (B << 8); + } + + default: + unreachable("Invalid register mode"); + } +} + +/* ...and the inverse */ + +unsigned +mir_bytes_for_mode(midgard_reg_mode mode) +{ + switch (mode) { + case midgard_reg_mode_8: + return 1; + case midgard_reg_mode_16: + return 2; + case midgard_reg_mode_32: + return 4; + case midgard_reg_mode_64: + return 8; + default: + unreachable("Invalid register mode"); + } +} + +uint16_t +mir_from_bytemask(uint16_t bytemask, midgard_reg_mode mode) +{ + unsigned value = 0; + unsigned count = mir_bytes_for_mode(mode); + + for (unsigned c = 0, d = 0; c < 16; c += count, ++d) { + bool a = (bytemask & (1 << c)) != 0; + + for (unsigned q = c; q < count; ++q) + assert(((bytemask & (1 << q)) != 0) == a); + + value |= (a << d); + } + + return value; +} + +/* Rounds up a bytemask to fill a given component count. Iterate each + * component, and check if any bytes in the component are masked on */ + +uint16_t +mir_round_bytemask_up(uint16_t mask, midgard_reg_mode mode) +{ + unsigned bytes = mir_bytes_for_mode(mode); + unsigned maxmask = mask_of(bytes); + unsigned channels = 16 / bytes; + + for (unsigned c = 0; c < channels; ++c) { + unsigned submask = maxmask << (c * bytes); + + if (mask & submask) + mask |= submask; + } + + return mask; +} + +/* Grabs the per-byte mask of an instruction (as opposed to per-component) */ + +uint16_t +mir_bytemask(midgard_instruction *ins) +{ + return mir_to_bytemask(mir_typesize(ins), ins->mask); +} + +void +mir_set_bytemask(midgard_instruction *ins, uint16_t bytemask) +{ + ins->mask = mir_from_bytemask(bytemask, mir_typesize(ins)); +} + +/* Checks if we should use an upper destination override, rather than the lower + * one in the IR. Returns zero if no, returns the bytes to shift otherwise */ + +unsigned +mir_upper_override(midgard_instruction *ins) +{ + /* If there is no override, there is no upper override, tautology */ + if (ins->alu.dest_override == midgard_dest_override_none) + return 0; + + /* Make sure we didn't already lower somehow */ + assert(ins->alu.dest_override == midgard_dest_override_lower); + + /* What is the mask in terms of currently? */ + midgard_reg_mode type = mir_typesize(ins); + + /* There are 16 bytes per vector, so there are (16/bytes) + * components per vector. So the magic half is half of + * (16/bytes), which simplifies to 8/bytes */ + + unsigned threshold = 8 / mir_bytes_for_mode(type); + + /* How many components did we shift over? */ + unsigned zeroes = __builtin_ctz(ins->mask); + + /* Did we hit the threshold? */ + return (zeroes >= threshold) ? threshold : 0; +} + /* Creates a mask of the components of a node read by an instruction, by * analyzing the swizzle with respect to the instruction's mask. E.g.: * @@ -349,105 +452,191 @@ * will return a mask of Z/Y for r2 */ -static unsigned -mir_mask_of_read_components_single(unsigned swizzle, unsigned outmask) +static uint16_t +mir_bytemask_of_read_components_single(unsigned *swizzle, unsigned inmask, midgard_reg_mode mode) +{ + unsigned cmask = 0; + + for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { + if (!(inmask & (1 << c))) continue; + cmask |= (1 << swizzle[c]); + } + + return mir_to_bytemask(mode, cmask); +} + +uint16_t +mir_bytemask_of_read_components(midgard_instruction *ins, unsigned node) { - unsigned mask = 0; + uint16_t mask = 0; + + if (node == ~0) + return 0; - for (unsigned c = 0; c < 4; ++c) { - if (!(outmask & (1 << c))) continue; + mir_foreach_src(ins, i) { + if (ins->src[i] != node) continue; - unsigned comp = (swizzle >> (2*c)) & 3; - mask |= (1 << comp); + /* Branch writeout uses all components */ + if (ins->compact_branch && ins->writeout && (i == 0)) + return 0xFFFF; + + /* Conditional branches read one 32-bit component = 4 bytes (TODO: multi branch??) */ + if (ins->compact_branch && ins->branch.conditional && (i == 0)) + return 0xF; + + /* ALU ops act componentwise so we need to pay attention to + * their mask. Texture/ldst does not so we don't clamp source + * readmasks based on the writemask */ + unsigned qmask = (ins->type == TAG_ALU_4) ? ins->mask : ~0; + + /* Handle dot products and things */ + if (ins->type == TAG_ALU_4 && !ins->compact_branch) { + unsigned props = alu_opcode_props[ins->alu.op].props; + + unsigned channel_override = GET_CHANNEL_COUNT(props); + + if (channel_override) + qmask = mask_of(channel_override); + } + + mask |= mir_bytemask_of_read_components_single(ins->swizzle[i], qmask, mir_srcsize(ins, i)); } return mask; } -static unsigned -mir_source_count(midgard_instruction *ins) -{ - if (ins->type == TAG_ALU_4) { - /* ALU is always binary */ - return 2; - } else if (ins->type == TAG_LOAD_STORE_4) { - bool load = !OP_IS_STORE(ins->load_store.op); - return (load ? 2 : 3); - } else if (ins->type == TAG_TEXTURE_4) { - /* Coords, bias.. TODO: Offsets? */ - return 2; - } else { - unreachable("Invalid instruction type"); +/* Register allocation occurs after instruction scheduling, which is fine until + * we start needing to spill registers and therefore insert instructions into + * an already-scheduled program. We don't have to be terribly efficient about + * this, since spilling is already slow. So just semantically we need to insert + * the instruction into a new bundle before/after the bundle of the instruction + * in question */ + +static midgard_bundle +mir_bundle_for_op(compiler_context *ctx, midgard_instruction ins) +{ + midgard_instruction *u = mir_upload_ins(ctx, ins); + + midgard_bundle bundle = { + .tag = ins.type, + .instruction_count = 1, + .instructions = { u }, + }; + + if (bundle.tag == TAG_ALU_4) { + assert(OP_IS_MOVE(u->alu.op)); + u->unit = UNIT_VMUL; + + size_t bytes_emitted = sizeof(uint32_t) + sizeof(midgard_reg_info) + sizeof(midgard_vector_alu); + bundle.padding = ~(bytes_emitted - 1) & 0xF; + bundle.control = ins.type | u->unit; } + + return bundle; } static unsigned -mir_component_count_implicit(midgard_instruction *ins, unsigned i) +mir_bundle_idx_for_ins(midgard_instruction *tag, midgard_block *block) { - if (ins->type == TAG_LOAD_STORE_4) { - switch (ins->load_store.op) { - /* Address implicitly 64-bit */ - case midgard_op_ld_int4: - return (i == 0) ? 1 : 0; + midgard_bundle *bundles = + (midgard_bundle *) block->bundles.data; - case midgard_op_st_int4: - return (i == 1) ? 1 : 0; + size_t count = (block->bundles.size / sizeof(midgard_bundle)); - default: - return 0; + for (unsigned i = 0; i < count; ++i) { + for (unsigned j = 0; j < bundles[i].instruction_count; ++j) { + if (bundles[i].instructions[j] == tag) + return i; } } - return 0; + mir_print_instruction(tag); + unreachable("Instruction not scheduled in block"); } -unsigned -mir_mask_of_read_components(midgard_instruction *ins, unsigned node) -{ - unsigned mask = 0; +void +mir_insert_instruction_before_scheduled( + compiler_context *ctx, + midgard_block *block, + midgard_instruction *tag, + midgard_instruction ins) +{ + unsigned before = mir_bundle_idx_for_ins(tag, block); + size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); + UNUSED void *unused = util_dynarray_grow(&block->bundles, midgard_bundle, 1); + + midgard_bundle *bundles = (midgard_bundle *) block->bundles.data; + memmove(bundles + before + 1, bundles + before, (count - before) * sizeof(midgard_bundle)); + midgard_bundle *before_bundle = bundles + before + 1; - for (unsigned i = 0; i < mir_source_count(ins); ++i) { - if (ins->ssa_args.src[i] != node) continue; + midgard_bundle new = mir_bundle_for_op(ctx, ins); + memcpy(bundles + before, &new, sizeof(new)); - unsigned swizzle = mir_get_swizzle(ins, i); - unsigned m = mir_mask_of_read_components_single(swizzle, ins->mask); + list_addtail(&new.instructions[0]->link, &before_bundle->instructions[0]->link); + block->quadword_count += midgard_word_size[new.tag]; +} - /* Sometimes multi-arg ops are passed implicitly */ - unsigned implicit = mir_component_count_implicit(ins, i); - assert(implicit < 2); +void +mir_insert_instruction_after_scheduled( + compiler_context *ctx, + midgard_block *block, + midgard_instruction *tag, + midgard_instruction ins) +{ + /* We need to grow the bundles array to add our new bundle */ + size_t count = util_dynarray_num_elements(&block->bundles, midgard_bundle); + UNUSED void *unused = util_dynarray_grow(&block->bundles, midgard_bundle, 1); + + /* Find the bundle that we want to insert after */ + unsigned after = mir_bundle_idx_for_ins(tag, block); + + /* All the bundles after that one, we move ahead by one */ + midgard_bundle *bundles = (midgard_bundle *) block->bundles.data; + memmove(bundles + after + 2, bundles + after + 1, (count - after - 1) * sizeof(midgard_bundle)); + midgard_bundle *after_bundle = bundles + after; + + midgard_bundle new = mir_bundle_for_op(ctx, ins); + memcpy(bundles + after + 1, &new, sizeof(new)); + list_add(&new.instructions[0]->link, &after_bundle->instructions[after_bundle->instruction_count - 1]->link); + block->quadword_count += midgard_word_size[new.tag]; +} - /* Extend the mask */ - if (implicit == 1) { - /* Ensure it's a single bit currently */ - assert((m >> __builtin_ctz(m)) == 0x1); +/* Flip the first-two arguments of a (binary) op. Currently ALU + * only, no known uses for ldst/tex */ - /* Set the next bit to extend one*/ - m |= (m << 1); - } +void +mir_flip(midgard_instruction *ins) +{ + unsigned temp = ins->src[0]; + ins->src[0] = ins->src[1]; + ins->src[1] = temp; - mask |= m; - } + assert(ins->type == TAG_ALU_4); - return mask; + temp = ins->alu.src1; + ins->alu.src1 = ins->alu.src2; + ins->alu.src2 = temp; + + unsigned temp_swizzle[16]; + memcpy(temp_swizzle, ins->swizzle[0], sizeof(ins->swizzle[0])); + memcpy(ins->swizzle[0], ins->swizzle[1], sizeof(ins->swizzle[0])); + memcpy(ins->swizzle[1], temp_swizzle, sizeof(ins->swizzle[0])); } -unsigned -mir_ubo_shift(midgard_load_store_op op) +/* Before squashing, calculate ctx->temp_count just by observing the MIR */ + +void +mir_compute_temp_count(compiler_context *ctx) { - switch (op) { - case midgard_op_ld_ubo_char: - return 0; - case midgard_op_ld_ubo_char2: - return 1; - case midgard_op_ld_ubo_char4: - return 2; - case midgard_op_ld_ubo_short4: - return 3; - case midgard_op_ld_ubo_int4: - return 4; - default: - unreachable("Invalid op"); - } -} + if (ctx->temp_count) + return; + + unsigned max_dest = 0; + mir_foreach_instr_global(ctx, ins) { + if (ins->dest < SSA_FIXED_MINIMUM) + max_dest = MAX2(max_dest, ins->dest + 1); + } + ctx->temp_count = max_dest; +} diff -Nru mesa-19.2.8/src/panfrost/midgard/mir_promote_uniforms.c mesa-20.0.8/src/panfrost/midgard/mir_promote_uniforms.c --- mesa-19.2.8/src/panfrost/midgard/mir_promote_uniforms.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/mir_promote_uniforms.c 2020-06-12 01:21:18.000000000 +0000 @@ -26,50 +26,117 @@ #include "compiler.h" #include "util/u_math.h" +#include "util/u_memory.h" /* This pass promotes reads from uniforms from load/store ops to uniform * registers if it is beneficial to do so. Normally, this saves both * instructions and total register pressure, but it does take a toll on the * number of work registers that are available, so this is a balance. * - * To cope, we take as an argument the maximum work register pressure in the - * program so we allow that many registers through at minimum, to prevent - * spilling. If we spill anyway, I mean, it's a lose-lose at that point. */ + * We use a heuristic to determine the ideal count, implemented by + * mir_work_heuristic, which returns the ideal number of work registers. + */ + +static bool +mir_is_promoteable_ubo(midgard_instruction *ins) +{ + /* TODO: promote unaligned access via swizzle? */ + + return (ins->type == TAG_LOAD_STORE_4) && + (OP_IS_UBO_READ(ins->load_store.op)) && + !(ins->constants.u32[0] & 0xF) && + !(ins->load_store.arg_1) && + (ins->load_store.arg_2 == 0x1E) && + ((ins->constants.u32[0] / 16) < 16); +} static unsigned -mir_ubo_offset(midgard_instruction *ins) +mir_promoteable_uniform_count(compiler_context *ctx) { - assert(ins->type == TAG_LOAD_STORE_4); - assert(OP_IS_UBO_READ(ins->load_store.op)); + unsigned count = 0; + + mir_foreach_instr_global(ctx, ins) { + if (mir_is_promoteable_ubo(ins)) + count = MAX2(count, ins->constants.u32[0] / 16); + } - /* Grab the offset as the hw understands it */ - unsigned lo = ins->load_store.varying_parameters >> 7; - unsigned hi = ins->load_store.address; - unsigned raw = ((hi << 3) | lo); - - /* Account for the op's shift */ - unsigned shift = mir_ubo_shift(ins->load_store.op); - return (raw << shift); + return count; } -void -midgard_promote_uniforms(compiler_context *ctx, unsigned promoted_count) +static unsigned +mir_count_live(uint16_t *live, unsigned temp_count) { - mir_foreach_instr_global_safe(ctx, ins) { - if (ins->type != TAG_LOAD_STORE_4) continue; - if (!OP_IS_UBO_READ(ins->load_store.op)) continue; + unsigned count = 0; - /* Get the offset. TODO: can we promote unaligned access? */ - unsigned off = mir_ubo_offset(ins); - if (off & 0xF) continue; + for (unsigned i = 0; i < temp_count; ++i) + count += util_bitcount(live[i]); - unsigned address = off / 16; + return count; +} - /* Check this is UBO 0 */ - if (ins->load_store.arg_1) continue; +static unsigned +mir_estimate_pressure(compiler_context *ctx) +{ + mir_invalidate_liveness(ctx); + mir_compute_liveness(ctx); + + unsigned max_live = 0; + + mir_foreach_block(ctx, block) { + uint16_t *live = mem_dup(block->live_out, ctx->temp_count * sizeof(uint16_t)); + + mir_foreach_instr_in_block_rev(block, ins) { + unsigned count = mir_count_live(live, ctx->temp_count); + max_live = MAX2(max_live, count); + mir_liveness_ins_update(live, ins, ctx->temp_count); + } - /* Check we're accessing directly */ - if (ins->load_store.arg_2 != 0x1E) continue; + free(live); + } + + return DIV_ROUND_UP(max_live, 16); +} + +static unsigned +mir_work_heuristic(compiler_context *ctx) +{ + unsigned uniform_count = mir_promoteable_uniform_count(ctx); + + /* If there are 8 or fewer uniforms, it doesn't matter what we do, so + * allow as many work registers as needed */ + + if (uniform_count <= 8) + return 16; + + /* Otherwise, estimate the register pressure */ + + unsigned pressure = mir_estimate_pressure(ctx); + + /* Prioritize not spilling above all else. The relation between the + * pressure estimate and the actual register pressure is a little + * murkier than we might like (due to scheduling, pipeline registers, + * failure to pack vector registers, load/store registers, texture + * registers...), hence why this is a heuristic parameter */ + + if (pressure > 6) + return 16; + + /* If there's no chance of spilling, prioritize UBOs and thread count */ + + return 8; +} + +void +midgard_promote_uniforms(compiler_context *ctx) +{ + unsigned work_count = mir_work_heuristic(ctx); + unsigned promoted_count = 24 - work_count; + + mir_foreach_instr_global_safe(ctx, ins) { + if (!mir_is_promoteable_ubo(ins)) continue; + + unsigned off = ins->constants.u32[0]; + unsigned address = off / 16; /* Check if it's a promotable range */ unsigned uniform_reg = 23 - address; @@ -84,29 +151,19 @@ /* We do need the move for safety for a non-SSA dest, or if * we're being fed into a special class */ - bool needs_move = ins->ssa_args.dest & IS_REG; - needs_move |= mir_special_index(ctx, ins->ssa_args.dest); - - /* Ensure this is a contiguous X-bound mask. It should be since - * we haven't done RA and per-component masked UBO reads don't - * make much sense. */ - - assert(((ins->mask + 1) & ins->mask) == 0); + bool needs_move = ins->dest & IS_REG; + needs_move |= mir_special_index(ctx, ins->dest); - /* Check the component count from the mask so we can setup a - * swizzle appropriately when promoting. The idea is to ensure - * the component count is preserved so RA can be smarter if we - * need to spill */ + if (needs_move) { + midgard_instruction mov = v_mov(promoted, ins->dest); - unsigned nr_components = util_bitcount(ins->mask); + if (ins->load_64) + mov.alu.reg_mode = midgard_reg_mode_64; - if (needs_move) { - midgard_instruction mov = v_mov(promoted, blank_alu_src, ins->ssa_args.dest); - mov.mask = ins->mask; - mir_insert_instruction_before(ins, mov); + mir_set_bytemask(&mov, mir_bytemask(ins)); + mir_insert_instruction_before(ctx, ins, mov); } else { - mir_rewrite_index_src_swizzle(ctx, ins->ssa_args.dest, - promoted, swizzle_of(nr_components)); + mir_rewrite_index_src(ctx, ins->dest, promoted); } mir_remove_instruction(ins); diff -Nru mesa-19.2.8/src/panfrost/midgard/mir_squeeze.c mesa-20.0.8/src/panfrost/midgard/mir_squeeze.c --- mesa-19.2.8/src/panfrost/midgard/mir_squeeze.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/mir_squeeze.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#include "compiler.h" + +/* When we're 'squeezing down' the values in the IR, we maintain a hash + * as such */ + +static unsigned +find_or_allocate_temp(compiler_context *ctx, unsigned hash) +{ + if (hash >= SSA_FIXED_MINIMUM) + return hash; + + unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( + ctx->hash_to_temp, hash + 1); + + if (temp) + return temp - 1; + + /* If no temp is find, allocate one */ + temp = ctx->temp_count++; + ctx->max_hash = MAX2(ctx->max_hash, hash); + + _mesa_hash_table_u64_insert(ctx->hash_to_temp, + hash + 1, (void *) ((uintptr_t) temp + 1)); + + return temp; +} + +/* Reassigns numbering to get rid of gaps in the indices and to prioritize + * smaller register classes */ + +void +mir_squeeze_index(compiler_context *ctx) +{ + /* Reset */ + ctx->temp_count = 0; + /* TODO don't leak old hash_to_temp */ + ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); + + /* We need to prioritize texture registers on older GPUs so we don't + * fail RA trying to assign to work registers r0/r1 when a work + * register is already there */ + + mir_foreach_instr_global(ctx, ins) { + if (ins->type == TAG_TEXTURE_4) + ins->dest = find_or_allocate_temp(ctx, ins->dest); + } + + mir_foreach_instr_global(ctx, ins) { + if (ins->type != TAG_TEXTURE_4) + ins->dest = find_or_allocate_temp(ctx, ins->dest); + + for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i) + ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]); + } +} diff -Nru mesa-19.2.8/src/panfrost/midgard/nir_undef_to_zero.c mesa-20.0.8/src/panfrost/midgard/nir_undef_to_zero.c --- mesa-19.2.8/src/panfrost/midgard/nir_undef_to_zero.c 1970-01-01 00:00:00.000000000 +0000 +++ mesa-20.0.8/src/panfrost/midgard/nir_undef_to_zero.c 2020-06-12 01:21:18.000000000 +0000 @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +/** + * @file + * + * Flushes undefined SSA values to a zero vector fo the appropriate component + * count, to avoid undefined behaviour in the resulting shader. Not required + * for conformance as use of uninitialized variables is explicitly left + * undefined by the spec. Works around buggy apps, however. + * + * Call immediately after nir_opt_undef. If called before, larger optimization + * opportunities from the former pass will be missed. If called outside of an + * optimization loop, constant propagation and algebraic optimizations won't be + * able to kick in to reduce stuff consuming the zero. + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" + +bool nir_undef_to_zero(nir_shader *shader); + +bool +nir_undef_to_zero(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_ssa_undef) continue; + + nir_ssa_undef_instr *und = nir_instr_as_ssa_undef(instr); + + /* Get the required size */ + unsigned c = und->def.num_components; + unsigned s = und->def.bit_size; + + nir_const_value v[NIR_MAX_VEC_COMPONENTS]; + memset(v, 0, sizeof(v)); + + b.cursor = nir_before_instr(instr); + nir_ssa_def *zero = nir_build_imm(&b, c, s, v); + nir_src zerosrc = nir_src_for_ssa(zero); + + nir_ssa_def_rewrite_uses(&und->def, zerosrc); + + progress |= true; + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); + + } + + return progress; +} + + diff -Nru mesa-19.2.8/src/panfrost/pandecode/common.c mesa-20.0.8/src/panfrost/pandecode/common.c --- mesa-19.2.8/src/panfrost/pandecode/common.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/pandecode/common.c 2020-06-12 01:21:18.000000000 +0000 @@ -31,6 +31,7 @@ #include "decode.h" #include "util/macros.h" +#include "util/u_debug.h" /* Memory handling */ @@ -47,9 +48,39 @@ return NULL; } +static void +pandecode_add_name(struct pandecode_mapped_memory *mem, uint64_t gpu_va, const char *name) +{ + if (!name) { + /* If we don't have a name, assign one */ + + snprintf(mem->name, ARRAY_SIZE(mem->name) - 1, + "memory_%" PRIx64, gpu_va); + } else { + assert((strlen(name) + 1) < ARRAY_SIZE(mem->name)); + memcpy(mem->name, name, strlen(name) + 1); + } +} + void pandecode_inject_mmap(uint64_t gpu_va, void *cpu, unsigned sz, const char *name) { + /* First, search if we already mapped this and are just updating an address */ + + list_for_each_entry(struct pandecode_mapped_memory, pos, &mmaps.node, node) { + if (pos->gpu_va == gpu_va) { + /* TODO: Resizing weirdness. Only applies to tracing + * the legacy driver, not for native traces */ + + pos->length = sz; + pos->addr = cpu; + pandecode_add_name(pos, gpu_va, name); + + return; + } + } + + /* Otherwise, add a fresh mapping */ struct pandecode_mapped_memory *mapped_mem = NULL; mapped_mem = malloc(sizeof(*mapped_mem)); @@ -58,16 +89,7 @@ mapped_mem->gpu_va = gpu_va; mapped_mem->length = sz; mapped_mem->addr = cpu; - - if (!name) { - /* If we don't have a name, assign one */ - - snprintf(mapped_mem->name, ARRAY_SIZE(mapped_mem->name) - 1, - "memory_%" PRIx64, gpu_va); - } else { - assert(strlen(name) < ARRAY_SIZE(mapped_mem->name)); - memcpy(mapped_mem->name, name, strlen(name)); - } + pandecode_add_name(mapped_mem, gpu_va, name); list_add(&mapped_mem->node, &mmaps.node); } @@ -94,9 +116,56 @@ } +static int pandecode_dump_frame_count = 0; + +static void +pandecode_dump_file_open(void) +{ + if (pandecode_dump_stream) + return; + + char buffer[1024]; + + /* This does a getenv every frame, so it is possible to use + * setenv to change the base at runtime. + */ + const char *dump_file_base = debug_get_option("PANDECODE_DUMP_FILE", "pandecode.dump"); + snprintf(buffer, sizeof(buffer), "%s.%04d", dump_file_base, pandecode_dump_frame_count); + + printf("pandecode: dump command stream to file %s\n", buffer); + pandecode_dump_stream = fopen(buffer, "w"); + + if (!pandecode_dump_stream) + fprintf(stderr,"pandecode: failed to open command stream log file %s\n", + buffer); +} + +static void +pandecode_dump_file_close(void) +{ + if (pandecode_dump_stream) { + fclose(pandecode_dump_stream); + pandecode_dump_stream = NULL; + } +} + void pandecode_initialize(void) { list_inithead(&mmaps.node); + pandecode_dump_file_open(); +} +void +pandecode_next_frame(void) +{ + pandecode_dump_file_close(); + pandecode_dump_frame_count++; + pandecode_dump_file_open(); +} + +void +pandecode_close(void) +{ + pandecode_dump_file_close(); } diff -Nru mesa-19.2.8/src/panfrost/pandecode/decode.c mesa-20.0.8/src/panfrost/pandecode/decode.c --- mesa-19.2.8/src/panfrost/pandecode/decode.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/pandecode/decode.c 2020-06-12 01:21:18.000000000 +0000 @@ -29,6 +29,7 @@ #include #include #include +#include #include "decode.h" #include "util/macros.h" #include "util/u_math.h" @@ -37,7 +38,9 @@ #include "midgard/disassemble.h" #include "bifrost/disassemble.h" -int pandecode_jc(mali_ptr jc_gpu_va, bool bifrost); +#include "pan_encoder.h" + +static void pandecode_swizzle(unsigned swizzle, enum mali_format format); #define MEMORY_PROP(obj, p) {\ if (obj->p) { \ @@ -55,10 +58,7 @@ } \ } -#define DYN_MEMORY_PROP(obj, no, p) { \ - if (obj->p) \ - pandecode_prop("%s = %s_%d_p", #p, #p, no); \ -} +FILE *pandecode_dump_stream; /* Semantic logging type. * @@ -85,7 +85,7 @@ pandecode_make_indent(void) { for (unsigned i = 0; i < pandecode_indent; ++i) - printf(" "); + fprintf(pandecode_dump_stream, " "); } static void @@ -96,16 +96,16 @@ pandecode_make_indent(); if (type == PANDECODE_MESSAGE) - printf("// "); + fprintf(pandecode_dump_stream, "// "); else if (type == PANDECODE_PROPERTY) - printf("."); + fprintf(pandecode_dump_stream, "."); va_start(ap, format); - vprintf(format, ap); + vfprintf(pandecode_dump_stream, format, ap); va_end(ap); if (type == PANDECODE_PROPERTY) - printf(",\n"); + fprintf(pandecode_dump_stream, ",\n"); } static void @@ -114,10 +114,48 @@ va_list ap; va_start(ap, format); - vprintf(format, ap); + vfprintf(pandecode_dump_stream, format, ap); va_end(ap); } +/* To check for memory safety issues, validates that the given pointer in GPU + * memory is valid, containing at least sz bytes. The goal is to eliminate + * GPU-side memory bugs (NULL pointer dereferences, buffer overflows, or buffer + * overruns) by statically validating pointers. + */ + +static void +pandecode_validate_buffer(mali_ptr addr, size_t sz) +{ + if (!addr) { + pandecode_msg("XXX: null pointer deref"); + return; + } + + /* Find a BO */ + + struct pandecode_mapped_memory *bo = + pandecode_find_mapped_gpu_mem_containing(addr); + + if (!bo) { + pandecode_msg("XXX: invalid memory dereference\n"); + return; + } + + /* Bounds check */ + + unsigned offset = addr - bo->gpu_va; + unsigned total = offset + sz; + + if (total > bo->length) { + pandecode_msg("XXX: buffer overrun. " + "Chunk of size %zu at offset %d in buffer of size %zu. " + "Overrun by %zu bytes. \n", + sz, offset, bo->length, total - bo->length); + return; + } +} + struct pandecode_flag_info { u64 flag; const char *name; @@ -187,7 +225,7 @@ FLAG_INFO(HAS_MSAA), FLAG_INFO(CAN_DISCARD), FLAG_INFO(HAS_BLEND_SHADER), - FLAG_INFO(DEPTH_TEST), + FLAG_INFO(DEPTH_WRITEMASK), {} }; @@ -202,15 +240,6 @@ }; #undef FLAG_INFO -#define FLAG_INFO(flag) { MALI_FRAMEBUFFER_##flag, "MALI_FRAMEBUFFER_" #flag } -static const struct pandecode_flag_info fb_fmt_flag_info[] = { - FLAG_INFO(MSAA_A), - FLAG_INFO(MSAA_B), - FLAG_INFO(MSAA_8), - {} -}; -#undef FLAG_INFO - #define FLAG_INFO(flag) { MALI_MFBD_FORMAT_##flag, "MALI_MFBD_FORMAT_" #flag } static const struct pandecode_flag_info mfbd_fmt_flag_info[] = { FLAG_INFO(MSAA), @@ -220,9 +249,14 @@ #undef FLAG_INFO #define FLAG_INFO(flag) { MALI_EXTRA_##flag, "MALI_EXTRA_" #flag } -static const struct pandecode_flag_info mfbd_extra_flag_info[] = { +static const struct pandecode_flag_info mfbd_extra_flag_hi_info[] = { FLAG_INFO(PRESENT), - FLAG_INFO(AFBC), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_EXTRA_##flag, "MALI_EXTRA_" #flag } +static const struct pandecode_flag_info mfbd_extra_flag_lo_info[] = { FLAG_INFO(ZS), {} }; @@ -231,7 +265,6 @@ #define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag } static const struct pandecode_flag_info shader_midgard1_flag_info [] = { FLAG_INFO(EARLY_Z), - FLAG_INFO(HELPER_INVOCATIONS), FLAG_INFO(READS_TILEBUFFER), FLAG_INFO(READS_ZS), {} @@ -257,6 +290,22 @@ }; #undef FLAG_INFO +#define FLAG_INFO(flag) { MALI_SFBD_FORMAT_##flag, "MALI_SFBD_FORMAT_" #flag } +static const struct pandecode_flag_info sfbd_unk1_info [] = { + FLAG_INFO(MSAA_8), + FLAG_INFO(MSAA_A), + {} +}; +#undef FLAG_INFO + +#define FLAG_INFO(flag) { MALI_SFBD_FORMAT_##flag, "MALI_SFBD_FORMAT_" #flag } +static const struct pandecode_flag_info sfbd_unk2_info [] = { + FLAG_INFO(MSAA_B), + FLAG_INFO(SRGB), + {} +}; +#undef FLAG_INFO + extern char *replace_fragment; extern char *replace_vertex; @@ -267,7 +316,7 @@ switch (type) { DEFINE_CASE(NULL); - DEFINE_CASE(SET_VALUE); + DEFINE_CASE(WRITE_VALUE); DEFINE_CASE(CACHE_FLUSH); DEFINE_CASE(COMPUTE); DEFINE_CASE(VERTEX); @@ -305,7 +354,8 @@ DEFINE_CASE(QUAD_STRIP); default: - return "MALI_TRIANGLES /* XXX: Unknown GL mode, check dump */"; + pandecode_msg("XXX: invalid draw mode %X\n", mode); + return ""; } #undef DEFINE_CASE @@ -326,28 +376,8 @@ DEFINE_CASE(ALWAYS); default: - return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */"; - } -} -#undef DEFINE_CASE - -/* Why is this duplicated? Who knows... */ -#define DEFINE_CASE(name) case MALI_ALT_FUNC_ ## name: return "MALI_ALT_FUNC_" #name -static char * -pandecode_alt_func(enum mali_alt_func mode) -{ - switch (mode) { - DEFINE_CASE(NEVER); - DEFINE_CASE(LESS); - DEFINE_CASE(EQUAL); - DEFINE_CASE(LEQUAL); - DEFINE_CASE(GREATER); - DEFINE_CASE(NOTEQUAL); - DEFINE_CASE(GEQUAL); - DEFINE_CASE(ALWAYS); - - default: - return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */"; + pandecode_msg("XXX: invalid func %X\n", mode); + return ""; } } #undef DEFINE_CASE @@ -367,49 +397,53 @@ DEFINE_CASE(DECR); default: - return "MALI_STENCIL_KEEP /* XXX: Unknown stencil op, check dump */"; + pandecode_msg("XXX: invalid stencil op %X\n", op); + return ""; } } #undef DEFINE_CASE -#define DEFINE_CASE(name) case MALI_ATTR_ ## name: return "MALI_ATTR_" #name -static char *pandecode_attr_mode(enum mali_attr_mode mode) +static char *pandecode_attr_mode_short(enum mali_attr_mode mode) { switch(mode) { - DEFINE_CASE(UNUSED); - DEFINE_CASE(LINEAR); - DEFINE_CASE(POT_DIVIDE); - DEFINE_CASE(MODULO); - DEFINE_CASE(NPOT_DIVIDE); - DEFINE_CASE(IMAGE); - DEFINE_CASE(INTERNAL); + /* TODO: Combine to just "instanced" once this can be done + * unambiguously in all known cases */ + case MALI_ATTR_POT_DIVIDE: + return "instanced_pot"; + case MALI_ATTR_MODULO: + return "instanced_mod"; + case MALI_ATTR_NPOT_DIVIDE: + return "instanced_npot"; + case MALI_ATTR_IMAGE: + return "image"; default: - return "MALI_ATTR_UNUSED /* XXX: Unknown stencil op, check dump */"; + pandecode_msg("XXX: invalid attribute mode %X\n", mode); + return ""; } } -#undef DEFINE_CASE - -#define DEFINE_CASE(name) case MALI_CHANNEL_## name: return "MALI_CHANNEL_" #name -static char * -pandecode_channel(enum mali_channel channel) +static const char * +pandecode_special_record(uint64_t v, bool* attribute) { - switch (channel) { - DEFINE_CASE(RED); - DEFINE_CASE(GREEN); - DEFINE_CASE(BLUE); - DEFINE_CASE(ALPHA); - DEFINE_CASE(ZERO); - DEFINE_CASE(ONE); - DEFINE_CASE(RESERVED_0); - DEFINE_CASE(RESERVED_1); - + switch(v) { + case MALI_ATTR_VERTEXID: + *attribute = true; + return "gl_VertexID"; + case MALI_ATTR_INSTANCEID: + *attribute = true; + return "gl_InstanceID"; + case MALI_VARYING_FRAG_COORD: + return "gl_FragCoord"; + case MALI_VARYING_FRONT_FACING: + return "gl_FrontFacing"; + case MALI_VARYING_POINT_COORD: + return "gl_PointCoord"; default: - return "MALI_CHANNEL_ZERO /* XXX: Unknown channel, check dump */"; + pandecode_msg("XXX: invalid special record %" PRIx64 "\n", v); + return ""; } } -#undef DEFINE_CASE #define DEFINE_CASE(name) case MALI_WRAP_## name: return "MALI_WRAP_" #name static char * @@ -422,30 +456,15 @@ DEFINE_CASE(MIRRORED_REPEAT); default: - return "MALI_WRAP_REPEAT /* XXX: Unknown wrap mode, check dump */"; - } -} -#undef DEFINE_CASE - -#define DEFINE_CASE(name) case MALI_TEX_## name: return "MALI_TEX_" #name -static char * -pandecode_texture_type(enum mali_texture_type type) -{ - switch (type) { - DEFINE_CASE(1D); - DEFINE_CASE(2D); - DEFINE_CASE(3D); - DEFINE_CASE(CUBE); - - default: - unreachable("Unknown case"); + pandecode_msg("XXX: invalid wrap mode %X\n", op); + return ""; } } #undef DEFINE_CASE -#define DEFINE_CASE(name) case MALI_MFBD_BLOCK_## name: return "MALI_MFBD_BLOCK_" #name +#define DEFINE_CASE(name) case MALI_BLOCK_## name: return "MALI_BLOCK_" #name static char * -pandecode_mfbd_block_format(enum mali_mfbd_block_format fmt) +pandecode_block_format(enum mali_block_format fmt) { switch (fmt) { DEFINE_CASE(TILED); @@ -460,10 +479,10 @@ #undef DEFINE_CASE #define DEFINE_CASE(name) case MALI_EXCEPTION_ACCESS_## name: return ""#name -static char * -pandecode_exception_access(enum mali_exception_access fmt) +char * +pandecode_exception_access(unsigned access) { - switch (fmt) { + switch (access) { DEFINE_CASE(NONE); DEFINE_CASE(EXECUTE); DEFINE_CASE(READ); @@ -479,30 +498,108 @@ * larger FBD */ static void -pandecode_midgard_tiler_descriptor(const struct midgard_tiler_descriptor *t) +pandecode_midgard_tiler_descriptor( + const struct midgard_tiler_descriptor *t, + unsigned width, + unsigned height, + bool is_fragment, + bool has_hierarchy) { pandecode_log(".tiler = {\n"); pandecode_indent++; - pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask); - pandecode_prop("flags = 0x%" PRIx16, t->flags); - pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size); + if (t->hierarchy_mask == MALI_TILER_DISABLED) + pandecode_prop("hierarchy_mask = MALI_TILER_DISABLED"); + else + pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask); + + /* We know this name from the kernel, but we never see it nonzero */ + + if (t->flags) + pandecode_msg("XXX: unexpected tiler flags 0x%" PRIx16, t->flags); MEMORY_PROP(t, polygon_list); - MEMORY_PROP(t, polygon_list_body); + + /* The body is offset from the base of the polygon list */ + assert(t->polygon_list_body > t->polygon_list); + unsigned body_offset = t->polygon_list_body - t->polygon_list; + + /* It needs to fit inside the reported size */ + assert(t->polygon_list_size >= body_offset); + + /* Check that we fit */ + struct pandecode_mapped_memory *plist = + pandecode_find_mapped_gpu_mem_containing(t->polygon_list); + + assert(t->polygon_list_size <= plist->length); + + /* Now that we've sanity checked, we'll try to calculate the sizes + * ourselves for comparison */ + + unsigned ref_header = panfrost_tiler_header_size(width, height, t->hierarchy_mask, has_hierarchy); + unsigned ref_size = panfrost_tiler_full_size(width, height, t->hierarchy_mask, has_hierarchy); + + if (!((ref_header == body_offset) && (ref_size == t->polygon_list_size))) { + pandecode_msg("XXX: bad polygon list size (expected %d / 0x%x)\n", + ref_header, ref_size); + pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size); + pandecode_msg("body offset %d\n", body_offset); + } + + /* The tiler heap has a start and end specified -- it should be + * identical to what we have in the BO. The exception is if tiling is + * disabled. */ MEMORY_PROP(t, heap_start); + assert(t->heap_end >= t->heap_start); + + struct pandecode_mapped_memory *heap = + pandecode_find_mapped_gpu_mem_containing(t->heap_start); + + unsigned heap_size = t->heap_end - t->heap_start; + + /* Tiling is enabled with a special flag */ + unsigned hierarchy_mask = t->hierarchy_mask & MALI_HIERARCHY_MASK; + unsigned tiler_flags = t->hierarchy_mask ^ hierarchy_mask; - if (t->heap_start == t->heap_end) { - /* Print identically to show symmetry for empty tiler heaps */ - MEMORY_PROP(t, heap_end); + bool tiling_enabled = hierarchy_mask; + + if (tiling_enabled) { + /* When tiling is enabled, the heap should be a tight fit */ + unsigned heap_offset = t->heap_start - heap->gpu_va; + if ((heap_offset + heap_size) != heap->length) { + pandecode_msg("XXX: heap size %u (expected %zu)\n", + heap_size, heap->length - heap_offset); + } + + /* We should also have no other flags */ + if (tiler_flags) + pandecode_msg("XXX: unexpected tiler %X\n", tiler_flags); } else { - /* Points to the end of a buffer */ - char *a = pointer_as_memory_reference(t->heap_end - 1); - pandecode_prop("heap_end = %s + 1", a); - free(a); + /* When tiling is disabled, we should have that flag and no others */ + + if (tiler_flags != MALI_TILER_DISABLED) { + pandecode_msg("XXX: unexpected tiler flag %X, expected MALI_TILER_DISABLED\n", + tiler_flags); + } + + /* We should also have an empty heap */ + if (heap_size) { + pandecode_msg("XXX: tiler heap size %d given, expected empty\n", + heap_size); + } + + /* Disabled tiling is used only for clear-only jobs, which are + * purely FRAGMENT, so we should never see this for + * non-FRAGMENT descriptors. */ + + if (!is_fragment) + pandecode_msg("XXX: tiler disabled for non-FRAGMENT job\n"); } + /* We've never seen weights used in practice, but we know from the + * kernel these fields is there */ + bool nonzero_weights = false; for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) { @@ -523,24 +620,75 @@ pandecode_log("}\n"); } +/* Information about the framebuffer passed back for + * additional analysis */ + +struct pandecode_fbd { + unsigned width; + unsigned height; + unsigned rt_count; + bool has_extra; +}; + static void -pandecode_sfbd(uint64_t gpu_va, int job_no) +pandecode_sfbd_format(struct mali_sfbd_format format) +{ + pandecode_log(".format = {\n"); + pandecode_indent++; + + pandecode_log(".unk1 = "); + pandecode_log_decoded_flags(sfbd_unk1_info, format.unk1); + pandecode_log_cont(",\n"); + + /* TODO: Map formats so we can check swizzles and print nicely */ + pandecode_log("swizzle"); + pandecode_swizzle(format.swizzle, MALI_RGBA8_UNORM); + pandecode_log_cont(",\n"); + + pandecode_prop("nr_channels = MALI_POSITIVE(%d)", + (format.nr_channels + 1)); + + pandecode_log(".unk2 = "); + pandecode_log_decoded_flags(sfbd_unk2_info, format.unk2); + pandecode_log_cont(",\n"); + + pandecode_prop("block = %s", pandecode_block_format(format.block)); + + pandecode_prop("unk3 = 0x%" PRIx32, format.unk3); + + pandecode_indent--; + pandecode_log("},\n"); +} + +static struct pandecode_fbd +pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) { struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); const struct mali_single_framebuffer *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + struct pandecode_fbd info = { + .has_extra = false, + .rt_count = 1 + }; + pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1); pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2); - pandecode_log(".format = "); - pandecode_log_decoded_flags(fb_fmt_flag_info, s->format); - pandecode_log_cont(",\n"); + pandecode_sfbd_format(s->format); + + info.width = s->width + 1; + info.height = s->height + 1; + + pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", info.width); + pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", info.height); + + MEMORY_PROP(s, checksum); - pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", s->width + 1); - pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", s->height + 1); + if (s->checksum_stride) + pandecode_prop("checksum_stride = %d", s->checksum_stride); MEMORY_PROP(s, framebuffer); pandecode_prop("stride = %d", s->stride); @@ -552,14 +700,28 @@ pandecode_log_decoded_flags(clear_flag_info, s->clear_flags); pandecode_log_cont(",\n"); - if (s->depth_buffer | s->depth_buffer_enable) { + if (s->depth_buffer) { MEMORY_PROP(s, depth_buffer); - pandecode_prop("depth_buffer_enable = %s", DS_ENABLE(s->depth_buffer_enable)); + pandecode_prop("depth_stride = %d", s->depth_stride); } - if (s->stencil_buffer | s->stencil_buffer_enable) { + if (s->stencil_buffer) { MEMORY_PROP(s, stencil_buffer); - pandecode_prop("stencil_buffer_enable = %s", DS_ENABLE(s->stencil_buffer_enable)); + pandecode_prop("stencil_stride = %d", s->stencil_stride); + } + + if (s->depth_stride_zero || + s->stencil_stride_zero || + s->zero7 || s->zero8) { + pandecode_msg("XXX: Depth/stencil zeros tripped\n"); + pandecode_prop("depth_stride_zero = 0x%x", + s->depth_stride_zero); + pandecode_prop("stencil_stride_zero = 0x%x", + s->stencil_stride_zero); + pandecode_prop("zero7 = 0x%" PRIx32, + s->zero7); + pandecode_prop("zero8 = 0x%" PRIx32, + s->zero8); } if (s->clear_color_1 | s->clear_color_2 | s->clear_color_3 | s->clear_color_4) { @@ -580,9 +742,11 @@ pandecode_prop("clear_stencil = 0x%x", s->clear_stencil); } - MEMORY_PROP(s, unknown_address_0); + MEMORY_PROP(s, scratchpad); const struct midgard_tiler_descriptor t = s->tiler; - pandecode_midgard_tiler_descriptor(&t); + + bool has_hierarchy = !(gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830); + pandecode_midgard_tiler_descriptor(&t, s->width + 1, s->height + 1, is_fragment, has_hierarchy); pandecode_indent--; pandecode_log("};\n"); @@ -591,59 +755,165 @@ pandecode_prop("zero1 = 0x%" PRIx64, s->zero1); pandecode_prop("zero2 = 0x%" PRIx32, s->zero2); pandecode_prop("zero4 = 0x%" PRIx32, s->zero4); + pandecode_prop("zero5 = 0x%" PRIx32, s->zero5); - printf(".zero3 = {"); + pandecode_log_cont(".zero3 = {"); for (int i = 0; i < sizeof(s->zero3) / sizeof(s->zero3[0]); ++i) - printf("%X, ", s->zero3[i]); + pandecode_log_cont("%X, ", s->zero3[i]); - printf("},\n"); + pandecode_log_cont("},\n"); - printf(".zero6 = {"); + pandecode_log_cont(".zero6 = {"); for (int i = 0; i < sizeof(s->zero6) / sizeof(s->zero6[0]); ++i) - printf("%X, ", s->zero6[i]); + pandecode_log_cont("%X, ", s->zero6[i]); + + pandecode_log_cont("},\n"); - printf("},\n"); + return info; } static void -pandecode_u32_slide(unsigned name, const u32 *slide, unsigned count) +pandecode_compute_fbd(uint64_t gpu_va, int job_no) { - pandecode_log(".unknown%d = {", name); + struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); + const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + + pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_indent++; - for (int i = 0; i < count; ++i) - printf("%X, ", slide[i]); + pandecode_log(".unknown1 = {"); + + for (int i = 0; i < ARRAY_SIZE(s->unknown1); ++i) + pandecode_log_cont("%X, ", s->unknown1[i]); pandecode_log("},\n"); + + pandecode_indent--; + pandecode_log_cont("},\n"); +} + +/* Extracts the number of components associated with a Mali format */ + +static unsigned +pandecode_format_component_count(enum mali_format fmt) +{ + /* Mask out the format class */ + unsigned top = fmt & 0b11100000; + + switch (top) { + case MALI_FORMAT_SNORM: + case MALI_FORMAT_UINT: + case MALI_FORMAT_UNORM: + case MALI_FORMAT_SINT: + return ((fmt >> 3) & 3) + 1; + default: + /* TODO: Validate */ + return 4; + } +} + +/* Extracts a mask of accessed components from a 12-bit Mali swizzle */ + +static unsigned +pandecode_access_mask_from_channel_swizzle(unsigned swizzle) +{ + unsigned mask = 0; + assert(MALI_CHANNEL_RED == 0); + + for (unsigned c = 0; c < 4; ++c) { + enum mali_channel chan = (swizzle >> (3*c)) & 0x7; + + if (chan <= MALI_CHANNEL_ALPHA) + mask |= (1 << chan); + } + + return mask; } -#define SHORT_SLIDE(num) \ - pandecode_u32_slide(num, s->unknown ## num, ARRAY_SIZE(s->unknown ## num)) +/* Validates that a (format, swizzle) pair is valid, in the sense that the + * swizzle doesn't access any components that are undefined in the format. + * Returns whether the swizzle is trivial (doesn't do any swizzling) and can be + * omitted */ + +static bool +pandecode_validate_format_swizzle(enum mali_format fmt, unsigned swizzle) +{ + unsigned nr_comp = pandecode_format_component_count(fmt); + unsigned access_mask = pandecode_access_mask_from_channel_swizzle(swizzle); + unsigned valid_mask = (1 << nr_comp) - 1; + unsigned invalid_mask = ~valid_mask; + + if (access_mask & invalid_mask) { + pandecode_msg("XXX: invalid components accessed\n"); + return false; + } + + /* Check for the default non-swizzling swizzle so we can suppress + * useless printing for the defaults */ + + unsigned default_swizzles[4] = { + MALI_CHANNEL_RED | (MALI_CHANNEL_ZERO << 3) | (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9), + MALI_CHANNEL_RED | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9), + MALI_CHANNEL_RED | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ONE << 9), + MALI_CHANNEL_RED | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9) + }; + + return (swizzle == default_swizzles[nr_comp - 1]); +} + +/* Maps MALI_RGBA32F to rgba32f, etc */ static void -pandecode_compute_fbd(uint64_t gpu_va, int job_no) +pandecode_format_short(enum mali_format fmt, bool srgb) { - struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); - const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + /* We want a type-like format, so cut off the initial MALI_ */ + char *format = pandecode_format(fmt); + format += strlen("MALI_"); - pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); - pandecode_indent++; + unsigned len = strlen(format); + char *lower_format = calloc(1, len + 1); - SHORT_SLIDE(1); + for (unsigned i = 0; i < len; ++i) + lower_format[i] = tolower(format[i]); - pandecode_indent--; - printf("},\n"); + /* Sanity check sRGB flag is applied to RGB, per the name */ + if (srgb && lower_format[0] != 'r') + pandecode_msg("XXX: sRGB applied to non-colour format\n"); + + /* Just prefix with an s, so you get formats like srgba8_unorm */ + if (srgb) + pandecode_log_cont("s"); + + pandecode_log_cont("%s", lower_format); + free(lower_format); } static void -pandecode_swizzle(unsigned swizzle) +pandecode_swizzle(unsigned swizzle, enum mali_format format) { - pandecode_prop("swizzle = %s | (%s << 3) | (%s << 6) | (%s << 9)", - pandecode_channel((swizzle >> 0) & 0x7), - pandecode_channel((swizzle >> 3) & 0x7), - pandecode_channel((swizzle >> 6) & 0x7), - pandecode_channel((swizzle >> 9) & 0x7)); + /* First, do some validation */ + bool trivial_swizzle = pandecode_validate_format_swizzle( + format, swizzle); + + if (trivial_swizzle) + return; + + /* Next, print the swizzle */ + pandecode_log_cont("."); + + static const char components[] = "rgba01"; + + for (unsigned c = 0; c < 4; ++c) { + enum mali_channel chan = (swizzle >> (3 * c)) & 0x7; + + if (chan >= MALI_CHANNEL_RESERVED_0) { + pandecode_log("XXX: invalid swizzle channel %d\n", chan); + continue; + } + pandecode_log_cont("%c", components[chan]); + } } static void @@ -656,19 +926,30 @@ pandecode_prop("unk2 = 0x%" PRIx32, format.unk2); pandecode_prop("unk3 = 0x%" PRIx32, format.unk3); - pandecode_prop("block = %s", - pandecode_mfbd_block_format(format.block)); + pandecode_prop("block = %s", pandecode_block_format(format.block)); + + /* TODO: Map formats so we can check swizzles and print nicely */ + pandecode_log("swizzle"); + pandecode_swizzle(format.swizzle, MALI_RGBA8_UNORM); + pandecode_log_cont(",\n"); pandecode_prop("nr_channels = MALI_POSITIVE(%d)", - MALI_NEGATIVE(format.nr_channels)); + (format.nr_channels + 1)); pandecode_log(".flags = "); pandecode_log_decoded_flags(mfbd_fmt_flag_info, format.flags); pandecode_log_cont(",\n"); - pandecode_swizzle(format.swizzle); - - pandecode_prop("no_preload = 0x%" PRIx32, format.no_preload); + /* In theory, the no_preload bit can be cleared to enable MFBD preload, + * which is a faster hardware-based alternative to the wallpaper method + * to preserve framebuffer contents across frames. In practice, MFBD + * preload is buggy on Midgard, and so this is a chicken bit. If this + * bit isn't set, most likely something broke unrelated to preload */ + + if (!format.no_preload) { + pandecode_msg("XXX: buggy MFBD preload enabled - chicken bit should be clear\n"); + pandecode_prop("no_preload = 0x%" PRIx32, format.no_preload); + } if (format.zero) pandecode_prop("zero = 0x%" PRIx32, format.zero); @@ -683,7 +964,7 @@ pandecode_log("struct bifrost_render_target rts_list_%"PRIx64"_%d[] = {\n", gpu_va, job_no); pandecode_indent++; - for (int i = 0; i < MALI_NEGATIVE(fb->rt_count_1); i++) { + for (int i = 0; i < (fb->rt_count_1 + 1); i++) { mali_ptr rt_va = gpu_va + i * sizeof(struct bifrost_render_target); struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(rt_va); @@ -694,7 +975,7 @@ pandecode_rt_format(rt->format); - if (rt->format.block == MALI_MFBD_BLOCK_AFBC) { + if (rt->format.block == MALI_BLOCK_AFBC) { pandecode_log(".afbc = {\n"); pandecode_indent++; @@ -707,18 +988,11 @@ pandecode_indent--; pandecode_log("},\n"); - } else { - pandecode_log(".chunknown = {\n"); - pandecode_indent++; - - pandecode_prop("unk = 0x%" PRIx64, rt->chunknown.unk); - - char *a = pointer_as_memory_reference(rt->chunknown.pointer); - pandecode_prop("pointer = %s", a); - free(a); - - pandecode_indent--; - pandecode_log("},\n"); + } else if (rt->afbc.metadata || rt->afbc.stride || rt->afbc.unk) { + pandecode_msg("XXX: AFBC disabled but AFBC field set (0x%lX, 0x%x, 0x%x)\n", + rt->afbc.metadata, + rt->afbc.stride, + rt->afbc.unk); } MEMORY_PROP(rt, framebuffer); @@ -732,7 +1006,7 @@ } if (rt->zero1 || rt->zero2 || rt->zero3) { - pandecode_msg("render target zeros tripped\n"); + pandecode_msg("XXX: render target zeros tripped\n"); pandecode_prop("zero1 = 0x%" PRIx64, rt->zero1); pandecode_prop("zero2 = 0x%" PRIx32, rt->zero2); pandecode_prop("zero3 = 0x%" PRIx32, rt->zero3); @@ -746,12 +1020,14 @@ pandecode_log("};\n"); } -static unsigned -pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets) +static struct pandecode_fbd +pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_compute) { struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); const struct bifrost_framebuffer *PANDECODE_PTR_VAR(fb, mem, (mali_ptr) gpu_va); + struct pandecode_fbd info; + if (fb->sample_locations) { /* The blob stores all possible sample locations in a single buffer * allocated on startup, and just switches the pointer when switching @@ -782,6 +1058,7 @@ pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; + pandecode_prop("stack_shift = 0x%x", fb->stack_shift); pandecode_prop("unk0 = 0x%x", fb->unk0); if (fb->sample_locations) @@ -791,6 +1068,10 @@ * now */ MEMORY_PROP(fb, unknown1); + info.width = fb->width1 + 1; + info.height = fb->height1 + 1; + info.rt_count = fb->rt_count_1 + 1; + pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1); pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1); pandecode_prop("width2 = MALI_POSITIVE(%d)", fb->width2 + 1); @@ -805,16 +1086,26 @@ pandecode_log_decoded_flags(mfbd_flag_info, fb->mfbd_flags); pandecode_log_cont(",\n"); - pandecode_prop("clear_stencil = 0x%x", fb->clear_stencil); - pandecode_prop("clear_depth = %f", fb->clear_depth); + if (fb->clear_stencil) + pandecode_prop("clear_stencil = 0x%x", fb->clear_stencil); + + if (fb->clear_depth) + pandecode_prop("clear_depth = %f", fb->clear_depth); + + /* TODO: What is this? Let's not blow up.. */ + if (fb->unknown2 != 0x1F) + pandecode_prop("unknown2 = 0x%x", fb->unknown2); pandecode_prop("unknown2 = 0x%x", fb->unknown2); MEMORY_PROP(fb, scratchpad); const struct midgard_tiler_descriptor t = fb->tiler; - pandecode_midgard_tiler_descriptor(&t); + if (!is_compute) + pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment, true); + else + pandecode_msg("XXX: skipping compute MFBD, fixme\n"); if (fb->zero3 || fb->zero4) { - pandecode_msg("framebuffer zeros tripped\n"); + pandecode_msg("XXX: framebuffer zeros tripped\n"); pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3); pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4); } @@ -824,7 +1115,9 @@ gpu_va += sizeof(struct bifrost_framebuffer); - if ((fb->mfbd_flags & MALI_MFBD_EXTRA) && with_render_targets) { + info.has_extra = (fb->mfbd_flags & MALI_MFBD_EXTRA) && is_fragment; + + if (info.has_extra) { mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); const struct bifrost_fb_extra *PANDECODE_PTR_VAR(fbx, mem, (mali_ptr) gpu_va); @@ -836,11 +1129,17 @@ if (fbx->checksum_stride) pandecode_prop("checksum_stride = %d", fbx->checksum_stride); - pandecode_log(".flags = "); - pandecode_log_decoded_flags(mfbd_extra_flag_info, fbx->flags); + pandecode_log(".flags_hi = "); + pandecode_log_decoded_flags(mfbd_extra_flag_hi_info, fbx->flags_lo); + pandecode_log_cont(",\n"); + + pandecode_log(".flags_lo = "); + pandecode_log_decoded_flags(mfbd_extra_flag_lo_info, fbx->flags_lo); pandecode_log_cont(",\n"); - if (fbx->flags & MALI_EXTRA_AFBC_ZS) { + pandecode_prop("zs_block = %s\n", pandecode_block_format(fbx->zs_block)); + + if (fbx->zs_block == MALI_BLOCK_AFBC) { pandecode_log(".ds_afbc = {\n"); pandecode_indent++; @@ -850,7 +1149,7 @@ MEMORY_PROP_DIR(fbx->ds_afbc, depth_stencil); if (fbx->ds_afbc.zero1 || fbx->ds_afbc.padding) { - pandecode_msg("Depth/stencil AFBC zeros tripped\n"); + pandecode_msg("XXX: Depth/stencil AFBC zeros tripped\n"); pandecode_prop("zero1 = 0x%" PRIx32, fbx->ds_afbc.zero1); pandecode_prop("padding = 0x%" PRIx64, @@ -867,18 +1166,22 @@ MEMORY_PROP_DIR(fbx->ds_linear, depth); pandecode_prop("depth_stride = %d", fbx->ds_linear.depth_stride); + } else if (fbx->ds_linear.depth_stride) { + pandecode_msg("XXX: depth stride zero tripped %d\n", fbx->ds_linear.depth_stride); } if (fbx->ds_linear.stencil) { MEMORY_PROP_DIR(fbx->ds_linear, stencil); pandecode_prop("stencil_stride = %d", fbx->ds_linear.stencil_stride); + } else if (fbx->ds_linear.stencil_stride) { + pandecode_msg("XXX: stencil stride zero tripped %d\n", fbx->ds_linear.stencil_stride); } if (fbx->ds_linear.depth_stride_zero || fbx->ds_linear.stencil_stride_zero || fbx->ds_linear.zero1 || fbx->ds_linear.zero2) { - pandecode_msg("Depth/stencil zeros tripped\n"); + pandecode_msg("XXX: Depth/stencil zeros tripped\n"); pandecode_prop("depth_stride_zero = 0x%x", fbx->ds_linear.depth_stride_zero); pandecode_prop("stencil_stride_zero = 0x%x", @@ -894,7 +1197,7 @@ } if (fbx->zero3 || fbx->zero4) { - pandecode_msg("fb_extra zeros tripped\n"); + pandecode_msg("XXX: fb_extra zeros tripped\n"); pandecode_prop("zero3 = 0x%" PRIx64, fbx->zero3); pandecode_prop("zero4 = 0x%" PRIx64, fbx->zero4); } @@ -905,11 +1208,10 @@ gpu_va += sizeof(struct bifrost_fb_extra); } - if (with_render_targets) + if (is_fragment) pandecode_render_target(gpu_va, job_no, fb); - /* Passback the render target count */ - return MALI_NEGATIVE(fb->rt_count_1); + return info; } /* Just add a comment decoding the shift/odd fields forming the padded vertices @@ -1011,50 +1313,100 @@ static void pandecode_attributes(const struct pandecode_mapped_memory *mem, mali_ptr addr, int job_no, char *suffix, - int count, bool varying) + int count, bool varying, enum mali_job_type job_type) { - char *prefix = varying ? "varyings" : "attributes"; + char *prefix = varying ? "varying" : "attribute"; + assert(addr); - if (!addr) { - pandecode_msg("no %s\n", prefix); + if (!count) { + pandecode_msg("warn: No %s records\n", prefix); return; } union mali_attr *attr = pandecode_fetch_gpu_mem(mem, addr, sizeof(union mali_attr) * count); - char base[128]; - snprintf(base, sizeof(base), "%s_data_%d%s", prefix, job_no, suffix); - for (int i = 0; i < count; ++i) { + /* First, check for special records */ + if (attr[i].elements < MALI_RECORD_SPECIAL) { + if (attr[i].size) + pandecode_msg("XXX: tripped size=%d\n", attr[i].size); + + if (attr[i].stride) { + /* gl_InstanceID passes a magic divisor in the + * stride field to divide by the padded vertex + * count. No other records should do so, so + * stride should otherwise be zero. Note that + * stride in the usual attribute sense doesn't + * apply to special records. */ + + bool has_divisor = attr[i].elements == MALI_ATTR_INSTANCEID; + + pandecode_log_cont("/* %smagic divisor = %X */ ", + has_divisor ? "" : "XXX: ", attr[i].stride); + } + + if (attr[i].shift || attr[i].extra_flags) { + /* Attributes use these fields for + * instancing/padding/etc type issues, but + * varyings don't */ + + pandecode_log_cont("/* %sshift=%d, extra=%d */ ", + varying ? "XXX: " : "", + attr[i].shift, attr[i].extra_flags); + } + + /* Print the special record name */ + bool attribute = false; + pandecode_log("%s_%d = %s;\n", prefix, i, pandecode_special_record(attr[i].elements, &attribute)); + + /* Sanity check */ + if (attribute == varying) + pandecode_msg("XXX: mismatched special record\n"); + + continue; + } + enum mali_attr_mode mode = attr[i].elements & 7; if (mode == MALI_ATTR_UNUSED) - continue; + pandecode_msg("XXX: unused attribute record\n"); - mali_ptr raw_elements = attr[i].elements & ~7; + /* For non-linear records, we need to print the type of record */ + if (mode != MALI_ATTR_LINEAR) + pandecode_log_cont("%s ", pandecode_attr_mode_short(mode)); + + /* Print the name to link with attr_meta */ + pandecode_log_cont("%s_%d", prefix, i); + + /* Print the stride and size */ + pandecode_log_cont("<%u>[%u]", attr[i].stride, attr[i].size); - /* TODO: Do we maybe want to dump the attribute values - * themselves given the specified format? Or is that too hard? - * */ + /* TODO: Sanity check the quotient itself. It must be equal to + * (or be greater than, if the driver added padding) the padded + * vertex count. */ + /* Finally, print the pointer */ + mali_ptr raw_elements = attr[i].elements & ~7; char *a = pointer_as_memory_reference(raw_elements); - pandecode_log("mali_ptr %s_%d_p = %s;\n", base, i, a); + pandecode_log_cont(" = (%s);\n", a); free(a); - } - pandecode_log("union mali_attr %s_%d[] = {\n", prefix, job_no); - pandecode_indent++; + /* Check the pointer */ + pandecode_validate_buffer(raw_elements, attr[i].size); - for (int i = 0; i < count; ++i) { - pandecode_log("{\n"); - pandecode_indent++; + /* shift/extra_flags exist only for instanced */ + if (attr[i].shift | attr[i].extra_flags) { + /* These are set to random values by the blob for + * varyings, most likely a symptom of uninitialized + * memory where the hardware masked the bug. As such we + * put this at a warning, not an error. */ - unsigned mode = attr[i].elements & 7; - pandecode_prop("elements = (%s_%d_p) | %s", base, i, pandecode_attr_mode(mode)); - pandecode_prop("shift = %d", attr[i].shift); - pandecode_prop("extra_flags = %d", attr[i].extra_flags); - pandecode_prop("stride = 0x%" PRIx32, attr[i].stride); - pandecode_prop("size = 0x%" PRIx32, attr[i].size); + if (mode == MALI_ATTR_LINEAR) + pandecode_msg("warn: instancing fields set for linear\n"); + + pandecode_prop("shift = %d", attr[i].shift); + pandecode_prop("extra_flags = %d", attr[i].extra_flags); + } /* Decode further where possible */ @@ -1064,9 +1416,6 @@ attr[i].extra_flags); } - pandecode_indent--; - pandecode_log("}, \n"); - if (mode == MALI_ATTR_NPOT_DIVIDE) { i++; pandecode_log("{\n"); @@ -1074,7 +1423,7 @@ pandecode_prop("unk = 0x%x", attr[i].unk); pandecode_prop("magic_divisor = 0x%08x", attr[i].magic_divisor); if (attr[i].zero != 0) - pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero); + pandecode_prop("XXX: zero tripped (0x%x)\n", attr[i].zero); pandecode_prop("divisor = %d", attr[i].divisor); pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags); pandecode_indent--; @@ -1083,8 +1432,7 @@ } - pandecode_indent--; - pandecode_log("};\n"); + pandecode_log("\n"); } static mali_ptr @@ -1100,21 +1448,14 @@ return shader_ptr; } -static bool -all_zero(unsigned *buffer, unsigned count) -{ - for (unsigned i = 0; i < count; ++i) { - if (buffer[i]) - return false; - } - - return true; -} - static void pandecode_stencil(const char *name, const struct mali_stencil_test *stencil) { - if (all_zero((unsigned *) stencil, sizeof(stencil) / sizeof(unsigned))) + unsigned any_nonzero = + stencil->ref | stencil->mask | stencil->func | + stencil->sfail | stencil->dpfail | stencil->dppass; + + if (any_nonzero == 0) return; const char *func = pandecode_func(stencil->func); @@ -1123,7 +1464,7 @@ const char *dppass = pandecode_stencil_op(stencil->dppass); if (stencil->zero) - pandecode_msg("Stencil zero tripped: %X\n", stencil->zero); + pandecode_msg("XXX: stencil zero tripped: %X\n", stencil->zero); pandecode_log(".stencil_%s = {\n", name); pandecode_indent++; @@ -1141,7 +1482,7 @@ pandecode_blend_equation(const struct mali_blend_equation *blend) { if (blend->zero1) - pandecode_msg("Blend zero tripped: %X\n", blend->zero1); + pandecode_msg("XXX: blend zero tripped: %X\n", blend->zero1); pandecode_log(".equation = {\n"); pandecode_indent++; @@ -1196,7 +1537,8 @@ static mali_ptr pandecode_midgard_blend(union midgard_blend *blend, bool is_shader) { - if (all_zero((unsigned *) blend, sizeof(blend) / sizeof(unsigned))) + /* constant/equation is in a union */ + if (!blend->shader) return 0; pandecode_log(".blend = {\n"); @@ -1239,6 +1581,11 @@ return shader; } +/* Attributes and varyings have descriptor records, which contain information + * about their format and ordering with the attribute/varying buffers. We'll + * want to validate that the combinations specified are self-consistent. + */ + static int pandecode_attribute_meta(int job_no, int count, const struct mali_vertex_tiler_postfix *v, bool varying, char *suffix) { @@ -1247,11 +1594,8 @@ unsigned max_index = 0; snprintf(base, sizeof(base), "%s_meta", prefix); - pandecode_log("struct mali_attr_meta %s_%d%s[] = {\n", base, job_no, suffix); - pandecode_indent++; - struct mali_attr_meta *attr_meta; - mali_ptr p = varying ? (v->varying_meta & ~0xF) : v->attribute_meta; + mali_ptr p = varying ? v->varying_meta : v->attribute_meta; struct pandecode_mapped_memory *attr_mem = pandecode_find_mapped_gpu_mem_containing(p); @@ -1259,51 +1603,65 @@ attr_meta = pandecode_fetch_gpu_mem(attr_mem, p, sizeof(*attr_mem)); - pandecode_log("{\n"); - pandecode_indent++; - pandecode_prop("index = %d", attr_meta->index); + /* If the record is discard, it should be zero for everything else */ + + if (attr_meta->format == MALI_VARYING_DISCARD) { + uint64_t zero = + attr_meta->index | + attr_meta->unknown1 | + attr_meta->unknown3 | + attr_meta->src_offset; + + if (zero) + pandecode_msg("XXX: expected empty record for varying discard\n"); + + /* We want to look for a literal 0000 swizzle -- this + * is not encoded with all zeroes, however */ + + enum mali_channel z = MALI_CHANNEL_ZERO; + unsigned zero_swizzle = z | (z << 3) | (z << 6) | (z << 9); + bool good_swizzle = attr_meta->swizzle == zero_swizzle; + + if (!good_swizzle) + pandecode_msg("XXX: expected zero swizzle for discard\n"); + + if (!varying) + pandecode_msg("XXX: cannot discard attribute\n"); + + /* If we're all good, omit the record */ + if (!zero && varying && good_swizzle) { + pandecode_log("/* discarded varying */\n"); + continue; + } + } if (attr_meta->index > max_index) max_index = attr_meta->index; - pandecode_swizzle(attr_meta->swizzle); - pandecode_prop("format = %s", pandecode_format(attr_meta->format)); - - pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1); - pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3); - pandecode_prop("src_offset = %d", attr_meta->src_offset); - pandecode_indent--; - pandecode_log("},\n"); - } + if (attr_meta->unknown1 != 0x2) { + pandecode_msg("XXX: expected unknown1 = 0x2\n"); + pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1); + } - pandecode_indent--; - pandecode_log("};\n"); + if (attr_meta->unknown3) { + pandecode_msg("XXX: unexpected unknown3 set\n"); + pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3); + } - return count ? (max_index + 1) : 0; -} + pandecode_format_short(attr_meta->format, false); + pandecode_log_cont(" %s_%u", prefix, attr_meta->index); -static void -pandecode_indices(uintptr_t pindices, uint32_t index_count, int job_no) -{ - struct pandecode_mapped_memory *imem = pandecode_find_mapped_gpu_mem_containing(pindices); - - if (imem) { - /* Indices are literally just a u32 array :) */ + if (attr_meta->src_offset) + pandecode_log_cont("[%u]", attr_meta->src_offset); - uint32_t *PANDECODE_PTR_VAR(indices, imem, pindices); + pandecode_swizzle(attr_meta->swizzle, attr_meta->format); - pandecode_log("uint32_t indices_%d[] = {\n", job_no); - pandecode_indent++; + pandecode_log_cont(";\n"); + } - for (unsigned i = 0; i < (index_count + 1); i += 3) - pandecode_log("%d, %d, %d,\n", - indices[i], - indices[i + 1], - indices[i + 2]); + pandecode_log("\n"); - pandecode_indent--; - pandecode_log("};\n"); - } + return count ? (max_index + 1) : 0; } /* return bits [lo, hi) of word */ @@ -1317,34 +1675,64 @@ } static void -pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no) +pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no, bool graphics) { pandecode_log_cont("{\n"); pandecode_indent++; - pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count); - pandecode_prop("size_y_shift = %d", p->size_y_shift); - pandecode_prop("size_z_shift = %d", p->size_z_shift); - pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift); - pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift); - pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift); - pandecode_prop("workgroups_x_shift_2 = 0x%" PRIx32, p->workgroups_x_shift_2); - /* Decode invocation_count. See the comment before the definition of * invocation_count for an explanation. */ - pandecode_msg("size: (%d, %d, %d)\n", - bits(p->invocation_count, 0, p->size_y_shift) + 1, - bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1, - bits(p->invocation_count, p->size_z_shift, - p->workgroups_x_shift) + 1); - pandecode_msg("workgroups: (%d, %d, %d)\n", - bits(p->invocation_count, p->workgroups_x_shift, - p->workgroups_y_shift) + 1, - bits(p->invocation_count, p->workgroups_y_shift, - p->workgroups_z_shift) + 1, - bits(p->invocation_count, p->workgroups_z_shift, - 32) + 1); + + unsigned size_y_shift = bits(p->invocation_shifts, 0, 5); + unsigned size_z_shift = bits(p->invocation_shifts, 5, 10); + unsigned workgroups_x_shift = bits(p->invocation_shifts, 10, 16); + unsigned workgroups_y_shift = bits(p->invocation_shifts, 16, 22); + unsigned workgroups_z_shift = bits(p->invocation_shifts, 22, 28); + unsigned workgroups_x_shift_2 = bits(p->invocation_shifts, 28, 32); + + unsigned size_x = bits(p->invocation_count, 0, size_y_shift) + 1; + unsigned size_y = bits(p->invocation_count, size_y_shift, size_z_shift) + 1; + unsigned size_z = bits(p->invocation_count, size_z_shift, workgroups_x_shift) + 1; + + unsigned groups_x = bits(p->invocation_count, workgroups_x_shift, workgroups_y_shift) + 1; + unsigned groups_y = bits(p->invocation_count, workgroups_y_shift, workgroups_z_shift) + 1; + unsigned groups_z = bits(p->invocation_count, workgroups_z_shift, 32) + 1; + + /* Even though we have this decoded, we want to ensure that the + * representation is "unique" so we don't lose anything by printing only + * the final result. More specifically, we need to check that we were + * passed something in canonical form, since the definition per the + * hardware is inherently not unique. How? Well, take the resulting + * decode and pack it ourselves! If it is bit exact with what we + * decoded, we're good to go. */ + + struct mali_vertex_tiler_prefix ref; + panfrost_pack_work_groups_compute(&ref, groups_x, groups_y, groups_z, size_x, size_y, size_z, graphics); + + bool canonical = + (p->invocation_count == ref.invocation_count) && + (p->invocation_shifts == ref.invocation_shifts); + + if (!canonical) { + pandecode_msg("XXX: non-canonical workgroups packing\n"); + pandecode_msg("expected: %X, %X", + ref.invocation_count, + ref.invocation_shifts); + + pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count); + pandecode_prop("size_y_shift = %d", size_y_shift); + pandecode_prop("size_z_shift = %d", size_z_shift); + pandecode_prop("workgroups_x_shift = %d", workgroups_x_shift); + pandecode_prop("workgroups_y_shift = %d", workgroups_y_shift); + pandecode_prop("workgroups_z_shift = %d", workgroups_z_shift); + pandecode_prop("workgroups_x_shift_2 = %d", workgroups_x_shift_2); + } + + /* Regardless, print the decode */ + pandecode_msg("size (%d, %d, %d), count (%d, %d, %d)\n", + size_x, size_y, size_z, + groups_x, groups_y, groups_z); /* TODO: Decode */ if (p->unknown_draw) @@ -1352,22 +1740,43 @@ pandecode_prop("workgroups_x_shift_3 = 0x%" PRIx32, p->workgroups_x_shift_3); - pandecode_prop("draw_mode = %s", pandecode_draw_mode(p->draw_mode)); + if (p->draw_mode != MALI_DRAW_NONE) + pandecode_prop("draw_mode = %s", pandecode_draw_mode(p->draw_mode)); /* Index count only exists for tiler jobs anyway */ if (p->index_count) pandecode_prop("index_count = MALI_POSITIVE(%" PRId32 ")", p->index_count + 1); + + unsigned index_raw_size = (p->unknown_draw & MALI_DRAW_INDEXED_SIZE); + index_raw_size >>= MALI_DRAW_INDEXED_SHIFT; + + /* Validate an index buffer is present if we need one. TODO: verify + * relationship between invocation_count and index_count */ + + if (p->indices) { + unsigned count = p->index_count; + + /* Grab the size */ + unsigned size = (index_raw_size == 0x3) ? 4 : index_raw_size; + + /* Ensure we got a size, and if so, validate the index buffer + * is large enough to hold a full set of indices of the given + * size */ + + if (!index_raw_size) + pandecode_msg("XXX: index size missing\n"); + else + pandecode_validate_buffer(p->indices, count * size); + } else if (index_raw_size) + pandecode_msg("XXX: unexpected index size %u\n", index_raw_size); + if (p->offset_bias_correction) pandecode_prop("offset_bias_correction = %d", p->offset_bias_correction); - DYN_MEMORY_PROP(p, job_no, indices); - - if (p->zero1) { - pandecode_msg("Zero tripped\n"); - pandecode_prop("zero1 = 0x%" PRIx32, p->zero1); - } + /* TODO: Figure out what this is. It's not zero */ + pandecode_prop("zero1 = 0x%" PRIx32, p->zero1); pandecode_indent--; pandecode_log("},\n"); @@ -1377,53 +1786,30 @@ pandecode_uniform_buffers(mali_ptr pubufs, int ubufs_count, int job_no) { struct pandecode_mapped_memory *umem = pandecode_find_mapped_gpu_mem_containing(pubufs); - struct mali_uniform_buffer_meta *PANDECODE_PTR_VAR(ubufs, umem, pubufs); for (int i = 0; i < ubufs_count; i++) { - mali_ptr ptr = ubufs[i].ptr << 2; - struct pandecode_mapped_memory *umem2 = pandecode_find_mapped_gpu_mem_containing(ptr); - uint32_t *PANDECODE_PTR_VAR(ubuf, umem2, ptr); - char name[50]; - snprintf(name, sizeof(name), "ubuf_%d", i); - /* The blob uses ubuf 0 to upload internal stuff and - * uniforms that won't fit/are accessed indirectly, so - * it puts it in the batchbuffer. - */ - pandecode_log("uint32_t %s_%d[] = {\n", name, job_no); - pandecode_indent++; - - for (int j = 0; j <= ubufs[i].size; j++) { - for (int k = 0; k < 4; k++) { - if (k == 0) - pandecode_log("0x%"PRIx32", ", ubuf[4 * j + k]); - else - pandecode_log_cont("0x%"PRIx32", ", ubuf[4 * j + k]); + unsigned size = (ubufs[i].size + 1) * 16; + mali_ptr addr = ubufs[i].ptr << 2; - } + pandecode_validate_buffer(addr, size); - pandecode_log_cont("\n"); - } - - pandecode_indent--; - pandecode_log("};\n"); + char *ptr = pointer_as_memory_reference(ubufs[i].ptr << 2); + pandecode_log("ubuf_%d[%u] = %s;\n", i, size, ptr); + free(ptr); } - pandecode_log("struct mali_uniform_buffer_meta uniform_buffers_%"PRIx64"_%d[] = {\n", - pubufs, job_no); - pandecode_indent++; + pandecode_log("\n"); +} - for (int i = 0; i < ubufs_count; i++) { - pandecode_log("{\n"); - pandecode_indent++; - pandecode_prop("size = MALI_POSITIVE(%d)", ubufs[i].size + 1); - pandecode_prop("ptr = ubuf_%d_%d_p >> 2", i, job_no); - pandecode_indent--; - pandecode_log("},\n"); - } +static void +pandecode_uniforms(mali_ptr uniforms, unsigned uniform_count) +{ + pandecode_validate_buffer(uniforms, uniform_count * 16); - pandecode_indent--; - pandecode_log("};\n"); + char *ptr = pointer_as_memory_reference(uniforms); + pandecode_log("vec4 uniforms[%u] = %s;\n", uniform_count, ptr); + free(ptr); } static void @@ -1434,8 +1820,10 @@ struct bifrost_scratchpad *PANDECODE_PTR_VAR(scratchpad, mem, pscratchpad); - if (scratchpad->zero) - pandecode_msg("XXX scratchpad zero tripped"); + if (scratchpad->zero) { + pandecode_msg("XXX: scratchpad zero tripped"); + pandecode_prop("zero = 0x%x\n", scratchpad->zero); + } pandecode_log("struct bifrost_scratchpad scratchpad_%"PRIx64"_%d%s = {\n", pscratchpad, job_no, suffix); pandecode_indent++; @@ -1447,11 +1835,23 @@ pandecode_log("};\n"); } +static const char * +shader_type_for_job(unsigned type) +{ + switch (type) { + case JOB_TYPE_VERTEX: return "VERTEX"; + case JOB_TYPE_TILER: return "FRAGMENT"; + case JOB_TYPE_COMPUTE: return "COMPUTE"; + default: + return "UNKNOWN"; + } +} + static unsigned shader_id = 0; -static void +static struct midgard_disasm_stats pandecode_shader_disassemble(mali_ptr shader_ptr, int shader_no, int type, - bool is_bifrost, unsigned nr_regs) + bool is_bifrost, unsigned gpu_id) { struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(shader_ptr); uint8_t *PANDECODE_PTR_VAR(code, mem, shader_ptr); @@ -1462,55 +1862,319 @@ /* Print some boilerplate to clearly denote the assembly (which doesn't * obey indentation rules), and actually do the disassembly! */ - printf("\n\n"); - - char prefix[512]; + pandecode_log_cont("\n\n"); - snprintf(prefix, sizeof(prefix) - 1, "shader%d - %s shader: ", - shader_id++, - (type == JOB_TYPE_TILER) ? "FRAGMENT" : "VERTEX"); + struct midgard_disasm_stats stats; if (is_bifrost) { - disassemble_bifrost(code, sz, false); + disassemble_bifrost(pandecode_dump_stream, code, sz, false); + + /* TODO: Extend stats to Bifrost */ + stats.texture_count = -128; + stats.sampler_count = -128; + stats.attribute_count = -128; + stats.varying_count = -128; + stats.uniform_count = -128; + stats.uniform_buffer_count = -128; + stats.work_count = -128; + + stats.instruction_count = 0; + stats.bundle_count = 0; + stats.quadword_count = 0; + stats.helper_invocations = false; } else { - disassemble_midgard(code, sz, true, nr_regs, prefix); + stats = disassemble_midgard(pandecode_dump_stream, + code, sz, gpu_id, + type == JOB_TYPE_TILER ? + MESA_SHADER_FRAGMENT : MESA_SHADER_VERTEX); + } + + /* Print shader-db stats. Skip COMPUTE jobs since they are used for + * driver-internal purposes with the blob and interfere */ + + bool should_shaderdb = type != JOB_TYPE_COMPUTE; + + if (should_shaderdb) { + unsigned nr_threads = + (stats.work_count <= 4) ? 4 : + (stats.work_count <= 8) ? 2 : + 1; + + pandecode_log_cont("shader%d - MESA_SHADER_%s shader: " + "%u inst, %u bundles, %u quadwords, " + "%u registers, %u threads, 0 loops, 0:0 spills:fills\n\n\n", + shader_id++, + shader_type_for_job(type), + stats.instruction_count, stats.bundle_count, stats.quadword_count, + stats.work_count, nr_threads); } - printf("\n\n"); + + return stats; } static void -pandecode_vertex_tiler_postfix_pre(const struct mali_vertex_tiler_postfix *p, +pandecode_texture(mali_ptr u, + struct pandecode_mapped_memory *tmem, + unsigned job_no, unsigned tex) +{ + struct mali_texture_descriptor *PANDECODE_PTR_VAR(t, tmem, u); + + pandecode_log("struct mali_texture_descriptor texture_descriptor_%"PRIx64"_%d_%d = {\n", u, job_no, tex); + pandecode_indent++; + + struct mali_texture_format f = t->format; + + /* See the definiton of enum mali_texture_type */ + + bool is_cube = f.type == MALI_TEX_CUBE; + unsigned dimension = is_cube ? 2 : f.type; + + pandecode_make_indent(); + + /* TODO: Are there others? */ + bool is_zs = f.format == MALI_Z32_UNORM; + + /* Recall Z/S switched the meaning of linear/tiled .. */ + if (is_zs && f.layout == MALI_TEXTURE_LINEAR) + pandecode_msg("XXX: depth/stencil cannot be tiled\n"); + + /* Print the layout. Default is linear; a modifier can denote AFBC or + * u-interleaved/tiled modes */ + + if (f.layout == MALI_TEXTURE_AFBC) + pandecode_log_cont("afbc"); + else if (f.layout == MALI_TEXTURE_TILED) + pandecode_log_cont("tiled"); + else if (f.layout == MALI_TEXTURE_LINEAR) + pandecode_log_cont("linear"); + else + pandecode_msg("XXX: invalid texture layout 0x%X\n", f.layout); + + pandecode_swizzle(t->swizzle, f.format); + pandecode_log_cont(" "); + + /* Distinguish cube/2D with modifier */ + + if (is_cube) + pandecode_log_cont("cube "); + + pandecode_format_short(f.format, f.srgb); + pandecode_swizzle(f.swizzle, f.format); + + /* All four width/height/depth/array_size dimensions are present + * regardless of the type of texture, but it is an error to have + * non-zero dimensions for unused dimensions. Verify this. array_size + * can always be set, as can width. */ + + if (t->height && dimension < 2) + pandecode_msg("XXX: nonzero height for <2D texture\n"); + + if (t->depth && dimension < 3) + pandecode_msg("XXX: nonzero depth for <2D texture\n"); + + /* Print only the dimensions that are actually there */ + + pandecode_log_cont(": %d", t->width + 1); + + if (dimension >= 2) + pandecode_log_cont("x%u", t->height + 1); + + if (dimension >= 3) + pandecode_log_cont("x%u", t->depth + 1); + + if (t->array_size) + pandecode_log_cont("[%u]", t->array_size + 1); + + if (t->levels) + pandecode_log_cont(" mip %u", t->levels); + + pandecode_log_cont("\n"); + + if (f.unknown1 | f.zero) { + pandecode_msg("XXX: texture format zero tripped\n"); + pandecode_prop("unknown1 = %" PRId32, f.unknown1); + pandecode_prop("zero = %" PRId32, f.zero); + } + + if (!f.unknown2) { + pandecode_msg("XXX: expected unknown texture bit set\n"); + pandecode_prop("unknown2 = %" PRId32, f.unknown1); + } + + if (t->swizzle_zero) { + pandecode_msg("XXX: swizzle zero tripped\n"); + pandecode_prop("swizzle_zero = %d", t->swizzle_zero); + } + + if (t->unknown3 | t->unknown3A | t->unknown5 | t->unknown6 | t->unknown7) { + pandecode_msg("XXX: texture zero tripped\n"); + pandecode_prop("unknown3 = %" PRId16, t->unknown3); + pandecode_prop("unknown3A = %" PRId8, t->unknown3A); + pandecode_prop("unknown5 = 0x%" PRIx32, t->unknown5); + pandecode_prop("unknown6 = 0x%" PRIx32, t->unknown6); + pandecode_prop("unknown7 = 0x%" PRIx32, t->unknown7); + } + + pandecode_log(".payload = {\n"); + pandecode_indent++; + + /* A bunch of bitmap pointers follow. + * We work out the correct number, + * based on the mipmap/cubemap + * properties, but dump extra + * possibilities to futureproof */ + + int bitmap_count = t->levels + 1; + + /* Miptree for each face */ + if (f.type == MALI_TEX_CUBE) + bitmap_count *= 6; + else if (f.type == MALI_TEX_3D) + bitmap_count *= t->depth; + + /* Array of textures */ + bitmap_count *= (t->array_size + 1); + + /* Stride for each element */ + if (f.manual_stride) + bitmap_count *= 2; + + mali_ptr *pointers_and_strides = pandecode_fetch_gpu_mem(tmem, + u + sizeof(*t), sizeof(mali_ptr) * bitmap_count); + for (int i = 0; i < bitmap_count; ++i) { + /* How we dump depends if this is a stride or a pointer */ + + if (f.manual_stride && (i & 1)) { + /* signed 32-bit snuck in as a 64-bit pointer */ + uint64_t stride_set = pointers_and_strides[i]; + uint32_t clamped_stride = stride_set; + int32_t stride = clamped_stride; + assert(stride_set == clamped_stride); + pandecode_log("(mali_ptr) %d /* stride */, \n", stride); + } else { + char *a = pointer_as_memory_reference(pointers_and_strides[i]); + pandecode_log("%s, \n", a); + free(a); + } + } + + pandecode_indent--; + pandecode_log("},\n"); + + pandecode_indent--; + pandecode_log("};\n"); +} + +/* For shader properties like texture_count, we have a claimed property in the shader_meta, and the actual Truth from static analysis (this may just be an upper limit). We validate accordingly */ + +static void +pandecode_shader_prop(const char *name, unsigned claim, signed truth, bool fuzzy) +{ + /* Nothing to do */ + if (claim == truth) + return; + + if (fuzzy) + assert(truth >= 0); + + if ((truth >= 0) && !fuzzy) { + pandecode_msg("%s: expected %s = %d, claimed %u\n", + (truth < claim) ? "warn" : "XXX", + name, truth, claim); + } else if ((claim > -truth) && !fuzzy) { + pandecode_msg("XXX: expected %s <= %u, claimed %u\n", + name, -truth, claim); + } else if (fuzzy && (claim < truth)) + pandecode_msg("XXX: expected %s >= %u, claimed %u\n", + name, truth, claim); + + pandecode_log(".%s = %" PRId16, name, claim); + + if (fuzzy) + pandecode_log_cont(" /* %u used */", truth); + + pandecode_log_cont(",\n"); +} + +static void +pandecode_blend_shader_disassemble(mali_ptr shader, int job_no, int job_type, + bool is_bifrost, unsigned gpu_id) +{ + struct midgard_disasm_stats stats = + pandecode_shader_disassemble(shader, job_no, job_type, is_bifrost, gpu_id); + + bool has_texture = (stats.texture_count > 0); + bool has_sampler = (stats.sampler_count > 0); + bool has_attribute = (stats.attribute_count > 0); + bool has_varying = (stats.varying_count > 0); + bool has_uniform = (stats.uniform_count > 0); + bool has_ubo = (stats.uniform_buffer_count > 0); + + if (has_texture || has_sampler) + pandecode_msg("XXX: blend shader accessing textures\n"); + + if (has_attribute || has_varying) + pandecode_msg("XXX: blend shader accessing interstage\n"); + + if (has_uniform || has_ubo) + pandecode_msg("XXX: blend shader accessing uniforms\n"); +} + +static void +pandecode_vertex_tiler_postfix_pre( + const struct mali_vertex_tiler_postfix *p, int job_no, enum mali_job_type job_type, - char *suffix, bool is_bifrost) + char *suffix, bool is_bifrost, unsigned gpu_id) { - mali_ptr shader_meta_ptr = (u64) (uintptr_t) (p->_shader_upper << 4); struct pandecode_mapped_memory *attr_mem; - unsigned rt_count = 1; - /* On Bifrost, since the tiler heap (for tiler jobs) and the scratchpad * are the only things actually needed from the FBD, vertex/tiler jobs * no longer reference the FBD -- instead, this field points to some * info about the scratchpad. */ + + struct pandecode_fbd fbd_info = { + /* Default for Bifrost */ + .rt_count = 1 + }; + if (is_bifrost) - pandecode_scratchpad(p->framebuffer & ~FBD_TYPE, job_no, suffix); + pandecode_scratchpad(p->framebuffer & ~1, job_no, suffix); else if (p->framebuffer & MALI_MFBD) - rt_count = pandecode_mfbd_bfr((u64) ((uintptr_t) p->framebuffer) & FBD_MASK, job_no, false); + fbd_info = pandecode_mfbd_bfr((u64) ((uintptr_t) p->framebuffer) & FBD_MASK, job_no, false, job_type == JOB_TYPE_COMPUTE); else if (job_type == JOB_TYPE_COMPUTE) pandecode_compute_fbd((u64) (uintptr_t) p->framebuffer, job_no); else - pandecode_sfbd((u64) (uintptr_t) p->framebuffer, job_no); + fbd_info = pandecode_sfbd((u64) (uintptr_t) p->framebuffer, job_no, false, gpu_id); int varying_count = 0, attribute_count = 0, uniform_count = 0, uniform_buffer_count = 0; int texture_count = 0, sampler_count = 0; - if (shader_meta_ptr) { - struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(shader_meta_ptr); - struct mali_shader_meta *PANDECODE_PTR_VAR(s, smem, shader_meta_ptr); + if (p->shader) { + struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(p->shader); + struct mali_shader_meta *PANDECODE_PTR_VAR(s, smem, p->shader); + + /* Disassemble ahead-of-time to get stats. Initialize with + * stats for the missing-shader case so we get validation + * there, too */ + + struct midgard_disasm_stats info = { + .texture_count = 0, + .sampler_count = 0, + .attribute_count = 0, + .varying_count = 0, + .work_count = 1, + + .uniform_count = -128, + .uniform_buffer_count = 0 + }; - pandecode_log("struct mali_shader_meta shader_meta_%"PRIx64"_%d%s = {\n", shader_meta_ptr, job_no, suffix); + if (s->shader & ~0xF) + info = pandecode_shader_disassemble(s->shader & ~0xF, job_no, job_type, is_bifrost, gpu_id); + + pandecode_log("struct mali_shader_meta shader_meta_%"PRIx64"_%d%s = {\n", p->shader, job_no, suffix); pandecode_indent++; /* Save for dumps */ @@ -1527,41 +2191,41 @@ uniform_buffer_count = s->midgard1.uniform_buffer_count; } - mali_ptr shader_ptr = pandecode_shader_address("shader", s->shader); - - pandecode_prop("texture_count = %" PRId16, s->texture_count); - pandecode_prop("sampler_count = %" PRId16, s->sampler_count); - pandecode_prop("attribute_count = %" PRId16, s->attribute_count); - pandecode_prop("varying_count = %" PRId16, s->varying_count); + pandecode_shader_address("shader", s->shader); - unsigned nr_registers = 0; + pandecode_shader_prop("texture_count", s->texture_count, info.texture_count, false); + pandecode_shader_prop("sampler_count", s->sampler_count, info.sampler_count, false); + pandecode_shader_prop("attribute_count", s->attribute_count, info.attribute_count, false); + pandecode_shader_prop("varying_count", s->varying_count, info.varying_count, false); + pandecode_shader_prop("uniform_buffer_count", + uniform_buffer_count, + info.uniform_buffer_count, true); - if (is_bifrost) { - pandecode_log(".bifrost1 = {\n"); - pandecode_indent++; + if (!is_bifrost) { + pandecode_shader_prop("uniform_count", + uniform_count, + info.uniform_count, false); - pandecode_prop("uniform_buffer_count = %" PRId32, s->bifrost1.uniform_buffer_count); - pandecode_prop("unk1 = 0x%" PRIx32, s->bifrost1.unk1); + pandecode_shader_prop("work_count", + s->midgard1.work_count, info.work_count, false); + } - pandecode_indent--; - pandecode_log("},\n"); + if (is_bifrost) { + pandecode_prop("bifrost1.unk1 = 0x%" PRIx32, s->bifrost1.unk1); } else { - pandecode_log(".midgard1 = {\n"); - pandecode_indent++; + bool helpers = s->midgard1.flags & MALI_HELPER_INVOCATIONS; + s->midgard1.flags &= ~MALI_HELPER_INVOCATIONS; - pandecode_prop("uniform_count = %" PRId16, s->midgard1.uniform_count); - pandecode_prop("uniform_buffer_count = %" PRId16, s->midgard1.uniform_buffer_count); - pandecode_prop("work_count = %" PRId16, s->midgard1.work_count); - nr_registers = s->midgard1.work_count; + if (helpers != info.helper_invocations) { + pandecode_msg("XXX: expected helpers %u but got %u\n", + info.helper_invocations, helpers); + } - pandecode_log(".flags = "); + pandecode_log(".midgard1.flags = "); pandecode_log_decoded_flags(shader_midgard1_flag_info, s->midgard1.flags); pandecode_log_cont(",\n"); - pandecode_prop("unknown2 = 0x%" PRIx32, s->midgard1.unknown2); - - pandecode_indent--; - pandecode_log("},\n"); + pandecode_prop("midgard1.unknown2 = 0x%" PRIx32, s->midgard1.unknown2); } if (s->depth_units || s->depth_factor) { @@ -1586,7 +2250,7 @@ /* We're not quite sure what these flags mean without the depth test, if anything */ - if (unknown2_3 & (MALI_DEPTH_TEST | MALI_DEPTH_FUNC_MASK)) { + if (unknown2_3 & (MALI_DEPTH_WRITEMASK | MALI_DEPTH_FUNC_MASK)) { const char *func = pandecode_func(MALI_GET_DEPTH_FUNC(unknown2_3)); unknown2_3 &= ~MALI_DEPTH_FUNC_MASK; @@ -1634,9 +2298,10 @@ if (!is_bifrost) { /* TODO: Blend shaders routing/disasm */ - union midgard_blend blend = s->blend; - pandecode_midgard_blend(&blend, false); + mali_ptr shader = pandecode_midgard_blend(&blend, s->unknown2_3 & MALI_HAS_BLEND_SHADER); + if (shader & ~0xF) + pandecode_blend_shader_disassemble(shader, job_no, job_type, false, gpu_id); } pandecode_indent--; @@ -1645,10 +2310,10 @@ /* MRT blend fields are used whenever MFBD is used, with * per-RT descriptors */ - if (job_type == JOB_TYPE_TILER) { + if (job_type == JOB_TYPE_TILER && p->framebuffer & MALI_MFBD) { void* blend_base = (void *) (s + 1); - for (unsigned i = 0; i < rt_count; i++) { + for (unsigned i = 0; i < fbd_info.rt_count; i++) { mali_ptr shader = 0; if (is_bifrost) @@ -1657,14 +2322,12 @@ shader = pandecode_midgard_blend_mrt(blend_base, job_no, i); if (shader & ~0xF) - pandecode_shader_disassemble(shader, job_no, job_type, false, 0); + pandecode_blend_shader_disassemble(shader, job_no, job_type, false, gpu_id); + } } - - if (shader_ptr & ~0xF) - pandecode_shader_disassemble(shader_ptr, job_no, job_type, is_bifrost, nr_registers); } else - pandecode_msg("\n"); + pandecode_msg("XXX: missing shader descriptor\n"); if (p->viewport) { struct pandecode_mapped_memory *fmem = pandecode_find_mapped_gpu_mem_containing(p->viewport); @@ -1692,11 +2355,14 @@ pandecode_log("};\n"); } - if (p->attribute_meta) { - unsigned max_attr_index = pandecode_attribute_meta(job_no, attribute_count, p, false, suffix); + unsigned max_attr_index = 0; + if (p->attribute_meta) + max_attr_index = pandecode_attribute_meta(job_no, attribute_count, p, false, suffix); + + if (p->attributes) { attr_mem = pandecode_find_mapped_gpu_mem_containing(p->attributes); - pandecode_attributes(attr_mem, p->attributes, job_no, suffix, max_attr_index + 1, false); + pandecode_attributes(attr_mem, p->attributes, job_no, suffix, max_attr_index, false, job_type); } /* Varyings are encoded like attributes but not actually sent; we just @@ -1713,64 +2379,27 @@ /* Number of descriptors depends on whether there are * non-internal varyings */ - pandecode_attributes(attr_mem, p->varyings, job_no, suffix, varying_count, true); - } - - bool is_compute = job_type == JOB_TYPE_COMPUTE; - - if (p->uniforms && !is_compute) { - int rows = uniform_count, width = 4; - size_t sz = rows * width * sizeof(float); - - struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms); - pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz); - u32 *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms); - - pandecode_log("u32 uniforms_%d%s[] = {\n", job_no, suffix); - - pandecode_indent++; - - for (int row = 0; row < rows; row++) { - for (int i = 0; i < width; i++) { - u32 v = uniforms[i]; - float f; - memcpy(&f, &v, sizeof(v)); - pandecode_log_cont("%X /* %f */, ", v, f); - } - - pandecode_log_cont("\n"); - - uniforms += width; - } - - pandecode_indent--; - pandecode_log("};\n"); - } else if (p->uniforms) { - int rows = uniform_count * 2; - size_t sz = rows * sizeof(mali_ptr); - - struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms); - pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz); - mali_ptr *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms); - - pandecode_log("mali_ptr uniforms_%d%s[] = {\n", job_no, suffix); - - pandecode_indent++; - - for (int row = 0; row < rows; row++) { - char *a = pointer_as_memory_reference(uniforms[row]); - pandecode_log("%s,\n", a); - free(a); - } - - pandecode_indent--; - pandecode_log("};\n"); - + pandecode_attributes(attr_mem, p->varyings, job_no, suffix, varying_count, true, job_type); } if (p->uniform_buffers) { - pandecode_uniform_buffers(p->uniform_buffers, uniform_buffer_count, job_no); - } + if (uniform_buffer_count) + pandecode_uniform_buffers(p->uniform_buffers, uniform_buffer_count, job_no); + else + pandecode_msg("warn: UBOs specified but not referenced\n"); + } else if (uniform_buffer_count) + pandecode_msg("XXX: UBOs referenced but not specified\n"); + + /* We don't want to actually dump uniforms, but we do need to validate + * that the counts we were given are sane */ + + if (p->uniforms) { + if (uniform_count) + pandecode_uniforms(p->uniforms, uniform_count); + else + pandecode_msg("warn: Uniforms specified but not referenced\n"); + } else if (uniform_count) + pandecode_msg("XXX: Uniforms referenced but not specified\n"); if (p->texture_trampoline) { struct pandecode_mapped_memory *mmem = pandecode_find_mapped_gpu_mem_containing(p->texture_trampoline); @@ -1792,108 +2421,11 @@ pandecode_log("};\n"); /* Now, finally, descend down into the texture descriptor */ - for (int tex = 0; tex < texture_count; ++tex) { + for (unsigned tex = 0; tex < texture_count; ++tex) { mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr)); struct pandecode_mapped_memory *tmem = pandecode_find_mapped_gpu_mem_containing(*u); - - if (tmem) { - struct mali_texture_descriptor *PANDECODE_PTR_VAR(t, tmem, *u); - - pandecode_log("struct mali_texture_descriptor texture_descriptor_%"PRIx64"_%d_%d = {\n", *u, job_no, tex); - pandecode_indent++; - - pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", t->width + 1); - pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", t->height + 1); - pandecode_prop("depth = MALI_POSITIVE(%" PRId16 ")", t->depth + 1); - pandecode_prop("array_size = MALI_POSITIVE(%" PRId16 ")", t->array_size + 1); - pandecode_prop("unknown3 = %" PRId16, t->unknown3); - pandecode_prop("unknown3A = %" PRId8, t->unknown3A); - pandecode_prop("nr_mipmap_levels = %" PRId8, t->nr_mipmap_levels); - - struct mali_texture_format f = t->format; - - pandecode_log(".format = {\n"); - pandecode_indent++; - - pandecode_swizzle(f.swizzle); - pandecode_prop("format = %s", pandecode_format(f.format)); - pandecode_prop("type = %s", pandecode_texture_type(f.type)); - pandecode_prop("srgb = %" PRId32, f.srgb); - pandecode_prop("unknown1 = %" PRId32, f.unknown1); - pandecode_prop("usage2 = 0x%" PRIx32, f.usage2); - - pandecode_indent--; - pandecode_log("},\n"); - - pandecode_swizzle(t->swizzle); - - if (t->swizzle_zero) { - /* Shouldn't happen */ - pandecode_msg("Swizzle zero tripped but replay will be fine anyway"); - pandecode_prop("swizzle_zero = %d", t->swizzle_zero); - } - - pandecode_prop("unknown3 = 0x%" PRIx32, t->unknown3); - - pandecode_prop("unknown5 = 0x%" PRIx32, t->unknown5); - pandecode_prop("unknown6 = 0x%" PRIx32, t->unknown6); - pandecode_prop("unknown7 = 0x%" PRIx32, t->unknown7); - - pandecode_log(".payload = {\n"); - pandecode_indent++; - - /* A bunch of bitmap pointers follow. - * We work out the correct number, - * based on the mipmap/cubemap - * properties, but dump extra - * possibilities to futureproof */ - - int bitmap_count = MALI_NEGATIVE(t->nr_mipmap_levels); - bool manual_stride = f.usage2 & MALI_TEX_MANUAL_STRIDE; - - /* Miptree for each face */ - if (f.type == MALI_TEX_CUBE) - bitmap_count *= 6; - - /* Array of textures */ - bitmap_count *= MALI_NEGATIVE(t->array_size); - - /* Stride for each element */ - if (manual_stride) - bitmap_count *= 2; - - /* Sanity check the size */ - int max_count = sizeof(t->payload) / sizeof(t->payload[0]); - assert (bitmap_count <= max_count); - - /* Dump more to be safe, but not _that_ much more */ - int safe_count = MIN2(bitmap_count * 2, max_count); - - for (int i = 0; i < safe_count; ++i) { - char *prefix = (i >= bitmap_count) ? "// " : ""; - - /* How we dump depends if this is a stride or a pointer */ - - if ((f.usage2 & MALI_TEX_MANUAL_STRIDE) && (i & 1)) { - /* signed 32-bit snuck in as a 64-bit pointer */ - uint64_t stride_set = t->payload[i]; - uint32_t clamped_stride = stride_set; - int32_t stride = clamped_stride; - assert(stride_set == clamped_stride); - pandecode_log("%s(mali_ptr) %d /* stride */, \n", prefix, stride); - } else { - char *a = pointer_as_memory_reference(t->payload[i]); - pandecode_log("%s%s, \n", prefix, a); - free(a); - } - } - - pandecode_indent--; - pandecode_log("},\n"); - - pandecode_indent--; - pandecode_log("};\n"); - } + if (tmem) + pandecode_texture(*u, tmem, job_no, tex); } } } @@ -1919,14 +2451,17 @@ pandecode_prop("min_lod = FIXED_16(%f)", DECODE_FIXED_16(s->min_lod)); pandecode_prop("max_lod = FIXED_16(%f)", DECODE_FIXED_16(s->max_lod)); + if (s->lod_bias) + pandecode_prop("lod_bias = FIXED_16(%f)", DECODE_FIXED_16(s->lod_bias)); + pandecode_prop("wrap_s = %s", pandecode_wrap_mode(s->wrap_s)); pandecode_prop("wrap_t = %s", pandecode_wrap_mode(s->wrap_t)); pandecode_prop("wrap_r = %s", pandecode_wrap_mode(s->wrap_r)); - pandecode_prop("compare_func = %s", pandecode_alt_func(s->compare_func)); + pandecode_prop("compare_func = %s", pandecode_func(s->compare_func)); if (s->zero || s->zero2) { - pandecode_msg("Zero tripped\n"); + pandecode_msg("XXX: sampler zero tripped\n"); pandecode_prop("zero = 0x%X, 0x%X\n", s->zero, s->zero2); } @@ -1948,28 +2483,17 @@ static void pandecode_vertex_tiler_postfix(const struct mali_vertex_tiler_postfix *p, int job_no, bool is_bifrost) { - pandecode_log_cont("{\n"); - pandecode_indent++; + if (p->shader & 0xF) + pandecode_msg("warn: shader tagged %X\n", (unsigned) (p->shader & 0xF)); - MEMORY_PROP(p, position_varying); - DYN_MEMORY_PROP(p, job_no, uniform_buffers); - DYN_MEMORY_PROP(p, job_no, texture_trampoline); - DYN_MEMORY_PROP(p, job_no, sampler_descriptor); - DYN_MEMORY_PROP(p, job_no, uniforms); - DYN_MEMORY_PROP(p, job_no, attributes); - DYN_MEMORY_PROP(p, job_no, attribute_meta); - DYN_MEMORY_PROP(p, job_no, varyings); - DYN_MEMORY_PROP(p, job_no, varying_meta); - DYN_MEMORY_PROP(p, job_no, viewport); - DYN_MEMORY_PROP(p, job_no, occlusion_counter); + if (!(p->position_varying || p->occlusion_counter)) + return; - if (is_bifrost) - pandecode_prop("framebuffer = scratchpad_%d_p", job_no); - else - pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, p->framebuffer & MALI_MFBD ? "MALI_MFBD" : "0"); + pandecode_log(".postfix = {\n"); + pandecode_indent++; - pandecode_prop("_shader_upper = (shader_meta_%d_p) >> 4", job_no); - pandecode_prop("flags = %d", p->flags); + MEMORY_PROP(p, position_varying); + MEMORY_PROP(p, occlusion_counter); pandecode_indent--; pandecode_log("},\n"); @@ -1984,7 +2508,7 @@ pandecode_prop("unk2 = 0x%x", v->unk2); if (v->zero0 || v->zero1) { - pandecode_msg("vertex only zero tripped"); + pandecode_msg("XXX: vertex only zero tripped"); pandecode_prop("zero0 = 0x%" PRIx32, v->zero0); pandecode_prop("zero1 = 0x%" PRIx64, v->zero1); } @@ -2004,13 +2528,13 @@ pandecode_indent++; if (h->zero) { - pandecode_msg("tiler heap zero tripped\n"); + pandecode_msg("XXX: tiler heap zero tripped\n"); pandecode_prop("zero = 0x%x", h->zero); } for (int i = 0; i < 12; i++) { if (h->zeros[i] != 0) { - pandecode_msg("tiler heap zero %d tripped, value %x\n", + pandecode_msg("XXX: tiler heap zero %d tripped, value %x\n", i, h->zeros[i]); } } @@ -2048,7 +2572,7 @@ pandecode_indent++; if (t->zero0 || t->zero1) { - pandecode_msg("tiler meta zero tripped"); + pandecode_msg("XXX: tiler meta zero tripped\n"); pandecode_prop("zero0 = 0x%" PRIx64, t->zero0); pandecode_prop("zero1 = 0x%" PRIx64, t->zero1); } @@ -2058,11 +2582,10 @@ pandecode_prop("width = MALI_POSITIVE(%d)", t->width + 1); pandecode_prop("height = MALI_POSITIVE(%d)", t->height + 1); - DYN_MEMORY_PROP(t, job_no, tiler_heap_meta); for (int i = 0; i < 12; i++) { if (t->zeros[i] != 0) { - pandecode_msg("tiler heap zero %d tripped, value %" PRIx64 "\n", + pandecode_msg("XXX: tiler heap zero %d tripped, value %" PRIx64 "\n", i, t->zeros[i]); } } @@ -2109,12 +2632,11 @@ /* TODO: gl_PointSize on Bifrost */ pandecode_primitive_size(t->primitive_size, true); - DYN_MEMORY_PROP(t, job_no, tiler_meta); pandecode_gl_enables(t->gl_enables, JOB_TYPE_TILER); if (t->zero1 || t->zero2 || t->zero3 || t->zero4 || t->zero5 || t->zero6 || t->zero7 || t->zero8) { - pandecode_msg("tiler only zero tripped"); + pandecode_msg("XXX: tiler only zero tripped\n"); pandecode_prop("zero1 = 0x%" PRIx64, t->zero1); pandecode_prop("zero2 = 0x%" PRIx64, t->zero2); pandecode_prop("zero3 = 0x%" PRIx64, t->zero3); @@ -2132,22 +2654,21 @@ static int pandecode_vertex_job_bfr(const struct mali_job_descriptor_header *h, const struct pandecode_mapped_memory *mem, - mali_ptr payload, int job_no) + mali_ptr payload, int job_no, unsigned gpu_id) { struct bifrost_payload_vertex *PANDECODE_PTR_VAR(v, mem, payload); - pandecode_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", true); + pandecode_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", true, gpu_id); pandecode_log("struct bifrost_payload_vertex payload_%d = {\n", job_no); pandecode_indent++; pandecode_log(".prefix = "); - pandecode_vertex_tiler_prefix(&v->prefix, job_no); + pandecode_vertex_tiler_prefix(&v->prefix, job_no, false); pandecode_log(".vertex = "); pandecode_vertex_only_bfr(&v->vertex); - pandecode_log(".postfix = "); pandecode_vertex_tiler_postfix(&v->postfix, job_no, true); pandecode_indent--; @@ -2159,25 +2680,22 @@ static int pandecode_tiler_job_bfr(const struct mali_job_descriptor_header *h, const struct pandecode_mapped_memory *mem, - mali_ptr payload, int job_no) + mali_ptr payload, int job_no, unsigned gpu_id) { struct bifrost_payload_tiler *PANDECODE_PTR_VAR(t, mem, payload); - pandecode_vertex_tiler_postfix_pre(&t->postfix, job_no, h->job_type, "", true); - - pandecode_indices(t->prefix.indices, t->prefix.index_count, job_no); + pandecode_vertex_tiler_postfix_pre(&t->postfix, job_no, h->job_type, "", true, gpu_id); pandecode_tiler_meta(t->tiler.tiler_meta, job_no); pandecode_log("struct bifrost_payload_tiler payload_%d = {\n", job_no); pandecode_indent++; pandecode_log(".prefix = "); - pandecode_vertex_tiler_prefix(&t->prefix, job_no); + pandecode_vertex_tiler_prefix(&t->prefix, job_no, false); pandecode_log(".tiler = "); pandecode_tiler_only_bfr(&t->tiler, job_no); - pandecode_log(".postfix = "); pandecode_vertex_tiler_postfix(&t->postfix, job_no, true); pandecode_indent--; @@ -2189,13 +2707,11 @@ static int pandecode_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header *h, const struct pandecode_mapped_memory *mem, - mali_ptr payload, int job_no) + mali_ptr payload, int job_no, unsigned gpu_id) { struct midgard_payload_vertex_tiler *PANDECODE_PTR_VAR(v, mem, payload); - pandecode_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", false); - - pandecode_indices(v->prefix.indices, v->prefix.index_count, job_no); + pandecode_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", false, gpu_id); pandecode_log("struct midgard_payload_vertex_tiler payload_%d = {\n", job_no); pandecode_indent++; @@ -2203,8 +2719,10 @@ bool has_primitive_pointer = v->prefix.unknown_draw & MALI_DRAW_VARYING_SIZE; pandecode_primitive_size(v->primitive_size, !has_primitive_pointer); + bool is_graphics = (h->job_type == JOB_TYPE_VERTEX) || (h->job_type == JOB_TYPE_TILER); + pandecode_log(".prefix = "); - pandecode_vertex_tiler_prefix(&v->prefix, job_no); + pandecode_vertex_tiler_prefix(&v->prefix, job_no, is_graphics); pandecode_gl_enables(v->gl_enables, h->job_type); @@ -2221,11 +2739,10 @@ pandecode_prop("offset_start = %d", v->offset_start); if (v->zero5) { - pandecode_msg("Zero tripped\n"); + pandecode_msg("XXX: midgard payload zero tripped\n"); pandecode_prop("zero5 = 0x%" PRIx64, v->zero5); } - pandecode_log(".postfix = "); pandecode_vertex_tiler_postfix(&v->postfix, job_no, false); pandecode_indent--; @@ -2237,71 +2754,91 @@ static int pandecode_fragment_job(const struct pandecode_mapped_memory *mem, mali_ptr payload, int job_no, - bool is_bifrost) + bool is_bifrost, unsigned gpu_id) { const struct mali_payload_fragment *PANDECODE_PTR_VAR(s, mem, payload); - bool fbd_dumped = false; + bool is_mfbd = s->framebuffer & MALI_MFBD; - if (!is_bifrost && (s->framebuffer & FBD_TYPE) == MALI_SFBD) { - /* Only SFBDs are understood, not MFBDs. We're speculating, - * based on the versioning, kernel code, etc, that the - * difference is between Single FrameBuffer Descriptor and - * Multiple FrmaeBuffer Descriptor; the change apparently lines - * up with multi-framebuffer support being added (T7xx onwards, - * including Gxx). In any event, there's some field shuffling - * that we haven't looked into yet. */ - - pandecode_sfbd(s->framebuffer & FBD_MASK, job_no); - fbd_dumped = true; - } else if ((s->framebuffer & FBD_TYPE) == MALI_MFBD) { - /* We don't know if Bifrost supports SFBD's at all, since the - * driver never uses them. And the format is different from - * Midgard anyways, due to the tiler heap and scratchpad being - * moved out into separate structures, so it's not clear what a - * Bifrost SFBD would even look like without getting an actual - * trace, which appears impossible. - */ + /* Bifrost theoretically may retain support for SFBD on compute jobs, + * but for graphics workloads with a FRAGMENT payload, use MFBD */ + + if (!is_mfbd && is_bifrost) + pandecode_msg("XXX: Bifrost fragment must use MFBD\n"); + + struct pandecode_fbd info; + + if (is_mfbd) + info = pandecode_mfbd_bfr(s->framebuffer & FBD_MASK, job_no, true, false); + else + info = pandecode_sfbd(s->framebuffer & FBD_MASK, job_no, true, gpu_id); + + /* Compute the tag for the tagged pointer. This contains the type of + * FBD (MFBD/SFBD), and in the case of an MFBD, information about which + * additional structures follow the MFBD header (an extra payload or + * not, as well as a count of render targets) */ - pandecode_mfbd_bfr(s->framebuffer & FBD_MASK, job_no, true); - fbd_dumped = true; + unsigned expected_tag = is_mfbd ? MALI_MFBD : 0; + + if (is_mfbd) { + if (info.has_extra) + expected_tag |= MALI_MFBD_TAG_EXTRA; + + expected_tag |= (MALI_POSITIVE(info.rt_count) << 2); } - uintptr_t p = (uintptr_t) s->framebuffer & FBD_MASK; - pandecode_log("struct mali_payload_fragment payload_%"PRIx64"_%d = {\n", payload, job_no); - pandecode_indent++; + if ((s->min_tile_coord | s->max_tile_coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK)) { + pandecode_msg("XXX: unexpected tile coordinate bits\n"); + pandecode_prop("min_tile_coord = 0x%X\n", s->min_tile_coord); + pandecode_prop("max_tile_coord = 0x%X\n", s->min_tile_coord); + } - /* See the comments by the macro definitions for mathematical context - * on why this is so weird */ + /* Extract tile coordinates */ - if (MALI_TILE_COORD_FLAGS(s->max_tile_coord) || MALI_TILE_COORD_FLAGS(s->min_tile_coord)) - pandecode_msg("Tile coordinate flag missed, replay wrong\n"); + unsigned min_x = MALI_TILE_COORD_X(s->min_tile_coord) << MALI_TILE_SHIFT; + unsigned min_y = MALI_TILE_COORD_Y(s->min_tile_coord) << MALI_TILE_SHIFT; - pandecode_prop("min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(%d, %d)", - MALI_TILE_COORD_X(s->min_tile_coord) << MALI_TILE_SHIFT, - MALI_TILE_COORD_Y(s->min_tile_coord) << MALI_TILE_SHIFT); + unsigned max_x = (MALI_TILE_COORD_X(s->max_tile_coord) + 1) << MALI_TILE_SHIFT; + unsigned max_y = (MALI_TILE_COORD_Y(s->max_tile_coord) + 1) << MALI_TILE_SHIFT; - pandecode_prop("max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(%d, %d)", - (MALI_TILE_COORD_X(s->max_tile_coord) + 1) << MALI_TILE_SHIFT, - (MALI_TILE_COORD_Y(s->max_tile_coord) + 1) << MALI_TILE_SHIFT); + /* For the max, we also want the floored (rather than ceiled) version for checking */ - /* If the FBD was just decoded, we can refer to it by pointer. If not, - * we have to fallback on offsets. */ + unsigned max_x_f = (MALI_TILE_COORD_X(s->max_tile_coord)) << MALI_TILE_SHIFT; + unsigned max_y_f = (MALI_TILE_COORD_Y(s->max_tile_coord)) << MALI_TILE_SHIFT; - const char *fbd_type = s->framebuffer & MALI_MFBD ? "MALI_MFBD" : "MALI_SFBD"; + /* Validate the coordinates are well-ordered */ - /* TODO: Decode */ - unsigned extra_flags = (s->framebuffer & ~FBD_MASK) & ~MALI_MFBD; + if (min_x == max_x) + pandecode_msg("XXX: empty X coordinates (%u = %u)\n", min_x, max_x); + else if (min_x > max_x) + pandecode_msg("XXX: misordered X coordinates (%u > %u)\n", min_x, max_x); - if (fbd_dumped) - pandecode_prop("framebuffer = framebuffer_%d_p | %s | 0x%X", job_no, - fbd_type, extra_flags); - else - pandecode_prop("framebuffer = %s | %s | 0x%X", pointer_as_memory_reference(p), - fbd_type, extra_flags); + if (min_y == max_y) + pandecode_msg("XXX: empty X coordinates (%u = %u)\n", min_x, max_x); + else if (min_y > max_y) + pandecode_msg("XXX: misordered X coordinates (%u > %u)\n", min_x, max_x); - pandecode_indent--; - pandecode_log("};\n"); + /* Validate the coordinates fit inside the framebuffer. We use floor, + * rather than ceil, for the max coordinates, since the tile + * coordinates for something like an 800x600 framebuffer will actually + * resolve to 800x608, which would otherwise trigger a Y-overflow */ + + if ((min_x > info.width) || (max_x_f > info.width)) + pandecode_msg("XXX: tile coordinates overflow in X direction\n"); + + if ((min_y > info.height) || (max_y_f > info.height)) + pandecode_msg("XXX: tile coordinates overflow in Y direction\n"); + + /* After validation, we print */ + + pandecode_log("fragment (%u, %u) ... (%u, %u)\n\n", min_x, min_y, max_x, max_y); + + /* The FBD is a tagged pointer */ + + unsigned tag = (s->framebuffer & ~FBD_MASK); + + if (tag != expected_tag) + pandecode_msg("XXX: expected FBD tag %X but got %X\n", expected_tag, tag); return sizeof(*s); } @@ -2309,14 +2846,13 @@ static int job_descriptor_number = 0; int -pandecode_jc(mali_ptr jc_gpu_va, bool bifrost) +pandecode_jc(mali_ptr jc_gpu_va, bool bifrost, unsigned gpu_id) { struct mali_job_descriptor_header *h; int start_number = 0; bool first = true; - bool last_size; do { struct pandecode_mapped_memory *mem = @@ -2334,8 +2870,7 @@ h->job_type != JOB_TYPE_FRAGMENT ? 4 : 0; mali_ptr payload_ptr = jc_gpu_va + sizeof(*h) - offset; - payload = pandecode_fetch_gpu_mem(mem, payload_ptr, - MALI_PAYLOAD_SIZE); + payload = pandecode_fetch_gpu_mem(mem, payload_ptr, 256); int job_no = job_descriptor_number++; @@ -2347,13 +2882,10 @@ pandecode_prop("job_type = %s", pandecode_job_type(h->job_type)); - /* Save for next job fixing */ - last_size = h->job_descriptor_size; - if (h->job_descriptor_size) pandecode_prop("job_descriptor_size = %d", h->job_descriptor_size); - if (h->exception_status != 0x1) + if (h->exception_status && h->exception_status != 0x1) pandecode_prop("exception_status = %x (source ID: 0x%x access: %s exception: 0x%x)", h->exception_status, (h->exception_status >> 16) & 0xFFFF, @@ -2390,12 +2922,23 @@ * reason. */ switch (h->job_type) { - case JOB_TYPE_SET_VALUE: { - struct mali_payload_set_value *s = payload; - pandecode_log("struct mali_payload_set_value payload_%"PRIx64"_%d = {\n", payload_ptr, job_no); + case JOB_TYPE_WRITE_VALUE: { + struct mali_payload_write_value *s = payload; + pandecode_log("struct mali_payload_write_value payload_%"PRIx64"_%d = {\n", payload_ptr, job_no); pandecode_indent++; - MEMORY_PROP(s, out); - pandecode_prop("unknown = 0x%" PRIX64, s->unknown); + MEMORY_PROP(s, address); + + if (s->value_descriptor != MALI_WRITE_VALUE_ZERO) { + pandecode_msg("XXX: unknown value descriptor\n"); + pandecode_prop("value_descriptor = 0x%" PRIX32, s->value_descriptor); + } + + if (s->reserved) { + pandecode_msg("XXX: set value tripped\n"); + pandecode_prop("reserved = 0x%" PRIX32, s->reserved); + } + + pandecode_prop("immediate = 0x%" PRIX64, s->immediate); pandecode_indent--; pandecode_log("};\n"); @@ -2407,16 +2950,16 @@ case JOB_TYPE_COMPUTE: if (bifrost) { if (h->job_type == JOB_TYPE_TILER) - pandecode_tiler_job_bfr(h, mem, payload_ptr, job_no); + pandecode_tiler_job_bfr(h, mem, payload_ptr, job_no, gpu_id); else - pandecode_vertex_job_bfr(h, mem, payload_ptr, job_no); + pandecode_vertex_job_bfr(h, mem, payload_ptr, job_no, gpu_id); } else - pandecode_vertex_or_tiler_job_mdg(h, mem, payload_ptr, job_no); + pandecode_vertex_or_tiler_job_mdg(h, mem, payload_ptr, job_no, gpu_id); break; case JOB_TYPE_FRAGMENT: - pandecode_fragment_job(mem, payload_ptr, job_no, bifrost); + pandecode_fragment_job(mem, payload_ptr, job_no, bifrost, gpu_id); break; default: @@ -2427,16 +2970,12 @@ if (!first) { pandecode_log("((struct mali_job_descriptor_header *) (uintptr_t) job_%d_p)->", job_no - 1); - - if (last_size) - pandecode_log_cont("next_job_64 = job_%d_p;\n\n", job_no); - else - pandecode_log_cont("next_job_32 = (u32) (uintptr_t) job_%d_p;\n\n", job_no); + pandecode_log_cont("next_job = job_%d_p;\n\n", job_no); } first = false; - } while ((jc_gpu_va = h->job_descriptor_size ? h->next_job_64 : h->next_job_32)); + } while ((jc_gpu_va = h->next_job)); return start_number; } diff -Nru mesa-19.2.8/src/panfrost/pandecode/decode.h mesa-20.0.8/src/panfrost/pandecode/decode.h --- mesa-19.2.8/src/panfrost/pandecode/decode.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/pandecode/decode.h 2020-06-12 01:21:18.000000000 +0000 @@ -29,6 +29,8 @@ #include "public.h" #include "util/list.h" +extern FILE *pandecode_dump_stream; + struct pandecode_mapped_memory { struct list_head node; diff -Nru mesa-19.2.8/src/panfrost/pandecode/pan_pretty_print.c mesa-20.0.8/src/panfrost/pandecode/pan_pretty_print.c --- mesa-19.2.8/src/panfrost/pandecode/pan_pretty_print.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/pandecode/pan_pretty_print.c 2020-06-12 01:21:18.000000000 +0000 @@ -33,9 +33,18 @@ #define DEFINE_CASE(name) case MALI_## name: return "MALI_" #name char *pandecode_format(enum mali_format format) { - static char unk_format_str[5]; + static char unk_format_str[10]; switch (format) { + DEFINE_CASE(ETC2_RGB8); + DEFINE_CASE(ETC2_R11_UNORM); + DEFINE_CASE(ETC2_RGBA8); + DEFINE_CASE(ETC2_RG11_UNORM); + DEFINE_CASE(ETC2_R11_SNORM); + DEFINE_CASE(ETC2_RG11_SNORM); + DEFINE_CASE(ETC2_RGB8A1); + DEFINE_CASE(ASTC_SRGB_SUPP); + DEFINE_CASE(ASTC_HDR_SUPP); DEFINE_CASE(RGB565); DEFINE_CASE(RGB5_A1_UNORM); DEFINE_CASE(RGB10_A2_UNORM); @@ -118,7 +127,7 @@ DEFINE_CASE(RGBA8_2); DEFINE_CASE(RGB10_A2_2); default: - snprintf(unk_format_str, sizeof(unk_format_str), "0x%02x", format); + snprintf(unk_format_str, sizeof(unk_format_str), "MALI_0x%02x", format); return unk_format_str; } } diff -Nru mesa-19.2.8/src/panfrost/pandecode/public.h mesa-20.0.8/src/panfrost/pandecode/public.h --- mesa-19.2.8/src/panfrost/pandecode/public.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/pandecode/public.h 2020-06-12 01:21:18.000000000 +0000 @@ -44,9 +44,17 @@ void pandecode_initialize(void); +void pandecode_next_frame(void); + +void pandecode_close(void); + void pandecode_inject_mmap(uint64_t gpu_va, void *cpu, unsigned sz, const char *name); -int pandecode_jc(uint64_t jc_gpu_va, bool bifrost); +int pandecode_jc(uint64_t jc_gpu_va, bool bifrost, unsigned gpu_id); + +char * +pandecode_exception_access(unsigned access); + #endif /* __MMAP_TRACE_H__ */ diff -Nru mesa-19.2.8/src/panfrost/shared/meson.build mesa-20.0.8/src/panfrost/shared/meson.build --- mesa-19.2.8/src/panfrost/shared/meson.build 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/shared/meson.build 2020-06-12 01:21:18.000000000 +0000 @@ -28,7 +28,7 @@ 'panfrost_shared', [libpanfrost_shared_files], include_directories : [inc_common], - c_args : [c_vis_args, no_override_init_args], + c_args : [c_vis_args, no_override_init_args, '-O3'], cpp_args : [cpp_vis_args], build_by_default : false, ) diff -Nru mesa-19.2.8/src/panfrost/shared/pan_tiling.c mesa-20.0.8/src/panfrost/shared/pan_tiling.c --- mesa-19.2.8/src/panfrost/shared/pan_tiling.c 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/shared/pan_tiling.c 2020-06-12 01:21:18.000000000 +0000 @@ -25,8 +25,9 @@ * */ -#include #include "pan_tiling.h" +#include +#include "util/macros.h" /* This file implements software encode/decode of the tiling format used for * textures and framebuffers primarily on Utgard GPUs. Names for this format @@ -82,7 +83,7 @@ * 0b11001100. The idea is that for the bits in the solely Y place, we * get a Y place, and the bits in the XOR place *also* get a Y. */ -uint32_t bit_duplication[16] = { +const uint32_t bit_duplication[16] = { 0b00000000, 0b00000011, 0b00001100, @@ -103,7 +104,7 @@ /* Space the bits out of a 4-bit nibble */ -unsigned space_4[16] = { +const unsigned space_4[16] = { 0b0000000, 0b0000001, 0b0000100, @@ -128,185 +129,244 @@ #define TILE_HEIGHT 16 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT) -/* An optimized routine to tile an aligned (width & 0xF == 0) bpp4 texture */ +/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must + * only support copies and sizeof, so emulating with a packed structure works + * well enough, but if there's a native 128-bit type we may we well prefer + * that. */ + +#ifdef __SIZEOF_INT128__ +typedef __uint128_t pan_uint128_t; +#else +typedef struct { + uint64_t lo; + uint64_t hi; +} __attribute__((packed)) pan_uint128_t; +#endif + +/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation: + * + * dest_start precomputes the offset to the beginning of the first horizontal + * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are + * stored linearly, so we get the X tile number by shifting and then multiply + * by the bytes per tile . + * + * We iterate across the pixels we're trying to store in source-order. For each + * row in the destination image, we figure out which row of 16x16 block we're + * in, by slicing off the lower 4-bits (block_y). + * + * dest then precomputes the location of the top-left corner of the block the + * row starts in. In pixel coordinates (where the origin is the top-left), + * (block_y, 0) is the top-left corner of the leftmost tile in this row. While + * pixels are reordered within a block, the blocks themselves are stored + * linearly, so multiplying block_y by the pixel stride of the destination + * image equals the byte offset of that top-left corner of the block this row + * is in. + * + * On the other hand, the source is linear so we compute the locations of the + * start and end of the row in the source by a simple linear addressing. + * + * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0 + * y0] value. Since this is constant across a row, we look it up per-row and + * store in expanded_y. + * + * Finally, we iterate each row in source order. In the outer loop, we iterate + * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should + * be unrolled), calculating the index within the tile and writing. + */ + +#define TILED_STORE_TYPE(pixel_t, shift) \ +static void \ +panfrost_store_tiled_image_##pixel_t \ + (void *dst, const void *src, \ + uint16_t sx, uint16_t sy, \ + uint16_t w, uint16_t h, \ + uint32_t dst_stride, \ + uint32_t src_stride) \ +{ \ + uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \ + for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ + uint16_t block_y = y & ~0x0f; \ + uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \ + const pixel_t *source = src + (src_y * src_stride); \ + const pixel_t *source_end = source + w; \ + unsigned expanded_y = bit_duplication[y & 0xF] << shift; \ + for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \ + for (uint8_t i = 0; i < 16; ++i) { \ + unsigned index = expanded_y ^ (space_4[i] << shift); \ + *((pixel_t *) (dest + index)) = *(source++); \ + } \ + } \ + } \ +} \ + +TILED_STORE_TYPE(uint8_t, 0); +TILED_STORE_TYPE(uint16_t, 1); +TILED_STORE_TYPE(uint32_t, 2); +TILED_STORE_TYPE(uint64_t, 3); +TILED_STORE_TYPE(pan_uint128_t, 4); + +#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \ + const unsigned mask = (1 << tile_shift) - 1; \ + for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \ + unsigned block_y = y & ~mask; \ + unsigned block_start_s = block_y * dst_stride; \ + unsigned source_start = src_y * src_stride; \ + unsigned expanded_y = bit_duplication[y & mask]; \ + \ + for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \ + unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \ + unsigned index = expanded_y ^ space_4[x & mask]; \ + uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \ + uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \ + \ + pixel_t *outp = (pixel_t *) (is_store ? dest : source); \ + pixel_t *inp = (pixel_t *) (is_store ? source : dest); \ + *outp = *inp; \ + } \ + } \ +} + +#define TILED_UNALIGNED_TYPES(store, shift) { \ + if (bpp == 8) \ + TILED_UNALIGNED_TYPE(uint8_t, store, shift) \ + else if (bpp == 16) \ + TILED_UNALIGNED_TYPE(uint16_t, store, shift) \ + else if (bpp == 32) \ + TILED_UNALIGNED_TYPE(uint32_t, store, shift) \ + else if (bpp == 64) \ + TILED_UNALIGNED_TYPE(uint64_t, store, shift) \ + else if (bpp == 128) \ + TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \ +} static void -panfrost_store_tiled_image_bpp4(void *dst, const void *src, - const struct pipe_box *box, +panfrost_access_tiled_image_generic(void *dst, void *src, + unsigned sx, unsigned sy, + unsigned w, unsigned h, uint32_t dst_stride, - uint32_t src_stride) + uint32_t src_stride, + const struct util_format_description *desc, + bool _is_store) { - /* Precompute the offset to the beginning of the first horizontal tile we're - * writing to, knowing that box->x is 16-aligned. Tiles themselves are - * stored linearly, so we get the X tile number by shifting and then - * multiply by the bytes per tile */ + unsigned bpp = desc->block.bits; - uint8_t *dest_start = dst + ((box->x >> 4) * PIXELS_PER_TILE * 4); + if (desc->block.width > 1) { + w = DIV_ROUND_UP(w, desc->block.width); + h = DIV_ROUND_UP(h, desc->block.height); + + if (_is_store) + TILED_UNALIGNED_TYPES(true, 2) + else + TILED_UNALIGNED_TYPES(false, 2) + } else { + if (_is_store) + TILED_UNALIGNED_TYPES(true, 4) + else + TILED_UNALIGNED_TYPES(false, 4) + } +} - /* Iterate across the pixels we're trying to store in source-order */ +#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8))) - for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) { - /* For each pixel in the destination image, figure out the part - * corresponding to the 16x16 block index */ +void +panfrost_store_tiled_image(void *dst, const void *src, + unsigned x, unsigned y, + unsigned w, unsigned h, + uint32_t dst_stride, + uint32_t src_stride, + enum pipe_format format) +{ + const struct util_format_description *desc = util_format_description(format); - int block_y = y & ~0x0f; + if (desc->block.width > 1) { + panfrost_access_tiled_image_generic(dst, (void *) src, + x, y, w, h, + dst_stride, src_stride, desc, true); - /* In pixel coordinates (where the origin is the top-left), (block_y, 0) - * is the top-left corner of the leftmost tile in this row. While pixels - * are reordered within a block, the blocks themselves are stored - * linearly, so multiplying block_y by the pixel stride of the - * destination image equals the byte offset of that top-left corner of - * the block this row is in */ + return; + } - uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride)); + unsigned bpp = desc->block.bits; + unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH; + unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT; + unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH; + unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT; - /* The source is actually linear, so compute the byte offset to the start - * and end of this row in the source */ + /* First, tile the top portion */ - const uint32_t *source = src + (src_y * src_stride); - const uint32_t *source_end = source + box->width; + unsigned orig_x = x, orig_y = y; - /* We want to duplicate the bits of the bottom nibble of Y */ - unsigned expanded_y = bit_duplication[y & 0xF]; + if (first_full_tile_y != y) { + unsigned dist = MIN2(first_full_tile_y - y, h); - /* Iterate the row in source order. In the outer loop, we iterate 16 - * bytes tiles. After each tile, we increment dest to include the size of - * that tile in pixels. */ + panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), + x, y, w, dist, + dst_stride, src_stride, desc, true); - for (; source < source_end; dest += PIXELS_PER_TILE) { - /* Within each tile, we iterate each of the 16 pixels in the row of - * the tile. This loop should be unrolled. */ + if (dist == h) + return; - for (int i = 0; i < 16; ++i) { - /* We have the X component spaced out in space_x and we have the Y - * component duplicated. So we just XOR them together. The X bits - * get the XOR like the pattern needs. The Y bits are XORing with - * zero so this is a no-op */ + y += dist; + h -= dist; + } - unsigned index = expanded_y ^ space_4[i]; + /* Next, the bottom portion */ + if (last_full_tile_y != (y + h)) { + unsigned dist = (y + h) - last_full_tile_y; + + panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y), + x, last_full_tile_y, w, dist, + dst_stride, src_stride, desc, true); - /* Copy over the pixel */ - dest[index] = *(source++); - } - } + h -= dist; } -} -static void -panfrost_access_tiled_image_generic(void *dst, void *src, - const struct pipe_box *box, - uint32_t dst_stride, - uint32_t src_stride, - uint32_t bpp, - bool is_store) -{ - for (int y = box->y, src_y = 0; src_y < box->height; ++y, ++src_y) { - int block_y = y & ~0x0f; - int block_start_s = block_y * dst_stride; - int source_start = src_y * src_stride; - - unsigned expanded_y = bit_duplication[y & 0xF]; - - for (int x = box->x, src_x = 0; src_x < box->width; ++x, ++src_x) { - int block_x_s = (x >> 4) * 256; - - unsigned index = expanded_y ^ space_4[x & 0xF]; - - uint8_t *src8 = src; - uint8_t *source = &src8[source_start + bpp * src_x]; - uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index); - - uint8_t *out = is_store ? dest : source; - uint8_t *in = is_store ? source : dest; - - uint16_t *out16 = (uint16_t *) out; - uint16_t *in16 = (uint16_t *) in; - - uint32_t *out32 = (uint32_t *) out; - uint32_t *in32 = (uint32_t *) in; - - uint64_t *out64 = (uint64_t *) out; - uint64_t *in64 = (uint64_t *) in; - - /* Write out 1-16 bytes. Written like this rather than a loop so the - * compiler can see what's going on */ - - switch (bpp) { - case 1: - out[0] = in[0]; - break; - - case 2: - out16[0] = in16[0]; - break; - - case 3: - out16[0] = in16[0]; - out[2] = in[2]; - break; - - case 4: - out32[0] = in32[0]; - break; - - case 6: - out32[0] = in32[0]; - out16[2] = in16[2]; - break; - - case 8: - out64[0] = in64[0]; - break; - - case 12: - out64[0] = in64[0]; - out32[2] = in32[2]; - break; - - case 16: - out64[0] = in64[0]; - out64[1] = in64[1]; - break; - - default: - unreachable("Invalid bpp in software tiling"); - } - } - } -} + /* The left portion */ + if (first_full_tile_x != x) { + unsigned dist = MIN2(first_full_tile_x - x, w); + + panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), + x, y, dist, h, + dst_stride, src_stride, desc, true); -void -panfrost_store_tiled_image(void *dst, const void *src, - const struct pipe_box *box, - uint32_t dst_stride, - uint32_t src_stride, - uint32_t bpp) -{ - /* The optimized path is for aligned writes specifically */ + if (dist == w) + return; - if (box->x & 0xF || box->width & 0xF) { - panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, TRUE); - return; + x += dist; + w -= dist; } - /* Attempt to use an optimized path if we have one */ + /* Finally, the right portion */ + if (last_full_tile_x != (x + w)) { + unsigned dist = (x + w) - last_full_tile_x; + + panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y), + last_full_tile_x, y, dist, h, + dst_stride, src_stride, desc, true); - switch (bpp) { - case 4: - panfrost_store_tiled_image_bpp4(dst, (void *) src, box, dst_stride, src_stride); - break; - default: - panfrost_access_tiled_image_generic(dst, (void *) src, box, dst_stride, src_stride, bpp, TRUE); - break; + w -= dist; } + + if (bpp == 8) + panfrost_store_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride); + else if (bpp == 16) + panfrost_store_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride); + else if (bpp == 32) + panfrost_store_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride); + else if (bpp == 64) + panfrost_store_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride); + else if (bpp == 128) + panfrost_store_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride); } void panfrost_load_tiled_image(void *dst, const void *src, - const struct pipe_box *box, + unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, uint32_t src_stride, - uint32_t bpp) + enum pipe_format format) { - panfrost_access_tiled_image_generic((void *) src, dst, box, src_stride, dst_stride, bpp, FALSE); + const struct util_format_description *desc = util_format_description(format); + panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, desc, false); } diff -Nru mesa-19.2.8/src/panfrost/shared/pan_tiling.h mesa-20.0.8/src/panfrost/shared/pan_tiling.h --- mesa-19.2.8/src/panfrost/shared/pan_tiling.h 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/panfrost/shared/pan_tiling.h 2020-06-12 01:21:18.000000000 +0000 @@ -27,18 +27,21 @@ #ifndef H_PANFROST_TILING #define H_PANFROST_TILING -#include "util/u_box.h" +#include +#include void panfrost_load_tiled_image(void *dst, const void *src, - const struct pipe_box *box, + unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, uint32_t src_stride, - uint32_t bpp); + enum pipe_format format); void panfrost_store_tiled_image(void *dst, const void *src, - const struct pipe_box *box, + unsigned x, unsigned y, + unsigned w, unsigned h, uint32_t dst_stride, uint32_t src_stride, - uint32_t bpp); + enum pipe_format format); #endif diff -Nru mesa-19.2.8/src/util/00-mesa-defaults.conf mesa-20.0.8/src/util/00-mesa-defaults.conf --- mesa-19.2.8/src/util/00-mesa-defaults.conf 2019-12-18 19:04:22.000000000 +0000 +++ mesa-20.0.8/src/util/00-mesa-defaults.conf 2020-06-12 01:21:18.000000000 +0000 @@ -177,7 +177,19 @@